diff --git a/aliyun/src/main/java/org/apache/iceberg/aliyun/AliyunClientFactories.java b/aliyun/src/main/java/org/apache/iceberg/aliyun/AliyunClientFactories.java index c512d718da75..5807f9bfe119 100644 --- a/aliyun/src/main/java/org/apache/iceberg/aliyun/AliyunClientFactories.java +++ b/aliyun/src/main/java/org/apache/iceberg/aliyun/AliyunClientFactories.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.aliyun; import com.aliyun.oss.OSS; @@ -28,35 +27,41 @@ public class AliyunClientFactories { - private static final AliyunClientFactory ALIYUN_CLIENT_FACTORY_DEFAULT = new DefaultAliyunClientFactory(); + private static final AliyunClientFactory ALIYUN_CLIENT_FACTORY_DEFAULT = + new DefaultAliyunClientFactory(); - private AliyunClientFactories() { - } + private AliyunClientFactories() {} public static AliyunClientFactory defaultFactory() { return ALIYUN_CLIENT_FACTORY_DEFAULT; } public static AliyunClientFactory from(Map properties) { - String factoryImpl = PropertyUtil.propertyAsString( - properties, AliyunProperties.CLIENT_FACTORY, DefaultAliyunClientFactory.class.getName()); + String factoryImpl = + PropertyUtil.propertyAsString( + properties, + AliyunProperties.CLIENT_FACTORY, + DefaultAliyunClientFactory.class.getName()); return loadClientFactory(factoryImpl, properties); } /** * Load an implemented {@link AliyunClientFactory} based on the class name, and initialize it. * - * @param impl the class name. + * @param impl the class name. * @param properties to initialize the factory. * @return an initialized {@link AliyunClientFactory}. */ - private static AliyunClientFactory loadClientFactory(String impl, Map properties) { + private static AliyunClientFactory loadClientFactory( + String impl, Map properties) { DynConstructors.Ctor ctor; try { ctor = DynConstructors.builder(AliyunClientFactory.class).hiddenImpl(impl).buildChecked(); } catch (NoSuchMethodException e) { - throw new IllegalArgumentException(String.format( - "Cannot initialize AliyunClientFactory, missing no-arg constructor: %s", impl), e); + throw new IllegalArgumentException( + String.format( + "Cannot initialize AliyunClientFactory, missing no-arg constructor: %s", impl), + e); } AliyunClientFactory factory; @@ -64,7 +69,10 @@ private static AliyunClientFactory loadClientFactory(String impl, Map properties); - /** - * Returns an initialized {@link AliyunProperties} - */ + /** Returns an initialized {@link AliyunProperties} */ AliyunProperties aliyunProperties(); } diff --git a/aliyun/src/main/java/org/apache/iceberg/aliyun/AliyunProperties.java b/aliyun/src/main/java/org/apache/iceberg/aliyun/AliyunProperties.java index 7474e6a3b5a5..623b55263af7 100644 --- a/aliyun/src/main/java/org/apache/iceberg/aliyun/AliyunProperties.java +++ b/aliyun/src/main/java/org/apache/iceberg/aliyun/AliyunProperties.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.aliyun; import java.io.Serializable; @@ -26,41 +25,43 @@ public class AliyunProperties implements Serializable { /** - * The domain name used to access OSS. OSS uses HTTP Restful APIs to provide services. Different regions are accessed - * by using different endpoints. For the same region, access over the internal network or over the Internet also uses - * different endpoints. For more information, see: + * The domain name used to access OSS. OSS uses HTTP Restful APIs to provide services. 
Different
+   * regions are accessed by using different endpoints. For the same region, access over the
+   * internal network or over the Internet also uses different endpoints. For more information, see:
    * https://www.alibabacloud.com/help/doc-detail/31837.htm
    */
   public static final String OSS_ENDPOINT = "oss.endpoint";
 
   /**
-   * Aliyun uses an AccessKey pair, which includes an AccessKey ID and an AccessKey secret to implement symmetric
-   * encryption and verify the identity of a requester. The AccessKey ID is used to identify a user.
-   * <p>
-   * For more information about how to obtain an AccessKey pair, see:
+   * Aliyun uses an AccessKey pair, which includes an AccessKey ID and an AccessKey secret to
+   * implement symmetric encryption and verify the identity of a requester. The AccessKey ID is used
+   * to identify a user.
+   *
+   * <p>For more information about how to obtain an AccessKey pair, see:
    * https://www.alibabacloud.com/help/doc-detail/53045.htm
    */
   public static final String CLIENT_ACCESS_KEY_ID = "client.access-key-id";
 
   /**
-   * Aliyun uses an AccessKey pair, which includes an AccessKey ID and an AccessKey secret to implement symmetric
-   * encryption and verify the identity of a requester. The AccessKey secret is used to encrypt and verify the
-   * signature string.
-   * <p>
-   * For more information about how to obtain an AccessKey pair, see:
+   * Aliyun uses an AccessKey pair, which includes an AccessKey ID and an AccessKey secret to
+   * implement symmetric encryption and verify the identity of a requester. The AccessKey secret is
+   * used to encrypt and verify the signature string.
+   *
+   * <p>
For more information about how to obtain an AccessKey pair, see: * https://www.alibabacloud.com/help/doc-detail/53045.htm */ public static final String CLIENT_ACCESS_KEY_SECRET = "client.access-key-secret"; /** - * The implementation class of {@link AliyunClientFactory} to customize Aliyun client configurations. - * If set, all Aliyun clients will be initialized by the specified factory. - * If not set, {@link AliyunClientFactories#defaultFactory()} is used as default factory. + * The implementation class of {@link AliyunClientFactory} to customize Aliyun client + * configurations. If set, all Aliyun clients will be initialized by the specified factory. If not + * set, {@link AliyunClientFactories#defaultFactory()} is used as default factory. */ public static final String CLIENT_FACTORY = "client.factory-impl"; /** - * Location to put staging files for uploading to OSS, defaults to the directory value of java.io.tmpdir. + * Location to put staging files for uploading to OSS, defaults to the directory value of + * java.io.tmpdir. */ public static final String OSS_STAGING_DIRECTORY = "oss.staging-dir"; @@ -79,8 +80,9 @@ public AliyunProperties(Map properties) { this.accessKeyId = properties.get(CLIENT_ACCESS_KEY_ID); this.accessKeySecret = properties.get(CLIENT_ACCESS_KEY_SECRET); - this.ossStagingDirectory = PropertyUtil.propertyAsString(properties, OSS_STAGING_DIRECTORY, - System.getProperty("java.io.tmpdir")); + this.ossStagingDirectory = + PropertyUtil.propertyAsString( + properties, OSS_STAGING_DIRECTORY, System.getProperty("java.io.tmpdir")); } public String ossEndpoint() { diff --git a/aliyun/src/main/java/org/apache/iceberg/aliyun/oss/BaseOSSFile.java b/aliyun/src/main/java/org/apache/iceberg/aliyun/oss/BaseOSSFile.java index 48aca1523a4d..d957e82f92ed 100644 --- a/aliyun/src/main/java/org/apache/iceberg/aliyun/oss/BaseOSSFile.java +++ b/aliyun/src/main/java/org/apache/iceberg/aliyun/oss/BaseOSSFile.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.aliyun.oss; import com.aliyun.oss.OSS; @@ -62,8 +61,8 @@ public boolean exists() { return objectMetadata() != null; } catch (OSSException e) { - if (e.getErrorCode().equals(OSSErrorCode.NO_SUCH_BUCKET) || - e.getErrorCode().equals(OSSErrorCode.NO_SUCH_KEY)) { + if (e.getErrorCode().equals(OSSErrorCode.NO_SUCH_BUCKET) + || e.getErrorCode().equals(OSSErrorCode.NO_SUCH_KEY)) { return false; } @@ -85,8 +84,6 @@ protected MetricsContext metrics() { @Override public String toString() { - return MoreObjects.toStringHelper(this) - .add("file", uri) - .toString(); + return MoreObjects.toStringHelper(this).add("file", uri).toString(); } } diff --git a/aliyun/src/main/java/org/apache/iceberg/aliyun/oss/OSSFileIO.java b/aliyun/src/main/java/org/apache/iceberg/aliyun/oss/OSSFileIO.java index 61570ef94c68..be85b93a75f5 100644 --- a/aliyun/src/main/java/org/apache/iceberg/aliyun/oss/OSSFileIO.java +++ b/aliyun/src/main/java/org/apache/iceberg/aliyun/oss/OSSFileIO.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.aliyun.oss; import com.aliyun.oss.OSS; @@ -36,14 +35,15 @@ /** * FileIO implementation backed by OSS. - *

- * Locations used must follow the conventions for OSS URIs (e.g. oss://bucket/path...). - * URIs with scheme https are also treated as oss file paths. - * Using this FileIO with other schemes with result in {@link org.apache.iceberg.exceptions.ValidationException} + * + *

Locations used must follow the conventions for OSS URIs (e.g. oss://bucket/path...). URIs with + * scheme https are also treated as oss file paths. Using this FileIO with other schemes with result + * in {@link org.apache.iceberg.exceptions.ValidationException} */ public class OSSFileIO implements FileIO { private static final Logger LOG = LoggerFactory.getLogger(OSSFileIO.class); - private static final String DEFAULT_METRICS_IMPL = "org.apache.iceberg.hadoop.HadoopMetricsContext"; + private static final String DEFAULT_METRICS_IMPL = + "org.apache.iceberg.hadoop.HadoopMetricsContext"; private SerializableSupplier oss; private AliyunProperties aliyunProperties; @@ -53,16 +53,16 @@ public class OSSFileIO implements FileIO { /** * No-arg constructor to load the FileIO dynamically. - *

- * All fields are initialized by calling {@link OSSFileIO#initialize(Map)} later. + * + *

All fields are initialized by calling {@link OSSFileIO#initialize(Map)} later. */ - public OSSFileIO() { - } + public OSSFileIO() {} /** * Constructor with custom oss supplier and default aliyun properties. - *

- * Calling {@link OSSFileIO#initialize(Map)} will overwrite information set in this constructor. + * + *

Calling {@link OSSFileIO#initialize(Map)} will overwrite information set in this + * constructor. * * @param oss oss supplier */ @@ -107,12 +107,17 @@ public void initialize(Map properties) { // Report Hadoop metrics if Hadoop is available try { DynConstructors.Ctor ctor = - DynConstructors.builder(MetricsContext.class).hiddenImpl(DEFAULT_METRICS_IMPL, String.class).buildChecked(); + DynConstructors.builder(MetricsContext.class) + .hiddenImpl(DEFAULT_METRICS_IMPL, String.class) + .buildChecked(); MetricsContext context = ctor.newInstance("oss"); context.initialize(properties); this.metrics = context; } catch (NoClassDefFoundError | NoSuchMethodException | ClassCastException e) { - LOG.warn("Unable to load metrics class: '{}', falling back to null metrics", DEFAULT_METRICS_IMPL, e); + LOG.warn( + "Unable to load metrics class: '{}', falling back to null metrics", + DEFAULT_METRICS_IMPL, + e); } } diff --git a/aliyun/src/main/java/org/apache/iceberg/aliyun/oss/OSSInputFile.java b/aliyun/src/main/java/org/apache/iceberg/aliyun/oss/OSSInputFile.java index 4c5242721257..40ab3a021e08 100644 --- a/aliyun/src/main/java/org/apache/iceberg/aliyun/oss/OSSInputFile.java +++ b/aliyun/src/main/java/org/apache/iceberg/aliyun/oss/OSSInputFile.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.aliyun.oss; import com.aliyun.oss.OSS; @@ -26,9 +25,7 @@ import org.apache.iceberg.io.SeekableInputStream; import org.apache.iceberg.metrics.MetricsContext; -/** - * @deprecated moving to package-private in 0.15.0; use OSSFileIO to create InputFile instances - */ +/** @deprecated moving to package-private in 0.15.0; use OSSFileIO to create InputFile instances */ @Deprecated public class OSSInputFile extends BaseOSSFile implements InputFile { @@ -38,7 +35,12 @@ public class OSSInputFile extends BaseOSSFile implements InputFile { super(client, uri, aliyunProperties, metrics); } - OSSInputFile(OSS client, OSSURI uri, AliyunProperties aliyunProperties, long length, MetricsContext metrics) { + OSSInputFile( + OSS client, + OSSURI uri, + AliyunProperties aliyunProperties, + long length, + MetricsContext metrics) { super(client, uri, aliyunProperties, metrics); ValidationException.check(length >= 0, "Invalid file length: %s", length); this.length = length; diff --git a/aliyun/src/main/java/org/apache/iceberg/aliyun/oss/OSSInputStream.java b/aliyun/src/main/java/org/apache/iceberg/aliyun/oss/OSSInputStream.java index b68ba97c3387..58359faeb283 100644 --- a/aliyun/src/main/java/org/apache/iceberg/aliyun/oss/OSSInputStream.java +++ b/aliyun/src/main/java/org/apache/iceberg/aliyun/oss/OSSInputStream.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.aliyun.oss; import com.aliyun.oss.OSS; @@ -35,9 +34,7 @@ import org.slf4j.Logger; import org.slf4j.LoggerFactory; -/** - * @deprecated moving to package-private in 0.15.0 - */ +/** @deprecated moving to package-private in 0.15.0 */ @Deprecated public class OSSInputStream extends SeekableInputStream { private static final Logger LOG = LoggerFactory.getLogger(OSSInputStream.class); @@ -55,9 +52,7 @@ public class OSSInputStream extends SeekableInputStream { private final Counter readBytes; private final Counter readOperations; - /** - * @deprecated moving to package-private in 0.15.0 - */ + /** @deprecated moving to package-private in 0.15.0 */ @Deprecated public OSSInputStream(OSS client, OSSURI uri) { this(client, uri, MetricsContext.nullMetrics()); @@ -69,7 +64,8 @@ public OSSInputStream(OSS client, OSSURI uri) { this.createStack = Thread.currentThread().getStackTrace(); this.readBytes = metrics.counter(FileIOMetricsContext.READ_BYTES, Long.class, Unit.BYTES); - this.readOperations = metrics.counter(FileIOMetricsContext.READ_OPERATIONS, Integer.class, Unit.COUNT); + this.readOperations = + metrics.counter(FileIOMetricsContext.READ_OPERATIONS, Integer.class, Unit.COUNT); } @Override diff --git a/aliyun/src/main/java/org/apache/iceberg/aliyun/oss/OSSOutputFile.java b/aliyun/src/main/java/org/apache/iceberg/aliyun/oss/OSSOutputFile.java index d8865c7f78c9..944047809022 100644 --- a/aliyun/src/main/java/org/apache/iceberg/aliyun/oss/OSSOutputFile.java +++ b/aliyun/src/main/java/org/apache/iceberg/aliyun/oss/OSSOutputFile.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.aliyun.oss; import com.aliyun.oss.OSS; @@ -33,8 +32,10 @@ class OSSOutputFile extends BaseOSSFile implements OutputFile { super(client, uri, aliyunProperties, metrics); } - static OSSOutputFile fromLocation(OSS client, String location, AliyunProperties aliyunProperties) { - return new OSSOutputFile(client, new OSSURI(location), aliyunProperties, MetricsContext.nullMetrics()); + static OSSOutputFile fromLocation( + OSS client, String location, AliyunProperties aliyunProperties) { + return new OSSOutputFile( + client, new OSSURI(location), aliyunProperties, MetricsContext.nullMetrics()); } @Override diff --git a/aliyun/src/main/java/org/apache/iceberg/aliyun/oss/OSSOutputStream.java b/aliyun/src/main/java/org/apache/iceberg/aliyun/oss/OSSOutputStream.java index dbbd513f7a23..cd761434dc4b 100644 --- a/aliyun/src/main/java/org/apache/iceberg/aliyun/oss/OSSOutputStream.java +++ b/aliyun/src/main/java/org/apache/iceberg/aliyun/oss/OSSOutputStream.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.aliyun.oss; import com.aliyun.oss.OSS; @@ -59,7 +58,8 @@ public class OSSOutputStream extends PositionOutputStream { private final Counter writeBytes; private final Counter writeOperations; - OSSOutputStream(OSS client, OSSURI uri, AliyunProperties aliyunProperties, MetricsContext metrics) { + OSSOutputStream( + OSS client, OSSURI uri, AliyunProperties aliyunProperties, MetricsContext metrics) { this.client = client; this.uri = uri; this.createStack = Thread.currentThread().getStackTrace(); @@ -67,7 +67,8 @@ public class OSSOutputStream extends PositionOutputStream { this.currentStagingFile = newStagingFile(aliyunProperties.ossStagingDirectory()); this.stream = newStream(currentStagingFile); this.writeBytes = metrics.counter(FileIOMetricsContext.WRITE_BYTES, Long.class, Unit.BYTES); - this.writeOperations = metrics.counter(FileIOMetricsContext.WRITE_OPERATIONS, Integer.class, Unit.COUNT); + this.writeOperations = + metrics.counter(FileIOMetricsContext.WRITE_OPERATIONS, Integer.class, Unit.COUNT); } private static File newStagingFile(String ossStagingDirectory) { @@ -154,7 +155,8 @@ private void completeUploads() { ObjectMetadata metadata = new ObjectMetadata(); metadata.setContentLength(contentLength); - PutObjectRequest request = new PutObjectRequest(uri.bucket(), uri.key(), contentStream, metadata); + PutObjectRequest request = + new PutObjectRequest(uri.bucket(), uri.key(), contentStream, metadata); client.putObject(request); } diff --git a/aliyun/src/main/java/org/apache/iceberg/aliyun/oss/OSSURI.java b/aliyun/src/main/java/org/apache/iceberg/aliyun/oss/OSSURI.java index c9dfd8acf3fe..74b937ac9bf5 100644 --- a/aliyun/src/main/java/org/apache/iceberg/aliyun/oss/OSSURI.java +++ b/aliyun/src/main/java/org/apache/iceberg/aliyun/oss/OSSURI.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.aliyun.oss; import com.aliyun.oss.internal.OSSUtils; @@ -26,13 +25,11 @@ import org.apache.iceberg.relocated.com.google.common.collect.ImmutableSet; /** - * This class represents a fully qualified location in OSS for input/output - * operations expressed as as URI. This implementation is provided to - * ensure compatibility with Hadoop Path implementations that may introduce - * encoding issues with native URI implementation. + * This class represents a fully qualified location in OSS for input/output operations expressed as + * as URI. This implementation is provided to ensure compatibility with Hadoop Path implementations + * that may introduce encoding issues with native URI implementation. * - * Note: Path-style access is deprecated and not supported by this - * implementation. + *

Note: Path-style access is deprecated and not supported by this implementation. */ public class OSSURI { private static final String SCHEME_DELIM = "://"; @@ -45,19 +42,16 @@ public class OSSURI { private final String key; /** - * Creates a new OSSURI based on the bucket and key parsed from the location - * The location in string form has the syntax as below, which refers to RFC2396: - * [scheme:][//bucket][object key][#fragment] - * [scheme:][//bucket][object key][?query][#fragment] + * Creates a new OSSURI based on the bucket and key parsed from the location The location in + * string form has the syntax as below, which refers to RFC2396: [scheme:][//bucket][object + * key][#fragment] [scheme:][//bucket][object key][?query][#fragment] * - * It specifies precisely which characters are permitted in the various components of a URI reference - * in Aliyun OSS documentation as below: - * Bucket: https://help.aliyun.com/document_detail/257087.html - * Object: https://help.aliyun.com/document_detail/273129.html - * Scheme: https or oss + *

It specifies precisely which characters are permitted in the various components of a URI + * reference in Aliyun OSS documentation as below: Bucket: + * https://help.aliyun.com/document_detail/257087.html Object: + * https://help.aliyun.com/document_detail/273129.html Scheme: https or oss * - *

- * Supported access styles are https and oss://... URIs. + *

Supported access styles are https and oss://... URIs. * * @param location fully qualified URI. */ @@ -69,14 +63,17 @@ public OSSURI(String location) { ValidationException.check(schemeSplit.length == 2, "Invalid OSS location: %s", location); String scheme = schemeSplit[0]; - ValidationException.check(VALID_SCHEMES.contains(scheme.toLowerCase()), - "Invalid scheme: %s in OSS location %s", scheme, location); + ValidationException.check( + VALID_SCHEMES.contains(scheme.toLowerCase()), + "Invalid scheme: %s in OSS location %s", + scheme, + location); String[] authoritySplit = schemeSplit[1].split(PATH_DELIM, 2); - ValidationException.check(authoritySplit.length == 2, - "Invalid bucket or key in OSS location: %s", location); - ValidationException.check(!authoritySplit[1].trim().isEmpty(), - "Missing key in OSS location: %s", location); + ValidationException.check( + authoritySplit.length == 2, "Invalid bucket or key in OSS location: %s", location); + ValidationException.check( + !authoritySplit[1].trim().isEmpty(), "Missing key in OSS location: %s", location); this.bucket = authoritySplit[0]; OSSUtils.ensureBucketNameValid(bucket); @@ -88,23 +85,17 @@ public OSSURI(String location) { OSSUtils.ensureObjectKeyValid(key); } - /** - * Return OSS bucket name. - */ + /** Return OSS bucket name. */ public String bucket() { return bucket; } - /** - * Return OSS object key name. - */ + /** Return OSS object key name. */ public String key() { return key; } - /** - * Return original, unmodified OSS URI location. - */ + /** Return original, unmodified OSS URI location. */ public String location() { return location; } diff --git a/aliyun/src/test/java/org/apache/iceberg/aliyun/TestAliyunClientFactories.java b/aliyun/src/test/java/org/apache/iceberg/aliyun/TestAliyunClientFactories.java index d5518a1ba870..fa071e86051f 100644 --- a/aliyun/src/test/java/org/apache/iceberg/aliyun/TestAliyunClientFactories.java +++ b/aliyun/src/test/java/org/apache/iceberg/aliyun/TestAliyunClientFactories.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.aliyun; import com.aliyun.oss.OSS; @@ -30,23 +29,27 @@ public class TestAliyunClientFactories { @Test public void testLoadDefault() { - Assert.assertEquals("Default client should be singleton", - AliyunClientFactories.defaultFactory(), AliyunClientFactories.defaultFactory()); + Assert.assertEquals( + "Default client should be singleton", + AliyunClientFactories.defaultFactory(), + AliyunClientFactories.defaultFactory()); AliyunClientFactory defaultFactory = AliyunClientFactories.from(Maps.newHashMap()); Assert.assertTrue( "Should load default when factory impl not configured", - defaultFactory instanceof AliyunClientFactories.DefaultAliyunClientFactory); - Assert.assertNull("Should have no Aliyun properties set", - defaultFactory.aliyunProperties().accessKeyId()); + defaultFactory instanceof AliyunClientFactories.DefaultAliyunClientFactory); + Assert.assertNull( + "Should have no Aliyun properties set", defaultFactory.aliyunProperties().accessKeyId()); - AliyunClientFactory defaultFactoryWithConfig = AliyunClientFactories.from( - ImmutableMap.of(AliyunProperties.CLIENT_ACCESS_KEY_ID, "key")); + AliyunClientFactory defaultFactoryWithConfig = + AliyunClientFactories.from(ImmutableMap.of(AliyunProperties.CLIENT_ACCESS_KEY_ID, "key")); Assert.assertTrue( "Should load default when factory impl not configured", defaultFactoryWithConfig instanceof AliyunClientFactories.DefaultAliyunClientFactory); - Assert.assertEquals("Should have access key set", - "key", defaultFactoryWithConfig.aliyunProperties().accessKeyId()); + Assert.assertEquals( + "Should have access key set", + "key", + defaultFactoryWithConfig.aliyunProperties().accessKeyId()); } @Test @@ -62,8 +65,7 @@ public static class CustomFactory implements AliyunClientFactory { AliyunProperties aliyunProperties; - public CustomFactory() { - } + public CustomFactory() {} @Override public OSS newOSSClient() { diff --git a/aliyun/src/test/java/org/apache/iceberg/aliyun/TestUtility.java b/aliyun/src/test/java/org/apache/iceberg/aliyun/TestUtility.java index 762074de32a1..ac87a82fd7e0 100644 --- a/aliyun/src/test/java/org/apache/iceberg/aliyun/TestUtility.java +++ b/aliyun/src/test/java/org/apache/iceberg/aliyun/TestUtility.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.aliyun; import org.apache.iceberg.aliyun.oss.AliyunOSSTestRule; @@ -40,8 +39,7 @@ public class TestUtility { private static final String ALIYUN_TEST_OSS_ENDPOINT = "ALIYUN_TEST_OSS_ENDPOINT"; private static final String ALIYUN_TEST_OSS_WAREHOUSE = "ALIYUN_TEST_OSS_WAREHOUSE"; - private TestUtility() { - } + private TestUtility() {} public static AliyunOSSTestRule initialize() { AliyunOSSTestRule testRule; @@ -54,11 +52,15 @@ public static AliyunOSSTestRule initialize() { DynConstructors.builder(AliyunOSSTestRule.class).impl(implClass).buildChecked(); testRule = ctor.newInstance(); } catch (NoSuchMethodException e) { - throw new IllegalArgumentException(String.format( - "Cannot initialize AliyunOSSTestRule, missing no-arg constructor: %s", implClass), e); + throw new IllegalArgumentException( + String.format( + "Cannot initialize AliyunOSSTestRule, missing no-arg constructor: %s", implClass), + e); } catch (ClassCastException e) { - throw new IllegalArgumentException(String.format( - "Cannot initialize AliyunOSSTestRule, %s does not implement it.", implClass), e); + throw new IllegalArgumentException( + String.format( + "Cannot initialize AliyunOSSTestRule, %s does not implement it.", implClass), + e); } } else { LOG.info("Initializing AliyunOSSTestRule implementation with default AliyunOSSMockRule"); @@ -94,8 +96,10 @@ public static String ossKey() { private static OSSURI ossWarehouseURI() { String ossWarehouse = ossWarehouse(); - Preconditions.checkNotNull(ossWarehouse, - "Please set a correct Aliyun OSS path for environment variable '%s'", ALIYUN_TEST_OSS_WAREHOUSE); + Preconditions.checkNotNull( + ossWarehouse, + "Please set a correct Aliyun OSS path for environment variable '%s'", + ALIYUN_TEST_OSS_WAREHOUSE); return new OSSURI(ossWarehouse); } diff --git a/aliyun/src/test/java/org/apache/iceberg/aliyun/oss/AliyunOSSTestBase.java b/aliyun/src/test/java/org/apache/iceberg/aliyun/oss/AliyunOSSTestBase.java index 220a867832e5..8b42cfe9bd18 100644 --- a/aliyun/src/test/java/org/apache/iceberg/aliyun/oss/AliyunOSSTestBase.java +++ b/aliyun/src/test/java/org/apache/iceberg/aliyun/oss/AliyunOSSTestBase.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.aliyun.oss; import com.aliyun.oss.OSS; @@ -27,8 +26,7 @@ import org.junit.ClassRule; public abstract class AliyunOSSTestBase { - @ClassRule - public static final AliyunOSSTestRule OSS_TEST_RULE = TestUtility.initialize(); + @ClassRule public static final AliyunOSSTestRule OSS_TEST_RULE = TestUtility.initialize(); private final SerializableSupplier ossClient = OSS_TEST_RULE::createOSSClient; private final String bucketName = OSS_TEST_RULE.testBucketName(); diff --git a/aliyun/src/test/java/org/apache/iceberg/aliyun/oss/AliyunOSSTestRule.java b/aliyun/src/test/java/org/apache/iceberg/aliyun/oss/AliyunOSSTestRule.java index 3e43e5df6c1e..b9afa952aaa3 100644 --- a/aliyun/src/test/java/org/apache/iceberg/aliyun/oss/AliyunOSSTestRule.java +++ b/aliyun/src/test/java/org/apache/iceberg/aliyun/oss/AliyunOSSTestRule.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.aliyun.oss; import com.aliyun.oss.OSS; @@ -26,16 +25,15 @@ import org.junit.runners.model.Statement; /** - * API for test Aliyun Object Storage Service (OSS) which is either local mock http server or remote aliyun oss server - *

- * This API includes start,stop OSS service, create OSS client, setup bucket and teardown bucket. + * API for test Aliyun Object Storage Service (OSS) which is either local mock http server or remote + * aliyun oss server + * + *

This API includes start,stop OSS service, create OSS client, setup bucket and teardown bucket. */ public interface AliyunOSSTestRule extends TestRule { UUID RANDOM_UUID = java.util.UUID.randomUUID(); - /** - * Returns a specific bucket name for testing purpose. - */ + /** Returns a specific bucket name for testing purpose. */ default String testBucketName() { return String.format("oss-testing-bucket-%s", RANDOM_UUID); } @@ -56,9 +54,10 @@ public void evaluate() throws Throwable { } /** - * Returns the common key prefix for those newly created objects in test cases. For example, we set the test bucket - * to be 'oss-testing-bucket' and the key prefix to be 'iceberg-objects/', then the produced objects in test cases - * will be: + * Returns the common key prefix for those newly created objects in test cases. For example, we + * set the test bucket to be 'oss-testing-bucket' and the key prefix to be 'iceberg-objects/', + * then the produced objects in test cases will be: + * *

    *   oss://oss-testing-bucket/iceberg-objects/a.dat
    *   oss://oss-testing-bucket/iceberg-objects/b.dat
    * </pre>
    */
   String keyPrefix();
 
-  /**
-   * Start the Aliyun Object storage services application that the OSS client could connect to.
-   */
+  /** Start the Aliyun Object storage services application that the OSS client could connect to. */
   void start();
 
-  /**
-   * Stop the Aliyun object storage services.
-   */
+  /** Stop the Aliyun object storage services. */
   void stop();
 
-  /**
-   * Returns an newly created {@link OSS} client.
-   */
+  /** Returns a newly created {@link OSS} client. */
   OSS createOSSClient();
 
   /**
-   * Preparation work of bucket for the test case, for example we need to check the existence of specific bucket.
+   * Preparation work of bucket for the test case, for example we need to check the existence of
+   * specific bucket.
    */
   void setUpBucket(String bucket);
 
-  /**
-   * Clean all the objects that created from this test suite in the bucket.
-   */
+  /** Clean all the objects that created from this test suite in the bucket. */
   void tearDownBucket(String bucket);
 }
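
For reference, the rule above is consumed through JUnit 4's @ClassRule hook. The sketch below is modeled on AliyunOSSTestBase from this patch; the class name and the location() helper are illustrative only, not part of the patch.

    // Sketch: wiring AliyunOSSTestRule into a test base class (names are illustrative).
    public abstract class ExampleOSSTestBase {
      @ClassRule public static final AliyunOSSTestRule OSS_TEST_RULE = TestUtility.initialize();

      // Objects created by tests live under the rule's bucket and key prefix,
      // so tearDownBucket(bucket) can clean them up afterwards.
      protected String location(String relativePath) {
        return String.format(
            "oss://%s/%s%s", OSS_TEST_RULE.testBucketName(), OSS_TEST_RULE.keyPrefix(), relativePath);
      }

      protected OSS ossClient() {
        return OSS_TEST_RULE.createOSSClient();
      }
    }
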
diff --git a/aliyun/src/test/java/org/apache/iceberg/aliyun/oss/OSSIntegrationTestRule.java b/aliyun/src/test/java/org/apache/iceberg/aliyun/oss/OSSIntegrationTestRule.java
index 691d6d02eb2a..21e427385a9d 100644
--- a/aliyun/src/test/java/org/apache/iceberg/aliyun/oss/OSSIntegrationTestRule.java
+++ b/aliyun/src/test/java/org/apache/iceberg/aliyun/oss/OSSIntegrationTestRule.java
@@ -16,7 +16,6 @@
  * specific language governing permissions and limitations
  * under the License.
  */
-
 package org.apache.iceberg.aliyun.oss;
 
 import com.aliyun.oss.OSS;
@@ -80,7 +79,8 @@ public OSS createOSSClient() {
   public void setUpBucket(String bucket) {
     Preconditions.checkArgument(
         ossClient().doesBucketExist(bucket),
-        "Bucket %s does not exist, please create it firstly.", bucket);
+        "Bucket %s does not exist, please create it firstly.",
+        bucket);
   }
 
   @Override
@@ -89,10 +89,11 @@ public void tearDownBucket(String bucket) {
     String nextContinuationToken = null;
     ListObjectsV2Result objectListingResult;
     do {
-      ListObjectsV2Request listObjectsV2Request = new ListObjectsV2Request(bucket)
-          .withMaxKeys(maxKeys)
-          .withPrefix(ossKey)
-          .withContinuationToken(nextContinuationToken);
+      ListObjectsV2Request listObjectsV2Request =
+          new ListObjectsV2Request(bucket)
+              .withMaxKeys(maxKeys)
+              .withPrefix(ossKey)
+              .withContinuationToken(nextContinuationToken);
       objectListingResult = ossClient().listObjectsV2(listObjectsV2Request);
 
       for (OSSObjectSummary s : objectListingResult.getObjectSummaries()) {
diff --git a/aliyun/src/test/java/org/apache/iceberg/aliyun/oss/TestOSSFileIO.java b/aliyun/src/test/java/org/apache/iceberg/aliyun/oss/TestOSSFileIO.java
index 9bebfae7c023..febbf3fe33b5 100644
--- a/aliyun/src/test/java/org/apache/iceberg/aliyun/oss/TestOSSFileIO.java
+++ b/aliyun/src/test/java/org/apache/iceberg/aliyun/oss/TestOSSFileIO.java
@@ -16,7 +16,6 @@
  * specific language governing permissions and limitations
  * under the License.
  */
-
 package org.apache.iceberg.aliyun.oss;
 
 import com.aliyun.oss.OSS;
@@ -74,7 +73,8 @@ public void testOutputFile() throws IOException {
     writeOSSData(out, data);
 
     OSSURI uri = new OSSURI(location);
-    Assert.assertTrue("OSS file should exist", ossClient().get().doesObjectExist(uri.bucket(), uri.key()));
+    Assert.assertTrue(
+        "OSS file should exist", ossClient().get().doesObjectExist(uri.bucket(), uri.key()));
     Assert.assertEquals("Should have expected location", location, out.location());
     Assert.assertEquals("Should have expected length", dataSize, ossDataLength(uri));
     Assert.assertArrayEquals("Should have expected content", data, ossDataContent(uri, dataSize));
@@ -118,7 +118,8 @@ public void testLoadFileIO() {
 
     byte[] data = SerializationUtil.serializeToBytes(file);
     FileIO expectedFileIO = SerializationUtil.deserializeFromBytes(data);
-    Assert.assertTrue("The deserialized FileIO should be OSSFileIO", expectedFileIO instanceof OSSFileIO);
+    Assert.assertTrue(
+        "The deserialized FileIO should be OSSFileIO", expectedFileIO instanceof OSSFileIO);
   }
 
   @Test
@@ -126,7 +127,8 @@ public void serializeClient() throws URISyntaxException {
     String endpoint = "iceberg-test-oss.aliyun.com";
     String accessKeyId = UUID.randomUUID().toString();
     String accessSecret = UUID.randomUUID().toString();
-    SerializableSupplier<OSS> pre = () -> new OSSClientBuilder().build(endpoint, accessKeyId, accessSecret);
+    SerializableSupplier<OSS> pre =
+        () -> new OSSClientBuilder().build(endpoint, accessKeyId, accessSecret);
 
     byte[] data = SerializationUtil.serializeToBytes(pre);
     SerializableSupplier<OSS> post = SerializationUtil.deserializeFromBytes(data);
@@ -135,12 +137,16 @@ public void serializeClient() throws URISyntaxException {
     Assert.assertTrue("Should be instance of oss client", client instanceof OSSClient);
 
     OSSClient oss = (OSSClient) client;
-    Assert.assertEquals("Should have expected endpoint",
-        new URI("http://" + endpoint), oss.getEndpoint());
-    Assert.assertEquals("Should have expected access key",
-        accessKeyId, oss.getCredentialsProvider().getCredentials().getAccessKeyId());
-    Assert.assertEquals("Should have expected secret key",
-        accessSecret, oss.getCredentialsProvider().getCredentials().getSecretAccessKey());
+    Assert.assertEquals(
+        "Should have expected endpoint", new URI("http://" + endpoint), oss.getEndpoint());
+    Assert.assertEquals(
+        "Should have expected access key",
+        accessKeyId,
+        oss.getCredentialsProvider().getCredentials().getAccessKeyId());
+    Assert.assertEquals(
+        "Should have expected secret key",
+        accessSecret,
+        oss.getCredentialsProvider().getCredentials().getSecretAccessKey());
   }
 
   private FileIO fileIO() {
@@ -158,7 +164,11 @@ private byte[] randomData(int size) {
   }
 
   private long ossDataLength(OSSURI uri) {
-    return ossClient().get().getObject(uri.bucket(), uri.key()).getObjectMetadata().getContentLength();
+    return ossClient()
+        .get()
+        .getObject(uri.bucket(), uri.key())
+        .getObjectMetadata()
+        .getContentLength();
   }
 
   private byte[] ossDataContent(OSSURI uri, int dataSize) throws IOException {
@@ -170,7 +180,8 @@ private byte[] ossDataContent(OSSURI uri, int dataSize) throws IOException {
   }
 
   private void writeOSSData(OutputFile out, byte[] data) throws IOException {
-    try (OutputStream os = out.create(); InputStream is = new ByteArrayInputStream(data)) {
+    try (OutputStream os = out.create();
+        InputStream is = new ByteArrayInputStream(data)) {
       ByteStreams.copy(is, os);
     }
   }
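
For reference, a minimal sketch of how the properties and location scheme touched in this patch fit together when OSSFileIO is used directly; the endpoint, keys, and paths below are placeholders, not values from the patch.

    // Sketch: configuring OSSFileIO through its initialize(Map) hook; all values are placeholders.
    import org.apache.iceberg.aliyun.oss.OSSFileIO;
    import org.apache.iceberg.io.InputFile;
    import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap;

    public class OSSFileIOUsageSketch {
      public static void main(String[] args) {
        OSSFileIO fileIO = new OSSFileIO();
        fileIO.initialize(
            ImmutableMap.of(
                "oss.endpoint", "https://oss-cn-hangzhou.aliyuncs.com", // AliyunProperties.OSS_ENDPOINT
                "client.access-key-id", "my-access-key-id", // CLIENT_ACCESS_KEY_ID
                "client.access-key-secret", "my-access-key-secret", // CLIENT_ACCESS_KEY_SECRET
                "oss.staging-dir", "/tmp/oss-staging")); // OSS_STAGING_DIRECTORY
        // Locations must follow the oss://bucket/key convention described in OSSFileIO's javadoc.
        InputFile in = fileIO.newInputFile("oss://my-bucket/warehouse/db/table/metadata.json");
        System.out.println(in.location());
      }
    }
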
diff --git a/aliyun/src/test/java/org/apache/iceberg/aliyun/oss/TestOSSInputFile.java b/aliyun/src/test/java/org/apache/iceberg/aliyun/oss/TestOSSInputFile.java
index 44d370ca9f59..2f67dea7c367 100644
--- a/aliyun/src/test/java/org/apache/iceberg/aliyun/oss/TestOSSInputFile.java
+++ b/aliyun/src/test/java/org/apache/iceberg/aliyun/oss/TestOSSInputFile.java
@@ -16,9 +16,14 @@
  * specific language governing permissions and limitations
  * under the License.
  */
-
 package org.apache.iceberg.aliyun.oss;
 
+import static org.mockito.AdditionalAnswers.delegatesTo;
+import static org.mockito.Mockito.mock;
+import static org.mockito.Mockito.reset;
+import static org.mockito.Mockito.times;
+import static org.mockito.Mockito.verify;
+
 import com.aliyun.oss.OSS;
 import java.io.ByteArrayInputStream;
 import java.io.IOException;
@@ -35,12 +40,6 @@
 import org.junit.Assert;
 import org.junit.Test;
 
-import static org.mockito.AdditionalAnswers.delegatesTo;
-import static org.mockito.Mockito.mock;
-import static org.mockito.Mockito.reset;
-import static org.mockito.Mockito.times;
-import static org.mockito.Mockito.verify;
-
 public class TestOSSInputFile extends AliyunOSSTestBase {
   private final OSS ossClient = ossClient().get();
   private final OSS ossMock = mock(OSS.class, delegatesTo(ossClient));
@@ -62,16 +61,21 @@ public void testReadFile() throws Exception {
   @Test
   public void testOSSInputFile() {
     OSSURI uri = randomURI();
-    AssertHelpers.assertThrows("File length should not be negative", ValidationException.class,
+    AssertHelpers.assertThrows(
+        "File length should not be negative",
+        ValidationException.class,
         "Invalid file length",
-        () -> new OSSInputFile(ossClient().get(), uri, aliyunProperties, -1, MetricsContext.nullMetrics()));
+        () ->
+            new OSSInputFile(
+                ossClient().get(), uri, aliyunProperties, -1, MetricsContext.nullMetrics()));
   }
 
   @Test
   public void testExists() {
     OSSURI uri = randomURI();
 
-    InputFile inputFile = new OSSInputFile(ossMock, uri, aliyunProperties, MetricsContext.nullMetrics());
+    InputFile inputFile =
+        new OSSInputFile(ossMock, uri, aliyunProperties, MetricsContext.nullMetrics());
     Assert.assertFalse("OSS file should not exist", inputFile.exists());
     verify(ossMock, times(1)).getSimplifiedObjectMeta(uri.bucket(), uri.key());
     reset(ossMock);
@@ -104,7 +108,8 @@ public void testGetLength() {
   }
 
   private void readAndVerify(OSSURI uri, byte[] data) throws IOException {
-    InputFile inputFile = new OSSInputFile(ossClient().get(), uri, aliyunProperties, MetricsContext.nullMetrics());
+    InputFile inputFile =
+        new OSSInputFile(ossClient().get(), uri, aliyunProperties, MetricsContext.nullMetrics());
     Assert.assertTrue("OSS file should exist", inputFile.exists());
     Assert.assertEquals("Should have expected file length", data.length, inputFile.getLength());
 
@@ -118,9 +123,12 @@ private void readAndVerify(OSSURI uri, byte[] data) throws IOException {
   private void verifyLength(OSS ossClientMock, OSSURI uri, byte[] data, boolean isCache) {
     InputFile inputFile;
     if (isCache) {
-      inputFile = new OSSInputFile(ossClientMock, uri, aliyunProperties, data.length, MetricsContext.nullMetrics());
+      inputFile =
+          new OSSInputFile(
+              ossClientMock, uri, aliyunProperties, data.length, MetricsContext.nullMetrics());
     } else {
-      inputFile = new OSSInputFile(ossClientMock, uri, aliyunProperties, MetricsContext.nullMetrics());
+      inputFile =
+          new OSSInputFile(ossClientMock, uri, aliyunProperties, MetricsContext.nullMetrics());
     }
     inputFile.getLength();
     Assert.assertEquals("Should have expected file length", data.length, inputFile.getLength());
diff --git a/aliyun/src/test/java/org/apache/iceberg/aliyun/oss/TestOSSInputStream.java b/aliyun/src/test/java/org/apache/iceberg/aliyun/oss/TestOSSInputStream.java
index 633efb48f173..49b9dbd3547a 100644
--- a/aliyun/src/test/java/org/apache/iceberg/aliyun/oss/TestOSSInputStream.java
+++ b/aliyun/src/test/java/org/apache/iceberg/aliyun/oss/TestOSSInputStream.java
@@ -16,9 +16,12 @@
  * specific language governing permissions and limitations
  * under the License.
  */
-
 package org.apache.iceberg.aliyun.oss;
 
+import static org.apache.iceberg.AssertHelpers.assertThrows;
+import static org.junit.Assert.assertArrayEquals;
+import static org.junit.Assert.assertEquals;
+
 import java.io.ByteArrayInputStream;
 import java.io.IOException;
 import java.util.Arrays;
@@ -28,10 +31,6 @@
 import org.apache.iceberg.relocated.com.google.common.io.ByteStreams;
 import org.junit.Test;
 
-import static org.apache.iceberg.AssertHelpers.assertThrows;
-import static org.junit.Assert.assertArrayEquals;
-import static org.junit.Assert.assertEquals;
-
 public class TestOSSInputStream extends AliyunOSSTestBase {
   private final Random random = ThreadLocalRandom.current();
 
@@ -69,7 +68,8 @@ public void testRead() throws Exception {
     }
   }
 
-  private void readAndCheck(SeekableInputStream in, long rangeStart, int size, byte[] original, boolean buffered)
+  private void readAndCheck(
+      SeekableInputStream in, long rangeStart, int size, byte[] original, boolean buffered)
       throws IOException {
     in.seek(rangeStart);
     assertEquals("Should have the correct position", rangeStart, in.getPos());
@@ -88,8 +88,10 @@ private void readAndCheck(SeekableInputStream in, long rangeStart, int size, byt
 
     assertEquals("Should have the correct position", rangeEnd, in.getPos());
 
-    assertArrayEquals("Should have expected range data",
-        Arrays.copyOfRange(original, (int) rangeStart, (int) rangeEnd), actual);
+    assertArrayEquals(
+        "Should have expected range data",
+        Arrays.copyOfRange(original, (int) rangeStart, (int) rangeEnd),
+        actual);
   }
 
   @Test
@@ -97,7 +99,9 @@ public void testClose() throws Exception {
     OSSURI uri = new OSSURI(location("closed.dat"));
     SeekableInputStream closed = new OSSInputStream(ossClient().get(), uri);
     closed.close();
-    assertThrows("Cannot seek the input stream after closed.", IllegalStateException.class,
+    assertThrows(
+        "Cannot seek the input stream after closed.",
+        IllegalStateException.class,
         "Cannot seek: already closed",
         () -> {
           closed.seek(0);
@@ -116,8 +120,10 @@ public void testSeek() throws Exception {
       in.seek(expected.length / 2);
       byte[] actual = new byte[expected.length / 2];
       ByteStreams.readFully(in, actual);
-      assertArrayEquals("Should have expected seeking stream",
-          Arrays.copyOfRange(expected, expected.length / 2, expected.length), actual);
+      assertArrayEquals(
+          "Should have expected seeking stream",
+          Arrays.copyOfRange(expected, expected.length / 2, expected.length),
+          actual);
     }
   }
 
diff --git a/aliyun/src/test/java/org/apache/iceberg/aliyun/oss/TestOSSOutputFile.java b/aliyun/src/test/java/org/apache/iceberg/aliyun/oss/TestOSSOutputFile.java
index 2ba900b577da..5a63c9f55290 100644
--- a/aliyun/src/test/java/org/apache/iceberg/aliyun/oss/TestOSSOutputFile.java
+++ b/aliyun/src/test/java/org/apache/iceberg/aliyun/oss/TestOSSOutputFile.java
@@ -16,7 +16,6 @@
  * specific language governing permissions and limitations
  * under the License.
  */
-
 package org.apache.iceberg.aliyun.oss;
 
 import com.aliyun.oss.OSS;
@@ -50,7 +49,8 @@ public void testWriteFile() throws IOException {
     byte[] data = randomData(dataSize);
 
     OutputFile out = OSSOutputFile.fromLocation(ossClient, uri.location(), aliyunProperties);
-    try (OutputStream os = out.create(); InputStream is = new ByteArrayInputStream(data)) {
+    try (OutputStream os = out.create();
+        InputStream is = new ByteArrayInputStream(data)) {
       ByteStreams.copy(is, os);
     }
 
@@ -79,8 +79,11 @@ public void testCreate() {
     writeOSSData(uri, data);
 
     OutputFile out = OSSOutputFile.fromLocation(ossClient, uri.location(), aliyunProperties);
-    AssertHelpers.assertThrows("Should complain about location already exists",
-        AlreadyExistsException.class, "Location already exists", out::create);
+    AssertHelpers.assertThrows(
+        "Should complain about location already exists",
+        AlreadyExistsException.class,
+        "Location already exists",
+        out::create);
   }
 
   @Test
@@ -95,12 +98,15 @@ public void testCreateOrOverwrite() throws IOException {
     byte[] expect = randomData(expectSize);
 
     OutputFile out = OSSOutputFile.fromLocation(ossClient, uri.location(), aliyunProperties);
-    try (OutputStream os = out.createOrOverwrite(); InputStream is = new ByteArrayInputStream(expect)) {
+    try (OutputStream os = out.createOrOverwrite();
+        InputStream is = new ByteArrayInputStream(expect)) {
       ByteStreams.copy(is, os);
     }
 
-    Assert.assertEquals(String.format("Should overwrite object length from %d to %d", dataSize, expectSize),
-        expectSize, ossDataLength(uri));
+    Assert.assertEquals(
+        String.format("Should overwrite object length from %d to %d", dataSize, expectSize),
+        expectSize,
+        ossDataLength(uri));
 
     byte[] actual = ossDataContent(uri, expectSize);
     Assert.assertArrayEquals("Should overwrite object content", expect, actual);
@@ -109,7 +115,8 @@ public void testCreateOrOverwrite() throws IOException {
   @Test
   public void testLocation() {
     OSSURI uri = randomURI();
-    OutputFile out = new OSSOutputFile(ossClient, uri, aliyunProperties, MetricsContext.nullMetrics());
+    OutputFile out =
+        new OSSOutputFile(ossClient, uri, aliyunProperties, MetricsContext.nullMetrics());
     Assert.assertEquals("Location should match", uri.location(), out.location());
   }
 
@@ -118,8 +125,10 @@ public void testToInputFile() throws IOException {
     int dataSize = 1024 * 10;
     byte[] data = randomData(dataSize);
 
-    OutputFile out = new OSSOutputFile(ossClient, randomURI(), aliyunProperties, MetricsContext.nullMetrics());
-    try (OutputStream os = out.create(); InputStream is = new ByteArrayInputStream(data)) {
+    OutputFile out =
+        new OSSOutputFile(ossClient, randomURI(), aliyunProperties, MetricsContext.nullMetrics());
+    try (OutputStream os = out.create();
+        InputStream is = new ByteArrayInputStream(data)) {
       ByteStreams.copy(is, os);
     }
 
diff --git a/aliyun/src/test/java/org/apache/iceberg/aliyun/oss/TestOSSOutputStream.java b/aliyun/src/test/java/org/apache/iceberg/aliyun/oss/TestOSSOutputStream.java
index 0567cc91b538..9fa7a648f8dc 100644
--- a/aliyun/src/test/java/org/apache/iceberg/aliyun/oss/TestOSSOutputStream.java
+++ b/aliyun/src/test/java/org/apache/iceberg/aliyun/oss/TestOSSOutputStream.java
@@ -16,9 +16,15 @@
  * specific language governing permissions and limitations
  * under the License.
  */
-
 package org.apache.iceberg.aliyun.oss;
 
+import static org.mockito.AdditionalAnswers.delegatesTo;
+import static org.mockito.ArgumentMatchers.any;
+import static org.mockito.Mockito.mock;
+import static org.mockito.Mockito.reset;
+import static org.mockito.Mockito.times;
+import static org.mockito.Mockito.verify;
+
 import com.aliyun.oss.OSS;
 import java.io.IOException;
 import java.io.InputStream;
@@ -37,13 +43,6 @@
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
-import static org.mockito.AdditionalAnswers.delegatesTo;
-import static org.mockito.ArgumentMatchers.any;
-import static org.mockito.Mockito.mock;
-import static org.mockito.Mockito.reset;
-import static org.mockito.Mockito.times;
-import static org.mockito.Mockito.verify;
-
 public class TestOSSOutputStream extends AliyunOSSTestBase {
   private static final Logger LOG = LoggerFactory.getLogger(TestOSSOutputStream.class);
 
@@ -53,12 +52,11 @@ public class TestOSSOutputStream extends AliyunOSSTestBase {
   private final Path tmpDir = Files.createTempDirectory("oss-file-io-test-");
   private static final Random random = ThreadLocalRandom.current();
 
-  private final AliyunProperties props = new AliyunProperties(ImmutableMap.of(
-      AliyunProperties.OSS_STAGING_DIRECTORY, tmpDir.toString()
-  ));
+  private final AliyunProperties props =
+      new AliyunProperties(
+          ImmutableMap.of(AliyunProperties.OSS_STAGING_DIRECTORY, tmpDir.toString()));
 
-  public TestOSSOutputStream() throws IOException {
-  }
+  public TestOSSOutputStream() throws IOException {}
 
   @Test
   public void testWrite() throws IOException {
@@ -80,10 +78,14 @@ public void testWrite() throws IOException {
 
   private void writeAndVerify(OSS mock, OSSURI uri, byte[] data, boolean arrayWrite)
       throws IOException {
-    LOG.info("Write and verify for arguments uri: {}, data length: {}, arrayWrite: {}",
-            uri, data.length, arrayWrite);
-
-    try (OSSOutputStream out = new OSSOutputStream(mock, uri, props, MetricsContext.nullMetrics())) {
+    LOG.info(
+        "Write and verify for arguments uri: {}, data length: {}, arrayWrite: {}",
+        uri,
+        data.length,
+        arrayWrite);
+
+    try (OSSOutputStream out =
+        new OSSOutputStream(mock, uri, props, MetricsContext.nullMetrics())) {
       if (arrayWrite) {
         out.write(data);
         Assert.assertEquals("OSSOutputStream position", data.length, out.getPos());
@@ -95,16 +97,21 @@ private void writeAndVerify(OSS mock, OSSURI uri, byte[] data, boolean arrayWrit
       }
     }
 
-    Assert.assertTrue("OSS object should exist", ossClient.doesObjectExist(uri.bucket(), uri.key()));
-    Assert.assertEquals("Object length",
-        ossClient.getObject(uri.bucket(), uri.key()).getObjectMetadata().getContentLength(), data.length);
+    Assert.assertTrue(
+        "OSS object should exist", ossClient.doesObjectExist(uri.bucket(), uri.key()));
+    Assert.assertEquals(
+        "Object length",
+        ossClient.getObject(uri.bucket(), uri.key()).getObjectMetadata().getContentLength(),
+        data.length);
 
     byte[] actual = ossDataContent(uri, data.length);
     Assert.assertArrayEquals("Object content", data, actual);
 
     // Verify all staging files are cleaned up.
-    Assert.assertEquals("Staging files should clean up",
-        0, Files.list(Paths.get(props.ossStagingDirectory())).count());
+    Assert.assertEquals(
+        "Staging files should clean up",
+        0,
+        Files.list(Paths.get(props.ossStagingDirectory())).count());
   }
 
   private OSSURI randomURI() {
diff --git a/aliyun/src/test/java/org/apache/iceberg/aliyun/oss/TestOSSURI.java b/aliyun/src/test/java/org/apache/iceberg/aliyun/oss/TestOSSURI.java
index f76383d0c0b2..3621151b6467 100644
--- a/aliyun/src/test/java/org/apache/iceberg/aliyun/oss/TestOSSURI.java
+++ b/aliyun/src/test/java/org/apache/iceberg/aliyun/oss/TestOSSURI.java
@@ -16,17 +16,16 @@
  * specific language governing permissions and limitations
  * under the License.
  */
-
 package org.apache.iceberg.aliyun.oss;
 
+import static com.aliyun.oss.internal.OSSUtils.OSS_RESOURCE_MANAGER;
+
 import org.apache.iceberg.AssertHelpers;
 import org.apache.iceberg.exceptions.ValidationException;
 import org.apache.iceberg.relocated.com.google.common.collect.Lists;
 import org.junit.Assert;
 import org.junit.Test;
 
-import static com.aliyun.oss.internal.OSSUtils.OSS_RESOURCE_MANAGER;
-
 public class TestOSSURI {
   @Test
   public void testUrlParsing() {
@@ -50,34 +49,47 @@ public void testEncodedString() {
 
   @Test
   public void invalidBucket() {
-    AssertHelpers.assertThrows("Invalid bucket", IllegalArgumentException.class,
+    AssertHelpers.assertThrows(
+        "Invalid bucket",
+        IllegalArgumentException.class,
         OSS_RESOURCE_MANAGER.getFormattedString("BucketNameInvalid", "test_bucket"),
         () -> new OSSURI("https://test_bucket/path/to/file"));
   }
 
   @Test
   public void missingKey() {
-    AssertHelpers.assertThrows("Missing key", ValidationException.class,
-        "Missing key in OSS location", () -> new OSSURI("https://bucket/"));
+    AssertHelpers.assertThrows(
+        "Missing key",
+        ValidationException.class,
+        "Missing key in OSS location",
+        () -> new OSSURI("https://bucket/"));
   }
 
   @Test
   public void invalidKey() {
-    AssertHelpers.assertThrows("Invalid key", IllegalArgumentException.class,
+    AssertHelpers.assertThrows(
+        "Invalid key",
+        IllegalArgumentException.class,
         OSS_RESOURCE_MANAGER.getFormattedString("ObjectKeyInvalid", "\\path/to/file"),
         () -> new OSSURI("https://bucket/\\path/to/file"));
   }
 
   @Test
   public void relativePathing() {
-    AssertHelpers.assertThrows("Cannot use relative oss location.", ValidationException.class,
-        "Invalid OSS location", () -> new OSSURI("/path/to/file"));
+    AssertHelpers.assertThrows(
+        "Cannot use relative oss location.",
+        ValidationException.class,
+        "Invalid OSS location",
+        () -> new OSSURI("/path/to/file"));
   }
 
   @Test
   public void invalidScheme() {
-    AssertHelpers.assertThrows("Only support scheme: oss/https", ValidationException.class,
-        "Invalid scheme", () -> new OSSURI("invalid://bucket/"));
+    AssertHelpers.assertThrows(
+        "Only support scheme: oss/https",
+        ValidationException.class,
+        "Invalid scheme",
+        () -> new OSSURI("invalid://bucket/"));
   }
 
   @Test
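
For reference, a short sketch of the OSSURI parsing behavior the tests above exercise; the bucket and key names are made up.

    // Sketch: OSSURI accepts oss:// and https:// locations and splits them into bucket and key.
    import org.apache.iceberg.aliyun.oss.OSSURI;

    public class OSSURIParsingSketch {
      public static void main(String[] args) {
        OSSURI uri = new OSSURI("oss://my-bucket/path/to/file.dat");
        System.out.println(uri.bucket()); // my-bucket
        System.out.println(uri.key()); // path/to/file.dat
        System.out.println(uri.location()); // the original, unmodified location string

        // Other schemes, or a location without an object key, fail with ValidationException,
        // e.g. "Invalid scheme" for invalid://bucket/ and "Missing key in OSS location" for https://bucket/.
        new OSSURI("https://my-bucket/path/to/file.dat");
      }
    }
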
diff --git a/aliyun/src/test/java/org/apache/iceberg/aliyun/oss/mock/AliyunOSSMockApp.java b/aliyun/src/test/java/org/apache/iceberg/aliyun/oss/mock/AliyunOSSMockApp.java
index 81e2e9115630..ea0ef0fe4de3 100644
--- a/aliyun/src/test/java/org/apache/iceberg/aliyun/oss/mock/AliyunOSSMockApp.java
+++ b/aliyun/src/test/java/org/apache/iceberg/aliyun/oss/mock/AliyunOSSMockApp.java
@@ -16,7 +16,6 @@
  * specific language governing permissions and limitations
  * under the License.
  */
-
 package org.apache.iceberg.aliyun.oss.mock;
 
 import java.util.List;
@@ -44,9 +43,11 @@
 
 @SuppressWarnings("checkstyle:AnnotationUseStyle")
 @Configuration
-@EnableAutoConfiguration(exclude = {SecurityAutoConfiguration.class}, excludeName = {
-    "org.springframework.boot.actuate.autoconfigure.security.servlet.ManagementWebSecurityAutoConfiguration"
-})
+@EnableAutoConfiguration(
+    exclude = {SecurityAutoConfiguration.class},
+    excludeName = {
+      "org.springframework.boot.actuate.autoconfigure.security.servlet.ManagementWebSecurityAutoConfiguration"
+    })
 @ComponentScan
 public class AliyunOSSMockApp {
 
@@ -57,8 +58,7 @@ public class AliyunOSSMockApp {
 
   static final String PROP_SILENT = "silent";
 
-  @Autowired
-  private ConfigurableApplicationContext context;
+  @Autowired private ConfigurableApplicationContext context;
 
   public static AliyunOSSMockApp start(Map<String, Object> properties, String... args) {
     Map<String, Object> defaults = Maps.newHashMap();
@@ -105,7 +105,8 @@ public MappingJackson2XmlHttpMessageConverter getMessageConverter() {
       mediaTypes.add(MediaType.APPLICATION_FORM_URLENCODED);
       mediaTypes.add(MediaType.APPLICATION_OCTET_STREAM);
 
-      final MappingJackson2XmlHttpMessageConverter xmlConverter = new MappingJackson2XmlHttpMessageConverter();
+      final MappingJackson2XmlHttpMessageConverter xmlConverter =
+          new MappingJackson2XmlHttpMessageConverter();
       xmlConverter.setSupportedMediaTypes(mediaTypes);
 
       return xmlConverter;
@@ -114,7 +115,8 @@ public MappingJackson2XmlHttpMessageConverter getMessageConverter() {
 
   private static class RangeConverter implements Converter<String, Range> {
 
-    private static final Pattern REQUESTED_RANGE_PATTERN = Pattern.compile("^bytes=((\\d*)-(\\d*))((,\\d*-\\d*)*)");
+    private static final Pattern REQUESTED_RANGE_PATTERN =
+        Pattern.compile("^bytes=((\\d*)-(\\d*))((,\\d*-\\d*)*)");
 
     @Override
     public Range convert(String rangeString) {
diff --git a/aliyun/src/test/java/org/apache/iceberg/aliyun/oss/mock/AliyunOSSMockLocalController.java b/aliyun/src/test/java/org/apache/iceberg/aliyun/oss/mock/AliyunOSSMockLocalController.java
index 1c8539b0f6ca..0cc76825c241 100644
--- a/aliyun/src/test/java/org/apache/iceberg/aliyun/oss/mock/AliyunOSSMockLocalController.java
+++ b/aliyun/src/test/java/org/apache/iceberg/aliyun/oss/mock/AliyunOSSMockLocalController.java
@@ -16,9 +16,13 @@
  * specific language governing permissions and limitations
  * under the License.
  */
-
 package org.apache.iceberg.aliyun.oss.mock;
 
+import static org.springframework.http.HttpStatus.INTERNAL_SERVER_ERROR;
+import static org.springframework.http.HttpStatus.OK;
+import static org.springframework.http.HttpStatus.PARTIAL_CONTENT;
+import static org.springframework.http.HttpStatus.REQUESTED_RANGE_NOT_SATISFIABLE;
+
 import com.aliyun.oss.OSSErrorCode;
 import com.aliyun.oss.model.Bucket;
 import com.fasterxml.jackson.annotation.JsonProperty;
@@ -47,17 +51,11 @@
 import org.springframework.web.bind.annotation.RestController;
 import org.springframework.web.servlet.mvc.method.annotation.ResponseEntityExceptionHandler;
 
-import static org.springframework.http.HttpStatus.INTERNAL_SERVER_ERROR;
-import static org.springframework.http.HttpStatus.OK;
-import static org.springframework.http.HttpStatus.PARTIAL_CONTENT;
-import static org.springframework.http.HttpStatus.REQUESTED_RANGE_NOT_SATISFIABLE;
-
 @RestController
 public class AliyunOSSMockLocalController {
   private static final Logger LOG = LoggerFactory.getLogger(AliyunOSSMockLocalController.class);
 
-  @Autowired
-  private AliyunOSSMockLocalStore localStore;
+  @Autowired private AliyunOSSMockLocalStore localStore;
 
   private static String filenameFrom(@PathVariable String bucketName, HttpServletRequest request) {
     String requestUri = request.getRequestURI();
@@ -67,13 +65,17 @@ private static String filenameFrom(@PathVariable String bucketName, HttpServletR
   @RequestMapping(value = "/{bucketName}", method = RequestMethod.PUT, produces = "application/xml")
   public void putBucket(@PathVariable String bucketName) throws IOException {
     if (localStore.getBucket(bucketName) != null) {
-      throw new OssException(409, OSSErrorCode.BUCKET_ALREADY_EXISTS, bucketName + " already exists.");
+      throw new OssException(
+          409, OSSErrorCode.BUCKET_ALREADY_EXISTS, bucketName + " already exists.");
     }
 
     localStore.createBucket(bucketName);
   }
 
-  @RequestMapping(value = "/{bucketName}", method = RequestMethod.DELETE, produces = "application/xml")
+  @RequestMapping(
+      value = "/{bucketName}",
+      method = RequestMethod.DELETE,
+      produces = "application/xml")
   public void deleteBucket(@PathVariable String bucketName) throws IOException {
     verifyBucketExistence(bucketName);
 
@@ -81,17 +83,19 @@ public void deleteBucket(@PathVariable String bucketName) throws IOException {
   }
 
   @RequestMapping(value = "/{bucketName:.+}/**", method = RequestMethod.PUT)
-  public ResponseEntity putObject(@PathVariable String bucketName, HttpServletRequest request) {
+  public ResponseEntity putObject(
+      @PathVariable String bucketName, HttpServletRequest request) {
     verifyBucketExistence(bucketName);
     String filename = filenameFrom(bucketName, request);
     try (ServletInputStream inputStream = request.getInputStream()) {
-      ObjectMetadata metadata = localStore.putObject(
-          bucketName,
-          filename,
-          inputStream,
-          request.getContentType(),
-          request.getHeader(HttpHeaders.CONTENT_ENCODING),
-          ImmutableMap.of());
+      ObjectMetadata metadata =
+          localStore.putObject(
+              bucketName,
+              filename,
+              inputStream,
+              request.getContentType(),
+              request.getHeader(HttpHeaders.CONTENT_ENCODING),
+              ImmutableMap.of());
 
       HttpHeaders responseHeaders = new HttpHeaders();
       responseHeaders.setETag("\"" + metadata.getContentMD5() + "\"");
@@ -112,7 +116,8 @@ public void deleteObject(@PathVariable String bucketName, HttpServletRequest req
   }
 
   @RequestMapping(value = "/{bucketName:.+}/**", method = RequestMethod.HEAD)
-  public ResponseEntity getObjectMeta(@PathVariable String bucketName, HttpServletRequest request) {
+  public ResponseEntity getObjectMeta(
+      @PathVariable String bucketName, HttpServletRequest request) {
     verifyBucketExistence(bucketName);
     ObjectMetadata metadata = verifyObjectExistence(bucketName, filenameFrom(bucketName, request));
 
@@ -133,7 +138,8 @@ public void getObject(
       @PathVariable String bucketName,
       @RequestHeader(value = "Range", required = false) Range range,
       HttpServletRequest request,
-      HttpServletResponse response) throws IOException {
+      HttpServletResponse response)
+      throws IOException {
     verifyBucketExistence(bucketName);
 
     String filename = filenameFrom(bucketName, request);
@@ -158,8 +164,11 @@ public void getObject(
 
       response.setStatus(PARTIAL_CONTENT.value());
       response.setHeader(HttpHeaders.ACCEPT_RANGES, "bytes");
-      response.setHeader(HttpHeaders.CONTENT_RANGE, String.format("bytes %s-%s/%s",
-          range.start(), bytesToRead + range.start() + 1, metadata.getContentLength()));
+      response.setHeader(
+          HttpHeaders.CONTENT_RANGE,
+          String.format(
+              "bytes %s-%s/%s",
+              range.start(), bytesToRead + range.start() + 1, metadata.getContentLength()));
       response.setHeader(HttpHeaders.ETAG, "\"" + metadata.getContentMD5() + "\"");
       response.setDateHeader(HttpHeaders.LAST_MODIFIED, metadata.getLastModificationDate());
       response.setContentType(metadata.getContentType());
@@ -189,7 +198,8 @@ public void getObject(
   private void verifyBucketExistence(String bucketName) {
     Bucket bucket = localStore.getBucket(bucketName);
     if (bucket == null) {
-      throw new OssException(404, OSSErrorCode.NO_SUCH_BUCKET, "The specified bucket does not exist. ");
+      throw new OssException(
+          404, OSSErrorCode.NO_SUCH_BUCKET, "The specified bucket does not exist. ");
     }
   }
 
@@ -198,7 +208,8 @@ private ObjectMetadata verifyObjectExistence(String bucketName, String filename)
     try {
       objectMetadata = localStore.getObjectMetadata(bucketName, filename);
     } catch (IOException e) {
-      LOG.error("Failed to get the object metadata, bucket: {}, object: {}.", bucketName, filename, e);
+      LOG.error(
+          "Failed to get the object metadata, bucket: {}, object: {}.", bucketName, filename, e);
     }
 
     if (objectMetadata == null) {
@@ -222,9 +233,7 @@ public ResponseEntity handleOSSException(OssException ex) {
       HttpHeaders headers = new HttpHeaders();
       headers.setContentType(MediaType.APPLICATION_XML);
 
-      return ResponseEntity.status(ex.status)
-          .headers(headers)
-          .body(errorResponse);
+      return ResponseEntity.status(ex.status).headers(headers).body(errorResponse);
     }
   }
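For context on the exception handler reformatted above: an OssException raised by the controller (for example the 404 thrown by verifyBucketExistence) is returned to the client as an XML error body, which the Aliyun SDK surfaces as an OSSException carrying the same error code. A hedged sketch of observing this through the client; the port 9393 and the bucket name are invented, and the mock app is assumed to be already listening on that endpoint:

import com.aliyun.oss.OSS;
import com.aliyun.oss.OSSClientBuilder;
import com.aliyun.oss.OSSErrorCode;
import com.aliyun.oss.OSSException;

class BucketErrorSketch {
  public static void main(String[] args) {
    // Credentials are ignored by the mock; "foo"/"bar" mirror AliyunOSSMockRule.createOSSClient().
    OSS oss = new OSSClientBuilder().build("http://localhost:9393", "foo", "bar");
    try {
      oss.deleteBucket("bucket-that-does-not-exist"); // verifyBucketExistence -> 404 OssException
    } catch (OSSException e) {
      // The error code round-trips through the XML body produced by handleOSSException.
      System.out.println(OSSErrorCode.NO_SUCH_BUCKET.equals(e.getErrorCode())); // expected: true
    }
  }
}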
 
diff --git a/aliyun/src/test/java/org/apache/iceberg/aliyun/oss/mock/AliyunOSSMockLocalStore.java b/aliyun/src/test/java/org/apache/iceberg/aliyun/oss/mock/AliyunOSSMockLocalStore.java
index 22bf1dd18ebb..75766a671490 100644
--- a/aliyun/src/test/java/org/apache/iceberg/aliyun/oss/mock/AliyunOSSMockLocalStore.java
+++ b/aliyun/src/test/java/org/apache/iceberg/aliyun/oss/mock/AliyunOSSMockLocalStore.java
@@ -16,7 +16,6 @@
  * specific language governing permissions and limitations
  * under the License.
  */
-
 package org.apache.iceberg.aliyun.oss.mock;
 
 import com.aliyun.oss.OSSErrorCode;
@@ -59,7 +58,8 @@ public class AliyunOSSMockLocalStore {
 
   private final ObjectMapper objectMapper = new ObjectMapper();
 
-  public AliyunOSSMockLocalStore(@Value("${" + AliyunOSSMockApp.PROP_ROOT_DIR + ":}") String rootDir) {
+  public AliyunOSSMockLocalStore(
+      @Value("${" + AliyunOSSMockApp.PROP_ROOT_DIR + ":}") String rootDir) {
     Preconditions.checkNotNull(rootDir, "Root directory cannot be null");
     this.root = new File(rootDir);
 
@@ -92,7 +92,8 @@ static String md5sum(InputStream is) throws IOException {
     return new String(Hex.encodeHex(md.digest())).toUpperCase(Locale.ROOT);
   }
 
-  private static void inputStreamToFile(InputStream inputStream, File targetFile) throws IOException {
+  private static void inputStreamToFile(InputStream inputStream, File targetFile)
+      throws IOException {
     try (OutputStream outputStream = new FileOutputStream(targetFile)) {
       ByteStreams.copy(inputStream, outputStream);
     }
@@ -104,8 +105,9 @@ void createBucket(String bucketName) throws IOException {
   }
 
   Bucket getBucket(String bucketName) {
-    List buckets = findBucketsByFilter(file ->
-        Files.isDirectory(file) && file.getFileName().endsWith(bucketName));
+    List buckets =
+        findBucketsByFilter(
+            file -> Files.isDirectory(file) && file.getFileName().endsWith(bucketName));
 
     return buckets.size() > 0 ? buckets.get(0) : null;
   }
@@ -116,8 +118,8 @@ void deleteBucket(String bucketName) throws IOException {
 
     File dir = new File(root, bucket.getName());
     if (Files.walk(dir.toPath()).anyMatch(p -> p.toFile().isFile())) {
-      throw new AliyunOSSMockLocalController.OssException(409, OSSErrorCode.BUCKET_NOT_EMPTY,
-          "The bucket you tried to delete is not empty. ");
+      throw new AliyunOSSMockLocalController.OssException(
+          409, OSSErrorCode.BUCKET_NOT_EMPTY, "The bucket you tried to delete is not empty. ");
     }
 
     FileUtils.deleteDirectory(dir);
@@ -129,7 +131,8 @@ ObjectMetadata putObject(
       InputStream dataStream,
       String contentType,
       String contentEncoding,
-      Map userMetaData) throws IOException {
+      Map userMetaData)
+      throws IOException {
     File bucketDir = new File(root, bucketName);
     assert bucketDir.exists() || bucketDir.mkdirs();
 
@@ -145,12 +148,14 @@ ObjectMetadata putObject(
     ObjectMetadata metadata = new ObjectMetadata();
     metadata.setContentLength(dataFile.length());
     metadata.setContentMD5(md5sum(dataFile.getAbsolutePath()));
-    metadata.setContentType(contentType != null ? contentType : MediaType.APPLICATION_OCTET_STREAM_VALUE);
+    metadata.setContentType(
+        contentType != null ? contentType : MediaType.APPLICATION_OCTET_STREAM_VALUE);
     metadata.setContentEncoding(contentEncoding);
     metadata.setDataFile(dataFile.getAbsolutePath());
     metadata.setMetaFile(metaFile.getAbsolutePath());
 
-    BasicFileAttributes attributes = Files.readAttributes(dataFile.toPath(), BasicFileAttributes.class);
+    BasicFileAttributes attributes =
+        Files.readAttributes(dataFile.toPath(), BasicFileAttributes.class);
     metadata.setLastModificationDate(attributes.lastModifiedTime().toMillis());
 
     metadata.setUserMetaData(userMetaData);
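The md5sum helpers touched above are what back the ETag values the mock returns. A minimal stand-alone sketch of the same idea, digest a stream and hex-encode the result, assuming commons-codec is on the classpath as it is for this module:

import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.nio.charset.StandardCharsets;
import java.security.MessageDigest;
import java.security.NoSuchAlgorithmException;
import java.util.Locale;
import org.apache.commons.codec.binary.Hex;

class Md5Sketch {
  // Mirrors the shape of AliyunOSSMockLocalStore.md5sum: digest, hex-encode, upper-case.
  static String md5sum(InputStream is) throws IOException, NoSuchAlgorithmException {
    MessageDigest md = MessageDigest.getInstance("MD5");
    byte[] buffer = new byte[8192];
    for (int read = is.read(buffer); read != -1; read = is.read(buffer)) {
      md.update(buffer, 0, read);
    }
    return new String(Hex.encodeHex(md.digest())).toUpperCase(Locale.ROOT);
  }

  public static void main(String[] args) throws Exception {
    InputStream in = new ByteArrayInputStream("hello".getBytes(StandardCharsets.UTF_8));
    System.out.println(md5sum(in)); // upper-case hex MD5 of "hello"
  }
}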
diff --git a/aliyun/src/test/java/org/apache/iceberg/aliyun/oss/mock/AliyunOSSMockRule.java b/aliyun/src/test/java/org/apache/iceberg/aliyun/oss/mock/AliyunOSSMockRule.java
index b0f3785692b7..12c1e0128de5 100644
--- a/aliyun/src/test/java/org/apache/iceberg/aliyun/oss/mock/AliyunOSSMockRule.java
+++ b/aliyun/src/test/java/org/apache/iceberg/aliyun/oss/mock/AliyunOSSMockRule.java
@@ -16,7 +16,6 @@
  * specific language governing permissions and limitations
  * under the License.
  */
-
 package org.apache.iceberg.aliyun.oss.mock;
 
 import com.aliyun.oss.OSS;
@@ -63,9 +62,11 @@ public void stop() {
 
   @Override
   public OSS createOSSClient() {
-    String endpoint = String.format("http://localhost:%s", properties.getOrDefault(
-        AliyunOSSMockApp.PROP_HTTP_PORT,
-        AliyunOSSMockApp.PORT_HTTP_PORT_DEFAULT));
+    String endpoint =
+        String.format(
+            "http://localhost:%s",
+            properties.getOrDefault(
+                AliyunOSSMockApp.PROP_HTTP_PORT, AliyunOSSMockApp.PORT_HTTP_PORT_DEFAULT));
     return new OSSClientBuilder().build(endpoint, "foo", "bar");
   }
 
@@ -85,13 +86,14 @@ public void tearDownBucket(String bucket) {
     try {
       Files.walk(rootDir().toPath())
           .filter(p -> p.toFile().isFile())
-          .forEach(p -> {
-            try {
-              Files.delete(p);
-            } catch (IOException e) {
-              // delete this file quietly.
-            }
-          });
+          .forEach(
+              p -> {
+                try {
+                  Files.delete(p);
+                } catch (IOException e) {
+                  // delete this file quietly.
+                }
+              });
 
       createOSSClient().deleteBucket(bucket);
     } catch (IOException e) {
@@ -110,7 +112,9 @@ public Builder silent() {
     public AliyunOSSTestRule build() {
       String rootDir = (String) props.get(AliyunOSSMockApp.PROP_ROOT_DIR);
       if (Strings.isNullOrEmpty(rootDir)) {
-        File dir = new File(FileUtils.getTempDirectory(), "oss-mock-file-store-" + System.currentTimeMillis());
+        File dir =
+            new File(
+                FileUtils.getTempDirectory(), "oss-mock-file-store-" + System.currentTimeMillis());
         rootDir = dir.getAbsolutePath();
         props.put(AliyunOSSMockApp.PROP_ROOT_DIR, rootDir);
       }
diff --git a/aliyun/src/test/java/org/apache/iceberg/aliyun/oss/mock/ObjectMetadata.java b/aliyun/src/test/java/org/apache/iceberg/aliyun/oss/mock/ObjectMetadata.java
index 95fbd0198824..5c38f61e9ddd 100644
--- a/aliyun/src/test/java/org/apache/iceberg/aliyun/oss/mock/ObjectMetadata.java
+++ b/aliyun/src/test/java/org/apache/iceberg/aliyun/oss/mock/ObjectMetadata.java
@@ -16,7 +16,6 @@
  * specific language governing permissions and limitations
  * under the License.
  */
-
 package org.apache.iceberg.aliyun.oss.mock;
 
 import java.util.Map;
@@ -40,7 +39,8 @@ public class ObjectMetadata {
 
   private String metaFile;
 
-  // The following getters and setters are required for Jackson ObjectMapper serialization and deserialization.
+  // The following getters and setters are required for Jackson ObjectMapper serialization and
+  // deserialization.
 
   public long getContentLength() {
     return contentLength;
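Since the comment above notes that these accessors exist only so Jackson can serialize and deserialize the metadata, here is a hedged sketch of that round trip; the values and the temp-file location are made up, and ObjectMetadata is assumed to be visible from the caller (it lives in the same mock package):

import com.fasterxml.jackson.databind.ObjectMapper;
import java.io.File;
import java.io.IOException;

class MetadataRoundTripSketch {
  public static void main(String[] args) throws IOException {
    ObjectMapper mapper = new ObjectMapper();

    ObjectMetadata metadata = new ObjectMetadata();
    metadata.setContentLength(42L); // setters shown in the diffs above
    metadata.setContentType("text/plain");

    File metaFile = File.createTempFile("object", ".meta"); // stand-in for the store's meta file
    mapper.writeValue(metaFile, metadata); // serialization goes through the getters
    ObjectMetadata readBack = mapper.readValue(metaFile, ObjectMetadata.class);
    System.out.println(readBack.getContentLength()); // 42
  }
}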
diff --git a/aliyun/src/test/java/org/apache/iceberg/aliyun/oss/mock/Range.java b/aliyun/src/test/java/org/apache/iceberg/aliyun/oss/mock/Range.java
index dcf1291b95f7..ff66e5c2a1cb 100644
--- a/aliyun/src/test/java/org/apache/iceberg/aliyun/oss/mock/Range.java
+++ b/aliyun/src/test/java/org/apache/iceberg/aliyun/oss/mock/Range.java
@@ -16,7 +16,6 @@
  * specific language governing permissions and limitations
  * under the License.
  */
-
 package org.apache.iceberg.aliyun.oss.mock;
 
 public class Range {
diff --git a/aliyun/src/test/java/org/apache/iceberg/aliyun/oss/mock/TestLocalAliyunOSS.java b/aliyun/src/test/java/org/apache/iceberg/aliyun/oss/mock/TestLocalAliyunOSS.java
index a2849f256d6e..b9acc226cc61 100644
--- a/aliyun/src/test/java/org/apache/iceberg/aliyun/oss/mock/TestLocalAliyunOSS.java
+++ b/aliyun/src/test/java/org/apache/iceberg/aliyun/oss/mock/TestLocalAliyunOSS.java
@@ -16,7 +16,6 @@
  * specific language governing permissions and limitations
  * under the License.
  */
-
 package org.apache.iceberg.aliyun.oss.mock;
 
 import com.aliyun.oss.OSS;
@@ -42,8 +41,7 @@
 
 public class TestLocalAliyunOSS {
 
-  @ClassRule
-  public static final AliyunOSSTestRule OSS_TEST_RULE = TestUtility.initialize();
+  @ClassRule public static final AliyunOSSTestRule OSS_TEST_RULE = TestUtility.initialize();
 
   private final OSS oss = OSS_TEST_RULE.createOSSClient();
   private final String bucketName = OSS_TEST_RULE.testBucketName();
@@ -70,7 +68,8 @@ public void after() {
 
   @Test
   public void testBuckets() {
-    Assume.assumeTrue("Aliyun integration test cannot delete existing bucket from test environment.",
+    Assume.assumeTrue(
+        "Aliyun integration test cannot delete existing bucket from test environment.",
         OSS_TEST_RULE.getClass() == AliyunOSSMockRule.class);
 
     Assert.assertTrue(doesBucketExist(bucketName));
@@ -85,7 +84,8 @@ public void testBuckets() {
 
   @Test
   public void testDeleteBucket() {
-    Assume.assumeTrue("Aliyun integration test cannot delete existing bucket from test environment.",
+    Assume.assumeTrue(
+        "Aliyun integration test cannot delete existing bucket from test environment.",
         OSS_TEST_RULE.getClass() == AliyunOSSMockRule.class);
 
     String bucketNotExist = String.format("bucket-not-existing-%s", UUID.randomUUID());
@@ -116,7 +116,8 @@ public void testPutObject() throws IOException {
     random.nextBytes(bytes);
 
     String bucketNotExist = String.format("bucket-not-existing-%s", UUID.randomUUID());
-    assertThrows(() -> oss.putObject(bucketNotExist, "object", wrap(bytes)), OSSErrorCode.NO_SUCH_BUCKET);
+    assertThrows(
+        () -> oss.putObject(bucketNotExist, "object", wrap(bytes)), OSSErrorCode.NO_SUCH_BUCKET);
 
     PutObjectResult result = oss.putObject(bucketName, "object", wrap(bytes));
     Assert.assertEquals(AliyunOSSMockLocalStore.md5sum(wrap(bytes)), result.getETag());
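To show how the pieces reformatted above fit together, a hedged sketch of a minimal JUnit 4 test against the mock in the style of TestLocalAliyunOSS; it assumes it lives in the same test package, that the rule has prepared the test bucket the way the tests above do, and the object key is invented:

import com.aliyun.oss.OSS;
import java.io.ByteArrayInputStream;
import java.nio.charset.StandardCharsets;
import org.junit.Assert;
import org.junit.ClassRule;
import org.junit.Test;

public class TestOssMockUsageSketch {
  // Same wiring as TestLocalAliyunOSS: the rule starts the local mock (or points at a real endpoint).
  @ClassRule public static final AliyunOSSTestRule OSS_TEST_RULE = TestUtility.initialize();

  private final OSS oss = OSS_TEST_RULE.createOSSClient();
  private final String bucketName = OSS_TEST_RULE.testBucketName();

  @Test
  public void writesAndReadsAnObject() {
    byte[] data = "hello".getBytes(StandardCharsets.UTF_8);
    oss.putObject(bucketName, "greeting.txt", new ByteArrayInputStream(data));

    long length = oss.getObject(bucketName, "greeting.txt").getObjectMetadata().getContentLength();
    Assert.assertEquals(data.length, length);
  }
}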
diff --git a/api/src/main/java/org/apache/iceberg/Accessor.java b/api/src/main/java/org/apache/iceberg/Accessor.java
index 37f9f14a1fc0..2a20a04df91a 100644
--- a/api/src/main/java/org/apache/iceberg/Accessor.java
+++ b/api/src/main/java/org/apache/iceberg/Accessor.java
@@ -16,7 +16,6 @@
  * specific language governing permissions and limitations
  * under the License.
  */
-
 package org.apache.iceberg;
 
 import java.io.Serializable;
diff --git a/api/src/main/java/org/apache/iceberg/Accessors.java b/api/src/main/java/org/apache/iceberg/Accessors.java
index e18cf3272515..08233624f244 100644
--- a/api/src/main/java/org/apache/iceberg/Accessors.java
+++ b/api/src/main/java/org/apache/iceberg/Accessors.java
@@ -16,7 +16,6 @@
  * specific language governing permissions and limitations
  * under the License.
  */
-
 package org.apache.iceberg;
 
 import java.util.List;
@@ -28,21 +27,22 @@
 
 /**
  * Position2Accessor and Position3Accessor here is an optimization. For a nested schema like:
+ *
  * <pre>
  * root
  *  |-- a: struct (nullable = false)
  *  |    |-- b: struct (nullable = false)
  *  |        | -- c: string (containsNull = false)
  * </pre>
- * Then we will use Position3Accessor to access nested field 'c'. It can be accessed like this:
- * {@code row.get(p0, StructLike.class).get(p1, StructLike.class).get(p2, javaClass)}.
- * Commonly, Nested fields with depth=1 or 2 or 3 are the fields that will be accessed frequently,
- * so this optimization will help to access this kind of schema. For schema whose depth is deeper than 3,
- * then we will use the {@link WrappedPositionAccessor} to access recursively.
+ *
+ * Then we will use Position3Accessor to access nested field 'c'. It can be accessed like this:
+ * {@code row.get(p0, StructLike.class).get(p1, StructLike.class).get(p2, javaClass)}. Commonly,
+ * Nested fields with depth=1 or 2 or 3 are the fields that will be accessed frequently, so this
+ * optimization will help to access this kind of schema. For schema whose depth is deeper than 3,
+ * then we will use the {@link WrappedPositionAccessor} to access recursively.
  */
 public class Accessors {
-  private Accessors() {
-  }
+  private Accessors() {}
 
   public static Integer toPosition(Accessor<StructLike> accessor) {
     if (accessor instanceof PositionAccessor) {
@@ -187,8 +187,8 @@ private static Accessor<StructLike> newAccessor(int pos, Type type) {
     return new PositionAccessor(pos, type);
   }
 
-  private static Accessor<StructLike> newAccessor(int pos, boolean isOptional,
-                                                  Accessor<StructLike> accessor) {
+  private static Accessor<StructLike> newAccessor(
+      int pos, boolean isOptional, Accessor<StructLike> accessor) {
     if (isOptional) {
       // the wrapped position handles null layers
       return new WrappedPositionAccessor(pos, accessor);
@@ -201,7 +201,8 @@ private static Accessor<StructLike> newAccessor(int pos, boolean isOptional,
     }
   }
 
-  private static class BuildPositionAccessors extends TypeUtil.SchemaVisitor<Map<Integer, Accessor<StructLike>>> {
+  private static class BuildPositionAccessors
+      extends TypeUtil.SchemaVisitor<Map<Integer, Accessor<StructLike>>> {
 
     @Override
     public Map<Integer, Accessor<StructLike>> schema(
diff --git a/api/src/main/java/org/apache/iceberg/AddedRowsScanTask.java b/api/src/main/java/org/apache/iceberg/AddedRowsScanTask.java
index cda925afa6c2..d48b268287c3 100644
--- a/api/src/main/java/org/apache/iceberg/AddedRowsScanTask.java
+++ b/api/src/main/java/org/apache/iceberg/AddedRowsScanTask.java
@@ -16,27 +16,29 @@
  * specific language governing permissions and limitations
  * under the License.
  */
-
 package org.apache.iceberg;
 
 import java.util.List;
 
 /**
  * A scan task for inserts generated by adding a data file to the table.
- * <p>
- * Note that added data files may have matching delete files. This may happen if a matching position
- * delete file is committed in the same snapshot or if changes for multiple snapshots are squashed together.
- * <p>
- * Suppose snapshot S1 adds data files F1, F2, F3 and a position delete file, D1, that marks particular
- * records in F1 as deleted. A scan for changes generated by S1 should include the following tasks:
+ *
+ * <p>Note that added data files may have matching delete files. This may happen if a matching
+ * position delete file is committed in the same snapshot or if changes for multiple snapshots are
+ * squashed together.
+ *
+ * <p>Suppose snapshot S1 adds data files F1, F2, F3 and a position delete file, D1, that marks
+ * particular records in F1 as deleted. A scan for changes generated by S1 should include the
+ * following tasks:
+ *
  * <ul>
- *   <li>AddedRowsScanTask(file=F1, deletes=[D1], snapshot=S1)</li>
- *   <li>AddedRowsScanTask(file=F2, deletes=[], snapshot=S1)</li>
- *   <li>AddedRowsScanTask(file=F3, deletes=[], snapshot=S1)</li>
+ *   <li>AddedRowsScanTask(file=F1, deletes=[D1], snapshot=S1)
+ *   <li>AddedRowsScanTask(file=F2, deletes=[], snapshot=S1)
+ *   <li>AddedRowsScanTask(file=F3, deletes=[], snapshot=S1)
  * </ul>
- * <p>
- * Readers consuming these tasks should produce added records with metadata like change ordinal and
- * commit snapshot ID.
+ *
+ * <p>Readers consuming these tasks should produce added records with metadata like change ordinal
+ * and commit snapshot ID.
  */
 public interface AddedRowsScanTask extends ChangelogScanTask, ContentScanTask<DataFile> {
   /**
diff --git a/api/src/main/java/org/apache/iceberg/AppendFiles.java b/api/src/main/java/org/apache/iceberg/AppendFiles.java
index aefe1d1fdd69..1fc249acf6ec 100644
--- a/api/src/main/java/org/apache/iceberg/AppendFiles.java
+++ b/api/src/main/java/org/apache/iceberg/AppendFiles.java
@@ -16,16 +16,15 @@
  * specific language governing permissions and limitations
  * under the License.
  */
-
 package org.apache.iceberg;
 
 /**
  * API for appending new files in a table.
- * <p>
- * This API accumulates file additions, produces a new {@link Snapshot} of the table, and commits + * + *

This API accumulates file additions, produces a new {@link Snapshot} of the table, and commits * that snapshot as the current. - *

- * When committing, these changes will be applied to the latest table snapshot. Commit conflicts + * + *

When committing, these changes will be applied to the latest table snapshot. Commit conflicts * will be resolved by applying the changes to the new latest snapshot and reattempting the commit. */ public interface AppendFiles extends SnapshotUpdate { @@ -39,20 +38,20 @@ public interface AppendFiles extends SnapshotUpdate { /** * Append a {@link ManifestFile} to the table. - *

- * The manifest must contain only appended files. All files in the manifest will be appended to + * + *

The manifest must contain only appended files. All files in the manifest will be appended to * the table in the snapshot created by this update. - *

- * By default, the manifest will be rewritten to assign all entries this update's snapshot ID. - * In that case, it is always the responsibility of the caller to manage the lifecycle of - * the original manifest. - *

- * If manifest entries are allowed to inherit the snapshot ID assigned on commit, the manifest + * + *

By default, the manifest will be rewritten to assign all entries this update's snapshot ID. + * In that case, it is always the responsibility of the caller to manage the lifecycle of the + * original manifest. + * + *

If manifest entries are allowed to inherit the snapshot ID assigned on commit, the manifest * should never be deleted manually if the commit succeeds as it will become part of the table * metadata and will be cleaned up on expiry. If the manifest gets merged with others while - * preparing a new snapshot, it will be deleted automatically if this operation is successful. - * If the commit fails, the manifest will never be deleted and it is up to the caller whether - * to delete or reuse it. + * preparing a new snapshot, it will be deleted automatically if this operation is successful. If + * the commit fails, the manifest will never be deleted and it is up to the caller whether to + * delete or reuse it. * * @param file a manifest file * @return this for method chaining diff --git a/api/src/main/java/org/apache/iceberg/BaseScanTaskGroup.java b/api/src/main/java/org/apache/iceberg/BaseScanTaskGroup.java index 1d85077bca3f..706ca344e6e3 100644 --- a/api/src/main/java/org/apache/iceberg/BaseScanTaskGroup.java +++ b/api/src/main/java/org/apache/iceberg/BaseScanTaskGroup.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg; import java.util.Collection; @@ -41,7 +40,8 @@ public Collection tasks() { if (taskList == null) { synchronized (this) { if (taskList == null) { - ImmutableList.Builder listBuilder = ImmutableList.builderWithExpectedSize(tasks.length); + ImmutableList.Builder listBuilder = + ImmutableList.builderWithExpectedSize(tasks.length); for (Object task : tasks) { listBuilder.add((T) task); } @@ -55,8 +55,6 @@ public Collection tasks() { @Override public String toString() { - return MoreObjects.toStringHelper(this) - .add("tasks", Joiner.on(", ").join(tasks)) - .toString(); + return MoreObjects.toStringHelper(this).add("tasks", Joiner.on(", ").join(tasks)).toString(); } } diff --git a/api/src/main/java/org/apache/iceberg/ChangelogOperation.java b/api/src/main/java/org/apache/iceberg/ChangelogOperation.java index 3d6ad72c90bf..5a7c86d1ebc6 100644 --- a/api/src/main/java/org/apache/iceberg/ChangelogOperation.java +++ b/api/src/main/java/org/apache/iceberg/ChangelogOperation.java @@ -16,12 +16,10 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg; -/** - * An enum representing possible operations in a changelog. - */ +/** An enum representing possible operations in a changelog. */ public enum ChangelogOperation { - INSERT, DELETE + INSERT, + DELETE } diff --git a/api/src/main/java/org/apache/iceberg/ChangelogScanTask.java b/api/src/main/java/org/apache/iceberg/ChangelogScanTask.java index f74fea7478c2..2de17fc0c763 100644 --- a/api/src/main/java/org/apache/iceberg/ChangelogScanTask.java +++ b/api/src/main/java/org/apache/iceberg/ChangelogScanTask.java @@ -16,26 +16,20 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg; -/** - * A changelog scan task. - */ +/** A changelog scan task. */ public interface ChangelogScanTask extends ScanTask { - /** - * Returns the type of changes produced by this task (i.e. insert/delete). - */ + /** Returns the type of changes produced by this task (i.e. insert/delete). */ ChangelogOperation operation(); /** * Returns the ordinal of changes produced by this task. This number indicates the order in which - * changes produced by this scan must be applied. Operations with a lower ordinal must be applied first. 
+ * changes produced by this scan must be applied. Operations with a lower ordinal must be applied + * first. */ int changeOrdinal(); - /** - * Returns the snapshot ID in which the changes were committed. - */ + /** Returns the snapshot ID in which the changes were committed. */ long commitSnapshotId(); } diff --git a/api/src/main/java/org/apache/iceberg/CombinedScanTask.java b/api/src/main/java/org/apache/iceberg/CombinedScanTask.java index 956fc333d7f3..3d0ea33a9e4b 100644 --- a/api/src/main/java/org/apache/iceberg/CombinedScanTask.java +++ b/api/src/main/java/org/apache/iceberg/CombinedScanTask.java @@ -16,17 +16,15 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg; import java.util.Collection; -/** - * A scan task made of several ranges from files. - */ +/** A scan task made of several ranges from files. */ public interface CombinedScanTask extends ScanTaskGroup { /** * Return the {@link FileScanTask tasks} in this combined task. + * * @return a Collection of FileScanTask instances. */ Collection files(); diff --git a/api/src/main/java/org/apache/iceberg/ContentFile.java b/api/src/main/java/org/apache/iceberg/ContentFile.java index 1925ec0d0df1..d214ee6eb5ba 100644 --- a/api/src/main/java/org/apache/iceberg/ContentFile.java +++ b/api/src/main/java/org/apache/iceberg/ContentFile.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg; import java.nio.ByteBuffer; @@ -30,13 +29,12 @@ */ public interface ContentFile { /** - * Returns the ordinal position of the file in a manifest, or null if it was not read from a manifest. + * Returns the ordinal position of the file in a manifest, or null if it was not read from a + * manifest. */ Long pos(); - /** - * Returns id of the partition spec used for partition metadata. - */ + /** Returns id of the partition spec used for partition metadata. */ int specId(); /** @@ -44,29 +42,19 @@ public interface ContentFile { */ FileContent content(); - /** - * Returns fully qualified path to the file, suitable for constructing a Hadoop Path. - */ + /** Returns fully qualified path to the file, suitable for constructing a Hadoop Path. */ CharSequence path(); - /** - * Returns format of the file. - */ + /** Returns format of the file. */ FileFormat format(); - /** - * Returns partition for this file as a {@link StructLike}. - */ + /** Returns partition for this file as a {@link StructLike}. */ StructLike partition(); - /** - * Returns the number of top-level records in the file. - */ + /** Returns the number of top-level records in the file. */ long recordCount(); - /** - * Returns the file size in bytes. - */ + /** Returns the file size in bytes. */ long fileSizeInBytes(); /** @@ -79,24 +67,16 @@ public interface ContentFile { */ Map valueCounts(); - /** - * Returns if collected, map from column ID to its null value count, null otherwise. - */ + /** Returns if collected, map from column ID to its null value count, null otherwise. */ Map nullValueCounts(); - /** - * Returns if collected, map from column ID to its NaN value count, null otherwise. - */ + /** Returns if collected, map from column ID to its NaN value count, null otherwise. */ Map nanValueCounts(); - /** - * Returns if collected, map from column ID to value lower bounds, null otherwise. - */ + /** Returns if collected, map from column ID to value lower bounds, null otherwise. 
*/ Map lowerBounds(); - /** - * Returns if collected, map from column ID to value upper bounds, null otherwise. - */ + /** Returns if collected, map from column ID to value upper bounds, null otherwise. */ Map upperBounds(); /** @@ -106,53 +86,53 @@ public interface ContentFile { /** * Returns list of recommended split locations, if applicable, null otherwise. - *

- * When available, this information is used for planning scan tasks whose boundaries - * are determined by these offsets. The returned list must be sorted in ascending order. + * + *

When available, this information is used for planning scan tasks whose boundaries are + * determined by these offsets. The returned list must be sorted in ascending order. */ List splitOffsets(); /** * Returns the set of field IDs used for equality comparison, in equality delete files. - *

- * An equality delete file may contain additional data fields that are not used by equality + * + *

An equality delete file may contain additional data fields that are not used by equality * comparison. The subset of columns in a delete file to be used in equality comparison are - * tracked by ID. Extra columns can be used to reconstruct changes and metrics from extra - * columns are used during job planning. + * tracked by ID. Extra columns can be used to reconstruct changes and metrics from extra columns + * are used during job planning. * * @return IDs of the fields used in equality comparison with the records in this delete file */ List equalityFieldIds(); /** - * Returns the sort order id of this file, which describes how the file is ordered. - * This information will be useful for merging data and equality delete files more efficiently - * when they share the same sort order id. + * Returns the sort order id of this file, which describes how the file is ordered. This + * information will be useful for merging data and equality delete files more efficiently when + * they share the same sort order id. */ default Integer sortOrderId() { return null; } /** - * Copies this file. Manifest readers can reuse file instances; use - * this method to copy data when collecting files from tasks. + * Copies this file. Manifest readers can reuse file instances; use this method to copy data when + * collecting files from tasks. * * @return a copy of this data file */ F copy(); /** - * Copies this file without file stats. Manifest readers can reuse file instances; use - * this method to copy data without stats when collecting files. + * Copies this file without file stats. Manifest readers can reuse file instances; use this method + * to copy data without stats when collecting files. * - * @return a copy of this data file, without lower bounds, upper bounds, value counts, - * null value counts, or nan value counts + * @return a copy of this data file, without lower bounds, upper bounds, value counts, null value + * counts, or nan value counts */ F copyWithoutStats(); /** - * Copies this file (potentially without file stats). Manifest readers can reuse file instances; use - * this method to copy data when collecting files from tasks. + * Copies this file (potentially without file stats). Manifest readers can reuse file instances; + * use this method to copy data when collecting files from tasks. * * @param withStats Will copy this file without file stats if set to false. * @return a copy of this data file. If withStats is set to false the diff --git a/api/src/main/java/org/apache/iceberg/ContentScanTask.java b/api/src/main/java/org/apache/iceberg/ContentScanTask.java index 0077c4c7813d..1afaf7f1d58a 100644 --- a/api/src/main/java/org/apache/iceberg/ContentScanTask.java +++ b/api/src/main/java/org/apache/iceberg/ContentScanTask.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg; import org.apache.iceberg.expressions.Expression; @@ -57,9 +56,9 @@ public interface ContentScanTask> extends ScanTask { /** * Returns the residual expression that should be applied to rows in this file scan. - *

- * The residual expression for a file is a filter expression created by partially evaluating the scan's filter - * using the file's partition data. + * + *

The residual expression for a file is a filter expression created by partially evaluating + * the scan's filter using the file's partition data. * * @return a residual expression to apply to rows from this scan */ diff --git a/api/src/main/java/org/apache/iceberg/DataFile.java b/api/src/main/java/org/apache/iceberg/DataFile.java index 3d75052924bd..59b329c500c7 100644 --- a/api/src/main/java/org/apache/iceberg/DataFile.java +++ b/api/src/main/java/org/apache/iceberg/DataFile.java @@ -16,9 +16,11 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg; +import static org.apache.iceberg.types.Types.NestedField.optional; +import static org.apache.iceberg.types.Types.NestedField.required; + import java.util.List; import org.apache.iceberg.types.Types; import org.apache.iceberg.types.Types.BinaryType; @@ -29,39 +31,72 @@ import org.apache.iceberg.types.Types.StringType; import org.apache.iceberg.types.Types.StructType; -import static org.apache.iceberg.types.Types.NestedField.optional; -import static org.apache.iceberg.types.Types.NestedField.required; - -/** - * Interface for data files listed in a table manifest. - */ +/** Interface for data files listed in a table manifest. */ public interface DataFile extends ContentFile { // fields for adding delete data files - Types.NestedField CONTENT = optional(134, "content", IntegerType.get(), - "Contents of the file: 0=data, 1=position deletes, 2=equality deletes"); - Types.NestedField FILE_PATH = required(100, "file_path", StringType.get(), "Location URI with FS scheme"); - Types.NestedField FILE_FORMAT = required(101, "file_format", StringType.get(), - "File format name: avro, orc, or parquet"); - Types.NestedField RECORD_COUNT = required(103, "record_count", LongType.get(), "Number of records in the file"); - Types.NestedField FILE_SIZE = required(104, "file_size_in_bytes", LongType.get(), "Total file size in bytes"); - Types.NestedField COLUMN_SIZES = optional(108, "column_sizes", MapType.ofRequired(117, 118, - IntegerType.get(), LongType.get()), "Map of column id to total size on disk"); - Types.NestedField VALUE_COUNTS = optional(109, "value_counts", MapType.ofRequired(119, 120, - IntegerType.get(), LongType.get()), "Map of column id to total count, including null and NaN"); - Types.NestedField NULL_VALUE_COUNTS = optional(110, "null_value_counts", MapType.ofRequired(121, 122, - IntegerType.get(), LongType.get()), "Map of column id to null value count"); - Types.NestedField NAN_VALUE_COUNTS = optional(137, "nan_value_counts", MapType.ofRequired(138, 139, - IntegerType.get(), LongType.get()), "Map of column id to number of NaN values in the column"); - Types.NestedField LOWER_BOUNDS = optional(125, "lower_bounds", MapType.ofRequired(126, 127, - IntegerType.get(), BinaryType.get()), "Map of column id to lower bound"); - Types.NestedField UPPER_BOUNDS = optional(128, "upper_bounds", MapType.ofRequired(129, 130, - IntegerType.get(), BinaryType.get()), "Map of column id to upper bound"); - Types.NestedField KEY_METADATA = optional(131, "key_metadata", BinaryType.get(), "Encryption key metadata blob"); - Types.NestedField SPLIT_OFFSETS = optional(132, "split_offsets", ListType.ofRequired(133, LongType.get()), - "Splittable offsets"); - Types.NestedField EQUALITY_IDS = optional(135, "equality_ids", ListType.ofRequired(136, IntegerType.get()), - "Equality comparison field IDs"); - Types.NestedField SORT_ORDER_ID = optional(140, "sort_order_id", IntegerType.get(), "Sort order ID"); + 
Types.NestedField CONTENT = + optional( + 134, + "content", + IntegerType.get(), + "Contents of the file: 0=data, 1=position deletes, 2=equality deletes"); + Types.NestedField FILE_PATH = + required(100, "file_path", StringType.get(), "Location URI with FS scheme"); + Types.NestedField FILE_FORMAT = + required(101, "file_format", StringType.get(), "File format name: avro, orc, or parquet"); + Types.NestedField RECORD_COUNT = + required(103, "record_count", LongType.get(), "Number of records in the file"); + Types.NestedField FILE_SIZE = + required(104, "file_size_in_bytes", LongType.get(), "Total file size in bytes"); + Types.NestedField COLUMN_SIZES = + optional( + 108, + "column_sizes", + MapType.ofRequired(117, 118, IntegerType.get(), LongType.get()), + "Map of column id to total size on disk"); + Types.NestedField VALUE_COUNTS = + optional( + 109, + "value_counts", + MapType.ofRequired(119, 120, IntegerType.get(), LongType.get()), + "Map of column id to total count, including null and NaN"); + Types.NestedField NULL_VALUE_COUNTS = + optional( + 110, + "null_value_counts", + MapType.ofRequired(121, 122, IntegerType.get(), LongType.get()), + "Map of column id to null value count"); + Types.NestedField NAN_VALUE_COUNTS = + optional( + 137, + "nan_value_counts", + MapType.ofRequired(138, 139, IntegerType.get(), LongType.get()), + "Map of column id to number of NaN values in the column"); + Types.NestedField LOWER_BOUNDS = + optional( + 125, + "lower_bounds", + MapType.ofRequired(126, 127, IntegerType.get(), BinaryType.get()), + "Map of column id to lower bound"); + Types.NestedField UPPER_BOUNDS = + optional( + 128, + "upper_bounds", + MapType.ofRequired(129, 130, IntegerType.get(), BinaryType.get()), + "Map of column id to upper bound"); + Types.NestedField KEY_METADATA = + optional(131, "key_metadata", BinaryType.get(), "Encryption key metadata blob"); + Types.NestedField SPLIT_OFFSETS = + optional( + 132, "split_offsets", ListType.ofRequired(133, LongType.get()), "Splittable offsets"); + Types.NestedField EQUALITY_IDS = + optional( + 135, + "equality_ids", + ListType.ofRequired(136, IntegerType.get()), + "Equality comparison field IDs"); + Types.NestedField SORT_ORDER_ID = + optional(140, "sort_order_id", IntegerType.get(), "Sort order ID"); Types.NestedField SPEC_ID = optional(141, "spec_id", IntegerType.get(), "Partition spec ID"); int PARTITION_ID = 102; @@ -88,13 +123,10 @@ static StructType getType(StructType partitionType) { KEY_METADATA, SPLIT_OFFSETS, EQUALITY_IDS, - SORT_ORDER_ID - ); + SORT_ORDER_ID); } - /** - * @return the content stored in the file; one of DATA, POSITION_DELETES, or EQUALITY_DELETES - */ + /** @return the content stored in the file; one of DATA, POSITION_DELETES, or EQUALITY_DELETES */ @Override default FileContent content() { return FileContent.DATA; diff --git a/api/src/main/java/org/apache/iceberg/DataOperations.java b/api/src/main/java/org/apache/iceberg/DataOperations.java index 143f16a7b3cb..6a80b6b712f6 100644 --- a/api/src/main/java/org/apache/iceberg/DataOperations.java +++ b/api/src/main/java/org/apache/iceberg/DataOperations.java @@ -16,45 +16,43 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg; /** * Data operations that produce snapshots. - *

- * A snapshot can return the operation that created the snapshot to help other components ignore + * + *

A snapshot can return the operation that created the snapshot to help other components ignore * snapshots that are not needed for some tasks. For example, snapshot expiration does not need to * clean up deleted files for appends, which have no deleted files. */ public class DataOperations { - private DataOperations() { - } + private DataOperations() {} /** * New data is appended to the table and no data is removed or deleted. - *

- * This operation is implemented by {@link AppendFiles}. + * + *

This operation is implemented by {@link AppendFiles}. */ public static final String APPEND = "append"; /** * Files are removed and replaced, without changing the data in the table. - *

- * This operation is implemented by {@link RewriteFiles}. + * + *

This operation is implemented by {@link RewriteFiles}. */ public static final String REPLACE = "replace"; /** * New data is added to overwrite existing data. - *

- * This operation is implemented by {@link OverwriteFiles} and {@link ReplacePartitions}. + * + *

This operation is implemented by {@link OverwriteFiles} and {@link ReplacePartitions}. */ public static final String OVERWRITE = "overwrite"; /** * Data is deleted from the table and no data is added. - *

- * This operation is implemented by {@link DeleteFiles}. + * + *

This operation is implemented by {@link DeleteFiles}. */ public static final String DELETE = "delete"; } diff --git a/api/src/main/java/org/apache/iceberg/DataTask.java b/api/src/main/java/org/apache/iceberg/DataTask.java index f2a8d2a9d873..8ffca76829ce 100644 --- a/api/src/main/java/org/apache/iceberg/DataTask.java +++ b/api/src/main/java/org/apache/iceberg/DataTask.java @@ -16,14 +16,11 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg; import org.apache.iceberg.io.CloseableIterable; -/** - * A task that returns data as {@link StructLike rows} instead of where to read data. - */ +/** A task that returns data as {@link StructLike rows} instead of where to read data. */ public interface DataTask extends FileScanTask { @Override default boolean isDataTask() { @@ -35,8 +32,6 @@ default DataTask asDataTask() { return this; } - /** - * Returns an iterable of {@link StructLike} rows. - */ + /** Returns an iterable of {@link StructLike} rows. */ CloseableIterable rows(); } diff --git a/api/src/main/java/org/apache/iceberg/DeleteFile.java b/api/src/main/java/org/apache/iceberg/DeleteFile.java index 9adc0fb547c8..0f8087e6a055 100644 --- a/api/src/main/java/org/apache/iceberg/DeleteFile.java +++ b/api/src/main/java/org/apache/iceberg/DeleteFile.java @@ -16,19 +16,16 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg; import java.util.List; -/** - * Interface for delete files listed in a table delete manifest. - */ +/** Interface for delete files listed in a table delete manifest. */ public interface DeleteFile extends ContentFile { /** - * @return List of recommended split locations, if applicable, null otherwise. - * When available, this information is used for planning scan tasks whose boundaries - * are determined by these offsets. The returned list must be sorted in ascending order. + * @return List of recommended split locations, if applicable, null otherwise. When available, + * this information is used for planning scan tasks whose boundaries are determined by these + * offsets. The returned list must be sorted in ascending order. */ @Override default List splitOffsets() { diff --git a/api/src/main/java/org/apache/iceberg/DeleteFiles.java b/api/src/main/java/org/apache/iceberg/DeleteFiles.java index 42a89528ffcb..74d31a6dad81 100644 --- a/api/src/main/java/org/apache/iceberg/DeleteFiles.java +++ b/api/src/main/java/org/apache/iceberg/DeleteFiles.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg; import org.apache.iceberg.exceptions.ValidationException; @@ -25,18 +24,18 @@ /** * API for deleting files from a table. - *

- * This API accumulates file deletions, produces a new {@link Snapshot} of the table, and commits + * + *

This API accumulates file deletions, produces a new {@link Snapshot} of the table, and commits * that snapshot as the current. - *

- * When committing, these changes will be applied to the latest table snapshot. Commit conflicts + * + *

When committing, these changes will be applied to the latest table snapshot. Commit conflicts * will be resolved by applying the changes to the new latest snapshot and reattempting the commit. */ public interface DeleteFiles extends SnapshotUpdate { /** * Delete a file path from the underlying table. - *

- * To remove a file from the table, this path must equal a path in the table's metadata. Paths + * + *

To remove a file from the table, this path must equal a path in the table's metadata. Paths * that are different but equivalent will not be removed. For example, file:/path/file.avro is * equivalent to file:///path/file.avro, but would not remove the latter path from the table. * @@ -58,15 +57,15 @@ default DeleteFiles deleteFile(DataFile file) { /** * Delete files that match an {@link Expression} on data rows from the table. - *

- * A file is selected to be deleted by the expression if it could contain any rows that match the - * expression (candidate files are selected using an - * {@link Projections#inclusive(PartitionSpec) inclusive projection}). These candidate files are - * deleted if all of the rows in the file must match the expression (the partition data matches - * the expression's {@link Projections#strict(PartitionSpec)} strict projection}). This guarantees + * + *

A file is selected to be deleted by the expression if it could contain any rows that match + * the expression (candidate files are selected using an {@link + * Projections#inclusive(PartitionSpec) inclusive projection}). These candidate files are deleted + * if all of the rows in the file must match the expression (the partition data matches the + * expression's {@link Projections#strict(PartitionSpec)} strict projection}). This guarantees * that files are deleted if and only if all rows in the file must match the expression. - *

- * Files that may contain some rows that match the expression and some rows that do not will + * + *

Files that may contain some rows that match the expression and some rows that do not will * result in a {@link ValidationException}. * * @param expr an expression on rows in the table diff --git a/api/src/main/java/org/apache/iceberg/DeletedDataFileScanTask.java b/api/src/main/java/org/apache/iceberg/DeletedDataFileScanTask.java index 8827796d3031..9edd6afd0cea 100644 --- a/api/src/main/java/org/apache/iceberg/DeletedDataFileScanTask.java +++ b/api/src/main/java/org/apache/iceberg/DeletedDataFileScanTask.java @@ -16,30 +16,32 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg; import java.util.List; /** * A scan task for deletes generated by removing a data file from the table. - *

- * Note that all historical delete files added earlier must be applied while reading the data file. - * This is required to output only those data records that were live when the data file was removed. - *

- * Suppose snapshot S1 contains data files F1, F2, F3. Then snapshot S2 adds a position delete file, D1, - * that deletes records from F2 and snapshot S3 removes F2 entirely. A scan for changes generated by S3 - * should include the following task: + * + *

Note that all historical delete files added earlier must be applied while reading the data + * file. This is required to output only those data records that were live when the data file was + * removed. + * + *

Suppose snapshot S1 contains data files F1, F2, F3. Then snapshot S2 adds a position delete + * file, D1, that deletes records from F2 and snapshot S3 removes F2 entirely. A scan for changes + * generated by S3 should include the following task: + * *

    - *
  • DeletedDataFileScanTask(file=F2, existing-deletes=[D1], snapshot=S3)
  • + *
  • DeletedDataFileScanTask(file=F2, existing-deletes=[D1], snapshot=S3) *
- *

- * Readers consuming these tasks should produce deleted records with metadata like change ordinal and - * commit snapshot ID. + * + *

Readers consuming these tasks should produce deleted records with metadata like change ordinal + * and commit snapshot ID. */ public interface DeletedDataFileScanTask extends ChangelogScanTask, ContentScanTask { /** - * A list of previously added {@link DeleteFile delete files} to apply when reading the data file in this task. + * A list of previously added {@link DeleteFile delete files} to apply when reading the data file + * in this task. * * @return a list of delete files to apply */ diff --git a/api/src/main/java/org/apache/iceberg/DeletedRowsScanTask.java b/api/src/main/java/org/apache/iceberg/DeletedRowsScanTask.java index da9f887a4163..131edfddd349 100644 --- a/api/src/main/java/org/apache/iceberg/DeletedRowsScanTask.java +++ b/api/src/main/java/org/apache/iceberg/DeletedRowsScanTask.java @@ -16,31 +16,32 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg; import java.util.List; /** * A scan task for deletes generated by adding delete files to the table. - *

- * Suppose snapshot S1 contains data files F1, F2, F3. Then snapshot S2 adds a position delete file, D1, - * that deletes records from F2 and snapshot S3 adds an equality delete file, D2, that removes records - * from F1, F2, F3. A scan for changes from S2 to S3 (inclusive) should include the following tasks: + * + *

Suppose snapshot S1 contains data files F1, F2, F3. Then snapshot S2 adds a position delete + * file, D1, that deletes records from F2 and snapshot S3 adds an equality delete file, D2, that + * removes records from F1, F2, F3. A scan for changes from S2 to S3 (inclusive) should include the + * following tasks: + * *

    - *
  • DeletedRowsScanTask(file=F2, added-deletes=[D1], existing-deletes=[], snapshot=S2)
  • - *
  • DeletedRowsScanTask(file=F1, added-deletes=[D2], existing-deletes=[], snapshot=S3)
  • - *
  • DeletedRowsScanTask(file=F2, added-deletes=[D2], existing-deletes=[D1], snapshot=S3)
  • - *
  • DeletedRowsScanTask(file=F3, added-deletes=[D2], existing-deletes=[], snapshot=S3)
  • + *
  • DeletedRowsScanTask(file=F2, added-deletes=[D1], existing-deletes=[], snapshot=S2) + *
  • DeletedRowsScanTask(file=F1, added-deletes=[D2], existing-deletes=[], snapshot=S3) + *
  • DeletedRowsScanTask(file=F2, added-deletes=[D2], existing-deletes=[D1], snapshot=S3) + *
  • DeletedRowsScanTask(file=F3, added-deletes=[D2], existing-deletes=[], snapshot=S3) *
- *

- * Readers consuming these tasks should produce deleted records with metadata like change ordinal and - * commit snapshot ID. + * + *

Readers consuming these tasks should produce deleted records with metadata like change ordinal + * and commit snapshot ID. */ public interface DeletedRowsScanTask extends ChangelogScanTask, ContentScanTask { /** - * A list of added {@link DeleteFile delete files} that apply to the task's data file. - * Records removed by these delete files should appear as deletes in the changelog. + * A list of added {@link DeleteFile delete files} that apply to the task's data file. Records + * removed by these delete files should appear as deletes in the changelog. * * @return a list of added delete files */ @@ -48,8 +49,8 @@ public interface DeletedRowsScanTask extends ChangelogScanTask, ContentScanTask< /** * A list of {@link DeleteFile delete files} that existed before and must be applied prior to - * determining which records are deleted by delete files in {@link #addedDeletes()}. - * Records removed by these delete files should not appear in the changelog. + * determining which records are deleted by delete files in {@link #addedDeletes()}. Records + * removed by these delete files should not appear in the changelog. * * @return a list of existing delete files */ @@ -62,9 +63,9 @@ default ChangelogOperation operation() { @Override default long sizeBytes() { - return length() + - addedDeletes().stream().mapToLong(ContentFile::fileSizeInBytes).sum() + - existingDeletes().stream().mapToLong(ContentFile::fileSizeInBytes).sum(); + return length() + + addedDeletes().stream().mapToLong(ContentFile::fileSizeInBytes).sum() + + existingDeletes().stream().mapToLong(ContentFile::fileSizeInBytes).sum(); } @Override diff --git a/api/src/main/java/org/apache/iceberg/DistributionMode.java b/api/src/main/java/org/apache/iceberg/DistributionMode.java index fbe6c6a558dd..b4f2649ce1cc 100644 --- a/api/src/main/java/org/apache/iceberg/DistributionMode.java +++ b/api/src/main/java/org/apache/iceberg/DistributionMode.java @@ -16,27 +16,30 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg; import java.util.Locale; import org.apache.iceberg.relocated.com.google.common.base.Preconditions; /** - * Enum of supported write distribution mode, it defines the write behavior of batch or streaming job: - *

- * 1. none: don't shuffle rows. It is suitable for scenarios where the rows are located in only few - * partitions, otherwise that may produce too many small files because each task is writing rows into different - * partitions randomly. - *

- * 2. hash: hash distribute by partition key, which is suitable for the scenarios where the rows are located - * into different partitions evenly. - *

- * 3. range: range distribute by partition key (or sort key if table has an {@link SortOrder}), which is suitable - * for the scenarios where rows are located into different partitions with skew distribution. + * Enum of supported write distribution mode, it defines the write behavior of batch or streaming + * job: + * + *

1. none: don't shuffle rows. It is suitable for scenarios where the rows are located in only + * few partitions, otherwise that may produce too many small files because each task is writing rows + * into different partitions randomly. + * + *

2. hash: hash distribute by partition key, which is suitable for the scenarios where the rows + * are located into different partitions evenly. + * + *

3. range: range distribute by partition key (or sort key if table has an {@link SortOrder}), + * which is suitable for the scenarios where rows are located into different partitions with skew + * distribution. */ public enum DistributionMode { - NONE("none"), HASH("hash"), RANGE("range"); + NONE("none"), + HASH("hash"), + RANGE("range"); private final String modeName; diff --git a/api/src/main/java/org/apache/iceberg/ExpireSnapshots.java b/api/src/main/java/org/apache/iceberg/ExpireSnapshots.java index 908b79ca830a..f6524a1d4fba 100644 --- a/api/src/main/java/org/apache/iceberg/ExpireSnapshots.java +++ b/api/src/main/java/org/apache/iceberg/ExpireSnapshots.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg; import java.util.List; @@ -25,18 +24,18 @@ /** * API for removing old {@link Snapshot snapshots} from a table. - *

- * This API accumulates snapshot deletions and commits the new list to the table. This API does not - * allow deleting the current snapshot. - *

- * When committing, these changes will be applied to the latest table metadata. Commit conflicts + * + *

This API accumulates snapshot deletions and commits the new list to the table. This API does + * not allow deleting the current snapshot. + * + *

When committing, these changes will be applied to the latest table metadata. Commit conflicts * will be resolved by applying the changes to the new latest metadata and reattempting the commit. - *

- * Manifest files that are no longer used by valid snapshots will be deleted. Data files that were - * deleted by snapshots that are expired will be deleted. {@link #deleteWith(Consumer)} can be used - * to pass an alternative deletion method. * - * {@link #apply()} returns a list of the snapshots that will be removed. + *

Manifest files that are no longer used by valid snapshots will be deleted. Data files that + * were deleted by snapshots that are expired will be deleted. {@link #deleteWith(Consumer)} can be + * used to pass an alternative deletion method. + * + *

{@link #apply()} returns a list of the snapshots that will be removed. */ public interface ExpireSnapshots extends PendingUpdate> { @@ -58,13 +57,14 @@ public interface ExpireSnapshots extends PendingUpdate> { /** * Retains the most recent ancestors of the current snapshot. - *

- * If a snapshot would be expired because it is older than the expiration timestamp, but is one of - * the {@code numSnapshots} most recent ancestors of the current state, it will be retained. This - * will not cause snapshots explicitly identified by id from expiring. - *

- * This may keep more than {@code numSnapshots} ancestors if snapshots are added concurrently. This - * may keep less than {@code numSnapshots} ancestors if the current table state does not have that many. + * + *

If a snapshot would be expired because it is older than the expiration timestamp, but is one + * of the {@code numSnapshots} most recent ancestors of the current state, it will be retained. + * This will not cause snapshots explicitly identified by id from expiring. + * + *

This may keep more than {@code numSnapshots} ancestors if snapshots are added concurrently. + * This may keep less than {@code numSnapshots} ancestors if the current table state does not have + * that many. * * @param numSnapshots the number of snapshots to retain * @return this for method chaining @@ -73,11 +73,11 @@ public interface ExpireSnapshots extends PendingUpdate> { /** * Passes an alternative delete implementation that will be used for manifests and data files. - *
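A minimal usage sketch of the expiration and retention behavior described above; the 7-day window and the 10-snapshot floor are placeholders, and TimeUnit is java.util.concurrent.TimeUnit.

long oneWeekAgoMillis = System.currentTimeMillis() - TimeUnit.DAYS.toMillis(7);
table.expireSnapshots()
    .expireOlderThan(oneWeekAgoMillis)
    .retainLast(10)
    .commit();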

- * Manifest files that are no longer used by valid snapshots will be deleted. Data files that were - * deleted by snapshots that are expired will be deleted. - *

- * If this method is not called, unnecessary manifests and data files will still be deleted. + * + *

Manifest files that are no longer used by valid snapshots will be deleted. Data files that + * were deleted by snapshots that are expired will be deleted. + * + *

If this method is not called, unnecessary manifests and data files will still be deleted. * * @param deleteFunc a function that will be called to delete manifests and data files * @return this for method chaining @@ -86,21 +86,22 @@ public interface ExpireSnapshots extends PendingUpdate> { /** * Passes an alternative executor service that will be used for manifests and data files deletion. - *

- * Manifest files that are no longer used by valid snapshots will be deleted. Data files that were - * deleted by snapshots that are expired will be deleted. - *

- * If this method is not called, unnecessary manifests and data files will still be deleted using a single threaded - * executor service. * - * @param executorService an executor service to parallelize tasks to delete manifests and data files + *

Manifest files that are no longer used by valid snapshots will be deleted. Data files that + * were deleted by snapshots that are expired will be deleted. + * + *

If this method is not called, unnecessary manifests and data files will still be deleted + * using a single threaded executor service. + * + * @param executorService an executor service to parallelize tasks to delete manifests and data + * files * @return this for method chaining */ ExpireSnapshots executeDeleteWith(ExecutorService executorService); /** - * Passes an alternative executor service that will be used for planning. - * If this method is not called, the default worker pool will be used. + * Passes an alternative executor service that will be used for planning. If this method is not + * called, the default worker pool will be used. * * @param executorService an executor service to plan * @return this for method chaining @@ -109,9 +110,9 @@ public interface ExpireSnapshots extends PendingUpdate> { /** * Allows expiration of snapshots without any cleanup of underlying manifest or data files. - *
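A sketch of the deletion hooks described above; `io` (a FileIO) and `olderThanMillis` are assumed to exist in the caller, and the pool size is a placeholder.

ExecutorService deletePool = Executors.newFixedThreadPool(8); // java.util.concurrent
try {
  table.expireSnapshots()
      .expireOlderThan(olderThanMillis)
      .deleteWith(io::deleteFile)        // route file deletes through the caller's FileIO
      .executeDeleteWith(deletePool)     // parallelize instead of the default single thread
      .commit();
} finally {
  deletePool.shutdown();
}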

- * Allows control in removing data and manifest files which may be more efficiently removed using - * a distributed framework through the actions API. + * + *

Allows control in removing data and manifest files which may be more efficiently removed + * using a distributed framework through the actions API. * * @param clean setting this to false will skip deleting expired manifests and files * @return this for method chaining diff --git a/api/src/main/java/org/apache/iceberg/FileContent.java b/api/src/main/java/org/apache/iceberg/FileContent.java index 67bfca79a5bd..2c9a2fa51bd2 100644 --- a/api/src/main/java/org/apache/iceberg/FileContent.java +++ b/api/src/main/java/org/apache/iceberg/FileContent.java @@ -16,12 +16,9 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg; -/** - * Content type stored in a file, one of DATA, POSITION_DELETES, or EQUALITY_DELETES. - */ +/** Content type stored in a file, one of DATA, POSITION_DELETES, or EQUALITY_DELETES. */ public enum FileContent { DATA(0), POSITION_DELETES(1), diff --git a/api/src/main/java/org/apache/iceberg/FileFormat.java b/api/src/main/java/org/apache/iceberg/FileFormat.java index 6bcab8e81be2..e00be8ca4e47 100644 --- a/api/src/main/java/org/apache/iceberg/FileFormat.java +++ b/api/src/main/java/org/apache/iceberg/FileFormat.java @@ -16,14 +16,11 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg; import org.apache.iceberg.types.Comparators; -/** - * Enum of supported file formats. - */ +/** Enum of supported file formats. */ public enum FileFormat { ORC("orc", true), PARQUET("parquet", true), @@ -58,7 +55,9 @@ public String addExtension(String filename) { public static FileFormat fromFileName(CharSequence filename) { for (FileFormat format : FileFormat.values()) { int extStart = filename.length() - format.ext.length(); - if (Comparators.charSequences().compare(format.ext, filename.subSequence(extStart, filename.length())) == 0) { + if (Comparators.charSequences() + .compare(format.ext, filename.subSequence(extStart, filename.length())) + == 0) { return format; } } diff --git a/api/src/main/java/org/apache/iceberg/FileScanTask.java b/api/src/main/java/org/apache/iceberg/FileScanTask.java index 5cc91747a733..d99d924370ad 100644 --- a/api/src/main/java/org/apache/iceberg/FileScanTask.java +++ b/api/src/main/java/org/apache/iceberg/FileScanTask.java @@ -16,14 +16,11 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg; import java.util.List; -/** - * A scan task over a range of bytes in a single data file. - */ +/** A scan task over a range of bytes in a single data file. */ public interface FileScanTask extends ContentScanTask, SplittableScanTask { /** * A list of {@link DeleteFile delete files} to apply when reading the task's data file. diff --git a/api/src/main/java/org/apache/iceberg/Files.java b/api/src/main/java/org/apache/iceberg/Files.java index 705a3e3c93f3..16d3b663ad35 100644 --- a/api/src/main/java/org/apache/iceberg/Files.java +++ b/api/src/main/java/org/apache/iceberg/Files.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg; import java.io.File; @@ -34,8 +33,7 @@ public class Files { - private Files() { - } + private Files() {} public static OutputFile localOutput(File file) { return new LocalOutputFile(file); @@ -60,8 +58,7 @@ public PositionOutputStream create() { if (!file.getParentFile().isDirectory() && !file.getParentFile().mkdirs()) { throw new RuntimeIOException( - "Failed to create the file's directory at %s.", - file.getParentFile().getAbsolutePath()); + "Failed to create the file's directory at %s.", file.getParentFile().getAbsolutePath()); } try { diff --git a/api/src/main/java/org/apache/iceberg/HistoryEntry.java b/api/src/main/java/org/apache/iceberg/HistoryEntry.java index 49b12b5dd794..e61f45dc1024 100644 --- a/api/src/main/java/org/apache/iceberg/HistoryEntry.java +++ b/api/src/main/java/org/apache/iceberg/HistoryEntry.java @@ -16,25 +16,20 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg; import java.io.Serializable; /** * Table history entry. - *
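A small usage sketch of the local-file helpers in the Files class above; the path is a placeholder, and the stream write can throw IOException in the calling method.

OutputFile out = Files.localOutput(new File("/tmp/iceberg-example/data.txt"));
try (PositionOutputStream stream = out.create()) { // creates the parent directory if missing
  stream.write("hello".getBytes(StandardCharsets.UTF_8));
}
InputFile in = Files.localInput(new File("/tmp/iceberg-example/data.txt"));
long length = in.getLength();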

- * An entry contains a change to the table state. At the given timestamp, the current snapshot was - * set to the given snapshot ID. + * + *

An entry contains a change to the table state. At the given timestamp, the current snapshot + * was set to the given snapshot ID. */ public interface HistoryEntry extends Serializable { - /** - * Returns the timestamp in milliseconds of the change. - */ + /** Returns the timestamp in milliseconds of the change. */ long timestampMillis(); - /** - * Returns ID of the new current snapshot. - */ + /** Returns ID of the new current snapshot. */ long snapshotId(); } diff --git a/api/src/main/java/org/apache/iceberg/IcebergBuild.java b/api/src/main/java/org/apache/iceberg/IcebergBuild.java index 183c7c115ba7..e72a3c7a823c 100644 --- a/api/src/main/java/org/apache/iceberg/IcebergBuild.java +++ b/api/src/main/java/org/apache/iceberg/IcebergBuild.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg; import java.io.IOException; @@ -29,12 +28,9 @@ import org.slf4j.Logger; import org.slf4j.LoggerFactory; -/** - * Loads iceberg-version.properties with build information. - */ +/** Loads iceberg-version.properties with build information. */ public class IcebergBuild { - private IcebergBuild() { - } + private IcebergBuild() {} private static final Logger LOG = LoggerFactory.getLogger(IcebergBuild.class); private static final String VERSION_PROPERTIES_FILE = "/iceberg-build.properties"; @@ -42,16 +38,14 @@ private IcebergBuild() { private static volatile boolean isLoaded = false; - private static String shortId; // 10 character short git hash of the build - private static String commitId; // 40 character full git hash of the build + private static String shortId; // 10 character short git hash of the build + private static String commitId; // 40 character full git hash of the build private static String branch; private static List tags; private static String version; private static String fullVersion; - /** - * Loads the version.properties file for this module. - */ + /** Loads the version.properties file for this module. */ public static void loadBuildInfo() { Properties buildProperties = new Properties(); try (InputStream is = readResource(VERSION_PROPERTIES_FILE)) { @@ -115,6 +109,7 @@ private static void ensureLoaded() { } private static InputStream readResource(String resourceName) throws IOException { - return Resources.asByteSource(Resources.getResource(IcebergBuild.class, resourceName)).openStream(); + return Resources.asByteSource(Resources.getResource(IcebergBuild.class, resourceName)) + .openStream(); } } diff --git a/api/src/main/java/org/apache/iceberg/IncrementalAppendScan.java b/api/src/main/java/org/apache/iceberg/IncrementalAppendScan.java index 24b9ed7ccbc5..20a0c940b85d 100644 --- a/api/src/main/java/org/apache/iceberg/IncrementalAppendScan.java +++ b/api/src/main/java/org/apache/iceberg/IncrementalAppendScan.java @@ -16,12 +16,8 @@ * specific language governing permissions and limitations * under the License. 
*/ - - package org.apache.iceberg; -/** - * API for configuring an incremental table scan for appends only snapshots - */ -public interface IncrementalAppendScan extends IncrementalScan { -} +/** API for configuring an incremental table scan for appends only snapshots */ +public interface IncrementalAppendScan + extends IncrementalScan {} diff --git a/api/src/main/java/org/apache/iceberg/IncrementalChangelogScan.java b/api/src/main/java/org/apache/iceberg/IncrementalChangelogScan.java index 169695058d31..de1c80ac7913 100644 --- a/api/src/main/java/org/apache/iceberg/IncrementalChangelogScan.java +++ b/api/src/main/java/org/apache/iceberg/IncrementalChangelogScan.java @@ -16,12 +16,9 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg; -/** - * API for configuring a scan for table changes. - */ +/** API for configuring a scan for table changes. */ public interface IncrementalChangelogScan - extends IncrementalScan> { -} + extends IncrementalScan< + IncrementalChangelogScan, ChangelogScanTask, ScanTaskGroup> {} diff --git a/api/src/main/java/org/apache/iceberg/IncrementalScan.java b/api/src/main/java/org/apache/iceberg/IncrementalScan.java index 501c11bc11e1..1f7a8dff6671 100644 --- a/api/src/main/java/org/apache/iceberg/IncrementalScan.java +++ b/api/src/main/java/org/apache/iceberg/IncrementalScan.java @@ -16,18 +16,16 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg; -/** - * API for configuring an incremental scan. - */ -public interface IncrementalScan> extends Scan { +/** API for configuring an incremental scan. */ +public interface IncrementalScan> + extends Scan { /** * Instructs this scan to look for changes starting from a particular snapshot (inclusive). - *

- * If the start snapshot is not configured, it is defaulted to the oldest ancestor - * of the end snapshot (inclusive). + * + *

If the start snapshot is not configured, it is defaulted to the oldest ancestor of the end + * snapshot (inclusive). * * @param fromSnapshotId the start snapshot ID (inclusive) * @return this for method chaining @@ -37,9 +35,9 @@ public interface IncrementalScan - * If the start snapshot is not configured, it is defaulted to the oldest ancestor - * of the end snapshot (inclusive). + * + *

If the start snapshot is not configured, it is defaulted to the oldest ancestor of the end + * snapshot (inclusive). * * @param fromSnapshotId the start snapshot ID (exclusive) * @return this for method chaining @@ -49,8 +47,9 @@ public interface IncrementalScan - * If the end snapshot is not configured, it is defaulted to the current table snapshot (inclusive). + * + *

If the end snapshot is not configured, it is defaulted to the current table snapshot + * (inclusive). * * @param toSnapshotId the end snapshot ID (inclusive) * @return this for method chaining diff --git a/api/src/main/java/org/apache/iceberg/LockManager.java b/api/src/main/java/org/apache/iceberg/LockManager.java index 3019687bce87..8fbc1448b427 100644 --- a/api/src/main/java/org/apache/iceberg/LockManager.java +++ b/api/src/main/java/org/apache/iceberg/LockManager.java @@ -16,18 +16,16 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg; import java.util.Map; -/** - * An interface for locking, used to ensure commit isolation. - */ +/** An interface for locking, used to ensure commit isolation. */ public interface LockManager extends AutoCloseable { /** * Try to acquire a lock + * * @param entityId ID of the entity to lock * @param ownerId ID of the owner if the lock * @return if the lock for the entity is acquired by the owner @@ -37,7 +35,7 @@ public interface LockManager extends AutoCloseable { /** * Release a lock * - * exception must not be thrown for this method. + *
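A usage sketch of the incremental scan configuration described above; it assumes the Table exposes a newIncrementalAppendScan() factory (as recent releases do) and that `lastProcessedSnapshotId` is tracked by the caller.

IncrementalAppendScan scan =
    table.newIncrementalAppendScan()
        .fromSnapshotExclusive(lastProcessedSnapshotId)
        .toSnapshot(table.currentSnapshot().snapshotId());
try (CloseableIterable<FileScanTask> tasks = scan.planFiles()) {
  for (FileScanTask task : tasks) {
    // read task.file(), applying task.deletes() when present
  }
}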

exception must not be thrown for this method. * * @param entityId ID of the entity to lock * @param ownerId ID of the owner if the lock @@ -47,6 +45,7 @@ public interface LockManager extends AutoCloseable { /** * Initialize lock manager from catalog properties. + * * @param properties catalog properties */ void initialize(Map properties); diff --git a/api/src/main/java/org/apache/iceberg/ManageSnapshots.java b/api/src/main/java/org/apache/iceberg/ManageSnapshots.java index b534711d6500..81caf3a58de3 100644 --- a/api/src/main/java/org/apache/iceberg/ManageSnapshots.java +++ b/api/src/main/java/org/apache/iceberg/ManageSnapshots.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg; import org.apache.iceberg.exceptions.CommitFailedException; @@ -24,23 +23,23 @@ import org.apache.iceberg.exceptions.ValidationException; /** - * API for managing snapshots. Allows rolling table data back to a stated at an older table {@link Snapshot snapshot}. - * Rollback: - *
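A sketch of the locking contract described above; `lockManager` stands in for a concrete implementation (this module only defines the interface) and `catalogProperties` is an assumed Map of catalog properties.

lockManager.initialize(catalogProperties);
String entityId = "db.table";
String ownerId = UUID.randomUUID().toString(); // java.util.UUID
if (lockManager.acquire(entityId, ownerId)) {
  try {
    // perform the commit that needs isolation
  } finally {
    lockManager.release(entityId, ownerId); // must not throw
  }
}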

- * This API does not allow conflicting calls to {@link #setCurrentSnapshot(long)} and - * {@link #rollbackToTime(long)}. - *

- * When committing, these changes will be applied to the current table metadata. Commit conflicts - * will not be resolved and will result in a {@link CommitFailedException}. - * Cherrypick: - *

- * In an audit workflow, new data is written to an orphan {@link Snapshot snapshot} that is not committed as - * the table's current state until it is audited. After auditing a change, it may need to be applied or cherry-picked - * on top of the latest snapshot instead of the one that was current when the audited changes were created. - * This class adds support for cherry-picking the changes from an orphan snapshot by applying them to - * the current snapshot. The output of the operation is a new snapshot with the changes from cherry-picked - * snapshot. - *

+ * API for managing snapshots. Allows rolling table data back to the state at an older table {@link + * Snapshot snapshot}. Rollback: + * + *

This API does not allow conflicting calls to {@link #setCurrentSnapshot(long)} and {@link + * #rollbackToTime(long)}. + * + *

When committing, these changes will be applied to the current table metadata. Commit conflicts + * will not be resolved and will result in a {@link CommitFailedException}. Cherrypick: + * + *

In an audit workflow, new data is written to an orphan {@link Snapshot snapshot} that is not + * committed as the table's current state until it is audited. After auditing a change, it may need + * to be applied or cherry-picked on top of the latest snapshot instead of the one that was current + * when the audited changes were created. This class adds support for cherry-picking the changes + * from an orphan snapshot by applying them to the current snapshot. The output of the operation is + * a new snapshot with the changes from cherry-picked snapshot. + * + *
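A usage sketch of the audit (WAP) flow described above; `stagedSnapshotId` and `knownGoodSnapshotId` are assumed to be tracked outside this interface (for example, via a snapshot summary entry).

// Publish an audited orphan snapshot onto the current table state.
table.manageSnapshots()
    .cherrypick(stagedSnapshotId)
    .commit();

// Rolling the table back instead uses the same API.
table.manageSnapshots()
    .setCurrentSnapshot(knownGoodSnapshotId)
    .commit();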

*/ public interface ManageSnapshots extends PendingUpdate { @@ -64,7 +63,9 @@ public interface ManageSnapshots extends PendingUpdate { /** * Rollback table's state to a specific {@link Snapshot} identified by id. - * @param snapshotId long id of snapshot id to roll back table to. Must be an ancestor of the current snapshot + * + * @param snapshotId long id of snapshot id to roll back table to. Must be an ancestor of the + * current snapshot * @throws IllegalArgumentException If the table has no snapshot with the given id * @throws ValidationException If given snapshot id is not an ancestor of the current state */ @@ -73,11 +74,12 @@ public interface ManageSnapshots extends PendingUpdate { /** * Apply supported changes in given snapshot and create a new snapshot which will be set as the * current snapshot on commit. + * * @param snapshotId a snapshotId whose changes to apply * @return this for method chaining * @throws IllegalArgumentException If the table has no snapshot with the given id - * @throws DuplicateWAPCommitException In case of a WAP workflow and if the table has a duplicate commit with same - * wapId + * @throws DuplicateWAPCommitException In case of a WAP workflow and if the table has a duplicate + * commit with same wapId */ ManageSnapshots cherrypick(long snapshotId); @@ -115,8 +117,8 @@ public interface ManageSnapshots extends PendingUpdate { * * @param name name of branch to rename * @param newName the desired new name of the branch - * @throws IllegalArgumentException if the branch to rename does not exist or if there is already a branch - * with the same name as the desired new name. + * @throws IllegalArgumentException if the branch to rename does not exist or if there is already + * a branch with the same name as the desired new name. */ ManageSnapshots renameBranch(String name, String newName); @@ -148,8 +150,8 @@ public interface ManageSnapshots extends PendingUpdate { ManageSnapshots replaceBranch(String name, long snapshotId); /** - * Replaces the branch with the given name to point to the source snapshot. - * The source branch will remain unchanged, the target branch will retain its retention properties. + * Replaces the branch with the given name to point to the source snapshot. The source branch will + * remain unchanged, the target branch will retain its retention properties. * * @param name Branch to replace * @param source Source reference for the target to be replaced with @@ -158,8 +160,9 @@ public interface ManageSnapshots extends PendingUpdate { ManageSnapshots replaceBranch(String name, String source); /** - * Performs a fast-forward of the given target branch up to the source snapshot if target is an ancestor of source. - * The source branch will remain unchanged, the target branch will retain its retention properties. + * Performs a fast-forward of the given target branch up to the source snapshot if target is an + * ancestor of source. The source branch will remain unchanged, the target branch will retain its + * retention properties. * * @param name Branch to fast-forward * @param source Source reference for the target to be fast forwarded to diff --git a/api/src/main/java/org/apache/iceberg/ManifestContent.java b/api/src/main/java/org/apache/iceberg/ManifestContent.java index 1c32b9915682..264fc8256559 100644 --- a/api/src/main/java/org/apache/iceberg/ManifestContent.java +++ b/api/src/main/java/org/apache/iceberg/ManifestContent.java @@ -16,12 +16,9 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg; -/** - * Content type stored in a manifest file, either DATA or DELETES. - */ +/** Content type stored in a manifest file, either DATA or DELETES. */ public enum ManifestContent { DATA(0), DELETES(1); @@ -38,8 +35,10 @@ public int id() { public static ManifestContent fromId(int id) { switch (id) { - case 0: return DATA; - case 1: return DELETES; + case 0: + return DATA; + case 1: + return DELETES; } throw new IllegalArgumentException("Unknown manifest content: " + id); } diff --git a/api/src/main/java/org/apache/iceberg/ManifestFile.java b/api/src/main/java/org/apache/iceberg/ManifestFile.java index 7eb89f49b5ed..e5cbfa170089 100644 --- a/api/src/main/java/org/apache/iceberg/ManifestFile.java +++ b/api/src/main/java/org/apache/iceberg/ManifestFile.java @@ -16,100 +16,120 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg; +import static org.apache.iceberg.types.Types.NestedField.optional; +import static org.apache.iceberg.types.Types.NestedField.required; + import java.nio.ByteBuffer; import java.util.List; import org.apache.iceberg.types.Types; -import static org.apache.iceberg.types.Types.NestedField.optional; -import static org.apache.iceberg.types.Types.NestedField.required; - -/** - * Represents a manifest file that can be scanned to find data files in a table. - */ +/** Represents a manifest file that can be scanned to find data files in a table. */ public interface ManifestFile { - Types.NestedField PATH = required(500, "manifest_path", Types.StringType.get(), "Location URI with FS scheme"); - Types.NestedField LENGTH = required(501, "manifest_length", Types.LongType.get(), "Total file size in bytes"); - Types.NestedField SPEC_ID = required(502, "partition_spec_id", Types.IntegerType.get(), "Spec ID used to write"); - Types.NestedField MANIFEST_CONTENT = optional(517, "content", Types.IntegerType.get(), - "Contents of the manifest: 0=data, 1=deletes"); - Types.NestedField SEQUENCE_NUMBER = optional(515, "sequence_number", Types.LongType.get(), - "Sequence number when the manifest was added"); - Types.NestedField MIN_SEQUENCE_NUMBER = optional(516, "min_sequence_number", Types.LongType.get(), - "Lowest sequence number in the manifest"); - Types.NestedField SNAPSHOT_ID = optional(503, "added_snapshot_id", Types.LongType.get(), - "Snapshot ID that added the manifest"); - Types.NestedField ADDED_FILES_COUNT = optional(504, "added_data_files_count", Types.IntegerType.get(), - "Added entry count"); - Types.NestedField EXISTING_FILES_COUNT = optional(505, "existing_data_files_count", Types.IntegerType.get(), - "Existing entry count"); - Types.NestedField DELETED_FILES_COUNT = optional(506, "deleted_data_files_count", Types.IntegerType.get(), - "Deleted entry count"); - Types.NestedField ADDED_ROWS_COUNT = optional(512, "added_rows_count", Types.LongType.get(), - "Added rows count"); - Types.NestedField EXISTING_ROWS_COUNT = optional(513, "existing_rows_count", Types.LongType.get(), - "Existing rows count"); - Types.NestedField DELETED_ROWS_COUNT = optional(514, "deleted_rows_count", Types.LongType.get(), - "Deleted rows count"); - Types.StructType PARTITION_SUMMARY_TYPE = Types.StructType.of( - required(509, "contains_null", Types.BooleanType.get(), "True if any file has a null partition value"), - optional(518, "contains_nan", Types.BooleanType.get(), "True if any file has a nan partition value"), - optional(510, "lower_bound", Types.BinaryType.get(), "Partition lower bound for all files"), - 
optional(511, "upper_bound", Types.BinaryType.get(), "Partition upper bound for all files") - ); - Types.NestedField PARTITION_SUMMARIES = optional(507, "partitions", - Types.ListType.ofRequired(508, PARTITION_SUMMARY_TYPE), - "Summary for each partition"); - Types.NestedField KEY_METADATA = optional(519, "key_metadata", Types.BinaryType.get(), - "Encryption key metadata blob"); + Types.NestedField PATH = + required(500, "manifest_path", Types.StringType.get(), "Location URI with FS scheme"); + Types.NestedField LENGTH = + required(501, "manifest_length", Types.LongType.get(), "Total file size in bytes"); + Types.NestedField SPEC_ID = + required(502, "partition_spec_id", Types.IntegerType.get(), "Spec ID used to write"); + Types.NestedField MANIFEST_CONTENT = + optional( + 517, "content", Types.IntegerType.get(), "Contents of the manifest: 0=data, 1=deletes"); + Types.NestedField SEQUENCE_NUMBER = + optional( + 515, + "sequence_number", + Types.LongType.get(), + "Sequence number when the manifest was added"); + Types.NestedField MIN_SEQUENCE_NUMBER = + optional( + 516, + "min_sequence_number", + Types.LongType.get(), + "Lowest sequence number in the manifest"); + Types.NestedField SNAPSHOT_ID = + optional( + 503, "added_snapshot_id", Types.LongType.get(), "Snapshot ID that added the manifest"); + Types.NestedField ADDED_FILES_COUNT = + optional(504, "added_data_files_count", Types.IntegerType.get(), "Added entry count"); + Types.NestedField EXISTING_FILES_COUNT = + optional(505, "existing_data_files_count", Types.IntegerType.get(), "Existing entry count"); + Types.NestedField DELETED_FILES_COUNT = + optional(506, "deleted_data_files_count", Types.IntegerType.get(), "Deleted entry count"); + Types.NestedField ADDED_ROWS_COUNT = + optional(512, "added_rows_count", Types.LongType.get(), "Added rows count"); + Types.NestedField EXISTING_ROWS_COUNT = + optional(513, "existing_rows_count", Types.LongType.get(), "Existing rows count"); + Types.NestedField DELETED_ROWS_COUNT = + optional(514, "deleted_rows_count", Types.LongType.get(), "Deleted rows count"); + Types.StructType PARTITION_SUMMARY_TYPE = + Types.StructType.of( + required( + 509, + "contains_null", + Types.BooleanType.get(), + "True if any file has a null partition value"), + optional( + 518, + "contains_nan", + Types.BooleanType.get(), + "True if any file has a nan partition value"), + optional( + 510, "lower_bound", Types.BinaryType.get(), "Partition lower bound for all files"), + optional( + 511, "upper_bound", Types.BinaryType.get(), "Partition upper bound for all files")); + Types.NestedField PARTITION_SUMMARIES = + optional( + 507, + "partitions", + Types.ListType.ofRequired(508, PARTITION_SUMMARY_TYPE), + "Summary for each partition"); + Types.NestedField KEY_METADATA = + optional(519, "key_metadata", Types.BinaryType.get(), "Encryption key metadata blob"); // next ID to assign: 520 - Schema SCHEMA = new Schema( - PATH, LENGTH, SPEC_ID, MANIFEST_CONTENT, - SEQUENCE_NUMBER, MIN_SEQUENCE_NUMBER, SNAPSHOT_ID, - ADDED_FILES_COUNT, EXISTING_FILES_COUNT, DELETED_FILES_COUNT, - ADDED_ROWS_COUNT, EXISTING_ROWS_COUNT, DELETED_ROWS_COUNT, - PARTITION_SUMMARIES, KEY_METADATA); + Schema SCHEMA = + new Schema( + PATH, + LENGTH, + SPEC_ID, + MANIFEST_CONTENT, + SEQUENCE_NUMBER, + MIN_SEQUENCE_NUMBER, + SNAPSHOT_ID, + ADDED_FILES_COUNT, + EXISTING_FILES_COUNT, + DELETED_FILES_COUNT, + ADDED_ROWS_COUNT, + EXISTING_ROWS_COUNT, + DELETED_ROWS_COUNT, + PARTITION_SUMMARIES, + KEY_METADATA); static Schema schema() { return SCHEMA; } - /** - * 
Returns fully qualified path to the file, suitable for constructing a Hadoop Path. - */ + /** Returns fully qualified path to the file, suitable for constructing a Hadoop Path. */ String path(); - /** - * Returns length of the manifest file. - */ + /** Returns length of the manifest file. */ long length(); - /** - * Returns iD of the {@link PartitionSpec} used to write the manifest file. - */ + /** Returns iD of the {@link PartitionSpec} used to write the manifest file. */ int partitionSpecId(); - /** - * Returns the content stored in the manifest; either DATA or DELETES. - */ + /** Returns the content stored in the manifest; either DATA or DELETES. */ ManifestContent content(); - /** - * Returns the sequence number of the commit that added the manifest file. - */ + /** Returns the sequence number of the commit that added the manifest file. */ long sequenceNumber(); - /** - * Returns the lowest sequence number of any data file in the manifest. - */ + /** Returns the lowest sequence number of any data file in the manifest. */ long minSequenceNumber(); - /** - * Returns iD of the snapshot that added the manifest file to table metadata. - */ + /** Returns iD of the snapshot that added the manifest file to table metadata. */ Long snapshotId(); /** @@ -121,14 +141,10 @@ default boolean hasAddedFiles() { return addedFilesCount() == null || addedFilesCount() > 0; } - /** - * Returns the number of data files with status ADDED in the manifest file. - */ + /** Returns the number of data files with status ADDED in the manifest file. */ Integer addedFilesCount(); - /** - * Returns the total number of rows in all data files with status ADDED in the manifest file. - */ + /** Returns the total number of rows in all data files with status ADDED in the manifest file. */ Long addedRowsCount(); /** @@ -140,9 +156,7 @@ default boolean hasExistingFiles() { return existingFilesCount() == null || existingFilesCount() > 0; } - /** - * Returns the number of data files with status EXISTING in the manifest file. - */ + /** Returns the number of data files with status EXISTING in the manifest file. */ Integer existingFilesCount(); /** @@ -159,9 +173,7 @@ default boolean hasDeletedFiles() { return deletedFilesCount() == null || deletedFilesCount() > 0; } - /** - * Returns the number of data files with status DELETED in the manifest file. - */ + /** Returns the number of data files with status DELETED in the manifest file. */ Integer deletedFilesCount(); /** @@ -171,18 +183,18 @@ default boolean hasDeletedFiles() { /** * Returns a list of {@link PartitionFieldSummary partition field summaries}. - *

- * Each summary corresponds to a field in the manifest file's partition spec, by ordinal. For - * example, the partition spec [ ts_day=date(ts), type=identity(type) ] will have 2 summaries. - * The first summary is for the ts_day partition field and the second is for the type partition - * field. + * + *

Each summary corresponds to a field in the manifest file's partition spec, by ordinal. For + * example, the partition spec [ ts_day=date(ts), type=identity(type) ] will have 2 summaries. The + * first summary is for the ts_day partition field and the second is for the type partition field. * * @return a list of partition field summaries, one for each field in the manifest's spec */ List partitions(); /** - * Returns metadata about how this manifest file is encrypted, or null if the file is stored in plain text. + * Returns metadata about how this manifest file is encrypted, or null if the file is stored in + * plain text. */ default ByteBuffer keyMetadata() { return null; @@ -196,32 +208,26 @@ default ByteBuffer keyMetadata() { */ ManifestFile copy(); - /** - * Summarizes the values of one partition field stored in a manifest file. - */ + /** Summarizes the values of one partition field stored in a manifest file. */ interface PartitionFieldSummary { static Types.StructType getType() { return PARTITION_SUMMARY_TYPE; } - /** - * Returns true if at least one data file in the manifest has a null value for the field. - */ + /** Returns true if at least one data file in the manifest has a null value for the field. */ boolean containsNull(); /** - * Returns true if at least one data file in the manifest has a NaN value for the field. - * Null if this information doesn't exist. - *
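A sketch of reading the summaries described above for manifest-level pruning; `manifest` is an assumed ManifestFile obtained by listing a snapshot, and index 0 is the first field of its partition spec.

List<ManifestFile.PartitionFieldSummary> summaries = manifest.partitions();
if (summaries != null && !summaries.isEmpty()) {
  ManifestFile.PartitionFieldSummary first = summaries.get(0);
  boolean hasNulls = first.containsNull();
  ByteBuffer lower = first.lowerBound(); // decode with Conversions.fromByteBuffer and the field type
  ByteBuffer upper = first.upperBound();
}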

- * Default to return null to ensure backward compatibility. + * Returns true if at least one data file in the manifest has a NaN value for the field. Null if + * this information doesn't exist. + * + *

Default to return null to ensure backward compatibility. */ default Boolean containsNaN() { return null; } - /** - * Returns a ByteBuffer that contains a serialized bound lower than all values of the field. - */ + /** Returns a ByteBuffer that contains a serialized bound lower than all values of the field. */ ByteBuffer lowerBound(); /** diff --git a/api/src/main/java/org/apache/iceberg/MergeableScanTask.java b/api/src/main/java/org/apache/iceberg/MergeableScanTask.java index e1f30a630f95..17aeb775dc1d 100644 --- a/api/src/main/java/org/apache/iceberg/MergeableScanTask.java +++ b/api/src/main/java/org/apache/iceberg/MergeableScanTask.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg; /** @@ -35,8 +34,8 @@ public interface MergeableScanTask extends ScanTask { /** * Merges this task with a given task. - *

- * Note this method will be called only if {@link #canMerge(ScanTask)} returns true. + * + *

Note this method will be called only if {@link #canMerge(ScanTask)} returns true. * * @param other another task * @return a new merged task diff --git a/api/src/main/java/org/apache/iceberg/Metrics.java b/api/src/main/java/org/apache/iceberg/Metrics.java index 30a9a8cc7dbc..2f2cf89cdadd 100644 --- a/api/src/main/java/org/apache/iceberg/Metrics.java +++ b/api/src/main/java/org/apache/iceberg/Metrics.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg; import java.io.IOException; @@ -28,9 +27,7 @@ import org.apache.iceberg.relocated.com.google.common.collect.Maps; import org.apache.iceberg.util.ByteBuffers; -/** - * Iceberg file format metrics. - */ +/** Iceberg file format metrics. */ public class Metrics implements Serializable { private Long rowCount = null; @@ -41,14 +38,14 @@ public class Metrics implements Serializable { private Map lowerBounds = null; private Map upperBounds = null; - public Metrics() { - } + public Metrics() {} - public Metrics(Long rowCount, - Map columnSizes, - Map valueCounts, - Map nullValueCounts, - Map nanValueCounts) { + public Metrics( + Long rowCount, + Map columnSizes, + Map valueCounts, + Map nullValueCounts, + Map nanValueCounts) { this.rowCount = rowCount; this.columnSizes = columnSizes; this.valueCounts = valueCounts; @@ -56,13 +53,14 @@ public Metrics(Long rowCount, this.nanValueCounts = nanValueCounts; } - public Metrics(Long rowCount, - Map columnSizes, - Map valueCounts, - Map nullValueCounts, - Map nanValueCounts, - Map lowerBounds, - Map upperBounds) { + public Metrics( + Long rowCount, + Map columnSizes, + Map valueCounts, + Map nullValueCounts, + Map nanValueCounts, + Map lowerBounds, + Map upperBounds) { this.rowCount = rowCount; this.columnSizes = columnSizes; this.valueCounts = valueCounts; @@ -120,12 +118,12 @@ public Map nanValueCounts() { /** * Get the non-null lower bound values for all fields in a file. * - * To convert the {@link ByteBuffer} back to a value, use - * {@link org.apache.iceberg.types.Conversions#fromByteBuffer}. + *
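A sketch of how a planner could use canMerge/merge to coalesce adjacent tasks; `tasks` is an assumed Iterable of ScanTask, and the unchecked cast is for illustration only.

List<ScanTask> combined = new ArrayList<>(); // java.util.ArrayList
for (ScanTask task : tasks) {
  int last = combined.size() - 1;
  if (last >= 0 && combined.get(last) instanceof MergeableScanTask) {
    @SuppressWarnings("unchecked")
    MergeableScanTask<ScanTask> mergeable = (MergeableScanTask<ScanTask>) combined.get(last);
    if (mergeable.canMerge(task)) {
      combined.set(last, mergeable.merge(task));
      continue;
    }
  }
  combined.add(task);
}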

To convert the {@link ByteBuffer} back to a value, use {@link + * org.apache.iceberg.types.Conversions#fromByteBuffer}. * * @return a Map of fieldId to the lower bound value as a ByteBuffer - * @see - * Iceberg Spec - Appendix D: Single-value serialization + * @see Iceberg + * Spec - Appendix D: Single-value serialization */ public Map lowerBounds() { return lowerBounds; @@ -142,6 +140,7 @@ public Map upperBounds() { /** * Implemented the method to enable serialization of ByteBuffers. + * * @param out The stream where to write * @throws IOException On serialization error */ @@ -156,8 +155,8 @@ private void writeObject(ObjectOutputStream out) throws IOException { writeByteBufferMap(out, upperBounds); } - private static void writeByteBufferMap(ObjectOutputStream out, Map byteBufferMap) - throws IOException { + private static void writeByteBufferMap( + ObjectOutputStream out, Map byteBufferMap) throws IOException { if (byteBufferMap == null) { out.writeInt(-1); @@ -175,6 +174,7 @@ private static void writeByteBufferMap(ObjectOutputStream out, Map - * This API accumulates file additions and produces a new {@link Snapshot} of the table by replacing - * all the deleted files with the set of additions. This operation is used to implement idempotent - * writes that always replace a section of a table with new data or update/delete operations that - * eagerly overwrite files. - *
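A sketch of the decoding step referenced above: bounds are keyed by field id and must be decoded with the field's type. `schema` and `metrics` are assumed to be available, and the field name is a placeholder.

Types.NestedField field = schema.findField("id");
ByteBuffer lowerBuffer = metrics.lowerBounds().get(field.fieldId());
if (lowerBuffer != null) {
  Object lowerValue = Conversions.fromByteBuffer(field.type(), lowerBuffer);
}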

- * Overwrites can be validated. The default validation mode is idempotent, meaning the overwrite is - * correct and should be committed out regardless of other concurrent changes to the table. - * For example, this can be used for replacing all the data for day D with query results. - * Alternatively, this API can be configured for overwriting certain files with their filtered - * versions while ensuring no new data that would need to be filtered has been added. - *

- * When committing, these changes will be applied to the latest table snapshot. Commit conflicts + * + *

This API accumulates file additions and produces a new {@link Snapshot} of the table by + * replacing all the deleted files with the set of additions. This operation is used to implement + * idempotent writes that always replace a section of a table with new data or update/delete + * operations that eagerly overwrite files. + * + *

Overwrites can be validated. The default validation mode is idempotent, meaning the overwrite + * is correct and should be committed out regardless of other concurrent changes to the table. For + * example, this can be used for replacing all the data for day D with query results. Alternatively, + * this API can be configured for overwriting certain files with their filtered versions while + * ensuring no new data that would need to be filtered has been added. + * + *

When committing, these changes will be applied to the latest table snapshot. Commit conflicts * will be resolved by applying the changes to the new latest snapshot and reattempting the commit. */ public interface OverwriteFiles extends SnapshotUpdate { /** * Delete files that match an {@link Expression} on data rows from the table. - *

- * A file is selected to be deleted by the expression if it could contain any rows that match the - * expression (candidate files are selected using an - * {@link Projections#inclusive(PartitionSpec) inclusive projection}). These candidate files are - * deleted if all of the rows in the file must match the expression (the partition data matches - * the expression's {@link Projections#strict(PartitionSpec)} strict projection}). This guarantees + * + *

A file is selected to be deleted by the expression if it could contain any rows that match + * the expression (candidate files are selected using an {@link + * Projections#inclusive(PartitionSpec) inclusive projection}). These candidate files are deleted + * if all of the rows in the file must match the expression (the partition data matches the + * expression's {@link Projections#strict(PartitionSpec)} strict projection}). This guarantees * that files are deleted if and only if all rows in the file must match the expression. - *

- * Files that may contain some rows that match the expression and some rows that do not will + * + *

Files that may contain some rows that match the expression and some rows that do not will * result in a {@link ValidationException}. * * @param expr an expression on rows in the table @@ -78,9 +77,9 @@ public interface OverwriteFiles extends SnapshotUpdate { /** * Signal that each file added to the table must match the overwrite expression. - *

- * If this method is called, each added file is validated on commit to ensure that it matches the - * overwrite row filter. This is used to ensure that writes are idempotent: that files cannot + * + *

If this method is called, each added file is validated on commit to ensure that it matches + * the overwrite row filter. This is used to ensure that writes are idempotent: that files cannot * be added during a commit that would not be removed if the operation were run a second time. * * @return this for method chaining @@ -89,9 +88,9 @@ public interface OverwriteFiles extends SnapshotUpdate { /** * Set the snapshot ID used in any reads for this operation. - *
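A usage sketch of an idempotent overwrite with the validation described above; the column name, value, and replacement DataFile are placeholders.

table.newOverwrite()
    .overwriteByRowFilter(Expressions.equal("day", "2022-06-01")) // org.apache.iceberg.expressions.Expressions
    .addFile(replacementDataFile)
    .validateAddedFilesMatchOverwriteFilter()
    .commit();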

- * Validations will check changes after this snapshot ID. If the from snapshot is not set, all ancestor snapshots - * through the table's initial snapshot are validated. + * + *

Validations will check changes after this snapshot ID. If the from snapshot is not set, all + * ancestor snapshots through the table's initial snapshot are validated. * * @param snapshotId a snapshot ID * @return this for method chaining @@ -107,23 +106,25 @@ public interface OverwriteFiles extends SnapshotUpdate { OverwriteFiles caseSensitive(boolean caseSensitive); /** - * Enables validation that data files added concurrently do not conflict with this commit's operation. - *

- * This method should be called while committing non-idempotent overwrite operations. - * If a concurrent operation commits a new file after the data was read and that file might - * contain rows matching the specified conflict detection filter, the overwrite operation - * will detect this and fail. - *

- * Calling this method with a correct conflict detection filter is required to maintain - * serializable isolation for overwrite operations. Otherwise, the isolation level - * will be snapshot isolation. - *

- * Validation applies to files added to the table since the snapshot passed to {@link #validateFromSnapshot(long)}. + * Enables validation that data files added concurrently do not conflict with this commit's + * operation. + * + *

This method should be called while committing non-idempotent overwrite operations. If a + * concurrent operation commits a new file after the data was read and that file might contain + * rows matching the specified conflict detection filter, the overwrite operation will detect this + * and fail. + * + *

Calling this method with a correct conflict detection filter is required to maintain + * serializable isolation for overwrite operations. Otherwise, the isolation level will be + * snapshot isolation. + * + *

Validation applies to files added to the table since the snapshot passed to {@link + * #validateFromSnapshot(long)}. * * @param conflictDetectionFilter an expression on rows in the table * @return this for method chaining - * @deprecated since 0.13.0, will be removed in 0.14.0; use {@link #conflictDetectionFilter(Expression)} and - * {@link #validateNoConflictingData()} instead. + * @deprecated since 0.13.0, will be removed in 0.14.0; use {@link + * #conflictDetectionFilter(Expression)} and {@link #validateNoConflictingData()} instead. */ @Deprecated default OverwriteFiles validateNoConflictingAppends(Expression conflictDetectionFilter) { @@ -140,39 +141,41 @@ default OverwriteFiles validateNoConflictingAppends(Expression conflictDetection /** * Enables validation that data added concurrently does not conflict with this commit's operation. - *

- * This method should be called while committing non-idempotent overwrite operations. - * If a concurrent operation commits a new file after the data was read and that file might - * contain rows matching the specified conflict detection filter, the overwrite operation - * will detect this and fail. - *

- * Calling this method with a correct conflict detection filter is required to maintain + * + *

This method should be called while committing non-idempotent overwrite operations. If a + * concurrent operation commits a new file after the data was read and that file might contain + * rows matching the specified conflict detection filter, the overwrite operation will detect this + * and fail. + * + *

Calling this method with a correct conflict detection filter is required to maintain * isolation for non-idempotent overwrite operations. - *

- * Validation uses the conflict detection filter passed to {@link #conflictDetectionFilter(Expression)} and - * applies to operations that happened after the snapshot passed to {@link #validateFromSnapshot(long)}. - * If the conflict detection filter is not set, any new data added concurrently will fail this - * overwrite operation. + * + *

Validation uses the conflict detection filter passed to {@link + * #conflictDetectionFilter(Expression)} and applies to operations that happened after the + * snapshot passed to {@link #validateFromSnapshot(long)}. If the conflict detection filter is not + * set, any new data added concurrently will fail this overwrite operation. * * @return this for method chaining */ OverwriteFiles validateNoConflictingData(); /** - * Enables validation that deletes that happened concurrently do not conflict with this commit's operation. - *

- * Validating concurrent deletes is required during non-idempotent overwrite operations. - * If a concurrent operation deletes data in one of the files being overwritten, the overwrite + * Enables validation that deletes that happened concurrently do not conflict with this commit's + * operation. + * + *

Validating concurrent deletes is required during non-idempotent overwrite operations. If a + * concurrent operation deletes data in one of the files being overwritten, the overwrite * operation must be aborted as it may undelete rows that were removed concurrently. - *

- * Calling this method with a correct conflict detection filter is required to maintain + * + *

Calling this method with a correct conflict detection filter is required to maintain * isolation for non-idempotent overwrite operations. - *

- * Validation uses the conflict detection filter passed to {@link #conflictDetectionFilter(Expression)} and - * applies to operations that happened after the snapshot passed to {@link #validateFromSnapshot(long)}. - * If the conflict detection filter is not set, this operation will use the row filter provided - * in {@link #overwriteByRowFilter(Expression)} to check for new delete files and will ensure - * there are no conflicting deletes for data files removed via {@link #deleteFile(DataFile)}. + * + *

Validation uses the conflict detection filter passed to {@link + * #conflictDetectionFilter(Expression)} and applies to operations that happened after the + * snapshot passed to {@link #validateFromSnapshot(long)}. If the conflict detection filter is not + * set, this operation will use the row filter provided in {@link + * #overwriteByRowFilter(Expression)} to check for new delete files and will ensure there are no + * conflicting deletes for data files removed via {@link #deleteFile(DataFile)}. * * @return this for method chaining */ diff --git a/api/src/main/java/org/apache/iceberg/PartitionField.java b/api/src/main/java/org/apache/iceberg/PartitionField.java index 2b97bdfa9198..5956e01d7bbe 100644 --- a/api/src/main/java/org/apache/iceberg/PartitionField.java +++ b/api/src/main/java/org/apache/iceberg/PartitionField.java @@ -16,16 +16,13 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg; import java.io.Serializable; import org.apache.iceberg.relocated.com.google.common.base.Objects; import org.apache.iceberg.transforms.Transform; -/** - * Represents a single field in a {@link PartitionSpec}. - */ +/** Represents a single field in a {@link PartitionSpec}. */ public class PartitionField implements Serializable { private final int sourceId; private final int fieldId; @@ -39,30 +36,22 @@ public class PartitionField implements Serializable { this.transform = transform; } - /** - * Returns the field id of the source field in the {@link PartitionSpec spec's} table schema. - */ + /** Returns the field id of the source field in the {@link PartitionSpec spec's} table schema. */ public int sourceId() { return sourceId; } - /** - * Returns the partition field id across all the table metadata's partition specs. - */ + /** Returns the partition field id across all the table metadata's partition specs. */ public int fieldId() { return fieldId; } - /** - * Returns the name of this partition field. - */ + /** Returns the name of this partition field. */ public String name() { return name; } - /** - * Returns the transform used to produce partition values from source values. - */ + /** Returns the transform used to produce partition values from source values. */ public Transform transform() { return transform; } @@ -81,10 +70,10 @@ public boolean equals(Object other) { } PartitionField that = (PartitionField) other; - return sourceId == that.sourceId && - fieldId == that.fieldId && - name.equals(that.name) && - transform.equals(that.transform); + return sourceId == that.sourceId + && fieldId == that.fieldId + && name.equals(that.name) + && transform.equals(that.transform); } @Override diff --git a/api/src/main/java/org/apache/iceberg/PartitionKey.java b/api/src/main/java/org/apache/iceberg/PartitionKey.java index 71cdb2756ed2..0f696b59c477 100644 --- a/api/src/main/java/org/apache/iceberg/PartitionKey.java +++ b/api/src/main/java/org/apache/iceberg/PartitionKey.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg; import java.io.Serializable; @@ -28,8 +27,9 @@ /** * A struct of partition values. - *
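A usage sketch of the conflict validations described above for a non-idempotent overwrite: read at a known snapshot, then fail the commit if matching data or deletes were added concurrently. The snapshot id, filter, and DataFile references are placeholders.

table.newOverwrite()
    .deleteFile(rewrittenDataFile)
    .addFile(replacementDataFile)
    .validateFromSnapshot(readSnapshotId)
    .conflictDetectionFilter(Expressions.equal("day", "2022-06-01"))
    .validateNoConflictingData()
    .validateNoConflictingDeletes()
    .commit();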

- * Instances of this class can produce partition values from a data row passed to {@link #partition(StructLike)}. + * + *

Instances of this class can produce partition values from a data row passed to {@link + * #partition(StructLike)}. */ public class PartitionKey implements StructLike, Serializable { @@ -53,7 +53,8 @@ public PartitionKey(PartitionSpec spec, Schema inputSchema) { for (int i = 0; i < size; i += 1) { PartitionField field = fields.get(i); Accessor accessor = inputSchema.accessorForField(field.sourceId()); - Preconditions.checkArgument(accessor != null, + Preconditions.checkArgument( + accessor != null, "Cannot build accessor for field: " + schema.findField(field.sourceId())); this.accessors[i] = accessor; this.transforms[i] = field.transform(); diff --git a/api/src/main/java/org/apache/iceberg/PartitionSpec.java b/api/src/main/java/org/apache/iceberg/PartitionSpec.java index 288445f82a17..e984fc69d8ce 100644 --- a/api/src/main/java/org/apache/iceberg/PartitionSpec.java +++ b/api/src/main/java/org/apache/iceberg/PartitionSpec.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg; import java.io.Serializable; @@ -45,8 +44,8 @@ /** * Represents how to produce partition data for a table. - *
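A usage sketch of the class described above; `row` is any StructLike whose layout matches the schema passed to the constructor.

PartitionKey key = new PartitionKey(table.spec(), table.schema());
key.partition(row);
String partitionPath = table.spec().partitionToPath(key); // e.g. "ts_day=2022-06-01/type=a"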

- * Partition data is produced by transforming columns in a table. Each column transform is + * + *

Partition data is produced by transforming columns in a table. Each column transform is * represented by a named {@link PartitionField}. */ public class PartitionSpec implements Serializable { @@ -63,7 +62,8 @@ public class PartitionSpec implements Serializable { private transient volatile List fieldList = null; private final int lastAssignedFieldId; - private PartitionSpec(Schema schema, int specId, List fields, int lastAssignedFieldId) { + private PartitionSpec( + Schema schema, int specId, List fields, int lastAssignedFieldId) { this.schema = schema; this.specId = specId; this.fields = new PartitionField[fields.size()]; @@ -73,23 +73,17 @@ private PartitionSpec(Schema schema, int specId, List fields, in this.lastAssignedFieldId = lastAssignedFieldId; } - /** - * Returns the {@link Schema} for this spec. - */ + /** Returns the {@link Schema} for this spec. */ public Schema schema() { return schema; } - /** - * Returns the ID of this spec. - */ + /** Returns the ID of this spec. */ public int specId() { return specId; } - /** - * Returns the list of {@link PartitionField partition fields} for this spec. - */ + /** Returns the list of {@link PartitionField partition fields} for this spec. */ public List fields() { return lazyFieldList(); } @@ -110,7 +104,8 @@ public UnboundPartitionSpec toUnbound() { UnboundPartitionSpec.Builder builder = UnboundPartitionSpec.builder().withSpecId(specId); for (PartitionField field : fields) { - builder.addField(field.transform().toString(), field.sourceId(), field.fieldId(), field.name()); + builder.addField( + field.transform().toString(), field.sourceId(), field.fieldId(), field.name()); } return builder.build(); @@ -126,9 +121,7 @@ public List getFieldsBySourceId(int fieldId) { return lazyFieldsBySourceId().get(fieldId); } - /** - * Returns a {@link StructType} for partition data defined by this spec. - */ + /** Returns a {@link StructType} for partition data defined by this spec. */ public StructType partitionType() { List structFields = Lists.newArrayListWithExpectedSize(fields.length); @@ -136,8 +129,7 @@ public StructType partitionType() { PartitionField field = fields[i]; Type sourceType = schema.findType(field.sourceId()); Type resultType = field.transform().getResultType(sourceType); - structFields.add( - Types.NestedField.optional(field.fieldId(), field.name(), resultType)); + structFields.add(Types.NestedField.optional(field.fieldId(), field.name(), resultType)); } return Types.StructType.of(structFields); @@ -196,8 +188,9 @@ public String partitionToPath(StructLike data) { } /** - * Returns true if this spec is equivalent to the other, with partition field ids ignored. - * That is, if both specs have the same number of fields, field order, field name, source columns, and transforms. + * Returns true if this spec is equivalent to the other, with partition field ids ignored. That + * is, if both specs have the same number of fields, field order, field name, source columns, and + * transforms. * * @param other another PartitionSpec * @return true if the specs have the same fields, source columns, and transforms. 
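A usage sketch of assembling a spec with the Builder shown in the following hunk; the column names are placeholders for fields of `schema`.

PartitionSpec spec =
    PartitionSpec.builderFor(schema)
        .identity("type")
        .day("ts")
        .bucket("id", 16)
        .build();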
@@ -214,9 +207,9 @@ public boolean compatibleWith(PartitionSpec other) { for (int i = 0; i < fields.length; i += 1) { PartitionField thisField = fields[i]; PartitionField thatField = other.fields[i]; - if (thisField.sourceId() != thatField.sourceId() || - !thisField.transform().toString().equals(thatField.transform().toString()) || - !thisField.name().equals(thatField.name())) { + if (thisField.sourceId() != thatField.sourceId() + || !thisField.transform().toString().equals(thatField.transform().toString()) + || !thisField.name().equals(thatField.name())) { return false; } } @@ -259,8 +252,9 @@ private ListMultimap lazyFieldsBySourceId() { if (fieldsBySourceId == null) { synchronized (this) { if (fieldsBySourceId == null) { - ListMultimap multiMap = Multimaps - .newListMultimap(Maps.newHashMap(), () -> Lists.newArrayListWithCapacity(fields.length)); + ListMultimap multiMap = + Multimaps.newListMultimap( + Maps.newHashMap(), () -> Lists.newArrayListWithCapacity(fields.length)); for (PartitionField field : fields) { multiMap.put(field.sourceId(), field); } @@ -331,8 +325,8 @@ public static Builder builderFor(Schema schema) { /** * Used to create valid {@link PartitionSpec partition specs}. - *

<p>
- * Call {@link #builderFor(Schema)} to create a new builder.
+ *
+ * <p>

Call {@link #builderFor(Schema)} to create a new builder. */ public static class Builder { private final Schema schema; @@ -340,7 +334,8 @@ public static class Builder { private final Set partitionNames = Sets.newHashSet(); private Map, PartitionField> dedupFields = Maps.newHashMap(); private int specId = 0; - private final AtomicInteger lastAssignedFieldId = new AtomicInteger(unpartitionedLastAssignedId()); + private final AtomicInteger lastAssignedFieldId = + new AtomicInteger(unpartitionedLastAssignedId()); // check if there are conflicts between partition and schema field name private boolean checkConflicts = true; @@ -365,29 +360,38 @@ private void checkAndAddPartitionName(String name, Integer sourceColumnId) { Types.NestedField schemaField = schema.findField(name); if (checkConflicts) { if (sourceColumnId != null) { - // for identity transform case we allow conflicts between partition and schema field name as + // for identity transform case we allow conflicts between partition and schema field name + // as // long as they are sourced from the same schema field - Preconditions.checkArgument(schemaField == null || schemaField.fieldId() == sourceColumnId, - "Cannot create identity partition sourced from different field in schema: %s", name); + Preconditions.checkArgument( + schemaField == null || schemaField.fieldId() == sourceColumnId, + "Cannot create identity partition sourced from different field in schema: %s", + name); } else { - // for all other transforms we don't allow conflicts between partition name and schema field name - Preconditions.checkArgument(schemaField == null, - "Cannot create partition from name that exists in schema: %s", name); + // for all other transforms we don't allow conflicts between partition name and schema + // field name + Preconditions.checkArgument( + schemaField == null, + "Cannot create partition from name that exists in schema: %s", + name); } } - Preconditions.checkArgument(name != null && !name.isEmpty(), - "Cannot use empty or null partition name: %s", name); - Preconditions.checkArgument(!partitionNames.contains(name), - "Cannot use partition name more than once: %s", name); + Preconditions.checkArgument( + name != null && !name.isEmpty(), "Cannot use empty or null partition name: %s", name); + Preconditions.checkArgument( + !partitionNames.contains(name), "Cannot use partition name more than once: %s", name); partitionNames.add(name); } private void checkForRedundantPartitions(PartitionField field) { - Map.Entry dedupKey = new AbstractMap.SimpleEntry<>( - field.sourceId(), field.transform().dedupName()); + Map.Entry dedupKey = + new AbstractMap.SimpleEntry<>(field.sourceId(), field.transform().dedupName()); PartitionField partitionField = dedupFields.get(dedupKey); - Preconditions.checkArgument(partitionField == null, - "Cannot add redundant partition: %s conflicts with %s", partitionField, field); + Preconditions.checkArgument( + partitionField == null, + "Cannot add redundant partition: %s conflicts with %s", + partitionField, + field); dedupFields.put(dedupKey, field); } @@ -398,15 +402,20 @@ public Builder withSpecId(int newSpecId) { private Types.NestedField findSourceColumn(String sourceName) { Types.NestedField sourceColumn = schema.findField(sourceName); - Preconditions.checkArgument(sourceColumn != null, "Cannot find source column: %s", sourceName); + Preconditions.checkArgument( + sourceColumn != null, "Cannot find source column: %s", sourceName); return sourceColumn; } Builder identity(String sourceName, String targetName) { 
Types.NestedField sourceColumn = findSourceColumn(sourceName); checkAndAddPartitionName(targetName, sourceColumn.fieldId()); - PartitionField field = new PartitionField( - sourceColumn.fieldId(), nextFieldId(), targetName, Transforms.identity(sourceColumn.type())); + PartitionField field = + new PartitionField( + sourceColumn.fieldId(), + nextFieldId(), + targetName, + Transforms.identity(sourceColumn.type())); checkForRedundantPartitions(field); fields.add(field); return this; @@ -419,8 +428,12 @@ public Builder identity(String sourceName) { public Builder year(String sourceName, String targetName) { checkAndAddPartitionName(targetName); Types.NestedField sourceColumn = findSourceColumn(sourceName); - PartitionField field = new PartitionField( - sourceColumn.fieldId(), nextFieldId(), targetName, Transforms.year(sourceColumn.type())); + PartitionField field = + new PartitionField( + sourceColumn.fieldId(), + nextFieldId(), + targetName, + Transforms.year(sourceColumn.type())); checkForRedundantPartitions(field); fields.add(field); return this; @@ -433,8 +446,12 @@ public Builder year(String sourceName) { public Builder month(String sourceName, String targetName) { checkAndAddPartitionName(targetName); Types.NestedField sourceColumn = findSourceColumn(sourceName); - PartitionField field = new PartitionField( - sourceColumn.fieldId(), nextFieldId(), targetName, Transforms.month(sourceColumn.type())); + PartitionField field = + new PartitionField( + sourceColumn.fieldId(), + nextFieldId(), + targetName, + Transforms.month(sourceColumn.type())); checkForRedundantPartitions(field); fields.add(field); return this; @@ -447,8 +464,12 @@ public Builder month(String sourceName) { public Builder day(String sourceName, String targetName) { checkAndAddPartitionName(targetName); Types.NestedField sourceColumn = findSourceColumn(sourceName); - PartitionField field = new PartitionField( - sourceColumn.fieldId(), nextFieldId(), targetName, Transforms.day(sourceColumn.type())); + PartitionField field = + new PartitionField( + sourceColumn.fieldId(), + nextFieldId(), + targetName, + Transforms.day(sourceColumn.type())); checkForRedundantPartitions(field); fields.add(field); return this; @@ -461,8 +482,12 @@ public Builder day(String sourceName) { public Builder hour(String sourceName, String targetName) { checkAndAddPartitionName(targetName); Types.NestedField sourceColumn = findSourceColumn(sourceName); - PartitionField field = new PartitionField( - sourceColumn.fieldId(), nextFieldId(), targetName, Transforms.hour(sourceColumn.type())); + PartitionField field = + new PartitionField( + sourceColumn.fieldId(), + nextFieldId(), + targetName, + Transforms.hour(sourceColumn.type())); checkForRedundantPartitions(field); fields.add(field); return this; @@ -475,8 +500,12 @@ public Builder hour(String sourceName) { public Builder bucket(String sourceName, int numBuckets, String targetName) { checkAndAddPartitionName(targetName); Types.NestedField sourceColumn = findSourceColumn(sourceName); - fields.add(new PartitionField( - sourceColumn.fieldId(), nextFieldId(), targetName, Transforms.bucket(sourceColumn.type(), numBuckets))); + fields.add( + new PartitionField( + sourceColumn.fieldId(), + nextFieldId(), + targetName, + Transforms.bucket(sourceColumn.type(), numBuckets))); return this; } @@ -487,8 +516,12 @@ public Builder bucket(String sourceName, int numBuckets) { public Builder truncate(String sourceName, int width, String targetName) { checkAndAddPartitionName(targetName); Types.NestedField sourceColumn = 
findSourceColumn(sourceName); - fields.add(new PartitionField( - sourceColumn.fieldId(), nextFieldId(), targetName, Transforms.truncate(sourceColumn.type(), width))); + fields.add( + new PartitionField( + sourceColumn.fieldId(), + nextFieldId(), + targetName, + Transforms.truncate(sourceColumn.type(), width))); return this; } @@ -498,8 +531,11 @@ public Builder truncate(String sourceName, int width) { public Builder alwaysNull(String sourceName, String targetName) { Types.NestedField sourceColumn = findSourceColumn(sourceName); - checkAndAddPartitionName(targetName, sourceColumn.fieldId()); // can duplicate a source column name - fields.add(new PartitionField(sourceColumn.fieldId(), nextFieldId(), targetName, Transforms.alwaysNull())); + checkAndAddPartitionName( + targetName, sourceColumn.fieldId()); // can duplicate a source column name + fields.add( + new PartitionField( + sourceColumn.fieldId(), nextFieldId(), targetName, Transforms.alwaysNull())); return this; } @@ -507,7 +543,8 @@ public Builder alwaysNull(String sourceName) { return alwaysNull(sourceName, sourceName + "_null"); } - // add a partition field with an auto-increment partition field id starting from PARTITION_DATA_ID_START + // add a partition field with an auto-increment partition field id starting from + // PARTITION_DATA_ID_START Builder add(int sourceId, String name, String transform) { return add(sourceId, nextFieldId(), name, transform); } @@ -539,14 +576,17 @@ PartitionSpec buildUnchecked() { static void checkCompatibility(PartitionSpec spec, Schema schema) { for (PartitionField field : spec.fields) { Type sourceType = schema.findType(field.sourceId()); - ValidationException.check(sourceType != null, - "Cannot find source column for partition field: %s", field); - ValidationException.check(sourceType.isPrimitiveType(), - "Cannot partition by non-primitive source field: %s", sourceType); + ValidationException.check( + sourceType != null, "Cannot find source column for partition field: %s", field); + ValidationException.check( + sourceType.isPrimitiveType(), + "Cannot partition by non-primitive source field: %s", + sourceType); ValidationException.check( field.transform().canTransform(sourceType), "Invalid source type %s for transform: %s", - sourceType, field.transform()); + sourceType, + field.transform()); } } diff --git a/api/src/main/java/org/apache/iceberg/PendingUpdate.java b/api/src/main/java/org/apache/iceberg/PendingUpdate.java index 9c1b18434527..f47b98238de0 100644 --- a/api/src/main/java/org/apache/iceberg/PendingUpdate.java +++ b/api/src/main/java/org/apache/iceberg/PendingUpdate.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg; import org.apache.iceberg.exceptions.CommitFailedException; @@ -32,8 +31,8 @@ public interface PendingUpdate { /** * Apply the pending changes and return the uncommitted changes for validation. - *
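As a usage illustration of the PartitionSpec.Builder methods reformatted above (not part of the patch), a minimal sketch in Java; the schema, field ids, and column names "id" and "ts" are assumptions:

Schema schema =
    new Schema(
        Types.NestedField.required(1, "id", Types.LongType.get()),
        Types.NestedField.required(2, "ts", Types.TimestampType.withZone()));

// each call appends a PartitionField with an auto-assigned partition field id
PartitionSpec spec =
    PartitionSpec.builderFor(schema)
        .day("ts")         // default target name "ts_day"
        .bucket("id", 16)  // default target name "id_bucket"
        .build();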

<p>
- * This does not result in a permanent update.
+ *
+ * <p>

This does not result in a permanent update. * * @return the uncommitted changes that would be committed by calling {@link #commit()} * @throws ValidationException If the pending changes cannot be applied to the current metadata @@ -43,20 +42,21 @@ public interface PendingUpdate { /** * Apply the pending changes and commit. - *

<p>
- * Changes are committed by calling the underlying table's commit method.
- * <p>
- * Once the commit is successful, the updated table will be refreshed.
+ *
+ * <p>Changes are committed by calling the underlying table's commit method.
+ *
+ * <p>

Once the commit is successful, the updated table will be refreshed. * * @throws ValidationException If the update cannot be applied to the current table metadata. * @throws CommitFailedException If the update cannot be committed due to conflicts. - * @throws CommitStateUnknownException If the update success or failure is unknown, no cleanup should be done in - * this case. + * @throws CommitStateUnknownException If the update success or failure is unknown, no cleanup + * should be done in this case. */ void commit(); /** * Generates update event to notify about metadata changes + * * @return the generated event */ default Object updateEvent() { diff --git a/api/src/main/java/org/apache/iceberg/ReplacePartitions.java b/api/src/main/java/org/apache/iceberg/ReplacePartitions.java index fdf7ea1d1984..7e8ab65304c5 100644 --- a/api/src/main/java/org/apache/iceberg/ReplacePartitions.java +++ b/api/src/main/java/org/apache/iceberg/ReplacePartitions.java @@ -16,28 +16,27 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg; /** * API for overwriting files in a table by partition. - *
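To make the apply/commit contract above concrete, a hedged sketch using an append, one implementation of PendingUpdate; the `table` and `dataFile` handles are assumptions:

AppendFiles append = table.newAppend();  // AppendFiles is a PendingUpdate<Snapshot>
append.appendFile(dataFile);

Snapshot uncommitted = append.apply();   // validates and returns the pending changes
append.commit();                         // commits, retrying on conflict, then refreshes the table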

- * This is provided to implement SQL compatible with Hive table operations but is not recommended. - * Instead, use the {@link OverwriteFiles overwrite API} to explicitly overwrite data. - *

- * The default validation mode is idempotent, meaning the overwrite is - * correct and should be committed out regardless of other concurrent changes to the table. - * Alternatively, this API can be configured to validate that no new data or deletes - * have been applied since a snapshot ID associated when this operation began. - * This can be done by calling {@link #validateNoConflictingDeletes()}, {@link #validateNoConflictingData()}, - * to ensure that no conflicting delete files or data files respectively have been written since the snapshot - * passed to {@link #validateFromSnapshot(long)}. - *

- * This API accumulates file additions and produces a new {@link Snapshot} of the table by replacing - * all files in partitions with new data with the new additions. This operation is used to implement - * dynamic partition replacement. - *

- * When committing, these changes will be applied to the latest table snapshot. Commit conflicts + * + *

This is provided to implement SQL compatible with Hive table operations but is not + * recommended. Instead, use the {@link OverwriteFiles overwrite API} to explicitly overwrite data. + * + *

The default validation mode is idempotent, meaning the overwrite is correct and should be + * committed out regardless of other concurrent changes to the table. Alternatively, this API can be + * configured to validate that no new data or deletes have been applied since a snapshot ID + * associated when this operation began. This can be done by calling {@link + * #validateNoConflictingDeletes()}, {@link #validateNoConflictingData()}, to ensure that no + * conflicting delete files or data files respectively have been written since the snapshot passed + * to {@link #validateFromSnapshot(long)}. + * + *

This API accumulates file additions and produces a new {@link Snapshot} of the table by + * replacing all files in partitions with new data with the new additions. This operation is used to + * implement dynamic partition replacement. + * + *

When committing, these changes will be applied to the latest table snapshot. Commit conflicts * will be resolved by applying the changes to the new latest snapshot and reattempting the commit. */ public interface ReplacePartitions extends SnapshotUpdate { @@ -59,24 +58,27 @@ public interface ReplacePartitions extends SnapshotUpdate { /** * Set the snapshot ID used in validations for this operation. * - * All validations will check changes after this snapshot ID. If this is not called, validation will occur - * from the beginning of the table's history. + *

All validations will check changes after this snapshot ID. If this is not called, validation + * will occur from the beginning of the table's history. * - * This method should be called before this operation is committed. - * If a concurrent operation committed a data or delta file or removed a data file after the given snapshot ID - * that might contain rows matching a partition marked for deletion, validation will detect this and fail. + *

This method should be called before this operation is committed. If a concurrent operation + * committed a data or delta file or removed a data file after the given snapshot ID that might + * contain rows matching a partition marked for deletion, validation will detect this and fail. * - * @param snapshotId a snapshot ID, it should be set to when this operation started to read the table. + * @param snapshotId a snapshot ID, it should be set to when this operation started to read the + * table. * @return this for method chaining */ ReplacePartitions validateFromSnapshot(long snapshotId); /** - * Enables validation that deletes that happened concurrently do not conflict with this commit's operation. - *

<p>
- * Validating concurrent deletes is required during non-idempotent replace partition operations.
- * This will check if a concurrent operation deletes data in any of the partitions being overwritten,
- * as the replace partition must be aborted to avoid undeleting rows that were removed concurrently.
+ *
+ * <p>

Validating concurrent deletes is required during non-idempotent replace partition + * operations. This will check if a concurrent operation deletes data in any of the partitions + * being overwritten, as the replace partition must be aborted to avoid undeleting rows that were + * removed concurrently. * * @return this for method chaining */ @@ -84,10 +86,11 @@ public interface ReplacePartitions extends SnapshotUpdate { /** * Enables validation that data added concurrently does not conflict with this commit's operation. - *

<p>
- * Validating concurrent data files is required during non-idempotent replace partition operations.
- * This will check if a concurrent operation inserts data in any of the partitions being overwritten,
- * as the replace partition must be aborted to avoid removing rows added concurrently.
+ *
+ * <p>

Validating concurrent data files is required during non-idempotent replace partition + * operations. This will check if a concurrent operation inserts data in any of the partitions + * being overwritten, as the replace partition must be aborted to avoid removing rows added + * concurrently. * * @return this for method chaining */ diff --git a/api/src/main/java/org/apache/iceberg/ReplaceSortOrder.java b/api/src/main/java/org/apache/iceberg/ReplaceSortOrder.java index 825c86b1de76..0b63fdd51507 100644 --- a/api/src/main/java/org/apache/iceberg/ReplaceSortOrder.java +++ b/api/src/main/java/org/apache/iceberg/ReplaceSortOrder.java @@ -16,18 +16,17 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg; /** * API for replacing table sort order with a newly created order. - *
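Putting the ReplacePartitions validation calls above together, a sketch of a non-idempotent dynamic partition overwrite; `table`, `startSnapshotId`, and `newFile` are assumptions:

table
    .newReplacePartitions()
    .validateFromSnapshot(startSnapshotId)  // only changes after this snapshot are checked
    .validateNoConflictingData()
    .validateNoConflictingDeletes()
    .addFile(newFile)
    .commit();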

<p>
- * The table sort order is used to sort incoming records in engines that can request an ordering.
- * <p>
- * Apply returns the new sort order for validation.
- * <p>
- * When committing, these changes will be applied to the current table metadata. Commit conflicts
+ *
+ * <p>The table sort order is used to sort incoming records in engines that can request an ordering.
+ *
+ * <p>Apply returns the new sort order for validation.
+ *
+ * <p>

When committing, these changes will be applied to the current table metadata. Commit conflicts * will be resolved by applying the pending changes to the new table metadata. */ -public interface ReplaceSortOrder extends PendingUpdate, SortOrderBuilder { -} +public interface ReplaceSortOrder + extends PendingUpdate, SortOrderBuilder {} diff --git a/api/src/main/java/org/apache/iceberg/RewriteFiles.java b/api/src/main/java/org/apache/iceberg/RewriteFiles.java index 1e13a534f6af..c392c7118d8a 100644 --- a/api/src/main/java/org/apache/iceberg/RewriteFiles.java +++ b/api/src/main/java/org/apache/iceberg/RewriteFiles.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg; import java.util.Set; @@ -25,11 +24,11 @@ /** * API for replacing files in a table. - *
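For the sort order replacement API above, a minimal usage sketch; the column names are hypothetical:

table
    .replaceSortOrder()
    .asc("category", NullOrder.NULLS_FIRST)
    .desc("event_ts")
    .commit();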

<p>
- * This API accumulates file additions and deletions, produces a new {@link Snapshot} of the
+ *
+ * <p>

This API accumulates file additions and deletions, produces a new {@link Snapshot} of the * changes, and commits that snapshot as the current. - *

<p>
- * When committing, these changes will be applied to the latest table snapshot. Commit conflicts
+ *
+ * <p>

When committing, these changes will be applied to the latest table snapshot. Commit conflicts * will be resolved by applying the changes to the new latest snapshot and reattempting the commit. * If any of the deleted files are no longer in the latest snapshot when reattempting, the commit * will throw a {@link ValidationException}. @@ -39,46 +38,45 @@ public interface RewriteFiles extends SnapshotUpdate { * Add a rewrite that replaces one set of data files with another set that contains the same data. * * @param filesToDelete files that will be replaced (deleted), cannot be null or empty. - * @param filesToAdd files that will be added, cannot be null or empty. + * @param filesToAdd files that will be added, cannot be null or empty. * @return this for method chaining */ default RewriteFiles rewriteFiles(Set filesToDelete, Set filesToAdd) { - return rewriteFiles( - filesToDelete, - ImmutableSet.of(), - filesToAdd, - ImmutableSet.of() - ); + return rewriteFiles(filesToDelete, ImmutableSet.of(), filesToAdd, ImmutableSet.of()); } /** * Add a rewrite that replaces one set of data files with another set that contains the same data. * The sequence number provided will be used for all the data files added. * - * @param filesToDelete files that will be replaced (deleted), cannot be null or empty. - * @param filesToAdd files that will be added, cannot be null or empty. + * @param filesToDelete files that will be replaced (deleted), cannot be null or empty. + * @param filesToAdd files that will be added, cannot be null or empty. * @param sequenceNumber sequence number to use for all data files added * @return this for method chaining */ - RewriteFiles rewriteFiles(Set filesToDelete, Set filesToAdd, long sequenceNumber); + RewriteFiles rewriteFiles( + Set filesToDelete, Set filesToAdd, long sequenceNumber); /** * Add a rewrite that replaces one set of files with another set that contains the same data. * - * @param dataFilesToReplace data files that will be replaced (deleted). + * @param dataFilesToReplace data files that will be replaced (deleted). * @param deleteFilesToReplace delete files that will be replaced (deleted). - * @param dataFilesToAdd data files that will be added. - * @param deleteFilesToAdd delete files that will be added. + * @param dataFilesToAdd data files that will be added. + * @param deleteFilesToAdd delete files that will be added. * @return this for method chaining. */ - RewriteFiles rewriteFiles(Set dataFilesToReplace, Set deleteFilesToReplace, - Set dataFilesToAdd, Set deleteFilesToAdd); + RewriteFiles rewriteFiles( + Set dataFilesToReplace, + Set deleteFilesToReplace, + Set dataFilesToAdd, + Set deleteFilesToAdd); /** * Set the snapshot ID used in any reads for this operation. - *
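A sketch of the two-set rewriteFiles overload above, compacting two data files into one; the file handles and snapshot id are assumptions, and ImmutableSet here is Guava's:

table
    .newRewrite()
    .rewriteFiles(ImmutableSet.of(smallFileA, smallFileB), ImmutableSet.of(compactedFile))
    .validateFromSnapshot(startSnapshotId)
    .commit();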

<p>
- * Validations will check changes after this snapshot ID. If this is not called, all ancestor snapshots through the
- * table's initial snapshot are validated.
+ *
+ * <p>

Validations will check changes after this snapshot ID. If this is not called, all ancestor + * snapshots through the table's initial snapshot are validated. * * @param snapshotId a snapshot ID * @return this for method chaining diff --git a/api/src/main/java/org/apache/iceberg/RewriteJobOrder.java b/api/src/main/java/org/apache/iceberg/RewriteJobOrder.java index 3b47dff78296..2face482a5dd 100644 --- a/api/src/main/java/org/apache/iceberg/RewriteJobOrder.java +++ b/api/src/main/java/org/apache/iceberg/RewriteJobOrder.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg; import java.util.Locale; @@ -25,17 +24,25 @@ /** * Enum of supported rewrite job order, it defines the order in which the file groups should be * written. - *

    - *
- * <ul>
- * <li>bytes-asc: rewrite the smallest job groups first.
- * <li>bytes-desc: rewrite the largest job groups first.
- * <li>files-asc: rewrite the job groups with the least files first.
- * <li>files-desc: rewrite the job groups with the most files first.
- * <li>none: rewrite job groups in the order they were planned (no specific ordering).
- * </ul>

+ * + *

+ * + *

    + *
+ * <ul>
+ *   <li>bytes-asc: rewrite the smallest job groups first.
+ *   <li>bytes-desc: rewrite the largest job groups first.
+ *   <li>files-asc: rewrite the job groups with the least files first.
+ *   <li>files-desc: rewrite the job groups with the most files first.
+ *   <li>none: rewrite job groups in the order they were planned (no specific ordering).
+ * </ul>
+ * + *

*/ public enum RewriteJobOrder { - BYTES_ASC("bytes-asc"), BYTES_DESC("bytes-desc"), - FILES_ASC("files-asc"), FILES_DESC("files-desc"), NONE("none"); + BYTES_ASC("bytes-asc"), + BYTES_DESC("bytes-desc"), + FILES_ASC("files-asc"), + FILES_DESC("files-desc"), + NONE("none"); private final String orderName; @@ -49,7 +56,8 @@ public String orderName() { public static RewriteJobOrder fromName(String orderName) { Preconditions.checkArgument(orderName != null, "Rewrite job order name should not be null"); - // Replace the hyphen in order name with underscore to map to the enum value. For example: bytes-asc to BYTES_ASC + // Replace the hyphen in order name with underscore to map to the enum value. For example: + // bytes-asc to BYTES_ASC return RewriteJobOrder.valueOf(orderName.replaceFirst("-", "_").toUpperCase(Locale.ENGLISH)); } } diff --git a/api/src/main/java/org/apache/iceberg/RewriteManifests.java b/api/src/main/java/org/apache/iceberg/RewriteManifests.java index f6ca9e239bd2..ca823e94d265 100644 --- a/api/src/main/java/org/apache/iceberg/RewriteManifests.java +++ b/api/src/main/java/org/apache/iceberg/RewriteManifests.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg; import java.util.function.Function; @@ -24,16 +23,16 @@ /** * API for rewriting manifests for a table. - *
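The fromName conversion shown above maps the hyphenated property value onto the enum constant, for example:

RewriteJobOrder order = RewriteJobOrder.fromName("bytes-asc");  // BYTES_ASC
String name = order.orderName();                                // "bytes-asc"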

- * This API accumulates manifest files, produces a new {@link Snapshot} of the table - * described only by the manifest files that were added, and commits that snapshot as the - * current. - *

- * This API can be used to rewrite matching manifests according to a clustering function as well as - * to replace specific manifests. Manifests that are deleted or added directly are ignored during - * the rewrite process. The set of active files in replaced manifests must be the same as in new manifests. - *

- * When committing, these changes will be applied to the latest table snapshot. Commit conflicts + * + *

This API accumulates manifest files, produces a new {@link Snapshot} of the table described + * only by the manifest files that were added, and commits that snapshot as the current. + * + *

This API can be used to rewrite matching manifests according to a clustering function as well + * as to replace specific manifests. Manifests that are deleted or added directly are ignored during + * the rewrite process. The set of active files in replaced manifests must be the same as in new + * manifests. + * + *

When committing, these changes will be applied to the latest table snapshot. Commit conflicts * will be resolved by applying the changes to the new latest snapshot and reattempting the commit. */ public interface RewriteManifests extends SnapshotUpdate { @@ -51,11 +50,11 @@ public interface RewriteManifests extends SnapshotUpdate { /** * Determines which existing {@link ManifestFile} for the table should be rewritten. Manifests - * that do not match the predicate are kept as-is. If this is not called and no predicate is set, then - * all manifests will be rewritten. + * that do not match the predicate are kept as-is. If this is not called and no predicate is set, + * then all manifests will be rewritten. * - * @param predicate Predicate used to determine which manifests to rewrite. If true then the manifest - * file will be included for rewrite. If false then then manifest is kept as-is. + * @param predicate Predicate used to determine which manifests to rewrite. If true then the + * manifest file will be included for rewrite. If false then then manifest is kept as-is. * @return this for method chaining */ RewriteManifests rewriteIf(Predicate predicate); @@ -71,17 +70,17 @@ public interface RewriteManifests extends SnapshotUpdate { /** * Adds a {@link ManifestFile manifest file} to the table. The added manifest cannot contain new * or deleted files. - *

- * By default, the manifest will be rewritten to ensure all entries have explicit snapshot IDs. - * In that case, it is always the responsibility of the caller to manage the lifecycle of - * the original manifest. - *

- * If manifest entries are allowed to inherit the snapshot ID assigned on commit, the manifest + * + *

By default, the manifest will be rewritten to ensure all entries have explicit snapshot IDs. + * In that case, it is always the responsibility of the caller to manage the lifecycle of the + * original manifest. + * + *

If manifest entries are allowed to inherit the snapshot ID assigned on commit, the manifest * should never be deleted manually if the commit succeeds as it will become part of the table * metadata and will be cleaned up on expiry. If the manifest gets merged with others while - * preparing a new snapshot, it will be deleted automatically if this operation is successful. - * If the commit fails, the manifest will never be deleted and it is up to the caller whether - * to delete or reuse it. + * preparing a new snapshot, it will be deleted automatically if this operation is successful. If + * the commit fails, the manifest will never be deleted and it is up to the caller whether to + * delete or reuse it. * * @param manifest a manifest to add * @return this for method chaining diff --git a/api/src/main/java/org/apache/iceberg/Rollback.java b/api/src/main/java/org/apache/iceberg/Rollback.java index b73353943670..48060beef35b 100644 --- a/api/src/main/java/org/apache/iceberg/Rollback.java +++ b/api/src/main/java/org/apache/iceberg/Rollback.java @@ -16,18 +16,17 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg; import org.apache.iceberg.exceptions.CommitFailedException; /** * API for rolling table data back to the state at an older table {@link Snapshot snapshot}. - *
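A hedged sketch of the clustering-based manifest rewrite described above; the size threshold and the first-partition-value accessor are assumptions about the table's layout:

table
    .rewriteManifests()
    .rewriteIf(manifest -> manifest.length() < 10 * 1024 * 1024)  // only rewrite small manifests
    .clusterBy(file -> file.partition().get(0, String.class))     // group entries by first partition value
    .commit();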

<p>
- * This API does not allow conflicting calls to {@link #toSnapshotId(long)} and
- * {@link #toSnapshotAtTime(long)}.
- * <p>
- * When committing, these changes will be applied to the current table metadata. Commit conflicts
+ *
+ * <p>This API does not allow conflicting calls to {@link #toSnapshotId(long)} and {@link
+ * #toSnapshotAtTime(long)}.
+ *
+ * <p>

When committing, these changes will be applied to the current table metadata. Commit conflicts * will not be resolved and will result in a {@link CommitFailedException}. */ public interface Rollback extends PendingUpdate { diff --git a/api/src/main/java/org/apache/iceberg/RowDelta.java b/api/src/main/java/org/apache/iceberg/RowDelta.java index dcf250aff12f..b8a44a602004 100644 --- a/api/src/main/java/org/apache/iceberg/RowDelta.java +++ b/api/src/main/java/org/apache/iceberg/RowDelta.java @@ -16,18 +16,17 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg; import org.apache.iceberg.expressions.Expression; /** * API for encoding row-level changes to a table. - *
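A minimal sketch of the rollback API above; the `table` handle and `snapshotId` are assumptions:

table
    .rollback()
    .toSnapshotId(snapshotId)  // or .toSnapshotAtTime(timestampMillis)
    .commit();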

<p>
- * This API accumulates data and delete file changes, produces a new {@link Snapshot} of the table, and commits
- * that snapshot as the current.
- * <p>
- * When committing, these changes will be applied to the latest table snapshot. Commit conflicts
+ *
+ * <p>This API accumulates data and delete file changes, produces a new {@link Snapshot} of the
+ * table, and commits that snapshot as the current.
+ *
+ * <p>

When committing, these changes will be applied to the latest table snapshot. Commit conflicts * will be resolved by applying the changes to the new latest snapshot and reattempting the commit. */ public interface RowDelta extends SnapshotUpdate { @@ -49,9 +48,9 @@ public interface RowDelta extends SnapshotUpdate { /** * Set the snapshot ID used in any reads for this operation. - *

<p>
- * Validations will check changes after this snapshot ID. If the from snapshot is not set, all ancestor snapshots
- * through the table's initial snapshot are validated.
+ *
+ * <p>

Validations will check changes after this snapshot ID. If the from snapshot is not set, all + * ancestor snapshots through the table's initial snapshot are validated. * * @param snapshotId a snapshot ID * @return this for method chaining @@ -67,14 +66,15 @@ public interface RowDelta extends SnapshotUpdate { RowDelta caseSensitive(boolean caseSensitive); /** - * Add data file paths that must not be removed by conflicting commits for this RowDelta to succeed. - *

- * If any path has been removed by a conflicting commit in the table since the snapshot passed to - * {@link #validateFromSnapshot(long)}, the operation will fail with a - * {@link org.apache.iceberg.exceptions.ValidationException}. - *

- * By default, this validation checks only rewrite and overwrite commits. To apply validation to delete commits, call - * {@link #validateDeletedFiles()}. + * Add data file paths that must not be removed by conflicting commits for this RowDelta to + * succeed. + * + *

If any path has been removed by a conflicting commit in the table since the snapshot passed + * to {@link #validateFromSnapshot(long)}, the operation will fail with a {@link + * org.apache.iceberg.exceptions.ValidationException}. + * + *

By default, this validation checks only rewrite and overwrite commits. To apply validation + * to delete commits, call {@link #validateDeletedFiles()}. * * @param referencedFiles file paths that are referenced by a position delete file * @return this for method chaining @@ -82,35 +82,39 @@ public interface RowDelta extends SnapshotUpdate { RowDelta validateDataFilesExist(Iterable referencedFiles); /** - * Enable validation that referenced data files passed to {@link #validateDataFilesExist(Iterable)} have not been - * removed by a delete operation. - *

- * If a data file has a row deleted using a position delete file, rewriting or overwriting the data file concurrently - * would un-delete the row. Deleting the data file is normally allowed, but a delete may be part of a transaction - * that reads and re-appends a row. This method is used to validate deletes for the transaction case. + * Enable validation that referenced data files passed to {@link + * #validateDataFilesExist(Iterable)} have not been removed by a delete operation. + * + *

If a data file has a row deleted using a position delete file, rewriting or overwriting the + * data file concurrently would un-delete the row. Deleting the data file is normally allowed, but + * a delete may be part of a transaction that reads and re-appends a row. This method is used to + * validate deletes for the transaction case. * * @return this for method chaining */ RowDelta validateDeletedFiles(); /** - * Enables validation that data files added concurrently do not conflict with this commit's operation. - *

- * This method should be called when the table is queried to determine which files to delete/append. - * If a concurrent operation commits a new file after the data was read and that file might - * contain rows matching the specified conflict detection filter, the overwrite operation - * will detect this during retries and fail. - *

- * Calling this method with a correct conflict detection filter is required to maintain - * serializable isolation for update/delete operations. Otherwise, the isolation level - * will be snapshot isolation. - *

- * Validation applies to files added to the table since the snapshot passed to {@link #validateFromSnapshot(long)}. + * Enables validation that data files added concurrently do not conflict with this commit's + * operation. + * + *

This method should be called when the table is queried to determine which files to + * delete/append. If a concurrent operation commits a new file after the data was read and that + * file might contain rows matching the specified conflict detection filter, the overwrite + * operation will detect this during retries and fail. + * + *

Calling this method with a correct conflict detection filter is required to maintain + * serializable isolation for update/delete operations. Otherwise, the isolation level will be + * snapshot isolation. + * + *

Validation applies to files added to the table since the snapshot passed to {@link + * #validateFromSnapshot(long)}. * * @param conflictDetectionFilter an expression on rows in the table * @return this for method chaining - * @deprecated since 0.13.0, will be removed in 0.14.0; use {@link #conflictDetectionFilter(Expression)} and - * {@link #validateNoConflictingDataFiles()} instead. + * @deprecated since 0.13.0, will be removed in 0.14.0; use {@link + * #conflictDetectionFilter(Expression)} and {@link #validateNoConflictingDataFiles()} + * instead. */ @Deprecated default RowDelta validateNoConflictingAppends(Expression conflictDetectionFilter) { @@ -120,8 +124,8 @@ default RowDelta validateNoConflictingAppends(Expression conflictDetectionFilter /** * Sets a conflict detection filter used to validate concurrently added data and delete files. - *

- * If not called, a true literal will be used as the conflict detection filter. + * + *

If not called, a true literal will be used as the conflict detection filter. * * @param conflictDetectionFilter an expression on rows in the table * @return this for method chaining @@ -129,32 +133,36 @@ default RowDelta validateNoConflictingAppends(Expression conflictDetectionFilter RowDelta conflictDetectionFilter(Expression conflictDetectionFilter); /** - * Enables validation that data files added concurrently do not conflict with this commit's operation. - *

- * This method should be called when the table is queried to determine which files to delete/append. - * If a concurrent operation commits a new file after the data was read and that file might - * contain rows matching the specified conflict detection filter, this operation - * will detect this during retries and fail. - *

- * Calling this method is required to maintain serializable isolation for update/delete operations. - * Otherwise, the isolation level will be snapshot isolation. - *

- * Validation uses the conflict detection filter passed to {@link #conflictDetectionFilter(Expression)} and - * applies to operations that happened after the snapshot passed to {@link #validateFromSnapshot(long)}. + * Enables validation that data files added concurrently do not conflict with this commit's + * operation. + * + *

This method should be called when the table is queried to determine which files to + * delete/append. If a concurrent operation commits a new file after the data was read and that + * file might contain rows matching the specified conflict detection filter, this operation will + * detect this during retries and fail. + * + *

Calling this method is required to maintain serializable isolation for update/delete + * operations. Otherwise, the isolation level will be snapshot isolation. + * + *

Validation uses the conflict detection filter passed to {@link + * #conflictDetectionFilter(Expression)} and applies to operations that happened after the + * snapshot passed to {@link #validateFromSnapshot(long)}. * * @return this for method chaining */ RowDelta validateNoConflictingDataFiles(); /** - * Enables validation that delete files added concurrently do not conflict with this commit's operation. - *

- * This method must be called when the table is queried to produce a row delta for UPDATE and - * MERGE operations independently of the isolation level. Calling this method isn't required - * for DELETE operations as it is OK to delete a record that is also deleted concurrently. - *

- * Validation uses the conflict detection filter passed to {@link #conflictDetectionFilter(Expression)} and - * applies to operations that happened after the snapshot passed to {@link #validateFromSnapshot(long)}. + * Enables validation that delete files added concurrently do not conflict with this commit's + * operation. + * + *

This method must be called when the table is queried to produce a row delta for UPDATE and + * MERGE operations independently of the isolation level. Calling this method isn't required for + * DELETE operations as it is OK to delete a record that is also deleted concurrently. + * + *

Validation uses the conflict detection filter passed to {@link + * #conflictDetectionFilter(Expression)} and applies to operations that happened after the + * snapshot passed to {@link #validateFromSnapshot(long)}. * * @return this for method chaining */ diff --git a/api/src/main/java/org/apache/iceberg/Scan.java b/api/src/main/java/org/apache/iceberg/Scan.java index 72e005649093..118b9ce66b5a 100644 --- a/api/src/main/java/org/apache/iceberg/Scan.java +++ b/api/src/main/java/org/apache/iceberg/Scan.java @@ -16,8 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - - package org.apache.iceberg; import java.util.Collection; @@ -26,8 +24,8 @@ import org.apache.iceberg.io.CloseableIterable; /** - * Scan objects are immutable and can be shared between threads. Refinement methods, like - * {@link #select(Collection)} and {@link #filter(Expression)}, create new TableScan instances. + * Scan objects are immutable and can be shared between threads. Refinement methods, like {@link + * #select(Collection)} and {@link #filter(Expression)}, create new TableScan instances. * * @param the child Java API class, returned by method chaining * @param the Java type of tasks produces by this scan @@ -35,8 +33,8 @@ */ public interface Scan> { /** - * Create a new scan from this scan's configuration that will override the {@link Table}'s behavior based - * on the incoming pair. Unknown properties will be ignored. + * Create a new scan from this scan's configuration that will override the {@link Table}'s + * behavior based on the incoming pair. Unknown properties will be ignored. * * @param property name of the table property to be overridden * @param value value to override with @@ -53,9 +51,9 @@ public interface Scan> { ThisT project(Schema schema); /** - * Create a new scan from this that, if data columns where selected - * via {@link #select(java.util.Collection)}, controls whether the match to the schema will be done - * with case sensitivity. Default is true. + * Create a new scan from this that, if data columns where selected via {@link + * #select(java.util.Collection)}, controls whether the match to the schema will be done with case + * sensitivity. Default is true. * * @return a new scan based on this with case sensitivity as stated */ @@ -63,17 +61,17 @@ public interface Scan> { /** * Create a new scan from this that loads the column stats with each data file. - *
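Pulling the RowDelta validation calls above together, a hedged sketch of a MERGE-style commit; the snapshot id, filter, and file handles are assumptions:

table
    .newRowDelta()
    .validateFromSnapshot(readSnapshotId)
    .conflictDetectionFilter(Expressions.equal("region", "us-east-1"))
    .validateNoConflictingDataFiles()
    .validateNoConflictingDeleteFiles()
    .addRows(newDataFile)
    .addDeletes(positionDeleteFile)
    .commit();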

<p>
- * Column stats include: value count, null value count, lower bounds, and upper bounds.
+ *
+ * <p>

Column stats include: value count, null value count, lower bounds, and upper bounds. * * @return a new scan based on this that loads column stats. */ ThisT includeColumnStats(); /** - * Create a new scan from this that will read the given data columns. This produces - * an expected schema that includes all fields that are either selected or used by this scan's - * filter expression. + * Create a new scan from this that will read the given data columns. This produces an expected + * schema that includes all fields that are either selected or used by this scan's filter + * expression. * * @param columns column names from the table's schema * @return a new scan based on this with the given projection columns @@ -89,15 +87,16 @@ public interface Scan> { ThisT filter(Expression expr); /** - * Create a new scan from this that applies data filtering to files but not to rows in those files. + * Create a new scan from this that applies data filtering to files but not to rows in those + * files. * * @return a new scan based on this that does not filter rows in files. */ ThisT ignoreResiduals(); /** - * Create a new scan to use a particular executor to plan. The default worker pool will be - * used by default. + * Create a new scan to use a particular executor to plan. The default worker pool will be used by + * default. * * @param executorService the provided executor * @return a table scan that uses the provided executor to access manifests @@ -106,11 +105,13 @@ public interface Scan> { /** * Returns this scan's projection {@link Schema}. - *

<p>
- * If the projection schema was set directly using {@link #project(Schema)}, returns that schema.
- * <p>
- * If the projection schema was set by calling {@link #select(Collection)}, returns a projection
- * schema that includes the selected data fields and any fields used in the filter expression.
+ *
+ * <p>If the projection schema was set directly using {@link #project(Schema)}, returns that
+ * schema.
+ *
+ * <p>

If the projection schema was set by calling {@link #select(Collection)}, returns a + * projection schema that includes the selected data fields and any fields used in the filter + * expression. * * @return this scan's projection schema */ @@ -118,9 +119,9 @@ public interface Scan> { /** * Plan tasks for this scan where each task reads a single file. - *

<p>
- * Use {@link #planTasks()} for planning balanced tasks where each task will read either a single file,
- * a part of a file, or multiple files.
+ *
+ * <p>

Use {@link #planTasks()} for planning balanced tasks where each task will read either a + * single file, a part of a file, or multiple files. * * @return an Iterable of tasks scanning entire files required by this scan */ @@ -128,25 +129,20 @@ public interface Scan> { /** * Plan balanced task groups for this scan by splitting large and combining small tasks. - *

<p>
- * Task groups created by this method may read partial input files, multiple input files or both.
+ *
+ * <p>

Task groups created by this method may read partial input files, multiple input files or + * both. * * @return an Iterable of balanced task groups required by this scan */ CloseableIterable planTasks(); - /** - * Returns the target split size for this scan. - */ + /** Returns the target split size for this scan. */ long targetSplitSize(); - /** - * Returns the split lookback for this scan. - */ + /** Returns the split lookback for this scan. */ int splitLookback(); - /** - * Returns the split open file cost for this scan. - */ + /** Returns the split open file cost for this scan. */ long splitOpenFileCost(); } diff --git a/api/src/main/java/org/apache/iceberg/ScanTask.java b/api/src/main/java/org/apache/iceberg/ScanTask.java index 1b202f506a2b..3468f5e980f0 100644 --- a/api/src/main/java/org/apache/iceberg/ScanTask.java +++ b/api/src/main/java/org/apache/iceberg/ScanTask.java @@ -16,14 +16,11 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg; import java.io.Serializable; -/** - * A scan task. - */ +/** A scan task. */ public interface ScanTask extends Serializable { /** * The number of bytes that should be read by this scan task. @@ -43,9 +40,7 @@ default int filesCount() { return 1; } - /** - * Returns true if this is a {@link FileScanTask}, false otherwise. - */ + /** Returns true if this is a {@link FileScanTask}, false otherwise. */ default boolean isFileScanTask() { return false; } @@ -60,9 +55,7 @@ default FileScanTask asFileScanTask() { throw new IllegalStateException("Not a FileScanTask: " + this); } - /** - * Returns true if this is a {@link DataTask}, false otherwise. - */ + /** Returns true if this is a {@link DataTask}, false otherwise. */ default boolean isDataTask() { return false; } diff --git a/api/src/main/java/org/apache/iceberg/ScanTaskGroup.java b/api/src/main/java/org/apache/iceberg/ScanTaskGroup.java index 71a2d3fa43c3..4aabc1a064d8 100644 --- a/api/src/main/java/org/apache/iceberg/ScanTaskGroup.java +++ b/api/src/main/java/org/apache/iceberg/ScanTaskGroup.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg; import java.util.Collection; @@ -27,9 +26,7 @@ * @param the type of scan tasks */ public interface ScanTaskGroup extends ScanTask { - /** - * Returns scan tasks in this group. - */ + /** Returns scan tasks in this group. */ Collection tasks(); @Override diff --git a/api/src/main/java/org/apache/iceberg/Schema.java b/api/src/main/java/org/apache/iceberg/Schema.java index 987046a27bd6..34105a00adc2 100644 --- a/api/src/main/java/org/apache/iceberg/Schema.java +++ b/api/src/main/java/org/apache/iceberg/Schema.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg; import java.io.Serializable; @@ -45,9 +44,9 @@ /** * The schema of a data table. - *
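A sketch of the scan refinement and planning flow described above; the column names and filter literal are hypothetical:

TableScan scan =
    table
        .newScan()
        .caseSensitive(false)
        .select(Arrays.asList("id", "event_ts"))
        .filter(Expressions.greaterThanOrEqual("event_ts", "2022-01-01T00:00:00Z"))
        .includeColumnStats();

try (CloseableIterable<CombinedScanTask> tasks = scan.planTasks()) {
  // each task group combines small files or splits large ones into balanced work units
  tasks.forEach(task -> System.out.println(task.files().size() + " files"));
}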

<p>
- * Schema ID will only be populated when reading from/writing to table metadata,
- * otherwise it will be default to 0.
+ *
+ * <p>

Schema ID will only be populated when reading from/writing to table metadata, otherwise it + * will be default to 0. */ public class Schema implements Serializable { private static final Joiner NEWLINE = Joiner.on('\n'); @@ -71,7 +70,8 @@ public Schema(List columns, Map aliases) { this(columns, aliases, ImmutableSet.of()); } - public Schema(List columns, Map aliases, Set identifierFieldIds) { + public Schema( + List columns, Map aliases, Set identifierFieldIds) { this(DEFAULT_SCHEMA_ID, columns, aliases, identifierFieldIds); } @@ -91,8 +91,11 @@ public Schema(int schemaId, List columns, Set identifierFi this(schemaId, columns, null, identifierFieldIds); } - public Schema(int schemaId, List columns, Map aliases, - Set identifierFieldIds) { + public Schema( + int schemaId, + List columns, + Map aliases, + Set identifierFieldIds) { this.schemaId = schemaId; this.struct = StructType.of(columns); this.aliasToId = aliases != null ? ImmutableBiMap.copyOf(aliases) : null; @@ -103,23 +106,31 @@ public Schema(int schemaId, List columns, Map alia identifierFieldIds.forEach(id -> validateIdentifierField(id, lazyIdToField(), idToParent)); } - this.identifierFieldIds = identifierFieldIds != null ? Ints.toArray(identifierFieldIds) : new int[0]; + this.identifierFieldIds = + identifierFieldIds != null ? Ints.toArray(identifierFieldIds) : new int[0]; this.highestFieldId = lazyIdToName().keySet().stream().mapToInt(i -> i).max().orElse(0); } - static void validateIdentifierField(int fieldId, Map idToField, - Map idToParent) { + static void validateIdentifierField( + int fieldId, Map idToField, Map idToParent) { Types.NestedField field = idToField.get(fieldId); - Preconditions.checkArgument(field != null, - "Cannot add fieldId %s as an identifier field: field does not exist", fieldId); - Preconditions.checkArgument(field.type().isPrimitiveType(), - "Cannot add field %s as an identifier field: not a primitive type field", field.name()); - Preconditions.checkArgument(field.isRequired(), - "Cannot add field %s as an identifier field: not a required field", field.name()); - Preconditions.checkArgument(!Types.DoubleType.get().equals(field.type()) && - !Types.FloatType.get().equals(field.type()), - "Cannot add field %s as an identifier field: must not be float or double field", field.name()); + Preconditions.checkArgument( + field != null, + "Cannot add fieldId %s as an identifier field: field does not exist", + fieldId); + Preconditions.checkArgument( + field.type().isPrimitiveType(), + "Cannot add field %s as an identifier field: not a primitive type field", + field.name()); + Preconditions.checkArgument( + field.isRequired(), + "Cannot add field %s as an identifier field: not a required field", + field.name()); + Preconditions.checkArgument( + !Types.DoubleType.get().equals(field.type()) && !Types.FloatType.get().equals(field.type()), + "Cannot add field %s as an identifier field: must not be float or double field", + field.name()); // check whether the nested field is in a chain of required struct fields // exploring from root for better error message for list and map types @@ -132,11 +143,16 @@ static void validateIdentifierField(int fieldId, Map while (!deque.isEmpty()) { Types.NestedField parent = idToField.get(deque.pop()); - Preconditions.checkArgument(parent.type().isStructType(), - "Cannot add field %s as an identifier field: must not be nested in %s", field.name(), parent); - Preconditions.checkArgument(parent.isRequired(), + Preconditions.checkArgument( + parent.type().isStructType(), + "Cannot add 
field %s as an identifier field: must not be nested in %s", + field.name(), + parent); + Preconditions.checkArgument( + parent.isRequired(), "Cannot add field %s as an identifier field: must not be nested in an optional field %s", - field.name(), parent); + field.name(), + parent); } } @@ -192,25 +208,23 @@ private Set lazyIdentifierFieldIdSet() { /** * Returns the schema ID for this schema. - *

<p>
- * Note that schema ID will only be populated when reading from/writing to table metadata,
+ *
+ * <p>

Note that schema ID will only be populated when reading from/writing to table metadata, * otherwise it will be default to 0. */ public int schemaId() { return this.schemaId; } - /** - * Returns the highest field ID in this schema, including nested fields. - */ + /** Returns the highest field ID in this schema, including nested fields. */ public int highestFieldId() { return highestFieldId; } /** * Returns an alias map for this schema, if set. - *

<p>
- * Alias maps are created when translating an external schema, like an Avro Schema, to this
+ *
+ * <p>

Alias maps are created when translating an external schema, like an Avro Schema, to this * format. The original column names can be provided in a Map when constructing this Schema. * * @return a Map of column aliases to field ids @@ -228,28 +242,28 @@ public StructType asStruct() { return struct; } - /** - * Returns a List of the {@link NestedField columns} in this Schema. - */ + /** Returns a List of the {@link NestedField columns} in this Schema. */ public List columns() { return struct.fields(); } /** * The set of identifier field IDs. - *

- * Identifier is a concept similar to primary key in a relational database system. - * It consists of a unique set of primitive fields in the schema. - * An identifier field must be at root, or nested in a chain of structs (no maps or lists). - * A row should be unique in a table based on the values of the identifier fields. - * Optional, float and double columns cannot be used as identifier fields. - * However, Iceberg identifier differs from primary key in the following ways: + * + *

Identifier is a concept similar to primary key in a relational database system. It consists + * of a unique set of primitive fields in the schema. An identifier field must be at root, or + * nested in a chain of structs (no maps or lists). A row should be unique in a table based on the + * values of the identifier fields. Optional, float and double columns cannot be used as + * identifier fields. However, Iceberg identifier differs from primary key in the following ways: + * *

    - *
- * <li>Iceberg does not enforce the uniqueness of a row based on this identifier information.
- * It is used for operations like upsert to define the default upsert key.</li>
- * <li>A nested field in a struct can be used as an identifier. For example, if there is a "last_name" field
- * inside a "user" struct in a schema, field "user.last_name" can be set as a part of the identifier field.</li>
+ *   <li>Iceberg does not enforce the uniqueness of a row based on this identifier information. It
+ *       is used for operations like upsert to define the default upsert key.
+ *   <li>A nested field in a struct can be used as an identifier. For example, if there is a
+ *       "last_name" field inside a "user" struct in a schema, field "user.last_name" can be set
+ *       as a part of the identifier field.
+ * *

* * @return the set of identifier field IDs in this schema. @@ -258,14 +272,11 @@ public Set identifierFieldIds() { return lazyIdentifierFieldIdSet(); } - /** - * Returns the set of identifier field names. - */ + /** Returns the set of identifier field names. */ public Set identifierFieldNames() { - return identifierFieldIds() - .stream() - .map(id -> lazyIdToName().get(id)) - .collect(Collectors.toSet()); + return identifierFieldIds().stream() + .map(id -> lazyIdToName().get(id)) + .collect(Collectors.toSet()); } /** @@ -277,7 +288,7 @@ public Set identifierFieldNames() { public Type findType(String name) { Preconditions.checkArgument(!name.isEmpty(), "Invalid column name: (empty)"); Integer id = lazyNameToId().get(name); - if (id != null) { // name is found + if (id != null) { // name is found return findType(id); } @@ -311,8 +322,8 @@ public NestedField findField(int id) { /** * Returns a sub-field by name as a {@link NestedField}. - *
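To make the identifier field rules above concrete, a sketch of a schema whose identifier is a required nested field; the names and field ids are hypothetical:

Schema schema =
    new Schema(
        Arrays.asList(
            Types.NestedField.required(1, "user",
                Types.StructType.of(
                    Types.NestedField.required(2, "last_name", Types.StringType.get()))),
            Types.NestedField.optional(3, "comment", Types.StringType.get())),
        null,                       // no aliases
        Collections.singleton(2));  // field id 2 -> "user.last_name" is the identifier

schema.identifierFieldNames();      // -> [user.last_name]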

<p>
- * The result may be a top-level or a nested field.
+ *
+ * <p>

The result may be a top-level or a nested field. * * @param name a String name * @return a Type for the sub-field or null if it is not found @@ -328,8 +339,8 @@ public NestedField findField(String name) { /** * Returns a sub-field by name as a {@link NestedField}. - *

<p>
- * The result may be a top-level or a nested field.
+ *
+ * <p>
<p>
The result may be a top-level or a nested field. * * @param name a String name * @return the sub-field or null if it is not found @@ -354,8 +365,8 @@ public String findColumnName(int id) { } /** - * Returns the column id for the given column alias. Column aliases are set - * by conversions from Parquet or Avro to this Schema type. + * Returns the column id for the given column alias. Column aliases are set by conversions from + * Parquet or Avro to this Schema type. * * @param alias a full column name in the unconverted data schema * @return the column id in this schema, or null if the column wasn't found @@ -368,8 +379,8 @@ public Integer aliasToId(String alias) { } /** - * Returns the full column name in the unconverted data schema for the given column id. - * Column aliases are set by conversions from Parquet or Avro to this Schema type. + * Returns the full column name in the unconverted data schema for the given column id. Column + * aliases are set by conversions from Parquet or Avro to this Schema type. * * @param fieldId a column id in this schema * @return the full column name in the unconverted data schema, or null if one wasn't found @@ -383,8 +394,8 @@ public String idToAlias(Integer fieldId) { /** * Returns an accessor for retrieving the data from {@link StructLike}. - *
<p>
- * Accessors do not retrieve data contained in lists or maps. + * + *
<p>
Accessors do not retrieve data contained in lists or maps. * * @param id a column id in this schema * @return an {@link Accessor} to retrieve values from a {@link StructLike} row @@ -395,8 +406,8 @@ public Accessor accessorForField(int id) { /** * Creates a projection schema for a subset of columns, selected by name. - *
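A small sketch of the accessor API above; it assumes a schema and a row from some StructLike implementation are already in hand.

import org.apache.iceberg.Accessor;
import org.apache.iceberg.Schema;
import org.apache.iceberg.StructLike;

// Resolve the accessor once per field id, then read rows by position.
// This works for top-level and struct-nested fields, not for values inside lists or maps.
static Object readField(Schema schema, int fieldId, StructLike row) {
  Accessor<StructLike> accessor = schema.accessorForField(fieldId);
  return accessor.get(row);
}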
<p>
- * Names that identify nested fields will select part or all of the field's top-level column. + * + *
<p>
Names that identify nested fields will select part or all of the field's top-level column. * * @param names String names for selected columns * @return a projection schema from this schema, by name @@ -407,8 +418,8 @@ public Schema select(String... names) { /** * Creates a projection schema for a subset of columns, selected by name. - *
<p>
- * Names that identify nested fields will select part or all of the field's top-level column. + * + *
<p>
Names that identify nested fields will select part or all of the field's top-level column. * * @param names a List of String names for selected columns * @return a projection schema from this schema, by name @@ -419,8 +430,8 @@ public Schema select(Collection names) { /** * Creates a projection schema for a subset of columns, selected by case insensitive names - *
<p>
- * Names that identify nested fields will select part or all of the field's top-level column. + * + *
<p>
Names that identify nested fields will select part or all of the field's top-level column. * * @param names a List of String names for selected columns * @return a projection schema from this schema, by names @@ -431,8 +442,8 @@ public Schema caseInsensitiveSelect(String... names) { /** * Creates a projection schema for a subset of columns, selected by case insensitive names - *
<p>
- * Names that identify nested fields will select part or all of the field's top-level column. + * + *
<p>
Names that identify nested fields will select part or all of the field's top-level column. * * @param names a List of String names for selected columns * @return a projection schema from this schema, by names @@ -443,12 +454,13 @@ public Schema caseInsensitiveSelect(Collection names) { /** * Checks whether this schema is equivalent to another schema while ignoring the schema ID. + * * @param anotherSchema another schema * @return true if this schema is equivalent to the given schema */ public boolean sameSchema(Schema anotherSchema) { - return asStruct().equals(anotherSchema.asStruct()) && - identifierFieldIds().equals(anotherSchema.identifierFieldIds()); + return asStruct().equals(anotherSchema.asStruct()) + && identifierFieldIds().equals(anotherSchema.identifierFieldIds()); } private Schema internalSelect(Collection names, boolean caseSensitive) { @@ -479,9 +491,11 @@ private String identifierFieldToString(Types.NestedField field) { @Override public String toString() { - return String.format("table {\n%s\n}", - NEWLINE.join(struct.fields().stream() - .map(this::identifierFieldToString) - .collect(Collectors.toList()))); + return String.format( + "table {\n%s\n}", + NEWLINE.join( + struct.fields().stream() + .map(this::identifierFieldToString) + .collect(Collectors.toList()))); } } diff --git a/api/src/main/java/org/apache/iceberg/Snapshot.java b/api/src/main/java/org/apache/iceberg/Snapshot.java index cfaa7f9b24e3..e998fbc4b6c4 100644 --- a/api/src/main/java/org/apache/iceberg/Snapshot.java +++ b/api/src/main/java/org/apache/iceberg/Snapshot.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg; import java.io.Serializable; @@ -26,17 +25,17 @@ /** * A snapshot of the data in a table at a point in time. - *
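To illustrate the projection and comparison methods above, a sketch with assumed column names:

import org.apache.iceberg.Schema;

static void projectColumns(Schema schema) {
  // Nested names select part or all of the enclosing top-level column.
  Schema projection = schema.select("id", "user.last_name");
  Schema looseProjection = schema.caseInsensitiveSelect("ID", "USER.LAST_NAME");

  // Compares struct and identifier fields while ignoring the schema id.
  boolean equivalent = projection.sameSchema(looseProjection);
  System.out.println("projections equivalent: " + equivalent);
}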
<p>
- * A snapshot consist of one or more file manifests, and the complete table contents is the union + * + *
<p>
A snapshot consist of one or more file manifests, and the complete table contents is the union * of all the data files in those manifests. - *
<p>
- * Snapshots are created by table operations, like {@link AppendFiles} and {@link RewriteFiles}. + * + *
<p>
Snapshots are created by table operations, like {@link AppendFiles} and {@link RewriteFiles}. */ public interface Snapshot extends Serializable { /** * Return this snapshot's sequence number. - *
<p>
- * Sequence numbers are assigned when a snapshot is committed. + * + *
<p>
Sequence numbers are assigned when a snapshot is committed. * * @return a long sequence number */ @@ -58,8 +57,8 @@ public interface Snapshot extends Serializable { /** * Return this snapshot's timestamp. - *
<p>
- * This timestamp is the same as those produced by {@link System#currentTimeMillis()}. + * + *
<p>
This timestamp is the same as those produced by {@link System#currentTimeMillis()}. * * @return a long timestamp in milliseconds */ @@ -69,7 +68,8 @@ public interface Snapshot extends Serializable { * Return all {@link ManifestFile} instances for either data or delete manifests in this snapshot. * * @return a list of ManifestFile - * @deprecated since 0.14.0, will be removed in 1.0.0; Use {@link Snapshot#allManifests(FileIO)} instead. + * @deprecated since 0.14.0, will be removed in 1.0.0; Use {@link Snapshot#allManifests(FileIO)} + * instead. */ @Deprecated List allManifests(); @@ -86,7 +86,8 @@ public interface Snapshot extends Serializable { * Return a {@link ManifestFile} for each data manifest in this snapshot. * * @return a list of ManifestFile - * @deprecated since 0.14.0, will be removed in 1.0.0; Use {@link Snapshot#dataManifests(FileIO)} instead. + * @deprecated since 0.14.0, will be removed in 1.0.0; Use {@link Snapshot#dataManifests(FileIO)} + * instead. */ @Deprecated List dataManifests(); @@ -103,7 +104,8 @@ public interface Snapshot extends Serializable { * Return a {@link ManifestFile} for each delete manifest in this snapshot. * * @return a list of ManifestFile - * @deprecated since 0.14.0, will be removed in 1.0.0; Use {@link Snapshot#deleteManifests(FileIO)} instead. + * @deprecated since 0.14.0, will be removed in 1.0.0; Use {@link + * Snapshot#deleteManifests(FileIO)} instead. */ @Deprecated List deleteManifests(); @@ -133,20 +135,21 @@ public interface Snapshot extends Serializable { /** * Return all data files added to the table in this snapshot. - *
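A sketch of the replacement the deprecation notes above point to, reading manifests through the table's FileIO; the table is assumed to be already loaded.

import java.util.List;
import org.apache.iceberg.ManifestFile;
import org.apache.iceberg.Snapshot;
import org.apache.iceberg.Table;

static void inspectCurrentSnapshot(Table table) {
  Snapshot current = table.currentSnapshot();
  long sequenceNumber = current.sequenceNumber();
  long committedAtMs = current.timestampMillis(); // same clock as System.currentTimeMillis()

  // Replaces the no-arg accessors deprecated in 0.14.0:
  List<ManifestFile> all = current.allManifests(table.io());
  List<ManifestFile> data = current.dataManifests(table.io());
  List<ManifestFile> deletes = current.deleteManifests(table.io());
  System.out.printf("seq=%d ts=%d manifests=%d (%d data, %d delete)%n",
      sequenceNumber, committedAtMs, all.size(), data.size(), deletes.size());
}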
<p>
- * The files returned include the following columns: file_path, file_format, partition, + * + *
<p>
The files returned include the following columns: file_path, file_format, partition, * record_count, and file_size_in_bytes. Other columns will be null. * * @return all data files added to the table in this snapshot. - * @deprecated since 0.14.0, will be removed in 1.0.0; Use {@link Snapshot#addedDataFiles(FileIO)} instead. + * @deprecated since 0.14.0, will be removed in 1.0.0; Use {@link Snapshot#addedDataFiles(FileIO)} + * instead. */ @Deprecated Iterable addedFiles(); /** * Return all data files added to the table in this snapshot. - *
<p>
- * The files returned include the following columns: file_path, file_format, partition, + * + *
<p>
The files returned include the following columns: file_path, file_format, partition, * record_count, and file_size_in_bytes. Other columns will be null. * * @param io a {@link FileIO} instance used for reading files from storage @@ -156,20 +159,21 @@ public interface Snapshot extends Serializable { /** * Return all data files deleted from the table in this snapshot. - *
<p>
- * The files returned include the following columns: file_path, file_format, partition, + * + *
<p>
The files returned include the following columns: file_path, file_format, partition, * record_count, and file_size_in_bytes. Other columns will be null. * * @return all data files deleted from the table in this snapshot. - * @deprecated since 0.14.0, will be removed in 1.0.0; Use {@link Snapshot#removedDataFiles(FileIO)} instead. + * @deprecated since 0.14.0, will be removed in 1.0.0; Use {@link + * Snapshot#removedDataFiles(FileIO)} instead. */ @Deprecated Iterable deletedFiles(); /** * Return all data files removed from the table in this snapshot. - *
<p>
- * The files returned include the following columns: file_path, file_format, partition, + * + *
<p>
The files returned include the following columns: file_path, file_format, partition, * record_count, and file_size_in_bytes. Other columns will be null. * * @param io a {@link FileIO} instance used for reading files from storage @@ -179,28 +183,30 @@ public interface Snapshot extends Serializable { /** * Return all delete files added to the table in this snapshot. - *
<p>
- * The files returned include the following columns: file_path, file_format, partition, + * + *
<p>
The files returned include the following columns: file_path, file_format, partition, * record_count, and file_size_in_bytes. Other columns will be null. * * @param io a {@link FileIO} instance used for reading files from storage * @return all delete files added to the table in this snapshot */ default Iterable addedDeleteFiles(FileIO io) { - throw new UnsupportedOperationException(this.getClass().getName() + " doesn't implement addedDeleteFiles"); + throw new UnsupportedOperationException( + this.getClass().getName() + " doesn't implement addedDeleteFiles"); } /** * Return all delete files removed from the table in this snapshot. - *
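A sketch of listing the files a snapshot added or removed using the FileIO overloads above; only the columns listed above are guaranteed to be populated on the returned files.

import org.apache.iceberg.DataFile;
import org.apache.iceberg.DeleteFile;
import org.apache.iceberg.Snapshot;
import org.apache.iceberg.Table;

static void summarizeChanges(Table table, Snapshot snapshot) {
  for (DataFile added : snapshot.addedDataFiles(table.io())) {
    System.out.println("added data file: " + added.path());
  }
  for (DataFile removed : snapshot.removedDataFiles(table.io())) {
    System.out.println("removed data file: " + removed.path());
  }
  for (DeleteFile added : snapshot.addedDeleteFiles(table.io())) {
    System.out.println("added delete file: " + added.path());
  }
}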
<p>
- * The files returned include the following columns: file_path, file_format, partition, + * + *
<p>
The files returned include the following columns: file_path, file_format, partition, * record_count, and file_size_in_bytes. Other columns will be null. * * @param io a {@link FileIO} instance used for reading files from storage * @return all delete files removed from the table in this snapshot */ default Iterable removedDeleteFiles(FileIO io) { - throw new UnsupportedOperationException(this.getClass().getName() + " doesn't implement removedDeleteFiles"); + throw new UnsupportedOperationException( + this.getClass().getName() + " doesn't implement removedDeleteFiles"); } /** @@ -211,7 +217,8 @@ default Iterable removedDeleteFiles(FileIO io) { String manifestListLocation(); /** - * Return the id of the schema used when this snapshot was created, or null if this information is not available. + * Return the id of the schema used when this snapshot was created, or null if this information is + * not available. * * @return schema id associated with this snapshot */ diff --git a/api/src/main/java/org/apache/iceberg/SnapshotRef.java b/api/src/main/java/org/apache/iceberg/SnapshotRef.java index 59ba74151175..917281a9d228 100644 --- a/api/src/main/java/org/apache/iceberg/SnapshotRef.java +++ b/api/src/main/java/org/apache/iceberg/SnapshotRef.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg; import java.io.Serializable; @@ -86,11 +85,11 @@ public boolean equals(Object other) { } SnapshotRef ref = (SnapshotRef) other; - return ref.snapshotId == snapshotId && - Objects.equals(ref.type(), type) && - Objects.equals(ref.maxRefAgeMs(), maxRefAgeMs) && - Objects.equals(ref.minSnapshotsToKeep(), minSnapshotsToKeep) && - Objects.equals(ref.maxSnapshotAgeMs(), maxSnapshotAgeMs); + return ref.snapshotId == snapshotId + && Objects.equals(ref.type(), type) + && Objects.equals(ref.maxRefAgeMs(), maxRefAgeMs) + && Objects.equals(ref.minSnapshotsToKeep(), minSnapshotsToKeep) + && Objects.equals(ref.maxSnapshotAgeMs(), maxSnapshotAgeMs); } @Override @@ -100,8 +99,7 @@ public int hashCode() { this.type, this.maxRefAgeMs, this.maxSnapshotAgeMs, - this.minSnapshotsToKeep - ); + this.minSnapshotsToKeep); } public static Builder tagBuilder(long snapshotId) { @@ -120,11 +118,13 @@ public static Builder builderFrom(SnapshotRef ref) { } /** - * Creates a ref builder from the given ref and its properties but the ref will now point to the given snapshotId. + * Creates a ref builder from the given ref and its properties but the ref will now point to the + * given snapshotId. * * @param ref Ref to build from * @param snapshotId snapshotID to use. 
- * @return ref builder with the same retention properties as given ref, but the ref will point to the passed in id + * @return ref builder with the same retention properties as given ref, but the ref will point to + * the passed in id */ public static Builder builderFrom(SnapshotRef ref, long snapshotId) { return new Builder(ref.type(), snapshotId) @@ -152,25 +152,28 @@ public static class Builder { } public Builder minSnapshotsToKeep(Integer value) { - Preconditions.checkArgument(value == null || !type.equals(SnapshotRefType.TAG), + Preconditions.checkArgument( + value == null || !type.equals(SnapshotRefType.TAG), "Tags do not support setting minSnapshotsToKeep"); - Preconditions.checkArgument(value == null || value > 0, - "Min snapshots to keep must be greater than 0"); + Preconditions.checkArgument( + value == null || value > 0, "Min snapshots to keep must be greater than 0"); this.minSnapshotsToKeep = value; return this; } public Builder maxSnapshotAgeMs(Long value) { - Preconditions.checkArgument(value == null || !type.equals(SnapshotRefType.TAG), + Preconditions.checkArgument( + value == null || !type.equals(SnapshotRefType.TAG), "Tags do not support setting maxSnapshotAgeMs"); - Preconditions.checkArgument(value == null || value > 0, - "Max snapshot age must be greater than 0 ms"); + Preconditions.checkArgument( + value == null || value > 0, "Max snapshot age must be greater than 0 ms"); this.maxSnapshotAgeMs = value; return this; } public Builder maxRefAgeMs(Long value) { - Preconditions.checkArgument(value == null || value > 0, "Max reference age must be greater than 0"); + Preconditions.checkArgument( + value == null || value > 0, "Max reference age must be greater than 0"); this.maxRefAgeMs = value; return this; } diff --git a/api/src/main/java/org/apache/iceberg/SnapshotRefType.java b/api/src/main/java/org/apache/iceberg/SnapshotRefType.java index 18d5d1f2afec..b878d9d0ddc6 100644 --- a/api/src/main/java/org/apache/iceberg/SnapshotRefType.java +++ b/api/src/main/java/org/apache/iceberg/SnapshotRefType.java @@ -16,10 +16,9 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg; enum SnapshotRefType { - BRANCH, - TAG + BRANCH, + TAG } diff --git a/api/src/main/java/org/apache/iceberg/SnapshotUpdate.java b/api/src/main/java/org/apache/iceberg/SnapshotUpdate.java index c1742f82ca84..2c5ab790083e 100644 --- a/api/src/main/java/org/apache/iceberg/SnapshotUpdate.java +++ b/api/src/main/java/org/apache/iceberg/SnapshotUpdate.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg; import java.util.concurrent.ExecutorService; diff --git a/api/src/main/java/org/apache/iceberg/SortDirection.java b/api/src/main/java/org/apache/iceberg/SortDirection.java index 3be60b6520b9..5436f14c9c0d 100644 --- a/api/src/main/java/org/apache/iceberg/SortDirection.java +++ b/api/src/main/java/org/apache/iceberg/SortDirection.java @@ -16,9 +16,9 @@ * specific language governing permissions and limitations * under the License. 
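A usage sketch for the SnapshotRef builders whose argument checks appear above; the snapshot id and retention values are made up for illustration.

import org.apache.iceberg.SnapshotRef;

static void buildRefs(long snapshotId) {
  // Branches may carry snapshot-retention settings; tags only support a max ref age.
  SnapshotRef branch =
      SnapshotRef.branchBuilder(snapshotId)
          .minSnapshotsToKeep(10)
          .maxSnapshotAgeMs(7 * 24 * 60 * 60 * 1000L)
          .build();

  SnapshotRef tag =
      SnapshotRef.tagBuilder(snapshotId).maxRefAgeMs(30 * 24 * 60 * 60 * 1000L).build();
}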
*/ - package org.apache.iceberg; public enum SortDirection { - ASC, DESC + ASC, + DESC } diff --git a/api/src/main/java/org/apache/iceberg/SortField.java b/api/src/main/java/org/apache/iceberg/SortField.java index f980d6f8e4de..91a82b2bd61d 100644 --- a/api/src/main/java/org/apache/iceberg/SortField.java +++ b/api/src/main/java/org/apache/iceberg/SortField.java @@ -16,16 +16,13 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg; import java.io.Serializable; import java.util.Objects; import org.apache.iceberg.transforms.Transform; -/** - * A field in a {@link SortOrder}. - */ +/** A field in a {@link SortOrder}. */ public class SortField implements Serializable { private final Transform transform; @@ -52,23 +49,17 @@ public Transform transform() { return (Transform) transform; } - /** - * Returns the field id of the source field in the {@link SortOrder sort order's} table schema - */ + /** Returns the field id of the source field in the {@link SortOrder sort order's} table schema */ public int sourceId() { return sourceId; } - /** - * Returns the sort direction - */ + /** Returns the sort direction */ public SortDirection direction() { return direction; } - /** - * Returns the null order - */ + /** Returns the null order */ public NullOrder nullOrder() { return nullOrder; } @@ -82,7 +73,9 @@ public NullOrder nullOrder() { public boolean satisfies(SortField other) { if (Objects.equals(this, other)) { return true; - } else if (sourceId != other.sourceId || direction != other.direction || nullOrder != other.nullOrder) { + } else if (sourceId != other.sourceId + || direction != other.direction + || nullOrder != other.nullOrder) { return false; } @@ -103,10 +96,10 @@ public boolean equals(Object other) { } SortField that = (SortField) other; - return transform.equals(that.transform) && - sourceId == that.sourceId && - direction == that.direction && - nullOrder == that.nullOrder; + return transform.equals(that.transform) + && sourceId == that.sourceId + && direction == that.direction + && nullOrder == that.nullOrder; } @Override diff --git a/api/src/main/java/org/apache/iceberg/SortOrder.java b/api/src/main/java/org/apache/iceberg/SortOrder.java index 91b6177ecec7..ee02318de6fd 100644 --- a/api/src/main/java/org/apache/iceberg/SortOrder.java +++ b/api/src/main/java/org/apache/iceberg/SortOrder.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg; import java.io.Serializable; @@ -39,11 +38,10 @@ import org.apache.iceberg.types.Type; import org.apache.iceberg.types.Types; -/** - * A sort order that defines how data and delete files should be ordered in a table. - */ +/** A sort order that defines how data and delete files should be ordered in a table. 
*/ public class SortOrder implements Serializable { - private static final SortOrder UNSORTED_ORDER = new SortOrder(new Schema(), 0, Collections.emptyList()); + private static final SortOrder UNSORTED_ORDER = + new SortOrder(new Schema(), 0, Collections.emptyList()); private final Schema schema; private final int orderId; @@ -57,37 +55,27 @@ private SortOrder(Schema schema, int orderId, List fields) { this.fields = fields.toArray(new SortField[0]); } - /** - * Returns the {@link Schema} for this sort order - */ + /** Returns the {@link Schema} for this sort order */ public Schema schema() { return schema; } - /** - * Returns the ID of this sort order - */ + /** Returns the ID of this sort order */ public int orderId() { return orderId; } - /** - * Returns the list of {@link SortField sort fields} for this sort order - */ + /** Returns the list of {@link SortField sort fields} for this sort order */ public List fields() { return lazyFieldList(); } - /** - * Returns true if the sort order is sorted - */ + /** Returns true if the sort order is sorted */ public boolean isSorted() { return fields.length >= 1; } - /** - * Returns true if the sort order is unsorted - */ + /** Returns true if the sort order is unsorted */ public boolean isUnsorted() { return fields.length < 1; } @@ -139,7 +127,8 @@ public UnboundSortOrder toUnbound() { UnboundSortOrder.Builder builder = UnboundSortOrder.builder().withOrderId(orderId); for (SortField field : fields) { - builder.addSortField(field.transform().toString(), field.sourceId(), field.direction(), field.nullOrder()); + builder.addSortField( + field.transform().toString(), field.sourceId(), field.direction(), field.nullOrder()); } return builder.build(); @@ -198,8 +187,8 @@ public static Builder builderFor(Schema schema) { /** * A builder used to create valid {@link SortOrder sort orders}. - *
<p>
- * Call {@link #builderFor(Schema)} to create a new builder. + * + *
<p>
Call {@link #builderFor(Schema)} to create a new builder. */ public static class Builder implements SortOrderBuilder { private final Schema schema; @@ -243,7 +232,6 @@ public Builder sortBy(Term term, SortDirection direction, NullOrder nullOrder) { return addSortField(term, direction, nullOrder); } - public Builder withOrderId(int newOrderId) { this.orderId = newOrderId; return this; @@ -256,7 +244,8 @@ public Builder caseSensitive(boolean sortCaseSensitive) { private Builder addSortField(Term term, SortDirection direction, NullOrder nullOrder) { Preconditions.checkArgument(term instanceof UnboundTerm, "Term must be unbound"); - // ValidationException is thrown by bind if binding fails so we assume that boundTerm is correct + // ValidationException is thrown by bind if binding fails so we assume that boundTerm is + // correct BoundTerm boundTerm = ((UnboundTerm) term).bind(schema.asStruct(), caseSensitive); int sourceId = boundTerm.ref().fieldId(); SortField sortField = new SortField(toTransform(boundTerm), sourceId, direction, nullOrder); @@ -264,14 +253,16 @@ private Builder addSortField(Term term, SortDirection direction, NullOrder nullO return this; } - Builder addSortField(String transformAsString, int sourceId, SortDirection direction, NullOrder nullOrder) { + Builder addSortField( + String transformAsString, int sourceId, SortDirection direction, NullOrder nullOrder) { Types.NestedField column = schema.findField(sourceId); ValidationException.check(column != null, "Cannot find source column: %s", sourceId); Transform transform = Transforms.fromString(column.type(), transformAsString); return addSortField(transform, sourceId, direction, nullOrder); } - Builder addSortField(Transform transform, int sourceId, SortDirection direction, NullOrder nullOrder) { + Builder addSortField( + Transform transform, int sourceId, SortDirection direction, NullOrder nullOrder) { SortField sortField = new SortField(transform, sourceId, direction, nullOrder); fields.add(sortField); return this; @@ -306,7 +297,8 @@ SortOrder buildUnchecked() { } else if (term instanceof BoundTransform) { return ((BoundTransform) term).transform(); } else { - throw new ValidationException("Invalid term: %s, expected either a bound reference or transform", term); + throw new ValidationException( + "Invalid term: %s, expected either a bound reference or transform", term); } } } @@ -315,15 +307,16 @@ public static void checkCompatibility(SortOrder sortOrder, Schema schema) { for (SortField field : sortOrder.fields) { Type sourceType = schema.findType(field.sourceId()); ValidationException.check( - sourceType != null, - "Cannot find source column for sort field: %s", field); + sourceType != null, "Cannot find source column for sort field: %s", field); ValidationException.check( sourceType.isPrimitiveType(), - "Cannot sort by non-primitive source field: %s", sourceType); + "Cannot sort by non-primitive source field: %s", + sourceType); ValidationException.check( field.transform().canTransform(sourceType), "Invalid source type %s for transform: %s", - sourceType, field.transform()); + sourceType, + field.transform()); } } } diff --git a/api/src/main/java/org/apache/iceberg/SortOrderBuilder.java b/api/src/main/java/org/apache/iceberg/SortOrderBuilder.java index ac622a2f0531..1538586df11d 100644 --- a/api/src/main/java/org/apache/iceberg/SortOrderBuilder.java +++ b/api/src/main/java/org/apache/iceberg/SortOrderBuilder.java @@ -16,15 +16,12 @@ * specific language governing permissions and limitations * under the License. 
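A usage sketch for the sort-order builder above; the column names and the bucket transform are assumptions for illustration.

import org.apache.iceberg.NullOrder;
import org.apache.iceberg.Schema;
import org.apache.iceberg.SortOrder;
import org.apache.iceberg.expressions.Expressions;

static SortOrder newOrder(Schema schema) {
  // Terms may be plain column references or transforms; the null order is set per field.
  return SortOrder.builderFor(schema)
      .asc("id")
      .desc(Expressions.bucket("user.last_name", 16), NullOrder.NULLS_LAST)
      .build();
}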
*/ - package org.apache.iceberg; import org.apache.iceberg.expressions.Expressions; import org.apache.iceberg.expressions.Term; -/** - * Methods for building a sort order. - */ +/** Methods for building a sort order. */ public interface SortOrderBuilder { /** diff --git a/api/src/main/java/org/apache/iceberg/SplittableScanTask.java b/api/src/main/java/org/apache/iceberg/SplittableScanTask.java index 0385e55972b9..4df61cb90861 100644 --- a/api/src/main/java/org/apache/iceberg/SplittableScanTask.java +++ b/api/src/main/java/org/apache/iceberg/SplittableScanTask.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg; /** @@ -26,10 +25,12 @@ */ public interface SplittableScanTask extends ScanTask { /** - * Attempts to split this scan task into several smaller scan tasks, each close to {@code splitSize} size. - *
<p>
- * Note the target split size is just guidance and the actual split size may be either smaller or larger. - * File formats like Parquet may leverage the row group offset information while splitting tasks. + * Attempts to split this scan task into several smaller scan tasks, each close to {@code + * splitSize} size. + * + *
<p>
Note the target split size is just guidance and the actual split size may be either smaller + * or larger. File formats like Parquet may leverage the row group offset information while + * splitting tasks. * * @param targetSplitSize the target size of each new scan task in bytes * @return an Iterable of smaller tasks diff --git a/api/src/main/java/org/apache/iceberg/StructLike.java b/api/src/main/java/org/apache/iceberg/StructLike.java index 0d3ddb40c788..9ff59aa9ac58 100644 --- a/api/src/main/java/org/apache/iceberg/StructLike.java +++ b/api/src/main/java/org/apache/iceberg/StructLike.java @@ -16,13 +16,12 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg; /** * Interface for accessing data by position in a schema. - *
<p>
- * This interface supports accessing data in top-level fields, not in nested fields. + * + *
<p>
This interface supports accessing data in top-level fields, not in nested fields. */ public interface StructLike { int size(); diff --git a/api/src/main/java/org/apache/iceberg/Table.java b/api/src/main/java/org/apache/iceberg/Table.java index b83f07c6f8ee..8278c99bfc2d 100644 --- a/api/src/main/java/org/apache/iceberg/Table.java +++ b/api/src/main/java/org/apache/iceberg/Table.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg; import java.util.List; @@ -25,9 +24,7 @@ import org.apache.iceberg.io.FileIO; import org.apache.iceberg.io.LocationProvider; -/** - * Represents a table. - */ +/** Represents a table. */ public interface Table { /** @@ -39,15 +36,13 @@ default String name() { return toString(); } - /** - * Refresh the current table metadata. - */ + /** Refresh the current table metadata. */ void refresh(); /** * Create a new {@link TableScan scan} for this table. - *
<p>
- * Once a table scan is created, it can be refined to project columns and filter data. + * + *
<p>
Once a table scan is created, it can be refined to project columns and filter data. * * @return a table scan for this table */ @@ -55,8 +50,8 @@ default String name() { /** * Create a new {@link IncrementalAppendScan scan} for this table. - *
<p>
- * Once a scan is created, it can be refined to project columns and filter data. + * + *
<p>
Once a scan is created, it can be refined to project columns and filter data. * * @return an incremental scan for appends only snapshots */ @@ -66,8 +61,8 @@ default IncrementalAppendScan newIncrementalAppendScan() { /** * Create a new {@link IncrementalChangelogScan} for this table. - *
<p>
- * Once a scan is created, it can be refined to project columns and filter data. + * + *
<p>
Once a scan is created, it can be refined to project columns and filter data. * * @return an incremental changelog scan */ @@ -168,7 +163,8 @@ default IncrementalChangelogScan newIncrementalChangelogScan() { UpdateSchema updateSchema(); /** - * Create a new {@link UpdatePartitionSpec} to alter the partition spec of this table and commit the change. + * Create a new {@link UpdatePartitionSpec} to alter the partition spec of this table and commit + * the change. * * @return a new {@link UpdatePartitionSpec} */ @@ -204,13 +200,13 @@ default IncrementalChangelogScan newIncrementalChangelogScan() { /** * Create a new {@link AppendFiles append API} to add files to this table and commit. - *
<p>
- * Using this method signals to the underlying implementation that the append should not perform - * extra work in order to commit quickly. Fast appends are not recommended for normal writes - * because the fast commit may cause split planning to slow down over time. - *
<p>
- * Implementations may not support fast appends, in which case this will return the same appender - * as {@link #newAppend()}. + * + *
<p>
Using this method signals to the underlying implementation that the append should not + * perform extra work in order to commit quickly. Fast appends are not recommended for normal + * writes because the fast commit may cause split planning to slow down over time. + * + *
<p>
Implementations may not support fast appends, in which case this will return the same + * appender as {@link #newAppend()}. * * @return a new {@link AppendFiles} */ @@ -226,8 +222,8 @@ default AppendFiles newFastAppend() { RewriteFiles newRewrite(); /** - * Create a new {@link RewriteManifests rewrite manifests API} to replace manifests for this - * table and commit. + * Create a new {@link RewriteManifests rewrite manifests API} to replace manifests for this table + * and commit. * * @return a new {@link RewriteManifests} */ @@ -241,7 +237,8 @@ default AppendFiles newFastAppend() { OverwriteFiles newOverwrite(); /** - * Create a new {@link RowDelta row-level delta API} to remove or replace rows in existing data files. + * Create a new {@link RowDelta row-level delta API} to remove or replace rows in existing data + * files. * * @return a new {@link RowDelta} */ @@ -250,9 +247,10 @@ default AppendFiles newFastAppend() { /** * Not recommended: Create a new {@link ReplacePartitions replace partitions API} to dynamically * overwrite partitions in the table with new data. - *
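A sketch of the fast-append path described above; the table and data file are assumed inputs.

import org.apache.iceberg.AppendFiles;
import org.apache.iceberg.DataFile;
import org.apache.iceberg.Table;

static void appendQuickly(Table table, DataFile file) {
  // Writes a new manifest instead of rewriting existing ones; implementations that do not
  // support fast appends fall back to the behavior of newAppend().
  AppendFiles append = table.newFastAppend();
  append.appendFile(file);
  append.commit();
}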
<p>
- * This is provided to implement SQL compatible with Hive table operations but is not recommended. - * Instead, use the {@link OverwriteFiles overwrite API} to explicitly overwrite data. + * + *
<p>
This is provided to implement SQL compatible with Hive table operations but is not + * recommended. Instead, use the {@link OverwriteFiles overwrite API} to explicitly overwrite + * data. * * @return a new {@link ReplacePartitions} */ @@ -282,7 +280,9 @@ default AppendFiles newFastAppend() { Rollback rollback(); /** - * Create a new {@link ManageSnapshots manage snapshots API} to manage snapshots in this table and commit. + * Create a new {@link ManageSnapshots manage snapshots API} to manage snapshots in this table and + * commit. + * * @return a new {@link ManageSnapshots} */ ManageSnapshots manageSnapshots(); @@ -294,18 +294,15 @@ default AppendFiles newFastAppend() { */ Transaction newTransaction(); - /** - * Returns a {@link FileIO} to read and write table data and metadata files. - */ + /** Returns a {@link FileIO} to read and write table data and metadata files. */ FileIO io(); /** - * Returns an {@link org.apache.iceberg.encryption.EncryptionManager} to encrypt and decrypt data files. + * Returns an {@link org.apache.iceberg.encryption.EncryptionManager} to encrypt and decrypt data + * files. */ EncryptionManager encryption(); - /** - * Returns a {@link LocationProvider} to provide locations for new data files. - */ + /** Returns a {@link LocationProvider} to provide locations for new data files. */ LocationProvider locationProvider(); } diff --git a/api/src/main/java/org/apache/iceberg/TableScan.java b/api/src/main/java/org/apache/iceberg/TableScan.java index fedeb05203f1..de0e76b8b136 100644 --- a/api/src/main/java/org/apache/iceberg/TableScan.java +++ b/api/src/main/java/org/apache/iceberg/TableScan.java @@ -16,15 +16,12 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg; import org.apache.iceberg.expressions.Expression; import org.apache.iceberg.relocated.com.google.common.collect.Lists; -/** - * API for configuring a table scan. - */ +/** API for configuring a table scan. */ public interface TableScan extends Scan { /** * Returns the {@link Table} from which this scan loads data. @@ -73,25 +70,25 @@ default TableScan select(String... columns) { Expression filter(); /** - * Create a new {@link TableScan} to read appended data from {@code fromSnapshotId} exclusive to {@code toSnapshotId} - * inclusive. + * Create a new {@link TableScan} to read appended data from {@code fromSnapshotId} exclusive to + * {@code toSnapshotId} inclusive. * * @param fromSnapshotId the last snapshot id read by the user, exclusive * @param toSnapshotId read append data up to this snapshot id - * @return a table scan which can read append data from {@code fromSnapshotId} - * exclusive and up to {@code toSnapshotId} inclusive + * @return a table scan which can read append data from {@code fromSnapshotId} exclusive and up to + * {@code toSnapshotId} inclusive */ default TableScan appendsBetween(long fromSnapshotId, long toSnapshotId) { throw new UnsupportedOperationException("Incremental scan is not supported"); } /** - * Create a new {@link TableScan} to read appended data from {@code fromSnapshotId} exclusive to the current snapshot - * inclusive. + * Create a new {@link TableScan} to read appended data from {@code fromSnapshotId} exclusive to + * the current snapshot inclusive. 
* * @param fromSnapshotId - the last snapshot id read by the user, exclusive - * @return a table scan which can read append data from {@code fromSnapshotId} - * exclusive and up to current snapshot inclusive + * @return a table scan which can read append data from {@code fromSnapshotId} exclusive and up to + * current snapshot inclusive */ default TableScan appendsAfter(long fromSnapshotId) { throw new UnsupportedOperationException("Incremental scan is not supported"); @@ -99,16 +96,18 @@ default TableScan appendsAfter(long fromSnapshotId) { /** * Returns the {@link Snapshot} that will be used by this scan. - *
<p>
- * If the snapshot was not configured using {@link #asOfTime(long)} or {@link #useSnapshot(long)}, the current table - * snapshot will be used. + * + *
<p>
If the snapshot was not configured using {@link #asOfTime(long)} or {@link + * #useSnapshot(long)}, the current table snapshot will be used. * * @return the Snapshot this scan will use */ Snapshot snapshot(); /** - * Returns whether this scan should apply column name case sensitiveness as per {@link Scan#caseSensitive(boolean)}. + * Returns whether this scan should apply column name case sensitiveness as per {@link + * Scan#caseSensitive(boolean)}. + * * @return true if case sensitive, false otherwise. */ boolean isCaseSensitive(); diff --git a/api/src/main/java/org/apache/iceberg/Tables.java b/api/src/main/java/org/apache/iceberg/Tables.java index 1c1daafed506..eae6146a76a1 100644 --- a/api/src/main/java/org/apache/iceberg/Tables.java +++ b/api/src/main/java/org/apache/iceberg/Tables.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg; import java.util.Map; @@ -25,8 +24,8 @@ /** * Generic interface for creating and loading a table implementation. * - * The 'tableIdentifier' field should be interpreted by the underlying - * implementation (e.g. database.table_name) + *
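A sketch of configuring an incremental append scan as described above; the column name and filter are assumptions.

import org.apache.iceberg.CombinedScanTask;
import org.apache.iceberg.Table;
import org.apache.iceberg.TableScan;
import org.apache.iceberg.expressions.Expressions;
import org.apache.iceberg.io.CloseableIterable;

static CloseableIterable<CombinedScanTask> planIncremental(Table table, long fromSnapshotId) {
  // Reads only data appended after fromSnapshotId (exclusive) up to the current snapshot.
  TableScan scan =
      table.newScan()
          .caseSensitive(false)
          .select("id")
          .filter(Expressions.notNull("id"))
          .appendsAfter(fromSnapshotId);
  return scan.planTasks();
}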
<p>
The 'tableIdentifier' field should be interpreted by the underlying implementation (e.g. + * database.table_name) */ public interface Tables { default Table create(Schema schema, String tableIdentifier) { @@ -37,16 +36,19 @@ default Table create(Schema schema, PartitionSpec spec, String tableIdentifier) return create(schema, spec, ImmutableMap.of(), tableIdentifier); } - default Table create(Schema schema, PartitionSpec spec, Map properties, String tableIdentifier) { + default Table create( + Schema schema, PartitionSpec spec, Map properties, String tableIdentifier) { return create(schema, spec, SortOrder.unsorted(), properties, tableIdentifier); } - default Table create(Schema schema, - PartitionSpec spec, - SortOrder order, - Map properties, - String tableIdentifier) { - throw new UnsupportedOperationException(this.getClass().getName() + " does not implement create with a sort order"); + default Table create( + Schema schema, + PartitionSpec spec, + SortOrder order, + Map properties, + String tableIdentifier) { + throw new UnsupportedOperationException( + this.getClass().getName() + " does not implement create with a sort order"); } Table load(String tableIdentifier); diff --git a/api/src/main/java/org/apache/iceberg/Transaction.java b/api/src/main/java/org/apache/iceberg/Transaction.java index 609f86d1a098..9368150e69fc 100644 --- a/api/src/main/java/org/apache/iceberg/Transaction.java +++ b/api/src/main/java/org/apache/iceberg/Transaction.java @@ -16,15 +16,12 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg; import org.apache.iceberg.exceptions.CommitFailedException; import org.apache.iceberg.exceptions.ValidationException; -/** - * A transaction for performing multiple updates to a table. - */ +/** A transaction for performing multiple updates to a table. */ public interface Transaction { /** * Return the {@link Table} that this transaction will update. @@ -77,13 +74,13 @@ public interface Transaction { /** * Create a new {@link AppendFiles append API} to add files to this table. - *
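A sketch of the Tables contract above using HadoopTables from iceberg-core, for which the identifier is interpreted as a filesystem path; the location string is an assumed value.

import org.apache.hadoop.conf.Configuration;
import org.apache.iceberg.PartitionSpec;
import org.apache.iceberg.Schema;
import org.apache.iceberg.Table;
import org.apache.iceberg.Tables;
import org.apache.iceberg.hadoop.HadoopTables;

static Table createTable(Schema schema, PartitionSpec spec, String location) {
  // For HadoopTables the identifier is a path; other Tables implementations may
  // interpret it as database.table_name.
  Tables tables = new HadoopTables(new Configuration());
  return tables.create(schema, spec, location); // tables.load(location) reopens it later
}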
<p>
- * Using this method signals to the underlying implementation that the append should not perform - * extra work in order to commit quickly. Fast appends are not recommended for normal writes - * because the fast commit may cause split planning to slow down over time. - *
<p>
- * Implementations may not support fast appends, in which case this will return the same appender - * as {@link #newAppend()}. + * + *
<p>
Using this method signals to the underlying implementation that the append should not + * perform extra work in order to commit quickly. Fast appends are not recommended for normal + * writes because the fast commit may cause split planning to slow down over time. + * + *
<p>
Implementations may not support fast appends, in which case this will return the same + * appender as {@link #newAppend()}. * * @return a new {@link AppendFiles} */ @@ -99,7 +96,8 @@ default AppendFiles newFastAppend() { RewriteFiles newRewrite(); /** - * Create a new {@link RewriteManifests rewrite manifests API} to replace manifests for this table. + * Create a new {@link RewriteManifests rewrite manifests API} to replace manifests for this + * table. * * @return a new {@link RewriteManifests} */ @@ -113,7 +111,8 @@ default AppendFiles newFastAppend() { OverwriteFiles newOverwrite(); /** - * Create a new {@link RowDelta row-level delta API} to remove or replace rows in existing data files. + * Create a new {@link RowDelta row-level delta API} to remove or replace rows in existing data + * files. * * @return a new {@link RowDelta} */ @@ -122,9 +121,10 @@ default AppendFiles newFastAppend() { /** * Not recommended: Create a new {@link ReplacePartitions replace partitions API} to dynamically * overwrite partitions in the table with new data. - *
<p>
- * This is provided to implement SQL compatible with Hive table operations but is not recommended. - * Instead, use the {@link OverwriteFiles overwrite API} to explicitly overwrite data. + * + *
<p>
This is provided to implement SQL compatible with Hive table operations but is not + * recommended. Instead, use the {@link OverwriteFiles overwrite API} to explicitly overwrite + * data. * * @return a new {@link ReplacePartitions} */ diff --git a/api/src/main/java/org/apache/iceberg/UnboundPartitionSpec.java b/api/src/main/java/org/apache/iceberg/UnboundPartitionSpec.java index 5cde0f6324a3..530d3d442c58 100644 --- a/api/src/main/java/org/apache/iceberg/UnboundPartitionSpec.java +++ b/api/src/main/java/org/apache/iceberg/UnboundPartitionSpec.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg; import java.util.List; @@ -116,7 +115,8 @@ public String name() { return name; } - private UnboundPartitionField(String transformAsString, int sourceId, Integer partitionId, String name) { + private UnboundPartitionField( + String transformAsString, int sourceId, Integer partitionId, String name) { this.transformAsString = transformAsString; this.sourceId = sourceId; this.partitionId = partitionId; diff --git a/api/src/main/java/org/apache/iceberg/UnboundSortOrder.java b/api/src/main/java/org/apache/iceberg/UnboundSortOrder.java index 32ef18f88faa..1181b665f87c 100644 --- a/api/src/main/java/org/apache/iceberg/UnboundSortOrder.java +++ b/api/src/main/java/org/apache/iceberg/UnboundSortOrder.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg; import java.util.Collections; @@ -24,7 +23,8 @@ import org.apache.iceberg.relocated.com.google.common.collect.Lists; public class UnboundSortOrder { - private static final UnboundSortOrder UNSORTED_ORDER = new UnboundSortOrder(0, Collections.emptyList()); + private static final UnboundSortOrder UNSORTED_ORDER = + new UnboundSortOrder(0, Collections.emptyList()); private final int orderId; private final List fields; @@ -38,7 +38,8 @@ public SortOrder bind(Schema schema) { SortOrder.Builder builder = SortOrder.builderFor(schema).withOrderId(orderId); for (UnboundSortField field : fields) { - builder.addSortField(field.transformAsString, field.sourceId, field.direction, field.nullOrder); + builder.addSortField( + field.transformAsString, field.sourceId, field.direction, field.nullOrder); } return builder.build(); @@ -48,7 +49,8 @@ SortOrder bindUnchecked(Schema schema) { SortOrder.Builder builder = SortOrder.builderFor(schema).withOrderId(orderId); for (UnboundSortField field : fields) { - builder.addSortField(field.transformAsString, field.sourceId, field.direction, field.nullOrder); + builder.addSortField( + field.transformAsString, field.sourceId, field.direction, field.nullOrder); } return builder.buildUnchecked(); @@ -73,22 +75,22 @@ static Builder builder() { /** * A builder used to create {@link UnboundSortOrder unbound sort orders}. - *
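A sketch of grouping several updates into one Transaction as described above; the appended file is an assumed input and nothing becomes visible to readers until commitTransaction() succeeds.

import org.apache.iceberg.DataFile;
import org.apache.iceberg.Table;
import org.apache.iceberg.Transaction;

static void appendInTransaction(Table table, DataFile file) {
  Transaction txn = table.newTransaction();
  txn.newFastAppend().appendFile(file).commit();                        // stages the append
  txn.updateProperties().set("commit.retry.num-retries", "3").commit(); // stages a property change
  txn.commitTransaction();                                              // applies both atomically
}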
<p>
- * Call {@link #builder()} to create a new builder. + * + *
<p>
Call {@link #builder()} to create a new builder. */ static class Builder { private final List fields = Lists.newArrayList(); private Integer orderId = null; - private Builder() { - } + private Builder() {} Builder withOrderId(int newOrderId) { this.orderId = newOrderId; return this; } - Builder addSortField(String transformAsString, int sourceId, SortDirection direction, NullOrder nullOrder) { + Builder addSortField( + String transformAsString, int sourceId, SortDirection direction, NullOrder nullOrder) { fields.add(new UnboundSortField(transformAsString, sourceId, direction, nullOrder)); return this; } @@ -117,7 +119,8 @@ static class UnboundSortField { private final SortDirection direction; private final NullOrder nullOrder; - private UnboundSortField(String transformAsString, int sourceId, SortDirection direction, NullOrder nullOrder) { + private UnboundSortField( + String transformAsString, int sourceId, SortDirection direction, NullOrder nullOrder) { this.transformAsString = transformAsString; this.sourceId = sourceId; this.direction = direction; diff --git a/api/src/main/java/org/apache/iceberg/UpdateLocation.java b/api/src/main/java/org/apache/iceberg/UpdateLocation.java index 4513749b2cf0..646fbb1229ad 100644 --- a/api/src/main/java/org/apache/iceberg/UpdateLocation.java +++ b/api/src/main/java/org/apache/iceberg/UpdateLocation.java @@ -16,12 +16,9 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg; -/** - * API for setting a table's base location. - */ +/** API for setting a table's base location. */ public interface UpdateLocation extends PendingUpdate { /** * Set the table's location. diff --git a/api/src/main/java/org/apache/iceberg/UpdatePartitionSpec.java b/api/src/main/java/org/apache/iceberg/UpdatePartitionSpec.java index 5e85dd991925..f48d590af1ce 100644 --- a/api/src/main/java/org/apache/iceberg/UpdatePartitionSpec.java +++ b/api/src/main/java/org/apache/iceberg/UpdatePartitionSpec.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg; import org.apache.iceberg.exceptions.CommitFailedException; @@ -25,8 +24,8 @@ /** * API for partition spec evolution. - *
<p>
- * When committing, these changes will be applied to the current table metadata. Commit conflicts + * + *
<p>
When committing, these changes will be applied to the current table metadata. Commit conflicts * will not be resolved and will result in a {@link CommitFailedException}. */ public interface UpdatePartitionSpec extends PendingUpdate { @@ -40,48 +39,53 @@ public interface UpdatePartitionSpec extends PendingUpdate { /** * Add a new partition field from a source column. - *
<p>
- * The partition field will be created as an identity partition field for the given source column, with the same name - * as the source column. - *
<p>
- * The source column is located using {@link Schema#findField(String)}. + * + *
<p>
The partition field will be created as an identity partition field for the given source + * column, with the same name as the source column. + * + *
<p>
The source column is located using {@link Schema#findField(String)}. * * @param sourceName source column name in the table schema * @return this for method chaining - * @throws IllegalArgumentException If the an identity partition field for the source already exists, or if this - * change conflicts with other additions, removals, or renames. + * @throws IllegalArgumentException If the an identity partition field for the source already + * exists, or if this change conflicts with other additions, removals, or renames. */ UpdatePartitionSpec addField(String sourceName); /** * Add a new partition field from an {@link Expressions expression term}. - *
<p>
- * The partition field will use the term's transform or the identity transform if the term is a reference. - *
<p>
- * The term's reference is used to locate the source column using {@link Schema#findField(String)}. - *
<p>
- * The new partition field will be named for the source column and the transform. + * + *
<p>
The partition field will use the term's transform or the identity transform if the term is a + * reference. + * + *
<p>
The term's reference is used to locate the source column using {@link + * Schema#findField(String)}. + * + *
<p>
The new partition field will be named for the source column and the transform. * * @param term source column name in the table schema * @return this for method chaining - * @throws IllegalArgumentException If the a partition field for the transform and source already exists, or if this - * change conflicts with other additions, removals, or renames. + * @throws IllegalArgumentException If the a partition field for the transform and source already + * exists, or if this change conflicts with other additions, removals, or renames. */ UpdatePartitionSpec addField(Term term); /** - * Add a new partition field from an {@link Expressions expression term}, with the given partition field name. - *
<p>
- * The partition field will use the term's transform or the identity transform if the term is a reference. - *
<p>
- * The term's reference is used to locate the source column using {@link Schema#findField(String)}. + * Add a new partition field from an {@link Expressions expression term}, with the given partition + * field name. + * + *
<p>
The partition field will use the term's transform or the identity transform if the term is a + * reference. + * + *
<p>
The term's reference is used to locate the source column using {@link + * Schema#findField(String)}. * * @param name name for the partition field * @param term expression for the partition transform * @return this for method chaining - * @throws IllegalArgumentException If the a partition field for the transform and source already exists, if a - * partition field with the given name already exists, or if this change conflicts - * with other additions, removals, or renames. + * @throws IllegalArgumentException If the a partition field for the transform and source already + * exists, if a partition field with the given name already exists, or if this change + * conflicts with other additions, removals, or renames. */ UpdatePartitionSpec addField(String name, Term term); @@ -90,21 +94,21 @@ public interface UpdatePartitionSpec extends PendingUpdate { * * @param name name of the partition field to remove * @return this for method chaining - * @throws IllegalArgumentException If the a partition field with the given name does not exist, or if this change - * conflicts with other additions, removals, or renames. + * @throws IllegalArgumentException If the a partition field with the given name does not exist, + * or if this change conflicts with other additions, removals, or renames. */ UpdatePartitionSpec removeField(String name); /** * Remove a partition field by its transform {@link Expressions expression term}. - *
<p>
- * The partition field with the same transform and source reference will be removed. If the term is a reference and - * does not have a transform, the identity transform is used. + * + *
<p>
The partition field with the same transform and source reference will be removed. If the + * term is a reference and does not have a transform, the identity transform is used. * * @param term expression for the partition transform to remove * @return this for method chaining - * @throws IllegalArgumentException If the a partition field with the given transform and source does not exist, or - * if this change conflicts with other additions, removals, or renames. + * @throws IllegalArgumentException If the a partition field with the given transform and source + * does not exist, or if this change conflicts with other additions, removals, or renames. */ UpdatePartitionSpec removeField(Term term); @@ -114,8 +118,8 @@ public interface UpdatePartitionSpec extends PendingUpdate { * @param name name of the partition field to rename * @param newName replacement name for the partition field * @return this for method chaining - * @throws IllegalArgumentException If name doesn't identify a column in the schema or if this change conflicts with - * other additions, removals, or renames. + * @throws IllegalArgumentException If name doesn't identify a column in the schema or if this + * change conflicts with other additions, removals, or renames. */ UpdatePartitionSpec renameField(String name, String newName); } diff --git a/api/src/main/java/org/apache/iceberg/UpdateProperties.java b/api/src/main/java/org/apache/iceberg/UpdateProperties.java index 28c0675078ea..42df96d9502c 100644 --- a/api/src/main/java/org/apache/iceberg/UpdateProperties.java +++ b/api/src/main/java/org/apache/iceberg/UpdateProperties.java @@ -16,17 +16,16 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg; import java.util.Map; /** * API for updating table properties. - *
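A sketch of partition spec evolution using the addField, removeField, and renameField methods above; every field name here is an assumption.

import org.apache.iceberg.Table;
import org.apache.iceberg.expressions.Expressions;

static void evolveSpec(Table table) {
  table.updateSpec()
      .addField("category")                      // identity partition on a source column
      .addField("ts_day", Expressions.day("ts")) // named transform partition field
      .removeField("legacy_bucket")              // remove by partition field name
      .renameField("shard", "id_bucket")
      .commit();
}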
<p>
- * Apply returns the updated table properties as a map for validation. - *
<p>
- * When committing, these changes will be applied to the current table metadata. Commit conflicts + * + *
<p>
Apply returns the updated table properties as a map for validation. + * + *
<p>
When committing, these changes will be applied to the current table metadata. Commit conflicts * will be resolved by applying the pending changes to the new table metadata. */ public interface UpdateProperties extends PendingUpdate> { @@ -52,6 +51,7 @@ public interface UpdateProperties extends PendingUpdate> { /** * Set the default file format for the table. + * * @param format a file format * @return this */ diff --git a/api/src/main/java/org/apache/iceberg/UpdateSchema.java b/api/src/main/java/org/apache/iceberg/UpdateSchema.java index f10680ea6fe8..2b2a5bfe4c2c 100644 --- a/api/src/main/java/org/apache/iceberg/UpdateSchema.java +++ b/api/src/main/java/org/apache/iceberg/UpdateSchema.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg; import java.util.Collection; @@ -26,23 +25,24 @@ /** * API for schema evolution. - *
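A sketch of the property-update API above; commit.retry.num-retries is a standard table property, while the removed key is a made-up placeholder.

import org.apache.iceberg.FileFormat;
import org.apache.iceberg.Table;

static void tuneProperties(Table table) {
  table.updateProperties()
      .set("commit.retry.num-retries", "5")
      .remove("deprecated.custom-property")
      .defaultFormat(FileFormat.PARQUET)
      .commit();
}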
<p>
- * When committing, these changes will be applied to the current table metadata. Commit conflicts + * + *
<p>
When committing, these changes will be applied to the current table metadata. Commit conflicts * will not be resolved and will result in a {@link CommitFailedException}. */ public interface UpdateSchema extends PendingUpdate { /** * Allow incompatible changes to the schema. - *
<p>
- * Incompatible changes can cause failures when attempting to read older data files. For example, adding a required - * column and attempting to read data files without that column will cause a failure. However, if there are no data - * files that are not compatible with the change, it can be allowed. - *
<p>
- * This option allows incompatible changes to be made to a schema. This should be used when the caller has validated - * that the change will not break. For example, if a column is added as optional but always populated and data older - * than the column addition has been deleted from the table, this can be used with {@link #requireColumn(String)} to - * mark the column required. + * + *
<p>
Incompatible changes can cause failures when attempting to read older data files. For + * example, adding a required column and attempting to read data files without that column will + * cause a failure. However, if there are no data files that are not compatible with the change, + * it can be allowed. + * + *
<p>
This option allows incompatible changes to be made to a schema. This should be used when the + * caller has validated that the change will not break. For example, if a column is added as + * optional but always populated and data older than the column addition has been deleted from the + * table, this can be used with {@link #requireColumn(String)} to mark the column required. * * @return this for method chaining */ @@ -50,12 +50,12 @@ public interface UpdateSchema extends PendingUpdate { /** * Add a new top-level column. - *
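A sketch of the allowIncompatibleChanges and requireColumn flow described above; it is only safe once no remaining data file lacks values for the column.

import org.apache.iceberg.Table;

static void makeColumnRequired(Table table, String column) {
  table.updateSchema()
      .allowIncompatibleChanges() // acknowledge that older files missing the column would break
      .requireColumn(column)
      .commit();
}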
<p>
- * Because "." may be interpreted as a column path separator or may be used in field names, it is - * not allowed in names passed to this method. To add to nested structures or to add fields with - * names that contain ".", use {@link #addColumn(String, String, Type)}. - *
<p>
- * If type is a nested type, its field IDs are reassigned when added to the existing schema. + * + *
<p>
Because "." may be interpreted as a column path separator or may be used in field names, it + * is not allowed in names passed to this method. To add to nested structures or to add fields + * with names that contain ".", use {@link #addColumn(String, String, Type)}. + * + *
<p>
If type is a nested type, its field IDs are reassigned when added to the existing schema. * * @param name name for the new column * @param type type for the new column @@ -68,12 +68,12 @@ default UpdateSchema addColumn(String name, Type type) { /** * Add a new top-level column. - *
<p>
- * Because "." may be interpreted as a column path separator or may be used in field names, it is - * not allowed in names passed to this method. To add to nested structures or to add fields with - * names that contain ".", use {@link #addColumn(String, String, Type)}. - *
<p>
- * If type is a nested type, its field IDs are reassigned when added to the existing schema. + * + *
<p>
Because "." may be interpreted as a column path separator or may be used in field names, it + * is not allowed in names passed to this method. To add to nested structures or to add fields + * with names that contain ".", use {@link #addColumn(String, String, Type)}. + * + *

If type is a nested type, its field IDs are reassigned when added to the existing schema. * * @param name name for the new column * @param type type for the new column @@ -85,17 +85,17 @@ default UpdateSchema addColumn(String name, Type type) { /** * Add a new column to a nested struct. - *

- * The parent name is used to find the parent using {@link Schema#findField(String)}. If the + * + *

The parent name is used to find the parent using {@link Schema#findField(String)}. If the * parent name is null, the new column will be added to the root as a top-level column. If parent * identifies a struct, a new column is added to that struct. If it identifies a list, the column - * is added to the list element struct, and if it identifies a map, the new column is added to - * the map's value struct. - *

- * The given name is used to name the new column and names containing "." are not handled + * is added to the list element struct, and if it identifies a map, the new column is added to the + * map's value struct. + * + *

The given name is used to name the new column and names containing "." are not handled * differently. - *

- * If type is a nested type, its field IDs are reassigned when added to the existing schema. + * + *

If type is a nested type, its field IDs are reassigned when added to the existing schema. * * @param parent name of the parent struct to the column will be added to * @param name name for the new column @@ -109,17 +109,17 @@ default UpdateSchema addColumn(String parent, String name, Type type) { /** * Add a new column to a nested struct. - *

- * The parent name is used to find the parent using {@link Schema#findField(String)}. If the + * + *

The parent name is used to find the parent using {@link Schema#findField(String)}. If the * parent name is null, the new column will be added to the root as a top-level column. If parent * identifies a struct, a new column is added to that struct. If it identifies a list, the column - * is added to the list element struct, and if it identifies a map, the new column is added to - * the map's value struct. - *

- * The given name is used to name the new column and names containing "." are not handled + * is added to the list element struct, and if it identifies a map, the new column is added to the + * map's value struct. + * + *

The given name is used to name the new column and names containing "." are not handled * differently. - *

- * If type is a nested type, its field IDs are reassigned when added to the existing schema. + * + *

If type is a nested type, its field IDs are reassigned when added to the existing schema. * * @param parent name of the parent struct to the column will be added to * @param name name for the new column @@ -132,15 +132,15 @@ default UpdateSchema addColumn(String parent, String name, Type type) { /** * Add a new required top-level column. - *

- * This is an incompatible change that can break reading older data. This method will result in an exception unless - * {@link #allowIncompatibleChanges()} has been called. - *

- * Because "." may be interpreted as a column path separator or may be used in field names, it is - * not allowed in names passed to this method. To add to nested structures or to add fields with - * names that contain ".", use {@link #addRequiredColumn(String, String, Type)}. - *

- * If type is a nested type, its field IDs are reassigned when added to the existing schema. + * + *

This is an incompatible change that can break reading older data. This method will result in + * an exception unless {@link #allowIncompatibleChanges()} has been called. + * + *

Because "." may be interpreted as a column path separator or may be used in field names, it + * is not allowed in names passed to this method. To add to nested structures or to add fields + * with names that contain ".", use {@link #addRequiredColumn(String, String, Type)}. + * + *

If type is a nested type, its field IDs are reassigned when added to the existing schema. * * @param name name for the new column * @param type type for the new column @@ -153,15 +153,15 @@ default UpdateSchema addRequiredColumn(String name, Type type) { /** * Add a new required top-level column. - *

- * This is an incompatible change that can break reading older data. This method will result in an exception unless - * {@link #allowIncompatibleChanges()} has been called. - *

- * Because "." may be interpreted as a column path separator or may be used in field names, it is - * not allowed in names passed to this method. To add to nested structures or to add fields with - * names that contain ".", use {@link #addRequiredColumn(String, String, Type)}. - *

- * If type is a nested type, its field IDs are reassigned when added to the existing schema. + * + *

This is an incompatible change that can break reading older data. This method will result in + * an exception unless {@link #allowIncompatibleChanges()} has been called. + * + *

Because "." may be interpreted as a column path separator or may be used in field names, it + * is not allowed in names passed to this method. To add to nested structures or to add fields + * with names that contain ".", use {@link #addRequiredColumn(String, String, Type)}. + * + *
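A minimal sketch of the required-column path described above; because this is an incompatible change, allowIncompatibleChanges() must be called first (the column name and type are hypothetical):

import org.apache.iceberg.Table;
import org.apache.iceberg.types.Types;

class RequiredColumnExample {
  static void addRequired(Table table) {
    table
        .updateSchema()
        .allowIncompatibleChanges()  // opt in: reading older files without this column can fail
        .addRequiredColumn("event_ts", Types.TimestampType.withZone())
        .commit();
  }
}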

If type is a nested type, its field IDs are reassigned when added to the existing schema. * * @param name name for the new column * @param type type for the new column @@ -173,20 +173,20 @@ default UpdateSchema addRequiredColumn(String name, Type type) { /** * Add a new required top-level column. - *

- * This is an incompatible change that can break reading older data. This method will result in an exception unless - * {@link #allowIncompatibleChanges()} has been called. - *

- * The parent name is used to find the parent using {@link Schema#findField(String)}. If the + * + *

This is an incompatible change that can break reading older data. This method will result in + * an exception unless {@link #allowIncompatibleChanges()} has been called. + * + *

The parent name is used to find the parent using {@link Schema#findField(String)}. If the * parent name is null, the new column will be added to the root as a top-level column. If parent * identifies a struct, a new column is added to that struct. If it identifies a list, the column - * is added to the list element struct, and if it identifies a map, the new column is added to - * the map's value struct. - *

- * The given name is used to name the new column and names containing "." are not handled + * is added to the list element struct, and if it identifies a map, the new column is added to the + * map's value struct. + * + *

The given name is used to name the new column and names containing "." are not handled * differently. - *

- * If type is a nested type, its field IDs are reassigned when added to the existing schema. + * + *

If type is a nested type, its field IDs are reassigned when added to the existing schema. * * @param parent name of the parent struct to the column will be added to * @param name name for the new column @@ -200,20 +200,20 @@ default UpdateSchema addRequiredColumn(String parent, String name, Type type) { /** * Add a new required top-level column. - *

- * This is an incompatible change that can break reading older data. This method will result in an exception unless - * {@link #allowIncompatibleChanges()} has been called. - *

- * The parent name is used to find the parent using {@link Schema#findField(String)}. If the + * + *

This is an incompatible change that can break reading older data. This method will result in + * an exception unless {@link #allowIncompatibleChanges()} has been called. + * + *

The parent name is used to find the parent using {@link Schema#findField(String)}. If the * parent name is null, the new column will be added to the root as a top-level column. If parent * identifies a struct, a new column is added to that struct. If it identifies a list, the column - * is added to the list element struct, and if it identifies a map, the new column is added to - * the map's value struct. - *

- * The given name is used to name the new column and names containing "." are not handled + * is added to the list element struct, and if it identifies a map, the new column is added to the + * map's value struct. + * + *

The given name is used to name the new column and names containing "." are not handled * differently. - *

- * If type is a nested type, its field IDs are reassigned when added to the existing schema. + * + *

If type is a nested type, its field IDs are reassigned when added to the existing schema. * * @param parent name of the parent struct to the column will be added to * @param name name for the new column @@ -226,55 +226,55 @@ default UpdateSchema addRequiredColumn(String parent, String name, Type type) { /** * Rename a column in the schema. - *

- * The name is used to find the column to rename using {@link Schema#findField(String)}. - *

- * The new name may contain "." and such names are not parsed or handled differently. - *

- * Columns may be updated and renamed in the same schema update. + * + *

The name is used to find the column to rename using {@link Schema#findField(String)}. + * + *

The new name may contain "." and such names are not parsed or handled differently. + * + *

Columns may be updated and renamed in the same schema update. * * @param name name of the column to rename * @param newName replacement name for the column * @return this for method chaining * @throws IllegalArgumentException If name doesn't identify a column in the schema or if this - * change conflicts with other additions, renames, or updates. + * change conflicts with other additions, renames, or updates. */ UpdateSchema renameColumn(String name, String newName); /** * Update a column in the schema to a new primitive type. - *

- * The name is used to find the column to update using {@link Schema#findField(String)}. - *

- * Only updates that widen types are allowed. - *

- * Columns may be updated and renamed in the same schema update. + * + *

The name is used to find the column to update using {@link Schema#findField(String)}. + * + *

Only updates that widen types are allowed. + * + *
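A brief sketch of combining a widening type update with a rename in one schema update, as the javadoc above allows (the column names and the int-to-long widening are assumptions):

import org.apache.iceberg.Table;
import org.apache.iceberg.types.Types;

class WidenAndRenameExample {
  static void evolve(Table table) {
    table
        .updateSchema()
        .updateColumn("id", Types.LongType.get())  // widen int -> long; only widening updates are allowed
        .renameColumn("ts", "event_ts")            // rename in the same pending update
        .commit();
  }
}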

Columns may be updated and renamed in the same schema update. * * @param name name of the column to rename * @param newType replacement type for the column * @return this for method chaining * @throws IllegalArgumentException If name doesn't identify a column in the schema or if this - * change introduces a type incompatibility or if it conflicts - * with other additions, renames, or updates. + * change introduces a type incompatibility or if it conflicts with other additions, renames, + * or updates. */ UpdateSchema updateColumn(String name, Type.PrimitiveType newType); /** * Update a column in the schema to a new primitive type. - *

- * The name is used to find the column to update using {@link Schema#findField(String)}. - *

- * Only updates that widen types are allowed. - *

- * Columns may be updated and renamed in the same schema update. + * + *

The name is used to find the column to update using {@link Schema#findField(String)}. + * + *

Only updates that widen types are allowed. + * + *

Columns may be updated and renamed in the same schema update. * * @param name name of the column to rename * @param newType replacement type for the column * @param newDoc replacement documentation string for the column * @return this for method chaining * @throws IllegalArgumentException If name doesn't identify a column in the schema or if this - * change introduces a type incompatibility or if it conflicts - * with other additions, renames, or updates. + * change introduces a type incompatibility or if it conflicts with other additions, renames, + * or updates. */ default UpdateSchema updateColumn(String name, Type.PrimitiveType newType, String newDoc) { return updateColumn(name, newType).updateColumnDoc(name, newDoc); @@ -282,17 +282,17 @@ default UpdateSchema updateColumn(String name, Type.PrimitiveType newType, Strin /** * Update a column in the schema to a new primitive type. - *

- * The name is used to find the column to update using {@link Schema#findField(String)}. - *

- * Columns may be updated and renamed in the same schema update. + * + *

The name is used to find the column to update using {@link Schema#findField(String)}. + * + *

Columns may be updated and renamed in the same schema update. * * @param name name of the column to rename * @param newDoc replacement documentation string for the column * @return this for method chaining * @throws IllegalArgumentException If name doesn't identify a column in the schema or if this - * change introduces a type incompatibility or if it conflicts - * with other additions, renames, or updates. + * change introduces a type incompatibility or if it conflicts with other additions, renames, + * or updates. */ UpdateSchema updateColumnDoc(String name, String newDoc); @@ -306,9 +306,9 @@ default UpdateSchema updateColumn(String name, Type.PrimitiveType newType, Strin /** * Update a column to required. - *

- * This is an incompatible change that can break reading older data. This method will result in an exception unless - * {@link #allowIncompatibleChanges()} has been called. + * + *

This is an incompatible change that can break reading older data. This method will result in + * an exception unless {@link #allowIncompatibleChanges()} has been called. * * @param name name of the column to mark required * @return this for method chaining @@ -317,81 +317,83 @@ default UpdateSchema updateColumn(String name, Type.PrimitiveType newType, Strin /** * Delete a column in the schema. - *

- * The name is used to find the column to delete using {@link Schema#findField(String)}. + * + *

The name is used to find the column to delete using {@link Schema#findField(String)}. * * @param name name of the column to delete * @return this for method chaining * @throws IllegalArgumentException If name doesn't identify a column in the schema or if this - * change conflicts with other additions, renames, or updates. + * change conflicts with other additions, renames, or updates. */ UpdateSchema deleteColumn(String name); /** * Move a column from its current position to the start of the schema or its parent struct. + * * @param name name of the column to move * @return this for method chaining * @throws IllegalArgumentException If name doesn't identify a column in the schema or if this - * change conflicts with other changes. + * change conflicts with other changes. */ UpdateSchema moveFirst(String name); /** * Move a column from its current position to directly before a reference column. - *

- * The name is used to find the column to move using {@link Schema#findField(String)}. If the name identifies a nested - * column, it can only be moved within the nested struct that contains it. + * + *

The name is used to find the column to move using {@link Schema#findField(String)}. If the + * name identifies a nested column, it can only be moved within the nested struct that contains + * it. * * @param name name of the column to move * @param beforeName name of the reference column * @return this for method chaining * @throws IllegalArgumentException If name doesn't identify a column in the schema or if this - * change conflicts with other changes. + * change conflicts with other changes. */ UpdateSchema moveBefore(String name, String beforeName); /** * Move a column from its current position to directly after a reference column. - *

- * The name is used to find the column to move using {@link Schema#findField(String)}. If the name identifies a nested - * column, it can only be moved within the nested struct that contains it. + * + *

The name is used to find the column to move using {@link Schema#findField(String)}. If the + * name identifies a nested column, it can only be moved within the nested struct that contains + * it. * * @param name name of the column to move * @param afterName name of the reference column * @return this for method chaining * @throws IllegalArgumentException If name doesn't identify a column in the schema or if this - * change conflicts with other changes. + * change conflicts with other changes. */ UpdateSchema moveAfter(String name, String afterName); - /** * Applies all field additions and updates from the provided new schema to the existing schema so * to create a union schema. - *
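A small sketch of the moveFirst/moveBefore/moveAfter methods described above; moves only change column order, never field IDs, and nested columns stay within their struct (all names here are hypothetical):

import org.apache.iceberg.Table;

class MoveColumnsExample {
  static void reorder(Table table) {
    table
        .updateSchema()
        .moveFirst("id")                                   // "id" becomes the first top-level column
        .moveAfter("event_ts", "id")                       // "event_ts" goes directly after "id"
        .moveBefore("address.zip_code", "address.street")  // nested move stays inside the "address" struct
        .commit();
  }
}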

- * For fields with same canonical names in both schemas it is required that the widen types is + * + *

For fields with same canonical names in both schemas it is required that the widen types is * supported using {@link UpdateSchema#updateColumn(String, Type.PrimitiveType)} - *

- * Only supports turning a previously required field into an optional one if it is marked + * + *

Only supports turning a previously required field into an optional one if it is marked * optional in the provided new schema using {@link UpdateSchema#makeColumnOptional(String)} - *

- * Only supports updating existing field docs with fields docs from the provided new schema using - * {@link UpdateSchema#updateColumnDoc(String, String)} + * + *

Only supports updating existing field docs with fields docs from the provided new schema + * using {@link UpdateSchema#updateColumnDoc(String, String)} * * @param newSchema a schema used in conjunction with the existing schema to create a union schema * @return this for method chaining * @throws IllegalStateException If it encounters errors during provided schema traversal * @throws IllegalArgumentException If name doesn't identify a column in the schema or if this - * change introduces a type incompatibility or if it conflicts - * with other additions, renames, or updates. + * change introduces a type incompatibility or if it conflicts with other additions, renames, + * or updates. */ UpdateSchema unionByNameWith(Schema newSchema); /** * Set the identifier fields given a set of field names. - *

- * Because identifier fields are unique, duplicated names will be ignored. - * See {@link Schema#identifierFieldIds()} to learn more about Iceberg identifier. + * + *

Because identifier fields are unique, duplicated names will be ignored. See {@link + * Schema#identifierFieldIds()} to learn more about Iceberg identifier. * * @param names names of the columns to set as identifier fields * @return this for method chaining @@ -399,8 +401,8 @@ default UpdateSchema updateColumn(String name, Type.PrimitiveType newType, Strin UpdateSchema setIdentifierFields(Collection names); /** - * Set the identifier fields given some field names. - * See {@link UpdateSchema#setIdentifierFields(Collection)} for more details. + * Set the identifier fields given some field names. See {@link + * UpdateSchema#setIdentifierFields(Collection)} for more details. * * @param names names of the columns to set as identifier fields * @return this for method chaining diff --git a/api/src/main/java/org/apache/iceberg/actions/Action.java b/api/src/main/java/org/apache/iceberg/actions/Action.java index 7966a8e451b5..bd617c2e0744 100644 --- a/api/src/main/java/org/apache/iceberg/actions/Action.java +++ b/api/src/main/java/org/apache/iceberg/actions/Action.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.actions; import java.util.Map; @@ -30,27 +29,29 @@ public interface Action { /** * Configures this action with an extra option. - *
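A sketch of the identifier-field methods described above, using the varargs overload; the field names are hypothetical, and the chosen columns must already satisfy the identifier rules (for example, required primitive columns):

import org.apache.iceberg.Table;

class IdentifierFieldsExample {
  static void setIdentifiers(Table table) {
    table
        .updateSchema()
        .setIdentifierFields("region", "id")  // duplicate names would be ignored
        .commit();
  }
}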

- * Certain actions allow users to control internal details of their execution via options. + * + *

Certain actions allow users to control internal details of their execution via options. * * @param name an option name * @param value an option value * @return this for method chaining */ default ThisT option(String name, String value) { - throw new UnsupportedOperationException(this.getClass().getName() + " does not implement option"); + throw new UnsupportedOperationException( + this.getClass().getName() + " does not implement option"); } /** * Configures this action with extra options. - *

- * Certain actions allow users to control internal details of their execution via options. + * + *
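A sketch of how option and options are used on a concrete action; the action comes from an engine-provided ActionsProvider, and the option values are arbitrary:

import java.util.Collections;
import org.apache.iceberg.Table;
import org.apache.iceberg.actions.ActionsProvider;
import org.apache.iceberg.actions.RewriteDataFiles;

class ActionOptionsExample {
  static void configure(ActionsProvider actions, Table table) {
    RewriteDataFiles rewrite =
        actions
            .rewriteDataFiles(table)
            .option("partial-progress.enabled", "true")                                  // a single option
            .options(Collections.singletonMap("max-concurrent-file-group-rewrites", "4"));  // several at once
    rewrite.execute();
  }
}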

Certain actions allow users to control internal details of their execution via options. * * @param options a map of extra options * @return this for method chaining */ default ThisT options(Map options) { - throw new UnsupportedOperationException(this.getClass().getName() + " does not implement options"); + throw new UnsupportedOperationException( + this.getClass().getName() + " does not implement options"); } /** diff --git a/api/src/main/java/org/apache/iceberg/actions/ActionsProvider.java b/api/src/main/java/org/apache/iceberg/actions/ActionsProvider.java index f2564ddb703b..e5b5766f918d 100644 --- a/api/src/main/java/org/apache/iceberg/actions/ActionsProvider.java +++ b/api/src/main/java/org/apache/iceberg/actions/ActionsProvider.java @@ -16,62 +16,52 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.actions; import org.apache.iceberg.Table; -/** - * An API that should be implemented by query engine integrations for providing actions. - */ +/** An API that should be implemented by query engine integrations for providing actions. */ public interface ActionsProvider { - /** - * Instantiates an action to snapshot an existing table as a new Iceberg table. - */ + /** Instantiates an action to snapshot an existing table as a new Iceberg table. */ default SnapshotTable snapshotTable(String sourceTableIdent) { - throw new UnsupportedOperationException(this.getClass().getName() + " does not implement snapshotTable"); + throw new UnsupportedOperationException( + this.getClass().getName() + " does not implement snapshotTable"); } - /** - * Instantiates an action to migrate an existing table to Iceberg. - */ + /** Instantiates an action to migrate an existing table to Iceberg. */ default MigrateTable migrateTable(String tableIdent) { - throw new UnsupportedOperationException(this.getClass().getName() + " does not implement migrateTable"); + throw new UnsupportedOperationException( + this.getClass().getName() + " does not implement migrateTable"); } - /** - * Instantiates an action to delete orphan files. - */ + /** Instantiates an action to delete orphan files. */ default DeleteOrphanFiles deleteOrphanFiles(Table table) { - throw new UnsupportedOperationException(this.getClass().getName() + " does not implement deleteOrphanFiles"); + throw new UnsupportedOperationException( + this.getClass().getName() + " does not implement deleteOrphanFiles"); } - /** - * Instantiates an action to rewrite manifests. - */ + /** Instantiates an action to rewrite manifests. */ default RewriteManifests rewriteManifests(Table table) { - throw new UnsupportedOperationException(this.getClass().getName() + " does not implement rewriteManifests"); + throw new UnsupportedOperationException( + this.getClass().getName() + " does not implement rewriteManifests"); } - /** - * Instantiates an action to rewrite data files. - */ + /** Instantiates an action to rewrite data files. */ default RewriteDataFiles rewriteDataFiles(Table table) { - throw new UnsupportedOperationException(this.getClass().getName() + " does not implement rewriteDataFiles"); + throw new UnsupportedOperationException( + this.getClass().getName() + " does not implement rewriteDataFiles"); } - /** - * Instantiates an action to expire snapshots. - */ + /** Instantiates an action to expire snapshots. 
*/ default ExpireSnapshots expireSnapshots(Table table) { - throw new UnsupportedOperationException(this.getClass().getName() + " does not implement expireSnapshots"); + throw new UnsupportedOperationException( + this.getClass().getName() + " does not implement expireSnapshots"); } - /** - * Instantiates an action to delete all the files reachable from given metadata location. - */ + /** Instantiates an action to delete all the files reachable from given metadata location. */ default DeleteReachableFiles deleteReachableFiles(String metadataLocation) { - throw new UnsupportedOperationException(this.getClass().getName() + " does not implement deleteReachableFiles"); + throw new UnsupportedOperationException( + this.getClass().getName() + " does not implement deleteReachableFiles"); } } diff --git a/api/src/main/java/org/apache/iceberg/actions/ConvertEqualityDeleteFiles.java b/api/src/main/java/org/apache/iceberg/actions/ConvertEqualityDeleteFiles.java index 9247f209364e..c91ced399cd6 100644 --- a/api/src/main/java/org/apache/iceberg/actions/ConvertEqualityDeleteFiles.java +++ b/api/src/main/java/org/apache/iceberg/actions/ConvertEqualityDeleteFiles.java @@ -16,41 +16,32 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.actions; import org.apache.iceberg.expressions.Expression; -/** - * An action for converting the equality delete files to position delete files. - */ +/** An action for converting the equality delete files to position delete files. */ public interface ConvertEqualityDeleteFiles extends SnapshotUpdate { /** * A filter for finding the equality deletes to convert. - *

- * The filter will be converted to a partition filter with an inclusive projection. Any file that may contain rows - * matching this filter will be used by the action. The matching delete files will be converted to position delete - * files. + * + *

The filter will be converted to a partition filter with an inclusive projection. Any file + * that may contain rows matching this filter will be used by the action. The matching delete + * files will be converted to position delete files. * * @param expression An iceberg expression used to find deletes. * @return this for method chaining */ ConvertEqualityDeleteFiles filter(Expression expression); - /** - * The action result that contains a summary of the execution. - */ + /** The action result that contains a summary of the execution. */ interface Result { - /** - * Returns the count of the deletes that been converted. - */ + /** Returns the count of the deletes that been converted. */ int convertedEqualityDeleteFilesCount(); - /** - * Returns the count of the added position delete files. - */ + /** Returns the count of the added position delete files. */ int addedPositionDeleteFilesCount(); } } diff --git a/api/src/main/java/org/apache/iceberg/actions/DeleteOrphanFiles.java b/api/src/main/java/org/apache/iceberg/actions/DeleteOrphanFiles.java index 75e593f2768c..2dd741770cb4 100644 --- a/api/src/main/java/org/apache/iceberg/actions/DeleteOrphanFiles.java +++ b/api/src/main/java/org/apache/iceberg/actions/DeleteOrphanFiles.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.actions; import java.util.Locale; @@ -27,17 +26,16 @@ /** * An action that deletes orphan metadata, data and delete files in a table. - *

- * A file is considered orphan if it is not reachable by any valid snapshot. - * The set of actual files is built by listing the underlying storage which makes this operation - * expensive. + * + *

A file is considered orphan if it is not reachable by any valid snapshot. The set of actual + * files is built by listing the underlying storage which makes this operation expensive. */ public interface DeleteOrphanFiles extends Action { /** * Passes a location which should be scanned for orphan files. - *

- * If not set, the root table location will be scanned potentially removing both orphan data and - * metadata files. + * + *

If not set, the root table location will be scanned potentially removing both orphan data + * and metadata files. * * @param location the location where to look for orphan files * @return this for method chaining @@ -46,12 +44,12 @@ public interface DeleteOrphanFiles extends Action - * This is a safety measure to avoid removing files that are being added to the table. - * For example, there may be a concurrent operation adding new files while this action searches - * for orphan files. New files may not be referenced by the metadata yet but they are not orphan. - *

- * If not set, defaults to a timestamp 3 days ago. + * + *

This is a safety measure to avoid removing files that are being added to the table. For + * example, there may be a concurrent operation adding new files while this action searches for + * orphan files. New files may not be referenced by the metadata yet but they are not orphan. + * + *

If not set, defaults to a timestamp 3 days ago. * * @param olderThanTimestamp a long timestamp, as returned by {@link System#currentTimeMillis()} * @return this for method chaining @@ -60,11 +58,12 @@ public interface DeleteOrphanFiles extends Action - * This method allows users to customize the delete func. For example, one may set a custom delete - * func and collect all orphan files into a set instead of physically removing them. - *

- * If not set, defaults to using the table's {@link org.apache.iceberg.io.FileIO io} implementation. + * + *

This method allows users to customize the delete func. For example, one may set a custom + * delete func and collect all orphan files into a set instead of physically removing them. + * + *

If not set, defaults to using the table's {@link org.apache.iceberg.io.FileIO io} + * implementation. * * @param deleteFunc a function that will be called to delete files * @return this for method chaining @@ -73,9 +72,10 @@ public interface DeleteOrphanFiles extends Action - * If this method is not called, orphaned manifests and data files will still be deleted in - * the current thread. + * + *

If this method is not called, orphaned manifests and data files will still be deleted in the + * current thread. + * *

* * @param executorService the service to use @@ -84,71 +84,73 @@ public interface DeleteOrphanFiles extends Action - * Possible values are "ERROR", "IGNORE", "DELETE". The default mismatch mode is "ERROR", - * which means an exception is thrown whenever there is a mismatch in authority/scheme. - * It's the recommended mismatch mode and should be changed only in some rare circumstances. - * If there is a mismatch, use {@link #equalSchemes(Map)} and {@link #equalAuthorities(Map)} - * to resolve conflicts by providing equivalent schemes and authorities. If it is impossible - * to determine whether the conflicting authorities/schemes are equal, set the prefix mismatch - * mode to "IGNORE" to skip files with mismatches. If you have manually inspected all conflicting - * authorities/schemes, provided equivalent schemes/authorities and are absolutely confident - * the remaining ones are different, set the prefix mismatch mode to "DELETE" to consider files - * with mismatches as orphan. It will be impossible to recover files after deletion, - * so the "DELETE" prefix mismatch mode must be used with extreme caution. + * Passes a prefix mismatch mode that determines how this action should handle situations when the + * metadata references files that match listed/provided files except for authority/scheme. + * + *

Possible values are "ERROR", "IGNORE", "DELETE". The default mismatch mode is "ERROR", which + * means an exception is thrown whenever there is a mismatch in authority/scheme. It's the + * recommended mismatch mode and should be changed only in some rare circumstances. If there is a + * mismatch, use {@link #equalSchemes(Map)} and {@link #equalAuthorities(Map)} to resolve + * conflicts by providing equivalent schemes and authorities. If it is impossible to determine + * whether the conflicting authorities/schemes are equal, set the prefix mismatch mode to "IGNORE" + * to skip files with mismatches. If you have manually inspected all conflicting + * authorities/schemes, provided equivalent schemes/authorities and are absolutely confident the + * remaining ones are different, set the prefix mismatch mode to "DELETE" to consider files with + * mismatches as orphan. It will be impossible to recover files after deletion, so the "DELETE" + * prefix mismatch mode must be used with extreme caution. * * @param newPrefixMismatchMode mode for handling prefix mismatches * @return this for method chaining */ default DeleteOrphanFiles prefixMismatchMode(PrefixMismatchMode newPrefixMismatchMode) { - throw new UnsupportedOperationException(this.getClass().getName() + " does not implement prefixMismatchMode"); + throw new UnsupportedOperationException( + this.getClass().getName() + " does not implement prefixMismatchMode"); } /** * Passes schemes that should be considered equal. - *

- * The key may include a comma-separated list of schemes. For instance, Map("s3a,s3,s3n", "s3"). + * + *

The key may include a comma-separated list of schemes. For instance, Map("s3a,s3,s3n", + * "s3"). * * @param newEqualSchemes list of equal schemes * @return this for method chaining */ default DeleteOrphanFiles equalSchemes(Map newEqualSchemes) { - throw new UnsupportedOperationException(this.getClass().getName() + " does not implement equalSchemes"); + throw new UnsupportedOperationException( + this.getClass().getName() + " does not implement equalSchemes"); } /** * Passes authorities that should be considered equal. - *

- * The key may include a comma-separate list of authorities. For instance, Map("s1name,s2name", "servicename"). + * + *

The key may include a comma-separate list of authorities. For instance, Map("s1name,s2name", + * "servicename"). * * @param newEqualAuthorities list of equal authorities * @return this for method chaining */ default DeleteOrphanFiles equalAuthorities(Map newEqualAuthorities) { - throw new UnsupportedOperationException(this.getClass().getName() + " does not implement equalAuthorities"); + throw new UnsupportedOperationException( + this.getClass().getName() + " does not implement equalAuthorities"); } - /** - * The action result that contains a summary of the execution. - */ + /** The action result that contains a summary of the execution. */ interface Result { - /** - * Returns locations of orphan files. - */ + /** Returns locations of orphan files. */ Iterable orphanFileLocations(); } /** * Defines the action behavior when location prefixes (scheme/authority) mismatch. - *
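A sketch of a typical orphan-file cleanup combining the knobs described above; the retention window, scheme mapping, and use of the default "ERROR" mode are assumptions:

import java.util.Collections;
import java.util.concurrent.TimeUnit;
import org.apache.iceberg.Table;
import org.apache.iceberg.actions.ActionsProvider;
import org.apache.iceberg.actions.DeleteOrphanFiles;

class DeleteOrphanFilesExample {
  static void cleanUp(ActionsProvider actions, Table table) {
    DeleteOrphanFiles.Result result =
        actions
            .deleteOrphanFiles(table)
            .olderThan(System.currentTimeMillis() - TimeUnit.DAYS.toMillis(7))  // keep anything newer than 7 days
            .equalSchemes(Collections.singletonMap("s3a,s3n", "s3"))            // treat these schemes as equivalent
            .prefixMismatchMode(DeleteOrphanFiles.PrefixMismatchMode.ERROR)     // fail on unexplained prefix mismatches
            .execute();
    result.orphanFileLocations().forEach(System.out::println);                  // inspect what was removed
  }
}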

- * {@link #ERROR} - throw an exception. - * {@link #IGNORE} - no action. - * {@link #DELETE} - delete files. + * + *

{@link #ERROR} - throw an exception. {@link #IGNORE} - no action. {@link #DELETE} - delete + * files. */ enum PrefixMismatchMode { - ERROR, IGNORE, DELETE; + ERROR, + IGNORE, + DELETE; public static PrefixMismatchMode fromString(String modeAsString) { Preconditions.checkArgument(modeAsString != null, "Mode should not be null"); diff --git a/api/src/main/java/org/apache/iceberg/actions/DeleteReachableFiles.java b/api/src/main/java/org/apache/iceberg/actions/DeleteReachableFiles.java index 3b0eb96b55cd..e2ab755e2f87 100644 --- a/api/src/main/java/org/apache/iceberg/actions/DeleteReachableFiles.java +++ b/api/src/main/java/org/apache/iceberg/actions/DeleteReachableFiles.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.actions; import java.util.concurrent.ExecutorService; @@ -25,30 +24,31 @@ /** * An action that deletes all files referenced by a table metadata file. - *

- * This action will irreversibly delete all reachable files such as data files, manifests, - * manifest lists and should be used to clean up the underlying storage once a table is dropped - * and no longer needed. - *

- * Implementations may use a query engine to distribute parts of work. + * + *

This action will irreversibly delete all reachable files such as data files, manifests, + * manifest lists and should be used to clean up the underlying storage once a table is dropped and + * no longer needed. + * + *

Implementations may use a query engine to distribute parts of work. */ -public interface DeleteReachableFiles extends Action { +public interface DeleteReachableFiles + extends Action { /** * Passes an alternative delete implementation that will be used for files. * - * @param deleteFunc a function that will be called to delete files. - * The function accepts path to file as an argument. + * @param deleteFunc a function that will be called to delete files. The function accepts path to + * file as an argument. * @return this for method chaining */ DeleteReachableFiles deleteWith(Consumer deleteFunc); /** * Passes an alternative executor service that will be used for files removal. - *
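A sketch of cleaning up a dropped table through the interface above; the metadata path is a placeholder, and concrete implementations may need extra configuration (for example a FileIO) that is not shown here:

import org.apache.iceberg.actions.ActionsProvider;
import org.apache.iceberg.actions.DeleteReachableFiles;

class DeleteReachableFilesExample {
  static void purge(ActionsProvider actions, String metadataLocation) {
    actions
        .deleteReachableFiles(metadataLocation)                      // e.g. a table's last metadata.json location
        .deleteWith(path -> System.out.println("deleting " + path))  // custom hook; a real one would remove the file
        .execute();
  }
}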

- * If this method is not called, files will be deleted in the current thread. * - * @param executorService the service to use + *

If this method is not called, files will be deleted in the current thread. + * + * @param executorService the service to use * @return this for method chaining */ DeleteReachableFiles executeDeleteWith(ExecutorService executorService); @@ -61,29 +61,19 @@ public interface DeleteReachableFiles extends Action - * Similar to {@link org.apache.iceberg.ExpireSnapshots} but may use a query engine to distribute + * + *

Similar to {@link org.apache.iceberg.ExpireSnapshots} but may use a query engine to distribute * parts of the work. */ public interface ExpireSnapshots extends Action { /** * Expires a specific {@link Snapshot} identified by id. - *

- * Identical to {@link org.apache.iceberg.ExpireSnapshots#expireSnapshotId(long)} + * + *

Identical to {@link org.apache.iceberg.ExpireSnapshots#expireSnapshotId(long)} * * @param snapshotId id of the snapshot to expire * @return this for method chaining @@ -42,8 +41,8 @@ public interface ExpireSnapshots extends Action - * Identical to {@link org.apache.iceberg.ExpireSnapshots#expireOlderThan(long)} + * + *

Identical to {@link org.apache.iceberg.ExpireSnapshots#expireOlderThan(long)} * * @param timestampMillis a long timestamp, as returned by {@link System#currentTimeMillis()} * @return this for method chaining @@ -52,12 +51,12 @@ public interface ExpireSnapshots extends Action - * If a snapshot would be expired because it is older than the expiration timestamp, but is one of - * the {@code numSnapshots} most recent ancestors of the current state, it will be retained. This - * will not cause snapshots explicitly identified by id from expiring. - *

- * Identical to {@link org.apache.iceberg.ExpireSnapshots#retainLast(int)} + * + *

If a snapshot would be expired because it is older than the expiration timestamp, but is one + * of the {@code numSnapshots} most recent ancestors of the current state, it will be retained. + * This will not cause snapshots explicitly identified by id from expiring. + * + *

Identical to {@link org.apache.iceberg.ExpireSnapshots#retainLast(int)} * * @param numSnapshots the number of snapshots to retain * @return this for method chaining @@ -65,14 +64,15 @@ public interface ExpireSnapshots extends Action - * Manifest files that are no longer used by valid snapshots will be deleted. Content files that were - * marked as logically deleted by snapshots that are expired will be deleted as well. - *

- * If this method is not called, unnecessary manifests and content files will still be deleted. - *

- * Identical to {@link org.apache.iceberg.ExpireSnapshots#deleteWith(Consumer)} + * Passes an alternative delete implementation that will be used for manifests, data and delete + * files. + * + *

Manifest files that are no longer used by valid snapshots will be deleted. Content files + * that were marked as logically deleted by snapshots that are expired will be deleted as well. + * + *

If this method is not called, unnecessary manifests and content files will still be deleted. + * + *

Identical to {@link org.apache.iceberg.ExpireSnapshots#deleteWith(Consumer)} * * @param deleteFunc a function that will be called to delete manifests and data files * @return this for method chaining @@ -80,45 +80,34 @@ public interface ExpireSnapshots extends Action deleteFunc); /** - * Passes an alternative executor service that will be used for manifests, data and delete files deletion. - *
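A sketch of the expire action end to end; the retention values are arbitrary and the action comes from an engine-provided ActionsProvider:

import java.util.concurrent.TimeUnit;
import org.apache.iceberg.Table;
import org.apache.iceberg.actions.ActionsProvider;
import org.apache.iceberg.actions.ExpireSnapshots;

class ExpireSnapshotsExample {
  static void expire(ActionsProvider actions, Table table) {
    ExpireSnapshots.Result result =
        actions
            .expireSnapshots(table)
            .expireOlderThan(System.currentTimeMillis() - TimeUnit.DAYS.toMillis(30))  // 30-day retention
            .retainLast(10)                                                            // but always keep 10 ancestors
            .execute();
    System.out.println("deleted data files: " + result.deletedDataFilesCount());
  }
}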

- * If this method is not called, unnecessary manifests and content files will still be deleted in - * the current thread. - *

- * Identical to {@link org.apache.iceberg.ExpireSnapshots#executeDeleteWith(ExecutorService)} + * Passes an alternative executor service that will be used for manifests, data and delete files + * deletion. + * + *

If this method is not called, unnecessary manifests and content files will still be deleted + * in the current thread. + * + *

Identical to {@link org.apache.iceberg.ExpireSnapshots#executeDeleteWith(ExecutorService)} * * @param executorService the service to use * @return this for method chaining */ ExpireSnapshots executeDeleteWith(ExecutorService executorService); - /** - * The action result that contains a summary of the execution. - */ + /** The action result that contains a summary of the execution. */ interface Result { - /** - * Returns the number of deleted data files. - */ + /** Returns the number of deleted data files. */ long deletedDataFilesCount(); - /** - * Returns the number of deleted equality delete files. - */ + /** Returns the number of deleted equality delete files. */ long deletedEqualityDeleteFilesCount(); - /** - * Returns the number of deleted position delete files. - */ + /** Returns the number of deleted position delete files. */ long deletedPositionDeleteFilesCount(); - /** - * Returns the number of deleted manifests. - */ + /** Returns the number of deleted manifests. */ long deletedManifestsCount(); - /** - * Returns the number of deleted manifest lists. - */ + /** Returns the number of deleted manifest lists. */ long deletedManifestListsCount(); } } diff --git a/api/src/main/java/org/apache/iceberg/actions/MigrateTable.java b/api/src/main/java/org/apache/iceberg/actions/MigrateTable.java index 629fa5cd2b91..e645fad21781 100644 --- a/api/src/main/java/org/apache/iceberg/actions/MigrateTable.java +++ b/api/src/main/java/org/apache/iceberg/actions/MigrateTable.java @@ -16,18 +16,15 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.actions; import java.util.Map; -/** - * An action that migrates an existing table to Iceberg. - */ +/** An action that migrates an existing table to Iceberg. */ public interface MigrateTable extends Action { /** - * Sets table properties in the newly created Iceberg table. Any properties with - * the same key name will be overwritten. + * Sets table properties in the newly created Iceberg table. Any properties with the same key name + * will be overwritten. * * @param properties a map of properties to set * @return this for method chaining @@ -35,8 +32,8 @@ public interface MigrateTable extends Action MigrateTable tableProperties(Map properties); /** - * Sets a table property in the newly created Iceberg table. Any properties - * with the same key will be overwritten. + * Sets a table property in the newly created Iceberg table. Any properties with the same key will + * be overwritten. * * @param name a table property name * @param value a table property value @@ -44,13 +41,9 @@ public interface MigrateTable extends Action */ MigrateTable tableProperty(String name, String value); - /** - * The action result that contains a summary of the execution. - */ + /** The action result that contains a summary of the execution. */ interface Result { - /** - * Returns the number of migrated data files. - */ + /** Returns the number of migrated data files. */ long migratedDataFilesCount(); } } diff --git a/api/src/main/java/org/apache/iceberg/actions/RewriteDataFiles.java b/api/src/main/java/org/apache/iceberg/actions/RewriteDataFiles.java index 4aae9639fc18..39e2b9bc66f3 100644 --- a/api/src/main/java/org/apache/iceberg/actions/RewriteDataFiles.java +++ b/api/src/main/java/org/apache/iceberg/actions/RewriteDataFiles.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
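A sketch of the MigrateTable action covered above; the source identifier and table property are hypothetical:

import org.apache.iceberg.actions.ActionsProvider;
import org.apache.iceberg.actions.MigrateTable;

class MigrateTableExample {
  static void migrate(ActionsProvider actions) {
    MigrateTable.Result result =
        actions
            .migrateTable("db.events")             // existing non-Iceberg table to migrate in place
            .tableProperty("format-version", "2")  // set on the resulting Iceberg table
            .execute();
    System.out.println("migrated data files: " + result.migratedDataFilesCount());
  }
}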
*/ - package org.apache.iceberg.actions; import java.util.List; @@ -26,86 +25,102 @@ import org.apache.iceberg.expressions.Expression; /** - * An action for rewriting data files according to a rewrite strategy. - * Generally used for optimizing the sizing and layout of data files within a table. + * An action for rewriting data files according to a rewrite strategy. Generally used for optimizing + * the sizing and layout of data files within a table. */ -public interface RewriteDataFiles extends SnapshotUpdate { +public interface RewriteDataFiles + extends SnapshotUpdate { /** - * Enable committing groups of files (see max-file-group-size-bytes) prior to the entire rewrite completing. - * This will produce additional commits but allow for progress even if some groups fail to commit. This setting - * will not change the correctness of the rewrite operation as file groups can be compacted independently. - *

- * The default is false, which produces a single commit when the entire job has completed. + * Enable committing groups of files (see max-file-group-size-bytes) prior to the entire rewrite + * completing. This will produce additional commits but allow for progress even if some groups + * fail to commit. This setting will not change the correctness of the rewrite operation as file + * groups can be compacted independently. + * + *

The default is false, which produces a single commit when the entire job has completed. */ String PARTIAL_PROGRESS_ENABLED = "partial-progress.enabled"; + boolean PARTIAL_PROGRESS_ENABLED_DEFAULT = false; /** - * The maximum amount of Iceberg commits that this rewrite is allowed to produce if partial progress is enabled. This - * setting has no effect if partial progress is disabled. + * The maximum amount of Iceberg commits that this rewrite is allowed to produce if partial + * progress is enabled. This setting has no effect if partial progress is disabled. */ String PARTIAL_PROGRESS_MAX_COMMITS = "partial-progress.max-commits"; + int PARTIAL_PROGRESS_MAX_COMMITS_DEFAULT = 10; /** - * The entire rewrite operation is broken down into pieces based on partitioning and within partitions based - * on size into groups. These sub-units of the rewrite are referred to as file groups. The largest amount of data that - * should be compacted in a single group is controlled by {@link #MAX_FILE_GROUP_SIZE_BYTES}. This helps with - * breaking down the rewriting of very large partitions which may not be rewritable otherwise due to the resource - * constraints of the cluster. For example a sort based rewrite may not scale to terabyte sized partitions, those + * The entire rewrite operation is broken down into pieces based on partitioning and within + * partitions based on size into groups. These sub-units of the rewrite are referred to as file + * groups. The largest amount of data that should be compacted in a single group is controlled by + * {@link #MAX_FILE_GROUP_SIZE_BYTES}. This helps with breaking down the rewriting of very large + * partitions which may not be rewritable otherwise due to the resource constraints of the + * cluster. For example a sort based rewrite may not scale to terabyte sized partitions, those * partitions need to be worked on in small subsections to avoid exhaustion of resources. - *

- * When grouping files, the underlying rewrite strategy will use this value as to limit the files which - * will be included in a single file group. A group will be processed by a single framework "action". For example, - * in Spark this means that each group would be rewritten in its own Spark action. A group will never contain files - * for multiple output partitions. + * + *

When grouping files, the underlying rewrite strategy will use this value as to limit the + * files which will be included in a single file group. A group will be processed by a single + * framework "action". For example, in Spark this means that each group would be rewritten in its + * own Spark action. A group will never contain files for multiple output partitions. */ String MAX_FILE_GROUP_SIZE_BYTES = "max-file-group-size-bytes"; + long MAX_FILE_GROUP_SIZE_BYTES_DEFAULT = 1024L * 1024L * 1024L * 100L; // 100 Gigabytes /** - * The max number of file groups to be simultaneously rewritten by the rewrite strategy. The structure and - * contents of the group is determined by the rewrite strategy. Each file group will be rewritten - * independently and asynchronously. - **/ + * The max number of file groups to be simultaneously rewritten by the rewrite strategy. The + * structure and contents of the group is determined by the rewrite strategy. Each file group will + * be rewritten independently and asynchronously. + */ String MAX_CONCURRENT_FILE_GROUP_REWRITES = "max-concurrent-file-group-rewrites"; + int MAX_CONCURRENT_FILE_GROUP_REWRITES_DEFAULT = 1; /** - * The output file size that this rewrite strategy will attempt to generate when rewriting files. By default this - * will use the "write.target-file-size-bytes value" in the table properties of the table being updated. + * The output file size that this rewrite strategy will attempt to generate when rewriting files. + * By default this will use the "write.target-file-size-bytes value" in the table properties of + * the table being updated. */ String TARGET_FILE_SIZE_BYTES = "target-file-size-bytes"; /** - * If the compaction should use the sequence number of the snapshot at compaction start time for new data files, - * instead of using the sequence number of the newly produced snapshot. - *

- * This avoids commit conflicts with updates that add newer equality deletes at a higher sequence number. - *

- * Defaults to true. + * If the compaction should use the sequence number of the snapshot at compaction start time for + * new data files, instead of using the sequence number of the newly produced snapshot. + * + *

This avoids commit conflicts with updates that add newer equality deletes at a higher + * sequence number. + * + *

Defaults to true. */ String USE_STARTING_SEQUENCE_NUMBER = "use-starting-sequence-number"; + boolean USE_STARTING_SEQUENCE_NUMBER_DEFAULT = true; /** * Forces the rewrite job order based on the value. - *

    - *
  • If rewrite-job-order=bytes-asc, then rewrite the smallest job groups first. - *
  • If rewrite-job-order=bytes-desc, then rewrite the largest job groups first. - *
  • If rewrite-job-order=files-asc, then rewrite the job groups with the least files first. - *
  • If rewrite-job-order=files-desc, then rewrite the job groups with the most files first. - *
  • If rewrite-job-order=none, then rewrite job groups in the order they were planned (no - * specific ordering). - *

- * Defaults to none. + * + *

+ * + *

    + *
  • If rewrite-job-order=bytes-asc, then rewrite the smallest job groups first. + *
  • If rewrite-job-order=bytes-desc, then rewrite the largest job groups first. + *
  • If rewrite-job-order=files-asc, then rewrite the job groups with the least files first. + *
  • If rewrite-job-order=files-desc, then rewrite the job groups with the most files first. + *
  • If rewrite-job-order=none, then rewrite job groups in the order they were planned (no + * specific ordering). + *
+ * + *

Defaults to none. */ String REWRITE_JOB_ORDER = "rewrite-job-order"; + String REWRITE_JOB_ORDER_DEFAULT = RewriteJobOrder.NONE.orderName(); /** * Choose BINPACK as a strategy for this rewrite operation + * * @return this for method chaining */ default RewriteDataFiles binPack() { @@ -114,44 +129,51 @@ default RewriteDataFiles binPack() { /** * Choose SORT as a strategy for this rewrite operation using the table's sortOrder + * * @return this for method chaining */ default RewriteDataFiles sort() { - throw new UnsupportedOperationException("SORT Rewrite Strategy not implemented for this framework"); + throw new UnsupportedOperationException( + "SORT Rewrite Strategy not implemented for this framework"); } /** * Choose SORT as a strategy for this rewrite operation and manually specify the sortOrder to use + * * @param sortOrder user defined sortOrder * @return this for method chaining */ default RewriteDataFiles sort(SortOrder sortOrder) { - throw new UnsupportedOperationException("SORT Rewrite Strategy not implemented for this framework"); + throw new UnsupportedOperationException( + "SORT Rewrite Strategy not implemented for this framework"); } /** * Choose Z-ORDER as a strategy for this rewrite operation with a specified list of columns to use + * * @param columns Columns to be used to generate Z-Values * @return this for method chaining */ default RewriteDataFiles zOrder(String... columns) { - throw new UnsupportedOperationException("Z-ORDER Rewrite Strategy not implemented for this framework"); + throw new UnsupportedOperationException( + "Z-ORDER Rewrite Strategy not implemented for this framework"); } /** - * A user provided filter for determining which files will be considered by the rewrite strategy. This will be used - * in addition to whatever rules the rewrite strategy generates. For example this would be used for providing a - * restriction to only run rewrite on a specific partition. + * A user provided filter for determining which files will be considered by the rewrite strategy. + * This will be used in addition to whatever rules the rewrite strategy generates. For example + * this would be used for providing a restriction to only run rewrite on a specific partition. * - * @param expression An iceberg expression used to determine which files will be considered for rewriting + * @param expression An iceberg expression used to determine which files will be considered for + * rewriting * @return this for chaining */ RewriteDataFiles filter(Expression expression); /** - * A map of file group information to the results of rewriting that file group. If the results are null then - * that particular file group failed. We should only have failed groups if partial progress is enabled otherwise we - * will report a total failure for the job. + * A map of file group information to the results of rewriting that file group. If the results are + * null then that particular file group failed. We should only have failed groups if partial + * progress is enabled otherwise we will report a total failure for the job. 
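A sketch of a compaction run that exercises the strategies and options above; the partition filter and option values are assumptions:

import org.apache.iceberg.Table;
import org.apache.iceberg.actions.ActionsProvider;
import org.apache.iceberg.actions.RewriteDataFiles;
import org.apache.iceberg.expressions.Expressions;

class RewriteDataFilesExample {
  static void compact(ActionsProvider actions, Table table) {
    RewriteDataFiles.Result result =
        actions
            .rewriteDataFiles(table)
            .filter(Expressions.equal("event_date", "2022-01-01"))             // limit work to one partition
            .option(RewriteDataFiles.PARTIAL_PROGRESS_ENABLED, "true")         // commit file groups as they finish
            .option(RewriteDataFiles.MAX_CONCURRENT_FILE_GROUP_REWRITES, "4")  // rewrite groups in parallel
            .binPack()                                                         // size-based strategy
            .execute();
    System.out.println("rewritten data files: " + result.rewrittenDataFilesCount());
  }
}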
*/ interface Result { List rewriteResults(); @@ -161,13 +183,15 @@ default int addedDataFilesCount() { } default int rewrittenDataFilesCount() { - return rewriteResults().stream().mapToInt(FileGroupRewriteResult::rewrittenDataFilesCount).sum(); + return rewriteResults().stream() + .mapToInt(FileGroupRewriteResult::rewrittenDataFilesCount) + .sum(); } } /** - * For a particular file group, the number of files which are newly created and the number of files - * which were formerly part of the table but have been rewritten. + * For a particular file group, the number of files which are newly created and the number of + * files which were formerly part of the table but have been rewritten. */ interface FileGroupRewriteResult { FileGroupInfo info(); @@ -183,19 +207,13 @@ interface FileGroupRewriteResult { */ interface FileGroupInfo { - /** - * returns which file group this is out of the total set of file groups for this rewrite - */ + /** returns which file group this is out of the total set of file groups for this rewrite */ int globalIndex(); - /** - * returns which file group this is out of the set of file groups for this partition - */ + /** returns which file group this is out of the set of file groups for this partition */ int partitionIndex(); - /** - * returns which partition this file group contains files from - */ + /** returns which partition this file group contains files from */ StructLike partition(); } } diff --git a/api/src/main/java/org/apache/iceberg/actions/RewriteManifests.java b/api/src/main/java/org/apache/iceberg/actions/RewriteManifests.java index 1b2ae9c3ba8c..57b35aaf6596 100644 --- a/api/src/main/java/org/apache/iceberg/actions/RewriteManifests.java +++ b/api/src/main/java/org/apache/iceberg/actions/RewriteManifests.java @@ -16,20 +16,18 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.actions; import java.util.function.Predicate; import org.apache.iceberg.ManifestFile; -/** - * An action that rewrites manifests. - */ -public interface RewriteManifests extends SnapshotUpdate { +/** An action that rewrites manifests. */ +public interface RewriteManifests + extends SnapshotUpdate { /** * Rewrites manifests for a given spec id. - *

- * If not set, defaults to the table's default spec ID. + * + *

If not set, defaults to the table's default spec ID. * * @param specId a spec id * @return this for method chaining @@ -38,8 +36,8 @@ public interface RewriteManifests extends SnapshotUpdate - * If not set, all manifests will be rewritten. + * + *
If not set, all manifests will be rewritten. * * @param predicate a predicate * @return this for method chaining @@ -48,26 +46,20 @@ public interface RewriteManifests extends SnapshotUpdate - * If not set, defaults to the table's metadata location. + * + *
If not set, defaults to the table's metadata location. * * @param stagingLocation a staging location * @return this for method chaining */ RewriteManifests stagingLocation(String stagingLocation); - /** - * The action result that contains a summary of the execution. - */ + /** The action result that contains a summary of the execution. */ interface Result { - /** - * Returns rewritten manifests. - */ + /** Returns rewritten manifests. */ Iterable rewrittenManifests(); - /** - * Returns added manifests. - */ + /** Returns added manifests. */ Iterable addedManifests(); } } diff --git a/api/src/main/java/org/apache/iceberg/actions/RewritePositionDeleteFiles.java b/api/src/main/java/org/apache/iceberg/actions/RewritePositionDeleteFiles.java index 1cd79915c0f2..8085c35e9a3b 100644 --- a/api/src/main/java/org/apache/iceberg/actions/RewritePositionDeleteFiles.java +++ b/api/src/main/java/org/apache/iceberg/actions/RewritePositionDeleteFiles.java @@ -16,42 +16,36 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.actions; import org.apache.iceberg.expressions.Expression; /** * An action for rewriting position delete files. - *
- * Generally used for optimizing the size and layout of position delete files within a table. + * + *
Generally used for optimizing the size and layout of position delete files within a table. */ public interface RewritePositionDeleteFiles extends SnapshotUpdate { /** * A filter for finding deletes to rewrite. - *
- * The filter will be converted to a partition filter with an inclusive projection. Any file that may contain rows - * matching this filter will be used by the action. The matching delete files will be rewritten. + * + *
The filter will be converted to a partition filter with an inclusive projection. Any file + * that may contain rows matching this filter will be used by the action. The matching delete + * files will be rewritten. * * @param expression An iceberg expression used to find deletes. * @return this for method chaining */ RewritePositionDeleteFiles filter(Expression expression); - /** - * The action result that contains a summary of the execution. - */ + /** The action result that contains a summary of the execution. */ interface Result { - /** - * Returns the count of the position deletes that been rewritten. - */ + /** Returns the count of the position deletes that been rewritten. */ int rewrittenDeleteFilesCount(); - /** - * Returns the count of the added delete files. - */ + /** Returns the count of the added delete files. */ int addedDeleteFilesCount(); } } diff --git a/api/src/main/java/org/apache/iceberg/actions/SnapshotTable.java b/api/src/main/java/org/apache/iceberg/actions/SnapshotTable.java index ccd7e2811ade..37c600ab0392 100644 --- a/api/src/main/java/org/apache/iceberg/actions/SnapshotTable.java +++ b/api/src/main/java/org/apache/iceberg/actions/SnapshotTable.java @@ -16,14 +16,11 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.actions; import java.util.Map; -/** - * An action that creates an independent snapshot of an existing table. - */ +/** An action that creates an independent snapshot of an existing table. */ public interface SnapshotTable extends Action { /** * Sets the table identifier for the newly created Iceberg table. @@ -42,8 +39,8 @@ public interface SnapshotTable extends Action properties); /** - * Sets a table property in the newly created Iceberg table. Any properties - * with the same key name will be overwritten. + * Sets a table property in the newly created Iceberg table. Any properties with the same key name + * will be overwritten. * - * @param key the key of the property to add + * @param key the key of the property to add * @param value the value of the property to add * @return this for method chaining */ SnapshotTable tableProperty(String key, String value); - /** - * The action result that contains a summary of the execution. - */ + /** The action result that contains a summary of the execution. */ interface Result { - /** - * Returns the number of imported data files. - */ + /** Returns the number of imported data files. */ long importedDataFilesCount(); } } diff --git a/api/src/main/java/org/apache/iceberg/actions/SnapshotUpdate.java b/api/src/main/java/org/apache/iceberg/actions/SnapshotUpdate.java index 00fce31f918d..2f36656476e6 100644 --- a/api/src/main/java/org/apache/iceberg/actions/SnapshotUpdate.java +++ b/api/src/main/java/org/apache/iceberg/actions/SnapshotUpdate.java @@ -16,14 +16,13 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.actions; import org.apache.iceberg.Snapshot; /** - * An action that produces snapshots. This interface contains common methods for all - * actions that create a new {@link Snapshot}. + * An action that produces snapshots. This interface contains common methods for all actions that + * create a new {@link Snapshot}. 
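A sketch of configuring the SnapshotTable action documented above, assuming Spark's SparkActions as the entry point and as(...) as the destination-identifier setter (neither appears in this hunk); the identifiers and property are placeholders:

import org.apache.iceberg.actions.SnapshotTable;
import org.apache.iceberg.spark.actions.SparkActions;

public class SnapshotTableExample {
  public static long snapshotSourceTable() {
    SnapshotTable.Result result =
        SparkActions.get()
            .snapshotTable("spark_catalog.db.source")           // source table to snapshot
            .as("iceberg.db.source_snapshot")                    // destination identifier (assumed method name)
            .tableProperty("write.format.default", "parquet")    // same-key properties overwrite, per the javadoc
            .execute();
    return result.importedDataFilesCount();
  }
}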
* * @param the child Java API class, returned by method chaining * @param the Java type of the result produced by this action diff --git a/api/src/main/java/org/apache/iceberg/catalog/Catalog.java b/api/src/main/java/org/apache/iceberg/catalog/Catalog.java index 99d154382053..b458589a34b8 100644 --- a/api/src/main/java/org/apache/iceberg/catalog/Catalog.java +++ b/api/src/main/java/org/apache/iceberg/catalog/Catalog.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.catalog; import java.util.List; @@ -30,9 +29,7 @@ import org.apache.iceberg.exceptions.NoSuchTableException; import org.apache.iceberg.exceptions.NotFoundException; -/** - * A Catalog API for table create, drop, and load operations. - */ +/** A Catalog API for table create, drop, and load operations. */ public interface Catalog { /** @@ -49,7 +46,7 @@ default String name() { * * @param namespace a namespace * @return a list of identifiers for tables - * @throws NotFoundException if the namespace is not found + * @throws NotFoundException if the namespace is not found */ List listTables(Namespace namespace); @@ -105,10 +102,7 @@ default Table createTable( * @return a Table instance * @throws AlreadyExistsException if the table already exists */ - default Table createTable( - TableIdentifier identifier, - Schema schema, - PartitionSpec spec) { + default Table createTable(TableIdentifier identifier, Schema schema, PartitionSpec spec) { return createTable(identifier, schema, spec, null, null); } @@ -120,9 +114,7 @@ default Table createTable( * @return a Table instance * @throws AlreadyExistsException if the table already exists */ - default Table createTable( - TableIdentifier identifier, - Schema schema) { + default Table createTable(TableIdentifier identifier, Schema schema) { return createTable(identifier, schema, PartitionSpec.unpartitioned(), null, null); } @@ -179,9 +171,7 @@ default Transaction newCreateTableTransaction( * @throws AlreadyExistsException if the table already exists */ default Transaction newCreateTableTransaction( - TableIdentifier identifier, - Schema schema, - PartitionSpec spec) { + TableIdentifier identifier, Schema schema, PartitionSpec spec) { return newCreateTableTransaction(identifier, schema, spec, null, null); } @@ -193,9 +183,7 @@ default Transaction newCreateTableTransaction( * @return a {@link Transaction} to create the table * @throws AlreadyExistsException if the table already exists */ - default Transaction newCreateTableTransaction( - TableIdentifier identifier, - Schema schema) { + default Transaction newCreateTableTransaction(TableIdentifier identifier, Schema schema) { return newCreateTableTransaction(identifier, schema, PartitionSpec.unpartitioned(), null, null); } @@ -219,10 +207,11 @@ default Transaction newReplaceTableTransaction( Map properties, boolean orCreate) { - TableBuilder tableBuilder = buildTable(identifier, schema) - .withPartitionSpec(spec) - .withLocation(location) - .withProperties(properties); + TableBuilder tableBuilder = + buildTable(identifier, schema) + .withPartitionSpec(spec) + .withLocation(location) + .withProperties(properties); if (orCreate) { return tableBuilder.createOrReplaceTransaction(); @@ -262,10 +251,7 @@ default Transaction newReplaceTableTransaction( * @throws NoSuchTableException if the table doesn't exist and orCreate is false */ default Transaction newReplaceTableTransaction( - TableIdentifier identifier, - Schema schema, - PartitionSpec spec, - boolean orCreate) { + 
TableIdentifier identifier, Schema schema, PartitionSpec spec, boolean orCreate) { return newReplaceTableTransaction(identifier, schema, spec, null, null, orCreate); } @@ -279,10 +265,9 @@ default Transaction newReplaceTableTransaction( * @throws NoSuchTableException if the table doesn't exist and orCreate is false */ default Transaction newReplaceTableTransaction( - TableIdentifier identifier, - Schema schema, - boolean orCreate) { - return newReplaceTableTransaction(identifier, schema, PartitionSpec.unpartitioned(), null, null, orCreate); + TableIdentifier identifier, Schema schema, boolean orCreate) { + return newReplaceTableTransaction( + identifier, schema, PartitionSpec.unpartitioned(), null, null, orCreate); } /** @@ -312,8 +297,8 @@ default boolean dropTable(TableIdentifier identifier) { /** * Drop a table; optionally delete data and metadata files. - *
- * If purge is set to true the implementation should delete all data and metadata files. + * + *
If purge is set to true the implementation should delete all data and metadata files. * * @param identifier a table identifier * @param purge if true, delete all data and metadata files in the table @@ -342,14 +327,13 @@ default boolean dropTable(TableIdentifier identifier) { /** * Invalidate cached table metadata from current catalog. - *
- * If the table is already loaded or cached, drop cached data. If the table does not exist or is - * not cached, do nothing. + * + *
If the table is already loaded or cached, drop cached data. If the table does not exist or + * is not cached, do nothing. * * @param identifier a table identifier */ - default void invalidateTable(TableIdentifier identifier) { - } + default void invalidateTable(TableIdentifier identifier) {} /** * Register a table with the catalog if it does not exist. @@ -364,34 +348,34 @@ default Table registerTable(TableIdentifier identifier, String metadataFileLocat } /** - /** - * Instantiate a builder to either create a table or start a create/replace transaction. + * /** Instantiate a builder to either create a table or start a create/replace transaction. * * @param identifier a table identifier * @param schema a schema * @return the builder to create a table or start a create/replace transaction */ default TableBuilder buildTable(TableIdentifier identifier, Schema schema) { - throw new UnsupportedOperationException(this.getClass().getName() + " does not implement buildTable"); + throw new UnsupportedOperationException( + this.getClass().getName() + " does not implement buildTable"); } /** * Initialize a catalog given a custom name and a map of catalog properties. - *
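The buildTable(...) entry point above returns a TableBuilder whose chaining mirrors the newReplaceTableTransaction default shown earlier in this file. A sketch, assuming the builder's terminal create() method from the wider Catalog API (only the with* setters and the transaction variants are visible in this diff); the identifier and property are placeholders:

import org.apache.iceberg.PartitionSpec;
import org.apache.iceberg.Schema;
import org.apache.iceberg.Table;
import org.apache.iceberg.catalog.Catalog;
import org.apache.iceberg.catalog.TableIdentifier;
import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap;

public class BuildTableExample {
  public static Table createEvents(Catalog catalog, Schema schema, PartitionSpec spec) {
    return catalog
        .buildTable(TableIdentifier.of("db", "events"), schema)
        .withPartitionSpec(spec)
        .withProperties(ImmutableMap.of("format-version", "2"))  // placeholder property
        .create();  // createTransaction()/createOrReplaceTransaction() return a Transaction instead
  }
}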
- * A custom Catalog implementation must have a no-arg constructor. - * A compute engine like Spark or Flink will first initialize the catalog without any arguments, - * and then call this method to complete catalog initialization with properties passed into the engine. + * + *
A custom Catalog implementation must have a no-arg constructor. A compute engine like Spark + * or Flink will first initialize the catalog without any arguments, and then call this method to + * complete catalog initialization with properties passed into the engine. * * @param name a custom name for the catalog * @param properties catalog properties */ - default void initialize(String name, Map properties) { - } + default void initialize(String name, Map properties) {} /** - * A builder used to create valid {@link Table tables} or start create/replace {@link Transaction transactions}. - *
- * Call {@link #buildTable(TableIdentifier, Schema)} to create a new builder. + * A builder used to create valid {@link Table tables} or start create/replace {@link Transaction + * transactions}. + * + *
Call {@link #buildTable(TableIdentifier, Schema)} to create a new builder. */ interface TableBuilder { /** @@ -429,7 +413,7 @@ interface TableBuilder { /** * Adds a key/value property to the table. * - * @param key a key + * @param key a key * @param value a value * @return this for method chaining */ diff --git a/api/src/main/java/org/apache/iceberg/catalog/Namespace.java b/api/src/main/java/org/apache/iceberg/catalog/Namespace.java index de237a9712b6..e66be71cfcae 100644 --- a/api/src/main/java/org/apache/iceberg/catalog/Namespace.java +++ b/api/src/main/java/org/apache/iceberg/catalog/Namespace.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.catalog; import java.util.Arrays; @@ -25,9 +24,7 @@ import org.apache.iceberg.relocated.com.google.common.base.Joiner; import org.apache.iceberg.relocated.com.google.common.base.Preconditions; -/** - * A namespace in a {@link Catalog}. - */ +/** A namespace in a {@link Catalog}. */ public class Namespace { private static final Namespace EMPTY_NAMESPACE = new Namespace(new String[] {}); private static final Joiner DOT = Joiner.on('.'); @@ -45,9 +42,9 @@ public static Namespace of(String... levels) { } for (String level : levels) { - Preconditions.checkNotNull(level, - "Cannot create a namespace with a null level"); - Preconditions.checkArgument(!CONTAINS_NULL_CHARACTER.test(level), + Preconditions.checkNotNull(level, "Cannot create a namespace with a null level"); + Preconditions.checkArgument( + !CONTAINS_NULL_CHARACTER.test(level), "Cannot create a namespace with the null-byte character"); } diff --git a/api/src/main/java/org/apache/iceberg/catalog/SessionCatalog.java b/api/src/main/java/org/apache/iceberg/catalog/SessionCatalog.java index aef26e67ba3f..5e0c0e230fcd 100644 --- a/api/src/main/java/org/apache/iceberg/catalog/SessionCatalog.java +++ b/api/src/main/java/org/apache/iceberg/catalog/SessionCatalog.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.catalog; import java.util.List; @@ -31,13 +30,9 @@ import org.apache.iceberg.exceptions.NoSuchTableException; import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap; -/** - * A Catalog API for table and namespace operations that includes session context. - */ +/** A Catalog API for table and namespace operations that includes session context. */ public interface SessionCatalog { - /** - * Context for a session. - */ + /** Context for a session. */ final class SessionContext { private final String sessionId; private final String identity; @@ -48,8 +43,11 @@ public static SessionContext createEmpty() { return new SessionContext(UUID.randomUUID().toString(), null, null, ImmutableMap.of()); } - public SessionContext(String sessionId, String identity, Map credentials, - Map properties) { + public SessionContext( + String sessionId, + String identity, + Map credentials, + Map properties) { this.sessionId = sessionId; this.identity = identity; this.credentials = credentials; @@ -58,8 +56,8 @@ public SessionContext(String sessionId, String identity, Map cre /** * Returns a string that identifies this session. - *
- * This can be used for caching state within a session. + * + *
This can be used for caching state within a session. * * @return a string that identifies this session */ @@ -69,8 +67,8 @@ public String sessionId() { /** * Returns a string that identifies the current user or principal. - *
- * This identity cannot change for a given session ID. + * + *
This identity cannot change for a given session ID. * * @return a user or principal identity string */ @@ -80,8 +78,8 @@ public String identity() { /** * Returns the session's credential map. - *
- * This cannot change for a given session ID. + * + *
This cannot change for a given session ID. * * @return a credential string */ @@ -180,8 +178,8 @@ default boolean tableExists(SessionContext context, TableIdentifier ident) { /** * Drop a table, without requesting that files are immediately deleted. - *
- * Data and metadata files should be deleted according to the catalog's policy. + * + *
Data and metadata files should be deleted according to the catalog's policy. * * @param context session context * @param ident a table identifier @@ -212,9 +210,9 @@ default boolean tableExists(SessionContext context, TableIdentifier ident) { /** * Invalidate cached table metadata from current catalog. - *
- * If the table is already loaded or cached, drop cached data. If the table does not exist or is - * not cached, do nothing. + * + *
If the table is already loaded or cached, drop cached data. If the table does not exist or + * is not cached, do nothing. * * @param context session context * @param ident a table identifier @@ -246,10 +244,10 @@ default void createNamespace(SessionContext context, Namespace namespace) { /** * List top-level namespaces from the catalog. - *
- * If an object such as a table, view, or function exists, its parent namespaces must also exist - * and must be returned by this discovery method. For example, if table a.b.t exists, this method - * must return ["a"] in the result array. + * + *
If an object such as a table, view, or function exists, its parent namespaces must also + * exist and must be returned by this discovery method. For example, if table a.b.t exists, this + * method must return ["a"] in the result array. * * @param context session context * @return an List of namespace {@link Namespace} names @@ -260,9 +258,9 @@ default List listNamespaces(SessionContext context) { /** * List namespaces from the namespace. - *
- * For example, if table a.b.t exists, use 'SELECT NAMESPACE IN a' this method - * must return Namepace.of("a","b") {@link Namespace}. + * + *
For example, if table a.b.t exists, use 'SELECT NAMESPACE IN a' this method must return + * Namepace.of("a","b") {@link Namespace}. * * @param context session context * @param namespace a {@link Namespace namespace} @@ -293,8 +291,8 @@ default List listNamespaces(SessionContext context) { /** * Set a collection of properties on a namespace in the catalog. - *
- * Properties that are not in the given map are not modified or removed by this method. + * + *
Properties that are not in the given map are not modified or removed by this method. * * @param context session context * @param namespace a {@link Namespace namespace} @@ -303,8 +301,11 @@ default List listNamespaces(SessionContext context) { * @throws NoSuchNamespaceException If the namespace does not exist (optional) * @throws UnsupportedOperationException If namespace properties are not supported */ - boolean updateNamespaceMetadata(SessionContext context, Namespace namespace, - Map updates, Set removals); + boolean updateNamespaceMetadata( + SessionContext context, + Namespace namespace, + Map updates, + Set removals); /** * Checks whether the Namespace exists. diff --git a/api/src/main/java/org/apache/iceberg/catalog/SupportsNamespaces.java b/api/src/main/java/org/apache/iceberg/catalog/SupportsNamespaces.java index 82e6739734aa..14cece611f76 100644 --- a/api/src/main/java/org/apache/iceberg/catalog/SupportsNamespaces.java +++ b/api/src/main/java/org/apache/iceberg/catalog/SupportsNamespaces.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.catalog; import java.util.List; @@ -29,13 +28,13 @@ /** * Catalog methods for working with namespaces. - *
- * If an object such as a table, view, or function exists, its parent namespaces must also exist - * and must be returned by the discovery methods {@link #listNamespaces()} and - * {@link #listNamespaces(Namespace namespace)}. - *
- * Catalog implementations are not required to maintain the existence of namespaces independent of - * objects in a namespace. For example, a function catalog that loads functions using reflection + * + *
If an object such as a table, view, or function exists, its parent namespaces must also exist + * and must be returned by the discovery methods {@link #listNamespaces()} and {@link + * #listNamespaces(Namespace namespace)}. + * + *
Catalog implementations are not required to maintain the existence of namespaces independent + * of objects in a namespace. For example, a function catalog that loads functions using reflection * and uses Java packages as namespaces is not required to support the methods to create, alter, or * drop a namespace. Implementations are allowed to discover the existence of objects or namespaces * without throwing {@link NoSuchNamespaceException} when no namespace is found. @@ -64,10 +63,10 @@ default void createNamespace(Namespace namespace) { /** * List top-level namespaces from the catalog. - *
- * If an object such as a table, view, or function exists, its parent namespaces must also exist - * and must be returned by this discovery method. For example, if table a.b.t exists, this method - * must return ["a"] in the result array. + * + *
If an object such as a table, view, or function exists, its parent namespaces must also + * exist and must be returned by this discovery method. For example, if table a.b.t exists, this + * method must return ["a"] in the result array. * * @return an List of namespace {@link Namespace} names */ @@ -77,9 +76,9 @@ default List listNamespaces() { /** * List namespaces from the namespace. - *
- * For example, if table a.b.t exists, use 'SELECT NAMESPACE IN a' this method - * must return Namepace.of("a","b") {@link Namespace}. + * + *
For example, if table a.b.t exists, use 'SELECT NAMESPACE IN a' this method must return + * Namepace.of("a","b") {@link Namespace}. * * @return a List of namespace {@link Namespace} names * @throws NoSuchNamespaceException If the namespace does not exist (optional) @@ -106,27 +105,29 @@ default List listNamespaces() { /** * Set a collection of properties on a namespace in the catalog. - *
- * Properties that are not in the given map are not modified or removed by this method. + * + *
Properties that are not in the given map are not modified or removed by this method. * * @param namespace a namespace. {@link Namespace} * @param properties a collection of metadata to apply to the namespace * @throws NoSuchNamespaceException If the namespace does not exist (optional) * @throws UnsupportedOperationException If namespace properties are not supported */ - boolean setProperties(Namespace namespace, Map properties) throws NoSuchNamespaceException; + boolean setProperties(Namespace namespace, Map properties) + throws NoSuchNamespaceException; /** * Remove a set of property keys from a namespace in the catalog. - *
- * Properties that are not in the given set are not modified or removed by this method. + * + *
Properties that are not in the given set are not modified or removed by this method. * * @param namespace a namespace. {@link Namespace} * @param properties a collection of metadata to apply to the namespace * @throws NoSuchNamespaceException If the namespace does not exist (optional) * @throws UnsupportedOperationException If namespace properties are not supported */ - boolean removeProperties(Namespace namespace, Set properties) throws NoSuchNamespaceException; + boolean removeProperties(Namespace namespace, Set properties) + throws NoSuchNamespaceException; /** * Checks whether the Namespace exists. diff --git a/api/src/main/java/org/apache/iceberg/catalog/TableIdentifier.java b/api/src/main/java/org/apache/iceberg/catalog/TableIdentifier.java index ecbaa0646f84..8531fc63615a 100644 --- a/api/src/main/java/org/apache/iceberg/catalog/TableIdentifier.java +++ b/api/src/main/java/org/apache/iceberg/catalog/TableIdentifier.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.catalog; import java.util.Arrays; @@ -25,9 +24,7 @@ import org.apache.iceberg.relocated.com.google.common.base.Splitter; import org.apache.iceberg.relocated.com.google.common.collect.Iterables; -/** - * Identifies a table in iceberg catalog. - */ +/** Identifies a table in iceberg catalog. */ public class TableIdentifier { private static final Splitter DOT = Splitter.on('.'); @@ -37,8 +34,10 @@ public class TableIdentifier { public static TableIdentifier of(String... names) { Preconditions.checkArgument(names != null, "Cannot create table identifier from null array"); - Preconditions.checkArgument(names.length > 0, "Cannot create table identifier without a table name"); - return new TableIdentifier(Namespace.of(Arrays.copyOf(names, names.length - 1)), names[names.length - 1]); + Preconditions.checkArgument( + names.length > 0, "Cannot create table identifier without a table name"); + return new TableIdentifier( + Namespace.of(Arrays.copyOf(names, names.length - 1)), names[names.length - 1]); } public static TableIdentifier of(Namespace namespace, String name) { @@ -52,7 +51,8 @@ public static TableIdentifier parse(String identifier) { } private TableIdentifier(Namespace namespace, String name) { - Preconditions.checkArgument(name != null && !name.isEmpty(), "Invalid table name: null or empty"); + Preconditions.checkArgument( + name != null && !name.isEmpty(), "Invalid table name: null or empty"); Preconditions.checkArgument(namespace != null, "Invalid Namespace: null"); this.namespace = namespace; this.name = name; @@ -60,30 +60,26 @@ private TableIdentifier(Namespace namespace, String name) { /** * Whether the namespace is empty. + * * @return true if the namespace is not empty, false otherwise */ public boolean hasNamespace() { return !namespace.isEmpty(); } - /** - * Returns the identifier namespace. - */ + /** Returns the identifier namespace. */ public Namespace namespace() { return namespace; } - /** - * Returns the identifier name. - */ + /** Returns the identifier name. 
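The identifier utilities in the TableIdentifier hunk above can be exercised directly. Everything used below (Namespace.of, the of/parse factories, hasNamespace, namespace, name, toLowerCase) appears in this diff; the names and the printed values are illustrative only:

import org.apache.iceberg.catalog.Namespace;
import org.apache.iceberg.catalog.TableIdentifier;

public class IdentifierExample {
  public static void main(String[] args) {
    Namespace ns = Namespace.of("prod", "db");                        // two-level namespace
    TableIdentifier fromParts = TableIdentifier.of(ns, "Events");
    TableIdentifier parsed = TableIdentifier.parse("prod.db.Events"); // split on '.'

    System.out.println(fromParts.hasNamespace());     // true
    System.out.println(parsed.namespace());           // prod.db
    System.out.println(parsed.toLowerCase().name());  // events
  }
}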
*/ public String name() { return name; } public TableIdentifier toLowerCase() { - String[] newLevels = Arrays.stream(namespace().levels()) - .map(String::toLowerCase) - .toArray(String[]::new); + String[] newLevels = + Arrays.stream(namespace().levels()).map(String::toLowerCase).toArray(String[]::new); String newName = name().toLowerCase(); return TableIdentifier.of(Namespace.of(newLevels), newName); } diff --git a/api/src/main/java/org/apache/iceberg/data/Record.java b/api/src/main/java/org/apache/iceberg/data/Record.java index 496201ceb159..73448ec1e801 100644 --- a/api/src/main/java/org/apache/iceberg/data/Record.java +++ b/api/src/main/java/org/apache/iceberg/data/Record.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.data; import java.util.Map; @@ -50,12 +49,12 @@ default Record copy(String field1, Object value1, String field2, Object value2) return copy(overwriteValues); } - default Record copy(String field1, Object value1, String field2, Object value2, String field3, Object value3) { + default Record copy( + String field1, Object value1, String field2, Object value2, String field3, Object value3) { Map overwriteValues = Maps.newHashMapWithExpectedSize(3); overwriteValues.put(field1, value1); overwriteValues.put(field2, value2); overwriteValues.put(field3, value3); return copy(overwriteValues); } - } diff --git a/api/src/main/java/org/apache/iceberg/encryption/EncryptedInputFile.java b/api/src/main/java/org/apache/iceberg/encryption/EncryptedInputFile.java index e990d1f5bf3a..4a0b77062a8a 100644 --- a/api/src/main/java/org/apache/iceberg/encryption/EncryptedInputFile.java +++ b/api/src/main/java/org/apache/iceberg/encryption/EncryptedInputFile.java @@ -16,23 +16,19 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.encryption; import org.apache.iceberg.io.InputFile; /** * Thin wrapper around an {@link InputFile} instance that is encrypted. - *
- * The {@link EncryptionManager} takes instances of these and uses the attached - * {@link #keyMetadata()} to find an encryption key and decrypt the enclosed - * {@link #encryptedInputFile()}. + * + *
The {@link EncryptionManager} takes instances of these and uses the attached {@link + * #keyMetadata()} to find an encryption key and decrypt the enclosed {@link #encryptedInputFile()}. */ public interface EncryptedInputFile { - /** - * The {@link InputFile} that is reading raw encrypted bytes from the underlying file system. - */ + /** The {@link InputFile} that is reading raw encrypted bytes from the underlying file system. */ InputFile encryptedInputFile(); /** diff --git a/api/src/main/java/org/apache/iceberg/encryption/EncryptedOutputFile.java b/api/src/main/java/org/apache/iceberg/encryption/EncryptedOutputFile.java index d05033ebe150..1686342c776d 100644 --- a/api/src/main/java/org/apache/iceberg/encryption/EncryptedOutputFile.java +++ b/api/src/main/java/org/apache/iceberg/encryption/EncryptedOutputFile.java @@ -16,29 +16,25 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.encryption; import org.apache.iceberg.io.OutputFile; /** - * Thin wrapper around a {@link OutputFile} that is encrypting bytes written to the underlying - * file system, via an encryption key that is symbolized by the enclosed - * {@link EncryptionKeyMetadata}. - *
- * The {@link EncryptionManager} returns instances of these when passed output files that should + * Thin wrapper around a {@link OutputFile} that is encrypting bytes written to the underlying file + * system, via an encryption key that is symbolized by the enclosed {@link EncryptionKeyMetadata}. + * + *
The {@link EncryptionManager} returns instances of these when passed output files that should * be encrypted as they are being written to the backing file system. */ public interface EncryptedOutputFile { - /** - * An OutputFile instance that encrypts the bytes that are written to its output streams. - */ + /** An OutputFile instance that encrypts the bytes that are written to its output streams. */ OutputFile encryptingOutputFile(); /** - * Metadata about the encryption key that is being used to encrypt the associated - * {@link #encryptingOutputFile()}. + * Metadata about the encryption key that is being used to encrypt the associated {@link + * #encryptingOutputFile()}. */ EncryptionKeyMetadata keyMetadata(); } diff --git a/api/src/main/java/org/apache/iceberg/encryption/EncryptionKeyMetadata.java b/api/src/main/java/org/apache/iceberg/encryption/EncryptionKeyMetadata.java index 02ff8e212085..1ce1c337a809 100644 --- a/api/src/main/java/org/apache/iceberg/encryption/EncryptionKeyMetadata.java +++ b/api/src/main/java/org/apache/iceberg/encryption/EncryptionKeyMetadata.java @@ -16,38 +16,36 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.encryption; import java.nio.ByteBuffer; /** - * Light typedef over a ByteBuffer that indicates that the given bytes represent metadata about - * an encrypted data file's encryption key. - *
- * This is preferred over passing a ByteBuffer directly in order to be more explicit. + * Light typedef over a ByteBuffer that indicates that the given bytes represent metadata about an + * encrypted data file's encryption key. + * + *
This is preferred over passing a ByteBuffer directly in order to be more explicit. */ public interface EncryptionKeyMetadata { - EncryptionKeyMetadata EMPTY = new EncryptionKeyMetadata() { - @Override - public ByteBuffer buffer() { - return null; - } + EncryptionKeyMetadata EMPTY = + new EncryptionKeyMetadata() { + @Override + public ByteBuffer buffer() { + return null; + } - @Override - public EncryptionKeyMetadata copy() { - return this; - } - }; + @Override + public EncryptionKeyMetadata copy() { + return this; + } + }; static EncryptionKeyMetadata empty() { return EMPTY; } - /** - * Opaque blob representing metadata about a file's encryption key. - */ + /** Opaque blob representing metadata about a file's encryption key. */ ByteBuffer buffer(); EncryptionKeyMetadata copy(); diff --git a/api/src/main/java/org/apache/iceberg/encryption/EncryptionManager.java b/api/src/main/java/org/apache/iceberg/encryption/EncryptionManager.java index 97f6f639311b..22d2858599a8 100644 --- a/api/src/main/java/org/apache/iceberg/encryption/EncryptionManager.java +++ b/api/src/main/java/org/apache/iceberg/encryption/EncryptionManager.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.encryption; import java.io.Serializable; @@ -26,25 +25,25 @@ /** * Module for encrypting and decrypting table data files. - *
- * This must be serializable because an instance may be instantiated in one place and sent across + * + *
This must be serializable because an instance may be instantiated in one place and sent across * the wire in some Iceberg integrations, notably Spark. */ public interface EncryptionManager extends Serializable { /** * Given an {@link EncryptedInputFile#encryptedInputFile()} representing the raw encrypted bytes - * from the underlying file system, and given metadata about how the file was encrypted via - * {@link EncryptedInputFile#keyMetadata()}, return an {@link InputFile} that returns decrypted - * input streams. + * from the underlying file system, and given metadata about how the file was encrypted via {@link + * EncryptedInputFile#keyMetadata()}, return an {@link InputFile} that returns decrypted input + * streams. */ InputFile decrypt(EncryptedInputFile encrypted); /** - * Variant of {@link #decrypt(EncryptedInputFile)} that provides a sequence of files that all - * need to be decrypted in a single context. - *
- * By default this calls the single-file decryption method for each element in the iterator. + * Variant of {@link #decrypt(EncryptedInputFile)} that provides a sequence of files that all need + * to be decrypted in a single context. + * + *
By default this calls the single-file decryption method for each element in the iterator. * Implementations can override this for a variety of optimizations. For example, an * implementation can perform lookahead on the input iterator and fetch encryption keys in batch. */ @@ -63,8 +62,8 @@ default Iterable decrypt(Iterable encrypted) { /** * Variant of {@link #encrypt(OutputFile)} that provides a sequence of files that all need to be * encrypted in a single context. - *
- * By default this calls the single-file encryption method for each element in the iterator. + * + *
By default this calls the single-file encryption method for each element in the iterator. * Implementations can override this for a variety of optimizations. For example, an * implementation can perform lookahead on the input iterator and fetch encryption keys in batch. */ diff --git a/api/src/main/java/org/apache/iceberg/encryption/KmsClient.java b/api/src/main/java/org/apache/iceberg/encryption/KmsClient.java index 2a546a5e1252..3e115d8b2ceb 100644 --- a/api/src/main/java/org/apache/iceberg/encryption/KmsClient.java +++ b/api/src/main/java/org/apache/iceberg/encryption/KmsClient.java @@ -16,22 +16,19 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.encryption; import java.io.Serializable; import java.nio.ByteBuffer; import java.util.Map; -/** - * A minimum client interface to connect to a key management service (KMS). - */ +/** A minimum client interface to connect to a key management service (KMS). */ public interface KmsClient extends Serializable { /** * Wrap a secret key, using a wrapping/master key which is stored in KMS and referenced by an ID. - * Wrapping means encryption of the secret key with the master key, and adding optional KMS-specific metadata - * that allows the KMS to decrypt the secret key in an unwrapping call. + * Wrapping means encryption of the secret key with the master key, and adding optional + * KMS-specific metadata that allows the KMS to decrypt the secret key in an unwrapping call. * * @param key a secret key being wrapped * @param wrappingKeyId a key ID that represents a wrapping key stored in KMS @@ -42,19 +39,19 @@ public interface KmsClient extends Serializable { /** * Some KMS systems support generation of secret keys inside the KMS server. * - * @return true if KMS server supports key generation and KmsClient implementation - * is interested to leverage this capability. Otherwise, return false - Iceberg will - * then generate secret keys locally (using the SecureRandom mechanism) and call - * {@link #wrapKey(ByteBuffer, String)} to wrap them in KMS. + * @return true if KMS server supports key generation and KmsClient implementation is interested + * to leverage this capability. Otherwise, return false - Iceberg will then generate secret + * keys locally (using the SecureRandom mechanism) and call {@link #wrapKey(ByteBuffer, + * String)} to wrap them in KMS. */ default boolean supportsKeyGeneration() { return false; } /** - * Generate a new secret key in the KMS server, and wrap it using a wrapping/master key - * which is stored in KMS and referenced by an ID. This method will be called only if - * supportsKeyGeneration returns true. + * Generate a new secret key in the KMS server, and wrap it using a wrapping/master key which is + * stored in KMS and referenced by an ID. This method will be called only if supportsKeyGeneration + * returns true. * * @param wrappingKeyId a key ID that represents a wrapping key stored in KMS * @return key in two forms: raw, and wrapped with the given wrappingKeyId @@ -64,9 +61,11 @@ default KeyGenerationResult generateKey(String wrappingKeyId) { } /** - * Unwrap a secret key, using a wrapping/master key which is stored in KMS and referenced by an ID. + * Unwrap a secret key, using a wrapping/master key which is stored in KMS and referenced by an + * ID. 
* - * @param wrappedKey wrapped key material (encrypted key and optional KMS metadata, returned by the wrapKey method) + * @param wrappedKey wrapped key material (encrypted key and optional KMS metadata, returned by + * the wrapKey method) * @param wrappingKeyId a key ID that represents a wrapping key stored in KMS * @return raw key bytes */ @@ -80,8 +79,8 @@ default KeyGenerationResult generateKey(String wrappingKeyId) { void initialize(Map properties); /** - * For KMS systems that support key generation, this class keeps the key generation result - - * the raw secret key, and its wrap. + * For KMS systems that support key generation, this class keeps the key generation result - the + * raw secret key, and its wrap. */ class KeyGenerationResult { private final ByteBuffer key; diff --git a/api/src/main/java/org/apache/iceberg/events/IncrementalScanEvent.java b/api/src/main/java/org/apache/iceberg/events/IncrementalScanEvent.java index f9265fa71f7d..d72bb682cad2 100644 --- a/api/src/main/java/org/apache/iceberg/events/IncrementalScanEvent.java +++ b/api/src/main/java/org/apache/iceberg/events/IncrementalScanEvent.java @@ -16,15 +16,12 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.events; import org.apache.iceberg.Schema; import org.apache.iceberg.expressions.Expression; -/** - * Event sent to listeners when an incremental table scan is planned. - */ +/** Event sent to listeners when an incremental table scan is planned. */ public final class IncrementalScanEvent { private final String tableName; private final long fromSnapshotId; @@ -33,8 +30,13 @@ public final class IncrementalScanEvent { private final Schema projection; private final boolean fromSnapshotInclusive; - public IncrementalScanEvent(String tableName, long fromSnapshotId, long toSnapshotId, Expression filter, - Schema projection, boolean fromSnapshotInclusive) { + public IncrementalScanEvent( + String tableName, + long fromSnapshotId, + long toSnapshotId, + Expression filter, + Schema projection, + boolean fromSnapshotInclusive) { this.tableName = tableName; this.fromSnapshotId = fromSnapshotId; this.toSnapshotId = toSnapshotId; diff --git a/api/src/main/java/org/apache/iceberg/events/Listener.java b/api/src/main/java/org/apache/iceberg/events/Listener.java index 843a1de91402..68fd09f1c05f 100644 --- a/api/src/main/java/org/apache/iceberg/events/Listener.java +++ b/api/src/main/java/org/apache/iceberg/events/Listener.java @@ -16,12 +16,9 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.events; -/** - * A listener interface that can receive notifications. - */ +/** A listener interface that can receive notifications. */ public interface Listener { void notify(E event); } diff --git a/api/src/main/java/org/apache/iceberg/events/Listeners.java b/api/src/main/java/org/apache/iceberg/events/Listeners.java index 23ecd1e26dba..27c9c0590658 100644 --- a/api/src/main/java/org/apache/iceberg/events/Listeners.java +++ b/api/src/main/java/org/apache/iceberg/events/Listeners.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.events; import java.util.Map; @@ -25,17 +24,15 @@ import org.apache.iceberg.relocated.com.google.common.base.Preconditions; import org.apache.iceberg.relocated.com.google.common.collect.Maps; -/** - * Static registration and notification for listeners. 
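The Listeners registry whose diff begins just below takes a Listener implementation keyed by event type. A minimal sketch; register(...) matches the signature shown in that hunk, while the ScanEvent accessor used for logging (tableName()) is assumed from the wider events package rather than shown here:

import org.apache.iceberg.events.Listener;
import org.apache.iceberg.events.Listeners;
import org.apache.iceberg.events.ScanEvent;

public class ScanEventLogger implements Listener<ScanEvent> {
  @Override
  public void notify(ScanEvent event) {
    // invoked once for every table scan that is planned
    System.out.println("Planned scan of " + event.tableName());  // tableName() assumed from ScanEvent
  }

  public static void install() {
    // registers this listener for all ScanEvent notifications
    Listeners.register(new ScanEventLogger(), ScanEvent.class);
  }
}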
- */ +/** Static registration and notification for listeners. */ public class Listeners { - private Listeners() { - } + private Listeners() {} private static final Map, Queue>> listeners = Maps.newConcurrentMap(); public static void register(Listener listener, Class eventType) { - Queue> list = listeners.computeIfAbsent(eventType, k -> new ConcurrentLinkedQueue<>()); + Queue> list = + listeners.computeIfAbsent(eventType, k -> new ConcurrentLinkedQueue<>()); list.add(listener); } diff --git a/api/src/main/java/org/apache/iceberg/events/ScanEvent.java b/api/src/main/java/org/apache/iceberg/events/ScanEvent.java index 5c07de849bd7..64fb63874e23 100644 --- a/api/src/main/java/org/apache/iceberg/events/ScanEvent.java +++ b/api/src/main/java/org/apache/iceberg/events/ScanEvent.java @@ -16,15 +16,12 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.events; import org.apache.iceberg.Schema; import org.apache.iceberg.expressions.Expression; -/** - * Event sent to listeners when a table scan is planned. - */ +/** Event sent to listeners when a table scan is planned. */ public final class ScanEvent { private final String tableName; private final long snapshotId; diff --git a/api/src/main/java/org/apache/iceberg/exceptions/AlreadyExistsException.java b/api/src/main/java/org/apache/iceberg/exceptions/AlreadyExistsException.java index ef34103c1b4b..5edc1cf01ebc 100644 --- a/api/src/main/java/org/apache/iceberg/exceptions/AlreadyExistsException.java +++ b/api/src/main/java/org/apache/iceberg/exceptions/AlreadyExistsException.java @@ -16,14 +16,11 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.exceptions; import com.google.errorprone.annotations.FormatMethod; -/** - * Exception raised when attempting to create a table that already exists. - */ +/** Exception raised when attempting to create a table that already exists. */ public class AlreadyExistsException extends RuntimeException { @FormatMethod public AlreadyExistsException(String message, Object... args) { diff --git a/api/src/main/java/org/apache/iceberg/exceptions/BadRequestException.java b/api/src/main/java/org/apache/iceberg/exceptions/BadRequestException.java index 9cacee3e51aa..54c47eda91c8 100644 --- a/api/src/main/java/org/apache/iceberg/exceptions/BadRequestException.java +++ b/api/src/main/java/org/apache/iceberg/exceptions/BadRequestException.java @@ -16,14 +16,11 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.exceptions; import com.google.errorprone.annotations.FormatMethod; -/** - * Exception thrown on HTTP 400 - Bad Request - */ +/** Exception thrown on HTTP 400 - Bad Request */ public class BadRequestException extends RuntimeException { @FormatMethod public BadRequestException(String message, Object... args) { diff --git a/api/src/main/java/org/apache/iceberg/exceptions/CherrypickAncestorCommitException.java b/api/src/main/java/org/apache/iceberg/exceptions/CherrypickAncestorCommitException.java index 11b875aaf876..082db7a3a4a8 100644 --- a/api/src/main/java/org/apache/iceberg/exceptions/CherrypickAncestorCommitException.java +++ b/api/src/main/java/org/apache/iceberg/exceptions/CherrypickAncestorCommitException.java @@ -16,12 +16,12 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.exceptions; /** - * This exception occurs when one cherrypicks an ancestor or when the picked snapshot is already linked to - * a published ancestor. This additionally helps avoid duplicate cherrypicks on non-WAP snapshots. + * This exception occurs when one cherrypicks an ancestor or when the picked snapshot is already + * linked to a published ancestor. This additionally helps avoid duplicate cherrypicks on non-WAP + * snapshots. */ public class CherrypickAncestorCommitException extends ValidationException { @@ -30,7 +30,8 @@ public CherrypickAncestorCommitException(long snapshotId) { } public CherrypickAncestorCommitException(long snapshotId, long publishedAncestorId) { - super("Cannot cherrypick snapshot %s: already picked to create ancestor %s", + super( + "Cannot cherrypick snapshot %s: already picked to create ancestor %s", String.valueOf(snapshotId), String.valueOf(publishedAncestorId)); } } diff --git a/api/src/main/java/org/apache/iceberg/exceptions/CommitFailedException.java b/api/src/main/java/org/apache/iceberg/exceptions/CommitFailedException.java index 8dc72f4481a9..ca9d1e4a5bfb 100644 --- a/api/src/main/java/org/apache/iceberg/exceptions/CommitFailedException.java +++ b/api/src/main/java/org/apache/iceberg/exceptions/CommitFailedException.java @@ -16,14 +16,11 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.exceptions; import com.google.errorprone.annotations.FormatMethod; -/** - * Exception raised when a commit fails because of out of date metadata. - */ +/** Exception raised when a commit fails because of out of date metadata. */ public class CommitFailedException extends RuntimeException { @FormatMethod public CommitFailedException(String message, Object... args) { diff --git a/api/src/main/java/org/apache/iceberg/exceptions/CommitStateUnknownException.java b/api/src/main/java/org/apache/iceberg/exceptions/CommitStateUnknownException.java index d6e36f5031c2..2ecb97fe125a 100644 --- a/api/src/main/java/org/apache/iceberg/exceptions/CommitStateUnknownException.java +++ b/api/src/main/java/org/apache/iceberg/exceptions/CommitStateUnknownException.java @@ -16,22 +16,21 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.exceptions; /** - * Exception for a failure to confirm either affirmatively or negatively that a commit was applied. The client - * cannot take any further action without possibly corrupting the table. + * Exception for a failure to confirm either affirmatively or negatively that a commit was applied. + * The client cannot take any further action without possibly corrupting the table. */ public class CommitStateUnknownException extends RuntimeException { private static final String COMMON_INFO = - "Cannot determine whether the commit was successful or not, the underlying data files may or " + - "may not be needed. Manual intervention via the Remove Orphan Files Action can remove these " + - "files when a connection to the Catalog can be re-established if the commit was actually unsuccessful.\n" + - "Please check to see whether or not your commit was successful before retrying this commit. 
Retrying " + - "an already successful operation will result in duplicate records or unintentional modifications.\n" + - "At this time no files will be deleted including possibly unused manifest lists."; + "Cannot determine whether the commit was successful or not, the underlying data files may or " + + "may not be needed. Manual intervention via the Remove Orphan Files Action can remove these " + + "files when a connection to the Catalog can be re-established if the commit was actually unsuccessful.\n" + + "Please check to see whether or not your commit was successful before retrying this commit. Retrying " + + "an already successful operation will result in duplicate records or unintentional modifications.\n" + + "At this time no files will be deleted including possibly unused manifest lists."; public CommitStateUnknownException(Throwable cause) { super(cause.getMessage() + "\n" + COMMON_INFO, cause); diff --git a/api/src/main/java/org/apache/iceberg/exceptions/DuplicateWAPCommitException.java b/api/src/main/java/org/apache/iceberg/exceptions/DuplicateWAPCommitException.java index e32f36e4de6f..2af1978614db 100644 --- a/api/src/main/java/org/apache/iceberg/exceptions/DuplicateWAPCommitException.java +++ b/api/src/main/java/org/apache/iceberg/exceptions/DuplicateWAPCommitException.java @@ -16,12 +16,11 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.exceptions; /** - * This exception occurs when the WAP workflow detects a duplicate wap commit. This helps clients - * to detect duplicate snapshots that are connected by the same wap id. + * This exception occurs when the WAP workflow detects a duplicate wap commit. This helps clients to + * detect duplicate snapshots that are connected by the same wap id. */ public class DuplicateWAPCommitException extends ValidationException { diff --git a/api/src/main/java/org/apache/iceberg/exceptions/ForbiddenException.java b/api/src/main/java/org/apache/iceberg/exceptions/ForbiddenException.java index 5be5e16a99e3..0babdb48c77d 100644 --- a/api/src/main/java/org/apache/iceberg/exceptions/ForbiddenException.java +++ b/api/src/main/java/org/apache/iceberg/exceptions/ForbiddenException.java @@ -16,14 +16,11 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.exceptions; import com.google.errorprone.annotations.FormatMethod; -/** - * Exception thrown on HTTP 403 Forbidden - Failed authorization checks. - */ +/** Exception thrown on HTTP 403 Forbidden - Failed authorization checks. */ public class ForbiddenException extends RuntimeException { @FormatMethod public ForbiddenException(String message, Object... args) { diff --git a/api/src/main/java/org/apache/iceberg/exceptions/NamespaceNotEmptyException.java b/api/src/main/java/org/apache/iceberg/exceptions/NamespaceNotEmptyException.java index 3c2dda8f76ed..c4f7b5bd4143 100644 --- a/api/src/main/java/org/apache/iceberg/exceptions/NamespaceNotEmptyException.java +++ b/api/src/main/java/org/apache/iceberg/exceptions/NamespaceNotEmptyException.java @@ -16,14 +16,11 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.exceptions; import com.google.errorprone.annotations.FormatMethod; -/** - * Exception raised when attempting to drop a namespace that is not empty. - */ +/** Exception raised when attempting to drop a namespace that is not empty. 
*/ public class NamespaceNotEmptyException extends RuntimeException { @FormatMethod public NamespaceNotEmptyException(String message, Object... args) { diff --git a/api/src/main/java/org/apache/iceberg/exceptions/NoSuchIcebergTableException.java b/api/src/main/java/org/apache/iceberg/exceptions/NoSuchIcebergTableException.java index cb137216e00d..fb4da52a4439 100644 --- a/api/src/main/java/org/apache/iceberg/exceptions/NoSuchIcebergTableException.java +++ b/api/src/main/java/org/apache/iceberg/exceptions/NoSuchIcebergTableException.java @@ -16,14 +16,11 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.exceptions; import com.google.errorprone.annotations.FormatMethod; -/** - * NoSuchTableException thrown when a table is found but it is not an Iceberg table. - */ +/** NoSuchTableException thrown when a table is found but it is not an Iceberg table. */ public class NoSuchIcebergTableException extends NoSuchTableException { @FormatMethod public NoSuchIcebergTableException(String message, Object... args) { diff --git a/api/src/main/java/org/apache/iceberg/exceptions/NoSuchNamespaceException.java b/api/src/main/java/org/apache/iceberg/exceptions/NoSuchNamespaceException.java index 3710d6c52392..ce8bb8d6a224 100644 --- a/api/src/main/java/org/apache/iceberg/exceptions/NoSuchNamespaceException.java +++ b/api/src/main/java/org/apache/iceberg/exceptions/NoSuchNamespaceException.java @@ -16,14 +16,11 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.exceptions; import com.google.errorprone.annotations.FormatMethod; -/** - * Exception raised when attempting to load a namespace that does not exist. - */ +/** Exception raised when attempting to load a namespace that does not exist. */ public class NoSuchNamespaceException extends RuntimeException { @FormatMethod public NoSuchNamespaceException(String message, Object... args) { diff --git a/api/src/main/java/org/apache/iceberg/exceptions/NoSuchTableException.java b/api/src/main/java/org/apache/iceberg/exceptions/NoSuchTableException.java index cd432368a6c9..54b270d93ab1 100644 --- a/api/src/main/java/org/apache/iceberg/exceptions/NoSuchTableException.java +++ b/api/src/main/java/org/apache/iceberg/exceptions/NoSuchTableException.java @@ -16,14 +16,11 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.exceptions; import com.google.errorprone.annotations.FormatMethod; -/** - * Exception raised when attempting to load a table that does not exist. - */ +/** Exception raised when attempting to load a table that does not exist. */ public class NoSuchTableException extends RuntimeException { @FormatMethod public NoSuchTableException(String message, Object... args) { diff --git a/api/src/main/java/org/apache/iceberg/exceptions/NotAuthorizedException.java b/api/src/main/java/org/apache/iceberg/exceptions/NotAuthorizedException.java index 0713bbf5b837..208c2f460a43 100644 --- a/api/src/main/java/org/apache/iceberg/exceptions/NotAuthorizedException.java +++ b/api/src/main/java/org/apache/iceberg/exceptions/NotAuthorizedException.java @@ -16,14 +16,13 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.exceptions; import com.google.errorprone.annotations.FormatMethod; /** - * Exception thrown on HTTP 401 Unauthorized. - * The user is either not authenticated or failed authorization checks. 
+ * Exception thrown on HTTP 401 Unauthorized. The user is either not authenticated or failed + * authorization checks. */ public class NotAuthorizedException extends RESTException { @FormatMethod diff --git a/api/src/main/java/org/apache/iceberg/exceptions/NotFoundException.java b/api/src/main/java/org/apache/iceberg/exceptions/NotFoundException.java index 35b9e8798ce6..25160d800c30 100644 --- a/api/src/main/java/org/apache/iceberg/exceptions/NotFoundException.java +++ b/api/src/main/java/org/apache/iceberg/exceptions/NotFoundException.java @@ -16,14 +16,11 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.exceptions; import com.google.errorprone.annotations.FormatMethod; -/** - * Exception raised when attempting to read a file that does not exist. - */ +/** Exception raised when attempting to read a file that does not exist. */ public class NotFoundException extends RuntimeException { @FormatMethod public NotFoundException(String message, Object... args) { diff --git a/api/src/main/java/org/apache/iceberg/exceptions/RESTException.java b/api/src/main/java/org/apache/iceberg/exceptions/RESTException.java index 3572903f34a6..a61f468583ce 100644 --- a/api/src/main/java/org/apache/iceberg/exceptions/RESTException.java +++ b/api/src/main/java/org/apache/iceberg/exceptions/RESTException.java @@ -16,14 +16,11 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.exceptions; import com.google.errorprone.annotations.FormatMethod; -/** - * Base class for REST client exceptions - */ +/** Base class for REST client exceptions */ public class RESTException extends RuntimeException { @FormatMethod public RESTException(String message, Object... args) { diff --git a/api/src/main/java/org/apache/iceberg/exceptions/RuntimeIOException.java b/api/src/main/java/org/apache/iceberg/exceptions/RuntimeIOException.java index 984460072c07..ddf16abad4a8 100644 --- a/api/src/main/java/org/apache/iceberg/exceptions/RuntimeIOException.java +++ b/api/src/main/java/org/apache/iceberg/exceptions/RuntimeIOException.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.exceptions; import com.google.errorprone.annotations.FormatMethod; @@ -25,8 +24,7 @@ /** * @deprecated Use java.io.UncheckedIOException directly instead. - * - * Exception used to wrap {@link IOException} as a {@link RuntimeException} and add context. + *

Exception used to wrap {@link IOException} as a {@link RuntimeException} and add context. */ @Deprecated public class RuntimeIOException extends UncheckedIOException { @@ -41,7 +39,7 @@ public RuntimeIOException(IOException cause, String message, Object... args) { } @FormatMethod - public RuntimeIOException(String message, Object...args) { + public RuntimeIOException(String message, Object... args) { super(new IOException(String.format(message, args))); } } diff --git a/api/src/main/java/org/apache/iceberg/exceptions/ServiceFailureException.java b/api/src/main/java/org/apache/iceberg/exceptions/ServiceFailureException.java index 9049d725efb3..b3395f80fca7 100644 --- a/api/src/main/java/org/apache/iceberg/exceptions/ServiceFailureException.java +++ b/api/src/main/java/org/apache/iceberg/exceptions/ServiceFailureException.java @@ -16,14 +16,11 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.exceptions; import com.google.errorprone.annotations.FormatMethod; -/** - * Exception thrown on HTTP 5XX Server Error. - */ +/** Exception thrown on HTTP 5XX Server Error. */ public class ServiceFailureException extends RuntimeException { @FormatMethod public ServiceFailureException(String message, Object... args) { diff --git a/api/src/main/java/org/apache/iceberg/exceptions/UnprocessableEntityException.java b/api/src/main/java/org/apache/iceberg/exceptions/UnprocessableEntityException.java index 9c5e0f8852bd..90076fd23159 100644 --- a/api/src/main/java/org/apache/iceberg/exceptions/UnprocessableEntityException.java +++ b/api/src/main/java/org/apache/iceberg/exceptions/UnprocessableEntityException.java @@ -16,15 +16,15 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.exceptions; import com.google.errorprone.annotations.FormatMethod; /** * REST exception thrown when a request is well-formed but cannot be applied. - *

- * For example, this is used when a property update requests that properties are simultaneously set and removed. + * + *

For example, this is used when a property update requests that properties are simultaneously + * set and removed. */ public class UnprocessableEntityException extends RESTException { @FormatMethod diff --git a/api/src/main/java/org/apache/iceberg/exceptions/ValidationException.java b/api/src/main/java/org/apache/iceberg/exceptions/ValidationException.java index a4489475a6d8..dc40469df0c5 100644 --- a/api/src/main/java/org/apache/iceberg/exceptions/ValidationException.java +++ b/api/src/main/java/org/apache/iceberg/exceptions/ValidationException.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.exceptions; import com.google.errorprone.annotations.FormatMethod; @@ -25,13 +24,13 @@ /** * Exception which is raised when the arguments are valid in isolation, but not in conjunction with - * other arguments or state, as opposed to {@link IllegalArgumentException} which is raised when - * an argument value is always invalid. - *

- * A ValidationException will cause the operation to abort. - *

- * For example, this is thrown when attempting to create a table with a {@link PartitionSpec} that - * is not compatible with the table {@link Schema} + * other arguments or state, as opposed to {@link IllegalArgumentException} which is raised when an + * argument value is always invalid. + * + *

A ValidationException will cause the operation to abort. + * + *

For example, this is thrown when attempting to create a table with a {@link PartitionSpec} + * that is not compatible with the table {@link Schema} */ public class ValidationException extends RuntimeException { @FormatMethod diff --git a/api/src/main/java/org/apache/iceberg/expressions/And.java b/api/src/main/java/org/apache/iceberg/expressions/And.java index 2fa899ba132a..ec310264da08 100644 --- a/api/src/main/java/org/apache/iceberg/expressions/And.java +++ b/api/src/main/java/org/apache/iceberg/expressions/And.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.expressions; public class And implements Expression { @@ -45,8 +44,8 @@ public Operation op() { public boolean isEquivalentTo(Expression expr) { if (expr.op() == Operation.AND) { And other = (And) expr; - return (left.isEquivalentTo(other.left()) && right.isEquivalentTo(other.right())) || - (left.isEquivalentTo(other.right()) && right.isEquivalentTo(other.left())); + return (left.isEquivalentTo(other.left()) && right.isEquivalentTo(other.right())) + || (left.isEquivalentTo(other.right()) && right.isEquivalentTo(other.left())); } return false; diff --git a/api/src/main/java/org/apache/iceberg/expressions/Binder.java b/api/src/main/java/org/apache/iceberg/expressions/Binder.java index 1ffa95f4c4ee..d2a7b1d09e0b 100644 --- a/api/src/main/java/org/apache/iceberg/expressions/Binder.java +++ b/api/src/main/java/org/apache/iceberg/expressions/Binder.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.expressions; import java.util.List; @@ -34,32 +33,30 @@ * fields in a struct schema. */ public class Binder { - private Binder() { - } + private Binder() {} /** * Replaces all unbound/named references with bound references to fields in the given struct. - *

- * When a reference is resolved, any literal used in a predicate for that field is converted to + * + *

When a reference is resolved, any literal used in a predicate for that field is converted to * the field's type using {@link Literal#to(Type)}. If automatic conversion to that type isn't * allowed, a {@link ValidationException validation exception} is thrown. - *

- * The result expression may be simplified when constructed. For example, {@code isNull("a")} is - * replaced with {@code alwaysFalse()} when {@code "a"} is resolved to a required field. - *

- * The expression cannot contain references that are already bound, or an - * {@link IllegalStateException} will be thrown. + * + *

The result expression may be simplified when constructed. For example, {@code isNull("a")} + * is replaced with {@code alwaysFalse()} when {@code "a"} is resolved to a required field. + * + *

The expression cannot contain references that are already bound, or an {@link + * IllegalStateException} will be thrown. * * @param struct The {@link StructType struct type} to resolve references by name. * @param expr An {@link Expression expression} to rewrite with bound references. - * @param caseSensitive A boolean flag to control whether the bind should enforce case sensitivity. + * @param caseSensitive A boolean flag to control whether the bind should enforce case + * sensitivity. * @return the expression rewritten with bound references * @throws ValidationException if literals do not match bound references * @throws IllegalStateException if any references are already bound */ - public static Expression bind(StructType struct, - Expression expr, - boolean caseSensitive) { + public static Expression bind(StructType struct, Expression expr, boolean caseSensitive) { return ExpressionVisitors.visit(expr, new BindVisitor(struct, caseSensitive)); } @@ -67,31 +64,29 @@ public static Expression bind(StructType struct, * Replaces all unbound/named references with bound references to fields in the given struct, * defaulting to case sensitive mode. * - * Access modifier is package-private, to only allow use from existing tests. + *
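To make the binding behaviour described in this Javadoc concrete, here is a minimal sketch against an invented two-field struct; it shows the literal conversion (an int 34 bound to a long column) and the isNull-on-required-field simplification mentioned above:

import org.apache.iceberg.expressions.Binder;
import org.apache.iceberg.expressions.Expression;
import org.apache.iceberg.expressions.Expressions;
import org.apache.iceberg.types.Types;

public class BinderSketch {
  public static void main(String[] args) {
    Types.StructType struct =
        Types.StructType.of(
            Types.NestedField.required(1, "id", Types.LongType.get()),
            Types.NestedField.optional(2, "data", Types.StringType.get()));

    // the int literal 34 is converted to the long column's type while binding
    Expression bound = Binder.bind(struct, Expressions.equal("id", 34), true);
    System.out.println(bound);

    // isNull on a required field simplifies to alwaysFalse() during binding
    System.out.println(Binder.bind(struct, Expressions.isNull("id"), true));
  }
}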

Access modifier is package-private, to only allow use from existing tests. * - *

- * When a reference is resolved, any literal used in a predicate for that field is converted to + *

When a reference is resolved, any literal used in a predicate for that field is converted to * the field's type using {@link Literal#to(Type)}. If automatic conversion to that type isn't * allowed, a {@link ValidationException validation exception} is thrown. - *

- * The result expression may be simplified when constructed. For example, {@code isNull("a")} is - * replaced with {@code alwaysFalse()} when {@code "a"} is resolved to a required field. - *

- * The expression cannot contain references that are already bound, or an - * {@link IllegalStateException} will be thrown. + * + *

The result expression may be simplified when constructed. For example, {@code isNull("a")} + * is replaced with {@code alwaysFalse()} when {@code "a"} is resolved to a required field. + * + *

The expression cannot contain references that are already bound, or an {@link + * IllegalStateException} will be thrown. * * @param struct The {@link StructType struct type} to resolve references by name. * @param expr An {@link Expression expression} to rewrite with bound references. * @return the expression rewritten with bound references - * * @throws IllegalStateException if any references are already bound */ - static Expression bind(StructType struct, - Expression expr) { + static Expression bind(StructType struct, Expression expr) { return Binder.bind(struct, expr, true); } - public static Set boundReferences(StructType struct, List exprs, boolean caseSensitive) { + public static Set boundReferences( + StructType struct, List exprs, boolean caseSensitive) { if (exprs == null) { return ImmutableSet.of(); } @@ -108,8 +103,8 @@ public static Set boundReferences(StructType struct, List e /** * Returns whether an expression is bound. - *

- * An expression is bound if all of its predicates are bound. + * + *

An expression is bound if all of its predicates are bound. * * @param expr an {@link Expression} * @return true if the expression is bound @@ -228,7 +223,8 @@ public Boolean predicate(UnboundPredicate pred) { private Boolean combineResults(Boolean isLeftBound, Boolean isRightBound) { if (isLeftBound != null) { - Preconditions.checkArgument(isRightBound == null || isLeftBound.equals(isRightBound), + Preconditions.checkArgument( + isRightBound == null || isLeftBound.equals(isRightBound), "Found partially bound expression"); return isLeftBound; } else { diff --git a/api/src/main/java/org/apache/iceberg/expressions/Bound.java b/api/src/main/java/org/apache/iceberg/expressions/Bound.java index 6a753a1f7872..e2434fbf5a79 100644 --- a/api/src/main/java/org/apache/iceberg/expressions/Bound.java +++ b/api/src/main/java/org/apache/iceberg/expressions/Bound.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.expressions; import org.apache.iceberg.StructLike; @@ -27,9 +26,7 @@ * @param the Java type of values produced by this expression */ public interface Bound { - /** - * Returns the underlying reference. - */ + /** Returns the underlying reference. */ BoundReference ref(); /** diff --git a/api/src/main/java/org/apache/iceberg/expressions/BoundLiteralPredicate.java b/api/src/main/java/org/apache/iceberg/expressions/BoundLiteralPredicate.java index 3c89895f33d0..02dc31c6a6c5 100644 --- a/api/src/main/java/org/apache/iceberg/expressions/BoundLiteralPredicate.java +++ b/api/src/main/java/org/apache/iceberg/expressions/BoundLiteralPredicate.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.expressions; import java.util.Comparator; @@ -26,8 +25,13 @@ import org.apache.iceberg.types.Type; public class BoundLiteralPredicate extends BoundPredicate { - private static final Set INTEGRAL_TYPES = Sets.newHashSet( - Type.TypeID.INTEGER, Type.TypeID.LONG, Type.TypeID.DATE, Type.TypeID.TIME, Type.TypeID.TIMESTAMP); + private static final Set INTEGRAL_TYPES = + Sets.newHashSet( + Type.TypeID.INTEGER, + Type.TypeID.LONG, + Type.TypeID.DATE, + Type.TypeID.TIME, + Type.TypeID.TIMESTAMP); private static long toLong(Literal lit) { return ((Number) lit.value()).longValue(); @@ -37,8 +41,10 @@ private static long toLong(Literal lit) { BoundLiteralPredicate(Operation op, BoundTerm term, Literal lit) { super(op, term); - Preconditions.checkArgument(op != Operation.IN && op != Operation.NOT_IN, - "Bound literal predicate does not support operation: %s", op); + Preconditions.checkArgument( + op != Operation.IN && op != Operation.NOT_IN, + "Bound literal predicate does not support operation: %s", + op); this.literal = lit; } diff --git a/api/src/main/java/org/apache/iceberg/expressions/BoundPredicate.java b/api/src/main/java/org/apache/iceberg/expressions/BoundPredicate.java index fde4b1e0f069..95e1aeaa2592 100644 --- a/api/src/main/java/org/apache/iceberg/expressions/BoundPredicate.java +++ b/api/src/main/java/org/apache/iceberg/expressions/BoundPredicate.java @@ -16,12 +16,12 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.expressions; import org.apache.iceberg.StructLike; -public abstract class BoundPredicate extends Predicate> implements Bound { +public abstract class BoundPredicate extends Predicate> + implements Bound { protected BoundPredicate(Operation op, BoundTerm term) { super(op, term); } diff --git a/api/src/main/java/org/apache/iceberg/expressions/BoundReference.java b/api/src/main/java/org/apache/iceberg/expressions/BoundReference.java index c2be9b653486..26410e03450f 100644 --- a/api/src/main/java/org/apache/iceberg/expressions/BoundReference.java +++ b/api/src/main/java/org/apache/iceberg/expressions/BoundReference.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.expressions; import org.apache.iceberg.Accessor; @@ -58,9 +57,9 @@ public boolean isEquivalentTo(BoundTerm other) { if (other instanceof BoundReference) { Types.NestedField otherField = ((BoundReference) other).field(); // equivalence only depends on the field ID, type, and optional. name and accessor are ignored - return field.fieldId() == otherField.fieldId() && - field.type().equals(otherField.type()) && - field.isOptional() == otherField.isOptional(); + return field.fieldId() == otherField.fieldId() + && field.type().equals(otherField.type()) + && field.isOptional() == otherField.isOptional(); } return other.isEquivalentTo(this); diff --git a/api/src/main/java/org/apache/iceberg/expressions/BoundSetPredicate.java b/api/src/main/java/org/apache/iceberg/expressions/BoundSetPredicate.java index 3c1ad28936dd..025b723a21cf 100644 --- a/api/src/main/java/org/apache/iceberg/expressions/BoundSetPredicate.java +++ b/api/src/main/java/org/apache/iceberg/expressions/BoundSetPredicate.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.expressions; import java.util.Set; @@ -29,8 +28,10 @@ public class BoundSetPredicate extends BoundPredicate { BoundSetPredicate(Operation op, BoundTerm term, Set lits) { super(op, term); - Preconditions.checkArgument(op == Operation.IN || op == Operation.NOT_IN, - "%s predicate does not support a literal set", op); + Preconditions.checkArgument( + op == Operation.IN || op == Operation.NOT_IN, + "%s predicate does not support a literal set", + op); this.literalSet = lits; } @@ -67,7 +68,8 @@ public boolean test(T value) { @Override public boolean isEquivalentTo(Expression other) { - // only check bound set predicate; binding will convert sets of a single item to a literal predicate + // only check bound set predicate; binding will convert sets of a single item to a literal + // predicate if (op() == other.op()) { BoundSetPredicate pred = (BoundSetPredicate) other; return literalSet().equals(pred.literalSet()); diff --git a/api/src/main/java/org/apache/iceberg/expressions/BoundTerm.java b/api/src/main/java/org/apache/iceberg/expressions/BoundTerm.java index 3691bbf5a86c..5ded3c903d19 100644 --- a/api/src/main/java/org/apache/iceberg/expressions/BoundTerm.java +++ b/api/src/main/java/org/apache/iceberg/expressions/BoundTerm.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.expressions; import java.util.Comparator; @@ -29,20 +28,17 @@ * @param the Java type of values produced by this term */ public interface BoundTerm extends Bound, Term { - /** - * Returns the type produced by this expression. 
- */ + /** Returns the type produced by this expression. */ Type type(); - /** - * Returns a {@link Comparator} for values produced by this term. - */ + /** Returns a {@link Comparator} for values produced by this term. */ default Comparator comparator() { return Comparators.forType(type().asPrimitiveType()); } /** * Returns whether this term is equivalent to another. + * * @param other a term * @return true if this term returns the same values as the other, false otherwise */ diff --git a/api/src/main/java/org/apache/iceberg/expressions/BoundTransform.java b/api/src/main/java/org/apache/iceberg/expressions/BoundTransform.java index 2e4e21c2b400..32f91018c604 100644 --- a/api/src/main/java/org/apache/iceberg/expressions/BoundTransform.java +++ b/api/src/main/java/org/apache/iceberg/expressions/BoundTransform.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.expressions; import org.apache.iceberg.StructLike; diff --git a/api/src/main/java/org/apache/iceberg/expressions/BoundUnaryPredicate.java b/api/src/main/java/org/apache/iceberg/expressions/BoundUnaryPredicate.java index c5f0c0d9fa22..fb7995a71877 100644 --- a/api/src/main/java/org/apache/iceberg/expressions/BoundUnaryPredicate.java +++ b/api/src/main/java/org/apache/iceberg/expressions/BoundUnaryPredicate.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.expressions; import org.apache.iceberg.util.NaNUtil; diff --git a/api/src/main/java/org/apache/iceberg/expressions/Evaluator.java b/api/src/main/java/org/apache/iceberg/expressions/Evaluator.java index abe3ff0db1d6..96e148a2d438 100644 --- a/api/src/main/java/org/apache/iceberg/expressions/Evaluator.java +++ b/api/src/main/java/org/apache/iceberg/expressions/Evaluator.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.expressions; import java.io.Serializable; @@ -29,10 +28,10 @@ /** * Evaluates an {@link Expression} for data described by a {@link StructType}. - *

- * Data rows must implement {@link StructLike} and are passed to {@link #eval(StructLike)}. - *

- * This class is thread-safe. + * + *

Data rows must implement {@link StructLike} and are passed to {@link #eval(StructLike)}. + * + *

This class is thread-safe. */ public class Evaluator implements Serializable { private final Expression expr; diff --git a/api/src/main/java/org/apache/iceberg/expressions/Expression.java b/api/src/main/java/org/apache/iceberg/expressions/Expression.java index 887385935e98..b013be939570 100644 --- a/api/src/main/java/org/apache/iceberg/expressions/Expression.java +++ b/api/src/main/java/org/apache/iceberg/expressions/Expression.java @@ -16,14 +16,11 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.expressions; import java.io.Serializable; -/** - * Represents a boolean expression tree. - */ +/** Represents a boolean expression tree. */ public interface Expression extends Serializable { enum Operation { TRUE, @@ -46,9 +43,7 @@ enum Operation { STARTS_WITH, NOT_STARTS_WITH; - /** - * Returns the operation used when this is negated. - */ + /** Returns the operation used when this is negated. */ public Operation negate() { switch (this) { case IS_NULL: @@ -84,9 +79,7 @@ public Operation negate() { } } - /** - * Returns the equivalent operation when the left and right operands are exchanged. - */ + /** Returns the equivalent operation when the left and right operands are exchanged. */ // Allow flipLR as a name because it's a public API @SuppressWarnings("checkstyle:AbbreviationAsWordInName") public Operation flipLR() { @@ -113,26 +106,22 @@ public Operation flipLR() { } } - /** - * Returns the operation for an expression node. - */ + /** Returns the operation for an expression node. */ Operation op(); - /** - * Returns the negation of this expression, equivalent to not(this). - */ + /** Returns the negation of this expression, equivalent to not(this). */ default Expression negate() { throw new UnsupportedOperationException(String.format("%s cannot be negated", this)); } /** * Returns whether this expression will accept the same values as another. - *
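A rough sketch of the Evaluator described a few hunks above: it binds the expression once and then tests StructLike rows. The SimpleRow class and the schema are invented for the example; real callers would use an existing row implementation:

import org.apache.iceberg.StructLike;
import org.apache.iceberg.expressions.Evaluator;
import org.apache.iceberg.expressions.Expressions;
import org.apache.iceberg.types.Types;

public class EvaluatorSketch {
  // minimal StructLike over an Object[], just for this sketch
  private static class SimpleRow implements StructLike {
    private final Object[] values;

    SimpleRow(Object... values) {
      this.values = values;
    }

    @Override
    public int size() {
      return values.length;
    }

    @Override
    public <T> T get(int pos, Class<T> javaClass) {
      return javaClass.cast(values[pos]);
    }

    @Override
    public <T> void set(int pos, T value) {
      values[pos] = value;
    }
  }

  public static void main(String[] args) {
    Types.StructType struct =
        Types.StructType.of(
            Types.NestedField.required(1, "id", Types.LongType.get()),
            Types.NestedField.optional(2, "category", Types.StringType.get()));

    // bound once in the constructor; eval(StructLike) can then be called repeatedly and concurrently
    Evaluator evaluator =
        new Evaluator(
            struct,
            Expressions.and(Expressions.greaterThan("id", 10L), Expressions.notNull("category")));

    System.out.println(evaluator.eval(new SimpleRow(42L, "books"))); // true
    System.out.println(evaluator.eval(new SimpleRow(5L, "books"))); // false
  }
}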

- * If this returns true, the expressions are guaranteed to return the same evaluation for the same input. However, if - * this returns false the expressions may return the same evaluation for the same input. That is, expressions may - * be equivalent even if this returns false. - *

- * For best results, rewrite not and bind expressions before calling this method. + * + *

If this returns true, the expressions are guaranteed to return the same evaluation for the + * same input. However, if this returns false the expressions may return the same evaluation for + * the same input. That is, expressions may be equivalent even if this returns false. + * + *

For best results, rewrite not and bind expressions before calling this method. * * @param other another expression * @return true if the expressions are equivalent diff --git a/api/src/main/java/org/apache/iceberg/expressions/ExpressionUtil.java b/api/src/main/java/org/apache/iceberg/expressions/ExpressionUtil.java index 2e9a0bb2f2e1..fe180a4f0506 100644 --- a/api/src/main/java/org/apache/iceberg/expressions/ExpressionUtil.java +++ b/api/src/main/java/org/apache/iceberg/expressions/ExpressionUtil.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.expressions; import java.util.regex.Pattern; @@ -26,26 +25,24 @@ import org.apache.iceberg.transforms.Transforms; import org.apache.iceberg.types.Types; -/** - * Expression utility methods. - */ +/** Expression utility methods. */ public class ExpressionUtil { - private static final Transform HASH_FUNC = Transforms - .bucket(Types.StringType.get(), Integer.MAX_VALUE); + private static final Transform HASH_FUNC = + Transforms.bucket(Types.StringType.get(), Integer.MAX_VALUE); private static final Pattern DATE = Pattern.compile("\\d\\d\\d\\d-\\d\\d-\\d\\d"); - private static final Pattern TIME = Pattern.compile( - "\\d\\d:\\d\\d(:\\d\\d(.\\d{1,6})?)?"); - private static final Pattern TIMESTAMP = Pattern.compile( - "\\d\\d\\d\\d-\\d\\d-\\d\\dT\\d\\d:\\d\\d(:\\d\\d(.\\d{1,6})?)?([-+]\\d\\d:\\d\\d)?"); + private static final Pattern TIME = Pattern.compile("\\d\\d:\\d\\d(:\\d\\d(.\\d{1,6})?)?"); + private static final Pattern TIMESTAMP = + Pattern.compile( + "\\d\\d\\d\\d-\\d\\d-\\d\\dT\\d\\d:\\d\\d(:\\d\\d(.\\d{1,6})?)?([-+]\\d\\d:\\d\\d)?"); - private ExpressionUtil() { - } + private ExpressionUtil() {} /** - * Produces an unbound {@link Expression} with the same structure, but with data values replaced by descriptions. - *

- * Numbers are replaced with magnitude and type, string-like values are replaced by hashes, and date/time values are - * replaced by the type. + * Produces an unbound {@link Expression} with the same structure, but with data values replaced + * by descriptions. + * + *

Numbers are replaced with magnitude and type, string-like values are replaced by hashes, and + * date/time values are replaced by the type. * * @param expr an Expression to sanitize * @return a sanitized Expression @@ -55,10 +52,11 @@ public static Expression sanitize(Expression expr) { } /** - * Produces a sanitized expression string with the same structure, but with data values replaced by descriptions. - *

- * Numbers are replaced with magnitude and type, string-like values are replaced by hashes, and date/time values are - * replaced by the type. + * Produces a sanitized expression string with the same structure, but with data values replaced + * by descriptions. + * + *

Numbers are replaced with magnitude and type, string-like values are replaced by hashes, and + * date/time values are replaced by the type. * * @param expr an Expression to sanitize * @return a sanitized expression string @@ -69,10 +67,10 @@ public static String toSanitizedString(Expression expr) { /** * Returns whether two unbound expressions will accept the same inputs. - *
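A short sketch of the sanitizers above, with invented column names and values; the exact placeholder text produced is illustrative only:

import org.apache.iceberg.expressions.Expression;
import org.apache.iceberg.expressions.Expressions;
import org.apache.iceberg.expressions.ExpressionUtil;

public class SanitizeSketch {
  public static void main(String[] args) {
    Expression filter =
        Expressions.and(
            Expressions.equal("ssn", "123-45-6789"),
            Expressions.greaterThanOrEqual("ts", "2021-03-09T10:00:00"));

    // string values come back as hashes and date/time values as type descriptions,
    // so the result is safe to put into logs and error messages
    System.out.println(ExpressionUtil.toSanitizedString(filter));
  }
}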

- * If this returns true, the expressions are guaranteed to return the same evaluation for the same input. However, if - * this returns false the expressions may return the same evaluation for the same input. That is, expressions may - * be equivalent even if this returns false. + * + *

If this returns true, the expressions are guaranteed to return the same evaluation for the + * same input. However, if this returns false the expressions may return the same evaluation for + * the same input. That is, expressions may be equivalent even if this returns false. * * @param left an unbound expression * @param right an unbound expression @@ -80,29 +78,33 @@ public static String toSanitizedString(Expression expr) { * @param caseSensitive whether to bind expressions using case-sensitive matching * @return true if the expressions are equivalent */ - public static boolean equivalent(Expression left, Expression right, Types.StructType struct, boolean caseSensitive) { + public static boolean equivalent( + Expression left, Expression right, Types.StructType struct, boolean caseSensitive) { return Binder.bind(struct, Expressions.rewriteNot(left), caseSensitive) .isEquivalentTo(Binder.bind(struct, Expressions.rewriteNot(right), caseSensitive)); } /** * Returns whether an expression selects whole partitions for a partition spec. - *
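A minimal sketch of the equivalent() helper defined just above; the single-field struct is invented, and the two filters differ only by a not() that rewriteNot removes before binding:

import org.apache.iceberg.expressions.Expressions;
import org.apache.iceberg.expressions.ExpressionUtil;
import org.apache.iceberg.types.Types;

public class EquivalentSketch {
  public static void main(String[] args) {
    Types.StructType struct =
        Types.StructType.of(Types.NestedField.required(1, "id", Types.IntegerType.get()));

    // not(id < 5) rewrites to id >= 5, so both sides bind to the same predicate
    boolean same =
        ExpressionUtil.equivalent(
            Expressions.not(Expressions.lessThan("id", 5)),
            Expressions.greaterThanOrEqual("id", 5),
            struct,
            true /* case sensitive */);
    System.out.println(same); // true
  }
}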

- * For example, ts < '2021-03-09T10:00:00.000' selects whole partitions in an hourly spec, [hours(ts)], but does - * not select whole partitions in a daily spec, [days(ts)]. + * + *

For example, ts < '2021-03-09T10:00:00.000' selects whole partitions in an hourly spec, + * [hours(ts)], but does not select whole partitions in a daily spec, [days(ts)]. * * @param expr an unbound expression * @param spec a partition spec * @return true if the expression will select whole partitions in the given spec */ - public static boolean selectsPartitions(Expression expr, PartitionSpec spec, boolean caseSensitive) { + public static boolean selectsPartitions( + Expression expr, PartitionSpec spec, boolean caseSensitive) { return equivalent( Projections.inclusive(spec, caseSensitive).project(expr), Projections.strict(spec, caseSensitive).project(expr), - spec.partitionType(), caseSensitive); + spec.partitionType(), + caseSensitive); } - private static class ExpressionSanitizer extends ExpressionVisitors.ExpressionVisitor { + private static class ExpressionSanitizer + extends ExpressionVisitors.ExpressionVisitor { private static final ExpressionSanitizer INSTANCE = new ExpressionSanitizer(); @Override @@ -156,10 +158,12 @@ public Expression predicate(UnboundPredicate pred) { return new UnboundPredicate<>(pred.op(), pred.term(), (T) sanitize(pred.literal())); case IN: case NOT_IN: - Iterable iter = () -> pred.literals().stream().map(ExpressionUtil::sanitize).iterator(); + Iterable iter = + () -> pred.literals().stream().map(ExpressionUtil::sanitize).iterator(); return new UnboundPredicate<>(pred.op(), pred.term(), (Iterable) iter); default: - throw new UnsupportedOperationException("Cannot sanitize unsupported predicate type: " + pred.op()); + throw new UnsupportedOperationException( + "Cannot sanitize unsupported predicate type: " + pred.op()); } } } @@ -232,19 +236,24 @@ public String predicate(UnboundPredicate pred) { case NOT_EQ: return term + " != " + sanitize(pred.literal()); case IN: - return term + " IN " + pred.literals().stream() - .map(ExpressionUtil::sanitize) - .collect(Collectors.joining(", ", "(", ")")); + return term + + " IN " + + pred.literals().stream() + .map(ExpressionUtil::sanitize) + .collect(Collectors.joining(", ", "(", ")")); case NOT_IN: - return term + " NOT IN " + pred.literals().stream() - .map(ExpressionUtil::sanitize) - .collect(Collectors.joining(", ", "(", ")")); + return term + + " NOT IN " + + pred.literals().stream() + .map(ExpressionUtil::sanitize) + .collect(Collectors.joining(", ", "(", ")")); case STARTS_WITH: return term + " STARTS WITH " + sanitize(pred.literal()); case NOT_STARTS_WITH: return term + " NOT STARTS WITH " + sanitize(pred.literal()); default: - throw new UnsupportedOperationException("Cannot sanitize unsupported predicate type: " + pred.op()); + throw new UnsupportedOperationException( + "Cannot sanitize unsupported predicate type: " + pred.op()); } } } diff --git a/api/src/main/java/org/apache/iceberg/expressions/ExpressionVisitors.java b/api/src/main/java/org/apache/iceberg/expressions/ExpressionVisitors.java index 389e25ba5d1b..ff76ab261818 100644 --- a/api/src/main/java/org/apache/iceberg/expressions/ExpressionVisitors.java +++ b/api/src/main/java/org/apache/iceberg/expressions/ExpressionVisitors.java @@ -16,19 +16,15 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.expressions; import java.util.Set; import org.apache.iceberg.exceptions.ValidationException; -/** - * Utils for traversing {@link Expression expressions}. - */ +/** Utils for traversing {@link Expression expressions}. 
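To illustrate selectsPartitions() from this hunk, a sketch that mirrors the hourly-versus-daily example in the Javadoc; the schema and specs are invented, and the timestamp column is kept timezone-less so the plain string literal parses:

import org.apache.iceberg.PartitionSpec;
import org.apache.iceberg.Schema;
import org.apache.iceberg.expressions.Expression;
import org.apache.iceberg.expressions.Expressions;
import org.apache.iceberg.expressions.ExpressionUtil;
import org.apache.iceberg.types.Types;

public class SelectsPartitionsSketch {
  public static void main(String[] args) {
    Schema schema =
        new Schema(Types.NestedField.required(1, "ts", Types.TimestampType.withoutZone()));

    PartitionSpec hourly = PartitionSpec.builderFor(schema).hour("ts").build();
    PartitionSpec daily = PartitionSpec.builderFor(schema).day("ts").build();

    Expression filter = Expressions.lessThan("ts", "2021-03-09T10:00:00");

    // per the Javadoc above: whole hour partitions are selected, whole day partitions are not
    System.out.println(ExpressionUtil.selectsPartitions(filter, hourly, true)); // true
    System.out.println(ExpressionUtil.selectsPartitions(filter, daily, true)); // false
  }
}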
*/ public class ExpressionVisitors { - private ExpressionVisitors() { - } + private ExpressionVisitors() {} public abstract static class ExpressionVisitor { public R alwaysTrue() { @@ -70,11 +66,13 @@ public R notNull(BoundReference ref) { } public R isNaN(BoundReference ref) { - throw new UnsupportedOperationException(this.getClass().getName() + " does not implement isNaN"); + throw new UnsupportedOperationException( + this.getClass().getName() + " does not implement isNaN"); } public R notNaN(BoundReference ref) { - throw new UnsupportedOperationException(this.getClass().getName() + " does not implement notNaN"); + throw new UnsupportedOperationException( + this.getClass().getName() + " does not implement notNaN"); } public R lt(BoundReference ref, Literal lit) { @@ -110,19 +108,21 @@ public R notIn(BoundReference ref, Set literalSet) { } public R startsWith(BoundReference ref, Literal lit) { - throw new UnsupportedOperationException("startsWith expression is not supported by the visitor"); + throw new UnsupportedOperationException( + "startsWith expression is not supported by the visitor"); } public R notStartsWith(BoundReference ref, Literal lit) { - throw new UnsupportedOperationException("notStartsWith expression is not supported by the visitor"); + throw new UnsupportedOperationException( + "notStartsWith expression is not supported by the visitor"); } /** * Handle a non-reference value in this visitor. - *

- * Visitors that require {@link BoundReference references} and not {@link Bound terms} can use this method to - * return a default value for expressions with non-references. The default implementation will throw a validation - * exception because the non-reference is not supported. + * + *

Visitors that require {@link BoundReference references} and not {@link Bound terms} can + * use this method to return a default value for expressions with non-references. The default + * implementation will throw a validation exception because the non-reference is not supported. * * @param term a non-reference bound expression * @param a Java return type @@ -154,11 +154,12 @@ public R predicate(BoundPredicate pred) { case NOT_EQ: return notEq((BoundReference) pred.term(), literalPred.literal()); case STARTS_WITH: - return startsWith((BoundReference) pred.term(), literalPred.literal()); + return startsWith((BoundReference) pred.term(), literalPred.literal()); case NOT_STARTS_WITH: - return notStartsWith((BoundReference) pred.term(), literalPred.literal()); + return notStartsWith((BoundReference) pred.term(), literalPred.literal()); default: - throw new IllegalStateException("Invalid operation for BoundLiteralPredicate: " + pred.op()); + throw new IllegalStateException( + "Invalid operation for BoundLiteralPredicate: " + pred.op()); } } else if (pred.isUnaryPredicate()) { @@ -172,7 +173,8 @@ public R predicate(BoundPredicate pred) { case NOT_NAN: return notNaN((BoundReference) pred.term()); default: - throw new IllegalStateException("Invalid operation for BoundUnaryPredicate: " + pred.op()); + throw new IllegalStateException( + "Invalid operation for BoundUnaryPredicate: " + pred.op()); } } else if (pred.isSetPredicate()) { @@ -182,7 +184,8 @@ public R predicate(BoundPredicate pred) { case NOT_IN: return notIn((BoundReference) pred.term(), pred.asSetPredicate().literalSet()); default: - throw new IllegalStateException("Invalid operation for BoundSetPredicate: " + pred.op()); + throw new IllegalStateException( + "Invalid operation for BoundSetPredicate: " + pred.op()); } } @@ -205,11 +208,13 @@ public R notNull(Bound expr) { } public R isNaN(Bound expr) { - throw new UnsupportedOperationException(this.getClass().getName() + " does not implement isNaN"); + throw new UnsupportedOperationException( + this.getClass().getName() + " does not implement isNaN"); } public R notNaN(Bound expr) { - throw new UnsupportedOperationException(this.getClass().getName() + " does not implement notNaN"); + throw new UnsupportedOperationException( + this.getClass().getName() + " does not implement notNaN"); } public R lt(Bound expr, Literal lit) { @@ -270,11 +275,12 @@ public R predicate(BoundPredicate pred) { case NOT_EQ: return notEq(pred.term(), literalPred.literal()); case STARTS_WITH: - return startsWith(pred.term(), literalPred.literal()); + return startsWith(pred.term(), literalPred.literal()); case NOT_STARTS_WITH: return notStartsWith(pred.term(), literalPred.literal()); default: - throw new IllegalStateException("Invalid operation for BoundLiteralPredicate: " + pred.op()); + throw new IllegalStateException( + "Invalid operation for BoundLiteralPredicate: " + pred.op()); } } else if (pred.isUnaryPredicate()) { @@ -288,7 +294,8 @@ public R predicate(BoundPredicate pred) { case NOT_NAN: return notNaN(pred.term()); default: - throw new IllegalStateException("Invalid operation for BoundUnaryPredicate: " + pred.op()); + throw new IllegalStateException( + "Invalid operation for BoundUnaryPredicate: " + pred.op()); } } else if (pred.isSetPredicate()) { @@ -298,7 +305,8 @@ public R predicate(BoundPredicate pred) { case NOT_IN: return notIn(pred.term(), pred.asSetPredicate().literalSet()); default: - throw new IllegalStateException("Invalid operation for BoundSetPredicate: " + pred.op()); + throw new 
IllegalStateException( + "Invalid operation for BoundSetPredicate: " + pred.op()); } } @@ -313,9 +321,9 @@ public R predicate(UnboundPredicate pred) { /** * Traverses the given {@link Expression expression} with a {@link ExpressionVisitor visitor}. - *

- * The visitor will be called to handle each node in the expression tree in postfix order. Result - * values produced by child nodes are passed when parent nodes are handled. + * + *

The visitor will be called to handle each node in the expression tree in postfix order. + * Result values produced by child nodes are passed when parent nodes are handled. * * @param expr an expression to traverse * @param visitor a visitor that will be called to handle each node in the expression tree @@ -345,18 +353,17 @@ public static R visit(Expression expr, ExpressionVisitor visitor) { Or or = (Or) expr; return visitor.or(visit(or.left(), visitor), visit(or.right(), visitor)); default: - throw new UnsupportedOperationException( - "Unknown operation: " + expr.op()); + throw new UnsupportedOperationException("Unknown operation: " + expr.op()); } } } /** * Traverses the given {@link Expression expression} with a {@link ExpressionVisitor visitor}. - *

- * The visitor will be called to handle only nodes required for determining result - * in the expression tree in postfix order. Result values produced by child nodes - * are passed when parent nodes are handled. + * + *

The visitor will be called to handle only nodes required for determining result in the + * expression tree in postfix order. Result values produced by child nodes are passed when parent + * nodes are handled. * * @param expr an expression to traverse * @param visitor a visitor that will be called to handle each node in the expression tree @@ -393,8 +400,7 @@ public static Boolean visitEvaluator(Expression expr, ExpressionVisitor } return visitor.or(Boolean.FALSE, visitEvaluator(or.right(), visitor)); default: - throw new UnsupportedOperationException( - "Unknown operation: " + expr.op()); + throw new UnsupportedOperationException("Unknown operation: " + expr.op()); } } } diff --git a/api/src/main/java/org/apache/iceberg/expressions/Expressions.java b/api/src/main/java/org/apache/iceberg/expressions/Expressions.java index a0690f1e9433..35938a8adfe5 100644 --- a/api/src/main/java/org/apache/iceberg/expressions/Expressions.java +++ b/api/src/main/java/org/apache/iceberg/expressions/Expressions.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.expressions; import java.util.stream.Stream; @@ -27,12 +26,9 @@ import org.apache.iceberg.transforms.Transforms; import org.apache.iceberg.types.Types; -/** - * Factory methods for creating {@link Expression expressions}. - */ +/** Factory methods for creating {@link Expression expressions}. */ public class Expressions { - private Expressions() { - } + private Expressions() {} public static Expression and(Expression left, Expression right) { Preconditions.checkNotNull(left, "Left expression cannot be null."); @@ -48,8 +44,7 @@ public static Expression and(Expression left, Expression right) { } public static Expression and(Expression left, Expression right, Expression... 
expressions) { - return Stream.of(expressions) - .reduce(and(left, right), Expressions::and); + return Stream.of(expressions).reduce(and(left, right), Expressions::and); } public static Expression or(Expression left, Expression right) { @@ -79,28 +74,33 @@ public static Expression not(Expression child) { @SuppressWarnings("unchecked") public static UnboundTerm bucket(String name, int numBuckets) { - Transform transform = (Transform) Transforms.bucket(Types.StringType.get(), numBuckets); + Transform transform = + (Transform) Transforms.bucket(Types.StringType.get(), numBuckets); return new UnboundTransform<>(ref(name), transform); } @SuppressWarnings("unchecked") public static UnboundTerm year(String name) { - return new UnboundTransform<>(ref(name), (Transform) Transforms.year(Types.TimestampType.withZone())); + return new UnboundTransform<>( + ref(name), (Transform) Transforms.year(Types.TimestampType.withZone())); } @SuppressWarnings("unchecked") public static UnboundTerm month(String name) { - return new UnboundTransform<>(ref(name), (Transform) Transforms.month(Types.TimestampType.withZone())); + return new UnboundTransform<>( + ref(name), (Transform) Transforms.month(Types.TimestampType.withZone())); } @SuppressWarnings("unchecked") public static UnboundTerm day(String name) { - return new UnboundTransform<>(ref(name), (Transform) Transforms.day(Types.TimestampType.withZone())); + return new UnboundTransform<>( + ref(name), (Transform) Transforms.day(Types.TimestampType.withZone())); } @SuppressWarnings("unchecked") public static UnboundTerm hour(String name) { - return new UnboundTransform<>(ref(name), (Transform) Transforms.hour(Types.TimestampType.withZone())); + return new UnboundTransform<>( + ref(name), (Transform) Transforms.hour(Types.TimestampType.withZone())); } public static UnboundTerm truncate(String name, int width) { @@ -245,8 +245,12 @@ public static UnboundPredicate predicate(Operation op, String name, T val public static UnboundPredicate predicate(Operation op, String name, Literal lit) { Preconditions.checkArgument( - op != Operation.IS_NULL && op != Operation.NOT_NULL && op != Operation.IS_NAN && op != Operation.NOT_NAN, - "Cannot create %s predicate inclusive a value", op); + op != Operation.IS_NULL + && op != Operation.NOT_NULL + && op != Operation.IS_NAN + && op != Operation.NOT_NAN, + "Cannot create %s predicate inclusive a value", + op); return new UnboundPredicate(op, ref(name), lit); } @@ -256,12 +260,17 @@ public static UnboundPredicate predicate(Operation op, String name, Itera public static UnboundPredicate predicate(Operation op, String name) { Preconditions.checkArgument( - op == Operation.IS_NULL || op == Operation.NOT_NULL || op == Operation.IS_NAN || op == Operation.NOT_NAN, - "Cannot create %s predicate without a value", op); + op == Operation.IS_NULL + || op == Operation.NOT_NULL + || op == Operation.IS_NAN + || op == Operation.NOT_NAN, + "Cannot create %s predicate without a value", + op); return new UnboundPredicate<>(op, ref(name)); } - public static UnboundPredicate predicate(Operation op, UnboundTerm expr, Iterable values) { + public static UnboundPredicate predicate( + Operation op, UnboundTerm expr, Iterable values) { return new UnboundPredicate<>(op, expr, values); } @@ -279,8 +288,8 @@ public static Expression rewriteNot(Expression expr) { /** * Constructs a reference for a given column. - *

- * The following are equivalent: equals("a", 5) and equals(ref("a"), 5). + * + *

The following are equivalent: equals("a", 5) and equals(ref("a"), 5). * * @param name a column name * @param the Java type of this reference diff --git a/api/src/main/java/org/apache/iceberg/expressions/False.java b/api/src/main/java/org/apache/iceberg/expressions/False.java index 21d81cd31890..54bf8900424b 100644 --- a/api/src/main/java/org/apache/iceberg/expressions/False.java +++ b/api/src/main/java/org/apache/iceberg/expressions/False.java @@ -16,19 +16,15 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.expressions; import java.io.ObjectStreamException; -/** - * An {@link Expression expression} that is always false. - */ +/** An {@link Expression expression} that is always false. */ public class False implements Expression { static final False INSTANCE = new False(); - private False() { - } + private False() {} @Override public Operation op() { diff --git a/api/src/main/java/org/apache/iceberg/expressions/InclusiveMetricsEvaluator.java b/api/src/main/java/org/apache/iceberg/expressions/InclusiveMetricsEvaluator.java index 7bfa91fcc4df..172b6a727ddc 100644 --- a/api/src/main/java/org/apache/iceberg/expressions/InclusiveMetricsEvaluator.java +++ b/api/src/main/java/org/apache/iceberg/expressions/InclusiveMetricsEvaluator.java @@ -16,9 +16,10 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.expressions; +import static org.apache.iceberg.expressions.Expressions.rewriteNot; + import java.nio.ByteBuffer; import java.util.Collection; import java.util.Comparator; @@ -35,21 +36,20 @@ import org.apache.iceberg.util.BinaryUtil; import org.apache.iceberg.util.NaNUtil; -import static org.apache.iceberg.expressions.Expressions.rewriteNot; - /** * Evaluates an {@link Expression} on a {@link DataFile} to test whether rows in the file may match. - *
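A quick illustration of the factory methods in Expressions.java above, including the ref() equivalence noted in the Javadoc and a transformed term. The column names are invented, and the term-accepting overloads are assumed to infer the value types as shown:

import org.apache.iceberg.expressions.Expression;
import org.apache.iceberg.expressions.Expressions;

public class ExpressionsFactorySketch {
  public static void main(String[] args) {
    // building the predicate by column name or via ref(...) is equivalent, per the Javadoc above
    Expression byName = Expressions.equal("a", 5);
    Expression byRef = Expressions.equal(Expressions.ref("a"), 5);

    // predicates can also target transformed terms, e.g. a 16-way bucket of "id"
    Expression onBucket = Expressions.equal(Expressions.bucket("id", 16), 0);

    System.out.println(Expressions.and(byName, Expressions.or(byRef, onBucket)));
  }
}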

- * This evaluation is inclusive: it returns true if a file may match and false if it cannot match. - *

- * Files are passed to {@link #eval(ContentFile)}, which returns true if the file may contain matching - * rows and false if the file cannot contain matching rows. Files may be skipped if and only if the - * return value of {@code eval} is false. - *

- * Due to the comparison implementation of ORC stats, for float/double columns in ORC files, if the first - * value in a file is NaN, metrics of this file will report NaN for both upper and lower bound despite - * that the column could contain non-NaN data. Thus in some scenarios explicitly checks for NaN is necessary - * in order to not skip files that may contain matching data. + * + *

This evaluation is inclusive: it returns true if a file may match and false if it cannot + * match. + * + *

Files are passed to {@link #eval(ContentFile)}, which returns true if the file may contain + * matching rows and false if the file cannot contain matching rows. Files may be skipped if and + * only if the return value of {@code eval} is false. + * + *
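A sketch of how the evaluator above is typically applied to prune files during scan planning; the schema, filter, and (empty) file list are placeholders, and the two-argument constructor is assumed:

import java.util.Collections;
import java.util.List;
import java.util.stream.Collectors;
import org.apache.iceberg.DataFile;
import org.apache.iceberg.Schema;
import org.apache.iceberg.expressions.Expressions;
import org.apache.iceberg.expressions.InclusiveMetricsEvaluator;
import org.apache.iceberg.types.Types;

public class MetricsPruningSketch {
  // keeps only the files whose column metrics say they may contain matching rows
  static List<DataFile> prune(Schema schema, List<DataFile> files) {
    InclusiveMetricsEvaluator metricsEval =
        new InclusiveMetricsEvaluator(schema, Expressions.greaterThan("id", 100L));
    return files.stream().filter(metricsEval::eval).collect(Collectors.toList());
  }

  public static void main(String[] args) {
    Schema schema = new Schema(Types.NestedField.required(1, "id", Types.LongType.get()));
    // data files normally come from manifest reads; an empty list keeps the sketch runnable
    System.out.println(prune(schema, Collections.emptyList()).size()); // 0
  }
}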

Due to the comparison implementation of ORC stats, for float/double columns in ORC files, if + * the first value in a file is NaN, metrics of this file will report NaN for both upper and lower + * bound despite that the column could contain non-NaN data. Thus in some scenarios explicitly + * checks for NaN is necessary in order to not skip files that may contain matching data. */ public class InclusiveMetricsEvaluator { private static final int IN_PREDICATE_LIMIT = 200; @@ -109,10 +109,14 @@ private boolean eval(ContentFile file) { @Override public Boolean handleNonReference(Bound term) { - // If the term in any expression is not a direct reference, assume that rows may match. This happens when - // transforms or other expressions are passed to this evaluator. For example, bucket16(x) = 0 can't be determined - // because this visitor operates on data metrics and not partition values. It may be possible to un-transform - // expressions for order preserving transforms in the future, but this is not currently supported. + // If the term in any expression is not a direct reference, assume that rows may match. This + // happens when + // transforms or other expressions are passed to this evaluator. For example, bucket16(x) = 0 + // can't be determined + // because this visitor operates on data metrics and not partition values. It may be possible + // to un-transform + // expressions for order preserving transforms in the future, but this is not currently + // supported. return ROWS_MIGHT_MATCH; } @@ -349,7 +353,10 @@ public Boolean in(BoundReference ref, Set literalSet) { return ROWS_MIGHT_MATCH; } - literals = literals.stream().filter(v -> ref.comparator().compare(lower, v) <= 0).collect(Collectors.toList()); + literals = + literals.stream() + .filter(v -> ref.comparator().compare(lower, v) <= 0) + .collect(Collectors.toList()); if (literals.isEmpty()) { // if all values are less than lower bound, rows cannot match. return ROWS_CANNOT_MATCH; } @@ -357,8 +364,13 @@ public Boolean in(BoundReference ref, Set literalSet) { if (upperBounds != null && upperBounds.containsKey(id)) { T upper = Conversions.fromByteBuffer(ref.type(), upperBounds.get(id)); - literals = literals.stream().filter(v -> ref.comparator().compare(upper, v) >= 0).collect(Collectors.toList()); - if (literals.isEmpty()) { // if all remaining values are greater than upper bound, rows cannot match. + literals = + literals.stream() + .filter(v -> ref.comparator().compare(upper, v) >= 0) + .collect(Collectors.toList()); + if (literals + .isEmpty()) { // if all remaining values are greater than upper bound, rows cannot + // match. return ROWS_CANNOT_MATCH; } } @@ -420,17 +432,22 @@ public Boolean notStartsWith(BoundReference ref, Literal lit) { Comparator comparator = Comparators.unsignedBytes(); - // notStartsWith will match unless all values must start with the prefix. This happens when the lower and upper + // notStartsWith will match unless all values must start with the prefix. This happens when + // the lower and upper // bounds both start with the prefix. 
- if (lowerBounds != null && upperBounds != null && - lowerBounds.containsKey(id) && upperBounds.containsKey(id)) { + if (lowerBounds != null + && upperBounds != null + && lowerBounds.containsKey(id) + && upperBounds.containsKey(id)) { ByteBuffer lower = lowerBounds.get(id); // if lower is shorter than the prefix then lower doesn't start with the prefix if (lower.remaining() < prefixAsBytes.remaining()) { return ROWS_MIGHT_MATCH; } - int cmp = comparator.compare(BinaryUtil.truncateBinary(lower, prefixAsBytes.remaining()), prefixAsBytes); + int cmp = + comparator.compare( + BinaryUtil.truncateBinary(lower, prefixAsBytes.remaining()), prefixAsBytes); if (cmp == 0) { ByteBuffer upper = upperBounds.get(id); // if upper is shorter than the prefix then upper can't start with the prefix @@ -438,9 +455,12 @@ public Boolean notStartsWith(BoundReference ref, Literal lit) { return ROWS_MIGHT_MATCH; } - cmp = comparator.compare(BinaryUtil.truncateBinary(upper, prefixAsBytes.remaining()), prefixAsBytes); + cmp = + comparator.compare( + BinaryUtil.truncateBinary(upper, prefixAsBytes.remaining()), prefixAsBytes); if (cmp == 0) { - // both bounds match the prefix, so all rows must match the prefix and therefore do not satisfy + // both bounds match the prefix, so all rows must match the prefix and therefore do not + // satisfy // the predicate return ROWS_CANNOT_MATCH; } @@ -455,14 +475,18 @@ private boolean mayContainNull(Integer id) { } private boolean containsNullsOnly(Integer id) { - return valueCounts != null && valueCounts.containsKey(id) && - nullCounts != null && nullCounts.containsKey(id) && - valueCounts.get(id) - nullCounts.get(id) == 0; + return valueCounts != null + && valueCounts.containsKey(id) + && nullCounts != null + && nullCounts.containsKey(id) + && valueCounts.get(id) - nullCounts.get(id) == 0; } private boolean containsNaNsOnly(Integer id) { - return nanCounts != null && nanCounts.containsKey(id) && - valueCounts != null && nanCounts.get(id).equals(valueCounts.get(id)); + return nanCounts != null + && nanCounts.containsKey(id) + && valueCounts != null + && nanCounts.get(id).equals(valueCounts.get(id)); } } } diff --git a/api/src/main/java/org/apache/iceberg/expressions/Literal.java b/api/src/main/java/org/apache/iceberg/expressions/Literal.java index 282351e1496e..b5d6f72f74d0 100644 --- a/api/src/main/java/org/apache/iceberg/expressions/Literal.java +++ b/api/src/main/java/org/apache/iceberg/expressions/Literal.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.expressions; import java.io.Serializable; @@ -28,6 +27,7 @@ /** * Represents a literal fixed value in an expression predicate + * * @param The Java type of the value wrapped by a {@link Literal} */ public interface Literal extends Serializable { @@ -71,25 +71,24 @@ static Literal of(BigDecimal value) { return new Literals.DecimalLiteral(value); } - /** - * Returns the value wrapped by this literal. - */ + /** Returns the value wrapped by this literal. */ T value(); /** * Converts this literal to a literal of the given type. - *

- * When a predicate is bound to a concrete data column, literals are converted to match the bound - * column's type. This conversion process is more narrow than a cast and is only intended for - * cases where substituting one type is a common mistake (e.g. 34 instead of 34L) or where this - * API avoids requiring a concrete class (e.g., dates). - *

- * If conversion to a target type is not supported, this method returns null. - *

- * This method may return {@link Literals#aboveMax} or {@link Literals#belowMin} when the target - * type is not as wide as the original type. These values indicate that the containing predicate - * can be simplified. For example, Integer.MAX_VALUE+1 converted to an int will result in - * {@code aboveMax} and can simplify a < Integer.MAX_VALUE+1 to {@link Expressions#alwaysTrue} + * + *

When a predicate is bound to a concrete data column, literals are converted to match the + * bound column's type. This conversion process is more narrow than a cast and is only intended + * for cases where substituting one type is a common mistake (e.g. 34 instead of 34L) or where + * this API avoids requiring a concrete class (e.g., dates). + * + *

If conversion to a target type is not supported, this method returns null. + * + *
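A small sketch of the conversion rules in this Javadoc; the values are arbitrary:

import org.apache.iceberg.expressions.Literal;
import org.apache.iceberg.types.Types;

public class LiteralConversionSketch {
  public static void main(String[] args) {
    // an int literal widens to a long literal (the 34 vs. 34L case mentioned above)
    Literal<Long> asLong = Literal.of(34).to(Types.LongType.get());
    System.out.println(asLong.value()); // 34

    // a string literal converts to a date literal holding days from 1970-01-01
    System.out.println(Literal.of("2022-01-01").to(Types.DateType.get()).value());

    // unsupported conversions return null instead of throwing
    System.out.println(Literal.of(34).to(Types.StringType.get())); // null
  }
}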

This method may return {@link Literals#aboveMax} or {@link Literals#belowMin} when the + * target type is not as wide as the original type. These values indicate that the containing + * predicate can be simplified. For example, Integer.MAX_VALUE+1 converted to an int will result + * in {@code aboveMax} and can simplify a < Integer.MAX_VALUE+1 to {@link + * Expressions#alwaysTrue} * * @param type A primitive {@link Type} * @param The Java type of value the new literal contains @@ -99,6 +98,7 @@ static Literal of(BigDecimal value) { /** * Return a {@link Comparator} for values. + * * @return a comparator for T objects */ Comparator comparator(); diff --git a/api/src/main/java/org/apache/iceberg/expressions/Literals.java b/api/src/main/java/org/apache/iceberg/expressions/Literals.java index 483c2a1a7a79..79d7190c49df 100644 --- a/api/src/main/java/org/apache/iceberg/expressions/Literals.java +++ b/api/src/main/java/org/apache/iceberg/expressions/Literals.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.expressions; import java.io.ObjectStreamException; @@ -44,8 +43,7 @@ import org.apache.iceberg.util.NaNUtil; class Literals { - private Literals() { - } + private Literals() {} private static final OffsetDateTime EPOCH = Instant.ofEpochSecond(0).atOffset(ZoneOffset.UTC); private static final LocalDate EPOCH_DAY = EPOCH.toLocalDate(); @@ -84,8 +82,9 @@ static Literal from(T value) { return (Literal) new Literals.DecimalLiteral((BigDecimal) value); } - throw new IllegalArgumentException(String.format( - "Cannot create expression literal from %s: %s", value.getClass().getName(), value)); + throw new IllegalArgumentException( + String.format( + "Cannot create expression literal from %s: %s", value.getClass().getName(), value)); } @SuppressWarnings("unchecked") @@ -149,7 +148,6 @@ public boolean equals(Object other) { public int hashCode() { return Objects.hashCode(value); } - } private abstract static class ComparableLiteral> extends BaseLiteral { @@ -171,8 +169,7 @@ public Comparator comparator() { static class AboveMax implements Literal { private static final AboveMax INSTANCE = new AboveMax(); - private AboveMax() { - } + private AboveMax() {} @Override public T value() { @@ -198,8 +195,7 @@ public String toString() { static class BelowMin implements Literal { private static final BelowMin INSTANCE = new BelowMin(); - private BelowMin() { - } + private BelowMin() {} @Override public T value() { @@ -264,8 +260,8 @@ public Literal to(Type type) { case DECIMAL: int scale = ((Types.DecimalType) type).scale(); // rounding mode isn't necessary, but pass one to avoid warnings - return (Literal) new DecimalLiteral( - BigDecimal.valueOf(value()).setScale(scale, RoundingMode.HALF_UP)); + return (Literal) + new DecimalLiteral(BigDecimal.valueOf(value()).setScale(scale, RoundingMode.HALF_UP)); default: return null; } @@ -313,8 +309,8 @@ public Literal to(Type type) { case DECIMAL: int scale = ((Types.DecimalType) type).scale(); // rounding mode isn't necessary, but pass one to avoid warnings - return (Literal) new DecimalLiteral( - BigDecimal.valueOf(value()).setScale(scale, RoundingMode.HALF_UP)); + return (Literal) + new DecimalLiteral(BigDecimal.valueOf(value()).setScale(scale, RoundingMode.HALF_UP)); default: return null; } @@ -341,8 +337,8 @@ public Literal to(Type type) { return (Literal) new DoubleLiteral(value().doubleValue()); case DECIMAL: int scale = ((Types.DecimalType) type).scale(); - return (Literal) new 
DecimalLiteral( - BigDecimal.valueOf(value()).setScale(scale, RoundingMode.HALF_UP)); + return (Literal) + new DecimalLiteral(BigDecimal.valueOf(value()).setScale(scale, RoundingMode.HALF_UP)); default: return null; } @@ -376,8 +372,8 @@ public Literal to(Type type) { return (Literal) this; case DECIMAL: int scale = ((Types.DecimalType) type).scale(); - return (Literal) new DecimalLiteral( - BigDecimal.valueOf(value()).setScale(scale, RoundingMode.HALF_UP)); + return (Literal) + new DecimalLiteral(BigDecimal.valueOf(value()).setScale(scale, RoundingMode.HALF_UP)); default: return null; } @@ -441,8 +437,11 @@ public Literal to(Type type) { case TIMESTAMP: return (Literal) this; case DATE: - return (Literal) new DateLiteral((int) ChronoUnit.DAYS.between( - EPOCH_DAY, EPOCH.plus(value(), ChronoUnit.MICROS).toLocalDate())); + return (Literal) + new DateLiteral( + (int) + ChronoUnit.DAYS.between( + EPOCH_DAY, EPOCH.plus(value(), ChronoUnit.MICROS).toLocalDate())); default: } return null; @@ -490,24 +489,29 @@ static class StringLiteral extends BaseLiteral { public Literal to(Type type) { switch (type.typeId()) { case DATE: - int date = (int) ChronoUnit.DAYS.between(EPOCH_DAY, - LocalDate.parse(value(), DateTimeFormatter.ISO_LOCAL_DATE)); + int date = + (int) + ChronoUnit.DAYS.between( + EPOCH_DAY, LocalDate.parse(value(), DateTimeFormatter.ISO_LOCAL_DATE)); return (Literal) new DateLiteral(date); case TIME: - long timeMicros = LocalTime.parse(value(), DateTimeFormatter.ISO_LOCAL_TIME) - .toNanoOfDay() / 1000; + long timeMicros = + LocalTime.parse(value(), DateTimeFormatter.ISO_LOCAL_TIME).toNanoOfDay() / 1000; return (Literal) new TimeLiteral(timeMicros); case TIMESTAMP: if (((Types.TimestampType) type).shouldAdjustToUTC()) { - long timestampMicros = ChronoUnit.MICROS.between(EPOCH, - OffsetDateTime.parse(value(), DateTimeFormatter.ISO_DATE_TIME)); + long timestampMicros = + ChronoUnit.MICROS.between( + EPOCH, OffsetDateTime.parse(value(), DateTimeFormatter.ISO_DATE_TIME)); return (Literal) new TimestampLiteral(timestampMicros); } else { - long timestampMicros = ChronoUnit.MICROS.between(EPOCH, - LocalDateTime.parse(value(), DateTimeFormatter.ISO_LOCAL_DATE_TIME) - .atOffset(ZoneOffset.UTC)); + long timestampMicros = + ChronoUnit.MICROS.between( + EPOCH, + LocalDateTime.parse(value(), DateTimeFormatter.ISO_LOCAL_DATE_TIME) + .atOffset(ZoneOffset.UTC)); return (Literal) new TimestampLiteral(timestampMicros); } diff --git a/api/src/main/java/org/apache/iceberg/expressions/ManifestEvaluator.java b/api/src/main/java/org/apache/iceberg/expressions/ManifestEvaluator.java index 721beab4df48..c60852dacb19 100644 --- a/api/src/main/java/org/apache/iceberg/expressions/ManifestEvaluator.java +++ b/api/src/main/java/org/apache/iceberg/expressions/ManifestEvaluator.java @@ -16,9 +16,10 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.expressions; +import static org.apache.iceberg.expressions.Expressions.rewriteNot; + import java.nio.ByteBuffer; import java.util.Collection; import java.util.Comparator; @@ -36,17 +37,16 @@ import org.apache.iceberg.types.Types.StructType; import org.apache.iceberg.util.BinaryUtil; -import static org.apache.iceberg.expressions.Expressions.rewriteNot; - /** * Evaluates an {@link Expression} on a {@link ManifestFile} to test whether the file contains * matching partitions. - *

- * For row expressions, evaluation is inclusive: it returns true if a file may match and false if it cannot match.
- * <p>
- * Files are passed to {@link #eval(ManifestFile)}, which returns true if the manifest may contain
- * data files that match the partition expression. Manifest files may be skipped if and only if the
- * return value of {@code eval} is false.
+ *
+ * <p>For row expressions, evaluation is inclusive: it returns true if a file may match and false if
+ * it cannot match.
+ *
+ * <p>
Files are passed to {@link #eval(ManifestFile)}, which returns true if the manifest may + * contain data files that match the partition expression. Manifest files may be skipped if and only + * if the return value of {@code eval} is false. */ public class ManifestEvaluator { private static final int IN_PREDICATE_LIMIT = 200; @@ -54,8 +54,10 @@ public class ManifestEvaluator { private final StructType struct; private final Expression expr; - public static ManifestEvaluator forRowFilter(Expression rowFilter, PartitionSpec spec, boolean caseSensitive) { - return new ManifestEvaluator(spec, Projections.inclusive(spec, caseSensitive).project(rowFilter), caseSensitive); + public static ManifestEvaluator forRowFilter( + Expression rowFilter, PartitionSpec spec, boolean caseSensitive) { + return new ManifestEvaluator( + spec, Projections.inclusive(spec, caseSensitive).project(rowFilter), caseSensitive); } public static ManifestEvaluator forPartitionFilter( @@ -161,8 +163,10 @@ public Boolean notNaN(BoundReference ref) { PartitionFieldSummary fieldSummary = stats.get(Accessors.toPosition(ref.accessor())); // if containsNaN is true, containsNull is false and lowerBound is null, all values are NaN - if (fieldSummary.containsNaN() != null && fieldSummary.containsNaN() && - !fieldSummary.containsNull() && fieldSummary.lowerBound() == null) { + if (fieldSummary.containsNaN() != null + && fieldSummary.containsNaN() + && !fieldSummary.containsNull() + && fieldSummary.lowerBound() == null) { return ROWS_CANNOT_MATCH; } @@ -287,14 +291,21 @@ public Boolean in(BoundReference ref, Set literalSet) { } T lower = Conversions.fromByteBuffer(ref.type(), fieldStats.lowerBound()); - literals = literals.stream().filter(v -> ref.comparator().compare(lower, v) <= 0).collect(Collectors.toList()); + literals = + literals.stream() + .filter(v -> ref.comparator().compare(lower, v) <= 0) + .collect(Collectors.toList()); if (literals.isEmpty()) { // if all values are less than lower bound, rows cannot match. return ROWS_CANNOT_MATCH; } T upper = Conversions.fromByteBuffer(ref.type(), fieldStats.upperBound()); - literals = literals.stream().filter(v -> ref.comparator().compare(upper, v) >= 0).collect(Collectors.toList()); - if (literals.isEmpty()) { // if all remaining values are greater than upper bound, rows cannot match. + literals = + literals.stream() + .filter(v -> ref.comparator().compare(upper, v) >= 0) + .collect(Collectors.toList()); + if (literals + .isEmpty()) { // if all remaining values are greater than upper bound, rows cannot match. 
return ROWS_CANNOT_MATCH; } @@ -324,7 +335,8 @@ public Boolean startsWith(BoundReference ref, Literal lit) { ByteBuffer lower = fieldStats.lowerBound(); // truncate lower bound so that its length in bytes is not greater than the length of prefix int lowerLength = Math.min(prefixAsBytes.remaining(), lower.remaining()); - int lowerCmp = comparator.compare(BinaryUtil.truncateBinary(lower, lowerLength), prefixAsBytes); + int lowerCmp = + comparator.compare(BinaryUtil.truncateBinary(lower, lowerLength), prefixAsBytes); if (lowerCmp > 0) { return ROWS_CANNOT_MATCH; } @@ -332,7 +344,8 @@ public Boolean startsWith(BoundReference ref, Literal lit) { ByteBuffer upper = fieldStats.upperBound(); // truncate upper bound so that its length in bytes is not greater than the length of prefix int upperLength = Math.min(prefixAsBytes.remaining(), upper.remaining()); - int upperCmp = comparator.compare(BinaryUtil.truncateBinary(upper, upperLength), prefixAsBytes); + int upperCmp = + comparator.compare(BinaryUtil.truncateBinary(upper, upperLength), prefixAsBytes); if (upperCmp < 0) { return ROWS_CANNOT_MATCH; } @@ -352,7 +365,8 @@ public Boolean notStartsWith(BoundReference ref, Literal lit) { ByteBuffer lower = fieldStats.lowerBound(); ByteBuffer upper = fieldStats.upperBound(); - // notStartsWith will match unless all values must start with the prefix. This happens when the lower and upper + // notStartsWith will match unless all values must start with the prefix. This happens when + // the lower and upper // bounds both start with the prefix. if (lower != null && upper != null) { ByteBuffer prefixAsBytes = lit.toByteBuffer(); @@ -364,7 +378,9 @@ public Boolean notStartsWith(BoundReference ref, Literal lit) { } // truncate lower bound to the prefix and check for equality - int cmp = comparator.compare(BinaryUtil.truncateBinary(lower, prefixAsBytes.remaining()), prefixAsBytes); + int cmp = + comparator.compare( + BinaryUtil.truncateBinary(lower, prefixAsBytes.remaining()), prefixAsBytes); if (cmp == 0) { // the lower bound starts with the prefix; check the upper bound // if upper is shorter than the prefix, it can't start with the prefix @@ -372,8 +388,11 @@ public Boolean notStartsWith(BoundReference ref, Literal lit) { return ROWS_MIGHT_MATCH; } - // truncate upper bound so that its length in bytes is not greater than the length of prefix - cmp = comparator.compare(BinaryUtil.truncateBinary(upper, prefixAsBytes.remaining()), prefixAsBytes); + // truncate upper bound so that its length in bytes is not greater than the length of + // prefix + cmp = + comparator.compare( + BinaryUtil.truncateBinary(upper, prefixAsBytes.remaining()), prefixAsBytes); if (cmp == 0) { // both bounds match the prefix, so all rows must match the prefix and none do not match return ROWS_CANNOT_MATCH; diff --git a/api/src/main/java/org/apache/iceberg/expressions/NamedReference.java b/api/src/main/java/org/apache/iceberg/expressions/NamedReference.java index 6f599f514305..95f462a9c089 100644 --- a/api/src/main/java/org/apache/iceberg/expressions/NamedReference.java +++ b/api/src/main/java/org/apache/iceberg/expressions/NamedReference.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.expressions; import org.apache.iceberg.Schema; @@ -39,12 +38,11 @@ public String name() { @Override public BoundReference bind(Types.StructType struct, boolean caseSensitive) { Schema schema = new Schema(struct.fields()); - Types.NestedField field = caseSensitive ? 
- schema.findField(name) : - schema.caseInsensitiveFindField(name); + Types.NestedField field = + caseSensitive ? schema.findField(name) : schema.caseInsensitiveFindField(name); - ValidationException.check(field != null, - "Cannot find field '%s' in struct: %s", name, schema.asStruct()); + ValidationException.check( + field != null, "Cannot find field '%s' in struct: %s", name, schema.asStruct()); return new BoundReference<>(field, schema.accessorForField(field.fieldId())); } diff --git a/api/src/main/java/org/apache/iceberg/expressions/Not.java b/api/src/main/java/org/apache/iceberg/expressions/Not.java index ad71f41f0432..a7fa725e1a1e 100644 --- a/api/src/main/java/org/apache/iceberg/expressions/Not.java +++ b/api/src/main/java/org/apache/iceberg/expressions/Not.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.expressions; public class Not implements Expression { diff --git a/api/src/main/java/org/apache/iceberg/expressions/Or.java b/api/src/main/java/org/apache/iceberg/expressions/Or.java index 20a213a5d644..ae5901c4de4c 100644 --- a/api/src/main/java/org/apache/iceberg/expressions/Or.java +++ b/api/src/main/java/org/apache/iceberg/expressions/Or.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.expressions; public class Or implements Expression { @@ -45,8 +44,8 @@ public Operation op() { public boolean isEquivalentTo(Expression expr) { if (expr.op() == Operation.OR) { Or other = (Or) expr; - return (left.isEquivalentTo(other.left()) && right.isEquivalentTo(other.right())) || - (left.isEquivalentTo(other.right()) && right.isEquivalentTo(other.left())); + return (left.isEquivalentTo(other.left()) && right.isEquivalentTo(other.right())) + || (left.isEquivalentTo(other.right()) && right.isEquivalentTo(other.left())); } return false; diff --git a/api/src/main/java/org/apache/iceberg/expressions/Predicate.java b/api/src/main/java/org/apache/iceberg/expressions/Predicate.java index 9642468ad409..645170335497 100644 --- a/api/src/main/java/org/apache/iceberg/expressions/Predicate.java +++ b/api/src/main/java/org/apache/iceberg/expressions/Predicate.java @@ -16,10 +16,8 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.expressions; - import org.apache.iceberg.relocated.com.google.common.base.Preconditions; public abstract class Predicate implements Expression { diff --git a/api/src/main/java/org/apache/iceberg/expressions/Projections.java b/api/src/main/java/org/apache/iceberg/expressions/Projections.java index f800b350858e..f873edfff5b1 100644 --- a/api/src/main/java/org/apache/iceberg/expressions/Projections.java +++ b/api/src/main/java/org/apache/iceberg/expressions/Projections.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.expressions; import java.util.Collection; @@ -25,23 +24,20 @@ import org.apache.iceberg.expressions.ExpressionVisitors.ExpressionVisitor; import org.apache.iceberg.transforms.Transform; -/** - * Utils to project expressions on rows to expressions on partitions. - */ +/** Utils to project expressions on rows to expressions on partitions. 
*/ public class Projections { - private Projections() { - } + private Projections() {} /** * A class that projects expressions for a table's data rows into expressions on the table's * partition values, for a table's {@link PartitionSpec partition spec}. - *
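The distinction that the Projections Javadoc in this hunk draws between inclusive and strict projections can be illustrated with a short, self-contained sketch. The schema, the day("ts") partition spec, and the filter below are assumptions for illustration; Projections.inclusive(spec), Projections.strict(spec), and project(...) are the methods shown in this diff.

import org.apache.iceberg.PartitionSpec;
import org.apache.iceberg.Schema;
import org.apache.iceberg.expressions.Expression;
import org.apache.iceberg.expressions.Expressions;
import org.apache.iceberg.expressions.Projections;
import org.apache.iceberg.types.Types;

class ProjectionSketch {
  public static void main(String[] args) {
    // Illustrative schema and spec: a table partitioned by day(ts).
    Schema schema =
        new Schema(
            Types.NestedField.required(1, "id", Types.LongType.get()),
            Types.NestedField.required(2, "ts", Types.TimestampType.withoutZone()));
    PartitionSpec spec = PartitionSpec.builderFor(schema).day("ts").build();

    Expression rowFilter = Expressions.greaterThanOrEqual("ts", "2022-01-01T00:00:00");

    // Inclusive: matches every partition that may contain rows matching the filter.
    Expression onPartitionsInclusive = Projections.inclusive(spec).project(rowFilter);
    // Strict: matches only partitions in which every row must match the filter.
    Expression onPartitionsStrict = Projections.strict(spec).project(rowFilter);

    System.out.println(onPartitionsInclusive);
    System.out.println(onPartitionsStrict);
  }
}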

- * There are two types of projections: inclusive and strict.
- * <p>
- * An inclusive projection guarantees that if an expression matches a row, the projected
+ *
+ * <p>There are two types of projections: inclusive and strict.
+ *
+ * <p>An inclusive projection guarantees that if an expression matches a row, the projected
 * expression will match the row's partition.
- * <p>
- * A strict projection guarantees that if a partition matches a projected expression, then all
+ *
+ * <p>
A strict projection guarantees that if a partition matches a projected expression, then all * rows in that partition will match the original expression. */ public abstract static class ProjectionEvaluator extends ExpressionVisitor { @@ -57,14 +53,14 @@ public abstract static class ProjectionEvaluator extends ExpressionVisitor - * An evaluator is used to project expressions for a table's data rows into expressions on the + * + *

An evaluator is used to project expressions for a table's data rows into expressions on the * table's partition values. The evaluator returned by this function is inclusive and will build * expressions with the following guarantee: if the original expression matches a row, then the * projected expression will match that row's partition. - *

- * Each predicate in the expression is projected using
- * {@link Transform#project(String, BoundPredicate)}.
+ *
+ * <p>
Each predicate in the expression is projected using {@link Transform#project(String, + * BoundPredicate)}. * * @param spec a partition spec * @return an inclusive projection evaluator for the partition spec @@ -76,17 +72,18 @@ public static ProjectionEvaluator inclusive(PartitionSpec spec) { /** * Creates an inclusive {@code ProjectionEvaluator} for the {@link PartitionSpec spec}. - *

- * An evaluator is used to project expressions for a table's data rows into expressions on the
+ *
+ * <p>
An evaluator is used to project expressions for a table's data rows into expressions on the * table's partition values. The evaluator returned by this function is inclusive and will build * expressions with the following guarantee: if the original expression matches a row, then the * projected expression will match that row's partition. - *

- * Each predicate in the expression is projected using
- * {@link Transform#project(String, BoundPredicate)}.
+ *
+ * <p>
Each predicate in the expression is projected using {@link Transform#project(String, + * BoundPredicate)}. * * @param spec a partition spec - * @param caseSensitive whether the Projection should consider case sensitivity on column names or not. + * @param caseSensitive whether the Projection should consider case sensitivity on column names or + * not. * @return an inclusive projection evaluator for the partition spec * @see Transform#project(String, BoundPredicate) Inclusive transform used for each predicate */ @@ -95,16 +92,16 @@ public static ProjectionEvaluator inclusive(PartitionSpec spec, boolean caseSens } /** - * Creates a strict {@code ProjectionEvaluator} for the {@link PartitionSpec spec}, defaulting - * to case sensitive mode. - *

- * An evaluator is used to project expressions for a table's data rows into expressions on the
+ * Creates a strict {@code ProjectionEvaluator} for the {@link PartitionSpec spec}, defaulting to
+ * case sensitive mode.
+ *
+ * <p>
An evaluator is used to project expressions for a table's data rows into expressions on the * table's partition values. The evaluator returned by this function is strict and will build - * expressions with the following guarantee: if the projected expression matches a partition, - * then the original expression will match all rows in that partition. - *

- * Each predicate in the expression is projected using
- * {@link Transform#projectStrict(String, BoundPredicate)}.
+ * expressions with the following guarantee: if the projected expression matches a partition, then
+ * the original expression will match all rows in that partition.
+ *
+ * <p>
Each predicate in the expression is projected using {@link Transform#projectStrict(String, + * BoundPredicate)}. * * @param spec a partition spec * @return a strict projection evaluator for the partition spec @@ -116,17 +113,18 @@ public static ProjectionEvaluator strict(PartitionSpec spec) { /** * Creates a strict {@code ProjectionEvaluator} for the {@link PartitionSpec spec}. - *

- * An evaluator is used to project expressions for a table's data rows into expressions on the
+ *
+ * <p>
An evaluator is used to project expressions for a table's data rows into expressions on the * table's partition values. The evaluator returned by this function is strict and will build - * expressions with the following guarantee: if the projected expression matches a partition, - * then the original expression will match all rows in that partition. - *

- * Each predicate in the expression is projected using
- * {@link Transform#projectStrict(String, BoundPredicate)}.
+ * expressions with the following guarantee: if the projected expression matches a partition, then
+ * the original expression will match all rows in that partition.
+ *
+ * <p>
Each predicate in the expression is projected using {@link Transform#projectStrict(String, + * BoundPredicate)}. * * @param spec a partition spec - * @param caseSensitive whether the Projection should consider case sensitivity on column names or not. + * @param caseSensitive whether the Projection should consider case sensitivity on column names or + * not. * @return a strict projection evaluator for the partition spec * @see Transform#projectStrict(String, BoundPredicate) Strict transform used for each predicate */ @@ -221,7 +219,8 @@ public Expression predicate(BoundPredicate pred) { // similarly, if partitioning by day(ts) and hour(ts), the more restrictive // projection should be used. ts = 2019-01-01T01:00:00 produces day=2019-01-01 and // hour=2019-01-01-01. the value will be in 2019-01-01-01 and not in 2019-01-01-02. - UnboundPredicate inclusiveProjection = ((Transform) part.transform()).project(part.name(), pred); + UnboundPredicate inclusiveProjection = + ((Transform) part.transform()).project(part.name(), pred); if (inclusiveProjection != null) { result = Expressions.and(result, inclusiveProjection); } @@ -252,7 +251,8 @@ public Expression predicate(BoundPredicate pred) { // any timestamp where either projection predicate is true must match the original // predicate. For example, ts = 2019-01-01T03:00:00 matches the hour projection but not // the day, but does match the original predicate. - UnboundPredicate strictProjection = ((Transform) part.transform()).projectStrict(part.name(), pred); + UnboundPredicate strictProjection = + ((Transform) part.transform()).projectStrict(part.name(), pred); if (strictProjection != null) { result = Expressions.or(result, strictProjection); } diff --git a/api/src/main/java/org/apache/iceberg/expressions/Reference.java b/api/src/main/java/org/apache/iceberg/expressions/Reference.java index 41c7b0fb44c7..6d7dc313b363 100644 --- a/api/src/main/java/org/apache/iceberg/expressions/Reference.java +++ b/api/src/main/java/org/apache/iceberg/expressions/Reference.java @@ -16,13 +16,12 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.expressions; /** * Represents a variable reference in an {@link Expression expression}. + * * @see BoundReference * @see NamedReference */ -public interface Reference extends Term { -} +public interface Reference extends Term {} diff --git a/api/src/main/java/org/apache/iceberg/expressions/ResidualEvaluator.java b/api/src/main/java/org/apache/iceberg/expressions/ResidualEvaluator.java index eb765900d4f7..b3d19ce089c4 100644 --- a/api/src/main/java/org/apache/iceberg/expressions/ResidualEvaluator.java +++ b/api/src/main/java/org/apache/iceberg/expressions/ResidualEvaluator.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.expressions; import java.io.Serializable; @@ -32,23 +31,23 @@ /** * Finds the residuals for an {@link Expression} the partitions in the given {@link PartitionSpec}. - *

- * A residual expression is made by partially evaluating an expression using partition values. For - * example, if a table is partitioned by day(utc_timestamp) and is read with a filter expression + * + *

A residual expression is made by partially evaluating an expression using partition values. + * For example, if a table is partitioned by day(utc_timestamp) and is read with a filter expression * utc_timestamp >= a and utc_timestamp <= b, then there are 4 possible residuals expressions * for the partition data, d: + * *

 * <ul>
- * <li>If d > day(a) and d < day(b), the residual is always true
- * <li>If d == day(a) and d != day(b), the residual is utc_timestamp >= a
- * <li>if d == day(b) and d != day(a), the residual is utc_timestamp <= b
- * <li>If d == day(a) == day(b), the residual is utc_timestamp >= a and utc_timestamp <= b
+ *   <li>If d > day(a) and d < day(b), the residual is always true
+ *   <li>If d == day(a) and d != day(b), the residual is utc_timestamp >= a
+ *   <li>if d == day(b) and d != day(a), the residual is utc_timestamp <= b
+ *   <li>If d == day(a) == day(b), the residual is utc_timestamp >= a and utc_timestamp <= b
 * </ul>
- * <p>
- * Partition data is passed using {@link StructLike}. Residuals are returned by
- * {@link #residualFor(StructLike)}.
- * <p>
- * This class is thread-safe.
+ *
+ * <p>Partition data is passed using {@link StructLike}. Residuals are returned by {@link
+ * #residualFor(StructLike)}.
+ *
+ * <p>
This class is thread-safe. */ public class ResidualEvaluator implements Serializable { private static class UnpartitionedResidualEvaluator extends ResidualEvaluator { @@ -211,12 +210,16 @@ public Expression notIn(BoundReference ref, Set literalSet) { @Override public Expression startsWith(BoundReference ref, Literal lit) { - return ((String) ref.eval(struct)).startsWith((String) lit.value()) ? alwaysTrue() : alwaysFalse(); + return ((String) ref.eval(struct)).startsWith((String) lit.value()) + ? alwaysTrue() + : alwaysFalse(); } @Override public Expression notStartsWith(BoundReference ref, Literal lit) { - return ((String) ref.eval(struct)).startsWith((String) lit.value()) ? alwaysFalse() : alwaysTrue(); + return ((String) ref.eval(struct)).startsWith((String) lit.value()) + ? alwaysFalse() + : alwaysTrue(); } @Override @@ -238,7 +241,8 @@ public Expression predicate(BoundPredicate pred) { for (PartitionField part : parts) { // checking the strict projection - UnboundPredicate strictProjection = ((Transform) part.transform()).projectStrict(part.name(), pred); + UnboundPredicate strictProjection = + ((Transform) part.transform()).projectStrict(part.name(), pred); Expression strictResult = null; if (strictProjection != null) { @@ -246,7 +250,8 @@ public Expression predicate(BoundPredicate pred) { if (bound instanceof BoundPredicate) { strictResult = super.predicate((BoundPredicate) bound); } else { - // if the result is not a predicate, then it must be a constant like alwaysTrue or alwaysFalse + // if the result is not a predicate, then it must be a constant like alwaysTrue or + // alwaysFalse strictResult = bound; } } @@ -257,7 +262,8 @@ public Expression predicate(BoundPredicate pred) { } // checking the inclusive projection - UnboundPredicate inclusiveProjection = ((Transform) part.transform()).project(part.name(), pred); + UnboundPredicate inclusiveProjection = + ((Transform) part.transform()).project(part.name(), pred); Expression inclusiveResult = null; if (inclusiveProjection != null) { Expression boundInclusive = inclusiveProjection.bind(spec.partitionType(), caseSensitive); @@ -265,7 +271,8 @@ public Expression predicate(BoundPredicate pred) { // using predicate method specific to inclusive inclusiveResult = super.predicate((BoundPredicate) boundInclusive); } else { - // if the result is not a predicate, then it must be a constant like alwaysTrue or alwaysFalse + // if the result is not a predicate, then it must be a constant like alwaysTrue or + // alwaysFalse inclusiveResult = boundInclusive; } } @@ -274,7 +281,6 @@ public Expression predicate(BoundPredicate pred) { // If inclusive is false, returning false return Expressions.alwaysFalse(); } - } // neither strict not inclusive predicate was conclusive, returning the original pred diff --git a/api/src/main/java/org/apache/iceberg/expressions/RewriteNot.java b/api/src/main/java/org/apache/iceberg/expressions/RewriteNot.java index cc5b3c674de1..ad6ab52b9da8 100644 --- a/api/src/main/java/org/apache/iceberg/expressions/RewriteNot.java +++ b/api/src/main/java/org/apache/iceberg/expressions/RewriteNot.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.expressions; class RewriteNot extends ExpressionVisitors.ExpressionVisitor { @@ -26,8 +25,7 @@ static RewriteNot get() { return INSTANCE; } - private RewriteNot() { - } + private RewriteNot() {} @Override public Expression alwaysTrue() { diff --git a/api/src/main/java/org/apache/iceberg/expressions/SerializationProxies.java b/api/src/main/java/org/apache/iceberg/expressions/SerializationProxies.java index bf93a3f7b925..59fd231cd368 100644 --- a/api/src/main/java/org/apache/iceberg/expressions/SerializationProxies.java +++ b/api/src/main/java/org/apache/iceberg/expressions/SerializationProxies.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.expressions; import java.io.ObjectStreamException; @@ -25,18 +24,15 @@ /** * Stand-in classes for expression classes in Java Serialization. - *
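Looking back at the ResidualEvaluator Javadoc above (the day(utc_timestamp) example), a hedged sketch of how a residual might be obtained for one partition follows. ResidualEvaluator.of(...) is assumed to be the factory; residualFor(StructLike) comes from the Javadoc in this diff, and the column name, values, spec, and partition data are illustrative (utc_timestamp is assumed to be a timestamp without zone).

import org.apache.iceberg.PartitionSpec;
import org.apache.iceberg.StructLike;
import org.apache.iceberg.expressions.Expression;
import org.apache.iceberg.expressions.Expressions;
import org.apache.iceberg.expressions.ResidualEvaluator;

class ResidualSketch {
  // Returns the part of the filter that still has to be checked row by row for a file
  // in the given partition; alwaysTrue() means every row in that partition matches.
  static Expression residualFor(PartitionSpec spec, StructLike partitionData) {
    Expression filter =
        Expressions.and(
            Expressions.greaterThanOrEqual("utc_timestamp", "2022-01-01T00:00:00"),
            Expressions.lessThanOrEqual("utc_timestamp", "2022-01-02T00:00:00"));
    ResidualEvaluator residuals = ResidualEvaluator.of(spec, filter, true);
    return residuals.residualFor(partitionData);
  }
}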

- * These are used so that expression classes are immutable and can use final fields.
+ *
+ * <p>
These are used so that expression classes are immutable and can use final fields. */ class SerializationProxies { static class ConstantExpressionProxy implements Serializable { private Boolean trueOrFalse = null; - /** - * Constructor for Java serialization. - */ - ConstantExpressionProxy() { - } + /** Constructor for Java serialization. */ + ConstantExpressionProxy() {} ConstantExpressionProxy(boolean trueOrFalse) { this.trueOrFalse = trueOrFalse; @@ -52,11 +48,8 @@ Object readResolve() throws ObjectStreamException { } static class BinaryLiteralProxy extends FixedLiteralProxy { - /** - * Constructor for Java serialization. - */ - BinaryLiteralProxy() { - } + /** Constructor for Java serialization. */ + BinaryLiteralProxy() {} BinaryLiteralProxy(ByteBuffer buffer) { super(buffer); @@ -68,17 +61,12 @@ Object readResolve() throws ObjectStreamException { } } - /** - * Replacement for FixedLiteral in Java Serialization. - */ + /** Replacement for FixedLiteral in Java Serialization. */ static class FixedLiteralProxy implements Serializable { private byte[] bytes; - /** - * Constructor for Java serialization. - */ - FixedLiteralProxy() { - } + /** Constructor for Java serialization. */ + FixedLiteralProxy() {} FixedLiteralProxy(ByteBuffer buffer) { this.bytes = new byte[buffer.remaining()]; diff --git a/api/src/main/java/org/apache/iceberg/expressions/StrictMetricsEvaluator.java b/api/src/main/java/org/apache/iceberg/expressions/StrictMetricsEvaluator.java index 3351669914b7..4aee75c447d3 100644 --- a/api/src/main/java/org/apache/iceberg/expressions/StrictMetricsEvaluator.java +++ b/api/src/main/java/org/apache/iceberg/expressions/StrictMetricsEvaluator.java @@ -16,9 +16,10 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.expressions; +import static org.apache.iceberg.expressions.Expressions.rewriteNot; + import java.nio.ByteBuffer; import java.util.Collection; import java.util.Map; @@ -34,22 +35,20 @@ import org.apache.iceberg.types.Types.StructType; import org.apache.iceberg.util.NaNUtil; -import static org.apache.iceberg.expressions.Expressions.rewriteNot; - /** * Evaluates an {@link Expression} on a {@link DataFile} to test whether all rows in the file match. - *

- * This evaluation is strict: it returns true if all rows in a file must match the expression. For
- * example, if a file's ts column has min X and max Y, this evaluator will return true for ts < Y+1
- * but not for ts < Y-1.
- * <p>
- * Files are passed to {@link #eval(ContentFile)}, which returns true if all rows in the file must
- * contain matching rows and false if the file may contain rows that do not match.
- * <p>
- * Due to the comparison implementation of ORC stats, for float/double columns in ORC files, if the first
- * value in a file is NaN, metrics of this file will report NaN for both upper and lower bound despite
- * that the column could contain non-NaN data. Thus in some scenarios explicitly checks for NaN is necessary
- * in order to not include files that may contain rows that don't match.
+ *
+ * <p>This evaluation is strict: it returns true if all rows in a file must match the expression.
+ * For example, if a file's ts column has min X and max Y, this evaluator will return true for ts
+ * < Y+1 but not for ts < Y-1.
+ *
+ * <p>Files are passed to {@link #eval(ContentFile)}, which returns true if all rows in the file
+ * must contain matching rows and false if the file may contain rows that do not match.
+ *
+ * <p>
Due to the comparison implementation of ORC stats, for float/double columns in ORC files, if + * the first value in a file is NaN, metrics of this file will report NaN for both upper and lower + * bound despite that the column could contain non-NaN data. Thus in some scenarios explicitly + * checks for NaN is necessary in order to not include files that may contain rows that don't match. */ public class StrictMetricsEvaluator { private final Schema schema; @@ -70,7 +69,8 @@ public StrictMetricsEvaluator(Schema schema, Expression unbound, boolean caseSen * Test whether all records within the file match the expression. * * @param file a data file - * @return false if the file may contain any row that doesn't match the expression, true otherwise. + * @return false if the file may contain any row that doesn't match the expression, true + * otherwise. */ public boolean eval(ContentFile file) { // TODO: detect the case where a column is missing from the file using file's max field id. @@ -103,10 +103,14 @@ private boolean eval(ContentFile file) { @Override public Boolean handleNonReference(Bound term) { - // If the term in any expression is not a direct reference, assume that rows may not match. This happens when - // transforms or other expressions are passed to this evaluator. For example, bucket16(x) = 0 can't be determined - // because this visitor operates on data metrics and not partition values. It may be possible to un-transform - // expressions for order preserving transforms in the future, but this is not currently supported. + // If the term in any expression is not a direct reference, assume that rows may not match. + // This happens when + // transforms or other expressions are passed to this evaluator. For example, bucket16(x) = 0 + // can't be determined + // because this visitor operates on data metrics and not partition values. It may be possible + // to un-transform + // expressions for order preserving transforms in the future, but this is not currently + // supported. 
return ROWS_MIGHT_NOT_MATCH; } @@ -140,8 +144,8 @@ public Boolean isNull(BoundReference ref) { // no need to check whether the field is required because binding evaluates that case // if the column has any non-null values, the expression does not match int id = ref.fieldId(); - Preconditions.checkNotNull(struct.field(id), - "Cannot filter by nested column: %s", schema.findField(id)); + Preconditions.checkNotNull( + struct.field(id), "Cannot filter by nested column: %s", schema.findField(id)); if (containsNullsOnly(id)) { return ROWS_MUST_MATCH; @@ -155,8 +159,8 @@ public Boolean notNull(BoundReference ref) { // no need to check whether the field is required because binding evaluates that case // if the column has any null values, the expression does not match int id = ref.fieldId(); - Preconditions.checkNotNull(struct.field(id), - "Cannot filter by nested column: %s", schema.findField(id)); + Preconditions.checkNotNull( + struct.field(id), "Cannot filter by nested column: %s", schema.findField(id)); if (nullCounts != null && nullCounts.containsKey(id) && nullCounts.get(id) == 0) { return ROWS_MUST_MATCH; @@ -304,8 +308,10 @@ public Boolean eq(BoundReference ref, Literal lit) { return ROWS_MIGHT_NOT_MATCH; } - if (lowerBounds != null && lowerBounds.containsKey(id) && - upperBounds != null && upperBounds.containsKey(id)) { + if (lowerBounds != null + && lowerBounds.containsKey(id) + && upperBounds != null + && upperBounds.containsKey(id)) { T lower = Conversions.fromByteBuffer(struct.field(id).type(), lowerBounds.get(id)); int cmp = lit.comparator().compare(lower, lit.value()); @@ -373,8 +379,10 @@ public Boolean in(BoundReference ref, Set literalSet) { return ROWS_MIGHT_NOT_MATCH; } - if (lowerBounds != null && lowerBounds.containsKey(id) && - upperBounds != null && upperBounds.containsKey(id)) { + if (lowerBounds != null + && lowerBounds.containsKey(id) + && upperBounds != null + && upperBounds.containsKey(id)) { // similar to the implementation in eq, first check if the lower bound is in the set T lower = Conversions.fromByteBuffer(struct.field(id).type(), lowerBounds.get(id)); if (!literalSet.contains(lower)) { @@ -392,7 +400,8 @@ public Boolean in(BoundReference ref, Set literalSet) { return ROWS_MIGHT_NOT_MATCH; } - // All values must be in the set if the lower bound and the upper bound are in the set and are equal. + // All values must be in the set if the lower bound and the upper bound are in the set and + // are equal. return ROWS_MUST_MATCH; } @@ -419,16 +428,25 @@ public Boolean notIn(BoundReference ref, Set literalSet) { return ROWS_MIGHT_NOT_MATCH; } - literals = literals.stream().filter(v -> ref.comparator().compare(lower, v) <= 0).collect(Collectors.toList()); - if (literals.isEmpty()) { // if all values are less than lower bound, rows must match (notIn). + literals = + literals.stream() + .filter(v -> ref.comparator().compare(lower, v) <= 0) + .collect(Collectors.toList()); + if (literals + .isEmpty()) { // if all values are less than lower bound, rows must match (notIn). return ROWS_MUST_MATCH; } } if (upperBounds != null && upperBounds.containsKey(id)) { T upper = Conversions.fromByteBuffer(field.type(), upperBounds.get(id)); - literals = literals.stream().filter(v -> ref.comparator().compare(upper, v) >= 0).collect(Collectors.toList()); - if (literals.isEmpty()) { // if all remaining values are greater than upper bound, rows must match (notIn). 
+ literals = + literals.stream() + .filter(v -> ref.comparator().compare(upper, v) >= 0) + .collect(Collectors.toList()); + if (literals + .isEmpty()) { // if all remaining values are greater than upper bound, rows must match + // (notIn). return ROWS_MUST_MATCH; } } @@ -443,7 +461,8 @@ public Boolean startsWith(BoundReference ref, Literal lit) { @Override public Boolean notStartsWith(BoundReference ref, Literal lit) { - // TODO: Handle cases that definitely cannot match, such as notStartsWith("x") when the bounds are ["a", "b"]. + // TODO: Handle cases that definitely cannot match, such as notStartsWith("x") when the bounds + // are ["a", "b"]. return ROWS_MIGHT_NOT_MATCH; } @@ -457,14 +476,18 @@ private boolean canContainNaNs(Integer id) { } private boolean containsNullsOnly(Integer id) { - return valueCounts != null && valueCounts.containsKey(id) && - nullCounts != null && nullCounts.containsKey(id) && - valueCounts.get(id) - nullCounts.get(id) == 0; + return valueCounts != null + && valueCounts.containsKey(id) + && nullCounts != null + && nullCounts.containsKey(id) + && valueCounts.get(id) - nullCounts.get(id) == 0; } private boolean containsNaNsOnly(Integer id) { - return nanCounts != null && nanCounts.containsKey(id) && - valueCounts != null && nanCounts.get(id).equals(valueCounts.get(id)); + return nanCounts != null + && nanCounts.containsKey(id) + && valueCounts != null + && nanCounts.get(id).equals(valueCounts.get(id)); } } } diff --git a/api/src/main/java/org/apache/iceberg/expressions/Term.java b/api/src/main/java/org/apache/iceberg/expressions/Term.java index 0d2276b0447e..81ed3a8c0e7b 100644 --- a/api/src/main/java/org/apache/iceberg/expressions/Term.java +++ b/api/src/main/java/org/apache/iceberg/expressions/Term.java @@ -16,13 +16,9 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.expressions; import java.io.Serializable; -/** - * An expression that evaluates to a value. - */ -public interface Term extends Serializable { -} +/** An expression that evaluates to a value. */ +public interface Term extends Serializable {} diff --git a/api/src/main/java/org/apache/iceberg/expressions/True.java b/api/src/main/java/org/apache/iceberg/expressions/True.java index dc24a2bcbfec..42d395abab02 100644 --- a/api/src/main/java/org/apache/iceberg/expressions/True.java +++ b/api/src/main/java/org/apache/iceberg/expressions/True.java @@ -16,19 +16,15 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.expressions; import java.io.ObjectStreamException; -/** - * An {@link Expression expression} that is always true. - */ +/** An {@link Expression expression} that is always true. */ public class True implements Expression { static final True INSTANCE = new True(); - private True() { - } + private True() {} @Override public Operation op() { diff --git a/api/src/main/java/org/apache/iceberg/expressions/Unbound.java b/api/src/main/java/org/apache/iceberg/expressions/Unbound.java index d71d36dd2255..557ac3fd26be 100644 --- a/api/src/main/java/org/apache/iceberg/expressions/Unbound.java +++ b/api/src/main/java/org/apache/iceberg/expressions/Unbound.java @@ -16,15 +16,16 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.expressions; import org.apache.iceberg.types.Types; /** * Represents an unbound expression node. 
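As a brief illustration of the strict contract described for StrictMetricsEvaluator earlier in this diff: a true result means every row in the file must match the filter, so the caller can treat the whole file as matching without reading it. The helper below is a sketch; only the three-argument constructor and eval(...) appear in this diff, and the schema, filter, and file are supplied by the caller.

import org.apache.iceberg.DataFile;
import org.apache.iceberg.Schema;
import org.apache.iceberg.expressions.Expression;
import org.apache.iceberg.expressions.StrictMetricsEvaluator;

class StrictMetricsSketch {
  // True only if the file's column metrics prove that all of its rows match the filter.
  static boolean wholeFileMatches(Schema schema, Expression filter, DataFile file) {
    StrictMetricsEvaluator strict = new StrictMetricsEvaluator(schema, filter, true);
    return strict.eval(file);
  }
}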
+ * * @param the Java type of values produced by this node - * @param the Java type produced when this node is bound using {@link #bind(Types.StructType, boolean)} + * @param the Java type produced when this node is bound using {@link #bind(Types.StructType, + * boolean)} */ public interface Unbound { /** @@ -36,8 +37,6 @@ public interface Unbound { */ B bind(Types.StructType struct, boolean caseSensitive); - /** - * Returns this expression's underlying reference. - */ + /** Returns this expression's underlying reference. */ NamedReference ref(); } diff --git a/api/src/main/java/org/apache/iceberg/expressions/UnboundPredicate.java b/api/src/main/java/org/apache/iceberg/expressions/UnboundPredicate.java index 10a459e78ecf..04513086e042 100644 --- a/api/src/main/java/org/apache/iceberg/expressions/UnboundPredicate.java +++ b/api/src/main/java/org/apache/iceberg/expressions/UnboundPredicate.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.expressions; import java.util.List; @@ -31,7 +30,8 @@ import org.apache.iceberg.types.Types.StructType; import org.apache.iceberg.util.CharSequenceSet; -public class UnboundPredicate extends Predicate> implements Unbound { +public class UnboundPredicate extends Predicate> + implements Unbound { private static final Joiner COMMA = Joiner.on(", "); private final List> literals; @@ -71,8 +71,10 @@ public Expression negate() { } public Literal literal() { - Preconditions.checkArgument(op() != Operation.IN && op() != Operation.NOT_IN, - "%s predicate cannot return a literal", op()); + Preconditions.checkArgument( + op() != Operation.IN && op() != Operation.NOT_IN, + "%s predicate cannot return a literal", + op()); return literals == null ? null : Iterables.getOnlyElement(literals); } @@ -83,11 +85,12 @@ public List> literals() { /** * Bind this UnboundPredicate, defaulting to case sensitive mode. * - * Access modifier is package-private, to only allow use from existing tests. + *
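A short sketch of the bind step documented for UnboundPredicate above: binding resolves the by-name reference against a struct type and validates the literal, throwing ValidationException otherwise. The schema and values are illustrative assumptions; bind(StructType, boolean) is the method shown in this diff.

import org.apache.iceberg.Schema;
import org.apache.iceberg.expressions.Expression;
import org.apache.iceberg.expressions.Expressions;
import org.apache.iceberg.expressions.UnboundPredicate;
import org.apache.iceberg.types.Types;

class BindSketch {
  // With caseSensitive = false, the reference "ID" resolves to the "id" column.
  static Expression bindExample() {
    Schema schema = new Schema(Types.NestedField.required(1, "id", Types.LongType.get()));
    UnboundPredicate<Long> unbound = Expressions.equal("ID", 5L);
    return unbound.bind(schema.asStruct(), false);
  }
}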

Access modifier is package-private, to only allow use from existing tests. * * @param struct The {@link StructType struct type} to resolve references by name. * @return an {@link Expression} - * @throws ValidationException if literals do not match bound references, or if comparison on expression is invalid + * @throws ValidationException if literals do not match bound references, or if comparison on + * expression is invalid */ Expression bind(StructType struct) { return bind(struct, true); @@ -97,9 +100,11 @@ Expression bind(StructType struct) { * Bind this UnboundPredicate. * * @param struct The {@link StructType struct type} to resolve references by name. - * @param caseSensitive A boolean flag to control whether the bind should enforce case sensitivity. + * @param caseSensitive A boolean flag to control whether the bind should enforce case + * sensitivity. * @return an {@link Expression} - * @throws ValidationException if literals do not match bound references, or if comparison on expression is invalid + * @throws ValidationException if literals do not match bound references, or if comparison on + * expression is invalid */ @Override public Expression bind(StructType struct, boolean caseSensitive) { @@ -153,7 +158,8 @@ private Expression bindLiteralOperation(BoundTerm boundTerm) { Literal lit = literal().to(boundTerm.type()); if (lit == null) { - throw new ValidationException("Invalid value for conversion to type %s: %s (%s)", + throw new ValidationException( + "Invalid value for conversion to type %s: %s (%s)", boundTerm.type(), literal().value(), literal().value().getClass().getName()); } else if (lit == Literals.aboveMax()) { @@ -185,14 +191,22 @@ private Expression bindLiteralOperation(BoundTerm boundTerm) { } private Expression bindInOperation(BoundTerm boundTerm) { - List> convertedLiterals = Lists.newArrayList(Iterables.filter( - Lists.transform(literals, lit -> { - Literal converted = lit.to(boundTerm.type()); - ValidationException.check(converted != null, - "Invalid value for conversion to type %s: %s (%s)", boundTerm.type(), lit, lit.getClass().getName()); - return converted; - }), - lit -> lit != Literals.aboveMax() && lit != Literals.belowMin())); + List> convertedLiterals = + Lists.newArrayList( + Iterables.filter( + Lists.transform( + literals, + lit -> { + Literal converted = lit.to(boundTerm.type()); + ValidationException.check( + converted != null, + "Invalid value for conversion to type %s: %s (%s)", + boundTerm.type(), + lit, + lit.getClass().getName()); + return converted; + }), + lit -> lit != Literals.aboveMax() && lit != Literals.belowMin())); if (convertedLiterals.isEmpty()) { switch (op()) { @@ -209,9 +223,11 @@ private Expression bindInOperation(BoundTerm boundTerm) { if (literalSet.size() == 1) { switch (op()) { case IN: - return new BoundLiteralPredicate<>(Operation.EQ, boundTerm, Iterables.get(convertedLiterals, 0)); + return new BoundLiteralPredicate<>( + Operation.EQ, boundTerm, Iterables.get(convertedLiterals, 0)); case NOT_IN: - return new BoundLiteralPredicate<>(Operation.NOT_EQ, boundTerm, Iterables.get(convertedLiterals, 0)); + return new BoundLiteralPredicate<>( + Operation.NOT_EQ, boundTerm, Iterables.get(convertedLiterals, 0)); default: throw new ValidationException("Operation must be IN or NOT_IN"); } diff --git a/api/src/main/java/org/apache/iceberg/expressions/UnboundTerm.java b/api/src/main/java/org/apache/iceberg/expressions/UnboundTerm.java index 412c7c0e4ff2..57f17f0921ca 100644 --- 
a/api/src/main/java/org/apache/iceberg/expressions/UnboundTerm.java +++ b/api/src/main/java/org/apache/iceberg/expressions/UnboundTerm.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.expressions; /** @@ -24,5 +23,4 @@ * * @param the Java type of values produced by this term */ -public interface UnboundTerm extends Unbound>, Term { -} +public interface UnboundTerm extends Unbound>, Term {} diff --git a/api/src/main/java/org/apache/iceberg/expressions/UnboundTransform.java b/api/src/main/java/org/apache/iceberg/expressions/UnboundTransform.java index be3cec188935..cd92fce3916e 100644 --- a/api/src/main/java/org/apache/iceberg/expressions/UnboundTransform.java +++ b/api/src/main/java/org/apache/iceberg/expressions/UnboundTransform.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.expressions; import org.apache.iceberg.exceptions.ValidationException; @@ -50,12 +49,18 @@ public BoundTransform bind(Types.StructType struct, boolean caseSensitive) Transform typeTransform; try { // TODO: Avoid using toString/fromString - typeTransform = (Transform) Transforms.fromString(boundRef.type(), transform.toString()); - ValidationException.check(typeTransform.canTransform(boundRef.type()), - "Cannot bind: %s cannot transform %s values from '%s'", transform, boundRef.type(), ref.name()); + typeTransform = + (Transform) Transforms.fromString(boundRef.type(), transform.toString()); + ValidationException.check( + typeTransform.canTransform(boundRef.type()), + "Cannot bind: %s cannot transform %s values from '%s'", + transform, + boundRef.type(), + ref.name()); } catch (IllegalArgumentException e) { throw new ValidationException( - "Cannot bind: %s cannot transform %s values from '%s'", transform, boundRef.type(), ref.name()); + "Cannot bind: %s cannot transform %s values from '%s'", + transform, boundRef.type(), ref.name()); } return new BoundTransform<>(boundRef, typeTransform); diff --git a/api/src/main/java/org/apache/iceberg/io/BulkDeletionFailureException.java b/api/src/main/java/org/apache/iceberg/io/BulkDeletionFailureException.java index af9513785213..535be5f64ec8 100644 --- a/api/src/main/java/org/apache/iceberg/io/BulkDeletionFailureException.java +++ b/api/src/main/java/org/apache/iceberg/io/BulkDeletionFailureException.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.io; public class BulkDeletionFailureException extends RuntimeException { diff --git a/api/src/main/java/org/apache/iceberg/io/CloseableGroup.java b/api/src/main/java/org/apache/iceberg/io/CloseableGroup.java index b9e04e99b948..d20b93bdda1a 100644 --- a/api/src/main/java/org/apache/iceberg/io/CloseableGroup.java +++ b/api/src/main/java/org/apache/iceberg/io/CloseableGroup.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.io; import java.io.Closeable; @@ -28,15 +27,16 @@ import org.slf4j.LoggerFactory; /** - * This class acts as a helper for handling the closure of multiple resource. - * It can be used for either inheritance or composition. - * To use it, register resources to be closed via the add() calls, and call the corresponding close method when needed. - *
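A minimal sketch of the CloseableGroup usage pattern described in the Javadoc above: resources are registered through the add methods and released together, optionally with close failures suppressed. The two resources passed in are placeholders; addCloseable, setSuppressCloseFailure, and close are the methods shown in this diff.

import java.io.Closeable;
import java.io.IOException;
import org.apache.iceberg.io.CloseableGroup;

class CloseableGroupSketch {
  // Closes both resources even if one of them fails, because failures are suppressed.
  static void closeAll(Closeable reader, AutoCloseable client) throws IOException {
    CloseableGroup group = new CloseableGroup();
    group.addCloseable(reader);
    group.addCloseable(client);
    group.setSuppressCloseFailure(true);
    group.close();
  }
}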

- * It can take both closeable and autocloseable objects, and handle closeable as autocloseable and guarantee close
- * idempotency by ensuring that each resource will be closed once even with concurrent close calls. It will also
- * wrap checked non-IO exceptions into runtime exceptions.
- * <p>
- * Users can choose to suppress close failure with this class. By default such failures are not suppressed.
+ * This class acts as a helper for handling the closure of multiple resource. It can be used for
+ * either inheritance or composition. To use it, register resources to be closed via the add()
+ * calls, and call the corresponding close method when needed.
+ *
+ * <p>It can take both closeable and autocloseable objects, and handle closeable as autocloseable
+ * and guarantee close idempotency by ensuring that each resource will be closed once even with
+ * concurrent close calls. It will also wrap checked non-IO exceptions into runtime exceptions.
+ *
+ * <p>
Users can choose to suppress close failure with this class. By default such failures are not + * suppressed. */ public class CloseableGroup implements Closeable { private static final Logger LOG = LoggerFactory.getLogger(CloseableGroup.class); @@ -44,23 +44,23 @@ public class CloseableGroup implements Closeable { private final Deque closeables = Lists.newLinkedList(); private boolean suppressCloseFailure = false; - /** - * Register a closeable to be managed by this class. - */ + /** Register a closeable to be managed by this class. */ public void addCloseable(Closeable closeable) { closeables.add(closeable); } /** - * Register an autocloseables to be managed by this class. It will be handled as a closeable object. + * Register an autocloseables to be managed by this class. It will be handled as a closeable + * object. */ public void addCloseable(AutoCloseable autoCloseable) { closeables.add(autoCloseable); } /** - * Whether to suppress failure when any of the closeable this class tracks throws exception during closing. - * This could be helpful to ensure the close method of all resources to be called. + * Whether to suppress failure when any of the closeable this class tracks throws exception during + * closing. This could be helpful to ensure the close method of all resources to be called. + * * @param shouldSuppress true if user wants to suppress close failures */ public void setSuppressCloseFailure(boolean shouldSuppress) { diff --git a/api/src/main/java/org/apache/iceberg/io/CloseableIterable.java b/api/src/main/java/org/apache/iceberg/io/CloseableIterable.java index cc4be871feaa..068584689eb5 100644 --- a/api/src/main/java/org/apache/iceberg/io/CloseableIterable.java +++ b/api/src/main/java/org/apache/iceberg/io/CloseableIterable.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.io; import java.io.Closeable; @@ -47,8 +46,7 @@ static CloseableIterable withNoopClose(E entry) { static CloseableIterable withNoopClose(Iterable iterable) { return new CloseableIterable() { @Override - public void close() { - } + public void close() {} @Override public CloseableIterator iterator() { @@ -76,15 +74,19 @@ public CloseableIterator iterator() { } static CloseableIterable filter(CloseableIterable iterable, Predicate pred) { - return combine(() -> new FilterIterator(iterable.iterator()) { - @Override - protected boolean shouldKeep(E item) { - return pred.test(item); - } - }, iterable); + return combine( + () -> + new FilterIterator(iterable.iterator()) { + @Override + protected boolean shouldKeep(E item) { + return pred.test(item); + } + }, + iterable); } - static CloseableIterable transform(CloseableIterable iterable, Function transform) { + static CloseableIterable transform( + CloseableIterable iterable, Function transform) { Preconditions.checkNotNull(transform, "Cannot apply a null transform"); return new CloseableIterable() { @@ -207,5 +209,4 @@ public E next() { } } } - } diff --git a/api/src/main/java/org/apache/iceberg/io/CloseableIterator.java b/api/src/main/java/org/apache/iceberg/io/CloseableIterator.java index 079190b12c26..3a79b843538a 100644 --- a/api/src/main/java/org/apache/iceberg/io/CloseableIterator.java +++ b/api/src/main/java/org/apache/iceberg/io/CloseableIterator.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.io; import java.io.Closeable; @@ -57,7 +56,8 @@ public E next() { }; } - static CloseableIterator transform(CloseableIterator iterator, Function transform) { + static CloseableIterator transform( + CloseableIterator iterator, Function transform) { Preconditions.checkNotNull(transform, "Cannot apply a null transform"); return new CloseableIterator() { diff --git a/api/src/main/java/org/apache/iceberg/io/ClosingIterator.java b/api/src/main/java/org/apache/iceberg/io/ClosingIterator.java index 58fc931b156d..d5ddb3cd3c4d 100644 --- a/api/src/main/java/org/apache/iceberg/io/ClosingIterator.java +++ b/api/src/main/java/org/apache/iceberg/io/ClosingIterator.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.io; import java.io.IOException; @@ -24,8 +23,8 @@ import java.util.Iterator; /** - * A convenience wrapper around {@link CloseableIterator}, providing auto-close - * functionality when all of the elements in the iterator are consumed. + * A convenience wrapper around {@link CloseableIterator}, providing auto-close functionality when + * all of the elements in the iterator are consumed. */ public class ClosingIterator implements Iterator { private final CloseableIterator iterator; diff --git a/api/src/main/java/org/apache/iceberg/io/CredentialSupplier.java b/api/src/main/java/org/apache/iceberg/io/CredentialSupplier.java index 4211306f3041..e5180bb1c16d 100644 --- a/api/src/main/java/org/apache/iceberg/io/CredentialSupplier.java +++ b/api/src/main/java/org/apache/iceberg/io/CredentialSupplier.java @@ -16,19 +16,17 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.io; /** * Interface used to expose credentials held by a FileIO instance. - *
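The CloseableIterable and CloseableIterator helpers reformatted just above compose naturally. The sketch below assumes a ClosingIterator constructor that wraps a CloseableIterator (its full signature is not shown in this diff); withNoopClose, filter, and transform are the factory methods shown here, and the sample data is illustrative.

import java.util.Arrays;
import org.apache.iceberg.io.CloseableIterable;
import org.apache.iceberg.io.ClosingIterator;

class IterableSketch {
  // Builds a filtered/transformed pipeline and consumes it through ClosingIterator,
  // which closes the underlying resource once the last element has been returned.
  static void pipeline() {
    CloseableIterable<Integer> source = CloseableIterable.withNoopClose(Arrays.asList(1, 2, 3, 4));
    CloseableIterable<String> evensAsText =
        CloseableIterable.transform(
            CloseableIterable.filter(source, i -> i % 2 == 0), i -> "value-" + i);
    new ClosingIterator<>(evensAsText.iterator()).forEachRemaining(System.out::println);
  }
}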

- * Tables supply a FileIO instance to use for file access that is configured with the credentials needed to access the
- * table's files. Systems that do not use FileIO can use this interface to get the configured credential as a string,
- * and use the credential for file access via other IO libraries.
+ *
+ * <p>
Tables supply a FileIO instance to use for file access that is configured with the credentials + * needed to access the table's files. Systems that do not use FileIO can use this interface to get + * the configured credential as a string, and use the credential for file access via other IO + * libraries. */ public interface CredentialSupplier { - /** - * Returns the credential string - */ + /** Returns the credential string */ String getCredential(); } diff --git a/api/src/main/java/org/apache/iceberg/io/DelegatingInputStream.java b/api/src/main/java/org/apache/iceberg/io/DelegatingInputStream.java index 701286250258..d76dae91852d 100644 --- a/api/src/main/java/org/apache/iceberg/io/DelegatingInputStream.java +++ b/api/src/main/java/org/apache/iceberg/io/DelegatingInputStream.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.io; import java.io.InputStream; diff --git a/api/src/main/java/org/apache/iceberg/io/DelegatingOutputStream.java b/api/src/main/java/org/apache/iceberg/io/DelegatingOutputStream.java index 7228d0874154..38774c065e2b 100644 --- a/api/src/main/java/org/apache/iceberg/io/DelegatingOutputStream.java +++ b/api/src/main/java/org/apache/iceberg/io/DelegatingOutputStream.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.io; import java.io.OutputStream; diff --git a/api/src/main/java/org/apache/iceberg/io/FileAppender.java b/api/src/main/java/org/apache/iceberg/io/FileAppender.java index 4de3f6849c6e..f98a72128359 100644 --- a/api/src/main/java/org/apache/iceberg/io/FileAppender.java +++ b/api/src/main/java/org/apache/iceberg/io/FileAppender.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.io; import java.io.Closeable; @@ -37,22 +36,18 @@ default void addAll(Iterable values) { addAll(values.iterator()); } - /** - * Returns {@link Metrics} for this file. Only valid after the file is closed. - */ + /** Returns {@link Metrics} for this file. Only valid after the file is closed. */ Metrics metrics(); - /** - * Returns the length of this file. - */ + /** Returns the length of this file. */ long length(); /** * Returns a list of recommended split locations, if applicable, null otherwise. - *

- * When available, this information is used for planning scan tasks whose boundaries
- * are determined by these offsets. The returned list must be sorted in ascending order.
- * Only valid after the file is closed.
+ *
+ * <p>
When available, this information is used for planning scan tasks whose boundaries are + * determined by these offsets. The returned list must be sorted in ascending order. Only valid + * after the file is closed. */ default List splitOffsets() { return null; diff --git a/api/src/main/java/org/apache/iceberg/io/FileIO.java b/api/src/main/java/org/apache/iceberg/io/FileIO.java index 23eb8fe033b1..928f09eb20b6 100644 --- a/api/src/main/java/org/apache/iceberg/io/FileIO.java +++ b/api/src/main/java/org/apache/iceberg/io/FileIO.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.io; import java.io.Closeable; @@ -25,72 +24,64 @@ /** * Pluggable module for reading, writing, and deleting files. - *

- * Both table metadata files and data files can be written and read by this module. Implementations
- * must be serializable because various clients of Spark tables may initialize this once and pass
- * it off to a separate module that would then interact with the streams.
+ *
+ * <p>
Both table metadata files and data files can be written and read by this module. + * Implementations must be serializable because various clients of Spark tables may initialize this + * once and pass it off to a separate module that would then interact with the streams. */ public interface FileIO extends Serializable, Closeable { - /** - * Get a {@link InputFile} instance to read bytes from the file at the given path. - */ + /** Get a {@link InputFile} instance to read bytes from the file at the given path. */ InputFile newInputFile(String path); /** - * Get a {@link InputFile} instance to read bytes from the file at the given path, with a known file length. + * Get a {@link InputFile} instance to read bytes from the file at the given path, with a known + * file length. */ default InputFile newInputFile(String path, long length) { return newInputFile(path); } - /** - * Get a {@link OutputFile} instance to write bytes to the file at the given path. - */ + /** Get a {@link OutputFile} instance to write bytes to the file at the given path. */ OutputFile newOutputFile(String path); - /** - * Delete the file at the given path. - */ + /** Delete the file at the given path. */ void deleteFile(String path); - /** - * Convenience method to {@link #deleteFile(String) delete} an {@link InputFile}. - */ + /** Convenience method to {@link #deleteFile(String) delete} an {@link InputFile}. */ default void deleteFile(InputFile file) { deleteFile(file.location()); } - /** - * Convenience method to {@link #deleteFile(String) delete} an {@link OutputFile}. - */ + /** Convenience method to {@link #deleteFile(String) delete} an {@link OutputFile}. */ default void deleteFile(OutputFile file) { deleteFile(file.location()); } /** * Returns the property map used to configure this FileIO - * @throws UnsupportedOperationException if this FileIO does not expose its configuration properties + * + * @throws UnsupportedOperationException if this FileIO does not expose its configuration + * properties */ default Map properties() { - throw new UnsupportedOperationException(String.format( - "%s does not expose configuration properties", this.getClass().toString())); + throw new UnsupportedOperationException( + String.format("%s does not expose configuration properties", this.getClass().toString())); } /** * Initialize File IO from catalog properties. + * * @param properties catalog properties */ - default void initialize(Map properties) { - } + default void initialize(Map properties) {} /** * Close File IO to release underlying resources. - *

- * Calling this method is only required when this FileIO instance is no longer expected to be used, - * and the resources it holds need to be explicitly released to avoid resource leaks. + * + *

Calling this method is only required when this FileIO instance is no longer expected to be + * used, and the resources it holds need to be explicitly released to avoid resource leaks. */ @Override - default void close() { - } + default void close() {} } diff --git a/api/src/main/java/org/apache/iceberg/io/FileIOMetricsContext.java b/api/src/main/java/org/apache/iceberg/io/FileIOMetricsContext.java index 6b73b80d5a55..80be6e1ae5fc 100644 --- a/api/src/main/java/org/apache/iceberg/io/FileIOMetricsContext.java +++ b/api/src/main/java/org/apache/iceberg/io/FileIOMetricsContext.java @@ -16,14 +16,13 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.io; import org.apache.iceberg.metrics.MetricsContext; /** - * Extension of MetricsContext for use with FileIO to define standard metrics - * that should be reported. + * Extension of MetricsContext for use with FileIO to define standard metrics that should be + * reported. */ public interface FileIOMetricsContext extends MetricsContext { String READ_BYTES = "read.bytes"; diff --git a/api/src/main/java/org/apache/iceberg/io/FileInfo.java b/api/src/main/java/org/apache/iceberg/io/FileInfo.java index 63a72c283c1e..f2325ca0d444 100644 --- a/api/src/main/java/org/apache/iceberg/io/FileInfo.java +++ b/api/src/main/java/org/apache/iceberg/io/FileInfo.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.io; public class FileInfo { diff --git a/api/src/main/java/org/apache/iceberg/io/FilterIterator.java b/api/src/main/java/org/apache/iceberg/io/FilterIterator.java index 974619e08866..f721e19dd106 100644 --- a/api/src/main/java/org/apache/iceberg/io/FilterIterator.java +++ b/api/src/main/java/org/apache/iceberg/io/FilterIterator.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.io; import java.io.Closeable; diff --git a/api/src/main/java/org/apache/iceberg/io/InputFile.java b/api/src/main/java/org/apache/iceberg/io/InputFile.java index 6ad3f32a907c..64e09fcb2ad3 100644 --- a/api/src/main/java/org/apache/iceberg/io/InputFile.java +++ b/api/src/main/java/org/apache/iceberg/io/InputFile.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.io; import java.io.IOException; @@ -25,8 +24,8 @@ /** * An interface used to read input files using {@link SeekableInputStream} instances. - *
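// Illustrative sketch of the FileIO contract shown above: write through an OutputFile, read the
// bytes back through an InputFile, then delete the path. The concrete FileIO implementation and
// the path value are assumptions supplied by the caller; besides the FileIO methods above, this
// uses InputFile.newStream()/getLength() and PositionOutputStream from the same package.
import java.io.IOException;
import java.nio.charset.StandardCharsets;
import org.apache.iceberg.io.FileIO;
import org.apache.iceberg.io.InputFile;
import org.apache.iceberg.io.PositionOutputStream;
import org.apache.iceberg.io.SeekableInputStream;

class FileIORoundTrip {
  static String writeAndReadBack(FileIO io, String path) throws IOException {
    byte[] payload = "hello iceberg".getBytes(StandardCharsets.UTF_8);

    // createOrOverwrite() replaces an existing file instead of throwing AlreadyExistsException
    try (PositionOutputStream out = io.newOutputFile(path).createOrOverwrite()) {
      out.write(payload);
    }

    InputFile inputFile = io.newInputFile(path);
    byte[] readBack = new byte[(int) inputFile.getLength()];
    try (SeekableInputStream in = inputFile.newStream()) {
      int offset = 0;
      while (offset < readBack.length) {
        int n = in.read(readBack, offset, readBack.length - offset);
        if (n < 0) {
          throw new IOException("Unexpected end of stream: " + path);
        }
        offset += n;
      }
    }

    io.deleteFile(path); // deleteFile(InputFile)/deleteFile(OutputFile) are convenience overloads
    return new String(readBack, StandardCharsets.UTF_8);
  }
}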

- * This class is based on Parquet's InputFile. + * + *

This class is based on Parquet's InputFile. */ public interface InputFile { /** diff --git a/api/src/main/java/org/apache/iceberg/io/LocationProvider.java b/api/src/main/java/org/apache/iceberg/io/LocationProvider.java index 58f70c1de883..3ed770753734 100644 --- a/api/src/main/java/org/apache/iceberg/io/LocationProvider.java +++ b/api/src/main/java/org/apache/iceberg/io/LocationProvider.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.io; import java.io.Serializable; @@ -25,8 +24,8 @@ /** * Interface for providing data file locations to write tasks. - *

- * Implementations must be {@link Serializable} because instances will be serialized to tasks. + * + *

Implementations must be {@link Serializable} because instances will be serialized to tasks. */ public interface LocationProvider extends Serializable { /** diff --git a/api/src/main/java/org/apache/iceberg/io/OutputFile.java b/api/src/main/java/org/apache/iceberg/io/OutputFile.java index 67195c46c448..991471fd8555 100644 --- a/api/src/main/java/org/apache/iceberg/io/OutputFile.java +++ b/api/src/main/java/org/apache/iceberg/io/OutputFile.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.io; import java.io.IOException; @@ -25,15 +24,15 @@ /** * An interface used to create output files using {@link PositionOutputStream} instances. - *

- * This class is based on Parquet's InputFile. + * + *

This class is based on Parquet's InputFile. */ public interface OutputFile { /** * Create a new file and return a {@link PositionOutputStream} to it. - *

- * If the file already exists, this will throw an exception. + * + *

If the file already exists, this will throw an exception. * * @return an output stream that can report its position * @throws AlreadyExistsException If the path already exists @@ -43,12 +42,13 @@ public interface OutputFile { /** * Create a new file and return a {@link PositionOutputStream} to it. - *

- * If the file already exists, this will not throw an exception and will replace the file. + * + *

If the file already exists, this will not throw an exception and will replace the file. * * @return an output stream that can report its position * @throws RuntimeIOException If the implementation throws an {@link IOException} - * @throws SecurityException If staging directory creation fails due to missing JVM level permission + * @throws SecurityException If staging directory creation fails due to missing JVM level + * permission */ PositionOutputStream createOrOverwrite(); diff --git a/api/src/main/java/org/apache/iceberg/io/PositionOutputStream.java b/api/src/main/java/org/apache/iceberg/io/PositionOutputStream.java index a6b34e092d8d..b80549fbdef8 100644 --- a/api/src/main/java/org/apache/iceberg/io/PositionOutputStream.java +++ b/api/src/main/java/org/apache/iceberg/io/PositionOutputStream.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.io; import java.io.IOException; @@ -30,5 +29,4 @@ public abstract class PositionOutputStream extends OutputStream { * @throws IOException If the underlying stream throws IOException */ public abstract long getPos() throws IOException; - } diff --git a/api/src/main/java/org/apache/iceberg/io/RangeReadable.java b/api/src/main/java/org/apache/iceberg/io/RangeReadable.java index fafdd7b02037..c2b113b2b27d 100644 --- a/api/src/main/java/org/apache/iceberg/io/RangeReadable.java +++ b/api/src/main/java/org/apache/iceberg/io/RangeReadable.java @@ -16,32 +16,28 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.io; import java.io.Closeable; import java.io.IOException; /** - * {@code RangeReadable} is an interface that allows for implementations - * of {@link InputFile} streams to perform positional, range-based reads, which - * are more efficient than unbounded reads in many cloud provider object stores. - * - * Thread safety is not a requirement of the interface and is left to the - * implementation. + * {@code RangeReadable} is an interface that allows for implementations of {@link InputFile} + * streams to perform positional, range-based reads, which are more efficient than unbounded reads + * in many cloud provider object stores. * - * If the implementation is also a {@link SeekableInputStream}, the position - * of the stream is not required to be updated based on the positional reads - * performed by this interface. Usage of {@link SeekableInputStream} should - * always seek to the appropriate position for {@link java.io.InputStream} - * based reads. + *
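// A hedged sketch of the create()/createOrOverwrite() behavior documented above: create() fails
// with AlreadyExistsException when the path exists, while createOrOverwrite() replaces the file.
// The OutputFile is assumed to come from FileIO.newOutputFile(path).
import java.io.IOException;
import org.apache.iceberg.io.OutputFile;
import org.apache.iceberg.io.PositionOutputStream;

class OutputFileContract {
  static long write(OutputFile file, byte[] payload, boolean overwrite) throws IOException {
    // prefer the stricter create() unless the caller explicitly allows replacing the target
    try (PositionOutputStream out = overwrite ? file.createOrOverwrite() : file.create()) {
      out.write(payload);
      // getPos() reports the current position, i.e. the number of bytes written so far
      return out.getPos();
    }
  }
}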

Thread safety is not a requirement of the interface and is left to the implementation. * + *

If the implementation is also a {@link SeekableInputStream}, the position of the stream is not + * required to be updated based on the positional reads performed by this interface. Usage of {@link + * SeekableInputStream} should always seek to the appropriate position for {@link + * java.io.InputStream} based reads. */ public interface RangeReadable extends Closeable { /** - * Fill the provided buffer with the contents of the input source starting - * at {@code position} for the given {@code offset} and {@code length}. + * Fill the provided buffer with the contents of the input source starting at {@code position} for + * the given {@code offset} and {@code length}. * * @param position start position of the read * @param buffer target buffer to copy data @@ -51,8 +47,7 @@ public interface RangeReadable extends Closeable { void readFully(long position, byte[] buffer, int offset, int length) throws IOException; /** - * Fill the entire buffer with the contents of the input source starting - * at {@code position}. + * Fill the entire buffer with the contents of the input source starting at {@code position}. * * @param position start position of the read * @param buffer target buffer to copy data @@ -70,7 +65,7 @@ default void readFully(long position, byte[] buffer) throws IOException { * @return the actual number of bytes read * @throws IOException if an error occurs while reading */ - int readTail(byte [] buffer, int offset, int length) throws IOException; + int readTail(byte[] buffer, int offset, int length) throws IOException; /** * Read the full size of the buffer from the end of the file. @@ -79,7 +74,7 @@ default void readFully(long position, byte[] buffer) throws IOException { * @return the actual number of bytes read * @throws IOException if an error occurs while reading */ - default int readTail(byte [] buffer) throws IOException { + default int readTail(byte[] buffer) throws IOException { return readTail(buffer, 0, buffer.length); } } diff --git a/api/src/main/java/org/apache/iceberg/io/SeekableInputStream.java b/api/src/main/java/org/apache/iceberg/io/SeekableInputStream.java index f20d0c8bceee..290e9661da69 100644 --- a/api/src/main/java/org/apache/iceberg/io/SeekableInputStream.java +++ b/api/src/main/java/org/apache/iceberg/io/SeekableInputStream.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.io; import java.io.IOException; @@ -26,7 +25,7 @@ * {@code SeekableInputStream} is an interface with the methods needed to read data from a file or * Hadoop data stream. * - * This class is based on Parquet's SeekableInputStream. + *
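// Illustrative sketch of the positional reads above: fetch a fixed-size footer either with
// RangeReadable.readTail(), when the stream supports it, or by seeking. The footerSize value and
// the assumption that the stream came from an InputFile are illustrative only.
import java.io.IOException;
import org.apache.iceberg.io.RangeReadable;
import org.apache.iceberg.io.SeekableInputStream;

class FooterReader {
  static byte[] readFooter(SeekableInputStream stream, long fileLength, int footerSize)
      throws IOException {
    byte[] footer = new byte[footerSize];
    if (stream instanceof RangeReadable) {
      // positional read from the end of the file; the stream position is left untouched
      ((RangeReadable) stream).readTail(footer);
    } else {
      // fallback: seek and drain the remaining bytes through the regular InputStream API
      stream.seek(fileLength - footerSize);
      int offset = 0;
      while (offset < footerSize) {
        int n = stream.read(footer, offset, footerSize - offset);
        if (n < 0) {
          throw new IOException("Unexpected end of stream");
        }
        offset += n;
      }
    }
    return footer;
  }
}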

This class is based on Parquet's SeekableInputStream. */ public abstract class SeekableInputStream extends InputStream { /** diff --git a/api/src/main/java/org/apache/iceberg/io/SupportsBulkOperations.java b/api/src/main/java/org/apache/iceberg/io/SupportsBulkOperations.java index 4599029baea8..f9f46b90c8f8 100644 --- a/api/src/main/java/org/apache/iceberg/io/SupportsBulkOperations.java +++ b/api/src/main/java/org/apache/iceberg/io/SupportsBulkOperations.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.io; public interface SupportsBulkOperations { diff --git a/api/src/main/java/org/apache/iceberg/io/SupportsPrefixOperations.java b/api/src/main/java/org/apache/iceberg/io/SupportsPrefixOperations.java index fc65f38ab32b..85516518bde3 100644 --- a/api/src/main/java/org/apache/iceberg/io/SupportsPrefixOperations.java +++ b/api/src/main/java/org/apache/iceberg/io/SupportsPrefixOperations.java @@ -16,22 +16,19 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.io; /** - * This interface is intended as an extension for FileIO implementations - * to provide additional prefix based operations that may be useful in - * performing supporting operations. + * This interface is intended as an extension for FileIO implementations to provide additional + * prefix based operations that may be useful in performing supporting operations. */ public interface SupportsPrefixOperations { /** * Return an iterable of all files under a prefix. - *

- * Hierarchical file systems (e.g. HDFS) may impose additional restrictions - * like the prefix must fully match a directory whereas key/value object - * stores may allow for arbitrary prefixes. + * + *

Hierarchical file systems (e.g. HDFS) may impose additional restrictions like the prefix + * must fully match a directory whereas key/value object stores may allow for arbitrary prefixes. * * @param prefix prefix to list * @return iterable of file information @@ -40,10 +37,9 @@ public interface SupportsPrefixOperations { /** * Delete all files under a prefix. - *
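// A minimal sketch of the prefix listing described above (deletePrefix() in this interface works
// the same way). Object-store FileIO implementations typically implement
// SupportsPrefixOperations; the FileInfo accessors used here are assumptions from this package.
import org.apache.iceberg.io.FileIO;
import org.apache.iceberg.io.FileInfo;
import org.apache.iceberg.io.SupportsPrefixOperations;

class PrefixStats {
  static long totalBytesUnder(FileIO io, String prefix) {
    if (!(io instanceof SupportsPrefixOperations)) {
      throw new UnsupportedOperationException(
          io.getClass().getName() + " does not support prefix operations");
    }

    long totalBytes = 0L;
    for (FileInfo file : ((SupportsPrefixOperations) io).listPrefix(prefix)) {
      totalBytes += file.size(); // FileInfo also exposes location() and createdAtMillis()
    }
    return totalBytes;
  }
}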

- * Hierarchical file systems (e.g. HDFS) may impose additional restrictions - * like the prefix must fully match a directory whereas key/value object - * stores may allow for arbitrary prefixes. + * + *

Hierarchical file systems (e.g. HDFS) may impose additional restrictions like the prefix + * must fully match a directory whereas key/value object stores may allow for arbitrary prefixes. * * @param prefix prefix to delete */ diff --git a/api/src/main/java/org/apache/iceberg/metrics/DefaultMetricsContext.java b/api/src/main/java/org/apache/iceberg/metrics/DefaultMetricsContext.java index 2a2a6e6bba5a..531307ae94a4 100644 --- a/api/src/main/java/org/apache/iceberg/metrics/DefaultMetricsContext.java +++ b/api/src/main/java/org/apache/iceberg/metrics/DefaultMetricsContext.java @@ -16,14 +16,11 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.metrics; import java.util.concurrent.TimeUnit; -/** - * A default {@link MetricsContext} implementation that uses native Java counters/timers. - */ +/** A default {@link MetricsContext} implementation that uses native Java counters/timers. */ public class DefaultMetricsContext implements MetricsContext { @Override @@ -35,7 +32,8 @@ public Counter counter(String name, Class type, Unit un if (Long.class.equals(type)) { return (Counter) new LongCounter(); } - throw new IllegalArgumentException(String.format("Counter for type %s is not supported", type.getName())); + throw new IllegalArgumentException( + String.format("Counter for type %s is not supported", type.getName())); } @Override diff --git a/api/src/main/java/org/apache/iceberg/metrics/DefaultTimer.java b/api/src/main/java/org/apache/iceberg/metrics/DefaultTimer.java index 474b867bfc7b..24767295aa76 100644 --- a/api/src/main/java/org/apache/iceberg/metrics/DefaultTimer.java +++ b/api/src/main/java/org/apache/iceberg/metrics/DefaultTimer.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.metrics; import java.time.Duration; @@ -30,7 +29,8 @@ import org.apache.iceberg.relocated.com.google.common.base.Stopwatch; /** - * A default {@link Timer} implementation that uses a {@link Stopwatch} instance internally to measure time. + * A default {@link Timer} implementation that uses a {@link Stopwatch} instance internally to + * measure time. */ public class DefaultTimer implements Timer { private final TimeUnit defaultTimeUnit; diff --git a/api/src/main/java/org/apache/iceberg/metrics/IntCounter.java b/api/src/main/java/org/apache/iceberg/metrics/IntCounter.java index e29febe89275..728a5ccb677f 100644 --- a/api/src/main/java/org/apache/iceberg/metrics/IntCounter.java +++ b/api/src/main/java/org/apache/iceberg/metrics/IntCounter.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.metrics; import java.util.Optional; @@ -24,8 +23,8 @@ import org.apache.iceberg.relocated.com.google.common.base.MoreObjects; /** - * A default {@link org.apache.iceberg.metrics.MetricsContext.Counter} implementation that uses an {@link Integer} to - * count events. + * A default {@link org.apache.iceberg.metrics.MetricsContext.Counter} implementation that uses an + * {@link Integer} to count events. 
*/ class IntCounter implements MetricsContext.Counter { private final AtomicInteger counter; @@ -56,8 +55,6 @@ public Integer value() { @Override public String toString() { - return MoreObjects.toStringHelper(this) - .add("counter", counter) - .toString(); + return MoreObjects.toStringHelper(this).add("counter", counter).toString(); } } diff --git a/api/src/main/java/org/apache/iceberg/metrics/LongCounter.java b/api/src/main/java/org/apache/iceberg/metrics/LongCounter.java index 32cbe1e31070..eade8327a011 100644 --- a/api/src/main/java/org/apache/iceberg/metrics/LongCounter.java +++ b/api/src/main/java/org/apache/iceberg/metrics/LongCounter.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.metrics; import java.util.Optional; @@ -24,8 +23,8 @@ import org.apache.iceberg.relocated.com.google.common.base.MoreObjects; /** - * A default {@link org.apache.iceberg.metrics.MetricsContext.Counter} implementation that uses a {@link Long} to count - * events. + * A default {@link org.apache.iceberg.metrics.MetricsContext.Counter} implementation that uses a + * {@link Long} to count events. */ class LongCounter implements MetricsContext.Counter { private final AtomicLong counter; @@ -56,8 +55,6 @@ public Long value() { @Override public String toString() { - return MoreObjects.toStringHelper(this) - .add("counter", counter) - .toString(); + return MoreObjects.toStringHelper(this).add("counter", counter).toString(); } } diff --git a/api/src/main/java/org/apache/iceberg/metrics/MetricsContext.java b/api/src/main/java/org/apache/iceberg/metrics/MetricsContext.java index 6b1fec3b8375..d8c0f6a3fa5a 100644 --- a/api/src/main/java/org/apache/iceberg/metrics/MetricsContext.java +++ b/api/src/main/java/org/apache/iceberg/metrics/MetricsContext.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.metrics; import java.io.Serializable; @@ -25,9 +24,8 @@ import java.util.concurrent.TimeUnit; /** - * Generalized interface for creating telemetry related instances for tracking - * operations. Implementations must take into account usage considerations - * like thread safety and serialization. + * Generalized interface for creating telemetry related instances for tracking operations. + * Implementations must take into account usage considerations like thread safety and serialization. */ public interface MetricsContext extends Serializable { enum Unit { @@ -46,13 +44,10 @@ public String displayName() { } } - default void initialize(Map properties) { - } + default void initialize(Map properties) {} interface Counter { - /** - * Increment the counter by a single whole number value (i.e. 1). - */ + /** Increment the counter by a single whole number value (i.e. 1). */ void increment(); /** @@ -88,8 +83,8 @@ default Unit unit() { } /** - * Get a named counter of a specific type. Metric implementations may impose - * restrictions on what types are supported for specific counters. + * Get a named counter of a specific type. Metric implementations may impose restrictions on what + * types are supported for specific counters. 
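// A minimal sketch of creating and using a counter via the default context above. Integer and
// Long are the counter types supported by DefaultMetricsContext; the metric name and the
// Unit.COUNT designation are illustrative assumptions.
import org.apache.iceberg.metrics.DefaultMetricsContext;
import org.apache.iceberg.metrics.MetricsContext;

class CounterUsage {
  static MetricsContext.Counter<Long> countedReads(long batchedReads) {
    MetricsContext metrics = new DefaultMetricsContext();
    MetricsContext.Counter<Long> readOps =
        metrics.counter("read.operations", Long.class, MetricsContext.Unit.COUNT);

    readOps.increment();             // add exactly one
    readOps.increment(batchedReads); // add a custom amount
    return readOps;
  }
}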
* * @param name name of the metric * @param type numeric type of the counter value @@ -128,12 +123,10 @@ public Timer timer(String name, TimeUnit unit) { public Counter counter(String name, Class type, Unit unit) { return new Counter() { @Override - public void increment() { - } + public void increment() {} @Override - public void increment(T amount) { - } + public void increment(T amount) {} }; } }; diff --git a/api/src/main/java/org/apache/iceberg/metrics/Timer.java b/api/src/main/java/org/apache/iceberg/metrics/Timer.java index c12cafe05173..37cd1786ae34 100644 --- a/api/src/main/java/org/apache/iceberg/metrics/Timer.java +++ b/api/src/main/java/org/apache/iceberg/metrics/Timer.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.metrics; import java.time.Duration; @@ -25,7 +24,8 @@ import java.util.function.Supplier; /** - * Generalized Timer interface for creating telemetry related instances for measuring duration of operations. + * Generalized Timer interface for creating telemetry related instances for measuring duration of + * operations. */ public interface Timer { @@ -44,7 +44,8 @@ public interface Timer { Duration totalDuration(); /** - * Starts the timer and returns a {@link Timed} instance. Call {@link Timed#stop()} to complete the timing. + * Starts the timer and returns a {@link Timed} instance. Call {@link Timed#stop()} to complete + * the timing. * * @return A {@link Timed} instance with the start time recorded. */ @@ -54,7 +55,7 @@ public interface Timer { * Records a custom amount in the given time unit. * * @param amount The amount to record - * @param unit The time unit of the amount + * @param unit The time unit of the amount */ void record(long amount, TimeUnit unit); @@ -78,7 +79,7 @@ default void time(Duration duration) { * Executes and measures the given {@link Callable} and returns its result. * * @param callable The {@link Callable} to execute and measure. - * @param The type of the {@link Callable} + * @param The type of the {@link Callable} * @return The result of the underlying {@link Callable}. * @throws Exception In case the {@link Callable} fails. */ @@ -88,19 +89,17 @@ default void time(Duration duration) { * Gets the result from the given {@link Supplier} and measures its execution time. * * @param supplier The {@link Supplier} to execute and measure. - * @param The type of the {@link Supplier}. + * @param The type of the {@link Supplier}. * @return The result of the underlying {@link Supplier}. */ T time(Supplier supplier); /** - * A timing sample that carries internal state about the Timer's start position. The timing can be completed by - * calling {@link Timed#stop()}. + * A timing sample that carries internal state about the Timer's start position. The timing can be + * completed by calling {@link Timed#stop()}. */ interface Timed extends AutoCloseable { - /** - * Stops the timer and records the total duration up until {@link Timer#start()} was called. - */ + /** Stops the timer and records the total duration up until {@link Timer#start()} was called. 
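// A hedged sketch of the Timer API above: time a block either through the Timed handle returned
// by start() (usable in try-with-resources, since Timed is AutoCloseable) or by handing the work
// to time(). The DefaultTimer(TimeUnit) constructor is assumed from the implementation in this
// package.
import java.time.Duration;
import java.util.concurrent.TimeUnit;
import org.apache.iceberg.metrics.DefaultTimer;
import org.apache.iceberg.metrics.Timer;

class TimerUsage {
  static Duration timeTwice(Runnable work) {
    Timer timer = new DefaultTimer(TimeUnit.NANOSECONDS);

    // first sample: explicit start()/stop() via the Timed handle
    try (Timer.Timed ignored = timer.start()) {
      work.run();
    }

    // second sample: let the timer run and record the Runnable itself
    timer.time(work);

    // count() is now 2 and totalDuration() is the sum of both samples
    return timer.totalDuration();
  }
}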
*/ void stop(); @Override @@ -108,41 +107,40 @@ default void close() { stop(); } - Timed NOOP = () -> { }; + Timed NOOP = () -> {}; } - Timer NOOP = new Timer() { - @Override - public Timed start() { - return Timed.NOOP; - } - - @Override - public long count() { - return 0; - } - - @Override - public Duration totalDuration() { - return Duration.ZERO; - } - - @Override - public void record(long amount, TimeUnit unit) { - } - - @Override - public void time(Runnable runnable) { - } - - @Override - public T timeCallable(Callable callable) throws Exception { - return callable.call(); - } - - @Override - public T time(Supplier supplier) { - return supplier.get(); - } - }; + Timer NOOP = + new Timer() { + @Override + public Timed start() { + return Timed.NOOP; + } + + @Override + public long count() { + return 0; + } + + @Override + public Duration totalDuration() { + return Duration.ZERO; + } + + @Override + public void record(long amount, TimeUnit unit) {} + + @Override + public void time(Runnable runnable) {} + + @Override + public T timeCallable(Callable callable) throws Exception { + return callable.call(); + } + + @Override + public T time(Supplier supplier) { + return supplier.get(); + } + }; } diff --git a/api/src/main/java/org/apache/iceberg/transforms/Bucket.java b/api/src/main/java/org/apache/iceberg/transforms/Bucket.java index 0ff2ff3db486..ecbefa5cf015 100644 --- a/api/src/main/java/org/apache/iceberg/transforms/Bucket.java +++ b/api/src/main/java/org/apache/iceberg/transforms/Bucket.java @@ -16,9 +16,10 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.transforms; +import static org.apache.iceberg.types.Type.TypeID; + import java.math.BigDecimal; import java.nio.ByteBuffer; import java.nio.charset.StandardCharsets; @@ -38,15 +39,13 @@ import org.apache.iceberg.types.Type; import org.apache.iceberg.types.Types; -import static org.apache.iceberg.types.Type.TypeID; - abstract class Bucket implements Transform { private static final HashFunction MURMUR3 = Hashing.murmur3_32_fixed(); @SuppressWarnings("unchecked") static Bucket get(Type type, int numBuckets) { - Preconditions.checkArgument(numBuckets > 0, - "Invalid number of buckets: %s (must be > 0)", numBuckets); + Preconditions.checkArgument( + numBuckets > 0, "Invalid number of buckets: %s (must be > 0)", numBuckets); switch (type.typeId()) { case DATE: @@ -124,7 +123,8 @@ public UnboundPredicate project(String name, BoundPredicate predicat } else if (predicate.isLiteralPredicate() && predicate.op() == Expression.Operation.EQ) { return Expressions.predicate( predicate.op(), name, apply(predicate.asLiteralPredicate().literal().value())); - } else if (predicate.isSetPredicate() && predicate.op() == Expression.Operation.IN) { // notIn can't be projected + } else if (predicate.isSetPredicate() + && predicate.op() == Expression.Operation.IN) { // notIn can't be projected return ProjectionUtil.transformSet(name, predicate.asSetPredicate(), this); } @@ -144,7 +144,8 @@ public UnboundPredicate projectStrict(String name, BoundPredicate pr return Expressions.predicate(predicate.op(), name); } else if (predicate.isLiteralPredicate() && predicate.op() == Expression.Operation.NOT_EQ) { // TODO: need to translate not(eq(...)) into notEq in expressions - return Expressions.predicate(predicate.op(), name, apply(predicate.asLiteralPredicate().literal().value())); + return Expressions.predicate( + predicate.op(), name, apply(predicate.asLiteralPredicate().literal().value())); } else if 
(predicate.isSetPredicate() && predicate.op() == Expression.Operation.NOT_IN) { return ProjectionUtil.transformSet(name, predicate.asSetPredicate(), this); } @@ -186,10 +187,9 @@ public int hash(Long value) { @Override public boolean canTransform(Type type) { - return type.typeId() == TypeID.LONG || - type.typeId() == TypeID.TIME || - type.typeId() == TypeID.TIMESTAMP; - + return type.typeId() == TypeID.LONG + || type.typeId() == TypeID.TIME + || type.typeId() == TypeID.TIMESTAMP; } } @@ -246,8 +246,7 @@ public boolean canTransform(Type type) { } private static class BucketByteBuffer extends Bucket { - private static final Set SUPPORTED_TYPES = Sets.newHashSet( - TypeID.BINARY, TypeID.FIXED); + private static final Set SUPPORTED_TYPES = Sets.newHashSet(TypeID.BINARY, TypeID.FIXED); private BucketByteBuffer(int numBuckets) { super(numBuckets); @@ -256,9 +255,12 @@ private BucketByteBuffer(int numBuckets) { @Override public int hash(ByteBuffer value) { if (value.hasArray()) { - return MURMUR3.hashBytes(value.array(), - value.arrayOffset() + value.position(), - value.arrayOffset() + value.remaining()).asInt(); + return MURMUR3 + .hashBytes( + value.array(), + value.arrayOffset() + value.position(), + value.arrayOffset() + value.remaining()) + .asInt(); } else { int position = value.position(); byte[] copy = new byte[value.remaining()]; @@ -285,10 +287,12 @@ private BucketUUID(int numBuckets) { @Override public int hash(UUID value) { - return MURMUR3.newHasher(16) - .putLong(Long.reverseBytes(value.getMostSignificantBits())) - .putLong(Long.reverseBytes(value.getLeastSignificantBits())) - .hash().asInt(); + return MURMUR3 + .newHasher(16) + .putLong(Long.reverseBytes(value.getMostSignificantBits())) + .putLong(Long.reverseBytes(value.getLeastSignificantBits())) + .hash() + .asInt(); } @Override diff --git a/api/src/main/java/org/apache/iceberg/transforms/Dates.java b/api/src/main/java/org/apache/iceberg/transforms/Dates.java index e8fb1230a384..b313a2f154e9 100644 --- a/api/src/main/java/org/apache/iceberg/transforms/Dates.java +++ b/api/src/main/java/org/apache/iceberg/transforms/Dates.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.transforms; import java.time.Instant; @@ -36,7 +35,8 @@ enum Dates implements Transform { MONTH(ChronoUnit.MONTHS, "month"), DAY(ChronoUnit.DAYS, "day"); - private static final LocalDate EPOCH = Instant.ofEpochSecond(0).atOffset(ZoneOffset.UTC).toLocalDate(); + private static final LocalDate EPOCH = + Instant.ofEpochSecond(0).atOffset(ZoneOffset.UTC).toLocalDate(); private final ChronoUnit granularity; private final String name; @@ -59,7 +59,8 @@ public Integer apply(Integer days) { LocalDate date = EPOCH.plusDays(days); return (int) granularity.between(EPOCH, date); } else { - // add 1 day to the value to account for the case where there is exactly 1 unit between the date and epoch + // add 1 day to the value to account for the case where there is exactly 1 unit between the + // date and epoch // because the result will always be decremented. LocalDate date = EPOCH.plusDays(days + 1); return (int) granularity.between(EPOCH, date) - 1; @@ -91,9 +92,11 @@ public boolean satisfiesOrderOf(Transform other) { } if (other instanceof Dates) { - // test the granularity, in days. day(ts) => 1 day, months(ts) => 30 days, and day satisfies the order of months + // test the granularity, in days. 
day(ts) => 1 day, months(ts) => 30 days, and day satisfies + // the order of months Dates otherTransform = (Dates) other; - return granularity.getDuration().toDays() <= otherTransform.granularity.getDuration().toDays(); + return granularity.getDuration().toDays() + <= otherTransform.granularity.getDuration().toDays(); } return false; @@ -109,7 +112,8 @@ public UnboundPredicate project(String fieldName, BoundPredicate projected = ProjectionUtil.truncateInteger(fieldName, pred.asLiteralPredicate(), this); + UnboundPredicate projected = + ProjectionUtil.truncateInteger(fieldName, pred.asLiteralPredicate(), this); if (this != DAY) { return ProjectionUtil.fixInclusiveTimeProjection(projected); } @@ -117,7 +121,8 @@ public UnboundPredicate project(String fieldName, BoundPredicate projected = ProjectionUtil.transformSet(fieldName, pred.asSetPredicate(), this); + UnboundPredicate projected = + ProjectionUtil.transformSet(fieldName, pred.asSetPredicate(), this); if (this != DAY) { return ProjectionUtil.fixInclusiveTimeProjection(projected); } @@ -138,8 +143,8 @@ public UnboundPredicate projectStrict(String fieldName, BoundPredicate< return Expressions.predicate(pred.op(), fieldName); } else if (pred.isLiteralPredicate()) { - UnboundPredicate projected = ProjectionUtil.truncateIntegerStrict( - fieldName, pred.asLiteralPredicate(), this); + UnboundPredicate projected = + ProjectionUtil.truncateIntegerStrict(fieldName, pred.asLiteralPredicate(), this); if (this != DAY) { return ProjectionUtil.fixStrictTimeProjection(projected); } @@ -147,7 +152,8 @@ public UnboundPredicate projectStrict(String fieldName, BoundPredicate< return projected; } else if (pred.isSetPredicate() && pred.op() == Expression.Operation.NOT_IN) { - UnboundPredicate projected = ProjectionUtil.transformSet(fieldName, pred.asSetPredicate(), this); + UnboundPredicate projected = + ProjectionUtil.transformSet(fieldName, pred.asSetPredicate(), this); if (this != DAY) { return ProjectionUtil.fixStrictTimeProjection(projected); } diff --git a/api/src/main/java/org/apache/iceberg/transforms/Identity.java b/api/src/main/java/org/apache/iceberg/transforms/Identity.java index 93e16a7ec762..78586543a24f 100644 --- a/api/src/main/java/org/apache/iceberg/transforms/Identity.java +++ b/api/src/main/java/org/apache/iceberg/transforms/Identity.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.transforms; import java.nio.ByteBuffer; @@ -75,7 +74,8 @@ public UnboundPredicate projectStrict(String name, BoundPredicate predicat if (predicate.isUnaryPredicate()) { return Expressions.predicate(predicate.op(), name); } else if (predicate.isLiteralPredicate()) { - return Expressions.predicate(predicate.op(), name, predicate.asLiteralPredicate().literal().value()); + return Expressions.predicate( + predicate.op(), name, predicate.asLiteralPredicate().literal().value()); } else if (predicate.isSetPredicate()) { return Expressions.predicate(predicate.op(), name, predicate.asSetPredicate().literalSet()); } diff --git a/api/src/main/java/org/apache/iceberg/transforms/PartitionSpecVisitor.java b/api/src/main/java/org/apache/iceberg/transforms/PartitionSpecVisitor.java index 230ee3f5c103..eee174bc2d39 100644 --- a/api/src/main/java/org/apache/iceberg/transforms/PartitionSpecVisitor.java +++ b/api/src/main/java/org/apache/iceberg/transforms/PartitionSpecVisitor.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.transforms; import java.util.List; @@ -87,7 +86,8 @@ default T alwaysNull(int fieldId, String sourceName, int sourceId) { } default T unknown(int fieldId, String sourceName, int sourceId, String transform) { - throw new UnsupportedOperationException(String.format("Unknown transform %s is not supported", transform)); + throw new UnsupportedOperationException( + String.format("Unknown transform %s is not supported", transform)); } /** diff --git a/api/src/main/java/org/apache/iceberg/transforms/ProjectionUtil.java b/api/src/main/java/org/apache/iceberg/transforms/ProjectionUtil.java index 4f12cbfc96cf..a359c9b88492 100644 --- a/api/src/main/java/org/apache/iceberg/transforms/ProjectionUtil.java +++ b/api/src/main/java/org/apache/iceberg/transforms/ProjectionUtil.java @@ -16,9 +16,10 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.transforms; +import static org.apache.iceberg.expressions.Expressions.predicate; + import java.math.BigDecimal; import java.math.BigInteger; import java.util.Set; @@ -33,12 +34,9 @@ import org.apache.iceberg.relocated.com.google.common.collect.Iterables; import org.apache.iceberg.relocated.com.google.common.collect.Sets; -import static org.apache.iceberg.expressions.Expressions.predicate; - class ProjectionUtil { - private ProjectionUtil() { - } + private ProjectionUtil() {} static UnboundPredicate truncateInteger( String name, BoundLiteralPredicate pred, Transform transform) { @@ -76,7 +74,8 @@ static UnboundPredicate truncateIntegerStrict( case NOT_EQ: return predicate(Expression.Operation.NOT_EQ, name, transform.apply(boundary)); case EQ: - // there is no predicate that guarantees equality because adjacent ints transform to the same value + // there is no predicate that guarantees equality because adjacent ints transform to the + // same value return null; default: return null; @@ -98,7 +97,8 @@ static UnboundPredicate truncateLongStrict( case NOT_EQ: return predicate(Expression.Operation.NOT_EQ, name, transform.apply(boundary)); case EQ: - // there is no predicate that guarantees equality because adjacent longs transform to the same value + // there is no predicate that guarantees equality because adjacent longs transform to the + // same value return null; default: return null; @@ -127,23 +127,20 @@ static UnboundPredicate truncateLong( } static UnboundPredicate truncateDecimal( - String name, BoundLiteralPredicate pred, - Transform transform) { + String name, BoundLiteralPredicate pred, Transform transform) { BigDecimal boundary = pred.literal().value(); switch (pred.op()) { case LT: // adjust closed and then transform ltEq - BigDecimal minusOne = new BigDecimal( - boundary.unscaledValue().subtract(BigInteger.ONE), - boundary.scale()); + BigDecimal minusOne = + new BigDecimal(boundary.unscaledValue().subtract(BigInteger.ONE), boundary.scale()); return predicate(Expression.Operation.LT_EQ, name, transform.apply(minusOne)); case LT_EQ: return predicate(Expression.Operation.LT_EQ, name, transform.apply(boundary)); case GT: // adjust closed and then transform gtEq - BigDecimal plusOne = new BigDecimal( - boundary.unscaledValue().add(BigInteger.ONE), - boundary.scale()); + BigDecimal plusOne = + new BigDecimal(boundary.unscaledValue().add(BigInteger.ONE), boundary.scale()); return predicate(Expression.Operation.GT_EQ, name, transform.apply(plusOne)); case GT_EQ: return predicate(Expression.Operation.GT_EQ, name, transform.apply(boundary)); @@ -155,17 +152,14 @@ static 
UnboundPredicate truncateDecimal( } static UnboundPredicate truncateDecimalStrict( - String name, BoundLiteralPredicate pred, - Transform transform) { + String name, BoundLiteralPredicate pred, Transform transform) { BigDecimal boundary = pred.literal().value(); - BigDecimal minusOne = new BigDecimal( - boundary.unscaledValue().subtract(BigInteger.ONE), - boundary.scale()); + BigDecimal minusOne = + new BigDecimal(boundary.unscaledValue().subtract(BigInteger.ONE), boundary.scale()); - BigDecimal plusOne = new BigDecimal( - boundary.unscaledValue().add(BigInteger.ONE), - boundary.scale()); + BigDecimal plusOne = + new BigDecimal(boundary.unscaledValue().add(BigInteger.ONE), boundary.scale()); switch (pred.op()) { case LT: @@ -179,7 +173,8 @@ static UnboundPredicate truncateDecimalStrict( case NOT_EQ: return predicate(Expression.Operation.NOT_EQ, name, transform.apply(boundary)); case EQ: - // there is no predicate that guarantees equality because adjacent decimals transform to the same value + // there is no predicate that guarantees equality because adjacent decimals transform to the + // same value return null; default: return null; @@ -200,8 +195,8 @@ static UnboundPredicate truncateArray( return predicate(Expression.Operation.EQ, name, transform.apply(boundary)); case STARTS_WITH: return predicate(Expression.Operation.STARTS_WITH, name, transform.apply(boundary)); -// case IN: // TODO -// return Expressions.predicate(Operation.IN, name, transform.apply(boundary)); + // case IN: // TODO + // return Expressions.predicate(Operation.IN, name, transform.apply(boundary)); default: return null; } @@ -220,7 +215,8 @@ static UnboundPredicate truncateArrayStrict( case NOT_EQ: return predicate(Expression.Operation.NOT_EQ, name, transform.apply(boundary)); case EQ: - // there is no predicate that guarantees equality because adjacent values transform to the same partition + // there is no predicate that guarantees equality because adjacent values transform to the + // same partition return null; default: return null; @@ -231,16 +227,18 @@ static UnboundPredicate truncateArrayStrict( * If the predicate has a transformed child that matches the given transform, return a predicate. 
*/ @SuppressWarnings("unchecked") - static UnboundPredicate projectTransformPredicate(Transform transform, - String partitionName, BoundPredicate pred) { - if (pred.term() instanceof BoundTransform && transform.equals(((BoundTransform) pred.term()).transform())) { + static UnboundPredicate projectTransformPredicate( + Transform transform, String partitionName, BoundPredicate pred) { + if (pred.term() instanceof BoundTransform + && transform.equals(((BoundTransform) pred.term()).transform())) { // the bound value must be a T because the transform matches return (UnboundPredicate) removeTransform(partitionName, pred); } return null; } - private static UnboundPredicate removeTransform(String partitionName, BoundPredicate pred) { + private static UnboundPredicate removeTransform( + String partitionName, BoundPredicate pred) { if (pred.isUnaryPredicate()) { return Expressions.predicate(pred.op(), partitionName); } else if (pred.isLiteralPredicate()) { @@ -248,24 +246,28 @@ private static UnboundPredicate removeTransform(String partitionName, Bou } else if (pred.isSetPredicate()) { return Expressions.predicate(pred.op(), partitionName, pred.asSetPredicate().literalSet()); } - throw new UnsupportedOperationException("Cannot replace transform in unknown predicate: " + pred); + throw new UnsupportedOperationException( + "Cannot replace transform in unknown predicate: " + pred); } - static UnboundPredicate transformSet(String fieldName, - BoundSetPredicate predicate, - Transform transform) { - return predicate(predicate.op(), fieldName, + static UnboundPredicate transformSet( + String fieldName, BoundSetPredicate predicate, Transform transform) { + return predicate( + predicate.op(), + fieldName, Iterables.transform(predicate.asSetPredicate().literalSet(), transform::apply)); } /** * Fixes an inclusive projection to account for incorrectly transformed values. - *

- * A bug in 0.10.0 and earlier caused negative values to be incorrectly transformed by date and timestamp transforms - * to 1 larger than the correct value. For example, day(1969-12-31 10:00:00) produced 0 instead of -1. To read data - * written by versions with this bug, this method adjusts the inclusive projection. The current inclusive projection - * is correct, so this modifies the "correct" projection when needed. For example, < day(1969-12-31 10:00:00) will - * produce <= -1 (= 1969-12-31) and is adjusted to <= 0 (= 1970-01-01) because the incorrect transformed value was 0. + * + *

A bug in 0.10.0 and earlier caused negative values to be incorrectly transformed by date and + * timestamp transforms to 1 larger than the correct value. For example, day(1969-12-31 10:00:00) + * produced 0 instead of -1. To read data written by versions with this bug, this method adjusts + * the inclusive projection. The current inclusive projection is correct, so this modifies the + * "correct" projection when needed. For example, < day(1969-12-31 10:00:00) will produce <= -1 (= + * 1969-12-31) and is adjusted to <= 0 (= 1970-01-01) because the incorrect transformed value was + * 0. */ static UnboundPredicate fixInclusiveTimeProjection(UnboundPredicate projected) { if (projected == null) { @@ -295,8 +297,10 @@ static UnboundPredicate fixInclusiveTimeProjection(UnboundPredicate fixInclusiveTimeProjection(UnboundPredicate - * A bug in 0.10.0 and earlier caused negative values to be incorrectly transformed by date and timestamp transforms - * to 1 larger than the correct value. For example, day(1969-12-31 10:00:00) produced 0 instead of -1. To read data - * written by versions with this bug, this method adjusts the strict projection. + * + *

A bug in 0.10.0 and earlier caused negative values to be incorrectly transformed by date and + * timestamp transforms to 1 larger than the correct value. For example, day(1969-12-31 10:00:00) + * produced 0 instead of -1. To read data written by versions with this bug, this method adjusts + * the strict projection. */ static UnboundPredicate fixStrictTimeProjection(UnboundPredicate projected) { if (projected == null) { @@ -348,9 +353,12 @@ static UnboundPredicate fixStrictTimeProjection(UnboundPredicate= month(1969-11-31) is > -2, but - // 1969-10-31 was previously transformed to month -2 instead of -3. This must use the more strict value. + // GT and GT_EQ need to be adjusted because values that do not match the predicate may have + // been transformed + // into partition values that match the projected predicate. For example, >= + // month(1969-11-31) is > -2, but + // 1969-10-31 was previously transformed to month -2 instead of -3. This must use the more + // strict value. if (projected.literal().value() <= 0) { return Expressions.greaterThan(projected.term(), projected.literal().value() + 1); } @@ -371,7 +379,8 @@ static UnboundPredicate fixStrictTimeProjection(UnboundPredicate - * These are used so that transform classes can be singletons and use identical equality. + * + *

These are used so that transform classes can be singletons and use identical equality. */ class SerializationProxies { - private SerializationProxies() { - } + private SerializationProxies() {} static class VoidTransformProxy implements Serializable { private static final VoidTransformProxy INSTANCE = new VoidTransformProxy(); @@ -38,11 +36,8 @@ static VoidTransformProxy get() { return INSTANCE; } - /** - * Constructor for Java serialization. - */ - VoidTransformProxy() { - } + /** Constructor for Java serialization. */ + VoidTransformProxy() {} Object readResolve() throws ObjectStreamException { return VoidTransform.get(); diff --git a/api/src/main/java/org/apache/iceberg/transforms/SortOrderVisitor.java b/api/src/main/java/org/apache/iceberg/transforms/SortOrderVisitor.java index cd41c7818264..ed3327571f74 100644 --- a/api/src/main/java/org/apache/iceberg/transforms/SortOrderVisitor.java +++ b/api/src/main/java/org/apache/iceberg/transforms/SortOrderVisitor.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.transforms; import java.util.List; @@ -31,9 +30,11 @@ public interface SortOrderVisitor { T field(String sourceName, int sourceId, SortDirection direction, NullOrder nullOrder); - T bucket(String sourceName, int sourceId, int width, SortDirection direction, NullOrder nullOrder); + T bucket( + String sourceName, int sourceId, int width, SortDirection direction, NullOrder nullOrder); - T truncate(String sourceName, int sourceId, int width, SortDirection direction, NullOrder nullOrder); + T truncate( + String sourceName, int sourceId, int width, SortDirection direction, NullOrder nullOrder); T year(String sourceName, int sourceId, SortDirection direction, NullOrder nullOrder); @@ -43,8 +44,14 @@ public interface SortOrderVisitor { T hour(String sourceName, int sourceId, SortDirection direction, NullOrder nullOrder); - default T unknown(String sourceName, int sourceId, String transform, SortDirection direction, NullOrder nullOrder) { - throw new UnsupportedOperationException(String.format("Unknown transform %s is not supported", transform)); + default T unknown( + String sourceName, + int sourceId, + String transform, + SortDirection direction, + NullOrder nullOrder) { + throw new UnsupportedOperationException( + String.format("Unknown transform %s is not supported", transform)); } /** @@ -65,24 +72,38 @@ static List visit(SortOrder sortOrder, SortOrderVisitor visitor) { Transform transform = field.transform(); if (transform == null || transform instanceof Identity) { - results.add(visitor.field(sourceName, field.sourceId(), field.direction(), field.nullOrder())); + results.add( + visitor.field(sourceName, field.sourceId(), field.direction(), field.nullOrder())); } else if (transform instanceof Bucket) { int numBuckets = ((Bucket) transform).numBuckets(); - results.add(visitor.bucket(sourceName, field.sourceId(), numBuckets, field.direction(), field.nullOrder())); + results.add( + visitor.bucket( + sourceName, field.sourceId(), numBuckets, field.direction(), field.nullOrder())); } else if (transform instanceof Truncate) { int width = ((Truncate) transform).width(); - results.add(visitor.truncate(sourceName, field.sourceId(), width, field.direction(), field.nullOrder())); + results.add( + visitor.truncate( + sourceName, field.sourceId(), width, field.direction(), field.nullOrder())); } else if (transform == Dates.YEAR || transform == Timestamps.YEAR) { - results.add(visitor.year(sourceName, field.sourceId(), 
field.direction(), field.nullOrder())); + results.add( + visitor.year(sourceName, field.sourceId(), field.direction(), field.nullOrder())); } else if (transform == Dates.MONTH || transform == Timestamps.MONTH) { - results.add(visitor.month(sourceName, field.sourceId(), field.direction(), field.nullOrder())); + results.add( + visitor.month(sourceName, field.sourceId(), field.direction(), field.nullOrder())); } else if (transform == Dates.DAY || transform == Timestamps.DAY) { - results.add(visitor.day(sourceName, field.sourceId(), field.direction(), field.nullOrder())); + results.add( + visitor.day(sourceName, field.sourceId(), field.direction(), field.nullOrder())); } else if (transform == Timestamps.HOUR) { - results.add(visitor.hour(sourceName, field.sourceId(), field.direction(), field.nullOrder())); + results.add( + visitor.hour(sourceName, field.sourceId(), field.direction(), field.nullOrder())); } else if (transform instanceof UnknownTransform) { - results.add(visitor.unknown( - sourceName, field.sourceId(), transform.toString(), field.direction(), field.nullOrder())); + results.add( + visitor.unknown( + sourceName, + field.sourceId(), + transform.toString(), + field.direction(), + field.nullOrder())); } } diff --git a/api/src/main/java/org/apache/iceberg/transforms/Timestamps.java b/api/src/main/java/org/apache/iceberg/transforms/Timestamps.java index 29f1b2dd7e19..476033707293 100644 --- a/api/src/main/java/org/apache/iceberg/transforms/Timestamps.java +++ b/api/src/main/java/org/apache/iceberg/transforms/Timestamps.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.transforms; import java.time.Instant; @@ -53,20 +52,21 @@ public Integer apply(Long timestampMicros) { } if (timestampMicros >= 0) { - OffsetDateTime timestamp = Instant - .ofEpochSecond( - Math.floorDiv(timestampMicros, 1_000_000), - Math.floorMod(timestampMicros, 1_000_000) * 1000) - .atOffset(ZoneOffset.UTC); + OffsetDateTime timestamp = + Instant.ofEpochSecond( + Math.floorDiv(timestampMicros, 1_000_000), + Math.floorMod(timestampMicros, 1_000_000) * 1000) + .atOffset(ZoneOffset.UTC); return (int) granularity.between(EPOCH, timestamp); } else { - // add 1 micro to the value to account for the case where there is exactly 1 unit between the timestamp and epoch + // add 1 micro to the value to account for the case where there is exactly 1 unit between the + // timestamp and epoch // because the result will always be decremented. - OffsetDateTime timestamp = Instant - .ofEpochSecond( - Math.floorDiv(timestampMicros, 1_000_000), - Math.floorMod(timestampMicros + 1, 1_000_000) * 1000) - .atOffset(ZoneOffset.UTC); + OffsetDateTime timestamp = + Instant.ofEpochSecond( + Math.floorDiv(timestampMicros, 1_000_000), + Math.floorMod(timestampMicros + 1, 1_000_000) * 1000) + .atOffset(ZoneOffset.UTC); return (int) granularity.between(EPOCH, timestamp) - 1; } } @@ -96,9 +96,11 @@ public boolean satisfiesOrderOf(Transform other) { } if (other instanceof Timestamps) { - // test the granularity, in hours. hour(ts) => 1 hour, day(ts) => 24 hours, and hour satisfies the order of day + // test the granularity, in hours. 
hour(ts) => 1 hour, day(ts) => 24 hours, and hour satisfies + // the order of day Timestamps otherTransform = (Timestamps) other; - return granularity.getDuration().toHours() <= otherTransform.granularity.getDuration().toHours(); + return granularity.getDuration().toHours() + <= otherTransform.granularity.getDuration().toHours(); } return false; @@ -114,11 +116,13 @@ public UnboundPredicate project(String fieldName, BoundPredicate return Expressions.predicate(pred.op(), fieldName); } else if (pred.isLiteralPredicate()) { - UnboundPredicate projected = ProjectionUtil.truncateLong(fieldName, pred.asLiteralPredicate(), this); + UnboundPredicate projected = + ProjectionUtil.truncateLong(fieldName, pred.asLiteralPredicate(), this); return ProjectionUtil.fixInclusiveTimeProjection(projected); } else if (pred.isSetPredicate() && pred.op() == Expression.Operation.IN) { - UnboundPredicate projected = ProjectionUtil.transformSet(fieldName, pred.asSetPredicate(), this); + UnboundPredicate projected = + ProjectionUtil.transformSet(fieldName, pred.asSetPredicate(), this); return ProjectionUtil.fixInclusiveTimeProjection(projected); } @@ -135,12 +139,13 @@ public UnboundPredicate projectStrict(String fieldName, BoundPredicate< return Expressions.predicate(pred.op(), fieldName); } else if (pred.isLiteralPredicate()) { - UnboundPredicate projected = ProjectionUtil.truncateLongStrict( - fieldName, pred.asLiteralPredicate(), this); + UnboundPredicate projected = + ProjectionUtil.truncateLongStrict(fieldName, pred.asLiteralPredicate(), this); return ProjectionUtil.fixStrictTimeProjection(projected); } else if (pred.isSetPredicate() && pred.op() == Expression.Operation.NOT_IN) { - UnboundPredicate projected = ProjectionUtil.transformSet(fieldName, pred.asSetPredicate(), this); + UnboundPredicate projected = + ProjectionUtil.transformSet(fieldName, pred.asSetPredicate(), this); return ProjectionUtil.fixStrictTimeProjection(projected); } diff --git a/api/src/main/java/org/apache/iceberg/transforms/Transform.java b/api/src/main/java/org/apache/iceberg/transforms/Transform.java index a107952906f0..9e61bf377e72 100644 --- a/api/src/main/java/org/apache/iceberg/transforms/Transform.java +++ b/api/src/main/java/org/apache/iceberg/transforms/Transform.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.transforms; import java.io.Serializable; @@ -26,8 +25,8 @@ /** * A transform function used for partitioning. - *
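// Worked sketch of the date/timestamp transforms above: they map values to ordinals counted from
// 1970-01-01, and negative inputs are shifted so that, for example, the last microsecond of 1969
// still lands in day -1 rather than day 0.
import org.apache.iceberg.transforms.Transform;
import org.apache.iceberg.transforms.Transforms;
import org.apache.iceberg.types.Types;

class TimeTransformExample {
  static void demo() {
    Transform<Long, Integer> day = Transforms.day(Types.TimestampType.withZone());

    int epochDay = day.apply(0L);   // 1970-01-01T00:00:00 -> 0
    int dayBefore = day.apply(-1L); // one microsecond before the epoch -> -1, not 0

    // toHumanString() renders the ordinal back into a readable date
    String readable = day.toHumanString(dayBefore); // "1969-12-31"
    System.out.printf("%d %d %s%n", epochDay, dayBefore, readable);
  }
}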

- * Implementations of this interface can be used to transform values, check or types, and project + * + *

Implementations of this interface can be used to transform values, check or types, and project * {@link BoundPredicate predicates} to predicates on partition values. * * @param Java class of source values @@ -60,8 +59,9 @@ public interface Transform extends Serializable { /** * Whether the transform preserves the order of values (is monotonic). - *

- * A transform preserves order for values when for any given a and b, if a < b then apply(a) <= apply(b). + * + *

A transform preserves order for values when for any given a and b, if a < b then apply(a) + * <= apply(b). * * @return true if the transform preserves the order of values */ @@ -70,10 +70,11 @@ default boolean preservesOrder() { } /** - * Whether ordering by this transform's result satisfies the ordering of another transform's result. - *

- * For example, sorting by day(ts) will produce an ordering that is also by month(ts) or year(ts). However, sorting - * by day(ts) will not satisfy the order of hour(ts) or identity(ts). + * Whether ordering by this transform's result satisfies the ordering of another transform's + * result. + * + *

For example, sorting by day(ts) will produce an ordering that is also by month(ts) or + * year(ts). However, sorting by day(ts) will not satisfy the order of hour(ts) or identity(ts). * * @return true if ordering by this transform is equivalent to ordering by the other transform */ @@ -82,10 +83,11 @@ default boolean satisfiesOrderOf(Transform other) { } /** - * Transforms a {@link BoundPredicate predicate} to an inclusive predicate on the partition - * values produced by {@link #apply(Object)}. - *
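// Small sketch of the ordering guarantees described above: day(ts) preserves order and also
// satisfies the ordering of the coarser month(ts) transform, but not the finer hour(ts).
import org.apache.iceberg.transforms.Transform;
import org.apache.iceberg.transforms.Transforms;
import org.apache.iceberg.types.Type;
import org.apache.iceberg.types.Types;

class OrderingExample {
  static void demo() {
    Type ts = Types.TimestampType.withZone();
    Transform<Long, Integer> day = Transforms.day(ts);

    boolean monotonic = day.preservesOrder();                         // true
    boolean coversMonth = day.satisfiesOrderOf(Transforms.month(ts)); // true
    boolean coversHour = day.satisfiesOrderOf(Transforms.hour(ts));   // false
    System.out.printf("%b %b %b%n", monotonic, coversMonth, coversHour);
  }
}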

- * This inclusive transform guarantees that if pred(v) is true, then projected(apply(v)) is true. + * Transforms a {@link BoundPredicate predicate} to an inclusive predicate on the partition values + * produced by {@link #apply(Object)}. + * + *

This inclusive transform guarantees that if pred(v) is true, then projected(apply(v)) is + * true. * * @param name the field name for partition values * @param predicate a predicate for source values @@ -96,8 +98,9 @@ default boolean satisfiesOrderOf(Transform other) { /** * Transforms a {@link BoundPredicate predicate} to a strict predicate on the partition values * produced by {@link #apply(Object)}. - *

- * This strict transform guarantees that if strict(apply(v)) is true, then pred(v) is also true. + * + *

This strict transform guarantees that if strict(apply(v)) is true, then pred(v) is also + * true. * * @param name the field name for partition values * @param predicate a predicate for source values @@ -116,8 +119,8 @@ default boolean isIdentity() { /** * Returns a human-readable String representation of a transformed value. - *
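// Hedged sketch of the inclusive vs. strict projections described above, using the day transform:
// project() yields a partition predicate matching every partition that may contain matching rows,
// while projectStrict() matches only partitions whose rows all match. The schema, field id, and
// partition column name are illustrative assumptions.
import org.apache.iceberg.Schema;
import org.apache.iceberg.expressions.BoundPredicate;
import org.apache.iceberg.expressions.Expressions;
import org.apache.iceberg.expressions.UnboundPredicate;
import org.apache.iceberg.transforms.Transform;
import org.apache.iceberg.transforms.Transforms;
import org.apache.iceberg.types.Types;

class ProjectionExample {
  @SuppressWarnings("unchecked")
  static void demo() {
    Schema schema =
        new Schema(Types.NestedField.required(1, "ts", Types.TimestampType.withZone()));
    Transform<Long, Integer> day = Transforms.day(Types.TimestampType.withZone());

    // row-level predicate bound to the schema: ts > 1970-01-02T00:00:00 (value in microseconds)
    BoundPredicate<Long> tsAfter =
        (BoundPredicate<Long>)
            Expressions.greaterThan("ts", 86_400_000_000L).bind(schema.asStruct(), true);

    // inclusive: any partition that may contain matching rows (ts_day >= 1)
    UnboundPredicate<Integer> inclusive = day.project("ts_day", tsAfter);
    // strict: only partitions where every row matches (ts_day > 1)
    UnboundPredicate<Integer> strict = day.projectStrict("ts_day", tsAfter);
    System.out.println(inclusive + " / " + strict);
  }
}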

- * null values will return "null" + * + *

null values will return "null" * * @param value a transformed value * @return a human-readable String representation of the value @@ -127,8 +130,8 @@ default String toHumanString(T value) { } /** - * Return the unique transform name to check if similar transforms for the same source field - * are added multiple times in partition spec builder. + * Return the unique transform name to check if similar transforms for the same source field are + * added multiple times in partition spec builder. * * @return a name used for dedup */ diff --git a/api/src/main/java/org/apache/iceberg/transforms/TransformUtil.java b/api/src/main/java/org/apache/iceberg/transforms/TransformUtil.java index 5e9cd5aa5403..53bc23a49888 100644 --- a/api/src/main/java/org/apache/iceberg/transforms/TransformUtil.java +++ b/api/src/main/java/org/apache/iceberg/transforms/TransformUtil.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.transforms; import java.nio.ByteBuffer; @@ -30,8 +29,7 @@ class TransformUtil { - private TransformUtil() { - } + private TransformUtil() {} private static final OffsetDateTime EPOCH = Instant.ofEpochSecond(0).atOffset(ZoneOffset.UTC); private static final int EPOCH_YEAR = EPOCH.getYear(); @@ -41,14 +39,15 @@ static String humanYear(int yearOrdinal) { } static String humanMonth(int monthOrdinal) { - return String.format("%04d-%02d", + return String.format( + "%04d-%02d", EPOCH_YEAR + Math.floorDiv(monthOrdinal, 12), 1 + Math.floorMod(monthOrdinal, 12)); } static String humanDay(int dayOrdinal) { OffsetDateTime day = EPOCH.plusDays(dayOrdinal); - return String.format("%04d-%02d-%02d", - day.getYear(), day.getMonth().getValue(), day.getDayOfMonth()); + return String.format( + "%04d-%02d-%02d", day.getYear(), day.getMonth().getValue(), day.getDayOfMonth()); } static String humanTime(Long microsFromMidnight) { @@ -65,7 +64,8 @@ static String humanTimestampWithoutZone(Long timestampMicros) { static String humanHour(int hourOrdinal) { OffsetDateTime time = EPOCH.plusHours(hourOrdinal); - return String.format("%04d-%02d-%02d-%02d", + return String.format( + "%04d-%02d-%02d-%02d", time.getYear(), time.getMonth().getValue(), time.getDayOfMonth(), time.getHour()); } diff --git a/api/src/main/java/org/apache/iceberg/transforms/Transforms.java b/api/src/main/java/org/apache/iceberg/transforms/Transforms.java index 15c6bf55ccf0..35ace7a80079 100644 --- a/api/src/main/java/org/apache/iceberg/transforms/Transforms.java +++ b/api/src/main/java/org/apache/iceberg/transforms/Transforms.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.transforms; import java.util.Locale; @@ -29,15 +28,14 @@ /** * Factory methods for transforms. - *
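// Brief sketch of the transform factory described in this class: most code goes through
// PartitionSpec.builderFor(schema), but transforms can also be created and applied directly. The
// truncate and bucket factories are assumed alongside the time-based ones shown here, and the
// width/bucket-count values are arbitrary examples.
import org.apache.iceberg.transforms.Transform;
import org.apache.iceberg.transforms.Transforms;
import org.apache.iceberg.types.Types;

class TransformFactoryExample {
  static void demo() {
    Transform<String, String> truncate3 = Transforms.truncate(Types.StringType.get(), 3);
    String prefix = truncate3.apply("iceberg"); // "ice"

    Transform<Integer, Integer> truncate10 = Transforms.truncate(Types.IntegerType.get(), 10);
    int low = truncate10.apply(-3); // -10: truncation always rounds toward negative infinity

    Transform<Long, Integer> bucket16 = Transforms.bucket(Types.LongType.get(), 16);
    int bucketId = bucket16.apply(1234L); // murmur3-based bucket in [0, 16)

    System.out.printf("%s %d %d%n", prefix, low, bucketId);
  }
}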

- * Most users should create transforms using a - * {@link PartitionSpec#builderFor(Schema)} partition spec builder}. + * + *

Most users should create transforms using a {@link PartitionSpec#builderFor(Schema)} partition + * spec builder}. * * @see PartitionSpec#builderFor(Schema) The partition spec builder. */ public class Transforms { - private Transforms() { - } + private Transforms() {} private static final Pattern HAS_WIDTH = Pattern.compile("(\\w+)\\[(\\d+)\\]"); @@ -100,8 +98,7 @@ public static Transform year(Type type) { case TIMESTAMP: return (Transform) Timestamps.YEAR; default: - throw new IllegalArgumentException( - "Cannot partition type " + type + " by year"); + throw new IllegalArgumentException("Cannot partition type " + type + " by year"); } } @@ -120,8 +117,7 @@ public static Transform month(Type type) { case TIMESTAMP: return (Transform) Timestamps.MONTH; default: - throw new IllegalArgumentException( - "Cannot partition type " + type + " by month"); + throw new IllegalArgumentException("Cannot partition type " + type + " by month"); } } @@ -140,8 +136,7 @@ public static Transform day(Type type) { case TIMESTAMP: return (Transform) Timestamps.DAY; default: - throw new IllegalArgumentException( - "Cannot partition type " + type + " by day"); + throw new IllegalArgumentException("Cannot partition type " + type + " by day"); } } @@ -154,8 +149,8 @@ public static Transform day(Type type) { */ @SuppressWarnings("unchecked") public static Transform hour(Type type) { - Preconditions.checkArgument(type.typeId() == Type.TypeID.TIMESTAMP, - "Cannot partition type %s by hour", type); + Preconditions.checkArgument( + type.typeId() == Type.TypeID.TIMESTAMP, "Cannot partition type %s by hour", type); return (Transform) Timestamps.HOUR; } diff --git a/api/src/main/java/org/apache/iceberg/transforms/Truncate.java b/api/src/main/java/org/apache/iceberg/transforms/Truncate.java index b00ec6b1d80f..f0ac2c63d033 100644 --- a/api/src/main/java/org/apache/iceberg/transforms/Truncate.java +++ b/api/src/main/java/org/apache/iceberg/transforms/Truncate.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
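For illustration, a minimal sketch of how the factory methods above are typically used; the timestamp literal is an arbitrary example value, and apply()/toHumanString() are the Transform methods documented earlier in this patch.

    import org.apache.iceberg.transforms.Transform;
    import org.apache.iceberg.transforms.Transforms;
    import org.apache.iceberg.types.Types;

    public class DayTransformExample {
      public static void main(String[] args) {
        // Partition a timestamp column by day: apply() maps microseconds from the Unix
        // epoch to a day ordinal; toHumanString() renders that ordinal for readers.
        Transform<Long, Integer> day = Transforms.day(Types.TimestampType.withoutZone());
        Integer dayOrdinal = day.apply(1_650_000_000_000_000L); // 2022-04-15T05:20:00Z in micros
        System.out.println(day.toHumanString(dayOrdinal));      // prints "2022-04-15"
      }
    }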
*/ - package org.apache.iceberg.transforms; import java.math.BigDecimal; @@ -37,8 +36,7 @@ abstract class Truncate implements Transform { @SuppressWarnings("unchecked") static Truncate get(Type type, int width) { - Preconditions.checkArgument(width > 0, - "Invalid truncate width: %s (must be > 0)", width); + Preconditions.checkArgument(width > 0, "Invalid truncate width: %s (must be > 0)", width); switch (type.typeId()) { case INTEGER: @@ -52,8 +50,7 @@ static Truncate get(Type type, int width) { case BINARY: return (Truncate) new TruncateByteBuffer(width); default: - throw new UnsupportedOperationException( - "Cannot truncate type: " + type); + throw new UnsupportedOperationException("Cannot truncate type: " + type); } } @@ -276,8 +273,8 @@ public boolean satisfiesOrderOf(Transform other) { } @Override - public UnboundPredicate project(String name, - BoundPredicate predicate) { + public UnboundPredicate project( + String name, BoundPredicate predicate) { if (predicate.term() instanceof BoundTransform) { return ProjectionUtil.projectTransformPredicate(this, name, predicate); } @@ -315,8 +312,8 @@ public UnboundPredicate project(String name, } @Override - public UnboundPredicate projectStrict(String name, - BoundPredicate predicate) { + public UnboundPredicate projectStrict( + String name, BoundPredicate predicate) { if (predicate.term() instanceof BoundTransform) { return ProjectionUtil.projectTransformPredicate(this, name, predicate); } @@ -405,8 +402,7 @@ public boolean canTransform(Type type) { } @Override - public UnboundPredicate project(String name, - BoundPredicate pred) { + public UnboundPredicate project(String name, BoundPredicate pred) { if (pred.term() instanceof BoundTransform) { return ProjectionUtil.projectTransformPredicate(this, name, pred); } @@ -422,8 +418,8 @@ public UnboundPredicate project(String name, } @Override - public UnboundPredicate projectStrict(String name, - BoundPredicate pred) { + public UnboundPredicate projectStrict( + String name, BoundPredicate pred) { if (pred.term() instanceof BoundTransform) { return ProjectionUtil.projectTransformPredicate(this, name, pred); } @@ -484,12 +480,14 @@ public BigDecimal apply(BigDecimal value) { return null; } - BigDecimal remainder = new BigDecimal( - value.unscaledValue() - .remainder(unscaledWidth) - .add(unscaledWidth) - .remainder(unscaledWidth), - value.scale()); + BigDecimal remainder = + new BigDecimal( + value + .unscaledValue() + .remainder(unscaledWidth) + .add(unscaledWidth) + .remainder(unscaledWidth), + value.scale()); return value.subtract(remainder); } @@ -500,8 +498,7 @@ public boolean canTransform(Type type) { } @Override - public UnboundPredicate project(String name, - BoundPredicate pred) { + public UnboundPredicate project(String name, BoundPredicate pred) { if (pred.term() instanceof BoundTransform) { return ProjectionUtil.projectTransformPredicate(this, name, pred); } @@ -517,8 +514,8 @@ public UnboundPredicate project(String name, } @Override - public UnboundPredicate projectStrict(String name, - BoundPredicate pred) { + public UnboundPredicate projectStrict( + String name, BoundPredicate pred) { if (pred.term() instanceof BoundTransform) { return ProjectionUtil.projectTransformPredicate(this, name, pred); } diff --git a/api/src/main/java/org/apache/iceberg/transforms/UnknownTransform.java b/api/src/main/java/org/apache/iceberg/transforms/UnknownTransform.java index 9c5c8daa24d2..95a30beac23e 100644 --- a/api/src/main/java/org/apache/iceberg/transforms/UnknownTransform.java +++ 
b/api/src/main/java/org/apache/iceberg/transforms/UnknownTransform.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.transforms; import java.util.Objects; @@ -31,19 +30,20 @@ public class UnknownTransform implements Transform { private final String transform; UnknownTransform(Type sourceType, String transform) { - this.sourceType - = sourceType; + this.sourceType = sourceType; this.transform = transform; } @Override public T apply(S value) { - throw new UnsupportedOperationException(String.format("Cannot apply unsupported transform: %s", transform)); + throw new UnsupportedOperationException( + String.format("Cannot apply unsupported transform: %s", transform)); } @Override public boolean canTransform(Type type) { - // assume the transform function can be applied for this type because unknown transform is only used when parsing + // assume the transform function can be applied for this type because unknown transform is only + // used when parsing // a transform in an existing table. a different Iceberg version must have already validated it. return this.sourceType.equals(type); } diff --git a/api/src/main/java/org/apache/iceberg/transforms/VoidTransform.java b/api/src/main/java/org/apache/iceberg/transforms/VoidTransform.java index d2ecbda13d90..83f7f76bf014 100644 --- a/api/src/main/java/org/apache/iceberg/transforms/VoidTransform.java +++ b/api/src/main/java/org/apache/iceberg/transforms/VoidTransform.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.transforms; import java.io.ObjectStreamException; @@ -32,8 +31,7 @@ static VoidTransform get() { return (VoidTransform) INSTANCE; } - private VoidTransform() { - } + private VoidTransform() {} @Override public Void apply(Object value) { diff --git a/api/src/main/java/org/apache/iceberg/types/AssignFreshIds.java b/api/src/main/java/org/apache/iceberg/types/AssignFreshIds.java index d55514ab270e..e58f76a8de56 100644 --- a/api/src/main/java/org/apache/iceberg/types/AssignFreshIds.java +++ b/api/src/main/java/org/apache/iceberg/types/AssignFreshIds.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.types; import java.util.Iterator; diff --git a/api/src/main/java/org/apache/iceberg/types/CheckCompatibility.java b/api/src/main/java/org/apache/iceberg/types/CheckCompatibility.java index 7b462a33e7f4..502e52c345e5 100644 --- a/api/src/main/java/org/apache/iceberg/types/CheckCompatibility.java +++ b/api/src/main/java/org/apache/iceberg/types/CheckCompatibility.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.types; import java.util.List; @@ -31,8 +30,8 @@ public class CheckCompatibility extends TypeUtil.CustomOrderSchemaVisitor> { /** - * Returns a list of compatibility errors for writing with the given write schema. - * This includes nullability: writing optional (nullable) values to a required field is an error. + * Returns a list of compatibility errors for writing with the given write schema. This includes + * nullability: writing optional (nullable) values to a required field is an error. 
* * @param readSchema a read schema * @param writeSchema a write schema @@ -43,39 +42,42 @@ public static List writeCompatibilityErrors(Schema readSchema, Schema wr } /** - * Returns a list of compatibility errors for writing with the given write schema. - * This includes nullability: writing optional (nullable) values to a required field is an error - * Optionally this method allows case where input schema has different ordering than table schema. + * Returns a list of compatibility errors for writing with the given write schema. This includes + * nullability: writing optional (nullable) values to a required field is an error Optionally this + * method allows case where input schema has different ordering than table schema. + * * @param readSchema a read schema * @param writeSchema a write schema * @param checkOrdering If false, allow input schema to have different ordering than table schema * @return a list of error details, or an empty list if there are no compatibility problems */ - public static List writeCompatibilityErrors(Schema readSchema, Schema writeSchema, boolean checkOrdering) { + public static List writeCompatibilityErrors( + Schema readSchema, Schema writeSchema, boolean checkOrdering) { return TypeUtil.visit(readSchema, new CheckCompatibility(writeSchema, checkOrdering, true)); } /** - * Returns a list of compatibility errors for writing with the given write schema. - * This checks type compatibility and not nullability: writing optional (nullable) values - * to a required field is not an error. To check nullability as well as types, - * Optionally this method allows case where input schema has different ordering than table schema. - * use {@link #writeCompatibilityErrors(Schema, Schema)}. + * Returns a list of compatibility errors for writing with the given write schema. This checks + * type compatibility and not nullability: writing optional (nullable) values to a required field + * is not an error. To check nullability as well as types, Optionally this method allows case + * where input schema has different ordering than table schema. use {@link + * #writeCompatibilityErrors(Schema, Schema)}. * * @param readSchema a read schema * @param writeSchema a write schema * @param checkOrdering If false, allow input schema to have different ordering than table schema * @return a list of error details, or an empty list if there are no compatibility problems */ - public static List typeCompatibilityErrors(Schema readSchema, Schema writeSchema, boolean checkOrdering) { + public static List typeCompatibilityErrors( + Schema readSchema, Schema writeSchema, boolean checkOrdering) { return TypeUtil.visit(readSchema, new CheckCompatibility(writeSchema, checkOrdering, false)); } /** - * Returns a list of compatibility errors for writing with the given write schema. - * This checks type compatibility and not nullability: writing optional (nullable) values - * to a required field is not an error. To check nullability as well as types, - * use {@link #writeCompatibilityErrors(Schema, Schema)}. + * Returns a list of compatibility errors for writing with the given write schema. This checks + * type compatibility and not nullability: writing optional (nullable) values to a required field + * is not an error. To check nullability as well as types, use {@link + * #writeCompatibilityErrors(Schema, Schema)}. 
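As a usage sketch of the two checks just described: the schemas below are invented for the example, while the CheckCompatibility calls are the methods reformatted in this file.

    import java.util.List;
    import org.apache.iceberg.Schema;
    import org.apache.iceberg.types.CheckCompatibility;
    import org.apache.iceberg.types.Types;

    public class CompatibilityCheckExample {
      public static void main(String[] args) {
        Schema table = new Schema(
            Types.NestedField.required(1, "id", Types.LongType.get()),
            Types.NestedField.required(2, "data", Types.StringType.get()));
        // The incoming data declares "data" as optional, a nullability violation.
        Schema incoming = new Schema(
            Types.NestedField.required(1, "id", Types.LongType.get()),
            Types.NestedField.optional(2, "data", Types.StringType.get()));

        // Checks types and nullability: reports the optional-into-required problem.
        List<String> writeErrors = CheckCompatibility.writeCompatibilityErrors(table, incoming);

        // Checks types only (ordering check relaxed): the same schemas pass cleanly.
        List<String> typeErrors = CheckCompatibility.typeCompatibilityErrors(table, incoming, false);

        System.out.println(writeErrors);
        System.out.println(typeErrors); // []
      }
    }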
* * @param readSchema a read schema * @param writeSchema a write schema @@ -255,13 +257,15 @@ public List primitive(Type.PrimitiveType readPrimitive) { } if (!currentType.isPrimitiveType()) { - return ImmutableList.of(String.format(": %s cannot be read as a %s", - currentType.typeId().toString().toLowerCase(Locale.ENGLISH), readPrimitive)); + return ImmutableList.of( + String.format( + ": %s cannot be read as a %s", + currentType.typeId().toString().toLowerCase(Locale.ENGLISH), readPrimitive)); } if (!TypeUtil.isPromotionAllowed(currentType.asPrimitiveType(), readPrimitive)) { - return ImmutableList.of(String.format(": %s cannot be promoted to %s", - currentType, readPrimitive)); + return ImmutableList.of( + String.format(": %s cannot be promoted to %s", currentType, readPrimitive)); } // both are primitives and promotion is allowed to the read type diff --git a/api/src/main/java/org/apache/iceberg/types/Comparators.java b/api/src/main/java/org/apache/iceberg/types/Comparators.java index d876815ea9cc..5feb4cfd259f 100644 --- a/api/src/main/java/org/apache/iceberg/types/Comparators.java +++ b/api/src/main/java/org/apache/iceberg/types/Comparators.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.types; import java.nio.ByteBuffer; @@ -29,24 +28,23 @@ public class Comparators { - private Comparators() { - } - - private static final ImmutableMap> COMPARATORS = ImmutableMap - .>builder() - .put(Types.BooleanType.get(), Comparator.naturalOrder()) - .put(Types.IntegerType.get(), Comparator.naturalOrder()) - .put(Types.LongType.get(), Comparator.naturalOrder()) - .put(Types.FloatType.get(), Comparator.naturalOrder()) - .put(Types.DoubleType.get(), Comparator.naturalOrder()) - .put(Types.DateType.get(), Comparator.naturalOrder()) - .put(Types.TimeType.get(), Comparator.naturalOrder()) - .put(Types.TimestampType.withZone(), Comparator.naturalOrder()) - .put(Types.TimestampType.withoutZone(), Comparator.naturalOrder()) - .put(Types.StringType.get(), Comparators.charSequences()) - .put(Types.UUIDType.get(), Comparator.naturalOrder()) - .put(Types.BinaryType.get(), Comparators.unsignedBytes()) - .build(); + private Comparators() {} + + private static final ImmutableMap> COMPARATORS = + ImmutableMap.>builder() + .put(Types.BooleanType.get(), Comparator.naturalOrder()) + .put(Types.IntegerType.get(), Comparator.naturalOrder()) + .put(Types.LongType.get(), Comparator.naturalOrder()) + .put(Types.FloatType.get(), Comparator.naturalOrder()) + .put(Types.DoubleType.get(), Comparator.naturalOrder()) + .put(Types.DateType.get(), Comparator.naturalOrder()) + .put(Types.TimeType.get(), Comparator.naturalOrder()) + .put(Types.TimestampType.withZone(), Comparator.naturalOrder()) + .put(Types.TimestampType.withoutZone(), Comparator.naturalOrder()) + .put(Types.StringType.get(), Comparators.charSequences()) + .put(Types.UUIDType.get(), Comparator.naturalOrder()) + .put(Types.BinaryType.get(), Comparators.unsignedBytes()) + .build(); public static Comparator forType(Types.StructType struct) { return new StructLikeComparator(struct); @@ -88,15 +86,18 @@ private static class StructLikeComparator implements Comparator { private final Class[] classes; private StructLikeComparator(Types.StructType struct) { - this.comparators = struct.fields().stream() - .map(field -> field.isOptional() ? 
- Comparators.nullsFirst().thenComparing(internal(field.type())) : - internal(field.type()) - ) - .toArray((IntFunction[]>) Comparator[]::new); - this.classes = struct.fields().stream() - .map(field -> field.type().typeId().javaClass()) - .toArray(Class[]::new); + this.comparators = + struct.fields().stream() + .map( + field -> + field.isOptional() + ? Comparators.nullsFirst().thenComparing(internal(field.type())) + : internal(field.type())) + .toArray((IntFunction[]>) Comparator[]::new); + this.classes = + struct.fields().stream() + .map(field -> field.type().typeId().javaClass()) + .toArray(Class[]::new); } @Override @@ -122,9 +123,10 @@ private static class ListComparator implements Comparator> { private ListComparator(Types.ListType list) { Comparator elemComparator = internal(list.elementType()); - this.elementComparator = list.isElementOptional() ? - Comparators.nullsFirst().thenComparing(elemComparator) : - elemComparator; + this.elementComparator = + list.isElementOptional() + ? Comparators.nullsFirst().thenComparing(elemComparator) + : elemComparator; } @Override @@ -174,8 +176,7 @@ public static Comparator charSequences() { private static class NullsFirst implements Comparator { private static final NullsFirst INSTANCE = new NullsFirst<>(); - private NullsFirst() { - } + private NullsFirst() {} @Override public int compare(T o1, T o2) { @@ -202,8 +203,7 @@ public Comparator thenComparing(Comparator other) { private static class NullsLast implements Comparator { private static final NullsLast INSTANCE = new NullsLast<>(); - private NullsLast() { - } + private NullsLast() {} @Override public int compare(T o1, T o2) { @@ -253,8 +253,7 @@ public int compare(T o1, T o2) { private static class UnsignedByteBufComparator implements Comparator { private static final UnsignedByteBufComparator INSTANCE = new UnsignedByteBufComparator(); - private UnsignedByteBufComparator() { - } + private UnsignedByteBufComparator() {} @Override public int compare(ByteBuffer buf1, ByteBuffer buf2) { @@ -269,9 +268,8 @@ public int compare(ByteBuffer buf1, ByteBuffer buf2) { int b2pos = buf2.position(); for (int i = 0; i < len; i += 1) { // Conversion to int is what Byte.toUnsignedInt would do - int cmp = Integer.compare( - ((int) buf1.get(b1pos + i)) & 0xff, - ((int) buf2.get(b2pos + i)) & 0xff); + int cmp = + Integer.compare(((int) buf1.get(b1pos + i)) & 0xff, ((int) buf2.get(b2pos + i)) & 0xff); if (cmp != 0) { return cmp; } @@ -285,8 +283,7 @@ public int compare(ByteBuffer buf1, ByteBuffer buf2) { private static class UnsignedByteArrayComparator implements Comparator { private static final UnsignedByteArrayComparator INSTANCE = new UnsignedByteArrayComparator(); - private UnsignedByteArrayComparator() { - } + private UnsignedByteArrayComparator() {} @Override public int compare(byte[] array1, byte[] array2) { @@ -313,16 +310,16 @@ public int compare(byte[] array1, byte[] array2) { private static class CharSeqComparator implements Comparator { private static final CharSeqComparator INSTANCE = new CharSeqComparator(); - private CharSeqComparator() { - } + private CharSeqComparator() {} /** - * Java character supports only upto 3 byte UTF-8 characters. 4 byte UTF-8 character is represented using two Java - * characters (using UTF-16 surrogate pairs). Character by character comparison may yield incorrect results - * while comparing a 4 byte UTF-8 character to a java char. 
Character by character comparison works as expected - * if both characters are <= 3 byte UTF-8 character or both characters are 4 byte UTF-8 characters. - * isCharInUTF16HighSurrogateRange method detects a 4-byte character and considers that character to be - * lexicographically greater than any 3 byte or lower UTF-8 character. + * Java character supports only upto 3 byte UTF-8 characters. 4 byte UTF-8 character is + * represented using two Java characters (using UTF-16 surrogate pairs). Character by character + * comparison may yield incorrect results while comparing a 4 byte UTF-8 character to a java + * char. Character by character comparison works as expected if both characters are <= 3 byte + * UTF-8 character or both characters are 4 byte UTF-8 characters. + * isCharInUTF16HighSurrogateRange method detects a 4-byte character and considers that + * character to be lexicographically greater than any 3 byte or lower UTF-8 character. */ @Override public int compare(CharSequence s1, CharSequence s2) { diff --git a/api/src/main/java/org/apache/iceberg/types/Conversions.java b/api/src/main/java/org/apache/iceberg/types/Conversions.java index d6ef24449b82..1d2539514954 100644 --- a/api/src/main/java/org/apache/iceberg/types/Conversions.java +++ b/api/src/main/java/org/apache/iceberg/types/Conversions.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.types; import java.math.BigDecimal; @@ -36,8 +35,7 @@ public class Conversions { - private Conversions() { - } + private Conversions() {} private static final String HIVE_NULL = "__HIVE_DEFAULT_PARTITION__"; @@ -63,8 +61,7 @@ public static Object fromPartitionString(Type type, String asString) { return UUID.fromString(asString); case FIXED: Types.FixedType fixed = (Types.FixedType) type; - return Arrays.copyOf( - asString.getBytes(StandardCharsets.UTF_8), fixed.length()); + return Arrays.copyOf(asString.getBytes(StandardCharsets.UTF_8), fixed.length()); case BINARY: return asString.getBytes(StandardCharsets.UTF_8); case DECIMAL: diff --git a/api/src/main/java/org/apache/iceberg/types/FindTypeVisitor.java b/api/src/main/java/org/apache/iceberg/types/FindTypeVisitor.java index 4febeb7e3685..d1dc4adc214a 100644 --- a/api/src/main/java/org/apache/iceberg/types/FindTypeVisitor.java +++ b/api/src/main/java/org/apache/iceberg/types/FindTypeVisitor.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.types; import java.util.List; diff --git a/api/src/main/java/org/apache/iceberg/types/GetProjectedIds.java b/api/src/main/java/org/apache/iceberg/types/GetProjectedIds.java index 985663bf224e..a8a7de065ece 100644 --- a/api/src/main/java/org/apache/iceberg/types/GetProjectedIds.java +++ b/api/src/main/java/org/apache/iceberg/types/GetProjectedIds.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.types; import java.util.List; diff --git a/api/src/main/java/org/apache/iceberg/types/IndexById.java b/api/src/main/java/org/apache/iceberg/types/IndexById.java index 4670ce96a3ca..40280c5ed9dd 100644 --- a/api/src/main/java/org/apache/iceberg/types/IndexById.java +++ b/api/src/main/java/org/apache/iceberg/types/IndexById.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
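A small sketch of the CharSequence comparator whose contract is described above; the string values are arbitrary examples.

    import java.util.Comparator;
    import org.apache.iceberg.types.Comparators;

    public class CharSequenceCompareExample {
      public static void main(String[] args) {
        Comparator<CharSequence> cmp = Comparators.charSequences();

        // Comparison is character by character, so different CharSequence
        // implementations with the same content compare as equal.
        System.out.println(cmp.compare("iceberg", new StringBuilder("iceberg"))); // 0

        // A 4-byte UTF-8 character (a surrogate pair in Java) sorts after any
        // 3-byte-or-smaller character, matching unsigned UTF-8 byte ordering.
        System.out.println(cmp.compare("a", "\uD83D\uDE00") < 0); // true
      }
    }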
*/ - package org.apache.iceberg.types; import java.util.List; @@ -28,7 +27,8 @@ class IndexById extends TypeUtil.SchemaVisitor> private final Map index = Maps.newHashMap(); @Override - public Map schema(Schema schema, Map structResult) { + public Map schema( + Schema schema, Map structResult) { return index; } @@ -46,7 +46,8 @@ public Map field( } @Override - public Map list(Types.ListType list, Map elementResult) { + public Map list( + Types.ListType list, Map elementResult) { for (Types.NestedField field : list.fields()) { index.put(field.fieldId(), field); } @@ -55,7 +56,9 @@ public Map list(Types.ListType list, Map map( - Types.MapType map, Map keyResult, Map valueResult) { + Types.MapType map, + Map keyResult, + Map valueResult) { for (Types.NestedField field : map.fields()) { index.put(field.fieldId(), field); } diff --git a/api/src/main/java/org/apache/iceberg/types/IndexByName.java b/api/src/main/java/org/apache/iceberg/types/IndexByName.java index a2f265233f32..9183ea85f467 100644 --- a/api/src/main/java/org/apache/iceberg/types/IndexByName.java +++ b/api/src/main/java/org/apache/iceberg/types/IndexByName.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.types; import java.util.Deque; @@ -51,9 +50,10 @@ public IndexByName(Function quotingFunc) { /** * Returns a mapping from full field name to ID. - *

- * Short names for maps and lists are included for any name that does not conflict with a canonical name. For example, - * a list, 'l', of structs with field 'x' will produce short name 'l.x' in addition to canonical name 'l.element.x'. + * + *

Short names for maps and lists are included for any name that does not conflict with a + * canonical name. For example, a list, 'l', of structs with field 'x' will produce short name + * 'l.x' in addition to canonical name 'l.element.x'. * * @return a map from name to field ID */ @@ -69,8 +69,9 @@ public Map byName() { /** * Returns a mapping from field ID to full name. - *

- * Canonical names, not short names are returned, for example 'list.element.field' instead of 'list.field'. + * + *

Canonical names, not short names are returned, for example 'list.element.field' instead of + * 'list.field'. * * @return a map from field ID to name */ @@ -96,7 +97,8 @@ public void afterField(Types.NestedField field) { public void beforeListElement(Types.NestedField elementField) { fieldNames.push(elementField.name()); - // only add "element" to the short name if the element is not a struct, so that names are more natural + // only add "element" to the short name if the element is not a struct, so that names are more + // natural // for example, locations.latitude instead of locations.element.latitude if (!elementField.type().isStructType()) { shortFieldNames.push(elementField.name()); @@ -149,7 +151,8 @@ public Map schema(Schema schema, Map structRes } @Override - public Map struct(Types.StructType struct, List> fieldResults) { + public Map struct( + Types.StructType struct, List> fieldResults) { return nameToId; } @@ -166,7 +169,8 @@ public Map list(Types.ListType list, Map eleme } @Override - public Map map(Types.MapType map, Map keyResult, Map valueResult) { + public Map map( + Types.MapType map, Map keyResult, Map valueResult) { addField("key", map.keyId()); addField("value", map.valueId()); return nameToId; @@ -182,19 +186,23 @@ private void addField(String name, int fieldId) { String fullName = quotedName; if (!fieldNames.isEmpty()) { - Iterator quotedFieldNames = Iterators.transform(fieldNames.descendingIterator(), quotingFunc::apply); + Iterator quotedFieldNames = + Iterators.transform(fieldNames.descendingIterator(), quotingFunc::apply); fullName = DOT.join(DOT.join(quotedFieldNames), quotedName); } Integer existingFieldId = nameToId.put(fullName, fieldId); - ValidationException.check(existingFieldId == null, - "Invalid schema: multiple fields for name %s: %s and %s", fullName, existingFieldId, fieldId); + ValidationException.check( + existingFieldId == null, + "Invalid schema: multiple fields for name %s: %s and %s", + fullName, + existingFieldId, + fieldId); // also track the short name, if this is a nested field if (!shortFieldNames.isEmpty()) { - Iterator quotedShortFieldNames = Iterators.transform( - shortFieldNames.descendingIterator(), - quotingFunc::apply); + Iterator quotedShortFieldNames = + Iterators.transform(shortFieldNames.descendingIterator(), quotingFunc::apply); String shortName = DOT.join(DOT.join(quotedShortFieldNames), quotedName); if (!shortNameToId.containsKey(shortName)) { shortNameToId.put(shortName, fieldId); diff --git a/api/src/main/java/org/apache/iceberg/types/IndexParents.java b/api/src/main/java/org/apache/iceberg/types/IndexParents.java index 8224a775ed5a..bcd1e1ee900c 100644 --- a/api/src/main/java/org/apache/iceberg/types/IndexParents.java +++ b/api/src/main/java/org/apache/iceberg/types/IndexParents.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.types; import java.util.Deque; @@ -46,7 +45,8 @@ public Map schema(Schema schema, Map structR } @Override - public Map struct(Types.StructType struct, List> fieldResults) { + public Map struct( + Types.StructType struct, List> fieldResults) { for (Types.NestedField field : struct.fields()) { Integer parentId = idStack.peek(); if (parentId != null) { @@ -69,7 +69,8 @@ public Map list(Types.ListType list, Map ele } @Override - public Map map(Types.MapType map, Map key, Map value) { + public Map map( + Types.MapType map, Map key, Map value) { idToParent.put(map.keyId(), idStack.peek()); idToParent.put(map.valueId(), idStack.peek()); return idToParent; diff --git a/api/src/main/java/org/apache/iceberg/types/JavaHash.java b/api/src/main/java/org/apache/iceberg/types/JavaHash.java index d64fc1c1e67c..1988a90322e4 100644 --- a/api/src/main/java/org/apache/iceberg/types/JavaHash.java +++ b/api/src/main/java/org/apache/iceberg/types/JavaHash.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.types; import java.util.Objects; diff --git a/api/src/main/java/org/apache/iceberg/types/JavaHashes.java b/api/src/main/java/org/apache/iceberg/types/JavaHashes.java index 55124b4fce8c..f495a50a7301 100644 --- a/api/src/main/java/org/apache/iceberg/types/JavaHashes.java +++ b/api/src/main/java/org/apache/iceberg/types/JavaHashes.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.types; import java.util.List; @@ -24,8 +23,7 @@ import org.apache.iceberg.StructLike; public class JavaHashes { - private JavaHashes() { - } + private JavaHashes() {} public static int hashCode(CharSequence str) { int result = 177; @@ -51,16 +49,17 @@ static JavaHash> list(Types.ListType list) { private static class CharSequenceHash implements JavaHash { private static final CharSequenceHash INSTANCE = new CharSequenceHash(); - private CharSequenceHash() { - } + private CharSequenceHash() {} @Override public int hash(Object str) { if (str instanceof CharSequence) { return JavaHashes.hashCode((CharSequence) str); } else if (str != null) { - // UnknownTransform results are assumed to be string, the most generic type. But there is no guarantee that the - // values actually are strings so this can receive non-string values to hash. To get a consistent hash code for + // UnknownTransform results are assumed to be string, the most generic type. But there is no + // guarantee that the + // values actually are strings so this can receive non-string values to hash. To get a + // consistent hash code for // those values, convert to string an hash the string. 
return JavaHashes.hashCode(str.toString()); } @@ -73,10 +72,11 @@ private static class StructLikeHash implements JavaHash { private final JavaHash[] hashes; private StructLikeHash(Types.StructType struct) { - this.hashes = struct.fields().stream() - .map(Types.NestedField::type) - .map(JavaHash::forType) - .toArray((IntFunction[]>) JavaHash[]::new); + this.hashes = + struct.fields().stream() + .map(Types.NestedField::type) + .map(JavaHash::forType) + .toArray((IntFunction[]>) JavaHash[]::new); } @Override diff --git a/api/src/main/java/org/apache/iceberg/types/PrimitiveHolder.java b/api/src/main/java/org/apache/iceberg/types/PrimitiveHolder.java index 5571babdc1d5..42f0da38167d 100644 --- a/api/src/main/java/org/apache/iceberg/types/PrimitiveHolder.java +++ b/api/src/main/java/org/apache/iceberg/types/PrimitiveHolder.java @@ -16,23 +16,17 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.types; import java.io.ObjectStreamException; import java.io.Serializable; -/** - * Replacement for primitive types in Java Serialization. - */ +/** Replacement for primitive types in Java Serialization. */ class PrimitiveHolder implements Serializable { private String typeAsString = null; - /** - * Constructor for Java serialization. - */ - PrimitiveHolder() { - } + /** Constructor for Java serialization. */ + PrimitiveHolder() {} PrimitiveHolder(String typeAsString) { this.typeAsString = typeAsString; diff --git a/api/src/main/java/org/apache/iceberg/types/PruneColumns.java b/api/src/main/java/org/apache/iceberg/types/PruneColumns.java index 2944ec7bb5c0..daf2e6bbc0ca 100644 --- a/api/src/main/java/org/apache/iceberg/types/PruneColumns.java +++ b/api/src/main/java/org/apache/iceberg/types/PruneColumns.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.types; import java.util.List; @@ -34,8 +33,8 @@ class PruneColumns extends TypeUtil.SchemaVisitor { /** * Visits a schema and returns only the fields selected by the id set. - *

- * When selectFullTypes is false selecting list or map types is undefined and forbidden. + * + *

When selectFullTypes is false selecting list or map types is undefined and forbidden. * * @param selected ids of elements to return * @param selectFullTypes whether to select all subfields of a selected nested type @@ -68,10 +67,12 @@ public Type struct(Types.StructType struct, List fieldResults) { sameTypes = false; // signal that some types were altered if (field.isOptional()) { selectedFields.add( - Types.NestedField.optional(field.fieldId(), field.name(), projectedType, field.doc())); + Types.NestedField.optional( + field.fieldId(), field.name(), projectedType, field.doc())); } else { selectedFields.add( - Types.NestedField.required(field.fieldId(), field.name(), projectedType, field.doc())); + Types.NestedField.required( + field.fieldId(), field.name(), projectedType, field.doc())); } } } @@ -95,9 +96,12 @@ public Type field(Types.NestedField field, Type fieldResult) { } else if (field.type().isStructType()) { return projectSelectedStruct(fieldResult); } else { - Preconditions.checkArgument(!field.type().isNestedType(), + Preconditions.checkArgument( + !field.type().isNestedType(), "Cannot explicitly project List or Map types, %s:%s of type %s was selected", - field.fieldId(), field.name(), field.type()); + field.fieldId(), + field.name(), + field.type()); // Selected non-struct field return field.type(); } @@ -117,9 +121,11 @@ public Type list(Types.ListType list, Type elementResult) { StructType projectedStruct = projectSelectedStruct(elementResult); return projectList(list, projectedStruct); } else { - Preconditions.checkArgument(list.elementType().isPrimitiveType(), + Preconditions.checkArgument( + list.elementType().isPrimitiveType(), "Cannot explicitly project List or Map types, List element %s of type %s was selected", - list.elementId(), list.elementType()); + list.elementId(), + list.elementType()); return list; } } else if (elementResult != null) { @@ -137,9 +143,11 @@ public Type map(Types.MapType map, Type ignored, Type valueResult) { Type projectedStruct = projectSelectedStruct(valueResult); return projectMap(map, projectedStruct); } else { - Preconditions.checkArgument(map.valueType().isPrimitiveType(), + Preconditions.checkArgument( + map.valueType().isPrimitiveType(), "Cannot explicitly project List or Map types, Map value %s of type %s was selected", - map.valueId(), map.valueType()); + map.valueId(), + map.valueType()); return map; } } else if (valueResult != null) { @@ -157,7 +165,8 @@ public Type primitive(Type.PrimitiveType primitive) { } private ListType projectList(ListType list, Type elementResult) { - Preconditions.checkArgument(elementResult != null, "Cannot project a list when the element result is null"); + Preconditions.checkArgument( + elementResult != null, "Cannot project a list when the element result is null"); if (list.elementType() == elementResult) { return list; } else if (list.isElementOptional()) { @@ -168,7 +177,8 @@ private ListType projectList(ListType list, Type elementResult) { } private MapType projectMap(MapType map, Type valueResult) { - Preconditions.checkArgument(valueResult != null, "Attempted to project a map without a defined map value type"); + Preconditions.checkArgument( + valueResult != null, "Attempted to project a map without a defined map value type"); if (map.valueType() == valueResult) { return map; } else if (map.isValueOptional()) { @@ -181,6 +191,7 @@ private MapType projectMap(MapType map, Type valueResult) { /** * If select full types is disabled we need to recreate the struct with only the selected * subfields. 
If no subfields are selected we return an empty struct. + * * @param projectedField subfields already selected in this projection * @return projected struct */ diff --git a/api/src/main/java/org/apache/iceberg/types/ReassignIds.java b/api/src/main/java/org/apache/iceberg/types/ReassignIds.java index f35012de0891..3db43dafebe5 100644 --- a/api/src/main/java/org/apache/iceberg/types/ReassignIds.java +++ b/api/src/main/java/org/apache/iceberg/types/ReassignIds.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.types; import java.util.List; diff --git a/api/src/main/java/org/apache/iceberg/types/Type.java b/api/src/main/java/org/apache/iceberg/types/Type.java index 8a5c9b66a9a9..05a253ef6d7b 100644 --- a/api/src/main/java/org/apache/iceberg/types/Type.java +++ b/api/src/main/java/org/apache/iceberg/types/Type.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.types; import java.io.ObjectStreamException; @@ -132,5 +131,4 @@ public NestedType asNestedType() { public abstract Types.NestedField field(int id); } - } diff --git a/api/src/main/java/org/apache/iceberg/types/TypeUtil.java b/api/src/main/java/org/apache/iceberg/types/TypeUtil.java index e4791ee02cdc..a718e51a9536 100644 --- a/api/src/main/java/org/apache/iceberg/types/TypeUtil.java +++ b/api/src/main/java/org/apache/iceberg/types/TypeUtil.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.types; import java.util.Collections; @@ -41,15 +40,15 @@ public class TypeUtil { - private TypeUtil() { - } + private TypeUtil() {} /** * Project extracts particular fields from a schema by ID. - *

- * Unlike {@link TypeUtil#select(Schema, Set)}, project will pick out only the fields enumerated. Structs that are - * explicitly projected are empty unless sub-fields are explicitly projected. Maps and lists cannot be explicitly - * selected in fieldIds. + * + *

Unlike {@link TypeUtil#select(Schema, Set)}, project will pick out only the fields + * enumerated. Structs that are explicitly projected are empty unless sub-fields are explicitly + * projected. Maps and lists cannot be explicitly selected in fieldIds. + * * @param schema to project fields from * @param fieldIds list of explicit fields to extract * @return the schema with all fields fields not selected removed @@ -161,8 +160,8 @@ public static Map indexNameById(Types.StructType struct) { return indexer.byId(); } - public static Map indexQuotedNameById(Types.StructType struct, - Function quotingFunc) { + public static Map indexQuotedNameById( + Types.StructType struct, Function quotingFunc) { IndexByName indexer = new IndexByName(quotingFunc); visit(struct, indexer); return indexer.byId(); @@ -170,8 +169,9 @@ public static Map indexQuotedNameById(Types.StructType struct, public static Map indexByLowerCaseName(Types.StructType struct) { Map indexByLowerCaseName = Maps.newHashMap(); - indexByName(struct).forEach((name, integer) -> - indexByLowerCaseName.put(name.toLowerCase(Locale.ROOT), integer)); + indexByName(struct) + .forEach( + (name, integer) -> indexByLowerCaseName.put(name.toLowerCase(Locale.ROOT), integer)); return indexByLowerCaseName; } @@ -202,7 +202,8 @@ public static Type assignFreshIds(Type type, NextID nextId) { * @return a structurally identical schema with new ids assigned by the nextId function */ public static Schema assignFreshIds(Schema schema, NextID nextId) { - Types.StructType struct = TypeUtil.visit(schema.asStruct(), new AssignFreshIds(nextId)).asStructType(); + Types.StructType struct = + TypeUtil.visit(schema.asStruct(), new AssignFreshIds(nextId)).asStructType(); return new Schema(struct.fields(), refreshIdentifierFields(struct, schema)); } @@ -215,12 +216,14 @@ public static Schema assignFreshIds(Schema schema, NextID nextId) { * @return a structurally identical schema with new ids assigned by the nextId function */ public static Schema assignFreshIds(int schemaId, Schema schema, NextID nextId) { - Types.StructType struct = TypeUtil.visit(schema.asStruct(), new AssignFreshIds(nextId)).asStructType(); + Types.StructType struct = + TypeUtil.visit(schema.asStruct(), new AssignFreshIds(nextId)).asStructType(); return new Schema(schemaId, struct.fields(), refreshIdentifierFields(struct, schema)); } /** - * Assigns ids to match a given schema, and fresh ids from the {@link NextID nextId function} for all other fields. + * Assigns ids to match a given schema, and fresh ids from the {@link NextID nextId function} for + * all other fields. * * @param schema a schema * @param baseSchema a schema with existing IDs to copy by name @@ -228,23 +231,31 @@ public static Schema assignFreshIds(int schemaId, Schema schema, NextID nextId) * @return a structurally identical schema with new ids assigned by the nextId function */ public static Schema assignFreshIds(Schema schema, Schema baseSchema, NextID nextId) { - Types.StructType struct = TypeUtil - .visit(schema.asStruct(), new AssignFreshIds(schema, baseSchema, nextId)) - .asStructType(); + Types.StructType struct = + TypeUtil.visit(schema.asStruct(), new AssignFreshIds(schema, baseSchema, nextId)) + .asStructType(); return new Schema(struct.fields(), refreshIdentifierFields(struct, schema)); } /** - * Get the identifier fields in the fresh schema based on the identifier fields in the base schema. + * Get the identifier fields in the fresh schema based on the identifier fields in the base + * schema. 
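For context, a minimal sketch of the ID-reassignment path these methods implement; the incoming schema and its field IDs are invented for the example.

    import java.util.concurrent.atomic.AtomicInteger;
    import org.apache.iceberg.Schema;
    import org.apache.iceberg.types.TypeUtil;
    import org.apache.iceberg.types.Types;

    public class FreshIdsExample {
      public static void main(String[] args) {
        // A schema as it might arrive from an engine, with arbitrary field IDs.
        Schema incoming = new Schema(
            Types.NestedField.required(100, "id", Types.LongType.get()),
            Types.NestedField.optional(200, "data", Types.StringType.get()));

        // Reassign IDs 1, 2, ... through the NextID hook; identifier fields, if any,
        // are re-resolved by name against the new IDs, as described above.
        AtomicInteger counter = new AtomicInteger(0);
        Schema fresh = TypeUtil.assignFreshIds(incoming, counter::incrementAndGet);
        System.out.println(fresh);
      }
    }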
+ * * @param freshSchema fresh schema * @param baseSchema base schema * @return identifier fields in the fresh schema */ - public static Set refreshIdentifierFields(Types.StructType freshSchema, Schema baseSchema) { + public static Set refreshIdentifierFields( + Types.StructType freshSchema, Schema baseSchema) { Map nameToId = TypeUtil.indexByName(freshSchema); Set identifierFieldNames = baseSchema.identifierFieldNames(); - identifierFieldNames.forEach(name -> Preconditions.checkArgument(nameToId.containsKey(name), - "Cannot find ID for identifier field %s in schema %s", name, freshSchema)); + identifierFieldNames.forEach( + name -> + Preconditions.checkArgument( + nameToId.containsKey(name), + "Cannot find ID for identifier field %s in schema %s", + name, + freshSchema)); return identifierFieldNames.stream().map(nameToId::get).collect(Collectors.toSet()); } @@ -261,11 +272,11 @@ public static Schema assignIncreasingFreshIds(Schema schema) { /** * Reassigns ids in a schema from another schema. - *

- * Ids are determined by field names. If a field in the schema cannot be found in the source + * + *

Ids are determined by field names. If a field in the schema cannot be found in the source * schema, this will throw IllegalArgumentException. - *

- * This will not alter a schema's structure, nullability, or types. + * + *

This will not alter a schema's structure, nullability, or types. * * @param schema the schema to have ids reassigned * @param idSourceSchema the schema from which field ids will be used @@ -279,7 +290,8 @@ public static Schema reassignIds(Schema schema, Schema idSourceSchema) { public static Schema reassignOrRefreshIds(Schema schema, Schema idSourceSchema) { AtomicInteger highest = new AtomicInteger(schema.highestFieldId()); - Types.StructType struct = visit(schema, new ReassignIds(idSourceSchema, highest::incrementAndGet)).asStructType(); + Types.StructType struct = + visit(schema, new ReassignIds(idSourceSchema, highest::incrementAndGet)).asStructType(); return new Schema(struct.fields(), refreshIdentifierFields(struct, schema)); } @@ -308,8 +320,8 @@ public static boolean isPromotionAllowed(Type from, Type.PrimitiveType to) { } Types.DecimalType toDecimal = (Types.DecimalType) to; - return fromDecimal.scale() == toDecimal.scale() && - fromDecimal.precision() <= toDecimal.precision(); + return fromDecimal.scale() == toDecimal.scale() + && fromDecimal.precision() <= toDecimal.precision(); } return false; @@ -318,13 +330,14 @@ public static boolean isPromotionAllowed(Type from, Type.PrimitiveType to) { /** * Check whether we could write the iceberg table with the user-provided write schema. * - * @param tableSchema the table schema written in iceberg meta data. - * @param writeSchema the user-provided write schema. + * @param tableSchema the table schema written in iceberg meta data. + * @param writeSchema the user-provided write schema. * @param checkNullability If true, not allow to write optional values to a required field. - * @param checkOrdering If true, not allow input schema to have different ordering than table schema. + * @param checkOrdering If true, not allow input schema to have different ordering than table + * schema. 
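A usage sketch for the validation entry point documented above; both schemas are invented for the example.

    import org.apache.iceberg.Schema;
    import org.apache.iceberg.types.TypeUtil;
    import org.apache.iceberg.types.Types;

    public class ValidateWriteSchemaExample {
      public static void main(String[] args) {
        Schema tableSchema = new Schema(
            Types.NestedField.required(1, "id", Types.LongType.get()));
        Schema writeSchema = new Schema(
            Types.NestedField.optional(1, "id", Types.LongType.get()));

        // Enforce both nullability and ordering; this pair fails the check because
        // the write schema turns a required column into an optional one.
        TypeUtil.validateWriteSchema(tableSchema, writeSchema, true, true);
      }
    }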
*/ - public static void validateWriteSchema(Schema tableSchema, Schema writeSchema, - Boolean checkNullability, Boolean checkOrdering) { + public static void validateWriteSchema( + Schema tableSchema, Schema writeSchema, Boolean checkNullability, Boolean checkOrdering) { String errMsg = "Cannot write incompatible dataset to table with schema:"; checkSchemaCompatibility(errMsg, tableSchema, writeSchema, checkNullability, checkOrdering); } @@ -338,14 +351,24 @@ public static void validateWriteSchema(Schema tableSchema, Schema writeSchema, * @param checkNullability whether to check field nullability * @param checkOrdering whether to check field ordering */ - public static void validateSchema(String context, Schema expectedSchema, Schema providedSchema, - boolean checkNullability, boolean checkOrdering) { - String errMsg = String.format("Provided %s schema is incompatible with expected schema:", context); - checkSchemaCompatibility(errMsg, expectedSchema, providedSchema, checkNullability, checkOrdering); - } - - private static void checkSchemaCompatibility(String errMsg, Schema schema, Schema providedSchema, - boolean checkNullability, boolean checkOrdering) { + public static void validateSchema( + String context, + Schema expectedSchema, + Schema providedSchema, + boolean checkNullability, + boolean checkOrdering) { + String errMsg = + String.format("Provided %s schema is incompatible with expected schema:", context); + checkSchemaCompatibility( + errMsg, expectedSchema, providedSchema, checkNullability, checkOrdering); + } + + private static void checkSchemaCompatibility( + String errMsg, + Schema schema, + Schema providedSchema, + boolean checkNullability, + boolean checkOrdering) { List errors; if (checkNullability) { errors = CheckCompatibility.writeCompatibilityErrors(schema, providedSchema, checkOrdering); @@ -371,19 +394,15 @@ private static void checkSchemaCompatibility(String errMsg, Schema schema, Schem } } - /** - * Interface for passing a function that assigns column IDs. - */ + /** Interface for passing a function that assigns column IDs. */ public interface NextID { int get(); } public static class SchemaVisitor { - public void beforeField(Types.NestedField field) { - } + public void beforeField(Types.NestedField field) {} - public void afterField(Types.NestedField field) { - } + public void afterField(Types.NestedField field) {} public void beforeListElement(Types.NestedField elementField) { beforeField(elementField); @@ -559,12 +578,12 @@ public static T visit(Schema schema, CustomOrderSchemaVisitor visitor) { /** * Used to traverse types with traversals other than pre-order. - *

- * This passes a {@link Supplier} to each {@link CustomOrderSchemaVisitor visitor} method that + * + *

This passes a {@link Supplier} to each {@link CustomOrderSchemaVisitor visitor} method that * returns the result of traversing child types. Structs are passed an {@link Iterable} that * traverses child fields during iteration. - *

- * An example use is assigning column IDs, which should be done with a post-order traversal. + * + *

An example use is assigning column IDs, which should be done with a post-order traversal. * * @param type a type to traverse with a visitor * @param visitor a custom order visitor @@ -575,11 +594,10 @@ public static T visit(Type type, CustomOrderSchemaVisitor visitor) { switch (type.typeId()) { case STRUCT: Types.StructType struct = type.asNestedType().asStructType(); - List> results = Lists - .newArrayListWithExpectedSize(struct.fields().size()); + List> results = + Lists.newArrayListWithExpectedSize(struct.fields().size()); for (Types.NestedField field : struct.fields()) { - results.add( - new VisitFieldFuture<>(field, visitor)); + results.add(new VisitFieldFuture<>(field, visitor)); } return visitor.struct(struct, Iterables.transform(results, VisitFieldFuture::get)); @@ -590,7 +608,8 @@ public static T visit(Type type, CustomOrderSchemaVisitor visitor) { case MAP: Types.MapType map = type.asNestedType().asMapType(); - return visitor.map(map, + return visitor.map( + map, new VisitFuture<>(map.keyType(), visitor), new VisitFuture<>(map.valueType(), visitor)); @@ -600,14 +619,14 @@ public static T visit(Type type, CustomOrderSchemaVisitor visitor) { } static int decimalMaxPrecision(int numBytes) { - Preconditions.checkArgument(numBytes >= 0 && numBytes < 24, - "Unsupported decimal length: %s", numBytes); + Preconditions.checkArgument( + numBytes >= 0 && numBytes < 24, "Unsupported decimal length: %s", numBytes); return MAX_PRECISION[numBytes]; } public static int decimalRequiredBytes(int precision) { - Preconditions.checkArgument(precision >= 0 && precision < 40, - "Unsupported decimal precision: %s", precision); + Preconditions.checkArgument( + precision >= 0 && precision < 40, "Unsupported decimal precision: %s", precision); return REQUIRED_LENGTH[precision]; } diff --git a/api/src/main/java/org/apache/iceberg/types/Types.java b/api/src/main/java/org/apache/iceberg/types/Types.java index 1b050725afd0..354899409b42 100644 --- a/api/src/main/java/org/apache/iceberg/types/Types.java +++ b/api/src/main/java/org/apache/iceberg/types/Types.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.types; import java.io.Serializable; @@ -36,24 +35,23 @@ public class Types { - private Types() { - } - - private static final ImmutableMap TYPES = ImmutableMap - .builder() - .put(BooleanType.get().toString(), BooleanType.get()) - .put(IntegerType.get().toString(), IntegerType.get()) - .put(LongType.get().toString(), LongType.get()) - .put(FloatType.get().toString(), FloatType.get()) - .put(DoubleType.get().toString(), DoubleType.get()) - .put(DateType.get().toString(), DateType.get()) - .put(TimeType.get().toString(), TimeType.get()) - .put(TimestampType.withZone().toString(), TimestampType.withZone()) - .put(TimestampType.withoutZone().toString(), TimestampType.withoutZone()) - .put(StringType.get().toString(), StringType.get()) - .put(UUIDType.get().toString(), UUIDType.get()) - .put(BinaryType.get().toString(), BinaryType.get()) - .build(); + private Types() {} + + private static final ImmutableMap TYPES = + ImmutableMap.builder() + .put(BooleanType.get().toString(), BooleanType.get()) + .put(IntegerType.get().toString(), IntegerType.get()) + .put(LongType.get().toString(), LongType.get()) + .put(FloatType.get().toString(), FloatType.get()) + .put(DoubleType.get().toString(), DoubleType.get()) + .put(DateType.get().toString(), DateType.get()) + .put(TimeType.get().toString(), TimeType.get()) + .put(TimestampType.withZone().toString(), TimestampType.withZone()) + .put(TimestampType.withoutZone().toString(), TimestampType.withoutZone()) + .put(StringType.get().toString(), StringType.get()) + .put(UUIDType.get().toString(), UUIDType.get()) + .put(BinaryType.get().toString(), BinaryType.get()) + .build(); private static final Pattern FIXED = Pattern.compile("fixed\\[(\\d+)\\]"); private static final Pattern DECIMAL = Pattern.compile("decimal\\((\\d+),\\s+(\\d+)\\)"); @@ -71,9 +69,7 @@ public static PrimitiveType fromPrimitiveString(String typeString) { Matcher decimal = DECIMAL.matcher(lowerTypeString); if (decimal.matches()) { - return DecimalType.of( - Integer.parseInt(decimal.group(1)), - Integer.parseInt(decimal.group(2))); + return DecimalType.of(Integer.parseInt(decimal.group(1)), Integer.parseInt(decimal.group(2))); } throw new IllegalArgumentException("Cannot parse type string to primitive: " + typeString); @@ -194,8 +190,7 @@ public static TimeType get() { return INSTANCE; } - private TimeType() { - } + private TimeType() {} @Override public TypeID typeId() { @@ -368,8 +363,10 @@ public static DecimalType of(int precision, int scale) { private final int precision; private DecimalType(int precision, int scale) { - Preconditions.checkArgument(precision <= 38, - "Decimals with precision larger than 38 are not supported: %s", precision); + Preconditions.checkArgument( + precision <= 38, + "Decimals with precision larger than 38 are not supported: %s", + precision); this.scale = scale; this.precision = precision; } @@ -494,9 +491,8 @@ public String doc() { @Override public String toString() { - return String.format("%d: %s: %s %s", - id, name, isOptional ? "optional" : "required", type) + - (doc != null ? " (" + doc + ")" : ""); + return String.format("%d: %s: %s %s", id, name, isOptional ? "optional" : "required", type) + + (doc != null ? 
" (" + doc + ")" : ""); } @Override diff --git a/api/src/main/java/org/apache/iceberg/util/BinaryUtil.java b/api/src/main/java/org/apache/iceberg/util/BinaryUtil.java index eb3bdcef692c..312fcf5d23c7 100644 --- a/api/src/main/java/org/apache/iceberg/util/BinaryUtil.java +++ b/api/src/main/java/org/apache/iceberg/util/BinaryUtil.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.util; import java.nio.ByteBuffer; @@ -25,17 +24,16 @@ public class BinaryUtil { // not meant to be instantiated - private BinaryUtil() { - } + private BinaryUtil() {} private static final ByteBuffer EMPTY_BYTE_BUFFER = ByteBuffer.allocate(0); /** * Truncates the input byte buffer to the given length. - *

- * We allow for a length of zero so that rows with empty string can be evaluated. - * Partition specs still cannot be created with a length of zero due to a constraint - * when parsing column truncation specs in {@code org.apache.iceberg.MetricsModes}. + * + *

We allow for a length of zero so that rows with empty string can be evaluated. Partition + * specs still cannot be created with a length of zero due to a constraint when parsing column + * truncation specs in {@code org.apache.iceberg.MetricsModes}. * * @param input The ByteBuffer to be truncated * @param length The non-negative length to truncate input to @@ -53,7 +51,8 @@ public static ByteBuffer truncateBinary(ByteBuffer input, int length) { } /** - * Returns a byte buffer whose length is lesser than or equal to truncateLength and is lower than the given input + * Returns a byte buffer whose length is lesser than or equal to truncateLength and is lower than + * the given input */ public static Literal truncateBinaryMin(Literal input, int length) { ByteBuffer inputBuffer = input.value(); @@ -64,7 +63,8 @@ public static Literal truncateBinaryMin(Literal input, i } /** - * Returns a byte buffer whose length is lesser than or equal to truncateLength and is greater than the given input + * Returns a byte buffer whose length is lesser than or equal to truncateLength and is greater + * than the given input */ public static Literal truncateBinaryMax(Literal input, int length) { ByteBuffer inputBuffer = input.value(); @@ -75,7 +75,8 @@ public static Literal truncateBinaryMax(Literal input, i // Truncate the input to the specified truncate length. ByteBuffer truncatedInput = truncateBinary(inputBuffer, length); - // Try incrementing the bytes from the end. If all bytes overflow after incrementing, then return null + // Try incrementing the bytes from the end. If all bytes overflow after incrementing, then + // return null for (int i = length - 1; i >= 0; --i) { byte element = truncatedInput.get(i); element = (byte) (element + 1); diff --git a/api/src/main/java/org/apache/iceberg/util/ByteBuffers.java b/api/src/main/java/org/apache/iceberg/util/ByteBuffers.java index 4451a262be6a..a679bd7ad0ac 100644 --- a/api/src/main/java/org/apache/iceberg/util/ByteBuffers.java +++ b/api/src/main/java/org/apache/iceberg/util/ByteBuffers.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
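To make the truncation contract above concrete, a short sketch; the byte string is an arbitrary example.

    import java.nio.ByteBuffer;
    import java.nio.charset.StandardCharsets;
    import org.apache.iceberg.util.BinaryUtil;

    public class TruncateBinaryExample {
      public static void main(String[] args) {
        ByteBuffer value = ByteBuffer.wrap("iceberg".getBytes(StandardCharsets.UTF_8));

        // Keep only the first three bytes; a length of zero is also accepted and
        // yields an empty buffer, as the Javadoc above explains.
        ByteBuffer prefix = BinaryUtil.truncateBinary(value, 3);
        System.out.println(prefix.remaining()); // 3
      }
    }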
*/ - package org.apache.iceberg.util; import java.nio.ByteBuffer; @@ -32,8 +31,9 @@ public static byte[] toByteArray(ByteBuffer buffer) { if (buffer.hasArray()) { byte[] array = buffer.array(); - if (buffer.arrayOffset() == 0 && buffer.position() == 0 && - array.length == buffer.remaining()) { + if (buffer.arrayOffset() == 0 + && buffer.position() == 0 + && array.length == buffer.remaining()) { return array; } else { int start = buffer.arrayOffset() + buffer.position(); @@ -49,9 +49,13 @@ public static byte[] toByteArray(ByteBuffer buffer) { public static ByteBuffer reuse(ByteBuffer reuse, int length) { Preconditions.checkArgument(reuse.hasArray(), "Cannot reuse a buffer not backed by an array"); - Preconditions.checkArgument(reuse.arrayOffset() == 0, "Cannot reuse a buffer whose array offset is not 0"); - Preconditions.checkArgument(reuse.capacity() == length, - "Cannot use a buffer whose capacity (%s) is not equal to the requested length (%s)", length, reuse.capacity()); + Preconditions.checkArgument( + reuse.arrayOffset() == 0, "Cannot reuse a buffer whose array offset is not 0"); + Preconditions.checkArgument( + reuse.capacity() == length, + "Cannot use a buffer whose capacity (%s) is not equal to the requested length (%s)", + length, + reuse.capacity()); reuse.position(0); reuse.limit(length); return reuse; @@ -69,6 +73,5 @@ public static ByteBuffer copy(ByteBuffer buffer) { return ByteBuffer.wrap(copyArray); } - private ByteBuffers() { - } + private ByteBuffers() {} } diff --git a/api/src/main/java/org/apache/iceberg/util/CharSequenceSet.java b/api/src/main/java/org/apache/iceberg/util/CharSequenceSet.java index 38b8536d5e63..0dfb3cd16dd0 100644 --- a/api/src/main/java/org/apache/iceberg/util/CharSequenceSet.java +++ b/api/src/main/java/org/apache/iceberg/util/CharSequenceSet.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.util; import java.io.Serializable; @@ -32,8 +31,8 @@ import org.apache.iceberg.relocated.com.google.common.collect.Streams; public class CharSequenceSet implements Set, Serializable { - private static final ThreadLocal wrappers = ThreadLocal.withInitial( - () -> CharSequenceWrapper.wrap(null)); + private static final ThreadLocal wrappers = + ThreadLocal.withInitial(() -> CharSequenceWrapper.wrap(null)); public static CharSequenceSet of(Iterable charSequences) { return new CharSequenceSet(charSequences); @@ -46,7 +45,8 @@ public static CharSequenceSet empty() { private final Set wrapperSet; private CharSequenceSet(Iterable charSequences) { - this.wrapperSet = Sets.newHashSet(Iterables.transform(charSequences, CharSequenceWrapper::wrap)); + this.wrapperSet = + Sets.newHashSet(Iterables.transform(charSequences, CharSequenceWrapper::wrap)); } @Override @@ -130,7 +130,8 @@ public boolean containsAll(Collection objects) { @Override public boolean addAll(Collection charSequences) { if (charSequences != null) { - return Iterables.addAll(wrapperSet, Iterables.transform(charSequences, CharSequenceWrapper::wrap)); + return Iterables.addAll( + wrapperSet, Iterables.transform(charSequences, CharSequenceWrapper::wrap)); } return false; } diff --git a/api/src/main/java/org/apache/iceberg/util/CharSequenceWrapper.java b/api/src/main/java/org/apache/iceberg/util/CharSequenceWrapper.java index 9405bbba5bef..28c44497012b 100644 --- a/api/src/main/java/org/apache/iceberg/util/CharSequenceWrapper.java +++ b/api/src/main/java/org/apache/iceberg/util/CharSequenceWrapper.java @@ -16,16 +16,13 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.util; import java.io.Serializable; import org.apache.iceberg.types.Comparators; import org.apache.iceberg.types.JavaHashes; -/** - * Wrapper class to adapt CharSequence for use in maps and sets. - */ +/** Wrapper class to adapt CharSequence for use in maps and sets. */ public class CharSequenceWrapper implements CharSequence, Serializable { public static CharSequenceWrapper wrap(CharSequence seq) { return new CharSequenceWrapper(seq); diff --git a/api/src/main/java/org/apache/iceberg/util/ExceptionUtil.java b/api/src/main/java/org/apache/iceberg/util/ExceptionUtil.java index 553d9ca22563..570d3fc11fd0 100644 --- a/api/src/main/java/org/apache/iceberg/util/ExceptionUtil.java +++ b/api/src/main/java/org/apache/iceberg/util/ExceptionUtil.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
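To make the motivation for the wrapper concrete, here is a hypothetical, minimal example in plain Java showing why raw CharSequences make poor hash keys (a String and a StringBuilder with the same characters are not equal); the class and method names here are illustrative, not Iceberg's:

import java.util.HashSet;
import java.util.Set;

class CharSequenceKeySketch {
  // Equality and hashing are defined over the character content, so any CharSequence
  // implementation with the same characters maps to the same set element.
  static final class Key {
    private final CharSequence seq;

    Key(CharSequence seq) {
      this.seq = seq;
    }

    @Override
    public boolean equals(Object other) {
      return other instanceof Key && seq.toString().contentEquals(((Key) other).seq);
    }

    @Override
    public int hashCode() {
      return seq.toString().hashCode();
    }
  }

  public static void main(String[] args) {
    Set<Key> paths = new HashSet<>();
    paths.add(new Key("s3://bucket/data/file.parquet"));
    // true: same characters, different CharSequence implementation
    System.out.println(paths.contains(new Key(new StringBuilder("s3://bucket/data/file.parquet"))));
  }
}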
*/ - package org.apache.iceberg.util; import org.slf4j.Logger; @@ -25,8 +24,7 @@ public class ExceptionUtil { private static final Logger LOG = LoggerFactory.getLogger(ExceptionUtil.class); - private ExceptionUtil() { - } + private ExceptionUtil() {} @SuppressWarnings("unchecked") public static void castAndThrow( @@ -57,16 +55,23 @@ public static R runSafely( Block block, CatchBlock catchBlock, FinallyBlock finallyBlock) { - return runSafely(block, catchBlock, finallyBlock, - RuntimeException.class, RuntimeException.class, RuntimeException.class); + return runSafely( + block, + catchBlock, + finallyBlock, + RuntimeException.class, + RuntimeException.class, + RuntimeException.class); } public static R runSafely( Block block, CatchBlock catchBlock, FinallyBlock finallyBlock, - Class e1Class) throws E1 { - return runSafely(block, catchBlock, finallyBlock, e1Class, RuntimeException.class, RuntimeException.class); + Class e1Class) + throws E1 { + return runSafely( + block, catchBlock, finallyBlock, e1Class, RuntimeException.class, RuntimeException.class); } public static R runSafely( @@ -74,7 +79,8 @@ public static R runSafely( CatchBlock catchBlock, FinallyBlock finallyBlock, Class e1Class, - Class e2Class) throws E1, E2 { + Class e2Class) + throws E1, E2 { return runSafely(block, catchBlock, finallyBlock, e1Class, e2Class, RuntimeException.class); } @@ -85,7 +91,8 @@ public static e1Class, Class e2Class, - Class e3Class) throws E1, E2, E3 { + Class e3Class) + throws E1, E2, E3 { Throwable failure = null; try { @@ -129,7 +136,8 @@ public static void tryThrowAs(Throwable failure, Class excClass) throws E { + private static void tryThrowAs(Throwable failure, Class excClass) + throws E { if (excClass.isInstance(failure)) { throw excClass.cast(failure); } diff --git a/api/src/main/java/org/apache/iceberg/util/NaNUtil.java b/api/src/main/java/org/apache/iceberg/util/NaNUtil.java index 4a0176629bc2..c3c903b1954c 100644 --- a/api/src/main/java/org/apache/iceberg/util/NaNUtil.java +++ b/api/src/main/java/org/apache/iceberg/util/NaNUtil.java @@ -16,13 +16,11 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.util; public class NaNUtil { - private NaNUtil() { - } + private NaNUtil() {} public static boolean isNaN(Object value) { if (value == null) { diff --git a/api/src/main/java/org/apache/iceberg/util/StructProjection.java b/api/src/main/java/org/apache/iceberg/util/StructProjection.java index cdeb7d0a2d6e..1bff6afdabce 100644 --- a/api/src/main/java/org/apache/iceberg/util/StructProjection.java +++ b/api/src/main/java/org/apache/iceberg/util/StructProjection.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.util; import java.util.List; @@ -33,8 +32,8 @@ public class StructProjection implements StructLike { /** * Creates a projecting wrapper for {@link StructLike} rows. - *

<p> - * This projection does not work with repeated types like lists and maps. + * + * <p>
This projection does not work with repeated types like lists and maps. * * @param schema schema of rows wrapped by this projection * @param ids field ids from the row schema to project @@ -47,8 +46,8 @@ public static StructProjection create(Schema schema, Set ids) { /** * Creates a projecting wrapper for {@link StructLike} rows. - *

<p> - * This projection does not work with repeated types like lists and maps. + * + * <p>
This projection does not work with repeated types like lists and maps. * * @param dataSchema schema of rows wrapped by this projection * @param projectedSchema result schema of the projected rows @@ -60,8 +59,8 @@ public static StructProjection create(Schema dataSchema, Schema projectedSchema) /** * Creates a projecting wrapper for {@link StructLike} rows. - *

<p> - * This projection does not work with repeated types like lists and maps. + * + * <p>
This projection does not work with repeated types like lists and maps. * * @param structType type of rows wrapped by this projection * @param projectedStructType result type of the projected rows @@ -73,14 +72,16 @@ public static StructProjection create(StructType structType, StructType projecte /** * Creates a projecting wrapper for {@link StructLike} rows. - *

<p> - * This projection allows missing fields and does not work with repeated types like lists and maps. + * + * <p>
This projection allows missing fields and does not work with repeated types like lists and + * maps. * * @param structType type of rows wrapped by this projection * @param projectedStructType result type of the projected rows * @return a wrapper to project rows */ - public static StructProjection createAllowMissing(StructType structType, StructType projectedStructType) { + public static StructProjection createAllowMissing( + StructType structType, StructType projectedStructType) { return new StructProjection(structType, projectedStructType, true); } @@ -112,20 +113,25 @@ private StructProjection(StructType structType, StructType projection, boolean a positionMap[pos] = i; switch (projectedField.type().typeId()) { case STRUCT: - nestedProjections[pos] = new StructProjection( - dataField.type().asStructType(), projectedField.type().asStructType()); + nestedProjections[pos] = + new StructProjection( + dataField.type().asStructType(), projectedField.type().asStructType()); break; case MAP: MapType projectedMap = projectedField.type().asMapType(); MapType originalMap = dataField.type().asMapType(); - boolean keyProjectable = !projectedMap.keyType().isNestedType() || - projectedMap.keyType().equals(originalMap.keyType()); - boolean valueProjectable = !projectedMap.valueType().isNestedType() || - projectedMap.valueType().equals(originalMap.valueType()); - Preconditions.checkArgument(keyProjectable && valueProjectable, + boolean keyProjectable = + !projectedMap.keyType().isNestedType() + || projectedMap.keyType().equals(originalMap.keyType()); + boolean valueProjectable = + !projectedMap.valueType().isNestedType() + || projectedMap.valueType().equals(originalMap.valueType()); + Preconditions.checkArgument( + keyProjectable && valueProjectable, "Cannot project a partial map key or value struct. Trying to project %s out of %s", - projectedField, dataField); + projectedField, + dataField); nestedProjections[pos] = null; break; @@ -133,11 +139,14 @@ private StructProjection(StructType structType, StructType projection, boolean a ListType projectedList = projectedField.type().asListType(); ListType originalList = dataField.type().asListType(); - boolean elementProjectable = !projectedList.elementType().isNestedType() || - projectedList.elementType().equals(originalList.elementType()); - Preconditions.checkArgument(elementProjectable, + boolean elementProjectable = + !projectedList.elementType().isNestedType() + || projectedList.elementType().equals(originalList.elementType()); + Preconditions.checkArgument( + elementProjectable, "Cannot project a partial list element struct. Trying to project %s out of %s", - projectedField, dataField); + projectedField, + dataField); nestedProjections[pos] = null; break; @@ -151,7 +160,8 @@ private StructProjection(StructType structType, StructType projection, boolean a positionMap[pos] = -1; nestedProjections[pos] = null; } else if (!found) { - throw new IllegalArgumentException(String.format("Cannot find field %s in %s", projectedField, structType)); + throw new IllegalArgumentException( + String.format("Cannot find field %s in %s", projectedField, structType)); } } } diff --git a/api/src/main/java/org/apache/iceberg/util/UUIDUtil.java b/api/src/main/java/org/apache/iceberg/util/UUIDUtil.java index fe13611a8afc..4cedb5bd2288 100644 --- a/api/src/main/java/org/apache/iceberg/util/UUIDUtil.java +++ b/api/src/main/java/org/apache/iceberg/util/UUIDUtil.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.util; import java.nio.ByteBuffer; @@ -25,8 +24,7 @@ import org.apache.iceberg.relocated.com.google.common.base.Preconditions; public class UUIDUtil { - private UUIDUtil() { - } + private UUIDUtil() {} public static UUID convert(byte[] buf) { Preconditions.checkArgument(buf.length == 16, "UUID require 16 bytes"); @@ -36,10 +34,16 @@ public static UUID convert(byte[] buf) { } public static UUID convert(byte[] buf, int offset) { - Preconditions.checkArgument(offset >= 0 && offset < buf.length, - "Offset overflow, offset=%s, length=%s", offset, buf.length); - Preconditions.checkArgument(offset + 16 <= buf.length, - "UUID require 16 bytes, offset=%s, length=%s", offset, buf.length); + Preconditions.checkArgument( + offset >= 0 && offset < buf.length, + "Offset overflow, offset=%s, length=%s", + offset, + buf.length); + Preconditions.checkArgument( + offset + 16 <= buf.length, + "UUID require 16 bytes, offset=%s, length=%s", + offset, + buf.length); ByteBuffer bb = ByteBuffer.wrap(buf, offset, 16); bb.order(ByteOrder.BIG_ENDIAN); diff --git a/api/src/main/java/org/apache/iceberg/util/UnicodeUtil.java b/api/src/main/java/org/apache/iceberg/util/UnicodeUtil.java index 40f475ad163d..4dd2afa123ac 100644 --- a/api/src/main/java/org/apache/iceberg/util/UnicodeUtil.java +++ b/api/src/main/java/org/apache/iceberg/util/UnicodeUtil.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.util; import org.apache.iceberg.expressions.Literal; @@ -24,12 +23,11 @@ public class UnicodeUtil { // not meant to be instantiated - private UnicodeUtil() { - } + private UnicodeUtil() {} /** - * Determines if the given character value is a unicode high-surrogate code unit. - * The range of high-surrogates is 0xD800 - 0xDBFF. + * Determines if the given character value is a unicode high-surrogate code unit. The range of + * high-surrogates is 0xD800 - 0xDBFF. 
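As an aside on the UUIDUtil hunk above: the conversion its preconditions guard boils down to reading two big-endian longs out of 16 bytes. A minimal sketch, assuming only JDK classes:

import java.nio.ByteBuffer;
import java.util.UUID;

class UuidBytesSketch {
  // A UUID is two 64-bit values (most/least significant bits), so 16 big-endian bytes map
  // directly onto its constructor. ByteBuffer defaults to big-endian byte order.
  static UUID fromBytes(byte[] bytes, int offset) {
    if (offset < 0 || offset + 16 > bytes.length) {
      throw new IllegalArgumentException("UUID requires 16 bytes at offset " + offset);
    }
    ByteBuffer bb = ByteBuffer.wrap(bytes, offset, 16);
    return new UUID(bb.getLong(), bb.getLong());
  }

  public static void main(String[] args) {
    byte[] raw = new byte[16];
    raw[15] = 1; // least-significant byte of the low long
    System.out.println(fromBytes(raw, 0)); // 00000000-0000-0000-0000-000000000001
  }
}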
*/ public static boolean isCharHighSurrogate(char ch) { return (ch & '\uFC00') == '\uD800'; // 0xDC00 - 0xDFFF shouldn't match @@ -37,25 +35,28 @@ public static boolean isCharHighSurrogate(char ch) { /** * Truncates the input charSequence such that the truncated charSequence is a valid unicode string - * and the number of unicode characters in the truncated charSequence is lesser than or equal to length + * and the number of unicode characters in the truncated charSequence is lesser than or equal to + * length */ public static CharSequence truncateString(CharSequence input, int length) { Preconditions.checkArgument(length > 0, "Truncate length should be positive"); StringBuilder sb = new StringBuilder(input); // Get the number of unicode characters in the input int numUniCodeCharacters = sb.codePointCount(0, sb.length()); - // No need to truncate if the number of unicode characters in the char sequence is <= truncate length + // No need to truncate if the number of unicode characters in the char sequence is <= truncate + // length if (length >= numUniCodeCharacters) { return input; } - // Get the offset in the input charSequence where the number of unicode characters = truncate length + // Get the offset in the input charSequence where the number of unicode characters = truncate + // length int offsetByCodePoint = sb.offsetByCodePoints(0, length); return input.subSequence(0, offsetByCodePoint); } /** - * Returns a valid unicode charsequence that is lower than the given input such that the - * number of unicode characters in the truncated charSequence is lesser than or equal to length + * Returns a valid unicode charsequence that is lower than the given input such that the number of + * unicode characters in the truncated charSequence is lesser than or equal to length */ public static Literal truncateStringMin(Literal input, int length) { // Truncate the input to the specified truncate length. @@ -64,8 +65,8 @@ public static Literal truncateStringMin(Literal inpu } /** - * Returns a valid unicode charsequence that is greater than the given input such that the - * number of unicode characters in the truncated charSequence is lesser than or equal to length + * Returns a valid unicode charsequence that is greater than the given input such that the number + * of unicode characters in the truncated charSequence is lesser than or equal to length */ public static Literal truncateStringMax(Literal input, int length) { CharSequence inputCharSeq = input.value(); diff --git a/api/src/test/java/org/apache/iceberg/AssertHelpers.java b/api/src/test/java/org/apache/iceberg/AssertHelpers.java index d63c5d3d1133..e56d1b965d9e 100644 --- a/api/src/test/java/org/apache/iceberg/AssertHelpers.java +++ b/api/src/test/java/org/apache/iceberg/AssertHelpers.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg; import java.util.concurrent.Callable; @@ -27,24 +26,23 @@ public class AssertHelpers { - private AssertHelpers() { - } + private AssertHelpers() {} /** * A convenience method to avoid a large number of @Test(expected=...) 
tests + * * @param message A String message to describe this assertion * @param expected An Exception class that the Runnable should throw - * @param containedInMessage A String that should be contained by the thrown - * exception's message + * @param containedInMessage A String that should be contained by the thrown exception's message * @param callable A Callable that is expected to throw the exception */ - public static void assertThrows(String message, - Class expected, - String containedInMessage, - Callable callable) { - AbstractThrowableAssert check = Assertions.assertThatThrownBy(callable::call) - .as(message) - .isInstanceOf(expected); + public static void assertThrows( + String message, + Class expected, + String containedInMessage, + Callable callable) { + AbstractThrowableAssert check = + Assertions.assertThatThrownBy(callable::call).as(message).isInstanceOf(expected); if (null != containedInMessage) { check.hasMessageContaining(containedInMessage); } @@ -52,19 +50,19 @@ public static void assertThrows(String message, /** * A convenience method to avoid a large number of @Test(expected=...) tests + * * @param message A String message to describe this assertion * @param expected An Exception class that the Runnable should throw - * @param containedInMessage A String that should be contained by the thrown - * exception's message + * @param containedInMessage A String that should be contained by the thrown exception's message * @param runnable A Runnable that is expected to throw the runtime exception */ - public static void assertThrows(String message, - Class expected, - String containedInMessage, - Runnable runnable) { - AbstractThrowableAssert check = Assertions.assertThatThrownBy(runnable::run) - .as(message) - .isInstanceOf(expected); + public static void assertThrows( + String message, + Class expected, + String containedInMessage, + Runnable runnable) { + AbstractThrowableAssert check = + Assertions.assertThatThrownBy(runnable::run).as(message).isInstanceOf(expected); if (null != containedInMessage) { check.hasMessageContaining(containedInMessage); } @@ -72,40 +70,42 @@ public static void assertThrows(String message, /** * A convenience method to avoid a large number of @Test(expected=...) tests + * * @param message A String message to describe this assertion * @param expected An Exception class that the Runnable should throw * @param callable A Callable that is expected to throw the exception */ - public static void assertThrows(String message, - Class expected, - Callable callable) { + public static void assertThrows( + String message, Class expected, Callable callable) { assertThrows(message, expected, null, callable); } /** * A convenience method to avoid a large number of @Test(expected=...) tests + * * @param message A String message to describe this assertion * @param expected An Exception class that the Runnable should throw * @param runnable A Runnable that is expected to throw the runtime exception */ - public static void assertThrows(String message, - Class expected, - Runnable runnable) { + public static void assertThrows( + String message, Class expected, Runnable runnable) { assertThrows(message, expected, null, runnable); } /** * A convenience method to assert the cause of thrown exception. 
+ * * @param message A String message to describe this assertion * @param expected An Exception class that the cause of the Runnable should throw * @param containedInMessage A String that should be contained by the cause of the thrown - * exception's message + * exception's message * @param runnable A Runnable that is expected to throw the runtime exception */ - public static void assertThrowsCause(String message, - Class expected, - String containedInMessage, - Runnable runnable) { + public static void assertThrowsCause( + String message, + Class expected, + String containedInMessage, + Runnable runnable) { Assertions.assertThatThrownBy(runnable::run) .as(message) .getCause() @@ -115,24 +115,25 @@ public static void assertThrowsCause(String message, /** * A convenience method to assert both the thrown exception and the cause of thrown exception. + * * @param message A String message to describe this assertion - * @param expected An Exception class that the Runnable should throw - * @param expectedContainedInMessage A String that should be contained by the thrown exception's message, - * will be skipped if null. + * @param expected An Exception class that the Runnable should throw + * @param expectedContainedInMessage A String that should be contained by the thrown exception's + * message, will be skipped if null. * @param cause An Exception class that the cause of the Runnable should throw - * @param causeContainedInMessage A String that should be contained by the cause of the thrown exception's message, - * will be skipped if null. + * @param causeContainedInMessage A String that should be contained by the cause of the thrown + * exception's message, will be skipped if null. * @param runnable A Runnable that is expected to throw the runtime exception */ - public static void assertThrowsWithCause(String message, - Class expected, - String expectedContainedInMessage, - Class cause, - String causeContainedInMessage, - Runnable runnable) { - AbstractThrowableAssert chain = Assertions.assertThatThrownBy(runnable::run) - .as(message) - .isInstanceOf(expected); + public static void assertThrowsWithCause( + String message, + Class expected, + String expectedContainedInMessage, + Class cause, + String causeContainedInMessage, + Runnable runnable) { + AbstractThrowableAssert chain = + Assertions.assertThatThrownBy(runnable::run).as(message).isInstanceOf(expected); if (expectedContainedInMessage != null) { chain = chain.hasMessageContaining(expectedContainedInMessage); @@ -146,23 +147,21 @@ public static void assertThrowsWithCause(String message, /** * A convenience method to check if an Avro field is empty. + * * @param record The record to read from * @param field The name of the field */ public static void assertEmptyAvroField(GenericRecord record, String field) { AssertHelpers.assertThrows( - "Not a valid schema field: " + field, - AvroRuntimeException.class, - () -> record.get(field)); + "Not a valid schema field: " + field, AvroRuntimeException.class, () -> record.get(field)); } - /** - * Same as {@link AssertHelpers#assertThrowsCause}, but this method compares root cause. - */ - public static void assertThrowsRootCause(String message, - Class expected, - String containedInMessage, - Runnable runnable) { + /** Same as {@link AssertHelpers#assertThrowsCause}, but this method compares root cause. 
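Since the helpers above are thin wrappers over AssertJ, a short usage sketch may help; the runnable under test here is a hypothetical example, but the assertThatThrownBy / isInstanceOf / hasMessageContaining chain is the same one shown in the diff:

import static org.assertj.core.api.Assertions.assertThatThrownBy;

class AssertThrowsSketch {
  public static void main(String[] args) {
    // The same fluent chain the reformatted helpers build internally: assert the exception
    // type and that its message contains a given fragment.
    assertThatThrownBy(() -> Integer.parseInt("not-a-number"))
        .as("parseInt should reject non-numeric input")
        .isInstanceOf(NumberFormatException.class)
        .hasMessageContaining("not-a-number");
  }
}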
*/ + public static void assertThrowsRootCause( + String message, + Class expected, + String containedInMessage, + Runnable runnable) { Assertions.assertThatThrownBy(runnable::run) .as(message) .getRootCause() diff --git a/api/src/test/java/org/apache/iceberg/PartitionSpecTestBase.java b/api/src/test/java/org/apache/iceberg/PartitionSpecTestBase.java index d8d64831d1d2..f6d076a4465a 100644 --- a/api/src/test/java/org/apache/iceberg/PartitionSpecTestBase.java +++ b/api/src/test/java/org/apache/iceberg/PartitionSpecTestBase.java @@ -16,61 +16,61 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg; import org.apache.iceberg.types.Types; @SuppressWarnings("checkstyle:HideUtilityClassConstructor") public class PartitionSpecTestBase { - public static final Schema SCHEMA = new Schema( - Types.NestedField.required(1, "i", Types.IntegerType.get()), - Types.NestedField.required(2, "l", Types.LongType.get()), - Types.NestedField.required(3, "d", Types.DateType.get()), - Types.NestedField.required(4, "t", Types.TimeType.get()), - Types.NestedField.required(5, "ts", Types.TimestampType.withoutZone()), - Types.NestedField.required(6, "dec", Types.DecimalType.of(9, 2)), - Types.NestedField.required(7, "s", Types.StringType.get()), - Types.NestedField.required(8, "u", Types.UUIDType.get()), - Types.NestedField.required(9, "f", Types.FixedType.ofLength(3)), - Types.NestedField.required(10, "b", Types.BinaryType.get()) - ); + public static final Schema SCHEMA = + new Schema( + Types.NestedField.required(1, "i", Types.IntegerType.get()), + Types.NestedField.required(2, "l", Types.LongType.get()), + Types.NestedField.required(3, "d", Types.DateType.get()), + Types.NestedField.required(4, "t", Types.TimeType.get()), + Types.NestedField.required(5, "ts", Types.TimestampType.withoutZone()), + Types.NestedField.required(6, "dec", Types.DecimalType.of(9, 2)), + Types.NestedField.required(7, "s", Types.StringType.get()), + Types.NestedField.required(8, "u", Types.UUIDType.get()), + Types.NestedField.required(9, "f", Types.FixedType.ofLength(3)), + Types.NestedField.required(10, "b", Types.BinaryType.get())); // a spec with all of the allowed transform/type pairs - public static final PartitionSpec[] SPECS = new PartitionSpec[] { - PartitionSpec.builderFor(SCHEMA).identity("i").build(), - PartitionSpec.builderFor(SCHEMA).identity("l").build(), - PartitionSpec.builderFor(SCHEMA).identity("d").build(), - PartitionSpec.builderFor(SCHEMA).identity("t").build(), - PartitionSpec.builderFor(SCHEMA).identity("ts").build(), - PartitionSpec.builderFor(SCHEMA).identity("dec").build(), - PartitionSpec.builderFor(SCHEMA).identity("s").build(), - PartitionSpec.builderFor(SCHEMA).identity("u").build(), - PartitionSpec.builderFor(SCHEMA).identity("f").build(), - PartitionSpec.builderFor(SCHEMA).identity("b").build(), - PartitionSpec.builderFor(SCHEMA).bucket("i", 128).build(), - PartitionSpec.builderFor(SCHEMA).bucket("l", 128).build(), - PartitionSpec.builderFor(SCHEMA).bucket("d", 128).build(), - PartitionSpec.builderFor(SCHEMA).bucket("t", 128).build(), - PartitionSpec.builderFor(SCHEMA).bucket("ts", 128).build(), - PartitionSpec.builderFor(SCHEMA).bucket("dec", 128).build(), - PartitionSpec.builderFor(SCHEMA).bucket("s", 128).build(), - PartitionSpec.builderFor(SCHEMA).bucket("u", 128).build(), - PartitionSpec.builderFor(SCHEMA).bucket("f", 128).build(), - PartitionSpec.builderFor(SCHEMA).bucket("b", 128).build(), - PartitionSpec.builderFor(SCHEMA).year("d").build(), 
- PartitionSpec.builderFor(SCHEMA).month("d").build(), - PartitionSpec.builderFor(SCHEMA).day("d").build(), - PartitionSpec.builderFor(SCHEMA).year("ts").build(), - PartitionSpec.builderFor(SCHEMA).month("ts").build(), - PartitionSpec.builderFor(SCHEMA).day("ts").build(), - PartitionSpec.builderFor(SCHEMA).hour("ts").build(), - PartitionSpec.builderFor(SCHEMA).truncate("i", 10).build(), - PartitionSpec.builderFor(SCHEMA).truncate("l", 10).build(), - PartitionSpec.builderFor(SCHEMA).truncate("dec", 10).build(), - PartitionSpec.builderFor(SCHEMA).truncate("s", 10).build(), - PartitionSpec.builderFor(SCHEMA).add(6, "dec_unsupported", "unsupported").build(), - PartitionSpec.builderFor(SCHEMA).add(6, 1111, "dec_unsupported", "unsupported").build(), - PartitionSpec.builderFor(SCHEMA).alwaysNull("ts").build(), - }; + public static final PartitionSpec[] SPECS = + new PartitionSpec[] { + PartitionSpec.builderFor(SCHEMA).identity("i").build(), + PartitionSpec.builderFor(SCHEMA).identity("l").build(), + PartitionSpec.builderFor(SCHEMA).identity("d").build(), + PartitionSpec.builderFor(SCHEMA).identity("t").build(), + PartitionSpec.builderFor(SCHEMA).identity("ts").build(), + PartitionSpec.builderFor(SCHEMA).identity("dec").build(), + PartitionSpec.builderFor(SCHEMA).identity("s").build(), + PartitionSpec.builderFor(SCHEMA).identity("u").build(), + PartitionSpec.builderFor(SCHEMA).identity("f").build(), + PartitionSpec.builderFor(SCHEMA).identity("b").build(), + PartitionSpec.builderFor(SCHEMA).bucket("i", 128).build(), + PartitionSpec.builderFor(SCHEMA).bucket("l", 128).build(), + PartitionSpec.builderFor(SCHEMA).bucket("d", 128).build(), + PartitionSpec.builderFor(SCHEMA).bucket("t", 128).build(), + PartitionSpec.builderFor(SCHEMA).bucket("ts", 128).build(), + PartitionSpec.builderFor(SCHEMA).bucket("dec", 128).build(), + PartitionSpec.builderFor(SCHEMA).bucket("s", 128).build(), + PartitionSpec.builderFor(SCHEMA).bucket("u", 128).build(), + PartitionSpec.builderFor(SCHEMA).bucket("f", 128).build(), + PartitionSpec.builderFor(SCHEMA).bucket("b", 128).build(), + PartitionSpec.builderFor(SCHEMA).year("d").build(), + PartitionSpec.builderFor(SCHEMA).month("d").build(), + PartitionSpec.builderFor(SCHEMA).day("d").build(), + PartitionSpec.builderFor(SCHEMA).year("ts").build(), + PartitionSpec.builderFor(SCHEMA).month("ts").build(), + PartitionSpec.builderFor(SCHEMA).day("ts").build(), + PartitionSpec.builderFor(SCHEMA).hour("ts").build(), + PartitionSpec.builderFor(SCHEMA).truncate("i", 10).build(), + PartitionSpec.builderFor(SCHEMA).truncate("l", 10).build(), + PartitionSpec.builderFor(SCHEMA).truncate("dec", 10).build(), + PartitionSpec.builderFor(SCHEMA).truncate("s", 10).build(), + PartitionSpec.builderFor(SCHEMA).add(6, "dec_unsupported", "unsupported").build(), + PartitionSpec.builderFor(SCHEMA).add(6, 1111, "dec_unsupported", "unsupported").build(), + PartitionSpec.builderFor(SCHEMA).alwaysNull("ts").build(), + }; } diff --git a/api/src/test/java/org/apache/iceberg/TestAccessors.java b/api/src/test/java/org/apache/iceberg/TestAccessors.java index 170d4129b913..a6e1d646212f 100644 --- a/api/src/test/java/org/apache/iceberg/TestAccessors.java +++ b/api/src/test/java/org/apache/iceberg/TestAccessors.java @@ -16,9 +16,11 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg; +import static org.apache.iceberg.types.Types.NestedField.optional; +import static org.apache.iceberg.types.Types.NestedField.required; + import java.math.BigDecimal; import java.nio.ByteBuffer; import java.util.UUID; @@ -30,9 +32,6 @@ import org.junit.Assert; import org.junit.Test; -import static org.apache.iceberg.types.Types.NestedField.optional; -import static org.apache.iceberg.types.Types.NestedField.required; - public class TestAccessors { private static Accessor direct(Type type) { @@ -41,40 +40,92 @@ private static Accessor direct(Type type) { } private static Accessor nested1(Type type) { - Schema schema = new Schema(required(11, "struct1", Types.StructType.of( - Types.NestedField.required(17, "field_" + type.typeId(), type)))); + Schema schema = + new Schema( + required( + 11, + "struct1", + Types.StructType.of( + Types.NestedField.required(17, "field_" + type.typeId(), type)))); return schema.accessorForField(17); } private static Accessor nested2(Type type) { - Schema schema = new Schema(required(11, "s", Types.StructType.of( - Types.NestedField.required(22, "s2", Types.StructType.of( - Types.NestedField.required(17, "field_" + type.typeId(), type)))))); + Schema schema = + new Schema( + required( + 11, + "s", + Types.StructType.of( + Types.NestedField.required( + 22, + "s2", + Types.StructType.of( + Types.NestedField.required(17, "field_" + type.typeId(), type)))))); return schema.accessorForField(17); } private static Accessor nested3(Type type) { - Schema schema = new Schema(required(11, "s", Types.StructType.of( - Types.NestedField.required(22, "s2", Types.StructType.of( - Types.NestedField.required(33, "s3", Types.StructType.of( - Types.NestedField.required(17, "field_" + type.typeId(), type)))))))); + Schema schema = + new Schema( + required( + 11, + "s", + Types.StructType.of( + Types.NestedField.required( + 22, + "s2", + Types.StructType.of( + Types.NestedField.required( + 33, + "s3", + Types.StructType.of( + Types.NestedField.required( + 17, "field_" + type.typeId(), type)))))))); return schema.accessorForField(17); } private static Accessor nested3optional(Type type) { - Schema schema = new Schema(optional(11, "s", Types.StructType.of( - Types.NestedField.optional(22, "s2", Types.StructType.of( - Types.NestedField.optional(33, "s3", Types.StructType.of( - Types.NestedField.optional(17, "field_" + type.typeId(), type)))))))); + Schema schema = + new Schema( + optional( + 11, + "s", + Types.StructType.of( + Types.NestedField.optional( + 22, + "s2", + Types.StructType.of( + Types.NestedField.optional( + 33, + "s3", + Types.StructType.of( + Types.NestedField.optional( + 17, "field_" + type.typeId(), type)))))))); return schema.accessorForField(17); } private static Accessor nested4(Type type) { - Schema schema = new Schema(required(11, "s", Types.StructType.of( - Types.NestedField.required(22, "s2", Types.StructType.of( - Types.NestedField.required(33, "s3", Types.StructType.of( - Types.NestedField.required(44, "s4", Types.StructType.of( - Types.NestedField.required(17, "field_" + type.typeId(), type)))))))))); + Schema schema = + new Schema( + required( + 11, + "s", + Types.StructType.of( + Types.NestedField.required( + 22, + "s2", + Types.StructType.of( + Types.NestedField.required( + 33, + "s3", + Types.StructType.of( + Types.NestedField.required( + 44, + "s4", + Types.StructType.of( + Types.NestedField.required( + 17, "field_" + type.typeId(), type)))))))))); return schema.accessorForField(17); } @@ -158,8 +209,10 @@ public void 
testDecimal() { @Test public void testList() { - assertAccessorReturns(Types.ListType.ofRequired(18, Types.IntegerType.get()), ImmutableList.of(1, 2, 3)); - assertAccessorReturns(Types.ListType.ofRequired(18, Types.StringType.get()), ImmutableList.of("a", "b", "c")); + assertAccessorReturns( + Types.ListType.ofRequired(18, Types.IntegerType.get()), ImmutableList.of(1, 2, 3)); + assertAccessorReturns( + Types.ListType.ofRequired(18, Types.StringType.get()), ImmutableList.of("a", "b", "c")); } @Test @@ -181,8 +234,7 @@ public void testStructAsObject() { @Test public void testEmptyStructAsObject() { assertAccessorReturns( - Types.StructType.of( - Types.NestedField.optional(19, "int19", Types.IntegerType.get())), + Types.StructType.of(Types.NestedField.optional(19, "int19", Types.IntegerType.get())), Row.of()); assertAccessorReturns(Types.StructType.of(), Row.of()); diff --git a/api/src/test/java/org/apache/iceberg/TestHelpers.java b/api/src/test/java/org/apache/iceberg/TestHelpers.java index f82269060e0a..59ae5d26eb6a 100644 --- a/api/src/test/java/org/apache/iceberg/TestHelpers.java +++ b/api/src/test/java/org/apache/iceberg/TestHelpers.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg; import java.io.ByteArrayInputStream; @@ -39,12 +38,9 @@ public class TestHelpers { - private TestHelpers() { - } + private TestHelpers() {} - /** - * Wait in a tight check loop until system clock is past {@code timestampMillis} - */ + /** Wait in a tight check loop until system clock is past {@code timestampMillis} */ public static long waitUntilAfter(long timestampMillis) { long current = System.currentTimeMillis(); while (current <= timestampMillis) { @@ -54,29 +50,29 @@ public static long waitUntilAfter(long timestampMillis) { } public static T assertAndUnwrap(Expression expr, Class expected) { - Assert.assertTrue("Expression should have expected type: " + expected, - expected.isInstance(expr)); + Assert.assertTrue( + "Expression should have expected type: " + expected, expected.isInstance(expr)); return expected.cast(expr); } @SuppressWarnings("unchecked") public static BoundPredicate assertAndUnwrap(Expression expr) { - Assert.assertTrue("Expression should be a bound predicate: " + expr, - expr instanceof BoundPredicate); + Assert.assertTrue( + "Expression should be a bound predicate: " + expr, expr instanceof BoundPredicate); return (BoundPredicate) expr; } @SuppressWarnings("unchecked") public static BoundSetPredicate assertAndUnwrapBoundSet(Expression expr) { - Assert.assertTrue("Expression should be a bound set predicate: " + expr, - expr instanceof BoundSetPredicate); + Assert.assertTrue( + "Expression should be a bound set predicate: " + expr, expr instanceof BoundSetPredicate); return (BoundSetPredicate) expr; } @SuppressWarnings("unchecked") public static UnboundPredicate assertAndUnwrapUnbound(Expression expr) { - Assert.assertTrue("Expression should be an unbound predicate: " + expr, - expr instanceof UnboundPredicate); + Assert.assertTrue( + "Expression should be an unbound predicate: " + expr, expr instanceof UnboundPredicate); return (UnboundPredicate) expr; } @@ -91,8 +87,8 @@ public static T roundTripSerialize(T type) throws IOException, ClassNotFound out.writeObject(type); } - try (ObjectInputStream in = new ObjectInputStream( - new ByteArrayInputStream(bytes.toByteArray()))) { + try (ObjectInputStream in = + new ObjectInputStream(new ByteArrayInputStream(bytes.toByteArray()))) { return (T) 
in.readObject(); } } @@ -102,23 +98,24 @@ public static void assertSameSchemaList(List list1, List list2) Assert.fail("Should have same number of schemas in both lists"); } - IntStream.range(0, list1.size()).forEach( - index -> { - Schema schema1 = list1.get(index); - Schema schema2 = list2.get(index); - Assert.assertEquals("Should have matching schema id", - schema1.schemaId(), schema2.schemaId()); - Assert.assertEquals("Should have matching schema struct", - schema1.asStruct(), schema2.asStruct()); - } - ); + IntStream.range(0, list1.size()) + .forEach( + index -> { + Schema schema1 = list1.get(index); + Schema schema2 = list2.get(index); + Assert.assertEquals( + "Should have matching schema id", schema1.schemaId(), schema2.schemaId()); + Assert.assertEquals( + "Should have matching schema struct", schema1.asStruct(), schema2.asStruct()); + }); } public static void assertSerializedMetadata(Table expected, Table actual) { Assert.assertEquals("Name must match", expected.name(), actual.name()); Assert.assertEquals("Location must match", expected.location(), actual.location()); Assert.assertEquals("Props must match", expected.properties(), actual.properties()); - Assert.assertEquals("Schema must match", expected.schema().asStruct(), actual.schema().asStruct()); + Assert.assertEquals( + "Schema must match", expected.schema().asStruct(), actual.schema().asStruct()); Assert.assertEquals("Spec must match", expected.spec(), actual.spec()); Assert.assertEquals("Sort order must match", expected.sortOrder(), actual.sortOrder()); } @@ -127,7 +124,8 @@ public static void assertSerializedAndLoadedMetadata(Table expected, Table actua assertSerializedMetadata(expected, actual); Assert.assertEquals("Specs must match", expected.specs(), actual.specs()); Assert.assertEquals("Sort orders must match", expected.sortOrders(), actual.sortOrders()); - Assert.assertEquals("Current snapshot must match", expected.currentSnapshot(), actual.currentSnapshot()); + Assert.assertEquals( + "Current snapshot must match", expected.currentSnapshot(), actual.currentSnapshot()); Assert.assertEquals("Snapshots must match", expected.snapshots(), actual.snapshots()); Assert.assertEquals("History must match", expected.history(), actual.history()); } @@ -137,14 +135,19 @@ public static void assertSameSchemaMap(Map map1, Map { - Schema schema2 = map2.get(schemaId); - Assert.assertNotNull(String.format("Schema ID %s does not exist in map: %s", schemaId, map2), schema2); + map1.forEach( + (schemaId, schema1) -> { + Schema schema2 = map2.get(schemaId); + Assert.assertNotNull( + String.format("Schema ID %s does not exist in map: %s", schemaId, map2), schema2); - Assert.assertEquals("Should have matching schema id", schema1.schemaId(), schema2.schemaId()); - Assert.assertTrue(String.format("Should be the same schema. Schema 1: %s, schema 2: %s", schema1, schema2), - schema1.sameSchema(schema2)); - }); + Assert.assertEquals( + "Should have matching schema id", schema1.schemaId(), schema2.schemaId()); + Assert.assertTrue( + String.format( + "Should be the same schema. Schema 1: %s, schema 2: %s", schema1, schema2), + schema1.sameSchema(schema2)); + }); } private static class CheckReferencesBound extends ExpressionVisitors.ExpressionVisitor { @@ -161,9 +164,7 @@ public Void predicate(UnboundPredicate pred) { } } - /** - * Implements {@link StructLike#get} for passing data in tests. - */ + /** Implements {@link StructLike#get} for passing data in tests. */ public static class Row implements StructLike { public static Row of(Object... 
values) { return new Row(values); @@ -221,7 +222,8 @@ public TestFieldSummary(boolean containsNull, ByteBuffer lowerBound, ByteBuffer this(containsNull, null, lowerBound, upperBound); } - public TestFieldSummary(boolean containsNull, Boolean containsNaN, ByteBuffer lowerBound, ByteBuffer upperBound) { + public TestFieldSummary( + boolean containsNull, Boolean containsNaN, ByteBuffer lowerBound, ByteBuffer upperBound) { this.containsNull = containsNull; this.containsNaN = containsNaN; this.lowerBound = lowerBound; @@ -269,9 +271,16 @@ public static class TestManifestFile implements ManifestFile { private final List partitions; private final byte[] keyMetadata; - public TestManifestFile(String path, long length, int specId, Long snapshotId, - Integer addedFiles, Integer existingFiles, Integer deletedFiles, - List partitions, ByteBuffer keyMetadata) { + public TestManifestFile( + String path, + long length, + int specId, + Long snapshotId, + Integer addedFiles, + Integer existingFiles, + Integer deletedFiles, + List partitions, + ByteBuffer keyMetadata) { this.path = path; this.length = length; this.specId = specId; @@ -287,10 +296,20 @@ public TestManifestFile(String path, long length, int specId, Long snapshotId, this.keyMetadata = ByteBuffers.toByteArray(keyMetadata); } - public TestManifestFile(String path, long length, int specId, ManifestContent content, Long snapshotId, - Integer addedFiles, Long addedRows, Integer existingFiles, - Long existingRows, Integer deletedFiles, Long deletedRows, - List partitions, ByteBuffer keyMetadata) { + public TestManifestFile( + String path, + long length, + int specId, + ManifestContent content, + Long snapshotId, + Integer addedFiles, + Long addedRows, + Integer existingFiles, + Long existingRows, + Integer deletedFiles, + Long deletedRows, + List partitions, + ByteBuffer keyMetadata) { this.path = path; this.length = length; this.specId = specId; @@ -401,12 +420,15 @@ public TestDataFile(String path, StructLike partition, long recordCount) { this(path, partition, recordCount, null, null, null, null, null); } - public TestDataFile(String path, StructLike partition, long recordCount, - Map valueCounts, - Map nullValueCounts, - Map nanValueCounts, - Map lowerBounds, - Map upperBounds) { + public TestDataFile( + String path, + StructLike partition, + long recordCount, + Map valueCounts, + Map nullValueCounts, + Map nanValueCounts, + Map lowerBounds, + Map upperBounds) { this.path = path; this.partition = partition; this.recordCount = recordCount; diff --git a/api/src/test/java/org/apache/iceberg/TestIcebergBuild.java b/api/src/test/java/org/apache/iceberg/TestIcebergBuild.java index 9a6337a69cbd..d1697609d896 100644 --- a/api/src/test/java/org/apache/iceberg/TestIcebergBuild.java +++ b/api/src/test/java/org/apache/iceberg/TestIcebergBuild.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg; import java.util.Locale; @@ -40,8 +39,12 @@ public void testVersion() { @Test public void testGitCommitId() { - Assert.assertNotEquals("Should not use unknown commit ID", "unknown", IcebergBuild.gitCommitId()); - Assert.assertTrue("Should be a hexadecimal string of 20 bytes", - Pattern.compile("[0-9a-f]{40}").matcher(IcebergBuild.gitCommitId().toLowerCase(Locale.ROOT)).matches()); + Assert.assertNotEquals( + "Should not use unknown commit ID", "unknown", IcebergBuild.gitCommitId()); + Assert.assertTrue( + "Should be a hexadecimal string of 20 bytes", + Pattern.compile("[0-9a-f]{40}") + .matcher(IcebergBuild.gitCommitId().toLowerCase(Locale.ROOT)) + .matches()); } } diff --git a/api/src/test/java/org/apache/iceberg/TestMetricsSerialization.java b/api/src/test/java/org/apache/iceberg/TestMetricsSerialization.java index 788a2faa133c..4795eaedf9ab 100644 --- a/api/src/test/java/org/apache/iceberg/TestMetricsSerialization.java +++ b/api/src/test/java/org/apache/iceberg/TestMetricsSerialization.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg; import java.io.ByteArrayInputStream; @@ -113,7 +112,8 @@ private static void assertEquals(Metrics expected, Metrics actual) { assertEquals(expected.upperBounds(), actual.upperBounds()); } - private static void assertEquals(Map expected, Map actual) { + private static void assertEquals( + Map expected, Map actual) { if (expected == null) { Assert.assertNull(actual); } else { diff --git a/api/src/test/java/org/apache/iceberg/TestPartitionPaths.java b/api/src/test/java/org/apache/iceberg/TestPartitionPaths.java index 8f2dedc33d75..d5a8767b2640 100644 --- a/api/src/test/java/org/apache/iceberg/TestPartitionPaths.java +++ b/api/src/test/java/org/apache/iceberg/TestPartitionPaths.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg; import org.apache.iceberg.TestHelpers.Row; @@ -27,41 +26,40 @@ import org.junit.Test; public class TestPartitionPaths { - private static final Schema SCHEMA = new Schema( - Types.NestedField.required(1, "id", Types.IntegerType.get()), - Types.NestedField.optional(2, "data", Types.StringType.get()), - Types.NestedField.optional(3, "ts", Types.TimestampType.withoutZone()) - ); + private static final Schema SCHEMA = + new Schema( + Types.NestedField.required(1, "id", Types.IntegerType.get()), + Types.NestedField.optional(2, "data", Types.StringType.get()), + Types.NestedField.optional(3, "ts", Types.TimestampType.withoutZone())); @Test @SuppressWarnings("unchecked") public void testPartitionPath() { - PartitionSpec spec = PartitionSpec.builderFor(SCHEMA) - .hour("ts") - .bucket("id", 10) - .build(); + PartitionSpec spec = PartitionSpec.builderFor(SCHEMA).hour("ts").bucket("id", 10).build(); Transform hour = spec.getFieldsBySourceId(3).get(0).transform(); Transform bucket = spec.getFieldsBySourceId(1).get(0).transform(); - Literal ts = Literal.of("2017-12-01T10:12:55.038194").to(Types.TimestampType.withoutZone()); + Literal ts = + Literal.of("2017-12-01T10:12:55.038194").to(Types.TimestampType.withoutZone()); Object tsHour = hour.apply(ts.value()); Object idBucket = bucket.apply(1); Row partition = Row.of(tsHour, idBucket); - Assert.assertEquals("Should produce expected partition key", - "ts_hour=2017-12-01-10/id_bucket=" + idBucket, spec.partitionToPath(partition)); + Assert.assertEquals( + "Should produce expected partition key", + "ts_hour=2017-12-01-10/id_bucket=" + idBucket, + spec.partitionToPath(partition)); } @Test public void testEscapedStrings() { - PartitionSpec spec = PartitionSpec.builderFor(SCHEMA) - .identity("data") - .truncate("data", 10) - .build(); + PartitionSpec spec = + PartitionSpec.builderFor(SCHEMA).identity("data").truncate("data", 10).build(); - Assert.assertEquals("Should escape / as %2F", + Assert.assertEquals( + "Should escape / as %2F", "data=a%2Fb%2Fc%2Fd/data_trunc=a%2Fb%2Fc%2Fd", spec.partitionToPath(Row.of("a/b/c/d", "a/b/c/d"))); } diff --git a/api/src/test/java/org/apache/iceberg/TestPartitionSpecValidation.java b/api/src/test/java/org/apache/iceberg/TestPartitionSpecValidation.java index 4834e63903bd..14c2a9ab1b6e 100644 --- a/api/src/test/java/org/apache/iceberg/TestPartitionSpecValidation.java +++ b/api/src/test/java/org/apache/iceberg/TestPartitionSpecValidation.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg; import org.apache.iceberg.types.Types; @@ -25,73 +24,105 @@ import org.junit.Test; public class TestPartitionSpecValidation { - private static final Schema SCHEMA = new Schema( - NestedField.required(1, "id", Types.LongType.get()), - NestedField.required(2, "ts", Types.TimestampType.withZone()), - NestedField.required(3, "another_ts", Types.TimestampType.withZone()), - NestedField.required(4, "d", Types.TimestampType.withZone()), - NestedField.required(5, "another_d", Types.TimestampType.withZone()), - NestedField.required(6, "s", Types.StringType.get()) - ); + private static final Schema SCHEMA = + new Schema( + NestedField.required(1, "id", Types.LongType.get()), + NestedField.required(2, "ts", Types.TimestampType.withZone()), + NestedField.required(3, "another_ts", Types.TimestampType.withZone()), + NestedField.required(4, "d", Types.TimestampType.withZone()), + NestedField.required(5, "another_d", Types.TimestampType.withZone()), + NestedField.required(6, "s", Types.StringType.get())); @Test public void testMultipleTimestampPartitions() { - AssertHelpers.assertThrows("Should not allow year(ts) and year(ts)", - IllegalArgumentException.class, "Cannot use partition name more than once", + AssertHelpers.assertThrows( + "Should not allow year(ts) and year(ts)", + IllegalArgumentException.class, + "Cannot use partition name more than once", () -> PartitionSpec.builderFor(SCHEMA).year("ts").year("ts").build()); - AssertHelpers.assertThrows("Should not allow year(ts) and month(ts)", - IllegalArgumentException.class, "Cannot add redundant partition", + AssertHelpers.assertThrows( + "Should not allow year(ts) and month(ts)", + IllegalArgumentException.class, + "Cannot add redundant partition", () -> PartitionSpec.builderFor(SCHEMA).year("ts").month("ts").build()); - AssertHelpers.assertThrows("Should not allow year(ts) and day(ts)", - IllegalArgumentException.class, "Cannot add redundant partition", + AssertHelpers.assertThrows( + "Should not allow year(ts) and day(ts)", + IllegalArgumentException.class, + "Cannot add redundant partition", () -> PartitionSpec.builderFor(SCHEMA).year("ts").day("ts").build()); - AssertHelpers.assertThrows("Should not allow year(ts) and hour(ts)", - IllegalArgumentException.class, "Cannot add redundant partition", + AssertHelpers.assertThrows( + "Should not allow year(ts) and hour(ts)", + IllegalArgumentException.class, + "Cannot add redundant partition", () -> PartitionSpec.builderFor(SCHEMA).year("ts").hour("ts").build()); - AssertHelpers.assertThrows("Should not allow month(ts) and month(ts)", - IllegalArgumentException.class, "Cannot use partition name more than once", + AssertHelpers.assertThrows( + "Should not allow month(ts) and month(ts)", + IllegalArgumentException.class, + "Cannot use partition name more than once", () -> PartitionSpec.builderFor(SCHEMA).month("ts").month("ts").build()); - AssertHelpers.assertThrows("Should not allow month(ts) and day(ts)", - IllegalArgumentException.class, "Cannot add redundant partition", + AssertHelpers.assertThrows( + "Should not allow month(ts) and day(ts)", + IllegalArgumentException.class, + "Cannot add redundant partition", () -> PartitionSpec.builderFor(SCHEMA).month("ts").day("ts").build()); - AssertHelpers.assertThrows("Should not allow month(ts) and hour(ts)", - IllegalArgumentException.class, "Cannot add redundant partition", + AssertHelpers.assertThrows( + "Should not allow month(ts) and hour(ts)", + IllegalArgumentException.class, + "Cannot add redundant partition", () -> 
PartitionSpec.builderFor(SCHEMA).month("ts").hour("ts").build()); - AssertHelpers.assertThrows("Should not allow day(ts) and day(ts)", - IllegalArgumentException.class, "Cannot use partition name more than once", + AssertHelpers.assertThrows( + "Should not allow day(ts) and day(ts)", + IllegalArgumentException.class, + "Cannot use partition name more than once", () -> PartitionSpec.builderFor(SCHEMA).day("ts").day("ts").build()); - AssertHelpers.assertThrows("Should not allow day(ts) and hour(ts)", - IllegalArgumentException.class, "Cannot add redundant partition", + AssertHelpers.assertThrows( + "Should not allow day(ts) and hour(ts)", + IllegalArgumentException.class, + "Cannot add redundant partition", () -> PartitionSpec.builderFor(SCHEMA).day("ts").hour("ts").build()); - AssertHelpers.assertThrows("Should not allow hour(ts) and hour(ts)", - IllegalArgumentException.class, "Cannot use partition name more than once", + AssertHelpers.assertThrows( + "Should not allow hour(ts) and hour(ts)", + IllegalArgumentException.class, + "Cannot use partition name more than once", () -> PartitionSpec.builderFor(SCHEMA).hour("ts").hour("ts").build()); } @Test public void testMultipleDatePartitions() { - AssertHelpers.assertThrows("Should not allow year(d) and year(d)", - IllegalArgumentException.class, "Cannot use partition name more than once", + AssertHelpers.assertThrows( + "Should not allow year(d) and year(d)", + IllegalArgumentException.class, + "Cannot use partition name more than once", () -> PartitionSpec.builderFor(SCHEMA).year("d").year("d").build()); - AssertHelpers.assertThrows("Should not allow year(d) and month(d)", - IllegalArgumentException.class, "Cannot add redundant partition", + AssertHelpers.assertThrows( + "Should not allow year(d) and month(d)", + IllegalArgumentException.class, + "Cannot add redundant partition", () -> PartitionSpec.builderFor(SCHEMA).year("d").month("d").build()); - AssertHelpers.assertThrows("Should not allow year(d) and day(d)", - IllegalArgumentException.class, "Cannot add redundant partition", + AssertHelpers.assertThrows( + "Should not allow year(d) and day(d)", + IllegalArgumentException.class, + "Cannot add redundant partition", () -> PartitionSpec.builderFor(SCHEMA).year("d").day("d").build()); - AssertHelpers.assertThrows("Should not allow month(d) and month(d)", - IllegalArgumentException.class, "Cannot use partition name more than once", + AssertHelpers.assertThrows( + "Should not allow month(d) and month(d)", + IllegalArgumentException.class, + "Cannot use partition name more than once", () -> PartitionSpec.builderFor(SCHEMA).month("d").month("d").build()); - AssertHelpers.assertThrows("Should not allow month(d) and day(d)", - IllegalArgumentException.class, "Cannot add redundant partition", + AssertHelpers.assertThrows( + "Should not allow month(d) and day(d)", + IllegalArgumentException.class, + "Cannot add redundant partition", () -> PartitionSpec.builderFor(SCHEMA).month("d").day("d").build()); - AssertHelpers.assertThrows("Should not allow day(d) and day(d)", - IllegalArgumentException.class, "Cannot use partition name more than once", + AssertHelpers.assertThrows( + "Should not allow day(d) and day(d)", + IllegalArgumentException.class, + "Cannot use partition name more than once", () -> PartitionSpec.builderFor(SCHEMA).day("d").day("d").build()); } @@ -126,70 +157,100 @@ public void testMultipleDatePartitionsWithDifferentSourceColumns() { @Test public void testMultipleIdentityPartitions() { 
PartitionSpec.builderFor(SCHEMA).year("d").identity("id").identity("d").identity("s").build(); - AssertHelpers.assertThrows("Should not allow identity(id) and identity(id)", - IllegalArgumentException.class, "Cannot use partition name more than once", + AssertHelpers.assertThrows( + "Should not allow identity(id) and identity(id)", + IllegalArgumentException.class, + "Cannot use partition name more than once", () -> PartitionSpec.builderFor(SCHEMA).identity("id").identity("id").build()); - AssertHelpers.assertThrows("Should not allow identity(id) and identity(id, name)", - IllegalArgumentException.class, "Cannot add redundant partition", + AssertHelpers.assertThrows( + "Should not allow identity(id) and identity(id, name)", + IllegalArgumentException.class, + "Cannot add redundant partition", () -> PartitionSpec.builderFor(SCHEMA).identity("id").identity("id", "test-id").build()); - AssertHelpers.assertThrows("Should not allow identity(id) and identity(id, name)", - IllegalArgumentException.class, "Cannot use partition name more than once", - () -> PartitionSpec.builderFor(SCHEMA) - .identity("id", "test-id").identity("d", "test-id").build()); + AssertHelpers.assertThrows( + "Should not allow identity(id) and identity(id, name)", + IllegalArgumentException.class, + "Cannot use partition name more than once", + () -> + PartitionSpec.builderFor(SCHEMA) + .identity("id", "test-id") + .identity("d", "test-id") + .build()); } @Test public void testSettingPartitionTransformsWithCustomTargetNames() { - Assert.assertEquals(PartitionSpec.builderFor(SCHEMA).year("ts", "custom_year") - .build().fields().get(0).name(), "custom_year"); - Assert.assertEquals(PartitionSpec.builderFor(SCHEMA).month("ts", "custom_month") - .build().fields().get(0).name(), "custom_month"); - Assert.assertEquals(PartitionSpec.builderFor(SCHEMA).day("ts", "custom_day") - .build().fields().get(0).name(), "custom_day"); - Assert.assertEquals(PartitionSpec.builderFor(SCHEMA).hour("ts", "custom_hour") - .build().fields().get(0).name(), "custom_hour"); - Assert.assertEquals(PartitionSpec.builderFor(SCHEMA) - .bucket("ts", 4, "custom_bucket") - .build().fields().get(0).name(), "custom_bucket"); - Assert.assertEquals(PartitionSpec.builderFor(SCHEMA) - .truncate("s", 1, "custom_truncate") - .build().fields().get(0).name(), "custom_truncate"); + Assert.assertEquals( + PartitionSpec.builderFor(SCHEMA).year("ts", "custom_year").build().fields().get(0).name(), + "custom_year"); + Assert.assertEquals( + PartitionSpec.builderFor(SCHEMA).month("ts", "custom_month").build().fields().get(0).name(), + "custom_month"); + Assert.assertEquals( + PartitionSpec.builderFor(SCHEMA).day("ts", "custom_day").build().fields().get(0).name(), + "custom_day"); + Assert.assertEquals( + PartitionSpec.builderFor(SCHEMA).hour("ts", "custom_hour").build().fields().get(0).name(), + "custom_hour"); + Assert.assertEquals( + PartitionSpec.builderFor(SCHEMA) + .bucket("ts", 4, "custom_bucket") + .build() + .fields() + .get(0) + .name(), + "custom_bucket"); + Assert.assertEquals( + PartitionSpec.builderFor(SCHEMA) + .truncate("s", 1, "custom_truncate") + .build() + .fields() + .get(0) + .name(), + "custom_truncate"); } @Test public void testSettingPartitionTransformsWithCustomTargetNamesThatAlreadyExist() { - AssertHelpers.assertThrows("Should not allow target column name that exists in schema", + AssertHelpers.assertThrows( + "Should not allow target column name that exists in schema", IllegalArgumentException.class, "Cannot create partition from name that exists in 
schema: another_ts", () -> PartitionSpec.builderFor(SCHEMA).year("ts", "another_ts")); - AssertHelpers.assertThrows("Should not allow target column name that exists in schema", + AssertHelpers.assertThrows( + "Should not allow target column name that exists in schema", IllegalArgumentException.class, "Cannot create partition from name that exists in schema: another_ts", () -> PartitionSpec.builderFor(SCHEMA).month("ts", "another_ts")); - AssertHelpers.assertThrows("Should not allow target column name that exists in schema", + AssertHelpers.assertThrows( + "Should not allow target column name that exists in schema", IllegalArgumentException.class, "Cannot create partition from name that exists in schema: another_ts", () -> PartitionSpec.builderFor(SCHEMA).day("ts", "another_ts")); - AssertHelpers.assertThrows("Should not allow target column name that exists in schema", + AssertHelpers.assertThrows( + "Should not allow target column name that exists in schema", IllegalArgumentException.class, "Cannot create partition from name that exists in schema: another_ts", () -> PartitionSpec.builderFor(SCHEMA).hour("ts", "another_ts")); - AssertHelpers.assertThrows("Should not allow target column name that exists in schema", + AssertHelpers.assertThrows( + "Should not allow target column name that exists in schema", IllegalArgumentException.class, "Cannot create partition from name that exists in schema: another_ts", () -> PartitionSpec.builderFor(SCHEMA).truncate("ts", 2, "another_ts")); - AssertHelpers.assertThrows("Should not allow target column name that exists in schema", + AssertHelpers.assertThrows( + "Should not allow target column name that exists in schema", IllegalArgumentException.class, "Cannot create partition from name that exists in schema: another_ts", () -> PartitionSpec.builderFor(SCHEMA).bucket("ts", 4, "another_ts")); - AssertHelpers.assertThrows("Should not allow target column name sourced from a different column", + AssertHelpers.assertThrows( + "Should not allow target column name sourced from a different column", IllegalArgumentException.class, "Cannot create identity partition sourced from different field in schema: another_ts", () -> PartitionSpec.builderFor(SCHEMA).identity("ts", "another_ts")); @@ -197,37 +258,52 @@ public void testSettingPartitionTransformsWithCustomTargetNamesThatAlreadyExist( @Test public void testMissingSourceColumn() { - AssertHelpers.assertThrows("Should detect missing source column", - IllegalArgumentException.class, "Cannot find source column", + AssertHelpers.assertThrows( + "Should detect missing source column", + IllegalArgumentException.class, + "Cannot find source column", () -> PartitionSpec.builderFor(SCHEMA).year("missing").build()); - AssertHelpers.assertThrows("Should detect missing source column", - IllegalArgumentException.class, "Cannot find source column", + AssertHelpers.assertThrows( + "Should detect missing source column", + IllegalArgumentException.class, + "Cannot find source column", () -> PartitionSpec.builderFor(SCHEMA).month("missing").build()); - AssertHelpers.assertThrows("Should detect missing source column", - IllegalArgumentException.class, "Cannot find source column", + AssertHelpers.assertThrows( + "Should detect missing source column", + IllegalArgumentException.class, + "Cannot find source column", () -> PartitionSpec.builderFor(SCHEMA).day("missing").build()); - AssertHelpers.assertThrows("Should detect missing source column", - IllegalArgumentException.class, "Cannot find source column", + 
AssertHelpers.assertThrows( + "Should detect missing source column", + IllegalArgumentException.class, + "Cannot find source column", () -> PartitionSpec.builderFor(SCHEMA).hour("missing").build()); - AssertHelpers.assertThrows("Should detect missing source column", - IllegalArgumentException.class, "Cannot find source column", + AssertHelpers.assertThrows( + "Should detect missing source column", + IllegalArgumentException.class, + "Cannot find source column", () -> PartitionSpec.builderFor(SCHEMA).bucket("missing", 4).build()); - AssertHelpers.assertThrows("Should detect missing source column", - IllegalArgumentException.class, "Cannot find source column", + AssertHelpers.assertThrows( + "Should detect missing source column", + IllegalArgumentException.class, + "Cannot find source column", () -> PartitionSpec.builderFor(SCHEMA).truncate("missing", 5).build()); - AssertHelpers.assertThrows("Should detect missing source column", - IllegalArgumentException.class, "Cannot find source column", + AssertHelpers.assertThrows( + "Should detect missing source column", + IllegalArgumentException.class, + "Cannot find source column", () -> PartitionSpec.builderFor(SCHEMA).identity("missing").build()); } @Test public void testAutoSettingPartitionFieldIds() { - PartitionSpec spec = PartitionSpec.builderFor(SCHEMA) - .year("ts", "custom_year") - .bucket("ts", 4, "custom_bucket") - .add(1, "id_partition2", "bucket[4]") - .truncate("s", 1, "custom_truncate") - .build(); + PartitionSpec spec = + PartitionSpec.builderFor(SCHEMA) + .year("ts", "custom_year") + .bucket("ts", 4, "custom_bucket") + .add(1, "id_partition2", "bucket[4]") + .truncate("s", 1, "custom_truncate") + .build(); Assert.assertEquals(1000, spec.fields().get(0).fieldId()); Assert.assertEquals(1001, spec.fields().get(1).fieldId()); @@ -238,11 +314,12 @@ public void testAutoSettingPartitionFieldIds() { @Test public void testAddPartitionFieldsWithFieldIds() { - PartitionSpec spec = PartitionSpec.builderFor(SCHEMA) - .add(1, 1005, "id_partition1", "bucket[4]") - .add(1, 1006, "id_partition2", "bucket[5]") - .add(1, 1002, "id_partition3", "bucket[6]") - .build(); + PartitionSpec spec = + PartitionSpec.builderFor(SCHEMA) + .add(1, 1005, "id_partition1", "bucket[4]") + .add(1, 1006, "id_partition2", "bucket[5]") + .add(1, 1002, "id_partition3", "bucket[6]") + .build(); Assert.assertEquals(1005, spec.fields().get(0).fieldId()); Assert.assertEquals(1006, spec.fields().get(1).fieldId()); @@ -252,11 +329,12 @@ public void testAddPartitionFieldsWithFieldIds() { @Test public void testAddPartitionFieldsWithAndWithoutFieldIds() { - PartitionSpec spec = PartitionSpec.builderFor(SCHEMA) - .add(1, "id_partition2", "bucket[5]") - .add(1, 1005, "id_partition1", "bucket[4]") - .truncate("s", 1, "custom_truncate") - .build(); + PartitionSpec spec = + PartitionSpec.builderFor(SCHEMA) + .add(1, "id_partition2", "bucket[5]") + .add(1, 1005, "id_partition1", "bucket[4]") + .truncate("s", 1, "custom_truncate") + .build(); Assert.assertEquals(1000, spec.fields().get(0).fieldId()); Assert.assertEquals(1005, spec.fields().get(1).fieldId()); diff --git a/api/src/test/java/org/apache/iceberg/TestSnapshotRef.java b/api/src/test/java/org/apache/iceberg/TestSnapshotRef.java index f594f72d17a1..2b14168d1680 100644 --- a/api/src/test/java/org/apache/iceberg/TestSnapshotRef.java +++ b/api/src/test/java/org/apache/iceberg/TestSnapshotRef.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg; import org.junit.Assert; @@ -45,9 +44,7 @@ public void testBranchDefault() { @Test public void testTagWithOverride() { - SnapshotRef ref = SnapshotRef.branchBuilder(1L) - .maxRefAgeMs(10L) - .build(); + SnapshotRef ref = SnapshotRef.branchBuilder(1L).maxRefAgeMs(10L).build(); Assert.assertEquals(1L, ref.snapshotId()); Assert.assertEquals(SnapshotRefType.BRANCH, ref.type()); Assert.assertEquals(10L, (long) ref.maxRefAgeMs()); @@ -55,11 +52,12 @@ public void testTagWithOverride() { @Test public void testBranchWithOverride() { - SnapshotRef ref = SnapshotRef.branchBuilder(1L) - .minSnapshotsToKeep(10) - .maxSnapshotAgeMs(20L) - .maxRefAgeMs(30L) - .build(); + SnapshotRef ref = + SnapshotRef.branchBuilder(1L) + .minSnapshotsToKeep(10) + .maxSnapshotAgeMs(20L) + .maxRefAgeMs(30L) + .build(); Assert.assertEquals(1L, ref.snapshotId()); Assert.assertEquals(SnapshotRefType.BRANCH, ref.type()); Assert.assertEquals(10, (int) ref.minSnapshotsToKeep()); @@ -69,7 +67,8 @@ public void testBranchWithOverride() { @Test public void testNoTypeFailure() { - AssertHelpers.assertThrows("Snapshot reference type must be specified", + AssertHelpers.assertThrows( + "Snapshot reference type must be specified", IllegalArgumentException.class, "Snapshot reference type must not be null", () -> SnapshotRef.builderFor(1L, null).build()); @@ -77,49 +76,43 @@ public void testNoTypeFailure() { @Test public void testTagBuildFailures() { - AssertHelpers.assertThrows("Max reference age must be greater than 0 for tag", + AssertHelpers.assertThrows( + "Max reference age must be greater than 0 for tag", IllegalArgumentException.class, "Max reference age must be greater than 0", - () -> SnapshotRef.tagBuilder(1L) - .maxRefAgeMs(-1L) - .build()); + () -> SnapshotRef.tagBuilder(1L).maxRefAgeMs(-1L).build()); - AssertHelpers.assertThrows("Tags do not support setting minSnapshotsToKeep", + AssertHelpers.assertThrows( + "Tags do not support setting minSnapshotsToKeep", IllegalArgumentException.class, "Tags do not support setting minSnapshotsToKeep", - () -> SnapshotRef.tagBuilder(1L) - .minSnapshotsToKeep(2) - .build()); + () -> SnapshotRef.tagBuilder(1L).minSnapshotsToKeep(2).build()); - AssertHelpers.assertThrows("Tags do not support setting maxSnapshotAgeMs", + AssertHelpers.assertThrows( + "Tags do not support setting maxSnapshotAgeMs", IllegalArgumentException.class, "Tags do not support setting maxSnapshotAgeMs", - () -> SnapshotRef.tagBuilder(1L) - .maxSnapshotAgeMs(2L) - .build()); + () -> SnapshotRef.tagBuilder(1L).maxSnapshotAgeMs(2L).build()); } @Test public void testBranchBuildFailures() { - AssertHelpers.assertThrows("Max snapshot age must be greater than 0", + AssertHelpers.assertThrows( + "Max snapshot age must be greater than 0", IllegalArgumentException.class, "Max snapshot age must be greater than 0", - () -> SnapshotRef.branchBuilder(1L) - .maxSnapshotAgeMs(-1L) - .build()); + () -> SnapshotRef.branchBuilder(1L).maxSnapshotAgeMs(-1L).build()); - AssertHelpers.assertThrows("Min snapshots to keep must be greater than 0", + AssertHelpers.assertThrows( + "Min snapshots to keep must be greater than 0", IllegalArgumentException.class, "Min snapshots to keep must be greater than 0", - () -> SnapshotRef.branchBuilder(1L) - .minSnapshotsToKeep(-1) - .build()); + () -> SnapshotRef.branchBuilder(1L).minSnapshotsToKeep(-1).build()); - AssertHelpers.assertThrows("Max reference age must be greater than 0 for branch", + AssertHelpers.assertThrows( + "Max reference age must be greater than 0 for 
branch", IllegalArgumentException.class, "Max reference age must be greater than 0", - () -> SnapshotRef.branchBuilder(1L) - .maxRefAgeMs(-1L) - .build()); + () -> SnapshotRef.branchBuilder(1L).maxRefAgeMs(-1L).build()); } } diff --git a/api/src/test/java/org/apache/iceberg/TestTransformSerialization.java b/api/src/test/java/org/apache/iceberg/TestTransformSerialization.java index 74f4ea83108e..9494398d68dd 100644 --- a/api/src/test/java/org/apache/iceberg/TestTransformSerialization.java +++ b/api/src/test/java/org/apache/iceberg/TestTransformSerialization.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg; import org.junit.Assert; @@ -26,8 +25,10 @@ public class TestTransformSerialization extends PartitionSpecTestBase { @Test public void testTransforms() throws Exception { for (PartitionSpec spec : SPECS) { - Assert.assertEquals("Deserialization should produce equal partition spec", - spec, TestHelpers.roundTripSerialize(spec)); + Assert.assertEquals( + "Deserialization should produce equal partition spec", + spec, + TestHelpers.roundTripSerialize(spec)); } } } diff --git a/api/src/test/java/org/apache/iceberg/catalog/TestNamespace.java b/api/src/test/java/org/apache/iceberg/catalog/TestNamespace.java index cbb69ddfedaa..cb8547052ed1 100644 --- a/api/src/test/java/org/apache/iceberg/catalog/TestNamespace.java +++ b/api/src/test/java/org/apache/iceberg/catalog/TestNamespace.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.catalog; import org.apache.iceberg.AssertHelpers; diff --git a/api/src/test/java/org/apache/iceberg/catalog/TestTableIdentifier.java b/api/src/test/java/org/apache/iceberg/catalog/TestTableIdentifier.java index 39f05ddc1986..61eb41aa3460 100644 --- a/api/src/test/java/org/apache/iceberg/catalog/TestTableIdentifier.java +++ b/api/src/test/java/org/apache/iceberg/catalog/TestTableIdentifier.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.catalog; import org.assertj.core.api.Assertions; @@ -45,12 +44,9 @@ public void testTableIdentifierParsing() { @Test public void testToLowerCase() { + Assert.assertEquals(TableIdentifier.of("Tbl").toLowerCase(), TableIdentifier.of("tbl")); Assert.assertEquals( - TableIdentifier.of("Tbl").toLowerCase(), - TableIdentifier.of("tbl")); - Assert.assertEquals( - TableIdentifier.of("dB", "TBL").toLowerCase(), - TableIdentifier.of("db", "tbl")); + TableIdentifier.of("dB", "TBL").toLowerCase(), TableIdentifier.of("db", "tbl")); Assert.assertEquals( TableIdentifier.of("Catalog", "dB", "TBL").toLowerCase(), TableIdentifier.of("catalog", "db", "tbl")); diff --git a/api/src/test/java/org/apache/iceberg/events/TestListeners.java b/api/src/test/java/org/apache/iceberg/events/TestListeners.java index 301d36aba196..0c56a52bad4e 100644 --- a/api/src/test/java/org/apache/iceberg/events/TestListeners.java +++ b/api/src/test/java/org/apache/iceberg/events/TestListeners.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.events; import org.junit.Assert; @@ -28,11 +27,9 @@ public class TestListeners { Listeners.register(TestListener.get()::event2, Event2.class); } - public static class Event1 { - } + public static class Event1 {} - public static class Event2 { - } + public static class Event2 {} public static class TestListener { private static final TestListener INSTANCE = new TestListener(); diff --git a/api/src/test/java/org/apache/iceberg/expressions/TestEvaluator.java b/api/src/test/java/org/apache/iceberg/expressions/TestEvaluator.java index d8d26bbbda6e..3add078184a8 100644 --- a/api/src/test/java/org/apache/iceberg/expressions/TestEvaluator.java +++ b/api/src/test/java/org/apache/iceberg/expressions/TestEvaluator.java @@ -16,21 +16,8 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.expressions; -import java.util.Arrays; -import java.util.Collection; -import java.util.Collections; -import org.apache.avro.util.Utf8; -import org.apache.iceberg.AssertHelpers; -import org.apache.iceberg.TestHelpers; -import org.apache.iceberg.exceptions.ValidationException; -import org.apache.iceberg.types.Types; -import org.apache.iceberg.types.Types.StructType; -import org.junit.Assert; -import org.junit.Test; - import static org.apache.iceberg.expressions.Expressions.alwaysFalse; import static org.apache.iceberg.expressions.Expressions.alwaysTrue; import static org.apache.iceberg.expressions.Expressions.and; @@ -54,19 +41,51 @@ import static org.apache.iceberg.types.Types.NestedField.optional; import static org.apache.iceberg.types.Types.NestedField.required; +import java.util.Arrays; +import java.util.Collection; +import java.util.Collections; +import org.apache.avro.util.Utf8; +import org.apache.iceberg.AssertHelpers; +import org.apache.iceberg.TestHelpers; +import org.apache.iceberg.exceptions.ValidationException; +import org.apache.iceberg.types.Types; +import org.apache.iceberg.types.Types.StructType; +import org.junit.Assert; +import org.junit.Test; + public class TestEvaluator { - private static final StructType STRUCT = StructType.of( - required(13, "x", Types.IntegerType.get()), - required(14, "y", Types.DoubleType.get()), - optional(15, "z", Types.IntegerType.get()), - optional(16, "s1", Types.StructType.of( - Types.NestedField.required(17, "s2", Types.StructType.of( - Types.NestedField.required(18, "s3", Types.StructType.of( - Types.NestedField.required(19, "s4", Types.StructType.of( - Types.NestedField.required(20, "i", Types.IntegerType.get()))))))))), - optional(21, "s5", Types.StructType.of( - Types.NestedField.required(22, "s6", Types.StructType.of( - Types.NestedField.required(23, "f", Types.FloatType.get())))))); + private static final StructType STRUCT = + StructType.of( + required(13, "x", Types.IntegerType.get()), + required(14, "y", Types.DoubleType.get()), + optional(15, "z", Types.IntegerType.get()), + optional( + 16, + "s1", + Types.StructType.of( + Types.NestedField.required( + 17, + "s2", + Types.StructType.of( + Types.NestedField.required( + 18, + "s3", + Types.StructType.of( + Types.NestedField.required( + 19, + "s4", + Types.StructType.of( + Types.NestedField.required( + 20, "i", Types.IntegerType.get()))))))))), + optional( + 21, + "s5", + Types.StructType.of( + Types.NestedField.required( + 22, + "s6", + Types.StructType.of( + Types.NestedField.required(23, "f", Types.FloatType.get())))))); @Test public void testLessThan() { @@ -75,18 +94,24 @@ public void testLessThan() { 
Assert.assertTrue("6 < 7 => true", evaluator.eval(TestHelpers.Row.of(6, 8, null, null))); Evaluator structEvaluator = new Evaluator(STRUCT, lessThan("s1.s2.s3.s4.i", 7)); - Assert.assertFalse("7 < 7 => false", - structEvaluator.eval(TestHelpers.Row.of(7, 8, null, + Assert.assertFalse( + "7 < 7 => false", + structEvaluator.eval( TestHelpers.Row.of( + 7, + 8, + null, TestHelpers.Row.of( - TestHelpers.Row.of( - TestHelpers.Row.of(7))))))); - Assert.assertTrue("6 < 7 => true", - structEvaluator.eval(TestHelpers.Row.of(6, 8, null, + TestHelpers.Row.of(TestHelpers.Row.of(TestHelpers.Row.of(7))))))); + Assert.assertTrue( + "6 < 7 => true", + structEvaluator.eval( TestHelpers.Row.of( + 6, + 8, + null, TestHelpers.Row.of( - TestHelpers.Row.of( - TestHelpers.Row.of(6))))))); + TestHelpers.Row.of(TestHelpers.Row.of(TestHelpers.Row.of(6))))))); } @Test @@ -97,26 +122,35 @@ public void testLessThanOrEqual() { Assert.assertFalse("8 <= 7 => false", evaluator.eval(TestHelpers.Row.of(8, 8, null))); Evaluator structEvaluator = new Evaluator(STRUCT, lessThanOrEqual("s1.s2.s3.s4.i", 7)); - Assert.assertTrue("7 <= 7 => true", - structEvaluator.eval(TestHelpers.Row.of(7, 8, null, + Assert.assertTrue( + "7 <= 7 => true", + structEvaluator.eval( TestHelpers.Row.of( + 7, + 8, + null, TestHelpers.Row.of( - TestHelpers.Row.of( - TestHelpers.Row.of(7))))))); + TestHelpers.Row.of(TestHelpers.Row.of(TestHelpers.Row.of(7))))))); - Assert.assertTrue("6 <= 7 => true", - structEvaluator.eval(TestHelpers.Row.of(6, 8, null, + Assert.assertTrue( + "6 <= 7 => true", + structEvaluator.eval( TestHelpers.Row.of( + 6, + 8, + null, TestHelpers.Row.of( - TestHelpers.Row.of( - TestHelpers.Row.of(6))))))); + TestHelpers.Row.of(TestHelpers.Row.of(TestHelpers.Row.of(6))))))); - Assert.assertFalse("8 <= 7 => false", - structEvaluator.eval(TestHelpers.Row.of(6, 8, null, + Assert.assertFalse( + "8 <= 7 => false", + structEvaluator.eval( TestHelpers.Row.of( + 6, + 8, + null, TestHelpers.Row.of( - TestHelpers.Row.of( - TestHelpers.Row.of(8))))))); + TestHelpers.Row.of(TestHelpers.Row.of(TestHelpers.Row.of(8))))))); } @Test @@ -127,24 +161,33 @@ public void testGreaterThan() { Assert.assertTrue("8 > 7 => true", evaluator.eval(TestHelpers.Row.of(8, 8, null))); Evaluator structEvaluator = new Evaluator(STRUCT, greaterThan("s1.s2.s3.s4.i", 7)); - Assert.assertFalse("7 > 7 => false", - structEvaluator.eval(TestHelpers.Row.of(7, 8, null, + Assert.assertFalse( + "7 > 7 => false", + structEvaluator.eval( TestHelpers.Row.of( + 7, + 8, + null, TestHelpers.Row.of( - TestHelpers.Row.of( - TestHelpers.Row.of(7))))))); - Assert.assertFalse("6 > 7 => false", - structEvaluator.eval(TestHelpers.Row.of(7, 8, null, + TestHelpers.Row.of(TestHelpers.Row.of(TestHelpers.Row.of(7))))))); + Assert.assertFalse( + "6 > 7 => false", + structEvaluator.eval( TestHelpers.Row.of( + 7, + 8, + null, TestHelpers.Row.of( - TestHelpers.Row.of( - TestHelpers.Row.of(6))))))); - Assert.assertTrue("8 > 7 => true", - structEvaluator.eval(TestHelpers.Row.of(7, 8, null, + TestHelpers.Row.of(TestHelpers.Row.of(TestHelpers.Row.of(6))))))); + Assert.assertTrue( + "8 > 7 => true", + structEvaluator.eval( TestHelpers.Row.of( + 7, + 8, + null, TestHelpers.Row.of( - TestHelpers.Row.of( - TestHelpers.Row.of(8))))))); + TestHelpers.Row.of(TestHelpers.Row.of(TestHelpers.Row.of(8))))))); } @Test @@ -155,24 +198,33 @@ public void testGreaterThanOrEqual() { Assert.assertTrue("8 >= 7 => true", evaluator.eval(TestHelpers.Row.of(8, 8, null))); Evaluator structEvaluator = new Evaluator(STRUCT, 
greaterThanOrEqual("s1.s2.s3.s4.i", 7)); - Assert.assertTrue("7 >= 7 => true", - structEvaluator.eval(TestHelpers.Row.of(7, 8, null, + Assert.assertTrue( + "7 >= 7 => true", + structEvaluator.eval( TestHelpers.Row.of( + 7, + 8, + null, TestHelpers.Row.of( - TestHelpers.Row.of( - TestHelpers.Row.of(7))))))); - Assert.assertFalse("6 >= 7 => false", - structEvaluator.eval(TestHelpers.Row.of(7, 8, null, + TestHelpers.Row.of(TestHelpers.Row.of(TestHelpers.Row.of(7))))))); + Assert.assertFalse( + "6 >= 7 => false", + structEvaluator.eval( TestHelpers.Row.of( + 7, + 8, + null, TestHelpers.Row.of( - TestHelpers.Row.of( - TestHelpers.Row.of(6))))))); - Assert.assertTrue("8 >= 7 => true", - structEvaluator.eval(TestHelpers.Row.of(7, 8, null, + TestHelpers.Row.of(TestHelpers.Row.of(TestHelpers.Row.of(6))))))); + Assert.assertTrue( + "8 >= 7 => true", + structEvaluator.eval( TestHelpers.Row.of( + 7, + 8, + null, TestHelpers.Row.of( - TestHelpers.Row.of( - TestHelpers.Row.of(8))))))); + TestHelpers.Row.of(TestHelpers.Row.of(TestHelpers.Row.of(8))))))); } @Test @@ -184,18 +236,24 @@ public void testEqual() { Assert.assertFalse("6 == 7 => false", evaluator.eval(TestHelpers.Row.of(6, 8, null))); Evaluator structEvaluator = new Evaluator(STRUCT, equal("s1.s2.s3.s4.i", 7)); - Assert.assertTrue("7 == 7 => true", - structEvaluator.eval(TestHelpers.Row.of(7, 8, null, + Assert.assertTrue( + "7 == 7 => true", + structEvaluator.eval( TestHelpers.Row.of( + 7, + 8, + null, TestHelpers.Row.of( - TestHelpers.Row.of( - TestHelpers.Row.of(7))))))); - Assert.assertFalse("6 == 7 => false", - structEvaluator.eval(TestHelpers.Row.of(6, 8, null, + TestHelpers.Row.of(TestHelpers.Row.of(TestHelpers.Row.of(7))))))); + Assert.assertFalse( + "6 == 7 => false", + structEvaluator.eval( TestHelpers.Row.of( + 6, + 8, + null, TestHelpers.Row.of( - TestHelpers.Row.of( - TestHelpers.Row.of(6))))))); + TestHelpers.Row.of(TestHelpers.Row.of(TestHelpers.Row.of(6))))))); } @Test @@ -207,44 +265,59 @@ public void testNotEqual() { Assert.assertTrue("6 != 7 => true", evaluator.eval(TestHelpers.Row.of(6, 8, null))); Evaluator structEvaluator = new Evaluator(STRUCT, notEqual("s1.s2.s3.s4.i", 7)); - Assert.assertFalse("7 != 7 => false", - structEvaluator.eval(TestHelpers.Row.of(7, 8, null, + Assert.assertFalse( + "7 != 7 => false", + structEvaluator.eval( TestHelpers.Row.of( + 7, + 8, + null, TestHelpers.Row.of( - TestHelpers.Row.of( - TestHelpers.Row.of(7))))))); - Assert.assertTrue("6 != 7 => true", - structEvaluator.eval(TestHelpers.Row.of(6, 8, null, + TestHelpers.Row.of(TestHelpers.Row.of(TestHelpers.Row.of(7))))))); + Assert.assertTrue( + "6 != 7 => true", + structEvaluator.eval( TestHelpers.Row.of( + 6, + 8, + null, TestHelpers.Row.of( - TestHelpers.Row.of( - TestHelpers.Row.of(6))))))); - + TestHelpers.Row.of(TestHelpers.Row.of(TestHelpers.Row.of(6))))))); } @Test public void testStartsWith() { StructType struct = StructType.of(required(24, "s", Types.StringType.get())); Evaluator evaluator = new Evaluator(struct, startsWith("s", "abc")); - Assert.assertTrue("abc startsWith abc should be true", evaluator.eval(TestHelpers.Row.of("abc"))); - Assert.assertFalse("xabc startsWith abc should be false", evaluator.eval(TestHelpers.Row.of("xabc"))); - Assert.assertFalse("Abc startsWith abc should be false", evaluator.eval(TestHelpers.Row.of("Abc"))); + Assert.assertTrue( + "abc startsWith abc should be true", evaluator.eval(TestHelpers.Row.of("abc"))); + Assert.assertFalse( + "xabc startsWith abc should be false", 
evaluator.eval(TestHelpers.Row.of("xabc"))); + Assert.assertFalse( + "Abc startsWith abc should be false", evaluator.eval(TestHelpers.Row.of("Abc"))); Assert.assertFalse("a startsWith abc should be false", evaluator.eval(TestHelpers.Row.of("a"))); - Assert.assertTrue("abcd startsWith abc should be true", evaluator.eval(TestHelpers.Row.of("abcd"))); - Assert.assertFalse("null startsWith abc should be false", - evaluator.eval(TestHelpers.Row.of((String) null))); + Assert.assertTrue( + "abcd startsWith abc should be true", evaluator.eval(TestHelpers.Row.of("abcd"))); + Assert.assertFalse( + "null startsWith abc should be false", evaluator.eval(TestHelpers.Row.of((String) null))); } @Test public void testNotStartsWith() { StructType struct = StructType.of(required(24, "s", Types.StringType.get())); Evaluator evaluator = new Evaluator(struct, notStartsWith("s", "abc")); - Assert.assertFalse("abc notStartsWith abc should be false", evaluator.eval(TestHelpers.Row.of("abc"))); - Assert.assertTrue("xabc notStartsWith abc should be true", evaluator.eval(TestHelpers.Row.of("xabc"))); - Assert.assertTrue("Abc notStartsWith abc should be true", evaluator.eval(TestHelpers.Row.of("Abc"))); - Assert.assertTrue("a notStartsWith abc should be true", evaluator.eval(TestHelpers.Row.of("a"))); - Assert.assertFalse("abcde notStartsWith abc should be false", evaluator.eval(TestHelpers.Row.of("abcde"))); - Assert.assertTrue("Abcde notStartsWith abc should be true", evaluator.eval(TestHelpers.Row.of("Abcde"))); + Assert.assertFalse( + "abc notStartsWith abc should be false", evaluator.eval(TestHelpers.Row.of("abc"))); + Assert.assertTrue( + "xabc notStartsWith abc should be true", evaluator.eval(TestHelpers.Row.of("xabc"))); + Assert.assertTrue( + "Abc notStartsWith abc should be true", evaluator.eval(TestHelpers.Row.of("Abc"))); + Assert.assertTrue( + "a notStartsWith abc should be true", evaluator.eval(TestHelpers.Row.of("a"))); + Assert.assertFalse( + "abcde notStartsWith abc should be false", evaluator.eval(TestHelpers.Row.of("abcde"))); + Assert.assertTrue( + "Abcde notStartsWith abc should be true", evaluator.eval(TestHelpers.Row.of("Abcde"))); } @Test @@ -266,11 +339,15 @@ public void testIsNull() { Assert.assertFalse("3 is not null", evaluator.eval(TestHelpers.Row.of(1, 2, 3))); Evaluator structEvaluator = new Evaluator(STRUCT, isNull("s1.s2.s3.s4.i")); - Assert.assertFalse("3 is not null", structEvaluator.eval(TestHelpers.Row.of(1, 2, 3, - TestHelpers.Row.of( + Assert.assertFalse( + "3 is not null", + structEvaluator.eval( TestHelpers.Row.of( + 1, + 2, + 3, TestHelpers.Row.of( - TestHelpers.Row.of(3))))))); + TestHelpers.Row.of(TestHelpers.Row.of(TestHelpers.Row.of(3))))))); } @Test @@ -279,13 +356,16 @@ public void testNotNull() { Assert.assertFalse("null is null", evaluator.eval(TestHelpers.Row.of(1, 2, null))); Assert.assertTrue("3 is not null", evaluator.eval(TestHelpers.Row.of(1, 2, 3))); - Evaluator structEvaluator = new Evaluator(STRUCT, notNull("s1.s2.s3.s4.i")); - Assert.assertTrue("3 is not null", structEvaluator.eval(TestHelpers.Row.of(1, 2, 3, - TestHelpers.Row.of( + Assert.assertTrue( + "3 is not null", + structEvaluator.eval( TestHelpers.Row.of( + 1, + 2, + 3, TestHelpers.Row.of( - TestHelpers.Row.of(3))))))); + TestHelpers.Row.of(TestHelpers.Row.of(TestHelpers.Row.of(3))))))); } @Test @@ -295,10 +375,14 @@ public void testIsNan() { Assert.assertFalse("2 is not NaN", evaluator.eval(TestHelpers.Row.of(1, 2.0, 3))); Evaluator structEvaluator = new Evaluator(STRUCT, isNaN("s5.s6.f")); - 
Assert.assertTrue("NaN is NaN", structEvaluator.eval(TestHelpers.Row.of(1, 2, 3, null, - TestHelpers.Row.of(TestHelpers.Row.of(Float.NaN))))); - Assert.assertFalse("4F is not NaN", structEvaluator.eval(TestHelpers.Row.of(1, 2, 3, null, - TestHelpers.Row.of(TestHelpers.Row.of(4F))))); + Assert.assertTrue( + "NaN is NaN", + structEvaluator.eval( + TestHelpers.Row.of(1, 2, 3, null, TestHelpers.Row.of(TestHelpers.Row.of(Float.NaN))))); + Assert.assertFalse( + "4F is not NaN", + structEvaluator.eval( + TestHelpers.Row.of(1, 2, 3, null, TestHelpers.Row.of(TestHelpers.Row.of(4F))))); } @Test @@ -308,10 +392,14 @@ public void testNotNaN() { Assert.assertTrue("2 is not NaN", evaluator.eval(TestHelpers.Row.of(1, 2.0, 3))); Evaluator structEvaluator = new Evaluator(STRUCT, notNaN("s5.s6.f")); - Assert.assertFalse("NaN is NaN", structEvaluator.eval(TestHelpers.Row.of(1, 2, 3, null, - TestHelpers.Row.of(TestHelpers.Row.of(Float.NaN))))); - Assert.assertTrue("4F is not NaN", structEvaluator.eval(TestHelpers.Row.of(1, 2, 3, null, - TestHelpers.Row.of(TestHelpers.Row.of(4F))))); + Assert.assertFalse( + "NaN is NaN", + structEvaluator.eval( + TestHelpers.Row.of(1, 2, 3, null, TestHelpers.Row.of(TestHelpers.Row.of(Float.NaN))))); + Assert.assertTrue( + "4F is not NaN", + structEvaluator.eval( + TestHelpers.Row.of(1, 2, 3, null, TestHelpers.Row.of(TestHelpers.Row.of(4F))))); } @Test @@ -322,27 +410,40 @@ public void testAnd() { Assert.assertFalse("7, null => false", evaluator.eval(TestHelpers.Row.of(7, 0, null))); Assert.assertFalse("8, null => false", evaluator.eval(TestHelpers.Row.of(8, 0, null))); - Evaluator structEvaluator = new Evaluator(STRUCT, and(equal("s1.s2.s3.s4.i", 7), - notNull("s1.s2.s3.s4.i"))); + Evaluator structEvaluator = + new Evaluator(STRUCT, and(equal("s1.s2.s3.s4.i", 7), notNull("s1.s2.s3.s4.i"))); - Assert.assertTrue("7, 7 => true", structEvaluator.eval(TestHelpers.Row.of(7, 0, 3, - TestHelpers.Row.of( + Assert.assertTrue( + "7, 7 => true", + structEvaluator.eval( TestHelpers.Row.of( + 7, + 0, + 3, TestHelpers.Row.of( - TestHelpers.Row.of(7))))))); - Assert.assertFalse("8, 8 => false", structEvaluator.eval(TestHelpers.Row.of(8, 0, 3, - TestHelpers.Row.of( + TestHelpers.Row.of(TestHelpers.Row.of(TestHelpers.Row.of(7))))))); + Assert.assertFalse( + "8, 8 => false", + structEvaluator.eval( TestHelpers.Row.of( + 8, + 0, + 3, TestHelpers.Row.of( - TestHelpers.Row.of(8))))))); + TestHelpers.Row.of(TestHelpers.Row.of(TestHelpers.Row.of(8))))))); - Assert.assertFalse("7, null => false", structEvaluator.eval(TestHelpers.Row.of(7, 0, null, null))); + Assert.assertFalse( + "7, null => false", structEvaluator.eval(TestHelpers.Row.of(7, 0, null, null))); - Assert.assertFalse("8, notnull => false", structEvaluator.eval(TestHelpers.Row.of(8, 0, null, - TestHelpers.Row.of( + Assert.assertFalse( + "8, notnull => false", + structEvaluator.eval( TestHelpers.Row.of( + 8, + 0, + null, TestHelpers.Row.of( - TestHelpers.Row.of(8))))))); + TestHelpers.Row.of(TestHelpers.Row.of(TestHelpers.Row.of(8))))))); } @Test @@ -353,26 +454,37 @@ public void testOr() { Assert.assertTrue("7, null => true", evaluator.eval(TestHelpers.Row.of(7, 0, null))); Assert.assertFalse("8, null => false", evaluator.eval(TestHelpers.Row.of(8, 0, null))); + Evaluator structEvaluator = + new Evaluator(STRUCT, or(equal("s1.s2.s3.s4.i", 7), notNull("s1.s2.s3.s4.i"))); - Evaluator structEvaluator = new Evaluator(STRUCT, or(equal("s1.s2.s3.s4.i", 7), - notNull("s1.s2.s3.s4.i"))); - - Assert.assertTrue("7, 7 => true", 
structEvaluator.eval(TestHelpers.Row.of(7, 0, 3, - TestHelpers.Row.of( + Assert.assertTrue( + "7, 7 => true", + structEvaluator.eval( TestHelpers.Row.of( + 7, + 0, + 3, TestHelpers.Row.of( - TestHelpers.Row.of(7))))))); - Assert.assertTrue("8, 8 => false", structEvaluator.eval(TestHelpers.Row.of(8, 0, 3, - TestHelpers.Row.of( + TestHelpers.Row.of(TestHelpers.Row.of(TestHelpers.Row.of(7))))))); + Assert.assertTrue( + "8, 8 => false", + structEvaluator.eval( TestHelpers.Row.of( + 8, + 0, + 3, TestHelpers.Row.of( - TestHelpers.Row.of(8))))))); + TestHelpers.Row.of(TestHelpers.Row.of(TestHelpers.Row.of(8))))))); - Assert.assertTrue("7, notnull => false", structEvaluator.eval(TestHelpers.Row.of(7, 0, null, - TestHelpers.Row.of( + Assert.assertTrue( + "7, notnull => false", + structEvaluator.eval( TestHelpers.Row.of( + 7, + 0, + null, TestHelpers.Row.of( - TestHelpers.Row.of(7))))))); + TestHelpers.Row.of(TestHelpers.Row.of(TestHelpers.Row.of(7))))))); } @Test @@ -382,16 +494,24 @@ public void testNot() { Assert.assertTrue("not(8 == 7) => false", evaluator.eval(TestHelpers.Row.of(8))); Evaluator structEvaluator = new Evaluator(STRUCT, not(equal("s1.s2.s3.s4.i", 7))); - Assert.assertFalse("not(7 == 7) => false", structEvaluator.eval(TestHelpers.Row.of(7, null, null, - TestHelpers.Row.of( + Assert.assertFalse( + "not(7 == 7) => false", + structEvaluator.eval( TestHelpers.Row.of( + 7, + null, + null, TestHelpers.Row.of( - TestHelpers.Row.of(7))))))); - Assert.assertTrue("not(8 == 7) => false", structEvaluator.eval(TestHelpers.Row.of(8, null, null, - TestHelpers.Row.of( + TestHelpers.Row.of(TestHelpers.Row.of(TestHelpers.Row.of(7))))))); + Assert.assertTrue( + "not(8 == 7) => false", + structEvaluator.eval( TestHelpers.Row.of( + 8, + null, + null, TestHelpers.Row.of( - TestHelpers.Row.of(8))))))); + TestHelpers.Row.of(TestHelpers.Row.of(TestHelpers.Row.of(8))))))); } @Test @@ -401,16 +521,24 @@ public void testCaseInsensitiveNot() { Assert.assertTrue("not(8 == 7) => false", evaluator.eval(TestHelpers.Row.of(8))); Evaluator structEvaluator = new Evaluator(STRUCT, not(equal("s1.s2.s3.s4.i", 7)), false); - Assert.assertFalse("not(7 == 7) => false", structEvaluator.eval(TestHelpers.Row.of(7, null, null, - TestHelpers.Row.of( + Assert.assertFalse( + "not(7 == 7) => false", + structEvaluator.eval( TestHelpers.Row.of( + 7, + null, + null, TestHelpers.Row.of( - TestHelpers.Row.of(7))))))); - Assert.assertTrue("not(8 == 7) => false", structEvaluator.eval(TestHelpers.Row.of(8, null, null, - TestHelpers.Row.of( + TestHelpers.Row.of(TestHelpers.Row.of(TestHelpers.Row.of(7))))))); + Assert.assertTrue( + "not(8 == 7) => false", + structEvaluator.eval( TestHelpers.Row.of( + 8, + null, + null, TestHelpers.Row.of( - TestHelpers.Row.of(8))))))); + TestHelpers.Row.of(TestHelpers.Row.of(TestHelpers.Row.of(8))))))); } @Test @@ -426,10 +554,10 @@ public void testCaseSensitiveNot() { public void testCharSeqValue() { StructType struct = StructType.of(required(34, "s", Types.StringType.get())); Evaluator evaluator = new Evaluator(struct, equal("s", "abc")); - Assert.assertTrue("string(abc) == utf8(abc) => true", - evaluator.eval(TestHelpers.Row.of(new Utf8("abc")))); - Assert.assertFalse("string(abc) == utf8(abcd) => false", - evaluator.eval(TestHelpers.Row.of(new Utf8("abcd")))); + Assert.assertTrue( + "string(abc) == utf8(abc) => true", evaluator.eval(TestHelpers.Row.of(new Utf8("abc")))); + Assert.assertFalse( + "string(abc) == utf8(abcd) => false", evaluator.eval(TestHelpers.Row.of(new Utf8("abcd")))); } @Test @@ 
-447,41 +575,51 @@ public void testIn() { Assert.assertTrue("7 in [7, 8] => true", evaluator.eval(TestHelpers.Row.of(7, 8, null))); Assert.assertFalse("9 in [7, 8] => false", evaluator.eval(TestHelpers.Row.of(9, 8, null))); - Evaluator intSetEvaluator = new Evaluator(STRUCT, - in("x", Long.MAX_VALUE, Integer.MAX_VALUE, Long.MIN_VALUE)); - Assert.assertTrue("Integer.MAX_VALUE in [Integer.MAX_VALUE] => true", + Evaluator intSetEvaluator = + new Evaluator(STRUCT, in("x", Long.MAX_VALUE, Integer.MAX_VALUE, Long.MIN_VALUE)); + Assert.assertTrue( + "Integer.MAX_VALUE in [Integer.MAX_VALUE] => true", intSetEvaluator.eval(TestHelpers.Row.of(Integer.MAX_VALUE, 7.0, null))); - Assert.assertFalse("6 in [Integer.MAX_VALUE] => false", + Assert.assertFalse( + "6 in [Integer.MAX_VALUE] => false", intSetEvaluator.eval(TestHelpers.Row.of(6, 6.8, null))); Evaluator integerEvaluator = new Evaluator(STRUCT, in("y", 7, 8, 9.1)); - Assert.assertTrue("7.0 in [7, 8, 9.1] => true", - integerEvaluator.eval(TestHelpers.Row.of(0, 7.0, null))); - Assert.assertTrue("9.1 in [7, 8, 9.1] => true", - integerEvaluator.eval(TestHelpers.Row.of(7, 9.1, null))); - Assert.assertFalse("6.8 in [7, 8, 9.1] => false", - integerEvaluator.eval(TestHelpers.Row.of(6, 6.8, null))); + Assert.assertTrue( + "7.0 in [7, 8, 9.1] => true", integerEvaluator.eval(TestHelpers.Row.of(0, 7.0, null))); + Assert.assertTrue( + "9.1 in [7, 8, 9.1] => true", integerEvaluator.eval(TestHelpers.Row.of(7, 9.1, null))); + Assert.assertFalse( + "6.8 in [7, 8, 9.1] => false", integerEvaluator.eval(TestHelpers.Row.of(6, 6.8, null))); Evaluator structEvaluator = new Evaluator(STRUCT, in("s1.s2.s3.s4.i", 7, 8, 9)); - Assert.assertTrue("7 in [7, 8, 9] => true", - structEvaluator.eval(TestHelpers.Row.of(7, 8, null, - TestHelpers.Row.of( - TestHelpers.Row.of( - TestHelpers.Row.of( - TestHelpers.Row.of(7))))))); - Assert.assertFalse("6 in [7, 8, 9] => false", - structEvaluator.eval(TestHelpers.Row.of(6, 8, null, - TestHelpers.Row.of( - TestHelpers.Row.of( - TestHelpers.Row.of( - TestHelpers.Row.of(6))))))); + Assert.assertTrue( + "7 in [7, 8, 9] => true", + structEvaluator.eval( + TestHelpers.Row.of( + 7, + 8, + null, + TestHelpers.Row.of( + TestHelpers.Row.of(TestHelpers.Row.of(TestHelpers.Row.of(7))))))); + Assert.assertFalse( + "6 in [7, 8, 9] => false", + structEvaluator.eval( + TestHelpers.Row.of( + 6, + 8, + null, + TestHelpers.Row.of( + TestHelpers.Row.of(TestHelpers.Row.of(TestHelpers.Row.of(6))))))); StructType charSeqStruct = StructType.of(required(34, "s", Types.StringType.get())); Evaluator charSeqEvaluator = new Evaluator(charSeqStruct, in("s", "abc", "abd", "abc")); - Assert.assertTrue("utf8(abc) in [string(abc), string(abd)] => true", - charSeqEvaluator.eval(TestHelpers.Row.of(new Utf8("abc")))); - Assert.assertFalse("utf8(abcd) in [string(abc), string(abd)] => false", - charSeqEvaluator.eval(TestHelpers.Row.of(new Utf8("abcd")))); + Assert.assertTrue( + "utf8(abc) in [string(abc), string(abd)] => true", + charSeqEvaluator.eval(TestHelpers.Row.of(new Utf8("abc")))); + Assert.assertFalse( + "utf8(abcd) in [string(abc), string(abd)] => false", + charSeqEvaluator.eval(TestHelpers.Row.of(new Utf8("abcd")))); } @Test @@ -544,41 +682,51 @@ public void testNotIn() { Assert.assertFalse("7 not in [7, 8] => false", evaluator.eval(TestHelpers.Row.of(7, 8, null))); Assert.assertTrue("6 not in [7, 8] => true", evaluator.eval(TestHelpers.Row.of(9, 8, null))); - Evaluator intSetEvaluator = new Evaluator(STRUCT, - notIn("x", Long.MAX_VALUE, Integer.MAX_VALUE, 
Long.MIN_VALUE)); - Assert.assertFalse("Integer.MAX_VALUE not_in [Integer.MAX_VALUE] => false", + Evaluator intSetEvaluator = + new Evaluator(STRUCT, notIn("x", Long.MAX_VALUE, Integer.MAX_VALUE, Long.MIN_VALUE)); + Assert.assertFalse( + "Integer.MAX_VALUE not_in [Integer.MAX_VALUE] => false", intSetEvaluator.eval(TestHelpers.Row.of(Integer.MAX_VALUE, 7.0, null))); - Assert.assertTrue("6 not_in [Integer.MAX_VALUE] => true", + Assert.assertTrue( + "6 not_in [Integer.MAX_VALUE] => true", intSetEvaluator.eval(TestHelpers.Row.of(6, 6.8, null))); Evaluator integerEvaluator = new Evaluator(STRUCT, notIn("y", 7, 8, 9.1)); - Assert.assertFalse("7.0 not in [7, 8, 9] => false", - integerEvaluator.eval(TestHelpers.Row.of(0, 7.0, null))); - Assert.assertFalse("9.1 not in [7, 8, 9.1] => false", - integerEvaluator.eval(TestHelpers.Row.of(7, 9.1, null))); - Assert.assertTrue("6.8 not in [7, 8, 9.1] => true", - integerEvaluator.eval(TestHelpers.Row.of(6, 6.8, null))); + Assert.assertFalse( + "7.0 not in [7, 8, 9] => false", integerEvaluator.eval(TestHelpers.Row.of(0, 7.0, null))); + Assert.assertFalse( + "9.1 not in [7, 8, 9.1] => false", integerEvaluator.eval(TestHelpers.Row.of(7, 9.1, null))); + Assert.assertTrue( + "6.8 not in [7, 8, 9.1] => true", integerEvaluator.eval(TestHelpers.Row.of(6, 6.8, null))); Evaluator structEvaluator = new Evaluator(STRUCT, notIn("s1.s2.s3.s4.i", 7, 8, 9)); - Assert.assertFalse("7 not in [7, 8, 9] => false", - structEvaluator.eval(TestHelpers.Row.of(7, 8, null, - TestHelpers.Row.of( - TestHelpers.Row.of( - TestHelpers.Row.of( - TestHelpers.Row.of(7))))))); - Assert.assertTrue("6 not in [7, 8, 9] => true", - structEvaluator.eval(TestHelpers.Row.of(6, 8, null, - TestHelpers.Row.of( - TestHelpers.Row.of( - TestHelpers.Row.of( - TestHelpers.Row.of(6))))))); + Assert.assertFalse( + "7 not in [7, 8, 9] => false", + structEvaluator.eval( + TestHelpers.Row.of( + 7, + 8, + null, + TestHelpers.Row.of( + TestHelpers.Row.of(TestHelpers.Row.of(TestHelpers.Row.of(7))))))); + Assert.assertTrue( + "6 not in [7, 8, 9] => true", + structEvaluator.eval( + TestHelpers.Row.of( + 6, + 8, + null, + TestHelpers.Row.of( + TestHelpers.Row.of(TestHelpers.Row.of(TestHelpers.Row.of(6))))))); StructType charSeqStruct = StructType.of(required(34, "s", Types.StringType.get())); Evaluator charSeqEvaluator = new Evaluator(charSeqStruct, notIn("s", "abc", "abd", "abc")); - Assert.assertFalse("utf8(abc) not in [string(abc), string(abd)] => false", - charSeqEvaluator.eval(TestHelpers.Row.of(new Utf8("abc")))); - Assert.assertTrue("utf8(abcd) not in [string(abc), string(abd)] => true", - charSeqEvaluator.eval(TestHelpers.Row.of(new Utf8("abcd")))); + Assert.assertFalse( + "utf8(abc) not in [string(abc), string(abd)] => false", + charSeqEvaluator.eval(TestHelpers.Row.of(new Utf8("abc")))); + Assert.assertTrue( + "utf8(abcd) not in [string(abc), string(abd)] => true", + charSeqEvaluator.eval(TestHelpers.Row.of(new Utf8("abcd")))); } @Test diff --git a/api/src/test/java/org/apache/iceberg/expressions/TestExpressionBinding.java b/api/src/test/java/org/apache/iceberg/expressions/TestExpressionBinding.java index 50ed13dceee4..fad0b9157150 100644 --- a/api/src/test/java/org/apache/iceberg/expressions/TestExpressionBinding.java +++ b/api/src/test/java/org/apache/iceberg/expressions/TestExpressionBinding.java @@ -16,17 +16,8 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.expressions; -import org.apache.iceberg.TestHelpers; -import org.apache.iceberg.exceptions.ValidationException; -import org.apache.iceberg.types.Types; -import org.apache.iceberg.types.Types.StructType; -import org.assertj.core.api.Assertions; -import org.junit.Assert; -import org.junit.Test; - import static org.apache.iceberg.expressions.Expressions.alwaysFalse; import static org.apache.iceberg.expressions.Expressions.alwaysTrue; import static org.apache.iceberg.expressions.Expressions.and; @@ -40,13 +31,21 @@ import static org.apache.iceberg.expressions.Expressions.startsWith; import static org.apache.iceberg.types.Types.NestedField.required; +import org.apache.iceberg.TestHelpers; +import org.apache.iceberg.exceptions.ValidationException; +import org.apache.iceberg.types.Types; +import org.apache.iceberg.types.Types.StructType; +import org.assertj.core.api.Assertions; +import org.junit.Assert; +import org.junit.Test; + public class TestExpressionBinding { - private static final StructType STRUCT = StructType.of( - required(0, "x", Types.IntegerType.get()), - required(1, "y", Types.IntegerType.get()), - required(2, "z", Types.IntegerType.get()), - required(3, "data", Types.StringType.get()) - ); + private static final StructType STRUCT = + StructType.of( + required(0, "x", Types.IntegerType.get()), + required(1, "y", Types.IntegerType.get()), + required(2, "z", Types.IntegerType.get()), + required(3, "data", Types.StringType.get())); @Test public void testMissingReference() { @@ -156,22 +155,21 @@ public void testNotStartsWith() { TestHelpers.assertAllReferencesBound("NotStartsWith", boundExpr); // Make sure the expression is a NotStartsWith BoundPredicate pred = TestHelpers.assertAndUnwrap(boundExpr, BoundPredicate.class); - Assert.assertEquals("Should be right operation", Expression.Operation.NOT_STARTS_WITH, pred.op()); + Assert.assertEquals( + "Should be right operation", Expression.Operation.NOT_STARTS_WITH, pred.op()); Assert.assertEquals("Should bind term to correct field id", 21, pred.term().ref().fieldId()); } @Test public void testAlwaysTrue() { - Assert.assertEquals("Should not change alwaysTrue", - alwaysTrue(), - Binder.bind(STRUCT, alwaysTrue())); + Assert.assertEquals( + "Should not change alwaysTrue", alwaysTrue(), Binder.bind(STRUCT, alwaysTrue())); } @Test public void testAlwaysFalse() { - Assert.assertEquals("Should not change alwaysFalse", - alwaysFalse(), - Binder.bind(STRUCT, alwaysFalse())); + Assert.assertEquals( + "Should not change alwaysFalse", alwaysFalse(), Binder.bind(STRUCT, alwaysFalse())); } @Test @@ -181,11 +179,15 @@ public void testBasicSimplification() { // the second predicate is always true once it is bound because z is an integer and the literal // is less than any 32-bit integer value - Assert.assertEquals("Should simplify or expression to alwaysTrue", - alwaysTrue(), Binder.bind(STRUCT, or(lessThan("y", 100), greaterThan("z", -9999999999L)))); + Assert.assertEquals( + "Should simplify or expression to alwaysTrue", + alwaysTrue(), + Binder.bind(STRUCT, or(lessThan("y", 100), greaterThan("z", -9999999999L)))); // similarly, the second predicate is always false - Assert.assertEquals("Should simplify and expression to predicate", - alwaysFalse(), Binder.bind(STRUCT, and(lessThan("y", 100), lessThan("z", -9999999999L)))); + Assert.assertEquals( + "Should simplify and expression to predicate", + alwaysFalse(), + Binder.bind(STRUCT, and(lessThan("y", 100), lessThan("z", -9999999999L)))); Expression bound = 
Binder.bind(STRUCT, not(not(lessThan("y", 100)))); BoundPredicate pred = TestHelpers.assertAndUnwrap(bound); @@ -197,8 +199,11 @@ public void testTransformExpressionBinding() { Expression bound = Binder.bind(STRUCT, equal(bucket("x", 16), 10)); TestHelpers.assertAllReferencesBound("BoundTransform", bound); BoundPredicate pred = TestHelpers.assertAndUnwrap(bound); - Assertions.assertThat(pred.term()).as("Should use a BoundTransform child").isInstanceOf(BoundTransform.class); + Assertions.assertThat(pred.term()) + .as("Should use a BoundTransform child") + .isInstanceOf(BoundTransform.class); BoundTransform transformExpr = (BoundTransform) pred.term(); - Assert.assertEquals("Should use a bucket[16] transform", "bucket[16]", transformExpr.transform().toString()); + Assert.assertEquals( + "Should use a bucket[16] transform", "bucket[16]", transformExpr.transform().toString()); } } diff --git a/api/src/test/java/org/apache/iceberg/expressions/TestExpressionHelpers.java b/api/src/test/java/org/apache/iceberg/expressions/TestExpressionHelpers.java index 4a67a5af11b6..62f6ff9b21a0 100644 --- a/api/src/test/java/org/apache/iceberg/expressions/TestExpressionHelpers.java +++ b/api/src/test/java/org/apache/iceberg/expressions/TestExpressionHelpers.java @@ -16,18 +16,8 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.expressions; -import java.util.concurrent.Callable; -import org.apache.iceberg.AssertHelpers; -import org.apache.iceberg.transforms.Transforms; -import org.apache.iceberg.types.Types; -import org.apache.iceberg.types.Types.NestedField; -import org.apache.iceberg.types.Types.StructType; -import org.junit.Assert; -import org.junit.Test; - import static org.apache.iceberg.expressions.Expressions.alwaysFalse; import static org.apache.iceberg.expressions.Expressions.alwaysTrue; import static org.apache.iceberg.expressions.Expressions.and; @@ -55,145 +45,156 @@ import static org.apache.iceberg.expressions.Expressions.truncate; import static org.apache.iceberg.expressions.Expressions.year; +import java.util.concurrent.Callable; +import org.apache.iceberg.AssertHelpers; +import org.apache.iceberg.transforms.Transforms; +import org.apache.iceberg.types.Types; +import org.apache.iceberg.types.Types.NestedField; +import org.apache.iceberg.types.Types.StructType; +import org.junit.Assert; +import org.junit.Test; + public class TestExpressionHelpers { private final UnboundPredicate pred = lessThan("x", 7); @Test public void testSimplifyOr() { - Assert.assertEquals("alwaysTrue or pred => alwaysTrue", - alwaysTrue(), or(alwaysTrue(), pred)); - Assert.assertEquals("pred or alwaysTrue => alwaysTrue", - alwaysTrue(), or(pred, alwaysTrue())); - - Assert.assertEquals("alwaysFalse or pred => pred", - pred, or(alwaysFalse(), pred)); - Assert.assertEquals("pred or alwaysTrue => pred", - pred, or(pred, alwaysFalse())); + Assert.assertEquals("alwaysTrue or pred => alwaysTrue", alwaysTrue(), or(alwaysTrue(), pred)); + Assert.assertEquals("pred or alwaysTrue => alwaysTrue", alwaysTrue(), or(pred, alwaysTrue())); + + Assert.assertEquals("alwaysFalse or pred => pred", pred, or(alwaysFalse(), pred)); + Assert.assertEquals("pred or alwaysTrue => pred", pred, or(pred, alwaysFalse())); } @Test public void testSimplifyAnd() { - Assert.assertEquals("alwaysTrue and pred => pred", - pred, and(alwaysTrue(), pred)); - Assert.assertEquals("pred and alwaysTrue => pred", - pred, and(pred, alwaysTrue())); - - Assert.assertEquals("alwaysFalse and pred => alwaysFalse", 
- alwaysFalse(), and(alwaysFalse(), pred)); - Assert.assertEquals("pred and alwaysFalse => alwaysFalse", - alwaysFalse(), and(pred, alwaysFalse())); + Assert.assertEquals("alwaysTrue and pred => pred", pred, and(alwaysTrue(), pred)); + Assert.assertEquals("pred and alwaysTrue => pred", pred, and(pred, alwaysTrue())); + + Assert.assertEquals( + "alwaysFalse and pred => alwaysFalse", alwaysFalse(), and(alwaysFalse(), pred)); + Assert.assertEquals( + "pred and alwaysFalse => alwaysFalse", alwaysFalse(), and(pred, alwaysFalse())); } @Test public void testSimplifyNot() { - Assert.assertEquals("not(alwaysTrue) => alwaysFalse", - alwaysFalse(), not(alwaysTrue())); - Assert.assertEquals("not(alwaysFalse) => alwaysTrue", - alwaysTrue(), not(alwaysFalse())); + Assert.assertEquals("not(alwaysTrue) => alwaysFalse", alwaysFalse(), not(alwaysTrue())); + Assert.assertEquals("not(alwaysFalse) => alwaysTrue", alwaysTrue(), not(alwaysFalse())); - Assert.assertEquals("not(not(pred)) => pred", - pred, not(not(pred))); + Assert.assertEquals("not(not(pred)) => pred", pred, not(not(pred))); } @Test public void testRewriteNot() { - StructType struct = StructType.of( - NestedField.optional(1, "a", Types.IntegerType.get()), - NestedField.optional(2, "s", Types.StringType.get())); - Expression[][] expressions = new Expression[][] { - // (rewritten pred, original pred) pairs - { isNull("a"), isNull("a") }, - { notNull("a"), not(isNull("a")) }, - { notNull("a"), notNull("a") }, - { isNull("a"), not(notNull("a")) }, - { equal("a", 5), equal("a", 5) }, - { notEqual("a", 5), not(equal("a", 5)) }, - { notEqual("a", 5), notEqual("a", 5) }, - { equal("a", 5), not(notEqual("a", 5)) }, - { in("a", 5, 6), in("a", 5, 6) }, - { notIn("a", 5, 6), not(in("a", 5, 6)) }, - { notIn("a", 5, 6), notIn("a", 5, 6) }, - { in("a", 5, 6), not(notIn("a", 5, 6)) }, - { lessThan("a", 5), lessThan("a", 5) }, - { greaterThanOrEqual("a", 5), not(lessThan("a", 5)) }, - { greaterThanOrEqual("a", 5), greaterThanOrEqual("a", 5) }, - { lessThan("a", 5), not(greaterThanOrEqual("a", 5)) }, - { lessThanOrEqual("a", 5), lessThanOrEqual("a", 5) }, - { greaterThan("a", 5), not(lessThanOrEqual("a", 5)) }, - { greaterThan("a", 5), greaterThan("a", 5) }, - { lessThanOrEqual("a", 5), not(greaterThan("a", 5)) }, - { or(equal("a", 5), isNull("a")), or(equal("a", 5), isNull("a")) }, - { and(notEqual("a", 5), notNull("a")), not(or(equal("a", 5), isNull("a"))) }, - { and(notEqual("a", 5), notNull("a")), and(notEqual("a", 5), notNull("a")) }, - { or(equal("a", 5), isNull("a")), not(and(notEqual("a", 5), notNull("a"))) }, - { or(equal("a", 5), notNull("a")), or(equal("a", 5), not(isNull("a"))) }, - { startsWith("s", "hello"), not(notStartsWith("s", "hello")) }, - { notStartsWith("s", "world"), not(startsWith("s", "world")) } - }; + StructType struct = + StructType.of( + NestedField.optional(1, "a", Types.IntegerType.get()), + NestedField.optional(2, "s", Types.StringType.get())); + Expression[][] expressions = + new Expression[][] { + // (rewritten pred, original pred) pairs + {isNull("a"), isNull("a")}, + {notNull("a"), not(isNull("a"))}, + {notNull("a"), notNull("a")}, + {isNull("a"), not(notNull("a"))}, + {equal("a", 5), equal("a", 5)}, + {notEqual("a", 5), not(equal("a", 5))}, + {notEqual("a", 5), notEqual("a", 5)}, + {equal("a", 5), not(notEqual("a", 5))}, + {in("a", 5, 6), in("a", 5, 6)}, + {notIn("a", 5, 6), not(in("a", 5, 6))}, + {notIn("a", 5, 6), notIn("a", 5, 6)}, + {in("a", 5, 6), not(notIn("a", 5, 6))}, + {lessThan("a", 5), lessThan("a", 5)}, + 
{greaterThanOrEqual("a", 5), not(lessThan("a", 5))}, + {greaterThanOrEqual("a", 5), greaterThanOrEqual("a", 5)}, + {lessThan("a", 5), not(greaterThanOrEqual("a", 5))}, + {lessThanOrEqual("a", 5), lessThanOrEqual("a", 5)}, + {greaterThan("a", 5), not(lessThanOrEqual("a", 5))}, + {greaterThan("a", 5), greaterThan("a", 5)}, + {lessThanOrEqual("a", 5), not(greaterThan("a", 5))}, + {or(equal("a", 5), isNull("a")), or(equal("a", 5), isNull("a"))}, + {and(notEqual("a", 5), notNull("a")), not(or(equal("a", 5), isNull("a")))}, + {and(notEqual("a", 5), notNull("a")), and(notEqual("a", 5), notNull("a"))}, + {or(equal("a", 5), isNull("a")), not(and(notEqual("a", 5), notNull("a")))}, + {or(equal("a", 5), notNull("a")), or(equal("a", 5), not(isNull("a")))}, + {startsWith("s", "hello"), not(notStartsWith("s", "hello"))}, + {notStartsWith("s", "world"), not(startsWith("s", "world"))} + }; for (Expression[] pair : expressions) { // unbound rewrite - Assert.assertEquals(String.format("rewriteNot(%s) should be %s", pair[1], pair[0]), - pair[0].toString(), rewriteNot(pair[1]).toString()); + Assert.assertEquals( + String.format("rewriteNot(%s) should be %s", pair[1], pair[0]), + pair[0].toString(), + rewriteNot(pair[1]).toString()); // bound rewrite Expression expectedBound = Binder.bind(struct, pair[0]); Expression toRewriteBound = Binder.bind(struct, pair[1]); - Assert.assertEquals(String.format("rewriteNot(%s) should be %s", toRewriteBound, expectedBound), - expectedBound.toString(), rewriteNot(toRewriteBound).toString()); + Assert.assertEquals( + String.format("rewriteNot(%s) should be %s", toRewriteBound, expectedBound), + expectedBound.toString(), + rewriteNot(toRewriteBound).toString()); } } @Test public void testTransformExpressions() { - Assert.assertEquals("Should produce the correct expression string", + Assert.assertEquals( + "Should produce the correct expression string", "year(ref(name=\"ts\")) == \"2019\"", equal(year("ts"), "2019").toString()); - Assert.assertEquals("Should produce the correct expression string", + Assert.assertEquals( + "Should produce the correct expression string", "month(ref(name=\"ts\")) == 1234", equal(month("ts"), 1234).toString()); - Assert.assertEquals("Should produce the correct expression string", + Assert.assertEquals( + "Should produce the correct expression string", "day(ref(name=\"ts\")) == \"2019-12-04\"", equal(day("ts"), "2019-12-04").toString()); - Assert.assertEquals("Should produce the correct expression string", + Assert.assertEquals( + "Should produce the correct expression string", "hour(ref(name=\"ts\")) == \"2019-12-04-10\"", equal(hour("ts"), "2019-12-04-10").toString()); - Assert.assertEquals("Should produce the correct expression string", + Assert.assertEquals( + "Should produce the correct expression string", "truncate[6](ref(name=\"str\")) == \"abcdef\"", equal(truncate("str", 6), "abcdef").toString()); - Assert.assertEquals("Should produce the correct expression string", + Assert.assertEquals( + "Should produce the correct expression string", "truncate[5](ref(name=\"i\")) == 10", equal(truncate("i", 5), 10).toString()); - Assert.assertEquals("Should produce the correct expression string", + Assert.assertEquals( + "Should produce the correct expression string", "bucket[16](ref(name=\"id\")) == 12", equal(bucket("id", 16), 12).toString()); } @Test public void testNullName() { - AssertHelpers.assertThrows("Should catch null column names when creating expressions", - NullPointerException.class, "Name cannot be null", () -> equal((String) null, 5)); + 
AssertHelpers.assertThrows( + "Should catch null column names when creating expressions", + NullPointerException.class, + "Name cannot be null", + () -> equal((String) null, 5)); } @Test public void testNullValueExpr() { - AssertHelpers.assertThrows("Should catch null value expressions", - NullPointerException.class, "Term cannot be null", + AssertHelpers.assertThrows( + "Should catch null value expressions", + NullPointerException.class, + "Term cannot be null", () -> equal((UnboundTerm) null, 5)); } @Test public void testMultiAnd() { - Expression expected = and( - and( - equal("a", 1), - equal("b", 2)), - equal("c", 3)); + Expression expected = and(and(equal("a", 1), equal("b", 2)), equal("c", 3)); - Expression actual = and( - equal("a", 1), - equal("b", 2), - equal("c", 3)); + Expression actual = and(equal("a", 1), equal("b", 2), equal("c", 3)); Assert.assertEquals(expected.toString(), actual.toString()); } @@ -228,13 +229,14 @@ public void testInvalidateNaNInput() { } private void assertInvalidateNaNThrows(Callable> callable) { - AssertHelpers.assertThrows("Should invalidate NaN input", - IllegalArgumentException.class, "Cannot create expression literal from NaN", + AssertHelpers.assertThrows( + "Should invalidate NaN input", + IllegalArgumentException.class, + "Cannot create expression literal from NaN", callable); } private UnboundTerm self(String name) { return new UnboundTransform<>(ref(name), Transforms.identity(Types.DoubleType.get())); } - } diff --git a/api/src/test/java/org/apache/iceberg/expressions/TestExpressionSerialization.java b/api/src/test/java/org/apache/iceberg/expressions/TestExpressionSerialization.java index 42c9a89299b0..d8a30c5fcf52 100644 --- a/api/src/test/java/org/apache/iceberg/expressions/TestExpressionSerialization.java +++ b/api/src/test/java/org/apache/iceberg/expressions/TestExpressionSerialization.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.expressions; import java.util.Collection; @@ -30,38 +29,39 @@ public class TestExpressionSerialization { @Test public void testExpressions() throws Exception { - Schema schema = new Schema( - Types.NestedField.optional(34, "a", Types.IntegerType.get()), - Types.NestedField.required(35, "s", Types.StringType.get()) - ); - - Expression[] expressions = new Expression[] { - Expressions.alwaysFalse(), - Expressions.alwaysTrue(), - Expressions.lessThan("x", 5), - Expressions.lessThanOrEqual("y", -3), - Expressions.greaterThan("z", 0), - Expressions.greaterThanOrEqual("t", 129), - Expressions.equal("col", "data"), - Expressions.in("col", "a", "b"), - Expressions.notIn("col", 1, 2, 3), - Expressions.notEqual("col", "abc"), - Expressions.notNull("maybeNull"), - Expressions.isNull("maybeNull2"), - Expressions.isNaN("maybeNaN"), - Expressions.notNaN("maybeNaN2"), - Expressions.startsWith("col", "abc"), - Expressions.notStartsWith("col", "abc"), - Expressions.not(Expressions.greaterThan("a", 10)), - Expressions.and(Expressions.greaterThanOrEqual("a", 0), Expressions.lessThan("a", 3)), - Expressions.or(Expressions.lessThan("a", 0), Expressions.greaterThan("a", 10)), - Expressions.equal("a", 5).bind(schema.asStruct()), - Expressions.in("a", 5, 6, 7).bind(schema.asStruct()), - Expressions.notIn("s", "abc", "xyz").bind(schema.asStruct()), - Expressions.isNull("a").bind(schema.asStruct()), - Expressions.startsWith("s", "abc").bind(schema.asStruct()), - Expressions.notStartsWith("s", "xyz").bind(schema.asStruct()) - }; + Schema schema = + new Schema( + Types.NestedField.optional(34, "a", Types.IntegerType.get()), + Types.NestedField.required(35, "s", Types.StringType.get())); + + Expression[] expressions = + new Expression[] { + Expressions.alwaysFalse(), + Expressions.alwaysTrue(), + Expressions.lessThan("x", 5), + Expressions.lessThanOrEqual("y", -3), + Expressions.greaterThan("z", 0), + Expressions.greaterThanOrEqual("t", 129), + Expressions.equal("col", "data"), + Expressions.in("col", "a", "b"), + Expressions.notIn("col", 1, 2, 3), + Expressions.notEqual("col", "abc"), + Expressions.notNull("maybeNull"), + Expressions.isNull("maybeNull2"), + Expressions.isNaN("maybeNaN"), + Expressions.notNaN("maybeNaN2"), + Expressions.startsWith("col", "abc"), + Expressions.notStartsWith("col", "abc"), + Expressions.not(Expressions.greaterThan("a", 10)), + Expressions.and(Expressions.greaterThanOrEqual("a", 0), Expressions.lessThan("a", 3)), + Expressions.or(Expressions.lessThan("a", 0), Expressions.greaterThan("a", 10)), + Expressions.equal("a", 5).bind(schema.asStruct()), + Expressions.in("a", 5, 6, 7).bind(schema.asStruct()), + Expressions.notIn("s", "abc", "xyz").bind(schema.asStruct()), + Expressions.isNull("a").bind(schema.asStruct()), + Expressions.startsWith("s", "abc").bind(schema.asStruct()), + Expressions.notStartsWith("s", "xyz").bind(schema.asStruct()) + }; for (Expression expression : expressions) { Expression copy = TestHelpers.roundTripSerialize(expression); @@ -95,11 +95,11 @@ private static boolean equals(Expression left, Expression right) { case NOT: return equals(((Not) left).child(), ((Not) right).child()); case AND: - return equals(((And) left).left(), ((And) right).left()) && - equals(((And) left).right(), ((And) right).right()); + return equals(((And) left).left(), ((And) right).left()) + && equals(((And) left).right(), ((And) right).right()); case OR: - return equals(((Or) left).left(), ((Or) right).left()) && - equals(((Or) left).right(), ((Or) 
right).right()); + return equals(((Or) left).left(), ((Or) right).left()) + && equals(((Or) left).right(), ((Or) right).right()); default: return false; } @@ -111,16 +111,16 @@ private static boolean equals(Term left, Term right) { } else if (left instanceof UnboundTransform && right instanceof UnboundTransform) { UnboundTransform unboundLeft = (UnboundTransform) left; UnboundTransform unboundRight = (UnboundTransform) right; - if (equals(unboundLeft.ref(), unboundRight.ref()) && - unboundLeft.transform().toString().equals(unboundRight.transform().toString())) { + if (equals(unboundLeft.ref(), unboundRight.ref()) + && unboundLeft.transform().toString().equals(unboundRight.transform().toString())) { return true; } } else if (left instanceof BoundTransform && right instanceof BoundTransform) { BoundTransform boundLeft = (BoundTransform) left; BoundTransform boundRight = (BoundTransform) right; - if (equals(boundLeft.ref(), boundRight.ref()) && - boundLeft.transform().toString().equals(boundRight.transform().toString())) { + if (equals(boundLeft.ref(), boundRight.ref()) + && boundLeft.transform().toString().equals(boundRight.transform().toString())) { return true; } } @@ -138,8 +138,10 @@ private static boolean equals(Predicate left, Predicate right) { return false; } - if (left.op() == Operation.IS_NULL || left.op() == Operation.NOT_NULL || - left.op() == Operation.IS_NAN || left.op() == Operation.NOT_NAN) { + if (left.op() == Operation.IS_NULL + || left.op() == Operation.NOT_NULL + || left.op() == Operation.IS_NAN + || left.op() == Operation.NOT_NAN) { return true; } @@ -153,15 +155,21 @@ private static boolean equals(Predicate left, Predicate right) { if (left.op() == Operation.IN || left.op() == Operation.NOT_IN) { return equals(lpred.literals(), rpred.literals()); } - return lpred.literal().comparator() - .compare(lpred.literal().value(), rpred.literal().value()) == 0; + return lpred.literal().comparator().compare(lpred.literal().value(), rpred.literal().value()) + == 0; } else if (left instanceof BoundPredicate) { BoundPredicate lpred = (BoundPredicate) left; BoundPredicate rpred = (BoundPredicate) right; if (lpred.isLiteralPredicate() && rpred.isLiteralPredicate()) { - return lpred.asLiteralPredicate().literal().comparator() - .compare(lpred.asLiteralPredicate().literal().value(), rpred.asLiteralPredicate().literal().value()) == 0; + return lpred + .asLiteralPredicate() + .literal() + .comparator() + .compare( + lpred.asLiteralPredicate().literal().value(), + rpred.asLiteralPredicate().literal().value()) + == 0; } else if (lpred.isSetPredicate() && rpred.isSetPredicate()) { return equals(lpred.asSetPredicate().literalSet(), rpred.asSetPredicate().literalSet()); } else { @@ -169,8 +177,8 @@ private static boolean equals(Predicate left, Predicate right) { } } else { - throw new UnsupportedOperationException(String.format( - "Predicate equality check for %s is not supported", left.getClass())); + throw new UnsupportedOperationException( + String.format("Predicate equality check for %s is not supported", left.getClass())); } } @@ -199,8 +207,7 @@ private static boolean equals(Reference left, Reference right) { BoundReference lref = (BoundReference) left; BoundReference rref = (BoundReference) right; - return lref.fieldId() == rref.fieldId() && - lref.type().equals(rref.type()); + return lref.fieldId() == rref.fieldId() && lref.type().equals(rref.type()); } return false; diff --git a/api/src/test/java/org/apache/iceberg/expressions/TestExpressionUtil.java 
b/api/src/test/java/org/apache/iceberg/expressions/TestExpressionUtil.java index e35a73acf1e5..21b5f3ca6e43 100644 --- a/api/src/test/java/org/apache/iceberg/expressions/TestExpressionUtil.java +++ b/api/src/test/java/org/apache/iceberg/expressions/TestExpressionUtil.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.expressions; import org.apache.iceberg.PartitionSpec; @@ -27,23 +26,27 @@ import org.junit.Test; public class TestExpressionUtil { - private static final Schema SCHEMA = new Schema( - Types.NestedField.required(1, "id", Types.LongType.get()), - Types.NestedField.required(2, "val", Types.IntegerType.get()), - Types.NestedField.required(3, "val2", Types.IntegerType.get()), - Types.NestedField.required(4, "ts", Types.TimestampType.withoutZone()), - Types.NestedField.required(5, "date", Types.DateType.get()), - Types.NestedField.required(6, "time", Types.DateType.get()), - Types.NestedField.optional(7, "data", Types.StringType.get()), - Types.NestedField.optional(8, "measurement", Types.DoubleType.get())); + private static final Schema SCHEMA = + new Schema( + Types.NestedField.required(1, "id", Types.LongType.get()), + Types.NestedField.required(2, "val", Types.IntegerType.get()), + Types.NestedField.required(3, "val2", Types.IntegerType.get()), + Types.NestedField.required(4, "ts", Types.TimestampType.withoutZone()), + Types.NestedField.required(5, "date", Types.DateType.get()), + Types.NestedField.required(6, "time", Types.DateType.get()), + Types.NestedField.optional(7, "data", Types.StringType.get()), + Types.NestedField.optional(8, "measurement", Types.DoubleType.get())); private static final Types.StructType STRUCT = SCHEMA.asStruct(); @Test public void testUnchangedUnaryPredicates() { - for (Expression unary : Lists.newArrayList( - Expressions.isNull("test"), Expressions.notNull("test"), Expressions.isNaN("test"), Expressions.notNaN("test")) - ) { + for (Expression unary : + Lists.newArrayList( + Expressions.isNull("test"), + Expressions.notNull("test"), + Expressions.isNaN("test"), + Expressions.notNaN("test"))) { assertEquals(unary, ExpressionUtil.sanitize(unary)); } } @@ -54,7 +57,8 @@ public void testSanitizeIn() { Expressions.in("test", "(2-digit-int)", "(3-digit-int)"), ExpressionUtil.sanitize(Expressions.in("test", 34, 345))); - Assert.assertEquals("Sanitized string should be identical except for descriptive literal", + Assert.assertEquals( + "Sanitized string should be identical except for descriptive literal", "test IN ((2-digit-int), (3-digit-int))", ExpressionUtil.toSanitizedString(Expressions.in("test", 34, 345))); } @@ -65,7 +69,8 @@ public void testSanitizeNotIn() { Expressions.notIn("test", "(2-digit-int)", "(3-digit-int)"), ExpressionUtil.sanitize(Expressions.notIn("test", 34, 345))); - Assert.assertEquals("Sanitized string should be identical except for descriptive literal", + Assert.assertEquals( + "Sanitized string should be identical except for descriptive literal", "test NOT IN ((2-digit-int), (3-digit-int))", ExpressionUtil.toSanitizedString(Expressions.notIn("test", 34, 345))); } @@ -76,7 +81,8 @@ public void testSanitizeLessThan() { Expressions.lessThan("test", "(2-digit-int)"), ExpressionUtil.sanitize(Expressions.lessThan("test", 34))); - Assert.assertEquals("Sanitized string should be identical except for descriptive literal", + Assert.assertEquals( + "Sanitized string should be identical except for descriptive literal", "test < (2-digit-int)", 
ExpressionUtil.toSanitizedString(Expressions.lessThan("test", 34))); } @@ -87,7 +93,8 @@ public void testSanitizeLessThanOrEqual() { Expressions.lessThanOrEqual("test", "(2-digit-int)"), ExpressionUtil.sanitize(Expressions.lessThanOrEqual("test", 34))); - Assert.assertEquals("Sanitized string should be identical except for descriptive literal", + Assert.assertEquals( + "Sanitized string should be identical except for descriptive literal", "test <= (2-digit-int)", ExpressionUtil.toSanitizedString(Expressions.lessThanOrEqual("test", 34))); } @@ -98,7 +105,8 @@ public void testSanitizeGreaterThan() { Expressions.greaterThan("test", "(2-digit-int)"), ExpressionUtil.sanitize(Expressions.greaterThan("test", 34))); - Assert.assertEquals("Sanitized string should be identical except for descriptive literal", + Assert.assertEquals( + "Sanitized string should be identical except for descriptive literal", "test > (2-digit-int)", ExpressionUtil.toSanitizedString(Expressions.greaterThan("test", 34))); } @@ -109,7 +117,8 @@ public void testSanitizeGreaterThanOrEqual() { Expressions.greaterThanOrEqual("test", "(2-digit-int)"), ExpressionUtil.sanitize(Expressions.greaterThanOrEqual("test", 34))); - Assert.assertEquals("Sanitized string should be identical except for descriptive literal", + Assert.assertEquals( + "Sanitized string should be identical except for descriptive literal", "test >= (2-digit-int)", ExpressionUtil.toSanitizedString(Expressions.greaterThanOrEqual("test", 34))); } @@ -120,7 +129,8 @@ public void testSanitizeEqual() { Expressions.equal("test", "(2-digit-int)"), ExpressionUtil.sanitize(Expressions.equal("test", 34))); - Assert.assertEquals("Sanitized string should be identical except for descriptive literal", + Assert.assertEquals( + "Sanitized string should be identical except for descriptive literal", "test = (2-digit-int)", ExpressionUtil.toSanitizedString(Expressions.equal("test", 34))); } @@ -131,7 +141,8 @@ public void testSanitizeNotEqual() { Expressions.notEqual("test", "(2-digit-int)"), ExpressionUtil.sanitize(Expressions.notEqual("test", 34))); - Assert.assertEquals("Sanitized string should be identical except for descriptive literal", + Assert.assertEquals( + "Sanitized string should be identical except for descriptive literal", "test != (2-digit-int)", ExpressionUtil.toSanitizedString(Expressions.notEqual("test", 34))); } @@ -142,7 +153,8 @@ public void testSanitizeStartsWith() { Expressions.startsWith("test", "(hash-34d05fb7)"), ExpressionUtil.sanitize(Expressions.startsWith("test", "aaa"))); - Assert.assertEquals("Sanitized string should be identical except for descriptive literal", + Assert.assertEquals( + "Sanitized string should be identical except for descriptive literal", "test STARTS WITH (hash-34d05fb7)", ExpressionUtil.toSanitizedString(Expressions.startsWith("test", "aaa"))); } @@ -153,7 +165,8 @@ public void testSanitizeNotStartsWith() { Expressions.notStartsWith("test", "(hash-34d05fb7)"), ExpressionUtil.sanitize(Expressions.notStartsWith("test", "aaa"))); - Assert.assertEquals("Sanitized string should be identical except for descriptive literal", + Assert.assertEquals( + "Sanitized string should be identical except for descriptive literal", "test NOT STARTS WITH (hash-34d05fb7)", ExpressionUtil.toSanitizedString(Expressions.notStartsWith("test", "aaa"))); } @@ -164,7 +177,8 @@ public void testSanitizeTransformedTerm() { Expressions.equal(Expressions.truncate("test", 2), "(2-digit-int)"), ExpressionUtil.sanitize(Expressions.equal(Expressions.truncate("test", 2), 
34))); - Assert.assertEquals("Sanitized string should be identical except for descriptive literal", + Assert.assertEquals( + "Sanitized string should be identical except for descriptive literal", "truncate[2](test) = (2-digit-int)", ExpressionUtil.toSanitizedString(Expressions.equal(Expressions.truncate("test", 2), 34))); } @@ -175,7 +189,8 @@ public void testSanitizeLong() { Expressions.equal("test", "(2-digit-int)"), ExpressionUtil.sanitize(Expressions.equal("test", 34L))); - Assert.assertEquals("Sanitized string should be identical except for descriptive literal", + Assert.assertEquals( + "Sanitized string should be identical except for descriptive literal", "test = (2-digit-int)", ExpressionUtil.toSanitizedString(Expressions.equal("test", 34L))); } @@ -186,7 +201,8 @@ public void testSanitizeFloat() { Expressions.equal("test", "(2-digit-float)"), ExpressionUtil.sanitize(Expressions.equal("test", 34.12F))); - Assert.assertEquals("Sanitized string should be identical except for descriptive literal", + Assert.assertEquals( + "Sanitized string should be identical except for descriptive literal", "test = (2-digit-float)", ExpressionUtil.toSanitizedString(Expressions.equal("test", 34.12F))); } @@ -197,7 +213,8 @@ public void testSanitizeDouble() { Expressions.equal("test", "(2-digit-float)"), ExpressionUtil.sanitize(Expressions.equal("test", 34.12D))); - Assert.assertEquals("Sanitized string should be identical except for descriptive literal", + Assert.assertEquals( + "Sanitized string should be identical except for descriptive literal", "test = (2-digit-float)", ExpressionUtil.toSanitizedString(Expressions.equal("test", 34.12D))); } @@ -208,7 +225,8 @@ public void testSanitizeDate() { Expressions.equal("test", "(date)"), ExpressionUtil.sanitize(Expressions.equal("test", "2022-04-29"))); - Assert.assertEquals("Sanitized string should be identical except for descriptive literal", + Assert.assertEquals( + "Sanitized string should be identical except for descriptive literal", "test = (date)", ExpressionUtil.toSanitizedString(Expressions.equal("test", "2022-04-29"))); } @@ -219,20 +237,26 @@ public void testSanitizeTime() { Expressions.equal("test", "(time)"), ExpressionUtil.sanitize(Expressions.equal("test", "23:49:51"))); - Assert.assertEquals("Sanitized string should be identical except for descriptive literal", + Assert.assertEquals( + "Sanitized string should be identical except for descriptive literal", "test = (time)", ExpressionUtil.toSanitizedString(Expressions.equal("test", "23:49:51"))); } @Test public void testSanitizeTimestamp() { - for (String timestamp : Lists.newArrayList("2022-04-29T23:49:51", "2022-04-29T23:49:51.123456", - "2022-04-29T23:49:51-07:00", "2022-04-29T23:49:51.123456+01:00")) { + for (String timestamp : + Lists.newArrayList( + "2022-04-29T23:49:51", + "2022-04-29T23:49:51.123456", + "2022-04-29T23:49:51-07:00", + "2022-04-29T23:49:51.123456+01:00")) { assertEquals( Expressions.equal("test", "(timestamp)"), ExpressionUtil.sanitize(Expressions.equal("test", timestamp))); - Assert.assertEquals("Sanitized string should be identical except for descriptive literal", + Assert.assertEquals( + "Sanitized string should be identical except for descriptive literal", "test = (timestamp)", ExpressionUtil.toSanitizedString(Expressions.equal("test", timestamp))); } @@ -240,29 +264,31 @@ public void testSanitizeTimestamp() { @Test public void testIdenticalExpressionIsEquivalent() { - Expression[] exprs = new Expression[] { - Expressions.isNull("data"), - 
Expressions.notNull("data"), - Expressions.isNaN("measurement"), - Expressions.notNaN("measurement"), - Expressions.lessThan("id", 5), - Expressions.lessThanOrEqual("id", 5), - Expressions.greaterThan("id", 5), - Expressions.greaterThanOrEqual("id", 5), - Expressions.equal("id", 5), - Expressions.notEqual("id", 5), - Expressions.in("id", 5, 6), - Expressions.notIn("id", 5, 6), - Expressions.startsWith("data", "aaa"), - Expressions.notStartsWith("data", "aaa"), - Expressions.alwaysTrue(), - Expressions.alwaysFalse(), - Expressions.and(Expressions.lessThan("id", 5), Expressions.notNull("data")), - Expressions.or(Expressions.lessThan("id", 5), Expressions.notNull("data")), - }; + Expression[] exprs = + new Expression[] { + Expressions.isNull("data"), + Expressions.notNull("data"), + Expressions.isNaN("measurement"), + Expressions.notNaN("measurement"), + Expressions.lessThan("id", 5), + Expressions.lessThanOrEqual("id", 5), + Expressions.greaterThan("id", 5), + Expressions.greaterThanOrEqual("id", 5), + Expressions.equal("id", 5), + Expressions.notEqual("id", 5), + Expressions.in("id", 5, 6), + Expressions.notIn("id", 5, 6), + Expressions.startsWith("data", "aaa"), + Expressions.notStartsWith("data", "aaa"), + Expressions.alwaysTrue(), + Expressions.alwaysFalse(), + Expressions.and(Expressions.lessThan("id", 5), Expressions.notNull("data")), + Expressions.or(Expressions.lessThan("id", 5), Expressions.notNull("data")), + }; for (Expression expr : exprs) { - Assert.assertTrue("Should accept identical expression: " + expr, + Assert.assertTrue( + "Should accept identical expression: " + expr, ExpressionUtil.equivalent(expr, expr, STRUCT, true)); for (Expression other : exprs) { @@ -275,15 +301,16 @@ public void testIdenticalExpressionIsEquivalent() { @Test public void testIdenticalTermIsEquivalent() { - UnboundTerm[] terms = new UnboundTerm[] { - Expressions.ref("id"), - Expressions.truncate("id", 2), - Expressions.bucket("id", 16), - Expressions.year("ts"), - Expressions.month("ts"), - Expressions.day("ts"), - Expressions.hour("ts"), - }; + UnboundTerm[] terms = + new UnboundTerm[] { + Expressions.ref("id"), + Expressions.truncate("id", 2), + Expressions.bucket("id", 16), + Expressions.year("ts"), + Expressions.month("ts"), + Expressions.day("ts"), + Expressions.hour("ts"), + }; for (UnboundTerm term : terms) { BoundTerm bound = term.bind(STRUCT, true); @@ -299,34 +326,50 @@ public void testIdenticalTermIsEquivalent() { @Test public void testRefEquivalence() { - Assert.assertFalse("Should not find different refs equivalent", - Expressions.ref("val").bind(STRUCT, true).isEquivalentTo(Expressions.ref("val2").bind(STRUCT, true))); + Assert.assertFalse( + "Should not find different refs equivalent", + Expressions.ref("val") + .bind(STRUCT, true) + .isEquivalentTo(Expressions.ref("val2").bind(STRUCT, true))); } @Test public void testInEquivalence() { - Assert.assertTrue("Should ignore duplicate longs (in)", - ExpressionUtil.equivalent(Expressions.in("id", 1, 2, 1), Expressions.in("id", 2, 1, 2), STRUCT, true)); - Assert.assertTrue("Should ignore duplicate longs (notIn)", - ExpressionUtil.equivalent(Expressions.notIn("id", 1, 2, 1), Expressions.notIn("id", 2, 1, 2), STRUCT, true)); + Assert.assertTrue( + "Should ignore duplicate longs (in)", + ExpressionUtil.equivalent( + Expressions.in("id", 1, 2, 1), Expressions.in("id", 2, 1, 2), STRUCT, true)); + Assert.assertTrue( + "Should ignore duplicate longs (notIn)", + ExpressionUtil.equivalent( + Expressions.notIn("id", 1, 2, 1), Expressions.notIn("id", 2, 
1, 2), STRUCT, true)); - Assert.assertTrue("Should ignore duplicate strings (in)", + Assert.assertTrue( + "Should ignore duplicate strings (in)", + ExpressionUtil.equivalent( + Expressions.in("data", "a", "b", "a"), Expressions.in("data", "b", "a"), STRUCT, true)); + Assert.assertTrue( + "Should ignore duplicate strings (notIn)", ExpressionUtil.equivalent( - Expressions.in("data", "a", "b", "a"), - Expressions.in("data", "b", "a"), - STRUCT, true)); - Assert.assertTrue("Should ignore duplicate strings (notIn)", - ExpressionUtil.equivalent(Expressions.notIn("data", "b", "b"), Expressions.notIn("data", "b"), STRUCT, true)); + Expressions.notIn("data", "b", "b"), Expressions.notIn("data", "b"), STRUCT, true)); - Assert.assertTrue("Should detect equivalence with equal (in, string)", - ExpressionUtil.equivalent(Expressions.in("data", "a"), Expressions.equal("data", "a"), STRUCT, true)); - Assert.assertTrue("Should detect equivalence with notEqual (notIn, long)", - ExpressionUtil.equivalent(Expressions.notIn("id", 1), Expressions.notEqual("id", 1), STRUCT, true)); + Assert.assertTrue( + "Should detect equivalence with equal (in, string)", + ExpressionUtil.equivalent( + Expressions.in("data", "a"), Expressions.equal("data", "a"), STRUCT, true)); + Assert.assertTrue( + "Should detect equivalence with notEqual (notIn, long)", + ExpressionUtil.equivalent( + Expressions.notIn("id", 1), Expressions.notEqual("id", 1), STRUCT, true)); - Assert.assertFalse("Should detect different sets (in, long)", - ExpressionUtil.equivalent(Expressions.in("id", 1, 2, 3), Expressions.in("id", 1, 2), STRUCT, true)); - Assert.assertFalse("Should detect different sets (notIn, string)", - ExpressionUtil.equivalent(Expressions.notIn("data", "a", "b"), Expressions.notIn("data", "a"), STRUCT, true)); + Assert.assertFalse( + "Should detect different sets (in, long)", + ExpressionUtil.equivalent( + Expressions.in("id", 1, 2, 3), Expressions.in("id", 1, 2), STRUCT, true)); + Assert.assertFalse( + "Should detect different sets (notIn, string)", + ExpressionUtil.equivalent( + Expressions.notIn("data", "a", "b"), Expressions.notIn("data", "a"), STRUCT, true)); } @Test @@ -334,83 +377,107 @@ public void testInequalityEquivalence() { String[] cols = new String[] {"id", "val", "ts", "date", "time"}; for (String col : cols) { - Assert.assertTrue("Should detect < to <= equivalence: " + col, + Assert.assertTrue( + "Should detect < to <= equivalence: " + col, ExpressionUtil.equivalent( - Expressions.lessThan(col, 34L), - Expressions.lessThanOrEqual(col, 33L), - STRUCT, true)); - Assert.assertTrue("Should detect <= to < equivalence: " + col, + Expressions.lessThan(col, 34L), Expressions.lessThanOrEqual(col, 33L), STRUCT, true)); + Assert.assertTrue( + "Should detect <= to < equivalence: " + col, ExpressionUtil.equivalent( - Expressions.lessThanOrEqual(col, 34L), - Expressions.lessThan(col, 35L), - STRUCT, true)); - Assert.assertTrue("Should detect > to >= equivalence: " + col, + Expressions.lessThanOrEqual(col, 34L), Expressions.lessThan(col, 35L), STRUCT, true)); + Assert.assertTrue( + "Should detect > to >= equivalence: " + col, ExpressionUtil.equivalent( Expressions.greaterThan(col, 34L), Expressions.greaterThanOrEqual(col, 35L), - STRUCT, true)); - Assert.assertTrue("Should detect >= to > equivalence: " + col, + STRUCT, + true)); + Assert.assertTrue( + "Should detect >= to > equivalence: " + col, ExpressionUtil.equivalent( Expressions.greaterThanOrEqual(col, 34L), Expressions.greaterThan(col, 33L), - STRUCT, true)); + STRUCT, + true)); 
} - Assert.assertFalse("Should not detect equivalence for different columns", + Assert.assertFalse( + "Should not detect equivalence for different columns", ExpressionUtil.equivalent( Expressions.lessThan("val", 34L), Expressions.lessThanOrEqual("val2", 33L), - STRUCT, true)); - Assert.assertFalse("Should not detect equivalence for different types", + STRUCT, + true)); + Assert.assertFalse( + "Should not detect equivalence for different types", ExpressionUtil.equivalent( Expressions.lessThan("val", 34L), Expressions.lessThanOrEqual("id", 33L), - STRUCT, true)); + STRUCT, + true)); } @Test public void testAndEquivalence() { - Assert.assertTrue("Should detect and equivalence in any order", + Assert.assertTrue( + "Should detect and equivalence in any order", ExpressionUtil.equivalent( - Expressions.and(Expressions.lessThan("id", 34), Expressions.greaterThanOrEqual("id", 20)), - Expressions.and(Expressions.greaterThan("id", 19L), Expressions.lessThanOrEqual("id", 33L)), - STRUCT, true)); + Expressions.and( + Expressions.lessThan("id", 34), Expressions.greaterThanOrEqual("id", 20)), + Expressions.and( + Expressions.greaterThan("id", 19L), Expressions.lessThanOrEqual("id", 33L)), + STRUCT, + true)); } @Test public void testOrEquivalence() { - Assert.assertTrue("Should detect or equivalence in any order", + Assert.assertTrue( + "Should detect or equivalence in any order", ExpressionUtil.equivalent( - Expressions.or(Expressions.lessThan("id", 20), Expressions.greaterThanOrEqual("id", 34)), - Expressions.or(Expressions.greaterThan("id", 33L), Expressions.lessThanOrEqual("id", 19L)), - STRUCT, true)); + Expressions.or( + Expressions.lessThan("id", 20), Expressions.greaterThanOrEqual("id", 34)), + Expressions.or( + Expressions.greaterThan("id", 33L), Expressions.lessThanOrEqual("id", 19L)), + STRUCT, + true)); } @Test public void testNotEquivalence() { - Assert.assertTrue("Should detect not equivalence by rewriting", + Assert.assertTrue( + "Should detect not equivalence by rewriting", ExpressionUtil.equivalent( - Expressions.not(Expressions.or(Expressions.in("data", "a"), Expressions.greaterThanOrEqual("id", 34))), + Expressions.not( + Expressions.or( + Expressions.in("data", "a"), Expressions.greaterThanOrEqual("id", 34))), Expressions.and(Expressions.lessThan("id", 34L), Expressions.notEqual("data", "a")), - STRUCT, true)); + STRUCT, + true)); } @Test public void testSelectsPartitions() { - Assert.assertTrue("Should select partitions, on boundary", + Assert.assertTrue( + "Should select partitions, on boundary", ExpressionUtil.selectsPartitions( Expressions.lessThan("ts", "2021-03-09T10:00:00.000000"), - PartitionSpec.builderFor(SCHEMA).hour("ts").build(), true)); + PartitionSpec.builderFor(SCHEMA).hour("ts").build(), + true)); - Assert.assertFalse("Should not select partitions, 1 ms off boundary", + Assert.assertFalse( + "Should not select partitions, 1 ms off boundary", ExpressionUtil.selectsPartitions( Expressions.lessThanOrEqual("ts", "2021-03-09T10:00:00.000000"), - PartitionSpec.builderFor(SCHEMA).hour("ts").build(), true)); + PartitionSpec.builderFor(SCHEMA).hour("ts").build(), + true)); - Assert.assertFalse("Should not select partitions, on hour not day boundary", + Assert.assertFalse( + "Should not select partitions, on hour not day boundary", ExpressionUtil.selectsPartitions( Expressions.lessThan("ts", "2021-03-09T10:00:00.000000"), - PartitionSpec.builderFor(SCHEMA).day("ts").build(), true)); + PartitionSpec.builderFor(SCHEMA).day("ts").build(), + true)); } private void 
assertEquals(Expression expected, Expression actual) { @@ -445,8 +512,10 @@ private void assertEquals(NamedReference expected, NamedReference actual) } private void assertEquals(UnboundTransform expected, UnboundTransform actual) { - Assert.assertEquals("Should apply the same transform", - expected.transform().toString(), actual.transform().toString()); + Assert.assertEquals( + "Should apply the same transform", + expected.transform().toString(), + actual.transform().toString()); assertEquals(expected.ref(), actual.ref()); } } diff --git a/api/src/test/java/org/apache/iceberg/expressions/TestInclusiveManifestEvaluator.java b/api/src/test/java/org/apache/iceberg/expressions/TestInclusiveManifestEvaluator.java index de60e7853a0d..8dc9221001e1 100644 --- a/api/src/test/java/org/apache/iceberg/expressions/TestInclusiveManifestEvaluator.java +++ b/api/src/test/java/org/apache/iceberg/expressions/TestInclusiveManifestEvaluator.java @@ -16,21 +16,8 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.expressions; -import java.nio.ByteBuffer; -import org.apache.iceberg.AssertHelpers; -import org.apache.iceberg.ManifestFile; -import org.apache.iceberg.PartitionSpec; -import org.apache.iceberg.Schema; -import org.apache.iceberg.TestHelpers; -import org.apache.iceberg.exceptions.ValidationException; -import org.apache.iceberg.relocated.com.google.common.collect.ImmutableList; -import org.apache.iceberg.types.Types; -import org.junit.Assert; -import org.junit.Test; - import static org.apache.iceberg.expressions.Expressions.and; import static org.apache.iceberg.expressions.Expressions.equal; import static org.apache.iceberg.expressions.Expressions.greaterThan; @@ -52,39 +39,52 @@ import static org.apache.iceberg.types.Types.NestedField.optional; import static org.apache.iceberg.types.Types.NestedField.required; +import java.nio.ByteBuffer; +import org.apache.iceberg.AssertHelpers; +import org.apache.iceberg.ManifestFile; +import org.apache.iceberg.PartitionSpec; +import org.apache.iceberg.Schema; +import org.apache.iceberg.TestHelpers; +import org.apache.iceberg.exceptions.ValidationException; +import org.apache.iceberg.relocated.com.google.common.collect.ImmutableList; +import org.apache.iceberg.types.Types; +import org.junit.Assert; +import org.junit.Test; + public class TestInclusiveManifestEvaluator { - private static final Schema SCHEMA = new Schema( - required(1, "id", Types.IntegerType.get()), - optional(4, "all_nulls_missing_nan", Types.StringType.get()), - optional(5, "some_nulls", Types.StringType.get()), - optional(6, "no_nulls", Types.StringType.get()), - optional(7, "float", Types.FloatType.get()), - optional(8, "all_nulls_double", Types.DoubleType.get()), - optional(9, "all_nulls_no_nans", Types.FloatType.get()), - optional(10, "all_nans", Types.DoubleType.get()), - optional(11, "both_nan_and_null", Types.FloatType.get()), - optional(12, "no_nan_or_null", Types.DoubleType.get()), - optional(13, "all_nulls_missing_nan_float", Types.FloatType.get()), - optional(14, "all_same_value_or_null", Types.StringType.get()), - optional(15, "no_nulls_same_value_a", Types.StringType.get()) - ); - - private static final PartitionSpec SPEC = PartitionSpec.builderFor(SCHEMA) - .withSpecId(0) - .identity("id") - .identity("all_nulls_missing_nan") - .identity("some_nulls") - .identity("no_nulls") - .identity("float") - .identity("all_nulls_double") - .identity("all_nulls_no_nans") - .identity("all_nans") - .identity("both_nan_and_null") - 
.identity("no_nan_or_null") - .identity("all_nulls_missing_nan_float") - .identity("all_same_value_or_null") - .identity("no_nulls_same_value_a") - .build(); + private static final Schema SCHEMA = + new Schema( + required(1, "id", Types.IntegerType.get()), + optional(4, "all_nulls_missing_nan", Types.StringType.get()), + optional(5, "some_nulls", Types.StringType.get()), + optional(6, "no_nulls", Types.StringType.get()), + optional(7, "float", Types.FloatType.get()), + optional(8, "all_nulls_double", Types.DoubleType.get()), + optional(9, "all_nulls_no_nans", Types.FloatType.get()), + optional(10, "all_nans", Types.DoubleType.get()), + optional(11, "both_nan_and_null", Types.FloatType.get()), + optional(12, "no_nan_or_null", Types.DoubleType.get()), + optional(13, "all_nulls_missing_nan_float", Types.FloatType.get()), + optional(14, "all_same_value_or_null", Types.StringType.get()), + optional(15, "no_nulls_same_value_a", Types.StringType.get())); + + private static final PartitionSpec SPEC = + PartitionSpec.builderFor(SCHEMA) + .withSpecId(0) + .identity("id") + .identity("all_nulls_missing_nan") + .identity("some_nulls") + .identity("no_nulls") + .identity("float") + .identity("all_nulls_double") + .identity("all_nulls_no_nans") + .identity("all_nans") + .identity("both_nan_and_null") + .identity("no_nan_or_null") + .identity("all_nulls_missing_nan_float") + .identity("all_same_value_or_null") + .identity("no_nulls_same_value_a") + .build(); private static final int INT_MIN_VALUE = 30; private static final int INT_MAX_VALUE = 79; @@ -95,37 +95,54 @@ public class TestInclusiveManifestEvaluator { private static final ByteBuffer STRING_MIN = toByteBuffer(Types.StringType.get(), "a"); private static final ByteBuffer STRING_MAX = toByteBuffer(Types.StringType.get(), "z"); - private static final ManifestFile NO_STATS = new TestHelpers.TestManifestFile( - "manifest-list.avro", 1024, 0, System.currentTimeMillis(), null, null, null, null, null); - - private static final ManifestFile FILE = new TestHelpers.TestManifestFile("manifest-list.avro", - 1024, 0, System.currentTimeMillis(), 5, 10, 0, ImmutableList.of( - new TestHelpers.TestFieldSummary(false, INT_MIN, INT_MAX), - new TestHelpers.TestFieldSummary(true, null, null), - new TestHelpers.TestFieldSummary(true, STRING_MIN, STRING_MAX), - new TestHelpers.TestFieldSummary(false, STRING_MIN, STRING_MAX), - new TestHelpers.TestFieldSummary(false, - toByteBuffer(Types.FloatType.get(), 0F), - toByteBuffer(Types.FloatType.get(), 20F)), - new TestHelpers.TestFieldSummary(true, null, null), - new TestHelpers.TestFieldSummary(true, false, null, null), - new TestHelpers.TestFieldSummary(false, true, null, null), - new TestHelpers.TestFieldSummary(true, true, null, null), - new TestHelpers.TestFieldSummary(false, false, - toByteBuffer(Types.FloatType.get(), 0F), - toByteBuffer(Types.FloatType.get(), 20F)), - new TestHelpers.TestFieldSummary(true, null, null), - new TestHelpers.TestFieldSummary(true, STRING_MIN, STRING_MIN), - new TestHelpers.TestFieldSummary(false, STRING_MIN, STRING_MIN) - ), null); + private static final ManifestFile NO_STATS = + new TestHelpers.TestManifestFile( + "manifest-list.avro", 1024, 0, System.currentTimeMillis(), null, null, null, null, null); + + private static final ManifestFile FILE = + new TestHelpers.TestManifestFile( + "manifest-list.avro", + 1024, + 0, + System.currentTimeMillis(), + 5, + 10, + 0, + ImmutableList.of( + new TestHelpers.TestFieldSummary(false, INT_MIN, INT_MAX), + new TestHelpers.TestFieldSummary(true, 
null, null), + new TestHelpers.TestFieldSummary(true, STRING_MIN, STRING_MAX), + new TestHelpers.TestFieldSummary(false, STRING_MIN, STRING_MAX), + new TestHelpers.TestFieldSummary( + false, + toByteBuffer(Types.FloatType.get(), 0F), + toByteBuffer(Types.FloatType.get(), 20F)), + new TestHelpers.TestFieldSummary(true, null, null), + new TestHelpers.TestFieldSummary(true, false, null, null), + new TestHelpers.TestFieldSummary(false, true, null, null), + new TestHelpers.TestFieldSummary(true, true, null, null), + new TestHelpers.TestFieldSummary( + false, + false, + toByteBuffer(Types.FloatType.get(), 0F), + toByteBuffer(Types.FloatType.get(), 20F)), + new TestHelpers.TestFieldSummary(true, null, null), + new TestHelpers.TestFieldSummary(true, STRING_MIN, STRING_MIN), + new TestHelpers.TestFieldSummary(false, STRING_MIN, STRING_MIN)), + null); @Test public void testAllNulls() { - boolean shouldRead = ManifestEvaluator.forRowFilter(notNull("all_nulls_missing_nan"), SPEC, true).eval(FILE); - Assert.assertFalse("Should skip: all nulls column with non-floating type contains all null", shouldRead); + boolean shouldRead = + ManifestEvaluator.forRowFilter(notNull("all_nulls_missing_nan"), SPEC, true).eval(FILE); + Assert.assertFalse( + "Should skip: all nulls column with non-floating type contains all null", shouldRead); - shouldRead = ManifestEvaluator.forRowFilter(notNull("all_nulls_missing_nan_float"), SPEC, true).eval(FILE); - Assert.assertTrue("Should read: no NaN information may indicate presence of NaN value", shouldRead); + shouldRead = + ManifestEvaluator.forRowFilter(notNull("all_nulls_missing_nan_float"), SPEC, true) + .eval(FILE); + Assert.assertTrue( + "Should read: no NaN information may indicate presence of NaN value", shouldRead); shouldRead = ManifestEvaluator.forRowFilter(notNull("some_nulls"), SPEC, true).eval(FILE); Assert.assertTrue("Should read: column with some nulls contains a non-null value", shouldRead); @@ -133,16 +150,21 @@ public void testAllNulls() { shouldRead = ManifestEvaluator.forRowFilter(notNull("no_nulls"), SPEC, true).eval(FILE); Assert.assertTrue("Should read: non-null column contains a non-null value", shouldRead); - shouldRead = ManifestEvaluator.forRowFilter(startsWith("all_nulls_missing_nan", "asad"), SPEC, true).eval(FILE); + shouldRead = + ManifestEvaluator.forRowFilter(startsWith("all_nulls_missing_nan", "asad"), SPEC, true) + .eval(FILE); Assert.assertFalse("Should skip: startsWith on all null column", shouldRead); - shouldRead = ManifestEvaluator.forRowFilter(notStartsWith("all_nulls_missing_nan", "asad"), SPEC, true).eval(FILE); + shouldRead = + ManifestEvaluator.forRowFilter(notStartsWith("all_nulls_missing_nan", "asad"), SPEC, true) + .eval(FILE); Assert.assertTrue("Should read: notStartsWith on all null column", shouldRead); } @Test public void testNoNulls() { - boolean shouldRead = ManifestEvaluator.forRowFilter(isNull("all_nulls_missing_nan"), SPEC, true).eval(FILE); + boolean shouldRead = + ManifestEvaluator.forRowFilter(isNull("all_nulls_missing_nan"), SPEC, true).eval(FILE); Assert.assertTrue("Should read: at least one null value in all null column", shouldRead); shouldRead = ManifestEvaluator.forRowFilter(isNull("some_nulls"), SPEC, true).eval(FILE); @@ -158,13 +180,17 @@ public void testNoNulls() { @Test public void testIsNaN() { boolean shouldRead = ManifestEvaluator.forRowFilter(isNaN("float"), SPEC, true).eval(FILE); - Assert.assertTrue("Should read: no information on if there are nan value in float column", shouldRead); + 
Assert.assertTrue( + "Should read: no information on if there are nan value in float column", shouldRead); shouldRead = ManifestEvaluator.forRowFilter(isNaN("all_nulls_double"), SPEC, true).eval(FILE); - Assert.assertTrue("Should read: no NaN information may indicate presence of NaN value", shouldRead); + Assert.assertTrue( + "Should read: no NaN information may indicate presence of NaN value", shouldRead); - shouldRead = ManifestEvaluator.forRowFilter(isNaN("all_nulls_missing_nan_float"), SPEC, true).eval(FILE); - Assert.assertTrue("Should read: no NaN information may indicate presence of NaN value", shouldRead); + shouldRead = + ManifestEvaluator.forRowFilter(isNaN("all_nulls_missing_nan_float"), SPEC, true).eval(FILE); + Assert.assertTrue( + "Should read: no NaN information may indicate presence of NaN value", shouldRead); shouldRead = ManifestEvaluator.forRowFilter(isNaN("all_nulls_no_nans"), SPEC, true).eval(FILE); Assert.assertFalse("Should skip: no nan column doesn't contain nan value", shouldRead); @@ -182,7 +208,8 @@ public void testIsNaN() { @Test public void testNotNaN() { boolean shouldRead = ManifestEvaluator.forRowFilter(notNaN("float"), SPEC, true).eval(FILE); - Assert.assertTrue("Should read: no information on if there are nan value in float column", shouldRead); + Assert.assertTrue( + "Should read: no information on if there are nan value in float column", shouldRead); shouldRead = ManifestEvaluator.forRowFilter(notNaN("all_nulls_double"), SPEC, true).eval(FILE); Assert.assertTrue("Should read: all null column contains non nan value", shouldRead); @@ -194,7 +221,8 @@ public void testNotNaN() { Assert.assertFalse("Should skip: all nans column doesn't contain non nan value", shouldRead); shouldRead = ManifestEvaluator.forRowFilter(notNaN("both_nan_and_null"), SPEC, true).eval(FILE); - Assert.assertTrue("Should read: both_nan_and_null nans column contains non nan value", shouldRead); + Assert.assertTrue( + "Should read: both_nan_and_null nans column contains non nan value", shouldRead); shouldRead = ManifestEvaluator.forRowFilter(notNaN("no_nan_or_null"), SPEC, true).eval(FILE); Assert.assertTrue("Should read: no_nan_or_null column contains non nan value", shouldRead); @@ -202,19 +230,22 @@ public void testNotNaN() { @Test public void testMissingColumn() { - AssertHelpers.assertThrows("Should complain about missing column in expression", - ValidationException.class, "Cannot find field 'missing'", + AssertHelpers.assertThrows( + "Should complain about missing column in expression", + ValidationException.class, + "Cannot find field 'missing'", () -> ManifestEvaluator.forRowFilter(lessThan("missing", 5), SPEC, true).eval(FILE)); } @Test public void testMissingStats() { - Expression[] exprs = new Expression[] { - lessThan("id", 5), lessThanOrEqual("id", 30), equal("id", 70), - greaterThan("id", 78), greaterThanOrEqual("id", 90), notEqual("id", 101), - isNull("id"), notNull("id"), startsWith("all_nulls_missing_nan", "a"), - isNaN("float"), notNaN("float"), notStartsWith("all_nulls_missing_nan", "a") - }; + Expression[] exprs = + new Expression[] { + lessThan("id", 5), lessThanOrEqual("id", 30), equal("id", 70), + greaterThan("id", 78), greaterThanOrEqual("id", 90), notEqual("id", 101), + isNull("id"), notNull("id"), startsWith("all_nulls_missing_nan", "a"), + isNaN("float"), notNaN("float"), notStartsWith("all_nulls_missing_nan", "a") + }; for (Expression expr : exprs) { boolean shouldRead = ManifestEvaluator.forRowFilter(expr, SPEC, true).eval(NO_STATS); @@ -225,354 +256,482 @@ 
public void testMissingStats() { @Test public void testNot() { // this test case must use a real predicate, not alwaysTrue(), or binding will simplify it out - boolean shouldRead = ManifestEvaluator.forRowFilter(not(lessThan("id", INT_MIN_VALUE - 25)), SPEC, true).eval(FILE); + boolean shouldRead = + ManifestEvaluator.forRowFilter(not(lessThan("id", INT_MIN_VALUE - 25)), SPEC, true) + .eval(FILE); Assert.assertTrue("Should read: not(false)", shouldRead); - shouldRead = ManifestEvaluator.forRowFilter(not(greaterThan("id", INT_MIN_VALUE - 25)), SPEC, true).eval(FILE); + shouldRead = + ManifestEvaluator.forRowFilter(not(greaterThan("id", INT_MIN_VALUE - 25)), SPEC, true) + .eval(FILE); Assert.assertFalse("Should skip: not(true)", shouldRead); } @Test public void testAnd() { // this test case must use a real predicate, not alwaysTrue(), or binding will simplify it out - boolean shouldRead = ManifestEvaluator.forRowFilter( - and(lessThan("id", INT_MIN_VALUE - 25), greaterThanOrEqual("id", INT_MIN_VALUE - 30)), SPEC, true).eval(FILE); + boolean shouldRead = + ManifestEvaluator.forRowFilter( + and( + lessThan("id", INT_MIN_VALUE - 25), + greaterThanOrEqual("id", INT_MIN_VALUE - 30)), + SPEC, + true) + .eval(FILE); Assert.assertFalse("Should skip: and(false, true)", shouldRead); - shouldRead = ManifestEvaluator.forRowFilter( - and(lessThan("id", INT_MIN_VALUE - 25), greaterThanOrEqual("id", INT_MAX_VALUE + 1)), SPEC, true).eval(FILE); + shouldRead = + ManifestEvaluator.forRowFilter( + and( + lessThan("id", INT_MIN_VALUE - 25), + greaterThanOrEqual("id", INT_MAX_VALUE + 1)), + SPEC, + true) + .eval(FILE); Assert.assertFalse("Should skip: and(false, false)", shouldRead); - shouldRead = ManifestEvaluator.forRowFilter( - and(greaterThan("id", INT_MIN_VALUE - 25), lessThanOrEqual("id", INT_MIN_VALUE)), SPEC, true).eval(FILE); + shouldRead = + ManifestEvaluator.forRowFilter( + and(greaterThan("id", INT_MIN_VALUE - 25), lessThanOrEqual("id", INT_MIN_VALUE)), + SPEC, + true) + .eval(FILE); Assert.assertTrue("Should read: and(true, true)", shouldRead); } @Test public void testOr() { // this test case must use a real predicate, not alwaysTrue(), or binding will simplify it out - boolean shouldRead = ManifestEvaluator.forRowFilter( - or(lessThan("id", INT_MIN_VALUE - 25), greaterThanOrEqual("id", INT_MAX_VALUE + 1)), SPEC, true).eval(FILE); + boolean shouldRead = + ManifestEvaluator.forRowFilter( + or(lessThan("id", INT_MIN_VALUE - 25), greaterThanOrEqual("id", INT_MAX_VALUE + 1)), + SPEC, + true) + .eval(FILE); Assert.assertFalse("Should skip: or(false, false)", shouldRead); - shouldRead = ManifestEvaluator.forRowFilter( - or(lessThan("id", INT_MIN_VALUE - 25), greaterThanOrEqual("id", INT_MAX_VALUE - 19)), SPEC, true).eval(FILE); + shouldRead = + ManifestEvaluator.forRowFilter( + or( + lessThan("id", INT_MIN_VALUE - 25), + greaterThanOrEqual("id", INT_MAX_VALUE - 19)), + SPEC, + true) + .eval(FILE); Assert.assertTrue("Should read: or(false, true)", shouldRead); } @Test public void testIntegerLt() { - boolean shouldRead = ManifestEvaluator.forRowFilter(lessThan("id", INT_MIN_VALUE - 25), SPEC, true).eval(FILE); + boolean shouldRead = + ManifestEvaluator.forRowFilter(lessThan("id", INT_MIN_VALUE - 25), SPEC, true).eval(FILE); Assert.assertFalse("Should not read: id range below lower bound (5 < 30)", shouldRead); - shouldRead = ManifestEvaluator.forRowFilter(lessThan("id", INT_MIN_VALUE), SPEC, true).eval(FILE); + shouldRead = + ManifestEvaluator.forRowFilter(lessThan("id", INT_MIN_VALUE), SPEC, 
true).eval(FILE); Assert.assertFalse("Should not read: id range below lower bound (30 is not < 30)", shouldRead); - shouldRead = ManifestEvaluator.forRowFilter(lessThan("id", INT_MIN_VALUE + 1), SPEC, true).eval(FILE); + shouldRead = + ManifestEvaluator.forRowFilter(lessThan("id", INT_MIN_VALUE + 1), SPEC, true).eval(FILE); Assert.assertTrue("Should read: one possible id", shouldRead); - shouldRead = ManifestEvaluator.forRowFilter(lessThan("id", INT_MAX_VALUE), SPEC, true).eval(FILE); + shouldRead = + ManifestEvaluator.forRowFilter(lessThan("id", INT_MAX_VALUE), SPEC, true).eval(FILE); Assert.assertTrue("Should read: may possible ids", shouldRead); } @Test public void testIntegerLtEq() { - boolean shouldRead = ManifestEvaluator - .forRowFilter(lessThanOrEqual("id", INT_MIN_VALUE - 25), SPEC, true).eval(FILE); + boolean shouldRead = + ManifestEvaluator.forRowFilter(lessThanOrEqual("id", INT_MIN_VALUE - 25), SPEC, true) + .eval(FILE); Assert.assertFalse("Should not read: id range below lower bound (5 < 30)", shouldRead); - shouldRead = ManifestEvaluator.forRowFilter(lessThanOrEqual("id", INT_MIN_VALUE - 1), SPEC, true).eval(FILE); + shouldRead = + ManifestEvaluator.forRowFilter(lessThanOrEqual("id", INT_MIN_VALUE - 1), SPEC, true) + .eval(FILE); Assert.assertFalse("Should not read: id range below lower bound (29 < 30)", shouldRead); - shouldRead = ManifestEvaluator.forRowFilter(lessThanOrEqual("id", INT_MIN_VALUE), SPEC, true).eval(FILE); + shouldRead = + ManifestEvaluator.forRowFilter(lessThanOrEqual("id", INT_MIN_VALUE), SPEC, true).eval(FILE); Assert.assertTrue("Should read: one possible id", shouldRead); - shouldRead = ManifestEvaluator.forRowFilter(lessThanOrEqual("id", INT_MAX_VALUE), SPEC, true).eval(FILE); + shouldRead = + ManifestEvaluator.forRowFilter(lessThanOrEqual("id", INT_MAX_VALUE), SPEC, true).eval(FILE); Assert.assertTrue("Should read: many possible ids", shouldRead); } @Test public void testIntegerGt() { - boolean shouldRead = ManifestEvaluator.forRowFilter(greaterThan("id", INT_MAX_VALUE + 6), SPEC, true).eval(FILE); + boolean shouldRead = + ManifestEvaluator.forRowFilter(greaterThan("id", INT_MAX_VALUE + 6), SPEC, true).eval(FILE); Assert.assertFalse("Should not read: id range above upper bound (85 < 79)", shouldRead); - shouldRead = ManifestEvaluator.forRowFilter(greaterThan("id", INT_MAX_VALUE), SPEC, true).eval(FILE); + shouldRead = + ManifestEvaluator.forRowFilter(greaterThan("id", INT_MAX_VALUE), SPEC, true).eval(FILE); Assert.assertFalse("Should not read: id range above upper bound (79 is not > 79)", shouldRead); - shouldRead = ManifestEvaluator.forRowFilter(greaterThan("id", INT_MAX_VALUE - 1), SPEC, true).eval(FILE); + shouldRead = + ManifestEvaluator.forRowFilter(greaterThan("id", INT_MAX_VALUE - 1), SPEC, true).eval(FILE); Assert.assertTrue("Should read: one possible id", shouldRead); - shouldRead = ManifestEvaluator.forRowFilter(greaterThan("id", INT_MAX_VALUE - 4), SPEC, true).eval(FILE); + shouldRead = + ManifestEvaluator.forRowFilter(greaterThan("id", INT_MAX_VALUE - 4), SPEC, true).eval(FILE); Assert.assertTrue("Should read: may possible ids", shouldRead); } @Test public void testIntegerGtEq() { - boolean shouldRead = ManifestEvaluator - .forRowFilter(greaterThanOrEqual("id", INT_MAX_VALUE + 6), SPEC, true).eval(FILE); + boolean shouldRead = + ManifestEvaluator.forRowFilter(greaterThanOrEqual("id", INT_MAX_VALUE + 6), SPEC, true) + .eval(FILE); Assert.assertFalse("Should not read: id range above upper bound (85 < 79)", shouldRead); - shouldRead = 
ManifestEvaluator.forRowFilter(greaterThanOrEqual("id", INT_MAX_VALUE + 1), SPEC, true).eval(FILE); + shouldRead = + ManifestEvaluator.forRowFilter(greaterThanOrEqual("id", INT_MAX_VALUE + 1), SPEC, true) + .eval(FILE); Assert.assertFalse("Should not read: id range above upper bound (80 > 79)", shouldRead); - shouldRead = ManifestEvaluator.forRowFilter(greaterThanOrEqual("id", INT_MAX_VALUE), SPEC, true).eval(FILE); + shouldRead = + ManifestEvaluator.forRowFilter(greaterThanOrEqual("id", INT_MAX_VALUE), SPEC, true) + .eval(FILE); Assert.assertTrue("Should read: one possible id", shouldRead); - shouldRead = ManifestEvaluator.forRowFilter(greaterThanOrEqual("id", INT_MAX_VALUE - 4), SPEC, true).eval(FILE); + shouldRead = + ManifestEvaluator.forRowFilter(greaterThanOrEqual("id", INT_MAX_VALUE - 4), SPEC, true) + .eval(FILE); Assert.assertTrue("Should read: may possible ids", shouldRead); } @Test public void testIntegerEq() { - boolean shouldRead = ManifestEvaluator.forRowFilter(equal("id", INT_MIN_VALUE - 25), SPEC, true).eval(FILE); + boolean shouldRead = + ManifestEvaluator.forRowFilter(equal("id", INT_MIN_VALUE - 25), SPEC, true).eval(FILE); Assert.assertFalse("Should not read: id below lower bound", shouldRead); - shouldRead = ManifestEvaluator.forRowFilter(equal("id", INT_MIN_VALUE - 1), SPEC, true).eval(FILE); + shouldRead = + ManifestEvaluator.forRowFilter(equal("id", INT_MIN_VALUE - 1), SPEC, true).eval(FILE); Assert.assertFalse("Should not read: id below lower bound", shouldRead); shouldRead = ManifestEvaluator.forRowFilter(equal("id", INT_MIN_VALUE), SPEC, true).eval(FILE); Assert.assertTrue("Should read: id equal to lower bound", shouldRead); - shouldRead = ManifestEvaluator.forRowFilter(equal("id", INT_MAX_VALUE - 4), SPEC, true).eval(FILE); + shouldRead = + ManifestEvaluator.forRowFilter(equal("id", INT_MAX_VALUE - 4), SPEC, true).eval(FILE); Assert.assertTrue("Should read: id between lower and upper bounds", shouldRead); shouldRead = ManifestEvaluator.forRowFilter(equal("id", INT_MAX_VALUE), SPEC, true).eval(FILE); Assert.assertTrue("Should read: id equal to upper bound", shouldRead); - shouldRead = ManifestEvaluator.forRowFilter(equal("id", INT_MAX_VALUE + 1), SPEC, true).eval(FILE); + shouldRead = + ManifestEvaluator.forRowFilter(equal("id", INT_MAX_VALUE + 1), SPEC, true).eval(FILE); Assert.assertFalse("Should not read: id above upper bound", shouldRead); - shouldRead = ManifestEvaluator.forRowFilter(equal("id", INT_MAX_VALUE + 6), SPEC, true).eval(FILE); + shouldRead = + ManifestEvaluator.forRowFilter(equal("id", INT_MAX_VALUE + 6), SPEC, true).eval(FILE); Assert.assertFalse("Should not read: id above upper bound", shouldRead); } @Test public void testIntegerNotEq() { - boolean shouldRead = ManifestEvaluator.forRowFilter(notEqual("id", INT_MIN_VALUE - 25), SPEC, true).eval(FILE); + boolean shouldRead = + ManifestEvaluator.forRowFilter(notEqual("id", INT_MIN_VALUE - 25), SPEC, true).eval(FILE); Assert.assertTrue("Should read: id below lower bound", shouldRead); - shouldRead = ManifestEvaluator.forRowFilter(notEqual("id", INT_MIN_VALUE - 1), SPEC, true).eval(FILE); + shouldRead = + ManifestEvaluator.forRowFilter(notEqual("id", INT_MIN_VALUE - 1), SPEC, true).eval(FILE); Assert.assertTrue("Should read: id below lower bound", shouldRead); - shouldRead = ManifestEvaluator.forRowFilter(notEqual("id", INT_MIN_VALUE), SPEC, true).eval(FILE); + shouldRead = + ManifestEvaluator.forRowFilter(notEqual("id", INT_MIN_VALUE), SPEC, true).eval(FILE); Assert.assertTrue("Should read: id equal
to lower bound", shouldRead); - shouldRead = ManifestEvaluator.forRowFilter(notEqual("id", INT_MAX_VALUE - 4), SPEC, true).eval(FILE); + shouldRead = + ManifestEvaluator.forRowFilter(notEqual("id", INT_MAX_VALUE - 4), SPEC, true).eval(FILE); Assert.assertTrue("Should read: id between lower and upper bounds", shouldRead); - shouldRead = ManifestEvaluator.forRowFilter(notEqual("id", INT_MAX_VALUE), SPEC, true).eval(FILE); + shouldRead = + ManifestEvaluator.forRowFilter(notEqual("id", INT_MAX_VALUE), SPEC, true).eval(FILE); Assert.assertTrue("Should read: id equal to upper bound", shouldRead); - shouldRead = ManifestEvaluator.forRowFilter(notEqual("id", INT_MAX_VALUE + 1), SPEC, true).eval(FILE); + shouldRead = + ManifestEvaluator.forRowFilter(notEqual("id", INT_MAX_VALUE + 1), SPEC, true).eval(FILE); Assert.assertTrue("Should read: id above upper bound", shouldRead); - shouldRead = ManifestEvaluator.forRowFilter(notEqual("id", INT_MAX_VALUE + 6), SPEC, true).eval(FILE); + shouldRead = + ManifestEvaluator.forRowFilter(notEqual("id", INT_MAX_VALUE + 6), SPEC, true).eval(FILE); Assert.assertTrue("Should read: id above upper bound", shouldRead); } @Test public void testIntegerNotEqRewritten() { - boolean shouldRead = ManifestEvaluator.forRowFilter(not(equal("id", INT_MIN_VALUE - 25)), SPEC, true).eval(FILE); + boolean shouldRead = + ManifestEvaluator.forRowFilter(not(equal("id", INT_MIN_VALUE - 25)), SPEC, true).eval(FILE); Assert.assertTrue("Should read: id below lower bound", shouldRead); - shouldRead = ManifestEvaluator.forRowFilter(not(equal("id", INT_MIN_VALUE - 1)), SPEC, true).eval(FILE); + shouldRead = + ManifestEvaluator.forRowFilter(not(equal("id", INT_MIN_VALUE - 1)), SPEC, true).eval(FILE); Assert.assertTrue("Should read: id below lower bound", shouldRead); - shouldRead = ManifestEvaluator.forRowFilter(not(equal("id", INT_MIN_VALUE)), SPEC, true).eval(FILE); + shouldRead = + ManifestEvaluator.forRowFilter(not(equal("id", INT_MIN_VALUE)), SPEC, true).eval(FILE); Assert.assertTrue("Should read: id equal to lower bound", shouldRead); - shouldRead = ManifestEvaluator.forRowFilter(not(equal("id", INT_MAX_VALUE - 4)), SPEC, true).eval(FILE); + shouldRead = + ManifestEvaluator.forRowFilter(not(equal("id", INT_MAX_VALUE - 4)), SPEC, true).eval(FILE); Assert.assertTrue("Should read: id between lower and upper bounds", shouldRead); - shouldRead = ManifestEvaluator.forRowFilter(not(equal("id", INT_MAX_VALUE)), SPEC, true).eval(FILE); + shouldRead = + ManifestEvaluator.forRowFilter(not(equal("id", INT_MAX_VALUE)), SPEC, true).eval(FILE); Assert.assertTrue("Should read: id equal to upper bound", shouldRead); - shouldRead = ManifestEvaluator.forRowFilter(not(equal("id", INT_MAX_VALUE + 1)), SPEC, true).eval(FILE); + shouldRead = + ManifestEvaluator.forRowFilter(not(equal("id", INT_MAX_VALUE + 1)), SPEC, true).eval(FILE); Assert.assertTrue("Should read: id above upper bound", shouldRead); - shouldRead = ManifestEvaluator.forRowFilter(not(equal("id", INT_MAX_VALUE + 6)), SPEC, true).eval(FILE); + shouldRead = + ManifestEvaluator.forRowFilter(not(equal("id", INT_MAX_VALUE + 6)), SPEC, true).eval(FILE); Assert.assertTrue("Should read: id above upper bound", shouldRead); } @Test public void testCaseInsensitiveIntegerNotEqRewritten() { - boolean shouldRead = ManifestEvaluator.forRowFilter(not(equal("ID", INT_MIN_VALUE - 25)), SPEC, false).eval(FILE); + boolean shouldRead = + ManifestEvaluator.forRowFilter(not(equal("ID", INT_MIN_VALUE - 25)), SPEC, false) + .eval(FILE); Assert.assertTrue("Should read: 
id below lower bound", shouldRead); - shouldRead = ManifestEvaluator.forRowFilter(not(equal("ID", INT_MIN_VALUE - 1)), SPEC, false).eval(FILE); + shouldRead = + ManifestEvaluator.forRowFilter(not(equal("ID", INT_MIN_VALUE - 1)), SPEC, false).eval(FILE); Assert.assertTrue("Should read: id below lower bound", shouldRead); - shouldRead = ManifestEvaluator.forRowFilter(not(equal("ID", INT_MIN_VALUE)), SPEC, false).eval(FILE); + shouldRead = + ManifestEvaluator.forRowFilter(not(equal("ID", INT_MIN_VALUE)), SPEC, false).eval(FILE); Assert.assertTrue("Should read: id equal to lower bound", shouldRead); - shouldRead = ManifestEvaluator.forRowFilter(not(equal("ID", INT_MAX_VALUE - 4)), SPEC, false).eval(FILE); + shouldRead = + ManifestEvaluator.forRowFilter(not(equal("ID", INT_MAX_VALUE - 4)), SPEC, false).eval(FILE); Assert.assertTrue("Should read: id between lower and upper bounds", shouldRead); - shouldRead = ManifestEvaluator.forRowFilter(not(equal("ID", INT_MAX_VALUE)), SPEC, false).eval(FILE); + shouldRead = + ManifestEvaluator.forRowFilter(not(equal("ID", INT_MAX_VALUE)), SPEC, false).eval(FILE); Assert.assertTrue("Should read: id equal to upper bound", shouldRead); - shouldRead = ManifestEvaluator.forRowFilter(not(equal("ID", INT_MAX_VALUE + 1)), SPEC, false).eval(FILE); + shouldRead = + ManifestEvaluator.forRowFilter(not(equal("ID", INT_MAX_VALUE + 1)), SPEC, false).eval(FILE); Assert.assertTrue("Should read: id above upper bound", shouldRead); - shouldRead = ManifestEvaluator.forRowFilter(not(equal("ID", INT_MAX_VALUE + 6)), SPEC, false).eval(FILE); + shouldRead = + ManifestEvaluator.forRowFilter(not(equal("ID", INT_MAX_VALUE + 6)), SPEC, false).eval(FILE); Assert.assertTrue("Should read: id above upper bound", shouldRead); } @Test public void testCaseSensitiveIntegerNotEqRewritten() { - AssertHelpers.assertThrows("Should complain about missing column in expression", - ValidationException.class, "Cannot find field 'ID'", + AssertHelpers.assertThrows( + "Should complain about missing column in expression", + ValidationException.class, + "Cannot find field 'ID'", () -> ManifestEvaluator.forRowFilter(not(equal("ID", 5)), SPEC, true).eval(FILE)); } @Test public void testStringStartsWith() { - boolean shouldRead = ManifestEvaluator.forRowFilter(startsWith("some_nulls", "a"), SPEC, false).eval(FILE); + boolean shouldRead = + ManifestEvaluator.forRowFilter(startsWith("some_nulls", "a"), SPEC, false).eval(FILE); Assert.assertTrue("Should read: range matches", shouldRead); - shouldRead = ManifestEvaluator.forRowFilter(startsWith("some_nulls", "aa"), SPEC, false).eval(FILE); + shouldRead = + ManifestEvaluator.forRowFilter(startsWith("some_nulls", "aa"), SPEC, false).eval(FILE); Assert.assertTrue("Should read: range matches", shouldRead); - shouldRead = ManifestEvaluator.forRowFilter(startsWith("some_nulls", "dddd"), SPEC, false).eval(FILE); + shouldRead = + ManifestEvaluator.forRowFilter(startsWith("some_nulls", "dddd"), SPEC, false).eval(FILE); Assert.assertTrue("Should read: range matches", shouldRead); - shouldRead = ManifestEvaluator.forRowFilter(startsWith("some_nulls", "z"), SPEC, false).eval(FILE); + shouldRead = + ManifestEvaluator.forRowFilter(startsWith("some_nulls", "z"), SPEC, false).eval(FILE); Assert.assertTrue("Should read: range matches", shouldRead); - shouldRead = ManifestEvaluator.forRowFilter(startsWith("no_nulls", "a"), SPEC, false).eval(FILE); + shouldRead = + ManifestEvaluator.forRowFilter(startsWith("no_nulls", "a"), SPEC, false).eval(FILE); Assert.assertTrue("Should read: 
range matches", shouldRead); - shouldRead = ManifestEvaluator.forRowFilter(startsWith("some_nulls", "zzzz"), SPEC, false).eval(FILE); + shouldRead = + ManifestEvaluator.forRowFilter(startsWith("some_nulls", "zzzz"), SPEC, false).eval(FILE); Assert.assertFalse("Should skip: range doesn't match", shouldRead); - shouldRead = ManifestEvaluator.forRowFilter(startsWith("some_nulls", "1"), SPEC, false).eval(FILE); + shouldRead = + ManifestEvaluator.forRowFilter(startsWith("some_nulls", "1"), SPEC, false).eval(FILE); Assert.assertFalse("Should skip: range doesn't match", shouldRead); } @Test public void testStringNotStartsWith() { - boolean shouldRead = ManifestEvaluator.forRowFilter(notStartsWith("some_nulls", "a"), SPEC, false).eval(FILE); + boolean shouldRead = + ManifestEvaluator.forRowFilter(notStartsWith("some_nulls", "a"), SPEC, false).eval(FILE); Assert.assertTrue("Should read: range matches", shouldRead); - shouldRead = ManifestEvaluator.forRowFilter(notStartsWith("some_nulls", "aa"), SPEC, false).eval(FILE); + shouldRead = + ManifestEvaluator.forRowFilter(notStartsWith("some_nulls", "aa"), SPEC, false).eval(FILE); Assert.assertTrue("Should read: range matches", shouldRead); - shouldRead = ManifestEvaluator.forRowFilter(notStartsWith("some_nulls", "dddd"), SPEC, false).eval(FILE); + shouldRead = + ManifestEvaluator.forRowFilter(notStartsWith("some_nulls", "dddd"), SPEC, false).eval(FILE); Assert.assertTrue("Should read: range matches", shouldRead); - shouldRead = ManifestEvaluator.forRowFilter(notStartsWith("some_nulls", "z"), SPEC, false).eval(FILE); + shouldRead = + ManifestEvaluator.forRowFilter(notStartsWith("some_nulls", "z"), SPEC, false).eval(FILE); Assert.assertTrue("Should read: range matches", shouldRead); - shouldRead = ManifestEvaluator.forRowFilter(notStartsWith("no_nulls", "a"), SPEC, false).eval(FILE); + shouldRead = + ManifestEvaluator.forRowFilter(notStartsWith("no_nulls", "a"), SPEC, false).eval(FILE); Assert.assertTrue("Should read: range matches", shouldRead); - shouldRead = ManifestEvaluator.forRowFilter(notStartsWith("some_nulls", "zzzz"), SPEC, false).eval(FILE); + shouldRead = + ManifestEvaluator.forRowFilter(notStartsWith("some_nulls", "zzzz"), SPEC, false).eval(FILE); Assert.assertTrue("Should read: range matches", shouldRead); - shouldRead = ManifestEvaluator.forRowFilter(notStartsWith("some_nulls", "1"), SPEC, false).eval(FILE); + shouldRead = + ManifestEvaluator.forRowFilter(notStartsWith("some_nulls", "1"), SPEC, false).eval(FILE); Assert.assertTrue("Should read: range matches", shouldRead); - shouldRead = ManifestEvaluator.forRowFilter(notStartsWith("all_same_value_or_null", "a"), SPEC, false).eval(FILE); + shouldRead = + ManifestEvaluator.forRowFilter(notStartsWith("all_same_value_or_null", "a"), SPEC, false) + .eval(FILE); Assert.assertTrue("Should read: range matches on null", shouldRead); - shouldRead = ManifestEvaluator.forRowFilter(notStartsWith("all_same_value_or_null", "aa"), SPEC, false).eval(FILE); + shouldRead = + ManifestEvaluator.forRowFilter(notStartsWith("all_same_value_or_null", "aa"), SPEC, false) + .eval(FILE); Assert.assertTrue("Should read: range matches", shouldRead); - shouldRead = ManifestEvaluator.forRowFilter(notStartsWith("all_same_value_or_null", "A"), SPEC, false).eval(FILE); + shouldRead = + ManifestEvaluator.forRowFilter(notStartsWith("all_same_value_or_null", "A"), SPEC, false) + .eval(FILE); Assert.assertTrue("Should read: range matches", shouldRead); - // Iceberg does not implement SQL's 3-way boolean logic, so the choice of 
an all null column matching is - // by definition in order to surface more values to the query engine to allow it to make its own decision. - shouldRead = ManifestEvaluator.forRowFilter(notStartsWith("all_nulls_missing_nan", "A"), SPEC, false).eval(FILE); + // Iceberg does not implement SQL's 3-way boolean logic, so the choice of an all null column + // matching is + // by definition in order to surface more values to the query engine to allow it to make its own + // decision. + shouldRead = + ManifestEvaluator.forRowFilter(notStartsWith("all_nulls_missing_nan", "A"), SPEC, false) + .eval(FILE); Assert.assertTrue("Should read: range matches", shouldRead); - shouldRead = ManifestEvaluator.forRowFilter(notStartsWith("no_nulls_same_value_a", "a"), SPEC, false).eval(FILE); + shouldRead = + ManifestEvaluator.forRowFilter(notStartsWith("no_nulls_same_value_a", "a"), SPEC, false) + .eval(FILE); Assert.assertFalse("Should not read: all values start with the prefix", shouldRead); } @Test public void testIntegerIn() { - boolean shouldRead = ManifestEvaluator.forRowFilter( - in("id", INT_MIN_VALUE - 25, INT_MIN_VALUE - 24), SPEC, true).eval(FILE); + boolean shouldRead = + ManifestEvaluator.forRowFilter(in("id", INT_MIN_VALUE - 25, INT_MIN_VALUE - 24), SPEC, true) + .eval(FILE); Assert.assertFalse("Should not read: id below lower bound (5 < 30, 6 < 30)", shouldRead); - shouldRead = ManifestEvaluator.forRowFilter( - in("id", INT_MIN_VALUE - 2, INT_MIN_VALUE - 1), SPEC, true).eval(FILE); + shouldRead = + ManifestEvaluator.forRowFilter(in("id", INT_MIN_VALUE - 2, INT_MIN_VALUE - 1), SPEC, true) + .eval(FILE); Assert.assertFalse("Should not read: id below lower bound (28 < 30, 29 < 30)", shouldRead); - shouldRead = ManifestEvaluator.forRowFilter( - in("id", INT_MIN_VALUE - 1, INT_MIN_VALUE), SPEC, true).eval(FILE); + shouldRead = + ManifestEvaluator.forRowFilter(in("id", INT_MIN_VALUE - 1, INT_MIN_VALUE), SPEC, true) + .eval(FILE); Assert.assertTrue("Should read: id equal to lower bound (30 == 30)", shouldRead); - shouldRead = ManifestEvaluator.forRowFilter( - in("id", INT_MAX_VALUE - 4, INT_MAX_VALUE - 3), SPEC, true).eval(FILE); - Assert.assertTrue("Should read: id between lower and upper bounds (30 < 75 < 79, 30 < 76 < 79)", shouldRead); + shouldRead = + ManifestEvaluator.forRowFilter(in("id", INT_MAX_VALUE - 4, INT_MAX_VALUE - 3), SPEC, true) + .eval(FILE); + Assert.assertTrue( + "Should read: id between lower and upper bounds (30 < 75 < 79, 30 < 76 < 79)", shouldRead); - shouldRead = ManifestEvaluator.forRowFilter( - in("id", INT_MAX_VALUE, INT_MAX_VALUE + 1), SPEC, true).eval(FILE); + shouldRead = + ManifestEvaluator.forRowFilter(in("id", INT_MAX_VALUE, INT_MAX_VALUE + 1), SPEC, true) + .eval(FILE); Assert.assertTrue("Should read: id equal to upper bound (79 == 79)", shouldRead); - shouldRead = ManifestEvaluator.forRowFilter( - in("id", INT_MAX_VALUE + 1, INT_MAX_VALUE + 2), SPEC, true).eval(FILE); + shouldRead = + ManifestEvaluator.forRowFilter(in("id", INT_MAX_VALUE + 1, INT_MAX_VALUE + 2), SPEC, true) + .eval(FILE); Assert.assertFalse("Should not read: id above upper bound (80 > 79, 81 > 79)", shouldRead); - shouldRead = ManifestEvaluator.forRowFilter( - in("id", INT_MAX_VALUE + 6, INT_MAX_VALUE + 7), SPEC, true).eval(FILE); + shouldRead = + ManifestEvaluator.forRowFilter(in("id", INT_MAX_VALUE + 6, INT_MAX_VALUE + 7), SPEC, true) + .eval(FILE); Assert.assertFalse("Should not read: id above upper bound (85 > 79, 86 > 79)", shouldRead); - shouldRead = ManifestEvaluator.forRowFilter( - 
in("all_nulls_missing_nan", "abc", "def"), SPEC, true).eval(FILE); + shouldRead = + ManifestEvaluator.forRowFilter(in("all_nulls_missing_nan", "abc", "def"), SPEC, true) + .eval(FILE); Assert.assertFalse("Should skip: in on all nulls column", shouldRead); - shouldRead = ManifestEvaluator.forRowFilter( - in("some_nulls", "abc", "def"), SPEC, true).eval(FILE); + shouldRead = + ManifestEvaluator.forRowFilter(in("some_nulls", "abc", "def"), SPEC, true).eval(FILE); Assert.assertTrue("Should read: in on some nulls column", shouldRead); - shouldRead = ManifestEvaluator.forRowFilter( - in("no_nulls", "abc", "def"), SPEC, true).eval(FILE); + shouldRead = + ManifestEvaluator.forRowFilter(in("no_nulls", "abc", "def"), SPEC, true).eval(FILE); Assert.assertTrue("Should read: in on no nulls column", shouldRead); } @Test public void testIntegerNotIn() { - boolean shouldRead = ManifestEvaluator.forRowFilter( - notIn("id", INT_MIN_VALUE - 25, INT_MIN_VALUE - 24), SPEC, true).eval(FILE); + boolean shouldRead = + ManifestEvaluator.forRowFilter( + notIn("id", INT_MIN_VALUE - 25, INT_MIN_VALUE - 24), SPEC, true) + .eval(FILE); Assert.assertTrue("Should read: id below lower bound (5 < 30, 6 < 30)", shouldRead); - shouldRead = ManifestEvaluator.forRowFilter( - notIn("id", INT_MIN_VALUE - 2, INT_MIN_VALUE - 1), SPEC, true).eval(FILE); + shouldRead = + ManifestEvaluator.forRowFilter( + notIn("id", INT_MIN_VALUE - 2, INT_MIN_VALUE - 1), SPEC, true) + .eval(FILE); Assert.assertTrue("Should read: id below lower bound (28 < 30, 29 < 30)", shouldRead); - shouldRead = ManifestEvaluator.forRowFilter( - notIn("id", INT_MIN_VALUE - 1, INT_MIN_VALUE), SPEC, true).eval(FILE); + shouldRead = + ManifestEvaluator.forRowFilter(notIn("id", INT_MIN_VALUE - 1, INT_MIN_VALUE), SPEC, true) + .eval(FILE); Assert.assertTrue("Should read: id equal to lower bound (30 == 30)", shouldRead); - shouldRead = ManifestEvaluator.forRowFilter( - notIn("id", INT_MAX_VALUE - 4, INT_MAX_VALUE - 3), SPEC, true).eval(FILE); - Assert.assertTrue("Should read: id between lower and upper bounds (30 < 75 < 79, 30 < 76 < 79)", shouldRead); + shouldRead = + ManifestEvaluator.forRowFilter( + notIn("id", INT_MAX_VALUE - 4, INT_MAX_VALUE - 3), SPEC, true) + .eval(FILE); + Assert.assertTrue( + "Should read: id between lower and upper bounds (30 < 75 < 79, 30 < 76 < 79)", shouldRead); - shouldRead = ManifestEvaluator.forRowFilter( - notIn("id", INT_MAX_VALUE, INT_MAX_VALUE + 1), SPEC, true).eval(FILE); + shouldRead = + ManifestEvaluator.forRowFilter(notIn("id", INT_MAX_VALUE, INT_MAX_VALUE + 1), SPEC, true) + .eval(FILE); Assert.assertTrue("Should read: id equal to upper bound (79 == 79)", shouldRead); - shouldRead = ManifestEvaluator.forRowFilter( - notIn("id", INT_MAX_VALUE + 1, INT_MAX_VALUE + 2), SPEC, true).eval(FILE); + shouldRead = + ManifestEvaluator.forRowFilter( + notIn("id", INT_MAX_VALUE + 1, INT_MAX_VALUE + 2), SPEC, true) + .eval(FILE); Assert.assertTrue("Should read: id above upper bound (80 > 79, 81 > 79)", shouldRead); - shouldRead = ManifestEvaluator.forRowFilter( - notIn("id", INT_MAX_VALUE + 6, INT_MAX_VALUE + 7), SPEC, true).eval(FILE); + shouldRead = + ManifestEvaluator.forRowFilter( + notIn("id", INT_MAX_VALUE + 6, INT_MAX_VALUE + 7), SPEC, true) + .eval(FILE); Assert.assertTrue("Should read: id above upper bound (85 > 79, 86 > 79)", shouldRead); - shouldRead = ManifestEvaluator.forRowFilter( - notIn("all_nulls_missing_nan", "abc", "def"), SPEC, true).eval(FILE); + shouldRead = + ManifestEvaluator.forRowFilter(notIn("all_nulls_missing_nan", 
"abc", "def"), SPEC, true) + .eval(FILE); Assert.assertTrue("Should read: notIn on all nulls column", shouldRead); - shouldRead = ManifestEvaluator.forRowFilter( - notIn("some_nulls", "abc", "def"), SPEC, true).eval(FILE); + shouldRead = + ManifestEvaluator.forRowFilter(notIn("some_nulls", "abc", "def"), SPEC, true).eval(FILE); Assert.assertTrue("Should read: notIn on some nulls column", shouldRead); - shouldRead = ManifestEvaluator.forRowFilter( - notIn("no_nulls", "abc", "def"), SPEC, true).eval(FILE); + shouldRead = + ManifestEvaluator.forRowFilter(notIn("no_nulls", "abc", "def"), SPEC, true).eval(FILE); Assert.assertTrue("Should read: notIn on no nulls column", shouldRead); } } diff --git a/api/src/test/java/org/apache/iceberg/expressions/TestInclusiveMetricsEvaluator.java b/api/src/test/java/org/apache/iceberg/expressions/TestInclusiveMetricsEvaluator.java index fcaa352ef4aa..f385836361e0 100644 --- a/api/src/test/java/org/apache/iceberg/expressions/TestInclusiveMetricsEvaluator.java +++ b/api/src/test/java/org/apache/iceberg/expressions/TestInclusiveMetricsEvaluator.java @@ -16,25 +16,8 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.expressions; -import java.util.List; -import org.apache.iceberg.AssertHelpers; -import org.apache.iceberg.DataFile; -import org.apache.iceberg.Schema; -import org.apache.iceberg.TestHelpers.Row; -import org.apache.iceberg.TestHelpers.TestDataFile; -import org.apache.iceberg.exceptions.ValidationException; -import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap; -import org.apache.iceberg.relocated.com.google.common.collect.Lists; -import org.apache.iceberg.types.Types; -import org.apache.iceberg.types.Types.IntegerType; -import org.apache.iceberg.types.Types.StringType; -import org.apache.iceberg.util.UnicodeUtil; -import org.junit.Assert; -import org.junit.Test; - import static org.apache.iceberg.expressions.Expressions.and; import static org.apache.iceberg.expressions.Expressions.equal; import static org.apache.iceberg.expressions.Expressions.greaterThan; @@ -56,105 +39,137 @@ import static org.apache.iceberg.types.Types.NestedField.optional; import static org.apache.iceberg.types.Types.NestedField.required; +import java.util.List; +import org.apache.iceberg.AssertHelpers; +import org.apache.iceberg.DataFile; +import org.apache.iceberg.Schema; +import org.apache.iceberg.TestHelpers.Row; +import org.apache.iceberg.TestHelpers.TestDataFile; +import org.apache.iceberg.exceptions.ValidationException; +import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap; +import org.apache.iceberg.relocated.com.google.common.collect.Lists; +import org.apache.iceberg.types.Types; +import org.apache.iceberg.types.Types.IntegerType; +import org.apache.iceberg.types.Types.StringType; +import org.apache.iceberg.util.UnicodeUtil; +import org.junit.Assert; +import org.junit.Test; + public class TestInclusiveMetricsEvaluator { - private static final Schema SCHEMA = new Schema( - required(1, "id", IntegerType.get()), - optional(2, "no_stats", Types.IntegerType.get()), - required(3, "required", Types.StringType.get()), - optional(4, "all_nulls", Types.StringType.get()), - optional(5, "some_nulls", Types.StringType.get()), - optional(6, "no_nulls", Types.StringType.get()), - optional(7, "all_nans", Types.DoubleType.get()), - optional(8, "some_nans", Types.FloatType.get()), - optional(9, "no_nans", Types.FloatType.get()), - optional(10, "all_nulls_double", 
Types.DoubleType.get()), - optional(11, "all_nans_v1_stats", Types.FloatType.get()), - optional(12, "nan_and_null_only", Types.DoubleType.get()), - optional(13, "no_nan_stats", Types.DoubleType.get()), - optional(14, "some_empty", Types.StringType.get()) - ); + private static final Schema SCHEMA = + new Schema( + required(1, "id", IntegerType.get()), + optional(2, "no_stats", Types.IntegerType.get()), + required(3, "required", Types.StringType.get()), + optional(4, "all_nulls", Types.StringType.get()), + optional(5, "some_nulls", Types.StringType.get()), + optional(6, "no_nulls", Types.StringType.get()), + optional(7, "all_nans", Types.DoubleType.get()), + optional(8, "some_nans", Types.FloatType.get()), + optional(9, "no_nans", Types.FloatType.get()), + optional(10, "all_nulls_double", Types.DoubleType.get()), + optional(11, "all_nans_v1_stats", Types.FloatType.get()), + optional(12, "nan_and_null_only", Types.DoubleType.get()), + optional(13, "no_nan_stats", Types.DoubleType.get()), + optional(14, "some_empty", Types.StringType.get())); private static final int INT_MIN_VALUE = 30; private static final int INT_MAX_VALUE = 79; - private static final DataFile FILE = new TestDataFile("file.avro", Row.of(), 50, - // any value counts, including nulls - ImmutableMap.builder() - .put(4, 50L) - .put(5, 50L) - .put(6, 50L) - .put(7, 50L) - .put(8, 50L) - .put(9, 50L) - .put(10, 50L) - .put(11, 50L) - .put(12, 50L) - .put(13, 50L) - .put(14, 50L) - .build(), - // null value counts - ImmutableMap.builder() - .put(4, 50L) - .put(5, 10L) - .put(6, 0L) - .put(10, 50L) - .put(11, 0L) - .put(12, 1L) - .put(14, 0L) - .build(), - // nan value counts - ImmutableMap.of( - 7, 50L, - 8, 10L, - 9, 0L), - // lower bounds - ImmutableMap.of( - 1, toByteBuffer(IntegerType.get(), INT_MIN_VALUE), - 11, toByteBuffer(Types.FloatType.get(), Float.NaN), - 12, toByteBuffer(Types.DoubleType.get(), Double.NaN), - 14, toByteBuffer(Types.StringType.get(), "")), - // upper bounds - ImmutableMap.of( - 1, toByteBuffer(IntegerType.get(), INT_MAX_VALUE), - 11, toByteBuffer(Types.FloatType.get(), Float.NaN), - 12, toByteBuffer(Types.DoubleType.get(), Double.NaN), - 14, toByteBuffer(Types.StringType.get(), "房东整租霍营小区二层两居室"))); - - private static final DataFile FILE_2 = new TestDataFile("file_2.avro", Row.of(), 50, - // any value counts, including nulls - ImmutableMap.of(3, 20L), - // null value counts - ImmutableMap.of(3, 2L), - // nan value counts - null, - // lower bounds - ImmutableMap.of(3, toByteBuffer(StringType.get(), "aa")), - // upper bounds - ImmutableMap.of(3, toByteBuffer(StringType.get(), "dC"))); - - private static final DataFile FILE_3 = new TestDataFile("file_3.avro", Row.of(), 50, - // any value counts, including nulls - ImmutableMap.of(3, 20L), - // null value counts - ImmutableMap.of(3, 2L), - // nan value counts - null, - // lower bounds - ImmutableMap.of(3, toByteBuffer(StringType.get(), "1str1")), - // upper bounds - ImmutableMap.of(3, toByteBuffer(StringType.get(), "3str3"))); - - private static final DataFile FILE_4 = new TestDataFile("file_4.avro", Row.of(), 50, - // any value counts, including nulls - ImmutableMap.of(3, 20L), - // null value counts - ImmutableMap.of(3, 2L), - // nan value counts - null, - // lower bounds - ImmutableMap.of(3, toByteBuffer(StringType.get(), "abc")), - // upper bounds - ImmutableMap.of(3, toByteBuffer(StringType.get(), "イロハニホヘト"))); + private static final DataFile FILE = + new TestDataFile( + "file.avro", + Row.of(), + 50, + // any value counts, including nulls + 
ImmutableMap.builder() + .put(4, 50L) + .put(5, 50L) + .put(6, 50L) + .put(7, 50L) + .put(8, 50L) + .put(9, 50L) + .put(10, 50L) + .put(11, 50L) + .put(12, 50L) + .put(13, 50L) + .put(14, 50L) + .build(), + // null value counts + ImmutableMap.builder() + .put(4, 50L) + .put(5, 10L) + .put(6, 0L) + .put(10, 50L) + .put(11, 0L) + .put(12, 1L) + .put(14, 0L) + .build(), + // nan value counts + ImmutableMap.of( + 7, 50L, + 8, 10L, + 9, 0L), + // lower bounds + ImmutableMap.of( + 1, toByteBuffer(IntegerType.get(), INT_MIN_VALUE), + 11, toByteBuffer(Types.FloatType.get(), Float.NaN), + 12, toByteBuffer(Types.DoubleType.get(), Double.NaN), + 14, toByteBuffer(Types.StringType.get(), "")), + // upper bounds + ImmutableMap.of( + 1, toByteBuffer(IntegerType.get(), INT_MAX_VALUE), + 11, toByteBuffer(Types.FloatType.get(), Float.NaN), + 12, toByteBuffer(Types.DoubleType.get(), Double.NaN), + 14, toByteBuffer(Types.StringType.get(), "房东整租霍营小区二层两居室"))); + + private static final DataFile FILE_2 = + new TestDataFile( + "file_2.avro", + Row.of(), + 50, + // any value counts, including nulls + ImmutableMap.of(3, 20L), + // null value counts + ImmutableMap.of(3, 2L), + // nan value counts + null, + // lower bounds + ImmutableMap.of(3, toByteBuffer(StringType.get(), "aa")), + // upper bounds + ImmutableMap.of(3, toByteBuffer(StringType.get(), "dC"))); + + private static final DataFile FILE_3 = + new TestDataFile( + "file_3.avro", + Row.of(), + 50, + // any value counts, including nulls + ImmutableMap.of(3, 20L), + // null value counts + ImmutableMap.of(3, 2L), + // nan value counts + null, + // lower bounds + ImmutableMap.of(3, toByteBuffer(StringType.get(), "1str1")), + // upper bounds + ImmutableMap.of(3, toByteBuffer(StringType.get(), "3str3"))); + + private static final DataFile FILE_4 = + new TestDataFile( + "file_4.avro", + Row.of(), + 50, + // any value counts, including nulls + ImmutableMap.of(3, 20L), + // null value counts + ImmutableMap.of(3, 2L), + // nan value counts + null, + // lower bounds + ImmutableMap.of(3, toByteBuffer(StringType.get(), "abc")), + // upper bounds + ImmutableMap.of(3, toByteBuffer(StringType.get(), "イロハニホヘト"))); @Test public void testAllNulls() { @@ -164,13 +179,15 @@ public void testAllNulls() { shouldRead = new InclusiveMetricsEvaluator(SCHEMA, lessThan("all_nulls", "a")).eval(FILE); Assert.assertFalse("Should skip: lessThan on all null column", shouldRead); - shouldRead = new InclusiveMetricsEvaluator(SCHEMA, lessThanOrEqual("all_nulls", "a")).eval(FILE); + shouldRead = + new InclusiveMetricsEvaluator(SCHEMA, lessThanOrEqual("all_nulls", "a")).eval(FILE); Assert.assertFalse("Should skip: lessThanOrEqual on all null column", shouldRead); shouldRead = new InclusiveMetricsEvaluator(SCHEMA, greaterThan("all_nulls", "a")).eval(FILE); Assert.assertFalse("Should skip: greaterThan on all null column", shouldRead); - shouldRead = new InclusiveMetricsEvaluator(SCHEMA, greaterThanOrEqual("all_nulls", "a")).eval(FILE); + shouldRead = + new InclusiveMetricsEvaluator(SCHEMA, greaterThanOrEqual("all_nulls", "a")).eval(FILE); Assert.assertFalse("Should skip: greaterThanOrEqual on all null column", shouldRead); shouldRead = new InclusiveMetricsEvaluator(SCHEMA, equal("all_nulls", "a")).eval(FILE); @@ -216,13 +233,15 @@ public void testIsNaN() { Assert.assertFalse("Should skip: all-null column doesn't contain nan value", shouldRead); shouldRead = new InclusiveMetricsEvaluator(SCHEMA, isNaN("no_nan_stats")).eval(FILE); - Assert.assertTrue("Should read: no guarantee on if contains nan value 
without nan stats", shouldRead); + Assert.assertTrue( + "Should read: no guarantee on if contains nan value without nan stats", shouldRead); shouldRead = new InclusiveMetricsEvaluator(SCHEMA, isNaN("all_nans_v1_stats")).eval(FILE); Assert.assertTrue("Should read: at least one nan value in all nan column", shouldRead); shouldRead = new InclusiveMetricsEvaluator(SCHEMA, isNaN("nan_and_null_only")).eval(FILE); - Assert.assertTrue("Should read: at least one nan value in nan and nulls only column", shouldRead); + Assert.assertTrue( + "Should read: at least one nan value in nan and nulls only column", shouldRead); } @Test @@ -240,13 +259,16 @@ public void testNotNaN() { Assert.assertTrue("Should read: at least one non-nan value in all null column", shouldRead); shouldRead = new InclusiveMetricsEvaluator(SCHEMA, notNaN("no_nan_stats")).eval(FILE); - Assert.assertTrue("Should read: no guarantee on if contains nan value without nan stats", shouldRead); + Assert.assertTrue( + "Should read: no guarantee on if contains nan value without nan stats", shouldRead); shouldRead = new InclusiveMetricsEvaluator(SCHEMA, notNaN("all_nans_v1_stats")).eval(FILE); - Assert.assertTrue("Should read: no guarantee on if contains nan value without nan stats", shouldRead); + Assert.assertTrue( + "Should read: no guarantee on if contains nan value without nan stats", shouldRead); shouldRead = new InclusiveMetricsEvaluator(SCHEMA, notNaN("nan_and_null_only")).eval(FILE); - Assert.assertTrue("Should read: at least one null value in nan and nulls only column", shouldRead); + Assert.assertTrue( + "Should read: at least one null value in nan and nulls only column", shouldRead); } @Test @@ -260,8 +282,10 @@ public void testRequiredColumn() { @Test public void testMissingColumn() { - AssertHelpers.assertThrows("Should complain about missing column in expression", - ValidationException.class, "Cannot find field 'missing'", + AssertHelpers.assertThrows( + "Should complain about missing column in expression", + ValidationException.class, + "Cannot find field 'missing'", () -> new InclusiveMetricsEvaluator(SCHEMA, lessThan("missing", 5)).eval(FILE)); } @@ -269,11 +293,19 @@ public void testMissingColumn() { public void testMissingStats() { DataFile missingStats = new TestDataFile("file.parquet", Row.of(), 50); - Expression[] exprs = new Expression[] { - lessThan("no_stats", 5), lessThanOrEqual("no_stats", 30), equal("no_stats", 70), - greaterThan("no_stats", 78), greaterThanOrEqual("no_stats", 90), notEqual("no_stats", 101), - isNull("no_stats"), notNull("no_stats"), isNaN("some_nans"), notNaN("some_nans") - }; + Expression[] exprs = + new Expression[] { + lessThan("no_stats", 5), + lessThanOrEqual("no_stats", 30), + equal("no_stats", 70), + greaterThan("no_stats", 78), + greaterThanOrEqual("no_stats", 90), + notEqual("no_stats", 101), + isNull("no_stats"), + notNull("no_stats"), + isNaN("some_nans"), + notNaN("some_nans") + }; for (Expression expr : exprs) { boolean shouldRead = new InclusiveMetricsEvaluator(SCHEMA, expr).eval(missingStats); @@ -285,11 +317,19 @@ public void testMissingStats() { public void testZeroRecordFile() { DataFile empty = new TestDataFile("file.parquet", Row.of(), 0); - Expression[] exprs = new Expression[] { - lessThan("id", 5), lessThanOrEqual("id", 30), equal("id", 70), greaterThan("id", 78), - greaterThanOrEqual("id", 90), notEqual("id", 101), isNull("some_nulls"), - notNull("some_nulls"), isNaN("some_nans"), notNaN("some_nans"), - }; + Expression[] exprs = + new Expression[] { + lessThan("id", 5), + 
lessThanOrEqual("id", 30), + equal("id", 70), + greaterThan("id", 78), + greaterThanOrEqual("id", 90), + notEqual("id", 101), + isNull("some_nulls"), + notNull("some_nulls"), + isNaN("some_nans"), + notNaN("some_nans"), + }; for (Expression expr : exprs) { boolean shouldRead = new InclusiveMetricsEvaluator(SCHEMA, expr).eval(empty); @@ -300,50 +340,76 @@ public void testZeroRecordFile() { @Test public void testNot() { // this test case must use a real predicate, not alwaysTrue(), or binding will simplify it out - boolean shouldRead = new InclusiveMetricsEvaluator(SCHEMA, not(lessThan("id", INT_MIN_VALUE - 25))).eval(FILE); + boolean shouldRead = + new InclusiveMetricsEvaluator(SCHEMA, not(lessThan("id", INT_MIN_VALUE - 25))).eval(FILE); Assert.assertTrue("Should read: not(false)", shouldRead); - shouldRead = new InclusiveMetricsEvaluator(SCHEMA, not(greaterThan("id", INT_MIN_VALUE - 25))).eval(FILE); + shouldRead = + new InclusiveMetricsEvaluator(SCHEMA, not(greaterThan("id", INT_MIN_VALUE - 25))) + .eval(FILE); Assert.assertFalse("Should skip: not(true)", shouldRead); } @Test public void testAnd() { // this test case must use a real predicate, not alwaysTrue(), or binding will simplify it out - boolean shouldRead = new InclusiveMetricsEvaluator(SCHEMA, - and(lessThan("id", INT_MIN_VALUE - 25), greaterThanOrEqual("id", INT_MIN_VALUE - 30))).eval(FILE); + boolean shouldRead = + new InclusiveMetricsEvaluator( + SCHEMA, + and( + lessThan("id", INT_MIN_VALUE - 25), + greaterThanOrEqual("id", INT_MIN_VALUE - 30))) + .eval(FILE); Assert.assertFalse("Should skip: and(false, true)", shouldRead); - shouldRead = new InclusiveMetricsEvaluator(SCHEMA, - and(lessThan("id", INT_MIN_VALUE - 25), greaterThanOrEqual("id", INT_MAX_VALUE + 1))).eval(FILE); + shouldRead = + new InclusiveMetricsEvaluator( + SCHEMA, + and( + lessThan("id", INT_MIN_VALUE - 25), + greaterThanOrEqual("id", INT_MAX_VALUE + 1))) + .eval(FILE); Assert.assertFalse("Should skip: and(false, false)", shouldRead); - shouldRead = new InclusiveMetricsEvaluator(SCHEMA, - and(greaterThan("id", INT_MIN_VALUE - 25), lessThanOrEqual("id", INT_MIN_VALUE))).eval(FILE); + shouldRead = + new InclusiveMetricsEvaluator( + SCHEMA, + and(greaterThan("id", INT_MIN_VALUE - 25), lessThanOrEqual("id", INT_MIN_VALUE))) + .eval(FILE); Assert.assertTrue("Should read: and(true, true)", shouldRead); } @Test public void testOr() { // this test case must use a real predicate, not alwaysTrue(), or binding will simplify it out - boolean shouldRead = new InclusiveMetricsEvaluator(SCHEMA, - or(lessThan("id", INT_MIN_VALUE - 25), greaterThanOrEqual("id", INT_MAX_VALUE + 1))).eval(FILE); + boolean shouldRead = + new InclusiveMetricsEvaluator( + SCHEMA, + or(lessThan("id", INT_MIN_VALUE - 25), greaterThanOrEqual("id", INT_MAX_VALUE + 1))) + .eval(FILE); Assert.assertFalse("Should skip: or(false, false)", shouldRead); - shouldRead = new InclusiveMetricsEvaluator(SCHEMA, - or(lessThan("id", INT_MIN_VALUE - 25), greaterThanOrEqual("id", INT_MAX_VALUE - 19))).eval(FILE); + shouldRead = + new InclusiveMetricsEvaluator( + SCHEMA, + or( + lessThan("id", INT_MIN_VALUE - 25), + greaterThanOrEqual("id", INT_MAX_VALUE - 19))) + .eval(FILE); Assert.assertTrue("Should read: or(false, true)", shouldRead); } @Test public void testIntegerLt() { - boolean shouldRead = new InclusiveMetricsEvaluator(SCHEMA, lessThan("id", INT_MIN_VALUE - 25)).eval(FILE); + boolean shouldRead = + new InclusiveMetricsEvaluator(SCHEMA, lessThan("id", INT_MIN_VALUE - 25)).eval(FILE); Assert.assertFalse("Should 
not read: id range below lower bound (5 < 30)", shouldRead); shouldRead = new InclusiveMetricsEvaluator(SCHEMA, lessThan("id", INT_MIN_VALUE)).eval(FILE); Assert.assertFalse("Should not read: id range below lower bound (30 is not < 30)", shouldRead); - shouldRead = new InclusiveMetricsEvaluator(SCHEMA, lessThan("id", INT_MIN_VALUE + 1)).eval(FILE); + shouldRead = + new InclusiveMetricsEvaluator(SCHEMA, lessThan("id", INT_MIN_VALUE + 1)).eval(FILE); Assert.assertTrue("Should read: one possible id", shouldRead); shouldRead = new InclusiveMetricsEvaluator(SCHEMA, lessThan("id", INT_MAX_VALUE)).eval(FILE); @@ -352,52 +418,67 @@ public void testIntegerLt() { @Test public void testIntegerLtEq() { - boolean shouldRead = new InclusiveMetricsEvaluator(SCHEMA, lessThanOrEqual("id", INT_MIN_VALUE - 25)).eval(FILE); + boolean shouldRead = + new InclusiveMetricsEvaluator(SCHEMA, lessThanOrEqual("id", INT_MIN_VALUE - 25)).eval(FILE); Assert.assertFalse("Should not read: id range below lower bound (5 < 30)", shouldRead); - shouldRead = new InclusiveMetricsEvaluator(SCHEMA, lessThanOrEqual("id", INT_MIN_VALUE - 1)).eval(FILE); + shouldRead = + new InclusiveMetricsEvaluator(SCHEMA, lessThanOrEqual("id", INT_MIN_VALUE - 1)).eval(FILE); Assert.assertFalse("Should not read: id range below lower bound (29 < 30)", shouldRead); - shouldRead = new InclusiveMetricsEvaluator(SCHEMA, lessThanOrEqual("id", INT_MIN_VALUE)).eval(FILE); + shouldRead = + new InclusiveMetricsEvaluator(SCHEMA, lessThanOrEqual("id", INT_MIN_VALUE)).eval(FILE); Assert.assertTrue("Should read: one possible id", shouldRead); - shouldRead = new InclusiveMetricsEvaluator(SCHEMA, lessThanOrEqual("id", INT_MAX_VALUE)).eval(FILE); + shouldRead = + new InclusiveMetricsEvaluator(SCHEMA, lessThanOrEqual("id", INT_MAX_VALUE)).eval(FILE); Assert.assertTrue("Should read: many possible ids", shouldRead); } @Test public void testIntegerGt() { - boolean shouldRead = new InclusiveMetricsEvaluator(SCHEMA, greaterThan("id", INT_MAX_VALUE + 6)).eval(FILE); + boolean shouldRead = + new InclusiveMetricsEvaluator(SCHEMA, greaterThan("id", INT_MAX_VALUE + 6)).eval(FILE); Assert.assertFalse("Should not read: id range above upper bound (85 < 79)", shouldRead); shouldRead = new InclusiveMetricsEvaluator(SCHEMA, greaterThan("id", INT_MAX_VALUE)).eval(FILE); Assert.assertFalse("Should not read: id range above upper bound (79 is not > 79)", shouldRead); - shouldRead = new InclusiveMetricsEvaluator(SCHEMA, greaterThan("id", INT_MAX_VALUE - 1)).eval(FILE); + shouldRead = + new InclusiveMetricsEvaluator(SCHEMA, greaterThan("id", INT_MAX_VALUE - 1)).eval(FILE); Assert.assertTrue("Should read: one possible id", shouldRead); - shouldRead = new InclusiveMetricsEvaluator(SCHEMA, greaterThan("id", INT_MAX_VALUE - 4)).eval(FILE); + shouldRead = + new InclusiveMetricsEvaluator(SCHEMA, greaterThan("id", INT_MAX_VALUE - 4)).eval(FILE); Assert.assertTrue("Should read: may possible ids", shouldRead); } @Test public void testIntegerGtEq() { - boolean shouldRead = new InclusiveMetricsEvaluator(SCHEMA, greaterThanOrEqual("id", INT_MAX_VALUE + 6)).eval(FILE); + boolean shouldRead = + new InclusiveMetricsEvaluator(SCHEMA, greaterThanOrEqual("id", INT_MAX_VALUE + 6)) + .eval(FILE); Assert.assertFalse("Should not read: id range above upper bound (85 < 79)", shouldRead); - shouldRead = new InclusiveMetricsEvaluator(SCHEMA, greaterThanOrEqual("id", INT_MAX_VALUE + 1)).eval(FILE); + shouldRead = + new InclusiveMetricsEvaluator(SCHEMA, greaterThanOrEqual("id", INT_MAX_VALUE + 1)) + .eval(FILE); 
Assert.assertFalse("Should not read: id range above upper bound (80 > 79)", shouldRead); - shouldRead = new InclusiveMetricsEvaluator(SCHEMA, greaterThanOrEqual("id", INT_MAX_VALUE)).eval(FILE); + shouldRead = + new InclusiveMetricsEvaluator(SCHEMA, greaterThanOrEqual("id", INT_MAX_VALUE)).eval(FILE); Assert.assertTrue("Should read: one possible id", shouldRead); - shouldRead = new InclusiveMetricsEvaluator(SCHEMA, greaterThanOrEqual("id", INT_MAX_VALUE - 4)).eval(FILE); + shouldRead = + new InclusiveMetricsEvaluator(SCHEMA, greaterThanOrEqual("id", INT_MAX_VALUE - 4)) + .eval(FILE); Assert.assertTrue("Should read: may possible ids", shouldRead); } @Test public void testIntegerEq() { - boolean shouldRead = new InclusiveMetricsEvaluator(SCHEMA, equal("id", INT_MIN_VALUE - 25)).eval(FILE); + boolean shouldRead = + new InclusiveMetricsEvaluator(SCHEMA, equal("id", INT_MIN_VALUE - 25)).eval(FILE); Assert.assertFalse("Should not read: id below lower bound", shouldRead); shouldRead = new InclusiveMetricsEvaluator(SCHEMA, equal("id", INT_MIN_VALUE - 1)).eval(FILE); @@ -421,209 +502,267 @@ public void testIntegerEq() { @Test public void testIntegerNotEq() { - boolean shouldRead = new InclusiveMetricsEvaluator(SCHEMA, notEqual("id", INT_MIN_VALUE - 25)).eval(FILE); + boolean shouldRead = + new InclusiveMetricsEvaluator(SCHEMA, notEqual("id", INT_MIN_VALUE - 25)).eval(FILE); Assert.assertTrue("Should read: id below lower bound", shouldRead); - shouldRead = new InclusiveMetricsEvaluator(SCHEMA, notEqual("id", INT_MIN_VALUE - 1)).eval(FILE); + shouldRead = + new InclusiveMetricsEvaluator(SCHEMA, notEqual("id", INT_MIN_VALUE - 1)).eval(FILE); Assert.assertTrue("Should read: id below lower bound", shouldRead); shouldRead = new InclusiveMetricsEvaluator(SCHEMA, notEqual("id", INT_MIN_VALUE)).eval(FILE); Assert.assertTrue("Should read: id equal to lower bound", shouldRead); - shouldRead = new InclusiveMetricsEvaluator(SCHEMA, notEqual("id", INT_MAX_VALUE - 4)).eval(FILE); + shouldRead = + new InclusiveMetricsEvaluator(SCHEMA, notEqual("id", INT_MAX_VALUE - 4)).eval(FILE); Assert.assertTrue("Should read: id between lower and upper bounds", shouldRead); shouldRead = new InclusiveMetricsEvaluator(SCHEMA, notEqual("id", INT_MAX_VALUE)).eval(FILE); Assert.assertTrue("Should read: id equal to upper bound", shouldRead); - shouldRead = new InclusiveMetricsEvaluator(SCHEMA, notEqual("id", INT_MAX_VALUE + 1)).eval(FILE); + shouldRead = + new InclusiveMetricsEvaluator(SCHEMA, notEqual("id", INT_MAX_VALUE + 1)).eval(FILE); Assert.assertTrue("Should read: id above upper bound", shouldRead); - shouldRead = new InclusiveMetricsEvaluator(SCHEMA, notEqual("id", INT_MAX_VALUE + 6)).eval(FILE); + shouldRead = + new InclusiveMetricsEvaluator(SCHEMA, notEqual("id", INT_MAX_VALUE + 6)).eval(FILE); Assert.assertTrue("Should read: id above upper bound", shouldRead); } @Test public void testIntegerNotEqRewritten() { - boolean shouldRead = new InclusiveMetricsEvaluator(SCHEMA, not(equal("id", INT_MIN_VALUE - 25))).eval(FILE); + boolean shouldRead = + new InclusiveMetricsEvaluator(SCHEMA, not(equal("id", INT_MIN_VALUE - 25))).eval(FILE); Assert.assertTrue("Should read: id below lower bound", shouldRead); - shouldRead = new InclusiveMetricsEvaluator(SCHEMA, not(equal("id", INT_MIN_VALUE - 1))).eval(FILE); + shouldRead = + new InclusiveMetricsEvaluator(SCHEMA, not(equal("id", INT_MIN_VALUE - 1))).eval(FILE); Assert.assertTrue("Should read: id below lower bound", shouldRead); shouldRead = new InclusiveMetricsEvaluator(SCHEMA, 
not(equal("id", INT_MIN_VALUE))).eval(FILE); Assert.assertTrue("Should read: id equal to lower bound", shouldRead); - shouldRead = new InclusiveMetricsEvaluator(SCHEMA, not(equal("id", INT_MAX_VALUE - 4))).eval(FILE); + shouldRead = + new InclusiveMetricsEvaluator(SCHEMA, not(equal("id", INT_MAX_VALUE - 4))).eval(FILE); Assert.assertTrue("Should read: id between lower and upper bounds", shouldRead); shouldRead = new InclusiveMetricsEvaluator(SCHEMA, not(equal("id", INT_MAX_VALUE))).eval(FILE); Assert.assertTrue("Should read: id equal to upper bound", shouldRead); - shouldRead = new InclusiveMetricsEvaluator(SCHEMA, not(equal("id", INT_MAX_VALUE + 1))).eval(FILE); + shouldRead = + new InclusiveMetricsEvaluator(SCHEMA, not(equal("id", INT_MAX_VALUE + 1))).eval(FILE); Assert.assertTrue("Should read: id above upper bound", shouldRead); - shouldRead = new InclusiveMetricsEvaluator(SCHEMA, not(equal("id", INT_MAX_VALUE + 6))).eval(FILE); + shouldRead = + new InclusiveMetricsEvaluator(SCHEMA, not(equal("id", INT_MAX_VALUE + 6))).eval(FILE); Assert.assertTrue("Should read: id above upper bound", shouldRead); } @Test public void testCaseInsensitiveIntegerNotEqRewritten() { - boolean shouldRead = new InclusiveMetricsEvaluator(SCHEMA, not(equal("ID", INT_MIN_VALUE - 25)), false).eval(FILE); + boolean shouldRead = + new InclusiveMetricsEvaluator(SCHEMA, not(equal("ID", INT_MIN_VALUE - 25)), false) + .eval(FILE); Assert.assertTrue("Should read: id below lower bound", shouldRead); - shouldRead = new InclusiveMetricsEvaluator(SCHEMA, not(equal("ID", INT_MIN_VALUE - 1)), false).eval(FILE); + shouldRead = + new InclusiveMetricsEvaluator(SCHEMA, not(equal("ID", INT_MIN_VALUE - 1)), false) + .eval(FILE); Assert.assertTrue("Should read: id below lower bound", shouldRead); - shouldRead = new InclusiveMetricsEvaluator(SCHEMA, not(equal("ID", INT_MIN_VALUE)), false).eval(FILE); + shouldRead = + new InclusiveMetricsEvaluator(SCHEMA, not(equal("ID", INT_MIN_VALUE)), false).eval(FILE); Assert.assertTrue("Should read: id equal to lower bound", shouldRead); - shouldRead = new InclusiveMetricsEvaluator(SCHEMA, not(equal("ID", INT_MAX_VALUE - 4)), false).eval(FILE); + shouldRead = + new InclusiveMetricsEvaluator(SCHEMA, not(equal("ID", INT_MAX_VALUE - 4)), false) + .eval(FILE); Assert.assertTrue("Should read: id between lower and upper bounds", shouldRead); - shouldRead = new InclusiveMetricsEvaluator(SCHEMA, not(equal("ID", INT_MAX_VALUE)), false).eval(FILE); + shouldRead = + new InclusiveMetricsEvaluator(SCHEMA, not(equal("ID", INT_MAX_VALUE)), false).eval(FILE); Assert.assertTrue("Should read: id equal to upper bound", shouldRead); - shouldRead = new InclusiveMetricsEvaluator(SCHEMA, not(equal("ID", INT_MAX_VALUE + 1)), false).eval(FILE); + shouldRead = + new InclusiveMetricsEvaluator(SCHEMA, not(equal("ID", INT_MAX_VALUE + 1)), false) + .eval(FILE); Assert.assertTrue("Should read: id above upper bound", shouldRead); - shouldRead = new InclusiveMetricsEvaluator(SCHEMA, not(equal("ID", INT_MAX_VALUE + 6)), false).eval(FILE); + shouldRead = + new InclusiveMetricsEvaluator(SCHEMA, not(equal("ID", INT_MAX_VALUE + 6)), false) + .eval(FILE); Assert.assertTrue("Should read: id above upper bound", shouldRead); } @Test public void testCaseSensitiveIntegerNotEqRewritten() { - AssertHelpers.assertThrows("Should complain about missing column in expression", - ValidationException.class, "Cannot find field 'ID'", + AssertHelpers.assertThrows( + "Should complain about missing column in expression", + ValidationException.class, + 
"Cannot find field 'ID'", () -> new InclusiveMetricsEvaluator(SCHEMA, not(equal("ID", 5)), true).eval(FILE)); } @Test public void testStringStartsWith() { - boolean shouldRead = new InclusiveMetricsEvaluator(SCHEMA, startsWith("required", "a"), true).eval(FILE); + boolean shouldRead = + new InclusiveMetricsEvaluator(SCHEMA, startsWith("required", "a"), true).eval(FILE); Assert.assertTrue("Should read: no stats", shouldRead); - shouldRead = new InclusiveMetricsEvaluator(SCHEMA, startsWith("required", "a"), true).eval(FILE_2); + shouldRead = + new InclusiveMetricsEvaluator(SCHEMA, startsWith("required", "a"), true).eval(FILE_2); Assert.assertTrue("Should read: range matches", shouldRead); - shouldRead = new InclusiveMetricsEvaluator(SCHEMA, startsWith("required", "aa"), true).eval(FILE_2); + shouldRead = + new InclusiveMetricsEvaluator(SCHEMA, startsWith("required", "aa"), true).eval(FILE_2); Assert.assertTrue("Should read: range matches", shouldRead); - shouldRead = new InclusiveMetricsEvaluator(SCHEMA, startsWith("required", "aaa"), true).eval(FILE_2); + shouldRead = + new InclusiveMetricsEvaluator(SCHEMA, startsWith("required", "aaa"), true).eval(FILE_2); Assert.assertTrue("Should read: range matches", shouldRead); - shouldRead = new InclusiveMetricsEvaluator(SCHEMA, startsWith("required", "1s"), true).eval(FILE_3); + shouldRead = + new InclusiveMetricsEvaluator(SCHEMA, startsWith("required", "1s"), true).eval(FILE_3); Assert.assertTrue("Should read: range matches", shouldRead); - shouldRead = new InclusiveMetricsEvaluator(SCHEMA, startsWith("required", "1str1x"), true).eval(FILE_3); + shouldRead = + new InclusiveMetricsEvaluator(SCHEMA, startsWith("required", "1str1x"), true).eval(FILE_3); Assert.assertTrue("Should read: range matches", shouldRead); - shouldRead = new InclusiveMetricsEvaluator(SCHEMA, startsWith("required", "ff"), true).eval(FILE_4); + shouldRead = + new InclusiveMetricsEvaluator(SCHEMA, startsWith("required", "ff"), true).eval(FILE_4); Assert.assertTrue("Should read: range matches", shouldRead); - shouldRead = new InclusiveMetricsEvaluator(SCHEMA, startsWith("required", "aB"), true).eval(FILE_2); + shouldRead = + new InclusiveMetricsEvaluator(SCHEMA, startsWith("required", "aB"), true).eval(FILE_2); Assert.assertFalse("Should not read: range doesn't match", shouldRead); - shouldRead = new InclusiveMetricsEvaluator(SCHEMA, startsWith("required", "dWX"), true).eval(FILE_2); + shouldRead = + new InclusiveMetricsEvaluator(SCHEMA, startsWith("required", "dWX"), true).eval(FILE_2); Assert.assertFalse("Should not read: range doesn't match", shouldRead); - shouldRead = new InclusiveMetricsEvaluator(SCHEMA, startsWith("required", "5"), true).eval(FILE_3); + shouldRead = + new InclusiveMetricsEvaluator(SCHEMA, startsWith("required", "5"), true).eval(FILE_3); Assert.assertFalse("Should not read: range doesn't match", shouldRead); - shouldRead = new InclusiveMetricsEvaluator(SCHEMA, startsWith("required", "3str3x"), true).eval(FILE_3); + shouldRead = + new InclusiveMetricsEvaluator(SCHEMA, startsWith("required", "3str3x"), true).eval(FILE_3); Assert.assertFalse("Should not read: range doesn't match", shouldRead); - shouldRead = new InclusiveMetricsEvaluator(SCHEMA, startsWith("some_empty", "房东整租霍"), true).eval(FILE); + shouldRead = + new InclusiveMetricsEvaluator(SCHEMA, startsWith("some_empty", "房东整租霍"), true).eval(FILE); Assert.assertTrue("Should read: range matches", shouldRead); - shouldRead = new InclusiveMetricsEvaluator(SCHEMA, startsWith("all_nulls", ""), true).eval(FILE); 
+ shouldRead = + new InclusiveMetricsEvaluator(SCHEMA, startsWith("all_nulls", ""), true).eval(FILE); Assert.assertFalse("Should not read: range doesn't match", shouldRead); String aboveMax = UnicodeUtil.truncateStringMax(Literal.of("イロハニホヘト"), 4).value().toString(); - shouldRead = new InclusiveMetricsEvaluator(SCHEMA, startsWith("required", aboveMax), true).eval(FILE_4); + shouldRead = + new InclusiveMetricsEvaluator(SCHEMA, startsWith("required", aboveMax), true).eval(FILE_4); Assert.assertFalse("Should not read: range doesn't match", shouldRead); } @Test public void testStringNotStartsWith() { - boolean shouldRead = new InclusiveMetricsEvaluator(SCHEMA, notStartsWith("required", "a"), true).eval(FILE); + boolean shouldRead = + new InclusiveMetricsEvaluator(SCHEMA, notStartsWith("required", "a"), true).eval(FILE); Assert.assertTrue("Should read: no stats", shouldRead); - shouldRead = new InclusiveMetricsEvaluator(SCHEMA, notStartsWith("required", "a"), true).eval(FILE_2); + shouldRead = + new InclusiveMetricsEvaluator(SCHEMA, notStartsWith("required", "a"), true).eval(FILE_2); Assert.assertTrue("Should read: range matches", shouldRead); - shouldRead = new InclusiveMetricsEvaluator(SCHEMA, notStartsWith("required", "aa"), true).eval(FILE_2); + shouldRead = + new InclusiveMetricsEvaluator(SCHEMA, notStartsWith("required", "aa"), true).eval(FILE_2); Assert.assertTrue("Should read: range matches", shouldRead); - shouldRead = new InclusiveMetricsEvaluator(SCHEMA, notStartsWith("required", "aaa"), true).eval(FILE_2); + shouldRead = + new InclusiveMetricsEvaluator(SCHEMA, notStartsWith("required", "aaa"), true).eval(FILE_2); Assert.assertTrue("Should read: range matches", shouldRead); - shouldRead = new InclusiveMetricsEvaluator(SCHEMA, notStartsWith("required", "1s"), true).eval(FILE_3); + shouldRead = + new InclusiveMetricsEvaluator(SCHEMA, notStartsWith("required", "1s"), true).eval(FILE_3); Assert.assertTrue("Should read: range matches", shouldRead); - shouldRead = new InclusiveMetricsEvaluator(SCHEMA, notStartsWith("required", "1str1x"), true).eval(FILE_3); + shouldRead = + new InclusiveMetricsEvaluator(SCHEMA, notStartsWith("required", "1str1x"), true) + .eval(FILE_3); Assert.assertTrue("Should read: range matches", shouldRead); - shouldRead = new InclusiveMetricsEvaluator(SCHEMA, notStartsWith("required", "ff"), true).eval(FILE_4); + shouldRead = + new InclusiveMetricsEvaluator(SCHEMA, notStartsWith("required", "ff"), true).eval(FILE_4); Assert.assertTrue("Should read: range matches", shouldRead); - shouldRead = new InclusiveMetricsEvaluator(SCHEMA, notStartsWith("required", "aB"), true).eval(FILE_2); + shouldRead = + new InclusiveMetricsEvaluator(SCHEMA, notStartsWith("required", "aB"), true).eval(FILE_2); Assert.assertTrue("Should read: range matches", shouldRead); - shouldRead = new InclusiveMetricsEvaluator(SCHEMA, notStartsWith("required", "dWX"), true).eval(FILE_2); + shouldRead = + new InclusiveMetricsEvaluator(SCHEMA, notStartsWith("required", "dWX"), true).eval(FILE_2); Assert.assertTrue("Should read: range matches", shouldRead); - shouldRead = new InclusiveMetricsEvaluator(SCHEMA, notStartsWith("required", "5"), true).eval(FILE_3); + shouldRead = + new InclusiveMetricsEvaluator(SCHEMA, notStartsWith("required", "5"), true).eval(FILE_3); Assert.assertTrue("Should read: range matches", shouldRead); - shouldRead = new InclusiveMetricsEvaluator(SCHEMA, notStartsWith("required", "3str3x"), true).eval(FILE_3); + shouldRead = + new InclusiveMetricsEvaluator(SCHEMA, 
notStartsWith("required", "3str3x"), true) + .eval(FILE_3); Assert.assertTrue("Should read: range matches", shouldRead); String aboveMax = UnicodeUtil.truncateStringMax(Literal.of("イロハニホヘト"), 4).value().toString(); - shouldRead = new InclusiveMetricsEvaluator(SCHEMA, notStartsWith("required", aboveMax), true).eval(FILE_4); + shouldRead = + new InclusiveMetricsEvaluator(SCHEMA, notStartsWith("required", aboveMax), true) + .eval(FILE_4); Assert.assertTrue("Should read: range matches", shouldRead); } @Test public void testIntegerIn() { - boolean shouldRead = new InclusiveMetricsEvaluator(SCHEMA, - in("id", INT_MIN_VALUE - 25, INT_MIN_VALUE - 24)).eval(FILE); + boolean shouldRead = + new InclusiveMetricsEvaluator(SCHEMA, in("id", INT_MIN_VALUE - 25, INT_MIN_VALUE - 24)) + .eval(FILE); Assert.assertFalse("Should not read: id below lower bound (5 < 30, 6 < 30)", shouldRead); - shouldRead = new InclusiveMetricsEvaluator(SCHEMA, - in("id", INT_MIN_VALUE - 2, INT_MIN_VALUE - 1)).eval(FILE); + shouldRead = + new InclusiveMetricsEvaluator(SCHEMA, in("id", INT_MIN_VALUE - 2, INT_MIN_VALUE - 1)) + .eval(FILE); Assert.assertFalse("Should not read: id below lower bound (28 < 30, 29 < 30)", shouldRead); - shouldRead = new InclusiveMetricsEvaluator(SCHEMA, - in("id", INT_MIN_VALUE - 1, INT_MIN_VALUE)).eval(FILE); + shouldRead = + new InclusiveMetricsEvaluator(SCHEMA, in("id", INT_MIN_VALUE - 1, INT_MIN_VALUE)) + .eval(FILE); Assert.assertTrue("Should read: id equal to lower bound (30 == 30)", shouldRead); - shouldRead = new InclusiveMetricsEvaluator(SCHEMA, - in("id", INT_MAX_VALUE - 4, INT_MAX_VALUE - 3)).eval(FILE); - Assert.assertTrue("Should read: id between lower and upper bounds (30 < 75 < 79, 30 < 76 < 79)", shouldRead); + shouldRead = + new InclusiveMetricsEvaluator(SCHEMA, in("id", INT_MAX_VALUE - 4, INT_MAX_VALUE - 3)) + .eval(FILE); + Assert.assertTrue( + "Should read: id between lower and upper bounds (30 < 75 < 79, 30 < 76 < 79)", shouldRead); - shouldRead = new InclusiveMetricsEvaluator(SCHEMA, - in("id", INT_MAX_VALUE, INT_MAX_VALUE + 1)).eval(FILE); + shouldRead = + new InclusiveMetricsEvaluator(SCHEMA, in("id", INT_MAX_VALUE, INT_MAX_VALUE + 1)) + .eval(FILE); Assert.assertTrue("Should read: id equal to upper bound (79 == 79)", shouldRead); - shouldRead = new InclusiveMetricsEvaluator(SCHEMA, - in("id", INT_MAX_VALUE + 1, INT_MAX_VALUE + 2)).eval(FILE); + shouldRead = + new InclusiveMetricsEvaluator(SCHEMA, in("id", INT_MAX_VALUE + 1, INT_MAX_VALUE + 2)) + .eval(FILE); Assert.assertFalse("Should not read: id above upper bound (80 > 79, 81 > 79)", shouldRead); - shouldRead = new InclusiveMetricsEvaluator(SCHEMA, - in("id", INT_MAX_VALUE + 6, INT_MAX_VALUE + 7)).eval(FILE); + shouldRead = + new InclusiveMetricsEvaluator(SCHEMA, in("id", INT_MAX_VALUE + 6, INT_MAX_VALUE + 7)) + .eval(FILE); Assert.assertFalse("Should not read: id above upper bound (85 > 79, 86 > 79)", shouldRead); - shouldRead = new InclusiveMetricsEvaluator(SCHEMA, - in("all_nulls", "abc", "def")).eval(FILE); + shouldRead = new InclusiveMetricsEvaluator(SCHEMA, in("all_nulls", "abc", "def")).eval(FILE); Assert.assertFalse("Should skip: in on all nulls column", shouldRead); - shouldRead = new InclusiveMetricsEvaluator(SCHEMA, - in("some_nulls", "abc", "def")).eval(FILE); + shouldRead = new InclusiveMetricsEvaluator(SCHEMA, in("some_nulls", "abc", "def")).eval(FILE); Assert.assertTrue("Should read: in on some nulls column", shouldRead); - shouldRead = new InclusiveMetricsEvaluator(SCHEMA, - in("no_nulls", "abc", 
"def")).eval(FILE); + shouldRead = new InclusiveMetricsEvaluator(SCHEMA, in("no_nulls", "abc", "def")).eval(FILE); Assert.assertTrue("Should read: in on no nulls column", shouldRead); // should read as the number of elements in the in expression is too big @@ -637,45 +776,50 @@ public void testIntegerIn() { @Test public void testIntegerNotIn() { - boolean shouldRead = new InclusiveMetricsEvaluator(SCHEMA, - notIn("id", INT_MIN_VALUE - 25, INT_MIN_VALUE - 24)).eval(FILE); + boolean shouldRead = + new InclusiveMetricsEvaluator(SCHEMA, notIn("id", INT_MIN_VALUE - 25, INT_MIN_VALUE - 24)) + .eval(FILE); Assert.assertTrue("Should read: id below lower bound (5 < 30, 6 < 30)", shouldRead); - shouldRead = new InclusiveMetricsEvaluator(SCHEMA, - notIn("id", INT_MIN_VALUE - 2, INT_MIN_VALUE - 1)).eval(FILE); + shouldRead = + new InclusiveMetricsEvaluator(SCHEMA, notIn("id", INT_MIN_VALUE - 2, INT_MIN_VALUE - 1)) + .eval(FILE); Assert.assertTrue("Should read: id below lower bound (28 < 30, 29 < 30)", shouldRead); - shouldRead = new InclusiveMetricsEvaluator(SCHEMA, - notIn("id", INT_MIN_VALUE - 1, INT_MIN_VALUE)).eval(FILE); + shouldRead = + new InclusiveMetricsEvaluator(SCHEMA, notIn("id", INT_MIN_VALUE - 1, INT_MIN_VALUE)) + .eval(FILE); Assert.assertTrue("Should read: id equal to lower bound (30 == 30)", shouldRead); - shouldRead = new InclusiveMetricsEvaluator(SCHEMA, - notIn("id", INT_MAX_VALUE - 4, INT_MAX_VALUE - 3)).eval(FILE); - Assert.assertTrue("Should read: id between lower and upper bounds (30 < 75 < 79, 30 < 76 < 79)", shouldRead); + shouldRead = + new InclusiveMetricsEvaluator(SCHEMA, notIn("id", INT_MAX_VALUE - 4, INT_MAX_VALUE - 3)) + .eval(FILE); + Assert.assertTrue( + "Should read: id between lower and upper bounds (30 < 75 < 79, 30 < 76 < 79)", shouldRead); - shouldRead = new InclusiveMetricsEvaluator(SCHEMA, - notIn("id", INT_MAX_VALUE, INT_MAX_VALUE + 1)).eval(FILE); + shouldRead = + new InclusiveMetricsEvaluator(SCHEMA, notIn("id", INT_MAX_VALUE, INT_MAX_VALUE + 1)) + .eval(FILE); Assert.assertTrue("Should read: id equal to upper bound (79 == 79)", shouldRead); - shouldRead = new InclusiveMetricsEvaluator(SCHEMA, - notIn("id", INT_MAX_VALUE + 1, INT_MAX_VALUE + 2)).eval(FILE); + shouldRead = + new InclusiveMetricsEvaluator(SCHEMA, notIn("id", INT_MAX_VALUE + 1, INT_MAX_VALUE + 2)) + .eval(FILE); Assert.assertTrue("Should read: id above upper bound (80 > 79, 81 > 79)", shouldRead); - shouldRead = new InclusiveMetricsEvaluator(SCHEMA, - notIn("id", INT_MAX_VALUE + 6, INT_MAX_VALUE + 7)).eval(FILE); + shouldRead = + new InclusiveMetricsEvaluator(SCHEMA, notIn("id", INT_MAX_VALUE + 6, INT_MAX_VALUE + 7)) + .eval(FILE); Assert.assertTrue("Should read: id above upper bound (85 > 79, 86 > 79)", shouldRead); - shouldRead = new InclusiveMetricsEvaluator(SCHEMA, - notIn("all_nulls", "abc", "def")).eval(FILE); + shouldRead = new InclusiveMetricsEvaluator(SCHEMA, notIn("all_nulls", "abc", "def")).eval(FILE); Assert.assertTrue("Should read: notIn on all nulls column", shouldRead); - shouldRead = new InclusiveMetricsEvaluator(SCHEMA, - notIn("some_nulls", "abc", "def")).eval(FILE); + shouldRead = + new InclusiveMetricsEvaluator(SCHEMA, notIn("some_nulls", "abc", "def")).eval(FILE); Assert.assertTrue("Should read: notIn on some nulls column", shouldRead); - shouldRead = new InclusiveMetricsEvaluator(SCHEMA, - notIn("no_nulls", "abc", "def")).eval(FILE); + shouldRead = new InclusiveMetricsEvaluator(SCHEMA, notIn("no_nulls", "abc", "def")).eval(FILE); Assert.assertTrue("Should read: notIn on no 
nulls column", shouldRead); - } } diff --git a/api/src/test/java/org/apache/iceberg/expressions/TestLiteralSerialization.java b/api/src/test/java/org/apache/iceberg/expressions/TestLiteralSerialization.java index 1a3e6487a921..cb951764e750 100644 --- a/api/src/test/java/org/apache/iceberg/expressions/TestLiteralSerialization.java +++ b/api/src/test/java/org/apache/iceberg/expressions/TestLiteralSerialization.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.expressions; import java.math.BigDecimal; @@ -29,22 +28,23 @@ public class TestLiteralSerialization { @Test public void testLiterals() throws Exception { - Literal[] literals = new Literal[] { - Literal.of(false), - Literal.of(34), - Literal.of(35L), - Literal.of(36.75F), - Literal.of(8.75D), - Literal.of("2017-11-29").to(Types.DateType.get()), - Literal.of("11:30:07").to(Types.TimeType.get()), - Literal.of("2017-11-29T11:30:07.123").to(Types.TimestampType.withoutZone()), - Literal.of("2017-11-29T11:30:07.123+01:00").to(Types.TimestampType.withZone()), - Literal.of("abc"), - Literal.of(UUID.randomUUID()), - Literal.of(new byte[] { 1, 2, 3 }).to(Types.FixedType.ofLength(3)), - Literal.of(new byte[] { 3, 4, 5, 6 }).to(Types.BinaryType.get()), - Literal.of(new BigDecimal("122.50")), - }; + Literal[] literals = + new Literal[] { + Literal.of(false), + Literal.of(34), + Literal.of(35L), + Literal.of(36.75F), + Literal.of(8.75D), + Literal.of("2017-11-29").to(Types.DateType.get()), + Literal.of("11:30:07").to(Types.TimeType.get()), + Literal.of("2017-11-29T11:30:07.123").to(Types.TimestampType.withoutZone()), + Literal.of("2017-11-29T11:30:07.123+01:00").to(Types.TimestampType.withZone()), + Literal.of("abc"), + Literal.of(UUID.randomUUID()), + Literal.of(new byte[] {1, 2, 3}).to(Types.FixedType.ofLength(3)), + Literal.of(new byte[] {3, 4, 5, 6}).to(Types.BinaryType.get()), + Literal.of(new BigDecimal("122.50")), + }; for (Literal lit : literals) { checkValue(lit); @@ -53,9 +53,13 @@ public void testLiterals() throws Exception { private void checkValue(Literal lit) throws Exception { Literal copy = TestHelpers.roundTripSerialize(lit); - Assert.assertEquals("Literal's comparator should consider values equal", - 0, lit.comparator().compare(lit.value(), copy.value())); - Assert.assertEquals("Copy's comparator should consider values equal", - 0, copy.comparator().compare(lit.value(), copy.value())); + Assert.assertEquals( + "Literal's comparator should consider values equal", + 0, + lit.comparator().compare(lit.value(), copy.value())); + Assert.assertEquals( + "Copy's comparator should consider values equal", + 0, + copy.comparator().compare(lit.value(), copy.value())); } } diff --git a/api/src/test/java/org/apache/iceberg/expressions/TestMetricsEvaluatorsNaNHandling.java b/api/src/test/java/org/apache/iceberg/expressions/TestMetricsEvaluatorsNaNHandling.java index 415f36126a3b..43aa26ca567a 100644 --- a/api/src/test/java/org/apache/iceberg/expressions/TestMetricsEvaluatorsNaNHandling.java +++ b/api/src/test/java/org/apache/iceberg/expressions/TestMetricsEvaluatorsNaNHandling.java @@ -16,9 +16,12 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.expressions; +import static org.apache.iceberg.types.Conversions.toByteBuffer; +import static org.apache.iceberg.types.Types.NestedField.optional; +import static org.apache.iceberg.types.Types.NestedField.required; + import java.nio.ByteBuffer; import java.util.Set; import java.util.function.BiFunction; @@ -31,59 +34,55 @@ import org.junit.Assert; import org.junit.Test; -import static org.apache.iceberg.types.Conversions.toByteBuffer; -import static org.apache.iceberg.types.Types.NestedField.optional; -import static org.apache.iceberg.types.Types.NestedField.required; - /** * This test class ensures that metrics evaluators could handle NaN as upper/lower bounds correctly. */ public class TestMetricsEvaluatorsNaNHandling { - private static final Schema SCHEMA = new Schema( - required(1, "all_nan", Types.DoubleType.get()), - required(2, "max_nan", Types.DoubleType.get()), - optional(3, "min_max_nan", Types.FloatType.get()), - required(4, "all_nan_null_bounds", Types.DoubleType.get()), - optional(5, "some_nan_correct_bounds", Types.FloatType.get()) - ); - - private static final DataFile FILE = new TestHelpers.TestDataFile("file.avro", TestHelpers.Row.of(), 50, - // any value counts, including nulls - ImmutableMap.builder() - .put(1, 10L) - .put(2, 10L) - .put(3, 10L) - .put(4, 10L) - .put(5, 10L) - .build(), - // null value counts - ImmutableMap.builder() - .put(1, 0L) - .put(2, 0L) - .put(3, 0L) - .put(4, 0L) - .put(5, 0L) - .build(), - // nan value counts - ImmutableMap.builder() - .put(1, 10L) - .put(4, 10L) - .put(5, 5L) - .build(), - // lower bounds - ImmutableMap.builder() - .put(1, toByteBuffer(Types.DoubleType.get(), Double.NaN)) - .put(2, toByteBuffer(Types.DoubleType.get(), 7D)) - .put(3, toByteBuffer(Types.FloatType.get(), Float.NaN)) - .put(5, toByteBuffer(Types.FloatType.get(), 7F)) - .build(), - // upper bounds - ImmutableMap.builder() - .put(1, toByteBuffer(Types.DoubleType.get(), Double.NaN)) - .put(2, toByteBuffer(Types.DoubleType.get(), Double.NaN)) - .put(3, toByteBuffer(Types.FloatType.get(), Float.NaN)) - .put(5, toByteBuffer(Types.FloatType.get(), 22F)) - .build()); + private static final Schema SCHEMA = + new Schema( + required(1, "all_nan", Types.DoubleType.get()), + required(2, "max_nan", Types.DoubleType.get()), + optional(3, "min_max_nan", Types.FloatType.get()), + required(4, "all_nan_null_bounds", Types.DoubleType.get()), + optional(5, "some_nan_correct_bounds", Types.FloatType.get())); + + private static final DataFile FILE = + new TestHelpers.TestDataFile( + "file.avro", + TestHelpers.Row.of(), + 50, + // any value counts, including nulls + ImmutableMap.builder() + .put(1, 10L) + .put(2, 10L) + .put(3, 10L) + .put(4, 10L) + .put(5, 10L) + .build(), + // null value counts + ImmutableMap.builder() + .put(1, 0L) + .put(2, 0L) + .put(3, 0L) + .put(4, 0L) + .put(5, 0L) + .build(), + // nan value counts + ImmutableMap.builder().put(1, 10L).put(4, 10L).put(5, 5L).build(), + // lower bounds + ImmutableMap.builder() + .put(1, toByteBuffer(Types.DoubleType.get(), Double.NaN)) + .put(2, toByteBuffer(Types.DoubleType.get(), 7D)) + .put(3, toByteBuffer(Types.FloatType.get(), Float.NaN)) + .put(5, toByteBuffer(Types.FloatType.get(), 7F)) + .build(), + // upper bounds + ImmutableMap.builder() + .put(1, toByteBuffer(Types.DoubleType.get(), Double.NaN)) + .put(2, toByteBuffer(Types.DoubleType.get(), Double.NaN)) + .put(3, toByteBuffer(Types.FloatType.get(), Float.NaN)) + .put(5, toByteBuffer(Types.FloatType.get(), 22F)) + .build()); private static 
final Set> LESS_THAN_EXPRESSIONS = ImmutableSet.of(Expressions::lessThan, Expressions::lessThanOrEqual); @@ -94,7 +93,8 @@ public class TestMetricsEvaluatorsNaNHandling { @Test public void testInclusiveMetricsEvaluatorLessThanAndLessThanOrEqual() { for (BiFunction func : LESS_THAN_EXPRESSIONS) { - boolean shouldRead = new InclusiveMetricsEvaluator(SCHEMA, func.apply("all_nan", 1D)).eval(FILE); + boolean shouldRead = + new InclusiveMetricsEvaluator(SCHEMA, func.apply("all_nan", 1D)).eval(FILE); Assert.assertFalse("Should not match: all nan column doesn't contain number", shouldRead); shouldRead = new InclusiveMetricsEvaluator(SCHEMA, func.apply("max_nan", 1D)).eval(FILE); @@ -106,13 +106,18 @@ public void testInclusiveMetricsEvaluatorLessThanAndLessThanOrEqual() { shouldRead = new InclusiveMetricsEvaluator(SCHEMA, func.apply("min_max_nan", 1F)).eval(FILE); Assert.assertTrue("Should match: no visibility", shouldRead); - shouldRead = new InclusiveMetricsEvaluator(SCHEMA, func.apply("all_nan_null_bounds", 1D)).eval(FILE); + shouldRead = + new InclusiveMetricsEvaluator(SCHEMA, func.apply("all_nan_null_bounds", 1D)).eval(FILE); Assert.assertFalse("Should not match: all nan column doesn't contain number", shouldRead); - shouldRead = new InclusiveMetricsEvaluator(SCHEMA, func.apply("some_nan_correct_bounds", 1F)).eval(FILE); + shouldRead = + new InclusiveMetricsEvaluator(SCHEMA, func.apply("some_nan_correct_bounds", 1F)) + .eval(FILE); Assert.assertFalse("Should not match: 1 is smaller than lower bound", shouldRead); - shouldRead = new InclusiveMetricsEvaluator(SCHEMA, func.apply("some_nan_correct_bounds", 10F)).eval(FILE); + shouldRead = + new InclusiveMetricsEvaluator(SCHEMA, func.apply("some_nan_correct_bounds", 10F)) + .eval(FILE); Assert.assertTrue("Should match: 10 larger than lower bound", shouldRead); } } @@ -120,7 +125,8 @@ public void testInclusiveMetricsEvaluatorLessThanAndLessThanOrEqual() { @Test public void testInclusiveMetricsEvaluatorGreaterThanAndGreaterThanOrEqual() { for (BiFunction func : GREATER_THAN_EXPRESSIONS) { - boolean shouldRead = new InclusiveMetricsEvaluator(SCHEMA, func.apply("all_nan", 1D)).eval(FILE); + boolean shouldRead = + new InclusiveMetricsEvaluator(SCHEMA, func.apply("all_nan", 1D)).eval(FILE); Assert.assertFalse("Should not match: all nan column doesn't contain number", shouldRead); shouldRead = new InclusiveMetricsEvaluator(SCHEMA, func.apply("max_nan", 1D)).eval(FILE); @@ -132,118 +138,167 @@ public void testInclusiveMetricsEvaluatorGreaterThanAndGreaterThanOrEqual() { shouldRead = new InclusiveMetricsEvaluator(SCHEMA, func.apply("min_max_nan", 1F)).eval(FILE); Assert.assertTrue("Should match: no visibility", shouldRead); - shouldRead = new InclusiveMetricsEvaluator(SCHEMA, func.apply("all_nan_null_bounds", 1D)).eval(FILE); + shouldRead = + new InclusiveMetricsEvaluator(SCHEMA, func.apply("all_nan_null_bounds", 1D)).eval(FILE); Assert.assertFalse("Should not match: all nan column doesn't contain number", shouldRead); - shouldRead = new InclusiveMetricsEvaluator(SCHEMA, func.apply("some_nan_correct_bounds", 1F)).eval(FILE); + shouldRead = + new InclusiveMetricsEvaluator(SCHEMA, func.apply("some_nan_correct_bounds", 1F)) + .eval(FILE); Assert.assertTrue("Should match: 1 is smaller than upper bound", shouldRead); - shouldRead = new InclusiveMetricsEvaluator(SCHEMA, func.apply("some_nan_correct_bounds", 10F)).eval(FILE); + shouldRead = + new InclusiveMetricsEvaluator(SCHEMA, func.apply("some_nan_correct_bounds", 10F)) + .eval(FILE); Assert.assertTrue("Should 
match: 10 is smaller than upper bound", shouldRead); - shouldRead = new InclusiveMetricsEvaluator(SCHEMA, func.apply("some_nan_correct_bounds", 30)).eval(FILE); + shouldRead = + new InclusiveMetricsEvaluator(SCHEMA, func.apply("some_nan_correct_bounds", 30)) + .eval(FILE); Assert.assertFalse("Should not match: 30 is greater than upper bound", shouldRead); } } @Test public void testInclusiveMetricsEvaluatorEquals() { - boolean shouldRead = new InclusiveMetricsEvaluator( - SCHEMA, Expressions.equal("all_nan", 1D)).eval(FILE); + boolean shouldRead = + new InclusiveMetricsEvaluator(SCHEMA, Expressions.equal("all_nan", 1D)).eval(FILE); Assert.assertFalse("Should not match: all nan column doesn't contain number", shouldRead); shouldRead = new InclusiveMetricsEvaluator(SCHEMA, Expressions.equal("max_nan", 1D)).eval(FILE); Assert.assertFalse("Should not match: 1 is smaller than lower bound", shouldRead); - shouldRead = new InclusiveMetricsEvaluator(SCHEMA, Expressions.equal("max_nan", 10D)).eval(FILE); + shouldRead = + new InclusiveMetricsEvaluator(SCHEMA, Expressions.equal("max_nan", 10D)).eval(FILE); Assert.assertTrue("Should match: 10 is within bounds", shouldRead); - shouldRead = new InclusiveMetricsEvaluator(SCHEMA, Expressions.equal("min_max_nan", 1F)).eval(FILE); + shouldRead = + new InclusiveMetricsEvaluator(SCHEMA, Expressions.equal("min_max_nan", 1F)).eval(FILE); Assert.assertTrue("Should match: no visibility", shouldRead); - shouldRead = new InclusiveMetricsEvaluator( - SCHEMA, Expressions.equal("all_nan_null_bounds", 1D)).eval(FILE); + shouldRead = + new InclusiveMetricsEvaluator(SCHEMA, Expressions.equal("all_nan_null_bounds", 1D)) + .eval(FILE); Assert.assertFalse("Should not match: all nan column doesn't contain number", shouldRead); - shouldRead = new InclusiveMetricsEvaluator( - SCHEMA, Expressions.equal("some_nan_correct_bounds", 1F)).eval(FILE); + shouldRead = + new InclusiveMetricsEvaluator(SCHEMA, Expressions.equal("some_nan_correct_bounds", 1F)) + .eval(FILE); Assert.assertFalse("Should not match: 1 is smaller than lower bound", shouldRead); - shouldRead = new InclusiveMetricsEvaluator( - SCHEMA, Expressions.equal("some_nan_correct_bounds", 10F)).eval(FILE); + shouldRead = + new InclusiveMetricsEvaluator(SCHEMA, Expressions.equal("some_nan_correct_bounds", 10F)) + .eval(FILE); Assert.assertTrue("Should match: 10 is within bounds", shouldRead); - shouldRead = new InclusiveMetricsEvaluator( - SCHEMA, Expressions.equal("some_nan_correct_bounds", 30)).eval(FILE); + shouldRead = + new InclusiveMetricsEvaluator(SCHEMA, Expressions.equal("some_nan_correct_bounds", 30)) + .eval(FILE); Assert.assertFalse("Should not match: 30 is greater than upper bound", shouldRead); } @Test public void testInclusiveMetricsEvaluatorNotEquals() { - boolean shouldRead = new InclusiveMetricsEvaluator( - SCHEMA, Expressions.notEqual("all_nan", 1D)).eval(FILE); - shouldRead = shouldRead & new InclusiveMetricsEvaluator( - SCHEMA, Expressions.notEqual("max_nan", 1D)).eval(FILE); - shouldRead = shouldRead & new InclusiveMetricsEvaluator( - SCHEMA, Expressions.notEqual("max_nan", 10D)).eval(FILE); - shouldRead = shouldRead & new InclusiveMetricsEvaluator( - SCHEMA, Expressions.notEqual("min_max_nan", 1F)).eval(FILE); - shouldRead = shouldRead & new InclusiveMetricsEvaluator( - SCHEMA, Expressions.notEqual("all_nan_null_bounds", 1D)).eval(FILE); - shouldRead = shouldRead & new InclusiveMetricsEvaluator( - SCHEMA, Expressions.notEqual("some_nan_correct_bounds", 1F)).eval(FILE); - shouldRead = shouldRead & new 
InclusiveMetricsEvaluator( - SCHEMA, Expressions.notEqual("some_nan_correct_bounds", 10F)).eval(FILE); - shouldRead = shouldRead & new InclusiveMetricsEvaluator( - SCHEMA, Expressions.notEqual("some_nan_correct_bounds", 30)).eval(FILE); + boolean shouldRead = + new InclusiveMetricsEvaluator(SCHEMA, Expressions.notEqual("all_nan", 1D)).eval(FILE); + shouldRead = + shouldRead + & new InclusiveMetricsEvaluator(SCHEMA, Expressions.notEqual("max_nan", 1D)).eval(FILE); + shouldRead = + shouldRead + & new InclusiveMetricsEvaluator(SCHEMA, Expressions.notEqual("max_nan", 10D)) + .eval(FILE); + shouldRead = + shouldRead + & new InclusiveMetricsEvaluator(SCHEMA, Expressions.notEqual("min_max_nan", 1F)) + .eval(FILE); + shouldRead = + shouldRead + & new InclusiveMetricsEvaluator(SCHEMA, Expressions.notEqual("all_nan_null_bounds", 1D)) + .eval(FILE); + shouldRead = + shouldRead + & new InclusiveMetricsEvaluator( + SCHEMA, Expressions.notEqual("some_nan_correct_bounds", 1F)) + .eval(FILE); + shouldRead = + shouldRead + & new InclusiveMetricsEvaluator( + SCHEMA, Expressions.notEqual("some_nan_correct_bounds", 10F)) + .eval(FILE); + shouldRead = + shouldRead + & new InclusiveMetricsEvaluator( + SCHEMA, Expressions.notEqual("some_nan_correct_bounds", 30)) + .eval(FILE); Assert.assertTrue("Should match: no visibility", shouldRead); } @Test public void testInclusiveMetricsEvaluatorIn() { - boolean shouldRead = new InclusiveMetricsEvaluator( - SCHEMA, Expressions.in("all_nan", 1D, 10D, 30D)).eval(FILE); + boolean shouldRead = + new InclusiveMetricsEvaluator(SCHEMA, Expressions.in("all_nan", 1D, 10D, 30D)).eval(FILE); Assert.assertFalse("Should not match: all nan column doesn't contain number", shouldRead); - shouldRead = new InclusiveMetricsEvaluator( - SCHEMA, Expressions.in("max_nan", 1D, 10D, 30D)).eval(FILE); + shouldRead = + new InclusiveMetricsEvaluator(SCHEMA, Expressions.in("max_nan", 1D, 10D, 30D)).eval(FILE); Assert.assertTrue("Should match: 10 and 30 are greater than lower bound", shouldRead); - shouldRead = new InclusiveMetricsEvaluator( - SCHEMA, Expressions.in("min_max_nan", 1F, 10F, 30F)).eval(FILE); + shouldRead = + new InclusiveMetricsEvaluator(SCHEMA, Expressions.in("min_max_nan", 1F, 10F, 30F)) + .eval(FILE); Assert.assertTrue("Should match: no visibility", shouldRead); - shouldRead = new InclusiveMetricsEvaluator( - SCHEMA, Expressions.in("all_nan_null_bounds", 1D, 10D, 30D)).eval(FILE); + shouldRead = + new InclusiveMetricsEvaluator(SCHEMA, Expressions.in("all_nan_null_bounds", 1D, 10D, 30D)) + .eval(FILE); Assert.assertFalse("Should not match: all nan column doesn't contain number", shouldRead); - shouldRead = new InclusiveMetricsEvaluator( - SCHEMA, Expressions.in("some_nan_correct_bounds", 1F, 10F, 30F)).eval(FILE); + shouldRead = + new InclusiveMetricsEvaluator( + SCHEMA, Expressions.in("some_nan_correct_bounds", 1F, 10F, 30F)) + .eval(FILE); Assert.assertTrue("Should match: 10 within bounds", shouldRead); - shouldRead = new InclusiveMetricsEvaluator( - SCHEMA, Expressions.in("some_nan_correct_bounds", 1F, 30F)).eval(FILE); + shouldRead = + new InclusiveMetricsEvaluator(SCHEMA, Expressions.in("some_nan_correct_bounds", 1F, 30F)) + .eval(FILE); Assert.assertFalse("Should not match: 1 not within bounds", shouldRead); } @Test public void testInclusiveMetricsEvaluatorNotIn() { - boolean shouldRead = new InclusiveMetricsEvaluator( - SCHEMA, Expressions.notIn("all_nan", 1D)).eval(FILE); - shouldRead = shouldRead & new InclusiveMetricsEvaluator( - SCHEMA, Expressions.notIn("max_nan", 
1D)).eval(FILE); - shouldRead = shouldRead & new InclusiveMetricsEvaluator( - SCHEMA, Expressions.notIn("max_nan", 10D)).eval(FILE); - shouldRead = shouldRead & new InclusiveMetricsEvaluator( - SCHEMA, Expressions.notIn("min_max_nan", 1F)).eval(FILE); - shouldRead = shouldRead & new InclusiveMetricsEvaluator( - SCHEMA, Expressions.notIn("all_nan_null_bounds", 1D)).eval(FILE); - shouldRead = shouldRead & new InclusiveMetricsEvaluator( - SCHEMA, Expressions.notIn("some_nan_correct_bounds", 1F)).eval(FILE); - shouldRead = shouldRead & new InclusiveMetricsEvaluator( - SCHEMA, Expressions.notIn("some_nan_correct_bounds", 10F)).eval(FILE); - shouldRead = shouldRead & new InclusiveMetricsEvaluator( - SCHEMA, Expressions.notIn("some_nan_correct_bounds", 30)).eval(FILE); + boolean shouldRead = + new InclusiveMetricsEvaluator(SCHEMA, Expressions.notIn("all_nan", 1D)).eval(FILE); + shouldRead = + shouldRead + & new InclusiveMetricsEvaluator(SCHEMA, Expressions.notIn("max_nan", 1D)).eval(FILE); + shouldRead = + shouldRead + & new InclusiveMetricsEvaluator(SCHEMA, Expressions.notIn("max_nan", 10D)).eval(FILE); + shouldRead = + shouldRead + & new InclusiveMetricsEvaluator(SCHEMA, Expressions.notIn("min_max_nan", 1F)) + .eval(FILE); + shouldRead = + shouldRead + & new InclusiveMetricsEvaluator(SCHEMA, Expressions.notIn("all_nan_null_bounds", 1D)) + .eval(FILE); + shouldRead = + shouldRead + & new InclusiveMetricsEvaluator( + SCHEMA, Expressions.notIn("some_nan_correct_bounds", 1F)) + .eval(FILE); + shouldRead = + shouldRead + & new InclusiveMetricsEvaluator( + SCHEMA, Expressions.notIn("some_nan_correct_bounds", 10F)) + .eval(FILE); + shouldRead = + shouldRead + & new InclusiveMetricsEvaluator( + SCHEMA, Expressions.notIn("some_nan_correct_bounds", 30)) + .eval(FILE); Assert.assertTrue("Should match: no visibility", shouldRead); } @@ -259,10 +314,12 @@ public void testStrictMetricsEvaluatorLessThanAndLessThanOrEqual() { shouldRead = new StrictMetricsEvaluator(SCHEMA, func.apply("min_max_nan", 1F)).eval(FILE); Assert.assertFalse("Should not match: no visibility", shouldRead); - shouldRead = new StrictMetricsEvaluator(SCHEMA, func.apply("all_nan_null_bounds", 1D)).eval(FILE); + shouldRead = + new StrictMetricsEvaluator(SCHEMA, func.apply("all_nan_null_bounds", 1D)).eval(FILE); Assert.assertFalse("Should not match: all nan column doesn't contain number", shouldRead); - shouldRead = new StrictMetricsEvaluator(SCHEMA, func.apply("some_nan_correct_bounds", 30F)).eval(FILE); + shouldRead = + new StrictMetricsEvaluator(SCHEMA, func.apply("some_nan_correct_bounds", 30F)).eval(FILE); Assert.assertFalse("Should not match: nan value exists", shouldRead); } } @@ -282,120 +339,153 @@ public void testStrictMetricsEvaluatorGreaterThanAndGreaterThanOrEqual() { shouldRead = new StrictMetricsEvaluator(SCHEMA, func.apply("min_max_nan", 1F)).eval(FILE); Assert.assertFalse("Should not match: no visibility", shouldRead); - shouldRead = new StrictMetricsEvaluator(SCHEMA, func.apply("all_nan_null_bounds", 1D)).eval(FILE); + shouldRead = + new StrictMetricsEvaluator(SCHEMA, func.apply("all_nan_null_bounds", 1D)).eval(FILE); Assert.assertFalse("Should not match: all nan column doesn't contain number", shouldRead); - shouldRead = new StrictMetricsEvaluator(SCHEMA, func.apply("some_nan_correct_bounds", 30)).eval(FILE); + shouldRead = + new StrictMetricsEvaluator(SCHEMA, func.apply("some_nan_correct_bounds", 30)).eval(FILE); Assert.assertFalse("Should not match: nan value exists", shouldRead); } } @Test public void 
testStrictMetricsEvaluatorNotEquals() { - boolean shouldRead = new StrictMetricsEvaluator( - SCHEMA, Expressions.notEqual("all_nan", 1D)).eval(FILE); + boolean shouldRead = + new StrictMetricsEvaluator(SCHEMA, Expressions.notEqual("all_nan", 1D)).eval(FILE); Assert.assertTrue("Should match: all nan column doesn't contain number", shouldRead); shouldRead = new StrictMetricsEvaluator(SCHEMA, Expressions.notEqual("max_nan", 1D)).eval(FILE); Assert.assertTrue("Should match: 1 is smaller than lower bound", shouldRead); - shouldRead = new StrictMetricsEvaluator(SCHEMA, Expressions.notEqual("max_nan", 10D)).eval(FILE); + shouldRead = + new StrictMetricsEvaluator(SCHEMA, Expressions.notEqual("max_nan", 10D)).eval(FILE); Assert.assertFalse("Should not match: 10 is within bounds", shouldRead); - shouldRead = new StrictMetricsEvaluator(SCHEMA, Expressions.notEqual("min_max_nan", 1F)).eval(FILE); + shouldRead = + new StrictMetricsEvaluator(SCHEMA, Expressions.notEqual("min_max_nan", 1F)).eval(FILE); Assert.assertFalse("Should not match: no visibility", shouldRead); - shouldRead = new StrictMetricsEvaluator( - SCHEMA, Expressions.notEqual("all_nan_null_bounds", 1D)).eval(FILE); + shouldRead = + new StrictMetricsEvaluator(SCHEMA, Expressions.notEqual("all_nan_null_bounds", 1D)) + .eval(FILE); Assert.assertTrue("Should match: all nan column doesn't contain number", shouldRead); - shouldRead = new StrictMetricsEvaluator( - SCHEMA, Expressions.notEqual("some_nan_correct_bounds", 1F)).eval(FILE); + shouldRead = + new StrictMetricsEvaluator(SCHEMA, Expressions.notEqual("some_nan_correct_bounds", 1F)) + .eval(FILE); Assert.assertTrue("Should match: 1 is smaller than lower bound", shouldRead); - shouldRead = new StrictMetricsEvaluator( - SCHEMA, Expressions.notEqual("some_nan_correct_bounds", 10F)).eval(FILE); + shouldRead = + new StrictMetricsEvaluator(SCHEMA, Expressions.notEqual("some_nan_correct_bounds", 10F)) + .eval(FILE); Assert.assertFalse("Should not match: 10 is within bounds", shouldRead); - shouldRead = new StrictMetricsEvaluator( - SCHEMA, Expressions.notEqual("some_nan_correct_bounds", 30)).eval(FILE); + shouldRead = + new StrictMetricsEvaluator(SCHEMA, Expressions.notEqual("some_nan_correct_bounds", 30)) + .eval(FILE); Assert.assertTrue("Should match: 30 is greater than upper bound", shouldRead); } @Test public void testStrictMetricsEvaluatorEquals() { - boolean shouldRead = new StrictMetricsEvaluator( - SCHEMA, Expressions.equal("all_nan", 1D)).eval(FILE); - shouldRead = shouldRead | new StrictMetricsEvaluator( - SCHEMA, Expressions.equal("max_nan", 1D)).eval(FILE); - shouldRead = shouldRead | new StrictMetricsEvaluator( - SCHEMA, Expressions.equal("max_nan", 10D)).eval(FILE); - shouldRead = shouldRead | new StrictMetricsEvaluator( - SCHEMA, Expressions.equal("min_max_nan", 1F)).eval(FILE); - shouldRead = shouldRead | new StrictMetricsEvaluator( - SCHEMA, Expressions.equal("all_nan_null_bounds", 1D)).eval(FILE); - shouldRead = shouldRead | new StrictMetricsEvaluator( - SCHEMA, Expressions.equal("some_nan_correct_bounds", 1F)).eval(FILE); - shouldRead = shouldRead | new StrictMetricsEvaluator( - SCHEMA, Expressions.equal("some_nan_correct_bounds", 10F)).eval(FILE); - shouldRead = shouldRead | new StrictMetricsEvaluator( - SCHEMA, Expressions.equal("some_nan_correct_bounds", 30)).eval(FILE); + boolean shouldRead = + new StrictMetricsEvaluator(SCHEMA, Expressions.equal("all_nan", 1D)).eval(FILE); + shouldRead = + shouldRead + | new StrictMetricsEvaluator(SCHEMA, Expressions.equal("max_nan", 
1D)).eval(FILE); + shouldRead = + shouldRead + | new StrictMetricsEvaluator(SCHEMA, Expressions.equal("max_nan", 10D)).eval(FILE); + shouldRead = + shouldRead + | new StrictMetricsEvaluator(SCHEMA, Expressions.equal("min_max_nan", 1F)).eval(FILE); + shouldRead = + shouldRead + | new StrictMetricsEvaluator(SCHEMA, Expressions.equal("all_nan_null_bounds", 1D)) + .eval(FILE); + shouldRead = + shouldRead + | new StrictMetricsEvaluator(SCHEMA, Expressions.equal("some_nan_correct_bounds", 1F)) + .eval(FILE); + shouldRead = + shouldRead + | new StrictMetricsEvaluator(SCHEMA, Expressions.equal("some_nan_correct_bounds", 10F)) + .eval(FILE); + shouldRead = + shouldRead + | new StrictMetricsEvaluator(SCHEMA, Expressions.equal("some_nan_correct_bounds", 30)) + .eval(FILE); Assert.assertFalse("Should not match: bounds not equal to given value", shouldRead); } @Test public void testStrictMetricsEvaluatorNotIn() { - boolean shouldRead = new StrictMetricsEvaluator( - SCHEMA, Expressions.notIn("all_nan", 1D, 10D, 30D)).eval(FILE); + boolean shouldRead = + new StrictMetricsEvaluator(SCHEMA, Expressions.notIn("all_nan", 1D, 10D, 30D)).eval(FILE); Assert.assertTrue("Should match: all nan column doesn't contain number", shouldRead); - shouldRead = new StrictMetricsEvaluator( - SCHEMA, Expressions.notIn("max_nan", 1D, 10D, 30D)).eval(FILE); + shouldRead = + new StrictMetricsEvaluator(SCHEMA, Expressions.notIn("max_nan", 1D, 10D, 30D)).eval(FILE); Assert.assertFalse("Should not match: 10 and 30 are greater than lower bound", shouldRead); - shouldRead = new StrictMetricsEvaluator( - SCHEMA, Expressions.notIn("max_nan", 1D)).eval(FILE); + shouldRead = new StrictMetricsEvaluator(SCHEMA, Expressions.notIn("max_nan", 1D)).eval(FILE); Assert.assertTrue("Should match: 1 is less than lower bound", shouldRead); - shouldRead = new StrictMetricsEvaluator( - SCHEMA, Expressions.notIn("min_max_nan", 1F, 10F, 30F)).eval(FILE); + shouldRead = + new StrictMetricsEvaluator(SCHEMA, Expressions.notIn("min_max_nan", 1F, 10F, 30F)) + .eval(FILE); Assert.assertFalse("Should not match: no visibility", shouldRead); - shouldRead = new StrictMetricsEvaluator( - SCHEMA, Expressions.notIn("all_nan_null_bounds", 1D, 10D, 30D)).eval(FILE); + shouldRead = + new StrictMetricsEvaluator(SCHEMA, Expressions.notIn("all_nan_null_bounds", 1D, 10D, 30D)) + .eval(FILE); Assert.assertTrue("Should match: all nan column doesn't contain number", shouldRead); - shouldRead = new StrictMetricsEvaluator( - SCHEMA, Expressions.notIn("some_nan_correct_bounds", 1F, 10F, 30F)).eval(FILE); + shouldRead = + new StrictMetricsEvaluator( + SCHEMA, Expressions.notIn("some_nan_correct_bounds", 1F, 10F, 30F)) + .eval(FILE); Assert.assertFalse("Should not match: 10 within bounds", shouldRead); - shouldRead = new StrictMetricsEvaluator( - SCHEMA, Expressions.notIn("some_nan_correct_bounds", 1D)).eval(FILE); + shouldRead = + new StrictMetricsEvaluator(SCHEMA, Expressions.notIn("some_nan_correct_bounds", 1D)) + .eval(FILE); Assert.assertTrue("Should match: 1 not within bounds", shouldRead); - shouldRead = new StrictMetricsEvaluator( - SCHEMA, Expressions.notIn("some_nan_correct_bounds", 30D)).eval(FILE); + shouldRead = + new StrictMetricsEvaluator(SCHEMA, Expressions.notIn("some_nan_correct_bounds", 30D)) + .eval(FILE); Assert.assertTrue("Should match: 30 not within bounds", shouldRead); } @Test public void testStrictMetricsEvaluatorIn() { - boolean shouldRead = new StrictMetricsEvaluator( - SCHEMA, Expressions.in("all_nan", 1D)).eval(FILE); - shouldRead = shouldRead | new 
StrictMetricsEvaluator( - SCHEMA, Expressions.in("max_nan", 1D)).eval(FILE); - shouldRead = shouldRead | new StrictMetricsEvaluator( - SCHEMA, Expressions.in("max_nan", 10D)).eval(FILE); - shouldRead = shouldRead | new StrictMetricsEvaluator( - SCHEMA, Expressions.in("min_max_nan", 1F)).eval(FILE); - shouldRead = shouldRead | new StrictMetricsEvaluator( - SCHEMA, Expressions.in("all_nan_null_bounds", 1D)).eval(FILE); - shouldRead = shouldRead | new StrictMetricsEvaluator( - SCHEMA, Expressions.in("some_nan_correct_bounds", 1F)).eval(FILE); - shouldRead = shouldRead | new StrictMetricsEvaluator( - SCHEMA, Expressions.in("some_nan_correct_bounds", 10F)).eval(FILE); - shouldRead = shouldRead | new StrictMetricsEvaluator( - SCHEMA, Expressions.equal("some_nan_correct_bounds", 30)).eval(FILE); + boolean shouldRead = + new StrictMetricsEvaluator(SCHEMA, Expressions.in("all_nan", 1D)).eval(FILE); + shouldRead = + shouldRead | new StrictMetricsEvaluator(SCHEMA, Expressions.in("max_nan", 1D)).eval(FILE); + shouldRead = + shouldRead | new StrictMetricsEvaluator(SCHEMA, Expressions.in("max_nan", 10D)).eval(FILE); + shouldRead = + shouldRead + | new StrictMetricsEvaluator(SCHEMA, Expressions.in("min_max_nan", 1F)).eval(FILE); + shouldRead = + shouldRead + | new StrictMetricsEvaluator(SCHEMA, Expressions.in("all_nan_null_bounds", 1D)) + .eval(FILE); + shouldRead = + shouldRead + | new StrictMetricsEvaluator(SCHEMA, Expressions.in("some_nan_correct_bounds", 1F)) + .eval(FILE); + shouldRead = + shouldRead + | new StrictMetricsEvaluator(SCHEMA, Expressions.in("some_nan_correct_bounds", 10F)) + .eval(FILE); + shouldRead = + shouldRead + | new StrictMetricsEvaluator(SCHEMA, Expressions.equal("some_nan_correct_bounds", 30)) + .eval(FILE); Assert.assertFalse("Should not match: bounds not equal to given value", shouldRead); } } diff --git a/api/src/test/java/org/apache/iceberg/expressions/TestMiscLiteralConversions.java b/api/src/test/java/org/apache/iceberg/expressions/TestMiscLiteralConversions.java index 5ab914437976..b9edd0fa8ca7 100644 --- a/api/src/test/java/org/apache/iceberg/expressions/TestMiscLiteralConversions.java +++ b/api/src/test/java/org/apache/iceberg/expressions/TestMiscLiteralConversions.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.expressions; import java.math.BigDecimal; @@ -32,21 +31,21 @@ public class TestMiscLiteralConversions { @Test public void testIdentityConversions() { - List, Type>> pairs = Arrays.asList( - Pair.of(Literal.of(true), Types.BooleanType.get()), - Pair.of(Literal.of(34), Types.IntegerType.get()), - Pair.of(Literal.of(34L), Types.LongType.get()), - Pair.of(Literal.of(34.11F), Types.FloatType.get()), - Pair.of(Literal.of(34.55D), Types.DoubleType.get()), - Pair.of(Literal.of("34.55"), Types.DecimalType.of(9, 2)), - Pair.of(Literal.of("2017-08-18"), Types.DateType.get()), - Pair.of(Literal.of("14:21:01.919"), Types.TimeType.get()), - Pair.of(Literal.of("2017-08-18T14:21:01.919"), Types.TimestampType.withoutZone()), - Pair.of(Literal.of("abc"), Types.StringType.get()), - Pair.of(Literal.of(UUID.randomUUID()), Types.UUIDType.get()), - Pair.of(Literal.of(new byte[] {0, 1, 2}), Types.FixedType.ofLength(3)), - Pair.of(Literal.of(ByteBuffer.wrap(new byte[] {0, 1, 2})), Types.BinaryType.get()) - ); + List, Type>> pairs = + Arrays.asList( + Pair.of(Literal.of(true), Types.BooleanType.get()), + Pair.of(Literal.of(34), Types.IntegerType.get()), + Pair.of(Literal.of(34L), Types.LongType.get()), + Pair.of(Literal.of(34.11F), Types.FloatType.get()), + Pair.of(Literal.of(34.55D), Types.DoubleType.get()), + Pair.of(Literal.of("34.55"), Types.DecimalType.of(9, 2)), + Pair.of(Literal.of("2017-08-18"), Types.DateType.get()), + Pair.of(Literal.of("14:21:01.919"), Types.TimeType.get()), + Pair.of(Literal.of("2017-08-18T14:21:01.919"), Types.TimestampType.withoutZone()), + Pair.of(Literal.of("abc"), Types.StringType.get()), + Pair.of(Literal.of(UUID.randomUUID()), Types.UUIDType.get()), + Pair.of(Literal.of(new byte[] {0, 1, 2}), Types.FixedType.ofLength(3)), + Pair.of(Literal.of(ByteBuffer.wrap(new byte[] {0, 1, 2})), Types.BinaryType.get())); for (Pair, Type> pair : pairs) { Literal lit = pair.first(); @@ -56,8 +55,8 @@ public void testIdentityConversions() { Literal expected = lit.to(type); // then check that converting again to the same type results in an identical literal - Assert.assertSame("Converting twice should produce identical values", - expected, expected.to(type)); + Assert.assertSame( + "Converting twice should produce identical values", expected, expected.to(type)); } } @@ -66,12 +65,16 @@ public void testBinaryToFixed() { Literal lit = Literal.of(ByteBuffer.wrap(new byte[] {0, 1, 2})); Literal fixedLit = lit.to(Types.FixedType.ofLength(3)); Assert.assertNotNull("Should allow conversion to correct fixed length", fixedLit); - Assert.assertEquals("Conversion should not change value", - lit.value().duplicate(), fixedLit.value().duplicate()); + Assert.assertEquals( + "Conversion should not change value", + lit.value().duplicate(), + fixedLit.value().duplicate()); - Assert.assertNull("Should not allow conversion to different fixed length", + Assert.assertNull( + "Should not allow conversion to different fixed length", lit.to(Types.FixedType.ofLength(4))); - Assert.assertNull("Should not allow conversion to different fixed length", + Assert.assertNull( + "Should not allow conversion to different fixed length", lit.to(Types.FixedType.ofLength(2))); } @@ -80,13 +83,16 @@ public void testFixedToBinary() { Literal lit = Literal.of(new byte[] {0, 1, 2}); Literal binaryLit = lit.to(Types.BinaryType.get()); Assert.assertNotNull("Should allow conversion to binary", binaryLit); - Assert.assertEquals("Conversion should not change value", - lit.value().duplicate(), 
binaryLit.value().duplicate()); + Assert.assertEquals( + "Conversion should not change value", + lit.value().duplicate(), + binaryLit.value().duplicate()); } @Test public void testInvalidBooleanConversions() { - testInvalidConversions(Literal.of(true), + testInvalidConversions( + Literal.of(true), Types.IntegerType.get(), Types.LongType.get(), Types.FloatType.get(), @@ -99,13 +105,13 @@ public void testInvalidBooleanConversions() { Types.StringType.get(), Types.UUIDType.get(), Types.FixedType.ofLength(1), - Types.BinaryType.get() - ); + Types.BinaryType.get()); } @Test public void testInvalidIntegerConversions() { - testInvalidConversions(Literal.of(34), + testInvalidConversions( + Literal.of(34), Types.BooleanType.get(), Types.TimeType.get(), Types.TimestampType.withZone(), @@ -113,24 +119,24 @@ public void testInvalidIntegerConversions() { Types.StringType.get(), Types.UUIDType.get(), Types.FixedType.ofLength(1), - Types.BinaryType.get() - ); + Types.BinaryType.get()); } @Test public void testInvalidLongConversions() { - testInvalidConversions(Literal.of(34L), + testInvalidConversions( + Literal.of(34L), Types.BooleanType.get(), Types.StringType.get(), Types.UUIDType.get(), Types.FixedType.ofLength(1), - Types.BinaryType.get() - ); + Types.BinaryType.get()); } @Test public void testInvalidFloatConversions() { - testInvalidConversions(Literal.of(34.11F), + testInvalidConversions( + Literal.of(34.11F), Types.BooleanType.get(), Types.IntegerType.get(), Types.LongType.get(), @@ -141,13 +147,13 @@ public void testInvalidFloatConversions() { Types.StringType.get(), Types.UUIDType.get(), Types.FixedType.ofLength(1), - Types.BinaryType.get() - ); + Types.BinaryType.get()); } @Test public void testInvalidDoubleConversions() { - testInvalidConversions(Literal.of(34.11D), + testInvalidConversions( + Literal.of(34.11D), Types.BooleanType.get(), Types.IntegerType.get(), Types.LongType.get(), @@ -158,13 +164,13 @@ public void testInvalidDoubleConversions() { Types.StringType.get(), Types.UUIDType.get(), Types.FixedType.ofLength(1), - Types.BinaryType.get() - ); + Types.BinaryType.get()); } @Test public void testInvalidDateConversions() { - testInvalidConversions(Literal.of("2017-08-18").to(Types.DateType.get()), + testInvalidConversions( + Literal.of("2017-08-18").to(Types.DateType.get()), Types.BooleanType.get(), Types.IntegerType.get(), Types.LongType.get(), @@ -177,8 +183,7 @@ public void testInvalidDateConversions() { Types.StringType.get(), Types.UUIDType.get(), Types.FixedType.ofLength(1), - Types.BinaryType.get() - ); + Types.BinaryType.get()); } @Test @@ -197,8 +202,7 @@ public void testInvalidTimeConversions() { Types.StringType.get(), Types.UUIDType.get(), Types.FixedType.ofLength(1), - Types.BinaryType.get() - ); + Types.BinaryType.get()); } @Test @@ -215,13 +219,13 @@ public void testInvalidTimestampConversions() { Types.StringType.get(), Types.UUIDType.get(), Types.FixedType.ofLength(1), - Types.BinaryType.get() - ); + Types.BinaryType.get()); } @Test public void testInvalidDecimalConversions() { - testInvalidConversions(Literal.of(new BigDecimal("34.11")), + testInvalidConversions( + Literal.of(new BigDecimal("34.11")), Types.BooleanType.get(), Types.IntegerType.get(), Types.LongType.get(), @@ -234,28 +238,28 @@ public void testInvalidDecimalConversions() { Types.StringType.get(), Types.UUIDType.get(), Types.FixedType.ofLength(1), - Types.BinaryType.get() - ); + Types.BinaryType.get()); } @Test public void testInvalidStringConversions() { // Strings can be used for types that are 
difficult to construct, like decimal or timestamp, // but are not intended to support parsing strings to any type - testInvalidConversions(Literal.of("abc"), + testInvalidConversions( + Literal.of("abc"), Types.BooleanType.get(), Types.IntegerType.get(), Types.LongType.get(), Types.FloatType.get(), Types.DoubleType.get(), Types.FixedType.ofLength(1), - Types.BinaryType.get() - ); + Types.BinaryType.get()); } @Test public void testInvalidUUIDConversions() { - testInvalidConversions(Literal.of(UUID.randomUUID()), + testInvalidConversions( + Literal.of(UUID.randomUUID()), Types.BooleanType.get(), Types.IntegerType.get(), Types.LongType.get(), @@ -268,13 +272,13 @@ public void testInvalidUUIDConversions() { Types.DecimalType.of(9, 2), Types.StringType.get(), Types.FixedType.ofLength(1), - Types.BinaryType.get() - ); + Types.BinaryType.get()); } @Test public void testInvalidFixedConversions() { - testInvalidConversions(Literal.of(new byte[] {0, 1, 2}), + testInvalidConversions( + Literal.of(new byte[] {0, 1, 2}), Types.BooleanType.get(), Types.IntegerType.get(), Types.LongType.get(), @@ -287,13 +291,13 @@ public void testInvalidFixedConversions() { Types.DecimalType.of(9, 2), Types.StringType.get(), Types.UUIDType.get(), - Types.FixedType.ofLength(1) - ); + Types.FixedType.ofLength(1)); } @Test public void testInvalidBinaryConversions() { - testInvalidConversions(Literal.of(ByteBuffer.wrap(new byte[] {0, 1, 2})), + testInvalidConversions( + Literal.of(ByteBuffer.wrap(new byte[] {0, 1, 2})), Types.BooleanType.get(), Types.IntegerType.get(), Types.LongType.get(), @@ -306,8 +310,7 @@ public void testInvalidBinaryConversions() { Types.DecimalType.of(9, 2), Types.StringType.get(), Types.UUIDType.get(), - Types.FixedType.ofLength(1) - ); + Types.FixedType.ofLength(1)); } private void testInvalidConversions(Literal lit, Type... invalidTypes) { diff --git a/api/src/test/java/org/apache/iceberg/expressions/TestNumericLiteralConversions.java b/api/src/test/java/org/apache/iceberg/expressions/TestNumericLiteralConversions.java index 59e50c04095a..fdf3dc6690a3 100644 --- a/api/src/test/java/org/apache/iceberg/expressions/TestNumericLiteralConversions.java +++ b/api/src/test/java/org/apache/iceberg/expressions/TestNumericLiteralConversions.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.expressions; import java.math.BigDecimal; @@ -54,12 +53,14 @@ public void testIntegerToDoubleConversion() { public void testIntegerToDecimalConversion() { Literal lit = Literal.of(34); - Assert.assertEquals("Value should match", - new BigDecimal("34"), lit.to(Types.DecimalType.of(9, 0)).value()); - Assert.assertEquals("Value should match", - new BigDecimal("34.00"), lit.to(Types.DecimalType.of(9, 2)).value()); - Assert.assertEquals("Value should match", - new BigDecimal("34.0000"), lit.to(Types.DecimalType.of(9, 4)).value()); + Assert.assertEquals( + "Value should match", new BigDecimal("34"), lit.to(Types.DecimalType.of(9, 0)).value()); + Assert.assertEquals( + "Value should match", new BigDecimal("34.00"), lit.to(Types.DecimalType.of(9, 2)).value()); + Assert.assertEquals( + "Value should match", + new BigDecimal("34.0000"), + lit.to(Types.DecimalType.of(9, 4)).value()); } @Test @@ -69,10 +70,14 @@ public void testLongToIntegerConversion() { Assert.assertEquals("Value should match", 34, (int) intLit.value()); - Assert.assertEquals("Values above Integer.MAX_VALUE should be Literals.aboveMax()", - Literals.aboveMax(), Literal.of((long) Integer.MAX_VALUE + 1L).to(Types.IntegerType.get())); - Assert.assertEquals("Values below Integer.MIN_VALUE should be Literals.belowMin()", - Literals.belowMin(), Literal.of((long) Integer.MIN_VALUE - 1L).to(Types.IntegerType.get())); + Assert.assertEquals( + "Values above Integer.MAX_VALUE should be Literals.aboveMax()", + Literals.aboveMax(), + Literal.of((long) Integer.MAX_VALUE + 1L).to(Types.IntegerType.get())); + Assert.assertEquals( + "Values below Integer.MIN_VALUE should be Literals.belowMin()", + Literals.belowMin(), + Literal.of((long) Integer.MIN_VALUE - 1L).to(Types.IntegerType.get())); } @Test @@ -95,12 +100,14 @@ public void testLongToDoubleConversion() { public void testLongToDecimalConversion() { Literal lit = Literal.of(34L); - Assert.assertEquals("Value should match", - new BigDecimal("34"), lit.to(Types.DecimalType.of(9, 0)).value()); - Assert.assertEquals("Value should match", - new BigDecimal("34.00"), lit.to(Types.DecimalType.of(9, 2)).value()); - Assert.assertEquals("Value should match", - new BigDecimal("34.0000"), lit.to(Types.DecimalType.of(9, 4)).value()); + Assert.assertEquals( + "Value should match", new BigDecimal("34"), lit.to(Types.DecimalType.of(9, 0)).value()); + Assert.assertEquals( + "Value should match", new BigDecimal("34.00"), lit.to(Types.DecimalType.of(9, 2)).value()); + Assert.assertEquals( + "Value should match", + new BigDecimal("34.0000"), + lit.to(Types.DecimalType.of(9, 4)).value()); } @Test @@ -115,12 +122,16 @@ public void testFloatToDoubleConversion() { public void testFloatToDecimalConversion() { Literal lit = Literal.of(34.56F); - Assert.assertEquals("Value should round using HALF_UP", - new BigDecimal("34.6"), lit.to(Types.DecimalType.of(9, 1)).value()); - Assert.assertEquals("Value should match", - new BigDecimal("34.56"), lit.to(Types.DecimalType.of(9, 2)).value()); - Assert.assertEquals("Value should match", - new BigDecimal("34.5600"), lit.to(Types.DecimalType.of(9, 4)).value()); + Assert.assertEquals( + "Value should round using HALF_UP", + new BigDecimal("34.6"), + lit.to(Types.DecimalType.of(9, 1)).value()); + Assert.assertEquals( + "Value should match", new BigDecimal("34.56"), lit.to(Types.DecimalType.of(9, 2)).value()); + Assert.assertEquals( + "Value should match", + new BigDecimal("34.5600"), + lit.to(Types.DecimalType.of(9, 4)).value()); } @Test @@ -132,50 
+143,63 @@ public void testDoubleToFloatConversion() { // this adjusts Float.MAX_VALUE using multipliers because most integer adjustments are lost by // floating point precision. - Assert.assertEquals("Values above Float.MAX_VALUE should be Literals.aboveMax()", - Literals.aboveMax(), Literal.of(2 * ((double) Float.MAX_VALUE)).to(Types.FloatType.get())); - Assert.assertEquals("Values below Float.MIN_VALUE should be Literals.belowMin()", - Literals.belowMin(), Literal.of(-2 * ((double) Float.MAX_VALUE)).to(Types.FloatType.get())); + Assert.assertEquals( + "Values above Float.MAX_VALUE should be Literals.aboveMax()", + Literals.aboveMax(), + Literal.of(2 * ((double) Float.MAX_VALUE)).to(Types.FloatType.get())); + Assert.assertEquals( + "Values below Float.MIN_VALUE should be Literals.belowMin()", + Literals.belowMin(), + Literal.of(-2 * ((double) Float.MAX_VALUE)).to(Types.FloatType.get())); } @Test public void testDoubleToDecimalConversion() { Literal lit = Literal.of(34.56D); - Assert.assertEquals("Value should round using HALF_UP", - new BigDecimal("34.6"), lit.to(Types.DecimalType.of(9, 1)).value()); - Assert.assertEquals("Value should match", - new BigDecimal("34.56"), lit.to(Types.DecimalType.of(9, 2)).value()); - Assert.assertEquals("Value should match", - new BigDecimal("34.5600"), lit.to(Types.DecimalType.of(9, 4)).value()); + Assert.assertEquals( + "Value should round using HALF_UP", + new BigDecimal("34.6"), + lit.to(Types.DecimalType.of(9, 1)).value()); + Assert.assertEquals( + "Value should match", new BigDecimal("34.56"), lit.to(Types.DecimalType.of(9, 2)).value()); + Assert.assertEquals( + "Value should match", + new BigDecimal("34.5600"), + lit.to(Types.DecimalType.of(9, 4)).value()); } @Test public void testDecimalToDecimalConversion() { Literal lit = Literal.of(new BigDecimal("34.11")); - IntStream.range(0, 10).forEach(scale -> { - Assert.assertSame("Should return identical object", - lit, lit.to(Types.DecimalType.of(9, scale))); - Assert.assertSame("Should return identical object", - lit, lit.to(Types.DecimalType.of(11, scale))); - }); + IntStream.range(0, 10) + .forEach( + scale -> { + Assert.assertSame( + "Should return identical object", lit, lit.to(Types.DecimalType.of(9, scale))); + Assert.assertSame( + "Should return identical object", lit, lit.to(Types.DecimalType.of(11, scale))); + }); } @Test public void testIntegerToDateConversion() { Literal lit = Literal.of(0); - Assert.assertEquals("Dates should be equal", lit.to(Types.DateType.get()), new Literals.DateLiteral(0)); + Assert.assertEquals( + "Dates should be equal", lit.to(Types.DateType.get()), new Literals.DateLiteral(0)); lit = Literal.of(365 * 50); - Assert.assertEquals("Dates should be equal", lit.to(Types.DateType.get()), new Literals.DateLiteral(365 * 50)); + Assert.assertEquals( + "Dates should be equal", lit.to(Types.DateType.get()), new Literals.DateLiteral(365 * 50)); } @Test public void testLongToDateConversion() { Literal lit = Literal.of(0L); - Assert.assertEquals("Dates should be equal", lit.to(Types.DateType.get()), new Literals.DateLiteral(0)); + Assert.assertEquals( + "Dates should be equal", lit.to(Types.DateType.get()), new Literals.DateLiteral(0)); lit = Literal.of(365L * 50); - Assert.assertEquals("Dates should be equal", lit.to(Types.DateType.get()), new Literals.DateLiteral(365 * 50)); + Assert.assertEquals( + "Dates should be equal", lit.to(Types.DateType.get()), new Literals.DateLiteral(365 * 50)); } - } diff --git 
a/api/src/test/java/org/apache/iceberg/expressions/TestPredicateBinding.java b/api/src/test/java/org/apache/iceberg/expressions/TestPredicateBinding.java index 765d832efb41..6c7ed5839832 100644 --- a/api/src/test/java/org/apache/iceberg/expressions/TestPredicateBinding.java +++ b/api/src/test/java/org/apache/iceberg/expressions/TestPredicateBinding.java @@ -16,20 +16,8 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.expressions; -import java.math.BigDecimal; -import java.util.Arrays; -import java.util.List; -import java.util.stream.Collectors; -import org.apache.iceberg.AssertHelpers; -import org.apache.iceberg.exceptions.ValidationException; -import org.apache.iceberg.types.Types; -import org.apache.iceberg.types.Types.StructType; -import org.junit.Assert; -import org.junit.Test; - import static org.apache.iceberg.TestHelpers.assertAndUnwrap; import static org.apache.iceberg.TestHelpers.assertAndUnwrapBoundSet; import static org.apache.iceberg.expressions.Expression.Operation.EQ; @@ -50,18 +38,29 @@ import static org.apache.iceberg.types.Types.NestedField.optional; import static org.apache.iceberg.types.Types.NestedField.required; +import java.math.BigDecimal; +import java.util.Arrays; +import java.util.List; +import java.util.stream.Collectors; +import org.apache.iceberg.AssertHelpers; +import org.apache.iceberg.exceptions.ValidationException; +import org.apache.iceberg.types.Types; +import org.apache.iceberg.types.Types.StructType; +import org.junit.Assert; +import org.junit.Test; + public class TestPredicateBinding { - private static final List COMPARISONS = Arrays.asList( - LT, LT_EQ, GT, GT_EQ, EQ, NOT_EQ); + private static final List COMPARISONS = + Arrays.asList(LT, LT_EQ, GT, GT_EQ, EQ, NOT_EQ); @Test @SuppressWarnings("unchecked") public void testMultipleFields() { - StructType struct = StructType.of( - required(10, "x", Types.IntegerType.get()), - required(11, "y", Types.IntegerType.get()), - required(12, "z", Types.IntegerType.get()) - ); + StructType struct = + StructType.of( + required(10, "x", Types.IntegerType.get()), + required(11, "y", Types.IntegerType.get()), + required(12, "z", Types.IntegerType.get())); UnboundPredicate unbound = new UnboundPredicate<>(LT, ref("y"), 6); @@ -71,22 +70,23 @@ public void testMultipleFields() { Assert.assertEquals("Should reference correct field ID", 11, bound.ref().fieldId()); Assert.assertEquals("Should not change the comparison operation", LT, bound.op()); Assert.assertTrue("Should be a literal predicate", bound.isLiteralPredicate()); - Assert.assertEquals("Should not alter literal value", - Integer.valueOf(6), bound.asLiteralPredicate().literal().value()); + Assert.assertEquals( + "Should not alter literal value", + Integer.valueOf(6), + bound.asLiteralPredicate().literal().value()); } @Test public void testMissingField() { - StructType struct = StructType.of( - required(13, "x", Types.IntegerType.get()) - ); + StructType struct = StructType.of(required(13, "x", Types.IntegerType.get())); UnboundPredicate unbound = new UnboundPredicate<>(LT, ref("missing"), 6); try { unbound.bind(struct); Assert.fail("Binding a missing field should fail"); } catch (ValidationException e) { - Assert.assertTrue("Validation should complain about missing field", + Assert.assertTrue( + "Validation should complain about missing field", e.getMessage().contains("Cannot find field 'missing' in struct:")); } } @@ -103,8 +103,10 @@ public void testComparisonPredicateBinding() { BoundPredicate 
bound = assertAndUnwrap(expr); Assert.assertTrue("Should be a literal predicate", bound.isLiteralPredicate()); - Assert.assertEquals("Should not alter literal value", - Integer.valueOf(5), bound.asLiteralPredicate().literal().value()); + Assert.assertEquals( + "Should not alter literal value", + Integer.valueOf(5), + bound.asLiteralPredicate().literal().value()); Assert.assertEquals("Should reference correct field ID", 14, bound.ref().fieldId()); Assert.assertEquals("Should not change the comparison operation", op, bound.op()); } @@ -122,8 +124,8 @@ public void testPredicateBindingForStringPrefixComparisons() { BoundPredicate bound = assertAndUnwrap(expr); Assert.assertTrue("Should be a literal predicate", bound.isLiteralPredicate()); - Assert.assertEquals("Should not alter literal value", - "s", bound.asLiteralPredicate().literal().value()); + Assert.assertEquals( + "Should not alter literal value", "s", bound.asLiteralPredicate().literal().value()); Assert.assertEquals("Should reference correct field ID", 17, bound.ref().fieldId()); Assert.assertEquals("Should not change the comparison operation", op, bound.op()); } @@ -140,8 +142,10 @@ public void testLiteralConversion() { Expression expr = unbound.bind(struct); BoundPredicate bound = assertAndUnwrap(expr); Assert.assertTrue("Should be a literal predicate", bound.isLiteralPredicate()); - Assert.assertEquals("Should convert literal value to decimal", - new BigDecimal("12.40"), bound.asLiteralPredicate().literal().value()); + Assert.assertEquals( + "Should convert literal value to decimal", + new BigDecimal("12.40"), + bound.asLiteralPredicate().literal().value()); Assert.assertEquals("Should reference correct field ID", 15, bound.ref().fieldId()); Assert.assertEquals("Should not change the comparison operation", op, bound.op()); } @@ -158,7 +162,8 @@ public void testInvalidConversions() { unbound.bind(struct); Assert.fail("Should not convert string to float"); } catch (ValidationException e) { - Assert.assertEquals("Should ", + Assert.assertEquals( + "Should ", e.getMessage(), "Invalid value for conversion to type float: 12.40 (java.lang.String)"); } @@ -170,71 +175,88 @@ public void testInvalidConversions() { public void testLongToIntegerConversion() { StructType struct = StructType.of(required(17, "i", Types.IntegerType.get())); - UnboundPredicate lt = new UnboundPredicate<>( - LT, ref("i"), (long) Integer.MAX_VALUE + 1L); - Assert.assertEquals("Less than above max should be alwaysTrue", - Expressions.alwaysTrue(), lt.bind(struct)); - - UnboundPredicate lteq = new UnboundPredicate<>( - LT_EQ, ref("i"), (long) Integer.MAX_VALUE + 1L); - Assert.assertEquals("Less than or equal above max should be alwaysTrue", - Expressions.alwaysTrue(), lteq.bind(struct)); - - UnboundPredicate gt = new UnboundPredicate<>( - GT, ref("i"), (long) Integer.MIN_VALUE - 1L); - Assert.assertEquals("Greater than below min should be alwaysTrue", - Expressions.alwaysTrue(), gt.bind(struct)); - - UnboundPredicate gteq = new UnboundPredicate<>( - GT_EQ, ref("i"), (long) Integer.MIN_VALUE - 1L); - Assert.assertEquals("Greater than or equal below min should be alwaysTrue", - Expressions.alwaysTrue(), gteq.bind(struct)); - - UnboundPredicate gtMax = new UnboundPredicate<>( - GT, ref("i"), (long) Integer.MAX_VALUE + 1L); - Assert.assertEquals("Greater than above max should be alwaysFalse", - Expressions.alwaysFalse(), gtMax.bind(struct)); - - UnboundPredicate gteqMax = new UnboundPredicate<>( - GT_EQ, ref("i"), (long) Integer.MAX_VALUE + 1L); - 
Assert.assertEquals("Greater than or equal above max should be alwaysFalse", - Expressions.alwaysFalse(), gteqMax.bind(struct)); - - UnboundPredicate ltMin = new UnboundPredicate<>( - LT, ref("i"), (long) Integer.MIN_VALUE - 1L); - Assert.assertEquals("Less than below min should be alwaysFalse", - Expressions.alwaysFalse(), ltMin.bind(struct)); - - UnboundPredicate lteqMin = new UnboundPredicate<>( - LT_EQ, ref("i"), (long) Integer.MIN_VALUE - 1L); - Assert.assertEquals("Less than or equal below min should be alwaysFalse", - Expressions.alwaysFalse(), lteqMin.bind(struct)); - - Expression ltExpr = new UnboundPredicate<>(LT, ref("i"), (long) Integer.MAX_VALUE).bind(struct, true); + UnboundPredicate lt = new UnboundPredicate<>(LT, ref("i"), (long) Integer.MAX_VALUE + 1L); + Assert.assertEquals( + "Less than above max should be alwaysTrue", Expressions.alwaysTrue(), lt.bind(struct)); + + UnboundPredicate lteq = + new UnboundPredicate<>(LT_EQ, ref("i"), (long) Integer.MAX_VALUE + 1L); + Assert.assertEquals( + "Less than or equal above max should be alwaysTrue", + Expressions.alwaysTrue(), + lteq.bind(struct)); + + UnboundPredicate gt = new UnboundPredicate<>(GT, ref("i"), (long) Integer.MIN_VALUE - 1L); + Assert.assertEquals( + "Greater than below min should be alwaysTrue", Expressions.alwaysTrue(), gt.bind(struct)); + + UnboundPredicate gteq = + new UnboundPredicate<>(GT_EQ, ref("i"), (long) Integer.MIN_VALUE - 1L); + Assert.assertEquals( + "Greater than or equal below min should be alwaysTrue", + Expressions.alwaysTrue(), + gteq.bind(struct)); + + UnboundPredicate gtMax = + new UnboundPredicate<>(GT, ref("i"), (long) Integer.MAX_VALUE + 1L); + Assert.assertEquals( + "Greater than above max should be alwaysFalse", + Expressions.alwaysFalse(), + gtMax.bind(struct)); + + UnboundPredicate gteqMax = + new UnboundPredicate<>(GT_EQ, ref("i"), (long) Integer.MAX_VALUE + 1L); + Assert.assertEquals( + "Greater than or equal above max should be alwaysFalse", + Expressions.alwaysFalse(), + gteqMax.bind(struct)); + + UnboundPredicate ltMin = + new UnboundPredicate<>(LT, ref("i"), (long) Integer.MIN_VALUE - 1L); + Assert.assertEquals( + "Less than below min should be alwaysFalse", Expressions.alwaysFalse(), ltMin.bind(struct)); + + UnboundPredicate lteqMin = + new UnboundPredicate<>(LT_EQ, ref("i"), (long) Integer.MIN_VALUE - 1L); + Assert.assertEquals( + "Less than or equal below min should be alwaysFalse", + Expressions.alwaysFalse(), + lteqMin.bind(struct)); + + Expression ltExpr = + new UnboundPredicate<>(LT, ref("i"), (long) Integer.MAX_VALUE).bind(struct, true); BoundPredicate ltMax = assertAndUnwrap(ltExpr); Assert.assertTrue("Should be a literal predicate", ltMax.isLiteralPredicate()); - Assert.assertEquals("Should translate bound to Integer", - (Integer) Integer.MAX_VALUE, ltMax.asLiteralPredicate().literal().value()); + Assert.assertEquals( + "Should translate bound to Integer", + (Integer) Integer.MAX_VALUE, + ltMax.asLiteralPredicate().literal().value()); - Expression lteqExpr = new UnboundPredicate<>(LT_EQ, ref("i"), (long) Integer.MAX_VALUE) - .bind(struct); + Expression lteqExpr = + new UnboundPredicate<>(LT_EQ, ref("i"), (long) Integer.MAX_VALUE).bind(struct); BoundPredicate lteqMax = assertAndUnwrap(lteqExpr); Assert.assertTrue("Should be a literal predicate", lteqMax.isLiteralPredicate()); - Assert.assertEquals("Should translate bound to Integer", - (Integer) Integer.MAX_VALUE, lteqMax.asLiteralPredicate().literal().value()); + Assert.assertEquals( + "Should translate bound to Integer", 
+ (Integer) Integer.MAX_VALUE, + lteqMax.asLiteralPredicate().literal().value()); Expression gtExpr = new UnboundPredicate<>(GT, ref("i"), (long) Integer.MIN_VALUE).bind(struct); BoundPredicate gtMin = assertAndUnwrap(gtExpr); Assert.assertTrue("Should be a literal predicate", gtMin.isLiteralPredicate()); - Assert.assertEquals("Should translate bound to Integer", - (Integer) Integer.MIN_VALUE, gtMin.asLiteralPredicate().literal().value()); + Assert.assertEquals( + "Should translate bound to Integer", + (Integer) Integer.MIN_VALUE, + gtMin.asLiteralPredicate().literal().value()); - Expression gteqExpr = new UnboundPredicate<>(GT_EQ, ref("i"), (long) Integer.MIN_VALUE) - .bind(struct); + Expression gteqExpr = + new UnboundPredicate<>(GT_EQ, ref("i"), (long) Integer.MIN_VALUE).bind(struct); BoundPredicate gteqMin = assertAndUnwrap(gteqExpr); Assert.assertTrue("Should be a literal predicate", gteqMin.isLiteralPredicate()); - Assert.assertEquals("Should translate bound to Integer", - (Integer) Integer.MIN_VALUE, gteqMin.asLiteralPredicate().literal().value()); + Assert.assertEquals( + "Should translate bound to Integer", + (Integer) Integer.MIN_VALUE, + gteqMin.asLiteralPredicate().literal().value()); } @Test @@ -242,71 +264,90 @@ public void testLongToIntegerConversion() { public void testDoubleToFloatConversion() { StructType struct = StructType.of(required(18, "f", Types.FloatType.get())); - UnboundPredicate lt = new UnboundPredicate<>( - LT, ref("f"), (double) Float.MAX_VALUE * 2); - Assert.assertEquals("Less than above max should be alwaysTrue", - Expressions.alwaysTrue(), lt.bind(struct)); - - UnboundPredicate lteq = new UnboundPredicate<>( - LT_EQ, ref("f"), (double) Float.MAX_VALUE * 2); - Assert.assertEquals("Less than or equal above max should be alwaysTrue", - Expressions.alwaysTrue(), lteq.bind(struct)); - - UnboundPredicate gt = new UnboundPredicate<>( - GT, ref("f"), (double) Float.MAX_VALUE * -2); - Assert.assertEquals("Greater than below min should be alwaysTrue", - Expressions.alwaysTrue(), gt.bind(struct)); - - UnboundPredicate gteq = new UnboundPredicate<>( - GT_EQ, ref("f"), (double) Float.MAX_VALUE * -2); - Assert.assertEquals("Greater than or equal below min should be alwaysTrue", - Expressions.alwaysTrue(), gteq.bind(struct)); - - UnboundPredicate gtMax = new UnboundPredicate<>( - GT, ref("f"), (double) Float.MAX_VALUE * 2); - Assert.assertEquals("Greater than above max should be alwaysFalse", - Expressions.alwaysFalse(), gtMax.bind(struct)); - - UnboundPredicate gteqMax = new UnboundPredicate<>( - GT_EQ, ref("f"), (double) Float.MAX_VALUE * 2); - Assert.assertEquals("Greater than or equal above max should be alwaysFalse", - Expressions.alwaysFalse(), gteqMax.bind(struct)); - - UnboundPredicate ltMin = new UnboundPredicate<>( - LT, ref("f"), (double) Float.MAX_VALUE * -2); - Assert.assertEquals("Less than below min should be alwaysFalse", - Expressions.alwaysFalse(), ltMin.bind(struct)); - - UnboundPredicate lteqMin = new UnboundPredicate<>( - LT_EQ, ref("f"), (double) Float.MAX_VALUE * -2); - Assert.assertEquals("Less than or equal below min should be alwaysFalse", - Expressions.alwaysFalse(), lteqMin.bind(struct)); + UnboundPredicate lt = + new UnboundPredicate<>(LT, ref("f"), (double) Float.MAX_VALUE * 2); + Assert.assertEquals( + "Less than above max should be alwaysTrue", Expressions.alwaysTrue(), lt.bind(struct)); + + UnboundPredicate lteq = + new UnboundPredicate<>(LT_EQ, ref("f"), (double) Float.MAX_VALUE * 2); + Assert.assertEquals( + "Less than or equal above 
max should be alwaysTrue", + Expressions.alwaysTrue(), + lteq.bind(struct)); + + UnboundPredicate gt = + new UnboundPredicate<>(GT, ref("f"), (double) Float.MAX_VALUE * -2); + Assert.assertEquals( + "Greater than below min should be alwaysTrue", Expressions.alwaysTrue(), gt.bind(struct)); + + UnboundPredicate gteq = + new UnboundPredicate<>(GT_EQ, ref("f"), (double) Float.MAX_VALUE * -2); + Assert.assertEquals( + "Greater than or equal below min should be alwaysTrue", + Expressions.alwaysTrue(), + gteq.bind(struct)); + + UnboundPredicate gtMax = + new UnboundPredicate<>(GT, ref("f"), (double) Float.MAX_VALUE * 2); + Assert.assertEquals( + "Greater than above max should be alwaysFalse", + Expressions.alwaysFalse(), + gtMax.bind(struct)); + + UnboundPredicate gteqMax = + new UnboundPredicate<>(GT_EQ, ref("f"), (double) Float.MAX_VALUE * 2); + Assert.assertEquals( + "Greater than or equal above max should be alwaysFalse", + Expressions.alwaysFalse(), + gteqMax.bind(struct)); + + UnboundPredicate ltMin = + new UnboundPredicate<>(LT, ref("f"), (double) Float.MAX_VALUE * -2); + Assert.assertEquals( + "Less than below min should be alwaysFalse", Expressions.alwaysFalse(), ltMin.bind(struct)); + + UnboundPredicate lteqMin = + new UnboundPredicate<>(LT_EQ, ref("f"), (double) Float.MAX_VALUE * -2); + Assert.assertEquals( + "Less than or equal below min should be alwaysFalse", + Expressions.alwaysFalse(), + lteqMin.bind(struct)); Expression ltExpr = new UnboundPredicate<>(LT, ref("f"), (double) Float.MAX_VALUE).bind(struct); BoundPredicate ltMax = assertAndUnwrap(ltExpr); Assert.assertTrue("Should be a literal predicate", ltMax.isLiteralPredicate()); - Assert.assertEquals("Should translate bound to Float", - (Float) Float.MAX_VALUE, ltMax.asLiteralPredicate().literal().value()); + Assert.assertEquals( + "Should translate bound to Float", + (Float) Float.MAX_VALUE, + ltMax.asLiteralPredicate().literal().value()); - Expression lteqExpr = new UnboundPredicate<>(LT_EQ, ref("f"), (double) Float.MAX_VALUE) - .bind(struct); + Expression lteqExpr = + new UnboundPredicate<>(LT_EQ, ref("f"), (double) Float.MAX_VALUE).bind(struct); BoundPredicate lteqMax = assertAndUnwrap(lteqExpr); Assert.assertTrue("Should be a literal predicate", lteqMax.isLiteralPredicate()); - Assert.assertEquals("Should translate bound to Float", - (Float) Float.MAX_VALUE, lteqMax.asLiteralPredicate().literal().value()); + Assert.assertEquals( + "Should translate bound to Float", + (Float) Float.MAX_VALUE, + lteqMax.asLiteralPredicate().literal().value()); - Expression gtExpr = new UnboundPredicate<>(GT, ref("f"), (double) -Float.MAX_VALUE).bind(struct); + Expression gtExpr = + new UnboundPredicate<>(GT, ref("f"), (double) -Float.MAX_VALUE).bind(struct); BoundPredicate gtMin = assertAndUnwrap(gtExpr); Assert.assertTrue("Should be a literal predicate", gtMin.isLiteralPredicate()); - Assert.assertEquals("Should translate bound to Float", - Float.valueOf(-Float.MAX_VALUE), gtMin.asLiteralPredicate().literal().value()); + Assert.assertEquals( + "Should translate bound to Float", + Float.valueOf(-Float.MAX_VALUE), + gtMin.asLiteralPredicate().literal().value()); - Expression gteqExpr = new UnboundPredicate<>(GT_EQ, ref("f"), (double) -Float.MAX_VALUE) - .bind(struct); + Expression gteqExpr = + new UnboundPredicate<>(GT_EQ, ref("f"), (double) -Float.MAX_VALUE).bind(struct); BoundPredicate gteqMin = assertAndUnwrap(gteqExpr); Assert.assertTrue("Should be a literal predicate", gteqMin.isLiteralPredicate()); - Assert.assertEquals("Should translate 
bound to Float", - Float.valueOf(-Float.MAX_VALUE), gteqMin.asLiteralPredicate().literal().value()); + Assert.assertEquals( + "Should translate bound to Float", + Float.valueOf(-Float.MAX_VALUE), + gteqMin.asLiteralPredicate().literal().value()); } @Test @@ -322,8 +363,10 @@ public void testIsNull() { Assert.assertTrue("Should be a unary predicate", bound.isUnaryPredicate()); StructType required = StructType.of(required(20, "s", Types.StringType.get())); - Assert.assertEquals("IsNull inclusive a required field should be alwaysFalse", - Expressions.alwaysFalse(), unbound.bind(required)); + Assert.assertEquals( + "IsNull inclusive a required field should be alwaysFalse", + Expressions.alwaysFalse(), + unbound.bind(required)); } @Test @@ -338,8 +381,10 @@ public void testNotNull() { Assert.assertTrue("Should be a unary predicate", bound.isUnaryPredicate()); StructType required = StructType.of(required(22, "s", Types.StringType.get())); - Assert.assertEquals("NotNull inclusive a required field should be alwaysTrue", - Expressions.alwaysTrue(), unbound.bind(required)); + Assert.assertEquals( + "NotNull inclusive a required field should be alwaysTrue", + Expressions.alwaysTrue(), + unbound.bind(required)); } @Test @@ -366,8 +411,10 @@ public void testIsNaN() { // string (non-compatible) StructType strStruct = StructType.of(optional(21, "s", Types.StringType.get())); - AssertHelpers.assertThrows("Should complain about incompatible type binding", - ValidationException.class, "IsNaN cannot be used with a non-floating-point column", + AssertHelpers.assertThrows( + "Should complain about incompatible type binding", + ValidationException.class, + "IsNaN cannot be used with a non-floating-point column", () -> new UnboundPredicate<>(IS_NAN, ref("s")).bind(strStruct)); } @@ -395,19 +442,20 @@ public void testNotNaN() { // string (non-compatible) StructType strStruct = StructType.of(optional(21, "s", Types.StringType.get())); - AssertHelpers.assertThrows("Should complain about incompatible type binding", - ValidationException.class, "NotNaN cannot be used with a non-floating-point column", + AssertHelpers.assertThrows( + "Should complain about incompatible type binding", + ValidationException.class, + "NotNaN cannot be used with a non-floating-point column", () -> new UnboundPredicate<>(NOT_NAN, ref("s")).bind(strStruct)); - } @Test public void testInPredicateBinding() { - StructType struct = StructType.of( - required(10, "x", Types.IntegerType.get()), - required(11, "y", Types.IntegerType.get()), - required(12, "z", Types.IntegerType.get()) - ); + StructType struct = + StructType.of( + required(10, "x", Types.IntegerType.get()), + required(11, "y", Types.IntegerType.get()), + required(12, "z", Types.IntegerType.get())); UnboundPredicate unbound = Expressions.in("y", 6, 7, 11); @@ -416,10 +464,10 @@ public void testInPredicateBinding() { Assert.assertEquals("Should reference correct field ID", 11, bound.ref().fieldId()); Assert.assertEquals("Should not change the IN operation", IN, bound.op()); - Assert.assertArrayEquals("Should not alter literal set values", - new Integer[]{6, 7, 11}, - bound.literalSet().stream().sorted() - .collect(Collectors.toList()).toArray(new Integer[2])); + Assert.assertArrayEquals( + "Should not alter literal set values", + new Integer[] {6, 7, 11}, + bound.literalSet().stream().sorted().collect(Collectors.toList()).toArray(new Integer[2])); } @Test @@ -428,10 +476,13 @@ public void testInPredicateBindingConversion() { UnboundPredicate unbound = Expressions.in("d", "12.40", 
"1.23", "99.99", "1.23"); Expression expr = unbound.bind(struct); BoundSetPredicate bound = assertAndUnwrapBoundSet(expr); - Assert.assertArrayEquals("Should convert literal set values to decimal", - new BigDecimal[]{new BigDecimal("1.23"), new BigDecimal("12.40"), new BigDecimal("99.99")}, - bound.literalSet().stream().sorted() - .collect(Collectors.toList()).toArray(new BigDecimal[2])); + Assert.assertArrayEquals( + "Should convert literal set values to decimal", + new BigDecimal[] {new BigDecimal("1.23"), new BigDecimal("12.40"), new BigDecimal("99.99")}, + bound.literalSet().stream() + .sorted() + .collect(Collectors.toList()) + .toArray(new BigDecimal[2])); Assert.assertEquals("Should reference correct field ID", 15, bound.ref().fieldId()); Assert.assertEquals("Should not change the IN operation", IN, bound.op()); } @@ -443,15 +494,17 @@ public void testInToEqPredicate() { UnboundPredicate unbound = Expressions.in("x", 5); Assert.assertEquals("Should create an IN predicate with a single item", IN, unbound.op()); - Assert.assertEquals("Should create an IN predicate with a single item", - 1, unbound.literals().size()); + Assert.assertEquals( + "Should create an IN predicate with a single item", 1, unbound.literals().size()); Expression expr = unbound.bind(struct); BoundPredicate bound = assertAndUnwrap(expr); Assert.assertTrue("Should be a literal predicate", bound.isLiteralPredicate()); - Assert.assertEquals("Should not alter literal value", - Integer.valueOf(5), bound.asLiteralPredicate().literal().value()); + Assert.assertEquals( + "Should not alter literal value", + Integer.valueOf(5), + bound.asLiteralPredicate().literal().value()); Assert.assertEquals("Should reference correct field ID", 14, bound.ref().fieldId()); Assert.assertEquals("Should change the operation from IN to EQ", EQ, bound.op()); } @@ -469,13 +522,14 @@ public void testInPredicateBindingConversionToEq() { BoundPredicate bound = assertAndUnwrap(expr); Assert.assertTrue("Should be a literal predicate", bound.isLiteralPredicate()); - Assert.assertEquals("Should remove aboveMax literal value", - Integer.valueOf(5), bound.asLiteralPredicate().literal().value()); + Assert.assertEquals( + "Should remove aboveMax literal value", + Integer.valueOf(5), + bound.asLiteralPredicate().literal().value()); Assert.assertEquals("Should reference correct field ID", 14, bound.ref().fieldId()); Assert.assertEquals("Should change the IN operation to EQ", EQ, bound.op()); } - @Test public void testInPredicateBindingConversionDedupToEq() { StructType struct = StructType.of(required(15, "d", Types.DecimalType.of(9, 2))); @@ -485,8 +539,10 @@ public void testInPredicateBindingConversionDedupToEq() { Expression expr = unbound.bind(struct); BoundPredicate bound = assertAndUnwrap(expr); Assert.assertTrue("Should be a literal predicate", bound.isLiteralPredicate()); - Assert.assertEquals("Should convert literal set values to a single decimal", - new BigDecimal("12.40"), bound.asLiteralPredicate().literal().value()); + Assert.assertEquals( + "Should convert literal set values to a single decimal", + new BigDecimal("12.40"), + bound.asLiteralPredicate().literal().value()); Assert.assertEquals("Should reference correct field ID", 15, bound.ref().fieldId()); Assert.assertEquals("Should change the IN operation to EQ", EQ, bound.op()); } @@ -501,16 +557,17 @@ public void testInPredicateBindingConversionToExpression() { Assert.assertEquals("Should create an IN predicate", IN, op); Expression expr = unbound.bind(struct); - Assert.assertEquals("Should 
change IN to alwaysFalse expression", Expressions.alwaysFalse(), expr); + Assert.assertEquals( + "Should change IN to alwaysFalse expression", Expressions.alwaysFalse(), expr); } @Test public void testNotInPredicateBinding() { - StructType struct = StructType.of( - required(10, "x", Types.IntegerType.get()), - required(11, "y", Types.IntegerType.get()), - required(12, "z", Types.IntegerType.get()) - ); + StructType struct = + StructType.of( + required(10, "x", Types.IntegerType.get()), + required(11, "y", Types.IntegerType.get()), + required(12, "z", Types.IntegerType.get())); UnboundPredicate unbound = Expressions.notIn("y", 6, 7, 11); @@ -519,10 +576,10 @@ public void testNotInPredicateBinding() { Assert.assertEquals("Should reference correct field ID", 11, bound.ref().fieldId()); Assert.assertEquals("Should not change the NOT_IN operation", NOT_IN, bound.op()); - Assert.assertArrayEquals("Should not alter literal set values", - new Integer[]{6, 7, 11}, - bound.literalSet().stream().sorted() - .collect(Collectors.toList()).toArray(new Integer[2])); + Assert.assertArrayEquals( + "Should not alter literal set values", + new Integer[] {6, 7, 11}, + bound.literalSet().stream().sorted().collect(Collectors.toList()).toArray(new Integer[2])); } @Test @@ -531,10 +588,13 @@ public void testNotInPredicateBindingConversion() { UnboundPredicate unbound = Expressions.notIn("d", "12.40", "1.23", "99.99", "1.23"); Expression expr = unbound.bind(struct); BoundSetPredicate bound = assertAndUnwrapBoundSet(expr); - Assert.assertArrayEquals("Should convert literal set values to decimal", - new BigDecimal[]{new BigDecimal("1.23"), new BigDecimal("12.40"), new BigDecimal("99.99")}, - bound.literalSet().stream().sorted() - .collect(Collectors.toList()).toArray(new BigDecimal[2])); + Assert.assertArrayEquals( + "Should convert literal set values to decimal", + new BigDecimal[] {new BigDecimal("1.23"), new BigDecimal("12.40"), new BigDecimal("99.99")}, + bound.literalSet().stream() + .sorted() + .collect(Collectors.toList()) + .toArray(new BigDecimal[2])); Assert.assertEquals("Should reference correct field ID", 15, bound.ref().fieldId()); Assert.assertEquals("Should not change the NOT_IN operation", NOT_IN, bound.op()); } @@ -545,16 +605,19 @@ public void testNotInToNotEqPredicate() { UnboundPredicate unbound = Expressions.notIn("x", 5); - Assert.assertEquals("Should create a NOT_IN predicate with a single item", NOT_IN, unbound.op()); - Assert.assertEquals("Should create a NOT_IN predicate with a single item", - 1, unbound.literals().size()); + Assert.assertEquals( + "Should create a NOT_IN predicate with a single item", NOT_IN, unbound.op()); + Assert.assertEquals( + "Should create a NOT_IN predicate with a single item", 1, unbound.literals().size()); Expression expr = unbound.bind(struct); BoundPredicate bound = assertAndUnwrap(expr); Assert.assertTrue("Should be a literal predicate", bound.isLiteralPredicate()); - Assert.assertEquals("Should not alter literal value", - Integer.valueOf(5), bound.asLiteralPredicate().literal().value()); + Assert.assertEquals( + "Should not alter literal value", + Integer.valueOf(5), + bound.asLiteralPredicate().literal().value()); Assert.assertEquals("Should reference correct field ID", 14, bound.ref().fieldId()); Assert.assertEquals("Should change the operation from NOT_IN to NOT_EQ", NOT_EQ, bound.op()); } @@ -572,8 +635,10 @@ public void testNotInPredicateBindingConversionToNotEq() { BoundPredicate bound = assertAndUnwrap(expr); Assert.assertTrue("Should be a literal 
predicate", bound.isLiteralPredicate()); - Assert.assertEquals("Should remove aboveMax literal value", - Integer.valueOf(5), bound.asLiteralPredicate().literal().value()); + Assert.assertEquals( + "Should remove aboveMax literal value", + Integer.valueOf(5), + bound.asLiteralPredicate().literal().value()); Assert.assertEquals("Should reference correct field ID", 14, bound.ref().fieldId()); Assert.assertEquals("Should change the NOT_IN operation to NOT_EQ", NOT_EQ, bound.op()); } @@ -587,8 +652,10 @@ public void testNotInPredicateBindingConversionDedupToNotEq() { Expression expr = unbound.bind(struct); BoundPredicate bound = assertAndUnwrap(expr); Assert.assertTrue("Should be a literal predicate", bound.isLiteralPredicate()); - Assert.assertEquals("Should convert literal set values to a single decimal", - new BigDecimal("12.40"), bound.asLiteralPredicate().literal().value()); + Assert.assertEquals( + "Should convert literal set values to a single decimal", + new BigDecimal("12.40"), + bound.asLiteralPredicate().literal().value()); Assert.assertEquals("Should reference correct field ID", 15, bound.ref().fieldId()); Assert.assertEquals("Should change the NOT_IN operation to NOT_EQ", NOT_EQ, bound.op()); } @@ -603,6 +670,7 @@ public void testNotInPredicateBindingConversionToExpression() { Assert.assertEquals("Should create an NOT_IN predicate", NOT_IN, op); Expression expr = unbound.bind(struct); - Assert.assertEquals("Should change NOT_IN to alwaysTrue expression", Expressions.alwaysTrue(), expr); + Assert.assertEquals( + "Should change NOT_IN to alwaysTrue expression", Expressions.alwaysTrue(), expr); } } diff --git a/api/src/test/java/org/apache/iceberg/expressions/TestStrictMetricsEvaluator.java b/api/src/test/java/org/apache/iceberg/expressions/TestStrictMetricsEvaluator.java index 8df5475546e9..81b8b200bd91 100644 --- a/api/src/test/java/org/apache/iceberg/expressions/TestStrictMetricsEvaluator.java +++ b/api/src/test/java/org/apache/iceberg/expressions/TestStrictMetricsEvaluator.java @@ -16,22 +16,8 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.expressions; -import org.apache.iceberg.AssertHelpers; -import org.apache.iceberg.DataFile; -import org.apache.iceberg.Schema; -import org.apache.iceberg.TestHelpers.Row; -import org.apache.iceberg.TestHelpers.TestDataFile; -import org.apache.iceberg.exceptions.ValidationException; -import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap; -import org.apache.iceberg.types.Types; -import org.apache.iceberg.types.Types.IntegerType; -import org.apache.iceberg.types.Types.StringType; -import org.junit.Assert; -import org.junit.Test; - import static org.apache.iceberg.expressions.Expressions.and; import static org.apache.iceberg.expressions.Expressions.equal; import static org.apache.iceberg.expressions.Expressions.greaterThan; @@ -51,104 +37,129 @@ import static org.apache.iceberg.types.Types.NestedField.optional; import static org.apache.iceberg.types.Types.NestedField.required; +import org.apache.iceberg.AssertHelpers; +import org.apache.iceberg.DataFile; +import org.apache.iceberg.Schema; +import org.apache.iceberg.TestHelpers.Row; +import org.apache.iceberg.TestHelpers.TestDataFile; +import org.apache.iceberg.exceptions.ValidationException; +import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap; +import org.apache.iceberg.types.Types; +import org.apache.iceberg.types.Types.IntegerType; +import org.apache.iceberg.types.Types.StringType; +import org.junit.Assert; +import org.junit.Test; + public class TestStrictMetricsEvaluator { - private static final Schema SCHEMA = new Schema( - required(1, "id", IntegerType.get()), - optional(2, "no_stats", IntegerType.get()), - required(3, "required", StringType.get()), - optional(4, "all_nulls", StringType.get()), - optional(5, "some_nulls", StringType.get()), - optional(6, "no_nulls", StringType.get()), - required(7, "always_5", IntegerType.get()), - optional(8, "all_nans", Types.DoubleType.get()), - optional(9, "some_nans", Types.FloatType.get()), - optional(10, "no_nans", Types.FloatType.get()), - optional(11, "all_nulls_double", Types.DoubleType.get()), - optional(12, "all_nans_v1_stats", Types.FloatType.get()), - optional(13, "nan_and_null_only", Types.DoubleType.get()), - optional(14, "no_nan_stats", Types.DoubleType.get()) - ); + private static final Schema SCHEMA = + new Schema( + required(1, "id", IntegerType.get()), + optional(2, "no_stats", IntegerType.get()), + required(3, "required", StringType.get()), + optional(4, "all_nulls", StringType.get()), + optional(5, "some_nulls", StringType.get()), + optional(6, "no_nulls", StringType.get()), + required(7, "always_5", IntegerType.get()), + optional(8, "all_nans", Types.DoubleType.get()), + optional(9, "some_nans", Types.FloatType.get()), + optional(10, "no_nans", Types.FloatType.get()), + optional(11, "all_nulls_double", Types.DoubleType.get()), + optional(12, "all_nans_v1_stats", Types.FloatType.get()), + optional(13, "nan_and_null_only", Types.DoubleType.get()), + optional(14, "no_nan_stats", Types.DoubleType.get())); private static final int INT_MIN_VALUE = 30; private static final int INT_MAX_VALUE = 79; - private static final DataFile FILE = new TestDataFile("file.avro", Row.of(), 50, - // any value counts, including nulls - ImmutableMap.builder() - .put(4, 50L) - .put(5, 50L) - .put(6, 50L) - .put(8, 50L) - .put(9, 50L) - .put(10, 50L) - .put(11, 50L) - .put(12, 50L) - .put(13, 50L) - .put(14, 50L) - .build(), - // null value counts - ImmutableMap.builder() - .put(4, 50L) - .put(5, 10L) - .put(6, 0L) - .put(11, 50L) - 
.put(12, 0L) - .put(13, 1L) - .build(), - // nan value counts - ImmutableMap.of( - 8, 50L, - 9, 10L, - 10, 0L), - // lower bounds - ImmutableMap.of( - 1, toByteBuffer(IntegerType.get(), INT_MIN_VALUE), - 7, toByteBuffer(IntegerType.get(), 5), - 12, toByteBuffer(Types.FloatType.get(), Float.NaN), - 13, toByteBuffer(Types.DoubleType.get(), Double.NaN)), - // upper bounds - ImmutableMap.of( - 1, toByteBuffer(IntegerType.get(), INT_MAX_VALUE), - 7, toByteBuffer(IntegerType.get(), 5), - 12, toByteBuffer(Types.FloatType.get(), Float.NaN), - 13, toByteBuffer(Types.DoubleType.get(), Double.NaN))); - - private static final DataFile FILE_2 = new TestDataFile("file_2.avro", Row.of(), 50, - // any value counts, including nulls - ImmutableMap.of( - 4, 50L, - 5, 50L, - 6, 50L, - 8, 50L), - // null value counts - ImmutableMap.of( - 4, 50L, - 5, 10L, - 6, 0L), - // nan value counts - null, - // lower bounds - ImmutableMap.of(5, toByteBuffer(StringType.get(), "bbb")), - // upper bounds - ImmutableMap.of(5, toByteBuffer(StringType.get(), "eee"))); - - private static final DataFile FILE_3 = new TestDataFile("file_3.avro", Row.of(), 50, - // any value counts, including nulls - ImmutableMap.of( - 4, 50L, - 5, 50L, - 6, 50L), - // null value counts - ImmutableMap.of( - 4, 50L, - 5, 10L, - 6, 0L), - // nan value counts - null, - // lower bounds - ImmutableMap.of(5, toByteBuffer(StringType.get(), "bbb")), - // upper bounds - ImmutableMap.of(5, toByteBuffer(StringType.get(), "bbb"))); + private static final DataFile FILE = + new TestDataFile( + "file.avro", + Row.of(), + 50, + // any value counts, including nulls + ImmutableMap.builder() + .put(4, 50L) + .put(5, 50L) + .put(6, 50L) + .put(8, 50L) + .put(9, 50L) + .put(10, 50L) + .put(11, 50L) + .put(12, 50L) + .put(13, 50L) + .put(14, 50L) + .build(), + // null value counts + ImmutableMap.builder() + .put(4, 50L) + .put(5, 10L) + .put(6, 0L) + .put(11, 50L) + .put(12, 0L) + .put(13, 1L) + .build(), + // nan value counts + ImmutableMap.of( + 8, 50L, + 9, 10L, + 10, 0L), + // lower bounds + ImmutableMap.of( + 1, toByteBuffer(IntegerType.get(), INT_MIN_VALUE), + 7, toByteBuffer(IntegerType.get(), 5), + 12, toByteBuffer(Types.FloatType.get(), Float.NaN), + 13, toByteBuffer(Types.DoubleType.get(), Double.NaN)), + // upper bounds + ImmutableMap.of( + 1, toByteBuffer(IntegerType.get(), INT_MAX_VALUE), + 7, toByteBuffer(IntegerType.get(), 5), + 12, toByteBuffer(Types.FloatType.get(), Float.NaN), + 13, toByteBuffer(Types.DoubleType.get(), Double.NaN))); + + private static final DataFile FILE_2 = + new TestDataFile( + "file_2.avro", + Row.of(), + 50, + // any value counts, including nulls + ImmutableMap.of( + 4, 50L, + 5, 50L, + 6, 50L, + 8, 50L), + // null value counts + ImmutableMap.of( + 4, 50L, + 5, 10L, + 6, 0L), + // nan value counts + null, + // lower bounds + ImmutableMap.of(5, toByteBuffer(StringType.get(), "bbb")), + // upper bounds + ImmutableMap.of(5, toByteBuffer(StringType.get(), "eee"))); + + private static final DataFile FILE_3 = + new TestDataFile( + "file_3.avro", + Row.of(), + 50, + // any value counts, including nulls + ImmutableMap.of( + 4, 50L, + 5, 50L, + 6, 50L), + // null value counts + ImmutableMap.of( + 4, 50L, + 5, 10L, + 6, 0L), + // nan value counts + null, + // lower bounds + ImmutableMap.of(5, toByteBuffer(StringType.get(), "bbb")), + // upper bounds + ImmutableMap.of(5, toByteBuffer(StringType.get(), "bbb"))); @Test public void testAllNulls() { @@ -156,7 +167,8 @@ public void testAllNulls() { Assert.assertFalse("Should not match: no non-null 
value in all null column", shouldRead); shouldRead = new StrictMetricsEvaluator(SCHEMA, notNull("some_nulls")).eval(FILE); - Assert.assertFalse("Should not match: column with some nulls contains a non-null value", shouldRead); + Assert.assertFalse( + "Should not match: column with some nulls contains a non-null value", shouldRead); shouldRead = new StrictMetricsEvaluator(SCHEMA, notNull("no_nulls")).eval(FILE); Assert.assertTrue("Should match: non-null column contains no null values", shouldRead); @@ -179,16 +191,19 @@ public void testNoNulls() { @Test public void testSomeNulls() { - boolean shouldRead = new StrictMetricsEvaluator(SCHEMA, lessThan("some_nulls", "ggg")).eval(FILE_2); + boolean shouldRead = + new StrictMetricsEvaluator(SCHEMA, lessThan("some_nulls", "ggg")).eval(FILE_2); Assert.assertFalse("Should not match: lessThan on some nulls column", shouldRead); - shouldRead = new StrictMetricsEvaluator(SCHEMA, lessThanOrEqual("some_nulls", "eee")).eval(FILE_2); + shouldRead = + new StrictMetricsEvaluator(SCHEMA, lessThanOrEqual("some_nulls", "eee")).eval(FILE_2); Assert.assertFalse("Should not match: lessThanOrEqual on some nulls column", shouldRead); shouldRead = new StrictMetricsEvaluator(SCHEMA, greaterThan("some_nulls", "aaa")).eval(FILE_2); Assert.assertFalse("Should not match: greaterThan on some nulls column", shouldRead); - shouldRead = new StrictMetricsEvaluator(SCHEMA, greaterThanOrEqual("some_nulls", "bbb")).eval(FILE_2); + shouldRead = + new StrictMetricsEvaluator(SCHEMA, greaterThanOrEqual("some_nulls", "bbb")).eval(FILE_2); Assert.assertFalse("Should not match: greaterThanOrEqual on some nulls column", shouldRead); shouldRead = new StrictMetricsEvaluator(SCHEMA, equal("some_nulls", "bbb")).eval(FILE_3); @@ -201,13 +216,15 @@ public void testIsNaN() { Assert.assertTrue("Should match: all values are nan", shouldRead); shouldRead = new StrictMetricsEvaluator(SCHEMA, isNaN("some_nans")).eval(FILE); - Assert.assertFalse("Should not match: at least one non-nan value in some nan column", shouldRead); + Assert.assertFalse( + "Should not match: at least one non-nan value in some nan column", shouldRead); shouldRead = new StrictMetricsEvaluator(SCHEMA, isNaN("no_nans")).eval(FILE); Assert.assertFalse("Should not match: at least one non-nan value in no nan column", shouldRead); shouldRead = new StrictMetricsEvaluator(SCHEMA, isNaN("all_nulls_double")).eval(FILE); - Assert.assertFalse("Should not match: at least one non-nan value in all null column", shouldRead); + Assert.assertFalse( + "Should not match: at least one non-nan value in all null column", shouldRead); shouldRead = new StrictMetricsEvaluator(SCHEMA, isNaN("no_nan_stats")).eval(FILE); Assert.assertFalse("Should not match: cannot determine without nan stats", shouldRead); @@ -254,8 +271,10 @@ public void testRequiredColumn() { @Test public void testMissingColumn() { - AssertHelpers.assertThrows("Should complain about missing column in expression", - ValidationException.class, "Cannot find field 'missing'", + AssertHelpers.assertThrows( + "Should complain about missing column in expression", + ValidationException.class, + "Cannot find field 'missing'", () -> new StrictMetricsEvaluator(SCHEMA, lessThan("missing", 5)).eval(FILE)); } @@ -263,11 +282,19 @@ public void testMissingColumn() { public void testMissingStats() { DataFile missingStats = new TestDataFile("file.parquet", Row.of(), 50); - Expression[] exprs = new Expression[] { - lessThan("no_stats", 5), lessThanOrEqual("no_stats", 30), equal("no_stats", 70), - 
greaterThan("no_stats", 78), greaterThanOrEqual("no_stats", 90), notEqual("no_stats", 101), - isNull("no_stats"), notNull("no_stats"), isNaN("all_nans"), notNaN("all_nans") - }; + Expression[] exprs = + new Expression[] { + lessThan("no_stats", 5), + lessThanOrEqual("no_stats", 30), + equal("no_stats", 70), + greaterThan("no_stats", 78), + greaterThanOrEqual("no_stats", 90), + notEqual("no_stats", 101), + isNull("no_stats"), + notNull("no_stats"), + isNaN("all_nans"), + notNaN("all_nans") + }; for (Expression expr : exprs) { boolean shouldRead = new StrictMetricsEvaluator(SCHEMA, expr).eval(missingStats); @@ -279,11 +306,19 @@ public void testMissingStats() { public void testZeroRecordFile() { DataFile empty = new TestDataFile("file.parquet", Row.of(), 0); - Expression[] exprs = new Expression[] { - lessThan("id", 5), lessThanOrEqual("id", 30), equal("id", 70), greaterThan("id", 78), - greaterThanOrEqual("id", 90), notEqual("id", 101), isNull("some_nulls"), - notNull("some_nulls"), isNaN("all_nans"), notNaN("all_nans") - }; + Expression[] exprs = + new Expression[] { + lessThan("id", 5), + lessThanOrEqual("id", 30), + equal("id", 70), + greaterThan("id", 78), + greaterThanOrEqual("id", 90), + notEqual("id", 101), + isNull("some_nulls"), + notNull("some_nulls"), + isNaN("all_nans"), + notNaN("all_nans") + }; for (Expression expr : exprs) { boolean shouldRead = new StrictMetricsEvaluator(SCHEMA, expr).eval(empty); @@ -294,48 +329,75 @@ public void testZeroRecordFile() { @Test public void testNot() { // this test case must use a real predicate, not alwaysTrue(), or binding will simplify it out - boolean shouldRead = new StrictMetricsEvaluator(SCHEMA, not(lessThan("id", INT_MIN_VALUE - 25))).eval(FILE); + boolean shouldRead = + new StrictMetricsEvaluator(SCHEMA, not(lessThan("id", INT_MIN_VALUE - 25))).eval(FILE); Assert.assertTrue("Should not match: not(false)", shouldRead); - shouldRead = new StrictMetricsEvaluator(SCHEMA, not(greaterThan("id", INT_MIN_VALUE - 25))).eval(FILE); + shouldRead = + new StrictMetricsEvaluator(SCHEMA, not(greaterThan("id", INT_MIN_VALUE - 25))).eval(FILE); Assert.assertFalse("Should match: not(true)", shouldRead); } @Test public void testAnd() { // this test case must use a real predicate, not alwaysTrue(), or binding will simplify it out - boolean shouldRead = new StrictMetricsEvaluator(SCHEMA, - and(greaterThan("id", INT_MIN_VALUE - 25), lessThanOrEqual("id", INT_MIN_VALUE))).eval(FILE); + boolean shouldRead = + new StrictMetricsEvaluator( + SCHEMA, + and(greaterThan("id", INT_MIN_VALUE - 25), lessThanOrEqual("id", INT_MIN_VALUE))) + .eval(FILE); Assert.assertFalse("Should not match: range may not overlap data", shouldRead); - shouldRead = new StrictMetricsEvaluator(SCHEMA, - and(lessThan("id", INT_MIN_VALUE - 25), greaterThanOrEqual("id", INT_MIN_VALUE - 30))).eval(FILE); + shouldRead = + new StrictMetricsEvaluator( + SCHEMA, + and( + lessThan("id", INT_MIN_VALUE - 25), + greaterThanOrEqual("id", INT_MIN_VALUE - 30))) + .eval(FILE); Assert.assertFalse("Should not match: range does not overlap data", shouldRead); - shouldRead = new StrictMetricsEvaluator(SCHEMA, - and(lessThan("id", INT_MAX_VALUE + 6), greaterThanOrEqual("id", INT_MIN_VALUE - 30))).eval(FILE); + shouldRead = + new StrictMetricsEvaluator( + SCHEMA, + and( + lessThan("id", INT_MAX_VALUE + 6), + greaterThanOrEqual("id", INT_MIN_VALUE - 30))) + .eval(FILE); Assert.assertTrue("Should match: range includes all data", shouldRead); } @Test public void testOr() { // this test case must use a real 
predicate, not alwaysTrue(), or binding will simplify it out - boolean shouldRead = new StrictMetricsEvaluator(SCHEMA, - or(lessThan("id", INT_MIN_VALUE - 25), greaterThanOrEqual("id", INT_MAX_VALUE + 1))).eval(FILE); + boolean shouldRead = + new StrictMetricsEvaluator( + SCHEMA, + or(lessThan("id", INT_MIN_VALUE - 25), greaterThanOrEqual("id", INT_MAX_VALUE + 1))) + .eval(FILE); Assert.assertFalse("Should not match: no matching values", shouldRead); - shouldRead = new StrictMetricsEvaluator(SCHEMA, - or(lessThan("id", INT_MIN_VALUE - 25), greaterThanOrEqual("id", INT_MAX_VALUE - 19))).eval(FILE); + shouldRead = + new StrictMetricsEvaluator( + SCHEMA, + or( + lessThan("id", INT_MIN_VALUE - 25), + greaterThanOrEqual("id", INT_MAX_VALUE - 19))) + .eval(FILE); Assert.assertFalse("Should not match: some values do not match", shouldRead); - shouldRead = new StrictMetricsEvaluator(SCHEMA, - or(lessThan("id", INT_MIN_VALUE - 25), greaterThanOrEqual("id", INT_MIN_VALUE))).eval(FILE); + shouldRead = + new StrictMetricsEvaluator( + SCHEMA, + or(lessThan("id", INT_MIN_VALUE - 25), greaterThanOrEqual("id", INT_MIN_VALUE))) + .eval(FILE); Assert.assertTrue("Should match: all values match >= 30", shouldRead); } @Test public void testIntegerLt() { - boolean shouldRead = new StrictMetricsEvaluator(SCHEMA, lessThan("id", INT_MIN_VALUE)).eval(FILE); + boolean shouldRead = + new StrictMetricsEvaluator(SCHEMA, lessThan("id", INT_MIN_VALUE)).eval(FILE); Assert.assertFalse("Should not match: always false", shouldRead); shouldRead = new StrictMetricsEvaluator(SCHEMA, lessThan("id", INT_MIN_VALUE + 1)).eval(FILE); @@ -350,52 +412,64 @@ public void testIntegerLt() { @Test public void testIntegerLtEq() { - boolean shouldRead = new StrictMetricsEvaluator(SCHEMA, lessThanOrEqual("id", INT_MIN_VALUE - 1)).eval(FILE); + boolean shouldRead = + new StrictMetricsEvaluator(SCHEMA, lessThanOrEqual("id", INT_MIN_VALUE - 1)).eval(FILE); Assert.assertFalse("Should not match: always false", shouldRead); - shouldRead = new StrictMetricsEvaluator(SCHEMA, lessThanOrEqual("id", INT_MIN_VALUE)).eval(FILE); + shouldRead = + new StrictMetricsEvaluator(SCHEMA, lessThanOrEqual("id", INT_MIN_VALUE)).eval(FILE); Assert.assertFalse("Should not match: 31 and greater not in range", shouldRead); - shouldRead = new StrictMetricsEvaluator(SCHEMA, lessThanOrEqual("id", INT_MAX_VALUE)).eval(FILE); + shouldRead = + new StrictMetricsEvaluator(SCHEMA, lessThanOrEqual("id", INT_MAX_VALUE)).eval(FILE); Assert.assertTrue("Should match: all values in range", shouldRead); - shouldRead = new StrictMetricsEvaluator(SCHEMA, lessThanOrEqual("id", INT_MAX_VALUE + 1)).eval(FILE); + shouldRead = + new StrictMetricsEvaluator(SCHEMA, lessThanOrEqual("id", INT_MAX_VALUE + 1)).eval(FILE); Assert.assertTrue("Should match: all values in range", shouldRead); } @Test public void testIntegerGt() { - boolean shouldRead = new StrictMetricsEvaluator(SCHEMA, greaterThan("id", INT_MAX_VALUE)).eval(FILE); + boolean shouldRead = + new StrictMetricsEvaluator(SCHEMA, greaterThan("id", INT_MAX_VALUE)).eval(FILE); Assert.assertFalse("Should not match: always false", shouldRead); - shouldRead = new StrictMetricsEvaluator(SCHEMA, greaterThan("id", INT_MAX_VALUE - 1)).eval(FILE); + shouldRead = + new StrictMetricsEvaluator(SCHEMA, greaterThan("id", INT_MAX_VALUE - 1)).eval(FILE); Assert.assertFalse("Should not match: 77 and less not in range", shouldRead); shouldRead = new StrictMetricsEvaluator(SCHEMA, greaterThan("id", INT_MIN_VALUE)).eval(FILE); Assert.assertFalse("Should not 
match: 30 not in range", shouldRead); - shouldRead = new StrictMetricsEvaluator(SCHEMA, greaterThan("id", INT_MIN_VALUE - 1)).eval(FILE); + shouldRead = + new StrictMetricsEvaluator(SCHEMA, greaterThan("id", INT_MIN_VALUE - 1)).eval(FILE); Assert.assertTrue("Should match: all values in range", shouldRead); } @Test public void testIntegerGtEq() { - boolean shouldRead = new StrictMetricsEvaluator(SCHEMA, greaterThanOrEqual("id", INT_MAX_VALUE + 1)).eval(FILE); + boolean shouldRead = + new StrictMetricsEvaluator(SCHEMA, greaterThanOrEqual("id", INT_MAX_VALUE + 1)).eval(FILE); Assert.assertFalse("Should not match: no values in range", shouldRead); - shouldRead = new StrictMetricsEvaluator(SCHEMA, greaterThanOrEqual("id", INT_MAX_VALUE)).eval(FILE); + shouldRead = + new StrictMetricsEvaluator(SCHEMA, greaterThanOrEqual("id", INT_MAX_VALUE)).eval(FILE); Assert.assertFalse("Should not match: 78 and lower are not in range", shouldRead); - shouldRead = new StrictMetricsEvaluator(SCHEMA, greaterThanOrEqual("id", INT_MIN_VALUE + 1)).eval(FILE); + shouldRead = + new StrictMetricsEvaluator(SCHEMA, greaterThanOrEqual("id", INT_MIN_VALUE + 1)).eval(FILE); Assert.assertFalse("Should not match: 30 not in range", shouldRead); - shouldRead = new StrictMetricsEvaluator(SCHEMA, greaterThanOrEqual("id", INT_MIN_VALUE)).eval(FILE); + shouldRead = + new StrictMetricsEvaluator(SCHEMA, greaterThanOrEqual("id", INT_MIN_VALUE)).eval(FILE); Assert.assertTrue("Should match: all values in range", shouldRead); } @Test public void testIntegerEq() { - boolean shouldRead = new StrictMetricsEvaluator(SCHEMA, equal("id", INT_MIN_VALUE - 25)).eval(FILE); + boolean shouldRead = + new StrictMetricsEvaluator(SCHEMA, equal("id", INT_MIN_VALUE - 25)).eval(FILE); Assert.assertFalse("Should not match: all values != 5", shouldRead); shouldRead = new StrictMetricsEvaluator(SCHEMA, equal("id", INT_MIN_VALUE)).eval(FILE); @@ -410,13 +484,15 @@ public void testIntegerEq() { shouldRead = new StrictMetricsEvaluator(SCHEMA, equal("id", INT_MAX_VALUE + 1)).eval(FILE); Assert.assertFalse("Should not match: some values != 80", shouldRead); - shouldRead = new StrictMetricsEvaluator(SCHEMA, equal("always_5", INT_MIN_VALUE - 25)).eval(FILE); + shouldRead = + new StrictMetricsEvaluator(SCHEMA, equal("always_5", INT_MIN_VALUE - 25)).eval(FILE); Assert.assertTrue("Should match: all values == 5", shouldRead); } @Test public void testIntegerNotEq() { - boolean shouldRead = new StrictMetricsEvaluator(SCHEMA, notEqual("id", INT_MIN_VALUE - 25)).eval(FILE); + boolean shouldRead = + new StrictMetricsEvaluator(SCHEMA, notEqual("id", INT_MIN_VALUE - 25)).eval(FILE); Assert.assertTrue("Should match: no values == 5", shouldRead); shouldRead = new StrictMetricsEvaluator(SCHEMA, notEqual("id", INT_MIN_VALUE - 1)).eval(FILE); @@ -440,7 +516,8 @@ public void testIntegerNotEq() { @Test public void testIntegerNotEqRewritten() { - boolean shouldRead = new StrictMetricsEvaluator(SCHEMA, not(equal("id", INT_MIN_VALUE - 25))).eval(FILE); + boolean shouldRead = + new StrictMetricsEvaluator(SCHEMA, not(equal("id", INT_MIN_VALUE - 25))).eval(FILE); Assert.assertTrue("Should match: no values == 5", shouldRead); shouldRead = new StrictMetricsEvaluator(SCHEMA, not(equal("id", INT_MIN_VALUE - 1))).eval(FILE); @@ -462,82 +539,82 @@ public void testIntegerNotEqRewritten() { Assert.assertTrue("Should read: no values == 85", shouldRead); } - @Test public void testIntegerIn() { - boolean shouldRead = new StrictMetricsEvaluator(SCHEMA, - in("id", INT_MIN_VALUE - 25, INT_MIN_VALUE - 
24)).eval(FILE); + boolean shouldRead = + new StrictMetricsEvaluator(SCHEMA, in("id", INT_MIN_VALUE - 25, INT_MIN_VALUE - 24)) + .eval(FILE); Assert.assertFalse("Should not match: all values != 5 and != 6", shouldRead); - shouldRead = new StrictMetricsEvaluator(SCHEMA, - in("id", INT_MIN_VALUE - 1, INT_MIN_VALUE)).eval(FILE); + shouldRead = + new StrictMetricsEvaluator(SCHEMA, in("id", INT_MIN_VALUE - 1, INT_MIN_VALUE)).eval(FILE); Assert.assertFalse("Should not match: some values != 30 and != 31", shouldRead); - shouldRead = new StrictMetricsEvaluator(SCHEMA, - in("id", INT_MAX_VALUE - 4, INT_MAX_VALUE - 3)).eval(FILE); + shouldRead = + new StrictMetricsEvaluator(SCHEMA, in("id", INT_MAX_VALUE - 4, INT_MAX_VALUE - 3)) + .eval(FILE); Assert.assertFalse("Should not match: some values != 75 and != 76", shouldRead); - shouldRead = new StrictMetricsEvaluator(SCHEMA, - in("id", INT_MAX_VALUE, INT_MAX_VALUE + 1)).eval(FILE); + shouldRead = + new StrictMetricsEvaluator(SCHEMA, in("id", INT_MAX_VALUE, INT_MAX_VALUE + 1)).eval(FILE); Assert.assertFalse("Should not match: some values != 78 and != 79", shouldRead); - shouldRead = new StrictMetricsEvaluator(SCHEMA, - in("id", INT_MAX_VALUE + 1, INT_MAX_VALUE + 2)).eval(FILE); + shouldRead = + new StrictMetricsEvaluator(SCHEMA, in("id", INT_MAX_VALUE + 1, INT_MAX_VALUE + 2)) + .eval(FILE); Assert.assertFalse("Should not match: some values != 80 and != 81)", shouldRead); - shouldRead = new StrictMetricsEvaluator(SCHEMA, - in("always_5", 5, 6)).eval(FILE); + shouldRead = new StrictMetricsEvaluator(SCHEMA, in("always_5", 5, 6)).eval(FILE); Assert.assertTrue("Should match: all values == 5", shouldRead); - shouldRead = new StrictMetricsEvaluator(SCHEMA, - in("all_nulls", "abc", "def")).eval(FILE); + shouldRead = new StrictMetricsEvaluator(SCHEMA, in("all_nulls", "abc", "def")).eval(FILE); Assert.assertFalse("Should not match: in on all nulls column", shouldRead); - shouldRead = new StrictMetricsEvaluator(SCHEMA, - in("some_nulls", "abc", "def")).eval(FILE_3); + shouldRead = new StrictMetricsEvaluator(SCHEMA, in("some_nulls", "abc", "def")).eval(FILE_3); Assert.assertFalse("Should not match: in on some nulls column", shouldRead); - shouldRead = new StrictMetricsEvaluator(SCHEMA, - in("no_nulls", "abc", "def")).eval(FILE); + shouldRead = new StrictMetricsEvaluator(SCHEMA, in("no_nulls", "abc", "def")).eval(FILE); Assert.assertFalse("Should not match: no_nulls field does not have bounds", shouldRead); } @Test public void testIntegerNotIn() { - boolean shouldRead = new StrictMetricsEvaluator(SCHEMA, - notIn("id", INT_MIN_VALUE - 25, INT_MIN_VALUE - 24)).eval(FILE); + boolean shouldRead = + new StrictMetricsEvaluator(SCHEMA, notIn("id", INT_MIN_VALUE - 25, INT_MIN_VALUE - 24)) + .eval(FILE); Assert.assertTrue("Should not match: all values !=5 and !=6", shouldRead); - shouldRead = new StrictMetricsEvaluator(SCHEMA, - notIn("id", INT_MIN_VALUE - 1, INT_MIN_VALUE)).eval(FILE); + shouldRead = + new StrictMetricsEvaluator(SCHEMA, notIn("id", INT_MIN_VALUE - 1, INT_MIN_VALUE)) + .eval(FILE); Assert.assertFalse("Should not match: some values may be == 30", shouldRead); - shouldRead = new StrictMetricsEvaluator(SCHEMA, - notIn("id", INT_MAX_VALUE - 4, INT_MAX_VALUE - 3)).eval(FILE); + shouldRead = + new StrictMetricsEvaluator(SCHEMA, notIn("id", INT_MAX_VALUE - 4, INT_MAX_VALUE - 3)) + .eval(FILE); Assert.assertFalse("Should not match: some value may be == 75 or == 76", shouldRead); - shouldRead = new StrictMetricsEvaluator(SCHEMA, - notIn("id", INT_MAX_VALUE, 
INT_MAX_VALUE + 1)).eval(FILE); + shouldRead = + new StrictMetricsEvaluator(SCHEMA, notIn("id", INT_MAX_VALUE, INT_MAX_VALUE + 1)) + .eval(FILE); Assert.assertFalse("Should not match: some value may be == 79", shouldRead); - shouldRead = new StrictMetricsEvaluator(SCHEMA, - notIn("id", INT_MAX_VALUE + 1, INT_MAX_VALUE + 2)).eval(FILE); + shouldRead = + new StrictMetricsEvaluator(SCHEMA, notIn("id", INT_MAX_VALUE + 1, INT_MAX_VALUE + 2)) + .eval(FILE); Assert.assertTrue("Should match: no values == 80 or == 81", shouldRead); - shouldRead = new StrictMetricsEvaluator(SCHEMA, - notIn("always_5", 5, 6)).eval(FILE); + shouldRead = new StrictMetricsEvaluator(SCHEMA, notIn("always_5", 5, 6)).eval(FILE); Assert.assertFalse("Should not match: all values == 5", shouldRead); - shouldRead = new StrictMetricsEvaluator(SCHEMA, - notIn("all_nulls", "abc", "def")).eval(FILE); + shouldRead = new StrictMetricsEvaluator(SCHEMA, notIn("all_nulls", "abc", "def")).eval(FILE); Assert.assertTrue("Should match: notIn on all nulls column", shouldRead); - shouldRead = new StrictMetricsEvaluator(SCHEMA, - notIn("some_nulls", "abc", "def")).eval(FILE_3); - Assert.assertTrue("Should match: notIn on some nulls column, 'bbb' > 'abc' and 'bbb' < 'def'", shouldRead); + shouldRead = new StrictMetricsEvaluator(SCHEMA, notIn("some_nulls", "abc", "def")).eval(FILE_3); + Assert.assertTrue( + "Should match: notIn on some nulls column, 'bbb' > 'abc' and 'bbb' < 'def'", shouldRead); - shouldRead = new StrictMetricsEvaluator(SCHEMA, - notIn("no_nulls", "abc", "def")).eval(FILE); + shouldRead = new StrictMetricsEvaluator(SCHEMA, notIn("no_nulls", "abc", "def")).eval(FILE); Assert.assertFalse("Should not match: no_nulls field does not have bounds", shouldRead); } } diff --git a/api/src/test/java/org/apache/iceberg/expressions/TestStringLiteralConversions.java b/api/src/test/java/org/apache/iceberg/expressions/TestStringLiteralConversions.java index afc725c8b2c3..4a8969fe5d6e 100644 --- a/api/src/test/java/org/apache/iceberg/expressions/TestStringLiteralConversions.java +++ b/api/src/test/java/org/apache/iceberg/expressions/TestStringLiteralConversions.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.expressions; import java.math.BigDecimal; @@ -35,7 +34,6 @@ import org.junit.Assert; import org.junit.Test; - public class TestStringLiteralConversions { @Test public void testStringToStringLiteral() { @@ -51,9 +49,8 @@ public void testStringToDateLiteral() { // use Avro's date conversion to validate the result Schema avroSchema = LogicalTypes.date().addToSchema(Schema.create(Schema.Type.INT)); TimeConversions.DateConversion avroConversion = new TimeConversions.DateConversion(); - int avroValue = avroConversion.toInt( - LocalDate.of(2017, 8, 18), - avroSchema, avroSchema.getLogicalType()); + int avroValue = + avroConversion.toInt(LocalDate.of(2017, 8, 18), avroSchema, avroSchema.getLogicalType()); Assert.assertEquals("Date should match", avroValue, (int) date.value()); } @@ -66,9 +63,8 @@ public void testNegativeStringToDateLiteral() { // use Avro's date conversion to validate the result Schema avroSchema = LogicalTypes.date().addToSchema(Schema.create(Schema.Type.INT)); TimeConversions.DateConversion avroConversion = new TimeConversions.DateConversion(); - int avroValue = avroConversion.toInt( - LocalDate.of(1969, 12, 30), - avroSchema, avroSchema.getLogicalType()); + int avroValue = + avroConversion.toInt(LocalDate.of(1969, 12, 30), avroSchema, avroSchema.getLogicalType()); Assert.assertEquals("Date should be -2", -2, (int) date.value()); Assert.assertEquals("Date should match", avroValue, (int) date.value()); @@ -82,9 +78,10 @@ public void testStringToTimeLiteral() { Literal timeStr = Literal.of("14:21:01.919"); Literal time = timeStr.to(Types.TimeType.get()); - long avroValue = new TimeConversions.TimeMicrosConversion().toLong( - LocalTime.of(14, 21, 1, 919 * 1000000), - avroSchema, avroSchema.getLogicalType()); + long avroValue = + new TimeConversions.TimeMicrosConversion() + .toLong( + LocalTime.of(14, 21, 1, 919 * 1000000), avroSchema, avroSchema.getLogicalType()); Assert.assertEquals("Time should match", avroValue, (long) time.value()); } @@ -99,9 +96,11 @@ public void testStringToTimestampLiteral() { // Timestamp with explicit UTC offset, +00:00 Literal timestampStr = Literal.of("2017-08-18T14:21:01.919+00:00"); Literal timestamp = timestampStr.to(Types.TimestampType.withZone()); - long avroValue = avroConversion.toLong( - LocalDateTime.of(2017, 8, 18, 14, 21, 1, 919 * 1000000).toInstant(ZoneOffset.UTC), - avroSchema, avroSchema.getLogicalType()); + long avroValue = + avroConversion.toLong( + LocalDateTime.of(2017, 8, 18, 14, 21, 1, 919 * 1000000).toInstant(ZoneOffset.UTC), + avroSchema, + avroSchema.getLogicalType()); Assert.assertEquals("Timestamp should match", avroValue, (long) timestamp.value()); @@ -109,18 +108,20 @@ public void testStringToTimestampLiteral() { timestampStr = Literal.of("2017-08-18T14:21:01.919"); timestamp = timestampStr.to(Types.TimestampType.withoutZone()); - Assert.assertEquals("Timestamp without zone should match UTC", - avroValue, (long) timestamp.value()); + Assert.assertEquals( + "Timestamp without zone should match UTC", avroValue, (long) timestamp.value()); // Timestamp with an explicit offset should be adjusted to UTC timestampStr = Literal.of("2017-08-18T14:21:01.919-07:00"); timestamp = timestampStr.to(Types.TimestampType.withZone()); - avroValue = avroConversion.toLong( - LocalDateTime.of(2017, 8, 18, 21, 21, 1, 919 * 1000000).toInstant(ZoneOffset.UTC), - avroSchema, avroSchema.getLogicalType()); - - Assert.assertEquals("Timestamp without zone should match UTC", - avroValue, (long) timestamp.value()); + avroValue 
= + avroConversion.toLong( + LocalDateTime.of(2017, 8, 18, 21, 21, 1, 919 * 1000000).toInstant(ZoneOffset.UTC), + avroSchema, + avroSchema.getLogicalType()); + + Assert.assertEquals( + "Timestamp without zone should match UTC", avroValue, (long) timestamp.value()); } @Test @@ -133,9 +134,11 @@ public void testNegativeStringToTimestampLiteral() { // Timestamp with explicit UTC offset, +00:00 Literal timestampStr = Literal.of("1969-12-31T23:59:58.999999+00:00"); Literal timestamp = timestampStr.to(Types.TimestampType.withZone()); - long avroValue = avroConversion.toLong( - LocalDateTime.of(1969, 12, 31, 23, 59, 58, 999999 * 1_000).toInstant(ZoneOffset.UTC), - avroSchema, avroSchema.getLogicalType()); + long avroValue = + avroConversion.toLong( + LocalDateTime.of(1969, 12, 31, 23, 59, 58, 999999 * 1_000).toInstant(ZoneOffset.UTC), + avroSchema, + avroSchema.getLogicalType()); Assert.assertEquals("Timestamp should match", avroValue, (long) timestamp.value()); Assert.assertEquals("Timestamp should be -1_000_001", -1_000_001, (long) timestamp.value()); @@ -144,20 +147,22 @@ public void testNegativeStringToTimestampLiteral() { timestampStr = Literal.of("1969-12-31T23:59:58.999999"); timestamp = timestampStr.to(Types.TimestampType.withoutZone()); - Assert.assertEquals("Timestamp without zone should match UTC", - avroValue, (long) timestamp.value()); + Assert.assertEquals( + "Timestamp without zone should match UTC", avroValue, (long) timestamp.value()); // Timestamp with an explicit offset should be adjusted to UTC timestampStr = Literal.of("1969-12-31T16:59:58.999999-07:00"); timestamp = timestampStr.to(Types.TimestampType.withZone()); - avroValue = avroConversion.toLong( - LocalDateTime.of(1969, 12, 31, 23, 59, 58, 999999 * 1_000).toInstant(ZoneOffset.UTC), - avroSchema, avroSchema.getLogicalType()); - - Assert.assertEquals("Timestamp without zone should match UTC", - avroValue, (long) timestamp.value()); - Assert.assertEquals("Timestamp without zone should be -1_000_001", -1_000_001, (long) timestamp.value()); - + avroValue = + avroConversion.toLong( + LocalDateTime.of(1969, 12, 31, 23, 59, 58, 999999 * 1_000).toInstant(ZoneOffset.UTC), + avroSchema, + avroSchema.getLogicalType()); + + Assert.assertEquals( + "Timestamp without zone should match UTC", avroValue, (long) timestamp.value()); + Assert.assertEquals( + "Timestamp without zone should be -1_000_001", -1_000_001, (long) timestamp.value()); } @Test @@ -192,10 +197,12 @@ public void testStringToDecimalLiteral() { BigDecimal expected = new BigDecimal("34.560"); Literal decimalStr = Literal.of("34.560"); - IntStream.range(0, 10).forEach(scale -> { - Literal decimal = decimalStr.to(Types.DecimalType.of(9, scale)); - Assert.assertEquals("Decimal should have scale 3", 3, decimal.value().scale()); - Assert.assertEquals("Decimal should match", expected, decimal.value()); - }); + IntStream.range(0, 10) + .forEach( + scale -> { + Literal decimal = decimalStr.to(Types.DecimalType.of(9, scale)); + Assert.assertEquals("Decimal should have scale 3", 3, decimal.value().scale()); + Assert.assertEquals("Decimal should match", expected, decimal.value()); + }); } } diff --git a/api/src/test/java/org/apache/iceberg/io/TestCloseableGroup.java b/api/src/test/java/org/apache/iceberg/io/TestCloseableGroup.java index 4125dd66ddbb..a05a8a1ddf22 100644 --- a/api/src/test/java/org/apache/iceberg/io/TestCloseableGroup.java +++ b/api/src/test/java/org/apache/iceberg/io/TestCloseableGroup.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations 
* under the License. */ - package org.apache.iceberg.io; import java.io.Closeable; @@ -92,8 +91,7 @@ public void notSuppressExceptionIfSetSuppressIsFalse() throws Exception { closeableGroup.addCloseable(closeable2); closeableGroup.addCloseable(closeable3); - Assertions.assertThatThrownBy(closeableGroup::close) - .isEqualTo(ioException); + Assertions.assertThatThrownBy(closeableGroup::close).isEqualTo(ioException); Mockito.verify(closeable1).close(); Mockito.verify(closeable2).close(); Mockito.verifyNoInteractions(closeable3); @@ -113,8 +111,7 @@ public void notSuppressExceptionIfSetSuppressIsFalseForAutoCloseable() throws Ex closeableGroup.addCloseable(closeable2); closeableGroup.addCloseable(closeable3); - Assertions.assertThatThrownBy(closeableGroup::close) - .isEqualTo(ioException); + Assertions.assertThatThrownBy(closeableGroup::close).isEqualTo(ioException); Mockito.verify(closeable1).close(); Mockito.verify(closeable2).close(); Mockito.verifyNoInteractions(closeable3); @@ -144,8 +141,7 @@ public void notWrapRuntimeException() throws Exception { CloseableGroup closeableGroup = new CloseableGroup(); closeableGroup.addCloseable(throwingCloseable); - Assertions.assertThatThrownBy(closeableGroup::close) - .isEqualTo(runtimeException); + Assertions.assertThatThrownBy(closeableGroup::close).isEqualTo(runtimeException); } @Test @@ -157,7 +153,6 @@ public void notWrapRuntimeExceptionFromAutoCloseable() throws Exception { CloseableGroup closeableGroup = new CloseableGroup(); closeableGroup.addCloseable(throwingAutoCloseable); - Assertions.assertThatThrownBy(closeableGroup::close) - .isEqualTo(runtimeException); + Assertions.assertThatThrownBy(closeableGroup::close).isEqualTo(runtimeException); } } diff --git a/api/src/test/java/org/apache/iceberg/io/TestCloseableIterable.java b/api/src/test/java/org/apache/iceberg/io/TestCloseableIterable.java index 8eef8fc036f5..826dcb4fbfc5 100644 --- a/api/src/test/java/org/apache/iceberg/io/TestCloseableIterable.java +++ b/api/src/test/java/org/apache/iceberg/io/TestCloseableIterable.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.io; import java.io.IOException; @@ -65,24 +64,31 @@ public void testFilterAutomaticallyClosable() throws IOException { @Test public void testConcatWithEmptyIterables() { - CloseableIterable<Integer> iter = CloseableIterable.combine(Lists.newArrayList(1, 2, 3), () -> { }); + CloseableIterable<Integer> iter = + CloseableIterable.combine(Lists.newArrayList(1, 2, 3), () -> {}); CloseableIterable<Integer> empty = CloseableIterable.empty(); - CloseableIterable<Integer> concat1 = CloseableIterable.concat(Lists.newArrayList(iter, empty, empty)); + CloseableIterable<Integer> concat1 = + CloseableIterable.concat(Lists.newArrayList(iter, empty, empty)); Assert.assertEquals(Iterables.getLast(concat1).intValue(), 3); - CloseableIterable<Integer> concat2 = CloseableIterable.concat(Lists.newArrayList(empty, empty, iter)); + CloseableIterable<Integer> concat2 = + CloseableIterable.concat(Lists.newArrayList(empty, empty, iter)); Assert.assertEquals(Iterables.getLast(concat2).intValue(), 3); - CloseableIterable<Integer> concat3 = CloseableIterable.concat(Lists.newArrayList(empty, iter, empty)); + CloseableIterable<Integer> concat3 = + CloseableIterable.concat(Lists.newArrayList(empty, iter, empty)); Assert.assertEquals(Iterables.getLast(concat3).intValue(), 3); - CloseableIterable<Integer> concat4 = CloseableIterable.concat(Lists.newArrayList(empty, iter, empty, empty, iter)); + CloseableIterable<Integer> concat4 = + CloseableIterable.concat(Lists.newArrayList(empty, iter, empty, empty, iter)); Assert.assertEquals(Iterables.getLast(concat4).intValue(), 3); // This will throw a NoSuchElementException - CloseableIterable<Integer> concat5 = CloseableIterable.concat(Lists.newArrayList(empty, empty, empty)); - AssertHelpers.assertThrows("should throw NoSuchElementException", + CloseableIterable<Integer> concat5 = + CloseableIterable.concat(Lists.newArrayList(empty, empty, empty)); + AssertHelpers.assertThrows( + "should throw NoSuchElementException", NoSuchElementException.class, () -> Iterables.getLast(concat5)); } @@ -98,22 +104,27 @@ public void testConcatWithEmpty() { public void concatShouldOnlyEvaluateItemsOnce() throws IOException { AtomicInteger counter = new AtomicInteger(0); List<Integer> items = Lists.newArrayList(1, 2, 3, 4, 5); - Iterable<Integer> iterable = Iterables.filter(items, item -> { - counter.incrementAndGet(); - return true; - }); + Iterable<Integer> iterable = + Iterables.filter( + items, + item -> { + counter.incrementAndGet(); + return true; + }); Iterable<CloseableIterable<Integer>> transform = - Iterables.transform(iterable, item -> new CloseableIterable<Integer>() { - @Override - public void close() { - } - - @Override - public CloseableIterator<Integer> iterator() { - return CloseableIterator.withClose(Collections.singletonList(item).iterator()); - } - }); + Iterables.transform( + iterable, + item -> + new CloseableIterable<Integer>() { + @Override + public void close() {} + + @Override + public CloseableIterator<Integer> iterator() { + return CloseableIterator.withClose(Collections.singletonList(item).iterator()); + } + }); try (CloseableIterable<Integer> concat = CloseableIterable.concat(transform)) { concat.forEach(c -> c++); diff --git a/api/src/test/java/org/apache/iceberg/io/TestClosingIterator.java b/api/src/test/java/org/apache/iceberg/io/TestClosingIterator.java index 5aebc2fa5603..2cc65cf7ffe6 100644 --- a/api/src/test/java/org/apache/iceberg/io/TestClosingIterator.java +++ b/api/src/test/java/org/apache/iceberg/io/TestClosingIterator.java @@ -16,11 +16,8 @@ * specific language governing permissions and limitations * under the License.
*/ - package org.apache.iceberg.io; -import org.junit.Test; - import static org.junit.Assert.assertEquals; import static org.junit.Assert.assertFalse; import static org.junit.Assert.assertTrue; @@ -29,6 +26,8 @@ import static org.mockito.Mockito.verify; import static org.mockito.Mockito.when; +import org.junit.Test; + public class TestClosingIterator { @Test public void testEmptyIterator() { diff --git a/api/src/test/java/org/apache/iceberg/io/TestableCloseableIterable.java b/api/src/test/java/org/apache/iceberg/io/TestableCloseableIterable.java index 067f96a78c6d..abd088cf6b25 100644 --- a/api/src/test/java/org/apache/iceberg/io/TestableCloseableIterable.java +++ b/api/src/test/java/org/apache/iceberg/io/TestableCloseableIterable.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.io; import java.io.IOException; diff --git a/api/src/test/java/org/apache/iceberg/metrics/TestDefaultMetricsContext.java b/api/src/test/java/org/apache/iceberg/metrics/TestDefaultMetricsContext.java index f82ec47b90e8..75cacdd29cfb 100644 --- a/api/src/test/java/org/apache/iceberg/metrics/TestDefaultMetricsContext.java +++ b/api/src/test/java/org/apache/iceberg/metrics/TestDefaultMetricsContext.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.metrics; import java.time.Duration; @@ -29,7 +28,8 @@ public class TestDefaultMetricsContext { @Test public void unsupportedCounter() { MetricsContext metricsContext = new DefaultMetricsContext(); - Assertions.assertThatThrownBy(() -> metricsContext.counter("test", Double.class, MetricsContext.Unit.COUNT)) + Assertions.assertThatThrownBy( + () -> metricsContext.counter("test", Double.class, MetricsContext.Unit.COUNT)) .isInstanceOf(IllegalArgumentException.class) .hasMessage("Counter for type java.lang.Double is not supported"); } @@ -37,7 +37,8 @@ public void unsupportedCounter() { @Test public void intCounter() { MetricsContext metricsContext = new DefaultMetricsContext(); - MetricsContext.Counter counter = metricsContext.counter("test", Integer.class, MetricsContext.Unit.COUNT); + MetricsContext.Counter counter = + metricsContext.counter("test", Integer.class, MetricsContext.Unit.COUNT); counter.increment(5); Assertions.assertThat(counter.value()).isEqualTo(5); } @@ -45,7 +46,8 @@ public void intCounter() { @Test public void intCounterOverflow() { MetricsContext metricsContext = new DefaultMetricsContext(); - MetricsContext.Counter counter = metricsContext.counter("test", Integer.class, MetricsContext.Unit.COUNT); + MetricsContext.Counter counter = + metricsContext.counter("test", Integer.class, MetricsContext.Unit.COUNT); counter.increment(Integer.MAX_VALUE); Assertions.assertThatThrownBy(counter::increment) .isInstanceOf(ArithmeticException.class) @@ -56,7 +58,8 @@ public void intCounterOverflow() { @Test public void longCounter() { MetricsContext metricsContext = new DefaultMetricsContext(); - MetricsContext.Counter counter = metricsContext.counter("test", Long.class, MetricsContext.Unit.COUNT); + MetricsContext.Counter counter = + metricsContext.counter("test", Long.class, MetricsContext.Unit.COUNT); counter.increment(5L); Assertions.assertThat(counter.value()).isEqualTo(5L); } @@ -64,7 +67,8 @@ public void longCounter() { @Test public void longCounterOverflow() { MetricsContext metricsContext = new DefaultMetricsContext(); - MetricsContext.Counter counter = metricsContext.counter("test", Long.class, 
MetricsContext.Unit.COUNT); + MetricsContext.Counter counter = + metricsContext.counter("test", Long.class, MetricsContext.Unit.COUNT); counter.increment(Long.MAX_VALUE); Assertions.assertThatThrownBy(counter::increment) .isInstanceOf(ArithmeticException.class) diff --git a/api/src/test/java/org/apache/iceberg/metrics/TestDefaultTimer.java b/api/src/test/java/org/apache/iceberg/metrics/TestDefaultTimer.java index ec632bf279db..7c04da9f93b2 100644 --- a/api/src/test/java/org/apache/iceberg/metrics/TestDefaultTimer.java +++ b/api/src/test/java/org/apache/iceberg/metrics/TestDefaultTimer.java @@ -16,9 +16,11 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.metrics; +import static java.util.concurrent.Executors.newFixedThreadPool; +import static java.util.concurrent.TimeUnit.SECONDS; + import java.time.Duration; import java.util.List; import java.util.concurrent.Callable; @@ -32,9 +34,6 @@ import org.assertj.core.api.Assertions; import org.junit.Test; -import static java.util.concurrent.Executors.newFixedThreadPool; -import static java.util.concurrent.TimeUnit.SECONDS; - public class TestDefaultTimer { @Test @@ -81,13 +80,14 @@ public void closeableTimer() throws InterruptedException { @Test public void measureRunnable() { Timer timer = new DefaultTimer(TimeUnit.NANOSECONDS); - Runnable runnable = () -> { - try { - Thread.sleep(100); - } catch (InterruptedException e) { - throw new RuntimeException(e); - } - }; + Runnable runnable = + () -> { + try { + Thread.sleep(100); + } catch (InterruptedException e) { + throw new RuntimeException(e); + } + }; Assertions.assertThat(timer.count()).isEqualTo(0); Assertions.assertThat(timer.totalDuration()).isEqualTo(Duration.ZERO); @@ -105,14 +105,15 @@ public void measureRunnable() { @Test public void measureCallable() throws Exception { Timer timer = new DefaultTimer(TimeUnit.NANOSECONDS); - Callable callable = () -> { - try { - Thread.sleep(100); - } catch (InterruptedException e) { - throw new RuntimeException(e); - } - return true; - }; + Callable callable = + () -> { + try { + Thread.sleep(100); + } catch (InterruptedException e) { + throw new RuntimeException(e); + } + return true; + }; Assertions.assertThat(timer.count()).isEqualTo(0); Assertions.assertThat(timer.totalDuration()).isEqualTo(Duration.ZERO); @@ -130,14 +131,15 @@ public void measureCallable() throws Exception { @Test public void measureSupplier() { Timer timer = new DefaultTimer(TimeUnit.NANOSECONDS); - Supplier supplier = () -> { - try { - Thread.sleep(100); - } catch (InterruptedException e) { - throw new RuntimeException(e); - } - return true; - }; + Supplier supplier = + () -> { + try { + Thread.sleep(100); + } catch (InterruptedException e) { + throw new RuntimeException(e); + } + return true; + }; Assertions.assertThat(timer.count()).isEqualTo(0); Assertions.assertThat(timer.totalDuration()).isEqualTo(Duration.ZERO); @@ -156,22 +158,24 @@ public void measureSupplier() { public void measureNestedRunnables() { Timer timer = new DefaultTimer(TimeUnit.NANOSECONDS); Timer innerTimer = new DefaultTimer(TimeUnit.NANOSECONDS); - Runnable inner = () -> { - try { - Thread.sleep(100); - } catch (InterruptedException e) { - throw new RuntimeException(e); - } - }; - - Runnable outer = () -> { - try { - Thread.sleep(100); - innerTimer.time(inner); - } catch (InterruptedException e) { - throw new RuntimeException(e); - } - }; + Runnable inner = + () -> { + try { + Thread.sleep(100); + } catch (InterruptedException e) { + throw new 
RuntimeException(e); + } + }; + + Runnable outer = + () -> { + try { + Thread.sleep(100); + innerTimer.time(inner); + } catch (InterruptedException e) { + throw new RuntimeException(e); + } + }; Assertions.assertThat(timer.count()).isEqualTo(0); Assertions.assertThat(timer.totalDuration()).isEqualTo(Duration.ZERO); @@ -196,25 +200,31 @@ public void multiThreadedStarts() throws InterruptedException { CyclicBarrier barrier = new CyclicBarrier(threads); ExecutorService executor = newFixedThreadPool(threads); - List> futures = IntStream.range(0, threads) - .mapToObj(threadNumber -> executor.submit(() -> { - try { - barrier.await(30, SECONDS); - timer.record(5, TimeUnit.NANOSECONDS); - return timer.totalDuration(); - } catch (Exception e) { - throw new RuntimeException(e); - } - })) - .collect(Collectors.toList()); + List> futures = + IntStream.range(0, threads) + .mapToObj( + threadNumber -> + executor.submit( + () -> { + try { + barrier.await(30, SECONDS); + timer.record(5, TimeUnit.NANOSECONDS); + return timer.totalDuration(); + } catch (Exception e) { + throw new RuntimeException(e); + } + })) + .collect(Collectors.toList()); futures.stream() - .map(f -> { - try { - return f.get(30, SECONDS); - } catch (Exception e) { - throw new RuntimeException(e); - } - }).forEach(d -> System.out.println("d = " + d)); + .map( + f -> { + try { + return f.get(30, SECONDS); + } catch (Exception e) { + throw new RuntimeException(e); + } + }) + .forEach(d -> System.out.println("d = " + d)); executor.shutdownNow(); executor.awaitTermination(5, SECONDS); diff --git a/api/src/test/java/org/apache/iceberg/transforms/TestBucketing.java b/api/src/test/java/org/apache/iceberg/transforms/TestBucketing.java index 1175bffa962a..c5bb8c2b2518 100644 --- a/api/src/test/java/org/apache/iceberg/transforms/TestBucketing.java +++ b/api/src/test/java/org/apache/iceberg/transforms/TestBucketing.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
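The TestDefaultTimer hunks above are formatting-only; the Timer usage they exercise is unchanged. As a reading aid, a minimal, self-contained sketch of that usage, limited to calls that appear in the diff (new DefaultTimer(TimeUnit.NANOSECONDS), time(Runnable), record(long, TimeUnit), count(), totalDuration()). The class name and the timed work are illustrative, and the sketch assumes the same org.apache.iceberg.metrics package as the tests so DefaultTimer is visible:

package org.apache.iceberg.metrics;

import java.time.Duration;
import java.util.concurrent.TimeUnit;

public class TimerUsageSketch {
  public static void main(String[] args) {
    // Same constructor the tests use: durations are tracked in nanoseconds.
    Timer timer = new DefaultTimer(TimeUnit.NANOSECONDS);

    // Time a Runnable, as measureRunnable and measureNestedRunnables do.
    timer.time(() -> { /* timed work goes here */ });

    // Record an explicit sample, as multiThreadedStarts does.
    timer.record(5, TimeUnit.NANOSECONDS);

    long samples = timer.count();            // number of recorded samples (2 here)
    Duration total = timer.totalDuration();  // accumulated time across all samples
    System.out.println(samples + " samples, total " + total);
  }
}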
*/ - package org.apache.iceberg.transforms; import java.lang.reflect.Constructor; @@ -62,60 +61,86 @@ public void initRandom() { @Test public void testSpecValues() { - Assert.assertEquals("Spec example: hash(true) = 1392991556", - 1392991556, Bucket.get(Types.IntegerType.get(), 100).hash(1)); - Assert.assertEquals("Spec example: hash(34) = 2017239379", - 2017239379, Bucket.get(Types.IntegerType.get(), 100).hash(34)); - Assert.assertEquals("Spec example: hash(34L) = 2017239379", - 2017239379, Bucket.get(Types.LongType.get(), 100).hash(34L)); - Assert.assertEquals("Spec example: hash(17.11F) = -142385009", - -142385009, new Bucket.BucketFloat(100).hash(1.0F)); - Assert.assertEquals("Spec example: hash(17.11D) = -142385009", - -142385009, new Bucket.BucketDouble(100).hash(1.0D)); - Assert.assertEquals("Spec example: hash(decimal2(14.20)) = -500754589", + Assert.assertEquals( + "Spec example: hash(true) = 1392991556", + 1392991556, + Bucket.get(Types.IntegerType.get(), 100).hash(1)); + Assert.assertEquals( + "Spec example: hash(34) = 2017239379", + 2017239379, + Bucket.get(Types.IntegerType.get(), 100).hash(34)); + Assert.assertEquals( + "Spec example: hash(34L) = 2017239379", + 2017239379, + Bucket.get(Types.LongType.get(), 100).hash(34L)); + Assert.assertEquals( + "Spec example: hash(17.11F) = -142385009", + -142385009, + new Bucket.BucketFloat(100).hash(1.0F)); + Assert.assertEquals( + "Spec example: hash(17.11D) = -142385009", + -142385009, + new Bucket.BucketDouble(100).hash(1.0D)); + Assert.assertEquals( + "Spec example: hash(decimal2(14.20)) = -500754589", -500754589, Bucket.get(Types.DecimalType.of(9, 2), 100).hash(new BigDecimal("14.20"))); - Assert.assertEquals("Spec example: hash(decimal2(14.20)) = -500754589", + Assert.assertEquals( + "Spec example: hash(decimal2(14.20)) = -500754589", -500754589, Bucket.get(Types.DecimalType.of(9, 2), 100).hash(new BigDecimal("14.20"))); Literal date = Literal.of("2017-11-16").to(Types.DateType.get()); - Assert.assertEquals("Spec example: hash(2017-11-16) = -653330422", + Assert.assertEquals( + "Spec example: hash(2017-11-16) = -653330422", -653330422, Bucket.get(Types.DateType.get(), 100).hash(date.value())); Literal timeValue = Literal.of("22:31:08").to(Types.TimeType.get()); - Assert.assertEquals("Spec example: hash(22:31:08) = -662762989", + Assert.assertEquals( + "Spec example: hash(22:31:08) = -662762989", -662762989, Bucket.get(Types.TimeType.get(), 100).hash(timeValue.value())); - Literal timestampVal = Literal.of("2017-11-16T22:31:08") - .to(Types.TimestampType.withoutZone()); - Assert.assertEquals("Spec example: hash(2017-11-16T22:31:08) = -2047944441", + Literal timestampVal = + Literal.of("2017-11-16T22:31:08").to(Types.TimestampType.withoutZone()); + Assert.assertEquals( + "Spec example: hash(2017-11-16T22:31:08) = -2047944441", -2047944441, Bucket.get(Types.TimestampType.withoutZone(), 100).hash(timestampVal.value())); - Literal timestamptzVal = Literal.of("2017-11-16T14:31:08-08:00") - .to(Types.TimestampType.withZone()); - Assert.assertEquals("Spec example: hash(2017-11-16T14:31:08-08:00) = -2047944441", + Literal timestamptzVal = + Literal.of("2017-11-16T14:31:08-08:00").to(Types.TimestampType.withZone()); + Assert.assertEquals( + "Spec example: hash(2017-11-16T14:31:08-08:00) = -2047944441", -2047944441, Bucket.get(Types.TimestampType.withZone(), 100).hash(timestamptzVal.value())); - Assert.assertEquals("Spec example: hash(\"iceberg\") = 1210000089", - 1210000089, Bucket.get(Types.StringType.get(), 100).hash("iceberg")); - 
Assert.assertEquals("Spec example: hash(\"iceberg\") = 1210000089", - 1210000089, Bucket.get(Types.StringType.get(), 100).hash(new Utf8("iceberg"))); + Assert.assertEquals( + "Spec example: hash(\"iceberg\") = 1210000089", + 1210000089, + Bucket.get(Types.StringType.get(), 100).hash("iceberg")); + Assert.assertEquals( + "Spec example: hash(\"iceberg\") = 1210000089", + 1210000089, + Bucket.get(Types.StringType.get(), 100).hash(new Utf8("iceberg"))); - Literal uuid = Literal.of("f79c3e09-677c-4bbd-a479-3f349cb785e7") - .to(Types.UUIDType.get()); - Assert.assertEquals("Spec example: hash(f79c3e09-677c-4bbd-a479-3f349cb785e7) = 1488055340", - 1488055340, Bucket.get(Types.UUIDType.get(), 100).hash(uuid.value())); + Literal uuid = + Literal.of("f79c3e09-677c-4bbd-a479-3f349cb785e7").to(Types.UUIDType.get()); + Assert.assertEquals( + "Spec example: hash(f79c3e09-677c-4bbd-a479-3f349cb785e7) = 1488055340", + 1488055340, + Bucket.get(Types.UUIDType.get(), 100).hash(uuid.value())); ByteBuffer bytes = ByteBuffer.wrap(new byte[] {0, 1, 2, 3}); - Assert.assertEquals("Spec example: hash([00 01 02 03]) = -188683207", - -188683207, Bucket.get(Types.BinaryType.get(), 100).hash(bytes)); - Assert.assertEquals("Spec example: hash([00 01 02 03]) = -188683207", - -188683207, Bucket.get(Types.BinaryType.get(), 100).hash(bytes)); + Assert.assertEquals( + "Spec example: hash([00 01 02 03]) = -188683207", + -188683207, + Bucket.get(Types.BinaryType.get(), 100).hash(bytes)); + Assert.assertEquals( + "Spec example: hash([00 01 02 03]) = -188683207", + -188683207, + Bucket.get(Types.BinaryType.get(), 100).hash(bytes)); } @Test @@ -127,8 +152,10 @@ public void testInteger() { Bucket bucketFunc = Bucket.get(Types.IntegerType.get(), 100); - Assert.assertEquals("Integer hash should match hash of little-endian bytes", - hashBytes(buffer.array()), bucketFunc.hash(num)); + Assert.assertEquals( + "Integer hash should match hash of little-endian bytes", + hashBytes(buffer.array()), + bucketFunc.hash(num)); } @Test @@ -140,8 +167,10 @@ public void testLong() { Bucket bucketFunc = Bucket.get(Types.LongType.get(), 100); - Assert.assertEquals("Long hash should match hash of little-endian bytes", - hashBytes(buffer.array()), bucketFunc.hash(num)); + Assert.assertEquals( + "Long hash should match hash of little-endian bytes", + hashBytes(buffer.array()), + bucketFunc.hash(num)); } @Test @@ -151,8 +180,10 @@ public void testIntegerTypePromotion() { int randomInt = testRandom.nextInt(); - Assert.assertEquals("Integer and Long bucket results should match", - bucketInts.apply(randomInt), bucketLongs.apply((long) randomInt)); + Assert.assertEquals( + "Integer and Long bucket results should match", + bucketInts.apply(randomInt), + bucketLongs.apply((long) randomInt)); } @Test @@ -164,8 +195,10 @@ public void testFloat() { Bucket bucketFunc = new Bucket.BucketFloat(100); - Assert.assertEquals("Float hash should match hash of little-endian bytes", - hashBytes(buffer.array()), bucketFunc.hash(num)); + Assert.assertEquals( + "Float hash should match hash of little-endian bytes", + hashBytes(buffer.array()), + bucketFunc.hash(num)); } @Test @@ -177,8 +210,10 @@ public void testDouble() { Bucket bucketFunc = new Bucket.BucketDouble(100); - Assert.assertEquals("Double hash should match hash of little-endian bytes", - hashBytes(buffer.array()), bucketFunc.hash(num)); + Assert.assertEquals( + "Double hash should match hash of little-endian bytes", + hashBytes(buffer.array()), + bucketFunc.hash(num)); } @Test @@ -188,8 +223,10 @@ public void 
testFloatTypePromotion() { float randomFloat = testRandom.nextFloat(); - Assert.assertEquals("Float and Double bucket results should match", - bucketFloats.apply(randomFloat), bucketDoubles.apply((double) randomFloat)); + Assert.assertEquals( + "Float and Double bucket results should match", + bucketFloats.apply(randomFloat), + bucketDoubles.apply((double) randomFloat)); } @Test @@ -200,8 +237,10 @@ public void testDecimal() { Bucket bucketFunc = Bucket.get(Types.DecimalType.of(9, 2), 100); - Assert.assertEquals("Decimal hash should match hash of backing bytes", - hashBytes(unscaledBytes), bucketFunc.hash(decimal)); + Assert.assertEquals( + "Decimal hash should match hash of backing bytes", + hashBytes(unscaledBytes), + bucketFunc.hash(decimal)); } @Test @@ -211,20 +250,25 @@ public void testString() { Bucket bucketFunc = Bucket.get(Types.StringType.get(), 100); - Assert.assertEquals("String hash should match hash of UTF-8 bytes", - hashBytes(asBytes), bucketFunc.hash(string)); + Assert.assertEquals( + "String hash should match hash of UTF-8 bytes", + hashBytes(asBytes), + bucketFunc.hash(string)); } @Test public void testStringWithSurrogatePair() { String string = "string with a surrogate pair: 💰"; - Assert.assertNotEquals("string has no surrogate pairs", string.length(), string.codePoints().count()); + Assert.assertNotEquals( + "string has no surrogate pairs", string.length(), string.codePoints().count()); byte[] asBytes = string.getBytes(StandardCharsets.UTF_8); Bucket bucketFunc = Bucket.get(Types.StringType.get(), 100); - Assert.assertEquals("String hash should match hash of UTF-8 bytes", - hashBytes(asBytes), bucketFunc.hash(string)); + Assert.assertEquals( + "String hash should match hash of UTF-8 bytes", + hashBytes(asBytes), + bucketFunc.hash(string)); } @Test @@ -234,8 +278,8 @@ public void testUtf8() { Bucket bucketFunc = Bucket.get(Types.StringType.get(), 100); - Assert.assertEquals("String hash should match hash of UTF-8 bytes", - hashBytes(asBytes), bucketFunc.hash(utf8)); + Assert.assertEquals( + "String hash should match hash of UTF-8 bytes", hashBytes(asBytes), bucketFunc.hash(utf8)); } @Test @@ -247,7 +291,8 @@ public void testByteBufferOnHeap() { Assert.assertEquals( "HeapByteBuffer hash should match hash for correct slice", - hashBytes(bytes, 5, 100), bucketFunc.hash(buffer)); + hashBytes(bytes, 5, 100), + bucketFunc.hash(buffer)); // verify that the buffer was not modified Assert.assertEquals("Buffer position should not change", 5, buffer.position()); @@ -270,7 +315,8 @@ public void testByteBufferOffHeap() { Assert.assertEquals( "DirectByteBuffer hash should match hash for correct slice", - hashBytes(bytes, 5, 100), bucketFunc.hash(buffer)); + hashBytes(bytes, 5, 100), + bucketFunc.hash(buffer)); // verify that the buffer was not modified Assert.assertEquals("Buffer position should not change", 5, buffer.position()); @@ -284,13 +330,16 @@ public void testUUIDHash() { Bucket bucketFunc = Bucket.get(Types.UUIDType.get(), 100); - Assert.assertEquals("UUID hash should match hash of backing bytes", - hashBytes(uuidBytes), bucketFunc.hash(uuid)); + Assert.assertEquals( + "UUID hash should match hash of backing bytes", + hashBytes(uuidBytes), + bucketFunc.hash(uuid)); } @Test public void testVerifiedIllegalNumBuckets() { - AssertHelpers.assertThrows("Should fail if numBucket is less than or equal to zero", + AssertHelpers.assertThrows( + "Should fail if numBucket is less than or equal to zero", IllegalArgumentException.class, "Invalid number of buckets: 0 (must be > 0)", () 
-> Bucket.get(Types.IntegerType.get(), 0)); @@ -312,6 +361,7 @@ private int hashBytes(byte[] bytes, int offset, int length) { /** * This method returns a UUID for the bytes in the array without modification. + * * @param bytes a 16-byte array * @return a UUID for the bytes */ diff --git a/api/src/test/java/org/apache/iceberg/transforms/TestBucketingProjection.java b/api/src/test/java/org/apache/iceberg/transforms/TestBucketingProjection.java index af487ffdb288..25f10f06ecaf 100644 --- a/api/src/test/java/org/apache/iceberg/transforms/TestBucketingProjection.java +++ b/api/src/test/java/org/apache/iceberg/transforms/TestBucketingProjection.java @@ -16,9 +16,19 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.transforms; +import static org.apache.iceberg.TestHelpers.assertAndUnwrapUnbound; +import static org.apache.iceberg.expressions.Expressions.equal; +import static org.apache.iceberg.expressions.Expressions.greaterThan; +import static org.apache.iceberg.expressions.Expressions.greaterThanOrEqual; +import static org.apache.iceberg.expressions.Expressions.in; +import static org.apache.iceberg.expressions.Expressions.lessThan; +import static org.apache.iceberg.expressions.Expressions.lessThanOrEqual; +import static org.apache.iceberg.expressions.Expressions.notEqual; +import static org.apache.iceberg.expressions.Expressions.notIn; +import static org.apache.iceberg.types.Types.NestedField.optional; + import java.math.BigDecimal; import java.nio.ByteBuffer; import java.util.UUID; @@ -35,34 +45,31 @@ import org.junit.Assert; import org.junit.Test; -import static org.apache.iceberg.TestHelpers.assertAndUnwrapUnbound; -import static org.apache.iceberg.expressions.Expressions.equal; -import static org.apache.iceberg.expressions.Expressions.greaterThan; -import static org.apache.iceberg.expressions.Expressions.greaterThanOrEqual; -import static org.apache.iceberg.expressions.Expressions.in; -import static org.apache.iceberg.expressions.Expressions.lessThan; -import static org.apache.iceberg.expressions.Expressions.lessThanOrEqual; -import static org.apache.iceberg.expressions.Expressions.notEqual; -import static org.apache.iceberg.expressions.Expressions.notIn; -import static org.apache.iceberg.types.Types.NestedField.optional; - public class TestBucketingProjection { - public void assertProjectionStrict(PartitionSpec spec, UnboundPredicate filter, - Expression.Operation expectedOp, String expectedLiteral) { + public void assertProjectionStrict( + PartitionSpec spec, + UnboundPredicate filter, + Expression.Operation expectedOp, + String expectedLiteral) { Expression projection = Projections.strict(spec).project(filter); UnboundPredicate predicate = assertAndUnwrapUnbound(projection); Assert.assertEquals(expectedOp, predicate.op()); - Assert.assertNotEquals("Strict projection never runs for IN", Expression.Operation.IN, predicate.op()); + Assert.assertNotEquals( + "Strict projection never runs for IN", Expression.Operation.IN, predicate.op()); Bucket transform = (Bucket) spec.getFieldsBySourceId(1).get(0).transform(); if (predicate.op() == Expression.Operation.NOT_IN) { Iterable values = Iterables.transform(predicate.literals(), Literal::value); - String actual = Lists.newArrayList(values).stream().sorted() - .map(v -> transform.toHumanString(v)).collect(Collectors.toList()).toString(); + String actual = + Lists.newArrayList(values).stream() + .sorted() + .map(v -> transform.toHumanString(v)) + .collect(Collectors.toList()) + 
.toString(); Assert.assertEquals(expectedLiteral, actual); } else { Literal literal = predicate.literal(); @@ -71,32 +78,40 @@ public void assertProjectionStrict(PartitionSpec spec, UnboundPredicate filte } } - public void assertProjectionStrictValue(PartitionSpec spec, UnboundPredicate filter, - Expression.Operation expectedOp) { + public void assertProjectionStrictValue( + PartitionSpec spec, UnboundPredicate filter, Expression.Operation expectedOp) { Expression projection = Projections.strict(spec).project(filter); Assert.assertEquals(projection.op(), expectedOp); } - public void assertProjectionInclusiveValue(PartitionSpec spec, UnboundPredicate filter, - Expression.Operation expectedOp) { + public void assertProjectionInclusiveValue( + PartitionSpec spec, UnboundPredicate filter, Expression.Operation expectedOp) { Expression projection = Projections.inclusive(spec).project(filter); Assert.assertEquals(projection.op(), expectedOp); } - public void assertProjectionInclusive(PartitionSpec spec, UnboundPredicate filter, - Expression.Operation expectedOp, String expectedLiteral) { + public void assertProjectionInclusive( + PartitionSpec spec, + UnboundPredicate filter, + Expression.Operation expectedOp, + String expectedLiteral) { Expression projection = Projections.inclusive(spec).project(filter); UnboundPredicate predicate = assertAndUnwrapUnbound(projection); Assert.assertEquals(predicate.op(), expectedOp); - Assert.assertNotEquals("Inclusive projection never runs for NOT_IN", Expression.Operation.NOT_IN, predicate.op()); + Assert.assertNotEquals( + "Inclusive projection never runs for NOT_IN", Expression.Operation.NOT_IN, predicate.op()); Bucket transform = (Bucket) spec.getFieldsBySourceId(1).get(0).transform(); if (predicate.op() == Expression.Operation.IN) { Iterable values = Iterables.transform(predicate.literals(), Literal::value); - String actual = Lists.newArrayList(values).stream().sorted() - .map(v -> transform.toHumanString(v)).collect(Collectors.toList()).toString(); + String actual = + Lists.newArrayList(values).stream() + .sorted() + .map(v -> transform.toHumanString(v)) + .collect(Collectors.toList()) + .toString(); Assert.assertEquals(expectedLiteral, actual); } else { Literal literal = predicate.literal(); @@ -117,10 +132,14 @@ public void testBucketIntegerStrict() { assertProjectionStrictValue(spec, lessThan("value", value), Expression.Operation.FALSE); assertProjectionStrictValue(spec, lessThanOrEqual("value", value), Expression.Operation.FALSE); assertProjectionStrictValue(spec, greaterThan("value", value), Expression.Operation.FALSE); - assertProjectionStrictValue(spec, greaterThanOrEqual("value", value), Expression.Operation.FALSE); - - assertProjectionStrict(spec, notIn("value", value - 1, value, value + 1), - Expression.Operation.NOT_IN, "[6, 7, 8]"); + assertProjectionStrictValue( + spec, greaterThanOrEqual("value", value), Expression.Operation.FALSE); + + assertProjectionStrict( + spec, + notIn("value", value - 1, value, value + 1), + Expression.Operation.NOT_IN, + "[6, 7, 8]"); assertProjectionStrictValue(spec, in("value", value, value + 1), Expression.Operation.FALSE); } @@ -134,13 +153,16 @@ public void testBucketIntegerInclusive() { assertProjectionInclusive(spec, equal("value", value), Expression.Operation.EQ, "6"); assertProjectionInclusiveValue(spec, notEqual("value", value), Expression.Operation.TRUE); assertProjectionInclusiveValue(spec, lessThan("value", value), Expression.Operation.TRUE); - assertProjectionInclusiveValue(spec, 
lessThanOrEqual("value", value), Expression.Operation.TRUE); + assertProjectionInclusiveValue( + spec, lessThanOrEqual("value", value), Expression.Operation.TRUE); assertProjectionInclusiveValue(spec, greaterThan("value", value), Expression.Operation.TRUE); - assertProjectionInclusiveValue(spec, greaterThanOrEqual("value", value), Expression.Operation.TRUE); + assertProjectionInclusiveValue( + spec, greaterThanOrEqual("value", value), Expression.Operation.TRUE); - assertProjectionInclusive(spec, in("value", value - 1, value, value + 1), - Expression.Operation.IN, "[6, 7, 8]"); - assertProjectionInclusiveValue(spec, notIn("value", value, value + 1), Expression.Operation.TRUE); + assertProjectionInclusive( + spec, in("value", value - 1, value, value + 1), Expression.Operation.IN, "[6, 7, 8]"); + assertProjectionInclusiveValue( + spec, notIn("value", value, value + 1), Expression.Operation.TRUE); } // all types @@ -157,10 +179,14 @@ public void testBucketLongStrict() { assertProjectionStrictValue(spec, lessThan("value", value), Expression.Operation.FALSE); assertProjectionStrictValue(spec, lessThanOrEqual("value", value), Expression.Operation.FALSE); assertProjectionStrictValue(spec, greaterThan("value", value), Expression.Operation.FALSE); - assertProjectionStrictValue(spec, greaterThanOrEqual("value", value), Expression.Operation.FALSE); - - assertProjectionStrict(spec, notIn("value", value - 1, value, value + 1), - Expression.Operation.NOT_IN, "[6, 7, 8]"); + assertProjectionStrictValue( + spec, greaterThanOrEqual("value", value), Expression.Operation.FALSE); + + assertProjectionStrict( + spec, + notIn("value", value - 1, value, value + 1), + Expression.Operation.NOT_IN, + "[6, 7, 8]"); assertProjectionStrictValue(spec, in("value", value, value + 1), Expression.Operation.FALSE); } @@ -174,13 +200,16 @@ public void testBucketLongInclusive() { assertProjectionInclusive(spec, equal("value", value), Expression.Operation.EQ, "6"); assertProjectionInclusiveValue(spec, notEqual("value", value), Expression.Operation.TRUE); assertProjectionInclusiveValue(spec, lessThan("value", value), Expression.Operation.TRUE); - assertProjectionInclusiveValue(spec, lessThanOrEqual("value", value), Expression.Operation.TRUE); + assertProjectionInclusiveValue( + spec, lessThanOrEqual("value", value), Expression.Operation.TRUE); assertProjectionInclusiveValue(spec, greaterThan("value", value), Expression.Operation.TRUE); - assertProjectionInclusiveValue(spec, greaterThanOrEqual("value", value), Expression.Operation.TRUE); + assertProjectionInclusiveValue( + spec, greaterThanOrEqual("value", value), Expression.Operation.TRUE); - assertProjectionInclusive(spec, in("value", value - 1, value, value + 1), - Expression.Operation.IN, "[6, 7, 8]"); - assertProjectionInclusiveValue(spec, notIn("value", value, value + 1), Expression.Operation.TRUE); + assertProjectionInclusive( + spec, in("value", value - 1, value, value + 1), Expression.Operation.IN, "[6, 7, 8]"); + assertProjectionInclusiveValue( + spec, notIn("value", value, value + 1), Expression.Operation.TRUE); } @Test @@ -196,12 +225,17 @@ public void testBucketDecimalStrict() { assertProjectionStrictValue(spec, lessThan("value", value), Expression.Operation.FALSE); assertProjectionStrictValue(spec, lessThanOrEqual("value", value), Expression.Operation.FALSE); assertProjectionStrictValue(spec, greaterThan("value", value), Expression.Operation.FALSE); - assertProjectionStrictValue(spec, greaterThanOrEqual("value", value), Expression.Operation.FALSE); + 
assertProjectionStrictValue( + spec, greaterThanOrEqual("value", value), Expression.Operation.FALSE); BigDecimal delta = new BigDecimal(1); - assertProjectionStrict(spec, notIn("value", value.add(delta), value, value.subtract(delta)), - Expression.Operation.NOT_IN, "[2, 2, 6]"); - assertProjectionStrictValue(spec, in("value", value, value.add(delta)), Expression.Operation.FALSE); + assertProjectionStrict( + spec, + notIn("value", value.add(delta), value, value.subtract(delta)), + Expression.Operation.NOT_IN, + "[2, 2, 6]"); + assertProjectionStrictValue( + spec, in("value", value, value.add(delta)), Expression.Operation.FALSE); } @Test @@ -215,14 +249,20 @@ public void testBucketDecimalInclusive() { assertProjectionInclusive(spec, equal("value", value), Expression.Operation.EQ, "2"); assertProjectionInclusiveValue(spec, notEqual("value", value), Expression.Operation.TRUE); assertProjectionInclusiveValue(spec, lessThan("value", value), Expression.Operation.TRUE); - assertProjectionInclusiveValue(spec, lessThanOrEqual("value", value), Expression.Operation.TRUE); + assertProjectionInclusiveValue( + spec, lessThanOrEqual("value", value), Expression.Operation.TRUE); assertProjectionInclusiveValue(spec, greaterThan("value", value), Expression.Operation.TRUE); - assertProjectionInclusiveValue(spec, greaterThanOrEqual("value", value), Expression.Operation.TRUE); + assertProjectionInclusiveValue( + spec, greaterThanOrEqual("value", value), Expression.Operation.TRUE); BigDecimal delta = new BigDecimal(1); - assertProjectionInclusive(spec, in("value", value.add(delta), value, value.subtract(delta)), - Expression.Operation.IN, "[2, 2, 6]"); - assertProjectionInclusiveValue(spec, notIn("value", value, value.add(delta)), Expression.Operation.TRUE); + assertProjectionInclusive( + spec, + in("value", value.add(delta), value, value.subtract(delta)), + Expression.Operation.IN, + "[2, 2, 6]"); + assertProjectionInclusiveValue( + spec, notIn("value", value, value.add(delta)), Expression.Operation.TRUE); } @Test @@ -237,11 +277,13 @@ public void testBucketStringStrict() { assertProjectionStrictValue(spec, lessThan("value", value), Expression.Operation.FALSE); assertProjectionStrictValue(spec, lessThanOrEqual("value", value), Expression.Operation.FALSE); assertProjectionStrictValue(spec, greaterThan("value", value), Expression.Operation.FALSE); - assertProjectionStrictValue(spec, greaterThanOrEqual("value", value), Expression.Operation.FALSE); + assertProjectionStrictValue( + spec, greaterThanOrEqual("value", value), Expression.Operation.FALSE); - assertProjectionStrict(spec, notIn("value", value, value + "abc"), - Expression.Operation.NOT_IN, "[4, 9]"); - assertProjectionStrictValue(spec, in("value", value, value + "abc"), Expression.Operation.FALSE); + assertProjectionStrict( + spec, notIn("value", value, value + "abc"), Expression.Operation.NOT_IN, "[4, 9]"); + assertProjectionStrictValue( + spec, in("value", value, value + "abc"), Expression.Operation.FALSE); } @Test @@ -254,13 +296,16 @@ public void testBucketStringInclusive() { assertProjectionInclusive(spec, equal("value", value), Expression.Operation.EQ, "4"); assertProjectionInclusiveValue(spec, notEqual("value", value), Expression.Operation.TRUE); assertProjectionInclusiveValue(spec, lessThan("value", value), Expression.Operation.TRUE); - assertProjectionInclusiveValue(spec, lessThanOrEqual("value", value), Expression.Operation.TRUE); + assertProjectionInclusiveValue( + spec, lessThanOrEqual("value", value), Expression.Operation.TRUE); 
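// Context for the assertions in these bucket-projection tests: bucketing does not preserve value
// ordering, so only equality and (not-)in predicates can be mapped to concrete bucket ids; range
// predicates (<, <=, >, >=) project to alwaysTrue() under inclusive projection and to
// alwaysFalse() under strict projection, which is exactly what the surrounding checks assert.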
assertProjectionInclusiveValue(spec, greaterThan("value", value), Expression.Operation.TRUE); - assertProjectionInclusiveValue(spec, greaterThanOrEqual("value", value), Expression.Operation.TRUE); + assertProjectionInclusiveValue( + spec, greaterThanOrEqual("value", value), Expression.Operation.TRUE); - assertProjectionInclusive(spec, in("value", value, value + "abc"), - Expression.Operation.IN, "[4, 9]"); - assertProjectionInclusiveValue(spec, notIn("value", value, value + "abc"), Expression.Operation.TRUE); + assertProjectionInclusive( + spec, in("value", value, value + "abc"), Expression.Operation.IN, "[4, 9]"); + assertProjectionInclusiveValue( + spec, notIn("value", value, value + "abc"), Expression.Operation.TRUE); } @Test @@ -275,11 +320,12 @@ public void testBucketByteBufferStrict() throws Exception { assertProjectionStrictValue(spec, lessThan("value", value), Expression.Operation.FALSE); assertProjectionStrictValue(spec, lessThanOrEqual("value", value), Expression.Operation.FALSE); assertProjectionStrictValue(spec, greaterThan("value", value), Expression.Operation.FALSE); - assertProjectionStrictValue(spec, greaterThanOrEqual("value", value), Expression.Operation.FALSE); + assertProjectionStrictValue( + spec, greaterThanOrEqual("value", value), Expression.Operation.FALSE); ByteBuffer anotherValue = ByteBuffer.wrap("abcdehij".getBytes("UTF-8")); - assertProjectionStrict(spec, notIn("value", value, anotherValue), - Expression.Operation.NOT_IN, "[4, 6]"); + assertProjectionStrict( + spec, notIn("value", value, anotherValue), Expression.Operation.NOT_IN, "[4, 6]"); assertProjectionStrictValue(spec, in("value", value, anotherValue), Expression.Operation.FALSE); } @@ -293,14 +339,17 @@ public void testBucketByteBufferInclusive() throws Exception { assertProjectionInclusive(spec, equal("value", value), Expression.Operation.EQ, "4"); assertProjectionInclusiveValue(spec, notEqual("value", value), Expression.Operation.TRUE); assertProjectionInclusiveValue(spec, lessThan("value", value), Expression.Operation.TRUE); - assertProjectionInclusiveValue(spec, lessThanOrEqual("value", value), Expression.Operation.TRUE); + assertProjectionInclusiveValue( + spec, lessThanOrEqual("value", value), Expression.Operation.TRUE); assertProjectionInclusiveValue(spec, greaterThan("value", value), Expression.Operation.TRUE); - assertProjectionInclusiveValue(spec, greaterThanOrEqual("value", value), Expression.Operation.TRUE); + assertProjectionInclusiveValue( + spec, greaterThanOrEqual("value", value), Expression.Operation.TRUE); ByteBuffer anotherValue = ByteBuffer.wrap("abcdehij".getBytes("UTF-8")); - assertProjectionInclusive(spec, in("value", value, anotherValue), - Expression.Operation.IN, "[4, 6]"); - assertProjectionInclusiveValue(spec, notIn("value", value, anotherValue), Expression.Operation.TRUE); + assertProjectionInclusive( + spec, in("value", value, anotherValue), Expression.Operation.IN, "[4, 6]"); + assertProjectionInclusiveValue( + spec, notIn("value", value, anotherValue), Expression.Operation.TRUE); } @Test @@ -315,11 +364,12 @@ public void testBucketUUIDStrict() { assertProjectionStrictValue(spec, lessThan("value", value), Expression.Operation.FALSE); assertProjectionStrictValue(spec, lessThanOrEqual("value", value), Expression.Operation.FALSE); assertProjectionStrictValue(spec, greaterThan("value", value), Expression.Operation.FALSE); - assertProjectionStrictValue(spec, greaterThanOrEqual("value", value), Expression.Operation.FALSE); + assertProjectionStrictValue( + spec, 
greaterThanOrEqual("value", value), Expression.Operation.FALSE); UUID anotherValue = new UUID(456L, 123L); - assertProjectionStrict(spec, notIn("value", value, anotherValue), - Expression.Operation.NOT_IN, "[4, 6]"); + assertProjectionStrict( + spec, notIn("value", value, anotherValue), Expression.Operation.NOT_IN, "[4, 6]"); assertProjectionStrictValue(spec, in("value", value, anotherValue), Expression.Operation.FALSE); } @@ -333,13 +383,16 @@ public void testBucketUUIDInclusive() { assertProjectionInclusive(spec, equal("value", value), Expression.Operation.EQ, "4"); assertProjectionInclusiveValue(spec, notEqual("value", value), Expression.Operation.TRUE); assertProjectionInclusiveValue(spec, lessThan("value", value), Expression.Operation.TRUE); - assertProjectionInclusiveValue(spec, lessThanOrEqual("value", value), Expression.Operation.TRUE); + assertProjectionInclusiveValue( + spec, lessThanOrEqual("value", value), Expression.Operation.TRUE); assertProjectionInclusiveValue(spec, greaterThan("value", value), Expression.Operation.TRUE); - assertProjectionInclusiveValue(spec, greaterThanOrEqual("value", value), Expression.Operation.TRUE); + assertProjectionInclusiveValue( + spec, greaterThanOrEqual("value", value), Expression.Operation.TRUE); UUID anotherValue = new UUID(456L, 123L); - assertProjectionInclusive(spec, in("value", value, anotherValue), - Expression.Operation.IN, "[4, 6]"); - assertProjectionInclusiveValue(spec, notIn("value", value, anotherValue), Expression.Operation.TRUE); + assertProjectionInclusive( + spec, in("value", value, anotherValue), Expression.Operation.IN, "[4, 6]"); + assertProjectionInclusiveValue( + spec, notIn("value", value, anotherValue), Expression.Operation.TRUE); } } diff --git a/api/src/test/java/org/apache/iceberg/transforms/TestDates.java b/api/src/test/java/org/apache/iceberg/transforms/TestDates.java index f9479f8c3dae..39829221d6b5 100644 --- a/api/src/test/java/org/apache/iceberg/transforms/TestDates.java +++ b/api/src/test/java/org/apache/iceberg/transforms/TestDates.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
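All of the TestBucketingProjection tests above follow the same pattern: build a partition spec with a bucket field, project a row filter through it with Projections, and check the resulting partition predicate. A condensed sketch of that pattern follows; the schema, the bucket width (10), and the printed output are illustrative rather than taken from the diff:

import static org.apache.iceberg.types.Types.NestedField.optional;

import org.apache.iceberg.PartitionSpec;
import org.apache.iceberg.Schema;
import org.apache.iceberg.expressions.Expression;
import org.apache.iceberg.expressions.Expressions;
import org.apache.iceberg.expressions.Projections;
import org.apache.iceberg.types.Types;

public class BucketProjectionSketch {
  public static void main(String[] args) {
    Schema schema = new Schema(optional(1, "value", Types.IntegerType.get()));
    PartitionSpec spec = PartitionSpec.builderFor(schema).bucket("value", 10).build();

    // An equality predicate can be pushed down to a single bucket id...
    Expression eq = Projections.inclusive(spec).project(Expressions.equal("value", 100));

    // ...while a range predicate cannot: inclusive projection yields alwaysTrue() and
    // strict projection yields alwaysFalse(), matching the assertions above.
    Expression lt = Projections.strict(spec).project(Expressions.lessThan("value", 100));

    System.out.println(eq + " / " + lt);
  }
}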
*/ - package org.apache.iceberg.transforms; import org.apache.iceberg.expressions.Literal; @@ -55,16 +54,22 @@ public void testDateToHumanString() { Literal date = Literal.of("2017-12-01").to(type); Transform year = Transforms.year(type); - Assert.assertEquals("Should produce the correct Human string", - "2017", year.toHumanString(year.apply(date.value()))); + Assert.assertEquals( + "Should produce the correct Human string", + "2017", + year.toHumanString(year.apply(date.value()))); Transform month = Transforms.month(type); - Assert.assertEquals("Should produce the correct Human string", - "2017-12", month.toHumanString(month.apply(date.value()))); + Assert.assertEquals( + "Should produce the correct Human string", + "2017-12", + month.toHumanString(month.apply(date.value()))); Transform day = Transforms.day(type); - Assert.assertEquals("Should produce the correct Human string", - "2017-12-01", day.toHumanString(day.apply(date.value()))); + Assert.assertEquals( + "Should produce the correct Human string", + "2017-12-01", + day.toHumanString(day.apply(date.value()))); } @Test @@ -73,16 +78,22 @@ public void testNegativeDateToHumanString() { Literal date = Literal.of("1969-12-30").to(type); Transform year = Transforms.year(type); - Assert.assertEquals("Should produce the correct Human string", - "1969", year.toHumanString(year.apply(date.value()))); + Assert.assertEquals( + "Should produce the correct Human string", + "1969", + year.toHumanString(year.apply(date.value()))); Transform month = Transforms.month(type); - Assert.assertEquals("Should produce the correct Human string", - "1969-12", month.toHumanString(month.apply(date.value()))); + Assert.assertEquals( + "Should produce the correct Human string", + "1969-12", + month.toHumanString(month.apply(date.value()))); Transform day = Transforms.day(type); - Assert.assertEquals("Should produce the correct Human string", - "1969-12-30", day.toHumanString(day.apply(date.value()))); + Assert.assertEquals( + "Should produce the correct Human string", + "1969-12-30", + day.toHumanString(day.apply(date.value()))); } @Test @@ -91,16 +102,22 @@ public void testDateToHumanStringLowerBound() { Literal date = Literal.of("1970-01-01").to(type); Transform year = Transforms.year(type); - Assert.assertEquals("Should produce the correct Human string", - "1970", year.toHumanString(year.apply(date.value()))); + Assert.assertEquals( + "Should produce the correct Human string", + "1970", + year.toHumanString(year.apply(date.value()))); Transform month = Transforms.month(type); - Assert.assertEquals("Should produce the correct Human string", - "1970-01", month.toHumanString(month.apply(date.value()))); + Assert.assertEquals( + "Should produce the correct Human string", + "1970-01", + month.toHumanString(month.apply(date.value()))); Transform day = Transforms.day(type); - Assert.assertEquals("Should produce the correct Human string", - "1970-01-01", day.toHumanString(day.apply(date.value()))); + Assert.assertEquals( + "Should produce the correct Human string", + "1970-01-01", + day.toHumanString(day.apply(date.value()))); } @Test @@ -109,16 +126,22 @@ public void testNegativeDateToHumanStringLowerBound() { Literal date = Literal.of("1969-01-01").to(type); Transform year = Transforms.year(type); - Assert.assertEquals("Should produce the correct Human string", - "1969", year.toHumanString(year.apply(date.value()))); + Assert.assertEquals( + "Should produce the correct Human string", + "1969", + year.toHumanString(year.apply(date.value()))); Transform month = 
Transforms.month(type); - Assert.assertEquals("Should produce the correct Human string", - "1969-01", month.toHumanString(month.apply(date.value()))); + Assert.assertEquals( + "Should produce the correct Human string", + "1969-01", + month.toHumanString(month.apply(date.value()))); Transform day = Transforms.day(type); - Assert.assertEquals("Should produce the correct Human string", - "1969-01-01", day.toHumanString(day.apply(date.value()))); + Assert.assertEquals( + "Should produce the correct Human string", + "1969-01-01", + day.toHumanString(day.apply(date.value()))); } @Test @@ -127,27 +150,33 @@ public void testNegativeDateToHumanStringUpperBound() { Literal date = Literal.of("1969-12-31").to(type); Transform year = Transforms.year(type); - Assert.assertEquals("Should produce the correct Human string", - "1969", year.toHumanString(year.apply(date.value()))); + Assert.assertEquals( + "Should produce the correct Human string", + "1969", + year.toHumanString(year.apply(date.value()))); Transform month = Transforms.month(type); - Assert.assertEquals("Should produce the correct Human string", - "1969-12", month.toHumanString(month.apply(date.value()))); + Assert.assertEquals( + "Should produce the correct Human string", + "1969-12", + month.toHumanString(month.apply(date.value()))); Transform day = Transforms.day(type); - Assert.assertEquals("Should produce the correct Human string", - "1969-12-31", day.toHumanString(day.apply(date.value()))); + Assert.assertEquals( + "Should produce the correct Human string", + "1969-12-31", + day.toHumanString(day.apply(date.value()))); } @Test public void testNullHumanString() { Types.DateType type = Types.DateType.get(); - Assert.assertEquals("Should produce \"null\" for null", - "null", Transforms.year(type).toHumanString(null)); - Assert.assertEquals("Should produce \"null\" for null", - "null", Transforms.month(type).toHumanString(null)); - Assert.assertEquals("Should produce \"null\" for null", - "null", Transforms.day(type).toHumanString(null)); + Assert.assertEquals( + "Should produce \"null\" for null", "null", Transforms.year(type).toHumanString(null)); + Assert.assertEquals( + "Should produce \"null\" for null", "null", Transforms.month(type).toHumanString(null)); + Assert.assertEquals( + "Should produce \"null\" for null", "null", Transforms.day(type).toHumanString(null)); } @Test diff --git a/api/src/test/java/org/apache/iceberg/transforms/TestDatesProjection.java b/api/src/test/java/org/apache/iceberg/transforms/TestDatesProjection.java index 301f35c4f613..a6abdc8f93eb 100644 --- a/api/src/test/java/org/apache/iceberg/transforms/TestDatesProjection.java +++ b/api/src/test/java/org/apache/iceberg/transforms/TestDatesProjection.java @@ -16,9 +16,19 @@ * specific language governing permissions and limitations * under the License. 
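TestDates above pins down the human-readable form of the year/month/day transforms. The same behaviour, condensed into a standalone sketch; the class name is illustrative, and the generic parameters on Literal and Transform are assumed from the test context (the diff elides them):

import org.apache.iceberg.expressions.Literal;
import org.apache.iceberg.transforms.Transform;
import org.apache.iceberg.transforms.Transforms;
import org.apache.iceberg.types.Types;

public class DateTransformSketch {
  public static void main(String[] args) {
    Types.DateType type = Types.DateType.get();
    Literal<Integer> date = Literal.of("2017-12-01").to(type);

    Transform<Integer, Integer> year = Transforms.year(type);
    Transform<Integer, Integer> month = Transforms.month(type);
    Transform<Integer, Integer> day = Transforms.day(type);

    // Prints "2017", "2017-12", "2017-12-01", matching the assertions in testDateToHumanString.
    System.out.println(year.toHumanString(year.apply(date.value())));
    System.out.println(month.toHumanString(month.apply(date.value())));
    System.out.println(day.toHumanString(day.apply(date.value())));
  }
}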
*/ - package org.apache.iceberg.transforms; +import static org.apache.iceberg.TestHelpers.assertAndUnwrapUnbound; +import static org.apache.iceberg.expressions.Expressions.equal; +import static org.apache.iceberg.expressions.Expressions.greaterThan; +import static org.apache.iceberg.expressions.Expressions.greaterThanOrEqual; +import static org.apache.iceberg.expressions.Expressions.in; +import static org.apache.iceberg.expressions.Expressions.lessThan; +import static org.apache.iceberg.expressions.Expressions.lessThanOrEqual; +import static org.apache.iceberg.expressions.Expressions.notEqual; +import static org.apache.iceberg.expressions.Expressions.notIn; +import static org.apache.iceberg.types.Types.NestedField.optional; + import java.util.stream.Collectors; import org.apache.iceberg.PartitionSpec; import org.apache.iceberg.Schema; @@ -32,36 +42,33 @@ import org.junit.Assert; import org.junit.Test; -import static org.apache.iceberg.TestHelpers.assertAndUnwrapUnbound; -import static org.apache.iceberg.expressions.Expressions.equal; -import static org.apache.iceberg.expressions.Expressions.greaterThan; -import static org.apache.iceberg.expressions.Expressions.greaterThanOrEqual; -import static org.apache.iceberg.expressions.Expressions.in; -import static org.apache.iceberg.expressions.Expressions.lessThan; -import static org.apache.iceberg.expressions.Expressions.lessThanOrEqual; -import static org.apache.iceberg.expressions.Expressions.notEqual; -import static org.apache.iceberg.expressions.Expressions.notIn; -import static org.apache.iceberg.types.Types.NestedField.optional; - public class TestDatesProjection { private static final Types.DateType TYPE = Types.DateType.get(); private static final Schema SCHEMA = new Schema(optional(1, "date", TYPE)); - public void assertProjectionStrict(PartitionSpec spec, UnboundPredicate filter, - Expression.Operation expectedOp, String expectedLiteral) { + public void assertProjectionStrict( + PartitionSpec spec, + UnboundPredicate filter, + Expression.Operation expectedOp, + String expectedLiteral) { Expression projection = Projections.strict(spec).project(filter); UnboundPredicate predicate = assertAndUnwrapUnbound(projection); Assert.assertEquals(expectedOp, predicate.op()); - Assert.assertNotEquals("Strict projection never runs for IN", Expression.Operation.IN, predicate.op()); + Assert.assertNotEquals( + "Strict projection never runs for IN", Expression.Operation.IN, predicate.op()); Dates transform = (Dates) spec.getFieldsBySourceId(1).get(0).transform(); if (predicate.op() == Expression.Operation.NOT_IN) { Iterable values = Iterables.transform(predicate.literals(), Literal::value); - String actual = Lists.newArrayList(values).stream().sorted() - .map(v -> transform.toHumanString((Integer) v)).collect(Collectors.toList()).toString(); + String actual = + Lists.newArrayList(values).stream() + .sorted() + .map(v -> transform.toHumanString((Integer) v)) + .collect(Collectors.toList()) + .toString(); Assert.assertEquals(expectedLiteral, actual); } else { Literal literal = predicate.literal(); @@ -70,34 +77,42 @@ public void assertProjectionStrict(PartitionSpec spec, UnboundPredicate filte } } - public void assertProjectionStrictValue(PartitionSpec spec, UnboundPredicate filter, - Expression.Operation expectedOp) { + public void assertProjectionStrictValue( + PartitionSpec spec, UnboundPredicate filter, Expression.Operation expectedOp) { Expression projection = Projections.strict(spec).project(filter); Assert.assertEquals(expectedOp, projection.op()); 
} - public void assertProjectionInclusiveValue(PartitionSpec spec, UnboundPredicate filter, - Expression.Operation expectedOp) { + public void assertProjectionInclusiveValue( + PartitionSpec spec, UnboundPredicate filter, Expression.Operation expectedOp) { Expression projection = Projections.inclusive(spec).project(filter); Assert.assertEquals(expectedOp, projection.op()); } - public void assertProjectionInclusive(PartitionSpec spec, UnboundPredicate filter, - Expression.Operation expectedOp, String expectedLiteral) { + public void assertProjectionInclusive( + PartitionSpec spec, + UnboundPredicate filter, + Expression.Operation expectedOp, + String expectedLiteral) { Expression projection = Projections.inclusive(spec).project(filter); UnboundPredicate predicate = assertAndUnwrapUnbound(projection); Assert.assertEquals(expectedOp, predicate.op()); - Assert.assertNotEquals("Inclusive projection never runs for NOT_IN", Expression.Operation.NOT_IN, predicate.op()); + Assert.assertNotEquals( + "Inclusive projection never runs for NOT_IN", Expression.Operation.NOT_IN, predicate.op()); Dates transform = (Dates) spec.getFieldsBySourceId(1).get(0).transform(); if (predicate.op() == Expression.Operation.IN) { Iterable values = Iterables.transform(predicate.literals(), Literal::value); - String actual = Lists.newArrayList(values).stream().sorted() - .map(v -> transform.toHumanString((Integer) v)).collect(Collectors.toList()).toString(); + String actual = + Lists.newArrayList(values).stream() + .sorted() + .map(v -> transform.toHumanString((Integer) v)) + .collect(Collectors.toList()) + .toString(); Assert.assertEquals(expectedLiteral, actual); } else { Literal literal = predicate.literal(); @@ -114,13 +129,14 @@ public void testMonthStrictEpoch() { assertProjectionStrict(spec, lessThan("date", date), Expression.Operation.LT, "1970-01"); assertProjectionStrict(spec, lessThanOrEqual("date", date), Expression.Operation.LT, "1970-01"); assertProjectionStrict(spec, greaterThan("date", date), Expression.Operation.GT, "1970-02"); - assertProjectionStrict(spec, greaterThanOrEqual("date", date), Expression.Operation.GT, "1970-01"); + assertProjectionStrict( + spec, greaterThanOrEqual("date", date), Expression.Operation.GT, "1970-01"); assertProjectionStrict(spec, notEqual("date", date), Expression.Operation.NOT_EQ, "1970-01"); assertProjectionStrictValue(spec, equal("date", date), Expression.Operation.FALSE); Integer anotherDate = (Integer) Literal.of("1969-12-31").to(TYPE).value(); - assertProjectionStrict(spec, notIn("date", date, anotherDate), - Expression.Operation.NOT_IN, "[1969-12, 1970-01]"); + assertProjectionStrict( + spec, notIn("date", date, anotherDate), Expression.Operation.NOT_IN, "[1969-12, 1970-01]"); assertProjectionStrictValue(spec, in("date", date, anotherDate), Expression.Operation.FALSE); } @@ -130,16 +146,20 @@ public void testMonthInclusiveEpoch() { PartitionSpec spec = PartitionSpec.builderFor(SCHEMA).month("date").build(); assertProjectionInclusive(spec, lessThan("date", date), Expression.Operation.LT_EQ, "1970-01"); - assertProjectionInclusive(spec, lessThanOrEqual("date", date), Expression.Operation.LT_EQ, "1970-01"); - assertProjectionInclusive(spec, greaterThan("date", date), Expression.Operation.GT_EQ, "1970-01"); - assertProjectionInclusive(spec, greaterThanOrEqual("date", date), Expression.Operation.GT_EQ, "1970-01"); + assertProjectionInclusive( + spec, lessThanOrEqual("date", date), Expression.Operation.LT_EQ, "1970-01"); + assertProjectionInclusive( + spec, 
greaterThan("date", date), Expression.Operation.GT_EQ, "1970-01"); + assertProjectionInclusive( + spec, greaterThanOrEqual("date", date), Expression.Operation.GT_EQ, "1970-01"); assertProjectionInclusive(spec, equal("date", date), Expression.Operation.EQ, "1970-01"); assertProjectionInclusiveValue(spec, notEqual("date", date), Expression.Operation.TRUE); Integer anotherDate = (Integer) Literal.of("1969-12-31").to(TYPE).value(); - assertProjectionInclusive(spec, in("date", date, anotherDate), - Expression.Operation.IN, "[1969-12, 1970-01]"); - assertProjectionInclusiveValue(spec, notIn("date", date, anotherDate), Expression.Operation.TRUE); + assertProjectionInclusive( + spec, in("date", date, anotherDate), Expression.Operation.IN, "[1969-12, 1970-01]"); + assertProjectionInclusiveValue( + spec, notIn("date", date, anotherDate), Expression.Operation.TRUE); } @Test @@ -150,13 +170,14 @@ public void testMonthStrictLowerBound() { assertProjectionStrict(spec, lessThan("date", date), Expression.Operation.LT, "2017-01"); assertProjectionStrict(spec, lessThanOrEqual("date", date), Expression.Operation.LT, "2017-01"); assertProjectionStrict(spec, greaterThan("date", date), Expression.Operation.GT, "2017-01"); - assertProjectionStrict(spec, greaterThanOrEqual("date", date), Expression.Operation.GT, "2016-12"); + assertProjectionStrict( + spec, greaterThanOrEqual("date", date), Expression.Operation.GT, "2016-12"); assertProjectionStrict(spec, notEqual("date", date), Expression.Operation.NOT_EQ, "2017-01"); assertProjectionStrictValue(spec, equal("date", date), Expression.Operation.FALSE); Integer anotherDate = (Integer) Literal.of("2017-12-02").to(TYPE).value(); - assertProjectionStrict(spec, notIn("date", anotherDate, date), - Expression.Operation.NOT_IN, "[2017-01, 2017-12]"); + assertProjectionStrict( + spec, notIn("date", anotherDate, date), Expression.Operation.NOT_IN, "[2017-01, 2017-12]"); assertProjectionStrictValue(spec, in("date", anotherDate, date), Expression.Operation.FALSE); } @@ -168,13 +189,18 @@ public void testNegativeMonthStrictLowerBound() { assertProjectionStrict(spec, lessThan("date", date), Expression.Operation.LT, "1969-01"); assertProjectionStrict(spec, lessThanOrEqual("date", date), Expression.Operation.LT, "1969-01"); assertProjectionStrict(spec, greaterThan("date", date), Expression.Operation.GT, "1969-02"); - assertProjectionStrict(spec, greaterThanOrEqual("date", date), Expression.Operation.GT, "1969-01"); - assertProjectionStrict(spec, notEqual("date", date), Expression.Operation.NOT_IN, "[1969-01, 1969-02]"); + assertProjectionStrict( + spec, greaterThanOrEqual("date", date), Expression.Operation.GT, "1969-01"); + assertProjectionStrict( + spec, notEqual("date", date), Expression.Operation.NOT_IN, "[1969-01, 1969-02]"); assertProjectionStrictValue(spec, equal("date", date), Expression.Operation.FALSE); Integer anotherDate = (Integer) Literal.of("1969-12-31").to(TYPE).value(); - assertProjectionStrict(spec, notIn("date", date, anotherDate), - Expression.Operation.NOT_IN, "[1969-01, 1969-02, 1969-12, 1970-01]"); + assertProjectionStrict( + spec, + notIn("date", date, anotherDate), + Expression.Operation.NOT_IN, + "[1969-01, 1969-02, 1969-12, 1970-01]"); assertProjectionStrictValue(spec, in("date", date, anotherDate), Expression.Operation.FALSE); } @@ -186,13 +212,14 @@ public void testMonthStrictUpperBound() { assertProjectionStrict(spec, lessThan("date", date), Expression.Operation.LT, "2017-12"); assertProjectionStrict(spec, lessThanOrEqual("date", date), 
Expression.Operation.LT, "2018-01"); assertProjectionStrict(spec, greaterThan("date", date), Expression.Operation.GT, "2017-12"); - assertProjectionStrict(spec, greaterThanOrEqual("date", date), Expression.Operation.GT, "2017-12"); + assertProjectionStrict( + spec, greaterThanOrEqual("date", date), Expression.Operation.GT, "2017-12"); assertProjectionStrict(spec, notEqual("date", date), Expression.Operation.NOT_EQ, "2017-12"); assertProjectionStrictValue(spec, equal("date", date), Expression.Operation.FALSE); Integer anotherDate = (Integer) Literal.of("2017-01-01").to(TYPE).value(); - assertProjectionStrict(spec, notIn("date", anotherDate, date), - Expression.Operation.NOT_IN, "[2017-01, 2017-12]"); + assertProjectionStrict( + spec, notIn("date", anotherDate, date), Expression.Operation.NOT_IN, "[2017-01, 2017-12]"); assertProjectionStrictValue(spec, in("date", anotherDate, date), Expression.Operation.FALSE); } @@ -204,13 +231,18 @@ public void testNegativeMonthStrictUpperBound() { assertProjectionStrict(spec, lessThan("date", date), Expression.Operation.LT, "1969-12"); assertProjectionStrict(spec, lessThanOrEqual("date", date), Expression.Operation.LT, "1970-01"); assertProjectionStrict(spec, greaterThan("date", date), Expression.Operation.GT, "1970-01"); - assertProjectionStrict(spec, greaterThanOrEqual("date", date), Expression.Operation.GT, "1970-01"); - assertProjectionStrict(spec, notEqual("date", date), Expression.Operation.NOT_IN, "[1969-12, 1970-01]"); + assertProjectionStrict( + spec, greaterThanOrEqual("date", date), Expression.Operation.GT, "1970-01"); + assertProjectionStrict( + spec, notEqual("date", date), Expression.Operation.NOT_IN, "[1969-12, 1970-01]"); assertProjectionStrictValue(spec, equal("date", date), Expression.Operation.FALSE); Integer anotherDate = (Integer) Literal.of("1969-11-01").to(TYPE).value(); - assertProjectionStrict(spec, notIn("date", date, anotherDate), - Expression.Operation.NOT_IN, "[1969-11, 1969-12, 1970-01]"); + assertProjectionStrict( + spec, + notIn("date", date, anotherDate), + Expression.Operation.NOT_IN, + "[1969-11, 1969-12, 1970-01]"); assertProjectionStrictValue(spec, in("date", date, anotherDate), Expression.Operation.FALSE); } @@ -220,16 +252,20 @@ public void testMonthInclusiveLowerBound() { PartitionSpec spec = PartitionSpec.builderFor(SCHEMA).month("date").build(); assertProjectionInclusive(spec, lessThan("date", date), Expression.Operation.LT_EQ, "2017-11"); - assertProjectionInclusive(spec, lessThanOrEqual("date", date), Expression.Operation.LT_EQ, "2017-12"); - assertProjectionInclusive(spec, greaterThan("date", date), Expression.Operation.GT_EQ, "2017-12"); - assertProjectionInclusive(spec, greaterThanOrEqual("date", date), Expression.Operation.GT_EQ, "2017-12"); + assertProjectionInclusive( + spec, lessThanOrEqual("date", date), Expression.Operation.LT_EQ, "2017-12"); + assertProjectionInclusive( + spec, greaterThan("date", date), Expression.Operation.GT_EQ, "2017-12"); + assertProjectionInclusive( + spec, greaterThanOrEqual("date", date), Expression.Operation.GT_EQ, "2017-12"); assertProjectionInclusive(spec, equal("date", date), Expression.Operation.EQ, "2017-12"); assertProjectionInclusiveValue(spec, notEqual("date", date), Expression.Operation.TRUE); Integer anotherDate = (Integer) Literal.of("2017-01-01").to(TYPE).value(); - assertProjectionInclusive(spec, in("date", date, anotherDate), - Expression.Operation.IN, "[2017-01, 2017-12]"); - assertProjectionInclusiveValue(spec, notIn("date", date, anotherDate), 
Expression.Operation.TRUE); + assertProjectionInclusive( + spec, in("date", date, anotherDate), Expression.Operation.IN, "[2017-01, 2017-12]"); + assertProjectionInclusiveValue( + spec, notIn("date", date, anotherDate), Expression.Operation.TRUE); } @Test @@ -238,16 +274,24 @@ public void testNegativeMonthInclusiveLowerBound() { PartitionSpec spec = PartitionSpec.builderFor(SCHEMA).month("date").build(); assertProjectionInclusive(spec, lessThan("date", date), Expression.Operation.LT_EQ, "1969-01"); - assertProjectionInclusive(spec, lessThanOrEqual("date", date), Expression.Operation.LT_EQ, "1969-02"); - assertProjectionInclusive(spec, greaterThan("date", date), Expression.Operation.GT_EQ, "1969-01"); - assertProjectionInclusive(spec, greaterThanOrEqual("date", date), Expression.Operation.GT_EQ, "1969-01"); - assertProjectionInclusive(spec, equal("date", date), Expression.Operation.IN, "[1969-01, 1969-02]"); + assertProjectionInclusive( + spec, lessThanOrEqual("date", date), Expression.Operation.LT_EQ, "1969-02"); + assertProjectionInclusive( + spec, greaterThan("date", date), Expression.Operation.GT_EQ, "1969-01"); + assertProjectionInclusive( + spec, greaterThanOrEqual("date", date), Expression.Operation.GT_EQ, "1969-01"); + assertProjectionInclusive( + spec, equal("date", date), Expression.Operation.IN, "[1969-01, 1969-02]"); assertProjectionInclusiveValue(spec, notEqual("date", date), Expression.Operation.TRUE); Integer anotherDate = (Integer) Literal.of("1969-12-31").to(TYPE).value(); - assertProjectionInclusive(spec, in("date", date, anotherDate), - Expression.Operation.IN, "[1969-01, 1969-02, 1969-12, 1970-01]"); - assertProjectionInclusiveValue(spec, notIn("date", date, anotherDate), Expression.Operation.TRUE); + assertProjectionInclusive( + spec, + in("date", date, anotherDate), + Expression.Operation.IN, + "[1969-01, 1969-02, 1969-12, 1970-01]"); + assertProjectionInclusiveValue( + spec, notIn("date", date, anotherDate), Expression.Operation.TRUE); } @Test @@ -256,16 +300,20 @@ public void testMonthInclusiveUpperBound() { PartitionSpec spec = PartitionSpec.builderFor(SCHEMA).month("date").build(); assertProjectionInclusive(spec, lessThan("date", date), Expression.Operation.LT_EQ, "2017-12"); - assertProjectionInclusive(spec, lessThanOrEqual("date", date), Expression.Operation.LT_EQ, "2017-12"); - assertProjectionInclusive(spec, greaterThan("date", date), Expression.Operation.GT_EQ, "2018-01"); - assertProjectionInclusive(spec, greaterThanOrEqual("date", date), Expression.Operation.GT_EQ, "2017-12"); + assertProjectionInclusive( + spec, lessThanOrEqual("date", date), Expression.Operation.LT_EQ, "2017-12"); + assertProjectionInclusive( + spec, greaterThan("date", date), Expression.Operation.GT_EQ, "2018-01"); + assertProjectionInclusive( + spec, greaterThanOrEqual("date", date), Expression.Operation.GT_EQ, "2017-12"); assertProjectionInclusive(spec, equal("date", date), Expression.Operation.EQ, "2017-12"); assertProjectionInclusiveValue(spec, notEqual("date", date), Expression.Operation.TRUE); Integer anotherDate = (Integer) Literal.of("2017-01-01").to(TYPE).value(); - assertProjectionInclusive(spec, in("date", date, anotherDate), - Expression.Operation.IN, "[2017-01, 2017-12]"); - assertProjectionInclusiveValue(spec, notIn("date", date, anotherDate), Expression.Operation.TRUE); + assertProjectionInclusive( + spec, in("date", date, anotherDate), Expression.Operation.IN, "[2017-01, 2017-12]"); + assertProjectionInclusiveValue( + spec, notIn("date", date, anotherDate), 
Expression.Operation.TRUE); } @Test @@ -274,16 +322,24 @@ public void testNegativeMonthInclusiveUpperBound() { PartitionSpec spec = PartitionSpec.builderFor(SCHEMA).month("date").build(); assertProjectionInclusive(spec, lessThan("date", date), Expression.Operation.LT_EQ, "1970-01"); - assertProjectionInclusive(spec, lessThanOrEqual("date", date), Expression.Operation.LT_EQ, "1970-01"); - assertProjectionInclusive(spec, greaterThan("date", date), Expression.Operation.GT_EQ, "1970-01"); - assertProjectionInclusive(spec, greaterThanOrEqual("date", date), Expression.Operation.GT_EQ, "1969-12"); - assertProjectionInclusive(spec, equal("date", date), Expression.Operation.IN, "[1969-12, 1970-01]"); + assertProjectionInclusive( + spec, lessThanOrEqual("date", date), Expression.Operation.LT_EQ, "1970-01"); + assertProjectionInclusive( + spec, greaterThan("date", date), Expression.Operation.GT_EQ, "1970-01"); + assertProjectionInclusive( + spec, greaterThanOrEqual("date", date), Expression.Operation.GT_EQ, "1969-12"); + assertProjectionInclusive( + spec, equal("date", date), Expression.Operation.IN, "[1969-12, 1970-01]"); assertProjectionInclusiveValue(spec, notEqual("date", date), Expression.Operation.TRUE); Integer anotherDate = (Integer) Literal.of("1969-01-01").to(TYPE).value(); - assertProjectionInclusive(spec, in("date", date, anotherDate), - Expression.Operation.IN, "[1969-01, 1969-02, 1969-12, 1970-01]"); - assertProjectionInclusiveValue(spec, notIn("date", date, anotherDate), Expression.Operation.TRUE); + assertProjectionInclusive( + spec, + in("date", date, anotherDate), + Expression.Operation.IN, + "[1969-01, 1969-02, 1969-12, 1970-01]"); + assertProjectionInclusiveValue( + spec, notIn("date", date, anotherDate), Expression.Operation.TRUE); } @Test @@ -293,16 +349,21 @@ public void testDayStrict() { assertProjectionStrict(spec, lessThan("date", date), Expression.Operation.LT, "2017-01-01"); // should be the same date for <= - assertProjectionStrict(spec, lessThanOrEqual("date", date), Expression.Operation.LT, "2017-01-02"); + assertProjectionStrict( + spec, lessThanOrEqual("date", date), Expression.Operation.LT, "2017-01-02"); assertProjectionStrict(spec, greaterThan("date", date), Expression.Operation.GT, "2017-01-01"); // should be the same date for >= - assertProjectionStrict(spec, greaterThanOrEqual("date", date), Expression.Operation.GT, "2016-12-31"); + assertProjectionStrict( + spec, greaterThanOrEqual("date", date), Expression.Operation.GT, "2016-12-31"); assertProjectionStrict(spec, notEqual("date", date), Expression.Operation.NOT_EQ, "2017-01-01"); assertProjectionStrictValue(spec, equal("date", date), Expression.Operation.FALSE); Integer anotherDate = (Integer) Literal.of("2017-12-31").to(TYPE).value(); - assertProjectionStrict(spec, notIn("date", date, anotherDate), - Expression.Operation.NOT_IN, "[2017-01-01, 2017-12-31]"); + assertProjectionStrict( + spec, + notIn("date", date, anotherDate), + Expression.Operation.NOT_IN, + "[2017-01-01, 2017-12-31]"); assertProjectionStrictValue(spec, in("date", date, anotherDate), Expression.Operation.FALSE); } @@ -313,16 +374,21 @@ public void testNegativeDayStrict() { assertProjectionStrict(spec, lessThan("date", date), Expression.Operation.LT, "1969-12-30"); // should be the same date for <= - assertProjectionStrict(spec, lessThanOrEqual("date", date), Expression.Operation.LT, "1969-12-31"); + assertProjectionStrict( + spec, lessThanOrEqual("date", date), Expression.Operation.LT, "1969-12-31"); assertProjectionStrict(spec, 
greaterThan("date", date), Expression.Operation.GT, "1969-12-30"); // should be the same date for >= - assertProjectionStrict(spec, greaterThanOrEqual("date", date), Expression.Operation.GT, "1969-12-29"); + assertProjectionStrict( + spec, greaterThanOrEqual("date", date), Expression.Operation.GT, "1969-12-29"); assertProjectionStrict(spec, notEqual("date", date), Expression.Operation.NOT_EQ, "1969-12-30"); assertProjectionStrictValue(spec, equal("date", date), Expression.Operation.FALSE); Integer anotherDate = (Integer) Literal.of("1969-12-28").to(TYPE).value(); - assertProjectionStrict(spec, notIn("date", date, anotherDate), - Expression.Operation.NOT_IN, "[1969-12-28, 1969-12-30]"); + assertProjectionStrict( + spec, + notIn("date", date, anotherDate), + Expression.Operation.NOT_IN, + "[1969-12-28, 1969-12-30]"); assertProjectionStrictValue(spec, in("date", date, anotherDate), Expression.Operation.FALSE); } @@ -331,17 +397,22 @@ public void testDayInclusive() { Integer date = (Integer) Literal.of("2017-01-01").to(TYPE).value(); PartitionSpec spec = PartitionSpec.builderFor(SCHEMA).day("date").build(); - assertProjectionInclusive(spec, lessThan("date", date), Expression.Operation.LT_EQ, "2016-12-31"); - assertProjectionInclusive(spec, lessThanOrEqual("date", date), Expression.Operation.LT_EQ, "2017-01-01"); - assertProjectionInclusive(spec, greaterThan("date", date), Expression.Operation.GT_EQ, "2017-01-02"); - assertProjectionInclusive(spec, greaterThanOrEqual("date", date), Expression.Operation.GT_EQ, "2017-01-01"); + assertProjectionInclusive( + spec, lessThan("date", date), Expression.Operation.LT_EQ, "2016-12-31"); + assertProjectionInclusive( + spec, lessThanOrEqual("date", date), Expression.Operation.LT_EQ, "2017-01-01"); + assertProjectionInclusive( + spec, greaterThan("date", date), Expression.Operation.GT_EQ, "2017-01-02"); + assertProjectionInclusive( + spec, greaterThanOrEqual("date", date), Expression.Operation.GT_EQ, "2017-01-01"); assertProjectionInclusive(spec, equal("date", date), Expression.Operation.EQ, "2017-01-01"); assertProjectionInclusiveValue(spec, notEqual("date", date), Expression.Operation.TRUE); Integer anotherDate = (Integer) Literal.of("2017-12-31").to(TYPE).value(); - assertProjectionInclusive(spec, in("date", date, anotherDate), - Expression.Operation.IN, "[2017-01-01, 2017-12-31]"); - assertProjectionInclusiveValue(spec, notIn("date", date, anotherDate), Expression.Operation.TRUE); + assertProjectionInclusive( + spec, in("date", date, anotherDate), Expression.Operation.IN, "[2017-01-01, 2017-12-31]"); + assertProjectionInclusiveValue( + spec, notIn("date", date, anotherDate), Expression.Operation.TRUE); } @Test @@ -349,17 +420,22 @@ public void testNegativeDayInclusive() { Integer date = (Integer) Literal.of("1969-12-30").to(TYPE).value(); PartitionSpec spec = PartitionSpec.builderFor(SCHEMA).day("date").build(); - assertProjectionInclusive(spec, lessThan("date", date), Expression.Operation.LT_EQ, "1969-12-29"); - assertProjectionInclusive(spec, lessThanOrEqual("date", date), Expression.Operation.LT_EQ, "1969-12-30"); - assertProjectionInclusive(spec, greaterThan("date", date), Expression.Operation.GT_EQ, "1969-12-31"); - assertProjectionInclusive(spec, greaterThanOrEqual("date", date), Expression.Operation.GT_EQ, "1969-12-30"); + assertProjectionInclusive( + spec, lessThan("date", date), Expression.Operation.LT_EQ, "1969-12-29"); + assertProjectionInclusive( + spec, lessThanOrEqual("date", date), Expression.Operation.LT_EQ, "1969-12-30"); + 
assertProjectionInclusive( + spec, greaterThan("date", date), Expression.Operation.GT_EQ, "1969-12-31"); + assertProjectionInclusive( + spec, greaterThanOrEqual("date", date), Expression.Operation.GT_EQ, "1969-12-30"); assertProjectionInclusive(spec, equal("date", date), Expression.Operation.EQ, "1969-12-30"); assertProjectionInclusiveValue(spec, notEqual("date", date), Expression.Operation.TRUE); Integer anotherDate = (Integer) Literal.of("1969-12-28").to(TYPE).value(); - assertProjectionInclusive(spec, in("date", date, anotherDate), - Expression.Operation.IN, "[1969-12-28, 1969-12-30]"); - assertProjectionInclusiveValue(spec, notIn("date", date, anotherDate), Expression.Operation.TRUE); + assertProjectionInclusive( + spec, in("date", date, anotherDate), Expression.Operation.IN, "[1969-12-28, 1969-12-30]"); + assertProjectionInclusiveValue( + spec, notIn("date", date, anotherDate), Expression.Operation.TRUE); } @Test @@ -375,8 +451,8 @@ public void testYearStrictLowerBound() { assertProjectionStrictValue(spec, equal("date", date), Expression.Operation.FALSE); Integer anotherDate = (Integer) Literal.of("2016-12-31").to(TYPE).value(); - assertProjectionStrict(spec, notIn("date", date, anotherDate), - Expression.Operation.NOT_IN, "[2016, 2017]"); + assertProjectionStrict( + spec, notIn("date", date, anotherDate), Expression.Operation.NOT_IN, "[2016, 2017]"); assertProjectionStrictValue(spec, in("date", date, anotherDate), Expression.Operation.FALSE); } @@ -393,8 +469,8 @@ public void testNegativeYearStrictLowerBound() { assertProjectionStrictValue(spec, equal("date", date), Expression.Operation.FALSE); Integer anotherDate = (Integer) Literal.of("1969-12-31").to(TYPE).value(); - assertProjectionStrict(spec, notIn("date", date, anotherDate), - Expression.Operation.NOT_IN, "[1969, 1970]"); + assertProjectionStrict( + spec, notIn("date", date, anotherDate), Expression.Operation.NOT_IN, "[1969, 1970]"); assertProjectionStrictValue(spec, in("date", date, anotherDate), Expression.Operation.FALSE); } @@ -411,8 +487,8 @@ public void testYearStrictUpperBound() { assertProjectionStrictValue(spec, equal("date", date), Expression.Operation.FALSE); Integer anotherDate = (Integer) Literal.of("2016-01-01").to(TYPE).value(); - assertProjectionStrict(spec, notIn("date", date, anotherDate), - Expression.Operation.NOT_IN, "[2016, 2017]"); + assertProjectionStrict( + spec, notIn("date", date, anotherDate), Expression.Operation.NOT_IN, "[2016, 2017]"); assertProjectionStrictValue(spec, in("date", date, anotherDate), Expression.Operation.FALSE); } @@ -425,12 +501,13 @@ public void testNegativeYearStrictUpperBound() { assertProjectionStrict(spec, lessThanOrEqual("date", date), Expression.Operation.LT, "1970"); assertProjectionStrict(spec, greaterThan("date", date), Expression.Operation.GT, "1970"); assertProjectionStrict(spec, greaterThanOrEqual("date", date), Expression.Operation.GT, "1970"); - assertProjectionStrict(spec, notEqual("date", date), Expression.Operation.NOT_IN, "[1969, 1970]"); + assertProjectionStrict( + spec, notEqual("date", date), Expression.Operation.NOT_IN, "[1969, 1970]"); assertProjectionStrictValue(spec, equal("date", date), Expression.Operation.FALSE); Integer anotherDate = (Integer) Literal.of("1970-01-01").to(TYPE).value(); - assertProjectionStrict(spec, notIn("date", date, anotherDate), - Expression.Operation.NOT_IN, "[1969, 1970]"); + assertProjectionStrict( + spec, notIn("date", date, anotherDate), Expression.Operation.NOT_IN, "[1969, 1970]"); assertProjectionStrictValue(spec, 
in("date", date, anotherDate), Expression.Operation.FALSE); } @@ -440,16 +517,19 @@ public void testYearInclusiveLowerBound() { PartitionSpec spec = PartitionSpec.builderFor(SCHEMA).year("date").build(); assertProjectionInclusive(spec, lessThan("date", date), Expression.Operation.LT_EQ, "2016"); - assertProjectionInclusive(spec, lessThanOrEqual("date", date), Expression.Operation.LT_EQ, "2017"); + assertProjectionInclusive( + spec, lessThanOrEqual("date", date), Expression.Operation.LT_EQ, "2017"); assertProjectionInclusive(spec, greaterThan("date", date), Expression.Operation.GT_EQ, "2017"); - assertProjectionInclusive(spec, greaterThanOrEqual("date", date), Expression.Operation.GT_EQ, "2017"); + assertProjectionInclusive( + spec, greaterThanOrEqual("date", date), Expression.Operation.GT_EQ, "2017"); assertProjectionInclusive(spec, equal("date", date), Expression.Operation.EQ, "2017"); assertProjectionInclusiveValue(spec, notEqual("date", date), Expression.Operation.TRUE); Integer anotherDate = (Integer) Literal.of("2016-12-31").to(TYPE).value(); - assertProjectionInclusive(spec, in("date", date, anotherDate), - Expression.Operation.IN, "[2016, 2017]"); - assertProjectionInclusiveValue(spec, notIn("date", date, anotherDate), Expression.Operation.TRUE); + assertProjectionInclusive( + spec, in("date", date, anotherDate), Expression.Operation.IN, "[2016, 2017]"); + assertProjectionInclusiveValue( + spec, notIn("date", date, anotherDate), Expression.Operation.TRUE); } @Test @@ -458,16 +538,19 @@ public void testNegativeYearInclusiveLowerBound() { PartitionSpec spec = PartitionSpec.builderFor(SCHEMA).year("date").build(); assertProjectionInclusive(spec, lessThan("date", date), Expression.Operation.LT_EQ, "1970"); - assertProjectionInclusive(spec, lessThanOrEqual("date", date), Expression.Operation.LT_EQ, "1970"); + assertProjectionInclusive( + spec, lessThanOrEqual("date", date), Expression.Operation.LT_EQ, "1970"); assertProjectionInclusive(spec, greaterThan("date", date), Expression.Operation.GT_EQ, "1970"); - assertProjectionInclusive(spec, greaterThanOrEqual("date", date), Expression.Operation.GT_EQ, "1970"); + assertProjectionInclusive( + spec, greaterThanOrEqual("date", date), Expression.Operation.GT_EQ, "1970"); assertProjectionInclusive(spec, equal("date", date), Expression.Operation.EQ, "1970"); assertProjectionInclusiveValue(spec, notEqual("date", date), Expression.Operation.TRUE); Integer anotherDate = (Integer) Literal.of("1969-12-31").to(TYPE).value(); - assertProjectionInclusive(spec, in("date", date, anotherDate), - Expression.Operation.IN, "[1969, 1970]"); - assertProjectionInclusiveValue(spec, notIn("date", date, anotherDate), Expression.Operation.TRUE); + assertProjectionInclusive( + spec, in("date", date, anotherDate), Expression.Operation.IN, "[1969, 1970]"); + assertProjectionInclusiveValue( + spec, notIn("date", date, anotherDate), Expression.Operation.TRUE); } @Test @@ -476,16 +559,19 @@ public void testYearInclusiveUpperBound() { PartitionSpec spec = PartitionSpec.builderFor(SCHEMA).year("date").build(); assertProjectionInclusive(spec, lessThan("date", date), Expression.Operation.LT_EQ, "2017"); - assertProjectionInclusive(spec, lessThanOrEqual("date", date), Expression.Operation.LT_EQ, "2017"); + assertProjectionInclusive( + spec, lessThanOrEqual("date", date), Expression.Operation.LT_EQ, "2017"); assertProjectionInclusive(spec, greaterThan("date", date), Expression.Operation.GT_EQ, "2018"); - assertProjectionInclusive(spec, greaterThanOrEqual("date", date), 
Expression.Operation.GT_EQ, "2017"); + assertProjectionInclusive( + spec, greaterThanOrEqual("date", date), Expression.Operation.GT_EQ, "2017"); assertProjectionInclusive(spec, equal("date", date), Expression.Operation.EQ, "2017"); assertProjectionInclusiveValue(spec, notEqual("date", date), Expression.Operation.TRUE); Integer anotherDate = (Integer) Literal.of("2016-01-01").to(TYPE).value(); - assertProjectionInclusive(spec, in("date", date, anotherDate), - Expression.Operation.IN, "[2016, 2017]"); - assertProjectionInclusiveValue(spec, notIn("date", date, anotherDate), Expression.Operation.TRUE); + assertProjectionInclusive( + spec, in("date", date, anotherDate), Expression.Operation.IN, "[2016, 2017]"); + assertProjectionInclusiveValue( + spec, notIn("date", date, anotherDate), Expression.Operation.TRUE); } @Test @@ -494,15 +580,18 @@ public void testNegativeYearInclusiveUpperBound() { PartitionSpec spec = PartitionSpec.builderFor(SCHEMA).year("date").build(); assertProjectionInclusive(spec, lessThan("date", date), Expression.Operation.LT_EQ, "1970"); - assertProjectionInclusive(spec, lessThanOrEqual("date", date), Expression.Operation.LT_EQ, "1970"); + assertProjectionInclusive( + spec, lessThanOrEqual("date", date), Expression.Operation.LT_EQ, "1970"); assertProjectionInclusive(spec, greaterThan("date", date), Expression.Operation.GT_EQ, "1970"); - assertProjectionInclusive(spec, greaterThanOrEqual("date", date), Expression.Operation.GT_EQ, "1969"); + assertProjectionInclusive( + spec, greaterThanOrEqual("date", date), Expression.Operation.GT_EQ, "1969"); assertProjectionInclusive(spec, equal("date", date), Expression.Operation.IN, "[1969, 1970]"); assertProjectionInclusiveValue(spec, notEqual("date", date), Expression.Operation.TRUE); Integer anotherDate = (Integer) Literal.of("1969-01-01").to(TYPE).value(); - assertProjectionInclusive(spec, in("date", date, anotherDate), - Expression.Operation.IN, "[1969, 1970]"); - assertProjectionInclusiveValue(spec, notIn("date", date, anotherDate), Expression.Operation.TRUE); + assertProjectionInclusive( + spec, in("date", date, anotherDate), Expression.Operation.IN, "[1969, 1970]"); + assertProjectionInclusiveValue( + spec, notIn("date", date, anotherDate), Expression.Operation.TRUE); } } diff --git a/api/src/test/java/org/apache/iceberg/transforms/TestIdentity.java b/api/src/test/java/org/apache/iceberg/transforms/TestIdentity.java index a092dda3b24e..e2e3680c7b5d 100644 --- a/api/src/test/java/org/apache/iceberg/transforms/TestIdentity.java +++ b/api/src/test/java/org/apache/iceberg/transforms/TestIdentity.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.transforms; import java.math.BigDecimal; @@ -32,8 +31,7 @@ public void testNullHumanString() { Types.LongType longType = Types.LongType.get(); Transform identity = Transforms.identity(longType); - Assert.assertEquals("Should produce \"null\" for null", - "null", identity.toHumanString(null)); + Assert.assertEquals("Should produce \"null\" for null", "null", identity.toHumanString(null)); } @Test @@ -41,8 +39,10 @@ public void testBinaryHumanString() { Types.BinaryType binary = Types.BinaryType.get(); Transform identity = Transforms.identity(binary); - Assert.assertEquals("Should base64-encode binary", - "AQID", identity.toHumanString(ByteBuffer.wrap(new byte[] {1, 2, 3}))); + Assert.assertEquals( + "Should base64-encode binary", + "AQID", + identity.toHumanString(ByteBuffer.wrap(new byte[] {1, 2, 3}))); } @Test @@ -50,8 +50,8 @@ public void testFixedHumanString() { Types.FixedType fixed3 = Types.FixedType.ofLength(3); Transform identity = Transforms.identity(fixed3); - Assert.assertEquals("Should base64-encode binary", - "AQID", identity.toHumanString(new byte[] {1, 2, 3})); + Assert.assertEquals( + "Should base64-encode binary", "AQID", identity.toHumanString(new byte[] {1, 2, 3})); } @Test @@ -62,8 +62,8 @@ public void testDateHumanString() { String dateString = "2017-12-01"; Literal dateLit = Literal.of(dateString).to(date); - Assert.assertEquals("Should produce identical date", - dateString, identity.toHumanString(dateLit.value())); + Assert.assertEquals( + "Should produce identical date", dateString, identity.toHumanString(dateLit.value())); } @Test @@ -74,8 +74,8 @@ public void testTimeHumanString() { String timeString = "10:12:55.038194"; Literal timeLit = Literal.of(timeString).to(time); - Assert.assertEquals("Should produce identical time", - timeString, identity.toHumanString(timeLit.value())); + Assert.assertEquals( + "Should produce identical time", timeString, identity.toHumanString(timeLit.value())); } @Test @@ -86,8 +86,10 @@ public void testTimestampWithZoneHumanString() { Literal ts = Literal.of("2017-12-01T10:12:55.038194-08:00").to(timestamptz); // value will always be in UTC - Assert.assertEquals("Should produce timestamp with time zone adjusted to UTC", - "2017-12-01T18:12:55.038194Z", identity.toHumanString(ts.value())); + Assert.assertEquals( + "Should produce timestamp with time zone adjusted to UTC", + "2017-12-01T18:12:55.038194Z", + identity.toHumanString(ts.value())); } @Test @@ -99,8 +101,10 @@ public void testTimestampWithoutZoneHumanString() { Literal ts = Literal.of(tsString).to(timestamp); // value is not changed - Assert.assertEquals("Should produce identical timestamp without time zone", - tsString, identity.toHumanString(ts.value())); + Assert.assertEquals( + "Should produce identical timestamp without time zone", + tsString, + identity.toHumanString(ts.value())); } @Test @@ -108,8 +112,8 @@ public void testLongToHumanString() { Types.LongType longType = Types.LongType.get(); Transform identity = Transforms.identity(longType); - Assert.assertEquals("Should use Long toString", - "-1234567890000", identity.toHumanString(-1234567890000L)); + Assert.assertEquals( + "Should use Long toString", "-1234567890000", identity.toHumanString(-1234567890000L)); } @Test @@ -128,6 +132,7 @@ public void testBigDecimalToHumanString() { String decimalString = "-1.50"; BigDecimal bigDecimal = new BigDecimal(decimalString); - Assert.assertEquals("Should not modify Strings", decimalString, identity.toHumanString(bigDecimal)); + 
Assert.assertEquals( + "Should not modify Strings", decimalString, identity.toHumanString(bigDecimal)); } } diff --git a/api/src/test/java/org/apache/iceberg/transforms/TestNotStartsWith.java b/api/src/test/java/org/apache/iceberg/transforms/TestNotStartsWith.java index 4e4b6da6dc90..54a362a9d337 100644 --- a/api/src/test/java/org/apache/iceberg/transforms/TestNotStartsWith.java +++ b/api/src/test/java/org/apache/iceberg/transforms/TestNotStartsWith.java @@ -16,9 +16,13 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.transforms; +import static org.apache.iceberg.TestHelpers.assertAndUnwrapUnbound; +import static org.apache.iceberg.expressions.Expressions.notStartsWith; +import static org.apache.iceberg.types.Conversions.toByteBuffer; +import static org.apache.iceberg.types.Types.NestedField.optional; + import org.apache.iceberg.DataFile; import org.apache.iceberg.PartitionSpec; import org.apache.iceberg.Schema; @@ -41,71 +45,83 @@ import org.junit.Assert; import org.junit.Test; -import static org.apache.iceberg.TestHelpers.assertAndUnwrapUnbound; -import static org.apache.iceberg.expressions.Expressions.notStartsWith; -import static org.apache.iceberg.types.Conversions.toByteBuffer; -import static org.apache.iceberg.types.Types.NestedField.optional; - public class TestNotStartsWith { private static final String COLUMN = "someStringCol"; private static final Schema SCHEMA = new Schema(optional(1, COLUMN, Types.StringType.get())); // All 50 rows have someStringCol = 'bbb', none are null (despite being optional). - private static final DataFile FILE_1 = new TestDataFile("file_1.avro", Row.of(), 50, - // any value counts, including nulls - ImmutableMap.of(1, 50L), - // null value counts - ImmutableMap.of(1, 0L), - // nan value counts - null, - // lower bounds - ImmutableMap.of(1, toByteBuffer(StringType.get(), "bbb")), - // upper bounds - ImmutableMap.of(1, toByteBuffer(StringType.get(), "bbb"))); + private static final DataFile FILE_1 = + new TestDataFile( + "file_1.avro", + Row.of(), + 50, + // any value counts, including nulls + ImmutableMap.of(1, 50L), + // null value counts + ImmutableMap.of(1, 0L), + // nan value counts + null, + // lower bounds + ImmutableMap.of(1, toByteBuffer(StringType.get(), "bbb")), + // upper bounds + ImmutableMap.of(1, toByteBuffer(StringType.get(), "bbb"))); @Test public void testTruncateProjections() { PartitionSpec spec = PartitionSpec.builderFor(SCHEMA).truncate(COLUMN, 4).build(); - assertProjectionInclusive(spec, notStartsWith(COLUMN, "ab"), "ab", Expression.Operation.NOT_STARTS_WITH); - assertProjectionInclusive(spec, notStartsWith(COLUMN, "abab"), "abab", Expression.Operation.NOT_EQ); - // When literal is longer than partition spec's truncation width, we always read for an inclusive projection + assertProjectionInclusive( + spec, notStartsWith(COLUMN, "ab"), "ab", Expression.Operation.NOT_STARTS_WITH); + assertProjectionInclusive( + spec, notStartsWith(COLUMN, "abab"), "abab", Expression.Operation.NOT_EQ); + // When literal is longer than partition spec's truncation width, we always read for an + // inclusive projection // when using notStartsWith. 
Expression projection = Projections.inclusive(spec).project(notStartsWith(COLUMN, "ababab")); Assert.assertTrue(projection instanceof True); - assertProjectionStrict(spec, notStartsWith(COLUMN, "ab"), "ab", Expression.Operation.NOT_STARTS_WITH); - assertProjectionStrict(spec, notStartsWith(COLUMN, "abab"), "abab", Expression.Operation.NOT_EQ); - assertProjectionStrict(spec, notStartsWith(COLUMN, "ababab"), "abab", Expression.Operation.NOT_STARTS_WITH); - assertProjectionStrict(spec, notStartsWith(COLUMN, "abcde"), "abcd", Expression.Operation.NOT_STARTS_WITH); + assertProjectionStrict( + spec, notStartsWith(COLUMN, "ab"), "ab", Expression.Operation.NOT_STARTS_WITH); + assertProjectionStrict( + spec, notStartsWith(COLUMN, "abab"), "abab", Expression.Operation.NOT_EQ); + assertProjectionStrict( + spec, notStartsWith(COLUMN, "ababab"), "abab", Expression.Operation.NOT_STARTS_WITH); + assertProjectionStrict( + spec, notStartsWith(COLUMN, "abcde"), "abcd", Expression.Operation.NOT_STARTS_WITH); } @Test public void testTruncateStringWhenProjectedPredicateTermIsLongerThanWidth() { Truncate trunc = Truncate.get(Types.StringType.get(), 2); UnboundPredicate expr = notStartsWith(COLUMN, "abcde"); - BoundPredicate boundExpr = (BoundPredicate) Binder.bind(SCHEMA.asStruct(), expr, false); + BoundPredicate boundExpr = + (BoundPredicate) Binder.bind(SCHEMA.asStruct(), expr, false); UnboundPredicate projected = trunc.projectStrict(COLUMN, boundExpr); Evaluator evaluator = new Evaluator(SCHEMA.asStruct(), projected); - Assert.assertEquals("The projected literal should be truncated to the truncation width", - projected.literal().value(), "ab"); + Assert.assertEquals( + "The projected literal should be truncated to the truncation width", + projected.literal().value(), + "ab"); - Assert.assertFalse("notStartsWith(abcde, truncate(abcde,2)) => false", + Assert.assertFalse( + "notStartsWith(abcde, truncate(abcde,2)) => false", evaluator.eval(TestHelpers.Row.of("abcde"))); - Assert.assertFalse("notStartsWith(abcde, truncate(ab, 2)) => false", - evaluator.eval(TestHelpers.Row.of("ab"))); + Assert.assertFalse( + "notStartsWith(abcde, truncate(ab, 2)) => false", evaluator.eval(TestHelpers.Row.of("ab"))); - Assert.assertFalse("notStartsWith(abcde, truncate(abcdz, 2)) => false", + Assert.assertFalse( + "notStartsWith(abcde, truncate(abcdz, 2)) => false", evaluator.eval(TestHelpers.Row.of("abcdz"))); - Assert.assertTrue("notStartsWith(abcde, truncate(a, 2)) => true", - evaluator.eval(TestHelpers.Row.of("a"))); + Assert.assertTrue( + "notStartsWith(abcde, truncate(a, 2)) => true", evaluator.eval(TestHelpers.Row.of("a"))); - Assert.assertTrue("notStartsWith(abcde, truncate(aczcde, 2)) => true", + Assert.assertTrue( + "notStartsWith(abcde, truncate(aczcde, 2)) => true", evaluator.eval(TestHelpers.Row.of("aczcde"))); } @@ -113,55 +129,67 @@ public void testTruncateStringWhenProjectedPredicateTermIsLongerThanWidth() { public void testTruncateStringWhenProjectedPredicateTermIsShorterThanWidth() { Truncate trunc = Truncate.get(Types.StringType.get(), 16); UnboundPredicate expr = notStartsWith(COLUMN, "ab"); - BoundPredicate boundExpr = (BoundPredicate) Binder.bind(SCHEMA.asStruct(), expr, false); + BoundPredicate boundExpr = + (BoundPredicate) Binder.bind(SCHEMA.asStruct(), expr, false); UnboundPredicate projected = trunc.projectStrict(COLUMN, boundExpr); Evaluator evaluator = new Evaluator(SCHEMA.asStruct(), projected); - Assert.assertEquals("The projected literal should not be truncated as its size is shorter than truncation 
width", - projected.literal().value(), "ab"); + Assert.assertEquals( + "The projected literal should not be truncated as its size is shorter than truncation width", + projected.literal().value(), + "ab"); - Assert.assertFalse("notStartsWith(ab, truncate(abcde, 16)) => false", + Assert.assertFalse( + "notStartsWith(ab, truncate(abcde, 16)) => false", evaluator.eval(TestHelpers.Row.of("abcde"))); - Assert.assertFalse("notStartsWith(ab, truncate(ab, 16)) => false", - evaluator.eval(TestHelpers.Row.of("ab"))); + Assert.assertFalse( + "notStartsWith(ab, truncate(ab, 16)) => false", evaluator.eval(TestHelpers.Row.of("ab"))); - Assert.assertTrue("notStartsWith(ab, truncate(a, 16)) => true", - evaluator.eval(TestHelpers.Row.of("a"))); + Assert.assertTrue( + "notStartsWith(ab, truncate(a, 16)) => true", evaluator.eval(TestHelpers.Row.of("a"))); } @Test public void testTruncateStringWhenProjectedPredicateTermIsEqualToWidth() { Truncate trunc = Truncate.get(Types.StringType.get(), 7); UnboundPredicate expr = notStartsWith(COLUMN, "abcdefg"); - BoundPredicate boundExpr = (BoundPredicate) Binder.bind(SCHEMA.asStruct(), expr, false); + BoundPredicate boundExpr = + (BoundPredicate) Binder.bind(SCHEMA.asStruct(), expr, false); UnboundPredicate projected = trunc.projectStrict(COLUMN, boundExpr); Evaluator evaluator = new Evaluator(SCHEMA.asStruct(), projected); - Assert.assertEquals("The projected literal should not be truncated as its size is equal to truncation width", - projected.literal().value(), "abcdefg"); + Assert.assertEquals( + "The projected literal should not be truncated as its size is equal to truncation width", + projected.literal().value(), + "abcdefg"); - Assert.assertFalse("notStartsWith(abcdefg, truncate(abcdefg, 7)) => false", + Assert.assertFalse( + "notStartsWith(abcdefg, truncate(abcdefg, 7)) => false", evaluator.eval(TestHelpers.Row.of("abcdefg"))); - Assert.assertTrue("notStartsWith(abcdefg, truncate(ab, 2)) => true", + Assert.assertTrue( + "notStartsWith(abcdefg, truncate(ab, 2)) => true", evaluator.eval(TestHelpers.Row.of("ab"))); - Assert.assertTrue("notStartsWith(abcdefg, truncate(a, 16)) => true", - evaluator.eval(TestHelpers.Row.of("a"))); + Assert.assertTrue( + "notStartsWith(abcdefg, truncate(a, 16)) => true", evaluator.eval(TestHelpers.Row.of("a"))); } @Test public void testStrictMetricsEvaluatorForNotStartsWith() { - boolean shouldRead = new StrictMetricsEvaluator(SCHEMA, notStartsWith(COLUMN, "bbb")).eval(FILE_1); - Assert.assertFalse("Should not match: strict metrics eval is always false for notStartsWith", shouldRead); + boolean shouldRead = + new StrictMetricsEvaluator(SCHEMA, notStartsWith(COLUMN, "bbb")).eval(FILE_1); + Assert.assertFalse( + "Should not match: strict metrics eval is always false for notStartsWith", shouldRead); } @Test public void testInclusiveMetricsEvaluatorForNotStartsWith() { - boolean shouldRead = new InclusiveMetricsEvaluator(SCHEMA, notStartsWith(COLUMN, "aaa")).eval(FILE_1); + boolean shouldRead = + new InclusiveMetricsEvaluator(SCHEMA, notStartsWith(COLUMN, "aaa")).eval(FILE_1); Assert.assertTrue("Should match: some columns meet the filter criteria", shouldRead); shouldRead = new InclusiveMetricsEvaluator(SCHEMA, notStartsWith(COLUMN, "b")).eval(FILE_1); @@ -177,23 +205,33 @@ public void testInclusiveMetricsEvaluatorForNotStartsWith() { Assert.assertTrue("Should match: some columns match the filter criteria", shouldRead); } - private void assertProjectionInclusive(PartitionSpec spec, UnboundPredicate filter, - String expectedLiteral, 
Expression.Operation expectedOp) { + private void assertProjectionInclusive( + PartitionSpec spec, + UnboundPredicate filter, + String expectedLiteral, + Expression.Operation expectedOp) { Expression projection = Projections.inclusive(spec).project(filter); assertProjection(spec, expectedLiteral, projection, expectedOp); } - private void assertProjectionStrict(PartitionSpec spec, UnboundPredicate filter, - String expectedLiteral, Expression.Operation expectedOp) { + private void assertProjectionStrict( + PartitionSpec spec, + UnboundPredicate filter, + String expectedLiteral, + Expression.Operation expectedOp) { Expression projection = Projections.strict(spec).project(filter); assertProjection(spec, expectedLiteral, projection, expectedOp); } - private void assertProjection(PartitionSpec spec, String expectedLiteral, Expression projection, - Expression.Operation expectedOp) { + private void assertProjection( + PartitionSpec spec, + String expectedLiteral, + Expression projection, + Expression.Operation expectedOp) { UnboundPredicate predicate = assertAndUnwrapUnbound(projection); Literal literal = predicate.literal(); - Truncate transform = (Truncate) spec.getFieldsBySourceId(1).get(0).transform(); + Truncate transform = + (Truncate) spec.getFieldsBySourceId(1).get(0).transform(); String output = transform.toHumanString((String) literal.value()); Assert.assertEquals(expectedOp, predicate.op()); diff --git a/api/src/test/java/org/apache/iceberg/transforms/TestProjection.java b/api/src/test/java/org/apache/iceberg/transforms/TestProjection.java index aab821eca297..bc4c3e788ac8 100644 --- a/api/src/test/java/org/apache/iceberg/transforms/TestProjection.java +++ b/api/src/test/java/org/apache/iceberg/transforms/TestProjection.java @@ -16,25 +16,8 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.transforms; -import java.util.List; -import org.apache.iceberg.PartitionSpec; -import org.apache.iceberg.Schema; -import org.apache.iceberg.exceptions.ValidationException; -import org.apache.iceberg.expressions.BoundPredicate; -import org.apache.iceberg.expressions.Expression; -import org.apache.iceberg.expressions.Expressions; -import org.apache.iceberg.expressions.Or; -import org.apache.iceberg.expressions.Projections; -import org.apache.iceberg.expressions.UnboundPredicate; -import org.apache.iceberg.relocated.com.google.common.collect.Lists; -import org.apache.iceberg.types.Types; -import org.assertj.core.api.Assertions; -import org.junit.Assert; -import org.junit.Test; - import static org.apache.iceberg.AssertHelpers.assertThrows; import static org.apache.iceberg.TestHelpers.assertAndUnwrap; import static org.apache.iceberg.TestHelpers.assertAndUnwrapUnbound; @@ -52,27 +35,39 @@ import static org.apache.iceberg.types.Types.NestedField.optional; import static org.apache.iceberg.types.Types.NestedField.required; +import java.util.List; +import org.apache.iceberg.PartitionSpec; +import org.apache.iceberg.Schema; +import org.apache.iceberg.exceptions.ValidationException; +import org.apache.iceberg.expressions.BoundPredicate; +import org.apache.iceberg.expressions.Expression; +import org.apache.iceberg.expressions.Expressions; +import org.apache.iceberg.expressions.Or; +import org.apache.iceberg.expressions.Projections; +import org.apache.iceberg.expressions.UnboundPredicate; +import org.apache.iceberg.relocated.com.google.common.collect.Lists; +import org.apache.iceberg.types.Types; +import org.assertj.core.api.Assertions; +import org.junit.Assert; +import org.junit.Test; + public class TestProjection { - private static final Schema SCHEMA = new Schema( - optional(16, "id", Types.LongType.get()) - ); + private static final Schema SCHEMA = new Schema(optional(16, "id", Types.LongType.get())); @Test public void testIdentityProjection() { - List> predicates = Lists.newArrayList( - Expressions.notNull("id"), - Expressions.isNull("id"), - Expressions.lessThan("id", 100), - Expressions.lessThanOrEqual("id", 101), - Expressions.greaterThan("id", 102), - Expressions.greaterThanOrEqual("id", 103), - Expressions.equal("id", 104), - Expressions.notEqual("id", 105) - ); - - PartitionSpec spec = PartitionSpec.builderFor(SCHEMA) - .identity("id") - .build(); + List> predicates = + Lists.newArrayList( + Expressions.notNull("id"), + Expressions.isNull("id"), + Expressions.lessThan("id", 100), + Expressions.lessThanOrEqual("id", 101), + Expressions.greaterThan("id", 102), + Expressions.greaterThanOrEqual("id", 103), + Expressions.equal("id", 104), + Expressions.notEqual("id", 105)); + + PartitionSpec spec = PartitionSpec.builderFor(SCHEMA).identity("id").build(); for (UnboundPredicate predicate : predicates) { // get the projected predicate @@ -82,13 +77,15 @@ public void testIdentityProjection() { // check inclusive the bound predicate to ensure the types are correct BoundPredicate bound = assertAndUnwrap(predicate.bind(spec.schema().asStruct(), true)); - Assert.assertEquals("Field name should match partition struct field", - "id", projected.ref().name()); + Assert.assertEquals( + "Field name should match partition struct field", "id", projected.ref().name()); Assert.assertEquals("Operation should match", bound.op(), projected.op()); if (bound.isLiteralPredicate()) { - Assert.assertEquals("Literal should be equal", - bound.asLiteralPredicate().literal().value(), 
projected.literal().value()); + Assert.assertEquals( + "Literal should be equal", + bound.asLiteralPredicate().literal().value(), + projected.literal().value()); } else { Assert.assertNull("Literal should be null", projected.literal()); } @@ -97,20 +94,18 @@ public void testIdentityProjection() { @Test public void testCaseInsensitiveIdentityProjection() { - List> predicates = Lists.newArrayList( - Expressions.notNull("ID"), - Expressions.isNull("ID"), - Expressions.lessThan("ID", 100), - Expressions.lessThanOrEqual("ID", 101), - Expressions.greaterThan("ID", 102), - Expressions.greaterThanOrEqual("ID", 103), - Expressions.equal("ID", 104), - Expressions.notEqual("ID", 105) - ); - - PartitionSpec spec = PartitionSpec.builderFor(SCHEMA) - .identity("id") - .build(); + List> predicates = + Lists.newArrayList( + Expressions.notNull("ID"), + Expressions.isNull("ID"), + Expressions.lessThan("ID", 100), + Expressions.lessThanOrEqual("ID", 101), + Expressions.greaterThan("ID", 102), + Expressions.greaterThanOrEqual("ID", 103), + Expressions.equal("ID", 104), + Expressions.notEqual("ID", 105)); + + PartitionSpec spec = PartitionSpec.builderFor(SCHEMA).identity("id").build(); for (UnboundPredicate predicate : predicates) { // get the projected predicate @@ -120,13 +115,15 @@ public void testCaseInsensitiveIdentityProjection() { // check inclusive the bound predicate to ensure the types are correct BoundPredicate bound = assertAndUnwrap(predicate.bind(spec.schema().asStruct(), false)); - Assert.assertEquals("Field name should match partition struct field", - "id", projected.ref().name()); + Assert.assertEquals( + "Field name should match partition struct field", "id", projected.ref().name()); Assert.assertEquals("Operation should match", bound.op(), projected.op()); if (bound.isLiteralPredicate()) { - Assert.assertEquals("Literal should be equal", - bound.asLiteralPredicate().literal().value(), projected.literal().value()); + Assert.assertEquals( + "Literal should be equal", + bound.asLiteralPredicate().literal().value(), + projected.literal().value()); } else { Assert.assertNull("Literal should be null", projected.literal()); } @@ -135,9 +132,7 @@ public void testCaseInsensitiveIdentityProjection() { @Test public void testCaseSensitiveIdentityProjection() { - PartitionSpec spec = PartitionSpec.builderFor(SCHEMA) - .identity("id") - .build(); + PartitionSpec spec = PartitionSpec.builderFor(SCHEMA).identity("id").build(); assertThrows( "X != x when case sensitivity is on", @@ -148,20 +143,18 @@ public void testCaseSensitiveIdentityProjection() { @Test public void testStrictIdentityProjection() { - List> predicates = Lists.newArrayList( - Expressions.notNull("id"), - Expressions.isNull("id"), - Expressions.lessThan("id", 100), - Expressions.lessThanOrEqual("id", 101), - Expressions.greaterThan("id", 102), - Expressions.greaterThanOrEqual("id", 103), - Expressions.equal("id", 104), - Expressions.notEqual("id", 105) - ); - - PartitionSpec spec = PartitionSpec.builderFor(SCHEMA) - .identity("id") - .build(); + List> predicates = + Lists.newArrayList( + Expressions.notNull("id"), + Expressions.isNull("id"), + Expressions.lessThan("id", 100), + Expressions.lessThanOrEqual("id", 101), + Expressions.greaterThan("id", 102), + Expressions.greaterThanOrEqual("id", 103), + Expressions.equal("id", 104), + Expressions.notEqual("id", 105)); + + PartitionSpec spec = PartitionSpec.builderFor(SCHEMA).identity("id").build(); for (UnboundPredicate predicate : predicates) { // get the projected predicate @@ -171,13 
+164,15 @@ public void testStrictIdentityProjection() { // check inclusive the bound predicate to ensure the types are correct BoundPredicate bound = assertAndUnwrap(predicate.bind(spec.schema().asStruct(), true)); - Assert.assertEquals("Field name should match partition struct field", - "id", projected.ref().name()); + Assert.assertEquals( + "Field name should match partition struct field", "id", projected.ref().name()); Assert.assertEquals("Operation should match", bound.op(), projected.op()); if (bound.isLiteralPredicate()) { - Assert.assertEquals("Literal should be equal", - bound.asLiteralPredicate().literal().value(), projected.literal().value()); + Assert.assertEquals( + "Literal should be equal", + bound.asLiteralPredicate().literal().value(), + projected.literal().value()); } else { Assert.assertNull("Literal should be null", projected.literal()); } @@ -186,20 +181,18 @@ public void testStrictIdentityProjection() { @Test public void testCaseInsensitiveStrictIdentityProjection() { - List> predicates = Lists.newArrayList( - Expressions.notNull("ID"), - Expressions.isNull("ID"), - Expressions.lessThan("ID", 100), - Expressions.lessThanOrEqual("ID", 101), - Expressions.greaterThan("ID", 102), - Expressions.greaterThanOrEqual("ID", 103), - Expressions.equal("ID", 104), - Expressions.notEqual("ID", 105) - ); - - PartitionSpec spec = PartitionSpec.builderFor(SCHEMA) - .identity("id") - .build(); + List> predicates = + Lists.newArrayList( + Expressions.notNull("ID"), + Expressions.isNull("ID"), + Expressions.lessThan("ID", 100), + Expressions.lessThanOrEqual("ID", 101), + Expressions.greaterThan("ID", 102), + Expressions.greaterThanOrEqual("ID", 103), + Expressions.equal("ID", 104), + Expressions.notEqual("ID", 105)); + + PartitionSpec spec = PartitionSpec.builderFor(SCHEMA).identity("id").build(); for (UnboundPredicate predicate : predicates) { // get the projected predicate @@ -209,13 +202,15 @@ public void testCaseInsensitiveStrictIdentityProjection() { // check inclusive the bound predicate to ensure the types are correct BoundPredicate bound = assertAndUnwrap(predicate.bind(spec.schema().asStruct(), false)); - Assert.assertEquals("Field name should match partition struct field", - "id", projected.ref().name()); + Assert.assertEquals( + "Field name should match partition struct field", "id", projected.ref().name()); Assert.assertEquals("Operation should match", bound.op(), projected.op()); if (bound.isLiteralPredicate()) { - Assert.assertEquals("Literal should be equal", - bound.asLiteralPredicate().literal().value(), projected.literal().value()); + Assert.assertEquals( + "Literal should be equal", + bound.asLiteralPredicate().literal().value(), + projected.literal().value()); } else { Assert.assertNull("Literal should be null", projected.literal()); } @@ -244,19 +239,21 @@ public void testBadSparkPartitionFilter() { // OR (dateint = 20180415 and hour >= 20) // OR (dateint = 20180417 and hour <= 4) - Schema schema = new Schema( - required(1, "id", Types.LongType.get()), - optional(2, "data", Types.StringType.get()), - required(3, "hour", Types.IntegerType.get()), - required(4, "dateint", Types.IntegerType.get())); + Schema schema = + new Schema( + required(1, "id", Types.LongType.get()), + optional(2, "data", Types.StringType.get()), + required(3, "hour", Types.IntegerType.get()), + required(4, "dateint", Types.IntegerType.get())); - PartitionSpec spec = PartitionSpec.builderFor(schema) - .identity("dateint") - .build(); + PartitionSpec spec = 
PartitionSpec.builderFor(schema).identity("dateint").build(); - Expression filter = or(equal("dateint", 20180416), or( - and(equal("dateint", 20180415), greaterThanOrEqual("hour", 20)), - and(equal("dateint", 20180417), lessThanOrEqual("hour", 4)))); + Expression filter = + or( + equal("dateint", 20180416), + or( + and(equal("dateint", 20180415), greaterThanOrEqual("hour", 20)), + and(equal("dateint", 20180417), lessThanOrEqual("hour", 4)))); Expression projection = Projections.inclusive(spec).project(filter); @@ -278,93 +275,117 @@ public void testBadSparkPartitionFilter() { @Test @SuppressWarnings("unchecked") public void testProjectionNames() { - final Schema schema = new Schema( - required(1, "timestamp1", Types.TimestampType.withoutZone()), - optional(2, "timestamp2", Types.TimestampType.withoutZone()), - optional(3, "timestamp3", Types.TimestampType.withoutZone()), - optional(4, "timestamp4", Types.TimestampType.withoutZone()), - optional(5, "date1", Types.DateType.get()), - optional(6, "date2", Types.DateType.get()), - optional(7, "date3", Types.DateType.get()), - optional(8, "long", Types.LongType.get()), - optional(9, "string", Types.StringType.get()) - ); - - final PartitionSpec partitionSpec = PartitionSpec.builderFor(schema) - .withSpecId(0) - .hour("timestamp1") - .day("timestamp2") - .month("timestamp3") - .year("timestamp4") - .day("date1") - .month("date2") - .year("date3") - .bucket("long", 10) - .truncate("string", 10) - .build(); - - UnboundPredicate predicate = (UnboundPredicate) Projections - .strict(partitionSpec).project(equal(hour("timestamp1"), 20)); - Assert.assertEquals("should expected timestamp1_hour", "timestamp1_hour", predicate.ref().name()); - predicate = (UnboundPredicate) Projections - .inclusive(partitionSpec).project(equal(hour("timestamp1"), 20)); - Assert.assertEquals("should expected timestamp1_hour", "timestamp1_hour", predicate.ref().name()); - - predicate = (UnboundPredicate) Projections - .strict(partitionSpec).project(equal(day("timestamp2"), 20)); + final Schema schema = + new Schema( + required(1, "timestamp1", Types.TimestampType.withoutZone()), + optional(2, "timestamp2", Types.TimestampType.withoutZone()), + optional(3, "timestamp3", Types.TimestampType.withoutZone()), + optional(4, "timestamp4", Types.TimestampType.withoutZone()), + optional(5, "date1", Types.DateType.get()), + optional(6, "date2", Types.DateType.get()), + optional(7, "date3", Types.DateType.get()), + optional(8, "long", Types.LongType.get()), + optional(9, "string", Types.StringType.get())); + + final PartitionSpec partitionSpec = + PartitionSpec.builderFor(schema) + .withSpecId(0) + .hour("timestamp1") + .day("timestamp2") + .month("timestamp3") + .year("timestamp4") + .day("date1") + .month("date2") + .year("date3") + .bucket("long", 10) + .truncate("string", 10) + .build(); + + UnboundPredicate predicate = + (UnboundPredicate) + Projections.strict(partitionSpec).project(equal(hour("timestamp1"), 20)); + Assert.assertEquals( + "should expected timestamp1_hour", "timestamp1_hour", predicate.ref().name()); + predicate = + (UnboundPredicate) + Projections.inclusive(partitionSpec).project(equal(hour("timestamp1"), 20)); + Assert.assertEquals( + "should expected timestamp1_hour", "timestamp1_hour", predicate.ref().name()); + + predicate = + (UnboundPredicate) + Projections.strict(partitionSpec).project(equal(day("timestamp2"), 20)); Assert.assertEquals("should expected timestamp2_day", "timestamp2_day", predicate.ref().name()); - predicate = (UnboundPredicate) Projections - 
.inclusive(partitionSpec).project(equal(day("timestamp2"), 20)); + predicate = + (UnboundPredicate) + Projections.inclusive(partitionSpec).project(equal(day("timestamp2"), 20)); Assert.assertEquals("should expected timestamp2_day", "timestamp2_day", predicate.ref().name()); - predicate = (UnboundPredicate) Projections - .strict(partitionSpec).project(equal(month("timestamp3"), 20)); - Assert.assertEquals("should expected timestamp3_month", "timestamp3_month", predicate.ref().name()); - predicate = (UnboundPredicate) Projections - .inclusive(partitionSpec).project(equal(month("timestamp3"), 20)); - Assert.assertEquals("should expected timestamp3_month", "timestamp3_month", predicate.ref().name()); - - predicate = (UnboundPredicate) Projections - .strict(partitionSpec).project(equal(year("timestamp4"), 20)); - Assert.assertEquals("should expected timestamp4_year", "timestamp4_year", predicate.ref().name()); - predicate = (UnboundPredicate) Projections - .inclusive(partitionSpec).project(equal(year("timestamp4"), 20)); - Assert.assertEquals("should expected timestamp4_year", "timestamp4_year", predicate.ref().name()); - - predicate = (UnboundPredicate) Projections - .strict(partitionSpec).project(equal(day("date1"), 20)); + predicate = + (UnboundPredicate) + Projections.strict(partitionSpec).project(equal(month("timestamp3"), 20)); + Assert.assertEquals( + "should expected timestamp3_month", "timestamp3_month", predicate.ref().name()); + predicate = + (UnboundPredicate) + Projections.inclusive(partitionSpec).project(equal(month("timestamp3"), 20)); + Assert.assertEquals( + "should expected timestamp3_month", "timestamp3_month", predicate.ref().name()); + + predicate = + (UnboundPredicate) + Projections.strict(partitionSpec).project(equal(year("timestamp4"), 20)); + Assert.assertEquals( + "should expected timestamp4_year", "timestamp4_year", predicate.ref().name()); + predicate = + (UnboundPredicate) + Projections.inclusive(partitionSpec).project(equal(year("timestamp4"), 20)); + Assert.assertEquals( + "should expected timestamp4_year", "timestamp4_year", predicate.ref().name()); + + predicate = + (UnboundPredicate) + Projections.strict(partitionSpec).project(equal(day("date1"), 20)); Assert.assertEquals("should expected date1_day", "date1_day", predicate.ref().name()); - predicate = (UnboundPredicate) Projections - .inclusive(partitionSpec).project(equal(day("date1"), 20)); + predicate = + (UnboundPredicate) + Projections.inclusive(partitionSpec).project(equal(day("date1"), 20)); Assert.assertEquals("should expected date1_day", "date1_day", predicate.ref().name()); - predicate = (UnboundPredicate) Projections - .strict(partitionSpec).project(equal(month("date2"), 20)); + predicate = + (UnboundPredicate) + Projections.strict(partitionSpec).project(equal(month("date2"), 20)); Assert.assertEquals("should expected date2_month", "date2_month", predicate.ref().name()); - predicate = (UnboundPredicate) Projections - .inclusive(partitionSpec).project(equal(month("date2"), 20)); + predicate = + (UnboundPredicate) + Projections.inclusive(partitionSpec).project(equal(month("date2"), 20)); Assert.assertEquals("should expected date2_month", "date2_month", predicate.ref().name()); - predicate = (UnboundPredicate) Projections - .strict(partitionSpec).project(equal(year("date3"), 20)); + predicate = + (UnboundPredicate) + Projections.strict(partitionSpec).project(equal(year("date3"), 20)); Assert.assertEquals("should expected date3_year", "date3_year", predicate.ref().name()); - predicate = 
(UnboundPredicate) Projections - .inclusive(partitionSpec).project(equal(year("date3"), 20)); + predicate = + (UnboundPredicate) + Projections.inclusive(partitionSpec).project(equal(year("date3"), 20)); Assert.assertEquals("should expected date3_year", "date3_year", predicate.ref().name()); - predicate = (UnboundPredicate) Projections - .strict(partitionSpec).project(equal(bucket("long", 10), 20)); + predicate = + (UnboundPredicate) + Projections.strict(partitionSpec).project(equal(bucket("long", 10), 20)); Assert.assertEquals("should expected long_bucket", "long_bucket", predicate.ref().name()); - predicate = (UnboundPredicate) Projections - .inclusive(partitionSpec).project(equal(bucket("long", 10), 20)); + predicate = + (UnboundPredicate) + Projections.inclusive(partitionSpec).project(equal(bucket("long", 10), 20)); Assert.assertEquals("should expected long_bucket", "long_bucket", predicate.ref().name()); - predicate = (UnboundPredicate) Projections - .strict(partitionSpec).project(equal(truncate("string", 10), "abc")); + predicate = + (UnboundPredicate) + Projections.strict(partitionSpec).project(equal(truncate("string", 10), "abc")); Assert.assertEquals("should expected string_trunc", "string_trunc", predicate.ref().name()); - predicate = (UnboundPredicate) Projections - .inclusive(partitionSpec).project(equal(truncate("string", 10), "abc")); + predicate = + (UnboundPredicate) + Projections.inclusive(partitionSpec).project(equal(truncate("string", 10), "abc")); Assert.assertEquals("should expected string_trunc", "string_trunc", predicate.ref().name()); } - } diff --git a/api/src/test/java/org/apache/iceberg/transforms/TestResiduals.java b/api/src/test/java/org/apache/iceberg/transforms/TestResiduals.java index 9560009f76cc..dc46964c4686 100644 --- a/api/src/test/java/org/apache/iceberg/transforms/TestResiduals.java +++ b/api/src/test/java/org/apache/iceberg/transforms/TestResiduals.java @@ -16,24 +16,8 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.transforms; -import org.apache.iceberg.PartitionSpec; -import org.apache.iceberg.Schema; -import org.apache.iceberg.TestHelpers.Row; -import org.apache.iceberg.exceptions.ValidationException; -import org.apache.iceberg.expressions.Expression; -import org.apache.iceberg.expressions.Expressions; -import org.apache.iceberg.expressions.Literal; -import org.apache.iceberg.expressions.Predicate; -import org.apache.iceberg.expressions.ResidualEvaluator; -import org.apache.iceberg.expressions.UnboundPredicate; -import org.apache.iceberg.types.Types; -import org.assertj.core.api.Assertions; -import org.junit.Assert; -import org.junit.Test; - import static org.apache.iceberg.TestHelpers.assertAndUnwrapUnbound; import static org.apache.iceberg.expressions.Expression.Operation.GT; import static org.apache.iceberg.expressions.Expression.Operation.LT; @@ -49,24 +33,40 @@ import static org.apache.iceberg.expressions.Expressions.notNaN; import static org.apache.iceberg.expressions.Expressions.or; +import org.apache.iceberg.PartitionSpec; +import org.apache.iceberg.Schema; +import org.apache.iceberg.TestHelpers.Row; +import org.apache.iceberg.exceptions.ValidationException; +import org.apache.iceberg.expressions.Expression; +import org.apache.iceberg.expressions.Expressions; +import org.apache.iceberg.expressions.Literal; +import org.apache.iceberg.expressions.Predicate; +import org.apache.iceberg.expressions.ResidualEvaluator; +import org.apache.iceberg.expressions.UnboundPredicate; +import org.apache.iceberg.types.Types; +import org.assertj.core.api.Assertions; +import org.junit.Assert; +import org.junit.Test; + public class TestResiduals { @Test public void testIdentityTransformResiduals() { - Schema schema = new Schema( - Types.NestedField.optional(50, "dateint", Types.IntegerType.get()), - Types.NestedField.optional(51, "hour", Types.IntegerType.get()) - ); - - PartitionSpec spec = PartitionSpec.builderFor(schema) - .identity("dateint") - .build(); - - ResidualEvaluator resEval = ResidualEvaluator.of(spec, or(or( - and(lessThan("dateint", 20170815), greaterThan("dateint", 20170801)), - and(equal("dateint", 20170815), lessThan("hour", 12))), - and(equal("dateint", 20170801), greaterThan("hour", 11))), - true - ); + Schema schema = + new Schema( + Types.NestedField.optional(50, "dateint", Types.IntegerType.get()), + Types.NestedField.optional(51, "hour", Types.IntegerType.get())); + + PartitionSpec spec = PartitionSpec.builderFor(schema).identity("dateint").build(); + + ResidualEvaluator resEval = + ResidualEvaluator.of( + spec, + or( + or( + and(lessThan("dateint", 20170815), greaterThan("dateint", 20170801)), + and(equal("dateint", 20170815), lessThan("hour", 12))), + and(equal("dateint", 20170801), greaterThan("hour", 11))), + true); // equal to the upper date bound Expression residual = resEval.residualFor(Row.of(20170815)); @@ -93,19 +93,22 @@ public void testIdentityTransformResiduals() { @Test public void testCaseInsensitiveIdentityTransformResiduals() { - Schema schema = new Schema( - Types.NestedField.optional(50, "dateint", Types.IntegerType.get()), - Types.NestedField.optional(51, "hour", Types.IntegerType.get())); - - PartitionSpec spec = PartitionSpec.builderFor(schema) - .identity("dateint") - .build(); - - ResidualEvaluator resEval = ResidualEvaluator.of(spec, or(or( - and(lessThan("DATEINT", 20170815), greaterThan("dateint", 20170801)), - and(equal("dateint", 20170815), lessThan("HOUR", 12))), - and(equal("DateInt", 20170801), greaterThan("hOUr", 11))), - 
false); + Schema schema = + new Schema( + Types.NestedField.optional(50, "dateint", Types.IntegerType.get()), + Types.NestedField.optional(51, "hour", Types.IntegerType.get())); + + PartitionSpec spec = PartitionSpec.builderFor(schema).identity("dateint").build(); + + ResidualEvaluator resEval = + ResidualEvaluator.of( + spec, + or( + or( + and(lessThan("DATEINT", 20170815), greaterThan("dateint", 20170801)), + and(equal("dateint", 20170815), lessThan("HOUR", 12))), + and(equal("DateInt", 20170801), greaterThan("hOUr", 11))), + false); // equal to the upper date bound Expression residual = resEval.residualFor(Row.of(20170815)); @@ -132,14 +135,12 @@ public void testCaseInsensitiveIdentityTransformResiduals() { @Test public void testCaseSensitiveIdentityTransformResiduals() { - Schema schema = new Schema( - Types.NestedField.optional(50, "dateint", Types.IntegerType.get()), - Types.NestedField.optional(51, "hour", Types.IntegerType.get()) - ); + Schema schema = + new Schema( + Types.NestedField.optional(50, "dateint", Types.IntegerType.get()), + Types.NestedField.optional(51, "hour", Types.IntegerType.get())); - PartitionSpec spec = PartitionSpec.builderFor(schema) - .identity("dateint") - .build(); + PartitionSpec spec = PartitionSpec.builderFor(schema).identity("dateint").build(); ResidualEvaluator resEval = ResidualEvaluator.of(spec, lessThan("DATEINT", 20170815), true); @@ -150,41 +151,41 @@ public void testCaseSensitiveIdentityTransformResiduals() { @Test public void testUnpartitionedResiduals() { - Expression[] expressions = new Expression[] { - Expressions.alwaysTrue(), - Expressions.alwaysFalse(), - Expressions.lessThan("a", 5), - Expressions.greaterThanOrEqual("b", 16), - Expressions.notNull("c"), - Expressions.isNull("d"), - Expressions.in("e", 1, 2, 3), - Expressions.notIn("f", 1, 2, 3), - Expressions.notNaN("g"), - Expressions.isNaN("h"), - Expressions.startsWith("data", "abcd"), - Expressions.notStartsWith("data", "abcd") - }; + Expression[] expressions = + new Expression[] { + Expressions.alwaysTrue(), + Expressions.alwaysFalse(), + Expressions.lessThan("a", 5), + Expressions.greaterThanOrEqual("b", 16), + Expressions.notNull("c"), + Expressions.isNull("d"), + Expressions.in("e", 1, 2, 3), + Expressions.notIn("f", 1, 2, 3), + Expressions.notNaN("g"), + Expressions.isNaN("h"), + Expressions.startsWith("data", "abcd"), + Expressions.notStartsWith("data", "abcd") + }; for (Expression expr : expressions) { - ResidualEvaluator residualEvaluator = ResidualEvaluator.of(PartitionSpec.unpartitioned(), expr, true); - Assert.assertEquals("Should return expression", - expr, residualEvaluator.residualFor(Row.of())); + ResidualEvaluator residualEvaluator = + ResidualEvaluator.of(PartitionSpec.unpartitioned(), expr, true); + Assert.assertEquals( + "Should return expression", expr, residualEvaluator.residualFor(Row.of())); } } @Test public void testIn() { - Schema schema = new Schema( - Types.NestedField.optional(50, "dateint", Types.IntegerType.get()), - Types.NestedField.optional(51, "hour", Types.IntegerType.get()) - ); + Schema schema = + new Schema( + Types.NestedField.optional(50, "dateint", Types.IntegerType.get()), + Types.NestedField.optional(51, "hour", Types.IntegerType.get())); - PartitionSpec spec = PartitionSpec.builderFor(schema) - .identity("dateint") - .build(); + PartitionSpec spec = PartitionSpec.builderFor(schema).identity("dateint").build(); - ResidualEvaluator resEval = ResidualEvaluator.of(spec, - in("dateint", 20170815, 20170816, 20170817), true); + ResidualEvaluator 
resEval = + ResidualEvaluator.of(spec, in("dateint", 20170815, 20170816, 20170817), true); Expression residual = resEval.residualFor(Row.of(20170815)); Assert.assertEquals("Residual should be alwaysTrue", alwaysTrue(), residual); @@ -195,26 +196,25 @@ public void testIn() { @Test public void testInTimestamp() { - Schema schema = new Schema( - Types.NestedField.optional(50, "ts", Types.TimestampType.withoutZone()), - Types.NestedField.optional(51, "dateint", Types.IntegerType.get()) - ); + Schema schema = + new Schema( + Types.NestedField.optional(50, "ts", Types.TimestampType.withoutZone()), + Types.NestedField.optional(51, "dateint", Types.IntegerType.get())); - Long date20191201 = (Long) Literal.of("2019-12-01T00:00:00.00000") - .to(Types.TimestampType.withoutZone()).value(); - Long date20191202 = (Long) Literal.of("2019-12-02T00:00:00.00000") - .to(Types.TimestampType.withoutZone()).value(); + Long date20191201 = + (Long) + Literal.of("2019-12-01T00:00:00.00000").to(Types.TimestampType.withoutZone()).value(); + Long date20191202 = + (Long) + Literal.of("2019-12-02T00:00:00.00000").to(Types.TimestampType.withoutZone()).value(); - PartitionSpec spec = PartitionSpec.builderFor(schema) - .day("ts") - .build(); + PartitionSpec spec = PartitionSpec.builderFor(schema).day("ts").build(); Transform day = spec.getFieldsBySourceId(50).get(0).transform(); Integer tsDay = (Integer) day.apply(date20191201); Predicate pred = in("ts", date20191201, date20191202); - ResidualEvaluator resEval = ResidualEvaluator.of(spec, - pred, true); + ResidualEvaluator resEval = ResidualEvaluator.of(spec, pred, true); Expression residual = resEval.residualFor(Row.of(tsDay)); Assert.assertEquals("Residual should be the original in predicate", pred, residual); @@ -225,17 +225,15 @@ public void testInTimestamp() { @Test public void testNotIn() { - Schema schema = new Schema( - Types.NestedField.optional(50, "dateint", Types.IntegerType.get()), - Types.NestedField.optional(51, "hour", Types.IntegerType.get()) - ); + Schema schema = + new Schema( + Types.NestedField.optional(50, "dateint", Types.IntegerType.get()), + Types.NestedField.optional(51, "hour", Types.IntegerType.get())); - PartitionSpec spec = PartitionSpec.builderFor(schema) - .identity("dateint") - .build(); + PartitionSpec spec = PartitionSpec.builderFor(schema).identity("dateint").build(); - ResidualEvaluator resEval = ResidualEvaluator.of(spec, - notIn("dateint", 20170815, 20170816, 20170817), true); + ResidualEvaluator resEval = + ResidualEvaluator.of(spec, notIn("dateint", 20170815, 20170816, 20170817), true); Expression residual = resEval.residualFor(Row.of(20180815)); Assert.assertEquals("Residual should be alwaysTrue", alwaysTrue(), residual); @@ -246,18 +244,15 @@ public void testNotIn() { @Test public void testIsNaN() { - Schema schema = new Schema( - Types.NestedField.optional(50, "double", Types.DoubleType.get()), - Types.NestedField.optional(51, "float", Types.FloatType.get()) - ); + Schema schema = + new Schema( + Types.NestedField.optional(50, "double", Types.DoubleType.get()), + Types.NestedField.optional(51, "float", Types.FloatType.get())); // test double field - PartitionSpec spec = PartitionSpec.builderFor(schema) - .identity("double") - .build(); + PartitionSpec spec = PartitionSpec.builderFor(schema).identity("double").build(); - ResidualEvaluator resEval = ResidualEvaluator.of(spec, - isNaN("double"), true); + ResidualEvaluator resEval = ResidualEvaluator.of(spec, isNaN("double"), true); Expression residual = 
resEval.residualFor(Row.of(Double.NaN)); Assert.assertEquals("Residual should be alwaysTrue", alwaysTrue(), residual); @@ -266,12 +261,9 @@ public void testIsNaN() { Assert.assertEquals("Residual should be alwaysFalse", alwaysFalse(), residual); // test float field - spec = PartitionSpec.builderFor(schema) - .identity("float") - .build(); + spec = PartitionSpec.builderFor(schema).identity("float").build(); - resEval = ResidualEvaluator.of(spec, - isNaN("float"), true); + resEval = ResidualEvaluator.of(spec, isNaN("float"), true); residual = resEval.residualFor(Row.of(Float.NaN)); Assert.assertEquals("Residual should be alwaysTrue", alwaysTrue(), residual); @@ -282,18 +274,15 @@ public void testIsNaN() { @Test public void testNotNaN() { - Schema schema = new Schema( - Types.NestedField.optional(50, "double", Types.DoubleType.get()), - Types.NestedField.optional(51, "float", Types.FloatType.get()) - ); + Schema schema = + new Schema( + Types.NestedField.optional(50, "double", Types.DoubleType.get()), + Types.NestedField.optional(51, "float", Types.FloatType.get())); // test double field - PartitionSpec spec = PartitionSpec.builderFor(schema) - .identity("double") - .build(); + PartitionSpec spec = PartitionSpec.builderFor(schema).identity("double").build(); - ResidualEvaluator resEval = ResidualEvaluator.of(spec, - notNaN("double"), true); + ResidualEvaluator resEval = ResidualEvaluator.of(spec, notNaN("double"), true); Expression residual = resEval.residualFor(Row.of(Double.NaN)); Assert.assertEquals("Residual should be alwaysFalse", alwaysFalse(), residual); @@ -302,12 +291,9 @@ public void testNotNaN() { Assert.assertEquals("Residual should be alwaysTrue", alwaysTrue(), residual); // test float field - spec = PartitionSpec.builderFor(schema) - .identity("float") - .build(); + spec = PartitionSpec.builderFor(schema).identity("float").build(); - resEval = ResidualEvaluator.of(spec, - notNaN("float"), true); + resEval = ResidualEvaluator.of(spec, notNaN("float"), true); residual = resEval.residualFor(Row.of(Float.NaN)); Assert.assertEquals("Residual should be alwaysFalse", alwaysFalse(), residual); @@ -318,26 +304,25 @@ public void testNotNaN() { @Test public void testNotInTimestamp() { - Schema schema = new Schema( - Types.NestedField.optional(50, "ts", Types.TimestampType.withoutZone()), - Types.NestedField.optional(51, "dateint", Types.IntegerType.get()) - ); + Schema schema = + new Schema( + Types.NestedField.optional(50, "ts", Types.TimestampType.withoutZone()), + Types.NestedField.optional(51, "dateint", Types.IntegerType.get())); - Long date20191201 = (Long) Literal.of("2019-12-01T00:00:00.00000") - .to(Types.TimestampType.withoutZone()).value(); - Long date20191202 = (Long) Literal.of("2019-12-02T00:00:00.00000") - .to(Types.TimestampType.withoutZone()).value(); + Long date20191201 = + (Long) + Literal.of("2019-12-01T00:00:00.00000").to(Types.TimestampType.withoutZone()).value(); + Long date20191202 = + (Long) + Literal.of("2019-12-02T00:00:00.00000").to(Types.TimestampType.withoutZone()).value(); - PartitionSpec spec = PartitionSpec.builderFor(schema) - .day("ts") - .build(); + PartitionSpec spec = PartitionSpec.builderFor(schema).day("ts").build(); Transform day = spec.getFieldsBySourceId(50).get(0).transform(); Integer tsDay = (Integer) day.apply(date20191201); Predicate pred = notIn("ts", date20191201, date20191202); - ResidualEvaluator resEval = ResidualEvaluator.of(spec, - pred, true); + ResidualEvaluator resEval = ResidualEvaluator.of(spec, pred, true); Expression residual = 
resEval.residualFor(Row.of(tsDay)); Assert.assertEquals("Residual should be the original notIn predicate", pred, residual); diff --git a/api/src/test/java/org/apache/iceberg/transforms/TestStartsWith.java b/api/src/test/java/org/apache/iceberg/transforms/TestStartsWith.java index e694d279880b..90143aa7e856 100644 --- a/api/src/test/java/org/apache/iceberg/transforms/TestStartsWith.java +++ b/api/src/test/java/org/apache/iceberg/transforms/TestStartsWith.java @@ -16,9 +16,12 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.transforms; +import static org.apache.iceberg.TestHelpers.assertAndUnwrapUnbound; +import static org.apache.iceberg.expressions.Expressions.startsWith; +import static org.apache.iceberg.types.Types.NestedField.optional; + import org.apache.iceberg.PartitionSpec; import org.apache.iceberg.Schema; import org.apache.iceberg.TestHelpers; @@ -35,10 +38,6 @@ import org.junit.Assert; import org.junit.Test; -import static org.apache.iceberg.TestHelpers.assertAndUnwrapUnbound; -import static org.apache.iceberg.expressions.Expressions.startsWith; -import static org.apache.iceberg.types.Types.NestedField.optional; - public class TestStartsWith { private static final String COLUMN = "someStringCol"; @@ -48,9 +47,11 @@ public class TestStartsWith { public void testTruncateProjections() { PartitionSpec spec = PartitionSpec.builderFor(SCHEMA).truncate(COLUMN, 4).build(); - assertProjectionInclusive(spec, startsWith(COLUMN, "ab"), "ab", Expression.Operation.STARTS_WITH); + assertProjectionInclusive( + spec, startsWith(COLUMN, "ab"), "ab", Expression.Operation.STARTS_WITH); assertProjectionInclusive(spec, startsWith(COLUMN, "abab"), "abab", Expression.Operation.EQ); - assertProjectionInclusive(spec, startsWith(COLUMN, "ababab"), "abab", Expression.Operation.STARTS_WITH); + assertProjectionInclusive( + spec, startsWith(COLUMN, "ababab"), "abab", Expression.Operation.STARTS_WITH); assertProjectionStrict(spec, startsWith(COLUMN, "ab"), "ab", Expression.Operation.STARTS_WITH); assertProjectionStrict(spec, startsWith(COLUMN, "abab"), "abab", Expression.Operation.EQ); @@ -63,32 +64,44 @@ public void testTruncateProjections() { public void testTruncateString() { Truncate trunc = Truncate.get(Types.StringType.get(), 2); Expression expr = startsWith(COLUMN, "abcde"); - BoundPredicate boundExpr = (BoundPredicate) Binder.bind(SCHEMA.asStruct(), expr, false); + BoundPredicate boundExpr = + (BoundPredicate) Binder.bind(SCHEMA.asStruct(), expr, false); UnboundPredicate projected = trunc.project(COLUMN, boundExpr); Evaluator evaluator = new Evaluator(SCHEMA.asStruct(), projected); - Assert.assertTrue("startsWith(abcde, truncate(abcdg,2)) => true", + Assert.assertTrue( + "startsWith(abcde, truncate(abcdg,2)) => true", evaluator.eval(TestHelpers.Row.of("abcdg"))); } - private void assertProjectionInclusive(PartitionSpec spec, UnboundPredicate filter, - String expectedLiteral, Expression.Operation expectedOp) { + private void assertProjectionInclusive( + PartitionSpec spec, + UnboundPredicate filter, + String expectedLiteral, + Expression.Operation expectedOp) { Expression projection = Projections.inclusive(spec).project(filter); assertProjection(spec, expectedLiteral, projection, expectedOp); } - private void assertProjectionStrict(PartitionSpec spec, UnboundPredicate filter, - String expectedLiteral, Expression.Operation expectedOp) { + private void assertProjectionStrict( + PartitionSpec spec, + UnboundPredicate filter, + String 
expectedLiteral, + Expression.Operation expectedOp) { Expression projection = Projections.strict(spec).project(filter); assertProjection(spec, expectedLiteral, projection, expectedOp); } - private void assertProjection(PartitionSpec spec, String expectedLiteral, Expression projection, - Expression.Operation expectedOp) { + private void assertProjection( + PartitionSpec spec, + String expectedLiteral, + Expression projection, + Expression.Operation expectedOp) { UnboundPredicate predicate = assertAndUnwrapUnbound(projection); Literal literal = predicate.literal(); - Truncate transform = (Truncate) spec.getFieldsBySourceId(1).get(0).transform(); + Truncate transform = + (Truncate) spec.getFieldsBySourceId(1).get(0).transform(); String output = transform.toHumanString((String) literal.value()); Assert.assertEquals(expectedOp, predicate.op()); diff --git a/api/src/test/java/org/apache/iceberg/transforms/TestTimestamps.java b/api/src/test/java/org/apache/iceberg/transforms/TestTimestamps.java index 8a4ba8a40e37..c5ff6788dbd2 100644 --- a/api/src/test/java/org/apache/iceberg/transforms/TestTimestamps.java +++ b/api/src/test/java/org/apache/iceberg/transforms/TestTimestamps.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.transforms; import org.apache.iceberg.expressions.Literal; @@ -60,20 +59,28 @@ public void testTimestampWithoutZoneToHumanString() { Literal date = Literal.of("2017-12-01T10:12:55.038194").to(type); Transform year = Transforms.year(type); - Assert.assertEquals("Should produce the correct Human string", - "2017", year.toHumanString(year.apply(date.value()))); + Assert.assertEquals( + "Should produce the correct Human string", + "2017", + year.toHumanString(year.apply(date.value()))); Transform month = Transforms.month(type); - Assert.assertEquals("Should produce the correct Human string", - "2017-12", month.toHumanString(month.apply(date.value()))); + Assert.assertEquals( + "Should produce the correct Human string", + "2017-12", + month.toHumanString(month.apply(date.value()))); Transform day = Transforms.day(type); - Assert.assertEquals("Should produce the correct Human string", - "2017-12-01", day.toHumanString(day.apply(date.value()))); + Assert.assertEquals( + "Should produce the correct Human string", + "2017-12-01", + day.toHumanString(day.apply(date.value()))); Transform hour = Transforms.hour(type); - Assert.assertEquals("Should produce the correct Human string", - "2017-12-01-10", hour.toHumanString(hour.apply(date.value()))); + Assert.assertEquals( + "Should produce the correct Human string", + "2017-12-01-10", + hour.toHumanString(hour.apply(date.value()))); } @Test @@ -82,20 +89,28 @@ public void testNegativeTimestampWithoutZoneToHumanString() { Literal date = Literal.of("1969-12-30T10:12:55.038194").to(type); Transform year = Transforms.year(type); - Assert.assertEquals("Should produce the correct Human string", - "1969", year.toHumanString(year.apply(date.value()))); + Assert.assertEquals( + "Should produce the correct Human string", + "1969", + year.toHumanString(year.apply(date.value()))); Transform month = Transforms.month(type); - Assert.assertEquals("Should produce the correct Human string", - "1969-12", month.toHumanString(month.apply(date.value()))); + Assert.assertEquals( + "Should produce the correct Human string", + "1969-12", + month.toHumanString(month.apply(date.value()))); Transform day = Transforms.day(type); - Assert.assertEquals("Should produce the correct Human 
string", - "1969-12-30", day.toHumanString(day.apply(date.value()))); + Assert.assertEquals( + "Should produce the correct Human string", + "1969-12-30", + day.toHumanString(day.apply(date.value()))); Transform hour = Transforms.hour(type); - Assert.assertEquals("Should produce the correct Human string", - "1969-12-30-10", hour.toHumanString(hour.apply(date.value()))); + Assert.assertEquals( + "Should produce the correct Human string", + "1969-12-30-10", + hour.toHumanString(hour.apply(date.value()))); } @Test @@ -104,20 +119,28 @@ public void testNegativeTimestampWithoutZoneToHumanStringLowerBound() { Literal date = Literal.of("1969-12-30T00:00:00.000000").to(type); Transform year = Transforms.year(type); - Assert.assertEquals("Should produce the correct Human string", - "1969", year.toHumanString(year.apply(date.value()))); + Assert.assertEquals( + "Should produce the correct Human string", + "1969", + year.toHumanString(year.apply(date.value()))); Transform month = Transforms.month(type); - Assert.assertEquals("Should produce the correct Human string", - "1969-12", month.toHumanString(month.apply(date.value()))); + Assert.assertEquals( + "Should produce the correct Human string", + "1969-12", + month.toHumanString(month.apply(date.value()))); Transform day = Transforms.day(type); - Assert.assertEquals("Should produce the correct Human string", - "1969-12-30", day.toHumanString(day.apply(date.value()))); + Assert.assertEquals( + "Should produce the correct Human string", + "1969-12-30", + day.toHumanString(day.apply(date.value()))); Transform hour = Transforms.hour(type); - Assert.assertEquals("Should produce the correct Human string", - "1969-12-30-00", hour.toHumanString(hour.apply(date.value()))); + Assert.assertEquals( + "Should produce the correct Human string", + "1969-12-30-00", + hour.toHumanString(hour.apply(date.value()))); } @Test @@ -126,20 +149,28 @@ public void testNegativeTimestampWithoutZoneToHumanStringUpperBound() { Literal date = Literal.of("1969-12-31T23:59:59.999999").to(type); Transform year = Transforms.year(type); - Assert.assertEquals("Should produce the correct Human string", - "1969", year.toHumanString(year.apply(date.value()))); + Assert.assertEquals( + "Should produce the correct Human string", + "1969", + year.toHumanString(year.apply(date.value()))); Transform month = Transforms.month(type); - Assert.assertEquals("Should produce the correct Human string", - "1969-12", month.toHumanString(month.apply(date.value()))); + Assert.assertEquals( + "Should produce the correct Human string", + "1969-12", + month.toHumanString(month.apply(date.value()))); Transform day = Transforms.day(type); - Assert.assertEquals("Should produce the correct Human string", - "1969-12-31", day.toHumanString(day.apply(date.value()))); + Assert.assertEquals( + "Should produce the correct Human string", + "1969-12-31", + day.toHumanString(day.apply(date.value()))); Transform hour = Transforms.hour(type); - Assert.assertEquals("Should produce the correct Human string", - "1969-12-31-23", hour.toHumanString(hour.apply(date.value()))); + Assert.assertEquals( + "Should produce the correct Human string", + "1969-12-31-23", + hour.toHumanString(hour.apply(date.value()))); } @Test @@ -148,34 +179,42 @@ public void testTimestampWithZoneToHumanString() { Literal date = Literal.of("2017-12-01T10:12:55.038194-08:00").to(type); Transform year = Transforms.year(type); - Assert.assertEquals("Should produce the correct Human string", - "2017", year.toHumanString(year.apply(date.value()))); + 
Assert.assertEquals( + "Should produce the correct Human string", + "2017", + year.toHumanString(year.apply(date.value()))); Transform month = Transforms.month(type); - Assert.assertEquals("Should produce the correct Human string", - "2017-12", month.toHumanString(month.apply(date.value()))); + Assert.assertEquals( + "Should produce the correct Human string", + "2017-12", + month.toHumanString(month.apply(date.value()))); Transform day = Transforms.day(type); - Assert.assertEquals("Should produce the correct Human string", - "2017-12-01", day.toHumanString(day.apply(date.value()))); + Assert.assertEquals( + "Should produce the correct Human string", + "2017-12-01", + day.toHumanString(day.apply(date.value()))); // the hour is 18 because the value is always UTC Transform hour = Transforms.hour(type); - Assert.assertEquals("Should produce the correct Human string", - "2017-12-01-18", hour.toHumanString(hour.apply(date.value()))); + Assert.assertEquals( + "Should produce the correct Human string", + "2017-12-01-18", + hour.toHumanString(hour.apply(date.value()))); } @Test public void testNullHumanString() { Types.TimestampType type = Types.TimestampType.withZone(); - Assert.assertEquals("Should produce \"null\" for null", - "null", Transforms.year(type).toHumanString(null)); - Assert.assertEquals("Should produce \"null\" for null", - "null", Transforms.month(type).toHumanString(null)); - Assert.assertEquals("Should produce \"null\" for null", - "null", Transforms.day(type).toHumanString(null)); - Assert.assertEquals("Should produce \"null\" for null", - "null", Transforms.hour(type).toHumanString(null)); + Assert.assertEquals( + "Should produce \"null\" for null", "null", Transforms.year(type).toHumanString(null)); + Assert.assertEquals( + "Should produce \"null\" for null", "null", Transforms.month(type).toHumanString(null)); + Assert.assertEquals( + "Should produce \"null\" for null", "null", Transforms.day(type).toHumanString(null)); + Assert.assertEquals( + "Should produce \"null\" for null", "null", Transforms.hour(type).toHumanString(null)); } @Test diff --git a/api/src/test/java/org/apache/iceberg/transforms/TestTimestampsProjection.java b/api/src/test/java/org/apache/iceberg/transforms/TestTimestampsProjection.java index d59ecf00ac11..3ed5f9bac84a 100644 --- a/api/src/test/java/org/apache/iceberg/transforms/TestTimestampsProjection.java +++ b/api/src/test/java/org/apache/iceberg/transforms/TestTimestampsProjection.java @@ -16,9 +16,19 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.transforms; +import static org.apache.iceberg.TestHelpers.assertAndUnwrapUnbound; +import static org.apache.iceberg.expressions.Expressions.equal; +import static org.apache.iceberg.expressions.Expressions.greaterThan; +import static org.apache.iceberg.expressions.Expressions.greaterThanOrEqual; +import static org.apache.iceberg.expressions.Expressions.in; +import static org.apache.iceberg.expressions.Expressions.lessThan; +import static org.apache.iceberg.expressions.Expressions.lessThanOrEqual; +import static org.apache.iceberg.expressions.Expressions.notEqual; +import static org.apache.iceberg.expressions.Expressions.notIn; +import static org.apache.iceberg.types.Types.NestedField.optional; + import java.util.stream.Collectors; import org.apache.iceberg.PartitionSpec; import org.apache.iceberg.Schema; @@ -32,36 +42,33 @@ import org.junit.Assert; import org.junit.Test; -import static org.apache.iceberg.TestHelpers.assertAndUnwrapUnbound; -import static org.apache.iceberg.expressions.Expressions.equal; -import static org.apache.iceberg.expressions.Expressions.greaterThan; -import static org.apache.iceberg.expressions.Expressions.greaterThanOrEqual; -import static org.apache.iceberg.expressions.Expressions.in; -import static org.apache.iceberg.expressions.Expressions.lessThan; -import static org.apache.iceberg.expressions.Expressions.lessThanOrEqual; -import static org.apache.iceberg.expressions.Expressions.notEqual; -import static org.apache.iceberg.expressions.Expressions.notIn; -import static org.apache.iceberg.types.Types.NestedField.optional; - public class TestTimestampsProjection { private static final Types.TimestampType TYPE = Types.TimestampType.withoutZone(); private static final Schema SCHEMA = new Schema(optional(1, "timestamp", TYPE)); - public void assertProjectionStrict(PartitionSpec spec, UnboundPredicate filter, - Expression.Operation expectedOp, String expectedLiteral) { + public void assertProjectionStrict( + PartitionSpec spec, + UnboundPredicate filter, + Expression.Operation expectedOp, + String expectedLiteral) { Expression projection = Projections.strict(spec).project(filter); UnboundPredicate predicate = assertAndUnwrapUnbound(projection); Assert.assertEquals(expectedOp, predicate.op()); - Assert.assertNotEquals("Strict projection never runs for IN", Expression.Operation.IN, predicate.op()); + Assert.assertNotEquals( + "Strict projection never runs for IN", Expression.Operation.IN, predicate.op()); Timestamps transform = (Timestamps) spec.getFieldsBySourceId(1).get(0).transform(); if (predicate.op() == Expression.Operation.NOT_IN) { Iterable values = Iterables.transform(predicate.literals(), Literal::value); - String actual = Lists.newArrayList(values).stream().sorted() - .map(v -> transform.toHumanString((Integer) v)).collect(Collectors.toList()).toString(); + String actual = + Lists.newArrayList(values).stream() + .sorted() + .map(v -> transform.toHumanString((Integer) v)) + .collect(Collectors.toList()) + .toString(); Assert.assertEquals(expectedLiteral, actual); } else { Literal literal = predicate.literal(); @@ -70,34 +77,42 @@ public void assertProjectionStrict(PartitionSpec spec, UnboundPredicate filte } } - public void assertProjectionStrictValue(PartitionSpec spec, UnboundPredicate filter, - Expression.Operation expectedOp) { + public void assertProjectionStrictValue( + PartitionSpec spec, UnboundPredicate filter, Expression.Operation expectedOp) { Expression projection = Projections.strict(spec).project(filter); 
Assert.assertEquals(expectedOp, projection.op()); } - public void assertProjectionInclusiveValue(PartitionSpec spec, UnboundPredicate filter, - Expression.Operation expectedOp) { + public void assertProjectionInclusiveValue( + PartitionSpec spec, UnboundPredicate filter, Expression.Operation expectedOp) { Expression projection = Projections.inclusive(spec).project(filter); Assert.assertEquals(expectedOp, projection.op()); } - public void assertProjectionInclusive(PartitionSpec spec, UnboundPredicate filter, - Expression.Operation expectedOp, String expectedLiteral) { + public void assertProjectionInclusive( + PartitionSpec spec, + UnboundPredicate filter, + Expression.Operation expectedOp, + String expectedLiteral) { Expression projection = Projections.inclusive(spec).project(filter); UnboundPredicate predicate = assertAndUnwrapUnbound(projection); Assert.assertEquals(expectedOp, predicate.op()); - Assert.assertNotEquals("Inclusive projection never runs for NOT_IN", Expression.Operation.NOT_IN, predicate.op()); + Assert.assertNotEquals( + "Inclusive projection never runs for NOT_IN", Expression.Operation.NOT_IN, predicate.op()); Timestamps transform = (Timestamps) spec.getFieldsBySourceId(1).get(0).transform(); if (predicate.op() == Expression.Operation.IN) { Iterable values = Iterables.transform(predicate.literals(), Literal::value); - String actual = Lists.newArrayList(values).stream().sorted() - .map(v -> transform.toHumanString((Integer) v)).collect(Collectors.toList()).toString(); + String actual = + Lists.newArrayList(values).stream() + .sorted() + .map(v -> transform.toHumanString((Integer) v)) + .collect(Collectors.toList()) + .toString(); Assert.assertEquals(expectedLiteral, actual); } else { Literal literal = predicate.literal(); @@ -111,17 +126,26 @@ public void testDayStrictEpoch() { Long date = (long) Literal.of("1970-01-01T00:00:00.00000").to(TYPE).value(); PartitionSpec spec = PartitionSpec.builderFor(SCHEMA).day("timestamp").build(); - assertProjectionStrict(spec, lessThan("timestamp", date), Expression.Operation.LT, "1970-01-01"); - assertProjectionStrict(spec, lessThanOrEqual("timestamp", date), Expression.Operation.LT, "1970-01-01"); - assertProjectionStrict(spec, greaterThan("timestamp", date), Expression.Operation.GT, "1970-01-02"); - assertProjectionStrict(spec, greaterThanOrEqual("timestamp", date), Expression.Operation.GT, "1970-01-01"); - assertProjectionStrict(spec, notEqual("timestamp", date), Expression.Operation.NOT_EQ, "1970-01-01"); + assertProjectionStrict( + spec, lessThan("timestamp", date), Expression.Operation.LT, "1970-01-01"); + assertProjectionStrict( + spec, lessThanOrEqual("timestamp", date), Expression.Operation.LT, "1970-01-01"); + assertProjectionStrict( + spec, greaterThan("timestamp", date), Expression.Operation.GT, "1970-01-02"); + assertProjectionStrict( + spec, greaterThanOrEqual("timestamp", date), Expression.Operation.GT, "1970-01-01"); + assertProjectionStrict( + spec, notEqual("timestamp", date), Expression.Operation.NOT_EQ, "1970-01-01"); assertProjectionStrictValue(spec, equal("timestamp", date), Expression.Operation.FALSE); Long anotherDate = (long) Literal.of("1970-01-02T00:00:00.00000").to(TYPE).value(); - assertProjectionStrict(spec, notIn("timestamp", date, anotherDate), - Expression.Operation.NOT_IN, "[1970-01-01, 1970-01-02]"); - assertProjectionStrictValue(spec, in("timestamp", date, anotherDate), Expression.Operation.FALSE); + assertProjectionStrict( + spec, + notIn("timestamp", date, anotherDate), + 
Expression.Operation.NOT_IN, + "[1970-01-01, 1970-01-02]"); + assertProjectionStrictValue( + spec, in("timestamp", date, anotherDate), Expression.Operation.FALSE); } @Test @@ -129,17 +153,26 @@ public void testDayInclusiveEpoch() { Long date = (long) Literal.of("1970-01-01T00:00:00.00000").to(TYPE).value(); PartitionSpec spec = PartitionSpec.builderFor(SCHEMA).day("timestamp").build(); - assertProjectionInclusive(spec, lessThan("timestamp", date), Expression.Operation.LT_EQ, "1970-01-01"); - assertProjectionInclusive(spec, lessThanOrEqual("timestamp", date), Expression.Operation.LT_EQ, "1970-01-01"); - assertProjectionInclusive(spec, greaterThan("timestamp", date), Expression.Operation.GT_EQ, "1970-01-01"); - assertProjectionInclusive(spec, greaterThanOrEqual("timestamp", date), Expression.Operation.GT_EQ, "1970-01-01"); - assertProjectionInclusive(spec, equal("timestamp", date), Expression.Operation.EQ, "1970-01-01"); + assertProjectionInclusive( + spec, lessThan("timestamp", date), Expression.Operation.LT_EQ, "1970-01-01"); + assertProjectionInclusive( + spec, lessThanOrEqual("timestamp", date), Expression.Operation.LT_EQ, "1970-01-01"); + assertProjectionInclusive( + spec, greaterThan("timestamp", date), Expression.Operation.GT_EQ, "1970-01-01"); + assertProjectionInclusive( + spec, greaterThanOrEqual("timestamp", date), Expression.Operation.GT_EQ, "1970-01-01"); + assertProjectionInclusive( + spec, equal("timestamp", date), Expression.Operation.EQ, "1970-01-01"); assertProjectionInclusiveValue(spec, notEqual("timestamp", date), Expression.Operation.TRUE); Long anotherDate = (long) Literal.of("1970-01-02T00:00:00.00000").to(TYPE).value(); - assertProjectionInclusive(spec, in("timestamp", date, anotherDate), - Expression.Operation.IN, "[1970-01-01, 1970-01-02]"); - assertProjectionInclusiveValue(spec, notIn("timestamp", date, anotherDate), Expression.Operation.TRUE); + assertProjectionInclusive( + spec, + in("timestamp", date, anotherDate), + Expression.Operation.IN, + "[1970-01-01, 1970-01-02]"); + assertProjectionInclusiveValue( + spec, notIn("timestamp", date, anotherDate), Expression.Operation.TRUE); } @Test @@ -148,16 +181,24 @@ public void testMonthStrictLowerBound() { PartitionSpec spec = PartitionSpec.builderFor(SCHEMA).month("timestamp").build(); assertProjectionStrict(spec, lessThan("timestamp", date), Expression.Operation.LT, "2017-12"); - assertProjectionStrict(spec, lessThanOrEqual("timestamp", date), Expression.Operation.LT, "2017-12"); - assertProjectionStrict(spec, greaterThan("timestamp", date), Expression.Operation.GT, "2017-12"); - assertProjectionStrict(spec, greaterThanOrEqual("timestamp", date), Expression.Operation.GT, "2017-11"); - assertProjectionStrict(spec, notEqual("timestamp", date), Expression.Operation.NOT_EQ, "2017-12"); + assertProjectionStrict( + spec, lessThanOrEqual("timestamp", date), Expression.Operation.LT, "2017-12"); + assertProjectionStrict( + spec, greaterThan("timestamp", date), Expression.Operation.GT, "2017-12"); + assertProjectionStrict( + spec, greaterThanOrEqual("timestamp", date), Expression.Operation.GT, "2017-11"); + assertProjectionStrict( + spec, notEqual("timestamp", date), Expression.Operation.NOT_EQ, "2017-12"); assertProjectionStrictValue(spec, equal("timestamp", date), Expression.Operation.FALSE); Long anotherDate = (long) Literal.of("2017-12-02T00:00:00.00000").to(TYPE).value(); - assertProjectionStrict(spec, notIn("timestamp", anotherDate, date), - Expression.Operation.NOT_IN, "[2017-12, 2017-12]"); - 
assertProjectionStrictValue(spec, in("timestamp", anotherDate, date), Expression.Operation.FALSE); + assertProjectionStrict( + spec, + notIn("timestamp", anotherDate, date), + Expression.Operation.NOT_IN, + "[2017-12, 2017-12]"); + assertProjectionStrictValue( + spec, in("timestamp", anotherDate, date), Expression.Operation.FALSE); } @Test @@ -166,16 +207,24 @@ public void testNegativeMonthStrictLowerBound() { PartitionSpec spec = PartitionSpec.builderFor(SCHEMA).month("timestamp").build(); assertProjectionStrict(spec, lessThan("timestamp", date), Expression.Operation.LT, "1969-01"); - assertProjectionStrict(spec, lessThanOrEqual("timestamp", date), Expression.Operation.LT, "1969-01"); - assertProjectionStrict(spec, greaterThan("timestamp", date), Expression.Operation.GT, "1969-02"); - assertProjectionStrict(spec, greaterThanOrEqual("timestamp", date), Expression.Operation.GT, "1969-01"); - assertProjectionStrict(spec, notEqual("timestamp", date), Expression.Operation.NOT_IN, "[1969-01, 1969-02]"); + assertProjectionStrict( + spec, lessThanOrEqual("timestamp", date), Expression.Operation.LT, "1969-01"); + assertProjectionStrict( + spec, greaterThan("timestamp", date), Expression.Operation.GT, "1969-02"); + assertProjectionStrict( + spec, greaterThanOrEqual("timestamp", date), Expression.Operation.GT, "1969-01"); + assertProjectionStrict( + spec, notEqual("timestamp", date), Expression.Operation.NOT_IN, "[1969-01, 1969-02]"); assertProjectionStrictValue(spec, equal("timestamp", date), Expression.Operation.FALSE); Long anotherDate = (long) Literal.of("1969-03-01T00:00:00.00000").to(TYPE).value(); - assertProjectionStrict(spec, notIn("timestamp", anotherDate, date), - Expression.Operation.NOT_IN, "[1969-01, 1969-02, 1969-03, 1969-04]"); - assertProjectionStrictValue(spec, in("timestamp", anotherDate, date), Expression.Operation.FALSE); + assertProjectionStrict( + spec, + notIn("timestamp", anotherDate, date), + Expression.Operation.NOT_IN, + "[1969-01, 1969-02, 1969-03, 1969-04]"); + assertProjectionStrictValue( + spec, in("timestamp", anotherDate, date), Expression.Operation.FALSE); } @Test @@ -184,16 +233,24 @@ public void testMonthStrictUpperBound() { PartitionSpec spec = PartitionSpec.builderFor(SCHEMA).month("timestamp").build(); assertProjectionStrict(spec, lessThan("timestamp", date), Expression.Operation.LT, "2017-12"); - assertProjectionStrict(spec, lessThanOrEqual("timestamp", date), Expression.Operation.LT, "2018-01"); - assertProjectionStrict(spec, greaterThan("timestamp", date), Expression.Operation.GT, "2017-12"); - assertProjectionStrict(spec, greaterThanOrEqual("timestamp", date), Expression.Operation.GT, "2017-12"); - assertProjectionStrict(spec, notEqual("timestamp", date), Expression.Operation.NOT_EQ, "2017-12"); + assertProjectionStrict( + spec, lessThanOrEqual("timestamp", date), Expression.Operation.LT, "2018-01"); + assertProjectionStrict( + spec, greaterThan("timestamp", date), Expression.Operation.GT, "2017-12"); + assertProjectionStrict( + spec, greaterThanOrEqual("timestamp", date), Expression.Operation.GT, "2017-12"); + assertProjectionStrict( + spec, notEqual("timestamp", date), Expression.Operation.NOT_EQ, "2017-12"); assertProjectionStrictValue(spec, equal("timestamp", date), Expression.Operation.FALSE); Long anotherDate = (long) Literal.of("2017-11-02T00:00:00.00000").to(TYPE).value(); - assertProjectionStrict(spec, notIn("timestamp", anotherDate, date), - Expression.Operation.NOT_IN, "[2017-11, 2017-12]"); - assertProjectionStrictValue(spec, in("timestamp", 
anotherDate, date), Expression.Operation.FALSE); + assertProjectionStrict( + spec, + notIn("timestamp", anotherDate, date), + Expression.Operation.NOT_IN, + "[2017-11, 2017-12]"); + assertProjectionStrictValue( + spec, in("timestamp", anotherDate, date), Expression.Operation.FALSE); } @Test @@ -202,16 +259,24 @@ public void testNegativeMonthStrictUpperBound() { PartitionSpec spec = PartitionSpec.builderFor(SCHEMA).month("timestamp").build(); assertProjectionStrict(spec, lessThan("timestamp", date), Expression.Operation.LT, "1969-12"); - assertProjectionStrict(spec, lessThanOrEqual("timestamp", date), Expression.Operation.LT, "1970-01"); - assertProjectionStrict(spec, greaterThan("timestamp", date), Expression.Operation.GT, "1970-01"); - assertProjectionStrict(spec, greaterThanOrEqual("timestamp", date), Expression.Operation.GT, "1970-01"); - assertProjectionStrict(spec, notEqual("timestamp", date), Expression.Operation.NOT_IN, "[1969-12, 1970-01]"); + assertProjectionStrict( + spec, lessThanOrEqual("timestamp", date), Expression.Operation.LT, "1970-01"); + assertProjectionStrict( + spec, greaterThan("timestamp", date), Expression.Operation.GT, "1970-01"); + assertProjectionStrict( + spec, greaterThanOrEqual("timestamp", date), Expression.Operation.GT, "1970-01"); + assertProjectionStrict( + spec, notEqual("timestamp", date), Expression.Operation.NOT_IN, "[1969-12, 1970-01]"); assertProjectionStrictValue(spec, equal("timestamp", date), Expression.Operation.FALSE); Long anotherDate = (long) Literal.of("1970-02-01T00:00:00.00000").to(TYPE).value(); - assertProjectionStrict(spec, notIn("timestamp", anotherDate, date), - Expression.Operation.NOT_IN, "[1969-12, 1970-01, 1970-02]"); - assertProjectionStrictValue(spec, in("timestamp", anotherDate, date), Expression.Operation.FALSE); + assertProjectionStrict( + spec, + notIn("timestamp", anotherDate, date), + Expression.Operation.NOT_IN, + "[1969-12, 1970-01, 1970-02]"); + assertProjectionStrictValue( + spec, in("timestamp", anotherDate, date), Expression.Operation.FALSE); } @Test @@ -219,17 +284,22 @@ public void testMonthInclusiveLowerBound() { Long date = (long) Literal.of("2017-12-01T00:00:00.00000").to(TYPE).value(); PartitionSpec spec = PartitionSpec.builderFor(SCHEMA).month("timestamp").build(); - assertProjectionInclusive(spec, lessThan("timestamp", date), Expression.Operation.LT_EQ, "2017-11"); - assertProjectionInclusive(spec, lessThanOrEqual("timestamp", date), Expression.Operation.LT_EQ, "2017-12"); - assertProjectionInclusive(spec, greaterThan("timestamp", date), Expression.Operation.GT_EQ, "2017-12"); - assertProjectionInclusive(spec, greaterThanOrEqual("timestamp", date), Expression.Operation.GT_EQ, "2017-12"); + assertProjectionInclusive( + spec, lessThan("timestamp", date), Expression.Operation.LT_EQ, "2017-11"); + assertProjectionInclusive( + spec, lessThanOrEqual("timestamp", date), Expression.Operation.LT_EQ, "2017-12"); + assertProjectionInclusive( + spec, greaterThan("timestamp", date), Expression.Operation.GT_EQ, "2017-12"); + assertProjectionInclusive( + spec, greaterThanOrEqual("timestamp", date), Expression.Operation.GT_EQ, "2017-12"); assertProjectionInclusive(spec, equal("timestamp", date), Expression.Operation.EQ, "2017-12"); assertProjectionInclusiveValue(spec, notEqual("timestamp", date), Expression.Operation.TRUE); Long anotherDate = (long) Literal.of("2017-12-02T00:00:00.00000").to(TYPE).value(); - assertProjectionInclusive(spec, in("timestamp", date, anotherDate), - Expression.Operation.IN, "[2017-12, 2017-12]"); - 
assertProjectionInclusiveValue(spec, notIn("timestamp", date, anotherDate), Expression.Operation.TRUE); + assertProjectionInclusive( + spec, in("timestamp", date, anotherDate), Expression.Operation.IN, "[2017-12, 2017-12]"); + assertProjectionInclusiveValue( + spec, notIn("timestamp", date, anotherDate), Expression.Operation.TRUE); } @Test @@ -237,17 +307,26 @@ public void testNegativeMonthInclusiveLowerBound() { Long date = (long) Literal.of("1969-01-01T00:00:00.00000").to(TYPE).value(); PartitionSpec spec = PartitionSpec.builderFor(SCHEMA).month("timestamp").build(); - assertProjectionInclusive(spec, lessThan("timestamp", date), Expression.Operation.LT_EQ, "1969-01"); - assertProjectionInclusive(spec, lessThanOrEqual("timestamp", date), Expression.Operation.LT_EQ, "1969-02"); - assertProjectionInclusive(spec, greaterThan("timestamp", date), Expression.Operation.GT_EQ, "1969-01"); - assertProjectionInclusive(spec, greaterThanOrEqual("timestamp", date), Expression.Operation.GT_EQ, "1969-01"); - assertProjectionInclusive(spec, equal("timestamp", date), Expression.Operation.IN, "[1969-01, 1969-02]"); + assertProjectionInclusive( + spec, lessThan("timestamp", date), Expression.Operation.LT_EQ, "1969-01"); + assertProjectionInclusive( + spec, lessThanOrEqual("timestamp", date), Expression.Operation.LT_EQ, "1969-02"); + assertProjectionInclusive( + spec, greaterThan("timestamp", date), Expression.Operation.GT_EQ, "1969-01"); + assertProjectionInclusive( + spec, greaterThanOrEqual("timestamp", date), Expression.Operation.GT_EQ, "1969-01"); + assertProjectionInclusive( + spec, equal("timestamp", date), Expression.Operation.IN, "[1969-01, 1969-02]"); assertProjectionInclusiveValue(spec, notEqual("timestamp", date), Expression.Operation.TRUE); Long anotherDate = (long) Literal.of("1969-03-01T00:00:00.00000").to(TYPE).value(); - assertProjectionInclusive(spec, in("timestamp", date, anotherDate), - Expression.Operation.IN, "[1969-01, 1969-02, 1969-03, 1969-04]"); - assertProjectionInclusiveValue(spec, notIn("timestamp", date, anotherDate), Expression.Operation.TRUE); + assertProjectionInclusive( + spec, + in("timestamp", date, anotherDate), + Expression.Operation.IN, + "[1969-01, 1969-02, 1969-03, 1969-04]"); + assertProjectionInclusiveValue( + spec, notIn("timestamp", date, anotherDate), Expression.Operation.TRUE); } @Test @@ -255,17 +334,22 @@ public void testMonthInclusiveUpperBound() { Long date = (long) Literal.of("2017-12-01T23:59:59.999999").to(TYPE).value(); PartitionSpec spec = PartitionSpec.builderFor(SCHEMA).month("timestamp").build(); - assertProjectionInclusive(spec, lessThan("timestamp", date), Expression.Operation.LT_EQ, "2017-12"); - assertProjectionInclusive(spec, lessThanOrEqual("timestamp", date), Expression.Operation.LT_EQ, "2017-12"); - assertProjectionInclusive(spec, greaterThan("timestamp", date), Expression.Operation.GT_EQ, "2017-12"); - assertProjectionInclusive(spec, greaterThanOrEqual("timestamp", date), Expression.Operation.GT_EQ, "2017-12"); + assertProjectionInclusive( + spec, lessThan("timestamp", date), Expression.Operation.LT_EQ, "2017-12"); + assertProjectionInclusive( + spec, lessThanOrEqual("timestamp", date), Expression.Operation.LT_EQ, "2017-12"); + assertProjectionInclusive( + spec, greaterThan("timestamp", date), Expression.Operation.GT_EQ, "2017-12"); + assertProjectionInclusive( + spec, greaterThanOrEqual("timestamp", date), Expression.Operation.GT_EQ, "2017-12"); assertProjectionInclusive(spec, equal("timestamp", date), Expression.Operation.EQ, "2017-12"); 
assertProjectionInclusiveValue(spec, notEqual("timestamp", date), Expression.Operation.TRUE); Long anotherDate = (long) Literal.of("2017-11-02T00:00:00.00000").to(TYPE).value(); - assertProjectionInclusive(spec, in("timestamp", date, anotherDate), - Expression.Operation.IN, "[2017-11, 2017-12]"); - assertProjectionInclusiveValue(spec, notIn("timestamp", date, anotherDate), Expression.Operation.TRUE); + assertProjectionInclusive( + spec, in("timestamp", date, anotherDate), Expression.Operation.IN, "[2017-11, 2017-12]"); + assertProjectionInclusiveValue( + spec, notIn("timestamp", date, anotherDate), Expression.Operation.TRUE); } @Test @@ -273,17 +357,23 @@ public void testNegativeMonthInclusiveUpperBound() { Long date = (long) Literal.of("1969-12-31T23:59:59.999999").to(TYPE).value(); PartitionSpec spec = PartitionSpec.builderFor(SCHEMA).month("timestamp").build(); - assertProjectionInclusive(spec, lessThan("timestamp", date), Expression.Operation.LT_EQ, "1970-01"); - assertProjectionInclusive(spec, lessThanOrEqual("timestamp", date), Expression.Operation.LT_EQ, "1970-01"); - assertProjectionInclusive(spec, greaterThan("timestamp", date), Expression.Operation.GT_EQ, "1970-01"); - assertProjectionInclusive(spec, greaterThanOrEqual("timestamp", date), Expression.Operation.GT_EQ, "1969-12"); - assertProjectionInclusive(spec, equal("timestamp", date), Expression.Operation.IN, "[1969-12, 1970-01]"); + assertProjectionInclusive( + spec, lessThan("timestamp", date), Expression.Operation.LT_EQ, "1970-01"); + assertProjectionInclusive( + spec, lessThanOrEqual("timestamp", date), Expression.Operation.LT_EQ, "1970-01"); + assertProjectionInclusive( + spec, greaterThan("timestamp", date), Expression.Operation.GT_EQ, "1970-01"); + assertProjectionInclusive( + spec, greaterThanOrEqual("timestamp", date), Expression.Operation.GT_EQ, "1969-12"); + assertProjectionInclusive( + spec, equal("timestamp", date), Expression.Operation.IN, "[1969-12, 1970-01]"); assertProjectionInclusiveValue(spec, notEqual("timestamp", date), Expression.Operation.TRUE); Long anotherDate = (long) Literal.of("1970-01-01T00:00:00.00000").to(TYPE).value(); - assertProjectionInclusive(spec, in("timestamp", date, anotherDate), - Expression.Operation.IN, "[1969-12, 1970-01]"); - assertProjectionInclusiveValue(spec, notIn("timestamp", date, anotherDate), Expression.Operation.TRUE); + assertProjectionInclusive( + spec, in("timestamp", date, anotherDate), Expression.Operation.IN, "[1969-12, 1970-01]"); + assertProjectionInclusiveValue( + spec, notIn("timestamp", date, anotherDate), Expression.Operation.TRUE); } @Test @@ -291,17 +381,26 @@ public void testDayStrictLowerBound() { Long date = (long) Literal.of("2017-12-01T00:00:00.00000").to(TYPE).value(); PartitionSpec spec = PartitionSpec.builderFor(SCHEMA).day("timestamp").build(); - assertProjectionStrict(spec, lessThan("timestamp", date), Expression.Operation.LT, "2017-12-01"); - assertProjectionStrict(spec, lessThanOrEqual("timestamp", date), Expression.Operation.LT, "2017-12-01"); - assertProjectionStrict(spec, greaterThan("timestamp", date), Expression.Operation.GT, "2017-12-01"); - assertProjectionStrict(spec, greaterThanOrEqual("timestamp", date), Expression.Operation.GT, "2017-11-30"); - assertProjectionStrict(spec, notEqual("timestamp", date), Expression.Operation.NOT_EQ, "2017-12-01"); + assertProjectionStrict( + spec, lessThan("timestamp", date), Expression.Operation.LT, "2017-12-01"); + assertProjectionStrict( + spec, lessThanOrEqual("timestamp", date), Expression.Operation.LT, 
"2017-12-01"); + assertProjectionStrict( + spec, greaterThan("timestamp", date), Expression.Operation.GT, "2017-12-01"); + assertProjectionStrict( + spec, greaterThanOrEqual("timestamp", date), Expression.Operation.GT, "2017-11-30"); + assertProjectionStrict( + spec, notEqual("timestamp", date), Expression.Operation.NOT_EQ, "2017-12-01"); assertProjectionStrictValue(spec, equal("timestamp", date), Expression.Operation.FALSE); Long anotherDate = (long) Literal.of("2017-12-02T00:00:00.00000").to(TYPE).value(); - assertProjectionStrict(spec, notIn("timestamp", date, anotherDate), - Expression.Operation.NOT_IN, "[2017-12-01, 2017-12-02]"); - assertProjectionStrictValue(spec, in("timestamp", date, anotherDate), Expression.Operation.FALSE); + assertProjectionStrict( + spec, + notIn("timestamp", date, anotherDate), + Expression.Operation.NOT_IN, + "[2017-12-01, 2017-12-02]"); + assertProjectionStrictValue( + spec, in("timestamp", date, anotherDate), Expression.Operation.FALSE); } @Test @@ -309,17 +408,26 @@ public void testNegativeDayStrictLowerBound() { Long date = (long) Literal.of("1969-01-01T00:00:00.00000").to(TYPE).value(); PartitionSpec spec = PartitionSpec.builderFor(SCHEMA).day("timestamp").build(); - assertProjectionStrict(spec, lessThan("timestamp", date), Expression.Operation.LT, "1969-01-01"); - assertProjectionStrict(spec, lessThanOrEqual("timestamp", date), Expression.Operation.LT, "1969-01-01"); - assertProjectionStrict(spec, greaterThan("timestamp", date), Expression.Operation.GT, "1969-01-02"); - assertProjectionStrict(spec, greaterThanOrEqual("timestamp", date), Expression.Operation.GT, "1969-01-01"); - assertProjectionStrict(spec, notEqual("timestamp", date), Expression.Operation.NOT_IN, "[1969-01-01, 1969-01-02]"); + assertProjectionStrict( + spec, lessThan("timestamp", date), Expression.Operation.LT, "1969-01-01"); + assertProjectionStrict( + spec, lessThanOrEqual("timestamp", date), Expression.Operation.LT, "1969-01-01"); + assertProjectionStrict( + spec, greaterThan("timestamp", date), Expression.Operation.GT, "1969-01-02"); + assertProjectionStrict( + spec, greaterThanOrEqual("timestamp", date), Expression.Operation.GT, "1969-01-01"); + assertProjectionStrict( + spec, notEqual("timestamp", date), Expression.Operation.NOT_IN, "[1969-01-01, 1969-01-02]"); assertProjectionStrictValue(spec, equal("timestamp", date), Expression.Operation.FALSE); Long anotherDate = (long) Literal.of("1969-01-02T00:00:00.00000").to(TYPE).value(); - assertProjectionStrict(spec, notIn("timestamp", date, anotherDate), - Expression.Operation.NOT_IN, "[1969-01-01, 1969-01-02, 1969-01-03]"); - assertProjectionStrictValue(spec, in("timestamp", date, anotherDate), Expression.Operation.FALSE); + assertProjectionStrict( + spec, + notIn("timestamp", date, anotherDate), + Expression.Operation.NOT_IN, + "[1969-01-01, 1969-01-02, 1969-01-03]"); + assertProjectionStrictValue( + spec, in("timestamp", date, anotherDate), Expression.Operation.FALSE); } @Test @@ -327,17 +435,26 @@ public void testDayStrictUpperBound() { Long date = (long) Literal.of("2017-12-01T23:59:59.999999").to(TYPE).value(); PartitionSpec spec = PartitionSpec.builderFor(SCHEMA).day("timestamp").build(); - assertProjectionStrict(spec, lessThan("timestamp", date), Expression.Operation.LT, "2017-12-01"); - assertProjectionStrict(spec, lessThanOrEqual("timestamp", date), Expression.Operation.LT, "2017-12-02"); - assertProjectionStrict(spec, greaterThan("timestamp", date), Expression.Operation.GT, "2017-12-01"); - assertProjectionStrict(spec, 
greaterThanOrEqual("timestamp", date), Expression.Operation.GT, "2017-12-01"); - assertProjectionStrict(spec, notEqual("timestamp", date), Expression.Operation.NOT_EQ, "2017-12-01"); + assertProjectionStrict( + spec, lessThan("timestamp", date), Expression.Operation.LT, "2017-12-01"); + assertProjectionStrict( + spec, lessThanOrEqual("timestamp", date), Expression.Operation.LT, "2017-12-02"); + assertProjectionStrict( + spec, greaterThan("timestamp", date), Expression.Operation.GT, "2017-12-01"); + assertProjectionStrict( + spec, greaterThanOrEqual("timestamp", date), Expression.Operation.GT, "2017-12-01"); + assertProjectionStrict( + spec, notEqual("timestamp", date), Expression.Operation.NOT_EQ, "2017-12-01"); assertProjectionStrictValue(spec, equal("timestamp", date), Expression.Operation.FALSE); Long anotherDate = (long) Literal.of("2017-11-02T00:00:00.00000").to(TYPE).value(); - assertProjectionStrict(spec, notIn("timestamp", date, anotherDate), - Expression.Operation.NOT_IN, "[2017-11-02, 2017-12-01]"); - assertProjectionStrictValue(spec, in("timestamp", date, anotherDate), Expression.Operation.FALSE); + assertProjectionStrict( + spec, + notIn("timestamp", date, anotherDate), + Expression.Operation.NOT_IN, + "[2017-11-02, 2017-12-01]"); + assertProjectionStrictValue( + spec, in("timestamp", date, anotherDate), Expression.Operation.FALSE); } @Test @@ -345,17 +462,26 @@ public void testNegativeDayStrictUpperBound() { Long date = (long) Literal.of("1969-12-31T23:59:59.999999").to(TYPE).value(); PartitionSpec spec = PartitionSpec.builderFor(SCHEMA).day("timestamp").build(); - assertProjectionStrict(spec, lessThan("timestamp", date), Expression.Operation.LT, "1969-12-31"); - assertProjectionStrict(spec, lessThanOrEqual("timestamp", date), Expression.Operation.LT, "1970-01-01"); - assertProjectionStrict(spec, greaterThan("timestamp", date), Expression.Operation.GT, "1970-01-01"); - assertProjectionStrict(spec, greaterThanOrEqual("timestamp", date), Expression.Operation.GT, "1970-01-01"); - assertProjectionStrict(spec, notEqual("timestamp", date), Expression.Operation.NOT_IN, "[1969-12-31, 1970-01-01]"); + assertProjectionStrict( + spec, lessThan("timestamp", date), Expression.Operation.LT, "1969-12-31"); + assertProjectionStrict( + spec, lessThanOrEqual("timestamp", date), Expression.Operation.LT, "1970-01-01"); + assertProjectionStrict( + spec, greaterThan("timestamp", date), Expression.Operation.GT, "1970-01-01"); + assertProjectionStrict( + spec, greaterThanOrEqual("timestamp", date), Expression.Operation.GT, "1970-01-01"); + assertProjectionStrict( + spec, notEqual("timestamp", date), Expression.Operation.NOT_IN, "[1969-12-31, 1970-01-01]"); assertProjectionStrictValue(spec, equal("timestamp", date), Expression.Operation.FALSE); Long anotherDate = (long) Literal.of("1970-01-01T00:00:00.00000").to(TYPE).value(); - assertProjectionStrict(spec, notIn("timestamp", date, anotherDate), - Expression.Operation.NOT_IN, "[1969-12-31, 1970-01-01]"); - assertProjectionStrictValue(spec, in("timestamp", date, anotherDate), Expression.Operation.FALSE); + assertProjectionStrict( + spec, + notIn("timestamp", date, anotherDate), + Expression.Operation.NOT_IN, + "[1969-12-31, 1970-01-01]"); + assertProjectionStrictValue( + spec, in("timestamp", date, anotherDate), Expression.Operation.FALSE); } @Test @@ -363,17 +489,26 @@ public void testDayInclusiveLowerBound() { Long date = (long) Literal.of("2017-12-01T00:00:00.00000").to(TYPE).value(); PartitionSpec spec = 
PartitionSpec.builderFor(SCHEMA).day("timestamp").build(); - assertProjectionInclusive(spec, lessThan("timestamp", date), Expression.Operation.LT_EQ, "2017-11-30"); - assertProjectionInclusive(spec, lessThanOrEqual("timestamp", date), Expression.Operation.LT_EQ, "2017-12-01"); - assertProjectionInclusive(spec, greaterThan("timestamp", date), Expression.Operation.GT_EQ, "2017-12-01"); - assertProjectionInclusive(spec, greaterThanOrEqual("timestamp", date), Expression.Operation.GT_EQ, "2017-12-01"); - assertProjectionInclusive(spec, equal("timestamp", date), Expression.Operation.EQ, "2017-12-01"); + assertProjectionInclusive( + spec, lessThan("timestamp", date), Expression.Operation.LT_EQ, "2017-11-30"); + assertProjectionInclusive( + spec, lessThanOrEqual("timestamp", date), Expression.Operation.LT_EQ, "2017-12-01"); + assertProjectionInclusive( + spec, greaterThan("timestamp", date), Expression.Operation.GT_EQ, "2017-12-01"); + assertProjectionInclusive( + spec, greaterThanOrEqual("timestamp", date), Expression.Operation.GT_EQ, "2017-12-01"); + assertProjectionInclusive( + spec, equal("timestamp", date), Expression.Operation.EQ, "2017-12-01"); assertProjectionInclusiveValue(spec, notEqual("timestamp", date), Expression.Operation.TRUE); Long anotherDate = (long) Literal.of("2017-12-02T00:00:00.00000").to(TYPE).value(); - assertProjectionInclusive(spec, in("timestamp", date, anotherDate), - Expression.Operation.IN, "[2017-12-01, 2017-12-02]"); - assertProjectionInclusiveValue(spec, notIn("timestamp", date, anotherDate), Expression.Operation.TRUE); + assertProjectionInclusive( + spec, + in("timestamp", date, anotherDate), + Expression.Operation.IN, + "[2017-12-01, 2017-12-02]"); + assertProjectionInclusiveValue( + spec, notIn("timestamp", date, anotherDate), Expression.Operation.TRUE); } @Test @@ -381,17 +516,26 @@ public void testNegativeDayInclusiveLowerBound() { Long date = (long) Literal.of("1969-01-01T00:00:00.00000").to(TYPE).value(); PartitionSpec spec = PartitionSpec.builderFor(SCHEMA).day("timestamp").build(); - assertProjectionInclusive(spec, lessThan("timestamp", date), Expression.Operation.LT_EQ, "1969-01-01"); - assertProjectionInclusive(spec, lessThanOrEqual("timestamp", date), Expression.Operation.LT_EQ, "1969-01-02"); - assertProjectionInclusive(spec, greaterThan("timestamp", date), Expression.Operation.GT_EQ, "1969-01-01"); - assertProjectionInclusive(spec, greaterThanOrEqual("timestamp", date), Expression.Operation.GT_EQ, "1969-01-01"); - assertProjectionInclusive(spec, equal("timestamp", date), Expression.Operation.IN, "[1969-01-01, 1969-01-02]"); + assertProjectionInclusive( + spec, lessThan("timestamp", date), Expression.Operation.LT_EQ, "1969-01-01"); + assertProjectionInclusive( + spec, lessThanOrEqual("timestamp", date), Expression.Operation.LT_EQ, "1969-01-02"); + assertProjectionInclusive( + spec, greaterThan("timestamp", date), Expression.Operation.GT_EQ, "1969-01-01"); + assertProjectionInclusive( + spec, greaterThanOrEqual("timestamp", date), Expression.Operation.GT_EQ, "1969-01-01"); + assertProjectionInclusive( + spec, equal("timestamp", date), Expression.Operation.IN, "[1969-01-01, 1969-01-02]"); assertProjectionInclusiveValue(spec, notEqual("timestamp", date), Expression.Operation.TRUE); Long anotherDate = (long) Literal.of("1969-01-02T00:00:00.00000").to(TYPE).value(); - assertProjectionInclusive(spec, in("timestamp", date, anotherDate), - Expression.Operation.IN, "[1969-01-01, 1969-01-02, 1969-01-03]"); - assertProjectionInclusiveValue(spec, 
notIn("timestamp", date, anotherDate), Expression.Operation.TRUE); + assertProjectionInclusive( + spec, + in("timestamp", date, anotherDate), + Expression.Operation.IN, + "[1969-01-01, 1969-01-02, 1969-01-03]"); + assertProjectionInclusiveValue( + spec, notIn("timestamp", date, anotherDate), Expression.Operation.TRUE); } @Test @@ -399,17 +543,26 @@ public void testDayInclusiveUpperBound() { Long date = (long) Literal.of("2017-12-01T23:59:59.999999").to(TYPE).value(); PartitionSpec spec = PartitionSpec.builderFor(SCHEMA).day("timestamp").build(); - assertProjectionInclusive(spec, lessThan("timestamp", date), Expression.Operation.LT_EQ, "2017-12-01"); - assertProjectionInclusive(spec, lessThanOrEqual("timestamp", date), Expression.Operation.LT_EQ, "2017-12-01"); - assertProjectionInclusive(spec, greaterThan("timestamp", date), Expression.Operation.GT_EQ, "2017-12-02"); - assertProjectionInclusive(spec, greaterThanOrEqual("timestamp", date), Expression.Operation.GT_EQ, "2017-12-01"); - assertProjectionInclusive(spec, equal("timestamp", date), Expression.Operation.EQ, "2017-12-01"); + assertProjectionInclusive( + spec, lessThan("timestamp", date), Expression.Operation.LT_EQ, "2017-12-01"); + assertProjectionInclusive( + spec, lessThanOrEqual("timestamp", date), Expression.Operation.LT_EQ, "2017-12-01"); + assertProjectionInclusive( + spec, greaterThan("timestamp", date), Expression.Operation.GT_EQ, "2017-12-02"); + assertProjectionInclusive( + spec, greaterThanOrEqual("timestamp", date), Expression.Operation.GT_EQ, "2017-12-01"); + assertProjectionInclusive( + spec, equal("timestamp", date), Expression.Operation.EQ, "2017-12-01"); assertProjectionInclusiveValue(spec, notEqual("timestamp", date), Expression.Operation.TRUE); Long anotherDate = (long) Literal.of("2017-12-02T00:00:00.00000").to(TYPE).value(); - assertProjectionInclusive(spec, in("timestamp", date, anotherDate), - Expression.Operation.IN, "[2017-12-01, 2017-12-02]"); - assertProjectionInclusiveValue(spec, notIn("timestamp", date, anotherDate), Expression.Operation.TRUE); + assertProjectionInclusive( + spec, + in("timestamp", date, anotherDate), + Expression.Operation.IN, + "[2017-12-01, 2017-12-02]"); + assertProjectionInclusiveValue( + spec, notIn("timestamp", date, anotherDate), Expression.Operation.TRUE); } @Test @@ -417,17 +570,26 @@ public void testNegativeDayInclusiveUpperBound() { Long date = (long) Literal.of("1969-12-31T23:59:59.999999").to(TYPE).value(); PartitionSpec spec = PartitionSpec.builderFor(SCHEMA).day("timestamp").build(); - assertProjectionInclusive(spec, lessThan("timestamp", date), Expression.Operation.LT_EQ, "1970-01-01"); - assertProjectionInclusive(spec, lessThanOrEqual("timestamp", date), Expression.Operation.LT_EQ, "1970-01-01"); - assertProjectionInclusive(spec, greaterThan("timestamp", date), Expression.Operation.GT_EQ, "1970-01-01"); - assertProjectionInclusive(spec, greaterThanOrEqual("timestamp", date), Expression.Operation.GT_EQ, "1969-12-31"); - assertProjectionInclusive(spec, equal("timestamp", date), Expression.Operation.IN, "[1969-12-31, 1970-01-01]"); + assertProjectionInclusive( + spec, lessThan("timestamp", date), Expression.Operation.LT_EQ, "1970-01-01"); + assertProjectionInclusive( + spec, lessThanOrEqual("timestamp", date), Expression.Operation.LT_EQ, "1970-01-01"); + assertProjectionInclusive( + spec, greaterThan("timestamp", date), Expression.Operation.GT_EQ, "1970-01-01"); + assertProjectionInclusive( + spec, greaterThanOrEqual("timestamp", date), Expression.Operation.GT_EQ, 
"1969-12-31"); + assertProjectionInclusive( + spec, equal("timestamp", date), Expression.Operation.IN, "[1969-12-31, 1970-01-01]"); assertProjectionInclusiveValue(spec, notEqual("timestamp", date), Expression.Operation.TRUE); Long anotherDate = (long) Literal.of("1970-01-01T00:00:00.00000").to(TYPE).value(); - assertProjectionInclusive(spec, in("timestamp", date, anotherDate), - Expression.Operation.IN, "[1969-12-31, 1970-01-01]"); - assertProjectionInclusiveValue(spec, notIn("timestamp", date, anotherDate), Expression.Operation.TRUE); + assertProjectionInclusive( + spec, + in("timestamp", date, anotherDate), + Expression.Operation.IN, + "[1969-12-31, 1970-01-01]"); + assertProjectionInclusiveValue( + spec, notIn("timestamp", date, anotherDate), Expression.Operation.TRUE); } @Test @@ -436,16 +598,19 @@ public void testYearStrictLowerBound() { PartitionSpec spec = PartitionSpec.builderFor(SCHEMA).year("timestamp").build(); assertProjectionStrict(spec, lessThan("timestamp", date), Expression.Operation.LT, "2017"); - assertProjectionStrict(spec, lessThanOrEqual("timestamp", date), Expression.Operation.LT, "2017"); + assertProjectionStrict( + spec, lessThanOrEqual("timestamp", date), Expression.Operation.LT, "2017"); assertProjectionStrict(spec, greaterThan("timestamp", date), Expression.Operation.GT, "2017"); - assertProjectionStrict(spec, greaterThanOrEqual("timestamp", date), Expression.Operation.GT, "2016"); + assertProjectionStrict( + spec, greaterThanOrEqual("timestamp", date), Expression.Operation.GT, "2016"); assertProjectionStrict(spec, notEqual("timestamp", date), Expression.Operation.NOT_EQ, "2017"); assertProjectionStrictValue(spec, equal("timestamp", date), Expression.Operation.FALSE); Long anotherDate = (long) Literal.of("2016-12-02T00:00:00.00000").to(TYPE).value(); - assertProjectionStrict(spec, notIn("timestamp", date, anotherDate), - Expression.Operation.NOT_IN, "[2016, 2017]"); - assertProjectionStrictValue(spec, in("timestamp", date, anotherDate), Expression.Operation.FALSE); + assertProjectionStrict( + spec, notIn("timestamp", date, anotherDate), Expression.Operation.NOT_IN, "[2016, 2017]"); + assertProjectionStrictValue( + spec, in("timestamp", date, anotherDate), Expression.Operation.FALSE); } @Test @@ -454,16 +619,19 @@ public void testYearStrictUpperBound() { PartitionSpec spec = PartitionSpec.builderFor(SCHEMA).year("timestamp").build(); assertProjectionStrict(spec, lessThan("timestamp", date), Expression.Operation.LT, "2017"); - assertProjectionStrict(spec, lessThanOrEqual("timestamp", date), Expression.Operation.LT, "2018"); + assertProjectionStrict( + spec, lessThanOrEqual("timestamp", date), Expression.Operation.LT, "2018"); assertProjectionStrict(spec, greaterThan("timestamp", date), Expression.Operation.GT, "2017"); - assertProjectionStrict(spec, greaterThanOrEqual("timestamp", date), Expression.Operation.GT, "2017"); + assertProjectionStrict( + spec, greaterThanOrEqual("timestamp", date), Expression.Operation.GT, "2017"); assertProjectionStrict(spec, notEqual("timestamp", date), Expression.Operation.NOT_EQ, "2017"); assertProjectionStrictValue(spec, equal("timestamp", date), Expression.Operation.FALSE); Long anotherDate = (long) Literal.of("2016-12-31T23:59:59.999999").to(TYPE).value(); - assertProjectionStrict(spec, notIn("timestamp", date, anotherDate), - Expression.Operation.NOT_IN, "[2016, 2017]"); - assertProjectionStrictValue(spec, in("timestamp", date, anotherDate), Expression.Operation.FALSE); + assertProjectionStrict( + spec, notIn("timestamp", date, 
anotherDate), Expression.Operation.NOT_IN, "[2016, 2017]"); + assertProjectionStrictValue( + spec, in("timestamp", date, anotherDate), Expression.Operation.FALSE); } @Test @@ -471,17 +639,22 @@ public void testYearInclusiveLowerBound() { Long date = (long) Literal.of("2017-01-01T00:00:00.00000").to(TYPE).value(); PartitionSpec spec = PartitionSpec.builderFor(SCHEMA).year("timestamp").build(); - assertProjectionInclusive(spec, lessThan("timestamp", date), Expression.Operation.LT_EQ, "2016"); - assertProjectionInclusive(spec, lessThanOrEqual("timestamp", date), Expression.Operation.LT_EQ, "2017"); - assertProjectionInclusive(spec, greaterThan("timestamp", date), Expression.Operation.GT_EQ, "2017"); - assertProjectionInclusive(spec, greaterThanOrEqual("timestamp", date), Expression.Operation.GT_EQ, "2017"); + assertProjectionInclusive( + spec, lessThan("timestamp", date), Expression.Operation.LT_EQ, "2016"); + assertProjectionInclusive( + spec, lessThanOrEqual("timestamp", date), Expression.Operation.LT_EQ, "2017"); + assertProjectionInclusive( + spec, greaterThan("timestamp", date), Expression.Operation.GT_EQ, "2017"); + assertProjectionInclusive( + spec, greaterThanOrEqual("timestamp", date), Expression.Operation.GT_EQ, "2017"); assertProjectionInclusive(spec, equal("timestamp", date), Expression.Operation.EQ, "2017"); assertProjectionInclusiveValue(spec, notEqual("timestamp", date), Expression.Operation.TRUE); Long anotherDate = (long) Literal.of("2016-12-02T00:00:00.00000").to(TYPE).value(); - assertProjectionInclusive(spec, in("timestamp", date, anotherDate), - Expression.Operation.IN, "[2016, 2017]"); - assertProjectionInclusiveValue(spec, notIn("timestamp", date, anotherDate), Expression.Operation.TRUE); + assertProjectionInclusive( + spec, in("timestamp", date, anotherDate), Expression.Operation.IN, "[2016, 2017]"); + assertProjectionInclusiveValue( + spec, notIn("timestamp", date, anotherDate), Expression.Operation.TRUE); } @Test @@ -489,17 +662,22 @@ public void testYearInclusiveUpperBound() { Long date = (long) Literal.of("2017-12-31T23:59:59.999999").to(TYPE).value(); PartitionSpec spec = PartitionSpec.builderFor(SCHEMA).year("timestamp").build(); - assertProjectionInclusive(spec, lessThan("timestamp", date), Expression.Operation.LT_EQ, "2017"); - assertProjectionInclusive(spec, lessThanOrEqual("timestamp", date), Expression.Operation.LT_EQ, "2017"); - assertProjectionInclusive(spec, greaterThan("timestamp", date), Expression.Operation.GT_EQ, "2018"); - assertProjectionInclusive(spec, greaterThanOrEqual("timestamp", date), Expression.Operation.GT_EQ, "2017"); + assertProjectionInclusive( + spec, lessThan("timestamp", date), Expression.Operation.LT_EQ, "2017"); + assertProjectionInclusive( + spec, lessThanOrEqual("timestamp", date), Expression.Operation.LT_EQ, "2017"); + assertProjectionInclusive( + spec, greaterThan("timestamp", date), Expression.Operation.GT_EQ, "2018"); + assertProjectionInclusive( + spec, greaterThanOrEqual("timestamp", date), Expression.Operation.GT_EQ, "2017"); assertProjectionInclusive(spec, equal("timestamp", date), Expression.Operation.EQ, "2017"); assertProjectionInclusiveValue(spec, notEqual("timestamp", date), Expression.Operation.TRUE); Long anotherDate = (long) Literal.of("2016-12-31T23:59:59.999999").to(TYPE).value(); - assertProjectionInclusive(spec, in("timestamp", date, anotherDate), - Expression.Operation.IN, "[2016, 2017]"); - assertProjectionInclusiveValue(spec, notIn("timestamp", date, anotherDate), Expression.Operation.TRUE); + 
assertProjectionInclusive( + spec, in("timestamp", date, anotherDate), Expression.Operation.IN, "[2016, 2017]"); + assertProjectionInclusiveValue( + spec, notIn("timestamp", date, anotherDate), Expression.Operation.TRUE); } @Test @@ -507,17 +685,26 @@ public void testHourStrictLowerBound() { Long date = (long) Literal.of("2017-12-01T10:00:00.00000").to(TYPE).value(); PartitionSpec spec = PartitionSpec.builderFor(SCHEMA).hour("timestamp").build(); - assertProjectionStrict(spec, lessThan("timestamp", date), Expression.Operation.LT, "2017-12-01-10"); - assertProjectionStrict(spec, lessThanOrEqual("timestamp", date), Expression.Operation.LT, "2017-12-01-10"); - assertProjectionStrict(spec, greaterThan("timestamp", date), Expression.Operation.GT, "2017-12-01-10"); - assertProjectionStrict(spec, greaterThanOrEqual("timestamp", date), Expression.Operation.GT, "2017-12-01-09"); - assertProjectionStrict(spec, notEqual("timestamp", date), Expression.Operation.NOT_EQ, "2017-12-01-10"); + assertProjectionStrict( + spec, lessThan("timestamp", date), Expression.Operation.LT, "2017-12-01-10"); + assertProjectionStrict( + spec, lessThanOrEqual("timestamp", date), Expression.Operation.LT, "2017-12-01-10"); + assertProjectionStrict( + spec, greaterThan("timestamp", date), Expression.Operation.GT, "2017-12-01-10"); + assertProjectionStrict( + spec, greaterThanOrEqual("timestamp", date), Expression.Operation.GT, "2017-12-01-09"); + assertProjectionStrict( + spec, notEqual("timestamp", date), Expression.Operation.NOT_EQ, "2017-12-01-10"); assertProjectionStrictValue(spec, equal("timestamp", date), Expression.Operation.FALSE); Long anotherDate = (long) Literal.of("2016-12-02T00:00:00.00000").to(TYPE).value(); - assertProjectionStrict(spec, notIn("timestamp", date, anotherDate), - Expression.Operation.NOT_IN, "[2016-12-02-00, 2017-12-01-10]"); - assertProjectionStrictValue(spec, in("timestamp", date, anotherDate), Expression.Operation.FALSE); + assertProjectionStrict( + spec, + notIn("timestamp", date, anotherDate), + Expression.Operation.NOT_IN, + "[2016-12-02-00, 2017-12-01-10]"); + assertProjectionStrictValue( + spec, in("timestamp", date, anotherDate), Expression.Operation.FALSE); } @Test @@ -525,17 +712,26 @@ public void testHourStrictUpperBound() { Long date = (long) Literal.of("2017-12-01T10:59:59.999999").to(TYPE).value(); PartitionSpec spec = PartitionSpec.builderFor(SCHEMA).hour("timestamp").build(); - assertProjectionStrict(spec, lessThan("timestamp", date), Expression.Operation.LT, "2017-12-01-10"); - assertProjectionStrict(spec, lessThanOrEqual("timestamp", date), Expression.Operation.LT, "2017-12-01-11"); - assertProjectionStrict(spec, greaterThan("timestamp", date), Expression.Operation.GT, "2017-12-01-10"); - assertProjectionStrict(spec, greaterThanOrEqual("timestamp", date), Expression.Operation.GT, "2017-12-01-10"); - assertProjectionStrict(spec, notEqual("timestamp", date), Expression.Operation.NOT_EQ, "2017-12-01-10"); + assertProjectionStrict( + spec, lessThan("timestamp", date), Expression.Operation.LT, "2017-12-01-10"); + assertProjectionStrict( + spec, lessThanOrEqual("timestamp", date), Expression.Operation.LT, "2017-12-01-11"); + assertProjectionStrict( + spec, greaterThan("timestamp", date), Expression.Operation.GT, "2017-12-01-10"); + assertProjectionStrict( + spec, greaterThanOrEqual("timestamp", date), Expression.Operation.GT, "2017-12-01-10"); + assertProjectionStrict( + spec, notEqual("timestamp", date), Expression.Operation.NOT_EQ, "2017-12-01-10"); 
assertProjectionStrictValue(spec, equal("timestamp", date), Expression.Operation.FALSE); Long anotherDate = (long) Literal.of("2016-12-31T23:59:59.999999").to(TYPE).value(); - assertProjectionStrict(spec, notIn("timestamp", date, anotherDate), - Expression.Operation.NOT_IN, "[2016-12-31-23, 2017-12-01-10]"); - assertProjectionStrictValue(spec, in("timestamp", date, anotherDate), Expression.Operation.FALSE); + assertProjectionStrict( + spec, + notIn("timestamp", date, anotherDate), + Expression.Operation.NOT_IN, + "[2016-12-31-23, 2017-12-01-10]"); + assertProjectionStrictValue( + spec, in("timestamp", date, anotherDate), Expression.Operation.FALSE); } @Test @@ -543,17 +739,26 @@ public void testHourInclusiveLowerBound() { Long date = (long) Literal.of("2017-12-01T10:00:00.00000").to(TYPE).value(); PartitionSpec spec = PartitionSpec.builderFor(SCHEMA).hour("timestamp").build(); - assertProjectionInclusive(spec, lessThan("timestamp", date), Expression.Operation.LT_EQ, "2017-12-01-09"); - assertProjectionInclusive(spec, lessThanOrEqual("timestamp", date), Expression.Operation.LT_EQ, "2017-12-01-10"); - assertProjectionInclusive(spec, greaterThan("timestamp", date), Expression.Operation.GT_EQ, "2017-12-01-10"); - assertProjectionInclusive(spec, greaterThanOrEqual("timestamp", date), Expression.Operation.GT_EQ, "2017-12-01-10"); - assertProjectionInclusive(spec, equal("timestamp", date), Expression.Operation.EQ, "2017-12-01-10"); + assertProjectionInclusive( + spec, lessThan("timestamp", date), Expression.Operation.LT_EQ, "2017-12-01-09"); + assertProjectionInclusive( + spec, lessThanOrEqual("timestamp", date), Expression.Operation.LT_EQ, "2017-12-01-10"); + assertProjectionInclusive( + spec, greaterThan("timestamp", date), Expression.Operation.GT_EQ, "2017-12-01-10"); + assertProjectionInclusive( + spec, greaterThanOrEqual("timestamp", date), Expression.Operation.GT_EQ, "2017-12-01-10"); + assertProjectionInclusive( + spec, equal("timestamp", date), Expression.Operation.EQ, "2017-12-01-10"); assertProjectionInclusiveValue(spec, notEqual("timestamp", date), Expression.Operation.TRUE); Long anotherDate = (long) Literal.of("2016-12-02T00:00:00.00000").to(TYPE).value(); - assertProjectionInclusive(spec, in("timestamp", date, anotherDate), - Expression.Operation.IN, "[2016-12-02-00, 2017-12-01-10]"); - assertProjectionInclusiveValue(spec, notIn("timestamp", date, anotherDate), Expression.Operation.TRUE); + assertProjectionInclusive( + spec, + in("timestamp", date, anotherDate), + Expression.Operation.IN, + "[2016-12-02-00, 2017-12-01-10]"); + assertProjectionInclusiveValue( + spec, notIn("timestamp", date, anotherDate), Expression.Operation.TRUE); } @Test @@ -561,16 +766,25 @@ public void testHourInclusiveUpperBound() { Long date = (long) Literal.of("2017-12-01T10:59:59.999999").to(TYPE).value(); PartitionSpec spec = PartitionSpec.builderFor(SCHEMA).hour("timestamp").build(); - assertProjectionInclusive(spec, lessThan("timestamp", date), Expression.Operation.LT_EQ, "2017-12-01-10"); - assertProjectionInclusive(spec, lessThanOrEqual("timestamp", date), Expression.Operation.LT_EQ, "2017-12-01-10"); - assertProjectionInclusive(spec, greaterThan("timestamp", date), Expression.Operation.GT_EQ, "2017-12-01-11"); - assertProjectionInclusive(spec, greaterThanOrEqual("timestamp", date), Expression.Operation.GT_EQ, "2017-12-01-10"); - assertProjectionInclusive(spec, equal("timestamp", date), Expression.Operation.EQ, "2017-12-01-10"); + assertProjectionInclusive( + spec, lessThan("timestamp", date), 
Expression.Operation.LT_EQ, "2017-12-01-10"); + assertProjectionInclusive( + spec, lessThanOrEqual("timestamp", date), Expression.Operation.LT_EQ, "2017-12-01-10"); + assertProjectionInclusive( + spec, greaterThan("timestamp", date), Expression.Operation.GT_EQ, "2017-12-01-11"); + assertProjectionInclusive( + spec, greaterThanOrEqual("timestamp", date), Expression.Operation.GT_EQ, "2017-12-01-10"); + assertProjectionInclusive( + spec, equal("timestamp", date), Expression.Operation.EQ, "2017-12-01-10"); assertProjectionInclusiveValue(spec, notEqual("timestamp", date), Expression.Operation.TRUE); Long anotherDate = (long) Literal.of("2016-12-31T23:59:59.999999").to(TYPE).value(); - assertProjectionInclusive(spec, in("timestamp", date, anotherDate), - Expression.Operation.IN, "[2016-12-31-23, 2017-12-01-10]"); - assertProjectionInclusiveValue(spec, notIn("timestamp", date, anotherDate), Expression.Operation.TRUE); + assertProjectionInclusive( + spec, + in("timestamp", date, anotherDate), + Expression.Operation.IN, + "[2016-12-31-23, 2017-12-01-10]"); + assertProjectionInclusiveValue( + spec, notIn("timestamp", date, anotherDate), Expression.Operation.TRUE); } } diff --git a/api/src/test/java/org/apache/iceberg/transforms/TestTruncate.java b/api/src/test/java/org/apache/iceberg/transforms/TestTruncate.java index 03f5897349a8..6682086a6e1f 100644 --- a/api/src/test/java/org/apache/iceberg/transforms/TestTruncate.java +++ b/api/src/test/java/org/apache/iceberg/transforms/TestTruncate.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.transforms; import java.math.BigDecimal; @@ -71,28 +70,29 @@ public void testTruncateDecimal() { @Test public void testTruncateString() { Truncate trunc = Truncate.get(Types.StringType.get(), 5); - Assert.assertEquals("Should truncate strings longer than length", - "abcde", trunc.apply("abcdefg")); - Assert.assertEquals("Should not pad strings shorter than length", - "abc", trunc.apply("abc")); - Assert.assertEquals("Should not alter strings equal to length", - "abcde", trunc.apply("abcde")); + Assert.assertEquals( + "Should truncate strings longer than length", "abcde", trunc.apply("abcdefg")); + Assert.assertEquals("Should not pad strings shorter than length", "abc", trunc.apply("abc")); + Assert.assertEquals("Should not alter strings equal to length", "abcde", trunc.apply("abcde")); } @Test public void testTruncateByteBuffer() throws Exception { Truncate trunc = Truncate.get(Types.BinaryType.get(), 4); - Assert.assertEquals("Should truncate binary longer than length", + Assert.assertEquals( + "Should truncate binary longer than length", ByteBuffer.wrap("abcd".getBytes("UTF-8")), trunc.apply(ByteBuffer.wrap("abcdefg".getBytes("UTF-8")))); - Assert.assertEquals("Should not pad binary shorter than length", + Assert.assertEquals( + "Should not pad binary shorter than length", ByteBuffer.wrap("abc".getBytes("UTF-8")), trunc.apply(ByteBuffer.wrap("abc".getBytes("UTF-8")))); } @Test public void testVerifiedIllegalWidth() { - AssertHelpers.assertThrows("Should fail if width is less than or equal to zero", + AssertHelpers.assertThrows( + "Should fail if width is less than or equal to zero", IllegalArgumentException.class, "Invalid truncate width: 0 (must be > 0)", () -> Truncate.get(Types.IntegerType.get(), 0)); diff --git a/api/src/test/java/org/apache/iceberg/transforms/TestTruncatesProjection.java b/api/src/test/java/org/apache/iceberg/transforms/TestTruncatesProjection.java index 
298aee34d6f2..4cbabd4213d7 100644 --- a/api/src/test/java/org/apache/iceberg/transforms/TestTruncatesProjection.java +++ b/api/src/test/java/org/apache/iceberg/transforms/TestTruncatesProjection.java @@ -16,9 +16,19 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.transforms; +import static org.apache.iceberg.TestHelpers.assertAndUnwrapUnbound; +import static org.apache.iceberg.expressions.Expressions.equal; +import static org.apache.iceberg.expressions.Expressions.greaterThan; +import static org.apache.iceberg.expressions.Expressions.greaterThanOrEqual; +import static org.apache.iceberg.expressions.Expressions.in; +import static org.apache.iceberg.expressions.Expressions.lessThan; +import static org.apache.iceberg.expressions.Expressions.lessThanOrEqual; +import static org.apache.iceberg.expressions.Expressions.notEqual; +import static org.apache.iceberg.expressions.Expressions.notIn; +import static org.apache.iceberg.types.Types.NestedField.optional; + import java.math.BigDecimal; import java.nio.ByteBuffer; import java.util.stream.Collectors; @@ -34,34 +44,31 @@ import org.junit.Assert; import org.junit.Test; -import static org.apache.iceberg.TestHelpers.assertAndUnwrapUnbound; -import static org.apache.iceberg.expressions.Expressions.equal; -import static org.apache.iceberg.expressions.Expressions.greaterThan; -import static org.apache.iceberg.expressions.Expressions.greaterThanOrEqual; -import static org.apache.iceberg.expressions.Expressions.in; -import static org.apache.iceberg.expressions.Expressions.lessThan; -import static org.apache.iceberg.expressions.Expressions.lessThanOrEqual; -import static org.apache.iceberg.expressions.Expressions.notEqual; -import static org.apache.iceberg.expressions.Expressions.notIn; -import static org.apache.iceberg.types.Types.NestedField.optional; - public class TestTruncatesProjection { - public void assertProjectionStrict(PartitionSpec spec, UnboundPredicate filter, - Expression.Operation expectedOp, String expectedLiteral) { + public void assertProjectionStrict( + PartitionSpec spec, + UnboundPredicate filter, + Expression.Operation expectedOp, + String expectedLiteral) { Expression projection = Projections.strict(spec).project(filter); UnboundPredicate predicate = assertAndUnwrapUnbound(projection); Assert.assertEquals(expectedOp, predicate.op()); - Assert.assertNotEquals("Strict projection never runs for IN", Expression.Operation.IN, predicate.op()); + Assert.assertNotEquals( + "Strict projection never runs for IN", Expression.Operation.IN, predicate.op()); Truncate transform = (Truncate) spec.getFieldsBySourceId(1).get(0).transform(); if (predicate.op() == Expression.Operation.NOT_IN) { Iterable values = Iterables.transform(predicate.literals(), Literal::value); - String actual = Lists.newArrayList(values).stream().sorted() - .map(v -> transform.toHumanString(v)).collect(Collectors.toList()).toString(); + String actual = + Lists.newArrayList(values).stream() + .sorted() + .map(v -> transform.toHumanString(v)) + .collect(Collectors.toList()) + .toString(); Assert.assertEquals(expectedLiteral, actual); } else { Literal literal = predicate.literal(); @@ -70,34 +77,42 @@ public void assertProjectionStrict(PartitionSpec spec, UnboundPredicate filte } } - public void assertProjectionStrictValue(PartitionSpec spec, UnboundPredicate filter, - Expression.Operation expectedOp) { + public void assertProjectionStrictValue( + PartitionSpec spec, UnboundPredicate filter, Expression.Operation 
expectedOp) { Expression projection = Projections.strict(spec).project(filter); Assert.assertEquals(projection.op(), expectedOp); } - public void assertProjectionInclusiveValue(PartitionSpec spec, UnboundPredicate filter, - Expression.Operation expectedOp) { + public void assertProjectionInclusiveValue( + PartitionSpec spec, UnboundPredicate filter, Expression.Operation expectedOp) { Expression projection = Projections.inclusive(spec).project(filter); Assert.assertEquals(projection.op(), expectedOp); } - public void assertProjectionInclusive(PartitionSpec spec, UnboundPredicate filter, - Expression.Operation expectedOp, String expectedLiteral) { + public void assertProjectionInclusive( + PartitionSpec spec, + UnboundPredicate filter, + Expression.Operation expectedOp, + String expectedLiteral) { Expression projection = Projections.inclusive(spec).project(filter); UnboundPredicate predicate = assertAndUnwrapUnbound(projection); Assert.assertEquals(predicate.op(), expectedOp); - Assert.assertNotEquals("Inclusive projection never runs for NOT_IN", Expression.Operation.NOT_IN, predicate.op()); + Assert.assertNotEquals( + "Inclusive projection never runs for NOT_IN", Expression.Operation.NOT_IN, predicate.op()); Truncate transform = (Truncate) spec.getFieldsBySourceId(1).get(0).transform(); if (predicate.op() == Expression.Operation.IN) { Iterable values = Iterables.transform(predicate.literals(), Literal::value); - String actual = Lists.newArrayList(values).stream().sorted() - .map(v -> transform.toHumanString(v)).collect(Collectors.toList()).toString(); + String actual = + Lists.newArrayList(values).stream() + .sorted() + .map(v -> transform.toHumanString(v)) + .collect(Collectors.toList()) + .toString(); Assert.assertEquals(expectedLiteral, actual); } else { Literal literal = predicate.literal(); @@ -119,8 +134,11 @@ public void testIntegerStrictLowerBound() { assertProjectionStrict(spec, notEqual("value", value), Expression.Operation.NOT_EQ, "100"); assertProjectionStrictValue(spec, equal("value", value), Expression.Operation.FALSE); - assertProjectionStrict(spec, notIn("value", value - 1, value, value + 1), - Expression.Operation.NOT_IN, "[90, 100, 100]"); + assertProjectionStrict( + spec, + notIn("value", value - 1, value, value + 1), + Expression.Operation.NOT_IN, + "[90, 100, 100]"); assertProjectionStrictValue(spec, in("value", value, value + 1), Expression.Operation.FALSE); } @@ -137,8 +155,11 @@ public void testIntegerStrictUpperBound() { assertProjectionStrict(spec, notEqual("value", value), Expression.Operation.NOT_EQ, "90"); assertProjectionStrictValue(spec, equal("value", value), Expression.Operation.FALSE); - assertProjectionStrict(spec, notIn("value", value - 1, value, value + 1), - Expression.Operation.NOT_IN, "[90, 90, 100]"); + assertProjectionStrict( + spec, + notIn("value", value - 1, value, value + 1), + Expression.Operation.NOT_IN, + "[90, 90, 100]"); assertProjectionStrictValue(spec, in("value", value, value - 1), Expression.Operation.FALSE); } @@ -149,15 +170,18 @@ public void testIntegerInclusiveLowerBound() { PartitionSpec spec = PartitionSpec.builderFor(schema).truncate("value", 10).build(); assertProjectionInclusive(spec, lessThan("value", value), Expression.Operation.LT_EQ, "90"); - assertProjectionInclusive(spec, lessThanOrEqual("value", value), Expression.Operation.LT_EQ, "100"); + assertProjectionInclusive( + spec, lessThanOrEqual("value", value), Expression.Operation.LT_EQ, "100"); assertProjectionInclusive(spec, greaterThan("value", value), 
Expression.Operation.GT_EQ, "100"); - assertProjectionInclusive(spec, greaterThanOrEqual("value", value), Expression.Operation.GT_EQ, "100"); + assertProjectionInclusive( + spec, greaterThanOrEqual("value", value), Expression.Operation.GT_EQ, "100"); assertProjectionInclusive(spec, equal("value", value), Expression.Operation.EQ, "100"); assertProjectionInclusiveValue(spec, notEqual("value", value), Expression.Operation.TRUE); - assertProjectionInclusive(spec, in("value", value - 1, value, value + 1), - Expression.Operation.IN, "[90, 100, 100]"); - assertProjectionInclusiveValue(spec, notIn("value", value, value + 1), Expression.Operation.TRUE); + assertProjectionInclusive( + spec, in("value", value - 1, value, value + 1), Expression.Operation.IN, "[90, 100, 100]"); + assertProjectionInclusiveValue( + spec, notIn("value", value, value + 1), Expression.Operation.TRUE); } @Test @@ -167,15 +191,18 @@ public void testIntegerInclusiveUpperBound() { PartitionSpec spec = PartitionSpec.builderFor(schema).truncate("value", 10).build(); assertProjectionInclusive(spec, lessThan("value", value), Expression.Operation.LT_EQ, "90"); - assertProjectionInclusive(spec, lessThanOrEqual("value", value), Expression.Operation.LT_EQ, "90"); + assertProjectionInclusive( + spec, lessThanOrEqual("value", value), Expression.Operation.LT_EQ, "90"); assertProjectionInclusive(spec, greaterThan("value", value), Expression.Operation.GT_EQ, "100"); - assertProjectionInclusive(spec, greaterThanOrEqual("value", value), Expression.Operation.GT_EQ, "90"); + assertProjectionInclusive( + spec, greaterThanOrEqual("value", value), Expression.Operation.GT_EQ, "90"); assertProjectionInclusive(spec, equal("value", value), Expression.Operation.EQ, "90"); assertProjectionInclusiveValue(spec, notEqual("value", value), Expression.Operation.TRUE); - assertProjectionInclusive(spec, in("value", value - 1, value, value + 1), - Expression.Operation.IN, "[90, 90, 100]"); - assertProjectionInclusiveValue(spec, notIn("value", value, value - 1), Expression.Operation.TRUE); + assertProjectionInclusive( + spec, in("value", value - 1, value, value + 1), Expression.Operation.IN, "[90, 90, 100]"); + assertProjectionInclusiveValue( + spec, notIn("value", value, value - 1), Expression.Operation.TRUE); } @Test @@ -191,8 +218,11 @@ public void testLongStrictLowerBound() { assertProjectionStrict(spec, notEqual("value", value), Expression.Operation.NOT_EQ, "100"); assertProjectionStrictValue(spec, equal("value", value), Expression.Operation.FALSE); - assertProjectionStrict(spec, notIn("value", value - 1, value, value + 1), - Expression.Operation.NOT_IN, "[90, 100, 100]"); + assertProjectionStrict( + spec, + notIn("value", value - 1, value, value + 1), + Expression.Operation.NOT_IN, + "[90, 100, 100]"); assertProjectionStrictValue(spec, in("value", value, value + 1), Expression.Operation.FALSE); } @@ -209,8 +239,11 @@ public void testLongStrictUpperBound() { assertProjectionStrict(spec, notEqual("value", value), Expression.Operation.NOT_EQ, "90"); assertProjectionStrictValue(spec, equal("value", value), Expression.Operation.FALSE); - assertProjectionStrict(spec, notIn("value", value - 1, value, value + 1), - Expression.Operation.NOT_IN, "[90, 90, 100]"); + assertProjectionStrict( + spec, + notIn("value", value - 1, value, value + 1), + Expression.Operation.NOT_IN, + "[90, 90, 100]"); assertProjectionStrictValue(spec, in("value", value, value - 1), Expression.Operation.FALSE); } @@ -221,15 +254,18 @@ public void testLongInclusiveLowerBound() { PartitionSpec 
spec = PartitionSpec.builderFor(schema).truncate("value", 10).build(); assertProjectionInclusive(spec, lessThan("value", value), Expression.Operation.LT_EQ, "90"); - assertProjectionInclusive(spec, lessThanOrEqual("value", value), Expression.Operation.LT_EQ, "100"); + assertProjectionInclusive( + spec, lessThanOrEqual("value", value), Expression.Operation.LT_EQ, "100"); assertProjectionInclusive(spec, greaterThan("value", value), Expression.Operation.GT_EQ, "100"); - assertProjectionInclusive(spec, greaterThanOrEqual("value", value), Expression.Operation.GT_EQ, "100"); + assertProjectionInclusive( + spec, greaterThanOrEqual("value", value), Expression.Operation.GT_EQ, "100"); assertProjectionInclusive(spec, equal("value", value), Expression.Operation.EQ, "100"); assertProjectionInclusiveValue(spec, notEqual("value", value), Expression.Operation.TRUE); - assertProjectionInclusive(spec, in("value", value - 1, value, value + 1), - Expression.Operation.IN, "[90, 100, 100]"); - assertProjectionInclusiveValue(spec, notIn("value", value, value + 1), Expression.Operation.TRUE); + assertProjectionInclusive( + spec, in("value", value - 1, value, value + 1), Expression.Operation.IN, "[90, 100, 100]"); + assertProjectionInclusiveValue( + spec, notIn("value", value, value + 1), Expression.Operation.TRUE); } @Test @@ -239,15 +275,18 @@ public void testLongInclusiveUpperBound() { PartitionSpec spec = PartitionSpec.builderFor(schema).truncate("value", 10).build(); assertProjectionInclusive(spec, lessThan("value", value), Expression.Operation.LT_EQ, "90"); - assertProjectionInclusive(spec, lessThanOrEqual("value", value), Expression.Operation.LT_EQ, "90"); + assertProjectionInclusive( + spec, lessThanOrEqual("value", value), Expression.Operation.LT_EQ, "90"); assertProjectionInclusive(spec, greaterThan("value", value), Expression.Operation.GT_EQ, "100"); - assertProjectionInclusive(spec, greaterThanOrEqual("value", value), Expression.Operation.GT_EQ, "90"); + assertProjectionInclusive( + spec, greaterThanOrEqual("value", value), Expression.Operation.GT_EQ, "90"); assertProjectionInclusive(spec, equal("value", value), Expression.Operation.EQ, "90"); assertProjectionInclusiveValue(spec, notEqual("value", value), Expression.Operation.TRUE); - assertProjectionInclusive(spec, in("value", value - 1, value, value + 1), - Expression.Operation.IN, "[90, 90, 100]"); - assertProjectionInclusiveValue(spec, notIn("value", value, value - 1), Expression.Operation.TRUE); + assertProjectionInclusive( + spec, in("value", value - 1, value, value + 1), Expression.Operation.IN, "[90, 90, 100]"); + assertProjectionInclusiveValue( + spec, notIn("value", value, value - 1), Expression.Operation.TRUE); } @Test @@ -258,16 +297,22 @@ public void testDecimalStrictLowerBound() { PartitionSpec spec = PartitionSpec.builderFor(schema).truncate("value", 10).build(); assertProjectionStrict(spec, lessThan("value", value), Expression.Operation.LT, "100.00"); - assertProjectionStrict(spec, lessThanOrEqual("value", value), Expression.Operation.LT, "100.00"); + assertProjectionStrict( + spec, lessThanOrEqual("value", value), Expression.Operation.LT, "100.00"); assertProjectionStrict(spec, greaterThan("value", value), Expression.Operation.GT, "100.00"); - assertProjectionStrict(spec, greaterThanOrEqual("value", value), Expression.Operation.GT, "99.90"); + assertProjectionStrict( + spec, greaterThanOrEqual("value", value), Expression.Operation.GT, "99.90"); assertProjectionStrict(spec, notEqual("value", value), Expression.Operation.NOT_EQ, 
"100.00"); assertProjectionStrictValue(spec, equal("value", value), Expression.Operation.FALSE); BigDecimal delta = new BigDecimal(1); - assertProjectionStrict(spec, notIn("value", value.add(delta), value, value.subtract(delta)), - Expression.Operation.NOT_IN, "[99.00, 100.00, 101.00]"); - assertProjectionStrictValue(spec, in("value", value, value.add(delta)), Expression.Operation.FALSE); + assertProjectionStrict( + spec, + notIn("value", value.add(delta), value, value.subtract(delta)), + Expression.Operation.NOT_IN, + "[99.00, 100.00, 101.00]"); + assertProjectionStrictValue( + spec, in("value", value, value.add(delta)), Expression.Operation.FALSE); } @Test @@ -278,16 +323,22 @@ public void testDecimalStrictUpperBound() { PartitionSpec spec = PartitionSpec.builderFor(schema).truncate("value", 10).build(); assertProjectionStrict(spec, lessThan("value", value), Expression.Operation.LT, "99.90"); - assertProjectionStrict(spec, lessThanOrEqual("value", value), Expression.Operation.LT, "100.00"); + assertProjectionStrict( + spec, lessThanOrEqual("value", value), Expression.Operation.LT, "100.00"); assertProjectionStrict(spec, greaterThan("value", value), Expression.Operation.GT, "99.90"); - assertProjectionStrict(spec, greaterThanOrEqual("value", value), Expression.Operation.GT, "99.90"); + assertProjectionStrict( + spec, greaterThanOrEqual("value", value), Expression.Operation.GT, "99.90"); assertProjectionStrict(spec, notEqual("value", value), Expression.Operation.NOT_EQ, "99.90"); assertProjectionStrictValue(spec, equal("value", value), Expression.Operation.FALSE); BigDecimal delta = new BigDecimal(1); - assertProjectionStrict(spec, notIn("value", value.add(delta), value, value.subtract(delta)), - Expression.Operation.NOT_IN, "[98.90, 99.90, 100.90]"); - assertProjectionStrictValue(spec, in("value", value, value.subtract(delta)), Expression.Operation.FALSE); + assertProjectionStrict( + spec, + notIn("value", value.add(delta), value, value.subtract(delta)), + Expression.Operation.NOT_IN, + "[98.90, 99.90, 100.90]"); + assertProjectionStrictValue( + spec, in("value", value, value.subtract(delta)), Expression.Operation.FALSE); } @Test @@ -298,16 +349,23 @@ public void testDecimalInclusiveLowerBound() { PartitionSpec spec = PartitionSpec.builderFor(schema).truncate("value", 10).build(); assertProjectionInclusive(spec, lessThan("value", value), Expression.Operation.LT_EQ, "99.90"); - assertProjectionInclusive(spec, lessThanOrEqual("value", value), Expression.Operation.LT_EQ, "100.00"); - assertProjectionInclusive(spec, greaterThan("value", value), Expression.Operation.GT_EQ, "100.00"); - assertProjectionInclusive(spec, greaterThanOrEqual("value", value), Expression.Operation.GT_EQ, "100.00"); + assertProjectionInclusive( + spec, lessThanOrEqual("value", value), Expression.Operation.LT_EQ, "100.00"); + assertProjectionInclusive( + spec, greaterThan("value", value), Expression.Operation.GT_EQ, "100.00"); + assertProjectionInclusive( + spec, greaterThanOrEqual("value", value), Expression.Operation.GT_EQ, "100.00"); assertProjectionInclusive(spec, equal("value", value), Expression.Operation.EQ, "100.00"); assertProjectionInclusiveValue(spec, notEqual("value", value), Expression.Operation.TRUE); BigDecimal delta = new BigDecimal(1); - assertProjectionInclusive(spec, in("value", value.add(delta), value, value.subtract(delta)), - Expression.Operation.IN, "[99.00, 100.00, 101.00]"); - assertProjectionInclusiveValue(spec, notIn("value", value, value.add(delta)), Expression.Operation.TRUE); + 
assertProjectionInclusive( + spec, + in("value", value.add(delta), value, value.subtract(delta)), + Expression.Operation.IN, + "[99.00, 100.00, 101.00]"); + assertProjectionInclusiveValue( + spec, notIn("value", value, value.add(delta)), Expression.Operation.TRUE); } @Test @@ -318,16 +376,23 @@ public void testDecimalInclusiveUpperBound() { PartitionSpec spec = PartitionSpec.builderFor(schema).truncate("value", 10).build(); assertProjectionInclusive(spec, lessThan("value", value), Expression.Operation.LT_EQ, "99.90"); - assertProjectionInclusive(spec, lessThanOrEqual("value", value), Expression.Operation.LT_EQ, "99.90"); - assertProjectionInclusive(spec, greaterThan("value", value), Expression.Operation.GT_EQ, "100.00"); - assertProjectionInclusive(spec, greaterThanOrEqual("value", value), Expression.Operation.GT_EQ, "99.90"); + assertProjectionInclusive( + spec, lessThanOrEqual("value", value), Expression.Operation.LT_EQ, "99.90"); + assertProjectionInclusive( + spec, greaterThan("value", value), Expression.Operation.GT_EQ, "100.00"); + assertProjectionInclusive( + spec, greaterThanOrEqual("value", value), Expression.Operation.GT_EQ, "99.90"); assertProjectionInclusive(spec, equal("value", value), Expression.Operation.EQ, "99.90"); assertProjectionInclusiveValue(spec, notEqual("value", value), Expression.Operation.TRUE); BigDecimal delta = new BigDecimal(1); - assertProjectionInclusive(spec, in("value", value.add(delta), value, value.subtract(delta)), - Expression.Operation.IN, "[98.90, 99.90, 100.90]"); - assertProjectionInclusiveValue(spec, notIn("value", value, value.subtract(delta)), Expression.Operation.TRUE); + assertProjectionInclusive( + spec, + in("value", value.add(delta), value, value.subtract(delta)), + Expression.Operation.IN, + "[98.90, 99.90, 100.90]"); + assertProjectionInclusiveValue( + spec, notIn("value", value, value.subtract(delta)), Expression.Operation.TRUE); } @Test @@ -339,13 +404,15 @@ public void testStringStrict() { assertProjectionStrict(spec, lessThan("value", value), Expression.Operation.LT, "abcde"); assertProjectionStrict(spec, lessThanOrEqual("value", value), Expression.Operation.LT, "abcde"); assertProjectionStrict(spec, greaterThan("value", value), Expression.Operation.GT, "abcde"); - assertProjectionStrict(spec, greaterThanOrEqual("value", value), Expression.Operation.GT, "abcde"); + assertProjectionStrict( + spec, greaterThanOrEqual("value", value), Expression.Operation.GT, "abcde"); assertProjectionStrict(spec, notEqual("value", value), Expression.Operation.NOT_EQ, "abcde"); assertProjectionStrictValue(spec, equal("value", value), Expression.Operation.FALSE); - assertProjectionStrict(spec, notIn("value", value, value + "abc"), - Expression.Operation.NOT_IN, "[abcde, abcde]"); - assertProjectionStrictValue(spec, in("value", value, value + "abc"), Expression.Operation.FALSE); + assertProjectionStrict( + spec, notIn("value", value, value + "abc"), Expression.Operation.NOT_IN, "[abcde, abcde]"); + assertProjectionStrictValue( + spec, in("value", value, value + "abc"), Expression.Operation.FALSE); } @Test @@ -355,15 +422,19 @@ public void testStringInclusive() { PartitionSpec spec = PartitionSpec.builderFor(schema).truncate("value", 5).build(); assertProjectionInclusive(spec, lessThan("value", value), Expression.Operation.LT_EQ, "abcde"); - assertProjectionInclusive(spec, lessThanOrEqual("value", value), Expression.Operation.LT_EQ, "abcde"); - assertProjectionInclusive(spec, greaterThan("value", value), Expression.Operation.GT_EQ, "abcde"); - 
assertProjectionInclusive(spec, greaterThanOrEqual("value", value), Expression.Operation.GT_EQ, "abcde"); + assertProjectionInclusive( + spec, lessThanOrEqual("value", value), Expression.Operation.LT_EQ, "abcde"); + assertProjectionInclusive( + spec, greaterThan("value", value), Expression.Operation.GT_EQ, "abcde"); + assertProjectionInclusive( + spec, greaterThanOrEqual("value", value), Expression.Operation.GT_EQ, "abcde"); assertProjectionInclusive(spec, equal("value", value), Expression.Operation.EQ, "abcde"); assertProjectionInclusiveValue(spec, notEqual("value", value), Expression.Operation.TRUE); - assertProjectionInclusive(spec, in("value", value, value + "abc"), - Expression.Operation.IN, "[abcde, abcde]"); - assertProjectionInclusiveValue(spec, notIn("value", value, value + "abc"), Expression.Operation.TRUE); + assertProjectionInclusive( + spec, in("value", value, value + "abc"), Expression.Operation.IN, "[abcde, abcde]"); + assertProjectionInclusiveValue( + spec, notIn("value", value, value + "abc"), Expression.Operation.TRUE); } @Test @@ -374,15 +445,22 @@ public void testBinaryStrict() throws Exception { String expectedValue = TransformUtil.base64encode(ByteBuffer.wrap("abcde".getBytes("UTF-8"))); assertProjectionStrict(spec, lessThan("value", value), Expression.Operation.LT, expectedValue); - assertProjectionStrict(spec, lessThanOrEqual("value", value), Expression.Operation.LT, expectedValue); - assertProjectionStrict(spec, greaterThan("value", value), Expression.Operation.GT, expectedValue); - assertProjectionStrict(spec, greaterThanOrEqual("value", value), Expression.Operation.GT, expectedValue); - assertProjectionStrict(spec, notEqual("value", value), Expression.Operation.NOT_EQ, expectedValue); + assertProjectionStrict( + spec, lessThanOrEqual("value", value), Expression.Operation.LT, expectedValue); + assertProjectionStrict( + spec, greaterThan("value", value), Expression.Operation.GT, expectedValue); + assertProjectionStrict( + spec, greaterThanOrEqual("value", value), Expression.Operation.GT, expectedValue); + assertProjectionStrict( + spec, notEqual("value", value), Expression.Operation.NOT_EQ, expectedValue); assertProjectionStrictValue(spec, equal("value", value), Expression.Operation.FALSE); ByteBuffer anotherValue = ByteBuffer.wrap("abcdehij".getBytes("UTF-8")); - assertProjectionStrict(spec, notIn("value", value, anotherValue), - Expression.Operation.NOT_IN, String.format("[%s, %s]", expectedValue, expectedValue)); + assertProjectionStrict( + spec, + notIn("value", value, anotherValue), + Expression.Operation.NOT_IN, + String.format("[%s, %s]", expectedValue, expectedValue)); assertProjectionStrictValue(spec, in("value", value, anotherValue), Expression.Operation.FALSE); } @@ -393,16 +471,24 @@ public void testBinaryInclusive() throws Exception { PartitionSpec spec = PartitionSpec.builderFor(schema).truncate("value", 5).build(); String expectedValue = TransformUtil.base64encode(ByteBuffer.wrap("abcde".getBytes("UTF-8"))); - assertProjectionInclusive(spec, lessThan("value", value), Expression.Operation.LT_EQ, expectedValue); - assertProjectionInclusive(spec, lessThanOrEqual("value", value), Expression.Operation.LT_EQ, expectedValue); - assertProjectionInclusive(spec, greaterThan("value", value), Expression.Operation.GT_EQ, expectedValue); - assertProjectionInclusive(spec, greaterThanOrEqual("value", value), Expression.Operation.GT_EQ, expectedValue); + assertProjectionInclusive( + spec, lessThan("value", value), Expression.Operation.LT_EQ, expectedValue); + 
assertProjectionInclusive( + spec, lessThanOrEqual("value", value), Expression.Operation.LT_EQ, expectedValue); + assertProjectionInclusive( + spec, greaterThan("value", value), Expression.Operation.GT_EQ, expectedValue); + assertProjectionInclusive( + spec, greaterThanOrEqual("value", value), Expression.Operation.GT_EQ, expectedValue); assertProjectionInclusive(spec, equal("value", value), Expression.Operation.EQ, expectedValue); assertProjectionInclusiveValue(spec, notEqual("value", value), Expression.Operation.TRUE); ByteBuffer anotherValue = ByteBuffer.wrap("abcdehij".getBytes("UTF-8")); - assertProjectionInclusive(spec, in("value", value, anotherValue), - Expression.Operation.IN, String.format("[%s, %s]", expectedValue, expectedValue)); - assertProjectionInclusiveValue(spec, notIn("value", value, anotherValue), Expression.Operation.TRUE); + assertProjectionInclusive( + spec, + in("value", value, anotherValue), + Expression.Operation.IN, + String.format("[%s, %s]", expectedValue, expectedValue)); + assertProjectionInclusiveValue( + spec, notIn("value", value, anotherValue), Expression.Operation.TRUE); } } diff --git a/api/src/test/java/org/apache/iceberg/transforms/TestTruncatesResiduals.java b/api/src/test/java/org/apache/iceberg/transforms/TestTruncatesResiduals.java index 2bb6e2458d99..3234b09f37de 100644 --- a/api/src/test/java/org/apache/iceberg/transforms/TestTruncatesResiduals.java +++ b/api/src/test/java/org/apache/iceberg/transforms/TestTruncatesResiduals.java @@ -16,19 +16,8 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.transforms; -import org.apache.iceberg.PartitionSpec; -import org.apache.iceberg.Schema; -import org.apache.iceberg.TestHelpers; -import org.apache.iceberg.expressions.Expression; -import org.apache.iceberg.expressions.ResidualEvaluator; -import org.apache.iceberg.expressions.UnboundPredicate; -import org.apache.iceberg.types.Types; -import org.junit.Assert; -import org.junit.Test; - import static org.apache.iceberg.TestHelpers.assertAndUnwrapUnbound; import static org.apache.iceberg.expressions.Expressions.equal; import static org.apache.iceberg.expressions.Expressions.greaterThan; @@ -39,11 +28,21 @@ import static org.apache.iceberg.expressions.Expressions.notStartsWith; import static org.apache.iceberg.expressions.Expressions.startsWith; +import org.apache.iceberg.PartitionSpec; +import org.apache.iceberg.Schema; +import org.apache.iceberg.TestHelpers; +import org.apache.iceberg.expressions.Expression; +import org.apache.iceberg.expressions.ResidualEvaluator; +import org.apache.iceberg.expressions.UnboundPredicate; +import org.apache.iceberg.types.Types; +import org.junit.Assert; +import org.junit.Test; + public class TestTruncatesResiduals { /** - * Test helper method to compute residual for a given partitionValue against a predicate - * and assert the resulting residual expression is same as the expectedOp + * Test helper method to compute residual for a given partitionValue against a predicate and + * assert the resulting residual expression is same as the expectedOp * * @param spec the partition spec * @param predicate predicate to calculate the residual against @@ -51,8 +50,11 @@ public class TestTruncatesResiduals { * @param expectedOp expected operation to assert against * @param Type parameter of partitionValue */ - public void assertResidualValue(PartitionSpec spec, UnboundPredicate predicate, - T partitionValue, Expression.Operation expectedOp) { + public void assertResidualValue( 
+ PartitionSpec spec, + UnboundPredicate predicate, + T partitionValue, + Expression.Operation expectedOp) { ResidualEvaluator resEval = ResidualEvaluator.of(spec, predicate, true); Expression residual = resEval.residualFor(TestHelpers.Row.of(partitionValue)); @@ -60,16 +62,16 @@ public void assertResidualValue(PartitionSpec spec, UnboundPredicate pred } /** - * Test helper method to compute residual for a given partitionValue against a predicate - * and assert that the resulting expression is same as the original predicate + * Test helper method to compute residual for a given partitionValue against a predicate and + * assert that the resulting expression is same as the original predicate * * @param spec the partition spec * @param predicate predicate to calculate the residual against * @param partitionValue value of the partition to check the residual for * @param Type parameter of partitionValue */ - public void assertResidualPredicate(PartitionSpec spec, - UnboundPredicate predicate, T partitionValue) { + public void assertResidualPredicate( + PartitionSpec spec, UnboundPredicate predicate, T partitionValue) { ResidualEvaluator resEval = ResidualEvaluator.of(spec, predicate, true); Expression residual = resEval.residualFor(TestHelpers.Row.of(partitionValue)); diff --git a/api/src/test/java/org/apache/iceberg/types/TestBinaryComparator.java b/api/src/test/java/org/apache/iceberg/types/TestBinaryComparator.java index 39655cabcd75..3454a47e3058 100644 --- a/api/src/test/java/org/apache/iceberg/types/TestBinaryComparator.java +++ b/api/src/test/java/org/apache/iceberg/types/TestBinaryComparator.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.types; import java.nio.ByteBuffer; @@ -27,35 +26,33 @@ /** * Tests the comparator returned by binary and fixed literals. - *

- * The tests use assertTrue instead of assertEquals because the return value is not necessarily one - * of {-1, 0, 1}. It is also more clear to compare the return value to 0 because the same operation - * can be used: a < b is equivalent to compare(a, b) < 0. + * + *

The tests use assertTrue instead of assertEquals because the return value is not necessarily + * one of {-1, 0, 1}. It is also more clear to compare the return value to 0 because the same + * operation can be used: a < b is equivalent to compare(a, b) < 0. */ public class TestBinaryComparator { @Test public void testBinaryUnsignedComparator() { // b1 < b2 because comparison is unsigned, and -1 has msb set - ByteBuffer b1 = ByteBuffer.wrap(new byte[] { 1, 1, 2 }); - ByteBuffer b2 = ByteBuffer.wrap(new byte[] { 1, -1, 2 }); + ByteBuffer b1 = ByteBuffer.wrap(new byte[] {1, 1, 2}); + ByteBuffer b2 = ByteBuffer.wrap(new byte[] {1, -1, 2}); Comparator cmp = Literal.of(b1).comparator(); - Assert.assertTrue("Negative bytes should sort after positive bytes", - cmp.compare(b1, b2) < 0); + Assert.assertTrue("Negative bytes should sort after positive bytes", cmp.compare(b1, b2) < 0); } @Test public void testFixedUnsignedComparator() { // b1 < b2 because comparison is unsigned, and -1 has msb set - ByteBuffer b1 = ByteBuffer.wrap(new byte[] { 1, 1, 2 }); - ByteBuffer b2 = ByteBuffer.wrap(new byte[] { 1, -1, 2 }); + ByteBuffer b1 = ByteBuffer.wrap(new byte[] {1, 1, 2}); + ByteBuffer b2 = ByteBuffer.wrap(new byte[] {1, -1, 2}); Literal fixedLit = Literal.of(b1).to(Types.FixedType.ofLength(3)); Comparator cmp = fixedLit.comparator(); - Assert.assertTrue("Negative bytes should sort after positive bytes", - cmp.compare(b1, b2) < 0); + Assert.assertTrue("Negative bytes should sort after positive bytes", cmp.compare(b1, b2) < 0); } @Test diff --git a/api/src/test/java/org/apache/iceberg/types/TestCharSeqComparator.java b/api/src/test/java/org/apache/iceberg/types/TestCharSeqComparator.java index a2edb69c4d9e..488d65f096d0 100644 --- a/api/src/test/java/org/apache/iceberg/types/TestCharSeqComparator.java +++ b/api/src/test/java/org/apache/iceberg/types/TestCharSeqComparator.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.types; import java.util.Comparator; @@ -27,10 +26,10 @@ /** * Tests the comparator returned by CharSequence literals. - *

- * The tests use assertTrue instead of assertEquals because the return value is not necessarily one - * of {-1, 0, 1}. It is also more clear to compare the return value to 0 because the same operation - * can be used: a < b is equivalent to compare(a, b) < 0. + * + *

The tests use assertTrue instead of assertEquals because the return value is not necessarily + * one of {-1, 0, 1}. It is also more clear to compare the return value to 0 because the same + * operation can be used: a < b is equivalent to compare(a, b) < 0. */ public class TestCharSeqComparator { @Test @@ -39,12 +38,10 @@ public void testStringAndUtf8() { Utf8 s2 = new Utf8("abc"); Comparator stringComp = Literal.of(s1).comparator(); - Assert.assertEquals("Should consider String and Utf8 equal", - 0, stringComp.compare(s1, s2)); + Assert.assertEquals("Should consider String and Utf8 equal", 0, stringComp.compare(s1, s2)); Comparator utf8Comp = Literal.of(s2).comparator(); - Assert.assertEquals("Should consider String and Utf8 equal", - 0, utf8Comp.compare(s1, s2)); + Assert.assertEquals("Should consider String and Utf8 equal", 0, utf8Comp.compare(s1, s2)); } @Test @@ -55,15 +52,17 @@ public void testSeqLength() { Comparator cmp = Literal.of(s1).comparator(); // Sanity check that String.compareTo gives the same result - Assert.assertTrue("When one string is a substring of the other, the longer is greater", - s1.compareTo(s2) < 0); - Assert.assertTrue("When one string is a substring of the other, the longer is greater", - s2.compareTo(s1) > 0); + Assert.assertTrue( + "When one string is a substring of the other, the longer is greater", s1.compareTo(s2) < 0); + Assert.assertTrue( + "When one string is a substring of the other, the longer is greater", s2.compareTo(s1) > 0); // Test the comparator - Assert.assertTrue("When one string is a substring of the other, the longer is greater", + Assert.assertTrue( + "When one string is a substring of the other, the longer is greater", cmp.compare(s1, s2) < 0); - Assert.assertTrue("When one string is a substring of the other, the longer is greater", + Assert.assertTrue( + "When one string is a substring of the other, the longer is greater", cmp.compare(s2, s1) > 0); } @@ -76,16 +75,12 @@ public void testCharOrderBeforeLength() { Comparator cmp = Literal.of(s1).comparator(); // Sanity check that String.compareTo gives the same result - Assert.assertTrue("First difference takes precedence over length", - s1.compareTo(s2) > 0); - Assert.assertTrue("First difference takes precedence over length", - s2.compareTo(s1) < 0); + Assert.assertTrue("First difference takes precedence over length", s1.compareTo(s2) > 0); + Assert.assertTrue("First difference takes precedence over length", s2.compareTo(s1) < 0); // Test the comparator - Assert.assertTrue("First difference takes precedence over length", - cmp.compare(s1, s2) > 0); - Assert.assertTrue("First difference takes precedence over length", - cmp.compare(s2, s1) < 0); + Assert.assertTrue("First difference takes precedence over length", cmp.compare(s1, s2) > 0); + Assert.assertTrue("First difference takes precedence over length", cmp.compare(s2, s1) < 0); } @Test diff --git a/api/src/test/java/org/apache/iceberg/types/TestComparableComparator.java b/api/src/test/java/org/apache/iceberg/types/TestComparableComparator.java index 309d885b8fae..8c36ee3e68d2 100644 --- a/api/src/test/java/org/apache/iceberg/types/TestComparableComparator.java +++ b/api/src/test/java/org/apache/iceberg/types/TestComparableComparator.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.types; import java.util.Comparator; @@ -26,19 +25,18 @@ /** * This tests the Comparator returned by ComparableLiteral, which is used for most types. - *

- * The tests use assertTrue instead of assertEquals because the return value is not necessarily one - * of {-1, 0, 1}. It is also more clear to compare the return value to 0 because the same operation - * can be used: a < b is equivalent to compare(a, b) < 0. + * + *

The tests use assertTrue instead of assertEquals because the return value is not necessarily + * one of {-1, 0, 1}. It is also more clear to compare the return value to 0 because the same + * operation can be used: a < b is equivalent to compare(a, b) < 0. */ public class TestComparableComparator { @Test public void testNaturalOrder() { Comparator cmp = Literal.of(34L).comparator(); - Assert.assertTrue("Should use the natural order for non-null values", - cmp.compare(33L, 34L) < 0); - Assert.assertTrue("Should use signed ordering", - cmp.compare(33L, -34L) > 0); + Assert.assertTrue( + "Should use the natural order for non-null values", cmp.compare(33L, 34L) < 0); + Assert.assertTrue("Should use signed ordering", cmp.compare(33L, -34L) > 0); } @Test diff --git a/api/src/test/java/org/apache/iceberg/types/TestComparators.java b/api/src/test/java/org/apache/iceberg/types/TestComparators.java index b9b31b579e34..4a9990bb9eca 100644 --- a/api/src/test/java/org/apache/iceberg/types/TestComparators.java +++ b/api/src/test/java/org/apache/iceberg/types/TestComparators.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.types; import java.math.BigDecimal; @@ -86,7 +85,8 @@ public void testString() { @Test public void testUuid() { - assertComparesCorrectly(Comparators.forType(Types.UUIDType.get()), + assertComparesCorrectly( + Comparators.forType(Types.UUIDType.get()), UUID.fromString("81873e7d-1374-4493-8e1d-9095eff7046c"), UUID.fromString("fd02441d-1423-4a3f-8785-c7dd5647e26b")); } @@ -136,15 +136,16 @@ public void testList() { @Test public void testStruct() { assertComparesCorrectly( - Comparators.forType(Types.StructType.of( - Types.NestedField.required(18, "str19", Types.StringType.get()), - Types.NestedField.required(19, "int19", Types.IntegerType.get()))), + Comparators.forType( + Types.StructType.of( + Types.NestedField.required(18, "str19", Types.StringType.get()), + Types.NestedField.required(19, "int19", Types.IntegerType.get()))), TestHelpers.Row.of("a", 1), TestHelpers.Row.of("a", 2)); assertComparesCorrectly( - Comparators.forType(Types.StructType.of( - Types.NestedField.optional(18, "str19", Types.StringType.get()))), + Comparators.forType( + Types.StructType.of(Types.NestedField.optional(18, "str19", Types.StringType.get()))), TestHelpers.Row.of((String) null), TestHelpers.Row.of("a")); } diff --git a/api/src/test/java/org/apache/iceberg/types/TestConversions.java b/api/src/test/java/org/apache/iceberg/types/TestConversions.java index 3dcf62e75e36..226bf1892e62 100644 --- a/api/src/test/java/org/apache/iceberg/types/TestConversions.java +++ b/api/src/test/java/org/apache/iceberg/types/TestConversions.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.types; import java.math.BigDecimal; @@ -46,70 +45,76 @@ public class TestConversions { @Test public void testByteBufferConversions() { // booleans are stored as 0x00 for 'false' and a non-zero byte for 'true' - assertConversion(false, BooleanType.get(), new byte[]{0x00}); - assertConversion(true, BooleanType.get(), new byte[]{0x01}); - Assert.assertArrayEquals(new byte[]{0x00}, Literal.of(false).toByteBuffer().array()); - Assert.assertArrayEquals(new byte[]{0x01}, Literal.of(true).toByteBuffer().array()); + assertConversion(false, BooleanType.get(), new byte[] {0x00}); + assertConversion(true, BooleanType.get(), new byte[] {0x01}); + Assert.assertArrayEquals(new byte[] {0x00}, Literal.of(false).toByteBuffer().array()); + Assert.assertArrayEquals(new byte[] {0x01}, Literal.of(true).toByteBuffer().array()); // integers are stored as 4 bytes in little-endian order // 84202 is 0...01|01001000|11101010 in binary // 11101010 -> -22, 01001000 -> 72, 00000001 -> 1, 00000000 -> 0 - assertConversion(84202, IntegerType.get(), new byte[]{-22, 72, 1, 0}); - Assert.assertArrayEquals(new byte[]{-22, 72, 1, 0}, Literal.of(84202).toByteBuffer().array()); + assertConversion(84202, IntegerType.get(), new byte[] {-22, 72, 1, 0}); + Assert.assertArrayEquals(new byte[] {-22, 72, 1, 0}, Literal.of(84202).toByteBuffer().array()); // longs are stored as 8 bytes in little-endian order // 200L is 0...0|11001000 in binary // 11001000 -> -56, 00000000 -> 0, ... , 00000000 -> 0 - assertConversion(200L, LongType.get(), new byte[]{-56, 0, 0, 0, 0, 0, 0, 0}); - Assert.assertArrayEquals(new byte[]{-56, 0, 0, 0, 0, 0, 0, 0}, Literal.of(200L).toByteBuffer().array()); + assertConversion(200L, LongType.get(), new byte[] {-56, 0, 0, 0, 0, 0, 0, 0}); + Assert.assertArrayEquals( + new byte[] {-56, 0, 0, 0, 0, 0, 0, 0}, Literal.of(200L).toByteBuffer().array()); // floats are stored as 4 bytes in little-endian order // floating point numbers are represented as sign * 2ˆexponent * mantissa // -4.5F is -1 * 2ˆ2 * 1.125 and encoded as 11000000|10010000|0...0 in binary // 00000000 -> 0, 00000000 -> 0, 10010000 -> -112, 11000000 -> -64, - assertConversion(-4.5F, FloatType.get(), new byte[]{0, 0, -112, -64}); - Assert.assertArrayEquals(new byte[]{0, 0, -112, -64}, Literal.of(-4.5F).toByteBuffer().array()); + assertConversion(-4.5F, FloatType.get(), new byte[] {0, 0, -112, -64}); + Assert.assertArrayEquals( + new byte[] {0, 0, -112, -64}, Literal.of(-4.5F).toByteBuffer().array()); // doubles are stored as 8 bytes in little-endian order // floating point numbers are represented as sign * 2ˆexponent * mantissa // 6.0 is 1 * 2ˆ4 * 1.5 and encoded as 01000000|00011000|0...0 // 00000000 -> 0, ... , 00011000 -> 24, 01000000 -> 64 - assertConversion(6.0, DoubleType.get(), new byte[]{0, 0, 0, 0, 0, 0, 24, 64}); - Assert.assertArrayEquals(new byte[]{0, 0, 0, 0, 0, 0, 24, 64}, Literal.of(6.0).toByteBuffer().array()); + assertConversion(6.0, DoubleType.get(), new byte[] {0, 0, 0, 0, 0, 0, 24, 64}); + Assert.assertArrayEquals( + new byte[] {0, 0, 0, 0, 0, 0, 24, 64}, Literal.of(6.0).toByteBuffer().array()); // dates are stored as days from 1970-01-01 in a 4-byte little-endian int // 1000 is 0...0|00000011|11101000 in binary // 11101000 -> -24, 00000011 -> 3, ... 
, 00000000 -> 0 - assertConversion(1000, DateType.get(), new byte[]{-24, 3, 0, 0}); - Assert.assertArrayEquals(new byte[]{-24, 3, 0, 0}, Literal.of(1000).to(DateType.get()).toByteBuffer().array()); + assertConversion(1000, DateType.get(), new byte[] {-24, 3, 0, 0}); + Assert.assertArrayEquals( + new byte[] {-24, 3, 0, 0}, Literal.of(1000).to(DateType.get()).toByteBuffer().array()); // time is stored as microseconds from midnight in an 8-byte little-endian long // 10000L is 0...0|00100111|00010000 in binary // 00010000 -> 16, 00100111 -> 39, ... , 00000000 -> 0 - assertConversion(10000L, TimeType.get(), new byte[]{16, 39, 0, 0, 0, 0, 0, 0}); + assertConversion(10000L, TimeType.get(), new byte[] {16, 39, 0, 0, 0, 0, 0, 0}); Assert.assertArrayEquals( - new byte[]{16, 39, 0, 0, 0, 0, 0, 0}, + new byte[] {16, 39, 0, 0, 0, 0, 0, 0}, Literal.of(10000L).to(TimeType.get()).toByteBuffer().array()); - // timestamps are stored as microseconds from 1970-01-01 00:00:00.000000 in an 8-byte little-endian long + // timestamps are stored as microseconds from 1970-01-01 00:00:00.000000 in an 8-byte + // little-endian long // 400000L is 0...110|00011010|10000000 in binary // 10000000 -> -128, 00011010 -> 26, 00000110 -> 6, ... , 00000000 -> 0 - assertConversion(400000L, TimestampType.withoutZone(), new byte[]{-128, 26, 6, 0, 0, 0, 0, 0}); - assertConversion(400000L, TimestampType.withZone(), new byte[]{-128, 26, 6, 0, 0, 0, 0, 0}); + assertConversion(400000L, TimestampType.withoutZone(), new byte[] {-128, 26, 6, 0, 0, 0, 0, 0}); + assertConversion(400000L, TimestampType.withZone(), new byte[] {-128, 26, 6, 0, 0, 0, 0, 0}); Assert.assertArrayEquals( - new byte[]{-128, 26, 6, 0, 0, 0, 0, 0}, + new byte[] {-128, 26, 6, 0, 0, 0, 0, 0}, Literal.of(400000L).to(TimestampType.withoutZone()).toByteBuffer().array()); Assert.assertArrayEquals( - new byte[]{-128, 26, 6, 0, 0, 0, 0, 0}, + new byte[] {-128, 26, 6, 0, 0, 0, 0, 0}, Literal.of(400000L).to(TimestampType.withZone()).toByteBuffer().array()); // strings are stored as UTF-8 bytes (without length) // 'A' -> 65, 'B' -> 66, 'C' -> 67 - assertConversion(CharBuffer.wrap("ABC"), StringType.get(), new byte[]{65, 66, 67}); - Assert.assertArrayEquals(new byte[]{65, 66, 67}, Literal.of("ABC").toByteBuffer().array()); + assertConversion(CharBuffer.wrap("ABC"), StringType.get(), new byte[] {65, 66, 67}); + Assert.assertArrayEquals(new byte[] {65, 66, 67}, Literal.of("ABC").toByteBuffer().array()); // uuids are stored as 16-byte big-endian values - // f79c3e09-677c-4bbd-a479-3f349cb785e7 is encoded as F7 9C 3E 09 67 7C 4B BD A4 79 3F 34 9C B7 85 E7 + // f79c3e09-677c-4bbd-a479-3f349cb785e7 is encoded as F7 9C 3E 09 67 7C 4B BD A4 79 3F 34 9C B7 + // 85 E7 // 0xF7 -> 11110111 -> -9, 0x9C -> 10011100 -> -100, 0x3E -> 00111110 -> 62, // 0x09 -> 00001001 -> 9, 0x67 -> 01100111 -> 103, 0x7C -> 01111100 -> 124, // 0x4B -> 01001011 -> 75, 0xBD -> 10111101 -> -67, 0xA4 -> 10100100 -> -92, @@ -119,9 +124,9 @@ public void testByteBufferConversions() { assertConversion( UUID.fromString("f79c3e09-677c-4bbd-a479-3f349cb785e7"), UUIDType.get(), - new byte[]{-9, -100, 62, 9, 103, 124, 75, -67, -92, 121, 63, 52, -100, -73, -123, -25}); + new byte[] {-9, -100, 62, 9, 103, 124, 75, -67, -92, 121, 63, 52, -100, -73, -123, -25}); Assert.assertArrayEquals( - new byte[]{-9, -100, 62, 9, 103, 124, 75, -67, -92, 121, 63, 52, -100, -73, -123, -25}, + new byte[] {-9, -100, 62, 9, 103, 124, 75, -67, -92, 121, 63, 52, -100, -73, -123, -25}, 
Literal.of(UUID.fromString("f79c3e09-677c-4bbd-a479-3f349cb785e7")).toByteBuffer().array()); // fixed values are stored directly @@ -129,65 +134,47 @@ public void testByteBufferConversions() { assertConversion( ByteBuffer.wrap("ab".getBytes(StandardCharsets.UTF_8)), FixedType.ofLength(2), - new byte[]{97, 98}); + new byte[] {97, 98}); Assert.assertArrayEquals( - new byte[]{97, 98}, + new byte[] {97, 98}, Literal.of("ab".getBytes(StandardCharsets.UTF_8)).toByteBuffer().array()); // binary values are stored directly // 'Z' -> 90 assertConversion( - ByteBuffer.wrap("Z".getBytes(StandardCharsets.UTF_8)), - BinaryType.get(), - new byte[]{90}); + ByteBuffer.wrap("Z".getBytes(StandardCharsets.UTF_8)), BinaryType.get(), new byte[] {90}); Assert.assertArrayEquals( - new byte[]{90}, + new byte[] {90}, Literal.of(ByteBuffer.wrap("Z".getBytes(StandardCharsets.UTF_8))).toByteBuffer().array()); // decimals are stored as unscaled values in the form of two's-complement big-endian binary, // using the minimum number of bytes for the values // 345 is 0...1|01011001 in binary // 00000001 -> 1, 01011001 -> 89 - assertConversion( - new BigDecimal("3.45"), - DecimalType.of(3, 2), - new byte[]{1, 89}); + assertConversion(new BigDecimal("3.45"), DecimalType.of(3, 2), new byte[] {1, 89}); Assert.assertArrayEquals( - new byte[]{1, 89}, - Literal.of(new BigDecimal("3.45")).toByteBuffer().array()); + new byte[] {1, 89}, Literal.of(new BigDecimal("3.45")).toByteBuffer().array()); // decimal on 3-bytes to test that we use the minimum number of bytes and not a power of 2 // 1234567 is 00010010|11010110|10000111 in binary // 00010010 -> 18, 11010110 -> -42, 10000111 -> -121 - assertConversion( - new BigDecimal("123.4567"), - DecimalType.of(7, 4), - new byte[]{18, -42, -121}); + assertConversion(new BigDecimal("123.4567"), DecimalType.of(7, 4), new byte[] {18, -42, -121}); Assert.assertArrayEquals( - new byte[]{18, -42, -121}, - Literal.of(new BigDecimal("123.4567")).toByteBuffer().array()); + new byte[] {18, -42, -121}, Literal.of(new BigDecimal("123.4567")).toByteBuffer().array()); // negative decimal to test two's complement // -1234567 is 11101101|00101001|01111001 in binary // 11101101 -> -19, 00101001 -> 41, 01111001 -> 121 - assertConversion( - new BigDecimal("-123.4567"), - DecimalType.of(7, 4), - new byte[]{-19, 41, 121}); + assertConversion(new BigDecimal("-123.4567"), DecimalType.of(7, 4), new byte[] {-19, 41, 121}); Assert.assertArrayEquals( - new byte[]{-19, 41, 121}, - Literal.of(new BigDecimal("-123.4567")).toByteBuffer().array()); + new byte[] {-19, 41, 121}, Literal.of(new BigDecimal("-123.4567")).toByteBuffer().array()); // test empty byte in decimal // 11 is 00001011 in binary // 00001011 -> 11 - assertConversion( - new BigDecimal("0.011"), - DecimalType.of(10, 3), - new byte[]{11}); + assertConversion(new BigDecimal("0.011"), DecimalType.of(10, 3), new byte[] {11}); Assert.assertArrayEquals( - new byte[]{11}, - Literal.of(new BigDecimal("0.011")).toByteBuffer().array()); + new byte[] {11}, Literal.of(new BigDecimal("0.011")).toByteBuffer().array()); } private void assertConversion(T value, Type type, byte[] expectedBinary) { diff --git a/api/src/test/java/org/apache/iceberg/types/TestReadabilityChecks.java b/api/src/test/java/org/apache/iceberg/types/TestReadabilityChecks.java index 1a967f61dc99..67d73f5f05c6 100644 --- a/api/src/test/java/org/apache/iceberg/types/TestReadabilityChecks.java +++ b/api/src/test/java/org/apache/iceberg/types/TestReadabilityChecks.java @@ -16,53 +16,55 @@ * specific 
language governing permissions and limitations * under the License. */ - package org.apache.iceberg.types; +import static org.apache.iceberg.types.Types.NestedField.optional; +import static org.apache.iceberg.types.Types.NestedField.required; + import java.util.List; import org.apache.iceberg.Schema; import org.apache.iceberg.types.Type.PrimitiveType; import org.junit.Assert; import org.junit.Test; -import static org.apache.iceberg.types.Types.NestedField.optional; -import static org.apache.iceberg.types.Types.NestedField.required; - public class TestReadabilityChecks { - private static final Type.PrimitiveType[] PRIMITIVES = new Type.PrimitiveType[] { - Types.BooleanType.get(), - Types.IntegerType.get(), - Types.LongType.get(), - Types.FloatType.get(), - Types.DoubleType.get(), - Types.DateType.get(), - Types.TimeType.get(), - Types.TimestampType.withoutZone(), - Types.TimestampType.withZone(), - Types.StringType.get(), - Types.UUIDType.get(), - Types.FixedType.ofLength(3), - Types.FixedType.ofLength(4), - Types.BinaryType.get(), - Types.DecimalType.of(9, 2), - Types.DecimalType.of(11, 2), - Types.DecimalType.of(9, 3) - }; + private static final Type.PrimitiveType[] PRIMITIVES = + new Type.PrimitiveType[] { + Types.BooleanType.get(), + Types.IntegerType.get(), + Types.LongType.get(), + Types.FloatType.get(), + Types.DoubleType.get(), + Types.DateType.get(), + Types.TimeType.get(), + Types.TimestampType.withoutZone(), + Types.TimestampType.withZone(), + Types.StringType.get(), + Types.UUIDType.get(), + Types.FixedType.ofLength(3), + Types.FixedType.ofLength(4), + Types.BinaryType.get(), + Types.DecimalType.of(9, 2), + Types.DecimalType.of(11, 2), + Types.DecimalType.of(9, 3) + }; @Test public void testPrimitiveTypes() { for (Type.PrimitiveType from : PRIMITIVES) { Schema fromSchema = new Schema(required(1, "from_field", from)); for (Type.PrimitiveType to : PRIMITIVES) { - List errors = CheckCompatibility.writeCompatibilityErrors( - new Schema(required(1, "to_field", to)), fromSchema); + List errors = + CheckCompatibility.writeCompatibilityErrors( + new Schema(required(1, "to_field", to)), fromSchema); if (TypeUtil.isPromotionAllowed(from, to)) { Assert.assertEquals("Should produce 0 error messages", 0, errors.size()); } else { Assert.assertEquals("Should produce 1 error message", 1, errors.size()); - Assert.assertTrue("Should complain that promotion is not allowed", + Assert.assertTrue( + "Should complain that promotion is not allowed", errors.get(0).contains("cannot be promoted to")); } } @@ -74,13 +76,15 @@ public void testPrimitiveTypes() { } private void testDisallowPrimitiveToMap(PrimitiveType from, Schema fromSchema) { - Schema mapSchema = new Schema(required(1, "map_field", - Types.MapType.ofRequired(2, 3, Types.StringType.get(), from))); + Schema mapSchema = + new Schema( + required(1, "map_field", Types.MapType.ofRequired(2, 3, Types.StringType.get(), from))); List errors = CheckCompatibility.writeCompatibilityErrors(mapSchema, fromSchema); Assert.assertEquals("Should produce 1 error message", 1, errors.size()); - Assert.assertTrue("Should complain that primitive to map is not allowed", + Assert.assertTrue( + "Should complain that primitive to map is not allowed", errors.get(0).contains("cannot be read as a map")); } @@ -90,18 +94,19 @@ private void testDisallowPrimitiveToList(PrimitiveType from, Schema fromSchema) List errors = CheckCompatibility.writeCompatibilityErrors(listSchema, fromSchema); Assert.assertEquals("Should produce 1 error message", 1, errors.size()); - 
Assert.assertTrue("Should complain that primitive to list is not allowed", + Assert.assertTrue( + "Should complain that primitive to list is not allowed", errors.get(0).contains("cannot be read as a list")); } private void testDisallowPrimitiveToStruct(PrimitiveType from, Schema fromSchema) { - Schema structSchema = new Schema(required(1, "struct_field", Types.StructType.of( - required(2, "from", from)) - )); + Schema structSchema = + new Schema(required(1, "struct_field", Types.StructType.of(required(2, "from", from)))); List errors = CheckCompatibility.writeCompatibilityErrors(structSchema, fromSchema); Assert.assertEquals("Should produce 1 error message", 1, errors.size()); - Assert.assertTrue("Should complain that primitive to struct is not allowed", + Assert.assertTrue( + "Should complain that primitive to struct is not allowed", errors.get(0).contains("cannot be read as a struct")); } @@ -113,7 +118,8 @@ public void testRequiredSchemaField() { List errors = CheckCompatibility.writeCompatibilityErrors(read, write); Assert.assertEquals("Should produce 1 error message", 1, errors.size()); - Assert.assertTrue("Should complain that a required column is optional", + Assert.assertTrue( + "Should complain that a required column is optional", errors.get(0).contains("should be required, but is optional")); } @@ -125,50 +131,71 @@ public void testMissingSchemaField() { List errors = CheckCompatibility.writeCompatibilityErrors(read, write); Assert.assertEquals("Should produce 1 error message", 1, errors.size()); - Assert.assertTrue("Should complain that a required column is missing", + Assert.assertTrue( + "Should complain that a required column is missing", errors.get(0).contains("is required, but is missing")); } @Test public void testRequiredStructField() { - Schema write = new Schema(required(0, "nested", Types.StructType.of( - optional(1, "from_field", Types.IntegerType.get()) - ))); - Schema read = new Schema(required(0, "nested", Types.StructType.of( - required(1, "to_field", Types.IntegerType.get()) - ))); + Schema write = + new Schema( + required( + 0, + "nested", + Types.StructType.of(optional(1, "from_field", Types.IntegerType.get())))); + Schema read = + new Schema( + required( + 0, + "nested", + Types.StructType.of(required(1, "to_field", Types.IntegerType.get())))); List errors = CheckCompatibility.writeCompatibilityErrors(read, write); Assert.assertEquals("Should produce 1 error message", 1, errors.size()); - Assert.assertTrue("Should complain that a required field is optional", + Assert.assertTrue( + "Should complain that a required field is optional", errors.get(0).contains("should be required, but is optional")); } @Test public void testMissingRequiredStructField() { - Schema write = new Schema(required(0, "nested", Types.StructType.of( - optional(2, "from_field", Types.IntegerType.get()) - ))); - Schema read = new Schema(required(0, "nested", Types.StructType.of( - required(1, "to_field", Types.IntegerType.get()) - ))); + Schema write = + new Schema( + required( + 0, + "nested", + Types.StructType.of(optional(2, "from_field", Types.IntegerType.get())))); + Schema read = + new Schema( + required( + 0, + "nested", + Types.StructType.of(required(1, "to_field", Types.IntegerType.get())))); List errors = CheckCompatibility.writeCompatibilityErrors(read, write); Assert.assertEquals("Should produce 1 error message", 1, errors.size()); - Assert.assertTrue("Should complain that a required field is missing", + Assert.assertTrue( + "Should complain that a required field is missing", 
errors.get(0).contains("is required, but is missing")); } @Test public void testMissingOptionalStructField() { - Schema write = new Schema(required(0, "nested", Types.StructType.of( - required(2, "from_field", Types.IntegerType.get()) - ))); - Schema read = new Schema(required(0, "nested", Types.StructType.of( - optional(1, "to_field", Types.IntegerType.get()) - ))); + Schema write = + new Schema( + required( + 0, + "nested", + Types.StructType.of(required(2, "from_field", Types.IntegerType.get())))); + Schema read = + new Schema( + required( + 0, + "nested", + Types.StructType.of(optional(1, "to_field", Types.IntegerType.get())))); List errors = CheckCompatibility.writeCompatibilityErrors(read, write); Assert.assertEquals("Should produce no error messages", 0, errors.size()); @@ -176,172 +203,222 @@ public void testMissingOptionalStructField() { @Test public void testIncompatibleStructField() { - Schema write = new Schema(required(0, "nested", Types.StructType.of( - required(1, "from_field", Types.IntegerType.get()) - ))); - Schema read = new Schema(required(0, "nested", Types.StructType.of( - required(1, "to_field", Types.FloatType.get()) - ))); + Schema write = + new Schema( + required( + 0, + "nested", + Types.StructType.of(required(1, "from_field", Types.IntegerType.get())))); + Schema read = + new Schema( + required( + 0, "nested", Types.StructType.of(required(1, "to_field", Types.FloatType.get())))); List errors = CheckCompatibility.writeCompatibilityErrors(read, write); Assert.assertEquals("Should produce 1 error message", 1, errors.size()); - Assert.assertTrue("Should complain about incompatible types", + Assert.assertTrue( + "Should complain about incompatible types", errors.get(0).contains("cannot be promoted to float")); } @Test public void testIncompatibleStructAndPrimitive() { - Schema write = new Schema(required(0, "nested", Types.StructType.of( - required(1, "from_field", Types.StringType.get()) - ))); + Schema write = + new Schema( + required( + 0, + "nested", + Types.StructType.of(required(1, "from_field", Types.StringType.get())))); Schema read = new Schema(required(0, "nested", Types.StringType.get())); List errors = CheckCompatibility.writeCompatibilityErrors(read, write); Assert.assertEquals("Should produce 1 error message", 1, errors.size()); - Assert.assertTrue("Should complain about incompatible types", + Assert.assertTrue( + "Should complain about incompatible types", errors.get(0).contains("struct cannot be read as a string")); } @Test public void testMultipleErrors() { // required field is optional and cannot be promoted to the read type - Schema write = new Schema(required(0, "nested", Types.StructType.of( - optional(1, "from_field", Types.IntegerType.get()) - ))); - Schema read = new Schema(required(0, "nested", Types.StructType.of( - required(1, "to_field", Types.FloatType.get()) - ))); + Schema write = + new Schema( + required( + 0, + "nested", + Types.StructType.of(optional(1, "from_field", Types.IntegerType.get())))); + Schema read = + new Schema( + required( + 0, "nested", Types.StructType.of(required(1, "to_field", Types.FloatType.get())))); List errors = CheckCompatibility.writeCompatibilityErrors(read, write); Assert.assertEquals("Should produce 1 error message", 2, errors.size()); - Assert.assertTrue("Should complain that a required field is optional", + Assert.assertTrue( + "Should complain that a required field is optional", errors.get(0).contains("should be required, but is optional")); - Assert.assertTrue("Should complain about incompatible 
types", + Assert.assertTrue( + "Should complain about incompatible types", errors.get(1).contains("cannot be promoted to float")); } @Test public void testRequiredMapValue() { - Schema write = new Schema(required(0, "map_field", Types.MapType.ofOptional( - 1, 2, Types.StringType.get(), Types.IntegerType.get() - ))); - Schema read = new Schema(required(0, "map_field", Types.MapType.ofRequired( - 1, 2, Types.StringType.get(), Types.IntegerType.get() - ))); + Schema write = + new Schema( + required( + 0, + "map_field", + Types.MapType.ofOptional(1, 2, Types.StringType.get(), Types.IntegerType.get()))); + Schema read = + new Schema( + required( + 0, + "map_field", + Types.MapType.ofRequired(1, 2, Types.StringType.get(), Types.IntegerType.get()))); List errors = CheckCompatibility.writeCompatibilityErrors(read, write); Assert.assertEquals("Should produce 1 error message", 1, errors.size()); - Assert.assertTrue("Should complain that values are optional", + Assert.assertTrue( + "Should complain that values are optional", errors.get(0).contains("values should be required, but are optional")); } @Test public void testIncompatibleMapKey() { - Schema write = new Schema(required(0, "map_field", Types.MapType.ofOptional( - 1, 2, Types.IntegerType.get(), Types.StringType.get() - ))); - Schema read = new Schema(required(0, "map_field", Types.MapType.ofOptional( - 1, 2, Types.DoubleType.get(), Types.StringType.get() - ))); + Schema write = + new Schema( + required( + 0, + "map_field", + Types.MapType.ofOptional(1, 2, Types.IntegerType.get(), Types.StringType.get()))); + Schema read = + new Schema( + required( + 0, + "map_field", + Types.MapType.ofOptional(1, 2, Types.DoubleType.get(), Types.StringType.get()))); List errors = CheckCompatibility.writeCompatibilityErrors(read, write); Assert.assertEquals("Should produce 1 error message", 1, errors.size()); - Assert.assertTrue("Should complain about incompatible types", + Assert.assertTrue( + "Should complain about incompatible types", errors.get(0).contains("cannot be promoted to double")); } @Test public void testIncompatibleMapValue() { - Schema write = new Schema(required(0, "map_field", Types.MapType.ofOptional( - 1, 2, Types.StringType.get(), Types.IntegerType.get() - ))); - Schema read = new Schema(required(0, "map_field", Types.MapType.ofOptional( - 1, 2, Types.StringType.get(), Types.DoubleType.get() - ))); + Schema write = + new Schema( + required( + 0, + "map_field", + Types.MapType.ofOptional(1, 2, Types.StringType.get(), Types.IntegerType.get()))); + Schema read = + new Schema( + required( + 0, + "map_field", + Types.MapType.ofOptional(1, 2, Types.StringType.get(), Types.DoubleType.get()))); List errors = CheckCompatibility.writeCompatibilityErrors(read, write); Assert.assertEquals("Should produce 1 error message", 1, errors.size()); - Assert.assertTrue("Should complain about incompatible types", + Assert.assertTrue( + "Should complain about incompatible types", errors.get(0).contains("cannot be promoted to double")); } @Test public void testIncompatibleMapAndPrimitive() { - Schema write = new Schema(required(0, "map_field", Types.MapType.ofOptional( - 1, 2, Types.StringType.get(), Types.IntegerType.get() - ))); + Schema write = + new Schema( + required( + 0, + "map_field", + Types.MapType.ofOptional(1, 2, Types.StringType.get(), Types.IntegerType.get()))); Schema read = new Schema(required(0, "map_field", Types.StringType.get())); List errors = CheckCompatibility.writeCompatibilityErrors(read, write); Assert.assertEquals("Should produce 1 error 
message", 1, errors.size()); - Assert.assertTrue("Should complain about incompatible types", + Assert.assertTrue( + "Should complain about incompatible types", errors.get(0).contains("map cannot be read as a string")); } @Test public void testRequiredListElement() { - Schema write = new Schema(required(0, "list_field", Types.ListType.ofOptional( - 1, Types.IntegerType.get() - ))); - Schema read = new Schema(required(0, "list_field", Types.ListType.ofRequired( - 1, Types.IntegerType.get() - ))); + Schema write = + new Schema( + required(0, "list_field", Types.ListType.ofOptional(1, Types.IntegerType.get()))); + Schema read = + new Schema( + required(0, "list_field", Types.ListType.ofRequired(1, Types.IntegerType.get()))); List errors = CheckCompatibility.writeCompatibilityErrors(read, write); Assert.assertEquals("Should produce 1 error message", 1, errors.size()); - Assert.assertTrue("Should complain that elements are optional", + Assert.assertTrue( + "Should complain that elements are optional", errors.get(0).contains("elements should be required, but are optional")); } @Test public void testIncompatibleListElement() { - Schema write = new Schema(required(0, "list_field", Types.ListType.ofOptional( - 1, Types.IntegerType.get() - ))); - Schema read = new Schema(required(0, "list_field", Types.ListType.ofOptional( - 1, Types.StringType.get() - ))); + Schema write = + new Schema( + required(0, "list_field", Types.ListType.ofOptional(1, Types.IntegerType.get()))); + Schema read = + new Schema(required(0, "list_field", Types.ListType.ofOptional(1, Types.StringType.get()))); List errors = CheckCompatibility.writeCompatibilityErrors(read, write); Assert.assertEquals("Should produce 1 error message", 1, errors.size()); - Assert.assertTrue("Should complain about incompatible types", + Assert.assertTrue( + "Should complain about incompatible types", errors.get(0).contains("cannot be promoted to string")); } @Test public void testIncompatibleListAndPrimitive() { - Schema write = new Schema(required(0, "list_field", Types.ListType.ofOptional( - 1, Types.IntegerType.get() - ))); + Schema write = + new Schema( + required(0, "list_field", Types.ListType.ofOptional(1, Types.IntegerType.get()))); Schema read = new Schema(required(0, "list_field", Types.StringType.get())); List errors = CheckCompatibility.writeCompatibilityErrors(read, write); Assert.assertEquals("Should produce 1 error message", 1, errors.size()); - Assert.assertTrue("Should complain about incompatible types", + Assert.assertTrue( + "Should complain about incompatible types", errors.get(0).contains("list cannot be read as a string")); } @Test public void testDifferentFieldOrdering() { // writes should not reorder fields - Schema read = new Schema(required(0, "nested", Types.StructType.of( - required(1, "field_a", Types.IntegerType.get()), - required(2, "field_b", Types.IntegerType.get()) - ))); - Schema write = new Schema(required(0, "nested", Types.StructType.of( - required(2, "field_b", Types.IntegerType.get()), - required(1, "field_a", Types.IntegerType.get()) - ))); + Schema read = + new Schema( + required( + 0, + "nested", + Types.StructType.of( + required(1, "field_a", Types.IntegerType.get()), + required(2, "field_b", Types.IntegerType.get())))); + Schema write = + new Schema( + required( + 0, + "nested", + Types.StructType.of( + required(2, "field_b", Types.IntegerType.get()), + required(1, "field_a", Types.IntegerType.get())))); List errors = CheckCompatibility.writeCompatibilityErrors(read, write, false); 
Assert.assertEquals("Should produce 0 error message", 0, errors.size()); @@ -350,33 +427,50 @@ public void testDifferentFieldOrdering() { @Test public void testStructWriteReordering() { // writes should not reorder fields - Schema read = new Schema(required(0, "nested", Types.StructType.of( - required(1, "field_a", Types.IntegerType.get()), - required(2, "field_b", Types.IntegerType.get()) - ))); - Schema write = new Schema(required(0, "nested", Types.StructType.of( - required(2, "field_b", Types.IntegerType.get()), - required(1, "field_a", Types.IntegerType.get()) - ))); + Schema read = + new Schema( + required( + 0, + "nested", + Types.StructType.of( + required(1, "field_a", Types.IntegerType.get()), + required(2, "field_b", Types.IntegerType.get())))); + Schema write = + new Schema( + required( + 0, + "nested", + Types.StructType.of( + required(2, "field_b", Types.IntegerType.get()), + required(1, "field_a", Types.IntegerType.get())))); List errors = CheckCompatibility.writeCompatibilityErrors(read, write); Assert.assertEquals("Should produce 1 error message", 1, errors.size()); - Assert.assertTrue("Should complain about field_b before field_a", + Assert.assertTrue( + "Should complain about field_b before field_a", errors.get(0).contains("field_b is out of order, before field_a")); } @Test public void testStructReadReordering() { // reads should allow reordering - Schema read = new Schema(required(0, "nested", Types.StructType.of( - required(1, "field_a", Types.IntegerType.get()), - required(2, "field_b", Types.IntegerType.get()) - ))); - Schema write = new Schema(required(0, "nested", Types.StructType.of( - required(2, "field_b", Types.IntegerType.get()), - required(1, "field_a", Types.IntegerType.get()) - ))); + Schema read = + new Schema( + required( + 0, + "nested", + Types.StructType.of( + required(1, "field_a", Types.IntegerType.get()), + required(2, "field_b", Types.IntegerType.get())))); + Schema write = + new Schema( + required( + 0, + "nested", + Types.StructType.of( + required(2, "field_b", Types.IntegerType.get()), + required(1, "field_a", Types.IntegerType.get())))); List errors = CheckCompatibility.readCompatibilityErrors(read, write); Assert.assertEquals("Should produce no error messages", 0, errors.size()); @@ -384,16 +478,19 @@ public void testStructReadReordering() { @Test public void testCaseInsensitiveSchemaProjection() { - Schema schema = new Schema( - Types.NestedField.required(0, "id", Types.LongType.get()), - Types.NestedField.optional(5, "locations", Types.MapType.ofOptional(6, 7, - Types.StringType.get(), - Types.StructType.of( - Types.NestedField.required(1, "lat", Types.FloatType.get()), - Types.NestedField.required(2, "long", Types.FloatType.get()) - ) - )) - ); + Schema schema = + new Schema( + Types.NestedField.required(0, "id", Types.LongType.get()), + Types.NestedField.optional( + 5, + "locations", + Types.MapType.ofOptional( + 6, + 7, + Types.StringType.get(), + Types.StructType.of( + Types.NestedField.required(1, "lat", Types.FloatType.get()), + Types.NestedField.required(2, "long", Types.FloatType.get()))))); Assert.assertNotNull(schema.caseInsensitiveSelect("ID").findField(0)); Assert.assertNotNull(schema.caseInsensitiveSelect("loCATIONs").findField(5)); @@ -412,12 +509,18 @@ public void testCheckNullabilityRequiredSchemaField() { @Test public void testCheckNullabilityRequiredStructField() { - Schema write = new Schema(required(0, "nested", Types.StructType.of( - optional(1, "from_field", Types.IntegerType.get()) - ))); - Schema read = new 
Schema(required(0, "nested", Types.StructType.of( - required(1, "to_field", Types.IntegerType.get()) - ))); + Schema write = + new Schema( + required( + 0, + "nested", + Types.StructType.of(optional(1, "from_field", Types.IntegerType.get())))); + Schema read = + new Schema( + required( + 0, + "nested", + Types.StructType.of(required(1, "to_field", Types.IntegerType.get())))); List errors = CheckCompatibility.typeCompatibilityErrors(read, write); Assert.assertEquals("Should produce no error messages", 0, errors.size()); diff --git a/api/src/test/java/org/apache/iceberg/types/TestSerializableTypes.java b/api/src/test/java/org/apache/iceberg/types/TestSerializableTypes.java index 8cd6a95b1520..ac6ca7edfdcf 100644 --- a/api/src/test/java/org/apache/iceberg/types/TestSerializableTypes.java +++ b/api/src/test/java/org/apache/iceberg/types/TestSerializableTypes.java @@ -16,141 +16,166 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.types; +import static org.apache.iceberg.types.Types.NestedField.optional; +import static org.apache.iceberg.types.Types.NestedField.required; + import org.apache.iceberg.Schema; import org.apache.iceberg.TestHelpers; import org.junit.Assert; import org.junit.Test; -import static org.apache.iceberg.types.Types.NestedField.optional; -import static org.apache.iceberg.types.Types.NestedField.required; - public class TestSerializableTypes { @Test public void testIdentityTypes() throws Exception { // these types make a strong guarantee than equality, instances are identical - Type[] identityPrimitives = new Type[] { - Types.BooleanType.get(), - Types.IntegerType.get(), - Types.LongType.get(), - Types.FloatType.get(), - Types.DoubleType.get(), - Types.DateType.get(), - Types.TimeType.get(), - Types.TimestampType.withoutZone(), - Types.TimestampType.withZone(), - Types.StringType.get(), - Types.UUIDType.get(), - Types.BinaryType.get() - }; + Type[] identityPrimitives = + new Type[] { + Types.BooleanType.get(), + Types.IntegerType.get(), + Types.LongType.get(), + Types.FloatType.get(), + Types.DoubleType.get(), + Types.DateType.get(), + Types.TimeType.get(), + Types.TimestampType.withoutZone(), + Types.TimestampType.withZone(), + Types.StringType.get(), + Types.UUIDType.get(), + Types.BinaryType.get() + }; for (Type type : identityPrimitives) { - Assert.assertSame("Serialization result should be identical to starting type", - type, TestHelpers.roundTripSerialize(type)); + Assert.assertSame( + "Serialization result should be identical to starting type", + type, + TestHelpers.roundTripSerialize(type)); } } @Test public void testEqualTypes() throws Exception { - Type[] equalityPrimitives = new Type[] { - Types.DecimalType.of(9, 3), - Types.DecimalType.of(11, 0), - Types.FixedType.ofLength(4), - Types.FixedType.ofLength(34) - }; + Type[] equalityPrimitives = + new Type[] { + Types.DecimalType.of(9, 3), + Types.DecimalType.of(11, 0), + Types.FixedType.ofLength(4), + Types.FixedType.ofLength(34) + }; for (Type type : equalityPrimitives) { - Assert.assertEquals("Serialization result should be equal to starting type", - type, TestHelpers.roundTripSerialize(type)); + Assert.assertEquals( + "Serialization result should be equal to starting type", + type, + TestHelpers.roundTripSerialize(type)); } } @Test public void testStructs() throws Exception { - Types.StructType struct = Types.StructType.of( - Types.NestedField.required(34, "Name!", Types.StringType.get()), - Types.NestedField.optional(35, "col", 
Types.DecimalType.of(38, 2))); + Types.StructType struct = + Types.StructType.of( + Types.NestedField.required(34, "Name!", Types.StringType.get()), + Types.NestedField.optional(35, "col", Types.DecimalType.of(38, 2))); Type copy = TestHelpers.roundTripSerialize(struct); Assert.assertEquals("Struct serialization should be equal to starting type", struct, copy); Type stringType = copy.asNestedType().asStructType().fieldType("Name!"); - Assert.assertSame("Struct serialization should preserve identity type", - Types.StringType.get(), stringType); + Assert.assertSame( + "Struct serialization should preserve identity type", Types.StringType.get(), stringType); Type decimalType = copy.asNestedType().asStructType().field(35).type(); - Assert.assertEquals("Struct serialization should support id lookup", - Types.DecimalType.of(38, 2), decimalType); + Assert.assertEquals( + "Struct serialization should support id lookup", Types.DecimalType.of(38, 2), decimalType); } @Test public void testMaps() throws Exception { - Type[] maps = new Type[] { - Types.MapType.ofOptional(1, 2, Types.StringType.get(), Types.LongType.get()), - Types.MapType.ofRequired(4, 5, Types.StringType.get(), Types.LongType.get()) - }; + Type[] maps = + new Type[] { + Types.MapType.ofOptional(1, 2, Types.StringType.get(), Types.LongType.get()), + Types.MapType.ofRequired(4, 5, Types.StringType.get(), Types.LongType.get()) + }; for (Type map : maps) { Type copy = TestHelpers.roundTripSerialize(map); Assert.assertEquals("Map serialization should be equal to starting type", map, copy); - Assert.assertSame("Map serialization should preserve identity type", - Types.LongType.get(), map.asNestedType().asMapType().valueType()); + Assert.assertSame( + "Map serialization should preserve identity type", + Types.LongType.get(), + map.asNestedType().asMapType().valueType()); } } @Test public void testLists() throws Exception { - Type[] maps = new Type[] { - Types.ListType.ofOptional(2, Types.DoubleType.get()), - Types.ListType.ofRequired(5, Types.DoubleType.get()) - }; + Type[] maps = + new Type[] { + Types.ListType.ofOptional(2, Types.DoubleType.get()), + Types.ListType.ofRequired(5, Types.DoubleType.get()) + }; for (Type list : maps) { Type copy = TestHelpers.roundTripSerialize(list); Assert.assertEquals("List serialization should be equal to starting type", list, copy); - Assert.assertSame("List serialization should preserve identity type", - Types.DoubleType.get(), list.asNestedType().asListType().elementType()); + Assert.assertSame( + "List serialization should preserve identity type", + Types.DoubleType.get(), + list.asNestedType().asListType().elementType()); } } @Test public void testSchema() throws Exception { - Schema schema = new Schema( - required(1, "id", Types.IntegerType.get()), - optional(2, "data", Types.StringType.get()), - optional(3, "preferences", Types.StructType.of( - required(8, "feature1", Types.BooleanType.get()), - optional(9, "feature2", Types.BooleanType.get()) - )), - required(4, "locations", Types.MapType.ofRequired(10, 11, - Types.StringType.get(), - Types.StructType.of( - required(12, "lat", Types.FloatType.get()), - required(13, "long", Types.FloatType.get()) - ))), - optional(5, "points", Types.ListType.ofOptional(14, - Types.StructType.of( - required(15, "x", Types.LongType.get()), - required(16, "y", Types.LongType.get()) - ))), - required(6, "doubles", Types.ListType.ofRequired(17, - Types.DoubleType.get() - )), - optional(7, "properties", Types.MapType.ofOptional(18, 19, - Types.StringType.get(), - 
Types.StringType.get() - )), - required(20, "complex_key_map", Types.MapType.ofOptional(21, 22, - Types.StructType.of( - required(23, "x", Types.LongType.get()), - optional(24, "y", Types.LongType.get())), - Types.StringType.get())) - ); - - Assert.assertEquals("Schema serialization should be equal to starting schema", - schema.asStruct(), TestHelpers.roundTripSerialize(schema).asStruct()); + Schema schema = + new Schema( + required(1, "id", Types.IntegerType.get()), + optional(2, "data", Types.StringType.get()), + optional( + 3, + "preferences", + Types.StructType.of( + required(8, "feature1", Types.BooleanType.get()), + optional(9, "feature2", Types.BooleanType.get()))), + required( + 4, + "locations", + Types.MapType.ofRequired( + 10, + 11, + Types.StringType.get(), + Types.StructType.of( + required(12, "lat", Types.FloatType.get()), + required(13, "long", Types.FloatType.get())))), + optional( + 5, + "points", + Types.ListType.ofOptional( + 14, + Types.StructType.of( + required(15, "x", Types.LongType.get()), + required(16, "y", Types.LongType.get())))), + required(6, "doubles", Types.ListType.ofRequired(17, Types.DoubleType.get())), + optional( + 7, + "properties", + Types.MapType.ofOptional(18, 19, Types.StringType.get(), Types.StringType.get())), + required( + 20, + "complex_key_map", + Types.MapType.ofOptional( + 21, + 22, + Types.StructType.of( + required(23, "x", Types.LongType.get()), + optional(24, "y", Types.LongType.get())), + Types.StringType.get()))); + + Assert.assertEquals( + "Schema serialization should be equal to starting schema", + schema.asStruct(), + TestHelpers.roundTripSerialize(schema).asStruct()); } - } diff --git a/api/src/test/java/org/apache/iceberg/types/TestTypeUtil.java b/api/src/test/java/org/apache/iceberg/types/TestTypeUtil.java index 5e64f8a712ce..69cde2929018 100644 --- a/api/src/test/java/org/apache/iceberg/types/TestTypeUtil.java +++ b/api/src/test/java/org/apache/iceberg/types/TestTypeUtil.java @@ -16,10 +16,11 @@ * specific language governing permissions and limitations * under the License. 
*/ - - package org.apache.iceberg.types; +import static org.apache.iceberg.types.Types.NestedField.optional; +import static org.apache.iceberg.types.Types.NestedField.required; + import java.util.Set; import org.apache.iceberg.AssertHelpers; import org.apache.iceberg.Schema; @@ -30,121 +31,133 @@ import org.junit.Assert; import org.junit.Test; -import static org.apache.iceberg.types.Types.NestedField.optional; -import static org.apache.iceberg.types.Types.NestedField.required; - - public class TestTypeUtil { @Test public void testReassignIdsDuplicateColumns() { - Schema schema = new Schema( - required(0, "a", Types.IntegerType.get()), - required(1, "A", Types.IntegerType.get()) - ); - Schema sourceSchema = new Schema( - required(1, "a", Types.IntegerType.get()), - required(2, "A", Types.IntegerType.get()) - ); + Schema schema = + new Schema( + required(0, "a", Types.IntegerType.get()), required(1, "A", Types.IntegerType.get())); + Schema sourceSchema = + new Schema( + required(1, "a", Types.IntegerType.get()), required(2, "A", Types.IntegerType.get())); final Schema actualSchema = TypeUtil.reassignIds(schema, sourceSchema); Assert.assertEquals(sourceSchema.asStruct(), actualSchema.asStruct()); } @Test public void testReassignIdsWithIdentifier() { - Schema schema = new Schema( - Lists.newArrayList( - required(0, "a", Types.IntegerType.get()), - required(1, "A", Types.IntegerType.get())), - Sets.newHashSet(0) - ); - Schema sourceSchema = new Schema( - Lists.newArrayList( - required(1, "a", Types.IntegerType.get()), - required(2, "A", Types.IntegerType.get())), - Sets.newHashSet(1) - ); + Schema schema = + new Schema( + Lists.newArrayList( + required(0, "a", Types.IntegerType.get()), + required(1, "A", Types.IntegerType.get())), + Sets.newHashSet(0)); + Schema sourceSchema = + new Schema( + Lists.newArrayList( + required(1, "a", Types.IntegerType.get()), + required(2, "A", Types.IntegerType.get())), + Sets.newHashSet(1)); final Schema actualSchema = TypeUtil.reassignIds(schema, sourceSchema); Assert.assertEquals(sourceSchema.asStruct(), actualSchema.asStruct()); - Assert.assertEquals("identifier field ID should change based on source schema", - sourceSchema.identifierFieldIds(), actualSchema.identifierFieldIds()); + Assert.assertEquals( + "identifier field ID should change based on source schema", + sourceSchema.identifierFieldIds(), + actualSchema.identifierFieldIds()); } @Test public void testAssignIncreasingFreshIdWithIdentifier() { - Schema schema = new Schema( - Lists.newArrayList( - required(10, "a", Types.IntegerType.get()), - required(11, "A", Types.IntegerType.get())), - Sets.newHashSet(10) - ); - Schema expectedSchema = new Schema( - Lists.newArrayList( - required(1, "a", Types.IntegerType.get()), - required(2, "A", Types.IntegerType.get())), - Sets.newHashSet(1) - ); + Schema schema = + new Schema( + Lists.newArrayList( + required(10, "a", Types.IntegerType.get()), + required(11, "A", Types.IntegerType.get())), + Sets.newHashSet(10)); + Schema expectedSchema = + new Schema( + Lists.newArrayList( + required(1, "a", Types.IntegerType.get()), + required(2, "A", Types.IntegerType.get())), + Sets.newHashSet(1)); final Schema actualSchema = TypeUtil.assignIncreasingFreshIds(schema); Assert.assertEquals(expectedSchema.asStruct(), actualSchema.asStruct()); - Assert.assertEquals("identifier field ID should change based on source schema", - expectedSchema.identifierFieldIds(), actualSchema.identifierFieldIds()); + Assert.assertEquals( + "identifier field ID should change based on source 
schema", + expectedSchema.identifierFieldIds(), + actualSchema.identifierFieldIds()); } @Test public void testAssignIncreasingFreshIdNewIdentifier() { - Schema schema = new Schema( - Lists.newArrayList( - required(10, "a", Types.IntegerType.get()), - required(11, "A", Types.IntegerType.get())), - Sets.newHashSet(10) - ); - Schema sourceSchema = new Schema( - Lists.newArrayList( - required(1, "a", Types.IntegerType.get()), - required(2, "A", Types.IntegerType.get())) - ); + Schema schema = + new Schema( + Lists.newArrayList( + required(10, "a", Types.IntegerType.get()), + required(11, "A", Types.IntegerType.get())), + Sets.newHashSet(10)); + Schema sourceSchema = + new Schema( + Lists.newArrayList( + required(1, "a", Types.IntegerType.get()), + required(2, "A", Types.IntegerType.get()))); final Schema actualSchema = TypeUtil.reassignIds(schema, sourceSchema); Assert.assertEquals(sourceSchema.asStruct(), actualSchema.asStruct()); - Assert.assertEquals("source schema missing identifier should not impact refreshing new identifier", - Sets.newHashSet(sourceSchema.findField("a").fieldId()), actualSchema.identifierFieldIds()); + Assert.assertEquals( + "source schema missing identifier should not impact refreshing new identifier", + Sets.newHashSet(sourceSchema.findField("a").fieldId()), + actualSchema.identifierFieldIds()); } @Test public void testProject() { - Schema schema = new Schema( - Lists.newArrayList( - required(10, "a", Types.IntegerType.get()), - required(11, "A", Types.IntegerType.get()), - required(12, "someStruct", Types.StructType.of( - required(13, "b", Types.IntegerType.get()), - required(14, "B", Types.IntegerType.get()), - required(15, "anotherStruct", Types.StructType.of( - required(16, "c", Types.IntegerType.get()), - required(17, "C", Types.IntegerType.get())) - ))))); - - Schema expectedTop = new Schema( - Lists.newArrayList( - required(11, "A", Types.IntegerType.get()))); + Schema schema = + new Schema( + Lists.newArrayList( + required(10, "a", Types.IntegerType.get()), + required(11, "A", Types.IntegerType.get()), + required( + 12, + "someStruct", + Types.StructType.of( + required(13, "b", Types.IntegerType.get()), + required(14, "B", Types.IntegerType.get()), + required( + 15, + "anotherStruct", + Types.StructType.of( + required(16, "c", Types.IntegerType.get()), + required(17, "C", Types.IntegerType.get()))))))); + + Schema expectedTop = new Schema(Lists.newArrayList(required(11, "A", Types.IntegerType.get()))); Schema actualTop = TypeUtil.project(schema, Sets.newHashSet(11)); Assert.assertEquals(expectedTop.asStruct(), actualTop.asStruct()); - Schema expectedDepthOne = new Schema( - Lists.newArrayList( - required(10, "a", Types.IntegerType.get()), - required(12, "someStruct", Types.StructType.of( - required(13, "b", Types.IntegerType.get()))))); + Schema expectedDepthOne = + new Schema( + Lists.newArrayList( + required(10, "a", Types.IntegerType.get()), + required( + 12, + "someStruct", + Types.StructType.of(required(13, "b", Types.IntegerType.get()))))); Schema actualDepthOne = TypeUtil.project(schema, Sets.newHashSet(10, 12, 13)); Assert.assertEquals(expectedDepthOne.asStruct(), actualDepthOne.asStruct()); - Schema expectedDepthTwo = new Schema( - Lists.newArrayList( - required(11, "A", Types.IntegerType.get()), - required(12, "someStruct", Types.StructType.of( - required(15, "anotherStruct", Types.StructType.of( - required(17, "C", Types.IntegerType.get())) - ))))); + Schema expectedDepthTwo = + new Schema( + Lists.newArrayList( + required(11, "A", 
Types.IntegerType.get()), + required( + 12, + "someStruct", + Types.StructType.of( + required( + 15, + "anotherStruct", + Types.StructType.of(required(17, "C", Types.IntegerType.get()))))))); Schema actualDepthTwo = TypeUtil.project(schema, Sets.newHashSet(11, 12, 15, 17)); Schema actualDepthTwoChildren = TypeUtil.project(schema, Sets.newHashSet(11, 17)); @@ -154,34 +167,46 @@ public void testProject() { @Test public void testProjectNaturallyEmpty() { - Schema schema = new Schema( - Lists.newArrayList( - required(12, "someStruct", Types.StructType.of( - required(15, "anotherStruct", Types.StructType.of( - required(20, "empty", Types.StructType.of()) - )))))); + Schema schema = + new Schema( + Lists.newArrayList( + required( + 12, + "someStruct", + Types.StructType.of( + required( + 15, + "anotherStruct", + Types.StructType.of(required(20, "empty", Types.StructType.of()))))))); - Schema expectedDepthOne = new Schema( - Lists.newArrayList( - required(12, "someStruct", Types.StructType.of()))); + Schema expectedDepthOne = + new Schema(Lists.newArrayList(required(12, "someStruct", Types.StructType.of()))); Schema actualDepthOne = TypeUtil.project(schema, Sets.newHashSet(12)); Assert.assertEquals(expectedDepthOne.asStruct(), actualDepthOne.asStruct()); - Schema expectedDepthTwo = new Schema( - Lists.newArrayList( - required(12, "someStruct", Types.StructType.of( - required(15, "anotherStruct", Types.StructType.of()))))); + Schema expectedDepthTwo = + new Schema( + Lists.newArrayList( + required( + 12, + "someStruct", + Types.StructType.of(required(15, "anotherStruct", Types.StructType.of()))))); Schema actualDepthTwo = TypeUtil.project(schema, Sets.newHashSet(12, 15)); Assert.assertEquals(expectedDepthTwo.asStruct(), actualDepthTwo.asStruct()); - Schema expectedDepthThree = new Schema( - Lists.newArrayList( - required(12, "someStruct", Types.StructType.of( - required(15, "anotherStruct", Types.StructType.of( - required(20, "empty", Types.StructType.of()) - )))))); + Schema expectedDepthThree = + new Schema( + Lists.newArrayList( + required( + 12, + "someStruct", + Types.StructType.of( + required( + 15, + "anotherStruct", + Types.StructType.of(required(20, "empty", Types.StructType.of()))))))); Schema actualDepthThree = TypeUtil.project(schema, Sets.newHashSet(12, 15, 20)); Schema actualDepthThreeChildren = TypeUtil.project(schema, Sets.newHashSet(20)); @@ -191,29 +216,37 @@ public void testProjectNaturallyEmpty() { @Test public void testProjectEmpty() { - Schema schema = new Schema( - Lists.newArrayList( - required(10, "a", Types.IntegerType.get()), - required(11, "A", Types.IntegerType.get()), - required(12, "someStruct", Types.StructType.of( - required(13, "b", Types.IntegerType.get()), - required(14, "B", Types.IntegerType.get()), - required(15, "anotherStruct", Types.StructType.of( - required(16, "c", Types.IntegerType.get()), - required(17, "C", Types.IntegerType.get())) - ))))); - - Schema expectedDepthOne = new Schema( - Lists.newArrayList( - required(12, "someStruct", Types.StructType.of()))); + Schema schema = + new Schema( + Lists.newArrayList( + required(10, "a", Types.IntegerType.get()), + required(11, "A", Types.IntegerType.get()), + required( + 12, + "someStruct", + Types.StructType.of( + required(13, "b", Types.IntegerType.get()), + required(14, "B", Types.IntegerType.get()), + required( + 15, + "anotherStruct", + Types.StructType.of( + required(16, "c", Types.IntegerType.get()), + required(17, "C", Types.IntegerType.get()))))))); + + Schema expectedDepthOne = + new 
Schema(Lists.newArrayList(required(12, "someStruct", Types.StructType.of()))); Schema actualDepthOne = TypeUtil.project(schema, Sets.newHashSet(12)); Assert.assertEquals(expectedDepthOne.asStruct(), actualDepthOne.asStruct()); - Schema expectedDepthTwo = new Schema( - Lists.newArrayList( - required(12, "someStruct", Types.StructType.of( - required(15, "anotherStruct", Types.StructType.of()))))); + Schema expectedDepthTwo = + new Schema( + Lists.newArrayList( + required( + 12, + "someStruct", + Types.StructType.of(required(15, "anotherStruct", Types.StructType.of()))))); Schema actualDepthTwo = TypeUtil.project(schema, Sets.newHashSet(12, 15)); Assert.assertEquals(expectedDepthTwo.asStruct(), actualDepthTwo.asStruct()); @@ -221,45 +254,61 @@ public void testProjectEmpty() { @Test public void testSelect() { - Schema schema = new Schema( - Lists.newArrayList( - required(10, "a", Types.IntegerType.get()), - required(11, "A", Types.IntegerType.get()), - required(12, "someStruct", Types.StructType.of( - required(13, "b", Types.IntegerType.get()), - required(14, "B", Types.IntegerType.get()), - required(15, "anotherStruct", Types.StructType.of( - required(16, "c", Types.IntegerType.get()), - required(17, "C", Types.IntegerType.get())) - ))))); - - Schema expectedTop = new Schema( - Lists.newArrayList( - required(11, "A", Types.IntegerType.get()))); + Schema schema = + new Schema( + Lists.newArrayList( + required(10, "a", Types.IntegerType.get()), + required(11, "A", Types.IntegerType.get()), + required( + 12, + "someStruct", + Types.StructType.of( + required(13, "b", Types.IntegerType.get()), + required(14, "B", Types.IntegerType.get()), + required( + 15, + "anotherStruct", + Types.StructType.of( + required(16, "c", Types.IntegerType.get()), + required(17, "C", Types.IntegerType.get()))))))); + + Schema expectedTop = new Schema(Lists.newArrayList(required(11, "A", Types.IntegerType.get()))); Schema actualTop = TypeUtil.select(schema, Sets.newHashSet(11)); Assert.assertEquals(expectedTop.asStruct(), actualTop.asStruct()); - Schema expectedDepthOne = new Schema( - Lists.newArrayList( - required(10, "a", Types.IntegerType.get()), - required(12, "someStruct", Types.StructType.of( - required(13, "b", Types.IntegerType.get()), - required(14, "B", Types.IntegerType.get()), - required(15, "anotherStruct", Types.StructType.of( - required(16, "c", Types.IntegerType.get()), - required(17, "C", Types.IntegerType.get()))))))); + Schema expectedDepthOne = + new Schema( + Lists.newArrayList( + required(10, "a", Types.IntegerType.get()), + required( + 12, + "someStruct", + Types.StructType.of( + required(13, "b", Types.IntegerType.get()), + required(14, "B", Types.IntegerType.get()), + required( + 15, + "anotherStruct", + Types.StructType.of( + required(16, "c", Types.IntegerType.get()), + required(17, "C", Types.IntegerType.get()))))))); Schema actualDepthOne = TypeUtil.select(schema, Sets.newHashSet(10, 12)); Assert.assertEquals(expectedDepthOne.asStruct(), actualDepthOne.asStruct()); - Schema expectedDepthTwo = new Schema( - Lists.newArrayList( - required(11, "A", Types.IntegerType.get()), - required(12, "someStruct", Types.StructType.of( - required(15, "anotherStruct", Types.StructType.of( - required(17, "C", Types.IntegerType.get())) - ))))); + Schema expectedDepthTwo = + new Schema( + Lists.newArrayList( + required(11, "A", Types.IntegerType.get()), + required( + 12, + "someStruct", + Types.StructType.of( + required( + 15, + "anotherStruct", + Types.StructType.of(required(17, "C", 
Types.IntegerType.get()))))))); Schema actualDepthTwo = TypeUtil.select(schema, Sets.newHashSet(11, 17)); Assert.assertEquals(expectedDepthTwo.asStruct(), actualDepthTwo.asStruct()); @@ -268,76 +317,112 @@ public void testSelect() { @Test public void testProjectMap() { // We can't partially project keys because it changes key equality - Schema schema = new Schema( - Lists.newArrayList( - required(10, "a", Types.IntegerType.get()), - required(11, "A", Types.IntegerType.get()), - required(12, "map", Types.MapType.ofRequired(13, 14, - Types.StructType.of( - optional(100, "x", Types.IntegerType.get()), - optional(101, "y", Types.IntegerType.get())), - Types.StructType.of( - required(200, "z", Types.IntegerType.get()), - optional(201, "innerMap", Types.MapType.ofOptional(202, 203, - Types.IntegerType.get(), + Schema schema = + new Schema( + Lists.newArrayList( + required(10, "a", Types.IntegerType.get()), + required(11, "A", Types.IntegerType.get()), + required( + 12, + "map", + Types.MapType.ofRequired( + 13, + 14, Types.StructType.of( - required(300, "foo", Types.IntegerType.get()), - required(301, "bar", Types.IntegerType.get()))))))))); - - Assert.assertThrows("Cannot project maps explicitly", IllegalArgumentException.class, + optional(100, "x", Types.IntegerType.get()), + optional(101, "y", Types.IntegerType.get())), + Types.StructType.of( + required(200, "z", Types.IntegerType.get()), + optional( + 201, + "innerMap", + Types.MapType.ofOptional( + 202, + 203, + Types.IntegerType.get(), + Types.StructType.of( + required(300, "foo", Types.IntegerType.get()), + required(301, "bar", Types.IntegerType.get()))))))))); + + Assert.assertThrows( + "Cannot project maps explicitly", + IllegalArgumentException.class, () -> TypeUtil.project(schema, Sets.newHashSet(12))); - Assert.assertThrows("Cannot project maps explicitly", IllegalArgumentException.class, + Assert.assertThrows( + "Cannot project maps explicitly", + IllegalArgumentException.class, () -> TypeUtil.project(schema, Sets.newHashSet(201))); - Schema expectedTopLevel = new Schema( - Lists.newArrayList(required(10, "a", Types.IntegerType.get()))); + Schema expectedTopLevel = + new Schema(Lists.newArrayList(required(10, "a", Types.IntegerType.get()))); Schema actualTopLevel = TypeUtil.project(schema, Sets.newHashSet(10)); Assert.assertEquals(expectedTopLevel.asStruct(), actualTopLevel.asStruct()); - Schema expectedDepthOne = new Schema( - Lists.newArrayList( - required(10, "a", Types.IntegerType.get()), - required(12, "map", Types.MapType.ofRequired(13, 14, - Types.StructType.of( - optional(100, "x", Types.IntegerType.get()), - optional(101, "y", Types.IntegerType.get())), - Types.StructType.of())))); + Schema expectedDepthOne = + new Schema( + Lists.newArrayList( + required(10, "a", Types.IntegerType.get()), + required( + 12, + "map", + Types.MapType.ofRequired( + 13, + 14, + Types.StructType.of( + optional(100, "x", Types.IntegerType.get()), + optional(101, "y", Types.IntegerType.get())), + Types.StructType.of())))); Schema actualDepthOne = TypeUtil.project(schema, Sets.newHashSet(10, 13, 14, 100, 101)); Schema actualDepthOneNoKeys = TypeUtil.project(schema, Sets.newHashSet(10, 13, 14)); Assert.assertEquals(expectedDepthOne.asStruct(), actualDepthOne.asStruct()); Assert.assertEquals(expectedDepthOne.asStruct(), actualDepthOneNoKeys.asStruct()); - Schema expectedDepthTwo = new Schema( + Schema expectedDepthTwo = + new Schema( Lists.newArrayList( required(10, "a", Types.IntegerType.get()), - required(12, "map", Types.MapType.ofRequired(13, 14, 
- Types.StructType.of( - optional(100, "x", Types.IntegerType.get()), - optional(101, "y", Types.IntegerType.get())), - Types.StructType.of( - required(200, "z", Types.IntegerType.get()), - optional(201, "innerMap", Types.MapType.ofOptional(202, 203, - Types.IntegerType.get(), - Types.StructType.of()))))))); - Schema actualDepthTwo = TypeUtil.project(schema, Sets.newHashSet(10, 13, 14, 100, 101, 200, 202, 203)); + required( + 12, + "map", + Types.MapType.ofRequired( + 13, + 14, + Types.StructType.of( + optional(100, "x", Types.IntegerType.get()), + optional(101, "y", Types.IntegerType.get())), + Types.StructType.of( + required(200, "z", Types.IntegerType.get()), + optional( + 201, + "innerMap", + Types.MapType.ofOptional( + 202, 203, Types.IntegerType.get(), Types.StructType.of()))))))); + Schema actualDepthTwo = + TypeUtil.project(schema, Sets.newHashSet(10, 13, 14, 100, 101, 200, 202, 203)); Assert.assertEquals(expectedDepthTwo.asStruct(), actualDepthTwo.asStruct()); } @Test public void testGetProjectedIds() { - Schema schema = new Schema( - Lists.newArrayList( - required(10, "a", Types.IntegerType.get()), - required(11, "A", Types.IntegerType.get()), - required(35, "emptyStruct", Types.StructType.of()), - required(12, "someStruct", Types.StructType.of( - required(13, "b", Types.IntegerType.get()), - required(14, "B", Types.IntegerType.get()), - required(15, "anotherStruct", Types.StructType.of( - required(16, "c", Types.IntegerType.get()), - required(17, "C", Types.IntegerType.get())) - ))))); + Schema schema = + new Schema( + Lists.newArrayList( + required(10, "a", Types.IntegerType.get()), + required(11, "A", Types.IntegerType.get()), + required(35, "emptyStruct", Types.StructType.of()), + required( + 12, + "someStruct", + Types.StructType.of( + required(13, "b", Types.IntegerType.get()), + required(14, "B", Types.IntegerType.get()), + required( + 15, + "anotherStruct", + Types.StructType.of( + required(16, "c", Types.IntegerType.get()), + required(17, "C", Types.IntegerType.get()))))))); Set expectedIds = Sets.newHashSet(10, 11, 35, 12, 13, 14, 15, 16, 17); Set actualIds = TypeUtil.getProjectedIds(schema); @@ -347,39 +432,51 @@ public void testGetProjectedIds() { @Test public void testProjectListNested() { - Schema schema = new Schema( - Lists.newArrayList( - required(12, "list", Types.ListType.ofRequired(13, - Types.ListType.ofRequired(14, - Types.MapType.ofRequired(15, 16, - IntegerType.get(), - Types.StructType.of( - required(17, "x", Types.IntegerType.get()), - required(18, "y", Types.IntegerType.get()) - ))))))); - - AssertHelpers.assertThrows("Cannot explicitly project List", + Schema schema = + new Schema( + Lists.newArrayList( + required( + 12, + "list", + Types.ListType.ofRequired( + 13, + Types.ListType.ofRequired( + 14, + Types.MapType.ofRequired( + 15, + 16, + IntegerType.get(), + Types.StructType.of( + required(17, "x", Types.IntegerType.get()), + required(18, "y", Types.IntegerType.get())))))))); + + AssertHelpers.assertThrows( + "Cannot explicitly project List", IllegalArgumentException.class, - () -> TypeUtil.project(schema, Sets.newHashSet(12)) - ); + () -> TypeUtil.project(schema, Sets.newHashSet(12))); - AssertHelpers.assertThrows("Cannot explicitly project List", + AssertHelpers.assertThrows( + "Cannot explicitly project List", IllegalArgumentException.class, - () -> TypeUtil.project(schema, Sets.newHashSet(13)) - ); + () -> TypeUtil.project(schema, Sets.newHashSet(13))); - AssertHelpers.assertThrows("Cannot explicitly project Map", + AssertHelpers.assertThrows( + 
"Cannot explicitly project Map", IllegalArgumentException.class, - () -> TypeUtil.project(schema, Sets.newHashSet(14)) - ); + () -> TypeUtil.project(schema, Sets.newHashSet(14))); - Schema expected = new Schema( - Lists.newArrayList( - required(12, "list", Types.ListType.ofRequired(13, - Types.ListType.ofRequired(14, - Types.MapType.ofRequired(15, 16, - IntegerType.get(), - Types.StructType.of())))))); + Schema expected = + new Schema( + Lists.newArrayList( + required( + 12, + "list", + Types.ListType.ofRequired( + 13, + Types.ListType.ofRequired( + 14, + Types.MapType.ofRequired( + 15, 16, IntegerType.get(), Types.StructType.of())))))); Schema actual = TypeUtil.project(schema, Sets.newHashSet(16)); Assert.assertEquals(expected.asStruct(), actual.asStruct()); @@ -387,42 +484,56 @@ public void testProjectListNested() { @Test public void testProjectMapNested() { - Schema schema = new Schema( - Lists.newArrayList( - required(12, "map", Types.MapType.ofRequired(13, 14, - Types.IntegerType.get(), - Types.MapType.ofRequired(15, 16, - Types.IntegerType.get(), - Types.ListType.ofRequired(17, - Types.StructType.of( - required(18, "x", Types.IntegerType.get()), - required(19, "y", Types.IntegerType.get()) - ))))))); - - - AssertHelpers.assertThrows("Cannot explicitly project Map", + Schema schema = + new Schema( + Lists.newArrayList( + required( + 12, + "map", + Types.MapType.ofRequired( + 13, + 14, + Types.IntegerType.get(), + Types.MapType.ofRequired( + 15, + 16, + Types.IntegerType.get(), + Types.ListType.ofRequired( + 17, + Types.StructType.of( + required(18, "x", Types.IntegerType.get()), + required(19, "y", Types.IntegerType.get())))))))); + + AssertHelpers.assertThrows( + "Cannot explicitly project Map", IllegalArgumentException.class, - () -> TypeUtil.project(schema, Sets.newHashSet(12)) - ); + () -> TypeUtil.project(schema, Sets.newHashSet(12))); - AssertHelpers.assertThrows("Cannot explicitly project Map", + AssertHelpers.assertThrows( + "Cannot explicitly project Map", IllegalArgumentException.class, - () -> TypeUtil.project(schema, Sets.newHashSet(14)) - ); + () -> TypeUtil.project(schema, Sets.newHashSet(14))); - AssertHelpers.assertThrows("Cannot explicitly project List", + AssertHelpers.assertThrows( + "Cannot explicitly project List", IllegalArgumentException.class, - () -> TypeUtil.project(schema, Sets.newHashSet(16)) - ); - - Schema expected = new Schema( - Lists.newArrayList( - required(12, "map", Types.MapType.ofRequired(13, 14, - Types.IntegerType.get(), - Types.MapType.ofRequired(15, 16, - Types.IntegerType.get(), - Types.ListType.ofRequired(17, - Types.StructType.of())))))); + () -> TypeUtil.project(schema, Sets.newHashSet(16))); + + Schema expected = + new Schema( + Lists.newArrayList( + required( + 12, + "map", + Types.MapType.ofRequired( + 13, + 14, + Types.IntegerType.get(), + Types.MapType.ofRequired( + 15, + 16, + Types.IntegerType.get(), + Types.ListType.ofRequired(17, Types.StructType.of())))))); Schema actual = TypeUtil.project(schema, Sets.newHashSet(17)); Assert.assertEquals(expected.asStruct(), actual.asStruct()); @@ -430,50 +541,52 @@ public void testProjectMapNested() { @Test public void testReassignIdsIllegalArgumentException() { - Schema schema = new Schema( - required(1, "a", Types.IntegerType.get()), - required(2, "b", Types.IntegerType.get()) - ); - Schema sourceSchema = new Schema( - required(1, "a", Types.IntegerType.get()) - ); + Schema schema = + new Schema( + required(1, "a", Types.IntegerType.get()), required(2, "b", Types.IntegerType.get())); + Schema 
sourceSchema = new Schema(required(1, "a", Types.IntegerType.get())); Assertions.assertThatThrownBy(() -> TypeUtil.reassignIds(schema, sourceSchema)) - .isInstanceOf(IllegalArgumentException.class) - .hasMessage("Field b not found in source schema"); + .isInstanceOf(IllegalArgumentException.class) + .hasMessage("Field b not found in source schema"); } @Test public void testValidateSchemaViaIndexByName() { - Types.NestedField nestedType = Types.NestedField - .required(1, "a", Types.StructType.of( - required(2, "b", Types.StructType.of( - required(3, "c", Types.BooleanType.get()) - )), - required(4, "b.c", Types.BooleanType.get()) - ) - ); + Types.NestedField nestedType = + Types.NestedField.required( + 1, + "a", + Types.StructType.of( + required(2, "b", Types.StructType.of(required(3, "c", Types.BooleanType.get()))), + required(4, "b.c", Types.BooleanType.get()))); Assertions.assertThatThrownBy(() -> TypeUtil.indexByName(Types.StructType.of(nestedType))) - .isInstanceOf(RuntimeException.class) - .hasMessageContaining("Invalid schema: multiple fields for name a.b.c"); + .isInstanceOf(RuntimeException.class) + .hasMessageContaining("Invalid schema: multiple fields for name a.b.c"); } @Test public void testSelectNot() { - Schema schema = new Schema( - Lists.newArrayList( - required(1, "id", Types.LongType.get()), - required(2, "location", Types.StructType.of( - required(3, "lat", Types.DoubleType.get()), - required(4, "long", Types.DoubleType.get()) - )))); - - Schema expectedNoPrimitive = new Schema( - Lists.newArrayList( - required(2, "location", Types.StructType.of( - required(3, "lat", Types.DoubleType.get()), - required(4, "long", Types.DoubleType.get()) - )))); + Schema schema = + new Schema( + Lists.newArrayList( + required(1, "id", Types.LongType.get()), + required( + 2, + "location", + Types.StructType.of( + required(3, "lat", Types.DoubleType.get()), + required(4, "long", Types.DoubleType.get()))))); + + Schema expectedNoPrimitive = + new Schema( + Lists.newArrayList( + required( + 2, + "location", + Types.StructType.of( + required(3, "lat", Types.DoubleType.get()), + required(4, "long", Types.DoubleType.get()))))); Schema actualNoPrimitve = TypeUtil.selectNot(schema, Sets.newHashSet(1)); Assert.assertEquals(expectedNoPrimitive.asStruct(), actualNoPrimitve.asStruct()); diff --git a/api/src/test/java/org/apache/iceberg/util/RandomUtil.java b/api/src/test/java/org/apache/iceberg/util/RandomUtil.java index f01db616c88a..a84dc4d8f8ce 100644 --- a/api/src/test/java/org/apache/iceberg/util/RandomUtil.java +++ b/api/src/test/java/org/apache/iceberg/util/RandomUtil.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.util; import java.math.BigDecimal; @@ -28,16 +27,14 @@ public class RandomUtil { - private RandomUtil() { - } + private RandomUtil() {} private static boolean negate(int num) { return num % 2 == 1; } @SuppressWarnings("RandomModInteger") - public static Object generatePrimitive(Type.PrimitiveType primitive, - Random random) { + public static Object generatePrimitive(Type.PrimitiveType primitive, Random random) { int choice = random.nextInt(20); switch (primitive.typeId()) { @@ -153,7 +150,8 @@ public static Object generatePrimitive(Type.PrimitiveType primitive, } } - public static Object generateDictionaryEncodablePrimitive(Type.PrimitiveType primitive, Random random) { + public static Object generateDictionaryEncodablePrimitive( + Type.PrimitiveType primitive, Random random) { int value = random.nextInt(3); switch (primitive.typeId()) { case BOOLEAN: diff --git a/api/src/test/java/org/apache/iceberg/util/TestCharSequenceSet.java b/api/src/test/java/org/apache/iceberg/util/TestCharSequenceSet.java index d208de20db0e..95bc77dd8bed 100644 --- a/api/src/test/java/org/apache/iceberg/util/TestCharSequenceSet.java +++ b/api/src/test/java/org/apache/iceberg/util/TestCharSequenceSet.java @@ -16,21 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - -/* - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - package org.apache.iceberg.util; import java.util.Arrays; diff --git a/api/src/test/java/org/apache/iceberg/util/TestExceptionUtil.java b/api/src/test/java/org/apache/iceberg/util/TestExceptionUtil.java index b02717dc33d4..1fc6de18b566 100644 --- a/api/src/test/java/org/apache/iceberg/util/TestExceptionUtil.java +++ b/api/src/test/java/org/apache/iceberg/util/TestExceptionUtil.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.util; import java.io.IOException; @@ -39,14 +38,17 @@ private CustomCheckedException(String message) { public void testRunSafely() { CustomCheckedException exc = new CustomCheckedException("test"); try { - ExceptionUtil.runSafely(() -> { + ExceptionUtil.runSafely( + () -> { throw exc; - }, e -> { + }, + e -> { throw new Exception("test catch suppression"); - }, () -> { + }, + () -> { throw new RuntimeException("test finally suppression"); - }, CustomCheckedException.class - ); + }, + CustomCheckedException.class); Assert.fail("Should have thrown CustomCheckedException"); @@ -57,12 +59,20 @@ public void testRunSafely() { Assert.assertEquals("Should have 2 suppressed exceptions", 2, e.getSuppressed().length); Throwable throwSuppressed = e.getSuppressed()[0]; - Assertions.assertThat(throwSuppressed).as("Should be an Exception").isInstanceOf(Exception.class); - Assert.assertEquals("Should have correct message", "test catch suppression", throwSuppressed.getMessage()); + Assertions.assertThat(throwSuppressed) + .as("Should be an Exception") + .isInstanceOf(Exception.class); + Assert.assertEquals( + "Should have correct message", "test catch suppression", throwSuppressed.getMessage()); Throwable finallySuppressed = e.getSuppressed()[1]; - Assertions.assertThat(finallySuppressed).as("Should be a RuntimeException").isInstanceOf(RuntimeException.class); - Assert.assertEquals("Should have correct message", "test finally suppression", finallySuppressed.getMessage()); + Assertions.assertThat(finallySuppressed) + .as("Should be a RuntimeException") + .isInstanceOf(RuntimeException.class); + Assert.assertEquals( + "Should have correct message", + "test finally suppression", + finallySuppressed.getMessage()); } } @@ -70,14 +80,19 @@ public void testRunSafely() { public void testRunSafelyTwoExceptions() { CustomCheckedException exc = new CustomCheckedException("test"); try { - ExceptionUtil.runSafely((ExceptionUtil.Block) () -> { - throw exc; - }, e -> { + ExceptionUtil.runSafely( + (ExceptionUtil.Block) + () -> { + throw exc; + }, + e -> { throw new Exception("test catch suppression"); - }, () -> { + }, + () -> { throw new RuntimeException("test finally suppression"); - }, CustomCheckedException.class, IOException.class - ); + }, + CustomCheckedException.class, + IOException.class); Assert.fail("Should have thrown CustomCheckedException"); @@ -91,12 +106,20 @@ public void testRunSafelyTwoExceptions() { Assert.assertEquals("Should have 2 suppressed exceptions", 2, e.getSuppressed().length); Throwable throwSuppressed = e.getSuppressed()[0]; - Assertions.assertThat(throwSuppressed).as("Should be an Exception").isInstanceOf(Exception.class); - Assert.assertEquals("Should have correct message", "test catch suppression", throwSuppressed.getMessage()); + Assertions.assertThat(throwSuppressed) + .as("Should be an Exception") + .isInstanceOf(Exception.class); + Assert.assertEquals( + "Should have correct message", "test catch suppression", throwSuppressed.getMessage()); Throwable finallySuppressed = e.getSuppressed()[1]; - Assertions.assertThat(finallySuppressed).as("Should be a RuntimeException").isInstanceOf(RuntimeException.class); - Assert.assertEquals("Should have correct message", "test finally suppression", finallySuppressed.getMessage()); + Assertions.assertThat(finallySuppressed) + .as("Should be a RuntimeException") + .isInstanceOf(RuntimeException.class); + Assert.assertEquals( + "Should have correct message", + "test finally suppression", + 
finallySuppressed.getMessage()); } } @@ -104,15 +127,20 @@ public void testRunSafelyTwoExceptions() { public void testRunSafelyThreeExceptions() { CustomCheckedException exc = new CustomCheckedException("test"); try { - ExceptionUtil.runSafely((ExceptionUtil.Block) - () -> { - throw exc; - }, e -> { + ExceptionUtil.runSafely( + (ExceptionUtil.Block) + () -> { + throw exc; + }, + e -> { throw new Exception("test catch suppression"); - }, () -> { + }, + () -> { throw new RuntimeException("test finally suppression"); - }, CustomCheckedException.class, IOException.class, ClassNotFoundException.class - ); + }, + CustomCheckedException.class, + IOException.class, + ClassNotFoundException.class); Assert.fail("Should have thrown CustomCheckedException"); @@ -126,12 +154,20 @@ public void testRunSafelyThreeExceptions() { Assert.assertEquals("Should have 2 suppressed exceptions", 2, e.getSuppressed().length); Throwable throwSuppressed = e.getSuppressed()[0]; - Assertions.assertThat(throwSuppressed).as("Should be an Exception").isInstanceOf(Exception.class); - Assert.assertEquals("Should have correct message", "test catch suppression", throwSuppressed.getMessage()); + Assertions.assertThat(throwSuppressed) + .as("Should be an Exception") + .isInstanceOf(Exception.class); + Assert.assertEquals( + "Should have correct message", "test catch suppression", throwSuppressed.getMessage()); Throwable finallySuppressed = e.getSuppressed()[1]; - Assertions.assertThat(finallySuppressed).as("Should be a RuntimeException").isInstanceOf(RuntimeException.class); - Assert.assertEquals("Should have correct message", "test finally suppression", finallySuppressed.getMessage()); + Assertions.assertThat(finallySuppressed) + .as("Should be a RuntimeException") + .isInstanceOf(RuntimeException.class); + Assert.assertEquals( + "Should have correct message", + "test finally suppression", + finallySuppressed.getMessage()); } } @@ -139,14 +175,16 @@ public void testRunSafelyThreeExceptions() { public void testRunSafelyRuntimeExceptions() { RuntimeException exc = new RuntimeException("test"); try { - ExceptionUtil.runSafely(() -> { + ExceptionUtil.runSafely( + () -> { throw exc; - }, e -> { + }, + e -> { throw new Exception("test catch suppression"); - }, () -> { + }, + () -> { throw new CustomCheckedException("test finally suppression"); - } - ); + }); Assert.fail("Should have thrown RuntimeException"); @@ -157,14 +195,20 @@ public void testRunSafelyRuntimeExceptions() { Assert.assertEquals("Should have 2 suppressed exceptions", 2, e.getSuppressed().length); Throwable throwSuppressed = e.getSuppressed()[0]; - Assertions.assertThat(throwSuppressed).as("Should be an Exception").isInstanceOf(Exception.class); - Assert.assertEquals("Should have correct message", "test catch suppression", throwSuppressed.getMessage()); + Assertions.assertThat(throwSuppressed) + .as("Should be an Exception") + .isInstanceOf(Exception.class); + Assert.assertEquals( + "Should have correct message", "test catch suppression", throwSuppressed.getMessage()); Throwable finallySuppressed = e.getSuppressed()[1]; - Assertions.assertThat(finallySuppressed).as("Should be a CustomCheckedException") + Assertions.assertThat(finallySuppressed) + .as("Should be a CustomCheckedException") .isInstanceOf(CustomCheckedException.class); - Assert.assertEquals("Should have correct message", "test finally suppression", finallySuppressed.getMessage()); + Assert.assertEquals( + "Should have correct message", + "test finally suppression", + finallySuppressed.getMessage()); } } 
- } diff --git a/arrow/src/main/java/org/apache/iceberg/arrow/ArrowAllocation.java b/arrow/src/main/java/org/apache/iceberg/arrow/ArrowAllocation.java index 49882ce90690..342621bb587c 100644 --- a/arrow/src/main/java/org/apache/iceberg/arrow/ArrowAllocation.java +++ b/arrow/src/main/java/org/apache/iceberg/arrow/ArrowAllocation.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.arrow; import org.apache.arrow.memory.RootAllocator; @@ -28,8 +27,7 @@ public class ArrowAllocation { private static final RootAllocator ROOT_ALLOCATOR; - private ArrowAllocation() { - } + private ArrowAllocation() {} public static RootAllocator rootAllocator() { return ROOT_ALLOCATOR; diff --git a/arrow/src/main/java/org/apache/iceberg/arrow/ArrowSchemaUtil.java b/arrow/src/main/java/org/apache/iceberg/arrow/ArrowSchemaUtil.java index fe7ecc9e1759..57d503031181 100644 --- a/arrow/src/main/java/org/apache/iceberg/arrow/ArrowSchemaUtil.java +++ b/arrow/src/main/java/org/apache/iceberg/arrow/ArrowSchemaUtil.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.arrow; import java.util.List; @@ -37,13 +36,11 @@ import org.apache.iceberg.types.Types.NestedField; import org.apache.iceberg.types.Types.StructType; - public class ArrowSchemaUtil { private static final String ORIGINAL_TYPE = "originalType"; private static final String MAP_TYPE = "mapType"; - private ArrowSchemaUtil() { - } + private ArrowSchemaUtil() {} /** * Convert Iceberg schema to Arrow Schema. @@ -102,8 +99,10 @@ public static Field convert(final NestedField field) { arrowType = new ArrowType.FixedSizeBinary(16); break; case TIMESTAMP: - arrowType = new ArrowType.Timestamp(TimeUnit.MICROSECOND, - ((Types.TimestampType) field.type()).shouldAdjustToUTC() ? "UTC" : null); + arrowType = + new ArrowType.Timestamp( + TimeUnit.MICROSECOND, + ((Types.TimestampType) field.type()).shouldAdjustToUTC() ? "UTC" : null); break; case DATE: arrowType = new ArrowType.Date(DateUnit.DAY); @@ -129,14 +128,15 @@ public static Field convert(final NestedField field) { final MapType mapType = field.type().asMapType(); arrowType = new ArrowType.Map(false); List entryFields = Lists.transform(mapType.fields(), ArrowSchemaUtil::convert); - Field entry = new Field("", - new FieldType(field.isOptional(), arrowType, null), entryFields); + Field entry = + new Field("", new FieldType(field.isOptional(), arrowType, null), entryFields); children.add(entry); break; default: throw new UnsupportedOperationException("Unsupported field type: " + field); } - return new Field(field.name(), new FieldType(field.isOptional(), arrowType, null, metadata), children); + return new Field( + field.name(), new FieldType(field.isOptional(), arrowType, null, metadata), children); } } diff --git a/arrow/src/main/java/org/apache/iceberg/arrow/vectorized/ArrowBatchReader.java b/arrow/src/main/java/org/apache/iceberg/arrow/vectorized/ArrowBatchReader.java index 9cb13eb6f1e2..51edf742fc71 100644 --- a/arrow/src/main/java/org/apache/iceberg/arrow/vectorized/ArrowBatchReader.java +++ b/arrow/src/main/java/org/apache/iceberg/arrow/vectorized/ArrowBatchReader.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.arrow.vectorized; import java.util.List; @@ -24,8 +23,8 @@ import org.apache.iceberg.relocated.com.google.common.base.Preconditions; /** - * A collection of vectorized readers per column (in the expected read schema) and Arrow Vector holders. This class owns - * the Arrow vectors and is responsible for closing the Arrow vectors. + * A collection of vectorized readers per column (in the expected read schema) and Arrow Vector + * holders. This class owns the Arrow vectors and is responsible for closing the Arrow vectors. */ class ArrowBatchReader extends BaseBatchReader { @@ -35,7 +34,8 @@ class ArrowBatchReader extends BaseBatchReader { @Override public final ColumnarBatch read(ColumnarBatch reuse, int numRowsToRead) { - Preconditions.checkArgument(numRowsToRead > 0, "Invalid number of rows to read: %s", numRowsToRead); + Preconditions.checkArgument( + numRowsToRead > 0, "Invalid number of rows to read: %s", numRowsToRead); if (reuse == null) { closeVectors(); @@ -47,7 +47,8 @@ public final ColumnarBatch read(ColumnarBatch reuse, int numRowsToRead) { int numRowsInVector = vectorHolders[i].numValues(); Preconditions.checkState( numRowsInVector == numRowsToRead, - "Number of rows in the vector %s didn't match expected %s ", numRowsInVector, + "Number of rows in the vector %s didn't match expected %s ", + numRowsInVector, numRowsToRead); // Handle null vector for constant case columnVectors[i] = new ColumnVector(vectorHolders[i]); diff --git a/arrow/src/main/java/org/apache/iceberg/arrow/vectorized/ArrowReader.java b/arrow/src/main/java/org/apache/iceberg/arrow/vectorized/ArrowReader.java index 4b87302a55a7..09dbf8bfccfc 100644 --- a/arrow/src/main/java/org/apache/iceberg/arrow/vectorized/ArrowReader.java +++ b/arrow/src/main/java/org/apache/iceberg/arrow/vectorized/ArrowReader.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.arrow.vectorized; import java.io.IOException; @@ -63,58 +62,58 @@ import org.slf4j.LoggerFactory; /** - * Vectorized reader that returns an iterator of {@link ColumnarBatch}. - * See {@link #open(CloseableIterable)} ()} to learn about the - * behavior of the iterator. + * Vectorized reader that returns an iterator of {@link ColumnarBatch}. See {@link + * #open(CloseableIterable)} ()} to learn about the behavior of the iterator. * *

  * <p>The following Iceberg data types are supported and have been tested:
+ *
  * <ul>
- *   <li>Iceberg: {@link Types.BooleanType}, Arrow: {@link MinorType#BIT}</li>
- *   <li>Iceberg: {@link Types.IntegerType}, Arrow: {@link MinorType#INT}</li>
- *   <li>Iceberg: {@link Types.LongType}, Arrow: {@link MinorType#BIGINT}</li>
- *   <li>Iceberg: {@link Types.FloatType}, Arrow: {@link MinorType#FLOAT4}</li>
- *   <li>Iceberg: {@link Types.DoubleType}, Arrow: {@link MinorType#FLOAT8}</li>
- *   <li>Iceberg: {@link Types.StringType}, Arrow: {@link MinorType#VARCHAR}</li>
- *   <li>Iceberg: {@link Types.TimestampType} (both with and without timezone),
- *   Arrow: {@link MinorType#TIMEMICRO}</li>
- *   <li>Iceberg: {@link Types.BinaryType}, Arrow: {@link MinorType#VARBINARY}</li>
- *   <li>Iceberg: {@link Types.DateType}, Arrow: {@link MinorType#DATEDAY}</li>
- *   <li>Iceberg: {@link Types.TimeType}, Arrow: {@link MinorType#TIMEMICRO}</li>
- *   <li>Iceberg: {@link Types.UUIDType}, Arrow: {@link MinorType#FIXEDSIZEBINARY}(16)</li>
+ *   <li>Iceberg: {@link Types.BooleanType}, Arrow: {@link MinorType#BIT}
+ *   <li>Iceberg: {@link Types.IntegerType}, Arrow: {@link MinorType#INT}
+ *   <li>Iceberg: {@link Types.LongType}, Arrow: {@link MinorType#BIGINT}
+ *   <li>Iceberg: {@link Types.FloatType}, Arrow: {@link MinorType#FLOAT4}
+ *   <li>Iceberg: {@link Types.DoubleType}, Arrow: {@link MinorType#FLOAT8}
+ *   <li>Iceberg: {@link Types.StringType}, Arrow: {@link MinorType#VARCHAR}
+ *   <li>Iceberg: {@link Types.TimestampType} (both with and without timezone), Arrow: {@link
+ *       MinorType#TIMEMICRO}
+ *   <li>Iceberg: {@link Types.BinaryType}, Arrow: {@link MinorType#VARBINARY}
+ *   <li>Iceberg: {@link Types.DateType}, Arrow: {@link MinorType#DATEDAY}
+ *   <li>Iceberg: {@link Types.TimeType}, Arrow: {@link MinorType#TIMEMICRO}
+ *   <li>Iceberg: {@link Types.UUIDType}, Arrow: {@link MinorType#FIXEDSIZEBINARY}(16)
  * </ul>
  *
  * <p>Features that don't work in this implementation:
+ *
  * <ul>
- *   <li>Type promotion: In case of type promotion, the Arrow vector corresponding to
- *   the data type in the parquet file is returned instead of the data type in the latest schema.
- *   See https://github.com/apache/iceberg/issues/2483.</li>
- *   <li>Columns with constant values are physically encoded as a dictionary. The Arrow vector
- *   type is int32 instead of the type as per the schema.
- *   See https://github.com/apache/iceberg/issues/2484.</li>
- *   <li>Data types: {@link Types.ListType}, {@link Types.MapType},
- *   {@link Types.StructType}, {@link Types.FixedType} and
- *   {@link Types.DecimalType}
- *   See https://github.com/apache/iceberg/issues/2485 and https://github.com/apache/iceberg/issues/2486.</li>
- *   <li>Delete files are not supported.
- *   See https://github.com/apache/iceberg/issues/2487.</li>
+ *   <li>Type promotion: In case of type promotion, the Arrow vector corresponding to the data type
+ *       in the parquet file is returned instead of the data type in the latest schema. See
+ *       https://github.com/apache/iceberg/issues/2483.
+ *   <li>Columns with constant values are physically encoded as a dictionary. The Arrow vector type
+ *       is int32 instead of the type as per the schema. See
+ *       https://github.com/apache/iceberg/issues/2484.
+ *   <li>Data types: {@link Types.ListType}, {@link Types.MapType}, {@link Types.StructType}, {@link
+ *       Types.FixedType} and {@link Types.DecimalType} See
+ *       https://github.com/apache/iceberg/issues/2485 and
+ *       https://github.com/apache/iceberg/issues/2486.
+ *   <li>Delete files are not supported. See https://github.com/apache/iceberg/issues/2487.
  * </ul>
*/ public class ArrowReader extends CloseableGroup { private static final Logger LOG = LoggerFactory.getLogger(ArrowReader.class); - private static final Set SUPPORTED_TYPES = ImmutableSet.of( - TypeID.BOOLEAN, - TypeID.INTEGER, - TypeID.LONG, - TypeID.FLOAT, - TypeID.DOUBLE, - TypeID.STRING, - TypeID.TIMESTAMP, - TypeID.BINARY, - TypeID.DATE, - TypeID.UUID, - TypeID.TIME - ); + private static final Set SUPPORTED_TYPES = + ImmutableSet.of( + TypeID.BOOLEAN, + TypeID.INTEGER, + TypeID.LONG, + TypeID.FLOAT, + TypeID.DOUBLE, + TypeID.STRING, + TypeID.TIMESTAMP, + TypeID.BINARY, + TypeID.DATE, + TypeID.UUID, + TypeID.TIME); private final Schema schema; private final FileIO io; @@ -127,16 +126,13 @@ public class ArrowReader extends CloseableGroup { * * @param scan the table scan object. * @param batchSize the maximum number of rows per Arrow batch. - * @param reuseContainers whether to reuse Arrow vectors when iterating through the data. - * If set to {@code false}, every {@link Iterator#next()} call creates - * new instances of Arrow vectors. - * If set to {@code true}, the Arrow vectors in the previous - * {@link Iterator#next()} may be reused for the data returned - * in the current {@link Iterator#next()}. - * This option avoids allocating memory again and again. - * Irrespective of the value of {@code reuseContainers}, the Arrow vectors - * in the previous {@link Iterator#next()} call are closed before creating - * new instances if the current {@link Iterator#next()}. + * @param reuseContainers whether to reuse Arrow vectors when iterating through the data. If set + * to {@code false}, every {@link Iterator#next()} call creates new instances of Arrow + * vectors. If set to {@code true}, the Arrow vectors in the previous {@link Iterator#next()} + * may be reused for the data returned in the current {@link Iterator#next()}. This option + * avoids allocating memory again and again. Irrespective of the value of {@code + * reuseContainers}, the Arrow vectors in the previous {@link Iterator#next()} call are closed + * before creating new instances if the current {@link Iterator#next()}. */ public ArrowReader(TableScan scan, int batchSize, boolean reuseContainers) { this.schema = scan.schema(); @@ -149,39 +145,34 @@ public ArrowReader(TableScan scan, int batchSize, boolean reuseContainers) { /** * Returns a new iterator of {@link ColumnarBatch} objects. - *

-   * Note that the reader owns the {@link ColumnarBatch} objects and takes care of closing them.
+   *
+   * <p>Note that the reader owns the {@link ColumnarBatch} objects and takes care of closing them.
    * The caller should not hold onto a {@link ColumnarBatch} or try to close them.
    *
-   * <p>If {@code reuseContainers} is {@code false}, the Arrow vectors in the
-   * previous {@link ColumnarBatch} are closed before returning the next {@link ColumnarBatch} object.
-   * This implies that the caller should either use the {@link ColumnarBatch} or transfer the ownership of
-   * {@link ColumnarBatch} before getting the next {@link ColumnarBatch}.
+   * <p>If {@code reuseContainers} is {@code false}, the Arrow vectors in the previous {@link
+   * ColumnarBatch} are closed before returning the next {@link ColumnarBatch} object. This implies
+   * that the caller should either use the {@link ColumnarBatch} or transfer the ownership of {@link
+   * ColumnarBatch} before getting the next {@link ColumnarBatch}.
+   *
+   * <p>If {@code reuseContainers} is {@code true}, the Arrow vectors in the previous {@link
+   * ColumnarBatch} may be reused for the next {@link ColumnarBatch}. This implies that the caller
+   * should either use the {@link ColumnarBatch} or deep copy the {@link ColumnarBatch} before
+   * getting the next {@link ColumnarBatch}.
+   *
+   * <p>This method works for only when the following conditions are true:
    *
-   * <p>If {@code reuseContainers} is {@code true}, the Arrow vectors in the
-   * previous {@link ColumnarBatch} may be reused for the next {@link ColumnarBatch}.
-   * This implies that the caller should either use the {@link ColumnarBatch} or deep copy the
-   * {@link ColumnarBatch} before getting the next {@link ColumnarBatch}.
-   * <p>
-   * This method works for only when the following conditions are true:
    * <ol>
-   *   <li>At least one column is queried,</li>
-   *   <li>There are no delete files, and</li>
-   *   <li>Supported data types are queried (see {@link #SUPPORTED_TYPES}).</li>
+   *   <li>At least one column is queried,
+   *   <li>There are no delete files, and
+   *   <li>Supported data types are queried (see {@link #SUPPORTED_TYPES}).
    * </ol>
+ * * When any of these conditions fail, an {@link UnsupportedOperationException} is thrown. */ public CloseableIterator open(CloseableIterable tasks) { - CloseableIterator itr = new VectorizedCombinedScanIterator( - tasks, - schema, - null, - io, - encryption, - true, - batchSize, - reuseContainers - ); + CloseableIterator itr = + new VectorizedCombinedScanIterator( + tasks, schema, null, io, encryption, true, batchSize, reuseContainers); addCloseable(itr); return itr; } @@ -192,10 +183,11 @@ public void close() throws IOException { } /** - * Reads the data file and returns an iterator of {@link VectorSchemaRoot}. - * Only Parquet data file format is supported. + * Reads the data file and returns an iterator of {@link VectorSchemaRoot}. Only Parquet data file + * format is supported. */ - private static final class VectorizedCombinedScanIterator implements CloseableIterator { + private static final class VectorizedCombinedScanIterator + implements CloseableIterator { private final Iterator fileItr; private final Map inputFiles; @@ -210,24 +202,22 @@ private static final class VectorizedCombinedScanIterator implements CloseableIt /** * Create a new instance. * - * @param tasks Combined file scan tasks. - * @param expectedSchema Read schema. The returned data will have this schema. - * @param nameMapping Mapping from external schema names to Iceberg type IDs. - * @param io File I/O. + * @param tasks Combined file scan tasks. + * @param expectedSchema Read schema. The returned data will have this schema. + * @param nameMapping Mapping from external schema names to Iceberg type IDs. + * @param io File I/O. * @param encryptionManager Encryption manager. - * @param caseSensitive If {@code true}, column names are case sensitive. - * If {@code false}, column names are not case sensitive. - * @param batchSize Batch size in number of rows. Each Arrow batch contains - * a maximum of {@code batchSize} rows. - * @param reuseContainers If set to {@code false}, every {@link Iterator#next()} call creates - * new instances of Arrow vectors. - * If set to {@code true}, the Arrow vectors in the previous - * {@link Iterator#next()} may be reused for the data returned - * in the current {@link Iterator#next()}. - * This option avoids allocating memory again and again. - * Irrespective of the value of {@code reuseContainers}, the Arrow vectors - * in the previous {@link Iterator#next()} call are closed before creating - * new instances if the current {@link Iterator#next()}. + * @param caseSensitive If {@code true}, column names are case sensitive. If {@code false}, + * column names are not case sensitive. + * @param batchSize Batch size in number of rows. Each Arrow batch contains a maximum of {@code + * batchSize} rows. + * @param reuseContainers If set to {@code false}, every {@link Iterator#next()} call creates + * new instances of Arrow vectors. If set to {@code true}, the Arrow vectors in the previous + * {@link Iterator#next()} may be reused for the data returned in the current {@link + * Iterator#next()}. This option avoids allocating memory again and again. Irrespective of + * the value of {@code reuseContainers}, the Arrow vectors in the previous {@link + * Iterator#next()} call are closed before creating new instances if the current {@link + * Iterator#next()}. 
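A minimal usage sketch of the reader documented above (an editorial illustration, not part of the patch): it assumes an Iceberg Table obtained from a catalog elsewhere, and the class name, the projected column "id", and the batch size of 1024 are illustrative.

import org.apache.iceberg.CombinedScanTask;
import org.apache.iceberg.Table;
import org.apache.iceberg.TableScan;
import org.apache.iceberg.arrow.vectorized.ArrowReader;
import org.apache.iceberg.arrow.vectorized.ColumnarBatch;
import org.apache.iceberg.io.CloseableIterable;
import org.apache.iceberg.io.CloseableIterator;

class ArrowReaderSketch {
  // Counts rows by iterating Arrow batches; the reader owns the batches and closes them.
  static long countRows(Table table) throws Exception {
    TableScan scan = table.newScan().select("id"); // at least one column must be projected
    try (ArrowReader reader = new ArrowReader(scan, 1024, true); // reuseContainers = true
        CloseableIterable<CombinedScanTask> tasks = scan.planTasks();
        CloseableIterator<ColumnarBatch> batches = reader.open(tasks)) {
      long rows = 0;
      while (batches.hasNext()) {
        rows += batches.next().numRows(); // do not hold onto or close the batch yourself
      }
      return rows;
    }
  }
}

Because reuseContainers is true here, each ColumnarBatch may be overwritten by the following next() call, which is fine for a streaming aggregation such as this row count.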
*/ VectorizedCombinedScanIterator( CloseableIterable tasks, @@ -238,25 +228,32 @@ private static final class VectorizedCombinedScanIterator implements CloseableIt boolean caseSensitive, int batchSize, boolean reuseContainers) { - List fileTasks = StreamSupport.stream(tasks.spliterator(), false) - .map(CombinedScanTask::files) - .flatMap(Collection::stream) - .collect(Collectors.toList()); + List fileTasks = + StreamSupport.stream(tasks.spliterator(), false) + .map(CombinedScanTask::files) + .flatMap(Collection::stream) + .collect(Collectors.toList()); this.fileItr = fileTasks.iterator(); if (fileTasks.stream().anyMatch(TableScanUtil::hasDeletes)) { - throw new UnsupportedOperationException("Cannot read files that require applying delete files"); + throw new UnsupportedOperationException( + "Cannot read files that require applying delete files"); } if (expectedSchema.columns().isEmpty()) { - throw new UnsupportedOperationException("Cannot read without at least one projected column"); + throw new UnsupportedOperationException( + "Cannot read without at least one projected column"); } - Set unsupportedTypes = Sets.difference( - expectedSchema.columns().stream().map(c -> c.type().typeId()).collect(Collectors.toSet()), - SUPPORTED_TYPES); + Set unsupportedTypes = + Sets.difference( + expectedSchema.columns().stream() + .map(c -> c.type().typeId()) + .collect(Collectors.toSet()), + SUPPORTED_TYPES); if (!unsupportedTypes.isEmpty()) { - throw new UnsupportedOperationException("Cannot read unsupported column types: " + unsupportedTypes); + throw new UnsupportedOperationException( + "Cannot read unsupported column types: " + unsupportedTypes); } Map keyMetadata = Maps.newHashMap(); @@ -264,8 +261,12 @@ private static final class VectorizedCombinedScanIterator implements CloseableIt .map(FileScanTask::file) .forEach(file -> keyMetadata.put(file.path().toString(), file.keyMetadata())); - Stream encrypted = keyMetadata.entrySet().stream() - .map(entry -> EncryptedFiles.encryptedInput(io.newInputFile(entry.getKey()), entry.getValue())); + Stream encrypted = + keyMetadata.entrySet().stream() + .map( + entry -> + EncryptedFiles.encryptedInput( + io.newInputFile(entry.getKey()), entry.getValue())); // decrypt with the batch call to avoid multiple RPCs to a key server, if possible @SuppressWarnings("StreamToIterable") @@ -320,14 +321,19 @@ CloseableIterator open(FileScanTask task) { InputFile location = getInputFile(task); Preconditions.checkNotNull(location, "Could not find InputFile associated with FileScanTask"); if (task.file().format() == FileFormat.PARQUET) { - Parquet.ReadBuilder builder = Parquet.read(location) - .project(expectedSchema) - .split(task.start(), task.length()) - .createBatchedReaderFunc(fileSchema -> buildReader(expectedSchema, - fileSchema, /* setArrowValidityVector */ NullCheckingForGet.NULL_CHECKING_ENABLED)) - .recordsPerBatch(batchSize) - .filter(task.residual()) - .caseSensitive(caseSensitive); + Parquet.ReadBuilder builder = + Parquet.read(location) + .project(expectedSchema) + .split(task.start(), task.length()) + .createBatchedReaderFunc( + fileSchema -> + buildReader( + expectedSchema, + fileSchema, /* setArrowValidityVector */ + NullCheckingForGet.NULL_CHECKING_ENABLED)) + .recordsPerBatch(batchSize) + .filter(task.residual()) + .caseSensitive(caseSensitive); if (reuseContainers) { builder.reuseContainers(); @@ -363,18 +369,22 @@ private InputFile getInputFile(FileScanTask task) { /** * Build the {@link ArrowBatchReader} for the expected schema and file schema. 
* - * @param expectedSchema Expected schema of the data returned. - * @param fileSchema Schema of the data file. + * @param expectedSchema Expected schema of the data returned. + * @param fileSchema Schema of the data file. * @param setArrowValidityVector Indicates whether to set the validity vector in Arrow vectors. */ private static ArrowBatchReader buildReader( - Schema expectedSchema, - MessageType fileSchema, - boolean setArrowValidityVector) { + Schema expectedSchema, MessageType fileSchema, boolean setArrowValidityVector) { return (ArrowBatchReader) - TypeWithSchemaVisitor.visit(expectedSchema.asStruct(), fileSchema, + TypeWithSchemaVisitor.visit( + expectedSchema.asStruct(), + fileSchema, new VectorizedReaderBuilder( - expectedSchema, fileSchema, setArrowValidityVector, ImmutableMap.of(), ArrowBatchReader::new)); + expectedSchema, + fileSchema, + setArrowValidityVector, + ImmutableMap.of(), + ArrowBatchReader::new)); } } } diff --git a/arrow/src/main/java/org/apache/iceberg/arrow/vectorized/ArrowVectorAccessor.java b/arrow/src/main/java/org/apache/iceberg/arrow/vectorized/ArrowVectorAccessor.java index c912531732c9..397a51ae5efe 100644 --- a/arrow/src/main/java/org/apache/iceberg/arrow/vectorized/ArrowVectorAccessor.java +++ b/arrow/src/main/java/org/apache/iceberg/arrow/vectorized/ArrowVectorAccessor.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.arrow.vectorized; import org.apache.arrow.vector.ValueVector; diff --git a/arrow/src/main/java/org/apache/iceberg/arrow/vectorized/ArrowVectorAccessors.java b/arrow/src/main/java/org/apache/iceberg/arrow/vectorized/ArrowVectorAccessors.java index 69b5934c44e8..b97277f87dca 100644 --- a/arrow/src/main/java/org/apache/iceberg/arrow/vectorized/ArrowVectorAccessors.java +++ b/arrow/src/main/java/org/apache/iceberg/arrow/vectorized/ArrowVectorAccessors.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.arrow.vectorized; import java.nio.ByteBuffer; @@ -30,12 +29,12 @@ final class ArrowVectorAccessors { private static final GenericArrowVectorAccessorFactory factory; static { - factory = new GenericArrowVectorAccessorFactory<>( - throwingSupplier("Decimal type is not supported"), - JavaStringFactory::new, - throwingSupplier("Struct type is not supported"), - throwingSupplier("List type is not supported") - ); + factory = + new GenericArrowVectorAccessorFactory<>( + throwingSupplier("Decimal type is not supported"), + JavaStringFactory::new, + throwingSupplier("Struct type is not supported"), + throwingSupplier("List type is not supported")); } private static Supplier throwingSupplier(String message) { @@ -45,7 +44,8 @@ private static Supplier throwingSupplier(String message) { } private ArrowVectorAccessors() { - throw new UnsupportedOperationException(ArrowVectorAccessors.class.getName() + " cannot be instantiated."); + throw new UnsupportedOperationException( + ArrowVectorAccessors.class.getName() + " cannot be instantiated."); } static ArrowVectorAccessor getVectorAccessor(VectorHolder holder) { @@ -71,8 +71,11 @@ public String ofBytes(byte[] bytes) { @Override public String ofByteBuffer(ByteBuffer byteBuffer) { if (byteBuffer.hasArray()) { - return new String(byteBuffer.array(), byteBuffer.arrayOffset() + byteBuffer.position(), - byteBuffer.remaining(), StandardCharsets.UTF_8); + return new String( + byteBuffer.array(), + byteBuffer.arrayOffset() + byteBuffer.position(), + byteBuffer.remaining(), + StandardCharsets.UTF_8); } byte[] bytes = new byte[byteBuffer.remaining()]; byteBuffer.get(bytes); diff --git a/arrow/src/main/java/org/apache/iceberg/arrow/vectorized/BaseBatchReader.java b/arrow/src/main/java/org/apache/iceberg/arrow/vectorized/BaseBatchReader.java index 76b5fd55d521..2175293ab2b6 100644 --- a/arrow/src/main/java/org/apache/iceberg/arrow/vectorized/BaseBatchReader.java +++ b/arrow/src/main/java/org/apache/iceberg/arrow/vectorized/BaseBatchReader.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.arrow.vectorized; import java.util.List; @@ -26,18 +25,17 @@ import org.apache.parquet.hadoop.metadata.ColumnChunkMetaData; import org.apache.parquet.hadoop.metadata.ColumnPath; -/** - * A base BatchReader class that contains common functionality - */ +/** A base BatchReader class that contains common functionality */ @SuppressWarnings("checkstyle:VisibilityModifier") public abstract class BaseBatchReader implements VectorizedReader { protected final VectorizedArrowReader[] readers; protected final VectorHolder[] vectorHolders; protected BaseBatchReader(List> readers) { - this.readers = readers.stream() - .map(VectorizedArrowReader.class::cast) - .toArray(VectorizedArrowReader[]::new); + this.readers = + readers.stream() + .map(VectorizedArrowReader.class::cast) + .toArray(VectorizedArrowReader[]::new); this.vectorHolders = new VectorHolder[readers.size()]; } diff --git a/arrow/src/main/java/org/apache/iceberg/arrow/vectorized/ColumnVector.java b/arrow/src/main/java/org/apache/iceberg/arrow/vectorized/ColumnVector.java index 78c6f4de6eb3..940f099cd21e 100644 --- a/arrow/src/main/java/org/apache/iceberg/arrow/vectorized/ColumnVector.java +++ b/arrow/src/main/java/org/apache/iceberg/arrow/vectorized/ColumnVector.java @@ -16,33 +16,31 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.arrow.vectorized; import org.apache.arrow.vector.FieldVector; import org.apache.iceberg.types.Types; /** - * This class is inspired by Spark's {@code ColumnVector}. - * This class represents the column data for an Iceberg table query. - * It wraps an arrow {@link FieldVector} and provides simple - * accessors for the row values. Advanced users can access - * the {@link FieldVector}. - *

- * Supported Iceberg data types:
- * <ul>
- *   <li>{@link Types.BooleanType}</li>
- *   <li>{@link Types.IntegerType}</li>
- *   <li>{@link Types.LongType}</li>
- *   <li>{@link Types.FloatType}</li>
- *   <li>{@link Types.DoubleType}</li>
- *   <li>{@link Types.StringType}</li>
- *   <li>{@link Types.BinaryType}</li>
- *   <li>{@link Types.TimestampType} (with and without timezone)</li>
- *   <li>{@link Types.DateType}</li>
- *   <li>{@link Types.TimeType}</li>
- *   <li>{@link Types.UUIDType}</li>
- * </ul>
+ * This class is inspired by Spark's {@code ColumnVector}. This class represents the column data for
+ * an Iceberg table query. It wraps an arrow {@link FieldVector} and provides simple accessors for
+ * the row values. Advanced users can access the {@link FieldVector}.
+ *
+ * <p>Supported Iceberg data types:
+ *
+ * <ul>
+ *   <li>{@link Types.BooleanType}
+ *   <li>{@link Types.IntegerType}
+ *   <li>{@link Types.LongType}
+ *   <li>{@link Types.FloatType}
+ *   <li>{@link Types.DoubleType}
+ *   <li>{@link Types.StringType}
+ *   <li>{@link Types.BinaryType}
+ *   <li>{@link Types.TimestampType} (with and without timezone)
+ *   <li>{@link Types.DateType}
+ *   <li>{@link Types.TimeType}
+ *   <li>{@link Types.UUIDType}
+ * </ul>
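A small sketch (not part of the patch) of how a caller might walk the columns of one ColumnarBatch through the accessors in this file and in the ColumnarBatch diff just below; the class and method names here are illustrative, and only methods visible in this diff plus standard Arrow vector accessors are assumed.

import org.apache.arrow.vector.FieldVector;
import org.apache.iceberg.arrow.vectorized.ColumnarBatch;

class BatchInspectionSketch {
  static void describe(ColumnarBatch batch) {
    System.out.println(batch.numRows() + " rows x " + batch.numCols() + " columns");
    for (int i = 0; i < batch.numCols(); i++) {
      // Each ColumnVector wraps an Arrow FieldVector that advanced users can read directly;
      // the batch still owns the vector and closes it when the batch is closed.
      FieldVector vector = batch.column(i).getFieldVector();
      System.out.println(vector.getField().getName() + ": " + vector.getValueCount() + " values");
    }
  }
}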
*/ public class ColumnVector implements AutoCloseable { private final VectorHolder vectorHolder; diff --git a/arrow/src/main/java/org/apache/iceberg/arrow/vectorized/ColumnarBatch.java b/arrow/src/main/java/org/apache/iceberg/arrow/vectorized/ColumnarBatch.java index 976447ed63ff..c7deee6471ee 100644 --- a/arrow/src/main/java/org/apache/iceberg/arrow/vectorized/ColumnarBatch.java +++ b/arrow/src/main/java/org/apache/iceberg/arrow/vectorized/ColumnarBatch.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.arrow.vectorized; import java.util.Arrays; @@ -25,8 +24,8 @@ import org.apache.iceberg.relocated.com.google.common.base.Preconditions; /** - * This class is inspired by Spark's {@code ColumnarBatch}. - * This class wraps a columnar batch in the result set of an Iceberg table query. + * This class is inspired by Spark's {@code ColumnarBatch}. This class wraps a columnar batch in the + * result set of an Iceberg table query. */ public class ColumnarBatch implements AutoCloseable { @@ -36,27 +35,32 @@ public class ColumnarBatch implements AutoCloseable { ColumnarBatch(int numRows, ColumnVector[] columns) { for (int i = 0; i < columns.length; i++) { int columnValueCount = columns[i].getFieldVector().getValueCount(); - Preconditions.checkArgument(numRows == columnValueCount, - "Number of rows (=" + numRows + ") != column[" + i + "] size (=" + columnValueCount + ")"); + Preconditions.checkArgument( + numRows == columnValueCount, + "Number of rows (=" + + numRows + + ") != column[" + + i + + "] size (=" + + columnValueCount + + ")"); } this.numRows = numRows; this.columns = columns; } /** - * Create a new instance of {@link VectorSchemaRoot} - * from the arrow vectors stored in this arrow batch. - * The arrow vectors are owned by the reader. + * Create a new instance of {@link VectorSchemaRoot} from the arrow vectors stored in this arrow + * batch. The arrow vectors are owned by the reader. */ public VectorSchemaRoot createVectorSchemaRootFromVectors() { - return VectorSchemaRoot.of(Arrays.stream(columns) - .map(ColumnVector::getFieldVector) - .toArray(FieldVector[]::new)); + return VectorSchemaRoot.of( + Arrays.stream(columns).map(ColumnVector::getFieldVector).toArray(FieldVector[]::new)); } /** - * Called to close all the columns in this batch. It is not valid to access the data after calling this. This must be - * called at the end to clean up memory allocations. + * Called to close all the columns in this batch. It is not valid to access the data after calling + * this. This must be called at the end to clean up memory allocations. */ @Override public void close() { @@ -65,23 +69,17 @@ public void close() { } } - /** - * Returns the number of columns that make up this batch. - */ + /** Returns the number of columns that make up this batch. */ public int numCols() { return columns.length; } - /** - * Returns the number of rows for read, including filtered rows. - */ + /** Returns the number of rows for read, including filtered rows. */ public int numRows() { return numRows; } - /** - * Returns the column at `ordinal`. - */ + /** Returns the column at `ordinal`. 
*/ public ColumnVector column(int ordinal) { return columns[ordinal]; } diff --git a/arrow/src/main/java/org/apache/iceberg/arrow/vectorized/GenericArrowVectorAccessorFactory.java b/arrow/src/main/java/org/apache/iceberg/arrow/vectorized/GenericArrowVectorAccessorFactory.java index 85d2fb7b5024..83f3f62e7018 100644 --- a/arrow/src/main/java/org/apache/iceberg/arrow/vectorized/GenericArrowVectorAccessorFactory.java +++ b/arrow/src/main/java/org/apache/iceberg/arrow/vectorized/GenericArrowVectorAccessorFactory.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.arrow.vectorized; import java.lang.reflect.Array; @@ -49,22 +48,28 @@ import org.apache.parquet.schema.PrimitiveType; /** - * This class is creates typed {@link ArrowVectorAccessor} from {@link VectorHolder}. - * It provides a generic implementation for following Arrow types: + * This class is creates typed {@link ArrowVectorAccessor} from {@link VectorHolder}. It provides a + * generic implementation for following Arrow types: + * *
- *   <li>Decimal type can be deserialized to a type that supports decimal,
- *   e.g. BigDecimal or Spark's Decimal.</li>
- *   <li>UTF8 String type can deserialized to a Java String or Spark's UTF8String.</li>
- *   <li>List type: the child elements of a list can be deserialized to Spark's ColumnarArray or similar type.</li>
- *   <li>Struct type: the child elements of a struct can be deserialized to a Spark's ArrowColumnVector
- *   or similar type.</li>
+ *   <li>Decimal type can be deserialized to a type that supports decimal, e.g. BigDecimal or
+ *       Spark's Decimal.
+ *   <li>UTF8 String type can deserialized to a Java String or Spark's UTF8String.
+ *   <li>List type: the child elements of a list can be deserialized to Spark's ColumnarArray or
+ *       similar type.
+ *   <li>Struct type: the child elements of a struct can be deserialized to a Spark's
+ *       ArrowColumnVector or similar type.
  * </ul>
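A sketch (not part of the patch) of how this factory is meant to be parameterized, modeled on the ArrowVectorAccessors wiring earlier in this diff: plain Java strings are supported while the decimal, struct and list factories throw UnsupportedOperationException; the subclass and helper names are illustrative.

import java.math.BigDecimal;
import java.nio.ByteBuffer;
import java.nio.charset.StandardCharsets;
import java.util.function.Supplier;
import org.apache.arrow.vector.VarCharVector;

// Illustrative subclass: only UTF8 strings are materialized, as java.lang.String.
class PlainJavaAccessorFactory
    extends GenericArrowVectorAccessorFactory<BigDecimal, String, Object, AutoCloseable> {

  PlainJavaAccessorFactory() {
    super(
        unsupported("Decimal type is not supported"),
        PlainStringFactory::new,
        unsupported("Struct type is not supported"),
        unsupported("List type is not supported"));
  }

  private static <T> Supplier<T> unsupported(String message) {
    return () -> {
      throw new UnsupportedOperationException(message);
    };
  }

  // Minimal StringFactory producing java.lang.String values.
  static class PlainStringFactory implements StringFactory<String> {
    @Override
    public Class<String> getGenericClass() {
      return String.class;
    }

    @Override
    public String ofRow(VarCharVector vector, int rowId) {
      return ofBytes(vector.get(rowId));
    }

    @Override
    public String ofBytes(byte[] bytes) {
      return new String(bytes, StandardCharsets.UTF_8);
    }

    @Override
    public String ofByteBuffer(ByteBuffer byteBuffer) {
      byte[] bytes = new byte[byteBuffer.remaining()];
      byteBuffer.duplicate().get(bytes);
      return new String(bytes, StandardCharsets.UTF_8);
    }
  }
}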
+ * * @param A concrete type that can represent a decimal. * @param A concrete type that can represent a UTF8 string. - * @param A concrete type that can represent an array value in a list vector, e.g. Spark's ColumnarArray. - * @param A concrete type that can represent a child vector in a struct, e.g. Spark's ArrowColumnVector. + * @param A concrete type that can represent an array value in a list vector, e.g. Spark's + * ColumnarArray. + * @param A concrete type that can represent a child vector in a struct, e.g. Spark's + * ArrowColumnVector. */ -public class GenericArrowVectorAccessorFactory { +public class GenericArrowVectorAccessorFactory< + DecimalT, Utf8StringT, ArrayT, ChildVectorT extends AutoCloseable> { private final Supplier> decimalFactorySupplier; private final Supplier> stringFactorySupplier; @@ -72,22 +77,23 @@ public class GenericArrowVectorAccessorFactory> arrayFactorySupplier; /** - * The constructor is parameterized using the decimal, string, struct and array factories. - * If a specific type is not supported, the factory supplier can raise an - * {@link UnsupportedOperationException}. + * The constructor is parameterized using the decimal, string, struct and array factories. If a + * specific type is not supported, the factory supplier can raise an {@link + * UnsupportedOperationException}. */ protected GenericArrowVectorAccessorFactory( - Supplier> decimalFactorySupplier, - Supplier> stringFactorySupplier, - Supplier> structChildFactorySupplier, - Supplier> arrayFactorySupplier) { + Supplier> decimalFactorySupplier, + Supplier> stringFactorySupplier, + Supplier> structChildFactorySupplier, + Supplier> arrayFactorySupplier) { this.decimalFactorySupplier = decimalFactorySupplier; this.stringFactorySupplier = stringFactorySupplier; this.structChildFactorySupplier = structChildFactorySupplier; this.arrayFactorySupplier = arrayFactorySupplier; } - public ArrowVectorAccessor getVectorAccessor(VectorHolder holder) { + public ArrowVectorAccessor getVectorAccessor( + VectorHolder holder) { Dictionary dictionary = holder.dictionary(); boolean isVectorDictEncoded = holder.isDictionaryEncoded(); FieldVector vector = holder.vector(); @@ -100,18 +106,22 @@ public ArrowVectorAccessor getVecto } } - private ArrowVectorAccessor getDictionaryVectorAccessor( - Dictionary dictionary, - ColumnDescriptor desc, - FieldVector vector, PrimitiveType primitive) { - Preconditions.checkState(vector instanceof IntVector, "Dictionary ids should be stored in IntVectors only"); + private ArrowVectorAccessor + getDictionaryVectorAccessor( + Dictionary dictionary, + ColumnDescriptor desc, + FieldVector vector, + PrimitiveType primitive) { + Preconditions.checkState( + vector instanceof IntVector, "Dictionary ids should be stored in IntVectors only"); if (primitive.getOriginalType() != null) { switch (desc.getPrimitiveType().getOriginalType()) { case ENUM: case JSON: case UTF8: case BSON: - return new DictionaryStringAccessor<>((IntVector) vector, dictionary, stringFactorySupplier.get()); + return new DictionaryStringAccessor<>( + (IntVector) vector, dictionary, stringFactorySupplier.get()); case INT_64: case TIME_MICROS: case TIMESTAMP_MILLIS: @@ -122,19 +132,13 @@ private ArrowVectorAccessor getDict case BINARY: case FIXED_LEN_BYTE_ARRAY: return new DictionaryDecimalBinaryAccessor<>( - (IntVector) vector, - dictionary, - decimalFactorySupplier.get()); + (IntVector) vector, dictionary, decimalFactorySupplier.get()); case INT64: return new DictionaryDecimalLongAccessor<>( - (IntVector) vector, - 
dictionary, - decimalFactorySupplier.get()); + (IntVector) vector, dictionary, decimalFactorySupplier.get()); case INT32: return new DictionaryDecimalIntAccessor<>( - (IntVector) vector, - dictionary, - decimalFactorySupplier.get()); + (IntVector) vector, dictionary, decimalFactorySupplier.get()); default: throw new UnsupportedOperationException( "Unsupported base type for decimal: " + primitive.getPrimitiveTypeName()); @@ -161,8 +165,8 @@ private ArrowVectorAccessor getDict } @SuppressWarnings("checkstyle:CyclomaticComplexity") - private ArrowVectorAccessor - getPlainVectorAccessor(FieldVector vector) { + private ArrowVectorAccessor getPlainVectorAccessor( + FieldVector vector) { if (vector instanceof BitVector) { return new BooleanAccessor<>((BitVector) vector); } else if (vector instanceof IntVector) { @@ -199,8 +203,9 @@ private ArrowVectorAccessor getDict throw new UnsupportedOperationException("Unsupported vector: " + vector.getClass()); } - private static class BooleanAccessor - extends ArrowVectorAccessor { + private static class BooleanAccessor< + DecimalT, Utf8StringT, ArrayT, ChildVectorT extends AutoCloseable> + extends ArrowVectorAccessor { private final BitVector vector; BooleanAccessor(BitVector vector) { @@ -214,8 +219,9 @@ public final boolean getBoolean(int rowId) { } } - private static class IntAccessor - extends ArrowVectorAccessor { + private static class IntAccessor< + DecimalT, Utf8StringT, ArrayT, ChildVectorT extends AutoCloseable> + extends ArrowVectorAccessor { private final IntVector vector; @@ -235,8 +241,9 @@ public final long getLong(int rowId) { } } - private static class LongAccessor - extends ArrowVectorAccessor { + private static class LongAccessor< + DecimalT, Utf8StringT, ArrayT, ChildVectorT extends AutoCloseable> + extends ArrowVectorAccessor { private final BigIntVector vector; @@ -251,8 +258,9 @@ public final long getLong(int rowId) { } } - private static class DictionaryLongAccessor - extends ArrowVectorAccessor { + private static class DictionaryLongAccessor< + DecimalT, Utf8StringT, ArrayT, ChildVectorT extends AutoCloseable> + extends ArrowVectorAccessor { private final IntVector offsetVector; private final Dictionary dictionary; @@ -268,8 +276,9 @@ public final long getLong(int rowId) { } } - private static class FloatAccessor - extends ArrowVectorAccessor { + private static class FloatAccessor< + DecimalT, Utf8StringT, ArrayT, ChildVectorT extends AutoCloseable> + extends ArrowVectorAccessor { private final Float4Vector vector; @@ -289,8 +298,9 @@ public final double getDouble(int rowId) { } } - private static class DictionaryFloatAccessor - extends ArrowVectorAccessor { + private static class DictionaryFloatAccessor< + DecimalT, Utf8StringT, ArrayT, ChildVectorT extends AutoCloseable> + extends ArrowVectorAccessor { private final IntVector offsetVector; private final Dictionary dictionary; @@ -311,8 +321,9 @@ public final double getDouble(int rowId) { } } - private static class DoubleAccessor - extends ArrowVectorAccessor { + private static class DoubleAccessor< + DecimalT, Utf8StringT, ArrayT, ChildVectorT extends AutoCloseable> + extends ArrowVectorAccessor { private final Float8Vector vector; @@ -327,8 +338,9 @@ public final double getDouble(int rowId) { } } - private static class DictionaryDoubleAccessor - extends ArrowVectorAccessor { + private static class DictionaryDoubleAccessor< + DecimalT, Utf8StringT, ArrayT, ChildVectorT extends AutoCloseable> + extends ArrowVectorAccessor { private final IntVector offsetVector; private final 
Dictionary dictionary; @@ -344,8 +356,9 @@ public final double getDouble(int rowId) { } } - private static class StringAccessor - extends ArrowVectorAccessor { + private static class StringAccessor< + DecimalT, Utf8StringT, ArrayT, ChildVectorT extends AutoCloseable> + extends ArrowVectorAccessor { private final VarCharVector vector; private final StringFactory stringFactory; @@ -362,14 +375,16 @@ public final Utf8StringT getUTF8String(int rowId) { } } - private static class DictionaryStringAccessor - extends ArrowVectorAccessor { + private static class DictionaryStringAccessor< + DecimalT, Utf8StringT, ArrayT, ChildVectorT extends AutoCloseable> + extends ArrowVectorAccessor { private final Dictionary dictionary; private final StringFactory stringFactory; private final IntVector offsetVector; private final Utf8StringT[] cache; - DictionaryStringAccessor(IntVector vector, Dictionary dictionary, StringFactory stringFactory) { + DictionaryStringAccessor( + IntVector vector, Dictionary dictionary, StringFactory stringFactory) { super(vector); this.offsetVector = vector; this.dictionary = dictionary; @@ -381,14 +396,16 @@ private static class DictionaryStringAccessor - extends ArrowVectorAccessor { + private static class BinaryAccessor< + DecimalT, Utf8StringT, ArrayT, ChildVectorT extends AutoCloseable> + extends ArrowVectorAccessor { private final VarBinaryVector vector; @@ -403,8 +420,9 @@ public final byte[] getBinary(int rowId) { } } - private static class DictionaryBinaryAccessor - extends ArrowVectorAccessor { + private static class DictionaryBinaryAccessor< + DecimalT, Utf8StringT, ArrayT, ChildVectorT extends AutoCloseable> + extends ArrowVectorAccessor { private final IntVector offsetVector; private final Dictionary dictionary; @@ -420,8 +438,9 @@ public final byte[] getBinary(int rowId) { } } - private static class DateAccessor - extends ArrowVectorAccessor { + private static class DateAccessor< + DecimalT, Utf8StringT, ArrayT, ChildVectorT extends AutoCloseable> + extends ArrowVectorAccessor { private final DateDayVector vector; @@ -436,8 +455,9 @@ public final int getInt(int rowId) { } } - private static class TimestampMicroTzAccessor - extends ArrowVectorAccessor { + private static class TimestampMicroTzAccessor< + DecimalT, Utf8StringT, ArrayT, ChildVectorT extends AutoCloseable> + extends ArrowVectorAccessor { private final TimeStampMicroTZVector vector; @@ -452,7 +472,8 @@ public final long getLong(int rowId) { } } - private static class TimestampMicroAccessor + private static class TimestampMicroAccessor< + DecimalT, Utf8StringT, ArrayT, ChildVectorT extends AutoCloseable> extends ArrowVectorAccessor { private final TimeStampMicroVector vector; @@ -468,7 +489,8 @@ public final long getLong(int rowId) { } } - private static class TimeMicroAccessor + private static class TimeMicroAccessor< + DecimalT, Utf8StringT, ArrayT, ChildVectorT extends AutoCloseable> extends ArrowVectorAccessor { private final TimeMicroVector vector; @@ -484,7 +506,8 @@ public final long getLong(int rowId) { } } - private static class FixedSizeBinaryAccessor + private static class FixedSizeBinaryAccessor< + DecimalT, Utf8StringT, ArrayT, ChildVectorT extends AutoCloseable> extends ArrowVectorAccessor { private final FixedSizeBinaryVector vector; @@ -500,8 +523,9 @@ public byte[] getBinary(int rowId) { } } - private static class ArrayAccessor - extends ArrowVectorAccessor { + private static class ArrayAccessor< + DecimalT, Utf8StringT, ArrayT, ChildVectorT extends AutoCloseable> + extends 
ArrowVectorAccessor { private final ListVector vector; private final ChildVectorT arrayData; @@ -520,18 +544,22 @@ public final ArrayT getArray(int rowId) { } } - private static class StructAccessor - extends ArrowVectorAccessor { + private static class StructAccessor< + DecimalT, Utf8StringT, ArrayT, ChildVectorT extends AutoCloseable> + extends ArrowVectorAccessor { StructAccessor(StructVector structVector, StructChildFactory structChildFactory) { - super(structVector, IntStream.range(0, structVector.size()) + super( + structVector, + IntStream.range(0, structVector.size()) .mapToObj(structVector::getVectorById) .map(structChildFactory::of) .toArray(genericArray(structChildFactory.getGenericClass()))); } } - private static class DecimalAccessor - extends ArrowVectorAccessor { + private static class DecimalAccessor< + DecimalT, Utf8StringT, ArrayT, ChildVectorT extends AutoCloseable> + extends ArrowVectorAccessor { private final DecimalVector vector; private final DecimalFactory decimalFactory; @@ -545,24 +573,24 @@ private static class DecimalAccessor - extends ArrowVectorAccessor { + private abstract static class DictionaryDecimalAccessor< + DecimalT, Utf8StringT, ArrayT, ChildVectorT extends AutoCloseable> + extends ArrowVectorAccessor { private final DecimalT[] cache; private final IntVector offsetVector; protected final DecimalFactory decimalFactory; protected final Dictionary parquetDictionary; private DictionaryDecimalAccessor( - IntVector vector, - Dictionary dictionary, - DecimalFactory decimalFactory) { + IntVector vector, Dictionary dictionary, DecimalFactory decimalFactory) { super(vector); this.offsetVector = vector; this.parquetDictionary = dictionary; @@ -582,26 +610,30 @@ public final DecimalT getDecimal(int rowId, int precision, int scale) { protected abstract DecimalT decode(int dictId, int precision, int scale); } - private static class - DictionaryDecimalBinaryAccessor - extends DictionaryDecimalAccessor { + private static class DictionaryDecimalBinaryAccessor< + DecimalT, Utf8StringT, ArrayT, ChildVectorT extends AutoCloseable> + extends DictionaryDecimalAccessor { - DictionaryDecimalBinaryAccessor(IntVector vector, Dictionary dictionary, DecimalFactory decimalFactory) { + DictionaryDecimalBinaryAccessor( + IntVector vector, Dictionary dictionary, DecimalFactory decimalFactory) { super(vector, dictionary, decimalFactory); } @Override protected DecimalT decode(int dictId, int precision, int scale) { ByteBuffer byteBuffer = parquetDictionary.decodeToBinary(dictId).toByteBuffer(); - BigDecimal value = DecimalUtility.getBigDecimalFromByteBuffer(byteBuffer, scale, byteBuffer.remaining()); + BigDecimal value = + DecimalUtility.getBigDecimalFromByteBuffer(byteBuffer, scale, byteBuffer.remaining()); return decimalFactory.ofBigDecimal(value, precision, scale); } } - private static class DictionaryDecimalLongAccessor - extends DictionaryDecimalAccessor { + private static class DictionaryDecimalLongAccessor< + DecimalT, Utf8StringT, ArrayT, ChildVectorT extends AutoCloseable> + extends DictionaryDecimalAccessor { - DictionaryDecimalLongAccessor(IntVector vector, Dictionary dictionary, DecimalFactory decimalFactory) { + DictionaryDecimalLongAccessor( + IntVector vector, Dictionary dictionary, DecimalFactory decimalFactory) { super(vector, dictionary, decimalFactory); } @@ -611,10 +643,12 @@ protected DecimalT decode(int dictId, int precision, int scale) { } } - private static class DictionaryDecimalIntAccessor - extends DictionaryDecimalAccessor { + private static class 
DictionaryDecimalIntAccessor< + DecimalT, Utf8StringT, ArrayT, ChildVectorT extends AutoCloseable> + extends DictionaryDecimalAccessor { - DictionaryDecimalIntAccessor(IntVector vector, Dictionary dictionary, DecimalFactory decimalFactory) { + DictionaryDecimalIntAccessor( + IntVector vector, Dictionary dictionary, DecimalFactory decimalFactory) { super(vector, dictionary, decimalFactory); } @@ -626,83 +660,68 @@ protected DecimalT decode(int dictId, int precision, int scale) { /** * Create a decimal value of type {@code DecimalT} from arrow vector value. + * * @param A concrete type that can represent a decimal, e.g, Spark's Decimal. */ protected interface DecimalFactory { - /** - * Class of concrete decimal type. - */ + /** Class of concrete decimal type. */ Class getGenericClass(); - /** - * Create a decimal from the given long value, precision and scale. - */ + /** Create a decimal from the given long value, precision and scale. */ DecimalT ofLong(long value, int precision, int scale); - /** - * Create a decimal from the given {@link BigDecimal} value, precision and scale. - */ + /** Create a decimal from the given {@link BigDecimal} value, precision and scale. */ DecimalT ofBigDecimal(BigDecimal value, int precision, int scale); } /** * Create a UTF8 String value of type {@code Utf8StringT} from arrow vector value. + * * @param A concrete type that can represent a UTF8 string. */ protected interface StringFactory { - /** - * Class of concrete UTF8 String type. - */ + /** Class of concrete UTF8 String type. */ Class getGenericClass(); - /** - * Create a UTF8 String from the row value in the arrow vector. - */ + /** Create a UTF8 String from the row value in the arrow vector. */ Utf8StringT ofRow(VarCharVector vector, int rowId); - /** - * Create a UTF8 String from the byte array. - */ + /** Create a UTF8 String from the byte array. */ Utf8StringT ofBytes(byte[] bytes); - /** - * Create a UTF8 String from the byte buffer. - */ + /** Create a UTF8 String from the byte buffer. */ Utf8StringT ofByteBuffer(ByteBuffer byteBuffer); } /** * Create an array value of type {@code ArrayT} from arrow vector value. - * @param A concrete type that can represent an array value in a list vector, - * e.g. Spark's ColumnarArray. - * @param A concrete type that can represent a child vector in a struct, - * e.g. Spark's ArrowColumnVector. + * + * @param A concrete type that can represent an array value in a list vector, e.g. + * Spark's ColumnarArray. + * @param A concrete type that can represent a child vector in a struct, e.g. + * Spark's ArrowColumnVector. */ protected interface ArrayFactory { - /** - * Create a child vector of type {@code ChildVectorT} from the arrow child vector. - */ + /** Create a child vector of type {@code ChildVectorT} from the arrow child vector. */ ChildVectorT ofChild(ValueVector childVector); - /** - * Create an Arrow of type {@code ArrayT} from the row value in the arrow child vector. - */ + /** Create an Arrow of type {@code ArrayT} from the row value in the arrow child vector. */ ArrayT ofRow(ValueVector vector, ChildVectorT childData, int rowId); } /** * Create a struct child vector of type {@code ChildVectorT} from arrow vector value. - * @param A concrete type that can represent a child vector in a struct, - * e.g. Spark's ArrowColumnVector. + * + * @param A concrete type that can represent a child vector in a struct, e.g. + * Spark's ArrowColumnVector. */ protected interface StructChildFactory { - /** - * Class of concrete child vector type. 
- */ + /** Class of concrete child vector type. */ Class getGenericClass(); /** - * Create the child vector of type such as Spark's ArrowColumnVector from the arrow child vector. + * Create the child vector of type such as Spark's ArrowColumnVector from the arrow child + * vector. */ ChildVectorT of(ValueVector childVector); } diff --git a/arrow/src/main/java/org/apache/iceberg/arrow/vectorized/NullabilityHolder.java b/arrow/src/main/java/org/apache/iceberg/arrow/vectorized/NullabilityHolder.java index 589b2491fa45..c8d9376d5254 100644 --- a/arrow/src/main/java/org/apache/iceberg/arrow/vectorized/NullabilityHolder.java +++ b/arrow/src/main/java/org/apache/iceberg/arrow/vectorized/NullabilityHolder.java @@ -16,17 +16,15 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.arrow.vectorized; import java.util.Arrays; /** - * Instances of this class simply track whether a value at an index is null. - * For simplicity and performance, it is expected that various setter methods - * {@link #setNull(int)}, {@link #setNulls(int, int)}, {@link #setNotNull(int)} - * and {@link #setNotNulls(int, int)} are invoked with monotonically - * increasing values for the index parameter. + * Instances of this class simply track whether a value at an index is null. For simplicity and + * performance, it is expected that various setter methods {@link #setNull(int)}, {@link + * #setNulls(int, int)}, {@link #setNotNull(int)} and {@link #setNotNulls(int, int)} are invoked + * with monotonically increasing values for the index parameter. */ public class NullabilityHolder { private final byte[] isNull; @@ -64,9 +62,7 @@ public void setNotNulls(int startIndex, int num) { System.arraycopy(nonNulls, 0, isNull, startIndex, num); } - /** - * Returns 1 if null, 0 otherwise. - */ + /** Returns 1 if null, 0 otherwise. */ public byte isNullAt(int index) { return isNull[index]; } diff --git a/arrow/src/main/java/org/apache/iceberg/arrow/vectorized/VectorHolder.java b/arrow/src/main/java/org/apache/iceberg/arrow/vectorized/VectorHolder.java index 94c4372ffde7..e4254b1448c6 100644 --- a/arrow/src/main/java/org/apache/iceberg/arrow/vectorized/VectorHolder.java +++ b/arrow/src/main/java/org/apache/iceberg/arrow/vectorized/VectorHolder.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.arrow.vectorized; import org.apache.arrow.vector.FieldVector; @@ -26,8 +25,8 @@ import org.apache.parquet.column.Dictionary; /** - * Container class for holding the Arrow vector storing a batch of values along with other state needed for reading - * values out of it. + * Container class for holding the Arrow vector storing a batch of values along with other state + * needed for reading values out of it. 
*/ public class VectorHolder { private final ColumnDescriptor columnDescriptor; @@ -38,8 +37,12 @@ public class VectorHolder { private final Type icebergType; public VectorHolder( - ColumnDescriptor columnDescriptor, FieldVector vector, boolean isDictionaryEncoded, - Dictionary dictionary, NullabilityHolder holder, Type type) { + ColumnDescriptor columnDescriptor, + FieldVector vector, + boolean isDictionaryEncoded, + Dictionary dictionary, + NullabilityHolder holder, + Type type) { // All the fields except dictionary are not nullable unless it is a dummy holder Preconditions.checkNotNull(columnDescriptor, "ColumnDescriptor cannot be null"); Preconditions.checkNotNull(vector, "Vector cannot be null"); @@ -117,8 +120,8 @@ public boolean isDummy() { } /** - * A Vector Holder which does not actually produce values, consumers of this class should - * use the constantValue to populate their ColumnVector implementation. + * A Vector Holder which does not actually produce values, consumers of this class should use the + * constantValue to populate their ColumnVector implementation. */ public static class ConstantVectorHolder extends VectorHolder { private final T constantValue; @@ -162,5 +165,4 @@ public int numValues() { return numRows; } } - } diff --git a/arrow/src/main/java/org/apache/iceberg/arrow/vectorized/VectorizedArrowReader.java b/arrow/src/main/java/org/apache/iceberg/arrow/vectorized/VectorizedArrowReader.java index 7db664505f7e..79cbfb34bd54 100644 --- a/arrow/src/main/java/org/apache/iceberg/arrow/vectorized/VectorizedArrowReader.java +++ b/arrow/src/main/java/org/apache/iceberg/arrow/vectorized/VectorizedArrowReader.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.arrow.vectorized; import java.util.Map; @@ -54,8 +53,9 @@ import org.apache.parquet.schema.PrimitiveType; /** - * {@link VectorizedReader VectorReader(s)} that read in a batch of values into Arrow vectors. It also takes care of - * allocating the right kind of Arrow vectors depending on the corresponding Iceberg/Parquet data types. + * {@link VectorizedReader VectorReader(s)} that read in a batch of values into Arrow vectors. It + * also takes care of allocating the right kind of Arrow vectors depending on the corresponding + * Iceberg/Parquet data types. */ public class VectorizedArrowReader implements VectorizedReader { public static final int DEFAULT_BATCH_SIZE = 5000; @@ -73,8 +73,10 @@ public class VectorizedArrowReader implements VectorizedReader { private ReadType readType; private NullabilityHolder nullabilityHolder; - // In cases when Parquet employs fall back to plain encoding, we eagerly decode the dictionary encoded pages - // before storing the values in the Arrow vector. This means even if the dictionary is present, data + // In cases when Parquet employs fall back to plain encoding, we eagerly decode the dictionary + // encoded pages + // before storing the values in the Arrow vector. This means even if the dictionary is present, + // data // present in the vector may not necessarily be dictionary encoded. 
private Dictionary dictionary; @@ -124,8 +126,9 @@ public void setBatchSize(int batchSize) { @Override public VectorHolder read(VectorHolder reuse, int numValsToRead) { boolean dictEncoded = vectorizedColumnIterator.producesDictionaryEncodedVector(); - if (reuse == null || (!dictEncoded && readType == ReadType.DICTIONARY) || - (dictEncoded && readType != ReadType.DICTIONARY)) { + if (reuse == null + || (!dictEncoded && readType == ReadType.DICTIONARY) + || (dictEncoded && readType != ReadType.DICTIONARY)) { allocateFieldVector(dictEncoded); nullabilityHolder = new NullabilityHolder(batchSize); } else { @@ -138,49 +141,72 @@ public VectorHolder read(VectorHolder reuse, int numValsToRead) { } else { switch (readType) { case FIXED_LENGTH_DECIMAL: - vectorizedColumnIterator.fixedLengthDecimalBatchReader().nextBatch(vec, typeWidth, nullabilityHolder); + vectorizedColumnIterator + .fixedLengthDecimalBatchReader() + .nextBatch(vec, typeWidth, nullabilityHolder); break; case INT_BACKED_DECIMAL: - vectorizedColumnIterator.intBackedDecimalBatchReader().nextBatch(vec, -1, nullabilityHolder); + vectorizedColumnIterator + .intBackedDecimalBatchReader() + .nextBatch(vec, -1, nullabilityHolder); break; case LONG_BACKED_DECIMAL: - vectorizedColumnIterator.longBackedDecimalBatchReader().nextBatch(vec, -1, nullabilityHolder); + vectorizedColumnIterator + .longBackedDecimalBatchReader() + .nextBatch(vec, -1, nullabilityHolder); break; case VARBINARY: case VARCHAR: - vectorizedColumnIterator.varWidthTypeBatchReader().nextBatch(vec, -1, nullabilityHolder); + vectorizedColumnIterator + .varWidthTypeBatchReader() + .nextBatch(vec, -1, nullabilityHolder); break; case FIXED_WIDTH_BINARY: - vectorizedColumnIterator.fixedWidthTypeBinaryBatchReader().nextBatch(vec, typeWidth, nullabilityHolder); + vectorizedColumnIterator + .fixedWidthTypeBinaryBatchReader() + .nextBatch(vec, typeWidth, nullabilityHolder); break; case BOOLEAN: vectorizedColumnIterator.booleanBatchReader().nextBatch(vec, -1, nullabilityHolder); break; case INT: - vectorizedColumnIterator.integerBatchReader().nextBatch(vec, typeWidth, nullabilityHolder); + vectorizedColumnIterator + .integerBatchReader() + .nextBatch(vec, typeWidth, nullabilityHolder); break; case LONG: vectorizedColumnIterator.longBatchReader().nextBatch(vec, typeWidth, nullabilityHolder); break; case FLOAT: - vectorizedColumnIterator.floatBatchReader().nextBatch(vec, typeWidth, nullabilityHolder); + vectorizedColumnIterator + .floatBatchReader() + .nextBatch(vec, typeWidth, nullabilityHolder); break; case DOUBLE: - vectorizedColumnIterator.doubleBatchReader().nextBatch(vec, typeWidth, nullabilityHolder); + vectorizedColumnIterator + .doubleBatchReader() + .nextBatch(vec, typeWidth, nullabilityHolder); break; case TIMESTAMP_MILLIS: - vectorizedColumnIterator.timestampMillisBatchReader().nextBatch(vec, typeWidth, nullabilityHolder); + vectorizedColumnIterator + .timestampMillisBatchReader() + .nextBatch(vec, typeWidth, nullabilityHolder); break; case UUID: - vectorizedColumnIterator.fixedSizeBinaryBatchReader().nextBatch(vec, typeWidth, nullabilityHolder); + vectorizedColumnIterator + .fixedSizeBinaryBatchReader() + .nextBatch(vec, typeWidth, nullabilityHolder); break; } } } - Preconditions.checkState(vec.getValueCount() == numValsToRead, - "Number of values read, %s, does not equal expected, %s", vec.getValueCount(), numValsToRead); - return new VectorHolder(columnDescriptor, vec, dictEncoded, dictionary, - nullabilityHolder, icebergField.type()); + Preconditions.checkState( + 
vec.getValueCount() == numValsToRead, + "Number of values read, %s, does not equal expected, %s", + vec.getValueCount(), + numValsToRead); + return new VectorHolder( + columnDescriptor, vec, dictEncoded, dictionary, nullabilityHolder, icebergField.type()); } private void allocateFieldVector(boolean dictionaryEncodedVector) { @@ -197,10 +223,12 @@ private void allocateFieldVector(boolean dictionaryEncodedVector) { } private void allocateDictEncodedVector() { - Field field = new Field( - icebergField.name(), - new FieldType(icebergField.isOptional(), new ArrowType.Int(Integer.SIZE, true), null, null), - null); + Field field = + new Field( + icebergField.name(), + new FieldType( + icebergField.isOptional(), new ArrowType.Int(Integer.SIZE, true), null, null), + null); this.vec = field.createVector(rootAlloc); ((IntVector) vec).allocateNew(batchSize); this.typeWidth = (int) IntVector.TYPE_WIDTH; @@ -315,20 +343,27 @@ private void allocateVectorBasedOnTypeName(PrimitiveType primitive, Field arrowF this.typeWidth = UNKNOWN_WIDTH; break; case INT32: - Field intField = new Field( - icebergField.name(), - new FieldType(icebergField.isOptional(), new ArrowType.Int(Integer.SIZE, true), - null, null), null); + Field intField = + new Field( + icebergField.name(), + new FieldType( + icebergField.isOptional(), new ArrowType.Int(Integer.SIZE, true), null, null), + null); this.vec = intField.createVector(rootAlloc); ((IntVector) vec).allocateNew(batchSize); this.readType = ReadType.INT; this.typeWidth = (int) IntVector.TYPE_WIDTH; break; case FLOAT: - Field floatField = new Field( - icebergField.name(), - new FieldType(icebergField.isOptional(), new ArrowType.FloatingPoint(FloatingPointPrecision.SINGLE), - null, null), null); + Field floatField = + new Field( + icebergField.name(), + new FieldType( + icebergField.isOptional(), + new ArrowType.FloatingPoint(FloatingPointPrecision.SINGLE), + null, + null), + null); this.vec = floatField.createVector(rootAlloc); ((Float4Vector) vec).allocateNew(batchSize); this.readType = ReadType.FLOAT; @@ -358,11 +393,13 @@ private void allocateVectorBasedOnTypeName(PrimitiveType primitive, Field arrowF } @Override - public void setRowGroupInfo(PageReadStore source, Map metadata, long rowPosition) { + public void setRowGroupInfo( + PageReadStore source, Map metadata, long rowPosition) { ColumnChunkMetaData chunkMetaData = metadata.get(ColumnPath.get(columnDescriptor.getPath())); - this.dictionary = vectorizedColumnIterator.setRowGroupInfo( - source.getPageReader(columnDescriptor), - !ParquetUtil.hasNonDictionaryPages(chunkMetaData)); + this.dictionary = + vectorizedColumnIterator.setRowGroupInfo( + source.getPageReader(columnDescriptor), + !ParquetUtil.hasNonDictionaryPages(chunkMetaData)); } @Override @@ -398,8 +435,8 @@ public VectorHolder read(VectorHolder reuse, int numValsToRead) { } @Override - public void setRowGroupInfo(PageReadStore source, Map metadata, long rowPosition) { - } + public void setRowGroupInfo( + PageReadStore source, Map metadata, long rowPosition) {} @Override public String toString() { @@ -407,12 +444,12 @@ public String toString() { } @Override - public void setBatchSize(int batchSize) { - } + public void setBatchSize(int batchSize) {} } private static final class PositionVectorReader extends VectorizedArrowReader { - private static final Field ROW_POSITION_ARROW_FIELD = ArrowSchemaUtil.convert(MetadataColumns.ROW_POSITION); + private static final Field ROW_POSITION_ARROW_FIELD = + ArrowSchemaUtil.convert(MetadataColumns.ROW_POSITION); private 
final boolean setArrowValidityVector; private long rowStart; private int batchSize; @@ -451,7 +488,8 @@ public VectorHolder read(VectorHolder reuse, int numValsToRead) { } private static BigIntVector newVector(int valueCount) { - BigIntVector vector = (BigIntVector) ROW_POSITION_ARROW_FIELD.createVector(ArrowAllocation.rootAllocator()); + BigIntVector vector = + (BigIntVector) ROW_POSITION_ARROW_FIELD.createVector(ArrowAllocation.rootAllocator()); vector.allocateNew(valueCount); return vector; } @@ -463,7 +501,8 @@ private static NullabilityHolder newNullabilityHolder(int size) { } @Override - public void setRowGroupInfo(PageReadStore source, Map metadata, long rowPosition) { + public void setRowGroupInfo( + PageReadStore source, Map metadata, long rowPosition) { this.rowStart = rowPosition; } @@ -489,6 +528,7 @@ public void close() { /** * A Dummy Vector Reader which doesn't actually read files, instead it returns a dummy * VectorHolder which indicates the constant value which should be used for this column. + * * @param The constant value to use */ public static class ConstantVectorReader extends VectorizedArrowReader { @@ -504,8 +544,8 @@ public VectorHolder read(VectorHolder reuse, int numValsToRead) { } @Override - public void setRowGroupInfo(PageReadStore source, Map metadata, long rowPosition) { - } + public void setRowGroupInfo( + PageReadStore source, Map metadata, long rowPosition) {} @Override public String toString() { @@ -513,17 +553,15 @@ public String toString() { } @Override - public void setBatchSize(int batchSize) { - } + public void setBatchSize(int batchSize) {} } /** - * A Dummy Vector Reader which doesn't actually read files. Instead, it returns a - * Deleted Vector Holder which indicates whether a given row is deleted. + * A Dummy Vector Reader which doesn't actually read files. Instead, it returns a Deleted Vector + * Holder which indicates whether a given row is deleted. */ public static class DeletedVectorReader extends VectorizedArrowReader { - public DeletedVectorReader() { - } + public DeletedVectorReader() {} @Override public VectorHolder read(VectorHolder reuse, int numValsToRead) { @@ -531,8 +569,8 @@ public VectorHolder read(VectorHolder reuse, int numValsToRead) { } @Override - public void setRowGroupInfo(PageReadStore source, Map metadata, long rowPosition) { - } + public void setRowGroupInfo( + PageReadStore source, Map metadata, long rowPosition) {} @Override public String toString() { @@ -540,9 +578,6 @@ public String toString() { } @Override - public void setBatchSize(int batchSize) { - } + public void setBatchSize(int batchSize) {} } - } - diff --git a/arrow/src/main/java/org/apache/iceberg/arrow/vectorized/VectorizedReaderBuilder.java b/arrow/src/main/java/org/apache/iceberg/arrow/vectorized/VectorizedReaderBuilder.java index 88480fd21b0e..896b95f24a6b 100644 --- a/arrow/src/main/java/org/apache/iceberg/arrow/vectorized/VectorizedReaderBuilder.java +++ b/arrow/src/main/java/org/apache/iceberg/arrow/vectorized/VectorizedReaderBuilder.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.arrow.vectorized; import java.util.List; @@ -50,12 +49,14 @@ public class VectorizedReaderBuilder extends TypeWithSchemaVisitor idToConstant, + boolean setArrowValidityVector, + Map idToConstant, Function>, VectorizedReader> readerFactory) { this.parquetSchema = parquetSchema; this.icebergSchema = expectedSchema; - this.rootAllocator = ArrowAllocation.rootAllocator() - .newChildAllocator("VectorizedReadBuilder", 0, Long.MAX_VALUE); + this.rootAllocator = + ArrowAllocation.rootAllocator() + .newChildAllocator("VectorizedReadBuilder", 0, Long.MAX_VALUE); this.setArrowValidityVector = setArrowValidityVector; this.idToConstant = idToConstant; this.readerFactory = readerFactory; @@ -63,8 +64,7 @@ public VectorizedReaderBuilder( @Override public VectorizedReader message( - Types.StructType expected, MessageType message, - List> fieldReaders) { + Types.StructType expected, MessageType message, List> fieldReaders) { GroupType groupType = message.asGroupType(); Map> readersById = Maps.newHashMap(); List fields = groupType.getFields(); @@ -73,11 +73,11 @@ public VectorizedReader message( .filter(pos -> fields.get(pos).getId() != null) .forEach(pos -> readersById.put(fields.get(pos).getId().intValue(), fieldReaders.get(pos))); - List icebergFields = expected != null ? - expected.fields() : ImmutableList.of(); + List icebergFields = + expected != null ? expected.fields() : ImmutableList.of(); - List> reorderedFields = Lists.newArrayListWithExpectedSize( - icebergFields.size()); + List> reorderedFields = + Lists.newArrayListWithExpectedSize(icebergFields.size()); for (Types.NestedField field : icebergFields) { int id = field.fieldId(); @@ -107,18 +107,17 @@ protected VectorizedReader vectorizedReader(List> reorder @Override public VectorizedReader struct( - Types.StructType expected, GroupType groupType, - List> fieldReaders) { + Types.StructType expected, GroupType groupType, List> fieldReaders) { if (expected != null) { - throw new UnsupportedOperationException("Vectorized reads are not supported yet for struct fields"); + throw new UnsupportedOperationException( + "Vectorized reads are not supported yet for struct fields"); } return null; } @Override public VectorizedReader primitive( - org.apache.iceberg.types.Type.PrimitiveType expected, - PrimitiveType primitive) { + org.apache.iceberg.types.Type.PrimitiveType expected, PrimitiveType primitive) { // Create arrow vector for this field if (primitive.getId() == null) { diff --git a/arrow/src/main/java/org/apache/iceberg/arrow/vectorized/VectorizedTableScanIterable.java b/arrow/src/main/java/org/apache/iceberg/arrow/vectorized/VectorizedTableScanIterable.java index e5a81e8f9604..3a4ef34adae6 100644 --- a/arrow/src/main/java/org/apache/iceberg/arrow/vectorized/VectorizedTableScanIterable.java +++ b/arrow/src/main/java/org/apache/iceberg/arrow/vectorized/VectorizedTableScanIterable.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.arrow.vectorized; import java.io.IOException; @@ -27,10 +26,11 @@ import org.apache.iceberg.io.CloseableIterator; /** - * A vectorized implementation of the Iceberg reader that iterates over the table scan. - * See {@link ArrowReader} for details. + * A vectorized implementation of the Iceberg reader that iterates over the table scan. See {@link + * ArrowReader} for details. 
*/ -public class VectorizedTableScanIterable extends CloseableGroup implements CloseableIterable { +public class VectorizedTableScanIterable extends CloseableGroup + implements CloseableIterable { private static final int BATCH_SIZE_IN_NUM_ROWS = 1 << 16; @@ -39,9 +39,8 @@ public class VectorizedTableScanIterable extends CloseableGroup implements Close /** * Create a new instance using default values for {@code batchSize} and {@code reuseContainers}. - * The {@code batchSize} is set to {@link #BATCH_SIZE_IN_NUM_ROWS} and {@code reuseContainers} - * is set to {@code false}. - * + * The {@code batchSize} is set to {@link #BATCH_SIZE_IN_NUM_ROWS} and {@code reuseContainers} is + * set to {@code false}. */ public VectorizedTableScanIterable(TableScan scan) { this(scan, BATCH_SIZE_IN_NUM_ROWS, false); @@ -50,7 +49,7 @@ public VectorizedTableScanIterable(TableScan scan) { /** * Create a new instance. * - * See {@link ArrowReader#ArrowReader(TableScan, int, boolean)} for details. + *
<p>
See {@link ArrowReader#ArrowReader(TableScan, int, boolean)} for details. */ public VectorizedTableScanIterable(TableScan scan, int batchSize, boolean reuseContainers) { this.reader = new ArrowReader(scan, batchSize, reuseContainers); diff --git a/arrow/src/main/java/org/apache/iceberg/arrow/vectorized/parquet/BaseVectorizedParquetValuesReader.java b/arrow/src/main/java/org/apache/iceberg/arrow/vectorized/parquet/BaseVectorizedParquetValuesReader.java index 5ef6efaba1c9..247f4d4f849e 100644 --- a/arrow/src/main/java/org/apache/iceberg/arrow/vectorized/parquet/BaseVectorizedParquetValuesReader.java +++ b/arrow/src/main/java/org/apache/iceberg/arrow/vectorized/parquet/BaseVectorizedParquetValuesReader.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.arrow.vectorized.parquet; import java.io.IOException; @@ -30,13 +29,13 @@ import org.apache.parquet.io.ParquetDecodingException; /** - * A values reader for Parquet's run-length encoded data that reads column data in batches instead of one value at a - * time. This is based off of the VectorizedRleValuesReader class in Apache Spark with these changes: - *
<p>
- * Writes batches of values retrieved to Arrow vectors. If all pages of a column within the row group - * are not dictionary encoded, then dictionary ids are eagerly decoded into actual values before - * writing them to the Arrow vectors - *
<p>
+ * A values reader for Parquet's run-length encoded data that reads column data in batches instead + * of one value at a time. This is based off of the VectorizedRleValuesReader class in Apache Spark + * with these changes: + * + *
<p>
Writes batches of values retrieved to Arrow vectors. If all pages of a column within the row + * group are not dictionary encoded, then dictionary ids are eagerly decoded into actual values + * before writing them to the Arrow vectors */ @SuppressWarnings("checkstyle:VisibilityModifier") public class BaseVectorizedParquetValuesReader extends ValuesReader { @@ -80,12 +79,13 @@ public BaseVectorizedParquetValuesReader(int maxDefLevel, boolean setValidityVec this.setArrowValidityVector = setValidityVector; } - public BaseVectorizedParquetValuesReader(int bitWidth, int maxDefLevel, boolean setValidityVector) { + public BaseVectorizedParquetValuesReader( + int bitWidth, int maxDefLevel, boolean setValidityVector) { this(bitWidth, maxDefLevel, bitWidth != 0, setValidityVector); } - public BaseVectorizedParquetValuesReader(int bitWidth, int maxDefLevel, boolean readLength, - boolean setValidityVector) { + public BaseVectorizedParquetValuesReader( + int bitWidth, int maxDefLevel, boolean readLength, boolean setValidityVector) { this.fixedWidth = true; this.readLength = readLength; this.maxDefLevel = maxDefLevel; @@ -118,9 +118,7 @@ public void initFromPage(int valueCount, ByteBufferInputStream in) throws IOExce } } - /** - * Initializes the internal state for decoding ints of `bitWidth`. - */ + /** Initializes the internal state for decoding ints of `bitWidth`. */ private void init(int bw) { Preconditions.checkArgument(bw >= 0 && bw <= 32, "bitWidth must be >= 0 and <= 32"); this.bitWidth = bw; @@ -128,9 +126,7 @@ private void init(int bw) { this.packer = Packer.LITTLE_ENDIAN.newBytePacker(bw); } - /** - * Reads the next varint encoded int. - */ + /** Reads the next varint encoded int. */ private int readUnsignedVarInt() throws IOException { int value = 0; int shift = 0; @@ -143,9 +139,7 @@ private int readUnsignedVarInt() throws IOException { return value; } - /** - * Reads the next 4 byte little endian int. - */ + /** Reads the next 4 byte little endian int. */ private int readIntLittleEndian() throws IOException { int ch4 = inputStream.read(); int ch3 = inputStream.read(); @@ -154,36 +148,35 @@ private int readIntLittleEndian() throws IOException { return (ch1 << 24) + (ch2 << 16) + (ch3 << 8) + (ch4 << 0); } - /** - * Reads the next byteWidth little endian int. - */ + /** Reads the next byteWidth little endian int. */ private int readIntLittleEndianPaddedOnBitWidth() throws IOException { switch (bytesWidth) { case 0: return 0; case 1: return inputStream.read(); - case 2: { - int ch2 = inputStream.read(); - int ch1 = inputStream.read(); - return (ch1 << 8) + ch2; - } - case 3: { - int ch3 = inputStream.read(); - int ch2 = inputStream.read(); - int ch1 = inputStream.read(); - return (ch1 << 16) + (ch2 << 8) + (ch3 << 0); - } - case 4: { - return readIntLittleEndian(); - } + case 2: + { + int ch2 = inputStream.read(); + int ch1 = inputStream.read(); + return (ch1 << 8) + ch2; + } + case 3: + { + int ch3 = inputStream.read(); + int ch2 = inputStream.read(); + int ch1 = inputStream.read(); + return (ch1 << 16) + (ch2 << 8) + (ch3 << 0); + } + case 4: + { + return readIntLittleEndian(); + } } throw new RuntimeException("Non-supported bytesWidth: " + bytesWidth); } - /** - * Reads the next group. - */ + /** Reads the next group. 
*/ void readNextGroup() { try { int header = readUnsignedVarInt(); @@ -204,7 +197,8 @@ void readNextGroup() { while (valueIndex < this.currentCount) { // values are bit packed 8 at a time, so reading bitWidth will always work ByteBuffer buffer = inputStream.slice(bitWidth); - this.packer.unpack8Values(buffer, buffer.position(), this.packedValuesBuffer, valueIndex); + this.packer.unpack8Values( + buffer, buffer.position(), this.packedValuesBuffer, valueIndex); valueIndex += 8; } return; diff --git a/arrow/src/main/java/org/apache/iceberg/arrow/vectorized/parquet/DecimalVectorUtil.java b/arrow/src/main/java/org/apache/iceberg/arrow/vectorized/parquet/DecimalVectorUtil.java index 5e6855a91a8e..f6fe062b561b 100644 --- a/arrow/src/main/java/org/apache/iceberg/arrow/vectorized/parquet/DecimalVectorUtil.java +++ b/arrow/src/main/java/org/apache/iceberg/arrow/vectorized/parquet/DecimalVectorUtil.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.arrow.vectorized.parquet; import java.util.Arrays; @@ -25,8 +24,7 @@ public class DecimalVectorUtil { - private DecimalVectorUtil() { - } + private DecimalVectorUtil() {} public static void setBigEndian(DecimalVector vector, int idx, byte[] value) { byte[] paddedBytes = DecimalVectorUtil.padBigEndianBytes(value, DecimalVector.TYPE_WIDTH); @@ -34,14 +32,14 @@ public static void setBigEndian(DecimalVector vector, int idx, byte[] value) { } /** - * Parquet stores decimal values in big-endian byte order, and Arrow stores them in native byte order. - * When setting the value in Arrow, we call setBigEndian(), and the byte order is reversed if needed. - * Also, the byte array is padded to fill 16 bytes in length by calling Unsafe.setMemory(). The padding - * operation can be slow, so by using this utility method, we can pad before calling setBigEndian() and - * avoid the call to Unsafe.setMemory(). + * Parquet stores decimal values in big-endian byte order, and Arrow stores them in native byte + * order. When setting the value in Arrow, we call setBigEndian(), and the byte order is reversed + * if needed. Also, the byte array is padded to fill 16 bytes in length by calling + * Unsafe.setMemory(). The padding operation can be slow, so by using this utility method, we can + * pad before calling setBigEndian() and avoid the call to Unsafe.setMemory(). * * @param bigEndianBytes The big endian bytes - * @param newLength The length of the byte array to return + * @param newLength The length of the byte array to return * @return The new byte array */ @VisibleForTesting @@ -62,8 +60,9 @@ static byte[] padBigEndianBytes(byte[] bigEndianBytes, int newLength) { return result; } - throw new IllegalArgumentException(String.format("Buffer size of %d is larger than requested size of %d", + throw new IllegalArgumentException( + String.format( + "Buffer size of %d is larger than requested size of %d", bigEndianBytes.length, newLength)); } - } diff --git a/arrow/src/main/java/org/apache/iceberg/arrow/vectorized/parquet/VectorizedColumnIterator.java b/arrow/src/main/java/org/apache/iceberg/arrow/vectorized/parquet/VectorizedColumnIterator.java index c87a80e13469..094e306d5bf1 100644 --- a/arrow/src/main/java/org/apache/iceberg/arrow/vectorized/parquet/VectorizedColumnIterator.java +++ b/arrow/src/main/java/org/apache/iceberg/arrow/vectorized/parquet/VectorizedColumnIterator.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.arrow.vectorized.parquet; import org.apache.arrow.vector.FieldVector; @@ -30,19 +29,22 @@ import org.apache.parquet.column.page.PageReader; /** - * Vectorized version of the ColumnIterator that reads column values in data pages of a column in a row group in a - * batched fashion. + * Vectorized version of the ColumnIterator that reads column values in data pages of a column in a + * row group in a batched fashion. */ public class VectorizedColumnIterator extends BaseColumnIterator { private final VectorizedPageIterator vectorizedPageIterator; private int batchSize; - public VectorizedColumnIterator(ColumnDescriptor desc, String writerVersion, boolean setArrowValidityVector) { + public VectorizedColumnIterator( + ColumnDescriptor desc, String writerVersion, boolean setArrowValidityVector) { super(desc); - Preconditions.checkArgument(desc.getMaxRepetitionLevel() == 0, + Preconditions.checkArgument( + desc.getMaxRepetitionLevel() == 0, "Only non-nested columns are supported for vectorized reads"); - this.vectorizedPageIterator = new VectorizedPageIterator(desc, writerVersion, setArrowValidityVector); + this.vectorizedPageIterator = + new VectorizedPageIterator(desc, writerVersion, setArrowValidityVector); } public void setBatchSize(int batchSize) { @@ -71,8 +73,8 @@ public void nextBatch(FieldVector fieldVector, int typeWidth, NullabilityHolder int rowsReadSoFar = 0; while (rowsReadSoFar < batchSize && hasNext()) { advance(); - int rowsInThisBatch = nextBatchOf(fieldVector, batchSize - rowsReadSoFar, - rowsReadSoFar, typeWidth, holder); + int rowsInThisBatch = + nextBatchOf(fieldVector, batchSize - rowsReadSoFar, rowsReadSoFar, typeWidth, holder); rowsReadSoFar += rowsInThisBatch; triplesRead += rowsInThisBatch; fieldVector.setValueCount(rowsReadSoFar); @@ -80,15 +82,23 @@ public void nextBatch(FieldVector fieldVector, int typeWidth, NullabilityHolder } protected abstract int nextBatchOf( - FieldVector vector, int expectedBatchSize, int numValsInVector, int typeWidth, NullabilityHolder holder); + FieldVector vector, + int expectedBatchSize, + int numValsInVector, + int typeWidth, + NullabilityHolder holder); } public class IntegerBatchReader extends BatchReader { @Override protected int nextBatchOf( - final FieldVector vector, final int expectedBatchSize, final int numValsInVector, final int typeWidth, + final FieldVector vector, + final int expectedBatchSize, + final int numValsInVector, + final int typeWidth, NullabilityHolder holder) { - return vectorizedPageIterator.intPageReader() + return vectorizedPageIterator + .intPageReader() .nextBatch(vector, expectedBatchSize, numValsInVector, typeWidth, holder); } } @@ -96,19 +106,26 @@ protected int nextBatchOf( public class DictionaryBatchReader extends BatchReader { @Override protected int nextBatchOf( - final FieldVector vector, final int expectedBatchSize, final int numValsInVector, final int typeWidth, + final FieldVector vector, + final int expectedBatchSize, + final int numValsInVector, + final int typeWidth, NullabilityHolder holder) { - return vectorizedPageIterator.nextBatchDictionaryIds((IntVector) vector, expectedBatchSize, numValsInVector, - holder); + return vectorizedPageIterator.nextBatchDictionaryIds( + (IntVector) vector, expectedBatchSize, numValsInVector, holder); } } public class LongBatchReader extends BatchReader { @Override protected int nextBatchOf( - final FieldVector vector, final int expectedBatchSize, final int numValsInVector, final int typeWidth, + final FieldVector vector, + final 
int expectedBatchSize, + final int numValsInVector, + final int typeWidth, NullabilityHolder holder) { - return vectorizedPageIterator.longPageReader() + return vectorizedPageIterator + .longPageReader() .nextBatch(vector, expectedBatchSize, numValsInVector, typeWidth, holder); } } @@ -116,19 +133,27 @@ protected int nextBatchOf( public class TimestampMillisBatchReader extends BatchReader { @Override protected int nextBatchOf( - final FieldVector vector, final int expectedBatchSize, final int numValsInVector, final int typeWidth, + final FieldVector vector, + final int expectedBatchSize, + final int numValsInVector, + final int typeWidth, NullabilityHolder holder) { - return vectorizedPageIterator.timestampMillisPageReader().nextBatch(vector, expectedBatchSize, numValsInVector, - typeWidth, holder); + return vectorizedPageIterator + .timestampMillisPageReader() + .nextBatch(vector, expectedBatchSize, numValsInVector, typeWidth, holder); } } public class FloatBatchReader extends BatchReader { @Override protected int nextBatchOf( - final FieldVector vector, final int expectedBatchSize, final int numValsInVector, final int typeWidth, + final FieldVector vector, + final int expectedBatchSize, + final int numValsInVector, + final int typeWidth, NullabilityHolder holder) { - return vectorizedPageIterator.floatPageReader() + return vectorizedPageIterator + .floatPageReader() .nextBatch(vector, expectedBatchSize, numValsInVector, typeWidth, holder); } } @@ -136,9 +161,13 @@ protected int nextBatchOf( public class DoubleBatchReader extends BatchReader { @Override protected int nextBatchOf( - final FieldVector vector, final int expectedBatchSize, final int numValsInVector, final int typeWidth, + final FieldVector vector, + final int expectedBatchSize, + final int numValsInVector, + final int typeWidth, NullabilityHolder holder) { - return vectorizedPageIterator.doublePageReader() + return vectorizedPageIterator + .doublePageReader() .nextBatch(vector, expectedBatchSize, numValsInVector, typeWidth, holder); } } @@ -146,9 +175,13 @@ protected int nextBatchOf( public class IntBackedDecimalBatchReader extends BatchReader { @Override protected int nextBatchOf( - final FieldVector vector, final int expectedBatchSize, final int numValsInVector, final int typeWidth, + final FieldVector vector, + final int expectedBatchSize, + final int numValsInVector, + final int typeWidth, NullabilityHolder holder) { - return vectorizedPageIterator.intBackedDecimalPageReader() + return vectorizedPageIterator + .intBackedDecimalPageReader() .nextBatch(vector, expectedBatchSize, numValsInVector, typeWidth, holder); } } @@ -156,60 +189,84 @@ protected int nextBatchOf( public class LongBackedDecimalBatchReader extends BatchReader { @Override protected int nextBatchOf( - final FieldVector vector, final int expectedBatchSize, final int numValsInVector, final int typeWidth, + final FieldVector vector, + final int expectedBatchSize, + final int numValsInVector, + final int typeWidth, NullabilityHolder holder) { - return vectorizedPageIterator.longBackedDecimalPageReader().nextBatch(vector, expectedBatchSize, numValsInVector, - typeWidth, holder); + return vectorizedPageIterator + .longBackedDecimalPageReader() + .nextBatch(vector, expectedBatchSize, numValsInVector, typeWidth, holder); } } public class FixedLengthDecimalBatchReader extends BatchReader { @Override protected int nextBatchOf( - final FieldVector vector, final int expectedBatchSize, final int numValsInVector, final int typeWidth, + final FieldVector vector, + final 
int expectedBatchSize, + final int numValsInVector, + final int typeWidth, NullabilityHolder holder) { - return vectorizedPageIterator.fixedLengthDecimalPageReader().nextBatch(vector, expectedBatchSize, numValsInVector, - typeWidth, holder); + return vectorizedPageIterator + .fixedLengthDecimalPageReader() + .nextBatch(vector, expectedBatchSize, numValsInVector, typeWidth, holder); } } public class FixedSizeBinaryBatchReader extends BatchReader { @Override protected int nextBatchOf( - final FieldVector vector, final int expectedBatchSize, final int numValsInVector, final int typeWidth, + final FieldVector vector, + final int expectedBatchSize, + final int numValsInVector, + final int typeWidth, NullabilityHolder holder) { - return vectorizedPageIterator.fixedSizeBinaryPageReader().nextBatch(vector, expectedBatchSize, numValsInVector, - typeWidth, holder); + return vectorizedPageIterator + .fixedSizeBinaryPageReader() + .nextBatch(vector, expectedBatchSize, numValsInVector, typeWidth, holder); } } public class VarWidthTypeBatchReader extends BatchReader { @Override protected int nextBatchOf( - final FieldVector vector, final int expectedBatchSize, final int numValsInVector, final int typeWidth, + final FieldVector vector, + final int expectedBatchSize, + final int numValsInVector, + final int typeWidth, NullabilityHolder holder) { - return vectorizedPageIterator.varWidthTypePageReader().nextBatch(vector, expectedBatchSize, numValsInVector, - typeWidth, holder); + return vectorizedPageIterator + .varWidthTypePageReader() + .nextBatch(vector, expectedBatchSize, numValsInVector, typeWidth, holder); } } public class FixedWidthTypeBinaryBatchReader extends BatchReader { @Override protected int nextBatchOf( - final FieldVector vector, final int expectedBatchSize, final int numValsInVector, final int typeWidth, + final FieldVector vector, + final int expectedBatchSize, + final int numValsInVector, + final int typeWidth, NullabilityHolder holder) { - return vectorizedPageIterator.fixedWidthBinaryPageReader().nextBatch(vector, expectedBatchSize, numValsInVector, - typeWidth, holder); + return vectorizedPageIterator + .fixedWidthBinaryPageReader() + .nextBatch(vector, expectedBatchSize, numValsInVector, typeWidth, holder); } } public class BooleanBatchReader extends BatchReader { @Override protected int nextBatchOf( - final FieldVector vector, final int expectedBatchSize, final int numValsInVector, final int typeWidth, + final FieldVector vector, + final int expectedBatchSize, + final int numValsInVector, + final int typeWidth, NullabilityHolder holder) { - return vectorizedPageIterator.booleanPageReader().nextBatch(vector, expectedBatchSize, numValsInVector, - typeWidth, holder); + return vectorizedPageIterator + .booleanPageReader() + .nextBatch(vector, expectedBatchSize, numValsInVector, typeWidth, holder); } } diff --git a/arrow/src/main/java/org/apache/iceberg/arrow/vectorized/parquet/VectorizedDictionaryEncodedParquetValuesReader.java b/arrow/src/main/java/org/apache/iceberg/arrow/vectorized/parquet/VectorizedDictionaryEncodedParquetValuesReader.java index b930f35ebfd5..96071fe75ccc 100644 --- a/arrow/src/main/java/org/apache/iceberg/arrow/vectorized/parquet/VectorizedDictionaryEncodedParquetValuesReader.java +++ b/arrow/src/main/java/org/apache/iceberg/arrow/vectorized/parquet/VectorizedDictionaryEncodedParquetValuesReader.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.arrow.vectorized.parquet; import java.nio.ByteBuffer; @@ -31,19 +30,25 @@ /** * This decoder reads Parquet dictionary encoded data in a vectorized fashion. Unlike other - * vectorized readers, methods in this decoder don't need to read definition levels. In other - * words, these methods are called when there are non-null values to be read. + * vectorized readers, methods in this decoder don't need to read definition levels. In other words, + * these methods are called when there are non-null values to be read. */ -public class VectorizedDictionaryEncodedParquetValuesReader extends BaseVectorizedParquetValuesReader { +public class VectorizedDictionaryEncodedParquetValuesReader + extends BaseVectorizedParquetValuesReader { - public VectorizedDictionaryEncodedParquetValuesReader(int maxDefLevel, boolean setValidityVector) { + public VectorizedDictionaryEncodedParquetValuesReader( + int maxDefLevel, boolean setValidityVector) { super(maxDefLevel, setValidityVector); } abstract class BaseDictEncodedReader { public void nextBatch( - FieldVector vector, int startOffset, int numValuesToRead, Dictionary dict, - NullabilityHolder nullabilityHolder, int typeWidth) { + FieldVector vector, + int startOffset, + int numValuesToRead, + Dictionary dict, + NullabilityHolder nullabilityHolder, + int typeWidth) { int left = numValuesToRead; int idx = startOffset; while (left > 0) { @@ -72,54 +77,62 @@ public void nextBatch( } } - protected abstract void nextVal(FieldVector vector, Dictionary dict, int idx, int currentVal, int typeWidth); + protected abstract void nextVal( + FieldVector vector, Dictionary dict, int idx, int currentVal, int typeWidth); } class DictionaryIdReader extends BaseDictEncodedReader { @Override - protected void nextVal(FieldVector vector, Dictionary dict, int idx, int currentVal, int typeWidth) { + protected void nextVal( + FieldVector vector, Dictionary dict, int idx, int currentVal, int typeWidth) { ((IntVector) vector).set(idx, currentVal); } } class LongDictEncodedReader extends BaseDictEncodedReader { @Override - protected void nextVal(FieldVector vector, Dictionary dict, int idx, int currentVal, int typeWidth) { + protected void nextVal( + FieldVector vector, Dictionary dict, int idx, int currentVal, int typeWidth) { vector.getDataBuffer().setLong(idx, dict.decodeToLong(currentVal)); } } class TimestampMillisDictEncodedReader extends BaseDictEncodedReader { @Override - protected void nextVal(FieldVector vector, Dictionary dict, int idx, int currentVal, int typeWidth) { + protected void nextVal( + FieldVector vector, Dictionary dict, int idx, int currentVal, int typeWidth) { vector.getDataBuffer().setLong(idx, dict.decodeToLong(currentVal) * 1000); } } class IntegerDictEncodedReader extends BaseDictEncodedReader { @Override - protected void nextVal(FieldVector vector, Dictionary dict, int idx, int currentVal, int typeWidth) { + protected void nextVal( + FieldVector vector, Dictionary dict, int idx, int currentVal, int typeWidth) { vector.getDataBuffer().setInt(idx, dict.decodeToInt(currentVal)); } } class FloatDictEncodedReader extends BaseDictEncodedReader { @Override - protected void nextVal(FieldVector vector, Dictionary dict, int idx, int currentVal, int typeWidth) { + protected void nextVal( + FieldVector vector, Dictionary dict, int idx, int currentVal, int typeWidth) { vector.getDataBuffer().setFloat(idx, dict.decodeToFloat(currentVal)); } } class DoubleDictEncodedReader extends BaseDictEncodedReader { @Override - protected void 
nextVal(FieldVector vector, Dictionary dict, int idx, int currentVal, int typeWidth) { + protected void nextVal( + FieldVector vector, Dictionary dict, int idx, int currentVal, int typeWidth) { vector.getDataBuffer().setDouble(idx, dict.decodeToDouble(currentVal)); } } class FixedWidthBinaryDictEncodedReader extends BaseDictEncodedReader { @Override - protected void nextVal(FieldVector vector, Dictionary dict, int idx, int currentVal, int typeWidth) { + protected void nextVal( + FieldVector vector, Dictionary dict, int idx, int currentVal, int typeWidth) { ByteBuffer buffer = dict.decodeToBinary(currentVal).toByteBuffer(); vector.getDataBuffer().setBytes(idx, buffer); } @@ -127,7 +140,8 @@ protected void nextVal(FieldVector vector, Dictionary dict, int idx, int current class FixedLengthDecimalDictEncodedReader extends BaseDictEncodedReader { @Override - protected void nextVal(FieldVector vector, Dictionary dict, int idx, int currentVal, int typeWidth) { + protected void nextVal( + FieldVector vector, Dictionary dict, int idx, int currentVal, int typeWidth) { byte[] bytes = dict.decodeToBinary(currentVal).getBytesUnsafe(); DecimalVectorUtil.setBigEndian((DecimalVector) vector, idx, bytes); } @@ -135,30 +149,38 @@ protected void nextVal(FieldVector vector, Dictionary dict, int idx, int current class VarWidthBinaryDictEncodedReader extends BaseDictEncodedReader { @Override - protected void nextVal(FieldVector vector, Dictionary dict, int idx, int currentVal, int typeWidth) { + protected void nextVal( + FieldVector vector, Dictionary dict, int idx, int currentVal, int typeWidth) { ByteBuffer buffer = dict.decodeToBinary(currentVal).toByteBuffer(); - ((BaseVariableWidthVector) vector).setSafe(idx, buffer.array(), - buffer.position() + buffer.arrayOffset(), buffer.limit() - buffer.position()); + ((BaseVariableWidthVector) vector) + .setSafe( + idx, + buffer.array(), + buffer.position() + buffer.arrayOffset(), + buffer.limit() - buffer.position()); } } class IntBackedDecimalDictEncodedReader extends BaseDictEncodedReader { @Override - protected void nextVal(FieldVector vector, Dictionary dict, int idx, int currentVal, int typeWidth) { + protected void nextVal( + FieldVector vector, Dictionary dict, int idx, int currentVal, int typeWidth) { ((DecimalVector) vector).set(idx, dict.decodeToInt(currentVal)); } } class LongBackedDecimalDictEncodedReader extends BaseDictEncodedReader { @Override - protected void nextVal(FieldVector vector, Dictionary dict, int idx, int currentVal, int typeWidth) { + protected void nextVal( + FieldVector vector, Dictionary dict, int idx, int currentVal, int typeWidth) { ((DecimalVector) vector).set(idx, dict.decodeToLong(currentVal)); } } class FixedSizeBinaryDictEncodedReader extends BaseDictEncodedReader { @Override - protected void nextVal(FieldVector vector, Dictionary dict, int idx, int currentVal, int typeWidth) { + protected void nextVal( + FieldVector vector, Dictionary dict, int idx, int currentVal, int typeWidth) { byte[] bytes = dict.decodeToBinary(currentVal).getBytesUnsafe(); byte[] vectorBytes = new byte[typeWidth]; System.arraycopy(bytes, 0, vectorBytes, 0, typeWidth); diff --git a/arrow/src/main/java/org/apache/iceberg/arrow/vectorized/parquet/VectorizedPageIterator.java b/arrow/src/main/java/org/apache/iceberg/arrow/vectorized/parquet/VectorizedPageIterator.java index 5cce5286b79f..47d9ae1da82b 100644 --- a/arrow/src/main/java/org/apache/iceberg/arrow/vectorized/parquet/VectorizedPageIterator.java +++ 
b/arrow/src/main/java/org/apache/iceberg/arrow/vectorized/parquet/VectorizedPageIterator.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.arrow.vectorized.parquet; import java.io.IOException; @@ -42,7 +41,8 @@ public class VectorizedPageIterator extends BasePageIterator { private final boolean setArrowValidityVector; - public VectorizedPageIterator(ColumnDescriptor desc, String writerVersion, boolean setValidityVector) { + public VectorizedPageIterator( + ColumnDescriptor desc, String writerVersion, boolean setValidityVector) { super(desc, writerVersion); this.setArrowValidityVector = setValidityVector; } @@ -77,11 +77,15 @@ protected void initDataReader(Encoding dataEncoding, ByteBufferInputStream in, i if (dataEncoding.usesDictionary()) { if (dictionary == null) { throw new ParquetDecodingException( - "could not read page in col " + desc + " as the dictionary was missing for encoding " + dataEncoding); + "could not read page in col " + + desc + + " as the dictionary was missing for encoding " + + dataEncoding); } try { dictionaryEncodedValuesReader = - new VectorizedDictionaryEncodedParquetValuesReader(desc.getMaxDefinitionLevel(), setArrowValidityVector); + new VectorizedDictionaryEncodedParquetValuesReader( + desc.getMaxDefinitionLevel(), setArrowValidityVector); dictionaryEncodedValuesReader.initFromPage(valueCount, in); if (ParquetUtil.isIntType(desc.getPrimitiveType()) || !allPagesDictEncoded) { dictionaryDecodeMode = DictionaryDecodeMode.EAGER; @@ -93,15 +97,20 @@ protected void initDataReader(Encoding dataEncoding, ByteBufferInputStream in, i } } else { if (dataEncoding != Encoding.PLAIN) { - throw new UnsupportedOperationException("Cannot support vectorized reads for column " + desc + " with " + - "encoding " + dataEncoding + ". Disable vectorized reads to read this table/file"); + throw new UnsupportedOperationException( + "Cannot support vectorized reads for column " + + desc + + " with " + + "encoding " + + dataEncoding + + ". 
Disable vectorized reads to read this table/file"); } plainValuesReader = new ValuesAsBytesReader(); plainValuesReader.initFromPage(valueCount, in); dictionaryDecodeMode = DictionaryDecodeMode.NONE; } - if (CorruptDeltaByteArrays.requiresSequentialReads(writerVersion, dataEncoding) && - previousReader instanceof RequiresPreviousReader) { + if (CorruptDeltaByteArrays.requiresSequentialReads(writerVersion, dataEncoding) + && previousReader instanceof RequiresPreviousReader) { // previous reader can only be set if reading sequentially ((RequiresPreviousReader) plainValuesReader).setPreviousReader(previousReader); } @@ -113,36 +122,50 @@ public boolean producesDictionaryEncodedVector() { @Override protected void initDefinitionLevelsReader( - DataPageV1 dataPageV1, ColumnDescriptor desc, ByteBufferInputStream in, - int triplesCount) throws IOException { + DataPageV1 dataPageV1, ColumnDescriptor desc, ByteBufferInputStream in, int triplesCount) + throws IOException { int bitWidth = BytesUtils.getWidthFromMaxInt(desc.getMaxDefinitionLevel()); - this.vectorizedDefinitionLevelReader = new VectorizedParquetDefinitionLevelReader(bitWidth, - desc.getMaxDefinitionLevel(), setArrowValidityVector); + this.vectorizedDefinitionLevelReader = + new VectorizedParquetDefinitionLevelReader( + bitWidth, desc.getMaxDefinitionLevel(), setArrowValidityVector); this.vectorizedDefinitionLevelReader.initFromPage(triplesCount, in); } @Override - protected void initDefinitionLevelsReader(DataPageV2 dataPageV2, ColumnDescriptor desc) throws IOException { + protected void initDefinitionLevelsReader(DataPageV2 dataPageV2, ColumnDescriptor desc) + throws IOException { int bitWidth = BytesUtils.getWidthFromMaxInt(desc.getMaxDefinitionLevel()); // do not read the length from the stream. v2 pages handle dividing the page bytes. - this.vectorizedDefinitionLevelReader = new VectorizedParquetDefinitionLevelReader(bitWidth, - desc.getMaxDefinitionLevel(), false, setArrowValidityVector); + this.vectorizedDefinitionLevelReader = + new VectorizedParquetDefinitionLevelReader( + bitWidth, desc.getMaxDefinitionLevel(), false, setArrowValidityVector); this.vectorizedDefinitionLevelReader.initFromPage( dataPageV2.getValueCount(), dataPageV2.getDefinitionLevels().toInputStream()); } /** - * Method for reading a batch of dictionary ids from the dictionary encoded data pages. Like definition levels, - * dictionary ids in Parquet are RLE/bin-packed encoded as well. + * Method for reading a batch of dictionary ids from the dictionary encoded data pages. Like + * definition levels, dictionary ids in Parquet are RLE/bin-packed encoded as well. 
*/ public int nextBatchDictionaryIds( - final IntVector vector, final int expectedBatchSize, final int numValsInVector, NullabilityHolder holder) { + final IntVector vector, + final int expectedBatchSize, + final int numValsInVector, + NullabilityHolder holder) { final int actualBatchSize = getActualBatchSize(expectedBatchSize); if (actualBatchSize <= 0) { return 0; } - vectorizedDefinitionLevelReader.dictionaryIdReader().nextDictEncodedBatch(vector, numValsInVector, -1, - actualBatchSize, holder, dictionaryEncodedValuesReader, null); + vectorizedDefinitionLevelReader + .dictionaryIdReader() + .nextDictEncodedBatch( + vector, + numValsInVector, + -1, + actualBatchSize, + holder, + dictionaryEncodedValuesReader, + null); triplesRead += actualBatchSize; this.hasNext = triplesRead < triplesCount; return actualBatchSize; @@ -150,7 +173,11 @@ public int nextBatchDictionaryIds( abstract class BagePageReader { public int nextBatch( - FieldVector vector, int expectedBatchSize, int numValsInVector, int typeWidth, NullabilityHolder holder) { + FieldVector vector, + int expectedBatchSize, + int numValsInVector, + int typeWidth, + NullabilityHolder holder) { final int actualBatchSize = getActualBatchSize(expectedBatchSize); if (actualBatchSize <= 0) { return 0; @@ -167,103 +194,146 @@ public int nextBatch( protected abstract void nextVal( FieldVector vector, int batchSize, int numVals, int typeWidth, NullabilityHolder holder); + protected abstract void nextDictEncodedVal( FieldVector vector, int batchSize, int numVals, int typeWidth, NullabilityHolder holder); } - /** - * Method for reading a batch of values of INT32 data type - */ + /** Method for reading a batch of values of INT32 data type */ class IntPageReader extends BagePageReader { @Override - protected void nextVal(FieldVector vector, int batchSize, int numVals, int typeWidth, NullabilityHolder holder) { - vectorizedDefinitionLevelReader.integerReader().nextBatch(vector, numVals, typeWidth, batchSize, - holder, plainValuesReader); + protected void nextVal( + FieldVector vector, int batchSize, int numVals, int typeWidth, NullabilityHolder holder) { + vectorizedDefinitionLevelReader + .integerReader() + .nextBatch(vector, numVals, typeWidth, batchSize, holder, plainValuesReader); } @Override protected void nextDictEncodedVal( FieldVector vector, int batchSize, int numVals, int typeWidth, NullabilityHolder holder) { - vectorizedDefinitionLevelReader.integerReader().nextDictEncodedBatch(vector, numVals, typeWidth, batchSize, - holder, dictionaryEncodedValuesReader, dictionary); + vectorizedDefinitionLevelReader + .integerReader() + .nextDictEncodedBatch( + vector, + numVals, + typeWidth, + batchSize, + holder, + dictionaryEncodedValuesReader, + dictionary); } } - /** - * Method for reading a batch of values of INT64 data type - */ + /** Method for reading a batch of values of INT64 data type */ class LongPageReader extends BagePageReader { @Override - protected void nextVal(FieldVector vector, int batchSize, int numVals, int typeWidth, NullabilityHolder holder) { - vectorizedDefinitionLevelReader.longReader().nextBatch(vector, numVals, typeWidth, - batchSize, holder, plainValuesReader); + protected void nextVal( + FieldVector vector, int batchSize, int numVals, int typeWidth, NullabilityHolder holder) { + vectorizedDefinitionLevelReader + .longReader() + .nextBatch(vector, numVals, typeWidth, batchSize, holder, plainValuesReader); } @Override protected void nextDictEncodedVal( FieldVector vector, int batchSize, int numVals, int typeWidth, 
NullabilityHolder holder) { - vectorizedDefinitionLevelReader.longReader().nextDictEncodedBatch(vector, numVals, typeWidth, - batchSize, holder, dictionaryEncodedValuesReader, dictionary); + vectorizedDefinitionLevelReader + .longReader() + .nextDictEncodedBatch( + vector, + numVals, + typeWidth, + batchSize, + holder, + dictionaryEncodedValuesReader, + dictionary); } } /** - * Method for reading a batch of values of TIMESTAMP_MILLIS data type. In iceberg, TIMESTAMP - * is always represented in micro-seconds. So we multiply values stored in millis with 1000 - * before writing them to the vector. + * Method for reading a batch of values of TIMESTAMP_MILLIS data type. In iceberg, TIMESTAMP is + * always represented in micro-seconds. So we multiply values stored in millis with 1000 before + * writing them to the vector. */ class TimestampMillisPageReader extends BagePageReader { @Override - protected void nextVal(FieldVector vector, int batchSize, int numVals, int typeWidth, NullabilityHolder holder) { - vectorizedDefinitionLevelReader.timestampMillisReader().nextBatch(vector, numVals, typeWidth, - batchSize, holder, plainValuesReader); + protected void nextVal( + FieldVector vector, int batchSize, int numVals, int typeWidth, NullabilityHolder holder) { + vectorizedDefinitionLevelReader + .timestampMillisReader() + .nextBatch(vector, numVals, typeWidth, batchSize, holder, plainValuesReader); } @Override protected void nextDictEncodedVal( FieldVector vector, int batchSize, int numVals, int typeWidth, NullabilityHolder holder) { - vectorizedDefinitionLevelReader.timestampMillisReader().nextDictEncodedBatch(vector, numVals, typeWidth, - batchSize, holder, dictionaryEncodedValuesReader, dictionary); + vectorizedDefinitionLevelReader + .timestampMillisReader() + .nextDictEncodedBatch( + vector, + numVals, + typeWidth, + batchSize, + holder, + dictionaryEncodedValuesReader, + dictionary); } } - /** - * Method for reading a batch of values of FLOAT data type. - */ + /** Method for reading a batch of values of FLOAT data type. 
*/ class FloatPageReader extends BagePageReader { @Override - protected void nextVal(FieldVector vector, int batchSize, int numVals, int typeWidth, NullabilityHolder holder) { - vectorizedDefinitionLevelReader.floatReader().nextBatch(vector, numVals, typeWidth, - batchSize, holder, plainValuesReader); + protected void nextVal( + FieldVector vector, int batchSize, int numVals, int typeWidth, NullabilityHolder holder) { + vectorizedDefinitionLevelReader + .floatReader() + .nextBatch(vector, numVals, typeWidth, batchSize, holder, plainValuesReader); } @Override protected void nextDictEncodedVal( FieldVector vector, int batchSize, int numVals, int typeWidth, NullabilityHolder holder) { - vectorizedDefinitionLevelReader.floatReader().nextDictEncodedBatch(vector, numVals, typeWidth, - batchSize, holder, dictionaryEncodedValuesReader, dictionary); + vectorizedDefinitionLevelReader + .floatReader() + .nextDictEncodedBatch( + vector, + numVals, + typeWidth, + batchSize, + holder, + dictionaryEncodedValuesReader, + dictionary); } } - /** - * Method for reading a batch of values of DOUBLE data type - */ + /** Method for reading a batch of values of DOUBLE data type */ class DoublePageReader extends BagePageReader { @Override - protected void nextVal(FieldVector vector, int batchSize, int numVals, int typeWidth, NullabilityHolder holder) { - vectorizedDefinitionLevelReader.doubleReader().nextBatch(vector, numVals, typeWidth, - batchSize, holder, plainValuesReader); + protected void nextVal( + FieldVector vector, int batchSize, int numVals, int typeWidth, NullabilityHolder holder) { + vectorizedDefinitionLevelReader + .doubleReader() + .nextBatch(vector, numVals, typeWidth, batchSize, holder, plainValuesReader); } @Override protected void nextDictEncodedVal( FieldVector vector, int batchSize, int numVals, int typeWidth, NullabilityHolder holder) { - vectorizedDefinitionLevelReader.doubleReader().nextDictEncodedBatch(vector, numVals, typeWidth, - batchSize, holder, dictionaryEncodedValuesReader, dictionary); + vectorizedDefinitionLevelReader + .doubleReader() + .nextDictEncodedBatch( + vector, + numVals, + typeWidth, + batchSize, + holder, + dictionaryEncodedValuesReader, + dictionary); } } @@ -272,124 +342,182 @@ private int getActualBatchSize(int expectedBatchSize) { } /** - * Method for reading a batch of decimals backed by INT32 and INT64 parquet data types. Since Arrow stores all - * decimals in 16 bytes, byte arrays are appropriately padded before being written to Arrow data buffers. + * Method for reading a batch of decimals backed by INT32 and INT64 parquet data types. Since + * Arrow stores all decimals in 16 bytes, byte arrays are appropriately padded before being + * written to Arrow data buffers. 
*/ class IntBackedDecimalPageReader extends BagePageReader { @Override - protected void nextVal(FieldVector vector, int batchSize, int numVals, int typeWidth, NullabilityHolder holder) { - vectorizedDefinitionLevelReader.intBackedDecimalReader().nextBatch(vector, numVals, typeWidth, batchSize, - holder, plainValuesReader); + protected void nextVal( + FieldVector vector, int batchSize, int numVals, int typeWidth, NullabilityHolder holder) { + vectorizedDefinitionLevelReader + .intBackedDecimalReader() + .nextBatch(vector, numVals, typeWidth, batchSize, holder, plainValuesReader); } @Override protected void nextDictEncodedVal( FieldVector vector, int batchSize, int numVals, int typeWidth, NullabilityHolder holder) { - vectorizedDefinitionLevelReader.intBackedDecimalReader() - .nextDictEncodedBatch(vector, numVals, typeWidth, batchSize, - holder, dictionaryEncodedValuesReader, dictionary); + vectorizedDefinitionLevelReader + .intBackedDecimalReader() + .nextDictEncodedBatch( + vector, + numVals, + typeWidth, + batchSize, + holder, + dictionaryEncodedValuesReader, + dictionary); } } class LongBackedDecimalPageReader extends BagePageReader { @Override - protected void nextVal(FieldVector vector, int batchSize, int numVals, int typeWidth, NullabilityHolder holder) { - vectorizedDefinitionLevelReader.longBackedDecimalReader().nextBatch(vector, numVals, typeWidth, batchSize, - holder, plainValuesReader); + protected void nextVal( + FieldVector vector, int batchSize, int numVals, int typeWidth, NullabilityHolder holder) { + vectorizedDefinitionLevelReader + .longBackedDecimalReader() + .nextBatch(vector, numVals, typeWidth, batchSize, holder, plainValuesReader); } @Override protected void nextDictEncodedVal( FieldVector vector, int batchSize, int numVals, int typeWidth, NullabilityHolder holder) { - vectorizedDefinitionLevelReader.longBackedDecimalReader() - .nextDictEncodedBatch(vector, numVals, typeWidth, batchSize, - holder, dictionaryEncodedValuesReader, dictionary); + vectorizedDefinitionLevelReader + .longBackedDecimalReader() + .nextDictEncodedBatch( + vector, + numVals, + typeWidth, + batchSize, + holder, + dictionaryEncodedValuesReader, + dictionary); } } /** - * Method for reading a batch of decimals backed by fixed length byte array parquet data type. Arrow stores all - * decimals in 16 bytes. This method provides the necessary padding to the decimals read. Moreover, Arrow interprets - * the decimals in Arrow buffer as little endian. Parquet stores fixed length decimals as big endian. So, this method - * uses {@link DecimalVector#setBigEndian(int, byte[])} method so that the data in Arrow vector is indeed little - * endian. + * Method for reading a batch of decimals backed by fixed length byte array parquet data type. + * Arrow stores all decimals in 16 bytes. This method provides the necessary padding to the + * decimals read. Moreover, Arrow interprets the decimals in Arrow buffer as little endian. + * Parquet stores fixed length decimals as big endian. So, this method uses {@link + * DecimalVector#setBigEndian(int, byte[])} method so that the data in Arrow vector is indeed + * little endian. 
*/ class FixedLengthDecimalPageReader extends BagePageReader { @Override - protected void nextVal(FieldVector vector, int batchSize, int numVals, int typeWidth, NullabilityHolder holder) { - vectorizedDefinitionLevelReader.fixedLengthDecimalReader().nextBatch(vector, numVals, typeWidth, - batchSize, holder, plainValuesReader); + protected void nextVal( + FieldVector vector, int batchSize, int numVals, int typeWidth, NullabilityHolder holder) { + vectorizedDefinitionLevelReader + .fixedLengthDecimalReader() + .nextBatch(vector, numVals, typeWidth, batchSize, holder, plainValuesReader); } @Override protected void nextDictEncodedVal( FieldVector vector, int batchSize, int numVals, int typeWidth, NullabilityHolder holder) { - vectorizedDefinitionLevelReader.fixedLengthDecimalReader().nextDictEncodedBatch(vector, numVals, typeWidth, - batchSize, holder, dictionaryEncodedValuesReader, dictionary); + vectorizedDefinitionLevelReader + .fixedLengthDecimalReader() + .nextDictEncodedBatch( + vector, + numVals, + typeWidth, + batchSize, + holder, + dictionaryEncodedValuesReader, + dictionary); } } class FixedSizeBinaryPageReader extends BagePageReader { @Override - protected void nextVal(FieldVector vector, int batchSize, int numVals, int typeWidth, NullabilityHolder holder) { - vectorizedDefinitionLevelReader.fixedSizeBinaryReader().nextBatch(vector, numVals, typeWidth, - batchSize, holder, plainValuesReader); + protected void nextVal( + FieldVector vector, int batchSize, int numVals, int typeWidth, NullabilityHolder holder) { + vectorizedDefinitionLevelReader + .fixedSizeBinaryReader() + .nextBatch(vector, numVals, typeWidth, batchSize, holder, plainValuesReader); } @Override protected void nextDictEncodedVal( FieldVector vector, int batchSize, int numVals, int typeWidth, NullabilityHolder holder) { - vectorizedDefinitionLevelReader.fixedSizeBinaryReader().nextDictEncodedBatch(vector, numVals, typeWidth, - batchSize, holder, dictionaryEncodedValuesReader, dictionary); + vectorizedDefinitionLevelReader + .fixedSizeBinaryReader() + .nextDictEncodedBatch( + vector, + numVals, + typeWidth, + batchSize, + holder, + dictionaryEncodedValuesReader, + dictionary); } } - /** - * Method for reading a batch of variable width data type (ENUM, JSON, UTF8, BSON). - */ + /** Method for reading a batch of variable width data type (ENUM, JSON, UTF8, BSON). */ class VarWidthTypePageReader extends BagePageReader { @Override - protected void nextVal(FieldVector vector, int batchSize, int numVals, int typeWidth, NullabilityHolder holder) { - vectorizedDefinitionLevelReader.varWidthReader().nextBatch(vector, numVals, typeWidth, batchSize, - holder, plainValuesReader); + protected void nextVal( + FieldVector vector, int batchSize, int numVals, int typeWidth, NullabilityHolder holder) { + vectorizedDefinitionLevelReader + .varWidthReader() + .nextBatch(vector, numVals, typeWidth, batchSize, holder, plainValuesReader); } @Override protected void nextDictEncodedVal( FieldVector vector, int batchSize, int numVals, int typeWidth, NullabilityHolder holder) { - vectorizedDefinitionLevelReader.varWidthReader().nextDictEncodedBatch(vector, numVals, typeWidth, batchSize, - holder, dictionaryEncodedValuesReader, dictionary); + vectorizedDefinitionLevelReader + .varWidthReader() + .nextDictEncodedBatch( + vector, + numVals, + typeWidth, + batchSize, + holder, + dictionaryEncodedValuesReader, + dictionary); } } /** - * Method for reading batches of fixed width binary type (e.g. BYTE[7]). 
Spark does not support fixed width binary - * data type. To work around this limitation, the data is read as fixed width binary from parquet and stored in a - * {@link VarBinaryVector} in Arrow. + * Method for reading batches of fixed width binary type (e.g. BYTE[7]). Spark does not support + * fixed width binary data type. To work around this limitation, the data is read as fixed width + * binary from parquet and stored in a {@link VarBinaryVector} in Arrow. */ class FixedWidthBinaryPageReader extends BagePageReader { @Override - protected void nextVal(FieldVector vector, int batchSize, int numVals, int typeWidth, NullabilityHolder holder) { - vectorizedDefinitionLevelReader.fixedWidthBinaryReader().nextBatch(vector, numVals, typeWidth, - batchSize, holder, plainValuesReader); + protected void nextVal( + FieldVector vector, int batchSize, int numVals, int typeWidth, NullabilityHolder holder) { + vectorizedDefinitionLevelReader + .fixedWidthBinaryReader() + .nextBatch(vector, numVals, typeWidth, batchSize, holder, plainValuesReader); } @Override protected void nextDictEncodedVal( FieldVector vector, int batchSize, int numVals, int typeWidth, NullabilityHolder holder) { - vectorizedDefinitionLevelReader.fixedWidthBinaryReader().nextDictEncodedBatch(vector, numVals, typeWidth, - batchSize, holder, dictionaryEncodedValuesReader, dictionary); + vectorizedDefinitionLevelReader + .fixedWidthBinaryReader() + .nextDictEncodedBatch( + vector, + numVals, + typeWidth, + batchSize, + holder, + dictionaryEncodedValuesReader, + dictionary); } } - /** - * Method for reading batches of booleans. - */ + /** Method for reading batches of booleans. */ class BooleanPageReader extends BagePageReader { @Override - protected void nextVal(FieldVector vector, int batchSize, int numVals, int typeWidth, NullabilityHolder holder) { - vectorizedDefinitionLevelReader.booleanReader().nextBatch(vector, numVals, typeWidth, batchSize, holder, - plainValuesReader); + protected void nextVal( + FieldVector vector, int batchSize, int numVals, int typeWidth, NullabilityHolder holder) { + vectorizedDefinitionLevelReader + .booleanReader() + .nextBatch(vector, numVals, typeWidth, batchSize, holder, plainValuesReader); } @Override diff --git a/arrow/src/main/java/org/apache/iceberg/arrow/vectorized/parquet/VectorizedParquetDefinitionLevelReader.java b/arrow/src/main/java/org/apache/iceberg/arrow/vectorized/parquet/VectorizedParquetDefinitionLevelReader.java index 46c6736ce4e9..a036aee9e683 100644 --- a/arrow/src/main/java/org/apache/iceberg/arrow/vectorized/parquet/VectorizedParquetDefinitionLevelReader.java +++ b/arrow/src/main/java/org/apache/iceberg/arrow/vectorized/parquet/VectorizedParquetDefinitionLevelReader.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.arrow.vectorized.parquet; import java.nio.ByteBuffer; @@ -33,21 +32,27 @@ import org.apache.iceberg.parquet.ValuesAsBytesReader; import org.apache.parquet.column.Dictionary; -public final class VectorizedParquetDefinitionLevelReader extends BaseVectorizedParquetValuesReader { +public final class VectorizedParquetDefinitionLevelReader + extends BaseVectorizedParquetValuesReader { - public VectorizedParquetDefinitionLevelReader(int bitWidth, int maxDefLevel, boolean setArrowValidityVector) { + public VectorizedParquetDefinitionLevelReader( + int bitWidth, int maxDefLevel, boolean setArrowValidityVector) { super(bitWidth, maxDefLevel, setArrowValidityVector); } - public VectorizedParquetDefinitionLevelReader(int bitWidth, int maxDefLevel, boolean readLength, - boolean setArrowValidityVector) { + public VectorizedParquetDefinitionLevelReader( + int bitWidth, int maxDefLevel, boolean readLength, boolean setArrowValidityVector) { super(bitWidth, maxDefLevel, readLength, setArrowValidityVector); } abstract class NumericBaseReader { public void nextBatch( - final FieldVector vector, final int startOffset, final int typeWidth, - final int numValsToRead, NullabilityHolder nullabilityHolder, ValuesAsBytesReader valuesReader) { + final FieldVector vector, + final int startOffset, + final int typeWidth, + final int numValsToRead, + NullabilityHolder nullabilityHolder, + ValuesAsBytesReader valuesReader) { int bufferIdx = startOffset; int left = numValsToRead; while (left > 0) { @@ -57,7 +62,8 @@ public void nextBatch( int numValues = Math.min(left, currentCount); switch (mode) { case RLE: - setNextNValuesInVector(typeWidth, nullabilityHolder, valuesReader, bufferIdx, vector, numValues); + setNextNValuesInVector( + typeWidth, nullabilityHolder, valuesReader, bufferIdx, vector, numValues); bufferIdx += numValues; break; case PACKED: @@ -81,9 +87,13 @@ public void nextBatch( } public void nextDictEncodedBatch( - final FieldVector vector, final int startOffset, final int typeWidth, - final int numValsToRead, NullabilityHolder nullabilityHolder, - VectorizedDictionaryEncodedParquetValuesReader dictionaryEncodedValuesReader, Dictionary dict) { + final FieldVector vector, + final int startOffset, + final int typeWidth, + final int numValsToRead, + NullabilityHolder nullabilityHolder, + VectorizedDictionaryEncodedParquetValuesReader dictionaryEncodedValuesReader, + Dictionary dict) { int idx = startOffset; int left = numValsToRead; while (left > 0) { @@ -95,8 +105,15 @@ public void nextDictEncodedBatch( switch (mode) { case RLE: if (currentValue == maxDefLevel) { - nextDictEncodedVal(vector, idx, dictionaryEncodedValuesReader, dict, mode, - numValues, nullabilityHolder, typeWidth); + nextDictEncodedVal( + vector, + idx, + dictionaryEncodedValuesReader, + dict, + mode, + numValues, + nullabilityHolder, + typeWidth); } else { setNulls(nullabilityHolder, idx, numValues, validityBuffer); } @@ -105,8 +122,15 @@ public void nextDictEncodedBatch( case PACKED: for (int i = 0; i < numValues; i++) { if (packedValuesBuffer[packedValuesBufferIdx++] == maxDefLevel) { - nextDictEncodedVal(vector, idx, dictionaryEncodedValuesReader, dict, mode, - numValues, nullabilityHolder, typeWidth); + nextDictEncodedVal( + vector, + idx, + dictionaryEncodedValuesReader, + dict, + mode, + numValues, + nullabilityHolder, + typeWidth); nullabilityHolder.setNotNull(idx); if (setArrowValidityVector) { BitVectorHelper.setBit(vector.getValidityBuffer(), idx); @@ -125,97 +149,149 @@ public void 
nextDictEncodedBatch( protected abstract void nextVal( FieldVector vector, int idx, ValuesAsBytesReader valuesReader, Mode mode); + protected abstract void nextDictEncodedVal( - FieldVector vector, int idx, VectorizedDictionaryEncodedParquetValuesReader dictionaryEncodedValuesReader, - Dictionary dict, Mode mode, int numValues, NullabilityHolder holder, int typeWidth); + FieldVector vector, + int idx, + VectorizedDictionaryEncodedParquetValuesReader dictionaryEncodedValuesReader, + Dictionary dict, + Mode mode, + int numValues, + NullabilityHolder holder, + int typeWidth); } class LongReader extends NumericBaseReader { @Override - protected void nextVal(FieldVector vector, int idx, ValuesAsBytesReader valuesReader, Mode mode) { + protected void nextVal( + FieldVector vector, int idx, ValuesAsBytesReader valuesReader, Mode mode) { vector.getDataBuffer().setLong(idx, valuesReader.readLong()); } @Override protected void nextDictEncodedVal( - FieldVector vector, int idx, VectorizedDictionaryEncodedParquetValuesReader dictionaryEncodedValuesReader, - Dictionary dict, Mode mode, int numValues, NullabilityHolder holder, int typeWidth) { + FieldVector vector, + int idx, + VectorizedDictionaryEncodedParquetValuesReader dictionaryEncodedValuesReader, + Dictionary dict, + Mode mode, + int numValues, + NullabilityHolder holder, + int typeWidth) { if (Mode.RLE.equals(mode)) { - dictionaryEncodedValuesReader.longDictEncodedReader() + dictionaryEncodedValuesReader + .longDictEncodedReader() .nextBatch(vector, idx, numValues, dict, holder, typeWidth); } else if (Mode.PACKED.equals(mode)) { - vector.getDataBuffer() - .setLong((long) idx * typeWidth, dict.decodeToLong(dictionaryEncodedValuesReader.readInteger())); + vector + .getDataBuffer() + .setLong( + (long) idx * typeWidth, + dict.decodeToLong(dictionaryEncodedValuesReader.readInteger())); } } } class DoubleReader extends NumericBaseReader { @Override - protected void nextVal(FieldVector vector, int idx, ValuesAsBytesReader valuesReader, Mode mode) { + protected void nextVal( + FieldVector vector, int idx, ValuesAsBytesReader valuesReader, Mode mode) { vector.getDataBuffer().setDouble(idx, valuesReader.readDouble()); } @Override protected void nextDictEncodedVal( - FieldVector vector, int idx, VectorizedDictionaryEncodedParquetValuesReader dictionaryEncodedValuesReader, - Dictionary dict, Mode mode, int numValues, NullabilityHolder holder, int typeWidth) { + FieldVector vector, + int idx, + VectorizedDictionaryEncodedParquetValuesReader dictionaryEncodedValuesReader, + Dictionary dict, + Mode mode, + int numValues, + NullabilityHolder holder, + int typeWidth) { if (Mode.RLE.equals(mode)) { - dictionaryEncodedValuesReader.doubleDictEncodedReader().nextBatch(vector, - idx, numValues, dict, holder, typeWidth); + dictionaryEncodedValuesReader + .doubleDictEncodedReader() + .nextBatch(vector, idx, numValues, dict, holder, typeWidth); } else if (Mode.PACKED.equals(mode)) { - vector.getDataBuffer() - .setDouble((long) idx * typeWidth, dict.decodeToDouble(dictionaryEncodedValuesReader.readInteger())); + vector + .getDataBuffer() + .setDouble( + (long) idx * typeWidth, + dict.decodeToDouble(dictionaryEncodedValuesReader.readInteger())); } } } class FloatReader extends NumericBaseReader { @Override - protected void nextVal(FieldVector vector, int idx, ValuesAsBytesReader valuesReader, Mode mode) { + protected void nextVal( + FieldVector vector, int idx, ValuesAsBytesReader valuesReader, Mode mode) { vector.getDataBuffer().setFloat(idx, valuesReader.readFloat()); 
} @Override protected void nextDictEncodedVal( - FieldVector vector, int idx, + FieldVector vector, + int idx, VectorizedDictionaryEncodedParquetValuesReader dictionaryEncodedValuesReader, - Dictionary dict, Mode mode, int numValues, NullabilityHolder holder, int typeWidth) { + Dictionary dict, + Mode mode, + int numValues, + NullabilityHolder holder, + int typeWidth) { if (Mode.RLE.equals(mode)) { - dictionaryEncodedValuesReader.floatDictEncodedReader().nextBatch(vector, - idx, numValues, dict, holder, typeWidth); + dictionaryEncodedValuesReader + .floatDictEncodedReader() + .nextBatch(vector, idx, numValues, dict, holder, typeWidth); } else if (Mode.PACKED.equals(mode)) { - vector.getDataBuffer() - .setFloat((long) idx * typeWidth, dict.decodeToFloat(dictionaryEncodedValuesReader.readInteger())); + vector + .getDataBuffer() + .setFloat( + (long) idx * typeWidth, + dict.decodeToFloat(dictionaryEncodedValuesReader.readInteger())); } } } class IntegerReader extends NumericBaseReader { @Override - protected void nextVal(FieldVector vector, int idx, ValuesAsBytesReader valuesReader, Mode mode) { + protected void nextVal( + FieldVector vector, int idx, ValuesAsBytesReader valuesReader, Mode mode) { vector.getDataBuffer().setInt(idx, valuesReader.readInteger()); } @Override protected void nextDictEncodedVal( - FieldVector vector, int idx, + FieldVector vector, + int idx, VectorizedDictionaryEncodedParquetValuesReader dictionaryEncodedValuesReader, - Dictionary dict, Mode mode, int numValues, NullabilityHolder holder, int typeWidth) { + Dictionary dict, + Mode mode, + int numValues, + NullabilityHolder holder, + int typeWidth) { if (Mode.RLE.equals(mode)) { - dictionaryEncodedValuesReader.integerDictEncodedReader().nextBatch(vector, - idx, numValues, dict, holder, typeWidth); + dictionaryEncodedValuesReader + .integerDictEncodedReader() + .nextBatch(vector, idx, numValues, dict, holder, typeWidth); } else if (Mode.PACKED.equals(mode)) { - vector.getDataBuffer() - .setInt((long) idx * typeWidth, dict.decodeToInt(dictionaryEncodedValuesReader.readInteger())); + vector + .getDataBuffer() + .setInt( + (long) idx * typeWidth, + dict.decodeToInt(dictionaryEncodedValuesReader.readInteger())); } } } abstract class BaseReader { public void nextBatch( - final FieldVector vector, final int startOffset, final int typeWidth, - final int numValsToRead, NullabilityHolder nullabilityHolder, + final FieldVector vector, + final int startOffset, + final int typeWidth, + final int numValsToRead, + NullabilityHolder nullabilityHolder, ValuesAsBytesReader valuesReader) { int bufferIdx = startOffset; int left = numValsToRead; @@ -259,8 +335,11 @@ public void nextBatch( } public void nextDictEncodedBatch( - final FieldVector vector, final int startOffset, final int typeWidth, - final int numValsToRead, NullabilityHolder nullabilityHolder, + final FieldVector vector, + final int startOffset, + final int typeWidth, + final int numValsToRead, + NullabilityHolder nullabilityHolder, VectorizedDictionaryEncodedParquetValuesReader dictionaryEncodedValuesReader, Dictionary dict) { int idx = startOffset; @@ -274,8 +353,15 @@ public void nextDictEncodedBatch( switch (mode) { case RLE: if (currentValue == maxDefLevel) { - nextDictEncodedVal(vector, - idx, dictionaryEncodedValuesReader, numValues, dict, nullabilityHolder, typeWidth, mode); + nextDictEncodedVal( + vector, + idx, + dictionaryEncodedValuesReader, + numValues, + dict, + nullabilityHolder, + typeWidth, + mode); } else { setNulls(nullabilityHolder, idx, numValues, 
validityBuffer); } @@ -284,8 +370,15 @@ public void nextDictEncodedBatch( case PACKED: for (int i = 0; i < numValues; i++) { if (packedValuesBuffer[packedValuesBufferIdx++] == maxDefLevel) { - nextDictEncodedVal(vector, - idx, dictionaryEncodedValuesReader, numValues, dict, nullabilityHolder, typeWidth, mode); + nextDictEncodedVal( + vector, + idx, + dictionaryEncodedValuesReader, + numValues, + dict, + nullabilityHolder, + typeWidth, + mode); nullabilityHolder.setNotNull(idx); if (setArrowValidityVector) { BitVectorHelper.setBit(vector.getValidityBuffer(), idx); @@ -303,29 +396,53 @@ public void nextDictEncodedBatch( } protected abstract void nextVal( - FieldVector vector, int idx, ValuesAsBytesReader valuesReader, int typeWidth, byte[] byteArray); + FieldVector vector, + int idx, + ValuesAsBytesReader valuesReader, + int typeWidth, + byte[] byteArray); + protected abstract void nextDictEncodedVal( - FieldVector vector, int idx, VectorizedDictionaryEncodedParquetValuesReader reader, - int numValuesToRead, Dictionary dict, NullabilityHolder nullabilityHolder, int typeWidth, Mode mode); + FieldVector vector, + int idx, + VectorizedDictionaryEncodedParquetValuesReader reader, + int numValuesToRead, + Dictionary dict, + NullabilityHolder nullabilityHolder, + int typeWidth, + Mode mode); } class TimestampMillisReader extends BaseReader { @Override protected void nextVal( - FieldVector vector, int idx, ValuesAsBytesReader valuesReader, int typeWidth, byte[] byteArray) { + FieldVector vector, + int idx, + ValuesAsBytesReader valuesReader, + int typeWidth, + byte[] byteArray) { vector.getDataBuffer().setLong((long) idx * typeWidth, valuesReader.readLong() * 1000); } @Override protected void nextDictEncodedVal( - FieldVector vector, int idx, VectorizedDictionaryEncodedParquetValuesReader reader, - int numValuesToRead, Dictionary dict, NullabilityHolder nullabilityHolder, int typeWidth, Mode mode) { + FieldVector vector, + int idx, + VectorizedDictionaryEncodedParquetValuesReader reader, + int numValuesToRead, + Dictionary dict, + NullabilityHolder nullabilityHolder, + int typeWidth, + Mode mode) { if (Mode.RLE.equals(mode)) { - reader.timestampMillisDictEncodedReader().nextBatch(vector, - idx, numValuesToRead, dict, nullabilityHolder, typeWidth); + reader + .timestampMillisDictEncodedReader() + .nextBatch(vector, idx, numValuesToRead, dict, nullabilityHolder, typeWidth); } else if (Mode.PACKED.equals(mode)) { - vector.getDataBuffer().setLong((long) idx * typeWidth, dict.decodeToLong(reader.readInteger()) * 1000); + vector + .getDataBuffer() + .setLong((long) idx * typeWidth, dict.decodeToLong(reader.readInteger()) * 1000); } } } @@ -333,18 +450,33 @@ protected void nextDictEncodedVal( class FixedWidthBinaryReader extends BaseReader { @Override protected void nextVal( - FieldVector vector, int idx, ValuesAsBytesReader valuesReader, int typeWidth, byte[] byteArray) { + FieldVector vector, + int idx, + ValuesAsBytesReader valuesReader, + int typeWidth, + byte[] byteArray) { ByteBuffer buffer = valuesReader.getBuffer(typeWidth); - ((VarBinaryVector) vector).setSafe(idx, buffer.array(), buffer.position() + buffer.arrayOffset(), - buffer.limit() - buffer.position()); + ((VarBinaryVector) vector) + .setSafe( + idx, + buffer.array(), + buffer.position() + buffer.arrayOffset(), + buffer.limit() - buffer.position()); } @Override protected void nextDictEncodedVal( - FieldVector vector, int idx, VectorizedDictionaryEncodedParquetValuesReader reader, - int numValuesToRead, Dictionary dict, NullabilityHolder 
nullabilityHolder, int typeWidth, Mode mode) { + FieldVector vector, + int idx, + VectorizedDictionaryEncodedParquetValuesReader reader, + int numValuesToRead, + Dictionary dict, + NullabilityHolder nullabilityHolder, + int typeWidth, + Mode mode) { if (Mode.RLE.equals(mode)) { - reader.fixedWidthBinaryDictEncodedReader() + reader + .fixedWidthBinaryDictEncodedReader() .nextBatch(vector, idx, numValuesToRead, dict, nullabilityHolder, typeWidth); } else if (Mode.PACKED.equals(mode)) { ByteBuffer buffer = dict.decodeToBinary(reader.readInteger()).toByteBuffer(); @@ -356,17 +488,28 @@ protected void nextDictEncodedVal( class FixedLengthDecimalReader extends BaseReader { @Override protected void nextVal( - FieldVector vector, int idx, ValuesAsBytesReader valuesReader, int typeWidth, byte[] byteArray) { + FieldVector vector, + int idx, + ValuesAsBytesReader valuesReader, + int typeWidth, + byte[] byteArray) { valuesReader.getBuffer(typeWidth).get(byteArray, 0, typeWidth); DecimalVectorUtil.setBigEndian((DecimalVector) vector, idx, byteArray); } @Override protected void nextDictEncodedVal( - FieldVector vector, int idx, VectorizedDictionaryEncodedParquetValuesReader reader, - int numValuesToRead, Dictionary dict, NullabilityHolder nullabilityHolder, int typeWidth, Mode mode) { + FieldVector vector, + int idx, + VectorizedDictionaryEncodedParquetValuesReader reader, + int numValuesToRead, + Dictionary dict, + NullabilityHolder nullabilityHolder, + int typeWidth, + Mode mode) { if (Mode.RLE.equals(mode)) { - reader.fixedLengthDecimalDictEncodedReader() + reader + .fixedLengthDecimalDictEncodedReader() .nextBatch(vector, idx, numValuesToRead, dict, nullabilityHolder, typeWidth); } else if (Mode.PACKED.equals(mode)) { byte[] bytes = dict.decodeToBinary(reader.readInteger()).getBytesUnsafe(); @@ -378,18 +521,29 @@ protected void nextDictEncodedVal( class FixedSizeBinaryReader extends BaseReader { @Override protected void nextVal( - FieldVector vector, int idx, ValuesAsBytesReader valuesReader, int typeWidth, byte[] byteArray) { + FieldVector vector, + int idx, + ValuesAsBytesReader valuesReader, + int typeWidth, + byte[] byteArray) { valuesReader.getBuffer(typeWidth).get(byteArray, 0, typeWidth); ((FixedSizeBinaryVector) vector).set(idx, byteArray); } @Override protected void nextDictEncodedVal( - FieldVector vector, int idx, VectorizedDictionaryEncodedParquetValuesReader reader, - int numValuesToRead, Dictionary dict, NullabilityHolder nullabilityHolder, int typeWidth, Mode mode) { + FieldVector vector, + int idx, + VectorizedDictionaryEncodedParquetValuesReader reader, + int numValuesToRead, + Dictionary dict, + NullabilityHolder nullabilityHolder, + int typeWidth, + Mode mode) { if (Mode.RLE.equals(mode)) { - reader.fixedSizeBinaryDictEncodedReader().nextBatch(vector, idx, - numValuesToRead, dict, nullabilityHolder, typeWidth); + reader + .fixedSizeBinaryDictEncodedReader() + .nextBatch(vector, idx, numValuesToRead, dict, nullabilityHolder, typeWidth); } else if (Mode.PACKED.equals(mode)) { byte[] bytes = dict.decodeToBinary(reader.readInteger()).getBytes(); byte[] vectorBytes = new byte[typeWidth]; @@ -402,7 +556,11 @@ protected void nextDictEncodedVal( class VarWidthReader extends BaseReader { @Override protected void nextVal( - FieldVector vector, int idx, ValuesAsBytesReader valuesReader, int typeWidth, byte[] byteArray) { + FieldVector vector, + int idx, + ValuesAsBytesReader valuesReader, + int typeWidth, + byte[] byteArray) { int len = valuesReader.readInteger(); ByteBuffer buffer = 
valuesReader.getBuffer(len); // Calling setValueLengthSafe takes care of allocating a larger buffer if @@ -421,14 +579,21 @@ protected void nextVal( @Override protected void nextDictEncodedVal( - FieldVector vector, int idx, VectorizedDictionaryEncodedParquetValuesReader reader, - int numValuesToRead, Dictionary dict, NullabilityHolder nullabilityHolder, int typeWidth, Mode mode) { + FieldVector vector, + int idx, + VectorizedDictionaryEncodedParquetValuesReader reader, + int numValuesToRead, + Dictionary dict, + NullabilityHolder nullabilityHolder, + int typeWidth, + Mode mode) { if (Mode.RLE.equals(mode)) { - reader.varWidthBinaryDictEncodedReader().nextBatch(vector, idx, - numValuesToRead, dict, nullabilityHolder, typeWidth); + reader + .varWidthBinaryDictEncodedReader() + .nextBatch(vector, idx, numValuesToRead, dict, nullabilityHolder, typeWidth); } else if (Mode.PACKED.equals(mode)) { - ((BaseVariableWidthVector) vector).setSafe( - idx, dict.decodeToBinary(reader.readInteger()).getBytesUnsafe()); + ((BaseVariableWidthVector) vector) + .setSafe(idx, dict.decodeToBinary(reader.readInteger()).getBytesUnsafe()); } } } @@ -436,16 +601,27 @@ protected void nextDictEncodedVal( class IntBackedDecimalReader extends BaseReader { @Override protected void nextVal( - FieldVector vector, int idx, ValuesAsBytesReader valuesReader, int typeWidth, byte[] byteArray) { + FieldVector vector, + int idx, + ValuesAsBytesReader valuesReader, + int typeWidth, + byte[] byteArray) { ((DecimalVector) vector).set(idx, valuesReader.getBuffer(Integer.BYTES).getInt()); } @Override protected void nextDictEncodedVal( - FieldVector vector, int idx, VectorizedDictionaryEncodedParquetValuesReader reader, - int numValuesToRead, Dictionary dict, NullabilityHolder nullabilityHolder, int typeWidth, Mode mode) { + FieldVector vector, + int idx, + VectorizedDictionaryEncodedParquetValuesReader reader, + int numValuesToRead, + Dictionary dict, + NullabilityHolder nullabilityHolder, + int typeWidth, + Mode mode) { if (Mode.RLE.equals(mode)) { - reader.intBackedDecimalDictEncodedReader() + reader + .intBackedDecimalDictEncodedReader() .nextBatch(vector, idx, numValuesToRead, dict, nullabilityHolder, typeWidth); } else if (Mode.PACKED.equals(mode)) { ((DecimalVector) vector).set(idx, dict.decodeToInt(reader.readInteger())); @@ -456,20 +632,30 @@ protected void nextDictEncodedVal( class LongBackedDecimalReader extends BaseReader { @Override protected void nextVal( - FieldVector vector, int idx, ValuesAsBytesReader valuesReader, int typeWidth, byte[] byteArray) { + FieldVector vector, + int idx, + ValuesAsBytesReader valuesReader, + int typeWidth, + byte[] byteArray) { ((DecimalVector) vector).set(idx, valuesReader.getBuffer(Long.BYTES).getLong()); } @Override protected void nextDictEncodedVal( - FieldVector vector, int idx, VectorizedDictionaryEncodedParquetValuesReader reader, - int numValuesToRead, Dictionary dict, NullabilityHolder nullabilityHolder, int typeWidth, Mode mode) { + FieldVector vector, + int idx, + VectorizedDictionaryEncodedParquetValuesReader reader, + int numValuesToRead, + Dictionary dict, + NullabilityHolder nullabilityHolder, + int typeWidth, + Mode mode) { if (Mode.RLE.equals(mode)) { - reader.longBackedDecimalDictEncodedReader() + reader + .longBackedDecimalDictEncodedReader() .nextBatch(vector, idx, numValuesToRead, dict, nullabilityHolder, typeWidth); } else if (Mode.PACKED.equals(mode)) { - ((DecimalVector) vector).set( - idx, dict.decodeToLong(reader.readInteger())); + ((DecimalVector) vector).set(idx, 
dict.decodeToLong(reader.readInteger())); } } } @@ -477,14 +663,24 @@ protected void nextDictEncodedVal( class BooleanReader extends BaseReader { @Override protected void nextVal( - FieldVector vector, int idx, ValuesAsBytesReader valuesReader, int typeWidth, byte[] byteArray) { + FieldVector vector, + int idx, + ValuesAsBytesReader valuesReader, + int typeWidth, + byte[] byteArray) { ((BitVector) vector).setSafe(idx, valuesReader.readBooleanAsInt()); } @Override protected void nextDictEncodedVal( - FieldVector vector, int idx, VectorizedDictionaryEncodedParquetValuesReader reader, - int numValuesToRead, Dictionary dict, NullabilityHolder nullabilityHolder, int typeWidth, Mode mode) { + FieldVector vector, + int idx, + VectorizedDictionaryEncodedParquetValuesReader reader, + int numValuesToRead, + Dictionary dict, + NullabilityHolder nullabilityHolder, + int typeWidth, + Mode mode) { throw new UnsupportedOperationException(); } } @@ -493,30 +689,44 @@ class DictionaryIdReader extends BaseReader { @Override protected void nextVal( - FieldVector vector, int idx, ValuesAsBytesReader valuesReader, int typeWidth, byte[] byteArray) { + FieldVector vector, + int idx, + ValuesAsBytesReader valuesReader, + int typeWidth, + byte[] byteArray) { throw new UnsupportedOperationException(); } @Override protected void nextDictEncodedVal( - FieldVector vector, int idx, VectorizedDictionaryEncodedParquetValuesReader reader, - int numValuesToRead, Dictionary dict, NullabilityHolder nullabilityHolder, int typeWidth, Mode mode) { + FieldVector vector, + int idx, + VectorizedDictionaryEncodedParquetValuesReader reader, + int numValuesToRead, + Dictionary dict, + NullabilityHolder nullabilityHolder, + int typeWidth, + Mode mode) { if (Mode.RLE.equals(mode)) { - reader.dictionaryIdReader().nextBatch(vector, idx, numValuesToRead, dict, nullabilityHolder, typeWidth); + reader + .dictionaryIdReader() + .nextBatch(vector, idx, numValuesToRead, dict, nullabilityHolder, typeWidth); } else if (Mode.PACKED.equals(mode)) { vector.getDataBuffer().setInt((long) idx * IntVector.TYPE_WIDTH, reader.readInteger()); } } } - private void setNull(NullabilityHolder nullabilityHolder, int bufferIdx, ArrowBuf validityBuffer) { + private void setNull( + NullabilityHolder nullabilityHolder, int bufferIdx, ArrowBuf validityBuffer) { nullabilityHolder.setNull(bufferIdx); if (setArrowValidityVector) { BitVectorHelper.setValidityBit(validityBuffer, bufferIdx, 0); } } - private void setNulls(NullabilityHolder nullabilityHolder, int idx, int numValues, ArrowBuf validityBuffer) { + private void setNulls( + NullabilityHolder nullabilityHolder, int idx, int numValues, ArrowBuf validityBuffer) { nullabilityHolder.setNulls(idx, numValues); if (setArrowValidityVector) { for (int i = 0; i < numValues; i++) { @@ -526,8 +736,12 @@ private void setNulls(NullabilityHolder nullabilityHolder, int idx, int numValue } private void setNextNValuesInVector( - int typeWidth, NullabilityHolder nullabilityHolder, - ValuesAsBytesReader valuesReader, int bufferIdx, FieldVector vector, int numValues) { + int typeWidth, + NullabilityHolder nullabilityHolder, + ValuesAsBytesReader valuesReader, + int bufferIdx, + FieldVector vector, + int numValues) { ArrowBuf validityBuffer = vector.getValidityBuffer(); if (currentValue == maxDefLevel) { ByteBuffer buffer = valuesReader.getBuffer(numValues * typeWidth); diff --git a/arrow/src/test/java/org/apache/iceberg/arrow/ArrowSchemaUtilTest.java b/arrow/src/test/java/org/apache/iceberg/arrow/ArrowSchemaUtilTest.java index 
99d0e3ed5b4a..eda639a36da3 100644 --- a/arrow/src/test/java/org/apache/iceberg/arrow/ArrowSchemaUtilTest.java +++ b/arrow/src/test/java/org/apache/iceberg/arrow/ArrowSchemaUtilTest.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.arrow; import org.apache.arrow.vector.types.pojo.ArrowType; @@ -38,7 +37,6 @@ import org.junit.Assert; import org.junit.Test; - public class ArrowSchemaUtilTest { private static final String INTEGER_FIELD = "i"; @@ -60,23 +58,27 @@ public class ArrowSchemaUtilTest { @Test public void convertPrimitive() { - Schema iceberg = new Schema( - Types.NestedField.optional(0, INTEGER_FIELD, IntegerType.get()), - Types.NestedField.optional(1, BOOLEAN_FIELD, BooleanType.get()), - Types.NestedField.required(2, DOUBLE_FIELD, DoubleType.get()), - Types.NestedField.required(3, STRING_FIELD, StringType.get()), - Types.NestedField.optional(4, DATE_FIELD, DateType.get()), - Types.NestedField.optional(5, TIMESTAMP_FIELD, TimestampType.withZone()), - Types.NestedField.optional(6, LONG_FIELD, LongType.get()), - Types.NestedField.optional(7, FLOAT_FIELD, FloatType.get()), - Types.NestedField.optional(8, TIME_FIELD, TimeType.get()), - Types.NestedField.optional(9, BINARY_FIELD, Types.BinaryType.get()), - Types.NestedField.optional(10, DECIMAL_FIELD, Types.DecimalType.of(1, 1)), - Types.NestedField.optional(12, LIST_FIELD, Types.ListType.ofOptional(13, Types.IntegerType.get())), - Types.NestedField.required(14, MAP_FIELD, Types.MapType.ofOptional(15, 16, - StringType.get(), IntegerType.get())), - Types.NestedField.optional(17, FIXED_WIDTH_BINARY_FIELD, Types.FixedType.ofLength(10)), - Types.NestedField.optional(18, UUID_FIELD, Types.UUIDType.get())); + Schema iceberg = + new Schema( + Types.NestedField.optional(0, INTEGER_FIELD, IntegerType.get()), + Types.NestedField.optional(1, BOOLEAN_FIELD, BooleanType.get()), + Types.NestedField.required(2, DOUBLE_FIELD, DoubleType.get()), + Types.NestedField.required(3, STRING_FIELD, StringType.get()), + Types.NestedField.optional(4, DATE_FIELD, DateType.get()), + Types.NestedField.optional(5, TIMESTAMP_FIELD, TimestampType.withZone()), + Types.NestedField.optional(6, LONG_FIELD, LongType.get()), + Types.NestedField.optional(7, FLOAT_FIELD, FloatType.get()), + Types.NestedField.optional(8, TIME_FIELD, TimeType.get()), + Types.NestedField.optional(9, BINARY_FIELD, Types.BinaryType.get()), + Types.NestedField.optional(10, DECIMAL_FIELD, Types.DecimalType.of(1, 1)), + Types.NestedField.optional( + 12, LIST_FIELD, Types.ListType.ofOptional(13, Types.IntegerType.get())), + Types.NestedField.required( + 14, + MAP_FIELD, + Types.MapType.ofOptional(15, 16, StringType.get(), IntegerType.get())), + Types.NestedField.optional(17, FIXED_WIDTH_BINARY_FIELD, Types.FixedType.ofLength(10)), + Types.NestedField.optional(18, UUID_FIELD, Types.UUIDType.get())); org.apache.arrow.vector.types.pojo.Schema arrow = ArrowSchemaUtil.convert(iceberg); @@ -85,16 +87,15 @@ public void convertPrimitive() { @Test public void convertComplex() { - Schema iceberg = new Schema( - Types.NestedField.optional(0, "m", MapType.ofOptional( - 1, 2, StringType.get(), - LongType.get()) - ), - Types.NestedField.required(3, "m2", MapType.ofOptional( - 4, 5, StringType.get(), - ListType.ofOptional(6, TimestampType.withoutZone())) - ) - ); + Schema iceberg = + new Schema( + Types.NestedField.optional( + 0, "m", MapType.ofOptional(1, 2, StringType.get(), LongType.get())), + Types.NestedField.required( + 3, + "m2", 
+ MapType.ofOptional( + 4, 5, StringType.get(), ListType.ofOptional(6, TimestampType.withoutZone())))); org.apache.arrow.vector.types.pojo.Schema arrow = ArrowSchemaUtil.convert(iceberg); Assert.assertEquals(iceberg.columns().size(), arrow.getFields().size()); } diff --git a/arrow/src/test/java/org/apache/iceberg/arrow/vectorized/ArrowReaderTest.java b/arrow/src/test/java/org/apache/iceberg/arrow/vectorized/ArrowReaderTest.java index c243bcb1cc65..d37c9884002c 100644 --- a/arrow/src/test/java/org/apache/iceberg/arrow/vectorized/ArrowReaderTest.java +++ b/arrow/src/test/java/org/apache/iceberg/arrow/vectorized/ArrowReaderTest.java @@ -16,9 +16,12 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.arrow.vectorized; +import static org.apache.iceberg.Files.localInput; +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertTrue; + import java.io.File; import java.io.IOException; import java.nio.ByteBuffer; @@ -86,12 +89,9 @@ import org.junit.Test; import org.junit.rules.TemporaryFolder; -import static org.apache.iceberg.Files.localInput; -import static org.junit.Assert.assertEquals; -import static org.junit.Assert.assertTrue; - /** * Test cases for {@link ArrowReader}. + * *
All tests create a table with monthly partitions and write 1 year of data to the table. */ public class ArrowReaderTest { @@ -123,11 +123,9 @@ public class ArrowReaderTest { "time", "time_nullable", "uuid", - "uuid_nullable" - ); + "uuid_nullable"); - @Rule - public final TemporaryFolder temp = new TemporaryFolder(); + @Rule public final TemporaryFolder temp = new TemporaryFolder(); private HadoopTables tables; @@ -135,22 +133,26 @@ public class ArrowReaderTest { private List rowsWritten; /** - * Read all rows and columns from the table without any filter. The test asserts that the Arrow {@link - * VectorSchemaRoot} contains the expected schema and expected vector types. Then the test asserts that the vectors - * contains expected values. The test also asserts the total number of rows match the expected value. + * Read all rows and columns from the table without any filter. The test asserts that the Arrow + * {@link VectorSchemaRoot} contains the expected schema and expected vector types. Then the test + * asserts that the vectors contains expected values. The test also asserts the total number of + * rows match the expected value. */ @Test public void testReadAll() throws Exception { writeTableWithIncrementalRecords(); Table table = tables.load(tableLocation); - readAndCheckQueryResult(table.newScan(), NUM_ROWS_PER_MONTH, 12 * NUM_ROWS_PER_MONTH, ALL_COLUMNS); + readAndCheckQueryResult( + table.newScan(), NUM_ROWS_PER_MONTH, 12 * NUM_ROWS_PER_MONTH, ALL_COLUMNS); } /** - * This test writes each partition with constant value rows. The Arrow vectors returned are mostly of type int32 - * which is unexpected. This is happening because of dictionary encoding at the storage level. - *
- * Following are the expected and actual Arrow schema: + * This test writes each partition with constant value rows. The Arrow vectors returned are mostly + * of type int32 which is unexpected. This is happening because of dictionary encoding at the + * storage level. + * + *
Following are the expected and actual Arrow schema: + * *
    * Expected Arrow Schema:
    * timestamp: Timestamp(MICROSECOND, null) not null,
@@ -198,26 +200,29 @@ public void testReadAll() throws Exception {
    * date_nullable: Date(DAY),
    * int_promotion: Int(32, true) not null
    * </pre>
- *
- * TODO: fix the returned Arrow vectors to have vector types consistent with Iceberg types. - *
- * Read all rows and columns from the table without any filter. The test asserts that the Arrow {@link - * VectorSchemaRoot} contains the expected schema and expected vector types. Then the test asserts that the vectors - * contains expected values. The test also asserts the total number of rows match the expected value. + * + *
TODO: fix the returned Arrow vectors to have vector types consistent with Iceberg types. + * + *
Read all rows and columns from the table without any filter. The test asserts that the Arrow + * {@link VectorSchemaRoot} contains the expected schema and expected vector types. Then the test + * asserts that the vectors contains expected values. The test also asserts the total number of + * rows match the expected value. */ @Test @Ignore public void testReadAllWithConstantRecords() throws Exception { writeTableWithConstantRecords(); Table table = tables.load(tableLocation); - readAndCheckQueryResult(table.newScan(), NUM_ROWS_PER_MONTH, 12 * NUM_ROWS_PER_MONTH, ALL_COLUMNS); + readAndCheckQueryResult( + table.newScan(), NUM_ROWS_PER_MONTH, 12 * NUM_ROWS_PER_MONTH, ALL_COLUMNS); } /** - * Read all rows and columns from the table without any filter. The test uses a batch size smaller than the number of - * rows in a partition. The test asserts that the Arrow {@link VectorSchemaRoot} contains the expected schema and - * expected vector types. Then the test asserts that the vectors contains expected values. The test also asserts the - * total number of rows match the expected value. + * Read all rows and columns from the table without any filter. The test uses a batch size smaller + * than the number of rows in a partition. The test asserts that the Arrow {@link + * VectorSchemaRoot} contains the expected schema and expected vector types. Then the test asserts + * that the vectors contains expected values. The test also asserts the total number of rows match + * the expected value. */ @Test public void testReadAllWithSmallerBatchSize() throws Exception { @@ -228,9 +233,10 @@ public void testReadAllWithSmallerBatchSize() throws Exception { } /** - * Read selected rows and all columns from the table using a time range row filter. The test asserts that the Arrow - * {@link VectorSchemaRoot} contains the expected schema and expected vector types. Then the test asserts that the - * vectors contains expected values. The test also asserts the total number of rows match the expected value. + * Read selected rows and all columns from the table using a time range row filter. The test + * asserts that the Arrow {@link VectorSchemaRoot} contains the expected schema and expected + * vector types. Then the test asserts that the vectors contains expected values. The test also + * asserts the total number of rows match the expected value. */ @Test public void testReadRangeFilter() throws Exception { @@ -238,16 +244,19 @@ public void testReadRangeFilter() throws Exception { Table table = tables.load(tableLocation); LocalDateTime beginTime = LocalDateTime.of(2020, 1, 1, 0, 0, 0); LocalDateTime endTime = LocalDateTime.of(2020, 2, 1, 0, 0, 0); - TableScan scan = table.newScan() - .filter(Expressions.and( - Expressions.greaterThanOrEqual("timestamp", timestampToMicros(beginTime)), - Expressions.lessThan("timestamp", timestampToMicros(endTime)))); + TableScan scan = + table + .newScan() + .filter( + Expressions.and( + Expressions.greaterThanOrEqual("timestamp", timestampToMicros(beginTime)), + Expressions.lessThan("timestamp", timestampToMicros(endTime)))); readAndCheckQueryResult(scan, NUM_ROWS_PER_MONTH, NUM_ROWS_PER_MONTH, ALL_COLUMNS); } /** - * Read selected rows and all columns from the table using a time range row filter. - * The test asserts that the result is empty. + * Read selected rows and all columns from the table using a time range row filter. The test + * asserts that the result is empty. 
*/ @Test public void testReadRangeFilterEmptyResult() throws Exception { @@ -255,12 +264,16 @@ public void testReadRangeFilterEmptyResult() throws Exception { Table table = tables.load(tableLocation); LocalDateTime beginTime = LocalDateTime.of(2021, 1, 1, 0, 0, 0); LocalDateTime endTime = LocalDateTime.of(2021, 2, 1, 0, 0, 0); - TableScan scan = table.newScan() - .filter(Expressions.and( + TableScan scan = + table + .newScan() + .filter( + Expressions.and( Expressions.greaterThanOrEqual("timestamp", timestampToMicros(beginTime)), Expressions.lessThan("timestamp", timestampToMicros(endTime)))); int numRoots = 0; - try (VectorizedTableScanIterable itr = new VectorizedTableScanIterable(scan, NUM_ROWS_PER_MONTH, false)) { + try (VectorizedTableScanIterable itr = + new VectorizedTableScanIterable(scan, NUM_ROWS_PER_MONTH, false)) { for (ColumnarBatch batch : itr) { numRoots++; } @@ -269,40 +282,41 @@ public void testReadRangeFilterEmptyResult() throws Exception { } /** - * Read all rows and selected columns from the table with a column selection filter. The test asserts that the Arrow - * {@link VectorSchemaRoot} contains the expected schema and expected vector types. Then the test asserts that the - * vectors contains expected values. The test also asserts the total number of rows match the expected value. + * Read all rows and selected columns from the table with a column selection filter. The test + * asserts that the Arrow {@link VectorSchemaRoot} contains the expected schema and expected + * vector types. Then the test asserts that the vectors contains expected values. The test also + * asserts the total number of rows match the expected value. */ @Test public void testReadColumnFilter1() throws Exception { writeTableWithIncrementalRecords(); Table table = tables.load(tableLocation); - TableScan scan = table.newScan() - .select("timestamp", "int", "string"); + TableScan scan = table.newScan().select("timestamp", "int", "string"); readAndCheckQueryResult( - scan, NUM_ROWS_PER_MONTH, 12 * NUM_ROWS_PER_MONTH, + scan, + NUM_ROWS_PER_MONTH, + 12 * NUM_ROWS_PER_MONTH, ImmutableList.of("timestamp", "int", "string")); } /** - * Read all rows and a single column from the table with a column selection filter. The test asserts that the Arrow - * {@link VectorSchemaRoot} contains the expected schema and expected vector types. Then the test asserts that the - * vectors contains expected values. The test also asserts the total number of rows match the expected value. + * Read all rows and a single column from the table with a column selection filter. The test + * asserts that the Arrow {@link VectorSchemaRoot} contains the expected schema and expected + * vector types. Then the test asserts that the vectors contains expected values. The test also + * asserts the total number of rows match the expected value. */ @Test public void testReadColumnFilter2() throws Exception { writeTableWithIncrementalRecords(); Table table = tables.load(tableLocation); - TableScan scan = table.newScan() - .select("timestamp"); + TableScan scan = table.newScan().select("timestamp"); readAndCheckQueryResult( - scan, NUM_ROWS_PER_MONTH, 12 * NUM_ROWS_PER_MONTH, - ImmutableList.of("timestamp")); + scan, NUM_ROWS_PER_MONTH, 12 * NUM_ROWS_PER_MONTH, ImmutableList.of("timestamp")); } /** - * The test asserts that {@link CloseableIterator#hasNext()} returned - * by the {@link ArrowReader} is idempotent. + * The test asserts that {@link CloseableIterator#hasNext()} returned by the {@link ArrowReader} + * is idempotent. 
*/ @Test public void testHasNextIsIdempotent() throws Exception { @@ -310,37 +324,38 @@ public void testHasNextIsIdempotent() throws Exception { Table table = tables.load(tableLocation); TableScan scan = table.newScan(); // Call hasNext() 0 extra times. - readAndCheckHasNextIsIdempotent(scan, NUM_ROWS_PER_MONTH, 12 * NUM_ROWS_PER_MONTH, 0, ALL_COLUMNS); + readAndCheckHasNextIsIdempotent( + scan, NUM_ROWS_PER_MONTH, 12 * NUM_ROWS_PER_MONTH, 0, ALL_COLUMNS); // Call hasNext() 1 extra time. - readAndCheckHasNextIsIdempotent(scan, NUM_ROWS_PER_MONTH, 12 * NUM_ROWS_PER_MONTH, 1, ALL_COLUMNS); + readAndCheckHasNextIsIdempotent( + scan, NUM_ROWS_PER_MONTH, 12 * NUM_ROWS_PER_MONTH, 1, ALL_COLUMNS); // Call hasNext() 2 extra times. - readAndCheckHasNextIsIdempotent(scan, NUM_ROWS_PER_MONTH, 12 * NUM_ROWS_PER_MONTH, 2, ALL_COLUMNS); + readAndCheckHasNextIsIdempotent( + scan, NUM_ROWS_PER_MONTH, 12 * NUM_ROWS_PER_MONTH, 2, ALL_COLUMNS); } /** * Run the following verifications: + * *

    - *
  1. Read the data and verify that the returned ColumnarBatches match expected rows.
  2. - *
  3. Read the data and verify that the returned Arrow VectorSchemaRoots match expected rows.
  4. + *
  5. Read the data and verify that the returned ColumnarBatches match expected rows. + *
  6. Read the data and verify that the returned Arrow VectorSchemaRoots match expected rows. *
*/ private void readAndCheckQueryResult( - TableScan scan, - int numRowsPerRoot, - int expectedTotalRows, - List columns) throws IOException { + TableScan scan, int numRowsPerRoot, int expectedTotalRows, List columns) + throws IOException { // Read the data and verify that the returned ColumnarBatches match expected rows. readAndCheckColumnarBatch(scan, numRowsPerRoot, columns); // Read the data and verify that the returned Arrow VectorSchemaRoots match expected rows. readAndCheckArrowResult(scan, numRowsPerRoot, expectedTotalRows, columns); } - private void readAndCheckColumnarBatch( - TableScan scan, - int numRowsPerRoot, - List columns) throws IOException { + private void readAndCheckColumnarBatch(TableScan scan, int numRowsPerRoot, List columns) + throws IOException { int rowIndex = 0; - try (VectorizedTableScanIterable itr = new VectorizedTableScanIterable(scan, numRowsPerRoot, false)) { + try (VectorizedTableScanIterable itr = + new VectorizedTableScanIterable(scan, numRowsPerRoot, false)) { for (ColumnarBatch batch : itr) { List expectedRows = rowsWritten.subList(rowIndex, rowIndex + numRowsPerRoot); checkColumnarBatch(numRowsPerRoot, expectedRows, batch, columns); @@ -350,14 +365,13 @@ private void readAndCheckColumnarBatch( } private void readAndCheckArrowResult( - TableScan scan, - int numRowsPerRoot, - int expectedTotalRows, - List columns) throws IOException { + TableScan scan, int numRowsPerRoot, int expectedTotalRows, List columns) + throws IOException { Set columnSet = ImmutableSet.copyOf(columns); int rowIndex = 0; int totalRows = 0; - try (VectorizedTableScanIterable itr = new VectorizedTableScanIterable(scan, numRowsPerRoot, false)) { + try (VectorizedTableScanIterable itr = + new VectorizedTableScanIterable(scan, numRowsPerRoot, false)) { for (ColumnarBatch batch : itr) { List expectedRows = rowsWritten.subList(rowIndex, rowIndex + numRowsPerRoot); VectorSchemaRoot root = batch.createVectorSchemaRootFromVectors(); @@ -376,11 +390,13 @@ private void readAndCheckHasNextIsIdempotent( int numRowsPerRoot, int expectedTotalRows, int numExtraCallsToHasNext, - List columns) throws IOException { + List columns) + throws IOException { Set columnSet = ImmutableSet.copyOf(columns); int rowIndex = 0; int totalRows = 0; - try (VectorizedTableScanIterable itr = new VectorizedTableScanIterable(scan, numRowsPerRoot, false)) { + try (VectorizedTableScanIterable itr = + new VectorizedTableScanIterable(scan, numRowsPerRoot, false)) { CloseableIterator iterator = itr.iterator(); while (iterator.hasNext()) { // Call hasNext() a few extra times. 
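The hunks above all funnel reads through VectorizedTableScanIterable, which turns an Iceberg TableScan into a stream of Arrow-backed ColumnarBatch objects (and, via createVectorSchemaRootFromVectors(), into standard Arrow VectorSchemaRoots). As a rough orientation only, and not part of this diff, a standalone consumer of that API might look like the following sketch. It assumes an existing HadoopTables table at a hypothetical tableLocation and follows the accessor usage shown in the test (batch.column(...), ColumnVector.getLong(...), the batch-size argument, and the boolean flag the test passes as false, which appears to control vector reuse between batches).

    import java.io.IOException;
    import org.apache.arrow.vector.VectorSchemaRoot;
    import org.apache.iceberg.Table;
    import org.apache.iceberg.TableScan;
    import org.apache.iceberg.arrow.vectorized.ColumnarBatch;
    import org.apache.iceberg.arrow.vectorized.VectorizedTableScanIterable;
    import org.apache.iceberg.hadoop.HadoopTables;

    public class ArrowScanSketch {
      public static void main(String[] args) throws IOException {
        String tableLocation = args[0]; // hypothetical path to an existing Iceberg table
        Table table = new HadoopTables().load(tableLocation);
        // Project two columns, mirroring testReadColumnFilter1 above.
        TableScan scan = table.newScan().select("timestamp", "int");
        // 1024 rows per ColumnarBatch; 'false' mirrors the test's third constructor argument.
        try (VectorizedTableScanIterable itr =
            new VectorizedTableScanIterable(scan, 1024, false)) {
          for (ColumnarBatch batch : itr) {
            if (batch.numRows() > 0) {
              // First projected column is "timestamp"; values are microseconds since epoch.
              long firstTimestampMicros = batch.column(0).getLong(0);
              System.out.println("first timestamp (micros): " + firstTimestampMicros);
            }
            // Bridge to a plain Arrow VectorSchemaRoot for downstream Arrow consumers.
            VectorSchemaRoot root = batch.createVectorSchemaRootFromVectors();
            System.out.println(root.getRowCount() + " rows x " + batch.numCols() + " columns");
          }
        }
      }
    }

The assumption that column 0 corresponds to "timestamp" follows the projection used here; it is part of this sketch rather than something the diff itself states.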
@@ -419,159 +435,236 @@ private void checkColumnarBatch( assertEquals(columns.size(), batch.numCols()); checkColumnarArrayValues( - expectedNumRows, expectedRows, batch, columnNameToIndex.get("timestamp"), - columnSet, "timestamp", + expectedNumRows, + expectedRows, + batch, + columnNameToIndex.get("timestamp"), + columnSet, + "timestamp", (records, i) -> records.get(i).getField("timestamp"), - (array, i) -> timestampFromMicros(array.getLong(i)) - ); + (array, i) -> timestampFromMicros(array.getLong(i))); checkColumnarArrayValues( - expectedNumRows, expectedRows, batch, columnNameToIndex.get("timestamp_nullable"), - columnSet, "timestamp_nullable", + expectedNumRows, + expectedRows, + batch, + columnNameToIndex.get("timestamp_nullable"), + columnSet, + "timestamp_nullable", (records, i) -> records.get(i).getField("timestamp_nullable"), - (array, i) -> timestampFromMicros(array.getLong(i)) - ); + (array, i) -> timestampFromMicros(array.getLong(i))); checkColumnarArrayValues( - expectedNumRows, expectedRows, batch, columnNameToIndex.get("boolean"), - columnSet, "boolean", + expectedNumRows, + expectedRows, + batch, + columnNameToIndex.get("boolean"), + columnSet, + "boolean", (records, i) -> records.get(i).getField("boolean"), - ColumnVector::getBoolean - ); + ColumnVector::getBoolean); checkColumnarArrayValues( - expectedNumRows, expectedRows, batch, columnNameToIndex.get("boolean_nullable"), - columnSet, "boolean_nullable", + expectedNumRows, + expectedRows, + batch, + columnNameToIndex.get("boolean_nullable"), + columnSet, + "boolean_nullable", (records, i) -> records.get(i).getField("boolean_nullable"), - ColumnVector::getBoolean - ); + ColumnVector::getBoolean); checkColumnarArrayValues( - expectedNumRows, expectedRows, batch, columnNameToIndex.get("int"), - columnSet, "int", + expectedNumRows, + expectedRows, + batch, + columnNameToIndex.get("int"), + columnSet, + "int", (records, i) -> records.get(i).getField("int"), - ColumnVector::getInt - ); + ColumnVector::getInt); checkColumnarArrayValues( - expectedNumRows, expectedRows, batch, columnNameToIndex.get("int_nullable"), - columnSet, "int_nullable", + expectedNumRows, + expectedRows, + batch, + columnNameToIndex.get("int_nullable"), + columnSet, + "int_nullable", (records, i) -> records.get(i).getField("int_nullable"), - ColumnVector::getInt - ); + ColumnVector::getInt); checkColumnarArrayValues( - expectedNumRows, expectedRows, batch, columnNameToIndex.get("long"), - columnSet, "long", + expectedNumRows, + expectedRows, + batch, + columnNameToIndex.get("long"), + columnSet, + "long", (records, i) -> records.get(i).getField("long"), - ColumnVector::getLong - ); + ColumnVector::getLong); checkColumnarArrayValues( - expectedNumRows, expectedRows, batch, columnNameToIndex.get("long_nullable"), - columnSet, "long_nullable", + expectedNumRows, + expectedRows, + batch, + columnNameToIndex.get("long_nullable"), + columnSet, + "long_nullable", (records, i) -> records.get(i).getField("long_nullable"), - ColumnVector::getLong - ); + ColumnVector::getLong); checkColumnarArrayValues( - expectedNumRows, expectedRows, batch, columnNameToIndex.get("float"), - columnSet, "float", + expectedNumRows, + expectedRows, + batch, + columnNameToIndex.get("float"), + columnSet, + "float", (records, i) -> Float.floatToIntBits((float) records.get(i).getField("float")), - (array, i) -> Float.floatToIntBits(array.getFloat(i)) - ); + (array, i) -> Float.floatToIntBits(array.getFloat(i))); checkColumnarArrayValues( - expectedNumRows, expectedRows, batch, 
columnNameToIndex.get("float_nullable"), - columnSet, "float_nullable", + expectedNumRows, + expectedRows, + batch, + columnNameToIndex.get("float_nullable"), + columnSet, + "float_nullable", (records, i) -> Float.floatToIntBits((float) records.get(i).getField("float_nullable")), - (array, i) -> Float.floatToIntBits(array.getFloat(i)) - ); + (array, i) -> Float.floatToIntBits(array.getFloat(i))); checkColumnarArrayValues( - expectedNumRows, expectedRows, batch, columnNameToIndex.get("double"), - columnSet, "double", + expectedNumRows, + expectedRows, + batch, + columnNameToIndex.get("double"), + columnSet, + "double", (records, i) -> Double.doubleToLongBits((double) records.get(i).getField("double")), - (array, i) -> Double.doubleToLongBits(array.getDouble(i)) - ); + (array, i) -> Double.doubleToLongBits(array.getDouble(i))); checkColumnarArrayValues( - expectedNumRows, expectedRows, batch, columnNameToIndex.get("double_nullable"), - columnSet, "double_nullable", - (records, i) -> Double.doubleToLongBits((double) records.get(i).getField("double_nullable")), - (array, i) -> Double.doubleToLongBits(array.getDouble(i)) - ); + expectedNumRows, + expectedRows, + batch, + columnNameToIndex.get("double_nullable"), + columnSet, + "double_nullable", + (records, i) -> + Double.doubleToLongBits((double) records.get(i).getField("double_nullable")), + (array, i) -> Double.doubleToLongBits(array.getDouble(i))); checkColumnarArrayValues( - expectedNumRows, expectedRows, batch, columnNameToIndex.get("timestamp_tz"), - columnSet, "timestamp_tz", + expectedNumRows, + expectedRows, + batch, + columnNameToIndex.get("timestamp_tz"), + columnSet, + "timestamp_tz", (records, i) -> timestampToMicros((OffsetDateTime) records.get(i).getField("timestamp_tz")), - ColumnVector::getLong - ); + ColumnVector::getLong); checkColumnarArrayValues( - expectedNumRows, expectedRows, batch, columnNameToIndex.get("timestamp_tz_nullable"), - columnSet, "timestamp_tz_nullable", - (records, i) -> timestampToMicros((OffsetDateTime) records.get(i).getField("timestamp_tz_nullable")), - ColumnVector::getLong - ); + expectedNumRows, + expectedRows, + batch, + columnNameToIndex.get("timestamp_tz_nullable"), + columnSet, + "timestamp_tz_nullable", + (records, i) -> + timestampToMicros((OffsetDateTime) records.get(i).getField("timestamp_tz_nullable")), + ColumnVector::getLong); checkColumnarArrayValues( - expectedNumRows, expectedRows, batch, columnNameToIndex.get("string"), - columnSet, "string", + expectedNumRows, + expectedRows, + batch, + columnNameToIndex.get("string"), + columnSet, + "string", (records, i) -> records.get(i).getField("string"), - ColumnVector::getString - ); + ColumnVector::getString); checkColumnarArrayValues( - expectedNumRows, expectedRows, batch, columnNameToIndex.get("string_nullable"), - columnSet, "string_nullable", + expectedNumRows, + expectedRows, + batch, + columnNameToIndex.get("string_nullable"), + columnSet, + "string_nullable", (records, i) -> records.get(i).getField("string_nullable"), - ColumnVector::getString - ); + ColumnVector::getString); checkColumnarArrayValues( - expectedNumRows, expectedRows, batch, columnNameToIndex.get("bytes"), - columnSet, "bytes", + expectedNumRows, + expectedRows, + batch, + columnNameToIndex.get("bytes"), + columnSet, + "bytes", (records, i) -> records.get(i).getField("bytes"), - (array, i) -> ByteBuffer.wrap(array.getBinary(i)) - ); + (array, i) -> ByteBuffer.wrap(array.getBinary(i))); checkColumnarArrayValues( - expectedNumRows, expectedRows, batch, 
columnNameToIndex.get("bytes_nullable"), - columnSet, "bytes_nullable", + expectedNumRows, + expectedRows, + batch, + columnNameToIndex.get("bytes_nullable"), + columnSet, + "bytes_nullable", (records, i) -> records.get(i).getField("bytes_nullable"), - (array, i) -> ByteBuffer.wrap(array.getBinary(i)) - ); + (array, i) -> ByteBuffer.wrap(array.getBinary(i))); checkColumnarArrayValues( - expectedNumRows, expectedRows, batch, columnNameToIndex.get("date"), - columnSet, "date", + expectedNumRows, + expectedRows, + batch, + columnNameToIndex.get("date"), + columnSet, + "date", (records, i) -> records.get(i).getField("date"), - (array, i) -> dateFromDay(array.getInt(i)) - ); + (array, i) -> dateFromDay(array.getInt(i))); checkColumnarArrayValues( - expectedNumRows, expectedRows, batch, columnNameToIndex.get("date_nullable"), - columnSet, "date_nullable", + expectedNumRows, + expectedRows, + batch, + columnNameToIndex.get("date_nullable"), + columnSet, + "date_nullable", (records, i) -> records.get(i).getField("date_nullable"), - (array, i) -> dateFromDay(array.getInt(i)) - ); + (array, i) -> dateFromDay(array.getInt(i))); checkColumnarArrayValues( - expectedNumRows, expectedRows, batch, columnNameToIndex.get("int_promotion"), - columnSet, "int_promotion", + expectedNumRows, + expectedRows, + batch, + columnNameToIndex.get("int_promotion"), + columnSet, + "int_promotion", (records, i) -> records.get(i).getField("int_promotion"), - ColumnVector::getInt - ); + ColumnVector::getInt); checkColumnarArrayValues( - expectedNumRows, expectedRows, batch, columnNameToIndex.get("uuid"), - columnSet, "uuid", + expectedNumRows, + expectedRows, + batch, + columnNameToIndex.get("uuid"), + columnSet, + "uuid", (records, i) -> records.get(i).getField("uuid"), - ColumnVector::getBinary + ColumnVector::getBinary); - ); checkColumnarArrayValues( - expectedNumRows, expectedRows, batch, columnNameToIndex.get("uuid_nullable"), - columnSet, "uuid_nullable", + expectedNumRows, + expectedRows, + batch, + columnNameToIndex.get("uuid_nullable"), + columnSet, + "uuid_nullable", (records, i) -> records.get(i).getField("uuid_nullable"), - ColumnVector::getBinary - ); + ColumnVector::getBinary); checkColumnarArrayValues( - expectedNumRows, expectedRows, batch, columnNameToIndex.get("time"), - columnSet, "time", + expectedNumRows, + expectedRows, + batch, + columnNameToIndex.get("time"), + columnSet, + "time", (records, i) -> records.get(i).getField("time"), - (array, i) -> LocalTime.ofNanoOfDay(array.getLong(i) * 1000) - ); + (array, i) -> LocalTime.ofNanoOfDay(array.getLong(i) * 1000)); checkColumnarArrayValues( - expectedNumRows, expectedRows, batch, columnNameToIndex.get("time_nullable"), - columnSet, "time_nullable", + expectedNumRows, + expectedRows, + batch, + columnNameToIndex.get("time_nullable"), + columnSet, + "time_nullable", (records, i) -> records.get(i).getField("time_nullable"), - (array, i) -> LocalTime.ofNanoOfDay(array.getLong(i) * 1000) - ); + (array, i) -> LocalTime.ofNanoOfDay(array.getLong(i) * 1000)); } private static void checkColumnarArrayValues( @@ -588,7 +681,8 @@ private static void checkColumnarArrayValues( for (int i = 0; i < expectedNumRows; i++) { Object expectedValue = expectedValueExtractor.apply(expectedRows, i); Object actualValue = vectorValueExtractor.apply(columnVector, i); - // we need to use assertThat() here because it does a java.util.Objects.deepEquals() and that + // we need to use assertThat() here because it does a java.util.Objects.deepEquals() and + // that // is relevant for 
byte[] Assertions.assertThat(actualValue).as("Row#" + i + " mismatches").isEqualTo(expectedValue); } @@ -608,37 +702,35 @@ private void writeTable(boolean constantRecords) throws Exception { tables = new HadoopTables(); tableLocation = temp.newFolder("test").toString(); - Schema schema = new Schema( - Types.NestedField.required(1, "timestamp", Types.TimestampType.withoutZone()), - Types.NestedField.optional(2, "timestamp_nullable", Types.TimestampType.withoutZone()), - Types.NestedField.required(3, "boolean", Types.BooleanType.get()), - Types.NestedField.optional(4, "boolean_nullable", Types.BooleanType.get()), - Types.NestedField.required(5, "int", Types.IntegerType.get()), - Types.NestedField.optional(6, "int_nullable", Types.IntegerType.get()), - Types.NestedField.required(7, "long", Types.LongType.get()), - Types.NestedField.optional(8, "long_nullable", Types.LongType.get()), - Types.NestedField.required(9, "float", Types.FloatType.get()), - Types.NestedField.optional(10, "float_nullable", Types.FloatType.get()), - Types.NestedField.required(11, "double", Types.DoubleType.get()), - Types.NestedField.optional(12, "double_nullable", Types.DoubleType.get()), - Types.NestedField.required(13, "timestamp_tz", Types.TimestampType.withZone()), - Types.NestedField.optional(14, "timestamp_tz_nullable", Types.TimestampType.withZone()), - Types.NestedField.required(15, "string", Types.StringType.get()), - Types.NestedField.optional(16, "string_nullable", Types.StringType.get()), - Types.NestedField.required(17, "bytes", Types.BinaryType.get()), - Types.NestedField.optional(18, "bytes_nullable", Types.BinaryType.get()), - Types.NestedField.required(19, "date", Types.DateType.get()), - Types.NestedField.optional(20, "date_nullable", Types.DateType.get()), - Types.NestedField.required(21, "int_promotion", Types.IntegerType.get()), - Types.NestedField.required(22, "time", Types.TimeType.get()), - Types.NestedField.optional(23, "time_nullable", Types.TimeType.get()), - Types.NestedField.required(24, "uuid", Types.UUIDType.get()), - Types.NestedField.optional(25, "uuid_nullable", Types.UUIDType.get()) - ); + Schema schema = + new Schema( + Types.NestedField.required(1, "timestamp", Types.TimestampType.withoutZone()), + Types.NestedField.optional(2, "timestamp_nullable", Types.TimestampType.withoutZone()), + Types.NestedField.required(3, "boolean", Types.BooleanType.get()), + Types.NestedField.optional(4, "boolean_nullable", Types.BooleanType.get()), + Types.NestedField.required(5, "int", Types.IntegerType.get()), + Types.NestedField.optional(6, "int_nullable", Types.IntegerType.get()), + Types.NestedField.required(7, "long", Types.LongType.get()), + Types.NestedField.optional(8, "long_nullable", Types.LongType.get()), + Types.NestedField.required(9, "float", Types.FloatType.get()), + Types.NestedField.optional(10, "float_nullable", Types.FloatType.get()), + Types.NestedField.required(11, "double", Types.DoubleType.get()), + Types.NestedField.optional(12, "double_nullable", Types.DoubleType.get()), + Types.NestedField.required(13, "timestamp_tz", Types.TimestampType.withZone()), + Types.NestedField.optional(14, "timestamp_tz_nullable", Types.TimestampType.withZone()), + Types.NestedField.required(15, "string", Types.StringType.get()), + Types.NestedField.optional(16, "string_nullable", Types.StringType.get()), + Types.NestedField.required(17, "bytes", Types.BinaryType.get()), + Types.NestedField.optional(18, "bytes_nullable", Types.BinaryType.get()), + Types.NestedField.required(19, "date", 
Types.DateType.get()), + Types.NestedField.optional(20, "date_nullable", Types.DateType.get()), + Types.NestedField.required(21, "int_promotion", Types.IntegerType.get()), + Types.NestedField.required(22, "time", Types.TimeType.get()), + Types.NestedField.optional(23, "time_nullable", Types.TimeType.get()), + Types.NestedField.required(24, "uuid", Types.UUIDType.get()), + Types.NestedField.optional(25, "uuid_nullable", Types.UUIDType.get())); - PartitionSpec spec = PartitionSpec.builderFor(schema) - .month("timestamp") - .build(); + PartitionSpec spec = PartitionSpec.builderFor(schema).month("timestamp").build(); Table table = tables.create(schema, spec, tableLocation); @@ -646,13 +738,11 @@ private void writeTable(boolean constantRecords) throws Exception { for (int i = 1; i <= 12; i++) { final List records; if (constantRecords) { - records = createConstantRecordsForDate( - table.schema(), LocalDateTime.of(2020, i, 1, 0, 0, 0) - ); + records = + createConstantRecordsForDate(table.schema(), LocalDateTime.of(2020, i, 1, 0, 0, 0)); } else { - records = createIncrementalRecordsForDate( - table.schema(), LocalDateTime.of(2020, i, 1, 0, 0, 0) - ); + records = + createIncrementalRecordsForDate(table.schema(), LocalDateTime.of(2020, i, 1, 0, 0, 0)); } overwrite.addFile(writeParquetFile(table, records)); } @@ -661,73 +751,74 @@ private void writeTable(boolean constantRecords) throws Exception { // Perform a type promotion // TODO: The read Arrow vector should of type BigInt (promoted type) but it is Int (old type). Table tableLatest = tables.load(tableLocation); - tableLatest.updateSchema() - .updateColumn("int_promotion", Types.LongType.get()) - .commit(); + tableLatest.updateSchema().updateColumn("int_promotion", Types.LongType.get()).commit(); } - private static org.apache.arrow.vector.types.pojo.Schema createExpectedArrowSchema(Set columnSet) { - List allFields = ImmutableList.of( - new Field( - "timestamp", new FieldType(false, MinorType.TIMESTAMPMICRO.getType(), null), null), - new Field( - "timestamp_nullable", new FieldType(true, MinorType.TIMESTAMPMICRO.getType(), null), null), - new Field( - "boolean", new FieldType(false, MinorType.BIT.getType(), null), null), - new Field( - "boolean_nullable", new FieldType(true, MinorType.BIT.getType(), null), null), - new Field( - "int", new FieldType(false, MinorType.INT.getType(), null), null), - new Field( - "int_nullable", new FieldType(true, MinorType.INT.getType(), null), null), - new Field( - "long", new FieldType(false, MinorType.BIGINT.getType(), null), null), - new Field( - "long_nullable", new FieldType(true, MinorType.BIGINT.getType(), null), null), - new Field( - "float", new FieldType(false, MinorType.FLOAT4.getType(), null), null), - new Field( - "float_nullable", new FieldType(true, MinorType.FLOAT4.getType(), null), null), - new Field( - "double", new FieldType(false, MinorType.FLOAT8.getType(), null), null), - new Field( - "double_nullable", new FieldType(true, MinorType.FLOAT8.getType(), null), null), - new Field( - "timestamp_tz", new FieldType(false, new ArrowType.Timestamp( - org.apache.arrow.vector.types.TimeUnit.MICROSECOND, "UTC"), null), null), - new Field( - "timestamp_tz_nullable", new FieldType(true, new ArrowType.Timestamp( - org.apache.arrow.vector.types.TimeUnit.MICROSECOND, "UTC"), null), null), - new Field( - "string", new FieldType(false, MinorType.VARCHAR.getType(), null), null), - new Field( - "string_nullable", new FieldType(true, MinorType.VARCHAR.getType(), null), null), - new Field( - "bytes", new 
FieldType(false, MinorType.VARBINARY.getType(), null), null), - new Field( - "bytes_nullable", new FieldType(true, MinorType.VARBINARY.getType(), null), null), - new Field( - "date", new FieldType(false, MinorType.DATEDAY.getType(), null), null), - new Field( - "date_nullable", new FieldType(true, MinorType.DATEDAY.getType(), null), null), - new Field( - "int_promotion", new FieldType(false, MinorType.INT.getType(), null), null), - new Field( - "time", new FieldType(false, MinorType.TIMEMICRO.getType(), null), null), - new Field( - "time_nullable", new FieldType(true, MinorType.TIMEMICRO.getType(), null), null), - new Field( - "uuid", new FieldType(false, new ArrowType.FixedSizeBinary(16), null), null), - new Field( - "uuid_nullable", new FieldType(true, new ArrowType.FixedSizeBinary(16), null), null) - ); - List filteredFields = allFields.stream() - .filter(f -> columnSet.contains(f.getName())) - .collect(Collectors.toList()); + private static org.apache.arrow.vector.types.pojo.Schema createExpectedArrowSchema( + Set columnSet) { + List allFields = + ImmutableList.of( + new Field( + "timestamp", new FieldType(false, MinorType.TIMESTAMPMICRO.getType(), null), null), + new Field( + "timestamp_nullable", + new FieldType(true, MinorType.TIMESTAMPMICRO.getType(), null), + null), + new Field("boolean", new FieldType(false, MinorType.BIT.getType(), null), null), + new Field("boolean_nullable", new FieldType(true, MinorType.BIT.getType(), null), null), + new Field("int", new FieldType(false, MinorType.INT.getType(), null), null), + new Field("int_nullable", new FieldType(true, MinorType.INT.getType(), null), null), + new Field("long", new FieldType(false, MinorType.BIGINT.getType(), null), null), + new Field("long_nullable", new FieldType(true, MinorType.BIGINT.getType(), null), null), + new Field("float", new FieldType(false, MinorType.FLOAT4.getType(), null), null), + new Field( + "float_nullable", new FieldType(true, MinorType.FLOAT4.getType(), null), null), + new Field("double", new FieldType(false, MinorType.FLOAT8.getType(), null), null), + new Field( + "double_nullable", new FieldType(true, MinorType.FLOAT8.getType(), null), null), + new Field( + "timestamp_tz", + new FieldType( + false, + new ArrowType.Timestamp( + org.apache.arrow.vector.types.TimeUnit.MICROSECOND, "UTC"), + null), + null), + new Field( + "timestamp_tz_nullable", + new FieldType( + true, + new ArrowType.Timestamp( + org.apache.arrow.vector.types.TimeUnit.MICROSECOND, "UTC"), + null), + null), + new Field("string", new FieldType(false, MinorType.VARCHAR.getType(), null), null), + new Field( + "string_nullable", new FieldType(true, MinorType.VARCHAR.getType(), null), null), + new Field("bytes", new FieldType(false, MinorType.VARBINARY.getType(), null), null), + new Field( + "bytes_nullable", new FieldType(true, MinorType.VARBINARY.getType(), null), null), + new Field("date", new FieldType(false, MinorType.DATEDAY.getType(), null), null), + new Field( + "date_nullable", new FieldType(true, MinorType.DATEDAY.getType(), null), null), + new Field("int_promotion", new FieldType(false, MinorType.INT.getType(), null), null), + new Field("time", new FieldType(false, MinorType.TIMEMICRO.getType(), null), null), + new Field( + "time_nullable", new FieldType(true, MinorType.TIMEMICRO.getType(), null), null), + new Field("uuid", new FieldType(false, new ArrowType.FixedSizeBinary(16), null), null), + new Field( + "uuid_nullable", + new FieldType(true, new ArrowType.FixedSizeBinary(16), null), + null)); + List filteredFields = + 
allFields.stream() + .filter(f -> columnSet.contains(f.getName())) + .collect(Collectors.toList()); return new org.apache.arrow.vector.types.pojo.Schema(filteredFields); } - private List createIncrementalRecordsForDate(Schema schema, LocalDateTime datetime) { + private List createIncrementalRecordsForDate( + Schema schema, LocalDateTime datetime) { List records = Lists.newArrayList(); for (int i = 0; i < NUM_ROWS_PER_MONTH; i++) { GenericRecord rec = GenericRecord.create(schema); @@ -744,11 +835,13 @@ private List createIncrementalRecordsForDate(Schema schema, Local rec.setField("double", (double) i * 4); rec.setField("double_nullable", (double) i * 4); rec.setField("timestamp_tz", datetime.plus(i, ChronoUnit.MINUTES).atOffset(ZoneOffset.UTC)); - rec.setField("timestamp_tz_nullable", datetime.plus(i, ChronoUnit.MINUTES).atOffset(ZoneOffset.UTC)); + rec.setField( + "timestamp_tz_nullable", datetime.plus(i, ChronoUnit.MINUTES).atOffset(ZoneOffset.UTC)); rec.setField("string", "String-" + i); rec.setField("string_nullable", "String-" + i); rec.setField("bytes", ByteBuffer.wrap(("Bytes-" + i).getBytes(StandardCharsets.UTF_8))); - rec.setField("bytes_nullable", ByteBuffer.wrap(("Bytes-" + i).getBytes(StandardCharsets.UTF_8))); + rec.setField( + "bytes_nullable", ByteBuffer.wrap(("Bytes-" + i).getBytes(StandardCharsets.UTF_8))); rec.setField("date", LocalDate.of(2020, 1, 1).plus(i, ChronoUnit.DAYS)); rec.setField("date_nullable", LocalDate.of(2020, 1, 1).plus(i, ChronoUnit.DAYS)); rec.setField("int_promotion", i); @@ -790,7 +883,8 @@ private List createConstantRecordsForDate(Schema schema, LocalDat rec.setField("int_promotion", 1); rec.setField("time", LocalTime.of(11, 30)); rec.setField("time_nullable", LocalTime.of(11, 30)); - ByteBuffer bb = UUIDUtil.convertToByteBuffer(UUID.fromString("abcd91cf-08d0-4223-b145-f64030b3077f")); + ByteBuffer bb = + UUIDUtil.convertToByteBuffer(UUID.fromString("abcd91cf-08d0-4223-b145-f64030b3077f")); byte[] uuid = bb.array(); rec.setField("uuid", uuid); rec.setField("uuid_nullable", uuid); @@ -803,10 +897,11 @@ private DataFile writeParquetFile(Table table, List records) thro rowsWritten.addAll(records); File parquetFile = temp.newFile(); assertTrue(parquetFile.delete()); - FileAppender appender = Parquet.write(Files.localOutput(parquetFile)) - .schema(table.schema()) - .createWriterFunc(GenericParquetWriter::buildWriter) - .build(); + FileAppender appender = + Parquet.write(Files.localOutput(parquetFile)) + .schema(table.schema()) + .createWriterFunc(GenericParquetWriter::buildWriter) + .build(); try { appender.addAll(records); } finally { @@ -838,8 +933,7 @@ private static LocalDateTime timestampFromMicros(long micros) { return LocalDateTime.ofEpochSecond( TimeUnit.MICROSECONDS.toSeconds(micros), (int) TimeUnit.MICROSECONDS.toNanos(micros % 1000), - ZoneOffset.UTC - ); + ZoneOffset.UTC); } private static LocalDate dateFromDay(int day) { @@ -890,133 +984,210 @@ private void checkAllVectorValues( assertEquals(expectedNumRows, root.getRowCount()); checkVectorValues( - expectedNumRows, expectedRows, root, columnSet, "timestamp", + expectedNumRows, + expectedRows, + root, + columnSet, + "timestamp", (records, i) -> records.get(i).getField("timestamp"), - (vector, i) -> timestampFromMicros(((TimeStampMicroVector) vector).get(i)) - ); + (vector, i) -> timestampFromMicros(((TimeStampMicroVector) vector).get(i))); checkVectorValues( - expectedNumRows, expectedRows, root, columnSet, "timestamp_nullable", + expectedNumRows, + expectedRows, + root, + columnSet, + 
"timestamp_nullable", (records, i) -> records.get(i).getField("timestamp_nullable"), - (vector, i) -> timestampFromMicros(((TimeStampMicroVector) vector).get(i)) - ); + (vector, i) -> timestampFromMicros(((TimeStampMicroVector) vector).get(i))); checkVectorValues( - expectedNumRows, expectedRows, root, columnSet, "boolean", + expectedNumRows, + expectedRows, + root, + columnSet, + "boolean", (records, i) -> records.get(i).getField("boolean"), - (vector, i) -> ((BitVector) vector).get(i) == 1 - ); + (vector, i) -> ((BitVector) vector).get(i) == 1); checkVectorValues( - expectedNumRows, expectedRows, root, columnSet, "boolean_nullable", + expectedNumRows, + expectedRows, + root, + columnSet, + "boolean_nullable", (records, i) -> records.get(i).getField("boolean_nullable"), - (vector, i) -> ((BitVector) vector).get(i) == 1 - ); + (vector, i) -> ((BitVector) vector).get(i) == 1); checkVectorValues( - expectedNumRows, expectedRows, root, columnSet, "int", + expectedNumRows, + expectedRows, + root, + columnSet, + "int", (records, i) -> records.get(i).getField("int"), - (vector, i) -> ((IntVector) vector).get(i) - ); + (vector, i) -> ((IntVector) vector).get(i)); checkVectorValues( - expectedNumRows, expectedRows, root, columnSet, "int_nullable", + expectedNumRows, + expectedRows, + root, + columnSet, + "int_nullable", (records, i) -> records.get(i).getField("int_nullable"), - (vector, i) -> ((IntVector) vector).get(i) - ); + (vector, i) -> ((IntVector) vector).get(i)); checkVectorValues( - expectedNumRows, expectedRows, root, columnSet, "long", + expectedNumRows, + expectedRows, + root, + columnSet, + "long", (records, i) -> records.get(i).getField("long"), - (vector, i) -> ((BigIntVector) vector).get(i) - ); + (vector, i) -> ((BigIntVector) vector).get(i)); checkVectorValues( - expectedNumRows, expectedRows, root, columnSet, "long_nullable", + expectedNumRows, + expectedRows, + root, + columnSet, + "long_nullable", (records, i) -> records.get(i).getField("long_nullable"), - (vector, i) -> ((BigIntVector) vector).get(i) - ); + (vector, i) -> ((BigIntVector) vector).get(i)); checkVectorValues( - expectedNumRows, expectedRows, root, columnSet, "float", + expectedNumRows, + expectedRows, + root, + columnSet, + "float", (records, i) -> Float.floatToIntBits((float) records.get(i).getField("float")), - (vector, i) -> Float.floatToIntBits(((Float4Vector) vector).get(i)) - ); + (vector, i) -> Float.floatToIntBits(((Float4Vector) vector).get(i))); checkVectorValues( - expectedNumRows, expectedRows, root, columnSet, "float_nullable", + expectedNumRows, + expectedRows, + root, + columnSet, + "float_nullable", (records, i) -> Float.floatToIntBits((float) records.get(i).getField("float_nullable")), - (vector, i) -> Float.floatToIntBits(((Float4Vector) vector).get(i)) - ); + (vector, i) -> Float.floatToIntBits(((Float4Vector) vector).get(i))); checkVectorValues( - expectedNumRows, expectedRows, root, columnSet, "double", + expectedNumRows, + expectedRows, + root, + columnSet, + "double", (records, i) -> Double.doubleToLongBits((double) records.get(i).getField("double")), - (vector, i) -> Double.doubleToLongBits(((Float8Vector) vector).get(i)) - ); + (vector, i) -> Double.doubleToLongBits(((Float8Vector) vector).get(i))); checkVectorValues( - expectedNumRows, expectedRows, root, columnSet, "double_nullable", - (records, i) -> Double.doubleToLongBits((double) records.get(i).getField("double_nullable")), - (vector, i) -> Double.doubleToLongBits(((Float8Vector) vector).get(i)) - ); + expectedNumRows, + 
expectedRows, + root, + columnSet, + "double_nullable", + (records, i) -> + Double.doubleToLongBits((double) records.get(i).getField("double_nullable")), + (vector, i) -> Double.doubleToLongBits(((Float8Vector) vector).get(i))); checkVectorValues( - expectedNumRows, expectedRows, root, columnSet, "timestamp_tz", + expectedNumRows, + expectedRows, + root, + columnSet, + "timestamp_tz", (records, i) -> timestampToMicros((OffsetDateTime) records.get(i).getField("timestamp_tz")), - (vector, i) -> ((TimeStampMicroTZVector) vector).get(i) - ); + (vector, i) -> ((TimeStampMicroTZVector) vector).get(i)); checkVectorValues( - expectedNumRows, expectedRows, root, columnSet, "timestamp_tz_nullable", - (records, i) -> timestampToMicros((OffsetDateTime) records.get(i).getField("timestamp_tz_nullable")), - (vector, i) -> ((TimeStampMicroTZVector) vector).get(i) - ); + expectedNumRows, + expectedRows, + root, + columnSet, + "timestamp_tz_nullable", + (records, i) -> + timestampToMicros((OffsetDateTime) records.get(i).getField("timestamp_tz_nullable")), + (vector, i) -> ((TimeStampMicroTZVector) vector).get(i)); checkVectorValues( - expectedNumRows, expectedRows, root, columnSet, "string", + expectedNumRows, + expectedRows, + root, + columnSet, + "string", (records, i) -> records.get(i).getField("string"), - (vector, i) -> new String(((VarCharVector) vector).get(i), StandardCharsets.UTF_8) - ); + (vector, i) -> new String(((VarCharVector) vector).get(i), StandardCharsets.UTF_8)); checkVectorValues( - expectedNumRows, expectedRows, root, columnSet, "string_nullable", + expectedNumRows, + expectedRows, + root, + columnSet, + "string_nullable", (records, i) -> records.get(i).getField("string_nullable"), - (vector, i) -> new String(((VarCharVector) vector).get(i), StandardCharsets.UTF_8) - ); + (vector, i) -> new String(((VarCharVector) vector).get(i), StandardCharsets.UTF_8)); checkVectorValues( - expectedNumRows, expectedRows, root, columnSet, "bytes", + expectedNumRows, + expectedRows, + root, + columnSet, + "bytes", (records, i) -> records.get(i).getField("bytes"), - (vector, i) -> ByteBuffer.wrap(((VarBinaryVector) vector).get(i)) - ); + (vector, i) -> ByteBuffer.wrap(((VarBinaryVector) vector).get(i))); checkVectorValues( - expectedNumRows, expectedRows, root, columnSet, "bytes_nullable", + expectedNumRows, + expectedRows, + root, + columnSet, + "bytes_nullable", (records, i) -> records.get(i).getField("bytes_nullable"), - (vector, i) -> ByteBuffer.wrap(((VarBinaryVector) vector).get(i)) - ); + (vector, i) -> ByteBuffer.wrap(((VarBinaryVector) vector).get(i))); checkVectorValues( - expectedNumRows, expectedRows, root, columnSet, "date", + expectedNumRows, + expectedRows, + root, + columnSet, + "date", (records, i) -> records.get(i).getField("date"), - (vector, i) -> dateFromDay(((DateDayVector) vector).get(i)) - ); + (vector, i) -> dateFromDay(((DateDayVector) vector).get(i))); checkVectorValues( - expectedNumRows, expectedRows, root, columnSet, "date_nullable", + expectedNumRows, + expectedRows, + root, + columnSet, + "date_nullable", (records, i) -> records.get(i).getField("date_nullable"), - (vector, i) -> dateFromDay(((DateDayVector) vector).get(i)) - ); + (vector, i) -> dateFromDay(((DateDayVector) vector).get(i))); checkVectorValues( - expectedNumRows, expectedRows, root, columnSet, "int_promotion", + expectedNumRows, + expectedRows, + root, + columnSet, + "int_promotion", (records, i) -> records.get(i).getField("int_promotion"), - (vector, i) -> ((IntVector) vector).get(i) - ); + (vector, i) -> 
((IntVector) vector).get(i)); checkVectorValues( - expectedNumRows, expectedRows, root, columnSet, "uuid", + expectedNumRows, + expectedRows, + root, + columnSet, + "uuid", (records, i) -> records.get(i).getField("uuid"), - (vector, i) -> ((FixedSizeBinaryVector) vector).get(i) - ); + (vector, i) -> ((FixedSizeBinaryVector) vector).get(i)); checkVectorValues( - expectedNumRows, expectedRows, root, columnSet, "uuid_nullable", + expectedNumRows, + expectedRows, + root, + columnSet, + "uuid_nullable", (records, i) -> records.get(i).getField("uuid_nullable"), - (vector, i) -> ((FixedSizeBinaryVector) vector).get(i) - ); + (vector, i) -> ((FixedSizeBinaryVector) vector).get(i)); checkVectorValues( - expectedNumRows, expectedRows, root, columnSet, "time", + expectedNumRows, + expectedRows, + root, + columnSet, + "time", (records, i) -> records.get(i).getField("time"), - (vector, i) -> LocalTime.ofNanoOfDay(((TimeMicroVector) vector).get(i) * 1000) - ); + (vector, i) -> LocalTime.ofNanoOfDay(((TimeMicroVector) vector).get(i) * 1000)); checkVectorValues( - expectedNumRows, expectedRows, root, columnSet, "time_nullable", + expectedNumRows, + expectedRows, + root, + columnSet, + "time_nullable", (records, i) -> records.get(i).getField("time_nullable"), - (vector, i) -> LocalTime.ofNanoOfDay(((TimeMicroVector) vector).get(i) * 1000) - ); + (vector, i) -> LocalTime.ofNanoOfDay(((TimeMicroVector) vector).get(i) * 1000)); } private static void checkVectorValues( @@ -1033,7 +1204,8 @@ private static void checkVectorValues( for (int i = 0; i < expectedNumRows; i++) { Object expectedValue = expectedValueExtractor.apply(expectedRows, i); Object actualValue = vectorValueExtractor.apply(vector, i); - // we need to use assertThat() here because it does a java.util.Objects.deepEquals() and that + // we need to use assertThat() here because it does a java.util.Objects.deepEquals() and + // that // is relevant for byte[] Assertions.assertThat(actualValue).as("Row#" + i + " mismatches").isEqualTo(expectedValue); } diff --git a/arrow/src/test/java/org/apache/iceberg/arrow/vectorized/parquet/DecimalVectorUtilTest.java b/arrow/src/test/java/org/apache/iceberg/arrow/vectorized/parquet/DecimalVectorUtilTest.java index e62c6ae7ba2a..4e78bafd0a1a 100644 --- a/arrow/src/test/java/org/apache/iceberg/arrow/vectorized/parquet/DecimalVectorUtilTest.java +++ b/arrow/src/test/java/org/apache/iceberg/arrow/vectorized/parquet/DecimalVectorUtilTest.java @@ -16,15 +16,14 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.arrow.vectorized.parquet; +import static org.junit.Assert.assertEquals; + import java.math.BigInteger; import org.assertj.core.api.Assertions; import org.junit.Test; -import static org.junit.Assert.assertEquals; - public class DecimalVectorUtilTest { @Test @@ -70,7 +69,7 @@ public void testPadBigEndianBytesZero() { public void testPadBigEndianBytesOverflow() { byte[] bytes = new byte[17]; Assertions.assertThatThrownBy(() -> DecimalVectorUtil.padBigEndianBytes(bytes, 16)) - .isInstanceOf(IllegalArgumentException.class) - .hasMessage("Buffer size of 17 is larger than requested size of 16"); + .isInstanceOf(IllegalArgumentException.class) + .hasMessage("Buffer size of 17 is larger than requested size of 16"); } } diff --git a/aws/src/main/java/org/apache/iceberg/aws/AssumeRoleAwsClientFactory.java b/aws/src/main/java/org/apache/iceberg/aws/AssumeRoleAwsClientFactory.java index ea5a0b5bea45..7b34e8d0b3de 100644 --- a/aws/src/main/java/org/apache/iceberg/aws/AssumeRoleAwsClientFactory.java +++ b/aws/src/main/java/org/apache/iceberg/aws/AssumeRoleAwsClientFactory.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.aws; import java.util.Map; @@ -79,32 +78,41 @@ public DynamoDbClient dynamo() { @Override public void initialize(Map properties) { this.roleArn = properties.get(AwsProperties.CLIENT_ASSUME_ROLE_ARN); - Preconditions.checkNotNull(roleArn, - "Cannot initialize AssumeRoleClientConfigFactory with null role ARN"); - this.timeout = PropertyUtil.propertyAsInt(properties, - AwsProperties.CLIENT_ASSUME_ROLE_TIMEOUT_SEC, AwsProperties.CLIENT_ASSUME_ROLE_TIMEOUT_SEC_DEFAULT); + Preconditions.checkNotNull( + roleArn, "Cannot initialize AssumeRoleClientConfigFactory with null role ARN"); + this.timeout = + PropertyUtil.propertyAsInt( + properties, + AwsProperties.CLIENT_ASSUME_ROLE_TIMEOUT_SEC, + AwsProperties.CLIENT_ASSUME_ROLE_TIMEOUT_SEC_DEFAULT); this.externalId = properties.get(AwsProperties.CLIENT_ASSUME_ROLE_EXTERNAL_ID); this.region = properties.get(AwsProperties.CLIENT_ASSUME_ROLE_REGION); - Preconditions.checkNotNull(region, "Cannot initialize AssumeRoleClientConfigFactory with null region"); + Preconditions.checkNotNull( + region, "Cannot initialize AssumeRoleClientConfigFactory with null region"); this.s3Endpoint = properties.get(AwsProperties.S3FILEIO_ENDPOINT); this.tags = toTags(properties); - this.s3UseArnRegionEnabled = PropertyUtil.propertyAsBoolean(properties, AwsProperties.S3_ACCESS_POINTS_PREFIX, - AwsProperties.S3_USE_ARN_REGION_ENABLED_DEFAULT); + this.s3UseArnRegionEnabled = + PropertyUtil.propertyAsBoolean( + properties, + AwsProperties.S3_ACCESS_POINTS_PREFIX, + AwsProperties.S3_USE_ARN_REGION_ENABLED_DEFAULT); this.dynamoDbEndpoint = properties.get(AwsProperties.DYNAMODB_ENDPOINT); - this.httpClientType = PropertyUtil.propertyAsString(properties, - AwsProperties.HTTP_CLIENT_TYPE, AwsProperties.HTTP_CLIENT_TYPE_DEFAULT); + this.httpClientType = + PropertyUtil.propertyAsString( + properties, AwsProperties.HTTP_CLIENT_TYPE, AwsProperties.HTTP_CLIENT_TYPE_DEFAULT); } protected T configure(T clientBuilder) { - AssumeRoleRequest request = AssumeRoleRequest.builder() - .roleArn(roleArn) - .roleSessionName(genSessionName()) - .durationSeconds(timeout) - .externalId(externalId) - .tags(tags) - .build(); + AssumeRoleRequest request = + AssumeRoleRequest.builder() + .roleArn(roleArn) + .roleSessionName(genSessionName()) + .durationSeconds(timeout) + 
.externalId(externalId) + .tags(tags) + .build(); clientBuilder.credentialsProvider( StsAssumeRoleCredentialsProvider.builder() @@ -149,7 +157,8 @@ private String genSessionName() { } private static Set toTags(Map properties) { - return PropertyUtil.propertiesWithPrefix(properties, AwsProperties.CLIENT_ASSUME_ROLE_TAGS_PREFIX) + return PropertyUtil.propertiesWithPrefix( + properties, AwsProperties.CLIENT_ASSUME_ROLE_TAGS_PREFIX) .entrySet().stream() .map(e -> Tag.builder().key(e.getKey()).value(e.getValue()).build()) .collect(Collectors.toSet()); diff --git a/aws/src/main/java/org/apache/iceberg/aws/AwsClientFactories.java b/aws/src/main/java/org/apache/iceberg/aws/AwsClientFactories.java index 8f2172ef8bae..fa154107558e 100644 --- a/aws/src/main/java/org/apache/iceberg/aws/AwsClientFactories.java +++ b/aws/src/main/java/org/apache/iceberg/aws/AwsClientFactories.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.aws; import java.net.URI; @@ -42,18 +41,19 @@ public class AwsClientFactories { - private static final DefaultAwsClientFactory AWS_CLIENT_FACTORY_DEFAULT = new DefaultAwsClientFactory(); + private static final DefaultAwsClientFactory AWS_CLIENT_FACTORY_DEFAULT = + new DefaultAwsClientFactory(); - private AwsClientFactories() { - } + private AwsClientFactories() {} public static AwsClientFactory defaultFactory() { return AWS_CLIENT_FACTORY_DEFAULT; } public static AwsClientFactory from(Map properties) { - String factoryImpl = PropertyUtil.propertyAsString( - properties, AwsProperties.CLIENT_FACTORY, DefaultAwsClientFactory.class.getName()); + String factoryImpl = + PropertyUtil.propertyAsString( + properties, AwsProperties.CLIENT_FACTORY, DefaultAwsClientFactory.class.getName()); return loadClientFactory(factoryImpl, properties); } @@ -66,8 +66,9 @@ private static AwsClientFactory loadClientFactory(String impl, Map configureEndpoint(builder, s3Endpoint)) .serviceConfiguration(s3Configuration(s3PathStyleAccess, s3UseArnRegionEnabled)) - .credentialsProvider(credentialsProvider(s3AccessKeyId, s3SecretAccessKey, s3SessionToken)) + .credentialsProvider( + credentialsProvider(s3AccessKeyId, s3SecretAccessKey, s3SessionToken)) .build(); } @@ -117,7 +120,9 @@ public GlueClient glue() { @Override public KmsClient kms() { - return KmsClient.builder().httpClientBuilder(configureHttpClientBuilder(httpClientType)).build(); + return KmsClient.builder() + .httpClientBuilder(configureHttpClientBuilder(httpClientType)) + .build(); } @Override @@ -135,23 +140,24 @@ public void initialize(Map properties) { this.s3AccessKeyId = properties.get(AwsProperties.S3FILEIO_ACCESS_KEY_ID); this.s3SecretAccessKey = properties.get(AwsProperties.S3FILEIO_SECRET_ACCESS_KEY); this.s3SessionToken = properties.get(AwsProperties.S3FILEIO_SESSION_TOKEN); - this.s3PathStyleAccess = PropertyUtil.propertyAsBoolean( - properties, - AwsProperties.S3FILEIO_PATH_STYLE_ACCESS, - AwsProperties.S3FILEIO_PATH_STYLE_ACCESS_DEFAULT - ); - this.s3UseArnRegionEnabled = PropertyUtil.propertyAsBoolean( - properties, - AwsProperties.S3_USE_ARN_REGION_ENABLED, - AwsProperties.S3_USE_ARN_REGION_ENABLED_DEFAULT - ); + this.s3PathStyleAccess = + PropertyUtil.propertyAsBoolean( + properties, + AwsProperties.S3FILEIO_PATH_STYLE_ACCESS, + AwsProperties.S3FILEIO_PATH_STYLE_ACCESS_DEFAULT); + this.s3UseArnRegionEnabled = + PropertyUtil.propertyAsBoolean( + properties, + AwsProperties.S3_USE_ARN_REGION_ENABLED, + AwsProperties.S3_USE_ARN_REGION_ENABLED_DEFAULT); 
ValidationException.check( (s3AccessKeyId == null) == (s3SecretAccessKey == null), "S3 client access key ID and secret access key must be set at the same time"); this.dynamoDbEndpoint = properties.get(AwsProperties.DYNAMODB_ENDPOINT); - this.httpClientType = PropertyUtil.propertyAsString(properties, - AwsProperties.HTTP_CLIENT_TYPE, AwsProperties.HTTP_CLIENT_TYPE_DEFAULT); + this.httpClientType = + PropertyUtil.propertyAsString( + properties, AwsProperties.HTTP_CLIENT_TYPE, AwsProperties.HTTP_CLIENT_TYPE_DEFAULT); } } @@ -176,7 +182,8 @@ public static void configureEndpoint(T builder, Str } } - public static S3Configuration s3Configuration(Boolean pathStyleAccess, Boolean s3UseArnRegionEnabled) { + public static S3Configuration s3Configuration( + Boolean pathStyleAccess, Boolean s3UseArnRegionEnabled) { return S3Configuration.builder() .pathStyleAccessEnabled(pathStyleAccess) .useArnRegionEnabled(s3UseArnRegionEnabled) diff --git a/aws/src/main/java/org/apache/iceberg/aws/AwsClientFactory.java b/aws/src/main/java/org/apache/iceberg/aws/AwsClientFactory.java index 1ed5e6f5a9c0..daef0d048090 100644 --- a/aws/src/main/java/org/apache/iceberg/aws/AwsClientFactory.java +++ b/aws/src/main/java/org/apache/iceberg/aws/AwsClientFactory.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.aws; import java.io.Serializable; @@ -27,39 +26,43 @@ import software.amazon.awssdk.services.s3.S3Client; /** - * Interface to customize AWS clients used by Iceberg. - * A custom factory must have a no-arg constructor, and use {@link #initialize(Map)} to initialize the factory. + * Interface to customize AWS clients used by Iceberg. A custom factory must have a no-arg + * constructor, and use {@link #initialize(Map)} to initialize the factory. */ public interface AwsClientFactory extends Serializable { /** * create a Amazon S3 client + * * @return s3 client */ S3Client s3(); /** * create a AWS Glue client + * * @return glue client */ GlueClient glue(); /** * Create a AWS KMS client + * * @return kms client */ KmsClient kms(); /** * Create a Amazon DynamoDB client + * * @return dynamoDB client */ DynamoDbClient dynamo(); /** * Initialize AWS client factory from catalog properties. + * * @param properties catalog properties */ void initialize(Map properties); - } diff --git a/aws/src/main/java/org/apache/iceberg/aws/AwsProperties.java b/aws/src/main/java/org/apache/iceberg/aws/AwsProperties.java index c38144e3efad..60791acf43eb 100644 --- a/aws/src/main/java/org/apache/iceberg/aws/AwsProperties.java +++ b/aws/src/main/java/org/apache/iceberg/aws/AwsProperties.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.aws; import java.io.Serializable; @@ -36,96 +35,103 @@ public class AwsProperties implements Serializable { /** - * Type of S3 Server side encryption used, default to {@link AwsProperties#S3FILEIO_SSE_TYPE_NONE}. - *

- * For more details: https://docs.aws.amazon.com/AmazonS3/latest/dev/serv-side-encryption.html + * Type of S3 Server side encryption used, default to {@link + * AwsProperties#S3FILEIO_SSE_TYPE_NONE}. + * + *

For more details: https://docs.aws.amazon.com/AmazonS3/latest/dev/serv-side-encryption.html */ public static final String S3FILEIO_SSE_TYPE = "s3.sse.type"; - /** - * No server side encryption. - */ + /** No server side encryption. */ public static final String S3FILEIO_SSE_TYPE_NONE = "none"; /** * S3 SSE-KMS encryption. - *

- * For more details: https://docs.aws.amazon.com/AmazonS3/latest/dev/UsingKMSEncryption.html + * + *

For more details: https://docs.aws.amazon.com/AmazonS3/latest/dev/UsingKMSEncryption.html */ public static final String S3FILEIO_SSE_TYPE_KMS = "kms"; /** * S3 SSE-S3 encryption. - *

- * For more details: https://docs.aws.amazon.com/AmazonS3/latest/dev/UsingServerSideEncryption.html + * + *

For more details: + * https://docs.aws.amazon.com/AmazonS3/latest/dev/UsingServerSideEncryption.html */ public static final String S3FILEIO_SSE_TYPE_S3 = "s3"; /** * S3 SSE-C encryption. - *

- * For more details: https://docs.aws.amazon.com/AmazonS3/latest/dev/ServerSideEncryptionCustomerKeys.html + * + *

For more details: + * https://docs.aws.amazon.com/AmazonS3/latest/dev/ServerSideEncryptionCustomerKeys.html */ public static final String S3FILEIO_SSE_TYPE_CUSTOM = "custom"; /** - * If S3 encryption type is SSE-KMS, input is a KMS Key ID or ARN. - * In case this property is not set, default key "aws/s3" is used. - * If encryption type is SSE-C, input is a custom base-64 AES256 symmetric key. + * If S3 encryption type is SSE-KMS, input is a KMS Key ID or ARN. In case this property is not + * set, default key "aws/s3" is used. If encryption type is SSE-C, input is a custom base-64 + * AES256 symmetric key. */ public static final String S3FILEIO_SSE_KEY = "s3.sse.key"; /** - * If S3 encryption type is SSE-C, input is the base-64 MD5 digest of the secret key. - * This MD5 must be explicitly passed in by the caller to ensure key integrity. + * If S3 encryption type is SSE-C, input is the base-64 MD5 digest of the secret key. This MD5 + * must be explicitly passed in by the caller to ensure key integrity. */ public static final String S3FILEIO_SSE_MD5 = "s3.sse.md5"; /** - * The ID of the Glue Data Catalog where the tables reside. - * If none is provided, Glue automatically uses the caller's AWS account ID by default. - *

- * For more details, see https://docs.aws.amazon.com/glue/latest/dg/aws-glue-api-catalog-databases.html + * The ID of the Glue Data Catalog where the tables reside. If none is provided, Glue + * automatically uses the caller's AWS account ID by default. + * + *

For more details, see + * https://docs.aws.amazon.com/glue/latest/dg/aws-glue-api-catalog-databases.html */ public static final String GLUE_CATALOG_ID = "glue.id"; /** - * The account ID used in a Glue resource ARN, e.g. arn:aws:glue:us-east-1:1000000000000:table/db1/table1 + * The account ID used in a Glue resource ARN, e.g. + * arn:aws:glue:us-east-1:1000000000000:table/db1/table1 */ public static final String GLUE_ACCOUNT_ID = "glue.account-id"; /** - * If Glue should skip archiving an old table version when creating a new version in a commit. - * By default Glue archives all old table versions after an UpdateTable call, - * but Glue has a default max number of archived table versions (can be increased). - * So for streaming use case with lots of commits, it is recommended to set this value to true. + * If Glue should skip archiving an old table version when creating a new version in a commit. By + * default Glue archives all old table versions after an UpdateTable call, but Glue has a default + * max number of archived table versions (can be increased). So for streaming use case with lots + * of commits, it is recommended to set this value to true. */ public static final String GLUE_CATALOG_SKIP_ARCHIVE = "glue.skip-archive"; + public static final boolean GLUE_CATALOG_SKIP_ARCHIVE_DEFAULT = false; /** - * If Glue should skip name validations - * It is recommended to stick to Glue best practice in - * https://docs.aws.amazon.com/athena/latest/ug/glue-best-practices.html to make sure operations are Hive compatible. - * This is only added for users that have existing conventions using non-standard characters. When database name - * and table name validation are skipped, there is no guarantee that downstream systems would all support the names. + * If Glue should skip name validations It is recommended to stick to Glue best practice in + * https://docs.aws.amazon.com/athena/latest/ug/glue-best-practices.html to make sure operations + * are Hive compatible. This is only added for users that have existing conventions using + * non-standard characters. When database name and table name validation are skipped, there is no + * guarantee that downstream systems would all support the names. */ public static final String GLUE_CATALOG_SKIP_NAME_VALIDATION = "glue.skip-name-validation"; + public static final boolean GLUE_CATALOG_SKIP_NAME_VALIDATION_DEFAULT = false; /** - * If set, GlueCatalog will use Lake Formation for access control. - * For more credential vending details, see: https://docs.aws.amazon.com/lake-formation/latest/dg/api-overview.html. - * If enabled, the {@link AwsClientFactory} implementation must be {@link LakeFormationAwsClientFactory} - * or any class that extends it. + * If set, GlueCatalog will use Lake Formation for access control. For more credential vending + * details, see: https://docs.aws.amazon.com/lake-formation/latest/dg/api-overview.html. If + * enabled, the {@link AwsClientFactory} implementation must be {@link + * LakeFormationAwsClientFactory} or any class that extends it. */ public static final String GLUE_LAKEFORMATION_ENABLED = "glue.lakeformation-enabled"; + public static final boolean GLUE_LAKEFORMATION_ENABLED_DEFAULT = false; /** * Configure an alternative endpoint of the Glue service for GlueCatalog to access. - *

- * This could be used to use GlueCatalog with any glue-compatible metastore service that has a different endpoint + * + *

This could be used to use GlueCatalog with any glue-compatible metastore service that has a + * different endpoint */ public static final String GLUE_CATALOG_ENDPOINT = "glue.endpoint"; @@ -133,203 +139,210 @@ public class AwsProperties implements Serializable { * Number of threads to use for uploading parts to S3 (shared pool across all output streams), * default to {@link Runtime#availableProcessors()} */ - public static final String S3FILEIO_MULTIPART_UPLOAD_THREADS = "s3.multipart.num-threads"; + public static final String S3FILEIO_MULTIPART_UPLOAD_THREADS = "s3.multipart.num-threads"; /** - * The size of a single part for multipart upload requests in bytes (default: 32MB). - * based on S3 requirement, the part size must be at least 5MB. - * Too ensure performance of the reader and writer, the part size must be less than 2GB. - *

- * For more details, see https://docs.aws.amazon.com/AmazonS3/latest/dev/qfacts.html + * The size of a single part for multipart upload requests in bytes (default: 32MB). Based on the S3 + * requirement, the part size must be at least 5MB. To ensure performance of the reader and + * writer, the part size must be less than 2GB. + * + *

For more details, see https://docs.aws.amazon.com/AmazonS3/latest/dev/qfacts.html */ public static final String S3FILEIO_MULTIPART_SIZE = "s3.multipart.part-size-bytes"; + public static final int S3FILEIO_MULTIPART_SIZE_DEFAULT = 32 * 1024 * 1024; public static final int S3FILEIO_MULTIPART_SIZE_MIN = 5 * 1024 * 1024; /** - * The threshold expressed as a factor times the multipart size at which to - * switch from uploading using a single put object request to uploading using multipart upload - * (default: 1.5). + * The threshold expressed as a factor times the multipart size at which to switch from uploading + * using a single put object request to uploading using multipart upload (default: 1.5). */ public static final String S3FILEIO_MULTIPART_THRESHOLD_FACTOR = "s3.multipart.threshold"; + public static final double S3FILEIO_MULTIPART_THRESHOLD_FACTOR_DEFAULT = 1.5; /** - * Location to put staging files for upload to S3, default to temp directory set in java.io.tmpdir. + * Location to put staging files for upload to S3, default to temp directory set in + * java.io.tmpdir. */ public static final String S3FILEIO_STAGING_DIRECTORY = "s3.staging-dir"; /** - * Used to configure canned access control list (ACL) for S3 client to use during write. - * If not set, ACL will not be set for requests. - *
- * The input must be one of {@link software.amazon.awssdk.services.s3.model.ObjectCannedACL}, - * such as 'public-read-write' - * For more details: https://docs.aws.amazon.com/AmazonS3/latest/dev/acl-overview.html + * Used to configure canned access control list (ACL) for S3 client to use during write. If not + * set, ACL will not be set for requests. + * + *
The input must be one of {@link software.amazon.awssdk.services.s3.model.ObjectCannedACL}, + * such as 'public-read-write' For more details: + * https://docs.aws.amazon.com/AmazonS3/latest/dev/acl-overview.html */ public static final String S3FILEIO_ACL = "s3.acl"; /** * Configure an alternative endpoint of the S3 service for S3FileIO to access. - *
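The AwsProperties constructor later in this file validates the 's3.acl' value through the SDK enum linked above; a stand-alone sketch of that check, using one of the documented canned ACL names:

    import software.amazon.awssdk.services.s3.model.ObjectCannedACL;

    class CannedAclSketch {
      public static void main(String[] args) {
        // Any name defined by ObjectCannedACL is accepted, e.g. 'public-read-write'.
        ObjectCannedACL acl = ObjectCannedACL.fromValue("public-read-write");
        // Unknown strings map to UNKNOWN_TO_SDK_VERSION, which the constructor rejects.
        boolean valid = acl != ObjectCannedACL.UNKNOWN_TO_SDK_VERSION;
        System.out.println(acl + " valid=" + valid);
      }
    }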
- * This could be used to use S3FileIO with any s3-compatible object storage service that has a different endpoint, - * or access a private S3 endpoint in a virtual private cloud. + * + *
This could be used to use S3FileIO with any s3-compatible object storage service that has a + * different endpoint, or access a private S3 endpoint in a virtual private cloud. */ public static final String S3FILEIO_ENDPOINT = "s3.endpoint"; /** - * If set {@code true}, requests to S3FileIO will use Path-Style, otherwise, Virtual Hosted-Style will be used. - *
- * For more details: https://docs.aws.amazon.com/AmazonS3/latest/userguide/VirtualHosting.html + * If set {@code true}, requests to S3FileIO will use Path-Style, otherwise, Virtual Hosted-Style + * will be used. + * + *
For more details: https://docs.aws.amazon.com/AmazonS3/latest/userguide/VirtualHosting.html */ public static final String S3FILEIO_PATH_STYLE_ACCESS = "s3.path-style-access"; + public static final boolean S3FILEIO_PATH_STYLE_ACCESS_DEFAULT = false; /** * Configure the static access key ID used to access S3FileIO. - *
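A hedged sketch of pointing S3FileIO at an S3-compatible object store with the two keys above; the endpoint URL is an assumption, and the map stands in for whatever catalog configuration ends up initializing the FileIO and client factory:

    import java.util.HashMap;
    import java.util.Map;
    import org.apache.iceberg.aws.AwsProperties;

    class S3CompatibleEndpointSketch {
      public static void main(String[] args) {
        Map<String, String> catalogProps = new HashMap<>();
        // Placeholder endpoint of a non-AWS, S3-compatible service.
        catalogProps.put(AwsProperties.S3FILEIO_ENDPOINT, "http://object-store.internal:9000");
        // Many S3-compatible services only support path-style requests.
        catalogProps.put(AwsProperties.S3FILEIO_PATH_STYLE_ACCESS, "true");
        System.out.println(catalogProps);
      }
    }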
- * When set, the default client factory will use the basic or session credentials provided instead of - * reading the default credential chain to create S3 access credentials. - * If {@link #S3FILEIO_SESSION_TOKEN} is set, session credential is used, otherwise basic credential is used. + * + *
When set, the default client factory will use the basic or session credentials provided + * instead of reading the default credential chain to create S3 access credentials. If {@link + * #S3FILEIO_SESSION_TOKEN} is set, session credential is used, otherwise basic credential is + * used. */ public static final String S3FILEIO_ACCESS_KEY_ID = "s3.access-key-id"; /** * Configure the static secret access key used to access S3FileIO. - *
- * When set, the default client factory will use the basic or session credentials provided instead of - * reading the default credential chain to create S3 access credentials. - * If {@link #S3FILEIO_SESSION_TOKEN} is set, session credential is used, otherwise basic credential is used. + * + *
When set, the default client factory will use the basic or session credentials provided + * instead of reading the default credential chain to create S3 access credentials. If {@link + * #S3FILEIO_SESSION_TOKEN} is set, session credential is used, otherwise basic credential is + * used. */ public static final String S3FILEIO_SECRET_ACCESS_KEY = "s3.secret-access-key"; /** * Configure the static session token used to access S3FileIO. - *
- * When set, the default client factory will use the session credentials provided instead of + * + *
When set, the default client factory will use the session credentials provided instead of * reading the default credential chain to create S3 access credentials. */ public static final String S3FILEIO_SESSION_TOKEN = "s3.session-token"; /** - * Enable to make S3FileIO, to make cross-region call to the region specified in the ARN of an access point. - *
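A sketch of the three static-credential keys described above, with placeholder values; when the session token is present the default client factory builds session credentials, otherwise basic credentials:

    import java.util.HashMap;
    import java.util.Map;
    import org.apache.iceberg.aws.AwsProperties;

    class StaticCredentialsSketch {
      public static void main(String[] args) {
        Map<String, String> catalogProps = new HashMap<>();
        catalogProps.put(AwsProperties.S3FILEIO_ACCESS_KEY_ID, "AKIA-placeholder");
        catalogProps.put(AwsProperties.S3FILEIO_SECRET_ACCESS_KEY, "secret-placeholder");
        // Optional; its presence switches the factory from basic to session credentials.
        catalogProps.put(AwsProperties.S3FILEIO_SESSION_TOKEN, "token-placeholder");
        System.out.println(catalogProps.keySet());
      }
    }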
- * By default, attempting to use an access point in a different region will throw an exception. + * Enable to make S3FileIO, to make cross-region call to the region specified in the ARN of an + * access point. + * + *
By default, attempting to use an access point in a different region will throw an exception. * When enabled, this property allows using access points in other regions. - *
- * For more details see: https://sdk.amazonaws.com/java/api/latest/software/amazon/awssdk/services/s3/S3Configuration.html#useArnRegionEnabled-- + * + *
For more details see: + * https://sdk.amazonaws.com/java/api/latest/software/amazon/awssdk/services/s3/S3Configuration.html#useArnRegionEnabled-- */ public static final String S3_USE_ARN_REGION_ENABLED = "s3.use-arn-region-enabled"; + public static final boolean S3_USE_ARN_REGION_ENABLED_DEFAULT = false; - /** - * Enables eTag checks for S3 PUT and MULTIPART upload requests. - */ + /** Enables eTag checks for S3 PUT and MULTIPART upload requests. */ public static final String S3_CHECKSUM_ENABLED = "s3.checksum-enabled"; + public static final boolean S3_CHECKSUM_ENABLED_DEFAULT = false; - /** - * Configure the batch size used when deleting multiple files from a given S3 bucket - */ + /** Configure the batch size used when deleting multiple files from a given S3 bucket */ public static final String S3FILEIO_DELETE_BATCH_SIZE = "s3.delete.batch-size"; /** * Default batch size used when deleting files. - *
- * Refer to https://github.com/apache/hadoop/commit/56dee667707926f3796c7757be1a133a362f05c9 + * + *
Refer to https://github.com/apache/hadoop/commit/56dee667707926f3796c7757be1a133a362f05c9 * for more details on why this value was chosen. */ public static final int S3FILEIO_DELETE_BATCH_SIZE_DEFAULT = 250; /** - * Max possible batch size for deletion. Currently, a max of 1000 keys can be deleted in one batch. - * https://docs.aws.amazon.com/AmazonS3/latest/API/API_DeleteObjects.html + * Max possible batch size for deletion. Currently, a max of 1000 keys can be deleted in one + * batch. https://docs.aws.amazon.com/AmazonS3/latest/API/API_DeleteObjects.html */ public static final int S3FILEIO_DELETE_BATCH_SIZE_MAX = 1000; - /** - * Configure an alternative endpoint of the DynamoDB service to access. - */ + /** Configure an alternative endpoint of the DynamoDB service to access. */ public static final String DYNAMODB_ENDPOINT = "dynamodb.endpoint"; - /** - * DynamoDB table name for {@link DynamoDbCatalog} - */ + /** DynamoDB table name for {@link DynamoDbCatalog} */ public static final String DYNAMODB_TABLE_NAME = "dynamodb.table-name"; + public static final String DYNAMODB_TABLE_NAME_DEFAULT = "iceberg"; /** - * The implementation class of {@link AwsClientFactory} to customize AWS client configurations. - * If set, all AWS clients will be initialized by the specified factory. - * If not set, {@link AwsClientFactories#defaultFactory()} is used as default factory. + * The implementation class of {@link AwsClientFactory} to customize AWS client configurations. If + * set, all AWS clients will be initialized by the specified factory. If not set, {@link + * AwsClientFactories#defaultFactory()} is used as default factory. */ public static final String CLIENT_FACTORY = "client.factory"; /** - * Used by {@link AssumeRoleAwsClientFactory}. - * If set, all AWS clients will assume a role of the given ARN, instead of using the default credential chain. + * Used by {@link AssumeRoleAwsClientFactory}. If set, all AWS clients will assume a role of the + * given ARN, instead of using the default credential chain. */ public static final String CLIENT_ASSUME_ROLE_ARN = "client.assume-role.arn"; /** - * Used by {@link AssumeRoleAwsClientFactory} to pass a list of sessions. - * Each session tag consists of a key name and an associated value. + * Used by {@link AssumeRoleAwsClientFactory} to pass a list of sessions. Each session tag + * consists of a key name and an associated value. */ public static final String CLIENT_ASSUME_ROLE_TAGS_PREFIX = "client.assume-role.tags."; /** - * Used by {@link AssumeRoleAwsClientFactory}. - * The timeout of the assume role session in seconds, default to 1 hour. - * At the end of the timeout, a new set of role session credentials will be fetched through a STS client. + * Used by {@link AssumeRoleAwsClientFactory}. The timeout of the assume role session in seconds, + * default to 1 hour. At the end of the timeout, a new set of role session credentials will be + * fetched through a STS client. */ public static final String CLIENT_ASSUME_ROLE_TIMEOUT_SEC = "client.assume-role.timeout-sec"; + public static final int CLIENT_ASSUME_ROLE_TIMEOUT_SEC_DEFAULT = 3600; /** - * Used by {@link AssumeRoleAwsClientFactory}. - * Optional external ID used to assume an IAM role. - *
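A sketch of combining the DynamoDB table name with a custom client factory and an assumed role; the role ARN is a placeholder, and the import of AssumeRoleAwsClientFactory assumes it lives in the org.apache.iceberg.aws package, as the unqualified link above suggests:

    import java.util.HashMap;
    import java.util.Map;
    import org.apache.iceberg.aws.AssumeRoleAwsClientFactory;
    import org.apache.iceberg.aws.AwsProperties;

    class ClientFactorySketch {
      public static void main(String[] args) {
        Map<String, String> catalogProps = new HashMap<>();
        // Use a dedicated DynamoDB table instead of the default "iceberg" table.
        catalogProps.put(AwsProperties.DYNAMODB_TABLE_NAME, "iceberg_catalog");
        // Have all AWS clients created through the assume-role factory.
        catalogProps.put(AwsProperties.CLIENT_FACTORY, AssumeRoleAwsClientFactory.class.getName());
        // Placeholder role ARN.
        catalogProps.put(AwsProperties.CLIENT_ASSUME_ROLE_ARN, "arn:aws:iam::123456789012:role/iceberg");
        // Refresh role session credentials every 15 minutes instead of the 3600-second default.
        catalogProps.put(AwsProperties.CLIENT_ASSUME_ROLE_TIMEOUT_SEC, "900");
        System.out.println(catalogProps);
      }
    }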
- * For more details, see https://docs.aws.amazon.com/IAM/latest/UserGuide/id_roles_create_for-user_externalid.html + * Used by {@link AssumeRoleAwsClientFactory}. Optional external ID used to assume an IAM role. + * + *
For more details, see + * https://docs.aws.amazon.com/IAM/latest/UserGuide/id_roles_create_for-user_externalid.html */ public static final String CLIENT_ASSUME_ROLE_EXTERNAL_ID = "client.assume-role.external-id"; /** - * Used by {@link AssumeRoleAwsClientFactory}. - * If set, all AWS clients except STS client will use the given region instead of the default region chain. - *
- * The value must be one of {@link software.amazon.awssdk.regions.Region}, such as 'us-east-1'. + * Used by {@link AssumeRoleAwsClientFactory}. If set, all AWS clients except STS client will use + * the given region instead of the default region chain. + * + *
The value must be one of {@link software.amazon.awssdk.regions.Region}, such as 'us-east-1'. * For more details, see https://docs.aws.amazon.com/general/latest/gr/rande.html */ public static final String CLIENT_ASSUME_ROLE_REGION = "client.assume-role.region"; /** - * The type of {@link software.amazon.awssdk.http.SdkHttpClient} implementation used by {@link AwsClientFactory} - * If set, all AWS clients will use this specified HTTP client. - * If not set, {@link #HTTP_CLIENT_TYPE_DEFAULT} will be used. - * For specific types supported, see HTTP_CLIENT_TYPE_* defined below. + * The type of {@link software.amazon.awssdk.http.SdkHttpClient} implementation used by {@link + * AwsClientFactory} If set, all AWS clients will use this specified HTTP client. If not set, + * {@link #HTTP_CLIENT_TYPE_DEFAULT} will be used. For specific types supported, see + * HTTP_CLIENT_TYPE_* defined below. */ public static final String HTTP_CLIENT_TYPE = "http-client.type"; /** - * If this is set under {@link #HTTP_CLIENT_TYPE}, - * {@link software.amazon.awssdk.http.urlconnection.UrlConnectionHttpClient} - * will be used as the HTTP Client in {@link AwsClientFactory} + * If this is set under {@link #HTTP_CLIENT_TYPE}, {@link + * software.amazon.awssdk.http.urlconnection.UrlConnectionHttpClient} will be used as the HTTP + * Client in {@link AwsClientFactory} */ public static final String HTTP_CLIENT_TYPE_URLCONNECTION = "urlconnection"; /** - * If this is set under {@link #HTTP_CLIENT_TYPE}, {@link software.amazon.awssdk.http.apache.ApacheHttpClient} - * will be used as the HTTP Client in {@link AwsClientFactory} + * If this is set under {@link #HTTP_CLIENT_TYPE}, {@link + * software.amazon.awssdk.http.apache.ApacheHttpClient} will be used as the HTTP Client in {@link + * AwsClientFactory} */ public static final String HTTP_CLIENT_TYPE_APACHE = "apache"; + public static final String HTTP_CLIENT_TYPE_DEFAULT = HTTP_CLIENT_TYPE_URLCONNECTION; /** * Used by {@link S3FileIO} to tag objects when writing. To set, we can pass a catalog property. - *
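Continuing the assume-role setup, the region, session-tag, and HTTP client keys above are plain string properties as well; the tag key and value here are illustrative:

    import java.util.HashMap;
    import java.util.Map;
    import org.apache.iceberg.aws.AwsProperties;

    class HttpClientAndRegionSketch {
      public static void main(String[] args) {
        Map<String, String> catalogProps = new HashMap<>();
        // Region every non-STS client should use while the role is assumed.
        catalogProps.put(AwsProperties.CLIENT_ASSUME_ROLE_REGION, "us-east-1");
        // A session tag, using the documented prefix convention.
        catalogProps.put(AwsProperties.CLIENT_ASSUME_ROLE_TAGS_PREFIX + "team", "data-platform");
        // Switch from the default UrlConnection HTTP client to the Apache client.
        catalogProps.put(AwsProperties.HTTP_CLIENT_TYPE, AwsProperties.HTTP_CLIENT_TYPE_APACHE);
        System.out.println(catalogProps);
      }
    }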
- * For more details, see https://docs.aws.amazon.com/AmazonS3/latest/userguide/object-tagging.html - *
- * Example: s3.write.tags.my_key=my_val + * + *
For more details, see + * https://docs.aws.amazon.com/AmazonS3/latest/userguide/object-tagging.html + * + *
Example: s3.write.tags.my_key=my_val */ public static final String S3_WRITE_TAGS_PREFIX = "s3.write.tags."; @@ -338,10 +351,11 @@ public class AwsProperties implements Serializable { * tagged with the configured key-value pairs before deletion. This is considered a soft-delete, * because users are able to configure tag-based object lifecycle policy at bucket level to * transition objects to different tiers. - *
- * For more details, see https://docs.aws.amazon.com/AmazonS3/latest/userguide/object-lifecycle-mgmt.html - *
- * Example: s3.delete.tags.my_key=my_val + * + *
For more details, see + * https://docs.aws.amazon.com/AmazonS3/latest/userguide/object-lifecycle-mgmt.html + * + *
Example: s3.delete.tags.my_key=my_val */ public static final String S3_DELETE_TAGS_PREFIX = "s3.delete.tags."; @@ -352,38 +366,38 @@ public class AwsProperties implements Serializable { public static final String S3FILEIO_DELETE_THREADS = "s3.delete.num-threads"; /** - * Determines if {@link S3FileIO} deletes the object when io.delete() is called, default to true. Once - * disabled, users are expected to set tags through {@link #S3_DELETE_TAGS_PREFIX} and manage + * Determines if {@link S3FileIO} deletes the object when io.delete() is called, default to true. + * Once disabled, users are expected to set tags through {@link #S3_DELETE_TAGS_PREFIX} and manage * deleted files through S3 lifecycle policy. */ public static final String S3_DELETE_ENABLED = "s3.delete-enabled"; + public static final boolean S3_DELETE_ENABLED_DEFAULT = true; /** - * Used by {@link S3FileIO}, prefix used for bucket access point configuration. - * To set, we can pass a catalog property. - *
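The toTags helper near the end of this class converts every property under these prefixes into an S3 Tag; a stand-alone illustration with the same shape (the helper name and property values here are made up, the calls mirror the ones in this file):

    import java.util.Map;
    import java.util.Set;
    import java.util.stream.Collectors;
    import org.apache.iceberg.util.PropertyUtil;
    import software.amazon.awssdk.services.s3.model.Tag;

    class ObjectTagsSketch {
      // Same shape as AwsProperties#toTags.
      static Set<Tag> tagsFor(Map<String, String> properties, String prefix) {
        return PropertyUtil.propertiesWithPrefix(properties, prefix).entrySet().stream()
            .map(e -> Tag.builder().key(e.getKey()).value(e.getValue()).build())
            .collect(Collectors.toSet());
      }

      public static void main(String[] args) {
        Map<String, String> catalogProps =
            Map.of("s3.write.tags.owner", "analytics", "s3.delete.tags.deleted", "true");
        System.out.println(tagsFor(catalogProps, "s3.write.tags."));   // the 'owner' tag
        System.out.println(tagsFor(catalogProps, "s3.delete.tags."));  // the 'deleted' tag
      }
    }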
- * For more details, see https://aws.amazon.com/s3/features/access-points/ - *
- * Example: s3.access-points.my-bucket=access-point + * Used by {@link S3FileIO}, prefix used for bucket access point configuration. To set, we can + * pass a catalog property. + * + *
For more details, see https://aws.amazon.com/s3/features/access-points/ + * + *
Example: s3.access-points.my-bucket=access-point */ public static final String S3_ACCESS_POINTS_PREFIX = "s3.access-points."; /** * @deprecated will be removed at 0.15.0, please use {@link #S3_CHECKSUM_ENABLED_DEFAULT} instead */ - @Deprecated - public static final boolean CLIENT_ENABLE_ETAG_CHECK_DEFAULT = false; + @Deprecated public static final boolean CLIENT_ENABLE_ETAG_CHECK_DEFAULT = false; /** - * Used by {@link LakeFormationAwsClientFactory}. - * The table name used as part of lake formation credentials request. + * Used by {@link LakeFormationAwsClientFactory}. The table name used as part of lake formation + * credentials request. */ public static final String LAKE_FORMATION_TABLE_NAME = "lakeformation.table-name"; /** - * Used by {@link LakeFormationAwsClientFactory}. - * The database name used as part of lake formation credentials request. + * Used by {@link LakeFormationAwsClientFactory}. The database name used as part of lake formation + * credentials request. */ public static final String LAKE_FORMATION_DB_NAME = "lakeformation.db-name"; @@ -437,70 +451,96 @@ public AwsProperties() { } public AwsProperties(Map properties) { - this.s3FileIoSseType = properties.getOrDefault( - AwsProperties.S3FILEIO_SSE_TYPE, AwsProperties.S3FILEIO_SSE_TYPE_NONE); + this.s3FileIoSseType = + properties.getOrDefault( + AwsProperties.S3FILEIO_SSE_TYPE, AwsProperties.S3FILEIO_SSE_TYPE_NONE); this.s3FileIoSseKey = properties.get(AwsProperties.S3FILEIO_SSE_KEY); this.s3FileIoSseMd5 = properties.get(AwsProperties.S3FILEIO_SSE_MD5); if (AwsProperties.S3FILEIO_SSE_TYPE_CUSTOM.equals(s3FileIoSseType)) { - Preconditions.checkNotNull(s3FileIoSseKey, "Cannot initialize SSE-C S3FileIO with null encryption key"); - Preconditions.checkNotNull(s3FileIoSseMd5, "Cannot initialize SSE-C S3FileIO with null encryption key MD5"); + Preconditions.checkNotNull( + s3FileIoSseKey, "Cannot initialize SSE-C S3FileIO with null encryption key"); + Preconditions.checkNotNull( + s3FileIoSseMd5, "Cannot initialize SSE-C S3FileIO with null encryption key MD5"); } this.glueCatalogId = properties.get(GLUE_CATALOG_ID); - this.glueCatalogSkipArchive = PropertyUtil.propertyAsBoolean(properties, - AwsProperties.GLUE_CATALOG_SKIP_ARCHIVE, AwsProperties.GLUE_CATALOG_SKIP_ARCHIVE_DEFAULT); - this.glueCatalogSkipNameValidation = PropertyUtil.propertyAsBoolean(properties, - AwsProperties.GLUE_CATALOG_SKIP_NAME_VALIDATION, AwsProperties.GLUE_CATALOG_SKIP_NAME_VALIDATION_DEFAULT); - this.glueLakeFormationEnabled = PropertyUtil.propertyAsBoolean(properties, - GLUE_LAKEFORMATION_ENABLED, - GLUE_LAKEFORMATION_ENABLED_DEFAULT); - - this.s3FileIoMultipartUploadThreads = PropertyUtil.propertyAsInt(properties, S3FILEIO_MULTIPART_UPLOAD_THREADS, - Runtime.getRuntime().availableProcessors()); + this.glueCatalogSkipArchive = + PropertyUtil.propertyAsBoolean( + properties, + AwsProperties.GLUE_CATALOG_SKIP_ARCHIVE, + AwsProperties.GLUE_CATALOG_SKIP_ARCHIVE_DEFAULT); + this.glueCatalogSkipNameValidation = + PropertyUtil.propertyAsBoolean( + properties, + AwsProperties.GLUE_CATALOG_SKIP_NAME_VALIDATION, + AwsProperties.GLUE_CATALOG_SKIP_NAME_VALIDATION_DEFAULT); + this.glueLakeFormationEnabled = + PropertyUtil.propertyAsBoolean( + properties, GLUE_LAKEFORMATION_ENABLED, GLUE_LAKEFORMATION_ENABLED_DEFAULT); + + this.s3FileIoMultipartUploadThreads = + PropertyUtil.propertyAsInt( + properties, + S3FILEIO_MULTIPART_UPLOAD_THREADS, + Runtime.getRuntime().availableProcessors()); try { - this.s3FileIoMultiPartSize = PropertyUtil.propertyAsInt(properties, 
S3FILEIO_MULTIPART_SIZE, - S3FILEIO_MULTIPART_SIZE_DEFAULT); + this.s3FileIoMultiPartSize = + PropertyUtil.propertyAsInt( + properties, S3FILEIO_MULTIPART_SIZE, S3FILEIO_MULTIPART_SIZE_DEFAULT); } catch (NumberFormatException e) { - throw new IllegalArgumentException("Input malformed or exceeded maximum multipart upload size 5GB: %s" + - properties.get(S3FILEIO_MULTIPART_SIZE)); + throw new IllegalArgumentException( + "Input malformed or exceeded maximum multipart upload size 5GB: %s" + + properties.get(S3FILEIO_MULTIPART_SIZE)); } - this.s3FileIoMultipartThresholdFactor = PropertyUtil.propertyAsDouble(properties, - S3FILEIO_MULTIPART_THRESHOLD_FACTOR, S3FILEIO_MULTIPART_THRESHOLD_FACTOR_DEFAULT); + this.s3FileIoMultipartThresholdFactor = + PropertyUtil.propertyAsDouble( + properties, + S3FILEIO_MULTIPART_THRESHOLD_FACTOR, + S3FILEIO_MULTIPART_THRESHOLD_FACTOR_DEFAULT); - Preconditions.checkArgument(s3FileIoMultipartThresholdFactor >= 1.0, - "Multipart threshold factor must be >= to 1.0"); + Preconditions.checkArgument( + s3FileIoMultipartThresholdFactor >= 1.0, "Multipart threshold factor must be >= to 1.0"); - Preconditions.checkArgument(s3FileIoMultiPartSize >= S3FILEIO_MULTIPART_SIZE_MIN, + Preconditions.checkArgument( + s3FileIoMultiPartSize >= S3FILEIO_MULTIPART_SIZE_MIN, "Minimum multipart upload object size must be larger than 5 MB."); - this.s3fileIoStagingDirectory = PropertyUtil.propertyAsString(properties, S3FILEIO_STAGING_DIRECTORY, - System.getProperty("java.io.tmpdir")); + this.s3fileIoStagingDirectory = + PropertyUtil.propertyAsString( + properties, S3FILEIO_STAGING_DIRECTORY, System.getProperty("java.io.tmpdir")); String aclType = properties.get(S3FILEIO_ACL); this.s3FileIoAcl = ObjectCannedACL.fromValue(aclType); - Preconditions.checkArgument(s3FileIoAcl == null || !s3FileIoAcl.equals(ObjectCannedACL.UNKNOWN_TO_SDK_VERSION), + Preconditions.checkArgument( + s3FileIoAcl == null || !s3FileIoAcl.equals(ObjectCannedACL.UNKNOWN_TO_SDK_VERSION), "Cannot support S3 CannedACL " + aclType); - this.isS3ChecksumEnabled = PropertyUtil.propertyAsBoolean(properties, S3_CHECKSUM_ENABLED, - S3_CHECKSUM_ENABLED_DEFAULT); + this.isS3ChecksumEnabled = + PropertyUtil.propertyAsBoolean( + properties, S3_CHECKSUM_ENABLED, S3_CHECKSUM_ENABLED_DEFAULT); - this.s3FileIoDeleteBatchSize = PropertyUtil.propertyAsInt(properties, S3FILEIO_DELETE_BATCH_SIZE, - S3FILEIO_DELETE_BATCH_SIZE_DEFAULT); - Preconditions.checkArgument(s3FileIoDeleteBatchSize > 0 && - s3FileIoDeleteBatchSize <= S3FILEIO_DELETE_BATCH_SIZE_MAX, - String.format("Deletion batch size must be between 1 and %s", S3FILEIO_DELETE_BATCH_SIZE_MAX)); + this.s3FileIoDeleteBatchSize = + PropertyUtil.propertyAsInt( + properties, S3FILEIO_DELETE_BATCH_SIZE, S3FILEIO_DELETE_BATCH_SIZE_DEFAULT); + Preconditions.checkArgument( + s3FileIoDeleteBatchSize > 0 && s3FileIoDeleteBatchSize <= S3FILEIO_DELETE_BATCH_SIZE_MAX, + String.format( + "Deletion batch size must be between 1 and %s", S3FILEIO_DELETE_BATCH_SIZE_MAX)); this.s3WriteTags = toTags(properties, S3_WRITE_TAGS_PREFIX); this.s3DeleteTags = toTags(properties, S3_DELETE_TAGS_PREFIX); - this.s3FileIoDeleteThreads = PropertyUtil.propertyAsInt(properties, S3FILEIO_DELETE_THREADS, - Runtime.getRuntime().availableProcessors()); - this.isS3DeleteEnabled = PropertyUtil.propertyAsBoolean(properties, S3_DELETE_ENABLED, S3_DELETE_ENABLED_DEFAULT); - this.s3BucketToAccessPointMapping = PropertyUtil.propertiesWithPrefix(properties, S3_ACCESS_POINTS_PREFIX); + this.s3FileIoDeleteThreads = + 
PropertyUtil.propertyAsInt( + properties, S3FILEIO_DELETE_THREADS, Runtime.getRuntime().availableProcessors()); + this.isS3DeleteEnabled = + PropertyUtil.propertyAsBoolean(properties, S3_DELETE_ENABLED, S3_DELETE_ENABLED_DEFAULT); + this.s3BucketToAccessPointMapping = + PropertyUtil.propertiesWithPrefix(properties, S3_ACCESS_POINTS_PREFIX); - this.dynamoDbTableName = PropertyUtil.propertyAsString(properties, DYNAMODB_TABLE_NAME, - DYNAMODB_TABLE_NAME_DEFAULT); + this.dynamoDbTableName = + PropertyUtil.propertyAsString(properties, DYNAMODB_TABLE_NAME, DYNAMODB_TABLE_NAME_DEFAULT); } public String s3FileIoSseType() { @@ -546,6 +586,7 @@ public void setGlueCatalogId(String id) { public boolean glueCatalogSkipArchive() { return glueCatalogSkipArchive; } + public void setGlueCatalogSkipArchive(boolean skipArchive) { this.glueCatalogSkipArchive = skipArchive; } @@ -647,8 +688,7 @@ public void setS3DeleteEnabled(boolean s3DeleteEnabled) { } private Set toTags(Map properties, String prefix) { - return PropertyUtil.propertiesWithPrefix(properties, prefix) - .entrySet().stream() + return PropertyUtil.propertiesWithPrefix(properties, prefix).entrySet().stream() .map(e -> Tag.builder().key(e.getKey()).value(e.getValue()).build()) .collect(Collectors.toSet()); } diff --git a/aws/src/main/java/org/apache/iceberg/aws/dynamodb/DynamoDbCatalog.java b/aws/src/main/java/org/apache/iceberg/aws/dynamodb/DynamoDbCatalog.java index cf9a21ac57d9..accaad59dcd1 100644 --- a/aws/src/main/java/org/apache/iceberg/aws/dynamodb/DynamoDbCatalog.java +++ b/aws/src/main/java/org/apache/iceberg/aws/dynamodb/DynamoDbCatalog.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.aws.dynamodb; import java.io.Closeable; @@ -84,10 +83,9 @@ import software.amazon.awssdk.services.dynamodb.model.TransactWriteItemsRequest; import software.amazon.awssdk.services.dynamodb.model.UpdateItemRequest; -/** - * DynamoDB implementation of Iceberg catalog - */ -public class DynamoDbCatalog extends BaseMetastoreCatalog implements Closeable, SupportsNamespaces, Configurable { +/** DynamoDB implementation of Iceberg catalog */ +public class DynamoDbCatalog extends BaseMetastoreCatalog + implements Closeable, SupportsNamespaces, Configurable { private static final Logger LOG = LoggerFactory.getLogger(DynamoDbCatalog.class); private static final int CATALOG_TABLE_CREATION_WAIT_ATTEMPTS_MAX = 5; @@ -113,8 +111,7 @@ public class DynamoDbCatalog extends BaseMetastoreCatalog implements Closeable, private FileIO fileIO; private CloseableGroup closeableGroup; - public DynamoDbCatalog() { - } + public DynamoDbCatalog() {} @Override public void initialize(String name, Map properties) { @@ -127,8 +124,10 @@ public void initialize(String name, Map properties) { } @VisibleForTesting - void initialize(String name, String path, AwsProperties properties, DynamoDbClient client, FileIO io) { - Preconditions.checkArgument(path != null && path.length() > 0, + void initialize( + String name, String path, AwsProperties properties, DynamoDbClient client, FileIO io) { + Preconditions.checkArgument( + path != null && path.length() > 0, "Cannot initialize DynamoDbCatalog because warehousePath must not be null or empty"); this.catalogName = name; @@ -159,22 +158,27 @@ protected TableOperations newTableOps(TableIdentifier tableIdentifier) { @Override protected String defaultWarehouseLocation(TableIdentifier tableIdentifier) { validateTableIdentifier(tableIdentifier); - GetItemResponse response = 
dynamo.getItem(GetItemRequest.builder() - .tableName(awsProperties.dynamoDbTableName()) - .consistentRead(true) - .key(namespacePrimaryKey(tableIdentifier.namespace())) - .build()); + GetItemResponse response = + dynamo.getItem( + GetItemRequest.builder() + .tableName(awsProperties.dynamoDbTableName()) + .consistentRead(true) + .key(namespacePrimaryKey(tableIdentifier.namespace())) + .build()); if (!response.hasItem()) { - throw new NoSuchNamespaceException("Cannot find default warehouse location: namespace %s does not exist", + throw new NoSuchNamespaceException( + "Cannot find default warehouse location: namespace %s does not exist", tableIdentifier.namespace()); } String defaultLocationCol = toPropertyCol(PROPERTY_DEFAULT_LOCATION); if (response.item().containsKey(defaultLocationCol)) { - return String.format("%s/%s", response.item().get(defaultLocationCol), tableIdentifier.name()); + return String.format( + "%s/%s", response.item().get(defaultLocationCol), tableIdentifier.name()); } else { - return String.format("%s/%s.db/%s", warehousePath, tableIdentifier.namespace(), tableIdentifier.name()); + return String.format( + "%s/%s.db/%s", warehousePath, tableIdentifier.namespace(), tableIdentifier.name()); } } @@ -183,14 +187,16 @@ public void createNamespace(Namespace namespace, Map metadata) { validateNamespace(namespace); Map values = namespacePrimaryKey(namespace); setNewCatalogEntryMetadata(values); - metadata.forEach((key, value) -> values.put(toPropertyCol(key), AttributeValue.builder().s(value).build())); + metadata.forEach( + (key, value) -> values.put(toPropertyCol(key), AttributeValue.builder().s(value).build())); try { - dynamo.putItem(PutItemRequest.builder() - .tableName(awsProperties.dynamoDbTableName()) - .conditionExpression("attribute_not_exists(" + DynamoDbCatalog.COL_VERSION + ")") - .item(values) - .build()); + dynamo.putItem( + PutItemRequest.builder() + .tableName(awsProperties.dynamoDbTableName()) + .conditionExpression("attribute_not_exists(" + DynamoDbCatalog.COL_VERSION + ")") + .item(values) + .build()); } catch (ConditionalCheckFailedException e) { throw new AlreadyExistsException("Cannot create namespace %s: already exists", namespace); } @@ -203,20 +209,23 @@ public List listNamespaces(Namespace namespace) throws NoSuchNamespac Map lastEvaluatedKey = null; String condition = COL_IDENTIFIER + " = :identifier"; Map conditionValues = Maps.newHashMap(); - conditionValues.put(":identifier", AttributeValue.builder().s(COL_IDENTIFIER_NAMESPACE).build()); + conditionValues.put( + ":identifier", AttributeValue.builder().s(COL_IDENTIFIER_NAMESPACE).build()); if (!namespace.isEmpty()) { condition += " AND " + "begins_with(" + COL_NAMESPACE + ",:ns)"; conditionValues.put(":ns", AttributeValue.builder().s(namespace.toString()).build()); } do { - QueryResponse response = dynamo.query(QueryRequest.builder() - .tableName(awsProperties.dynamoDbTableName()) - .consistentRead(true) - .keyConditionExpression(condition) - .expressionAttributeValues(conditionValues) - .exclusiveStartKey(lastEvaluatedKey) - .build()); + QueryResponse response = + dynamo.query( + QueryRequest.builder() + .tableName(awsProperties.dynamoDbTableName()) + .consistentRead(true) + .keyConditionExpression(condition) + .expressionAttributeValues(conditionValues) + .exclusiveStartKey(lastEvaluatedKey) + .build()); if (response.hasItems()) { for (Map item : response.items()) { @@ -232,13 +241,16 @@ public List listNamespaces(Namespace namespace) throws NoSuchNamespac } @Override - public Map 
loadNamespaceMetadata(Namespace namespace) throws NoSuchNamespaceException { + public Map loadNamespaceMetadata(Namespace namespace) + throws NoSuchNamespaceException { validateNamespace(namespace); - GetItemResponse response = dynamo.getItem(GetItemRequest.builder() - .tableName(awsProperties.dynamoDbTableName()) - .consistentRead(true) - .key(namespacePrimaryKey(namespace)) - .build()); + GetItemResponse response = + dynamo.getItem( + GetItemRequest.builder() + .tableName(awsProperties.dynamoDbTableName()) + .consistentRead(true) + .key(namespacePrimaryKey(namespace)) + .build()); if (!response.hasItem()) { throw new NoSuchNamespaceException("Cannot find namespace %s", namespace); @@ -257,11 +269,12 @@ public boolean dropNamespace(Namespace namespace) throws NamespaceNotEmptyExcept } try { - dynamo.deleteItem(DeleteItemRequest.builder() - .tableName(awsProperties.dynamoDbTableName()) - .key(namespacePrimaryKey(namespace)) - .conditionExpression("attribute_exists(" + COL_NAMESPACE + ")") - .build()); + dynamo.deleteItem( + DeleteItemRequest.builder() + .tableName(awsProperties.dynamoDbTableName()) + .key(namespacePrimaryKey(namespace)) + .conditionExpression("attribute_exists(" + COL_NAMESPACE + ")") + .build()); return true; } catch (ConditionalCheckFailedException e) { return false; @@ -269,7 +282,8 @@ public boolean dropNamespace(Namespace namespace) throws NamespaceNotEmptyExcept } @Override - public boolean setProperties(Namespace namespace, Map properties) throws NoSuchNamespaceException { + public boolean setProperties(Namespace namespace, Map properties) + throws NoSuchNamespaceException { List updateParts = Lists.newArrayList(); Map attributeNames = Maps.newHashMap(); Map attributeValues = Maps.newHashMap(); @@ -289,7 +303,8 @@ public boolean setProperties(Namespace namespace, Map properties } @Override - public boolean removeProperties(Namespace namespace, Set properties) throws NoSuchNamespaceException { + public boolean removeProperties(Namespace namespace, Set properties) + throws NoSuchNamespaceException { List removeParts = Lists.newArrayList(properties.iterator()); Map attributeNames = Maps.newHashMap(); Map attributeValues = Maps.newHashMap(); @@ -303,7 +318,8 @@ public boolean removeProperties(Namespace namespace, Set properties) thr List updateParts = Lists.newArrayList(); updateCatalogEntryMetadata(updateParts, attributeValues); - String updateExpression = "REMOVE " + COMMA.join(removeParts) + " SET " + COMMA.join(updateParts); + String updateExpression = + "REMOVE " + COMMA.join(removeParts) + " SET " + COMMA.join(updateParts); return updateProperties(namespace, updateExpression, attributeValues, attributeNames); } @@ -312,15 +328,17 @@ public List listTables(Namespace namespace) { List identifiers = Lists.newArrayList(); Map lastEvaluatedKey; String condition = COL_NAMESPACE + " = :ns"; - Map conditionValues = ImmutableMap.of( - ":ns", AttributeValue.builder().s(namespace.toString()).build()); + Map conditionValues = + ImmutableMap.of(":ns", AttributeValue.builder().s(namespace.toString()).build()); do { - QueryResponse response = dynamo.query(QueryRequest.builder() - .tableName(awsProperties.dynamoDbTableName()) - .indexName(GSI_NAMESPACE_IDENTIFIER) - .keyConditionExpression(condition) - .expressionAttributeValues(conditionValues) - .build()); + QueryResponse response = + dynamo.query( + QueryRequest.builder() + .tableName(awsProperties.dynamoDbTableName()) + .indexName(GSI_NAMESPACE_IDENTIFIER) + .keyConditionExpression(condition) + 
.expressionAttributeValues(conditionValues) + .build()); if (response.hasItems()) { for (Map item : response.items()) { @@ -340,11 +358,13 @@ public List listTables(Namespace namespace) { public boolean dropTable(TableIdentifier identifier, boolean purge) { Map key = tablePrimaryKey(identifier); try { - GetItemResponse response = dynamo.getItem(GetItemRequest.builder() - .tableName(awsProperties.dynamoDbTableName()) - .consistentRead(true) - .key(key) - .build()); + GetItemResponse response = + dynamo.getItem( + GetItemRequest.builder() + .tableName(awsProperties.dynamoDbTableName()) + .consistentRead(true) + .key(key) + .build()); if (!response.hasItem()) { throw new NoSuchTableException("Cannot find table %s to drop", identifier); @@ -352,12 +372,13 @@ public boolean dropTable(TableIdentifier identifier, boolean purge) { TableOperations ops = newTableOps(identifier); TableMetadata lastMetadata = ops.current(); - dynamo.deleteItem(DeleteItemRequest.builder() - .tableName(awsProperties.dynamoDbTableName()) - .key(tablePrimaryKey(identifier)) - .conditionExpression(COL_VERSION + " = :v") - .expressionAttributeValues(ImmutableMap.of(":v", response.item().get(COL_VERSION))) - .build()); + dynamo.deleteItem( + DeleteItemRequest.builder() + .tableName(awsProperties.dynamoDbTableName()) + .key(tablePrimaryKey(identifier)) + .conditionExpression(COL_VERSION + " = :v") + .expressionAttributeValues(ImmutableMap.of(":v", response.item().get(COL_VERSION))) + .build()); LOG.info("Successfully dropped table {} from DynamoDb catalog", identifier); if (purge && lastMetadata != null) { @@ -381,24 +402,30 @@ public void renameTable(TableIdentifier from, TableIdentifier to) { Map fromKey = tablePrimaryKey(from); Map toKey = tablePrimaryKey(to); - GetItemResponse fromResponse = dynamo.getItem(GetItemRequest.builder() - .tableName(awsProperties.dynamoDbTableName()) - .consistentRead(true) - .key(fromKey) - .build()); + GetItemResponse fromResponse = + dynamo.getItem( + GetItemRequest.builder() + .tableName(awsProperties.dynamoDbTableName()) + .consistentRead(true) + .key(fromKey) + .build()); if (!fromResponse.hasItem()) { - throw new NoSuchTableException("Cannot rename table %s to %s: %s does not exist", from, to, from); + throw new NoSuchTableException( + "Cannot rename table %s to %s: %s does not exist", from, to, from); } - GetItemResponse toResponse = dynamo.getItem(GetItemRequest.builder() - .tableName(awsProperties.dynamoDbTableName()) - .consistentRead(true) - .key(toKey) - .build()); + GetItemResponse toResponse = + dynamo.getItem( + GetItemRequest.builder() + .tableName(awsProperties.dynamoDbTableName()) + .consistentRead(true) + .key(toKey) + .build()); if (toResponse.hasItem()) { - throw new AlreadyExistsException("Cannot rename table %s to %s: %s already exists", from, to, to); + throw new AlreadyExistsException( + "Cannot rename table %s to %s: %s already exists", from, to, to); } fromResponse.item().entrySet().stream() @@ -407,23 +434,27 @@ public void renameTable(TableIdentifier from, TableIdentifier to) { setNewCatalogEntryMetadata(toKey); - dynamo.transactWriteItems(TransactWriteItemsRequest.builder() - .transactItems( - TransactWriteItem.builder() - .delete(Delete.builder() - .tableName(awsProperties.dynamoDbTableName()) - .key(fromKey) - .conditionExpression(COL_VERSION + " = :v") - .expressionAttributeValues(ImmutableMap.of(":v", fromResponse.item().get(COL_VERSION))) - .build()) - .build(), - TransactWriteItem.builder() - .put(Put.builder() - .tableName(awsProperties.dynamoDbTableName()) 
- .item(toKey) - .conditionExpression("attribute_not_exists(" + COL_VERSION + ")") + dynamo.transactWriteItems( + TransactWriteItemsRequest.builder() + .transactItems( + TransactWriteItem.builder() + .delete( + Delete.builder() + .tableName(awsProperties.dynamoDbTableName()) + .key(fromKey) + .conditionExpression(COL_VERSION + " = :v") + .expressionAttributeValues( + ImmutableMap.of(":v", fromResponse.item().get(COL_VERSION))) + .build()) + .build(), + TransactWriteItem.builder() + .put( + Put.builder() + .tableName(awsProperties.dynamoDbTableName()) + .item(toKey) + .conditionExpression("attribute_not_exists(" + COL_VERSION + ")") + .build()) .build()) - .build()) .build()); LOG.info("Successfully renamed table from {} to {}", from, to); @@ -445,9 +476,10 @@ public void close() throws IOException { } /** - * The property used to set a default location for tables in a namespace. - * Call {@link #setProperties(Namespace, Map)} to set a path value using this property for a namespace, - * then all tables in the namespace will have default table root path under that given path. + * The property used to set a default location for tables in a namespace. Call {@link + * #setProperties(Namespace, Map)} to set a path value using this property for a namespace, then + * all tables in the namespace will have default table root path under that given path. + * * @return default location property key */ public static String defaultLocationProperty() { @@ -487,9 +519,11 @@ static void setNewCatalogEntryMetadata(Map values) { values.put(COL_VERSION, AttributeValue.builder().s(UUID.randomUUID().toString()).build()); } - static void updateCatalogEntryMetadata(List updateParts, Map attributeValues) { + static void updateCatalogEntryMetadata( + List updateParts, Map attributeValues) { updateParts.add(COL_UPDATED_AT + " = :uat"); - attributeValues.put(":uat", AttributeValue.builder().n(Long.toString(System.currentTimeMillis())).build()); + attributeValues.put( + ":uat", AttributeValue.builder().n(Long.toString(System.currentTimeMillis())).build()); updateParts.add(COL_VERSION + " = :uv"); attributeValues.put(":uv", AttributeValue.builder().s(UUID.randomUUID().toString()).build()); } @@ -507,27 +541,28 @@ private FileIO initializeFileIO(Map properties) { private void validateNamespace(Namespace namespace) { for (String level : namespace.levels()) { - ValidationException.check(level != null && !level.isEmpty(), - "Namespace level must not be empty: %s", namespace); - ValidationException.check(!level.contains("."), - "Namespace level must not contain dot, but found %s in %s", level, namespace); + ValidationException.check( + level != null && !level.isEmpty(), "Namespace level must not be empty: %s", namespace); + ValidationException.check( + !level.contains("."), + "Namespace level must not contain dot, but found %s in %s", + level, + namespace); } } private void validateTableIdentifier(TableIdentifier identifier) { validateNamespace(identifier.namespace()); - ValidationException.check(identifier.hasNamespace(), - "Table namespace must not be empty: %s", identifier); + ValidationException.check( + identifier.hasNamespace(), "Table namespace must not be empty: %s", identifier); String tableName = identifier.name(); - ValidationException.check(!tableName.contains("."), - "Table name must not contain dot: %s", tableName); + ValidationException.check( + !tableName.contains("."), "Table name must not contain dot: %s", tableName); } private boolean dynamoDbTableExists(String tableName) { try { - 
dynamo.describeTable(DescribeTableRequest.builder() - .tableName(tableName) - .build()); + dynamo.describeTable(DescribeTableRequest.builder().tableName(tableName).build()); return true; } catch (ResourceNotFoundException e) { return false; @@ -539,24 +574,46 @@ private void ensureCatalogTableExistsOrCreate() { return; } - LOG.info("DynamoDb catalog table {} not found, trying to create", awsProperties.dynamoDbTableName()); - dynamo.createTable(CreateTableRequest.builder() - .tableName(awsProperties.dynamoDbTableName()) - .keySchema( - KeySchemaElement.builder().attributeName(COL_IDENTIFIER).keyType(KeyType.HASH).build(), - KeySchemaElement.builder().attributeName(COL_NAMESPACE).keyType(KeyType.RANGE).build()) - .attributeDefinitions( - AttributeDefinition.builder().attributeName(COL_IDENTIFIER).attributeType(ScalarAttributeType.S).build(), - AttributeDefinition.builder().attributeName(COL_NAMESPACE).attributeType(ScalarAttributeType.S).build()) - .globalSecondaryIndexes(GlobalSecondaryIndex.builder() - .indexName(GSI_NAMESPACE_IDENTIFIER) + LOG.info( + "DynamoDb catalog table {} not found, trying to create", awsProperties.dynamoDbTableName()); + dynamo.createTable( + CreateTableRequest.builder() + .tableName(awsProperties.dynamoDbTableName()) .keySchema( - KeySchemaElement.builder().attributeName(COL_NAMESPACE).keyType(KeyType.HASH).build(), - KeySchemaElement.builder().attributeName(COL_IDENTIFIER).keyType(KeyType.RANGE).build()) - .projection(Projection.builder().projectionType(ProjectionType.KEYS_ONLY).build()) - .build()) - .billingMode(BillingMode.PAY_PER_REQUEST) - .build()); + KeySchemaElement.builder() + .attributeName(COL_IDENTIFIER) + .keyType(KeyType.HASH) + .build(), + KeySchemaElement.builder() + .attributeName(COL_NAMESPACE) + .keyType(KeyType.RANGE) + .build()) + .attributeDefinitions( + AttributeDefinition.builder() + .attributeName(COL_IDENTIFIER) + .attributeType(ScalarAttributeType.S) + .build(), + AttributeDefinition.builder() + .attributeName(COL_NAMESPACE) + .attributeType(ScalarAttributeType.S) + .build()) + .globalSecondaryIndexes( + GlobalSecondaryIndex.builder() + .indexName(GSI_NAMESPACE_IDENTIFIER) + .keySchema( + KeySchemaElement.builder() + .attributeName(COL_NAMESPACE) + .keyType(KeyType.HASH) + .build(), + KeySchemaElement.builder() + .attributeName(COL_IDENTIFIER) + .keyType(KeyType.RANGE) + .build()) + .projection( + Projection.builder().projectionType(ProjectionType.KEYS_ONLY).build()) + .build()) + .billingMode(BillingMode.PAY_PER_REQUEST) + .build()); // wait for the dynamo table to complete provisioning, which takes around 10 seconds Tasks.foreach(awsProperties.dynamoDbTableName()) @@ -568,44 +625,51 @@ private void ensureCatalogTableExistsOrCreate() { private void checkTableActive(String tableName) { try { - DescribeTableResponse response = dynamo.describeTable(DescribeTableRequest.builder() - .tableName(tableName) - .build()); + DescribeTableResponse response = + dynamo.describeTable(DescribeTableRequest.builder().tableName(tableName).build()); TableStatus currentStatus = response.table().tableStatus(); if (!currentStatus.equals(TableStatus.ACTIVE)) { - throw new IllegalStateException(String.format("Dynamo catalog table %s is not active, current status: %s", - tableName, currentStatus)); + throw new IllegalStateException( + String.format( + "Dynamo catalog table %s is not active, current status: %s", + tableName, currentStatus)); } } catch (ResourceNotFoundException e) { - throw new IllegalStateException(String.format("Cannot find Dynamo catalog 
table %s", tableName)); + throw new IllegalStateException( + String.format("Cannot find Dynamo catalog table %s", tableName)); } } - private boolean updateProperties(Namespace namespace, String updateExpression, - Map attributeValues, - Map attributeNames) { + private boolean updateProperties( + Namespace namespace, + String updateExpression, + Map attributeValues, + Map attributeNames) { validateNamespace(namespace); Map key = namespacePrimaryKey(namespace); try { - GetItemResponse response = dynamo.getItem(GetItemRequest.builder() - .tableName(awsProperties.dynamoDbTableName()) - .consistentRead(true) - .key(key) - .build()); + GetItemResponse response = + dynamo.getItem( + GetItemRequest.builder() + .tableName(awsProperties.dynamoDbTableName()) + .consistentRead(true) + .key(key) + .build()); if (!response.hasItem()) { throw new NoSuchNamespaceException("Cannot find namespace %s", namespace); } attributeValues.put(":v", response.item().get(COL_VERSION)); - dynamo.updateItem(UpdateItemRequest.builder() - .tableName(awsProperties.dynamoDbTableName()) - .key(key) - .conditionExpression(COL_VERSION + " = :v") - .updateExpression(updateExpression) - .expressionAttributeValues(attributeValues) - .expressionAttributeNames(attributeNames) - .build()); + dynamo.updateItem( + UpdateItemRequest.builder() + .tableName(awsProperties.dynamoDbTableName()) + .key(key) + .conditionExpression(COL_VERSION + " = :v") + .updateExpression(updateExpression) + .expressionAttributeValues(attributeValues) + .expressionAttributeNames(attributeNames) + .build()); return true; } catch (ConditionalCheckFailedException e) { return false; diff --git a/aws/src/main/java/org/apache/iceberg/aws/dynamodb/DynamoDbLockManager.java b/aws/src/main/java/org/apache/iceberg/aws/dynamodb/DynamoDbLockManager.java index 8ed40d18d46f..e38a0cecf34c 100644 --- a/aws/src/main/java/org/apache/iceberg/aws/dynamodb/DynamoDbLockManager.java +++ b/aws/src/main/java/org/apache/iceberg/aws/dynamodb/DynamoDbLockManager.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.aws.dynamodb; import java.util.List; @@ -59,9 +58,7 @@ import software.amazon.awssdk.services.dynamodb.model.TableStatus; import software.amazon.awssdk.services.dynamodb.model.TransactionConflictException; -/** - * DynamoDB implementation for the lock manager. - */ +/** DynamoDB implementation for the lock manager. 
*/ public class DynamoDbLockManager extends LockManagers.BaseLockManager { private static final Logger LOG = LoggerFactory.getLogger(DynamoDbLockManager.class); @@ -71,44 +68,43 @@ public class DynamoDbLockManager extends LockManagers.BaseLockManager { private static final String COL_VERSION = "version"; private static final String COL_LOCK_OWNER_ID = "ownerId"; - private static final String CONDITION_LOCK_ID_MATCH = String.format( - "%s = :eid AND %s = :oid", - COL_LOCK_ENTITY_ID, COL_LOCK_OWNER_ID); - private static final String CONDITION_LOCK_ENTITY_NOT_EXIST = String.format( - "attribute_not_exists(%s)", - COL_LOCK_ENTITY_ID); - private static final String CONDITION_LOCK_ENTITY_NOT_EXIST_OR_VERSION_MATCH = String.format( - "attribute_not_exists(%s) OR (%s = :eid AND %s = :vid)", - COL_LOCK_ENTITY_ID, COL_LOCK_ENTITY_ID, COL_VERSION); + private static final String CONDITION_LOCK_ID_MATCH = + String.format("%s = :eid AND %s = :oid", COL_LOCK_ENTITY_ID, COL_LOCK_OWNER_ID); + private static final String CONDITION_LOCK_ENTITY_NOT_EXIST = + String.format("attribute_not_exists(%s)", COL_LOCK_ENTITY_ID); + private static final String CONDITION_LOCK_ENTITY_NOT_EXIST_OR_VERSION_MATCH = + String.format( + "attribute_not_exists(%s) OR (%s = :eid AND %s = :vid)", + COL_LOCK_ENTITY_ID, COL_LOCK_ENTITY_ID, COL_VERSION); private static final int LOCK_TABLE_CREATION_WAIT_ATTEMPTS_MAX = 5; private static final int RELEASE_RETRY_ATTEMPTS_MAX = 5; - private static final List LOCK_TABLE_SCHEMA = Lists.newArrayList( - KeySchemaElement.builder() - .attributeName(COL_LOCK_ENTITY_ID) - .keyType(KeyType.HASH) - .build()); + private static final List LOCK_TABLE_SCHEMA = + Lists.newArrayList( + KeySchemaElement.builder() + .attributeName(COL_LOCK_ENTITY_ID) + .keyType(KeyType.HASH) + .build()); - private static final List LOCK_TABLE_COL_DEFINITIONS = Lists.newArrayList( - AttributeDefinition.builder() - .attributeName(COL_LOCK_ENTITY_ID) - .attributeType(ScalarAttributeType.S) - .build()); + private static final List LOCK_TABLE_COL_DEFINITIONS = + Lists.newArrayList( + AttributeDefinition.builder() + .attributeName(COL_LOCK_ENTITY_ID) + .attributeType(ScalarAttributeType.S) + .build()); private final Map heartbeats = Maps.newHashMap(); private DynamoDbClient dynamo; private String lockTableName; - /** - * constructor for dynamic initialization, {@link #initialize(Map)} must be called later. - */ - public DynamoDbLockManager() { - } + /** constructor for dynamic initialization, {@link #initialize(Map)} must be called later. 
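A usage sketch for this lock manager, assuming the testing constructor shown just below is accessible and that a lock table named "iceberg_locks" has been chosen; production code would use the no-arg constructor followed by initialize(Map):

    import org.apache.iceberg.aws.dynamodb.DynamoDbLockManager;
    import software.amazon.awssdk.services.dynamodb.DynamoDbClient;

    class LockManagerSketch {
      public static void main(String[] args) {
        DynamoDbClient dynamo = DynamoDbClient.create();
        DynamoDbLockManager locks = new DynamoDbLockManager(dynamo, "iceberg_locks");

        String entityId = "db1.table1";   // what is being locked (placeholder)
        String ownerId = "writer-1";      // who is locking it (placeholder)
        if (locks.acquire(entityId, ownerId)) {
          try {
            // do the commit work that requires the lock
          } finally {
            locks.release(entityId, ownerId);
          }
        }
        locks.close();
      }
    }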
*/ + public DynamoDbLockManager() {} /** * constructor used for testing purpose + * * @param dynamo dynamo client * @param lockTableName lock table name */ @@ -126,12 +122,13 @@ private void ensureLockTableExistsOrCreate() { } LOG.info("Dynamo lock table {} not found, trying to create", lockTableName); - dynamo.createTable(CreateTableRequest.builder() - .tableName(lockTableName) - .keySchema(lockTableSchema()) - .attributeDefinitions(lockTableColDefinitions()) - .billingMode(BillingMode.PAY_PER_REQUEST) - .build()); + dynamo.createTable( + CreateTableRequest.builder() + .tableName(lockTableName) + .keySchema(lockTableSchema()) + .attributeDefinitions(lockTableColDefinitions()) + .billingMode(BillingMode.PAY_PER_REQUEST) + .build()); Tasks.foreach(lockTableName) .retry(LOCK_TABLE_CREATION_WAIT_ATTEMPTS_MAX) @@ -143,9 +140,7 @@ private void ensureLockTableExistsOrCreate() { @VisibleForTesting boolean tableExists(String tableName) { try { - dynamo.describeTable(DescribeTableRequest.builder() - .tableName(tableName) - .build()); + dynamo.describeTable(DescribeTableRequest.builder().tableName(tableName).build()); return true; } catch (ResourceNotFoundException e) { return false; @@ -154,13 +149,13 @@ boolean tableExists(String tableName) { private void checkTableActive(String tableName) { try { - DescribeTableResponse response = dynamo.describeTable(DescribeTableRequest.builder() - .tableName(tableName) - .build()); + DescribeTableResponse response = + dynamo.describeTable(DescribeTableRequest.builder().tableName(tableName).build()); TableStatus currentStatus = response.table().tableStatus(); if (!currentStatus.equals(TableStatus.ACTIVE)) { - throw new IllegalStateException(String.format("Dynamo table %s is not active, current status: %s", - tableName, currentStatus)); + throw new IllegalStateException( + String.format( + "Dynamo table %s is not active, current status: %s", tableName, currentStatus)); } } catch (ResourceNotFoundException e) { throw new IllegalStateException(String.format("Cannot find Dynamo table %s", tableName)); @@ -198,18 +193,21 @@ public boolean acquire(String entityId, String ownerId) { @VisibleForTesting void acquireOnce(String entityId, String ownerId) { - GetItemResponse response = dynamo.getItem(GetItemRequest.builder() - .tableName(lockTableName) - .consistentRead(true) - .key(toKey(entityId)) - .build()); + GetItemResponse response = + dynamo.getItem( + GetItemRequest.builder() + .tableName(lockTableName) + .consistentRead(true) + .key(toKey(entityId)) + .build()); if (!response.hasItem()) { - dynamo.putItem(PutItemRequest.builder() - .tableName(lockTableName) - .item(toNewItem(entityId, ownerId, heartbeatTimeoutMs())) - .conditionExpression(CONDITION_LOCK_ENTITY_NOT_EXIST) - .build()); + dynamo.putItem( + PutItemRequest.builder() + .tableName(lockTableName) + .item(toNewItem(entityId, ownerId, heartbeatTimeoutMs())) + .conditionExpression(CONDITION_LOCK_ENTITY_NOT_EXIST) + .build()); } else { Map currentItem = response.item(); @@ -217,17 +215,21 @@ void acquireOnce(String entityId, String ownerId) { Thread.sleep(Long.parseLong(currentItem.get(COL_LEASE_DURATION_MS).n())); } catch (InterruptedException e) { throw new IllegalStateException( - String.format("Fail to acquire lock %s by %s, interrupted during sleep", entityId, ownerId), e); + String.format( + "Fail to acquire lock %s by %s, interrupted during sleep", entityId, ownerId), + e); } - dynamo.putItem(PutItemRequest.builder() - .tableName(lockTableName) - .item(toNewItem(entityId, ownerId, 
heartbeatTimeoutMs())) - .conditionExpression(CONDITION_LOCK_ENTITY_NOT_EXIST_OR_VERSION_MATCH) - .expressionAttributeValues(ImmutableMap.of( - ":eid", AttributeValue.builder().s(entityId).build(), - ":vid", AttributeValue.builder().s(currentItem.get(COL_VERSION).s()).build())) - .build()); + dynamo.putItem( + PutItemRequest.builder() + .tableName(lockTableName) + .item(toNewItem(entityId, ownerId, heartbeatTimeoutMs())) + .conditionExpression(CONDITION_LOCK_ENTITY_NOT_EXIST_OR_VERSION_MATCH) + .expressionAttributeValues( + ImmutableMap.of( + ":eid", AttributeValue.builder().s(entityId).build(), + ":vid", AttributeValue.builder().s(currentItem.get(COL_VERSION).s()).build())) + .build()); } startNewHeartbeat(entityId, ownerId); @@ -238,8 +240,9 @@ private void startNewHeartbeat(String entityId, String ownerId) { heartbeats.remove(entityId).cancel(); } - DynamoDbHeartbeat heartbeat = new DynamoDbHeartbeat(dynamo, lockTableName, - heartbeatIntervalMs(), heartbeatTimeoutMs(), entityId, ownerId); + DynamoDbHeartbeat heartbeat = + new DynamoDbHeartbeat( + dynamo, lockTableName, heartbeatIntervalMs(), heartbeatTimeoutMs(), entityId, ownerId); heartbeat.schedule(scheduler()); heartbeats.put(entityId, heartbeat); } @@ -257,19 +260,28 @@ public boolean release(String entityId, String ownerId) { TransactionConflictException.class, RequestLimitExceededException.class, InternalServerErrorException.class) - .run(id -> dynamo.deleteItem(DeleteItemRequest.builder() - .tableName(lockTableName) - .key(toKey(id)) - .conditionExpression(CONDITION_LOCK_ID_MATCH) - .expressionAttributeValues(toLockIdValues(id, ownerId)) - .build())); + .run( + id -> + dynamo.deleteItem( + DeleteItemRequest.builder() + .tableName(lockTableName) + .key(toKey(id)) + .conditionExpression(CONDITION_LOCK_ID_MATCH) + .expressionAttributeValues(toLockIdValues(id, ownerId)) + .build())); succeeded = true; } catch (ConditionalCheckFailedException e) { - LOG.error("Failed to release lock for entity: {}, owner: {}, lock entity does not exist or owner not match", - entityId, ownerId, e); + LOG.error( + "Failed to release lock for entity: {}, owner: {}, lock entity does not exist or owner not match", + entityId, + ownerId, + e); } catch (DynamoDbException e) { - LOG.error("Failed to release lock {} by for entity: {}, owner: {}, encountered unexpected DynamoDB exception", - entityId, ownerId, e); + LOG.error( + "Failed to release lock {} by for entity: {}, owner: {}, encountered unexpected DynamoDB exception", + entityId, + ownerId, + e); } finally { if (heartbeat != null && heartbeat.ownerId().equals(ownerId)) { heartbeat.cancel(); @@ -283,12 +295,14 @@ private static Map toKey(String entityId) { return ImmutableMap.of(COL_LOCK_ENTITY_ID, AttributeValue.builder().s(entityId).build()); } - private static Map toNewItem(String entityId, String ownerId, long heartbeatTimeoutMs) { + private static Map toNewItem( + String entityId, String ownerId, long heartbeatTimeoutMs) { return ImmutableMap.of( COL_LOCK_ENTITY_ID, AttributeValue.builder().s(entityId).build(), COL_LOCK_OWNER_ID, AttributeValue.builder().s(ownerId).build(), COL_VERSION, AttributeValue.builder().s(UUID.randomUUID().toString()).build(), - COL_LEASE_DURATION_MS, AttributeValue.builder().n(Long.toString(heartbeatTimeoutMs)).build()); + COL_LEASE_DURATION_MS, + AttributeValue.builder().n(Long.toString(heartbeatTimeoutMs)).build()); } private static Map toLockIdValues(String entityId, String ownerId) { @@ -306,6 +320,7 @@ public void close() { /** * The lock table schema, for users who 
would like to create the table separately + * * @return lock table schema */ public static List lockTableSchema() { @@ -314,6 +329,7 @@ public static List lockTableSchema() { /** * The lock table column definition, for users who whould like to create the table separately + * * @return lock table column definition */ public static List lockTableColDefinitions() { @@ -330,8 +346,13 @@ private static class DynamoDbHeartbeat implements Runnable { private final String ownerId; private ScheduledFuture future; - DynamoDbHeartbeat(DynamoDbClient dynamo, String lockTableName, long intervalMs, long timeoutMs, - String entityId, String ownerId) { + DynamoDbHeartbeat( + DynamoDbClient dynamo, + String lockTableName, + long intervalMs, + long timeoutMs, + String entityId, + String ownerId) { this.dynamo = dynamo; this.lockTableName = lockTableName; this.intervalMs = intervalMs; @@ -344,15 +365,20 @@ private static class DynamoDbHeartbeat implements Runnable { @Override public void run() { try { - dynamo.putItem(PutItemRequest.builder() - .tableName(lockTableName) - .item(toNewItem(entityId, ownerId, timeoutMs)) - .conditionExpression(CONDITION_LOCK_ID_MATCH) - .expressionAttributeValues(toLockIdValues(entityId, ownerId)) - .build()); + dynamo.putItem( + PutItemRequest.builder() + .tableName(lockTableName) + .item(toNewItem(entityId, ownerId, timeoutMs)) + .conditionExpression(CONDITION_LOCK_ID_MATCH) + .expressionAttributeValues(toLockIdValues(entityId, ownerId)) + .build()); } catch (ConditionalCheckFailedException e) { - LOG.error("Fail to heartbeat for entity: {}, owner: {} due to conditional check failure, " + - "unsafe concurrent commits might be going on", entityId, ownerId, e); + LOG.error( + "Fail to heartbeat for entity: {}, owner: {} due to conditional check failure, " + + "unsafe concurrent commits might be going on", + entityId, + ownerId, + e); } catch (RuntimeException e) { LOG.error("Failed to heartbeat for entity: {}, owner: {}", entityId, ownerId, e); } diff --git a/aws/src/main/java/org/apache/iceberg/aws/dynamodb/DynamoDbTableOperations.java b/aws/src/main/java/org/apache/iceberg/aws/dynamodb/DynamoDbTableOperations.java index 46c816f0c33c..0fa4f8f0b1a7 100644 --- a/aws/src/main/java/org/apache/iceberg/aws/dynamodb/DynamoDbTableOperations.java +++ b/aws/src/main/java/org/apache/iceberg/aws/dynamodb/DynamoDbTableOperations.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.aws.dynamodb; import java.util.List; @@ -80,17 +79,21 @@ public FileIO io() { @Override protected void doRefresh() { String metadataLocation = null; - GetItemResponse table = dynamo.getItem(GetItemRequest.builder() - .tableName(awsProperties.dynamoDbTableName()) - .consistentRead(true) - .key(DynamoDbCatalog.tablePrimaryKey(tableIdentifier)) - .build()); + GetItemResponse table = + dynamo.getItem( + GetItemRequest.builder() + .tableName(awsProperties.dynamoDbTableName()) + .consistentRead(true) + .key(DynamoDbCatalog.tablePrimaryKey(tableIdentifier)) + .build()); if (table.hasItem()) { metadataLocation = getMetadataLocation(table); } else { if (currentMetadataLocation() != null) { - throw new NoSuchTableException("Cannot find table %s after refresh, " + - "maybe another process deleted it or revoked your access permission", tableName()); + throw new NoSuchTableException( + "Cannot find table %s after refresh, " + + "maybe another process deleted it or revoked your access permission", + tableName()); } } @@ -103,31 +106,36 @@ protected void doCommit(TableMetadata base, TableMetadata metadata) { CommitStatus commitStatus = CommitStatus.FAILURE; Map tableKey = DynamoDbCatalog.tablePrimaryKey(tableIdentifier); try { - GetItemResponse table = dynamo.getItem(GetItemRequest.builder() - .tableName(awsProperties.dynamoDbTableName()) - .consistentRead(true) - .key(tableKey) - .build()); + GetItemResponse table = + dynamo.getItem( + GetItemRequest.builder() + .tableName(awsProperties.dynamoDbTableName()) + .consistentRead(true) + .key(tableKey) + .build()); checkMetadataLocation(table, base); Map properties = prepareProperties(table, newMetadataLocation); persistTable(tableKey, table, properties); commitStatus = CommitStatus.SUCCESS; } catch (ConditionalCheckFailedException e) { - throw new CommitFailedException(e, "Cannot commit %s: concurrent update detected", tableName()); + throw new CommitFailedException( + e, "Cannot commit %s: concurrent update detected", tableName()); } catch (CommitFailedException e) { // any explicit commit failures are passed up and out to the retry handler throw e; } catch (RuntimeException persistFailure) { - LOG.error("Confirming if commit to {} indeed failed to persist, attempting to reconnect and check.", - fullTableName, persistFailure); + LOG.error( + "Confirming if commit to {} indeed failed to persist, attempting to reconnect and check.", + fullTableName, + persistFailure); commitStatus = checkCommitStatus(newMetadataLocation, metadata); switch (commitStatus) { case SUCCESS: break; case FAILURE: - throw new CommitFailedException(persistFailure, - "Cannot commit %s due to unexpected exception", tableName()); + throw new CommitFailedException( + persistFailure, "Cannot commit %s due to unexpected exception", tableName()); case UNKNOWN: throw new CommitStateUnknownException(persistFailure); } @@ -157,8 +165,10 @@ private String getMetadataLocation(GetItemResponse table) { return table.item().get(DynamoDbCatalog.toPropertyCol(METADATA_LOCATION_PROP)).s(); } - private Map prepareProperties(GetItemResponse response, String newMetadataLocation) { - Map properties = response.hasItem() ? getProperties(response) : Maps.newHashMap(); + private Map prepareProperties( + GetItemResponse response, String newMetadataLocation) { + Map properties = + response.hasItem() ? 
getProperties(response) : Maps.newHashMap(); properties.put(TABLE_TYPE_PROP, ICEBERG_TABLE_TYPE_VALUE.toUpperCase(Locale.ENGLISH)); properties.put(METADATA_LOCATION_PROP, newMetadataLocation); if (currentMetadataLocation() != null && !currentMetadataLocation().isEmpty()) { @@ -171,10 +181,13 @@ private Map prepareProperties(GetItemResponse response, String n private Map getProperties(GetItemResponse table) { return table.item().entrySet().stream() .filter(e -> DynamoDbCatalog.isProperty(e.getKey())) - .collect(Collectors.toMap(e -> DynamoDbCatalog.toPropertyKey(e.getKey()), e -> e.getValue().s())); + .collect( + Collectors.toMap( + e -> DynamoDbCatalog.toPropertyKey(e.getKey()), e -> e.getValue().s())); } - void persistTable(Map tableKey, GetItemResponse table, Map parameters) { + void persistTable( + Map tableKey, GetItemResponse table, Map parameters) { if (table.hasItem()) { LOG.debug("Committing existing DynamoDb catalog table: {}", tableName()); List updateParts = Lists.newArrayList(); @@ -187,31 +200,35 @@ void persistTable(Map tableKey, GetItemResponse table, M idx++; updateParts.add(attributeKey + " = " + attributeValue); attributeNames.put(attributeKey, DynamoDbCatalog.toPropertyCol(property.getKey())); - attributeValues.put(attributeValue, AttributeValue.builder().s(property.getValue()).build()); + attributeValues.put( + attributeValue, AttributeValue.builder().s(property.getValue()).build()); } DynamoDbCatalog.updateCatalogEntryMetadata(updateParts, attributeValues); String updateExpression = "SET " + DynamoDbCatalog.COMMA.join(updateParts); attributeValues.put(":v", table.item().get(DynamoDbCatalog.COL_VERSION)); - dynamo.updateItem(UpdateItemRequest.builder() - .tableName(awsProperties.dynamoDbTableName()) - .key(tableKey) - .conditionExpression(DynamoDbCatalog.COL_VERSION + " = :v") - .updateExpression(updateExpression) - .expressionAttributeValues(attributeValues) - .expressionAttributeNames(attributeNames) - .build()); + dynamo.updateItem( + UpdateItemRequest.builder() + .tableName(awsProperties.dynamoDbTableName()) + .key(tableKey) + .conditionExpression(DynamoDbCatalog.COL_VERSION + " = :v") + .updateExpression(updateExpression) + .expressionAttributeValues(attributeValues) + .expressionAttributeNames(attributeNames) + .build()); } else { LOG.debug("Committing new DynamoDb catalog table: {}", tableName()); Map values = Maps.newHashMap(tableKey); - parameters.forEach((k, v) -> values.put(DynamoDbCatalog.toPropertyCol(k), - AttributeValue.builder().s(v).build())); + parameters.forEach( + (k, v) -> + values.put(DynamoDbCatalog.toPropertyCol(k), AttributeValue.builder().s(v).build())); DynamoDbCatalog.setNewCatalogEntryMetadata(values); - dynamo.putItem(PutItemRequest.builder() - .tableName(awsProperties.dynamoDbTableName()) - .item(values) - .conditionExpression("attribute_not_exists(" + DynamoDbCatalog.COL_VERSION + ")") - .build()); + dynamo.putItem( + PutItemRequest.builder() + .tableName(awsProperties.dynamoDbTableName()) + .item(values) + .conditionExpression("attribute_not_exists(" + DynamoDbCatalog.COL_VERSION + ")") + .build()); } } } diff --git a/aws/src/main/java/org/apache/iceberg/aws/glue/DynamoLockManager.java b/aws/src/main/java/org/apache/iceberg/aws/glue/DynamoLockManager.java index aefe8c491382..09ffd5ccf662 100644 --- a/aws/src/main/java/org/apache/iceberg/aws/glue/DynamoLockManager.java +++ b/aws/src/main/java/org/apache/iceberg/aws/glue/DynamoLockManager.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
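The commit paths above rely on DynamoDB conditional writes for optimistic concurrency: a new catalog row is created with attribute_not_exists(version), and an update only succeeds while the stored version still matches the one read before the commit. A condensed sketch of that update, with attribute names flattened for illustration (the real code routes column and property names through DynamoDbCatalog helpers):

import java.util.Map;
import org.apache.iceberg.exceptions.CommitFailedException;
import software.amazon.awssdk.services.dynamodb.DynamoDbClient;
import software.amazon.awssdk.services.dynamodb.model.AttributeValue;
import software.amazon.awssdk.services.dynamodb.model.ConditionalCheckFailedException;
import software.amazon.awssdk.services.dynamodb.model.UpdateItemRequest;

static void commitMetadataLocation(
    DynamoDbClient dynamo,
    String catalogTableName,
    Map<String, AttributeValue> tableKey,
    String expectedVersion,
    String newVersion,
    String newMetadataLocation) {
  try {
    dynamo.updateItem(
        UpdateItemRequest.builder()
            .tableName(catalogTableName)
            .key(tableKey)
            // only succeed if no other committer bumped the version since we read it
            .conditionExpression("version = :v")
            .updateExpression("SET metadata_location = :loc, version = :next")
            .expressionAttributeValues(
                Map.of(
                    ":v", AttributeValue.builder().s(expectedVersion).build(),
                    ":next", AttributeValue.builder().s(newVersion).build(),
                    ":loc", AttributeValue.builder().s(newMetadataLocation).build()))
            .build());
  } catch (ConditionalCheckFailedException e) {
    // a concurrent committer won the race; surface it as a retryable commit failure
    throw new CommitFailedException(e, "Concurrent update detected for %s", catalogTableName);
  }
}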
*/ - package org.apache.iceberg.aws.glue; import org.apache.iceberg.aws.dynamodb.DynamoDbLockManager; @@ -25,9 +24,9 @@ import software.amazon.awssdk.services.dynamodb.DynamoDbClient; /** - * @deprecated this class is kept only for backwards compatibility. - * For GlueCatalog, Glue has supported optimistic locking and lock manager is no longer needed. - * For HadoopCatalog and HadoopTables, please use {@link org.apache.iceberg.aws.dynamodb.DynamoDbLockManager} instead. + * @deprecated this class is kept only for backwards compatibility. For GlueCatalog, Glue has + * supported optimistic locking and lock manager is no longer needed. For HadoopCatalog and + * HadoopTables, please use {@link org.apache.iceberg.aws.dynamodb.DynamoDbLockManager} instead. */ @Deprecated class DynamoLockManager extends DynamoDbLockManager { @@ -44,8 +43,10 @@ class DynamoLockManager extends DynamoDbLockManager { } private void logDeprecationWarning() { - LOG.warn("{} is deprecated. For GlueCatalog, Glue has supported optimistic locking and " + - "lock manager is no longer needed. For HadoopCatalog and HadoopTables, please use {} instead.", - DynamoLockManager.class.getName(), DynamoDbLockManager.class.getName()); + LOG.warn( + "{} is deprecated. For GlueCatalog, Glue has supported optimistic locking and " + + "lock manager is no longer needed. For HadoopCatalog and HadoopTables, please use {} instead.", + DynamoLockManager.class.getName(), + DynamoDbLockManager.class.getName()); } } diff --git a/aws/src/main/java/org/apache/iceberg/aws/glue/GlueCatalog.java b/aws/src/main/java/org/apache/iceberg/aws/glue/GlueCatalog.java index 755005abe6e1..0f7e9ddff2af 100644 --- a/aws/src/main/java/org/apache/iceberg/aws/glue/GlueCatalog.java +++ b/aws/src/main/java/org/apache/iceberg/aws/glue/GlueCatalog.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.aws.glue; import java.io.Closeable; @@ -95,18 +94,19 @@ public class GlueCatalog extends BaseMetastoreCatalog private Map catalogProperties; // Attempt to set versionId if available on the path - private static final DynMethods.UnboundMethod SET_VERSION_ID = DynMethods.builder("versionId") - .hiddenImpl("software.amazon.awssdk.services.glue.model.UpdateTableRequest$Builder", String.class) - .orNoop() - .build(); + private static final DynMethods.UnboundMethod SET_VERSION_ID = + DynMethods.builder("versionId") + .hiddenImpl( + "software.amazon.awssdk.services.glue.model.UpdateTableRequest$Builder", String.class) + .orNoop() + .build(); /** * No-arg constructor to load the catalog dynamically. - *
<p>
- * All fields are initialized by calling {@link GlueCatalog#initialize(String, Map)} later. + * + *
<p>
All fields are initialized by calling {@link GlueCatalog#initialize(String, Map)} later. */ - public GlueCatalog() { - } + public GlueCatalog() {} @Override public void initialize(String name, Map properties) { @@ -117,17 +117,21 @@ public void initialize(String name, Map properties) { properties, AwsProperties.GLUE_LAKEFORMATION_ENABLED, AwsProperties.GLUE_LAKEFORMATION_ENABLED_DEFAULT)) { - String factoryImpl = PropertyUtil.propertyAsString(properties, AwsProperties.CLIENT_FACTORY, null); - ImmutableMap.Builder builder = ImmutableMap.builder().putAll(properties); + String factoryImpl = + PropertyUtil.propertyAsString(properties, AwsProperties.CLIENT_FACTORY, null); + ImmutableMap.Builder builder = + ImmutableMap.builder().putAll(properties); if (factoryImpl == null) { builder.put(AwsProperties.CLIENT_FACTORY, LakeFormationAwsClientFactory.class.getName()); } this.catalogProperties = builder.build(); awsClientFactory = AwsClientFactories.from(catalogProperties); - Preconditions.checkArgument(awsClientFactory instanceof LakeFormationAwsClientFactory, + Preconditions.checkArgument( + awsClientFactory instanceof LakeFormationAwsClientFactory, "Detected LakeFormation enabled for Glue catalog, should use a client factory that extends %s, but found %s", - LakeFormationAwsClientFactory.class.getName(), factoryImpl); + LakeFormationAwsClientFactory.class.getName(), + factoryImpl); catalogFileIO = null; } else { awsClientFactory = AwsClientFactories.from(properties); @@ -177,8 +181,15 @@ void initialize( } @VisibleForTesting - void initialize(String name, String path, AwsProperties properties, GlueClient client, LockManager lock, FileIO io) { - Preconditions.checkArgument(path != null && path.length() > 0, + void initialize( + String name, + String path, + AwsProperties properties, + GlueClient client, + LockManager lock, + FileIO io) { + Preconditions.checkArgument( + path != null && path.length() > 0, "Cannot initialize GlueCatalog because warehousePath must not be null or empty"); this.catalogName = name; @@ -198,34 +209,51 @@ void initialize(String name, String path, AwsProperties properties, GlueClient c @Override protected TableOperations newTableOps(TableIdentifier tableIdentifier) { if (catalogProperties != null) { - Map tableSpecificCatalogProperties = ImmutableMap.builder() - .putAll(catalogProperties) - .put(AwsProperties.LAKE_FORMATION_DB_NAME, - IcebergToGlueConverter.getDatabaseName(tableIdentifier, awsProperties.glueCatalogSkipNameValidation())) - .put(AwsProperties.LAKE_FORMATION_TABLE_NAME, - IcebergToGlueConverter.getTableName(tableIdentifier, awsProperties.glueCatalogSkipNameValidation())) - .build(); - // FileIO initialization depends on tableSpecificCatalogProperties, so a new FileIO is initialized each time - return new GlueTableOperations(glue, lockManager, catalogName, awsProperties, - initializeFileIO(tableSpecificCatalogProperties), tableIdentifier); + Map tableSpecificCatalogProperties = + ImmutableMap.builder() + .putAll(catalogProperties) + .put( + AwsProperties.LAKE_FORMATION_DB_NAME, + IcebergToGlueConverter.getDatabaseName( + tableIdentifier, awsProperties.glueCatalogSkipNameValidation())) + .put( + AwsProperties.LAKE_FORMATION_TABLE_NAME, + IcebergToGlueConverter.getTableName( + tableIdentifier, awsProperties.glueCatalogSkipNameValidation())) + .build(); + // FileIO initialization depends on tableSpecificCatalogProperties, so a new FileIO is + // initialized each time + return new GlueTableOperations( + glue, + lockManager, + catalogName, + awsProperties, + 
initializeFileIO(tableSpecificCatalogProperties), + tableIdentifier); } - return new GlueTableOperations(glue, lockManager, catalogName, awsProperties, fileIO, tableIdentifier); + return new GlueTableOperations( + glue, lockManager, catalogName, awsProperties, fileIO, tableIdentifier); } /** - * This method produces the same result as using a HiveCatalog. - * If databaseUri exists for the Glue database URI, the default location is databaseUri/tableName. - * If not, the default location is warehousePath/databaseName.db/tableName + * This method produces the same result as using a HiveCatalog. If databaseUri exists for the Glue + * database URI, the default location is databaseUri/tableName. If not, the default location is + * warehousePath/databaseName.db/tableName + * * @param tableIdentifier table id * @return default warehouse path */ @Override protected String defaultWarehouseLocation(TableIdentifier tableIdentifier) { // check if value is set in database - GetDatabaseResponse response = glue.getDatabase(GetDatabaseRequest.builder() - .name(IcebergToGlueConverter.getDatabaseName(tableIdentifier, awsProperties.glueCatalogSkipNameValidation())) - .build()); + GetDatabaseResponse response = + glue.getDatabase( + GetDatabaseRequest.builder() + .name( + IcebergToGlueConverter.getDatabaseName( + tableIdentifier, awsProperties.glueCatalogSkipNameValidation())) + .build()); String dbLocationUri = response.database().locationUri(); if (dbLocationUri != null) { return String.format("%s/%s", dbLocationUri, tableIdentifier.name()); @@ -234,7 +262,8 @@ protected String defaultWarehouseLocation(TableIdentifier tableIdentifier) { return String.format( "%s/%s.db/%s", warehousePath, - IcebergToGlueConverter.getDatabaseName(tableIdentifier, awsProperties.glueCatalogSkipNameValidation()), + IcebergToGlueConverter.getDatabaseName( + tableIdentifier, awsProperties.glueCatalogSkipNameValidation()), tableIdentifier.name()); } @@ -245,17 +274,22 @@ public List listTables(Namespace namespace) { String nextToken = null; List results = Lists.newArrayList(); do { - GetTablesResponse response = glue.getTables(GetTablesRequest.builder() - .catalogId(awsProperties.glueCatalogId()) - .databaseName(IcebergToGlueConverter.toDatabaseName(namespace, awsProperties.glueCatalogSkipNameValidation())) - .nextToken(nextToken) - .build()); + GetTablesResponse response = + glue.getTables( + GetTablesRequest.builder() + .catalogId(awsProperties.glueCatalogId()) + .databaseName( + IcebergToGlueConverter.toDatabaseName( + namespace, awsProperties.glueCatalogSkipNameValidation())) + .nextToken(nextToken) + .build()); nextToken = response.nextToken(); if (response.hasTableList()) { - results.addAll(response.tableList().stream() - .filter(this::isGlueIcebergTable) - .map(GlueToIcebergConverter::toTableId) - .collect(Collectors.toList())); + results.addAll( + response.tableList().stream() + .filter(this::isGlueIcebergTable) + .map(GlueToIcebergConverter::toTableId) + .collect(Collectors.toList())); } } while (nextToken != null); @@ -264,8 +298,8 @@ public List listTables(Namespace namespace) { } private boolean isGlueIcebergTable(Table table) { - return table.parameters() != null && - BaseMetastoreTableOperations.ICEBERG_TABLE_TYPE_VALUE.equalsIgnoreCase( + return table.parameters() != null + && BaseMetastoreTableOperations.ICEBERG_TABLE_TYPE_VALUE.equalsIgnoreCase( table.parameters().get(BaseMetastoreTableOperations.TABLE_TYPE_PROP)); } @@ -274,12 +308,14 @@ public boolean dropTable(TableIdentifier identifier, boolean purge) { try { 
TableOperations ops = newTableOps(identifier); TableMetadata lastMetadata = ops.current(); - glue.deleteTable(DeleteTableRequest.builder() - .catalogId(awsProperties.glueCatalogId()) - .databaseName(IcebergToGlueConverter.getDatabaseName( - identifier, awsProperties.glueCatalogSkipNameValidation())) - .name(identifier.name()) - .build()); + glue.deleteTable( + DeleteTableRequest.builder() + .catalogId(awsProperties.glueCatalogId()) + .databaseName( + IcebergToGlueConverter.getDatabaseName( + identifier, awsProperties.glueCatalogSkipNameValidation())) + .name(identifier.name()) + .build()); LOG.info("Successfully dropped table {} from Glue", identifier); if (purge && lastMetadata != null) { CatalogUtil.dropTableData(ops.io(), lastMetadata); @@ -291,13 +327,15 @@ public boolean dropTable(TableIdentifier identifier, boolean purge) { LOG.error("Cannot drop table {} because table not found or not accessible", identifier, e); return false; } catch (Exception e) { - LOG.error("Cannot complete drop table operation for {} due to unexpected exception", identifier, e); + LOG.error( + "Cannot complete drop table operation for {} due to unexpected exception", identifier, e); throw e; } } /** * Rename table in Glue is a drop table and create table. + * * @param from identifier of the table to rename * @param to new table name */ @@ -305,51 +343,64 @@ public boolean dropTable(TableIdentifier identifier, boolean purge) { public void renameTable(TableIdentifier from, TableIdentifier to) { // check new namespace exists if (!namespaceExists(to.namespace())) { - throw new NoSuchNamespaceException("Cannot rename %s to %s because namespace %s does not exist", - from, to, to.namespace()); + throw new NoSuchNamespaceException( + "Cannot rename %s to %s because namespace %s does not exist", from, to, to.namespace()); } // keep metadata Table fromTable = null; - String fromTableDbName = IcebergToGlueConverter.getDatabaseName( - from, awsProperties.glueCatalogSkipNameValidation()); - String fromTableName = IcebergToGlueConverter.getTableName(from, awsProperties.glueCatalogSkipNameValidation()); - String toTableDbName = IcebergToGlueConverter.getDatabaseName(to, awsProperties.glueCatalogSkipNameValidation()); - String toTableName = IcebergToGlueConverter.getTableName(to, awsProperties.glueCatalogSkipNameValidation()); + String fromTableDbName = + IcebergToGlueConverter.getDatabaseName(from, awsProperties.glueCatalogSkipNameValidation()); + String fromTableName = + IcebergToGlueConverter.getTableName(from, awsProperties.glueCatalogSkipNameValidation()); + String toTableDbName = + IcebergToGlueConverter.getDatabaseName(to, awsProperties.glueCatalogSkipNameValidation()); + String toTableName = + IcebergToGlueConverter.getTableName(to, awsProperties.glueCatalogSkipNameValidation()); try { - GetTableResponse response = glue.getTable(GetTableRequest.builder() - .catalogId(awsProperties.glueCatalogId()) - .databaseName(fromTableDbName) - .name(fromTableName) - .build()); + GetTableResponse response = + glue.getTable( + GetTableRequest.builder() + .catalogId(awsProperties.glueCatalogId()) + .databaseName(fromTableDbName) + .name(fromTableName) + .build()); fromTable = response.table(); } catch (EntityNotFoundException e) { - throw new NoSuchTableException(e, "Cannot rename %s because the table does not exist in Glue", from); + throw new NoSuchTableException( + e, "Cannot rename %s because the table does not exist in Glue", from); } // use the same Glue info to create the new table, pointing to the old metadata - 
TableInput.Builder tableInputBuilder = TableInput.builder() - .owner(fromTable.owner()) - .tableType(fromTable.tableType()) - .parameters(fromTable.parameters()) - .storageDescriptor(fromTable.storageDescriptor()); - - glue.createTable(CreateTableRequest.builder() - .catalogId(awsProperties.glueCatalogId()) - .databaseName(toTableDbName) - .tableInput(tableInputBuilder.name(toTableName).build()) - .build()); + TableInput.Builder tableInputBuilder = + TableInput.builder() + .owner(fromTable.owner()) + .tableType(fromTable.tableType()) + .parameters(fromTable.parameters()) + .storageDescriptor(fromTable.storageDescriptor()); + + glue.createTable( + CreateTableRequest.builder() + .catalogId(awsProperties.glueCatalogId()) + .databaseName(toTableDbName) + .tableInput(tableInputBuilder.name(toTableName).build()) + .build()); LOG.info("created rename destination table {}", to); try { dropTable(from, false); } catch (Exception e) { // rollback, delete renamed table - LOG.error("Fail to drop old table {} after renaming to {}, rollback to use the old table", from, to, e); - glue.deleteTable(DeleteTableRequest.builder() - .catalogId(awsProperties.glueCatalogId()) - .databaseName(toTableDbName) - .name(toTableName) - .build()); + LOG.error( + "Fail to drop old table {} after renaming to {}, rollback to use the old table", + from, + to, + e); + glue.deleteTable( + DeleteTableRequest.builder() + .catalogId(awsProperties.glueCatalogId()) + .databaseName(toTableDbName) + .name(toTableName) + .build()); throw e; } @@ -359,14 +410,17 @@ public void renameTable(TableIdentifier from, TableIdentifier to) { @Override public void createNamespace(Namespace namespace, Map metadata) { try { - glue.createDatabase(CreateDatabaseRequest.builder() - .catalogId(awsProperties.glueCatalogId()) - .databaseInput(IcebergToGlueConverter.toDatabaseInput( - namespace, metadata, awsProperties.glueCatalogSkipNameValidation())) - .build()); + glue.createDatabase( + CreateDatabaseRequest.builder() + .catalogId(awsProperties.glueCatalogId()) + .databaseInput( + IcebergToGlueConverter.toDatabaseInput( + namespace, metadata, awsProperties.glueCatalogSkipNameValidation())) + .build()); LOG.info("Created namespace: {}", namespace); } catch (software.amazon.awssdk.services.glue.model.AlreadyExistsException e) { - throw new AlreadyExistsException("Cannot create namespace %s because it already exists in Glue", namespace); + throw new AlreadyExistsException( + "Cannot create namespace %s because it already exists in Glue", namespace); } } @@ -385,15 +439,18 @@ public List listNamespaces(Namespace namespace) throws NoSuchNamespac String nextToken = null; List results = Lists.newArrayList(); do { - GetDatabasesResponse response = glue.getDatabases(GetDatabasesRequest.builder() - .catalogId(awsProperties.glueCatalogId()) - .nextToken(nextToken) - .build()); + GetDatabasesResponse response = + glue.getDatabases( + GetDatabasesRequest.builder() + .catalogId(awsProperties.glueCatalogId()) + .nextToken(nextToken) + .build()); nextToken = response.nextToken(); if (response.hasDatabaseList()) { - results.addAll(response.databaseList().stream() - .map(GlueToIcebergConverter::toNamespace) - .collect(Collectors.toList())); + results.addAll( + response.databaseList().stream() + .map(GlueToIcebergConverter::toNamespace) + .collect(Collectors.toList())); } } while (nextToken != null); @@ -402,15 +459,19 @@ public List listNamespaces(Namespace namespace) throws NoSuchNamespac } @Override - public Map loadNamespaceMetadata(Namespace namespace) throws 
NoSuchNamespaceException { - String databaseName = IcebergToGlueConverter.toDatabaseName( - namespace, awsProperties.glueCatalogSkipNameValidation()); + public Map loadNamespaceMetadata(Namespace namespace) + throws NoSuchNamespaceException { + String databaseName = + IcebergToGlueConverter.toDatabaseName( + namespace, awsProperties.glueCatalogSkipNameValidation()); try { - Database database = glue.getDatabase(GetDatabaseRequest.builder() - .catalogId(awsProperties.glueCatalogId()) - .name(databaseName) - .build()) - .database(); + Database database = + glue.getDatabase( + GetDatabaseRequest.builder() + .catalogId(awsProperties.glueCatalogId()) + .name(databaseName) + .build()) + .database(); Map result = Maps.newHashMap(database.parameters()); if (database.locationUri() != null) { @@ -424,10 +485,11 @@ public Map loadNamespaceMetadata(Namespace namespace) throws NoS LOG.debug("Loaded metadata for namespace {} found {}", namespace, result); return result; } catch (InvalidInputException e) { - throw new NoSuchNamespaceException("invalid input for namespace %s, error message: %s", - namespace, e.getMessage()); + throw new NoSuchNamespaceException( + "invalid input for namespace %s, error message: %s", namespace, e.getMessage()); } catch (EntityNotFoundException e) { - throw new NoSuchNamespaceException("fail to find Glue database for namespace %s, error message: %s", + throw new NoSuchNamespaceException( + "fail to find Glue database for namespace %s, error message: %s", databaseName, e.getMessage()); } } @@ -436,10 +498,14 @@ public Map loadNamespaceMetadata(Namespace namespace) throws NoS public boolean dropNamespace(Namespace namespace) throws NamespaceNotEmptyException { namespaceExists(namespace); - GetTablesResponse response = glue.getTables(GetTablesRequest.builder() - .catalogId(awsProperties.glueCatalogId()) - .databaseName(IcebergToGlueConverter.toDatabaseName(namespace, awsProperties.glueCatalogSkipNameValidation())) - .build()); + GetTablesResponse response = + glue.getTables( + GetTablesRequest.builder() + .catalogId(awsProperties.glueCatalogId()) + .databaseName( + IcebergToGlueConverter.toDatabaseName( + namespace, awsProperties.glueCatalogSkipNameValidation())) + .build()); if (response.hasTableList() && !response.tableList().isEmpty()) { Table table = response.tableList().get(0); @@ -452,44 +518,57 @@ public boolean dropNamespace(Namespace namespace) throws NamespaceNotEmptyExcept } } - glue.deleteDatabase(DeleteDatabaseRequest.builder() - .catalogId(awsProperties.glueCatalogId()) - .name(IcebergToGlueConverter.toDatabaseName(namespace, awsProperties.glueCatalogSkipNameValidation())) - .build()); + glue.deleteDatabase( + DeleteDatabaseRequest.builder() + .catalogId(awsProperties.glueCatalogId()) + .name( + IcebergToGlueConverter.toDatabaseName( + namespace, awsProperties.glueCatalogSkipNameValidation())) + .build()); LOG.info("Dropped namespace: {}", namespace); // Always successful, otherwise exception is thrown return true; } @Override - public boolean setProperties(Namespace namespace, Map properties) throws NoSuchNamespaceException { + public boolean setProperties(Namespace namespace, Map properties) + throws NoSuchNamespaceException { Map newProperties = Maps.newHashMap(); newProperties.putAll(loadNamespaceMetadata(namespace)); newProperties.putAll(properties); - glue.updateDatabase(UpdateDatabaseRequest.builder() - .catalogId(awsProperties.glueCatalogId()) - .name(IcebergToGlueConverter.toDatabaseName(namespace, awsProperties.glueCatalogSkipNameValidation())) - 
.databaseInput(IcebergToGlueConverter.toDatabaseInput( - namespace, newProperties, awsProperties.glueCatalogSkipNameValidation())) - .build()); + glue.updateDatabase( + UpdateDatabaseRequest.builder() + .catalogId(awsProperties.glueCatalogId()) + .name( + IcebergToGlueConverter.toDatabaseName( + namespace, awsProperties.glueCatalogSkipNameValidation())) + .databaseInput( + IcebergToGlueConverter.toDatabaseInput( + namespace, newProperties, awsProperties.glueCatalogSkipNameValidation())) + .build()); LOG.debug("Successfully set properties {} for {}", properties.keySet(), namespace); // Always successful, otherwise exception is thrown return true; } @Override - public boolean removeProperties(Namespace namespace, Set properties) throws NoSuchNamespaceException { + public boolean removeProperties(Namespace namespace, Set properties) + throws NoSuchNamespaceException { Map metadata = Maps.newHashMap(loadNamespaceMetadata(namespace)); for (String property : properties) { metadata.remove(property); } - glue.updateDatabase(UpdateDatabaseRequest.builder() - .catalogId(awsProperties.glueCatalogId()) - .name(IcebergToGlueConverter.toDatabaseName(namespace, awsProperties.glueCatalogSkipNameValidation())) - .databaseInput(IcebergToGlueConverter.toDatabaseInput( - namespace, metadata, awsProperties.glueCatalogSkipNameValidation())) - .build()); + glue.updateDatabase( + UpdateDatabaseRequest.builder() + .catalogId(awsProperties.glueCatalogId()) + .name( + IcebergToGlueConverter.toDatabaseName( + namespace, awsProperties.glueCatalogSkipNameValidation())) + .databaseInput( + IcebergToGlueConverter.toDatabaseInput( + namespace, metadata, awsProperties.glueCatalogSkipNameValidation())) + .build()); LOG.debug("Successfully removed properties {} from {}", properties, namespace); // Always successful, otherwise exception is thrown return true; @@ -501,8 +580,8 @@ protected boolean isValidIdentifier(TableIdentifier tableIdentifier) { return true; } - return IcebergToGlueConverter.isValidNamespace(tableIdentifier.namespace()) && - IcebergToGlueConverter.isValidTableName(tableIdentifier.name()); + return IcebergToGlueConverter.isValidNamespace(tableIdentifier.namespace()) + && IcebergToGlueConverter.isValidTableName(tableIdentifier.name()); } @Override diff --git a/aws/src/main/java/org/apache/iceberg/aws/glue/GlueTableOperations.java b/aws/src/main/java/org/apache/iceberg/aws/glue/GlueTableOperations.java index 2db156b6e332..e7c8b969024e 100644 --- a/aws/src/main/java/org/apache/iceberg/aws/glue/GlueTableOperations.java +++ b/aws/src/main/java/org/apache/iceberg/aws/glue/GlueTableOperations.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
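The defaultWarehouseLocation javadoc above reduces to a two-branch rule. A tiny self-contained sketch of that rule with illustrative names, handy for sanity-checking expectations (for example, db.tbl under warehouse s3://bucket/warehouse resolves to s3://bucket/warehouse/db.db/tbl when the Glue database has no locationUri):

// Prefer the Glue database locationUri; otherwise fall back under the catalog warehouse path.
static String defaultTableLocation(
    String warehousePath, String dbLocationUri, String databaseName, String tableName) {
  if (dbLocationUri != null) {
    return String.format("%s/%s", dbLocationUri, tableName);
  }
  return String.format("%s/%s.db/%s", warehousePath, databaseName, tableName);
}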
*/ - package org.apache.iceberg.aws.glue; import java.util.Locale; @@ -72,19 +71,28 @@ class GlueTableOperations extends BaseMetastoreTableOperations { private final LockManager lockManager; // Attempt to set versionId if available on the path - private static final DynMethods.UnboundMethod SET_VERSION_ID = DynMethods.builder("versionId") - .hiddenImpl("software.amazon.awssdk.services.glue.model.UpdateTableRequest$Builder", String.class) - .orNoop() - .build(); + private static final DynMethods.UnboundMethod SET_VERSION_ID = + DynMethods.builder("versionId") + .hiddenImpl( + "software.amazon.awssdk.services.glue.model.UpdateTableRequest$Builder", String.class) + .orNoop() + .build(); - GlueTableOperations(GlueClient glue, LockManager lockManager, String catalogName, AwsProperties awsProperties, - FileIO fileIO, TableIdentifier tableIdentifier) { + GlueTableOperations( + GlueClient glue, + LockManager lockManager, + String catalogName, + AwsProperties awsProperties, + FileIO fileIO, + TableIdentifier tableIdentifier) { this.glue = glue; this.awsProperties = awsProperties; - this.databaseName = IcebergToGlueConverter.getDatabaseName( - tableIdentifier, awsProperties.glueCatalogSkipNameValidation()); - this.tableName = IcebergToGlueConverter.getTableName( - tableIdentifier, awsProperties.glueCatalogSkipNameValidation()); + this.databaseName = + IcebergToGlueConverter.getDatabaseName( + tableIdentifier, awsProperties.glueCatalogSkipNameValidation()); + this.tableName = + IcebergToGlueConverter.getTableName( + tableIdentifier, awsProperties.glueCatalogSkipNameValidation()); this.fullTableName = String.format("%s.%s.%s", catalogName, databaseName, tableName); this.commitLockEntityId = String.format("%s.%s", databaseName, tableName); this.fileIO = fileIO; @@ -110,8 +118,10 @@ protected void doRefresh() { metadataLocation = table.parameters().get(METADATA_LOCATION_PROP); } else { if (currentMetadataLocation() != null) { - throw new NoSuchTableException("Cannot find Glue table %s after refresh, " + - "maybe another process deleted it or revoked your access permission", tableName()); + throw new NoSuchTableException( + "Cannot find Glue table %s after refresh, " + + "maybe another process deleted it or revoked your access permission", + tableName()); } } @@ -137,24 +147,30 @@ protected void doCommit(TableMetadata base, TableMetadata metadata) { } catch (CommitFailedException e) { throw e; } catch (ConcurrentModificationException e) { - throw new CommitFailedException(e, "Cannot commit %s because Glue detected concurrent update", tableName()); + throw new CommitFailedException( + e, "Cannot commit %s because Glue detected concurrent update", tableName()); } catch (software.amazon.awssdk.services.glue.model.AlreadyExistsException e) { - throw new AlreadyExistsException(e, - "Cannot commit %s because its Glue table already exists when trying to create one", tableName()); + throw new AlreadyExistsException( + e, + "Cannot commit %s because its Glue table already exists when trying to create one", + tableName()); } catch (EntityNotFoundException e) { - throw new NotFoundException(e, - "Cannot commit %s because Glue cannot find the requested entity", tableName()); + throw new NotFoundException( + e, "Cannot commit %s because Glue cannot find the requested entity", tableName()); } catch (software.amazon.awssdk.services.glue.model.AccessDeniedException e) { - throw new ForbiddenException(e, - "Cannot commit %s because Glue cannot access the requested resources", tableName()); + throw new ForbiddenException( 
+ e, "Cannot commit %s because Glue cannot access the requested resources", tableName()); } catch (software.amazon.awssdk.services.glue.model.ValidationException e) { - throw new ValidationException(e, - "Cannot commit %s because Glue encountered a validation exception " + - "while accessing requested resources", + throw new ValidationException( + e, + "Cannot commit %s because Glue encountered a validation exception " + + "while accessing requested resources", tableName()); } catch (RuntimeException persistFailure) { - LOG.error("Confirming if commit to {} indeed failed to persist, attempting to reconnect and check.", - fullTableName, persistFailure); + LOG.error( + "Confirming if commit to {} indeed failed to persist, attempting to reconnect and check.", + fullTableName, + persistFailure); if (persistFailure instanceof AwsServiceException) { int statusCode = ((AwsServiceException) persistFailure).statusCode(); @@ -171,8 +187,8 @@ protected void doCommit(TableMetadata base, TableMetadata metadata) { case SUCCESS: break; case FAILURE: - throw new CommitFailedException(persistFailure, - "Cannot commit %s due to unexpected exception", tableName()); + throw new CommitFailedException( + persistFailure, "Cannot commit %s due to unexpected exception", tableName()); case UNKNOWN: throw new CommitStateUnknownException(persistFailure); } @@ -186,35 +202,43 @@ private boolean createGlueTempTableIfNecessary(TableMetadata base, String metada if (awsProperties.glueLakeFormationEnabled() && base == null) { // LakeFormation credential require TableArn as input,so creating a dummy table // beforehand for create table scenario - glue.createTable(CreateTableRequest.builder() - .databaseName(databaseName) - .tableInput(TableInput.builder() - .parameters(ImmutableMap.of(TABLE_TYPE_PROP, ICEBERG_TABLE_TYPE_VALUE)) - .name(tableName) - .storageDescriptor(StorageDescriptor.builder().location(metadataLocation).build()) - .build()) - .build()); + glue.createTable( + CreateTableRequest.builder() + .databaseName(databaseName) + .tableInput( + TableInput.builder() + .parameters(ImmutableMap.of(TABLE_TYPE_PROP, ICEBERG_TABLE_TYPE_VALUE)) + .name(tableName) + .storageDescriptor( + StorageDescriptor.builder().location(metadataLocation).build()) + .build()) + .build()); return true; } return false; } - private void cleanupGlueTempTableIfNecessary(boolean glueTempTableCreated, CommitStatus commitStatus) { + private void cleanupGlueTempTableIfNecessary( + boolean glueTempTableCreated, CommitStatus commitStatus) { if (glueTempTableCreated && commitStatus != CommitStatus.SUCCESS) { - glue.deleteTable(DeleteTableRequest.builder().databaseName(databaseName).name(tableName).build()); + glue.deleteTable( + DeleteTableRequest.builder().databaseName(databaseName).name(tableName).build()); } } private void lock(String newMetadataLocation) { if (lockManager != null && !lockManager.acquire(commitLockEntityId, newMetadataLocation)) { - throw new IllegalStateException(String.format("Fail to acquire lock %s to commit new metadata at %s", - commitLockEntityId, newMetadataLocation)); + throw new IllegalStateException( + String.format( + "Fail to acquire lock %s to commit new metadata at %s", + commitLockEntityId, newMetadataLocation)); } } private void checkMetadataLocation(Table glueTable, TableMetadata base) { - String glueMetadataLocation = glueTable != null ? glueTable.parameters().get(METADATA_LOCATION_PROP) : null; + String glueMetadataLocation = + glueTable != null ? 
glueTable.parameters().get(METADATA_LOCATION_PROP) : null; String baseMetadataLocation = base != null ? base.metadataFileLocation() : null; if (!Objects.equals(baseMetadataLocation, glueMetadataLocation)) { throw new CommitFailedException( @@ -225,11 +249,13 @@ private void checkMetadataLocation(Table glueTable, TableMetadata base) { private Table getGlueTable() { try { - GetTableResponse response = glue.getTable(GetTableRequest.builder() - .catalogId(awsProperties.glueCatalogId()) - .databaseName(databaseName) - .name(tableName) - .build()); + GetTableResponse response = + glue.getTable( + GetTableRequest.builder() + .catalogId(awsProperties.glueCatalogId()) + .databaseName(databaseName) + .name(tableName) + .build()); return response.table(); } catch (EntityNotFoundException e) { return null; @@ -237,7 +263,8 @@ private Table getGlueTable() { } private Map prepareProperties(Table glueTable, String newMetadataLocation) { - Map properties = glueTable != null ? Maps.newHashMap(glueTable.parameters()) : Maps.newHashMap(); + Map properties = + glueTable != null ? Maps.newHashMap(glueTable.parameters()) : Maps.newHashMap(); properties.put(TABLE_TYPE_PROP, ICEBERG_TABLE_TYPE_VALUE.toUpperCase(Locale.ENGLISH)); properties.put(METADATA_LOCATION_PROP, newMetadataLocation); if (currentMetadataLocation() != null && !currentMetadataLocation().isEmpty()) { @@ -251,16 +278,20 @@ private Map prepareProperties(Table glueTable, String newMetadat void persistGlueTable(Table glueTable, Map parameters, TableMetadata metadata) { if (glueTable != null) { LOG.debug("Committing existing Glue table: {}", tableName()); - UpdateTableRequest.Builder updateTableRequest = UpdateTableRequest.builder() - .catalogId(awsProperties.glueCatalogId()) - .databaseName(databaseName) - .skipArchive(awsProperties.glueCatalogSkipArchive()) - .tableInput(TableInput.builder() - .applyMutation(builder -> IcebergToGlueConverter.setTableInputInformation(builder, metadata)) - .name(tableName) - .tableType(GLUE_EXTERNAL_TABLE_TYPE) - .parameters(parameters) - .build()); + UpdateTableRequest.Builder updateTableRequest = + UpdateTableRequest.builder() + .catalogId(awsProperties.glueCatalogId()) + .databaseName(databaseName) + .skipArchive(awsProperties.glueCatalogSkipArchive()) + .tableInput( + TableInput.builder() + .applyMutation( + builder -> + IcebergToGlueConverter.setTableInputInformation(builder, metadata)) + .name(tableName) + .tableType(GLUE_EXTERNAL_TABLE_TYPE) + .parameters(parameters) + .build()); // Use Optimistic locking with table version id while updating table if (!SET_VERSION_ID.isNoop() && lockManager == null) { SET_VERSION_ID.invoke(updateTableRequest, glueTable.versionId()); @@ -269,23 +300,29 @@ void persistGlueTable(Table glueTable, Map parameters, TableMeta glue.updateTable(updateTableRequest.build()); } else { LOG.debug("Committing new Glue table: {}", tableName()); - glue.createTable(CreateTableRequest.builder() - .catalogId(awsProperties.glueCatalogId()) - .databaseName(databaseName) - .tableInput(TableInput.builder() - .applyMutation(builder -> IcebergToGlueConverter.setTableInputInformation(builder, metadata)) - .name(tableName) - .tableType(GLUE_EXTERNAL_TABLE_TYPE) - .parameters(parameters) - .build()) - .build()); + glue.createTable( + CreateTableRequest.builder() + .catalogId(awsProperties.glueCatalogId()) + .databaseName(databaseName) + .tableInput( + TableInput.builder() + .applyMutation( + builder -> + IcebergToGlueConverter.setTableInputInformation(builder, metadata)) + .name(tableName) + 
.tableType(GLUE_EXTERNAL_TABLE_TYPE) + .parameters(parameters) + .build()) + .build()); } } @VisibleForTesting void cleanupMetadataAndUnlock(CommitStatus commitStatus, String metadataLocation) { try { - if (commitStatus == CommitStatus.FAILURE && metadataLocation != null && !metadataLocation.isEmpty()) { + if (commitStatus == CommitStatus.FAILURE + && metadataLocation != null + && !metadataLocation.isEmpty()) { // if anything went wrong, clean up the uncommitted metadata file io().deleteFile(metadataLocation); } diff --git a/aws/src/main/java/org/apache/iceberg/aws/glue/GlueToIcebergConverter.java b/aws/src/main/java/org/apache/iceberg/aws/glue/GlueToIcebergConverter.java index 5078ba34d609..903d2e853dc5 100644 --- a/aws/src/main/java/org/apache/iceberg/aws/glue/GlueToIcebergConverter.java +++ b/aws/src/main/java/org/apache/iceberg/aws/glue/GlueToIcebergConverter.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.aws.glue; import org.apache.iceberg.BaseMetastoreTableOperations; @@ -28,8 +27,7 @@ class GlueToIcebergConverter { - private GlueToIcebergConverter() { - } + private GlueToIcebergConverter() {} static Namespace toNamespace(Database database) { return Namespace.of(database.name()); @@ -41,13 +39,17 @@ static TableIdentifier toTableId(Table table) { /** * Validate the Glue table is Iceberg table by checking its parameters + * * @param table glue table * @param fullName full table name for logging */ static void validateTable(Table table, String fullName) { String tableType = table.parameters().get(BaseMetastoreTableOperations.TABLE_TYPE_PROP); - ValidationException.check(tableType != null && tableType.equalsIgnoreCase( - BaseMetastoreTableOperations.ICEBERG_TABLE_TYPE_VALUE), - "Input Glue table is not an iceberg table: %s (type=%s)", fullName, tableType); + ValidationException.check( + tableType != null + && tableType.equalsIgnoreCase(BaseMetastoreTableOperations.ICEBERG_TABLE_TYPE_VALUE), + "Input Glue table is not an iceberg table: %s (type=%s)", + fullName, + tableType); } } diff --git a/aws/src/main/java/org/apache/iceberg/aws/glue/IcebergToGlueConverter.java b/aws/src/main/java/org/apache/iceberg/aws/glue/IcebergToGlueConverter.java index 765ca501d621..241c0098d628 100644 --- a/aws/src/main/java/org/apache/iceberg/aws/glue/IcebergToGlueConverter.java +++ b/aws/src/main/java/org/apache/iceberg/aws/glue/IcebergToGlueConverter.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
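Both the table listing in GlueCatalog above and GlueToIcebergConverter.validateTable below decide whether a Glue table is an Iceberg table from the same table parameter. A compact sketch of that check with the constant values written out (they correspond to BaseMetastoreTableOperations.TABLE_TYPE_PROP and ICEBERG_TABLE_TYPE_VALUE; commits store the value upper-cased, so the comparison is case-insensitive):

import software.amazon.awssdk.services.glue.model.Table;

static boolean isIcebergTable(Table table) {
  // "table_type" must be present and equal to "iceberg", ignoring case
  return table.parameters() != null
      && "iceberg".equalsIgnoreCase(table.parameters().get("table_type"));
}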
*/ - package org.apache.iceberg.aws.glue; import java.util.Collection; @@ -55,8 +54,7 @@ class IcebergToGlueConverter { private static final Logger LOG = LoggerFactory.getLogger(IcebergToGlueConverter.class); - private IcebergToGlueConverter() { - } + private IcebergToGlueConverter() {} private static final Pattern GLUE_DB_PATTERN = Pattern.compile("^[a-z0-9_]{1,252}$"); private static final Pattern GLUE_TABLE_PATTERN = Pattern.compile("^[a-z0-9_]{1,255}$"); @@ -65,23 +63,27 @@ private IcebergToGlueConverter() { public static final String ICEBERG_FIELD_ID = "iceberg.field.id"; public static final String ICEBERG_FIELD_OPTIONAL = "iceberg.field.optional"; public static final String ICEBERG_FIELD_CURRENT = "iceberg.field.current"; - private static final List ADDITIONAL_LOCATION_PROPERTIES = ImmutableList.of( - TableProperties.WRITE_DATA_LOCATION, - TableProperties.WRITE_METADATA_LOCATION, - TableProperties.OBJECT_STORE_PATH, - TableProperties.WRITE_FOLDER_STORAGE_LOCATION - ); + private static final List ADDITIONAL_LOCATION_PROPERTIES = + ImmutableList.of( + TableProperties.WRITE_DATA_LOCATION, + TableProperties.WRITE_METADATA_LOCATION, + TableProperties.OBJECT_STORE_PATH, + TableProperties.WRITE_FOLDER_STORAGE_LOCATION); // Attempt to set additionalLocations if available on the given AWS SDK version - private static final DynMethods.UnboundMethod SET_ADDITIONAL_LOCATIONS = DynMethods.builder("additionalLocations") - .hiddenImpl("software.amazon.awssdk.services.glue.model.StorageDescriptor$Builder", Collection.class) - .orNoop() - .build(); + private static final DynMethods.UnboundMethod SET_ADDITIONAL_LOCATIONS = + DynMethods.builder("additionalLocations") + .hiddenImpl( + "software.amazon.awssdk.services.glue.model.StorageDescriptor$Builder", + Collection.class) + .orNoop() + .build(); /** - * A Glue database name cannot be longer than 252 characters. - * The only acceptable characters are lowercase letters, numbers, and the underscore character. - * More details: https://docs.aws.amazon.com/athena/latest/ug/glue-best-practices.html + * A Glue database name cannot be longer than 252 characters. The only acceptable characters are + * lowercase letters, numbers, and the underscore character. 
More details: + * https://docs.aws.amazon.com/athena/latest/ug/glue-best-practices.html + * * @param namespace namespace * @return if namespace can be accepted by Glue */ @@ -95,18 +97,22 @@ static boolean isValidNamespace(Namespace namespace) { /** * Validate if an Iceberg namespace is valid in Glue + * * @param namespace namespace * @throws NoSuchNamespaceException if namespace is not valid in Glue */ static void validateNamespace(Namespace namespace) { - ValidationException.check(isValidNamespace(namespace), "Cannot convert namespace %s to Glue database name, " + - "because it must be 1-252 chars of lowercase letters, numbers, underscore", namespace); + ValidationException.check( + isValidNamespace(namespace), + "Cannot convert namespace %s to Glue database name, " + + "because it must be 1-252 chars of lowercase letters, numbers, underscore", + namespace); } /** * Validate and convert Iceberg namespace to Glue database name * - * @param namespace Iceberg namespace + * @param namespace Iceberg namespace * @param skipNameValidation should skip name validation * @return database name */ @@ -121,7 +127,7 @@ static String toDatabaseName(Namespace namespace, boolean skipNameValidation) { /** * Validate and get Glue database name from Iceberg TableIdentifier * - * @param tableIdentifier Iceberg table identifier + * @param tableIdentifier Iceberg table identifier * @param skipNameValidation should skip name validation * @return database name */ @@ -132,32 +138,35 @@ static String getDatabaseName(TableIdentifier tableIdentifier, boolean skipNameV /** * Validate and convert Iceberg name to Glue DatabaseInput * - * @param namespace Iceberg namespace - * @param metadata metadata map + * @param namespace Iceberg namespace + * @param metadata metadata map * @param skipNameValidation should skip name validation * @return Glue DatabaseInput */ - static DatabaseInput toDatabaseInput(Namespace namespace, Map metadata, boolean skipNameValidation) { - DatabaseInput.Builder builder = DatabaseInput.builder().name(toDatabaseName(namespace, - skipNameValidation)); + static DatabaseInput toDatabaseInput( + Namespace namespace, Map metadata, boolean skipNameValidation) { + DatabaseInput.Builder builder = + DatabaseInput.builder().name(toDatabaseName(namespace, skipNameValidation)); Map parameters = Maps.newHashMap(); - metadata.forEach((k, v) -> { - if (GLUE_DB_DESCRIPTION_KEY.equals(k)) { - builder.description(v); - } else if (GLUE_DB_LOCATION_KEY.equals(k)) { - builder.locationUri(v); - } else { - parameters.put(k, v); - } - }); + metadata.forEach( + (k, v) -> { + if (GLUE_DB_DESCRIPTION_KEY.equals(k)) { + builder.description(v); + } else if (GLUE_DB_LOCATION_KEY.equals(k)) { + builder.locationUri(v); + } else { + parameters.put(k, v); + } + }); return builder.parameters(parameters).build(); } /** - * A Glue table name cannot be longer than 255 characters. - * The only acceptable characters are lowercase letters, numbers, and the underscore character. - * More details: https://docs.aws.amazon.com/athena/latest/ug/glue-best-practices.html + * A Glue table name cannot be longer than 255 characters. The only acceptable characters are + * lowercase letters, numbers, and the underscore character. 
More details: + * https://docs.aws.amazon.com/athena/latest/ug/glue-best-practices.html + * * @param tableName table name * @return if a table name can be accepted by Glue */ @@ -167,19 +176,23 @@ static boolean isValidTableName(String tableName) { /** * Validate if a table name is valid in Glue + * * @param tableName table name * @throws NoSuchTableException if table name not valid in Glue */ static void validateTableName(String tableName) { - ValidationException.check(isValidTableName(tableName), "Cannot use %s as Glue table name, " + - "because it must be 1-255 chars of lowercase letters, numbers, underscore", tableName); + ValidationException.check( + isValidTableName(tableName), + "Cannot use %s as Glue table name, " + + "because it must be 1-255 chars of lowercase letters, numbers, underscore", + tableName); } /** * Validate and get Glue table name from Iceberg TableIdentifier * - * @param tableIdentifier table identifier - * @param skipNameValidation should skip name validation + * @param tableIdentifier table identifier + * @param skipNameValidation should skip name validation * @return table name */ static String getTableName(TableIdentifier tableIdentifier, boolean skipNameValidation) { @@ -192,6 +205,7 @@ static String getTableName(TableIdentifier tableIdentifier, boolean skipNameVali /** * Validate Iceberg TableIdentifier is valid in Glue + * * @param tableIdentifier Iceberg table identifier */ static void validateTableIdentifier(TableIdentifier tableIdentifier) { @@ -201,40 +215,43 @@ static void validateTableIdentifier(TableIdentifier tableIdentifier) { /** * Set Glue table input information based on Iceberg table metadata. - *
<p>
- * A best-effort conversion of Iceberg metadata to Glue table is performed to display Iceberg information in Glue, - * but such information is only intended for informational human read access through tools like UI or CLI, - * and should never be used by any query processing engine to infer information like schema, partition spec, etc. - * The source of truth is stored in the actual Iceberg metadata file defined by the metadata_location table property. + * + *
<p>
A best-effort conversion of Iceberg metadata to Glue table is performed to display Iceberg + * information in Glue, but such information is only intended for informational human read access + * through tools like UI or CLI, and should never be used by any query processing engine to infer + * information like schema, partition spec, etc. The source of truth is stored in the actual + * Iceberg metadata file defined by the metadata_location table property. + * * @param tableInputBuilder Glue TableInput builder * @param metadata Iceberg table metadata */ - static void setTableInputInformation(TableInput.Builder tableInputBuilder, TableMetadata metadata) { + static void setTableInputInformation( + TableInput.Builder tableInputBuilder, TableMetadata metadata) { try { StorageDescriptor.Builder storageDescriptor = StorageDescriptor.builder(); if (!SET_ADDITIONAL_LOCATIONS.isNoop()) { - SET_ADDITIONAL_LOCATIONS.invoke(storageDescriptor, + SET_ADDITIONAL_LOCATIONS.invoke( + storageDescriptor, ADDITIONAL_LOCATION_PROPERTIES.stream() - .map(metadata.properties()::get) - .filter(Objects::nonNull) - .collect(Collectors.toSet())); + .map(metadata.properties()::get) + .filter(Objects::nonNull) + .collect(Collectors.toSet())); } - tableInputBuilder - .storageDescriptor(storageDescriptor - .location(metadata.location()) - .columns(toColumns(metadata)) - .build()); + tableInputBuilder.storageDescriptor( + storageDescriptor.location(metadata.location()).columns(toColumns(metadata)).build()); } catch (RuntimeException e) { - LOG.warn("Encountered unexpected exception while converting Iceberg metadata to Glue table information", e); + LOG.warn( + "Encountered unexpected exception while converting Iceberg metadata to Glue table information", + e); } } /** * Converting from an Iceberg type to a type string that can be displayed in Glue. - *
<p>
- * Such conversion is only used for informational purpose, - * DO NOT reference this method for any actual data processing type conversion. + * + *
<p>
Such conversion is only used for informational purpose, DO NOT reference this method for any + * actual data processing type conversion. * * @param type Iceberg type * @return type string @@ -267,16 +284,18 @@ private static String toTypeString(Type type) { return String.format("decimal(%s,%s)", decimalType.precision(), decimalType.scale()); case STRUCT: final Types.StructType structType = type.asStructType(); - final String nameToType = structType.fields().stream() - .map(f -> String.format("%s:%s", f.name(), toTypeString(f.type()))) - .collect(Collectors.joining(",")); + final String nameToType = + structType.fields().stream() + .map(f -> String.format("%s:%s", f.name(), toTypeString(f.type()))) + .collect(Collectors.joining(",")); return String.format("struct<%s>", nameToType); case LIST: final Types.ListType listType = type.asListType(); return String.format("array<%s>", toTypeString(listType.elementType())); case MAP: final Types.MapType mapType = type.asMapType(); - return String.format("map<%s,%s>", toTypeString(mapType.keyType()), toTypeString(mapType.valueType())); + return String.format( + "map<%s,%s>", toTypeString(mapType.keyType()), toTypeString(mapType.valueType())); default: return type.typeId().name().toLowerCase(Locale.ENGLISH); } @@ -301,19 +320,20 @@ private static List toColumns(TableMetadata metadata) { return columns; } - private static void addColumnWithDedupe(List columns, Set dedupe, - NestedField field, boolean isCurrent) { + private static void addColumnWithDedupe( + List columns, Set dedupe, NestedField field, boolean isCurrent) { if (!dedupe.contains(field.name())) { - columns.add(Column.builder() - .name(field.name()) - .type(toTypeString(field.type())) - .comment(field.doc()) - .parameters(ImmutableMap.of( - ICEBERG_FIELD_ID, Integer.toString(field.fieldId()), - ICEBERG_FIELD_OPTIONAL, Boolean.toString(field.isOptional()), - ICEBERG_FIELD_CURRENT, Boolean.toString(isCurrent) - )) - .build()); + columns.add( + Column.builder() + .name(field.name()) + .type(toTypeString(field.type())) + .comment(field.doc()) + .parameters( + ImmutableMap.of( + ICEBERG_FIELD_ID, Integer.toString(field.fieldId()), + ICEBERG_FIELD_OPTIONAL, Boolean.toString(field.isOptional()), + ICEBERG_FIELD_CURRENT, Boolean.toString(isCurrent))) + .build()); dedupe.add(field.name()); } } diff --git a/aws/src/main/java/org/apache/iceberg/aws/lakeformation/LakeFormationAwsClientFactory.java b/aws/src/main/java/org/apache/iceberg/aws/lakeformation/LakeFormationAwsClientFactory.java index e4de99f530a6..bb3c4459feab 100644 --- a/aws/src/main/java/org/apache/iceberg/aws/lakeformation/LakeFormationAwsClientFactory.java +++ b/aws/src/main/java/org/apache/iceberg/aws/lakeformation/LakeFormationAwsClientFactory.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.aws.lakeformation; import java.util.Map; @@ -39,15 +38,15 @@ import software.amazon.awssdk.services.s3.S3Client; /** - * This implementation of AwsClientFactory is used by default if - * {@link org.apache.iceberg.aws.AwsProperties#GLUE_LAKEFORMATION_ENABLED} is set to true. - * It uses the default credential chain to assume role. Third-party engines can further extend this class - * to any custom credential setup. - *
<p>
- * It extends AssumeRoleAwsClientFactory to reuse the assuming-role approach - * for all clients except S3 and KMS. If a table is registered with LakeFormation, the S3/KMS client will use - * LakeFormation vended credentials, otherwise it uses AssumingRole credentials. - * For using LakeFormation credential vending for a third-party query engine, see: + * This implementation of AwsClientFactory is used by default if {@link + * org.apache.iceberg.aws.AwsProperties#GLUE_LAKEFORMATION_ENABLED} is set to true. It uses the + * default credential chain to assume role. Third-party engines can further extend this class to any + * custom credential setup. + * + *
<p>
It extends AssumeRoleAwsClientFactory to reuse the assuming-role approach for all clients + * except S3 and KMS. If a table is registered with LakeFormation, the S3/KMS client will use + * LakeFormation vended credentials, otherwise it uses AssumingRole credentials. For using + * LakeFormation credential vending for a third-party query engine, see: * https://docs.aws.amazon.com/lake-formation/latest/dg/register-query-engine.html */ public class LakeFormationAwsClientFactory extends AssumeRoleAwsClientFactory { @@ -59,15 +58,16 @@ public class LakeFormationAwsClientFactory extends AssumeRoleAwsClientFactory { private String glueCatalogId; private String glueAccountId; - public LakeFormationAwsClientFactory() { - } + public LakeFormationAwsClientFactory() {} @Override public void initialize(Map catalogProperties) { super.initialize(catalogProperties); - Preconditions.checkArgument(tags().stream().anyMatch(t -> t.key().equals(LF_AUTHORIZED_CALLER)), + Preconditions.checkArgument( + tags().stream().anyMatch(t -> t.key().equals(LF_AUTHORIZED_CALLER)), "STS assume role session tag %s must be set using %s to use LakeFormation client factory", - LF_AUTHORIZED_CALLER, AwsProperties.CLIENT_ASSUME_ROLE_TAGS_PREFIX); + LF_AUTHORIZED_CALLER, + AwsProperties.CLIENT_ASSUME_ROLE_TAGS_PREFIX); this.dbName = catalogProperties.get(AwsProperties.LAKE_FORMATION_DB_NAME); this.tableName = catalogProperties.get(AwsProperties.LAKE_FORMATION_TABLE_NAME); this.glueCatalogId = catalogProperties.get(AwsProperties.GLUE_CATALOG_ID); @@ -80,7 +80,8 @@ public S3Client s3() { return S3Client.builder() .httpClientBuilder(AwsClientFactories.configureHttpClientBuilder(httpClientType())) .applyMutation(builder -> AwsClientFactories.configureEndpoint(builder, s3Endpoint())) - .credentialsProvider(new LakeFormationCredentialsProvider(lakeFormation(), buildTableArn())) + .credentialsProvider( + new LakeFormationCredentialsProvider(lakeFormation(), buildTableArn())) .serviceConfiguration(s -> s.useArnRegionEnabled(s3UseArnRegionEnabled()).build()) .region(Region.of(region())) .build(); @@ -94,7 +95,8 @@ public KmsClient kms() { if (isTableRegisteredWithLakeFormation()) { return KmsClient.builder() .httpClientBuilder(AwsClientFactories.configureHttpClientBuilder(httpClientType())) - .credentialsProvider(new LakeFormationCredentialsProvider(lakeFormation(), buildTableArn())) + .credentialsProvider( + new LakeFormationCredentialsProvider(lakeFormation(), buildTableArn())) .region(Region.of(region())) .build(); } else { @@ -103,33 +105,34 @@ public KmsClient kms() { } private boolean isTableRegisteredWithLakeFormation() { - Preconditions.checkArgument(dbName != null && !dbName.isEmpty(), "Database name can not be empty"); - Preconditions.checkArgument(tableName != null && !tableName.isEmpty(), "Table name can not be empty"); - - GetTableResponse response = glue().getTable(GetTableRequest.builder() - .catalogId(glueCatalogId) - .databaseName(dbName) - .name(tableName) - .build()); + Preconditions.checkArgument( + dbName != null && !dbName.isEmpty(), "Database name can not be empty"); + Preconditions.checkArgument( + tableName != null && !tableName.isEmpty(), "Table name can not be empty"); + + GetTableResponse response = + glue() + .getTable( + GetTableRequest.builder() + .catalogId(glueCatalogId) + .databaseName(dbName) + .name(tableName) + .build()); return response.table().isRegisteredWithLakeFormation(); } private String buildTableArn() { - Preconditions.checkArgument(glueAccountId != null && !glueAccountId.isEmpty(), - "%s can 
not be empty", AwsProperties.GLUE_ACCOUNT_ID); + Preconditions.checkArgument( + glueAccountId != null && !glueAccountId.isEmpty(), + "%s can not be empty", + AwsProperties.GLUE_ACCOUNT_ID); String partitionName = PartitionMetadata.of(Region.of(region())).id(); - return String.format("arn:%s:glue:%s:%s:table/%s/%s", - partitionName, - region(), - glueAccountId, - dbName, - tableName); + return String.format( + "arn:%s:glue:%s:%s:table/%s/%s", partitionName, region(), glueAccountId, dbName, tableName); } private LakeFormationClient lakeFormation() { - return LakeFormationClient.builder() - .applyMutation(this::configure) - .build(); + return LakeFormationClient.builder().applyMutation(this::configure).build(); } static class LakeFormationCredentialsProvider implements AwsCredentialsProvider { @@ -146,13 +149,15 @@ public AwsCredentials resolveCredentials() { GetTemporaryGlueTableCredentialsRequest getTemporaryGlueTableCredentialsRequest = GetTemporaryGlueTableCredentialsRequest.builder() .tableArn(tableArn) - // Now only two permission types (COLUMN_PERMISSION and CELL_FILTER_PERMISSION) are supported + // Now only two permission types (COLUMN_PERMISSION and CELL_FILTER_PERMISSION) are + // supported // and Iceberg only supports COLUMN_PERMISSION at this time .supportedPermissionTypes(PermissionType.COLUMN_PERMISSION) .build(); GetTemporaryGlueTableCredentialsResponse response = client.getTemporaryGlueTableCredentials(getTemporaryGlueTableCredentialsRequest); - return AwsSessionCredentials.create(response.accessKeyId(), response.secretAccessKey(), response.sessionToken()); + return AwsSessionCredentials.create( + response.accessKeyId(), response.secretAccessKey(), response.sessionToken()); } } } diff --git a/aws/src/main/java/org/apache/iceberg/aws/s3/BaseS3File.java b/aws/src/main/java/org/apache/iceberg/aws/s3/BaseS3File.java index d5381733a8ee..24b2cf26158a 100644 --- a/aws/src/main/java/org/apache/iceberg/aws/s3/BaseS3File.java +++ b/aws/src/main/java/org/apache/iceberg/aws/s3/BaseS3File.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.aws.s3; import org.apache.iceberg.aws.AwsProperties; @@ -80,9 +79,8 @@ public boolean exists() { protected HeadObjectResponse getObjectMetadata() throws S3Exception { if (metadata == null) { - HeadObjectRequest.Builder requestBuilder = HeadObjectRequest.builder() - .bucket(uri().bucket()) - .key(uri().key()); + HeadObjectRequest.Builder requestBuilder = + HeadObjectRequest.builder().bucket(uri().bucket()).key(uri().key()); S3RequestUtil.configureEncryption(awsProperties, requestBuilder); metadata = client().headObject(requestBuilder.build()); } @@ -94,5 +92,4 @@ protected HeadObjectResponse getObjectMetadata() throws S3Exception { public String toString() { return uri.toString(); } - } diff --git a/aws/src/main/java/org/apache/iceberg/aws/s3/S3FileIO.java b/aws/src/main/java/org/apache/iceberg/aws/s3/S3FileIO.java index 5d5d88eafbe9..d7a2fb1b0fe1 100644 --- a/aws/src/main/java/org/apache/iceberg/aws/s3/S3FileIO.java +++ b/aws/src/main/java/org/apache/iceberg/aws/s3/S3FileIO.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.aws.s3; import java.util.Collection; @@ -66,14 +65,16 @@ /** * FileIO implementation backed by S3. - *
<p>
- * Locations used must follow the conventions for S3 URIs (e.g. s3://bucket/path...). - * URIs with schemes s3a, s3n, https are also treated as s3 file paths. - * Using this FileIO with other schemes will result in {@link org.apache.iceberg.exceptions.ValidationException}. + * + * <p>
Locations used must follow the conventions for S3 URIs (e.g. s3://bucket/path...). URIs with + * schemes s3a, s3n, https are also treated as s3 file paths. Using this FileIO with other schemes + * will result in {@link org.apache.iceberg.exceptions.ValidationException}. */ -public class S3FileIO implements FileIO, SupportsBulkOperations, SupportsPrefixOperations, CredentialSupplier { +public class S3FileIO + implements FileIO, SupportsBulkOperations, SupportsPrefixOperations, CredentialSupplier { private static final Logger LOG = LoggerFactory.getLogger(S3FileIO.class); - private static final String DEFAULT_METRICS_IMPL = "org.apache.iceberg.hadoop.HadoopMetricsContext"; + private static final String DEFAULT_METRICS_IMPL = + "org.apache.iceberg.hadoop.HadoopMetricsContext"; private static volatile ExecutorService executorService; private String credential = null; @@ -86,16 +87,16 @@ public class S3FileIO implements FileIO, SupportsBulkOperations, SupportsPrefixO /** * No-arg constructor to load the FileIO dynamically. - *
<p>
- * All fields are initialized by calling {@link S3FileIO#initialize(Map)} later. + * + * <p>
All fields are initialized by calling {@link S3FileIO#initialize(Map)} later. */ - public S3FileIO() { - } + public S3FileIO() {} /** * Constructor with custom s3 supplier and default AWS properties. - * <p>
- * Calling {@link S3FileIO#initialize(Map)} will overwrite information set in this constructor. + * + * <p>
Calling {@link S3FileIO#initialize(Map)} will overwrite information set in this constructor. + * * @param s3 s3 supplier */ public S3FileIO(SerializableSupplier s3) { @@ -104,8 +105,9 @@ public S3FileIO(SerializableSupplier s3) { /** * Constructor with custom s3 supplier and AWS properties. - * <p>
- * Calling {@link S3FileIO#initialize(Map)} will overwrite information set in this constructor. + * + * <p>
Calling {@link S3FileIO#initialize(Map)} will overwrite information set in this constructor. + * * @param s3 s3 supplier * @param awsProperties aws properties */ @@ -157,9 +159,9 @@ public Map properties() { /** * Deletes the given paths in a batched manner. - * <p>
- * The paths are grouped by bucket, and deletion is triggered when we either reach the configured batch size - * or have a final remainder batch for each bucket. + * + * <p>
The paths are grouped by bucket, and deletion is triggered when we either reach the + * configured batch size or have a final remainder batch for each bucket. * * @param paths paths to delete */ @@ -170,8 +172,13 @@ public void deleteFiles(Iterable paths) throws BulkDeletionFailureExcept .noRetry() .executeWith(executorService()) .suppressFailureWhenFinished() - .onFailure((path, exc) -> LOG.warn("Failed to add delete tags: {} to {}", - awsProperties.s3DeleteTags(), path, exc)) + .onFailure( + (path, exc) -> + LOG.warn( + "Failed to add delete tags: {} to {}", + awsProperties.s3DeleteTags(), + path, + exc)) .run(path -> tagFileToDelete(path, awsProperties.s3DeleteTags())); } @@ -179,7 +186,8 @@ public void deleteFiles(Iterable paths) throws BulkDeletionFailureExcept return; } - SetMultimap bucketToObjects = Multimaps.newSetMultimap(Maps.newHashMap(), Sets::newHashSet); + SetMultimap bucketToObjects = + Multimaps.newSetMultimap(Maps.newHashMap(), Sets::newHashSet); int numberOfFailedDeletions = 0; for (String path : paths) { S3URI location = new S3URI(path, awsProperties.s3BucketToAccessPointMapping()); @@ -189,18 +197,21 @@ public void deleteFiles(Iterable paths) throws BulkDeletionFailureExcept if (objectsInBucket.size() == awsProperties.s3FileIoDeleteBatchSize()) { List failedDeletionsForBatch = deleteObjectsInBucket(bucket, objectsInBucket); numberOfFailedDeletions += failedDeletionsForBatch.size(); - failedDeletionsForBatch.forEach(failedPath -> LOG.warn("Failed to delete object at path {}", failedPath)); + failedDeletionsForBatch.forEach( + failedPath -> LOG.warn("Failed to delete object at path {}", failedPath)); bucketToObjects.removeAll(bucket); } bucketToObjects.get(bucket).add(objectKey); } // Delete the remainder - for (Map.Entry> bucketToObjectsEntry : bucketToObjects.asMap().entrySet()) { + for (Map.Entry> bucketToObjectsEntry : + bucketToObjects.asMap().entrySet()) { final String bucket = bucketToObjectsEntry.getKey(); final Collection objects = bucketToObjectsEntry.getValue(); List failedDeletions = deleteObjectsInBucket(bucket, objects); - failedDeletions.forEach(failedPath -> LOG.warn("Failed to delete object at path {}", failedPath)); + failedDeletions.forEach( + failedPath -> LOG.warn("Failed to delete object at path {}", failedPath)); numberOfFailedDeletions += failedDeletions.size(); } @@ -213,12 +224,10 @@ private void tagFileToDelete(String path, Set deleteTags) throws S3Exceptio S3URI location = new S3URI(path, awsProperties.s3BucketToAccessPointMapping()); String bucket = location.bucket(); String objectKey = location.key(); - GetObjectTaggingRequest getObjectTaggingRequest = GetObjectTaggingRequest.builder() - .bucket(bucket) - .key(objectKey) - .build(); - GetObjectTaggingResponse getObjectTaggingResponse = client() - .getObjectTagging(getObjectTaggingRequest); + GetObjectTaggingRequest getObjectTaggingRequest = + GetObjectTaggingRequest.builder().bucket(bucket).key(objectKey).build(); + GetObjectTaggingResponse getObjectTaggingResponse = + client().getObjectTagging(getObjectTaggingRequest); // Get existing tags, if any and then add the delete tags Set tags = Sets.newHashSet(); if (getObjectTaggingResponse.hasTagSet()) { @@ -226,28 +235,29 @@ private void tagFileToDelete(String path, Set deleteTags) throws S3Exceptio } tags.addAll(deleteTags); - PutObjectTaggingRequest putObjectTaggingRequest = PutObjectTaggingRequest.builder() - .bucket(bucket) - .key(objectKey) - .tagging(Tagging.builder().tagSet(tags).build()) - .build(); + PutObjectTaggingRequest 
putObjectTaggingRequest = + PutObjectTaggingRequest.builder() + .bucket(bucket) + .key(objectKey) + .tagging(Tagging.builder().tagSet(tags).build()) + .build(); client().putObjectTagging(putObjectTaggingRequest); } private List deleteObjectsInBucket(String bucket, Collection objects) { if (!objects.isEmpty()) { - List objectIds = objects - .stream() - .map(objectKey -> ObjectIdentifier.builder().key(objectKey).build()) - .collect(Collectors.toList()); - DeleteObjectsRequest deleteObjectsRequest = DeleteObjectsRequest.builder() - .bucket(bucket) - .delete(Delete.builder().objects(objectIds).build()) - .build(); + List objectIds = + objects.stream() + .map(objectKey -> ObjectIdentifier.builder().key(objectKey).build()) + .collect(Collectors.toList()); + DeleteObjectsRequest deleteObjectsRequest = + DeleteObjectsRequest.builder() + .bucket(bucket) + .delete(Delete.builder().objects(objectIds).build()) + .build(); DeleteObjectsResponse response = client().deleteObjects(deleteObjectsRequest); if (response.hasErrors()) { - return response.errors() - .stream() + return response.errors().stream() .map(error -> String.format("s3://%s/%s", bucket, error.key())) .collect(Collectors.toList()); } @@ -259,22 +269,26 @@ private List deleteObjectsInBucket(String bucket, Collection obj @Override public Iterable listPrefix(String prefix) { S3URI s3uri = new S3URI(prefix, awsProperties.s3BucketToAccessPointMapping()); - ListObjectsV2Request request = ListObjectsV2Request.builder().bucket(s3uri.bucket()).prefix(s3uri.key()).build(); - - return () -> client().listObjectsV2Paginator(request).stream() - .flatMap(r -> r.contents().stream()) - .map(o -> new FileInfo( - String.format("%s://%s/%s", s3uri.scheme(), s3uri.bucket(), o.key()), - o.size(), o.lastModified().toEpochMilli())).iterator(); + ListObjectsV2Request request = + ListObjectsV2Request.builder().bucket(s3uri.bucket()).prefix(s3uri.key()).build(); + + return () -> + client().listObjectsV2Paginator(request).stream() + .flatMap(r -> r.contents().stream()) + .map( + o -> + new FileInfo( + String.format("%s://%s/%s", s3uri.scheme(), s3uri.bucket(), o.key()), + o.size(), + o.lastModified().toEpochMilli())) + .iterator(); } /** - * This method provides a "best-effort" to delete all objects under the - * given prefix. + * This method provides a "best-effort" to delete all objects under the given prefix. * - * Bulk delete operations are used and no reattempt is made for deletes if - * they fail, but will log any individual objects that are not deleted as part - * of the bulk operation. + *
<p>
Bulk delete operations are used and no reattempt is made for deletes if they fail, but will + * log any individual objects that are not deleted as part of the bulk operation. * * @param prefix prefix to delete */ @@ -298,8 +312,9 @@ private ExecutorService executorService() { if (executorService == null) { synchronized (S3FileIO.class) { if (executorService == null) { - executorService = ThreadPools.newWorkerPool( - "iceberg-s3fileio-delete", awsProperties.s3FileIoDeleteThreads()); + executorService = + ThreadPools.newWorkerPool( + "iceberg-s3fileio-delete", awsProperties.s3FileIoDeleteThreads()); } } } @@ -337,7 +352,10 @@ public void initialize(Map props) { context.initialize(props); this.metrics = context; } catch (NoClassDefFoundError | NoSuchMethodException | ClassCastException e) { - LOG.warn("Unable to load metrics class: '{}', falling back to null metrics", DEFAULT_METRICS_IMPL, e); + LOG.warn( + "Unable to load metrics class: '{}', falling back to null metrics", + DEFAULT_METRICS_IMPL, + e); } } diff --git a/aws/src/main/java/org/apache/iceberg/aws/s3/S3InputFile.java b/aws/src/main/java/org/apache/iceberg/aws/s3/S3InputFile.java index e4862e0dcfbd..aeef04a6b98a 100644 --- a/aws/src/main/java/org/apache/iceberg/aws/s3/S3InputFile.java +++ b/aws/src/main/java/org/apache/iceberg/aws/s3/S3InputFile.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.aws.s3; import org.apache.iceberg.aws.AwsProperties; @@ -31,19 +30,36 @@ public class S3InputFile extends BaseS3File implements InputFile, NativelyEncryp private NativeFileCryptoParameters nativeDecryptionParameters; private Long length; - public static S3InputFile fromLocation(String location, S3Client client, AwsProperties awsProperties, - MetricsContext metrics) { - return new S3InputFile(client, new S3URI(location, awsProperties.s3BucketToAccessPointMapping()), null, - awsProperties, metrics); + public static S3InputFile fromLocation( + String location, S3Client client, AwsProperties awsProperties, MetricsContext metrics) { + return new S3InputFile( + client, + new S3URI(location, awsProperties.s3BucketToAccessPointMapping()), + null, + awsProperties, + metrics); } - public static S3InputFile fromLocation(String location, long length, S3Client client, AwsProperties awsProperties, - MetricsContext metrics) { - return new S3InputFile(client, new S3URI(location, awsProperties.s3BucketToAccessPointMapping()), - length > 0 ? length : null, awsProperties, metrics); + public static S3InputFile fromLocation( + String location, + long length, + S3Client client, + AwsProperties awsProperties, + MetricsContext metrics) { + return new S3InputFile( + client, + new S3URI(location, awsProperties.s3BucketToAccessPointMapping()), + length > 0 ? 
length : null, + awsProperties, + metrics); } - S3InputFile(S3Client client, S3URI uri, Long length, AwsProperties awsProperties, MetricsContext metrics) { + S3InputFile( + S3Client client, + S3URI uri, + Long length, + AwsProperties awsProperties, + MetricsContext metrics) { super(client, uri, awsProperties, metrics); this.length = length; } diff --git a/aws/src/main/java/org/apache/iceberg/aws/s3/S3InputStream.java b/aws/src/main/java/org/apache/iceberg/aws/s3/S3InputStream.java index 92d0f3f3eae9..c0847d217bdb 100644 --- a/aws/src/main/java/org/apache/iceberg/aws/s3/S3InputStream.java +++ b/aws/src/main/java/org/apache/iceberg/aws/s3/S3InputStream.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.aws.s3; import java.io.IOException; @@ -67,7 +66,8 @@ class S3InputStream extends SeekableInputStream implements RangeReadable { this.awsProperties = awsProperties; this.readBytes = metrics.counter(FileIOMetricsContext.READ_BYTES, Long.class, Unit.BYTES); - this.readOperations = metrics.counter(FileIOMetricsContext.READ_OPERATIONS, Integer.class, Unit.COUNT); + this.readOperations = + metrics.counter(FileIOMetricsContext.READ_OPERATIONS, Integer.class, Unit.COUNT); this.createStack = Thread.currentThread().getStackTrace(); } @@ -132,10 +132,8 @@ public int readTail(byte[] buffer, int offset, int length) throws IOException { } private InputStream readRange(String range) { - GetObjectRequest.Builder requestBuilder = GetObjectRequest.builder() - .bucket(location.bucket()) - .key(location.key()) - .range(range); + GetObjectRequest.Builder requestBuilder = + GetObjectRequest.builder().bucket(location.bucket()).key(location.key()).range(range); S3RequestUtil.configureEncryption(awsProperties, requestBuilder); @@ -178,10 +176,11 @@ private void positionStream() throws IOException { } private void openStream() throws IOException { - GetObjectRequest.Builder requestBuilder = GetObjectRequest.builder() - .bucket(location.bucket()) - .key(location.key()) - .range(String.format("bytes=%s-", pos)); + GetObjectRequest.Builder requestBuilder = + GetObjectRequest.builder() + .bucket(location.bucket()) + .key(location.key()) + .range(String.format("bytes=%s-", pos)); S3RequestUtil.configureEncryption(awsProperties, requestBuilder); @@ -205,8 +204,7 @@ protected void finalize() throws Throwable { super.finalize(); if (!closed) { close(); // releasing resources is more important than printing the warning - String trace = Joiner.on("\n\t").join( - Arrays.copyOfRange(createStack, 1, createStack.length)); + String trace = Joiner.on("\n\t").join(Arrays.copyOfRange(createStack, 1, createStack.length)); LOG.warn("Unclosed input stream created by:\n\t{}", trace); } } diff --git a/aws/src/main/java/org/apache/iceberg/aws/s3/S3OutputFile.java b/aws/src/main/java/org/apache/iceberg/aws/s3/S3OutputFile.java index f2af425f1c08..534596131576 100644 --- a/aws/src/main/java/org/apache/iceberg/aws/s3/S3OutputFile.java +++ b/aws/src/main/java/org/apache/iceberg/aws/s3/S3OutputFile.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.aws.s3; import java.io.IOException; @@ -34,10 +33,13 @@ public class S3OutputFile extends BaseS3File implements OutputFile, NativelyEncryptedFile { private NativeFileCryptoParameters nativeEncryptionParameters; - public static S3OutputFile fromLocation(String location, S3Client client, AwsProperties awsProperties, - MetricsContext metrics) { - return new S3OutputFile(client, new S3URI(location, awsProperties.s3BucketToAccessPointMapping()), - awsProperties, metrics); + public static S3OutputFile fromLocation( + String location, S3Client client, AwsProperties awsProperties, MetricsContext metrics) { + return new S3OutputFile( + client, + new S3URI(location, awsProperties.s3BucketToAccessPointMapping()), + awsProperties, + metrics); } S3OutputFile(S3Client client, S3URI uri, AwsProperties awsProperties, MetricsContext metrics) { @@ -45,8 +47,8 @@ public static S3OutputFile fromLocation(String location, S3Client client, AwsPro } /** - * Create an output stream for the specified location if the target object - * does not exist in S3 at the time of invocation. + * Create an output stream for the specified location if the target object does not exist in S3 at + * the time of invocation. * * @return output stream */ diff --git a/aws/src/main/java/org/apache/iceberg/aws/s3/S3OutputStream.java b/aws/src/main/java/org/apache/iceberg/aws/s3/S3OutputStream.java index b307d9081fae..5ec346ba29af 100644 --- a/aws/src/main/java/org/apache/iceberg/aws/s3/S3OutputStream.java +++ b/aws/src/main/java/org/apache/iceberg/aws/s3/S3OutputStream.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.aws.s3; import java.io.BufferedInputStream; @@ -112,13 +111,15 @@ class S3OutputStream extends PositionOutputStream { if (executorService == null) { synchronized (S3OutputStream.class) { if (executorService == null) { - executorService = MoreExecutors.getExitingExecutorService( - (ThreadPoolExecutor) Executors.newFixedThreadPool( - awsProperties.s3FileIoMultipartUploadThreads(), - new ThreadFactoryBuilder() - .setDaemon(true) - .setNameFormat("iceberg-s3fileio-upload-%d") - .build())); + executorService = + MoreExecutors.getExitingExecutorService( + (ThreadPoolExecutor) + Executors.newFixedThreadPool( + awsProperties.s3FileIoMultipartUploadThreads(), + new ThreadFactoryBuilder() + .setDaemon(true) + .setNameFormat("iceberg-s3fileio-upload-%d") + .build())); } } } @@ -131,17 +132,21 @@ class S3OutputStream extends PositionOutputStream { this.createStack = Thread.currentThread().getStackTrace(); this.multiPartSize = awsProperties.s3FileIoMultiPartSize(); - this.multiPartThresholdSize = (int) (multiPartSize * awsProperties.s3FileIOMultipartThresholdFactor()); + this.multiPartThresholdSize = + (int) (multiPartSize * awsProperties.s3FileIOMultipartThresholdFactor()); this.stagingDirectory = new File(awsProperties.s3fileIoStagingDirectory()); this.isChecksumEnabled = awsProperties.isS3ChecksumEnabled(); try { - this.completeMessageDigest = isChecksumEnabled ? MessageDigest.getInstance(digestAlgorithm) : null; + this.completeMessageDigest = + isChecksumEnabled ? 
MessageDigest.getInstance(digestAlgorithm) : null; } catch (NoSuchAlgorithmException e) { - throw new RuntimeException("Failed to create message digest needed for s3 checksum checks", e); + throw new RuntimeException( + "Failed to create message digest needed for s3 checksum checks", e); } this.writeBytes = metrics.counter(FileIOMetricsContext.WRITE_BYTES, Long.class, Unit.BYTES); - this.writeOperations = metrics.counter(FileIOMetricsContext.WRITE_OPERATIONS, Integer.class, Unit.COUNT); + this.writeOperations = + metrics.counter(FileIOMetricsContext.WRITE_OPERATIONS, Integer.class, Unit.COUNT); newStream(); } @@ -215,9 +220,11 @@ private void newStream() throws IOException { currentStagingFile = File.createTempFile("s3fileio-", ".tmp", stagingDirectory); currentStagingFile.deleteOnExit(); try { - currentPartMessageDigest = isChecksumEnabled ? MessageDigest.getInstance(digestAlgorithm) : null; + currentPartMessageDigest = + isChecksumEnabled ? MessageDigest.getInstance(digestAlgorithm) : null; } catch (NoSuchAlgorithmException e) { - throw new RuntimeException("Failed to create message digest needed for s3 checksum checks.", e); + throw new RuntimeException( + "Failed to create message digest needed for s3 checksum checks.", e); } stagingFiles.add(new FileAndDigest(currentStagingFile, currentPartMessageDigest)); @@ -227,16 +234,24 @@ private void newStream() throws IOException { // if switched over to multipart threshold already, no need to update complete message digest if (multipartUploadId != null) { - digestOutputStream = new DigestOutputStream(new BufferedOutputStream( - new FileOutputStream(currentStagingFile)), currentPartMessageDigest); + digestOutputStream = + new DigestOutputStream( + new BufferedOutputStream(new FileOutputStream(currentStagingFile)), + currentPartMessageDigest); } else { - digestOutputStream = new DigestOutputStream(new DigestOutputStream(new BufferedOutputStream( - new FileOutputStream(currentStagingFile)), currentPartMessageDigest), completeMessageDigest); + digestOutputStream = + new DigestOutputStream( + new DigestOutputStream( + new BufferedOutputStream(new FileOutputStream(currentStagingFile)), + currentPartMessageDigest), + completeMessageDigest); } stream = new CountingOutputStream(digestOutputStream); } else { - stream = new CountingOutputStream(new BufferedOutputStream(new FileOutputStream(currentStagingFile))); + stream = + new CountingOutputStream( + new BufferedOutputStream(new FileOutputStream(currentStagingFile))); } } @@ -259,9 +274,8 @@ public void close() throws IOException { } private void initializeMultiPartUpload() { - CreateMultipartUploadRequest.Builder requestBuilder = CreateMultipartUploadRequest.builder() - .bucket(location.bucket()) - .key(location.key()); + CreateMultipartUploadRequest.Builder requestBuilder = + CreateMultipartUploadRequest.builder().bucket(location.bucket()).key(location.key()); if (writeTags != null && !writeTags.isEmpty()) { requestBuilder.tagging(Tagging.builder().tagSet(writeTags).build()); } @@ -284,67 +298,78 @@ private void uploadParts() { .filter(f -> closed || !f.file().equals(currentStagingFile)) // do not upload any files that have already been processed .filter(Predicates.not(f -> multiPartMap.containsKey(f.file()))) - .forEach(fileAndDigest -> { - File f = fileAndDigest.file(); - UploadPartRequest.Builder requestBuilder = UploadPartRequest.builder() - .bucket(location.bucket()) - .key(location.key()) - .uploadId(multipartUploadId) - .partNumber(stagingFiles.indexOf(fileAndDigest) + 1) - 
.contentLength(f.length()); - - if (fileAndDigest.hasDigest()) { - requestBuilder.contentMD5(BinaryUtils.toBase64(fileAndDigest.digest())); - } - - S3RequestUtil.configureEncryption(awsProperties, requestBuilder); - - UploadPartRequest uploadRequest = requestBuilder.build(); - - CompletableFuture future = CompletableFuture.supplyAsync( - () -> { - UploadPartResponse response = s3.uploadPart(uploadRequest, RequestBody.fromFile(f)); - return CompletedPart.builder().eTag(response.eTag()).partNumber(uploadRequest.partNumber()).build(); - }, - executorService - ).whenComplete((result, thrown) -> { - try { - Files.deleteIfExists(f.toPath()); - } catch (IOException e) { - LOG.warn("Failed to delete staging file: {}", f, e); - } - - if (thrown != null) { - LOG.error("Failed to upload part: {}", uploadRequest, thrown); - abortUpload(); - } - }); - - multiPartMap.put(f, future); - }); + .forEach( + fileAndDigest -> { + File f = fileAndDigest.file(); + UploadPartRequest.Builder requestBuilder = + UploadPartRequest.builder() + .bucket(location.bucket()) + .key(location.key()) + .uploadId(multipartUploadId) + .partNumber(stagingFiles.indexOf(fileAndDigest) + 1) + .contentLength(f.length()); + + if (fileAndDigest.hasDigest()) { + requestBuilder.contentMD5(BinaryUtils.toBase64(fileAndDigest.digest())); + } + + S3RequestUtil.configureEncryption(awsProperties, requestBuilder); + + UploadPartRequest uploadRequest = requestBuilder.build(); + + CompletableFuture future = + CompletableFuture.supplyAsync( + () -> { + UploadPartResponse response = + s3.uploadPart(uploadRequest, RequestBody.fromFile(f)); + return CompletedPart.builder() + .eTag(response.eTag()) + .partNumber(uploadRequest.partNumber()) + .build(); + }, + executorService) + .whenComplete( + (result, thrown) -> { + try { + Files.deleteIfExists(f.toPath()); + } catch (IOException e) { + LOG.warn("Failed to delete staging file: {}", f, e); + } + + if (thrown != null) { + LOG.error("Failed to upload part: {}", uploadRequest, thrown); + abortUpload(); + } + }); + + multiPartMap.put(f, future); + }); } private void completeMultiPartUpload() { Preconditions.checkState(closed, "Complete upload called on open stream: " + location); List completedParts = - multiPartMap.values() - .stream() + multiPartMap.values().stream() .map(CompletableFuture::join) .sorted(Comparator.comparing(CompletedPart::partNumber)) .collect(Collectors.toList()); - CompleteMultipartUploadRequest request = CompleteMultipartUploadRequest.builder() - .bucket(location.bucket()).key(location.key()) - .uploadId(multipartUploadId) - .multipartUpload(CompletedMultipartUpload.builder().parts(completedParts).build()).build(); + CompleteMultipartUploadRequest request = + CompleteMultipartUploadRequest.builder() + .bucket(location.bucket()) + .key(location.key()) + .uploadId(multipartUploadId) + .multipartUpload(CompletedMultipartUpload.builder().parts(completedParts).build()) + .build(); Tasks.foreach(request) .noRetry() - .onFailure((r, thrown) -> { - LOG.error("Failed to complete multipart upload request: {}", r, thrown); - abortUpload(); - }) + .onFailure( + (r, thrown) -> { + LOG.error("Failed to complete multipart upload request: {}", r, thrown); + abortUpload(); + }) .throwFailureWhenFinished() .run(s3::completeMultipartUpload); } @@ -352,8 +377,12 @@ private void completeMultiPartUpload() { private void abortUpload() { if (multipartUploadId != null) { try { - s3.abortMultipartUpload(AbortMultipartUploadRequest.builder() - 
.bucket(location.bucket()).key(location.key()).uploadId(multipartUploadId).build()); + s3.abortMultipartUpload( + AbortMultipartUploadRequest.builder() + .bucket(location.bucket()) + .key(location.key()) + .uploadId(multipartUploadId) + .build()); } finally { cleanUpStagingFiles(); } @@ -369,16 +398,19 @@ private void cleanUpStagingFiles() { private void completeUploads() { if (multipartUploadId == null) { - long contentLength = stagingFiles.stream().map(FileAndDigest::file).mapToLong(File::length).sum(); - ContentStreamProvider contentProvider = () -> new BufferedInputStream(stagingFiles.stream() - .map(FileAndDigest::file) - .map(S3OutputStream::uncheckedInputStream) - .reduce(SequenceInputStream::new) - .orElseGet(() -> new ByteArrayInputStream(new byte[0]))); - - PutObjectRequest.Builder requestBuilder = PutObjectRequest.builder() - .bucket(location.bucket()) - .key(location.key()); + long contentLength = + stagingFiles.stream().map(FileAndDigest::file).mapToLong(File::length).sum(); + ContentStreamProvider contentProvider = + () -> + new BufferedInputStream( + stagingFiles.stream() + .map(FileAndDigest::file) + .map(S3OutputStream::uncheckedInputStream) + .reduce(SequenceInputStream::new) + .orElseGet(() -> new ByteArrayInputStream(new byte[0]))); + + PutObjectRequest.Builder requestBuilder = + PutObjectRequest.builder().bucket(location.bucket()).key(location.key()); if (writeTags != null && !writeTags.isEmpty()) { requestBuilder.tagging(Tagging.builder().tagSet(writeTags).build()); @@ -393,7 +425,8 @@ private void completeUploads() { s3.putObject( requestBuilder.build(), - RequestBody.fromContentProvider(contentProvider, contentLength, Mimetype.MIMETYPE_OCTET_STREAM)); + RequestBody.fromContentProvider( + contentProvider, contentLength, Mimetype.MIMETYPE_OCTET_STREAM)); } else { uploadParts(); completeMultiPartUpload(); @@ -410,19 +443,21 @@ private static InputStream uncheckedInputStream(File file) { private void createStagingDirectoryIfNotExists() throws IOException, SecurityException { if (!stagingDirectory.exists()) { - LOG.info("Staging directory does not exist, trying to create one: {}", + LOG.info( + "Staging directory does not exist, trying to create one: {}", stagingDirectory.getAbsolutePath()); boolean createdStagingDirectory = stagingDirectory.mkdirs(); if (createdStagingDirectory) { LOG.info("Successfully created staging directory: {}", stagingDirectory.getAbsolutePath()); } else { if (stagingDirectory.exists()) { - LOG.info("Successfully created staging directory by another process: {}", + LOG.info( + "Successfully created staging directory by another process: {}", stagingDirectory.getAbsolutePath()); } else { throw new IOException( - "Failed to create staging directory due to some unknown reason: " + stagingDirectory - .getAbsolutePath()); + "Failed to create staging directory due to some unknown reason: " + + stagingDirectory.getAbsolutePath()); } } } @@ -434,8 +469,7 @@ protected void finalize() throws Throwable { super.finalize(); if (!closed) { close(); // releasing resources is more important than printing the warning - String trace = Joiner.on("\n\t").join( - Arrays.copyOfRange(createStack, 1, createStack.length)); + String trace = Joiner.on("\n\t").join(Arrays.copyOfRange(createStack, 1, createStack.length)); LOG.warn("Unclosed output stream created by:\n\t{}", trace); } } diff --git a/aws/src/main/java/org/apache/iceberg/aws/s3/S3RequestUtil.java b/aws/src/main/java/org/apache/iceberg/aws/s3/S3RequestUtil.java index 1a32ac42c840..1adbda66c0f5 100644 --- 
a/aws/src/main/java/org/apache/iceberg/aws/s3/S3RequestUtil.java +++ b/aws/src/main/java/org/apache/iceberg/aws/s3/S3RequestUtil.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.aws.s3; import java.util.Locale; @@ -34,35 +33,65 @@ @SuppressWarnings("UnnecessaryLambda") public class S3RequestUtil { - private static final Function NULL_SSE_SETTER = sse -> null; + private static final Function NULL_SSE_SETTER = + sse -> null; private static final Function NULL_STRING_SETTER = s -> null; - private S3RequestUtil() { - } + private S3RequestUtil() {} - static void configureEncryption(AwsProperties awsProperties, PutObjectRequest.Builder requestBuilder) { - configureEncryption(awsProperties, requestBuilder::serverSideEncryption, requestBuilder::ssekmsKeyId, - requestBuilder::sseCustomerAlgorithm, requestBuilder::sseCustomerKey, requestBuilder::sseCustomerKeyMD5); + static void configureEncryption( + AwsProperties awsProperties, PutObjectRequest.Builder requestBuilder) { + configureEncryption( + awsProperties, + requestBuilder::serverSideEncryption, + requestBuilder::ssekmsKeyId, + requestBuilder::sseCustomerAlgorithm, + requestBuilder::sseCustomerKey, + requestBuilder::sseCustomerKeyMD5); } - static void configureEncryption(AwsProperties awsProperties, CreateMultipartUploadRequest.Builder requestBuilder) { - configureEncryption(awsProperties, requestBuilder::serverSideEncryption, requestBuilder::ssekmsKeyId, - requestBuilder::sseCustomerAlgorithm, requestBuilder::sseCustomerKey, requestBuilder::sseCustomerKeyMD5); + static void configureEncryption( + AwsProperties awsProperties, CreateMultipartUploadRequest.Builder requestBuilder) { + configureEncryption( + awsProperties, + requestBuilder::serverSideEncryption, + requestBuilder::ssekmsKeyId, + requestBuilder::sseCustomerAlgorithm, + requestBuilder::sseCustomerKey, + requestBuilder::sseCustomerKeyMD5); } - static void configureEncryption(AwsProperties awsProperties, UploadPartRequest.Builder requestBuilder) { - configureEncryption(awsProperties, NULL_SSE_SETTER, NULL_STRING_SETTER, - requestBuilder::sseCustomerAlgorithm, requestBuilder::sseCustomerKey, requestBuilder::sseCustomerKeyMD5); + static void configureEncryption( + AwsProperties awsProperties, UploadPartRequest.Builder requestBuilder) { + configureEncryption( + awsProperties, + NULL_SSE_SETTER, + NULL_STRING_SETTER, + requestBuilder::sseCustomerAlgorithm, + requestBuilder::sseCustomerKey, + requestBuilder::sseCustomerKeyMD5); } - static void configureEncryption(AwsProperties awsProperties, GetObjectRequest.Builder requestBuilder) { - configureEncryption(awsProperties, NULL_SSE_SETTER, NULL_STRING_SETTER, - requestBuilder::sseCustomerAlgorithm, requestBuilder::sseCustomerKey, requestBuilder::sseCustomerKeyMD5); + static void configureEncryption( + AwsProperties awsProperties, GetObjectRequest.Builder requestBuilder) { + configureEncryption( + awsProperties, + NULL_SSE_SETTER, + NULL_STRING_SETTER, + requestBuilder::sseCustomerAlgorithm, + requestBuilder::sseCustomerKey, + requestBuilder::sseCustomerKeyMD5); } - static void configureEncryption(AwsProperties awsProperties, HeadObjectRequest.Builder requestBuilder) { - configureEncryption(awsProperties, NULL_SSE_SETTER, NULL_STRING_SETTER, - requestBuilder::sseCustomerAlgorithm, requestBuilder::sseCustomerKey, requestBuilder::sseCustomerKeyMD5); + static void configureEncryption( + AwsProperties awsProperties, HeadObjectRequest.Builder requestBuilder) { + 
configureEncryption( + awsProperties, + NULL_SSE_SETTER, + NULL_STRING_SETTER, + requestBuilder::sseCustomerAlgorithm, + requestBuilder::sseCustomerKey, + requestBuilder::sseCustomerKeyMD5); } @SuppressWarnings("ReturnValueIgnored") @@ -100,18 +129,19 @@ static void configureEncryption( } } - static void configurePermission(AwsProperties awsProperties, PutObjectRequest.Builder requestBuilder) { + static void configurePermission( + AwsProperties awsProperties, PutObjectRequest.Builder requestBuilder) { configurePermission(awsProperties, requestBuilder::acl); } - static void configurePermission(AwsProperties awsProperties, CreateMultipartUploadRequest.Builder requestBuilder) { + static void configurePermission( + AwsProperties awsProperties, CreateMultipartUploadRequest.Builder requestBuilder) { configurePermission(awsProperties, requestBuilder::acl); } @SuppressWarnings("ReturnValueIgnored") static void configurePermission( - AwsProperties awsProperties, - Function aclSetter) { + AwsProperties awsProperties, Function aclSetter) { aclSetter.apply(awsProperties.s3FileIoAcl()); } } diff --git a/aws/src/main/java/org/apache/iceberg/aws/s3/S3URI.java b/aws/src/main/java/org/apache/iceberg/aws/s3/S3URI.java index 6a798b18364c..3d73694fb88d 100644 --- a/aws/src/main/java/org/apache/iceberg/aws/s3/S3URI.java +++ b/aws/src/main/java/org/apache/iceberg/aws/s3/S3URI.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.aws.s3; import java.util.Map; @@ -25,15 +24,12 @@ import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap; /** - * This class represents a fully qualified location in S3 for input/output - * operations expressed as as URI. This implementation is provided to - * ensure compatibility with Hadoop Path implementations that may introduce - * encoding issues with native URI implementation. - * If the bucket in the location has an access point in the mapping, the - * access point is used to perform all the S3 operations. + * This class represents a fully qualified location in S3 for input/output operations expressed as + * as URI. This implementation is provided to ensure compatibility with Hadoop Path implementations + * that may introduce encoding issues with native URI implementation. If the bucket in the location + * has an access point in the mapping, the access point is used to perform all the S3 operations. * - * Note: Path-style access is deprecated and not supported by this - * implementation. + *
<p>
Note: Path-style access is deprecated and not supported by this implementation. */ class S3URI { private static final String SCHEME_DELIM = "://"; @@ -48,9 +44,9 @@ class S3URI { /** * Creates a new S3URI in the form of scheme://bucket/key?query#fragment - * <p>
- * The URI supports any valid URI schemes to be backwards compatible with s3a and s3n, - * and also allows users to use S3FileIO with other S3-compatible object storage services like GCS. + * + * <p>
The URI supports any valid URI schemes to be backwards compatible with s3a and s3n, and also + * allows users to use S3FileIO with other S3-compatible object storage services like GCS. * * @param location fully qualified URI */ @@ -59,11 +55,11 @@ class S3URI { } /** - * Creates a new S3URI in the form of scheme://(bucket|accessPoint)/key?query#fragment with additional information - * on accessPoints. - * <p>
- * The URI supports any valid URI schemes to be backwards compatible with s3a and s3n, - * and also allows users to use S3FileIO with other S3-compatible object storage services like GCS. + * Creates a new S3URI in the form of scheme://(bucket|accessPoint)/key?query#fragment with + * additional information on accessPoints. + * + * <p>
The URI supports any valid URI schemes to be backwards compatible with s3a and s3n, and also + * allows users to use S3FileIO with other S3-compatible object storage services like GCS. * * @param location fully qualified URI * @param bucketToAccessPointMapping contains mapping of bucket to access point @@ -72,15 +68,20 @@ class S3URI { Preconditions.checkNotNull(location, "Location cannot be null."); this.location = location; - String [] schemeSplit = location.split(SCHEME_DELIM, -1); - ValidationException.check(schemeSplit.length == 2, "Invalid S3 URI, cannot determine scheme: %s", location); + String[] schemeSplit = location.split(SCHEME_DELIM, -1); + ValidationException.check( + schemeSplit.length == 2, "Invalid S3 URI, cannot determine scheme: %s", location); this.scheme = schemeSplit[0]; - String [] authoritySplit = schemeSplit[1].split(PATH_DELIM, 2); - ValidationException.check(authoritySplit.length == 2, "Invalid S3 URI, cannot determine bucket: %s", location); - ValidationException.check(!authoritySplit[1].trim().isEmpty(), "Invalid S3 URI, path is empty: %s", location); - this.bucket = bucketToAccessPointMapping == null ? authoritySplit[0] : bucketToAccessPointMapping.getOrDefault( - authoritySplit[0], authoritySplit[0]); + String[] authoritySplit = schemeSplit[1].split(PATH_DELIM, 2); + ValidationException.check( + authoritySplit.length == 2, "Invalid S3 URI, cannot determine bucket: %s", location); + ValidationException.check( + !authoritySplit[1].trim().isEmpty(), "Invalid S3 URI, path is empty: %s", location); + this.bucket = + bucketToAccessPointMapping == null + ? authoritySplit[0] + : bucketToAccessPointMapping.getOrDefault(authoritySplit[0], authoritySplit[0]); // Strip query and fragment if they exist String path = authoritySplit[1]; @@ -89,23 +90,17 @@ class S3URI { this.key = path; } - /** - * Returns S3 bucket name. - */ + /** Returns S3 bucket name. */ public String bucket() { return bucket; } - /** - * Returns S3 object key name. - */ + /** Returns S3 object key name. */ public String key() { return key; } - /** - * Returns original, unmodified S3 URI location. - */ + /** Returns original, unmodified S3 URI location. */ public String location() { return location; } diff --git a/aws/src/test/java/org/apache/iceberg/aws/TestAwsClientFactories.java b/aws/src/test/java/org/apache/iceberg/aws/TestAwsClientFactories.java index 86a10e491ae3..19098b056b83 100644 --- a/aws/src/test/java/org/apache/iceberg/aws/TestAwsClientFactories.java +++ b/aws/src/test/java/org/apache/iceberg/aws/TestAwsClientFactories.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.aws; import java.util.Map; @@ -34,50 +33,61 @@ import software.amazon.awssdk.services.kms.KmsClient; import software.amazon.awssdk.services.s3.S3Client; - public class TestAwsClientFactories { @Test public void testLoadDefault() { - Assert.assertEquals("default client should be singleton", - AwsClientFactories.defaultFactory(), AwsClientFactories.defaultFactory()); - - Assert.assertTrue("should load default when not configured", - AwsClientFactories.from(Maps.newHashMap()) instanceof AwsClientFactories.DefaultAwsClientFactory); + Assert.assertEquals( + "default client should be singleton", + AwsClientFactories.defaultFactory(), + AwsClientFactories.defaultFactory()); + + Assert.assertTrue( + "should load default when not configured", + AwsClientFactories.from(Maps.newHashMap()) + instanceof AwsClientFactories.DefaultAwsClientFactory); } @Test public void testLoadCustom() { Map properties = Maps.newHashMap(); properties.put(AwsProperties.CLIENT_FACTORY, CustomFactory.class.getName()); - Assert.assertTrue("should load custom class", - AwsClientFactories.from(properties) instanceof CustomFactory); + Assert.assertTrue( + "should load custom class", AwsClientFactories.from(properties) instanceof CustomFactory); } @Test public void testS3FileIoCredentialsProviders() { - AwsCredentialsProvider basicCredentials = AwsClientFactories.credentialsProvider("key", "secret", null); - Assert.assertTrue("Should use basic credentials if access key ID and secret access key are set", - basicCredentials.resolveCredentials() instanceof AwsBasicCredentials); - AwsCredentialsProvider sessionCredentials = AwsClientFactories.credentialsProvider("key", "secret", "token"); - Assert.assertTrue("Should use session credentials if session token is set", - sessionCredentials.resolveCredentials() instanceof AwsSessionCredentials); - Assert.assertTrue("Should use default credentials if nothing is set", - AwsClientFactories.credentialsProvider(null, null, null) instanceof DefaultCredentialsProvider); + AwsCredentialsProvider basicCredentials = + AwsClientFactories.credentialsProvider("key", "secret", null); + Assert.assertTrue( + "Should use basic credentials if access key ID and secret access key are set", + basicCredentials.resolveCredentials() instanceof AwsBasicCredentials); + AwsCredentialsProvider sessionCredentials = + AwsClientFactories.credentialsProvider("key", "secret", "token"); + Assert.assertTrue( + "Should use session credentials if session token is set", + sessionCredentials.resolveCredentials() instanceof AwsSessionCredentials); + Assert.assertTrue( + "Should use default credentials if nothing is set", + AwsClientFactories.credentialsProvider(null, null, null) + instanceof DefaultCredentialsProvider); } @Test public void testS3FileIoCredentialsVerification() { Map properties = Maps.newHashMap(); properties.put(AwsProperties.S3FILEIO_ACCESS_KEY_ID, "key"); - AssertHelpers.assertThrows("Should fail if only access key ID is set", + AssertHelpers.assertThrows( + "Should fail if only access key ID is set", ValidationException.class, "S3 client access key ID and secret access key must be set at the same time", () -> AwsClientFactories.from(properties)); properties.remove(AwsProperties.S3FILEIO_ACCESS_KEY_ID); properties.put(AwsProperties.S3FILEIO_SECRET_ACCESS_KEY, "secret"); - AssertHelpers.assertThrows("Should fail if only secret access key is set", + AssertHelpers.assertThrows( + "Should fail if only secret access key is set", ValidationException.class, "S3 client access key ID 
and secret access key must be set at the same time", () -> AwsClientFactories.from(properties)); @@ -85,8 +95,7 @@ public void testS3FileIoCredentialsVerification() { public static class CustomFactory implements AwsClientFactory { - public CustomFactory() { - } + public CustomFactory() {} @Override public S3Client s3() { @@ -109,9 +118,6 @@ public DynamoDbClient dynamo() { } @Override - public void initialize(Map properties) { - - } + public void initialize(Map properties) {} } - } diff --git a/aws/src/test/java/org/apache/iceberg/aws/TestAwsProperties.java b/aws/src/test/java/org/apache/iceberg/aws/TestAwsProperties.java index 1d849370330e..ab9a3b165c3e 100644 --- a/aws/src/test/java/org/apache/iceberg/aws/TestAwsProperties.java +++ b/aws/src/test/java/org/apache/iceberg/aws/TestAwsProperties.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.aws; import java.util.Map; @@ -32,7 +31,8 @@ public class TestAwsProperties { public void testS3FileIoSseCustom_mustHaveCustomKey() { Map map = Maps.newHashMap(); map.put(AwsProperties.S3FILEIO_SSE_TYPE, AwsProperties.S3FILEIO_SSE_TYPE_CUSTOM); - AssertHelpers.assertThrows("must have key for SSE-C", + AssertHelpers.assertThrows( + "must have key for SSE-C", NullPointerException.class, "Cannot initialize SSE-C S3FileIO with null encryption key", () -> new AwsProperties(map)); @@ -43,7 +43,8 @@ public void testS3FileIoSseCustom_mustHaveCustomMd5() { Map map = Maps.newHashMap(); map.put(AwsProperties.S3FILEIO_SSE_TYPE, AwsProperties.S3FILEIO_SSE_TYPE_CUSTOM); map.put(AwsProperties.S3FILEIO_SSE_KEY, "something"); - AssertHelpers.assertThrows("must have md5 for SSE-C", + AssertHelpers.assertThrows( + "must have md5 for SSE-C", NullPointerException.class, "Cannot initialize SSE-C S3FileIO with null encryption key MD5", () -> new AwsProperties(map)); @@ -61,7 +62,8 @@ public void testS3FileIoAcl() { public void testS3FileIoAcl_unknownType() { Map map = Maps.newHashMap(); map.put(AwsProperties.S3FILEIO_ACL, "bad-input"); - AssertHelpers.assertThrows("should not accept bad input", + AssertHelpers.assertThrows( + "should not accept bad input", IllegalArgumentException.class, "Cannot support S3 CannedACL bad-input", () -> new AwsProperties(map)); @@ -71,7 +73,8 @@ public void testS3FileIoAcl_unknownType() { public void testS3MultipartSizeTooSmall() { Map map = Maps.newHashMap(); map.put(AwsProperties.S3FILEIO_MULTIPART_SIZE, "1"); - AssertHelpers.assertThrows("should not accept small part size", + AssertHelpers.assertThrows( + "should not accept small part size", IllegalArgumentException.class, "Minimum multipart upload object size must be larger than 5 MB", () -> new AwsProperties(map)); @@ -81,7 +84,8 @@ public void testS3MultipartSizeTooSmall() { public void testS3MultipartSizeTooLarge() { Map map = Maps.newHashMap(); map.put(AwsProperties.S3FILEIO_MULTIPART_SIZE, "5368709120"); // 5GB - AssertHelpers.assertThrows("should not accept too big part size", + AssertHelpers.assertThrows( + "should not accept too big part size", IllegalArgumentException.class, "Input malformed or exceeded maximum multipart upload size 5GB", () -> new AwsProperties(map)); @@ -91,7 +95,8 @@ public void testS3MultipartSizeTooLarge() { public void testS3MultipartThresholdFactorLessThanOne() { Map map = Maps.newHashMap(); map.put(AwsProperties.S3FILEIO_MULTIPART_THRESHOLD_FACTOR, "0.9"); - AssertHelpers.assertThrows("should not accept factor less than 1", + AssertHelpers.assertThrows( + "should not accept 
factor less than 1", IllegalArgumentException.class, "Multipart threshold factor must be >= to 1.0", () -> new AwsProperties(map)); @@ -101,7 +106,8 @@ public void testS3MultipartThresholdFactorLessThanOne() { public void testS3FileIoDeleteBatchSizeTooLarge() { Map map = Maps.newHashMap(); map.put(AwsProperties.S3FILEIO_DELETE_BATCH_SIZE, "2000"); - AssertHelpers.assertThrows("should not accept batch size greater than 1000", + AssertHelpers.assertThrows( + "should not accept batch size greater than 1000", IllegalArgumentException.class, "Deletion batch size must be between 1 and 1000", () -> new AwsProperties(map)); @@ -111,10 +117,10 @@ public void testS3FileIoDeleteBatchSizeTooLarge() { public void testS3FileIoDeleteBatchSizeTooSmall() { Map map = Maps.newHashMap(); map.put(AwsProperties.S3FILEIO_DELETE_BATCH_SIZE, "0"); - AssertHelpers.assertThrows("should not accept batch size less than 1", + AssertHelpers.assertThrows( + "should not accept batch size less than 1", IllegalArgumentException.class, "Deletion batch size must be between 1 and 1000", () -> new AwsProperties(map)); } - } diff --git a/aws/src/test/java/org/apache/iceberg/aws/glue/TestGlueCatalog.java b/aws/src/test/java/org/apache/iceberg/aws/glue/TestGlueCatalog.java index b24fc7e0ccd1..7902164f8218 100644 --- a/aws/src/test/java/org/apache/iceberg/aws/glue/TestGlueCatalog.java +++ b/aws/src/test/java/org/apache/iceberg/aws/glue/TestGlueCatalog.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.aws.glue; import java.util.List; @@ -75,142 +74,188 @@ public class TestGlueCatalog { public void before() { glue = Mockito.mock(GlueClient.class); glueCatalog = new GlueCatalog(); - glueCatalog.initialize(CATALOG_NAME, WAREHOUSE_PATH, new AwsProperties(), glue, - LockManagers.defaultLockManager(), null); + glueCatalog.initialize( + CATALOG_NAME, + WAREHOUSE_PATH, + new AwsProperties(), + glue, + LockManagers.defaultLockManager(), + null); } @Test public void testConstructorEmptyWarehousePath() { - AssertHelpers.assertThrows("warehouse path cannot be null", + AssertHelpers.assertThrows( + "warehouse path cannot be null", IllegalArgumentException.class, "Cannot initialize GlueCatalog because warehousePath must not be null or empty", () -> { - GlueCatalog catalog = new GlueCatalog(); - catalog.initialize(CATALOG_NAME, null, new AwsProperties(), glue, - LockManagers.defaultLockManager(), null); + GlueCatalog catalog = new GlueCatalog(); + catalog.initialize( + CATALOG_NAME, + null, + new AwsProperties(), + glue, + LockManagers.defaultLockManager(), + null); }); } @Test public void testConstructorWarehousePathWithEndSlash() { GlueCatalog catalogWithSlash = new GlueCatalog(); - catalogWithSlash.initialize(CATALOG_NAME, WAREHOUSE_PATH + "/", new AwsProperties(), glue, - LockManagers.defaultLockManager(), null); - Mockito.doReturn(GetDatabaseResponse.builder() - .database(Database.builder().name("db").build()).build()) - .when(glue).getDatabase(Mockito.any(GetDatabaseRequest.class)); + catalogWithSlash.initialize( + CATALOG_NAME, + WAREHOUSE_PATH + "/", + new AwsProperties(), + glue, + LockManagers.defaultLockManager(), + null); + Mockito.doReturn( + GetDatabaseResponse.builder().database(Database.builder().name("db").build()).build()) + .when(glue) + .getDatabase(Mockito.any(GetDatabaseRequest.class)); String location = catalogWithSlash.defaultWarehouseLocation(TableIdentifier.of("db", "table")); Assert.assertEquals(WAREHOUSE_PATH + "/db.db/table", location); 
} @Test public void testDefaultWarehouseLocationNoDbUri() { - Mockito.doReturn(GetDatabaseResponse.builder() - .database(Database.builder().name("db").build()).build()) - .when(glue).getDatabase(Mockito.any(GetDatabaseRequest.class)); + Mockito.doReturn( + GetDatabaseResponse.builder().database(Database.builder().name("db").build()).build()) + .when(glue) + .getDatabase(Mockito.any(GetDatabaseRequest.class)); String location = glueCatalog.defaultWarehouseLocation(TableIdentifier.of("db", "table")); Assert.assertEquals(WAREHOUSE_PATH + "/db.db/table", location); } @Test public void testDefaultWarehouseLocationDbUri() { - Mockito.doReturn(GetDatabaseResponse.builder() - .database(Database.builder().name("db").locationUri("s3://bucket2/db").build()).build()) - .when(glue).getDatabase(Mockito.any(GetDatabaseRequest.class)); + Mockito.doReturn( + GetDatabaseResponse.builder() + .database(Database.builder().name("db").locationUri("s3://bucket2/db").build()) + .build()) + .when(glue) + .getDatabase(Mockito.any(GetDatabaseRequest.class)); String location = glueCatalog.defaultWarehouseLocation(TableIdentifier.of("db", "table")); Assert.assertEquals("s3://bucket2/db/table", location); } @Test public void testListTables() { - Mockito.doReturn(GetDatabaseResponse.builder() - .database(Database.builder().name("db1").build()).build()) - .when(glue).getDatabase(Mockito.any(GetDatabaseRequest.class)); - Mockito.doReturn(GetTablesResponse.builder() - .tableList( - Table.builder().databaseName("db1").name("t1").parameters( - ImmutableMap.of( - BaseMetastoreTableOperations.TABLE_TYPE_PROP, BaseMetastoreTableOperations.ICEBERG_TABLE_TYPE_VALUE - ) - ).build(), - Table.builder().databaseName("db1").name("t2").parameters( - ImmutableMap.of( - "key", "val", - BaseMetastoreTableOperations.TABLE_TYPE_PROP, BaseMetastoreTableOperations.ICEBERG_TABLE_TYPE_VALUE - ) - ).build(), - Table.builder().databaseName("db1").name("t3").parameters( - ImmutableMap.of( - "key", "val", - BaseMetastoreTableOperations.TABLE_TYPE_PROP, "wrongVal" - ) - ).build(), - Table.builder().databaseName("db1").name("t4").parameters( - ImmutableMap.of( - "key", "val" - ) - ).build(), - Table.builder().databaseName("db1").name("t5").parameters(null).build() - ).build()) - .when(glue).getTables(Mockito.any(GetTablesRequest.class)); + Mockito.doReturn( + GetDatabaseResponse.builder().database(Database.builder().name("db1").build()).build()) + .when(glue) + .getDatabase(Mockito.any(GetDatabaseRequest.class)); + Mockito.doReturn( + GetTablesResponse.builder() + .tableList( + Table.builder() + .databaseName("db1") + .name("t1") + .parameters( + ImmutableMap.of( + BaseMetastoreTableOperations.TABLE_TYPE_PROP, + BaseMetastoreTableOperations.ICEBERG_TABLE_TYPE_VALUE)) + .build(), + Table.builder() + .databaseName("db1") + .name("t2") + .parameters( + ImmutableMap.of( + "key", + "val", + BaseMetastoreTableOperations.TABLE_TYPE_PROP, + BaseMetastoreTableOperations.ICEBERG_TABLE_TYPE_VALUE)) + .build(), + Table.builder() + .databaseName("db1") + .name("t3") + .parameters( + ImmutableMap.of( + "key", + "val", + BaseMetastoreTableOperations.TABLE_TYPE_PROP, + "wrongVal")) + .build(), + Table.builder() + .databaseName("db1") + .name("t4") + .parameters(ImmutableMap.of("key", "val")) + .build(), + Table.builder().databaseName("db1").name("t5").parameters(null).build()) + .build()) + .when(glue) + .getTables(Mockito.any(GetTablesRequest.class)); Assert.assertEquals( - Lists.newArrayList( - TableIdentifier.of("db1", "t1"), - TableIdentifier.of("db1", "t2") - ), 
- glueCatalog.listTables(Namespace.of("db1")) - ); + Lists.newArrayList(TableIdentifier.of("db1", "t1"), TableIdentifier.of("db1", "t2")), + glueCatalog.listTables(Namespace.of("db1"))); } @Test public void testListTablesPagination() { AtomicInteger counter = new AtomicInteger(10); - Mockito.doReturn(GetDatabaseResponse.builder() - .database(Database.builder().name("db1").build()).build()) - .when(glue).getDatabase(Mockito.any(GetDatabaseRequest.class)); - Mockito.doAnswer(new Answer() { - @Override - public Object answer(InvocationOnMock invocation) throws Throwable { - if (counter.decrementAndGet() > 0) { - return GetTablesResponse.builder() - .tableList( - Table.builder() - .databaseName("db1") - .name(UUID.randomUUID().toString().replace("-", "")) - .parameters(ImmutableMap.of( - BaseMetastoreTableOperations.TABLE_TYPE_PROP, - BaseMetastoreTableOperations.ICEBERG_TABLE_TYPE_VALUE - )) - .build() - ) - .nextToken("token") - .build(); - } else { - return GetTablesResponse.builder() - .tableList(Table.builder().databaseName("db1").name("tb1").parameters(ImmutableMap.of( - BaseMetastoreTableOperations.TABLE_TYPE_PROP, - BaseMetastoreTableOperations.ICEBERG_TABLE_TYPE_VALUE - )).build()) - .build(); - } - } - }).when(glue).getTables(Mockito.any(GetTablesRequest.class)); + Mockito.doReturn( + GetDatabaseResponse.builder().database(Database.builder().name("db1").build()).build()) + .when(glue) + .getDatabase(Mockito.any(GetDatabaseRequest.class)); + Mockito.doAnswer( + new Answer() { + @Override + public Object answer(InvocationOnMock invocation) throws Throwable { + if (counter.decrementAndGet() > 0) { + return GetTablesResponse.builder() + .tableList( + Table.builder() + .databaseName("db1") + .name(UUID.randomUUID().toString().replace("-", "")) + .parameters( + ImmutableMap.of( + BaseMetastoreTableOperations.TABLE_TYPE_PROP, + BaseMetastoreTableOperations.ICEBERG_TABLE_TYPE_VALUE)) + .build()) + .nextToken("token") + .build(); + } else { + return GetTablesResponse.builder() + .tableList( + Table.builder() + .databaseName("db1") + .name("tb1") + .parameters( + ImmutableMap.of( + BaseMetastoreTableOperations.TABLE_TYPE_PROP, + BaseMetastoreTableOperations.ICEBERG_TABLE_TYPE_VALUE)) + .build()) + .build(); + } + } + }) + .when(glue) + .getTables(Mockito.any(GetTablesRequest.class)); Assert.assertEquals(10, glueCatalog.listTables(Namespace.of("db1")).size()); } @Test public void testDropTable() { Map properties = Maps.newHashMap(); - properties.put(BaseMetastoreTableOperations.TABLE_TYPE_PROP, + properties.put( + BaseMetastoreTableOperations.TABLE_TYPE_PROP, BaseMetastoreTableOperations.ICEBERG_TABLE_TYPE_VALUE); - Mockito.doReturn(GetTableResponse.builder() - .table(Table.builder().databaseName("db1").name("t1").parameters(properties).build()).build()) - .when(glue).getTable(Mockito.any(GetTableRequest.class)); - Mockito.doReturn(GetDatabaseResponse.builder() - .database(Database.builder().name("db1").build()).build()) - .when(glue).getDatabase(Mockito.any(GetDatabaseRequest.class)); + Mockito.doReturn( + GetTableResponse.builder() + .table( + Table.builder().databaseName("db1").name("t1").parameters(properties).build()) + .build()) + .when(glue) + .getTable(Mockito.any(GetTableRequest.class)); + Mockito.doReturn( + GetDatabaseResponse.builder().database(Database.builder().name("db1").build()).build()) + .when(glue) + .getDatabase(Mockito.any(GetDatabaseRequest.class)); Mockito.doReturn(DeleteTableResponse.builder().build()) - .when(glue).deleteTable(Mockito.any(DeleteTableRequest.class)); + 
.when(glue) + .deleteTable(Mockito.any(DeleteTableRequest.class)); glueCatalog.dropTable(TableIdentifier.of("db1", "t1")); } @@ -218,23 +263,33 @@ public void testDropTable() { public void testRenameTable() { AtomicInteger counter = new AtomicInteger(1); Map properties = Maps.newHashMap(); - properties.put(BaseMetastoreTableOperations.TABLE_TYPE_PROP, + properties.put( + BaseMetastoreTableOperations.TABLE_TYPE_PROP, BaseMetastoreTableOperations.ICEBERG_TABLE_TYPE_VALUE); - Mockito.doReturn(GetTableResponse.builder() - .table(Table.builder().databaseName("db1").name("t1").parameters(properties).build()).build()) - .when(glue).getTable(Mockito.any(GetTableRequest.class)); + Mockito.doReturn( + GetTableResponse.builder() + .table( + Table.builder().databaseName("db1").name("t1").parameters(properties).build()) + .build()) + .when(glue) + .getTable(Mockito.any(GetTableRequest.class)); Mockito.doReturn(GetTablesResponse.builder().build()) - .when(glue).getTables(Mockito.any(GetTablesRequest.class)); - Mockito.doReturn(GetDatabaseResponse.builder() - .database(Database.builder().name("db1").build()).build()) - .when(glue).getDatabase(Mockito.any(GetDatabaseRequest.class)); - Mockito.doAnswer(new Answer() { - @Override - public Object answer(InvocationOnMock invocation) throws Throwable { - counter.decrementAndGet(); - return DeleteTableResponse.builder().build(); - } - }).when(glue).deleteTable(Mockito.any(DeleteTableRequest.class)); + .when(glue) + .getTables(Mockito.any(GetTablesRequest.class)); + Mockito.doReturn( + GetDatabaseResponse.builder().database(Database.builder().name("db1").build()).build()) + .when(glue) + .getDatabase(Mockito.any(GetDatabaseRequest.class)); + Mockito.doAnswer( + new Answer() { + @Override + public Object answer(InvocationOnMock invocation) throws Throwable { + counter.decrementAndGet(); + return DeleteTableResponse.builder().build(); + } + }) + .when(glue) + .deleteTable(Mockito.any(DeleteTableRequest.class)); glueCatalog.dropTable(TableIdentifier.of("db1", "t1")); Assert.assertEquals(0, counter.get()); } @@ -244,37 +299,50 @@ public void testRenameTableWithStorageDescriptor() { AtomicInteger counter = new AtomicInteger(1); Map parameters = Maps.newHashMap(); - parameters.put(BaseMetastoreTableOperations.TABLE_TYPE_PROP, - BaseMetastoreTableOperations.ICEBERG_TABLE_TYPE_VALUE); + parameters.put( + BaseMetastoreTableOperations.TABLE_TYPE_PROP, + BaseMetastoreTableOperations.ICEBERG_TABLE_TYPE_VALUE); Map storageDescriptorParameters = Maps.newHashMap(); storageDescriptorParameters.put("key_0", "value_0"); - StorageDescriptor storageDescriptor = StorageDescriptor.builder().parameters(storageDescriptorParameters).build(); - - Mockito.doReturn(GetTableResponse.builder().table( - Table.builder() - .databaseName("db") - .name("t_renamed") - .parameters(parameters) - .storageDescriptor(storageDescriptor).build()).build() - ).when(glue).getTable(Mockito.any(GetTableRequest.class)); + StorageDescriptor storageDescriptor = + StorageDescriptor.builder().parameters(storageDescriptorParameters).build(); + + Mockito.doReturn( + GetTableResponse.builder() + .table( + Table.builder() + .databaseName("db") + .name("t_renamed") + .parameters(parameters) + .storageDescriptor(storageDescriptor) + .build()) + .build()) + .when(glue) + .getTable(Mockito.any(GetTableRequest.class)); Mockito.doReturn(GetTablesResponse.builder().build()) - .when(glue).getTables(Mockito.any(GetTablesRequest.class)); - Mockito.doReturn(GetDatabaseResponse.builder() - 
.database(Database.builder().name("db").build()).build()) - .when(glue).getDatabase(Mockito.any(GetDatabaseRequest.class)); - - Mockito.doAnswer(new Answer() { - @Override - public Object answer(InvocationOnMock invocation) throws Throwable { - CreateTableRequest createTableRequest = (CreateTableRequest) invocation.getArguments()[0]; - if (createTableRequest.tableInput().storageDescriptor().hasParameters()) { - counter.decrementAndGet(); - } - return CreateTableResponse.builder().build(); - } - }).when(glue).createTable(Mockito.any(CreateTableRequest.class)); + .when(glue) + .getTables(Mockito.any(GetTablesRequest.class)); + Mockito.doReturn( + GetDatabaseResponse.builder().database(Database.builder().name("db").build()).build()) + .when(glue) + .getDatabase(Mockito.any(GetDatabaseRequest.class)); + + Mockito.doAnswer( + new Answer() { + @Override + public Object answer(InvocationOnMock invocation) throws Throwable { + CreateTableRequest createTableRequest = + (CreateTableRequest) invocation.getArguments()[0]; + if (createTableRequest.tableInput().storageDescriptor().hasParameters()) { + counter.decrementAndGet(); + } + return CreateTableResponse.builder().build(); + } + }) + .when(glue) + .createTable(Mockito.any(CreateTableRequest.class)); glueCatalog.renameTable(TableIdentifier.of("db", "t"), TableIdentifier.of("db", "x_renamed")); Assert.assertEquals(0, counter.get()); @@ -283,21 +351,22 @@ public Object answer(InvocationOnMock invocation) throws Throwable { @Test public void testCreateNamespace() { Mockito.doReturn(CreateDatabaseResponse.builder().build()) - .when(glue).createDatabase(Mockito.any(CreateDatabaseRequest.class)); + .when(glue) + .createDatabase(Mockito.any(CreateDatabaseRequest.class)); glueCatalog.createNamespace(Namespace.of("db")); } @Test public void testCreateNamespaceBadName() { Mockito.doReturn(CreateDatabaseResponse.builder().build()) - .when(glue).createDatabase(Mockito.any(CreateDatabaseRequest.class)); - List invalidNamespaces = Lists.newArrayList( - Namespace.of("db-1"), - Namespace.of("db", "db2") - ); + .when(glue) + .createDatabase(Mockito.any(CreateDatabaseRequest.class)); + List invalidNamespaces = + Lists.newArrayList(Namespace.of("db-1"), Namespace.of("db", "db2")); for (Namespace namespace : invalidNamespaces) { - AssertHelpers.assertThrows("should not create namespace with invalid or nested names", + AssertHelpers.assertThrows( + "should not create namespace with invalid or nested names", ValidationException.class, "Cannot convert namespace", () -> glueCatalog.createNamespace(namespace)); @@ -306,59 +375,60 @@ public void testCreateNamespaceBadName() { @Test public void testListAllNamespaces() { - Mockito.doReturn(GetDatabasesResponse.builder() - .databaseList( - Database.builder().name("db1").build(), - Database.builder().name("db2").build() - ).build()) - .when(glue).getDatabases(Mockito.any(GetDatabasesRequest.class)); + Mockito.doReturn( + GetDatabasesResponse.builder() + .databaseList( + Database.builder().name("db1").build(), Database.builder().name("db2").build()) + .build()) + .when(glue) + .getDatabases(Mockito.any(GetDatabasesRequest.class)); Assert.assertEquals( - Lists.newArrayList( - Namespace.of("db1"), - Namespace.of("db2") - ), - glueCatalog.listNamespaces() - ); + Lists.newArrayList(Namespace.of("db1"), Namespace.of("db2")), glueCatalog.listNamespaces()); } @Test public void testListNamespacesPagination() { AtomicInteger counter = new AtomicInteger(10); - Mockito.doAnswer(new Answer() { - @Override - public Object 
answer(InvocationOnMock invocation) throws Throwable { - if (counter.decrementAndGet() > 0) { - return GetDatabasesResponse.builder() - .databaseList( - Database.builder().name(UUID.randomUUID().toString().replace("-", "")).build() - ) - .nextToken("token") - .build(); - } else { - return GetDatabasesResponse.builder() - .databaseList(Database.builder().name("db").build()) - .build(); - } - } - }).when(glue).getDatabases(Mockito.any(GetDatabasesRequest.class)); + Mockito.doAnswer( + new Answer() { + @Override + public Object answer(InvocationOnMock invocation) throws Throwable { + if (counter.decrementAndGet() > 0) { + return GetDatabasesResponse.builder() + .databaseList( + Database.builder() + .name(UUID.randomUUID().toString().replace("-", "")) + .build()) + .nextToken("token") + .build(); + } else { + return GetDatabasesResponse.builder() + .databaseList(Database.builder().name("db").build()) + .build(); + } + } + }) + .when(glue) + .getDatabases(Mockito.any(GetDatabasesRequest.class)); Assert.assertEquals(10, glueCatalog.listNamespaces().size()); } @Test public void testListNamespacesWithNameShouldReturnItself() { - Mockito.doReturn(GetDatabaseResponse.builder() - .database(Database.builder().name("db1").build()).build()) - .when(glue).getDatabase(Mockito.any(GetDatabaseRequest.class)); + Mockito.doReturn( + GetDatabaseResponse.builder().database(Database.builder().name("db1").build()).build()) + .when(glue) + .getDatabase(Mockito.any(GetDatabaseRequest.class)); Assert.assertEquals( "list self should return empty list", Lists.newArrayList(), - glueCatalog.listNamespaces(Namespace.of("db1")) - ); + glueCatalog.listNamespaces(Namespace.of("db1"))); } @Test public void testListNamespacesBadName() { - AssertHelpers.assertThrows("table name invalid", + AssertHelpers.assertThrows( + "table name invalid", ValidationException.class, "Cannot convert namespace", () -> glueCatalog.listNamespaces(Namespace.of("db-1"))); @@ -368,43 +438,55 @@ public void testListNamespacesBadName() { public void testLoadNamespaceMetadata() { Map parameters = Maps.newHashMap(); parameters.put("key", "val"); - Mockito.doReturn(GetDatabaseResponse.builder() - .database(Database.builder().name("db1") - .parameters(parameters) - .build()).build()) - .when(glue).getDatabase(Mockito.any(GetDatabaseRequest.class)); + Mockito.doReturn( + GetDatabaseResponse.builder() + .database(Database.builder().name("db1").parameters(parameters).build()) + .build()) + .when(glue) + .getDatabase(Mockito.any(GetDatabaseRequest.class)); Assert.assertEquals(parameters, glueCatalog.loadNamespaceMetadata(Namespace.of("db1"))); } @Test public void testDropNamespace() { Mockito.doReturn(GetTablesResponse.builder().build()) - .when(glue).getTables(Mockito.any(GetTablesRequest.class)); - Mockito.doReturn(GetDatabaseResponse.builder() - .database(Database.builder().name("db1").build()).build()) - .when(glue).getDatabase(Mockito.any(GetDatabaseRequest.class)); + .when(glue) + .getTables(Mockito.any(GetTablesRequest.class)); + Mockito.doReturn( + GetDatabaseResponse.builder().database(Database.builder().name("db1").build()).build()) + .when(glue) + .getDatabase(Mockito.any(GetDatabaseRequest.class)); Mockito.doReturn(DeleteDatabaseResponse.builder().build()) - .when(glue).deleteDatabase(Mockito.any(DeleteDatabaseRequest.class)); + .when(glue) + .deleteDatabase(Mockito.any(DeleteDatabaseRequest.class)); glueCatalog.dropNamespace(Namespace.of("db1")); } @Test public void testDropNamespaceThatContainsOnlyIcebergTable() { - 
Mockito.doReturn(GetTablesResponse.builder() - .tableList( - Table.builder().databaseName("db1").name("t1").parameters( - ImmutableMap.of( - BaseMetastoreTableOperations.TABLE_TYPE_PROP, BaseMetastoreTableOperations.ICEBERG_TABLE_TYPE_VALUE - ) - ).build() - ).build()) - .when(glue).getTables(Mockito.any(GetTablesRequest.class)); - Mockito.doReturn(GetDatabaseResponse.builder() - .database(Database.builder().name("db1").build()).build()) - .when(glue).getDatabase(Mockito.any(GetDatabaseRequest.class)); + Mockito.doReturn( + GetTablesResponse.builder() + .tableList( + Table.builder() + .databaseName("db1") + .name("t1") + .parameters( + ImmutableMap.of( + BaseMetastoreTableOperations.TABLE_TYPE_PROP, + BaseMetastoreTableOperations.ICEBERG_TABLE_TYPE_VALUE)) + .build()) + .build()) + .when(glue) + .getTables(Mockito.any(GetTablesRequest.class)); + Mockito.doReturn( + GetDatabaseResponse.builder().database(Database.builder().name("db1").build()).build()) + .when(glue) + .getDatabase(Mockito.any(GetDatabaseRequest.class)); Mockito.doReturn(DeleteDatabaseResponse.builder().build()) - .when(glue).deleteDatabase(Mockito.any(DeleteDatabaseRequest.class)); - AssertHelpers.assertThrows("namespace should not be dropped when still has Iceberg table", + .when(glue) + .deleteDatabase(Mockito.any(DeleteDatabaseRequest.class)); + AssertHelpers.assertThrows( + "namespace should not be dropped when still has Iceberg table", NamespaceNotEmptyException.class, "still contains Iceberg tables", () -> glueCatalog.dropNamespace(Namespace.of("db1"))); @@ -412,17 +494,21 @@ public void testDropNamespaceThatContainsOnlyIcebergTable() { @Test public void testDropNamespaceThatContainsNonIcebergTable() { - Mockito.doReturn(GetTablesResponse.builder() - .tableList( - Table.builder().databaseName("db1").name("t1").build() - ).build()) - .when(glue).getTables(Mockito.any(GetTablesRequest.class)); - Mockito.doReturn(GetDatabaseResponse.builder() - .database(Database.builder().name("db1").build()).build()) - .when(glue).getDatabase(Mockito.any(GetDatabaseRequest.class)); + Mockito.doReturn( + GetTablesResponse.builder() + .tableList(Table.builder().databaseName("db1").name("t1").build()) + .build()) + .when(glue) + .getTables(Mockito.any(GetTablesRequest.class)); + Mockito.doReturn( + GetDatabaseResponse.builder().database(Database.builder().name("db1").build()).build()) + .when(glue) + .getDatabase(Mockito.any(GetDatabaseRequest.class)); Mockito.doReturn(DeleteDatabaseResponse.builder().build()) - .when(glue).deleteDatabase(Mockito.any(DeleteDatabaseRequest.class)); - AssertHelpers.assertThrows("namespace should not be dropped when still has non-Iceberg table", + .when(glue) + .deleteDatabase(Mockito.any(DeleteDatabaseRequest.class)); + AssertHelpers.assertThrows( + "namespace should not be dropped when still has non-Iceberg table", NamespaceNotEmptyException.class, "still contains non-Iceberg tables", () -> glueCatalog.dropNamespace(Namespace.of("db1"))); @@ -432,13 +518,15 @@ public void testDropNamespaceThatContainsNonIcebergTable() { public void testSetProperties() { Map parameters = Maps.newHashMap(); parameters.put("key", "val"); - Mockito.doReturn(GetDatabaseResponse.builder() - .database(Database.builder().name("db1") - .parameters(parameters) - .build()).build()) - .when(glue).getDatabase(Mockito.any(GetDatabaseRequest.class)); + Mockito.doReturn( + GetDatabaseResponse.builder() + .database(Database.builder().name("db1").parameters(parameters).build()) + .build()) + .when(glue) + 
.getDatabase(Mockito.any(GetDatabaseRequest.class)); Mockito.doReturn(UpdateDatabaseResponse.builder().build()) - .when(glue).updateDatabase(Mockito.any(UpdateDatabaseRequest.class)); + .when(glue) + .updateDatabase(Mockito.any(UpdateDatabaseRequest.class)); glueCatalog.setProperties(Namespace.of("db1"), parameters); } @@ -446,26 +534,35 @@ public void testSetProperties() { public void testRemoveProperties() { Map parameters = Maps.newHashMap(); parameters.put("key", "val"); - Mockito.doReturn(GetDatabaseResponse.builder() - .database(Database.builder().name("db1") - .parameters(parameters) - .build()).build()) - .when(glue).getDatabase(Mockito.any(GetDatabaseRequest.class)); + Mockito.doReturn( + GetDatabaseResponse.builder() + .database(Database.builder().name("db1").parameters(parameters).build()) + .build()) + .when(glue) + .getDatabase(Mockito.any(GetDatabaseRequest.class)); Mockito.doReturn(UpdateDatabaseResponse.builder().build()) - .when(glue).updateDatabase(Mockito.any(UpdateDatabaseRequest.class)); + .when(glue) + .updateDatabase(Mockito.any(UpdateDatabaseRequest.class)); glueCatalog.removeProperties(Namespace.of("db1"), Sets.newHashSet("key")); } @Test public void testTablePropsDefinedAtCatalogLevel() { - ImmutableMap catalogProps = ImmutableMap.of( - "table-default.key1", "catalog-default-key1", - "table-default.key2", "catalog-default-key2", - "table-default.key3", "catalog-default-key3", - "table-override.key3", "catalog-override-key3", - "table-override.key4", "catalog-override-key4"); - glueCatalog.initialize(CATALOG_NAME, WAREHOUSE_PATH, new AwsProperties(), glue, - LockManagers.defaultLockManager(), null, catalogProps); + ImmutableMap catalogProps = + ImmutableMap.of( + "table-default.key1", "catalog-default-key1", + "table-default.key2", "catalog-default-key2", + "table-default.key3", "catalog-default-key3", + "table-override.key3", "catalog-override-key3", + "table-override.key4", "catalog-override-key4"); + glueCatalog.initialize( + CATALOG_NAME, + WAREHOUSE_PATH, + new AwsProperties(), + glue, + LockManagers.defaultLockManager(), + null, + catalogProps); Map properties = glueCatalog.properties(); Assert.assertFalse(properties.isEmpty()); Assert.assertTrue(properties.containsKey("table-default.key1")); @@ -484,8 +581,8 @@ public void testTablePropsDefinedAtCatalogLevel() { public void testValidateIdentifierSkipNameValidation() { AwsProperties props = new AwsProperties(); props.setGlueCatalogSkipNameValidation(true); - glueCatalog.initialize(CATALOG_NAME, WAREHOUSE_PATH, props, glue, - LockManagers.defaultLockManager(), null); + glueCatalog.initialize( + CATALOG_NAME, WAREHOUSE_PATH, props, glue, LockManagers.defaultLockManager(), null); Assert.assertEquals(glueCatalog.isValidIdentifier(TableIdentifier.parse("db-1.a-1")), true); } } diff --git a/aws/src/test/java/org/apache/iceberg/aws/glue/TestGlueToIcebergConverter.java b/aws/src/test/java/org/apache/iceberg/aws/glue/TestGlueToIcebergConverter.java index 23bc1cd7c0a2..2b5379f099d6 100644 --- a/aws/src/test/java/org/apache/iceberg/aws/glue/TestGlueToIcebergConverter.java +++ b/aws/src/test/java/org/apache/iceberg/aws/glue/TestGlueToIcebergConverter.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.aws.glue; import java.util.Map; @@ -35,56 +34,47 @@ public class TestGlueToIcebergConverter { @Test public void testToNamespace() { - Database database = Database.builder() - .name("db") - .build(); + Database database = Database.builder().name("db").build(); Namespace namespace = Namespace.of("db"); Assert.assertEquals(namespace, GlueToIcebergConverter.toNamespace(database)); } @Test public void testToTableId() { - Table table = Table.builder() - .databaseName("db") - .name("name") - .build(); + Table table = Table.builder().databaseName("db").name("name").build(); TableIdentifier icebergId = TableIdentifier.of("db", "name"); Assert.assertEquals(icebergId, GlueToIcebergConverter.toTableId(table)); } @Test public void testValidateTable() { - Map properties = ImmutableMap.of( - BaseMetastoreTableOperations.TABLE_TYPE_PROP, - BaseMetastoreTableOperations.ICEBERG_TABLE_TYPE_VALUE); - Table table = Table.builder() - .parameters(properties) - .build(); + Map properties = + ImmutableMap.of( + BaseMetastoreTableOperations.TABLE_TYPE_PROP, + BaseMetastoreTableOperations.ICEBERG_TABLE_TYPE_VALUE); + Table table = Table.builder().parameters(properties).build(); GlueToIcebergConverter.validateTable(table, "name"); } @Test public void testValidateTableIcebergPropertyNotFound() { - Table table = Table.builder() - .parameters(ImmutableMap.of()) - .build(); - AssertHelpers.assertThrows("Iceberg property not found", + Table table = Table.builder().parameters(ImmutableMap.of()).build(); + AssertHelpers.assertThrows( + "Iceberg property not found", ValidationException.class, "Input Glue table is not an iceberg table", - () -> GlueToIcebergConverter.validateTable(table, "name") - ); + () -> GlueToIcebergConverter.validateTable(table, "name")); } @Test public void testValidateTableIcebergPropertyValueWrong() { - Map properties = ImmutableMap.of(BaseMetastoreTableOperations.TABLE_TYPE_PROP, "other"); - Table table = Table.builder() - .parameters(properties) - .build(); - AssertHelpers.assertThrows("Iceberg property value wrong", + Map properties = + ImmutableMap.of(BaseMetastoreTableOperations.TABLE_TYPE_PROP, "other"); + Table table = Table.builder().parameters(properties).build(); + AssertHelpers.assertThrows( + "Iceberg property value wrong", ValidationException.class, "Input Glue table is not an iceberg table", - () -> GlueToIcebergConverter.validateTable(table, "name") - ); + () -> GlueToIcebergConverter.validateTable(table, "name")); } } diff --git a/aws/src/test/java/org/apache/iceberg/aws/glue/TestIcebergToGlueConverter.java b/aws/src/test/java/org/apache/iceberg/aws/glue/TestIcebergToGlueConverter.java index ac9c9950cd41..701416de2f8d 100644 --- a/aws/src/test/java/org/apache/iceberg/aws/glue/TestIcebergToGlueConverter.java +++ b/aws/src/test/java/org/apache/iceberg/aws/glue/TestIcebergToGlueConverter.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.aws.glue; import java.util.List; @@ -43,10 +42,11 @@ public class TestIcebergToGlueConverter { - private final Map tableLocationProperties = ImmutableMap.of( - TableProperties.WRITE_DATA_LOCATION, "s3://writeDataLoc", - TableProperties.WRITE_METADATA_LOCATION, "s3://writeMetaDataLoc", - TableProperties.WRITE_FOLDER_STORAGE_LOCATION, "s3://writeFolderStorageLoc"); + private final Map tableLocationProperties = + ImmutableMap.of( + TableProperties.WRITE_DATA_LOCATION, "s3://writeDataLoc", + TableProperties.WRITE_METADATA_LOCATION, "s3://writeMetaDataLoc", + TableProperties.WRITE_FOLDER_STORAGE_LOCATION, "s3://writeFolderStorageLoc"); @Test public void testToDatabaseName() { @@ -55,88 +55,94 @@ public void testToDatabaseName() { @Test public void testToDatabaseNameFailure() { - List badNames = Lists.newArrayList( - Namespace.of("db", "a"), - Namespace.of("db-1"), - Namespace.empty(), - Namespace.of(""), - Namespace.of(new String(new char[600]).replace("\0", "a"))); + List badNames = + Lists.newArrayList( + Namespace.of("db", "a"), + Namespace.of("db-1"), + Namespace.empty(), + Namespace.of(""), + Namespace.of(new String(new char[600]).replace("\0", "a"))); for (Namespace name : badNames) { - AssertHelpers.assertThrows("bad namespace name", + AssertHelpers.assertThrows( + "bad namespace name", ValidationException.class, "Cannot convert namespace", - () -> IcebergToGlueConverter.toDatabaseName(name, false) - ); + () -> IcebergToGlueConverter.toDatabaseName(name, false)); } } @Test public void testSkipNamespaceValidation() { - List acceptableNames = Lists.newArrayList( - Namespace.of("db-1"), - Namespace.of("db-1-1-1")); + List acceptableNames = + Lists.newArrayList(Namespace.of("db-1"), Namespace.of("db-1-1-1")); for (Namespace name : acceptableNames) { - Assert.assertEquals(name.toString(), IcebergToGlueConverter.toDatabaseName(name, true) - ); + Assert.assertEquals(name.toString(), IcebergToGlueConverter.toDatabaseName(name, true)); } } @Test public void testSkipTableNameValidation() { - List acceptableIdentifiers = Lists.newArrayList( + List acceptableIdentifiers = + Lists.newArrayList( TableIdentifier.parse("db.a-1"), TableIdentifier.parse("db.a-1-1"), TableIdentifier.parse("db.a#1")); for (TableIdentifier identifier : acceptableIdentifiers) { - Assert.assertEquals(identifier.name(), IcebergToGlueConverter.getTableName(identifier, true) - ); + Assert.assertEquals(identifier.name(), IcebergToGlueConverter.getTableName(identifier, true)); } } @Test public void testToDatabaseInput() { - Map properties = ImmutableMap.of( - IcebergToGlueConverter.GLUE_DB_DESCRIPTION_KEY, "description", - IcebergToGlueConverter.GLUE_DB_LOCATION_KEY, "s3://location", - "key", "val"); - DatabaseInput databaseInput = IcebergToGlueConverter.toDatabaseInput(Namespace.of("ns"), properties, false); + Map properties = + ImmutableMap.of( + IcebergToGlueConverter.GLUE_DB_DESCRIPTION_KEY, + "description", + IcebergToGlueConverter.GLUE_DB_LOCATION_KEY, + "s3://location", + "key", + "val"); + DatabaseInput databaseInput = + IcebergToGlueConverter.toDatabaseInput(Namespace.of("ns"), properties, false); Assert.assertEquals("Location should be set", "s3://location", databaseInput.locationUri()); Assert.assertEquals("Description should be set", "description", databaseInput.description()); - Assert.assertEquals("Parameters should be set", ImmutableMap.of("key", "val"), databaseInput.parameters()); + Assert.assertEquals( + "Parameters should be set", ImmutableMap.of("key", "val"), 
databaseInput.parameters()); Assert.assertEquals("Database name should be set", "ns", databaseInput.name()); } @Test public void testToDatabaseInputNoParameter() { - DatabaseInput input = DatabaseInput.builder() - .name("db") - .parameters(ImmutableMap.of()) - .build(); + DatabaseInput input = DatabaseInput.builder().name("db").parameters(ImmutableMap.of()).build(); Namespace namespace = Namespace.of("db"); - Assert.assertEquals(input, IcebergToGlueConverter.toDatabaseInput(namespace, ImmutableMap.of(), false)); + Assert.assertEquals( + input, IcebergToGlueConverter.toDatabaseInput(namespace, ImmutableMap.of(), false)); } @Test public void testToDatabaseInputEmptyLocation() { - Map properties = ImmutableMap.of( - IcebergToGlueConverter.GLUE_DB_DESCRIPTION_KEY, "description", - "key", "val"); - DatabaseInput databaseInput = IcebergToGlueConverter.toDatabaseInput(Namespace.of("ns"), properties, false); + Map properties = + ImmutableMap.of( + IcebergToGlueConverter.GLUE_DB_DESCRIPTION_KEY, "description", "key", "val"); + DatabaseInput databaseInput = + IcebergToGlueConverter.toDatabaseInput(Namespace.of("ns"), properties, false); Assert.assertNull("Location should not be set", databaseInput.locationUri()); Assert.assertEquals("Description should be set", "description", databaseInput.description()); - Assert.assertEquals("Parameters should be set", ImmutableMap.of("key", "val"), databaseInput.parameters()); + Assert.assertEquals( + "Parameters should be set", ImmutableMap.of("key", "val"), databaseInput.parameters()); Assert.assertEquals("Database name should be set", "ns", databaseInput.name()); } @Test public void testToDatabaseInputEmptyDescription() { - Map properties = ImmutableMap.of( - IcebergToGlueConverter.GLUE_DB_LOCATION_KEY, "s3://location", - "key", "val"); - DatabaseInput databaseInput = IcebergToGlueConverter.toDatabaseInput(Namespace.of("ns"), properties, false); + Map properties = + ImmutableMap.of(IcebergToGlueConverter.GLUE_DB_LOCATION_KEY, "s3://location", "key", "val"); + DatabaseInput databaseInput = + IcebergToGlueConverter.toDatabaseInput(Namespace.of("ns"), properties, false); Assert.assertEquals("Location should be set", "s3://location", databaseInput.locationUri()); Assert.assertNull("Description should not be set", databaseInput.description()); - Assert.assertEquals("Parameters should be set", ImmutableMap.of("key", "val"), databaseInput.parameters()); + Assert.assertEquals( + "Parameters should be set", ImmutableMap.of("key", "val"), databaseInput.parameters()); Assert.assertEquals("Database name should be set", "ns", databaseInput.name()); } @@ -144,48 +150,52 @@ public void testToDatabaseInputEmptyDescription() { public void testSetTableInputInformation() { // Actual TableInput TableInput.Builder actualTableInputBuilder = TableInput.builder(); - Schema schema = new Schema( - Types.NestedField.required(1, "x", Types.StringType.get(), "comment1"), - Types.NestedField.required(2, "y", Types.StructType.of( - Types.NestedField.required(3, "z", Types.IntegerType.get())), "comment2") - ); - PartitionSpec partitionSpec = PartitionSpec.builderFor(schema) - .identity("x") - .withSpecId(1000) - .build(); - TableMetadata tableMetadata = TableMetadata - .newTableMetadata(schema, partitionSpec, "s3://test", tableLocationProperties); + Schema schema = + new Schema( + Types.NestedField.required(1, "x", Types.StringType.get(), "comment1"), + Types.NestedField.required( + 2, + "y", + Types.StructType.of(Types.NestedField.required(3, "z", Types.IntegerType.get())), + "comment2")); 
+ PartitionSpec partitionSpec = + PartitionSpec.builderFor(schema).identity("x").withSpecId(1000).build(); + TableMetadata tableMetadata = + TableMetadata.newTableMetadata(schema, partitionSpec, "s3://test", tableLocationProperties); IcebergToGlueConverter.setTableInputInformation(actualTableInputBuilder, tableMetadata); TableInput actualTableInput = actualTableInputBuilder.build(); // Expected TableInput - TableInput expectedTableInput = TableInput.builder().storageDescriptor( - StorageDescriptor.builder() - .location("s3://test") - .additionalLocations(Sets.newHashSet(tableLocationProperties.values())) - .columns(ImmutableList.of( - Column.builder() - .name("x") - .type("string") - .comment("comment1") - .parameters(ImmutableMap.of( - IcebergToGlueConverter.ICEBERG_FIELD_ID, "1", - IcebergToGlueConverter.ICEBERG_FIELD_OPTIONAL, "false", - IcebergToGlueConverter.ICEBERG_FIELD_CURRENT, "true" - )) - .build(), - Column.builder() - .name("y") - .type("struct") - .comment("comment2") - .parameters(ImmutableMap.of( - IcebergToGlueConverter.ICEBERG_FIELD_ID, "2", - IcebergToGlueConverter.ICEBERG_FIELD_OPTIONAL, "false", - IcebergToGlueConverter.ICEBERG_FIELD_CURRENT, "true" - )) - .build())) - .build()) - .build(); + TableInput expectedTableInput = + TableInput.builder() + .storageDescriptor( + StorageDescriptor.builder() + .location("s3://test") + .additionalLocations(Sets.newHashSet(tableLocationProperties.values())) + .columns( + ImmutableList.of( + Column.builder() + .name("x") + .type("string") + .comment("comment1") + .parameters( + ImmutableMap.of( + IcebergToGlueConverter.ICEBERG_FIELD_ID, "1", + IcebergToGlueConverter.ICEBERG_FIELD_OPTIONAL, "false", + IcebergToGlueConverter.ICEBERG_FIELD_CURRENT, "true")) + .build(), + Column.builder() + .name("y") + .type("struct") + .comment("comment2") + .parameters( + ImmutableMap.of( + IcebergToGlueConverter.ICEBERG_FIELD_ID, "2", + IcebergToGlueConverter.ICEBERG_FIELD_OPTIONAL, "false", + IcebergToGlueConverter.ICEBERG_FIELD_CURRENT, "true")) + .build())) + .build()) + .build(); Assert.assertEquals( "additionalLocations should match", @@ -205,53 +215,56 @@ public void testSetTableInputInformation() { public void testSetTableInputInformationWithRemovedColumns() { // Actual TableInput TableInput.Builder actualTableInputBuilder = TableInput.builder(); - Schema schema = new Schema( - Types.NestedField.required(1, "x", Types.StringType.get(), "comment1"), - Types.NestedField.required(2, "y", Types.StructType.of( - Types.NestedField.required(3, "z", Types.IntegerType.get())), "comment2") - ); - PartitionSpec partitionSpec = PartitionSpec.builderFor(schema) - .identity("x") - .withSpecId(1000) - .build(); - TableMetadata tableMetadata = TableMetadata - .newTableMetadata(schema, partitionSpec, "s3://test", tableLocationProperties); + Schema schema = + new Schema( + Types.NestedField.required(1, "x", Types.StringType.get(), "comment1"), + Types.NestedField.required( + 2, + "y", + Types.StructType.of(Types.NestedField.required(3, "z", Types.IntegerType.get())), + "comment2")); + PartitionSpec partitionSpec = + PartitionSpec.builderFor(schema).identity("x").withSpecId(1000).build(); + TableMetadata tableMetadata = + TableMetadata.newTableMetadata(schema, partitionSpec, "s3://test", tableLocationProperties); - Schema newSchema = new Schema( - Types.NestedField.required(1, "x", Types.StringType.get(), "comment1") - ); + Schema newSchema = + new Schema(Types.NestedField.required(1, "x", Types.StringType.get(), "comment1")); tableMetadata = 
tableMetadata.updateSchema(newSchema, 3); IcebergToGlueConverter.setTableInputInformation(actualTableInputBuilder, tableMetadata); TableInput actualTableInput = actualTableInputBuilder.build(); // Expected TableInput - TableInput expectedTableInput = TableInput.builder().storageDescriptor( - StorageDescriptor.builder() - .additionalLocations(Sets.newHashSet(tableLocationProperties.values())) - .location("s3://test") - .columns(ImmutableList.of( - Column.builder() - .name("x") - .type("string") - .comment("comment1") - .parameters(ImmutableMap.of( - IcebergToGlueConverter.ICEBERG_FIELD_ID, "1", - IcebergToGlueConverter.ICEBERG_FIELD_OPTIONAL, "false", - IcebergToGlueConverter.ICEBERG_FIELD_CURRENT, "true" - )) - .build(), - Column.builder() - .name("y") - .type("struct") - .comment("comment2") - .parameters(ImmutableMap.of( - IcebergToGlueConverter.ICEBERG_FIELD_ID, "2", - IcebergToGlueConverter.ICEBERG_FIELD_OPTIONAL, "false", - IcebergToGlueConverter.ICEBERG_FIELD_CURRENT, "false" - )) - .build())) - .build()) - .build(); + TableInput expectedTableInput = + TableInput.builder() + .storageDescriptor( + StorageDescriptor.builder() + .additionalLocations(Sets.newHashSet(tableLocationProperties.values())) + .location("s3://test") + .columns( + ImmutableList.of( + Column.builder() + .name("x") + .type("string") + .comment("comment1") + .parameters( + ImmutableMap.of( + IcebergToGlueConverter.ICEBERG_FIELD_ID, "1", + IcebergToGlueConverter.ICEBERG_FIELD_OPTIONAL, "false", + IcebergToGlueConverter.ICEBERG_FIELD_CURRENT, "true")) + .build(), + Column.builder() + .name("y") + .type("struct") + .comment("comment2") + .parameters( + ImmutableMap.of( + IcebergToGlueConverter.ICEBERG_FIELD_ID, "2", + IcebergToGlueConverter.ICEBERG_FIELD_OPTIONAL, "false", + IcebergToGlueConverter.ICEBERG_FIELD_CURRENT, "false")) + .build())) + .build()) + .build(); Assert.assertEquals( "additionalLocations should match", diff --git a/aws/src/test/java/org/apache/iceberg/aws/s3/TestS3FileIO.java b/aws/src/test/java/org/apache/iceberg/aws/s3/TestS3FileIO.java index 11e531931bfd..8ea4bf8909ea 100644 --- a/aws/src/test/java/org/apache/iceberg/aws/s3/TestS3FileIO.java +++ b/aws/src/test/java/org/apache/iceberg/aws/s3/TestS3FileIO.java @@ -16,9 +16,19 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.aws.s3; +import static org.junit.Assert.assertArrayEquals; +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertFalse; +import static org.junit.Assert.assertTrue; +import static org.mockito.AdditionalAnswers.delegatesTo; +import static org.mockito.ArgumentMatchers.any; +import static org.mockito.Mockito.doReturn; +import static org.mockito.Mockito.mock; +import static org.mockito.Mockito.times; +import static org.mockito.Mockito.verify; + import com.adobe.testing.s3mock.junit4.S3MockRule; import java.io.IOException; import java.io.InputStream; @@ -56,21 +66,9 @@ import software.amazon.awssdk.services.s3.model.DeleteObjectsResponse; import software.amazon.awssdk.services.s3.model.S3Error; -import static org.junit.Assert.assertArrayEquals; -import static org.junit.Assert.assertEquals; -import static org.junit.Assert.assertFalse; -import static org.junit.Assert.assertTrue; -import static org.mockito.AdditionalAnswers.delegatesTo; -import static org.mockito.ArgumentMatchers.any; -import static org.mockito.Mockito.doReturn; -import static org.mockito.Mockito.mock; -import static org.mockito.Mockito.times; -import static org.mockito.Mockito.verify; - @RunWith(MockitoJUnitRunner.class) public class TestS3FileIO { - @ClassRule - public static final S3MockRule S3_MOCK_RULE = S3MockRule.builder().silent().build(); + @ClassRule public static final S3MockRule S3_MOCK_RULE = S3MockRule.builder().silent().build(); public SerializableSupplier s3 = S3_MOCK_RULE::createS3ClientV2; private final S3Client s3mock = mock(S3Client.class, delegatesTo(s3.get())); private final Random random = new Random(1); @@ -78,9 +76,12 @@ public class TestS3FileIO { private final String batchDeletionBucketPrefix = "batch-delete-"; private final int batchDeletionSize = 5; private S3FileIO s3FileIO; - private final Map properties = ImmutableMap.of( - "s3.write.tags.tagKey1", "TagValue1", - "s3.delete.batch-size", Integer.toString(batchDeletionSize)); + private final Map properties = + ImmutableMap.of( + "s3.write.tags.tagKey1", + "TagValue1", + "s3.delete.batch-size", + Integer.toString(batchDeletionSize)); @Before public void before() { @@ -88,14 +89,16 @@ public void before() { s3FileIO.initialize(properties); s3.get().createBucket(CreateBucketRequest.builder().bucket("bucket").build()); for (int i = 1; i <= numBucketsForBatchDeletion; i++) { - s3.get().createBucket(CreateBucketRequest.builder().bucket(batchDeletionBucketPrefix + i).build()); + s3.get() + .createBucket( + CreateBucketRequest.builder().bucket(batchDeletionBucketPrefix + i).build()); } } @Test public void testNewInputFile() throws IOException { String location = "s3://bucket/path/to/file.txt"; - byte [] expected = new byte[1024 * 1024]; + byte[] expected = new byte[1024 * 1024]; random.nextBytes(expected); InputFile in = s3FileIO.newInputFile(location); @@ -107,7 +110,7 @@ public void testNewInputFile() throws IOException { } assertTrue(in.exists()); - byte [] actual; + byte[] actual; try (InputStream is = in.newStream()) { actual = IOUtils.readFully(is, expected.length); @@ -155,12 +158,14 @@ public void testDeleteEmptyList() throws IOException { @Test public void testDeleteFilesS3ReturnsError() { String location = "s3://bucket/path/to/file-to-delete.txt"; - DeleteObjectsResponse deleteObjectsResponse = DeleteObjectsResponse.builder() - .errors(ImmutableList.of(S3Error.builder().key("path/to/file.txt").build())) - .build(); + DeleteObjectsResponse deleteObjectsResponse = + 
DeleteObjectsResponse.builder() + .errors(ImmutableList.of(S3Error.builder().key("path/to/file.txt").build())) + .build(); doReturn(deleteObjectsResponse).when(s3mock).deleteObjects((DeleteObjectsRequest) any()); - AssertHelpers.assertThrows("A failure during S3 DeleteObjects call should result in FileIODeleteException", + AssertHelpers.assertThrows( + "A failure during S3 DeleteObjects call should result in FileIODeleteException", BulkDeletionFailureException.class, "Failed to delete 1 file", () -> s3FileIO.deleteFiles(Lists.newArrayList(location))); @@ -173,7 +178,8 @@ private void testBatchDelete(int numObjects) { for (int j = 1; j <= numObjects; j++) { String key = "object-" + j; paths.add("s3://" + bucketName + "/" + key); - s3mock.putObject(builder -> builder.bucket(bucketName).key(key).build(), RequestBody.empty()); + s3mock.putObject( + builder -> builder.bucket(bucketName).key(key).build(), RequestBody.empty()); } } s3FileIO.deleteFiles(paths); @@ -190,9 +196,13 @@ private void testBatchDelete(int numObjects) { @Test public void testSerializeClient() { SerializableSupplier pre = - () -> S3Client.builder().httpClientBuilder(UrlConnectionHttpClient.builder()).region(Region.US_EAST_1).build(); + () -> + S3Client.builder() + .httpClientBuilder(UrlConnectionHttpClient.builder()) + .region(Region.US_EAST_1) + .build(); - byte [] data = SerializationUtils.serialize(pre); + byte[] data = SerializationUtils.serialize(pre); SerializableSupplier post = SerializationUtils.deserialize(data); assertEquals("s3", post.get().serviceName()); @@ -204,12 +214,15 @@ public void testPrefixList() { List scaleSizes = Lists.newArrayList(1, 1000, 2500); - scaleSizes.parallelStream().forEach(scale -> { - String scalePrefix = String.format("%s/%s/", prefix, scale); + scaleSizes + .parallelStream() + .forEach( + scale -> { + String scalePrefix = String.format("%s/%s/", prefix, scale); - createRandomObjects(scalePrefix, scale); - assertEquals((long) scale, Streams.stream(s3FileIO.listPrefix(scalePrefix)).count()); - }); + createRandomObjects(scalePrefix, scale); + assertEquals((long) scale, Streams.stream(s3FileIO.listPrefix(scalePrefix)).count()); + }); long totalFiles = scaleSizes.stream().mapToLong(Integer::longValue).sum(); Assertions.assertEquals(totalFiles, Streams.stream(s3FileIO.listPrefix(prefix)).count()); @@ -220,13 +233,14 @@ public void testPrefixDelete() { String prefix = "s3://bucket/path/to/delete"; List scaleSizes = Lists.newArrayList(0, 5, 1001); - scaleSizes.forEach(scale -> { - String scalePrefix = String.format("%s/%s/", prefix, scale); + scaleSizes.forEach( + scale -> { + String scalePrefix = String.format("%s/%s/", prefix, scale); - createRandomObjects(scalePrefix, scale); - s3FileIO.deletePrefix(scalePrefix); - assertEquals(0L, Streams.stream(s3FileIO.listPrefix(scalePrefix)).count()); - }); + createRandomObjects(scalePrefix, scale); + s3FileIO.deletePrefix(scalePrefix); + assertEquals(0L, Streams.stream(s3FileIO.listPrefix(scalePrefix)).count()); + }); } @Test @@ -248,8 +262,13 @@ public void testFileIOJsonSerialization() { private void createRandomObjects(String prefix, int count) { S3URI s3URI = new S3URI(prefix); - random.ints(count).parallel().forEach(i -> - s3mock.putObject(builder -> builder.bucket(s3URI.bucket()).key(s3URI.key() + i).build(), RequestBody.empty()) - ); + random + .ints(count) + .parallel() + .forEach( + i -> + s3mock.putObject( + builder -> builder.bucket(s3URI.bucket()).key(s3URI.key() + i).build(), + RequestBody.empty())); } } diff --git 
a/aws/src/test/java/org/apache/iceberg/aws/s3/TestS3InputStream.java b/aws/src/test/java/org/apache/iceberg/aws/s3/TestS3InputStream.java index 417ba16f71ad..640aefc8016f 100644 --- a/aws/src/test/java/org/apache/iceberg/aws/s3/TestS3InputStream.java +++ b/aws/src/test/java/org/apache/iceberg/aws/s3/TestS3InputStream.java @@ -16,9 +16,12 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.aws.s3; +import static org.junit.Assert.assertArrayEquals; +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertThrows; + import com.adobe.testing.s3mock.junit4.S3MockRule; import java.io.IOException; import java.util.Arrays; @@ -34,13 +37,8 @@ import software.amazon.awssdk.services.s3.model.CreateBucketRequest; import software.amazon.awssdk.services.s3.model.PutObjectRequest; -import static org.junit.Assert.assertArrayEquals; -import static org.junit.Assert.assertEquals; -import static org.junit.Assert.assertThrows; - public class TestS3InputStream { - @ClassRule - public static final S3MockRule S3_MOCK_RULE = S3MockRule.builder().silent().build(); + @ClassRule public static final S3MockRule S3_MOCK_RULE = S3MockRule.builder().silent().build(); private final S3Client s3 = S3_MOCK_RULE.createS3ClientV2(); private final Random random = new Random(1); @@ -60,7 +58,7 @@ public void testRead() throws Exception { try (SeekableInputStream in = new S3InputStream(s3, uri)) { int readSize = 1024; - byte [] actual = new byte[readSize]; + byte[] actual = new byte[readSize]; readAndCheck(in, in.getPos(), readSize, data, false); readAndCheck(in, in.getPos(), readSize, data, true); @@ -85,13 +83,14 @@ public void testRead() throws Exception { } } - private void readAndCheck(SeekableInputStream in, long rangeStart, int size, byte [] original, boolean buffered) + private void readAndCheck( + SeekableInputStream in, long rangeStart, int size, byte[] original, boolean buffered) throws IOException { in.seek(rangeStart); assertEquals(rangeStart, in.getPos()); long rangeEnd = rangeStart + size; - byte [] actual = new byte[size]; + byte[] actual = new byte[size]; if (buffered) { IOUtils.readFully(in, actual); @@ -140,8 +139,8 @@ public void testRangeRead() throws Exception { } private void readAndCheckRanges( - RangeReadable in, byte [] original, long position, byte [] buffer, int offset, - int length) throws IOException { + RangeReadable in, byte[] original, long position, byte[] buffer, int offset, int length) + throws IOException { in.readFully(position, buffer, offset, length); assertArrayEquals( diff --git a/aws/src/test/java/org/apache/iceberg/aws/s3/TestS3OutputStream.java b/aws/src/test/java/org/apache/iceberg/aws/s3/TestS3OutputStream.java index d4ce92b2a7da..893f4edd3cba 100644 --- a/aws/src/test/java/org/apache/iceberg/aws/s3/TestS3OutputStream.java +++ b/aws/src/test/java/org/apache/iceberg/aws/s3/TestS3OutputStream.java @@ -16,9 +16,21 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.aws.s3; +import static org.apache.iceberg.metrics.MetricsContext.nullMetrics; +import static org.junit.Assert.assertArrayEquals; +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.fail; +import static org.mockito.AdditionalAnswers.delegatesTo; +import static org.mockito.Mockito.any; +import static org.mockito.Mockito.atLeastOnce; +import static org.mockito.Mockito.doThrow; +import static org.mockito.Mockito.mock; +import static org.mockito.Mockito.reset; +import static org.mockito.Mockito.times; +import static org.mockito.Mockito.verify; + import com.adobe.testing.s3mock.junit4.S3MockRule; import java.io.File; import java.io.IOException; @@ -60,28 +72,13 @@ import software.amazon.awssdk.services.s3.model.UploadPartRequest; import software.amazon.awssdk.utils.BinaryUtils; -import static org.apache.iceberg.metrics.MetricsContext.nullMetrics; -import static org.junit.Assert.assertArrayEquals; -import static org.junit.Assert.assertEquals; -import static org.junit.Assert.fail; -import static org.mockito.AdditionalAnswers.delegatesTo; -import static org.mockito.Mockito.any; -import static org.mockito.Mockito.atLeastOnce; -import static org.mockito.Mockito.doThrow; -import static org.mockito.Mockito.mock; -import static org.mockito.Mockito.reset; -import static org.mockito.Mockito.times; -import static org.mockito.Mockito.verify; - - @RunWith(MockitoJUnitRunner.class) public class TestS3OutputStream { private static final Logger LOG = LoggerFactory.getLogger(TestS3OutputStream.class); private static final String BUCKET = "test-bucket"; private static final int FIVE_MBS = 5 * 1024 * 1024; - @ClassRule - public static final S3MockRule S3_MOCK_RULE = S3MockRule.builder().silent().build(); + @ClassRule public static final S3MockRule S3_MOCK_RULE = S3MockRule.builder().silent().build(); private final S3Client s3 = S3_MOCK_RULE.createS3ClientV2(); private final S3Client s3mock = mock(S3Client.class, delegatesTo(s3)); @@ -89,15 +86,21 @@ public class TestS3OutputStream { private final Path tmpDir = Files.createTempDirectory("s3fileio-test-"); private final String newTmpDirectory = "/tmp/newStagingDirectory"; - private final AwsProperties properties = new AwsProperties(ImmutableMap.of( - AwsProperties.S3FILEIO_MULTIPART_SIZE, Integer.toString(5 * 1024 * 1024), - AwsProperties.S3FILEIO_STAGING_DIRECTORY, tmpDir.toString(), - "s3.write.tags.abc", "123", - "s3.write.tags.def", "789", - "s3.delete.tags.xyz", "456")); - - public TestS3OutputStream() throws IOException { - } + private final AwsProperties properties = + new AwsProperties( + ImmutableMap.of( + AwsProperties.S3FILEIO_MULTIPART_SIZE, + Integer.toString(5 * 1024 * 1024), + AwsProperties.S3FILEIO_STAGING_DIRECTORY, + tmpDir.toString(), + "s3.write.tags.abc", + "123", + "s3.write.tags.def", + "789", + "s3.delete.tags.xyz", + "456")); + + public TestS3OutputStream() throws IOException {} @Before public void before() { @@ -123,11 +126,13 @@ public void testAbortAfterFailedPartUpload() { RuntimeException mockException = new RuntimeException("mock uploadPart failure"); doThrow(mockException).when(s3mock).uploadPart((UploadPartRequest) any(), (RequestBody) any()); - Assertions.assertThatThrownBy(() -> { - try (S3OutputStream stream = new S3OutputStream(s3mock, randomURI(), properties, nullMetrics())) { - stream.write(randomData(10 * 1024 * 1024)); - } - }) + Assertions.assertThatThrownBy( + () -> { + try (S3OutputStream stream = + new S3OutputStream(s3mock, randomURI(), properties, nullMetrics())) 
{ + stream.write(randomData(10 * 1024 * 1024)); + } + }) .isInstanceOf(mockException.getClass()) .hasMessageContaining(mockException.getMessage()); @@ -137,14 +142,17 @@ public void testAbortAfterFailedPartUpload() { @Test public void testAbortMultipart() { RuntimeException mockException = new RuntimeException("mock completeMultipartUpload failure"); - doThrow(mockException).when(s3mock) + doThrow(mockException) + .when(s3mock) .completeMultipartUpload((CompleteMultipartUploadRequest) any()); - Assertions.assertThatThrownBy(() -> { - try (S3OutputStream stream = new S3OutputStream(s3mock, randomURI(), properties, nullMetrics())) { - stream.write(randomData(10 * 1024 * 1024)); - } - }) + Assertions.assertThatThrownBy( + () -> { + try (S3OutputStream stream = + new S3OutputStream(s3mock, randomURI(), properties, nullMetrics())) { + stream.write(randomData(10 * 1024 * 1024)); + } + }) .isInstanceOf(mockException.getClass()) .hasMessageContaining(mockException.getMessage()); @@ -160,9 +168,11 @@ public void testMultipleClose() throws IOException { @Test public void testStagingDirectoryCreation() throws IOException { - AwsProperties newStagingDirectoryAwsProperties = new AwsProperties(ImmutableMap.of( - AwsProperties.S3FILEIO_STAGING_DIRECTORY, newTmpDirectory)); - S3OutputStream stream = new S3OutputStream(s3, randomURI(), newStagingDirectoryAwsProperties, nullMetrics()); + AwsProperties newStagingDirectoryAwsProperties = + new AwsProperties( + ImmutableMap.of(AwsProperties.S3FILEIO_STAGING_DIRECTORY, newTmpDirectory)); + S3OutputStream stream = + new S3OutputStream(s3, randomURI(), newStagingDirectoryAwsProperties, nullMetrics()); stream.close(); } @@ -174,53 +184,53 @@ public void testWriteWithChecksumEnabled() { private void writeTest() { // Run tests for both byte and array write paths - Stream.of(true, false).forEach(arrayWrite -> { - // Test small file write (less than multipart threshold) - byte[] data = randomData(1024); - writeAndVerify(s3mock, randomURI(), data, arrayWrite); - ArgumentCaptor putObjectRequestArgumentCaptor = - ArgumentCaptor.forClass(PutObjectRequest.class); - verify(s3mock, times(1)).putObject(putObjectRequestArgumentCaptor.capture(), - (RequestBody) any()); - checkPutObjectRequestContent(data, putObjectRequestArgumentCaptor); - checkTags(putObjectRequestArgumentCaptor); - reset(s3mock); - - // Test file larger than part size but less than multipart threshold - data = randomData(6 * 1024 * 1024); - writeAndVerify(s3mock, randomURI(), data, arrayWrite); - putObjectRequestArgumentCaptor = ArgumentCaptor.forClass(PutObjectRequest.class); - verify(s3mock, times(1)).putObject(putObjectRequestArgumentCaptor.capture(), - (RequestBody) any()); - checkPutObjectRequestContent(data, putObjectRequestArgumentCaptor); - checkTags(putObjectRequestArgumentCaptor); - reset(s3mock); - - // Test file large enough to trigger multipart upload - data = randomData(10 * 1024 * 1024); - writeAndVerify(s3mock, randomURI(), data, arrayWrite); - ArgumentCaptor uploadPartRequestArgumentCaptor = - ArgumentCaptor.forClass(UploadPartRequest.class); - verify(s3mock, times(2)).uploadPart(uploadPartRequestArgumentCaptor.capture(), - (RequestBody) any()); - checkUploadPartRequestContent(data, uploadPartRequestArgumentCaptor); - reset(s3mock); - - // Test uploading many parts - data = randomData(22 * 1024 * 1024); - writeAndVerify(s3mock, randomURI(), data, arrayWrite); - uploadPartRequestArgumentCaptor = - ArgumentCaptor.forClass(UploadPartRequest.class); - verify(s3mock, 
times(5)).uploadPart(uploadPartRequestArgumentCaptor.capture(), - (RequestBody) any()); - checkUploadPartRequestContent(data, uploadPartRequestArgumentCaptor); - reset(s3mock); - }); + Stream.of(true, false) + .forEach( + arrayWrite -> { + // Test small file write (less than multipart threshold) + byte[] data = randomData(1024); + writeAndVerify(s3mock, randomURI(), data, arrayWrite); + ArgumentCaptor putObjectRequestArgumentCaptor = + ArgumentCaptor.forClass(PutObjectRequest.class); + verify(s3mock, times(1)) + .putObject(putObjectRequestArgumentCaptor.capture(), (RequestBody) any()); + checkPutObjectRequestContent(data, putObjectRequestArgumentCaptor); + checkTags(putObjectRequestArgumentCaptor); + reset(s3mock); + + // Test file larger than part size but less than multipart threshold + data = randomData(6 * 1024 * 1024); + writeAndVerify(s3mock, randomURI(), data, arrayWrite); + putObjectRequestArgumentCaptor = ArgumentCaptor.forClass(PutObjectRequest.class); + verify(s3mock, times(1)) + .putObject(putObjectRequestArgumentCaptor.capture(), (RequestBody) any()); + checkPutObjectRequestContent(data, putObjectRequestArgumentCaptor); + checkTags(putObjectRequestArgumentCaptor); + reset(s3mock); + + // Test file large enough to trigger multipart upload + data = randomData(10 * 1024 * 1024); + writeAndVerify(s3mock, randomURI(), data, arrayWrite); + ArgumentCaptor uploadPartRequestArgumentCaptor = + ArgumentCaptor.forClass(UploadPartRequest.class); + verify(s3mock, times(2)) + .uploadPart(uploadPartRequestArgumentCaptor.capture(), (RequestBody) any()); + checkUploadPartRequestContent(data, uploadPartRequestArgumentCaptor); + reset(s3mock); + + // Test uploading many parts + data = randomData(22 * 1024 * 1024); + writeAndVerify(s3mock, randomURI(), data, arrayWrite); + uploadPartRequestArgumentCaptor = ArgumentCaptor.forClass(UploadPartRequest.class); + verify(s3mock, times(5)) + .uploadPart(uploadPartRequestArgumentCaptor.capture(), (RequestBody) any()); + checkUploadPartRequestContent(data, uploadPartRequestArgumentCaptor); + reset(s3mock); + }); } private void checkUploadPartRequestContent( - byte[] data, - ArgumentCaptor uploadPartRequestArgumentCaptor) { + byte[] data, ArgumentCaptor uploadPartRequestArgumentCaptor) { if (properties.isS3ChecksumEnabled()) { List uploadPartRequests = uploadPartRequestArgumentCaptor.getAllValues().stream() @@ -235,8 +245,7 @@ private void checkUploadPartRequestContent( } private void checkPutObjectRequestContent( - byte[] data, - ArgumentCaptor putObjectRequestArgumentCaptor) { + byte[] data, ArgumentCaptor putObjectRequestArgumentCaptor) { if (properties.isS3ChecksumEnabled()) { List putObjectRequests = putObjectRequestArgumentCaptor.getAllValues(); assertEquals(getDigest(data, 0, data.length), putObjectRequests.get(0).contentMD5()); @@ -252,9 +261,7 @@ private void checkTags(ArgumentCaptor putObjectRequestArgument } private String getTags(Set objectTags) { - return objectTags.stream() - .map(e -> e.key() + "=" + e.value()) - .collect(Collectors.joining("&")); + return objectTags.stream().map(e -> e.key() + "=" + e.value()).collect(Collectors.joining("&")); } private String getDigest(byte[] data, int offset, int length) { @@ -268,7 +275,7 @@ private String getDigest(byte[] data, int offset, int length) { return null; } - private void writeAndVerify(S3Client client, S3URI uri, byte [] data, boolean arrayWrite) { + private void writeAndVerify(S3Client client, S3URI uri, byte[] data, boolean arrayWrite) { try (S3OutputStream stream = new 
S3OutputStream(client, uri, properties, nullMetrics())) { if (arrayWrite) { stream.write(data); @@ -296,14 +303,15 @@ private void writeAndVerify(S3Client client, S3URI uri, byte [] data, boolean ar private byte[] readS3Data(S3URI uri) { ResponseBytes data = - s3.getObject(GetObjectRequest.builder().bucket(uri.bucket()).key(uri.key()).build(), - ResponseTransformer.toBytes()); + s3.getObject( + GetObjectRequest.builder().bucket(uri.bucket()).key(uri.key()).build(), + ResponseTransformer.toBytes()); return data.asByteArray(); } private byte[] randomData(int size) { - byte [] result = new byte[size]; + byte[] result = new byte[size]; random.nextBytes(result); return result; } diff --git a/aws/src/test/java/org/apache/iceberg/aws/s3/TestS3RequestUtil.java b/aws/src/test/java/org/apache/iceberg/aws/s3/TestS3RequestUtil.java index 2dd011224395..516342025b47 100644 --- a/aws/src/test/java/org/apache/iceberg/aws/s3/TestS3RequestUtil.java +++ b/aws/src/test/java/org/apache/iceberg/aws/s3/TestS3RequestUtil.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.aws.s3; import org.apache.iceberg.aws.AwsProperties; @@ -39,8 +38,13 @@ public void testConfigureServerSideCustomEncryption() { awsProperties.setS3FileIoSseType(AwsProperties.S3FILEIO_SSE_TYPE_CUSTOM); awsProperties.setS3FileIoSseKey("key"); awsProperties.setS3FileIoSseMd5("md5"); - S3RequestUtil.configureEncryption(awsProperties, this::setServerSideEncryption, this::setKmsKeyId, - this::setCustomAlgorithm, this::setCustomKey, this::setCustomMd5); + S3RequestUtil.configureEncryption( + awsProperties, + this::setServerSideEncryption, + this::setKmsKeyId, + this::setCustomAlgorithm, + this::setCustomKey, + this::setCustomMd5); Assert.assertNull(serverSideEncryption); Assert.assertNull(kmsKeyId); Assert.assertEquals(ServerSideEncryption.AES256.name(), customAlgorithm); @@ -52,8 +56,13 @@ public void testConfigureServerSideCustomEncryption() { public void testConfigureServerSideS3Encryption() { AwsProperties awsProperties = new AwsProperties(); awsProperties.setS3FileIoSseType(AwsProperties.S3FILEIO_SSE_TYPE_S3); - S3RequestUtil.configureEncryption(awsProperties, this::setServerSideEncryption, this::setKmsKeyId, - this::setCustomAlgorithm, this::setCustomKey, this::setCustomMd5); + S3RequestUtil.configureEncryption( + awsProperties, + this::setServerSideEncryption, + this::setKmsKeyId, + this::setCustomAlgorithm, + this::setCustomKey, + this::setCustomMd5); Assert.assertEquals(ServerSideEncryption.AES256, serverSideEncryption); Assert.assertNull(kmsKeyId); Assert.assertNull(customAlgorithm); @@ -66,8 +75,13 @@ public void testConfigureServerSideKmsEncryption() { AwsProperties awsProperties = new AwsProperties(); awsProperties.setS3FileIoSseType(AwsProperties.S3FILEIO_SSE_TYPE_KMS); awsProperties.setS3FileIoSseKey("key"); - S3RequestUtil.configureEncryption(awsProperties, this::setServerSideEncryption, this::setKmsKeyId, - this::setCustomAlgorithm, this::setCustomKey, this::setCustomMd5); + S3RequestUtil.configureEncryption( + awsProperties, + this::setServerSideEncryption, + this::setKmsKeyId, + this::setCustomAlgorithm, + this::setCustomKey, + this::setCustomMd5); Assert.assertEquals(ServerSideEncryption.AWS_KMS, serverSideEncryption); Assert.assertEquals("key", kmsKeyId); Assert.assertNull(customAlgorithm); @@ -80,8 +94,13 @@ public void testConfigureEncryptionSkipNullSetters() { AwsProperties awsProperties = new AwsProperties(); 
awsProperties.setS3FileIoSseType(AwsProperties.S3FILEIO_SSE_TYPE_KMS); awsProperties.setS3FileIoSseKey("key"); - S3RequestUtil.configureEncryption(awsProperties, v -> null, v -> null, - this::setCustomAlgorithm, this::setCustomKey, this::setCustomMd5); + S3RequestUtil.configureEncryption( + awsProperties, + v -> null, + v -> null, + this::setCustomAlgorithm, + this::setCustomKey, + this::setCustomMd5); Assert.assertNull(serverSideEncryption); Assert.assertNull(kmsKeyId); Assert.assertNull(customAlgorithm); diff --git a/aws/src/test/java/org/apache/iceberg/aws/s3/TestS3URI.java b/aws/src/test/java/org/apache/iceberg/aws/s3/TestS3URI.java index 16b71e238c0d..e91d33b0e995 100644 --- a/aws/src/test/java/org/apache/iceberg/aws/s3/TestS3URI.java +++ b/aws/src/test/java/org/apache/iceberg/aws/s3/TestS3URI.java @@ -16,9 +16,10 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.aws.s3; +import static org.junit.Assert.assertEquals; + import java.util.Map; import org.apache.iceberg.AssertHelpers; import org.apache.iceberg.exceptions.ValidationException; @@ -26,8 +27,6 @@ import org.apache.iceberg.relocated.com.google.common.collect.Lists; import org.junit.Test; -import static org.junit.Assert.assertEquals; - public class TestS3URI { @Test @@ -52,7 +51,8 @@ public void testEncodedString() { @Test public void testEmptyPath() { - AssertHelpers.assertThrows("Should not allow missing object key", + AssertHelpers.assertThrows( + "Should not allow missing object key", ValidationException.class, "Invalid S3 URI, path is empty", () -> new S3URI("https://bucket/")); @@ -60,7 +60,8 @@ public void testEmptyPath() { @Test public void testMissingScheme() { - AssertHelpers.assertThrows("Should not allow missing scheme", + AssertHelpers.assertThrows( + "Should not allow missing scheme", ValidationException.class, "Invalid S3 URI, cannot determine scheme", () -> new S3URI("/path/to/file")); @@ -68,7 +69,8 @@ public void testMissingScheme() { @Test public void testMissingBucket() { - AssertHelpers.assertThrows("Should not allow missing bucket", + AssertHelpers.assertThrows( + "Should not allow missing bucket", ValidationException.class, "Invalid S3 URI, cannot determine bucket", () -> new S3URI("https://bucket")); @@ -96,9 +98,7 @@ public void testValidSchemes() { @Test public void testS3URIWithBucketToAccessPointMapping() { String p1 = "s3://bucket/path/to/file?query=foo#bar"; - Map bucketToAccessPointMapping = ImmutableMap.of( - "bucket", "access-point" - ); + Map bucketToAccessPointMapping = ImmutableMap.of("bucket", "access-point"); S3URI uri1 = new S3URI(p1, bucketToAccessPointMapping); assertEquals("access-point", uri1.bucket()); diff --git a/bundled-guava/src/main/java/org/apache/iceberg/GuavaClasses.java b/bundled-guava/src/main/java/org/apache/iceberg/GuavaClasses.java index f7f840c3858a..5bf2eb199648 100644 --- a/bundled-guava/src/main/java/org/apache/iceberg/GuavaClasses.java +++ b/bundled-guava/src/main/java/org/apache/iceberg/GuavaClasses.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg; import com.google.common.annotations.VisibleForTesting; @@ -55,7 +54,8 @@ import com.google.common.util.concurrent.MoreExecutors; import com.google.common.util.concurrent.ThreadFactoryBuilder; -// inspired in part by https://github.com/apache/avro/blob/release-1.8.2/lang/java/guava/src/main/java/org/apache/avro/GuavaClasses.java +// inspired in part by +// https://github.com/apache/avro/blob/release-1.8.2/lang/java/guava/src/main/java/org/apache/avro/GuavaClasses.java public class GuavaClasses { /* @@ -99,6 +99,4 @@ public class GuavaClasses { Suppliers.class.getName(); Stopwatch.class.getName(); } - } - diff --git a/common/src/main/java/org/apache/iceberg/common/DynClasses.java b/common/src/main/java/org/apache/iceberg/common/DynClasses.java index 78b0ff345ea4..af33aded80b1 100644 --- a/common/src/main/java/org/apache/iceberg/common/DynClasses.java +++ b/common/src/main/java/org/apache/iceberg/common/DynClasses.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.common; import java.util.Set; @@ -25,8 +24,7 @@ public class DynClasses { - private DynClasses() { - } + private DynClasses() {} public static Builder builder() { return new Builder(); @@ -38,13 +36,12 @@ public static class Builder { private boolean nullOk = false; private Set classNames = Sets.newLinkedHashSet(); - private Builder() { - } + private Builder() {} /** * Set the {@link ClassLoader} used to lookup classes by name. - *
<p>
- * If not set, the current thread's ClassLoader is used. + * + *
<p>
If not set, the current thread's ClassLoader is used. * * @param newLoader a ClassLoader * @return this Builder for method chaining @@ -77,8 +74,8 @@ public Builder impl(String className) { } /** - * Instructs this builder to return null if no class is found, rather than - * throwing an Exception. + * Instructs this builder to return null if no class is found, rather than throwing an + * Exception. * * @return this Builder for method chaining */ @@ -88,8 +85,7 @@ public Builder orNull() { } /** - * Returns the first implementation or throws ClassNotFoundException if - * one was not found. + * Returns the first implementation or throws ClassNotFoundException if one was not found. * * @param Java superclass * @return a {@link Class} for the first implementation found @@ -98,15 +94,14 @@ public Builder orNull() { @SuppressWarnings("unchecked") public Class buildChecked() throws ClassNotFoundException { if (!nullOk && foundClass == null) { - throw new ClassNotFoundException("Cannot find class; alternatives: " + - Joiner.on(", ").join(classNames)); + throw new ClassNotFoundException( + "Cannot find class; alternatives: " + Joiner.on(", ").join(classNames)); } return (Class) foundClass; } /** - * Returns the first implementation or throws RuntimeException if one was - * not found. + * Returns the first implementation or throws RuntimeException if one was not found. * * @param Java superclass * @return a {@link Class} for the first implementation found @@ -115,8 +110,8 @@ public Class buildChecked() throws ClassNotFoundException { @SuppressWarnings("unchecked") public Class build() { if (!nullOk && foundClass == null) { - throw new RuntimeException("Cannot find class; alternatives: " + - Joiner.on(", ").join(classNames)); + throw new RuntimeException( + "Cannot find class; alternatives: " + Joiner.on(", ").join(classNames)); } return (Class) foundClass; } diff --git a/common/src/main/java/org/apache/iceberg/common/DynConstructors.java b/common/src/main/java/org/apache/iceberg/common/DynConstructors.java index 25c30f9dcd54..4761be4f3e32 100644 --- a/common/src/main/java/org/apache/iceberg/common/DynConstructors.java +++ b/common/src/main/java/org/apache/iceberg/common/DynConstructors.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.common; import java.lang.reflect.Constructor; @@ -29,14 +28,10 @@ import org.apache.iceberg.relocated.com.google.common.base.Throwables; import org.apache.iceberg.relocated.com.google.common.collect.Maps; - -/** - * Copied from parquet-common - */ +/** Copied from parquet-common */ public class DynConstructors { - private DynConstructors() { - } + private DynConstructors() {} public static class Ctor extends DynMethods.UnboundMethod { private final Constructor ctor; @@ -80,16 +75,16 @@ public C newInstance(Object... args) { @Override @SuppressWarnings("unchecked") public R invoke(Object target, Object... args) { - Preconditions.checkArgument(target == null, - "Invalid call to constructor: target must be null"); + Preconditions.checkArgument( + target == null, "Invalid call to constructor: target must be null"); return (R) newInstance(args); } @Override @SuppressWarnings("unchecked") public R invokeChecked(Object target, Object... 
args) throws Exception { - Preconditions.checkArgument(target == null, - "Invalid call to constructor: target must be null"); + Preconditions.checkArgument( + target == null, "Invalid call to constructor: target must be null"); return (R) newInstanceChecked(args); } @@ -105,8 +100,7 @@ public boolean isStatic() { @Override public String toString() { - return getClass().getSimpleName() + - "(constructor=" + ctor + ", class=" + constructed + ")"; + return getClass().getSimpleName() + "(constructor=" + ctor + ", class=" + constructed + ")"; } } @@ -134,8 +128,8 @@ public Builder() { /** * Set the {@link ClassLoader} used to lookup classes by name. - *
<p>
- * If not set, the current thread's ClassLoader is used. + * + *
<p>
If not set, the current thread's ClassLoader is used. * * @param newLoader a ClassLoader * @return this Builder for method chaining @@ -249,16 +243,20 @@ public Void run() { } } - private static NoSuchMethodException buildCheckedException(Class baseClass, Map problems) { - NoSuchMethodException exc = new NoSuchMethodException( - "Cannot find constructor for " + baseClass + "\n" + formatProblems(problems)); + private static NoSuchMethodException buildCheckedException( + Class baseClass, Map problems) { + NoSuchMethodException exc = + new NoSuchMethodException( + "Cannot find constructor for " + baseClass + "\n" + formatProblems(problems)); problems.values().forEach(exc::addSuppressed); return exc; } - private static RuntimeException buildRuntimeException(Class baseClass, Map problems) { - RuntimeException exc = new RuntimeException( - "Cannot find constructor for " + baseClass + "\n" + formatProblems(problems)); + private static RuntimeException buildRuntimeException( + Class baseClass, Map problems) { + RuntimeException exc = + new RuntimeException( + "Cannot find constructor for " + baseClass + "\n" + formatProblems(problems)); problems.values().forEach(exc::addSuppressed); return exc; } @@ -272,9 +270,13 @@ private static String formatProblems(Map problems) { } else { sb.append("\n"); } - sb.append("\tMissing ").append(problem.getKey()).append(" [") - .append(problem.getValue().getClass().getName()).append(": ") - .append(problem.getValue().getMessage()).append("]"); + sb.append("\tMissing ") + .append(problem.getKey()) + .append(" [") + .append(problem.getValue().getClass().getName()) + .append(": ") + .append(problem.getValue().getMessage()) + .append("]"); } return sb.toString(); } diff --git a/common/src/main/java/org/apache/iceberg/common/DynFields.java b/common/src/main/java/org/apache/iceberg/common/DynFields.java index 72a65e47bd26..01afbad22ecb 100644 --- a/common/src/main/java/org/apache/iceberg/common/DynFields.java +++ b/common/src/main/java/org/apache/iceberg/common/DynFields.java @@ -16,10 +16,8 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.common; - import java.lang.reflect.Field; import java.lang.reflect.Modifier; import java.security.AccessController; @@ -31,17 +29,15 @@ import org.apache.iceberg.relocated.com.google.common.base.Throwables; import org.apache.iceberg.relocated.com.google.common.collect.Sets; - public class DynFields { - private DynFields() { - } + private DynFields() {} /** * Convenience wrapper class around {@link java.lang.reflect.Field}. * - * Allows callers to invoke the wrapped method with all Exceptions wrapped by - * RuntimeException, or with a single Exception catch block. + *
<p>
Allows callers to invoke the wrapped method with all Exceptions wrapped by RuntimeException, + * or with a single Exception catch block. */ public static class UnboundField { private final Field field; @@ -87,13 +83,13 @@ public String toString() { * @throws IllegalArgumentException if the receiver's class is incompatible */ public BoundField bind(Object target) { - Preconditions.checkState(!isStatic() || this == AlwaysNull.INSTANCE, - "Cannot bind static field %s", name); + Preconditions.checkState( + !isStatic() || this == AlwaysNull.INSTANCE, "Cannot bind static field %s", name); Preconditions.checkArgument( field.getDeclaringClass().isAssignableFrom(target.getClass()), "Cannot bind field %s to instance of %s", - name, - target.getClass()); + name, + target.getClass()); return new BoundField<>(this, target); } @@ -109,16 +105,12 @@ public StaticField asStatic() { return new StaticField<>(this); } - /** - * Returns whether the field is a static field. - */ + /** Returns whether the field is a static field. */ public boolean isStatic() { return Modifier.isStatic(field.getModifiers()); } - /** - * Returns whether the field is always null. - */ + /** Returns whether the field is always null. */ public boolean isAlwaysNull() { return this == AlwaysNull.INSTANCE; } @@ -137,8 +129,7 @@ public Void get(Object target) { } @Override - public void set(Object target, Void value) { - } + public void set(Object target, Void value) {} @Override public String toString() { @@ -172,7 +163,6 @@ public void set(T value) { } } - public static class BoundField { private final UnboundField field; private final Object target; @@ -201,13 +191,12 @@ public static class Builder { private final Set candidates = Sets.newHashSet(); private boolean defaultAlwaysNull = false; - private Builder() { - } + private Builder() {} /** * Set the {@link ClassLoader} used to lookup classes by name. - *
<p>
- * If not set, the current thread's ClassLoader is used. + * + *
<p>
If not set, the current thread's ClassLoader is used. * * @param newLoader a ClassLoader * @return this Builder for method chaining @@ -218,8 +207,7 @@ public Builder loader(ClassLoader newLoader) { } /** - * Instructs this builder to return AlwaysNull if no implementation is - * found. + * Instructs this builder to return AlwaysNull if no implementation is found. * * @return this Builder for method chaining */ @@ -269,8 +257,7 @@ public Builder impl(Class targetClass, String fieldName) { } try { - this.field = new UnboundField<>( - targetClass.getField(fieldName), fieldName); + this.field = new UnboundField<>(targetClass.getField(fieldName), fieldName); } catch (NoSuchFieldException e) { // not the right implementation candidates.add(targetClass.getName() + "." + fieldName); @@ -330,8 +317,8 @@ public Builder hiddenImpl(Class targetClass, String fieldName) { } /** - * Returns the first valid implementation as a UnboundField or throws a - * NoSuchFieldException if there is none. + * Returns the first valid implementation as a UnboundField or throws a NoSuchFieldException if + * there is none. * * @param Java class stored in the field * @return a {@link UnboundField} with a valid implementation @@ -344,14 +331,14 @@ public UnboundField buildChecked() throws NoSuchFieldException { } else if (defaultAlwaysNull) { return (UnboundField) AlwaysNull.INSTANCE; } else { - throw new NoSuchFieldException("Cannot find field from candidates: " + - Joiner.on(", ").join(candidates)); + throw new NoSuchFieldException( + "Cannot find field from candidates: " + Joiner.on(", ").join(candidates)); } } /** - * Returns the first valid implementation as a BoundMethod or throws a - * NoSuchMethodException if there is none. + * Returns the first valid implementation as a BoundMethod or throws a NoSuchMethodException if + * there is none. * * @param target an Object on which to get and set the field * @param Java class stored in the field @@ -365,8 +352,8 @@ public BoundField buildChecked(Object target) throws NoSuchFieldException } /** - * Returns the first valid implementation as a UnboundField or throws a - * NoSuchFieldException if there is none. + * Returns the first valid implementation as a UnboundField or throws a NoSuchFieldException if + * there is none. * * @param Java class stored in the field * @return a {@link UnboundField} with a valid implementation @@ -379,14 +366,14 @@ public UnboundField build() { } else if (defaultAlwaysNull) { return (UnboundField) AlwaysNull.INSTANCE; } else { - throw new RuntimeException("Cannot find field from candidates: " + - Joiner.on(", ").join(candidates)); + throw new RuntimeException( + "Cannot find field from candidates: " + Joiner.on(", ").join(candidates)); } } /** - * Returns the first valid implementation as a BoundMethod or throws a - * RuntimeException if there is none. + * Returns the first valid implementation as a BoundMethod or throws a RuntimeException if there + * is none. * * @param target an Object on which to get and set the field * @param Java class stored in the field @@ -400,8 +387,8 @@ public BoundField build(Object target) { } /** - * Returns the first valid implementation as a StaticField or throws a - * NoSuchFieldException if there is none. + * Returns the first valid implementation as a StaticField or throws a NoSuchFieldException if + * there is none. 
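For readers unfamiliar with these reflection helpers, here is a minimal usage sketch of the DynFields builder reformatted above; the Config class and its endpoint field are hypothetical, introduced only for illustration.

import org.apache.iceberg.common.DynFields;

public class DynFieldsExample {
  static class Config {
    private String endpoint = "http://localhost"; // hypothetical private field
  }

  public static void main(String[] args) {
    // hiddenImpl resolves the private field by name and makes it accessible via reflection.
    DynFields.UnboundField<String> endpoint =
        DynFields.builder().hiddenImpl(Config.class, "endpoint").build();
    System.out.println(endpoint.get(new Config())); // prints http://localhost
  }
}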
* * @param Java class stored in the field * @return a {@link StaticField} with a valid implementation @@ -413,8 +400,8 @@ public StaticField buildStaticChecked() throws NoSuchFieldException { } /** - * Returns the first valid implementation as a StaticField or throws a - * RuntimeException if there is none. + * Returns the first valid implementation as a StaticField or throws a RuntimeException if there + * is none. * * @param Java class stored in the field * @return a {@link StaticField} with a valid implementation @@ -424,7 +411,6 @@ public StaticField buildStaticChecked() throws NoSuchFieldException { public StaticField buildStatic() { return this.build().asStatic(); } - } private static class MakeFieldAccessible implements PrivilegedAction { diff --git a/common/src/main/java/org/apache/iceberg/common/DynMethods.java b/common/src/main/java/org/apache/iceberg/common/DynMethods.java index 2c08b624edf6..ae331da84ed4 100644 --- a/common/src/main/java/org/apache/iceberg/common/DynMethods.java +++ b/common/src/main/java/org/apache/iceberg/common/DynMethods.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.common; import java.lang.reflect.InvocationTargetException; @@ -28,20 +27,16 @@ import org.apache.iceberg.relocated.com.google.common.base.Preconditions; import org.apache.iceberg.relocated.com.google.common.base.Throwables; - -/** - * Copied from parquet-common - */ +/** Copied from parquet-common */ public class DynMethods { - private DynMethods() { - } + private DynMethods() {} /** * Convenience wrapper class around {@link java.lang.reflect.Method}. * - * Allows callers to invoke the wrapped method with all Exceptions wrapped by - * RuntimeException, or with a single Exception catch block. + *
<p>
Allows callers to invoke the wrapped method with all Exceptions wrapped by RuntimeException, + * or with a single Exception catch block. */ public static class UnboundMethod { @@ -52,8 +47,8 @@ public static class UnboundMethod { UnboundMethod(Method method, String name) { this.method = method; this.name = name; - this.argLength = (method == null || method.isVarArgs()) ? -1 : - method.getParameterTypes().length; + this.argLength = + (method == null || method.isVarArgs()) ? -1 : method.getParameterTypes().length; } @SuppressWarnings("unchecked") @@ -90,27 +85,23 @@ public R invoke(Object target, Object... args) { * @throws IllegalArgumentException if the receiver's class is incompatible */ public BoundMethod bind(Object receiver) { - Preconditions.checkState(!isStatic(), - "Cannot bind static method %s", method.toGenericString()); + Preconditions.checkState( + !isStatic(), "Cannot bind static method %s", method.toGenericString()); Preconditions.checkArgument( method.getDeclaringClass().isAssignableFrom(receiver.getClass()), "Cannot bind %s to instance of %s", - method.toGenericString(), - receiver.getClass()); + method.toGenericString(), + receiver.getClass()); return new BoundMethod(this, receiver); } - /** - * Returns whether the method is a static method. - */ + /** Returns whether the method is a static method. */ public boolean isStatic() { return Modifier.isStatic(method.getModifiers()); } - /** - * Returns whether the method is a noop. - */ + /** Returns whether the method is a noop. */ public boolean isNoop() { return this == NOOP; } @@ -128,39 +119,37 @@ public StaticMethod asStatic() { @Override public String toString() { - return "DynMethods.UnboundMethod(name=" + name + " method=" + - method.toGenericString() + ")"; - } - - /** - * Singleton {@link UnboundMethod}, performs no operation and returns null. - */ - private static final UnboundMethod NOOP = new UnboundMethod(null, "NOOP") { - @Override - public R invokeChecked(Object target, Object... args) throws Exception { - return null; - } - - @Override - public BoundMethod bind(Object receiver) { - return new BoundMethod(this, receiver); - } - - @Override - public StaticMethod asStatic() { - return new StaticMethod(this); - } - - @Override - public boolean isStatic() { - return true; - } - - @Override - public String toString() { - return "DynMethods.UnboundMethod(NOOP)"; - } - }; + return "DynMethods.UnboundMethod(name=" + name + " method=" + method.toGenericString() + ")"; + } + + /** Singleton {@link UnboundMethod}, performs no operation and returns null. */ + private static final UnboundMethod NOOP = + new UnboundMethod(null, "NOOP") { + @Override + public R invokeChecked(Object target, Object... args) throws Exception { + return null; + } + + @Override + public BoundMethod bind(Object receiver) { + return new BoundMethod(this, receiver); + } + + @Override + public StaticMethod asStatic() { + return new StaticMethod(this); + } + + @Override + public boolean isStatic() { + return true; + } + + @Override + public String toString() { + return "DynMethods.UnboundMethod(NOOP)"; + } + }; } public static class BoundMethod { @@ -218,8 +207,8 @@ public Builder(String methodName) { /** * Set the {@link ClassLoader} used to lookup classes by name. - *
<p>
- * If not set, the current thread's ClassLoader is used. + * + *
<p>
If not set, the current thread's ClassLoader is used. * * @param newLoader a ClassLoader * @return this Builder for method chaining @@ -232,7 +221,7 @@ public Builder loader(ClassLoader newLoader) { /** * If no implementation has been found, adds a NOOP method. * - * Note: calls to impl will not match after this method is called! + *
<p>
Note: calls to impl will not match after this method is called! * * @return this Builder for method chaining */ @@ -271,7 +260,7 @@ public Builder impl(String className, String methodName, Class... argClasses) /** * Checks for an implementation, first finding the given class by name. * - * The name passed to the constructor is the method name used. + *
<p>
The name passed to the constructor is the method name used. * * @param className name of a class * @param argClasses argument classes for the method @@ -301,8 +290,7 @@ public Builder impl(Class targetClass, String methodName, Class... argClas } try { - this.method = new UnboundMethod( - targetClass.getMethod(methodName, argClasses), name); + this.method = new UnboundMethod(targetClass.getMethod(methodName, argClasses), name); } catch (NoSuchMethodException e) { // not the right implementation } @@ -312,7 +300,7 @@ public Builder impl(Class targetClass, String methodName, Class... argClas /** * Checks for a method implementation. * - * The name passed to the constructor is the method name used. + *
<p>
The name passed to the constructor is the method name used. * * @param targetClass a class instance * @param argClasses argument classes for the method @@ -332,9 +320,7 @@ public Builder ctorImpl(Class targetClass, Class... argClasses) { } try { - this.method = new DynConstructors.Builder() - .impl(targetClass, argClasses) - .buildChecked(); + this.method = new DynConstructors.Builder().impl(targetClass, argClasses).buildChecked(); } catch (NoSuchMethodException e) { // not the right implementation } @@ -348,9 +334,7 @@ public Builder ctorImpl(String className, Class... argClasses) { } try { - this.method = new DynConstructors.Builder() - .impl(className, argClasses) - .buildChecked(); + this.method = new DynConstructors.Builder().impl(className, argClasses).buildChecked(); } catch (NoSuchMethodException e) { // not the right implementation } @@ -385,7 +369,7 @@ public Builder hiddenImpl(String className, String methodName, Class... argCl /** * Checks for an implementation, first finding the given class by name. * - * The name passed to the constructor is the method name used. + *
<p>
The name passed to the constructor is the method name used. * * @param className name of a class * @param argClasses argument classes for the method @@ -427,7 +411,7 @@ public Builder hiddenImpl(Class targetClass, String methodName, Class... a /** * Checks for a method implementation. * - * The name passed to the constructor is the method name used. + *
<p>
The name passed to the constructor is the method name used. * * @param targetClass a class instance * @param argClasses argument classes for the method @@ -441,8 +425,8 @@ public Builder hiddenImpl(Class targetClass, Class... argClasses) { } /** - * Returns the first valid implementation as a UnboundMethod or throws a - * RuntimeError if there is none. + * Returns the first valid implementation as a UnboundMethod or throws a RuntimeError if there + * is none. * * @return a {@link UnboundMethod} with a valid implementation * @throws RuntimeException if no implementation was found @@ -456,8 +440,8 @@ public UnboundMethod build() { } /** - * Returns the first valid implementation as a BoundMethod or throws a - * RuntimeError if there is none. + * Returns the first valid implementation as a BoundMethod or throws a RuntimeError if there is + * none. * * @param receiver an Object to receive the method invocation * @return a {@link BoundMethod} with a valid implementation and receiver @@ -470,8 +454,8 @@ public BoundMethod build(Object receiver) { } /** - * Returns the first valid implementation as a UnboundMethod or throws a - * NoSuchMethodException if there is none. + * Returns the first valid implementation as a UnboundMethod or throws a NoSuchMethodException + * if there is none. * * @return a {@link UnboundMethod} with a valid implementation * @throws NoSuchMethodException if no implementation was found @@ -485,8 +469,8 @@ public UnboundMethod buildChecked() throws NoSuchMethodException { } /** - * Returns the first valid implementation as a BoundMethod or throws a - * NoSuchMethodException if there is none. + * Returns the first valid implementation as a BoundMethod or throws a NoSuchMethodException if + * there is none. * * @param receiver an Object to receive the method invocation * @return a {@link BoundMethod} with a valid implementation and receiver @@ -499,8 +483,8 @@ public BoundMethod buildChecked(Object receiver) throws NoSuchMethodException { } /** - * Returns the first valid implementation as a StaticMethod or throws a - * NoSuchMethodException if there is none. + * Returns the first valid implementation as a StaticMethod or throws a NoSuchMethodException if + * there is none. * * @return a {@link StaticMethod} with a valid implementation * @throws IllegalStateException if the method is not static @@ -511,8 +495,8 @@ public StaticMethod buildStaticChecked() throws NoSuchMethodException { } /** - * Returns the first valid implementation as a StaticMethod or throws a - * RuntimeException if there is none. + * Returns the first valid implementation as a StaticMethod or throws a RuntimeException if + * there is none. * * @return a {@link StaticMethod} with a valid implementation * @throws IllegalStateException if the method is not static @@ -521,7 +505,6 @@ public StaticMethod buildStaticChecked() throws NoSuchMethodException { public StaticMethod buildStatic() { return build().asStatic(); } - } private static class MakeAccessible implements PrivilegedAction { diff --git a/core/src/jmh/java/org/apache/iceberg/util/ZOrderByteUtilsBenchmark.java b/core/src/jmh/java/org/apache/iceberg/util/ZOrderByteUtilsBenchmark.java index 77f66f12cff3..9221a36b13db 100644 --- a/core/src/jmh/java/org/apache/iceberg/util/ZOrderByteUtilsBenchmark.java +++ b/core/src/jmh/java/org/apache/iceberg/util/ZOrderByteUtilsBenchmark.java @@ -16,8 +16,6 @@ * specific language governing permissions and limitations * under the License. 
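A minimal usage sketch of the DynMethods builder reformatted above, resolving and invoking String#concat at runtime; the example target is arbitrary, chosen only for illustration.

import org.apache.iceberg.common.DynMethods;

public class DynMethodsExample {
  public static void main(String[] args) {
    // The Builder is constructed with the method name to look up; impl() checks a candidate class.
    DynMethods.UnboundMethod concat =
        new DynMethods.Builder("concat").impl(String.class, "concat", String.class).build();

    // invoke() takes the receiver followed by the arguments.
    String result = concat.invoke("Dyn", "Methods");
    System.out.println(result); // prints DynMethods
  }
}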
*/ - - package org.apache.iceberg.util; import java.nio.ByteBuffer; @@ -78,7 +76,8 @@ public void interleaveValuesFourColumns(Blackhole blackhole) { ByteBuffer outputBuffer = ByteBuffer.allocate(outputSize); for (int i = 0; i < fourColumnInput.length; i++) { - byte[] interleavedBytes = ZOrderByteUtils.interleaveBits(fourColumnInput[i], outputSize, outputBuffer); + byte[] interleavedBytes = + ZOrderByteUtils.interleaveBits(fourColumnInput[i], outputSize, outputBuffer); blackhole.consume(interleavedBytes); } } @@ -90,7 +89,8 @@ public void interleaveValuesThreeColumns(Blackhole blackhole) { ByteBuffer outputBuffer = ByteBuffer.allocate(outputSize); for (int i = 0; i < fourColumnInput.length; i++) { - byte[] interleavedBytes = ZOrderByteUtils.interleaveBits(threeColumnInput[i], outputSize, outputBuffer); + byte[] interleavedBytes = + ZOrderByteUtils.interleaveBits(threeColumnInput[i], outputSize, outputBuffer); blackhole.consume(interleavedBytes); } } @@ -102,7 +102,8 @@ public void interleaveValuesTwoColumns(Blackhole blackhole) { ByteBuffer outputBuffer = ByteBuffer.allocate(outputSize); for (int i = 0; i < fourColumnInput.length; i++) { - byte[] interleavedBytes = ZOrderByteUtils.interleaveBits(twoColumnInput[i], outputSize, outputBuffer); + byte[] interleavedBytes = + ZOrderByteUtils.interleaveBits(twoColumnInput[i], outputSize, outputBuffer); blackhole.consume(interleavedBytes); } } @@ -114,7 +115,8 @@ public void interleaveValuesFourColumns8ByteOutput(Blackhole blackhole) { ByteBuffer outputBuffer = ByteBuffer.allocate(outputSize); for (int i = 0; i < fourColumnInput.length; i++) { - byte[] interleavedBytes = ZOrderByteUtils.interleaveBits(fourColumnInput[i], outputSize, outputBuffer); + byte[] interleavedBytes = + ZOrderByteUtils.interleaveBits(fourColumnInput[i], outputSize, outputBuffer); blackhole.consume(interleavedBytes); } } diff --git a/core/src/main/java/org/apache/iceberg/AllDataFilesTable.java b/core/src/main/java/org/apache/iceberg/AllDataFilesTable.java index e51692f18168..2f058c42b92c 100644 --- a/core/src/main/java/org/apache/iceberg/AllDataFilesTable.java +++ b/core/src/main/java/org/apache/iceberg/AllDataFilesTable.java @@ -16,17 +16,16 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg; import org.apache.iceberg.io.CloseableIterable; /** * A {@link Table} implementation that exposes a table's valid data files as rows. - *
<p>
- * A valid data file is one that is readable from any snapshot currently tracked by the table. - *
<p>
- * This table may return duplicate rows. + * + *
<p>
A valid data file is one that is readable from any snapshot currently tracked by the table. + * + *
<p>
This table may return duplicate rows. */ public class AllDataFilesTable extends BaseFilesTable { @@ -54,13 +53,14 @@ public static class AllDataFilesTableScan extends BaseAllFilesTableScan { super(ops, table, schema, MetadataTableType.ALL_DATA_FILES); } - private AllDataFilesTableScan(TableOperations ops, Table table, Schema schema, - TableScanContext context) { + private AllDataFilesTableScan( + TableOperations ops, Table table, Schema schema, TableScanContext context) { super(ops, table, schema, MetadataTableType.ALL_DATA_FILES, context); } @Override - protected TableScan newRefinedScan(TableOperations ops, Table table, Schema schema, TableScanContext context) { + protected TableScan newRefinedScan( + TableOperations ops, Table table, Schema schema, TableScanContext context) { return new AllDataFilesTableScan(ops, table, schema, context); } diff --git a/core/src/main/java/org/apache/iceberg/AllDeleteFilesTable.java b/core/src/main/java/org/apache/iceberg/AllDeleteFilesTable.java index 19f299aa84dc..d1aece4340f8 100644 --- a/core/src/main/java/org/apache/iceberg/AllDeleteFilesTable.java +++ b/core/src/main/java/org/apache/iceberg/AllDeleteFilesTable.java @@ -16,17 +16,16 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg; import org.apache.iceberg.io.CloseableIterable; /** * A {@link Table} implementation that exposes its valid delete files as rows. - *
<p>
- * A valid delete file is one that is readable from any snapshot currently tracked by the table. - *
<p>
- * This table may return duplicate rows. + * + *
<p>
A valid delete file is one that is readable from any snapshot currently tracked by the table. + * + *
<p>
This table may return duplicate rows. */ public class AllDeleteFilesTable extends BaseFilesTable { @@ -54,13 +53,14 @@ public static class AllDeleteFilesTableScan extends BaseAllFilesTableScan { super(ops, table, schema, MetadataTableType.ALL_DELETE_FILES); } - private AllDeleteFilesTableScan(TableOperations ops, Table table, Schema schema, - TableScanContext context) { + private AllDeleteFilesTableScan( + TableOperations ops, Table table, Schema schema, TableScanContext context) { super(ops, table, schema, MetadataTableType.ALL_DELETE_FILES, context); } @Override - protected TableScan newRefinedScan(TableOperations ops, Table table, Schema schema, TableScanContext context) { + protected TableScan newRefinedScan( + TableOperations ops, Table table, Schema schema, TableScanContext context) { return new AllDeleteFilesTableScan(ops, table, schema, context); } diff --git a/core/src/main/java/org/apache/iceberg/AllEntriesTable.java b/core/src/main/java/org/apache/iceberg/AllEntriesTable.java index a2de86183fc9..4d447dc78296 100644 --- a/core/src/main/java/org/apache/iceberg/AllEntriesTable.java +++ b/core/src/main/java/org/apache/iceberg/AllEntriesTable.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg; import org.apache.iceberg.ManifestEntriesTable.ManifestReadTask; @@ -29,10 +28,11 @@ import org.apache.iceberg.types.Types.StructType; /** - * A {@link Table} implementation that exposes a table's manifest entries as rows, for both delete and data files. - *
<p>
- * WARNING: this table exposes internal details, like files that have been deleted. For a table of the live data files, - * use {@link DataFilesTable}. + * A {@link Table} implementation that exposes a table's manifest entries as rows, for both delete + * and data files. + * + *
<p>
WARNING: this table exposes internal details, like files that have been deleted. For a table + * of the live data files, use {@link DataFilesTable}. */ public class AllEntriesTable extends BaseMetadataTable { @@ -54,7 +54,8 @@ public Schema schema() { StructType partitionType = Partitioning.partitionType(table()); Schema schema = ManifestEntry.getSchema(partitionType); if (partitionType.fields().size() < 1) { - // avoid returning an empty struct, which is not always supported. instead, drop the partition field (id 102) + // avoid returning an empty struct, which is not always supported. instead, drop the partition + // field (id 102) return TypeUtil.selectNot(schema, Sets.newHashSet(102)); } else { return schema; @@ -77,8 +78,8 @@ private Scan(TableOperations ops, Table table, Schema schema, TableScanContext c } @Override - protected TableScan newRefinedScan(TableOperations ops, Table table, Schema schema, - TableScanContext context) { + protected TableScan newRefinedScan( + TableOperations ops, Table table, Schema schema, TableScanContext context) { return new Scan(ops, table, schema, context); } @@ -92,8 +93,11 @@ protected CloseableIterable doPlanFiles() { Expression filter = shouldIgnoreResiduals() ? Expressions.alwaysTrue() : filter(); ResidualEvaluator residuals = ResidualEvaluator.unpartitioned(filter); - return CloseableIterable.transform(manifests, manifest -> - new ManifestReadTask(table(), manifest, schema(), schemaString, specString, residuals)); + return CloseableIterable.transform( + manifests, + manifest -> + new ManifestReadTask( + table(), manifest, schema(), schemaString, specString, residuals)); } } } diff --git a/core/src/main/java/org/apache/iceberg/AllFilesTable.java b/core/src/main/java/org/apache/iceberg/AllFilesTable.java index 470a9e42ad92..c93b58c71582 100644 --- a/core/src/main/java/org/apache/iceberg/AllFilesTable.java +++ b/core/src/main/java/org/apache/iceberg/AllFilesTable.java @@ -16,17 +16,16 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg; import org.apache.iceberg.io.CloseableIterable; /** * A {@link Table} implementation that exposes its valid files as rows. - *
<p>
- * A valid file is one that is readable from any snapshot currently tracked by the table. - *
<p>
- * This table may return duplicate rows. + * + *
<p>
A valid file is one that is readable from any snapshot currently tracked by the table. + * + *
<p>
This table may return duplicate rows. */ public class AllFilesTable extends BaseFilesTable { @@ -54,13 +53,14 @@ public static class AllFilesTableScan extends BaseAllFilesTableScan { super(ops, table, schema, MetadataTableType.ALL_FILES); } - private AllFilesTableScan(TableOperations ops, Table table, Schema schema, - TableScanContext context) { + private AllFilesTableScan( + TableOperations ops, Table table, Schema schema, TableScanContext context) { super(ops, table, schema, MetadataTableType.ALL_FILES, context); } @Override - protected TableScan newRefinedScan(TableOperations ops, Table table, Schema schema, TableScanContext context) { + protected TableScan newRefinedScan( + TableOperations ops, Table table, Schema schema, TableScanContext context) { return new AllFilesTableScan(ops, table, schema, context); } diff --git a/core/src/main/java/org/apache/iceberg/AllManifestsTable.java b/core/src/main/java/org/apache/iceberg/AllManifestsTable.java index 613a7557aed1..395efd06094c 100644 --- a/core/src/main/java/org/apache/iceberg/AllManifestsTable.java +++ b/core/src/main/java/org/apache/iceberg/AllManifestsTable.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg; import java.io.IOException; @@ -44,33 +43,39 @@ /** * A {@link Table} implementation that exposes a table's valid manifest files as rows. - *
<p>
- * A valid manifest file is one that is referenced from any snapshot currently tracked by the table. - *
<p>
- * This table may return duplicate rows. + * + *
<p>
A valid manifest file is one that is referenced from any snapshot currently tracked by the + * table. + * + *
<p>
This table may return duplicate rows. */ public class AllManifestsTable extends BaseMetadataTable { private static final int REF_SNAPSHOT_ID = 18; - private static final Schema MANIFEST_FILE_SCHEMA = new Schema( - Types.NestedField.required(14, "content", Types.IntegerType.get()), - Types.NestedField.required(1, "path", Types.StringType.get()), - Types.NestedField.required(2, "length", Types.LongType.get()), - Types.NestedField.optional(3, "partition_spec_id", Types.IntegerType.get()), - Types.NestedField.optional(4, "added_snapshot_id", Types.LongType.get()), - Types.NestedField.optional(5, "added_data_files_count", Types.IntegerType.get()), - Types.NestedField.optional(6, "existing_data_files_count", Types.IntegerType.get()), - Types.NestedField.optional(7, "deleted_data_files_count", Types.IntegerType.get()), - Types.NestedField.required(15, "added_delete_files_count", Types.IntegerType.get()), - Types.NestedField.required(16, "existing_delete_files_count", Types.IntegerType.get()), - Types.NestedField.required(17, "deleted_delete_files_count", Types.IntegerType.get()), - Types.NestedField.optional(8, "partition_summaries", Types.ListType.ofRequired(9, Types.StructType.of( - Types.NestedField.required(10, "contains_null", Types.BooleanType.get()), - Types.NestedField.required(11, "contains_nan", Types.BooleanType.get()), - Types.NestedField.optional(12, "lower_bound", Types.StringType.get()), - Types.NestedField.optional(13, "upper_bound", Types.StringType.get()) - ))), - Types.NestedField.required(REF_SNAPSHOT_ID, "reference_snapshot_id", Types.LongType.get()) - ); + private static final Schema MANIFEST_FILE_SCHEMA = + new Schema( + Types.NestedField.required(14, "content", Types.IntegerType.get()), + Types.NestedField.required(1, "path", Types.StringType.get()), + Types.NestedField.required(2, "length", Types.LongType.get()), + Types.NestedField.optional(3, "partition_spec_id", Types.IntegerType.get()), + Types.NestedField.optional(4, "added_snapshot_id", Types.LongType.get()), + Types.NestedField.optional(5, "added_data_files_count", Types.IntegerType.get()), + Types.NestedField.optional(6, "existing_data_files_count", Types.IntegerType.get()), + Types.NestedField.optional(7, "deleted_data_files_count", Types.IntegerType.get()), + Types.NestedField.required(15, "added_delete_files_count", Types.IntegerType.get()), + Types.NestedField.required(16, "existing_delete_files_count", Types.IntegerType.get()), + Types.NestedField.required(17, "deleted_delete_files_count", Types.IntegerType.get()), + Types.NestedField.optional( + 8, + "partition_summaries", + Types.ListType.ofRequired( + 9, + Types.StructType.of( + Types.NestedField.required(10, "contains_null", Types.BooleanType.get()), + Types.NestedField.required(11, "contains_nan", Types.BooleanType.get()), + Types.NestedField.optional(12, "lower_bound", Types.StringType.get()), + Types.NestedField.optional(13, "upper_bound", Types.StringType.get())))), + Types.NestedField.required( + REF_SNAPSHOT_ID, "reference_snapshot_id", Types.LongType.get())); AllManifestsTable(TableOperations ops, Table table) { this(ops, table, table.name() + ".all_manifests"); @@ -101,14 +106,14 @@ public static class AllManifestsTableScan extends BaseAllMetadataTableScan { super(ops, table, fileSchema, MetadataTableType.ALL_MANIFESTS); } - private AllManifestsTableScan(TableOperations ops, Table table, Schema schema, - TableScanContext context) { + private AllManifestsTableScan( + TableOperations ops, Table table, Schema schema, TableScanContext context) { 
super(ops, table, schema, MetadataTableType.ALL_MANIFESTS, context); } @Override - protected TableScan newRefinedScan(TableOperations ops, Table table, Schema schema, - TableScanContext context) { + protected TableScan newRefinedScan( + TableOperations ops, Table table, Schema schema, TableScanContext context) { return new AllManifestsTableScan(ops, table, schema, context); } @@ -121,28 +126,40 @@ protected CloseableIterable doPlanFiles() { Expression filter = shouldIgnoreResiduals() ? Expressions.alwaysTrue() : filter(); ResidualEvaluator residuals = ResidualEvaluator.unpartitioned(filter); - SnapshotEvaluator snapshotEvaluator = new SnapshotEvaluator(filter, MANIFEST_FILE_SCHEMA.asStruct(), - isCaseSensitive()); - Iterable filteredSnapshots = Iterables.filter(table().snapshots(), snapshotEvaluator::eval); - - return CloseableIterable.withNoopClose(Iterables.transform(filteredSnapshots, snap -> { - if (snap.manifestListLocation() != null) { - DataFile manifestListAsDataFile = DataFiles.builder(PartitionSpec.unpartitioned()) - .withInputFile(io.newInputFile(snap.manifestListLocation())) - .withRecordCount(1) - .withFormat(FileFormat.AVRO) - .build(); - return new ManifestListReadTask(io, schema(), specs, - new BaseFileScanTask(manifestListAsDataFile, null, schemaString, specString, residuals), - snap.snapshotId()); - } else { - return StaticDataTask.of( - io.newInputFile(tableOps().current().metadataFileLocation()), - MANIFEST_FILE_SCHEMA, schema(), snap.allManifests(io), - manifest -> manifestFileToRow(specs.get(manifest.partitionSpecId()), manifest, snap.snapshotId()) - ); - } - })); + SnapshotEvaluator snapshotEvaluator = + new SnapshotEvaluator(filter, MANIFEST_FILE_SCHEMA.asStruct(), isCaseSensitive()); + Iterable filteredSnapshots = + Iterables.filter(table().snapshots(), snapshotEvaluator::eval); + + return CloseableIterable.withNoopClose( + Iterables.transform( + filteredSnapshots, + snap -> { + if (snap.manifestListLocation() != null) { + DataFile manifestListAsDataFile = + DataFiles.builder(PartitionSpec.unpartitioned()) + .withInputFile(io.newInputFile(snap.manifestListLocation())) + .withRecordCount(1) + .withFormat(FileFormat.AVRO) + .build(); + return new ManifestListReadTask( + io, + schema(), + specs, + new BaseFileScanTask( + manifestListAsDataFile, null, schemaString, specString, residuals), + snap.snapshotId()); + } else { + return StaticDataTask.of( + io.newInputFile(tableOps().current().metadataFileLocation()), + MANIFEST_FILE_SCHEMA, + schema(), + snap.allManifests(io), + manifest -> + manifestFileToRow( + specs.get(manifest.partitionSpecId()), manifest, snap.snapshotId())); + } + })); } } @@ -153,8 +170,12 @@ static class ManifestListReadTask implements DataTask { private final FileScanTask manifestListTask; private final long referenceSnapshotId; - ManifestListReadTask(FileIO io, Schema schema, Map specs, FileScanTask manifestListTask, - long referenceSnapshotId) { + ManifestListReadTask( + FileIO io, + Schema schema, + Map specs, + FileScanTask manifestListTask, + long referenceSnapshotId) { this.io = io; this.schema = schema; this.specs = specs; @@ -169,24 +190,29 @@ public List deletes() { @Override public CloseableIterable rows() { - try (CloseableIterable manifests = Avro - .read(io.newInputFile(manifestListTask.file().path().toString())) - .rename("manifest_file", GenericManifestFile.class.getName()) - .rename("partitions", GenericPartitionFieldSummary.class.getName()) - .rename("r508", GenericPartitionFieldSummary.class.getName()) - 
.project(ManifestFile.schema()) - .classLoader(GenericManifestFile.class.getClassLoader()) - .reuseContainers(false) - .build()) { - - CloseableIterable rowIterable = CloseableIterable.transform(manifests, - manifest -> manifestFileToRow(specs.get(manifest.partitionSpecId()), manifest, referenceSnapshotId)); + try (CloseableIterable manifests = + Avro.read(io.newInputFile(manifestListTask.file().path().toString())) + .rename("manifest_file", GenericManifestFile.class.getName()) + .rename("partitions", GenericPartitionFieldSummary.class.getName()) + .rename("r508", GenericPartitionFieldSummary.class.getName()) + .project(ManifestFile.schema()) + .classLoader(GenericManifestFile.class.getClassLoader()) + .reuseContainers(false) + .build()) { + + CloseableIterable rowIterable = + CloseableIterable.transform( + manifests, + manifest -> + manifestFileToRow( + specs.get(manifest.partitionSpecId()), manifest, referenceSnapshotId)); StructProjection projection = StructProjection.create(MANIFEST_FILE_SCHEMA, schema); return CloseableIterable.transform(rowIterable, projection::wrap); } catch (IOException e) { - throw new RuntimeIOException(e, "Cannot read manifest list file: %s", manifestListTask.file().path()); + throw new RuntimeIOException( + e, "Cannot read manifest list file: %s", manifestListTask.file().path()); } } @@ -221,7 +247,8 @@ public Iterable split(long splitSize) { } } - static StaticDataTask.Row manifestFileToRow(PartitionSpec spec, ManifestFile manifest, long referenceSnapshotId) { + static StaticDataTask.Row manifestFileToRow( + PartitionSpec spec, ManifestFile manifest, long referenceSnapshotId) { return StaticDataTask.Row.of( manifest.content().id(), manifest.path(), @@ -235,8 +262,7 @@ static StaticDataTask.Row manifestFileToRow(PartitionSpec spec, ManifestFile man manifest.content() == ManifestContent.DELETES ? manifest.existingFilesCount() : 0, manifest.content() == ManifestContent.DELETES ? manifest.deletedFilesCount() : 0, ManifestsTable.partitionSummariesToRows(spec, manifest.partitions()), - referenceSnapshotId - ); + referenceSnapshotId); } private static class SnapshotEvaluator { @@ -378,13 +404,15 @@ public Boolean notStartsWith(BoundReference ref, Literal lit) { /** * Comparison of snapshot reference and literal, using long comparator. * - * @param ref bound reference, comparison attempted only if reference is for reference_snapshot_id - * @param lit literal value to compare with snapshot id. - * @param desiredResult function to apply to long comparator result, returns true if result is as expected. + * @param ref bound reference, comparison attempted only if reference is for + * reference_snapshot_id + * @param lit literal value to compare with snapshot id. + * @param desiredResult function to apply to long comparator result, returns true if result is + * as expected. 
* @return false if comparator does not achieve desired result, true otherwise */ - private Boolean compareSnapshotRef(BoundReference ref, Literal lit, - Function desiredResult) { + private Boolean compareSnapshotRef( + BoundReference ref, Literal lit, Function desiredResult) { if (isSnapshotRef(ref)) { Literal longLit = lit.to(Types.LongType.get()); int cmp = longLit.comparator().compare(snapshotId, longLit.value()); diff --git a/core/src/main/java/org/apache/iceberg/BaseAddedRowsScanTask.java b/core/src/main/java/org/apache/iceberg/BaseAddedRowsScanTask.java index e671044ee360..7e4cc719e44f 100644 --- a/core/src/main/java/org/apache/iceberg/BaseAddedRowsScanTask.java +++ b/core/src/main/java/org/apache/iceberg/BaseAddedRowsScanTask.java @@ -16,21 +16,25 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg; import java.util.List; import org.apache.iceberg.expressions.ResidualEvaluator; import org.apache.iceberg.relocated.com.google.common.collect.ImmutableList; -class BaseAddedRowsScanTask - extends BaseChangelogContentScanTask +class BaseAddedRowsScanTask extends BaseChangelogContentScanTask implements AddedRowsScanTask { private final DeleteFile[] deletes; - BaseAddedRowsScanTask(int changeOrdinal, long commitSnapshotId, DataFile file, DeleteFile[] deletes, - String schemaString, String specString, ResidualEvaluator residuals) { + BaseAddedRowsScanTask( + int changeOrdinal, + long commitSnapshotId, + DataFile file, + DeleteFile[] deletes, + String schemaString, + String specString, + ResidualEvaluator residuals) { super(changeOrdinal, commitSnapshotId, file, schemaString, specString, residuals); this.deletes = deletes != null ? deletes : new DeleteFile[0]; } diff --git a/core/src/main/java/org/apache/iceberg/BaseAllMetadataTableScan.java b/core/src/main/java/org/apache/iceberg/BaseAllMetadataTableScan.java index 7d97e743ab4f..cd74ea348f9d 100644 --- a/core/src/main/java/org/apache/iceberg/BaseAllMetadataTableScan.java +++ b/core/src/main/java/org/apache/iceberg/BaseAllMetadataTableScan.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
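As context for the all_* metadata tables touched above (all_data_files, all_delete_files, all_entries, all_files, all_manifests): they are usually queried through the metadata-table name in an engine, or programmatically from core. A rough sketch, assuming the MetadataTableUtils helper and its (Table, MetadataTableType) overload:

import java.io.IOException;
import java.io.UncheckedIOException;
import org.apache.iceberg.FileScanTask;
import org.apache.iceberg.MetadataTableType;
import org.apache.iceberg.MetadataTableUtils;
import org.apache.iceberg.Table;
import org.apache.iceberg.io.CloseableIterable;

public class AllManifestsScanExample {
  // Plans a scan of the base table's all_manifests metadata table and counts the planned tasks.
  static long plannedTaskCount(Table table) {
    Table allManifests =
        MetadataTableUtils.createMetadataTableInstance(table, MetadataTableType.ALL_MANIFESTS);
    long count = 0;
    try (CloseableIterable<FileScanTask> tasks = allManifests.newScan().planFiles()) {
      for (FileScanTask ignored : tasks) {
        count++;
      }
    } catch (IOException e) {
      throw new UncheckedIOException(e);
    }
    return count;
  }
}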
*/ - package org.apache.iceberg; import java.io.IOException; @@ -35,12 +34,17 @@ abstract class BaseAllMetadataTableScan extends BaseMetadataTableScan { private static final Logger LOG = LoggerFactory.getLogger(BaseAllMetadataTableScan.class); - BaseAllMetadataTableScan(TableOperations ops, Table table, Schema schema, MetadataTableType tableType) { + BaseAllMetadataTableScan( + TableOperations ops, Table table, Schema schema, MetadataTableType tableType) { super(ops, table, schema, tableType); } - BaseAllMetadataTableScan(TableOperations ops, Table table, Schema schema, MetadataTableType tableType, - TableScanContext context) { + BaseAllMetadataTableScan( + TableOperations ops, + Table table, + Schema schema, + MetadataTableType tableType, + TableScanContext context) { super(ops, table, schema, tableType, context); } @@ -56,17 +60,23 @@ public TableScan asOfTime(long timestampMillis) { @Override public CloseableIterable planFiles() { - LOG.info("Scanning metadata table {} with filter {}.", table(), ExpressionUtil.toSanitizedString(filter())); + LOG.info( + "Scanning metadata table {} with filter {}.", + table(), + ExpressionUtil.toSanitizedString(filter())); Listeners.notifyAll(new ScanEvent(table().name(), 0L, filter(), schema())); return doPlanFiles(); } - protected CloseableIterable reachableManifests(Function> toManifests) { + protected CloseableIterable reachableManifests( + Function> toManifests) { Iterable snapshots = table().snapshots(); - Iterable> manifestIterables = Iterables.transform(snapshots, toManifests); + Iterable> manifestIterables = + Iterables.transform(snapshots, toManifests); - try (CloseableIterable iterable = new ParallelIterable<>(manifestIterables, planExecutor())) { + try (CloseableIterable iterable = + new ParallelIterable<>(manifestIterables, planExecutor())) { return CloseableIterable.withNoopClose(Sets.newHashSet(iterable)); } catch (IOException e) { throw new UncheckedIOException("Failed to close parallel iterable", e); diff --git a/core/src/main/java/org/apache/iceberg/BaseChangelogContentScanTask.java b/core/src/main/java/org/apache/iceberg/BaseChangelogContentScanTask.java index e1faae55a3b0..64c8cae08019 100644 --- a/core/src/main/java/org/apache/iceberg/BaseChangelogContentScanTask.java +++ b/core/src/main/java/org/apache/iceberg/BaseChangelogContentScanTask.java @@ -16,23 +16,26 @@ * specific language governing permissions and limitations * under the License. 
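The reachableManifests change above keeps the same idea as before: gather the manifests of every snapshot and deduplicate them, since one manifest is commonly referenced by many snapshots (which is also why the all_* tables may return duplicate rows). A simplified, sequential sketch of that idea; the real code fans out with ParallelIterable on the scan's planning executor.

import java.util.HashSet;
import java.util.Set;
import org.apache.iceberg.ManifestFile;
import org.apache.iceberg.Snapshot;
import org.apache.iceberg.Table;

public class ReachableManifestsSketch {
  // Collects every manifest referenced by any snapshot; the Set removes duplicates.
  static Set<ManifestFile> reachableManifests(Table table) {
    Set<ManifestFile> unique = new HashSet<>();
    for (Snapshot snapshot : table.snapshots()) {
      unique.addAll(snapshot.allManifests(table.io()));
    }
    return unique;
  }
}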
*/ - package org.apache.iceberg; import org.apache.iceberg.expressions.Expression; import org.apache.iceberg.expressions.ResidualEvaluator; import org.apache.iceberg.relocated.com.google.common.base.MoreObjects; -abstract class BaseChangelogContentScanTask - & ChangelogScanTask, F extends ContentFile> - extends BaseContentScanTask - implements ChangelogScanTask { +abstract class BaseChangelogContentScanTask< + ThisT extends ContentScanTask & ChangelogScanTask, F extends ContentFile> + extends BaseContentScanTask implements ChangelogScanTask { private final int changeOrdinal; private final long commitSnapshotId; - BaseChangelogContentScanTask(int changeOrdinal, long commitSnapshotId, F file, - String schemaString, String specString, ResidualEvaluator residuals) { + BaseChangelogContentScanTask( + int changeOrdinal, + long commitSnapshotId, + F file, + String schemaString, + String specString, + ResidualEvaluator residuals) { super(file, schemaString, specString, residuals); this.changeOrdinal = changeOrdinal; this.commitSnapshotId = commitSnapshotId; @@ -59,8 +62,8 @@ public String toString() { .toString(); } - abstract static class SplitScanTask - & ChangelogScanTask, F extends ContentFile> + abstract static class SplitScanTask< + ThisT, ParentT extends ContentScanTask & ChangelogScanTask, F extends ContentFile> implements ContentScanTask, ChangelogScanTask, MergeableScanTask { private final ParentT parentTask; @@ -118,10 +121,10 @@ public Expression residual() { public boolean canMerge(ScanTask other) { if (getClass().equals(other.getClass())) { SplitScanTask that = (SplitScanTask) other; - return changeOrdinal() == that.changeOrdinal() && - commitSnapshotId() == that.commitSnapshotId() && - file().equals(that.file()) && - start() + length() == that.start(); + return changeOrdinal() == that.changeOrdinal() + && commitSnapshotId() == that.commitSnapshotId() + && file().equals(that.file()) + && start() + length() == that.start(); } else { return false; diff --git a/core/src/main/java/org/apache/iceberg/BaseCombinedScanTask.java b/core/src/main/java/org/apache/iceberg/BaseCombinedScanTask.java index aa65bf7c4ce8..da7e3d4b39b4 100644 --- a/core/src/main/java/org/apache/iceberg/BaseCombinedScanTask.java +++ b/core/src/main/java/org/apache/iceberg/BaseCombinedScanTask.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg; import java.util.Collection; @@ -47,8 +46,6 @@ public Collection files() { @Override public String toString() { - return MoreObjects.toStringHelper(this) - .add("tasks", Joiner.on(", ").join(tasks)) - .toString(); + return MoreObjects.toStringHelper(this).add("tasks", Joiner.on(", ").join(tasks)).toString(); } } diff --git a/core/src/main/java/org/apache/iceberg/BaseContentScanTask.java b/core/src/main/java/org/apache/iceberg/BaseContentScanTask.java index c04334509558..e15b2b3f85d6 100644 --- a/core/src/main/java/org/apache/iceberg/BaseContentScanTask.java +++ b/core/src/main/java/org/apache/iceberg/BaseContentScanTask.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg; import org.apache.iceberg.expressions.Expression; @@ -84,9 +83,13 @@ public Expression residual() { public Iterable split(long targetSplitSize) { if (file.format().isSplittable()) { if (file.splitOffsets() != null && OFFSET_ORDERING.isOrdered(file.splitOffsets())) { - return () -> new OffsetsAwareSplitScanTaskIterator<>(self(), length(), file.splitOffsets(), this::newSplitTask); + return () -> + new OffsetsAwareSplitScanTaskIterator<>( + self(), length(), file.splitOffsets(), this::newSplitTask); } else { - return () -> new FixedSizeSplitScanTaskIterator<>(self(), length(), targetSplitSize, this::newSplitTask); + return () -> + new FixedSizeSplitScanTaskIterator<>( + self(), length(), targetSplitSize, this::newSplitTask); } } diff --git a/core/src/main/java/org/apache/iceberg/BaseDeletedDataFileScanTask.java b/core/src/main/java/org/apache/iceberg/BaseDeletedDataFileScanTask.java index 22591b1ea0b7..2c15cbbbfd90 100644 --- a/core/src/main/java/org/apache/iceberg/BaseDeletedDataFileScanTask.java +++ b/core/src/main/java/org/apache/iceberg/BaseDeletedDataFileScanTask.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg; import java.util.List; @@ -29,8 +28,14 @@ class BaseDeletedDataFileScanTask private final DeleteFile[] deletes; - BaseDeletedDataFileScanTask(int changeOrdinal, long commitSnapshotId, DataFile file, DeleteFile[] deletes, - String schemaString, String specString, ResidualEvaluator residuals) { + BaseDeletedDataFileScanTask( + int changeOrdinal, + long commitSnapshotId, + DataFile file, + DeleteFile[] deletes, + String schemaString, + String specString, + ResidualEvaluator residuals) { super(changeOrdinal, commitSnapshotId, file, schemaString, specString, residuals); this.deletes = deletes != null ? deletes : new DeleteFile[0]; } @@ -41,7 +46,8 @@ protected DeletedDataFileScanTask self() { } @Override - protected DeletedDataFileScanTask newSplitTask(DeletedDataFileScanTask parentTask, long offset, long length) { + protected DeletedDataFileScanTask newSplitTask( + DeletedDataFileScanTask parentTask, long offset, long length) { return new SplitDeletedDataFileScanTask(parentTask, offset, length); } diff --git a/core/src/main/java/org/apache/iceberg/BaseDeletedRowsScanTask.java b/core/src/main/java/org/apache/iceberg/BaseDeletedRowsScanTask.java index 66bd9d36f7d9..69272cedac0b 100644 --- a/core/src/main/java/org/apache/iceberg/BaseDeletedRowsScanTask.java +++ b/core/src/main/java/org/apache/iceberg/BaseDeletedRowsScanTask.java @@ -16,23 +16,27 @@ * specific language governing permissions and limitations * under the License. 
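On the BaseContentScanTask#split change above: when a file carries ordered split offsets those boundaries are used, otherwise the task is cut into fixed-size ranges. A small illustration of the fixed-size fallback; the exact iterator behavior is assumed here, not taken from this diff.

public class FixedSizeSplitSketch {
  // Cuts [0, length) into ranges of at most targetSplitSize; returns {offset, length} pairs.
  static long[][] fixedSizeSplits(long length, long targetSplitSize) {
    int count = (int) ((length + targetSplitSize - 1) / targetSplitSize); // ceil
    long[][] splits = new long[count][2];
    for (int i = 0; i < count; i++) {
      long offset = i * targetSplitSize;
      splits[i][0] = offset;
      splits[i][1] = Math.min(targetSplitSize, length - offset);
    }
    return splits;
  }

  public static void main(String[] args) {
    // A 300 MiB file with a 128 MiB target split size yields 3 splits: 128, 128, 44 MiB.
    for (long[] split : fixedSizeSplits(300L << 20, 128L << 20)) {
      System.out.println("offset=" + split[0] + " length=" + split[1]);
    }
  }
}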
*/ - package org.apache.iceberg; import java.util.List; import org.apache.iceberg.expressions.ResidualEvaluator; import org.apache.iceberg.relocated.com.google.common.collect.ImmutableList; -class BaseDeletedRowsScanTask - extends BaseChangelogContentScanTask +class BaseDeletedRowsScanTask extends BaseChangelogContentScanTask implements DeletedRowsScanTask { private final DeleteFile[] addedDeletes; private final DeleteFile[] existingDeletes; - BaseDeletedRowsScanTask(int changeOrdinal, long commitSnapshotId, DataFile file, - DeleteFile[] addedDeletes, DeleteFile[] existingDeletes, - String schemaString, String specString, ResidualEvaluator residuals) { + BaseDeletedRowsScanTask( + int changeOrdinal, + long commitSnapshotId, + DataFile file, + DeleteFile[] addedDeletes, + DeleteFile[] existingDeletes, + String schemaString, + String specString, + ResidualEvaluator residuals) { super(changeOrdinal, commitSnapshotId, file, schemaString, specString, residuals); this.addedDeletes = addedDeletes != null ? addedDeletes : new DeleteFile[0]; this.existingDeletes = existingDeletes != null ? existingDeletes : new DeleteFile[0]; @@ -44,7 +48,8 @@ protected DeletedRowsScanTask self() { } @Override - protected DeletedRowsScanTask newSplitTask(DeletedRowsScanTask parentTask, long offset, long length) { + protected DeletedRowsScanTask newSplitTask( + DeletedRowsScanTask parentTask, long offset, long length) { return new SplitDeletedRowsScanTask(parentTask, offset, length); } diff --git a/core/src/main/java/org/apache/iceberg/BaseFile.java b/core/src/main/java/org/apache/iceberg/BaseFile.java index b566b77e1331..21582bd044c9 100644 --- a/core/src/main/java/org/apache/iceberg/BaseFile.java +++ b/core/src/main/java/org/apache/iceberg/BaseFile.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg; import java.io.Serializable; @@ -37,18 +36,21 @@ import org.apache.iceberg.util.ByteBuffers; import org.apache.iceberg.util.SerializableMap; -/** - * Base class for both {@link DataFile} and {@link DeleteFile}. - */ +/** Base class for both {@link DataFile} and {@link DeleteFile}. */ abstract class BaseFile - implements ContentFile, IndexedRecord, StructLike, SpecificData.SchemaConstructable, Serializable { + implements ContentFile, + IndexedRecord, + StructLike, + SpecificData.SchemaConstructable, + Serializable { static final Types.StructType EMPTY_STRUCT_TYPE = Types.StructType.of(); - static final PartitionData EMPTY_PARTITION_DATA = new PartitionData(EMPTY_STRUCT_TYPE) { - @Override - public PartitionData copy() { - return this; // this does not change - } - }; + static final PartitionData EMPTY_PARTITION_DATA = + new PartitionData(EMPTY_STRUCT_TYPE) { + @Override + public PartitionData copy() { + return this; // this does not change + } + }; private int[] fromProjectionPos; private Types.StructType partitionType; @@ -77,9 +79,7 @@ public PartitionData copy() { // cached schema private transient Schema avroSchema = null; - /** - * Used by Avro reflection to instantiate this class when reading manifest files. - */ + /** Used by Avro reflection to instantiate this class when reading manifest files. 
*/ BaseFile(Schema avroSchema) { this.avroSchema = avroSchema; @@ -116,12 +116,24 @@ public PartitionData copy() { this.partitionData = new PartitionData(partitionType); } - BaseFile(int specId, FileContent content, String filePath, FileFormat format, - PartitionData partition, long fileSizeInBytes, long recordCount, - Map columnSizes, Map valueCounts, - Map nullValueCounts, Map nanValueCounts, - Map lowerBounds, Map upperBounds, List splitOffsets, - int[] equalityFieldIds, Integer sortOrderId, ByteBuffer keyMetadata) { + BaseFile( + int specId, + FileContent content, + String filePath, + FileFormat format, + PartitionData partition, + long fileSizeInBytes, + long recordCount, + Map columnSizes, + Map valueCounts, + Map nullValueCounts, + Map nanValueCounts, + Map lowerBounds, + Map upperBounds, + List splitOffsets, + int[] equalityFieldIds, + Integer sortOrderId, + ByteBuffer keyMetadata) { this.partitionSpecId = specId; this.content = content; this.filePath = filePath; @@ -183,18 +195,23 @@ public PartitionData copy() { this.upperBounds = null; } this.fromProjectionPos = toCopy.fromProjectionPos; - this.keyMetadata = toCopy.keyMetadata == null ? null : Arrays.copyOf(toCopy.keyMetadata, toCopy.keyMetadata.length); - this.splitOffsets = toCopy.splitOffsets == null ? null : - Arrays.copyOf(toCopy.splitOffsets, toCopy.splitOffsets.length); - this.equalityIds = toCopy.equalityIds != null ? Arrays.copyOf(toCopy.equalityIds, toCopy.equalityIds.length) : null; + this.keyMetadata = + toCopy.keyMetadata == null + ? null + : Arrays.copyOf(toCopy.keyMetadata, toCopy.keyMetadata.length); + this.splitOffsets = + toCopy.splitOffsets == null + ? null + : Arrays.copyOf(toCopy.splitOffsets, toCopy.splitOffsets.length); + this.equalityIds = + toCopy.equalityIds != null + ? Arrays.copyOf(toCopy.equalityIds, toCopy.equalityIds.length) + : null; this.sortOrderId = toCopy.sortOrderId; } - /** - * Constructor for Java serialization. - */ - BaseFile() { - } + /** Constructor for Java serialization. */ + BaseFile() {} @Override public int specId() { diff --git a/core/src/main/java/org/apache/iceberg/BaseFileScanTask.java b/core/src/main/java/org/apache/iceberg/BaseFileScanTask.java index 2105310ce055..2d7258be717a 100644 --- a/core/src/main/java/org/apache/iceberg/BaseFileScanTask.java +++ b/core/src/main/java/org/apache/iceberg/BaseFileScanTask.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg; import java.util.List; @@ -25,11 +24,16 @@ import org.apache.iceberg.relocated.com.google.common.annotations.VisibleForTesting; import org.apache.iceberg.relocated.com.google.common.collect.ImmutableList; -public class BaseFileScanTask extends BaseContentScanTask implements FileScanTask { +public class BaseFileScanTask extends BaseContentScanTask + implements FileScanTask { private final DeleteFile[] deletes; - public BaseFileScanTask(DataFile file, DeleteFile[] deletes, String schemaString, String specString, - ResidualEvaluator residuals) { + public BaseFileScanTask( + DataFile file, + DeleteFile[] deletes, + String schemaString, + String specString, + ResidualEvaluator residuals) { super(file, schemaString, specString, residuals); this.deletes = deletes != null ? 
deletes : new DeleteFile[0]; } diff --git a/core/src/main/java/org/apache/iceberg/BaseFilesTable.java b/core/src/main/java/org/apache/iceberg/BaseFilesTable.java index 477bbc782857..4925506fe99f 100644 --- a/core/src/main/java/org/apache/iceberg/BaseFilesTable.java +++ b/core/src/main/java/org/apache/iceberg/BaseFilesTable.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg; import com.github.benmanes.caffeine.cache.Caffeine; @@ -35,9 +34,7 @@ import org.apache.iceberg.types.TypeUtil; import org.apache.iceberg.types.Types.StructType; -/** - * Base class logic for files metadata tables - */ +/** Base class logic for files metadata tables */ abstract class BaseFilesTable extends BaseMetadataTable { BaseFilesTable(TableOperations ops, Table table, String name) { @@ -49,52 +46,66 @@ public Schema schema() { StructType partitionType = Partitioning.partitionType(table()); Schema schema = new Schema(DataFile.getType(partitionType).fields()); if (partitionType.fields().size() < 1) { - // avoid returning an empty struct, which is not always supported. instead, drop the partition field + // avoid returning an empty struct, which is not always supported. instead, drop the partition + // field return TypeUtil.selectNot(schema, Sets.newHashSet(DataFile.PARTITION_ID)); } else { return schema; } } - private static CloseableIterable planFiles(Table table, CloseableIterable manifests, - Schema tableSchema, Schema projectedSchema, - TableScanContext context) { + private static CloseableIterable planFiles( + Table table, + CloseableIterable manifests, + Schema tableSchema, + Schema projectedSchema, + TableScanContext context) { Expression rowFilter = context.rowFilter(); boolean caseSensitive = context.caseSensitive(); boolean ignoreResiduals = context.ignoreResiduals(); - LoadingCache evalCache = Caffeine.newBuilder().build(specId -> { - PartitionSpec spec = table.specs().get(specId); - PartitionSpec transformedSpec = BaseFilesTable.transformSpec(tableSchema, spec); - return ManifestEvaluator.forRowFilter(rowFilter, transformedSpec, caseSensitive); - }); + LoadingCache evalCache = + Caffeine.newBuilder() + .build( + specId -> { + PartitionSpec spec = table.specs().get(specId); + PartitionSpec transformedSpec = BaseFilesTable.transformSpec(tableSchema, spec); + return ManifestEvaluator.forRowFilter(rowFilter, transformedSpec, caseSensitive); + }); - CloseableIterable filteredManifests = CloseableIterable.filter(manifests, - manifest -> evalCache.get(manifest.partitionSpecId()).eval(manifest)); + CloseableIterable filteredManifests = + CloseableIterable.filter( + manifests, manifest -> evalCache.get(manifest.partitionSpecId()).eval(manifest)); String schemaString = SchemaParser.toJson(projectedSchema); String specString = PartitionSpecParser.toJson(PartitionSpec.unpartitioned()); Expression filter = ignoreResiduals ? 
Expressions.alwaysTrue() : rowFilter; ResidualEvaluator residuals = ResidualEvaluator.unpartitioned(filter); - return CloseableIterable.transform(filteredManifests, manifest -> - new ManifestReadTask(table, manifest, projectedSchema, schemaString, specString, residuals)); + return CloseableIterable.transform( + filteredManifests, + manifest -> + new ManifestReadTask( + table, manifest, projectedSchema, schemaString, specString, residuals)); } abstract static class BaseFilesTableScan extends BaseMetadataTableScan { - protected BaseFilesTableScan(TableOperations ops, Table table, Schema schema, MetadataTableType tableType) { + protected BaseFilesTableScan( + TableOperations ops, Table table, Schema schema, MetadataTableType tableType) { super(ops, table, schema, tableType); } - protected BaseFilesTableScan(TableOperations ops, Table table, Schema schema, - MetadataTableType tableType, TableScanContext context) { + protected BaseFilesTableScan( + TableOperations ops, + Table table, + Schema schema, + MetadataTableType tableType, + TableScanContext context) { super(ops, table, schema, tableType, context); } - /** - * Returns an iterable of manifest files to explore for this files metadata table scan - */ + /** Returns an iterable of manifest files to explore for this files metadata table scan */ protected abstract CloseableIterable manifests(); @Override @@ -105,18 +116,21 @@ protected CloseableIterable doPlanFiles() { abstract static class BaseAllFilesTableScan extends BaseAllMetadataTableScan { - protected BaseAllFilesTableScan(TableOperations ops, Table table, Schema schema, MetadataTableType tableType) { + protected BaseAllFilesTableScan( + TableOperations ops, Table table, Schema schema, MetadataTableType tableType) { super(ops, table, schema, tableType); } - protected BaseAllFilesTableScan(TableOperations ops, Table table, Schema schema, - MetadataTableType tableType, TableScanContext context) { + protected BaseAllFilesTableScan( + TableOperations ops, + Table table, + Schema schema, + MetadataTableType tableType, + TableScanContext context) { super(ops, table, schema, tableType, context); } - /** - * Returns an iterable of manifest files to explore for this all files metadata table scan - */ + /** Returns an iterable of manifest files to explore for this all files metadata table scan */ protected abstract CloseableIterable manifests(); @Override @@ -131,8 +145,13 @@ static class ManifestReadTask extends BaseFileScanTask implements DataTask { private final ManifestFile manifest; private final Schema schema; - ManifestReadTask(Table table, ManifestFile manifest, - Schema schema, String schemaString, String specString, ResidualEvaluator residuals) { + ManifestReadTask( + Table table, + ManifestFile manifest, + Schema schema, + String schemaString, + String specString, + ResidualEvaluator residuals) { super(DataFiles.fromManifest(manifest), null, schemaString, specString, residuals); this.io = table.io(); this.specsById = Maps.newHashMap(table.specs()); @@ -152,7 +171,8 @@ private CloseableIterable> manifestEntries() { case DELETES: return ManifestFiles.readDeleteManifest(manifest, io, specsById).project(schema); default: - throw new IllegalArgumentException("Unsupported manifest content type:" + manifest.content()); + throw new IllegalArgumentException( + "Unsupported manifest content type:" + manifest.content()); } } diff --git a/core/src/main/java/org/apache/iceberg/BaseIncrementalAppendScan.java b/core/src/main/java/org/apache/iceberg/BaseIncrementalAppendScan.java index 
9018b13000db..d8386bd98ef8 100644 --- a/core/src/main/java/org/apache/iceberg/BaseIncrementalAppendScan.java +++ b/core/src/main/java/org/apache/iceberg/BaseIncrementalAppendScan.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg; import java.util.List; @@ -40,7 +39,8 @@ class BaseIncrementalAppendScan this(ops, table, table.schema(), new TableScanContext()); } - BaseIncrementalAppendScan(TableOperations ops, Table table, Schema schema, TableScanContext context) { + BaseIncrementalAppendScan( + TableOperations ops, Table table, Schema schema, TableScanContext context) { super(ops, table, schema, context); } @@ -52,23 +52,27 @@ protected IncrementalAppendScan newRefinedScan( @Override public IncrementalAppendScan fromSnapshotInclusive(long fromSnapshotId) { - Preconditions.checkArgument(table().snapshot(fromSnapshotId) != null, - "Cannot find the starting snapshot: %s", fromSnapshotId); - return newRefinedScan(tableOps(), table(), schema(), context().fromSnapshotIdInclusive(fromSnapshotId)); + Preconditions.checkArgument( + table().snapshot(fromSnapshotId) != null, + "Cannot find the starting snapshot: %s", + fromSnapshotId); + return newRefinedScan( + tableOps(), table(), schema(), context().fromSnapshotIdInclusive(fromSnapshotId)); } @Override public IncrementalAppendScan fromSnapshotExclusive(long fromSnapshotId) { // for exclusive behavior, table().snapshot(fromSnapshotId) check can't be applied. // as fromSnapshotId could be matched to a parent snapshot that is already expired - return newRefinedScan(tableOps(), table(), schema(), context().fromSnapshotIdExclusive(fromSnapshotId)); + return newRefinedScan( + tableOps(), table(), schema(), context().fromSnapshotIdExclusive(fromSnapshotId)); } @Override public IncrementalAppendScan toSnapshot(long toSnapshotId) { - Preconditions.checkArgument(table().snapshot(toSnapshotId) != null, - "Cannot find end snapshot: %s", toSnapshotId); - return newRefinedScan(tableOps(), table(), schema(), context().toSnapshotId(toSnapshotId)); + Preconditions.checkArgument( + table().snapshot(toSnapshotId) != null, "Cannot find end snapshot: %s", toSnapshotId); + return newRefinedScan(tableOps(), table(), schema(), context().toSnapshotId(toSnapshotId)); } @Override @@ -76,26 +80,42 @@ public CloseableIterable planFiles() { Long fromSnapshotId = context().fromSnapshotId(); Long toSnapshotId = context().toSnapshotId(); if (fromSnapshotId == null && toSnapshotId == null && table().currentSnapshot() == null) { - // If it is an empty table (no current snapshot) and both from and to snapshots aren't set either, + // If it is an empty table (no current snapshot) and both from and to snapshots aren't set + // either, // simply return an empty iterable. In this case, listener notification is also skipped. return CloseableIterable.empty(); } long toSnapshotIdInclusive = toSnapshotIdInclusive(); - // fromSnapshotIdExclusive can be null. appendsBetween handles null fromSnapshotIdExclusive properly + // fromSnapshotIdExclusive can be null. appendsBetween handles null fromSnapshotIdExclusive + // properly // by finding the oldest ancestor of end snapshot. 
Long fromSnapshotIdExclusive = fromSnapshotIdExclusive(fromSnapshotId, toSnapshotIdInclusive); if (fromSnapshotIdExclusive != null) { - Listeners.notifyAll(new IncrementalScanEvent(table().name(), fromSnapshotIdExclusive, - toSnapshotIdInclusive, context().rowFilter(), table().schema(), false)); + Listeners.notifyAll( + new IncrementalScanEvent( + table().name(), + fromSnapshotIdExclusive, + toSnapshotIdInclusive, + context().rowFilter(), + table().schema(), + false)); } else { - Snapshot oldestAncestorSnapshot = SnapshotUtil.oldestAncestorOf(toSnapshotIdInclusive, table()::snapshot); - Listeners.notifyAll(new IncrementalScanEvent(table().name(), oldestAncestorSnapshot.snapshotId(), - toSnapshotIdInclusive, context().rowFilter(), table().schema(), true)); + Snapshot oldestAncestorSnapshot = + SnapshotUtil.oldestAncestorOf(toSnapshotIdInclusive, table()::snapshot); + Listeners.notifyAll( + new IncrementalScanEvent( + table().name(), + oldestAncestorSnapshot.snapshotId(), + toSnapshotIdInclusive, + context().rowFilter(), + table().schema(), + true)); } // appendsBetween handles null fromSnapshotId (exclusive) properly - List snapshots = appendsBetween(table(), fromSnapshotIdExclusive, toSnapshotIdInclusive); + List snapshots = + appendsBetween(table(), fromSnapshotIdExclusive, toSnapshotIdInclusive); if (snapshots.isEmpty()) { return CloseableIterable.empty(); } @@ -106,8 +126,10 @@ public CloseableIterable planFiles() { @Override public CloseableIterable planTasks() { CloseableIterable fileScanTasks = planFiles(); - CloseableIterable splitFiles = TableScanUtil.splitFiles(fileScanTasks, targetSplitSize()); - return TableScanUtil.planTasks(splitFiles, targetSplitSize(), splitLookback(), splitOpenFileCost()); + CloseableIterable splitFiles = + TableScanUtil.splitFiles(fileScanTasks, targetSplitSize()); + return TableScanUtil.planTasks( + splitFiles, targetSplitSize(), splitLookback(), splitOpenFileCost()); } private Long fromSnapshotIdExclusive(Long fromSnapshotId, long toSnapshotIdInclusive) { @@ -117,15 +139,18 @@ private Long fromSnapshotIdExclusive(Long fromSnapshotId, long toSnapshotIdInclu Preconditions.checkArgument( SnapshotUtil.isAncestorOf(table(), toSnapshotIdInclusive, fromSnapshotId), "Starting snapshot (inclusive) %s is not an ancestor of end snapshot %s", - fromSnapshotId, toSnapshotIdInclusive); - // for inclusive behavior fromSnapshotIdExclusive is set to the parent snapshot id, which can be null. + fromSnapshotId, + toSnapshotIdInclusive); + // for inclusive behavior fromSnapshotIdExclusive is set to the parent snapshot id, which + // can be null. 
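A minimal usage sketch of the snapshot boundaries handled above, assuming a Table instance and two valid snapshot ids; fromSnapshotExclusive, toSnapshot, and planFiles are the methods reformatted in this hunk, while Table.newIncrementalAppendScan() as the entry point and the surrounding wiring are illustrative assumptions rather than part of the change.

import org.apache.iceberg.FileScanTask;
import org.apache.iceberg.IncrementalAppendScan;
import org.apache.iceberg.Table;
import org.apache.iceberg.io.CloseableIterable;

class IncrementalAppendScanSketch {
  // Plans data files appended after startSnapshotId (exclusive) up to endSnapshotId (inclusive).
  static void printAppendedFiles(Table table, long startSnapshotId, long endSnapshotId)
      throws Exception {
    IncrementalAppendScan scan =
        table
            .newIncrementalAppendScan()
            .fromSnapshotExclusive(startSnapshotId) // may refer to an already expired parent snapshot
            .toSnapshot(endSnapshotId); // must resolve to an existing snapshot
    try (CloseableIterable<FileScanTask> tasks = scan.planFiles()) {
      tasks.forEach(task -> System.out.println(task.file().path()));
    }
  }
}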
return table().snapshot(fromSnapshotId).parentId(); } else { // validate the parent snapshot id an ancestor of toSnapshotId Preconditions.checkArgument( SnapshotUtil.isParentAncestorOf(table(), toSnapshotIdInclusive, fromSnapshotId), "Starting snapshot (exclusive) %s is not a parent ancestor of end snapshot %s", - fromSnapshotId, toSnapshotIdInclusive); + fromSnapshotId, + toSnapshotIdInclusive); return fromSnapshotId; } } else { @@ -138,7 +163,8 @@ private long toSnapshotIdInclusive() { return context().toSnapshotId(); } else { Snapshot currentSnapshot = table().currentSnapshot(); - Preconditions.checkArgument(currentSnapshot != null, + Preconditions.checkArgument( + currentSnapshot != null, "Invalid config: end snapshot is not set and table has no current snapshot"); return currentSnapshot.snapshotId(); } @@ -146,29 +172,33 @@ private long toSnapshotIdInclusive() { private CloseableIterable appendFilesFromSnapshots(List snapshots) { Set snapshotIds = Sets.newHashSet(Iterables.transform(snapshots, Snapshot::snapshotId)); - Set manifests = FluentIterable - .from(snapshots) - .transformAndConcat(Snapshot::dataManifests) - .filter(manifestFile -> snapshotIds.contains(manifestFile.snapshotId())) - .toSet(); - - ManifestGroup manifestGroup = new ManifestGroup(tableOps().io(), manifests) - .caseSensitive(context().caseSensitive()) - .select(context().returnColumnStats() ? DataTableScan.SCAN_WITH_STATS_COLUMNS : DataTableScan.SCAN_COLUMNS) - .filterData(context().rowFilter()) - .filterManifestEntries( - manifestEntry -> - snapshotIds.contains(manifestEntry.snapshotId()) && - manifestEntry.status() == ManifestEntry.Status.ADDED) - .specsById(tableOps().current().specsById()) - .ignoreDeleted(); + Set manifests = + FluentIterable.from(snapshots) + .transformAndConcat(Snapshot::dataManifests) + .filter(manifestFile -> snapshotIds.contains(manifestFile.snapshotId())) + .toSet(); + + ManifestGroup manifestGroup = + new ManifestGroup(tableOps().io(), manifests) + .caseSensitive(context().caseSensitive()) + .select( + context().returnColumnStats() + ? 
DataTableScan.SCAN_WITH_STATS_COLUMNS + : DataTableScan.SCAN_COLUMNS) + .filterData(context().rowFilter()) + .filterManifestEntries( + manifestEntry -> + snapshotIds.contains(manifestEntry.snapshotId()) + && manifestEntry.status() == ManifestEntry.Status.ADDED) + .specsById(tableOps().current().specsById()) + .ignoreDeleted(); if (context().ignoreResiduals()) { manifestGroup = manifestGroup.ignoreResiduals(); } - if (manifests.size() > 1 && - (DataTableScan.PLAN_SCANS_WITH_WORKER_POOL || context().planWithCustomizedExecutor())) { + if (manifests.size() > 1 + && (DataTableScan.PLAN_SCANS_WITH_WORKER_POOL || context().planWithCustomizedExecutor())) { manifestGroup = manifestGroup.planWith(context().planExecutor()); } @@ -176,12 +206,15 @@ private CloseableIterable appendFilesFromSnapshots(List } /** - * This method doesn't perform validation, which is already done by the caller {@link #planFiles()} + * This method doesn't perform validation, which is already done by the caller {@link + * #planFiles()} */ - private static List appendsBetween(Table table, Long fromSnapshotIdExclusive, long toSnapshotIdInclusive) { + private static List appendsBetween( + Table table, Long fromSnapshotIdExclusive, long toSnapshotIdInclusive) { List snapshots = Lists.newArrayList(); - for (Snapshot snapshot : SnapshotUtil.ancestorsBetween( - toSnapshotIdInclusive, fromSnapshotIdExclusive, table::snapshot)) { + for (Snapshot snapshot : + SnapshotUtil.ancestorsBetween( + toSnapshotIdInclusive, fromSnapshotIdExclusive, table::snapshot)) { if (snapshot.operation().equals(DataOperations.APPEND)) { snapshots.add(snapshot); } diff --git a/core/src/main/java/org/apache/iceberg/BaseMetadataTable.java b/core/src/main/java/org/apache/iceberg/BaseMetadataTable.java index a2c68b6533b0..b065ade32bcb 100644 --- a/core/src/main/java/org/apache/iceberg/BaseMetadataTable.java +++ b/core/src/main/java/org/apache/iceberg/BaseMetadataTable.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg; import java.io.Serializable; @@ -29,10 +28,10 @@ /** * Base class for metadata tables. - *
<p>
- * Serializing and deserializing a metadata table object returns a read only implementation of the metadata table - * using a {@link StaticTableOperations}. This way no Catalog related calls are needed when reading the table data after - * deserialization. + * + * <p>
Serializing and deserializing a metadata table object returns a read only implementation of + * the metadata table using a {@link StaticTableOperations}. This way no Catalog related calls are + * needed when reading the table data after deserialization. */ public abstract class BaseMetadataTable implements Table, HasTableOperations, Serializable { private final PartitionSpec spec = PartitionSpec.unpartitioned(); @@ -48,19 +47,22 @@ protected BaseMetadataTable(TableOperations ops, Table table, String name) { } /** - * This method transforms the table's partition spec to a spec that is used to rewrite the user-provided filter - * expression against the given metadata table. - *
<p>
- * The resulting partition spec maps partition.X fields to partition X using an identity partition transform. - * When this spec is used to project an expression for the given metadata table, the projection will remove - * predicates for non-partition fields (not in the spec) and will remove the "partition." prefix from fields. + * This method transforms the table's partition spec to a spec that is used to rewrite the + * user-provided filter expression against the given metadata table. + * + * <p>
The resulting partition spec maps partition.X fields to partition X using an identity + * partition transform. When this spec is used to project an expression for the given metadata + * table, the projection will remove predicates for non-partition fields (not in the spec) and + * will remove the "partition." prefix from fields. * * @param metadataTableSchema schema of the metadata table * @param spec spec on which the metadata table schema is based - * @return a spec used to rewrite the metadata table filters to partition filters using an inclusive projection + * @return a spec used to rewrite the metadata table filters to partition filters using an + * inclusive projection */ static PartitionSpec transformSpec(Schema metadataTableSchema, PartitionSpec spec) { - PartitionSpec.Builder identitySpecBuilder = PartitionSpec.builderFor(metadataTableSchema).checkConflicts(false); + PartitionSpec.Builder identitySpecBuilder = + PartitionSpec.builderFor(metadataTableSchema).checkConflicts(false); spec.fields().forEach(pf -> identitySpecBuilder.add(pf.fieldId(), pf.name(), "identity")); return identitySpecBuilder.build(); } diff --git a/core/src/main/java/org/apache/iceberg/BaseMetadataTableScan.java b/core/src/main/java/org/apache/iceberg/BaseMetadataTableScan.java index 5708207ac55d..7b7ecb66bb27 100644 --- a/core/src/main/java/org/apache/iceberg/BaseMetadataTableScan.java +++ b/core/src/main/java/org/apache/iceberg/BaseMetadataTableScan.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg; import org.apache.iceberg.util.PropertyUtil; @@ -25,22 +24,27 @@ abstract class BaseMetadataTableScan extends BaseTableScan { private final MetadataTableType tableType; - protected BaseMetadataTableScan(TableOperations ops, Table table, Schema schema, MetadataTableType tableType) { + protected BaseMetadataTableScan( + TableOperations ops, Table table, Schema schema, MetadataTableType tableType) { super(ops, table, schema); this.tableType = tableType; } - protected BaseMetadataTableScan(TableOperations ops, Table table, Schema schema, MetadataTableType tableType, - TableScanContext context) { + protected BaseMetadataTableScan( + TableOperations ops, + Table table, + Schema schema, + MetadataTableType tableType, + TableScanContext context) { super(ops, table, schema, context); this.tableType = tableType; } /** - * Type of scan being performed, such as {@link MetadataTableType#ALL_DATA_FILES} when scanning - * a table's {@link org.apache.iceberg.AllDataFilesTable}. - *
<p>
- * Used for logging and error messages. + * Type of scan being performed, such as {@link MetadataTableType#ALL_DATA_FILES} when scanning a + * table's {@link org.apache.iceberg.AllDataFilesTable}. + * + * <p>
Used for logging and error messages. */ protected MetadataTableType tableType() { return tableType; @@ -60,9 +64,11 @@ public TableScan appendsAfter(long fromSnapshotId) { @Override public long targetSplitSize() { - long tableValue = tableOps().current().propertyAsLong( - TableProperties.METADATA_SPLIT_SIZE, - TableProperties.METADATA_SPLIT_SIZE_DEFAULT); + long tableValue = + tableOps() + .current() + .propertyAsLong( + TableProperties.METADATA_SPLIT_SIZE, TableProperties.METADATA_SPLIT_SIZE_DEFAULT); return PropertyUtil.propertyAsLong(options(), TableProperties.SPLIT_SIZE, tableValue); } } diff --git a/core/src/main/java/org/apache/iceberg/BaseMetastoreCatalog.java b/core/src/main/java/org/apache/iceberg/BaseMetastoreCatalog.java index 7527bb7b58e2..16a5164f6f49 100644 --- a/core/src/main/java/org/apache/iceberg/BaseMetastoreCatalog.java +++ b/core/src/main/java/org/apache/iceberg/BaseMetastoreCatalog.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg; import java.util.Map; @@ -70,7 +69,8 @@ public Table loadTable(TableIdentifier identifier) { public Table registerTable(TableIdentifier identifier, String metadataFileLocation) { Preconditions.checkArgument( identifier != null && isValidIdentifier(identifier), "Invalid identifier: %s", identifier); - Preconditions.checkArgument(metadataFileLocation != null && !metadataFileLocation.isEmpty(), + Preconditions.checkArgument( + metadataFileLocation != null && !metadataFileLocation.isEmpty(), "Cannot register an empty metadata file location as a table"); // Throw an exception if this table already exists in the catalog. @@ -101,15 +101,16 @@ private Table loadMetadataTable(TableIdentifier identifier) { throw new NoSuchTableException("Table does not exist: %s", baseTableIdentifier); } - return MetadataTableUtils.createMetadataTableInstance(ops, name(), baseTableIdentifier, identifier, type); + return MetadataTableUtils.createMetadataTableInstance( + ops, name(), baseTableIdentifier, identifier, type); } else { throw new NoSuchTableException("Table does not exist: %s", identifier); } } private boolean isValidMetadataIdentifier(TableIdentifier identifier) { - return MetadataTableType.from(identifier.name()) != null && - isValidIdentifier(TableIdentifier.of(identifier.namespace().levels())); + return MetadataTableType.from(identifier.name()) != null + && isValidIdentifier(TableIdentifier.of(identifier.namespace().levels())); } protected boolean isValidIdentifier(TableIdentifier tableIdentifier) { @@ -139,7 +140,8 @@ protected class BaseMetastoreCatalogTableBuilder implements TableBuilder { private String location = null; public BaseMetastoreCatalogTableBuilder(TableIdentifier identifier, Schema schema) { - Preconditions.checkArgument(isValidIdentifier(identifier), "Invalid table identifier: %s", identifier); + Preconditions.checkArgument( + isValidIdentifier(identifier), "Invalid table identifier: %s", identifier); this.identifier = identifier; this.schema = schema; @@ -187,7 +189,8 @@ public Table create() { String baseLocation = location != null ? 
location : defaultWarehouseLocation(identifier); tableProperties.putAll(tableOverrideProperties()); - TableMetadata metadata = TableMetadata.newTableMetadata(schema, spec, sortOrder, baseLocation, tableProperties); + TableMetadata metadata = + TableMetadata.newTableMetadata(schema, spec, sortOrder, baseLocation, tableProperties); try { ops.commit(null, metadata); @@ -207,7 +210,8 @@ public Transaction createTransaction() { String baseLocation = location != null ? location : defaultWarehouseLocation(identifier); tableProperties.putAll(tableOverrideProperties()); - TableMetadata metadata = TableMetadata.newTableMetadata(schema, spec, sortOrder, baseLocation, tableProperties); + TableMetadata metadata = + TableMetadata.newTableMetadata(schema, spec, sortOrder, baseLocation, tableProperties); return Transactions.createTableTransaction(identifier.toString(), ops, metadata); } @@ -231,10 +235,12 @@ private Transaction newReplaceTableTransaction(boolean orCreate) { tableProperties.putAll(tableOverrideProperties()); if (ops.current() != null) { String baseLocation = location != null ? location : ops.current().location(); - metadata = ops.current().buildReplacement(schema, spec, sortOrder, baseLocation, tableProperties); + metadata = + ops.current().buildReplacement(schema, spec, sortOrder, baseLocation, tableProperties); } else { String baseLocation = location != null ? location : defaultWarehouseLocation(identifier); - metadata = TableMetadata.newTableMetadata(schema, spec, sortOrder, baseLocation, tableProperties); + metadata = + TableMetadata.newTableMetadata(schema, spec, sortOrder, baseLocation, tableProperties); } if (orCreate) { @@ -252,7 +258,9 @@ private Transaction newReplaceTableTransaction(boolean orCreate) { private Map tableDefaultProperties() { Map tableDefaultProperties = PropertyUtil.propertiesWithPrefix(properties(), CatalogProperties.TABLE_DEFAULT_PREFIX); - LOG.info("Table properties set at catalog level through catalog properties: {}", tableDefaultProperties); + LOG.info( + "Table properties set at catalog level through catalog properties: {}", + tableDefaultProperties); return tableDefaultProperties; } @@ -264,7 +272,9 @@ private Map tableDefaultProperties() { private Map tableOverrideProperties() { Map tableOverrideProperties = PropertyUtil.propertiesWithPrefix(properties(), CatalogProperties.TABLE_OVERRIDE_PREFIX); - LOG.info("Table properties enforced at catalog level through catalog properties: {}", tableOverrideProperties); + LOG.info( + "Table properties enforced at catalog level through catalog properties: {}", + tableOverrideProperties); return tableOverrideProperties; } } diff --git a/core/src/main/java/org/apache/iceberg/BaseMetastoreTableOperations.java b/core/src/main/java/org/apache/iceberg/BaseMetastoreTableOperations.java index bfc91416ed41..d2a2a48ed81b 100644 --- a/core/src/main/java/org/apache/iceberg/BaseMetastoreTableOperations.java +++ b/core/src/main/java/org/apache/iceberg/BaseMetastoreTableOperations.java @@ -16,9 +16,17 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg; +import static org.apache.iceberg.TableProperties.COMMIT_NUM_STATUS_CHECKS; +import static org.apache.iceberg.TableProperties.COMMIT_NUM_STATUS_CHECKS_DEFAULT; +import static org.apache.iceberg.TableProperties.COMMIT_STATUS_CHECKS_MAX_WAIT_MS; +import static org.apache.iceberg.TableProperties.COMMIT_STATUS_CHECKS_MAX_WAIT_MS_DEFAULT; +import static org.apache.iceberg.TableProperties.COMMIT_STATUS_CHECKS_MIN_WAIT_MS; +import static org.apache.iceberg.TableProperties.COMMIT_STATUS_CHECKS_MIN_WAIT_MS_DEFAULT; +import static org.apache.iceberg.TableProperties.COMMIT_STATUS_CHECKS_TOTAL_WAIT_MS; +import static org.apache.iceberg.TableProperties.COMMIT_STATUS_CHECKS_TOTAL_WAIT_MS_DEFAULT; + import java.util.Set; import java.util.UUID; import java.util.concurrent.atomic.AtomicReference; @@ -39,15 +47,6 @@ import org.slf4j.Logger; import org.slf4j.LoggerFactory; -import static org.apache.iceberg.TableProperties.COMMIT_NUM_STATUS_CHECKS; -import static org.apache.iceberg.TableProperties.COMMIT_NUM_STATUS_CHECKS_DEFAULT; -import static org.apache.iceberg.TableProperties.COMMIT_STATUS_CHECKS_MAX_WAIT_MS; -import static org.apache.iceberg.TableProperties.COMMIT_STATUS_CHECKS_MAX_WAIT_MS_DEFAULT; -import static org.apache.iceberg.TableProperties.COMMIT_STATUS_CHECKS_MIN_WAIT_MS; -import static org.apache.iceberg.TableProperties.COMMIT_STATUS_CHECKS_MIN_WAIT_MS_DEFAULT; -import static org.apache.iceberg.TableProperties.COMMIT_STATUS_CHECKS_TOTAL_WAIT_MS; -import static org.apache.iceberg.TableProperties.COMMIT_STATUS_CHECKS_TOTAL_WAIT_MS_DEFAULT; - public abstract class BaseMetastoreTableOperations implements TableOperations { private static final Logger LOG = LoggerFactory.getLogger(BaseMetastoreTableOperations.class); @@ -63,12 +62,12 @@ public abstract class BaseMetastoreTableOperations implements TableOperations { private boolean shouldRefresh = true; private int version = -1; - protected BaseMetastoreTableOperations() { - } + protected BaseMetastoreTableOperations() {} /** - * The full name of the table used for logging purposes only. For example for HiveTableOperations it is - * catalogName + "." + database + "." + table. + * The full name of the table used for logging purposes only. For example for HiveTableOperations + * it is catalogName + "." + database + "." + table. + * * @return The full name */ protected abstract String tableName(); @@ -119,7 +118,8 @@ public void commit(TableMetadata base, TableMetadata metadata) { if (base != null) { throw new CommitFailedException("Cannot commit: stale table metadata"); } else { - // when current is non-null, the table exists. but when base is null, the commit is trying to create the table + // when current is non-null, the table exists. 
but when base is null, the commit is trying + // to create the table throw new AlreadyExistsException("Table already exists: %s", tableName()); } } @@ -134,7 +134,8 @@ public void commit(TableMetadata base, TableMetadata metadata) { deleteRemovedMetadataFiles(base, metadata); requestRefresh(); - LOG.info("Successfully committed to table {} in {} ms", + LOG.info( + "Successfully committed to table {} in {} ms", tableName(), System.currentTimeMillis() - start); } @@ -171,29 +172,39 @@ protected void refreshFromMetadataLocation(String newLocation, int numRetries) { refreshFromMetadataLocation(newLocation, null, numRetries); } - protected void refreshFromMetadataLocation(String newLocation, Predicate shouldRetry, - int numRetries) { - refreshFromMetadataLocation(newLocation, shouldRetry, numRetries, + protected void refreshFromMetadataLocation( + String newLocation, Predicate shouldRetry, int numRetries) { + refreshFromMetadataLocation( + newLocation, + shouldRetry, + numRetries, metadataLocation -> TableMetadataParser.read(io(), metadataLocation)); } - protected void refreshFromMetadataLocation(String newLocation, Predicate shouldRetry, - int numRetries, Function metadataLoader) { + protected void refreshFromMetadataLocation( + String newLocation, + Predicate shouldRetry, + int numRetries, + Function metadataLoader) { // use null-safe equality check because new tables have a null metadata location if (!Objects.equal(currentMetadataLocation, newLocation)) { LOG.info("Refreshing table metadata from new version: {}", newLocation); AtomicReference newMetadata = new AtomicReference<>(); Tasks.foreach(newLocation) - .retry(numRetries).exponentialBackoff(100, 5000, 600000, 4.0 /* 100, 400, 1600, ... */) + .retry(numRetries) + .exponentialBackoff(100, 5000, 600000, 4.0 /* 100, 400, 1600, ... 
*/) .throwFailureWhenFinished() .shouldRetryTest(shouldRetry) .run(metadataLocation -> newMetadata.set(metadataLoader.apply(metadataLocation))); String newUUID = newMetadata.get().uuid(); if (currentMetadata != null && currentMetadata.uuid() != null && newUUID != null) { - Preconditions.checkState(newUUID.equals(currentMetadata.uuid()), - "Table UUID does not match: current=%s != refreshed=%s", currentMetadata.uuid(), newUUID); + Preconditions.checkState( + newUUID.equals(currentMetadata.uuid()), + "Table UUID does not match: current=%s != refreshed=%s", + currentMetadata.uuid(), + newUUID); } this.currentMetadata = newMetadata.get(); @@ -204,8 +215,7 @@ protected void refreshFromMetadataLocation(String newLocation, Predicate status = new AtomicReference<>(CommitStatus.UNKNOWN); @@ -301,42 +325,57 @@ protected CommitStatus checkCommitStatus(String newMetadataLocation, TableMetada .retry(maxAttempts) .suppressFailureWhenFinished() .exponentialBackoff(minWaitMs, maxWaitMs, totalRetryMs, 2.0) - .onFailure((location, checkException) -> - LOG.error("Cannot check if commit to {} exists.", tableName(), checkException)) - .run(location -> { - TableMetadata metadata = refresh(); - String currentMetadataFileLocation = metadata.metadataFileLocation(); - boolean commitSuccess = currentMetadataFileLocation.equals(newMetadataLocation) || - metadata.previousFiles().stream().anyMatch(log -> log.file().equals(newMetadataLocation)); - if (commitSuccess) { - LOG.info("Commit status check: Commit to {} of {} succeeded", tableName(), newMetadataLocation); - status.set(CommitStatus.SUCCESS); - } else { - LOG.warn("Commit status check: Commit to {} of {} unknown, new metadata location is not current " + - "or in history", tableName(), newMetadataLocation); - } - }); + .onFailure( + (location, checkException) -> + LOG.error("Cannot check if commit to {} exists.", tableName(), checkException)) + .run( + location -> { + TableMetadata metadata = refresh(); + String currentMetadataFileLocation = metadata.metadataFileLocation(); + boolean commitSuccess = + currentMetadataFileLocation.equals(newMetadataLocation) + || metadata.previousFiles().stream() + .anyMatch(log -> log.file().equals(newMetadataLocation)); + if (commitSuccess) { + LOG.info( + "Commit status check: Commit to {} of {} succeeded", + tableName(), + newMetadataLocation); + status.set(CommitStatus.SUCCESS); + } else { + LOG.warn( + "Commit status check: Commit to {} of {} unknown, new metadata location is not current " + + "or in history", + tableName(), + newMetadataLocation); + } + }); if (status.get() == CommitStatus.UNKNOWN) { - LOG.error("Cannot determine commit state to {}. Failed during checking {} times. " + - "Treating commit state as unknown.", tableName(), maxAttempts); + LOG.error( + "Cannot determine commit state to {}. Failed during checking {} times. 
" + + "Treating commit state as unknown.", + tableName(), + maxAttempts); } return status.get(); } private String newTableMetadataFilePath(TableMetadata meta, int newVersion) { - String codecName = meta.property( - TableProperties.METADATA_COMPRESSION, TableProperties.METADATA_COMPRESSION_DEFAULT); + String codecName = + meta.property( + TableProperties.METADATA_COMPRESSION, TableProperties.METADATA_COMPRESSION_DEFAULT); String fileExtension = TableMetadataParser.getFileExtension(codecName); - return metadataFileLocation(meta, String.format("%05d-%s%s", newVersion, UUID.randomUUID(), fileExtension)); + return metadataFileLocation( + meta, String.format("%05d-%s%s", newVersion, UUID.randomUUID(), fileExtension)); } /** * Parse the version from table metadata file name. * * @param metadataLocation table metadata file location - * @return version of the table metadata file in success case and - * -1 if the version is not parsable (as a sign that the metadata is not part of this catalog) + * @return version of the table metadata file in success case and -1 if the version is not + * parsable (as a sign that the metadata is not part of this catalog) */ private static int parseVersion(String metadataLocation) { int versionStart = metadataLocation.lastIndexOf('/') + 1; // if '/' isn't found, this will be 0 @@ -355,9 +394,10 @@ private static int parseVersion(String metadataLocation) { } /** - * Deletes the oldest metadata files if {@link TableProperties#METADATA_DELETE_AFTER_COMMIT_ENABLED} is true. + * Deletes the oldest metadata files if {@link + * TableProperties#METADATA_DELETE_AFTER_COMMIT_ENABLED} is true. * - * @param base table metadata on which previous versions were based + * @param base table metadata on which previous versions were based * @param metadata new table metadata with updated previous versions */ private void deleteRemovedMetadataFiles(TableMetadata base, TableMetadata metadata) { @@ -365,17 +405,22 @@ private void deleteRemovedMetadataFiles(TableMetadata base, TableMetadata metada return; } - boolean deleteAfterCommit = metadata.propertyAsBoolean( - TableProperties.METADATA_DELETE_AFTER_COMMIT_ENABLED, - TableProperties.METADATA_DELETE_AFTER_COMMIT_ENABLED_DEFAULT); + boolean deleteAfterCommit = + metadata.propertyAsBoolean( + TableProperties.METADATA_DELETE_AFTER_COMMIT_ENABLED, + TableProperties.METADATA_DELETE_AFTER_COMMIT_ENABLED_DEFAULT); if (deleteAfterCommit) { - Set removedPreviousMetadataFiles = Sets.newHashSet(base.previousFiles()); + Set removedPreviousMetadataFiles = + Sets.newHashSet(base.previousFiles()); removedPreviousMetadataFiles.removeAll(metadata.previousFiles()); Tasks.foreach(removedPreviousMetadataFiles) - .noRetry().suppressFailureWhenFinished() - .onFailure((previousMetadataFile, exc) -> - LOG.warn("Delete failed for previous metadata file: {}", previousMetadataFile, exc)) + .noRetry() + .suppressFailureWhenFinished() + .onFailure( + (previousMetadataFile, exc) -> + LOG.warn( + "Delete failed for previous metadata file: {}", previousMetadataFile, exc)) .run(previousMetadataFile -> io().deleteFile(previousMetadataFile.file())); } } diff --git a/core/src/main/java/org/apache/iceberg/BaseOverwriteFiles.java b/core/src/main/java/org/apache/iceberg/BaseOverwriteFiles.java index 3dd8cad11d90..bbb51fdc7e3e 100644 --- a/core/src/main/java/org/apache/iceberg/BaseOverwriteFiles.java +++ b/core/src/main/java/org/apache/iceberg/BaseOverwriteFiles.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg; import java.util.Set; @@ -29,7 +28,8 @@ import org.apache.iceberg.relocated.com.google.common.base.Preconditions; import org.apache.iceberg.relocated.com.google.common.collect.Sets; -public class BaseOverwriteFiles extends MergingSnapshotProducer implements OverwriteFiles { +public class BaseOverwriteFiles extends MergingSnapshotProducer + implements OverwriteFiles { private final Set deletedDataFiles = Sets.newHashSet(); private boolean validateAddedFilesMatchOverwriteFilter = false; private Long startingSnapshotId = null; @@ -84,7 +84,8 @@ public OverwriteFiles validateFromSnapshot(long snapshotId) { @Override public OverwriteFiles conflictDetectionFilter(Expression newConflictDetectionFilter) { - Preconditions.checkArgument(newConflictDetectionFilter != null, "Conflict detection filter cannot be null"); + Preconditions.checkArgument( + newConflictDetectionFilter != null, "Conflict detection filter cannot be null"); this.conflictDetectionFilter = newConflictDetectionFilter; return this; } @@ -115,21 +116,22 @@ protected void validate(TableMetadata base) { Expression strictExpr = Projections.strict(spec).project(rowFilter); Evaluator strict = new Evaluator(spec.partitionType(), strictExpr); - StrictMetricsEvaluator metrics = new StrictMetricsEvaluator(base.schema(), rowFilter, isCaseSensitive()); + StrictMetricsEvaluator metrics = + new StrictMetricsEvaluator(base.schema(), rowFilter, isCaseSensitive()); for (DataFile file : addedFiles()) { // the real test is that the strict or metrics test matches the file, indicating that all // records in the file match the filter. inclusive is used to avoid testing the metrics, // which is more complicated ValidationException.check( - inclusive.eval(file.partition()) && - (strict.eval(file.partition()) || metrics.eval(file)), + inclusive.eval(file.partition()) + && (strict.eval(file.partition()) || metrics.eval(file)), "Cannot append file with rows that do not match filter: %s: %s", - rowFilter, file.path()); + rowFilter, + file.path()); } } - if (validateNewDataFiles) { validateAddedDataFiles(base, startingSnapshotId, dataConflictDetectionFilter()); } @@ -142,7 +144,8 @@ protected void validate(TableMetadata base) { } if (deletedDataFiles.size() > 0) { - validateNoNewDeletesForDataFiles(base, startingSnapshotId, conflictDetectionFilter, deletedDataFiles); + validateNoNewDeletesForDataFiles( + base, startingSnapshotId, conflictDetectionFilter, deletedDataFiles); } } } diff --git a/core/src/main/java/org/apache/iceberg/BaseReplacePartitions.java b/core/src/main/java/org/apache/iceberg/BaseReplacePartitions.java index fdf8d2580831..2847f5ceca6b 100644 --- a/core/src/main/java/org/apache/iceberg/BaseReplacePartitions.java +++ b/core/src/main/java/org/apache/iceberg/BaseReplacePartitions.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
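A small, self-contained sketch of the metadata file version convention used by BaseMetastoreTableOperations above: new metadata files are named with String.format("%05d-%s%s", newVersion, UUID.randomUUID(), fileExtension), and parseVersion is documented to return -1 when the version cannot be parsed. The parsing below and the example paths are illustrative assumptions consistent with that contract, not the method body from this diff.

class MetadataFileVersionSketch {
  // Extracts the zero-padded version prefix from a metadata file name such as
  // ".../metadata/00005-<uuid>.metadata.json"; returns -1 when the name does not match.
  static int parseVersion(String metadataLocation) {
    int versionStart = metadataLocation.lastIndexOf('/') + 1; // 0 when there is no '/'
    int versionEnd = metadataLocation.indexOf('-', versionStart);
    if (versionEnd < 0) {
      return -1;
    }
    try {
      return Integer.parseInt(metadataLocation.substring(versionStart, versionEnd));
    } catch (NumberFormatException e) {
      return -1;
    }
  }

  public static void main(String[] args) {
    System.out.println(parseVersion("s3://bucket/db/t/metadata/00005-9c3b.metadata.json")); // 5
    System.out.println(parseVersion("s3://bucket/db/t/metadata/not-a-version.json")); // -1
  }
}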
*/ - package org.apache.iceberg; import java.util.List; @@ -24,8 +23,8 @@ import org.apache.iceberg.expressions.Expressions; import org.apache.iceberg.util.PartitionSet; -public class BaseReplacePartitions - extends MergingSnapshotProducer implements ReplacePartitions { +public class BaseReplacePartitions extends MergingSnapshotProducer + implements ReplacePartitions { private final PartitionSet replacedPartitions; private Long startingSnapshotId; diff --git a/core/src/main/java/org/apache/iceberg/BaseReplaceSortOrder.java b/core/src/main/java/org/apache/iceberg/BaseReplaceSortOrder.java index 6226660875ac..32d69cc98f75 100644 --- a/core/src/main/java/org/apache/iceberg/BaseReplaceSortOrder.java +++ b/core/src/main/java/org/apache/iceberg/BaseReplaceSortOrder.java @@ -16,13 +16,8 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg; -import org.apache.iceberg.exceptions.CommitFailedException; -import org.apache.iceberg.expressions.Term; -import org.apache.iceberg.util.Tasks; - import static org.apache.iceberg.TableProperties.COMMIT_MAX_RETRY_WAIT_MS; import static org.apache.iceberg.TableProperties.COMMIT_MAX_RETRY_WAIT_MS_DEFAULT; import static org.apache.iceberg.TableProperties.COMMIT_MIN_RETRY_WAIT_MS; @@ -32,6 +27,10 @@ import static org.apache.iceberg.TableProperties.COMMIT_TOTAL_RETRY_TIME_MS; import static org.apache.iceberg.TableProperties.COMMIT_TOTAL_RETRY_TIME_MS_DEFAULT; +import org.apache.iceberg.exceptions.CommitFailedException; +import org.apache.iceberg.expressions.Term; +import org.apache.iceberg.util.Tasks; + public class BaseReplaceSortOrder implements ReplaceSortOrder { private final TableOperations ops; private final SortOrder.Builder builder; @@ -58,12 +57,13 @@ public void commit() { base.propertyAsInt(COMMIT_TOTAL_RETRY_TIME_MS, COMMIT_TOTAL_RETRY_TIME_MS_DEFAULT), 2.0 /* exponential */) .onlyRetryOn(CommitFailedException.class) - .run(taskOps -> { - this.base = ops.refresh(); - SortOrder newOrder = apply(); - TableMetadata updated = base.replaceSortOrder(newOrder); - taskOps.commit(base, updated); - }); + .run( + taskOps -> { + this.base = ops.refresh(); + SortOrder newOrder = apply(); + TableMetadata updated = base.replaceSortOrder(newOrder); + taskOps.commit(base, updated); + }); } @Override diff --git a/core/src/main/java/org/apache/iceberg/BaseRewriteFiles.java b/core/src/main/java/org/apache/iceberg/BaseRewriteFiles.java index 6765986bdd0c..1bc846e27602 100644 --- a/core/src/main/java/org/apache/iceberg/BaseRewriteFiles.java +++ b/core/src/main/java/org/apache/iceberg/BaseRewriteFiles.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg; import java.util.Set; @@ -45,8 +44,11 @@ protected String operation() { return DataOperations.REPLACE; } - private void verifyInputAndOutputFiles(Set dataFilesToDelete, Set deleteFilesToDelete, - Set dataFilesToAdd, Set deleteFilesToAdd) { + private void verifyInputAndOutputFiles( + Set dataFilesToDelete, + Set deleteFilesToDelete, + Set dataFilesToAdd, + Set deleteFilesToAdd) { Preconditions.checkNotNull(dataFilesToDelete, "Data files to delete can not be null"); Preconditions.checkNotNull(deleteFilesToDelete, "Delete files to delete can not be null"); Preconditions.checkNotNull(dataFilesToAdd, "Data files to add can not be null"); @@ -59,21 +61,27 @@ private void verifyInputAndOutputFiles(Set dataFilesToDelete, Set 0, "Files to delete cannot be null or empty"); if (deleteFilesToDelete.isEmpty()) { - Preconditions.checkArgument(deleteFilesToAdd.isEmpty(), + Preconditions.checkArgument( + deleteFilesToAdd.isEmpty(), "Delete files to add must be empty because there's no delete file to be rewritten"); } } @Override - public RewriteFiles rewriteFiles(Set filesToDelete, Set filesToAdd, long sequenceNumber) { + public RewriteFiles rewriteFiles( + Set filesToDelete, Set filesToAdd, long sequenceNumber) { setNewFilesSequenceNumber(sequenceNumber); return rewriteFiles(filesToDelete, ImmutableSet.of(), filesToAdd, ImmutableSet.of()); } @Override - public RewriteFiles rewriteFiles(Set dataFilesToReplace, Set deleteFilesToReplace, - Set dataFilesToAdd, Set deleteFilesToAdd) { - verifyInputAndOutputFiles(dataFilesToReplace, deleteFilesToReplace, dataFilesToAdd, deleteFilesToAdd); + public RewriteFiles rewriteFiles( + Set dataFilesToReplace, + Set deleteFilesToReplace, + Set dataFilesToAdd, + Set deleteFilesToAdd) { + verifyInputAndOutputFiles( + dataFilesToReplace, deleteFilesToReplace, dataFilesToAdd, deleteFilesToAdd); replacedDataFiles.addAll(dataFilesToReplace); for (DataFile dataFile : dataFilesToReplace) { @@ -104,7 +112,8 @@ public RewriteFiles validateFromSnapshot(long snapshotId) { @Override protected void validate(TableMetadata base) { if (replacedDataFiles.size() > 0) { - // if there are replaced data files, there cannot be any new row-level deletes for those data files + // if there are replaced data files, there cannot be any new row-level deletes for those data + // files validateNoNewDeletesForDataFiles(base, startingSnapshotId, replacedDataFiles); } } diff --git a/core/src/main/java/org/apache/iceberg/BaseRewriteManifests.java b/core/src/main/java/org/apache/iceberg/BaseRewriteManifests.java index f0e8e7904fb4..c61b99dcfc65 100644 --- a/core/src/main/java/org/apache/iceberg/BaseRewriteManifests.java +++ b/core/src/main/java/org/apache/iceberg/BaseRewriteManifests.java @@ -16,9 +16,13 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg; +import static org.apache.iceberg.TableProperties.MANIFEST_TARGET_SIZE_BYTES; +import static org.apache.iceberg.TableProperties.MANIFEST_TARGET_SIZE_BYTES_DEFAULT; +import static org.apache.iceberg.TableProperties.SNAPSHOT_ID_INHERITANCE_ENABLED; +import static org.apache.iceberg.TableProperties.SNAPSHOT_ID_INHERITANCE_ENABLED_DEFAULT; + import java.io.IOException; import java.util.Arrays; import java.util.Collection; @@ -43,13 +47,8 @@ import org.apache.iceberg.util.Pair; import org.apache.iceberg.util.Tasks; -import static org.apache.iceberg.TableProperties.MANIFEST_TARGET_SIZE_BYTES; -import static org.apache.iceberg.TableProperties.MANIFEST_TARGET_SIZE_BYTES_DEFAULT; -import static org.apache.iceberg.TableProperties.SNAPSHOT_ID_INHERITANCE_ENABLED; -import static org.apache.iceberg.TableProperties.SNAPSHOT_ID_INHERITANCE_ENABLED_DEFAULT; - - -public class BaseRewriteManifests extends SnapshotProducer implements RewriteManifests { +public class BaseRewriteManifests extends SnapshotProducer + implements RewriteManifests { private static final String KEPT_MANIFESTS_COUNT = "manifests-kept"; private static final String CREATED_MANIFESTS_COUNT = "manifests-created"; private static final String REPLACED_MANIFESTS_COUNT = "manifests-replaced"; @@ -81,9 +80,12 @@ public class BaseRewriteManifests extends SnapshotProducer imp this.ops = ops; this.specsById = ops.current().specsById(); this.manifestTargetSizeBytes = - ops.current().propertyAsLong(MANIFEST_TARGET_SIZE_BYTES, MANIFEST_TARGET_SIZE_BYTES_DEFAULT); - this.snapshotIdInheritanceEnabled = ops.current() - .propertyAsBoolean(SNAPSHOT_ID_INHERITANCE_ENABLED, SNAPSHOT_ID_INHERITANCE_ENABLED_DEFAULT); + ops.current() + .propertyAsLong(MANIFEST_TARGET_SIZE_BYTES, MANIFEST_TARGET_SIZE_BYTES_DEFAULT); + this.snapshotIdInheritanceEnabled = + ops.current() + .propertyAsBoolean( + SNAPSHOT_ID_INHERITANCE_ENABLED, SNAPSHOT_ID_INHERITANCE_ENABLED_DEFAULT); } @Override @@ -104,12 +106,16 @@ public RewriteManifests set(String property, String value) { @Override protected Map summary() { - int createdManifestsCount = newManifests.size() + addedManifests.size() + rewrittenAddedManifests.size(); + int createdManifestsCount = + newManifests.size() + addedManifests.size() + rewrittenAddedManifests.size(); summaryBuilder.set(CREATED_MANIFESTS_COUNT, String.valueOf(createdManifestsCount)); summaryBuilder.set(KEPT_MANIFESTS_COUNT, String.valueOf(keptManifests.size())); - summaryBuilder.set(REPLACED_MANIFESTS_COUNT, String.valueOf(rewrittenManifests.size() + deletedManifests.size())); + summaryBuilder.set( + REPLACED_MANIFESTS_COUNT, + String.valueOf(rewrittenManifests.size() + deletedManifests.size())); summaryBuilder.set(PROCESSED_ENTRY_COUNT, String.valueOf(entryCount.get())); - summaryBuilder.setPartitionSummaryLimit(0); // do not include partition summaries because data did not change + summaryBuilder.setPartitionSummaryLimit( + 0); // do not include partition summaries because data did not change return summaryBuilder.build(); } @@ -134,12 +140,13 @@ public RewriteManifests deleteManifest(ManifestFile manifest) { @Override public RewriteManifests addManifest(ManifestFile manifest) { Preconditions.checkArgument(!manifest.hasAddedFiles(), "Cannot add manifest with added files"); - Preconditions.checkArgument(!manifest.hasDeletedFiles(), "Cannot add manifest with deleted files"); + Preconditions.checkArgument( + !manifest.hasDeletedFiles(), "Cannot add manifest with deleted files"); Preconditions.checkArgument( 
manifest.snapshotId() == null || manifest.snapshotId() == -1, "Snapshot id must be assigned during commit"); - Preconditions.checkArgument(manifest.sequenceNumber() == -1, - "Sequence must be assigned during commit"); + Preconditions.checkArgument( + manifest.sequenceNumber() == -1, "Sequence must be assigned during commit"); if (snapshotIdInheritanceEnabled && manifest.snapshotId() == null) { addedManifests.add(manifest); @@ -175,9 +182,10 @@ public List apply(TableMetadata base) { validateFilesCounts(); - Iterable newManifestsWithMetadata = Iterables.transform( - Iterables.concat(newManifests, addedManifests, rewrittenAddedManifests), - manifest -> GenericManifestFile.copyOf(manifest).withSnapshotId(snapshotId()).build()); + Iterable newManifestsWithMetadata = + Iterables.transform( + Iterables.concat(newManifests, addedManifests, rewrittenAddedManifests), + manifest -> GenericManifestFile.copyOf(manifest).withSnapshotId(snapshotId()).build()); // put new manifests at the beginning List apply = Lists.newArrayList(); @@ -207,7 +215,9 @@ private void keepActiveManifests(List currentManifests) { // keep any existing manifests as-is that were not processed keptManifests.clear(); currentManifests.stream() - .filter(manifest -> !rewrittenManifests.contains(manifest) && !deletedManifests.contains(manifest)) + .filter( + manifest -> + !rewrittenManifests.contains(manifest) && !deletedManifests.contains(manifest)) .forEach(keptManifests::add); } @@ -223,29 +233,37 @@ private void reset() { private void performRewrite(List currentManifests) { reset(); - List remainingManifests = currentManifests.stream() - .filter(manifest -> !deletedManifests.contains(manifest)) - .collect(Collectors.toList()); + List remainingManifests = + currentManifests.stream() + .filter(manifest -> !deletedManifests.contains(manifest)) + .collect(Collectors.toList()); try { Tasks.foreach(remainingManifests) .executeWith(workerPool()) - .run(manifest -> { - if (predicate != null && !predicate.test(manifest)) { - keptManifests.add(manifest); - } else { - rewrittenManifests.add(manifest); - try (ManifestReader reader = ManifestFiles.read(manifest, ops.io(), ops.current().specsById()) - .select(Arrays.asList("*"))) { - reader.liveEntries().forEach( - entry -> appendEntry(entry, clusterByFunc.apply(entry.file()), manifest.partitionSpecId()) - ); - - } catch (IOException x) { - throw new RuntimeIOException(x); - } - } - }); + .run( + manifest -> { + if (predicate != null && !predicate.test(manifest)) { + keptManifests.add(manifest); + } else { + rewrittenManifests.add(manifest); + try (ManifestReader reader = + ManifestFiles.read(manifest, ops.io(), ops.current().specsById()) + .select(Arrays.asList("*"))) { + reader + .liveEntries() + .forEach( + entry -> + appendEntry( + entry, + clusterByFunc.apply(entry.file()), + manifest.partitionSpecId())); + + } catch (IOException x) { + throw new RuntimeIOException(x); + } + } + }); } finally { Tasks.foreach(writers.values()).executeWith(workerPool()).run(WriterWrapper::close); } @@ -256,16 +274,19 @@ private void validateDeletedManifests(Set currentManifests) { deletedManifests.stream() .filter(manifest -> !currentManifests.contains(manifest)) .findAny() - .ifPresent(manifest -> { - throw new ValidationException("Manifest is missing: %s", manifest.path()); - }); + .ifPresent( + manifest -> { + throw new ValidationException("Manifest is missing: %s", manifest.path()); + }); } private void validateFilesCounts() { - Iterable createdManifests = Iterables.concat(newManifests, addedManifests, 
rewrittenAddedManifests); + Iterable createdManifests = + Iterables.concat(newManifests, addedManifests, rewrittenAddedManifests); int createdManifestsFilesCount = activeFilesCount(createdManifests); - Iterable replacedManifests = Iterables.concat(rewrittenManifests, deletedManifests); + Iterable replacedManifests = + Iterables.concat(rewrittenManifests, deletedManifests); int replacedManifestsFilesCount = activeFilesCount(replacedManifests); if (createdManifestsFilesCount != replacedManifestsFilesCount) { @@ -279,8 +300,10 @@ private int activeFilesCount(Iterable manifests) { int activeFilesCount = 0; for (ManifestFile manifest : manifests) { - Preconditions.checkNotNull(manifest.addedFilesCount(), "Missing file counts in %s", manifest.path()); - Preconditions.checkNotNull(manifest.existingFilesCount(), "Missing file counts in %s", manifest.path()); + Preconditions.checkNotNull( + manifest.addedFilesCount(), "Missing file counts in %s", manifest.path()); + Preconditions.checkNotNull( + manifest.existingFilesCount(), "Missing file counts in %s", manifest.path()); activeFilesCount += manifest.addedFilesCount(); activeFilesCount += manifest.existingFilesCount(); } @@ -310,7 +333,8 @@ protected void cleanUncommitted(Set committed) { cleanUncommitted(rewrittenAddedManifests, committed); } - private void cleanUncommitted(Iterable manifests, Set committedManifests) { + private void cleanUncommitted( + Iterable manifests, Set committedManifests) { for (ManifestFile manifest : manifests) { if (!committedManifests.contains(manifest)) { deleteFile(manifest.path()); @@ -350,7 +374,5 @@ synchronized void close() { } } } - } - } diff --git a/core/src/main/java/org/apache/iceberg/BaseRowDelta.java b/core/src/main/java/org/apache/iceberg/BaseRowDelta.java index b5655fe1fb1e..35a04ba39493 100644 --- a/core/src/main/java/org/apache/iceberg/BaseRowDelta.java +++ b/core/src/main/java/org/apache/iceberg/BaseRowDelta.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg; import org.apache.iceberg.expressions.Expression; @@ -78,7 +77,8 @@ public RowDelta validateDataFilesExist(Iterable referenc @Override public RowDelta conflictDetectionFilter(Expression newConflictDetectionFilter) { - Preconditions.checkArgument(newConflictDetectionFilter != null, "Conflict detection filter cannot be null"); + Preconditions.checkArgument( + newConflictDetectionFilter != null, "Conflict detection filter cannot be null"); this.conflictDetectionFilter = newConflictDetectionFilter; return this; } @@ -100,7 +100,11 @@ protected void validate(TableMetadata base) { if (base.currentSnapshot() != null) { if (!referencedDataFiles.isEmpty()) { validateDataFilesExist( - base, startingSnapshotId, referencedDataFiles, !validateDeletes, conflictDetectionFilter); + base, + startingSnapshotId, + referencedDataFiles, + !validateDeletes, + conflictDetectionFilter); } if (validateNewDataFiles) { diff --git a/core/src/main/java/org/apache/iceberg/BaseScan.java b/core/src/main/java/org/apache/iceberg/BaseScan.java index 7cf88f5b1c48..bdde1f680f40 100644 --- a/core/src/main/java/org/apache/iceberg/BaseScan.java +++ b/core/src/main/java/org/apache/iceberg/BaseScan.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg; import java.util.Collection; @@ -30,7 +29,8 @@ import org.apache.iceberg.types.TypeUtil; import org.apache.iceberg.util.PropertyUtil; -abstract class BaseScan> implements Scan { +abstract class BaseScan> + implements Scan { private final TableOperations ops; private final Table table; private final Schema schema; @@ -89,7 +89,8 @@ public ThisT select(Collection columns) { @Override public ThisT filter(Expression expr) { - return newRefinedScan(ops, table, schema, context.filterRows(Expressions.and(context.rowFilter(), expr))); + return newRefinedScan( + ops, table, schema, context.filterRows(Expressions.and(context.rowFilter(), expr))); } @Override @@ -109,33 +110,36 @@ public Schema schema() { @Override public long targetSplitSize() { - long tableValue = ops.current().propertyAsLong( - TableProperties.SPLIT_SIZE, - TableProperties.SPLIT_SIZE_DEFAULT); + long tableValue = + ops.current() + .propertyAsLong(TableProperties.SPLIT_SIZE, TableProperties.SPLIT_SIZE_DEFAULT); return PropertyUtil.propertyAsLong(context.options(), TableProperties.SPLIT_SIZE, tableValue); } @Override public int splitLookback() { - int tableValue = ops.current().propertyAsInt( - TableProperties.SPLIT_LOOKBACK, - TableProperties.SPLIT_LOOKBACK_DEFAULT); - return PropertyUtil.propertyAsInt(context.options(), TableProperties.SPLIT_LOOKBACK, tableValue); + int tableValue = + ops.current() + .propertyAsInt(TableProperties.SPLIT_LOOKBACK, TableProperties.SPLIT_LOOKBACK_DEFAULT); + return PropertyUtil.propertyAsInt( + context.options(), TableProperties.SPLIT_LOOKBACK, tableValue); } @Override public long splitOpenFileCost() { - long tableValue = ops.current().propertyAsLong( - TableProperties.SPLIT_OPEN_FILE_COST, - TableProperties.SPLIT_OPEN_FILE_COST_DEFAULT); - return PropertyUtil.propertyAsLong(context.options(), TableProperties.SPLIT_OPEN_FILE_COST, tableValue); + long tableValue = + ops.current() + .propertyAsLong( + TableProperties.SPLIT_OPEN_FILE_COST, TableProperties.SPLIT_OPEN_FILE_COST_DEFAULT); + return PropertyUtil.propertyAsLong( + context.options(), TableProperties.SPLIT_OPEN_FILE_COST, tableValue); } /** * Resolve the schema to be projected lazily. * - * If there are selected columns from scan context, selected columns are projected to the table schema. - * Otherwise, projected schema from scan context shall be returned. + *

If there are selected columns from scan context, selected columns are projected to the table + * schema. Otherwise, projected schema from scan context shall be returned. * * @param context scan context * @param schema table schema @@ -148,8 +152,10 @@ private static Schema lazyColumnProjection(TableScanContext context, Schema sche // all of the filter columns are required requiredFieldIds.addAll( - Binder.boundReferences(schema.asStruct(), - Collections.singletonList(context.rowFilter()), context.caseSensitive())); + Binder.boundReferences( + schema.asStruct(), + Collections.singletonList(context.rowFilter()), + context.caseSensitive())); // all of the projection columns are required Set selectedIds; diff --git a/core/src/main/java/org/apache/iceberg/BaseSnapshot.java b/core/src/main/java/org/apache/iceberg/BaseSnapshot.java index 33d016b8ab8c..817d96e3ec2c 100644 --- a/core/src/main/java/org/apache/iceberg/BaseSnapshot.java +++ b/core/src/main/java/org/apache/iceberg/BaseSnapshot.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg; import java.io.IOException; @@ -38,10 +37,10 @@ class BaseSnapshot implements Snapshot { private static final long INITIAL_SEQUENCE_NUMBER = 0; /** - * @deprecated since 0.14.0, will be removed in 1.0.0; {@link FileIO} should be passed to methods which require it + * @deprecated since 0.14.0, will be removed in 1.0.0; {@link FileIO} should be passed to methods + * which require it */ - @Deprecated - private final FileIO io; + @Deprecated private final FileIO io; private final long snapshotId; private final Long parentId; @@ -61,27 +60,31 @@ class BaseSnapshot implements Snapshot { private transient List addedDeleteFiles = null; private transient List removedDeleteFiles = null; - /** - * For testing only. - */ - BaseSnapshot(FileIO io, - long snapshotId, - Integer schemaId, - String... manifestFiles) { - this(io, snapshotId, null, System.currentTimeMillis(), null, null, - schemaId, Lists.transform(Arrays.asList(manifestFiles), + /** For testing only. */ + BaseSnapshot(FileIO io, long snapshotId, Integer schemaId, String... 
manifestFiles) { + this( + io, + snapshotId, + null, + System.currentTimeMillis(), + null, + null, + schemaId, + Lists.transform( + Arrays.asList(manifestFiles), path -> new GenericManifestFile(io.newInputFile(path), 0))); } - BaseSnapshot(FileIO io, - long sequenceNumber, - long snapshotId, - Long parentId, - long timestampMillis, - String operation, - Map summary, - Integer schemaId, - String manifestList) { + BaseSnapshot( + FileIO io, + long sequenceNumber, + long snapshotId, + Long parentId, + long timestampMillis, + String operation, + Map summary, + Integer schemaId, + String manifestList) { this.io = io; this.sequenceNumber = sequenceNumber; this.snapshotId = snapshotId; @@ -93,14 +96,15 @@ class BaseSnapshot implements Snapshot { this.manifestListLocation = manifestList; } - BaseSnapshot(long sequenceNumber, - long snapshotId, - Long parentId, - long timestampMillis, - String operation, - Map summary, - Integer schemaId, - String manifestList) { + BaseSnapshot( + long sequenceNumber, + long snapshotId, + Long parentId, + long timestampMillis, + String operation, + Map summary, + Integer schemaId, + String manifestList) { this.io = null; this.sequenceNumber = sequenceNumber; this.snapshotId = snapshotId; @@ -112,15 +116,25 @@ class BaseSnapshot implements Snapshot { this.manifestListLocation = manifestList; } - BaseSnapshot(FileIO io, - long snapshotId, - Long parentId, - long timestampMillis, - String operation, - Map summary, - Integer schemaId, - List dataManifests) { - this(io, INITIAL_SEQUENCE_NUMBER, snapshotId, parentId, timestampMillis, operation, summary, schemaId, null); + BaseSnapshot( + FileIO io, + long snapshotId, + Long parentId, + long timestampMillis, + String operation, + Map summary, + Integer schemaId, + List dataManifests) { + this( + io, + INITIAL_SEQUENCE_NUMBER, + snapshotId, + parentId, + timestampMillis, + operation, + summary, + schemaId, + null); this.allManifests = dataManifests; } @@ -170,10 +184,14 @@ private void cacheManifests(FileIO fileIO) { } if (dataManifests == null || deleteManifests == null) { - this.dataManifests = ImmutableList.copyOf(Iterables.filter(allManifests, - manifest -> manifest.content() == ManifestContent.DATA)); - this.deleteManifests = ImmutableList.copyOf(Iterables.filter(allManifests, - manifest -> manifest.content() == ManifestContent.DELETES)); + this.dataManifests = + ImmutableList.copyOf( + Iterables.filter( + allManifests, manifest -> manifest.content() == ManifestContent.DATA)); + this.deleteManifests = + ImmutableList.copyOf( + Iterables.filter( + allManifests, manifest -> manifest.content() == ManifestContent.DELETES)); } } @@ -186,7 +204,8 @@ public List allManifests(FileIO fileIO) { } /** - * @deprecated since 0.14.0, will be removed in 1.0.0; Use {@link Snapshot#allManifests(FileIO)} instead. + * @deprecated since 0.14.0, will be removed in 1.0.0; Use {@link Snapshot#allManifests(FileIO)} + * instead. */ @Override @Deprecated @@ -205,9 +224,9 @@ public List dataManifests(FileIO fileIO) { return dataManifests; } - /** - * @deprecated since 0.14.0, will be removed in 1.0.0; Use {@link Snapshot#dataManifests(FileIO)} instead. + * @deprecated since 0.14.0, will be removed in 1.0.0; Use {@link Snapshot#dataManifests(FileIO)} + * instead. */ @Override @Deprecated @@ -227,7 +246,8 @@ public List deleteManifests(FileIO fileIO) { } /** - * @deprecated since 0.14.0, will be removed in 1.0.0; Use {@link Snapshot#deleteManifests(FileIO)} instead. 
+ * @deprecated since 0.14.0, will be removed in 1.0.0; Use {@link + * Snapshot#deleteManifests(FileIO)} instead. */ @Override @Deprecated @@ -247,7 +267,8 @@ public List addedDataFiles(FileIO fileIO) { } /** - * @deprecated since 0.14.0, will be removed in 1.0.0; Use {@link Snapshot#addedDataFiles(FileIO)} instead. + * @deprecated since 0.14.0, will be removed in 1.0.0; Use {@link Snapshot#addedDataFiles(FileIO)} + * instead. */ @Override @Deprecated @@ -267,7 +288,8 @@ public List removedDataFiles(FileIO fileIO) { } /** - * @deprecated since 0.14.0, will be removed in 1.0.0; Use {@link Snapshot#removedDataFiles(FileIO)} instead. + * @deprecated since 0.14.0, will be removed in 1.0.0; Use {@link + * Snapshot#removedDataFiles(FileIO)} instead. */ @Override @Deprecated @@ -305,11 +327,13 @@ private void cacheDeleteFileChanges(FileIO fileIO) { ImmutableList.Builder adds = ImmutableList.builder(); ImmutableList.Builder deletes = ImmutableList.builder(); - Iterable changedManifests = Iterables.filter(deleteManifests(fileIO), - manifest -> Objects.equal(manifest.snapshotId(), snapshotId)); + Iterable changedManifests = + Iterables.filter( + deleteManifests(fileIO), manifest -> Objects.equal(manifest.snapshotId(), snapshotId)); for (ManifestFile manifest : changedManifests) { - try (ManifestReader reader = ManifestFiles.readDeleteManifest(manifest, fileIO, null)) { + try (ManifestReader reader = + ManifestFiles.readDeleteManifest(manifest, fileIO, null)) { for (ManifestEntry entry : reader.entries()) { switch (entry.status()) { case ADDED: @@ -338,11 +362,11 @@ private void cacheDataFileChanges(FileIO fileIO) { ImmutableList.Builder deletes = ImmutableList.builder(); // read only manifests that were created by this snapshot - Iterable changedManifests = Iterables.filter(dataManifests(fileIO), - manifest -> Objects.equal(manifest.snapshotId(), snapshotId)); - try (CloseableIterable> entries = new ManifestGroup(fileIO, changedManifests) - .ignoreExisting() - .entries()) { + Iterable changedManifests = + Iterables.filter( + dataManifests(fileIO), manifest -> Objects.equal(manifest.snapshotId(), snapshotId)); + try (CloseableIterable> entries = + new ManifestGroup(fileIO, changedManifests).ignoreExisting().entries()) { for (ManifestEntry entry : entries) { switch (entry.status()) { case ADDED: @@ -372,11 +396,11 @@ public boolean equals(Object o) { if (o instanceof BaseSnapshot) { BaseSnapshot other = (BaseSnapshot) o; - return this.snapshotId == other.snapshotId() && - Objects.equal(this.parentId, other.parentId()) && - this.sequenceNumber == other.sequenceNumber() && - this.timestampMillis == other.timestampMillis() && - Objects.equal(this.schemaId, other.schemaId()); + return this.snapshotId == other.snapshotId() + && Objects.equal(this.parentId, other.parentId()) + && this.sequenceNumber == other.sequenceNumber() + && this.timestampMillis == other.timestampMillis() + && Objects.equal(this.schemaId, other.schemaId()); } return false; @@ -385,12 +409,7 @@ public boolean equals(Object o) { @Override public int hashCode() { return Objects.hashCode( - this.snapshotId, - this.parentId, - this.sequenceNumber, - this.timestampMillis, - this.schemaId - ); + this.snapshotId, this.parentId, this.sequenceNumber, this.timestampMillis, this.schemaId); } @Override diff --git a/core/src/main/java/org/apache/iceberg/BaseTable.java b/core/src/main/java/org/apache/iceberg/BaseTable.java index f73c59af14ca..00842f6d77c6 100644 --- a/core/src/main/java/org/apache/iceberg/BaseTable.java +++ 
b/core/src/main/java/org/apache/iceberg/BaseTable.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg; import java.io.Serializable; @@ -28,12 +27,12 @@ /** * Base {@link Table} implementation. - *

- * This can be extended by providing a {@link TableOperations} to the constructor. - *

- * Serializing and deserializing a BaseTable object returns a read only implementation of the BaseTable using a - * {@link StaticTableOperations}. This way no Catalog related calls are needed when reading the table data after - * deserialization. + * + *

This can be extended by providing a {@link TableOperations} to the constructor. + * + *

Serializing and deserializing a BaseTable object returns a read only implementation of the + * BaseTable using a {@link StaticTableOperations}. This way no Catalog related calls are needed + * when reading the table data after deserialization. */ public class BaseTable implements Table, HasTableOperations, Serializable { private final TableOperations ops; diff --git a/core/src/main/java/org/apache/iceberg/BaseTableScan.java b/core/src/main/java/org/apache/iceberg/BaseTableScan.java index f2a2f0d472b8..98a7df032533 100644 --- a/core/src/main/java/org/apache/iceberg/BaseTableScan.java +++ b/core/src/main/java/org/apache/iceberg/BaseTableScan.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg; import java.util.Map; @@ -34,17 +33,17 @@ import org.slf4j.Logger; import org.slf4j.LoggerFactory; -/** - * Base class for {@link TableScan} implementations. - */ -abstract class BaseTableScan extends BaseScan implements TableScan { +/** Base class for {@link TableScan} implementations. */ +abstract class BaseTableScan extends BaseScan + implements TableScan { private static final Logger LOG = LoggerFactory.getLogger(BaseTableScan.class); protected BaseTableScan(TableOperations ops, Table table, Schema schema) { this(ops, table, schema, new TableScanContext()); } - protected BaseTableScan(TableOperations ops, Table table, Schema schema, TableScanContext context) { + protected BaseTableScan( + TableOperations ops, Table table, Schema schema, TableScanContext context) { super(ops, table, schema, context); } @@ -87,17 +86,20 @@ public TableScan appendsAfter(long fromSnapshotId) { @Override public TableScan useSnapshot(long scanSnapshotId) { - Preconditions.checkArgument(snapshotId() == null, - "Cannot override snapshot, already set to id=%s", snapshotId()); - Preconditions.checkArgument(tableOps().current().snapshot(scanSnapshotId) != null, - "Cannot find snapshot with ID %s", scanSnapshotId); - return newRefinedScan(tableOps(), table(), tableSchema(), context().useSnapshotId(scanSnapshotId)); + Preconditions.checkArgument( + snapshotId() == null, "Cannot override snapshot, already set to id=%s", snapshotId()); + Preconditions.checkArgument( + tableOps().current().snapshot(scanSnapshotId) != null, + "Cannot find snapshot with ID %s", + scanSnapshotId); + return newRefinedScan( + tableOps(), table(), tableSchema(), context().useSnapshotId(scanSnapshotId)); } @Override public TableScan asOfTime(long timestampMillis) { - Preconditions.checkArgument(snapshotId() == null, - "Cannot override snapshot, already set to id=%s", snapshotId()); + Preconditions.checkArgument( + snapshotId() == null, "Cannot override snapshot, already set to id=%s", snapshotId()); return useSnapshot(SnapshotUtil.snapshotIdAsOfTime(table(), timestampMillis)); } @@ -111,8 +113,11 @@ public Expression filter() { public CloseableIterable planFiles() { Snapshot snapshot = snapshot(); if (snapshot != null) { - LOG.info("Scanning table {} snapshot {} created at {} with filter {}", table(), - snapshot.snapshotId(), DateTimeUtil.formatTimestampMillis(snapshot.timestampMillis()), + LOG.info( + "Scanning table {} snapshot {} created at {} with filter {}", + table(), + snapshot.snapshotId(), + DateTimeUtil.formatTimestampMillis(snapshot.timestampMillis()), ExpressionUtil.toSanitizedString(filter())); Listeners.notifyAll(new ScanEvent(table().name(), snapshot.snapshotId(), filter(), schema())); @@ -128,15 +133,17 @@ public CloseableIterable planFiles() { @Override 
public CloseableIterable planTasks() { CloseableIterable fileScanTasks = planFiles(); - CloseableIterable splitFiles = TableScanUtil.splitFiles(fileScanTasks, targetSplitSize()); - return TableScanUtil.planTasks(splitFiles, targetSplitSize(), splitLookback(), splitOpenFileCost()); + CloseableIterable splitFiles = + TableScanUtil.splitFiles(fileScanTasks, targetSplitSize()); + return TableScanUtil.planTasks( + splitFiles, targetSplitSize(), splitLookback(), splitOpenFileCost()); } @Override public Snapshot snapshot() { - return snapshotId() != null ? - tableOps().current().snapshot(snapshotId()) : - tableOps().current().currentSnapshot(); + return snapshotId() != null + ? tableOps().current().snapshot(snapshotId()) + : tableOps().current().currentSnapshot(); } @Override diff --git a/core/src/main/java/org/apache/iceberg/BaseTransaction.java b/core/src/main/java/org/apache/iceberg/BaseTransaction.java index b64f328c32cd..38dfa0aaf3ee 100644 --- a/core/src/main/java/org/apache/iceberg/BaseTransaction.java +++ b/core/src/main/java/org/apache/iceberg/BaseTransaction.java @@ -16,9 +16,17 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg; +import static org.apache.iceberg.TableProperties.COMMIT_MAX_RETRY_WAIT_MS; +import static org.apache.iceberg.TableProperties.COMMIT_MAX_RETRY_WAIT_MS_DEFAULT; +import static org.apache.iceberg.TableProperties.COMMIT_MIN_RETRY_WAIT_MS; +import static org.apache.iceberg.TableProperties.COMMIT_MIN_RETRY_WAIT_MS_DEFAULT; +import static org.apache.iceberg.TableProperties.COMMIT_NUM_RETRIES; +import static org.apache.iceberg.TableProperties.COMMIT_NUM_RETRIES_DEFAULT; +import static org.apache.iceberg.TableProperties.COMMIT_TOTAL_RETRY_TIME_MS; +import static org.apache.iceberg.TableProperties.COMMIT_TOTAL_RETRY_TIME_MS_DEFAULT; + import java.io.Serializable; import java.util.List; import java.util.Map; @@ -40,15 +48,6 @@ import org.slf4j.Logger; import org.slf4j.LoggerFactory; -import static org.apache.iceberg.TableProperties.COMMIT_MAX_RETRY_WAIT_MS; -import static org.apache.iceberg.TableProperties.COMMIT_MAX_RETRY_WAIT_MS_DEFAULT; -import static org.apache.iceberg.TableProperties.COMMIT_MIN_RETRY_WAIT_MS; -import static org.apache.iceberg.TableProperties.COMMIT_MIN_RETRY_WAIT_MS_DEFAULT; -import static org.apache.iceberg.TableProperties.COMMIT_NUM_RETRIES; -import static org.apache.iceberg.TableProperties.COMMIT_NUM_RETRIES_DEFAULT; -import static org.apache.iceberg.TableProperties.COMMIT_TOTAL_RETRY_TIME_MS; -import static org.apache.iceberg.TableProperties.COMMIT_TOTAL_RETRY_TIME_MS_DEFAULT; - public class BaseTransaction implements Transaction { private static final Logger LOG = LoggerFactory.getLogger(BaseTransaction.class); @@ -65,14 +64,16 @@ enum TransactionType { private final TableOperations transactionOps; private final List updates; private final Set intermediateSnapshotIds; - private final Set deletedFiles = Sets.newHashSet(); // keep track of files deleted in the most recent commit + private final Set deletedFiles = + Sets.newHashSet(); // keep track of files deleted in the most recent commit private final Consumer enqueueDelete = deletedFiles::add; private TransactionType type; private TableMetadata base; private TableMetadata current; private boolean hasLastOpCommitted; - BaseTransaction(String tableName, TableOperations ops, TransactionType type, TableMetadata start) { + BaseTransaction( + String tableName, TableOperations ops, TransactionType type, TableMetadata start) { 
this.tableName = tableName; this.ops = ops; this.transactionTable = new TransactionTable(); @@ -99,8 +100,8 @@ public TableOperations underlyingOps() { } private void checkLastOperationCommitted(String operation) { - Preconditions.checkState(hasLastOpCommitted, - "Cannot create new %s: last operation has not committed", operation); + Preconditions.checkState( + hasLastOpCommitted, "Cannot create new %s: last operation has not committed", operation); this.hasLastOpCommitted = false; } @@ -248,8 +249,8 @@ UpdateSnapshotReferencesOperation updateSnapshotReferencesOperation() { @Override public void commitTransaction() { - Preconditions.checkState(hasLastOpCommitted, - "Cannot commit transaction: last operation has not committed"); + Preconditions.checkState( + hasLastOpCommitted, "Cannot commit transaction: last operation has not committed"); switch (type) { case CREATE_TABLE: @@ -283,16 +284,18 @@ private void commitCreateTransaction() { // the commit failed and no files were committed. clean up each update. Tasks.foreach(updates) .suppressFailureWhenFinished() - .run(update -> { - if (update instanceof SnapshotProducer) { - ((SnapshotProducer) update).cleanAll(); - } - }); + .run( + update -> { + if (update instanceof SnapshotProducer) { + ((SnapshotProducer) update).cleanAll(); + } + }); throw e; } finally { - // create table never needs to retry because the table has no previous state. because retries are not a + // create table never needs to retry because the table has no previous state. because retries + // are not a // concern, it is safe to delete all of the deleted files from individual operations Tasks.foreach(deletedFiles) .suppressFailureWhenFinished() @@ -308,29 +311,32 @@ private void commitReplaceTransaction(boolean orCreate) { Tasks.foreach(ops) .retry(PropertyUtil.propertyAsInt(props, COMMIT_NUM_RETRIES, COMMIT_NUM_RETRIES_DEFAULT)) .exponentialBackoff( - PropertyUtil.propertyAsInt(props, COMMIT_MIN_RETRY_WAIT_MS, COMMIT_MIN_RETRY_WAIT_MS_DEFAULT), - PropertyUtil.propertyAsInt(props, COMMIT_MAX_RETRY_WAIT_MS, COMMIT_MAX_RETRY_WAIT_MS_DEFAULT), - PropertyUtil.propertyAsInt(props, COMMIT_TOTAL_RETRY_TIME_MS, COMMIT_TOTAL_RETRY_TIME_MS_DEFAULT), + PropertyUtil.propertyAsInt( + props, COMMIT_MIN_RETRY_WAIT_MS, COMMIT_MIN_RETRY_WAIT_MS_DEFAULT), + PropertyUtil.propertyAsInt( + props, COMMIT_MAX_RETRY_WAIT_MS, COMMIT_MAX_RETRY_WAIT_MS_DEFAULT), + PropertyUtil.propertyAsInt( + props, COMMIT_TOTAL_RETRY_TIME_MS, COMMIT_TOTAL_RETRY_TIME_MS_DEFAULT), 2.0 /* exponential */) .onlyRetryOn(CommitFailedException.class) - .run(underlyingOps -> { - - try { - underlyingOps.refresh(); - } catch (NoSuchTableException e) { - if (!orCreate) { - throw e; - } - } - - // because this is a replace table, it will always completely replace the table - // metadata. even if it was just updated. - if (base != underlyingOps.current()) { - this.base = underlyingOps.current(); // just refreshed - } - - underlyingOps.commit(base, current); - }); + .run( + underlyingOps -> { + try { + underlyingOps.refresh(); + } catch (NoSuchTableException e) { + if (!orCreate) { + throw e; + } + } + + // because this is a replace table, it will always completely replace the table + // metadata. even if it was just updated. 
+ if (base != underlyingOps.current()) { + this.base = underlyingOps.current(); // just refreshed + } + + underlyingOps.commit(base, current); + }); } catch (CommitStateUnknownException e) { throw e; @@ -339,16 +345,18 @@ private void commitReplaceTransaction(boolean orCreate) { // the commit failed and no files were committed. clean up each update. Tasks.foreach(updates) .suppressFailureWhenFinished() - .run(update -> { - if (update instanceof SnapshotProducer) { - ((SnapshotProducer) update).cleanAll(); - } - }); + .run( + update -> { + if (update instanceof SnapshotProducer) { + ((SnapshotProducer) update).cleanAll(); + } + }); throw e; } finally { - // replace table never needs to retry because the table state is completely replaced. because retries are not + // replace table never needs to retry because the table state is completely replaced. because + // retries are not // a concern, it is safe to delete all of the deleted files from individual operations Tasks.foreach(deletedFiles) .suppressFailureWhenFinished() @@ -375,16 +383,17 @@ private void commitSimpleTransaction() { base.propertyAsInt(COMMIT_TOTAL_RETRY_TIME_MS, COMMIT_TOTAL_RETRY_TIME_MS_DEFAULT), 2.0 /* exponential */) .onlyRetryOn(CommitFailedException.class) - .run(underlyingOps -> { - applyUpdates(underlyingOps); + .run( + underlyingOps -> { + applyUpdates(underlyingOps); - if (current.currentSnapshot() != null) { - currentSnapshotId.set(current.currentSnapshot().snapshotId()); - } + if (current.currentSnapshot() != null) { + currentSnapshotId.set(current.currentSnapshot().snapshotId()); + } - // fix up the snapshot log, which should not contain intermediate snapshots - underlyingOps.commit(base, current); - }); + // fix up the snapshot log, which should not contain intermediate snapshots + underlyingOps.commit(base, current); + }); } catch (CommitStateUnknownException e) { throw e; @@ -404,20 +413,24 @@ private void commitSimpleTransaction() { intermediateSnapshotIds.add(currentSnapshotId.get()); } - // clean up the data files that were deleted by each operation. first, get the list of committed manifests to - // ensure that no committed manifest is deleted. a manifest could be deleted in one successful operation - // commit, but reused in another successful commit of that operation if the whole transaction is retried. + // clean up the data files that were deleted by each operation. first, get the list of + // committed manifests to + // ensure that no committed manifest is deleted. a manifest could be deleted in one successful + // operation + // commit, but reused in another successful commit of that operation if the whole transaction + // is retried. Set committedFiles = committedFiles(ops, intermediateSnapshotIds); if (committedFiles != null) { // delete all of the files that were deleted in the most recent set of operation commits Tasks.foreach(deletedFiles) .suppressFailureWhenFinished() .onFailure((file, exc) -> LOG.warn("Failed to delete uncommitted file: {}", file, exc)) - .run(path -> { - if (!committedFiles.contains(path)) { - ops.io().deleteFile(path); - } - }); + .run( + path -> { + if (!committedFiles.contains(path)) { + ops.io().deleteFile(path); + } + }); } else { LOG.warn("Failed to load metadata for a committed snapshot, skipping clean-up"); } @@ -431,11 +444,12 @@ private void cleanUpOnCommitFailure() { // the commit failed and no files were committed. clean up each update. 
Tasks.foreach(updates) .suppressFailureWhenFinished() - .run(update -> { - if (update instanceof SnapshotProducer) { - ((SnapshotProducer) update).cleanAll(); - } - }); + .run( + update -> { + if (update instanceof SnapshotProducer) { + ((SnapshotProducer) update).cleanAll(); + } + }); // delete all files that were cleaned up Tasks.foreach(deletedFiles) @@ -454,7 +468,8 @@ private void applyUpdates(TableOperations underlyingOps) { try { update.commit(); } catch (CommitFailedException e) { - // Cannot pass even with retry due to conflicting metadata changes. So, break the retry-loop. + // Cannot pass even with retry due to conflicting metadata changes. So, break the + // retry-loop. throw new PendingUpdateFailedException(e); } } @@ -472,8 +487,7 @@ private static Set committedFiles(TableOperations ops, Set snapsho Snapshot snap = ops.current().snapshot(snapshotId); if (snap != null) { committedFiles.add(snap.manifestListLocation()); - snap.allManifests(ops.io()) - .forEach(manifest -> committedFiles.add(manifest.path())); + snap.allManifests(ops.io()).forEach(manifest -> committedFiles.add(manifest.path())); } else { return null; } @@ -565,8 +579,7 @@ public String name() { } @Override - public void refresh() { - } + public void refresh() {} @Override public TableScan newScan() { @@ -710,7 +723,8 @@ public Rollback rollback() { @Override public ManageSnapshots manageSnapshots() { - throw new UnsupportedOperationException("Transaction tables do not support managing snapshots"); + throw new UnsupportedOperationException( + "Transaction tables do not support managing snapshots"); } @Override @@ -754,7 +768,8 @@ Set deletedFiles() { } /** - * Exception used to avoid retrying {@link PendingUpdate} when it is failed with {@link CommitFailedException}. + * Exception used to avoid retrying {@link PendingUpdate} when it is failed with {@link + * CommitFailedException}. */ private static class PendingUpdateFailedException extends RuntimeException { private final CommitFailedException wrapped; diff --git a/core/src/main/java/org/apache/iceberg/BaseUpdatePartitionSpec.java b/core/src/main/java/org/apache/iceberg/BaseUpdatePartitionSpec.java index 0181b98ac8ea..6f688848f761 100644 --- a/core/src/main/java/org/apache/iceberg/BaseUpdatePartitionSpec.java +++ b/core/src/main/java/org/apache/iceberg/BaseUpdatePartitionSpec.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg; import java.util.List; @@ -52,7 +51,8 @@ class BaseUpdatePartitionSpec implements UpdatePartitionSpec { private final List adds = Lists.newArrayList(); private final Map addedTimeFields = Maps.newHashMap(); - private final Map, PartitionField> transformToAddedField = Maps.newHashMap(); + private final Map, PartitionField> transformToAddedField = + Maps.newHashMap(); private final Map nameToAddedField = Maps.newHashMap(); private final Set deletes = Sets.newHashSet(); private final Map renames = Maps.newHashMap(); @@ -74,22 +74,20 @@ class BaseUpdatePartitionSpec implements UpdatePartitionSpec { spec.fields().stream() .filter(field -> field.transform() instanceof UnknownTransform) .findAny() - .ifPresent(field -> { - throw new IllegalArgumentException("Cannot update partition spec with unknown transform: " + field); - }); + .ifPresent( + field -> { + throw new IllegalArgumentException( + "Cannot update partition spec with unknown transform: " + field); + }); } - /** - * For testing only. - */ + /** For testing only. 
*/ @VisibleForTesting BaseUpdatePartitionSpec(int formatVersion, PartitionSpec spec) { this(formatVersion, spec, spec.lastAssignedFieldId()); } - /** - * For testing only. - */ + /** For testing only. */ @VisibleForTesting BaseUpdatePartitionSpec(int formatVersion, PartitionSpec spec, int lastAssignedPartitionId) { this.ops = null; @@ -109,13 +107,16 @@ private int assignFieldId() { } /** - * In V2 it searches for a similar partition field in historical partition specs. Tries to match on source field - * ID, transform type and target name (optional). If not found or in V1 cases it creates a new PartitionField. + * In V2 it searches for a similar partition field in historical partition specs. Tries to match + * on source field ID, transform type and target name (optional). If not found or in V1 cases it + * creates a new PartitionField. + * * @param sourceTransform pair of source ID and transform for this PartitionField addition * @param name target partition field name, if specified * @return the recycled or newly created partition field */ - private PartitionField recycleOrCreatePartitionField(Pair> sourceTransform, String name) { + private PartitionField recycleOrCreatePartitionField( + Pair> sourceTransform, String name) { if (formatVersion == 2 && base != null) { int sourceId = sourceTransform.first(); Transform transform = sourceTransform.second(); @@ -134,7 +135,8 @@ private PartitionField recycleOrCreatePartitionField(Pair> sourceTransform = resolve(term); - Pair validationKey = Pair.of(sourceTransform.first(), sourceTransform.second().toString()); + Pair validationKey = + Pair.of(sourceTransform.first(), sourceTransform.second().toString()); PartitionField existing = transformToField.get(validationKey); - if (existing != null && deletes.contains(existing.fieldId()) && - existing.transform().equals(sourceTransform.second())) { + if (existing != null + && deletes.contains(existing.fieldId()) + && existing.transform().equals(sourceTransform.second())) { return rewriteDeleteAndAddField(existing, name, sourceTransform); } - Preconditions.checkArgument(existing == null || - (deletes.contains(existing.fieldId()) && - !existing.transform().toString().equals(sourceTransform.second().toString())), - "Cannot add duplicate partition field %s=%s, conflicts with %s", name, term, existing); + Preconditions.checkArgument( + existing == null + || (deletes.contains(existing.fieldId()) + && !existing.transform().toString().equals(sourceTransform.second().toString())), + "Cannot add duplicate partition field %s=%s, conflicts with %s", + name, + term, + existing); PartitionField added = transformToAddedField.get(validationKey); - Preconditions.checkArgument(added == null, - "Cannot add duplicate partition field %s=%s, already added: %s", name, term, added); + Preconditions.checkArgument( + added == null, + "Cannot add duplicate partition field %s=%s, already added: %s", + name, + term, + added); PartitionField newField = recycleOrCreatePartitionField(sourceTransform, name); if (newField.name() == null) { - String partitionName = PartitionSpecVisitor.visit(schema, newField, PartitionNameGenerator.INSTANCE); - newField = new PartitionField(newField.sourceId(), newField.fieldId(), partitionName, newField.transform()); + String partitionName = + PartitionSpecVisitor.visit(schema, newField, PartitionNameGenerator.INSTANCE); + newField = + new PartitionField( + newField.sourceId(), newField.fieldId(), partitionName, newField.transform()); } checkForRedundantAddedPartitions(newField); @@ -201,7 +217,8 @@ 
public BaseUpdatePartitionSpec addField(String name, Term term) { // rename the old deleted field that is being replaced by the new field renameField(existingField.name(), existingField.name() + "_" + existingField.fieldId()); } else { - throw new IllegalArgumentException(String.format("Cannot add duplicate partition field name: %s", name)); + throw new IllegalArgumentException( + String.format("Cannot add duplicate partition field name: %s", name)); } } else if (existingField != null && deletes.contains(existingField.fieldId())) { renames.put(existingField.name(), existingField.name() + "_" + existingField.fieldId()); @@ -217,14 +234,14 @@ public BaseUpdatePartitionSpec addField(String name, Term term) { @Override public BaseUpdatePartitionSpec removeField(String name) { PartitionField alreadyAdded = nameToAddedField.get(name); - Preconditions.checkArgument(alreadyAdded == null, "Cannot delete newly added field: %s", alreadyAdded); + Preconditions.checkArgument( + alreadyAdded == null, "Cannot delete newly added field: %s", alreadyAdded); - Preconditions.checkArgument(renames.get(name) == null, - "Cannot rename and delete partition field: %s", name); + Preconditions.checkArgument( + renames.get(name) == null, "Cannot rename and delete partition field: %s", name); PartitionField field = nameToField.get(name); - Preconditions.checkArgument(field != null, - "Cannot find partition field to remove: %s", name); + Preconditions.checkArgument(field != null, "Cannot find partition field to remove: %s", name); deletes.add(field.fieldId()); @@ -234,16 +251,18 @@ public BaseUpdatePartitionSpec removeField(String name) { @Override public BaseUpdatePartitionSpec removeField(Term term) { Pair> sourceTransform = resolve(term); - Pair key = Pair.of(sourceTransform.first(), sourceTransform.second().toString()); + Pair key = + Pair.of(sourceTransform.first(), sourceTransform.second().toString()); PartitionField added = transformToAddedField.get(key); Preconditions.checkArgument(added == null, "Cannot delete newly added field: %s", added); PartitionField field = transformToField.get(key); - Preconditions.checkArgument(field != null, - "Cannot find partition field to remove: %s", term); - Preconditions.checkArgument(renames.get(field.name()) == null, - "Cannot rename and delete partition field: %s", field.name()); + Preconditions.checkArgument(field != null, "Cannot find partition field to remove: %s", term); + Preconditions.checkArgument( + renames.get(field.name()) == null, + "Cannot rename and delete partition field: %s", + field.name()); deletes.add(field.fieldId()); @@ -259,14 +278,13 @@ public BaseUpdatePartitionSpec renameField(String name, String newName) { } PartitionField added = nameToAddedField.get(name); - Preconditions.checkArgument(added == null, - "Cannot rename newly added partition field: %s", name); + Preconditions.checkArgument( + added == null, "Cannot rename newly added partition field: %s", name); PartitionField field = nameToField.get(name); - Preconditions.checkArgument(field != null, - "Cannot find partition field to rename: %s", name); - Preconditions.checkArgument(!deletes.contains(field.fieldId()), - "Cannot delete and rename partition field: %s", name); + Preconditions.checkArgument(field != null, "Cannot find partition field to rename: %s", name); + Preconditions.checkArgument( + !deletes.contains(field.fieldId()), "Cannot delete and rename partition field: %s", name); renames.put(name, newName); @@ -286,8 +304,10 @@ public PartitionSpec apply() { builder.add(field.sourceId(), 
field.fieldId(), field.name(), field.transform()); } } else if (formatVersion < 2) { - // field IDs were not required for v1 and were assigned sequentially in each partition spec starting at 1,000. - // to maintain consistent field ids across partition specs in v1 tables, any partition field that is removed + // field IDs were not required for v1 and were assigned sequentially in each partition spec + // starting at 1,000. + // to maintain consistent field ids across partition specs in v1 tables, any partition field + // that is removed // must be replaced with a null transform. null values are always allowed in partition data. String newName = renames.get(field.name()); if (newName != null) { @@ -327,15 +347,19 @@ public void commit() { } else if (term instanceof BoundTransform) { return ((BoundTransform) term).transform(); } else { - throw new ValidationException("Invalid term: %s, expected either a bound reference or transform", term); + throw new ValidationException( + "Invalid term: %s, expected either a bound reference or transform", term); } } private void checkForRedundantAddedPartitions(PartitionField field) { if (isTimeTransform(field)) { PartitionField timeField = addedTimeFields.get(field.sourceId()); - Preconditions.checkArgument(timeField == null, - "Cannot add redundant partition field: %s conflicts with %s", timeField, field); + Preconditions.checkArgument( + timeField == null, + "Cannot add redundant partition field: %s conflicts with %s", + timeField, + field); addedTimeFields.put(field.sourceId(), field); } } @@ -350,7 +374,8 @@ private static Map indexSpecByName(PartitionSpec spec) { return builder.build(); } - private static Map, PartitionField> indexSpecByTransform(PartitionSpec spec) { + private static Map, PartitionField> indexSpecByTransform( + PartitionSpec spec) { Map, PartitionField> indexSpecs = Maps.newHashMap(); List fields = spec.fields(); for (PartitionField field : fields) { @@ -367,8 +392,7 @@ private boolean isTimeTransform(PartitionField field) { private static class IsTimeTransform implements PartitionSpecVisitor { private static final IsTimeTransform INSTANCE = new IsTimeTransform(); - private IsTimeTransform() { - } + private IsTimeTransform() {} @Override public Boolean identity(int fieldId, String sourceName, int sourceId) { @@ -423,8 +447,7 @@ private boolean isVoidTransform(PartitionField field) { private static class IsVoidTransform implements PartitionSpecVisitor { private static final IsVoidTransform INSTANCE = new IsVoidTransform(); - private IsVoidTransform() { - } + private IsVoidTransform() {} @Override public Boolean identity(int fieldId, String sourceName, int sourceId) { @@ -475,8 +498,7 @@ public Boolean unknown(int fieldId, String sourceName, int sourceId, String tran private static class PartitionNameGenerator implements PartitionSpecVisitor { private static final PartitionNameGenerator INSTANCE = new PartitionNameGenerator(); - private PartitionNameGenerator() { - } + private PartitionNameGenerator() {} @Override public String identity(int fieldId, String sourceName, int sourceId) { diff --git a/core/src/main/java/org/apache/iceberg/CachingCatalog.java b/core/src/main/java/org/apache/iceberg/CachingCatalog.java index 668ae1ff3366..f6c3657ff16b 100644 --- a/core/src/main/java/org/apache/iceberg/CachingCatalog.java +++ b/core/src/main/java/org/apache/iceberg/CachingCatalog.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg; import com.github.benmanes.caffeine.cache.Cache; @@ -40,9 +39,9 @@ /** * Class that wraps an Iceberg Catalog to cache tables. - *

- * See {@link CatalogProperties#CACHE_EXPIRATION_INTERVAL_MS} for more details - * regarding special values for {@code expirationIntervalMillis}. + * + *

See {@link CatalogProperties#CACHE_EXPIRATION_INTERVAL_MS} for more details regarding special + * values for {@code expirationIntervalMillis}. */ public class CachingCatalog implements Catalog { private static final Logger LOG = LoggerFactory.getLogger(CachingCatalog.class); @@ -55,14 +54,17 @@ public static Catalog wrap(Catalog catalog, long expirationIntervalMillis) { return wrap(catalog, true, expirationIntervalMillis); } - public static Catalog wrap(Catalog catalog, boolean caseSensitive, long expirationIntervalMillis) { + public static Catalog wrap( + Catalog catalog, boolean caseSensitive, long expirationIntervalMillis) { return new CachingCatalog(catalog, caseSensitive, expirationIntervalMillis); } private final Catalog catalog; private final boolean caseSensitive; + @SuppressWarnings("checkstyle:VisibilityModifier") protected final long expirationIntervalMillis; + @SuppressWarnings("checkstyle:VisibilityModifier") protected final Cache tableCache; @@ -71,8 +73,10 @@ private CachingCatalog(Catalog catalog, boolean caseSensitive, long expirationIn } @SuppressWarnings("checkstyle:VisibilityModifier") - protected CachingCatalog(Catalog catalog, boolean caseSensitive, long expirationIntervalMillis, Ticker ticker) { - Preconditions.checkArgument(expirationIntervalMillis != 0, + protected CachingCatalog( + Catalog catalog, boolean caseSensitive, long expirationIntervalMillis, Ticker ticker) { + Preconditions.checkArgument( + expirationIntervalMillis != 0, "When %s is set to 0, the catalog cache should be disabled. This indicates a bug.", CatalogProperties.CACHE_EXPIRATION_INTERVAL_MS); this.catalog = catalog; @@ -85,7 +89,8 @@ protected CachingCatalog(Catalog catalog, boolean caseSensitive, long expiration * RemovalListener class for removing metadata tables when their associated data table is expired * via cache expiration. */ - class MetadataTableInvalidatingRemovalListener implements RemovalListener { + class MetadataTableInvalidatingRemovalListener + implements RemovalListener { @Override public void onRemoval(TableIdentifier tableIdentifier, Table table, RemovalCause cause) { LOG.debug("Evicted {} from the table cache ({})", tableIdentifier, cause); @@ -98,9 +103,7 @@ public void onRemoval(TableIdentifier tableIdentifier, Table table, RemovalCause } private Cache createTableCache(Ticker ticker) { - Caffeine cacheBuilder = Caffeine - .newBuilder() - .softValues(); + Caffeine cacheBuilder = Caffeine.newBuilder().softValues(); if (expirationIntervalMillis > 0) { return cacheBuilder @@ -141,18 +144,20 @@ public Table loadTable(TableIdentifier ident) { } if (MetadataTableUtils.hasMetadataTableName(canonicalized)) { - TableIdentifier originTableIdentifier = TableIdentifier.of(canonicalized.namespace().levels()); + TableIdentifier originTableIdentifier = + TableIdentifier.of(canonicalized.namespace().levels()); Table originTable = tableCache.get(originTableIdentifier, catalog::loadTable); - // share TableOperations instance of origin table for all metadata tables, so that metadata table instances are + // share TableOperations instance of origin table for all metadata tables, so that metadata + // table instances are // also refreshed as well when origin table instance is refreshed. 
if (originTable instanceof HasTableOperations) { TableOperations ops = ((HasTableOperations) originTable).operations(); MetadataTableType type = MetadataTableType.from(canonicalized.name()); - Table metadataTable = MetadataTableUtils.createMetadataTableInstance( - ops, catalog.name(), originTableIdentifier, - canonicalized, type); + Table metadataTable = + MetadataTableUtils.createMetadataTableInstance( + ops, catalog.name(), originTableIdentifier, canonicalized, type); tableCache.put(canonicalized, metadataTable); return metadataTable; } @@ -248,10 +253,13 @@ public TableBuilder withProperty(String key, String value) { @Override public Table create() { AtomicBoolean created = new AtomicBoolean(false); - Table table = tableCache.get(canonicalizeIdentifier(ident), identifier -> { - created.set(true); - return innerBuilder.create(); - }); + Table table = + tableCache.get( + canonicalizeIdentifier(ident), + identifier -> { + created.set(true); + return innerBuilder.create(); + }); if (!created.get()) { throw new AlreadyExistsException("Table already exists: %s", ident); @@ -262,28 +270,33 @@ public Table create() { @Override public Transaction createTransaction() { - // create a new transaction without altering the cache. the table doesn't exist until the transaction is - // committed. if the table is created before the transaction commits, any cached version is correct and the - // transaction create will fail. if the transaction commits before another create, then the cache will be empty. + // create a new transaction without altering the cache. the table doesn't exist until the + // transaction is + // committed. if the table is created before the transaction commits, any cached version is + // correct and the + // transaction create will fail. if the transaction commits before another create, then the + // cache will be empty. return innerBuilder.createTransaction(); } @Override public Transaction replaceTransaction() { - // create a new transaction without altering the cache. the table doesn't change until the transaction is - // committed. when the transaction commits, invalidate the table in the cache if it is present. + // create a new transaction without altering the cache. the table doesn't change until the + // transaction is + // committed. when the transaction commits, invalidate the table in the cache if it is + // present. return CommitCallbackTransaction.addCallback( - innerBuilder.replaceTransaction(), - () -> invalidateTable(ident)); + innerBuilder.replaceTransaction(), () -> invalidateTable(ident)); } @Override public Transaction createOrReplaceTransaction() { - // create a new transaction without altering the cache. the table doesn't change until the transaction is - // committed. when the transaction commits, invalidate the table in the cache if it is present. + // create a new transaction without altering the cache. the table doesn't change until the + // transaction is + // committed. when the transaction commits, invalidate the table in the cache if it is + // present. 
return CommitCallbackTransaction.addCallback( - innerBuilder.createOrReplaceTransaction(), - () -> invalidateTable(ident)); + innerBuilder.createOrReplaceTransaction(), () -> invalidateTable(ident)); } } } diff --git a/core/src/main/java/org/apache/iceberg/CatalogProperties.java b/core/src/main/java/org/apache/iceberg/CatalogProperties.java index d96784162176..e9bc7d556ba3 100644 --- a/core/src/main/java/org/apache/iceberg/CatalogProperties.java +++ b/core/src/main/java/org/apache/iceberg/CatalogProperties.java @@ -16,15 +16,13 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg; import java.util.concurrent.TimeUnit; public class CatalogProperties { - private CatalogProperties() { - } + private CatalogProperties() {} public static final String CATALOG_IMPL = "catalog-impl"; public static final String FILE_IO_IMPL = "io-impl"; @@ -34,32 +32,38 @@ private CatalogProperties() { /** * Controls whether the catalog will cache table entries upon load. - *

- * If {@link #CACHE_EXPIRATION_INTERVAL_MS} is set to zero, this value - * will be ignored and the cache will be disabled. + * + *

If {@link #CACHE_EXPIRATION_INTERVAL_MS} is set to zero, this value will be ignored and the + * cache will be disabled. */ public static final String CACHE_ENABLED = "cache-enabled"; + public static final boolean CACHE_ENABLED_DEFAULT = true; /** * Controls the duration for which entries in the catalog are cached. - *

- * Behavior of specific values of cache.expiration-interval-ms: + * + *

Behavior of specific values of cache.expiration-interval-ms: + * *

    - *
  • Zero - Caching and cache expiration are both disabled
  • - *
  • Negative Values - Cache expiration is turned off and entries expire only on refresh etc
  • - *
  • Positive Values - Cache entries expire if not accessed via the cache after this many milliseconds
  • + *
  • Zero - Caching and cache expiration are both disabled + *
  • Negative Values - Cache expiration is turned off and entries expire only on refresh etc + *
  • Positive Values - Cache entries expire if not accessed via the cache after this many + * milliseconds *
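As a usage sketch of this setting, a catalog can be wrapped with the CachingCatalog shown earlier, skipping the wrap when the interval is zero; the case-sensitivity flag and the helper name are assumptions for illustration:

import java.util.Map;
import org.apache.iceberg.CachingCatalog;
import org.apache.iceberg.CatalogProperties;
import org.apache.iceberg.catalog.Catalog;
import org.apache.iceberg.util.PropertyUtil;

class CatalogCacheExample {
  // Wrap a catalog so cached table entries expire after the configured interval; a zero value
  // disables caching, and negative values keep entries until they are refreshed or invalidated.
  static Catalog withTableCache(Catalog catalog, Map<String, String> properties) {
    long expirationMs =
        PropertyUtil.propertyAsLong(
            properties,
            CatalogProperties.CACHE_EXPIRATION_INTERVAL_MS,
            CatalogProperties.CACHE_EXPIRATION_INTERVAL_MS_DEFAULT);
    return expirationMs == 0 ? catalog : CachingCatalog.wrap(catalog, true, expirationMs);
  }
}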
*/ public static final String CACHE_EXPIRATION_INTERVAL_MS = "cache.expiration-interval-ms"; + public static final long CACHE_EXPIRATION_INTERVAL_MS_DEFAULT = TimeUnit.SECONDS.toMillis(30); public static final long CACHE_EXPIRATION_INTERVAL_MS_OFF = -1; public static final String URI = "uri"; public static final String CLIENT_POOL_SIZE = "clients"; public static final int CLIENT_POOL_SIZE_DEFAULT = 2; - public static final String CLIENT_POOL_CACHE_EVICTION_INTERVAL_MS = "client.pool.cache.eviction-interval-ms"; - public static final long CLIENT_POOL_CACHE_EVICTION_INTERVAL_MS_DEFAULT = TimeUnit.MINUTES.toMillis(5); + public static final String CLIENT_POOL_CACHE_EVICTION_INTERVAL_MS = + "client.pool.cache.eviction-interval-ms"; + public static final long CLIENT_POOL_CACHE_EVICTION_INTERVAL_MS_DEFAULT = + TimeUnit.MINUTES.toMillis(5); public static final String LOCK_IMPL = "lock-impl"; diff --git a/core/src/main/java/org/apache/iceberg/CatalogUtil.java b/core/src/main/java/org/apache/iceberg/CatalogUtil.java index b837b0cfb8c9..1ce3f73e8f98 100644 --- a/core/src/main/java/org/apache/iceberg/CatalogUtil.java +++ b/core/src/main/java/org/apache/iceberg/CatalogUtil.java @@ -16,9 +16,11 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg; +import static org.apache.iceberg.TableProperties.GC_ENABLED; +import static org.apache.iceberg.TableProperties.GC_ENABLED_DEFAULT; + import java.io.IOException; import java.util.Locale; import java.util.Map; @@ -42,35 +44,33 @@ import org.slf4j.Logger; import org.slf4j.LoggerFactory; -import static org.apache.iceberg.TableProperties.GC_ENABLED; -import static org.apache.iceberg.TableProperties.GC_ENABLED_DEFAULT; - public class CatalogUtil { private static final Logger LOG = LoggerFactory.getLogger(CatalogUtil.class); /** - * Shortcut catalog property to load a catalog implementation through a short type name, - * instead of specifying a full java class through {@link CatalogProperties#CATALOG_IMPL}. - * Currently the following type to implementation mappings are supported: + * Shortcut catalog property to load a catalog implementation through a short type name, instead + * of specifying a full java class through {@link CatalogProperties#CATALOG_IMPL}. Currently the + * following type to implementation mappings are supported: + * *
<ul>
- *   <li>hive: org.apache.iceberg.hive.HiveCatalog
- *   <li>hadoop: org.apache.iceberg.hadoop.HadoopCatalog
+ *   <li>hive: org.apache.iceberg.hive.HiveCatalog
+ *   <li>hadoop: org.apache.iceberg.hadoop.HadoopCatalog
 * </ul>
*/ public static final String ICEBERG_CATALOG_TYPE = "type"; + public static final String ICEBERG_CATALOG_TYPE_HADOOP = "hadoop"; public static final String ICEBERG_CATALOG_TYPE_HIVE = "hive"; public static final String ICEBERG_CATALOG_HIVE = "org.apache.iceberg.hive.HiveCatalog"; public static final String ICEBERG_CATALOG_HADOOP = "org.apache.iceberg.hadoop.HadoopCatalog"; - private CatalogUtil() { - } + private CatalogUtil() {} /** * Drops all data and metadata files referenced by TableMetadata. - *

- * This should be called by dropTable implementations to clean up table files once the table has been dropped in the - * metastore. + * + *

This should be called by dropTable implementations to clean up table files once the table + * has been dropped in the metastore. * * @param io a FileIO to use for deletes * @param metadata the last valid TableMetadata instance for a dropped table. @@ -94,7 +94,8 @@ public static void dropTableData(FileIO io, TableMetadata metadata) { // run all of the deletes - boolean gcEnabled = PropertyUtil.propertyAsBoolean(metadata.properties(), GC_ENABLED, GC_ENABLED_DEFAULT); + boolean gcEnabled = + PropertyUtil.propertyAsBoolean(metadata.properties(), GC_ENABLED, GC_ENABLED_DEFAULT); if (gcEnabled) { // delete data files only if we are sure this won't corrupt other tables @@ -103,67 +104,80 @@ public static void dropTableData(FileIO io, TableMetadata metadata) { Tasks.foreach(Iterables.transform(manifestsToDelete, ManifestFile::path)) .executeWith(ThreadPools.getWorkerPool()) - .noRetry().suppressFailureWhenFinished() + .noRetry() + .suppressFailureWhenFinished() .onFailure((manifest, exc) -> LOG.warn("Delete failed for manifest: {}", manifest, exc)) .run(io::deleteFile); Tasks.foreach(manifestListsToDelete) .executeWith(ThreadPools.getWorkerPool()) - .noRetry().suppressFailureWhenFinished() + .noRetry() + .suppressFailureWhenFinished() .onFailure((list, exc) -> LOG.warn("Delete failed for manifest list: {}", list, exc)) .run(io::deleteFile); - Tasks.foreach(Iterables.transform(metadata.previousFiles(), TableMetadata.MetadataLogEntry::file)) + Tasks.foreach( + Iterables.transform(metadata.previousFiles(), TableMetadata.MetadataLogEntry::file)) .executeWith(ThreadPools.getWorkerPool()) - .noRetry().suppressFailureWhenFinished() - .onFailure((metadataFile, exc) -> LOG.warn("Delete failed for previous metadata file: {}", metadataFile, exc)) + .noRetry() + .suppressFailureWhenFinished() + .onFailure( + (metadataFile, exc) -> + LOG.warn("Delete failed for previous metadata file: {}", metadataFile, exc)) .run(io::deleteFile); Tasks.foreach(metadata.metadataFileLocation()) - .noRetry().suppressFailureWhenFinished() - .onFailure((metadataFile, exc) -> LOG.warn("Delete failed for metadata file: {}", metadataFile, exc)) + .noRetry() + .suppressFailureWhenFinished() + .onFailure( + (metadataFile, exc) -> + LOG.warn("Delete failed for metadata file: {}", metadataFile, exc)) .run(io::deleteFile); } @SuppressWarnings("DangerousStringInternUsage") private static void deleteFiles(FileIO io, Set allManifests) { // keep track of deleted files in a map that can be cleaned up when memory runs low - Map deletedFiles = new MapMaker() - .concurrencyLevel(ThreadPools.WORKER_THREAD_POOL_SIZE) - .weakKeys() - .makeMap(); + Map deletedFiles = + new MapMaker().concurrencyLevel(ThreadPools.WORKER_THREAD_POOL_SIZE).weakKeys().makeMap(); Tasks.foreach(allManifests) - .noRetry().suppressFailureWhenFinished() + .noRetry() + .suppressFailureWhenFinished() .executeWith(ThreadPools.getWorkerPool()) - .onFailure((item, exc) -> LOG.warn("Failed to get deleted files: this may cause orphaned data files", exc)) - .run(manifest -> { - try (ManifestReader reader = ManifestFiles.open(manifest, io)) { - for (ManifestEntry entry : reader.entries()) { - // intern the file path because the weak key map uses identity (==) instead of equals - String path = entry.file().path().toString().intern(); - Boolean alreadyDeleted = deletedFiles.putIfAbsent(path, true); - if (alreadyDeleted == null || !alreadyDeleted) { - try { - io.deleteFile(path); - } catch (RuntimeException e) { - // this may happen if the map of deleted files gets cleaned up by gc - 
LOG.warn("Delete failed for data file: {}", path, e); + .onFailure( + (item, exc) -> + LOG.warn("Failed to get deleted files: this may cause orphaned data files", exc)) + .run( + manifest -> { + try (ManifestReader reader = ManifestFiles.open(manifest, io)) { + for (ManifestEntry entry : reader.entries()) { + // intern the file path because the weak key map uses identity (==) instead of + // equals + String path = entry.file().path().toString().intern(); + Boolean alreadyDeleted = deletedFiles.putIfAbsent(path, true); + if (alreadyDeleted == null || !alreadyDeleted) { + try { + io.deleteFile(path); + } catch (RuntimeException e) { + // this may happen if the map of deleted files gets cleaned up by gc + LOG.warn("Delete failed for data file: {}", path, e); + } + } } + } catch (IOException e) { + throw new RuntimeIOException( + e, "Failed to read manifest file: %s", manifest.path()); } - } - } catch (IOException e) { - throw new RuntimeIOException(e, "Failed to read manifest file: %s", manifest.path()); - } - }); + }); } /** * Load a custom catalog implementation. - *
<p>
- * The catalog must have a no-arg constructor. - * If the class implements Configurable, a Hadoop config will be passed using Configurable.setConf. - * {@link Catalog#initialize(String catalogName, Map options)} is called to complete the initialization. + * + * <p>
The catalog must have a no-arg constructor. If the class implements Configurable, a Hadoop + * config will be passed using Configurable.setConf. {@link Catalog#initialize(String catalogName, + * Map options)} is called to complete the initialization. * * @param impl catalog implementation full class name * @param catalogName catalog name @@ -173,17 +187,15 @@ private static void deleteFiles(FileIO io, Set allManifests) { * @throws IllegalArgumentException if no-arg constructor not found or error during initialization */ public static Catalog loadCatalog( - String impl, - String catalogName, - Map properties, - Object hadoopConf) { + String impl, String catalogName, Map properties, Object hadoopConf) { Preconditions.checkNotNull(impl, "Cannot initialize custom Catalog, impl class name is null"); DynConstructors.Ctor ctor; try { ctor = DynConstructors.builder(Catalog.class).impl(impl).buildChecked(); } catch (NoSuchMethodException e) { - throw new IllegalArgumentException(String.format( - "Cannot initialize Catalog implementation %s: %s", impl, e.getMessage()), e); + throw new IllegalArgumentException( + String.format("Cannot initialize Catalog implementation %s: %s", impl, e.getMessage()), + e); } Catalog catalog; @@ -202,11 +214,12 @@ public static Catalog loadCatalog( } /** - * Build an Iceberg {@link Catalog} based on a map of catalog properties and optional Hadoop configuration. - *
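As the loadCatalog javadoc above states, the implementation only needs a no-arg constructor; the Hadoop config and properties are injected afterwards. A hedged usage sketch (the impl class, catalog name and property are placeholders):

```java
import java.util.Collections;
import java.util.Map;
import org.apache.hadoop.conf.Configuration;
import org.apache.iceberg.CatalogUtil;
import org.apache.iceberg.catalog.Catalog;

public class LoadCatalogExample {
  public static Catalog load() {
    // placeholder property; real catalogs take their own keys
    Map<String, String> properties = Collections.singletonMap("uri", "thrift://metastore:9083");
    // passed as Object so iceberg-core does not depend on Hadoop directly
    Configuration hadoopConf = new Configuration();
    return CatalogUtil.loadCatalog(
        "org.apache.iceberg.hive.HiveCatalog", // any Catalog impl with a no-arg constructor
        "my_catalog",
        properties,
        hadoopConf);
  }
}
```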
<p>
- * This method examines both the {@link #ICEBERG_CATALOG_TYPE} and {@link CatalogProperties#CATALOG_IMPL} properties - * to determine the catalog implementation to load. - * If nothing is specified for both properties, Hive catalog will be loaded by default. + * Build an Iceberg {@link Catalog} based on a map of catalog properties and optional Hadoop + * configuration. + * + * <p>
This method examines both the {@link #ICEBERG_CATALOG_TYPE} and {@link + * CatalogProperties#CATALOG_IMPL} properties to determine the catalog implementation to load. If + * nothing is specified for both properties, Hive catalog will be loaded by default. * * @param name catalog name * @param options catalog properties @@ -216,7 +229,8 @@ public static Catalog loadCatalog( public static Catalog buildIcebergCatalog(String name, Map options, Object conf) { String catalogImpl = options.get(CatalogProperties.CATALOG_IMPL); if (catalogImpl == null) { - String catalogType = PropertyUtil.propertyAsString(options, ICEBERG_CATALOG_TYPE, ICEBERG_CATALOG_TYPE_HIVE); + String catalogType = + PropertyUtil.propertyAsString(options, ICEBERG_CATALOG_TYPE, ICEBERG_CATALOG_TYPE_HIVE); switch (catalogType.toLowerCase(Locale.ENGLISH)) { case ICEBERG_CATALOG_TYPE_HIVE: catalogImpl = ICEBERG_CATALOG_HIVE; @@ -229,9 +243,12 @@ public static Catalog buildIcebergCatalog(String name, Map optio } } else { String catalogType = options.get(ICEBERG_CATALOG_TYPE); - Preconditions.checkArgument(catalogType == null, + Preconditions.checkArgument( + catalogType == null, "Cannot create catalog %s, both type and catalog-impl are set: type=%s, catalog-impl=%s", - name, catalogType, catalogImpl); + name, + catalogType, + catalogImpl); } return CatalogUtil.loadCatalog(catalogImpl, name, options, conf); @@ -239,31 +256,30 @@ public static Catalog buildIcebergCatalog(String name, Map optio /** * Load a custom {@link FileIO} implementation. - *
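The buildIcebergCatalog hunk above resolves either the "type" shortcut or an explicit catalog-impl, and rejects setting both. A small sketch of the property-driven path (the warehouse location is a placeholder):

```java
import java.util.HashMap;
import java.util.Map;
import org.apache.hadoop.conf.Configuration;
import org.apache.iceberg.CatalogUtil;
import org.apache.iceberg.catalog.Catalog;

public class BuildCatalogExample {
  public static Catalog build() {
    Map<String, String> options = new HashMap<>();
    options.put("type", "hadoop"); // ICEBERG_CATALOG_TYPE; "hive" is the default when unset
    options.put("warehouse", "s3://bucket/warehouse"); // placeholder location
    // setting both "type" and "catalog-impl" would trip the precondition shown above
    return CatalogUtil.buildIcebergCatalog("prod", options, new Configuration());
  }
}
```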
<p>
- * The implementation must have a no-arg constructor. - * If the class implements Configurable, a Hadoop config will be passed using Configurable.setConf. - * {@link FileIO#initialize(Map properties)} is called to complete the initialization. + * + * <p>
The implementation must have a no-arg constructor. If the class implements Configurable, a + * Hadoop config will be passed using Configurable.setConf. {@link FileIO#initialize(Map + * properties)} is called to complete the initialization. * * @param impl full class name of a custom FileIO implementation * @param properties used to initialize the FileIO implementation * @param hadoopConf a hadoop Configuration * @return FileIO class - * @throws IllegalArgumentException if class path not found or - * right constructor not found or - * the loaded class cannot be cast to the given interface type + * @throws IllegalArgumentException if class path not found or right constructor not found or the + * loaded class cannot be cast to the given interface type */ - public static FileIO loadFileIO( - String impl, - Map properties, - Object hadoopConf) { + public static FileIO loadFileIO(String impl, Map properties, Object hadoopConf) { LOG.info("Loading custom FileIO implementation: {}", impl); DynConstructors.Ctor ctor; try { ctor = - DynConstructors.builder(FileIO.class).loader(CatalogUtil.class.getClassLoader()).impl(impl).buildChecked(); + DynConstructors.builder(FileIO.class) + .loader(CatalogUtil.class.getClassLoader()) + .impl(impl) + .buildChecked(); } catch (NoSuchMethodException e) { - throw new IllegalArgumentException(String.format( - "Cannot initialize FileIO, missing no-arg constructor: %s", impl), e); + throw new IllegalArgumentException( + String.format("Cannot initialize FileIO, missing no-arg constructor: %s", impl), e); } FileIO fileIO; @@ -282,6 +298,7 @@ public static FileIO loadFileIO( /** * Dynamically detects whether an object is a Hadoop Configurable and calls setConf. + * * @param maybeConfigurable an object that may be Configurable * @param conf a Configuration */ @@ -305,10 +322,11 @@ public static void configureHadoopConf(Object maybeConfigurable, Object conf) { Class configurableInterface; try { // load the Configurable interface - configurableInterface = DynClasses.builder() - .loader(maybeConfigurableLoader) - .impl("org.apache.hadoop.conf.Configurable") - .buildChecked(); + configurableInterface = + DynClasses.builder() + .loader(maybeConfigurableLoader) + .impl("org.apache.hadoop.conf.Configurable") + .buildChecked(); } catch (ClassNotFoundException e) { // not Configurable because it was loaded and Configurable is not present in its classloader return; @@ -321,27 +339,35 @@ public static void configureHadoopConf(Object maybeConfigurable, Object conf) { Class configurationClass; try { - configurationClass = DynClasses.builder() - .loader(maybeConfigurableLoader) - .impl("org.apache.hadoop.conf.Configuration") - .buildChecked(); + configurationClass = + DynClasses.builder() + .loader(maybeConfigurableLoader) + .impl("org.apache.hadoop.conf.Configuration") + .buildChecked(); } catch (ClassNotFoundException e) { - // this shouldn't happen because Configurable cannot be loaded without first loading Configuration - throw new UnsupportedOperationException("Failed to load Configuration after loading Configurable", e); + // this shouldn't happen because Configurable cannot be loaded without first loading + // Configuration + throw new UnsupportedOperationException( + "Failed to load Configuration after loading Configurable", e); } - ValidationException.check(configurationClass.isInstance(conf), - "%s is not an instance of Configuration from the classloader for %s", conf, maybeConfigurable); + ValidationException.check( + configurationClass.isInstance(conf), + "%s is not an 
instance of Configuration from the classloader for %s", + conf, + maybeConfigurable); DynMethods.BoundMethod setConf; try { - setConf = DynMethods.builder("setConf") - .impl(configurableInterface, configurationClass) - .buildChecked() - .bind(maybeConfigurable); + setConf = + DynMethods.builder("setConf") + .impl(configurableInterface, configurationClass) + .buildChecked() + .bind(maybeConfigurable); } catch (NoSuchMethodException e) { // this shouldn't happen because Configurable was loaded and defines setConf - throw new UnsupportedOperationException("Failed to load Configuration.setConf after loading Configurable", e); + throw new UnsupportedOperationException( + "Failed to load Configuration.setConf after loading Configurable", e); } setConf.invoke(conf); diff --git a/core/src/main/java/org/apache/iceberg/CherryPickOperation.java b/core/src/main/java/org/apache/iceberg/CherryPickOperation.java index 3e6978052281..77de54279768 100644 --- a/core/src/main/java/org/apache/iceberg/CherryPickOperation.java +++ b/core/src/main/java/org/apache/iceberg/CherryPickOperation.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg; import java.util.List; @@ -32,9 +31,9 @@ /** * Cherry-picks or fast-forwards the current state to a snapshot. - *
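The loadFileIO and configureHadoopConf hunks above use the same reflection pattern as loadCatalog. A brief sketch of loading a FileIO implementation (HadoopFileIO is just one example of an impl with a no-arg constructor; properties here are empty):

```java
import java.util.Collections;
import java.util.Map;
import org.apache.hadoop.conf.Configuration;
import org.apache.iceberg.CatalogUtil;
import org.apache.iceberg.io.FileIO;

public class LoadFileIOExample {
  public static FileIO load() {
    Map<String, String> properties = Collections.emptyMap();
    // HadoopFileIO implements Configurable, so the Configuration is injected via setConf
    return CatalogUtil.loadFileIO(
        "org.apache.iceberg.hadoop.HadoopFileIO", properties, new Configuration());
  }
}
```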
<p>
- * This update is not exposed though the Table API. Instead, it is a package-private part of the Transaction API - * intended for use in {@link ManageSnapshots}. + * + * <p>
This update is not exposed though the Table API. Instead, it is a package-private part of the + * Transaction API intended for use in {@link ManageSnapshots}. */ class CherryPickOperation extends MergingSnapshotProducer { @@ -57,7 +56,8 @@ protected CherryPickOperation self() { @Override protected String operation() { - // snapshotOperation is used by SnapshotProducer when building and writing a new snapshot for cherrypick + // snapshotOperation is used by SnapshotProducer when building and writing a new snapshot for + // cherrypick Preconditions.checkNotNull(cherrypickSnapshot, "[BUG] Detected uninitialized operation"); return cherrypickSnapshot.operation(); } @@ -65,8 +65,8 @@ protected String operation() { public CherryPickOperation cherrypick(long snapshotId) { TableMetadata current = current(); this.cherrypickSnapshot = current.snapshot(snapshotId); - ValidationException.check(cherrypickSnapshot != null, - "Cannot cherry-pick unknown snapshot ID: %s", snapshotId); + ValidationException.check( + cherrypickSnapshot != null, "Cannot cherry-pick unknown snapshot ID: %s", snapshotId); if (cherrypickSnapshot.operation().equals(DataOperations.APPEND)) { // this property is set on target snapshot that will get published @@ -83,14 +83,19 @@ public CherryPickOperation cherrypick(long snapshotId) { add(addedFile); } - } else if (cherrypickSnapshot.operation().equals(DataOperations.OVERWRITE) && - PropertyUtil.propertyAsBoolean(cherrypickSnapshot.summary(), SnapshotSummary.REPLACE_PARTITIONS_PROP, false)) { - // the operation was ReplacePartitions. this can be cherry-picked iff the partitions have not been modified. - // detecting modification requires finding the new files since the parent was committed, so the parent must be an + } else if (cherrypickSnapshot.operation().equals(DataOperations.OVERWRITE) + && PropertyUtil.propertyAsBoolean( + cherrypickSnapshot.summary(), SnapshotSummary.REPLACE_PARTITIONS_PROP, false)) { + // the operation was ReplacePartitions. this can be cherry-picked iff the partitions have not + // been modified. + // detecting modification requires finding the new files since the parent was committed, so + // the parent must be an // ancestor of the current state, or null if the overwrite was based on an empty table. 
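Because CherryPickOperation is package-private, the supported entry point is the ManageSnapshots API named in the javadoc above. A minimal sketch through the public Table API (the staged snapshot id is a placeholder):

```java
import org.apache.iceberg.Table;

public class CherryPickExample {
  public static void cherryPick(Table table, long stagedSnapshotId) {
    // cherry-picks (or fast-forwards to) the staged snapshot and commits the result
    table.manageSnapshots().cherrypick(stagedSnapshotId).commit();
  }
}
```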
ValidationException.check( - cherrypickSnapshot.parentId() == null || isCurrentAncestor(current, cherrypickSnapshot.parentId()), - "Cannot cherry-pick overwrite not based on an ancestor of the current state: %s", snapshotId); + cherrypickSnapshot.parentId() == null + || isCurrentAncestor(current, cherrypickSnapshot.parentId()), + "Cannot cherry-pick overwrite not based on an ancestor of the current state: %s", + snapshotId); // this property is set on target snapshot that will get published String wapId = WapUtil.validateWapPublish(current, snapshotId); @@ -118,7 +123,8 @@ public CherryPickOperation cherrypick(long snapshotId) { } else { // attempt to fast-forward - ValidationException.check(isFastForward(current), + ValidationException.check( + isFastForward(current), "Cannot cherry-pick snapshot %s: not append, dynamic overwrite, or fast-forward", cherrypickSnapshot.snapshotId()); this.requireFastForward = true; @@ -147,7 +153,8 @@ public Object updateEvent() { @Override protected void validate(TableMetadata base) { - // this is only called after apply() passes off to super, but check fast-forward status just in case + // this is only called after apply() passes off to super, but check fast-forward status just in + // case if (!isFastForward(base)) { validateNonAncestor(base, cherrypickSnapshot.snapshotId()); validateReplacedPartitions(base, cherrypickSnapshot.parentId(), replacedPartitions, io); @@ -158,8 +165,8 @@ protected void validate(TableMetadata base) { private boolean isFastForward(TableMetadata base) { if (base.currentSnapshot() != null) { // can fast-forward if the cherry-picked snapshot's parent is the current snapshot - return cherrypickSnapshot.parentId() != null && - base.currentSnapshot().snapshotId() == cherrypickSnapshot.parentId(); + return cherrypickSnapshot.parentId() != null + && base.currentSnapshot().snapshotId() == cherrypickSnapshot.parentId(); } else { // ... 
or if the parent and current snapshot are both null return cherrypickSnapshot.parentId() == null; @@ -177,12 +184,14 @@ public Snapshot apply() { boolean isFastForward = isFastForward(base); if (requireFastForward || isFastForward) { - ValidationException.check(isFastForward, + ValidationException.check( + isFastForward, "Cannot cherry-pick snapshot %s: not append, dynamic overwrite, or fast-forward", cherrypickSnapshot.snapshotId()); return base.snapshot(cherrypickSnapshot.snapshotId()); } else { - // validate(TableMetadata) is called in apply(TableMetadata) after this apply refreshes the table state + // validate(TableMetadata) is called in apply(TableMetadata) after this apply refreshes the + // table state return super.apply(); } } @@ -198,15 +207,18 @@ private static void validateNonAncestor(TableMetadata meta, long snapshotId) { } } - private static void validateReplacedPartitions(TableMetadata meta, Long parentId, - PartitionSet replacedPartitions, FileIO io) { + private static void validateReplacedPartitions( + TableMetadata meta, Long parentId, PartitionSet replacedPartitions, FileIO io) { if (replacedPartitions != null && meta.currentSnapshot() != null) { - ValidationException.check(parentId == null || isCurrentAncestor(meta, parentId), - "Cannot cherry-pick overwrite, based on non-ancestor of the current state: %s", parentId); - List newFiles = SnapshotUtil.newFiles( - parentId, meta.currentSnapshot().snapshotId(), meta::snapshot, io); + ValidationException.check( + parentId == null || isCurrentAncestor(meta, parentId), + "Cannot cherry-pick overwrite, based on non-ancestor of the current state: %s", + parentId); + List newFiles = + SnapshotUtil.newFiles(parentId, meta.currentSnapshot().snapshotId(), meta::snapshot, io); for (DataFile newFile : newFiles) { - ValidationException.check(!replacedPartitions.contains(newFile.specId(), newFile.partition()), + ValidationException.check( + !replacedPartitions.contains(newFile.specId(), newFile.partition()), "Cannot cherry-pick replace partitions with changed partition: %s", newFile.partition()); } @@ -217,7 +229,8 @@ private static Long lookupAncestorBySourceSnapshot(TableMetadata meta, long snap String snapshotIdStr = String.valueOf(snapshotId); for (long ancestorId : currentAncestors(meta)) { Map summary = meta.snapshot(ancestorId).summary(); - if (summary != null && snapshotIdStr.equals(summary.get(SnapshotSummary.SOURCE_SNAPSHOT_ID_PROP))) { + if (summary != null + && snapshotIdStr.equals(summary.get(SnapshotSummary.SOURCE_SNAPSHOT_ID_PROP))) { return ancestorId; } } diff --git a/core/src/main/java/org/apache/iceberg/ClientPool.java b/core/src/main/java/org/apache/iceberg/ClientPool.java index 117939af0f94..ca5d8fc1aca3 100644 --- a/core/src/main/java/org/apache/iceberg/ClientPool.java +++ b/core/src/main/java/org/apache/iceberg/ClientPool.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg; public interface ClientPool { diff --git a/core/src/main/java/org/apache/iceberg/ClientPoolImpl.java b/core/src/main/java/org/apache/iceberg/ClientPoolImpl.java index 18f7afa17a68..f63e0e0f3338 100644 --- a/core/src/main/java/org/apache/iceberg/ClientPoolImpl.java +++ b/core/src/main/java/org/apache/iceberg/ClientPoolImpl.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg; import java.io.Closeable; @@ -26,7 +25,8 @@ import org.slf4j.Logger; import org.slf4j.LoggerFactory; -public abstract class ClientPoolImpl implements Closeable, ClientPool { +public abstract class ClientPoolImpl + implements Closeable, ClientPool { private static final Logger LOG = LoggerFactory.getLogger(ClientPoolImpl.class); private final int poolSize; diff --git a/core/src/main/java/org/apache/iceberg/CommitCallbackTransaction.java b/core/src/main/java/org/apache/iceberg/CommitCallbackTransaction.java index bd15744a87ad..f1ad14590766 100644 --- a/core/src/main/java/org/apache/iceberg/CommitCallbackTransaction.java +++ b/core/src/main/java/org/apache/iceberg/CommitCallbackTransaction.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg; class CommitCallbackTransaction implements Transaction { diff --git a/core/src/main/java/org/apache/iceberg/DataFiles.java b/core/src/main/java/org/apache/iceberg/DataFiles.java index a765dc7fb86a..c0b7f6b46b12 100644 --- a/core/src/main/java/org/apache/iceberg/DataFiles.java +++ b/core/src/main/java/org/apache/iceberg/DataFiles.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg; import java.nio.ByteBuffer; @@ -35,14 +34,14 @@ public class DataFiles { - private DataFiles() { - } + private DataFiles() {} static PartitionData newPartitionData(PartitionSpec spec) { return new PartitionData(spec.partitionType()); } - static PartitionData copyPartitionData(PartitionSpec spec, StructLike partitionData, PartitionData reuse) { + static PartitionData copyPartitionData( + PartitionSpec spec, StructLike partitionData, PartitionData reuse) { PartitionData data = reuse; if (data == null) { data = newPartitionData(spec); @@ -64,20 +63,22 @@ static PartitionData fillFromPath(PartitionSpec spec, String partitionPath, Part } String[] partitions = partitionPath.split("/", -1); - Preconditions.checkArgument(partitions.length <= spec.fields().size(), + Preconditions.checkArgument( + partitions.length <= spec.fields().size(), "Invalid partition data, too many fields (expecting %s): %s", - spec.fields().size(), partitionPath); - Preconditions.checkArgument(partitions.length >= spec.fields().size(), + spec.fields().size(), + partitionPath); + Preconditions.checkArgument( + partitions.length >= spec.fields().size(), "Invalid partition data, not enough fields (expecting %s): %s", - spec.fields().size(), partitionPath); + spec.fields().size(), + partitionPath); for (int i = 0; i < partitions.length; i += 1) { PartitionField field = spec.fields().get(i); String[] parts = partitions[i].split("=", 2); Preconditions.checkArgument( - parts.length == 2 && - parts[0] != null && - field.name().equals(parts[0]), + parts.length == 2 && parts[0] != null && field.name().equals(parts[0]), "Invalid partition: %s", partitions[i]); @@ -160,7 +161,8 @@ public void clear() { public Builder copy(DataFile toCopy) { if (isPartitioned) { - Preconditions.checkState(specId == toCopy.specId(), "Cannot copy a DataFile with a different spec"); + Preconditions.checkState( + specId == toCopy.specId(), "Cannot copy a DataFile with a different spec"); this.partitionData = copyPartitionData(spec, toCopy.partition(), partitionData); } this.filePath = toCopy.path().toString(); @@ -173,8 +175,8 @@ public Builder copy(DataFile toCopy) { this.nanValueCounts = toCopy.nanValueCounts(); this.lowerBounds = 
toCopy.lowerBounds(); this.upperBounds = toCopy.upperBounds(); - this.keyMetadata = toCopy.keyMetadata() == null ? null - : ByteBuffers.copy(toCopy.keyMetadata()); + this.keyMetadata = + toCopy.keyMetadata() == null ? null : ByteBuffers.copy(toCopy.keyMetadata()); this.splitOffsets = toCopy.splitOffsets() == null ? null : copyList(toCopy.splitOffsets()); this.sortOrderId = toCopy.sortOrderId(); return this; @@ -233,7 +235,8 @@ public Builder withFileSizeInBytes(long newFileSizeInBytes) { } public Builder withPartitionPath(String newPartitionPath) { - Preconditions.checkArgument(isPartitioned || newPartitionPath.isEmpty(), + Preconditions.checkArgument( + isPartitioned || newPartitionPath.isEmpty(), "Cannot add partition data for an unpartitioned table"); if (!newPartitionPath.isEmpty()) { this.partitionData = fillFromPath(spec, newPartitionPath, partitionData); @@ -288,10 +291,22 @@ public DataFile build() { Preconditions.checkArgument(recordCount >= 0, "Record count is required"); return new GenericDataFile( - specId, filePath, format, isPartitioned ? partitionData.copy() : null, - fileSizeInBytes, new Metrics( - recordCount, columnSizes, valueCounts, nullValueCounts, nanValueCounts, lowerBounds, upperBounds), - keyMetadata, splitOffsets, sortOrderId); + specId, + filePath, + format, + isPartitioned ? partitionData.copy() : null, + fileSizeInBytes, + new Metrics( + recordCount, + columnSizes, + valueCounts, + nullValueCounts, + nanValueCounts, + lowerBounds, + upperBounds), + keyMetadata, + splitOffsets, + sortOrderId); } } diff --git a/core/src/main/java/org/apache/iceberg/DataFilesTable.java b/core/src/main/java/org/apache/iceberg/DataFilesTable.java index f9fcdd525dc5..ee6d876efa20 100644 --- a/core/src/main/java/org/apache/iceberg/DataFilesTable.java +++ b/core/src/main/java/org/apache/iceberg/DataFilesTable.java @@ -16,14 +16,11 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg; import org.apache.iceberg.io.CloseableIterable; -/** - * A {@link Table} implementation that exposes a table's data files as rows. - */ +/** A {@link Table} implementation that exposes a table's data files as rows. */ public class DataFilesTable extends BaseFilesTable { DataFilesTable(TableOperations ops, Table table) { @@ -55,7 +52,8 @@ public static class DataFilesTableScan extends BaseFilesTableScan { } @Override - protected TableScan newRefinedScan(TableOperations ops, Table table, Schema schema, TableScanContext context) { + protected TableScan newRefinedScan( + TableOperations ops, Table table, Schema schema, TableScanContext context) { return new DataFilesTableScan(ops, table, schema, context); } diff --git a/core/src/main/java/org/apache/iceberg/DataTableScan.java b/core/src/main/java/org/apache/iceberg/DataTableScan.java index b4d51079671f..46ae8ae0ed80 100644 --- a/core/src/main/java/org/apache/iceberg/DataTableScan.java +++ b/core/src/main/java/org/apache/iceberg/DataTableScan.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
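DataFiles.Builder, reformatted above, is the public way to describe a data file before committing it. A hedged sketch in the usual builder style (path, size, counts and partition path are placeholders; the format is inferred from the .parquet extension):

```java
import org.apache.iceberg.DataFile;
import org.apache.iceberg.DataFiles;
import org.apache.iceberg.PartitionSpec;

public class DataFileExample {
  public static DataFile describe(PartitionSpec spec) {
    return DataFiles.builder(spec)
        .withPath("s3://bucket/data/category=books/file-0001.parquet") // placeholder path
        .withFileSizeInBytes(4096L)
        .withPartitionPath("category=books") // must match the spec's partition fields
        .withRecordCount(100)
        .build();
  }
}
```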
*/ - package org.apache.iceberg; import org.apache.iceberg.io.CloseableIterable; @@ -26,14 +25,29 @@ import org.apache.iceberg.util.SnapshotUtil; public class DataTableScan extends BaseTableScan { - static final ImmutableList SCAN_COLUMNS = ImmutableList.of( - "snapshot_id", "file_path", "file_ordinal", "file_format", "block_size_in_bytes", - "file_size_in_bytes", "record_count", "partition", "key_metadata", "split_offsets" - ); - static final ImmutableList SCAN_WITH_STATS_COLUMNS = ImmutableList.builder() - .addAll(SCAN_COLUMNS) - .add("value_counts", "null_value_counts", "nan_value_counts", "lower_bounds", "upper_bounds", "column_sizes") - .build(); + static final ImmutableList SCAN_COLUMNS = + ImmutableList.of( + "snapshot_id", + "file_path", + "file_ordinal", + "file_format", + "block_size_in_bytes", + "file_size_in_bytes", + "record_count", + "partition", + "key_metadata", + "split_offsets"); + static final ImmutableList SCAN_WITH_STATS_COLUMNS = + ImmutableList.builder() + .addAll(SCAN_COLUMNS) + .add( + "value_counts", + "null_value_counts", + "nan_value_counts", + "lower_bounds", + "upper_bounds", + "column_sizes") + .build(); static final boolean PLAN_SCANS_WITH_WORKER_POOL = SystemProperties.getBoolean(SystemProperties.SCAN_THREAD_POOL_ENABLED, true); @@ -41,22 +55,30 @@ public DataTableScan(TableOperations ops, Table table) { super(ops, table, table.schema()); } - protected DataTableScan(TableOperations ops, Table table, Schema schema, TableScanContext context) { + protected DataTableScan( + TableOperations ops, Table table, Schema schema, TableScanContext context) { super(ops, table, schema, context); } @Override public TableScan appendsBetween(long fromSnapshotId, long toSnapshotId) { - Preconditions.checkState(snapshotId() == null, - "Cannot enable incremental scan, scan-snapshot set to id=%s", snapshotId()); - return new IncrementalDataTableScan(tableOps(), table(), schema(), + Preconditions.checkState( + snapshotId() == null, + "Cannot enable incremental scan, scan-snapshot set to id=%s", + snapshotId()); + return new IncrementalDataTableScan( + tableOps(), + table(), + schema(), context().fromSnapshotIdExclusive(fromSnapshotId).toSnapshotId(toSnapshotId)); } @Override public TableScan appendsAfter(long fromSnapshotId) { Snapshot currentSnapshot = table().currentSnapshot(); - Preconditions.checkState(currentSnapshot != null, "Cannot scan appends after %s, there is no current snapshot", + Preconditions.checkState( + currentSnapshot != null, + "Cannot scan appends after %s, there is no current snapshot", fromSnapshotId); return appendsBetween(fromSnapshotId, currentSnapshot.snapshotId()); } @@ -67,11 +89,13 @@ public TableScan useSnapshot(long scanSnapshotId) { // we do not use its return value super.useSnapshot(scanSnapshotId); Schema snapshotSchema = SnapshotUtil.schemaFor(table(), scanSnapshotId); - return newRefinedScan(tableOps(), table(), snapshotSchema, context().useSnapshotId(scanSnapshotId)); + return newRefinedScan( + tableOps(), table(), snapshotSchema, context().useSnapshotId(scanSnapshotId)); } @Override - protected TableScan newRefinedScan(TableOperations ops, Table table, Schema schema, TableScanContext context) { + protected TableScan newRefinedScan( + TableOperations ops, Table table, Schema schema, TableScanContext context) { return new DataTableScan(ops, table, schema, context); } @@ -80,19 +104,20 @@ public CloseableIterable doPlanFiles() { Snapshot snapshot = snapshot(); FileIO io = table().io(); - ManifestGroup manifestGroup = new ManifestGroup(io, 
snapshot.dataManifests(io), snapshot.deleteManifests(io)) - .caseSensitive(isCaseSensitive()) - .select(colStats() ? SCAN_WITH_STATS_COLUMNS : SCAN_COLUMNS) - .filterData(filter()) - .specsById(table().specs()) - .ignoreDeleted(); + ManifestGroup manifestGroup = + new ManifestGroup(io, snapshot.dataManifests(io), snapshot.deleteManifests(io)) + .caseSensitive(isCaseSensitive()) + .select(colStats() ? SCAN_WITH_STATS_COLUMNS : SCAN_COLUMNS) + .filterData(filter()) + .specsById(table().specs()) + .ignoreDeleted(); if (shouldIgnoreResiduals()) { manifestGroup = manifestGroup.ignoreResiduals(); } - if (snapshot.dataManifests(io).size() > 1 && - (PLAN_SCANS_WITH_WORKER_POOL || context().planWithCustomizedExecutor())) { + if (snapshot.dataManifests(io).size() > 1 + && (PLAN_SCANS_WITH_WORKER_POOL || context().planWithCustomizedExecutor())) { manifestGroup = manifestGroup.planWith(planExecutor()); } diff --git a/core/src/main/java/org/apache/iceberg/DeleteFileIndex.java b/core/src/main/java/org/apache/iceberg/DeleteFileIndex.java index f8fab0930585..877c83cf8aea 100644 --- a/core/src/main/java/org/apache/iceberg/DeleteFileIndex.java +++ b/core/src/main/java/org/apache/iceberg/DeleteFileIndex.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg; import com.github.benmanes.caffeine.cache.Caffeine; @@ -60,9 +59,10 @@ /** * An index of {@link DeleteFile delete files} by sequence number. - *
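DataTableScan above is what Table.newScan() drives for the main data table; includeColumnStats() is what switches manifest reading to SCAN_WITH_STATS_COLUMNS. A hedged sketch through the public scan API (the filter column and value are placeholders):

```java
import java.io.IOException;
import org.apache.iceberg.FileScanTask;
import org.apache.iceberg.Table;
import org.apache.iceberg.TableScan;
import org.apache.iceberg.expressions.Expressions;
import org.apache.iceberg.io.CloseableIterable;

public class ScanExample {
  public static long countRecords(Table table) throws IOException {
    TableScan scan =
        table.newScan()
            .filter(Expressions.equal("event_date", "2022-05-01")) // placeholder predicate
            .includeColumnStats();
    long count = 0;
    try (CloseableIterable<FileScanTask> tasks = scan.planFiles()) {
      for (FileScanTask task : tasks) {
        count += task.file().recordCount();
      }
    }
    return count;
  }
}
```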
<p>
- * Use {@link #builderFor(FileIO, Iterable)} to construct an index, and {@link #forDataFile(long, DataFile)} or - * {@link #forEntry(ManifestEntry)} to get the delete files to apply to a given data file. + * + * <p>
Use {@link #builderFor(FileIO, Iterable)} to construct an index, and {@link #forDataFile(long, + * DataFile)} or {@link #forEntry(ManifestEntry)} to get the delete files to apply to a given data + * file. */ class DeleteFileIndex { private final Map specsById; @@ -70,10 +70,14 @@ class DeleteFileIndex { private final Map> wrapperById; private final long[] globalSeqs; private final DeleteFile[] globalDeletes; - private final Map, Pair> sortedDeletesByPartition; - - DeleteFileIndex(Map specsById, long[] globalSeqs, DeleteFile[] globalDeletes, - Map, Pair> sortedDeletesByPartition) { + private final Map, Pair> + sortedDeletesByPartition; + + DeleteFileIndex( + Map specsById, + long[] globalSeqs, + DeleteFile[] globalDeletes, + Map, Pair> sortedDeletesByPartition) { this.specsById = specsById; ImmutableMap.Builder builder = ImmutableMap.builder(); specsById.forEach((specId, spec) -> builder.put(specId, spec.partitionType())); @@ -85,7 +89,8 @@ class DeleteFileIndex { } public boolean isEmpty() { - return (globalDeletes == null || globalDeletes.length == 0) && sortedDeletesByPartition.isEmpty(); + return (globalDeletes == null || globalDeletes.length == 0) + && sortedDeletesByPartition.isEmpty(); } public Iterable referencedDeleteFiles() { @@ -107,8 +112,8 @@ private StructLikeWrapper newWrapper(int specId) { } private Pair partition(int specId, StructLike struct) { - ThreadLocal wrapper = wrapperById.computeIfAbsent(specId, - id -> ThreadLocal.withInitial(() -> newWrapper(id))); + ThreadLocal wrapper = + wrapperById.computeIfAbsent(specId, id -> ThreadLocal.withInitial(() -> newWrapper(id))); return Pair.of(specId, wrapper.get().set(struct)); } @@ -124,19 +129,26 @@ DeleteFile[] forDataFile(long sequenceNumber, DataFile file) { if (partitionDeletes == null) { matchingDeletes = limitBySequenceNumber(sequenceNumber, globalSeqs, globalDeletes); } else if (globalDeletes == null) { - matchingDeletes = limitBySequenceNumber(sequenceNumber, partitionDeletes.first(), partitionDeletes.second()); + matchingDeletes = + limitBySequenceNumber( + sequenceNumber, partitionDeletes.first(), partitionDeletes.second()); } else { - matchingDeletes = Stream.concat( - limitBySequenceNumber(sequenceNumber, globalSeqs, globalDeletes), - limitBySequenceNumber(sequenceNumber, partitionDeletes.first(), partitionDeletes.second())); + matchingDeletes = + Stream.concat( + limitBySequenceNumber(sequenceNumber, globalSeqs, globalDeletes), + limitBySequenceNumber( + sequenceNumber, partitionDeletes.first(), partitionDeletes.second())); } return matchingDeletes - .filter(deleteFile -> canContainDeletesForFile(file, deleteFile, specsById.get(file.specId()).schema())) + .filter( + deleteFile -> + canContainDeletesForFile(file, deleteFile, specsById.get(file.specId()).schema())) .toArray(DeleteFile[]::new); } - private static boolean canContainDeletesForFile(DataFile dataFile, DeleteFile deleteFile, Schema schema) { + private static boolean canContainDeletesForFile( + DataFile dataFile, DeleteFile deleteFile, Schema schema) { switch (deleteFile.content()) { case POSITION_DELETES: return canContainPosDeletesForFile(dataFile, deleteFile); @@ -160,12 +172,14 @@ private static boolean canContainPosDeletesForFile(DataFile dataFile, DeleteFile int pathId = MetadataColumns.DELETE_FILE_PATH.fieldId(); Comparator comparator = Comparators.charSequences(); ByteBuffer lower = lowers.get(pathId); - if (lower != null && comparator.compare(dataFile.path(), Conversions.fromByteBuffer(pathType, lower)) < 0) { + if (lower != null + && 
comparator.compare(dataFile.path(), Conversions.fromByteBuffer(pathType, lower)) < 0) { return false; } ByteBuffer upper = uppers.get(pathId); - if (upper != null && comparator.compare(dataFile.path(), Conversions.fromByteBuffer(pathType, upper)) > 0) { + if (upper != null + && comparator.compare(dataFile.path(), Conversions.fromByteBuffer(pathType, upper)) > 0) { return false; } @@ -173,11 +187,16 @@ private static boolean canContainPosDeletesForFile(DataFile dataFile, DeleteFile } @SuppressWarnings("checkstyle:CyclomaticComplexity") - private static boolean canContainEqDeletesForFile(DataFile dataFile, DeleteFile deleteFile, Schema schema) { + private static boolean canContainEqDeletesForFile( + DataFile dataFile, DeleteFile deleteFile, Schema schema) { // whether to check data ranges or to assume that the ranges match - // if upper/lower bounds are missing, null counts may still be used to determine delete files can be skipped - boolean checkRanges = dataFile.lowerBounds() != null && dataFile.upperBounds() != null && - deleteFile.lowerBounds() != null && deleteFile.upperBounds() != null; + // if upper/lower bounds are missing, null counts may still be used to determine delete files + // can be skipped + boolean checkRanges = + dataFile.lowerBounds() != null + && dataFile.upperBounds() != null + && deleteFile.lowerBounds() != null + && deleteFile.upperBounds() != null; Map dataLowers = dataFile.lowerBounds(); Map dataUppers = dataFile.upperBounds(); @@ -202,12 +221,15 @@ private static boolean canContainEqDeletesForFile(DataFile dataFile, DeleteFile } if (allNull(dataNullCounts, dataValueCounts, field) && allNonNull(deleteNullCounts, field)) { - // the data file contains only null values for this field, but there are no deletes for null values + // the data file contains only null values for this field, but there are no deletes for null + // values return false; } - if (allNull(deleteNullCounts, deleteValueCounts, field) && allNonNull(dataNullCounts, field)) { - // the delete file removes only null rows with null for this field, but there are no data rows with null + if (allNull(deleteNullCounts, deleteValueCounts, field) + && allNonNull(dataNullCounts, field)) { + // the delete file removes only null rows with null for this field, but there are no data + // rows with null return false; } @@ -225,7 +247,8 @@ private static boolean canContainEqDeletesForFile(DataFile dataFile, DeleteFile continue; } - if (!rangesOverlap(field.type().asPrimitiveType(), dataLower, dataUpper, deleteLower, deleteUpper)) { + if (!rangesOverlap( + field.type().asPrimitiveType(), dataLower, dataUpper, deleteLower, deleteUpper)) { // no values overlap between the data file and the deletes return false; } @@ -234,16 +257,20 @@ private static boolean canContainEqDeletesForFile(DataFile dataFile, DeleteFile return true; } - private static boolean rangesOverlap(Type.PrimitiveType type, - ByteBuffer dataLowerBuf, ByteBuffer dataUpperBuf, - ByteBuffer deleteLowerBuf, ByteBuffer deleteUpperBuf) { + private static boolean rangesOverlap( + Type.PrimitiveType type, + ByteBuffer dataLowerBuf, + ByteBuffer dataUpperBuf, + ByteBuffer deleteLowerBuf, + ByteBuffer deleteUpperBuf) { Comparator comparator = Comparators.forType(type); T dataLower = Conversions.fromByteBuffer(type, dataLowerBuf); T dataUpper = Conversions.fromByteBuffer(type, dataUpperBuf); T deleteLower = Conversions.fromByteBuffer(type, deleteLowerBuf); T deleteUpper = Conversions.fromByteBuffer(type, deleteUpperBuf); - return comparator.compare(deleteLower, 
dataUpper) <= 0 && comparator.compare(dataLower, deleteUpper) <= 0; + return comparator.compare(deleteLower, dataUpper) <= 0 + && comparator.compare(dataLower, deleteUpper) <= 0; } private static boolean allNonNull(Map nullValueCounts, Types.NestedField field) { @@ -263,8 +290,8 @@ private static boolean allNonNull(Map nullValueCounts, Types.Nest return nullValueCount <= 0; } - private static boolean allNull(Map nullValueCounts, Map valueCounts, - Types.NestedField field) { + private static boolean allNull( + Map nullValueCounts, Map valueCounts, Types.NestedField field) { if (field.isRequired()) { return false; } @@ -299,7 +326,8 @@ private static boolean containsNull(Map nullValueCounts, Types.Ne return nullValueCount > 0; } - private static Stream limitBySequenceNumber(long sequenceNumber, long[] seqs, DeleteFile[] files) { + private static Stream limitBySequenceNumber( + long sequenceNumber, long[] seqs, DeleteFile[] files) { if (files == null) { return Stream.empty(); } @@ -377,102 +405,132 @@ Builder planWith(ExecutorService newExecutorService) { } DeleteFileIndex build() { - // read all of the matching delete manifests in parallel and accumulate the matching files in a queue + // read all of the matching delete manifests in parallel and accumulate the matching files in + // a queue Queue> deleteEntries = new ConcurrentLinkedQueue<>(); Tasks.foreach(deleteManifestReaders()) - .stopOnFailure().throwFailureWhenFinished() + .stopOnFailure() + .throwFailureWhenFinished() .executeWith(executorService) - .run(deleteFile -> { - try (CloseableIterable> reader = deleteFile) { - for (ManifestEntry entry : reader) { - if (entry.sequenceNumber() > minSequenceNumber) { - // copy with stats for better filtering against data file stats - deleteEntries.add(entry.copy()); + .run( + deleteFile -> { + try (CloseableIterable> reader = deleteFile) { + for (ManifestEntry entry : reader) { + if (entry.sequenceNumber() > minSequenceNumber) { + // copy with stats for better filtering against data file stats + deleteEntries.add(entry.copy()); + } + } + } catch (IOException e) { + throw new RuntimeIOException(e, "Failed to close"); } - } - } catch (IOException e) { - throw new RuntimeIOException(e, "Failed to close"); - } - }); + }); // build a map from (specId, partition) to delete file entries Map wrappersBySpecId = Maps.newHashMap(); - ListMultimap, ManifestEntry> deleteFilesByPartition = - Multimaps.newListMultimap(Maps.newHashMap(), Lists::newArrayList); + ListMultimap, ManifestEntry> + deleteFilesByPartition = + Multimaps.newListMultimap(Maps.newHashMap(), Lists::newArrayList); for (ManifestEntry entry : deleteEntries) { int specId = entry.file().specId(); - StructLikeWrapper wrapper = wrappersBySpecId - .computeIfAbsent(specId, id -> StructLikeWrapper.forType(specsById.get(id).partitionType())) - .copyFor(entry.file().partition()); + StructLikeWrapper wrapper = + wrappersBySpecId + .computeIfAbsent( + specId, id -> StructLikeWrapper.forType(specsById.get(id).partitionType())) + .copyFor(entry.file().partition()); deleteFilesByPartition.put(Pair.of(specId, wrapper), entry); } - // sort the entries in each map value by sequence number and split into sequence numbers and delete files lists - Map, Pair> sortedDeletesByPartition = Maps.newHashMap(); - // also, separate out equality deletes in an unpartitioned spec that should be applied globally + // sort the entries in each map value by sequence number and split into sequence numbers and + // delete files lists + Map, Pair> sortedDeletesByPartition = + 
Maps.newHashMap(); + // also, separate out equality deletes in an unpartitioned spec that should be applied + // globally long[] globalApplySeqs = null; DeleteFile[] globalDeletes = null; for (Pair partition : deleteFilesByPartition.keySet()) { if (specsById.get(partition.first()).isUnpartitioned()) { - Preconditions.checkState(globalDeletes == null, "Detected multiple partition specs with no partitions"); - - List> eqFilesSortedBySeq = deleteFilesByPartition.get(partition).stream() - .filter(entry -> entry.file().content() == FileContent.EQUALITY_DELETES) - .map(entry -> - // a delete file is indexed by the sequence number it should be applied to - Pair.of(entry.sequenceNumber() - 1, entry.file())) - .sorted(Comparator.comparingLong(Pair::first)) - .collect(Collectors.toList()); + Preconditions.checkState( + globalDeletes == null, "Detected multiple partition specs with no partitions"); + + List> eqFilesSortedBySeq = + deleteFilesByPartition.get(partition).stream() + .filter(entry -> entry.file().content() == FileContent.EQUALITY_DELETES) + .map( + entry -> + // a delete file is indexed by the sequence number it should be applied to + Pair.of(entry.sequenceNumber() - 1, entry.file())) + .sorted(Comparator.comparingLong(Pair::first)) + .collect(Collectors.toList()); globalApplySeqs = eqFilesSortedBySeq.stream().mapToLong(Pair::first).toArray(); globalDeletes = eqFilesSortedBySeq.stream().map(Pair::second).toArray(DeleteFile[]::new); - List> posFilesSortedBySeq = deleteFilesByPartition.get(partition).stream() - .filter(entry -> entry.file().content() == FileContent.POSITION_DELETES) - .map(entry -> Pair.of(entry.sequenceNumber(), entry.file())) - .sorted(Comparator.comparingLong(Pair::first)) - .collect(Collectors.toList()); + List> posFilesSortedBySeq = + deleteFilesByPartition.get(partition).stream() + .filter(entry -> entry.file().content() == FileContent.POSITION_DELETES) + .map(entry -> Pair.of(entry.sequenceNumber(), entry.file())) + .sorted(Comparator.comparingLong(Pair::first)) + .collect(Collectors.toList()); long[] seqs = posFilesSortedBySeq.stream().mapToLong(Pair::first).toArray(); - DeleteFile[] files = posFilesSortedBySeq.stream().map(Pair::second).toArray(DeleteFile[]::new); + DeleteFile[] files = + posFilesSortedBySeq.stream().map(Pair::second).toArray(DeleteFile[]::new); sortedDeletesByPartition.put(partition, Pair.of(seqs, files)); } else { - List> filesSortedBySeq = deleteFilesByPartition.get(partition).stream() - .map(entry -> { - // a delete file is indexed by the sequence number it should be applied to - long applySeq = entry.sequenceNumber() - - (entry.file().content() == FileContent.EQUALITY_DELETES ? 1 : 0); - return Pair.of(applySeq, entry.file()); - }) - .sorted(Comparator.comparingLong(Pair::first)) - .collect(Collectors.toList()); + List> filesSortedBySeq = + deleteFilesByPartition.get(partition).stream() + .map( + entry -> { + // a delete file is indexed by the sequence number it should be applied to + long applySeq = + entry.sequenceNumber() + - (entry.file().content() == FileContent.EQUALITY_DELETES ? 
1 : 0); + return Pair.of(applySeq, entry.file()); + }) + .sorted(Comparator.comparingLong(Pair::first)) + .collect(Collectors.toList()); long[] seqs = filesSortedBySeq.stream().mapToLong(Pair::first).toArray(); - DeleteFile[] files = filesSortedBySeq.stream().map(Pair::second).toArray(DeleteFile[]::new); + DeleteFile[] files = + filesSortedBySeq.stream().map(Pair::second).toArray(DeleteFile[]::new); sortedDeletesByPartition.put(partition, Pair.of(seqs, files)); } } - return new DeleteFileIndex(specsById, globalApplySeqs, globalDeletes, sortedDeletesByPartition); + return new DeleteFileIndex( + specsById, globalApplySeqs, globalDeletes, sortedDeletesByPartition); } private Iterable>> deleteManifestReaders() { - LoadingCache evalCache = specsById == null ? null : - Caffeine.newBuilder().build(specId -> { - PartitionSpec spec = specsById.get(specId); - return ManifestEvaluator.forPartitionFilter( - Expressions.and(partitionFilter, Projections.inclusive(spec, caseSensitive).project(dataFilter)), - spec, caseSensitive); - }); - - Iterable matchingManifests = evalCache == null ? deleteManifests : - Iterables.filter(deleteManifests, manifest -> - manifest.content() == ManifestContent.DELETES && - (manifest.hasAddedFiles() || manifest.hasExistingFiles()) && - evalCache.get(manifest.partitionSpecId()).eval(manifest)); + LoadingCache evalCache = + specsById == null + ? null + : Caffeine.newBuilder() + .build( + specId -> { + PartitionSpec spec = specsById.get(specId); + return ManifestEvaluator.forPartitionFilter( + Expressions.and( + partitionFilter, + Projections.inclusive(spec, caseSensitive).project(dataFilter)), + spec, + caseSensitive); + }); + + Iterable matchingManifests = + evalCache == null + ? deleteManifests + : Iterables.filter( + deleteManifests, + manifest -> + manifest.content() == ManifestContent.DELETES + && (manifest.hasAddedFiles() || manifest.hasExistingFiles()) + && evalCache.get(manifest.partitionSpecId()).eval(manifest)); return Iterables.transform( matchingManifests, @@ -482,8 +540,7 @@ private Iterable>> deleteManifestRea .filterPartitions(partitionFilter) .filterPartitions(partitionSet) .caseSensitive(caseSensitive) - .liveEntries() - ); + .liveEntries()); } } } diff --git a/core/src/main/java/org/apache/iceberg/DeleteFilesTable.java b/core/src/main/java/org/apache/iceberg/DeleteFilesTable.java index f69c7d6508e4..dc89b7387466 100644 --- a/core/src/main/java/org/apache/iceberg/DeleteFilesTable.java +++ b/core/src/main/java/org/apache/iceberg/DeleteFilesTable.java @@ -16,14 +16,11 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg; import org.apache.iceberg.io.CloseableIterable; -/** - * A {@link Table} implementation that exposes a table's delete files as rows. - */ +/** A {@link Table} implementation that exposes a table's delete files as rows. 
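DeleteFileIndex and its builder are package-private, so the sketch below is illustrative only: it assumes it is compiled inside org.apache.iceberg and follows the flow named in the javadoc above (builderFor, then forDataFile). The specsById call is assumed from the builder code shown in the hunk:

```java
// illustrative only: DeleteFileIndex is package-private, so this assumes the
// org.apache.iceberg package
package org.apache.iceberg;

import java.util.Map;
import org.apache.iceberg.io.FileIO;

class DeleteIndexSketch {
  static DeleteFile[] deletesFor(
      FileIO io,
      Iterable<ManifestFile> deleteManifests,
      Map<Integer, PartitionSpec> specsById, // e.g. table.specs()
      long dataSequenceNumber,
      DataFile file) {
    DeleteFileIndex index =
        DeleteFileIndex.builderFor(io, deleteManifests).specsById(specsById).build();
    // returns the delete files that must be applied to rows of the given data file
    return index.forDataFile(dataSequenceNumber, file);
  }
}
```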
*/ public class DeleteFilesTable extends BaseFilesTable { DeleteFilesTable(TableOperations ops, Table table) { @@ -50,12 +47,14 @@ public static class DeleteFilesTableScan extends BaseFilesTableScan { super(ops, table, schema, MetadataTableType.DELETE_FILES); } - DeleteFilesTableScan(TableOperations ops, Table table, Schema schema, TableScanContext context) { + DeleteFilesTableScan( + TableOperations ops, Table table, Schema schema, TableScanContext context) { super(ops, table, schema, MetadataTableType.DELETE_FILES, context); } @Override - protected TableScan newRefinedScan(TableOperations ops, Table table, Schema schema, TableScanContext context) { + protected TableScan newRefinedScan( + TableOperations ops, Table table, Schema schema, TableScanContext context) { return new DeleteFilesTableScan(ops, table, schema, context); } diff --git a/core/src/main/java/org/apache/iceberg/DoubleFieldMetrics.java b/core/src/main/java/org/apache/iceberg/DoubleFieldMetrics.java index 8185f3a9f2a5..85e7a06470b5 100644 --- a/core/src/main/java/org/apache/iceberg/DoubleFieldMetrics.java +++ b/core/src/main/java/org/apache/iceberg/DoubleFieldMetrics.java @@ -16,19 +16,19 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg; /** * Iceberg internally tracked field level metrics, used by Parquet and ORC writers only. - *
<p>
- * Parquet/ORC keeps track of most metrics in file statistics, and only NaN counter is actually tracked by writers. - * This wrapper ensures that metrics not being updated by those writers will not be incorrectly used, by throwing - * exceptions when they are accessed. + * + * <p>
Parquet/ORC keeps track of most metrics in file statistics, and only NaN counter is actually + * tracked by writers. This wrapper ensures that metrics not being updated by those writers will not + * be incorrectly used, by throwing exceptions when they are accessed. */ public class DoubleFieldMetrics extends FieldMetrics { - private DoubleFieldMetrics(int id, long valueCount, long nanValueCount, Double lowerBound, Double upperBound) { + private DoubleFieldMetrics( + int id, long valueCount, long nanValueCount, Double lowerBound, Double upperBound) { super(id, valueCount, 0L, nanValueCount, lowerBound, upperBound); } @@ -59,8 +59,12 @@ public void addValue(double value) { public DoubleFieldMetrics build() { boolean hasBound = valueCount - nanValueCount > 0; - return new DoubleFieldMetrics(id, valueCount, nanValueCount, - hasBound ? lowerBound : null, hasBound ? upperBound : null); + return new DoubleFieldMetrics( + id, + valueCount, + nanValueCount, + hasBound ? lowerBound : null, + hasBound ? upperBound : null); } } } diff --git a/core/src/main/java/org/apache/iceberg/FastAppend.java b/core/src/main/java/org/apache/iceberg/FastAppend.java index a13089ff9ada..febdcee633e8 100644 --- a/core/src/main/java/org/apache/iceberg/FastAppend.java +++ b/core/src/main/java/org/apache/iceberg/FastAppend.java @@ -16,9 +16,11 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg; +import static org.apache.iceberg.TableProperties.SNAPSHOT_ID_INHERITANCE_ENABLED; +import static org.apache.iceberg.TableProperties.SNAPSHOT_ID_INHERITANCE_ENABLED_DEFAULT; + import java.io.IOException; import java.util.List; import java.util.Map; @@ -32,13 +34,11 @@ import org.apache.iceberg.relocated.com.google.common.collect.Iterables; import org.apache.iceberg.relocated.com.google.common.collect.Lists; -import static org.apache.iceberg.TableProperties.SNAPSHOT_ID_INHERITANCE_ENABLED; -import static org.apache.iceberg.TableProperties.SNAPSHOT_ID_INHERITANCE_ENABLED_DEFAULT; - /** * {@link AppendFiles Append} implementation that adds a new manifest file for the write. - *
<p>
- * This implementation will attempt to commit 5 times before throwing {@link CommitFailedException}. + * + * <p>
This implementation will attempt to commit 5 times before throwing {@link + * CommitFailedException}. */ class FastAppend extends SnapshotProducer implements AppendFiles { private final String tableName; @@ -57,8 +57,10 @@ class FastAppend extends SnapshotProducer implements AppendFiles { this.tableName = tableName; this.ops = ops; this.spec = ops.current().spec(); - this.snapshotIdInheritanceEnabled = ops.current() - .propertyAsBoolean(SNAPSHOT_ID_INHERITANCE_ENABLED, SNAPSHOT_ID_INHERITANCE_ENABLED_DEFAULT); + this.snapshotIdInheritanceEnabled = + ops.current() + .propertyAsBoolean( + SNAPSHOT_ID_INHERITANCE_ENABLED, SNAPSHOT_ID_INHERITANCE_ENABLED_DEFAULT); } @Override @@ -79,8 +81,11 @@ protected String operation() { @Override protected Map summary() { - summaryBuilder.setPartitionSummaryLimit(ops.current().propertyAsInt( - TableProperties.WRITE_PARTITION_SUMMARY_LIMIT, TableProperties.WRITE_PARTITION_SUMMARY_LIMIT_DEFAULT)); + summaryBuilder.setPartitionSummaryLimit( + ops.current() + .propertyAsInt( + TableProperties.WRITE_PARTITION_SUMMARY_LIMIT, + TableProperties.WRITE_PARTITION_SUMMARY_LIMIT_DEFAULT)); return summaryBuilder.build(); } @@ -94,13 +99,15 @@ public FastAppend appendFile(DataFile file) { @Override public FastAppend appendManifest(ManifestFile manifest) { - Preconditions.checkArgument(!manifest.hasExistingFiles(), "Cannot append manifest with existing files"); - Preconditions.checkArgument(!manifest.hasDeletedFiles(), "Cannot append manifest with deleted files"); + Preconditions.checkArgument( + !manifest.hasExistingFiles(), "Cannot append manifest with existing files"); + Preconditions.checkArgument( + !manifest.hasDeletedFiles(), "Cannot append manifest with deleted files"); Preconditions.checkArgument( manifest.snapshotId() == null || manifest.snapshotId() == -1, "Snapshot id must be assigned during commit"); - Preconditions.checkArgument(manifest.sequenceNumber() == -1, - "Sequence number must be assigned during commit"); + Preconditions.checkArgument( + manifest.sequenceNumber() == -1, "Sequence number must be assigned during commit"); if (snapshotIdInheritanceEnabled && manifest.snapshotId() == null) { summaryBuilder.addedManifest(manifest); @@ -119,7 +126,12 @@ private ManifestFile copyManifest(ManifestFile manifest) { InputFile toCopy = ops.io().newInputFile(manifest.path()); OutputFile newManifestPath = newManifestOutput(); return ManifestFiles.copyAppendManifest( - current.formatVersion(), toCopy, current.specsById(), newManifestPath, snapshotId(), summaryBuilder); + current.formatVersion(), + toCopy, + current.specsById(), + newManifestPath, + snapshotId(), + summaryBuilder); } @Override @@ -135,9 +147,10 @@ public List apply(TableMetadata base) { throw new RuntimeIOException(e, "Failed to write manifest"); } - Iterable appendManifestsWithMetadata = Iterables.transform( - Iterables.concat(appendManifests, rewrittenAppendManifests), - manifest -> GenericManifestFile.copyOf(manifest).withSnapshotId(snapshotId()).build()); + Iterable appendManifestsWithMetadata = + Iterables.transform( + Iterables.concat(appendManifests, rewrittenAppendManifests), + manifest -> GenericManifestFile.copyOf(manifest).withSnapshotId(snapshotId()).build()); Iterables.addAll(newManifests, appendManifestsWithMetadata); if (base.currentSnapshot() != null) { @@ -153,11 +166,7 @@ public Object updateEvent() { Snapshot snapshot = ops.current().snapshot(snapshotId); long sequenceNumber = snapshot.sequenceNumber(); return new CreateSnapshotEvent( - tableName, - operation(), - snapshotId, - 
sequenceNumber, - snapshot.summary()); + tableName, operation(), snapshotId, sequenceNumber, snapshot.summary()); } @Override diff --git a/core/src/main/java/org/apache/iceberg/FieldMetrics.java b/core/src/main/java/org/apache/iceberg/FieldMetrics.java index effcb78cdf58..1192751ca002 100644 --- a/core/src/main/java/org/apache/iceberg/FieldMetrics.java +++ b/core/src/main/java/org/apache/iceberg/FieldMetrics.java @@ -16,13 +16,9 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg; - -/** - * Iceberg internally tracked field level metrics. - */ +/** Iceberg internally tracked field level metrics. */ public class FieldMetrics { private final int id; private final long valueCount; @@ -31,12 +27,13 @@ public class FieldMetrics { private final T lowerBound; private final T upperBound; - public FieldMetrics(int id, - long valueCount, - long nullValueCount, - long nanValueCount, - T lowerBound, - T upperBound) { + public FieldMetrics( + int id, + long valueCount, + long nullValueCount, + long nanValueCount, + T lowerBound, + T upperBound) { this.id = id; this.valueCount = valueCount; this.nullValueCount = nullValueCount; @@ -45,51 +42,40 @@ public FieldMetrics(int id, this.upperBound = upperBound; } - /** - * Returns the id of the field that the metrics within this class are associated with. - */ + /** Returns the id of the field that the metrics within this class are associated with. */ public int id() { return id; } - /** - * Returns the number of all values, including nulls, NaN and repeated, for the given field. - */ + /** Returns the number of all values, including nulls, NaN and repeated, for the given field. */ public long valueCount() { return valueCount; } - /** - * Returns the number of null values for this field. - */ + /** Returns the number of null values for this field. */ public long nullValueCount() { return nullValueCount; } /** - * Returns the number of NaN values for this field. Will only be non-0 if this field is a double or float field. + * Returns the number of NaN values for this field. Will only be non-0 if this field is a double + * or float field. */ public long nanValueCount() { return nanValueCount; } - /** - * Returns the lower bound value of this field. - */ + /** Returns the lower bound value of this field. */ public T lowerBound() { return lowerBound; } - /** - * Returns the upper bound value of this field. - */ + /** Returns the upper bound value of this field. */ public T upperBound() { return upperBound; } - /** - * Returns if the metrics has bounds (i.e. there is at least non-null value for this field) - */ + /** Returns if the metrics has bounds (i.e. there is at least non-null value for this field) */ public boolean hasBounds() { return upperBound != null; } diff --git a/core/src/main/java/org/apache/iceberg/FileMetadata.java b/core/src/main/java/org/apache/iceberg/FileMetadata.java index 84971b40d970..7d300dc3d879 100644 --- a/core/src/main/java/org/apache/iceberg/FileMetadata.java +++ b/core/src/main/java/org/apache/iceberg/FileMetadata.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
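FastAppend in the hunk above is package-private; it backs Table.newFastAppend(), so usage goes through the public AppendFiles API. A minimal sketch (the DataFile is assumed to be already built, for example with DataFiles.builder as sketched earlier):

```java
import org.apache.iceberg.DataFile;
import org.apache.iceberg.Table;

public class FastAppendExample {
  public static void append(Table table, DataFile dataFile) {
    // writes a new manifest for the appended files and retries the commit a limited
    // number of times (5 per the javadoc above) before throwing CommitFailedException
    table.newFastAppend().appendFile(dataFile).commit();
  }
}
```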
*/ - package org.apache.iceberg; import java.nio.ByteBuffer; @@ -31,8 +30,7 @@ import org.apache.iceberg.util.ByteBuffers; public class FileMetadata { - private FileMetadata() { - } + private FileMetadata() {} public static Builder deleteFileBuilder(PartitionSpec spec) { return new Builder(spec); @@ -86,7 +84,8 @@ public void clear() { public Builder copy(DeleteFile toCopy) { if (isPartitioned) { - Preconditions.checkState(specId == toCopy.specId(), "Cannot copy a DeleteFile with a different spec"); + Preconditions.checkState( + specId == toCopy.specId(), "Cannot copy a DeleteFile with a different spec"); this.partitionData = DataFiles.copyPartitionData(spec, toCopy.partition(), partitionData); } this.content = toCopy.content(); @@ -100,8 +99,8 @@ public Builder copy(DeleteFile toCopy) { this.nanValueCounts = toCopy.nanValueCounts(); this.lowerBounds = toCopy.lowerBounds(); this.upperBounds = toCopy.upperBounds(); - this.keyMetadata = toCopy.keyMetadata() == null ? null - : ByteBuffers.copy(toCopy.keyMetadata()); + this.keyMetadata = + toCopy.keyMetadata() == null ? null : ByteBuffers.copy(toCopy.keyMetadata()); this.sortOrderId = toCopy.sortOrderId(); return this; } @@ -171,7 +170,8 @@ public Builder withFileSizeInBytes(long newFileSizeInBytes) { } public Builder withPartitionPath(String newPartitionPath) { - Preconditions.checkArgument(isPartitioned || newPartitionPath.isEmpty(), + Preconditions.checkArgument( + isPartitioned || newPartitionPath.isEmpty(), "Cannot add partition data for an unpartitioned table"); if (!newPartitionPath.isEmpty()) { this.partitionData = DataFiles.fillFromPath(spec, newPartitionPath, partitionData); @@ -219,8 +219,8 @@ public DeleteFile build() { switch (content) { case POSITION_DELETES: - Preconditions.checkArgument(sortOrderId == null, - "Position delete file should not have sort order"); + Preconditions.checkArgument( + sortOrderId == null, "Position delete file should not have sort order"); break; case EQUALITY_DELETES: if (sortOrderId == null) { @@ -232,10 +232,23 @@ public DeleteFile build() { } return new GenericDeleteFile( - specId, content, filePath, format, isPartitioned ? DataFiles.copy(spec, partitionData) : null, - fileSizeInBytes, new Metrics( - recordCount, columnSizes, valueCounts, nullValueCounts, nanValueCounts, lowerBounds, upperBounds), - equalityFieldIds, sortOrderId, keyMetadata); + specId, + content, + filePath, + format, + isPartitioned ? DataFiles.copy(spec, partitionData) : null, + fileSizeInBytes, + new Metrics( + recordCount, + columnSizes, + valueCounts, + nullValueCounts, + nanValueCounts, + lowerBounds, + upperBounds), + equalityFieldIds, + sortOrderId, + keyMetadata); } } } diff --git a/core/src/main/java/org/apache/iceberg/FilesTable.java b/core/src/main/java/org/apache/iceberg/FilesTable.java index f2531915077d..53efb3637d02 100644 --- a/core/src/main/java/org/apache/iceberg/FilesTable.java +++ b/core/src/main/java/org/apache/iceberg/FilesTable.java @@ -16,14 +16,11 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg; import org.apache.iceberg.io.CloseableIterable; -/** - * A {@link Table} implementation that exposes a table's files as rows. - */ +/** A {@link Table} implementation that exposes a table's files as rows. 
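FileMetadata.deleteFileBuilder above is the DeleteFile counterpart of DataFiles.builder; note the check that position deletes must not carry a sort order. A hedged sketch building a position-delete description (path, size, counts and partition path are placeholders):

```java
import org.apache.iceberg.DeleteFile;
import org.apache.iceberg.FileMetadata;
import org.apache.iceberg.PartitionSpec;

public class DeleteFileExample {
  public static DeleteFile describePositionDeletes(PartitionSpec spec) {
    return FileMetadata.deleteFileBuilder(spec)
        .ofPositionDeletes() // position deletes: no sort order, per the check above
        .withPath("s3://bucket/deletes/category=books/pos-deletes-0001.parquet") // placeholder
        .withFileSizeInBytes(2048L)
        .withPartitionPath("category=books")
        .withRecordCount(10)
        .build();
  }
}
```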
*/ public class FilesTable extends BaseFilesTable { FilesTable(TableOperations ops, Table table) { @@ -55,7 +52,8 @@ public static class FilesTableScan extends BaseFilesTableScan { } @Override - protected TableScan newRefinedScan(TableOperations ops, Table table, Schema schema, TableScanContext context) { + protected TableScan newRefinedScan( + TableOperations ops, Table table, Schema schema, TableScanContext context) { return new FilesTableScan(ops, table, schema, context); } diff --git a/core/src/main/java/org/apache/iceberg/FindFiles.java b/core/src/main/java/org/apache/iceberg/FindFiles.java index 5a62216aac1e..2cd944d922c7 100644 --- a/core/src/main/java/org/apache/iceberg/FindFiles.java +++ b/core/src/main/java/org/apache/iceberg/FindFiles.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg; import java.util.Arrays; @@ -28,8 +27,7 @@ import org.apache.iceberg.util.DateTimeUtil; public class FindFiles { - private FindFiles() { - } + private FindFiles() {} public static Builder in(Table table) { return new Builder(table); @@ -72,10 +70,12 @@ public Builder includeColumnStats() { * @return this for method chaining */ public Builder inSnapshot(long findSnapshotId) { - Preconditions.checkArgument(this.snapshotId == null, - "Cannot set snapshot multiple times, already set to id=%s", findSnapshotId); - Preconditions.checkArgument(table.snapshot(findSnapshotId) != null, - "Cannot find snapshot for id=%s", findSnapshotId); + Preconditions.checkArgument( + this.snapshotId == null, + "Cannot set snapshot multiple times, already set to id=%s", + findSnapshotId); + Preconditions.checkArgument( + table.snapshot(findSnapshotId) != null, "Cannot find snapshot for id=%s", findSnapshotId); this.snapshotId = findSnapshotId; return this; } @@ -87,8 +87,10 @@ public Builder inSnapshot(long findSnapshotId) { * @return this for method chaining */ public Builder asOfTime(long timestampMillis) { - Preconditions.checkArgument(this.snapshotId == null, - "Cannot set snapshot multiple times, already set to id=%s", snapshotId); + Preconditions.checkArgument( + this.snapshotId == null, + "Cannot set snapshot multiple times, already set to id=%s", + snapshotId); Long lastSnapshotId = null; for (HistoryEntry logEntry : ops.current().snapshotLog()) { @@ -102,7 +104,8 @@ public Builder asOfTime(long timestampMillis) { // the snapshot ID could be null if no entries were older than the requested time. in that // case, there is no valid snapshot to read. - Preconditions.checkArgument(lastSnapshotId != null, + Preconditions.checkArgument( + lastSnapshotId != null, "Cannot find a snapshot older than %s", DateTimeUtil.formatTimestampMillis(timestampMillis)); return inSnapshot(lastSnapshotId); @@ -161,17 +164,19 @@ public Builder inPartitions(PartitionSpec spec, StructLike... 
partitions) { * @return this for method chaining */ public Builder inPartitions(PartitionSpec spec, List partitions) { - Preconditions.checkArgument(spec.equals(ops.current().spec(spec.specId())), - "Partition spec does not belong to table: %s", table); + Preconditions.checkArgument( + spec.equals(ops.current().spec(spec.specId())), + "Partition spec does not belong to table: %s", + table); Expression partitionSetFilter = Expressions.alwaysFalse(); for (StructLike partitionData : partitions) { Expression partFilter = Expressions.alwaysTrue(); for (int i = 0; i < spec.fields().size(); i += 1) { PartitionField field = spec.fields().get(i); - partFilter = Expressions.and( - partFilter, - Expressions.equal(field.name(), partitionData.get(i, Object.class))); + partFilter = + Expressions.and( + partFilter, Expressions.equal(field.name(), partitionData.get(i, Object.class))); } partitionSetFilter = Expressions.or(partitionSetFilter, partFilter); } @@ -185,12 +190,10 @@ public Builder inPartitions(PartitionSpec spec, List partitions) { return this; } - /** - * Returns all files in the table that match all of the filters. - */ + /** Returns all files in the table that match all of the filters. */ public CloseableIterable collect() { - Snapshot snapshot = snapshotId != null ? - ops.current().snapshot(snapshotId) : ops.current().currentSnapshot(); + Snapshot snapshot = + snapshotId != null ? ops.current().snapshot(snapshotId) : ops.current().currentSnapshot(); // snapshot could be null when the table just gets created if (snapshot == null) { @@ -198,14 +201,15 @@ public CloseableIterable collect() { } // when snapshot is not null - CloseableIterable> entries = new ManifestGroup(ops.io(), snapshot.dataManifests(ops.io())) - .specsById(ops.current().specsById()) - .filterData(rowFilter) - .filterFiles(fileFilter) - .filterPartitions(partitionFilter) - .ignoreDeleted() - .caseSensitive(caseSensitive) - .entries(); + CloseableIterable> entries = + new ManifestGroup(ops.io(), snapshot.dataManifests(ops.io())) + .specsById(ops.current().specsById()) + .filterData(rowFilter) + .filterFiles(fileFilter) + .filterPartitions(partitionFilter) + .ignoreDeleted() + .caseSensitive(caseSensitive) + .entries(); return CloseableIterable.transform(entries, entry -> entry.file().copy(includeColumnStats)); } diff --git a/core/src/main/java/org/apache/iceberg/FixedSizeSplitScanTaskIterator.java b/core/src/main/java/org/apache/iceberg/FixedSizeSplitScanTaskIterator.java index 11ff458ac345..d59a760a9f25 100644 --- a/core/src/main/java/org/apache/iceberg/FixedSizeSplitScanTaskIterator.java +++ b/core/src/main/java/org/apache/iceberg/FixedSizeSplitScanTaskIterator.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
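As a usage note for the FindFiles builder above, the methods shown in this hunk are typically chained in a single expression; a small sketch, assuming a loaded Table and a known snapshot id:

import java.io.IOException;
import org.apache.iceberg.DataFile;
import org.apache.iceberg.FindFiles;
import org.apache.iceberg.Table;
import org.apache.iceberg.io.CloseableIterable;

public class FindFilesExample {
  // prints the data files reachable from the given snapshot, including column stats
  static void printDataFiles(Table table, long snapshotId) throws IOException {
    try (CloseableIterable<DataFile> files =
        FindFiles.in(table).inSnapshot(snapshotId).includeColumnStats().collect()) {
      files.forEach(file -> System.out.println(file.path()));
    }
  }
}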
*/ - package org.apache.iceberg; /** @@ -31,8 +30,11 @@ class FixedSizeSplitScanTaskIterator implements SplitScanTas private long offset; private long remainingLength; - FixedSizeSplitScanTaskIterator(T parentTask, long parentTaskLength, long splitSize, - SplitScanTaskCreator splitTaskCreator) { + FixedSizeSplitScanTaskIterator( + T parentTask, + long parentTaskLength, + long splitSize, + SplitScanTaskCreator splitTaskCreator) { this.parentTask = parentTask; this.splitSize = splitSize; this.splitTaskCreator = splitTaskCreator; diff --git a/core/src/main/java/org/apache/iceberg/FloatFieldMetrics.java b/core/src/main/java/org/apache/iceberg/FloatFieldMetrics.java index 5854b0c57393..4c03878b53f2 100644 --- a/core/src/main/java/org/apache/iceberg/FloatFieldMetrics.java +++ b/core/src/main/java/org/apache/iceberg/FloatFieldMetrics.java @@ -16,19 +16,19 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg; /** * Iceberg internally tracked field level metrics, used by Parquet and ORC writers only. - *
<p>
- * Parquet/ORC keeps track of most metrics in file statistics, and only NaN counter is actually tracked by writers. - * This wrapper ensures that metrics not being updated by those writers will not be incorrectly used, by throwing - * exceptions when they are accessed. + * + *
<p>
Parquet/ORC keeps track of most metrics in file statistics, and only NaN counter is actually + * tracked by writers. This wrapper ensures that metrics not being updated by those writers will not + * be incorrectly used, by throwing exceptions when they are accessed. */ public class FloatFieldMetrics extends FieldMetrics { - private FloatFieldMetrics(int id, long valueCount, long nanValueCount, Float lowerBound, Float upperBound) { + private FloatFieldMetrics( + int id, long valueCount, long nanValueCount, Float lowerBound, Float upperBound) { super(id, valueCount, 0L, nanValueCount, lowerBound, upperBound); } @@ -63,8 +63,12 @@ public void addValue(float value) { public FloatFieldMetrics build() { boolean hasBound = valueCount - nanValueCount > 0; - return new FloatFieldMetrics(id, valueCount, nanValueCount, - hasBound ? lowerBound : null, hasBound ? upperBound : null); + return new FloatFieldMetrics( + id, + valueCount, + nanValueCount, + hasBound ? lowerBound : null, + hasBound ? upperBound : null); } } } diff --git a/core/src/main/java/org/apache/iceberg/GenericDataFile.java b/core/src/main/java/org/apache/iceberg/GenericDataFile.java index 40785664d8f5..34c65e669fb2 100644 --- a/core/src/main/java/org/apache/iceberg/GenericDataFile.java +++ b/core/src/main/java/org/apache/iceberg/GenericDataFile.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg; import java.nio.ByteBuffer; @@ -27,19 +26,39 @@ import org.apache.iceberg.types.Types; class GenericDataFile extends BaseFile implements DataFile { - /** - * Used by Avro reflection to instantiate this class when reading manifest files. - */ + /** Used by Avro reflection to instantiate this class when reading manifest files. */ GenericDataFile(Schema avroSchema) { super(avroSchema); } - GenericDataFile(int specId, String filePath, FileFormat format, PartitionData partition, - long fileSizeInBytes, Metrics metrics, - ByteBuffer keyMetadata, List splitOffsets, Integer sortOrderId) { - super(specId, FileContent.DATA, filePath, format, partition, fileSizeInBytes, metrics.recordCount(), - metrics.columnSizes(), metrics.valueCounts(), metrics.nullValueCounts(), metrics.nanValueCounts(), - metrics.lowerBounds(), metrics.upperBounds(), splitOffsets, null, sortOrderId, keyMetadata); + GenericDataFile( + int specId, + String filePath, + FileFormat format, + PartitionData partition, + long fileSizeInBytes, + Metrics metrics, + ByteBuffer keyMetadata, + List splitOffsets, + Integer sortOrderId) { + super( + specId, + FileContent.DATA, + filePath, + format, + partition, + fileSizeInBytes, + metrics.recordCount(), + metrics.columnSizes(), + metrics.valueCounts(), + metrics.nullValueCounts(), + metrics.nanValueCounts(), + metrics.lowerBounds(), + metrics.upperBounds(), + splitOffsets, + null, + sortOrderId, + keyMetadata); } /** @@ -52,11 +71,8 @@ private GenericDataFile(GenericDataFile toCopy, boolean fullCopy) { super(toCopy, fullCopy); } - /** - * Constructor for Java serialization. - */ - GenericDataFile() { - } + /** Constructor for Java serialization. 
*/ + GenericDataFile() {} @Override public DataFile copyWithoutStats() { @@ -71,8 +87,10 @@ public DataFile copy() { @Override protected Schema getAvroSchema(Types.StructType partitionStruct) { Types.StructType type = DataFile.getType(partitionStruct); - return AvroSchemaUtil.convert(type, ImmutableMap.of( - type, GenericDataFile.class.getName(), - partitionStruct, PartitionData.class.getName())); + return AvroSchemaUtil.convert( + type, + ImmutableMap.of( + type, GenericDataFile.class.getName(), + partitionStruct, PartitionData.class.getName())); } } diff --git a/core/src/main/java/org/apache/iceberg/GenericDeleteFile.java b/core/src/main/java/org/apache/iceberg/GenericDeleteFile.java index 2cd5f33d29ac..1b4effba642c 100644 --- a/core/src/main/java/org/apache/iceberg/GenericDeleteFile.java +++ b/core/src/main/java/org/apache/iceberg/GenericDeleteFile.java @@ -16,10 +16,8 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg; - import java.nio.ByteBuffer; import org.apache.avro.Schema; import org.apache.iceberg.avro.AvroSchemaUtil; @@ -27,19 +25,40 @@ import org.apache.iceberg.types.Types; class GenericDeleteFile extends BaseFile implements DeleteFile { - /** - * Used by Avro reflection to instantiate this class when reading manifest files. - */ + /** Used by Avro reflection to instantiate this class when reading manifest files. */ GenericDeleteFile(Schema avroSchema) { super(avroSchema); } - GenericDeleteFile(int specId, FileContent content, String filePath, FileFormat format, PartitionData partition, - long fileSizeInBytes, Metrics metrics, int[] equalityFieldIds, - Integer sortOrderId, ByteBuffer keyMetadata) { - super(specId, content, filePath, format, partition, fileSizeInBytes, metrics.recordCount(), - metrics.columnSizes(), metrics.valueCounts(), metrics.nullValueCounts(), metrics.nanValueCounts(), - metrics.lowerBounds(), metrics.upperBounds(), null, equalityFieldIds, sortOrderId, keyMetadata); + GenericDeleteFile( + int specId, + FileContent content, + String filePath, + FileFormat format, + PartitionData partition, + long fileSizeInBytes, + Metrics metrics, + int[] equalityFieldIds, + Integer sortOrderId, + ByteBuffer keyMetadata) { + super( + specId, + content, + filePath, + format, + partition, + fileSizeInBytes, + metrics.recordCount(), + metrics.columnSizes(), + metrics.valueCounts(), + metrics.nullValueCounts(), + metrics.nanValueCounts(), + metrics.lowerBounds(), + metrics.upperBounds(), + null, + equalityFieldIds, + sortOrderId, + keyMetadata); } /** @@ -52,11 +71,8 @@ private GenericDeleteFile(GenericDeleteFile toCopy, boolean fullCopy) { super(toCopy, fullCopy); } - /** - * Constructor for Java serialization. - */ - GenericDeleteFile() { - } + /** Constructor for Java serialization. 
*/ + GenericDeleteFile() {} @Override public DeleteFile copyWithoutStats() { @@ -71,8 +87,10 @@ public DeleteFile copy() { @Override protected Schema getAvroSchema(Types.StructType partitionStruct) { Types.StructType type = DataFile.getType(partitionStruct); - return AvroSchemaUtil.convert(type, ImmutableMap.of( - type, GenericDeleteFile.class.getName(), - partitionStruct, PartitionData.class.getName())); + return AvroSchemaUtil.convert( + type, + ImmutableMap.of( + type, GenericDeleteFile.class.getName(), + partitionStruct, PartitionData.class.getName())); } } diff --git a/core/src/main/java/org/apache/iceberg/GenericManifestEntry.java b/core/src/main/java/org/apache/iceberg/GenericManifestEntry.java index 83fa1610435b..8e0a54093ce2 100644 --- a/core/src/main/java/org/apache/iceberg/GenericManifestEntry.java +++ b/core/src/main/java/org/apache/iceberg/GenericManifestEntry.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg; import org.apache.avro.generic.IndexedRecord; @@ -77,17 +76,13 @@ ManifestEntry wrapDelete(Long newSnapshotId, F newFile) { return this; } - /** - * @return the status of the file, whether EXISTING, ADDED, or DELETED - */ + /** @return the status of the file, whether EXISTING, ADDED, or DELETED */ @Override public Status status() { return status; } - /** - * @return id of the snapshot in which the file was added to the table - */ + /** @return id of the snapshot in which the file was added to the table */ @Override public Long snapshotId() { return snapshotId; @@ -98,9 +93,7 @@ public Long sequenceNumber() { return sequenceNumber; } - /** - * @return a file - */ + /** @return a file */ @Override public F file() { return file; diff --git a/core/src/main/java/org/apache/iceberg/GenericManifestFile.java b/core/src/main/java/org/apache/iceberg/GenericManifestFile.java index 7fff183aa046..d6b7636d3a55 100644 --- a/core/src/main/java/org/apache/iceberg/GenericManifestFile.java +++ b/core/src/main/java/org/apache/iceberg/GenericManifestFile.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg; import java.io.Serializable; @@ -38,8 +37,8 @@ public class GenericManifestFile implements ManifestFile, StructLike, IndexedRecord, SchemaConstructable, Serializable { - private static final Schema AVRO_SCHEMA = AvroSchemaUtil.convert( - ManifestFile.schema(), "manifest_file"); + private static final Schema AVRO_SCHEMA = + AvroSchemaUtil.convert(ManifestFile.schema(), "manifest_file"); private transient Schema avroSchema; // not final for Java serialization private int[] fromProjectionPos; @@ -62,9 +61,7 @@ public class GenericManifestFile private PartitionFieldSummary[] partitions = null; private byte[] keyMetadata = null; - /** - * Used by Avro reflection to instantiate this class when reading manifest files. - */ + /** Used by Avro reflection to instantiate this class when reading manifest files. 
*/ public GenericManifestFile(Schema avroSchema) { this.avroSchema = avroSchema; @@ -107,11 +104,22 @@ public GenericManifestFile(Schema avroSchema) { this.keyMetadata = null; } - public GenericManifestFile(String path, long length, int specId, ManifestContent content, - long sequenceNumber, long minSequenceNumber, Long snapshotId, - int addedFilesCount, long addedRowsCount, int existingFilesCount, - long existingRowsCount, int deletedFilesCount, long deletedRowsCount, - List partitions, ByteBuffer keyMetadata) { + public GenericManifestFile( + String path, + long length, + int specId, + ManifestContent content, + long sequenceNumber, + long minSequenceNumber, + Long snapshotId, + int addedFilesCount, + long addedRowsCount, + int existingFilesCount, + long existingRowsCount, + int deletedFilesCount, + long deletedRowsCount, + List partitions, + ByteBuffer keyMetadata) { this.avroSchema = AVRO_SCHEMA; this.manifestPath = path; this.length = length; @@ -152,21 +160,22 @@ private GenericManifestFile(GenericManifestFile toCopy) { this.deletedFilesCount = toCopy.deletedFilesCount; this.deletedRowsCount = toCopy.deletedRowsCount; if (toCopy.partitions != null) { - this.partitions = Stream.of(toCopy.partitions) - .map(PartitionFieldSummary::copy) - .toArray(PartitionFieldSummary[]::new); + this.partitions = + Stream.of(toCopy.partitions) + .map(PartitionFieldSummary::copy) + .toArray(PartitionFieldSummary[]::new); } else { this.partitions = null; } this.fromProjectionPos = toCopy.fromProjectionPos; - this.keyMetadata = toCopy.keyMetadata == null ? null : Arrays.copyOf(toCopy.keyMetadata, toCopy.keyMetadata.length); + this.keyMetadata = + toCopy.keyMetadata == null + ? null + : Arrays.copyOf(toCopy.keyMetadata, toCopy.keyMetadata.length); } - /** - * Constructor for Java serialization. - */ - GenericManifestFile() { - } + /** Constructor for Java serialization. */ + GenericManifestFile() {} @Override public String path() { @@ -329,7 +338,8 @@ public void set(int i, T value) { this.specId = (Integer) value; return; case 3: - this.content = value != null ? ManifestContent.values()[(Integer) value] : ManifestContent.DATA; + this.content = + value != null ? ManifestContent.values()[(Integer) value] : ManifestContent.DATA; return; case 4: this.sequenceNumber = value != null ? (Long) value : 0; @@ -359,8 +369,10 @@ public void set(int i, T value) { this.deletedRowsCount = (Long) value; return; case 13: - this.partitions = value == null ? null : - ((List) value).toArray(new PartitionFieldSummary[0]); + this.partitions = + value == null + ? 
null + : ((List) value).toArray(new PartitionFieldSummary[0]); return; case 14: this.keyMetadata = ByteBuffers.toByteArray((ByteBuffer) value); @@ -433,12 +445,23 @@ private CopyBuilder(ManifestFile toCopy) { if (toCopy instanceof GenericManifestFile) { this.manifestFile = new GenericManifestFile((GenericManifestFile) toCopy); } else { - this.manifestFile = new GenericManifestFile( - toCopy.path(), toCopy.length(), toCopy.partitionSpecId(), toCopy.content(), - toCopy.sequenceNumber(), toCopy.minSequenceNumber(), toCopy.snapshotId(), - toCopy.addedFilesCount(), toCopy.addedRowsCount(), toCopy.existingFilesCount(), - toCopy.existingRowsCount(), toCopy.deletedFilesCount(), toCopy.deletedRowsCount(), - copyList(toCopy.partitions(), PartitionFieldSummary::copy), toCopy.keyMetadata()); + this.manifestFile = + new GenericManifestFile( + toCopy.path(), + toCopy.length(), + toCopy.partitionSpecId(), + toCopy.content(), + toCopy.sequenceNumber(), + toCopy.minSequenceNumber(), + toCopy.snapshotId(), + toCopy.addedFilesCount(), + toCopy.addedRowsCount(), + toCopy.existingFilesCount(), + toCopy.existingRowsCount(), + toCopy.deletedFilesCount(), + toCopy.deletedRowsCount(), + copyList(toCopy.partitions(), PartitionFieldSummary::copy), + toCopy.keyMetadata()); } } diff --git a/core/src/main/java/org/apache/iceberg/GenericPartitionFieldSummary.java b/core/src/main/java/org/apache/iceberg/GenericPartitionFieldSummary.java index 18370e0c8abc..e75f37d2ec12 100644 --- a/core/src/main/java/org/apache/iceberg/GenericPartitionFieldSummary.java +++ b/core/src/main/java/org/apache/iceberg/GenericPartitionFieldSummary.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg; import java.io.Serializable; @@ -46,16 +45,12 @@ public class GenericPartitionFieldSummary private byte[] lowerBound = null; private byte[] upperBound = null; - /** - * Used by Avro reflection to instantiate this class when reading manifest files. - */ + /** Used by Avro reflection to instantiate this class when reading manifest files. */ public GenericPartitionFieldSummary(Schema avroSchema) { this.avroSchema = avroSchema; - List fields = AvroSchemaUtil.convert(avroSchema) - .asNestedType() - .asStructType() - .fields(); + List fields = + AvroSchemaUtil.convert(avroSchema).asNestedType().asStructType().fields(); List allFields = PartitionFieldSummary.getType().fields(); this.fromProjectionPos = new int[fields.size()]; @@ -74,8 +69,8 @@ public GenericPartitionFieldSummary(Schema avroSchema) { } } - public GenericPartitionFieldSummary(boolean containsNull, boolean containsNaN, ByteBuffer lowerBound, - ByteBuffer upperBound) { + public GenericPartitionFieldSummary( + boolean containsNull, boolean containsNaN, ByteBuffer lowerBound, ByteBuffer upperBound) { this.avroSchema = AVRO_SCHEMA; this.containsNull = containsNull; this.containsNaN = containsNaN; @@ -103,16 +98,19 @@ private GenericPartitionFieldSummary(GenericPartitionFieldSummary toCopy) { this.avroSchema = toCopy.avroSchema; this.containsNull = toCopy.containsNull; this.containsNaN = toCopy.containsNaN; - this.lowerBound = toCopy.lowerBound == null ? null : Arrays.copyOf(toCopy.lowerBound, toCopy.lowerBound.length); - this.upperBound = toCopy.upperBound == null ? null : Arrays.copyOf(toCopy.upperBound, toCopy.upperBound.length); + this.lowerBound = + toCopy.lowerBound == null + ? null + : Arrays.copyOf(toCopy.lowerBound, toCopy.lowerBound.length); + this.upperBound = + toCopy.upperBound == null + ? 
null + : Arrays.copyOf(toCopy.upperBound, toCopy.upperBound.length); this.fromProjectionPos = toCopy.fromProjectionPos; } - /** - * Constructor for Java serialization. - */ - GenericPartitionFieldSummary() { - } + /** Constructor for Java serialization. */ + GenericPartitionFieldSummary() {} @Override public boolean containsNull() { diff --git a/core/src/main/java/org/apache/iceberg/HasTableOperations.java b/core/src/main/java/org/apache/iceberg/HasTableOperations.java index d15cbcb93ab9..7777778aa5d7 100644 --- a/core/src/main/java/org/apache/iceberg/HasTableOperations.java +++ b/core/src/main/java/org/apache/iceberg/HasTableOperations.java @@ -16,12 +16,9 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg; -/** - * Used to expose a table's TableOperations. - */ +/** Used to expose a table's TableOperations. */ public interface HasTableOperations { TableOperations operations(); } diff --git a/core/src/main/java/org/apache/iceberg/HistoryTable.java b/core/src/main/java/org/apache/iceberg/HistoryTable.java index 53f47acaccbc..6db7d30ebd1a 100644 --- a/core/src/main/java/org/apache/iceberg/HistoryTable.java +++ b/core/src/main/java/org/apache/iceberg/HistoryTable.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg; import java.util.Map; @@ -30,16 +29,17 @@ /** * A {@link Table} implementation that exposes a table's history as rows. - *
<p>
- * History is based on the table's snapshot log, which logs each update to the table's current snapshot. + * + *
<p>
History is based on the table's snapshot log, which logs each update to the table's current + * snapshot. */ public class HistoryTable extends BaseMetadataTable { - private static final Schema HISTORY_SCHEMA = new Schema( - Types.NestedField.required(1, "made_current_at", Types.TimestampType.withZone()), - Types.NestedField.required(2, "snapshot_id", Types.LongType.get()), - Types.NestedField.optional(3, "parent_id", Types.LongType.get()), - Types.NestedField.required(4, "is_current_ancestor", Types.BooleanType.get()) - ); + private static final Schema HISTORY_SCHEMA = + new Schema( + Types.NestedField.required(1, "made_current_at", Types.TimestampType.withZone()), + Types.NestedField.required(2, "snapshot_id", Types.LongType.get()), + Types.NestedField.optional(3, "parent_id", Types.LongType.get()), + Types.NestedField.required(4, "is_current_ancestor", Types.BooleanType.get())); HistoryTable(TableOperations ops, Table table) { this(ops, table, table.name() + ".history"); @@ -68,9 +68,10 @@ private DataTask task(TableScan scan) { TableOperations ops = operations(); return StaticDataTask.of( ops.io().newInputFile(ops.current().metadataFileLocation()), - schema(), scan.schema(), ops.current().snapshotLog(), - convertHistoryEntryFunc(table()) - ); + schema(), + scan.schema(), + ops.current().snapshotLog(), + convertHistoryEntryFunc(table())); } private class HistoryScan extends StaticTableScan { @@ -79,17 +80,20 @@ private class HistoryScan extends StaticTableScan { } HistoryScan(TableOperations ops, Table table, TableScanContext context) { - super(ops, table, HISTORY_SCHEMA, MetadataTableType.HISTORY, HistoryTable.this::task, context); + super( + ops, table, HISTORY_SCHEMA, MetadataTableType.HISTORY, HistoryTable.this::task, context); } @Override - protected TableScan newRefinedScan(TableOperations ops, Table table, Schema schema, TableScanContext context) { + protected TableScan newRefinedScan( + TableOperations ops, Table table, Schema schema, TableScanContext context) { return new HistoryScan(ops, table, context); } @Override public CloseableIterable planFiles() { - // override planFiles to avoid the check for a current snapshot because this metadata table is for all snapshots + // override planFiles to avoid the check for a current snapshot because this metadata table is + // for all snapshots return CloseableIterable.withNoopClose(HistoryTable.this.task(this)); } } @@ -109,8 +113,7 @@ private static Function convertHistoryEntryFun historyEntry.timestampMillis() * 1000, historyEntry.snapshotId(), snap != null ? snap.parentId() : null, - ancestorIds.contains(snapshotId) - ); + ancestorIds.contains(snapshotId)); }; } } diff --git a/core/src/main/java/org/apache/iceberg/IncrementalDataTableScan.java b/core/src/main/java/org/apache/iceberg/IncrementalDataTableScan.java index 034d217b9e34..66bb42b0b41b 100644 --- a/core/src/main/java/org/apache/iceberg/IncrementalDataTableScan.java +++ b/core/src/main/java/org/apache/iceberg/IncrementalDataTableScan.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg; import java.util.List; @@ -33,37 +32,45 @@ class IncrementalDataTableScan extends DataTableScan { - IncrementalDataTableScan(TableOperations ops, Table table, Schema schema, TableScanContext context) { + IncrementalDataTableScan( + TableOperations ops, Table table, Schema schema, TableScanContext context) { super(ops, table, schema, context.useSnapshotId(null)); validateSnapshotIds(table, context.fromSnapshotId(), context.toSnapshotId()); } @Override public TableScan asOfTime(long timestampMillis) { - throw new UnsupportedOperationException(String.format( - "Cannot scan table as of time %s: configured for incremental data in snapshots (%s, %s]", - timestampMillis, context().fromSnapshotId(), context().toSnapshotId())); + throw new UnsupportedOperationException( + String.format( + "Cannot scan table as of time %s: configured for incremental data in snapshots (%s, %s]", + timestampMillis, context().fromSnapshotId(), context().toSnapshotId())); } @Override public TableScan useSnapshot(long scanSnapshotId) { - throw new UnsupportedOperationException(String.format( - "Cannot scan table using scan snapshot id %s: configured for incremental data in snapshots (%s, %s]", - scanSnapshotId, context().fromSnapshotId(), context().toSnapshotId())); + throw new UnsupportedOperationException( + String.format( + "Cannot scan table using scan snapshot id %s: configured for incremental data in snapshots (%s, %s]", + scanSnapshotId, context().fromSnapshotId(), context().toSnapshotId())); } @Override public TableScan appendsBetween(long fromSnapshotId, long toSnapshotId) { validateSnapshotIdsRefinement(fromSnapshotId, toSnapshotId); - return new IncrementalDataTableScan(tableOps(), table(), schema(), + return new IncrementalDataTableScan( + tableOps(), + table(), + schema(), context().fromSnapshotIdExclusive(fromSnapshotId).toSnapshotId(toSnapshotId)); } @Override public TableScan appendsAfter(long newFromSnapshotId) { final Snapshot currentSnapshot = table().currentSnapshot(); - Preconditions.checkState(currentSnapshot != null, - "Cannot scan appends after %s, there is no current snapshot", newFromSnapshotId); + Preconditions.checkState( + currentSnapshot != null, + "Cannot scan appends after %s, there is no current snapshot", + newFromSnapshotId); return appendsBetween(newFromSnapshotId, currentSnapshot.snapshotId()); } @@ -74,31 +81,34 @@ public CloseableIterable planFiles() { List snapshots = snapshotsWithin(table(), fromSnapshotId, toSnapshotId); Set snapshotIds = Sets.newHashSet(Iterables.transform(snapshots, Snapshot::snapshotId)); - Set manifests = FluentIterable - .from(snapshots) - .transformAndConcat(snapshot -> snapshot.dataManifests(tableOps().io())) - .filter(manifestFile -> snapshotIds.contains(manifestFile.snapshotId())) - .toSet(); - - ManifestGroup manifestGroup = new ManifestGroup(table().io(), manifests) - .caseSensitive(isCaseSensitive()) - .select(colStats() ? 
SCAN_WITH_STATS_COLUMNS : SCAN_COLUMNS) - .filterData(filter()) - .filterManifestEntries( - manifestEntry -> - snapshotIds.contains(manifestEntry.snapshotId()) && - manifestEntry.status() == ManifestEntry.Status.ADDED) - .specsById(table().specs()) - .ignoreDeleted(); + Set manifests = + FluentIterable.from(snapshots) + .transformAndConcat(snapshot -> snapshot.dataManifests(tableOps().io())) + .filter(manifestFile -> snapshotIds.contains(manifestFile.snapshotId())) + .toSet(); + + ManifestGroup manifestGroup = + new ManifestGroup(table().io(), manifests) + .caseSensitive(isCaseSensitive()) + .select(colStats() ? SCAN_WITH_STATS_COLUMNS : SCAN_COLUMNS) + .filterData(filter()) + .filterManifestEntries( + manifestEntry -> + snapshotIds.contains(manifestEntry.snapshotId()) + && manifestEntry.status() == ManifestEntry.Status.ADDED) + .specsById(table().specs()) + .ignoreDeleted(); if (shouldIgnoreResiduals()) { manifestGroup = manifestGroup.ignoreResiduals(); } - Listeners.notifyAll(new IncrementalScanEvent(table().name(), fromSnapshotId, toSnapshotId, - filter(), schema(), false)); + Listeners.notifyAll( + new IncrementalScanEvent( + table().name(), fromSnapshotId, toSnapshotId, filter(), schema(), false)); - if (manifests.size() > 1 && (PLAN_SCANS_WITH_WORKER_POOL || context().planWithCustomizedExecutor())) { + if (manifests.size() > 1 + && (PLAN_SCANS_WITH_WORKER_POOL || context().planWithCustomizedExecutor())) { manifestGroup = manifestGroup.planWith(planExecutor()); } @@ -107,19 +117,23 @@ public CloseableIterable planFiles() { @Override @SuppressWarnings("checkstyle:HiddenField") - protected TableScan newRefinedScan(TableOperations ops, Table table, Schema schema, TableScanContext context) { + protected TableScan newRefinedScan( + TableOperations ops, Table table, Schema schema, TableScanContext context) { return new IncrementalDataTableScan(ops, table, schema, context); } - private static List snapshotsWithin(Table table, long fromSnapshotId, long toSnapshotId) { + private static List snapshotsWithin( + Table table, long fromSnapshotId, long toSnapshotId) { List snapshots = Lists.newArrayList(); - for (Snapshot snapshot : SnapshotUtil.ancestorsBetween(toSnapshotId, fromSnapshotId, table::snapshot)) { + for (Snapshot snapshot : + SnapshotUtil.ancestorsBetween(toSnapshotId, fromSnapshotId, table::snapshot)) { // for now, incremental scan supports only appends if (snapshot.operation().equals(DataOperations.APPEND)) { snapshots.add(snapshot); } else if (snapshot.operation().equals(DataOperations.OVERWRITE)) { throw new UnsupportedOperationException( - String.format("Found %s operation, cannot support incremental data in snapshots (%s, %s]", + String.format( + "Found %s operation, cannot support incremental data in snapshots (%s, %s]", DataOperations.OVERWRITE, fromSnapshotId, toSnapshotId)); } } @@ -127,27 +141,37 @@ private static List snapshotsWithin(Table table, long fromSnapshotId, } private void validateSnapshotIdsRefinement(long newFromSnapshotId, long newToSnapshotId) { - Set snapshotIdsRange = Sets.newHashSet( - SnapshotUtil.ancestorIdsBetween(context().toSnapshotId(), context().fromSnapshotId(), table()::snapshot)); + Set snapshotIdsRange = + Sets.newHashSet( + SnapshotUtil.ancestorIdsBetween( + context().toSnapshotId(), context().fromSnapshotId(), table()::snapshot)); // since snapshotIdsBetween return ids in range (fromSnapshotId, toSnapshotId] snapshotIdsRange.add(context().fromSnapshotId()); Preconditions.checkArgument( snapshotIdsRange.contains(newFromSnapshotId), "from 
snapshot id %s not in existing snapshot ids range (%s, %s]", - newFromSnapshotId, context().fromSnapshotId(), newToSnapshotId); + newFromSnapshotId, + context().fromSnapshotId(), + newToSnapshotId); Preconditions.checkArgument( snapshotIdsRange.contains(newToSnapshotId), "to snapshot id %s not in existing snapshot ids range (%s, %s]", - newToSnapshotId, context().fromSnapshotId(), context().toSnapshotId()); + newToSnapshotId, + context().fromSnapshotId(), + context().toSnapshotId()); } private static void validateSnapshotIds(Table table, long fromSnapshotId, long toSnapshotId) { - Preconditions.checkArgument(fromSnapshotId != toSnapshotId, "from and to snapshot ids cannot be the same"); + Preconditions.checkArgument( + fromSnapshotId != toSnapshotId, "from and to snapshot ids cannot be the same"); Preconditions.checkArgument( table.snapshot(fromSnapshotId) != null, "from snapshot %s does not exist", fromSnapshotId); Preconditions.checkArgument( table.snapshot(toSnapshotId) != null, "to snapshot %s does not exist", toSnapshotId); - Preconditions.checkArgument(SnapshotUtil.isAncestorOf(table, toSnapshotId, fromSnapshotId), - "from snapshot %s is not an ancestor of to snapshot %s", fromSnapshotId, toSnapshotId); + Preconditions.checkArgument( + SnapshotUtil.isAncestorOf(table, toSnapshotId, fromSnapshotId), + "from snapshot %s is not an ancestor of to snapshot %s", + fromSnapshotId, + toSnapshotId); } } diff --git a/core/src/main/java/org/apache/iceberg/IndexedStructLike.java b/core/src/main/java/org/apache/iceberg/IndexedStructLike.java index cf5d0e6a3e3b..ad4c7cabf271 100644 --- a/core/src/main/java/org/apache/iceberg/IndexedStructLike.java +++ b/core/src/main/java/org/apache/iceberg/IndexedStructLike.java @@ -16,14 +16,11 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg; import org.apache.avro.generic.IndexedRecord; -/** - * IndexedRecord implementation to wrap a StructLike for writing to Avro. - */ +/** IndexedRecord implementation to wrap a StructLike for writing to Avro. */ class IndexedStructLike implements StructLike, IndexedRecord { private final org.apache.avro.Schema avroSchema; private StructLike wrapped = null; diff --git a/core/src/main/java/org/apache/iceberg/InheritableMetadata.java b/core/src/main/java/org/apache/iceberg/InheritableMetadata.java index 44e05212af7b..6abe3e9a94a2 100644 --- a/core/src/main/java/org/apache/iceberg/InheritableMetadata.java +++ b/core/src/main/java/org/apache/iceberg/InheritableMetadata.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg; import java.io.Serializable; diff --git a/core/src/main/java/org/apache/iceberg/InheritableMetadataFactory.java b/core/src/main/java/org/apache/iceberg/InheritableMetadataFactory.java index 19c7fb2c4cdf..14ce537e9894 100644 --- a/core/src/main/java/org/apache/iceberg/InheritableMetadataFactory.java +++ b/core/src/main/java/org/apache/iceberg/InheritableMetadataFactory.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
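The incremental scan refined above is normally reached through the public TableScan API; a hedged sketch, assuming toSnapshotId is an append-only descendant of fromSnapshotId in an existing table:

import java.io.IOException;
import org.apache.iceberg.FileScanTask;
import org.apache.iceberg.Table;
import org.apache.iceberg.TableScan;
import org.apache.iceberg.io.CloseableIterable;

public class IncrementalScanExample {
  // plans only the files appended in snapshots (fromSnapshotId, toSnapshotId]
  static void planAppends(Table table, long fromSnapshotId, long toSnapshotId) throws IOException {
    TableScan scan = table.newScan().appendsBetween(fromSnapshotId, toSnapshotId);
    try (CloseableIterable<FileScanTask> tasks = scan.planFiles()) {
      tasks.forEach(task -> System.out.println(task.file().path()));
    }
  }
}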
*/ - package org.apache.iceberg; import org.apache.iceberg.relocated.com.google.common.base.Preconditions; @@ -25,17 +24,18 @@ class InheritableMetadataFactory { private static final InheritableMetadata EMPTY = new EmptyInheritableMetadata(); - private InheritableMetadataFactory() { - } + private InheritableMetadataFactory() {} static InheritableMetadata empty() { return EMPTY; } static InheritableMetadata fromManifest(ManifestFile manifest) { - Preconditions.checkArgument(manifest.snapshotId() != null, + Preconditions.checkArgument( + manifest.snapshotId() != null, "Cannot read from ManifestFile with null (unassigned) snapshot ID"); - return new BaseInheritableMetadata(manifest.partitionSpecId(), manifest.snapshotId(), manifest.sequenceNumber()); + return new BaseInheritableMetadata( + manifest.partitionSpecId(), manifest.snapshotId(), manifest.sequenceNumber()); } static InheritableMetadata forCopy(long snapshotId) { @@ -85,13 +85,13 @@ public > ManifestEntry apply(ManifestEntry manife static class EmptyInheritableMetadata implements InheritableMetadata { - private EmptyInheritableMetadata() { - } + private EmptyInheritableMetadata() {} @Override public > ManifestEntry apply(ManifestEntry manifestEntry) { if (manifestEntry.snapshotId() == null) { - throw new IllegalArgumentException("Entries must have explicit snapshot ids if inherited metadata is empty"); + throw new IllegalArgumentException( + "Entries must have explicit snapshot ids if inherited metadata is empty"); } return manifestEntry; } diff --git a/core/src/main/java/org/apache/iceberg/IsolationLevel.java b/core/src/main/java/org/apache/iceberg/IsolationLevel.java index 444db454cb4d..1f4dd9aadd82 100644 --- a/core/src/main/java/org/apache/iceberg/IsolationLevel.java +++ b/core/src/main/java/org/apache/iceberg/IsolationLevel.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg; import java.util.Locale; @@ -24,21 +23,22 @@ /** * An isolation level in a table. - *
<p>
- * Two isolation levels are supported: serializable and snapshot isolation. Both of them provide + * + *
<p>
Two isolation levels are supported: serializable and snapshot isolation. Both of them provide * a read consistent view of the table to all operations and allow readers to see only already - * committed data. While serializable is the strongest isolation level in databases, - * snapshot isolation is beneficial for environments with many concurrent writers. - *
<p>
- * The serializable isolation level guarantees that an ongoing UPDATE/DELETE/MERGE operation - * fails if a concurrent transaction commits a new file that might contain rows matching - * the condition used in UPDATE/DELETE/MERGE. For example, if there is an ongoing update - * on a subset of rows and a concurrent transaction adds a new file with records - * that potentially match the update condition, the update operation must fail under - * the serializable isolation but can still commit under the snapshot isolation. + * committed data. While serializable is the strongest isolation level in databases, snapshot + * isolation is beneficial for environments with many concurrent writers. + * + *
<p>
The serializable isolation level guarantees that an ongoing UPDATE/DELETE/MERGE operation + * fails if a concurrent transaction commits a new file that might contain rows matching the + * condition used in UPDATE/DELETE/MERGE. For example, if there is an ongoing update on a subset of + * rows and a concurrent transaction adds a new file with records that potentially match the update + * condition, the update operation must fail under the serializable isolation but can still commit + * under the snapshot isolation. */ public enum IsolationLevel { - SERIALIZABLE, SNAPSHOT; + SERIALIZABLE, + SNAPSHOT; public static IsolationLevel fromName(String levelName) { Preconditions.checkArgument(levelName != null, "Level name is null"); diff --git a/core/src/main/java/org/apache/iceberg/LocationProviders.java b/core/src/main/java/org/apache/iceberg/LocationProviders.java index 6a2fccecb9ed..f22060a52a22 100644 --- a/core/src/main/java/org/apache/iceberg/LocationProviders.java +++ b/core/src/main/java/org/apache/iceberg/LocationProviders.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg; import java.util.Map; @@ -32,34 +31,41 @@ public class LocationProviders { - private LocationProviders() { - } + private LocationProviders() {} - public static LocationProvider locationsFor(String inputLocation, Map properties) { + public static LocationProvider locationsFor( + String inputLocation, Map properties) { String location = LocationUtil.stripTrailingSlash(inputLocation); if (properties.containsKey(TableProperties.WRITE_LOCATION_PROVIDER_IMPL)) { String impl = properties.get(TableProperties.WRITE_LOCATION_PROVIDER_IMPL); DynConstructors.Ctor ctor; try { - ctor = DynConstructors.builder(LocationProvider.class) - .impl(impl, String.class, Map.class) - .impl(impl).buildChecked(); // fall back to no-arg constructor + ctor = + DynConstructors.builder(LocationProvider.class) + .impl(impl, String.class, Map.class) + .impl(impl) + .buildChecked(); // fall back to no-arg constructor } catch (NoSuchMethodException e) { - throw new IllegalArgumentException(String.format( - "Unable to find a constructor for implementation %s of %s. " + - "Make sure the implementation is in classpath, and that it either " + - "has a public no-arg constructor or a two-arg constructor " + - "taking in the string base table location and its property string map.", - impl, LocationProvider.class), e); + throw new IllegalArgumentException( + String.format( + "Unable to find a constructor for implementation %s of %s. 
" + + "Make sure the implementation is in classpath, and that it either " + + "has a public no-arg constructor or a two-arg constructor " + + "taking in the string base table location and its property string map.", + impl, LocationProvider.class), + e); } try { return ctor.newInstance(location, properties); } catch (ClassCastException e) { throw new IllegalArgumentException( - String.format("Provided implementation for dynamic instantiation should implement %s.", - LocationProvider.class), e); + String.format( + "Provided implementation for dynamic instantiation should implement %s.", + LocationProvider.class), + e); } - } else if (PropertyUtil.propertyAsBoolean(properties, + } else if (PropertyUtil.propertyAsBoolean( + properties, TableProperties.OBJECT_STORE_ENABLED, TableProperties.OBJECT_STORE_ENABLED_DEFAULT)) { return new ObjectStoreLocationProvider(location, properties); @@ -98,15 +104,17 @@ public String newDataLocation(String filename) { } static class ObjectStoreLocationProvider implements LocationProvider { - private static final Transform HASH_FUNC = Transforms - .bucket(Types.StringType.get(), Integer.MAX_VALUE); + private static final Transform HASH_FUNC = + Transforms.bucket(Types.StringType.get(), Integer.MAX_VALUE); private final String storageLocation; private final String context; ObjectStoreLocationProvider(String tableLocation, Map properties) { - this.storageLocation = LocationUtil.stripTrailingSlash(dataLocation(properties, tableLocation)); - // if the storage location is within the table prefix, don't add table and database name context + this.storageLocation = + LocationUtil.stripTrailingSlash(dataLocation(properties, tableLocation)); + // if the storage location is within the table prefix, don't add table and database name + // context if (storageLocation.startsWith(tableLocation)) { this.context = null; } else { @@ -155,8 +163,7 @@ private static String pathContext(String tableLocation) { } Preconditions.checkState( - !resolvedContext.endsWith("/"), - "Path context must not end with a slash."); + !resolvedContext.endsWith("/"), "Path context must not end with a slash."); return resolvedContext; } diff --git a/core/src/main/java/org/apache/iceberg/ManifestEntriesTable.java b/core/src/main/java/org/apache/iceberg/ManifestEntriesTable.java index 5085dece05dc..51f5026b20c1 100644 --- a/core/src/main/java/org/apache/iceberg/ManifestEntriesTable.java +++ b/core/src/main/java/org/apache/iceberg/ManifestEntriesTable.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg; import java.util.Map; @@ -34,10 +33,11 @@ import org.apache.iceberg.util.StructProjection; /** - * A {@link Table} implementation that exposes a table's manifest entries as rows, for both delete and data files. - *
<p>
- * WARNING: this table exposes internal details, like files that have been deleted. For a table of the live data files, - * use {@link DataFilesTable}. + * A {@link Table} implementation that exposes a table's manifest entries as rows, for both delete + * and data files. + * + *
<p>
WARNING: this table exposes internal details, like files that have been deleted. For a table + * of the live data files, use {@link DataFilesTable}. */ public class ManifestEntriesTable extends BaseMetadataTable { @@ -59,7 +59,8 @@ public Schema schema() { StructType partitionType = Partitioning.partitionType(table()); Schema schema = ManifestEntry.getSchema(partitionType); if (partitionType.fields().size() < 1) { - // avoid returning an empty struct, which is not always supported. instead, drop the partition field (id 102) + // avoid returning an empty struct, which is not always supported. instead, drop the partition + // field (id 102) return TypeUtil.selectNot(schema, Sets.newHashSet(102)); } else { return schema; @@ -77,13 +78,14 @@ private static class EntriesTableScan extends BaseMetadataTableScan { super(ops, table, schema, MetadataTableType.ENTRIES); } - private EntriesTableScan(TableOperations ops, Table table, Schema schema, TableScanContext context) { + private EntriesTableScan( + TableOperations ops, Table table, Schema schema, TableScanContext context) { super(ops, table, schema, MetadataTableType.ENTRIES, context); } @Override - protected TableScan newRefinedScan(TableOperations ops, Table table, Schema schema, - TableScanContext context) { + protected TableScan newRefinedScan( + TableOperations ops, Table table, Schema schema, TableScanContext context) { return new EntriesTableScan(ops, table, schema, context); } @@ -97,8 +99,11 @@ protected CloseableIterable doPlanFiles() { Expression filter = shouldIgnoreResiduals() ? Expressions.alwaysTrue() : filter(); ResidualEvaluator residuals = ResidualEvaluator.unpartitioned(filter); - return CloseableIterable.transform(manifests, manifest -> - new ManifestReadTask(table(), manifest, schema(), schemaString, specString, residuals)); + return CloseableIterable.transform( + manifests, + manifest -> + new ManifestReadTask( + table(), manifest, schema(), schemaString, specString, residuals)); } } @@ -109,8 +114,13 @@ static class ManifestReadTask extends BaseFileScanTask implements DataTask { private final ManifestFile manifest; private final Map specsById; - ManifestReadTask(Table table, ManifestFile manifest, Schema schema, String schemaString, - String specString, ResidualEvaluator residuals) { + ManifestReadTask( + Table table, + ManifestFile manifest, + Schema schema, + String schemaString, + String specString, + ResidualEvaluator residuals) { super(DataFiles.fromManifest(manifest), null, schemaString, specString, residuals); this.schema = schema; this.io = table.io(); @@ -118,7 +128,10 @@ static class ManifestReadTask extends BaseFileScanTask implements DataTask { this.specsById = Maps.newHashMap(table.specs()); Type fileProjection = schema.findType("data_file"); - this.fileSchema = fileProjection != null ? new Schema(fileProjection.asStructType().fields()) : new Schema(); + this.fileSchema = + fileProjection != null + ? 
new Schema(fileProjection.asStructType().fields()) + : new Schema(); } @Override @@ -126,12 +139,17 @@ public CloseableIterable rows() { // Project data-file fields CloseableIterable prunedRows; if (manifest.content() == ManifestContent.DATA) { - prunedRows = CloseableIterable.transform(ManifestFiles.read(manifest, io).project(fileSchema).entries(), - file -> (GenericManifestEntry) file); + prunedRows = + CloseableIterable.transform( + ManifestFiles.read(manifest, io).project(fileSchema).entries(), + file -> (GenericManifestEntry) file); } else { - prunedRows = CloseableIterable.transform(ManifestFiles.readDeleteManifest(manifest, io, specsById) - .project(fileSchema).entries(), - file -> (GenericManifestEntry) file); + prunedRows = + CloseableIterable.transform( + ManifestFiles.readDeleteManifest(manifest, io, specsById) + .project(fileSchema) + .entries(), + file -> (GenericManifestEntry) file); } // Project non-readable fields diff --git a/core/src/main/java/org/apache/iceberg/ManifestEntry.java b/core/src/main/java/org/apache/iceberg/ManifestEntry.java index 1199ee845ee6..bfac45321967 100644 --- a/core/src/main/java/org/apache/iceberg/ManifestEntry.java +++ b/core/src/main/java/org/apache/iceberg/ManifestEntry.java @@ -16,15 +16,14 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg; -import org.apache.iceberg.types.Types; -import org.apache.iceberg.types.Types.StructType; - import static org.apache.iceberg.types.Types.NestedField.optional; import static org.apache.iceberg.types.Types.NestedField.required; +import org.apache.iceberg.types.Types; +import org.apache.iceberg.types.Types.StructType; + interface ManifestEntry> { enum Status { EXISTING(0), @@ -54,17 +53,14 @@ static Schema getSchema(StructType partitionType) { } static Schema wrapFileSchema(StructType fileType) { - return new Schema(STATUS, SNAPSHOT_ID, SEQUENCE_NUMBER, required(DATA_FILE_ID, "data_file", fileType)); + return new Schema( + STATUS, SNAPSHOT_ID, SEQUENCE_NUMBER, required(DATA_FILE_ID, "data_file", fileType)); } - /** - * Returns the status of the file, whether EXISTING, ADDED, or DELETED. - */ + /** Returns the status of the file, whether EXISTING, ADDED, or DELETED. */ Status status(); - /** - * Returns id of the snapshot in which the file was added to the table. - */ + /** Returns id of the snapshot in which the file was added to the table. */ Long snapshotId(); /** @@ -74,9 +70,7 @@ static Schema wrapFileSchema(StructType fileType) { */ void setSnapshotId(long snapshotId); - /** - * Returns the sequence number of the snapshot in which the file was added to the table. - */ + /** Returns the sequence number of the snapshot in which the file was added to the table. */ Long sequenceNumber(); /** @@ -86,9 +80,7 @@ static Schema wrapFileSchema(StructType fileType) { */ void setSequenceNumber(long sequenceNumber); - /** - * Returns a file. - */ + /** Returns a file. */ F file(); ManifestEntry copy(); diff --git a/core/src/main/java/org/apache/iceberg/ManifestFiles.java b/core/src/main/java/org/apache/iceberg/ManifestFiles.java index 6e2c10f4ca44..f039907b8682 100644 --- a/core/src/main/java/org/apache/iceberg/ManifestFiles.java +++ b/core/src/main/java/org/apache/iceberg/ManifestFiles.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg; import java.io.IOException; @@ -34,14 +33,16 @@ import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap; public class ManifestFiles { - private ManifestFiles() { - } + private ManifestFiles() {} - private static final org.apache.avro.Schema MANIFEST_AVRO_SCHEMA = AvroSchemaUtil.convert(ManifestFile.schema(), - ImmutableMap.of( - ManifestFile.schema().asStruct(), GenericManifestFile.class.getName(), - ManifestFile.PARTITION_SUMMARY_TYPE, GenericPartitionFieldSummary.class.getName() - )); + private static final org.apache.avro.Schema MANIFEST_AVRO_SCHEMA = + AvroSchemaUtil.convert( + ManifestFile.schema(), + ImmutableMap.of( + ManifestFile.schema().asStruct(), + GenericManifestFile.class.getName(), + ManifestFile.PARTITION_SUMMARY_TYPE, + GenericPartitionFieldSummary.class.getName())); /** * Returns a {@link CloseableIterable} of file paths in the {@link ManifestFile}. @@ -58,10 +59,10 @@ public static CloseableIterable readPaths(ManifestFile manifest, FileIO /** * Returns a new {@link ManifestReader} for a {@link ManifestFile}. - *
<p>
- * Note: Callers should use {@link ManifestFiles#read(ManifestFile, FileIO, Map)} to ensure - * the schema used by filters is the latest table schema. This should be used only when reading - * a manifest without filters. + * + *
<p>
Note: Callers should use {@link ManifestFiles#read(ManifestFile, FileIO, Map)} to + * ensure the schema used by filters is the latest table schema. This should be used only when + * reading a manifest without filters. * * @param manifest a ManifestFile * @param io a FileIO @@ -79,9 +80,12 @@ public static ManifestReader read(ManifestFile manifest, FileIO io) { * @param specsById a Map from spec ID to partition spec * @return a {@link ManifestReader} */ - public static ManifestReader read(ManifestFile manifest, FileIO io, Map specsById) { - Preconditions.checkArgument(manifest.content() == ManifestContent.DATA, - "Cannot read a delete manifest with a ManifestReader: %s", manifest); + public static ManifestReader read( + ManifestFile manifest, FileIO io, Map specsById) { + Preconditions.checkArgument( + manifest.content() == ManifestContent.DATA, + "Cannot read a delete manifest with a ManifestReader: %s", + manifest); InputFile file = io.newInputFile(manifest.path(), manifest.length()); InheritableMetadata inheritableMetadata = InheritableMetadataFactory.fromManifest(manifest); return new ManifestReader<>(file, specsById, inheritableMetadata, FileType.DATA_FILES); @@ -89,9 +93,9 @@ public static ManifestReader read(ManifestFile manifest, FileIO io, Ma /** * Create a new {@link ManifestWriter}. - *
<p>
- * Manifests created by this writer have all entry snapshot IDs set to null. - * All entries will inherit the snapshot ID that will be assigned to the manifest on commit. + * + *
<p>
Manifests created by this writer have all entry snapshot IDs set to null. All entries will + * inherit the snapshot ID that will be assigned to the manifest on commit. * * @param spec {@link PartitionSpec} used to produce {@link DataFile} partition tuples * @param outputFile the destination file location @@ -110,15 +114,16 @@ public static ManifestWriter write(PartitionSpec spec, OutputFile outp * @param snapshotId a snapshot ID for the manifest entries, or null for an inherited ID * @return a manifest writer */ - public static ManifestWriter write(int formatVersion, PartitionSpec spec, OutputFile outputFile, - Long snapshotId) { + public static ManifestWriter write( + int formatVersion, PartitionSpec spec, OutputFile outputFile, Long snapshotId) { switch (formatVersion) { case 1: return new ManifestWriter.V1Writer(spec, outputFile, snapshotId); case 2: return new ManifestWriter.V2Writer(spec, outputFile, snapshotId); } - throw new UnsupportedOperationException("Cannot write manifest for table version: " + formatVersion); + throw new UnsupportedOperationException( + "Cannot write manifest for table version: " + formatVersion); } /** @@ -129,10 +134,12 @@ public static ManifestWriter write(int formatVersion, PartitionSpec sp * @param specsById a Map from spec ID to partition spec * @return a {@link ManifestReader} */ - public static ManifestReader readDeleteManifest(ManifestFile manifest, FileIO io, - Map specsById) { - Preconditions.checkArgument(manifest.content() == ManifestContent.DELETES, - "Cannot read a data manifest with a DeleteManifestReader: %s", manifest); + public static ManifestReader readDeleteManifest( + ManifestFile manifest, FileIO io, Map specsById) { + Preconditions.checkArgument( + manifest.content() == ManifestContent.DELETES, + "Cannot read a data manifest with a DeleteManifestReader: %s", + manifest); InputFile file = io.newInputFile(manifest.path(), manifest.length()); InheritableMetadata inheritableMetadata = InheritableMetadataFactory.fromManifest(manifest); return new ManifestReader<>(file, specsById, inheritableMetadata, FileType.DELETE_FILES); @@ -147,21 +154,23 @@ public static ManifestReader readDeleteManifest(ManifestFile manifes * @param snapshotId a snapshot ID for the manifest entries, or null for an inherited ID * @return a manifest writer */ - public static ManifestWriter writeDeleteManifest(int formatVersion, PartitionSpec spec, - OutputFile outputFile, Long snapshotId) { + public static ManifestWriter writeDeleteManifest( + int formatVersion, PartitionSpec spec, OutputFile outputFile, Long snapshotId) { switch (formatVersion) { case 1: throw new IllegalArgumentException("Cannot write delete files in a v1 table"); case 2: return new ManifestWriter.V2DeleteWriter(spec, outputFile, snapshotId); } - throw new UnsupportedOperationException("Cannot write manifest for table version: " + formatVersion); + throw new UnsupportedOperationException( + "Cannot write manifest for table version: " + formatVersion); } /** * Encode the {@link ManifestFile} to a byte array by using avro encoder. * - * @param manifestFile a {@link ManifestFile}, which should always be a {@link GenericManifestFile}. + * @param manifestFile a {@link ManifestFile}, which should always be a {@link + * GenericManifestFile}. * @return the binary data. * @throws IOException if encounter any IO error when encoding. */ @@ -174,7 +183,8 @@ public static byte[] encode(ManifestFile manifestFile) throws IOException { * Decode the binary data into a {@link ManifestFile}. 
* * @param manifestData the binary data. - * @return a {@link ManifestFile}. To be precise, it's a {@link GenericManifestFile} which don't expose to public. + * @return a {@link ManifestFile}. To be precise, it's a {@link GenericManifestFile} which don't + * expose to public. * @throws IOException if encounter any IO error when decoding. */ public static ManifestFile decode(byte[] manifestData) throws IOException { @@ -185,52 +195,73 @@ static ManifestReader open(ManifestFile manifest, FileIO io) { return open(manifest, io, null); } - static ManifestReader open(ManifestFile manifest, FileIO io, - Map specsById) { + static ManifestReader open( + ManifestFile manifest, FileIO io, Map specsById) { switch (manifest.content()) { case DATA: return ManifestFiles.read(manifest, io, specsById); case DELETES: return ManifestFiles.readDeleteManifest(manifest, io, specsById); } - throw new UnsupportedOperationException("Cannot read unknown manifest type: " + manifest.content()); + throw new UnsupportedOperationException( + "Cannot read unknown manifest type: " + manifest.content()); } - static ManifestFile copyAppendManifest(int formatVersion, - InputFile toCopy, Map specsById, - OutputFile outputFile, long snapshotId, - SnapshotSummary.Builder summaryBuilder) { + static ManifestFile copyAppendManifest( + int formatVersion, + InputFile toCopy, + Map specsById, + OutputFile outputFile, + long snapshotId, + SnapshotSummary.Builder summaryBuilder) { // use metadata that will add the current snapshot's ID for the rewrite InheritableMetadata inheritableMetadata = InheritableMetadataFactory.forCopy(snapshotId); try (ManifestReader reader = - new ManifestReader<>(toCopy, specsById, inheritableMetadata, FileType.DATA_FILES)) { + new ManifestReader<>(toCopy, specsById, inheritableMetadata, FileType.DATA_FILES)) { return copyManifestInternal( - formatVersion, reader, outputFile, snapshotId, summaryBuilder, ManifestEntry.Status.ADDED); + formatVersion, + reader, + outputFile, + snapshotId, + summaryBuilder, + ManifestEntry.Status.ADDED); } catch (IOException e) { throw new RuntimeIOException(e, "Failed to close manifest: %s", toCopy.location()); } } - static ManifestFile copyRewriteManifest(int formatVersion, - InputFile toCopy, Map specsById, - OutputFile outputFile, long snapshotId, - SnapshotSummary.Builder summaryBuilder) { - // for a rewritten manifest all snapshot ids should be set. use empty metadata to throw an exception if it is not + static ManifestFile copyRewriteManifest( + int formatVersion, + InputFile toCopy, + Map specsById, + OutputFile outputFile, + long snapshotId, + SnapshotSummary.Builder summaryBuilder) { + // for a rewritten manifest all snapshot ids should be set. 
use empty metadata to throw an + // exception if it is not InheritableMetadata inheritableMetadata = InheritableMetadataFactory.empty(); try (ManifestReader reader = - new ManifestReader<>(toCopy, specsById, inheritableMetadata, FileType.DATA_FILES)) { + new ManifestReader<>(toCopy, specsById, inheritableMetadata, FileType.DATA_FILES)) { return copyManifestInternal( - formatVersion, reader, outputFile, snapshotId, summaryBuilder, ManifestEntry.Status.EXISTING); + formatVersion, + reader, + outputFile, + snapshotId, + summaryBuilder, + ManifestEntry.Status.EXISTING); } catch (IOException e) { throw new RuntimeIOException(e, "Failed to close manifest: %s", toCopy.location()); } } @SuppressWarnings("Finally") - private static ManifestFile copyManifestInternal(int formatVersion, ManifestReader reader, - OutputFile outputFile, long snapshotId, - SnapshotSummary.Builder summaryBuilder, - ManifestEntry.Status allowedEntryStatus) { + private static ManifestFile copyManifestInternal( + int formatVersion, + ManifestReader reader, + OutputFile outputFile, + long snapshotId, + SnapshotSummary.Builder summaryBuilder, + ManifestEntry.Status allowedEntryStatus) { ManifestWriter writer = write(formatVersion, reader.spec(), outputFile, snapshotId); boolean threw = true; try { @@ -238,7 +269,8 @@ private static ManifestFile copyManifestInternal(int formatVersion, ManifestRead Preconditions.checkArgument( allowedEntryStatus == entry.status(), "Invalid manifest entry status: %s (allowed status: %s)", - entry.status(), allowedEntryStatus); + entry.status(), + allowedEntryStatus); switch (entry.status()) { case ADDED: summaryBuilder.addedFile(reader.spec(), entry.file()); diff --git a/core/src/main/java/org/apache/iceberg/ManifestFilterManager.java b/core/src/main/java/org/apache/iceberg/ManifestFilterManager.java index 81a7fbfad9a3..bd1175173ca5 100644 --- a/core/src/main/java/org/apache/iceberg/ManifestFilterManager.java +++ b/core/src/main/java/org/apache/iceberg/ManifestFilterManager.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg; import java.io.IOException; @@ -89,8 +88,8 @@ public String partition() { private final Supplier workerPoolSupplier; - protected ManifestFilterManager(Map specsById, - Supplier executorSupplier) { + protected ManifestFilterManager( + Map specsById, Supplier executorSupplier) { this.specsById = specsById; this.deleteFilePartitions = PartitionSet.create(specsById); this.dropPartitions = PartitionSet.create(specsById); @@ -98,7 +97,9 @@ protected ManifestFilterManager(Map specsById, } protected abstract void deleteFile(String location); + protected abstract ManifestWriter newManifestWriter(PartitionSpec spec); + protected abstract ManifestReader newManifestReader(ManifestFile manifest); protected void failAnyDelete() { @@ -121,9 +122,7 @@ protected void deleteByRowFilter(Expression expr) { this.deleteExpression = Expressions.or(deleteExpression, expr); } - /** - * Add a partition tuple to drop from the table during the delete phase. - */ + /** Add a partition tuple to drop from the table during the delete phase. */ protected void dropPartition(int specId, StructLike partition) { Preconditions.checkNotNull(partition, "Cannot delete files in invalid partition: null"); invalidateFilteredCache(); @@ -132,16 +131,17 @@ protected void dropPartition(int specId, StructLike partition) { /** * Set the sequence number used to remove old delete files. - *

- * Delete files with a sequence number older than the given value will be removed. By setting this to the sequence - number of the oldest data file in the table, this will continuously remove delete files that are no longer needed - because deletes cannot match any existing rows in the table. + * + * <p>
Delete files with a sequence number older than the given value will be removed. By setting + * this to the sequence number of the oldest data file in the table, this will continuously remove + * delete files that are no longer needed because deletes cannot match any existing rows in the + * table. * * @param sequenceNumber a sequence number used to remove old delete files */ protected void dropDeleteFilesOlderThan(long sequenceNumber) { - Preconditions.checkArgument(sequenceNumber >= 0, - "Invalid minimum data sequence number: %s", sequenceNumber); + Preconditions.checkArgument( + sequenceNumber >= 0, "Invalid minimum data sequence number: %s", sequenceNumber); this.minSequenceNumber = sequenceNumber; } @@ -149,9 +149,7 @@ void caseSensitive(boolean newCaseSensitive) { this.caseSensitive = newCaseSensitive; } - /** - * Add a specific path to be deleted in the new snapshot. - */ + /** Add a specific path to be deleted in the new snapshot. */ void delete(F file) { Preconditions.checkNotNull(file, "Cannot delete file: null"); invalidateFilteredCache(); @@ -159,9 +157,7 @@ void delete(F file) { deleteFilePartitions.add(file.specId(), file.partition()); } - /** - * Add a specific path to be deleted in the new snapshot. - */ + /** Add a specific path to be deleted in the new snapshot. */ void delete(CharSequence path) { Preconditions.checkNotNull(path, "Cannot delete file path: null"); invalidateFilteredCache(); @@ -185,12 +181,14 @@ List filterManifests(Schema tableSchema, List manife ManifestFile[] filtered = new ManifestFile[manifests.size()]; // open all of the manifest files in parallel, use index to avoid reordering Tasks.range(filtered.length) - .stopOnFailure().throwFailureWhenFinished() + .stopOnFailure() + .throwFailureWhenFinished() .executeWith(workerPoolSupplier.get()) - .run(index -> { - ManifestFile manifest = filterManifest(tableSchema, manifests.get(index)); - filtered[index] = manifest; - }); + .run( + index -> { + ManifestFile manifest = filterManifest(tableSchema, manifests.get(index)); + filtered[index] = manifest; + }); validateRequiredDeletes(filtered); @@ -221,7 +219,8 @@ SnapshotSummary.Builder buildSummary(Iterable manifests) { } /** - * Throws a {@link ValidationException} if any deleted file was not present in a filtered manifest. + * Throws a {@link ValidationException} if any deleted file was not present in a filtered + * manifest. * * @param manifests a set of filtered manifests */ @@ -229,7 +228,8 @@ SnapshotSummary.Builder buildSummary(Iterable manifests) { private void validateRequiredDeletes(ManifestFile... manifests) { if (failMissingDeletePaths) { CharSequenceSet deletedFiles = deletedFiles(manifests); - ValidationException.check(deletedFiles.containsAll(deletePaths), + ValidationException.check( + deletedFiles.containsAll(deletePaths), "Missing required files to delete: %s", COMMA.join(Iterables.filter(deletePaths, path -> !deletedFiles.contains(path)))); } @@ -253,7 +253,8 @@ private CharSequenceSet deletedFiles(ManifestFile[] manifests) { } /** - * Deletes filtered manifests that were created by this class, but are not in the committed manifest set. + * Deletes filtered manifests that were created by this class, but are not in the committed + * manifest set. * * @param committed the set of manifest files that were committed */ @@ -282,9 +283,7 @@ private void invalidateFilteredCache() { cleanUncommitted(SnapshotProducer.EMPTY_SET); } - /** - * @return a ManifestReader that is a filtered version of the input manifest. 
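The sequence-number cutoff that dropDeleteFilesOlderThan documents above reduces to a per-entry check like the sketch below; ManifestEntry is an internal type, and minDataSequenceNumber is an assumed variable holding the sequence number of the oldest data file in the table.

    // A delete file is droppable once it is older than every data file it could still apply to.
    static boolean isObsoleteDelete(ManifestEntry<DeleteFile> entry, long minDataSequenceNumber) {
      Long seq = entry.sequenceNumber(); // null while the sequence number is still inherited
      return seq != null && seq > 0 && seq < minDataSequenceNumber;
    }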
- */ + /** @return a ManifestReader that is a filtered version of the input manifest. */ private ManifestFile filterManifest(Schema tableSchema, ManifestFile manifest) { ManifestFile cached = filteredManifests.get(manifest); if (cached != null) { @@ -299,7 +298,8 @@ private ManifestFile filterManifest(Schema tableSchema, ManifestFile manifest) { try (ManifestReader reader = newManifestReader(manifest)) { PartitionSpec spec = reader.spec(); - PartitionAndMetricsEvaluator evaluator = new PartitionAndMetricsEvaluator(tableSchema, spec, deleteExpression); + PartitionAndMetricsEvaluator evaluator = + new PartitionAndMetricsEvaluator(tableSchema, spec, deleteExpression); // this assumes that the manifest doesn't have files to remove and streams through the // manifest without copying data. if a manifest does have a file to remove, this will break @@ -321,7 +321,8 @@ private boolean canContainDeletedFiles(ManifestFile manifest) { boolean canContainExpressionDeletes; if (deleteExpression != null && deleteExpression != Expressions.alwaysFalse()) { ManifestEvaluator manifestEvaluator = - ManifestEvaluator.forRowFilter(deleteExpression, specsById.get(manifest.partitionSpecId()), caseSensitive); + ManifestEvaluator.forRowFilter( + deleteExpression, specsById.get(manifest.partitionSpecId()), caseSensitive); canContainExpressionDeletes = manifestEvaluator.eval(manifest); } else { canContainExpressionDeletes = false; @@ -329,7 +330,8 @@ private boolean canContainDeletedFiles(ManifestFile manifest) { boolean canContainDroppedPartitions; if (dropPartitions.size() > 0) { - canContainDroppedPartitions = ManifestFileUtil.canContainAny(manifest, dropPartitions, specsById); + canContainDroppedPartitions = + ManifestFileUtil.canContainAny(manifest, dropPartitions, specsById); } else { canContainDroppedPartitions = false; } @@ -339,33 +341,44 @@ private boolean canContainDeletedFiles(ManifestFile manifest) { canContainDroppedFiles = true; } else if (deletePaths.size() > 0) { // because there were no path-only deletes, the set of deleted file partitions is valid - canContainDroppedFiles = ManifestFileUtil.canContainAny(manifest, deleteFilePartitions, specsById); + canContainDroppedFiles = + ManifestFileUtil.canContainAny(manifest, deleteFilePartitions, specsById); } else { canContainDroppedFiles = false; } - boolean canContainDropBySeq = manifest.content() == ManifestContent.DELETES && - manifest.minSequenceNumber() < minSequenceNumber; + boolean canContainDropBySeq = + manifest.content() == ManifestContent.DELETES + && manifest.minSequenceNumber() < minSequenceNumber; - return canContainExpressionDeletes || canContainDroppedPartitions || canContainDroppedFiles || canContainDropBySeq; + return canContainExpressionDeletes + || canContainDroppedPartitions + || canContainDroppedFiles + || canContainDropBySeq; } @SuppressWarnings("CollectionUndefinedEquality") - private boolean manifestHasDeletedFiles(PartitionAndMetricsEvaluator evaluator, ManifestReader reader) { + private boolean manifestHasDeletedFiles( + PartitionAndMetricsEvaluator evaluator, ManifestReader reader) { boolean isDelete = reader.isDeleteManifestReader(); for (ManifestEntry entry : reader.liveEntries()) { F file = entry.file(); - boolean markedForDelete = deletePaths.contains(file.path()) || - dropPartitions.contains(file.specId(), file.partition()) || - (isDelete && entry.sequenceNumber() > 0 && entry.sequenceNumber() < minSequenceNumber); + boolean markedForDelete = + deletePaths.contains(file.path()) + || dropPartitions.contains(file.specId(), 
file.partition()) + || (isDelete + && entry.sequenceNumber() > 0 + && entry.sequenceNumber() < minSequenceNumber); if (markedForDelete || evaluator.rowsMightMatch(file)) { boolean allRowsMatch = markedForDelete || evaluator.rowsMustMatch(file); ValidationException.check( - allRowsMatch || isDelete, // ignore delete files where some records may not match the expression + allRowsMatch + || isDelete, // ignore delete files where some records may not match the expression "Cannot delete file where some, but not all, rows match filter %s: %s", - this.deleteExpression, file.path()); + this.deleteExpression, + file.path()); if (allRowsMatch) { if (failAnyDelete) { @@ -382,8 +395,8 @@ private boolean manifestHasDeletedFiles(PartitionAndMetricsEvaluator evaluator, } @SuppressWarnings({"CollectionUndefinedEquality", "checkstyle:CyclomaticComplexity"}) - private ManifestFile filterManifestWithDeletedFiles(PartitionAndMetricsEvaluator evaluator, - ManifestFile manifest, ManifestReader reader) { + private ManifestFile filterManifestWithDeletedFiles( + PartitionAndMetricsEvaluator evaluator, ManifestFile manifest, ManifestReader reader) { boolean isDelete = reader.isDeleteManifestReader(); // when this point is reached, there is at least one file that will be deleted in the // manifest. produce a copy of the manifest with all deleted files removed. @@ -393,41 +406,53 @@ private ManifestFile filterManifestWithDeletedFiles(PartitionAndMetricsEvaluator try { ManifestWriter writer = newManifestWriter(reader.spec()); try { - reader.entries().forEach(entry -> { - F file = entry.file(); - boolean markedForDelete = deletePaths.contains(file.path()) || - dropPartitions.contains(file.specId(), file.partition()) || - (isDelete && entry.sequenceNumber() > 0 && entry.sequenceNumber() < minSequenceNumber); - if (entry.status() != ManifestEntry.Status.DELETED) { - if (markedForDelete || evaluator.rowsMightMatch(file)) { - boolean allRowsMatch = markedForDelete || evaluator.rowsMustMatch(file); - ValidationException.check( - allRowsMatch || isDelete, // ignore delete files where some records may not match the expression - "Cannot delete file where some, but not all, rows match filter %s: %s", - this.deleteExpression, file.path()); - - if (allRowsMatch) { - writer.delete(entry); - - CharSequenceWrapper wrapper = CharSequenceWrapper.wrap(entry.file().path()); - if (deletedPaths.contains(wrapper)) { - LOG.warn("Deleting a duplicate path from manifest {}: {}", manifest.path(), wrapper.get()); - duplicateDeleteCount += 1; - } else { - // only add the file to deletes if it is a new delete - // this keeps the snapshot summary accurate for non-duplicate data - deletedFiles.add(entry.file().copyWithoutStats()); - } - deletedPaths.add(wrapper); - } else { - writer.existing(entry); - } - - } else { - writer.existing(entry); - } - } - }); + reader + .entries() + .forEach( + entry -> { + F file = entry.file(); + boolean markedForDelete = + deletePaths.contains(file.path()) + || dropPartitions.contains(file.specId(), file.partition()) + || (isDelete + && entry.sequenceNumber() > 0 + && entry.sequenceNumber() < minSequenceNumber); + if (entry.status() != ManifestEntry.Status.DELETED) { + if (markedForDelete || evaluator.rowsMightMatch(file)) { + boolean allRowsMatch = markedForDelete || evaluator.rowsMustMatch(file); + ValidationException.check( + allRowsMatch + || isDelete, // ignore delete files where some records may not match + // the expression + "Cannot delete file where some, but not all, rows match filter %s: %s", + 
this.deleteExpression, + file.path()); + + if (allRowsMatch) { + writer.delete(entry); + + CharSequenceWrapper wrapper = CharSequenceWrapper.wrap(entry.file().path()); + if (deletedPaths.contains(wrapper)) { + LOG.warn( + "Deleting a duplicate path from manifest {}: {}", + manifest.path(), + wrapper.get()); + duplicateDeleteCount += 1; + } else { + // only add the file to deletes if it is a new delete + // this keeps the snapshot summary accurate for non-duplicate data + deletedFiles.add(entry.file().copyWithoutStats()); + } + deletedPaths.add(wrapper); + } else { + writer.existing(entry); + } + + } else { + writer.existing(entry); + } + } + }); } finally { writer.close(); } @@ -452,7 +477,8 @@ private ManifestFile filterManifestWithDeletedFiles(PartitionAndMetricsEvaluator private class PartitionAndMetricsEvaluator { private final Schema tableSchema; private final ResidualEvaluator residualEvaluator; - private final StructLikeMap> metricsEvaluators; + private final StructLikeMap> + metricsEvaluators; PartitionAndMetricsEvaluator(Schema tableSchema, PartitionSpec spec, Expression expr) { this.tableSchema = tableSchema; @@ -474,17 +500,24 @@ boolean rowsMustMatch(F file) { private Pair metricsEvaluators(F file) { // ResidualEvaluator removes predicates in the expression using strict/inclusive projections - // if strict projection returns true -> the pred would return true -> replace the pred with true - // if inclusive projection returns false -> the pred would return false -> replace the pred with false + // if strict projection returns true -> the pred would return true -> replace the pred with + // true + // if inclusive projection returns false -> the pred would return false -> replace the pred + // with false // otherwise, keep the original predicate and proceed to other predicates in the expression - // in other words, ResidualEvaluator returns a part of the expression that needs to be evaluated + // in other words, ResidualEvaluator returns a part of the expression that needs to be + // evaluated // for rows in the given partition using metrics - return metricsEvaluators.computeIfAbsent(file.partition(), partition -> { - Expression residual = residualEvaluator.residualFor(partition); - InclusiveMetricsEvaluator inclusive = new InclusiveMetricsEvaluator(tableSchema, residual, caseSensitive); - StrictMetricsEvaluator strict = new StrictMetricsEvaluator(tableSchema, residual, caseSensitive); - return Pair.of(inclusive, strict); - }); + return metricsEvaluators.computeIfAbsent( + file.partition(), + partition -> { + Expression residual = residualEvaluator.residualFor(partition); + InclusiveMetricsEvaluator inclusive = + new InclusiveMetricsEvaluator(tableSchema, residual, caseSensitive); + StrictMetricsEvaluator strict = + new StrictMetricsEvaluator(tableSchema, residual, caseSensitive); + return Pair.of(inclusive, strict); + }); } } } diff --git a/core/src/main/java/org/apache/iceberg/ManifestGroup.java b/core/src/main/java/org/apache/iceberg/ManifestGroup.java index bb0665540537..dd814b0aec70 100644 --- a/core/src/main/java/org/apache/iceberg/ManifestGroup.java +++ b/core/src/main/java/org/apache/iceberg/ManifestGroup.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
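A sketch of the residual-plus-metrics pattern that the PartitionAndMetricsEvaluator comments above describe, using the evaluator constructors visible in this diff; spec, tableSchema, filter, partition, and dataFile are assumed inputs.

    // The residual is the part of `filter` not already decided by this partition's values;
    // inclusive metrics answer "rows might match", strict metrics answer "all rows must match".
    ResidualEvaluator residuals = ResidualEvaluator.of(spec, filter, /* caseSensitive */ true);
    Expression residual = residuals.residualFor(partition);
    InclusiveMetricsEvaluator mightMatch = new InclusiveMetricsEvaluator(tableSchema, residual, true);
    StrictMetricsEvaluator mustMatch = new StrictMetricsEvaluator(tableSchema, residual, true);
    boolean fileMightHaveMatchingRows = mightMatch.eval(dataFile);
    boolean allRowsInFileMatch = mustMatch.eval(dataFile);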
*/ - package org.apache.iceberg; import com.github.benmanes.caffeine.cache.Caffeine; @@ -63,12 +62,14 @@ class ManifestGroup { private ExecutorService executorService; ManifestGroup(FileIO io, Iterable manifests) { - this(io, + this( + io, Iterables.filter(manifests, manifest -> manifest.content() == ManifestContent.DATA), Iterables.filter(manifests, manifest -> manifest.content() == ManifestContent.DELETES)); } - ManifestGroup(FileIO io, Iterable dataManifests, Iterable deleteManifests) { + ManifestGroup( + FileIO io, Iterable dataManifests, Iterable deleteManifests) { this.io = io; this.dataManifests = Sets.newHashSet(dataManifests); this.deleteIndexBuilder = DeleteFileIndex.builderFor(io, deleteManifests); @@ -112,7 +113,8 @@ ManifestGroup filterManifests(Predicate newManifestPredicate) { return this; } - ManifestGroup filterManifestEntries(Predicate> newManifestEntryPredicate) { + ManifestGroup filterManifestEntries( + Predicate> newManifestEntryPredicate) { this.manifestEntryPredicate = manifestEntryPredicate.and(newManifestEntryPredicate); return this; } @@ -150,9 +152,9 @@ ManifestGroup planWith(ExecutorService newExecutorService) { } /** - * Returns an iterable of scan tasks. It is safe to add entries of this iterable - * to a collection as {@link DataFile} in each {@link FileScanTask} is defensively - * copied. + * Returns an iterable of scan tasks. It is safe to add entries of this iterable to a collection + * as {@link DataFile} in each {@link FileScanTask} is defensively copied. + * * @return a {@link CloseableIterable} of {@link FileScanTask} */ public CloseableIterable planFiles() { @@ -160,11 +162,14 @@ public CloseableIterable planFiles() { } public CloseableIterable plan(CreateTasksFunction createTasksFunc) { - LoadingCache residualCache = Caffeine.newBuilder().build(specId -> { - PartitionSpec spec = specsById.get(specId); - Expression filter = ignoreResiduals ? Expressions.alwaysTrue() : dataFilter; - return ResidualEvaluator.of(spec, filter, caseSensitive); - }); + LoadingCache residualCache = + Caffeine.newBuilder() + .build( + specId -> { + PartitionSpec spec = specsById.get(specId); + Expression filter = ignoreResiduals ? 
Expressions.alwaysTrue() : dataFilter; + return ResidualEvaluator.of(spec, filter, caseSensitive); + }); DeleteFileIndex deleteFiles = deleteIndexBuilder.build(); @@ -173,17 +178,22 @@ public CloseableIterable plan(CreateTasksFunction cre select(ManifestReader.withStatsColumns(columns)); } - LoadingCache taskContextCache = Caffeine.newBuilder().build(specId -> { - PartitionSpec spec = specsById.get(specId); - ResidualEvaluator residuals = residualCache.get(specId); - return new TaskContext(spec, deleteFiles, residuals, dropStats); - }); - - Iterable> tasks = entries((manifest, entries) -> { - int specId = manifest.partitionSpecId(); - TaskContext taskContext = taskContextCache.get(specId); - return createTasksFunc.apply(entries, taskContext); - }); + LoadingCache taskContextCache = + Caffeine.newBuilder() + .build( + specId -> { + PartitionSpec spec = specsById.get(specId); + ResidualEvaluator residuals = residualCache.get(specId); + return new TaskContext(spec, deleteFiles, residuals, dropStats); + }); + + Iterable> tasks = + entries( + (manifest, entries) -> { + int specId = manifest.partitionSpecId(); + TaskContext taskContext = taskContextCache.get(specId); + return createTasksFunc.apply(entries, taskContext); + }); if (executorService != null) { return new ParallelIterable<>(tasks, executorService); @@ -192,10 +202,10 @@ public CloseableIterable plan(CreateTasksFunction cre } } - /** + /** * Returns an iterable for manifest entries in the set of manifests. - *

- * Entries are not copied and it is the caller's responsibility to make defensive copies if + * + * <p>
Entries are not copied and it is the caller's responsibility to make defensive copies if * adding these entries to a collection. * * @return a CloseableIterable of manifest entries. @@ -205,14 +215,22 @@ public CloseableIterable> entries() { } private Iterable> entries( - BiFunction>, CloseableIterable> entryFn) { - LoadingCache evalCache = specsById == null ? - null : Caffeine.newBuilder().build(specId -> { - PartitionSpec spec = specsById.get(specId); - return ManifestEvaluator.forPartitionFilter( - Expressions.and(partitionFilter, Projections.inclusive(spec, caseSensitive).project(dataFilter)), - spec, caseSensitive); - }); + BiFunction>, CloseableIterable> + entryFn) { + LoadingCache evalCache = + specsById == null + ? null + : Caffeine.newBuilder() + .build( + specId -> { + PartitionSpec spec = specsById.get(specId); + return ManifestEvaluator.forPartitionFilter( + Expressions.and( + partitionFilter, + Projections.inclusive(spec, caseSensitive).project(dataFilter)), + spec, + caseSensitive); + }); Evaluator evaluator; if (fileFilter != null && fileFilter != Expressions.alwaysTrue()) { @@ -221,85 +239,101 @@ private Iterable> entries( evaluator = null; } - Iterable matchingManifests = evalCache == null ? dataManifests : - Iterables.filter(dataManifests, manifest -> evalCache.get(manifest.partitionSpecId()).eval(manifest)); + Iterable matchingManifests = + evalCache == null + ? dataManifests + : Iterables.filter( + dataManifests, + manifest -> evalCache.get(manifest.partitionSpecId()).eval(manifest)); if (ignoreDeleted) { // only scan manifests that have entries other than deletes // remove any manifests that don't have any existing or added files. if either the added or // existing files count is missing, the manifest must be scanned. - matchingManifests = Iterables.filter(matchingManifests, - manifest -> manifest.hasAddedFiles() || manifest.hasExistingFiles()); + matchingManifests = + Iterables.filter( + matchingManifests, + manifest -> manifest.hasAddedFiles() || manifest.hasExistingFiles()); } if (ignoreExisting) { // only scan manifests that have entries other than existing // remove any manifests that don't have any deleted or added files. if either the added or // deleted files count is missing, the manifest must be scanned. 
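An illustrative way to consume planFiles() per the Javadoc above, which guarantees each FileScanTask carries a defensively copied DataFile; `group` stands for an already-configured ManifestGroup (an internal class), and the enclosing method is assumed to declare IOException for the close.

    List<FileScanTask> collected = Lists.newArrayList();
    try (CloseableIterable<FileScanTask> tasks = group.planFiles()) {
      for (FileScanTask task : tasks) {
        collected.add(task); // safe to retain: task.file() is already a defensive copy
      }
    }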
- matchingManifests = Iterables.filter(matchingManifests, - manifest -> manifest.hasAddedFiles() || manifest.hasDeletedFiles()); + matchingManifests = + Iterables.filter( + matchingManifests, + manifest -> manifest.hasAddedFiles() || manifest.hasDeletedFiles()); } matchingManifests = Iterables.filter(matchingManifests, manifestPredicate::test); return Iterables.transform( matchingManifests, - manifest -> new CloseableIterable() { - private CloseableIterable iterable; - - @Override - public CloseableIterator iterator() { - ManifestReader reader = ManifestFiles.read(manifest, io, specsById) - .filterRows(dataFilter) - .filterPartitions(partitionFilter) - .caseSensitive(caseSensitive) - .select(columns); - - CloseableIterable> entries; - if (ignoreDeleted) { - entries = reader.liveEntries(); - } else { - entries = reader.entries(); - } - - if (ignoreExisting) { - entries = CloseableIterable.filter(entries, - entry -> entry.status() != ManifestEntry.Status.EXISTING); - } - - if (evaluator != null) { - entries = CloseableIterable.filter(entries, - entry -> evaluator.eval((GenericDataFile) entry.file())); - } - - entries = CloseableIterable.filter(entries, manifestEntryPredicate); - - iterable = entryFn.apply(manifest, entries); - - return iterable.iterator(); - } - - @Override - public void close() throws IOException { - if (iterable != null) { - iterable.close(); - } - } - }); + manifest -> + new CloseableIterable() { + private CloseableIterable iterable; + + @Override + public CloseableIterator iterator() { + ManifestReader reader = + ManifestFiles.read(manifest, io, specsById) + .filterRows(dataFilter) + .filterPartitions(partitionFilter) + .caseSensitive(caseSensitive) + .select(columns); + + CloseableIterable> entries; + if (ignoreDeleted) { + entries = reader.liveEntries(); + } else { + entries = reader.entries(); + } + + if (ignoreExisting) { + entries = + CloseableIterable.filter( + entries, entry -> entry.status() != ManifestEntry.Status.EXISTING); + } + + if (evaluator != null) { + entries = + CloseableIterable.filter( + entries, entry -> evaluator.eval((GenericDataFile) entry.file())); + } + + entries = CloseableIterable.filter(entries, manifestEntryPredicate); + + iterable = entryFn.apply(manifest, entries); + + return iterable.iterator(); + } + + @Override + public void close() throws IOException { + if (iterable != null) { + iterable.close(); + } + } + }); } - private static CloseableIterable createFileScanTasks(CloseableIterable> entries, - TaskContext ctx) { - return CloseableIterable.transform(entries, entry -> { - DataFile dataFile = entry.file().copy(ctx.shouldKeepStats()); - DeleteFile[] deleteFiles = ctx.deletes().forEntry(entry); - return new BaseFileScanTask(dataFile, deleteFiles, ctx.schemaAsString(), ctx.specAsString(), ctx.residuals()); - }); + private static CloseableIterable createFileScanTasks( + CloseableIterable> entries, TaskContext ctx) { + return CloseableIterable.transform( + entries, + entry -> { + DataFile dataFile = entry.file().copy(ctx.shouldKeepStats()); + DeleteFile[] deleteFiles = ctx.deletes().forEntry(entry); + return new BaseFileScanTask( + dataFile, deleteFiles, ctx.schemaAsString(), ctx.specAsString(), ctx.residuals()); + }); } @FunctionalInterface interface CreateTasksFunction { - CloseableIterable apply(CloseableIterable> entries, TaskContext context); + CloseableIterable apply( + CloseableIterable> entries, TaskContext context); } static class TaskContext { @@ -309,7 +343,11 @@ static class TaskContext { private final ResidualEvaluator 
residuals; private final boolean dropStats; - TaskContext(PartitionSpec spec, DeleteFileIndex deletes, ResidualEvaluator residuals, boolean dropStats) { + TaskContext( + PartitionSpec spec, + DeleteFileIndex deletes, + ResidualEvaluator residuals, + boolean dropStats) { this.schemaAsString = SchemaParser.toJson(spec.schema()); this.specAsString = PartitionSpecParser.toJson(spec); this.deletes = deletes; diff --git a/core/src/main/java/org/apache/iceberg/ManifestListWriter.java b/core/src/main/java/org/apache/iceberg/ManifestListWriter.java index 6d95d41bc235..3f7f20d4df6c 100644 --- a/core/src/main/java/org/apache/iceberg/ManifestListWriter.java +++ b/core/src/main/java/org/apache/iceberg/ManifestListWriter.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg; import java.io.IOException; @@ -38,7 +37,8 @@ private ManifestListWriter(OutputFile file, Map meta) { protected abstract ManifestFile prepare(ManifestFile manifest); - protected abstract FileAppender newAppender(OutputFile file, Map meta); + protected abstract FileAppender newAppender( + OutputFile file, Map meta); @Override public void add(ManifestFile manifest) { @@ -74,11 +74,13 @@ static class V2Writer extends ManifestListWriter { private final V2Metadata.IndexedManifestFile wrapper; V2Writer(OutputFile snapshotFile, long snapshotId, Long parentSnapshotId, long sequenceNumber) { - super(snapshotFile, ImmutableMap.of( - "snapshot-id", String.valueOf(snapshotId), - "parent-snapshot-id", String.valueOf(parentSnapshotId), - "sequence-number", String.valueOf(sequenceNumber), - "format-version", "2")); + super( + snapshotFile, + ImmutableMap.of( + "snapshot-id", String.valueOf(snapshotId), + "parent-snapshot-id", String.valueOf(parentSnapshotId), + "sequence-number", String.valueOf(sequenceNumber), + "format-version", "2")); this.wrapper = new V2Metadata.IndexedManifestFile(snapshotId, sequenceNumber); } @@ -107,15 +109,18 @@ static class V1Writer extends ManifestListWriter { private final V1Metadata.IndexedManifestFile wrapper = new V1Metadata.IndexedManifestFile(); V1Writer(OutputFile snapshotFile, long snapshotId, Long parentSnapshotId) { - super(snapshotFile, ImmutableMap.of( - "snapshot-id", String.valueOf(snapshotId), - "parent-snapshot-id", String.valueOf(parentSnapshotId), - "format-version", "1")); + super( + snapshotFile, + ImmutableMap.of( + "snapshot-id", String.valueOf(snapshotId), + "parent-snapshot-id", String.valueOf(parentSnapshotId), + "format-version", "1")); } @Override protected ManifestFile prepare(ManifestFile manifest) { - Preconditions.checkArgument(manifest.content() == ManifestContent.DATA, + Preconditions.checkArgument( + manifest.content() == ManifestContent.DATA, "Cannot store delete manifests in a v1 table"); return wrapper.wrap(manifest); } diff --git a/core/src/main/java/org/apache/iceberg/ManifestLists.java b/core/src/main/java/org/apache/iceberg/ManifestLists.java index bf61716777ba..c7b3e5fee5a9 100644 --- a/core/src/main/java/org/apache/iceberg/ManifestLists.java +++ b/core/src/main/java/org/apache/iceberg/ManifestLists.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg; import java.io.IOException; @@ -30,36 +29,45 @@ import org.apache.iceberg.relocated.com.google.common.collect.Lists; class ManifestLists { - private ManifestLists() { - } + private ManifestLists() {} static List read(InputFile manifestList) { - try (CloseableIterable files = Avro.read(manifestList) - .rename("manifest_file", GenericManifestFile.class.getName()) - .rename("partitions", GenericPartitionFieldSummary.class.getName()) - .rename("r508", GenericPartitionFieldSummary.class.getName()) - .classLoader(GenericManifestFile.class.getClassLoader()) - .project(ManifestFile.schema()) - .reuseContainers(false) - .build()) { + try (CloseableIterable files = + Avro.read(manifestList) + .rename("manifest_file", GenericManifestFile.class.getName()) + .rename("partitions", GenericPartitionFieldSummary.class.getName()) + .rename("r508", GenericPartitionFieldSummary.class.getName()) + .classLoader(GenericManifestFile.class.getClassLoader()) + .project(ManifestFile.schema()) + .reuseContainers(false) + .build()) { return Lists.newLinkedList(files); } catch (IOException e) { - throw new RuntimeIOException(e, "Cannot read manifest list file: %s", manifestList.location()); + throw new RuntimeIOException( + e, "Cannot read manifest list file: %s", manifestList.location()); } } - static ManifestListWriter write(int formatVersion, OutputFile manifestListFile, - long snapshotId, Long parentSnapshotId, long sequenceNumber) { + static ManifestListWriter write( + int formatVersion, + OutputFile manifestListFile, + long snapshotId, + Long parentSnapshotId, + long sequenceNumber) { switch (formatVersion) { case 1: - Preconditions.checkArgument(sequenceNumber == TableMetadata.INITIAL_SEQUENCE_NUMBER, - "Invalid sequence number for v1 manifest list: %s", sequenceNumber); + Preconditions.checkArgument( + sequenceNumber == TableMetadata.INITIAL_SEQUENCE_NUMBER, + "Invalid sequence number for v1 manifest list: %s", + sequenceNumber); return new ManifestListWriter.V1Writer(manifestListFile, snapshotId, parentSnapshotId); case 2: - return new ManifestListWriter.V2Writer(manifestListFile, snapshotId, parentSnapshotId, sequenceNumber); + return new ManifestListWriter.V2Writer( + manifestListFile, snapshotId, parentSnapshotId, sequenceNumber); } - throw new UnsupportedOperationException("Cannot write manifest list for table version: " + formatVersion); + throw new UnsupportedOperationException( + "Cannot write manifest list for table version: " + formatVersion); } } diff --git a/core/src/main/java/org/apache/iceberg/ManifestMergeManager.java b/core/src/main/java/org/apache/iceberg/ManifestMergeManager.java index 070199427cfc..94eb8a110709 100644 --- a/core/src/main/java/org/apache/iceberg/ManifestMergeManager.java +++ b/core/src/main/java/org/apache/iceberg/ManifestMergeManager.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
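For context, the reader side of the ManifestLists class above can be exercised as in this sketch; `io` and `snapshot` are assumed variables, and because ManifestLists is package-private the call only compiles from within org.apache.iceberg.

    // Load the manifest-list entries written by ManifestListWriter for a committed snapshot.
    List<ManifestFile> manifests =
        ManifestLists.read(io.newInputFile(snapshot.manifestListLocation()));
    for (ManifestFile manifest : manifests) {
      System.out.println(manifest.path() + " spec=" + manifest.partitionSpecId());
    }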
*/ - package org.apache.iceberg; import java.io.IOException; @@ -49,8 +48,11 @@ abstract class ManifestMergeManager> { private final Supplier workerPoolSupplier; - ManifestMergeManager(long targetSizeBytes, int minCountToMerge, boolean mergeEnabled, - Supplier executorSupplier) { + ManifestMergeManager( + long targetSizeBytes, + int minCountToMerge, + boolean mergeEnabled, + Supplier executorSupplier) { this.targetSizeBytes = targetSizeBytes; this.minCountToMerge = minCountToMerge; this.mergeEnabled = mergeEnabled; @@ -58,9 +60,13 @@ abstract class ManifestMergeManager> { } protected abstract long snapshotId(); + protected abstract PartitionSpec spec(int specId); + protected abstract void deleteFile(String location); + protected abstract ManifestWriter newManifestWriter(PartitionSpec spec); + protected abstract ManifestReader newManifestReader(ManifestFile manifest); Iterable mergeManifests(Iterable manifests) { @@ -96,17 +102,19 @@ void cleanUncommitted(Set committed) { } } - private ListMultimap groupBySpec(ManifestFile first, Iterator remaining) { - ListMultimap groups = Multimaps.newListMultimap( - Maps.newTreeMap(Comparator.reverseOrder()), - Lists::newArrayList); + private ListMultimap groupBySpec( + ManifestFile first, Iterator remaining) { + ListMultimap groups = + Multimaps.newListMultimap( + Maps.newTreeMap(Comparator.reverseOrder()), Lists::newArrayList); groups.put(first.partitionSpecId(), first); remaining.forEachRemaining(manifest -> groups.put(manifest.partitionSpecId(), manifest)); return groups; } @SuppressWarnings("unchecked") - private Iterable mergeGroup(ManifestFile first, int specId, List group) { + private Iterable mergeGroup( + ManifestFile first, int specId, List group) { // use a lookback of 1 to avoid reordering the manifests. using 1 also means this should pack // from the end so that the manifest that gets under-filled is the first one, which will be // merged the next time. @@ -116,34 +124,38 @@ private Iterable mergeGroup(ManifestFile first, int specId, List[] binResults = (List[]) - Array.newInstance(List.class, bins.size()); + List[] binResults = + (List[]) Array.newInstance(List.class, bins.size()); Tasks.range(bins.size()) - .stopOnFailure().throwFailureWhenFinished() + .stopOnFailure() + .throwFailureWhenFinished() .executeWith(workerPoolSupplier.get()) - .run(index -> { - List bin = bins.get(index); - List outputManifests = Lists.newArrayList(); - binResults[index] = outputManifests; - - if (bin.size() == 1) { - // no need to rewrite - outputManifests.add(bin.get(0)); - return; - } + .run( + index -> { + List bin = bins.get(index); + List outputManifests = Lists.newArrayList(); + binResults[index] = outputManifests; + + if (bin.size() == 1) { + // no need to rewrite + outputManifests.add(bin.get(0)); + return; + } - // if the bin has the first manifest (the new data files or an appended manifest file) then only merge it - // if the number of manifests is above the minimum count. this is applied only to bins with an in-memory - // manifest so that large manifests don't prevent merging older groups. - if (bin.contains(first) && bin.size() < minCountToMerge) { - // not enough to merge, add all manifest files to the output list - outputManifests.addAll(bin); - } else { - // merge the group - outputManifests.add(createManifest(specId, bin)); - } - }); + // if the bin has the first manifest (the new data files or an appended manifest file) + // then only merge it + // if the number of manifests is above the minimum count. 
this is applied only to bins + // with an in-memory + // manifest so that large manifests don't prevent merging older groups. + if (bin.contains(first) && bin.size() < minCountToMerge) { + // not enough to merge, add all manifest files to the output list + outputManifests.addAll(bin); + } else { + // merge the group + outputManifests.add(createManifest(specId, bin)); + } + }); return Iterables.concat(binResults); } @@ -192,6 +204,5 @@ private ManifestFile createManifest(int specId, List bin) { mergedManifests.put(bin, manifest); return manifest; - } } diff --git a/core/src/main/java/org/apache/iceberg/ManifestReader.java b/core/src/main/java/org/apache/iceberg/ManifestReader.java index 9b465169773e..19875e8084cf 100644 --- a/core/src/main/java/org/apache/iceberg/ManifestReader.java +++ b/core/src/main/java/org/apache/iceberg/ManifestReader.java @@ -16,9 +16,10 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg; +import static org.apache.iceberg.expressions.Expressions.alwaysTrue; + import java.io.IOException; import java.util.Collection; import java.util.List; @@ -44,19 +45,23 @@ import org.apache.iceberg.types.Types; import org.apache.iceberg.util.PartitionSet; -import static org.apache.iceberg.expressions.Expressions.alwaysTrue; - /** * Base reader for data and delete manifest files. * * @param The Java class of files returned by this reader. */ -public class ManifestReader> - extends CloseableGroup implements CloseableIterable { +public class ManifestReader> extends CloseableGroup + implements CloseableIterable { static final ImmutableList ALL_COLUMNS = ImmutableList.of("*"); - private static final Set STATS_COLUMNS = ImmutableSet.of( - "value_counts", "null_value_counts", "nan_value_counts", "lower_bounds", "upper_bounds", "record_count"); + private static final Set STATS_COLUMNS = + ImmutableSet.of( + "value_counts", + "null_value_counts", + "nan_value_counts", + "lower_bounds", + "upper_bounds", + "record_count"); protected enum FileType { DATA_FILES(GenericDataFile.class.getName()), @@ -92,17 +97,21 @@ private String fileClass() { private Evaluator lazyEvaluator = null; private InclusiveMetricsEvaluator lazyMetricsEvaluator = null; - protected ManifestReader(InputFile file, Map specsById, - InheritableMetadata inheritableMetadata, FileType content) { + protected ManifestReader( + InputFile file, + Map specsById, + InheritableMetadata inheritableMetadata, + FileType content) { this.file = file; this.inheritableMetadata = inheritableMetadata; this.content = content; try { - try (AvroIterable> headerReader = Avro.read(file) - .project(ManifestEntry.getSchema(Types.StructType.of()).select("status")) - .classLoader(GenericManifestEntry.class.getClassLoader()) - .build()) { + try (AvroIterable> headerReader = + Avro.read(file) + .project(ManifestEntry.getSchema(Types.StructType.of()).select("status")) + .classLoader(GenericManifestEntry.class.getClassLoader()) + .build()) { this.metadata = headerReader.getMetadata(); } } catch (IOException e) { @@ -119,7 +128,8 @@ protected ManifestReader(InputFile file, Map specsById, this.spec = specsById.get(specId); } else { Schema schema = SchemaParser.fromJson(metadata.get("schema")); - this.spec = PartitionSpecParser.fromJsonFields(schema, specId, metadata.get("partition-spec")); + this.spec = + PartitionSpecParser.fromJsonFields(schema, specId, metadata.get("partition-spec")); } this.fileSchema = new Schema(DataFile.getType(spec.partitionType()).fields()); @@ -142,15 +152,16 @@ public 
PartitionSpec spec() { } public ManifestReader select(Collection newColumns) { - Preconditions.checkState(fileProjection == null, + Preconditions.checkState( + fileProjection == null, "Cannot select columns using both select(String...) and project(Schema)"); this.columns = newColumns; return this; } public ManifestReader project(Schema newFileProjection) { - Preconditions.checkState(columns == null, - "Cannot select columns using both select(String...) and project(Schema)"); + Preconditions.checkState( + columns == null, "Cannot select columns using both select(String...) and project(Schema)"); this.fileProjection = newFileProjection; return this; } @@ -176,29 +187,32 @@ public ManifestReader caseSensitive(boolean isCaseSensitive) { } CloseableIterable> entries() { - if ((rowFilter != null && rowFilter != Expressions.alwaysTrue()) || - (partFilter != null && partFilter != Expressions.alwaysTrue()) || - (partitionSet != null)) { + if ((rowFilter != null && rowFilter != Expressions.alwaysTrue()) + || (partFilter != null && partFilter != Expressions.alwaysTrue()) + || (partitionSet != null)) { Evaluator evaluator = evaluator(); InclusiveMetricsEvaluator metricsEvaluator = metricsEvaluator(); // ensure stats columns are present for metrics evaluation boolean requireStatsProjection = requireStatsProjection(rowFilter, columns); - Collection projectColumns = requireStatsProjection ? withStatsColumns(columns) : columns; + Collection projectColumns = + requireStatsProjection ? withStatsColumns(columns) : columns; return CloseableIterable.filter( open(projection(fileSchema, fileProjection, projectColumns, caseSensitive)), - entry -> entry != null && - evaluator.eval(entry.file().partition()) && - metricsEvaluator.eval(entry.file()) && - inPartitionSet(entry.file())); + entry -> + entry != null + && evaluator.eval(entry.file().partition()) + && metricsEvaluator.eval(entry.file()) + && inPartitionSet(entry.file())); } else { return open(projection(fileSchema, fileProjection, columns, caseSensitive)); } } private boolean inPartitionSet(F fileToCheck) { - return partitionSet == null || partitionSet.contains(fileToCheck.specId(), fileToCheck.partition()); + return partitionSet == null + || partitionSet.contains(fileToCheck.specId(), fileToCheck.partition()); } private CloseableIterable> open(Schema projection) { @@ -211,16 +225,17 @@ private CloseableIterable> open(Schema projection) { switch (format) { case AVRO: - AvroIterable> reader = Avro.read(file) - .project(ManifestEntry.wrapFileSchema(Types.StructType.of(fields))) - .rename("manifest_entry", GenericManifestEntry.class.getName()) - .rename("partition", PartitionData.class.getName()) - .rename("r102", PartitionData.class.getName()) - .rename("data_file", content.fileClass()) - .rename("r2", content.fileClass()) - .classLoader(GenericManifestEntry.class.getClassLoader()) - .reuseContainers() - .build(); + AvroIterable> reader = + Avro.read(file) + .project(ManifestEntry.wrapFileSchema(Types.StructType.of(fields))) + .rename("manifest_entry", GenericManifestEntry.class.getName()) + .rename("partition", PartitionData.class.getName()) + .rename("r102", PartitionData.class.getName()) + .rename("data_file", content.fileClass()) + .rename("r2", content.fileClass()) + .classLoader(GenericManifestEntry.class.getClassLoader()) + .reuseContainers() + .build(); addCloseable(reader); @@ -232,19 +247,20 @@ private CloseableIterable> open(Schema projection) { } CloseableIterable> liveEntries() { - return CloseableIterable.filter(entries(), - entry -> entry != 
null && entry.status() != ManifestEntry.Status.DELETED); + return CloseableIterable.filter( + entries(), entry -> entry != null && entry.status() != ManifestEntry.Status.DELETED); } - /** - * @return an Iterator of DataFile. Makes defensive copies of files before returning - */ + /** @return an Iterator of DataFile. Makes defensive copies of files before returning */ @Override public CloseableIterator iterator() { - return CloseableIterable.transform(liveEntries(), e -> e.file().copy(!dropStats(rowFilter, columns))).iterator(); + return CloseableIterable.transform( + liveEntries(), e -> e.file().copy(!dropStats(rowFilter, columns))) + .iterator(); } - private static Schema projection(Schema schema, Schema project, Collection columns, boolean caseSensitive) { + private static Schema projection( + Schema schema, Schema project, Collection columns, boolean caseSensitive) { if (columns != null) { if (caseSensitive) { return schema.select(columns); @@ -265,7 +281,8 @@ private Evaluator evaluator() { if (finalPartFilter != null) { this.lazyEvaluator = new Evaluator(spec.partitionType(), finalPartFilter, caseSensitive); } else { - this.lazyEvaluator = new Evaluator(spec.partitionType(), Expressions.alwaysTrue(), caseSensitive); + this.lazyEvaluator = + new Evaluator(spec.partitionType(), Expressions.alwaysTrue(), caseSensitive); } } return lazyEvaluator; @@ -274,11 +291,11 @@ private Evaluator evaluator() { private InclusiveMetricsEvaluator metricsEvaluator() { if (lazyMetricsEvaluator == null) { if (rowFilter != null) { - this.lazyMetricsEvaluator = new InclusiveMetricsEvaluator( - spec.schema(), rowFilter, caseSensitive); + this.lazyMetricsEvaluator = + new InclusiveMetricsEvaluator(spec.schema(), rowFilter, caseSensitive); } else { - this.lazyMetricsEvaluator = new InclusiveMetricsEvaluator( - spec.schema(), Expressions.alwaysTrue(), caseSensitive); + this.lazyMetricsEvaluator = + new InclusiveMetricsEvaluator(spec.schema(), Expressions.alwaysTrue(), caseSensitive); } } return lazyMetricsEvaluator; @@ -286,19 +303,22 @@ private InclusiveMetricsEvaluator metricsEvaluator() { private static boolean requireStatsProjection(Expression rowFilter, Collection columns) { // Make sure we have all stats columns for metrics evaluator - return rowFilter != Expressions.alwaysTrue() && - columns != null && - !columns.containsAll(ManifestReader.ALL_COLUMNS) && - !columns.containsAll(STATS_COLUMNS); + return rowFilter != Expressions.alwaysTrue() + && columns != null + && !columns.containsAll(ManifestReader.ALL_COLUMNS) + && !columns.containsAll(STATS_COLUMNS); } static boolean dropStats(Expression rowFilter, Collection columns) { // Make sure we only drop all stats if we had projected all stats - // We do not drop stats even if we had partially added some stats columns, except for record_count column. - // Since we don't want to keep stats map which could be huge in size just because we select record_count, which + // We do not drop stats even if we had partially added some stats columns, except for + // record_count column. + // Since we don't want to keep stats map which could be huge in size just because we select + // record_count, which // is a primitive type. 
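A reading-side sketch of the ManifestReader contract documented above: selected columns limit what is projected, and iteration returns defensive copies of each DataFile. The manifest, io, and specsById variables are assumed, and the surrounding method is assumed to declare IOException.

    try (ManifestReader<DataFile> reader =
        ManifestFiles.read(manifest, io, specsById)
            .select(ImmutableList.of("file_path", "record_count"))) {
      for (DataFile file : reader) { // each DataFile is a copy with only the selected columns populated
        System.out.println(file.path() + " -> " + file.recordCount());
      }
    }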
- if (rowFilter != Expressions.alwaysTrue() && columns != null && - !columns.containsAll(ManifestReader.ALL_COLUMNS)) { + if (rowFilter != Expressions.alwaysTrue() + && columns != null + && !columns.containsAll(ManifestReader.ALL_COLUMNS)) { Set intersection = Sets.intersection(Sets.newHashSet(columns), STATS_COLUMNS); return intersection.isEmpty() || intersection.equals(Sets.newHashSet("record_count")); } diff --git a/core/src/main/java/org/apache/iceberg/ManifestWriter.java b/core/src/main/java/org/apache/iceberg/ManifestWriter.java index 7c81c15e0cc0..7de0cb7be561 100644 --- a/core/src/main/java/org/apache/iceberg/ManifestWriter.java +++ b/core/src/main/java/org/apache/iceberg/ManifestWriter.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg; import java.io.IOException; @@ -29,7 +28,8 @@ /** * Writer for manifest files. * - * @param Java class of files written to the manifest, either {@link DataFile} or {@link DeleteFile}. + * @param Java class of files written to the manifest, either {@link DataFile} or {@link + * DeleteFile}. */ public abstract class ManifestWriter> implements FileAppender { // stand-in for the current sequence number that will be assigned when the commit is successful @@ -63,7 +63,8 @@ private ManifestWriter(PartitionSpec spec, OutputFile file, Long snapshotId) { protected abstract ManifestEntry prepare(ManifestEntry entry); - protected abstract FileAppender> newAppender(PartitionSpec spec, OutputFile outputFile); + protected abstract FileAppender> newAppender( + PartitionSpec spec, OutputFile outputFile); protected ManifestContent content() { return ManifestContent.DATA; @@ -85,7 +86,8 @@ void addEntry(ManifestEntry entry) { break; } stats.update(entry.file().partition()); - if (entry.sequenceNumber() != null && (minSequenceNumber == null || entry.sequenceNumber() < minSequenceNumber)) { + if (entry.sequenceNumber() != null + && (minSequenceNumber == null || entry.sequenceNumber() < minSequenceNumber)) { this.minSequenceNumber = entry.sequenceNumber(); } writer.add(prepare(entry)); @@ -93,8 +95,8 @@ void addEntry(ManifestEntry entry) { /** * Add an added entry for a file. - *

- * The entry's snapshot ID will be this manifest's snapshot ID. + * + * <p>
The entry's snapshot ID will be this manifest's snapshot ID. * * @param addedFile a data file */ @@ -105,9 +107,9 @@ public void add(F addedFile) { /** * Add an added entry for a file with a specific sequence number. - *

- * The entry's snapshot ID will be this manifest's snapshot ID. - * The entry's sequence number will be the provided sequence number. + * + * <p>
The entry's snapshot ID will be this manifest's snapshot ID. The entry's sequence number + * will be the provided sequence number. * * @param addedFile a data file * @param sequenceNumber sequence number for the data file @@ -141,8 +143,8 @@ void existing(ManifestEntry entry) { /** * Add a delete entry for a file. - *

- * The entry's snapshot ID will be this manifest's snapshot ID. + * + * <p>
The entry's snapshot ID will be this manifest's snapshot ID. * * @param deletedFile a file */ @@ -168,12 +170,27 @@ public long length() { public ManifestFile toManifestFile() { Preconditions.checkState(closed, "Cannot build ManifestFile, writer is not closed"); - // if the minSequenceNumber is null, then no manifests with a sequence number have been written, so the min - // sequence number is the one that will be assigned when this is committed. pass UNASSIGNED_SEQ to inherit it. + // if the minSequenceNumber is null, then no manifests with a sequence number have been written, + // so the min + // sequence number is the one that will be assigned when this is committed. pass UNASSIGNED_SEQ + // to inherit it. long minSeqNumber = minSequenceNumber != null ? minSequenceNumber : UNASSIGNED_SEQ; - return new GenericManifestFile(file.location(), writer.length(), specId, content(), - UNASSIGNED_SEQ, minSeqNumber, snapshotId, - addedFiles, addedRows, existingFiles, existingRows, deletedFiles, deletedRows, stats.summaries(), null); + return new GenericManifestFile( + file.location(), + writer.length(), + specId, + content(), + UNASSIGNED_SEQ, + minSeqNumber, + snapshotId, + addedFiles, + addedRows, + existingFiles, + existingRows, + deletedFiles, + deletedRows, + stats.summaries(), + null); } @Override @@ -196,7 +213,8 @@ protected ManifestEntry prepare(ManifestEntry entry) { } @Override - protected FileAppender> newAppender(PartitionSpec spec, OutputFile file) { + protected FileAppender> newAppender( + PartitionSpec spec, OutputFile file) { Schema manifestSchema = V2Metadata.entrySchema(spec.partitionType()); try { return Avro.write(file) @@ -229,7 +247,8 @@ protected ManifestEntry prepare(ManifestEntry entry) { } @Override - protected FileAppender> newAppender(PartitionSpec spec, OutputFile file) { + protected FileAppender> newAppender( + PartitionSpec spec, OutputFile file) { Schema manifestSchema = V2Metadata.entrySchema(spec.partitionType()); try { return Avro.write(file) @@ -267,7 +286,8 @@ protected ManifestEntry prepare(ManifestEntry entry) { } @Override - protected FileAppender> newAppender(PartitionSpec spec, OutputFile file) { + protected FileAppender> newAppender( + PartitionSpec spec, OutputFile file) { Schema manifestSchema = V1Metadata.entrySchema(spec.partitionType()); try { return Avro.write(file) diff --git a/core/src/main/java/org/apache/iceberg/ManifestsTable.java b/core/src/main/java/org/apache/iceberg/ManifestsTable.java index b7b081d3814d..88f9943ddfd4 100644 --- a/core/src/main/java/org/apache/iceberg/ManifestsTable.java +++ b/core/src/main/java/org/apache/iceberg/ManifestsTable.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg; import java.util.List; @@ -26,29 +25,31 @@ import org.apache.iceberg.types.Conversions; import org.apache.iceberg.types.Types; -/** - * A {@link Table} implementation that exposes a table's manifest files as rows. - */ +/** A {@link Table} implementation that exposes a table's manifest files as rows. 
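The same information that ManifestsTable exposes as rows can also be reached directly from a Table handle; a minimal sketch, assuming `table` is a loaded org.apache.iceberg.Table with at least one snapshot.

    for (ManifestFile manifest : table.currentSnapshot().allManifests(table.io())) {
      System.out.printf(
          "%s content=%s length=%d specId=%d addedFiles=%s%n",
          manifest.path(),
          manifest.content(),
          manifest.length(),
          manifest.partitionSpecId(),
          manifest.addedFilesCount());
    }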
*/ public class ManifestsTable extends BaseMetadataTable { - private static final Schema SNAPSHOT_SCHEMA = new Schema( - Types.NestedField.required(14, "content", Types.IntegerType.get()), - Types.NestedField.required(1, "path", Types.StringType.get()), - Types.NestedField.required(2, "length", Types.LongType.get()), - Types.NestedField.required(3, "partition_spec_id", Types.IntegerType.get()), - Types.NestedField.required(4, "added_snapshot_id", Types.LongType.get()), - Types.NestedField.required(5, "added_data_files_count", Types.IntegerType.get()), - Types.NestedField.required(6, "existing_data_files_count", Types.IntegerType.get()), - Types.NestedField.required(7, "deleted_data_files_count", Types.IntegerType.get()), - Types.NestedField.required(15, "added_delete_files_count", Types.IntegerType.get()), - Types.NestedField.required(16, "existing_delete_files_count", Types.IntegerType.get()), - Types.NestedField.required(17, "deleted_delete_files_count", Types.IntegerType.get()), - Types.NestedField.required(8, "partition_summaries", Types.ListType.ofRequired(9, Types.StructType.of( - Types.NestedField.required(10, "contains_null", Types.BooleanType.get()), - Types.NestedField.optional(11, "contains_nan", Types.BooleanType.get()), - Types.NestedField.optional(12, "lower_bound", Types.StringType.get()), - Types.NestedField.optional(13, "upper_bound", Types.StringType.get()) - ))) - ); + private static final Schema SNAPSHOT_SCHEMA = + new Schema( + Types.NestedField.required(14, "content", Types.IntegerType.get()), + Types.NestedField.required(1, "path", Types.StringType.get()), + Types.NestedField.required(2, "length", Types.LongType.get()), + Types.NestedField.required(3, "partition_spec_id", Types.IntegerType.get()), + Types.NestedField.required(4, "added_snapshot_id", Types.LongType.get()), + Types.NestedField.required(5, "added_data_files_count", Types.IntegerType.get()), + Types.NestedField.required(6, "existing_data_files_count", Types.IntegerType.get()), + Types.NestedField.required(7, "deleted_data_files_count", Types.IntegerType.get()), + Types.NestedField.required(15, "added_delete_files_count", Types.IntegerType.get()), + Types.NestedField.required(16, "existing_delete_files_count", Types.IntegerType.get()), + Types.NestedField.required(17, "deleted_delete_files_count", Types.IntegerType.get()), + Types.NestedField.required( + 8, + "partition_summaries", + Types.ListType.ofRequired( + 9, + Types.StructType.of( + Types.NestedField.required(10, "contains_null", Types.BooleanType.get()), + Types.NestedField.optional(11, "contains_nan", Types.BooleanType.get()), + Types.NestedField.optional(12, "lower_bound", Types.StringType.get()), + Types.NestedField.optional(13, "upper_bound", Types.StringType.get()))))); ManifestsTable(TableOperations ops, Table table) { this(ops, table, table.name() + ".manifests"); @@ -80,12 +81,13 @@ protected DataTask task(TableScan scan) { return StaticDataTask.of( ops.io().newInputFile(location != null ? location : ops.current().metadataFileLocation()), - schema(), scan.schema(), scan.snapshot().allManifests(ops.io()), + schema(), + scan.schema(), + scan.snapshot().allManifests(ops.io()), manifest -> { PartitionSpec spec = specs.get(manifest.partitionSpecId()); return ManifestsTable.manifestFileToRow(spec, manifest); - } - ); + }); } private class ManifestsTableScan extends StaticTableScan { @@ -107,12 +109,11 @@ static StaticDataTask.Row manifestFileToRow(PartitionSpec spec, ManifestFile man manifest.content() == ManifestContent.DELETES ? 
manifest.addedFilesCount() : 0, manifest.content() == ManifestContent.DELETES ? manifest.existingFilesCount() : 0, manifest.content() == ManifestContent.DELETES ? manifest.deletedFilesCount() : 0, - partitionSummariesToRows(spec, manifest.partitions()) - ); + partitionSummariesToRows(spec, manifest.partitions())); } - static List partitionSummariesToRows(PartitionSpec spec, - List summaries) { + static List partitionSummariesToRows( + PartitionSpec spec, List summaries) { if (summaries == null) { return null; } @@ -121,14 +122,22 @@ static List partitionSummariesToRows(PartitionSpec spec, for (int i = 0; i < summaries.size(); i += 1) { ManifestFile.PartitionFieldSummary summary = summaries.get(i); - rows.add(StaticDataTask.Row.of( - summary.containsNull(), - summary.containsNaN(), - spec.fields().get(i).transform().toHumanString( - Conversions.fromByteBuffer(spec.partitionType().fields().get(i).type(), summary.lowerBound())), - spec.fields().get(i).transform().toHumanString( - Conversions.fromByteBuffer(spec.partitionType().fields().get(i).type(), summary.upperBound())) - )); + rows.add( + StaticDataTask.Row.of( + summary.containsNull(), + summary.containsNaN(), + spec.fields() + .get(i) + .transform() + .toHumanString( + Conversions.fromByteBuffer( + spec.partitionType().fields().get(i).type(), summary.lowerBound())), + spec.fields() + .get(i) + .transform() + .toHumanString( + Conversions.fromByteBuffer( + spec.partitionType().fields().get(i).type(), summary.upperBound())))); } return rows; diff --git a/core/src/main/java/org/apache/iceberg/MergeAppend.java b/core/src/main/java/org/apache/iceberg/MergeAppend.java index fb3a9a8419b5..1781e95e9db6 100644 --- a/core/src/main/java/org/apache/iceberg/MergeAppend.java +++ b/core/src/main/java/org/apache/iceberg/MergeAppend.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg; import org.apache.iceberg.exceptions.CommitFailedException; @@ -24,8 +23,9 @@ /** * Append implementation that produces a minimal number of manifest files. - *

- * This implementation will attempt to commit 5 times before throwing {@link CommitFailedException}. + * + *

This implementation will attempt to commit 5 times before throwing {@link + * CommitFailedException}. */ class MergeAppend extends MergingSnapshotProducer implements AppendFiles { MergeAppend(String tableName, TableOperations ops) { @@ -50,13 +50,15 @@ public MergeAppend appendFile(DataFile file) { @Override public AppendFiles appendManifest(ManifestFile manifest) { - Preconditions.checkArgument(!manifest.hasExistingFiles(), "Cannot append manifest with existing files"); - Preconditions.checkArgument(!manifest.hasDeletedFiles(), "Cannot append manifest with deleted files"); + Preconditions.checkArgument( + !manifest.hasExistingFiles(), "Cannot append manifest with existing files"); + Preconditions.checkArgument( + !manifest.hasDeletedFiles(), "Cannot append manifest with deleted files"); Preconditions.checkArgument( manifest.snapshotId() == null || manifest.snapshotId() == -1, "Snapshot id must be assigned during commit"); - Preconditions.checkArgument(manifest.sequenceNumber() == -1, - "Sequence must be assigned during commit"); + Preconditions.checkArgument( + manifest.sequenceNumber() == -1, "Sequence must be assigned during commit"); add(manifest); return this; } diff --git a/core/src/main/java/org/apache/iceberg/MergingSnapshotProducer.java b/core/src/main/java/org/apache/iceberg/MergingSnapshotProducer.java index e184b50e8284..789c6c23c32b 100644 --- a/core/src/main/java/org/apache/iceberg/MergingSnapshotProducer.java +++ b/core/src/main/java/org/apache/iceberg/MergingSnapshotProducer.java @@ -16,9 +16,15 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg; +import static org.apache.iceberg.TableProperties.MANIFEST_MIN_MERGE_COUNT; +import static org.apache.iceberg.TableProperties.MANIFEST_MIN_MERGE_COUNT_DEFAULT; +import static org.apache.iceberg.TableProperties.MANIFEST_TARGET_SIZE_BYTES; +import static org.apache.iceberg.TableProperties.MANIFEST_TARGET_SIZE_BYTES_DEFAULT; +import static org.apache.iceberg.TableProperties.SNAPSHOT_ID_INHERITANCE_ENABLED; +import static org.apache.iceberg.TableProperties.SNAPSHOT_ID_INHERITANCE_ENABLED_DEFAULT; + import java.io.IOException; import java.io.UncheckedIOException; import java.util.Arrays; @@ -52,13 +58,6 @@ import org.slf4j.Logger; import org.slf4j.LoggerFactory; -import static org.apache.iceberg.TableProperties.MANIFEST_MIN_MERGE_COUNT; -import static org.apache.iceberg.TableProperties.MANIFEST_MIN_MERGE_COUNT_DEFAULT; -import static org.apache.iceberg.TableProperties.MANIFEST_TARGET_SIZE_BYTES; -import static org.apache.iceberg.TableProperties.MANIFEST_TARGET_SIZE_BYTES_DEFAULT; -import static org.apache.iceberg.TableProperties.SNAPSHOT_ID_INHERITANCE_ENABLED; -import static org.apache.iceberg.TableProperties.SNAPSHOT_ID_INHERITANCE_ENABLED_DEFAULT; - abstract class MergingSnapshotProducer extends SnapshotProducer { private static final Logger LOG = LoggerFactory.getLogger(MergingSnapshotProducer.class); @@ -109,18 +108,25 @@ abstract class MergingSnapshotProducer extends SnapshotProducer { this.tableName = tableName; this.ops = ops; this.dataSpec = null; - long targetSizeBytes = ops.current() - .propertyAsLong(MANIFEST_TARGET_SIZE_BYTES, MANIFEST_TARGET_SIZE_BYTES_DEFAULT); - int minCountToMerge = ops.current() - .propertyAsInt(MANIFEST_MIN_MERGE_COUNT, MANIFEST_MIN_MERGE_COUNT_DEFAULT); - boolean mergeEnabled = ops.current() - .propertyAsBoolean(TableProperties.MANIFEST_MERGE_ENABLED, TableProperties.MANIFEST_MERGE_ENABLED_DEFAULT); + long targetSizeBytes = + 
ops.current() + .propertyAsLong(MANIFEST_TARGET_SIZE_BYTES, MANIFEST_TARGET_SIZE_BYTES_DEFAULT); + int minCountToMerge = + ops.current().propertyAsInt(MANIFEST_MIN_MERGE_COUNT, MANIFEST_MIN_MERGE_COUNT_DEFAULT); + boolean mergeEnabled = + ops.current() + .propertyAsBoolean( + TableProperties.MANIFEST_MERGE_ENABLED, + TableProperties.MANIFEST_MERGE_ENABLED_DEFAULT); this.mergeManager = new DataFileMergeManager(targetSizeBytes, minCountToMerge, mergeEnabled); this.filterManager = new DataFileFilterManager(); - this.deleteMergeManager = new DeleteFileMergeManager(targetSizeBytes, minCountToMerge, mergeEnabled); + this.deleteMergeManager = + new DeleteFileMergeManager(targetSizeBytes, minCountToMerge, mergeEnabled); this.deleteFilterManager = new DeleteFileFilterManager(); - this.snapshotIdInheritanceEnabled = ops.current() - .propertyAsBoolean(SNAPSHOT_ID_INHERITANCE_ENABLED, SNAPSHOT_ID_INHERITANCE_ENABLED_DEFAULT); + this.snapshotIdInheritanceEnabled = + ops.current() + .propertyAsBoolean( + SNAPSHOT_ID_INHERITANCE_ENABLED, SNAPSHOT_ID_INHERITANCE_ENABLED_DEFAULT); } @Override @@ -141,7 +147,8 @@ protected boolean isCaseSensitive() { } protected PartitionSpec dataSpec() { - Preconditions.checkState(dataSpec != null, "Cannot determine partition spec: no data files have been added"); + Preconditions.checkState( + dataSpec != null, "Cannot determine partition spec: no data files have been added"); // the spec is set when the write is started return dataSpec; } @@ -173,44 +180,36 @@ protected void failMissingDeletePaths() { protected void deleteByRowFilter(Expression expr) { this.deleteExpression = expr; filterManager.deleteByRowFilter(expr); - // if a delete file matches the row filter, then it can be deleted because the rows will also be deleted + // if a delete file matches the row filter, then it can be deleted because the rows will also be + // deleted deleteFilterManager.deleteByRowFilter(expr); } - /** - * Add a partition tuple to drop from the table during the delete phase. - */ + /** Add a partition tuple to drop from the table during the delete phase. */ protected void dropPartition(int specId, StructLike partition) { // dropping the data in a partition also drops all deletes in the partition filterManager.dropPartition(specId, partition); deleteFilterManager.dropPartition(specId, partition); } - /** - * Add a specific data file to be deleted in the new snapshot. - */ + /** Add a specific data file to be deleted in the new snapshot. */ protected void delete(DataFile file) { filterManager.delete(file); } - /** - * Add a specific delete file to be deleted in the new snapshot. - */ + /** Add a specific delete file to be deleted in the new snapshot. */ protected void delete(DeleteFile file) { deleteFilterManager.delete(file); } - /** - * Add a specific data path to be deleted in the new snapshot. - */ + /** Add a specific data path to be deleted in the new snapshot. */ protected void delete(CharSequence path) { - // this is an old call that never worked for delete files and can only be used to remove data files. + // this is an old call that never worked for delete files and can only be used to remove data + // files. filterManager.delete(path); } - /** - * Add a data file to the new snapshot. - */ + /** Add a data file to the new snapshot. */ protected void add(DataFile file) { Preconditions.checkNotNull(file, "Invalid data file: null"); setDataSpec(file); @@ -219,13 +218,12 @@ protected void add(DataFile file) { newFiles.add(file); } - /** - * Add a delete file to the new snapshot. 
- */ + /** Add a delete file to the new snapshot. */ protected void add(DeleteFile file) { Preconditions.checkNotNull(file, "Invalid delete file: null"); PartitionSpec fileSpec = ops.current().spec(file.specId()); - List deleteFiles = newDeleteFilesBySpec.computeIfAbsent(file.specId(), specId -> Lists.newArrayList()); + List deleteFiles = + newDeleteFilesBySpec.computeIfAbsent(file.specId(), specId -> Lists.newArrayList()); deleteFiles.add(file); addedFilesSummary.addedFile(fileSpec, file); hasNewDeleteFiles = true; @@ -233,7 +231,8 @@ protected void add(DeleteFile file) { private void setDataSpec(DataFile file) { PartitionSpec fileSpec = ops.current().spec(file.specId()); - Preconditions.checkNotNull(fileSpec, "Cannot find partition spec for data file: %s", file.path()); + Preconditions.checkNotNull( + fileSpec, "Cannot find partition spec for data file: %s", file.path()); if (dataSpec == null) { dataSpec = fileSpec; } else if (dataSpec.specId() != file.specId()) { @@ -241,12 +240,10 @@ private void setDataSpec(DataFile file) { } } - /** - * Add all files in a manifest to the new snapshot. - */ + /** Add all files in a manifest to the new snapshot. */ protected void add(ManifestFile manifest) { - Preconditions.checkArgument(manifest.content() == ManifestContent.DATA, - "Cannot append delete manifest: %s", manifest); + Preconditions.checkArgument( + manifest.content() == ManifestContent.DATA, "Cannot append delete manifest: %s", manifest); if (snapshotIdInheritanceEnabled && manifest.snapshotId() == null) { appendedManifestsSummary.addedManifest(manifest); appendManifests.add(manifest); @@ -262,49 +259,62 @@ private ManifestFile copyManifest(ManifestFile manifest) { InputFile toCopy = ops.io().newInputFile(manifest.path()); OutputFile newManifestPath = newManifestOutput(); return ManifestFiles.copyAppendManifest( - current.formatVersion(), toCopy, current.specsById(), newManifestPath, snapshotId(), appendedManifestsSummary); + current.formatVersion(), + toCopy, + current.specsById(), + newManifestPath, + snapshotId(), + appendedManifestsSummary); } /** - * Validates that no files matching given partitions have been added to the table since a starting snapshot. + * Validates that no files matching given partitions have been added to the table since a starting + * snapshot. 
* * @param base table metadata to validate * @param startingSnapshotId id of the snapshot current at the start of the operation * @param partitionSet a set of partitions to filter new conflicting data files */ - protected void validateAddedDataFiles(TableMetadata base, Long startingSnapshotId, PartitionSet partitionSet) { + protected void validateAddedDataFiles( + TableMetadata base, Long startingSnapshotId, PartitionSet partitionSet) { CloseableIterable> conflictEntries = addedDataFiles(base, startingSnapshotId, null, partitionSet); try (CloseableIterator> conflicts = conflictEntries.iterator()) { if (conflicts.hasNext()) { - throw new ValidationException("Found conflicting files that can contain records matching partitions %s: %s", + throw new ValidationException( + "Found conflicting files that can contain records matching partitions %s: %s", partitionSet, - Iterators.toString(Iterators.transform(conflicts, entry -> entry.file().path().toString()))); + Iterators.toString( + Iterators.transform(conflicts, entry -> entry.file().path().toString()))); } } catch (IOException e) { - throw new UncheckedIOException(String.format("Failed to validate no appends matching %s", partitionSet), e); + throw new UncheckedIOException( + String.format("Failed to validate no appends matching %s", partitionSet), e); } } /** - * Validates that no files matching a filter have been added to the table since a starting snapshot. + * Validates that no files matching a filter have been added to the table since a starting + * snapshot. * * @param base table metadata to validate * @param startingSnapshotId id of the snapshot current at the start of the operation * @param conflictDetectionFilter an expression used to find new conflicting data files */ - protected void validateAddedDataFiles(TableMetadata base, Long startingSnapshotId, - Expression conflictDetectionFilter) { + protected void validateAddedDataFiles( + TableMetadata base, Long startingSnapshotId, Expression conflictDetectionFilter) { CloseableIterable> conflictEntries = addedDataFiles(base, startingSnapshotId, conflictDetectionFilter, null); try (CloseableIterator> conflicts = conflictEntries.iterator()) { if (conflicts.hasNext()) { - throw new ValidationException("Found conflicting files that can contain records matching %s: %s", + throw new ValidationException( + "Found conflicting files that can contain records matching %s: %s", conflictDetectionFilter, - Iterators.toString(Iterators.transform(conflicts, entry -> entry.file().path().toString()))); + Iterators.toString( + Iterators.transform(conflicts, entry -> entry.file().path().toString()))); } } catch (IOException e) { @@ -314,81 +324,92 @@ protected void validateAddedDataFiles(TableMetadata base, Long startingSnapshotI } /** - * Returns an iterable of files matching a filter have been added to the table since a starting snapshot. + * Returns an iterable of files matching a filter have been added to the table since a starting + * snapshot. 
* * @param base table metadata to validate * @param startingSnapshotId id of the snapshot current at the start of the operation * @param dataFilter an expression used to find new data files * @param partitionSet a set of partitions to find new data files */ - private CloseableIterable> addedDataFiles(TableMetadata base, - Long startingSnapshotId, - Expression dataFilter, - PartitionSet partitionSet) { + private CloseableIterable> addedDataFiles( + TableMetadata base, + Long startingSnapshotId, + Expression dataFilter, + PartitionSet partitionSet) { // if there is no current table state, no files have been added if (base.currentSnapshot() == null) { return CloseableIterable.empty(); } Pair, Set> history = - validationHistory(base, startingSnapshotId, VALIDATE_ADDED_FILES_OPERATIONS, ManifestContent.DATA); + validationHistory( + base, startingSnapshotId, VALIDATE_ADDED_FILES_OPERATIONS, ManifestContent.DATA); List manifests = history.first(); Set newSnapshots = history.second(); - ManifestGroup manifestGroup = new ManifestGroup(ops.io(), manifests, ImmutableList.of()) - .caseSensitive(caseSensitive) - .filterManifestEntries(entry -> newSnapshots.contains(entry.snapshotId())) - .specsById(base.specsById()) - .ignoreDeleted() - .ignoreExisting(); + ManifestGroup manifestGroup = + new ManifestGroup(ops.io(), manifests, ImmutableList.of()) + .caseSensitive(caseSensitive) + .filterManifestEntries(entry -> newSnapshots.contains(entry.snapshotId())) + .specsById(base.specsById()) + .ignoreDeleted() + .ignoreExisting(); if (dataFilter != null) { manifestGroup = manifestGroup.filterData(dataFilter); } if (partitionSet != null) { - manifestGroup = manifestGroup.filterManifestEntries(entry -> - partitionSet.contains(entry.file().specId(), entry.file().partition())); + manifestGroup = + manifestGroup.filterManifestEntries( + entry -> partitionSet.contains(entry.file().specId(), entry.file().partition())); } return manifestGroup.entries(); } /** - * Validates that no new delete files that must be applied to the given data files have been added to the table since - * a starting snapshot. + * Validates that no new delete files that must be applied to the given data files have been added + * to the table since a starting snapshot. * * @param base table metadata to validate * @param startingSnapshotId id of the snapshot current at the start of the operation * @param dataFiles data files to validate have no new row deletes */ - protected void validateNoNewDeletesForDataFiles(TableMetadata base, Long startingSnapshotId, - Iterable dataFiles) { - validateNoNewDeletesForDataFiles(base, startingSnapshotId, null, dataFiles, newFilesSequenceNumber != null); + protected void validateNoNewDeletesForDataFiles( + TableMetadata base, Long startingSnapshotId, Iterable dataFiles) { + validateNoNewDeletesForDataFiles( + base, startingSnapshotId, null, dataFiles, newFilesSequenceNumber != null); } /** - * Validates that no new delete files that must be applied to the given data files have been added to the table since - * a starting snapshot. + * Validates that no new delete files that must be applied to the given data files have been added + * to the table since a starting snapshot. 
* * @param base table metadata to validate * @param startingSnapshotId id of the snapshot current at the start of the operation * @param dataFilter a data filter * @param dataFiles data files to validate have no new row deletes */ - protected void validateNoNewDeletesForDataFiles(TableMetadata base, Long startingSnapshotId, - Expression dataFilter, Iterable dataFiles) { + protected void validateNoNewDeletesForDataFiles( + TableMetadata base, + Long startingSnapshotId, + Expression dataFilter, + Iterable dataFiles) { validateNoNewDeletesForDataFiles(base, startingSnapshotId, dataFilter, dataFiles, false); } /** - * Validates that no new delete files that must be applied to the given data files have been added to the table since - * a starting snapshot, with the option to ignore equality deletes during the validation. - *

- * For example, in the case of rewriting data files, if the added data files have the same sequence number as the - * replaced data files, equality deletes added at a higher sequence number are still effective against the added - * data files, so there is no risk of commit conflict between RewriteFiles and RowDelta. In cases like this, - * validation against equality delete files can be omitted. + * Validates that no new delete files that must be applied to the given data files have been added + * to the table since a starting snapshot, with the option to ignore equality deletes during the + * validation. + * + *

For example, in the case of rewriting data files, if the added data files have the same + * sequence number as the replaced data files, equality deletes added at a higher sequence number + * are still effective against the added data files, so there is no risk of commit conflict + * between RewriteFiles and RowDelta. In cases like this, validation against equality delete files + * can be omitted. * * @param base table metadata to validate * @param startingSnapshotId id of the snapshot current at the start of the operation @@ -396,9 +417,12 @@ protected void validateNoNewDeletesForDataFiles(TableMetadata base, Long startin * @param dataFiles data files to validate have no new row deletes * @param ignoreEqualityDeletes whether equality deletes should be ignored in validation */ - private void validateNoNewDeletesForDataFiles(TableMetadata base, Long startingSnapshotId, - Expression dataFilter, Iterable dataFiles, - boolean ignoreEqualityDeletes) { + private void validateNoNewDeletesForDataFiles( + TableMetadata base, + Long startingSnapshotId, + Expression dataFilter, + Iterable dataFiles, + boolean ignoreEqualityDeletes) { // if there is no current table state, no files have been added if (base.currentSnapshot() == null || base.formatVersion() < 2) { return; @@ -408,46 +432,58 @@ private void validateNoNewDeletesForDataFiles(TableMetadata base, Long startingS long startingSequenceNumber = startingSequenceNumber(base, startingSnapshotId); for (DataFile dataFile : dataFiles) { - // if any delete is found that applies to files written in or before the starting snapshot, fail + // if any delete is found that applies to files written in or before the starting snapshot, + // fail DeleteFile[] deleteFiles = deletes.forDataFile(startingSequenceNumber, dataFile); if (ignoreEqualityDeletes) { ValidationException.check( - Arrays.stream(deleteFiles).noneMatch(deleteFile -> deleteFile.content() == FileContent.POSITION_DELETES), - "Cannot commit, found new position delete for replaced data file: %s", dataFile); + Arrays.stream(deleteFiles) + .noneMatch(deleteFile -> deleteFile.content() == FileContent.POSITION_DELETES), + "Cannot commit, found new position delete for replaced data file: %s", + dataFile); } else { - ValidationException.check(deleteFiles.length == 0, - "Cannot commit, found new delete for replaced data file: %s", dataFile); + ValidationException.check( + deleteFiles.length == 0, + "Cannot commit, found new delete for replaced data file: %s", + dataFile); } } } /** - * Validates that no delete files matching a filter have been added to the table since a starting snapshot. + * Validates that no delete files matching a filter have been added to the table since a starting + * snapshot. 
* * @param base table metadata to validate * @param startingSnapshotId id of the snapshot current at the start of the operation * @param dataFilter an expression used to find new conflicting delete files */ - protected void validateNoNewDeleteFiles(TableMetadata base, Long startingSnapshotId, Expression dataFilter) { + protected void validateNoNewDeleteFiles( + TableMetadata base, Long startingSnapshotId, Expression dataFilter) { DeleteFileIndex deletes = addedDeleteFiles(base, startingSnapshotId, dataFilter, null); - ValidationException.check(deletes.isEmpty(), + ValidationException.check( + deletes.isEmpty(), "Found new conflicting delete files that can apply to records matching %s: %s", - dataFilter, Iterables.transform(deletes.referencedDeleteFiles(), ContentFile::path)); + dataFilter, + Iterables.transform(deletes.referencedDeleteFiles(), ContentFile::path)); } /** - * Validates that no delete files matching a partition set have been added to the table since a starting snapshot. + * Validates that no delete files matching a partition set have been added to the table since a + * starting snapshot. * * @param base table metadata to validate * @param startingSnapshotId id of the snapshot current at the start of the operation * @param partitionSet a partition set used to find new conflicting delete files */ - protected void validateNoNewDeleteFiles(TableMetadata base, Long startingSnapshotId, - PartitionSet partitionSet) { + protected void validateNoNewDeleteFiles( + TableMetadata base, Long startingSnapshotId, PartitionSet partitionSet) { DeleteFileIndex deletes = addedDeleteFiles(base, startingSnapshotId, null, partitionSet); - ValidationException.check(deletes.isEmpty(), + ValidationException.check( + deletes.isEmpty(), "Found new conflicting delete files that can apply to records matching %s: %s", - partitionSet, Iterables.transform(deletes.referencedDeleteFiles(), ContentFile::path)); + partitionSet, + Iterables.transform(deletes.referencedDeleteFiles(), ContentFile::path)); } /** @@ -458,8 +494,11 @@ protected void validateNoNewDeleteFiles(TableMetadata base, Long startingSnapsho * @param dataFilter an expression used to find delete files * @param partitionSet a partition set used to find delete files */ - protected DeleteFileIndex addedDeleteFiles(TableMetadata base, Long startingSnapshotId, Expression dataFilter, - PartitionSet partitionSet) { + protected DeleteFileIndex addedDeleteFiles( + TableMetadata base, + Long startingSnapshotId, + Expression dataFilter, + PartitionSet partitionSet) { // if there is no current table state, return empty delete file index if (base.currentSnapshot() == null || base.formatVersion() < 2) { return DeleteFileIndex.builderFor(ops.io(), ImmutableList.of()) @@ -468,7 +507,11 @@ protected DeleteFileIndex addedDeleteFiles(TableMetadata base, Long startingSnap } Pair, Set> history = - validationHistory(base, startingSnapshotId, VALIDATE_ADDED_DELETE_FILES_OPERATIONS, ManifestContent.DELETES); + validationHistory( + base, + startingSnapshotId, + VALIDATE_ADDED_DELETE_FILES_OPERATIONS, + ManifestContent.DELETES); List deleteManifests = history.first(); long startingSequenceNumber = startingSequenceNumber(base, startingSnapshotId); @@ -476,22 +519,25 @@ protected DeleteFileIndex addedDeleteFiles(TableMetadata base, Long startingSnap } /** - * Validates that no files matching a filter have been deleted from the table since a starting snapshot. + * Validates that no files matching a filter have been deleted from the table since a starting + * snapshot. 
* * @param base table metadata to validate * @param startingSnapshotId id of the snapshot current at the start of the operation * @param dataFilter an expression used to find deleted data files */ - protected void validateDeletedDataFiles(TableMetadata base, Long startingSnapshotId, - Expression dataFilter) { + protected void validateDeletedDataFiles( + TableMetadata base, Long startingSnapshotId, Expression dataFilter) { CloseableIterable> conflictEntries = deletedDataFiles(base, startingSnapshotId, dataFilter, null); try (CloseableIterator> conflicts = conflictEntries.iterator()) { if (conflicts.hasNext()) { - throw new ValidationException("Found conflicting deleted files that can contain records matching %s: %s", + throw new ValidationException( + "Found conflicting deleted files that can contain records matching %s: %s", dataFilter, - Iterators.toString(Iterators.transform(conflicts, entry -> entry.file().path().toString()))); + Iterators.toString( + Iterators.transform(conflicts, entry -> entry.file().path().toString()))); } } catch (IOException e) { @@ -501,22 +547,25 @@ protected void validateDeletedDataFiles(TableMetadata base, Long startingSnapsho } /** - * Validates that no files matching a filter have been deleted from the table since a starting snapshot. + * Validates that no files matching a filter have been deleted from the table since a starting + * snapshot. * * @param base table metadata to validate * @param startingSnapshotId id of the snapshot current at the start of the operation * @param partitionSet a partition set used to find deleted data files */ - protected void validateDeletedDataFiles(TableMetadata base, Long startingSnapshotId, - PartitionSet partitionSet) { + protected void validateDeletedDataFiles( + TableMetadata base, Long startingSnapshotId, PartitionSet partitionSet) { CloseableIterable> conflictEntries = deletedDataFiles(base, startingSnapshotId, null, partitionSet); try (CloseableIterator> conflicts = conflictEntries.iterator()) { if (conflicts.hasNext()) { - throw new ValidationException("Found conflicting deleted files that can apply to records matching %s: %s", + throw new ValidationException( + "Found conflicting deleted files that can apply to records matching %s: %s", partitionSet, - Iterators.toString(Iterators.transform(conflicts, entry -> entry.file().path().toString()))); + Iterators.toString( + Iterators.transform(conflicts, entry -> entry.file().path().toString()))); } } catch (IOException e) { @@ -525,43 +574,47 @@ protected void validateDeletedDataFiles(TableMetadata base, Long startingSnapsho } } - /** - * Returns an iterable of files matching a filter have been added to the table since a starting snapshot. + * Returns an iterable of files matching a filter have been added to the table since a starting + * snapshot. 
* * @param base table metadata to validate * @param startingSnapshotId id of the snapshot current at the start of the operation * @param dataFilter an expression used to find deleted data files * @param partitionSet a set of partitions to find deleted data files */ - private CloseableIterable> deletedDataFiles(TableMetadata base, - Long startingSnapshotId, - Expression dataFilter, - PartitionSet partitionSet) { + private CloseableIterable> deletedDataFiles( + TableMetadata base, + Long startingSnapshotId, + Expression dataFilter, + PartitionSet partitionSet) { // if there is no current table state, no files have been deleted if (base.currentSnapshot() == null) { return CloseableIterable.empty(); } Pair, Set> history = - validationHistory(base, startingSnapshotId, VALIDATE_DATA_FILES_EXIST_OPERATIONS, ManifestContent.DATA); + validationHistory( + base, startingSnapshotId, VALIDATE_DATA_FILES_EXIST_OPERATIONS, ManifestContent.DATA); List manifests = history.first(); Set newSnapshots = history.second(); - ManifestGroup manifestGroup = new ManifestGroup(ops.io(), manifests, ImmutableList.of()) - .caseSensitive(caseSensitive) - .filterManifestEntries(entry -> newSnapshots.contains(entry.snapshotId())) - .filterManifestEntries(entry -> entry.status().equals(ManifestEntry.Status.DELETED)) - .specsById(base.specsById()) - .ignoreExisting(); + ManifestGroup manifestGroup = + new ManifestGroup(ops.io(), manifests, ImmutableList.of()) + .caseSensitive(caseSensitive) + .filterManifestEntries(entry -> newSnapshots.contains(entry.snapshotId())) + .filterManifestEntries(entry -> entry.status().equals(ManifestEntry.Status.DELETED)) + .specsById(base.specsById()) + .ignoreExisting(); if (dataFilter != null) { manifestGroup = manifestGroup.filterData(dataFilter); } if (partitionSet != null) { - manifestGroup = manifestGroup.filterManifestEntries(entry -> - partitionSet.contains(entry.file().specId(), entry.file().partition())); + manifestGroup = + manifestGroup.filterManifestEntries( + entry -> partitionSet.contains(entry.file().specId(), entry.file().partition())); } return manifestGroup.entries(); @@ -580,12 +633,16 @@ private long startingSequenceNumber(TableMetadata metadata, Long staringSnapshot } } - private DeleteFileIndex buildDeleteFileIndex(List deleteManifests, long startingSequenceNumber, - Expression dataFilter, PartitionSet partitionSet) { - DeleteFileIndex.Builder builder = DeleteFileIndex.builderFor(ops.io(), deleteManifests) - .afterSequenceNumber(startingSequenceNumber) - .caseSensitive(caseSensitive) - .specsById(ops.current().specsById()); + private DeleteFileIndex buildDeleteFileIndex( + List deleteManifests, + long startingSequenceNumber, + Expression dataFilter, + PartitionSet partitionSet) { + DeleteFileIndex.Builder builder = + DeleteFileIndex.builderFor(ops.io(), deleteManifests) + .afterSequenceNumber(startingSequenceNumber) + .caseSensitive(caseSensitive) + .specsById(ops.current().specsById()); if (dataFilter != null) { builder.filterData(dataFilter); @@ -599,37 +656,48 @@ private DeleteFileIndex buildDeleteFileIndex(List deleteManifests, } @SuppressWarnings("CollectionUndefinedEquality") - protected void validateDataFilesExist(TableMetadata base, Long startingSnapshotId, - CharSequenceSet requiredDataFiles, boolean skipDeletes, - Expression conflictDetectionFilter) { + protected void validateDataFilesExist( + TableMetadata base, + Long startingSnapshotId, + CharSequenceSet requiredDataFiles, + boolean skipDeletes, + Expression conflictDetectionFilter) { // if there is no 
current table state, no files have been removed if (base.currentSnapshot() == null) { return; } - Set matchingOperations = skipDeletes ? - VALIDATE_DATA_FILES_EXIST_SKIP_DELETE_OPERATIONS : - VALIDATE_DATA_FILES_EXIST_OPERATIONS; + Set matchingOperations = + skipDeletes + ? VALIDATE_DATA_FILES_EXIST_SKIP_DELETE_OPERATIONS + : VALIDATE_DATA_FILES_EXIST_OPERATIONS; Pair, Set> history = validationHistory(base, startingSnapshotId, matchingOperations, ManifestContent.DATA); List manifests = history.first(); Set newSnapshots = history.second(); - ManifestGroup matchingDeletesGroup = new ManifestGroup(ops.io(), manifests, ImmutableList.of()) - .filterManifestEntries(entry -> entry.status() != ManifestEntry.Status.ADDED && - newSnapshots.contains(entry.snapshotId()) && requiredDataFiles.contains(entry.file().path())) - .specsById(base.specsById()) - .ignoreExisting(); + ManifestGroup matchingDeletesGroup = + new ManifestGroup(ops.io(), manifests, ImmutableList.of()) + .filterManifestEntries( + entry -> + entry.status() != ManifestEntry.Status.ADDED + && newSnapshots.contains(entry.snapshotId()) + && requiredDataFiles.contains(entry.file().path())) + .specsById(base.specsById()) + .ignoreExisting(); if (conflictDetectionFilter != null) { matchingDeletesGroup.filterData(conflictDetectionFilter); } - try (CloseableIterator> deletes = matchingDeletesGroup.entries().iterator()) { + try (CloseableIterator> deletes = + matchingDeletesGroup.entries().iterator()) { if (deletes.hasNext()) { - throw new ValidationException("Cannot commit, missing data files: %s", - Iterators.toString(Iterators.transform(deletes, entry -> entry.file().path().toString()))); + throw new ValidationException( + "Cannot commit, missing data files: %s", + Iterators.toString( + Iterators.transform(deletes, entry -> entry.file().path().toString()))); } } catch (IOException e) { @@ -637,15 +705,18 @@ protected void validateDataFilesExist(TableMetadata base, Long startingSnapshotI } } - private Pair, Set> validationHistory(TableMetadata base, Long startingSnapshotId, - Set matchingOperations, - ManifestContent content) { + private Pair, Set> validationHistory( + TableMetadata base, + Long startingSnapshotId, + Set matchingOperations, + ManifestContent content) { List manifests = Lists.newArrayList(); Set newSnapshots = Sets.newHashSet(); Snapshot lastSnapshot = null; - Iterable snapshots = SnapshotUtil.ancestorsBetween( - base.currentSnapshot().snapshotId(), startingSnapshotId, base::snapshot); + Iterable snapshots = + SnapshotUtil.ancestorsBetween( + base.currentSnapshot().snapshotId(), startingSnapshotId, base::snapshot); for (Snapshot currentSnapshot : snapshots) { lastSnapshot = currentSnapshot; @@ -667,17 +738,22 @@ private Pair, Set> validationHistory(TableMetadata base } } - ValidationException.check(lastSnapshot == null || Objects.equals(lastSnapshot.parentId(), startingSnapshotId), + ValidationException.check( + lastSnapshot == null || Objects.equals(lastSnapshot.parentId(), startingSnapshotId), "Cannot determine history between starting snapshot %s and the last known ancestor %s", - startingSnapshotId, lastSnapshot != null ? lastSnapshot.snapshotId() : null); + startingSnapshotId, + lastSnapshot != null ? 
lastSnapshot.snapshotId() : null); return Pair.of(manifests, newSnapshots); } @Override protected Map summary() { - summaryBuilder.setPartitionSummaryLimit(ops.current().propertyAsInt( - TableProperties.WRITE_PARTITION_SUMMARY_LIMIT, TableProperties.WRITE_PARTITION_SUMMARY_LIMIT_DEFAULT)); + summaryBuilder.setPartitionSummaryLimit( + ops.current() + .propertyAsInt( + TableProperties.WRITE_PARTITION_SUMMARY_LIMIT, + TableProperties.WRITE_PARTITION_SUMMARY_LIMIT_DEFAULT)); return summaryBuilder.build(); } @@ -686,23 +762,33 @@ public List apply(TableMetadata base) { Snapshot current = base.currentSnapshot(); // filter any existing manifests - List filtered = filterManager.filterManifests( - base.schema(), current != null ? current.dataManifests(ops.io()) : null); - long minDataSequenceNumber = filtered.stream() - .map(ManifestFile::minSequenceNumber) - .filter(seq -> seq != ManifestWriter.UNASSIGNED_SEQ) // filter out unassigned in rewritten manifests - .reduce(base.lastSequenceNumber(), Math::min); + List filtered = + filterManager.filterManifests( + base.schema(), current != null ? current.dataManifests(ops.io()) : null); + long minDataSequenceNumber = + filtered.stream() + .map(ManifestFile::minSequenceNumber) + .filter( + seq -> + seq + != ManifestWriter + .UNASSIGNED_SEQ) // filter out unassigned in rewritten manifests + .reduce(base.lastSequenceNumber(), Math::min); deleteFilterManager.dropDeleteFilesOlderThan(minDataSequenceNumber); - List filteredDeletes = deleteFilterManager.filterManifests( - base.schema(), current != null ? current.deleteManifests(ops.io()) : null); + List filteredDeletes = + deleteFilterManager.filterManifests( + base.schema(), current != null ? current.deleteManifests(ops.io()) : null); // only keep manifests that have live data files or that were written by this commit - Predicate shouldKeep = manifest -> - manifest.hasAddedFiles() || manifest.hasExistingFiles() || manifest.snapshotId() == snapshotId(); - Iterable unmergedManifests = Iterables.filter( - Iterables.concat(prepareNewManifests(), filtered), shouldKeep); - Iterable unmergedDeleteManifests = Iterables.filter( - Iterables.concat(prepareDeleteManifests(), filteredDeletes), shouldKeep); + Predicate shouldKeep = + manifest -> + manifest.hasAddedFiles() + || manifest.hasExistingFiles() + || manifest.snapshotId() == snapshotId(); + Iterable unmergedManifests = + Iterables.filter(Iterables.concat(prepareNewManifests(), filtered), shouldKeep); + Iterable unmergedDeleteManifests = + Iterables.filter(Iterables.concat(prepareDeleteManifests(), filteredDeletes), shouldKeep); // update the snapshot summary summaryBuilder.clear(); @@ -725,7 +811,8 @@ public Object updateEvent() { long sequenceNumber = TableMetadata.INVALID_SEQUENCE_NUMBER; Map summary; if (justSaved == null) { - // The snapshot just saved may not be present if the latest metadata couldn't be loaded due to eventual + // The snapshot just saved may not be present if the latest metadata couldn't be loaded due to + // eventual // consistency problems in refresh. 
LOG.warn("Failed to load committed snapshot: omitting sequence number from notifications"); summary = summary(); @@ -734,12 +821,7 @@ public Object updateEvent() { summary = justSaved.summary(); } - return new CreateSnapshotEvent( - tableName, - operation(), - snapshotId, - sequenceNumber, - summary); + return new CreateSnapshotEvent(tableName, operation(), snapshotId, sequenceNumber, summary); } private void cleanUncommittedAppends(Set committed) { @@ -789,7 +871,9 @@ private Iterable prepareNewManifests() { Iterable newManifests; if (newFiles.size() > 0) { ManifestFile newManifest = newFilesAsManifest(); - newManifests = Iterables.concat(ImmutableList.of(newManifest), appendManifests, rewrittenAppendManifests); + newManifests = + Iterables.concat( + ImmutableList.of(newManifest), appendManifests, rewrittenAppendManifests); } else { newManifests = Iterables.concat(appendManifests, rewrittenAppendManifests); } @@ -847,20 +931,21 @@ private List newDeleteFilesAsManifests() { } if (cachedNewDeleteManifests.isEmpty()) { - newDeleteFilesBySpec.forEach((specId, deleteFiles) -> { - PartitionSpec spec = ops.current().spec(specId); - try { - ManifestWriter writer = newDeleteManifestWriter(spec); - try { - writer.addAll(deleteFiles); - } finally { - writer.close(); - } - cachedNewDeleteManifests.add(writer.toManifestFile()); - } catch (IOException e) { - throw new RuntimeIOException(e, "Failed to close manifest writer"); - } - }); + newDeleteFilesBySpec.forEach( + (specId, deleteFiles) -> { + PartitionSpec spec = ops.current().spec(specId); + try { + ManifestWriter writer = newDeleteManifestWriter(spec); + try { + writer.addAll(deleteFiles); + } finally { + writer.close(); + } + cachedNewDeleteManifests.add(writer.toManifestFile()); + } catch (IOException e) { + throw new RuntimeIOException(e, "Failed to close manifest writer"); + } + }); this.hasNewDeleteFiles = false; } @@ -891,7 +976,8 @@ protected ManifestReader newManifestReader(ManifestFile manifest) { private class DataFileMergeManager extends ManifestMergeManager { DataFileMergeManager(long targetSizeBytes, int minCountToMerge, boolean mergeEnabled) { - super(targetSizeBytes, minCountToMerge, mergeEnabled, MergingSnapshotProducer.this::workerPool); + super( + targetSizeBytes, minCountToMerge, mergeEnabled, MergingSnapshotProducer.this::workerPool); } @Override @@ -943,7 +1029,8 @@ protected ManifestReader newManifestReader(ManifestFile manifest) { private class DeleteFileMergeManager extends ManifestMergeManager { DeleteFileMergeManager(long targetSizeBytes, int minCountToMerge, boolean mergeEnabled) { - super(targetSizeBytes, minCountToMerge, mergeEnabled, MergingSnapshotProducer.this::workerPool); + super( + targetSizeBytes, minCountToMerge, mergeEnabled, MergingSnapshotProducer.this::workerPool); } @Override diff --git a/core/src/main/java/org/apache/iceberg/MetadataColumns.java b/core/src/main/java/org/apache/iceberg/MetadataColumns.java index af7b655b2bfe..dc6de143b0e4 100644 --- a/core/src/main/java/org/apache/iceberg/MetadataColumns.java +++ b/core/src/main/java/org/apache/iceberg/MetadataColumns.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg; import java.util.Map; @@ -28,46 +27,69 @@ public class MetadataColumns { - private MetadataColumns() { - } + private MetadataColumns() {} // IDs Integer.MAX_VALUE - (1-100) are used for metadata columns - public static final NestedField FILE_PATH = NestedField.required( - Integer.MAX_VALUE - 1, "_file", Types.StringType.get(), "Path of the file in which a row is stored"); - public static final NestedField ROW_POSITION = NestedField.required( - Integer.MAX_VALUE - 2, "_pos", Types.LongType.get(), "Ordinal position of a row in the source data file"); - public static final NestedField IS_DELETED = NestedField.required( - Integer.MAX_VALUE - 3, "_deleted", Types.BooleanType.get(), "Whether the row has been deleted"); - public static final NestedField SPEC_ID = NestedField.required( - Integer.MAX_VALUE - 4, "_spec_id", Types.IntegerType.get(), "Spec ID used to track the file containing a row"); + public static final NestedField FILE_PATH = + NestedField.required( + Integer.MAX_VALUE - 1, + "_file", + Types.StringType.get(), + "Path of the file in which a row is stored"); + public static final NestedField ROW_POSITION = + NestedField.required( + Integer.MAX_VALUE - 2, + "_pos", + Types.LongType.get(), + "Ordinal position of a row in the source data file"); + public static final NestedField IS_DELETED = + NestedField.required( + Integer.MAX_VALUE - 3, + "_deleted", + Types.BooleanType.get(), + "Whether the row has been deleted"); + public static final NestedField SPEC_ID = + NestedField.required( + Integer.MAX_VALUE - 4, + "_spec_id", + Types.IntegerType.get(), + "Spec ID used to track the file containing a row"); // the partition column type is not static and depends on all specs in the table public static final int PARTITION_COLUMN_ID = Integer.MAX_VALUE - 5; public static final String PARTITION_COLUMN_NAME = "_partition"; public static final String PARTITION_COLUMN_DOC = "Partition to which a row belongs to"; // IDs Integer.MAX_VALUE - (101-200) are used for reserved columns - public static final NestedField DELETE_FILE_PATH = NestedField.required( - Integer.MAX_VALUE - 101, "file_path", Types.StringType.get(), "Path of a file in which a deleted row is stored"); - public static final NestedField DELETE_FILE_POS = NestedField.required( - Integer.MAX_VALUE - 102, "pos", Types.LongType.get(), "Ordinal position of a deleted row in the data file"); + public static final NestedField DELETE_FILE_PATH = + NestedField.required( + Integer.MAX_VALUE - 101, + "file_path", + Types.StringType.get(), + "Path of a file in which a deleted row is stored"); + public static final NestedField DELETE_FILE_POS = + NestedField.required( + Integer.MAX_VALUE - 102, + "pos", + Types.LongType.get(), + "Ordinal position of a deleted row in the data file"); public static final String DELETE_FILE_ROW_FIELD_NAME = "row"; public static final int DELETE_FILE_ROW_FIELD_ID = Integer.MAX_VALUE - 103; public static final String DELETE_FILE_ROW_DOC = "Deleted row values"; - private static final Map META_COLUMNS = ImmutableMap.of( - FILE_PATH.name(), FILE_PATH, - ROW_POSITION.name(), ROW_POSITION, - IS_DELETED.name(), IS_DELETED, - SPEC_ID.name(), SPEC_ID - ); + private static final Map META_COLUMNS = + ImmutableMap.of( + FILE_PATH.name(), FILE_PATH, + ROW_POSITION.name(), ROW_POSITION, + IS_DELETED.name(), IS_DELETED, + SPEC_ID.name(), SPEC_ID); - private static final Set META_IDS = ImmutableSet.of( - FILE_PATH.fieldId(), - ROW_POSITION.fieldId(), - IS_DELETED.fieldId(), - SPEC_ID.fieldId(), - 
PARTITION_COLUMN_ID - ); + private static final Set META_IDS = + ImmutableSet.of( + FILE_PATH.fieldId(), + ROW_POSITION.fieldId(), + IS_DELETED.fieldId(), + SPEC_ID.fieldId(), + PARTITION_COLUMN_ID); public static Set metadataFieldIds() { return META_IDS; diff --git a/core/src/main/java/org/apache/iceberg/MetadataLogsTable.java b/core/src/main/java/org/apache/iceberg/MetadataLogsTable.java index 4f29894ed94d..a8d02fb3eb27 100644 --- a/core/src/main/java/org/apache/iceberg/MetadataLogsTable.java +++ b/core/src/main/java/org/apache/iceberg/MetadataLogsTable.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg; import java.util.List; @@ -27,13 +26,13 @@ public class MetadataLogsTable extends BaseMetadataTable { - private static final Schema METADATA_LOGS_SCHEMA = new Schema( - Types.NestedField.required(1, "timestamp_millis", Types.LongType.get()), - Types.NestedField.required(2, "file", Types.StringType.get()), - Types.NestedField.optional(3, "latest_snapshot_id", Types.LongType.get()), - Types.NestedField.optional(4, "latest_schema_id", Types.IntegerType.get()), - Types.NestedField.optional(5, "latest_sequence_number", Types.LongType.get()) - ); + private static final Schema METADATA_LOGS_SCHEMA = + new Schema( + Types.NestedField.required(1, "timestamp_millis", Types.LongType.get()), + Types.NestedField.required(2, "file", Types.StringType.get()), + Types.NestedField.optional(3, "latest_snapshot_id", Types.LongType.get()), + Types.NestedField.optional(4, "latest_schema_id", Types.IntegerType.get()), + Types.NestedField.optional(5, "latest_sequence_number", Types.LongType.get())); MetadataLogsTable(TableOperations ops, Table table) { this(ops, table, table.name() + ".metadata_logs"); @@ -62,28 +61,40 @@ private DataTask task(TableScan scan) { TableOperations ops = operations(); List metadataLogEntries = Lists.newArrayList(ops.current().previousFiles().listIterator()); - metadataLogEntries.add(new TableMetadata.MetadataLogEntry(ops.current().lastUpdatedMillis(), - ops.current().metadataFileLocation())); + metadataLogEntries.add( + new TableMetadata.MetadataLogEntry( + ops.current().lastUpdatedMillis(), ops.current().metadataFileLocation())); return StaticDataTask.of( ops.io().newInputFile(ops.current().metadataFileLocation()), schema(), scan.schema(), metadataLogEntries, - metadataLogEntry -> MetadataLogsTable.metadataLogToRow(metadataLogEntry, table()) - ); + metadataLogEntry -> MetadataLogsTable.metadataLogToRow(metadataLogEntry, table())); } private class MetadataLogScan extends StaticTableScan { MetadataLogScan(TableOperations ops, Table table) { - super(ops, table, METADATA_LOGS_SCHEMA, MetadataTableType.METADATA_LOGS, MetadataLogsTable.this::task); + super( + ops, + table, + METADATA_LOGS_SCHEMA, + MetadataTableType.METADATA_LOGS, + MetadataLogsTable.this::task); } MetadataLogScan(TableOperations ops, Table table, TableScanContext context) { - super(ops, table, METADATA_LOGS_SCHEMA, MetadataTableType.METADATA_LOGS, MetadataLogsTable.this::task, context); + super( + ops, + table, + METADATA_LOGS_SCHEMA, + MetadataTableType.METADATA_LOGS, + MetadataLogsTable.this::task, + context); } @Override - protected TableScan newRefinedScan(TableOperations ops, Table table, Schema schema, TableScanContext context) { + protected TableScan newRefinedScan( + TableOperations ops, Table table, Schema schema, TableScanContext context) { return new MetadataLogScan(ops, table, context); } @@ -93,7 +104,8 @@ public 
CloseableIterable planFiles() { } } - private static StaticDataTask.Row metadataLogToRow(TableMetadata.MetadataLogEntry metadataLogEntry, Table table) { + private static StaticDataTask.Row metadataLogToRow( + TableMetadata.MetadataLogEntry metadataLogEntry, Table table) { Long latestSnapshotId = null; Snapshot latestSnapshot = null; try { @@ -109,7 +121,6 @@ private static StaticDataTask.Row metadataLogToRow(TableMetadata.MetadataLogEntr // latest snapshot in this file corresponding to the log entry latestSnapshotId, latestSnapshot != null ? latestSnapshot.schemaId() : null, - latestSnapshot != null ? latestSnapshot.sequenceNumber() : null - ); + latestSnapshot != null ? latestSnapshot.sequenceNumber() : null); } } diff --git a/core/src/main/java/org/apache/iceberg/MetadataTableType.java b/core/src/main/java/org/apache/iceberg/MetadataTableType.java index ce3ee2202b84..d1ea1c7d1ba4 100644 --- a/core/src/main/java/org/apache/iceberg/MetadataTableType.java +++ b/core/src/main/java/org/apache/iceberg/MetadataTableType.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg; import java.util.Locale; diff --git a/core/src/main/java/org/apache/iceberg/MetadataTableUtils.java b/core/src/main/java/org/apache/iceberg/MetadataTableUtils.java index 7246d18a31b5..2554aee96b49 100644 --- a/core/src/main/java/org/apache/iceberg/MetadataTableUtils.java +++ b/core/src/main/java/org/apache/iceberg/MetadataTableUtils.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg; import java.util.Locale; @@ -24,8 +23,7 @@ import org.apache.iceberg.exceptions.NoSuchTableException; public class MetadataTableUtils { - private MetadataTableUtils() { - } + private MetadataTableUtils() {} public static boolean hasMetadataTableName(TableIdentifier identifier) { return MetadataTableType.from(identifier.name()) != null; @@ -36,21 +34,19 @@ public static Table createMetadataTableInstance(Table table, MetadataTableType t TableOperations ops = ((BaseTable) table).operations(); return createMetadataTableInstance(ops, table, metadataTableName(table.name(), type), type); } else { - throw new IllegalArgumentException(String.format( - "Cannot create metadata table for table %s: not a base table", table)); + throw new IllegalArgumentException( + String.format("Cannot create metadata table for table %s: not a base table", table)); } } - public static Table createMetadataTableInstance(TableOperations ops, - String baseTableName, - String metadataTableName, - MetadataTableType type) { + public static Table createMetadataTableInstance( + TableOperations ops, String baseTableName, String metadataTableName, MetadataTableType type) { Table baseTable = new BaseTable(ops, baseTableName); return createMetadataTableInstance(ops, baseTable, metadataTableName, type); } - private static Table createMetadataTableInstance(TableOperations ops, Table baseTable, String metadataTableName, - MetadataTableType type) { + private static Table createMetadataTableInstance( + TableOperations ops, Table baseTable, String metadataTableName, MetadataTableType type) { switch (type) { case ENTRIES: return new ManifestEntriesTable(ops, baseTable, metadataTableName); @@ -81,17 +77,20 @@ private static Table createMetadataTableInstance(TableOperations ops, Table base case ALL_ENTRIES: return new AllEntriesTable(ops, baseTable, metadataTableName); default: - throw new NoSuchTableException("Unknown metadata table 
type: %s for %s", type, metadataTableName); + throw new NoSuchTableException( + "Unknown metadata table type: %s for %s", type, metadataTableName); } } - public static Table createMetadataTableInstance(TableOperations ops, - String catalogName, - TableIdentifier baseTableIdentifier, - TableIdentifier metadataTableIdentifier, - MetadataTableType type) { + public static Table createMetadataTableInstance( + TableOperations ops, + String catalogName, + TableIdentifier baseTableIdentifier, + TableIdentifier metadataTableIdentifier, + MetadataTableType type) { String baseTableName = BaseMetastoreCatalog.fullTableName(catalogName, baseTableIdentifier); - String metadataTableName = BaseMetastoreCatalog.fullTableName(catalogName, metadataTableIdentifier); + String metadataTableName = + BaseMetastoreCatalog.fullTableName(catalogName, metadataTableIdentifier); return createMetadataTableInstance(ops, baseTableName, metadataTableName, type); } diff --git a/core/src/main/java/org/apache/iceberg/MetadataUpdate.java b/core/src/main/java/org/apache/iceberg/MetadataUpdate.java index 90eaaa584157..1bb467840be0 100644 --- a/core/src/main/java/org/apache/iceberg/MetadataUpdate.java +++ b/core/src/main/java/org/apache/iceberg/MetadataUpdate.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg; import java.io.Serializable; @@ -25,9 +24,7 @@ import java.util.Set; import org.apache.iceberg.relocated.com.google.common.collect.ImmutableSet; -/** - * Represents a change to table metadata. - */ +/** Represents a change to table metadata. */ public interface MetadataUpdate extends Serializable { void applyTo(TableMetadata.Builder metadataBuilder); @@ -240,8 +237,13 @@ class SetSnapshotRef implements MetadataUpdate { private Long maxSnapshotAgeMs; private Long maxRefAgeMs; - public SetSnapshotRef(String refName, Long snapshotId, SnapshotRefType type, Integer minSnapshotsToKeep, - Long maxSnapshotAgeMs, Long maxRefAgeMs) { + public SetSnapshotRef( + String refName, + Long snapshotId, + SnapshotRefType type, + Integer minSnapshotsToKeep, + Long maxSnapshotAgeMs, + Long maxRefAgeMs) { this.refName = refName; this.snapshotId = snapshotId; this.type = type; @@ -276,11 +278,12 @@ public Long maxRefAgeMs() { @Override public void applyTo(TableMetadata.Builder metadataBuilder) { - SnapshotRef ref = SnapshotRef.builderFor(snapshotId, type) - .minSnapshotsToKeep(minSnapshotsToKeep) - .maxSnapshotAgeMs(maxSnapshotAgeMs) - .maxRefAgeMs(maxRefAgeMs) - .build(); + SnapshotRef ref = + SnapshotRef.builderFor(snapshotId, type) + .minSnapshotsToKeep(minSnapshotsToKeep) + .maxSnapshotAgeMs(maxSnapshotAgeMs) + .maxRefAgeMs(maxRefAgeMs) + .build(); metadataBuilder.setRef(refName, ref); } } diff --git a/core/src/main/java/org/apache/iceberg/MetadataUpdateParser.java b/core/src/main/java/org/apache/iceberg/MetadataUpdateParser.java index 8b3cb41edb81..9b7a76c1572d 100644 --- a/core/src/main/java/org/apache/iceberg/MetadataUpdateParser.java +++ b/core/src/main/java/org/apache/iceberg/MetadataUpdateParser.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg; import com.fasterxml.jackson.core.JsonGenerator; @@ -35,8 +34,7 @@ public class MetadataUpdateParser { - private MetadataUpdateParser() { - } + private MetadataUpdateParser() {} private static final String ACTION = "action"; @@ -105,24 +103,24 @@ private MetadataUpdateParser() { // SetLocation private static final String LOCATION = "location"; - private static final Map, String> ACTIONS = ImmutableMap - ., String>builder() - .put(MetadataUpdate.AssignUUID.class, ASSIGN_UUID) - .put(MetadataUpdate.UpgradeFormatVersion.class, UPGRADE_FORMAT_VERSION) - .put(MetadataUpdate.AddSchema.class, ADD_SCHEMA) - .put(MetadataUpdate.SetCurrentSchema.class, SET_CURRENT_SCHEMA) - .put(MetadataUpdate.AddPartitionSpec.class, ADD_PARTITION_SPEC) - .put(MetadataUpdate.SetDefaultPartitionSpec.class, SET_DEFAULT_PARTITION_SPEC) - .put(MetadataUpdate.AddSortOrder.class, ADD_SORT_ORDER) - .put(MetadataUpdate.SetDefaultSortOrder.class, SET_DEFAULT_SORT_ORDER) - .put(MetadataUpdate.AddSnapshot.class, ADD_SNAPSHOT) - .put(MetadataUpdate.RemoveSnapshot.class, REMOVE_SNAPSHOTS) - .put(MetadataUpdate.RemoveSnapshotRef.class, REMOVE_SNAPSHOT_REF) - .put(MetadataUpdate.SetSnapshotRef.class, SET_SNAPSHOT_REF) - .put(MetadataUpdate.SetProperties.class, SET_PROPERTIES) - .put(MetadataUpdate.RemoveProperties.class, REMOVE_PROPERTIES) - .put(MetadataUpdate.SetLocation.class, SET_LOCATION) - .build(); + private static final Map, String> ACTIONS = + ImmutableMap., String>builder() + .put(MetadataUpdate.AssignUUID.class, ASSIGN_UUID) + .put(MetadataUpdate.UpgradeFormatVersion.class, UPGRADE_FORMAT_VERSION) + .put(MetadataUpdate.AddSchema.class, ADD_SCHEMA) + .put(MetadataUpdate.SetCurrentSchema.class, SET_CURRENT_SCHEMA) + .put(MetadataUpdate.AddPartitionSpec.class, ADD_PARTITION_SPEC) + .put(MetadataUpdate.SetDefaultPartitionSpec.class, SET_DEFAULT_PARTITION_SPEC) + .put(MetadataUpdate.AddSortOrder.class, ADD_SORT_ORDER) + .put(MetadataUpdate.SetDefaultSortOrder.class, SET_DEFAULT_SORT_ORDER) + .put(MetadataUpdate.AddSnapshot.class, ADD_SNAPSHOT) + .put(MetadataUpdate.RemoveSnapshot.class, REMOVE_SNAPSHOTS) + .put(MetadataUpdate.RemoveSnapshotRef.class, REMOVE_SNAPSHOT_REF) + .put(MetadataUpdate.SetSnapshotRef.class, SET_SNAPSHOT_REF) + .put(MetadataUpdate.SetProperties.class, SET_PROPERTIES) + .put(MetadataUpdate.RemoveProperties.class, REMOVE_PROPERTIES) + .put(MetadataUpdate.SetLocation.class, SET_LOCATION) + .build(); public static String toJson(MetadataUpdate metadataUpdate) { return toJson(metadataUpdate, false); @@ -139,15 +137,19 @@ public static String toJson(MetadataUpdate metadataUpdate, boolean pretty) { generator.flush(); return writer.toString(); } catch (IOException e) { - throw new UncheckedIOException(String.format("Failed to write metadata update json for: %s", metadataUpdate), e); + throw new UncheckedIOException( + String.format("Failed to write metadata update json for: %s", metadataUpdate), e); } } - public static void toJson(MetadataUpdate metadataUpdate, JsonGenerator generator) throws IOException { + public static void toJson(MetadataUpdate metadataUpdate, JsonGenerator generator) + throws IOException { String updateAction = ACTIONS.get(metadataUpdate.getClass()); - // Provide better exception message than the NPE thrown by writing null for the update action, which is required - Preconditions.checkArgument(updateAction != null, + // Provide better exception message than the NPE thrown by writing null for the update action, + // which is required + 
Preconditions.checkArgument( + updateAction != null, "Cannot convert metadata update to json. Unrecognized metadata update type: {}", metadataUpdate.getClass().getName()); @@ -171,7 +173,8 @@ public static void toJson(MetadataUpdate metadataUpdate, JsonGenerator generator writeAddPartitionSpec((MetadataUpdate.AddPartitionSpec) metadataUpdate, generator); break; case SET_DEFAULT_PARTITION_SPEC: - writeSetDefaultPartitionSpec((MetadataUpdate.SetDefaultPartitionSpec) metadataUpdate, generator); + writeSetDefaultPartitionSpec( + (MetadataUpdate.SetDefaultPartitionSpec) metadataUpdate, generator); break; case ADD_SORT_ORDER: writeAddSortOrder((MetadataUpdate.AddSortOrder) metadataUpdate, generator); @@ -202,7 +205,8 @@ public static void toJson(MetadataUpdate metadataUpdate, JsonGenerator generator break; default: throw new IllegalArgumentException( - String.format("Cannot convert metadata update to json. Unrecognized action: %s", updateAction)); + String.format( + "Cannot convert metadata update to json. Unrecognized action: %s", updateAction)); } generator.writeEndObject(); @@ -223,9 +227,12 @@ public static MetadataUpdate fromJson(String json) { } public static MetadataUpdate fromJson(JsonNode jsonNode) { - Preconditions.checkArgument(jsonNode != null && jsonNode.isObject(), - "Cannot parse metadata update from non-object value: %s", jsonNode); - Preconditions.checkArgument(jsonNode.hasNonNull(ACTION), "Cannot parse metadata update. Missing field: action"); + Preconditions.checkArgument( + jsonNode != null && jsonNode.isObject(), + "Cannot parse metadata update from non-object value: %s", + jsonNode); + Preconditions.checkArgument( + jsonNode.hasNonNull(ACTION), "Cannot parse metadata update. Missing field: action"); String action = JsonUtil.getString(ACTION, jsonNode).toLowerCase(Locale.ROOT); switch (action) { @@ -265,34 +272,36 @@ public static MetadataUpdate fromJson(JsonNode jsonNode) { } } - private static void writeAssignUUID(MetadataUpdate.AssignUUID update, JsonGenerator gen) throws IOException { + private static void writeAssignUUID(MetadataUpdate.AssignUUID update, JsonGenerator gen) + throws IOException { gen.writeStringField(UUID, update.uuid()); } - private static void writeUpgradeFormatVersion(MetadataUpdate.UpgradeFormatVersion update, JsonGenerator gen) - throws IOException { + private static void writeUpgradeFormatVersion( + MetadataUpdate.UpgradeFormatVersion update, JsonGenerator gen) throws IOException { gen.writeNumberField(FORMAT_VERSION, update.formatVersion()); } - private static void writeAddSchema(MetadataUpdate.AddSchema update, JsonGenerator gen) throws IOException { + private static void writeAddSchema(MetadataUpdate.AddSchema update, JsonGenerator gen) + throws IOException { gen.writeFieldName(SCHEMA); SchemaParser.toJson(update.schema(), gen); gen.writeNumberField(LAST_COLUMN_ID, update.lastColumnId()); } - private static void writeSetCurrentSchema(MetadataUpdate.SetCurrentSchema update, JsonGenerator gen) - throws IOException { + private static void writeSetCurrentSchema( + MetadataUpdate.SetCurrentSchema update, JsonGenerator gen) throws IOException { gen.writeNumberField(SCHEMA_ID, update.schemaId()); } - private static void writeAddPartitionSpec(MetadataUpdate.AddPartitionSpec update, JsonGenerator gen) - throws IOException { + private static void writeAddPartitionSpec( + MetadataUpdate.AddPartitionSpec update, JsonGenerator gen) throws IOException { gen.writeFieldName(SPEC); PartitionSpecParser.toJson(update.spec(), gen); } - private static void 
writeSetDefaultPartitionSpec(MetadataUpdate.SetDefaultPartitionSpec update, JsonGenerator gen) - throws IOException { + private static void writeSetDefaultPartitionSpec( + MetadataUpdate.SetDefaultPartitionSpec update, JsonGenerator gen) throws IOException { gen.writeNumberField(SPEC_ID, update.specId()); } @@ -302,18 +311,21 @@ private static void writeAddSortOrder(MetadataUpdate.AddSortOrder update, JsonGe SortOrderParser.toJson(update.sortOrder(), gen); } - private static void writeSetDefaultSortOrder(MetadataUpdate.SetDefaultSortOrder update, JsonGenerator gen) - throws IOException { + private static void writeSetDefaultSortOrder( + MetadataUpdate.SetDefaultSortOrder update, JsonGenerator gen) throws IOException { gen.writeNumberField(SORT_ORDER_ID, update.sortOrderId()); } - private static void writeAddSnapshot(MetadataUpdate.AddSnapshot update, JsonGenerator gen) throws IOException { + private static void writeAddSnapshot(MetadataUpdate.AddSnapshot update, JsonGenerator gen) + throws IOException { gen.writeFieldName(SNAPSHOT); SnapshotParser.toJson(update.snapshot(), gen); } - // TODO - Reconcile the spec's set-based removal with the current class implementation that only handles one value. - private static void writeRemoveSnapshots(MetadataUpdate.RemoveSnapshot update, JsonGenerator gen) throws IOException { + // TODO - Reconcile the spec's set-based removal with the current class implementation that only + // handles one value. + private static void writeRemoveSnapshots(MetadataUpdate.RemoveSnapshot update, JsonGenerator gen) + throws IOException { gen.writeArrayFieldStart(SNAPSHOT_IDS); for (long snapshotId : ImmutableSet.of(update.snapshotId())) { gen.writeNumber(snapshotId); @@ -321,33 +333,41 @@ private static void writeRemoveSnapshots(MetadataUpdate.RemoveSnapshot update, J gen.writeEndArray(); } - private static void writeSetSnapshotRef(MetadataUpdate.SetSnapshotRef update, JsonGenerator gen) throws IOException { + private static void writeSetSnapshotRef(MetadataUpdate.SetSnapshotRef update, JsonGenerator gen) + throws IOException { gen.writeStringField(REF_NAME, update.name()); gen.writeNumberField(SNAPSHOT_ID, update.snapshotId()); gen.writeStringField(TYPE, update.type()); JsonUtil.writeIntegerFieldIf( - update.minSnapshotsToKeep() != null, MIN_SNAPSHOTS_TO_KEEP, update.minSnapshotsToKeep(), gen); - JsonUtil.writeLongFieldIf(update.maxSnapshotAgeMs() != null, MAX_SNAPSHOT_AGE_MS, update.maxSnapshotAgeMs(), gen); - JsonUtil.writeLongFieldIf(update.maxRefAgeMs() != null, MAX_REF_AGE_MS, update.maxRefAgeMs(), gen); - } - - private static void writeRemoveSnapshotRef(MetadataUpdate.RemoveSnapshotRef update, JsonGenerator gen) - throws IOException { + update.minSnapshotsToKeep() != null, + MIN_SNAPSHOTS_TO_KEEP, + update.minSnapshotsToKeep(), + gen); + JsonUtil.writeLongFieldIf( + update.maxSnapshotAgeMs() != null, MAX_SNAPSHOT_AGE_MS, update.maxSnapshotAgeMs(), gen); + JsonUtil.writeLongFieldIf( + update.maxRefAgeMs() != null, MAX_REF_AGE_MS, update.maxRefAgeMs(), gen); + } + + private static void writeRemoveSnapshotRef( + MetadataUpdate.RemoveSnapshotRef update, JsonGenerator gen) throws IOException { gen.writeStringField(REF_NAME, update.name()); } - private static void writeSetProperties(MetadataUpdate.SetProperties update, JsonGenerator gen) throws IOException { + private static void writeSetProperties(MetadataUpdate.SetProperties update, JsonGenerator gen) + throws IOException { gen.writeFieldName(UPDATED); gen.writeObject(update.updated()); } - private static void 
writeRemoveProperties(MetadataUpdate.RemoveProperties update, JsonGenerator gen) - throws IOException { + private static void writeRemoveProperties( + MetadataUpdate.RemoveProperties update, JsonGenerator gen) throws IOException { gen.writeFieldName(REMOVED); gen.writeObject(update.removed()); } - private static void writeSetLocation(MetadataUpdate.SetLocation update, JsonGenerator gen) throws IOException { + private static void writeSetLocation(MetadataUpdate.SetLocation update, JsonGenerator gen) + throws IOException { gen.writeStringField(LOCATION, update.location()); } @@ -362,8 +382,7 @@ private static MetadataUpdate readUpgradeFormatVersion(JsonNode node) { } private static MetadataUpdate readAddSchema(JsonNode node) { - Preconditions.checkArgument(node.hasNonNull(SCHEMA), - "Cannot parse missing field: schema"); + Preconditions.checkArgument(node.hasNonNull(SCHEMA), "Cannot parse missing field: schema"); JsonNode schemaNode = node.get(SCHEMA); Schema schema = SchemaParser.fromJson(schemaNode); int lastColumnId = JsonUtil.getInt(LAST_COLUMN_ID, node); @@ -388,7 +407,8 @@ private static MetadataUpdate readSetDefaultPartitionSpec(JsonNode node) { } private static MetadataUpdate readAddSortOrder(JsonNode node) { - Preconditions.checkArgument(node.hasNonNull(SORT_ORDER), "Cannot parse missing field: sort-order"); + Preconditions.checkArgument( + node.hasNonNull(SORT_ORDER), "Cannot parse missing field: sort-order"); JsonNode sortOrderNode = node.get(SORT_ORDER); UnboundSortOrder sortOrder = SortOrderParser.fromJson(sortOrderNode); return new MetadataUpdate.AddSortOrder(sortOrder); @@ -407,8 +427,10 @@ private static MetadataUpdate readAddSnapshot(JsonNode node) { private static MetadataUpdate readRemoveSnapshots(JsonNode node) { Set snapshotIds = JsonUtil.getLongSetOrNull(SNAPSHOT_IDS, node); - Preconditions.checkArgument(snapshotIds != null && snapshotIds.size() == 1, - "Invalid set of snapshot ids to remove. Expected one value but received: %s", snapshotIds); + Preconditions.checkArgument( + snapshotIds != null && snapshotIds.size() == 1, + "Invalid set of snapshot ids to remove. Expected one value but received: %s", + snapshotIds); Long snapshotId = Iterables.getOnlyElement(snapshotIds); return new MetadataUpdate.RemoveSnapshot(snapshotId); } @@ -416,7 +438,8 @@ private static MetadataUpdate readRemoveSnapshots(JsonNode node) { private static MetadataUpdate readSetSnapshotRef(JsonNode node) { String refName = JsonUtil.getString(REF_NAME, node); long snapshotId = JsonUtil.getLong(SNAPSHOT_ID, node); - SnapshotRefType type = SnapshotRefType.valueOf(JsonUtil.getString(TYPE, node).toUpperCase(Locale.ENGLISH)); + SnapshotRefType type = + SnapshotRefType.valueOf(JsonUtil.getString(TYPE, node).toUpperCase(Locale.ENGLISH)); Integer minSnapshotsToKeep = JsonUtil.getIntOrNull(MIN_SNAPSHOTS_TO_KEEP, node); Long maxSnapshotAgeMs = JsonUtil.getLongOrNull(MAX_SNAPSHOT_AGE_MS, node); Long maxRefAgeMs = JsonUtil.getLongOrNull(MAX_REF_AGE_MS, node); @@ -444,4 +467,3 @@ private static MetadataUpdate readSetLocation(JsonNode node) { return new MetadataUpdate.SetLocation(location); } } - diff --git a/core/src/main/java/org/apache/iceberg/MetricsConfig.java b/core/src/main/java/org/apache/iceberg/MetricsConfig.java index 3e7a55d0117b..502903a9965b 100644 --- a/core/src/main/java/org/apache/iceberg/MetricsConfig.java +++ b/core/src/main/java/org/apache/iceberg/MetricsConfig.java @@ -16,9 +16,14 @@ * specific language governing permissions and limitations * under the License. 
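// A minimal round-trip sketch for the parser above (the table location is hypothetical;
// toJson/fromJson and the "action" discriminator are the APIs shown in this file).
MetadataUpdate setLocation =
    new MetadataUpdate.SetLocation("s3://bucket/warehouse/db/tbl");
String json = MetadataUpdateParser.toJson(setLocation);
// json carries an "action" field plus the "location" field written by writeSetLocation
MetadataUpdate roundTripped = MetadataUpdateParser.fromJson(json);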
*/ - package org.apache.iceberg; +import static org.apache.iceberg.TableProperties.DEFAULT_WRITE_METRICS_MODE; +import static org.apache.iceberg.TableProperties.DEFAULT_WRITE_METRICS_MODE_DEFAULT; +import static org.apache.iceberg.TableProperties.METRICS_MAX_INFERRED_COLUMN_DEFAULTS; +import static org.apache.iceberg.TableProperties.METRICS_MAX_INFERRED_COLUMN_DEFAULTS_DEFAULT; +import static org.apache.iceberg.TableProperties.METRICS_MODE_COLUMN_CONF_PREFIX; + import java.io.Serializable; import java.util.List; import java.util.Map; @@ -36,12 +41,6 @@ import org.slf4j.Logger; import org.slf4j.LoggerFactory; -import static org.apache.iceberg.TableProperties.DEFAULT_WRITE_METRICS_MODE; -import static org.apache.iceberg.TableProperties.DEFAULT_WRITE_METRICS_MODE_DEFAULT; -import static org.apache.iceberg.TableProperties.METRICS_MAX_INFERRED_COLUMN_DEFAULTS; -import static org.apache.iceberg.TableProperties.METRICS_MAX_INFERRED_COLUMN_DEFAULTS_DEFAULT; -import static org.apache.iceberg.TableProperties.METRICS_MODE_COLUMN_CONF_PREFIX; - @Immutable public final class MetricsConfig implements Serializable { @@ -49,7 +48,8 @@ public final class MetricsConfig implements Serializable { private static final Joiner DOT = Joiner.on('.'); // Disable metrics by default for wide tables to prevent excessive metadata - private static final MetricsMode DEFAULT_MODE = MetricsModes.fromString(DEFAULT_WRITE_METRICS_MODE_DEFAULT); + private static final MetricsMode DEFAULT_MODE = + MetricsModes.fromString(DEFAULT_WRITE_METRICS_MODE_DEFAULT); private static final MetricsConfig DEFAULT = new MetricsConfig(ImmutableMap.of(), DEFAULT_MODE); private final Map columnModes; @@ -64,39 +64,44 @@ public static MetricsConfig getDefault() { return DEFAULT; } - static Map updateProperties(Map props, List deletedColumns, - Map renamedColumns) { + static Map updateProperties( + Map props, List deletedColumns, Map renamedColumns) { if (props.keySet().stream().noneMatch(key -> key.startsWith(METRICS_MODE_COLUMN_CONF_PREFIX))) { return props; } else { Map updatedProperties = Maps.newHashMap(); // Put all of the non metrics columns we aren't modifying - props.keySet().forEach(key -> { - if (key.startsWith(METRICS_MODE_COLUMN_CONF_PREFIX)) { - String columnAlias = key.replaceFirst(METRICS_MODE_COLUMN_CONF_PREFIX, ""); - if (renamedColumns.get(columnAlias) != null) { - // The name has changed. - String newKey = METRICS_MODE_COLUMN_CONF_PREFIX + renamedColumns.get(columnAlias); - updatedProperties.put(newKey, props.get(key)); - } else if (!deletedColumns.contains(columnAlias)) { - // Copy over the original - updatedProperties.put(key, props.get(key)); - } - // Implicit drop if deleted - } else { - // Not a metric property - updatedProperties.put(key, props.get(key)); - } - }); + props + .keySet() + .forEach( + key -> { + if (key.startsWith(METRICS_MODE_COLUMN_CONF_PREFIX)) { + String columnAlias = key.replaceFirst(METRICS_MODE_COLUMN_CONF_PREFIX, ""); + if (renamedColumns.get(columnAlias) != null) { + // The name has changed. + String newKey = + METRICS_MODE_COLUMN_CONF_PREFIX + renamedColumns.get(columnAlias); + updatedProperties.put(newKey, props.get(key)); + } else if (!deletedColumns.contains(columnAlias)) { + // Copy over the original + updatedProperties.put(key, props.get(key)); + } + // Implicit drop if deleted + } else { + // Not a metric property + updatedProperties.put(key, props.get(key)); + } + }); return updatedProperties; } } /** * Creates a metrics config from table configuration. 
+ * * @param props table configuration * @deprecated use {@link MetricsConfig#forTable(Table)} - **/ + */ @Deprecated public static MetricsConfig fromProperties(Map props) { return from(props, null, null); @@ -104,6 +109,7 @@ public static MetricsConfig fromProperties(Map props) { /** * Creates a metrics config from a table. + * * @param table iceberg table */ public static MetricsConfig forTable(Table table) { @@ -124,10 +130,12 @@ public static MetricsConfig forPositionDelete(Table table) { MetricsConfig tableConfig = forTable(table); MetricsMode defaultMode = tableConfig.defaultMode; - tableConfig.columnModes.forEach((columnAlias, mode) -> { - String positionDeleteColumnAlias = DOT.join(MetadataColumns.DELETE_FILE_ROW_FIELD_NAME, columnAlias); - columnModes.put(positionDeleteColumnAlias, mode); - }); + tableConfig.columnModes.forEach( + (columnAlias, mode) -> { + String positionDeleteColumnAlias = + DOT.join(MetadataColumns.DELETE_FILE_ROW_FIELD_NAME, columnAlias); + columnModes.put(positionDeleteColumnAlias, mode); + }); return new MetricsConfig(columnModes.build(), defaultMode); } @@ -136,7 +144,7 @@ public static MetricsConfig forPositionDelete(Table table) { * Generate a MetricsConfig for all columns based on overrides, schema, and sort order. * * @param props will be read for metrics overrides (write.metadata.metrics.column.*) and default - * (write.metadata.metrics.default) + * (write.metadata.metrics.default) * @param schema table schema * @param order sort order columns, will be promoted to truncate(16) * @return metrics configuration @@ -186,6 +194,7 @@ private static MetricsConfig from(Map props, Schema schema, Sort /** * Auto promote sorted columns to truncate(16) if default is set at Counts or None. + * * @param defaultMode default mode * @return mode to use */ @@ -198,11 +207,16 @@ private static MetricsMode sortedColumnDefaultMode(MetricsMode defaultMode) { } private static int maxInferredColumnDefaults(Map properties) { - int maxInferredDefaultColumns = PropertyUtil.propertyAsInt(properties, - METRICS_MAX_INFERRED_COLUMN_DEFAULTS, METRICS_MAX_INFERRED_COLUMN_DEFAULTS_DEFAULT); + int maxInferredDefaultColumns = + PropertyUtil.propertyAsInt( + properties, + METRICS_MAX_INFERRED_COLUMN_DEFAULTS, + METRICS_MAX_INFERRED_COLUMN_DEFAULTS_DEFAULT); if (maxInferredDefaultColumns < 0) { - LOG.warn("Invalid value for {} (negative): {}, falling back to {}", - METRICS_MAX_INFERRED_COLUMN_DEFAULTS, maxInferredDefaultColumns, + LOG.warn( + "Invalid value for {} (negative): {}, falling back to {}", + METRICS_MAX_INFERRED_COLUMN_DEFAULTS, + maxInferredDefaultColumns, METRICS_MAX_INFERRED_COLUMN_DEFAULTS_DEFAULT); return METRICS_MAX_INFERRED_COLUMN_DEFAULTS_DEFAULT; } else { @@ -225,7 +239,9 @@ public void validateReferencedColumns(Schema schema) { ValidationException.check( schema.findField(column) != null, "Invalid metrics config, could not find column %s from table prop %s in schema %s", - column, METRICS_MODE_COLUMN_CONF_PREFIX + column, schema); + column, + METRICS_MODE_COLUMN_CONF_PREFIX + column, + schema); } } diff --git a/core/src/main/java/org/apache/iceberg/MetricsModes.java b/core/src/main/java/org/apache/iceberg/MetricsModes.java index e9e2844dffd2..fd6b9c580ee2 100644 --- a/core/src/main/java/org/apache/iceberg/MetricsModes.java +++ b/core/src/main/java/org/apache/iceberg/MetricsModes.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
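// A minimal sketch of setting and reading the metrics properties referenced above. The table
// and column names are hypothetical; the keys are the write.metadata.metrics.default and
// write.metadata.metrics.column.* properties named in the javadoc of from().
table.updateProperties()
    .set("write.metadata.metrics.default", "counts")
    .set("write.metadata.metrics.column.event_ts", "truncate(16)")
    .set("write.metadata.metrics.column.payload", "none")
    .commit();

MetricsConfig config = MetricsConfig.forTable(table);
MetricsModes.MetricsMode tsMode = config.columnMode("event_ts");       // Truncate(16)
MetricsModes.MetricsMode payloadMode = config.columnMode("payload");   // None
MetricsModes.MetricsMode otherMode = config.columnMode("other_col");   // falls back to the default (counts)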
*/ - package org.apache.iceberg; import java.io.ObjectStreamException; @@ -28,14 +27,14 @@ /** * This class defines different metrics modes, which allow users to control the collection of - * value_counts, null_value_counts, nan_value_counts, lower_bounds, upper_bounds for different columns in metadata. + * value_counts, null_value_counts, nan_value_counts, lower_bounds, upper_bounds for different + * columns in metadata. */ public class MetricsModes { private static final Pattern TRUNCATE = Pattern.compile("truncate\\((\\d+)\\)"); - private MetricsModes() { - } + private MetricsModes() {} public static MetricsMode fromString(String mode) { if ("none".equalsIgnoreCase(mode)) { @@ -57,14 +56,14 @@ public static MetricsMode fromString(String mode) { /** * A metrics calculation mode. - *
<p>
- * Implementations must be immutable. + * + *
<p>
Implementations must be immutable. */ - public interface MetricsMode extends Serializable { - } + public interface MetricsMode extends Serializable {} /** - * Under this mode, value_counts, null_value_counts, nan_value_counts, lower_bounds, upper_bounds are not persisted. + * Under this mode, value_counts, null_value_counts, nan_value_counts, lower_bounds, upper_bounds + * are not persisted. */ public static class None extends ProxySerializableMetricsMode { private static final None INSTANCE = new None(); @@ -79,9 +78,7 @@ public String toString() { } } - /** - * Under this mode, only value_counts, null_value_counts, nan_value_counts are persisted. - */ + /** Under this mode, only value_counts, null_value_counts, nan_value_counts are persisted. */ public static class Counts extends ProxySerializableMetricsMode { private static final Counts INSTANCE = new Counts(); @@ -96,8 +93,8 @@ public String toString() { } /** - * Under this mode, value_counts, null_value_counts, nan_value_counts - * and truncated lower_bounds, upper_bounds are persisted. + * Under this mode, value_counts, null_value_counts, nan_value_counts and truncated lower_bounds, + * upper_bounds are persisted. */ public static class Truncate extends ProxySerializableMetricsMode { private final int length; @@ -138,8 +135,8 @@ public int hashCode() { } /** - * Under this mode, value_counts, null_value_counts, nan_value_counts - * and full lower_bounds, upper_bounds are persisted. + * Under this mode, value_counts, null_value_counts, nan_value_counts and full lower_bounds, + * upper_bounds are persisted. */ public static class Full extends ProxySerializableMetricsMode { private static final Full INSTANCE = new Full(); @@ -154,7 +151,8 @@ public String toString() { } } - // we cannot serialize/deserialize MetricsMode directly as it breaks reference equality used in metrics utils + // we cannot serialize/deserialize MetricsMode directly as it breaks reference equality used in + // metrics utils private abstract static class ProxySerializableMetricsMode implements MetricsMode { Object writeReplace() throws ObjectStreamException { return new MetricsModeProxy(toString()); diff --git a/core/src/main/java/org/apache/iceberg/MetricsUtil.java b/core/src/main/java/org/apache/iceberg/MetricsUtil.java index 8cddcab0902a..98710ef79ee8 100644 --- a/core/src/main/java/org/apache/iceberg/MetricsUtil.java +++ b/core/src/main/java/org/apache/iceberg/MetricsUtil.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg; import java.util.Map; @@ -27,11 +26,11 @@ public class MetricsUtil { - private MetricsUtil() { - } + private MetricsUtil() {} /** - * Construct mapping relationship between column id to NaN value counts from input metrics and metrics config. + * Construct mapping relationship between column id to NaN value counts from input metrics and + * metrics config. */ public static Map createNanValueCounts( Stream> fieldMetrics, MetricsConfig metricsConfig, Schema inputSchema) { @@ -42,19 +41,19 @@ public static Map createNanValueCounts( } return fieldMetrics - .filter(metrics -> metricsMode(inputSchema, metricsConfig, metrics.id()) != MetricsModes.None.get()) + .filter( + metrics -> + metricsMode(inputSchema, metricsConfig, metrics.id()) != MetricsModes.None.get()) .collect(Collectors.toMap(FieldMetrics::id, FieldMetrics::nanValueCount)); } - /** - * Extract MetricsMode for the given field id from metrics config. 
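// A minimal sketch of the mode strings parsed by MetricsModes.fromString above; "truncate(16)"
// is matched by the TRUNCATE pattern, the others map to the singleton mode classes in this file.
MetricsModes.MetricsMode none = MetricsModes.fromString("none");
MetricsModes.MetricsMode counts = MetricsModes.fromString("counts");
MetricsModes.MetricsMode truncated = MetricsModes.fromString("truncate(16)"); // Truncate with length 16
MetricsModes.MetricsMode full = MetricsModes.fromString("full");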
- */ - public static MetricsModes.MetricsMode metricsMode(Schema inputSchema, MetricsConfig metricsConfig, int fieldId) { + /** Extract MetricsMode for the given field id from metrics config. */ + public static MetricsModes.MetricsMode metricsMode( + Schema inputSchema, MetricsConfig metricsConfig, int fieldId) { Preconditions.checkNotNull(inputSchema, "inputSchema is required"); Preconditions.checkNotNull(metricsConfig, "metricsConfig is required"); String columnName = inputSchema.findColumnName(fieldId); return metricsConfig.columnMode(columnName); } - } diff --git a/core/src/main/java/org/apache/iceberg/MicroBatches.java b/core/src/main/java/org/apache/iceberg/MicroBatches.java index 56c7f4a371b1..e066e1b31493 100644 --- a/core/src/main/java/org/apache/iceberg/MicroBatches.java +++ b/core/src/main/java/org/apache/iceberg/MicroBatches.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg; import java.io.IOException; @@ -35,8 +34,7 @@ import org.slf4j.LoggerFactory; public class MicroBatches { - private MicroBatches() { - } + private MicroBatches() {} public static class MicroBatch { private final long snapshotId; @@ -46,8 +44,13 @@ public static class MicroBatch { private final List tasks; private final boolean lastIndexOfSnapshot; - private MicroBatch(long snapshotId, long startFileIndex, long endFileIndex, long sizeInBytes, - List tasks, boolean lastIndexOfSnapshot) { + private MicroBatch( + long snapshotId, + long startFileIndex, + long endFileIndex, + long sizeInBytes, + List tasks, + boolean lastIndexOfSnapshot) { this.snapshotId = snapshotId; this.startFileIndex = startFileIndex; this.endFileIndex = endFileIndex; @@ -110,27 +113,36 @@ public MicroBatchBuilder specsById(Map specs) { } public MicroBatch generate(long startFileIndex, long targetSizeInBytes, boolean scanAllFiles) { - Preconditions.checkArgument(startFileIndex >= 0, "startFileIndex is unexpectedly smaller than 0"); - Preconditions.checkArgument(targetSizeInBytes > 0, "targetSizeInBytes should be larger than 0"); - - List manifests = scanAllFiles ? snapshot.dataManifests(io) : - snapshot.dataManifests(io).stream().filter(m -> m.snapshotId().equals(snapshot.snapshotId())) - .collect(Collectors.toList()); + Preconditions.checkArgument( + startFileIndex >= 0, "startFileIndex is unexpectedly smaller than 0"); + Preconditions.checkArgument( + targetSizeInBytes > 0, "targetSizeInBytes should be larger than 0"); + + List manifests = + scanAllFiles + ? snapshot.dataManifests(io) + : snapshot.dataManifests(io).stream() + .filter(m -> m.snapshotId().equals(snapshot.snapshotId())) + .collect(Collectors.toList()); List> manifestIndexes = indexManifests(manifests); - List> skippedManifestIndexes = skipManifests(manifestIndexes, startFileIndex); + List> skippedManifestIndexes = + skipManifests(manifestIndexes, startFileIndex); - return generateMicroBatch(skippedManifestIndexes, startFileIndex, targetSizeInBytes, scanAllFiles); + return generateMicroBatch( + skippedManifestIndexes, startFileIndex, targetSizeInBytes, scanAllFiles); } /** - * Method to index the data files for each manifest. For example, if manifest m1 has 3 data files, manifest - * m2 has 2 data files, manifest m3 has 1 data file, then the index will be (m1, 0), (m2, 3), (m3, 5). + * Method to index the data files for each manifest. 
For example, if manifest m1 has 3 data + * files, manifest m2 has 2 data files, manifest m3 has 1 data file, then the index will be (m1, + * 0), (m2, 3), (m3, 5). * * @param manifestFiles List of input manifests used to index. * @return a list of manifest index with key as manifest file, value as file counts. */ - private static List> indexManifests(List manifestFiles) { + private static List> indexManifests( + List manifestFiles) { int currentFileIndex = 0; List> manifestIndexes = Lists.newArrayList(); @@ -143,17 +155,17 @@ private static List> indexManifests(List> skipManifests(List> indexedManifests, - long startFileIndex) { + private static List> skipManifests( + List> indexedManifests, long startFileIndex) { if (startFileIndex == 0) { return indexedManifests; } @@ -171,20 +183,30 @@ private static List> skipManifests(List> indexedManifests, - long startFileIndex, long targetSizeInBytes, boolean scanAllFiles) { + private MicroBatch generateMicroBatch( + List> indexedManifests, + long startFileIndex, + long targetSizeInBytes, + boolean scanAllFiles) { if (indexedManifests.isEmpty()) { - return new MicroBatch(snapshot.snapshotId(), startFileIndex, startFileIndex + 1, 0L, - Collections.emptyList(), true); + return new MicroBatch( + snapshot.snapshotId(), + startFileIndex, + startFileIndex + 1, + 0L, + Collections.emptyList(), + true); } long currentSizeInBytes = 0L; @@ -195,12 +217,14 @@ private MicroBatch generateMicroBatch(List> indexedM for (int idx = 0; idx < indexedManifests.size(); idx++) { currentFileIndex = indexedManifests.get(idx).second(); - try (CloseableIterable taskIterable = open(indexedManifests.get(idx).first(), scanAllFiles); + try (CloseableIterable taskIterable = + open(indexedManifests.get(idx).first(), scanAllFiles); CloseableIterator taskIter = taskIterable.iterator()) { while (taskIter.hasNext()) { FileScanTask task = taskIter.next(); if (currentFileIndex >= startFileIndex) { - // Make sure there's at least one task in each MicroBatch to void job to be stuck, always add task + // Make sure there's at least one task in each MicroBatch to void job to be stuck, + // always add task // firstly. tasks.add(task); currentSizeInBytes += task.length(); @@ -222,7 +246,8 @@ private MicroBatch generateMicroBatch(List> indexedM if (currentSizeInBytes >= targetSizeInBytes) { if (tasks.size() > 1 && currentSizeInBytes > targetSizeInBytes) { - // If there's more than 1 task in this batch, and the size exceeds the limit, we should revert last + // If there's more than 1 task in this batch, and the size exceeds the limit, we should + // revert last // task to make sure we don't exceed the size limit. 
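// A minimal usage sketch for the micro-batch generation above. It assumes the builder comes
// from MicroBatches.from(snapshot, io) (the factory is outside this hunk) and that the caller
// carries startFileIndex forward between calls; the 128 MB target size is hypothetical.
MicroBatches.MicroBatch batch =
    MicroBatches.from(table.snapshot(snapshotId), table.io())
        .caseSensitive(true)
        .specsById(table.specs())
        .generate(0L, 128L * 1024 * 1024, false /* scanAllFiles */);
// batch.endFileIndex() is the next startFileIndex; batch.lastIndexOfSnapshot() signals that
// the snapshot has been fully consumed.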
FileScanTask extraTask = tasks.remove(tasks.size() - 1); currentSizeInBytes -= extraTask.length(); @@ -234,19 +259,28 @@ private MicroBatch generateMicroBatch(List> indexedM } } - return new MicroBatch(snapshot.snapshotId(), startFileIndex, currentFileIndex, currentSizeInBytes, - tasks, isLastIndex); + return new MicroBatch( + snapshot.snapshotId(), + startFileIndex, + currentFileIndex, + currentSizeInBytes, + tasks, + isLastIndex); } private CloseableIterable open(ManifestFile manifestFile, boolean scanAllFiles) { - ManifestGroup manifestGroup = new ManifestGroup(io, ImmutableList.of(manifestFile)) - .specsById(specsById) - .caseSensitive(caseSensitive); + ManifestGroup manifestGroup = + new ManifestGroup(io, ImmutableList.of(manifestFile)) + .specsById(specsById) + .caseSensitive(caseSensitive); if (!scanAllFiles) { - manifestGroup = manifestGroup - .filterManifestEntries(entry -> - entry.snapshotId() == snapshot.snapshotId() && entry.status() == ManifestEntry.Status.ADDED) - .ignoreDeleted(); + manifestGroup = + manifestGroup + .filterManifestEntries( + entry -> + entry.snapshotId() == snapshot.snapshotId() + && entry.status() == ManifestEntry.Status.ADDED) + .ignoreDeleted(); } return manifestGroup.planFiles(); diff --git a/core/src/main/java/org/apache/iceberg/OffsetsAwareSplitScanTaskIterator.java b/core/src/main/java/org/apache/iceberg/OffsetsAwareSplitScanTaskIterator.java index b6be1bed24d9..1951255bbaed 100644 --- a/core/src/main/java/org/apache/iceberg/OffsetsAwareSplitScanTaskIterator.java +++ b/core/src/main/java/org/apache/iceberg/OffsetsAwareSplitScanTaskIterator.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg; import java.util.List; @@ -40,9 +39,13 @@ class OffsetsAwareSplitScanTaskIterator implements SplitScan private final List splitSizes; private int splitIndex = 0; - OffsetsAwareSplitScanTaskIterator(T parentTask, long parentTaskLength, List offsetList, - SplitScanTaskCreator splitTaskCreator) { - Preconditions.checkArgument(OFFSET_ORDERING.isStrictlyOrdered(offsetList), "Offsets must be sorted in asc order"); + OffsetsAwareSplitScanTaskIterator( + T parentTask, + long parentTaskLength, + List offsetList, + SplitScanTaskCreator splitTaskCreator) { + Preconditions.checkArgument( + OFFSET_ORDERING.isStrictlyOrdered(offsetList), "Offsets must be sorted in asc order"); this.parentTask = parentTask; this.splitTaskCreator = splitTaskCreator; diff --git a/core/src/main/java/org/apache/iceberg/PartitionData.java b/core/src/main/java/org/apache/iceberg/PartitionData.java index 0c551fff8160..dfe45b03ced3 100644 --- a/core/src/main/java/org/apache/iceberg/PartitionData.java +++ b/core/src/main/java/org/apache/iceberg/PartitionData.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg; import java.io.Serializable; @@ -49,9 +48,7 @@ static Schema partitionDataSchema(Types.StructType partitionType) { private final String stringSchema; private transient Schema schema = null; - /** - * Used by Avro reflection to instantiate this class when reading manifest files. - */ + /** Used by Avro reflection to instantiate this class when reading manifest files. 
*/ PartitionData(Schema schema) { this.partitionType = AvroSchemaUtil.convert(schema).asNestedType().asStructType(); this.size = partitionType.fields().size(); @@ -62,8 +59,10 @@ static Schema partitionDataSchema(Types.StructType partitionType) { PartitionData(Types.StructType partitionType) { for (Types.NestedField field : partitionType.fields()) { - Preconditions.checkArgument(field.type().isPrimitiveType(), - "Partitions cannot contain nested types: %s", field.type()); + Preconditions.checkArgument( + field.type().isPrimitiveType(), + "Partitions cannot contain nested types: %s", + field.type()); } this.partitionType = partitionType; @@ -73,9 +72,7 @@ static Schema partitionDataSchema(Types.StructType partitionType) { this.stringSchema = schema.toString(); } - /** - * Copy constructor - */ + /** Copy constructor */ private PartitionData(PartitionData toCopy) { this.partitionType = toCopy.partitionType; this.size = toCopy.size; @@ -116,9 +113,10 @@ public T get(int pos, Class javaClass) { return javaClass.cast(value); } - throw new IllegalArgumentException(String.format( - "Wrong class, expected %s, but was %s, for object: %s", - javaClass.getName(), value.getClass().getName(), value)); + throw new IllegalArgumentException( + String.format( + "Wrong class, expected %s, but was %s, for object: %s", + javaClass.getName(), value.getClass().getName(), value)); } @Override @@ -163,9 +161,7 @@ public String toString() { if (i > 0) { sb.append(", "); } - sb.append(partitionType.fields().get(i).name()) - .append("=") - .append(data[i]); + sb.append(partitionType.fields().get(i).name()).append("=").append(data[i]); } sb.append("}"); return sb.toString(); diff --git a/core/src/main/java/org/apache/iceberg/PartitionSpecParser.java b/core/src/main/java/org/apache/iceberg/PartitionSpecParser.java index 67ec98b763de..0d0fff168282 100644 --- a/core/src/main/java/org/apache/iceberg/PartitionSpecParser.java +++ b/core/src/main/java/org/apache/iceberg/PartitionSpecParser.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
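// A minimal sketch of the struct-backed partition data above, assuming the org.apache.iceberg
// package (PartitionData is package-private) and a hypothetical partition type with a single
// identity field "dt".
Types.StructType partitionType =
    Types.StructType.of(Types.NestedField.optional(1000, "dt", Types.StringType.get()));
PartitionData data = new PartitionData(partitionType);
data.put(0, "2022-03-01");             // position-based writes, as used by normalizePartition later in this diff
String dt = data.get(0, String.class); // a mismatched class triggers the IllegalArgumentException above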
*/ - package org.apache.iceberg; import com.fasterxml.jackson.core.JsonGenerator; @@ -33,8 +32,7 @@ import org.apache.iceberg.util.Pair; public class PartitionSpecParser { - private PartitionSpecParser() { - } + private PartitionSpecParser() {} private static final String SPEC_ID = "spec-id"; private static final String FIELDS = "fields"; @@ -95,13 +93,12 @@ public static UnboundPartitionSpec fromJson(JsonNode json) { return builder.build(); } - private static final Cache, PartitionSpec> SPEC_CACHE = Caffeine - .newBuilder() - .weakValues() - .build(); + private static final Cache, PartitionSpec> SPEC_CACHE = + Caffeine.newBuilder().weakValues().build(); public static PartitionSpec fromJson(Schema schema, String json) { - return SPEC_CACHE.get(Pair.of(schema.asStruct(), json), + return SPEC_CACHE.get( + Pair.of(schema.asStruct(), json), schemaJsonPair -> { try { return fromJson(schema, JsonUtil.mapper().readValue(json, JsonNode.class)); @@ -156,21 +153,22 @@ static PartitionSpec fromJsonFields(Schema schema, int specId, String json) { } private static void buildFromJsonFields(UnboundPartitionSpec.Builder builder, JsonNode json) { - Preconditions.checkArgument(json.isArray(), - "Cannot parse partition spec fields, not an array: %s", json); + Preconditions.checkArgument( + json.isArray(), "Cannot parse partition spec fields, not an array: %s", json); Iterator elements = json.elements(); int fieldIdCount = 0; while (elements.hasNext()) { JsonNode element = elements.next(); - Preconditions.checkArgument(element.isObject(), - "Cannot parse partition field, not an object: %s", element); + Preconditions.checkArgument( + element.isObject(), "Cannot parse partition field, not an object: %s", element); String name = JsonUtil.getString(NAME, element); String transform = JsonUtil.getString(TRANSFORM, element); int sourceId = JsonUtil.getInt(SOURCE_ID, element); - // partition field ids are missing in old PartitionSpec, they always auto-increment from PARTITION_DATA_ID_START + // partition field ids are missing in old PartitionSpec, they always auto-increment from + // PARTITION_DATA_ID_START if (element.has(FIELD_ID)) { builder.addField(transform, sourceId, JsonUtil.getInt(FIELD_ID, element), name); fieldIdCount++; @@ -179,8 +177,10 @@ private static void buildFromJsonFields(UnboundPartitionSpec.Builder builder, Js } } - Preconditions.checkArgument(fieldIdCount == 0 || fieldIdCount == json.size(), + Preconditions.checkArgument( + fieldIdCount == 0 || fieldIdCount == json.size(), "Cannot parse spec with missing field IDs: %s missing of %s fields.", - json.size() - fieldIdCount, json.size()); + json.size() - fieldIdCount, + json.size()); } } diff --git a/core/src/main/java/org/apache/iceberg/PartitionSummary.java b/core/src/main/java/org/apache/iceberg/PartitionSummary.java index 7fca830a1d22..f7046edac31b 100644 --- a/core/src/main/java/org/apache/iceberg/PartitionSummary.java +++ b/core/src/main/java/org/apache/iceberg/PartitionSummary.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg; import java.util.Arrays; @@ -75,7 +74,9 @@ private PartitionFieldStats(Type type) { } public PartitionFieldSummary toSummary() { - return new GenericPartitionFieldSummary(containsNull, containsNaN, + return new GenericPartitionFieldSummary( + containsNull, + containsNaN, min != null ? Conversions.toByteBuffer(type, min) : null, max != null ? 
Conversions.toByteBuffer(type, max) : null); } diff --git a/core/src/main/java/org/apache/iceberg/Partitioning.java b/core/src/main/java/org/apache/iceberg/Partitioning.java index 2bf0b800db32..cd7eef4fa9d8 100644 --- a/core/src/main/java/org/apache/iceberg/Partitioning.java +++ b/core/src/main/java/org/apache/iceberg/Partitioning.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg; import java.util.Collections; @@ -37,8 +36,7 @@ import org.apache.iceberg.types.Types.StructType; public class Partitioning { - private Partitioning() { - } + private Partitioning() {} /** * Check whether the spec contains a bucketed partition field. @@ -47,61 +45,66 @@ private Partitioning() { * @return true if the spec has field with a bucket transform */ public static boolean hasBucketField(PartitionSpec spec) { - List bucketList = PartitionSpecVisitor.visit(spec, new PartitionSpecVisitor() { - @Override - public Boolean identity(int fieldId, String sourceName, int sourceId) { - return false; - } - - @Override - public Boolean bucket(int fieldId, String sourceName, int sourceId, int width) { - return true; - } - - @Override - public Boolean truncate(int fieldId, String sourceName, int sourceId, int width) { - return false; - } - - @Override - public Boolean year(int fieldId, String sourceName, int sourceId) { - return false; - } - - @Override - public Boolean month(int fieldId, String sourceName, int sourceId) { - return false; - } - - @Override - public Boolean day(int fieldId, String sourceName, int sourceId) { - return false; - } - - @Override - public Boolean hour(int fieldId, String sourceName, int sourceId) { - return false; - } - - @Override - public Boolean alwaysNull(int fieldId, String sourceName, int sourceId) { - return false; - } - - @Override - public Boolean unknown(int fieldId, String sourceName, int sourceId, String transform) { - return false; - } - }); + List bucketList = + PartitionSpecVisitor.visit( + spec, + new PartitionSpecVisitor() { + @Override + public Boolean identity(int fieldId, String sourceName, int sourceId) { + return false; + } + + @Override + public Boolean bucket(int fieldId, String sourceName, int sourceId, int width) { + return true; + } + + @Override + public Boolean truncate(int fieldId, String sourceName, int sourceId, int width) { + return false; + } + + @Override + public Boolean year(int fieldId, String sourceName, int sourceId) { + return false; + } + + @Override + public Boolean month(int fieldId, String sourceName, int sourceId) { + return false; + } + + @Override + public Boolean day(int fieldId, String sourceName, int sourceId) { + return false; + } + + @Override + public Boolean hour(int fieldId, String sourceName, int sourceId) { + return false; + } + + @Override + public Boolean alwaysNull(int fieldId, String sourceName, int sourceId) { + return false; + } + + @Override + public Boolean unknown( + int fieldId, String sourceName, int sourceId, String transform) { + return false; + } + }); return bucketList.stream().anyMatch(Boolean::booleanValue); } /** * Create a sort order that will group data for a partition spec. - *
<p>
- * If the partition spec contains bucket columns, the sort order will also have a field to sort by a column that is - * bucketed in the spec. The column is selected by the highest number of buckets in the transform. + * + *
<p>
If the partition spec contains bucket columns, the sort order will also have a field to sort + * by a column that is bucketed in the spec. The column is selected by the highest number of + * buckets in the transform. * * @param spec a partition spec * @return a sort order that will cluster data for the spec @@ -193,9 +196,9 @@ public Void alwaysNull(int fieldId, String sourceName, int sourceId) { /** * Builds a common partition type for all specs in a table. - *
<p>
- * Whenever a table has multiple specs, the partition type is a struct containing - * all columns that have ever been a part of any spec in the table. + * + *
<p>
Whenever a table has multiple specs, the partition type is a struct containing all columns + * that have ever been a part of any spec in the table. * * @param table a table with one or many specs * @return the constructed common partition type @@ -203,8 +206,10 @@ public Void alwaysNull(int fieldId, String sourceName, int sourceId) { public static StructType partitionType(Table table) { // we currently don't know the output type of unknown transforms List> unknownTransforms = collectUnknownTransforms(table); - ValidationException.check(unknownTransforms.isEmpty(), - "Cannot build table partition type, unknown transforms: %s", unknownTransforms); + ValidationException.check( + unknownTransforms.isEmpty(), + "Cannot build table partition type, unknown transforms: %s", + unknownTransforms); if (table.specs().size() == 1) { return table.spec().partitionType(); @@ -215,9 +220,10 @@ public static StructType partitionType(Table table) { Map nameMap = Maps.newHashMap(); // sort the spec IDs in descending order to pick up the most recent field names - List specIds = table.specs().keySet().stream() - .sorted(Collections.reverseOrder()) - .collect(Collectors.toList()); + List specIds = + table.specs().keySet().stream() + .sorted(Collections.reverseOrder()) + .collect(Collectors.toList()); for (Integer specId : specIds) { PartitionSpec spec = table.specs().get(specId); @@ -234,9 +240,11 @@ public static StructType partitionType(Table table) { } else { // verify the fields are compatible as they may conflict in v1 tables - ValidationException.check(equivalentIgnoringNames(field, existingField), + ValidationException.check( + equivalentIgnoringNames(field, existingField), "Conflicting partition fields: ['%s', '%s']", - field, existingField); + field, + existingField); // use the correct type for dropped partitions in v1 tables if (isVoidTransform(existingField) && !isVoidTransform(field)) { @@ -247,10 +255,13 @@ public static StructType partitionType(Table table) { } } - List sortedStructFields = fieldMap.keySet().stream() - .sorted(Comparator.naturalOrder()) - .map(fieldId -> NestedField.optional(fieldId, nameMap.get(fieldId), typeMap.get(fieldId))) - .collect(Collectors.toList()); + List sortedStructFields = + fieldMap.keySet().stream() + .sorted(Comparator.naturalOrder()) + .map( + fieldId -> + NestedField.optional(fieldId, nameMap.get(fieldId), typeMap.get(fieldId))) + .collect(Collectors.toList()); return StructType.of(sortedStructFields); } @@ -261,23 +272,30 @@ private static boolean isVoidTransform(PartitionField field) { private static List> collectUnknownTransforms(Table table) { List> unknownTransforms = Lists.newArrayList(); - table.specs().values().forEach(spec -> { - spec.fields().stream() - .map(PartitionField::transform) - .filter(transform -> transform instanceof UnknownTransform) - .forEach(unknownTransforms::add); - }); + table + .specs() + .values() + .forEach( + spec -> { + spec.fields().stream() + .map(PartitionField::transform) + .filter(transform -> transform instanceof UnknownTransform) + .forEach(unknownTransforms::add); + }); return unknownTransforms; } - private static boolean equivalentIgnoringNames(PartitionField field, PartitionField anotherField) { - return field.fieldId() == anotherField.fieldId() && - field.sourceId() == anotherField.sourceId() && - compatibleTransforms(field.transform(), anotherField.transform()); + private static boolean equivalentIgnoringNames( + PartitionField field, PartitionField anotherField) { + return field.fieldId() == anotherField.fieldId() + 
&& field.sourceId() == anotherField.sourceId() + && compatibleTransforms(field.transform(), anotherField.transform()); } private static boolean compatibleTransforms(Transform t1, Transform t2) { - return t1.equals(t2) || t1.equals(Transforms.alwaysNull()) || t2.equals(Transforms.alwaysNull()); + return t1.equals(t2) + || t1.equals(Transforms.alwaysNull()) + || t2.equals(Transforms.alwaysNull()); } } diff --git a/core/src/main/java/org/apache/iceberg/PartitionsTable.java b/core/src/main/java/org/apache/iceberg/PartitionsTable.java index 92f8864de05e..3723a54bc972 100644 --- a/core/src/main/java/org/apache/iceberg/PartitionsTable.java +++ b/core/src/main/java/org/apache/iceberg/PartitionsTable.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg; import com.github.benmanes.caffeine.cache.Caffeine; @@ -29,9 +28,7 @@ import org.apache.iceberg.relocated.com.google.common.collect.Maps; import org.apache.iceberg.types.Types; -/** - * A {@link Table} implementation that exposes a table's partitions as rows. - */ +/** A {@link Table} implementation that exposes a table's partitions as rows. */ public class PartitionsTable extends BaseMetadataTable { private final Schema schema; @@ -45,12 +42,12 @@ public class PartitionsTable extends BaseMetadataTable { PartitionsTable(TableOperations ops, Table table, String name) { super(ops, table, name); - this.schema = new Schema( - Types.NestedField.required(1, "partition", Partitioning.partitionType(table)), - Types.NestedField.required(2, "record_count", Types.LongType.get()), - Types.NestedField.required(3, "file_count", Types.IntegerType.get()), - Types.NestedField.required(4, "spec_id", Types.IntegerType.get()) - ); + this.schema = + new Schema( + Types.NestedField.required(1, "partition", Partitioning.partitionType(table)), + Types.NestedField.required(2, "record_count", Types.LongType.get()), + Types.NestedField.required(3, "file_count", Types.IntegerType.get()), + Types.NestedField.required(4, "spec_id", Types.IntegerType.get())); } @Override @@ -78,20 +75,23 @@ private DataTask task(StaticTableScan scan) { // the table is unpartitioned, partitions contains only the root partition return StaticDataTask.of( io().newInputFile(ops.current().metadataFileLocation()), - schema(), scan.schema(), partitions, - root -> StaticDataTask.Row.of(root.recordCount, root.fileCount) - ); + schema(), + scan.schema(), + partitions, + root -> StaticDataTask.Row.of(root.recordCount, root.fileCount)); } else { return StaticDataTask.of( io().newInputFile(ops.current().metadataFileLocation()), - schema(), scan.schema(), partitions, - PartitionsTable::convertPartition - ); + schema(), + scan.schema(), + partitions, + PartitionsTable::convertPartition); } } private static StaticDataTask.Row convertPartition(Partition partition) { - return StaticDataTask.Row.of(partition.key, partition.recordCount, partition.fileCount, partition.specId); + return StaticDataTask.Row.of( + partition.key, partition.recordCount, partition.fileCount, partition.specId); } private static Iterable partitions(Table table, StaticTableScan scan) { @@ -100,13 +100,17 @@ private static Iterable partitions(Table table, StaticTableScan scan) PartitionMap partitions = new PartitionMap(); // cache a position map needed by each partition spec to normalize partitions to final schema - Map normalizedPositionsBySpec = Maps.newHashMapWithExpectedSize(table.specs().size()); + Map normalizedPositionsBySpec = + 
Maps.newHashMapWithExpectedSize(table.specs().size()); for (FileScanTask task : tasks) { PartitionData original = (PartitionData) task.file().partition(); - int[] normalizedPositions = normalizedPositionsBySpec.computeIfAbsent( - task.spec().specId(), specId -> normalizedPositions(table, specId, normalizedPartitionType)); - PartitionData normalized = normalizePartition(original, normalizedPartitionType, normalizedPositions); + int[] normalizedPositions = + normalizedPositionsBySpec.computeIfAbsent( + task.spec().specId(), + specId -> normalizedPositions(table, specId, normalizedPartitionType)); + PartitionData normalized = + normalizePartition(original, normalizedPartitionType, normalizedPositions); partitions.get(normalized).update(task.file()); } return partitions.all(); @@ -116,31 +120,37 @@ private static Iterable partitions(Table table, StaticTableScan scan) * Builds an array of the field position of positions in the normalized partition type indexed by * field position in the original partition type */ - private static int[] normalizedPositions(Table table, int specId, Types.StructType normalizedType) { + private static int[] normalizedPositions( + Table table, int specId, Types.StructType normalizedType) { Types.StructType originalType = table.specs().get(specId).partitionType(); int[] normalizedPositions = new int[originalType.fields().size()]; for (int originalIndex = 0; originalIndex < originalType.fields().size(); originalIndex++) { - Types.NestedField normalizedField = normalizedType.field(originalType.fields().get(originalIndex).fieldId()); + Types.NestedField normalizedField = + normalizedType.field(originalType.fields().get(originalIndex).fieldId()); normalizedPositions[originalIndex] = normalizedType.fields().indexOf(normalizedField); } return normalizedPositions; } /** - * Convert a partition data written by an old spec, to table's normalized partition type, which is a common partition - * type for all specs of the table. + * Convert a partition data written by an old spec, to table's normalized partition type, which is + * a common partition type for all specs of the table. 
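// A small worked example of the normalization described above, with hypothetical specs: spec 0
// partitions by dt, spec 1 by (dt, bucket(16, id)). Partitioning.partitionType(table) then
// yields a struct with both fields, and a file written under spec 0 only fills the dt slot, so
// its normalized partition reads as (dt, null); normalizedPositions maps old position 0 to new
// position 0 and the id_bucket slot is never written.
Types.StructType unified = Partitioning.partitionType(table); // common type across all specs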
+ * * @param originalPartition un-normalized partition data - * @param normalizedPartitionType table's normalized partition type {@link Partitioning#partitionType(Table)} - * @param normalizedPositions field positions in the normalized partition type indexed by field position in - * the original partition type + * @param normalizedPartitionType table's normalized partition type {@link + * Partitioning#partitionType(Table)} + * @param normalizedPositions field positions in the normalized partition type indexed by field + * position in the original partition type * @return the normalized partition data */ - private static PartitionData normalizePartition(PartitionData originalPartition, - Types.StructType normalizedPartitionType, - int[] normalizedPositions) { + private static PartitionData normalizePartition( + PartitionData originalPartition, + Types.StructType normalizedPartitionType, + int[] normalizedPositions) { PartitionData normalizedPartition = new PartitionData(normalizedPartitionType); for (int originalIndex = 0; originalIndex < originalPartition.size(); originalIndex++) { - normalizedPartition.put(normalizedPositions[originalIndex], originalPartition.get(originalIndex)); + normalizedPartition.put( + normalizedPositions[originalIndex], originalPartition.get(originalIndex)); } return normalizedPartition; } @@ -151,26 +161,34 @@ static CloseableIterable planFiles(StaticTableScan scan) { Snapshot snapshot = table.snapshot(scan.snapshot().snapshotId()); boolean caseSensitive = scan.isCaseSensitive(); - LoadingCache evalCache = Caffeine.newBuilder().build(specId -> { - PartitionSpec spec = table.specs().get(specId); - PartitionSpec transformedSpec = transformSpec(scan.tableSchema(), spec); - return ManifestEvaluator.forRowFilter(scan.filter(), transformedSpec, caseSensitive); - }); + LoadingCache evalCache = + Caffeine.newBuilder() + .build( + specId -> { + PartitionSpec spec = table.specs().get(specId); + PartitionSpec transformedSpec = transformSpec(scan.tableSchema(), spec); + return ManifestEvaluator.forRowFilter( + scan.filter(), transformedSpec, caseSensitive); + }); FileIO io = table.io(); - ManifestGroup manifestGroup = new ManifestGroup(io, snapshot.dataManifests(io), snapshot.deleteManifests(io)) - .caseSensitive(caseSensitive) - .filterManifests(m -> evalCache.get(m.partitionSpecId()).eval(m)) - .select(scan.colStats() ? DataTableScan.SCAN_WITH_STATS_COLUMNS : DataTableScan.SCAN_COLUMNS) - .specsById(scan.table().specs()) - .ignoreDeleted(); + ManifestGroup manifestGroup = + new ManifestGroup(io, snapshot.dataManifests(io), snapshot.deleteManifests(io)) + .caseSensitive(caseSensitive) + .filterManifests(m -> evalCache.get(m.partitionSpecId()).eval(m)) + .select( + scan.colStats() + ? 
DataTableScan.SCAN_WITH_STATS_COLUMNS + : DataTableScan.SCAN_COLUMNS) + .specsById(scan.table().specs()) + .ignoreDeleted(); if (scan.shouldIgnoreResiduals()) { manifestGroup = manifestGroup.ignoreResiduals(); } - if (scan.snapshot().dataManifests(io).size() > 1 && - (PLAN_SCANS_WITH_WORKER_POOL || scan.context().planWithCustomizedExecutor())) { + if (scan.snapshot().dataManifests(io).size() > 1 + && (PLAN_SCANS_WITH_WORKER_POOL || scan.context().planWithCustomizedExecutor())) { manifestGroup = manifestGroup.planWith(scan.context().planExecutor()); } @@ -179,7 +197,12 @@ static CloseableIterable planFiles(StaticTableScan scan) { private class PartitionsScan extends StaticTableScan { PartitionsScan(TableOperations ops, Table table) { - super(ops, table, PartitionsTable.this.schema(), MetadataTableType.PARTITIONS, PartitionsTable.this::task); + super( + ops, + table, + PartitionsTable.this.schema(), + MetadataTableType.PARTITIONS, + PartitionsTable.this::task); } } diff --git a/core/src/main/java/org/apache/iceberg/PropertiesUpdate.java b/core/src/main/java/org/apache/iceberg/PropertiesUpdate.java index fe20302278a1..9168b84b4042 100644 --- a/core/src/main/java/org/apache/iceberg/PropertiesUpdate.java +++ b/core/src/main/java/org/apache/iceberg/PropertiesUpdate.java @@ -16,17 +16,8 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg; -import java.util.Map; -import java.util.Set; -import org.apache.iceberg.exceptions.CommitFailedException; -import org.apache.iceberg.relocated.com.google.common.base.Preconditions; -import org.apache.iceberg.relocated.com.google.common.collect.Maps; -import org.apache.iceberg.relocated.com.google.common.collect.Sets; -import org.apache.iceberg.util.Tasks; - import static org.apache.iceberg.TableProperties.COMMIT_MAX_RETRY_WAIT_MS; import static org.apache.iceberg.TableProperties.COMMIT_MAX_RETRY_WAIT_MS_DEFAULT; import static org.apache.iceberg.TableProperties.COMMIT_MIN_RETRY_WAIT_MS; @@ -36,6 +27,14 @@ import static org.apache.iceberg.TableProperties.COMMIT_TOTAL_RETRY_TIME_MS; import static org.apache.iceberg.TableProperties.COMMIT_TOTAL_RETRY_TIME_MS_DEFAULT; +import java.util.Map; +import java.util.Set; +import org.apache.iceberg.exceptions.CommitFailedException; +import org.apache.iceberg.relocated.com.google.common.base.Preconditions; +import org.apache.iceberg.relocated.com.google.common.collect.Maps; +import org.apache.iceberg.relocated.com.google.common.collect.Sets; +import org.apache.iceberg.util.Tasks; + class PropertiesUpdate implements UpdateProperties { private final TableOperations ops; private final Map updates = Maps.newHashMap(); @@ -51,8 +50,8 @@ class PropertiesUpdate implements UpdateProperties { public UpdateProperties set(String key, String value) { Preconditions.checkNotNull(key, "Key cannot be null"); Preconditions.checkNotNull(value, "Value cannot be null"); - Preconditions.checkArgument(!removals.contains(key), - "Cannot remove and update the same key: %s", key); + Preconditions.checkArgument( + !removals.contains(key), "Cannot remove and update the same key: %s", key); updates.put(key, value); @@ -62,8 +61,8 @@ public UpdateProperties set(String key, String value) { @Override public UpdateProperties remove(String key) { Preconditions.checkNotNull(key, "Key cannot be null"); - Preconditions.checkArgument(!updates.keySet().contains(key), - "Cannot remove and update the same key: %s", key); + Preconditions.checkArgument( + !updates.keySet().contains(key), "Cannot remove and 
update the same key: %s", key); removals.add(key); @@ -107,10 +106,11 @@ public void commit() { base.propertyAsInt(COMMIT_TOTAL_RETRY_TIME_MS, COMMIT_TOTAL_RETRY_TIME_MS_DEFAULT), 2.0 /* exponential */) .onlyRetryOn(CommitFailedException.class) - .run(taskOps -> { - Map newProperties = apply(); - TableMetadata updated = base.replaceProperties(newProperties); - taskOps.commit(base, updated); - }); + .run( + taskOps -> { + Map newProperties = apply(); + TableMetadata updated = base.replaceProperties(newProperties); + taskOps.commit(base, updated); + }); } } diff --git a/core/src/main/java/org/apache/iceberg/ReachableFileUtil.java b/core/src/main/java/org/apache/iceberg/ReachableFileUtil.java index 004e31c33504..45c885a745e4 100644 --- a/core/src/main/java/org/apache/iceberg/ReachableFileUtil.java +++ b/core/src/main/java/org/apache/iceberg/ReachableFileUtil.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg; import java.util.List; @@ -35,8 +34,7 @@ public class ReachableFileUtil { private static final Logger LOG = LoggerFactory.getLogger(ReachableFileUtil.class); private static final String METADATA_FOLDER_NAME = "metadata"; - private ReachableFileUtil() { - } + private ReachableFileUtil() {} /** * Returns the location of the version hint file @@ -54,9 +52,9 @@ public static String versionHintLocation(Table table) { /** * Returns locations of JSON metadata files in a table. * - * @param table Table to get JSON metadata files from - * @param recursive When true, recursively retrieves all the reachable JSON metadata files. - * When false, gets the all the JSON metadata files only from the current metadata. + * @param table Table to get JSON metadata files from + * @param recursive When true, recursively retrieves all the reachable JSON metadata files. When + * false, gets the all the JSON metadata files only from the current metadata. * @return locations of JSON metadata files */ public static Set metadataFileLocations(Table table, boolean recursive) { @@ -68,8 +66,8 @@ public static Set metadataFileLocations(Table table, boolean recursive) return metadataFileLocations; } - private static void metadataFileLocations(TableMetadata metadata, Set metadataFileLocations, - FileIO io, boolean recursive) { + private static void metadataFileLocations( + TableMetadata metadata, Set metadataFileLocations, FileIO io, boolean recursive) { List metadataLogEntries = metadata.previousFiles(); if (metadataLogEntries.size() > 0) { for (MetadataLogEntry metadataLogEntry : metadataLogEntries) { @@ -84,7 +82,8 @@ private static void metadataFileLocations(TableMetadata metadata, Set me } } - private static TableMetadata findFirstExistentPreviousMetadata(List metadataLogEntries, FileIO io) { + private static TableMetadata findFirstExistentPreviousMetadata( + List metadataLogEntries, FileIO io) { TableMetadata metadata = null; for (MetadataLogEntry metadataLogEntry : metadataLogEntries) { try { diff --git a/core/src/main/java/org/apache/iceberg/RemoveSnapshots.java b/core/src/main/java/org/apache/iceberg/RemoveSnapshots.java index 1c4286ed2424..b996822aaf03 100644 --- a/core/src/main/java/org/apache/iceberg/RemoveSnapshots.java +++ b/core/src/main/java/org/apache/iceberg/RemoveSnapshots.java @@ -16,9 +16,25 @@ * specific language governing permissions and limitations * under the License. 
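// A minimal sketch of the property update flow above; the keys and values are hypothetical.
// set() and remove() refuse to touch the same key in one round, and commit() retries on
// CommitFailedException using the commit retry settings read in commit().
table.updateProperties()
    .set("owner", "data-eng")    // hypothetical property
    .remove("deprecated.flag")   // hypothetical property
    .commit();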
*/ - package org.apache.iceberg; +import static org.apache.iceberg.TableProperties.COMMIT_MAX_RETRY_WAIT_MS; +import static org.apache.iceberg.TableProperties.COMMIT_MAX_RETRY_WAIT_MS_DEFAULT; +import static org.apache.iceberg.TableProperties.COMMIT_MIN_RETRY_WAIT_MS; +import static org.apache.iceberg.TableProperties.COMMIT_MIN_RETRY_WAIT_MS_DEFAULT; +import static org.apache.iceberg.TableProperties.COMMIT_NUM_RETRIES; +import static org.apache.iceberg.TableProperties.COMMIT_NUM_RETRIES_DEFAULT; +import static org.apache.iceberg.TableProperties.COMMIT_TOTAL_RETRY_TIME_MS; +import static org.apache.iceberg.TableProperties.COMMIT_TOTAL_RETRY_TIME_MS_DEFAULT; +import static org.apache.iceberg.TableProperties.GC_ENABLED; +import static org.apache.iceberg.TableProperties.GC_ENABLED_DEFAULT; +import static org.apache.iceberg.TableProperties.MAX_REF_AGE_MS; +import static org.apache.iceberg.TableProperties.MAX_REF_AGE_MS_DEFAULT; +import static org.apache.iceberg.TableProperties.MAX_SNAPSHOT_AGE_MS; +import static org.apache.iceberg.TableProperties.MAX_SNAPSHOT_AGE_MS_DEFAULT; +import static org.apache.iceberg.TableProperties.MIN_SNAPSHOTS_TO_KEEP; +import static org.apache.iceberg.TableProperties.MIN_SNAPSHOTS_TO_KEEP_DEFAULT; + import java.io.IOException; import java.util.Collection; import java.util.List; @@ -47,36 +63,21 @@ import org.slf4j.Logger; import org.slf4j.LoggerFactory; -import static org.apache.iceberg.TableProperties.COMMIT_MAX_RETRY_WAIT_MS; -import static org.apache.iceberg.TableProperties.COMMIT_MAX_RETRY_WAIT_MS_DEFAULT; -import static org.apache.iceberg.TableProperties.COMMIT_MIN_RETRY_WAIT_MS; -import static org.apache.iceberg.TableProperties.COMMIT_MIN_RETRY_WAIT_MS_DEFAULT; -import static org.apache.iceberg.TableProperties.COMMIT_NUM_RETRIES; -import static org.apache.iceberg.TableProperties.COMMIT_NUM_RETRIES_DEFAULT; -import static org.apache.iceberg.TableProperties.COMMIT_TOTAL_RETRY_TIME_MS; -import static org.apache.iceberg.TableProperties.COMMIT_TOTAL_RETRY_TIME_MS_DEFAULT; -import static org.apache.iceberg.TableProperties.GC_ENABLED; -import static org.apache.iceberg.TableProperties.GC_ENABLED_DEFAULT; -import static org.apache.iceberg.TableProperties.MAX_REF_AGE_MS; -import static org.apache.iceberg.TableProperties.MAX_REF_AGE_MS_DEFAULT; -import static org.apache.iceberg.TableProperties.MAX_SNAPSHOT_AGE_MS; -import static org.apache.iceberg.TableProperties.MAX_SNAPSHOT_AGE_MS_DEFAULT; -import static org.apache.iceberg.TableProperties.MIN_SNAPSHOTS_TO_KEEP; -import static org.apache.iceberg.TableProperties.MIN_SNAPSHOTS_TO_KEEP_DEFAULT; - @SuppressWarnings("UnnecessaryAnonymousClass") class RemoveSnapshots implements ExpireSnapshots { private static final Logger LOG = LoggerFactory.getLogger(RemoveSnapshots.class); // Creates an executor service that runs each task in the thread that invokes execute/submit. 
- private static final ExecutorService DEFAULT_DELETE_EXECUTOR_SERVICE = MoreExecutors.newDirectExecutorService(); - - private final Consumer defaultDelete = new Consumer() { - @Override - public void accept(String file) { - ops.io().deleteFile(file); - } - }; + private static final ExecutorService DEFAULT_DELETE_EXECUTOR_SERVICE = + MoreExecutors.newDirectExecutorService(); + + private final Consumer defaultDelete = + new Consumer() { + @Override + public void accept(String file) { + ops.io().deleteFile(file); + } + }; private final TableOperations ops; private final Set idsToRemove = Sets.newHashSet(); @@ -97,23 +98,18 @@ public void accept(String file) { PropertyUtil.propertyAsBoolean(base.properties(), GC_ENABLED, GC_ENABLED_DEFAULT), "Cannot expire snapshots: GC is disabled (deleting files may corrupt other tables)"); - long defaultMaxSnapshotAgeMs = PropertyUtil.propertyAsLong( - base.properties(), - MAX_SNAPSHOT_AGE_MS, - MAX_SNAPSHOT_AGE_MS_DEFAULT); + long defaultMaxSnapshotAgeMs = + PropertyUtil.propertyAsLong( + base.properties(), MAX_SNAPSHOT_AGE_MS, MAX_SNAPSHOT_AGE_MS_DEFAULT); this.now = System.currentTimeMillis(); this.defaultExpireOlderThan = now - defaultMaxSnapshotAgeMs; - this.defaultMinNumSnapshots = PropertyUtil.propertyAsInt( - base.properties(), - MIN_SNAPSHOTS_TO_KEEP, - MIN_SNAPSHOTS_TO_KEEP_DEFAULT); - - this.defaultMaxRefAgeMs = PropertyUtil.propertyAsLong( - base.properties(), - MAX_REF_AGE_MS, - MAX_REF_AGE_MS_DEFAULT - ); + this.defaultMinNumSnapshots = + PropertyUtil.propertyAsInt( + base.properties(), MIN_SNAPSHOTS_TO_KEEP, MIN_SNAPSHOTS_TO_KEEP_DEFAULT); + + this.defaultMaxRefAgeMs = + PropertyUtil.propertyAsLong(base.properties(), MAX_REF_AGE_MS, MAX_REF_AGE_MS_DEFAULT); } @Override @@ -131,16 +127,20 @@ public ExpireSnapshots expireSnapshotId(long expireSnapshotId) { @Override public ExpireSnapshots expireOlderThan(long timestampMillis) { - LOG.info("Expiring snapshots older than: {} ({})", - DateTimeUtil.formatTimestampMillis(timestampMillis), timestampMillis); + LOG.info( + "Expiring snapshots older than: {} ({})", + DateTimeUtil.formatTimestampMillis(timestampMillis), + timestampMillis); this.defaultExpireOlderThan = timestampMillis; return this; } @Override public ExpireSnapshots retainLast(int numSnapshots) { - Preconditions.checkArgument(1 <= numSnapshots, - "Number of snapshots to retain must be at least 1, cannot be: %s", numSnapshots); + Preconditions.checkArgument( + 1 <= numSnapshots, + "Number of snapshots to retain must be at least 1, cannot be: %s", + numSnapshots); this.defaultMinNumSnapshots = numSnapshots; return this; } @@ -191,8 +191,11 @@ private TableMetadata internalApply() { for (long idToRemove : idsToRemove) { List refsForId = retainedIdToRefs.get(idToRemove); - Preconditions.checkArgument(refsForId == null, - "Cannot expire %s. Still referenced by refs: %s", idToRemove, refsForId); + Preconditions.checkArgument( + refsForId == null, + "Cannot expire %s. Still referenced by refs: %s", + idToRemove, + refsForId); } idsToRetain.addAll(computeAllBranchSnapshotsToRetain(retainedRefs.values())); @@ -242,12 +245,13 @@ private Set computeAllBranchSnapshotsToRetain(Collection refs Set branchSnapshotsToRetain = Sets.newHashSet(); for (SnapshotRef ref : refs) { if (ref.isBranch()) { - long expireSnapshotsOlderThan = ref.maxSnapshotAgeMs() != null ? now - ref.maxSnapshotAgeMs() : - defaultExpireOlderThan; - int minSnapshotsToKeep = ref.minSnapshotsToKeep() != null ? 
ref.minSnapshotsToKeep() : - defaultMinNumSnapshots; + long expireSnapshotsOlderThan = + ref.maxSnapshotAgeMs() != null ? now - ref.maxSnapshotAgeMs() : defaultExpireOlderThan; + int minSnapshotsToKeep = + ref.minSnapshotsToKeep() != null ? ref.minSnapshotsToKeep() : defaultMinNumSnapshots; branchSnapshotsToRetain.addAll( - computeBranchSnapshotsToRetain(ref.snapshotId(), expireSnapshotsOlderThan, minSnapshotsToKeep)); + computeBranchSnapshotsToRetain( + ref.snapshotId(), expireSnapshotsOlderThan, minSnapshotsToKeep)); } } @@ -255,12 +259,11 @@ private Set computeAllBranchSnapshotsToRetain(Collection refs } private Set computeBranchSnapshotsToRetain( - long snapshot, - long expireSnapshotsOlderThan, - int minSnapshotsToKeep) { + long snapshot, long expireSnapshotsOlderThan, int minSnapshotsToKeep) { Set idsToRetain = Sets.newHashSet(); for (Snapshot ancestor : SnapshotUtil.ancestorsOf(snapshot, base::snapshot)) { - if (idsToRetain.size() < minSnapshotsToKeep || ancestor.timestampMillis() >= expireSnapshotsOlderThan) { + if (idsToRetain.size() < minSnapshotsToKeep + || ancestor.timestampMillis() >= expireSnapshotsOlderThan) { idsToRetain.add(ancestor.snapshotId()); } else { return idsToRetain; @@ -284,7 +287,8 @@ private Set unreferencedSnapshotsToRetain(Collection refs) { Set snapshotsToRetain = Sets.newHashSet(); for (Snapshot snapshot : base.snapshots()) { - if (!referencedSnapshots.contains(snapshot.snapshotId()) && // unreferenced + if (!referencedSnapshots.contains(snapshot.snapshotId()) + && // unreferenced snapshot.timestampMillis() >= defaultExpireOlderThan) { // not old enough to expire snapshotsToRetain.add(snapshot.snapshotId()); } @@ -303,14 +307,16 @@ public void commit() { base.propertyAsInt(COMMIT_TOTAL_RETRY_TIME_MS, COMMIT_TOTAL_RETRY_TIME_MS_DEFAULT), 2.0 /* exponential */) .onlyRetryOn(CommitFailedException.class) - .run(item -> { - TableMetadata updated = internalApply(); - if (cleanExpiredFiles && updated.refs().size() > 1) { - throw new UnsupportedOperationException("Cannot incrementally clean files for tables with more than 1 ref"); - } - - ops.commit(base, updated); - }); + .run( + item -> { + TableMetadata updated = internalApply(); + if (cleanExpiredFiles && updated.refs().size() > 1) { + throw new UnsupportedOperationException( + "Cannot incrementally clean files for tables with more than 1 ref"); + } + + ops.commit(base, updated); + }); LOG.info("Committed snapshot changes"); if (cleanExpiredFiles) { @@ -355,31 +361,41 @@ private void cleanExpiredSnapshots() { } @SuppressWarnings({"checkstyle:CyclomaticComplexity", "MethodLength"}) - private void removeExpiredFiles(List snapshots, Set validIds, Set expiredIds) { + private void removeExpiredFiles( + List snapshots, Set validIds, Set expiredIds) { // Reads and deletes are done using Tasks.foreach(...).suppressFailureWhenFinished to complete // as much of the delete work as possible and avoid orphaned data or manifest files. // this is the set of ancestors of the current table state. when removing snapshots, this must // only remove files that were deleted in an ancestor of the current table state to avoid // physically deleting files that were logically deleted in a commit that was rolled back. 
- Set ancestorIds = Sets.newHashSet(SnapshotUtil.ancestorIds(base.currentSnapshot(), base::snapshot)); + Set ancestorIds = + Sets.newHashSet(SnapshotUtil.ancestorIds(base.currentSnapshot(), base::snapshot)); Set pickedAncestorSnapshotIds = Sets.newHashSet(); for (long snapshotId : ancestorIds) { - String sourceSnapshotId = base.snapshot(snapshotId).summary().get(SnapshotSummary.SOURCE_SNAPSHOT_ID_PROP); + String sourceSnapshotId = + base.snapshot(snapshotId).summary().get(SnapshotSummary.SOURCE_SNAPSHOT_ID_PROP); if (sourceSnapshotId != null) { // protect any snapshot that was cherry-picked into the current table state pickedAncestorSnapshotIds.add(Long.parseLong(sourceSnapshotId)); } } - // find manifests to clean up that are still referenced by a valid snapshot, but written by an expired snapshot + // find manifests to clean up that are still referenced by a valid snapshot, but written by an + // expired snapshot Set validManifests = Sets.newHashSet(); Set manifestsToScan = Sets.newHashSet(); - Tasks.foreach(snapshots).retry(3).suppressFailureWhenFinished() - .onFailure((snapshot, exc) -> - LOG.warn("Failed on snapshot {} while reading manifest list: {}", snapshot.snapshotId(), - snapshot.manifestListLocation(), exc)) + Tasks.foreach(snapshots) + .retry(3) + .suppressFailureWhenFinished() + .onFailure( + (snapshot, exc) -> + LOG.warn( + "Failed on snapshot {} while reading manifest list: {}", + snapshot.snapshotId(), + snapshot.manifestListLocation(), + exc)) .run( snapshot -> { try (CloseableIterable manifests = readManifestFiles(snapshot)) { @@ -387,23 +403,29 @@ private void removeExpiredFiles(List snapshots, Set validIds, Se validManifests.add(manifest.path()); long snapshotId = manifest.snapshotId(); - // whether the manifest was created by a valid snapshot (true) or an expired snapshot (false) + // whether the manifest was created by a valid snapshot (true) or an expired + // snapshot (false) boolean fromValidSnapshots = validIds.contains(snapshotId); - // whether the snapshot that created the manifest was an ancestor of the table state + // whether the snapshot that created the manifest was an ancestor of the table + // state boolean isFromAncestor = ancestorIds.contains(snapshotId); - // whether the changes in this snapshot have been picked into the current table state + // whether the changes in this snapshot have been picked into the current table + // state boolean isPicked = pickedAncestorSnapshotIds.contains(snapshotId); // if the snapshot that wrote this manifest is no longer valid (has expired), - // then delete its deleted files. note that this is only for expired snapshots that are in the + // then delete its deleted files. 
note that this is only for expired snapshots + // that are in the // current table state - if (!fromValidSnapshots && (isFromAncestor || isPicked) && manifest.hasDeletedFiles()) { + if (!fromValidSnapshots + && (isFromAncestor || isPicked) + && manifest.hasDeletedFiles()) { manifestsToScan.add(manifest.copy()); } } } catch (IOException e) { - throw new RuntimeIOException(e, - "Failed to close manifest list: %s", snapshot.manifestListLocation()); + throw new RuntimeIOException( + e, "Failed to close manifest list: %s", snapshot.manifestListLocation()); } }); @@ -411,36 +433,48 @@ private void removeExpiredFiles(List snapshots, Set validIds, Se Set manifestListsToDelete = Sets.newHashSet(); Set manifestsToDelete = Sets.newHashSet(); Set manifestsToRevert = Sets.newHashSet(); - Tasks.foreach(base.snapshots()).retry(3).suppressFailureWhenFinished() - .onFailure((snapshot, exc) -> - LOG.warn("Failed on snapshot {} while reading manifest list: {}", snapshot.snapshotId(), - snapshot.manifestListLocation(), exc)) + Tasks.foreach(base.snapshots()) + .retry(3) + .suppressFailureWhenFinished() + .onFailure( + (snapshot, exc) -> + LOG.warn( + "Failed on snapshot {} while reading manifest list: {}", + snapshot.snapshotId(), + snapshot.manifestListLocation(), + exc)) .run( snapshot -> { long snapshotId = snapshot.snapshotId(); if (!validIds.contains(snapshotId)) { // determine whether the changes in this snapshot are in the current table state if (pickedAncestorSnapshotIds.contains(snapshotId)) { - // this snapshot was cherry-picked into the current table state, so skip cleaning it up. + // this snapshot was cherry-picked into the current table state, so skip cleaning + // it up. // its changes will expire when the picked snapshot expires. // A -- C -- D (source=B) // `- B <-- this commit return; } - long sourceSnapshotId = PropertyUtil.propertyAsLong( - snapshot.summary(), SnapshotSummary.SOURCE_SNAPSHOT_ID_PROP, -1); + long sourceSnapshotId = + PropertyUtil.propertyAsLong( + snapshot.summary(), SnapshotSummary.SOURCE_SNAPSHOT_ID_PROP, -1); if (ancestorIds.contains(sourceSnapshotId)) { - // this commit was cherry-picked from a commit that is in the current table state. do not clean up its - // changes because it would revert data file additions that are in the current table. + // this commit was cherry-picked from a commit that is in the current table state. + // do not clean up its + // changes because it would revert data file additions that are in the current + // table. // A -- B -- C // `- D (source=B) <-- this commit return; } if (pickedAncestorSnapshotIds.contains(sourceSnapshotId)) { - // this commit was cherry-picked from a commit that is in the current table state. do not clean up its - // changes because it would revert data file additions that are in the current table. + // this commit was cherry-picked from a commit that is in the current table state. + // do not clean up its + // changes because it would revert data file additions that are in the current + // table. // A -- C -- E (source=B) // `- B `- D (source=B) <-- this commit return; @@ -456,28 +490,37 @@ private void removeExpiredFiles(List snapshots, Set validIds, Se boolean isFromExpiringSnapshot = expiredIds.contains(manifest.snapshotId()); if (isFromAncestor && manifest.hasDeletedFiles()) { - // Only delete data files that were deleted in by an expired snapshot if that - // snapshot is an ancestor of the current table state. 
Otherwise, a snapshot that - // deleted files and was rolled back will delete files that could be in the current + // Only delete data files that were deleted in by an expired snapshot if + // that + // snapshot is an ancestor of the current table state. Otherwise, a snapshot + // that + // deleted files and was rolled back will delete files that could be in the + // current // table state. manifestsToScan.add(manifest.copy()); } if (!isFromAncestor && isFromExpiringSnapshot && manifest.hasAddedFiles()) { - // Because the manifest was written by a snapshot that is not an ancestor of the - // current table state, the files added in this manifest can be removed. The extra - // check whether the manifest was written by a known snapshot that was expired in - // this commit ensures that the full ancestor list between when the snapshot was - // written and this expiration is known and there is no missing history. If history - // were missing, then the snapshot could be an ancestor of the table state but the + // Because the manifest was written by a snapshot that is not an ancestor of + // the + // current table state, the files added in this manifest can be removed. The + // extra + // check whether the manifest was written by a known snapshot that was + // expired in + // this commit ensures that the full ancestor list between when the snapshot + // was + // written and this expiration is known and there is no missing history. If + // history + // were missing, then the snapshot could be an ancestor of the table state + // but the // ancestor ID set would not contain it and this would be unsafe. manifestsToRevert.add(manifest.copy()); } } } } catch (IOException e) { - throw new RuntimeIOException(e, - "Failed to close manifest list: %s", snapshot.manifestListLocation()); + throw new RuntimeIOException( + e, "Failed to close manifest list: %s", snapshot.manifestListLocation()); } // add the manifest list to the delete set, if present @@ -490,80 +533,100 @@ private void removeExpiredFiles(List snapshots, Set validIds, Se deleteMetadataFiles(manifestsToDelete, manifestListsToDelete); } - private void deleteMetadataFiles(Set manifestsToDelete, Set manifestListsToDelete) { + private void deleteMetadataFiles( + Set manifestsToDelete, Set manifestListsToDelete) { LOG.warn("Manifests to delete: {}", Joiner.on(", ").join(manifestsToDelete)); LOG.warn("Manifests Lists to delete: {}", Joiner.on(", ").join(manifestListsToDelete)); Tasks.foreach(manifestsToDelete) .executeWith(deleteExecutorService) - .retry(3).stopRetryOn(NotFoundException.class).suppressFailureWhenFinished() + .retry(3) + .stopRetryOn(NotFoundException.class) + .suppressFailureWhenFinished() .onFailure((manifest, exc) -> LOG.warn("Delete failed for manifest: {}", manifest, exc)) .run(deleteFunc::accept); Tasks.foreach(manifestListsToDelete) .executeWith(deleteExecutorService) - .retry(3).stopRetryOn(NotFoundException.class).suppressFailureWhenFinished() + .retry(3) + .stopRetryOn(NotFoundException.class) + .suppressFailureWhenFinished() .onFailure((list, exc) -> LOG.warn("Delete failed for manifest list: {}", list, exc)) .run(deleteFunc::accept); } - private void deleteDataFiles(Set manifestsToScan, Set manifestsToRevert, - Set validIds) { + private void deleteDataFiles( + Set manifestsToScan, Set manifestsToRevert, Set validIds) { Set filesToDelete = findFilesToDelete(manifestsToScan, manifestsToRevert, validIds); Tasks.foreach(filesToDelete) .executeWith(deleteExecutorService) - 
.retry(3).stopRetryOn(NotFoundException.class).suppressFailureWhenFinished() + .retry(3) + .stopRetryOn(NotFoundException.class) + .suppressFailureWhenFinished() .onFailure((file, exc) -> LOG.warn("Delete failed for data file: {}", file, exc)) .run(file -> deleteFunc.accept(file)); } - private Set findFilesToDelete(Set manifestsToScan, Set manifestsToRevert, - Set validIds) { + private Set findFilesToDelete( + Set manifestsToScan, Set manifestsToRevert, Set validIds) { Set filesToDelete = ConcurrentHashMap.newKeySet(); Tasks.foreach(manifestsToScan) - .retry(3).suppressFailureWhenFinished() + .retry(3) + .suppressFailureWhenFinished() .executeWith(planExecutorService) - .onFailure((item, exc) -> LOG.warn("Failed to get deleted files: this may cause orphaned data files", exc)) - .run(manifest -> { - // the manifest has deletes, scan it to find files to delete - try (ManifestReader reader = ManifestFiles.open(manifest, ops.io(), ops.current().specsById())) { - for (ManifestEntry entry : reader.entries()) { - // if the snapshot ID of the DELETE entry is no longer valid, the data can be deleted - if (entry.status() == ManifestEntry.Status.DELETED && - !validIds.contains(entry.snapshotId())) { - // use toString to ensure the path will not change (Utf8 is reused) - filesToDelete.add(entry.file().path().toString()); + .onFailure( + (item, exc) -> + LOG.warn("Failed to get deleted files: this may cause orphaned data files", exc)) + .run( + manifest -> { + // the manifest has deletes, scan it to find files to delete + try (ManifestReader reader = + ManifestFiles.open(manifest, ops.io(), ops.current().specsById())) { + for (ManifestEntry entry : reader.entries()) { + // if the snapshot ID of the DELETE entry is no longer valid, the data can be + // deleted + if (entry.status() == ManifestEntry.Status.DELETED + && !validIds.contains(entry.snapshotId())) { + // use toString to ensure the path will not change (Utf8 is reused) + filesToDelete.add(entry.file().path().toString()); + } + } + } catch (IOException e) { + throw new RuntimeIOException(e, "Failed to read manifest file: %s", manifest); } - } - } catch (IOException e) { - throw new RuntimeIOException(e, "Failed to read manifest file: %s", manifest); - } - }); + }); Tasks.foreach(manifestsToRevert) - .retry(3).suppressFailureWhenFinished() + .retry(3) + .suppressFailureWhenFinished() .executeWith(planExecutorService) - .onFailure((item, exc) -> LOG.warn("Failed to get added files: this may cause orphaned data files", exc)) - .run(manifest -> { - // the manifest has deletes, scan it to find files to delete - try (ManifestReader reader = ManifestFiles.open(manifest, ops.io(), ops.current().specsById())) { - for (ManifestEntry entry : reader.entries()) { - // delete any ADDED file from manifests that were reverted - if (entry.status() == ManifestEntry.Status.ADDED) { - // use toString to ensure the path will not change (Utf8 is reused) - filesToDelete.add(entry.file().path().toString()); + .onFailure( + (item, exc) -> + LOG.warn("Failed to get added files: this may cause orphaned data files", exc)) + .run( + manifest -> { + // the manifest has deletes, scan it to find files to delete + try (ManifestReader reader = + ManifestFiles.open(manifest, ops.io(), ops.current().specsById())) { + for (ManifestEntry entry : reader.entries()) { + // delete any ADDED file from manifests that were reverted + if (entry.status() == ManifestEntry.Status.ADDED) { + // use toString to ensure the path will not change (Utf8 is reused) + 
filesToDelete.add(entry.file().path().toString()); + } + } + } catch (IOException e) { + throw new RuntimeIOException(e, "Failed to read manifest file: %s", manifest); } - } - } catch (IOException e) { - throw new RuntimeIOException(e, "Failed to read manifest file: %s", manifest); - } - }); + }); return filesToDelete; } - private static final Schema MANIFEST_PROJECTION = ManifestFile.schema() - .select("manifest_path", "manifest_length", "added_snapshot_id", "deleted_data_files_count"); + private static final Schema MANIFEST_PROJECTION = + ManifestFile.schema() + .select( + "manifest_path", "manifest_length", "added_snapshot_id", "deleted_data_files_count"); private CloseableIterable readManifestFiles(Snapshot snapshot) { if (snapshot.manifestListLocation() != null) { diff --git a/core/src/main/java/org/apache/iceberg/RollbackToSnapshot.java b/core/src/main/java/org/apache/iceberg/RollbackToSnapshot.java index 3497d83182a1..499f6898a92f 100644 --- a/core/src/main/java/org/apache/iceberg/RollbackToSnapshot.java +++ b/core/src/main/java/org/apache/iceberg/RollbackToSnapshot.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg; class RollbackToSnapshot extends SnapshotManager implements Rollback { diff --git a/core/src/main/java/org/apache/iceberg/RowLevelOperationMode.java b/core/src/main/java/org/apache/iceberg/RowLevelOperationMode.java index 6d08ec557fc7..55f345bb663a 100644 --- a/core/src/main/java/org/apache/iceberg/RowLevelOperationMode.java +++ b/core/src/main/java/org/apache/iceberg/RowLevelOperationMode.java @@ -16,29 +16,30 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg; import org.apache.iceberg.relocated.com.google.common.base.Preconditions; /** * Iceberg supports two ways to modify records in a table: copy-on-write and merge-on-read. - *

- * In copy-on-write, changes are materialized immediately and matching data files are replaced with - * new data files that represent the new table state. For example, if there is a record that - * has to be deleted, the data file that contains this record has to be replaced with another - * data file without that record. All unchanged rows have to be copied over to the new data file. - *
- * In merge-on-read, changes aren't materialized immediately. Instead, IDs of deleted and updated + * + *
In copy-on-write, changes are materialized immediately and matching data files are replaced + * with new data files that represent the new table state. For example, if there is a record that + * has to be deleted, the data file that contains this record has to be replaced with another data + * file without that record. All unchanged rows have to be copied over to the new data file. + * + *
In merge-on-read, changes aren't materialized immediately. Instead, IDs of deleted and updated * records are written into delete files that are applied during reads and updated/inserted records * are written into new data files that are committed together with the delete files. - *
- * Copy-on-write changes tend to consume more time and resources during writes but don't introduce any - * performance overhead during reads. Merge-on-read operations, on the other hand, tend to be much - * faster during writes but require more time and resources to apply delete files during reads. + * + *
Copy-on-write changes tend to consume more time and resources during writes but don't + * introduce any performance overhead during reads. Merge-on-read operations, on the other hand, + * tend to be much faster during writes but require more time and resources to apply delete files + * during reads. */ public enum RowLevelOperationMode { - COPY_ON_WRITE, MERGE_ON_READ; + COPY_ON_WRITE, + MERGE_ON_READ; public static RowLevelOperationMode fromName(String modeName) { Preconditions.checkArgument(modeName != null, "Mode name is null"); diff --git a/core/src/main/java/org/apache/iceberg/ScanSummary.java b/core/src/main/java/org/apache/iceberg/ScanSummary.java index 6ae9d1eceb45..7b8cf22e9a67 100644 --- a/core/src/main/java/org/apache/iceberg/ScanSummary.java +++ b/core/src/main/java/org/apache/iceberg/ScanSummary.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg; import java.io.IOException; @@ -49,11 +48,10 @@ import org.apache.iceberg.util.Pair; public class ScanSummary { - private ScanSummary() { - } + private ScanSummary() {} - private static final ImmutableList SCAN_SUMMARY_COLUMNS = ImmutableList.of( - "partition", "record_count", "file_size_in_bytes"); + private static final ImmutableList SCAN_SUMMARY_COLUMNS = + ImmutableList.of("partition", "record_count", "file_size_in_bytes"); /** * Create a scan summary builder for a table scan. @@ -66,8 +64,8 @@ public static ScanSummary.Builder of(TableScan scan) { } public static class Builder { - private static final Set TIMESTAMP_NAMES = Sets.newHashSet( - "dateCreated", "lastUpdated"); + private static final Set TIMESTAMP_NAMES = + Sets.newHashSet("dateCreated", "lastUpdated"); private final TableScan scan; private final Table table; private final TableOperations ops; @@ -186,8 +184,8 @@ public Map build() { // if oldest known snapshot is in the range, then there may be an expired snapshot that has // been removed that matched the range. because the timestamp of that snapshot is unknown, // it can't be included in the results and the results are not reliable. - if (snapshotsInTimeRange.contains(oldestSnapshot.snapshotId()) && - minTimestamp < oldestSnapshot.timestampMillis()) { + if (snapshotsInTimeRange.contains(oldestSnapshot.snapshotId()) + && minTimestamp < oldestSnapshot.timestampMillis()) { throw new IllegalArgumentException( "Cannot satisfy time filters: time range may include expired snapshots"); } @@ -196,19 +194,25 @@ public Map build() { // time range. manifests after the end of the time range must be included because // compaction may create a manifest after the time range that includes files added in the // range. - manifests = Iterables.filter(manifests, manifest -> { - if (manifest.snapshotId() == null) { - return true; // can't tell when the manifest was written, so it may contain matches - } - - Long timestamp = snapshotTimestamps.get(manifest.snapshotId()); - // if the timestamp is null, then its snapshot has expired. the check for the oldest - // snapshot ensures that all expired snapshots are not in the time range. - return timestamp != null && timestamp >= minTimestamp; - }); + manifests = + Iterables.filter( + manifests, + manifest -> { + if (manifest.snapshotId() == null) { + return true; // can't tell when the manifest was written, so it may contain + // matches + } + + Long timestamp = snapshotTimestamps.get(manifest.snapshotId()); + // if the timestamp is null, then its snapshot has expired. 
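Relating back to the RowLevelOperationMode javadoc reformatted a few hunks above, a small sketch of how a mode name is typically resolved for a table is shown here; the "write.delete.mode" property name and its "copy-on-write" default are assumptions for illustration, not something this diff defines.

import org.apache.iceberg.RowLevelOperationMode;
import org.apache.iceberg.Table;

class RowLevelModeSketch {
  // Illustrative only: resolves a mode name with RowLevelOperationMode.fromName.
  // The "write.delete.mode" key and "copy-on-write" default are assumed, not part of this diff.
  static RowLevelOperationMode deleteMode(Table table) {
    String name = table.properties().getOrDefault("write.delete.mode", "copy-on-write");
    return RowLevelOperationMode.fromName(name); // COPY_ON_WRITE or MERGE_ON_READ
  }
}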
the check for the + // oldest + // snapshot ensures that all expired snapshots are not in the time range. + return timestamp != null && timestamp >= minTimestamp; + }); } - return computeTopPartitionMetrics(rowFilter, manifests, filterByTimestamp, snapshotsInTimeRange); + return computeTopPartitionMetrics( + rowFilter, manifests, filterByTimestamp, snapshotsInTimeRange); } private Map computeTopPartitionMetrics( @@ -216,15 +220,16 @@ private Map computeTopPartitionMetrics( Iterable manifests, boolean filterByTimestamp, Set snapshotsInTimeRange) { - TopN topN = new TopN<>( - limit, throwIfLimited, Comparators.charSequences()); + TopN topN = + new TopN<>(limit, throwIfLimited, Comparators.charSequences()); - try (CloseableIterable> entries = new ManifestGroup(ops.io(), manifests) - .specsById(ops.current().specsById()) - .filterData(rowFilter) - .ignoreDeleted() - .select(SCAN_SUMMARY_COLUMNS) - .entries()) { + try (CloseableIterable> entries = + new ManifestGroup(ops.io(), manifests) + .specsById(ops.current().specsById()) + .filterData(rowFilter) + .ignoreDeleted() + .select(SCAN_SUMMARY_COLUMNS) + .entries()) { PartitionSpec spec = table.spec(); for (ManifestEntry entry : entries) { @@ -236,8 +241,11 @@ private Map computeTopPartitionMetrics( } String partition = spec.partitionToPath(entry.file().partition()); - topN.update(partition, metrics -> (metrics == null ? new PartitionMetrics() : metrics) - .updateFromFile(entry.file(), timestamp)); + topN.update( + partition, + metrics -> + (metrics == null ? new PartitionMetrics() : metrics) + .updateFromFile(entry.file(), timestamp)); } } catch (IOException e) { @@ -270,12 +278,13 @@ public Long dataTimestampMillis() { return dataTimestampMillis; } - PartitionMetrics updateFromCounts(int numFiles, long filesRecordCount, long filesSize, - Long timestampMillis) { + PartitionMetrics updateFromCounts( + int numFiles, long filesRecordCount, long filesSize, Long timestampMillis) { this.fileCount += numFiles; this.recordCount += filesRecordCount; this.totalSize += filesSize; - if (timestampMillis != null && (dataTimestampMillis == null || dataTimestampMillis < timestampMillis)) { + if (timestampMillis != null + && (dataTimestampMillis == null || dataTimestampMillis < timestampMillis)) { this.dataTimestampMillis = timestampMillis; } return this; @@ -285,8 +294,8 @@ private PartitionMetrics updateFromFile(ContentFile file, Long timestampMilli this.fileCount += 1; this.recordCount += file.recordCount(); this.totalSize += file.fileSizeInBytes(); - if (timestampMillis != null && - (dataTimestampMillis == null || dataTimestampMillis < timestampMillis)) { + if (timestampMillis != null + && (dataTimestampMillis == null || dataTimestampMillis < timestampMillis)) { this.dataTimestampMillis = timestampMillis; } return this; @@ -294,12 +303,19 @@ private PartitionMetrics updateFromFile(ContentFile file, Long timestampMilli @Override public String toString() { - String dataTimestamp = dataTimestampMillis != null ? - DateTimeUtil.formatTimestampMillis(dataTimestampMillis) : null; - return "PartitionMetrics(fileCount=" + fileCount + - ", recordCount=" + recordCount + - ", totalSize=" + totalSize + - ", dataTimestamp=" + dataTimestamp + ")"; + String dataTimestamp = + dataTimestampMillis != null + ? 
DateTimeUtil.formatTimestampMillis(dataTimestampMillis) + : null; + return "PartitionMetrics(fileCount=" + + fileCount + + ", recordCount=" + + recordCount + + ", totalSize=" + + totalSize + + ", dataTimestamp=" + + dataTimestamp + + ")"; } } diff --git a/core/src/main/java/org/apache/iceberg/SchemaParser.java b/core/src/main/java/org/apache/iceberg/SchemaParser.java index a270e217bd2d..8cbc255d49bf 100644 --- a/core/src/main/java/org/apache/iceberg/SchemaParser.java +++ b/core/src/main/java/org/apache/iceberg/SchemaParser.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg; import com.fasterxml.jackson.core.JsonGenerator; @@ -37,8 +36,7 @@ public class SchemaParser { - private SchemaParser() { - } + private SchemaParser() {} private static final String SCHEMA_ID = "schema-id"; private static final String IDENTIFIER_FIELD_IDS = "identifier-field-ids"; @@ -64,8 +62,12 @@ private static void toJson(Types.StructType struct, JsonGenerator generator) thr toJson(struct, null, null, generator); } - private static void toJson(Types.StructType struct, Integer schemaId, Set identifierFieldIds, - JsonGenerator generator) throws IOException { + private static void toJson( + Types.StructType struct, + Integer schemaId, + Set identifierFieldIds, + JsonGenerator generator) + throws IOException { generator.writeStartObject(); generator.writeStringField(TYPE, STRUCT); @@ -185,7 +187,7 @@ private static Type typeFromJson(JsonNode json) { } else if (json.isObject()) { JsonNode typeObj = json.get(TYPE); if (typeObj != null) { - String type = typeObj.asText(); + String type = typeObj.asText(); if (STRUCT.equals(type)) { return structFromJson(json); } else if (LIST.equals(type)) { @@ -201,15 +203,15 @@ private static Type typeFromJson(JsonNode json) { private static Types.StructType structFromJson(JsonNode json) { JsonNode fieldArray = json.get(FIELDS); - Preconditions.checkArgument(fieldArray.isArray(), - "Cannot parse struct fields from non-array: %s", fieldArray); + Preconditions.checkArgument( + fieldArray.isArray(), "Cannot parse struct fields from non-array: %s", fieldArray); List fields = Lists.newArrayListWithExpectedSize(fieldArray.size()); Iterator iterator = fieldArray.elements(); while (iterator.hasNext()) { JsonNode field = iterator.next(); - Preconditions.checkArgument(field.isObject(), - "Cannot parse struct field from non-object: %s", field); + Preconditions.checkArgument( + field.isObject(), "Cannot parse struct field from non-object: %s", field); int id = JsonUtil.getInt(ID, field); String name = JsonUtil.getString(NAME, field); @@ -227,7 +229,6 @@ private static Types.StructType structFromJson(JsonNode json) { return Types.StructType.of(fields); } - private static Types.ListType listFromJson(JsonNode json) { int elementId = JsonUtil.getInt(ELEMENT_ID, json); Type elementType = typeFromJson(json.get(ELEMENT)); @@ -258,8 +259,10 @@ private static Types.MapType mapFromJson(JsonNode json) { public static Schema fromJson(JsonNode json) { Type type = typeFromJson(json); - Preconditions.checkArgument(type.isNestedType() && type.asNestedType().isStructType(), - "Cannot create schema, not a struct type: %s", type); + Preconditions.checkArgument( + type.isNestedType() && type.asNestedType().isStructType(), + "Cannot create schema, not a struct type: %s", + type); Integer schemaId = JsonUtil.getIntOrNull(SCHEMA_ID, json); Set identifierFieldIds = JsonUtil.getIntegerSetOrNull(IDENTIFIER_FIELD_IDS, json); @@ -270,17 +273,18 
@@ public static Schema fromJson(JsonNode json) { } } - private static final Cache SCHEMA_CACHE = Caffeine.newBuilder() - .weakValues() - .build(); + private static final Cache SCHEMA_CACHE = + Caffeine.newBuilder().weakValues().build(); public static Schema fromJson(String json) { - return SCHEMA_CACHE.get(json, jsonKey -> { - try { - return fromJson(JsonUtil.mapper().readValue(jsonKey, JsonNode.class)); - } catch (IOException e) { - throw new RuntimeIOException(e); - } - }); + return SCHEMA_CACHE.get( + json, + jsonKey -> { + try { + return fromJson(JsonUtil.mapper().readValue(jsonKey, JsonNode.class)); + } catch (IOException e) { + throw new RuntimeIOException(e); + } + }); } } diff --git a/core/src/main/java/org/apache/iceberg/SchemaUpdate.java b/core/src/main/java/org/apache/iceberg/SchemaUpdate.java index 753c7811c1e3..0d6ed9f6e37c 100644 --- a/core/src/main/java/org/apache/iceberg/SchemaUpdate.java +++ b/core/src/main/java/org/apache/iceberg/SchemaUpdate.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg; import java.util.Collection; @@ -43,9 +42,7 @@ import org.slf4j.Logger; import org.slf4j.LoggerFactory; -/** - * Schema evolution API implementation. - */ +/** Schema evolution API implementation. */ class SchemaUpdate implements UpdateSchema { private static final Logger LOG = LoggerFactory.getLogger(SchemaUpdate.class); private static final int TABLE_ROOT_ID = -1; @@ -59,7 +56,8 @@ class SchemaUpdate implements UpdateSchema { private final Multimap adds = Multimaps.newListMultimap(Maps.newHashMap(), Lists::newArrayList); private final Map addedNameToId = Maps.newHashMap(); - private final Multimap moves = Multimaps.newListMultimap(Maps.newHashMap(), Lists::newArrayList); + private final Multimap moves = + Multimaps.newListMultimap(Maps.newHashMap(), Lists::newArrayList); private int lastColumnId; private boolean allowIncompatibleChanges = false; private Set identifierFieldNames; @@ -68,9 +66,7 @@ class SchemaUpdate implements UpdateSchema { this(ops, ops.current()); } - /** - * For testing only. - */ + /** For testing only. 
*/ SchemaUpdate(Schema schema, int lastColumnId) { this(null, null, schema, lastColumnId); } @@ -96,8 +92,10 @@ public SchemaUpdate allowIncompatibleChanges() { @Override public UpdateSchema addColumn(String name, Type type, String doc) { - Preconditions.checkArgument(!name.contains("."), - "Cannot add column with ambiguous name: %s, use addColumn(parent, name, type)", name); + Preconditions.checkArgument( + !name.contains("."), + "Cannot add column with ambiguous name: %s, use addColumn(parent, name, type)", + name); return addColumn(null, name, type, doc); } @@ -109,21 +107,24 @@ public UpdateSchema addColumn(String parent, String name, Type type, String doc) @Override public UpdateSchema addRequiredColumn(String name, Type type, String doc) { - Preconditions.checkArgument(!name.contains("."), - "Cannot add column with ambiguous name: %s, use addColumn(parent, name, type)", name); + Preconditions.checkArgument( + !name.contains("."), + "Cannot add column with ambiguous name: %s, use addColumn(parent, name, type)", + name); addRequiredColumn(null, name, type, doc); return this; } @Override public UpdateSchema addRequiredColumn(String parent, String name, Type type, String doc) { - Preconditions.checkArgument(allowIncompatibleChanges, - "Incompatible change: cannot add required column: %s", name); + Preconditions.checkArgument( + allowIncompatibleChanges, "Incompatible change: cannot add required column: %s", name); internalAddColumn(parent, name, false, type, doc); return this; } - private void internalAddColumn(String parent, String name, boolean isOptional, Type type, String doc) { + private void internalAddColumn( + String parent, String name, boolean isOptional, Type type, String doc) { int parentId = TABLE_ROOT_ID; String fullName; if (parent != null) { @@ -142,18 +143,25 @@ private void internalAddColumn(String parent, String name, boolean isOptional, T } Preconditions.checkArgument( parentField.type().isNestedType() && parentField.type().asNestedType().isStructType(), - "Cannot add to non-struct column: %s: %s", parent, parentField.type()); + "Cannot add to non-struct column: %s: %s", + parent, + parentField.type()); parentId = parentField.fieldId(); Types.NestedField currentField = schema.findField(parent + "." + name); - Preconditions.checkArgument(!deletes.contains(parentId), - "Cannot add to a column that will be deleted: %s", parent); - Preconditions.checkArgument(currentField == null || deletes.contains(currentField.fieldId()), - "Cannot add column, name already exists: %s.%s", parent, name); + Preconditions.checkArgument( + !deletes.contains(parentId), "Cannot add to a column that will be deleted: %s", parent); + Preconditions.checkArgument( + currentField == null || deletes.contains(currentField.fieldId()), + "Cannot add column, name already exists: %s.%s", + parent, + name); fullName = schema.findColumnName(parentId) + "." 
+ name; } else { Types.NestedField currentField = schema.findField(name); - Preconditions.checkArgument(currentField == null || deletes.contains(currentField.fieldId()), - "Cannot add column, name already exists: %s", name); + Preconditions.checkArgument( + currentField == null || deletes.contains(currentField.fieldId()), + "Cannot add column, name already exists: %s", + name); fullName = name; } @@ -166,18 +174,20 @@ private void internalAddColumn(String parent, String name, boolean isOptional, T idToParent.put(newId, parentId); } - adds.put(parentId, Types.NestedField.of(newId, isOptional, name, - TypeUtil.assignFreshIds(type, this::assignNewColumnId), doc)); + adds.put( + parentId, + Types.NestedField.of( + newId, isOptional, name, TypeUtil.assignFreshIds(type, this::assignNewColumnId), doc)); } @Override public UpdateSchema deleteColumn(String name) { Types.NestedField field = schema.findField(name); Preconditions.checkArgument(field != null, "Cannot delete missing column: %s", name); - Preconditions.checkArgument(!adds.containsKey(field.fieldId()), - "Cannot delete a column that has additions: %s", name); - Preconditions.checkArgument(!updates.containsKey(field.fieldId()), - "Cannot delete a column that has updates: %s", name); + Preconditions.checkArgument( + !adds.containsKey(field.fieldId()), "Cannot delete a column that has additions: %s", name); + Preconditions.checkArgument( + !updates.containsKey(field.fieldId()), "Cannot delete a column that has updates: %s", name); deletes.add(field.fieldId()); return this; @@ -188,16 +198,22 @@ public UpdateSchema renameColumn(String name, String newName) { Types.NestedField field = schema.findField(name); Preconditions.checkArgument(field != null, "Cannot rename missing column: %s", name); Preconditions.checkArgument(newName != null, "Cannot rename a column to null"); - Preconditions.checkArgument(!deletes.contains(field.fieldId()), - "Cannot rename a column that will be deleted: %s", field.name()); + Preconditions.checkArgument( + !deletes.contains(field.fieldId()), + "Cannot rename a column that will be deleted: %s", + field.name()); // merge with an update, if present int fieldId = field.fieldId(); Types.NestedField update = updates.get(fieldId); if (update != null) { - updates.put(fieldId, Types.NestedField.of(fieldId, update.isOptional(), newName, update.type(), update.doc())); + updates.put( + fieldId, + Types.NestedField.of(fieldId, update.isOptional(), newName, update.type(), update.doc())); } else { - updates.put(fieldId, Types.NestedField.of(fieldId, field.isOptional(), newName, field.type(), field.doc())); + updates.put( + fieldId, + Types.NestedField.of(fieldId, field.isOptional(), newName, field.type(), field.doc())); } if (identifierFieldNames.contains(name)) { @@ -229,18 +245,26 @@ private void internalUpdateColumnRequirement(String name, boolean isOptional) { return; } - Preconditions.checkArgument(isOptional || allowIncompatibleChanges, - "Cannot change column nullability: %s: optional -> required", name); - Preconditions.checkArgument(!deletes.contains(field.fieldId()), - "Cannot update a column that will be deleted: %s", field.name()); + Preconditions.checkArgument( + isOptional || allowIncompatibleChanges, + "Cannot change column nullability: %s: optional -> required", + name); + Preconditions.checkArgument( + !deletes.contains(field.fieldId()), + "Cannot update a column that will be deleted: %s", + field.name()); int fieldId = field.fieldId(); Types.NestedField update = updates.get(fieldId); if (update != null) { - 
updates.put(fieldId, Types.NestedField.of(fieldId, isOptional, update.name(), update.type(), update.doc())); + updates.put( + fieldId, + Types.NestedField.of(fieldId, isOptional, update.name(), update.type(), update.doc())); } else { - updates.put(fieldId, Types.NestedField.of(fieldId, isOptional, field.name(), field.type(), field.doc())); + updates.put( + fieldId, + Types.NestedField.of(fieldId, isOptional, field.name(), field.type(), field.doc())); } } @@ -248,23 +272,33 @@ private void internalUpdateColumnRequirement(String name, boolean isOptional) { public UpdateSchema updateColumn(String name, Type.PrimitiveType newType) { Types.NestedField field = schema.findField(name); Preconditions.checkArgument(field != null, "Cannot update missing column: %s", name); - Preconditions.checkArgument(!deletes.contains(field.fieldId()), - "Cannot update a column that will be deleted: %s", field.name()); + Preconditions.checkArgument( + !deletes.contains(field.fieldId()), + "Cannot update a column that will be deleted: %s", + field.name()); if (field.type().equals(newType)) { return this; } - Preconditions.checkArgument(TypeUtil.isPromotionAllowed(field.type(), newType), - "Cannot change column type: %s: %s -> %s", name, field.type(), newType); + Preconditions.checkArgument( + TypeUtil.isPromotionAllowed(field.type(), newType), + "Cannot change column type: %s: %s -> %s", + name, + field.type(), + newType); // merge with a rename, if present int fieldId = field.fieldId(); Types.NestedField update = updates.get(fieldId); if (update != null) { - updates.put(fieldId, Types.NestedField.of(fieldId, update.isOptional(), update.name(), newType, update.doc())); + updates.put( + fieldId, + Types.NestedField.of(fieldId, update.isOptional(), update.name(), newType, update.doc())); } else { - updates.put(fieldId, Types.NestedField.of(fieldId, field.isOptional(), field.name(), newType, field.doc())); + updates.put( + fieldId, + Types.NestedField.of(fieldId, field.isOptional(), field.name(), newType, field.doc())); } return this; @@ -274,8 +308,10 @@ public UpdateSchema updateColumn(String name, Type.PrimitiveType newType) { public UpdateSchema updateColumnDoc(String name, String doc) { Types.NestedField field = schema.findField(name); Preconditions.checkArgument(field != null, "Cannot update missing column: %s", name); - Preconditions.checkArgument(!deletes.contains(field.fieldId()), - "Cannot update a column that will be deleted: %s", field.name()); + Preconditions.checkArgument( + !deletes.contains(field.fieldId()), + "Cannot update a column that will be deleted: %s", + field.name()); if (Objects.equals(field.doc(), doc)) { return this; @@ -285,9 +321,13 @@ public UpdateSchema updateColumnDoc(String name, String doc) { int fieldId = field.fieldId(); Types.NestedField update = updates.get(fieldId); if (update != null) { - updates.put(fieldId, Types.NestedField.of(fieldId, update.isOptional(), update.name(), update.type(), doc)); + updates.put( + fieldId, + Types.NestedField.of(fieldId, update.isOptional(), update.name(), update.type(), doc)); } else { - updates.put(fieldId, Types.NestedField.of(fieldId, field.isOptional(), field.name(), field.type(), doc)); + updates.put( + fieldId, + Types.NestedField.of(fieldId, field.isOptional(), field.name(), field.type(), doc)); } return this; @@ -306,7 +346,8 @@ public UpdateSchema moveBefore(String name, String beforeName) { Integer fieldId = findForMove(name); Preconditions.checkArgument(fieldId != null, "Cannot move missing column: %s", name); Integer beforeId = 
findForMove(beforeName); - Preconditions.checkArgument(beforeId != null, "Cannot move %s before missing column: %s", name, beforeName); + Preconditions.checkArgument( + beforeId != null, "Cannot move %s before missing column: %s", name, beforeName); Preconditions.checkArgument(!fieldId.equals(beforeId), "Cannot move %s before itself", name); internalMove(name, Move.before(fieldId, beforeId)); return this; @@ -317,7 +358,8 @@ public UpdateSchema moveAfter(String name, String afterName) { Integer fieldId = findForMove(name); Preconditions.checkArgument(fieldId != null, "Cannot move missing column: %s", name); Integer afterId = findForMove(afterName); - Preconditions.checkArgument(afterId != null, "Cannot move %s after missing column: %s", name, afterName); + Preconditions.checkArgument( + afterId != null, "Cannot move %s after missing column: %s", name, afterName); Preconditions.checkArgument(!fieldId.equals(afterId), "Cannot move %s after itself", name); internalMove(name, Move.after(fieldId, afterId)); return this; @@ -347,13 +389,14 @@ private void internalMove(String name, Move move) { Integer parentId = idToParent.get(move.fieldId()); if (parentId != null) { Types.NestedField parent = schema.findField(parentId); - Preconditions.checkArgument(parent.type().isStructType(), - "Cannot move fields in non-struct type: %s", parent.type()); + Preconditions.checkArgument( + parent.type().isStructType(), "Cannot move fields in non-struct type: %s", parent.type()); if (move.type() == Move.MoveType.AFTER || move.type() == Move.MoveType.BEFORE) { Preconditions.checkArgument( parentId.equals(idToParent.get(move.referenceFieldId())), - "Cannot move field %s to a different struct", name); + "Cannot move field %s to a different struct", + name); } moves.put(parentId, move); @@ -361,7 +404,8 @@ private void internalMove(String name, Move move) { if (move.type() == Move.MoveType.AFTER || move.type() == Move.MoveType.BEFORE) { Preconditions.checkArgument( idToParent.get(move.referenceFieldId()) == null, - "Cannot move field %s to a different struct", name); + "Cannot move field %s to a different struct", + name); } moves.put(TABLE_ROOT_ID, move); @@ -370,8 +414,8 @@ private void internalMove(String name, Move move) { /** * Apply the pending changes to the original schema and returns the result. - *
- * This does not result in a permanent update. + * + *
This does not result in a permanent update. * * @return the result Schema when all pending updates are applied */ @@ -406,7 +450,8 @@ private TableMetadata applyChangesToMetadata(TableMetadata metadata) { // replace the table property Map updatedProperties = Maps.newHashMap(); updatedProperties.putAll(metadata.properties()); - updatedProperties.put(TableProperties.DEFAULT_NAME_MAPPING, NameMappingParser.toJson(updated)); + updatedProperties.put( + TableProperties.DEFAULT_NAME_MAPPING, NameMappingParser.toJson(updated)); newMetadata = metadata.replaceProperties(updatedProperties); @@ -419,60 +464,70 @@ private TableMetadata applyChangesToMetadata(TableMetadata metadata) { // Transform the metrics if they exist if (base != null && base.properties() != null) { Schema newSchema = newMetadata.schema(); - List deletedColumns = deletes.stream() - .map(schema::findColumnName) - .collect(Collectors.toList()); - Map renamedColumns = updates.keySet().stream() - .filter(id -> !schema.findColumnName(id).equals(newSchema.findColumnName(id))) - .collect(Collectors.toMap(schema::findColumnName, newSchema::findColumnName)); - Map updatedProperties = MetricsConfig.updateProperties( - newMetadata.properties(), deletedColumns, renamedColumns); + List deletedColumns = + deletes.stream().map(schema::findColumnName).collect(Collectors.toList()); + Map renamedColumns = + updates.keySet().stream() + .filter(id -> !schema.findColumnName(id).equals(newSchema.findColumnName(id))) + .collect(Collectors.toMap(schema::findColumnName, newSchema::findColumnName)); + Map updatedProperties = + MetricsConfig.updateProperties(newMetadata.properties(), deletedColumns, renamedColumns); newMetadata = newMetadata.replaceProperties(updatedProperties); } return newMetadata; } - private static Schema applyChanges(Schema schema, List deletes, - Map updates, - Multimap adds, - Multimap moves, - Set identifierFieldNames) { + private static Schema applyChanges( + Schema schema, + List deletes, + Map updates, + Multimap adds, + Multimap moves, + Set identifierFieldNames) { // validate existing identifier fields are not deleted Map idToParent = TypeUtil.indexParents(schema.asStruct()); for (String name : identifierFieldNames) { Types.NestedField field = schema.findField(name); if (field != null) { - Preconditions.checkArgument(!deletes.contains(field.fieldId()), - "Cannot delete identifier field %s. To force deletion, " + - "also call setIdentifierFields to update identifier fields.", field); + Preconditions.checkArgument( + !deletes.contains(field.fieldId()), + "Cannot delete identifier field %s. 
To force deletion, " + + "also call setIdentifierFields to update identifier fields.", + field); Integer parentId = idToParent.get(field.fieldId()); while (parentId != null) { - Preconditions.checkArgument(!deletes.contains(parentId), + Preconditions.checkArgument( + !deletes.contains(parentId), "Cannot delete field %s as it will delete nested identifier field %s", - schema.findField(parentId), field); + schema.findField(parentId), + field); parentId = idToParent.get(parentId); } } } // apply schema changes - Types.StructType struct = TypeUtil - .visit(schema, new ApplyChanges(deletes, updates, adds, moves)) - .asNestedType().asStructType(); + Types.StructType struct = + TypeUtil.visit(schema, new ApplyChanges(deletes, updates, adds, moves)) + .asNestedType() + .asStructType(); // validate identifier requirements based on the latest schema Map nameToId = TypeUtil.indexByName(struct); Set freshIdentifierFieldIds = Sets.newHashSet(); for (String name : identifierFieldNames) { - Preconditions.checkArgument(nameToId.containsKey(name), - "Cannot add field %s as an identifier field: not found in current schema or added columns", name); + Preconditions.checkArgument( + nameToId.containsKey(name), + "Cannot add field %s as an identifier field: not found in current schema or added columns", + name); freshIdentifierFieldIds.add(nameToId.get(name)); } Map idToField = TypeUtil.indexById(struct); - freshIdentifierFieldIds.forEach(id -> Schema.validateIdentifierField(id, idToField, idToParent)); + freshIdentifierFieldIds.forEach( + id -> Schema.validateIdentifierField(id, idToField, idToParent)); return new Schema(struct.fields(), freshIdentifierFieldIds); } @@ -483,10 +538,11 @@ private static class ApplyChanges extends TypeUtil.SchemaVisitor { private final Multimap adds; private final Multimap moves; - private ApplyChanges(List deletes, - Map updates, - Multimap adds, - Multimap moves) { + private ApplyChanges( + List deletes, + Map updates, + Multimap adds, + Multimap moves) { this.deletes = deletes; this.updates = updates; this.adds = adds; @@ -495,8 +551,11 @@ private ApplyChanges(List deletes, @Override public Type schema(Schema schema, Type structResult) { - List fields = addAndMoveFields(structResult.asStructType().fields(), - adds.get(TABLE_ROOT_ID), moves.get(TABLE_ROOT_ID)); + List fields = + addAndMoveFields( + structResult.asStructType().fields(), + adds.get(TABLE_ROOT_ID), + moves.get(TABLE_ROOT_ID)); if (fields != null) { return Types.StructType.of(fields); @@ -527,10 +586,10 @@ public Type struct(Types.StructType struct, List fieldResults) { isOptional = update.isOptional(); } - if (name.equals(field.name()) && - isOptional == field.isOptional() && - field.type() == resultType && - Objects.equals(doc, field.doc())) { + if (name.equals(field.name()) + && isOptional == field.isOptional() + && field.type() == resultType + && Objects.equals(doc, field.doc())) { newFields.add(field); } else { hasChange = true; @@ -566,9 +625,10 @@ public Type field(Types.NestedField field, Type fieldResult) { Collection newFields = adds.get(fieldId); Collection columnsToMove = moves.get(fieldId); if (!newFields.isEmpty() || !columnsToMove.isEmpty()) { - // if either collection is non-null, then this must be a struct type. try to apply the changes - List fields = addAndMoveFields( - fieldResult.asStructType().fields(), newFields, columnsToMove); + // if either collection is non-null, then this must be a struct type. 
try to apply the + // changes + List fields = + addAndMoveFields(fieldResult.asStructType().fields(), newFields, columnsToMove); if (fields != null) { return Types.StructType.of(fields); } @@ -587,7 +647,8 @@ public Type list(Types.ListType list, Type elementResult) { } Types.NestedField elementUpdate = updates.get(elementField.fieldId()); - boolean isElementOptional = elementUpdate != null ? elementUpdate.isOptional() : list.isElementOptional(); + boolean isElementOptional = + elementUpdate != null ? elementUpdate.isOptional() : list.isElementOptional(); if (isElementOptional == elementField.isOptional() && list.elementType() == elementType) { return list; @@ -622,7 +683,8 @@ public Type map(Types.MapType map, Type kResult, Type valueResult) { } Types.NestedField valueUpdate = updates.get(valueField.fieldId()); - boolean isValueOptional = valueUpdate != null ? valueUpdate.isOptional() : map.isValueOptional(); + boolean isValueOptional = + valueUpdate != null ? valueUpdate.isOptional() : map.isValueOptional(); if (isValueOptional == map.isValueOptional() && map.valueType() == valueType) { return map; @@ -641,9 +703,8 @@ public Type primitive(Type.PrimitiveType primitive) { } } - private static List addAndMoveFields(List fields, - Collection adds, - Collection moves) { + private static List addAndMoveFields( + List fields, Collection adds, Collection moves) { if (adds != null && !adds.isEmpty()) { if (moves != null && !moves.isEmpty()) { // always apply adds first so that added fields can be moved @@ -657,20 +718,21 @@ private static List addAndMoveFields(List return null; } - private static List addFields(List fields, - Collection adds) { + private static List addFields( + List fields, Collection adds) { List newFields = Lists.newArrayList(fields); newFields.addAll(adds); return newFields; } @SuppressWarnings({"checkstyle:IllegalType", "JdkObsolete"}) - private static List moveFields(List fields, - Collection moves) { + private static List moveFields( + List fields, Collection moves) { LinkedList reordered = Lists.newLinkedList(fields); for (Move move : moves) { - Types.NestedField toMove = Iterables.find(reordered, field -> field.fieldId() == move.fieldId()); + Types.NestedField toMove = + Iterables.find(reordered, field -> field.fieldId() == move.fieldId()); reordered.remove(toMove); switch (move.type()) { @@ -679,14 +741,16 @@ private static List moveFields(List fields break; case BEFORE: - Types.NestedField before = Iterables.find(reordered, field -> field.fieldId() == move.referenceFieldId()); + Types.NestedField before = + Iterables.find(reordered, field -> field.fieldId() == move.referenceFieldId()); int beforeIndex = reordered.indexOf(before); // insert the new node at the index of the existing node reordered.add(beforeIndex, toMove); break; case AFTER: - Types.NestedField after = Iterables.find(reordered, field -> field.fieldId() == move.referenceFieldId()); + Types.NestedField after = + Iterables.find(reordered, field -> field.fieldId() == move.referenceFieldId()); int afterIndex = reordered.indexOf(after); reordered.add(afterIndex + 1, toMove); break; @@ -699,9 +763,7 @@ private static List moveFields(List fields return reordered; } - /** - * Represents a requested column move in a struct. - */ + /** Represents a requested column move in a struct. 
*/ private static class Move { private enum MoveType { FIRST, diff --git a/core/src/main/java/org/apache/iceberg/SerializableByteBufferMap.java b/core/src/main/java/org/apache/iceberg/SerializableByteBufferMap.java index 73732977717f..3aabefa23b1e 100644 --- a/core/src/main/java/org/apache/iceberg/SerializableByteBufferMap.java +++ b/core/src/main/java/org/apache/iceberg/SerializableByteBufferMap.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg; import java.io.ObjectStreamException; @@ -55,11 +54,8 @@ private static class MapSerializationProxy implements Serializable { private int[] keys = null; private byte[][] values = null; - /** - * Constructor for Java serialization. - */ - MapSerializationProxy() { - } + /** Constructor for Java serialization. */ + MapSerializationProxy() {} MapSerializationProxy(int[] keys, byte[][] values) { this.keys = keys; diff --git a/core/src/main/java/org/apache/iceberg/SerializableTable.java b/core/src/main/java/org/apache/iceberg/SerializableTable.java index 45970cd20f63..37d7453c033b 100644 --- a/core/src/main/java/org/apache/iceberg/SerializableTable.java +++ b/core/src/main/java/org/apache/iceberg/SerializableTable.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg; import java.io.Serializable; @@ -33,20 +32,21 @@ /** * A read-only serializable table that can be sent to other nodes in a cluster. - *
- * An instance of this class represents an immutable serializable copy of a table state and
- * will not reflect any subsequent changed made to the original table.
- * <p>
- * While this class captures the metadata file location that can be used to load the complete
- * table metadata, it directly persists the current schema, spec, sort order, table properties
- * to avoid reading the metadata file from other nodes for frequently needed metadata.
- * <p>
- * The implementation assumes the passed instances of {@link FileIO}, {@link EncryptionManager},
+ *
+ * <p>An instance of this class represents an immutable serializable copy of a table state and will
+ * not reflect any subsequent changed made to the original table.
+ *
+ * <p>While this class captures the metadata file location that can be used to load the complete
+ * table metadata, it directly persists the current schema, spec, sort order, table properties to
+ * avoid reading the metadata file from other nodes for frequently needed metadata.
+ *
+ * <p>The implementation assumes the passed instances of {@link FileIO}, {@link EncryptionManager},
 * {@link LocationProvider} are serializable. If you are serializing the table using a custom
 * serialization framework like Kryo, those instances of {@link FileIO}, {@link EncryptionManager},
 * {@link LocationProvider} must be supported by that particular serialization framework.
- * <p>
- * Note: loading the complete metadata from a large number of nodes can overwhelm the storage.
+ *
+ * <p>
Note: loading the complete metadata from a large number of nodes can overwhelm the + * storage. */ public class SerializableTable implements Table, Serializable { @@ -119,10 +119,12 @@ private Table lazyTable() { synchronized (this) { if (lazyTable == null) { if (metadataFileLocation == null) { - throw new UnsupportedOperationException("Cannot load metadata: metadata file location is null"); + throw new UnsupportedOperationException( + "Cannot load metadata: metadata file location is null"); } - TableOperations ops = new StaticTableOperations(metadataFileLocation, io, locationProvider); + TableOperations ops = + new StaticTableOperations(metadataFileLocation, io, locationProvider); this.lazyTable = newTable(ops, name); } } @@ -183,9 +185,10 @@ public Map specs() { if (lazySpecs == null && lazyTable == null) { // prefer parsing JSON as opposed to loading the metadata Map specs = Maps.newHashMapWithExpectedSize(specAsJsonMap.size()); - specAsJsonMap.forEach((specId, specAsJson) -> { - specs.put(specId, PartitionSpecParser.fromJson(schema(), specAsJson)); - }); + specAsJsonMap.forEach( + (specId, specAsJson) -> { + specs.put(specId, PartitionSpecParser.fromJson(schema(), specAsJson)); + }); this.lazySpecs = specs; } else if (lazySpecs == null) { this.lazySpecs = lazyTable.specs(); diff --git a/core/src/main/java/org/apache/iceberg/SetLocation.java b/core/src/main/java/org/apache/iceberg/SetLocation.java index c6cb975ca5af..148e4b8bc8be 100644 --- a/core/src/main/java/org/apache/iceberg/SetLocation.java +++ b/core/src/main/java/org/apache/iceberg/SetLocation.java @@ -16,12 +16,8 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg; -import org.apache.iceberg.exceptions.CommitFailedException; -import org.apache.iceberg.util.Tasks; - import static org.apache.iceberg.TableProperties.COMMIT_MAX_RETRY_WAIT_MS; import static org.apache.iceberg.TableProperties.COMMIT_MAX_RETRY_WAIT_MS_DEFAULT; import static org.apache.iceberg.TableProperties.COMMIT_MIN_RETRY_WAIT_MS; @@ -31,6 +27,9 @@ import static org.apache.iceberg.TableProperties.COMMIT_TOTAL_RETRY_TIME_MS; import static org.apache.iceberg.TableProperties.COMMIT_TOTAL_RETRY_TIME_MS_DEFAULT; +import org.apache.iceberg.exceptions.CommitFailedException; +import org.apache.iceberg.util.Tasks; + public class SetLocation implements UpdateLocation { private final TableOperations ops; private String newLocation; diff --git a/core/src/main/java/org/apache/iceberg/SetSnapshotOperation.java b/core/src/main/java/org/apache/iceberg/SetSnapshotOperation.java index 4f00eb905c04..5f3750b58f23 100644 --- a/core/src/main/java/org/apache/iceberg/SetSnapshotOperation.java +++ b/core/src/main/java/org/apache/iceberg/SetSnapshotOperation.java @@ -16,16 +16,8 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg; -import java.util.List; -import org.apache.iceberg.exceptions.CommitFailedException; -import org.apache.iceberg.exceptions.ValidationException; -import org.apache.iceberg.relocated.com.google.common.base.Preconditions; -import org.apache.iceberg.util.SnapshotUtil; -import org.apache.iceberg.util.Tasks; - import static org.apache.iceberg.TableProperties.COMMIT_MAX_RETRY_WAIT_MS; import static org.apache.iceberg.TableProperties.COMMIT_MAX_RETRY_WAIT_MS_DEFAULT; import static org.apache.iceberg.TableProperties.COMMIT_MIN_RETRY_WAIT_MS; @@ -35,11 +27,18 @@ import static org.apache.iceberg.TableProperties.COMMIT_TOTAL_RETRY_TIME_MS; import static org.apache.iceberg.TableProperties.COMMIT_TOTAL_RETRY_TIME_MS_DEFAULT; +import java.util.List; +import org.apache.iceberg.exceptions.CommitFailedException; +import org.apache.iceberg.exceptions.ValidationException; +import org.apache.iceberg.relocated.com.google.common.base.Preconditions; +import org.apache.iceberg.util.SnapshotUtil; +import org.apache.iceberg.util.Tasks; + /** * Sets the current snapshot directly or by rolling back. - *
- * This update is not exposed though the Table API. Instead, it is a package-private part of the Transaction API
- * intended for use in {@link ManageSnapshots}.
+ *
+ * <p>
This update is not exposed though the Table API. Instead, it is a package-private part of the + * Transaction API intended for use in {@link ManageSnapshots}. */ class SetSnapshotOperation implements PendingUpdate { @@ -53,10 +52,11 @@ class SetSnapshotOperation implements PendingUpdate { this.base = ops.current(); } - public SetSnapshotOperation setCurrentSnapshot(long snapshotId) { - ValidationException.check(base.snapshot(snapshotId) != null, - "Cannot roll back to unknown snapshot id: %s", snapshotId); + ValidationException.check( + base.snapshot(snapshotId) != null, + "Cannot roll back to unknown snapshot id: %s", + snapshotId); this.targetSnapshotId = snapshotId; @@ -66,8 +66,8 @@ public SetSnapshotOperation setCurrentSnapshot(long snapshotId) { public SetSnapshotOperation rollbackToTime(long timestampMillis) { // find the latest snapshot by timestamp older than timestampMillis Snapshot snapshot = findLatestAncestorOlderThan(base, timestampMillis); - Preconditions.checkArgument(snapshot != null, - "Cannot roll back, no valid snapshot older than: %s", timestampMillis); + Preconditions.checkArgument( + snapshot != null, "Cannot roll back, no valid snapshot older than: %s", timestampMillis); this.targetSnapshotId = snapshot.snapshotId(); this.isRollback = true; @@ -77,11 +77,14 @@ public SetSnapshotOperation rollbackToTime(long timestampMillis) { public SetSnapshotOperation rollbackTo(long snapshotId) { TableMetadata current = base; - ValidationException.check(current.snapshot(snapshotId) != null, - "Cannot roll back to unknown snapshot id: %s", snapshotId); + ValidationException.check( + current.snapshot(snapshotId) != null, + "Cannot roll back to unknown snapshot id: %s", + snapshotId); ValidationException.check( isCurrentAncestor(current, snapshotId), - "Cannot roll back to snapshot, not an ancestor of the current state: %s", snapshotId); + "Cannot roll back to snapshot, not an ancestor of the current state: %s", + snapshotId); return setCurrentSnapshot(snapshotId); } @@ -94,8 +97,10 @@ public Snapshot apply() { return base.currentSnapshot(); } - ValidationException.check(!isRollback || isCurrentAncestor(base, targetSnapshotId), - "Cannot roll back to %s: not an ancestor of the current table state", targetSnapshotId); + ValidationException.check( + !isRollback || isCurrentAncestor(base, targetSnapshotId), + "Cannot roll back to %s: not an ancestor of the current table state", + targetSnapshotId); return base.snapshot(targetSnapshotId); } @@ -110,22 +115,27 @@ public void commit() { base.propertyAsInt(COMMIT_TOTAL_RETRY_TIME_MS, COMMIT_TOTAL_RETRY_TIME_MS_DEFAULT), 2.0 /* exponential */) .onlyRetryOn(CommitFailedException.class) - .run(taskOps -> { - Snapshot snapshot = apply(); - TableMetadata updated = TableMetadata.buildFrom(base) - .setBranchSnapshot(snapshot.snapshotId(), SnapshotRef.MAIN_BRANCH) - .build(); - - if (updated.changes().isEmpty()) { - // do not commit if the metadata has not changed. for example, this may happen when setting the current - // snapshot to an ID that is already current. note that this check uses identity. - return; - } - - // if the table UUID is missing, add it here. the UUID will be re-created each time this operation retries - // to ensure that if a concurrent operation assigns the UUID, this operation will not fail. 
- taskOps.commit(base, updated.withUUID()); - }); + .run( + taskOps -> { + Snapshot snapshot = apply(); + TableMetadata updated = + TableMetadata.buildFrom(base) + .setBranchSnapshot(snapshot.snapshotId(), SnapshotRef.MAIN_BRANCH) + .build(); + + if (updated.changes().isEmpty()) { + // do not commit if the metadata has not changed. for example, this may happen when + // setting the current + // snapshot to an ID that is already current. note that this check uses identity. + return; + } + + // if the table UUID is missing, add it here. the UUID will be re-created each time + // this operation retries + // to ensure that if a concurrent operation assigns the UUID, this operation will not + // fail. + taskOps.commit(base, updated.withUUID()); + }); } /** @@ -140,8 +150,8 @@ private static Snapshot findLatestAncestorOlderThan(TableMetadata meta, long tim Snapshot result = null; for (Long snapshotId : currentAncestors(meta)) { Snapshot snapshot = meta.snapshot(snapshotId); - if (snapshot.timestampMillis() < timestampMillis && - snapshot.timestampMillis() > snapshotTimestamp) { + if (snapshot.timestampMillis() < timestampMillis + && snapshot.timestampMillis() > snapshotTimestamp) { result = snapshot; snapshotTimestamp = snapshot.timestampMillis(); } diff --git a/core/src/main/java/org/apache/iceberg/SnapshotIdGeneratorUtil.java b/core/src/main/java/org/apache/iceberg/SnapshotIdGeneratorUtil.java index b54f40c36d6a..52ed66a6b133 100644 --- a/core/src/main/java/org/apache/iceberg/SnapshotIdGeneratorUtil.java +++ b/core/src/main/java/org/apache/iceberg/SnapshotIdGeneratorUtil.java @@ -16,15 +16,13 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg; import java.util.UUID; public final class SnapshotIdGeneratorUtil { - private SnapshotIdGeneratorUtil() { - } + private SnapshotIdGeneratorUtil() {} /** * Create a new ID for a Snapshot diff --git a/core/src/main/java/org/apache/iceberg/SnapshotManager.java b/core/src/main/java/org/apache/iceberg/SnapshotManager.java index 60b9d34b9d6b..355a9aee5c1f 100644 --- a/core/src/main/java/org/apache/iceberg/SnapshotManager.java +++ b/core/src/main/java/org/apache/iceberg/SnapshotManager.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg; import org.apache.iceberg.relocated.com.google.common.base.Preconditions; @@ -27,8 +26,10 @@ public class SnapshotManager implements ManageSnapshots { private UpdateSnapshotReferencesOperation updateSnapshotReferencesOperation; SnapshotManager(String tableName, TableOperations ops) { - Preconditions.checkState(ops.current() != null, "Cannot manage snapshots: table %s does not exist", tableName); - this.transaction = new BaseTransaction(tableName, ops, BaseTransaction.TransactionType.SIMPLE, ops.refresh()); + Preconditions.checkState( + ops.current() != null, "Cannot manage snapshots: table %s does not exist", tableName); + this.transaction = + new BaseTransaction(tableName, ops, BaseTransaction.TransactionType.SIMPLE, ops.refresh()); } @Override @@ -102,13 +103,13 @@ public ManageSnapshots setMaxRefAgeMs(String name, long maxRefAgeMs) { } @Override - public ManageSnapshots replaceTag(String name, long snapshotId) { + public ManageSnapshots replaceTag(String name, long snapshotId) { updateSnapshotReferencesOperation().replaceTag(name, snapshotId); return this; } @Override - public ManageSnapshots replaceBranch(String name, long snapshotId) { + public ManageSnapshots replaceBranch(String name, long snapshotId) { updateSnapshotReferencesOperation().replaceBranch(name, snapshotId); return this; } diff --git a/core/src/main/java/org/apache/iceberg/SnapshotParser.java b/core/src/main/java/org/apache/iceberg/SnapshotParser.java index e4a4428b49af..6618207b73c4 100644 --- a/core/src/main/java/org/apache/iceberg/SnapshotParser.java +++ b/core/src/main/java/org/apache/iceberg/SnapshotParser.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg; import com.fasterxml.jackson.core.JsonGenerator; @@ -35,8 +34,7 @@ public class SnapshotParser { - private SnapshotParser() { - } + private SnapshotParser() {} private static final String SEQUENCE_NUMBER = "sequence-number"; private static final String SNAPSHOT_ID = "snapshot-id"; @@ -48,8 +46,7 @@ private SnapshotParser() { private static final String MANIFEST_LIST = "manifest-list"; private static final String SCHEMA_ID = "schema-id"; - static void toJson(Snapshot snapshot, JsonGenerator generator) - throws IOException { + static void toJson(Snapshot snapshot, JsonGenerator generator) throws IOException { generator.writeStartObject(); if (snapshot.sequenceNumber() > TableMetadata.INITIAL_SEQUENCE_NUMBER) { generator.writeNumberField(SEQUENCE_NUMBER, snapshot.sequenceNumber()); @@ -118,8 +115,8 @@ public static String toJson(Snapshot snapshot, boolean pretty) { } static Snapshot fromJson(FileIO io, JsonNode node) { - Preconditions.checkArgument(node.isObject(), - "Cannot parse table version from a non-object: %s", node); + Preconditions.checkArgument( + node.isObject(), "Cannot parse table version from a non-object: %s", node); long sequenceNumber = TableMetadata.INITIAL_SEQUENCE_NUMBER; if (node.has(SEQUENCE_NUMBER)) { @@ -136,8 +133,10 @@ static Snapshot fromJson(FileIO io, JsonNode node) { String operation = null; if (node.has(SUMMARY)) { JsonNode sNode = node.get(SUMMARY); - Preconditions.checkArgument(sNode != null && !sNode.isNull() && sNode.isObject(), - "Cannot parse summary from non-object value: %s", sNode); + Preconditions.checkArgument( + sNode != null && !sNode.isNull() && sNode.isObject(), + "Cannot parse summary from non-object value: %s", + sNode); ImmutableMap.Builder builder = ImmutableMap.builder(); Iterator fields = 
sNode.fieldNames(); @@ -158,14 +157,25 @@ static Snapshot fromJson(FileIO io, JsonNode node) { // the manifest list is stored in a manifest list file String manifestList = JsonUtil.getString(MANIFEST_LIST, node); return new BaseSnapshot( - io, sequenceNumber, snapshotId, parentId, timestamp, operation, summary, schemaId, manifestList); + io, + sequenceNumber, + snapshotId, + parentId, + timestamp, + operation, + summary, + schemaId, + manifestList); } else { // fall back to an embedded manifest list. pass in the manifest's InputFile so length can be // loaded lazily, if it is needed - List manifests = Lists.transform(JsonUtil.getStringList(MANIFESTS, node), - location -> new GenericManifestFile(io.newInputFile(location), 0)); - return new BaseSnapshot(io, snapshotId, parentId, timestamp, operation, summary, schemaId, manifests); + List manifests = + Lists.transform( + JsonUtil.getStringList(MANIFESTS, node), + location -> new GenericManifestFile(io.newInputFile(location), 0)); + return new BaseSnapshot( + io, snapshotId, parentId, timestamp, operation, summary, schemaId, manifests); } } diff --git a/core/src/main/java/org/apache/iceberg/SnapshotProducer.java b/core/src/main/java/org/apache/iceberg/SnapshotProducer.java index eab82b48ce50..520c70bcef1a 100644 --- a/core/src/main/java/org/apache/iceberg/SnapshotProducer.java +++ b/core/src/main/java/org/apache/iceberg/SnapshotProducer.java @@ -16,9 +16,19 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg; +import static org.apache.iceberg.TableProperties.COMMIT_MAX_RETRY_WAIT_MS; +import static org.apache.iceberg.TableProperties.COMMIT_MAX_RETRY_WAIT_MS_DEFAULT; +import static org.apache.iceberg.TableProperties.COMMIT_MIN_RETRY_WAIT_MS; +import static org.apache.iceberg.TableProperties.COMMIT_MIN_RETRY_WAIT_MS_DEFAULT; +import static org.apache.iceberg.TableProperties.COMMIT_NUM_RETRIES; +import static org.apache.iceberg.TableProperties.COMMIT_NUM_RETRIES_DEFAULT; +import static org.apache.iceberg.TableProperties.COMMIT_TOTAL_RETRY_TIME_MS; +import static org.apache.iceberg.TableProperties.COMMIT_TOTAL_RETRY_TIME_MS_DEFAULT; +import static org.apache.iceberg.TableProperties.MANIFEST_LISTS_ENABLED; +import static org.apache.iceberg.TableProperties.MANIFEST_LISTS_ENABLED_DEFAULT; + import com.github.benmanes.caffeine.cache.Caffeine; import com.github.benmanes.caffeine.cache.LoadingCache; import java.io.IOException; @@ -46,35 +56,21 @@ import org.slf4j.Logger; import org.slf4j.LoggerFactory; -import static org.apache.iceberg.TableProperties.COMMIT_MAX_RETRY_WAIT_MS; -import static org.apache.iceberg.TableProperties.COMMIT_MAX_RETRY_WAIT_MS_DEFAULT; -import static org.apache.iceberg.TableProperties.COMMIT_MIN_RETRY_WAIT_MS; -import static org.apache.iceberg.TableProperties.COMMIT_MIN_RETRY_WAIT_MS_DEFAULT; -import static org.apache.iceberg.TableProperties.COMMIT_NUM_RETRIES; -import static org.apache.iceberg.TableProperties.COMMIT_NUM_RETRIES_DEFAULT; -import static org.apache.iceberg.TableProperties.COMMIT_TOTAL_RETRY_TIME_MS; -import static org.apache.iceberg.TableProperties.COMMIT_TOTAL_RETRY_TIME_MS_DEFAULT; -import static org.apache.iceberg.TableProperties.MANIFEST_LISTS_ENABLED; -import static org.apache.iceberg.TableProperties.MANIFEST_LISTS_ENABLED_DEFAULT; - @SuppressWarnings("UnnecessaryAnonymousClass") abstract class SnapshotProducer implements SnapshotUpdate { private static final Logger LOG = LoggerFactory.getLogger(SnapshotProducer.class); static final Set EMPTY_SET = 
Sets.newHashSet(); - /** - * Default callback used to delete files. - */ - private final Consumer defaultDelete = new Consumer() { - @Override - public void accept(String file) { - ops.io().deleteFile(file); - } - }; + /** Default callback used to delete files. */ + private final Consumer defaultDelete = + new Consumer() { + @Override + public void accept(String file) { + ops.io().deleteFile(file); + } + }; - /** - * Cache used to enrich ManifestFile instances that are written to a ManifestListWriter. - */ + /** Cache used to enrich ManifestFile instances that are written to a ManifestListWriter. */ private final LoadingCache manifestsWithMetadata; private final TableOperations ops; @@ -92,14 +88,15 @@ public void accept(String file) { protected SnapshotProducer(TableOperations ops) { this.ops = ops; this.base = ops.current(); - this.manifestsWithMetadata = Caffeine - .newBuilder() - .build(file -> { - if (file.snapshotId() != null) { - return file; - } - return addMetadata(ops, file); - }); + this.manifestsWithMetadata = + Caffeine.newBuilder() + .build( + file -> { + if (file.snapshotId() != null) { + return file; + } + return addMetadata(ops, file); + }); } protected abstract ThisT self(); @@ -122,18 +119,19 @@ protected ExecutorService workerPool() { @Override public ThisT deleteWith(Consumer deleteCallback) { - Preconditions.checkArgument(this.deleteFunc == defaultDelete, "Cannot set delete callback more than once"); + Preconditions.checkArgument( + this.deleteFunc == defaultDelete, "Cannot set delete callback more than once"); this.deleteFunc = deleteCallback; return self(); } /** * Clean up any uncommitted manifests that were created. - *
- * Manifests may not be committed if apply is called more because a commit conflict has occurred.
- * Implementations may keep around manifests because the same changes will be made by both apply
- * calls. This method instructs the implementation to clean up those manifests and passes the
- * paths of the manifests that were actually committed.
+ *
+ * <p>
Manifests may not be committed if apply is called more because a commit conflict has + * occurred. Implementations may keep around manifests because the same changes will be made by + * both apply calls. This method instructs the implementation to clean up those manifests and + * passes the paths of the manifests that were actually committed. * * @param committed a set of manifest paths that were actually committed */ @@ -148,13 +146,12 @@ public ThisT deleteWith(Consumer deleteCallback) { /** * Validate the current metadata. - *
- * Child operations can override this to add custom validation.
+ *
+ * <p>
Child operations can override this to add custom validation. * * @param currentMetadata current table metadata to validate */ - protected void validate(TableMetadata currentMetadata) { - } + protected void validate(TableMetadata currentMetadata) {} /** * Apply the update's changes to the base table metadata and return the new manifest list. @@ -167,8 +164,8 @@ protected void validate(TableMetadata currentMetadata) { @Override public Snapshot apply() { refresh(); - Long parentSnapshotId = base.currentSnapshot() != null ? - base.currentSnapshot().snapshotId() : null; + Long parentSnapshotId = + base.currentSnapshot() != null ? base.currentSnapshot().snapshotId() : null; long sequenceNumber = base.nextSequenceNumber(); // run validations from the child operation @@ -176,11 +173,17 @@ public Snapshot apply() { List manifests = apply(base); - if (base.formatVersion() > 1 || base.propertyAsBoolean(MANIFEST_LISTS_ENABLED, MANIFEST_LISTS_ENABLED_DEFAULT)) { + if (base.formatVersion() > 1 + || base.propertyAsBoolean(MANIFEST_LISTS_ENABLED, MANIFEST_LISTS_ENABLED_DEFAULT)) { OutputFile manifestList = manifestListPath(); - try (ManifestListWriter writer = ManifestLists.write( - ops.current().formatVersion(), manifestList, snapshotId(), parentSnapshotId, sequenceNumber)) { + try (ManifestListWriter writer = + ManifestLists.write( + ops.current().formatVersion(), + manifestList, + snapshotId(), + parentSnapshotId, + sequenceNumber)) { // keep track of the manifest lists created manifestLists.add(manifestList.location()); @@ -188,10 +191,10 @@ public Snapshot apply() { ManifestFile[] manifestFiles = new ManifestFile[manifests.size()]; Tasks.range(manifestFiles.length) - .stopOnFailure().throwFailureWhenFinished() + .stopOnFailure() + .throwFailureWhenFinished() .executeWith(workerPool) - .run(index -> - manifestFiles[index] = manifestsWithMetadata.get(manifests.get(index))); + .run(index -> manifestFiles[index] = manifestsWithMetadata.get(manifests.get(index))); writer.addAll(Arrays.asList(manifestFiles)); @@ -199,22 +202,33 @@ public Snapshot apply() { throw new RuntimeIOException(e, "Failed to write manifest list file"); } - return new BaseSnapshot(ops.io(), - sequenceNumber, snapshotId(), parentSnapshotId, System.currentTimeMillis(), operation(), summary(base), - base.currentSchemaId(), manifestList.location()); + return new BaseSnapshot( + ops.io(), + sequenceNumber, + snapshotId(), + parentSnapshotId, + System.currentTimeMillis(), + operation(), + summary(base), + base.currentSchemaId(), + manifestList.location()); } else { - return new BaseSnapshot(ops.io(), - snapshotId(), parentSnapshotId, System.currentTimeMillis(), operation(), summary(base), - base.currentSchemaId(), manifests); + return new BaseSnapshot( + ops.io(), + snapshotId(), + parentSnapshotId, + System.currentTimeMillis(), + operation(), + summary(base), + base.currentSchemaId(), + manifests); } } protected abstract Map summary(); - /** - * Returns the snapshot summary from the implementation and updates totals. - */ + /** Returns the snapshot summary from the implementation and updates totals. 
*/ private Map summary(TableMetadata previous) { Map summary = summary(); @@ -249,23 +263,47 @@ private Map summary(TableMetadata previous) { builder.putAll(summary); updateTotal( - builder, previousSummary, SnapshotSummary.TOTAL_RECORDS_PROP, - summary, SnapshotSummary.ADDED_RECORDS_PROP, SnapshotSummary.DELETED_RECORDS_PROP); + builder, + previousSummary, + SnapshotSummary.TOTAL_RECORDS_PROP, + summary, + SnapshotSummary.ADDED_RECORDS_PROP, + SnapshotSummary.DELETED_RECORDS_PROP); updateTotal( - builder, previousSummary, SnapshotSummary.TOTAL_FILE_SIZE_PROP, - summary, SnapshotSummary.ADDED_FILE_SIZE_PROP, SnapshotSummary.REMOVED_FILE_SIZE_PROP); + builder, + previousSummary, + SnapshotSummary.TOTAL_FILE_SIZE_PROP, + summary, + SnapshotSummary.ADDED_FILE_SIZE_PROP, + SnapshotSummary.REMOVED_FILE_SIZE_PROP); updateTotal( - builder, previousSummary, SnapshotSummary.TOTAL_DATA_FILES_PROP, - summary, SnapshotSummary.ADDED_FILES_PROP, SnapshotSummary.DELETED_FILES_PROP); + builder, + previousSummary, + SnapshotSummary.TOTAL_DATA_FILES_PROP, + summary, + SnapshotSummary.ADDED_FILES_PROP, + SnapshotSummary.DELETED_FILES_PROP); updateTotal( - builder, previousSummary, SnapshotSummary.TOTAL_DELETE_FILES_PROP, - summary, SnapshotSummary.ADDED_DELETE_FILES_PROP, SnapshotSummary.REMOVED_DELETE_FILES_PROP); + builder, + previousSummary, + SnapshotSummary.TOTAL_DELETE_FILES_PROP, + summary, + SnapshotSummary.ADDED_DELETE_FILES_PROP, + SnapshotSummary.REMOVED_DELETE_FILES_PROP); updateTotal( - builder, previousSummary, SnapshotSummary.TOTAL_POS_DELETES_PROP, - summary, SnapshotSummary.ADDED_POS_DELETES_PROP, SnapshotSummary.REMOVED_POS_DELETES_PROP); + builder, + previousSummary, + SnapshotSummary.TOTAL_POS_DELETES_PROP, + summary, + SnapshotSummary.ADDED_POS_DELETES_PROP, + SnapshotSummary.REMOVED_POS_DELETES_PROP); updateTotal( - builder, previousSummary, SnapshotSummary.TOTAL_EQ_DELETES_PROP, - summary, SnapshotSummary.ADDED_EQ_DELETES_PROP, SnapshotSummary.REMOVED_EQ_DELETES_PROP); + builder, + previousSummary, + SnapshotSummary.TOTAL_EQ_DELETES_PROP, + summary, + SnapshotSummary.ADDED_EQ_DELETES_PROP, + SnapshotSummary.REMOVED_EQ_DELETES_PROP); return builder.build(); } @@ -292,30 +330,34 @@ public void commit() { base.propertyAsInt(COMMIT_TOTAL_RETRY_TIME_MS, COMMIT_TOTAL_RETRY_TIME_MS_DEFAULT), 2.0 /* exponential */) .onlyRetryOn(CommitFailedException.class) - .run(taskOps -> { - Snapshot newSnapshot = apply(); - newSnapshotId.set(newSnapshot.snapshotId()); - TableMetadata.Builder update = TableMetadata.buildFrom(base); - if (base.snapshot(newSnapshot.snapshotId()) != null) { - // this is a rollback operation - update.setBranchSnapshot(newSnapshot.snapshotId(), SnapshotRef.MAIN_BRANCH); - } else if (stageOnly) { - update.addSnapshot(newSnapshot); - } else { - update.setBranchSnapshot(newSnapshot, SnapshotRef.MAIN_BRANCH); - } - - TableMetadata updated = update.build(); - if (updated.changes().isEmpty()) { - // do not commit if the metadata has not changed. for example, this may happen when setting the current - // snapshot to an ID that is already current. note that this check uses identity. - return; - } - - // if the table UUID is missing, add it here. the UUID will be re-created each time this operation retries - // to ensure that if a concurrent operation assigns the UUID, this operation will not fail. 
- taskOps.commit(base, updated.withUUID()); - }); + .run( + taskOps -> { + Snapshot newSnapshot = apply(); + newSnapshotId.set(newSnapshot.snapshotId()); + TableMetadata.Builder update = TableMetadata.buildFrom(base); + if (base.snapshot(newSnapshot.snapshotId()) != null) { + // this is a rollback operation + update.setBranchSnapshot(newSnapshot.snapshotId(), SnapshotRef.MAIN_BRANCH); + } else if (stageOnly) { + update.addSnapshot(newSnapshot); + } else { + update.setBranchSnapshot(newSnapshot, SnapshotRef.MAIN_BRANCH); + } + + TableMetadata updated = update.build(); + if (updated.changes().isEmpty()) { + // do not commit if the metadata has not changed. for example, this may happen + // when setting the current + // snapshot to an ID that is already current. note that this check uses identity. + return; + } + + // if the table UUID is missing, add it here. the UUID will be re-created each time + // this operation retries + // to ensure that if a concurrent operation assigns the UUID, this operation will + // not fail. + taskOps.commit(base, updated.withUUID()); + }); } catch (CommitStateUnknownException commitStateUnknownException) { throw commitStateUnknownException; @@ -344,7 +386,8 @@ public void commit() { } } catch (Throwable e) { - LOG.warn("Failed to load committed table metadata or during cleanup, skipping further cleanup", e); + LOG.warn( + "Failed to load committed table metadata or during cleanup, skipping further cleanup", e); } try { @@ -378,21 +421,29 @@ protected void deleteFile(String path) { } protected OutputFile manifestListPath() { - return ops.io().newOutputFile(ops.metadataFileLocation(FileFormat.AVRO.addExtension( - String.format("snap-%d-%d-%s", snapshotId(), attempt.incrementAndGet(), commitUUID)))); + return ops.io() + .newOutputFile( + ops.metadataFileLocation( + FileFormat.AVRO.addExtension( + String.format( + "snap-%d-%d-%s", snapshotId(), attempt.incrementAndGet(), commitUUID)))); } protected OutputFile newManifestOutput() { - return ops.io().newOutputFile( - ops.metadataFileLocation(FileFormat.AVRO.addExtension(commitUUID + "-m" + manifestCount.getAndIncrement()))); + return ops.io() + .newOutputFile( + ops.metadataFileLocation( + FileFormat.AVRO.addExtension(commitUUID + "-m" + manifestCount.getAndIncrement()))); } protected ManifestWriter newManifestWriter(PartitionSpec spec) { - return ManifestFiles.write(ops.current().formatVersion(), spec, newManifestOutput(), snapshotId()); + return ManifestFiles.write( + ops.current().formatVersion(), spec, newManifestOutput(), snapshotId()); } protected ManifestWriter newDeleteManifestWriter(PartitionSpec spec) { - return ManifestFiles.writeDeleteManifest(ops.current().formatVersion(), spec, newManifestOutput(), snapshotId()); + return ManifestFiles.writeDeleteManifest( + ops.current().formatVersion(), spec, newManifestOutput(), snapshotId()); } protected ManifestReader newManifestReader(ManifestFile manifest) { @@ -415,7 +466,8 @@ protected long snapshotId() { } private static ManifestFile addMetadata(TableOperations ops, ManifestFile manifest) { - try (ManifestReader reader = ManifestFiles.read(manifest, ops.io(), ops.current().specsById())) { + try (ManifestReader reader = + ManifestFiles.read(manifest, ops.io(), ops.current().specsById())) { PartitionSummary stats = new PartitionSummary(ops.current().spec(manifest.partitionSpecId())); int addedFiles = 0; long addedRows = 0L; @@ -460,19 +512,35 @@ private static ManifestFile addMetadata(TableOperations ops, ManifestFile manife snapshotId = maxSnapshotId; } - 
return new GenericManifestFile(manifest.path(), manifest.length(), manifest.partitionSpecId(), - ManifestContent.DATA, manifest.sequenceNumber(), manifest.minSequenceNumber(), snapshotId, - addedFiles, addedRows, existingFiles, existingRows, deletedFiles, deletedRows, stats.summaries(), null); + return new GenericManifestFile( + manifest.path(), + manifest.length(), + manifest.partitionSpecId(), + ManifestContent.DATA, + manifest.sequenceNumber(), + manifest.minSequenceNumber(), + snapshotId, + addedFiles, + addedRows, + existingFiles, + existingRows, + deletedFiles, + deletedRows, + stats.summaries(), + null); } catch (IOException e) { throw new RuntimeIOException(e, "Failed to read manifest: %s", manifest.path()); } } - private static void updateTotal(ImmutableMap.Builder summaryBuilder, - Map previousSummary, String totalProperty, - Map currentSummary, - String addedProperty, String deletedProperty) { + private static void updateTotal( + ImmutableMap.Builder summaryBuilder, + Map previousSummary, + String totalProperty, + Map currentSummary, + String addedProperty, + String deletedProperty) { String totalStr = previousSummary.get(totalProperty); if (totalStr != null) { try { diff --git a/core/src/main/java/org/apache/iceberg/SnapshotRefParser.java b/core/src/main/java/org/apache/iceberg/SnapshotRefParser.java index c57f251dd46f..e82da01d1789 100644 --- a/core/src/main/java/org/apache/iceberg/SnapshotRefParser.java +++ b/core/src/main/java/org/apache/iceberg/SnapshotRefParser.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg; import com.fasterxml.jackson.core.JsonGenerator; @@ -30,8 +29,7 @@ public class SnapshotRefParser { - private SnapshotRefParser() { - } + private SnapshotRefParser() {} private static final String SNAPSHOT_ID = "snapshot-id"; private static final String TYPE = "type"; @@ -63,16 +61,21 @@ public static void toJson(SnapshotRef ref, JsonGenerator generator) throws IOExc generator.writeStartObject(); generator.writeNumberField(SNAPSHOT_ID, ref.snapshotId()); generator.writeStringField(TYPE, ref.type().name().toLowerCase(Locale.ENGLISH)); - JsonUtil.writeIntegerFieldIf(ref.minSnapshotsToKeep() != null, MIN_SNAPSHOTS_TO_KEEP, ref.minSnapshotsToKeep(), + JsonUtil.writeIntegerFieldIf( + ref.minSnapshotsToKeep() != null, + MIN_SNAPSHOTS_TO_KEEP, + ref.minSnapshotsToKeep(), generator); - JsonUtil.writeLongFieldIf(ref.maxSnapshotAgeMs() != null, MAX_SNAPSHOT_AGE_MS, ref.maxSnapshotAgeMs(), generator); - JsonUtil.writeLongFieldIf(ref.maxRefAgeMs() != null, MAX_REF_AGE_MS, ref.maxRefAgeMs(), generator); + JsonUtil.writeLongFieldIf( + ref.maxSnapshotAgeMs() != null, MAX_SNAPSHOT_AGE_MS, ref.maxSnapshotAgeMs(), generator); + JsonUtil.writeLongFieldIf( + ref.maxRefAgeMs() != null, MAX_REF_AGE_MS, ref.maxRefAgeMs(), generator); generator.writeEndObject(); } public static SnapshotRef fromJson(String json) { - Preconditions.checkArgument(json != null && !json.isEmpty(), "Cannot parse snapshot ref from invalid JSON: %s", - json); + Preconditions.checkArgument( + json != null && !json.isEmpty(), "Cannot parse snapshot ref from invalid JSON: %s", json); try { return fromJson(JsonUtil.mapper().readValue(json, JsonNode.class)); } catch (IOException e) { @@ -81,9 +84,11 @@ public static SnapshotRef fromJson(String json) { } public static SnapshotRef fromJson(JsonNode node) { - Preconditions.checkArgument(node.isObject(), "Cannot parse snapshot reference from a non-object: %s", node); + 
Preconditions.checkArgument( + node.isObject(), "Cannot parse snapshot reference from a non-object: %s", node); long snapshotId = JsonUtil.getLong(SNAPSHOT_ID, node); - SnapshotRefType type = SnapshotRefType.valueOf(JsonUtil.getString(TYPE, node).toUpperCase(Locale.ENGLISH)); + SnapshotRefType type = + SnapshotRefType.valueOf(JsonUtil.getString(TYPE, node).toUpperCase(Locale.ENGLISH)); Integer minSnapshotsToKeep = JsonUtil.getIntOrNull(MIN_SNAPSHOTS_TO_KEEP, node); Long maxSnapshotAgeMs = JsonUtil.getLongOrNull(MAX_SNAPSHOT_AGE_MS, node); Long maxRefAgeMs = JsonUtil.getLongOrNull(MAX_REF_AGE_MS, node); diff --git a/core/src/main/java/org/apache/iceberg/SnapshotSummary.java b/core/src/main/java/org/apache/iceberg/SnapshotSummary.java index b2d5e2be9bf5..460e67430b2f 100644 --- a/core/src/main/java/org/apache/iceberg/SnapshotSummary.java +++ b/core/src/main/java/org/apache/iceberg/SnapshotSummary.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg; import java.util.Map; @@ -61,8 +60,7 @@ public class SnapshotSummary { public static final MapJoiner MAP_JOINER = Joiner.on(",").withKeyValueSeparator("="); - private SnapshotSummary() { - } + private SnapshotSummary() {} public static Builder builder() { return new Builder(); @@ -77,8 +75,7 @@ public static class Builder { private long deletedDuplicateFiles = 0L; private boolean trustPartitionMetrics = true; - private Builder() { - } + private Builder() {} public void clear() { partitionMetrics.clear(); @@ -89,10 +86,11 @@ public void clear() { /** * Sets the maximum number of changed partitions before partition summaries will be excluded. - *
- * If the number of changed partitions is over this max, summaries will not be included. If the number of changed
- * partitions is <= this limit, then partition-level summaries will be included in the summary if they are
- * available, and "partition-summaries-included" will be set to "true".
+ *
+ * <p>
If the number of changed partitions is over this max, summaries will not be included. If + * the number of changed partitions is <= this limit, then partition-level summaries will be + * included in the summary if they are available, and "partition-summaries-included" will be set + * to "true". * * @param max maximum number of changed partitions */ @@ -124,7 +122,8 @@ public void deletedFile(PartitionSpec spec, ContentFile file) { } else if (file instanceof DeleteFile) { deletedFile(spec, (DeleteFile) file); } else { - throw new IllegalArgumentException("Unsupported file type: " + file.getClass().getSimpleName()); + throw new IllegalArgumentException( + "Unsupported file type: " + file.getClass().getSimpleName()); } } @@ -150,9 +149,9 @@ public void set(String property, String value) { private void updatePartitions(PartitionSpec spec, ContentFile file, boolean isAddition) { if (trustPartitionMetrics) { - UpdateMetrics partMetrics = partitionMetrics.computeIfAbsent( - spec.partitionToPath(file.partition()), - key -> new UpdateMetrics()); + UpdateMetrics partMetrics = + partitionMetrics.computeIfAbsent( + spec.partitionToPath(file.partition()), key -> new UpdateMetrics()); if (isAddition) { partMetrics.addedFile(file); @@ -169,7 +168,9 @@ public void merge(SnapshotSummary.Builder builder) { this.trustPartitionMetrics = trustPartitionMetrics && builder.trustPartitionMetrics; if (trustPartitionMetrics) { for (Map.Entry entry : builder.partitionMetrics.entrySet()) { - partitionMetrics.computeIfAbsent(entry.getKey(), key -> new UpdateMetrics()).merge(entry.getValue()); + partitionMetrics + .computeIfAbsent(entry.getKey(), key -> new UpdateMetrics()) + .merge(entry.getValue()); } } else { partitionMetrics.clear(); @@ -192,7 +193,11 @@ public Map build() { if (trustPartitionMetrics && changedPartitions.size() <= maxChangedPartitionsForSummaries) { setIf(changedPartitions.size() > 0, builder, PARTITION_SUMMARY_PROP, "true"); for (String key : changedPartitions) { - setIf(key != null, builder, CHANGED_PARTITION_PREFIX + key, partitionSummary(partitionMetrics.get(key))); + setIf( + key != null, + builder, + CHANGED_PARTITION_PREFIX + key, + partitionSummary(partitionMetrics.get(key))); } } @@ -251,7 +256,8 @@ void addTo(ImmutableMap.Builder builder) { setIf(addedEqDeleteFiles > 0, builder, ADD_EQ_DELETE_FILES_PROP, addedEqDeleteFiles); setIf(removedEqDeleteFiles > 0, builder, REMOVED_EQ_DELETE_FILES_PROP, removedEqDeleteFiles); setIf(addedPosDeleteFiles > 0, builder, ADD_POS_DELETE_FILES_PROP, addedPosDeleteFiles); - setIf(removedPosDeleteFiles > 0, builder, REMOVED_POS_DELETE_FILES_PROP, removedPosDeleteFiles); + setIf( + removedPosDeleteFiles > 0, builder, REMOVED_POS_DELETE_FILES_PROP, removedPosDeleteFiles); setIf(addedDeleteFiles > 0, builder, ADDED_DELETE_FILES_PROP, addedDeleteFiles); setIf(removedDeleteFiles > 0, builder, REMOVED_DELETE_FILES_PROP, removedDeleteFiles); setIf(addedRecords > 0, builder, ADDED_RECORDS_PROP, addedRecords); @@ -285,7 +291,8 @@ void addedFile(ContentFile file) { this.addedEqDeletes += file.recordCount(); break; default: - throw new UnsupportedOperationException("Unsupported file content type: " + file.content()); + throw new UnsupportedOperationException( + "Unsupported file content type: " + file.content()); } } @@ -307,7 +314,8 @@ void removedFile(ContentFile file) { this.removedEqDeletes += file.recordCount(); break; default: - throw new UnsupportedOperationException("Unsupported file content type: " + file.content()); + throw new UnsupportedOperationException( 
+ "Unsupported file content type: " + file.content()); } } @@ -348,8 +356,11 @@ void merge(UpdateMetrics other) { } } - private static void setIf(boolean expression, ImmutableMap.Builder builder, - String property, Object value) { + private static void setIf( + boolean expression, + ImmutableMap.Builder builder, + String property, + Object value) { if (expression) { builder.put(property, String.valueOf(value)); } diff --git a/core/src/main/java/org/apache/iceberg/SnapshotsTable.java b/core/src/main/java/org/apache/iceberg/SnapshotsTable.java index 9bfcf3a09b4c..16e010877413 100644 --- a/core/src/main/java/org/apache/iceberg/SnapshotsTable.java +++ b/core/src/main/java/org/apache/iceberg/SnapshotsTable.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg; import org.apache.iceberg.io.CloseableIterable; @@ -24,19 +23,21 @@ /** * A {@link Table} implementation that exposes a table's known snapshots as rows. - *
- * This does not include snapshots that have been expired using {@link ExpireSnapshots}.
+ *
+ * <p>
This does not include snapshots that have been expired using {@link ExpireSnapshots}. */ public class SnapshotsTable extends BaseMetadataTable { - private static final Schema SNAPSHOT_SCHEMA = new Schema( - Types.NestedField.required(1, "committed_at", Types.TimestampType.withZone()), - Types.NestedField.required(2, "snapshot_id", Types.LongType.get()), - Types.NestedField.optional(3, "parent_id", Types.LongType.get()), - Types.NestedField.optional(4, "operation", Types.StringType.get()), - Types.NestedField.optional(5, "manifest_list", Types.StringType.get()), - Types.NestedField.optional(6, "summary", - Types.MapType.ofRequired(7, 8, Types.StringType.get(), Types.StringType.get())) - ); + private static final Schema SNAPSHOT_SCHEMA = + new Schema( + Types.NestedField.required(1, "committed_at", Types.TimestampType.withZone()), + Types.NestedField.required(2, "snapshot_id", Types.LongType.get()), + Types.NestedField.optional(3, "parent_id", Types.LongType.get()), + Types.NestedField.optional(4, "operation", Types.StringType.get()), + Types.NestedField.optional(5, "manifest_list", Types.StringType.get()), + Types.NestedField.optional( + 6, + "summary", + Types.MapType.ofRequired(7, 8, Types.StringType.get(), Types.StringType.get()))); SnapshotsTable(TableOperations ops, Table table) { this(ops, table, table.name() + ".snapshots"); @@ -60,9 +61,10 @@ private DataTask task(BaseTableScan scan) { TableOperations ops = operations(); return StaticDataTask.of( ops.io().newInputFile(ops.current().metadataFileLocation()), - schema(), scan.schema(), ops.current().snapshots(), - SnapshotsTable::snapshotToRow - ); + schema(), + scan.schema(), + ops.current().snapshots(), + SnapshotsTable::snapshotToRow); } @Override @@ -76,17 +78,25 @@ private class SnapshotsTableScan extends StaticTableScan { } SnapshotsTableScan(TableOperations ops, Table table, TableScanContext context) { - super(ops, table, SNAPSHOT_SCHEMA, MetadataTableType.SNAPSHOTS, SnapshotsTable.this::task, context); + super( + ops, + table, + SNAPSHOT_SCHEMA, + MetadataTableType.SNAPSHOTS, + SnapshotsTable.this::task, + context); } @Override - protected TableScan newRefinedScan(TableOperations ops, Table table, Schema schema, TableScanContext context) { + protected TableScan newRefinedScan( + TableOperations ops, Table table, Schema schema, TableScanContext context) { return new SnapshotsTableScan(ops, table, context); } @Override public CloseableIterable planFiles() { - // override planFiles to avoid the check for a current snapshot because this metadata table is for all snapshots + // override planFiles to avoid the check for a current snapshot because this metadata table is + // for all snapshots return CloseableIterable.withNoopClose(SnapshotsTable.this.task(this)); } } @@ -98,7 +108,6 @@ private static StaticDataTask.Row snapshotToRow(Snapshot snap) { snap.parentId(), snap.operation(), snap.manifestListLocation(), - snap.summary() - ); + snap.summary()); } } diff --git a/core/src/main/java/org/apache/iceberg/SortOrderParser.java b/core/src/main/java/org/apache/iceberg/SortOrderParser.java index 7b242e551119..6d4f6efaa201 100644 --- a/core/src/main/java/org/apache/iceberg/SortOrderParser.java +++ b/core/src/main/java/org/apache/iceberg/SortOrderParser.java @@ -16,9 +16,11 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg; +import static org.apache.iceberg.NullOrder.NULLS_FIRST; +import static org.apache.iceberg.NullOrder.NULLS_LAST; + import com.fasterxml.jackson.core.JsonGenerator; import com.fasterxml.jackson.databind.JsonNode; import java.io.IOException; @@ -27,9 +29,6 @@ import org.apache.iceberg.relocated.com.google.common.base.Preconditions; import org.apache.iceberg.util.JsonUtil; -import static org.apache.iceberg.NullOrder.NULLS_FIRST; -import static org.apache.iceberg.NullOrder.NULLS_LAST; - public class SortOrderParser { private static final String ORDER_ID = "order-id"; private static final String FIELDS = "fields"; @@ -38,8 +37,7 @@ public class SortOrderParser { private static final String TRANSFORM = "transform"; private static final String SOURCE_ID = "source-id"; - private SortOrderParser() { - } + private SortOrderParser() {} public static void toJson(SortOrder sortOrder, JsonGenerator generator) throws IOException { generator.writeStartObject(); @@ -65,7 +63,8 @@ private static String toJson(NullOrder nullOrder) { return nullOrder == NULLS_FIRST ? "nulls-first" : "nulls-last"; } - private static void toJsonFields(SortOrder sortOrder, JsonGenerator generator) throws IOException { + private static void toJsonFields(SortOrder sortOrder, JsonGenerator generator) + throws IOException { generator.writeStartArray(); for (SortField field : sortOrder.fields()) { generator.writeStartObject(); @@ -78,7 +77,8 @@ private static void toJsonFields(SortOrder sortOrder, JsonGenerator generator) t generator.writeEndArray(); } - public static void toJson(UnboundSortOrder sortOrder, JsonGenerator generator) throws IOException { + public static void toJson(UnboundSortOrder sortOrder, JsonGenerator generator) + throws IOException { generator.writeStartObject(); generator.writeNumberField(ORDER_ID, sortOrder.orderId()); generator.writeFieldName(FIELDS); @@ -94,7 +94,8 @@ public static String toJson(UnboundSortOrder sortOrder, boolean pretty) { return JsonUtil.generate(gen -> toJson(sortOrder, gen), pretty); } - private static void toJsonFields(UnboundSortOrder sortOrder, JsonGenerator generator) throws IOException { + private static void toJsonFields(UnboundSortOrder sortOrder, JsonGenerator generator) + throws IOException { generator.writeStartArray(); for (UnboundSortOrder.UnboundSortField field : sortOrder.fields()) { generator.writeStartObject(); @@ -120,7 +121,8 @@ public static UnboundSortOrder fromJson(String json) { } public static UnboundSortOrder fromJson(JsonNode json) { - Preconditions.checkArgument(json.isObject(), "Cannot parse sort order from non-object: %s", json); + Preconditions.checkArgument( + json.isObject(), "Cannot parse sort order from non-object: %s", json); int orderId = JsonUtil.getInt(ORDER_ID, json); UnboundSortOrder.Builder builder = UnboundSortOrder.builder().withOrderId(orderId); buildFromJsonFields(builder, json.get(FIELDS)); @@ -129,12 +131,14 @@ public static UnboundSortOrder fromJson(JsonNode json) { private static void buildFromJsonFields(UnboundSortOrder.Builder builder, JsonNode json) { Preconditions.checkArgument(json != null, "Cannot parse null sort order fields"); - Preconditions.checkArgument(json.isArray(), "Cannot parse sort order fields, not an array: %s", json); + Preconditions.checkArgument( + json.isArray(), "Cannot parse sort order fields, not an array: %s", json); Iterator elements = json.elements(); while (elements.hasNext()) { JsonNode element = elements.next(); - Preconditions.checkArgument(element.isObject(), "Cannot parse 
sort field, not an object: %s", element); + Preconditions.checkArgument( + element.isObject(), "Cannot parse sort field, not an object: %s", element); String transform = JsonUtil.getString(TRANSFORM, element); int sourceId = JsonUtil.getInt(SOURCE_ID, element); diff --git a/core/src/main/java/org/apache/iceberg/SplitScanTaskIterator.java b/core/src/main/java/org/apache/iceberg/SplitScanTaskIterator.java index e526285a7b9b..9c52a0d1b049 100644 --- a/core/src/main/java/org/apache/iceberg/SplitScanTaskIterator.java +++ b/core/src/main/java/org/apache/iceberg/SplitScanTaskIterator.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg; import java.util.Iterator; diff --git a/core/src/main/java/org/apache/iceberg/StaticDataTask.java b/core/src/main/java/org/apache/iceberg/StaticDataTask.java index 0525cd6cc6da..cffb42427960 100644 --- a/core/src/main/java/org/apache/iceberg/StaticDataTask.java +++ b/core/src/main/java/org/apache/iceberg/StaticDataTask.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg; import java.io.Serializable; @@ -34,9 +33,14 @@ class StaticDataTask implements DataTask { - static DataTask of(InputFile metadata, Schema tableSchema, Schema projectedSchema, Iterable values, - Function transform) { - return new StaticDataTask(metadata, + static DataTask of( + InputFile metadata, + Schema tableSchema, + Schema projectedSchema, + Iterable values, + Function transform) { + return new StaticDataTask( + metadata, tableSchema, projectedSchema, Lists.newArrayList(Iterables.transform(values, transform::apply)).toArray(new Row[0])); @@ -47,14 +51,16 @@ static DataTask of(InputFile metadata, Schema tableSchema, Schema projectedS private final Schema tableSchema; private final Schema projectedSchema; - private StaticDataTask(InputFile metadata, Schema tableSchema, Schema projectedSchema, StructLike[] rows) { + private StaticDataTask( + InputFile metadata, Schema tableSchema, Schema projectedSchema, StructLike[] rows) { this.tableSchema = tableSchema; this.projectedSchema = projectedSchema; - this.metadataFile = DataFiles.builder(PartitionSpec.unpartitioned()) - .withInputFile(metadata) - .withRecordCount(rows.length) - .withFormat(FileFormat.METADATA) - .build(); + this.metadataFile = + DataFiles.builder(PartitionSpec.unpartitioned()) + .withInputFile(metadata) + .withRecordCount(rows.length) + .withFormat(FileFormat.METADATA) + .build(); this.rows = rows; } @@ -100,9 +106,7 @@ public Iterable split(long splitSize) { return ImmutableList.of(this); } - /** - * Implements {@link StructLike#get} for passing static rows. - */ + /** Implements {@link StructLike#get} for passing static rows. */ static class Row implements StructLike, Serializable { public static Row of(Object... values) { return new Row(values); diff --git a/core/src/main/java/org/apache/iceberg/StaticTableOperations.java b/core/src/main/java/org/apache/iceberg/StaticTableOperations.java index b4eb9e33cb78..fada6de9c487 100644 --- a/core/src/main/java/org/apache/iceberg/StaticTableOperations.java +++ b/core/src/main/java/org/apache/iceberg/StaticTableOperations.java @@ -15,18 +15,16 @@ * KIND, either express or implied. See the License for the * specific language governing permissions and limitations * under the License. 
- * */ - package org.apache.iceberg; import org.apache.iceberg.io.FileIO; import org.apache.iceberg.io.LocationProvider; /** - * TableOperations implementation that provides access to metadata for a Table at some point in time, using a - * table metadata location. It will never refer to a different Metadata object than the one it was created with - * and cannot be used to create or delete files. + * TableOperations implementation that provides access to metadata for a Table at some point in + * time, using a table metadata location. It will never refer to a different Metadata object than + * the one it was created with and cannot be used to create or delete files. */ public class StaticTableOperations implements TableOperations { private TableMetadata staticMetadata; @@ -34,14 +32,13 @@ public class StaticTableOperations implements TableOperations { private final FileIO io; private final LocationProvider locationProvider; - /** - * Creates a StaticTableOperations tied to a specific static version of the TableMetadata - */ + /** Creates a StaticTableOperations tied to a specific static version of the TableMetadata */ public StaticTableOperations(String metadataFileLocation, FileIO io) { this(metadataFileLocation, io, null); } - public StaticTableOperations(String metadataFileLocation, FileIO io, LocationProvider locationProvider) { + public StaticTableOperations( + String metadataFileLocation, FileIO io, LocationProvider locationProvider) { this.io = io; this.metadataFileLocation = metadataFileLocation; this.locationProvider = locationProvider; @@ -56,8 +53,9 @@ public TableMetadata current() { } /** - * StaticTableOperations works on the same version of TableMetadata, and it will never refer a different TableMetadata - * object than the one it was created with. + * StaticTableOperations works on the same version of TableMetadata, and it will never refer a + * different TableMetadata object than the one it was created with. + * * @return always {@link #current()}. */ @Override diff --git a/core/src/main/java/org/apache/iceberg/StaticTableScan.java b/core/src/main/java/org/apache/iceberg/StaticTableScan.java index c2247723f112..765f41a80a4c 100644 --- a/core/src/main/java/org/apache/iceberg/StaticTableScan.java +++ b/core/src/main/java/org/apache/iceberg/StaticTableScan.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg; import java.util.function.Function; @@ -25,20 +24,30 @@ class StaticTableScan extends BaseMetadataTableScan { private final Function buildTask; - StaticTableScan(TableOperations ops, Table table, Schema schema, MetadataTableType tableType, - Function buildTask) { + StaticTableScan( + TableOperations ops, + Table table, + Schema schema, + MetadataTableType tableType, + Function buildTask) { super(ops, table, schema, tableType); this.buildTask = buildTask; } - StaticTableScan(TableOperations ops, Table table, Schema schema, MetadataTableType tableType, - Function buildTask, TableScanContext context) { + StaticTableScan( + TableOperations ops, + Table table, + Schema schema, + MetadataTableType tableType, + Function buildTask, + TableScanContext context) { super(ops, table, schema, tableType, context); this.buildTask = buildTask; } @Override - protected TableScan newRefinedScan(TableOperations ops, Table table, Schema schema, TableScanContext context) { + protected TableScan newRefinedScan( + TableOperations ops, Table table, Schema schema, TableScanContext context) { return new StaticTableScan(ops, table, schema, tableType(), buildTask, context); } diff --git a/core/src/main/java/org/apache/iceberg/StreamingDelete.java b/core/src/main/java/org/apache/iceberg/StreamingDelete.java index b1327d5d2874..493c4e44c8ee 100644 --- a/core/src/main/java/org/apache/iceberg/StreamingDelete.java +++ b/core/src/main/java/org/apache/iceberg/StreamingDelete.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg; import org.apache.iceberg.exceptions.CommitFailedException; @@ -24,8 +23,9 @@ /** * {@link DeleteFiles Delete} implementation that avoids loading full manifests in memory. - *
<p> - * This implementation will attempt to commit 5 times before throwing {@link CommitFailedException}. + * + * <p>
This implementation will attempt to commit 5 times before throwing {@link + * CommitFailedException}. */ public class StreamingDelete extends MergingSnapshotProducer implements DeleteFiles { protected StreamingDelete(String tableName, TableOperations ops) { diff --git a/core/src/main/java/org/apache/iceberg/SystemProperties.java b/core/src/main/java/org/apache/iceberg/SystemProperties.java index 7ef8a20eb580..3d44b195ffe1 100644 --- a/core/src/main/java/org/apache/iceberg/SystemProperties.java +++ b/core/src/main/java/org/apache/iceberg/SystemProperties.java @@ -16,16 +16,12 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg; -/** - * Configuration properties that are controlled by Java system properties. - */ +/** Configuration properties that are controlled by Java system properties. */ public class SystemProperties { - private SystemProperties() { - } + private SystemProperties() {} /** * Sets the size of the worker pool. The worker pool limits the number of tasks concurrently @@ -34,9 +30,7 @@ private SystemProperties() { */ public static final String WORKER_THREAD_POOL_SIZE_PROP = "iceberg.worker.num-threads"; - /** - * Whether to use the shared worker pool when planning table scans. - */ + /** Whether to use the shared worker pool when planning table scans. */ public static final String SCAN_THREAD_POOL_ENABLED = "iceberg.scan.plan-in-worker-pool"; static boolean getBoolean(String systemProperty, boolean defaultValue) { diff --git a/core/src/main/java/org/apache/iceberg/TableMetadata.java b/core/src/main/java/org/apache/iceberg/TableMetadata.java index 8c1d8f5dbf41..0266e83bd553 100644 --- a/core/src/main/java/org/apache/iceberg/TableMetadata.java +++ b/core/src/main/java/org/apache/iceberg/TableMetadata.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg; import java.io.Serializable; @@ -45,9 +44,7 @@ import org.apache.iceberg.util.Pair; import org.apache.iceberg.util.PropertyUtil; -/** - * Metadata for a table. - */ +/** Metadata for a table. 
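As a usage sketch, the StreamingDelete path above is normally reached through Table.newDelete(); "table" is assumed to be an already-loaded table with a "day" partition column:

import org.apache.iceberg.Table;
import org.apache.iceberg.expressions.Expressions;

public class DeleteSketch {
  static void dropDay(Table table) {
    table
        .newDelete()
        .deleteFromRowFilter(Expressions.equal("day", "2021-12-01"))
        .commit(); // retried on conflicts before CommitFailedException is thrown
  }
}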
*/ public class TableMetadata implements Serializable { static final long INITIAL_SEQUENCE_NUMBER = 0; static final long INVALID_SEQUENCE_NUMBER = -1; @@ -59,24 +56,27 @@ public class TableMetadata implements Serializable { private static final long ONE_MINUTE = TimeUnit.MINUTES.toMillis(1); - public static TableMetadata newTableMetadata(Schema schema, - PartitionSpec spec, - SortOrder sortOrder, - String location, - Map properties) { - int formatVersion = PropertyUtil.propertyAsInt(properties, TableProperties.FORMAT_VERSION, - DEFAULT_TABLE_FORMAT_VERSION); - return newTableMetadata(schema, spec, sortOrder, location, unreservedProperties(properties), formatVersion); - } - - public static TableMetadata newTableMetadata(Schema schema, - PartitionSpec spec, - String location, - Map properties) { + public static TableMetadata newTableMetadata( + Schema schema, + PartitionSpec spec, + SortOrder sortOrder, + String location, + Map properties) { + int formatVersion = + PropertyUtil.propertyAsInt( + properties, TableProperties.FORMAT_VERSION, DEFAULT_TABLE_FORMAT_VERSION); + return newTableMetadata( + schema, spec, sortOrder, location, unreservedProperties(properties), formatVersion); + } + + public static TableMetadata newTableMetadata( + Schema schema, PartitionSpec spec, String location, Map properties) { SortOrder sortOrder = SortOrder.unsorted(); - int formatVersion = PropertyUtil.propertyAsInt(properties, TableProperties.FORMAT_VERSION, - DEFAULT_TABLE_FORMAT_VERSION); - return newTableMetadata(schema, spec, sortOrder, location, unreservedProperties(properties), formatVersion); + int formatVersion = + PropertyUtil.propertyAsInt( + properties, TableProperties.FORMAT_VERSION, DEFAULT_TABLE_FORMAT_VERSION); + return newTableMetadata( + schema, spec, sortOrder, location, unreservedProperties(properties), formatVersion); } private static Map unreservedProperties(Map rawProperties) { @@ -85,30 +85,32 @@ private static Map unreservedProperties(Map rawP .collect(Collectors.toMap(Map.Entry::getKey, Map.Entry::getValue)); } - static TableMetadata newTableMetadata(Schema schema, - PartitionSpec spec, - SortOrder sortOrder, - String location, - Map properties, - int formatVersion) { - Preconditions.checkArgument(properties.keySet().stream().noneMatch(TableProperties.RESERVED_PROPERTIES::contains), - "Table properties should not contain reserved properties, but got %s", properties); + static TableMetadata newTableMetadata( + Schema schema, + PartitionSpec spec, + SortOrder sortOrder, + String location, + Map properties, + int formatVersion) { + Preconditions.checkArgument( + properties.keySet().stream().noneMatch(TableProperties.RESERVED_PROPERTIES::contains), + "Table properties should not contain reserved properties, but got %s", + properties); // reassign all column ids to ensure consistency AtomicInteger lastColumnId = new AtomicInteger(0); - Schema freshSchema = TypeUtil.assignFreshIds(INITIAL_SCHEMA_ID, schema, lastColumnId::incrementAndGet); + Schema freshSchema = + TypeUtil.assignFreshIds(INITIAL_SCHEMA_ID, schema, lastColumnId::incrementAndGet); // rebuild the partition spec using the new column ids - PartitionSpec.Builder specBuilder = PartitionSpec.builderFor(freshSchema) - .withSpecId(INITIAL_SPEC_ID); + PartitionSpec.Builder specBuilder = + PartitionSpec.builderFor(freshSchema).withSpecId(INITIAL_SPEC_ID); for (PartitionField field : spec.fields()) { // look up the name of the source field in the old schema to get the new schema's id String sourceName = 
schema.findColumnName(field.sourceId()); // reassign all partition fields with fresh partition field Ids to ensure consistency specBuilder.add( - freshSchema.findField(sourceName).fieldId(), - field.name(), - field.transform().toString()); + freshSchema.findField(sourceName).fieldId(), field.name(), field.transform().toString()); } PartitionSpec freshSpec = specBuilder.build(); @@ -199,8 +201,7 @@ public boolean equals(Object other) { return false; } MetadataLogEntry that = (MetadataLogEntry) other; - return timestampMillis == that.timestampMillis && - java.util.Objects.equals(file, that.file); + return timestampMillis == that.timestampMillis && java.util.Objects.equals(file, that.file); } @Override @@ -245,36 +246,44 @@ public String toString() { private final List changes; @SuppressWarnings("checkstyle:CyclomaticComplexity") - TableMetadata(String metadataFileLocation, - int formatVersion, - String uuid, - String location, - long lastSequenceNumber, - long lastUpdatedMillis, - int lastColumnId, - int currentSchemaId, - List schemas, - int defaultSpecId, - List specs, - int lastAssignedPartitionId, - int defaultSortOrderId, - List sortOrders, - Map properties, - long currentSnapshotId, - List snapshots, - List snapshotLog, - List previousFiles, - Map refs, - List changes) { - Preconditions.checkArgument(specs != null && !specs.isEmpty(), "Partition specs cannot be null or empty"); - Preconditions.checkArgument(sortOrders != null && !sortOrders.isEmpty(), "Sort orders cannot be null or empty"); - Preconditions.checkArgument(formatVersion <= SUPPORTED_TABLE_FORMAT_VERSION, - "Unsupported format version: v%s", formatVersion); - Preconditions.checkArgument(formatVersion == 1 || uuid != null, - "UUID is required in format v%s", formatVersion); - Preconditions.checkArgument(formatVersion > 1 || lastSequenceNumber == 0, - "Sequence number must be 0 in v1: %s", lastSequenceNumber); - Preconditions.checkArgument(metadataFileLocation == null || changes.isEmpty(), + TableMetadata( + String metadataFileLocation, + int formatVersion, + String uuid, + String location, + long lastSequenceNumber, + long lastUpdatedMillis, + int lastColumnId, + int currentSchemaId, + List schemas, + int defaultSpecId, + List specs, + int lastAssignedPartitionId, + int defaultSortOrderId, + List sortOrders, + Map properties, + long currentSnapshotId, + List snapshots, + List snapshotLog, + List previousFiles, + Map refs, + List changes) { + Preconditions.checkArgument( + specs != null && !specs.isEmpty(), "Partition specs cannot be null or empty"); + Preconditions.checkArgument( + sortOrders != null && !sortOrders.isEmpty(), "Sort orders cannot be null or empty"); + Preconditions.checkArgument( + formatVersion <= SUPPORTED_TABLE_FORMAT_VERSION, + "Unsupported format version: v%s", + formatVersion); + Preconditions.checkArgument( + formatVersion == 1 || uuid != null, "UUID is required in format v%s", formatVersion); + Preconditions.checkArgument( + formatVersion > 1 || lastSequenceNumber == 0, + "Sequence number must be 0 in v1: %s", + lastSequenceNumber); + Preconditions.checkArgument( + metadataFileLocation == null || changes.isEmpty(), "Cannot create TableMetadata with a metadata location and changes"); this.metadataFileLocation = metadataFileLocation; @@ -321,7 +330,8 @@ public String toString() { // A tolerance helps us avoid failure for small clock skew lastUpdatedMillis - last.timestampMillis() >= -ONE_MINUTE, "Invalid update timestamp %s: before last snapshot log entry at %s", - lastUpdatedMillis, 
last.timestampMillis()); + lastUpdatedMillis, + last.timestampMillis()); } MetadataLogEntry previous = null; @@ -335,14 +345,15 @@ public String toString() { } previous = metadataEntry; } - // Make sure that this update's lastUpdatedMillis is > max(previousFile's timestamp) + // Make sure that this update's lastUpdatedMillis is > max(previousFile's timestamp) if (previous != null) { Preconditions.checkArgument( // commits can happen concurrently from different machines. // A tolerance helps us avoid failure for small clock skew lastUpdatedMillis - previous.timestampMillis >= -ONE_MINUTE, "Invalid update timestamp %s: before the latest metadata log entry timestamp %s", - lastUpdatedMillis, previous.timestampMillis); + lastUpdatedMillis, + previous.timestampMillis); } Preconditions.checkArgument( @@ -526,7 +537,8 @@ public TableMetadata replaceProperties(Map rawProperties) { } } - int newFormatVersion = PropertyUtil.propertyAsInt(rawProperties, TableProperties.FORMAT_VERSION, formatVersion); + int newFormatVersion = + PropertyUtil.propertyAsInt(rawProperties, TableProperties.FORMAT_VERSION, formatVersion); return new Builder(this) .setProperties(updated) @@ -536,46 +548,52 @@ public TableMetadata replaceProperties(Map rawProperties) { } private PartitionSpec reassignPartitionIds(PartitionSpec partitionSpec, TypeUtil.NextID nextID) { - PartitionSpec.Builder specBuilder = PartitionSpec.builderFor(partitionSpec.schema()) - .withSpecId(partitionSpec.specId()); + PartitionSpec.Builder specBuilder = + PartitionSpec.builderFor(partitionSpec.schema()).withSpecId(partitionSpec.specId()); if (formatVersion > 1) { // for v2 and later, reuse any existing field IDs, but reproduce the same spec - Map, Integer> transformToFieldId = specs.stream() - .flatMap(spec -> spec.fields().stream()) - .collect(Collectors.toMap( - field -> Pair.of(field.sourceId(), field.transform().toString()), - PartitionField::fieldId, - Math::max)); + Map, Integer> transformToFieldId = + specs.stream() + .flatMap(spec -> spec.fields().stream()) + .collect( + Collectors.toMap( + field -> Pair.of(field.sourceId(), field.transform().toString()), + PartitionField::fieldId, + Math::max)); for (PartitionField field : partitionSpec.fields()) { // reassign the partition field ids - int partitionFieldId = transformToFieldId.computeIfAbsent( - Pair.of(field.sourceId(), field.transform().toString()), k -> nextID.get()); - specBuilder.add( - field.sourceId(), - partitionFieldId, - field.name(), - field.transform()); + int partitionFieldId = + transformToFieldId.computeIfAbsent( + Pair.of(field.sourceId(), field.transform().toString()), k -> nextID.get()); + specBuilder.add(field.sourceId(), partitionFieldId, field.name(), field.transform()); } } else { - // for v1, preserve the existing spec and carry forward all fields, replacing missing fields with void + // for v1, preserve the existing spec and carry forward all fields, replacing missing fields + // with void Map, PartitionField> newFields = Maps.newLinkedHashMap(); for (PartitionField newField : partitionSpec.fields()) { newFields.put(Pair.of(newField.sourceId(), newField.transform().toString()), newField); } - List newFieldNames = newFields.values().stream().map(PartitionField::name).collect(Collectors.toList()); + List newFieldNames = + newFields.values().stream().map(PartitionField::name).collect(Collectors.toList()); for (PartitionField field : spec().fields()) { // ensure each field is either carried forward or replaced with void - PartitionField newField = 
newFields.remove(Pair.of(field.sourceId(), field.transform().toString())); + PartitionField newField = + newFields.remove(Pair.of(field.sourceId(), field.transform().toString())); if (newField != null) { // copy the new field with the existing field ID - specBuilder.add(newField.sourceId(), field.fieldId(), newField.name(), newField.transform()); + specBuilder.add( + newField.sourceId(), field.fieldId(), newField.name(), newField.transform()); } else { // Rename old void transforms that would otherwise conflict - String voidName = newFieldNames.contains(field.name()) ? field.name() + "_" + field.fieldId() : field.name(); + String voidName = + newFieldNames.contains(field.name()) + ? field.name() + "_" + field.fieldId() + : field.name(); specBuilder.add(field.sourceId(), field.fieldId(), voidName, Transforms.alwaysNull()); } } @@ -590,26 +608,36 @@ private PartitionSpec reassignPartitionIds(PartitionSpec partitionSpec, TypeUtil } // The caller is responsible to pass a updatedPartitionSpec with correct partition field IDs - public TableMetadata buildReplacement(Schema updatedSchema, PartitionSpec updatedPartitionSpec, - SortOrder updatedSortOrder, String newLocation, - Map updatedProperties) { - ValidationException.check(formatVersion > 1 || PartitionSpec.hasSequentialIds(updatedPartitionSpec), - "Spec does not use sequential IDs that are required in v1: %s", updatedPartitionSpec); + public TableMetadata buildReplacement( + Schema updatedSchema, + PartitionSpec updatedPartitionSpec, + SortOrder updatedSortOrder, + String newLocation, + Map updatedProperties) { + ValidationException.check( + formatVersion > 1 || PartitionSpec.hasSequentialIds(updatedPartitionSpec), + "Spec does not use sequential IDs that are required in v1: %s", + updatedPartitionSpec); AtomicInteger newLastColumnId = new AtomicInteger(lastColumnId); - Schema freshSchema = TypeUtil.assignFreshIds(updatedSchema, schema(), newLastColumnId::incrementAndGet); + Schema freshSchema = + TypeUtil.assignFreshIds(updatedSchema, schema(), newLastColumnId::incrementAndGet); - // rebuild the partition spec using the new column ids and reassign partition field ids to align with existing + // rebuild the partition spec using the new column ids and reassign partition field ids to align + // with existing // partition specs in the table - PartitionSpec freshSpec = reassignPartitionIds( - freshSpec(INITIAL_SPEC_ID, freshSchema, updatedPartitionSpec), - new AtomicInteger(lastAssignedPartitionId)::incrementAndGet); + PartitionSpec freshSpec = + reassignPartitionIds( + freshSpec(INITIAL_SPEC_ID, freshSchema, updatedPartitionSpec), + new AtomicInteger(lastAssignedPartitionId)::incrementAndGet); // rebuild the sort order using new column ids SortOrder freshSortOrder = freshSortOrder(INITIAL_SORT_ORDER_ID, freshSchema, updatedSortOrder); // check if there is format version override - int newFormatVersion = PropertyUtil.propertyAsInt(updatedProperties, TableProperties.FORMAT_VERSION, formatVersion); + int newFormatVersion = + PropertyUtil.propertyAsInt( + updatedProperties, TableProperties.FORMAT_VERSION, formatVersion); return new Builder(this) .upgradeFormatVersion(newFormatVersion) @@ -631,15 +659,16 @@ public TableMetadata upgradeToFormatVersion(int newFormatVersion) { } private static PartitionSpec updateSpecSchema(Schema schema, PartitionSpec partitionSpec) { - PartitionSpec.Builder specBuilder = PartitionSpec.builderFor(schema) - .withSpecId(partitionSpec.specId()); + PartitionSpec.Builder specBuilder = + 
PartitionSpec.builderFor(schema).withSpecId(partitionSpec.specId()); // add all the fields to the builder. IDs should not change. for (PartitionField field : partitionSpec.fields()) { specBuilder.add(field.sourceId(), field.fieldId(), field.name(), field.transform()); } - // build without validation because the schema may have changed in a way that makes this spec invalid. the spec + // build without validation because the schema may have changed in a way that makes this spec + // invalid. the spec // should still be preserved so that older metadata can be interpreted. return specBuilder.buildUnchecked(); } @@ -649,17 +678,18 @@ private static SortOrder updateSortOrderSchema(Schema schema, SortOrder sortOrde // add all the fields to the builder. IDs should not change. for (SortField field : sortOrder.fields()) { - builder.addSortField(field.transform(), field.sourceId(), field.direction(), field.nullOrder()); + builder.addSortField( + field.transform(), field.sourceId(), field.direction(), field.nullOrder()); } - // build without validation because the schema may have changed in a way that makes this order invalid. the order + // build without validation because the schema may have changed in a way that makes this order + // invalid. the order // should still be preserved so that older metadata can be interpreted. return builder.buildUnchecked(); } private static PartitionSpec freshSpec(int specId, Schema schema, PartitionSpec partitionSpec) { - UnboundPartitionSpec.Builder specBuilder = UnboundPartitionSpec.builder() - .withSpecId(specId); + UnboundPartitionSpec.Builder specBuilder = UnboundPartitionSpec.builder().withSpecId(specId); for (PartitionField field : partitionSpec.fields()) { // look up the name of the source field in the old schema to get the new schema's id @@ -687,21 +717,21 @@ private static SortOrder freshSortOrder(int orderId, Schema schema, SortOrder so // reassign all sort fields with fresh sort field IDs int newSourceId = schema.findField(sourceName).fieldId(); builder.addSortField( - field.transform().toString(), - newSourceId, - field.direction(), - field.nullOrder()); + field.transform().toString(), newSourceId, field.direction(), field.nullOrder()); } return builder.build().bind(schema); } - private static Map indexAndValidateSnapshots(List snapshots, long lastSequenceNumber) { + private static Map indexAndValidateSnapshots( + List snapshots, long lastSequenceNumber) { ImmutableMap.Builder builder = ImmutableMap.builder(); for (Snapshot snap : snapshots) { - ValidationException.check(snap.sequenceNumber() <= lastSequenceNumber, + ValidationException.check( + snap.sequenceNumber() <= lastSequenceNumber, "Invalid snapshot with sequence number %s greater than last sequence number %s", - snap.sequenceNumber(), lastSequenceNumber); + snap.sequenceNumber(), + lastSequenceNumber); builder.put(snap.snapshotId(), snap); } return builder.build(); @@ -731,22 +761,27 @@ private static Map indexSortOrders(List sortOrder return builder.build(); } - private static Map validateRefs(Long currentSnapshotId, - Map inputRefs, - Map snapshotsById) { + private static Map validateRefs( + Long currentSnapshotId, + Map inputRefs, + Map snapshotsById) { for (SnapshotRef ref : inputRefs.values()) { - Preconditions.checkArgument(snapshotsById.containsKey(ref.snapshotId()), - "Snapshot for reference %s does not exist in the existing snapshots list", ref); + Preconditions.checkArgument( + snapshotsById.containsKey(ref.snapshotId()), + "Snapshot for reference %s does not exist in the existing 
snapshots list", + ref); } SnapshotRef main = inputRefs.get(SnapshotRef.MAIN_BRANCH); if (currentSnapshotId != -1) { - Preconditions.checkArgument(main == null || currentSnapshotId == main.snapshotId(), + Preconditions.checkArgument( + main == null || currentSnapshotId == main.snapshotId(), "Current snapshot ID does not match main branch (%s != %s)", - currentSnapshotId, main != null ? main.snapshotId() : null); + currentSnapshotId, + main != null ? main.snapshotId() : null); } else { - Preconditions.checkArgument(main == null, - "Current snapshot is not set, but main branch exists: %s", main); + Preconditions.checkArgument( + main == null, "Current snapshot is not set, but main branch exists: %s", main); } return inputRefs; @@ -871,11 +906,16 @@ public Builder assignUUID() { } public Builder upgradeFormatVersion(int newFormatVersion) { - Preconditions.checkArgument(newFormatVersion <= SUPPORTED_TABLE_FORMAT_VERSION, + Preconditions.checkArgument( + newFormatVersion <= SUPPORTED_TABLE_FORMAT_VERSION, "Cannot upgrade table to unsupported format version: v%s (supported: v%s)", - newFormatVersion, SUPPORTED_TABLE_FORMAT_VERSION); - Preconditions.checkArgument(newFormatVersion >= formatVersion, - "Cannot downgrade v%s table to v%s", formatVersion, newFormatVersion); + newFormatVersion, + SUPPORTED_TABLE_FORMAT_VERSION); + Preconditions.checkArgument( + newFormatVersion >= formatVersion, + "Cannot downgrade v%s table to v%s", + formatVersion, + newFormatVersion); if (newFormatVersion == formatVersion) { return this; @@ -894,8 +934,8 @@ public Builder setCurrentSchema(Schema newSchema, int newLastColumnId) { public Builder setCurrentSchema(int schemaId) { if (schemaId == -1) { - ValidationException.check(lastAddedSchemaId != null, - "Cannot set last added schema: no schema has been added"); + ValidationException.check( + lastAddedSchemaId != null, "Cannot set last added schema: no schema has been added"); return setCurrentSchema(lastAddedSchemaId); } @@ -904,16 +944,18 @@ public Builder setCurrentSchema(int schemaId) { } Schema schema = schemasById.get(schemaId); - Preconditions.checkArgument(schema != null, "Cannot set current schema to unknown schema: %s", schemaId); + Preconditions.checkArgument( + schema != null, "Cannot set current schema to unknown schema: %s", schemaId); // rebuild all the partition specs and sort orders for the new current schema - this.specs = Lists.newArrayList(Iterables.transform(specs, - spec -> updateSpecSchema(schema, spec))); + this.specs = + Lists.newArrayList(Iterables.transform(specs, spec -> updateSpecSchema(schema, spec))); specsById.clear(); specsById.putAll(indexSpecs(specs)); - this.sortOrders = Lists.newArrayList(Iterables.transform(sortOrders, - order -> updateSortOrderSchema(schema, order))); + this.sortOrders = + Lists.newArrayList( + Iterables.transform(sortOrders, order -> updateSortOrderSchema(schema, order))); sortOrdersById.clear(); sortOrdersById.putAll(indexSortOrders(sortOrders)); @@ -941,7 +983,8 @@ public Builder setDefaultPartitionSpec(PartitionSpec spec) { public Builder setDefaultPartitionSpec(int specId) { if (specId == -1) { - ValidationException.check(lastAddedSpecId != null, "Cannot set last added spec: no spec has been added"); + ValidationException.check( + lastAddedSpecId != null, "Cannot set last added spec: no spec has been added"); return setDefaultPartitionSpec(lastAddedSpecId); } @@ -977,7 +1020,8 @@ public Builder setDefaultSortOrder(SortOrder order) { public Builder setDefaultSortOrder(int sortOrderId) { if (sortOrderId == 
-1) { - ValidationException.check(lastAddedOrderId != null, + ValidationException.check( + lastAddedOrderId != null, "Cannot set last added sort order: no sort order has been added"); return setDefaultSortOrder(lastAddedOrderId); } @@ -1012,17 +1056,23 @@ public Builder addSnapshot(Snapshot snapshot) { return this; } - ValidationException.check(!schemas.isEmpty(), "Attempting to add a snapshot before a schema is added"); - ValidationException.check(!specs.isEmpty(), "Attempting to add a snapshot before a partition spec is added"); - ValidationException.check(!sortOrders.isEmpty(), "Attempting to add a snapshot before a sort order is added"); + ValidationException.check( + !schemas.isEmpty(), "Attempting to add a snapshot before a schema is added"); + ValidationException.check( + !specs.isEmpty(), "Attempting to add a snapshot before a partition spec is added"); + ValidationException.check( + !sortOrders.isEmpty(), "Attempting to add a snapshot before a sort order is added"); - ValidationException.check(!snapshotsById.containsKey(snapshot.snapshotId()), + ValidationException.check( + !snapshotsById.containsKey(snapshot.snapshotId()), "Snapshot already exists for id: %s", snapshot.snapshotId()); - ValidationException.check(formatVersion == 1 || snapshot.sequenceNumber() > lastSequenceNumber, + ValidationException.check( + formatVersion == 1 || snapshot.sequenceNumber() > lastSequenceNumber, "Cannot add snapshot with sequence number %s older than last sequence number %s", - snapshot.sequenceNumber(), lastSequenceNumber); + snapshot.sequenceNumber(), + lastSequenceNumber); this.lastUpdatedMillis = snapshot.timestampMillis(); this.lastSequenceNumber = snapshot.sequenceNumber(); @@ -1047,7 +1097,8 @@ public Builder setBranchSnapshot(long snapshotId, String branch) { } Snapshot snapshot = snapshotsById.get(snapshotId); - ValidationException.check(snapshot != null, "Cannot set %s to unknown snapshot: %s", branch, snapshotId); + ValidationException.check( + snapshot != null, "Cannot set %s to unknown snapshot: %s", branch, snapshotId); setBranchSnapshotInternal(snapshot, branch); @@ -1062,7 +1113,8 @@ public Builder setRef(String name, SnapshotRef ref) { long snapshotId = ref.snapshotId(); Snapshot snapshot = snapshotsById.get(snapshotId); - ValidationException.check(snapshot != null, "Cannot set %s to unknown snapshot: %s", name, snapshotId); + ValidationException.check( + snapshot != null, "Cannot set %s to unknown snapshot: %s", name, snapshotId); if (isAddedSnapshot(snapshotId)) { this.lastUpdatedMillis = snapshot.timestampMillis(); } @@ -1077,8 +1129,14 @@ public Builder setRef(String name, SnapshotRef ref) { } refs.put(name, ref); - MetadataUpdate.SetSnapshotRef refUpdate = new MetadataUpdate.SetSnapshotRef( - name, ref.snapshotId(), ref.type(), ref.minSnapshotsToKeep(), ref.maxSnapshotAgeMs(), ref.maxRefAgeMs()); + MetadataUpdate.SetSnapshotRef refUpdate = + new MetadataUpdate.SetSnapshotRef( + name, + ref.snapshotId(), + ref.type(), + ref.minSnapshotsToKeep(), + ref.maxSnapshotAgeMs(), + ref.maxRefAgeMs()); changes.add(refUpdate); return this; } @@ -1119,12 +1177,14 @@ public Builder removeBranch(String branch) { } public Builder removeSnapshots(List snapshotsToRemove) { - Set idsToRemove = snapshotsToRemove.stream().map(Snapshot::snapshotId).collect(Collectors.toSet()); + Set idsToRemove = + snapshotsToRemove.stream().map(Snapshot::snapshotId).collect(Collectors.toSet()); return removeSnapshots(idsToRemove); } public Builder removeSnapshots(Collection idsToRemove) { - List 
retainedSnapshots = Lists.newArrayListWithExpectedSize(snapshots.size() - idsToRemove.size()); + List retainedSnapshots = + Lists.newArrayListWithExpectedSize(snapshots.size() - idsToRemove.size()); for (Snapshot snapshot : snapshots) { long snapshotId = snapshot.snapshotId(); if (idsToRemove.contains(snapshotId)) { @@ -1194,9 +1254,9 @@ public Builder setPreviousFileLocation(String previousFileLocation) { } private boolean hasChanges() { - return changes.size() != startingChangeCount || - (discardChanges && changes.size() > 0) || - metadataLocation != null; + return changes.size() != startingChangeCount + || (discardChanges && changes.size() > 0) + || metadataLocation != null; } public TableMetadata build() { @@ -1208,11 +1268,14 @@ public TableMetadata build() { this.lastUpdatedMillis = System.currentTimeMillis(); } - // when associated with a metadata file, table metadata must have no changes so that the metadata matches exactly - // what is in the metadata file, which does not store changes. metadata location with changes is inconsistent. + // when associated with a metadata file, table metadata must have no changes so that the + // metadata matches exactly + // what is in the metadata file, which does not store changes. metadata location with changes + // is inconsistent. Preconditions.checkArgument( changes.size() == 0 || discardChanges || metadataLocation == null, - "Cannot set metadata location with changes to table metadata: %s changes", changes.size()); + "Cannot set metadata location with changes to table metadata: %s changes", + changes.size()); Schema schema = schemasById.get(currentSchemaId); PartitionSpec.checkCompatibility(specsById.get(defaultSpecId), schema); @@ -1222,10 +1285,12 @@ public TableMetadata build() { if (base == null) { metadataHistory = Lists.newArrayList(); } else { - metadataHistory = addPreviousFile( - previousFiles, previousFileLocation, base.lastUpdatedMillis(), properties); + metadataHistory = + addPreviousFile( + previousFiles, previousFileLocation, base.lastUpdatedMillis(), properties); } - List newSnapshotLog = updateSnapshotLog(snapshotLog, snapshotsById, currentSnapshotId, changes); + List newSnapshotLog = + updateSnapshotLog(snapshotLog, snapshotsById, currentSnapshotId, changes); return new TableMetadata( metadataLocation, @@ -1248,21 +1313,26 @@ public TableMetadata build() { ImmutableList.copyOf(newSnapshotLog), ImmutableList.copyOf(metadataHistory), ImmutableMap.copyOf(refs), - discardChanges ? ImmutableList.of() : ImmutableList.copyOf(changes) - ); + discardChanges ? 
ImmutableList.of() : ImmutableList.copyOf(changes)); } private int addSchemaInternal(Schema schema, int newLastColumnId) { - Preconditions.checkArgument(newLastColumnId >= lastColumnId, - "Invalid last column ID: %s < %s (previous last column ID)", newLastColumnId, lastColumnId); + Preconditions.checkArgument( + newLastColumnId >= lastColumnId, + "Invalid last column ID: %s < %s (previous last column ID)", + newLastColumnId, + lastColumnId); int newSchemaId = reuseOrCreateNewSchemaId(schema); boolean schemaFound = schemasById.containsKey(newSchemaId); if (schemaFound && newLastColumnId == lastColumnId) { // the new spec and last column id is already current and no change is needed - // update lastAddedSchemaId if the schema was added in this set of changes (since it is now the last) - boolean isNewSchema = lastAddedSchemaId != null && - changes(MetadataUpdate.AddSchema.class).anyMatch(added -> added.schema().schemaId() == newSchemaId); + // update lastAddedSchemaId if the schema was added in this set of changes (since it is now + // the last) + boolean isNewSchema = + lastAddedSchemaId != null + && changes(MetadataUpdate.AddSchema.class) + .anyMatch(added -> added.schema().schemaId() == newSchemaId); this.lastAddedSchemaId = isNewSchema ? newSchemaId : null; return newSchemaId; } @@ -1304,20 +1374,26 @@ private int reuseOrCreateNewSchemaId(Schema newSchema) { private int addPartitionSpecInternal(PartitionSpec spec) { int newSpecId = reuseOrCreateNewSpecId(spec); if (specsById.containsKey(newSpecId)) { - // update lastAddedSpecId if the spec was added in this set of changes (since it is now the last) - boolean isNewSpec = lastAddedSpecId != null && - changes(MetadataUpdate.AddPartitionSpec.class).anyMatch(added -> added.spec().specId() == lastAddedSpecId); + // update lastAddedSpecId if the spec was added in this set of changes (since it is now the + // last) + boolean isNewSpec = + lastAddedSpecId != null + && changes(MetadataUpdate.AddPartitionSpec.class) + .anyMatch(added -> added.spec().specId() == lastAddedSpecId); this.lastAddedSpecId = isNewSpec ? 
newSpecId : null; return newSpecId; } Schema schema = schemasById.get(currentSchemaId); PartitionSpec.checkCompatibility(spec, schema); - ValidationException.check(formatVersion > 1 || PartitionSpec.hasSequentialIds(spec), - "Spec does not use sequential IDs that are required in v1: %s", spec); + ValidationException.check( + formatVersion > 1 || PartitionSpec.hasSequentialIds(spec), + "Spec does not use sequential IDs that are required in v1: %s", + spec); PartitionSpec newSpec = freshSpec(newSpecId, schema, spec); - this.lastAssignedPartitionId = Math.max(lastAssignedPartitionId, newSpec.lastAssignedFieldId()); + this.lastAssignedPartitionId = + Math.max(lastAssignedPartitionId, newSpec.lastAssignedFieldId()); specs.add(newSpec); specsById.put(newSpecId, newSpec); @@ -1345,10 +1421,12 @@ private int reuseOrCreateNewSpecId(PartitionSpec newSpec) { private int addSortOrderInternal(SortOrder order) { int newOrderId = reuseOrCreateNewSortOrderId(order); if (sortOrdersById.containsKey(newOrderId)) { - // update lastAddedOrderId if the order was added in this set of changes (since it is now the last) - boolean isNewOrder = lastAddedOrderId != null && - changes(MetadataUpdate.AddSortOrder.class) - .anyMatch(added -> added.sortOrder().orderId() == lastAddedOrderId); + // update lastAddedOrderId if the order was added in this set of changes (since it is now + // the last) + boolean isNewOrder = + lastAddedOrderId != null + && changes(MetadataUpdate.AddSortOrder.class) + .anyMatch(added -> added.sortOrder().orderId() == lastAddedOrderId); this.lastAddedOrderId = isNewOrder ? newOrderId : null; return newOrderId; } @@ -1402,9 +1480,11 @@ private void setBranchSnapshotInternal(Snapshot snapshot, String branch) { } } - ValidationException.check(formatVersion == 1 || snapshot.sequenceNumber() <= lastSequenceNumber, + ValidationException.check( + formatVersion == 1 || snapshot.sequenceNumber() <= lastSequenceNumber, "Last sequence number %s is less than existing snapshot sequence number %s", - lastSequenceNumber, snapshot.sequenceNumber()); + lastSequenceNumber, + snapshot.sequenceNumber()); SnapshotRef newRef; if (ref != null) { @@ -1417,19 +1497,27 @@ private void setBranchSnapshotInternal(Snapshot snapshot, String branch) { } private static List addPreviousFile( - List previousFiles, String previousFileLocation, long timestampMillis, + List previousFiles, + String previousFileLocation, + long timestampMillis, Map properties) { if (previousFileLocation == null) { return previousFiles; } - int maxSize = Math.max(1, PropertyUtil.propertyAsInt(properties, - TableProperties.METADATA_PREVIOUS_VERSIONS_MAX, TableProperties.METADATA_PREVIOUS_VERSIONS_MAX_DEFAULT)); + int maxSize = + Math.max( + 1, + PropertyUtil.propertyAsInt( + properties, + TableProperties.METADATA_PREVIOUS_VERSIONS_MAX, + TableProperties.METADATA_PREVIOUS_VERSIONS_MAX_DEFAULT)); List newMetadataLog; if (previousFiles.size() >= maxSize) { int removeIndex = previousFiles.size() - maxSize + 1; - newMetadataLog = Lists.newArrayList(previousFiles.subList(removeIndex, previousFiles.size())); + newMetadataLog = + Lists.newArrayList(previousFiles.subList(removeIndex, previousFiles.size())); } else { newMetadataLog = Lists.newArrayList(previousFiles); } @@ -1441,9 +1529,11 @@ private static List addPreviousFile( /** * Finds intermediate snapshots that have not been committed as the current snapshot. 
* - * @return a set of snapshot ids for all added snapshots that were later replaced as the current snapshot in changes + * @return a set of snapshot ids for all added snapshots that were later replaced as the current + * snapshot in changes */ - private static Set intermediateSnapshotIdSet(List changes, long currentSnapshotId) { + private static Set intermediateSnapshotIdSet( + List changes, long currentSnapshotId) { Set addedSnapshotIds = Sets.newHashSet(); Set intermediateSnapshotIds = Sets.newHashSet(); for (MetadataUpdate update : changes) { @@ -1454,8 +1544,9 @@ private static Set intermediateSnapshotIdSet(List changes, } else if (update instanceof MetadataUpdate.SetSnapshotRef) { MetadataUpdate.SetSnapshotRef setRef = (MetadataUpdate.SetSnapshotRef) update; long snapshotId = setRef.snapshotId(); - if (addedSnapshotIds.contains(snapshotId) && - SnapshotRef.MAIN_BRANCH.equals(setRef.name()) && snapshotId != currentSnapshotId) { + if (addedSnapshotIds.contains(snapshotId) + && SnapshotRef.MAIN_BRANCH.equals(setRef.name()) + && snapshotId != currentSnapshotId) { intermediateSnapshotIds.add(snapshotId); } } @@ -1465,14 +1556,20 @@ private static Set intermediateSnapshotIdSet(List changes, } private static List updateSnapshotLog( - List snapshotLog, Map snapshotsById, long currentSnapshotId, + List snapshotLog, + Map snapshotsById, + long currentSnapshotId, List changes) { // find intermediate snapshots to suppress incorrect entries in the snapshot log. // - // transactions can create snapshots that are never the current snapshot because several changes are combined - // by the transaction into one table metadata update. when each intermediate snapshot is added to table metadata, - // it is added to the snapshot log, assuming that it will be the current snapshot. when there are multiple - // snapshot updates, the log must be corrected by suppressing the intermediate snapshot entries. + // transactions can create snapshots that are never the current snapshot because several + // changes are combined + // by the transaction into one table metadata update. when each intermediate snapshot is added + // to table metadata, + // it is added to the snapshot log, assuming that it will be the current snapshot. when there + // are multiple + // snapshot updates, the log must be corrected by suppressing the intermediate snapshot + // entries. // // a snapshot is an intermediate snapshot if it was added but is not the current snapshot. 
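A sketch of how the intermediate snapshots described above arise: each operation inside a transaction adds a snapshot, but only the last one ends up current once the transaction commits. "table", "fileA", and "fileB" are assumed to exist:

import org.apache.iceberg.DataFile;
import org.apache.iceberg.Table;
import org.apache.iceberg.Transaction;

public class TransactionSketch {
  static void appendTwice(Table table, DataFile fileA, DataFile fileB) {
    Transaction txn = table.newTransaction();
    txn.newAppend().appendFile(fileA).commit(); // snapshot that becomes "intermediate"
    txn.newAppend().appendFile(fileB).commit(); // snapshot that becomes current
    txn.commitTransaction(); // one metadata update; the first snapshot is suppressed from the snapshot log
  }
}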
Set intermediateSnapshotIds = intermediateSnapshotIdSet(changes, currentSnapshotId); @@ -1481,7 +1578,8 @@ private static List updateSnapshotLog( List newSnapshotLog = Lists.newArrayList(); for (HistoryEntry logEntry : snapshotLog) { long snapshotId = logEntry.snapshotId(); - if (snapshotsById.containsKey(snapshotId) && !intermediateSnapshotIds.contains(snapshotId)) { + if (snapshotsById.containsKey(snapshotId) + && !intermediateSnapshotIds.contains(snapshotId)) { // copy the log entries that are still valid newSnapshotLog.add(logEntry); } else { @@ -1495,7 +1593,8 @@ private static List updateSnapshotLog( } if (snapshotsById.get(currentSnapshotId) != null) { - ValidationException.check(Iterables.getLast(newSnapshotLog).snapshotId() == currentSnapshotId, + ValidationException.check( + Iterables.getLast(newSnapshotLog).snapshotId() == currentSnapshotId, "Cannot set invalid snapshot log: latest entry is not the current snapshot"); } @@ -1508,9 +1607,7 @@ private boolean isAddedSnapshot(long snapshotId) { } private Stream changes(Class updateClass) { - return changes.stream() - .filter(updateClass::isInstance) - .map(updateClass::cast); + return changes.stream().filter(updateClass::isInstance).map(updateClass::cast); } } } diff --git a/core/src/main/java/org/apache/iceberg/TableMetadataParser.java b/core/src/main/java/org/apache/iceberg/TableMetadataParser.java index c167b08706a1..2abfba95c0b6 100644 --- a/core/src/main/java/org/apache/iceberg/TableMetadataParser.java +++ b/core/src/main/java/org/apache/iceberg/TableMetadataParser.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg; import com.fasterxml.jackson.core.JsonGenerator; @@ -64,8 +63,8 @@ public static Codec fromName(String codecName) { } public static Codec fromFileName(String fileName) { - Preconditions.checkArgument(fileName.contains(".metadata.json"), - "%s is not a valid metadata file", fileName); + Preconditions.checkArgument( + fileName.contains(".metadata.json"), "%s is not a valid metadata file", fileName); // we have to be backward-compatible with .metadata.json.gz files if (fileName.endsWith(".metadata.json.gz")) { return Codec.GZIP; @@ -79,8 +78,7 @@ public static Codec fromFileName(String fileName) { } } - private TableMetadataParser() { - } + private TableMetadataParser() {} // visible for testing static final String FORMAT_VERSION = "format-version"; @@ -121,7 +119,7 @@ public static void internalWrite( boolean isGzip = Codec.fromFileName(outputFile.location()) == Codec.GZIP; OutputStream stream = overwrite ? outputFile.createOrOverwrite() : outputFile.create(); try (OutputStream ou = isGzip ? new GZIPOutputStream(stream) : stream; - OutputStreamWriter writer = new OutputStreamWriter(ou, StandardCharsets.UTF_8)) { + OutputStreamWriter writer = new OutputStreamWriter(ou, StandardCharsets.UTF_8)) { JsonGenerator generator = JsonUtil.factory().createGenerator(writer); generator.useDefaultPrettyPrinter(); toJson(metadata, generator); @@ -168,7 +166,8 @@ public static void toJson(TableMetadata metadata, JsonGenerator generator) throw generator.writeNumberField(LAST_COLUMN_ID, metadata.lastColumnId()); // for older readers, continue writing the current schema as "schema". - // this is only needed for v1 because support for schemas and current-schema-id is required in v2 and later. + // this is only needed for v1 because support for schemas and current-schema-id is required in + // v2 and later. 
if (metadata.formatVersion() == 1) { generator.writeFieldName(SCHEMA); SchemaParser.toJson(metadata.schema(), generator); @@ -213,7 +212,8 @@ public static void toJson(TableMetadata metadata, JsonGenerator generator) throw } generator.writeEndObject(); - generator.writeNumberField(CURRENT_SNAPSHOT_ID, + generator.writeNumberField( + CURRENT_SNAPSHOT_ID, metadata.currentSnapshot() != null ? metadata.currentSnapshot().snapshotId() : -1); toJson(metadata.refs(), generator); @@ -245,7 +245,8 @@ public static void toJson(TableMetadata metadata, JsonGenerator generator) throw generator.writeEndObject(); } - private static void toJson(Map refs, JsonGenerator generator) throws IOException { + private static void toJson(Map refs, JsonGenerator generator) + throws IOException { generator.writeObjectFieldStart(REFS); for (Map.Entry refEntry : refs.entrySet()) { generator.writeFieldName(refEntry.getKey()); @@ -260,7 +261,8 @@ public static TableMetadata read(FileIO io, String path) { public static TableMetadata read(FileIO io, InputFile file) { Codec codec = Codec.fromFileName(file.location()); - try (InputStream is = codec == Codec.GZIP ? new GZIPInputStream(file.newStream()) : file.newStream()) { + try (InputStream is = + codec == Codec.GZIP ? new GZIPInputStream(file.newStream()) : file.newStream()) { return fromJson(io, file, JsonUtil.mapper().readValue(is, JsonNode.class)); } catch (IOException e) { throw new RuntimeIOException(e, "Failed to read file: %s", file); @@ -269,8 +271,8 @@ public static TableMetadata read(FileIO io, InputFile file) { /** * Read TableMetadata from a JSON string. - *
<p> - * The TableMetadata's metadata file location will be unset. + * + * <p>
The TableMetadata's metadata file location will be unset. * * @param io a FileIO used by {@link Snapshot} instances * @param json a JSON string of table metadata @@ -307,12 +309,14 @@ public static TableMetadata fromJson(JsonNode node) { @SuppressWarnings({"checkstyle:CyclomaticComplexity", "checkstyle:MethodLength"}) static TableMetadata fromJson(FileIO io, String metadataLocation, JsonNode node) { - Preconditions.checkArgument(node.isObject(), - "Cannot parse metadata from a non-object: %s", node); + Preconditions.checkArgument( + node.isObject(), "Cannot parse metadata from a non-object: %s", node); int formatVersion = JsonUtil.getInt(FORMAT_VERSION, node); - Preconditions.checkArgument(formatVersion <= TableMetadata.SUPPORTED_TABLE_FORMAT_VERSION, - "Cannot read unsupported version %s", formatVersion); + Preconditions.checkArgument( + formatVersion <= TableMetadata.SUPPORTED_TABLE_FORMAT_VERSION, + "Cannot read unsupported version %s", + formatVersion); String uuid = JsonUtil.getStringOrNull(TABLE_UUID, node); String location = JsonUtil.getString(LOCATION, node); @@ -330,8 +334,8 @@ static TableMetadata fromJson(FileIO io, String metadataLocation, JsonNode node) JsonNode schemaArray = node.get(SCHEMAS); if (schemaArray != null) { - Preconditions.checkArgument(schemaArray.isArray(), - "Cannot parse schemas from non-array: %s", schemaArray); + Preconditions.checkArgument( + schemaArray.isArray(), "Cannot parse schemas from non-array: %s", schemaArray); // current schema ID is required when the schema array is present currentSchemaId = JsonUtil.getInt(CURRENT_SCHEMA_ID, node); @@ -345,14 +349,18 @@ static TableMetadata fromJson(FileIO io, String metadataLocation, JsonNode node) builder.add(current); } - Preconditions.checkArgument(schema != null, - "Cannot find schema with %s=%s from %s", CURRENT_SCHEMA_ID, currentSchemaId, SCHEMAS); + Preconditions.checkArgument( + schema != null, + "Cannot find schema with %s=%s from %s", + CURRENT_SCHEMA_ID, + currentSchemaId, + SCHEMAS); schemas = builder.build(); } else { - Preconditions.checkArgument(formatVersion == 1, - "%s must exist in format v%s", SCHEMAS, formatVersion); + Preconditions.checkArgument( + formatVersion == 1, "%s must exist in format v%s", SCHEMAS, formatVersion); schema = SchemaParser.fromJson(node.get(SCHEMA)); currentSchemaId = schema.schemaId(); @@ -363,8 +371,8 @@ static TableMetadata fromJson(FileIO io, String metadataLocation, JsonNode node) List specs; int defaultSpecId; if (specArray != null) { - Preconditions.checkArgument(specArray.isArray(), - "Cannot parse partition specs from non-array: %s", specArray); + Preconditions.checkArgument( + specArray.isArray(), "Cannot parse partition specs from non-array: %s", specArray); // default spec ID is required when the spec array is present defaultSpecId = JsonUtil.getInt(DEFAULT_SPEC_ID, node); @@ -376,22 +384,27 @@ static TableMetadata fromJson(FileIO io, String metadataLocation, JsonNode node) specs = builder.build(); } else { - Preconditions.checkArgument(formatVersion == 1, - "%s must exist in format v%s", PARTITION_SPECS, formatVersion); + Preconditions.checkArgument( + formatVersion == 1, "%s must exist in format v%s", PARTITION_SPECS, formatVersion); // partition spec is required for older readers, but is always set to the default if the spec // array is set. it is only used to default the spec map is missing, indicating that the // table metadata was written by an older writer. 
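A sketch of the TableMetadataParser read path above, assuming a HadoopFileIO and an existing metadata file at a hypothetical local path; the codec is inferred from the file name:

import org.apache.hadoop.conf.Configuration;
import org.apache.iceberg.TableMetadata;
import org.apache.iceberg.TableMetadataParser;
import org.apache.iceberg.hadoop.HadoopFileIO;

public class MetadataReadSketch {
  public static void main(String[] args) {
    HadoopFileIO io = new HadoopFileIO(new Configuration());
    String location = "/tmp/warehouse/db/tbl/metadata/v3.metadata.json"; // hypothetical

    TableMetadata metadata = TableMetadataParser.read(io, location);
    System.out.println(TableMetadataParser.toJson(metadata));
  }
}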
defaultSpecId = TableMetadata.INITIAL_SPEC_ID; - specs = ImmutableList.of(PartitionSpecParser.fromJsonFields( - schema, TableMetadata.INITIAL_SPEC_ID, node.get(PARTITION_SPEC))); + specs = + ImmutableList.of( + PartitionSpecParser.fromJsonFields( + schema, TableMetadata.INITIAL_SPEC_ID, node.get(PARTITION_SPEC))); } Integer lastAssignedPartitionId = JsonUtil.getIntOrNull(LAST_PARTITION_ID, node); if (lastAssignedPartitionId == null) { - Preconditions.checkArgument(formatVersion == 1, - "%s must exist in format v%s", LAST_PARTITION_ID, formatVersion); - lastAssignedPartitionId = specs.stream().mapToInt(PartitionSpec::lastAssignedFieldId).max() - .orElse(PartitionSpec.unpartitioned().lastAssignedFieldId()); + Preconditions.checkArgument( + formatVersion == 1, "%s must exist in format v%s", LAST_PARTITION_ID, formatVersion); + lastAssignedPartitionId = + specs.stream() + .mapToInt(PartitionSpec::lastAssignedFieldId) + .max() + .orElse(PartitionSpec.unpartitioned().lastAssignedFieldId()); } // parse the sort orders @@ -406,8 +419,8 @@ static TableMetadata fromJson(FileIO io, String metadataLocation, JsonNode node) } sortOrders = sortOrdersBuilder.build(); } else { - Preconditions.checkArgument(formatVersion == 1, - "%s must exist in format v%s", SORT_ORDERS, formatVersion); + Preconditions.checkArgument( + formatVersion == 1, "%s must exist in format v%s", SORT_ORDERS, formatVersion); SortOrder defaultSortOrder = SortOrder.unsorted(); sortOrders = ImmutableList.of(defaultSortOrder); defaultSortOrderId = defaultSortOrder.orderId(); @@ -423,14 +436,16 @@ static TableMetadata fromJson(FileIO io, String metadataLocation, JsonNode node) refs = refsFromJson(node.get(REFS)); } else if (currentSnapshotId != -1) { // initialize the main branch if there are no refs - refs = ImmutableMap.of(SnapshotRef.MAIN_BRANCH, SnapshotRef.branchBuilder(currentSnapshotId).build()); + refs = + ImmutableMap.of( + SnapshotRef.MAIN_BRANCH, SnapshotRef.branchBuilder(currentSnapshotId).build()); } else { refs = ImmutableMap.of(); } JsonNode snapshotArray = node.get(SNAPSHOTS); - Preconditions.checkArgument(snapshotArray.isArray(), - "Cannot parse snapshots from non-array: %s", snapshotArray); + Preconditions.checkArgument( + snapshotArray.isArray(), "Cannot parse snapshots from non-array: %s", snapshotArray); List snapshots = Lists.newArrayListWithExpectedSize(snapshotArray.size()); Iterator iterator = snapshotArray.elements(); @@ -443,8 +458,10 @@ static TableMetadata fromJson(FileIO io, String metadataLocation, JsonNode node) Iterator logIterator = node.get(SNAPSHOT_LOG).elements(); while (logIterator.hasNext()) { JsonNode entryNode = logIterator.next(); - entries.add(new SnapshotLogEntry( - JsonUtil.getLong(TIMESTAMP_MS, entryNode), JsonUtil.getLong(SNAPSHOT_ID, entryNode))); + entries.add( + new SnapshotLogEntry( + JsonUtil.getLong(TIMESTAMP_MS, entryNode), + JsonUtil.getLong(SNAPSHOT_ID, entryNode))); } } @@ -453,15 +470,34 @@ static TableMetadata fromJson(FileIO io, String metadataLocation, JsonNode node) Iterator logIterator = node.get(METADATA_LOG).elements(); while (logIterator.hasNext()) { JsonNode entryNode = logIterator.next(); - metadataEntries.add(new MetadataLogEntry( - JsonUtil.getLong(TIMESTAMP_MS, entryNode), JsonUtil.getString(METADATA_FILE, entryNode))); + metadataEntries.add( + new MetadataLogEntry( + JsonUtil.getLong(TIMESTAMP_MS, entryNode), + JsonUtil.getString(METADATA_FILE, entryNode))); } } - return new TableMetadata(metadataLocation, formatVersion, uuid, location, - lastSequenceNumber, 
lastUpdatedMillis, lastAssignedColumnId, currentSchemaId, schemas, defaultSpecId, specs, - lastAssignedPartitionId, defaultSortOrderId, sortOrders, properties, currentSnapshotId, - snapshots, entries.build(), metadataEntries.build(), refs, + return new TableMetadata( + metadataLocation, + formatVersion, + uuid, + location, + lastSequenceNumber, + lastUpdatedMillis, + lastAssignedColumnId, + currentSchemaId, + schemas, + defaultSpecId, + specs, + lastAssignedPartitionId, + defaultSortOrderId, + sortOrders, + properties, + currentSnapshotId, + snapshots, + entries.build(), + metadataEntries.build(), + refs, ImmutableList.of() /* no changes from the file */); } @@ -473,7 +509,8 @@ private static Map refsFromJson(JsonNode refMap) { while (refNames.hasNext()) { String refName = refNames.next(); JsonNode refNode = refMap.get(refName); - Preconditions.checkArgument(refNode.isObject(), "Cannot parse ref %s from non-object: %s", refName, refMap); + Preconditions.checkArgument( + refNode.isObject(), "Cannot parse ref %s from non-object: %s", refName, refMap); SnapshotRef ref = SnapshotRefParser.fromJson(refNode); refsBuilder.put(refName, ref); } diff --git a/core/src/main/java/org/apache/iceberg/TableOperations.java b/core/src/main/java/org/apache/iceberg/TableOperations.java index d8c3695cfd8f..c00f42275eac 100644 --- a/core/src/main/java/org/apache/iceberg/TableOperations.java +++ b/core/src/main/java/org/apache/iceberg/TableOperations.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg; import org.apache.iceberg.encryption.EncryptionManager; @@ -24,9 +23,7 @@ import org.apache.iceberg.io.FileIO; import org.apache.iceberg.io.LocationProvider; -/** - * SPI interface to abstract table metadata access and updates. - */ +/** SPI interface to abstract table metadata access and updates. */ public interface TableOperations { /** @@ -45,32 +42,32 @@ public interface TableOperations { /** * Replace the base table metadata with a new version. - *
<p> - * This method should implement and document atomicity guarantees. - * <p> - * Implementations must check that the base metadata is current to avoid overwriting updates. + * + * <p>This method should implement and document atomicity guarantees. + * + * <p>Implementations must check that the base metadata is current to avoid overwriting updates. * Once the atomic commit operation succeeds, implementations must not perform any operations that * may fail because failure in this method cannot be distinguished from commit failure. - * <p> - * Implementations must throw a {@link org.apache.iceberg.exceptions.CommitStateUnknownException} - * in cases where it cannot be determined if the commit succeeded or failed. - * For example if a network partition causes the confirmation of the commit to be lost, - * the implementation should throw a CommitStateUnknownException. This is important because downstream users of - * this API need to know whether they can clean up the commit or not, if the state is unknown then it is not safe + * + * <p>
Implementations must throw a {@link + * org.apache.iceberg.exceptions.CommitStateUnknownException} in cases where it cannot be + * determined if the commit succeeded or failed. For example if a network partition causes the + * confirmation of the commit to be lost, the implementation should throw a + * CommitStateUnknownException. This is important because downstream users of this API need to + * know whether they can clean up the commit or not, if the state is unknown then it is not safe * to remove any files. All other exceptions will be treated as if the commit has failed. * - * @param base table metadata on which changes were based + * @param base table metadata on which changes were based * @param metadata new table metadata with updates */ void commit(TableMetadata base, TableMetadata metadata); - /** - * Returns a {@link FileIO} to read and write table data and metadata files. - */ + /** Returns a {@link FileIO} to read and write table data and metadata files. */ FileIO io(); /** - * Returns a {@link org.apache.iceberg.encryption.EncryptionManager} to encrypt and decrypt data files. + * Returns a {@link org.apache.iceberg.encryption.EncryptionManager} to encrypt and decrypt data + * files. */ default EncryptionManager encryption() { return new PlaintextEncryptionManager(); @@ -79,9 +76,9 @@ default EncryptionManager encryption() { /** * Given the name of a metadata file, obtain the full path of that file using an appropriate base * location of the implementation's choosing. - *
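A hypothetical sketch of the commit contract documented above, distinguishing a definite failure (CommitFailedException, safe to clean up) from an unknown outcome (CommitStateUnknownException, files must be kept). The MetadataStore interface and writeNewMetadata helper are invented for illustration:

import org.apache.iceberg.TableMetadata;
import org.apache.iceberg.TableOperations;
import org.apache.iceberg.exceptions.CommitFailedException;
import org.apache.iceberg.exceptions.CommitStateUnknownException;

abstract class ExampleTableOperations implements TableOperations {
  /** Hypothetical pointer store with compare-and-swap semantics. */
  interface MetadataStore {
    boolean compareAndSwap(String expectedLocation, String newLocation);
  }

  private final MetadataStore store;

  protected ExampleTableOperations(MetadataStore store) {
    this.store = store;
  }

  @Override
  public void commit(TableMetadata base, TableMetadata metadata) {
    String newLocation = writeNewMetadata(metadata); // write the new metadata file first

    boolean swapped;
    try {
      swapped = store.compareAndSwap(base.metadataFileLocation(), newLocation);
    } catch (RuntimeException e) {
      // e.g. a timeout: the swap may or may not have happened, so nothing can be cleaned up
      throw new CommitStateUnknownException(e);
    }

    if (!swapped) {
      // the pointer moved: this commit definitely did not apply and can be retried
      throw new CommitFailedException(
          "Base metadata %s is no longer current", base.metadataFileLocation());
    }
  }

  protected abstract String writeNewMetadata(TableMetadata metadata);
}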
<p> - * The file may not exist yet, in which case the path should be returned as if it were to be created - * by e.g. {@link FileIO#newOutputFile(String)}. + * + * <p>
The file may not exist yet, in which case the path should be returned as if it were to be + * created by e.g. {@link FileIO#newOutputFile(String)}. */ String metadataFileLocation(String fileName); @@ -93,12 +90,15 @@ default EncryptionManager encryption() { LocationProvider locationProvider(); /** - * Return a temporary {@link TableOperations} instance that uses configuration from uncommitted metadata. - *
<p> - * This is called by transactions when uncommitted table metadata should be used; for example, to create a metadata - * file location based on metadata in the transaction that has not been committed. - * <p> - * Transactions will not call {@link #refresh()} or {@link #commit(TableMetadata, TableMetadata)}. + * + * <p>This is called by transactions when uncommitted table metadata should be used; for example, + * to create a metadata file location based on metadata in the transaction that has not been + * committed. + * + * <p>
Transactions will not call {@link #refresh()} or {@link #commit(TableMetadata, + * TableMetadata)}. * * @param uncommittedMetadata uncommitted table metadata * @return a temporary table operations that behaves like the uncommitted metadata is current @@ -115,5 +115,4 @@ default TableOperations temp(TableMetadata uncommittedMetadata) { default long newSnapshotId() { return SnapshotIdGeneratorUtil.generateSnapshotID(); } - } diff --git a/core/src/main/java/org/apache/iceberg/TableProperties.java b/core/src/main/java/org/apache/iceberg/TableProperties.java index 3cc520faa5a5..a8ca36379e30 100644 --- a/core/src/main/java/org/apache/iceberg/TableProperties.java +++ b/core/src/main/java/org/apache/iceberg/TableProperties.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg; import java.util.Set; @@ -24,79 +23,64 @@ public class TableProperties { - private TableProperties() { - } + private TableProperties() {} /** * Reserved table property for table format version. - *

- * Iceberg will default a new table's format version to the latest stable and recommended version. - * This reserved property keyword allows users to override the Iceberg format version of the table metadata. - *

- * If this table property exists when creating a table, the table will use the specified format version. - * If a table updates this property, it will try to upgrade to the specified format version. - *

- * Note: incomplete or unstable versions cannot be selected using this property. + * + *

Iceberg will default a new table's format version to the latest stable and recommended + * version. This reserved property keyword allows users to override the Iceberg format version of + * the table metadata. + * + *

If this table property exists when creating a table, the table will use the specified format + * version. If a table updates this property, it will try to upgrade to the specified format + * version. + * + *

Note: incomplete or unstable versions cannot be selected using this property. */ public static final String FORMAT_VERSION = "format-version"; - /** - * Reserved table property for table UUID. - */ + /** Reserved table property for table UUID. */ public static final String UUID = "uuid"; - /** - * Reserved table property for the total number of snapshots. - */ + /** Reserved table property for the total number of snapshots. */ public static final String SNAPSHOT_COUNT = "snapshot-count"; - /** - * Reserved table property for current snapshot summary. - */ + /** Reserved table property for current snapshot summary. */ public static final String CURRENT_SNAPSHOT_SUMMARY = "current-snapshot-summary"; - /** - * Reserved table property for current snapshot id. - */ + /** Reserved table property for current snapshot id. */ public static final String CURRENT_SNAPSHOT_ID = "current-snapshot-id"; - /** - * Reserved table property for current snapshot timestamp. - */ + /** Reserved table property for current snapshot timestamp. */ public static final String CURRENT_SNAPSHOT_TIMESTAMP = "current-snapshot-timestamp-ms"; - /** - * Reserved table property for the JSON representation of current schema. - */ + /** Reserved table property for the JSON representation of current schema. */ public static final String CURRENT_SCHEMA = "current-schema"; - /** - * Reserved table property for the JSON representation of current(default) partition spec. - */ + /** Reserved table property for the JSON representation of current(default) partition spec. */ public static final String DEFAULT_PARTITION_SPEC = "default-partition-spec"; - /** - * Reserved table property for the JSON representation of current(default) sort order. - */ + /** Reserved table property for the JSON representation of current(default) sort order. */ public static final String DEFAULT_SORT_ORDER = "default-sort-order"; /** * Reserved Iceberg table properties list. - *
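As a usage illustration of the reserved format-version property described above (not something this patch introduces), the version can be pinned when a table is created and later raised through a property update. The catalog, schema, and identifier below are assumed to be defined by the caller.

// Pin the format version at create time; reserved properties are interpreted, not persisted.
Map<String, String> props = ImmutableMap.of(TableProperties.FORMAT_VERSION, "1");
Table table =
    catalog.createTable(
        TableIdentifier.of("db", "events"), schema, PartitionSpec.unpartitioned(), props);

// Request an upgrade on an existing table by updating the same reserved property.
table.updateProperties().set(TableProperties.FORMAT_VERSION, "2").commit();

Because format-version is part of RESERVED_PROPERTIES, it is consumed while the table metadata is written and does not show up in table.properties() afterwards, which is what the note about reserved properties not being persisted refers to.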

- * Reserved table properties are only used to control behaviors when creating or updating a table. - * The value of these properties are not persisted as a part of the table metadata. + * + *

Reserved table properties are only used to control behaviors when creating or updating a + * table. The value of these properties are not persisted as a part of the table metadata. */ - public static final Set RESERVED_PROPERTIES = ImmutableSet.of( - FORMAT_VERSION, - UUID, - SNAPSHOT_COUNT, - CURRENT_SNAPSHOT_ID, - CURRENT_SNAPSHOT_SUMMARY, - CURRENT_SNAPSHOT_TIMESTAMP, - CURRENT_SCHEMA, - DEFAULT_PARTITION_SPEC, - DEFAULT_SORT_ORDER - ); + public static final Set RESERVED_PROPERTIES = + ImmutableSet.of( + FORMAT_VERSION, + UUID, + SNAPSHOT_COUNT, + CURRENT_SNAPSHOT_ID, + CURRENT_SNAPSHOT_SUMMARY, + CURRENT_SNAPSHOT_TIMESTAMP, + CURRENT_SCHEMA, + DEFAULT_PARTITION_SPEC, + DEFAULT_SORT_ORDER); public static final String COMMIT_NUM_RETRIES = "commit.retry.num-retries"; public static final int COMMIT_NUM_RETRIES_DEFAULT = 4; @@ -119,8 +103,10 @@ private TableProperties() { public static final String COMMIT_STATUS_CHECKS_MAX_WAIT_MS = "commit.status-check.max-wait-ms"; public static final long COMMIT_STATUS_CHECKS_MAX_WAIT_MS_DEFAULT = 60 * 1000; // 1 minute - public static final String COMMIT_STATUS_CHECKS_TOTAL_WAIT_MS = "commit.status-check.total-timeout-ms"; - public static final long COMMIT_STATUS_CHECKS_TOTAL_WAIT_MS_DEFAULT = 30 * 60 * 1000; // 30 minutes + public static final String COMMIT_STATUS_CHECKS_TOTAL_WAIT_MS = + "commit.status-check.total-timeout-ms"; + public static final long COMMIT_STATUS_CHECKS_TOTAL_WAIT_MS_DEFAULT = + 30 * 60 * 1000; // 30 minutes public static final String MANIFEST_TARGET_SIZE_BYTES = "commit.manifest.target-size-bytes"; public static final long MANIFEST_TARGET_SIZE_BYTES_DEFAULT = 8 * 1024 * 1024; // 8 MB @@ -136,15 +122,18 @@ private TableProperties() { public static final String DEFAULT_FILE_FORMAT_DEFAULT = "parquet"; public static final String PARQUET_ROW_GROUP_SIZE_BYTES = "write.parquet.row-group-size-bytes"; - public static final String DELETE_PARQUET_ROW_GROUP_SIZE_BYTES = "write.delete.parquet.row-group-size-bytes"; + public static final String DELETE_PARQUET_ROW_GROUP_SIZE_BYTES = + "write.delete.parquet.row-group-size-bytes"; public static final int PARQUET_ROW_GROUP_SIZE_BYTES_DEFAULT = 128 * 1024 * 1024; // 128 MB public static final String PARQUET_PAGE_SIZE_BYTES = "write.parquet.page-size-bytes"; - public static final String DELETE_PARQUET_PAGE_SIZE_BYTES = "write.delete.parquet.page-size-bytes"; + public static final String DELETE_PARQUET_PAGE_SIZE_BYTES = + "write.delete.parquet.page-size-bytes"; public static final int PARQUET_PAGE_SIZE_BYTES_DEFAULT = 1024 * 1024; // 1 MB public static final String PARQUET_DICT_SIZE_BYTES = "write.parquet.dict-size-bytes"; - public static final String DELETE_PARQUET_DICT_SIZE_BYTES = "write.delete.parquet.dict-size-bytes"; + public static final String DELETE_PARQUET_DICT_SIZE_BYTES = + "write.delete.parquet.dict-size-bytes"; public static final int PARQUET_DICT_SIZE_BYTES_DEFAULT = 2 * 1024 * 1024; // 2 MB public static final String PARQUET_COMPRESSION = "write.parquet.compression-codec"; @@ -152,7 +141,8 @@ private TableProperties() { public static final String PARQUET_COMPRESSION_DEFAULT = "gzip"; public static final String PARQUET_COMPRESSION_LEVEL = "write.parquet.compression-level"; - public static final String DELETE_PARQUET_COMPRESSION_LEVEL = "write.delete.parquet.compression-level"; + public static final String DELETE_PARQUET_COMPRESSION_LEVEL = + "write.delete.parquet.compression-level"; public static final String PARQUET_COMPRESSION_LEVEL_DEFAULT = null; public static final String 
PARQUET_ROW_GROUP_CHECK_MIN_RECORD_COUNT = @@ -167,10 +157,12 @@ private TableProperties() { "write.delete.parquet.row-group-check-max-record-count"; public static final int PARQUET_ROW_GROUP_CHECK_MAX_RECORD_COUNT_DEFAULT = 10000; - public static final String PARQUET_BLOOM_FILTER_MAX_BYTES = "write.parquet.bloom-filter-max-bytes"; + public static final String PARQUET_BLOOM_FILTER_MAX_BYTES = + "write.parquet.bloom-filter-max-bytes"; public static final int PARQUET_BLOOM_FILTER_MAX_BYTES_DEFAULT = 1024 * 1024; - public static final String PARQUET_BLOOM_FILTER_COLUMN_ENABLED_PREFIX = "write.parquet.bloom-filter-enabled.column."; + public static final String PARQUET_BLOOM_FILTER_COLUMN_ENABLED_PREFIX = + "write.parquet.bloom-filter-enabled.column."; public static final String AVRO_COMPRESSION = "write.avro.compression-codec"; public static final String DELETE_AVRO_COMPRESSION = "write.delete.avro.compression-codec"; @@ -197,7 +189,8 @@ private TableProperties() { public static final String ORC_COMPRESSION_DEFAULT = "zlib"; public static final String ORC_COMPRESSION_STRATEGY = "write.orc.compression-strategy"; - public static final String DELETE_ORC_COMPRESSION_STRATEGY = "write.delete.orc.compression-strategy"; + public static final String DELETE_ORC_COMPRESSION_STRATEGY = + "write.delete.orc.compression-strategy"; public static final String ORC_COMPRESSION_STRATEGY_DEFAULT = "speed"; public static final String SPLIT_SIZE = "read.split.target-size"; @@ -227,17 +220,12 @@ private TableProperties() { public static final String OBJECT_STORE_ENABLED = "write.object-storage.enabled"; public static final boolean OBJECT_STORE_ENABLED_DEFAULT = false; - /** - * @deprecated Use {@link #WRITE_DATA_LOCATION} instead. - */ - @Deprecated - public static final String OBJECT_STORE_PATH = "write.object-storage.path"; + /** @deprecated Use {@link #WRITE_DATA_LOCATION} instead. */ + @Deprecated public static final String OBJECT_STORE_PATH = "write.object-storage.path"; public static final String WRITE_LOCATION_PROVIDER_IMPL = "write.location-provider.impl"; - /** - * @deprecated Use {@link #WRITE_DATA_LOCATION} instead. - */ + /** @deprecated Use {@link #WRITE_DATA_LOCATION} instead. */ @Deprecated public static final String WRITE_FOLDER_STORAGE_LOCATION = "write.folder-storage.path"; @@ -260,11 +248,13 @@ private TableProperties() { public static final String METADATA_COMPRESSION = "write.metadata.compression-codec"; public static final String METADATA_COMPRESSION_DEFAULT = "none"; - public static final String METADATA_PREVIOUS_VERSIONS_MAX = "write.metadata.previous-versions-max"; + public static final String METADATA_PREVIOUS_VERSIONS_MAX = + "write.metadata.previous-versions-max"; public static final int METADATA_PREVIOUS_VERSIONS_MAX_DEFAULT = 100; // This enables to delete the oldest metadata file after commit. 
- public static final String METADATA_DELETE_AFTER_COMMIT_ENABLED = "write.metadata.delete-after-commit.enabled"; + public static final String METADATA_DELETE_AFTER_COMMIT_ENABLED = + "write.metadata.delete-after-commit.enabled"; public static final boolean METADATA_DELETE_AFTER_COMMIT_ENABLED_DEFAULT = false; public static final String METRICS_MAX_INFERRED_COLUMN_DEFAULTS = @@ -292,7 +282,8 @@ private TableProperties() { public static final String SPARK_WRITE_ACCEPT_ANY_SCHEMA = "write.spark.accept-any-schema"; public static final boolean SPARK_WRITE_ACCEPT_ANY_SCHEMA_DEFAULT = false; - public static final String SNAPSHOT_ID_INHERITANCE_ENABLED = "compatibility.snapshot-id-inheritance.enabled"; + public static final String SNAPSHOT_ID_INHERITANCE_ENABLED = + "compatibility.snapshot-id-inheritance.enabled"; public static final boolean SNAPSHOT_ID_INHERITANCE_ENABLED_DEFAULT = false; public static final String ENGINE_HIVE_ENABLED = "engine.hive.enabled"; @@ -338,15 +329,17 @@ private TableProperties() { public static final String MERGE_MODE_DEFAULT = "copy-on-write"; /** - * @deprecated will be removed in 0.14.0, the cardinality check is always performed starting from 0.13.0. + * @deprecated will be removed in 0.14.0, the cardinality check is always performed starting from + * 0.13.0. */ @Deprecated - public static final String MERGE_CARDINALITY_CHECK_ENABLED = "write.merge.cardinality-check.enabled"; + public static final String MERGE_CARDINALITY_CHECK_ENABLED = + "write.merge.cardinality-check.enabled"; /** - * @deprecated will be removed in 0.14.0, the cardinality check is always performed starting from 0.13.0. + * @deprecated will be removed in 0.14.0, the cardinality check is always performed starting from + * 0.13.0. */ - @Deprecated - public static final boolean MERGE_CARDINALITY_CHECK_ENABLED_DEFAULT = true; + @Deprecated public static final boolean MERGE_CARDINALITY_CHECK_ENABLED_DEFAULT = true; public static final String MERGE_DISTRIBUTION_MODE = "write.merge.distribution-mode"; diff --git a/core/src/main/java/org/apache/iceberg/TableScanContext.java b/core/src/main/java/org/apache/iceberg/TableScanContext.java index cde786b1587a..beca0db87c72 100644 --- a/core/src/main/java/org/apache/iceberg/TableScanContext.java +++ b/core/src/main/java/org/apache/iceberg/TableScanContext.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg; import java.util.Collection; @@ -29,9 +28,7 @@ import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap; import org.apache.iceberg.util.ThreadPools; -/** - * Context object with optional arguments for a TableScan. - */ +/** Context object with optional arguments for a TableScan. 
*/ final class TableScanContext { private final Long snapshotId; private final Expression rowFilter; @@ -61,11 +58,19 @@ final class TableScanContext { this.fromSnapshotInclusive = false; } - private TableScanContext(Long snapshotId, Expression rowFilter, boolean ignoreResiduals, - boolean caseSensitive, boolean colStats, Schema projectedSchema, - Collection selectedColumns, ImmutableMap options, - Long fromSnapshotId, Long toSnapshotId, ExecutorService planExecutor, - boolean fromSnapshotInclusive) { + private TableScanContext( + Long snapshotId, + Expression rowFilter, + boolean ignoreResiduals, + boolean caseSensitive, + boolean colStats, + Schema projectedSchema, + Collection selectedColumns, + ImmutableMap options, + Long fromSnapshotId, + Long toSnapshotId, + ExecutorService planExecutor, + boolean fromSnapshotInclusive) { this.snapshotId = snapshotId; this.rowFilter = rowFilter; this.ignoreResiduals = ignoreResiduals; @@ -85,9 +90,19 @@ Long snapshotId() { } TableScanContext useSnapshotId(Long scanSnapshotId) { - return new TableScanContext(scanSnapshotId, rowFilter, ignoreResiduals, - caseSensitive, colStats, projectedSchema, selectedColumns, options, fromSnapshotId, toSnapshotId, - planExecutor, fromSnapshotInclusive); + return new TableScanContext( + scanSnapshotId, + rowFilter, + ignoreResiduals, + caseSensitive, + colStats, + projectedSchema, + selectedColumns, + options, + fromSnapshotId, + toSnapshotId, + planExecutor, + fromSnapshotInclusive); } Expression rowFilter() { @@ -95,9 +110,19 @@ Expression rowFilter() { } TableScanContext filterRows(Expression filter) { - return new TableScanContext(snapshotId, filter, ignoreResiduals, - caseSensitive, colStats, projectedSchema, selectedColumns, options, fromSnapshotId, toSnapshotId, - planExecutor, fromSnapshotInclusive); + return new TableScanContext( + snapshotId, + filter, + ignoreResiduals, + caseSensitive, + colStats, + projectedSchema, + selectedColumns, + options, + fromSnapshotId, + toSnapshotId, + planExecutor, + fromSnapshotInclusive); } boolean ignoreResiduals() { @@ -105,9 +130,19 @@ boolean ignoreResiduals() { } TableScanContext ignoreResiduals(boolean shouldIgnoreResiduals) { - return new TableScanContext(snapshotId, rowFilter, shouldIgnoreResiduals, - caseSensitive, colStats, projectedSchema, selectedColumns, options, fromSnapshotId, toSnapshotId, - planExecutor, fromSnapshotInclusive); + return new TableScanContext( + snapshotId, + rowFilter, + shouldIgnoreResiduals, + caseSensitive, + colStats, + projectedSchema, + selectedColumns, + options, + fromSnapshotId, + toSnapshotId, + planExecutor, + fromSnapshotInclusive); } boolean caseSensitive() { @@ -115,9 +150,19 @@ boolean caseSensitive() { } TableScanContext setCaseSensitive(boolean isCaseSensitive) { - return new TableScanContext(snapshotId, rowFilter, ignoreResiduals, - isCaseSensitive, colStats, projectedSchema, selectedColumns, options, fromSnapshotId, toSnapshotId, - planExecutor, fromSnapshotInclusive); + return new TableScanContext( + snapshotId, + rowFilter, + ignoreResiduals, + isCaseSensitive, + colStats, + projectedSchema, + selectedColumns, + options, + fromSnapshotId, + toSnapshotId, + planExecutor, + fromSnapshotInclusive); } boolean returnColumnStats() { @@ -125,9 +170,19 @@ boolean returnColumnStats() { } TableScanContext shouldReturnColumnStats(boolean returnColumnStats) { - return new TableScanContext(snapshotId, rowFilter, ignoreResiduals, - caseSensitive, returnColumnStats, projectedSchema, selectedColumns, options, fromSnapshotId, 
toSnapshotId, - planExecutor, fromSnapshotInclusive); + return new TableScanContext( + snapshotId, + rowFilter, + ignoreResiduals, + caseSensitive, + returnColumnStats, + projectedSchema, + selectedColumns, + options, + fromSnapshotId, + toSnapshotId, + planExecutor, + fromSnapshotInclusive); } Collection selectedColumns() { @@ -135,10 +190,21 @@ Collection selectedColumns() { } TableScanContext selectColumns(Collection columns) { - Preconditions.checkState(projectedSchema == null, "Cannot select columns when projection schema is set"); - return new TableScanContext(snapshotId, rowFilter, ignoreResiduals, - caseSensitive, colStats, null, columns, options, fromSnapshotId, toSnapshotId, - planExecutor, fromSnapshotInclusive); + Preconditions.checkState( + projectedSchema == null, "Cannot select columns when projection schema is set"); + return new TableScanContext( + snapshotId, + rowFilter, + ignoreResiduals, + caseSensitive, + colStats, + null, + columns, + options, + fromSnapshotId, + toSnapshotId, + planExecutor, + fromSnapshotInclusive); } Schema projectedSchema() { @@ -146,10 +212,21 @@ Schema projectedSchema() { } TableScanContext project(Schema schema) { - Preconditions.checkState(selectedColumns == null, "Cannot set projection schema when columns are selected"); - return new TableScanContext(snapshotId, rowFilter, ignoreResiduals, - caseSensitive, colStats, schema, null, options, fromSnapshotId, toSnapshotId, - planExecutor, fromSnapshotInclusive); + Preconditions.checkState( + selectedColumns == null, "Cannot set projection schema when columns are selected"); + return new TableScanContext( + snapshotId, + rowFilter, + ignoreResiduals, + caseSensitive, + colStats, + schema, + null, + options, + fromSnapshotId, + toSnapshotId, + planExecutor, + fromSnapshotInclusive); } Map options() { @@ -160,9 +237,19 @@ TableScanContext withOption(String property, String value) { ImmutableMap.Builder builder = ImmutableMap.builder(); builder.putAll(options); builder.put(property, value); - return new TableScanContext(snapshotId, rowFilter, ignoreResiduals, - caseSensitive, colStats, projectedSchema, selectedColumns, builder.build(), fromSnapshotId, toSnapshotId, - planExecutor, fromSnapshotInclusive); + return new TableScanContext( + snapshotId, + rowFilter, + ignoreResiduals, + caseSensitive, + colStats, + projectedSchema, + selectedColumns, + builder.build(), + fromSnapshotId, + toSnapshotId, + planExecutor, + fromSnapshotInclusive); } Long fromSnapshotId() { @@ -170,15 +257,35 @@ Long fromSnapshotId() { } TableScanContext fromSnapshotIdExclusive(long id) { - return new TableScanContext(snapshotId, rowFilter, ignoreResiduals, - caseSensitive, colStats, projectedSchema, selectedColumns, options, id, toSnapshotId, - planExecutor, false); + return new TableScanContext( + snapshotId, + rowFilter, + ignoreResiduals, + caseSensitive, + colStats, + projectedSchema, + selectedColumns, + options, + id, + toSnapshotId, + planExecutor, + false); } TableScanContext fromSnapshotIdInclusive(long id) { - return new TableScanContext(snapshotId, rowFilter, ignoreResiduals, - caseSensitive, colStats, projectedSchema, selectedColumns, options, id, toSnapshotId, - planExecutor, true); + return new TableScanContext( + snapshotId, + rowFilter, + ignoreResiduals, + caseSensitive, + colStats, + projectedSchema, + selectedColumns, + options, + id, + toSnapshotId, + planExecutor, + true); } boolean fromSnapshotInclusive() { @@ -190,9 +297,19 @@ Long toSnapshotId() { } TableScanContext toSnapshotId(long id) { - return new 
TableScanContext(snapshotId, rowFilter, ignoreResiduals, - caseSensitive, colStats, projectedSchema, selectedColumns, options, fromSnapshotId, id, - planExecutor, fromSnapshotInclusive); + return new TableScanContext( + snapshotId, + rowFilter, + ignoreResiduals, + caseSensitive, + colStats, + projectedSchema, + selectedColumns, + options, + fromSnapshotId, + id, + planExecutor, + fromSnapshotInclusive); } ExecutorService planExecutor() { @@ -204,8 +321,18 @@ boolean planWithCustomizedExecutor() { } TableScanContext planWith(ExecutorService executor) { - return new TableScanContext(snapshotId, rowFilter, ignoreResiduals, - caseSensitive, colStats, projectedSchema, selectedColumns, options, fromSnapshotId, toSnapshotId, - executor, fromSnapshotInclusive); + return new TableScanContext( + snapshotId, + rowFilter, + ignoreResiduals, + caseSensitive, + colStats, + projectedSchema, + selectedColumns, + options, + fromSnapshotId, + toSnapshotId, + executor, + fromSnapshotInclusive); } } diff --git a/core/src/main/java/org/apache/iceberg/Transactions.java b/core/src/main/java/org/apache/iceberg/Transactions.java index d624fbdc5bed..32d3dedfe8f4 100644 --- a/core/src/main/java/org/apache/iceberg/Transactions.java +++ b/core/src/main/java/org/apache/iceberg/Transactions.java @@ -16,28 +16,28 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg; import org.apache.iceberg.BaseTransaction.TransactionType; import org.apache.iceberg.relocated.com.google.common.base.Preconditions; public final class Transactions { - private Transactions() { - } + private Transactions() {} public static Transaction createOrReplaceTableTransaction( String tableName, TableOperations ops, TableMetadata start) { return new BaseTransaction(tableName, ops, TransactionType.CREATE_OR_REPLACE_TABLE, start); } - public static Transaction replaceTableTransaction(String tableName, TableOperations ops, TableMetadata start) { + public static Transaction replaceTableTransaction( + String tableName, TableOperations ops, TableMetadata start) { return new BaseTransaction(tableName, ops, TransactionType.REPLACE_TABLE, start); } - public static Transaction createTableTransaction(String tableName, TableOperations ops, TableMetadata start) { - Preconditions.checkArgument(ops.current() == null, - "Cannot start create table transaction: table already exists"); + public static Transaction createTableTransaction( + String tableName, TableOperations ops, TableMetadata start) { + Preconditions.checkArgument( + ops.current() == null, "Cannot start create table transaction: table already exists"); return new BaseTransaction(tableName, ops, TransactionType.CREATE_TABLE, start); } diff --git a/core/src/main/java/org/apache/iceberg/UpdateSnapshotReferencesOperation.java b/core/src/main/java/org/apache/iceberg/UpdateSnapshotReferencesOperation.java index 070d80a06f07..b87bac2f014f 100644 --- a/core/src/main/java/org/apache/iceberg/UpdateSnapshotReferencesOperation.java +++ b/core/src/main/java/org/apache/iceberg/UpdateSnapshotReferencesOperation.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg; import java.util.Map; @@ -25,8 +24,8 @@ import org.apache.iceberg.util.SnapshotUtil; /** - * ToDo: Add SetSnapshotOperation operations such as setCurrentSnapshot, rollBackTime, rollbackTo - * to this class so that we can support those operations for refs. 
+ * ToDo: Add SetSnapshotOperation operations such as setCurrentSnapshot, rollBackTime, rollbackTo to + * this class so that we can support those operations for refs. */ class UpdateSnapshotReferencesOperation implements PendingUpdate> { @@ -115,7 +114,8 @@ public UpdateSnapshotReferencesOperation fastForward(String name, String source) return replaceBranch(name, source, true); } - private UpdateSnapshotReferencesOperation replaceBranch(String name, String source, boolean fastForward) { + private UpdateSnapshotReferencesOperation replaceBranch( + String name, String source, boolean fastForward) { Preconditions.checkNotNull(name, "Target branch cannot be null"); Preconditions.checkNotNull(source, "Source ref cannot be null"); SnapshotRef sourceRef = updatedRefs.get(source); @@ -132,10 +132,11 @@ private UpdateSnapshotReferencesOperation replaceBranch(String name, String sour SnapshotRef updatedRef = SnapshotRef.builderFrom(refToUpdate, sourceRef.snapshotId()).build(); if (fastForward) { - boolean targetIsAncestor = SnapshotUtil.isAncestorOf(sourceRef.snapshotId(), - refToUpdate.snapshotId(), base::snapshot); - Preconditions.checkArgument(targetIsAncestor, - "Cannot fast-forward: %s is not an ancestor of %s", name, source); + boolean targetIsAncestor = + SnapshotUtil.isAncestorOf( + sourceRef.snapshotId(), refToUpdate.snapshotId(), base::snapshot); + Preconditions.checkArgument( + targetIsAncestor, "Cannot fast-forward: %s is not an ancestor of %s", name, source); } updatedRefs.put(name, updatedRef); @@ -152,13 +153,13 @@ public UpdateSnapshotReferencesOperation replaceTag(String name, long snapshotId return this; } - public UpdateSnapshotReferencesOperation setMinSnapshotsToKeep(String name, int minSnapshotsToKeep) { + public UpdateSnapshotReferencesOperation setMinSnapshotsToKeep( + String name, int minSnapshotsToKeep) { Preconditions.checkNotNull(name, "Branch name cannot be null"); SnapshotRef ref = updatedRefs.get(name); Preconditions.checkArgument(ref != null, "Branch does not exist: %s", name); - SnapshotRef updateBranch = SnapshotRef.builderFrom(ref) - .minSnapshotsToKeep(minSnapshotsToKeep) - .build(); + SnapshotRef updateBranch = + SnapshotRef.builderFrom(ref).minSnapshotsToKeep(minSnapshotsToKeep).build(); updatedRefs.put(name, updateBranch); return this; } @@ -167,9 +168,8 @@ public UpdateSnapshotReferencesOperation setMaxSnapshotAgeMs(String name, long m Preconditions.checkNotNull(name, "Branch name cannot be null"); SnapshotRef ref = updatedRefs.get(name); Preconditions.checkArgument(ref != null, "Branch does not exist: %s", name); - SnapshotRef updateBranch = SnapshotRef.builderFrom(ref) - .maxSnapshotAgeMs(maxSnapshotAgeMs) - .build(); + SnapshotRef updateBranch = + SnapshotRef.builderFrom(ref).maxSnapshotAgeMs(maxSnapshotAgeMs).build(); updatedRefs.put(name, updateBranch); return this; } @@ -178,9 +178,7 @@ public UpdateSnapshotReferencesOperation setMaxRefAgeMs(String name, long maxRef Preconditions.checkNotNull(name, "Reference name cannot be null"); SnapshotRef ref = updatedRefs.get(name); Preconditions.checkArgument(ref != null, "Ref does not exist: %s", name); - SnapshotRef updatedRef = SnapshotRef.builderFrom(ref) - .maxRefAgeMs(maxRefAgeMs) - .build(); + SnapshotRef updatedRef = SnapshotRef.builderFrom(ref).maxRefAgeMs(maxRefAgeMs).build(); updatedRefs.put(name, updatedRef); return this; } diff --git a/core/src/main/java/org/apache/iceberg/V1Metadata.java b/core/src/main/java/org/apache/iceberg/V1Metadata.java index d368679609f3..c47d2183805d 100644 --- 
a/core/src/main/java/org/apache/iceberg/V1Metadata.java +++ b/core/src/main/java/org/apache/iceberg/V1Metadata.java @@ -16,9 +16,10 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg; +import static org.apache.iceberg.types.Types.NestedField.required; + import java.nio.ByteBuffer; import java.util.List; import java.util.Map; @@ -26,23 +27,28 @@ import org.apache.iceberg.avro.AvroSchemaUtil; import org.apache.iceberg.types.Types; -import static org.apache.iceberg.types.Types.NestedField.required; - class V1Metadata { - private V1Metadata() { - } - - static final Schema MANIFEST_LIST_SCHEMA = new Schema( - ManifestFile.PATH, ManifestFile.LENGTH, ManifestFile.SPEC_ID, ManifestFile.SNAPSHOT_ID, - ManifestFile.ADDED_FILES_COUNT, ManifestFile.EXISTING_FILES_COUNT, ManifestFile.DELETED_FILES_COUNT, - ManifestFile.PARTITION_SUMMARIES, - ManifestFile.ADDED_ROWS_COUNT, ManifestFile.EXISTING_ROWS_COUNT, ManifestFile.DELETED_ROWS_COUNT); + private V1Metadata() {} + + static final Schema MANIFEST_LIST_SCHEMA = + new Schema( + ManifestFile.PATH, + ManifestFile.LENGTH, + ManifestFile.SPEC_ID, + ManifestFile.SNAPSHOT_ID, + ManifestFile.ADDED_FILES_COUNT, + ManifestFile.EXISTING_FILES_COUNT, + ManifestFile.DELETED_FILES_COUNT, + ManifestFile.PARTITION_SUMMARIES, + ManifestFile.ADDED_ROWS_COUNT, + ManifestFile.EXISTING_ROWS_COUNT, + ManifestFile.DELETED_ROWS_COUNT); /** * A wrapper class to write any ManifestFile implementation to Avro using the v1 schema. * - * This is used to maintain compatibility with v1 by writing manifest list files with the old schema, instead of - * writing a sequence number into metadata files in v1 tables. + *

This is used to maintain compatibility with v1 by writing manifest list files with the old + * schema, instead of writing a sequence number into metadata files in v1 tables. */ static class IndexedManifestFile implements ManifestFile, IndexedRecord { private static final org.apache.avro.Schema AVRO_SCHEMA = @@ -193,11 +199,13 @@ static Schema entrySchema(Types.StructType partitionType) { static Schema wrapFileSchema(Types.StructType fileSchema) { // this is used to build projection schemas return new Schema( - ManifestEntry.STATUS, ManifestEntry.SNAPSHOT_ID, + ManifestEntry.STATUS, + ManifestEntry.SNAPSHOT_ID, required(ManifestEntry.DATA_FILE_ID, "data_file", fileSchema)); } - private static final Types.NestedField BLOCK_SIZE = required(105, "block_size_in_bytes", Types.LongType.get()); + private static final Types.NestedField BLOCK_SIZE = + required(105, "block_size_in_bytes", Types.LongType.get()); static Types.StructType dataFileSchema(Types.StructType partitionType) { return Types.StructType.of( @@ -215,13 +223,10 @@ static Types.StructType dataFileSchema(Types.StructType partitionType) { DataFile.UPPER_BOUNDS, DataFile.KEY_METADATA, DataFile.SPLIT_OFFSETS, - DataFile.SORT_ORDER_ID - ); + DataFile.SORT_ORDER_ID); } - /** - * Wrapper used to write a ManifestEntry to v1 metadata. - */ + /** Wrapper used to write a ManifestEntry to v1 metadata. */ static class IndexedManifestEntry implements ManifestEntry, IndexedRecord { private final org.apache.avro.Schema avroSchema; private final IndexedDataFile fileWrapper; diff --git a/core/src/main/java/org/apache/iceberg/V2Metadata.java b/core/src/main/java/org/apache/iceberg/V2Metadata.java index 40b8624baa3b..8639b7817649 100644 --- a/core/src/main/java/org/apache/iceberg/V2Metadata.java +++ b/core/src/main/java/org/apache/iceberg/V2Metadata.java @@ -16,9 +16,10 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg; +import static org.apache.iceberg.types.Types.NestedField.required; + import java.nio.ByteBuffer; import java.util.List; import java.util.Map; @@ -27,34 +28,31 @@ import org.apache.iceberg.relocated.com.google.common.base.Preconditions; import org.apache.iceberg.types.Types; -import static org.apache.iceberg.types.Types.NestedField.required; - class V2Metadata { - private V2Metadata() { - } - - static final Schema MANIFEST_LIST_SCHEMA = new Schema( - ManifestFile.PATH, - ManifestFile.LENGTH, - ManifestFile.SPEC_ID, - ManifestFile.MANIFEST_CONTENT.asRequired(), - ManifestFile.SEQUENCE_NUMBER.asRequired(), - ManifestFile.MIN_SEQUENCE_NUMBER.asRequired(), - ManifestFile.SNAPSHOT_ID.asRequired(), - ManifestFile.ADDED_FILES_COUNT.asRequired(), - ManifestFile.EXISTING_FILES_COUNT.asRequired(), - ManifestFile.DELETED_FILES_COUNT.asRequired(), - ManifestFile.ADDED_ROWS_COUNT.asRequired(), - ManifestFile.EXISTING_ROWS_COUNT.asRequired(), - ManifestFile.DELETED_ROWS_COUNT.asRequired(), - ManifestFile.PARTITION_SUMMARIES - ); + private V2Metadata() {} + + static final Schema MANIFEST_LIST_SCHEMA = + new Schema( + ManifestFile.PATH, + ManifestFile.LENGTH, + ManifestFile.SPEC_ID, + ManifestFile.MANIFEST_CONTENT.asRequired(), + ManifestFile.SEQUENCE_NUMBER.asRequired(), + ManifestFile.MIN_SEQUENCE_NUMBER.asRequired(), + ManifestFile.SNAPSHOT_ID.asRequired(), + ManifestFile.ADDED_FILES_COUNT.asRequired(), + ManifestFile.EXISTING_FILES_COUNT.asRequired(), + ManifestFile.DELETED_FILES_COUNT.asRequired(), + ManifestFile.ADDED_ROWS_COUNT.asRequired(), + ManifestFile.EXISTING_ROWS_COUNT.asRequired(), + ManifestFile.DELETED_ROWS_COUNT.asRequired(), + ManifestFile.PARTITION_SUMMARIES); /** * A wrapper class to write any ManifestFile implementation to Avro using the v2 write schema. * - * This is used to maintain compatibility with v2 by writing manifest list files with the old schema, instead of - * writing a sequence number into metadata files in v2 tables. + *

This is used to maintain compatibility with v2 by writing manifest list files with the old + * schema, instead of writing a sequence number into metadata files in v2 tables. */ static class IndexedManifestFile implements ManifestFile, IndexedRecord { private static final org.apache.avro.Schema AVRO_SCHEMA = @@ -97,10 +95,13 @@ public Object get(int pos) { return wrapped.content().id(); case 4: if (wrapped.sequenceNumber() == ManifestWriter.UNASSIGNED_SEQ) { - // if the sequence number is being assigned here, then the manifest must be created by the current + // if the sequence number is being assigned here, then the manifest must be created by + // the current // operation. to validate this, check that the snapshot id matches the current commit - Preconditions.checkState(commitSnapshotId == wrapped.snapshotId(), - "Found unassigned sequence number for a manifest from snapshot: %s", wrapped.snapshotId()); + Preconditions.checkState( + commitSnapshotId == wrapped.snapshotId(), + "Found unassigned sequence number for a manifest from snapshot: %s", + wrapped.snapshotId()); return sequenceNumber; } else { return wrapped.sequenceNumber(); @@ -108,10 +109,14 @@ public Object get(int pos) { case 5: if (wrapped.minSequenceNumber() == ManifestWriter.UNASSIGNED_SEQ) { // same sanity check as above - Preconditions.checkState(commitSnapshotId == wrapped.snapshotId(), - "Found unassigned sequence number for a manifest from snapshot: %s", wrapped.snapshotId()); - // if the min sequence number is not determined, then there was no assigned sequence number for any file - // written to the wrapped manifest. replace the unassigned sequence number with the one for this commit + Preconditions.checkState( + commitSnapshotId == wrapped.snapshotId(), + "Found unassigned sequence number for a manifest from snapshot: %s", + wrapped.snapshotId()); + // if the min sequence number is not determined, then there was no assigned sequence + // number for any file + // written to the wrapped manifest. 
replace the unassigned sequence number with the one + // for this commit return sequenceNumber; } else { return wrapped.minSequenceNumber(); @@ -242,7 +247,9 @@ static Schema entrySchema(Types.StructType partitionType) { static Schema wrapFileSchema(Types.StructType fileSchema) { // this is used to build projection schemas return new Schema( - ManifestEntry.STATUS, ManifestEntry.SNAPSHOT_ID, ManifestEntry.SEQUENCE_NUMBER, + ManifestEntry.STATUS, + ManifestEntry.SNAPSHOT_ID, + ManifestEntry.SEQUENCE_NUMBER, required(ManifestEntry.DATA_FILE_ID, "data_file", fileSchema)); } @@ -251,7 +258,8 @@ static Types.StructType fileType(Types.StructType partitionType) { DataFile.CONTENT.asRequired(), DataFile.FILE_PATH, DataFile.FILE_FORMAT, - required(DataFile.PARTITION_ID, DataFile.PARTITION_NAME, partitionType, DataFile.PARTITION_DOC), + required( + DataFile.PARTITION_ID, DataFile.PARTITION_NAME, partitionType, DataFile.PARTITION_DOC), DataFile.RECORD_COUNT, DataFile.FILE_SIZE, DataFile.COLUMN_SIZES, @@ -263,11 +271,11 @@ static Types.StructType fileType(Types.StructType partitionType) { DataFile.KEY_METADATA, DataFile.SPLIT_OFFSETS, DataFile.EQUALITY_IDS, - DataFile.SORT_ORDER_ID - ); + DataFile.SORT_ORDER_ID); } - static class IndexedManifestEntry> implements ManifestEntry, IndexedRecord { + static class IndexedManifestEntry> + implements ManifestEntry, IndexedRecord { private final org.apache.avro.Schema avroSchema; private final Long commitSnapshotId; private final IndexedDataFile fileWrapper; @@ -303,12 +311,15 @@ public Object get(int i) { return wrapped.snapshotId(); case 2: if (wrapped.sequenceNumber() == null) { - // if the entry's sequence number is null, then it will inherit the sequence number of the current commit. - // to validate that this is correct, check that the snapshot id is either null (will also be inherited) or + // if the entry's sequence number is null, then it will inherit the sequence number of + // the current commit. + // to validate that this is correct, check that the snapshot id is either null (will + // also be inherited) or // that it matches the id of the current commit. Preconditions.checkState( wrapped.snapshotId() == null || wrapped.snapshotId().equals(commitSnapshotId), - "Found unassigned sequence number for an entry from snapshot: %s", wrapped.snapshotId()); + "Found unassigned sequence number for an entry from snapshot: %s", + wrapped.snapshotId()); return null; } return wrapped.sequenceNumber(); @@ -360,9 +371,7 @@ public ManifestEntry copyWithoutStats() { } } - /** - * Wrapper used to write DataFile or DeleteFile to v2 metadata. - */ + /** Wrapper used to write DataFile or DeleteFile to v2 metadata. */ static class IndexedDataFile implements ContentFile, IndexedRecord { private final org.apache.avro.Schema avroSchema; private final IndexedStructLike partitionWrapper; diff --git a/core/src/main/java/org/apache/iceberg/actions/BaseAction.java b/core/src/main/java/org/apache/iceberg/actions/BaseAction.java index 5b10eeee0085..80579aa3566e 100644 --- a/core/src/main/java/org/apache/iceberg/actions/BaseAction.java +++ b/core/src/main/java/org/apache/iceberg/actions/BaseAction.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.actions; import org.apache.iceberg.MetadataTableType; diff --git a/core/src/main/java/org/apache/iceberg/actions/BaseDeleteOrphanFilesActionResult.java b/core/src/main/java/org/apache/iceberg/actions/BaseDeleteOrphanFilesActionResult.java index 881e908706ea..4ce5994451e1 100644 --- a/core/src/main/java/org/apache/iceberg/actions/BaseDeleteOrphanFilesActionResult.java +++ b/core/src/main/java/org/apache/iceberg/actions/BaseDeleteOrphanFilesActionResult.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.actions; public class BaseDeleteOrphanFilesActionResult implements DeleteOrphanFiles.Result { diff --git a/core/src/main/java/org/apache/iceberg/actions/BaseDeleteReachableFilesActionResult.java b/core/src/main/java/org/apache/iceberg/actions/BaseDeleteReachableFilesActionResult.java index 61abd43141e4..6adb8258221a 100644 --- a/core/src/main/java/org/apache/iceberg/actions/BaseDeleteReachableFilesActionResult.java +++ b/core/src/main/java/org/apache/iceberg/actions/BaseDeleteReachableFilesActionResult.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.actions; public class BaseDeleteReachableFilesActionResult implements DeleteReachableFiles.Result { @@ -26,10 +25,11 @@ public class BaseDeleteReachableFilesActionResult implements DeleteReachableFile private final long deletedManifestListsCount; private final long deletedOtherFilesCount; - public BaseDeleteReachableFilesActionResult(long deletedDataFilesCount, - long deletedManifestsCount, - long deletedManifestListsCount, - long otherDeletedFilesCount) { + public BaseDeleteReachableFilesActionResult( + long deletedDataFilesCount, + long deletedManifestsCount, + long deletedManifestListsCount, + long otherDeletedFilesCount) { this.deletedDataFilesCount = deletedDataFilesCount; this.deletedManifestsCount = deletedManifestsCount; this.deletedManifestListsCount = deletedManifestListsCount; diff --git a/core/src/main/java/org/apache/iceberg/actions/BaseExpireSnapshotsActionResult.java b/core/src/main/java/org/apache/iceberg/actions/BaseExpireSnapshotsActionResult.java index 32c6be5ae103..93fd8431b34e 100644 --- a/core/src/main/java/org/apache/iceberg/actions/BaseExpireSnapshotsActionResult.java +++ b/core/src/main/java/org/apache/iceberg/actions/BaseExpireSnapshotsActionResult.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.actions; public class BaseExpireSnapshotsActionResult implements ExpireSnapshots.Result { @@ -27,9 +26,8 @@ public class BaseExpireSnapshotsActionResult implements ExpireSnapshots.Result { private final long deletedManifestsCount; private final long deletedManifestListsCount; - public BaseExpireSnapshotsActionResult(long deletedDataFilesCount, - long deletedManifestsCount, - long deletedManifestListsCount) { + public BaseExpireSnapshotsActionResult( + long deletedDataFilesCount, long deletedManifestsCount, long deletedManifestListsCount) { this.deletedDataFilesCount = deletedDataFilesCount; this.deletedPosDeleteFilesCount = 0; this.deletedEqDeleteFilesCount = 0; @@ -37,11 +35,12 @@ public BaseExpireSnapshotsActionResult(long deletedDataFilesCount, this.deletedManifestListsCount = deletedManifestListsCount; } - public BaseExpireSnapshotsActionResult(long deletedDataFilesCount, - long deletedPosDeleteFilesCount, - long deletedEqDeleteFilesCount, - long deletedManifestsCount, - long deletedManifestListsCount) { + public BaseExpireSnapshotsActionResult( + long deletedDataFilesCount, + long deletedPosDeleteFilesCount, + long deletedEqDeleteFilesCount, + long deletedManifestsCount, + long deletedManifestListsCount) { this.deletedDataFilesCount = deletedDataFilesCount; this.deletedPosDeleteFilesCount = deletedPosDeleteFilesCount; this.deletedEqDeleteFilesCount = deletedEqDeleteFilesCount; diff --git a/core/src/main/java/org/apache/iceberg/actions/BaseFileGroupRewriteResult.java b/core/src/main/java/org/apache/iceberg/actions/BaseFileGroupRewriteResult.java index d61162096832..fd44f7f6a333 100644 --- a/core/src/main/java/org/apache/iceberg/actions/BaseFileGroupRewriteResult.java +++ b/core/src/main/java/org/apache/iceberg/actions/BaseFileGroupRewriteResult.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.actions; import org.apache.iceberg.actions.RewriteDataFiles.FileGroupInfo; @@ -27,7 +26,8 @@ public class BaseFileGroupRewriteResult implements FileGroupRewriteResult { private final int rewrittenDataFilesCount; private final FileGroupInfo info; - public BaseFileGroupRewriteResult(FileGroupInfo info, int addedFilesCount, int rewrittenFilesCount) { + public BaseFileGroupRewriteResult( + FileGroupInfo info, int addedFilesCount, int rewrittenFilesCount) { this.info = info; this.addedDataFilesCount = addedFilesCount; this.rewrittenDataFilesCount = rewrittenFilesCount; diff --git a/core/src/main/java/org/apache/iceberg/actions/BaseMigrateTableActionResult.java b/core/src/main/java/org/apache/iceberg/actions/BaseMigrateTableActionResult.java index 4d7397bb150f..b4c1b7f3de0b 100644 --- a/core/src/main/java/org/apache/iceberg/actions/BaseMigrateTableActionResult.java +++ b/core/src/main/java/org/apache/iceberg/actions/BaseMigrateTableActionResult.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.actions; public class BaseMigrateTableActionResult implements MigrateTable.Result { diff --git a/core/src/main/java/org/apache/iceberg/actions/BaseRewriteDataFilesAction.java b/core/src/main/java/org/apache/iceberg/actions/BaseRewriteDataFilesAction.java index 1cdee7224650..bd922082eaae 100644 --- a/core/src/main/java/org/apache/iceberg/actions/BaseRewriteDataFilesAction.java +++ b/core/src/main/java/org/apache/iceberg/actions/BaseRewriteDataFilesAction.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.actions; import java.io.IOException; @@ -75,24 +74,26 @@ protected BaseRewriteDataFilesAction(Table table) { this.filter = Expressions.alwaysTrue(); this.caseSensitive = false; - long splitSize = PropertyUtil.propertyAsLong( - table.properties(), - TableProperties.SPLIT_SIZE, - TableProperties.SPLIT_SIZE_DEFAULT); - long targetFileSize = PropertyUtil.propertyAsLong( - table.properties(), - TableProperties.WRITE_TARGET_FILE_SIZE_BYTES, - TableProperties.WRITE_TARGET_FILE_SIZE_BYTES_DEFAULT); + long splitSize = + PropertyUtil.propertyAsLong( + table.properties(), TableProperties.SPLIT_SIZE, TableProperties.SPLIT_SIZE_DEFAULT); + long targetFileSize = + PropertyUtil.propertyAsLong( + table.properties(), + TableProperties.WRITE_TARGET_FILE_SIZE_BYTES, + TableProperties.WRITE_TARGET_FILE_SIZE_BYTES_DEFAULT); this.targetSizeInBytes = Math.min(splitSize, targetFileSize); - this.splitLookback = PropertyUtil.propertyAsInt( - table.properties(), - TableProperties.SPLIT_LOOKBACK, - TableProperties.SPLIT_LOOKBACK_DEFAULT); - this.splitOpenFileCost = PropertyUtil.propertyAsLong( - table.properties(), - TableProperties.SPLIT_OPEN_FILE_COST, - TableProperties.SPLIT_OPEN_FILE_COST_DEFAULT); + this.splitLookback = + PropertyUtil.propertyAsInt( + table.properties(), + TableProperties.SPLIT_LOOKBACK, + TableProperties.SPLIT_LOOKBACK_DEFAULT); + this.splitOpenFileCost = + PropertyUtil.propertyAsLong( + table.properties(), + TableProperties.SPLIT_OPEN_FILE_COST, + TableProperties.SPLIT_OPEN_FILE_COST_DEFAULT); this.fileIO = fileIO(); this.encryptionManager = table.encryption(); @@ -145,21 +146,22 @@ public BaseRewriteDataFilesAction outputSpecId(int specId) { * @return this for method chaining */ public BaseRewriteDataFilesAction targetSizeInBytes(long targetSize) { - Preconditions.checkArgument(targetSize > 0L, "Invalid target rewrite data file size in bytes %s", - targetSize); + Preconditions.checkArgument( + targetSize > 0L, "Invalid target rewrite data file size in bytes %s", targetSize); this.targetSizeInBytes = targetSize; return this; } /** - * Specify the number of "bins" considered when trying to pack the next file split into a task. Increasing this - * usually makes tasks a bit more even by considering more ways to pack file regions into a single task with extra - * planning cost. - *

- * This configuration can reorder the incoming file regions, to preserve order for lower/upper bounds in file - * metadata, user can use a lookback of 1. + * Specify the number of "bins" considered when trying to pack the next file split into a task. + * Increasing this usually makes tasks a bit more even by considering more ways to pack file + * regions into a single task with extra planning cost. + * + *

This configuration can reorder the incoming file regions. To preserve the order of lower/upper + * bounds in file metadata, use a lookback of 1. * - * @param lookback number of "bins" considered when trying to pack the next file split into a task. + * @param lookback number of "bins" considered when trying to pack the next file split into a + * task. + * @return this for method chaining */ public BaseRewriteDataFilesAction splitLookback(int lookback) { @@ -169,11 +171,11 @@ public BaseRewriteDataFilesAction splitLookback(int lookback) { } /** - * Specify the minimum file size to count to pack into one "bin". If the read file size is smaller than this specified - * threshold, Iceberg will use this value to do count. - *

- * this configuration controls the number of files to compact for each task, small value would lead to a high - * compaction, the default value is 4MB. + * Specify the minimum file size to count to pack into one "bin". If the read file size is smaller + * than this specified threshold, Iceberg will use this value when counting. + * + *

this configuration controls the number of files to compact for each task, small value would + * lead to a high compaction, the default value is 4MB. * * @param openFileCost minimum file size to count to pack into one "bin". * @return this for method chaining @@ -185,8 +187,8 @@ public BaseRewriteDataFilesAction splitOpenFileCost(long openFileCost) { } /** - * Pass a row Expression to filter DataFiles to be rewritten. Note that all files that may contain data matching the - * filter may be rewritten. + * Pass a row Expression to filter DataFiles to be rewritten. Note that all files that may contain + * data matching the filter may be rewritten. * * @param expr Expression to filter out DataFiles * @return this for method chaining @@ -205,12 +207,14 @@ public RewriteDataFilesActionResult execute() { long startingSnapshotId = table.currentSnapshot().snapshotId(); try { - fileScanTasks = table.newScan() - .useSnapshot(startingSnapshotId) - .caseSensitive(caseSensitive) - .ignoreResiduals() - .filter(filter) - .planFiles(); + fileScanTasks = + table + .newScan() + .useSnapshot(startingSnapshotId) + .caseSensitive(caseSensitive) + .ignoreResiduals() + .filter(filter) + .planFiles(); } finally { try { if (fileScanTasks != null) { @@ -221,58 +225,67 @@ public RewriteDataFilesActionResult execute() { } } - Map> groupedTasks = groupTasksByPartition(fileScanTasks.iterator()); - Map> filteredGroupedTasks = groupedTasks.entrySet().stream() - .filter(kv -> kv.getValue().size() > 1) - .collect(Collectors.toMap(Map.Entry::getKey, Map.Entry::getValue)); + Map> groupedTasks = + groupTasksByPartition(fileScanTasks.iterator()); + Map> filteredGroupedTasks = + groupedTasks.entrySet().stream() + .filter(kv -> kv.getValue().size() > 1) + .collect(Collectors.toMap(Map.Entry::getKey, Map.Entry::getValue)); // Nothing to rewrite if there's only one DataFile in each partition. 
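Since execute() below simply combines the knobs documented in this class, a usage sketch may help. rewriteActionFor(table) is a hypothetical stand-in for whichever engine-specific factory returns a BaseRewriteDataFilesAction subclass; the filter, size, lookback, and open-file-cost values are illustrative only, and every chained method appears in this file.

// Sketch: configuring a BaseRewriteDataFilesAction subclass before running it.
RewriteDataFilesActionResult result =
    rewriteActionFor(table) // hypothetical engine-specific factory
        .filter(Expressions.equal("event_date", "2022-04-01")) // rewrite only files that may match
        .targetSizeInBytes(512L * 1024 * 1024) // aim for roughly 512 MB output files
        .splitLookback(1) // a lookback of 1 preserves lower/upper bound order
        .splitOpenFileCost(4L * 1024 * 1024) // count files smaller than 4 MB as 4 MB
        .execute();

The result pairs the rewritten data files with the newly added ones, mirroring the RewriteDataFilesActionResult(currentDataFiles, addedDataFiles) constructor call at the end of execute().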
if (filteredGroupedTasks.isEmpty()) { return RewriteDataFilesActionResult.empty(); } // Split and combine tasks under each partition - List combinedScanTasks = filteredGroupedTasks.values().stream() - .map(scanTasks -> { - CloseableIterable splitTasks = TableScanUtil.splitFiles( - CloseableIterable.withNoopClose(scanTasks), targetSizeInBytes); - return TableScanUtil.planTasks(splitTasks, targetSizeInBytes, splitLookback, splitOpenFileCost); - }) - .flatMap(Streams::stream) - .filter(task -> task.files().size() > 1 || isPartialFileScan(task)) - .collect(Collectors.toList()); + List combinedScanTasks = + filteredGroupedTasks.values().stream() + .map( + scanTasks -> { + CloseableIterable splitTasks = + TableScanUtil.splitFiles( + CloseableIterable.withNoopClose(scanTasks), targetSizeInBytes); + return TableScanUtil.planTasks( + splitTasks, targetSizeInBytes, splitLookback, splitOpenFileCost); + }) + .flatMap(Streams::stream) + .filter(task -> task.files().size() > 1 || isPartialFileScan(task)) + .collect(Collectors.toList()); if (combinedScanTasks.isEmpty()) { return RewriteDataFilesActionResult.empty(); } List addedDataFiles = rewriteDataForTasks(combinedScanTasks); - List currentDataFiles = combinedScanTasks.stream() - .flatMap(tasks -> tasks.files().stream().map(FileScanTask::file)) - .collect(Collectors.toList()); + List currentDataFiles = + combinedScanTasks.stream() + .flatMap(tasks -> tasks.files().stream().map(FileScanTask::file)) + .collect(Collectors.toList()); replaceDataFiles(currentDataFiles, addedDataFiles, startingSnapshotId); return new RewriteDataFilesActionResult(currentDataFiles, addedDataFiles); } - private Map> groupTasksByPartition( CloseableIterator tasksIter) { - ListMultimap tasksGroupedByPartition = Multimaps.newListMultimap( - Maps.newHashMap(), Lists::newArrayList); + ListMultimap tasksGroupedByPartition = + Multimaps.newListMultimap(Maps.newHashMap(), Lists::newArrayList); StructLikeWrapper partitionWrapper = StructLikeWrapper.forType(spec.partitionType()); try (CloseableIterator iterator = tasksIter) { - iterator.forEachRemaining(task -> { - StructLikeWrapper structLike = partitionWrapper.copyFor(task.file().partition()); - tasksGroupedByPartition.put(structLike, task); - }); + iterator.forEachRemaining( + task -> { + StructLikeWrapper structLike = partitionWrapper.copyFor(task.file().partition()); + tasksGroupedByPartition.put(structLike, task); + }); } catch (IOException e) { LOG.warn("Failed to close task iterator", e); } return tasksGroupedByPartition.asMap(); } - private void replaceDataFiles(Iterable deletedDataFiles, Iterable addedDataFiles, - long startingSnapshotId) { + private void replaceDataFiles( + Iterable deletedDataFiles, + Iterable addedDataFiles, + long startingSnapshotId) { try { doReplace(deletedDataFiles, addedDataFiles, startingSnapshotId); } catch (CommitStateUnknownException e) { @@ -290,11 +303,15 @@ private void replaceDataFiles(Iterable deletedDataFiles, Iterable deletedDataFiles, Iterable addedDataFiles, + void doReplace( + Iterable deletedDataFiles, + Iterable addedDataFiles, long startingSnapshotId) { - RewriteFiles rewriteFiles = table.newRewrite() - .validateFromSnapshot(startingSnapshotId) - .rewriteFiles(Sets.newHashSet(deletedDataFiles), Sets.newHashSet(addedDataFiles)); + RewriteFiles rewriteFiles = + table + .newRewrite() + .validateFromSnapshot(startingSnapshotId) + .rewriteFiles(Sets.newHashSet(deletedDataFiles), Sets.newHashSet(addedDataFiles)); commit(rewriteFiles); } diff --git 
a/core/src/main/java/org/apache/iceberg/actions/BaseRewriteDataFilesFileGroupInfo.java b/core/src/main/java/org/apache/iceberg/actions/BaseRewriteDataFilesFileGroupInfo.java index 480799b80ba2..1320fdf63a36 100644 --- a/core/src/main/java/org/apache/iceberg/actions/BaseRewriteDataFilesFileGroupInfo.java +++ b/core/src/main/java/org/apache/iceberg/actions/BaseRewriteDataFilesFileGroupInfo.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.actions; import org.apache.iceberg.StructLike; @@ -27,7 +26,8 @@ public class BaseRewriteDataFilesFileGroupInfo implements RewriteDataFiles.FileG private final int partitionIndex; private final StructLike partition; - public BaseRewriteDataFilesFileGroupInfo(int globalIndex, int partitionIndex, StructLike partition) { + public BaseRewriteDataFilesFileGroupInfo( + int globalIndex, int partitionIndex, StructLike partition) { this.globalIndex = globalIndex; this.partitionIndex = partitionIndex; this.partition = partition; diff --git a/core/src/main/java/org/apache/iceberg/actions/BaseRewriteDataFilesResult.java b/core/src/main/java/org/apache/iceberg/actions/BaseRewriteDataFilesResult.java index af9236fb16a3..9a0aba8af8b6 100644 --- a/core/src/main/java/org/apache/iceberg/actions/BaseRewriteDataFilesResult.java +++ b/core/src/main/java/org/apache/iceberg/actions/BaseRewriteDataFilesResult.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.actions; import java.util.List; diff --git a/core/src/main/java/org/apache/iceberg/actions/BaseRewriteManifestsActionResult.java b/core/src/main/java/org/apache/iceberg/actions/BaseRewriteManifestsActionResult.java index 8654b0af9fee..b1ff3708341e 100644 --- a/core/src/main/java/org/apache/iceberg/actions/BaseRewriteManifestsActionResult.java +++ b/core/src/main/java/org/apache/iceberg/actions/BaseRewriteManifestsActionResult.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.actions; import org.apache.iceberg.ManifestFile; @@ -27,8 +26,8 @@ public class BaseRewriteManifestsActionResult implements RewriteManifests.Result private final Iterable rewrittenManifests; private final Iterable addedManifests; - public BaseRewriteManifestsActionResult(Iterable rewrittenManifests, - Iterable addedManifests) { + public BaseRewriteManifestsActionResult( + Iterable rewrittenManifests, Iterable addedManifests) { this.rewrittenManifests = rewrittenManifests; this.addedManifests = addedManifests; } diff --git a/core/src/main/java/org/apache/iceberg/actions/BaseSnapshotTableActionResult.java b/core/src/main/java/org/apache/iceberg/actions/BaseSnapshotTableActionResult.java index 3ea24d374716..1661bc10bc9f 100644 --- a/core/src/main/java/org/apache/iceberg/actions/BaseSnapshotTableActionResult.java +++ b/core/src/main/java/org/apache/iceberg/actions/BaseSnapshotTableActionResult.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.actions; public class BaseSnapshotTableActionResult implements SnapshotTable.Result { diff --git a/core/src/main/java/org/apache/iceberg/actions/BaseSnapshotUpdateAction.java b/core/src/main/java/org/apache/iceberg/actions/BaseSnapshotUpdateAction.java index bdcbcbc126f2..b9697491f83f 100644 --- a/core/src/main/java/org/apache/iceberg/actions/BaseSnapshotUpdateAction.java +++ b/core/src/main/java/org/apache/iceberg/actions/BaseSnapshotUpdateAction.java @@ -16,15 +16,14 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.actions; import java.util.Map; import org.apache.iceberg.SnapshotUpdate; import org.apache.iceberg.relocated.com.google.common.collect.Maps; -abstract class BaseSnapshotUpdateAction - extends BaseAction implements SnapshotUpdateAction { +abstract class BaseSnapshotUpdateAction extends BaseAction + implements SnapshotUpdateAction { private final Map summary = Maps.newHashMap(); diff --git a/core/src/main/java/org/apache/iceberg/actions/BinPackStrategy.java b/core/src/main/java/org/apache/iceberg/actions/BinPackStrategy.java index 1b7da85d05c0..785f5c3ea3f8 100644 --- a/core/src/main/java/org/apache/iceberg/actions/BinPackStrategy.java +++ b/core/src/main/java/org/apache/iceberg/actions/BinPackStrategy.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.actions; import java.math.RoundingMode; @@ -37,14 +36,14 @@ import org.slf4j.LoggerFactory; /** - * A rewrite strategy for data files which determines which files to rewrite - * based on their size. If files are either smaller than the {@link #MIN_FILE_SIZE_BYTES} threshold or - * larger than the {@link #MAX_FILE_SIZE_BYTES} threshold, they are considered targets for being rewritten. - *

- * Once selected files are grouped based on a {@link BinPacking} into groups defined - * by {@link RewriteDataFiles#MAX_FILE_GROUP_SIZE_BYTES}. Groups will be considered for rewriting if they contain - * more files than {@link #MIN_INPUT_FILES} or would produce at least one file of - * {@link RewriteDataFiles#TARGET_FILE_SIZE_BYTES}. + * A rewrite strategy for data files which determines which files to rewrite based on their size. If + * files are either smaller than the {@link #MIN_FILE_SIZE_BYTES} threshold or larger than the + * {@link #MAX_FILE_SIZE_BYTES} threshold, they are considered targets for being rewritten. + * + *

Once selected files are grouped based on a {@link BinPacking} into groups defined by {@link + * RewriteDataFiles#MAX_FILE_GROUP_SIZE_BYTES}. Groups will be considered for rewriting if they + * contain more files than {@link #MIN_INPUT_FILES} or would produce at least one file of {@link + * RewriteDataFiles#TARGET_FILE_SIZE_BYTES}. */ public abstract class BinPackStrategy implements RewriteStrategy { @@ -52,43 +51,48 @@ public abstract class BinPackStrategy implements RewriteStrategy { /** * The minimum number of files that need to be in a file group for it to be considered for - * compaction if the total size of that group is less than the {@link RewriteDataFiles#TARGET_FILE_SIZE_BYTES}. - * This can also be thought of as the maximum number of non-target-size files that could remain in a file - * group (partition) after rewriting. + * compaction if the total size of that group is less than the {@link + * RewriteDataFiles#TARGET_FILE_SIZE_BYTES}. This can also be thought of as the maximum number of + * non-target-size files that could remain in a file group (partition) after rewriting. */ public static final String MIN_INPUT_FILES = "min-input-files"; + public static final int MIN_INPUT_FILES_DEFAULT = 5; /** - * Adjusts files which will be considered for rewriting. Files smaller than - * {@link #MIN_FILE_SIZE_BYTES} will be considered for rewriting. This functions independently - * of {@link #MAX_FILE_SIZE_BYTES}. - *

- * Defaults to 75% of the target file size + * Adjusts files which will be considered for rewriting. Files smaller than {@link + * #MIN_FILE_SIZE_BYTES} will be considered for rewriting. This functions independently of {@link + * #MAX_FILE_SIZE_BYTES}. + * + *

Defaults to 75% of the target file size */ public static final String MIN_FILE_SIZE_BYTES = "min-file-size-bytes"; + public static final double MIN_FILE_SIZE_DEFAULT_RATIO = 0.75d; /** - * Adjusts files which will be considered for rewriting. Files larger than - * {@link #MAX_FILE_SIZE_BYTES} will be considered for rewriting. This functions independently - * of {@link #MIN_FILE_SIZE_BYTES}. - *

- * Defaults to 180% of the target file size + * Adjusts files which will be considered for rewriting. Files larger than {@link + * #MAX_FILE_SIZE_BYTES} will be considered for rewriting. This functions independently of {@link + * #MIN_FILE_SIZE_BYTES}. + * + *

Defaults to 180% of the target file size */ public static final String MAX_FILE_SIZE_BYTES = "max-file-size-bytes"; + public static final double MAX_FILE_SIZE_DEFAULT_RATIO = 1.80d; /** - * The minimum number of deletes that needs to be associated with a data file for it to be considered for rewriting. - * If a data file has this number of deletes or more, it will be rewritten regardless of its file size determined - * by {@link #MIN_FILE_SIZE_BYTES} and {@link #MAX_FILE_SIZE_BYTES}. - * If a file group contains a file that satisfies this condition, the file group will be rewritten regardless of - * the number of files in the file group determined by {@link #MIN_INPUT_FILES} - *

- * Defaults to Integer.MAX_VALUE, which means this feature is not enabled by default. + * The minimum number of deletes that needs to be associated with a data file for it to be + * considered for rewriting. If a data file has this number of deletes or more, it will be + * rewritten regardless of its file size determined by {@link #MIN_FILE_SIZE_BYTES} and {@link + * #MAX_FILE_SIZE_BYTES}. If a file group contains a file that satisfies this condition, the file + * group will be rewritten regardless of the number of files in the file group determined by + * {@link #MIN_INPUT_FILES} + * + *

Defaults to Integer.MAX_VALUE, which means this feature is not enabled by default. */ public static final String DELETE_FILE_THRESHOLD = "delete-file-threshold"; + public static final int DELETE_FILE_THRESHOLD_DEFAULT = Integer.MAX_VALUE; static final long SPLIT_OVERHEAD = 1024 * 5; @@ -98,6 +102,7 @@ public abstract class BinPackStrategy implements RewriteStrategy { * files; */ public static final String REWRITE_ALL = "rewrite-all"; + public static final boolean REWRITE_ALL_DEFAULT = false; private int minInputFiles; @@ -120,42 +125,40 @@ public Set validOptions() { DELETE_FILE_THRESHOLD, MIN_FILE_SIZE_BYTES, MAX_FILE_SIZE_BYTES, - REWRITE_ALL - ); + REWRITE_ALL); } @Override public RewriteStrategy options(Map options) { - targetFileSize = PropertyUtil.propertyAsLong(options, - RewriteDataFiles.TARGET_FILE_SIZE_BYTES, + targetFileSize = PropertyUtil.propertyAsLong( - table().properties(), - TableProperties.WRITE_TARGET_FILE_SIZE_BYTES, - TableProperties.WRITE_TARGET_FILE_SIZE_BYTES_DEFAULT)); - - minFileSize = PropertyUtil.propertyAsLong(options, - MIN_FILE_SIZE_BYTES, - (long) (targetFileSize * MIN_FILE_SIZE_DEFAULT_RATIO)); + options, + RewriteDataFiles.TARGET_FILE_SIZE_BYTES, + PropertyUtil.propertyAsLong( + table().properties(), + TableProperties.WRITE_TARGET_FILE_SIZE_BYTES, + TableProperties.WRITE_TARGET_FILE_SIZE_BYTES_DEFAULT)); + + minFileSize = + PropertyUtil.propertyAsLong( + options, MIN_FILE_SIZE_BYTES, (long) (targetFileSize * MIN_FILE_SIZE_DEFAULT_RATIO)); - maxFileSize = PropertyUtil.propertyAsLong(options, - MAX_FILE_SIZE_BYTES, - (long) (targetFileSize * MAX_FILE_SIZE_DEFAULT_RATIO)); + maxFileSize = + PropertyUtil.propertyAsLong( + options, MAX_FILE_SIZE_BYTES, (long) (targetFileSize * MAX_FILE_SIZE_DEFAULT_RATIO)); - maxGroupSize = PropertyUtil.propertyAsLong(options, - RewriteDataFiles.MAX_FILE_GROUP_SIZE_BYTES, - RewriteDataFiles.MAX_FILE_GROUP_SIZE_BYTES_DEFAULT); + maxGroupSize = + PropertyUtil.propertyAsLong( + options, + RewriteDataFiles.MAX_FILE_GROUP_SIZE_BYTES, + RewriteDataFiles.MAX_FILE_GROUP_SIZE_BYTES_DEFAULT); - minInputFiles = PropertyUtil.propertyAsInt(options, - MIN_INPUT_FILES, - MIN_INPUT_FILES_DEFAULT); + minInputFiles = PropertyUtil.propertyAsInt(options, MIN_INPUT_FILES, MIN_INPUT_FILES_DEFAULT); - deleteFileThreshold = PropertyUtil.propertyAsInt(options, - DELETE_FILE_THRESHOLD, - DELETE_FILE_THRESHOLD_DEFAULT); + deleteFileThreshold = + PropertyUtil.propertyAsInt(options, DELETE_FILE_THRESHOLD, DELETE_FILE_THRESHOLD_DEFAULT); - rewriteAll = PropertyUtil.propertyAsBoolean(options, - REWRITE_ALL, - REWRITE_ALL_DEFAULT); + rewriteAll = PropertyUtil.propertyAsBoolean(options, REWRITE_ALL, REWRITE_ALL_DEFAULT); validateOptions(); return this; @@ -168,8 +171,11 @@ public Iterable selectFilesToRewrite(Iterable dataFi return dataFiles; } else { return FluentIterable.from(dataFiles) - .filter(scanTask -> scanTask.length() < minFileSize || scanTask.length() > maxFileSize || - taskHasTooManyDeletes(scanTask)); + .filter( + scanTask -> + scanTask.length() < minFileSize + || scanTask.length() > maxFileSize + || taskHasTooManyDeletes(scanTask)); } } @@ -180,12 +186,14 @@ public Iterable> planFileGroups(Iterable dataFi if (rewriteAll) { return potentialGroups; } else { - return potentialGroups.stream().filter(group -> - (group.size() >= minInputFiles && group.size() > 1) || - (sizeOfInputFiles(group) > targetFileSize && group.size() > 1) || - sizeOfInputFiles(group) > maxFileSize || - group.stream().anyMatch(this::taskHasTooManyDeletes) - 
).collect(Collectors.toList()); + return potentialGroups.stream() + .filter( + group -> + (group.size() >= minInputFiles && group.size() > 1) + || (sizeOfInputFiles(group) > targetFileSize && group.size() > 1) + || sizeOfInputFiles(group) > maxFileSize + || group.stream().anyMatch(this::taskHasTooManyDeletes)) + .collect(Collectors.toList()); } } @@ -194,15 +202,16 @@ protected long targetFileSize() { } /** - * Determine how many output files to create when rewriting. We use this to determine the split-size - * we want to use when actually writing files to avoid the following situation. - *
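For reference, a minimal standalone sketch of how the default bin-pack thresholds above interact; the class and method names are hypothetical, and only the 0.75/1.80 default ratios, the size predicate from selectFilesToRewrite, and the delete-count test come from BinPackStrategy itself.

// Hypothetical illustration of BinPackStrategy's default selection thresholds (not part of this patch).
// With a 512 MB target, the defaults mark files outside [384 MB, 921.6 MB] for rewriting.
class BinPackSelectionSketch {
  static boolean wouldRewrite(
      long fileLength, long targetFileSize, int deleteCount, int deleteFileThreshold) {
    long minFileSize = (long) (targetFileSize * 0.75d); // MIN_FILE_SIZE_DEFAULT_RATIO
    long maxFileSize = (long) (targetFileSize * 1.80d); // MAX_FILE_SIZE_DEFAULT_RATIO
    // Mirrors selectFilesToRewrite: too small, too large, or carrying too many deletes.
    return fileLength < minFileSize || fileLength > maxFileSize || deleteCount >= deleteFileThreshold;
  }

  public static void main(String[] args) {
    long target = 512L * 1024 * 1024;
    System.out.println(wouldRewrite(100L * 1024 * 1024, target, 0, Integer.MAX_VALUE)); // true, below 75% of target
    System.out.println(wouldRewrite(600L * 1024 * 1024, target, 0, Integer.MAX_VALUE)); // false, within [min, max]
  }
}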

- * If we are writing 10.1 G of data with a target file size of 1G we would end up with - * 11 files, one of which would only have 0.1g. This would most likely be less preferable to - * 10 files each of which was 1.01g. So here we decide whether to round up or round down - * based on what the estimated average file size will be if we ignore the remainder (0.1g). If - * the new file size is less than 10% greater than the target file size then we will round down - * when determining the number of output files. + * Determine how many output files to create when rewriting. We use this to determine the + * split-size we want to use when actually writing files to avoid the following situation. + * + *

If we are writing 10.1 G of data with a target file size of 1G we would end up with 11 + * files, one of which would only have 0.1g. This would most likely be less preferable to 10 files + * each of which was 1.01g. So here we decide whether to round up or round down based on what the + * estimated average file size will be if we ignore the remainder (0.1g). If the new file size is + * less than 10% greater than the target file size then we will round down when determining the + * number of output files. + * * @param totalSizeInBytes total data size for a file group * @return the number of files this strategy should create */ @@ -211,13 +220,15 @@ protected long numOutputFiles(long totalSizeInBytes) { return 1; } - long fileCountWithRemainder = LongMath.divide(totalSizeInBytes, targetFileSize, RoundingMode.CEILING); + long fileCountWithRemainder = + LongMath.divide(totalSizeInBytes, targetFileSize, RoundingMode.CEILING); if (LongMath.mod(totalSizeInBytes, targetFileSize) > minFileSize) { // Our Remainder file is of valid size for this compaction so keep it return fileCountWithRemainder; } - long fileCountWithoutRemainder = LongMath.divide(totalSizeInBytes, targetFileSize, RoundingMode.FLOOR); + long fileCountWithoutRemainder = + LongMath.divide(totalSizeInBytes, targetFileSize, RoundingMode.FLOOR); long avgFileSizeWithoutRemainder = totalSizeInBytes / fileCountWithoutRemainder; if (avgFileSizeWithoutRemainder < Math.min(1.1 * targetFileSize, writeMaxFileSize())) { // Round down and distribute remainder amongst other files @@ -229,12 +240,13 @@ protected long numOutputFiles(long totalSizeInBytes) { } /** - * Returns the smallest of our max write file threshold, and our estimated split size based on - * the number of output files we want to generate. Add a overhead onto the estimated splitSize to try to avoid - * small errors in size creating brand-new files. + * Returns the smallest of our max write file threshold, and our estimated split size based on the + * number of output files we want to generate. Add a overhead onto the estimated splitSize to try + * to avoid small errors in size creating brand-new files. */ protected long splitSize(long totalSizeInBytes) { - long estimatedSplitSize = (totalSizeInBytes / numOutputFiles(totalSizeInBytes)) + SPLIT_OVERHEAD; + long estimatedSplitSize = + (totalSizeInBytes / numOutputFiles(totalSizeInBytes)) + SPLIT_OVERHEAD; return Math.min(estimatedSplitSize, writeMaxFileSize()); } @@ -244,17 +256,19 @@ protected long inputFileSize(List fileToRewrite) { /** * Estimates a larger max target file size than our target size used in task creation to avoid - * tasks which are predicted to have a certain size, but exceed that target size when serde is complete creating - * tiny remainder files. - *
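As a worked illustration of the rounding rule in numOutputFiles above, a minimal sketch using plain arithmetic in place of LongMath; the class name and the 1.4x write cap passed in main are hypothetical stand-ins, while the 10.1 GB / 1 GB example and the 10% tolerance come from the javadoc and method body in this hunk.

// Hypothetical sketch of the numOutputFiles rounding decision (plain arithmetic instead of LongMath).
class OutputFileCountSketch {
  static long numOutputFiles(long totalSizeInBytes, long targetFileSize, long minFileSize, long maxWriteSize) {
    if (totalSizeInBytes < targetFileSize) {
      return 1;
    }
    long withRemainder = (totalSizeInBytes + targetFileSize - 1) / targetFileSize; // ceiling division
    if (totalSizeInBytes % targetFileSize > minFileSize) {
      return withRemainder; // the remainder file is already large enough to keep
    }
    long withoutRemainder = totalSizeInBytes / targetFileSize; // floor division
    long avgWithoutRemainder = totalSizeInBytes / withoutRemainder;
    // Round down only if spreading the remainder keeps the average within ~10% of the target.
    return avgWithoutRemainder < Math.min((long) (1.1 * targetFileSize), maxWriteSize)
        ? withoutRemainder
        : withRemainder;
  }

  public static void main(String[] args) {
    long gb = 1024L * 1024 * 1024;
    // 10.1 GB with a 1 GB target: the 0.1 GB remainder is below the 0.75 GB default minimum,
    // and 10.1 GB spread over 10 files (~1.01 GB each) stays under the 10% tolerance, so this prints 10.
    System.out.println(numOutputFiles((long) (10.1 * gb), gb, (long) (0.75 * gb), (long) (1.4 * gb)));
  }
}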

- * While we create tasks that should all be smaller than our target size there is a chance that the actual - * data will end up being larger than our target size due to various factors of compression, serialization and - * other factors outside our control. If this occurs, instead of making a single file that is close in size to - * our target we would end up producing one file of the target size, and then a small extra file with the remaining - * data. For example, if our target is 512 MB we may generate a rewrite task that should be 500 MB. When we write - * the data we may find we actually have to write out 530 MB. If we use the target size while writing we would - * produced a 512 MB file and a 18 MB file. If instead we use a larger size estimated by this method, - * then we end up writing a single file. + * tasks which are predicted to have a certain size, but exceed that target size when serde is + * complete creating tiny remainder files. + * + *

While we create tasks that should all be smaller than our target size there is a chance that + * the actual data will end up being larger than our target size due to various factors of + * compression, serialization and other factors outside our control. If this occurs, instead of + * making a single file that is close in size to our target we would end up producing one file of + * the target size, and then a small extra file with the remaining data. For example, if our + * target is 512 MB we may generate a rewrite task that should be 500 MB. When we write the data + * we may find we actually have to write out 530 MB. If we use the target size while writing we + * would produced a 512 MB file and a 18 MB file. If instead we use a larger size estimated by + * this method, then we end up writing a single file. + * * @return the target size plus one half of the distance between max and target */ protected long writeMaxFileSize() { @@ -270,28 +284,46 @@ private boolean taskHasTooManyDeletes(FileScanTask task) { } private void validateOptions() { - Preconditions.checkArgument(minFileSize >= 0, + Preconditions.checkArgument( + minFileSize >= 0, "Cannot set %s to a negative number, %s < 0", - MIN_FILE_SIZE_BYTES, minFileSize); + MIN_FILE_SIZE_BYTES, + minFileSize); - Preconditions.checkArgument(maxFileSize > minFileSize, + Preconditions.checkArgument( + maxFileSize > minFileSize, "Cannot set %s greater than or equal to %s, %s >= %s", - MIN_FILE_SIZE_BYTES, MAX_FILE_SIZE_BYTES, minFileSize, maxFileSize); + MIN_FILE_SIZE_BYTES, + MAX_FILE_SIZE_BYTES, + minFileSize, + maxFileSize); - Preconditions.checkArgument(targetFileSize > minFileSize, + Preconditions.checkArgument( + targetFileSize > minFileSize, "Cannot set %s greater than or equal to %s, all files written will be smaller than the threshold, %s >= %s", - MIN_FILE_SIZE_BYTES, RewriteDataFiles.TARGET_FILE_SIZE_BYTES, minFileSize, targetFileSize); + MIN_FILE_SIZE_BYTES, + RewriteDataFiles.TARGET_FILE_SIZE_BYTES, + minFileSize, + targetFileSize); - Preconditions.checkArgument(targetFileSize < maxFileSize, + Preconditions.checkArgument( + targetFileSize < maxFileSize, "Cannot set %s is greater than or equal to %s, all files written will be larger than the threshold, %s >= %s", - RewriteDataFiles.TARGET_FILE_SIZE_BYTES, MAX_FILE_SIZE_BYTES, targetFileSize, maxFileSize); + RewriteDataFiles.TARGET_FILE_SIZE_BYTES, + MAX_FILE_SIZE_BYTES, + targetFileSize, + maxFileSize); - Preconditions.checkArgument(minInputFiles > 0, + Preconditions.checkArgument( + minInputFiles > 0, "Cannot set %s is less than 1. All values less than 1 have the same effect as 1. %s < 1", - MIN_INPUT_FILES, minInputFiles); + MIN_INPUT_FILES, + minInputFiles); - Preconditions.checkArgument(deleteFileThreshold > 0, + Preconditions.checkArgument( + deleteFileThreshold > 0, "Cannot set %s is less than 1. All values less than 1 have the same effect as 1. %s < 1", - DELETE_FILE_THRESHOLD, deleteFileThreshold); + DELETE_FILE_THRESHOLD, + deleteFileThreshold); } } diff --git a/core/src/main/java/org/apache/iceberg/actions/ConvertEqualityDeleteStrategy.java b/core/src/main/java/org/apache/iceberg/actions/ConvertEqualityDeleteStrategy.java index 52ef54c455fc..a8abf46cd852 100644 --- a/core/src/main/java/org/apache/iceberg/actions/ConvertEqualityDeleteStrategy.java +++ b/core/src/main/java/org/apache/iceberg/actions/ConvertEqualityDeleteStrategy.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
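The writeMaxFileSize javadoc above only states the contract ("the target size plus one half of the distance between max and target"); the method body is not part of this hunk. A minimal sketch, assuming nothing beyond that @return description:

// Hypothetical sketch of the documented writeMaxFileSize contract; not the method's actual body.
class WriteMaxFileSizeSketch {
  static long writeMaxFileSize(long targetFileSize, long maxFileSize) {
    return targetFileSize + (long) ((maxFileSize - targetFileSize) * 0.5);
  }

  public static void main(String[] args) {
    long target = 512L * 1024 * 1024; // 512 MB target
    long max = (long) (target * 1.80d); // default max-file-size threshold
    // Prints roughly 1.4x the target; splitSize(...) above uses this value as its upper cap.
    System.out.println(writeMaxFileSize(target, max));
  }
}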
*/ - package org.apache.iceberg.actions; import java.util.Map; @@ -25,30 +24,22 @@ import org.apache.iceberg.FileScanTask; import org.apache.iceberg.Table; -/** - * A strategy for the action to convert equality delete to position deletes. - */ +/** A strategy for the action to convert equality delete to position deletes. */ public interface ConvertEqualityDeleteStrategy { - /** - * Returns the name of this convert deletes strategy - */ + /** Returns the name of this convert deletes strategy */ String name(); - /** - * Returns the table being modified by this convert strategy - */ + /** Returns the table being modified by this convert strategy */ Table table(); /** - * Returns a set of options which this convert strategy can use. This is an allowed-list and any options not - * specified here will be rejected at runtime. + * Returns a set of options which this convert strategy can use. This is an allowed-list and any + * options not specified here will be rejected at runtime. */ Set validOptions(); - /** - * Sets options to be used with this strategy - */ + /** Sets options to be used with this strategy */ ConvertEqualityDeleteStrategy options(Map options); /** @@ -60,9 +51,9 @@ public interface ConvertEqualityDeleteStrategy { Iterable selectDeleteFiles(Iterable deleteFiles); /** - * Groups delete files into lists which will be processed in a single executable unit. Each group will end up being - * committed as an independent set of changes. This creates the jobs which will eventually be run as by the underlying - * Action. + * Groups delete files into lists which will be processed in a single executable unit. Each group + * will end up being committed as an independent set of changes. This creates the jobs which will + * eventually be run as by the underlying Action. * * @param dataFiles iterable of data files that contain the DeleteFile to be converted * @return iterable of lists of FileScanTasks which will be processed together diff --git a/core/src/main/java/org/apache/iceberg/actions/RewriteDataFilesActionResult.java b/core/src/main/java/org/apache/iceberg/actions/RewriteDataFilesActionResult.java index 7313d9cb0418..a3b0292d0463 100644 --- a/core/src/main/java/org/apache/iceberg/actions/RewriteDataFilesActionResult.java +++ b/core/src/main/java/org/apache/iceberg/actions/RewriteDataFilesActionResult.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.actions; import java.util.List; @@ -31,7 +30,8 @@ public class RewriteDataFilesActionResult { private List deletedDataFiles; private List addedDataFiles; - public RewriteDataFilesActionResult(List deletedDataFiles, List addedDataFiles) { + public RewriteDataFilesActionResult( + List deletedDataFiles, List addedDataFiles) { this.deletedDataFiles = deletedDataFiles; this.addedDataFiles = addedDataFiles; } diff --git a/core/src/main/java/org/apache/iceberg/actions/RewriteDataFilesCommitManager.java b/core/src/main/java/org/apache/iceberg/actions/RewriteDataFilesCommitManager.java index 277b480009f6..f6fc53bba12a 100644 --- a/core/src/main/java/org/apache/iceberg/actions/RewriteDataFilesCommitManager.java +++ b/core/src/main/java/org/apache/iceberg/actions/RewriteDataFilesCommitManager.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.actions; import java.io.Closeable; @@ -40,9 +39,7 @@ import org.slf4j.Logger; import org.slf4j.LoggerFactory; -/** - * Functionality used by RewriteDataFile Actions from different platforms to handle commits. - */ +/** Functionality used by RewriteDataFile Actions from different platforms to handle commits. */ public class RewriteDataFilesCommitManager { private static final Logger LOG = LoggerFactory.getLogger(RewriteDataFilesCommitManager.class); @@ -59,15 +56,17 @@ public RewriteDataFilesCommitManager(Table table, long startingSnapshotId) { this(table, startingSnapshotId, RewriteDataFiles.USE_STARTING_SEQUENCE_NUMBER_DEFAULT); } - public RewriteDataFilesCommitManager(Table table, long startingSnapshotId, boolean useStartingSequenceNumber) { + public RewriteDataFilesCommitManager( + Table table, long startingSnapshotId, boolean useStartingSequenceNumber) { this.table = table; this.startingSnapshotId = startingSnapshotId; this.useStartingSequenceNumber = useStartingSequenceNumber; } /** - * Perform a commit operation on the table adding and removing files as - * required for this set of file groups + * Perform a commit operation on the table adding and removing files as required for this set of + * file groups + * * @param fileGroups fileSets to commit */ public void commitFileGroups(Set fileGroups) { @@ -90,13 +89,14 @@ public void commitFileGroups(Set fileGroups) { } /** - * Clean up a specified file set by removing any files created for that operation, should - * not throw any exceptions + * Clean up a specified file set by removing any files created for that operation, should not + * throw any exceptions + * * @param fileGroup group of files which has already been rewritten */ public void abortFileGroup(RewriteFileGroup fileGroup) { - Preconditions.checkState(fileGroup.addedFiles() != null, - "Cannot abort a fileGroup that was not rewritten"); + Preconditions.checkState( + fileGroup.addedFiles() != null, "Cannot abort a fileGroup that was not rewritten"); Tasks.foreach(fileGroup.addedFiles()) .noRetry() @@ -109,8 +109,10 @@ public void commitOrClean(Set rewriteGroups) { try { commitFileGroups(rewriteGroups); } catch (CommitStateUnknownException e) { - LOG.error("Commit state unknown for {}, cannot clean up files because they may have been committed successfully.", - rewriteGroups, e); + LOG.error( + "Commit state unknown for {}, cannot clean up files because they may have been committed successfully.", + rewriteGroups, + e); throw e; } catch (Exception e) { LOG.error("Cannot commit groups {}, attempting to clean up written files", rewriteGroups, e); @@ -120,9 +122,10 @@ public void commitOrClean(Set rewriteGroups) { } /** - * An async service which allows for committing multiple file groups as their rewrites complete. The service also - * allows for partial-progress since commits can fail. Once the service has been closed no new file groups should not - * be offered. + * An async service which allows for committing multiple file groups as their rewrites complete. + * The service also allows for partial-progress since commits can fail. Once the service has been + * closed no new file groups should not be offered. 
+ * * @param rewritesPerCommit number of file groups to include in a commit * @return the service for handling commits */ @@ -138,97 +141,113 @@ public class CommitService implements Closeable { private final AtomicBoolean running = new AtomicBoolean(false); CommitService(int rewritesPerCommit) { - LOG.info("Creating commit service for table {} with {} groups per commit", table, rewritesPerCommit); + LOG.info( + "Creating commit service for table {} with {} groups per commit", + table, + rewritesPerCommit); this.rewritesPerCommit = rewritesPerCommit; - committerService = Executors.newSingleThreadExecutor(new ThreadFactoryBuilder() - .setNameFormat("Committer-Service") - .build()); + committerService = + Executors.newSingleThreadExecutor( + new ThreadFactoryBuilder().setNameFormat("Committer-Service").build()); completedRewrites = Queues.newConcurrentLinkedQueue(); committedRewrites = Lists.newArrayList(); } - /** - * Starts a single threaded executor service for handling file group commits. - */ + /** Starts a single threaded executor service for handling file group commits. */ public void start() { - Preconditions.checkState(running.compareAndSet(false, true), "Rewrite Commit service already started"); + Preconditions.checkState( + running.compareAndSet(false, true), "Rewrite Commit service already started"); LOG.info("Starting commit service for {}", table); // Partial progress commit service - committerService.execute(() -> { - while (running.get() || completedRewrites.size() > 0) { - try { - if (completedRewrites.size() == 0) { - // Give other threads a chance to make progress - Thread.sleep(100); - } - } catch (InterruptedException e) { - Thread.currentThread().interrupt(); - throw new RuntimeException("Interrupted while processing commits", e); - } - - // Either we have a full commit group, or we have completed writing and need to commit what is left over - if (completedRewrites.size() >= rewritesPerCommit || (!running.get() && completedRewrites.size() > 0)) { - Set batch = Sets.newHashSetWithExpectedSize(rewritesPerCommit); - for (int i = 0; i < rewritesPerCommit && !completedRewrites.isEmpty(); i++) { - batch.add(completedRewrites.poll()); + committerService.execute( + () -> { + while (running.get() || completedRewrites.size() > 0) { + try { + if (completedRewrites.size() == 0) { + // Give other threads a chance to make progress + Thread.sleep(100); + } + } catch (InterruptedException e) { + Thread.currentThread().interrupt(); + throw new RuntimeException("Interrupted while processing commits", e); + } + + // Either we have a full commit group, or we have completed writing and need to commit + // what is left over + if (completedRewrites.size() >= rewritesPerCommit + || (!running.get() && completedRewrites.size() > 0)) { + Set batch = Sets.newHashSetWithExpectedSize(rewritesPerCommit); + for (int i = 0; i < rewritesPerCommit && !completedRewrites.isEmpty(); i++) { + batch.add(completedRewrites.poll()); + } + + try { + commitOrClean(batch); + committedRewrites.addAll(batch); + } catch (Exception e) { + LOG.error( + "Failure during rewrite commit process, partial progress enabled. Ignoring", + e); + } + } } - - try { - commitOrClean(batch); - committedRewrites.addAll(batch); - } catch (Exception e) { - LOG.error("Failure during rewrite commit process, partial progress enabled. 
Ignoring", e); - } - } - } - }); + }); } /** - * Places a file group in the queue to be asynchronously committed either when the queue has enough elements - * to do a batch of size {@link #rewritesPerCommit} or the service has been closed. + * Places a file group in the queue to be asynchronously committed either when the queue has + * enough elements to do a batch of size {@link #rewritesPerCommit} or the service has been + * closed. + * * @param group file group to eventually be committed */ public void offer(RewriteFileGroup group) { LOG.debug("Offered to commit service: {}", group); - Preconditions.checkState(running.get(), "Cannot add rewrites to a service which has already been closed"); + Preconditions.checkState( + running.get(), "Cannot add rewrites to a service which has already been closed"); completedRewrites.add(group); } - /** - * Returns all File groups which have been committed - */ + /** Returns all File groups which have been committed */ public List results() { - Preconditions.checkState(committerService.isShutdown(), + Preconditions.checkState( + committerService.isShutdown(), "Cannot get results from a service which has not been closed"); return committedRewrites; } @Override public void close() { - Preconditions.checkState(running.compareAndSet(true, false), - "Cannot close already closed RewriteService"); + Preconditions.checkState( + running.compareAndSet(true, false), "Cannot close already closed RewriteService"); LOG.info("Closing commit service for {}", table); committerService.shutdown(); try { - // All rewrites have completed and all new files have been created, we are now waiting for the commit - // pool to finish doing it's commits to Iceberg State. In the case of partial progress this should - // have been occurring simultaneously with rewrites, if not there should be only a single commit operation. + // All rewrites have completed and all new files have been created, we are now waiting for + // the commit + // pool to finish doing it's commits to Iceberg State. In the case of partial progress this + // should + // have been occurring simultaneously with rewrites, if not there should be only a single + // commit operation. // In either case this should take much less than 10 minutes to actually complete. if (!committerService.awaitTermination(10, TimeUnit.MINUTES)) { - LOG.warn("Commit operation did not complete within 10 minutes of the files being written. This may mean " + - "that changes were not successfully committed to the the Iceberg table."); + LOG.warn( + "Commit operation did not complete within 10 minutes of the files being written. 
This may mean " + + "that changes were not successfully committed to the the Iceberg table."); } } catch (InterruptedException e) { Thread.currentThread().interrupt(); - throw new RuntimeException("Cannot complete commit for rewrite, commit service interrupted", e); + throw new RuntimeException( + "Cannot complete commit for rewrite, commit service interrupted", e); } - Preconditions.checkState(completedRewrites.isEmpty(), "File groups offered after service was closed, " + - "they were not successfully committed."); + Preconditions.checkState( + completedRewrites.isEmpty(), + "File groups offered after service was closed, " + + "they were not successfully committed."); } } } diff --git a/core/src/main/java/org/apache/iceberg/actions/RewriteFileGroup.java b/core/src/main/java/org/apache/iceberg/actions/RewriteFileGroup.java index 99ec75772301..dd4516be76c0 100644 --- a/core/src/main/java/org/apache/iceberg/actions/RewriteFileGroup.java +++ b/core/src/main/java/org/apache/iceberg/actions/RewriteFileGroup.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.actions; import java.util.Collections; @@ -30,8 +29,8 @@ import org.apache.iceberg.relocated.com.google.common.base.Preconditions; /** - * Container class representing a set of files to be rewritten by a RewriteAction and the new files which have been - * written by the action. + * Container class representing a set of files to be rewritten by a RewriteAction and the new files + * which have been written by the action. */ public class RewriteFileGroup { private final FileGroupInfo info; @@ -74,7 +73,9 @@ public String toString() { return MoreObjects.toStringHelper(this) .add("info", info) .add("numRewrittenFiles", fileScanTasks.size()) - .add("numAddedFiles", addedFiles == null ? "Rewrite Incomplete" : Integer.toString(addedFiles.size())) + .add( + "numAddedFiles", + addedFiles == null ? "Rewrite Incomplete" : Integer.toString(addedFiles.size())) .toString(); } diff --git a/core/src/main/java/org/apache/iceberg/actions/RewritePositionDeleteStrategy.java b/core/src/main/java/org/apache/iceberg/actions/RewritePositionDeleteStrategy.java index 0b2c48a8a3a1..d44b9bbf2622 100644 --- a/core/src/main/java/org/apache/iceberg/actions/RewritePositionDeleteStrategy.java +++ b/core/src/main/java/org/apache/iceberg/actions/RewritePositionDeleteStrategy.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.actions; import java.util.Map; @@ -24,30 +23,22 @@ import org.apache.iceberg.DeleteFile; import org.apache.iceberg.Table; -/** - * A strategy for an action to rewrite position delete files. - */ +/** A strategy for an action to rewrite position delete files. */ public interface RewritePositionDeleteStrategy { - /** - * Returns the name of this rewrite deletes strategy - */ + /** Returns the name of this rewrite deletes strategy */ String name(); - /** - * Returns the table being modified by this rewrite strategy - */ + /** Returns the table being modified by this rewrite strategy */ Table table(); /** - * Returns a set of options which this rewrite strategy can use. This is an allowed-list and any options not - * specified here will be rejected at runtime. + * Returns a set of options which this rewrite strategy can use. This is an allowed-list and any + * options not specified here will be rejected at runtime. 
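Since the CommitService rewrite above is spread across many reflowed lines, a hedged usage sketch of its lifecycle may help: start the service, offer file groups as their rewrites finish, close it, then read the committed results. The table, snapshot id, and file groups below are placeholders, and the service(int) factory method is assumed from the surrounding javadoc; the constructor, start(), offer(...), close(), and results() all appear in this patch.

// Hypothetical usage sketch of RewriteDataFilesCommitManager.CommitService.
import java.util.List;
import org.apache.iceberg.Table;
import org.apache.iceberg.actions.RewriteDataFilesCommitManager;
import org.apache.iceberg.actions.RewriteFileGroup;

class CommitServiceUsageSketch {
  List<RewriteFileGroup> commitAsGroupsFinish(
      Table table, long startingSnapshotId, Iterable<RewriteFileGroup> finishedGroups) {
    RewriteDataFilesCommitManager commitManager =
        new RewriteDataFilesCommitManager(table, startingSnapshotId);
    // Commit every 3 completed file groups instead of waiting for the whole rewrite (partial progress).
    RewriteDataFilesCommitManager.CommitService commitService = commitManager.service(3);
    commitService.start();
    try {
      for (RewriteFileGroup group : finishedGroups) {
        commitService.offer(group); // queued and committed asynchronously in batches
      }
    } finally {
      commitService.close(); // waits for the committer thread to drain the queue
    }
    return commitService.results(); // file groups that were successfully committed
  }
}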
*/ Set validOptions(); - /** - * Sets options to be used with this strategy - */ + /** Sets options to be used with this strategy */ RewritePositionDeleteStrategy options(Map options); /** @@ -59,9 +50,9 @@ public interface RewritePositionDeleteStrategy { Iterable selectDeleteFiles(Iterable deleteFiles); /** - * Groups into lists which will be processed in a single executable unit. Each group will end up being - * committed as an independent set of changes. This creates the jobs which will eventually be run as by the underlying - * Action. + * Groups into lists which will be processed in a single executable unit. Each group will end up + * being committed as an independent set of changes. This creates the jobs which will eventually + * be run as by the underlying Action. * * @param deleteFiles iterable of DeleteFile to be rewritten * @return iterable of lists of FileScanTasks which will be processed together diff --git a/core/src/main/java/org/apache/iceberg/actions/RewriteStrategy.java b/core/src/main/java/org/apache/iceberg/actions/RewriteStrategy.java index 72a10680caf2..36fc7247528e 100644 --- a/core/src/main/java/org/apache/iceberg/actions/RewriteStrategy.java +++ b/core/src/main/java/org/apache/iceberg/actions/RewriteStrategy.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.actions; import java.io.Serializable; @@ -28,25 +27,19 @@ import org.apache.iceberg.Table; public interface RewriteStrategy extends Serializable { - /** - * Returns the name of this rewrite strategy - */ + /** Returns the name of this rewrite strategy */ String name(); - /** - * Returns the table being modified by this rewrite strategy - */ + /** Returns the table being modified by this rewrite strategy */ Table table(); /** - * Returns a set of options which this rewrite strategy can use. This is an allowed-list and any options not - * specified here will be rejected at runtime. + * Returns a set of options which this rewrite strategy can use. This is an allowed-list and any + * options not specified here will be rejected at runtime. */ Set validOptions(); - /** - * Sets options to be used with this strategy - */ + /** Sets options to be used with this strategy */ RewriteStrategy options(Map options); /** @@ -58,9 +51,9 @@ public interface RewriteStrategy extends Serializable { Iterable selectFilesToRewrite(Iterable dataFiles); /** - * Groups file scans into lists which will be processed in a single executable unit. Each group will end up being - * committed as an independent set of changes. This creates the jobs which will eventually be run as by the underlying - * Action. + * Groups file scans into lists which will be processed in a single executable unit. Each group + * will end up being committed as an independent set of changes. This creates the jobs which will + * eventually be run as by the underlying Action. * * @param dataFiles iterable of FileScanTasks to be rewritten * @return iterable of lists of FileScanTasks which will be processed together @@ -68,8 +61,8 @@ public interface RewriteStrategy extends Serializable { Iterable> planFileGroups(Iterable dataFiles); /** - * Method which will rewrite files based on this particular RewriteStrategy's algorithm. - * This will most likely be Action framework specific (Spark/Presto/Flink ....). + * Method which will rewrite files based on this particular RewriteStrategy's algorithm. This will + * most likely be Action framework specific (Spark/Presto/Flink ....). 
* * @param filesToRewrite a group of files to be rewritten together * @return a set of newly written files diff --git a/core/src/main/java/org/apache/iceberg/actions/SnapshotUpdateAction.java b/core/src/main/java/org/apache/iceberg/actions/SnapshotUpdateAction.java index 13164a91ed7a..30ef9aa1a74f 100644 --- a/core/src/main/java/org/apache/iceberg/actions/SnapshotUpdateAction.java +++ b/core/src/main/java/org/apache/iceberg/actions/SnapshotUpdateAction.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.actions; public interface SnapshotUpdateAction extends Action { diff --git a/core/src/main/java/org/apache/iceberg/actions/SortStrategy.java b/core/src/main/java/org/apache/iceberg/actions/SortStrategy.java index 96914de09525..d08f0940f5d8 100644 --- a/core/src/main/java/org/apache/iceberg/actions/SortStrategy.java +++ b/core/src/main/java/org/apache/iceberg/actions/SortStrategy.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.actions; import java.util.Map; @@ -27,17 +26,17 @@ import org.apache.iceberg.util.SortOrderUtil; /** - * A rewrite strategy for data files which aims to reorder data with data files to optimally lay them out - * in relation to a column. For example, if the Sort strategy is used on a set of files which is ordered - * by column x and original has files File A (x: 0 - 50), File B ( x: 10 - 40) and File C ( x: 30 - 60), - * this Strategy will attempt to rewrite those files into File A' (x: 0-20), File B' (x: 21 - 40), - * File C' (x: 41 - 60). - *

- * Currently the there is no file overlap detection and we will rewrite all files if {@link SortStrategy#REWRITE_ALL} - * is true (default: false). If this property is disabled any files that would be chosen by - * {@link BinPackStrategy} will be rewrite candidates. - *

- * In the future other algorithms for determining files to rewrite will be provided. + * A rewrite strategy for data files which aims to reorder data with data files to optimally lay + * them out in relation to a column. For example, if the Sort strategy is used on a set of files + * which is ordered by column x and original has files File A (x: 0 - 50), File B ( x: 10 - 40) and + * File C ( x: 30 - 60), this Strategy will attempt to rewrite those files into File A' (x: 0-20), + * File B' (x: 21 - 40), File C' (x: 41 - 60). + * + *

Currently there is no file overlap detection and we will rewrite all files if {@link + * SortStrategy#REWRITE_ALL} is true (default: false). If this property is disabled, any files that + * would be chosen by {@link BinPackStrategy} will be rewrite candidates. + * + *

In the future other algorithms for determining files to rewrite will be provided. */ public abstract class SortStrategy extends BinPackStrategy { @@ -45,6 +44,7 @@ public abstract class SortStrategy extends BinPackStrategy { /** * Sets the sort order to be used in this strategy when rewriting files + * * @param order the order to use * @return this for method chaining */ @@ -65,9 +65,7 @@ public String name() { @Override public Set validOptions() { - return ImmutableSet.builder() - .addAll(super.validOptions()) - .build(); + return ImmutableSet.builder().addAll(super.validOptions()).build(); } @Override @@ -83,10 +81,12 @@ public RewriteStrategy options(Map options) { } protected void validateOptions() { - Preconditions.checkArgument(!sortOrder.isUnsorted(), - "Can't use %s when there is no sort order, either define table %s's sort order or set sort" + - "order in the action", - name(), table().name()); + Preconditions.checkArgument( + !sortOrder.isUnsorted(), + "Can't use %s when there is no sort order, either define table %s's sort order or set sort" + + "order in the action", + name(), + table().name()); SortOrder.checkCompatibility(sortOrder, table().schema()); } diff --git a/core/src/main/java/org/apache/iceberg/avro/Avro.java b/core/src/main/java/org/apache/iceberg/avro/Avro.java index 07a8a2df1d9a..85cc8d902026 100644 --- a/core/src/main/java/org/apache/iceberg/avro/Avro.java +++ b/core/src/main/java/org/apache/iceberg/avro/Avro.java @@ -16,9 +16,15 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.avro; +import static org.apache.iceberg.TableProperties.AVRO_COMPRESSION; +import static org.apache.iceberg.TableProperties.AVRO_COMPRESSION_DEFAULT; +import static org.apache.iceberg.TableProperties.AVRO_COMPRESSION_LEVEL; +import static org.apache.iceberg.TableProperties.AVRO_COMPRESSION_LEVEL_DEFAULT; +import static org.apache.iceberg.TableProperties.DELETE_AVRO_COMPRESSION; +import static org.apache.iceberg.TableProperties.DELETE_AVRO_COMPRESSION_LEVEL; + import java.io.IOException; import java.util.List; import java.util.Locale; @@ -60,16 +66,8 @@ import org.apache.iceberg.relocated.com.google.common.collect.Maps; import org.apache.iceberg.util.ArrayUtil; -import static org.apache.iceberg.TableProperties.AVRO_COMPRESSION; -import static org.apache.iceberg.TableProperties.AVRO_COMPRESSION_DEFAULT; -import static org.apache.iceberg.TableProperties.AVRO_COMPRESSION_LEVEL; -import static org.apache.iceberg.TableProperties.AVRO_COMPRESSION_LEVEL_DEFAULT; -import static org.apache.iceberg.TableProperties.DELETE_AVRO_COMPRESSION; -import static org.apache.iceberg.TableProperties.DELETE_AVRO_COMPRESSION_LEVEL; - public class Avro { - private Avro() { - } + private Avro() {} private enum Codec { UNCOMPRESSED, @@ -165,7 +163,8 @@ public WriteBuilder overwrite(boolean enabled) { } // supposed to always be a private method used strictly by data and delete write builders - private WriteBuilder createContextFunc(Function, Context> newCreateContextFunc) { + private WriteBuilder createContextFunc( + Function, Context> newCreateContextFunc) { this.createContextFunc = newCreateContextFunc; return this; } @@ -188,7 +187,14 @@ public FileAppender build() throws IOException { CodecFactory codec = context.codec(); return new AvroFileAppender<>( - schema, AvroSchemaUtil.convert(schema, name), file, writerFunc, codec, metadata, metricsConfig, overwrite); + schema, + AvroSchemaUtil.convert(schema, name), + file, + writerFunc, + codec, + 
metadata, + metricsConfig, + overwrite); } private static class Context { @@ -200,7 +206,8 @@ private Context(CodecFactory codec) { static Context dataContext(Map config) { String codecAsString = config.getOrDefault(AVRO_COMPRESSION, AVRO_COMPRESSION_DEFAULT); - String compressionLevel = config.getOrDefault(AVRO_COMPRESSION_LEVEL, AVRO_COMPRESSION_LEVEL_DEFAULT); + String compressionLevel = + config.getOrDefault(AVRO_COMPRESSION_LEVEL, AVRO_COMPRESSION_LEVEL_DEFAULT); CodecFactory codec = toCodec(codecAsString, compressionLevel); return new Context(codec); @@ -211,8 +218,10 @@ static Context deleteContext(Map config) { Context dataContext = dataContext(config); String codecAsString = config.get(DELETE_AVRO_COMPRESSION); - String compressionLevel = config.getOrDefault(DELETE_AVRO_COMPRESSION_LEVEL, AVRO_COMPRESSION_LEVEL_DEFAULT); - CodecFactory codec = codecAsString != null ? toCodec(codecAsString, compressionLevel) : dataContext.codec(); + String compressionLevel = + config.getOrDefault(DELETE_AVRO_COMPRESSION_LEVEL, AVRO_COMPRESSION_LEVEL_DEFAULT); + CodecFactory codec = + codecAsString != null ? toCodec(codecAsString, compressionLevel) : dataContext.codec(); return new Context(codec); } @@ -228,12 +237,14 @@ private static CodecFactory toCodec(String codecAsString, String compressionLeve codecFactory = CodecFactory.snappyCodec(); break; case ZSTD: - codecFactory = CodecFactory.zstandardCodec( - compressionLevelAsInt(compressionLevel, ZSTD_COMPRESSION_LEVEL_DEFAULT)); + codecFactory = + CodecFactory.zstandardCodec( + compressionLevelAsInt(compressionLevel, ZSTD_COMPRESSION_LEVEL_DEFAULT)); break; case GZIP: - codecFactory = CodecFactory.deflateCodec( - compressionLevelAsInt(compressionLevel, GZIP_COMPRESSION_LEVEL_DEFAULT)); + codecFactory = + CodecFactory.deflateCodec( + compressionLevelAsInt(compressionLevel, GZIP_COMPRESSION_LEVEL_DEFAULT)); break; default: throw new IllegalArgumentException("Unsupported compression codec: " + codecAsString); @@ -244,8 +255,11 @@ private static CodecFactory toCodec(String codecAsString, String compressionLeve return codecFactory; } - private static int compressionLevelAsInt(String tableCompressionLevel, int defaultCompressionLevel) { - return tableCompressionLevel != null ? Integer.parseInt(tableCompressionLevel) : defaultCompressionLevel; + private static int compressionLevelAsInt( + String tableCompressionLevel, int defaultCompressionLevel) { + return tableCompressionLevel != null + ? 
Integer.parseInt(tableCompressionLevel) + : defaultCompressionLevel; } CodecFactory codec() { @@ -340,11 +354,13 @@ public DataWriteBuilder withSortOrder(SortOrder newSortOrder) { public DataWriter build() throws IOException { Preconditions.checkArgument(spec != null, "Cannot create data writer without spec"); - Preconditions.checkArgument(spec.isUnpartitioned() || partition != null, + Preconditions.checkArgument( + spec.isUnpartitioned() || partition != null, "Partition must not be null when creating data writer for partitioned spec"); FileAppender fileAppender = appenderBuilder.build(); - return new DataWriter<>(fileAppender, FileFormat.AVRO, location, spec, partition, keyMetadata, sortOrder); + return new DataWriter<>( + fileAppender, FileFormat.AVRO, location, spec, partition, keyMetadata, sortOrder); } } @@ -451,18 +467,25 @@ public DeleteWriteBuilder withSortOrder(SortOrder newSortOrder) { } public EqualityDeleteWriter buildEqualityWriter() throws IOException { - Preconditions.checkState(rowSchema != null, "Cannot create equality delete file without a schema"); - Preconditions.checkState(equalityFieldIds != null, "Cannot create equality delete file without delete field ids"); - Preconditions.checkState(createWriterFunc != null, + Preconditions.checkState( + rowSchema != null, "Cannot create equality delete file without a schema"); + Preconditions.checkState( + equalityFieldIds != null, "Cannot create equality delete file without delete field ids"); + Preconditions.checkState( + createWriterFunc != null, "Cannot create equality delete file unless createWriterFunc is set"); - Preconditions.checkArgument(spec != null, "Spec must not be null when creating equality delete writer"); - Preconditions.checkArgument(spec.isUnpartitioned() || partition != null, + Preconditions.checkArgument( + spec != null, "Spec must not be null when creating equality delete writer"); + Preconditions.checkArgument( + spec.isUnpartitioned() || partition != null, "Partition must not be null for partitioned writes"); meta("delete-type", "equality"); - meta("delete-field-ids", IntStream.of(equalityFieldIds) - .mapToObj(Objects::toString) - .collect(Collectors.joining(", "))); + meta( + "delete-field-ids", + IntStream.of(equalityFieldIds) + .mapToObj(Objects::toString) + .collect(Collectors.joining(", "))); // the appender uses the row schema without extra columns appenderBuilder.schema(rowSchema); @@ -470,16 +493,26 @@ public EqualityDeleteWriter buildEqualityWriter() throws IOException { appenderBuilder.createContextFunc(WriteBuilder.Context::deleteContext); return new EqualityDeleteWriter<>( - appenderBuilder.build(), FileFormat.AVRO, location, spec, partition, keyMetadata, sortOrder, + appenderBuilder.build(), + FileFormat.AVRO, + location, + spec, + partition, + keyMetadata, + sortOrder, equalityFieldIds); } public PositionDeleteWriter buildPositionWriter() throws IOException { - Preconditions.checkState(equalityFieldIds == null, "Cannot create position delete file using delete field ids"); - Preconditions.checkArgument(spec != null, "Spec must not be null when creating position delete writer"); - Preconditions.checkArgument(spec.isUnpartitioned() || partition != null, + Preconditions.checkState( + equalityFieldIds == null, "Cannot create position delete file using delete field ids"); + Preconditions.checkArgument( + spec != null, "Spec must not be null when creating position delete writer"); + Preconditions.checkArgument( + spec.isUnpartitioned() || partition != null, "Partition must not be null for 
partitioned writes"); - Preconditions.checkArgument(rowSchema == null || createWriterFunc != null, + Preconditions.checkArgument( + rowSchema == null || createWriterFunc != null, "Create function should be provided if we write row data"); meta("delete-type", "position"); @@ -494,7 +527,8 @@ public PositionDeleteWriter buildPositionWriter() throws IOException { } else { appenderBuilder.schema(DeleteSchemaUtil.pathPosSchema()); - // We ignore the 'createWriterFunc' and 'rowSchema' even if is provided, since we do not write row data itself + // We ignore the 'createWriterFunc' and 'rowSchema' even if is provided, since we do not + // write row data itself appenderBuilder.createWriterFunc(ignored -> new PositionDatumWriter()); } @@ -505,16 +539,13 @@ public PositionDeleteWriter buildPositionWriter() throws IOException { } } - /** - * A {@link DatumWriter} implementation that wraps another to produce position deletes. - */ + /** A {@link DatumWriter} implementation that wraps another to produce position deletes. */ private static class PositionDatumWriter implements MetricsAwareDatumWriter> { private static final ValueWriter PATH_WRITER = ValueWriters.strings(); private static final ValueWriter POS_WRITER = ValueWriters.longs(); @Override - public void setSchema(Schema schema) { - } + public void setSchema(Schema schema) {} @Override public void write(PositionDelete delete, Encoder out) throws IOException { @@ -529,11 +560,13 @@ public Stream metrics() { } /** - * A {@link DatumWriter} implementation that wraps another to produce position deletes with row data. + * A {@link DatumWriter} implementation that wraps another to produce position deletes with row + * data. * * @param the type of datum written as a deleted row */ - private static class PositionAndRowDatumWriter implements MetricsAwareDatumWriter> { + private static class PositionAndRowDatumWriter + implements MetricsAwareDatumWriter> { private static final ValueWriter PATH_WRITER = ValueWriters.strings(); private static final ValueWriter POS_WRITER = ValueWriters.longs(); @@ -577,12 +610,15 @@ public static class ReadBuilder { private org.apache.iceberg.Schema schema = null; private Function> createReaderFunc = null; private BiFunction> createReaderBiFunc = null; + @SuppressWarnings("UnnecessaryLambda") - private final Function> defaultCreateReaderFunc = readSchema -> { - GenericAvroReader reader = new GenericAvroReader<>(readSchema); - reader.setClassLoader(loader); - return reader; - }; + private final Function> defaultCreateReaderFunc = + readSchema -> { + GenericAvroReader reader = new GenericAvroReader<>(readSchema); + reader.setClassLoader(loader); + return reader; + }; + private Long start = null; private Long length = null; @@ -597,7 +633,8 @@ public ReadBuilder createReaderFunc(Function> readerFunct return this; } - public ReadBuilder createReaderFunc(BiFunction> readerFunction) { + public ReadBuilder createReaderFunc( + BiFunction> readerFunction) { Preconditions.checkState(createReaderFunc == null, "Cannot set multiple createReaderFunc"); this.createReaderBiFunc = readerFunction; return this; @@ -657,14 +694,18 @@ public AvroIterable build() { readerFunc = defaultCreateReaderFunc; } - return new AvroIterable<>(file, + return new AvroIterable<>( + file, new ProjectionDatumReader<>(readerFunc, schema, renames, nameMapping), - start, length, reuseContainers); + start, + length, + reuseContainers); } } /** * Returns number of rows in specified Avro file + * * @param file Avro file * @return number of rows in file */ diff --git 
a/core/src/main/java/org/apache/iceberg/avro/AvroCustomOrderSchemaVisitor.java b/core/src/main/java/org/apache/iceberg/avro/AvroCustomOrderSchemaVisitor.java index bdef9210176d..575e44a83e42 100644 --- a/core/src/main/java/org/apache/iceberg/avro/AvroCustomOrderSchemaVisitor.java +++ b/core/src/main/java/org/apache/iceberg/avro/AvroCustomOrderSchemaVisitor.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.avro; import java.util.Deque; @@ -33,8 +32,8 @@ public static T visit(Schema schema, AvroCustomOrderSchemaVisitor v case RECORD: // check to make sure this hasn't been visited before String name = schema.getFullName(); - Preconditions.checkState(!visitor.recordLevels.contains(name), - "Cannot process recursive Avro record %s", name); + Preconditions.checkState( + !visitor.recordLevels.contains(name), "Cannot process recursive Avro record %s", name); visitor.recordLevels.push(name); diff --git a/core/src/main/java/org/apache/iceberg/avro/AvroEncoderUtil.java b/core/src/main/java/org/apache/iceberg/avro/AvroEncoderUtil.java index 74e8ede1add6..ba3c6fece7f9 100644 --- a/core/src/main/java/org/apache/iceberg/avro/AvroEncoderUtil.java +++ b/core/src/main/java/org/apache/iceberg/avro/AvroEncoderUtil.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.avro; import java.io.ByteArrayInputStream; @@ -36,8 +35,7 @@ public class AvroEncoderUtil { - private AvroEncoderUtil() { - } + private AvroEncoderUtil() {} static { LogicalTypes.register(LogicalMap.NAME, schema -> LogicalMap.get()); @@ -72,8 +70,11 @@ public static T decode(byte[] data) throws IOException { // Read the magic bytes byte header0 = dataInput.readByte(); byte header1 = dataInput.readByte(); - Preconditions.checkState(header0 == MAGIC_BYTES[0] && header1 == MAGIC_BYTES[1], - "Unrecognized header bytes: 0x%02X 0x%02X", header0, header1); + Preconditions.checkState( + header0 == MAGIC_BYTES[0] && header1 == MAGIC_BYTES[1], + "Unrecognized header bytes: 0x%02X 0x%02X", + header0, + header1); // Read avro schema Schema avroSchema = new Schema.Parser().parse(dataInput.readUTF()); diff --git a/core/src/main/java/org/apache/iceberg/avro/AvroFileAppender.java b/core/src/main/java/org/apache/iceberg/avro/AvroFileAppender.java index f4f6c8482917..4ffe5432b872 100644 --- a/core/src/main/java/org/apache/iceberg/avro/AvroFileAppender.java +++ b/core/src/main/java/org/apache/iceberg/avro/AvroFileAppender.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.avro; import java.io.IOException; @@ -43,10 +42,16 @@ class AvroFileAppender implements FileAppender { private long numRecords = 0L; private boolean isClosed = false; - AvroFileAppender(org.apache.iceberg.Schema icebergSchema, Schema schema, OutputFile file, - Function> createWriterFunc, - CodecFactory codec, Map metadata, - MetricsConfig metricsConfig, boolean overwrite) throws IOException { + AvroFileAppender( + org.apache.iceberg.Schema icebergSchema, + Schema schema, + OutputFile file, + Function> createWriterFunc, + CodecFactory codec, + Map metadata, + MetricsConfig metricsConfig, + boolean overwrite) + throws IOException { this.icebergSchema = icebergSchema; this.stream = overwrite ? 
file.createOrOverwrite() : file.create(); this.datumWriter = createWriterFunc.apply(schema); @@ -66,8 +71,7 @@ public void add(D datum) { @Override public Metrics metrics() { - Preconditions.checkState(isClosed, - "Cannot return metrics while appending to an open file."); + Preconditions.checkState(isClosed, "Cannot return metrics while appending to an open file."); return AvroMetrics.fromWriter(datumWriter, icebergSchema, numRecords, metricsConfig); } @@ -95,10 +99,13 @@ public void close() throws IOException { @SuppressWarnings("unchecked") private static DataFileWriter newAvroWriter( - Schema schema, PositionOutputStream stream, DatumWriter metricsAwareDatumWriter, - CodecFactory codec, Map metadata) throws IOException { - DataFileWriter writer = new DataFileWriter<>( - (DatumWriter) metricsAwareDatumWriter); + Schema schema, + PositionOutputStream stream, + DatumWriter metricsAwareDatumWriter, + CodecFactory codec, + Map metadata) + throws IOException { + DataFileWriter writer = new DataFileWriter<>((DatumWriter) metricsAwareDatumWriter); writer.setCodec(codec); diff --git a/core/src/main/java/org/apache/iceberg/avro/AvroIO.java b/core/src/main/java/org/apache/iceberg/avro/AvroIO.java index c569d93b1fa5..cf575fb0e8bf 100644 --- a/core/src/main/java/org/apache/iceberg/avro/AvroIO.java +++ b/core/src/main/java/org/apache/iceberg/avro/AvroIO.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.avro; import java.io.EOFException; @@ -36,35 +35,33 @@ import org.apache.iceberg.io.SeekableInputStream; class AvroIO { - private static final byte[] AVRO_MAGIC = new byte[] { 'O', 'b', 'j', 1 }; + private static final byte[] AVRO_MAGIC = new byte[] {'O', 'b', 'j', 1}; private static final ValueReader MAGIC_READER = ValueReaders.fixed(AVRO_MAGIC.length); - private static final ValueReader> META_READER = ValueReaders.map( - ValueReaders.strings(), ValueReaders.strings()); + private static final ValueReader> META_READER = + ValueReaders.map(ValueReaders.strings(), ValueReaders.strings()); private static final ValueReader SYNC_READER = ValueReaders.fixed(16); - private AvroIO() { - } + private AvroIO() {} - private static final Class fsDataInputStreamClass = DynClasses.builder() - .impl("org.apache.hadoop.fs.FSDataInputStream") - .orNull() - .build(); + private static final Class fsDataInputStreamClass = + DynClasses.builder().impl("org.apache.hadoop.fs.FSDataInputStream").orNull().build(); private static final boolean relocated = "org.apache.avro.file.SeekableInput".equals(SeekableInput.class.getName()); private static final DynConstructors.Ctor avroFsInputCtor = - !relocated && fsDataInputStreamClass != null ? - DynConstructors.builder(SeekableInput.class) + !relocated && fsDataInputStreamClass != null + ? 
DynConstructors.builder(SeekableInput.class) .impl("org.apache.hadoop.fs.AvroFSInput", fsDataInputStreamClass, Long.TYPE) - .build() : - null; + .build() + : null; static SeekableInput stream(SeekableInputStream stream, long length) { if (stream instanceof DelegatingInputStream) { InputStream wrapped = ((DelegatingInputStream) stream).getDelegate(); - if (avroFsInputCtor != null && fsDataInputStreamClass != null && - fsDataInputStreamClass.isInstance(wrapped)) { + if (avroFsInputCtor != null + && fsDataInputStreamClass != null + && fsDataInputStreamClass.isInstance(wrapped)) { return avroFsInputCtor.newInstance(wrapped, length); } } @@ -159,7 +156,8 @@ static long findStartingRowPos(Supplier open, long start) { // each block consists of: // row-count|compressed-size-in-bytes|block-bytes|sync - // it is necessary to read the header here because this is the only way to get the expected file sync bytes + // it is necessary to read the header here because this is the only way to get the expected + // file sync bytes byte[] magic = MAGIC_READER.read(decoder, null); if (!Arrays.equals(AVRO_MAGIC, magic)) { throw new InvalidAvroMagicException("Not an Avro file"); @@ -168,7 +166,8 @@ static long findStartingRowPos(Supplier open, long start) { META_READER.read(decoder, null); // ignore the file metadata, it isn't needed byte[] fileSync = SYNC_READER.read(decoder, null); - // the while loop reads row counts and seeks past the block bytes until the next sync pos is >= start, which + // the while loop reads row counts and seeks past the block bytes until the next sync pos is + // >= start, which // indicates that the next sync is the start of the split. byte[] blockSync = new byte[16]; long nextSyncPos = in.getPos(); diff --git a/core/src/main/java/org/apache/iceberg/avro/AvroIterable.java b/core/src/main/java/org/apache/iceberg/avro/AvroIterable.java index fd6337941c48..49acb8010b61 100644 --- a/core/src/main/java/org/apache/iceberg/avro/AvroIterable.java +++ b/core/src/main/java/org/apache/iceberg/avro/AvroIterable.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.avro; import java.io.IOException; @@ -42,8 +41,8 @@ public class AvroIterable extends CloseableGroup implements CloseableIterable private final boolean reuseContainers; private Map metadata = null; - AvroIterable(InputFile file, DatumReader reader, - Long start, Long length, boolean reuseContainers) { + AvroIterable( + InputFile file, DatumReader reader, Long start, Long length, boolean reuseContainers) { this.file = file; this.reader = reader; this.start = start; @@ -78,8 +77,8 @@ public CloseableIterator iterator() { if (start != null) { if (reader instanceof SupportsRowPosition) { - ((SupportsRowPosition) reader).setRowPositionSupplier( - () -> AvroIO.findStartingRowPos(file::newStream, start)); + ((SupportsRowPosition) reader) + .setRowPositionSupplier(() -> AvroIO.findStartingRowPos(file::newStream, start)); } fileReader = new AvroRangeIterator<>(fileReader, start, end); } else if (reader instanceof SupportsRowPosition) { @@ -97,8 +96,8 @@ public CloseableIterator iterator() { private DataFileReader newFileReader() { try { - return (DataFileReader) DataFileReader.openReader( - AvroIO.stream(file.newStream(), file.getLength()), reader); + return (DataFileReader) + DataFileReader.openReader(AvroIO.stream(file.newStream(), file.getLength()), reader); } catch (IOException e) { throw new RuntimeIOException(e, "Failed to open file: %s", file); } diff --git a/core/src/main/java/org/apache/iceberg/avro/AvroMetrics.java b/core/src/main/java/org/apache/iceberg/avro/AvroMetrics.java index d0bf308dc2f6..c0d1b44f29ce 100644 --- a/core/src/main/java/org/apache/iceberg/avro/AvroMetrics.java +++ b/core/src/main/java/org/apache/iceberg/avro/AvroMetrics.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.avro; import org.apache.avro.io.DatumWriter; @@ -26,11 +25,13 @@ public class AvroMetrics { - private AvroMetrics() { - } + private AvroMetrics() {} - static Metrics fromWriter(DatumWriter datumWriter, Schema schema, long numRecords, - MetricsConfig inputMetricsConfig) { + static Metrics fromWriter( + DatumWriter datumWriter, + Schema schema, + long numRecords, + MetricsConfig inputMetricsConfig) { // TODO will populate in following PRs if datum writer is a MetricsAwareDatumWriter return new Metrics(numRecords, null, null, null, null); } diff --git a/core/src/main/java/org/apache/iceberg/avro/AvroSchemaUtil.java b/core/src/main/java/org/apache/iceberg/avro/AvroSchemaUtil.java index 71faf3bc1ce6..46c17722f8f7 100644 --- a/core/src/main/java/org/apache/iceberg/avro/AvroSchemaUtil.java +++ b/core/src/main/java/org/apache/iceberg/avro/AvroSchemaUtil.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.avro; import java.util.List; @@ -38,8 +37,7 @@ public class AvroSchemaUtil { - private AvroSchemaUtil() { - } + private AvroSchemaUtil() {} // Original Iceberg field name corresponding to a sanitized Avro name public static final String ICEBERG_FIELD_NAME_PROP = "iceberg-field-name"; @@ -55,13 +53,12 @@ private AvroSchemaUtil() { private static final Schema.Type UNION = Schema.Type.UNION; private static final Schema.Type RECORD = Schema.Type.RECORD; - public static Schema convert(org.apache.iceberg.Schema schema, - String tableName) { + public static Schema convert(org.apache.iceberg.Schema schema, String tableName) { return convert(schema, ImmutableMap.of(schema.asStruct(), tableName)); } - public static Schema convert(org.apache.iceberg.Schema schema, - Map names) { + public static Schema convert( + org.apache.iceberg.Schema schema, Map names) { return TypeUtil.visit(schema, new TypeToSchema(names)); } @@ -92,18 +89,22 @@ static boolean hasIds(Schema schema) { /** * Check if any of the nodes in a given avro schema is missing an ID - *

<p> - * To have an ID for a node: + * + * <p>To have an ID for a node: + * * <ul> *   <li>a field node under struct (record) schema should have {@link #FIELD_ID_PROP} property *   <li>an element node under list (array) schema should have {@link #ELEMENT_ID_PROP} property - *   <li>a pair of key and value node under map schema should have {@link #KEY_ID_PROP} and - * {@link #VALUE_ID_PROP} respectively + *   <li>a pair of key and value node under map schema should have {@link #KEY_ID_PROP} and {@link + *       #VALUE_ID_PROP} respectively *   <li>a primitive node is not assigned any ID properties * </ul> + * * <p>
+ * * @param schema an Avro Schema - * @return true if any of the nodes of the given Avro Schema is missing an ID property, false otherwise + * @return true if any of the nodes of the given Avro Schema is missing an ID property, false + * otherwise */ static boolean missingIds(Schema schema) { return AvroCustomOrderSchemaVisitor.visit(schema, new MissingIds()); @@ -115,23 +116,26 @@ public static Map convertTypes(Types.StructType type, String name) return ImmutableMap.copyOf(converter.getConversionMap()); } - public static Schema pruneColumns(Schema schema, Set selectedIds, NameMapping nameMapping) { + public static Schema pruneColumns( + Schema schema, Set selectedIds, NameMapping nameMapping) { return new PruneColumns(selectedIds, nameMapping).rootSchema(schema); } - public static Schema buildAvroProjection(Schema schema, org.apache.iceberg.Schema expected, - Map renames) { + public static Schema buildAvroProjection( + Schema schema, org.apache.iceberg.Schema expected, Map renames) { return AvroCustomOrderSchemaVisitor.visit(schema, new BuildAvroProjection(expected, renames)); } public static boolean isTimestamptz(Schema schema) { LogicalType logicalType = schema.getLogicalType(); - if (logicalType instanceof LogicalTypes.TimestampMillis || logicalType instanceof LogicalTypes.TimestampMicros) { + if (logicalType instanceof LogicalTypes.TimestampMillis + || logicalType instanceof LogicalTypes.TimestampMicros) { // timestamptz is adjusted to UTC Object value = schema.getObjectProp(ADJUST_TO_UTC_PROP); if (value == null) { - // not all avro timestamp logical types will have the adjust_to_utc prop, default to timestamp without timezone + // not all avro timestamp logical types will have the adjust_to_utc prop, default to + // timestamp without timezone return false; } else if (value instanceof Boolean) { return (Boolean) value; @@ -156,8 +160,8 @@ public static boolean isOptionSchema(Schema schema) { static Schema toOption(Schema schema) { if (schema.getType() == UNION) { - Preconditions.checkArgument(isOptionSchema(schema), - "Union schemas are not supported: %s", schema); + Preconditions.checkArgument( + isOptionSchema(schema), "Union schemas are not supported: %s", schema); return schema; } else { return Schema.createUnion(NULL, schema); @@ -165,10 +169,10 @@ static Schema toOption(Schema schema) { } static Schema fromOption(Schema schema) { - Preconditions.checkArgument(schema.getType() == UNION, - "Expected union schema but was passed: %s", schema); - Preconditions.checkArgument(schema.getTypes().size() == 2, - "Expected optional schema, but was passed: %s", schema); + Preconditions.checkArgument( + schema.getType() == UNION, "Expected union schema but was passed: %s", schema); + Preconditions.checkArgument( + schema.getTypes().size() == 2, "Expected optional schema, but was passed: %s", schema); if (schema.getTypes().get(0).getType() == Schema.Type.NULL) { return schema.getTypes().get(1); } else { @@ -177,8 +181,8 @@ static Schema fromOption(Schema schema) { } static Schema fromOptions(List options) { - Preconditions.checkArgument(options.size() == 2, - "Expected two schemas, but was passed: %s options", options.size()); + Preconditions.checkArgument( + options.size() == 2, "Expected two schemas, but was passed: %s options", options.size()); if (options.get(0).getType() == Schema.Type.NULL) { return options.get(1); } else { @@ -190,24 +194,35 @@ public static boolean isKeyValueSchema(Schema schema) { return schema.getType() == RECORD && schema.getFields().size() == 2; } - static Schema 
createMap(int keyId, Schema keySchema, - int valueId, Schema valueSchema) { + static Schema createMap(int keyId, Schema keySchema, int valueId, Schema valueSchema) { String keyValueName = "k" + keyId + "_v" + valueId; Schema.Field keyField = new Schema.Field("key", keySchema, null, (Object) null); keyField.addProp(FIELD_ID_PROP, keyId); - Schema.Field valueField = new Schema.Field("value", valueSchema, null, - isOptionSchema(valueSchema) ? JsonProperties.NULL_VALUE : null); + Schema.Field valueField = + new Schema.Field( + "value", + valueSchema, + null, + isOptionSchema(valueSchema) ? JsonProperties.NULL_VALUE : null); valueField.addProp(FIELD_ID_PROP, valueId); - return LogicalMap.get().addToSchema(Schema.createArray(Schema.createRecord( - keyValueName, null, null, false, ImmutableList.of(keyField, valueField)))); - } - - static Schema createProjectionMap(String recordName, - int keyId, String keyName, Schema keySchema, - int valueId, String valueName, Schema valueSchema) { + return LogicalMap.get() + .addToSchema( + Schema.createArray( + Schema.createRecord( + keyValueName, null, null, false, ImmutableList.of(keyField, valueField)))); + } + + static Schema createProjectionMap( + String recordName, + int keyId, + String keyName, + Schema keySchema, + int valueId, + String valueName, + Schema valueSchema) { String keyValueName = "k" + keyId + "_v" + valueId; Schema.Field keyField = new Schema.Field("key", keySchema, null, (Object) null); @@ -216,15 +231,20 @@ static Schema createProjectionMap(String recordName, } keyField.addProp(FIELD_ID_PROP, keyId); - Schema.Field valueField = new Schema.Field("value", valueSchema, null, - isOptionSchema(valueSchema) ? JsonProperties.NULL_VALUE : null); + Schema.Field valueField = + new Schema.Field( + "value", + valueSchema, + null, + isOptionSchema(valueSchema) ? 
JsonProperties.NULL_VALUE : null); valueField.addProp(FIELD_ID_PROP, valueId); if (!"value".equals(valueName)) { valueField.addAlias(valueName); } - Schema keyValueRecord = Schema.createRecord( - keyValueName, null, null, false, ImmutableList.of(keyField, valueField)); + Schema keyValueRecord = + Schema.createRecord( + keyValueName, null, null, false, ImmutableList.of(keyField, valueField)); if (!keyValueName.equals(recordName)) { keyValueRecord.addAlias(recordName); } @@ -238,7 +258,8 @@ private static Integer getId(Schema schema, String propertyName) { return id; } - private static Integer getId(Schema schema, String propertyName, NameMapping nameMapping, List names) { + private static Integer getId( + Schema schema, String propertyName, NameMapping nameMapping, List names) { if (schema.getType() == UNION) { return getId(fromOption(schema), propertyName, nameMapping, names); } @@ -264,42 +285,45 @@ static boolean hasProperty(Schema schema, String propertyName) { } public static int getKeyId(Schema schema) { - Preconditions.checkArgument(schema.getType() == MAP, - "Cannot get map key id for non-map schema: %s", schema); + Preconditions.checkArgument( + schema.getType() == MAP, "Cannot get map key id for non-map schema: %s", schema); return getId(schema, KEY_ID_PROP); } - static Integer getKeyId(Schema schema, NameMapping nameMapping, Iterable parentFieldNames) { - Preconditions.checkArgument(schema.getType() == MAP, - "Cannot get map key id for non-map schema: %s", schema); + static Integer getKeyId( + Schema schema, NameMapping nameMapping, Iterable parentFieldNames) { + Preconditions.checkArgument( + schema.getType() == MAP, "Cannot get map key id for non-map schema: %s", schema); List names = Lists.newArrayList(parentFieldNames); names.add("key"); return getId(schema, KEY_ID_PROP, nameMapping, names); } public static int getValueId(Schema schema) { - Preconditions.checkArgument(schema.getType() == MAP, - "Cannot get map value id for non-map schema: %s", schema); + Preconditions.checkArgument( + schema.getType() == MAP, "Cannot get map value id for non-map schema: %s", schema); return getId(schema, VALUE_ID_PROP); } - static Integer getValueId(Schema schema, NameMapping nameMapping, Iterable parentFieldNames) { - Preconditions.checkArgument(schema.getType() == MAP, - "Cannot get map value id for non-map schema: %s", schema); + static Integer getValueId( + Schema schema, NameMapping nameMapping, Iterable parentFieldNames) { + Preconditions.checkArgument( + schema.getType() == MAP, "Cannot get map value id for non-map schema: %s", schema); List names = Lists.newArrayList(parentFieldNames); names.add("value"); return getId(schema, VALUE_ID_PROP, nameMapping, names); } public static int getElementId(Schema schema) { - Preconditions.checkArgument(schema.getType() == ARRAY, - "Cannot get array element id for non-array schema: %s", schema); + Preconditions.checkArgument( + schema.getType() == ARRAY, "Cannot get array element id for non-array schema: %s", schema); return getId(schema, ELEMENT_ID_PROP); } - static Integer getElementId(Schema schema, NameMapping nameMapping, Iterable parentFieldNames) { - Preconditions.checkArgument(schema.getType() == ARRAY, - "Cannot get array element id for non-array schema: %s", schema); + static Integer getElementId( + Schema schema, NameMapping nameMapping, Iterable parentFieldNames) { + Preconditions.checkArgument( + schema.getType() == ARRAY, "Cannot get array element id for non-array schema: %s", schema); List names = 
Lists.newArrayList(parentFieldNames); names.add("element"); return getId(schema, ELEMENT_ID_PROP, nameMapping, names); @@ -311,7 +335,8 @@ public static int getFieldId(Schema.Field field) { return id; } - static Integer getFieldId(Schema.Field field, NameMapping nameMapping, Iterable parentFieldNames) { + static Integer getFieldId( + Schema.Field field, NameMapping nameMapping, Iterable parentFieldNames) { Object id = field.getObjectProp(FIELD_ID_PROP); if (id != null) { return toInt(id); @@ -350,8 +375,13 @@ static Schema copyRecord(Schema record, List newFields, String new // original schema. copy.addAlias(record.getName(), record.getNamespace() == null ? "" : record.getNamespace()); } else { - copy = Schema.createRecord(record.getName(), - record.getDoc(), record.getNamespace(), record.isError(), newFields); + copy = + Schema.createRecord( + record.getName(), + record.getDoc(), + record.getNamespace(), + record.isError(), + newFields); } for (Map.Entry prop : record.getObjectProps().entrySet()) { @@ -362,8 +392,8 @@ static Schema copyRecord(Schema record, List newFields, String new } static Schema.Field copyField(Schema.Field field, Schema newSchema, String newName) { - Schema.Field copy = new Schema.Field(newName, - newSchema, field.doc(), field.defaultVal(), field.order()); + Schema.Field copy = + new Schema.Field(newName, newSchema, field.doc(), field.defaultVal(), field.order()); for (Map.Entry prop : field.getObjectProps().entrySet()) { copy.addProp(prop.getKey(), prop.getValue()); @@ -377,8 +407,8 @@ static Schema.Field copyField(Schema.Field field, Schema newSchema, String newNa } static Schema replaceElement(Schema array, Schema elementSchema) { - Preconditions.checkArgument(array.getType() == ARRAY, - "Cannot invoke replaceElement on non array schema: %s", array); + Preconditions.checkArgument( + array.getType() == ARRAY, "Cannot invoke replaceElement on non array schema: %s", array); Schema copy = Schema.createArray(elementSchema); for (Map.Entry prop : array.getObjectProps().entrySet()) { copy.addProp(prop.getKey(), prop.getValue()); @@ -387,8 +417,8 @@ static Schema replaceElement(Schema array, Schema elementSchema) { } static Schema replaceValue(Schema map, Schema valueSchema) { - Preconditions.checkArgument(map.getType() == MAP, - "Cannot invoke replaceValue on non map schema: %s", map); + Preconditions.checkArgument( + map.getType() == MAP, "Cannot invoke replaceValue on non map schema: %s", map); Schema copy = Schema.createMap(valueSchema); for (Map.Entry prop : map.getObjectProps().entrySet()) { copy.addProp(prop.getKey(), prop.getValue()); diff --git a/core/src/main/java/org/apache/iceberg/avro/AvroSchemaVisitor.java b/core/src/main/java/org/apache/iceberg/avro/AvroSchemaVisitor.java index 281d45b51c2a..f22a3592ad3d 100644 --- a/core/src/main/java/org/apache/iceberg/avro/AvroSchemaVisitor.java +++ b/core/src/main/java/org/apache/iceberg/avro/AvroSchemaVisitor.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.avro; import java.util.Deque; @@ -31,8 +30,8 @@ public static T visit(Schema schema, AvroSchemaVisitor visitor) { case RECORD: // check to make sure this hasn't been visited before String name = schema.getFullName(); - Preconditions.checkState(!visitor.recordLevels.contains(name), - "Cannot process recursive Avro record %s", name); + Preconditions.checkState( + !visitor.recordLevels.contains(name), "Cannot process recursive Avro record %s", name); visitor.recordLevels.push(name); diff --git a/core/src/main/java/org/apache/iceberg/avro/AvroSchemaWithTypeVisitor.java b/core/src/main/java/org/apache/iceberg/avro/AvroSchemaWithTypeVisitor.java index e6f1c6eb5097..85a8718abfce 100644 --- a/core/src/main/java/org/apache/iceberg/avro/AvroSchemaWithTypeVisitor.java +++ b/core/src/main/java/org/apache/iceberg/avro/AvroSchemaWithTypeVisitor.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.avro; import java.util.Deque; @@ -28,7 +27,8 @@ import org.apache.iceberg.types.Types; public abstract class AvroSchemaWithTypeVisitor { - public static T visit(org.apache.iceberg.Schema iSchema, Schema schema, AvroSchemaWithTypeVisitor visitor) { + public static T visit( + org.apache.iceberg.Schema iSchema, Schema schema, AvroSchemaWithTypeVisitor visitor) { return visit(iSchema.asStruct(), schema, visitor); } @@ -45,7 +45,9 @@ public static T visit(Type iType, Schema schema, AvroSchemaWithTypeVisitor T visit(Type iType, Schema schema, AvroSchemaWithTypeVisitor T visitRecord(Types.StructType struct, Schema record, AvroSchemaWithTypeVisitor visitor) { + private static T visitRecord( + Types.StructType struct, Schema record, AvroSchemaWithTypeVisitor visitor) { // check to make sure this hasn't been visited before String name = record.getFullName(); - Preconditions.checkState(!visitor.recordLevels.contains(name), - "Cannot process recursive Avro record %s", name); + Preconditions.checkState( + !visitor.recordLevels.contains(name), "Cannot process recursive Avro record %s", name); visitor.recordLevels.push(name); @@ -93,16 +96,21 @@ private static T visitArray(Type type, Schema array, AvroSchemaWithTypeVisit if (array.getLogicalType() instanceof LogicalMap || (type != null && type.isMapType())) { Preconditions.checkState( AvroSchemaUtil.isKeyValueSchema(array.getElementType()), - "Cannot visit invalid logical map type: %s", array); + "Cannot visit invalid logical map type: %s", + array); Types.MapType map = type != null ? type.asMapType() : null; List keyValueFields = array.getElementType().getFields(); - return visitor.map(map, array, + return visitor.map( + map, + array, visit(map != null ? map.keyType() : null, keyValueFields.get(0).schema(), visitor), visit(map != null ? map.valueType() : null, keyValueFields.get(1).schema(), visitor)); } else { Types.ListType list = type != null ? type.asListType() : null; - return visitor.array(list, array, + return visitor.array( + list, + array, visit(list != null ? 
list.elementType() : null, array.getElementType(), visitor)); } } diff --git a/core/src/main/java/org/apache/iceberg/avro/AvroWithPartnerByStructureVisitor.java b/core/src/main/java/org/apache/iceberg/avro/AvroWithPartnerByStructureVisitor.java index 2b28f54aaaaa..e06c774c338e 100644 --- a/core/src/main/java/org/apache/iceberg/avro/AvroWithPartnerByStructureVisitor.java +++ b/core/src/main/java/org/apache/iceberg/avro/AvroWithPartnerByStructureVisitor.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.avro; import java.util.Deque; @@ -27,15 +26,16 @@ import org.apache.iceberg.util.Pair; /** - * A abstract avro schema visitor with partner type. The visitor rely on the structure matching exactly and are - * guaranteed that because both schemas are derived from the same Iceberg schema. + * A abstract avro schema visitor with partner type. The visitor rely on the structure matching + * exactly and are guaranteed that because both schemas are derived from the same Iceberg schema. * * @param
<P>
Partner type. * @param Return T. */ public abstract class AvroWithPartnerByStructureVisitor { - public static T visit(P partner, Schema schema, AvroWithPartnerByStructureVisitor visitor) { + public static T visit( + P partner, Schema schema, AvroWithPartnerByStructureVisitor visitor) { switch (schema.getType()) { case RECORD: return visitRecord(partner, schema, visitor); @@ -49,9 +49,9 @@ public static T visit(P partner, Schema schema, AvroWithPartnerByStructur case MAP: P keyType = visitor.mapKeyType(partner); Preconditions.checkArgument( - visitor.isStringType(keyType), - "Invalid map: %s is not a string", keyType); - return visitor.map(partner, schema, visit(visitor.mapValueType(partner), schema.getValueType(), visitor)); + visitor.isStringType(keyType), "Invalid map: %s is not a string", keyType); + return visitor.map( + partner, schema, visit(visitor.mapValueType(partner), schema.getValueType(), visitor)); default: return visitor.primitive(partner, schema); @@ -60,11 +60,12 @@ public static T visit(P partner, Schema schema, AvroWithPartnerByStructur // ---------------------------------- Static helpers --------------------------------------------- - private static T visitRecord(P struct, Schema record, AvroWithPartnerByStructureVisitor visitor) { + private static T visitRecord( + P struct, Schema record, AvroWithPartnerByStructureVisitor visitor) { // check to make sure this hasn't been visited before String name = record.getFullName(); - Preconditions.checkState(!visitor.recordLevels.contains(name), - "Cannot process recursive Avro record %s", name); + Preconditions.checkState( + !visitor.recordLevels.contains(name), "Cannot process recursive Avro record %s", name); List fields = record.getFields(); visitor.recordLevels.push(name); @@ -75,8 +76,11 @@ private static T visitRecord(P struct, Schema record, AvroWithPartnerBySt Pair nameAndType = visitor.fieldNameAndType(struct, i); String fieldName = nameAndType.first(); Schema.Field field = fields.get(i); - Preconditions.checkArgument(AvroSchemaUtil.makeCompatibleName(fieldName).equals(field.name()), - "Structs do not match: field %s != %s", fieldName, field.name()); + Preconditions.checkArgument( + AvroSchemaUtil.makeCompatibleName(fieldName).equals(field.name()), + "Structs do not match: field %s != %s", + fieldName, + field.name()); results.add(visit(nameAndType.second(), field.schema(), visitor)); names.add(fieldName); } @@ -86,10 +90,11 @@ private static T visitRecord(P struct, Schema record, AvroWithPartnerBySt return visitor.record(struct, record, names, results); } - private static T visitUnion(P type, Schema union, AvroWithPartnerByStructureVisitor visitor) { + private static T visitUnion( + P type, Schema union, AvroWithPartnerByStructureVisitor visitor) { List types = union.getTypes(); - Preconditions.checkArgument(AvroSchemaUtil.isOptionSchema(union), - "Cannot visit non-option union: %s", union); + Preconditions.checkArgument( + AvroSchemaUtil.isOptionSchema(union), "Cannot visit non-option union: %s", union); List options = Lists.newArrayListWithExpectedSize(types.size()); for (Schema branch : types) { if (branch.getType() == Schema.Type.NULL) { @@ -101,27 +106,31 @@ private static T visitUnion(P type, Schema union, AvroWithPartnerByStruct return visitor.union(type, union, options); } - private static T visitArray(P type, Schema array, AvroWithPartnerByStructureVisitor visitor) { + private static T visitArray( + P type, Schema array, AvroWithPartnerByStructureVisitor visitor) { if (array.getLogicalType() instanceof 
LogicalMap || visitor.isMapType(type)) { Preconditions.checkState( AvroSchemaUtil.isKeyValueSchema(array.getElementType()), - "Cannot visit invalid logical map type: %s", array); + "Cannot visit invalid logical map type: %s", + array); List keyValueFields = array.getElementType().getFields(); - return visitor.map(type, array, + return visitor.map( + type, + array, visit(visitor.mapKeyType(type), keyValueFields.get(0).schema(), visitor), visit(visitor.mapValueType(type), keyValueFields.get(1).schema(), visitor)); } else { - return visitor.array(type, array, visit(visitor.arrayElementType(type), array.getElementType(), visitor)); + return visitor.array( + type, array, visit(visitor.arrayElementType(type), array.getElementType(), visitor)); } } - /** - * Just for checking state. - */ + /** Just for checking state. */ private Deque recordLevels = Lists.newLinkedList(); - // ---------------------------------- Partner type methods --------------------------------------------- + // ---------------------------------- Partner type methods + // --------------------------------------------- protected abstract boolean isMapType(P type); @@ -130,6 +139,7 @@ private static T visitArray(P type, Schema array, AvroWithPartnerByStruct protected abstract P arrayElementType(P arrayType); protected abstract P mapKeyType(P mapType); + protected abstract P mapValueType(P mapType); protected abstract Pair fieldNameAndType(P structType, int pos); diff --git a/core/src/main/java/org/apache/iceberg/avro/BuildAvroProjection.java b/core/src/main/java/org/apache/iceberg/avro/BuildAvroProjection.java index f708f556fff9..3f1a71a9e6c2 100644 --- a/core/src/main/java/org/apache/iceberg/avro/BuildAvroProjection.java +++ b/core/src/main/java/org/apache/iceberg/avro/BuildAvroProjection.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.avro; import java.util.List; @@ -34,11 +33,11 @@ /** * Renames and aliases fields in an Avro schema based on the current table schema. - *

<p> - * This class creates a read schema based on an Avro file's schema that will correctly translate + * + * <p>This class creates a read schema based on an Avro file's schema that will correctly translate * from the file's field names to the current table schema. - * <p> - * This will also rename records in the file's Avro schema to support custom read classes. + * + * <p>
This will also rename records in the file's Avro schema to support custom read classes. */ class BuildAvroProjection extends AvroCustomOrderSchemaVisitor { private final Map renames; @@ -59,7 +58,8 @@ class BuildAvroProjection extends AvroCustomOrderSchemaVisitor names, Iterable schemaIterable) { Preconditions.checkArgument( current.isNestedType() && current.asNestedType().isStructType(), - "Cannot project non-struct: %s", current); + "Cannot project non-struct: %s", + current); Types.StructType struct = current.asNestedType().asStructType(); @@ -75,8 +75,8 @@ public Schema record(Schema record, List names, Iterable s if (updatedField != null) { updateMap.put(updatedField.name(), updatedField); - if (!updatedField.schema().equals(field.schema()) || - !updatedField.name().equals(field.name())) { + if (!updatedField.schema().equals(field.schema()) + || !updatedField.name().equals(field.name())) { hasChange = true; } } else { @@ -103,12 +103,16 @@ public Schema record(Schema record, List names, Iterable s } else { Preconditions.checkArgument( field.isOptional() || MetadataColumns.metadataFieldIds().contains(field.fieldId()), - "Missing required field: %s", field.name()); + "Missing required field: %s", + field.name()); // Create a field that will be defaulted to null. We assign a unique suffix to the field // to make sure that even if records in the file have the field it is not projected. - Schema.Field newField = new Schema.Field( - field.name() + "_r" + field.fieldId(), - AvroSchemaUtil.toOption(AvroSchemaUtil.convert(field.type())), null, JsonProperties.NULL_VALUE); + Schema.Field newField = + new Schema.Field( + field.name() + "_r" + field.fieldId(), + AvroSchemaUtil.toOption(AvroSchemaUtil.convert(field.type())), + null, + JsonProperties.NULL_VALUE); newField.addProp(AvroSchemaUtil.FIELD_ID_PROP, field.fieldId()); updatedFields.add(newField); hasChange = true; @@ -141,7 +145,8 @@ public Schema.Field field(Schema.Field field, Supplier fieldResult) { if (!Objects.equals(schema, field.schema()) || !expectedName.equals(field.name())) { // add an alias for the field - return AvroSchemaUtil.copyField(field, schema, AvroSchemaUtil.makeCompatibleName(expectedName)); + return AvroSchemaUtil.copyField( + field, schema, AvroSchemaUtil.makeCompatibleName(expectedName)); } else { // always copy because fields can't be reused return AvroSchemaUtil.copyField(field, field.schema(), field.name()); @@ -154,8 +159,10 @@ public Schema.Field field(Schema.Field field, Supplier fieldResult) { @Override public Schema union(Schema union, Iterable options) { - Preconditions.checkState(AvroSchemaUtil.isOptionSchema(union), - "Invalid schema: non-option unions are not supported: %s", union); + Preconditions.checkState( + AvroSchemaUtil.isOptionSchema(union), + "Invalid schema: non-option unions are not supported: %s", + union); Schema nonNullOriginal = AvroSchemaUtil.fromOption(union); Schema nonNullResult = AvroSchemaUtil.fromOptions(Lists.newArrayList(options)); @@ -168,11 +175,12 @@ public Schema union(Schema union, Iterable options) { @Override public Schema array(Schema array, Supplier element) { - if (array.getLogicalType() instanceof LogicalMap || - (current.isMapType() && AvroSchemaUtil.isKeyValueSchema(array.getElementType()))) { + if (array.getLogicalType() instanceof LogicalMap + || (current.isMapType() && AvroSchemaUtil.isKeyValueSchema(array.getElementType()))) { Preconditions.checkArgument(current.isMapType(), "Incompatible projected type: %s", current); Types.MapType asMapType = 
current.asNestedType().asMapType(); - this.current = Types.StructType.of(asMapType.fields()); // create a struct to correspond to element + this.current = + Types.StructType.of(asMapType.fields()); // create a struct to correspond to element try { Schema keyValueSchema = array.getElementType(); Schema.Field keyField = keyValueSchema.getFields().get(0); @@ -181,13 +189,23 @@ public Schema array(Schema array, Supplier element) { // element was changed, create a new array if (!Objects.equals(valueProjection.schema(), valueField.schema())) { - return AvroSchemaUtil.createProjectionMap(keyValueSchema.getFullName(), - AvroSchemaUtil.getFieldId(keyField), keyField.name(), keyField.schema(), - AvroSchemaUtil.getFieldId(valueField), valueField.name(), valueProjection.schema()); + return AvroSchemaUtil.createProjectionMap( + keyValueSchema.getFullName(), + AvroSchemaUtil.getFieldId(keyField), + keyField.name(), + keyField.schema(), + AvroSchemaUtil.getFieldId(valueField), + valueField.name(), + valueProjection.schema()); } else if (!(array.getLogicalType() instanceof LogicalMap)) { - return AvroSchemaUtil.createProjectionMap(keyValueSchema.getFullName(), - AvroSchemaUtil.getFieldId(keyField), keyField.name(), keyField.schema(), - AvroSchemaUtil.getFieldId(valueField), valueField.name(), valueField.schema()); + return AvroSchemaUtil.createProjectionMap( + keyValueSchema.getFullName(), + AvroSchemaUtil.getFieldId(keyField), + keyField.name(), + keyField.schema(), + AvroSchemaUtil.getFieldId(valueField), + valueField.name(), + valueField.schema()); } return array; @@ -197,8 +215,7 @@ public Schema array(Schema array, Supplier element) { } } else { - Preconditions.checkArgument(current.isListType(), - "Incompatible projected type: %s", current); + Preconditions.checkArgument(current.isListType(), "Incompatible projected type: %s", current); Types.ListType list = current.asNestedType().asListType(); this.current = list.elementType(); try { @@ -219,11 +236,15 @@ public Schema array(Schema array, Supplier element) { @Override public Schema map(Schema map, Supplier value) { - Preconditions.checkArgument(current.isNestedType() && current.asNestedType().isMapType(), - "Incompatible projected type: %s", current); + Preconditions.checkArgument( + current.isNestedType() && current.asNestedType().isMapType(), + "Incompatible projected type: %s", + current); Types.MapType asMapType = current.asNestedType().asMapType(); - Preconditions.checkArgument(asMapType.keyType() == Types.StringType.get(), - "Incompatible projected type: key type %s is not string", asMapType.keyType()); + Preconditions.checkArgument( + asMapType.keyType() == Types.StringType.get(), + "Incompatible projected type: key type %s is not string", + asMapType.keyType()); this.current = asMapType.valueType(); try { Schema valueSchema = value.get(); @@ -260,5 +281,4 @@ public Schema primitive(Schema primitive) { return primitive; } } - } diff --git a/core/src/main/java/org/apache/iceberg/avro/GenericAvroReader.java b/core/src/main/java/org/apache/iceberg/avro/GenericAvroReader.java index 5957cae30488..c0a7895acbba 100644 --- a/core/src/main/java/org/apache/iceberg/avro/GenericAvroReader.java +++ b/core/src/main/java/org/apache/iceberg/avro/GenericAvroReader.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.avro; import java.io.IOException; @@ -80,10 +79,8 @@ private ReadBuilder(ClassLoader loader) { @SuppressWarnings("unchecked") public ValueReader record(Schema record, List names, List> fields) { try { - Class recordClass = DynClasses.builder() - .loader(loader) - .impl(record.getFullName()) - .buildChecked(); + Class recordClass = + DynClasses.builder().loader(loader).impl(record.getFullName()).buildChecked(); if (IndexedRecord.class.isAssignableFrom(recordClass)) { return ValueReaders.record(fields, (Class) recordClass, record); } diff --git a/core/src/main/java/org/apache/iceberg/avro/GenericAvroWriter.java b/core/src/main/java/org/apache/iceberg/avro/GenericAvroWriter.java index 9ee5de32fe7d..d6a4574dab66 100644 --- a/core/src/main/java/org/apache/iceberg/avro/GenericAvroWriter.java +++ b/core/src/main/java/org/apache/iceberg/avro/GenericAvroWriter.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.avro; import java.io.IOException; @@ -53,8 +52,7 @@ public Stream metrics() { } private static class WriteBuilder extends AvroSchemaVisitor> { - private WriteBuilder() { - } + private WriteBuilder() {} @Override public ValueWriter record(Schema record, List names, List> fields) { @@ -63,10 +61,12 @@ public ValueWriter record(Schema record, List names, List union(Schema union, List> options) { - Preconditions.checkArgument(options.contains(ValueWriters.nulls()), - "Cannot create writer for non-option union: %s", union); - Preconditions.checkArgument(options.size() == 2, - "Cannot create writer for non-option union: %s", union); + Preconditions.checkArgument( + options.contains(ValueWriters.nulls()), + "Cannot create writer for non-option union: %s", + union); + Preconditions.checkArgument( + options.size() == 2, "Cannot create writer for non-option union: %s", union); if (union.getTypes().get(0).getType() == Schema.Type.NULL) { return ValueWriters.option(0, options.get(1)); } else { diff --git a/core/src/main/java/org/apache/iceberg/avro/HasIds.java b/core/src/main/java/org/apache/iceberg/avro/HasIds.java index 85ce3809a974..52ecfd01eaac 100644 --- a/core/src/main/java/org/apache/iceberg/avro/HasIds.java +++ b/core/src/main/java/org/apache/iceberg/avro/HasIds.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.avro; import java.util.List; @@ -25,8 +24,8 @@ import org.apache.iceberg.relocated.com.google.common.collect.Iterables; /** - * Lazily evaluates the schema to see if any field ids are set. - * Returns true when a first field is found which has field id set + * Lazily evaluates the schema to see if any field ids are set. 
Returns true when a first field is + * found which has field id set */ class HasIds extends AvroCustomOrderSchemaVisitor { @Override @@ -42,9 +41,9 @@ public Boolean field(Schema.Field field, Supplier fieldResult) { @Override public Boolean map(Schema map, Supplier value) { - return AvroSchemaUtil.hasProperty(map, AvroSchemaUtil.KEY_ID_PROP) || - AvroSchemaUtil.hasProperty(map, AvroSchemaUtil.VALUE_ID_PROP) || - value.get(); + return AvroSchemaUtil.hasProperty(map, AvroSchemaUtil.KEY_ID_PROP) + || AvroSchemaUtil.hasProperty(map, AvroSchemaUtil.VALUE_ID_PROP) + || value.get(); } @Override diff --git a/core/src/main/java/org/apache/iceberg/avro/LogicalMap.java b/core/src/main/java/org/apache/iceberg/avro/LogicalMap.java index 04f7cd3015d6..7232a934a77b 100644 --- a/core/src/main/java/org/apache/iceberg/avro/LogicalMap.java +++ b/core/src/main/java/org/apache/iceberg/avro/LogicalMap.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.avro; import org.apache.avro.LogicalType; @@ -38,9 +37,13 @@ private LogicalMap() { @Override public void validate(Schema schema) { super.validate(schema); - Preconditions.checkArgument(schema.getType() == Schema.Type.ARRAY, - "Invalid type for map, must be an array: %s", schema); - Preconditions.checkArgument(AvroSchemaUtil.isKeyValueSchema(schema.getElementType()), - "Invalid key-value record: %s", schema.getElementType()); + Preconditions.checkArgument( + schema.getType() == Schema.Type.ARRAY, + "Invalid type for map, must be an array: %s", + schema); + Preconditions.checkArgument( + AvroSchemaUtil.isKeyValueSchema(schema.getElementType()), + "Invalid key-value record: %s", + schema.getElementType()); } } diff --git a/core/src/main/java/org/apache/iceberg/avro/MetricsAwareDatumWriter.java b/core/src/main/java/org/apache/iceberg/avro/MetricsAwareDatumWriter.java index 08aa194993ba..887a1a816309 100644 --- a/core/src/main/java/org/apache/iceberg/avro/MetricsAwareDatumWriter.java +++ b/core/src/main/java/org/apache/iceberg/avro/MetricsAwareDatumWriter.java @@ -16,20 +16,15 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.avro; import java.util.stream.Stream; import org.apache.avro.io.DatumWriter; import org.apache.iceberg.FieldMetrics; -/** - * Wrapper writer around {@link DatumWriter} with metrics support. - */ +/** Wrapper writer around {@link DatumWriter} with metrics support. */ public interface MetricsAwareDatumWriter extends DatumWriter { - /** - * Returns a stream of {@link FieldMetrics} that this MetricsAwareDatumWriter keeps track of. - */ + /** Returns a stream of {@link FieldMetrics} that this MetricsAwareDatumWriter keeps track of. */ Stream metrics(); } diff --git a/core/src/main/java/org/apache/iceberg/avro/MissingIds.java b/core/src/main/java/org/apache/iceberg/avro/MissingIds.java index 6b7a2d2161ed..e47d012a36ee 100644 --- a/core/src/main/java/org/apache/iceberg/avro/MissingIds.java +++ b/core/src/main/java/org/apache/iceberg/avro/MissingIds.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.avro; import java.util.List; @@ -26,11 +25,11 @@ /** * Returns true once the first node is found with ID property missing. Reverse of {@link HasIds} - *

<p> - * Note: To use {@link AvroSchemaUtil#toIceberg(Schema)} on an avro schema, the avro schema need to be either - * have IDs on every node or not have IDs at all. Invoke {@link AvroSchemaUtil#hasIds(Schema)} only proves - * that the schema has at least one ID, and not sufficient condition for invoking - * {@link AvroSchemaUtil#toIceberg(Schema)} on the schema. + * + * <p>
Note: To use {@link AvroSchemaUtil#toIceberg(Schema)} on an avro schema, the avro schema need + * to be either have IDs on every node or not have IDs at all. Invoke {@link + * AvroSchemaUtil#hasIds(Schema)} only proves that the schema has at least one ID, and not + * sufficient condition for invoking {@link AvroSchemaUtil#toIceberg(Schema)} on the schema. */ class MissingIds extends AvroCustomOrderSchemaVisitor { @Override @@ -47,9 +46,9 @@ public Boolean field(Schema.Field field, Supplier fieldResult) { @Override public Boolean map(Schema map, Supplier value) { // either this map node is missing (key/value) ID, or the subtree is missing ID somewhere - return !AvroSchemaUtil.hasProperty(map, AvroSchemaUtil.KEY_ID_PROP) || - !AvroSchemaUtil.hasProperty(map, AvroSchemaUtil.VALUE_ID_PROP) || - value.get(); + return !AvroSchemaUtil.hasProperty(map, AvroSchemaUtil.KEY_ID_PROP) + || !AvroSchemaUtil.hasProperty(map, AvroSchemaUtil.VALUE_ID_PROP) + || value.get(); } @Override @@ -65,7 +64,8 @@ public Boolean union(Schema union, Iterable options) { @Override public Boolean primitive(Schema primitive) { - // primitive node cannot be missing ID as Iceberg do not assign primitive node IDs in the first place + // primitive node cannot be missing ID as Iceberg do not assign primitive node IDs in the first + // place return false; } } diff --git a/core/src/main/java/org/apache/iceberg/avro/ProjectionDatumReader.java b/core/src/main/java/org/apache/iceberg/avro/ProjectionDatumReader.java index 1ee77cb9e966..3b04fe30db65 100644 --- a/core/src/main/java/org/apache/iceberg/avro/ProjectionDatumReader.java +++ b/core/src/main/java/org/apache/iceberg/avro/ProjectionDatumReader.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.avro; import java.io.IOException; @@ -40,10 +39,11 @@ public class ProjectionDatumReader implements DatumReader, SupportsRowPosi private Schema fileSchema = null; private DatumReader wrapped = null; - public ProjectionDatumReader(Function> getReader, - org.apache.iceberg.Schema expectedSchema, - Map renames, - NameMapping nameMapping) { + public ProjectionDatumReader( + Function> getReader, + org.apache.iceberg.Schema expectedSchema, + Map renames, + NameMapping nameMapping) { this.getReader = getReader; this.expectedSchema = expectedSchema; this.renames = renames; diff --git a/core/src/main/java/org/apache/iceberg/avro/PruneColumns.java b/core/src/main/java/org/apache/iceberg/avro/PruneColumns.java index 99855ea050da..2de2c0fe029d 100644 --- a/core/src/main/java/org/apache/iceberg/avro/PruneColumns.java +++ b/core/src/main/java/org/apache/iceberg/avro/PruneColumns.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.avro; import java.util.Collections; @@ -119,8 +118,10 @@ public Schema record(Schema record, List names, List fields) { @Override public Schema union(Schema union, List options) { - Preconditions.checkState(AvroSchemaUtil.isOptionSchema(union), - "Invalid schema: non-option unions are not supported: %s", union); + Preconditions.checkState( + AvroSchemaUtil.isOptionSchema(union), + "Invalid schema: non-option unions are not supported: %s", + union); // only unions with null are allowed, and a null schema results in null Schema pruned = null; @@ -145,8 +146,10 @@ public Schema union(Schema union, List options) { public Schema array(Schema array, Schema element) { if (array.getLogicalType() instanceof LogicalMap) { Schema keyValue = array.getElementType(); - Integer keyId = AvroSchemaUtil.getFieldId(keyValue.getField("key"), nameMapping, fieldNames()); - Integer valueId = AvroSchemaUtil.getFieldId(keyValue.getField("value"), nameMapping, fieldNames()); + Integer keyId = + AvroSchemaUtil.getFieldId(keyValue.getField("key"), nameMapping, fieldNames()); + Integer valueId = + AvroSchemaUtil.getFieldId(keyValue.getField("value"), nameMapping, fieldNames()); if (keyId == null || valueId == null) { if (keyId != null || valueId != null) { LOG.warn("Map schema {} should have both key and value ids set or both unset", array); @@ -162,15 +165,17 @@ public Schema array(Schema array, Schema element) { Schema valueProjection = element.getField("value").schema(); // it is possible that key is not selected, and // key schemas can be different if new field ids were assigned to them - if (keyProjectionField != null && - !Objects.equals(keyValue.getField("key").schema(), keyProjectionField.schema())) { + if (keyProjectionField != null + && !Objects.equals(keyValue.getField("key").schema(), keyProjectionField.schema())) { Preconditions.checkState( - SchemaNormalization.parsingFingerprint64(keyValue.getField("key").schema()) == - SchemaNormalization.parsingFingerprint64(keyProjectionField.schema()), + SchemaNormalization.parsingFingerprint64(keyValue.getField("key").schema()) + == SchemaNormalization.parsingFingerprint64(keyProjectionField.schema()), "Map keys should not be projected"); - return AvroSchemaUtil.createMap(keyId, keyProjectionField.schema(), valueId, valueProjection); + return AvroSchemaUtil.createMap( + keyId, keyProjectionField.schema(), valueId, valueProjection); } else if (!Objects.equals(keyValue.getField("value").schema(), valueProjection)) { - return AvroSchemaUtil.createMap(keyId, keyValue.getField("key").schema(), valueId, valueProjection); + return AvroSchemaUtil.createMap( + keyId, keyValue.getField("key").schema(), valueId, valueProjection); } else { return complexMapWithIds(array, keyId, valueId); } @@ -234,8 +239,8 @@ private Schema arrayWithId(Schema array, Integer elementId) { private Schema complexMapWithIds(Schema map, Integer keyId, Integer valueId) { Schema keyValue = map.getElementType(); - if (!AvroSchemaUtil.hasFieldId(keyValue.getField("key")) || - !AvroSchemaUtil.hasFieldId(keyValue.getField("value"))) { + if (!AvroSchemaUtil.hasFieldId(keyValue.getField("key")) + || !AvroSchemaUtil.hasFieldId(keyValue.getField("value"))) { return AvroSchemaUtil.createMap( keyId, keyValue.getField("key").schema(), valueId, keyValue.getField("value").schema()); @@ -244,8 +249,8 @@ private Schema complexMapWithIds(Schema map, Integer keyId, Integer valueId) { } private Schema mapWithIds(Schema map, Integer keyId, Integer valueId) { - if 
(!AvroSchemaUtil.hasProperty(map, AvroSchemaUtil.KEY_ID_PROP) || - !AvroSchemaUtil.hasProperty(map, AvroSchemaUtil.VALUE_ID_PROP)) { + if (!AvroSchemaUtil.hasProperty(map, AvroSchemaUtil.KEY_ID_PROP) + || !AvroSchemaUtil.hasProperty(map, AvroSchemaUtil.VALUE_ID_PROP)) { Schema result = Schema.createMap(map.getValueType()); result.addProp(AvroSchemaUtil.KEY_ID_PROP, keyId); result.addProp(AvroSchemaUtil.VALUE_ID_PROP, valueId); @@ -261,8 +266,9 @@ public Schema primitive(Schema primitive) { } private static Schema copyRecord(Schema record, List newFields) { - Schema copy = Schema.createRecord(record.getName(), - record.getDoc(), record.getNamespace(), record.isError(), newFields); + Schema copy = + Schema.createRecord( + record.getName(), record.getDoc(), record.getNamespace(), record.isError(), newFields); for (Map.Entry prop : record.getObjectProps().entrySet()) { copy.addProp(prop.getKey(), prop.getValue()); @@ -282,11 +288,20 @@ private boolean isRecord(Schema field) { private static Schema makeEmptyCopy(Schema field) { if (AvroSchemaUtil.isOptionSchema(field)) { Schema innerSchema = AvroSchemaUtil.fromOption(field); - Schema emptyRecord = Schema.createRecord(innerSchema.getName(), innerSchema.getDoc(), innerSchema.getNamespace(), - innerSchema.isError(), Collections.emptyList()); + Schema emptyRecord = + Schema.createRecord( + innerSchema.getName(), + innerSchema.getDoc(), + innerSchema.getNamespace(), + innerSchema.isError(), + Collections.emptyList()); return AvroSchemaUtil.toOption(emptyRecord); } else { - return Schema.createRecord(field.getName(), field.getDoc(), field.getNamespace(), field.isError(), + return Schema.createRecord( + field.getName(), + field.getDoc(), + field.getNamespace(), + field.isError(), Collections.emptyList()); } } @@ -299,10 +314,15 @@ private static Schema.Field copyField(Schema.Field field, Schema newSchema, Inte } else { newSchemaReordered = newSchema; } - // do not copy over default values as the file is expected to have values for fields already in the file schema - Schema.Field copy = new Schema.Field(field.name(), - newSchemaReordered, field.doc(), - AvroSchemaUtil.isOptionSchema(newSchemaReordered) ? JsonProperties.NULL_VALUE : null, field.order()); + // do not copy over default values as the file is expected to have values for fields already in + // the file schema + Schema.Field copy = + new Schema.Field( + field.name(), + newSchemaReordered, + field.doc(), + AvroSchemaUtil.isOptionSchema(newSchemaReordered) ? 
JsonProperties.NULL_VALUE : null, + field.order()); for (Map.Entry prop : field.getObjectProps().entrySet()) { copy.addProp(prop.getKey(), prop.getValue()); @@ -310,7 +330,8 @@ private static Schema.Field copyField(Schema.Field field, Schema newSchema, Inte if (AvroSchemaUtil.hasFieldId(field)) { int existingFieldId = AvroSchemaUtil.getFieldId(field); - Preconditions.checkArgument(existingFieldId == fieldId, + Preconditions.checkArgument( + existingFieldId == fieldId, "Existing field does match with that fetched from name mapping"); } else { // field may not have a fieldId if the fieldId was fetched from nameMapping @@ -321,6 +342,7 @@ private static Schema.Field copyField(Schema.Field field, Schema newSchema, Inte } private static boolean isOptionSchemaWithNonNullFirstOption(Schema schema) { - return AvroSchemaUtil.isOptionSchema(schema) && schema.getTypes().get(0).getType() != Schema.Type.NULL; + return AvroSchemaUtil.isOptionSchema(schema) + && schema.getTypes().get(0).getType() != Schema.Type.NULL; } } diff --git a/core/src/main/java/org/apache/iceberg/avro/RemoveIds.java b/core/src/main/java/org/apache/iceberg/avro/RemoveIds.java index 7fae9474455d..dccc8bf57e9d 100644 --- a/core/src/main/java/org/apache/iceberg/avro/RemoveIds.java +++ b/core/src/main/java/org/apache/iceberg/avro/RemoveIds.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.avro; import java.util.List; @@ -72,7 +71,8 @@ public Schema union(Schema union, List options) { } private static Schema.Field copyField(Schema.Field field, Schema newSchema) { - Schema.Field copy = new Schema.Field(field.name(), newSchema, field.doc(), field.defaultVal(), field.order()); + Schema.Field copy = + new Schema.Field(field.name(), newSchema, field.doc(), field.defaultVal(), field.order()); for (Map.Entry prop : field.getObjectProps().entrySet()) { String key = prop.getKey(); if (!Objects.equals(key, AvroSchemaUtil.FIELD_ID_PROP)) { @@ -83,7 +83,8 @@ private static Schema.Field copyField(Schema.Field field, Schema newSchema) { } static Schema removeIds(org.apache.iceberg.Schema schema) { - return AvroSchemaVisitor.visit(AvroSchemaUtil.convert(schema.asStruct(), "table"), new RemoveIds()); + return AvroSchemaVisitor.visit( + AvroSchemaUtil.convert(schema.asStruct(), "table"), new RemoveIds()); } public static Schema removeIds(Schema schema) { diff --git a/core/src/main/java/org/apache/iceberg/avro/SchemaToType.java b/core/src/main/java/org/apache/iceberg/avro/SchemaToType.java index 73c86226007f..174d63975195 100644 --- a/core/src/main/java/org/apache/iceberg/avro/SchemaToType.java +++ b/core/src/main/java/org/apache/iceberg/avro/SchemaToType.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.avro; import java.util.List; @@ -105,8 +104,8 @@ public Type record(Schema record, List names, List fieldTypes) { @Override public Type union(Schema union, List options) { - Preconditions.checkArgument(AvroSchemaUtil.isOptionSchema(union), - "Unsupported type: non-option union: %s", union); + Preconditions.checkArgument( + AvroSchemaUtil.isOptionSchema(union), "Unsupported type: non-option union: %s", union); // records, arrays, and maps will check nullability later if (options.get(0) == null) { return options.get(1); @@ -120,8 +119,10 @@ public Type array(Schema array, Type elementType) { if (array.getLogicalType() instanceof LogicalMap) { // map stored as an array Schema keyValueSchema = array.getElementType(); - Preconditions.checkArgument(AvroSchemaUtil.isKeyValueSchema(keyValueSchema), - "Invalid key-value pair schema: %s", keyValueSchema); + Preconditions.checkArgument( + AvroSchemaUtil.isKeyValueSchema(keyValueSchema), + "Invalid key-value pair schema: %s", + keyValueSchema); Types.StructType keyValueType = elementType.asStructType(); Types.NestedField keyField = keyValueType.field("key"); @@ -174,14 +175,12 @@ public Type primitive(Schema primitive) { } else if (logical instanceof LogicalTypes.Date) { return Types.DateType.get(); - } else if ( - logical instanceof LogicalTypes.TimeMillis || - logical instanceof LogicalTypes.TimeMicros) { + } else if (logical instanceof LogicalTypes.TimeMillis + || logical instanceof LogicalTypes.TimeMicros) { return Types.TimeType.get(); - } else if ( - logical instanceof LogicalTypes.TimestampMillis || - logical instanceof LogicalTypes.TimestampMicros) { + } else if (logical instanceof LogicalTypes.TimestampMillis + || logical instanceof LogicalTypes.TimestampMicros) { if (AvroSchemaUtil.isTimestamptz(primitive)) { return Types.TimestampType.withZone(); } else { @@ -215,7 +214,6 @@ public Type primitive(Schema primitive) { return null; } - throw new UnsupportedOperationException( - "Unsupported primitive type: " + primitive); + throw new UnsupportedOperationException("Unsupported primitive type: " + primitive); } } diff --git a/core/src/main/java/org/apache/iceberg/avro/SupportsRowPosition.java b/core/src/main/java/org/apache/iceberg/avro/SupportsRowPosition.java index 1113cd4e7088..db2a50a7b239 100644 --- a/core/src/main/java/org/apache/iceberg/avro/SupportsRowPosition.java +++ b/core/src/main/java/org/apache/iceberg/avro/SupportsRowPosition.java @@ -16,13 +16,13 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.avro; import java.util.function.Supplier; /** - * Interface for readers that accept a callback to determine the starting row position of an Avro split. + * Interface for readers that accept a callback to determine the starting row position of an Avro + * split. */ public interface SupportsRowPosition { void setRowPositionSupplier(Supplier posSupplier); diff --git a/core/src/main/java/org/apache/iceberg/avro/TypeToSchema.java b/core/src/main/java/org/apache/iceberg/avro/TypeToSchema.java index 3f1885b374a8..bc2847e1b4ba 100644 --- a/core/src/main/java/org/apache/iceberg/avro/TypeToSchema.java +++ b/core/src/main/java/org/apache/iceberg/avro/TypeToSchema.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.avro; import java.util.Deque; @@ -37,17 +36,17 @@ class TypeToSchema extends TypeUtil.SchemaVisitor { private static final Schema LONG_SCHEMA = Schema.create(Schema.Type.LONG); private static final Schema FLOAT_SCHEMA = Schema.create(Schema.Type.FLOAT); private static final Schema DOUBLE_SCHEMA = Schema.create(Schema.Type.DOUBLE); - private static final Schema DATE_SCHEMA = LogicalTypes.date() - .addToSchema(Schema.create(Schema.Type.INT)); - private static final Schema TIME_SCHEMA = LogicalTypes.timeMicros() - .addToSchema(Schema.create(Schema.Type.LONG)); - private static final Schema TIMESTAMP_SCHEMA = LogicalTypes.timestampMicros() - .addToSchema(Schema.create(Schema.Type.LONG)); - private static final Schema TIMESTAMPTZ_SCHEMA = LogicalTypes.timestampMicros() - .addToSchema(Schema.create(Schema.Type.LONG)); + private static final Schema DATE_SCHEMA = + LogicalTypes.date().addToSchema(Schema.create(Schema.Type.INT)); + private static final Schema TIME_SCHEMA = + LogicalTypes.timeMicros().addToSchema(Schema.create(Schema.Type.LONG)); + private static final Schema TIMESTAMP_SCHEMA = + LogicalTypes.timestampMicros().addToSchema(Schema.create(Schema.Type.LONG)); + private static final Schema TIMESTAMPTZ_SCHEMA = + LogicalTypes.timestampMicros().addToSchema(Schema.create(Schema.Type.LONG)); private static final Schema STRING_SCHEMA = Schema.create(Schema.Type.STRING); - private static final Schema UUID_SCHEMA = LogicalTypes.uuid() - .addToSchema(Schema.createFixed("uuid_fixed", null, null, 16)); + private static final Schema UUID_SCHEMA = + LogicalTypes.uuid().addToSchema(Schema.createFixed("uuid_fixed", null, null, 16)); private static final Schema BINARY_SCHEMA = Schema.create(Schema.Type.BYTES); static { @@ -100,10 +99,13 @@ public Schema struct(Types.StructType struct, List fieldSchemas) { Types.NestedField structField = structFields.get(i); String origFieldName = structField.name(); boolean isValidFieldName = AvroSchemaUtil.validAvroName(origFieldName); - String fieldName = isValidFieldName ? origFieldName : AvroSchemaUtil.sanitize(origFieldName); - Schema.Field field = new Schema.Field( - fieldName, fieldSchemas.get(i), structField.doc(), - structField.isOptional() ? JsonProperties.NULL_VALUE : null); + String fieldName = isValidFieldName ? origFieldName : AvroSchemaUtil.sanitize(origFieldName); + Schema.Field field = + new Schema.Field( + fieldName, + fieldSchemas.get(i), + structField.doc(), + structField.isOptional() ? JsonProperties.NULL_VALUE : null); if (!isValidFieldName) { field.addProp(AvroSchemaUtil.ICEBERG_FIELD_NAME_PROP, origFieldName); } @@ -156,14 +158,19 @@ public Schema map(Types.MapType map, Schema keySchema, Schema valueSchema) { if (keySchema.getType() == Schema.Type.STRING) { // if the map has string keys, use Avro's map type - mapSchema = Schema.createMap( - map.isValueOptional() ? AvroSchemaUtil.toOption(valueSchema) : valueSchema); + mapSchema = + Schema.createMap( + map.isValueOptional() ? AvroSchemaUtil.toOption(valueSchema) : valueSchema); mapSchema.addProp(AvroSchemaUtil.KEY_ID_PROP, map.keyId()); mapSchema.addProp(AvroSchemaUtil.VALUE_ID_PROP, map.valueId()); } else { - mapSchema = AvroSchemaUtil.createMap(map.keyId(), keySchema, - map.valueId(), map.isValueOptional() ? AvroSchemaUtil.toOption(valueSchema) : valueSchema); + mapSchema = + AvroSchemaUtil.createMap( + map.keyId(), + keySchema, + map.valueId(), + map.isValueOptional() ? 
AvroSchemaUtil.toOption(valueSchema) : valueSchema); } results.put(map, mapSchema); @@ -218,14 +225,17 @@ public Schema primitive(Type.PrimitiveType primitive) { break; case DECIMAL: Types.DecimalType decimal = (Types.DecimalType) primitive; - primitiveSchema = LogicalTypes.decimal(decimal.precision(), decimal.scale()) - .addToSchema(Schema.createFixed( - "decimal_" + decimal.precision() + "_" + decimal.scale(), - null, null, TypeUtil.decimalRequiredBytes(decimal.precision()))); + primitiveSchema = + LogicalTypes.decimal(decimal.precision(), decimal.scale()) + .addToSchema( + Schema.createFixed( + "decimal_" + decimal.precision() + "_" + decimal.scale(), + null, + null, + TypeUtil.decimalRequiredBytes(decimal.precision()))); break; default: - throw new UnsupportedOperationException( - "Unsupported type ID: " + primitive.typeId()); + throw new UnsupportedOperationException("Unsupported type ID: " + primitive.typeId()); } results.put(primitive, primitiveSchema); diff --git a/core/src/main/java/org/apache/iceberg/avro/UUIDConversion.java b/core/src/main/java/org/apache/iceberg/avro/UUIDConversion.java index a88cc213494b..104ad04aea5c 100644 --- a/core/src/main/java/org/apache/iceberg/avro/UUIDConversion.java +++ b/core/src/main/java/org/apache/iceberg/avro/UUIDConversion.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.avro; import java.util.UUID; diff --git a/core/src/main/java/org/apache/iceberg/avro/ValueReader.java b/core/src/main/java/org/apache/iceberg/avro/ValueReader.java index 2264dc896ca6..5470b8168f1b 100644 --- a/core/src/main/java/org/apache/iceberg/avro/ValueReader.java +++ b/core/src/main/java/org/apache/iceberg/avro/ValueReader.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.avro; import java.io.IOException; diff --git a/core/src/main/java/org/apache/iceberg/avro/ValueReaders.java b/core/src/main/java/org/apache/iceberg/avro/ValueReaders.java index e85211f8e517..19789cce82fc 100644 --- a/core/src/main/java/org/apache/iceberg/avro/ValueReaders.java +++ b/core/src/main/java/org/apache/iceberg/avro/ValueReaders.java @@ -16,9 +16,10 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.avro; +import static java.util.Collections.emptyIterator; + import java.io.IOException; import java.math.BigDecimal; import java.math.BigInteger; @@ -45,11 +46,8 @@ import org.apache.iceberg.types.Types; import org.apache.iceberg.util.UUIDUtil; -import static java.util.Collections.emptyIterator; - public class ValueReaders { - private ValueReaders() { - } + private ValueReaders() {} public static ValueReader nulls() { return NullReader.INSTANCE; @@ -131,15 +129,18 @@ public static ValueReader> array(ValueReader elementReader) return new ArrayReader<>(elementReader); } - public static ValueReader> arrayMap(ValueReader keyReader, ValueReader valueReader) { + public static ValueReader> arrayMap( + ValueReader keyReader, ValueReader valueReader) { return new ArrayMapReader<>(keyReader, valueReader); } - public static ValueReader> map(ValueReader keyReader, ValueReader valueReader) { + public static ValueReader> map( + ValueReader keyReader, ValueReader valueReader) { return new MapReader<>(keyReader, valueReader); } - public static ValueReader record(List> readers, Schema recordSchema) { + public static ValueReader record( + List> readers, Schema recordSchema) { return new RecordReader(readers, recordSchema); } @@ -151,8 +152,7 @@ public static ValueReader record( private static class NullReader implements ValueReader { private static final NullReader INSTANCE = new NullReader(); - private NullReader() { - } + private NullReader() {} @Override public Object read(Decoder decoder, Object ignored) throws IOException { @@ -164,8 +164,7 @@ public Object read(Decoder decoder, Object ignored) throws IOException { private static class BooleanReader implements ValueReader { private static final BooleanReader INSTANCE = new BooleanReader(); - private BooleanReader() { - } + private BooleanReader() {} @Override public Boolean read(Decoder decoder, Object ignored) throws IOException { @@ -176,8 +175,7 @@ public Boolean read(Decoder decoder, Object ignored) throws IOException { private static class IntegerReader implements ValueReader { private static final IntegerReader INSTANCE = new IntegerReader(); - private IntegerReader() { - } + private IntegerReader() {} @Override public Integer read(Decoder decoder, Object ignored) throws IOException { @@ -188,8 +186,7 @@ public Integer read(Decoder decoder, Object ignored) throws IOException { private static class LongReader implements ValueReader { private static final LongReader INSTANCE = new LongReader(); - private LongReader() { - } + private LongReader() {} @Override public Long read(Decoder decoder, Object ignored) throws IOException { @@ -200,8 +197,7 @@ public Long read(Decoder decoder, Object ignored) throws IOException { private static class FloatReader implements ValueReader { private static final FloatReader INSTANCE = new FloatReader(); - private FloatReader() { - } + private FloatReader() {} @Override public Float read(Decoder decoder, Object ignored) throws IOException { @@ -212,8 +208,7 @@ public Float read(Decoder decoder, Object ignored) throws IOException { private static class DoubleReader implements ValueReader { private static final DoubleReader INSTANCE = new DoubleReader(); - private DoubleReader() { - } + private DoubleReader() {} @Override public Double read(Decoder decoder, Object ignored) throws IOException { @@ -225,25 +220,23 @@ private static class StringReader implements ValueReader { private static final StringReader INSTANCE = new StringReader(); private final ThreadLocal reusedTempUtf8 = 
ThreadLocal.withInitial(Utf8::new); - private StringReader() { - } + private StringReader() {} @Override public String read(Decoder decoder, Object ignored) throws IOException { // use the decoder's readString(Utf8) method because it may be a resolving decoder this.reusedTempUtf8.set(decoder.readString(reusedTempUtf8.get())); return reusedTempUtf8.get().toString(); -// int length = decoder.readInt(); -// byte[] bytes = new byte[length]; -// decoder.readFixed(bytes, 0, length); + // int length = decoder.readInt(); + // byte[] bytes = new byte[length]; + // decoder.readFixed(bytes, 0, length); } } private static class Utf8Reader implements ValueReader { private static final Utf8Reader INSTANCE = new Utf8Reader(); - private Utf8Reader() { - } + private Utf8Reader() {} @Override public Utf8 read(Decoder decoder, Object reuse) throws IOException { @@ -253,23 +246,24 @@ public Utf8 read(Decoder decoder, Object reuse) throws IOException { } else { return decoder.readString(null); } -// int length = decoder.readInt(); -// byte[] bytes = new byte[length]; -// decoder.readFixed(bytes, 0, length); + // int length = decoder.readInt(); + // byte[] bytes = new byte[length]; + // decoder.readFixed(bytes, 0, length); } } private static class UUIDReader implements ValueReader { - private static final ThreadLocal BUFFER = ThreadLocal.withInitial(() -> { - ByteBuffer buffer = ByteBuffer.allocate(16); - buffer.order(ByteOrder.BIG_ENDIAN); - return buffer; - }); + private static final ThreadLocal BUFFER = + ThreadLocal.withInitial( + () -> { + ByteBuffer buffer = ByteBuffer.allocate(16); + buffer.order(ByteOrder.BIG_ENDIAN); + return buffer; + }); private static final UUIDReader INSTANCE = new UUIDReader(); - private UUIDReader() { - } + private UUIDReader() {} @Override @SuppressWarnings("ByteBufferBackingArray") @@ -334,8 +328,7 @@ public GenericData.Fixed read(Decoder decoder, Object reuse) throws IOException private static class BytesReader implements ValueReader { private static final BytesReader INSTANCE = new BytesReader(); - private BytesReader() { - } + private BytesReader() {} @Override public byte[] read(Decoder decoder, Object reuse) throws IOException { @@ -346,18 +339,17 @@ public byte[] read(Decoder decoder, Object reuse) throws IOException { // a new buffer. since the usual case requires an allocation anyway to get the size right, // just allocate every time. 
return decoder.readBytes(null).array(); -// int length = decoder.readInt(); -// byte[] bytes = new byte[length]; -// decoder.readFixed(bytes, 0, length); -// return bytes; + // int length = decoder.readInt(); + // byte[] bytes = new byte[length]; + // decoder.readFixed(bytes, 0, length); + // return bytes; } } private static class ByteBufferReader implements ValueReader { private static final ByteBufferReader INSTANCE = new ByteBufferReader(); - private ByteBufferReader() { - } + private ByteBufferReader() {} @Override public ByteBuffer read(Decoder decoder, Object reuse) throws IOException { @@ -367,10 +359,10 @@ public ByteBuffer read(Decoder decoder, Object reuse) throws IOException { } else { return decoder.readBytes(null); } -// int length = decoder.readInt(); -// byte[] bytes = new byte[length]; -// decoder.readFixed(bytes, 0, length); -// return bytes; + // int length = decoder.readInt(); + // byte[] bytes = new byte[length]; + // decoder.readFixed(bytes, 0, length); + // return bytes; } } @@ -494,9 +486,8 @@ public Map read(Decoder decoder, Object reuse) throws IOException { } long chunkLength = decoder.readArrayStart(); - Iterator> kvIter = lastMap != null ? - lastMap.entrySet().iterator() : - emptyIterator(); + Iterator> kvIter = + lastMap != null ? lastMap.entrySet().iterator() : emptyIterator(); while (chunkLength > 0) { for (long i = 0; i < chunkLength; i += 1) { @@ -548,9 +539,8 @@ public Map read(Decoder decoder, Object reuse) throws IOException { } long chunkLength = decoder.readMapStart(); - Iterator> kvIter = lastMap != null ? - lastMap.entrySet().iterator() : - emptyIterator(); + Iterator> kvIter = + lastMap != null ? lastMap.entrySet().iterator() : emptyIterator(); while (chunkLength > 0) { for (long i = 0; i < chunkLength; i += 1) { @@ -599,12 +589,13 @@ protected StructReader(List> readers, Schema schema) { this.positions = new int[0]; this.constants = new Object[0]; } else { - this.positions = new int[]{isDeletedColumnPos}; - this.constants = new Object[]{false}; + this.positions = new int[] {isDeletedColumnPos}; + this.constants = new Object[] {false}; } } - protected StructReader(List> readers, Types.StructType struct, Map idToConstant) { + protected StructReader( + List> readers, Types.StructType struct, Map idToConstant) { this.readers = readers.toArray(new ValueReader[0]); List fields = struct.fields(); @@ -720,10 +711,11 @@ static class IndexedRecordReader extends StructReader> readers, Class recordClass, Schema schema) { super(readers, schema); this.recordClass = recordClass; - this.ctor = DynConstructors.builder(IndexedRecord.class) - .hiddenImpl(recordClass, Schema.class) - .hiddenImpl(recordClass) - .build(); + this.ctor = + DynConstructors.builder(IndexedRecord.class) + .hiddenImpl(recordClass, Schema.class) + .hiddenImpl(recordClass) + .build(); this.schema = schema; } diff --git a/core/src/main/java/org/apache/iceberg/avro/ValueWriter.java b/core/src/main/java/org/apache/iceberg/avro/ValueWriter.java index 5059d125658e..2b2c50bd5e30 100644 --- a/core/src/main/java/org/apache/iceberg/avro/ValueWriter.java +++ b/core/src/main/java/org/apache/iceberg/avro/ValueWriter.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.avro; import java.io.IOException; diff --git a/core/src/main/java/org/apache/iceberg/avro/ValueWriters.java b/core/src/main/java/org/apache/iceberg/avro/ValueWriters.java index 4e4d375ddd30..b99b67fe7f4b 100644 --- a/core/src/main/java/org/apache/iceberg/avro/ValueWriters.java +++ b/core/src/main/java/org/apache/iceberg/avro/ValueWriters.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.avro; import java.io.IOException; @@ -38,8 +37,7 @@ import org.apache.iceberg.util.DecimalUtil; public class ValueWriters { - private ValueWriters() { - } + private ValueWriters() {} public static ValueWriter nulls() { return NullWriter.INSTANCE; @@ -113,13 +111,13 @@ public static ValueWriter> array(ValueWriter elementWriter) return new CollectionWriter<>(elementWriter); } - public static ValueWriter> arrayMap(ValueWriter keyWriter, - ValueWriter valueWriter) { + public static ValueWriter> arrayMap( + ValueWriter keyWriter, ValueWriter valueWriter) { return new ArrayMapWriter<>(keyWriter, valueWriter); } - public static ValueWriter> map(ValueWriter keyWriter, - ValueWriter valueWriter) { + public static ValueWriter> map( + ValueWriter keyWriter, ValueWriter valueWriter) { return new MapWriter<>(keyWriter, valueWriter); } @@ -130,8 +128,7 @@ public static ValueWriter record(List> writers) { private static class NullWriter implements ValueWriter { private static final NullWriter INSTANCE = new NullWriter(); - private NullWriter() { - } + private NullWriter() {} @Override public void write(Void ignored, Encoder encoder) throws IOException { @@ -142,8 +139,7 @@ public void write(Void ignored, Encoder encoder) throws IOException { private static class BooleanWriter implements ValueWriter { private static final BooleanWriter INSTANCE = new BooleanWriter(); - private BooleanWriter() { - } + private BooleanWriter() {} @Override public void write(Boolean bool, Encoder encoder) throws IOException { @@ -154,8 +150,7 @@ public void write(Boolean bool, Encoder encoder) throws IOException { private static class ByteToIntegerWriter implements ValueWriter { private static final ByteToIntegerWriter INSTANCE = new ByteToIntegerWriter(); - private ByteToIntegerWriter() { - } + private ByteToIntegerWriter() {} @Override public void write(Byte b, Encoder encoder) throws IOException { @@ -166,8 +161,7 @@ public void write(Byte b, Encoder encoder) throws IOException { private static class ShortToIntegerWriter implements ValueWriter { private static final ShortToIntegerWriter INSTANCE = new ShortToIntegerWriter(); - private ShortToIntegerWriter() { - } + private ShortToIntegerWriter() {} @Override public void write(Short s, Encoder encoder) throws IOException { @@ -178,8 +172,7 @@ public void write(Short s, Encoder encoder) throws IOException { private static class IntegerWriter implements ValueWriter { private static final IntegerWriter INSTANCE = new IntegerWriter(); - private IntegerWriter() { - } + private IntegerWriter() {} @Override public void write(Integer i, Encoder encoder) throws IOException { @@ -190,8 +183,7 @@ public void write(Integer i, Encoder encoder) throws IOException { private static class LongWriter implements ValueWriter { private static final LongWriter INSTANCE = new LongWriter(); - private LongWriter() { - } + private LongWriter() {} @Override public void write(Long l, Encoder encoder) throws IOException { @@ -202,8 +194,7 @@ public void write(Long l, Encoder encoder) throws IOException { 
private static class FloatWriter implements ValueWriter { private static final FloatWriter INSTANCE = new FloatWriter(); - private FloatWriter() { - } + private FloatWriter() {} @Override public void write(Float f, Encoder encoder) throws IOException { @@ -214,8 +205,7 @@ public void write(Float f, Encoder encoder) throws IOException { private static class DoubleWriter implements ValueWriter { private static final DoubleWriter INSTANCE = new DoubleWriter(); - private DoubleWriter() { - } + private DoubleWriter() {} @Override public void write(Double d, Encoder encoder) throws IOException { @@ -226,8 +216,7 @@ public void write(Double d, Encoder encoder) throws IOException { private static class StringWriter implements ValueWriter { private static final StringWriter INSTANCE = new StringWriter(); - private StringWriter() { - } + private StringWriter() {} @Override public void write(Object s, Encoder encoder) throws IOException { @@ -250,8 +239,7 @@ public void write(Object s, Encoder encoder) throws IOException { private static class Utf8Writer implements ValueWriter { private static final Utf8Writer INSTANCE = new Utf8Writer(); - private Utf8Writer() { - } + private Utf8Writer() {} @Override public void write(Utf8 s, Encoder encoder) throws IOException { @@ -260,16 +248,17 @@ public void write(Utf8 s, Encoder encoder) throws IOException { } private static class UUIDWriter implements ValueWriter { - private static final ThreadLocal BUFFER = ThreadLocal.withInitial(() -> { - ByteBuffer buffer = ByteBuffer.allocate(16); - buffer.order(ByteOrder.BIG_ENDIAN); - return buffer; - }); + private static final ThreadLocal BUFFER = + ThreadLocal.withInitial( + () -> { + ByteBuffer buffer = ByteBuffer.allocate(16); + buffer.order(ByteOrder.BIG_ENDIAN); + return buffer; + }); private static final UUIDWriter INSTANCE = new UUIDWriter(); - private UUIDWriter() { - } + private UUIDWriter() {} @Override @SuppressWarnings("ByteBufferBackingArray") @@ -292,8 +281,11 @@ private FixedWriter(int length) { @Override public void write(byte[] bytes, Encoder encoder) throws IOException { - Preconditions.checkArgument(bytes.length == length, - "Cannot write byte array of length %s as fixed[%s]", bytes.length, length); + Preconditions.checkArgument( + bytes.length == length, + "Cannot write byte array of length %s as fixed[%s]", + bytes.length, + length); encoder.writeFixed(bytes); } } @@ -307,8 +299,11 @@ private GenericFixedWriter(int length) { @Override public void write(GenericData.Fixed datum, Encoder encoder) throws IOException { - Preconditions.checkArgument(datum.bytes().length == length, - "Cannot write byte array of length %s as fixed[%s]", datum.bytes().length, length); + Preconditions.checkArgument( + datum.bytes().length == length, + "Cannot write byte array of length %s as fixed[%s]", + datum.bytes().length, + length); encoder.writeFixed(datum.bytes()); } } @@ -316,8 +311,7 @@ public void write(GenericData.Fixed datum, Encoder encoder) throws IOException { private static class BytesWriter implements ValueWriter { private static final BytesWriter INSTANCE = new BytesWriter(); - private BytesWriter() { - } + private BytesWriter() {} @Override public void write(byte[] bytes, Encoder encoder) throws IOException { @@ -328,8 +322,7 @@ public void write(byte[] bytes, Encoder encoder) throws IOException { private static class ByteBufferWriter implements ValueWriter { private static final ByteBufferWriter INSTANCE = new ByteBufferWriter(); - private ByteBufferWriter() { - } + private ByteBufferWriter() {} 
@Override public void write(ByteBuffer bytes, Encoder encoder) throws IOException { @@ -345,12 +338,14 @@ private static class DecimalWriter implements ValueWriter { private DecimalWriter(int precision, int scale) { this.precision = precision; this.scale = scale; - this.bytes = ThreadLocal.withInitial(() -> new byte[TypeUtil.decimalRequiredBytes(precision)]); + this.bytes = + ThreadLocal.withInitial(() -> new byte[TypeUtil.decimalRequiredBytes(precision)]); } @Override public void write(BigDecimal decimal, Encoder encoder) throws IOException { - encoder.writeFixed(DecimalUtil.toReusedFixLengthBytes(precision, scale, decimal, bytes.get())); + encoder.writeFixed( + DecimalUtil.toReusedFixLengthBytes(precision, scale, decimal, bytes.get())); } } diff --git a/core/src/main/java/org/apache/iceberg/catalog/BaseSessionCatalog.java b/core/src/main/java/org/apache/iceberg/catalog/BaseSessionCatalog.java index 8724cbdb79bb..d6ee4d345cfa 100644 --- a/core/src/main/java/org/apache/iceberg/catalog/BaseSessionCatalog.java +++ b/core/src/main/java/org/apache/iceberg/catalog/BaseSessionCatalog.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.catalog; import com.github.benmanes.caffeine.cache.Cache; @@ -33,9 +32,8 @@ import org.apache.iceberg.relocated.com.google.common.collect.ImmutableSet; public abstract class BaseSessionCatalog implements SessionCatalog { - private final Cache catalogs = Caffeine.newBuilder() - .expireAfterAccess(10, TimeUnit.MINUTES) - .build(); + private final Cache catalogs = + Caffeine.newBuilder().expireAfterAccess(10, TimeUnit.MINUTES).build(); private String name = null; private Map properties = null; @@ -147,12 +145,14 @@ public boolean dropNamespace(Namespace namespace) throws NamespaceNotEmptyExcept @Override public boolean setProperties(Namespace namespace, Map updates) { - return BaseSessionCatalog.this.updateNamespaceMetadata(context, namespace, updates, ImmutableSet.of()); + return BaseSessionCatalog.this.updateNamespaceMetadata( + context, namespace, updates, ImmutableSet.of()); } @Override public boolean removeProperties(Namespace namespace, Set removals) { - return BaseSessionCatalog.this.updateNamespaceMetadata(context, namespace, ImmutableMap.of(), removals); + return BaseSessionCatalog.this.updateNamespaceMetadata( + context, namespace, ImmutableMap.of(), removals); } @Override diff --git a/core/src/main/java/org/apache/iceberg/catalog/TableIdentifierParser.java b/core/src/main/java/org/apache/iceberg/catalog/TableIdentifierParser.java index 7cc3a78066aa..8c8dc5c1f52e 100644 --- a/core/src/main/java/org/apache/iceberg/catalog/TableIdentifierParser.java +++ b/core/src/main/java/org/apache/iceberg/catalog/TableIdentifierParser.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.catalog; import com.fasterxml.jackson.core.JsonGenerator; @@ -29,12 +28,13 @@ import org.apache.iceberg.util.JsonUtil; /** - * Parses TableIdentifiers from a JSON representation, which is the JSON - * representation utilized in the REST catalog. - *

- * For TableIdentifier.of("dogs", "owners.and.handlers", "food"), we'd have
- * the following JSON representation, where the dot character of an
- * individual level is in the namespace is replaced by the unit separator byte character.
+ * Parses TableIdentifiers from a JSON representation, which is the JSON representation utilized in
+ * the REST catalog.
+ *
+ * <p>For TableIdentifier.of("dogs", "owners.and.handlers", "food"), we'd have the following JSON
+ * representation, where the dot character of an individual level is in the namespace is replaced by
+ * the unit separator byte character.
+ *
  * <pre>
  * {
  *   "namespace": ["dogs", "owners.and.handlers"],
@@ -47,8 +47,7 @@ public class TableIdentifierParser {
   private static final String NAMESPACE = "namespace";
   private static final String NAME = "name";
 
-  private TableIdentifierParser() {
-  }
+  private TableIdentifierParser() {}
 
   public static String toJson(TableIdentifier identifier) {
     return toJson(identifier, false);
@@ -69,7 +68,8 @@ public static String toJson(TableIdentifier identifier, boolean pretty) {
     }
   }
 
-  public static void toJson(TableIdentifier identifier, JsonGenerator generator) throws IOException {
+  public static void toJson(TableIdentifier identifier, JsonGenerator generator)
+      throws IOException {
     generator.writeStartObject();
     generator.writeFieldName(NAMESPACE);
     generator.writeArray(identifier.namespace().levels(), 0, identifier.namespace().length());
@@ -78,23 +78,27 @@ public static void toJson(TableIdentifier identifier, JsonGenerator generator) t
   }
 
   public static TableIdentifier fromJson(String json) {
-    Preconditions.checkArgument(json != null,
-        "Cannot parse table identifier from invalid JSON: null");
-    Preconditions.checkArgument(!json.isEmpty(),
-        "Cannot parse table identifier from invalid JSON: ''");
+    Preconditions.checkArgument(
+        json != null, "Cannot parse table identifier from invalid JSON: null");
+    Preconditions.checkArgument(
+        !json.isEmpty(), "Cannot parse table identifier from invalid JSON: ''");
     try {
       return fromJson(JsonUtil.mapper().readValue(json, JsonNode.class));
     } catch (IOException e) {
-      throw new UncheckedIOException(String.format("Cannot parse table identifier from invalid JSON: %s", json), e);
+      throw new UncheckedIOException(
+          String.format("Cannot parse table identifier from invalid JSON: %s", json), e);
     }
   }
 
   public static TableIdentifier fromJson(JsonNode node) {
-    Preconditions.checkArgument(node != null && !node.isNull() && node.isObject(),
-        "Cannot parse missing or non-object table identifier: %s", node);
+    Preconditions.checkArgument(
+        node != null && !node.isNull() && node.isObject(),
+        "Cannot parse missing or non-object table identifier: %s",
+        node);
     List levels = JsonUtil.getStringListOrNull(NAMESPACE, node);
     String tableName = JsonUtil.getString(NAME, node);
-    Namespace namespace = levels == null ? Namespace.empty() : Namespace.of(levels.toArray(new String[0]));
+    Namespace namespace =
+        levels == null ? Namespace.empty() : Namespace.of(levels.toArray(new String[0]));
     return TableIdentifier.of(namespace, tableName);
   }
 }
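
Reviewer note: the reformatting in this file does not change parser behavior. Below is a minimal round-trip sketch using the public methods visible in this hunk (`toJson`/`fromJson`); the printed JSON shape is taken from the Javadoc example above and is illustrative rather than verified output.

```java
import org.apache.iceberg.catalog.Namespace;
import org.apache.iceberg.catalog.TableIdentifier;
import org.apache.iceberg.catalog.TableIdentifierParser;

public class TableIdentifierJsonRoundTrip {
  public static void main(String[] args) {
    TableIdentifier id =
        TableIdentifier.of(Namespace.of("dogs", "owners.and.handlers"), "food");

    // Expected to look like: {"namespace":["dogs","owners.and.handlers"],"name":"food"}
    String json = TableIdentifierParser.toJson(id);

    // Parsing it back should yield an equal identifier.
    TableIdentifier parsed = TableIdentifierParser.fromJson(json);
    System.out.println(json + " -> " + parsed);
  }
}
```
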
diff --git a/core/src/main/java/org/apache/iceberg/data/GenericRecord.java b/core/src/main/java/org/apache/iceberg/data/GenericRecord.java
index 46507b88b391..6fcd4bdd6223 100644
--- a/core/src/main/java/org/apache/iceberg/data/GenericRecord.java
+++ b/core/src/main/java/org/apache/iceberg/data/GenericRecord.java
@@ -16,7 +16,6 @@
  * specific language governing permissions and limitations
  * under the License.
  */
-
 package org.apache.iceberg.data;
 
 import com.github.benmanes.caffeine.cache.Caffeine;
@@ -35,15 +34,16 @@
 public class GenericRecord implements Record, StructLike {
   private static final LoadingCache> NAME_MAP_CACHE =
       Caffeine.newBuilder()
-      .weakKeys()
-      .build(struct -> {
-        Map idToPos = Maps.newHashMap();
-        List fields = struct.fields();
-        for (int i = 0; i < fields.size(); i += 1) {
-          idToPos.put(fields.get(i).name(), i);
-        }
-        return idToPos;
-      });
+          .weakKeys()
+          .build(
+              struct -> {
+                Map idToPos = Maps.newHashMap();
+                List fields = struct.fields();
+                for (int i = 0; i < fields.size(); i += 1) {
+                  idToPos.put(fields.get(i).name(), i);
+                }
+                return idToPos;
+              });
 
   public static GenericRecord create(Schema schema) {
     return new GenericRecord(schema.asStruct());
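
Reviewer note: the hunk above only re-indents GenericRecord's name-to-position cache. The pattern itself is a weak-keyed Caffeine LoadingCache keyed by the struct type; here is a self-contained sketch of the same idea with a hypothetical key type (plain field-name lists instead of Iceberg's StructType).

```java
import com.github.benmanes.caffeine.cache.Caffeine;
import com.github.benmanes.caffeine.cache.LoadingCache;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

public class NameToPositionCacheSketch {
  // weakKeys() makes key comparison identity-based and lets an entry be collected
  // once its key (in GenericRecord, the StructType) is no longer referenced.
  private static final LoadingCache<List<String>, Map<String, Integer>> NAME_TO_POS =
      Caffeine.newBuilder()
          .weakKeys()
          .build(
              names -> {
                Map<String, Integer> nameToPos = new HashMap<>();
                for (int i = 0; i < names.size(); i += 1) {
                  nameToPos.put(names.get(i), i);
                }
                return nameToPos;
              });

  public static void main(String[] args) {
    List<String> fieldNames = List.of("id", "data", "ts");
    // Look up with the same reference: weak keys compare by identity, not equals().
    System.out.println(NAME_TO_POS.get(fieldNames).get("data")); // 1
  }
}
```
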
diff --git a/core/src/main/java/org/apache/iceberg/data/IdentityPartitionConverters.java b/core/src/main/java/org/apache/iceberg/data/IdentityPartitionConverters.java
index 2b24f84b2b50..4cb41263152d 100644
--- a/core/src/main/java/org/apache/iceberg/data/IdentityPartitionConverters.java
+++ b/core/src/main/java/org/apache/iceberg/data/IdentityPartitionConverters.java
@@ -16,7 +16,6 @@
  * specific language governing permissions and limitations
  * under the License.
  */
-
 package org.apache.iceberg.data;
 
 import org.apache.avro.generic.GenericData;
@@ -24,14 +23,10 @@
 import org.apache.iceberg.types.Types;
 import org.apache.iceberg.util.DateTimeUtil;
 
-
 public class IdentityPartitionConverters {
-  private IdentityPartitionConverters() {
-  }
+  private IdentityPartitionConverters() {}
 
-  /**
-   * Conversions from internal representations to Iceberg generic values.
-   */
+  /** Conversions from internal representations to Iceberg generic values. */
   public static Object convertConstant(Type type, Object value) {
     if (value == null) {
       return null;
diff --git a/core/src/main/java/org/apache/iceberg/data/avro/DataReader.java b/core/src/main/java/org/apache/iceberg/data/avro/DataReader.java
index 7c8cabc9c1fb..1cc901d15bc1 100644
--- a/core/src/main/java/org/apache/iceberg/data/avro/DataReader.java
+++ b/core/src/main/java/org/apache/iceberg/data/avro/DataReader.java
@@ -16,7 +16,6 @@
  * specific language governing permissions and limitations
  * under the License.
  */
-
 package org.apache.iceberg.data.avro;
 
 import java.io.IOException;
@@ -39,12 +38,13 @@
 
 public class DataReader implements DatumReader, SupportsRowPosition {
 
-  public static  DataReader create(org.apache.iceberg.Schema expectedSchema, Schema readSchema) {
+  public static  DataReader create(
+      org.apache.iceberg.Schema expectedSchema, Schema readSchema) {
     return create(expectedSchema, readSchema, ImmutableMap.of());
   }
 
-  public static  DataReader create(org.apache.iceberg.Schema expectedSchema, Schema readSchema,
-                                         Map idToConstant) {
+  public static  DataReader create(
+      org.apache.iceberg.Schema expectedSchema, Schema readSchema, Map idToConstant) {
     return new DataReader<>(expectedSchema, readSchema, idToConstant);
   }
 
@@ -53,10 +53,13 @@ public static  DataReader create(org.apache.iceberg.Schema expectedSchema,
   private Schema fileSchema = null;
 
   @SuppressWarnings("unchecked")
-  protected DataReader(org.apache.iceberg.Schema expectedSchema, Schema readSchema, Map idToConstant) {
+  protected DataReader(
+      org.apache.iceberg.Schema expectedSchema, Schema readSchema, Map idToConstant) {
     this.readSchema = readSchema;
-    this.reader = (ValueReader) AvroSchemaWithTypeVisitor
-        .visit(expectedSchema, readSchema, new ReadBuilder(idToConstant));
+    this.reader =
+        (ValueReader)
+            AvroSchemaWithTypeVisitor.visit(
+                expectedSchema, readSchema, new ReadBuilder(idToConstant));
   }
 
   @Override
@@ -76,8 +79,8 @@ public void setRowPositionSupplier(Supplier posSupplier) {
     }
   }
 
-  protected ValueReader createStructReader(Types.StructType struct,
-                                              List> fields, Map idToConstant) {
+  protected ValueReader createStructReader(
+      Types.StructType struct, List> fields, Map idToConstant) {
     return GenericReaders.struct(struct, fields, idToConstant);
   }
 
@@ -89,8 +92,8 @@ private ReadBuilder(Map idToConstant) {
     }
 
     @Override
-    public ValueReader record(Types.StructType struct, Schema record,
-                                 List names, List> fields) {
+    public ValueReader record(
+        Types.StructType struct, Schema record, List names, List> fields) {
       return createStructReader(struct, fields, idToConstant);
     }
 
@@ -100,12 +103,14 @@ public ValueReader union(Type ignored, Schema union, List> opt
     }
 
     @Override
-    public ValueReader array(Types.ListType ignored, Schema array, ValueReader elementReader) {
+    public ValueReader array(
+        Types.ListType ignored, Schema array, ValueReader elementReader) {
       return ValueReaders.array(elementReader);
     }
 
     @Override
-    public ValueReader map(Types.MapType iMap, Schema map, ValueReader keyReader, ValueReader valueReader) {
+    public ValueReader map(
+        Types.MapType iMap, Schema map, ValueReader keyReader, ValueReader valueReader) {
       return ValueReaders.arrayMap(keyReader, valueReader);
     }
 
diff --git a/core/src/main/java/org/apache/iceberg/data/avro/DataWriter.java b/core/src/main/java/org/apache/iceberg/data/avro/DataWriter.java
index bf4d9e6cdd19..4d6973d3cfe3 100644
--- a/core/src/main/java/org/apache/iceberg/data/avro/DataWriter.java
+++ b/core/src/main/java/org/apache/iceberg/data/avro/DataWriter.java
@@ -16,7 +16,6 @@
  * specific language governing permissions and limitations
  * under the License.
  */
-
 package org.apache.iceberg.data.avro;
 
 import java.io.IOException;
@@ -74,10 +73,12 @@ public ValueWriter record(Schema record, List names, List union(Schema union, List> options) {
-      Preconditions.checkArgument(options.contains(ValueWriters.nulls()),
-          "Cannot create writer for non-option union: %s", union);
-      Preconditions.checkArgument(options.size() == 2,
-          "Cannot create writer for non-option union: %s", union);
+      Preconditions.checkArgument(
+          options.contains(ValueWriters.nulls()),
+          "Cannot create writer for non-option union: %s",
+          union);
+      Preconditions.checkArgument(
+          options.size() == 2, "Cannot create writer for non-option union: %s", union);
       if (union.getTypes().get(0).getType() == Schema.Type.NULL) {
         return ValueWriters.option(0, options.get(1));
       } else {
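
Reviewer note: to sanity-check that the rewrapped DataReader and DataWriter factories still line up, here is a hedged write-then-read sketch over a single-column schema. The schema and values are made up for illustration, the write schema equals the read schema so resolution is trivial, and error handling is omitted.

```java
import java.io.ByteArrayOutputStream;
import org.apache.avro.Schema;
import org.apache.avro.io.BinaryEncoder;
import org.apache.avro.io.Decoder;
import org.apache.avro.io.DecoderFactory;
import org.apache.avro.io.EncoderFactory;
import org.apache.iceberg.avro.AvroSchemaUtil;
import org.apache.iceberg.data.GenericRecord;
import org.apache.iceberg.data.Record;
import org.apache.iceberg.data.avro.DataReader;
import org.apache.iceberg.data.avro.DataWriter;
import org.apache.iceberg.types.Types;

public class DataReaderWriterRoundTripSketch {
  public static void main(String[] args) throws Exception {
    org.apache.iceberg.Schema icebergSchema =
        new org.apache.iceberg.Schema(Types.NestedField.required(1, "id", Types.LongType.get()));
    Schema avroSchema = AvroSchemaUtil.convert(icebergSchema, "table");

    // Encode one generic record with DataWriter.
    Record record = GenericRecord.create(icebergSchema);
    record.setField("id", 42L);
    ByteArrayOutputStream out = new ByteArrayOutputStream();
    BinaryEncoder encoder = EncoderFactory.get().binaryEncoder(out, null);
    DataWriter<Record> writer = DataWriter.create(avroSchema);
    writer.write(record, encoder);
    encoder.flush();

    // Decode it back with DataReader; setSchema supplies the file (write) schema.
    DataReader<Record> reader = DataReader.create(icebergSchema, avroSchema);
    reader.setSchema(avroSchema);
    Decoder decoder = DecoderFactory.get().binaryDecoder(out.toByteArray(), null);
    Record copy = reader.read(null, decoder);
    System.out.println(copy.getField("id")); // 42
  }
}
```
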
diff --git a/core/src/main/java/org/apache/iceberg/data/avro/DecoderResolver.java b/core/src/main/java/org/apache/iceberg/data/avro/DecoderResolver.java
index ac2499e55aaf..97b90755d9f9 100644
--- a/core/src/main/java/org/apache/iceberg/data/avro/DecoderResolver.java
+++ b/core/src/main/java/org/apache/iceberg/data/avro/DecoderResolver.java
@@ -16,7 +16,6 @@
  * specific language governing permissions and limitations
  * under the License.
  */
-
 package org.apache.iceberg.data.avro;
 
 import java.io.IOException;
@@ -31,33 +30,33 @@
 import org.apache.iceberg.relocated.com.google.common.collect.Maps;
 
 /**
- * Resolver to resolve {@link Decoder} to a {@link ResolvingDecoder}. This class uses a {@link ThreadLocal} for caching
- * {@link ResolvingDecoder}.
+ * Resolver to resolve {@link Decoder} to a {@link ResolvingDecoder}. This class uses a {@link
+ * ThreadLocal} for caching {@link ResolvingDecoder}.
  */
 public class DecoderResolver {
 
   private static final ThreadLocal>> DECODER_CACHES =
       ThreadLocal.withInitial(() -> new MapMaker().weakKeys().makeMap());
 
-  private DecoderResolver() {
-  }
+  private DecoderResolver() {}
 
   public static  T resolveAndRead(
-      Decoder decoder, Schema readSchema, Schema fileSchema, ValueReader reader, T reuse) throws IOException {
+      Decoder decoder, Schema readSchema, Schema fileSchema, ValueReader reader, T reuse)
+      throws IOException {
     ResolvingDecoder resolver = DecoderResolver.resolve(decoder, readSchema, fileSchema);
     T value = reader.read(resolver, reuse);
     resolver.drain();
     return value;
   }
 
-  private static ResolvingDecoder resolve(Decoder decoder, Schema readSchema, Schema fileSchema) throws IOException {
+  private static ResolvingDecoder resolve(Decoder decoder, Schema readSchema, Schema fileSchema)
+      throws IOException {
     Map> cache = DECODER_CACHES.get();
-    Map fileSchemaToResolver = cache
-        .computeIfAbsent(readSchema, k -> Maps.newHashMap());
+    Map fileSchemaToResolver =
+        cache.computeIfAbsent(readSchema, k -> Maps.newHashMap());
 
-    ResolvingDecoder resolver = fileSchemaToResolver.computeIfAbsent(
-        fileSchema,
-        schema -> newResolver(readSchema, schema));
+    ResolvingDecoder resolver =
+        fileSchemaToResolver.computeIfAbsent(fileSchema, schema -> newResolver(readSchema, schema));
 
     resolver.configure(decoder);
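
Reviewer note: the class above caches one stateful ResolvingDecoder per (read schema, file schema) pair in a ThreadLocal, with weak keys via Guava's MapMaker. A simplified, self-contained sketch of that caching idea follows (hypothetical class name, plain HashMaps instead of weak maps).

```java
import java.io.IOException;
import java.io.UncheckedIOException;
import java.util.HashMap;
import java.util.Map;
import org.apache.avro.Schema;
import org.apache.avro.io.Decoder;
import org.apache.avro.io.DecoderFactory;
import org.apache.avro.io.ResolvingDecoder;

public class ResolvingDecoderCacheSketch {
  // One cache per thread: readSchema -> (fileSchema -> ResolvingDecoder).
  // ResolvingDecoder is stateful, so instances must not be shared across threads.
  private static final ThreadLocal<Map<Schema, Map<Schema, ResolvingDecoder>>> CACHE =
      ThreadLocal.withInitial(HashMap::new);

  static ResolvingDecoder resolve(Decoder decoder, Schema readSchema, Schema fileSchema)
      throws IOException {
    ResolvingDecoder resolver =
        CACHE.get()
            .computeIfAbsent(readSchema, k -> new HashMap<>())
            .computeIfAbsent(
                fileSchema,
                writeSchema -> {
                  try {
                    // Schema resolution is the expensive step; cache it and rebind the input below.
                    return DecoderFactory.get().resolvingDecoder(writeSchema, readSchema, null);
                  } catch (IOException e) {
                    throw new UncheckedIOException(e);
                  }
                });
    resolver.configure(decoder); // point the cached resolver at the current input
    return resolver;
  }
}
```
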
 
diff --git a/core/src/main/java/org/apache/iceberg/data/avro/GenericReaders.java b/core/src/main/java/org/apache/iceberg/data/avro/GenericReaders.java
index 239f09dbeddc..91a728d53d38 100644
--- a/core/src/main/java/org/apache/iceberg/data/avro/GenericReaders.java
+++ b/core/src/main/java/org/apache/iceberg/data/avro/GenericReaders.java
@@ -16,7 +16,6 @@
  * specific language governing permissions and limitations
  * under the License.
  */
-
 package org.apache.iceberg.data.avro;
 
 import java.io.IOException;
@@ -35,8 +34,7 @@
 import org.apache.iceberg.util.DateTimeUtil;
 
 class GenericReaders {
-  private GenericReaders() {
-  }
+  private GenericReaders() {}
 
   static ValueReader dates() {
     return DateReader.INSTANCE;
@@ -54,15 +52,15 @@ static ValueReader timestamptz() {
     return TimestamptzReader.INSTANCE;
   }
 
-  static ValueReader struct(StructType struct, List> readers, Map idToConstant) {
+  static ValueReader struct(
+      StructType struct, List> readers, Map idToConstant) {
     return new GenericRecordReader(readers, struct, idToConstant);
   }
 
   private static class DateReader implements ValueReader {
     private static final DateReader INSTANCE = new DateReader();
 
-    private DateReader() {
-    }
+    private DateReader() {}
 
     @Override
     public LocalDate read(Decoder decoder, Object reuse) throws IOException {
@@ -73,8 +71,7 @@ public LocalDate read(Decoder decoder, Object reuse) throws IOException {
   private static class TimeReader implements ValueReader {
     private static final TimeReader INSTANCE = new TimeReader();
 
-    private TimeReader() {
-    }
+    private TimeReader() {}
 
     @Override
     public LocalTime read(Decoder decoder, Object reuse) throws IOException {
@@ -85,8 +82,7 @@ public LocalTime read(Decoder decoder, Object reuse) throws IOException {
   private static class TimestampReader implements ValueReader {
     private static final TimestampReader INSTANCE = new TimestampReader();
 
-    private TimestampReader() {
-    }
+    private TimestampReader() {}
 
     @Override
     public LocalDateTime read(Decoder decoder, Object reuse) throws IOException {
@@ -97,8 +93,7 @@ public LocalDateTime read(Decoder decoder, Object reuse) throws IOException {
   private static class TimestamptzReader implements ValueReader {
     private static final TimestamptzReader INSTANCE = new TimestamptzReader();
 
-    private TimestamptzReader() {
-    }
+    private TimestamptzReader() {}
 
     @Override
     public OffsetDateTime read(Decoder decoder, Object reuse) throws IOException {
@@ -109,7 +104,8 @@ public OffsetDateTime read(Decoder decoder, Object reuse) throws IOException {
   private static class GenericRecordReader extends ValueReaders.StructReader {
     private final StructType structType;
 
-    private GenericRecordReader(List> readers, StructType struct, Map idToConstant) {
+    private GenericRecordReader(
+        List> readers, StructType struct, Map idToConstant) {
       super(readers, struct, idToConstant);
       this.structType = struct;
     }
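
Reviewer note: only constructors and signatures change in GenericReaders, but for context on what these readers decode: in this module dates are stored as days from the Unix epoch and times/timestamps as microseconds (see the date/timeMicros/timestampMicros logical types in TypeToSchema earlier in this diff). A small java.time sketch of those conversions, with illustrative values; the production code routes them through DateTimeUtil.

```java
import java.time.Instant;
import java.time.LocalDate;
import java.time.LocalDateTime;
import java.time.LocalTime;
import java.time.OffsetDateTime;
import java.time.ZoneOffset;
import java.time.temporal.ChronoUnit;

public class IcebergTemporalEncodingSketch {
  public static void main(String[] args) {
    // date: days from 1970-01-01 (Avro int with the `date` logical type)
    LocalDate date = LocalDate.ofEpochDay(19358); // 2023-01-01

    // time: microseconds from midnight (Avro long with `time-micros`)
    long timeMicros = 37_230_000_000L; // 10:20:30
    LocalTime time = LocalTime.ofNanoOfDay(timeMicros * 1_000);

    // timestamp / timestamptz: microseconds from the epoch (Avro long with `timestamp-micros`)
    long tsMicros = 1_672_531_200_000_000L; // 2023-01-01T00:00:00Z
    OffsetDateTime timestamptz =
        Instant.EPOCH.plus(tsMicros, ChronoUnit.MICROS).atOffset(ZoneOffset.UTC);
    LocalDateTime timestamp = timestamptz.toLocalDateTime();

    System.out.println(date + " | " + time + " | " + timestamp + " | " + timestamptz);
  }
}
```
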
diff --git a/core/src/main/java/org/apache/iceberg/data/avro/GenericWriters.java b/core/src/main/java/org/apache/iceberg/data/avro/GenericWriters.java
index f8638946ccaf..1cea012e7a37 100644
--- a/core/src/main/java/org/apache/iceberg/data/avro/GenericWriters.java
+++ b/core/src/main/java/org/apache/iceberg/data/avro/GenericWriters.java
@@ -16,7 +16,6 @@
  * specific language governing permissions and limitations
  * under the License.
  */
-
 package org.apache.iceberg.data.avro;
 
 import java.io.IOException;
@@ -34,8 +33,7 @@
 import org.apache.iceberg.data.Record;
 
 class GenericWriters {
-  private GenericWriters() {
-  }
+  private GenericWriters() {}
 
   static ValueWriter dates() {
     return DateWriter.INSTANCE;
@@ -63,8 +61,7 @@ static ValueWriter struct(List> writers) {
   private static class DateWriter implements ValueWriter {
     private static final DateWriter INSTANCE = new DateWriter();
 
-    private DateWriter() {
-    }
+    private DateWriter() {}
 
     @Override
     public void write(LocalDate date, Encoder encoder) throws IOException {
@@ -75,8 +72,7 @@ public void write(LocalDate date, Encoder encoder) throws IOException {
   private static class TimeWriter implements ValueWriter {
     private static final TimeWriter INSTANCE = new TimeWriter();
 
-    private TimeWriter() {
-    }
+    private TimeWriter() {}
 
     @Override
     public void write(LocalTime time, Encoder encoder) throws IOException {
@@ -87,8 +83,7 @@ public void write(LocalTime time, Encoder encoder) throws IOException {
   private static class TimestampWriter implements ValueWriter {
     private static final TimestampWriter INSTANCE = new TimestampWriter();
 
-    private TimestampWriter() {
-    }
+    private TimestampWriter() {}
 
     @Override
     public void write(LocalDateTime timestamp, Encoder encoder) throws IOException {
@@ -99,8 +94,7 @@ public void write(LocalDateTime timestamp, Encoder encoder) throws IOException {
   private static class TimestamptzWriter implements ValueWriter {
     private static final TimestamptzWriter INSTANCE = new TimestamptzWriter();
 
-    private TimestamptzWriter() {
-    }
+    private TimestamptzWriter() {}
 
     @Override
     public void write(OffsetDateTime timestamptz, Encoder encoder) throws IOException {
diff --git a/core/src/main/java/org/apache/iceberg/data/avro/IcebergDecoder.java b/core/src/main/java/org/apache/iceberg/data/avro/IcebergDecoder.java
index 57f0e2c6a3cf..7eca98a1cfe7 100644
--- a/core/src/main/java/org/apache/iceberg/data/avro/IcebergDecoder.java
+++ b/core/src/main/java/org/apache/iceberg/data/avro/IcebergDecoder.java
@@ -16,7 +16,6 @@
  * specific language governing permissions and limitations
  * under the License.
  */
-
 package org.apache.iceberg.data.avro;
 
 import java.io.IOException;
@@ -44,26 +43,27 @@ public class IcebergDecoder extends MessageDecoder.BaseDecoder {
       ThreadLocal.withInitial(() -> new byte[10]);
 
   private static final ThreadLocal FP_BUFFER =
-      ThreadLocal.withInitial(() -> {
-        byte[] header = HEADER_BUFFER.get();
-        return ByteBuffer.wrap(header).order(ByteOrder.LITTLE_ENDIAN);
-      });
+      ThreadLocal.withInitial(
+          () -> {
+            byte[] header = HEADER_BUFFER.get();
+            return ByteBuffer.wrap(header).order(ByteOrder.LITTLE_ENDIAN);
+          });
 
   private final org.apache.iceberg.Schema readSchema;
   private final SchemaStore resolver;
   private final Map> decoders = new MapMaker().makeMap();
 
   /**
-   * Creates a new decoder that constructs datum instances described by an
-   * {@link org.apache.iceberg.Schema Iceberg schema}.
-   * 

- * The {@code readSchema} is as used the expected schema (read schema). Datum instances created + * Creates a new decoder that constructs datum instances described by an {@link + * org.apache.iceberg.Schema Iceberg schema}. + * + *

The {@code readSchema} is as used the expected schema (read schema). Datum instances created * by this class will are described by the expected schema. - *

- * The schema used to decode incoming buffers is determined by the schema fingerprint encoded in - * the message header. This class can decode messages that were encoded using the - * {@code readSchema} and other schemas that are added using - * {@link #addSchema(org.apache.iceberg.Schema)}. + * + *

The schema used to decode incoming buffers is determined by the schema fingerprint encoded + * in the message header. This class can decode messages that were encoded using the {@code + * readSchema} and other schemas that are added using {@link + * #addSchema(org.apache.iceberg.Schema)}. * * @param readSchema the schema used to construct datum instances */ @@ -72,18 +72,18 @@ public IcebergDecoder(org.apache.iceberg.Schema readSchema) { } /** - * Creates a new decoder that constructs datum instances described by an - * {@link org.apache.iceberg.Schema Iceberg schema}. - *

- * The {@code readSchema} is as used the expected schema (read schema). Datum instances created + * Creates a new decoder that constructs datum instances described by an {@link + * org.apache.iceberg.Schema Iceberg schema}. + * + *

The {@code readSchema} is as used the expected schema (read schema). Datum instances created * by this class will are described by the expected schema. - *

- * The schema used to decode incoming buffers is determined by the schema fingerprint encoded in - * the message header. This class can decode messages that were encoded using the - * {@code readSchema} and other schemas that are added using - * {@link #addSchema(org.apache.iceberg.Schema)}. - *

- * Schemas may also be returned from an Avro {@link SchemaStore}. Avro Schemas from the store + * + *

The schema used to decode incoming buffers is determined by the schema fingerprint encoded + * in the message header. This class can decode messages that were encoded using the {@code + * readSchema} and other schemas that are added using {@link + * #addSchema(org.apache.iceberg.Schema)}. + * + *

Schemas may also be returned from an Avro {@link SchemaStore}. Avro Schemas from the store * must be compatible with Iceberg and should contain id properties and use only Iceberg types. * * @param readSchema the {@link Schema} used to construct datum instances @@ -123,8 +123,7 @@ private RawDecoder getDecoder(long fp) { } } - throw new MissingSchemaException( - "Cannot resolve schema for fingerprint: " + fp); + throw new MissingSchemaException("Cannot resolve schema for fingerprint: " + fp); } @Override @@ -139,9 +138,8 @@ public D decode(InputStream stream, D reuse) throws IOException { } if (IcebergEncoder.V1_HEADER[0] != header[0] || IcebergEncoder.V1_HEADER[1] != header[1]) { - throw new BadHeaderException(String.format( - "Unrecognized header bytes: 0x%02X 0x%02X", - header[0], header[1])); + throw new BadHeaderException( + String.format("Unrecognized header bytes: 0x%02X 0x%02X", header[0], header[1])); } RawDecoder decoder = getDecoder(FP_BUFFER.get().getLong(2)); @@ -155,10 +153,10 @@ private static class RawDecoder extends MessageDecoder.BaseDecoder { private final DatumReader reader; /** - * Creates a new {@link MessageDecoder} that constructs datum instances described by the - * {@link Schema readSchema}. - *

- * The {@code readSchema} is used for the expected schema and the {@code writeSchema} is the + * Creates a new {@link MessageDecoder} that constructs datum instances described by the {@link + * Schema readSchema}. + * + *

The {@code readSchema} is used for the expected schema and the {@code writeSchema} is the * schema used to decode buffers. The {@code writeSchema} must be the schema that was used to * encode all buffers decoded by this class. * @@ -166,9 +164,12 @@ private static class RawDecoder extends MessageDecoder.BaseDecoder { * @param writeSchema the schema used to decode buffers */ private RawDecoder(org.apache.iceberg.Schema readSchema, Schema writeSchema) { - this.reader = new ProjectionDatumReader<>( - avroSchema -> DataReader.create(readSchema, avroSchema), - readSchema, ImmutableMap.of(), null); + this.reader = + new ProjectionDatumReader<>( + avroSchema -> DataReader.create(readSchema, avroSchema), + readSchema, + ImmutableMap.of(), + null); this.reader.setSchema(writeSchema); } @@ -193,12 +194,11 @@ public D decode(InputStream stream, D reuse) { * @throws IOException if there is an error while reading */ @SuppressWarnings("checkstyle:InnerAssignment") - private boolean readFully(InputStream stream, byte[] bytes) - throws IOException { + private boolean readFully(InputStream stream, byte[] bytes) throws IOException { int pos = 0; int bytesRead; - while ((bytes.length - pos) > 0 && - (bytesRead = stream.read(bytes, pos, bytes.length - pos)) > 0) { + while ((bytes.length - pos) > 0 + && (bytesRead = stream.read(bytes, pos, bytes.length - pos)) > 0) { pos += bytesRead; } return pos == bytes.length; diff --git a/core/src/main/java/org/apache/iceberg/data/avro/IcebergEncoder.java b/core/src/main/java/org/apache/iceberg/data/avro/IcebergEncoder.java index 9b1fe2d3ed1a..a70e79aac2c5 100644 --- a/core/src/main/java/org/apache/iceberg/data/avro/IcebergEncoder.java +++ b/core/src/main/java/org/apache/iceberg/data/avro/IcebergEncoder.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.data.avro; import java.io.ByteArrayOutputStream; @@ -41,8 +40,7 @@ public class IcebergEncoder implements MessageEncoder { private static final ThreadLocal TEMP = ThreadLocal.withInitial(BufferOutputStream::new); - private static final ThreadLocal ENCODER = - new ThreadLocal<>(); + private static final ThreadLocal ENCODER = new ThreadLocal<>(); private final byte[] headerBytes; private final boolean copyOutputBytes; @@ -51,8 +49,8 @@ public class IcebergEncoder implements MessageEncoder { /** * Creates a new {@link MessageEncoder} that will deconstruct datum instances described by the * {@link Schema schema}. - *

- * Buffers returned by {@code encode} are copied and will not be modified by future calls to + * + *

Buffers returned by {@code encode} are copied and will not be modified by future calls to * {@code encode}. * * @param schema the {@link Schema} for datum instances @@ -64,11 +62,11 @@ public IcebergEncoder(Schema schema) { /** * Creates a new {@link MessageEncoder} that will deconstruct datum instances described by the * {@link Schema schema}. - *

- * If {@code shouldCopy} is true, then buffers returned by {@code encode} are copied and will + * + *

If {@code shouldCopy} is true, then buffers returned by {@code encode} are copied and will * not be modified by future calls to {@code encode}. - *

- * If {@code shouldCopy} is false, then buffers returned by {@code encode} wrap a thread-local + * + *

If {@code shouldCopy} is false, then buffers returned by {@code encode} wrap a thread-local * buffer that can be reused by future calls to {@code encode}, but may not be. Callers should * only set {@code shouldCopy} to false if the buffer will be copied before the current thread's * next call to {@code encode}. @@ -81,7 +79,6 @@ public IcebergEncoder(Schema schema, boolean shouldCopy) { org.apache.avro.Schema avroSchema = AvroSchemaUtil.convert(schema, "table"); this.writer = DataWriter.create(avroSchema); this.headerBytes = getWriteHeader(avroSchema); - } @Override @@ -108,8 +105,7 @@ public void encode(D datum, OutputStream stream) throws IOException { } private static class BufferOutputStream extends ByteArrayOutputStream { - BufferOutputStream() { - } + BufferOutputStream() {} ByteBuffer toBufferWithoutCopy() { return ByteBuffer.wrap(buf, 0, count); diff --git a/core/src/main/java/org/apache/iceberg/deletes/BitmapPositionDeleteIndex.java b/core/src/main/java/org/apache/iceberg/deletes/BitmapPositionDeleteIndex.java index a41f1a7d643b..7690ab7e4879 100644 --- a/core/src/main/java/org/apache/iceberg/deletes/BitmapPositionDeleteIndex.java +++ b/core/src/main/java/org/apache/iceberg/deletes/BitmapPositionDeleteIndex.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.deletes; import org.roaringbitmap.longlong.Roaring64Bitmap; diff --git a/core/src/main/java/org/apache/iceberg/deletes/Deletes.java b/core/src/main/java/org/apache/iceberg/deletes/Deletes.java index 6d6aba2bbbe3..bd4e03916a01 100644 --- a/core/src/main/java/org/apache/iceberg/deletes/Deletes.java +++ b/core/src/main/java/org/apache/iceberg/deletes/Deletes.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.deletes; import java.io.IOException; @@ -42,44 +41,46 @@ import org.apache.iceberg.util.StructLikeSet; public class Deletes { - private static final Schema POSITION_DELETE_SCHEMA = new Schema( - MetadataColumns.DELETE_FILE_PATH, - MetadataColumns.DELETE_FILE_POS - ); + private static final Schema POSITION_DELETE_SCHEMA = + new Schema(MetadataColumns.DELETE_FILE_PATH, MetadataColumns.DELETE_FILE_POS); - private static final Accessor FILENAME_ACCESSOR = POSITION_DELETE_SCHEMA - .accessorForField(MetadataColumns.DELETE_FILE_PATH.fieldId()); - private static final Accessor POSITION_ACCESSOR = POSITION_DELETE_SCHEMA - .accessorForField(MetadataColumns.DELETE_FILE_POS.fieldId()); + private static final Accessor FILENAME_ACCESSOR = + POSITION_DELETE_SCHEMA.accessorForField(MetadataColumns.DELETE_FILE_PATH.fieldId()); + private static final Accessor POSITION_ACCESSOR = + POSITION_DELETE_SCHEMA.accessorForField(MetadataColumns.DELETE_FILE_POS.fieldId()); - private Deletes() { - } + private Deletes() {} - public static CloseableIterable filter(CloseableIterable rows, Function rowToDeleteKey, - StructLikeSet deleteSet) { + public static CloseableIterable filter( + CloseableIterable rows, Function rowToDeleteKey, StructLikeSet deleteSet) { if (deleteSet.isEmpty()) { return rows; } - EqualitySetDeleteFilter equalityFilter = new EqualitySetDeleteFilter<>(rowToDeleteKey, deleteSet); + EqualitySetDeleteFilter equalityFilter = + new EqualitySetDeleteFilter<>(rowToDeleteKey, deleteSet); return equalityFilter.filter(rows); } - public static CloseableIterable markDeleted(CloseableIterable rows, Predicate isDeleted, - Consumer deleteMarker) { - return CloseableIterable.transform(rows, row -> { - if (isDeleted.test(row)) { - deleteMarker.accept(row); - } - return row; - }); + public static CloseableIterable markDeleted( + CloseableIterable rows, Predicate isDeleted, Consumer deleteMarker) { + return CloseableIterable.transform( + rows, + row -> { + if (isDeleted.test(row)) { + deleteMarker.accept(row); + } + return row; + }); } - public static CloseableIterable filterDeleted(CloseableIterable rows, Predicate isDeleted) { + public static CloseableIterable filterDeleted( + CloseableIterable rows, Predicate isDeleted) { return CloseableIterable.filter(rows, isDeleted.negate()); } - public static StructLikeSet toEqualitySet(CloseableIterable eqDeletes, Types.StructType eqType) { + public static StructLikeSet toEqualitySet( + CloseableIterable eqDeletes, Types.StructType eqType) { try (CloseableIterable deletes = eqDeletes) { StructLikeSet deleteSet = StructLikeSet.create(eqType); Iterables.addAll(deleteSet, deletes); @@ -89,11 +90,15 @@ public static StructLikeSet toEqualitySet(CloseableIterable eqDelete } } - public static PositionDeleteIndex toPositionIndex(CharSequence dataLocation, - List> deleteFiles) { + public static PositionDeleteIndex toPositionIndex( + CharSequence dataLocation, List> deleteFiles) { DataFileFilter locationFilter = new DataFileFilter<>(dataLocation); - List> positions = Lists.transform(deleteFiles, deletes -> - CloseableIterable.transform(locationFilter.filter(deletes), row -> (Long) POSITION_ACCESSOR.get(row))); + List> positions = + Lists.transform( + deleteFiles, + deletes -> + CloseableIterable.transform( + locationFilter.filter(deletes), row -> (Long) POSITION_ACCESSOR.get(row))); return toPositionIndex(CloseableIterable.concat(positions)); } @@ -107,29 +112,35 @@ public static PositionDeleteIndex toPositionIndex(CloseableIterable posDel } } - public 
static CloseableIterable streamingFilter(CloseableIterable rows, - Function rowToPosition, - CloseableIterable posDeletes) { + public static CloseableIterable streamingFilter( + CloseableIterable rows, + Function rowToPosition, + CloseableIterable posDeletes) { return new PositionStreamDeleteFilter<>(rows, rowToPosition, posDeletes); } - public static CloseableIterable streamingMarker(CloseableIterable rows, - Function rowToPosition, - CloseableIterable posDeletes, - Consumer markDeleted) { + public static CloseableIterable streamingMarker( + CloseableIterable rows, + Function rowToPosition, + CloseableIterable posDeletes, + Consumer markDeleted) { return new PositionStreamDeleteMarker<>(rows, rowToPosition, posDeletes, markDeleted); } - public static CloseableIterable deletePositions(CharSequence dataLocation, - CloseableIterable deleteFile) { + public static CloseableIterable deletePositions( + CharSequence dataLocation, CloseableIterable deleteFile) { return deletePositions(dataLocation, ImmutableList.of(deleteFile)); } - public static CloseableIterable deletePositions(CharSequence dataLocation, - List> deleteFiles) { + public static CloseableIterable deletePositions( + CharSequence dataLocation, List> deleteFiles) { DataFileFilter locationFilter = new DataFileFilter<>(dataLocation); - List> positions = Lists.transform(deleteFiles, deletes -> - CloseableIterable.transform(locationFilter.filter(deletes), row -> (Long) POSITION_ACCESSOR.get(row))); + List> positions = + Lists.transform( + deleteFiles, + deletes -> + CloseableIterable.transform( + locationFilter.filter(deletes), row -> (Long) POSITION_ACCESSOR.get(row))); return new SortedMerge<>(Long::compare, positions); } @@ -138,8 +149,7 @@ private static class EqualitySetDeleteFilter extends Filter { private final StructLikeSet deletes; private final Function extractEqStruct; - protected EqualitySetDeleteFilter(Function extractEq, - StructLikeSet deletes) { + protected EqualitySetDeleteFilter(Function extractEq, StructLikeSet deletes) { this.extractEqStruct = extractEq; this.deletes = deletes; } @@ -150,14 +160,17 @@ protected boolean shouldKeep(T row) { } } - private abstract static class PositionStreamDeleteIterable extends CloseableGroup implements CloseableIterable { + private abstract static class PositionStreamDeleteIterable extends CloseableGroup + implements CloseableIterable { private final CloseableIterable rows; private final CloseableIterator deletePosIterator; private final Function rowToPosition; private long nextDeletePos; - PositionStreamDeleteIterable(CloseableIterable rows, Function rowToPosition, - CloseableIterable deletePositions) { + PositionStreamDeleteIterable( + CloseableIterable rows, + Function rowToPosition, + CloseableIterable deletePositions) { this.rows = rows; this.rowToPosition = rowToPosition; this.deletePosIterator = deletePositions.iterator(); @@ -202,8 +215,10 @@ boolean isDeleted(T row) { } private static class PositionStreamDeleteFilter extends PositionStreamDeleteIterable { - private PositionStreamDeleteFilter(CloseableIterable rows, Function rowToPosition, - CloseableIterable deletePositions) { + private PositionStreamDeleteFilter( + CloseableIterable rows, + Function rowToPosition, + CloseableIterable deletePositions) { super(rows, rowToPosition, deletePositions); } @@ -221,20 +236,25 @@ protected boolean shouldKeep(T item) { private static class PositionStreamDeleteMarker extends PositionStreamDeleteIterable { private final Consumer markDeleted; - PositionStreamDeleteMarker(CloseableIterable 
rows, Function rowToPosition, - CloseableIterable deletePositions, Consumer markDeleted) { + PositionStreamDeleteMarker( + CloseableIterable rows, + Function rowToPosition, + CloseableIterable deletePositions, + Consumer markDeleted) { super(rows, rowToPosition, deletePositions); this.markDeleted = markDeleted; } @Override protected CloseableIterator applyDelete(CloseableIterator items) { - return CloseableIterator.transform(items, row -> { - if (isDeleted(row)) { - markDeleted.accept(row); - } - return row; - }); + return CloseableIterator.transform( + items, + row -> { + if (isDeleted(row)) { + markDeleted.accept(row); + } + return row; + }); } } @@ -264,7 +284,8 @@ private boolean charSeqEquals(CharSequence s1, CharSequence s2) { return false; } - // File paths inside a delete file normally have more identical chars at the beginning. For example, a typical + // File paths inside a delete file normally have more identical chars at the beginning. For + // example, a typical // path is like "s3:/bucket/db/table/data/partition/00000-0-[uuid]-00001.parquet". // The uuid is where the difference starts. So it's faster to find the first diff backward. for (int i = count - 1; i >= 0; i--) { diff --git a/core/src/main/java/org/apache/iceberg/deletes/EqualityDeleteWriter.java b/core/src/main/java/org/apache/iceberg/deletes/EqualityDeleteWriter.java index c914ad224f30..be889b2993f2 100644 --- a/core/src/main/java/org/apache/iceberg/deletes/EqualityDeleteWriter.java +++ b/core/src/main/java/org/apache/iceberg/deletes/EqualityDeleteWriter.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.deletes; import java.io.IOException; @@ -44,9 +43,15 @@ public class EqualityDeleteWriter implements FileWriter private final SortOrder sortOrder; private DeleteFile deleteFile = null; - public EqualityDeleteWriter(FileAppender appender, FileFormat format, String location, - PartitionSpec spec, StructLike partition, EncryptionKeyMetadata keyMetadata, - SortOrder sortOrder, int... equalityFieldIds) { + public EqualityDeleteWriter( + FileAppender appender, + FileFormat format, + String location, + PartitionSpec spec, + StructLike partition, + EncryptionKeyMetadata keyMetadata, + SortOrder sortOrder, + int... 
equalityFieldIds) { this.appender = appender; this.format = format; this.location = location; @@ -91,16 +96,17 @@ public long length() { public void close() throws IOException { if (deleteFile == null) { appender.close(); - this.deleteFile = FileMetadata.deleteFileBuilder(spec) - .ofEqualityDeletes(equalityFieldIds) - .withFormat(format) - .withPath(location) - .withPartition(partition) - .withEncryptionKeyMetadata(keyMetadata) - .withFileSizeInBytes(appender.length()) - .withMetrics(appender.metrics()) - .withSortOrder(sortOrder) - .build(); + this.deleteFile = + FileMetadata.deleteFileBuilder(spec) + .ofEqualityDeletes(equalityFieldIds) + .withFormat(format) + .withPath(location) + .withPartition(partition) + .withEncryptionKeyMetadata(keyMetadata) + .withFileSizeInBytes(appender.length()) + .withMetrics(appender.metrics()) + .withSortOrder(sortOrder) + .build(); } } diff --git a/core/src/main/java/org/apache/iceberg/deletes/PositionDelete.java b/core/src/main/java/org/apache/iceberg/deletes/PositionDelete.java index cac04ca2eba0..655428ce7713 100644 --- a/core/src/main/java/org/apache/iceberg/deletes/PositionDelete.java +++ b/core/src/main/java/org/apache/iceberg/deletes/PositionDelete.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.deletes; import org.apache.iceberg.StructLike; @@ -30,8 +29,7 @@ public static PositionDelete create() { private long pos; private R row; - private PositionDelete() { - } + private PositionDelete() {} public PositionDelete set(CharSequence newPath, long newPos, R newRow) { this.path = newPath; diff --git a/core/src/main/java/org/apache/iceberg/deletes/PositionDeleteIndex.java b/core/src/main/java/org/apache/iceberg/deletes/PositionDeleteIndex.java index ced11b8acd19..bcfa9f2cf5ff 100644 --- a/core/src/main/java/org/apache/iceberg/deletes/PositionDeleteIndex.java +++ b/core/src/main/java/org/apache/iceberg/deletes/PositionDeleteIndex.java @@ -16,18 +16,19 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.deletes; public interface PositionDeleteIndex { /** * Set a deleted row position. + * * @param position the deleted row position */ void delete(long position); /** * Set a range of deleted row positions. + * * @param posStart inclusive beginning of position range * @param posEnd exclusive ending of position range */ @@ -35,13 +36,12 @@ public interface PositionDeleteIndex { /** * Checks whether a row at the position is deleted. + * * @param position deleted row position * @return whether the position is deleted */ boolean isDeleted(long position); - /** - * Returns true if this collection contains no element. - */ + /** Returns true if this collection contains no element. */ boolean isEmpty(); } diff --git a/core/src/main/java/org/apache/iceberg/deletes/PositionDeleteWriter.java b/core/src/main/java/org/apache/iceberg/deletes/PositionDeleteWriter.java index a7dff07e7105..a540396537d5 100644 --- a/core/src/main/java/org/apache/iceberg/deletes/PositionDeleteWriter.java +++ b/core/src/main/java/org/apache/iceberg/deletes/PositionDeleteWriter.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.deletes; import java.io.IOException; @@ -44,8 +43,13 @@ public class PositionDeleteWriter implements FileWriter, De private final CharSequenceSet referencedDataFiles; private DeleteFile deleteFile = null; - public PositionDeleteWriter(FileAppender appender, FileFormat format, String location, - PartitionSpec spec, StructLike partition, EncryptionKeyMetadata keyMetadata) { + public PositionDeleteWriter( + FileAppender appender, + FileFormat format, + String location, + PartitionSpec spec, + StructLike partition, + EncryptionKeyMetadata keyMetadata) { this.appender = appender; this.format = format; this.location = location; @@ -65,7 +69,8 @@ public void write(PositionDelete positionDelete) { /** * Writes a position delete. * - * @deprecated since 0.13.0, will be removed in 0.14.0; use {@link #write(PositionDelete)} instead. + * @deprecated since 0.13.0, will be removed in 0.14.0; use {@link #write(PositionDelete)} + * instead. */ @Deprecated public void delete(CharSequence path, long pos) { @@ -75,7 +80,8 @@ public void delete(CharSequence path, long pos) { /** * Writes a position delete and persists the deleted row. * - * @deprecated since 0.13.0, will be removed in 0.14.0; use {@link #write(PositionDelete)} instead. + * @deprecated since 0.13.0, will be removed in 0.14.0; use {@link #write(PositionDelete)} + * instead. */ @Deprecated public void delete(CharSequence path, long pos, T row) { @@ -92,15 +98,16 @@ public long length() { public void close() throws IOException { if (deleteFile == null) { appender.close(); - this.deleteFile = FileMetadata.deleteFileBuilder(spec) - .ofPositionDeletes() - .withFormat(format) - .withPath(location) - .withPartition(partition) - .withEncryptionKeyMetadata(keyMetadata) - .withFileSizeInBytes(appender.length()) - .withMetrics(appender.metrics()) - .build(); + this.deleteFile = + FileMetadata.deleteFileBuilder(spec) + .ofPositionDeletes() + .withFormat(format) + .withPath(location) + .withPartition(partition) + .withEncryptionKeyMetadata(keyMetadata) + .withFileSizeInBytes(appender.length()) + .withMetrics(appender.metrics()) + .build(); } } diff --git a/core/src/main/java/org/apache/iceberg/encryption/BaseEncryptedInputFile.java b/core/src/main/java/org/apache/iceberg/encryption/BaseEncryptedInputFile.java index fb6836b0bc01..7905007be817 100644 --- a/core/src/main/java/org/apache/iceberg/encryption/BaseEncryptedInputFile.java +++ b/core/src/main/java/org/apache/iceberg/encryption/BaseEncryptedInputFile.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.encryption; import org.apache.iceberg.io.InputFile; diff --git a/core/src/main/java/org/apache/iceberg/encryption/BaseEncryptedOutputFile.java b/core/src/main/java/org/apache/iceberg/encryption/BaseEncryptedOutputFile.java index ab758b2c7171..999cd510af32 100644 --- a/core/src/main/java/org/apache/iceberg/encryption/BaseEncryptedOutputFile.java +++ b/core/src/main/java/org/apache/iceberg/encryption/BaseEncryptedOutputFile.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.encryption; import org.apache.iceberg.io.OutputFile; diff --git a/core/src/main/java/org/apache/iceberg/encryption/BaseEncryptionKeyMetadata.java b/core/src/main/java/org/apache/iceberg/encryption/BaseEncryptionKeyMetadata.java index 598edf274f60..d2d5e275b5c5 100644 --- a/core/src/main/java/org/apache/iceberg/encryption/BaseEncryptionKeyMetadata.java +++ b/core/src/main/java/org/apache/iceberg/encryption/BaseEncryptionKeyMetadata.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.encryption; import java.nio.ByteBuffer; @@ -51,7 +50,6 @@ public ByteBuffer buffer() { @Override public EncryptionKeyMetadata copy() { - return new BaseEncryptionKeyMetadata( - ByteBuffers.copy(keyMetadata)); + return new BaseEncryptionKeyMetadata(ByteBuffers.copy(keyMetadata)); } } diff --git a/core/src/main/java/org/apache/iceberg/encryption/Ciphers.java b/core/src/main/java/org/apache/iceberg/encryption/Ciphers.java index 11c09543f4ee..6eecefda472c 100644 --- a/core/src/main/java/org/apache/iceberg/encryption/Ciphers.java +++ b/core/src/main/java/org/apache/iceberg/encryption/Ciphers.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.encryption; import java.security.GeneralSecurityException; @@ -40,8 +39,11 @@ public static class AesGcmEncryptor { public AesGcmEncryptor(byte[] keyBytes) { Preconditions.checkArgument(keyBytes != null, "Key can't be null"); int keyLength = keyBytes.length; - Preconditions.checkArgument((keyLength == 16 || keyLength == 24 || keyLength == 32), - "Cannot use a key of length " + keyLength + " because AES only allows 16, 24 or 32 bytes"); + Preconditions.checkArgument( + (keyLength == 16 || keyLength == 24 || keyLength == 32), + "Cannot use a key of length " + + keyLength + + " because AES only allows 16, 24 or 32 bytes"); this.aesKey = new SecretKeySpec(keyBytes, "AES"); try { @@ -84,8 +86,11 @@ public static class AesGcmDecryptor { public AesGcmDecryptor(byte[] keyBytes) { Preconditions.checkArgument(keyBytes != null, "Key can't be null"); int keyLength = keyBytes.length; - Preconditions.checkArgument((keyLength == 16 || keyLength == 24 || keyLength == 32), - "Cannot use a key of length " + keyLength + " because AES only allows 16, 24 or 32 bytes"); + Preconditions.checkArgument( + (keyLength == 16 || keyLength == 24 || keyLength == 32), + "Cannot use a key of length " + + keyLength + + " because AES only allows 16, 24 or 32 bytes"); this.aesKey = new SecretKeySpec(keyBytes, "AES"); try { @@ -95,12 +100,14 @@ public AesGcmDecryptor(byte[] keyBytes) { } } - public byte[] decrypt(byte[] ciphertext, byte[] aad) { + public byte[] decrypt(byte[] ciphertext, byte[] aad) { int plainTextLength = ciphertext.length - GCM_TAG_LENGTH - NONCE_LENGTH; - Preconditions.checkState(plainTextLength >= 1, - "Cannot decrypt cipher text of length " + ciphertext.length + - " because text must longer than GCM_TAG_LENGTH + NONCE_LENGTH bytes. Text may not be encrypted" + - " with AES GCM cipher"); + Preconditions.checkState( + plainTextLength >= 1, + "Cannot decrypt cipher text of length " + + ciphertext.length + + " because text must longer than GCM_TAG_LENGTH + NONCE_LENGTH bytes. 
Text may not be encrypted" + + " with AES GCM cipher"); // Get the nonce from ciphertext byte[] nonce = new byte[NONCE_LENGTH]; @@ -115,9 +122,11 @@ public byte[] decrypt(byte[] ciphertext, byte[] aad) { cipher.updateAAD(aad); } cipher.doFinal(ciphertext, NONCE_LENGTH, inputLength, plainText, 0); - } catch (AEADBadTagException e) { - throw new RuntimeException("GCM tag check failed. Possible reasons: wrong decryption key; or corrupt/tampered" + - "data. AES GCM doesn't differentiate between these two.. ", e); + } catch (AEADBadTagException e) { + throw new RuntimeException( + "GCM tag check failed. Possible reasons: wrong decryption key; or corrupt/tampered" + + "data. AES GCM doesn't differentiate between these two.. ", + e); } catch (GeneralSecurityException e) { throw new RuntimeException("Failed to decrypt", e); } diff --git a/core/src/main/java/org/apache/iceberg/encryption/EncryptedFiles.java b/core/src/main/java/org/apache/iceberg/encryption/EncryptedFiles.java index cbc22830ccb7..c0fc41ca1385 100644 --- a/core/src/main/java/org/apache/iceberg/encryption/EncryptedFiles.java +++ b/core/src/main/java/org/apache/iceberg/encryption/EncryptedFiles.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.encryption; import java.nio.ByteBuffer; @@ -32,10 +31,12 @@ public static EncryptedInputFile encryptedInput( public static EncryptedInputFile encryptedInput( InputFile encryptedInputFile, ByteBuffer keyMetadata) { - return encryptedInput(encryptedInputFile, BaseEncryptionKeyMetadata.fromKeyMetadata(keyMetadata)); + return encryptedInput( + encryptedInputFile, BaseEncryptionKeyMetadata.fromKeyMetadata(keyMetadata)); } - public static EncryptedInputFile encryptedInput(InputFile encryptedInputFile, byte[] keyMetadata) { + public static EncryptedInputFile encryptedInput( + InputFile encryptedInputFile, byte[] keyMetadata) { return encryptedInput(encryptedInputFile, BaseEncryptionKeyMetadata.fromByteArray(keyMetadata)); } @@ -46,13 +47,15 @@ public static EncryptedOutputFile encryptedOutput( public static EncryptedOutputFile encryptedOutput( OutputFile encryptingOutputFile, ByteBuffer keyMetadata) { - return encryptedOutput(encryptingOutputFile, BaseEncryptionKeyMetadata.fromKeyMetadata(keyMetadata)); + return encryptedOutput( + encryptingOutputFile, BaseEncryptionKeyMetadata.fromKeyMetadata(keyMetadata)); } - public static EncryptedOutputFile encryptedOutput(OutputFile encryptedOutputFile, byte[] keyMetadata) { - return encryptedOutput(encryptedOutputFile, BaseEncryptionKeyMetadata.fromByteArray(keyMetadata)); + public static EncryptedOutputFile encryptedOutput( + OutputFile encryptedOutputFile, byte[] keyMetadata) { + return encryptedOutput( + encryptedOutputFile, BaseEncryptionKeyMetadata.fromByteArray(keyMetadata)); } - private EncryptedFiles() { - } + private EncryptedFiles() {} } diff --git a/core/src/main/java/org/apache/iceberg/encryption/EncryptionAlgorithm.java b/core/src/main/java/org/apache/iceberg/encryption/EncryptionAlgorithm.java index 650958c5a3b7..dd0a1d5d1be7 100644 --- a/core/src/main/java/org/apache/iceberg/encryption/EncryptionAlgorithm.java +++ b/core/src/main/java/org/apache/iceberg/encryption/EncryptionAlgorithm.java @@ -16,37 +16,28 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.encryption; -/** - * Algorithm supported for file encryption. - */ +/** Algorithm supported for file encryption. 
*/ public enum EncryptionAlgorithm { - /** - * Counter mode (CTR) allows fast encryption with high throughput. - * It is an encryption only cipher and does not ensure content integrity. - * Inputs to CTR cipher are: - * 1. encryption key - * 2. a 16-byte initialization vector (12-byte nonce, 4-byte counter) - * 3. plaintext data - */ - AES_CTR, - /** - * Galois/Counter mode (GCM) combines CTR with the new Galois mode of authentication. - * It not only ensures data confidentiality, but also ensures data integrity. - * Inputs to GCM cipher are: - * 1. encryption key - * 2. a 12-byte initialization vector - * 3. additional authenticated data - * 4. plaintext data - */ - AES_GCM, - /** - * A combination of GCM and CTR that can be used for file types like Parquet, - * so that all modules except pages are encrypted by GCM to ensure integrity, - * and CTR is used for efficient encryption of bulk data. - * The tradeoff is that attackers would be able to tamper page data encrypted with CTR. - */ - AES_GCM_CTR + /** + * Counter mode (CTR) allows fast encryption with high throughput. It is an encryption only cipher + * and does not ensure content integrity. Inputs to CTR cipher are: 1. encryption key 2. a 16-byte + * initialization vector (12-byte nonce, 4-byte counter) 3. plaintext data + */ + AES_CTR, + /** + * Galois/Counter mode (GCM) combines CTR with the new Galois mode of authentication. It not only + * ensures data confidentiality, but also ensures data integrity. Inputs to GCM cipher are: 1. + * encryption key 2. a 12-byte initialization vector 3. additional authenticated data 4. plaintext + * data + */ + AES_GCM, + /** + * A combination of GCM and CTR that can be used for file types like Parquet, so that all modules + * except pages are encrypted by GCM to ensure integrity, and CTR is used for efficient encryption + * of bulk data. The tradeoff is that attackers would be able to tamper page data encrypted with + * CTR. + */ + AES_GCM_CTR } diff --git a/core/src/main/java/org/apache/iceberg/encryption/EncryptionKeyMetadatas.java b/core/src/main/java/org/apache/iceberg/encryption/EncryptionKeyMetadatas.java index 5799a3aa8bb0..a473478b2ab9 100644 --- a/core/src/main/java/org/apache/iceberg/encryption/EncryptionKeyMetadatas.java +++ b/core/src/main/java/org/apache/iceberg/encryption/EncryptionKeyMetadatas.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.encryption; import java.nio.ByteBuffer; @@ -31,6 +30,5 @@ public static EncryptionKeyMetadata of(byte[] keyMetadata) { return BaseEncryptionKeyMetadata.fromByteArray(keyMetadata); } - private EncryptionKeyMetadatas() { - } + private EncryptionKeyMetadatas() {} } diff --git a/core/src/main/java/org/apache/iceberg/encryption/InputFilesDecryptor.java b/core/src/main/java/org/apache/iceberg/encryption/InputFilesDecryptor.java index 4987b2e7b771..158e61971da2 100644 --- a/core/src/main/java/org/apache/iceberg/encryption/InputFilesDecryptor.java +++ b/core/src/main/java/org/apache/iceberg/encryption/InputFilesDecryptor.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.encryption; import java.nio.ByteBuffer; @@ -34,13 +33,20 @@ public class InputFilesDecryptor { private final Map decryptedInputFiles; - public InputFilesDecryptor(CombinedScanTask combinedTask, FileIO io, EncryptionManager encryption) { + public InputFilesDecryptor( + CombinedScanTask combinedTask, FileIO io, EncryptionManager encryption) { Map keyMetadata = Maps.newHashMap(); combinedTask.files().stream() - .flatMap(fileScanTask -> Stream.concat(Stream.of(fileScanTask.file()), fileScanTask.deletes().stream())) + .flatMap( + fileScanTask -> + Stream.concat(Stream.of(fileScanTask.file()), fileScanTask.deletes().stream())) .forEach(file -> keyMetadata.put(file.path().toString(), file.keyMetadata())); - Stream encrypted = keyMetadata.entrySet().stream() - .map(entry -> EncryptedFiles.encryptedInput(io.newInputFile(entry.getKey()), entry.getValue())); + Stream encrypted = + keyMetadata.entrySet().stream() + .map( + entry -> + EncryptedFiles.encryptedInput( + io.newInputFile(entry.getKey()), entry.getValue())); // decrypt with the batch call to avoid multiple RPCs to a key server, if possible @SuppressWarnings("StreamToIterable") diff --git a/core/src/main/java/org/apache/iceberg/encryption/NativeFileCryptoParameters.java b/core/src/main/java/org/apache/iceberg/encryption/NativeFileCryptoParameters.java index c19ab2fcd759..f11506a1bbc9 100644 --- a/core/src/main/java/org/apache/iceberg/encryption/NativeFileCryptoParameters.java +++ b/core/src/main/java/org/apache/iceberg/encryption/NativeFileCryptoParameters.java @@ -16,22 +16,22 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.encryption; import java.nio.ByteBuffer; import org.apache.iceberg.relocated.com.google.common.base.Preconditions; /** - * Barebone encryption parameters, one object per content file. - * Carries the file encryption key (later, will be extended with column keys and AAD prefix). - * Applicable only to formats with native encryption support (Parquet and ORC). + * Barebone encryption parameters, one object per content file. Carries the file encryption key + * (later, will be extended with column keys and AAD prefix). Applicable only to formats with native + * encryption support (Parquet and ORC). */ public class NativeFileCryptoParameters { private ByteBuffer fileKey; private EncryptionAlgorithm fileEncryptionAlgorithm; - private NativeFileCryptoParameters(ByteBuffer fileKey, EncryptionAlgorithm fileEncryptionAlgorithm) { + private NativeFileCryptoParameters( + ByteBuffer fileKey, EncryptionAlgorithm fileEncryptionAlgorithm) { Preconditions.checkState(fileKey != null, "File encryption key is not supplied"); this.fileKey = fileKey; this.fileEncryptionAlgorithm = fileEncryptionAlgorithm; @@ -40,7 +40,8 @@ private NativeFileCryptoParameters(ByteBuffer fileKey, EncryptionAlgorithm fileE /** * Creates the builder. * - * @param fileKey per-file encryption key. For example, used as "footer key" DEK in Parquet encryption. + * @param fileKey per-file encryption key. For example, used as "footer key" DEK in Parquet + * encryption. 
*/ public static Builder create(ByteBuffer fileKey) { return new Builder(fileKey); diff --git a/core/src/main/java/org/apache/iceberg/encryption/NativelyEncryptedFile.java b/core/src/main/java/org/apache/iceberg/encryption/NativelyEncryptedFile.java index 2e0f403397dc..e55456591a64 100644 --- a/core/src/main/java/org/apache/iceberg/encryption/NativelyEncryptedFile.java +++ b/core/src/main/java/org/apache/iceberg/encryption/NativelyEncryptedFile.java @@ -16,13 +16,12 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.encryption; /** - * This interface is applied to OutputFile and InputFile implementations, in order to enable delivery of crypto - * parameters (such as encryption keys etc) from the Iceberg key management module to the writers/readers of file - * formats that support encryption natively (Parquet and ORC). + * This interface is applied to OutputFile and InputFile implementations, in order to enable + * delivery of crypto parameters (such as encryption keys etc) from the Iceberg key management + * module to the writers/readers of file formats that support encryption natively (Parquet and ORC). */ public interface NativelyEncryptedFile { NativeFileCryptoParameters nativeCryptoParameters(); diff --git a/core/src/main/java/org/apache/iceberg/encryption/PlaintextEncryptionManager.java b/core/src/main/java/org/apache/iceberg/encryption/PlaintextEncryptionManager.java index 59ac871ca878..4d8d8aa7aff9 100644 --- a/core/src/main/java/org/apache/iceberg/encryption/PlaintextEncryptionManager.java +++ b/core/src/main/java/org/apache/iceberg/encryption/PlaintextEncryptionManager.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.encryption; import java.nio.ByteBuffer; @@ -31,7 +30,8 @@ public class PlaintextEncryptionManager implements EncryptionManager { @Override public InputFile decrypt(EncryptedInputFile encrypted) { if (encrypted.keyMetadata().buffer() != null) { - LOG.warn("File encryption key metadata is present, but currently using PlaintextEncryptionManager."); + LOG.warn( + "File encryption key metadata is present, but currently using PlaintextEncryptionManager."); } return encrypted.encryptedInputFile(); } diff --git a/core/src/main/java/org/apache/iceberg/events/CreateSnapshotEvent.java b/core/src/main/java/org/apache/iceberg/events/CreateSnapshotEvent.java index 7d671e728074..7872f9bf257b 100644 --- a/core/src/main/java/org/apache/iceberg/events/CreateSnapshotEvent.java +++ b/core/src/main/java/org/apache/iceberg/events/CreateSnapshotEvent.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.events; import java.util.Map; @@ -29,7 +28,11 @@ public final class CreateSnapshotEvent { private final Map summary; public CreateSnapshotEvent( - String tableName, String operation, long snapshotId, long sequenceNumber, Map summary) { + String tableName, + String operation, + long snapshotId, + long sequenceNumber, + Map summary) { this.tableName = tableName; this.operation = operation; this.snapshotId = snapshotId; diff --git a/core/src/main/java/org/apache/iceberg/expressions/Zorder.java b/core/src/main/java/org/apache/iceberg/expressions/Zorder.java index 274e68617033..347ad01ab9bd 100644 --- a/core/src/main/java/org/apache/iceberg/expressions/Zorder.java +++ b/core/src/main/java/org/apache/iceberg/expressions/Zorder.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.expressions; import java.util.Arrays; diff --git a/core/src/main/java/org/apache/iceberg/hadoop/ConfigProperties.java b/core/src/main/java/org/apache/iceberg/hadoop/ConfigProperties.java index 58e5d63c70bb..a8bc21af0661 100644 --- a/core/src/main/java/org/apache/iceberg/hadoop/ConfigProperties.java +++ b/core/src/main/java/org/apache/iceberg/hadoop/ConfigProperties.java @@ -16,13 +16,11 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.hadoop; public class ConfigProperties { - private ConfigProperties() { - } + private ConfigProperties() {} public static final String ENGINE_HIVE_ENABLED = "iceberg.engine.hive.enabled"; public static final String KEEP_HIVE_STATS = "iceberg.hive.keep.stats"; diff --git a/core/src/main/java/org/apache/iceberg/hadoop/Configurable.java b/core/src/main/java/org/apache/iceberg/hadoop/Configurable.java index 38b8c69c889f..2d416670341c 100644 --- a/core/src/main/java/org/apache/iceberg/hadoop/Configurable.java +++ b/core/src/main/java/org/apache/iceberg/hadoop/Configurable.java @@ -16,12 +16,9 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.hadoop; -/** - * Interface used to avoid runtime dependencies on Hadoop Configurable - */ +/** Interface used to avoid runtime dependencies on Hadoop Configurable */ public interface Configurable { void setConf(C conf); } diff --git a/core/src/main/java/org/apache/iceberg/hadoop/HadoopCatalog.java b/core/src/main/java/org/apache/iceberg/hadoop/HadoopCatalog.java index 5eeb0ad7fa7e..dd542a8af327 100644 --- a/core/src/main/java/org/apache/iceberg/hadoop/HadoopCatalog.java +++ b/core/src/main/java/org/apache/iceberg/hadoop/HadoopCatalog.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.hadoop; import java.io.Closeable; @@ -64,25 +63,28 @@ import org.slf4j.LoggerFactory; /** - * HadoopCatalog provides a way to use table names like db.table to work with path-based tables under a common - * location. It uses a specified directory under a specified filesystem as the warehouse directory, and organizes - * multiple levels directories that mapped to the database, namespace and the table respectively. The HadoopCatalog - * takes a location as the warehouse directory. When creating a table such as $db.$tbl, it creates $db/$tbl - * directory under the warehouse directory, and put the table metadata into that directory. + * HadoopCatalog provides a way to use table names like db.table to work with path-based tables + * under a common location. 
It uses a specified directory under a specified filesystem as the + * warehouse directory, and organizes multiple levels directories that mapped to the database, + * namespace and the table respectively. The HadoopCatalog takes a location as the warehouse + * directory. When creating a table such as $db.$tbl, it creates $db/$tbl directory under the + * warehouse directory, and put the table metadata into that directory. * - * The HadoopCatalog now supports {@link org.apache.iceberg.catalog.Catalog#createTable}, - * {@link org.apache.iceberg.catalog.Catalog#dropTable}, the {@link org.apache.iceberg.catalog.Catalog#renameTable} - * is not supported yet. + *
<p>
The HadoopCatalog now supports {@link org.apache.iceberg.catalog.Catalog#createTable}, {@link + * org.apache.iceberg.catalog.Catalog#dropTable}, the {@link + * org.apache.iceberg.catalog.Catalog#renameTable} is not supported yet. * - * Note: The HadoopCatalog requires that the underlying file system supports atomic rename. + *
<p>
Note: The HadoopCatalog requires that the underlying file system supports atomic rename. */ -public class HadoopCatalog extends BaseMetastoreCatalog implements Closeable, SupportsNamespaces, Configurable { +public class HadoopCatalog extends BaseMetastoreCatalog + implements Closeable, SupportsNamespaces, Configurable { private static final Logger LOG = LoggerFactory.getLogger(HadoopCatalog.class); private static final String TABLE_METADATA_FILE_EXTENSION = ".metadata.json"; private static final Joiner SLASH = Joiner.on("/"); - private static final PathFilter TABLE_FILTER = path -> path.getName().endsWith(TABLE_METADATA_FILE_EXTENSION); + private static final PathFilter TABLE_FILTER = + path -> path.getName().endsWith(TABLE_METADATA_FILE_EXTENSION); private static final String HADOOP_SUPPRESS_PERMISSION_ERROR = "suppress-permission-error"; private String catalogName; @@ -95,14 +97,14 @@ public class HadoopCatalog extends BaseMetastoreCatalog implements Closeable, Su private boolean suppressPermissionError = false; private Map catalogProperties; - public HadoopCatalog() { - } + public HadoopCatalog() {} @Override public void initialize(String name, Map properties) { this.catalogProperties = ImmutableMap.copyOf(properties); String inputWarehouseLocation = properties.get(CatalogProperties.WAREHOUSE_LOCATION); - Preconditions.checkArgument(inputWarehouseLocation != null && inputWarehouseLocation.length() > 0, + Preconditions.checkArgument( + inputWarehouseLocation != null && inputWarehouseLocation.length() > 0, "Cannot initialize HadoopCatalog because warehousePath must not be null or empty"); this.catalogName = name; @@ -110,7 +112,10 @@ public void initialize(String name, Map properties) { this.fs = Util.getFs(new Path(warehouseLocation), conf); String fileIOImpl = properties.get(CatalogProperties.FILE_IO_IMPL); - this.fileIO = fileIOImpl == null ? new HadoopFileIO(conf) : CatalogUtil.loadFileIO(fileIOImpl, properties, conf); + this.fileIO = + fileIOImpl == null + ? 
new HadoopFileIO(conf) + : CatalogUtil.loadFileIO(fileIOImpl, properties, conf); this.lockManager = LockManagers.from(properties); @@ -118,7 +123,8 @@ public void initialize(String name, Map properties) { closeableGroup.addCloseable(lockManager); closeableGroup.setSuppressCloseFailure(true); - this.suppressPermissionError = Boolean.parseBoolean(properties.get(HADOOP_SUPPRESS_PERMISSION_ERROR)); + this.suppressPermissionError = + Boolean.parseBoolean(properties.get(HADOOP_SUPPRESS_PERMISSION_ERROR)); } /** @@ -139,9 +145,9 @@ public String name() { private boolean shouldSuppressPermissionError(IOException ioException) { if (suppressPermissionError) { - return ioException instanceof AccessDeniedException || - (ioException.getMessage() != null && - ioException.getMessage().contains("AuthorizationPermissionMismatch")); + return ioException instanceof AccessDeniedException + || (ioException.getMessage() != null + && ioException.getMessage().contains("AuthorizationPermissionMismatch")); } return false; } @@ -181,8 +187,8 @@ private boolean isDirectory(Path path) { @Override public List listTables(Namespace namespace) { - Preconditions.checkArgument(namespace.levels().length >= 1, - "Missing database in table identifier: %s", namespace); + Preconditions.checkArgument( + namespace.levels().length >= 1, "Missing database in table identifier: %s", namespace); Path nsPath = new Path(warehouseLocation, SLASH.join(namespace.levels())); Set tblIdents = Sets.newHashSet(); @@ -219,7 +225,8 @@ protected boolean isValidIdentifier(TableIdentifier identifier) { @Override protected TableOperations newTableOps(TableIdentifier identifier) { - return new HadoopTableOperations(new Path(defaultWarehouseLocation(identifier)), fileIO, conf, lockManager); + return new HadoopTableOperations( + new Path(defaultWarehouseLocation(identifier)), fileIO, conf, lockManager); } @Override @@ -270,10 +277,10 @@ public void renameTable(TableIdentifier from, TableIdentifier to) { @Override public void createNamespace(Namespace namespace, Map meta) { Preconditions.checkArgument( - !namespace.isEmpty(), - "Cannot create namespace with invalid name: %s", namespace); + !namespace.isEmpty(), "Cannot create namespace with invalid name: %s", namespace); if (!meta.isEmpty()) { - throw new UnsupportedOperationException("Cannot create namespace " + namespace + ": metadata is not supported"); + throw new UnsupportedOperationException( + "Cannot create namespace " + namespace + ": metadata is not supported"); } Path nsPath = new Path(warehouseLocation, SLASH.join(namespace.levels())); @@ -292,8 +299,10 @@ public void createNamespace(Namespace namespace, Map meta) { @Override public List listNamespaces(Namespace namespace) { - Path nsPath = namespace.isEmpty() ? new Path(warehouseLocation) - : new Path(warehouseLocation, SLASH.join(namespace.levels())); + Path nsPath = + namespace.isEmpty() + ? 
new Path(warehouseLocation) + : new Path(warehouseLocation, SLASH.join(namespace.levels())); if (!isNamespace(nsPath)) { throw new NoSuchNamespaceException("Namespace does not exist: %s", namespace); } @@ -341,13 +350,13 @@ public boolean dropNamespace(Namespace namespace) { } @Override - public boolean setProperties(Namespace namespace, Map properties) { + public boolean setProperties(Namespace namespace, Map properties) { throw new UnsupportedOperationException( "Cannot set namespace properties " + namespace + " : setProperties is not supported"); } @Override - public boolean removeProperties(Namespace namespace, Set properties) { + public boolean removeProperties(Namespace namespace, Set properties) { throw new UnsupportedOperationException( "Cannot remove properties " + namespace + " : removeProperties is not supported"); } @@ -410,8 +419,12 @@ private HadoopCatalogTableBuilder(TableIdentifier identifier, Schema schema) { @Override public TableBuilder withLocation(String location) { - Preconditions.checkArgument(location == null || location.equals(defaultLocation), - "Cannot set a custom location for a path-based table. Expected " + defaultLocation + " but got " + location); + Preconditions.checkArgument( + location == null || location.equals(defaultLocation), + "Cannot set a custom location for a path-based table. Expected " + + defaultLocation + + " but got " + + location); return this; } } diff --git a/core/src/main/java/org/apache/iceberg/hadoop/HadoopConfigurable.java b/core/src/main/java/org/apache/iceberg/hadoop/HadoopConfigurable.java index 9e79c3b41426..d2ca1d97563f 100644 --- a/core/src/main/java/org/apache/iceberg/hadoop/HadoopConfigurable.java +++ b/core/src/main/java/org/apache/iceberg/hadoop/HadoopConfigurable.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.hadoop; import java.util.function.Function; @@ -25,20 +24,23 @@ import org.apache.iceberg.util.SerializableSupplier; /** - * An interface that extends the Hadoop {@link Configurable} interface to offer better serialization support for - * customizable Iceberg objects such as {@link org.apache.iceberg.io.FileIO}. - *
<p>
- * If an object is serialized and needs to use Hadoop configuration, it is recommended for the object to implement - * this interface so that a serializable supplier of configuration can be provided instead of an actual Hadoop - * configuration which is not serializable. + * An interface that extends the Hadoop {@link Configurable} interface to offer better serialization + * support for customizable Iceberg objects such as {@link org.apache.iceberg.io.FileIO}. + * + *
<p>
If an object is serialized and needs to use Hadoop configuration, it is recommended for the + * object to implement this interface so that a serializable supplier of configuration can be + * provided instead of an actual Hadoop configuration which is not serializable. */ public interface HadoopConfigurable extends Configurable { /** - * Take a function that serializes Hadoop configuration into a supplier. An implementation is supposed to pass in - * its current Hadoop configuration into this function, and the result can be safely serialized for future use. - * @param confSerializer A function that takes Hadoop configuration and returns a serializable supplier of it. + * Take a function that serializes Hadoop configuration into a supplier. An implementation is + * supposed to pass in its current Hadoop configuration into this function, and the result can be + * safely serialized for future use. + * + * @param confSerializer A function that takes Hadoop configuration and returns a serializable + * supplier of it. */ - void serializeConfWith(Function> confSerializer); - + void serializeConfWith( + Function> confSerializer); } diff --git a/core/src/main/java/org/apache/iceberg/hadoop/HadoopFileIO.java b/core/src/main/java/org/apache/iceberg/hadoop/HadoopFileIO.java index 2ba2cbf2693b..fc4c0d2f4879 100644 --- a/core/src/main/java/org/apache/iceberg/hadoop/HadoopFileIO.java +++ b/core/src/main/java/org/apache/iceberg/hadoop/HadoopFileIO.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.hadoop; import java.io.IOException; @@ -44,11 +43,11 @@ public class HadoopFileIO implements FileIO, HadoopConfigurable, SupportsPrefixO /** * Constructor used for dynamic FileIO loading. - *
<p>
- * {@link Configuration Hadoop configuration} must be set through {@link HadoopFileIO#setConf(Configuration)} + * + *
<p>
{@link Configuration Hadoop configuration} must be set through {@link + * HadoopFileIO#setConf(Configuration)} */ - public HadoopFileIO() { - } + public HadoopFileIO() {} public HadoopFileIO(Configuration hadoopConf) { this(new SerializableConfiguration(hadoopConf)::get); @@ -104,7 +103,8 @@ public Configuration getConf() { } @Override - public void serializeConfWith(Function> confSerializer) { + public void serializeConfWith( + Function> confSerializer) { this.hadoopConf = confSerializer.apply(getConf()); } @@ -115,9 +115,15 @@ public Iterable listPrefix(String prefix) { return () -> { try { - return Streams.stream(new AdaptingIterator<>(fs.listFiles(prefixToList, true /* recursive */))) - .map(fileStatus -> new FileInfo(fileStatus.getPath().toString(), fileStatus.getLen(), - fileStatus.getModificationTime())).iterator(); + return Streams.stream( + new AdaptingIterator<>(fs.listFiles(prefixToList, true /* recursive */))) + .map( + fileStatus -> + new FileInfo( + fileStatus.getPath().toString(), + fileStatus.getLen(), + fileStatus.getModificationTime())) + .iterator(); } catch (IOException e) { throw new UncheckedIOException(e); } @@ -137,8 +143,7 @@ public void deletePrefix(String prefix) { } /** - * This class is a simple adaptor to allow for using Hadoop's - * RemoteIterator as an Iterator. + * This class is a simple adaptor to allow for using Hadoop's RemoteIterator as an Iterator. * * @param element type */ diff --git a/core/src/main/java/org/apache/iceberg/hadoop/HadoopInputFile.java b/core/src/main/java/org/apache/iceberg/hadoop/HadoopInputFile.java index 8e39dcaf09c7..6c74575f8cc9 100644 --- a/core/src/main/java/org/apache/iceberg/hadoop/HadoopInputFile.java +++ b/core/src/main/java/org/apache/iceberg/hadoop/HadoopInputFile.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.hadoop; import java.io.FileNotFoundException; @@ -39,8 +38,8 @@ /** * {@link InputFile} implementation using the Hadoop {@link FileSystem} API. - *
<p>
- * This class is based on Parquet's HadoopInputFile. + * + *
<p>
This class is based on Parquet's HadoopInputFile. */ public class HadoopInputFile implements InputFile, NativelyEncryptedFile { public static final String[] NO_LOCATION_PREFERENCE = new String[0]; @@ -58,8 +57,8 @@ public static HadoopInputFile fromLocation(CharSequence location, Configuration return new HadoopInputFile(fs, location.toString(), conf); } - public static HadoopInputFile fromLocation(CharSequence location, long length, - Configuration conf) { + public static HadoopInputFile fromLocation( + CharSequence location, long length, Configuration conf) { FileSystem fs = Util.getFs(new Path(location.toString()), conf); if (length > 0) { return new HadoopInputFile(fs, location.toString(), length, conf); @@ -72,8 +71,7 @@ public static HadoopInputFile fromLocation(CharSequence location, FileSystem fs) return new HadoopInputFile(fs, location.toString(), fs.getConf()); } - public static HadoopInputFile fromLocation(CharSequence location, long length, - FileSystem fs) { + public static HadoopInputFile fromLocation(CharSequence location, long length, FileSystem fs) { return new HadoopInputFile(fs, location.toString(), length, fs.getConf()); } @@ -99,7 +97,8 @@ public static HadoopInputFile fromPath(Path path, FileSystem fs, Configuration c return new HadoopInputFile(fs, path, conf); } - public static HadoopInputFile fromPath(Path path, long length, FileSystem fs, Configuration conf) { + public static HadoopInputFile fromPath( + Path path, long length, FileSystem fs, Configuration conf) { return new HadoopInputFile(fs, path, length, conf); } @@ -119,7 +118,7 @@ public static HadoopInputFile fromStatus(FileStatus stat, FileSystem fs, Configu private HadoopInputFile(FileSystem fs, String location, Configuration conf) { this.fs = fs; this.location = location; - this.path = new Path(location); + this.path = new Path(location); this.conf = conf; } @@ -127,7 +126,7 @@ private HadoopInputFile(FileSystem fs, String location, long length, Configurati Preconditions.checkArgument(length >= 0, "Invalid file length: %s", length); this.fs = fs; this.location = location; - this.path = new Path(location); + this.path = new Path(location); this.conf = conf; this.length = length; } diff --git a/core/src/main/java/org/apache/iceberg/hadoop/HadoopMetricsContext.java b/core/src/main/java/org/apache/iceberg/hadoop/HadoopMetricsContext.java index adff76c25095..5e966a11935f 100644 --- a/core/src/main/java/org/apache/iceberg/hadoop/HadoopMetricsContext.java +++ b/core/src/main/java/org/apache/iceberg/hadoop/HadoopMetricsContext.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.hadoop; import java.util.Map; @@ -26,8 +25,8 @@ import org.apache.iceberg.io.FileIOMetricsContext; /** - * FileIO Metrics implementation that delegates to Hadoop FileSystem - * statistics implementation using the provided scheme. + * FileIO Metrics implementation that delegates to Hadoop FileSystem statistics implementation using + * the provided scheme. 
*/ public class HadoopMetricsContext implements FileIOMetricsContext { public static final String SCHEME = "io.metrics-scheme"; @@ -36,8 +35,8 @@ public class HadoopMetricsContext implements FileIOMetricsContext { private transient volatile FileSystem.Statistics statistics; public HadoopMetricsContext(String scheme) { - ValidationException.check(scheme != null, - "Scheme is required for Hadoop FileSystem metrics reporting"); + ValidationException.check( + scheme != null, "Scheme is required for Hadoop FileSystem metrics reporting"); this.scheme = scheme; } @@ -51,10 +50,9 @@ public void initialize(Map properties) { } /** - * The Hadoop implementation delegates to the FileSystem.Statistics - * implementation and therefore does not require - * support for operations like unit() and count() as the counter - * values are not directly consumed. + * The Hadoop implementation delegates to the FileSystem.Statistics implementation and therefore + * does not require support for operations like unit() and count() as the counter values are not + * directly consumed. * * @param name name of the metric * @param type numeric type of the counter value @@ -70,13 +68,15 @@ public Counter counter(String name, Class type, Unit un ValidationException.check(type == Long.class, "'%s' requires Long type", READ_BYTES); return (Counter) longCounter(statistics()::incrementBytesRead); case READ_OPERATIONS: - ValidationException.check(type == Integer.class, "'%s' requires Integer type", READ_OPERATIONS); + ValidationException.check( + type == Integer.class, "'%s' requires Integer type", READ_OPERATIONS); return (Counter) integerCounter(statistics()::incrementReadOps); case WRITE_BYTES: ValidationException.check(type == Long.class, "'%s' requires Long type", WRITE_BYTES); return (Counter) longCounter(statistics()::incrementBytesWritten); case WRITE_OPERATIONS: - ValidationException.check(type == Integer.class, "'%s' requires Integer type", WRITE_OPERATIONS); + ValidationException.check( + type == Integer.class, "'%s' requires Integer type", WRITE_OPERATIONS); return (Counter) integerCounter(statistics()::incrementWriteOps); default: throw new IllegalArgumentException(String.format("Unsupported counter: '%s'", name)); @@ -84,7 +84,7 @@ public Counter counter(String name, Class type, Unit un } private Counter longCounter(Consumer consumer) { - return new Counter() { + return new Counter() { @Override public void increment() { increment(1L); diff --git a/core/src/main/java/org/apache/iceberg/hadoop/HadoopOutputFile.java b/core/src/main/java/org/apache/iceberg/hadoop/HadoopOutputFile.java index 764725de5d0c..9453ee572066 100644 --- a/core/src/main/java/org/apache/iceberg/hadoop/HadoopOutputFile.java +++ b/core/src/main/java/org/apache/iceberg/hadoop/HadoopOutputFile.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.hadoop; import java.io.IOException; @@ -32,9 +31,7 @@ import org.apache.iceberg.io.OutputFile; import org.apache.iceberg.io.PositionOutputStream; -/** - * {@link OutputFile} implementation using the Hadoop {@link FileSystem} API. - */ +/** {@link OutputFile} implementation using the Hadoop {@link FileSystem} API. 
*/ public class HadoopOutputFile implements OutputFile, NativelyEncryptedFile { private final FileSystem fs; diff --git a/core/src/main/java/org/apache/iceberg/hadoop/HadoopStreams.java b/core/src/main/java/org/apache/iceberg/hadoop/HadoopStreams.java index 6cf230406a4e..1936888c5c05 100644 --- a/core/src/main/java/org/apache/iceberg/hadoop/HadoopStreams.java +++ b/core/src/main/java/org/apache/iceberg/hadoop/HadoopStreams.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.hadoop; import java.io.IOException; @@ -37,12 +36,11 @@ /** * Convenience methods to get Parquet abstractions for Hadoop data streams. * - * This class is based on Parquet's HadoopStreams. + *
<p>
This class is based on Parquet's HadoopStreams. */ class HadoopStreams { - private HadoopStreams() { - } + private HadoopStreams() {} private static final Logger LOG = LoggerFactory.getLogger(HadoopStreams.class); @@ -71,7 +69,8 @@ static PositionOutputStream wrap(FSDataOutputStream stream) { * SeekableInputStream implementation for FSDataInputStream that implements ByteBufferReadable in * Hadoop 2. */ - private static class HadoopSeekableInputStream extends SeekableInputStream implements DelegatingInputStream { + private static class HadoopSeekableInputStream extends SeekableInputStream + implements DelegatingInputStream { private final FSDataInputStream stream; private final StackTraceElement[] createStack; private boolean closed; @@ -123,17 +122,16 @@ protected void finalize() throws Throwable { super.finalize(); if (!closed) { close(); // releasing resources is more important than printing the warning - String trace = Joiner.on("\n\t").join( - Arrays.copyOfRange(createStack, 1, createStack.length)); + String trace = + Joiner.on("\n\t").join(Arrays.copyOfRange(createStack, 1, createStack.length)); LOG.warn("Unclosed input stream created by:\n\t{}", trace); } } } - /** - * PositionOutputStream implementation for FSDataOutputStream. - */ - private static class HadoopPositionOutputStream extends PositionOutputStream implements DelegatingOutputStream { + /** PositionOutputStream implementation for FSDataOutputStream. */ + private static class HadoopPositionOutputStream extends PositionOutputStream + implements DelegatingOutputStream { private final FSDataOutputStream stream; private final StackTraceElement[] createStack; private boolean closed; @@ -186,8 +184,8 @@ protected void finalize() throws Throwable { super.finalize(); if (!closed) { close(); // releasing resources is more important than printing the warning - String trace = Joiner.on("\n\t").join( - Arrays.copyOfRange(createStack, 1, createStack.length)); + String trace = + Joiner.on("\n\t").join(Arrays.copyOfRange(createStack, 1, createStack.length)); LOG.warn("Unclosed output stream created by:\n\t{}", trace); } } diff --git a/core/src/main/java/org/apache/iceberg/hadoop/HadoopTableOperations.java b/core/src/main/java/org/apache/iceberg/hadoop/HadoopTableOperations.java index c43800f8c2b7..44936f251495 100644 --- a/core/src/main/java/org/apache/iceberg/hadoop/HadoopTableOperations.java +++ b/core/src/main/java/org/apache/iceberg/hadoop/HadoopTableOperations.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.hadoop; import java.io.BufferedReader; @@ -55,8 +54,8 @@ /** * TableOperations implementation for file systems that support atomic rename. - *
<p>
- * This maintains metadata in a "metadata" folder under the table location. + * + *
<p>
This maintains metadata in a "metadata" folder under the table location. */ public class HadoopTableOperations implements TableOperations { private static final Logger LOG = LoggerFactory.getLogger(HadoopTableOperations.class); @@ -71,7 +70,8 @@ public class HadoopTableOperations implements TableOperations { private volatile Integer version = null; private volatile boolean shouldRefresh = true; - protected HadoopTableOperations(Path location, FileIO fileIO, Configuration conf, LockManager lockManager) { + protected HadoopTableOperations( + Path location, FileIO fileIO, Configuration conf, LockManager lockManager) { this.conf = conf; this.location = location; this.fileIO = fileIO; @@ -94,7 +94,8 @@ private synchronized void updateVersionAndMetadata(int newVersion, String metada // update if the current version is out of date if (version == null || version != newVersion) { this.version = newVersion; - this.currentMetadata = checkUUID(currentMetadata, TableMetadataParser.read(io(), metadataFile)); + this.currentMetadata = + checkUUID(currentMetadata, TableMetadataParser.read(io(), metadataFile)); } } @@ -138,14 +139,16 @@ public void commit(TableMetadata base, TableMetadata metadata) { return; } - Preconditions.checkArgument(base == null || base.location().equals(metadata.location()), + Preconditions.checkArgument( + base == null || base.location().equals(metadata.location()), "Hadoop path-based tables cannot be relocated"); Preconditions.checkArgument( !metadata.properties().containsKey(TableProperties.WRITE_METADATA_LOCATION), "Hadoop path-based tables cannot relocate metadata"); - String codecName = metadata.property( - TableProperties.METADATA_COMPRESSION, TableProperties.METADATA_COMPRESSION_DEFAULT); + String codecName = + metadata.property( + TableProperties.METADATA_COMPRESSION, TableProperties.METADATA_COMPRESSION_DEFAULT); TableMetadataParser.Codec codec = TableMetadataParser.Codec.fromName(codecName); String fileExtension = TableMetadataParser.getFileExtension(codec); Path tempMetadataFile = metadataPath(UUID.randomUUID().toString() + fileExtension); @@ -193,7 +196,8 @@ public TableMetadata current() { @Override public TableMetadata refresh() { - throw new UnsupportedOperationException("Cannot call refresh on temporary table operations"); + throw new UnsupportedOperationException( + "Cannot call refresh on temporary table operations"); } @Override @@ -208,7 +212,8 @@ public String metadataFileLocation(String fileName) { @Override public LocationProvider locationProvider() { - return LocationProviders.locationsFor(uncommittedMetadata.location(), uncommittedMetadata.properties()); + return LocationProviders.locationsFor( + uncommittedMetadata.location(), uncommittedMetadata.properties()); } @Override @@ -309,8 +314,9 @@ int findVersion() { Path versionHintFile = versionHintFile(); FileSystem fs = getFileSystem(versionHintFile, conf); - try (InputStreamReader fsr = new InputStreamReader(fs.open(versionHintFile), StandardCharsets.UTF_8); - BufferedReader in = new BufferedReader(fsr)) { + try (InputStreamReader fsr = + new InputStreamReader(fs.open(versionHintFile), StandardCharsets.UTF_8); + BufferedReader in = new BufferedReader(fsr)) { return Integer.parseInt(in.readLine().replace("\n", "")); } catch (Exception e) { @@ -322,8 +328,11 @@ int findVersion() { return 0; } - // List the metadata directory to find the version files, and try to recover the max available version - FileStatus[] files = fs.listStatus(metadataRoot(), name -> VERSION_PATTERN.matcher(name.getName()).matches()); + 
// List the metadata directory to find the version files, and try to recover the max + // available version + FileStatus[] files = + fs.listStatus( + metadataRoot(), name -> VERSION_PATTERN.matcher(name.getName()).matches()); int maxVersion = 0; for (FileStatus file : files) { @@ -357,8 +366,8 @@ private void renameToFinal(FileSystem fs, Path src, Path dst, int nextVersion) { } if (!fs.rename(src, dst)) { - CommitFailedException cfe = new CommitFailedException( - "Failed to commit changes using rename: %s", dst); + CommitFailedException cfe = + new CommitFailedException("Failed to commit changes using rename: %s", dst); RuntimeException re = tryDelete(src); if (re != null) { cfe.addSuppressed(re); @@ -366,8 +375,8 @@ private void renameToFinal(FileSystem fs, Path src, Path dst, int nextVersion) { throw cfe; } } catch (IOException e) { - CommitFailedException cfe = new CommitFailedException(e, - "Failed to commit changes using rename: %s", dst); + CommitFailedException cfe = + new CommitFailedException(e, "Failed to commit changes using rename: %s", dst); RuntimeException re = tryDelete(src); if (re != null) { cfe.addSuppressed(re); @@ -398,9 +407,10 @@ protected FileSystem getFileSystem(Path path, Configuration hadoopConf) { } /** - * Deletes the oldest metadata files if {@link TableProperties#METADATA_DELETE_AFTER_COMMIT_ENABLED} is true. + * Deletes the oldest metadata files if {@link + * TableProperties#METADATA_DELETE_AFTER_COMMIT_ENABLED} is true. * - * @param base table metadata on which previous versions were based + * @param base table metadata on which previous versions were based * @param metadata new table metadata with updated previous versions */ private void deleteRemovedMetadataFiles(TableMetadata base, TableMetadata metadata) { @@ -408,18 +418,23 @@ private void deleteRemovedMetadataFiles(TableMetadata base, TableMetadata metada return; } - boolean deleteAfterCommit = metadata.propertyAsBoolean( - TableProperties.METADATA_DELETE_AFTER_COMMIT_ENABLED, - TableProperties.METADATA_DELETE_AFTER_COMMIT_ENABLED_DEFAULT); + boolean deleteAfterCommit = + metadata.propertyAsBoolean( + TableProperties.METADATA_DELETE_AFTER_COMMIT_ENABLED, + TableProperties.METADATA_DELETE_AFTER_COMMIT_ENABLED_DEFAULT); if (deleteAfterCommit) { - Set removedPreviousMetadataFiles = Sets.newHashSet(base.previousFiles()); + Set removedPreviousMetadataFiles = + Sets.newHashSet(base.previousFiles()); removedPreviousMetadataFiles.removeAll(metadata.previousFiles()); Tasks.foreach(removedPreviousMetadataFiles) .executeWith(ThreadPools.getWorkerPool()) - .noRetry().suppressFailureWhenFinished() - .onFailure((previousMetadataFile, exc) -> - LOG.warn("Delete failed for previous metadata file: {}", previousMetadataFile, exc)) + .noRetry() + .suppressFailureWhenFinished() + .onFailure( + (previousMetadataFile, exc) -> + LOG.warn( + "Delete failed for previous metadata file: {}", previousMetadataFile, exc)) .run(previousMetadataFile -> io().deleteFile(previousMetadataFile.file())); } } @@ -427,8 +442,11 @@ private void deleteRemovedMetadataFiles(TableMetadata base, TableMetadata metada private static TableMetadata checkUUID(TableMetadata currentMetadata, TableMetadata newMetadata) { String newUUID = newMetadata.uuid(); if (currentMetadata != null && currentMetadata.uuid() != null && newUUID != null) { - Preconditions.checkState(newUUID.equals(currentMetadata.uuid()), - "Table UUID does not match: current=%s != refreshed=%s", currentMetadata.uuid(), newUUID); + Preconditions.checkState( + 
newUUID.equals(currentMetadata.uuid()), + "Table UUID does not match: current=%s != refreshed=%s", + currentMetadata.uuid(), + newUUID); } return newMetadata; } diff --git a/core/src/main/java/org/apache/iceberg/hadoop/HadoopTables.java b/core/src/main/java/org/apache/iceberg/hadoop/HadoopTables.java index 740158741b2a..764d0d7d863a 100644 --- a/core/src/main/java/org/apache/iceberg/hadoop/HadoopTables.java +++ b/core/src/main/java/org/apache/iceberg/hadoop/HadoopTables.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.hadoop; import java.io.IOException; @@ -54,8 +53,7 @@ import org.slf4j.LoggerFactory; /** - * Implementation of Iceberg tables that uses the Hadoop FileSystem - * to store metadata and manifests. + * Implementation of Iceberg tables that uses the Hadoop FileSystem to store metadata and manifests. */ public class HadoopTables implements Tables, Configurable { @@ -110,8 +108,9 @@ public boolean exists(String location) { } /** - * Try to resolve a metadata table, which we encode as URI fragments - * e.g. hdfs:///warehouse/my_table#snapshots + * Try to resolve a metadata table, which we encode as URI fragments e.g. + * hdfs:///warehouse/my_table#snapshots + * * @param location Path to parse * @return A base table name and MetadataTableType if a type is found, null if not */ @@ -127,7 +126,8 @@ private Pair parseMetadataType(String location) { } } - private Table loadMetadataTable(String location, String metadataTableName, MetadataTableType type) { + private Table loadMetadataTable( + String location, String metadataTableName, MetadataTableType type) { TableOperations ops = newTableOps(location); if (ops.current() == null) { throw new NoSuchTableException("Table does not exist at location: %s", location); @@ -137,8 +137,7 @@ private Table loadMetadataTable(String location, String metadataTableName, Metad } /** - * Create a table using the FileSystem implementation resolve from - * location. + * Create a table using the FileSystem implementation resolve from location. * * @param schema iceberg schema used to create the table * @param spec partitioning spec, if null the table will be unpartitioned @@ -147,9 +146,14 @@ private Table loadMetadataTable(String location, String metadataTableName, Metad * @return newly created table implementation */ @Override - public Table create(Schema schema, PartitionSpec spec, SortOrder order, - Map properties, String location) { - return buildTable(location, schema).withPartitionSpec(spec) + public Table create( + Schema schema, + PartitionSpec spec, + SortOrder order, + Map properties, + String location) { + return buildTable(location, schema) + .withPartitionSpec(spec) .withSortOrder(order) .withProperties(properties) .create(); @@ -167,8 +171,8 @@ public boolean dropTable(String location) { /** * Drop a table; optionally delete data and metadata files. - *

- * If purge is set to true the implementation should delete all data and metadata files. + * + *

If purge is set to true the implementation should delete all data and metadata files. * * @param location a path URI (e.g. hdfs:///warehouse/my_table) * @param purge if true, delete all data and metadata files in the table @@ -204,8 +208,8 @@ TableOperations newTableOps(String location) { if (location.contains(METADATA_JSON)) { return new StaticTableOperations(location, new HadoopFileIO(conf)); } else { - return new HadoopTableOperations(new Path(location), new HadoopFileIO(conf), conf, - createOrGetLockManager(this)); + return new HadoopTableOperations( + new Path(location), new HadoopFileIO(conf), conf, createOrGetLockManager(this)); } } @@ -227,8 +231,12 @@ private static synchronized LockManager createOrGetLockManager(HadoopTables tabl return lockManager; } - private TableMetadata tableMetadata(Schema schema, PartitionSpec spec, SortOrder order, - Map properties, String location) { + private TableMetadata tableMetadata( + Schema schema, + PartitionSpec spec, + SortOrder order, + Map properties, + String location) { Preconditions.checkNotNull(schema, "A table schema is required"); Map tableProps = properties == null ? ImmutableMap.of() : properties; @@ -248,11 +256,11 @@ private TableMetadata tableMetadata(Schema schema, PartitionSpec spec, SortOrder * @throws AlreadyExistsException if the table already exists */ public Transaction newCreateTableTransaction( - String location, - Schema schema, - PartitionSpec spec, - Map properties) { - return buildTable(location, schema).withPartitionSpec(spec).withProperties(properties).createTransaction(); + String location, Schema schema, PartitionSpec spec, Map properties) { + return buildTable(location, schema) + .withPartitionSpec(spec) + .withProperties(properties) + .createTransaction(); } /** @@ -273,8 +281,8 @@ public Transaction newReplaceTableTransaction( Map properties, boolean orCreate) { - - Catalog.TableBuilder builder = buildTable(location, schema).withPartitionSpec(spec).withProperties(properties); + Catalog.TableBuilder builder = + buildTable(location, schema).withPartitionSpec(spec).withProperties(properties); return orCreate ? builder.createOrReplaceTransaction() : builder.replaceTransaction(); } @@ -289,7 +297,6 @@ private class HadoopTableBuilder implements Catalog.TableBuilder { private PartitionSpec spec = PartitionSpec.unpartitioned(); private SortOrder sortOrder = SortOrder.unsorted(); - HadoopTableBuilder(String location, Schema schema) { this.location = location; this.schema = schema; @@ -309,8 +316,10 @@ public Catalog.TableBuilder withSortOrder(SortOrder newSortOrder) { @Override public Catalog.TableBuilder withLocation(String newLocation) { - Preconditions.checkArgument(newLocation == null || location.equals(newLocation), - String.format("Table location %s differs from the table location (%s) from the PathIdentifier", + Preconditions.checkArgument( + newLocation == null || location.equals(newLocation), + String.format( + "Table location %s differs from the table location (%s) from the PathIdentifier", newLocation, location)); return this; } diff --git a/core/src/main/java/org/apache/iceberg/hadoop/HiddenPathFilter.java b/core/src/main/java/org/apache/iceberg/hadoop/HiddenPathFilter.java index 248d3d3679db..11df9bf2a818 100644 --- a/core/src/main/java/org/apache/iceberg/hadoop/HiddenPathFilter.java +++ b/core/src/main/java/org/apache/iceberg/hadoop/HiddenPathFilter.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.hadoop; import java.io.Serializable; @@ -24,15 +23,14 @@ import org.apache.hadoop.fs.PathFilter; /** - * A {@link PathFilter} that filters out hidden paths. A path is considered to - * be hidden when the path name starts with a period ('.') or an underscore ('_'). + * A {@link PathFilter} that filters out hidden paths. A path is considered to be hidden when the + * path name starts with a period ('.') or an underscore ('_'). */ public class HiddenPathFilter implements PathFilter, Serializable { private static final HiddenPathFilter INSTANCE = new HiddenPathFilter(); - private HiddenPathFilter() { - } + private HiddenPathFilter() {} public static HiddenPathFilter get() { return INSTANCE; diff --git a/core/src/main/java/org/apache/iceberg/hadoop/SerializableConfiguration.java b/core/src/main/java/org/apache/iceberg/hadoop/SerializableConfiguration.java index 34baf1c42532..3e9f17455f81 100644 --- a/core/src/main/java/org/apache/iceberg/hadoop/SerializableConfiguration.java +++ b/core/src/main/java/org/apache/iceberg/hadoop/SerializableConfiguration.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.hadoop; import java.io.IOException; @@ -25,9 +24,7 @@ import java.io.Serializable; import org.apache.hadoop.conf.Configuration; -/** - * Wraps a {@link Configuration} object in a {@link Serializable} layer. - */ +/** Wraps a {@link Configuration} object in a {@link Serializable} layer. */ public class SerializableConfiguration implements Serializable { private transient Configuration hadoopConf; diff --git a/core/src/main/java/org/apache/iceberg/hadoop/Util.java b/core/src/main/java/org/apache/iceberg/hadoop/Util.java index 7fc55d290740..86ab481a3ec1 100644 --- a/core/src/main/java/org/apache/iceberg/hadoop/Util.java +++ b/core/src/main/java/org/apache/iceberg/hadoop/Util.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.hadoop; import java.io.IOException; @@ -43,8 +42,7 @@ public class Util { private static final Logger LOG = LoggerFactory.getLogger(Util.class); - private Util() { - } + private Util() {} public static FileSystem getFs(Path path, Configuration conf) { try { @@ -76,7 +74,8 @@ public static String[] blockLocations(FileIO io, CombinedScanTask task) { for (FileScanTask f : task.files()) { InputFile in = io.newInputFile(f.file().path().toString()); if (in instanceof HadoopInputFile) { - Collections.addAll(locations, ((HadoopInputFile) in).getBlockLocations(f.start(), f.length())); + Collections.addAll( + locations, ((HadoopInputFile) in).getBlockLocations(f.start(), f.length())); } } @@ -86,10 +85,9 @@ public static String[] blockLocations(FileIO io, CombinedScanTask task) { /** * From Apache Spark * - * Convert URI to String. - * Since URI.toString does not decode the uri, e.g. change '%25' to '%'. - * Here we create a hadoop Path with the given URI, and rely on Path.toString - * to decode the uri + *

Convert URI to String. Since URI.toString does not decode the uri, e.g. change '%25' to '%'. + * Here we create a hadoop Path with the given URI, and rely on Path.toString to decode the uri + * * @param uri the URI of the path * @return the String of the path */ diff --git a/core/src/main/java/org/apache/iceberg/io/BasePositionDeltaWriter.java b/core/src/main/java/org/apache/iceberg/io/BasePositionDeltaWriter.java index 4cf38d9c9712..d54a5e05a74b 100644 --- a/core/src/main/java/org/apache/iceberg/io/BasePositionDeltaWriter.java +++ b/core/src/main/java/org/apache/iceberg/io/BasePositionDeltaWriter.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.io; import java.io.IOException; @@ -34,12 +33,14 @@ public class BasePositionDeltaWriter implements PositionDeltaWriter { private boolean closed; - public BasePositionDeltaWriter(PartitioningWriter insertWriter, - PartitioningWriter updateWriter, - PartitioningWriter, DeleteWriteResult> deleteWriter) { + public BasePositionDeltaWriter( + PartitioningWriter insertWriter, + PartitioningWriter updateWriter, + PartitioningWriter, DeleteWriteResult> deleteWriter) { Preconditions.checkArgument(insertWriter != null, "Insert writer cannot be null"); Preconditions.checkArgument(updateWriter != null, "Update writer cannot be null"); - Preconditions.checkArgument(insertWriter != updateWriter, "Update and insert writers must be different"); + Preconditions.checkArgument( + insertWriter != updateWriter, "Update and insert writers must be different"); Preconditions.checkArgument(deleteWriter != null, "Delete writer cannot be null"); this.insertWriter = insertWriter; diff --git a/core/src/main/java/org/apache/iceberg/io/BaseTaskWriter.java b/core/src/main/java/org/apache/iceberg/io/BaseTaskWriter.java index c80084f3d3cd..55d6d245aa03 100644 --- a/core/src/main/java/org/apache/iceberg/io/BaseTaskWriter.java +++ b/core/src/main/java/org/apache/iceberg/io/BaseTaskWriter.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.io; import java.io.Closeable; @@ -54,8 +53,13 @@ public abstract class BaseTaskWriter implements TaskWriter { private final FileIO io; private final long targetFileSize; - protected BaseTaskWriter(PartitionSpec spec, FileFormat format, FileAppenderFactory appenderFactory, - OutputFileFactory fileFactory, FileIO io, long targetFileSize) { + protected BaseTaskWriter( + PartitionSpec spec, + FileFormat format, + FileAppenderFactory appenderFactory, + OutputFileFactory fileFactory, + FileIO io, + long targetFileSize) { this.spec = spec; this.format = format; this.appenderFactory = appenderFactory; @@ -91,9 +95,7 @@ public WriteResult complete() throws IOException { .build(); } - /** - * Base equality delta writer to write both insert records and equality-deletes. - */ + /** Base equality delta writer to write both insert records and equality-deletes. 
*/ protected abstract class BaseEqualityDeltaWriter implements Closeable { private final StructProjection structProjection; private RollingFileWriter dataWriter; @@ -108,18 +110,15 @@ protected BaseEqualityDeltaWriter(StructLike partition, Schema schema, Schema de this.dataWriter = new RollingFileWriter(partition); this.eqDeleteWriter = new RollingEqDeleteWriter(partition); - this.posDeleteWriter = new SortedPosDeleteWriter<>(appenderFactory, fileFactory, format, partition); + this.posDeleteWriter = + new SortedPosDeleteWriter<>(appenderFactory, fileFactory, format, partition); this.insertedRowMap = StructLikeMap.create(deleteSchema.asStruct()); } - /** - * Wrap the data as a {@link StructLike}. - */ + /** Wrap the data as a {@link StructLike}. */ protected abstract StructLike asStructLike(T data); - /** - * Wrap the passed in key of a row as a {@link StructLike} - */ + /** Wrap the passed in key of a row as a {@link StructLike} */ protected abstract StructLike asStructLikeKey(T key); public void write(T row) throws IOException { @@ -156,8 +155,8 @@ private boolean internalPosDelete(StructLike key) { } /** - * Delete those rows whose equality fields has the same values with the given row. It will write the entire row into - * the equality-delete file. + * Delete those rows whose equality fields has the same values with the given row. It will write + * the entire row into the equality-delete file. * * @param row the given row to delete. */ @@ -168,8 +167,8 @@ public void delete(T row) throws IOException { } /** - * Delete those rows with the given key. It will only write the values of equality fields into the equality-delete - * file. + * Delete those rows with the given key. It will only write the values of equality fields into + * the equality-delete file. * * @param key is the projected data whose columns are the same as the equality fields. */ @@ -293,7 +292,8 @@ private void closeCurrent() throws IOException { try { io.deleteFile(currentFile.encryptingOutputFile()); } catch (UncheckedIOException e) { - // the file may not have been created, and it isn't worth failing the job to clean up, skip deleting + // the file may not have been created, and it isn't worth failing the job to clean up, + // skip deleting } } else { complete(currentWriter); diff --git a/core/src/main/java/org/apache/iceberg/io/ByteBufferInputStream.java b/core/src/main/java/org/apache/iceberg/io/ByteBufferInputStream.java index aa3311a6c53d..254a10653696 100644 --- a/core/src/main/java/org/apache/iceberg/io/ByteBufferInputStream.java +++ b/core/src/main/java/org/apache/iceberg/io/ByteBufferInputStream.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.io; import java.io.EOFException; @@ -46,8 +45,7 @@ public static ByteBufferInputStream wrap(List buffers) { public void skipFully(long length) throws IOException { long skipped = skip(length); if (skipped < length) { - throw new EOFException( - "Not enough bytes to skip: " + skipped + " < " + length); + throw new EOFException("Not enough bytes to skip: " + skipped + " < " + length); } } diff --git a/core/src/main/java/org/apache/iceberg/io/ClusteredDataWriter.java b/core/src/main/java/org/apache/iceberg/io/ClusteredDataWriter.java index 9b1b7cca8d9a..fdb99c5f114c 100644 --- a/core/src/main/java/org/apache/iceberg/io/ClusteredDataWriter.java +++ b/core/src/main/java/org/apache/iceberg/io/ClusteredDataWriter.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.io; import java.util.List; @@ -26,8 +25,8 @@ import org.apache.iceberg.relocated.com.google.common.collect.Lists; /** - * A data writer capable of writing to multiple specs and partitions that requires the incoming records - * to be properly clustered by partition spec and by partition within each spec. + * A data writer capable of writing to multiple specs and partitions that requires the incoming + * records to be properly clustered by partition spec and by partition within each spec. */ public class ClusteredDataWriter extends ClusteredWriter { @@ -37,8 +36,11 @@ public class ClusteredDataWriter extends ClusteredWriter private final long targetFileSizeInBytes; private final List dataFiles; - public ClusteredDataWriter(FileWriterFactory writerFactory, OutputFileFactory fileFactory, - FileIO io, long targetFileSizeInBytes) { + public ClusteredDataWriter( + FileWriterFactory writerFactory, + OutputFileFactory fileFactory, + FileIO io, + long targetFileSizeInBytes) { this.writerFactory = writerFactory; this.fileFactory = fileFactory; this.io = io; @@ -48,7 +50,8 @@ public ClusteredDataWriter(FileWriterFactory writerFactory, OutputFileFactory @Override protected FileWriter newWriter(PartitionSpec spec, StructLike partition) { - return new RollingDataWriter<>(writerFactory, fileFactory, io, targetFileSizeInBytes, spec, partition); + return new RollingDataWriter<>( + writerFactory, fileFactory, io, targetFileSizeInBytes, spec, partition); } @Override diff --git a/core/src/main/java/org/apache/iceberg/io/ClusteredEqualityDeleteWriter.java b/core/src/main/java/org/apache/iceberg/io/ClusteredEqualityDeleteWriter.java index 4b4b71181468..41146aae1911 100644 --- a/core/src/main/java/org/apache/iceberg/io/ClusteredEqualityDeleteWriter.java +++ b/core/src/main/java/org/apache/iceberg/io/ClusteredEqualityDeleteWriter.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.io; import java.util.List; @@ -27,8 +26,9 @@ import org.apache.iceberg.relocated.com.google.common.collect.Lists; /** - * An equality delete writer capable of writing to multiple specs and partitions that requires - * the incoming delete records to be properly clustered by partition spec and by partition within each spec. + * An equality delete writer capable of writing to multiple specs and partitions that requires the + * incoming delete records to be properly clustered by partition spec and by partition within each + * spec. 
*/ public class ClusteredEqualityDeleteWriter extends ClusteredWriter { @@ -38,8 +38,11 @@ public class ClusteredEqualityDeleteWriter extends ClusteredWriter deleteFiles; - public ClusteredEqualityDeleteWriter(FileWriterFactory writerFactory, OutputFileFactory fileFactory, - FileIO io, long targetFileSizeInBytes) { + public ClusteredEqualityDeleteWriter( + FileWriterFactory writerFactory, + OutputFileFactory fileFactory, + FileIO io, + long targetFileSizeInBytes) { this.writerFactory = writerFactory; this.fileFactory = fileFactory; this.io = io; @@ -49,12 +52,14 @@ public ClusteredEqualityDeleteWriter(FileWriterFactory writerFactory, OutputF @Override protected FileWriter newWriter(PartitionSpec spec, StructLike partition) { - return new RollingEqualityDeleteWriter<>(writerFactory, fileFactory, io, targetFileSizeInBytes, spec, partition); + return new RollingEqualityDeleteWriter<>( + writerFactory, fileFactory, io, targetFileSizeInBytes, spec, partition); } @Override protected void addResult(DeleteWriteResult result) { - Preconditions.checkArgument(!result.referencesDataFiles(), "Equality deletes cannot reference data files"); + Preconditions.checkArgument( + !result.referencesDataFiles(), "Equality deletes cannot reference data files"); deleteFiles.addAll(result.deleteFiles()); } diff --git a/core/src/main/java/org/apache/iceberg/io/ClusteredPositionDeleteWriter.java b/core/src/main/java/org/apache/iceberg/io/ClusteredPositionDeleteWriter.java index 450d427ce0b1..c0c26c2b9086 100644 --- a/core/src/main/java/org/apache/iceberg/io/ClusteredPositionDeleteWriter.java +++ b/core/src/main/java/org/apache/iceberg/io/ClusteredPositionDeleteWriter.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.io; import java.util.List; @@ -28,10 +27,12 @@ import org.apache.iceberg.util.CharSequenceSet; /** - * A position delete writer capable of writing to multiple specs and partitions that requires - * the incoming delete records to be properly clustered by partition spec and by partition within each spec. + * A position delete writer capable of writing to multiple specs and partitions that requires the + * incoming delete records to be properly clustered by partition spec and by partition within each + * spec. 
*/ -public class ClusteredPositionDeleteWriter extends ClusteredWriter, DeleteWriteResult> { +public class ClusteredPositionDeleteWriter + extends ClusteredWriter, DeleteWriteResult> { private final FileWriterFactory writerFactory; private final OutputFileFactory fileFactory; @@ -40,8 +41,11 @@ public class ClusteredPositionDeleteWriter extends ClusteredWriter deleteFiles; private final CharSequenceSet referencedDataFiles; - public ClusteredPositionDeleteWriter(FileWriterFactory writerFactory, OutputFileFactory fileFactory, - FileIO io, long targetFileSizeInBytes) { + public ClusteredPositionDeleteWriter( + FileWriterFactory writerFactory, + OutputFileFactory fileFactory, + FileIO io, + long targetFileSizeInBytes) { this.writerFactory = writerFactory; this.fileFactory = fileFactory; this.io = io; @@ -51,8 +55,10 @@ public ClusteredPositionDeleteWriter(FileWriterFactory writerFactory, OutputF } @Override - protected FileWriter, DeleteWriteResult> newWriter(PartitionSpec spec, StructLike partition) { - return new RollingPositionDeleteWriter<>(writerFactory, fileFactory, io, targetFileSizeInBytes, spec, partition); + protected FileWriter, DeleteWriteResult> newWriter( + PartitionSpec spec, StructLike partition) { + return new RollingPositionDeleteWriter<>( + writerFactory, fileFactory, io, targetFileSizeInBytes, spec, partition); } @Override diff --git a/core/src/main/java/org/apache/iceberg/io/ClusteredWriter.java b/core/src/main/java/org/apache/iceberg/io/ClusteredWriter.java index 61a6f9f9164d..ee9e35eb32cb 100644 --- a/core/src/main/java/org/apache/iceberg/io/ClusteredWriter.java +++ b/core/src/main/java/org/apache/iceberg/io/ClusteredWriter.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.io; import java.io.IOException; @@ -35,17 +34,17 @@ /** * A writer capable of writing to multiple specs and partitions that requires the incoming records * to be clustered by partition spec and by partition within each spec. - *

- * As opposed to {@link FanoutWriter}, this writer keeps at most one file open to reduce - * the memory consumption. Prefer using this writer whenever the incoming records can be clustered - * by spec/partition. + * + *

As opposed to {@link FanoutWriter}, this writer keeps at most one file open to reduce the + * memory consumption. Prefer using this writer whenever the incoming records can be clustered by + * spec/partition. */ abstract class ClusteredWriter implements PartitioningWriter { private static final String NOT_CLUSTERED_ROWS_ERROR_MSG_TEMPLATE = - "Incoming records violate the writer assumption that records are clustered by spec and " + - "by partition within each spec. Either cluster the incoming records or switch to fanout writers.\n" + - "Encountered records that belong to already closed files:\n"; + "Incoming records violate the writer assumption that records are clustered by spec and " + + "by partition within each spec. Either cluster the incoming records or switch to fanout writers.\n" + + "Encountered records that belong to already closed files:\n"; private final Set completedSpecIds = Sets.newHashSet(); @@ -86,12 +85,14 @@ public void write(T row, PartitionSpec spec, StructLike partition) { this.currentPartition = StructCopy.copy(partition); this.currentWriter = newWriter(currentSpec, currentPartition); - } else if (partition != currentPartition && partitionComparator.compare(partition, currentPartition) != 0) { + } else if (partition != currentPartition + && partitionComparator.compare(partition, currentPartition) != 0) { closeCurrentWriter(); completedPartitions.add(currentPartition); if (completedPartitions.contains(partition)) { - String errorCtx = String.format("partition '%s' in spec %s", spec.partitionToPath(partition), spec); + String errorCtx = + String.format("partition '%s' in spec %s", spec.partitionToPath(partition), spec); throw new IllegalStateException(NOT_CLUSTERED_ROWS_ERROR_MSG_TEMPLATE + errorCtx); } @@ -131,9 +132,13 @@ public final R result() { return aggregatedResult(); } - protected EncryptedOutputFile newOutputFile(OutputFileFactory fileFactory, PartitionSpec spec, StructLike partition) { - Preconditions.checkArgument(spec.isUnpartitioned() || partition != null, + protected EncryptedOutputFile newOutputFile( + OutputFileFactory fileFactory, PartitionSpec spec, StructLike partition) { + Preconditions.checkArgument( + spec.isUnpartitioned() || partition != null, "Partition must not be null when creating output file for partitioned spec"); - return partition == null ? fileFactory.newOutputFile() : fileFactory.newOutputFile(spec, partition); + return partition == null + ? fileFactory.newOutputFile() + : fileFactory.newOutputFile(spec, partition); } } diff --git a/core/src/main/java/org/apache/iceberg/io/DataWriteResult.java b/core/src/main/java/org/apache/iceberg/io/DataWriteResult.java index 97e450f96f05..d04676d77c8d 100644 --- a/core/src/main/java/org/apache/iceberg/io/DataWriteResult.java +++ b/core/src/main/java/org/apache/iceberg/io/DataWriteResult.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.io; import java.util.Collections; @@ -25,9 +24,9 @@ /** * A result of writing data files. - *

- * Note that objects of this class are NOT meant to be serialized. Task or delta writers will wrap - * these results into their own serializable results that can be sent back to query engines. + * + *

Note that objects of this class are NOT meant to be serialized. Task or delta writers will + * wrap these results into their own serializable results that can be sent back to query engines. */ public class DataWriteResult { private final List dataFiles; diff --git a/core/src/main/java/org/apache/iceberg/io/DataWriter.java b/core/src/main/java/org/apache/iceberg/io/DataWriter.java index 090ccebfa80f..35bdd13fe558 100644 --- a/core/src/main/java/org/apache/iceberg/io/DataWriter.java +++ b/core/src/main/java/org/apache/iceberg/io/DataWriter.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.io; import java.io.IOException; @@ -40,13 +39,24 @@ public class DataWriter implements FileWriter { private final SortOrder sortOrder; private DataFile dataFile = null; - public DataWriter(FileAppender appender, FileFormat format, String location, - PartitionSpec spec, StructLike partition, EncryptionKeyMetadata keyMetadata) { + public DataWriter( + FileAppender appender, + FileFormat format, + String location, + PartitionSpec spec, + StructLike partition, + EncryptionKeyMetadata keyMetadata) { this(appender, format, location, spec, partition, keyMetadata, null); } - public DataWriter(FileAppender appender, FileFormat format, String location, - PartitionSpec spec, StructLike partition, EncryptionKeyMetadata keyMetadata, SortOrder sortOrder) { + public DataWriter( + FileAppender appender, + FileFormat format, + String location, + PartitionSpec spec, + StructLike partition, + EncryptionKeyMetadata keyMetadata, + SortOrder sortOrder) { this.appender = appender; this.format = format; this.location = location; @@ -80,16 +90,17 @@ public long length() { public void close() throws IOException { if (dataFile == null) { appender.close(); - this.dataFile = DataFiles.builder(spec) - .withFormat(format) - .withPath(location) - .withPartition(partition) - .withEncryptionKeyMetadata(keyMetadata) - .withFileSizeInBytes(appender.length()) - .withMetrics(appender.metrics()) - .withSplitOffsets(appender.splitOffsets()) - .withSortOrder(sortOrder) - .build(); + this.dataFile = + DataFiles.builder(spec) + .withFormat(format) + .withPath(location) + .withPartition(partition) + .withEncryptionKeyMetadata(keyMetadata) + .withFileSizeInBytes(appender.length()) + .withMetrics(appender.metrics()) + .withSplitOffsets(appender.splitOffsets()) + .withSortOrder(sortOrder) + .build(); } } diff --git a/core/src/main/java/org/apache/iceberg/io/DeleteSchemaUtil.java b/core/src/main/java/org/apache/iceberg/io/DeleteSchemaUtil.java index 466a2c01d76a..91c7a03151f6 100644 --- a/core/src/main/java/org/apache/iceberg/io/DeleteSchemaUtil.java +++ b/core/src/main/java/org/apache/iceberg/io/DeleteSchemaUtil.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.io; import org.apache.iceberg.MetadataColumns; @@ -24,22 +23,21 @@ import org.apache.iceberg.types.Types; public class DeleteSchemaUtil { - private DeleteSchemaUtil() { - } + private DeleteSchemaUtil() {} private static Schema pathPosSchema(Schema rowSchema) { return new Schema( MetadataColumns.DELETE_FILE_PATH, MetadataColumns.DELETE_FILE_POS, Types.NestedField.required( - MetadataColumns.DELETE_FILE_ROW_FIELD_ID, "row", rowSchema.asStruct(), + MetadataColumns.DELETE_FILE_ROW_FIELD_ID, + "row", + rowSchema.asStruct(), MetadataColumns.DELETE_FILE_ROW_DOC)); } public static Schema pathPosSchema() { - return new Schema( - MetadataColumns.DELETE_FILE_PATH, - MetadataColumns.DELETE_FILE_POS); + return new Schema(MetadataColumns.DELETE_FILE_PATH, MetadataColumns.DELETE_FILE_POS); } public static Schema posDeleteSchema(Schema rowSchema) { diff --git a/core/src/main/java/org/apache/iceberg/io/DeleteWriteResult.java b/core/src/main/java/org/apache/iceberg/io/DeleteWriteResult.java index 3d59e0cb0ca9..d1e8b9a112dd 100644 --- a/core/src/main/java/org/apache/iceberg/io/DeleteWriteResult.java +++ b/core/src/main/java/org/apache/iceberg/io/DeleteWriteResult.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.io; import java.util.Collections; @@ -26,9 +25,9 @@ /** * A result of writing delete files. - *

- * Note that objects of this class are NOT meant to be serialized. Task or delta writers will wrap - * these results into their own serializable results that can be sent back to query engines. + * + *

Note that objects of this class are NOT meant to be serialized. Task or delta writers will + * wrap these results into their own serializable results that can be sent back to query engines. */ public class DeleteWriteResult { private final List deleteFiles; diff --git a/core/src/main/java/org/apache/iceberg/io/EqualityDeltaWriter.java b/core/src/main/java/org/apache/iceberg/io/EqualityDeltaWriter.java index 196292047f3a..b9ed4c59961f 100644 --- a/core/src/main/java/org/apache/iceberg/io/EqualityDeltaWriter.java +++ b/core/src/main/java/org/apache/iceberg/io/EqualityDeltaWriter.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.io; import java.io.Closeable; @@ -26,7 +25,8 @@ import org.apache.iceberg.StructLike; /** - * A writer capable of writing data and equality deletes that may belong to different specs and partitions. + * A writer capable of writing data and equality deletes that may belong to different specs and + * partitions. * * @param the row type */ @@ -43,8 +43,8 @@ public interface EqualityDeltaWriter extends Closeable { /** * Deletes a row from the provided spec/partition. - *

- * This method assumes the delete record has the same schema as the rows that will be inserted. + * + *

This method assumes the delete record has the same schema as the rows that will be inserted. * * @param row a delete record * @param spec a partition spec @@ -54,8 +54,8 @@ public interface EqualityDeltaWriter extends Closeable { /** * Deletes a key from the provided spec/partition. - *

- * This method assumes the delete key contains values only for equality fields. + * + *

This method assumes the delete key contains values only for equality fields. * * @param key a delete key * @param spec a partition spec @@ -64,8 +64,8 @@ public interface EqualityDeltaWriter extends Closeable { void deleteKey(T key, PartitionSpec spec, StructLike partition); /** - * Returns a result that contains information about written {@link DataFile}s or {@link DeleteFile}s. - * The result is valid only after the writer is closed. + * Returns a result that contains information about written {@link DataFile}s or {@link + * DeleteFile}s. The result is valid only after the writer is closed. * * @return the writer result */ diff --git a/core/src/main/java/org/apache/iceberg/io/FanoutDataWriter.java b/core/src/main/java/org/apache/iceberg/io/FanoutDataWriter.java index eddd1864d724..1276fbb62802 100644 --- a/core/src/main/java/org/apache/iceberg/io/FanoutDataWriter.java +++ b/core/src/main/java/org/apache/iceberg/io/FanoutDataWriter.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.io; import java.util.List; @@ -26,8 +25,8 @@ import org.apache.iceberg.relocated.com.google.common.collect.Lists; /** - * A data writer capable of writing to multiple specs and partitions that keeps data writers for each - * seen spec/partition pair open until this writer is closed. + * A data writer capable of writing to multiple specs and partitions that keeps data writers for + * each seen spec/partition pair open until this writer is closed. */ public class FanoutDataWriter extends FanoutWriter { @@ -37,8 +36,11 @@ public class FanoutDataWriter extends FanoutWriter { private final long targetFileSizeInBytes; private final List dataFiles; - public FanoutDataWriter(FileWriterFactory writerFactory, OutputFileFactory fileFactory, - FileIO io, long targetFileSizeInBytes) { + public FanoutDataWriter( + FileWriterFactory writerFactory, + OutputFileFactory fileFactory, + FileIO io, + long targetFileSizeInBytes) { this.writerFactory = writerFactory; this.fileFactory = fileFactory; this.io = io; @@ -48,7 +50,8 @@ public FanoutDataWriter(FileWriterFactory writerFactory, OutputFileFactory fi @Override protected FileWriter newWriter(PartitionSpec spec, StructLike partition) { - return new RollingDataWriter<>(writerFactory, fileFactory, io, targetFileSizeInBytes, spec, partition); + return new RollingDataWriter<>( + writerFactory, fileFactory, io, targetFileSizeInBytes, spec, partition); } @Override diff --git a/core/src/main/java/org/apache/iceberg/io/FanoutWriter.java b/core/src/main/java/org/apache/iceberg/io/FanoutWriter.java index 631fc0a6d4ea..ca19919bb1bd 100644 --- a/core/src/main/java/org/apache/iceberg/io/FanoutWriter.java +++ b/core/src/main/java/org/apache/iceberg/io/FanoutWriter.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.io; import java.io.IOException; @@ -29,13 +28,13 @@ import org.apache.iceberg.util.StructLikeMap; /** - * A writer capable of writing to multiple specs and partitions that keeps files for each - * seen spec/partition pair open until this writer is closed. - *

- * As opposed to {@link ClusteredWriter}, this writer does not require the incoming records - * to be clustered by partition spec and partition as all files are kept open. As a consequence, - * this writer may potentially consume substantially more memory compared to {@link ClusteredWriter}. - * Use this writer only when clustering by spec/partition is not possible (e.g. streaming). + * A writer capable of writing to multiple specs and partitions that keeps files for each seen + * spec/partition pair open until this writer is closed. + * + *

As opposed to {@link ClusteredWriter}, this writer does not require the incoming records to be + * clustered by partition spec and partition as all files are kept open. As a consequence, this + * writer may potentially consume substantially more memory compared to {@link ClusteredWriter}. Use + * this writer only when clustering by spec/partition is not possible (e.g. streaming). */ abstract class FanoutWriter implements PartitioningWriter { @@ -55,9 +54,8 @@ public void write(T row, PartitionSpec spec, StructLike partition) { } private FileWriter writer(PartitionSpec spec, StructLike partition) { - Map> specWriters = writers.computeIfAbsent( - spec.specId(), - id -> StructLikeMap.create(spec.partitionType())); + Map> specWriters = + writers.computeIfAbsent(spec.specId(), id -> StructLikeMap.create(spec.partitionType())); FileWriter writer = specWriters.get(partition); if (writer == null) { @@ -97,9 +95,13 @@ public final R result() { return aggregatedResult(); } - protected EncryptedOutputFile newOutputFile(OutputFileFactory fileFactory, PartitionSpec spec, StructLike partition) { - Preconditions.checkArgument(spec.isUnpartitioned() || partition != null, + protected EncryptedOutputFile newOutputFile( + OutputFileFactory fileFactory, PartitionSpec spec, StructLike partition) { + Preconditions.checkArgument( + spec.isUnpartitioned() || partition != null, "Partition must not be null when creating output file for partitioned spec"); - return partition == null ? fileFactory.newOutputFile() : fileFactory.newOutputFile(spec, partition); + return partition == null + ? fileFactory.newOutputFile() + : fileFactory.newOutputFile(spec, partition); } } diff --git a/core/src/main/java/org/apache/iceberg/io/FileAppenderFactory.java b/core/src/main/java/org/apache/iceberg/io/FileAppenderFactory.java index 80eae531aec0..59b0b4b3bf6a 100644 --- a/core/src/main/java/org/apache/iceberg/io/FileAppenderFactory.java +++ b/core/src/main/java/org/apache/iceberg/io/FileAppenderFactory.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.io; import org.apache.iceberg.FileFormat; @@ -45,29 +44,32 @@ public interface FileAppenderFactory { * Create a new {@link DataWriter}. * * @param outputFile an OutputFile used to create an output stream. - * @param format a file format - * @param partition a tuple of partition values + * @param format a file format + * @param partition a tuple of partition values * @return a newly created {@link DataWriter} for rows */ - DataWriter newDataWriter(EncryptedOutputFile outputFile, FileFormat format, StructLike partition); + DataWriter newDataWriter( + EncryptedOutputFile outputFile, FileFormat format, StructLike partition); /** * Create a new {@link EqualityDeleteWriter}. * * @param outputFile an OutputFile used to create an output stream. - * @param format a file format - * @param partition a tuple of partition values + * @param format a file format + * @param partition a tuple of partition values * @return a newly created {@link EqualityDeleteWriter} for equality deletes */ - EqualityDeleteWriter newEqDeleteWriter(EncryptedOutputFile outputFile, FileFormat format, StructLike partition); + EqualityDeleteWriter newEqDeleteWriter( + EncryptedOutputFile outputFile, FileFormat format, StructLike partition); /** * Create a new {@link PositionDeleteWriter}. * * @param outputFile an OutputFile used to create an output stream. 
- * @param format a file format - * @param partition a tuple of partition values + * @param format a file format + * @param partition a tuple of partition values * @return a newly created {@link PositionDeleteWriter} for position deletes */ - PositionDeleteWriter newPosDeleteWriter(EncryptedOutputFile outputFile, FileFormat format, StructLike partition); + PositionDeleteWriter newPosDeleteWriter( + EncryptedOutputFile outputFile, FileFormat format, StructLike partition); } diff --git a/core/src/main/java/org/apache/iceberg/io/FileIOParser.java b/core/src/main/java/org/apache/iceberg/io/FileIOParser.java index 3f3c5aac0d3e..f2377bb16880 100644 --- a/core/src/main/java/org/apache/iceberg/io/FileIOParser.java +++ b/core/src/main/java/org/apache/iceberg/io/FileIOParser.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.io; import com.fasterxml.jackson.core.JsonGenerator; @@ -28,8 +27,7 @@ import org.apache.iceberg.util.JsonUtil; public class FileIOParser { - private FileIOParser() { - } + private FileIOParser() {} private static final String FILE_IO_IMPL = "io-impl"; private static final String PROPERTIES = "properties"; @@ -48,12 +46,15 @@ private static void toJson(FileIO io, JsonGenerator generator) throws IOExceptio try { properties = io.properties(); } catch (UnsupportedOperationException e) { - throw new IllegalArgumentException(String.format( - "Cannot serialize FileIO: %s does not expose configuration properties", impl)); + throw new IllegalArgumentException( + String.format( + "Cannot serialize FileIO: %s does not expose configuration properties", impl)); } - Preconditions.checkArgument(properties != null, - "Cannot serialize FileIO: invalid configuration properties (null)", impl); + Preconditions.checkArgument( + properties != null, + "Cannot serialize FileIO: invalid configuration properties (null)", + impl); generator.writeStartObject(); diff --git a/core/src/main/java/org/apache/iceberg/io/FileWriter.java b/core/src/main/java/org/apache/iceberg/io/FileWriter.java index 6f0c4ab2194a..4f431769d965 100644 --- a/core/src/main/java/org/apache/iceberg/io/FileWriter.java +++ b/core/src/main/java/org/apache/iceberg/io/FileWriter.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.io; import java.io.Closeable; @@ -25,11 +24,12 @@ /** * A writer capable of writing files of a single type (i.e. data/delete) to one spec/partition. - *

- * As opposed to {@link FileAppender}, this interface should be implemented by classes that not only - * append records to files but actually produce {@link DataFile}s or {@link DeleteFile}s objects - * with Iceberg metadata. Implementations may wrap {@link FileAppender}s with extra information - * such as spec, partition, sort order ID needed to construct {@link DataFile}s or {@link DeleteFile}s. + * + *

As opposed to {@link FileAppender}, this interface should be implemented by classes that not + * only append records to files but actually produce {@link DataFile}s or {@link DeleteFile}s + * objects with Iceberg metadata. Implementations may wrap {@link FileAppender}s with extra + * information such as spec, partition, sort order ID needed to construct {@link DataFile}s or + * {@link DeleteFile}s. * * @param the row type * @param the result type @@ -62,8 +62,8 @@ default void write(Iterable rows) { long length(); /** - * Returns a result that contains information about written {@link DataFile}s or {@link DeleteFile}s. - * The result is valid only after the writer is closed. + * Returns a result that contains information about written {@link DataFile}s or {@link + * DeleteFile}s. The result is valid only after the writer is closed. * * @return the file writer result */ diff --git a/core/src/main/java/org/apache/iceberg/io/FileWriterFactory.java b/core/src/main/java/org/apache/iceberg/io/FileWriterFactory.java index 9b57676f099d..72cc74c126c9 100644 --- a/core/src/main/java/org/apache/iceberg/io/FileWriterFactory.java +++ b/core/src/main/java/org/apache/iceberg/io/FileWriterFactory.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.io; import org.apache.iceberg.PartitionSpec; @@ -25,9 +24,7 @@ import org.apache.iceberg.deletes.PositionDeleteWriter; import org.apache.iceberg.encryption.EncryptedOutputFile; -/** - * A factory for creating data and delete writers. - */ +/** A factory for creating data and delete writers. */ public interface FileWriterFactory { /** @@ -48,7 +45,8 @@ public interface FileWriterFactory { * @param partition the partition written deletes belong to or null if the spec is unpartitioned * @return the constructed equality delete writer */ - EqualityDeleteWriter newEqualityDeleteWriter(EncryptedOutputFile file, PartitionSpec spec, StructLike partition); + EqualityDeleteWriter newEqualityDeleteWriter( + EncryptedOutputFile file, PartitionSpec spec, StructLike partition); /** * Creates a new {@link PositionDeleteWriter}. @@ -58,5 +56,6 @@ public interface FileWriterFactory { * @param partition the partition written deletes belong to or null if the spec is unpartitioned * @return the constructed position delete writer */ - PositionDeleteWriter newPositionDeleteWriter(EncryptedOutputFile file, PartitionSpec spec, StructLike partition); + PositionDeleteWriter newPositionDeleteWriter( + EncryptedOutputFile file, PartitionSpec spec, StructLike partition); } diff --git a/core/src/main/java/org/apache/iceberg/io/IOUtil.java b/core/src/main/java/org/apache/iceberg/io/IOUtil.java index e5fbc47f872f..4308d5f1300f 100644 --- a/core/src/main/java/org/apache/iceberg/io/IOUtil.java +++ b/core/src/main/java/org/apache/iceberg/io/IOUtil.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.io; import java.io.EOFException; @@ -27,8 +26,7 @@ public class IOUtil { // not meant to be instantiated - private IOUtil() { - } + private IOUtil() {} private static final int WRITE_CHUNK_SIZE = 8192; @@ -42,7 +40,8 @@ private IOUtil() { * @throws EOFException if the end of the stream is reached before reading length bytes * @throws IOException if there is an error while reading */ - public static void readFully(InputStream stream, byte[] bytes, int offset, int length) throws IOException { + public static void readFully(InputStream stream, byte[] bytes, int offset, int length) + throws IOException { int bytesRead = readRemaining(stream, bytes, offset, length); if (bytesRead < length) { throw new EOFException( @@ -50,9 +49,7 @@ public static void readFully(InputStream stream, byte[] bytes, int offset, int l } } - /** - * Writes a buffer into a stream, making multiple write calls if necessary. - */ + /** Writes a buffer into a stream, making multiple write calls if necessary. */ public static void writeFully(OutputStream outputStream, ByteBuffer buffer) throws IOException { if (!buffer.hasRemaining()) { return; @@ -66,8 +63,8 @@ public static void writeFully(OutputStream outputStream, ByteBuffer buffer) thro } /** - * Reads into a buffer from a stream, making multiple read calls if necessary - * returning the number of bytes read until end of stream. + * Reads into a buffer from a stream, making multiple read calls if necessary returning the number + * of bytes read until end of stream. * * @param stream an InputStream to read from * @param bytes a buffer to write into @@ -76,7 +73,8 @@ public static void writeFully(OutputStream outputStream, ByteBuffer buffer) thro * @return the number of bytes read * @throws IOException if there is an error while reading */ - public static int readRemaining(InputStream stream, byte[] bytes, int offset, int length) throws IOException { + public static int readRemaining(InputStream stream, byte[] bytes, int offset, int length) + throws IOException { int pos = offset; int remaining = length; while (remaining > 0) { diff --git a/core/src/main/java/org/apache/iceberg/io/MultiBufferInputStream.java b/core/src/main/java/org/apache/iceberg/io/MultiBufferInputStream.java index f1c04ab2c37e..6d38497d224f 100644 --- a/core/src/main/java/org/apache/iceberg/io/MultiBufferInputStream.java +++ b/core/src/main/java/org/apache/iceberg/io/MultiBufferInputStream.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.io; import java.io.EOFException; @@ -70,7 +69,8 @@ public long getPos() { @Override public void seek(long newPosition) throws IOException { if (newPosition > length) { - throw new EOFException(String.format("Cannot seek to position after end of file: %s", newPosition)); + throw new EOFException( + String.format("Cannot seek to position after end of file: %s", newPosition)); } if (position > newPosition) { @@ -222,8 +222,7 @@ public List remainingBuffers() { return sliceBuffers(length - position); } catch (EOFException e) { throw new RuntimeException( - "[Parquet bug] Stream is bad: incorrect bytes remaining " + - (length - position)); + "[Parquet bug] Stream is bad: incorrect bytes remaining " + (length - position)); } } diff --git a/core/src/main/java/org/apache/iceberg/io/OutputFileFactory.java b/core/src/main/java/org/apache/iceberg/io/OutputFileFactory.java index 422f41f5f0bf..457c4c0200cf 100644 --- a/core/src/main/java/org/apache/iceberg/io/OutputFileFactory.java +++ b/core/src/main/java/org/apache/iceberg/io/OutputFileFactory.java @@ -16,9 +16,11 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.io; +import static org.apache.iceberg.TableProperties.DEFAULT_FILE_FORMAT; +import static org.apache.iceberg.TableProperties.DEFAULT_FILE_FORMAT_DEFAULT; + import java.util.Locale; import java.util.UUID; import java.util.concurrent.atomic.AtomicInteger; @@ -29,12 +31,7 @@ import org.apache.iceberg.encryption.EncryptedOutputFile; import org.apache.iceberg.encryption.EncryptionManager; -import static org.apache.iceberg.TableProperties.DEFAULT_FILE_FORMAT; -import static org.apache.iceberg.TableProperties.DEFAULT_FILE_FORMAT_DEFAULT; - -/** - * Factory responsible for generating unique but recognizable data file names. - */ +/** Factory responsible for generating unique but recognizable data file names. */ public class OutputFileFactory { private final PartitionSpec defaultSpec; private final FileFormat format; @@ -43,15 +40,19 @@ public class OutputFileFactory { private final EncryptionManager encryptionManager; private final int partitionId; private final long taskId; - // The purpose of this uuid is to be able to know from two paths that they were written by the same operation. - // That's useful, for example, if a Spark job dies and leaves files in the file system, you can identify them all + // The purpose of this uuid is to be able to know from two paths that they were written by the + // same operation. + // That's useful, for example, if a Spark job dies and leaves files in the file system, you can + // identify them all // with a recursive listing and grep. private final String operationId; private final AtomicInteger fileCount = new AtomicInteger(0); /** - * Constructor with specific operationId. The [partitionId, taskId, operationId] triplet has to be unique across JVM - * instances otherwise the same file name could be generated by different instances of the OutputFileFactory. + * Constructor with specific operationId. The [partitionId, taskId, operationId] triplet has to be + * unique across JVM instances otherwise the same file name could be generated by different + * instances of the OutputFileFactory. 
+ * * @param spec Partition specification used by the location provider * @param format File format used for the extension * @param locations Location provider used for generating locations @@ -61,8 +62,15 @@ public class OutputFileFactory { * @param taskId Second part of the file name * @param operationId Third part of the file name */ - private OutputFileFactory(PartitionSpec spec, FileFormat format, LocationProvider locations, FileIO io, - EncryptionManager encryptionManager, int partitionId, long taskId, String operationId) { + private OutputFileFactory( + PartitionSpec spec, + FileFormat format, + LocationProvider locations, + FileIO io, + EncryptionManager encryptionManager, + int partitionId, + long taskId, + String operationId) { this.defaultSpec = spec; this.format = format; this.locations = locations; @@ -79,27 +87,22 @@ public static Builder builderFor(Table table, int partitionId, long taskId) { private String generateFilename() { return format.addExtension( - String.format("%05d-%d-%s-%05d", partitionId, taskId, operationId, fileCount.incrementAndGet())); + String.format( + "%05d-%d-%s-%05d", partitionId, taskId, operationId, fileCount.incrementAndGet())); } - /** - * Generates an {@link EncryptedOutputFile} for unpartitioned writes. - */ + /** Generates an {@link EncryptedOutputFile} for unpartitioned writes. */ public EncryptedOutputFile newOutputFile() { OutputFile file = io.newOutputFile(locations.newDataLocation(generateFilename())); return encryptionManager.encrypt(file); } - /** - * Generates an {@link EncryptedOutputFile} for partitioned writes in the default spec. - */ + /** Generates an {@link EncryptedOutputFile} for partitioned writes in the default spec. */ public EncryptedOutputFile newOutputFile(StructLike partition) { return newOutputFile(defaultSpec, partition); } - /** - * Generates an {@link EncryptedOutputFile} for partitioned writes in a given spec. - */ + /** Generates an {@link EncryptedOutputFile} for partitioned writes in a given spec. 
*/ public EncryptedOutputFile newOutputFile(PartitionSpec spec, StructLike partition) { String newDataLocation = locations.newDataLocation(spec, partition, generateFilename()); OutputFile rawOutputFile = io.newOutputFile(newDataLocation); @@ -121,7 +124,8 @@ private Builder(Table table, int partitionId, long taskId) { this.defaultSpec = table.spec(); this.operationId = UUID.randomUUID().toString(); - String formatAsString = table.properties().getOrDefault(DEFAULT_FILE_FORMAT, DEFAULT_FILE_FORMAT_DEFAULT); + String formatAsString = + table.properties().getOrDefault(DEFAULT_FILE_FORMAT, DEFAULT_FILE_FORMAT_DEFAULT); this.format = FileFormat.valueOf(formatAsString.toUpperCase(Locale.ROOT)); } @@ -144,7 +148,8 @@ public OutputFileFactory build() { LocationProvider locations = table.locationProvider(); FileIO io = table.io(); EncryptionManager encryption = table.encryption(); - return new OutputFileFactory(defaultSpec, format, locations, io, encryption, partitionId, taskId, operationId); + return new OutputFileFactory( + defaultSpec, format, locations, io, encryption, partitionId, taskId, operationId); } } } diff --git a/core/src/main/java/org/apache/iceberg/io/PartitionedFanoutWriter.java b/core/src/main/java/org/apache/iceberg/io/PartitionedFanoutWriter.java index a49fe199cfc9..b8c52024d2ed 100644 --- a/core/src/main/java/org/apache/iceberg/io/PartitionedFanoutWriter.java +++ b/core/src/main/java/org/apache/iceberg/io/PartitionedFanoutWriter.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.io; import java.io.IOException; @@ -29,15 +28,20 @@ public abstract class PartitionedFanoutWriter extends BaseTaskWriter { private final Map writers = Maps.newHashMap(); - protected PartitionedFanoutWriter(PartitionSpec spec, FileFormat format, FileAppenderFactory appenderFactory, - OutputFileFactory fileFactory, FileIO io, long targetFileSize) { + protected PartitionedFanoutWriter( + PartitionSpec spec, + FileFormat format, + FileAppenderFactory appenderFactory, + OutputFileFactory fileFactory, + FileIO io, + long targetFileSize) { super(spec, format, appenderFactory, fileFactory, io, targetFileSize); } /** * Create a PartitionKey from the values in row. - *
- * Any PartitionKey returned by this method can be reused by the implementation. + * + *
Any PartitionKey returned by this method can be reused by the implementation. * * @param row a data row */ @@ -49,7 +53,8 @@ public void write(T row) throws IOException { RollingFileWriter writer = writers.get(partitionKey); if (writer == null) { - // NOTICE: we need to copy a new partition key here, in case of messing up the keys in writers. + // NOTICE: we need to copy a new partition key here, in case of messing up the keys in + // writers. PartitionKey copiedKey = partitionKey.copy(); writer = new RollingFileWriter(copiedKey); writers.put(copiedKey, writer); diff --git a/core/src/main/java/org/apache/iceberg/io/PartitionedWriter.java b/core/src/main/java/org/apache/iceberg/io/PartitionedWriter.java index a551b8e2686e..625f8f94c997 100644 --- a/core/src/main/java/org/apache/iceberg/io/PartitionedWriter.java +++ b/core/src/main/java/org/apache/iceberg/io/PartitionedWriter.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.io; import java.io.IOException; @@ -37,15 +36,20 @@ public abstract class PartitionedWriter extends BaseTaskWriter { private PartitionKey currentKey = null; private RollingFileWriter currentWriter = null; - protected PartitionedWriter(PartitionSpec spec, FileFormat format, FileAppenderFactory appenderFactory, - OutputFileFactory fileFactory, FileIO io, long targetFileSize) { + protected PartitionedWriter( + PartitionSpec spec, + FileFormat format, + FileAppenderFactory appenderFactory, + OutputFileFactory fileFactory, + FileIO io, + long targetFileSize) { super(spec, format, appenderFactory, fileFactory, io, targetFileSize); } /** * Create a PartitionKey from the values in row. - *
- * Any PartitionKey returned by this method can be reused by the implementation. + * + *
Any PartitionKey returned by this method can be reused by the implementation. * * @param row a data row */ diff --git a/core/src/main/java/org/apache/iceberg/io/PartitioningWriter.java b/core/src/main/java/org/apache/iceberg/io/PartitioningWriter.java index 4afdd2162f8b..edce43229ce1 100644 --- a/core/src/main/java/org/apache/iceberg/io/PartitioningWriter.java +++ b/core/src/main/java/org/apache/iceberg/io/PartitioningWriter.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.io; import java.io.Closeable; @@ -26,13 +25,14 @@ import org.apache.iceberg.StructLike; /** - * A writer capable of writing files of a single type (i.e. data/delete) to multiple specs and partitions. - *
- * As opposed to {@link FileWriter}, this interface should be implemented by writers that are not - * limited to writing to a single spec/partition. Implementations may internally use {@link FileWriter}s - * for writing to a single spec/partition. - *
- * Note that this writer can be used both for partitioned and unpartitioned tables. + * A writer capable of writing files of a single type (i.e. data/delete) to multiple specs and + * partitions. + * + *
As opposed to {@link FileWriter}, this interface should be implemented by writers that are not + * limited to writing to a single spec/partition. Implementations may internally use {@link + * FileWriter}s for writing to a single spec/partition. + * + *
Note that this writer can be used both for partitioned and unpartitioned tables. * * @param the row type * @param the result type @@ -49,8 +49,8 @@ public interface PartitioningWriter extends Closeable { void write(T row, PartitionSpec spec, StructLike partition); /** - * Returns a result that contains information about written {@link DataFile}s or {@link DeleteFile}s. - * The result is valid only after the writer is closed. + * Returns a result that contains information about written {@link DataFile}s or {@link + * DeleteFile}s. The result is valid only after the writer is closed. * * @return the writer result */ diff --git a/core/src/main/java/org/apache/iceberg/io/PositionDeltaWriter.java b/core/src/main/java/org/apache/iceberg/io/PositionDeltaWriter.java index 915670d50e98..8ce4e64c3a8b 100644 --- a/core/src/main/java/org/apache/iceberg/io/PositionDeltaWriter.java +++ b/core/src/main/java/org/apache/iceberg/io/PositionDeltaWriter.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.io; import java.io.Closeable; @@ -26,7 +25,8 @@ import org.apache.iceberg.StructLike; /** - * A writer capable of writing data and position deletes that may belong to different specs and partitions. + * A writer capable of writing data and position deletes that may belong to different specs and + * partitions. * * @param the row type */ @@ -43,10 +43,10 @@ public interface PositionDeltaWriter extends Closeable { /** * Inserts a new version of an existing row to the provided spec/partition. - *
- * This method allows writers to distinguish new and updated records. The caller must separately - * invoke {@link #delete(CharSequence, long, PartitionSpec, StructLike)} for the original - * row position that is being updated. + * + *
This method allows writers to distinguish new and updated records. The caller must + * separately invoke {@link #delete(CharSequence, long, PartitionSpec, StructLike)} for the + * original row position that is being updated. * * @param row a new version of an existing row * @param spec a new partition spec @@ -60,7 +60,7 @@ default void update(T row, PartitionSpec spec, StructLike partition) { * Deletes a position in the provided spec/partition. * * @param path a data file path - * @param pos a position + * @param pos a position * @param spec a partition spec * @param partition a partition or null if the spec is unpartitioned */ @@ -69,7 +69,8 @@ default void delete(CharSequence path, long pos, PartitionSpec spec, StructLike } /** - * Deletes a position in the provided spec/partition and records the deleted row in the delete file. + * Deletes a position in the provided spec/partition and records the deleted row in the delete + * file. * * @param path a data file path * @param pos a position @@ -80,8 +81,8 @@ default void delete(CharSequence path, long pos, PartitionSpec spec, StructLike void delete(CharSequence path, long pos, T row, PartitionSpec spec, StructLike partition); /** - * Returns a result that contains information about written {@link DataFile}s or {@link DeleteFile}s. - * The result is valid only after the writer is closed. + * Returns a result that contains information about written {@link DataFile}s or {@link + * DeleteFile}s. The result is valid only after the writer is closed. * * @return the writer result */ diff --git a/core/src/main/java/org/apache/iceberg/io/ResolvingFileIO.java b/core/src/main/java/org/apache/iceberg/io/ResolvingFileIO.java index 3815d5da5402..256dfb56cc98 100644 --- a/core/src/main/java/org/apache/iceberg/io/ResolvingFileIO.java +++ b/core/src/main/java/org/apache/iceberg/io/ResolvingFileIO.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.io; import java.util.List; @@ -33,18 +32,16 @@ import org.slf4j.Logger; import org.slf4j.LoggerFactory; -/** - * FileIO implementation that uses location scheme to choose the correct FileIO implementation. - */ +/** FileIO implementation that uses location scheme to choose the correct FileIO implementation. */ public class ResolvingFileIO implements FileIO, HadoopConfigurable { private static final Logger LOG = LoggerFactory.getLogger(ResolvingFileIO.class); private static final String FALLBACK_IMPL = "org.apache.iceberg.hadoop.HadoopFileIO"; private static final String S3_FILE_IO_IMPL = "org.apache.iceberg.aws.s3.S3FileIO"; - private static final Map SCHEME_TO_FILE_IO = ImmutableMap.of( - "s3", S3_FILE_IO_IMPL, - "s3a", S3_FILE_IO_IMPL, - "s3n", S3_FILE_IO_IMPL - ); + private static final Map SCHEME_TO_FILE_IO = + ImmutableMap.of( + "s3", S3_FILE_IO_IMPL, + "s3a", S3_FILE_IO_IMPL, + "s3n", S3_FILE_IO_IMPL); private final Map ioInstances = Maps.newHashMap(); private Map properties; @@ -52,11 +49,10 @@ public class ResolvingFileIO implements FileIO, HadoopConfigurable { /** * No-arg constructor to load the FileIO dynamically. - *
- * All fields are initialized by calling {@link ResolvingFileIO#initialize(Map)} later. + * + *
All fields are initialized by calling {@link ResolvingFileIO#initialize(Map)} later. */ - public ResolvingFileIO() { - } + public ResolvingFileIO() {} @Override public InputFile newInputFile(String location) { @@ -104,7 +100,8 @@ public void close() { } @Override - public void serializeConfWith(Function> confSerializer) { + public void serializeConfWith( + Function> confSerializer) { this.hadoopConf = confSerializer.apply(hadoopConf.get()); } @@ -142,12 +139,18 @@ private FileIO io(String location) { throw e; } else { // couldn't load the normal class, fall back to HadoopFileIO - LOG.warn("Failed to load FileIO implementation: {}, falling back to {}", impl, FALLBACK_IMPL, e); + LOG.warn( + "Failed to load FileIO implementation: {}, falling back to {}", + impl, + FALLBACK_IMPL, + e); try { io = CatalogUtil.loadFileIO(FALLBACK_IMPL, properties, conf); } catch (IllegalArgumentException suppressed) { - LOG.warn("Failed to load FileIO implementation: {} (fallback)", FALLBACK_IMPL, suppressed); - // both attempts failed, throw the original exception with the later exception suppressed + LOG.warn( + "Failed to load FileIO implementation: {} (fallback)", FALLBACK_IMPL, suppressed); + // both attempts failed, throw the original exception with the later exception + // suppressed e.addSuppressed(suppressed); throw e; } diff --git a/core/src/main/java/org/apache/iceberg/io/RollingDataWriter.java b/core/src/main/java/org/apache/iceberg/io/RollingDataWriter.java index 1642a30e41ba..018981667fd5 100644 --- a/core/src/main/java/org/apache/iceberg/io/RollingDataWriter.java +++ b/core/src/main/java/org/apache/iceberg/io/RollingDataWriter.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.io; import java.util.List; @@ -35,9 +34,13 @@ public class RollingDataWriter extends RollingFileWriter, Da private final FileWriterFactory writerFactory; private final List dataFiles; - public RollingDataWriter(FileWriterFactory writerFactory, OutputFileFactory fileFactory, - FileIO io, long targetFileSizeInBytes, - PartitionSpec spec, StructLike partition) { + public RollingDataWriter( + FileWriterFactory writerFactory, + OutputFileFactory fileFactory, + FileIO io, + long targetFileSizeInBytes, + PartitionSpec spec, + StructLike partition) { super(fileFactory, io, targetFileSizeInBytes, spec, partition); this.writerFactory = writerFactory; this.dataFiles = Lists.newArrayList(); diff --git a/core/src/main/java/org/apache/iceberg/io/RollingEqualityDeleteWriter.java b/core/src/main/java/org/apache/iceberg/io/RollingEqualityDeleteWriter.java index c12bfd31d6d5..f5c7f7b2c311 100644 --- a/core/src/main/java/org/apache/iceberg/io/RollingEqualityDeleteWriter.java +++ b/core/src/main/java/org/apache/iceberg/io/RollingEqualityDeleteWriter.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.io; import java.util.List; @@ -29,17 +28,22 @@ import org.apache.iceberg.relocated.com.google.common.collect.Lists; /** - * A rolling equality delete writer that splits incoming deletes into multiple files within one spec/partition - * based on the target file size. + * A rolling equality delete writer that splits incoming deletes into multiple files within one + * spec/partition based on the target file size. 
*/ -public class RollingEqualityDeleteWriter extends RollingFileWriter, DeleteWriteResult> { +public class RollingEqualityDeleteWriter + extends RollingFileWriter, DeleteWriteResult> { private final FileWriterFactory writerFactory; private final List deleteFiles; - public RollingEqualityDeleteWriter(FileWriterFactory writerFactory, OutputFileFactory fileFactory, - FileIO io, long targetFileSizeInBytes, - PartitionSpec spec, StructLike partition) { + public RollingEqualityDeleteWriter( + FileWriterFactory writerFactory, + OutputFileFactory fileFactory, + FileIO io, + long targetFileSizeInBytes, + PartitionSpec spec, + StructLike partition) { super(fileFactory, io, targetFileSizeInBytes, spec, partition); this.writerFactory = writerFactory; this.deleteFiles = Lists.newArrayList(); @@ -53,7 +57,8 @@ protected EqualityDeleteWriter newWriter(EncryptedOutputFile file) { @Override protected void addResult(DeleteWriteResult result) { - Preconditions.checkArgument(!result.referencesDataFiles(), "Equality deletes cannot reference data files"); + Preconditions.checkArgument( + !result.referencesDataFiles(), "Equality deletes cannot reference data files"); deleteFiles.addAll(result.deleteFiles()); } diff --git a/core/src/main/java/org/apache/iceberg/io/RollingFileWriter.java b/core/src/main/java/org/apache/iceberg/io/RollingFileWriter.java index 80a589b18095..791d6d1bb201 100644 --- a/core/src/main/java/org/apache/iceberg/io/RollingFileWriter.java +++ b/core/src/main/java/org/apache/iceberg/io/RollingFileWriter.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.io; import java.io.IOException; @@ -27,8 +26,8 @@ import org.apache.iceberg.relocated.com.google.common.base.Preconditions; /** - * A rolling writer capable of splitting incoming data or deletes into multiple files within one spec/partition - * based on the target file size. + * A rolling writer capable of splitting incoming data or deletes into multiple files within one + * spec/partition based on the target file size. 
*/ abstract class RollingFileWriter, R> implements FileWriter { private static final int ROWS_DIVISOR = 1000; @@ -45,8 +44,12 @@ abstract class RollingFileWriter, R> implements Fi private boolean closed = false; - protected RollingFileWriter(OutputFileFactory fileFactory, FileIO io, long targetFileSizeInBytes, - PartitionSpec spec, StructLike partition) { + protected RollingFileWriter( + OutputFileFactory fileFactory, + FileIO io, + long targetFileSizeInBytes, + PartitionSpec spec, + StructLike partition) { this.fileFactory = fileFactory; this.io = io; this.targetFileSizeInBytes = targetFileSizeInBytes; @@ -78,7 +81,8 @@ public long currentFileRows() { @Override public long length() { - throw new UnsupportedOperationException(this.getClass().getName() + " does not implement length"); + throw new UnsupportedOperationException( + this.getClass().getName() + " does not implement length"); } @Override @@ -124,7 +128,8 @@ private void closeCurrentWriter() { try { io.deleteFile(currentFile.encryptingOutputFile()); } catch (UncheckedIOException e) { - // the file may not have been created, and it isn't worth failing the job to clean up, skip deleting + // the file may not have been created, and it isn't worth failing the job to clean up, + // skip deleting } } else { addResult(currentWriter.result()); diff --git a/core/src/main/java/org/apache/iceberg/io/RollingPositionDeleteWriter.java b/core/src/main/java/org/apache/iceberg/io/RollingPositionDeleteWriter.java index 001b0bb5f5d6..5a6f8dbc9caf 100644 --- a/core/src/main/java/org/apache/iceberg/io/RollingPositionDeleteWriter.java +++ b/core/src/main/java/org/apache/iceberg/io/RollingPositionDeleteWriter.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.io; import java.util.List; @@ -30,8 +29,8 @@ import org.apache.iceberg.util.CharSequenceSet; /** - * A rolling position delete writer that splits incoming deletes into multiple files within one spec/partition - * based on the target file size. + * A rolling position delete writer that splits incoming deletes into multiple files within one + * spec/partition based on the target file size. */ public class RollingPositionDeleteWriter extends RollingFileWriter, PositionDeleteWriter, DeleteWriteResult> { @@ -40,9 +39,13 @@ public class RollingPositionDeleteWriter private final List deleteFiles; private final CharSequenceSet referencedDataFiles; - public RollingPositionDeleteWriter(FileWriterFactory writerFactory, OutputFileFactory fileFactory, - FileIO io, long targetFileSizeInBytes, - PartitionSpec spec, StructLike partition) { + public RollingPositionDeleteWriter( + FileWriterFactory writerFactory, + OutputFileFactory fileFactory, + FileIO io, + long targetFileSizeInBytes, + PartitionSpec spec, + StructLike partition) { super(fileFactory, io, targetFileSizeInBytes, spec, partition); this.writerFactory = writerFactory; this.deleteFiles = Lists.newArrayList(); diff --git a/core/src/main/java/org/apache/iceberg/io/SingleBufferInputStream.java b/core/src/main/java/org/apache/iceberg/io/SingleBufferInputStream.java index 88c5b79a24ea..d32b339de95d 100644 --- a/core/src/main/java/org/apache/iceberg/io/SingleBufferInputStream.java +++ b/core/src/main/java/org/apache/iceberg/io/SingleBufferInputStream.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.io; import java.io.EOFException; @@ -26,8 +25,8 @@ import java.util.List; /** - * This ByteBufferInputStream does not consume the ByteBuffer being passed in, - * but will create a slice of the current buffer. + * This ByteBufferInputStream does not consume the ByteBuffer being passed in, but will create a + * slice of the current buffer. */ class SingleBufferInputStream extends ByteBufferInputStream { @@ -84,7 +83,8 @@ public int read(byte[] bytes, int offset, int len) throws IOException { @Override public void seek(long newPosition) throws IOException { if (newPosition > length) { - throw new EOFException(String.format("Cannot seek to position after end of file: %s", newPosition)); + throw new EOFException( + String.format("Cannot seek to position after end of file: %s", newPosition)); } if (getPos() > newPosition) { diff --git a/core/src/main/java/org/apache/iceberg/io/SortedPosDeleteWriter.java b/core/src/main/java/org/apache/iceberg/io/SortedPosDeleteWriter.java index 36a0313a4e41..1f7626eab78a 100644 --- a/core/src/main/java/org/apache/iceberg/io/SortedPosDeleteWriter.java +++ b/core/src/main/java/org/apache/iceberg/io/SortedPosDeleteWriter.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.io; import java.io.IOException; @@ -54,11 +53,12 @@ class SortedPosDeleteWriter implements FileWriter, DeleteWr private int records = 0; private boolean closed = false; - SortedPosDeleteWriter(FileAppenderFactory appenderFactory, - OutputFileFactory fileFactory, - FileFormat format, - StructLike partition, - long recordsNumThreshold) { + SortedPosDeleteWriter( + FileAppenderFactory appenderFactory, + OutputFileFactory fileFactory, + FileFormat format, + StructLike partition, + long recordsNumThreshold) { this.appenderFactory = appenderFactory; this.fileFactory = fileFactory; this.format = format; @@ -66,16 +66,18 @@ class SortedPosDeleteWriter implements FileWriter, DeleteWr this.recordsNumThreshold = recordsNumThreshold; } - SortedPosDeleteWriter(FileAppenderFactory appenderFactory, - OutputFileFactory fileFactory, - FileFormat format, - StructLike partition) { + SortedPosDeleteWriter( + FileAppenderFactory appenderFactory, + OutputFileFactory fileFactory, + FileFormat format, + StructLike partition) { this(appenderFactory, fileFactory, format, partition, DEFAULT_RECORDS_NUM_THRESHOLD); } @Override public long length() { - throw new UnsupportedOperationException(this.getClass().getName() + " does not implement length"); + throw new UnsupportedOperationException( + this.getClass().getName() + " does not implement length"); } @Override @@ -97,7 +99,8 @@ public void delete(CharSequence path, long pos, T row) { records += 1; - // TODO Flush buffer based on the policy that checking whether whole heap memory size exceed the threshold. + // TODO Flush buffer based on the policy that checking whether whole heap memory size exceed the + // threshold. if (records >= recordsNumThreshold) { flushDeletes(); } @@ -140,7 +143,8 @@ private void flushDeletes() { outputFile = fileFactory.newOutputFile(partition); } - PositionDeleteWriter writer = appenderFactory.newPosDeleteWriter(outputFile, format, partition); + PositionDeleteWriter writer = + appenderFactory.newPosDeleteWriter(outputFile, format, partition); try (PositionDeleteWriter closeableWriter = writer) { // Sort all the paths. 
List paths = Lists.newArrayListWithCapacity(posDeletes.keySet().size()); @@ -157,8 +161,10 @@ private void flushDeletes() { positions.forEach(posRow -> closeableWriter.delete(path, posRow.pos(), posRow.row())); } } catch (IOException e) { - throw new UncheckedIOException("Failed to write the sorted path/pos pairs to pos-delete file: " + - outputFile.encryptingOutputFile().location(), e); + throw new UncheckedIOException( + "Failed to write the sorted path/pos pairs to pos-delete file: " + + outputFile.encryptingOutputFile().location(), + e); } // Clear the buffered pos-deletions. diff --git a/core/src/main/java/org/apache/iceberg/io/StructCopy.java b/core/src/main/java/org/apache/iceberg/io/StructCopy.java index 7d4373348ccb..229dff371762 100644 --- a/core/src/main/java/org/apache/iceberg/io/StructCopy.java +++ b/core/src/main/java/org/apache/iceberg/io/StructCopy.java @@ -16,14 +16,11 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.io; import org.apache.iceberg.StructLike; -/** - * Copy the StructLike's values into a new one. It does not handle list or map values now. - */ +/** Copy the StructLike's values into a new one. It does not handle list or map values now. */ class StructCopy implements StructLike { static StructLike copy(StructLike struct) { return struct != null ? new StructCopy(struct) : null; diff --git a/core/src/main/java/org/apache/iceberg/io/TaskWriter.java b/core/src/main/java/org/apache/iceberg/io/TaskWriter.java index 4a31afb96bde..88ac7ab43003 100644 --- a/core/src/main/java/org/apache/iceberg/io/TaskWriter.java +++ b/core/src/main/java/org/apache/iceberg/io/TaskWriter.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.io; import java.io.Closeable; @@ -31,9 +30,7 @@ */ public interface TaskWriter extends Closeable { - /** - * Write the row into the data files. - */ + /** Write the row into the data files. */ void write(T row) throws IOException; /** @@ -44,13 +41,15 @@ public interface TaskWriter extends Closeable { void abort() throws IOException; /** - * Close the writer and get the completed data files, it requires that the task writer would produce data files only. + * Close the writer and get the completed data files, it requires that the task writer would + * produce data files only. * * @return the completed data files of this task writer. */ default DataFile[] dataFiles() throws IOException { WriteResult result = complete(); - Preconditions.checkArgument(result.deleteFiles() == null || result.deleteFiles().length == 0, + Preconditions.checkArgument( + result.deleteFiles() == null || result.deleteFiles().length == 0, "Should have no delete files in this write result."); return result.dataFiles(); diff --git a/core/src/main/java/org/apache/iceberg/io/UnpartitionedWriter.java b/core/src/main/java/org/apache/iceberg/io/UnpartitionedWriter.java index 2e98706816c7..1c4aa3564bde 100644 --- a/core/src/main/java/org/apache/iceberg/io/UnpartitionedWriter.java +++ b/core/src/main/java/org/apache/iceberg/io/UnpartitionedWriter.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.io; import java.io.IOException; @@ -27,8 +26,13 @@ public class UnpartitionedWriter extends BaseTaskWriter { private final RollingFileWriter currentWriter; - public UnpartitionedWriter(PartitionSpec spec, FileFormat format, FileAppenderFactory appenderFactory, - OutputFileFactory fileFactory, FileIO io, long targetFileSize) { + public UnpartitionedWriter( + PartitionSpec spec, + FileFormat format, + FileAppenderFactory appenderFactory, + OutputFileFactory fileFactory, + FileIO io, + long targetFileSize) { super(spec, format, appenderFactory, fileFactory, io, targetFileSize); currentWriter = new RollingFileWriter(null); } diff --git a/core/src/main/java/org/apache/iceberg/io/WriteResult.java b/core/src/main/java/org/apache/iceberg/io/WriteResult.java index 100a37c48a12..c73b09f46c53 100644 --- a/core/src/main/java/org/apache/iceberg/io/WriteResult.java +++ b/core/src/main/java/org/apache/iceberg/io/WriteResult.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.io; import java.io.Serializable; @@ -33,9 +32,8 @@ public class WriteResult implements Serializable { private DeleteFile[] deleteFiles; private CharSequence[] referencedDataFiles; - private WriteResult(List dataFiles, - List deleteFiles, - CharSequenceSet referencedDataFiles) { + private WriteResult( + List dataFiles, List deleteFiles, CharSequenceSet referencedDataFiles) { this.dataFiles = dataFiles.toArray(new DataFile[0]); this.deleteFiles = deleteFiles.toArray(new DeleteFile[0]); this.referencedDataFiles = referencedDataFiles.toArray(new CharSequence[0]); diff --git a/core/src/main/java/org/apache/iceberg/jdbc/JdbcCatalog.java b/core/src/main/java/org/apache/iceberg/jdbc/JdbcCatalog.java index a5ffa2f7c0f8..18159e40b499 100644 --- a/core/src/main/java/org/apache/iceberg/jdbc/JdbcCatalog.java +++ b/core/src/main/java/org/apache/iceberg/jdbc/JdbcCatalog.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.jdbc; import java.io.Closeable; @@ -74,8 +73,7 @@ public class JdbcCatalog extends BaseMetastoreCatalog private Object conf; private JdbcClientPool connections; - public JdbcCatalog() { - } + public JdbcCatalog() {} @Override public void initialize(String name, Map properties) { @@ -83,7 +81,8 @@ public void initialize(String name, Map properties) { Preconditions.checkNotNull(uri, "JDBC connection URI is required"); String inputWarehouseLocation = properties.get(CatalogProperties.WAREHOUSE_LOCATION); - Preconditions.checkArgument(inputWarehouseLocation != null && inputWarehouseLocation.length() > 0, + Preconditions.checkArgument( + inputWarehouseLocation != null && inputWarehouseLocation.length() > 0, "Cannot initialize JDBCCatalog because warehousePath must not be null or empty"); this.warehouseLocation = LocationUtil.stripTrailingSlash(inputWarehouseLocation); @@ -92,8 +91,9 @@ public void initialize(String name, Map properties) { this.catalogName = name; } - String fileIOImpl = properties.getOrDefault( - CatalogProperties.FILE_IO_IMPL, "org.apache.iceberg.hadoop.HadoopFileIO"); + String fileIOImpl = + properties.getOrDefault( + CatalogProperties.FILE_IO_IMPL, "org.apache.iceberg.hadoop.HadoopFileIO"); this.io = CatalogUtil.loadFileIO(fileIOImpl, properties, conf); try { @@ -114,31 +114,42 @@ public void initialize(String name, Map properties) { private void initializeCatalogTables() throws InterruptedException, SQLException { LOG.trace("Creating database tables (if missing) to store iceberg catalog"); - connections.run(conn -> { - DatabaseMetaData dbMeta = conn.getMetaData(); - ResultSet tableExists = dbMeta.getTables(null /* catalog name */, null /* schemaPattern */, - JdbcUtil.CATALOG_TABLE_NAME /* tableNamePattern */, null /* types */); - if (tableExists.next()) { - return true; - } - - LOG.debug("Creating table {} to store iceberg catalog", JdbcUtil.CATALOG_TABLE_NAME); - return conn.prepareStatement(JdbcUtil.CREATE_CATALOG_TABLE).execute(); - }); - - connections.run(conn -> { - DatabaseMetaData dbMeta = conn.getMetaData(); - ResultSet tableExists = dbMeta.getTables(null /* catalog name */, null /* schemaPattern */, - JdbcUtil.NAMESPACE_PROPERTIES_TABLE_NAME /* tableNamePattern */, null /* types */); + connections.run( + conn -> { + DatabaseMetaData dbMeta = conn.getMetaData(); + ResultSet tableExists = + dbMeta.getTables( + null /* catalog name */, + null /* schemaPattern */, + JdbcUtil.CATALOG_TABLE_NAME /* tableNamePattern */, + null /* types */); + if (tableExists.next()) { + return true; + } - if (tableExists.next()) { - return true; - } + LOG.debug("Creating table {} to store iceberg catalog", JdbcUtil.CATALOG_TABLE_NAME); + return conn.prepareStatement(JdbcUtil.CREATE_CATALOG_TABLE).execute(); + }); + + connections.run( + conn -> { + DatabaseMetaData dbMeta = conn.getMetaData(); + ResultSet tableExists = + dbMeta.getTables( + null /* catalog name */, + null /* schemaPattern */, + JdbcUtil.NAMESPACE_PROPERTIES_TABLE_NAME /* tableNamePattern */, + null /* types */); + + if (tableExists.next()) { + return true; + } - LOG.debug("Creating table {} to store iceberg catalog namespace properties", - JdbcUtil.NAMESPACE_PROPERTIES_TABLE_NAME); - return conn.prepareStatement(JdbcUtil.CREATE_NAMESPACE_PROPERTIES_TABLE).execute(); - }); + LOG.debug( + "Creating table {} to store iceberg catalog namespace properties", + JdbcUtil.NAMESPACE_PROPERTIES_TABLE_NAME); + return conn.prepareStatement(JdbcUtil.CREATE_NAMESPACE_PROPERTIES_TABLE).execute(); + }); } 
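For reviewers who want to sanity-check the reformatted JdbcCatalog bootstrap above, here is a minimal, illustrative sketch; it is not part of this diff. It assumes a JDBC driver (for example sqlite-jdbc) and hadoop-common are on the classpath and that the connection string is supplied under CatalogProperties.URI; those details are assumptions, while initialize(), createNamespace(), and listNamespaces() are the JdbcCatalog surface shown in this file's hunks.

import java.util.Map;
import org.apache.hadoop.conf.Configuration;
import org.apache.iceberg.CatalogProperties;
import org.apache.iceberg.catalog.Namespace;
import org.apache.iceberg.jdbc.JdbcCatalog;
import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap;

public class JdbcCatalogSketch {
  public static void main(String[] args) {
    // Illustrative configuration only: an in-memory SQLite database (driver is an assumption).
    Map<String, String> props =
        ImmutableMap.of(
            CatalogProperties.URI, "jdbc:sqlite:file::memory:?cache=shared",
            CatalogProperties.WAREHOUSE_LOCATION, "/tmp/warehouse");

    JdbcCatalog catalog = new JdbcCatalog();
    // Hadoop Configuration for the default HadoopFileIO; not exercised by the calls below.
    catalog.setConf(new Configuration());
    // initialize() runs initializeCatalogTables() shown above, creating the catalog and
    // namespace-properties tables when they are missing.
    catalog.initialize("demo", props);

    catalog.createNamespace(Namespace.of("db"), ImmutableMap.of("owner", "review"));
    System.out.println(catalog.listNamespaces()); // expected: [db]
  }
}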
@Override @@ -161,8 +172,12 @@ public boolean dropTable(TableIdentifier identifier, boolean purge) { lastMetadata = null; } - int deletedRecords = execute( - JdbcUtil.DROP_TABLE_SQL, catalogName, JdbcUtil.namespaceToString(identifier.namespace()), identifier.name()); + int deletedRecords = + execute( + JdbcUtil.DROP_TABLE_SQL, + catalogName, + JdbcUtil.namespaceToString(identifier.namespace()), + identifier.name()); if (deletedRecords == 0) { LOG.info("Skipping drop, table does not exist: {}", identifier); @@ -184,32 +199,39 @@ public List listTables(Namespace namespace) { } return fetch( - row -> JdbcUtil.stringToTableIdentifier( - row.getString(JdbcUtil.TABLE_NAMESPACE), - row.getString(JdbcUtil.TABLE_NAME)), - JdbcUtil.LIST_TABLES_SQL, catalogName, JdbcUtil.namespaceToString(namespace)); + row -> + JdbcUtil.stringToTableIdentifier( + row.getString(JdbcUtil.TABLE_NAMESPACE), row.getString(JdbcUtil.TABLE_NAME)), + JdbcUtil.LIST_TABLES_SQL, + catalogName, + JdbcUtil.namespaceToString(namespace)); } @Override public void renameTable(TableIdentifier from, TableIdentifier to) { - int updatedRecords = execute( - err -> { - // SQLite doesn't set SQLState or throw SQLIntegrityConstraintViolationException - if (err instanceof SQLIntegrityConstraintViolationException || - (err.getMessage() != null && err.getMessage().contains("constraint failed"))) { - throw new AlreadyExistsException("Table already exists: %s", to); - } - }, - JdbcUtil.RENAME_TABLE_SQL, - JdbcUtil.namespaceToString(to.namespace()), to.name(), catalogName, - JdbcUtil.namespaceToString(from.namespace()), from.name()); + int updatedRecords = + execute( + err -> { + // SQLite doesn't set SQLState or throw SQLIntegrityConstraintViolationException + if (err instanceof SQLIntegrityConstraintViolationException + || (err.getMessage() != null && err.getMessage().contains("constraint failed"))) { + throw new AlreadyExistsException("Table already exists: %s", to); + } + }, + JdbcUtil.RENAME_TABLE_SQL, + JdbcUtil.namespaceToString(to.namespace()), + to.name(), + catalogName, + JdbcUtil.namespaceToString(from.namespace()), + from.name()); if (updatedRecords == 1) { LOG.info("Renamed table from {}, to {}", from, to); } else if (updatedRecords == 0) { throw new NoSuchTableException("Table does not exist: %s", from); } else { - LOG.warn("Rename operation affected {} rows: the catalog table's primary key assumption has been violated", + LOG.warn( + "Rename operation affected {} rows: the catalog table's primary key assumption has been violated", updatedRecords); } } @@ -234,10 +256,11 @@ public void createNamespace(Namespace namespace, Map metadata) { if (metadata == null || metadata.isEmpty()) { createMetadata = ImmutableMap.of(NAMESPACE_EXISTS_PROPERTY, "true"); } else { - createMetadata = ImmutableMap.builder() - .putAll(metadata) - .put(NAMESPACE_EXISTS_PROPERTY, "true") - .build(); + createMetadata = + ImmutableMap.builder() + .putAll(metadata) + .put(NAMESPACE_EXISTS_PROPERTY, "true") + .build(); } insertProperties(namespace, createMetadata); @@ -246,24 +269,26 @@ public void createNamespace(Namespace namespace, Map metadata) { @Override public List listNamespaces() { List namespaces = Lists.newArrayList(); - namespaces.addAll(fetch( - row -> JdbcUtil.stringToNamespace(row.getString(JdbcUtil.TABLE_NAMESPACE)), - JdbcUtil.LIST_ALL_TABLE_NAMESPACES_SQL, catalogName)); - namespaces.addAll(fetch( - row -> JdbcUtil.stringToNamespace(row.getString(JdbcUtil.NAMESPACE_NAME)), - JdbcUtil.LIST_ALL_PROPERTY_NAMESPACES_SQL, catalogName)); - - namespaces 
= namespaces.stream() - // only get sub namespaces/children - .filter(n -> n.levels().length >= 1) - // only get sub namespaces/children - .map(n -> Namespace.of( - Arrays.stream(n.levels()).limit(1).toArray(String[]::new) - ) - ) - // remove duplicates - .distinct() - .collect(Collectors.toList()); + namespaces.addAll( + fetch( + row -> JdbcUtil.stringToNamespace(row.getString(JdbcUtil.TABLE_NAMESPACE)), + JdbcUtil.LIST_ALL_TABLE_NAMESPACES_SQL, + catalogName)); + namespaces.addAll( + fetch( + row -> JdbcUtil.stringToNamespace(row.getString(JdbcUtil.NAMESPACE_NAME)), + JdbcUtil.LIST_ALL_PROPERTY_NAMESPACES_SQL, + catalogName)); + + namespaces = + namespaces.stream() + // only get sub namespaces/children + .filter(n -> n.levels().length >= 1) + // only get sub namespaces/children + .map(n -> Namespace.of(Arrays.stream(n.levels()).limit(1).toArray(String[]::new))) + // remove duplicates + .distinct() + .collect(Collectors.toList()); return namespaces; } @@ -279,33 +304,43 @@ public List listNamespaces(Namespace namespace) throws NoSuchNamespac } List namespaces = Lists.newArrayList(); - namespaces.addAll(fetch( - row -> JdbcUtil.stringToNamespace(row.getString(JdbcUtil.TABLE_NAMESPACE)), - JdbcUtil.LIST_NAMESPACES_SQL, catalogName, JdbcUtil.namespaceToString(namespace) + "%")); - namespaces.addAll(fetch( - row -> JdbcUtil.stringToNamespace(row.getString(JdbcUtil.NAMESPACE_NAME)), - JdbcUtil.LIST_PROPERTY_NAMESPACES_SQL, catalogName, JdbcUtil.namespaceToString(namespace) + "%")); + namespaces.addAll( + fetch( + row -> JdbcUtil.stringToNamespace(row.getString(JdbcUtil.TABLE_NAMESPACE)), + JdbcUtil.LIST_NAMESPACES_SQL, + catalogName, + JdbcUtil.namespaceToString(namespace) + "%")); + namespaces.addAll( + fetch( + row -> JdbcUtil.stringToNamespace(row.getString(JdbcUtil.NAMESPACE_NAME)), + JdbcUtil.LIST_PROPERTY_NAMESPACES_SQL, + catalogName, + JdbcUtil.namespaceToString(namespace) + "%")); int subNamespaceLevelLength = namespace.levels().length + 1; - namespaces = namespaces.stream() - // exclude itself - .filter(n -> !n.equals(namespace)) - // only get sub namespaces/children - .filter(n -> n.levels().length >= subNamespaceLevelLength) - // only get sub namespaces/children - .map(n -> Namespace.of( - Arrays.stream(n.levels()).limit(subNamespaceLevelLength).toArray(String[]::new) - ) - ) - // remove duplicates - .distinct() - .collect(Collectors.toList()); + namespaces = + namespaces.stream() + // exclude itself + .filter(n -> !n.equals(namespace)) + // only get sub namespaces/children + .filter(n -> n.levels().length >= subNamespaceLevelLength) + // only get sub namespaces/children + .map( + n -> + Namespace.of( + Arrays.stream(n.levels()) + .limit(subNamespaceLevelLength) + .toArray(String[]::new))) + // remove duplicates + .distinct() + .collect(Collectors.toList()); return namespaces; } @Override - public Map loadNamespaceMetadata(Namespace namespace) throws NoSuchNamespaceException { + public Map loadNamespaceMetadata(Namespace namespace) + throws NoSuchNamespaceException { if (!namespaceExists(namespace)) { throw new NoSuchNamespaceException("Namespace does not exist: %s", namespace); } @@ -338,14 +373,18 @@ public boolean dropNamespace(Namespace namespace) throws NamespaceNotEmptyExcept "Namespace %s is not empty. 
%s tables exist.", namespace, tableIdentifiers.size()); } - int deletedRows = execute( - JdbcUtil.DELETE_ALL_NAMESPACE_PROPERTIES_SQL, catalogName, JdbcUtil.namespaceToString(namespace)); + int deletedRows = + execute( + JdbcUtil.DELETE_ALL_NAMESPACE_PROPERTIES_SQL, + catalogName, + JdbcUtil.namespaceToString(namespace)); return deletedRows > 0; } @Override - public boolean setProperties(Namespace namespace, Map properties) throws NoSuchNamespaceException { + public boolean setProperties(Namespace namespace, Map properties) + throws NoSuchNamespaceException { if (!namespaceExists(namespace)) { throw new NoSuchNamespaceException("Namespace does not exist: %s", namespace); } @@ -356,8 +395,10 @@ public boolean setProperties(Namespace namespace, Map properties return false; } - Preconditions.checkArgument(!properties.containsKey(NAMESPACE_EXISTS_PROPERTY), - "Cannot set reserved property: %s", NAMESPACE_EXISTS_PROPERTY); + Preconditions.checkArgument( + !properties.containsKey(NAMESPACE_EXISTS_PROPERTY), + "Cannot set reserved property: %s", + NAMESPACE_EXISTS_PROPERTY); Map startingProperties = fetchProperties(namespace); Map inserts = Maps.newHashMap(); @@ -386,7 +427,8 @@ public boolean setProperties(Namespace namespace, Map properties } @Override - public boolean removeProperties(Namespace namespace, Set properties) throws NoSuchNamespaceException { + public boolean removeProperties(Namespace namespace, Set properties) + throws NoSuchNamespaceException { if (!namespaceExists(namespace)) { throw new NoSuchNamespaceException("Namespace does not exist: %s", namespace); } @@ -407,11 +449,15 @@ public void close() { @Override public boolean namespaceExists(Namespace namespace) { - if (exists(JdbcUtil.GET_NAMESPACE_SQL, catalogName, JdbcUtil.namespaceToString(namespace) + "%")) { + if (exists( + JdbcUtil.GET_NAMESPACE_SQL, catalogName, JdbcUtil.namespaceToString(namespace) + "%")) { return true; } - if (exists(JdbcUtil.GET_NAMESPACE_PROPERTIES_SQL, catalogName, JdbcUtil.namespaceToString(namespace) + "%")) { + if (exists( + JdbcUtil.GET_NAMESPACE_PROPERTIES_SQL, + catalogName, + JdbcUtil.namespaceToString(namespace) + "%")) { return true; } @@ -419,20 +465,21 @@ public boolean namespaceExists(Namespace namespace) { } private int execute(String sql, String... args) { - return execute(err -> { }, sql, args); + return execute(err -> {}, sql, args); } private int execute(Consumer sqlErrorHandler, String sql, String... args) { try { - return connections.run(conn -> { - try (PreparedStatement preparedStatement = conn.prepareStatement(sql)) { - for (int pos = 0; pos < args.length; pos += 1) { - preparedStatement.setString(pos + 1, args[pos]); - } - - return preparedStatement.executeUpdate(); - } - }); + return connections.run( + conn -> { + try (PreparedStatement preparedStatement = conn.prepareStatement(sql)) { + for (int pos = 0; pos < args.length; pos += 1) { + preparedStatement.setString(pos + 1, args[pos]); + } + + return preparedStatement.executeUpdate(); + } + }); } catch (SQLException e) { sqlErrorHandler.accept(e); throw new UncheckedSQLException(e, "Failed to execute: %s", sql); @@ -444,21 +491,22 @@ private int execute(Consumer sqlErrorHandler, String sql, String.. @SuppressWarnings("checkstyle:NestedTryDepth") private boolean exists(String sql, String... 
args) { try { - return connections.run(conn -> { - try (PreparedStatement preparedStatement = conn.prepareStatement(sql)) { - for (int pos = 0; pos < args.length; pos += 1) { - preparedStatement.setString(pos + 1, args[pos]); - } - - try (ResultSet rs = preparedStatement.executeQuery()) { - if (rs.next()) { - return true; + return connections.run( + conn -> { + try (PreparedStatement preparedStatement = conn.prepareStatement(sql)) { + for (int pos = 0; pos < args.length; pos += 1) { + preparedStatement.setString(pos + 1, args[pos]); + } + + try (ResultSet rs = preparedStatement.executeQuery()) { + if (rs.next()) { + return true; + } + } } - } - } - return false; - }); + return false; + }); } catch (SQLException e) { throw new UncheckedSQLException(e, "Failed to execute exists query: %s", sql); } catch (InterruptedException e) { @@ -475,23 +523,24 @@ interface RowProducer { @SuppressWarnings("checkstyle:NestedTryDepth") private List fetch(RowProducer toRow, String sql, String... args) { try { - return connections.run(conn -> { - List result = Lists.newArrayList(); - - try (PreparedStatement preparedStatement = conn.prepareStatement(sql)) { - for (int pos = 0; pos < args.length; pos += 1) { - preparedStatement.setString(pos + 1, args[pos]); - } - - try (ResultSet rs = preparedStatement.executeQuery()) { - while (rs.next()) { - result.add(toRow.apply(rs)); + return connections.run( + conn -> { + List result = Lists.newArrayList(); + + try (PreparedStatement preparedStatement = conn.prepareStatement(sql)) { + for (int pos = 0; pos < args.length; pos += 1) { + preparedStatement.setString(pos + 1, args[pos]); + } + + try (ResultSet rs = preparedStatement.executeQuery()) { + while (rs.next()) { + result.add(toRow.apply(rs)); + } + } } - } - } - return result; - }); + return result; + }); } catch (SQLException e) { throw new UncheckedSQLException(e, "Failed to execute query: %s", sql); } catch (InterruptedException e) { @@ -507,20 +556,26 @@ private Map fetchProperties(Namespace namespace) { String namespaceName = JdbcUtil.namespaceToString(namespace); - List> entries = fetch( - row -> new AbstractMap.SimpleImmutableEntry<>( - row.getString(JdbcUtil.NAMESPACE_PROPERTY_KEY), - row.getString(JdbcUtil.NAMESPACE_PROPERTY_VALUE)), - JdbcUtil.GET_ALL_NAMESPACE_PROPERTIES_SQL, catalogName, namespaceName); + List> entries = + fetch( + row -> + new AbstractMap.SimpleImmutableEntry<>( + row.getString(JdbcUtil.NAMESPACE_PROPERTY_KEY), + row.getString(JdbcUtil.NAMESPACE_PROPERTY_VALUE)), + JdbcUtil.GET_ALL_NAMESPACE_PROPERTIES_SQL, + catalogName, + namespaceName); return ImmutableMap.builder().putAll(entries).build(); } private boolean insertProperties(Namespace namespace, Map properties) { String namespaceName = JdbcUtil.namespaceToString(namespace); - String[] args = properties.entrySet().stream() - .flatMap(entry -> Stream.of(catalogName, namespaceName, entry.getKey(), entry.getValue())) - .toArray(String[]::new); + String[] args = + properties.entrySet().stream() + .flatMap( + entry -> Stream.of(catalogName, namespaceName, entry.getKey(), entry.getValue())) + .toArray(String[]::new); int insertedRecords = execute(JdbcUtil.insertPropertiesStatement(properties.size()), args); @@ -528,15 +583,17 @@ private boolean insertProperties(Namespace namespace, Map proper return true; } - throw new IllegalStateException(String.format( - "Failed to insert: %d of %d succeeded", insertedRecords, properties.size())); + throw new IllegalStateException( + String.format("Failed to insert: %d of %d succeeded", 
insertedRecords, properties.size())); } private boolean updateProperties(Namespace namespace, Map properties) { String namespaceName = JdbcUtil.namespaceToString(namespace); - Stream caseArgs = properties.entrySet().stream() - .flatMap(entry -> Stream.of(entry.getKey(), entry.getValue())); - Stream whereArgs = Stream.concat(Stream.of(catalogName, namespaceName), properties.keySet().stream()); + Stream caseArgs = + properties.entrySet().stream() + .flatMap(entry -> Stream.of(entry.getKey(), entry.getValue())); + Stream whereArgs = + Stream.concat(Stream.of(catalogName, namespaceName), properties.keySet().stream()); String[] args = Stream.concat(caseArgs, whereArgs).toArray(String[]::new); @@ -546,15 +603,15 @@ private boolean updateProperties(Namespace namespace, Map proper return true; } - throw new IllegalStateException(String.format( - "Failed to update: %d of %d succeeded", updatedRecords, properties.size())); + throw new IllegalStateException( + String.format("Failed to update: %d of %d succeeded", updatedRecords, properties.size())); } private boolean deleteProperties(Namespace namespace, Set properties) { String namespaceName = JdbcUtil.namespaceToString(namespace); - String[] args = Stream - .concat(Stream.of(catalogName, namespaceName), properties.stream()) - .toArray(String[]::new); + String[] args = + Stream.concat(Stream.of(catalogName, namespaceName), properties.stream()) + .toArray(String[]::new); return execute(JdbcUtil.deletePropertiesStatement(properties), args) > 0; } diff --git a/core/src/main/java/org/apache/iceberg/jdbc/JdbcClientPool.java b/core/src/main/java/org/apache/iceberg/jdbc/JdbcClientPool.java index ba5edf55583b..daa04908f41e 100644 --- a/core/src/main/java/org/apache/iceberg/jdbc/JdbcClientPool.java +++ b/core/src/main/java/org/apache/iceberg/jdbc/JdbcClientPool.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.jdbc; import java.sql.Connection; @@ -34,8 +33,13 @@ class JdbcClientPool extends ClientPoolImpl { private final Map properties; JdbcClientPool(String dbUrl, Map props) { - this(Integer.parseInt(props.getOrDefault(CatalogProperties.CLIENT_POOL_SIZE, - String.valueOf(CatalogProperties.CLIENT_POOL_SIZE_DEFAULT))), dbUrl, props); + this( + Integer.parseInt( + props.getOrDefault( + CatalogProperties.CLIENT_POOL_SIZE, + String.valueOf(CatalogProperties.CLIENT_POOL_SIZE_DEFAULT))), + dbUrl, + props); } JdbcClientPool(int poolSize, String dbUrl, Map props) { diff --git a/core/src/main/java/org/apache/iceberg/jdbc/JdbcTableOperations.java b/core/src/main/java/org/apache/iceberg/jdbc/JdbcTableOperations.java index 28d516377c1f..3af9026f5375 100644 --- a/core/src/main/java/org/apache/iceberg/jdbc/JdbcTableOperations.java +++ b/core/src/main/java/org/apache/iceberg/jdbc/JdbcTableOperations.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.jdbc; import java.sql.DataTruncation; @@ -50,8 +49,11 @@ class JdbcTableOperations extends BaseMetastoreTableOperations { private final FileIO fileIO; private final JdbcClientPool connections; - protected JdbcTableOperations(JdbcClientPool dbConnPool, FileIO fileIO, String catalogName, - TableIdentifier tableIdentifier) { + protected JdbcTableOperations( + JdbcClientPool dbConnPool, + FileIO fileIO, + String catalogName, + TableIdentifier tableIdentifier) { this.catalogName = catalogName; this.tableIdentifier = tableIdentifier; this.fileIO = fileIO; @@ -69,12 +71,14 @@ public void doRefresh() { throw new UncheckedInterruptedException(e, "Interrupted during refresh"); } catch (SQLException e) { // SQL exception happened when getting table from catalog - throw new UncheckedSQLException(e, "Failed to get table %s from catalog %s", tableIdentifier, catalogName); + throw new UncheckedSQLException( + e, "Failed to get table %s from catalog %s", tableIdentifier, catalogName); } if (table.isEmpty()) { if (currentMetadataLocation() != null) { - throw new NoSuchTableException("Failed to load table %s from catalog %s: dropped by another process", + throw new NoSuchTableException( + "Failed to load table %s from catalog %s: dropped by another process", tableIdentifier, catalogName); } else { this.disableRefresh(); @@ -83,7 +87,9 @@ public void doRefresh() { } String newMetadataLocation = table.get(JdbcUtil.METADATA_LOCATION); - Preconditions.checkState(newMetadataLocation != null, "Invalid table %s: metadata location is null", + Preconditions.checkState( + newMetadataLocation != null, + "Invalid table %s: metadata location is null", tableIdentifier); refreshFromMetadataLocation(newMetadataLocation); } @@ -136,43 +142,49 @@ public void doCommit(TableMetadata base, TableMetadata metadata) { private void updateTable(String newMetadataLocation, String oldMetadataLocation) throws SQLException, InterruptedException { - int updatedRecords = connections.run(conn -> { - try (PreparedStatement sql = conn.prepareStatement(JdbcUtil.DO_COMMIT_SQL)) { - // UPDATE - sql.setString(1, newMetadataLocation); - sql.setString(2, oldMetadataLocation); - // WHERE - sql.setString(3, catalogName); - sql.setString(4, JdbcUtil.namespaceToString(tableIdentifier.namespace())); - sql.setString(5, tableIdentifier.name()); - sql.setString(6, oldMetadataLocation); - return sql.executeUpdate(); - } - }); + int updatedRecords = + connections.run( + conn -> { + try (PreparedStatement sql = conn.prepareStatement(JdbcUtil.DO_COMMIT_SQL)) { + // UPDATE + sql.setString(1, newMetadataLocation); + sql.setString(2, oldMetadataLocation); + // WHERE + sql.setString(3, catalogName); + sql.setString(4, JdbcUtil.namespaceToString(tableIdentifier.namespace())); + sql.setString(5, tableIdentifier.name()); + sql.setString(6, oldMetadataLocation); + return sql.executeUpdate(); + } + }); if (updatedRecords == 1) { LOG.debug("Successfully committed to existing table: {}", tableIdentifier); } else { - throw new CommitFailedException("Failed to update table %s from catalog %s", tableIdentifier, catalogName); + throw new CommitFailedException( + "Failed to update table %s from catalog %s", tableIdentifier, catalogName); } - } private void createTable(String newMetadataLocation) throws SQLException, InterruptedException { - int insertRecord = connections.run(conn -> { - try (PreparedStatement sql = conn.prepareStatement(JdbcUtil.DO_COMMIT_CREATE_TABLE_SQL)) { - sql.setString(1, catalogName); - sql.setString(2, 
JdbcUtil.namespaceToString(tableIdentifier.namespace())); - sql.setString(3, tableIdentifier.name()); - sql.setString(4, newMetadataLocation); - return sql.executeUpdate(); - } - }); + int insertRecord = + connections.run( + conn -> { + try (PreparedStatement sql = + conn.prepareStatement(JdbcUtil.DO_COMMIT_CREATE_TABLE_SQL)) { + sql.setString(1, catalogName); + sql.setString(2, JdbcUtil.namespaceToString(tableIdentifier.namespace())); + sql.setString(3, tableIdentifier.name()); + sql.setString(4, newMetadataLocation); + return sql.executeUpdate(); + } + }); if (insertRecord == 1) { LOG.debug("Successfully committed to new table: {}", tableIdentifier); } else { - throw new CommitFailedException("Failed to create table %s in catalog %s", tableIdentifier, catalogName); + throw new CommitFailedException( + "Failed to create table %s in catalog %s", tableIdentifier, catalogName); } } @@ -181,7 +193,8 @@ private void validateMetadataLocation(Map table, TableMetadata b String baseMetadataLocation = base != null ? base.metadataFileLocation() : null; if (!Objects.equals(baseMetadataLocation, catalogMetadataLocation)) { - throw new CommitFailedException("Cannot commit %s: metadata location %s has changed from %s", + throw new CommitFailedException( + "Cannot commit %s: metadata location %s has changed from %s", tableIdentifier, baseMetadataLocation, catalogMetadataLocation); } } @@ -196,29 +209,32 @@ protected String tableName() { return tableIdentifier.toString(); } - private Map getTable() throws UncheckedSQLException, SQLException, InterruptedException { - return connections.run(conn -> { - Map table = Maps.newHashMap(); - - try (PreparedStatement sql = conn.prepareStatement(JdbcUtil.GET_TABLE_SQL)) { - sql.setString(1, catalogName); - sql.setString(2, JdbcUtil.namespaceToString(tableIdentifier.namespace())); - sql.setString(3, tableIdentifier.name()); - ResultSet rs = sql.executeQuery(); - - if (rs.next()) { - table.put(JdbcUtil.CATALOG_NAME, rs.getString(JdbcUtil.CATALOG_NAME)); - table.put(JdbcUtil.TABLE_NAMESPACE, rs.getString(JdbcUtil.TABLE_NAMESPACE)); - table.put(JdbcUtil.TABLE_NAME, rs.getString(JdbcUtil.TABLE_NAME)); - table.put(JdbcUtil.METADATA_LOCATION, rs.getString(JdbcUtil.METADATA_LOCATION)); - table.put(JdbcUtil.PREVIOUS_METADATA_LOCATION, rs.getString(JdbcUtil.PREVIOUS_METADATA_LOCATION)); - } - - rs.close(); - } - - return table; - }); + private Map getTable() + throws UncheckedSQLException, SQLException, InterruptedException { + return connections.run( + conn -> { + Map table = Maps.newHashMap(); + + try (PreparedStatement sql = conn.prepareStatement(JdbcUtil.GET_TABLE_SQL)) { + sql.setString(1, catalogName); + sql.setString(2, JdbcUtil.namespaceToString(tableIdentifier.namespace())); + sql.setString(3, tableIdentifier.name()); + ResultSet rs = sql.executeQuery(); + + if (rs.next()) { + table.put(JdbcUtil.CATALOG_NAME, rs.getString(JdbcUtil.CATALOG_NAME)); + table.put(JdbcUtil.TABLE_NAMESPACE, rs.getString(JdbcUtil.TABLE_NAMESPACE)); + table.put(JdbcUtil.TABLE_NAME, rs.getString(JdbcUtil.TABLE_NAME)); + table.put(JdbcUtil.METADATA_LOCATION, rs.getString(JdbcUtil.METADATA_LOCATION)); + table.put( + JdbcUtil.PREVIOUS_METADATA_LOCATION, + rs.getString(JdbcUtil.PREVIOUS_METADATA_LOCATION)); + } + + rs.close(); + } + + return table; + }); } - } diff --git a/core/src/main/java/org/apache/iceberg/jdbc/JdbcUtil.java b/core/src/main/java/org/apache/iceberg/jdbc/JdbcUtil.java index 395e38bc6d9f..5799fcb9c151 100644 --- a/core/src/main/java/org/apache/iceberg/jdbc/JdbcUtil.java +++ 
b/core/src/main/java/org/apache/iceberg/jdbc/JdbcUtil.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.jdbc; import java.util.Collections; @@ -39,43 +38,131 @@ final class JdbcUtil { static final String METADATA_LOCATION = "metadata_location"; static final String PREVIOUS_METADATA_LOCATION = "previous_metadata_location"; - static final String DO_COMMIT_SQL = "UPDATE " + CATALOG_TABLE_NAME + - " SET " + METADATA_LOCATION + " = ? , " + PREVIOUS_METADATA_LOCATION + " = ? " + - " WHERE " + CATALOG_NAME + " = ? AND " + - TABLE_NAMESPACE + " = ? AND " + - TABLE_NAME + " = ? AND " + - METADATA_LOCATION + " = ?"; + static final String DO_COMMIT_SQL = + "UPDATE " + + CATALOG_TABLE_NAME + + " SET " + + METADATA_LOCATION + + " = ? , " + + PREVIOUS_METADATA_LOCATION + + " = ? " + + " WHERE " + + CATALOG_NAME + + " = ? AND " + + TABLE_NAMESPACE + + " = ? AND " + + TABLE_NAME + + " = ? AND " + + METADATA_LOCATION + + " = ?"; static final String CREATE_CATALOG_TABLE = - "CREATE TABLE " + CATALOG_TABLE_NAME + - "(" + - CATALOG_NAME + " VARCHAR(255) NOT NULL," + - TABLE_NAMESPACE + " VARCHAR(255) NOT NULL," + - TABLE_NAME + " VARCHAR(255) NOT NULL," + - METADATA_LOCATION + " VARCHAR(5500)," + - PREVIOUS_METADATA_LOCATION + " VARCHAR(5500)," + - "PRIMARY KEY (" + CATALOG_NAME + ", " + TABLE_NAMESPACE + ", " + TABLE_NAME + ")" + - ")"; - static final String GET_TABLE_SQL = "SELECT * FROM " + CATALOG_TABLE_NAME + - " WHERE " + CATALOG_NAME + " = ? AND " + TABLE_NAMESPACE + " = ? AND " + TABLE_NAME + " = ? "; - static final String LIST_TABLES_SQL = "SELECT * FROM " + CATALOG_TABLE_NAME + - " WHERE " + CATALOG_NAME + " = ? AND " + TABLE_NAMESPACE + " = ?"; - static final String RENAME_TABLE_SQL = "UPDATE " + CATALOG_TABLE_NAME + - " SET " + TABLE_NAMESPACE + " = ? , " + TABLE_NAME + " = ? " + - " WHERE " + CATALOG_NAME + " = ? AND " + TABLE_NAMESPACE + " = ? AND " + TABLE_NAME + " = ? "; - static final String DROP_TABLE_SQL = "DELETE FROM " + CATALOG_TABLE_NAME + - " WHERE " + CATALOG_NAME + " = ? AND " + TABLE_NAMESPACE + " = ? AND " + TABLE_NAME + " = ? "; - static final String GET_NAMESPACE_SQL = "SELECT " + TABLE_NAMESPACE + " FROM " + CATALOG_TABLE_NAME + - " WHERE " + CATALOG_NAME + " = ? AND " + TABLE_NAMESPACE + " LIKE ? LIMIT 1"; - static final String LIST_NAMESPACES_SQL = "SELECT DISTINCT " + TABLE_NAMESPACE + - " FROM " + CATALOG_TABLE_NAME + - " WHERE " + CATALOG_NAME + " = ? AND " + TABLE_NAMESPACE + " LIKE ?"; - static final String LIST_ALL_TABLE_NAMESPACES_SQL = "SELECT DISTINCT " + TABLE_NAMESPACE + - " FROM " + CATALOG_TABLE_NAME + - " WHERE " + CATALOG_NAME + " = ?"; - static final String DO_COMMIT_CREATE_TABLE_SQL = "INSERT INTO " + CATALOG_TABLE_NAME + - " (" + CATALOG_NAME + ", " + TABLE_NAMESPACE + ", " + TABLE_NAME + - ", " + METADATA_LOCATION + ", " + PREVIOUS_METADATA_LOCATION + ") " + - " VALUES (?,?,?,?,null)"; + "CREATE TABLE " + + CATALOG_TABLE_NAME + + "(" + + CATALOG_NAME + + " VARCHAR(255) NOT NULL," + + TABLE_NAMESPACE + + " VARCHAR(255) NOT NULL," + + TABLE_NAME + + " VARCHAR(255) NOT NULL," + + METADATA_LOCATION + + " VARCHAR(5500)," + + PREVIOUS_METADATA_LOCATION + + " VARCHAR(5500)," + + "PRIMARY KEY (" + + CATALOG_NAME + + ", " + + TABLE_NAMESPACE + + ", " + + TABLE_NAME + + ")" + + ")"; + static final String GET_TABLE_SQL = + "SELECT * FROM " + + CATALOG_TABLE_NAME + + " WHERE " + + CATALOG_NAME + + " = ? AND " + + TABLE_NAMESPACE + + " = ? AND " + + TABLE_NAME + + " = ? 
"; + static final String LIST_TABLES_SQL = + "SELECT * FROM " + + CATALOG_TABLE_NAME + + " WHERE " + + CATALOG_NAME + + " = ? AND " + + TABLE_NAMESPACE + + " = ?"; + static final String RENAME_TABLE_SQL = + "UPDATE " + + CATALOG_TABLE_NAME + + " SET " + + TABLE_NAMESPACE + + " = ? , " + + TABLE_NAME + + " = ? " + + " WHERE " + + CATALOG_NAME + + " = ? AND " + + TABLE_NAMESPACE + + " = ? AND " + + TABLE_NAME + + " = ? "; + static final String DROP_TABLE_SQL = + "DELETE FROM " + + CATALOG_TABLE_NAME + + " WHERE " + + CATALOG_NAME + + " = ? AND " + + TABLE_NAMESPACE + + " = ? AND " + + TABLE_NAME + + " = ? "; + static final String GET_NAMESPACE_SQL = + "SELECT " + + TABLE_NAMESPACE + + " FROM " + + CATALOG_TABLE_NAME + + " WHERE " + + CATALOG_NAME + + " = ? AND " + + TABLE_NAMESPACE + + " LIKE ? LIMIT 1"; + static final String LIST_NAMESPACES_SQL = + "SELECT DISTINCT " + + TABLE_NAMESPACE + + " FROM " + + CATALOG_TABLE_NAME + + " WHERE " + + CATALOG_NAME + + " = ? AND " + + TABLE_NAMESPACE + + " LIKE ?"; + static final String LIST_ALL_TABLE_NAMESPACES_SQL = + "SELECT DISTINCT " + + TABLE_NAMESPACE + + " FROM " + + CATALOG_TABLE_NAME + + " WHERE " + + CATALOG_NAME + + " = ?"; + static final String DO_COMMIT_CREATE_TABLE_SQL = + "INSERT INTO " + + CATALOG_TABLE_NAME + + " (" + + CATALOG_NAME + + ", " + + TABLE_NAMESPACE + + ", " + + TABLE_NAME + + ", " + + METADATA_LOCATION + + ", " + + PREVIOUS_METADATA_LOCATION + + ") " + + " VALUES (?,?,?,?,null)"; // Catalog Namespace Properties static final String NAMESPACE_PROPERTIES_TABLE_NAME = "iceberg_namespace_properties"; @@ -84,41 +171,99 @@ final class JdbcUtil { static final String NAMESPACE_PROPERTY_VALUE = "property_value"; static final String CREATE_NAMESPACE_PROPERTIES_TABLE = - "CREATE TABLE " + NAMESPACE_PROPERTIES_TABLE_NAME + - "(" + - CATALOG_NAME + " VARCHAR(255) NOT NULL," + - NAMESPACE_NAME + " VARCHAR(255) NOT NULL," + - NAMESPACE_PROPERTY_KEY + " VARCHAR(5500)," + - NAMESPACE_PROPERTY_VALUE + " VARCHAR(5500)," + - "PRIMARY KEY (" + CATALOG_NAME + ", " + NAMESPACE_NAME + ", " + NAMESPACE_PROPERTY_KEY + ")" + - ")"; - static final String GET_NAMESPACE_PROPERTIES_SQL = "SELECT " + NAMESPACE_NAME + - " FROM " + NAMESPACE_PROPERTIES_TABLE_NAME + - " WHERE " + CATALOG_NAME + " = ? AND " + NAMESPACE_NAME + " LIKE ? LIMIT 1"; - static final String INSERT_NAMESPACE_PROPERTIES_SQL = "INSERT INTO " + NAMESPACE_PROPERTIES_TABLE_NAME + - " (" + CATALOG_NAME + ", " + NAMESPACE_NAME + ", " + NAMESPACE_PROPERTY_KEY + - ", " + NAMESPACE_PROPERTY_VALUE + ") VALUES "; + "CREATE TABLE " + + NAMESPACE_PROPERTIES_TABLE_NAME + + "(" + + CATALOG_NAME + + " VARCHAR(255) NOT NULL," + + NAMESPACE_NAME + + " VARCHAR(255) NOT NULL," + + NAMESPACE_PROPERTY_KEY + + " VARCHAR(5500)," + + NAMESPACE_PROPERTY_VALUE + + " VARCHAR(5500)," + + "PRIMARY KEY (" + + CATALOG_NAME + + ", " + + NAMESPACE_NAME + + ", " + + NAMESPACE_PROPERTY_KEY + + ")" + + ")"; + static final String GET_NAMESPACE_PROPERTIES_SQL = + "SELECT " + + NAMESPACE_NAME + + " FROM " + + NAMESPACE_PROPERTIES_TABLE_NAME + + " WHERE " + + CATALOG_NAME + + " = ? AND " + + NAMESPACE_NAME + + " LIKE ? 
LIMIT 1"; + static final String INSERT_NAMESPACE_PROPERTIES_SQL = + "INSERT INTO " + + NAMESPACE_PROPERTIES_TABLE_NAME + + " (" + + CATALOG_NAME + + ", " + + NAMESPACE_NAME + + ", " + + NAMESPACE_PROPERTY_KEY + + ", " + + NAMESPACE_PROPERTY_VALUE + + ") VALUES "; static final String INSERT_PROPERTIES_VALUES_BASE = "(?,?,?,?)"; - static final String GET_ALL_NAMESPACE_PROPERTIES_SQL = "SELECT * " + - " FROM " + NAMESPACE_PROPERTIES_TABLE_NAME + " WHERE " + CATALOG_NAME + " = ? AND " + NAMESPACE_NAME + " = ? "; - static final String DELETE_NAMESPACE_PROPERTIES_SQL = "DELETE FROM " + NAMESPACE_PROPERTIES_TABLE_NAME + - " WHERE " + CATALOG_NAME + " = ? AND " + NAMESPACE_NAME + " = ? AND " + NAMESPACE_PROPERTY_KEY + " IN "; + static final String GET_ALL_NAMESPACE_PROPERTIES_SQL = + "SELECT * " + + " FROM " + + NAMESPACE_PROPERTIES_TABLE_NAME + + " WHERE " + + CATALOG_NAME + + " = ? AND " + + NAMESPACE_NAME + + " = ? "; + static final String DELETE_NAMESPACE_PROPERTIES_SQL = + "DELETE FROM " + + NAMESPACE_PROPERTIES_TABLE_NAME + + " WHERE " + + CATALOG_NAME + + " = ? AND " + + NAMESPACE_NAME + + " = ? AND " + + NAMESPACE_PROPERTY_KEY + + " IN "; static final String DELETE_ALL_NAMESPACE_PROPERTIES_SQL = - "DELETE FROM " + NAMESPACE_PROPERTIES_TABLE_NAME + - " WHERE " + CATALOG_NAME + " = ? AND " + NAMESPACE_NAME + " = ?"; - static final String LIST_PROPERTY_NAMESPACES_SQL = "SELECT DISTINCT " + NAMESPACE_NAME + - " FROM " + NAMESPACE_PROPERTIES_TABLE_NAME + - " WHERE " + CATALOG_NAME + " = ? AND " + NAMESPACE_NAME + " LIKE ?"; - static final String LIST_ALL_PROPERTY_NAMESPACES_SQL = "SELECT DISTINCT " + NAMESPACE_NAME + - " FROM " + NAMESPACE_PROPERTIES_TABLE_NAME + - " WHERE " + CATALOG_NAME + " = ?"; + "DELETE FROM " + + NAMESPACE_PROPERTIES_TABLE_NAME + + " WHERE " + + CATALOG_NAME + + " = ? AND " + + NAMESPACE_NAME + + " = ?"; + static final String LIST_PROPERTY_NAMESPACES_SQL = + "SELECT DISTINCT " + + NAMESPACE_NAME + + " FROM " + + NAMESPACE_PROPERTIES_TABLE_NAME + + " WHERE " + + CATALOG_NAME + + " = ? 
AND " + + NAMESPACE_NAME + + " LIKE ?"; + static final String LIST_ALL_PROPERTY_NAMESPACES_SQL = + "SELECT DISTINCT " + + NAMESPACE_NAME + + " FROM " + + NAMESPACE_PROPERTIES_TABLE_NAME + + " WHERE " + + CATALOG_NAME + + " = ?"; // Utilities private static final Joiner JOINER_DOT = Joiner.on('.'); private static final Splitter SPLITTER_DOT = Splitter.on('.'); - private JdbcUtil() { - } + private JdbcUtil() {} public static Namespace stringToNamespace(String namespace) { Preconditions.checkArgument(namespace != null, "Invalid namespace %s", namespace); @@ -133,26 +278,37 @@ public static TableIdentifier stringToTableIdentifier(String tableNamespace, Str return TableIdentifier.of(JdbcUtil.stringToNamespace(tableNamespace), tableName); } - public static Properties filterAndRemovePrefix(Map properties, - String prefix) { + public static Properties filterAndRemovePrefix(Map properties, String prefix) { Properties result = new Properties(); - properties.forEach((key, value) -> { - if (key.startsWith(prefix)) { - result.put(key.substring(prefix.length()), value); - } - }); + properties.forEach( + (key, value) -> { + if (key.startsWith(prefix)) { + result.put(key.substring(prefix.length()), value); + } + }); return result; } public static String updatePropertiesStatement(int size) { - StringBuilder sqlStatement = new StringBuilder("UPDATE " + NAMESPACE_PROPERTIES_TABLE_NAME + - " SET " + NAMESPACE_PROPERTY_VALUE + " = CASE"); + StringBuilder sqlStatement = + new StringBuilder( + "UPDATE " + + NAMESPACE_PROPERTIES_TABLE_NAME + + " SET " + + NAMESPACE_PROPERTY_VALUE + + " = CASE"); for (int i = 0; i < size; i += 1) { sqlStatement.append(" WHEN " + NAMESPACE_PROPERTY_KEY + " = ? THEN ?"); } - sqlStatement.append(" END WHERE " + CATALOG_NAME + " = ? AND " + - NAMESPACE_NAME + " = ? AND " + NAMESPACE_PROPERTY_KEY + " IN "); + sqlStatement.append( + " END WHERE " + + CATALOG_NAME + + " = ? AND " + + NAMESPACE_NAME + + " = ? AND " + + NAMESPACE_PROPERTY_KEY + + " IN "); String values = String.join(",", Collections.nCopies(size, String.valueOf('?'))); sqlStatement.append("(").append(values).append(")"); diff --git a/core/src/main/java/org/apache/iceberg/jdbc/UncheckedInterruptedException.java b/core/src/main/java/org/apache/iceberg/jdbc/UncheckedInterruptedException.java index 8ab8bab51fd4..e564bd5b3a8b 100644 --- a/core/src/main/java/org/apache/iceberg/jdbc/UncheckedInterruptedException.java +++ b/core/src/main/java/org/apache/iceberg/jdbc/UncheckedInterruptedException.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.jdbc; import com.google.errorprone.annotations.FormatMethod; diff --git a/core/src/main/java/org/apache/iceberg/jdbc/UncheckedSQLException.java b/core/src/main/java/org/apache/iceberg/jdbc/UncheckedSQLException.java index 504dc376a88c..7d961ef9e91c 100644 --- a/core/src/main/java/org/apache/iceberg/jdbc/UncheckedSQLException.java +++ b/core/src/main/java/org/apache/iceberg/jdbc/UncheckedSQLException.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.jdbc; import com.google.errorprone.annotations.FormatMethod; diff --git a/core/src/main/java/org/apache/iceberg/mapping/MappedField.java b/core/src/main/java/org/apache/iceberg/mapping/MappedField.java index 21473efd1cb3..21f3029b6eb4 100644 --- a/core/src/main/java/org/apache/iceberg/mapping/MappedField.java +++ b/core/src/main/java/org/apache/iceberg/mapping/MappedField.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.mapping; import java.io.Serializable; @@ -25,9 +24,7 @@ import org.apache.iceberg.relocated.com.google.common.base.Joiner; import org.apache.iceberg.relocated.com.google.common.collect.ImmutableSet; -/** - * An immutable mapping between a field ID and a set of names. - */ +/** An immutable mapping between a field ID and a set of names. */ public class MappedField implements Serializable { public static MappedField of(Integer id, String name) { @@ -77,9 +74,9 @@ public boolean equals(Object other) { } MappedField that = (MappedField) other; - return names.equals(that.names) && - Objects.equals(id, that.id) && - Objects.equals(nestedMapping, that.nestedMapping); + return names.equals(that.names) + && Objects.equals(id, that.id) + && Objects.equals(nestedMapping, that.nestedMapping); } @Override @@ -89,7 +86,10 @@ public int hashCode() { @Override public String toString() { - return "([" + Joiner.on(", ").join(names) + "] -> " + (id != null ? id : "?") + - (nestedMapping != null ? ", " + nestedMapping + ")" : ")"); + return "([" + + Joiner.on(", ").join(names) + + "] -> " + + (id != null ? id : "?") + + (nestedMapping != null ? ", " + nestedMapping + ")" : ")"); } } diff --git a/core/src/main/java/org/apache/iceberg/mapping/MappedFields.java b/core/src/main/java/org/apache/iceberg/mapping/MappedFields.java index 1e344420f8ea..fe031e4e476f 100644 --- a/core/src/main/java/org/apache/iceberg/mapping/MappedFields.java +++ b/core/src/main/java/org/apache/iceberg/mapping/MappedFields.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.mapping; import java.io.Serializable; @@ -61,24 +60,29 @@ public int size() { private static Map indexIds(List fields) { ImmutableMap.Builder builder = ImmutableMap.builder(); - fields.forEach(field -> - field.names().forEach(name -> { - Integer id = field.id(); - if (id != null) { - builder.put(name, id); - } - })); + fields.forEach( + field -> + field + .names() + .forEach( + name -> { + Integer id = field.id(); + if (id != null) { + builder.put(name, id); + } + })); return builder.build(); } private static Map indexFields(List fields) { ImmutableMap.Builder builder = ImmutableMap.builder(); - fields.forEach(field -> { - Integer id = field.id(); - if (id != null) { - builder.put(id, field); - } - }); + fields.forEach( + field -> { + Integer id = field.id(); + if (id != null) { + builder.put(id, field); + } + }); return builder.build(); } diff --git a/core/src/main/java/org/apache/iceberg/mapping/MappingUtil.java b/core/src/main/java/org/apache/iceberg/mapping/MappingUtil.java index cfb1d296a8ad..de6ce2ad0425 100644 --- a/core/src/main/java/org/apache/iceberg/mapping/MappingUtil.java +++ b/core/src/main/java/org/apache/iceberg/mapping/MappingUtil.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.mapping; import java.util.Collection; @@ -39,13 +38,12 @@ public class MappingUtil { private static final Joiner DOT = Joiner.on('.'); - private MappingUtil() { - } + private MappingUtil() {} /** * Create a name-based mapping for a schema. - *

- * The mapping returned by this method will use the schema's name for each field. + * + *

The mapping returned by this method will use the schema's name for each field. * * @param schema a {@link Schema} * @return a {@link NameMapping} initialized with the schema's fields and names @@ -60,11 +58,13 @@ public static NameMapping create(Schema schema) { * @param mapping a name-based mapping * @param updates a map from field ID to updated field definitions * @param adds a map from parent field ID to nested fields to be added - * @return an updated mapping with names added to renamed fields and the mapping extended for new fields + * @return an updated mapping with names added to renamed fields and the mapping extended for new + * fields */ - public static NameMapping update(NameMapping mapping, - Map updates, - Multimap adds) { + public static NameMapping update( + NameMapping mapping, + Map updates, + Multimap adds) { return new NameMapping(visit(mapping, new UpdateMapping(updates, adds))); } @@ -80,7 +80,8 @@ private static class UpdateMapping implements Visitor private final Map updates; private final Multimap adds; - private UpdateMapping(Map updates, Multimap adds) { + private UpdateMapping( + Map updates, Multimap adds) { this.updates = updates; this.adds = adds; } @@ -94,12 +95,15 @@ public MappedFields mapping(NameMapping mapping, MappedFields result) { public MappedFields fields(MappedFields fields, List fieldResults) { ImmutableMap.Builder builder = ImmutableMap.builder(); fieldResults.stream() - .map(MappedField::id).filter(Objects::nonNull) - .map(updates::get).filter(Objects::nonNull) + .map(MappedField::id) + .filter(Objects::nonNull) + .map(updates::get) + .filter(Objects::nonNull) .forEach(field -> builder.put(field.name(), field.fieldId())); Map updateAssignments = builder.build(); - return MappedFields.of(Lists.transform(fieldResults, field -> removeReassignedNames(field, updateAssignments))); + return MappedFields.of( + Lists.transform(fieldResults, field -> removeReassignedNames(field, updateAssignments))); } @Override @@ -136,7 +140,8 @@ private MappedFields addNewFields(MappedFields mapping, int parentId) { fieldsToAdd.forEach(field -> builder.put(field.name(), field.fieldId())); Map assignments = builder.build(); - // create a copy of fields that can be updated (append new fields, replace existing for reassignment) + // create a copy of fields that can be updated (append new fields, replace existing for + // reassignment) List fields = Lists.newArrayList(); for (MappedField field : mapping.fields()) { fields.add(removeReassignedNames(field, assignments)); @@ -147,7 +152,8 @@ private MappedFields addNewFields(MappedFields mapping, int parentId) { return MappedFields.of(fields); } - private static MappedField removeReassignedNames(MappedField field, Map assignments) { + private static MappedField removeReassignedNames( + MappedField field, Map assignments) { MappedField newField = field; for (String name : field.names()) { Integer assignedId = assignments.get(name); @@ -159,11 +165,13 @@ private static MappedField removeReassignedNames(MappedField field, Map, Map> { + private static class IndexByName + implements Visitor, Map> { static final IndexByName INSTANCE = new IndexByName(); @Override @@ -172,7 +180,8 @@ public Map mapping(NameMapping mapping, Map fields(MappedFields fields, List> fieldResults) { + public Map fields( + MappedFields fields, List> fieldResults) { // merge the results of each field ImmutableMap.Builder builder = ImmutableMap.builder(); for (Map results : fieldResults) { @@ -202,22 +211,27 @@ public Map field(MappedField field, Map, Map> { 
+ private static class IndexById + implements Visitor, Map> { private final Map result = Maps.newHashMap(); @Override - public Map mapping(NameMapping mapping, Map fieldsResult) { + public Map mapping( + NameMapping mapping, Map fieldsResult) { return fieldsResult; } @Override - public Map fields(MappedFields fields, List> fieldResults) { + public Map fields( + MappedFields fields, List> fieldResults) { return result; } @Override - public Map field(MappedField field, Map fieldResult) { - Preconditions.checkState(!result.containsKey(field.id()), "Invalid mapping: ID %s is not unique", field.id()); + public Map field( + MappedField field, Map fieldResult) { + Preconditions.checkState( + !result.containsKey(field.id()), "Invalid mapping: ID %s is not unique", field.id()); result.put(field.id(), field); return result; } @@ -225,7 +239,9 @@ public Map field(MappedField field, Map { S mapping(NameMapping mapping, S result); + S fields(MappedFields fields, List fieldResults); + T field(MappedField field, S fieldResult); } @@ -249,8 +265,7 @@ private static S visit(MappedFields mapping, Visitor visitor) { private static class CreateMapping extends TypeUtil.SchemaVisitor { private static final CreateMapping INSTANCE = new CreateMapping(); - private CreateMapping() { - } + private CreateMapping() {} @Override public MappedFields schema(Schema schema, MappedFields structResult) { @@ -284,8 +299,7 @@ public MappedFields list(Types.ListType list, MappedFields elementResult) { public MappedFields map(Types.MapType map, MappedFields keyResult, MappedFields valueResult) { return MappedFields.of( MappedField.of(map.keyId(), "key", keyResult), - MappedField.of(map.valueId(), "value", valueResult) - ); + MappedField.of(map.valueId(), "value", valueResult)); } @Override diff --git a/core/src/main/java/org/apache/iceberg/mapping/NameMapping.java b/core/src/main/java/org/apache/iceberg/mapping/NameMapping.java index 015771f007a0..aa5f5f3cdaca 100644 --- a/core/src/main/java/org/apache/iceberg/mapping/NameMapping.java +++ b/core/src/main/java/org/apache/iceberg/mapping/NameMapping.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.mapping; import java.io.Serializable; @@ -25,9 +24,7 @@ import org.apache.iceberg.relocated.com.google.common.base.Joiner; import org.apache.iceberg.relocated.com.google.common.collect.ImmutableList; -/** - * Represents a mapping from external schema names to Iceberg type IDs. - */ +/** Represents a mapping from external schema names to Iceberg type IDs. */ public class NameMapping implements Serializable { private static final Joiner DOT = Joiner.on('.'); diff --git a/core/src/main/java/org/apache/iceberg/mapping/NameMappingParser.java b/core/src/main/java/org/apache/iceberg/mapping/NameMappingParser.java index 5eaef84abb10..f5bce333fcd2 100644 --- a/core/src/main/java/org/apache/iceberg/mapping/NameMappingParser.java +++ b/core/src/main/java/org/apache/iceberg/mapping/NameMappingParser.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.mapping; import com.fasterxml.jackson.core.JsonGenerator; @@ -33,6 +32,7 @@ /** * Parses external name mappings from a JSON representation. + * *

  * [ { "field-id": 1, "names": ["id", "record_id"] },
  *   { "field-id": 2, "names": ["data"] },
@@ -44,8 +44,7 @@
  */
 public class NameMappingParser {
 
-  private NameMappingParser() {
-  }
+  private NameMappingParser() {}
 
   private static final String FIELD_ID = "field-id";
   private static final String NAMES = "names";
@@ -120,8 +119,10 @@ private static MappedFields fieldsFromJson(JsonNode node) {
   }
 
   private static MappedField fieldFromJson(JsonNode node) {
-    Preconditions.checkArgument(node != null && !node.isNull() && node.isObject(),
-        "Cannot parse non-object mapping field: %s", node);
+    Preconditions.checkArgument(
+        node != null && !node.isNull() && node.isObject(),
+        "Cannot parse non-object mapping field: %s",
+        node);
 
     Integer id = JsonUtil.getIntOrNull(FIELD_ID, node);
 
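
For context, a minimal sketch (not part of this patch) of how the mapping classes touched above fit together: MappingUtil.create builds a name-based mapping from a schema, and NameMappingParser round-trips it through the JSON form shown in the javadoc. The two-column schema and its field IDs are illustrative assumptions.

import org.apache.iceberg.Schema;
import org.apache.iceberg.mapping.MappingUtil;
import org.apache.iceberg.mapping.NameMapping;
import org.apache.iceberg.mapping.NameMappingParser;
import org.apache.iceberg.types.Types;

public class NameMappingExample {
  public static void main(String[] args) {
    // Hypothetical two-column schema with explicit field IDs.
    Schema schema =
        new Schema(
            Types.NestedField.required(1, "id", Types.LongType.get()),
            Types.NestedField.optional(2, "data", Types.StringType.get()));

    // Build a name-based mapping from the schema names, then round-trip it as JSON.
    NameMapping mapping = MappingUtil.create(schema);
    String json = NameMappingParser.toJson(mapping);
    NameMapping parsed = NameMappingParser.fromJson(json);

    // Prints something like: [{"field-id":1,"names":["id"]},{"field-id":2,"names":["data"]}]
    System.out.println(json);
    System.out.println(parsed);
  }
}
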
diff --git a/core/src/main/java/org/apache/iceberg/puffin/Blob.java b/core/src/main/java/org/apache/iceberg/puffin/Blob.java
index 57c2cd1c002b..9fe0c2520037 100644
--- a/core/src/main/java/org/apache/iceberg/puffin/Blob.java
+++ b/core/src/main/java/org/apache/iceberg/puffin/Blob.java
@@ -16,7 +16,6 @@
  * specific language governing permissions and limitations
  * under the License.
  */
-
 package org.apache.iceberg.puffin;
 
 import java.nio.ByteBuffer;
@@ -37,14 +36,22 @@ public final class Blob {
   private final Map properties;
 
   public Blob(
-      String type, List inputFields, long snapshotId, long sequenceNumber,
+      String type,
+      List inputFields,
+      long snapshotId,
+      long sequenceNumber,
       ByteBuffer blobData) {
     this(type, inputFields, snapshotId, sequenceNumber, blobData, null, ImmutableMap.of());
   }
 
   public Blob(
-      String type, List inputFields, long snapshotId, long sequenceNumber,
-      ByteBuffer blobData, @Nullable PuffinCompressionCodec requestedCompression, Map properties) {
+      String type,
+      List inputFields,
+      long snapshotId,
+      long sequenceNumber,
+      ByteBuffer blobData,
+      @Nullable PuffinCompressionCodec requestedCompression,
+      Map properties) {
     Preconditions.checkNotNull(type, "type is null");
     Preconditions.checkNotNull(inputFields, "inputFields is null");
     Preconditions.checkNotNull(blobData, "blobData is null");
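
A brief sketch (not part of this patch) of the two Blob constructors whose parameter lists are re-wrapped above; the blob type, field ID, snapshot ID, and sequence number are illustrative values.

import java.nio.ByteBuffer;
import java.nio.charset.StandardCharsets;
import java.util.Collections;
import org.apache.iceberg.puffin.Blob;
import org.apache.iceberg.puffin.PuffinCompressionCodec;

public class BlobExample {
  public static void main(String[] args) {
    ByteBuffer payload = ByteBuffer.wrap("example sketch bytes".getBytes(StandardCharsets.UTF_8));

    // Short form: no per-blob compression request and no extra properties.
    Blob simple =
        new Blob("apache-datasketches-theta-v1", Collections.singletonList(1), 42L, 7L, payload);

    // Long form: request ZSTD compression for this blob and attach a custom property.
    Blob compressed =
        new Blob(
            "apache-datasketches-theta-v1",
            Collections.singletonList(1),
            42L,
            7L,
            payload,
            PuffinCompressionCodec.ZSTD,
            Collections.singletonMap("note", "hypothetical"));

    System.out.println(simple.type() + " / " + compressed.requestedCompression());
  }
}
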
diff --git a/core/src/main/java/org/apache/iceberg/puffin/BlobMetadata.java b/core/src/main/java/org/apache/iceberg/puffin/BlobMetadata.java
index cc29dd5df92f..839baaa04031 100644
--- a/core/src/main/java/org/apache/iceberg/puffin/BlobMetadata.java
+++ b/core/src/main/java/org/apache/iceberg/puffin/BlobMetadata.java
@@ -16,7 +16,6 @@
  * specific language governing permissions and limitations
  * under the License.
  */
-
 package org.apache.iceberg.puffin;
 
 import java.util.List;
@@ -37,8 +36,14 @@ public class BlobMetadata {
   private final Map properties;
 
   public BlobMetadata(
-      String type, List inputFields, long snapshotId, long sequenceNumber,
-      long offset, long length, @Nullable String compressionCodec, Map properties) {
+      String type,
+      List inputFields,
+      long snapshotId,
+      long sequenceNumber,
+      long offset,
+      long length,
+      @Nullable String compressionCodec,
+      Map properties) {
     Preconditions.checkNotNull(type, "type is null");
     Preconditions.checkNotNull(inputFields, "inputFields is null");
     Preconditions.checkNotNull(properties, "properties is null");
@@ -60,30 +65,22 @@ public List inputFields() {
     return inputFields;
   }
 
-  /**
-   * ID of the Iceberg table's snapshot the blob was computed from
-   */
+  /** ID of the Iceberg table's snapshot the blob was computed from */
   public long snapshotId() {
     return snapshotId;
   }
 
-  /**
-   * Sequence number of the Iceberg table's snapshot the blob was computed from
-   */
+  /** Sequence number of the Iceberg table's snapshot the blob was computed from */
   public long sequenceNumber() {
     return sequenceNumber;
   }
 
-  /**
-   * Offset in the file
-   */
+  /** Offset in the file */
   public long offset() {
     return offset;
   }
 
-  /**
-   * Length in the file
-   */
+  /** Length in the file */
   public long length() {
     return length;
   }
diff --git a/core/src/main/java/org/apache/iceberg/puffin/FileMetadata.java b/core/src/main/java/org/apache/iceberg/puffin/FileMetadata.java
index eb33edd051bc..4c006c3ba900 100644
--- a/core/src/main/java/org/apache/iceberg/puffin/FileMetadata.java
+++ b/core/src/main/java/org/apache/iceberg/puffin/FileMetadata.java
@@ -16,7 +16,6 @@
  * specific language governing permissions and limitations
  * under the License.
  */
-
 package org.apache.iceberg.puffin;
 
 import java.util.List;
diff --git a/core/src/main/java/org/apache/iceberg/puffin/FileMetadataParser.java b/core/src/main/java/org/apache/iceberg/puffin/FileMetadataParser.java
index 40b84f682fe1..96ccd15a0ff6 100644
--- a/core/src/main/java/org/apache/iceberg/puffin/FileMetadataParser.java
+++ b/core/src/main/java/org/apache/iceberg/puffin/FileMetadataParser.java
@@ -16,7 +16,6 @@
  * specific language governing permissions and limitations
  * under the License.
  */
-
 package org.apache.iceberg.puffin;
 
 import com.fasterxml.jackson.core.JsonGenerator;
@@ -33,8 +32,7 @@
 
 public final class FileMetadataParser {
 
-  private FileMetadataParser() {
-  }
+  private FileMetadataParser() {}
 
   private static final String BLOBS = "blobs";
   private static final String PROPERTIES = "properties";
@@ -98,8 +96,10 @@ static FileMetadata fileMetadataFromJson(JsonNode json) {
 
     ImmutableList.Builder blobs = ImmutableList.builder();
     JsonNode blobsJson = json.get(BLOBS);
-    Preconditions.checkArgument(blobsJson != null && blobsJson.isArray(),
-        "Cannot parse blobs from non-array: %s", blobsJson);
+    Preconditions.checkArgument(
+        blobsJson != null && blobsJson.isArray(),
+        "Cannot parse blobs from non-array: %s",
+        blobsJson);
     for (JsonNode blobJson : blobsJson) {
       blobs.add(blobMetadataFromJson(blobJson));
     }
@@ -110,9 +110,7 @@ static FileMetadata fileMetadataFromJson(JsonNode json) {
       properties = JsonUtil.getStringMap(PROPERTIES, json);
     }
 
-    return new FileMetadata(
-        blobs.build(),
-        properties);
+    return new FileMetadata(blobs.build(), properties);
   }
 
   static void toJson(BlobMetadata blobMetadata, JsonGenerator generator) throws IOException {
@@ -160,15 +158,7 @@ static BlobMetadata blobMetadataFromJson(JsonNode json) {
       properties = JsonUtil.getStringMap(PROPERTIES, json);
     }
 
-
     return new BlobMetadata(
-        type,
-        fields,
-        snapshotId,
-        sequenceNumber,
-        offset,
-        length,
-        compressionCodec,
-        properties);
+        type, fields, snapshotId, sequenceNumber, offset, length, compressionCodec, properties);
   }
 }
diff --git a/core/src/main/java/org/apache/iceberg/puffin/Puffin.java b/core/src/main/java/org/apache/iceberg/puffin/Puffin.java
index 6cfb8be20699..251486d01e76 100644
--- a/core/src/main/java/org/apache/iceberg/puffin/Puffin.java
+++ b/core/src/main/java/org/apache/iceberg/puffin/Puffin.java
@@ -16,7 +16,6 @@
  * specific language governing permissions and limitations
  * under the License.
  */
-
 package org.apache.iceberg.puffin;
 
 import java.util.Map;
@@ -24,20 +23,15 @@
 import org.apache.iceberg.io.OutputFile;
 import org.apache.iceberg.relocated.com.google.common.collect.Maps;
 
-/**
- * Utility class for reading and writing Puffin files.
- */
+/** Utility class for reading and writing Puffin files. */
 public final class Puffin {
-  private Puffin() {
-  }
+  private Puffin() {}
 
   public static WriteBuilder write(OutputFile outputFile) {
     return new WriteBuilder(outputFile);
   }
 
-  /**
-   * A builder for {@link PuffinWriter}.
-   */
+  /** A builder for {@link PuffinWriter}. */
   public static class WriteBuilder {
     private final OutputFile outputFile;
     private final Map properties = Maps.newLinkedHashMap();
@@ -48,41 +42,32 @@ private WriteBuilder(OutputFile outputFile) {
       this.outputFile = outputFile;
     }
 
-    /**
-     * Sets file-level property to be written
-     */
+    /** Sets file-level property to be written */
     public WriteBuilder set(String property, String value) {
       properties.put(property, value);
       return this;
     }
 
-    /**
-     * Sets file-level properties to be written
-     */
+    /** Sets file-level properties to be written */
     public WriteBuilder setAll(Map props) {
       this.properties.putAll(props);
       return this;
     }
 
-    /**
-     * Sets file-level {@value StandardPuffinProperties#CREATED_BY_PROPERTY} property.
-     */
+    /** Sets file-level {@value StandardPuffinProperties#CREATED_BY_PROPERTY} property. */
     public WriteBuilder createdBy(String applicationIdentifier) {
       this.properties.put(StandardPuffinProperties.CREATED_BY_PROPERTY, applicationIdentifier);
       return this;
     }
 
-    /**
-     * Configures the writer to compress the footer.
-     */
+    /** Configures the writer to compress the footer. */
     public WriteBuilder compressFooter() {
       this.compressFooter = true;
       return this;
     }
 
     /**
-     * Configures the writer to compress the blobs.
-     * Can be overwritten by {@link Blob} attribute.
+     * Configures the writer to compress the blobs. Can be overwritten by {@link Blob} attribute.
      */
     public WriteBuilder compressBlobs(PuffinCompressionCodec compression) {
       this.defaultBlobCompression = compression;
@@ -98,9 +83,7 @@ public static ReadBuilder read(InputFile inputFile) {
     return new ReadBuilder(inputFile);
   }
 
-  /**
-   * A builder for {@link PuffinReader}.
-   */
+  /** A builder for {@link PuffinReader}. */
   public static final class ReadBuilder {
     private final InputFile inputFile;
     private Long fileSize;
@@ -110,17 +93,13 @@ private ReadBuilder(InputFile inputFile) {
       this.inputFile = inputFile;
     }
 
-    /**
-     * Passes known file size to the reader. This may improve read performance.
-     */
+    /** Passes known file size to the reader. This may improve read performance. */
     public ReadBuilder withFileSize(long size) {
       this.fileSize = size;
       return this;
     }
 
-    /**
-     * Passes known footer size to the reader. This may improve read performance.
-     */
+    /** Passes known footer size to the reader. This may improve read performance. */
     public ReadBuilder withFooterSize(long size) {
       this.footerSize = size;
       return this;
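
To illustrate the write builder above end to end, here is a minimal sketch (not part of this patch); the output file, created-by string, and blob contents are assumptions for the example.

import java.nio.ByteBuffer;
import java.util.Collections;
import org.apache.iceberg.io.OutputFile;
import org.apache.iceberg.puffin.Blob;
import org.apache.iceberg.puffin.Puffin;
import org.apache.iceberg.puffin.PuffinCompressionCodec;
import org.apache.iceberg.puffin.PuffinWriter;

public class PuffinWriteExample {
  static void writeSketch(OutputFile outputFile, ByteBuffer sketchBytes) throws Exception {
    // Configure the writer through the builder, append a single blob, and close.
    try (PuffinWriter writer =
        Puffin.write(outputFile)
            .createdBy("example-app version 0.1")
            .compressBlobs(PuffinCompressionCodec.ZSTD)
            .build()) {
      writer.add(
          new Blob(
              "apache-datasketches-theta-v1", Collections.singletonList(1), 42L, 7L, sketchBytes));
    }
  }
}
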
diff --git a/core/src/main/java/org/apache/iceberg/puffin/PuffinCompressionCodec.java b/core/src/main/java/org/apache/iceberg/puffin/PuffinCompressionCodec.java
index dc8182df4762..780e73bf699c 100644
--- a/core/src/main/java/org/apache/iceberg/puffin/PuffinCompressionCodec.java
+++ b/core/src/main/java/org/apache/iceberg/puffin/PuffinCompressionCodec.java
@@ -16,7 +16,6 @@
  * specific language governing permissions and limitations
  * under the License.
  */
-
 package org.apache.iceberg.puffin;
 
 import java.util.Map;
@@ -28,30 +27,26 @@
 import org.apache.iceberg.relocated.com.google.common.collect.Maps;
 
 public enum PuffinCompressionCodec {
-  /**
-   * No compression
-   */
+  /** No compression */
   NONE(null),
 
-  /**
-   * LZ4 single compression frame with content size present
-   */
+  /** LZ4 single compression frame with content size present */
   LZ4("lz4"),
 
-  /**
-   * Zstandard single compression frame with content size present
-   */
+  /** Zstandard single compression frame with content size present */
   ZSTD("zstd"),
-  /**/;
+/**/ ;
 
-  private static final Map BY_NAME = Stream.of(values())
-      .collect(Collectors.toMap(
-          PuffinCompressionCodec::codecName,
-          Function.identity(),
-          (a, b) -> {
-            throw new UnsupportedOperationException("Two enum instances with same name");
-          },
-          Maps::newHashMap));
+  private static final Map BY_NAME =
+      Stream.of(values())
+          .collect(
+              Collectors.toMap(
+                  PuffinCompressionCodec::codecName,
+                  Function.identity(),
+                  (a, b) -> {
+                    throw new UnsupportedOperationException("Two enum instances with same name");
+                  },
+                  Maps::newHashMap));
 
   private final String codecName;
 
diff --git a/core/src/main/java/org/apache/iceberg/puffin/PuffinFormat.java b/core/src/main/java/org/apache/iceberg/puffin/PuffinFormat.java
index c68909c52181..7a2ee61612a9 100644
--- a/core/src/main/java/org/apache/iceberg/puffin/PuffinFormat.java
+++ b/core/src/main/java/org/apache/iceberg/puffin/PuffinFormat.java
@@ -16,7 +16,6 @@
  * specific language governing permissions and limitations
  * under the License.
  */
-
 package org.apache.iceberg.puffin;
 
 import io.airlift.compress.Compressor;
@@ -35,16 +34,17 @@
 import org.apache.iceberg.util.Pair;
 
 final class PuffinFormat {
-  private PuffinFormat() {
-  }
+  private PuffinFormat() {}
 
   enum Flag {
     FOOTER_PAYLOAD_COMPRESSED(0, 0),
-    /**/;
+  /**/ ;
 
-    private static final Map, Flag> BY_BYTE_AND_BIT = Stream.of(values())
-        .collect(ImmutableMap.toImmutableMap(
-            flag -> Pair.of(flag.byteNumber(), flag.bitNumber()), Function.identity()));
+    private static final Map, Flag> BY_BYTE_AND_BIT =
+        Stream.of(values())
+            .collect(
+                ImmutableMap.toImmutableMap(
+                    flag -> Pair.of(flag.byteNumber(), flag.bitNumber()), Function.identity()));
 
     private final int byteNumber;
     private final int bitNumber;
@@ -79,7 +79,8 @@ public int bitNumber() {
   static final int FOOTER_STRUCT_PAYLOAD_SIZE_OFFSET = 0;
   static final int FOOTER_STRUCT_FLAGS_OFFSET = FOOTER_STRUCT_PAYLOAD_SIZE_OFFSET + 4;
   static final int FOOTER_STRUCT_FLAGS_LENGTH = 4;
-  static final int FOOTER_STRUCT_MAGIC_OFFSET = FOOTER_STRUCT_FLAGS_OFFSET + FOOTER_STRUCT_FLAGS_LENGTH;
+  static final int FOOTER_STRUCT_MAGIC_OFFSET =
+      FOOTER_STRUCT_FLAGS_OFFSET + FOOTER_STRUCT_FLAGS_LENGTH;
   static final int FOOTER_STRUCT_LENGTH = FOOTER_STRUCT_MAGIC_OFFSET + getMagic().length;
 
   static final PuffinCompressionCodec FOOTER_COMPRESSION_CODEC = PuffinCompressionCodec.LZ4;
@@ -96,10 +97,10 @@ static void writeIntegerLittleEndian(OutputStream outputStream, int value) throw
   }
 
   static int readIntegerLittleEndian(byte[] data, int offset) {
-    return Byte.toUnsignedInt(data[offset]) |
-        (Byte.toUnsignedInt(data[offset + 1]) << 8) |
-        (Byte.toUnsignedInt(data[offset + 2]) << 16) |
-        (Byte.toUnsignedInt(data[offset + 3]) << 24);
+    return Byte.toUnsignedInt(data[offset])
+        | (Byte.toUnsignedInt(data[offset + 1]) << 8)
+        | (Byte.toUnsignedInt(data[offset + 2]) << 16)
+        | (Byte.toUnsignedInt(data[offset + 3]) << 24);
   }
 
   static ByteBuffer compress(PuffinCompressionCodec codec, ByteBuffer input) {
@@ -107,7 +108,8 @@ static ByteBuffer compress(PuffinCompressionCodec codec, ByteBuffer input) {
       case NONE:
         return input.duplicate();
       case LZ4:
-        // TODO requires LZ4 frame compressor, e.g. https://github.com/airlift/aircompressor/pull/142
+        // TODO requires LZ4 frame compressor, e.g.
+        // https://github.com/airlift/aircompressor/pull/142
         break;
       case ZSTD:
         return compress(new ZstdCompressor(), input);
@@ -128,7 +130,8 @@ static ByteBuffer decompress(PuffinCompressionCodec codec, ByteBuffer input) {
         return input.duplicate();
 
       case LZ4:
-        // TODO requires LZ4 frame decompressor, e.g. https://github.com/airlift/aircompressor/pull/142
+        // TODO requires LZ4 frame decompressor, e.g.
+        // https://github.com/airlift/aircompressor/pull/142
         break;
 
       case ZSTD:
@@ -154,16 +157,14 @@ private static ByteBuffer decompressZstd(ByteBuffer input) {
     }
 
     byte[] decompressed =
-        new byte[Math.toIntExact(ZstdDecompressor.getDecompressedSize(inputBytes, inputOffset, inputLength))];
+        new byte
+            [Math.toIntExact(
+                ZstdDecompressor.getDecompressedSize(inputBytes, inputOffset, inputLength))];
     int decompressedLength =
-        new ZstdDecompressor().decompress(
-            inputBytes,
-            inputOffset,
-            inputLength,
-            decompressed,
-            0,
-            decompressed.length);
-    Preconditions.checkState(decompressedLength == decompressed.length, "Invalid decompressed length");
+        new ZstdDecompressor()
+            .decompress(inputBytes, inputOffset, inputLength, decompressed, 0, decompressed.length);
+    Preconditions.checkState(
+        decompressedLength == decompressed.length, "Invalid decompressed length");
     return ByteBuffer.wrap(decompressed);
   }
 }
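
A standalone illustration (not part of this patch) of the little-endian decode performed by the readIntegerLittleEndian method re-indented above: the least-significant byte comes first, so the bytes 0x78 0x56 0x34 0x12 reassemble to 0x12345678.

public class LittleEndianExample {
  static int readIntegerLittleEndian(byte[] data, int offset) {
    return Byte.toUnsignedInt(data[offset])
        | (Byte.toUnsignedInt(data[offset + 1]) << 8)
        | (Byte.toUnsignedInt(data[offset + 2]) << 16)
        | (Byte.toUnsignedInt(data[offset + 3]) << 24);
  }

  public static void main(String[] args) {
    byte[] footerTail = {0x78, 0x56, 0x34, 0x12};
    System.out.printf("0x%08X%n", readIntegerLittleEndian(footerTail, 0)); // prints 0x12345678
  }
}
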
diff --git a/core/src/main/java/org/apache/iceberg/puffin/PuffinReader.java b/core/src/main/java/org/apache/iceberg/puffin/PuffinReader.java
index 0aecfe6456e5..e30b6e1ee6ef 100644
--- a/core/src/main/java/org/apache/iceberg/puffin/PuffinReader.java
+++ b/core/src/main/java/org/apache/iceberg/puffin/PuffinReader.java
@@ -16,7 +16,6 @@
  * specific language governing permissions and limitations
  * under the License.
  */
-
 package org.apache.iceberg.puffin;
 
 import java.io.Closeable;
@@ -53,8 +52,10 @@ public class PuffinReader implements Closeable {
     this.fileSize = fileSize == null ? inputFile.getLength() : fileSize;
     this.input = inputFile.newStream();
     if (footerSize != null) {
-      Preconditions.checkArgument(0 < footerSize && footerSize <= this.fileSize - MAGIC.length,
-          "Invalid footer size: %s", footerSize);
+      Preconditions.checkArgument(
+          0 < footerSize && footerSize <= this.fileSize - MAGIC.length,
+          "Invalid footer size: %s",
+          footerSize);
       this.knownFooterSize = Math.toIntExact(footerSize);
     }
   }
@@ -79,12 +80,17 @@ public FileMetadata fileMetadata() throws IOException {
         }
       }
 
-      int footerPayloadSize = PuffinFormat.readIntegerLittleEndian(
-          footer,
-          footerStructOffset + PuffinFormat.FOOTER_STRUCT_PAYLOAD_SIZE_OFFSET);
+      int footerPayloadSize =
+          PuffinFormat.readIntegerLittleEndian(
+              footer, footerStructOffset + PuffinFormat.FOOTER_STRUCT_PAYLOAD_SIZE_OFFSET);
       Preconditions.checkState(
-          footerSize == PuffinFormat.FOOTER_START_MAGIC_LENGTH + footerPayloadSize + PuffinFormat.FOOTER_STRUCT_LENGTH,
-          "Unexpected footer payload size value %s for footer size %s", footerPayloadSize, footerSize);
+          footerSize
+              == PuffinFormat.FOOTER_START_MAGIC_LENGTH
+                  + footerPayloadSize
+                  + PuffinFormat.FOOTER_STRUCT_LENGTH,
+          "Unexpected footer payload size value %s for footer size %s",
+          footerPayloadSize,
+          footerSize);
 
       ByteBuffer footerPayload = ByteBuffer.wrap(footer, 4, footerPayloadSize);
       ByteBuffer footerJson = PuffinFormat.decompress(footerCompression, footerPayload);
@@ -97,12 +103,14 @@ private Set decodeFlags(byte[] footer, int footerStructOffset) {
     EnumSet flags = EnumSet.noneOf(Flag.class);
     for (int byteNumber = 0; byteNumber < PuffinFormat.FOOTER_STRUCT_FLAGS_LENGTH; byteNumber++) {
       int flagByte =
-          Byte.toUnsignedInt(footer[footerStructOffset + PuffinFormat.FOOTER_STRUCT_FLAGS_OFFSET + byteNumber]);
+          Byte.toUnsignedInt(
+              footer[footerStructOffset + PuffinFormat.FOOTER_STRUCT_FLAGS_OFFSET + byteNumber]);
       int bitNumber = 0;
       while (flagByte != 0) {
         if ((flagByte & 0x1) != 0) {
           Flag flag = Flag.fromBit(byteNumber, bitNumber);
-          Preconditions.checkState(flag != null, "Unknown flag byte %s and bit %s set", byteNumber, bitNumber);
+          Preconditions.checkState(
+              flag != null, "Unknown flag byte %s and bit %s set", byteNumber, bitNumber);
           flags.add(flag);
         }
         flagByte = flagByte >> 1;
@@ -119,30 +127,34 @@ public Iterable> readAll(List blobs
 
     // TODO inspect blob offsets and coalesce read regions close to each other
 
-    return () -> blobs.stream()
-        .sorted(Comparator.comparingLong(BlobMetadata::offset))
-        .map((BlobMetadata blobMetadata) -> {
-          try {
-            input.seek(blobMetadata.offset());
-            byte[] bytes = new byte[Math.toIntExact(blobMetadata.length())];
-            ByteStreams.readFully(input, bytes);
-            ByteBuffer rawData = ByteBuffer.wrap(bytes);
-            PuffinCompressionCodec codec = PuffinCompressionCodec.forName(blobMetadata.compressionCodec());
-            ByteBuffer data = PuffinFormat.decompress(codec, rawData);
-            return Pair.of(blobMetadata, data);
-          } catch (IOException e) {
-            throw new UncheckedIOException(e);
-          }
-        })
-        .iterator();
+    return () ->
+        blobs.stream()
+            .sorted(Comparator.comparingLong(BlobMetadata::offset))
+            .map(
+                (BlobMetadata blobMetadata) -> {
+                  try {
+                    input.seek(blobMetadata.offset());
+                    byte[] bytes = new byte[Math.toIntExact(blobMetadata.length())];
+                    ByteStreams.readFully(input, bytes);
+                    ByteBuffer rawData = ByteBuffer.wrap(bytes);
+                    PuffinCompressionCodec codec =
+                        PuffinCompressionCodec.forName(blobMetadata.compressionCodec());
+                    ByteBuffer data = PuffinFormat.decompress(codec, rawData);
+                    return Pair.of(blobMetadata, data);
+                  } catch (IOException e) {
+                    throw new UncheckedIOException(e);
+                  }
+                })
+            .iterator();
   }
 
   private static void checkMagic(byte[] data, int offset) {
     byte[] read = Arrays.copyOfRange(data, offset, offset + MAGIC.length);
     if (!Arrays.equals(read, MAGIC)) {
-      throw new IllegalStateException(String.format(
-          "Invalid file: expected magic at offset %s: %s, but got %s",
-          offset, Arrays.toString(MAGIC), Arrays.toString(read)));
+      throw new IllegalStateException(
+          String.format(
+              "Invalid file: expected magic at offset %s: %s, but got %s",
+              offset, Arrays.toString(MAGIC), Arrays.toString(read)));
     }
   }
 
@@ -151,13 +163,20 @@ private int footerSize() throws IOException {
       Preconditions.checkState(
           fileSize >= PuffinFormat.FOOTER_STRUCT_LENGTH,
           "Invalid file: file length %s is less than the minimal length of the footer tail %s",
-          fileSize, PuffinFormat.FOOTER_STRUCT_LENGTH);
-      byte[] footerStruct = readInput(fileSize - PuffinFormat.FOOTER_STRUCT_LENGTH, PuffinFormat.FOOTER_STRUCT_LENGTH);
+          fileSize,
+          PuffinFormat.FOOTER_STRUCT_LENGTH);
+      byte[] footerStruct =
+          readInput(
+              fileSize - PuffinFormat.FOOTER_STRUCT_LENGTH, PuffinFormat.FOOTER_STRUCT_LENGTH);
       checkMagic(footerStruct, PuffinFormat.FOOTER_STRUCT_MAGIC_OFFSET);
 
-      int footerPayloadSize = PuffinFormat.readIntegerLittleEndian(
-          footerStruct, PuffinFormat.FOOTER_STRUCT_PAYLOAD_SIZE_OFFSET);
-      knownFooterSize = PuffinFormat.FOOTER_START_MAGIC_LENGTH + footerPayloadSize + PuffinFormat.FOOTER_STRUCT_LENGTH;
+      int footerPayloadSize =
+          PuffinFormat.readIntegerLittleEndian(
+              footerStruct, PuffinFormat.FOOTER_STRUCT_PAYLOAD_SIZE_OFFSET);
+      knownFooterSize =
+          PuffinFormat.FOOTER_START_MAGIC_LENGTH
+              + footerPayloadSize
+              + PuffinFormat.FOOTER_STRUCT_LENGTH;
     }
     return knownFooterSize;
   }
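
A minimal sketch (not part of this patch) of how the reader above might be driven; the input file is assumed to be a valid Puffin file.

import java.nio.ByteBuffer;
import org.apache.iceberg.io.InputFile;
import org.apache.iceberg.puffin.BlobMetadata;
import org.apache.iceberg.puffin.FileMetadata;
import org.apache.iceberg.puffin.Puffin;
import org.apache.iceberg.puffin.PuffinReader;
import org.apache.iceberg.util.Pair;

public class PuffinReadExample {
  static void printBlobs(InputFile inputFile) throws Exception {
    try (PuffinReader reader = Puffin.read(inputFile).build()) {
      // Read the footer metadata, then stream blob payloads in offset order.
      FileMetadata fileMetadata = reader.fileMetadata();
      for (Pair<BlobMetadata, ByteBuffer> blob : reader.readAll(fileMetadata.blobs())) {
        System.out.printf("type=%s bytes=%d%n", blob.first().type(), blob.second().remaining());
      }
    }
  }
}
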
diff --git a/core/src/main/java/org/apache/iceberg/puffin/PuffinWriter.java b/core/src/main/java/org/apache/iceberg/puffin/PuffinWriter.java
index cbaaf6828ede..5728b7474885 100644
--- a/core/src/main/java/org/apache/iceberg/puffin/PuffinWriter.java
+++ b/core/src/main/java/org/apache/iceberg/puffin/PuffinWriter.java
@@ -16,7 +16,6 @@
  * specific language governing permissions and limitations
  * under the License.
  */
-
 package org.apache.iceberg.puffin;
 
 import java.io.IOException;
@@ -57,14 +56,17 @@ public class PuffinWriter implements FileAppender {
   private Optional fileSize = Optional.empty();
 
   PuffinWriter(
-      OutputFile outputFile, Map properties, boolean compressFooter,
+      OutputFile outputFile,
+      Map properties,
+      boolean compressFooter,
       PuffinCompressionCodec defaultBlobCompression) {
     Preconditions.checkNotNull(outputFile, "outputFile is null");
     Preconditions.checkNotNull(properties, "properties is null");
     Preconditions.checkNotNull(defaultBlobCompression, "defaultBlobCompression is null");
     this.outputStream = outputFile.create();
     this.properties = ImmutableMap.copyOf(properties);
-    this.footerCompression = compressFooter ? PuffinFormat.FOOTER_COMPRESSION_CODEC : PuffinCompressionCodec.NONE;
+    this.footerCompression =
+        compressFooter ? PuffinFormat.FOOTER_COMPRESSION_CODEC : PuffinCompressionCodec.NONE;
     this.defaultBlobCompression = defaultBlobCompression;
   }
 
@@ -75,12 +77,21 @@ public void add(Blob blob) {
     try {
       writeHeaderIfNeeded();
       long fileOffset = outputStream.getPos();
-      PuffinCompressionCodec codec = MoreObjects.firstNonNull(blob.requestedCompression(), defaultBlobCompression);
+      PuffinCompressionCodec codec =
+          MoreObjects.firstNonNull(blob.requestedCompression(), defaultBlobCompression);
       ByteBuffer rawData = PuffinFormat.compress(codec, blob.blobData());
       int length = rawData.remaining();
       IOUtil.writeFully(outputStream, rawData);
-      writtenBlobsMetadata.add(new BlobMetadata(blob.type(), blob.inputFields(), blob.snapshotId(),
-          blob.sequenceNumber(), fileOffset, length, codec.codecName(), blob.properties()));
+      writtenBlobsMetadata.add(
+          new BlobMetadata(
+              blob.type(),
+              blob.inputFields(),
+              blob.snapshotId(),
+              blob.sequenceNumber(),
+              fileOffset,
+              length,
+              codec.codecName(),
+              blob.properties()));
     } catch (IOException e) {
       throw new UncheckedIOException(e);
     }
@@ -127,8 +138,9 @@ public void finish() throws IOException {
 
   private void writeFooter() throws IOException {
     FileMetadata fileMetadata = new FileMetadata(writtenBlobsMetadata, properties);
-    ByteBuffer footerJson = ByteBuffer.wrap(
-        FileMetadataParser.toJson(fileMetadata, false).getBytes(StandardCharsets.UTF_8));
+    ByteBuffer footerJson =
+        ByteBuffer.wrap(
+            FileMetadataParser.toJson(fileMetadata, false).getBytes(StandardCharsets.UTF_8));
     ByteBuffer footerPayload = PuffinFormat.compress(footerCompression, footerJson);
     outputStream.write(MAGIC);
     int footerPayloadLength = footerPayload.remaining();
@@ -139,8 +151,8 @@ private void writeFooter() throws IOException {
   }
 
   private void writeFlags() throws IOException {
-    Map> flagsByByteNumber = fileFlags().stream()
-        .collect(Collectors.groupingBy(Flag::byteNumber));
+    Map> flagsByByteNumber =
+        fileFlags().stream().collect(Collectors.groupingBy(Flag::byteNumber));
     for (int byteNumber = 0; byteNumber < PuffinFormat.FOOTER_STRUCT_FLAGS_LENGTH; byteNumber++) {
       int byteFlag = 0;
       for (Flag flag : flagsByByteNumber.getOrDefault(byteNumber, ImmutableList.of())) {
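
A standalone illustration (not part of this patch) of the flag packing done in the writeFlags loop above: each flag contributes bit bitNumber of byte byteNumber, so with only FOOTER_PAYLOAD_COMPRESSED (byte 0, bit 0) set, the four footer flag bytes come out as 0x01 0x00 0x00 0x00.

public class FooterFlagsExample {
  public static void main(String[] args) {
    // Four flag bytes, matching the Puffin footer struct; only byte 0 / bit 0 is set here.
    int[] flagBytes = new int[4];
    int byteNumber = 0;
    int bitNumber = 0;
    flagBytes[byteNumber] |= 1 << bitNumber;

    for (int flagByte : flagBytes) {
      System.out.printf("0x%02X ", flagByte); // prints 0x01 0x00 0x00 0x00
    }
  }
}
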
diff --git a/core/src/main/java/org/apache/iceberg/puffin/StandardBlobTypes.java b/core/src/main/java/org/apache/iceberg/puffin/StandardBlobTypes.java
index befbe37c41c5..2be5df5f88b9 100644
--- a/core/src/main/java/org/apache/iceberg/puffin/StandardBlobTypes.java
+++ b/core/src/main/java/org/apache/iceberg/puffin/StandardBlobTypes.java
@@ -16,15 +16,14 @@
  * specific language governing permissions and limitations
  * under the License.
  */
-
 package org.apache.iceberg.puffin;
 
 public final class StandardBlobTypes {
-  private StandardBlobTypes() {
-  }
+  private StandardBlobTypes() {}
 
   /**
-   * A serialized form of a "compact" Theta sketch produced by the Apache DataSketches library
+   * A serialized form of a "compact" Theta sketch produced by the Apache DataSketches library
    */
   public static final String APACHE_DATASKETCHES_THETA_V1 = "apache-datasketches-theta-v1";
 }
diff --git a/core/src/main/java/org/apache/iceberg/puffin/StandardPuffinProperties.java b/core/src/main/java/org/apache/iceberg/puffin/StandardPuffinProperties.java
index 8d50ba9e83d4..64aecb52b3dc 100644
--- a/core/src/main/java/org/apache/iceberg/puffin/StandardPuffinProperties.java
+++ b/core/src/main/java/org/apache/iceberg/puffin/StandardPuffinProperties.java
@@ -16,16 +16,14 @@
  * specific language governing permissions and limitations
  * under the License.
  */
-
 package org.apache.iceberg.puffin;
 
 public final class StandardPuffinProperties {
-  private StandardPuffinProperties() {
-  }
+  private StandardPuffinProperties() {}
 
   /**
-   * human-readable identification of the application writing the file,
-   * along with its version. Example "Trino version 381".
+   * human-readable identification of the application writing the file, along with its version.
+   * Example "Trino version 381".
    */
   public static final String CREATED_BY_PROPERTY = "created-by";
 }
diff --git a/core/src/main/java/org/apache/iceberg/rest/CatalogHandlers.java b/core/src/main/java/org/apache/iceberg/rest/CatalogHandlers.java
index 4a96a26b1877..e260bc313d99 100644
--- a/core/src/main/java/org/apache/iceberg/rest/CatalogHandlers.java
+++ b/core/src/main/java/org/apache/iceberg/rest/CatalogHandlers.java
@@ -16,9 +16,13 @@
  * specific language governing permissions and limitations
  * under the License.
  */
-
 package org.apache.iceberg.rest;
 
+import static org.apache.iceberg.TableProperties.COMMIT_MAX_RETRY_WAIT_MS_DEFAULT;
+import static org.apache.iceberg.TableProperties.COMMIT_MIN_RETRY_WAIT_MS_DEFAULT;
+import static org.apache.iceberg.TableProperties.COMMIT_NUM_RETRIES_DEFAULT;
+import static org.apache.iceberg.TableProperties.COMMIT_TOTAL_RETRY_TIME_MS_DEFAULT;
+
 import java.time.OffsetDateTime;
 import java.time.ZoneOffset;
 import java.util.List;
@@ -60,24 +64,19 @@
 import org.apache.iceberg.rest.responses.UpdateNamespacePropertiesResponse;
 import org.apache.iceberg.util.Tasks;
 
-import static org.apache.iceberg.TableProperties.COMMIT_MAX_RETRY_WAIT_MS_DEFAULT;
-import static org.apache.iceberg.TableProperties.COMMIT_MIN_RETRY_WAIT_MS_DEFAULT;
-import static org.apache.iceberg.TableProperties.COMMIT_NUM_RETRIES_DEFAULT;
-import static org.apache.iceberg.TableProperties.COMMIT_TOTAL_RETRY_TIME_MS_DEFAULT;
-
 public class CatalogHandlers {
   private static final Schema EMPTY_SCHEMA = new Schema();
 
-  private CatalogHandlers() {
-  }
+  private CatalogHandlers() {}
 
   /**
    * Exception used to avoid retrying commits when assertions fail.
-   * 

- * When a REST assertion fails, it will throw CommitFailedException to send back to the client. But the assertion - * checks happen in the block that is retried if {@link TableOperations#commit(TableMetadata, TableMetadata)} throws - * CommitFailedException. This is used to avoid retries for assertion failures, which are unwrapped and rethrown - * outside of the commit loop. + * + *

When a REST assertion fails, it will throw CommitFailedException to send back to the client. + * But the assertion checks happen in the block that is retried if {@link + * TableOperations#commit(TableMetadata, TableMetadata)} throws CommitFailedException. This is + * used to avoid retries for assertion failures, which are unwrapped and rethrown outside of the + * commit loop. */ private static class ValidationFailureException extends RuntimeException { private final CommitFailedException wrapped; @@ -92,7 +91,8 @@ public CommitFailedException wrapped() { } } - public static ListNamespacesResponse listNamespaces(SupportsNamespaces catalog, Namespace parent) { + public static ListNamespacesResponse listNamespaces( + SupportsNamespaces catalog, Namespace parent) { List results; if (parent.isEmpty()) { results = catalog.listNamespaces(); @@ -103,7 +103,8 @@ public static ListNamespacesResponse listNamespaces(SupportsNamespaces catalog, return ListNamespacesResponse.builder().addAll(results).build(); } - public static CreateNamespaceResponse createNamespace(SupportsNamespaces catalog, CreateNamespaceRequest request) { + public static CreateNamespaceResponse createNamespace( + SupportsNamespaces catalog, CreateNamespaceRequest request) { Namespace namespace = request.namespace(); catalog.createNamespace(namespace, request.properties()); return CreateNamespaceResponse.builder() @@ -112,7 +113,8 @@ public static CreateNamespaceResponse createNamespace(SupportsNamespaces catalog .build(); } - public static GetNamespaceResponse loadNamespace(SupportsNamespaces catalog, Namespace namespace) { + public static GetNamespaceResponse loadNamespace( + SupportsNamespaces catalog, Namespace namespace) { Map properties = catalog.loadNamespaceMetadata(namespace); return GetNamespaceResponse.builder() .withNamespace(namespace) @@ -147,10 +149,10 @@ public static UpdateNamespacePropertiesResponse updateNamespaceProperties( } return UpdateNamespacePropertiesResponse.builder() - .addMissing(missing) - .addUpdated(updates.keySet()) - .addRemoved(Sets.difference(removals, missing)) - .build(); + .addMissing(missing) + .addUpdated(updates.keySet()) + .addRemoved(Sets.difference(removals, missing)) + .build(); } public static ListTablesResponse listTables(Catalog catalog, Namespace namespace) { @@ -158,7 +160,8 @@ public static ListTablesResponse listTables(Catalog catalog, Namespace namespace return ListTablesResponse.builder().addAll(idents).build(); } - public static LoadTableResponse stageTableCreate(Catalog catalog, Namespace namespace, CreateTableRequest request) { + public static LoadTableResponse stageTableCreate( + Catalog catalog, Namespace namespace, CreateTableRequest request) { request.validate(); TableIdentifier ident = TableIdentifier.of(namespace, request.name()); @@ -174,37 +177,41 @@ public static LoadTableResponse stageTableCreate(Catalog catalog, Namespace name if (request.location() != null) { location = request.location(); } else { - location = catalog.buildTable(ident, request.schema()) - .withPartitionSpec(request.spec()) - .withSortOrder(request.writeOrder()) - .withProperties(properties) - .createTransaction() - .table() - .location(); + location = + catalog + .buildTable(ident, request.schema()) + .withPartitionSpec(request.spec()) + .withSortOrder(request.writeOrder()) + .withProperties(properties) + .createTransaction() + .table() + .location(); } - TableMetadata metadata = TableMetadata.newTableMetadata( - request.schema(), - request.spec() != null ? 
request.spec() : PartitionSpec.unpartitioned(), - request.writeOrder() != null ? request.writeOrder() : SortOrder.unsorted(), - location, - properties); + TableMetadata metadata = + TableMetadata.newTableMetadata( + request.schema(), + request.spec() != null ? request.spec() : PartitionSpec.unpartitioned(), + request.writeOrder() != null ? request.writeOrder() : SortOrder.unsorted(), + location, + properties); - return LoadTableResponse.builder() - .withTableMetadata(metadata) - .build(); + return LoadTableResponse.builder().withTableMetadata(metadata).build(); } - public static LoadTableResponse createTable(Catalog catalog, Namespace namespace, CreateTableRequest request) { + public static LoadTableResponse createTable( + Catalog catalog, Namespace namespace, CreateTableRequest request) { request.validate(); TableIdentifier ident = TableIdentifier.of(namespace, request.name()); - Table table = catalog.buildTable(ident, request.schema()) - .withLocation(request.location()) - .withPartitionSpec(request.spec()) - .withSortOrder(request.writeOrder()) - .withProperties(request.properties()) - .create(); + Table table = + catalog + .buildTable(ident, request.schema()) + .withLocation(request.location()) + .withPartitionSpec(request.spec()) + .withSortOrder(request.writeOrder()) + .withProperties(request.properties()) + .create(); if (table instanceof BaseTable) { return LoadTableResponse.builder() @@ -237,16 +244,19 @@ public static LoadTableResponse loadTable(Catalog catalog, TableIdentifier ident throw new IllegalStateException("Cannot wrap catalog that does not produce BaseTable"); } - public static LoadTableResponse updateTable(Catalog catalog, TableIdentifier ident, UpdateTableRequest request) { + public static LoadTableResponse updateTable( + Catalog catalog, TableIdentifier ident, UpdateTableRequest request) { TableMetadata finalMetadata; if (isCreate(request)) { // this is a hacky way to get TableOperations for an uncommitted table - Transaction transaction = catalog.buildTable(ident, EMPTY_SCHEMA).createOrReplaceTransaction(); + Transaction transaction = + catalog.buildTable(ident, EMPTY_SCHEMA).createOrReplaceTransaction(); if (transaction instanceof BaseTransaction) { BaseTransaction baseTransaction = (BaseTransaction) transaction; finalMetadata = create(baseTransaction.underlyingOps(), request); } else { - throw new IllegalStateException("Cannot wrap catalog that does not produce BaseTransaction"); + throw new IllegalStateException( + "Cannot wrap catalog that does not produce BaseTransaction"); } } else { @@ -259,9 +269,7 @@ public static LoadTableResponse updateTable(Catalog catalog, TableIdentifier ide } } - return LoadTableResponse.builder() - .withTableMetadata(finalMetadata) - .build(); + return LoadTableResponse.builder().withTableMetadata(finalMetadata).build(); } public static void renameTable(Catalog catalog, RenameTableRequest request) { @@ -269,15 +277,21 @@ public static void renameTable(Catalog catalog, RenameTableRequest request) { } private static boolean isCreate(UpdateTableRequest request) { - boolean isCreate = request.requirements().stream() - .anyMatch(UpdateTableRequest.UpdateRequirement.AssertTableDoesNotExist.class::isInstance); + boolean isCreate = + request.requirements().stream() + .anyMatch( + UpdateTableRequest.UpdateRequirement.AssertTableDoesNotExist.class::isInstance); if (isCreate) { - List invalidRequirements = request.requirements().stream() - .filter(req -> !(req instanceof UpdateTableRequest.UpdateRequirement.AssertTableDoesNotExist)) - 
.collect(Collectors.toList()); - Preconditions.checkArgument(invalidRequirements.isEmpty(), - "Invalid create requirements: %s", invalidRequirements); + List invalidRequirements = + request.requirements().stream() + .filter( + req -> + !(req + instanceof UpdateTableRequest.UpdateRequirement.AssertTableDoesNotExist)) + .collect(Collectors.toList()); + Preconditions.checkArgument( + invalidRequirements.isEmpty(), "Invalid create requirements: %s", invalidRequirements); } return isCreate; @@ -302,34 +316,37 @@ private static TableMetadata commit(TableOperations ops, UpdateTableRequest requ Tasks.foreach(ops) .retry(COMMIT_NUM_RETRIES_DEFAULT) .exponentialBackoff( - COMMIT_MIN_RETRY_WAIT_MS_DEFAULT, COMMIT_MAX_RETRY_WAIT_MS_DEFAULT, - COMMIT_TOTAL_RETRY_TIME_MS_DEFAULT, 2.0 /* exponential */) + COMMIT_MIN_RETRY_WAIT_MS_DEFAULT, + COMMIT_MAX_RETRY_WAIT_MS_DEFAULT, + COMMIT_TOTAL_RETRY_TIME_MS_DEFAULT, + 2.0 /* exponential */) .onlyRetryOn(CommitFailedException.class) - .run(taskOps -> { - TableMetadata base = isRetry.get() ? taskOps.refresh() : taskOps.current(); - isRetry.set(true); - - // validate requirements - try { - request.requirements().forEach(requirement -> requirement.validate(base)); - } catch (CommitFailedException e) { - // wrap and rethrow outside of tasks to avoid unnecessary retry - throw new ValidationFailureException(e); - } - - // apply changes - TableMetadata.Builder metadataBuilder = TableMetadata.buildFrom(base); - request.updates().forEach(update -> update.applyTo(metadataBuilder)); - - TableMetadata updated = metadataBuilder.build(); - if (updated.changes().isEmpty()) { - // do not commit if the metadata has not changed - return; - } - - // commit - taskOps.commit(base, updated); - }); + .run( + taskOps -> { + TableMetadata base = isRetry.get() ? taskOps.refresh() : taskOps.current(); + isRetry.set(true); + + // validate requirements + try { + request.requirements().forEach(requirement -> requirement.validate(base)); + } catch (CommitFailedException e) { + // wrap and rethrow outside of tasks to avoid unnecessary retry + throw new ValidationFailureException(e); + } + + // apply changes + TableMetadata.Builder metadataBuilder = TableMetadata.buildFrom(base); + request.updates().forEach(update -> update.applyTo(metadataBuilder)); + + TableMetadata updated = metadataBuilder.build(); + if (updated.changes().isEmpty()) { + // do not commit if the metadata has not changed + return; + } + + // commit + taskOps.commit(base, updated); + }); } catch (ValidationFailureException e) { throw e.wrapped(); diff --git a/core/src/main/java/org/apache/iceberg/rest/ErrorHandlers.java b/core/src/main/java/org/apache/iceberg/rest/ErrorHandlers.java index d72ee460aae5..6ef080e5f173 100644 --- a/core/src/main/java/org/apache/iceberg/rest/ErrorHandlers.java +++ b/core/src/main/java/org/apache/iceberg/rest/ErrorHandlers.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.rest; import java.util.function.Consumer; @@ -32,13 +31,12 @@ import org.apache.iceberg.rest.responses.ErrorResponse; /** - * A set of consumers to handle errors for requests for table entities or for namespace entities, - * to throw the correct exception. + * A set of consumers to handle errors for requests for table entities or for namespace entities, to + * throw the correct exception. 
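// Condensed, illustrative restatement of the commit loop above: requirement checks surface
// CommitFailedException, but those failures must not trigger the retry configured by
// onlyRetryOn(CommitFailedException.class), so they are wrapped in a RuntimeException that the
// retry loop ignores and are unwrapped again once the loop exits. "requirements" and
// "updatedMetadata" stand in for the request state; everything else is taken from the code above.
try {
  Tasks.foreach(ops)
      .retry(COMMIT_NUM_RETRIES_DEFAULT)
      .onlyRetryOn(CommitFailedException.class)
      .run(
          taskOps -> {
            TableMetadata base = taskOps.current();
            try {
              requirements.forEach(requirement -> requirement.validate(base));
            } catch (CommitFailedException e) {
              // an assertion failure, not a concurrent commit: bypass the retry policy
              throw new ValidationFailureException(e);
            }
            taskOps.commit(base, updatedMetadata);
          });
} catch (ValidationFailureException e) {
  // rethrow the original CommitFailedException so the REST layer reports it to the client
  throw e.wrapped();
}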
*/ public class ErrorHandlers { - private ErrorHandlers() { - } + private ErrorHandlers() {} public static Consumer namespaceErrorHandler() { return baseNamespaceErrorHandler().andThen(defaultErrorHandler()); @@ -64,8 +62,8 @@ private static Consumer baseCommitErrorHandler() { } /** - * Table level error handlers. - * Should be chained wih the {@link #defaultErrorHandler}, which takes care of common cases. + * Table level error handlers. Should be chained wih the {@link #defaultErrorHandler}, which takes + * care of common cases. */ private static Consumer baseTableErrorHandler() { return error -> { @@ -83,8 +81,8 @@ private static Consumer baseTableErrorHandler() { } /** - * Request error handlers specifically for CRUD ops on namespaces. - * Should be chained wih the {@link #defaultErrorHandler}, which takes care of common cases. + * Request error handlers specifically for CRUD ops on namespaces. Should be chained wih the + * {@link #defaultErrorHandler}, which takes care of common cases. */ private static Consumer baseNamespaceErrorHandler() { return error -> { @@ -100,8 +98,8 @@ private static Consumer baseNamespaceErrorHandler() { } /** - * Request error handler that handles the common cases that are included with all responses, - * such as 400, 500, etc. + * Request error handler that handles the common cases that are included with all responses, such + * as 400, 500, etc. */ public static Consumer defaultErrorHandler() { return error -> { @@ -125,4 +123,3 @@ public static Consumer defaultErrorHandler() { }; } } - diff --git a/core/src/main/java/org/apache/iceberg/rest/HTTPClient.java b/core/src/main/java/org/apache/iceberg/rest/HTTPClient.java index a7f642bffa64..154ef2a6c212 100644 --- a/core/src/main/java/org/apache/iceberg/rest/HTTPClient.java +++ b/core/src/main/java/org/apache/iceberg/rest/HTTPClient.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.rest; import com.fasterxml.jackson.core.JsonProcessingException; @@ -51,9 +50,7 @@ import org.slf4j.Logger; import org.slf4j.LoggerFactory; -/** - * An HttpClient for usage with the REST catalog. - */ +/** An HttpClient for usage with the REST catalog. */ public class HTTPClient implements RESTClient { private static final Logger LOG = LoggerFactory.getLogger(HTTPClient.class); @@ -86,14 +83,17 @@ private static String extractResponseBodyAsString(CloseableHttpResponse response // Per the spec, the only currently defined / used "success" responses are 200 and 202. private static boolean isSuccessful(CloseableHttpResponse response) { int code = response.getCode(); - return code == HttpStatus.SC_OK || code == HttpStatus.SC_ACCEPTED || code == HttpStatus.SC_NO_CONTENT; + return code == HttpStatus.SC_OK + || code == HttpStatus.SC_ACCEPTED + || code == HttpStatus.SC_NO_CONTENT; } private static ErrorResponse buildDefaultErrorResponse(CloseableHttpResponse response) { String responseReason = response.getReasonPhrase(); String message = - responseReason != null && !responseReason.isEmpty() ? responseReason : - EnglishReasonPhraseCatalog.INSTANCE.getReason(response.getCode(), null /* ignored */); + responseReason != null && !responseReason.isEmpty() + ? 
responseReason + : EnglishReasonPhraseCatalog.INSTANCE.getReason(response.getCode(), null /* ignored */); String type = "RESTException"; return ErrorResponse.builder() .responseCode(response.getCode()) @@ -112,11 +112,14 @@ private static void throwFailure( try { errorResponse = ErrorResponseParser.fromJson(responseBody); } catch (UncheckedIOException | IllegalArgumentException e) { - // It's possible to receive a non-successful response that isn't a properly defined ErrorResponse - // without any bugs in the server implementation. So we ignore this exception and build an error + // It's possible to receive a non-successful response that isn't a properly defined + // ErrorResponse + // without any bugs in the server implementation. So we ignore this exception and build an + // error // response for the user. // - // For example, the connection could time out before every reaching the server, in which case we'll + // For example, the connection could time out before every reaching the server, in which + // case we'll // likely get a 5xx with the load balancers default 5xx response. LOG.error("Failed to parse an error response. Will create one instead.", e); } @@ -141,25 +144,34 @@ private URI buildUri(String path, Map params) { } return builder.build(); } catch (URISyntaxException e) { - throw new RESTException("Failed to create request URI from base %s, params %s", baseUri, params); + throw new RESTException( + "Failed to create request URI from base %s, params %s", baseUri, params); } } /** * Method to execute an HTTP request and process the corresponding response. * - * @param method - HTTP method, such as GET, POST, HEAD, etc. - * @param queryParams - A map of query parameters - * @param path - URL path to send the request to - * @param requestBody - Content to place in the request body - * @param responseType - Class of the Response type. Needs to have serializer registered with ObjectMapper - * @param errorHandler - Error handler delegated for HTTP responses which handles server error responses - * @param - Class type of the response for deserialization. Must be registered with the ObjectMapper. + * @param method - HTTP method, such as GET, POST, HEAD, etc. + * @param queryParams - A map of query parameters + * @param path - URL path to send the request to + * @param requestBody - Content to place in the request body + * @param responseType - Class of the Response type. Needs to have serializer registered with + * ObjectMapper + * @param errorHandler - Error handler delegated for HTTP responses which handles server error + * responses + * @param - Class type of the response for deserialization. Must be registered with the + * ObjectMapper. * @return The response entity, parsed and converted to its type T */ private T execute( - Method method, String path, Map queryParams, Object requestBody, Class responseType, - Map headers, Consumer errorHandler) { + Method method, + String path, + Map queryParams, + Object requestBody, + Class responseType, + Map headers, + Consumer errorHandler) { if (path.startsWith("/")) { throw new RESTException( "Received a malformed path for a REST request: %s. 
Paths should not start with /", path); @@ -182,7 +194,8 @@ private T execute( try (CloseableHttpResponse response = httpClient.execute(request)) { // Skip parsing the response stream for any successful request not expecting a response body - if (response.getCode() == HttpStatus.SC_NO_CONTENT || (responseType == null && isSuccessful(response))) { + if (response.getCode() == HttpStatus.SC_NO_CONTENT + || (responseType == null && isSuccessful(response))) { return null; } @@ -203,8 +216,10 @@ private T execute( return mapper.readValue(responseBody, responseType); } catch (JsonProcessingException e) { throw new RESTException( - e, "Received a success response code of %d, but failed to parse response body into %s", - response.getCode(), responseType.getSimpleName()); + e, + "Received a success response code of %d, but failed to parse response body into %s", + response.getCode(), + responseType.getSimpleName()); } } catch (IOException e) { throw new RESTException(e, "Error occurred while processing %s request", method); @@ -217,32 +232,49 @@ public void head(String path, Map headers, Consumer T get(String path, Map queryParams, Class responseType, - Map headers, Consumer errorHandler) { + public T get( + String path, + Map queryParams, + Class responseType, + Map headers, + Consumer errorHandler) { return execute(Method.GET, path, queryParams, null, responseType, headers, errorHandler); } @Override - public T post(String path, RESTRequest body, Class responseType, - Map headers, Consumer errorHandler) { + public T post( + String path, + RESTRequest body, + Class responseType, + Map headers, + Consumer errorHandler) { return execute(Method.POST, path, null, body, responseType, headers, errorHandler); } @Override - public T delete(String path, Class responseType, Map headers, - Consumer errorHandler) { + public T delete( + String path, + Class responseType, + Map headers, + Consumer errorHandler) { return execute(Method.DELETE, path, null, null, responseType, headers, errorHandler); } @Override - public T postForm(String path, Map formData, Class responseType, - Map headers, Consumer errorHandler) { + public T postForm( + String path, + Map formData, + Class responseType, + Map headers, + Consumer errorHandler) { return execute(Method.POST, path, null, formData, responseType, headers, errorHandler); } - private void addRequestHeaders(HttpUriRequest request, Map requestHeaders, String bodyMimeType) { + private void addRequestHeaders( + HttpUriRequest request, Map requestHeaders, String bodyMimeType) { request.setHeader(HttpHeaders.ACCEPT, ContentType.APPLICATION_JSON.getMimeType()); - // Many systems require that content type is set regardless and will fail, even on an empty bodied request. + // Many systems require that content type is set regardless and will fail, even on an empty + // bodied request. 
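// Illustrative sketch only: building the client above and issuing a GET with the default error
// handler. The URI and header value are placeholders, and build() is assumed to be the terminal
// call on the Builder defined later in this file; ResourcePaths and ConfigResponse come from the
// session catalog code further down in this change.
RESTClient restClient =
    HTTPClient.builder()
        .uri("http://localhost:8181")
        .withHeader("X-Client-Version", IcebergBuild.fullVersion())
        .build();

ConfigResponse config =
    restClient.get(
        ResourcePaths.config(),
        ConfigResponse.class,
        ImmutableMap.of(),
        ErrorHandlers.defaultErrorHandler());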
request.setHeader(HttpHeaders.CONTENT_TYPE, bodyMimeType); baseHeaders.forEach(request::setHeader); requestHeaders.forEach(request::setHeader); @@ -261,8 +293,7 @@ public static class Builder { private final Map baseHeaders = Maps.newHashMap(); private String uri; - private Builder() { - } + private Builder() {} public Builder uri(String baseUri) { Preconditions.checkNotNull(baseUri, "Invalid uri for http client: null"); diff --git a/core/src/main/java/org/apache/iceberg/rest/HTTPClientFactory.java b/core/src/main/java/org/apache/iceberg/rest/HTTPClientFactory.java index fcf8a240468f..87ca84233e97 100644 --- a/core/src/main/java/org/apache/iceberg/rest/HTTPClientFactory.java +++ b/core/src/main/java/org/apache/iceberg/rest/HTTPClientFactory.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.rest; import java.util.Map; @@ -28,28 +27,27 @@ /** * Takes in the full configuration for the {@link RESTSessionCatalog}, which should already have - * called the server's initial configuration route. - * Using the merged configuration, an instance of {@link RESTClient} is obtained that can be used with the - * RESTCatalog. + * called the server's initial configuration route. Using the merged configuration, an instance of + * {@link RESTClient} is obtained that can be used with the RESTCatalog. */ public class HTTPClientFactory implements Function, RESTClient> { - @VisibleForTesting - static final String CLIENT_VERSION_HEADER = "X-Client-Version"; + @VisibleForTesting static final String CLIENT_VERSION_HEADER = "X-Client-Version"; + @VisibleForTesting static final String CLIENT_GIT_COMMIT_SHORT_HEADER = "X-Client-Git-Commit-Short"; @Override public RESTClient apply(Map properties) { Preconditions.checkArgument(properties != null, "Invalid configuration: null"); - Preconditions.checkArgument(properties.containsKey(CatalogProperties.URI), "REST Catalog server URI is required"); + Preconditions.checkArgument( + properties.containsKey(CatalogProperties.URI), "REST Catalog server URI is required"); String baseURI = properties.get(CatalogProperties.URI).trim(); String clientVersion = IcebergBuild.fullVersion(); String gitCommitShortId = IcebergBuild.gitCommitShortId(); - return HTTPClient - .builder() + return HTTPClient.builder() .withHeader(CLIENT_VERSION_HEADER, clientVersion) .withHeader(CLIENT_GIT_COMMIT_SHORT_HEADER, gitCommitShortId) .uri(baseURI) diff --git a/core/src/main/java/org/apache/iceberg/rest/RESTCatalog.java b/core/src/main/java/org/apache/iceberg/rest/RESTCatalog.java index 63747a596647..2278c7be9c53 100644 --- a/core/src/main/java/org/apache/iceberg/rest/RESTCatalog.java +++ b/core/src/main/java/org/apache/iceberg/rest/RESTCatalog.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.rest; import java.io.Closeable; @@ -40,7 +39,8 @@ import org.apache.iceberg.hadoop.Configurable; import org.apache.iceberg.relocated.com.google.common.base.Preconditions; -public class RESTCatalog implements Catalog, SupportsNamespaces, Configurable, Closeable { +public class RESTCatalog + implements Catalog, SupportsNamespaces, Configurable, Closeable { private final RESTSessionCatalog sessionCatalog; private final Catalog delegate; private final SupportsNamespaces nsDelegate; @@ -53,7 +53,9 @@ public RESTCatalog(Function, RESTClient> clientBuilder) { this(SessionCatalog.SessionContext.createEmpty(), clientBuilder); } - public RESTCatalog(SessionCatalog.SessionContext context, Function, RESTClient> clientBuilder) { + public RESTCatalog( + SessionCatalog.SessionContext context, + Function, RESTClient> clientBuilder) { this.sessionCatalog = new RESTSessionCatalog(clientBuilder); this.delegate = sessionCatalog.asCatalog(context); this.nsDelegate = (SupportsNamespaces) delegate; @@ -100,14 +102,18 @@ public TableBuilder buildTable(TableIdentifier ident, Schema schema) { } @Override - public Table createTable(TableIdentifier ident, Schema schema, PartitionSpec spec, String location, - Map props) { + public Table createTable( + TableIdentifier ident, + Schema schema, + PartitionSpec spec, + String location, + Map props) { return delegate.createTable(ident, schema, spec, location, props); } @Override - public Table createTable(TableIdentifier ident, Schema schema, PartitionSpec spec, - Map props) { + public Table createTable( + TableIdentifier ident, Schema schema, PartitionSpec spec, Map props) { return delegate.createTable(ident, schema, spec, props); } @@ -122,19 +128,24 @@ public Table createTable(TableIdentifier identifier, Schema schema) { } @Override - public Transaction newCreateTableTransaction(TableIdentifier ident, Schema schema, PartitionSpec spec, - String location, Map props) { + public Transaction newCreateTableTransaction( + TableIdentifier ident, + Schema schema, + PartitionSpec spec, + String location, + Map props) { return delegate.newCreateTableTransaction(ident, schema, spec, location, props); } @Override - public Transaction newCreateTableTransaction(TableIdentifier ident, Schema schema, PartitionSpec spec, - Map props) { + public Transaction newCreateTableTransaction( + TableIdentifier ident, Schema schema, PartitionSpec spec, Map props) { return delegate.newCreateTableTransaction(ident, schema, spec, props); } @Override - public Transaction newCreateTableTransaction(TableIdentifier ident, Schema schema, PartitionSpec spec) { + public Transaction newCreateTableTransaction( + TableIdentifier ident, Schema schema, PartitionSpec spec) { return delegate.newCreateTableTransaction(ident, schema, spec); } @@ -144,25 +155,35 @@ public Transaction newCreateTableTransaction(TableIdentifier identifier, Schema } @Override - public Transaction newReplaceTableTransaction(TableIdentifier ident, Schema schema, PartitionSpec spec, - String location, Map props, boolean orCreate) { + public Transaction newReplaceTableTransaction( + TableIdentifier ident, + Schema schema, + PartitionSpec spec, + String location, + Map props, + boolean orCreate) { return delegate.newReplaceTableTransaction(ident, schema, spec, location, props, orCreate); } @Override - public Transaction newReplaceTableTransaction(TableIdentifier ident, Schema schema, PartitionSpec spec, - Map props, boolean orCreate) { + public Transaction newReplaceTableTransaction( + TableIdentifier ident, + 
Schema schema, + PartitionSpec spec, + Map props, + boolean orCreate) { return delegate.newReplaceTableTransaction(ident, schema, spec, props, orCreate); } @Override - public Transaction newReplaceTableTransaction(TableIdentifier ident, Schema schema, PartitionSpec spec, - boolean orCreate) { + public Transaction newReplaceTableTransaction( + TableIdentifier ident, Schema schema, PartitionSpec spec, boolean orCreate) { return delegate.newReplaceTableTransaction(ident, schema, spec, orCreate); } @Override - public Transaction newReplaceTableTransaction(TableIdentifier ident, Schema schema, boolean orCreate) { + public Transaction newReplaceTableTransaction( + TableIdentifier ident, Schema schema, boolean orCreate) { return delegate.newReplaceTableTransaction(ident, schema, orCreate); } @@ -207,7 +228,8 @@ public boolean dropNamespace(Namespace ns) throws NamespaceNotEmptyException { } @Override - public boolean setProperties(Namespace ns, Map props) throws NoSuchNamespaceException { + public boolean setProperties(Namespace ns, Map props) + throws NoSuchNamespaceException { return nsDelegate.setProperties(ns, props); } diff --git a/core/src/main/java/org/apache/iceberg/rest/RESTClient.java b/core/src/main/java/org/apache/iceberg/rest/RESTClient.java index 015ed3d6bd6f..2057f23fc363 100644 --- a/core/src/main/java/org/apache/iceberg/rest/RESTClient.java +++ b/core/src/main/java/org/apache/iceberg/rest/RESTClient.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.rest; import java.io.Closeable; @@ -26,58 +25,91 @@ import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap; import org.apache.iceberg.rest.responses.ErrorResponse; -/** - * Interface for a basic HTTP Client for interfacing with the REST catalog. - */ +/** Interface for a basic HTTP Client for interfacing with the REST catalog. 
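// Illustrative sketch only: wiring RESTCatalog (above) to the HTTPClientFactory from this change.
// HTTPClientFactory satisfies the Function<Map<String, String>, RESTClient> constructor
// parameter; the initialize() call and the property values are assumptions based on the standard
// Catalog API, not something this diff shows directly.
RESTCatalog catalog =
    new RESTCatalog(SessionCatalog.SessionContext.createEmpty(), new HTTPClientFactory());
catalog.initialize("rest", ImmutableMap.of(CatalogProperties.URI, "http://localhost:8181"));
Table table = catalog.loadTable(TableIdentifier.of("db", "events"));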
*/ public interface RESTClient extends Closeable { - default void head(String path, Supplier> headers, Consumer errorHandler) { + default void head( + String path, Supplier> headers, Consumer errorHandler) { head(path, headers.get(), errorHandler); } void head(String path, Map headers, Consumer errorHandler); - default T delete(String path, Class responseType, Supplier> headers, - Consumer errorHandler) { + default T delete( + String path, + Class responseType, + Supplier> headers, + Consumer errorHandler) { return delete(path, responseType, headers.get(), errorHandler); } - T delete(String path, Class responseType, Map headers, - Consumer errorHandler); - - default T get(String path, Class responseType, - Supplier> headers, Consumer errorHandler) { + T delete( + String path, + Class responseType, + Map headers, + Consumer errorHandler); + + default T get( + String path, + Class responseType, + Supplier> headers, + Consumer errorHandler) { return get(path, ImmutableMap.of(), responseType, headers, errorHandler); } - default T get(String path, Class responseType, Map headers, - Consumer errorHandler) { + default T get( + String path, + Class responseType, + Map headers, + Consumer errorHandler) { return get(path, ImmutableMap.of(), responseType, headers, errorHandler); } - default T get(String path, Map queryParams, Class responseType, - Supplier> headers, Consumer errorHandler) { + default T get( + String path, + Map queryParams, + Class responseType, + Supplier> headers, + Consumer errorHandler) { return get(path, queryParams, responseType, headers.get(), errorHandler); } - T get(String path, Map queryParams, Class responseType, - Map headers, Consumer errorHandler); - - default T post(String path, RESTRequest body, Class responseType, - Supplier> headers, Consumer errorHandler) { + T get( + String path, + Map queryParams, + Class responseType, + Map headers, + Consumer errorHandler); + + default T post( + String path, + RESTRequest body, + Class responseType, + Supplier> headers, + Consumer errorHandler) { return post(path, body, responseType, headers.get(), errorHandler); } - T post(String path, RESTRequest body, Class responseType, Map headers, - Consumer errorHandler); - - default T postForm(String path, Map formData, Class responseType, - Supplier> headers, - Consumer errorHandler) { + T post( + String path, + RESTRequest body, + Class responseType, + Map headers, + Consumer errorHandler); + + default T postForm( + String path, + Map formData, + Class responseType, + Supplier> headers, + Consumer errorHandler) { return postForm(path, formData, responseType, headers.get(), errorHandler); } - T postForm(String path, Map formData, Class responseType, - Map headers, Consumer errorHandler); + T postForm( + String path, + Map formData, + Class responseType, + Map headers, + Consumer errorHandler); } - diff --git a/core/src/main/java/org/apache/iceberg/rest/RESTMessage.java b/core/src/main/java/org/apache/iceberg/rest/RESTMessage.java index df569f93a40b..f79247159658 100644 --- a/core/src/main/java/org/apache/iceberg/rest/RESTMessage.java +++ b/core/src/main/java/org/apache/iceberg/rest/RESTMessage.java @@ -16,18 +16,15 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.rest; -/** - * Interface to mark both REST requests and responses. - */ +/** Interface to mark both REST requests and responses. */ public interface RESTMessage { /** * Ensures that a constructed instance of a REST message is valid according to the REST spec. - *

- * This is needed when parsing data that comes from external sources and the object might have + * + *

This is needed when parsing data that comes from external sources and the object might have * been constructed without all the required fields present. */ void validate(); diff --git a/core/src/main/java/org/apache/iceberg/rest/RESTObjectMapper.java b/core/src/main/java/org/apache/iceberg/rest/RESTObjectMapper.java index 778719708f46..f9572b227d10 100644 --- a/core/src/main/java/org/apache/iceberg/rest/RESTObjectMapper.java +++ b/core/src/main/java/org/apache/iceberg/rest/RESTObjectMapper.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.rest; import com.fasterxml.jackson.annotation.JsonAutoDetect; @@ -31,8 +30,7 @@ class RESTObjectMapper { private static final ObjectMapper MAPPER = new ObjectMapper(FACTORY); private static volatile boolean isInitialized = false; - private RESTObjectMapper() { - } + private RESTObjectMapper() {} static ObjectMapper mapper() { if (!isInitialized) { diff --git a/core/src/main/java/org/apache/iceberg/rest/RESTRequest.java b/core/src/main/java/org/apache/iceberg/rest/RESTRequest.java index ebb03fef4b56..ed9f6bd2553a 100644 --- a/core/src/main/java/org/apache/iceberg/rest/RESTRequest.java +++ b/core/src/main/java/org/apache/iceberg/rest/RESTRequest.java @@ -16,11 +16,7 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.rest; -/** - * Interface to mark a REST request. - */ -public interface RESTRequest extends RESTMessage { -} +/** Interface to mark a REST request. */ +public interface RESTRequest extends RESTMessage {} diff --git a/core/src/main/java/org/apache/iceberg/rest/RESTResponse.java b/core/src/main/java/org/apache/iceberg/rest/RESTResponse.java index eb9f731932aa..e0cc47e9adda 100644 --- a/core/src/main/java/org/apache/iceberg/rest/RESTResponse.java +++ b/core/src/main/java/org/apache/iceberg/rest/RESTResponse.java @@ -16,11 +16,7 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.rest; -/** - * Interface to mark a REST response - */ -public interface RESTResponse extends RESTMessage { -} +/** Interface to mark a REST response */ +public interface RESTResponse extends RESTMessage {} diff --git a/core/src/main/java/org/apache/iceberg/rest/RESTSerializers.java b/core/src/main/java/org/apache/iceberg/rest/RESTSerializers.java index a0a3888b2347..6dfb8756f337 100644 --- a/core/src/main/java/org/apache/iceberg/rest/RESTSerializers.java +++ b/core/src/main/java/org/apache/iceberg/rest/RESTSerializers.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.rest; import com.fasterxml.jackson.core.JsonGenerator; @@ -52,8 +51,7 @@ public class RESTSerializers { - private RESTSerializers() { - } + private RESTSerializers() {} public static void registerAll(ObjectMapper mapper) { SimpleModule module = new SimpleModule(); @@ -92,7 +90,8 @@ public UpdateRequirement deserialize(JsonParser p, DeserializationContext ctxt) public static class UpdateRequirementSerializer extends JsonSerializer { @Override - public void serialize(UpdateRequirement value, JsonGenerator gen, SerializerProvider serializers) + public void serialize( + UpdateRequirement value, JsonGenerator gen, SerializerProvider serializers) throws IOException { UpdateRequirementParser.toJson(value, gen); } @@ -100,7 +99,8 @@ public void serialize(UpdateRequirement value, JsonGenerator gen, SerializerProv public static class TableMetadataDeserializer extends JsonDeserializer { @Override - public TableMetadata deserialize(JsonParser p, DeserializationContext context) throws IOException { + public TableMetadata deserialize(JsonParser p, DeserializationContext context) + throws IOException { JsonNode node = p.getCodec().readTree(p); return TableMetadataParser.fromJson(node); } @@ -133,7 +133,8 @@ public void serialize(MetadataUpdate value, JsonGenerator gen, SerializerProvide public static class ErrorResponseDeserializer extends JsonDeserializer { @Override - public ErrorResponse deserialize(JsonParser p, DeserializationContext context) throws IOException { + public ErrorResponse deserialize(JsonParser p, DeserializationContext context) + throws IOException { JsonNode node = p.getCodec().readTree(p); return ErrorResponseParser.fromJson(node); } @@ -141,7 +142,8 @@ public ErrorResponse deserialize(JsonParser p, DeserializationContext context) t public static class ErrorResponseSerializer extends JsonSerializer { @Override - public void serialize(ErrorResponse errorResponse, JsonGenerator gen, SerializerProvider serializers) + public void serialize( + ErrorResponse errorResponse, JsonGenerator gen, SerializerProvider serializers) throws IOException { ErrorResponseParser.toJson(errorResponse, gen); } @@ -175,7 +177,8 @@ public TableIdentifier deserialize(JsonParser p, DeserializationContext context) public static class TableIdentifierSerializer extends JsonSerializer { @Override - public void serialize(TableIdentifier identifier, JsonGenerator gen, SerializerProvider serializers) + public void serialize( + TableIdentifier identifier, JsonGenerator gen, SerializerProvider serializers) throws IOException { TableIdentifierParser.toJson(identifier, gen); } @@ -206,7 +209,8 @@ public void serialize( } } - public static class UnboundPartitionSpecDeserializer extends JsonDeserializer { + public static class UnboundPartitionSpecDeserializer + extends JsonDeserializer { @Override public UnboundPartitionSpec deserialize(JsonParser p, DeserializationContext context) throws IOException { @@ -217,7 +221,8 @@ public UnboundPartitionSpec deserialize(JsonParser p, DeserializationContext con public static class UnboundSortOrderSerializer extends JsonSerializer { @Override - public void serialize(UnboundSortOrder sortOrder, JsonGenerator gen, SerializerProvider serializers) + public void serialize( + UnboundSortOrder sortOrder, JsonGenerator gen, SerializerProvider serializers) throws IOException { SortOrderParser.toJson(sortOrder, gen); } @@ -225,7 +230,8 @@ public void serialize(UnboundSortOrder sortOrder, JsonGenerator gen, SerializerP public static class 
UnboundSortOrderDeserializer extends JsonDeserializer { @Override - public UnboundSortOrder deserialize(JsonParser p, DeserializationContext context) throws IOException { + public UnboundSortOrder deserialize(JsonParser p, DeserializationContext context) + throws IOException { JsonNode jsonNode = p.getCodec().readTree(p); return SortOrderParser.fromJson(jsonNode); } @@ -233,7 +239,8 @@ public UnboundSortOrder deserialize(JsonParser p, DeserializationContext context public static class OAuthTokenResponseSerializer extends JsonSerializer { @Override - public void serialize(OAuthTokenResponse tokenResponse, JsonGenerator gen, SerializerProvider serializers) + public void serialize( + OAuthTokenResponse tokenResponse, JsonGenerator gen, SerializerProvider serializers) throws IOException { OAuth2Util.tokenResponseToJson(tokenResponse, gen); } @@ -241,7 +248,8 @@ public void serialize(OAuthTokenResponse tokenResponse, JsonGenerator gen, Seria public static class OAuthTokenResponseDeserializer extends JsonDeserializer { @Override - public OAuthTokenResponse deserialize(JsonParser p, DeserializationContext context) throws IOException { + public OAuthTokenResponse deserialize(JsonParser p, DeserializationContext context) + throws IOException { JsonNode jsonNode = p.getCodec().readTree(p); return OAuth2Util.tokenResponseFromJson(jsonNode); } diff --git a/core/src/main/java/org/apache/iceberg/rest/RESTSessionCatalog.java b/core/src/main/java/org/apache/iceberg/rest/RESTSessionCatalog.java index 1d466f5d4892..c7892a7e7ffb 100644 --- a/core/src/main/java/org/apache/iceberg/rest/RESTSessionCatalog.java +++ b/core/src/main/java/org/apache/iceberg/rest/RESTSessionCatalog.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.rest; import com.github.benmanes.caffeine.cache.Cache; @@ -82,13 +81,18 @@ import org.slf4j.Logger; import org.slf4j.LoggerFactory; -public class RESTSessionCatalog extends BaseSessionCatalog implements Configurable, Closeable { +public class RESTSessionCatalog extends BaseSessionCatalog + implements Configurable, Closeable { private static final Logger LOG = LoggerFactory.getLogger(RESTSessionCatalog.class); private static final long MAX_REFRESH_WINDOW_MILLIS = 300_000; // 5 minutes private static final long MIN_REFRESH_WAIT_MILLIS = 10; - private static final List TOKEN_PREFERENCE_ORDER = ImmutableList.of( - OAuth2Properties.ID_TOKEN_TYPE, OAuth2Properties.ACCESS_TOKEN_TYPE, OAuth2Properties.JWT_TOKEN_TYPE, - OAuth2Properties.SAML2_TOKEN_TYPE, OAuth2Properties.SAML1_TOKEN_TYPE); + private static final List TOKEN_PREFERENCE_ORDER = + ImmutableList.of( + OAuth2Properties.ID_TOKEN_TYPE, + OAuth2Properties.ACCESS_TOKEN_TYPE, + OAuth2Properties.JWT_TOKEN_TYPE, + OAuth2Properties.SAML2_TOKEN_TYPE, + OAuth2Properties.SAML1_TOKEN_TYPE); private final Function, RESTClient> clientBuilder; private Cache sessions = null; @@ -114,22 +118,28 @@ public RESTSessionCatalog() { public void initialize(String name, Map unresolved) { Preconditions.checkArgument(unresolved != null, "Invalid configuration: null"); // resolve any configuration that is supplied by environment variables - // note that this is only done for local config properties and not for properties from the catalog service + // note that this is only done for local config properties and not for properties from the + // catalog service Map props = EnvironmentUtil.resolveAll(unresolved); - long startTimeMillis = System.currentTimeMillis(); // keep track of the init start time for token refresh + long startTimeMillis = + System.currentTimeMillis(); // keep track of the init start time for token refresh String initToken = props.get(OAuth2Properties.TOKEN); // fetch auth and config to complete initialization ConfigResponse config; OAuthTokenResponse authResponse; try (RESTClient initClient = clientBuilder.apply(props)) { - Map initHeaders = RESTUtil.merge(configHeaders(props), OAuth2Util.authHeaders(initToken)); + Map initHeaders = + RESTUtil.merge(configHeaders(props), OAuth2Util.authHeaders(initToken)); String credential = props.get(OAuth2Properties.CREDENTIAL); if (credential != null && !credential.isEmpty()) { String scope = props.getOrDefault(OAuth2Properties.SCOPE, OAuth2Properties.CATALOG_SCOPE); authResponse = OAuth2Util.fetchToken(initClient, initHeaders, credential, scope); - config = fetchConfig(initClient, RESTUtil.merge(initHeaders, OAuth2Util.authHeaders(authResponse.token()))); + config = + fetchConfig( + initClient, + RESTUtil.merge(initHeaders, OAuth2Util.authHeaders(authResponse.token()))); } else { authResponse = null; config = fetchConfig(initClient, initHeaders); @@ -149,19 +159,25 @@ public void initialize(String name, Map unresolved) { } this.sessions = newSessionCache(mergedProps); - this.refreshAuthByDefault = PropertyUtil.propertyAsBoolean(mergedProps, - CatalogProperties.AUTH_DEFAULT_REFRESH_ENABLED, CatalogProperties.AUTH_DEFAULT_REFRESH_ENABLED_DEFAULT); + this.refreshAuthByDefault = + PropertyUtil.propertyAsBoolean( + mergedProps, + CatalogProperties.AUTH_DEFAULT_REFRESH_ENABLED, + CatalogProperties.AUTH_DEFAULT_REFRESH_ENABLED_DEFAULT); this.client = clientBuilder.apply(mergedProps); this.paths = ResourcePaths.forCatalogProperties(mergedProps); String ioImpl = 
mergedProps.get(CatalogProperties.FILE_IO_IMPL); - this.io = CatalogUtil.loadFileIO(ioImpl != null ? ioImpl : ResolvingFileIO.class.getName(), mergedProps, conf); + this.io = + CatalogUtil.loadFileIO( + ioImpl != null ? ioImpl : ResolvingFileIO.class.getName(), mergedProps, conf); super.initialize(name, mergedProps); } private AuthSession session(SessionContext context) { - return sessions.get(context.sessionId(), + return sessions.get( + context.sessionId(), id -> newSession(context.credentials(), context.properties(), catalogAuth)); } @@ -178,8 +194,12 @@ public void setConf(Configuration newConf) { public List listTables(SessionContext context, Namespace ns) { checkNamespaceIsValid(ns); - ListTablesResponse response = client.get( - paths.tables(ns), ListTablesResponse.class, headers(context), ErrorHandlers.namespaceErrorHandler()); + ListTablesResponse response = + client.get( + paths.tables(ns), + ListTablesResponse.class, + headers(context), + ErrorHandlers.namespaceErrorHandler()); return response.identifiers(); } @@ -188,7 +208,8 @@ public boolean dropTable(SessionContext context, TableIdentifier identifier) { checkIdentifierIsValid(identifier); try { - client.delete(paths.table(identifier), null, headers(context), ErrorHandlers.tableErrorHandler()); + client.delete( + paths.table(identifier), null, headers(context), ErrorHandlers.tableErrorHandler()); return true; } catch (NoSuchTableException e) { return false; @@ -205,10 +226,8 @@ public void renameTable(SessionContext context, TableIdentifier from, TableIdent checkIdentifierIsValid(from); checkIdentifierIsValid(to); - RenameTableRequest request = RenameTableRequest.builder() - .withSource(from) - .withDestination(to) - .build(); + RenameTableRequest request = + RenameTableRequest.builder().withSource(from).withDestination(to).build(); // for now, ignore the response because there is no way to return it client.post(paths.rename(), request, null, headers(context), ErrorHandlers.tableErrorHandler()); @@ -216,7 +235,10 @@ public void renameTable(SessionContext context, TableIdentifier from, TableIdent private LoadTableResponse loadInternal(SessionContext context, TableIdentifier identifier) { return client.get( - paths.table(identifier), LoadTableResponse.class, headers(context), ErrorHandlers.tableErrorHandler()); + paths.table(identifier), + LoadTableResponse.class, + headers(context), + ErrorHandlers.tableErrorHandler()); } @Override @@ -250,8 +272,13 @@ public Table loadTable(SessionContext context, TableIdentifier identifier) { } AuthSession session = tableSession(response.config(), session(context)); - RESTTableOperations ops = new RESTTableOperations( - client, paths.table(loadedIdent), session::headers, tableFileIO(response.config()), response.tableMetadata()); + RESTTableOperations ops = + new RESTTableOperations( + client, + paths.table(loadedIdent), + session::headers, + tableFileIO(response.config()), + response.tableMetadata()); BaseTable table = new BaseTable(ops, fullTableName(loadedIdent)); if (metadataType != null) { @@ -262,29 +289,32 @@ public Table loadTable(SessionContext context, TableIdentifier identifier) { } @Override - public Catalog.TableBuilder buildTable(SessionContext context, TableIdentifier identifier, Schema schema) { + public Catalog.TableBuilder buildTable( + SessionContext context, TableIdentifier identifier, Schema schema) { return new Builder(identifier, schema, context); } @Override - public void invalidateTable(SessionContext context, TableIdentifier ident) { - } + public void 
invalidateTable(SessionContext context, TableIdentifier ident) {} @Override - public Table registerTable(SessionContext context, TableIdentifier ident, String metadataFileLocation) { + public Table registerTable( + SessionContext context, TableIdentifier ident, String metadataFileLocation) { throw new UnsupportedOperationException("Register table is not supported"); } @Override - public void createNamespace(SessionContext context, Namespace namespace, Map metadata) { - CreateNamespaceRequest request = CreateNamespaceRequest.builder() - .withNamespace(namespace) - .setProperties(metadata) - .build(); + public void createNamespace( + SessionContext context, Namespace namespace, Map metadata) { + CreateNamespaceRequest request = + CreateNamespaceRequest.builder().withNamespace(namespace).setProperties(metadata).build(); // for now, ignore the response because there is no way to return it client.post( - paths.namespaces(), request, CreateNamespaceResponse.class, headers(context), + paths.namespaces(), + request, + CreateNamespaceResponse.class, + headers(context), ErrorHandlers.namespaceErrorHandler()); } @@ -298,9 +328,13 @@ public List listNamespaces(SessionContext context, Namespace namespac queryParams = ImmutableMap.of("parent", RESTUtil.NAMESPACE_JOINER.join(namespace.levels())); } - ListNamespacesResponse response = client.get( - paths.namespaces(), queryParams, ListNamespacesResponse.class, headers(context), - ErrorHandlers.namespaceErrorHandler()); + ListNamespacesResponse response = + client.get( + paths.namespaces(), + queryParams, + ListNamespacesResponse.class, + headers(context), + ErrorHandlers.namespaceErrorHandler()); return response.namespaces(); } @@ -309,8 +343,12 @@ public Map loadNamespaceMetadata(SessionContext context, Namespa checkNamespaceIsValid(ns); // TODO: rename to LoadNamespaceResponse? 
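// Illustrative sketch only: how the namespace routes above are reached through the public
// SupportsNamespaces API that RESTCatalog implements earlier in this change. The namespace name
// and property values are placeholders.
SupportsNamespaces nsCatalog = catalog; // e.g. the RESTCatalog from the earlier sketch
nsCatalog.createNamespace(Namespace.of("prod"), ImmutableMap.of("owner", "data-eng"));
Map<String, String> metadata = nsCatalog.loadNamespaceMetadata(Namespace.of("prod"));
nsCatalog.setProperties(Namespace.of("prod"), ImmutableMap.of("comment", "production tables"));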
- GetNamespaceResponse response = client - .get(paths.namespace(ns), GetNamespaceResponse.class, headers(context), ErrorHandlers.namespaceErrorHandler()); + GetNamespaceResponse response = + client.get( + paths.namespace(ns), + GetNamespaceResponse.class, + headers(context), + ErrorHandlers.namespaceErrorHandler()); return response.properties(); } @@ -319,7 +357,8 @@ public boolean dropNamespace(SessionContext context, Namespace ns) { checkNamespaceIsValid(ns); try { - client.delete(paths.namespace(ns), null, headers(context), ErrorHandlers.namespaceErrorHandler()); + client.delete( + paths.namespace(ns), null, headers(context), ErrorHandlers.namespaceErrorHandler()); return true; } catch (NoSuchNamespaceException e) { return false; @@ -327,18 +366,20 @@ public boolean dropNamespace(SessionContext context, Namespace ns) { } @Override - public boolean updateNamespaceMetadata(SessionContext context, Namespace ns, - Map updates, Set removals) { + public boolean updateNamespaceMetadata( + SessionContext context, Namespace ns, Map updates, Set removals) { checkNamespaceIsValid(ns); - UpdateNamespacePropertiesRequest request = UpdateNamespacePropertiesRequest.builder() - .updateAll(updates) - .removeAll(removals) - .build(); + UpdateNamespacePropertiesRequest request = + UpdateNamespacePropertiesRequest.builder().updateAll(updates).removeAll(removals).build(); - UpdateNamespacePropertiesResponse response = client.post( - paths.namespaceProperties(ns), request, UpdateNamespacePropertiesResponse.class, headers(context), - ErrorHandlers.namespaceErrorHandler()); + UpdateNamespacePropertiesResponse response = + client.post( + paths.namespaceProperties(ns), + request, + UpdateNamespacePropertiesResponse.class, + headers(context), + ErrorHandlers.namespaceErrorHandler()); return !response.updated().isEmpty(); } @@ -369,15 +410,18 @@ private void scheduleTokenRefresh( // how much time to actually wait long timeToWait = Math.max(waitIntervalMillis - elapsedMillis, MIN_REFRESH_WAIT_MILLIS); - tokenRefreshExecutor().schedule( - () -> { - long refreshStartTime = System.currentTimeMillis(); - Pair expiration = session.refresh(client); - if (expiration != null) { - scheduleTokenRefresh(session, refreshStartTime, expiration.first(), expiration.second()); - } - }, - timeToWait, TimeUnit.MILLISECONDS); + tokenRefreshExecutor() + .schedule( + () -> { + long refreshStartTime = System.currentTimeMillis(); + Pair expiration = session.refresh(client); + if (expiration != null) { + scheduleTokenRefresh( + session, refreshStartTime, expiration.first(), expiration.second()); + } + }, + timeToWait, + TimeUnit.MILLISECONDS); } @Override @@ -395,11 +439,12 @@ private void shutdownRefreshExecutor() { this.refreshExecutor = null; List tasks = service.shutdownNow(); - tasks.forEach(task -> { - if (task instanceof Future) { - ((Future) task).cancel(true); - } - }); + tasks.forEach( + task -> { + if (task instanceof Future) { + ((Future) task).cancel(true); + } + }); try { if (service.awaitTermination(1, TimeUnit.MINUTES)) { @@ -463,22 +508,32 @@ public Builder withProperty(String key, String value) { @Override public Table create() { - CreateTableRequest request = CreateTableRequest.builder() - .withName(ident.name()) - .withSchema(schema) - .withPartitionSpec(spec) - .withWriteOrder(writeOrder) - .withLocation(location) - .setProperties(propertiesBuilder.build()) - .build(); - - LoadTableResponse response = client.post( - paths.tables(ident.namespace()), request, LoadTableResponse.class, headers(context), - 
ErrorHandlers.tableErrorHandler()); + CreateTableRequest request = + CreateTableRequest.builder() + .withName(ident.name()) + .withSchema(schema) + .withPartitionSpec(spec) + .withWriteOrder(writeOrder) + .withLocation(location) + .setProperties(propertiesBuilder.build()) + .build(); + + LoadTableResponse response = + client.post( + paths.tables(ident.namespace()), + request, + LoadTableResponse.class, + headers(context), + ErrorHandlers.tableErrorHandler()); AuthSession session = tableSession(response.config(), session(context)); - RESTTableOperations ops = new RESTTableOperations( - client, paths.table(ident), session::headers, tableFileIO(response.config()), response.tableMetadata()); + RESTTableOperations ops = + new RESTTableOperations( + client, + paths.table(ident), + session::headers, + tableFileIO(response.config()), + response.tableMetadata()); return new BaseTable(ops, fullTableName(ident)); } @@ -491,9 +546,15 @@ public Transaction createTransaction() { AuthSession session = tableSession(response.config(), session(context)); TableMetadata meta = response.tableMetadata(); - RESTTableOperations ops = new RESTTableOperations( - client, paths.table(ident), session::headers, tableFileIO(response.config()), - RESTTableOperations.UpdateType.CREATE, createChanges(meta), meta); + RESTTableOperations ops = + new RESTTableOperations( + client, + paths.table(ident), + session::headers, + tableFileIO(response.config()), + RESTTableOperations.UpdateType.CREATE, + createChanges(meta), + meta); return Transactions.createTableTransaction(fullName, ops, meta); } @@ -507,33 +568,43 @@ public Transaction replaceTransaction() { TableMetadata base = response.tableMetadata(); Map tableProperties = propertiesBuilder.build(); - TableMetadata replacement = base.buildReplacement( - schema, - spec != null ? spec : PartitionSpec.unpartitioned(), - writeOrder != null ? writeOrder : SortOrder.unsorted(), - location != null ? location : base.location(), - tableProperties); + TableMetadata replacement = + base.buildReplacement( + schema, + spec != null ? spec : PartitionSpec.unpartitioned(), + writeOrder != null ? writeOrder : SortOrder.unsorted(), + location != null ? 
location : base.location(), + tableProperties); ImmutableList.Builder changes = ImmutableList.builder(); - if (replacement.changes().stream().noneMatch(MetadataUpdate.SetCurrentSchema.class::isInstance)) { + if (replacement.changes().stream() + .noneMatch(MetadataUpdate.SetCurrentSchema.class::isInstance)) { // ensure there is a change to set the current schema changes.add(new MetadataUpdate.SetCurrentSchema(replacement.currentSchemaId())); } - if (replacement.changes().stream().noneMatch(MetadataUpdate.SetDefaultPartitionSpec.class::isInstance)) { + if (replacement.changes().stream() + .noneMatch(MetadataUpdate.SetDefaultPartitionSpec.class::isInstance)) { // ensure there is a change to set the default spec changes.add(new MetadataUpdate.SetDefaultPartitionSpec(replacement.defaultSpecId())); } - if (replacement.changes().stream().noneMatch(MetadataUpdate.SetDefaultSortOrder.class::isInstance)) { + if (replacement.changes().stream() + .noneMatch(MetadataUpdate.SetDefaultSortOrder.class::isInstance)) { // ensure there is a change to set the default sort order changes.add(new MetadataUpdate.SetDefaultSortOrder(replacement.defaultSortOrderId())); } - RESTTableOperations ops = new RESTTableOperations( - client, paths.table(ident), session::headers, tableFileIO(response.config()), - RESTTableOperations.UpdateType.REPLACE, changes.build(), base); + RESTTableOperations ops = + new RESTTableOperations( + client, + paths.table(ident), + session::headers, + tableFileIO(response.config()), + RESTTableOperations.UpdateType.REPLACE, + changes.build(), + base); return Transactions.replaceTableTransaction(fullName, ops, replacement); } @@ -541,8 +612,10 @@ public Transaction replaceTransaction() { @Override public Transaction createOrReplaceTransaction() { // return a create or a replace transaction, depending on whether the table exists - // deciding whether to create or replace can't be determined on the service because schema field IDs are assigned - // at this point and then used in data and metadata files. because create and replace will assign different + // deciding whether to create or replace can't be determined on the service because schema + // field IDs are assigned + // at this point and then used in data and metadata files. because create and replace will + // assign different // field IDs, they must be determined before any writes occur try { return replaceTransaction(); @@ -554,18 +627,22 @@ public Transaction createOrReplaceTransaction() { private LoadTableResponse stageCreate() { Map tableProperties = propertiesBuilder.build(); - CreateTableRequest request = CreateTableRequest.builder() - .stageCreate() - .withName(ident.name()) - .withSchema(schema) - .withPartitionSpec(spec) - .withWriteOrder(writeOrder) - .withLocation(location) - .setProperties(tableProperties) - .build(); + CreateTableRequest request = + CreateTableRequest.builder() + .stageCreate() + .withName(ident.name()) + .withSchema(schema) + .withPartitionSpec(spec) + .withWriteOrder(writeOrder) + .withLocation(location) + .setProperties(tableProperties) + .build(); return client.post( - paths.tables(ident.namespace()), request, LoadTableResponse.class, headers(context), + paths.tables(ident.namespace()), + request, + LoadTableResponse.class, + headers(context), ErrorHandlers.tableErrorHandler()); } } @@ -620,7 +697,8 @@ private FileIO tableFileIO(Map config) { Map fullConf = RESTUtil.merge(properties(), config); String ioImpl = fullConf.get(CatalogProperties.FILE_IO_IMPL); - return CatalogUtil.loadFileIO(ioImpl != null ? 
ioImpl : ResolvingFileIO.class.getName(), fullConf, this.conf); + return CatalogUtil.loadFileIO( + ioImpl != null ? ioImpl : ResolvingFileIO.class.getName(), fullConf, this.conf); } private AuthSession tableSession(Map tableConf, AuthSession parent) { @@ -628,13 +706,18 @@ private AuthSession tableSession(Map tableConf, AuthSession pare } private static ConfigResponse fetchConfig(RESTClient client, Map headers) { - ConfigResponse configResponse = client - .get(ResourcePaths.config(), ConfigResponse.class, headers, ErrorHandlers.defaultErrorHandler()); + ConfigResponse configResponse = + client.get( + ResourcePaths.config(), + ConfigResponse.class, + headers, + ErrorHandlers.defaultErrorHandler()); configResponse.validate(); return configResponse; } - private AuthSession newSession(Map credentials, Map properties, AuthSession parent) { + private AuthSession newSession( + Map credentials, Map properties, AuthSession parent) { if (credentials != null) { // use the bearer token without exchanging if (credentials.containsKey(OAuth2Properties.TOKEN)) { @@ -658,29 +741,40 @@ private AuthSession newSession(Map credentials, Map properties) { if (refreshAuthByDefault || properties.containsKey(OAuth2Properties.TOKEN_EXPIRES_IN_MS)) { return PropertyUtil.propertyAsLong( - properties, OAuth2Properties.TOKEN_EXPIRES_IN_MS, OAuth2Properties.TOKEN_EXPIRES_IN_MS_DEFAULT); + properties, + OAuth2Properties.TOKEN_EXPIRES_IN_MS, + OAuth2Properties.TOKEN_EXPIRES_IN_MS_DEFAULT); } else { return null; } @@ -713,13 +809,16 @@ private static Map configHeaders(Map properties) } private static Cache newSessionCache(Map properties) { - long expirationIntervalMs = PropertyUtil.propertyAsLong(properties, - CatalogProperties.AUTH_SESSION_TIMEOUT_MS, - CatalogProperties.AUTH_SESSION_TIMEOUT_MS_DEFAULT); + long expirationIntervalMs = + PropertyUtil.propertyAsLong( + properties, + CatalogProperties.AUTH_SESSION_TIMEOUT_MS, + CatalogProperties.AUTH_SESSION_TIMEOUT_MS_DEFAULT); return Caffeine.newBuilder() .expireAfterAccess(Duration.ofMillis(expirationIntervalMs)) - .removalListener((RemovalListener) (id, auth, cause) -> auth.stopRefreshing()) + .removalListener( + (RemovalListener) (id, auth, cause) -> auth.stopRefreshing()) .build(); } } diff --git a/core/src/main/java/org/apache/iceberg/rest/RESTTableOperations.java b/core/src/main/java/org/apache/iceberg/rest/RESTTableOperations.java index bdb6a773a117..639856eb2dca 100644 --- a/core/src/main/java/org/apache/iceberg/rest/RESTTableOperations.java +++ b/core/src/main/java/org/apache/iceberg/rest/RESTTableOperations.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.rest; import java.util.List; @@ -58,13 +57,22 @@ enum UpdateType { private TableMetadata current; RESTTableOperations( - RESTClient client, String path, Supplier> headers, FileIO io, TableMetadata current) { + RESTClient client, + String path, + Supplier> headers, + FileIO io, + TableMetadata current) { this(client, path, headers, io, UpdateType.SIMPLE, Lists.newArrayList(), current); } RESTTableOperations( - RESTClient client, String path, Supplier> headers, FileIO io, UpdateType updateType, - List createChanges, TableMetadata current) { + RESTClient client, + String path, + Supplier> headers, + FileIO io, + UpdateType updateType, + List createChanges, + TableMetadata current) { this.client = client; this.path = path; this.headers = headers; @@ -86,7 +94,8 @@ public TableMetadata current() { @Override public TableMetadata refresh() { - return updateCurrentMetadata(client.get(path, LoadTableResponse.class, headers, ErrorHandlers.tableErrorHandler())); + return updateCurrentMetadata( + client.get(path, LoadTableResponse.class, headers, ErrorHandlers.tableErrorHandler())); } @Override @@ -96,7 +105,8 @@ public void commit(TableMetadata base, TableMetadata metadata) { Consumer errorHandler; switch (updateType) { case CREATE: - Preconditions.checkState(base == null, "Invalid base metadata for create transaction, expected null: %s", base); + Preconditions.checkState( + base == null, "Invalid base metadata for create transaction, expected null: %s", base); requestBuilder = UpdateTableRequest.builderForCreate(); baseChanges = createChanges; errorHandler = ErrorHandlers.tableErrorHandler(); // throws NoSuchTableException @@ -118,16 +128,19 @@ public void commit(TableMetadata base, TableMetadata metadata) { break; default: - throw new UnsupportedOperationException(String.format("Update type %s is not supported", updateType)); + throw new UnsupportedOperationException( + String.format("Update type %s is not supported", updateType)); } baseChanges.forEach(requestBuilder::update); metadata.changes().forEach(requestBuilder::update); UpdateTableRequest request = requestBuilder.build(); - // the error handler will throw necessary exceptions like CommitFailedException and UnknownCommitStateException + // the error handler will throw necessary exceptions like CommitFailedException and + // UnknownCommitStateException // TODO: ensure that the HTTP client lib passes HTTP client errors to the error handler - LoadTableResponse response = client.post(path, request, LoadTableResponse.class, headers, errorHandler); + LoadTableResponse response = + client.post(path, request, LoadTableResponse.class, headers, errorHandler); // all future commits should be simple commits this.updateType = UpdateType.SIMPLE; @@ -141,9 +154,11 @@ public FileIO io() { } private TableMetadata updateCurrentMetadata(LoadTableResponse response) { - // LoadTableResponse is used to deserialize the response, but config is not allowed by the REST spec so it can be + // LoadTableResponse is used to deserialize the response, but config is not allowed by the REST + // spec so it can be // safely ignored. there is no requirement to update config on refresh or commit. 
- if (current == null || !Objects.equals(current.metadataFileLocation(), response.metadataLocation())) { + if (current == null + || !Objects.equals(current.metadataFileLocation(), response.metadataLocation())) { this.current = response.tableMetadata(); } @@ -151,8 +166,7 @@ private TableMetadata updateCurrentMetadata(LoadTableResponse response) { } private static String metadataFileLocation(TableMetadata metadata, String filename) { - String metadataLocation = metadata.properties() - .get(TableProperties.WRITE_METADATA_LOCATION); + String metadataLocation = metadata.properties().get(TableProperties.WRITE_METADATA_LOCATION); if (metadataLocation != null) { return String.format("%s/%s", metadataLocation, filename); @@ -181,7 +195,8 @@ public TableMetadata current() { @Override public TableMetadata refresh() { - throw new UnsupportedOperationException("Cannot call refresh on temporary table operations"); + throw new UnsupportedOperationException( + "Cannot call refresh on temporary table operations"); } @Override @@ -196,7 +211,8 @@ public String metadataFileLocation(String fileName) { @Override public LocationProvider locationProvider() { - return LocationProviders.locationsFor(uncommittedMetadata.location(), uncommittedMetadata.properties()); + return LocationProviders.locationsFor( + uncommittedMetadata.location(), uncommittedMetadata.properties()); } @Override diff --git a/core/src/main/java/org/apache/iceberg/rest/RESTUtil.java b/core/src/main/java/org/apache/iceberg/rest/RESTUtil.java index ee6a3c51de0b..55e2a8276d6e 100644 --- a/core/src/main/java/org/apache/iceberg/rest/RESTUtil.java +++ b/core/src/main/java/org/apache/iceberg/rest/RESTUtil.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.rest; import java.io.UncheckedIOException; @@ -39,10 +38,10 @@ public class RESTUtil { public static final Splitter NAMESPACE_SPLITTER = Splitter.on(NAMESPACE_SEPARATOR); private static final String NAMESPACE_ESCAPED_SEPARATOR = "%1F"; private static final Joiner NAMESPACE_ESCAPED_JOINER = Joiner.on(NAMESPACE_ESCAPED_SEPARATOR); - private static final Splitter NAMESPACE_ESCAPED_SPLITTER = Splitter.on(NAMESPACE_ESCAPED_SEPARATOR); + private static final Splitter NAMESPACE_ESCAPED_SPLITTER = + Splitter.on(NAMESPACE_ESCAPED_SEPARATOR); - private RESTUtil() { - } + private RESTUtil() {} public static String stripTrailingSlash(String path) { if (path == null) { @@ -66,11 +65,12 @@ public static String stripTrailingSlash(String path) { public static Map merge(Map target, Map updates) { ImmutableMap.Builder builder = ImmutableMap.builder(); - target.forEach((key, value) -> { - if (!updates.containsKey(key)) { - builder.put(key, value); - } - }); + target.forEach( + (key, value) -> { + if (!updates.containsKey(key)) { + builder.put(key, value); + } + }); updates.forEach(builder::put); @@ -78,23 +78,25 @@ public static Map merge(Map target, Map - * Any entries whose keys don't begin with the prefix are not returned. - *

- * This can be used to get a subset of the configuration related to the REST catalog, such as all - * properties from a prefix of `spark.sql.catalog.my_catalog.rest.` to get REST catalog specific properties - * from the spark configuration. + * Takes in a map, and returns a copy filtered on the entries with keys beginning with the + * designated prefix. The keys are returned with the prefix removed. + * + *

Any entries whose keys don't begin with the prefix are not returned. + * + *

This can be used to get a subset of the configuration related to the REST catalog, such as + * all properties from a prefix of `spark.sql.catalog.my_catalog.rest.` to get REST catalog + * specific properties from the spark configuration. */ - public static Map extractPrefixMap(Map properties, String prefix) { + public static Map extractPrefixMap( + Map properties, String prefix) { Preconditions.checkNotNull(properties, "Invalid properties map: null"); Map result = Maps.newHashMap(); - properties.forEach((key, value) -> { - if (key != null && key.startsWith(prefix)) { - result.put(key.substring(prefix.length()), value); - } - }); + properties.forEach( + (key, value) -> { + if (key != null && key.startsWith(prefix)) { + result.put(key.substring(prefix.length()), value); + } + }); return result; } @@ -103,24 +105,24 @@ public static Map extractPrefixMap(Map propertie /** * Encodes a map of form data as application/x-www-form-urlencoded. - *

- * This encodes the form with pairs separated by & and keys separated from values by =. + * + *

This encodes the form with pairs separated by & and keys separated from values by =. * * @param formData a map of form data * @return a String of encoded form data */ public static String encodeFormData(Map formData) { ImmutableMap.Builder builder = ImmutableMap.builder(); - formData.forEach((key, value) -> builder.put( - encodeString(String.valueOf(key)), - encodeString(String.valueOf(value)))); + formData.forEach( + (key, value) -> + builder.put(encodeString(String.valueOf(key)), encodeString(String.valueOf(value)))); return FORM_JOINER.join(builder.build()); } /** * Encodes a string using URL encoding - *

- * {@link #decodeString(String)} should be used to decode. + * + *

{@link #decodeString(String)} should be used to decode. * * @param toEncode string to encode * @return UTF-8 encoded string, suitable for use as a URL parameter @@ -137,8 +139,8 @@ public static String encodeString(String toEncode) { /** * Decodes a URL-encoded string. - *

- * See also {@link #encodeString(String)} for URL encoding. + * + *

See also {@link #encodeString(String)} for URL encoding. * * @param encoded a string to decode * @return a decoded string @@ -155,11 +157,11 @@ public static String decodeString(String encoded) { /** * Returns a String representation of a namespace that is suitable for use in a URL / URI. - *

- * This function needs to be called when a namespace is used as a path variable (or query parameter etc.), - * to format the namespace per the spec. - *

- * {@link #decodeNamespace} should be used to parse the namespace from a URL parameter. + * + *

This function needs to be called when a namespace is used as a path variable (or query + * parameter etc.), to format the namespace per the spec. + * + *

{@link #decodeNamespace} should be used to parse the namespace from a URL parameter. * * @param ns namespace to encode * @return UTF-8 encoded string representing the namespace, suitable for use as a URL parameter @@ -177,10 +179,10 @@ public static String encodeNamespace(Namespace ns) { } /** - * Takes in a string representation of a namespace as used for a URL parameter - * and returns the corresponding namespace. - *

- * See also {@link #encodeNamespace} for generating correctly formatted URLs. + * Takes in a string representation of a namespace as used for a URL parameter and returns the + * corresponding namespace. + * + *

See also {@link #encodeNamespace} for generating correctly formatted URLs. * * @param encodedNs a namespace to decode * @return a namespace @@ -196,5 +198,4 @@ public static Namespace decodeNamespace(String encodedNs) { return Namespace.of(levels); } - } diff --git a/core/src/main/java/org/apache/iceberg/rest/ResourcePaths.java b/core/src/main/java/org/apache/iceberg/rest/ResourcePaths.java index 1f1d13efac6c..c36af2469bf9 100644 --- a/core/src/main/java/org/apache/iceberg/rest/ResourcePaths.java +++ b/core/src/main/java/org/apache/iceberg/rest/ResourcePaths.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.rest; import java.util.Map; @@ -64,7 +63,11 @@ public String tables(Namespace ns) { public String table(TableIdentifier ident) { return SLASH.join( - "v1", prefix, "namespaces", RESTUtil.encodeNamespace(ident.namespace()), "tables", + "v1", + prefix, + "namespaces", + RESTUtil.encodeNamespace(ident.namespace()), + "tables", RESTUtil.encodeString(ident.name())); } diff --git a/core/src/main/java/org/apache/iceberg/rest/auth/OAuth2Properties.java b/core/src/main/java/org/apache/iceberg/rest/auth/OAuth2Properties.java index dd3a2e78c4df..edee9a6211b6 100644 --- a/core/src/main/java/org/apache/iceberg/rest/auth/OAuth2Properties.java +++ b/core/src/main/java/org/apache/iceberg/rest/auth/OAuth2Properties.java @@ -16,38 +16,29 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.rest.auth; public class OAuth2Properties { - private OAuth2Properties() { - } + private OAuth2Properties() {} - /** - * A Bearer token which will be used for interaction with the server. - */ + /** A Bearer token which will be used for interaction with the server. */ public static final String TOKEN = "token"; - /** - * A credential to exchange for a token in the OAuth2 client credentials flow. - */ + /** A credential to exchange for a token in the OAuth2 client credentials flow. */ public static final String CREDENTIAL = "credential"; /** - * Interval in milliseconds to wait before attempting to exchange the configured catalog Bearer token. - * By default, token exchange will be attempted after 1 hour. + * Interval in milliseconds to wait before attempting to exchange the configured catalog Bearer + * token. By default, token exchange will be attempted after 1 hour. */ public static final String TOKEN_EXPIRES_IN_MS = "token-expires-in-ms"; + public static final long TOKEN_EXPIRES_IN_MS_DEFAULT = 3_600_000; // 1 hour - /** - * Additional scope for OAuth2. - */ + /** Additional scope for OAuth2. */ public static final String SCOPE = "scope"; - /** - * Scope for OAuth2 flows. - */ + /** Scope for OAuth2 flows. */ public static final String CATALOG_SCOPE = "catalog"; // token type constants diff --git a/core/src/main/java/org/apache/iceberg/rest/auth/OAuth2Util.java b/core/src/main/java/org/apache/iceberg/rest/auth/OAuth2Util.java index 8906dcea8bb3..038037e2ce96 100644 --- a/core/src/main/java/org/apache/iceberg/rest/auth/OAuth2Util.java +++ b/core/src/main/java/org/apache/iceberg/rest/auth/OAuth2Util.java @@ -16,9 +16,12 @@ * specific language governing permissions and limitations * under the License. 
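The RESTUtil helpers reformatted above are mostly used when wiring a REST catalog from engine configuration and when building URL paths. A minimal sketch of that usage follows; the catalog name `my_catalog` and the endpoint value are made up for illustration, and only extractPrefixMap, encodeNamespace, and decodeNamespace come from the class itself.

import java.util.Map;
import org.apache.iceberg.catalog.Namespace;
import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap;
import org.apache.iceberg.rest.RESTUtil;

public class RESTUtilExample {
  public static void main(String[] args) {
    // Pull catalog-specific keys out of a larger engine configuration; the prefix is stripped.
    Map<String, String> sparkConf =
        ImmutableMap.of(
            "spark.sql.catalog.my_catalog.rest.uri", "https://rest-catalog.example.com",
            "spark.some.other.setting", "ignored");
    Map<String, String> restConf =
        RESTUtil.extractPrefixMap(sparkConf, "spark.sql.catalog.my_catalog.rest.");
    System.out.println(restConf); // {uri=https://rest-catalog.example.com}

    // Round-trip a multi-level namespace through its URL-safe form (levels joined by %1F).
    Namespace ns = Namespace.of("accounting", "tax");
    String encoded = RESTUtil.encodeNamespace(ns);
    System.out.println(encoded); // accounting%1Ftax
    System.out.println(RESTUtil.decodeNamespace(encoded)); // accounting.tax
  }
}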
*/ - package org.apache.iceberg.rest.auth; +import static org.apache.iceberg.TableProperties.COMMIT_MAX_RETRY_WAIT_MS_DEFAULT; +import static org.apache.iceberg.TableProperties.COMMIT_MIN_RETRY_WAIT_MS_DEFAULT; +import static org.apache.iceberg.TableProperties.COMMIT_TOTAL_RETRY_TIME_MS_DEFAULT; + import com.fasterxml.jackson.core.JsonGenerator; import com.fasterxml.jackson.databind.JsonNode; import java.io.IOException; @@ -47,13 +50,8 @@ import org.slf4j.Logger; import org.slf4j.LoggerFactory; -import static org.apache.iceberg.TableProperties.COMMIT_MAX_RETRY_WAIT_MS_DEFAULT; -import static org.apache.iceberg.TableProperties.COMMIT_MIN_RETRY_WAIT_MS_DEFAULT; -import static org.apache.iceberg.TableProperties.COMMIT_TOTAL_RETRY_TIME_MS_DEFAULT; - public class OAuth2Util { - private OAuth2Util() { - } + private OAuth2Util() {} private static final Logger LOG = LoggerFactory.getLogger(OAuth2Util.class); @@ -81,9 +79,14 @@ private OAuth2Util() { private static final String SUBJECT_TOKEN_TYPE = "subject_token_type"; private static final String ACTOR_TOKEN = "actor_token"; private static final String ACTOR_TOKEN_TYPE = "actor_token_type"; - private static final Set VALID_TOKEN_TYPES = Sets.newHashSet( - OAuth2Properties.ACCESS_TOKEN_TYPE, OAuth2Properties.REFRESH_TOKEN_TYPE, OAuth2Properties.ID_TOKEN_TYPE, - OAuth2Properties.SAML1_TOKEN_TYPE, OAuth2Properties.SAML2_TOKEN_TYPE, OAuth2Properties.JWT_TOKEN_TYPE); + private static final Set VALID_TOKEN_TYPES = + Sets.newHashSet( + OAuth2Properties.ACCESS_TOKEN_TYPE, + OAuth2Properties.REFRESH_TOKEN_TYPE, + OAuth2Properties.ID_TOKEN_TYPE, + OAuth2Properties.SAML1_TOKEN_TYPE, + OAuth2Properties.SAML2_TOKEN_TYPE, + OAuth2Properties.JWT_TOKEN_TYPE); // response serialization private static final String ACCESS_TOKEN = "access_token"; @@ -112,57 +115,93 @@ public static String toScope(Iterable scopes) { return SCOPE_JOINER.join(scopes); } - private static OAuthTokenResponse refreshToken(RESTClient client, Map headers, - String subjectToken, String subjectTokenType, String scope) { - Map request = tokenExchangeRequest( - subjectToken, subjectTokenType, - scope != null ? ImmutableList.of(scope) : ImmutableList.of()); - - OAuthTokenResponse response = client.postForm( - ResourcePaths.tokens(), request, OAuthTokenResponse.class, headers, ErrorHandlers.defaultErrorHandler()); + private static OAuthTokenResponse refreshToken( + RESTClient client, + Map headers, + String subjectToken, + String subjectTokenType, + String scope) { + Map request = + tokenExchangeRequest( + subjectToken, + subjectTokenType, + scope != null ? ImmutableList.of(scope) : ImmutableList.of()); + + OAuthTokenResponse response = + client.postForm( + ResourcePaths.tokens(), + request, + OAuthTokenResponse.class, + headers, + ErrorHandlers.defaultErrorHandler()); response.validate(); return response; } - public static OAuthTokenResponse exchangeToken(RESTClient client, Map headers, - String subjectToken, String subjectTokenType, - String actorToken, String actorTokenType, String scope) { - Map request = tokenExchangeRequest( - subjectToken, subjectTokenType, actorToken, actorTokenType, - scope != null ? 
ImmutableList.of(scope) : ImmutableList.of()); - - OAuthTokenResponse response = client.postForm( - ResourcePaths.tokens(), request, OAuthTokenResponse.class, headers, ErrorHandlers.defaultErrorHandler()); + public static OAuthTokenResponse exchangeToken( + RESTClient client, + Map headers, + String subjectToken, + String subjectTokenType, + String actorToken, + String actorTokenType, + String scope) { + Map request = + tokenExchangeRequest( + subjectToken, + subjectTokenType, + actorToken, + actorTokenType, + scope != null ? ImmutableList.of(scope) : ImmutableList.of()); + + OAuthTokenResponse response = + client.postForm( + ResourcePaths.tokens(), + request, + OAuthTokenResponse.class, + headers, + ErrorHandlers.defaultErrorHandler()); response.validate(); return response; } - public static OAuthTokenResponse fetchToken(RESTClient client, Map headers, String credential, - String scope) { - Map request = clientCredentialsRequest( - credential, scope != null ? ImmutableList.of(scope) : ImmutableList.of()); - - OAuthTokenResponse response = client.postForm( - ResourcePaths.tokens(), request, OAuthTokenResponse.class, headers, ErrorHandlers.defaultErrorHandler()); + public static OAuthTokenResponse fetchToken( + RESTClient client, Map headers, String credential, String scope) { + Map request = + clientCredentialsRequest( + credential, scope != null ? ImmutableList.of(scope) : ImmutableList.of()); + + OAuthTokenResponse response = + client.postForm( + ResourcePaths.tokens(), + request, + OAuthTokenResponse.class, + headers, + ErrorHandlers.defaultErrorHandler()); response.validate(); return response; } - private static Map tokenExchangeRequest(String subjectToken, String subjectTokenType, - List scopes) { + private static Map tokenExchangeRequest( + String subjectToken, String subjectTokenType, List scopes) { return tokenExchangeRequest(subjectToken, subjectTokenType, null, null, scopes); } - private static Map tokenExchangeRequest(String subjectToken, String subjectTokenType, - String actorToken, String actorTokenType, - List scopes) { - Preconditions.checkArgument(VALID_TOKEN_TYPES.contains(subjectTokenType), - "Invalid token type: %s", subjectTokenType); - Preconditions.checkArgument(actorToken == null || VALID_TOKEN_TYPES.contains(actorTokenType), - "Invalid token type: %s", actorTokenType); + private static Map tokenExchangeRequest( + String subjectToken, + String subjectTokenType, + String actorToken, + String actorTokenType, + List scopes) { + Preconditions.checkArgument( + VALID_TOKEN_TYPES.contains(subjectTokenType), "Invalid token type: %s", subjectTokenType); + Preconditions.checkArgument( + actorToken == null || VALID_TOKEN_TYPES.contains(actorTokenType), + "Invalid token type: %s", + actorTokenType); ImmutableMap.Builder formData = ImmutableMap.builder(); formData.put(GRANT_TYPE, TOKEN_EXCHANGE); @@ -193,13 +232,14 @@ private static Pair parseCredential(String credential) { } } - private static Map clientCredentialsRequest(String credential, List scopes) { + private static Map clientCredentialsRequest( + String credential, List scopes) { Pair credentialPair = parseCredential(credential); return clientCredentialsRequest(credentialPair.first(), credentialPair.second(), scopes); } - private static Map clientCredentialsRequest(String clientId, String clientSecret, - List scopes) { + private static Map clientCredentialsRequest( + String clientId, String clientSecret, List scopes) { ImmutableMap.Builder formData = ImmutableMap.builder(); formData.put(GRANT_TYPE, CLIENT_CREDENTIALS); if 
(clientId != null) { @@ -223,7 +263,8 @@ public static String tokenResponseToJson(OAuthTokenResponse response) { } } - public static void tokenResponseToJson(OAuthTokenResponse response, JsonGenerator gen) throws IOException { + public static void tokenResponseToJson(OAuthTokenResponse response, JsonGenerator gen) + throws IOException { response.validate(); gen.writeStartObject(); @@ -255,12 +296,14 @@ public static OAuthTokenResponse tokenResponseFromJson(String json) { } public static OAuthTokenResponse tokenResponseFromJson(JsonNode json) { - Preconditions.checkArgument(json.isObject(), "Cannot parse token response from non-object: %s", json); + Preconditions.checkArgument( + json.isObject(), "Cannot parse token response from non-object: %s", json); - OAuthTokenResponse.Builder builder = OAuthTokenResponse.builder() - .withToken(JsonUtil.getString(ACCESS_TOKEN, json)) - .withTokenType(JsonUtil.getString(TOKEN_TYPE, json)) - .withIssuedTokenType(JsonUtil.getStringOrNull(ISSUED_TOKEN_TYPE, json)); + OAuthTokenResponse.Builder builder = + OAuthTokenResponse.builder() + .withToken(JsonUtil.getString(ACCESS_TOKEN, json)) + .withTokenType(JsonUtil.getString(TOKEN_TYPE, json)) + .withIssuedTokenType(JsonUtil.getStringOrNull(ISSUED_TOKEN_TYPE, json)); if (json.has(EXPIRES_IN)) { builder.setExpirationInSeconds(JsonUtil.getInt(EXPIRES_IN, json)); @@ -273,9 +316,7 @@ public static OAuthTokenResponse tokenResponseFromJson(JsonNode json) { return builder.build(); } - /** - * Class to handle authorization headers and token refresh. - */ + /** Class to handle authorization headers and token refresh. */ public static class AuthSession { private Map headers; private String token; @@ -313,17 +354,25 @@ public void stopRefreshing() { public Pair refresh(RESTClient client) { if (token != null && keepRefreshed) { AtomicReference ref = new AtomicReference<>(null); - boolean isSuccessful = Tasks.foreach(ref) - .suppressFailureWhenFinished() - .retry(5) - .onFailure((task, err) -> LOG.warn("Failed to refresh token", err)) - .exponentialBackoff( - COMMIT_MIN_RETRY_WAIT_MS_DEFAULT, - COMMIT_MAX_RETRY_WAIT_MS_DEFAULT, - COMMIT_TOTAL_RETRY_TIME_MS_DEFAULT, - 2.0 /* exponential */) - .run(holder -> - holder.set(refreshToken(client, headers(), token, tokenType, OAuth2Properties.CATALOG_SCOPE))); + boolean isSuccessful = + Tasks.foreach(ref) + .suppressFailureWhenFinished() + .retry(5) + .onFailure((task, err) -> LOG.warn("Failed to refresh token", err)) + .exponentialBackoff( + COMMIT_MIN_RETRY_WAIT_MS_DEFAULT, + COMMIT_MAX_RETRY_WAIT_MS_DEFAULT, + COMMIT_TOTAL_RETRY_TIME_MS_DEFAULT, + 2.0 /* exponential */) + .run( + holder -> + holder.set( + refreshToken( + client, + headers(), + token, + tokenType, + OAuth2Properties.CATALOG_SCOPE))); if (!isSuccessful || ref.get() == null) { return null; diff --git a/core/src/main/java/org/apache/iceberg/rest/requests/CreateNamespaceRequest.java b/core/src/main/java/org/apache/iceberg/rest/requests/CreateNamespaceRequest.java index 577b74140943..9b5180a1d71f 100644 --- a/core/src/main/java/org/apache/iceberg/rest/requests/CreateNamespaceRequest.java +++ b/core/src/main/java/org/apache/iceberg/rest/requests/CreateNamespaceRequest.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
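The OAuth2Util changes above are purely formatting, but the token-response JSON helpers they touch are easiest to see in use. Below is a small sketch of that round trip, assuming a dummy token value and the standard "catalog" scope; the parser calls appear in the hunks above, and the builder methods match the OAuthTokenResponse class further down in this patch.

import org.apache.iceberg.relocated.com.google.common.collect.ImmutableList;
import org.apache.iceberg.rest.auth.OAuth2Util;
import org.apache.iceberg.rest.responses.OAuthTokenResponse;

public class OAuthTokenJsonExample {
  public static void main(String[] args) {
    // Build a response the way a server would answer a client-credentials or token-exchange call.
    OAuthTokenResponse response =
        OAuthTokenResponse.builder()
            .withToken("dummy-access-token")
            .withTokenType("bearer")
            .setExpirationInSeconds(3600)
            .addScopes(ImmutableList.of("catalog"))
            .build();

    // Serialize to the OAuth2-style JSON body and parse it back.
    String json = OAuth2Util.tokenResponseToJson(response);
    OAuthTokenResponse parsed = OAuth2Util.tokenResponseFromJson(json);
    System.out.println(json);
    System.out.println(parsed.token()); // dummy-access-token
  }
}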
*/ - package org.apache.iceberg.rest.requests; import java.util.Map; @@ -28,9 +27,7 @@ import org.apache.iceberg.relocated.com.google.common.collect.Maps; import org.apache.iceberg.rest.RESTRequest; -/** - * A REST request to create a namespace, with an optional set of properties. - */ +/** A REST request to create a namespace, with an optional set of properties. */ public class CreateNamespaceRequest implements RESTRequest { private Namespace namespace; @@ -75,8 +72,7 @@ public static class Builder { private Namespace namespace; private final ImmutableMap.Builder properties = ImmutableMap.builder(); - private Builder() { - } + private Builder() {} public Builder withNamespace(Namespace ns) { Preconditions.checkNotNull(ns, "Invalid namespace: null"); @@ -87,8 +83,10 @@ public Builder withNamespace(Namespace ns) { public Builder setProperties(Map props) { Preconditions.checkNotNull(props, "Invalid collection of properties: null"); Preconditions.checkArgument(!props.containsKey(null), "Invalid property: null"); - Preconditions.checkArgument(!props.containsValue(null), - "Invalid value for properties %s: null", Maps.filterValues(props, Objects::isNull).keySet()); + Preconditions.checkArgument( + !props.containsValue(null), + "Invalid value for properties %s: null", + Maps.filterValues(props, Objects::isNull).keySet()); properties.putAll(props); return this; } diff --git a/core/src/main/java/org/apache/iceberg/rest/requests/CreateTableRequest.java b/core/src/main/java/org/apache/iceberg/rest/requests/CreateTableRequest.java index 49796541dadf..a84b1fa8802b 100644 --- a/core/src/main/java/org/apache/iceberg/rest/requests/CreateTableRequest.java +++ b/core/src/main/java/org/apache/iceberg/rest/requests/CreateTableRequest.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.rest.requests; import java.util.Map; @@ -33,8 +32,8 @@ import org.apache.iceberg.rest.RESTRequest; /** - * A REST request to create a table, either via direct commit or staging the creation - * of the table as part of a transaction. + * A REST request to create a table, either via direct commit or staging the creation of the table + * as part of a transaction. */ public class CreateTableRequest implements RESTRequest { @@ -50,8 +49,14 @@ public CreateTableRequest() { // Needed for Jackson Deserialization. 
} - private CreateTableRequest(String name, String location, Schema schema, PartitionSpec partitionSpec, - SortOrder writeOrder, Map properties, boolean stageCreate) { + private CreateTableRequest( + String name, + String location, + Schema schema, + PartitionSpec partitionSpec, + SortOrder writeOrder, + Map properties, + boolean stageCreate) { this.name = name; this.location = location; this.schema = schema; @@ -123,8 +128,7 @@ public static class Builder { private final ImmutableMap.Builder properties = ImmutableMap.builder(); private boolean stageCreate = false; - private Builder() { - } + private Builder() {} public Builder withName(String tableName) { Preconditions.checkNotNull(tableName, "Invalid name: null"); @@ -147,8 +151,10 @@ public Builder setProperty(String property, String value) { public Builder setProperties(Map props) { Preconditions.checkNotNull(props, "Invalid collection of properties: null"); Preconditions.checkArgument(!props.containsKey(null), "Invalid property: null"); - Preconditions.checkArgument(!props.containsValue(null), - "Invalid value for properties %s: null", Maps.filterValues(props, Objects::isNull).keySet()); + Preconditions.checkArgument( + !props.containsValue(null), + "Invalid value for properties %s: null", + Maps.filterValues(props, Objects::isNull).keySet()); properties.putAll(props); return this; } @@ -175,7 +181,8 @@ public Builder stageCreate() { } public CreateTableRequest build() { - return new CreateTableRequest(name, location, schema, partitionSpec, writeOrder, properties.build(), stageCreate); + return new CreateTableRequest( + name, location, schema, partitionSpec, writeOrder, properties.build(), stageCreate); } } } diff --git a/core/src/main/java/org/apache/iceberg/rest/requests/RenameTableRequest.java b/core/src/main/java/org/apache/iceberg/rest/requests/RenameTableRequest.java index c3430baec584..bb44410f2bc6 100644 --- a/core/src/main/java/org/apache/iceberg/rest/requests/RenameTableRequest.java +++ b/core/src/main/java/org/apache/iceberg/rest/requests/RenameTableRequest.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.rest.requests; import org.apache.iceberg.catalog.TableIdentifier; @@ -24,9 +23,7 @@ import org.apache.iceberg.relocated.com.google.common.base.Preconditions; import org.apache.iceberg.rest.RESTRequest; -/** - * A REST request to rename a table. - */ +/** A REST request to rename a table. */ public class RenameTableRequest implements RESTRequest { private TableIdentifier source; @@ -73,8 +70,7 @@ public static class Builder { private TableIdentifier source; private TableIdentifier destination; - private Builder() { - } + private Builder() {} public Builder withSource(TableIdentifier sourceTable) { Preconditions.checkNotNull(sourceTable, "Invalid source table identifier: null"); diff --git a/core/src/main/java/org/apache/iceberg/rest/requests/UpdateNamespacePropertiesRequest.java b/core/src/main/java/org/apache/iceberg/rest/requests/UpdateNamespacePropertiesRequest.java index 2bdbe7721932..e1fd59980ca7 100644 --- a/core/src/main/java/org/apache/iceberg/rest/requests/UpdateNamespacePropertiesRequest.java +++ b/core/src/main/java/org/apache/iceberg/rest/requests/UpdateNamespacePropertiesRequest.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
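A short sketch of building the rename request reformatted above. withSource appears in the hunk; withDestination is assumed to be its counterpart setter on the same builder, and both table names are hypothetical.

import org.apache.iceberg.catalog.TableIdentifier;
import org.apache.iceberg.rest.requests.RenameTableRequest;

public class RenameTableRequestExample {
  public static void main(String[] args) {
    RenameTableRequest request =
        RenameTableRequest.builder()
            .withSource(TableIdentifier.of("accounting", "tax_2021"))
            .withDestination(TableIdentifier.of("accounting", "tax_2021_archived"))
            .build();
    // validate() enforces that both identifiers were supplied.
    request.validate();
  }
}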
*/ - package org.apache.iceberg.rest.requests; import java.util.Collection; @@ -34,9 +33,7 @@ import org.apache.iceberg.relocated.com.google.common.collect.Sets; import org.apache.iceberg.rest.RESTRequest; -/** - * A REST request to set and/or remove properties on a namespace. - */ +/** A REST request to set and/or remove properties on a namespace. */ public class UpdateNamespacePropertiesRequest implements RESTRequest { private List removals; @@ -85,40 +82,34 @@ public static class Builder { private final ImmutableSet.Builder removalsBuilder = ImmutableSet.builder(); private final ImmutableMap.Builder updatesBuilder = ImmutableMap.builder(); - private Builder() { - } + private Builder() {} public Builder remove(String removal) { - Preconditions.checkNotNull(removal, - "Invalid property to remove: null"); + Preconditions.checkNotNull(removal, "Invalid property to remove: null"); removalsBuilder.add(removal); return this; } public Builder removeAll(Collection removals) { - Preconditions.checkNotNull(removals, - "Invalid list of properties to remove: null"); - Preconditions.checkArgument(!removals.contains(null), - "Invalid property to remove: null"); + Preconditions.checkNotNull(removals, "Invalid list of properties to remove: null"); + Preconditions.checkArgument(!removals.contains(null), "Invalid property to remove: null"); removalsBuilder.addAll(removals); return this; } public Builder update(String key, String value) { - Preconditions.checkNotNull(key, - "Invalid property to update: null"); - Preconditions.checkNotNull(value, - "Invalid value to update for key [%s]: null. Use remove instead", key); + Preconditions.checkNotNull(key, "Invalid property to update: null"); + Preconditions.checkNotNull( + value, "Invalid value to update for key [%s]: null. Use remove instead", key); updatesBuilder.put(key, value); return this; } public Builder updateAll(Map updates) { - Preconditions.checkNotNull(updates, - "Invalid collection of properties to update: null"); - Preconditions.checkArgument(!updates.containsKey(null), - "Invalid property to update: null"); - Preconditions.checkArgument(!updates.containsValue(null), + Preconditions.checkNotNull(updates, "Invalid collection of properties to update: null"); + Preconditions.checkArgument(!updates.containsKey(null), "Invalid property to update: null"); + Preconditions.checkArgument( + !updates.containsValue(null), "Invalid value to update for properties %s: null. Use remove instead", Maps.filterValues(updates, Objects::isNull).keySet()); updatesBuilder.putAll(updates); @@ -133,4 +124,3 @@ public UpdateNamespacePropertiesRequest build() { } } } - diff --git a/core/src/main/java/org/apache/iceberg/rest/requests/UpdateRequirementParser.java b/core/src/main/java/org/apache/iceberg/rest/requests/UpdateRequirementParser.java index 60029087b577..cc8aa3f7c4a6 100644 --- a/core/src/main/java/org/apache/iceberg/rest/requests/UpdateRequirementParser.java +++ b/core/src/main/java/org/apache/iceberg/rest/requests/UpdateRequirementParser.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
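The builder Preconditions reformatted above reject null keys and null values; removals and updates travel together in a single request. A minimal sketch with made-up property keys:

import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap;
import org.apache.iceberg.rest.requests.UpdateNamespacePropertiesRequest;

public class UpdateNamespacePropertiesRequestExample {
  public static void main(String[] args) {
    UpdateNamespacePropertiesRequest request =
        UpdateNamespacePropertiesRequest.builder()
            .update("owner", "data-eng")
            .updateAll(ImmutableMap.of("location", "s3://bucket/warehouse/accounting"))
            .remove("deprecated-key")
            .build();
    request.validate();
  }
}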
*/ - package org.apache.iceberg.rest.requests; import com.fasterxml.jackson.core.JsonGenerator; @@ -33,8 +32,7 @@ public class UpdateRequirementParser { - private UpdateRequirementParser() { - } + private UpdateRequirementParser() {} private static final String TYPE = "type"; @@ -70,17 +68,19 @@ private UpdateRequirementParser() { // AssertDefaultSortOrderID private static final String SORT_ORDER_ID = "default-sort-order-id"; - private static final Map, String> TYPES = ImmutableMap - ., String>builder() - .put(UpdateRequirement.AssertTableUUID.class, ASSERT_TABLE_UUID) - .put(UpdateRequirement.AssertTableDoesNotExist.class, ASSERT_TABLE_DOES_NOT_EXIST) - .put(UpdateRequirement.AssertRefSnapshotID.class, ASSERT_REF_SNAPSHOT_ID) - .put(UpdateRequirement.AssertLastAssignedFieldId.class, ASSERT_LAST_ASSIGNED_FIELD_ID) - .put(UpdateRequirement.AssertCurrentSchemaID.class, ASSERT_CURRENT_SCHEMA_ID) - .put(UpdateRequirement.AssertLastAssignedPartitionId.class, ASSERT_LAST_ASSIGNED_PARTITION_ID) - .put(UpdateRequirement.AssertDefaultSpecID.class, ASSERT_DEFAULT_SPEC_ID) - .put(UpdateRequirement.AssertDefaultSortOrderID.class, ASSERT_DEFAULT_SORT_ORDER_ID) - .build(); + private static final Map, String> TYPES = + ImmutableMap., String>builder() + .put(UpdateRequirement.AssertTableUUID.class, ASSERT_TABLE_UUID) + .put(UpdateRequirement.AssertTableDoesNotExist.class, ASSERT_TABLE_DOES_NOT_EXIST) + .put(UpdateRequirement.AssertRefSnapshotID.class, ASSERT_REF_SNAPSHOT_ID) + .put(UpdateRequirement.AssertLastAssignedFieldId.class, ASSERT_LAST_ASSIGNED_FIELD_ID) + .put(UpdateRequirement.AssertCurrentSchemaID.class, ASSERT_CURRENT_SCHEMA_ID) + .put( + UpdateRequirement.AssertLastAssignedPartitionId.class, + ASSERT_LAST_ASSIGNED_PARTITION_ID) + .put(UpdateRequirement.AssertDefaultSpecID.class, ASSERT_DEFAULT_SPEC_ID) + .put(UpdateRequirement.AssertDefaultSortOrderID.class, ASSERT_DEFAULT_SORT_ORDER_ID) + .build(); public static String toJson(UpdateRequirement updateRequirement) { return toJson(updateRequirement, false); @@ -102,7 +102,8 @@ public static String toJson(UpdateRequirement updateRequirement, boolean pretty) } } - public static void toJson(UpdateRequirement updateRequirement, JsonGenerator generator) throws IOException { + public static void toJson(UpdateRequirement updateRequirement, JsonGenerator generator) + throws IOException { String requirementType = TYPES.get(updateRequirement.getClass()); generator.writeStartObject(); @@ -116,27 +117,34 @@ public static void toJson(UpdateRequirement updateRequirement, JsonGenerator gen writeAssertTableUUID((UpdateRequirement.AssertTableUUID) updateRequirement, generator); break; case ASSERT_REF_SNAPSHOT_ID: - writeAssertRefSnapshotId((UpdateRequirement.AssertRefSnapshotID) updateRequirement, generator); + writeAssertRefSnapshotId( + (UpdateRequirement.AssertRefSnapshotID) updateRequirement, generator); break; case ASSERT_LAST_ASSIGNED_FIELD_ID: - writeAssertLastAssignedFieldId((UpdateRequirement.AssertLastAssignedFieldId) updateRequirement, generator); + writeAssertLastAssignedFieldId( + (UpdateRequirement.AssertLastAssignedFieldId) updateRequirement, generator); break; case ASSERT_LAST_ASSIGNED_PARTITION_ID: writeAssertLastAssignedPartitionId( (UpdateRequirement.AssertLastAssignedPartitionId) updateRequirement, generator); break; case ASSERT_CURRENT_SCHEMA_ID: - writeAssertCurrentSchemaId((UpdateRequirement.AssertCurrentSchemaID) updateRequirement, generator); + writeAssertCurrentSchemaId( + (UpdateRequirement.AssertCurrentSchemaID) updateRequirement, 
generator); break; case ASSERT_DEFAULT_SPEC_ID: - writeAssertDefaultSpecId((UpdateRequirement.AssertDefaultSpecID) updateRequirement, generator); + writeAssertDefaultSpecId( + (UpdateRequirement.AssertDefaultSpecID) updateRequirement, generator); break; case ASSERT_DEFAULT_SORT_ORDER_ID: - writeAssertDefaultSortOrderId((UpdateRequirement.AssertDefaultSortOrderID) updateRequirement, generator); + writeAssertDefaultSortOrderId( + (UpdateRequirement.AssertDefaultSortOrderID) updateRequirement, generator); break; default: throw new IllegalArgumentException( - String.format("Cannot convert update requirement to json. Unrecognized type: %s", requirementType)); + String.format( + "Cannot convert update requirement to json. Unrecognized type: %s", + requirementType)); } generator.writeEndObject(); @@ -157,9 +165,12 @@ public static UpdateRequirement fromJson(String json) { } public static UpdateRequirement fromJson(JsonNode jsonNode) { - Preconditions.checkArgument(jsonNode != null && jsonNode.isObject(), - "Cannot parse update requirement from non-object value: %s", jsonNode); - Preconditions.checkArgument(jsonNode.hasNonNull(TYPE), "Cannot parse update requirement. Missing field: type"); + Preconditions.checkArgument( + jsonNode != null && jsonNode.isObject(), + "Cannot parse update requirement from non-object value: %s", + jsonNode); + Preconditions.checkArgument( + jsonNode.hasNonNull(TYPE), "Cannot parse update requirement. Missing field: type"); String type = JsonUtil.getString(TYPE, jsonNode).toLowerCase(Locale.ROOT); switch (type) { @@ -185,13 +196,13 @@ public static UpdateRequirement fromJson(JsonNode jsonNode) { } } - private static void writeAssertTableUUID(UpdateRequirement.AssertTableUUID requirement, JsonGenerator gen) - throws IOException { + private static void writeAssertTableUUID( + UpdateRequirement.AssertTableUUID requirement, JsonGenerator gen) throws IOException { gen.writeStringField(UUID, requirement.uuid()); } - private static void writeAssertRefSnapshotId(UpdateRequirement.AssertRefSnapshotID requirement, JsonGenerator gen) - throws IOException { + private static void writeAssertRefSnapshotId( + UpdateRequirement.AssertRefSnapshotID requirement, JsonGenerator gen) throws IOException { gen.writeStringField(NAME, requirement.refName()); if (requirement.snapshotId() != null) { gen.writeNumberField(SNAPSHOT_ID, requirement.snapshotId()); @@ -200,32 +211,36 @@ private static void writeAssertRefSnapshotId(UpdateRequirement.AssertRefSnapshot } } - private static void writeAssertLastAssignedFieldId(UpdateRequirement.AssertLastAssignedFieldId requirement, - JsonGenerator gen) throws IOException { + private static void writeAssertLastAssignedFieldId( + UpdateRequirement.AssertLastAssignedFieldId requirement, JsonGenerator gen) + throws IOException { gen.writeNumberField(LAST_ASSIGNED_FIELD_ID, requirement.lastAssignedFieldId()); } - private static void writeAssertLastAssignedPartitionId(UpdateRequirement.AssertLastAssignedPartitionId requirement, - JsonGenerator gen) throws IOException { + private static void writeAssertLastAssignedPartitionId( + UpdateRequirement.AssertLastAssignedPartitionId requirement, JsonGenerator gen) + throws IOException { gen.writeNumberField(LAST_ASSIGNED_PARTITION_ID, requirement.lastAssignedPartitionId()); } - private static void writeAssertCurrentSchemaId(UpdateRequirement.AssertCurrentSchemaID requirement, - JsonGenerator gen) throws IOException { + private static void writeAssertCurrentSchemaId( + UpdateRequirement.AssertCurrentSchemaID requirement, 
JsonGenerator gen) throws IOException { gen.writeNumberField(SCHEMA_ID, requirement.schemaId()); } - private static void writeAssertDefaultSpecId(UpdateRequirement.AssertDefaultSpecID requirement, JsonGenerator gen) - throws IOException { + private static void writeAssertDefaultSpecId( + UpdateRequirement.AssertDefaultSpecID requirement, JsonGenerator gen) throws IOException { gen.writeNumberField(SPEC_ID, requirement.specId()); } - private static void writeAssertDefaultSortOrderId(UpdateRequirement.AssertDefaultSortOrderID requirement, - JsonGenerator gen) throws IOException { + private static void writeAssertDefaultSortOrderId( + UpdateRequirement.AssertDefaultSortOrderID requirement, JsonGenerator gen) + throws IOException { gen.writeNumberField(SORT_ORDER_ID, requirement.sortOrderId()); } - @SuppressWarnings("unused") // Keep same signature in case this requirement class evolves and gets fields + @SuppressWarnings( + "unused") // Keep same signature in case this requirement class evolves and gets fields private static UpdateRequirement readAssertTableDoesNotExist(JsonNode node) { return new UpdateRequirement.AssertTableDoesNotExist(); } diff --git a/core/src/main/java/org/apache/iceberg/rest/requests/UpdateTableRequest.java b/core/src/main/java/org/apache/iceberg/rest/requests/UpdateTableRequest.java index aa2580cdd0bf..694e44e841dc 100644 --- a/core/src/main/java/org/apache/iceberg/rest/requests/UpdateTableRequest.java +++ b/core/src/main/java/org/apache/iceberg/rest/requests/UpdateTableRequest.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.rest.requests; import java.util.List; @@ -47,8 +46,7 @@ public UpdateTableRequest(List requirements, List requirements() { return requirements != null ? requirements : ImmutableList.of(); @@ -227,8 +225,7 @@ public interface UpdateRequirement { void validate(TableMetadata base); class AssertTableDoesNotExist implements UpdateRequirement { - AssertTableDoesNotExist() { - } + AssertTableDoesNotExist() {} @Override public void validate(TableMetadata base) { diff --git a/core/src/main/java/org/apache/iceberg/rest/responses/ConfigResponse.java b/core/src/main/java/org/apache/iceberg/rest/responses/ConfigResponse.java index 0029770b67bd..f4efc0ff281a 100644 --- a/core/src/main/java/org/apache/iceberg/rest/responses/ConfigResponse.java +++ b/core/src/main/java/org/apache/iceberg/rest/responses/ConfigResponse.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.rest.responses; import java.util.Map; @@ -28,16 +27,18 @@ import org.apache.iceberg.rest.RESTResponse; /** - * Represents a response to requesting server-side provided configuration for the REST catalog. - * This allows client provided values to be overridden by the server or defaulted if not provided by the client. - *

- * The catalog properties, with overrides and defaults applied, should be used to configure the catalog and for all - * subsequent requests after this initial config request. - *

- * Configuration from the server consists of two sets of key/value pairs. + * Represents a response to requesting server-side provided configuration for the REST catalog. This + * allows client provided values to be overridden by the server or defaulted if not provided by the + * client. + * + *

The catalog properties, with overrides and defaults applied, should be used to configure the + * catalog and for all subsequent requests after this initial config request. + * + *

Configuration from the server consists of two sets of key/value pairs. + * *

    - *
  • defaults - properties that should be used as default configuration
  • - *
  • overrides - properties that should be used to override client configuration
  • + *
  • defaults - properties that should be used as default configuration + *
  • overrides - properties that should be used to override client configuration *
*/ public class ConfigResponse implements RESTResponse { @@ -56,12 +57,11 @@ private ConfigResponse(Map defaults, Map overrid } @Override - public void validate() { - } + public void validate() {} /** - * Properties that should be used as default configuration. {@code defaults} have the lowest priority - * and should be applied before the client provided configuration. + * Properties that should be used as default configuration. {@code defaults} have the lowest + * priority and should be applied before the client provided configuration. * * @return properties that should be used as default configuration */ @@ -70,8 +70,9 @@ public Map defaults() { } /** - * Properties that should be used to override client configuration. {@code overrides} have the highest priority - * and should be applied after defaults and any client-provided configuration properties. + * Properties that should be used to override client configuration. {@code overrides} have the + * highest priority and should be applied after defaults and any client-provided configuration + * properties. * * @return properties that should be given higher precedence than any client provided input */ @@ -84,10 +85,12 @@ public Map overrides() { * properties map which will be used for instantiating and configuring the REST catalog. * * @param clientProperties - Client provided configuration - * @return Merged configuration, with precedence in the order overrides, then client properties, and then defaults. + * @return Merged configuration, with precedence in the order overrides, then client properties, + * and then defaults. */ public Map merge(Map clientProperties) { - Preconditions.checkNotNull(clientProperties, + Preconditions.checkNotNull( + clientProperties, "Cannot merge client properties with server-provided properties. Invalid client configuration: null"); Map merged = defaults != null ? Maps.newHashMap(defaults) : Maps.newHashMap(); merged.putAll(clientProperties); @@ -132,22 +135,20 @@ public Builder withOverride(String key, String value) { return this; } - /** - * Adds the passed in map entries to the existing `defaults` of this Builder. - */ + /** Adds the passed in map entries to the existing `defaults` of this Builder. */ public Builder withDefaults(Map defaultsToAdd) { Preconditions.checkNotNull(defaultsToAdd, "Invalid default properties map: null"); - Preconditions.checkArgument(!defaultsToAdd.containsKey(null), "Invalid default property: null"); + Preconditions.checkArgument( + !defaultsToAdd.containsKey(null), "Invalid default property: null"); defaults.putAll(defaultsToAdd); return this; } - /** - * Adds the passed in map entries to the existing `overrides` of this Builder. - */ + /** Adds the passed in map entries to the existing `overrides` of this Builder. 
*/ public Builder withOverrides(Map overridesToAdd) { Preconditions.checkNotNull(overridesToAdd, "Invalid override properties map: null"); - Preconditions.checkArgument(!overridesToAdd.containsKey(null), "Invalid override property: null"); + Preconditions.checkArgument( + !overridesToAdd.containsKey(null), "Invalid override property: null"); overrides.putAll(overridesToAdd); return this; } diff --git a/core/src/main/java/org/apache/iceberg/rest/responses/CreateNamespaceResponse.java b/core/src/main/java/org/apache/iceberg/rest/responses/CreateNamespaceResponse.java index bd722e399343..a09947589371 100644 --- a/core/src/main/java/org/apache/iceberg/rest/responses/CreateNamespaceResponse.java +++ b/core/src/main/java/org/apache/iceberg/rest/responses/CreateNamespaceResponse.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.rest.responses; import java.util.Map; @@ -28,9 +27,7 @@ import org.apache.iceberg.relocated.com.google.common.collect.Maps; import org.apache.iceberg.rest.RESTResponse; -/** - * Represents a REST response for a request to create a namespace / database. - */ +/** Represents a REST response for a request to create a namespace / database. */ public class CreateNamespaceResponse implements RESTResponse { private Namespace namespace; @@ -75,8 +72,7 @@ public static class Builder { private Namespace namespace; private final ImmutableMap.Builder properties = ImmutableMap.builder(); - private Builder() { - } + private Builder() {} public Builder withNamespace(Namespace ns) { Preconditions.checkNotNull(ns, "Invalid namespace: null"); @@ -85,12 +81,12 @@ public Builder withNamespace(Namespace ns) { } public Builder setProperties(Map props) { - Preconditions.checkNotNull(props, - "Invalid collection of properties: null"); - Preconditions.checkArgument(!props.containsKey(null), - "Invalid property to set: null"); - Preconditions.checkArgument(!props.containsValue(null), - "Invalid value to set for properties %s: null", Maps.filterValues(props, Objects::isNull).keySet()); + Preconditions.checkNotNull(props, "Invalid collection of properties: null"); + Preconditions.checkArgument(!props.containsKey(null), "Invalid property to set: null"); + Preconditions.checkArgument( + !props.containsValue(null), + "Invalid value to set for properties %s: null", + Maps.filterValues(props, Objects::isNull).keySet()); properties.putAll(props); return this; } @@ -100,4 +96,3 @@ public CreateNamespaceResponse build() { } } } - diff --git a/core/src/main/java/org/apache/iceberg/rest/responses/ErrorResponse.java b/core/src/main/java/org/apache/iceberg/rest/responses/ErrorResponse.java index 423d440b3991..5543259af1e0 100644 --- a/core/src/main/java/org/apache/iceberg/rest/responses/ErrorResponse.java +++ b/core/src/main/java/org/apache/iceberg/rest/responses/ErrorResponse.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
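The ConfigResponse javadoc above describes the precedence between defaults, client configuration, and overrides; merge() is where that ordering is applied. A sketch with made-up keys:

import java.util.Map;
import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap;
import org.apache.iceberg.rest.responses.ConfigResponse;

public class ConfigResponseMergeExample {
  public static void main(String[] args) {
    ConfigResponse response =
        ConfigResponse.builder()
            .withDefaults(ImmutableMap.of("clients", "4"))
            .withOverrides(ImmutableMap.of("warehouse", "s3://server-managed/warehouse"))
            .build();

    Map<String, String> merged =
        response.merge(
            ImmutableMap.of("clients", "8", "warehouse", "s3://client-config/warehouse"));

    System.out.println(merged.get("clients")); // "8": the client value beats the default
    System.out.println(merged.get("warehouse")); // "s3://server-managed/warehouse": the override wins
  }
}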
*/ - package org.apache.iceberg.rest.responses; import java.io.PrintWriter; @@ -26,9 +25,7 @@ import org.apache.iceberg.relocated.com.google.common.base.Preconditions; import org.apache.iceberg.rest.RESTResponse; -/** - * Standard response body for all API errors - */ +/** Standard response body for all API errors */ public class ErrorResponse implements RESTResponse { private String message; @@ -96,8 +93,7 @@ public static class Builder { private Integer code; private List stack; - private Builder() { - } + private Builder() {} public Builder withMessage(String errorMessage) { this.message = errorMessage; diff --git a/core/src/main/java/org/apache/iceberg/rest/responses/ErrorResponseParser.java b/core/src/main/java/org/apache/iceberg/rest/responses/ErrorResponseParser.java index da197a5b0276..5900e70bdf99 100644 --- a/core/src/main/java/org/apache/iceberg/rest/responses/ErrorResponseParser.java +++ b/core/src/main/java/org/apache/iceberg/rest/responses/ErrorResponseParser.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.rest.responses; import com.fasterxml.jackson.core.JsonGenerator; @@ -30,8 +29,7 @@ public class ErrorResponseParser { - private ErrorResponseParser() { - } + private ErrorResponseParser() {} private static final String ERROR = "error"; private static final String MESSAGE = "message"; @@ -54,11 +52,13 @@ public static String toJson(ErrorResponse errorResponse, boolean pretty) { generator.flush(); return writer.toString(); } catch (IOException e) { - throw new UncheckedIOException(String.format("Failed to write error response json for: %s", errorResponse), e); + throw new UncheckedIOException( + String.format("Failed to write error response json for: %s", errorResponse), e); } } - public static void toJson(ErrorResponse errorResponse, JsonGenerator generator) throws IOException { + public static void toJson(ErrorResponse errorResponse, JsonGenerator generator) + throws IOException { generator.writeStartObject(); generator.writeObjectFieldStart(ERROR); @@ -94,8 +94,10 @@ public static ErrorResponse fromJson(String json) { } public static ErrorResponse fromJson(JsonNode jsonNode) { - Preconditions.checkArgument(jsonNode != null && jsonNode.isObject(), - "Cannot parse error respone from non-object value: %s", jsonNode); + Preconditions.checkArgument( + jsonNode != null && jsonNode.isObject(), + "Cannot parse error respone from non-object value: %s", + jsonNode); Preconditions.checkArgument(jsonNode.has(ERROR), "Cannot parse missing field: error"); JsonNode error = jsonNode.get(ERROR); String message = JsonUtil.getStringOrNull(MESSAGE, error); @@ -110,4 +112,3 @@ public static ErrorResponse fromJson(JsonNode jsonNode) { .build(); } } - diff --git a/core/src/main/java/org/apache/iceberg/rest/responses/GetNamespaceResponse.java b/core/src/main/java/org/apache/iceberg/rest/responses/GetNamespaceResponse.java index 9955e29771c0..87c8a4f6fdd9 100644 --- a/core/src/main/java/org/apache/iceberg/rest/responses/GetNamespaceResponse.java +++ b/core/src/main/java/org/apache/iceberg/rest/responses/GetNamespaceResponse.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
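A sketch of parsing and re-serializing an error body with the parser reformatted above. The JSON payload is an assumed example of a 404 from a REST catalog server, and the message()/code() accessors come from the wider class rather than these hunks.

import org.apache.iceberg.rest.responses.ErrorResponse;
import org.apache.iceberg.rest.responses.ErrorResponseParser;

public class ErrorResponseJsonExample {
  public static void main(String[] args) {
    String json =
        "{\"error\":{\"message\":\"Table does not exist: ns.tbl\","
            + "\"type\":\"NoSuchTableException\",\"code\":404}}";

    ErrorResponse error = ErrorResponseParser.fromJson(json);
    System.out.println(error.message() + " (" + error.code() + ")");

    // Re-serialize with pretty printing for logging or debugging.
    System.out.println(ErrorResponseParser.toJson(error, true));
  }
}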
*/ - package org.apache.iceberg.rest.responses; import java.util.Map; @@ -28,9 +27,7 @@ import org.apache.iceberg.relocated.com.google.common.collect.Maps; import org.apache.iceberg.rest.RESTResponse; -/** - * Represents a REST response to fetch a namespace and its metadata properties - */ +/** Represents a REST response to fetch a namespace and its metadata properties */ public class GetNamespaceResponse implements RESTResponse { private Namespace namespace; @@ -73,10 +70,9 @@ public static Builder builder() { public static class Builder { private Namespace namespace; - private final ImmutableMap.Builder properties = ImmutableMap.builder(); + private final ImmutableMap.Builder properties = ImmutableMap.builder(); - private Builder() { - } + private Builder() {} public Builder withNamespace(Namespace ns) { Preconditions.checkNotNull(ns, "Invalid namespace: null"); @@ -86,9 +82,9 @@ public Builder withNamespace(Namespace ns) { public Builder setProperties(Map props) { Preconditions.checkNotNull(props, "Invalid properties map: null"); - Preconditions.checkArgument(!props.containsKey(null), - "Invalid property: null"); - Preconditions.checkArgument(!props.containsValue(null), + Preconditions.checkArgument(!props.containsKey(null), "Invalid property: null"); + Preconditions.checkArgument( + !props.containsValue(null), "Invalid value for properties %s: null", Maps.filterValues(props, Objects::isNull).keySet()); properties.putAll(props); @@ -100,4 +96,3 @@ public GetNamespaceResponse build() { } } } - diff --git a/core/src/main/java/org/apache/iceberg/rest/responses/ListNamespacesResponse.java b/core/src/main/java/org/apache/iceberg/rest/responses/ListNamespacesResponse.java index 7508edda6aea..13a599e1a76c 100644 --- a/core/src/main/java/org/apache/iceberg/rest/responses/ListNamespacesResponse.java +++ b/core/src/main/java/org/apache/iceberg/rest/responses/ListNamespacesResponse.java @@ -16,8 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - - package org.apache.iceberg.rest.responses; import java.util.Collection; @@ -46,16 +44,13 @@ public void validate() { Preconditions.checkArgument(namespaces != null, "Invalid namespace: null"); } - public List namespaces() { return namespaces != null ? namespaces : ImmutableList.of(); } @Override public String toString() { - return MoreObjects.toStringHelper(this) - .add("namespaces", namespaces()) - .toString(); + return MoreObjects.toStringHelper(this).add("namespaces", namespaces()).toString(); } public static Builder builder() { @@ -65,8 +60,7 @@ public static Builder builder() { public static class Builder { private final ImmutableList.Builder namespaces = ImmutableList.builder(); - private Builder() { - } + private Builder() {} public Builder add(Namespace toAdd) { Preconditions.checkNotNull(toAdd, "Invalid namespace: null"); @@ -86,4 +80,3 @@ public ListNamespacesResponse build() { } } } - diff --git a/core/src/main/java/org/apache/iceberg/rest/responses/ListTablesResponse.java b/core/src/main/java/org/apache/iceberg/rest/responses/ListTablesResponse.java index 7611b5e3e1c9..3c99c12c9023 100644 --- a/core/src/main/java/org/apache/iceberg/rest/responses/ListTablesResponse.java +++ b/core/src/main/java/org/apache/iceberg/rest/responses/ListTablesResponse.java @@ -16,8 +16,6 @@ * specific language governing permissions and limitations * under the License. 
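A sketch of assembling the ListNamespacesResponse reformatted above; the namespace names are made up, and namespaces() falls back to an empty list when nothing was added.

import org.apache.iceberg.catalog.Namespace;
import org.apache.iceberg.rest.responses.ListNamespacesResponse;

public class ListNamespacesResponseExample {
  public static void main(String[] args) {
    ListNamespacesResponse response =
        ListNamespacesResponse.builder()
            .add(Namespace.of("accounting"))
            .add(Namespace.of("accounting", "tax"))
            .build();
    System.out.println(response.namespaces()); // [accounting, accounting.tax]
  }
}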
*/ - - package org.apache.iceberg.rest.responses; import java.util.Collection; @@ -28,9 +26,7 @@ import org.apache.iceberg.relocated.com.google.common.collect.ImmutableList; import org.apache.iceberg.rest.RESTResponse; -/** - * A list of table identifiers in a given namespace. - */ +/** A list of table identifiers in a given namespace. */ public class ListTablesResponse implements RESTResponse { private List identifiers; @@ -55,9 +51,7 @@ public List identifiers() { @Override public String toString() { - return MoreObjects.toStringHelper(this) - .add("identifiers", identifiers) - .toString(); + return MoreObjects.toStringHelper(this).add("identifiers", identifiers).toString(); } public static Builder builder() { @@ -67,8 +61,7 @@ public static Builder builder() { public static class Builder { private final ImmutableList.Builder identifiers = ImmutableList.builder(); - private Builder() { - } + private Builder() {} public Builder add(TableIdentifier toAdd) { Preconditions.checkNotNull(toAdd, "Invalid table identifier: null"); @@ -88,4 +81,3 @@ public ListTablesResponse build() { } } } - diff --git a/core/src/main/java/org/apache/iceberg/rest/responses/LoadTableResponse.java b/core/src/main/java/org/apache/iceberg/rest/responses/LoadTableResponse.java index 01cd84bba8c5..a389479fff10 100644 --- a/core/src/main/java/org/apache/iceberg/rest/responses/LoadTableResponse.java +++ b/core/src/main/java/org/apache/iceberg/rest/responses/LoadTableResponse.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.rest.responses; import java.util.Map; @@ -29,11 +28,11 @@ /** * A REST response that is used when a table is successfully loaded. - *

- * This class is used whenever the response to a request is a table's requested metadata and the associated location - * of its metadata, to reduce code duplication. This includes using this class as the response for - * {@link org.apache.iceberg.rest.requests.CreateTableRequest}, including when that request is used to commit - * an already staged table creation as part of a transaction. + * + *

This class is used whenever the response to a request is a table's requested metadata and the + * associated location of its metadata, to reduce code duplication. This includes using this class + * as the response for {@link org.apache.iceberg.rest.requests.CreateTableRequest}, including when + * that request is used to commit an already staged table creation as part of a transaction. */ public class LoadTableResponse implements RESTResponse { @@ -45,7 +44,8 @@ public LoadTableResponse() { // Required for Jackson deserialization } - private LoadTableResponse(String metadataLocation, TableMetadata metadata, Map config) { + private LoadTableResponse( + String metadataLocation, TableMetadata metadata, Map config) { this.metadataLocation = metadataLocation; this.metadata = metadata; this.config = config; @@ -86,8 +86,7 @@ public static class Builder { private TableMetadata metadata; private Map config = Maps.newHashMap(); - private Builder() { - } + private Builder() {} public Builder withTableMetadata(TableMetadata tableMetadata) { this.metadataLocation = tableMetadata.metadataFileLocation(); diff --git a/core/src/main/java/org/apache/iceberg/rest/responses/OAuthTokenResponse.java b/core/src/main/java/org/apache/iceberg/rest/responses/OAuthTokenResponse.java index 239aa3ee476c..56c35f3cc6c6 100644 --- a/core/src/main/java/org/apache/iceberg/rest/responses/OAuthTokenResponse.java +++ b/core/src/main/java/org/apache/iceberg/rest/responses/OAuthTokenResponse.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.rest.responses; import java.util.List; @@ -33,8 +32,12 @@ public class OAuthTokenResponse implements RESTResponse { private final Integer expiresIn; private final String scope; - private OAuthTokenResponse(String accessToken, String issuedTokenType, String tokenType, Integer expiresIn, - String scope) { + private OAuthTokenResponse( + String accessToken, + String issuedTokenType, + String tokenType, + Integer expiresIn, + String scope) { this.accessToken = accessToken; this.issuedTokenType = issuedTokenType; this.tokenType = tokenType; @@ -45,8 +48,10 @@ private OAuthTokenResponse(String accessToken, String issuedTokenType, String to @Override public void validate() { Preconditions.checkNotNull(accessToken, "Invalid access token: null"); - Preconditions.checkArgument("bearer".equalsIgnoreCase(tokenType) || "N_A".equalsIgnoreCase(tokenType), - "Unsupported token type: %s (must be \"bearer\" or \"N_A\")", tokenType); + Preconditions.checkArgument( + "bearer".equalsIgnoreCase(tokenType) || "N_A".equalsIgnoreCase(tokenType), + "Unsupported token type: %s (must be \"bearer\" or \"N_A\")", + tokenType); } public String token() { @@ -80,8 +85,7 @@ public static class Builder { private Integer expiresInSeconds; private final List scopes = Lists.newArrayList(); - private Builder() { - } + private Builder() {} public Builder withToken(String token) { this.accessToken = token; @@ -116,7 +120,8 @@ public Builder addScopes(List scope) { public OAuthTokenResponse build() { String scope = scopes.isEmpty() ? 
null : OAuth2Util.toScope(scopes); - return new OAuthTokenResponse(accessToken, issuedTokenType, tokenType, expiresInSeconds, scope); + return new OAuthTokenResponse( + accessToken, issuedTokenType, tokenType, expiresInSeconds, scope); } } } diff --git a/core/src/main/java/org/apache/iceberg/rest/responses/UpdateNamespacePropertiesResponse.java b/core/src/main/java/org/apache/iceberg/rest/responses/UpdateNamespacePropertiesResponse.java index 344a67965d1c..b6186c5d7485 100644 --- a/core/src/main/java/org/apache/iceberg/rest/responses/UpdateNamespacePropertiesResponse.java +++ b/core/src/main/java/org/apache/iceberg/rest/responses/UpdateNamespacePropertiesResponse.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.rest.responses; import java.util.Collection; @@ -27,23 +26,23 @@ import org.apache.iceberg.relocated.com.google.common.collect.ImmutableSet; import org.apache.iceberg.rest.RESTResponse; -/** - * A REST response to a request to set and/or remove properties on a namespace. - */ +/** A REST response to a request to set and/or remove properties on a namespace. */ public class UpdateNamespacePropertiesResponse implements RESTResponse { // List of namespace property keys that were removed private List removed; // List of namespace property keys that were added or updated private List updated; - // List of properties that were requested for removal that were not found in the namespace's properties + // List of properties that were requested for removal that were not found in the namespace's + // properties private List missing; public UpdateNamespacePropertiesResponse() { // Required for Jackson deserialization } - private UpdateNamespacePropertiesResponse(List removed, List updated, List missing) { + private UpdateNamespacePropertiesResponse( + List removed, List updated, List missing) { this.removed = removed; this.updated = updated; this.missing = missing; @@ -51,8 +50,7 @@ private UpdateNamespacePropertiesResponse(List removed, List upd } @Override - public void validate() { - } + public void validate() {} public List removed() { return removed != null ? removed : ImmutableList.of(); @@ -84,8 +82,7 @@ public static class Builder { private final ImmutableSet.Builder updatedBuilder = ImmutableSet.builder(); private final ImmutableSet.Builder missingBuilder = ImmutableSet.builder(); - private Builder() { - } + private Builder() {} public Builder addMissing(String key) { Preconditions.checkNotNull(key, "Invalid missing property: null"); @@ -108,8 +105,7 @@ public Builder addRemoved(String key) { public Builder addRemoved(Collection removed) { Preconditions.checkNotNull(removed, "Invalid removed property list: null"); - Preconditions.checkArgument(!removed.contains(null), - "Invalid removed property: null"); + Preconditions.checkArgument(!removed.contains(null), "Invalid removed property: null"); removedBuilder.addAll(removed); return this; } @@ -135,4 +131,3 @@ public UpdateNamespacePropertiesResponse build() { } } } - diff --git a/core/src/main/java/org/apache/iceberg/schema/SchemaWithPartnerVisitor.java b/core/src/main/java/org/apache/iceberg/schema/SchemaWithPartnerVisitor.java index 871f4acaa7f2..9b2226f5714d 100644 --- a/core/src/main/java/org/apache/iceberg/schema/SchemaWithPartnerVisitor.java +++ b/core/src/main/java/org/apache/iceberg/schema/SchemaWithPartnerVisitor.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
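A sketch of the UpdateNamespacePropertiesResponse builder shown above, with made-up keys; missing() is assumed to mirror the removed() accessor visible in the hunk.

import org.apache.iceberg.rest.responses.UpdateNamespacePropertiesResponse;

public class UpdateNamespacePropertiesResponseExample {
  public static void main(String[] args) {
    // A possible server-side result: one key was removed, one requested key was never set.
    UpdateNamespacePropertiesResponse response =
        UpdateNamespacePropertiesResponse.builder()
            .addRemoved("deprecated-key")
            .addMissing("never-set-key")
            .build();
    System.out.println(response.removed()); // [deprecated-key]
    System.out.println(response.missing()); // [never-set-key]
  }
}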
*/ - package org.apache.iceberg.schema; import java.util.List; @@ -38,19 +37,25 @@ public interface PartnerAccessors<P> { P listElementPartner(P partnerList); } - public static <P, T> T visit(Schema schema, P partner, SchemaWithPartnerVisitor<P, T> visitor, - PartnerAccessors<P> accessors) { + public static <P, T> T visit( + Schema schema, + P partner, + SchemaWithPartnerVisitor<P, T> visitor, + PartnerAccessors<P> accessors) { return visitor.schema(schema, partner, visit(schema.asStruct(), partner, visitor, accessors)); } - public static <P, T> T visit(Type type, P partner, SchemaWithPartnerVisitor<P, T> visitor, - PartnerAccessors<P> accessors) { + public static <P, T> T visit( + Type type, P partner, SchemaWithPartnerVisitor<P, T> visitor, PartnerAccessors<P>
accessors) { switch (type.typeId()) { case STRUCT: Types.StructType struct = type.asNestedType().asStructType(); List results = Lists.newArrayListWithExpectedSize(struct.fields().size()); for (Types.NestedField field : struct.fields()) { - P fieldPartner = partner != null ? accessors.fieldPartner(partner, field.fieldId(), field.name()) : null; + P fieldPartner = + partner != null + ? accessors.fieldPartner(partner, field.fieldId(), field.name()) + : null; visitor.beforeField(field, fieldPartner); T result; try { @@ -107,11 +112,9 @@ public static T visit(Type type, P partner, SchemaWithPartnerVisitor { @@ -43,19 +42,24 @@ private UnionByNameVisitor(UpdateSchema api, Schema partnerSchema) { /** * Adds changes needed to produce a union of two schemas to an {@link UpdateSchema} operation. - *

<p> - * Changes are accumulated to evolve the existingSchema into a union with newSchema. + * + * <p>
Changes are accumulated to evolve the existingSchema into a union with newSchema. * * @param api an UpdateSchema for adding changes * @param existingSchema an existing schema * @param newSchema a new schema to compare with the existing */ public static void visit(UpdateSchema api, Schema existingSchema, Schema newSchema) { - visit(newSchema, -1, new UnionByNameVisitor(api, existingSchema), new PartnerIdByNameAccessors(existingSchema)); + visit( + newSchema, + -1, + new UnionByNameVisitor(api, existingSchema), + new PartnerIdByNameAccessors(existingSchema)); } @Override - public Boolean struct(Types.StructType struct, Integer partnerId, List missingPositions) { + public Boolean struct( + Types.StructType struct, Integer partnerId, List missingPositions) { if (partnerId == null) { return true; } @@ -63,15 +67,16 @@ public Boolean struct(Types.StructType struct, Integer partnerId, List List fields = struct.fields(); Types.StructType partnerStruct = findFieldType(partnerId).asStructType(); IntStream.range(0, missingPositions.size()) - .forEach(pos -> { - Boolean isMissing = missingPositions.get(pos); - Types.NestedField field = fields.get(pos); - if (isMissing) { - addColumn(partnerId, field); - } else { - updateColumn(field, partnerStruct.field(field.name())); - } - }); + .forEach( + pos -> { + Boolean isMissing = missingPositions.get(pos); + Types.NestedField field = fields.get(pos); + if (isMissing) { + addColumn(partnerId, field); + } else { + updateColumn(field, partnerStruct.field(field.name())); + } + }); return false; } @@ -87,7 +92,8 @@ public Boolean list(Types.ListType list, Integer partnerId, Boolean isElementMis return true; } - Preconditions.checkState(!isElementMissing, "Error traversing schemas: element is missing, but list is present"); + Preconditions.checkState( + !isElementMissing, "Error traversing schemas: element is missing, but list is present"); Types.ListType partnerList = findFieldType(partnerId).asListType(); updateColumn(list.fields().get(0), partnerList.fields().get(0)); @@ -96,13 +102,16 @@ public Boolean list(Types.ListType list, Integer partnerId, Boolean isElementMis } @Override - public Boolean map(Types.MapType map, Integer partnerId, Boolean isKeyMissing, Boolean isValueMissing) { + public Boolean map( + Types.MapType map, Integer partnerId, Boolean isKeyMissing, Boolean isValueMissing) { if (partnerId == null) { return true; } - Preconditions.checkState(!isKeyMissing, "Error traversing schemas: key is missing, but map is present"); - Preconditions.checkState(!isValueMissing, "Error traversing schemas: value is missing, but map is present"); + Preconditions.checkState( + !isKeyMissing, "Error traversing schemas: key is missing, but map is present"); + Preconditions.checkState( + !isValueMissing, "Error traversing schemas: value is missing, but map is present"); Types.MapType partnerMap = findFieldType(partnerId).asMapType(); updateColumn(map.fields().get(0), partnerMap.fields().get(0)); @@ -133,7 +142,8 @@ private void updateColumn(Types.NestedField field, Types.NestedField existingFie String fullName = partnerSchema.findColumnName(existingField.fieldId()); boolean needsOptionalUpdate = field.isOptional() && existingField.isRequired(); - boolean needsTypeUpdate = field.type().isPrimitiveType() && !field.type().equals(existingField.type()); + boolean needsTypeUpdate = + field.type().isPrimitiveType() && !field.type().equals(existingField.type()); boolean needsDocUpdate = field.doc() != null && !field.doc().equals(existingField.doc()); if 
(needsOptionalUpdate) { diff --git a/core/src/main/java/org/apache/iceberg/types/FixupTypes.java b/core/src/main/java/org/apache/iceberg/types/FixupTypes.java index 8cfd9ab0b651..7ae36e488467 100644 --- a/core/src/main/java/org/apache/iceberg/types/FixupTypes.java +++ b/core/src/main/java/org/apache/iceberg/types/FixupTypes.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.types; import java.util.List; diff --git a/core/src/main/java/org/apache/iceberg/util/ArrayUtil.java b/core/src/main/java/org/apache/iceberg/util/ArrayUtil.java index b968571e2055..f8026fe25254 100644 --- a/core/src/main/java/org/apache/iceberg/util/ArrayUtil.java +++ b/core/src/main/java/org/apache/iceberg/util/ArrayUtil.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.util; import java.lang.reflect.Array; @@ -26,8 +25,7 @@ import java.util.stream.LongStream; public class ArrayUtil { - private ArrayUtil() { - } + private ArrayUtil() {} public static final boolean[] EMPTY_BOOLEAN_ARRAY = new boolean[0]; public static final byte[] EMPTY_BYTE_ARRAY = new byte[0]; @@ -71,12 +69,12 @@ public static long[] toLongArray(List longs) { /** * Converts an array of object Booleans to primitives. - *

<p> - * This method returns {@code null} for a {@code null} input array. - * <p> - * This code is borrowed from `org.apache.commons:commons-lang3`. * - * @param array a {@code Boolean} array, may be {@code null} + * <p>This method returns {@code null} for a {@code null} input array. + * <p>This code is borrowed from `org.apache.commons:commons-lang3`. + * + * @param array a {@code Boolean} array, may be {@code null} * @return a {@code boolean} array, {@code null} if null array input * @throws NullPointerException if array content is {@code null} */ @@ -95,12 +93,12 @@ public static boolean[] toPrimitive(final Boolean[] array) { /** * Converts an array of object Bytes to primitives. - *
<p> - * This method returns {@code null} for a {@code null} input array. - * <p> - * This code is borrowed from `org.apache.commons:commons-lang3`. * - * @param array a {@code Byte} array, may be {@code null} + * <p>This method returns {@code null} for a {@code null} input array. + * <p>This code is borrowed from `org.apache.commons:commons-lang3`. + * + * @param array a {@code Byte} array, may be {@code null} * @return a {@code byte} array, {@code null} if null array input * @throws NullPointerException if array content is {@code null} */ @@ -119,12 +117,12 @@ public static byte[] toPrimitive(final Byte[] array) { /** * Converts an array of object Shorts to primitives. - *
<p> - * This method returns {@code null} for a {@code null} input array. - * <p> - * This code is borrowed from `org.apache.commons:commons-lang3`. * - * @param array a {@code Short} array, may be {@code null} + * <p>This method returns {@code null} for a {@code null} input array. + * <p>This code is borrowed from `org.apache.commons:commons-lang3`. + * + * @param array a {@code Short} array, may be {@code null} * @return a {@code byte} array, {@code null} if null array input * @throws NullPointerException if array content is {@code null} */ @@ -143,12 +141,12 @@ public static short[] toPrimitive(final Short[] array) { /** * Converts an array of object Integers to primitives. - *
<p> - * This method returns {@code null} for a {@code null} input array. - * <p> - * This code is borrowed from `org.apache.commons:commons-lang3`. * - * @param array a {@code Integer} array, may be {@code null} + * <p>This method returns {@code null} for a {@code null} input array. + * <p>This code is borrowed from `org.apache.commons:commons-lang3`. + * + * @param array a {@code Integer} array, may be {@code null} * @return an {@code int} array, {@code null} if null array input * @throws NullPointerException if array content is {@code null} */ @@ -167,12 +165,12 @@ public static int[] toPrimitive(final Integer[] array) { /** * Converts an array of object Longs to primitives. - *
<p> - * This method returns {@code null} for a {@code null} input array. - * <p> - * This code is borrowed from `org.apache.commons:commons-lang3`. * - * @param array a {@code Long} array, may be {@code null} + * <p>This method returns {@code null} for a {@code null} input array. + * <p>This code is borrowed from `org.apache.commons:commons-lang3`. + * + * @param array a {@code Long} array, may be {@code null} * @return a {@code long} array, {@code null} if null array input * @throws NullPointerException if array content is {@code null} */ @@ -191,12 +189,12 @@ public static long[] toPrimitive(final Long[] array) { /** * Converts an array of object Floats to primitives. - *
<p> - * This method returns {@code null} for a {@code null} input array. - * <p> - * This code is borrowed from `org.apache.commons:commons-lang3`. * - * @param array a {@code Float} array, may be {@code null} + * <p>This method returns {@code null} for a {@code null} input array. + * <p>This code is borrowed from `org.apache.commons:commons-lang3`. + * + * @param array a {@code Float} array, may be {@code null} * @return a {@code float} array, {@code null} if null array input * @throws NullPointerException if array content is {@code null} */ @@ -215,12 +213,12 @@ public static float[] toPrimitive(final Float[] array) { /** * Converts an array of object Doubles to primitives. - *
<p> - * This method returns {@code null} for a {@code null} input array. - * <p> - * This code is borrowed from `org.apache.commons:commons-lang3`. * - * @param array a {@code Double} array, may be {@code null} + * <p>This method returns {@code null} for a {@code null} input array. + * <p>This code is borrowed from `org.apache.commons:commons-lang3`. + * + * @param array a {@code Double} array, may be {@code null} * @return a {@code double} array, {@code null} if null array input * @throws NullPointerException if array content is {@code null} */ @@ -239,14 +237,13 @@ public static double[] toPrimitive(final Double[] array) { /** * Copies the given array and adds the given element at the end of the new array. - *
<p> - * The new array contains the same elements of the input - * array plus the given element in the last position. The component type of - * the new array is the same as that of the input array. - * <p> - * If the input array is {@code null}, a new one element array is returned - * whose component type is the same as the element, unless the element itself is null, - * in which case the return type is Object[] + * <p>The new array contains the same elements of the input array plus the given element in the + * last position. The component type of the new array is the same as that of the input array. + * <p>If the input array is {@code null}, a new one element array is returned whose component type + * is the same as the element, unless the element itself is null, in which case the return type is + * Object[] * <pre>

    * ArrayUtils.add(null, null)      = IllegalArgumentException
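For readers skimming this reformat-only diff, here is a minimal usage sketch of the toPrimitive helpers whose Javadoc is reflowed in the hunks above. The wrapper class, its main method, and the printed values are illustrative assumptions; only the public static toPrimitive overloads shown in the hunk headers are taken from the diff itself.

import org.apache.iceberg.util.ArrayUtil;

public class ToPrimitiveSketch {
  public static void main(String[] args) {
    // Unbox a Long[] into a new long[] of the same length.
    Long[] boxed = {1L, 2L, 3L};
    long[] unboxed = ArrayUtil.toPrimitive(boxed);

    // Per the Javadoc, a null input array yields null instead of throwing.
    long[] fromNull = ArrayUtil.toPrimitive((Long[]) null);

    System.out.println(unboxed.length + " " + (fromNull == null)); // prints "3 true"
  }
}

Note that, as the Javadoc states, an input array that contains a null element would instead fail with a NullPointerException during unboxing.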
@@ -259,12 +256,11 @@ public static double[] toPrimitive(final Double[] array) {
    * This code is borrowed from `org.apache.commons:commons-lang3`.
    *
    * @param <T> the component type of the array
-   * @param array  the array to "add" the element to, may be {@code null}
-   * @param element  the object to add, may be {@code null}
-   * @return A new array containing the existing elements plus the new element
-   * The returned array type will be that of the input array (unless null),
-   * in which case it will have the same type as the element.
-   * If both are null, an IllegalArgumentException is thrown
+   * @param array the array to "add" the element to, may be {@code null}
+   * @param element the object to add, may be {@code null}
+   * @return A new array containing the existing elements plus the new element The returned array
+   *     type will be that of the input array (unless null), in which case it will have the same
+   *     type as the element. If both are null, an IllegalArgumentException is thrown
    * @since 2.1
    * @throws IllegalArgumentException if both arguments are null
    */
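A companion sketch for the add contract documented just above: the result is a copy of the input with the element appended, and a null input array produces a one-element array of the element's type. Again, the wrapper class and the printed values are illustrative assumptions; ArrayUtil.add is the method shown in the next hunk header.

import org.apache.iceberg.util.ArrayUtil;

public class AddSketch {
  public static void main(String[] args) {
    String[] one = {"a"};

    // Copies the input and appends the element in the last position; "one" is not modified.
    String[] two = ArrayUtil.add(one, "b");

    // A null input array returns a new single-element array whose component type matches the element.
    String[] single = ArrayUtil.add(null, "a");

    System.out.println(one.length + " " + two.length + " " + single.length); // prints "1 2 1"
  }
}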
@@ -284,20 +280,21 @@ public static <T> T[] add(final T[] array, final T element) {
   }
 
   /**
-   * Returns a copy of the given array of size 1 greater than the argument.
-   * The last value of the array is left to the default value.
-   * <p>
- * This code is borrowed from `org.apache.commons:commons-lang3`. + * Returns a copy of the given array of size 1 greater than the argument. The last value of the + * array is left to the default value. + * + * <p>
This code is borrowed from `org.apache.commons:commons-lang3`. * * @param array The array to copy, must not be {@code null}. - * @param newArrayComponentType If {@code array} is {@code null}, create a - * size 1 array of this type. + * @param newArrayComponentType If {@code array} is {@code null}, create a size 1 array of this + * type. * @return A new copy of the array of size 1 greater than the input. */ private static Object copyArrayGrow1(final Object array, final Class newArrayComponentType) { if (array != null) { final int arrayLength = Array.getLength(array); - final Object newArray = Array.newInstance(array.getClass().getComponentType(), arrayLength + 1); + final Object newArray = + Array.newInstance(array.getClass().getComponentType(), arrayLength + 1); System.arraycopy(array, 0, newArray, 0, arrayLength); return newArray; } diff --git a/core/src/main/java/org/apache/iceberg/util/BinPacking.java b/core/src/main/java/org/apache/iceberg/util/BinPacking.java index 59f2a9fc09d5..f3160389ca6c 100644 --- a/core/src/main/java/org/apache/iceberg/util/BinPacking.java +++ b/core/src/main/java/org/apache/iceberg/util/BinPacking.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.util; import java.util.Collection; @@ -45,13 +44,17 @@ public ListPacker(long targetWeight, int lookback, boolean largestBinFirst) { } public List> packEnd(List items, Function weightFunc) { - return Lists.reverse(ImmutableList.copyOf(Iterables.transform( - new PackingIterable<>(Lists.reverse(items), targetWeight, lookback, weightFunc, largestBinFirst), - Lists::reverse))); + return Lists.reverse( + ImmutableList.copyOf( + Iterables.transform( + new PackingIterable<>( + Lists.reverse(items), targetWeight, lookback, weightFunc, largestBinFirst), + Lists::reverse))); } public List> pack(Iterable items, Function weightFunc) { - return ImmutableList.copyOf(new PackingIterable<>(items, targetWeight, lookback, weightFunc, largestBinFirst)); + return ImmutableList.copyOf( + new PackingIterable<>(items, targetWeight, lookback, weightFunc, largestBinFirst)); } } @@ -62,15 +65,19 @@ public static class PackingIterable implements Iterable> { private final Function weightFunc; private final boolean largestBinFirst; - public PackingIterable(Iterable iterable, long targetWeight, int lookback, - Function weightFunc) { + public PackingIterable( + Iterable iterable, long targetWeight, int lookback, Function weightFunc) { this(iterable, targetWeight, lookback, weightFunc, false); } - public PackingIterable(Iterable iterable, long targetWeight, int lookback, - Function weightFunc, boolean largestBinFirst) { - Preconditions.checkArgument(lookback > 0, - "Bin look-back size must be greater than 0: %s", lookback); + public PackingIterable( + Iterable iterable, + long targetWeight, + int lookback, + Function weightFunc, + boolean largestBinFirst) { + Preconditions.checkArgument( + lookback > 0, "Bin look-back size must be greater than 0: %s", lookback); this.iterable = iterable; this.targetWeight = targetWeight; this.lookback = lookback; @@ -80,7 +87,8 @@ public PackingIterable(Iterable iterable, long targetWeight, int lookback, @Override public Iterator> iterator() { - return new PackingIterator<>(iterable.iterator(), targetWeight, lookback, weightFunc, largestBinFirst); + return new PackingIterator<>( + iterable.iterator(), targetWeight, lookback, weightFunc, largestBinFirst); } } @@ -92,8 +100,12 @@ private static class PackingIterator implements Iterator> 
{ private final Function weightFunc; private final boolean largestBinFirst; - private PackingIterator(Iterator items, long targetWeight, int lookback, - Function weightFunc, boolean largestBinFirst) { + private PackingIterator( + Iterator items, + long targetWeight, + int lookback, + Function weightFunc, + boolean largestBinFirst) { this.items = items; this.targetWeight = targetWeight; this.lookback = lookback; diff --git a/core/src/main/java/org/apache/iceberg/util/CopySortOrderFields.java b/core/src/main/java/org/apache/iceberg/util/CopySortOrderFields.java index 1e3036c2e26f..433f30f81386 100644 --- a/core/src/main/java/org/apache/iceberg/util/CopySortOrderFields.java +++ b/core/src/main/java/org/apache/iceberg/util/CopySortOrderFields.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.util; import org.apache.iceberg.NullOrder; @@ -39,13 +38,19 @@ public Void field(String sourceName, int sourceId, SortDirection direction, Null } @Override - public Void bucket(String sourceName, int sourceId, int numBuckets, SortDirection direction, NullOrder nullOrder) { + public Void bucket( + String sourceName, + int sourceId, + int numBuckets, + SortDirection direction, + NullOrder nullOrder) { builder.sortBy(Expressions.bucket(sourceName, numBuckets), direction, nullOrder); return null; } @Override - public Void truncate(String sourceName, int sourceId, int width, SortDirection direction, NullOrder nullOrder) { + public Void truncate( + String sourceName, int sourceId, int width, SortDirection direction, NullOrder nullOrder) { builder.sortBy(Expressions.truncate(sourceName, width), direction, nullOrder); return null; } diff --git a/core/src/main/java/org/apache/iceberg/util/DateTimeUtil.java b/core/src/main/java/org/apache/iceberg/util/DateTimeUtil.java index 041339396d0a..a3d5b219de92 100644 --- a/core/src/main/java/org/apache/iceberg/util/DateTimeUtil.java +++ b/core/src/main/java/org/apache/iceberg/util/DateTimeUtil.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.util; import java.time.Instant; @@ -28,8 +27,7 @@ import java.time.temporal.ChronoUnit; public class DateTimeUtil { - private DateTimeUtil() { - } + private DateTimeUtil() {} public static final OffsetDateTime EPOCH = Instant.ofEpochSecond(0).atOffset(ZoneOffset.UTC); public static final LocalDate EPOCH_DAY = EPOCH.toLocalDate(); diff --git a/core/src/main/java/org/apache/iceberg/util/DecimalUtil.java b/core/src/main/java/org/apache/iceberg/util/DecimalUtil.java index 7dbf2312e0e4..11d76c41ddb8 100644 --- a/core/src/main/java/org/apache/iceberg/util/DecimalUtil.java +++ b/core/src/main/java/org/apache/iceberg/util/DecimalUtil.java @@ -16,24 +16,32 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.util; import java.math.BigDecimal; import org.apache.iceberg.relocated.com.google.common.base.Preconditions; public class DecimalUtil { - private DecimalUtil() { - } + private DecimalUtil() {} /** - * Convert a {@link BigDecimal} to reused fix length bytes, the extra bytes are filled according to the signum. + * Convert a {@link BigDecimal} to reused fix length bytes, the extra bytes are filled according + * to the signum. 
*/ - public static byte[] toReusedFixLengthBytes(int precision, int scale, BigDecimal decimal, byte[] reuseBuf) { - Preconditions.checkArgument(decimal.scale() == scale, - "Cannot write value as decimal(%s,%s), wrong scale: %s", precision, scale, decimal); - Preconditions.checkArgument(decimal.precision() <= precision, - "Cannot write value as decimal(%s,%s), too large: %s", precision, scale, decimal); + public static byte[] toReusedFixLengthBytes( + int precision, int scale, BigDecimal decimal, byte[] reuseBuf) { + Preconditions.checkArgument( + decimal.scale() == scale, + "Cannot write value as decimal(%s,%s), wrong scale: %s", + precision, + scale, + decimal); + Preconditions.checkArgument( + decimal.precision() <= precision, + "Cannot write value as decimal(%s,%s), too large: %s", + precision, + scale, + decimal); byte[] unscaled = decimal.unscaledValue().toByteArray(); if (unscaled.length == reuseBuf.length) { diff --git a/core/src/main/java/org/apache/iceberg/util/EnvironmentUtil.java b/core/src/main/java/org/apache/iceberg/util/EnvironmentUtil.java index d7a48e887638..9b1813ba70e2 100644 --- a/core/src/main/java/org/apache/iceberg/util/EnvironmentUtil.java +++ b/core/src/main/java/org/apache/iceberg/util/EnvironmentUtil.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.util; import java.util.Map; @@ -24,8 +23,7 @@ import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap; public class EnvironmentUtil { - private EnvironmentUtil() { - } + private EnvironmentUtil() {} private static final String ENVIRONMENT_VARIABLE_PREFIX = "env:"; @@ -36,16 +34,17 @@ public static Map resolveAll(Map properties) { @VisibleForTesting static Map resolveAll(Map env, Map properties) { ImmutableMap.Builder builder = ImmutableMap.builder(); - properties.forEach((name, value) -> { - if (value.startsWith(ENVIRONMENT_VARIABLE_PREFIX)) { - String resolved = env.get(value.substring(ENVIRONMENT_VARIABLE_PREFIX.length())); - if (resolved != null) { - builder.put(name, resolved); - } - } else { - builder.put(name, value); - } - }); + properties.forEach( + (name, value) -> { + if (value.startsWith(ENVIRONMENT_VARIABLE_PREFIX)) { + String resolved = env.get(value.substring(ENVIRONMENT_VARIABLE_PREFIX.length())); + if (resolved != null) { + builder.put(name, resolved); + } + } else { + builder.put(name, value); + } + }); return builder.build(); } diff --git a/core/src/main/java/org/apache/iceberg/util/Exceptions.java b/core/src/main/java/org/apache/iceberg/util/Exceptions.java index 51f8d28e780d..bf5bd86bd589 100644 --- a/core/src/main/java/org/apache/iceberg/util/Exceptions.java +++ b/core/src/main/java/org/apache/iceberg/util/Exceptions.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.util; import java.io.Closeable; @@ -24,8 +23,7 @@ import org.apache.iceberg.exceptions.RuntimeIOException; public class Exceptions { - private Exceptions() { - } + private Exceptions() {} public static void close(Closeable closeable, boolean suppressExceptions) { try { @@ -47,7 +45,8 @@ public static E suppressExceptions(E alreadyThrown, Runnab return alreadyThrown; } - public static void suppressAndThrow(E alreadyThrown, Runnable run) throws E { + public static void suppressAndThrow(E alreadyThrown, Runnable run) + throws E { throw suppressExceptions(alreadyThrown, run); } } diff --git a/core/src/main/java/org/apache/iceberg/util/Filter.java b/core/src/main/java/org/apache/iceberg/util/Filter.java index 80533de3b3d4..f9a98c7cd4d2 100644 --- a/core/src/main/java/org/apache/iceberg/util/Filter.java +++ b/core/src/main/java/org/apache/iceberg/util/Filter.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.util; import org.apache.iceberg.io.CloseableIterable; diff --git a/core/src/main/java/org/apache/iceberg/util/JsonUtil.java b/core/src/main/java/org/apache/iceberg/util/JsonUtil.java index 3af718738726..33a548d41df5 100644 --- a/core/src/main/java/org/apache/iceberg/util/JsonUtil.java +++ b/core/src/main/java/org/apache/iceberg/util/JsonUtil.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.util; import com.fasterxml.jackson.core.JsonFactory; @@ -38,8 +37,7 @@ public class JsonUtil { - private JsonUtil() { - } + private JsonUtil() {} private static final JsonFactory FACTORY = new JsonFactory(); private static final ObjectMapper MAPPER = new ObjectMapper(FACTORY); @@ -66,7 +64,7 @@ public interface ToJson { */ public static String generate(ToJson toJson, boolean pretty) { try (StringWriter writer = new StringWriter(); - JsonGenerator generator = JsonUtil.factory().createGenerator(writer)) { + JsonGenerator generator = JsonUtil.factory().createGenerator(writer)) { if (pretty) { generator.useDefaultPrettyPrinter(); } @@ -103,8 +101,11 @@ public static T parse(String json, FromJson parser) { public static int getInt(String property, JsonNode node) { Preconditions.checkArgument(node.has(property), "Cannot parse missing int %s", property); JsonNode pNode = node.get(property); - Preconditions.checkArgument(pNode != null && !pNode.isNull() && pNode.isNumber(), - "Cannot parse %s to an integer value: %s", property, pNode); + Preconditions.checkArgument( + pNode != null && !pNode.isNull() && pNode.isNumber(), + "Cannot parse %s to an integer value: %s", + property, + pNode); return pNode.asInt(); } @@ -113,8 +114,11 @@ public static Integer getIntOrNull(String property, JsonNode node) { return null; } JsonNode pNode = node.get(property); - Preconditions.checkArgument(pNode != null && !pNode.isNull() && pNode.isIntegralNumber() && pNode.canConvertToInt(), - "Cannot parse %s to an integer value: %s", property, pNode); + Preconditions.checkArgument( + pNode != null && !pNode.isNull() && pNode.isIntegralNumber() && pNode.canConvertToInt(), + "Cannot parse %s to an integer value: %s", + property, + pNode); return pNode.asInt(); } @@ -123,32 +127,44 @@ public static Long getLongOrNull(String property, JsonNode node) { return null; } JsonNode pNode = node.get(property); - Preconditions.checkArgument(pNode != null && !pNode.isNull() && pNode.isIntegralNumber() && - pNode.canConvertToLong(), "Cannot parse %s to a 
long value: %s", property, pNode); + Preconditions.checkArgument( + pNode != null && !pNode.isNull() && pNode.isIntegralNumber() && pNode.canConvertToLong(), + "Cannot parse %s to a long value: %s", + property, + pNode); return pNode.asLong(); } public static long getLong(String property, JsonNode node) { Preconditions.checkArgument(node.has(property), "Cannot parse missing long %s", property); JsonNode pNode = node.get(property); - Preconditions.checkArgument(pNode != null && !pNode.isNull() && pNode.isNumber(), - "Cannot parse %s to a long value: %s", property, pNode); + Preconditions.checkArgument( + pNode != null && !pNode.isNull() && pNode.isNumber(), + "Cannot parse %s to a long value: %s", + property, + pNode); return pNode.asLong(); } public static boolean getBool(String property, JsonNode node) { Preconditions.checkArgument(node.has(property), "Cannot parse missing boolean %s", property); JsonNode pNode = node.get(property); - Preconditions.checkArgument(pNode != null && !pNode.isNull() && pNode.isBoolean(), - "Cannot parse %s to a boolean value: %s", property, pNode); + Preconditions.checkArgument( + pNode != null && !pNode.isNull() && pNode.isBoolean(), + "Cannot parse %s to a boolean value: %s", + property, + pNode); return pNode.asBoolean(); } public static String getString(String property, JsonNode node) { Preconditions.checkArgument(node.has(property), "Cannot parse missing string %s", property); JsonNode pNode = node.get(property); - Preconditions.checkArgument(pNode != null && !pNode.isNull() && pNode.isTextual(), - "Cannot parse %s to a string value: %s", property, pNode); + Preconditions.checkArgument( + pNode != null && !pNode.isNull() && pNode.isTextual(), + "Cannot parse %s to a string value: %s", + property, + pNode); return pNode.asText(); } @@ -160,16 +176,22 @@ public static String getStringOrNull(String property, JsonNode node) { if (pNode != null && pNode.isNull()) { return null; } - Preconditions.checkArgument(pNode != null && pNode.isTextual(), - "Cannot parse %s from non-string value: %s", property, pNode); + Preconditions.checkArgument( + pNode != null && pNode.isTextual(), + "Cannot parse %s from non-string value: %s", + property, + pNode); return pNode.asText(); } public static Map getStringMap(String property, JsonNode node) { Preconditions.checkArgument(node.has(property), "Cannot parse missing map %s", property); JsonNode pNode = node.get(property); - Preconditions.checkArgument(pNode != null && !pNode.isNull() && pNode.isObject(), - "Cannot parse %s from non-object value: %s", property, pNode); + Preconditions.checkArgument( + pNode != null && !pNode.isNull() && pNode.isObject(), + "Cannot parse %s from non-object value: %s", + property, + pNode); ImmutableMap.Builder builder = ImmutableMap.builder(); Iterator fields = pNode.fieldNames(); @@ -181,8 +203,10 @@ public static Map getStringMap(String property, JsonNode node) { } public static String[] getStringArray(JsonNode node) { - Preconditions.checkArgument(node != null && !node.isNull() && node.isArray(), - "Cannot parse string array from non-array: %s", node); + Preconditions.checkArgument( + node != null && !node.isNull() && node.isArray(), + "Cannot parse string array from non-array: %s", + node); ArrayNode arrayNode = (ArrayNode) node; String[] arr = new String[arrayNode.size()]; for (int i = 0; i < arr.length; i++) { @@ -241,20 +265,18 @@ public static Set getLongSetOrNull(String property, JsonNode node) { return null; } - return ImmutableSet.builder() - .addAll(new 
JsonLongArrayIterator(property, node)) - .build(); + return ImmutableSet.builder().addAll(new JsonLongArrayIterator(property, node)).build(); } - public static void writeIntegerFieldIf(boolean condition, String key, Integer value, JsonGenerator generator) - throws IOException { + public static void writeIntegerFieldIf( + boolean condition, String key, Integer value, JsonGenerator generator) throws IOException { if (condition) { generator.writeNumberField(key, value); } } - public static void writeLongFieldIf(boolean condition, String key, Long value, JsonGenerator generator) - throws IOException { + public static void writeLongFieldIf( + boolean condition, String key, Long value, JsonGenerator generator) throws IOException { if (condition) { generator.writeNumberField(key, value); } @@ -266,8 +288,11 @@ abstract static class JsonArrayIterator implements Iterator { JsonArrayIterator(String property, JsonNode node) { JsonNode pNode = node.get(property); - Preconditions.checkArgument(pNode != null && !pNode.isNull() && pNode.isArray(), - "Cannot parse %s from non-array value: %s", property, pNode); + Preconditions.checkArgument( + pNode != null && !pNode.isNull() && pNode.isArray(), + "Cannot parse %s from non-array value: %s", + property, + pNode); this.elements = pNode.elements(); } @@ -301,7 +326,8 @@ String convert(JsonNode element) { @Override void validate(JsonNode element) { - Preconditions.checkArgument(element.isTextual(), "Cannot parse string from non-text value: %s", element); + Preconditions.checkArgument( + element.isTextual(), "Cannot parse string from non-text value: %s", element); } } @@ -318,7 +344,8 @@ Integer convert(JsonNode element) { @Override void validate(JsonNode element) { - Preconditions.checkArgument(element.isInt(), "Cannot parse integer from non-int value: %s", element); + Preconditions.checkArgument( + element.isInt(), "Cannot parse integer from non-int value: %s", element); } } @@ -335,8 +362,10 @@ Long convert(JsonNode element) { @Override void validate(JsonNode element) { - Preconditions.checkArgument(element.isIntegralNumber() && element.canConvertToLong(), - "Cannot parse long from non-long value: %s", element); + Preconditions.checkArgument( + element.isIntegralNumber() && element.canConvertToLong(), + "Cannot parse long from non-long value: %s", + element); } } } diff --git a/core/src/main/java/org/apache/iceberg/util/LocationUtil.java b/core/src/main/java/org/apache/iceberg/util/LocationUtil.java index 42c26524f28f..8eca0051ccb9 100644 --- a/core/src/main/java/org/apache/iceberg/util/LocationUtil.java +++ b/core/src/main/java/org/apache/iceberg/util/LocationUtil.java @@ -16,18 +16,16 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.util; import org.apache.iceberg.relocated.com.google.common.base.Preconditions; public class LocationUtil { - private LocationUtil() { - - } + private LocationUtil() {} public static String stripTrailingSlash(String path) { - Preconditions.checkArgument(path != null && path.length() > 0, "path must not be null or empty"); + Preconditions.checkArgument( + path != null && path.length() > 0, "path must not be null or empty"); String result = path; while (result.endsWith("/")) { diff --git a/core/src/main/java/org/apache/iceberg/util/LockManagers.java b/core/src/main/java/org/apache/iceberg/util/LockManagers.java index a366db517db9..ed141bdf7ba1 100644 --- a/core/src/main/java/org/apache/iceberg/util/LockManagers.java +++ b/core/src/main/java/org/apache/iceberg/util/LockManagers.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.util; import java.util.Map; @@ -37,10 +36,10 @@ public class LockManagers { - private static final LockManager LOCK_MANAGER_DEFAULT = new InMemoryLockManager(Maps.newHashMap()); + private static final LockManager LOCK_MANAGER_DEFAULT = + new InMemoryLockManager(Maps.newHashMap()); - private LockManagers() { - } + private LockManagers() {} public static LockManager defaultLockManager() { return LOCK_MANAGER_DEFAULT; @@ -59,8 +58,8 @@ private static LockManager loadLockManager(String impl, Map prop try { ctor = DynConstructors.builder(LockManager.class).hiddenImpl(impl).buildChecked(); } catch (NoSuchMethodException e) { - throw new IllegalArgumentException(String.format( - "Cannot initialize LockManager, missing no-arg constructor: %s", impl), e); + throw new IllegalArgumentException( + String.format("Cannot initialize LockManager, missing no-arg constructor: %s", impl), e); } LockManager lockManager; @@ -68,7 +67,8 @@ private static LockManager loadLockManager(String impl, Map prop lockManager = ctor.newInstance(); } catch (ClassCastException e) { throw new IllegalArgumentException( - String.format("Cannot initialize LockManager, %s does not implement LockManager.", impl), e); + String.format("Cannot initialize LockManager, %s does not implement LockManager.", impl), + e); } lockManager.initialize(properties); @@ -109,13 +109,15 @@ public ScheduledExecutorService scheduler() { if (scheduler == null) { synchronized (BaseLockManager.class) { if (scheduler == null) { - scheduler = MoreExecutors.getExitingScheduledExecutorService( - (ScheduledThreadPoolExecutor) Executors.newScheduledThreadPool( - heartbeatThreads(), - new ThreadFactoryBuilder() - .setDaemon(true) - .setNameFormat("iceberg-lock-manager-%d") - .build())); + scheduler = + MoreExecutors.getExitingScheduledExecutorService( + (ScheduledThreadPoolExecutor) + Executors.newScheduledThreadPool( + heartbeatThreads(), + new ThreadFactoryBuilder() + .setDaemon(true) + .setNameFormat("iceberg-lock-manager-%d") + .build())); } } } @@ -125,23 +127,38 @@ public ScheduledExecutorService scheduler() { @Override public void initialize(Map properties) { - this.acquireTimeoutMs = PropertyUtil.propertyAsLong(properties, - CatalogProperties.LOCK_ACQUIRE_TIMEOUT_MS, CatalogProperties.LOCK_ACQUIRE_TIMEOUT_MS_DEFAULT); - this.acquireIntervalMs = PropertyUtil.propertyAsLong(properties, - CatalogProperties.LOCK_ACQUIRE_INTERVAL_MS, CatalogProperties.LOCK_ACQUIRE_INTERVAL_MS_DEFAULT); - this.heartbeatIntervalMs = PropertyUtil.propertyAsLong(properties, - CatalogProperties.LOCK_HEARTBEAT_INTERVAL_MS, 
CatalogProperties.LOCK_HEARTBEAT_INTERVAL_MS_DEFAULT); - this.heartbeatTimeoutMs = PropertyUtil.propertyAsLong(properties, - CatalogProperties.LOCK_HEARTBEAT_TIMEOUT_MS, CatalogProperties.LOCK_HEARTBEAT_TIMEOUT_MS_DEFAULT); - this.heartbeatThreads = PropertyUtil.propertyAsInt(properties, - CatalogProperties.LOCK_HEARTBEAT_THREADS, CatalogProperties.LOCK_HEARTBEAT_THREADS_DEFAULT); + this.acquireTimeoutMs = + PropertyUtil.propertyAsLong( + properties, + CatalogProperties.LOCK_ACQUIRE_TIMEOUT_MS, + CatalogProperties.LOCK_ACQUIRE_TIMEOUT_MS_DEFAULT); + this.acquireIntervalMs = + PropertyUtil.propertyAsLong( + properties, + CatalogProperties.LOCK_ACQUIRE_INTERVAL_MS, + CatalogProperties.LOCK_ACQUIRE_INTERVAL_MS_DEFAULT); + this.heartbeatIntervalMs = + PropertyUtil.propertyAsLong( + properties, + CatalogProperties.LOCK_HEARTBEAT_INTERVAL_MS, + CatalogProperties.LOCK_HEARTBEAT_INTERVAL_MS_DEFAULT); + this.heartbeatTimeoutMs = + PropertyUtil.propertyAsLong( + properties, + CatalogProperties.LOCK_HEARTBEAT_TIMEOUT_MS, + CatalogProperties.LOCK_HEARTBEAT_TIMEOUT_MS_DEFAULT); + this.heartbeatThreads = + PropertyUtil.propertyAsInt( + properties, + CatalogProperties.LOCK_HEARTBEAT_THREADS, + CatalogProperties.LOCK_HEARTBEAT_THREADS_DEFAULT); } } /** - * Implementation of {@link LockManager} that uses an in-memory concurrent map for locking. - * This implementation should only be used for testing, - * or if the caller only needs locking within the same JVM during table commits. + * Implementation of {@link LockManager} that uses an in-memory concurrent map for locking. This + * implementation should only be used for testing, or if the caller only needs locking within the + * same JVM during table commits. */ static class InMemoryLockManager extends BaseLockManager { @@ -158,15 +175,17 @@ static class InMemoryLockManager extends BaseLockManager { void acquireOnce(String entityId, String ownerId) { InMemoryLockContent content = LOCKS.get(entityId); if (content != null && content.expireMs() > System.currentTimeMillis()) { - throw new IllegalStateException(String.format("Lock for %s currently held by %s, expiration: %s", - entityId, content.ownerId(), content.expireMs())); + throw new IllegalStateException( + String.format( + "Lock for %s currently held by %s, expiration: %s", + entityId, content.ownerId(), content.expireMs())); } long expiration = System.currentTimeMillis() + heartbeatTimeoutMs(); boolean succeed; if (content == null) { - InMemoryLockContent previous = LOCKS.putIfAbsent( - entityId, new InMemoryLockContent(ownerId, expiration)); + InMemoryLockContent previous = + LOCKS.putIfAbsent(entityId, new InMemoryLockContent(ownerId, expiration)); succeed = previous == null; } else { succeed = LOCKS.replace(entityId, content, new InMemoryLockContent(ownerId, expiration)); @@ -178,16 +197,24 @@ void acquireOnce(String entityId, String ownerId) { HEARTBEATS.remove(entityId).cancel(false); } - HEARTBEATS.put(entityId, scheduler().scheduleAtFixedRate(() -> { - InMemoryLockContent lastContent = LOCKS.get(entityId); - try { - long newExpiration = System.currentTimeMillis() + heartbeatTimeoutMs(); - LOCKS.replace(entityId, lastContent, new InMemoryLockContent(ownerId, newExpiration)); - } catch (NullPointerException e) { - throw new RuntimeException("Cannot heartbeat to a deleted lock " + entityId, e); - } - - }, 0, heartbeatIntervalMs(), TimeUnit.MILLISECONDS)); + HEARTBEATS.put( + entityId, + scheduler() + .scheduleAtFixedRate( + () -> { + InMemoryLockContent lastContent = LOCKS.get(entityId); + try { + 
long newExpiration = System.currentTimeMillis() + heartbeatTimeoutMs(); + LOCKS.replace( + entityId, lastContent, new InMemoryLockContent(ownerId, newExpiration)); + } catch (NullPointerException e) { + throw new RuntimeException( + "Cannot heartbeat to a deleted lock " + entityId, e); + } + }, + 0, + heartbeatIntervalMs(), + TimeUnit.MILLISECONDS)); } else { throw new IllegalStateException("Unable to acquire lock " + entityId); @@ -218,7 +245,11 @@ public boolean release(String entityId, String ownerId) { } if (!currentContent.ownerId().equals(ownerId)) { - LOG.error("Cannot unlock {} by {}, current owner: {}", entityId, ownerId, currentContent.ownerId()); + LOG.error( + "Cannot unlock {} by {}, current owner: {}", + entityId, + ownerId, + currentContent.ownerId()); return false; } @@ -251,6 +282,5 @@ public long expireMs() { public String ownerId() { return ownerId; } - } } diff --git a/core/src/main/java/org/apache/iceberg/util/ManifestFileUtil.java b/core/src/main/java/org/apache/iceberg/util/ManifestFileUtil.java index c7f9b0b044b5..a73a00d0e6fd 100644 --- a/core/src/main/java/org/apache/iceberg/util/ManifestFileUtil.java +++ b/core/src/main/java/org/apache/iceberg/util/ManifestFileUtil.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.util; import java.util.Comparator; @@ -33,8 +32,7 @@ import org.apache.iceberg.types.Types; public class ManifestFileUtil { - private ManifestFileUtil() { - } + private ManifestFileUtil() {} private static class FieldSummary { private final Comparator comparator; @@ -103,9 +101,10 @@ private static boolean canContain(List> summaries, StructLike st return true; } - public static boolean canContainAny(ManifestFile manifest, - Iterable partitions, - Function specLookup) { + public static boolean canContainAny( + ManifestFile manifest, + Iterable partitions, + Function specLookup) { if (manifest.partitions() == null) { return true; } @@ -121,9 +120,10 @@ public static boolean canContainAny(ManifestFile manifest, return false; } - public static boolean canContainAny(ManifestFile manifest, - Iterable> partitions, - Map specsById) { + public static boolean canContainAny( + ManifestFile manifest, + Iterable> partitions, + Map specsById) { if (manifest.partitions() == null) { return true; } @@ -131,7 +131,8 @@ public static boolean canContainAny(ManifestFile manifest, List> summaries = summaries(manifest, specsById::get); for (Pair partition : partitions) { - if (partition.first() == manifest.partitionSpecId() && canContain(summaries, partition.second())) { + if (partition.first() == manifest.partitionSpecId() + && canContain(summaries, partition.second())) { return true; } } @@ -139,7 +140,8 @@ public static boolean canContainAny(ManifestFile manifest, return false; } - private static List> summaries(ManifestFile manifest, Function specLookup) { + private static List> summaries( + ManifestFile manifest, Function specLookup) { Types.StructType partitionType = specLookup.apply(manifest.partitionSpecId()).partitionType(); List fieldSummaries = manifest.partitions(); List fields = partitionType.fields(); diff --git a/core/src/main/java/org/apache/iceberg/util/Pair.java b/core/src/main/java/org/apache/iceberg/util/Pair.java index 46d5010d8a75..89b697ad4f7a 100644 --- a/core/src/main/java/org/apache/iceberg/util/Pair.java +++ b/core/src/main/java/org/apache/iceberg/util/Pair.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.util; import com.github.benmanes.caffeine.cache.CacheLoader; @@ -35,28 +34,31 @@ public static Pair of(X first, Y second) { return new Pair<>(first, second); } - private static final LoadingCache, Class>, Schema> SCHEMA_CACHE = Caffeine - .newBuilder() - .build(new CacheLoader, Class>, Schema>() { - @Override - @SuppressWarnings("deprecation") - public Schema load(Pair, Class> key) { - Schema xSchema = ReflectData.get().getSchema(key.first); - Schema ySchema = ReflectData.get().getSchema(key.second); - return Schema.createRecord("pair", null, null, false, Lists.newArrayList( - new Schema.Field("x", xSchema, null, (Object) null), - new Schema.Field("y", ySchema, null, (Object) null) - )); - } - }); + private static final LoadingCache, Class>, Schema> SCHEMA_CACHE = + Caffeine.newBuilder() + .build( + new CacheLoader, Class>, Schema>() { + @Override + @SuppressWarnings("deprecation") + public Schema load(Pair, Class> key) { + Schema xSchema = ReflectData.get().getSchema(key.first); + Schema ySchema = ReflectData.get().getSchema(key.second); + return Schema.createRecord( + "pair", + null, + null, + false, + Lists.newArrayList( + new Schema.Field("x", xSchema, null, (Object) null), + new Schema.Field("y", ySchema, null, (Object) null))); + } + }); private Schema schema = null; private X first; private Y second; - /** - * Constructor used by Avro - */ + /** Constructor used by Avro */ private Pair(Schema schema) { this.schema = schema; } diff --git a/core/src/main/java/org/apache/iceberg/util/ParallelIterable.java b/core/src/main/java/org/apache/iceberg/util/ParallelIterable.java index 3afc1c932a84..1e383021c620 100644 --- a/core/src/main/java/org/apache/iceberg/util/ParallelIterable.java +++ b/core/src/main/java/org/apache/iceberg/util/ParallelIterable.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.util; import java.io.Closeable; @@ -38,8 +37,7 @@ public class ParallelIterable extends CloseableGroup implements CloseableIter private final Iterable> iterables; private final ExecutorService workerPool; - public ParallelIterable(Iterable> iterables, - ExecutorService workerPool) { + public ParallelIterable(Iterable> iterables, ExecutorService workerPool) { this.iterables = iterables; this.workerPool = workerPool; } @@ -58,19 +56,24 @@ private static class ParallelIterator implements CloseableIterator { private final ConcurrentLinkedQueue queue = new ConcurrentLinkedQueue<>(); private boolean closed = false; - private ParallelIterator(Iterable> iterables, - ExecutorService workerPool) { - this.tasks = Iterables.transform(iterables, iterable -> - (Runnable) () -> { - try (Closeable ignored = (iterable instanceof Closeable) ? - (Closeable) iterable : () -> { }) { - for (T item : iterable) { - queue.add(item); - } - } catch (IOException e) { - throw new RuntimeIOException(e, "Failed to close iterable"); - } - }).iterator(); + private ParallelIterator( + Iterable> iterables, ExecutorService workerPool) { + this.tasks = + Iterables.transform( + iterables, + iterable -> + (Runnable) + () -> { + try (Closeable ignored = + (iterable instanceof Closeable) ? 
(Closeable) iterable : () -> {}) { + for (T item : iterable) { + queue.add(item); + } + } catch (IOException e) { + throw new RuntimeIOException(e, "Failed to close iterable"); + } + }) + .iterator(); this.workerPool = workerPool; // submit 2 tasks per worker at a time this.taskFutures = new Future[2 * ThreadPools.WORKER_THREAD_POOL_SIZE]; @@ -89,8 +92,8 @@ public void close() { /** * Checks on running tasks and submits new tasks if needed. - *

<p> - * This should not be called after {@link #close()}. + * <p>
This should not be called after {@link #close()}. * * @return true if there are pending tasks, false otherwise */ diff --git a/core/src/main/java/org/apache/iceberg/util/PartitionSet.java b/core/src/main/java/org/apache/iceberg/util/PartitionSet.java index f65f01c3c831..fdd81ac3d005 100644 --- a/core/src/main/java/org/apache/iceberg/util/PartitionSet.java +++ b/core/src/main/java/org/apache/iceberg/util/PartitionSet.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.util; import java.util.Collection; @@ -87,8 +86,9 @@ public boolean add(Pair pair) { } public boolean add(int specId, StructLike struct) { - Set partitionSet = partitionSetById.computeIfAbsent(specId, - id -> StructLikeSet.create(partitionTypeById.get(id))); + Set partitionSet = + partitionSetById.computeIfAbsent( + specId, id -> StructLikeSet.create(partitionTypeById.get(id))); return partitionSet.add(struct); } @@ -116,8 +116,12 @@ public boolean remove(int specId, StructLike struct) { @Override public Iterator> iterator() { - Iterable>> setsAsPairs = Iterables.transform(partitionSetById.entrySet(), - idAndSet -> Iterables.transform(idAndSet.getValue(), struct -> Pair.of(idAndSet.getKey(), struct))); + Iterable>> setsAsPairs = + Iterables.transform( + partitionSetById.entrySet(), + idAndSet -> + Iterables.transform( + idAndSet.getValue(), struct -> Pair.of(idAndSet.getKey(), struct))); return Iterables.concat(setsAsPairs).iterator(); } diff --git a/core/src/main/java/org/apache/iceberg/util/PartitionUtil.java b/core/src/main/java/org/apache/iceberg/util/PartitionUtil.java index fafe18eb62b0..af2f79c3c6f9 100644 --- a/core/src/main/java/org/apache/iceberg/util/PartitionUtil.java +++ b/core/src/main/java/org/apache/iceberg/util/PartitionUtil.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.util; import java.util.List; @@ -32,20 +31,21 @@ import org.apache.iceberg.types.Types; public class PartitionUtil { - private PartitionUtil() { - } + private PartitionUtil() {} public static Map constantsMap(ContentScanTask task) { return constantsMap(task, null, (type, constant) -> constant); } - public static Map constantsMap(ContentScanTask task, - BiFunction convertConstant) { + public static Map constantsMap( + ContentScanTask task, BiFunction convertConstant) { return constantsMap(task, null, convertConstant); } - public static Map constantsMap(ContentScanTask task, Types.StructType partitionType, - BiFunction convertConstant) { + public static Map constantsMap( + ContentScanTask task, + Types.StructType partitionType, + BiFunction convertConstant) { PartitionSpec spec = task.spec(); StructLike partitionData = task.file().partition(); @@ -66,7 +66,9 @@ private PartitionUtil() { if (partitionType != null) { if (partitionType.fields().size() > 0) { StructLike coercedPartition = coercePartition(partitionType, spec, partitionData); - idToConstant.put(MetadataColumns.PARTITION_COLUMN_ID, convertConstant.apply(partitionType, coercedPartition)); + idToConstant.put( + MetadataColumns.PARTITION_COLUMN_ID, + convertConstant.apply(partitionType, coercedPartition)); } else { // use null as some query engines may not be able to handle empty structs idToConstant.put(MetadataColumns.PARTITION_COLUMN_ID, null); @@ -78,7 +80,9 @@ private PartitionUtil() { for (int pos = 0; pos < fields.size(); pos += 1) { PartitionField field = fields.get(pos); if (field.transform().isIdentity()) { - Object converted = convertConstant.apply(partitionFields.get(pos).type(), partitionData.get(pos, Object.class)); + Object converted = + convertConstant.apply( + partitionFields.get(pos).type(), partitionData.get(pos, Object.class)); idToConstant.put(field.sourceId(), converted); } } @@ -87,8 +91,10 @@ private PartitionUtil() { } // adapts the provided partition data to match the table partition type - private static StructLike coercePartition(Types.StructType partitionType, PartitionSpec spec, StructLike partition) { - StructProjection projection = StructProjection.createAllowMissing(spec.partitionType(), partitionType); + private static StructLike coercePartition( + Types.StructType partitionType, PartitionSpec spec, StructLike partition) { + StructProjection projection = + StructProjection.createAllowMissing(spec.partitionType(), partitionType); projection.wrap(partition); return projection; } diff --git a/core/src/main/java/org/apache/iceberg/util/PropertyUtil.java b/core/src/main/java/org/apache/iceberg/util/PropertyUtil.java index 1028bc5e05c1..62676617abe3 100644 --- a/core/src/main/java/org/apache/iceberg/util/PropertyUtil.java +++ b/core/src/main/java/org/apache/iceberg/util/PropertyUtil.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.util; import java.util.Map; @@ -26,11 +25,10 @@ public class PropertyUtil { - private PropertyUtil() { - } + private PropertyUtil() {} - public static boolean propertyAsBoolean(Map properties, - String property, boolean defaultValue) { + public static boolean propertyAsBoolean( + Map properties, String property, boolean defaultValue) { String value = properties.get(property); if (value != null) { return Boolean.parseBoolean(value); @@ -38,8 +36,8 @@ public static boolean propertyAsBoolean(Map properties, return defaultValue; } - public static double propertyAsDouble(Map properties, - String property, double defaultValue) { + public static double propertyAsDouble( + Map properties, String property, double defaultValue) { String value = properties.get(property); if (value != null) { return Double.parseDouble(value); @@ -47,8 +45,8 @@ public static double propertyAsDouble(Map properties, return defaultValue; } - public static int propertyAsInt(Map properties, - String property, int defaultValue) { + public static int propertyAsInt( + Map properties, String property, int defaultValue) { String value = properties.get(property); if (value != null) { return Integer.parseInt(value); @@ -56,8 +54,8 @@ public static int propertyAsInt(Map properties, return defaultValue; } - public static long propertyAsLong(Map properties, - String property, long defaultValue) { + public static long propertyAsLong( + Map properties, String property, long defaultValue) { String value = properties.get(property); if (value != null) { return Long.parseLong(value); @@ -65,8 +63,8 @@ public static long propertyAsLong(Map properties, return defaultValue; } - public static String propertyAsString(Map properties, - String property, String defaultValue) { + public static String propertyAsString( + Map properties, String property, String defaultValue) { String value = properties.get(property); if (value != null) { return value; @@ -75,8 +73,8 @@ public static String propertyAsString(Map properties, } /** - * Returns subset of provided map with keys matching the provided prefix. Matching is case-sensitive and the matching - * prefix is removed from the keys in returned map. + * Returns subset of provided map with keys matching the provided prefix. Matching is + * case-sensitive and the matching prefix is removed from the keys in returned map. * * @param properties input map * @param prefix prefix to choose keys from input map @@ -92,7 +90,6 @@ public static Map propertiesWithPrefix( return properties.entrySet().stream() .filter(e -> e.getKey().startsWith(prefix)) - .collect(Collectors.toMap( - e -> e.getKey().replaceFirst(prefix, ""), Map.Entry::getValue)); + .collect(Collectors.toMap(e -> e.getKey().replaceFirst(prefix, ""), Map.Entry::getValue)); } } diff --git a/core/src/main/java/org/apache/iceberg/util/SerializableMap.java b/core/src/main/java/org/apache/iceberg/util/SerializableMap.java index dd4a5a4197d6..137f915c3a2c 100644 --- a/core/src/main/java/org/apache/iceberg/util/SerializableMap.java +++ b/core/src/main/java/org/apache/iceberg/util/SerializableMap.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.util; import java.io.Serializable; diff --git a/core/src/main/java/org/apache/iceberg/util/SerializableSupplier.java b/core/src/main/java/org/apache/iceberg/util/SerializableSupplier.java index 9ff3fa388f0a..e33f98134a87 100644 --- a/core/src/main/java/org/apache/iceberg/util/SerializableSupplier.java +++ b/core/src/main/java/org/apache/iceberg/util/SerializableSupplier.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.util; import java.io.Serializable; diff --git a/core/src/main/java/org/apache/iceberg/util/SerializationUtil.java b/core/src/main/java/org/apache/iceberg/util/SerializationUtil.java index f7ccd7a8145d..ddba0b0bbea4 100644 --- a/core/src/main/java/org/apache/iceberg/util/SerializationUtil.java +++ b/core/src/main/java/org/apache/iceberg/util/SerializationUtil.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.util; import java.io.ByteArrayInputStream; @@ -34,12 +33,12 @@ public class SerializationUtil { - private SerializationUtil() { - } + private SerializationUtil() {} /** - * Serialize an object to bytes. If the object implements {@link HadoopConfigurable}, its Hadoop configuration will - * be serialized into a {@link SerializableConfiguration}. + * Serialize an object to bytes. If the object implements {@link HadoopConfigurable}, its Hadoop + * configuration will be serialized into a {@link SerializableConfiguration}. + * * @param obj object to serialize * @return serialized bytes */ @@ -48,20 +47,21 @@ public static byte[] serializeToBytes(Object obj) { } /** - * Serialize an object to bytes. If the object implements {@link HadoopConfigurable}, the confSerializer will be used - * to serialize Hadoop configuration used by the object. + * Serialize an object to bytes. If the object implements {@link HadoopConfigurable}, the + * confSerializer will be used to serialize Hadoop configuration used by the object. + * * @param obj object to serialize * @param confSerializer serializer for the Hadoop configuration * @return serialized bytes */ - public static byte[] serializeToBytes(Object obj, - Function> confSerializer) { + public static byte[] serializeToBytes( + Object obj, Function> confSerializer) { if (obj instanceof HadoopConfigurable) { ((HadoopConfigurable) obj).serializeConfWith(confSerializer); } try (ByteArrayOutputStream baos = new ByteArrayOutputStream(); - ObjectOutputStream oos = new ObjectOutputStream(baos)) { + ObjectOutputStream oos = new ObjectOutputStream(baos)) { oos.writeObject(obj); return baos.toByteArray(); } catch (IOException e) { @@ -76,7 +76,7 @@ public static T deserializeFromBytes(byte[] bytes) { } try (ByteArrayInputStream bais = new ByteArrayInputStream(bytes); - ObjectInputStream ois = new ObjectInputStream(bais)) { + ObjectInputStream ois = new ObjectInputStream(bais)) { return (T) ois.readObject(); } catch (IOException e) { throw new UncheckedIOException("Failed to deserialize object", e); diff --git a/core/src/main/java/org/apache/iceberg/util/SnapshotUtil.java b/core/src/main/java/org/apache/iceberg/util/SnapshotUtil.java index b60af75f79c0..0c33ea878f75 100644 --- a/core/src/main/java/org/apache/iceberg/util/SnapshotUtil.java +++ b/core/src/main/java/org/apache/iceberg/util/SnapshotUtil.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.util; import java.util.Iterator; @@ -37,12 +36,9 @@ import org.apache.iceberg.relocated.com.google.common.collect.Lists; public class SnapshotUtil { - private SnapshotUtil() { - } + private SnapshotUtil() {} - /** - * Returns whether ancestorSnapshotId is an ancestor of snapshotId. - */ + /** Returns whether ancestorSnapshotId is an ancestor of snapshotId. */ public static boolean isAncestorOf(Table table, long snapshotId, long ancestorSnapshotId) { for (Snapshot snapshot : ancestorsOf(snapshotId, table::snapshot)) { if (snapshot.snapshotId() == ancestorSnapshotId) { @@ -54,9 +50,11 @@ public static boolean isAncestorOf(Table table, long snapshotId, long ancestorSn } /** - * Returns whether ancestorSnapshotId is an ancestor of snapshotId using the given lookup function. + * Returns whether ancestorSnapshotId is an ancestor of snapshotId using the given lookup + * function. */ - public static boolean isAncestorOf(long snapshotId, long ancestorSnapshotId, Function lookup) { + public static boolean isAncestorOf( + long snapshotId, long ancestorSnapshotId, Function lookup) { for (Snapshot snapshot : ancestorsOf(snapshotId, lookup)) { if (snapshot.snapshotId() == ancestorSnapshotId) { return true; @@ -65,17 +63,14 @@ public static boolean isAncestorOf(long snapshotId, long ancestorSnapshotId, Fun return false; } - /** - * Returns whether ancestorSnapshotId is an ancestor of the table's current state. - */ + /** Returns whether ancestorSnapshotId is an ancestor of the table's current state. */ public static boolean isAncestorOf(Table table, long ancestorSnapshotId) { return isAncestorOf(table, table.currentSnapshot().snapshotId(), ancestorSnapshotId); } - /** - * Returns whether some ancestor of snapshotId has parentId matches ancestorParentSnapshotId - */ - public static boolean isParentAncestorOf(Table table, long snapshotId, long ancestorParentSnapshotId) { + /** Returns whether some ancestor of snapshotId has parentId matches ancestorParentSnapshotId */ + public static boolean isParentAncestorOf( + Table table, long snapshotId, long ancestorParentSnapshotId) { for (Snapshot snapshot : ancestorsOf(snapshotId, table::snapshot)) { if (snapshot.parentId() == ancestorParentSnapshotId) { return true; @@ -86,7 +81,8 @@ public static boolean isParentAncestorOf(Table table, long snapshotId, long ance } /** - * Returns an iterable that traverses the table's snapshots from the current to the last known ancestor. + * Returns an iterable that traverses the table's snapshots from the current to the last known + * ancestor. * * @param table a Table * @return an iterable from the table's current snapshot to its last known ancestor @@ -97,9 +93,9 @@ public static Iterable currentAncestors(Table table) { /** * Return the snapshot IDs for the ancestors of the current table state. - *
<p>
- * Ancestor IDs are ordered by commit time, descending. The first ID is the current snapshot, followed by its parent, - * and so on. + * + *
<p>
Ancestor IDs are ordered by commit time, descending. The first ID is the current snapshot, + * followed by its parent, and so on. * * @param table a {@link Table} * @return a set of snapshot IDs of the known ancestor snapshots, including the current ID @@ -110,6 +106,7 @@ public static List currentAncestorIds(Table table) { /** * Traverses the history of the table's current snapshot and finds the oldest Snapshot. + * * @return null if there is no current snapshot in the table, else the oldest Snapshot. */ public static Snapshot oldestAncestor(Table table) { @@ -123,10 +120,10 @@ public static Snapshot oldestAncestor(Table table) { /** * Traverses the history and finds the oldest ancestor of the specified snapshot. - *
<p>
- * Oldest ancestor is defined as the ancestor snapshot whose parent is null or has been expired. - * If the specified snapshot has no parent or parent has been expired, - * the specified snapshot itself is returned. + * + *
<p>
Oldest ancestor is defined as the ancestor snapshot whose parent is null or has been + * expired. If the specified snapshot has no parent or parent has been expired, the specified + * snapshot itself is returned. * * @param snapshotId the ID of the snapshot to find the oldest ancestor * @param lookup lookup function from snapshot ID to snapshot @@ -148,11 +145,13 @@ public static Iterable ancestorsOf(long snapshotId, Function - * This method assumes that fromSnapshotId is an ancestor of toSnapshotId. + * + *
<p>
This method assumes that fromSnapshotId is an ancestor of toSnapshotId. */ public static List snapshotIdsBetween(Table table, long fromSnapshotId, long toSnapshotId) { - List snapshotIds = Lists.newArrayList(ancestorIds(table.snapshot(toSnapshotId), - snapshotId -> snapshotId != fromSnapshotId ? table.snapshot(snapshotId) : null)); + List snapshotIds = + Lists.newArrayList( + ancestorIds( + table.snapshot(toSnapshotId), + snapshotId -> snapshotId != fromSnapshotId ? table.snapshot(snapshotId) : null)); return snapshotIds; } - public static Iterable ancestorIdsBetween(long latestSnapshotId, Long oldestSnapshotId, - Function lookup) { + public static Iterable ancestorIdsBetween( + long latestSnapshotId, Long oldestSnapshotId, Function lookup) { return toIds(ancestorsBetween(latestSnapshotId, oldestSnapshotId, lookup)); } - public static Iterable ancestorsBetween(long latestSnapshotId, Long oldestSnapshotId, - Function lookup) { + public static Iterable ancestorsBetween( + long latestSnapshotId, Long oldestSnapshotId, Function lookup) { if (oldestSnapshotId != null) { if (latestSnapshotId == oldestSnapshotId) { return ImmutableList.of(); } - return ancestorsOf(latestSnapshotId, + return ancestorsOf( + latestSnapshotId, snapshotId -> !oldestSnapshotId.equals(snapshotId) ? lookup.apply(snapshotId) : null); } else { return ancestorsOf(latestSnapshotId, lookup); } } - private static Iterable ancestorsOf(Snapshot snapshot, Function lookup) { + private static Iterable ancestorsOf( + Snapshot snapshot, Function lookup) { if (snapshot != null) { - return () -> new Iterator() { - private Snapshot next = snapshot; - private boolean consumed = false; // include the snapshot in its history - - @Override - public boolean hasNext() { - if (!consumed) { - return true; - } - - Long parentId = next.parentId(); - if (parentId == null) { - return false; - } - - this.next = lookup.apply(parentId); - if (next != null) { - this.consumed = false; - return true; - } - - return false; - } - - @Override - public Snapshot next() { - if (hasNext()) { - this.consumed = true; - return next; - } - - throw new NoSuchElementException(); - } - }; + return () -> + new Iterator() { + private Snapshot next = snapshot; + private boolean consumed = false; // include the snapshot in its history + + @Override + public boolean hasNext() { + if (!consumed) { + return true; + } + + Long parentId = next.parentId(); + if (parentId == null) { + return false; + } + + this.next = lookup.apply(parentId); + if (next != null) { + this.consumed = false; + return true; + } + + return false; + } + + @Override + public Snapshot next() { + if (hasNext()) { + this.consumed = true; + return next; + } + + throw new NoSuchElementException(); + } + }; } else { return ImmutableList.of(); @@ -274,22 +279,27 @@ public static List newFiles( Iterables.addAll(newFiles, currentSnapshot.addedDataFiles(io)); } - ValidationException.check(Objects.equals(lastSnapshot.parentId(), baseSnapshotId), + ValidationException.check( + Objects.equals(lastSnapshot.parentId(), baseSnapshotId), "Cannot determine history between read snapshot %s and the last known ancestor %s", - baseSnapshotId, lastSnapshot.snapshotId()); + baseSnapshotId, + lastSnapshot.snapshotId()); return newFiles; } /** - * Traverses the history of the table's current snapshot and finds the snapshot with the given snapshot id as its - * parent. + * Traverses the history of the table's current snapshot and finds the snapshot with the given + * snapshot id as its parent. 
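To make the ancestor helpers above concrete, a short sketch (illustrative only, not part of this patch; `table` is assumed to be any loaded org.apache.iceberg.Table with at least one snapshot):

import java.util.List;
import org.apache.iceberg.Table;
import org.apache.iceberg.util.SnapshotUtil;

class SnapshotLineageExample {
  static void printLineage(Table table) {
    long current = table.currentSnapshot().snapshotId();
    List<Long> lineage = SnapshotUtil.currentAncestorIds(table); // newest first
    long oldest = lineage.get(lineage.size() - 1);

    // true: `oldest` is reachable from `current` by following parent links
    System.out.println(SnapshotUtil.isAncestorOf(table, current, oldest));

    // snapshot IDs committed after `oldest`; assumes `oldest` is an ancestor of `current`
    System.out.println(SnapshotUtil.snapshotIdsBetween(table, oldest, current));
  }
}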
+ * * @return the snapshot for which the given snapshot is the parent * @throws IllegalArgumentException when the given snapshotId is not found in the table - * @throws IllegalStateException when the given snapshotId is not an ancestor of the current table state + * @throws IllegalStateException when the given snapshotId is not an ancestor of the current table + * state */ public static Snapshot snapshotAfter(Table table, long snapshotId) { - Preconditions.checkArgument(table.snapshot(snapshotId) != null, "Cannot find parent snapshot: %s", snapshotId); + Preconditions.checkArgument( + table.snapshot(snapshotId) != null, "Cannot find parent snapshot: %s", snapshotId); for (Snapshot current : currentAncestors(table)) { if (current.parentId() == snapshotId) { return current; @@ -297,7 +307,9 @@ public static Snapshot snapshotAfter(Table table, long snapshotId) { } throw new IllegalStateException( - String.format("Cannot find snapshot after %s: not an ancestor of table's current snapshot", snapshotId)); + String.format( + "Cannot find snapshot after %s: not an ancestor of table's current snapshot", + snapshotId)); } /** @@ -306,8 +318,8 @@ public static Snapshot snapshotAfter(Table table, long snapshotId) { * @param table a {@link Table} * @param timestampMillis the timestamp in millis since the Unix epoch * @return the snapshot ID - * @throws IllegalArgumentException when no snapshot is found in the table - * older than the timestamp + * @throws IllegalArgumentException when no snapshot is found in the table older than the + * timestamp */ public static long snapshotIdAsOfTime(Table table, long timestampMillis) { Long snapshotId = null; @@ -317,8 +329,10 @@ public static long snapshotIdAsOfTime(Table table, long timestampMillis) { } } - Preconditions.checkArgument(snapshotId != null, - "Cannot find a snapshot older than %s", DateTimeUtil.formatTimestampMillis(timestampMillis)); + Preconditions.checkArgument( + snapshotId != null, + "Cannot find a snapshot older than %s", + DateTimeUtil.formatTimestampMillis(timestampMillis)); return snapshotId; } @@ -337,8 +351,7 @@ public static Schema schemaFor(Table table, long snapshotId) { // schemaId could be null, if snapshot was created before Iceberg added schema id to snapshot if (schemaId != null) { Schema schema = table.schemas().get(schemaId); - Preconditions.checkState(schema != null, - "Cannot find schema with schema id %s", schemaId); + Preconditions.checkState(schema != null, "Cannot find schema with schema id %s", schemaId); return schema; } @@ -347,9 +360,9 @@ public static Schema schemaFor(Table table, long snapshotId) { } /** - * Convenience method for returning the schema of the table for a snapshot, - * when we have a snapshot id or a timestamp. Only one of them should be specified - * (non-null), or an IllegalArgumentException is thrown. + * Convenience method for returning the schema of the table for a snapshot, when we have a + * snapshot id or a timestamp. Only one of them should be specified (non-null), or an + * IllegalArgumentException is thrown. 
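A quick illustration of the either/or contract described above (not part of this patch; `table`, `snapshotId`, and `timestampMillis` are assumed inputs, the latter two boxed as Long):

Schema byId = SnapshotUtil.schemaFor(table, snapshotId, null);        // ok
Schema byTime = SnapshotUtil.schemaFor(table, null, timestampMillis); // ok
// SnapshotUtil.schemaFor(table, snapshotId, timestampMillis)         // IllegalArgumentException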
* * @param table a {@link Table} * @param snapshotId the ID of the snapshot @@ -358,7 +371,8 @@ public static Schema schemaFor(Table table, long snapshotId) { * @throws IllegalArgumentException if both snapshotId and timestampMillis are non-null */ public static Schema schemaFor(Table table, Long snapshotId, Long timestampMillis) { - Preconditions.checkArgument(snapshotId == null || timestampMillis == null, + Preconditions.checkArgument( + snapshotId == null || timestampMillis == null, "Cannot use both snapshot id and timestamp to find a schema"); if (snapshotId != null) { diff --git a/core/src/main/java/org/apache/iceberg/util/SortOrderUtil.java b/core/src/main/java/org/apache/iceberg/util/SortOrderUtil.java index 74b8e2c36739..2edbe7fb28da 100644 --- a/core/src/main/java/org/apache/iceberg/util/SortOrderUtil.java +++ b/core/src/main/java/org/apache/iceberg/util/SortOrderUtil.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.util; import java.util.Collections; @@ -37,8 +36,7 @@ public class SortOrderUtil { - private SortOrderUtil() { - } + private SortOrderUtil() {} public static SortOrder buildSortOrder(Table table) { return buildSortOrder(table.schema(), table.spec(), table.sortOrder()); @@ -51,27 +49,34 @@ public static SortOrder buildSortOrder(Table table, SortOrder sortOrder) { /** * Build a final sort order that satisfies the clustering required by the partition spec. - *
<p>
- * The incoming sort order may or may not satisfy the clustering needed by the partition spec. This modifies the sort - * order so that it clusters by partition and still produces the same order within each partition. + * + *
<p>
The incoming sort order may or may not satisfy the clustering needed by the partition spec. + * This modifies the sort order so that it clusters by partition and still produces the same order + * within each partition. * * @param schema a schema * @param spec a partition spec * @param sortOrder a sort order - * @return the sort order with additional sort fields to satisfy the clustering required by the spec + * @return the sort order with additional sort fields to satisfy the clustering required by the + * spec */ public static SortOrder buildSortOrder(Schema schema, PartitionSpec spec, SortOrder sortOrder) { if (sortOrder.isUnsorted() && spec.isUnpartitioned()) { return SortOrder.unsorted(); } - // make a map of the partition fields that need to be included in the clustering produced by the sort order - Map, Integer>, PartitionField> requiredClusteringFields = requiredClusteringFields(spec); + // make a map of the partition fields that need to be included in the clustering produced by the + // sort order + Map, Integer>, PartitionField> requiredClusteringFields = + requiredClusteringFields(spec); - // remove any partition fields that are clustered by the sort order by iterating over a prefix in the sort order. - // this will stop when a non-partition field is found, or when the sort field only satisfies the partition field. + // remove any partition fields that are clustered by the sort order by iterating over a prefix + // in the sort order. + // this will stop when a non-partition field is found, or when the sort field only satisfies the + // partition field. for (SortField sortField : sortOrder.fields()) { - Pair, Integer> sourceAndTransform = Pair.of(sortField.transform(), sortField.sourceId()); + Pair, Integer> sourceAndTransform = + Pair.of(sortField.transform(), sortField.sourceId()); if (requiredClusteringFields.containsKey(sourceAndTransform)) { requiredClusteringFields.remove(sourceAndTransform); continue; // keep processing the prefix @@ -80,7 +85,8 @@ public static SortOrder buildSortOrder(Schema schema, PartitionSpec spec, SortOr // if the field satisfies the order of any partition fields, also remove them before stopping // use a set to avoid concurrent modification for (PartitionField field : spec.fields()) { - if (sortField.sourceId() == field.sourceId() && sortField.transform().satisfiesOrderOf(field.transform())) { + if (sortField.sourceId() == field.sourceId() + && sortField.transform().satisfiesOrderOf(field.transform())) { requiredClusteringFields.remove(Pair.of(field.transform(), field.sourceId())); } } @@ -101,20 +107,24 @@ public static SortOrder buildSortOrder(Schema schema, PartitionSpec spec, SortOr return builder.build(); } - private static Map, Integer>, PartitionField> requiredClusteringFields(PartitionSpec spec) { - Map, Integer>, PartitionField> requiredClusteringFields = Maps.newLinkedHashMap(); + private static Map, Integer>, PartitionField> requiredClusteringFields( + PartitionSpec spec) { + Map, Integer>, PartitionField> requiredClusteringFields = + Maps.newLinkedHashMap(); for (PartitionField partField : spec.fields()) { if (!partField.transform().toString().equals("void")) { - requiredClusteringFields.put(Pair.of(partField.transform(), partField.sourceId()), partField); + requiredClusteringFields.put( + Pair.of(partField.transform(), partField.sourceId()), partField); } } - // remove any partition fields that are satisfied by another partition field, like days(ts) and hours(ts) + // remove any partition fields that are satisfied by another partition 
field, like days(ts) and + // hours(ts) for (PartitionField partField : spec.fields()) { for (PartitionField field : spec.fields()) { - if (!partField.equals(field) && - partField.sourceId() == field.sourceId() && - partField.transform().satisfiesOrderOf(field.transform())) { + if (!partField.equals(field) + && partField.sourceId() == field.sourceId() + && partField.transform().satisfiesOrderOf(field.transform())) { requiredClusteringFields.remove(Pair.of(field.transform(), field.sourceId())); } } diff --git a/core/src/main/java/org/apache/iceberg/util/SortedMerge.java b/core/src/main/java/org/apache/iceberg/util/SortedMerge.java index f747439ac88c..d93116852eb9 100644 --- a/core/src/main/java/org/apache/iceberg/util/SortedMerge.java +++ b/core/src/main/java/org/apache/iceberg/util/SortedMerge.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.util; import java.io.Closeable; @@ -34,8 +33,8 @@ /** * An Iterable that merges the items from other Iterables in order. - *
<p>
- * This assumes that the Iterables passed in produce items in sorted order. + * + *
<p>
This assumes that the Iterables passed in produce items in sorted order. * * @param the type of objects produced by this Iterable */ @@ -50,10 +49,11 @@ public SortedMerge(Comparator comparator, List> iterable @Override public CloseableIterator iterator() { - List> iterators = iterables.stream() - .map(CloseableIterable::iterator) - .filter(Iterator::hasNext) - .collect(Collectors.toList()); + List> iterators = + iterables.stream() + .map(CloseableIterable::iterator) + .filter(Iterator::hasNext) + .collect(Collectors.toList()); if (iterators.size() == 1) { addCloseable(iterators.get(0)); @@ -67,8 +67,8 @@ public CloseableIterator iterator() { /** * An Iterator that merges the items from other Iterators in order. - *
<p>
- * This assumes that the Iterators passed in produce items in sorted order. + * + *
<p>
This assumes that the Iterators passed in produce items in sorted order. */ private class MergeIterator implements CloseableIterator { private final PriorityQueue>> heap; diff --git a/core/src/main/java/org/apache/iceberg/util/StructLikeMap.java b/core/src/main/java/org/apache/iceberg/util/StructLikeMap.java index 66141fb0bcd4..6af658d13007 100644 --- a/core/src/main/java/org/apache/iceberg/util/StructLikeMap.java +++ b/core/src/main/java/org/apache/iceberg/util/StructLikeMap.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.util; import java.util.AbstractMap; @@ -162,8 +161,8 @@ public boolean equals(Object o) { return false; } else { StructLikeEntry that = (StructLikeEntry) o; - return Objects.equals(getKey(), that.getKey()) && - Objects.equals(getValue(), that.getValue()); + return Objects.equals(getKey(), that.getKey()) + && Objects.equals(getValue(), that.getValue()); } } diff --git a/core/src/main/java/org/apache/iceberg/util/StructLikeSet.java b/core/src/main/java/org/apache/iceberg/util/StructLikeSet.java index 8e76490c3f4e..f1f32cb92435 100644 --- a/core/src/main/java/org/apache/iceberg/util/StructLikeSet.java +++ b/core/src/main/java/org/apache/iceberg/util/StructLikeSet.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.util; import java.util.AbstractSet; @@ -125,8 +124,8 @@ public boolean containsAll(Collection objects) { @Override public boolean addAll(Collection structs) { if (structs != null) { - return Iterables.addAll(wrapperSet, - Iterables.transform(structs, struct -> wrappers.get().copyFor(struct))); + return Iterables.addAll( + wrapperSet, Iterables.transform(structs, struct -> wrappers.get().copyFor(struct))); } return false; } diff --git a/core/src/main/java/org/apache/iceberg/util/StructLikeWrapper.java b/core/src/main/java/org/apache/iceberg/util/StructLikeWrapper.java index 5d614acac566..e8cf0a8db76e 100644 --- a/core/src/main/java/org/apache/iceberg/util/StructLikeWrapper.java +++ b/core/src/main/java/org/apache/iceberg/util/StructLikeWrapper.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.util; import java.util.Comparator; @@ -25,9 +24,7 @@ import org.apache.iceberg.types.JavaHash; import org.apache.iceberg.types.Types; -/** - * Wrapper to adapt StructLike for use in maps and sets by implementing equals and hashCode. - */ +/** Wrapper to adapt StructLike for use in maps and sets by implementing equals and hashCode. */ public class StructLikeWrapper { public static StructLikeWrapper forType(Types.StructType struct) { @@ -51,9 +48,9 @@ private StructLikeWrapper(Comparator comparator, JavaHash - * This is equivalent to {@code new StructLikeWrapper(type).set(newStruct)} but is cheaper because no analysis of the - * type is necessary. + * + *
<p>
This is equivalent to {@code new StructLikeWrapper(type).set(newStruct)} but is cheaper + * because no analysis of the type is necessary. * * @param newStruct a {@link StructLike} row * @return a copy of this wrapper wrapping the give struct diff --git a/core/src/main/java/org/apache/iceberg/util/TableScanUtil.java b/core/src/main/java/org/apache/iceberg/util/TableScanUtil.java index dc62d9f5a4f2..36123949b1d4 100644 --- a/core/src/main/java/org/apache/iceberg/util/TableScanUtil.java +++ b/core/src/main/java/org/apache/iceberg/util/TableScanUtil.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.util; import java.util.List; @@ -39,46 +38,54 @@ public class TableScanUtil { - private TableScanUtil() { - } + private TableScanUtil() {} public static boolean hasDeletes(CombinedScanTask task) { return task.files().stream().anyMatch(TableScanUtil::hasDeletes); } /** - * This is temporarily introduced since we plan to support pos-delete vectorized read first, then get to the - * equality-delete support. We will remove this method once both are supported. + * This is temporarily introduced since we plan to support pos-delete vectorized read first, then + * get to the equality-delete support. We will remove this method once both are supported. */ public static boolean hasEqDeletes(CombinedScanTask task) { - return task.files().stream().anyMatch( - t -> t.deletes().stream().anyMatch(deleteFile -> deleteFile.content().equals(FileContent.EQUALITY_DELETES))); + return task.files().stream() + .anyMatch( + t -> + t.deletes().stream() + .anyMatch( + deleteFile -> deleteFile.content().equals(FileContent.EQUALITY_DELETES))); } public static boolean hasDeletes(FileScanTask task) { return !task.deletes().isEmpty(); } - public static CloseableIterable splitFiles(CloseableIterable tasks, long splitSize) { + public static CloseableIterable splitFiles( + CloseableIterable tasks, long splitSize) { Preconditions.checkArgument(splitSize > 0, "Invalid split size (negative or 0): %s", splitSize); - Iterable splitTasks = FluentIterable - .from(tasks) - .transformAndConcat(input -> input.split(splitSize)); + Iterable splitTasks = + FluentIterable.from(tasks).transformAndConcat(input -> input.split(splitSize)); // Capture manifests which can be closed after scan planning return CloseableIterable.combine(splitTasks, tasks); } - public static CloseableIterable planTasks(CloseableIterable splitFiles, - long splitSize, int lookback, long openFileCost) { + public static CloseableIterable planTasks( + CloseableIterable splitFiles, long splitSize, int lookback, long openFileCost) { Preconditions.checkArgument(splitSize > 0, "Invalid split size (negative or 0): %s", splitSize); - Preconditions.checkArgument(lookback > 0, "Invalid split planning lookback (negative or 0): %s", lookback); - Preconditions.checkArgument(openFileCost >= 0, "Invalid file open cost (negative): %s", openFileCost); + Preconditions.checkArgument( + lookback > 0, "Invalid split planning lookback (negative or 0): %s", lookback); + Preconditions.checkArgument( + openFileCost >= 0, "Invalid file open cost (negative): %s", openFileCost); // Check the size of delete file as well to avoid unbalanced bin-packing - Function weightFunc = file -> Math.max( - file.length() + file.deletes().stream().mapToLong(ContentFile::fileSizeInBytes).sum(), - (1 + file.deletes().size()) * openFileCost); + Function weightFunc = + file -> + Math.max( + file.length() + + 
file.deletes().stream().mapToLong(ContentFile::fileSizeInBytes).sum(), + (1 + file.deletes().size()) * openFileCost); return CloseableIterable.transform( CloseableIterable.combine( @@ -88,26 +95,31 @@ public static CloseableIterable planTasks(CloseableIterable CloseableIterable> planTaskGroups(CloseableIterable tasks, - long splitSize, int lookback, - long openFileCost) { + public static CloseableIterable> planTaskGroups( + CloseableIterable tasks, long splitSize, int lookback, long openFileCost) { Preconditions.checkArgument(splitSize > 0, "Invalid split size (negative or 0): %s", splitSize); - Preconditions.checkArgument(lookback > 0, "Invalid split planning lookback (negative or 0): %s", lookback); - Preconditions.checkArgument(openFileCost >= 0, "Invalid file open cost (negative): %s", openFileCost); + Preconditions.checkArgument( + lookback > 0, "Invalid split planning lookback (negative or 0): %s", lookback); + Preconditions.checkArgument( + openFileCost >= 0, "Invalid file open cost (negative): %s", openFileCost); // capture manifests which can be closed after scan planning - CloseableIterable splitTasks = CloseableIterable.combine( - FluentIterable.from(tasks).transformAndConcat(task -> { - if (task instanceof SplittableScanTask) { - return ((SplittableScanTask) task).split(splitSize); - } else { - return ImmutableList.of(task); - } - }), - tasks); - - Function weightFunc = task -> Math.max(task.sizeBytes(), task.filesCount() * openFileCost); + CloseableIterable splitTasks = + CloseableIterable.combine( + FluentIterable.from(tasks) + .transformAndConcat( + task -> { + if (task instanceof SplittableScanTask) { + return ((SplittableScanTask) task).split(splitSize); + } else { + return ImmutableList.of(task); + } + }), + tasks); + + Function weightFunc = + task -> Math.max(task.sizeBytes(), task.filesCount() * openFileCost); return CloseableIterable.transform( CloseableIterable.combine( @@ -125,7 +137,8 @@ public static List mergeTasks(List tasks) { for (T task : tasks) { if (lastTask != null) { if (lastTask instanceof MergeableScanTask) { - MergeableScanTask mergeableLastTask = (MergeableScanTask) lastTask; + MergeableScanTask mergeableLastTask = + (MergeableScanTask) lastTask; if (mergeableLastTask.canMerge(task)) { lastTask = mergeableLastTask.merge(task); } else { diff --git a/core/src/main/java/org/apache/iceberg/util/Tasks.java b/core/src/main/java/org/apache/iceberg/util/Tasks.java index 66b8d236dccf..dfd4ab99848a 100644 --- a/core/src/main/java/org/apache/iceberg/util/Tasks.java +++ b/core/src/main/java/org/apache/iceberg/util/Tasks.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
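A worked example of the bin-packing weight computed just above (illustrative numbers, not part of this patch): each task is charged the larger of its real bytes (data file plus its delete files) and a per-file open cost, so splits with many small files still fill their bins.

// illustrative arithmetic only; the constants are invented example values
long openFileCost = 4L * 1024 * 1024;   // assumed 4 MiB cost to open a file
long dataBytes = 1_000_000L;            // file.length()
long deleteBytes = 200_000L;            // sum of ContentFile::fileSizeInBytes over the deletes
int deleteCount = 2;

long weight = Math.max(dataBytes + deleteBytes, (1 + deleteCount) * openFileCost);
// weight == 12_582_912: the open-cost term dominates the 1.2 MB of actual bytes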
*/ - package org.apache.iceberg.util; import java.util.Arrays; @@ -42,8 +41,7 @@ public class Tasks { private static final Logger LOG = LoggerFactory.getLogger(Tasks.class); - private Tasks() { - } + private Tasks() {} public static class UnrecoverableException extends RuntimeException { public UnrecoverableException(String message) { @@ -79,15 +77,15 @@ public static class Builder { private boolean stopAbortsOnFailure = false; // retry settings - private List> stopRetryExceptions = Lists.newArrayList( - UnrecoverableException.class); + private List> stopRetryExceptions = + Lists.newArrayList(UnrecoverableException.class); private List> onlyRetryExceptions = null; private Predicate shouldRetryPredicate = null; - private int maxAttempts = 1; // not all operations can be retried - private long minSleepTimeMs = 1000; // 1 second + private int maxAttempts = 1; // not all operations can be retried + private long minSleepTimeMs = 1000; // 1 second private long maxSleepTimeMs = 600000; // 10 minutes - private long maxDurationMs = 600000; // 10 minutes - private double scaleFactor = 2.0; // exponential + private long maxDurationMs = 600000; // 10 minutes + private double scaleFactor = 2.0; // exponential public Builder(Iterable items) { this.items = items; @@ -175,10 +173,11 @@ public final Builder onlyRetryOn(Class... exceptions) { return this; } - public Builder exponentialBackoff(long backoffMinSleepTimeMs, - long backoffMaxSleepTimeMs, - long backoffMaxRetryTimeMs, - double backoffScaleFactor) { + public Builder exponentialBackoff( + long backoffMinSleepTimeMs, + long backoffMaxSleepTimeMs, + long backoffMaxRetryTimeMs, + double backoffScaleFactor) { this.minSleepTimeMs = backoffMinSleepTimeMs; this.maxSleepTimeMs = backoffMaxSleepTimeMs; this.maxDurationMs = backoffMaxRetryTimeMs; @@ -190,8 +189,7 @@ public boolean run(Task task) { return run(task, RuntimeException.class); } - public boolean run(Task task, - Class exceptionClass) throws E { + public boolean run(Task task, Class exceptionClass) throws E { if (service != null) { return runParallel(task, exceptionClass); } else { @@ -270,8 +268,7 @@ private boolean runSingleThreaded( if (throwFailureWhenFinished && !exceptions.isEmpty()) { Tasks.throwOne(exceptions, exceptionClass); } else if (throwFailureWhenFinished && threw) { - throw new RuntimeException( - "Task set failed with an uncaught throwable"); + throw new RuntimeException("Task set failed with an uncaught throwable"); } return !threw; @@ -287,9 +284,8 @@ private void tryRunOnFailure(I item, Exception failure) { } } - private boolean runParallel(final Task task, - Class exceptionClass) - throws E { + private boolean runParallel( + final Task task, Class exceptionClass) throws E { final Queue succeeded = new ConcurrentLinkedQueue<>(); final Queue exceptions = new ConcurrentLinkedQueue<>(); final AtomicBoolean taskFailed = new AtomicBoolean(false); @@ -300,53 +296,55 @@ private boolean runParallel(final Task task, for (final I item : items) { // submit a task for each item that will either run or abort the task - futures.add(service.submit(new Runnable() { - @Override - public void run() { - if (!(stopOnFailure && taskFailed.get())) { - // run the task with retries - boolean threw = true; - try { - runTaskWithRetry(task, item); - - succeeded.add(item); - - threw = false; - - } catch (Exception e) { - taskFailed.set(true); - exceptions.add(e); - - if (onFailure != null) { - tryRunOnFailure(item, e); - } - } finally { - if (threw) { - taskFailed.set(true); - } - } - - } else if 
(abortTask != null) { - // abort the task instead of running it - if (stopAbortsOnFailure && abortFailed.get()) { - return; - } - - boolean failed = true; - try { - abortTask.run(item); - failed = false; - } catch (Exception e) { - LOG.error("Failed to abort task", e); - // swallow the exception - } finally { - if (failed) { - abortFailed.set(true); - } - } - } - } - })); + futures.add( + service.submit( + new Runnable() { + @Override + public void run() { + if (!(stopOnFailure && taskFailed.get())) { + // run the task with retries + boolean threw = true; + try { + runTaskWithRetry(task, item); + + succeeded.add(item); + + threw = false; + + } catch (Exception e) { + taskFailed.set(true); + exceptions.add(e); + + if (onFailure != null) { + tryRunOnFailure(item, e); + } + } finally { + if (threw) { + taskFailed.set(true); + } + } + + } else if (abortTask != null) { + // abort the task instead of running it + if (stopAbortsOnFailure && abortFailed.get()) { + return; + } + + boolean failed = true; + try { + abortTask.run(item); + failed = false; + } catch (Exception e) { + LOG.error("Failed to abort task", e); + // swallow the exception + } finally { + if (failed) { + abortFailed.set(true); + } + } + } + } + })); } // let the above tasks complete (or abort) @@ -356,27 +354,29 @@ public void run() { if (taskFailed.get() && revertTask != null) { // at least one task failed, revert any that succeeded for (final I item : succeeded) { - futures.add(service.submit(new Runnable() { - @Override - public void run() { - if (stopRevertsOnFailure && revertFailed.get()) { - return; - } - - boolean failed = true; - try { - revertTask.run(item); - failed = false; - } catch (Exception e) { - LOG.error("Failed to revert task", e); - // swallow the exception - } finally { - if (failed) { - revertFailed.set(true); - } - } - } - })); + futures.add( + service.submit( + new Runnable() { + @Override + public void run() { + if (stopRevertsOnFailure && revertFailed.get()) { + return; + } + + boolean failed = true; + try { + revertTask.run(item); + failed = false; + } catch (Exception e) { + LOG.error("Failed to revert task", e); + // swallow the exception + } finally { + if (failed) { + revertFailed.set(true); + } + } + } + })); } // let the revert tasks complete @@ -386,16 +386,14 @@ public void run() { if (throwFailureWhenFinished && !exceptions.isEmpty()) { Tasks.throwOne(exceptions, exceptionClass); } else if (throwFailureWhenFinished && taskFailed.get()) { - throw new RuntimeException( - "Task set failed with an uncaught throwable"); + throw new RuntimeException("Task set failed with an uncaught throwable"); } return !taskFailed.get(); } @SuppressWarnings("checkstyle:CyclomaticComplexity") - private void runTaskWithRetry(Task task, I item) - throws E { + private void runTaskWithRetry(Task task, I item) throws E { long start = System.currentTimeMillis(); int attempt = 0; while (true) { @@ -440,11 +438,9 @@ private void runTaskWithRetry(Task task, I item) } } - int delayMs = (int) Math.min( - minSleepTimeMs * Math.pow(scaleFactor, attempt - 1), - maxSleepTimeMs); - int jitter = ThreadLocalRandom.current() - .nextInt(Math.max(1, (int) (delayMs * 0.1))); + int delayMs = + (int) Math.min(minSleepTimeMs * Math.pow(scaleFactor, attempt - 1), maxSleepTimeMs); + int jitter = ThreadLocalRandom.current().nextInt(Math.max(1, (int) (delayMs * 0.1))); LOG.warn("Retrying task after failure: {}", e.getMessage(), e); @@ -522,9 +518,7 @@ private static Collection waitFor(Collection> futures) { } } - /** - * A range, [ 0, size ) - 
*/ + /** A range, [ 0, size ) */ private static class Range implements Iterable { private int size; @@ -589,5 +583,4 @@ private static void throwOne( ExceptionUtil.castAndThrow(exception, allowedException); } - } diff --git a/core/src/main/java/org/apache/iceberg/util/ThreadPools.java b/core/src/main/java/org/apache/iceberg/util/ThreadPools.java index ee970005bb72..c4e0f31e21a6 100644 --- a/core/src/main/java/org/apache/iceberg/util/ThreadPools.java +++ b/core/src/main/java/org/apache/iceberg/util/ThreadPools.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.util; import java.util.concurrent.ExecutorService; @@ -31,26 +30,25 @@ public class ThreadPools { - private ThreadPools() { - } + private ThreadPools() {} public static final String WORKER_THREAD_POOL_SIZE_PROP = SystemProperties.WORKER_THREAD_POOL_SIZE_PROP; - public static final int WORKER_THREAD_POOL_SIZE = getPoolSize( - WORKER_THREAD_POOL_SIZE_PROP, - Math.max(2, Runtime.getRuntime().availableProcessors())); + public static final int WORKER_THREAD_POOL_SIZE = + getPoolSize( + WORKER_THREAD_POOL_SIZE_PROP, Math.max(2, Runtime.getRuntime().availableProcessors())); private static final ExecutorService WORKER_POOL = newWorkerPool("iceberg-worker-pool"); /** * Return an {@link ExecutorService} that uses the "worker" thread-pool. - *
<p>
- * The size of the worker pool limits the number of tasks concurrently reading manifests in the + * + *
<p>
The size of the worker pool limits the number of tasks concurrently reading manifests in the * base table implementation across all concurrent planning operations. - *
<p>
- * The size of this thread-pool is controlled by the Java system property - * {@code iceberg.worker.num-threads}. + * + *
<p>
The size of this thread-pool is controlled by the Java system property {@code + * iceberg.worker.num-threads}. * * @return an {@link ExecutorService} that uses the worker pool */ @@ -64,13 +62,14 @@ public static ExecutorService newWorkerPool(String namePrefix) { public static ExecutorService newWorkerPool(String namePrefix, int poolSize) { return MoreExecutors.getExitingExecutorService( - (ThreadPoolExecutor) Executors.newFixedThreadPool(poolSize, newDaemonThreadFactory(namePrefix))); + (ThreadPoolExecutor) + Executors.newFixedThreadPool(poolSize, newDaemonThreadFactory(namePrefix))); } /** * Create a new {@link ScheduledExecutorService} with the given name and pool size. - *
<p>
- * Threads used by this service will be daemon threads. + * + *
<p>
Threads used by this service will be daemon threads. * * @param namePrefix a base name for threads in the executor service's thread pool * @param poolSize max number of threads to use @@ -93,9 +92,6 @@ private static int getPoolSize(String systemProperty, int defaultSize) { } private static ThreadFactory newDaemonThreadFactory(String namePrefix) { - return new ThreadFactoryBuilder() - .setDaemon(true) - .setNameFormat(namePrefix + "-%d") - .build(); + return new ThreadFactoryBuilder().setDaemon(true).setNameFormat(namePrefix + "-%d").build(); } } diff --git a/core/src/main/java/org/apache/iceberg/util/WapUtil.java b/core/src/main/java/org/apache/iceberg/util/WapUtil.java index 3ecd44a52241..4718d138b5d4 100644 --- a/core/src/main/java/org/apache/iceberg/util/WapUtil.java +++ b/core/src/main/java/org/apache/iceberg/util/WapUtil.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.util; import org.apache.iceberg.Snapshot; @@ -26,19 +25,23 @@ public class WapUtil { - private WapUtil() { - } + private WapUtil() {} public static String stagedWapId(Snapshot snapshot) { - return snapshot.summary() != null ? snapshot.summary().get(SnapshotSummary.STAGED_WAP_ID_PROP) : null; + return snapshot.summary() != null + ? snapshot.summary().get(SnapshotSummary.STAGED_WAP_ID_PROP) + : null; } public static String publishedWapId(Snapshot snapshot) { - return snapshot.summary() != null ? snapshot.summary().get(SnapshotSummary.PUBLISHED_WAP_ID_PROP) : null; + return snapshot.summary() != null + ? snapshot.summary().get(SnapshotSummary.PUBLISHED_WAP_ID_PROP) + : null; } /** - * Check if a given staged snapshot's associated wap-id was already published. Does not fail for non-WAP workflows. + * Check if a given staged snapshot's associated wap-id was already published. Does not fail for + * non-WAP workflows. * * @param current the current {@link TableMetadata metadata} for the target table * @param wapSnapshotId a snapshot id which could have been staged and is associated with a wap id diff --git a/core/src/main/java/org/apache/iceberg/util/ZOrderByteUtils.java b/core/src/main/java/org/apache/iceberg/util/ZOrderByteUtils.java index a25a90a20cdf..923f3dc2d5c6 100644 --- a/core/src/main/java/org/apache/iceberg/util/ZOrderByteUtils.java +++ b/core/src/main/java/org/apache/iceberg/util/ZOrderByteUtils.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.util; import java.nio.ByteBuffer; @@ -27,35 +26,34 @@ import org.apache.iceberg.relocated.com.google.common.base.Preconditions; /** - * Within Z-Ordering the byte representations of objects being compared must be ordered, - * this requires several types to be transformed when converted to bytes. The goal is to - * map object's whose byte representation are not lexicographically ordered into representations - * that are lexicographically ordered. Bytes produced should be compared lexicographically as - * unsigned bytes, big-endian. - *
<p>
- * All types except for String are stored within an 8 Byte Buffer - *
<p>
- * Most of these techniques are derived from + * Within Z-Ordering the byte representations of objects being compared must be ordered, this + * requires several types to be transformed when converted to bytes. The goal is to map object's + * whose byte representation are not lexicographically ordered into representations that are + * lexicographically ordered. Bytes produced should be compared lexicographically as unsigned bytes, + * big-endian. + * + *
<p>
All types except for String are stored within an 8 Byte Buffer + * + *
<p>
Most of these techniques are derived from * https://aws.amazon.com/blogs/database/z-order-indexing-for-multifaceted-queries-in-amazon-dynamodb-part-2/ - *
<p>
- * Some implementation is taken from + * + *
<p>
Some implementation is taken from * https://github.com/apache/hbase/blob/master/hbase-common/src/main/java/org/apache/hadoop/hbase/util/OrderedBytes.java */ public class ZOrderByteUtils { public static final int PRIMITIVE_BUFFER_SIZE = 8; - private ZOrderByteUtils() { - } + private ZOrderByteUtils() {} static ByteBuffer allocatePrimitiveBuffer() { return ByteBuffer.allocate(PRIMITIVE_BUFFER_SIZE); } /** - * Signed ints do not have their bytes in magnitude order because of the sign bit. - * To fix this, flip the sign bit so that all negatives are ordered before positives. This essentially - * shifts the 0 value so that we don't break our ordering when we cross the new 0 value. + * Signed ints do not have their bytes in magnitude order because of the sign bit. To fix this, + * flip the sign bit so that all negatives are ordered before positives. This essentially shifts + * the 0 value so that we don't break our ordering when we cross the new 0 value. */ public static ByteBuffer intToOrderedBytes(int val, ByteBuffer reuse) { ByteBuffer bytes = ByteBuffers.reuse(reuse, PRIMITIVE_BUFFER_SIZE); @@ -64,7 +62,8 @@ public static ByteBuffer intToOrderedBytes(int val, ByteBuffer reuse) { } /** - * Signed longs are treated the same as the signed ints in {@link #intToOrderedBytes(int, ByteBuffer)} + * Signed longs are treated the same as the signed ints in {@link #intToOrderedBytes(int, + * ByteBuffer)} */ public static ByteBuffer longToOrderedBytes(long val, ByteBuffer reuse) { ByteBuffer bytes = ByteBuffers.reuse(reuse, PRIMITIVE_BUFFER_SIZE); @@ -73,7 +72,8 @@ public static ByteBuffer longToOrderedBytes(long val, ByteBuffer reuse) { } /** - * Signed shorts are treated the same as the signed ints in {@link #intToOrderedBytes(int, ByteBuffer)} + * Signed shorts are treated the same as the signed ints in {@link #intToOrderedBytes(int, + * ByteBuffer)} */ public static ByteBuffer shortToOrderedBytes(short val, ByteBuffer reuse) { ByteBuffer bytes = ByteBuffers.reuse(reuse, PRIMITIVE_BUFFER_SIZE); @@ -82,7 +82,8 @@ public static ByteBuffer shortToOrderedBytes(short val, ByteBuffer reuse) { } /** - * Signed tiny ints are treated the same as the signed ints in {@link #intToOrderedBytes(int, ByteBuffer)} + * Signed tiny ints are treated the same as the signed ints in {@link #intToOrderedBytes(int, + * ByteBuffer)} */ public static ByteBuffer tinyintToOrderedBytes(byte val, ByteBuffer reuse) { ByteBuffer bytes = ByteBuffers.reuse(reuse, PRIMITIVE_BUFFER_SIZE); @@ -91,12 +92,12 @@ public static ByteBuffer tinyintToOrderedBytes(byte val, ByteBuffer reuse) { } /** - * IEEE 754 : - * “If two floating-point numbers in the same format are ordered (say, x {@literal <} y), - * they are ordered the same way when their bits are reinterpreted as sign-magnitude integers.” + * IEEE 754 : “If two floating-point numbers in the same format are ordered (say, x {@literal <} + * y), they are ordered the same way when their bits are reinterpreted as sign-magnitude + * integers.” * - * Which means floats can be treated as sign magnitude integers which can then be converted into lexicographically - * comparable bytes + *
<p>
Which means floats can be treated as sign magnitude integers which can then be converted + * into lexicographically comparable bytes */ public static ByteBuffer floatToOrderedBytes(float val, ByteBuffer reuse) { ByteBuffer bytes = ByteBuffers.reuse(reuse, PRIMITIVE_BUFFER_SIZE); @@ -106,9 +107,7 @@ public static ByteBuffer floatToOrderedBytes(float val, ByteBuffer reuse) { return bytes; } - /** - * Doubles are treated the same as floats in {@link #floatToOrderedBytes(float, ByteBuffer)} - */ + /** Doubles are treated the same as floats in {@link #floatToOrderedBytes(float, ByteBuffer)} */ public static ByteBuffer doubleToOrderedBytes(double val, ByteBuffer reuse) { ByteBuffer bytes = ByteBuffers.reuse(reuse, PRIMITIVE_BUFFER_SIZE); long lval = Double.doubleToLongBits(val); @@ -118,14 +117,16 @@ public static ByteBuffer doubleToOrderedBytes(double val, ByteBuffer reuse) { } /** - * Strings are lexicographically sortable BUT if different byte array lengths will - * ruin the Z-Ordering. (ZOrder requires that a given column contribute the same number of bytes every time). - * This implementation just uses a set size to for all output byte representations. Truncating longer strings - * and right padding 0 for shorter strings. + * Strings are lexicographically sortable BUT if different byte array lengths will ruin the + * Z-Ordering. (ZOrder requires that a given column contribute the same number of bytes every + * time). This implementation just uses a set size to for all output byte representations. + * Truncating longer strings and right padding 0 for shorter strings. */ @SuppressWarnings("ByteBufferBackingArray") - public static ByteBuffer stringToOrderedBytes(String val, int length, ByteBuffer reuse, CharsetEncoder encoder) { - Preconditions.checkArgument(encoder.charset().equals(StandardCharsets.UTF_8), + public static ByteBuffer stringToOrderedBytes( + String val, int length, ByteBuffer reuse, CharsetEncoder encoder) { + Preconditions.checkArgument( + encoder.charset().equals(StandardCharsets.UTF_8), "Cannot use an encoder not using UTF_8 as it's Charset"); ByteBuffer bytes = ByteBuffers.reuse(reuse, length); @@ -138,8 +139,8 @@ public static ByteBuffer stringToOrderedBytes(String val, int length, ByteBuffer } /** - * Return a bytebuffer with the given bytes truncated to length, or filled with 0's to length depending on whether - * the given bytes are larger or smaller than the given length. + * Return a bytebuffer with the given bytes truncated to length, or filled with 0's to length + * depending on whether the given bytes are larger or smaller than the given length. */ @SuppressWarnings("ByteBufferBackingArray") public static ByteBuffer byteTruncateOrFill(byte[] val, int length, ByteBuffer reuse) { @@ -158,17 +159,20 @@ static byte[] interleaveBits(byte[][] columnsBinary, int interleavedSize) { } /** - * Interleave bits using a naive loop. Variable length inputs are allowed but to get a consistent ordering it is - * required that every column contribute the same number of bytes in each invocation. Bits are interleaved from all - * columns that have a bit available at that position. Once a Column has no more bits to produce it is skipped in the - * interleaving. + * Interleave bits using a naive loop. Variable length inputs are allowed but to get a consistent + * ordering it is required that every column contribute the same number of bytes in each + * invocation. Bits are interleaved from all columns that have a bit available at that position. 
+ * Once a Column has no more bits to produce it is skipped in the interleaving. + * * @param columnsBinary an array of ordered byte representations of the columns being ZOrdered * @param interleavedSize the number of bytes to use in the output * @return the columnbytes interleaved */ - // NarrowingCompoundAssignment is intended here. See https://github.com/apache/iceberg/pull/5200#issuecomment-1176226163 + // NarrowingCompoundAssignment is intended here. See + // https://github.com/apache/iceberg/pull/5200#issuecomment-1176226163 @SuppressWarnings({"ByteBufferBackingArray", "NarrowingCompoundAssignment"}) - public static byte[] interleaveBits(byte[][] columnsBinary, int interleavedSize, ByteBuffer reuse) { + public static byte[] interleaveBits( + byte[][] columnsBinary, int interleavedSize, ByteBuffer reuse) { byte[] interleavedBytes = reuse.array(); Arrays.fill(interleavedBytes, 0, interleavedSize, (byte) 0x00); @@ -181,7 +185,7 @@ public static byte[] interleaveBits(byte[][] columnsBinary, int interleavedSize, while (interleaveByte < interleavedSize) { // Take the source bit from source byte and move it to the output bit position interleavedBytes[interleaveByte] |= - (columnsBinary[sourceColumn][sourceByte] & 1 << sourceBit) >>> sourceBit << interleaveBit; + (columnsBinary[sourceColumn][sourceByte] & 1 << sourceBit) >>> sourceBit << interleaveBit; --interleaveBit; // Check if an output byte has been completed @@ -206,7 +210,8 @@ public static byte[] interleaveBits(byte[][] columnsBinary, int interleavedSize, sourceColumn = 0; --sourceBit; if (sourceBit == -1) { - // If the last bit of the source byte was used, reset to the highest bit of the next byte + // If the last bit of the source byte was used, reset to the highest bit of the next + // byte sourceByte++; sourceBit = 7; } @@ -215,5 +220,4 @@ public static byte[] interleaveBits(byte[][] columnsBinary, int interleavedSize, } return interleavedBytes; } - } diff --git a/core/src/test/java/org/apache/iceberg/LocalTableOperations.java b/core/src/test/java/org/apache/iceberg/LocalTableOperations.java index f12c09942808..27767801cd4e 100644 --- a/core/src/test/java/org/apache/iceberg/LocalTableOperations.java +++ b/core/src/test/java/org/apache/iceberg/LocalTableOperations.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg; import java.io.IOException; @@ -60,13 +59,15 @@ public FileIO io() { @Override public String metadataFileLocation(String fileName) { - return createdMetadataFilePaths.computeIfAbsent(fileName, name -> { - try { - return temp.newFile(name).getAbsolutePath(); - } catch (IOException e) { - throw new RuntimeIOException(e); - } - }); + return createdMetadataFilePaths.computeIfAbsent( + fileName, + name -> { + try { + return temp.newFile(name).getAbsolutePath(); + } catch (IOException e) { + throw new RuntimeIOException(e); + } + }); } @Override diff --git a/core/src/test/java/org/apache/iceberg/MockFileScanTask.java b/core/src/test/java/org/apache/iceberg/MockFileScanTask.java index aef8065b68b7..fe9aae826c6b 100644 --- a/core/src/test/java/org/apache/iceberg/MockFileScanTask.java +++ b/core/src/test/java/org/apache/iceberg/MockFileScanTask.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
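To ground the ZOrderByteUtils javadoc above, a small sketch (illustrative only, not part of this patch) that produces ordered bytes for two int columns and interleaves them into a Z-value:

import java.nio.ByteBuffer;
import org.apache.iceberg.util.ZOrderByteUtils;

class ZOrderExample {
  static byte[] zValue(int x, int y) {
    // the sign bit is flipped inside intToOrderedBytes, so comparing these 8-byte arrays as
    // unsigned bytes matches signed numeric order (for example, -1 sorts before 1)
    ByteBuffer xBytes =
        ZOrderByteUtils.intToOrderedBytes(x, ByteBuffer.allocate(ZOrderByteUtils.PRIMITIVE_BUFFER_SIZE));
    ByteBuffer yBytes =
        ZOrderByteUtils.intToOrderedBytes(y, ByteBuffer.allocate(ZOrderByteUtils.PRIMITIVE_BUFFER_SIZE));

    // interleave one bit from each column in turn; 16 output bytes covers both 8-byte inputs
    byte[][] columns = {xBytes.array(), yBytes.array()};
    return ZOrderByteUtils.interleaveBits(columns, 16, ByteBuffer.allocate(16));
  }
}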
*/ - package org.apache.iceberg; import org.mockito.Mockito; diff --git a/core/src/test/java/org/apache/iceberg/ScanTestBase.java b/core/src/test/java/org/apache/iceberg/ScanTestBase.java index 01943b632ff2..2b68b6b0ec13 100644 --- a/core/src/test/java/org/apache/iceberg/ScanTestBase.java +++ b/core/src/test/java/org/apache/iceberg/ScanTestBase.java @@ -16,9 +16,11 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg; +import static org.apache.iceberg.types.Types.NestedField.required; +import static org.junit.Assert.assertEquals; + import java.io.File; import java.io.IOException; import java.util.Arrays; @@ -34,14 +36,12 @@ import org.junit.runner.RunWith; import org.junit.runners.Parameterized; -import static org.apache.iceberg.types.Types.NestedField.required; -import static org.junit.Assert.assertEquals; - @RunWith(Parameterized.class) -public abstract class ScanTestBase> extends TableTestBase { +public abstract class ScanTestBase> + extends TableTestBase { @Parameterized.Parameters(name = "formatVersion = {0}") public static Object[] parameters() { - return new Object[] { 1, 2 }; + return new Object[] {1, 2}; } public ScanTestBase(int formatVersion) { @@ -56,17 +56,22 @@ public void testTableScanHonorsSelect() { Schema expectedSchema = new Schema(required(1, "id", Types.IntegerType.get())); - assertEquals("A tableScan.select() should prune the schema", + assertEquals( + "A tableScan.select() should prune the schema", expectedSchema.asStruct(), scan.schema().asStruct()); } @Test public void testTableBothProjectAndSelect() { - AssertHelpers.assertThrows("Cannot set projection schema when columns are selected", - IllegalStateException.class, () -> newScan().select(Arrays.asList("id")).project(SCHEMA.select("data"))); - AssertHelpers.assertThrows("Cannot select columns when projection schema is set", - IllegalStateException.class, () -> newScan().project(SCHEMA.select("data")).select(Arrays.asList("id"))); + AssertHelpers.assertThrows( + "Cannot set projection schema when columns are selected", + IllegalStateException.class, + () -> newScan().select(Arrays.asList("id")).project(SCHEMA.select("data"))); + AssertHelpers.assertThrows( + "Cannot select columns when projection schema is set", + IllegalStateException.class, + () -> newScan().project(SCHEMA.select("data")).select(Arrays.asList("id"))); } @Test @@ -77,43 +82,41 @@ public void testTableScanHonorsSelectWithoutCaseSensitivity() { Schema expectedSchema = new Schema(required(1, "id", Types.IntegerType.get())); - assertEquals("A tableScan.select() should prune the schema without case sensitivity", + assertEquals( + "A tableScan.select() should prune the schema without case sensitivity", expectedSchema.asStruct(), scan1.schema().asStruct()); - assertEquals("A tableScan.select() should prune the schema regardless of scan refinement order", + assertEquals( + "A tableScan.select() should prune the schema regardless of scan refinement order", expectedSchema.asStruct(), scan2.schema().asStruct()); } @Test public void testTableScanHonorsIgnoreResiduals() throws IOException { - table.newFastAppend() - .appendFile(FILE_A) - .appendFile(FILE_B) - .commit(); + table.newFastAppend().appendFile(FILE_A).appendFile(FILE_B).commit(); - T scan1 = newScan() - .filter(Expressions.equal("id", 5)); + T scan1 = newScan().filter(Expressions.equal("id", 5)); try (CloseableIterable tasks = scan1.planTasks()) { Assert.assertTrue("Tasks should not be empty", Iterables.size(tasks) > 0); for 
(CombinedScanTask combinedScanTask : tasks) { for (FileScanTask fileScanTask : combinedScanTask.files()) { - Assert.assertNotEquals("Residuals must be preserved", Expressions.alwaysTrue(), fileScanTask.residual()); + Assert.assertNotEquals( + "Residuals must be preserved", Expressions.alwaysTrue(), fileScanTask.residual()); } } } - T scan2 = newScan() - .filter(Expressions.equal("id", 5)) - .ignoreResiduals(); + T scan2 = newScan().filter(Expressions.equal("id", 5)).ignoreResiduals(); try (CloseableIterable tasks = scan2.planTasks()) { Assert.assertTrue("Tasks should not be empty", Iterables.size(tasks) > 0); for (CombinedScanTask combinedScanTask : tasks) { for (FileScanTask fileScanTask : combinedScanTask.files()) { - Assert.assertEquals("Residuals must be ignored", Expressions.alwaysTrue(), fileScanTask.residual()); + Assert.assertEquals( + "Residuals must be ignored", Expressions.alwaysTrue(), fileScanTask.residual()); } } } @@ -125,13 +128,18 @@ public void testTableScanWithPlanExecutor() { table.newFastAppend().appendFile(FILE_B).commit(); AtomicInteger planThreadsIndex = new AtomicInteger(0); - T scan = newScan() - .planWith(Executors.newFixedThreadPool(1, runnable -> { - Thread thread = new Thread(runnable); - thread.setName("plan-" + planThreadsIndex.getAndIncrement()); - thread.setDaemon(true); // daemon threads will be terminated abruptly when the JVM exits - return thread; - })); + T scan = + newScan() + .planWith( + Executors.newFixedThreadPool( + 1, + runnable -> { + Thread thread = new Thread(runnable); + thread.setName("plan-" + planThreadsIndex.getAndIncrement()); + thread.setDaemon( + true); // daemon threads will be terminated abruptly when the JVM exits + return thread; + })); Assert.assertEquals(2, Iterables.size(scan.planFiles())); Assert.assertTrue("Thread should be created in provided pool", planThreadsIndex.get() > 0); } @@ -139,50 +147,67 @@ public void testTableScanWithPlanExecutor() { @Test public void testReAddingPartitionField() throws Exception { Assume.assumeTrue(formatVersion == 2); - Schema schema = new Schema( - required(1, "a", Types.IntegerType.get()), - required(2, "b", Types.StringType.get()), - required(3, "data", Types.IntegerType.get()) - ); + Schema schema = + new Schema( + required(1, "a", Types.IntegerType.get()), + required(2, "b", Types.StringType.get()), + required(3, "data", Types.IntegerType.get())); PartitionSpec initialSpec = PartitionSpec.builderFor(schema).identity("a").build(); File dir = temp.newFolder(); dir.delete(); this.table = TestTables.create(dir, "test_part_evolution", schema, initialSpec, formatVersion); - table.newFastAppend().appendFile(DataFiles.builder(initialSpec) - .withPath("/path/to/data/a.parquet") - .withFileSizeInBytes(10) - .withPartitionPath("a=1") - .withRecordCount(1) - .build()).commit(); + table + .newFastAppend() + .appendFile( + DataFiles.builder(initialSpec) + .withPath("/path/to/data/a.parquet") + .withFileSizeInBytes(10) + .withPartitionPath("a=1") + .withRecordCount(1) + .build()) + .commit(); table.updateSpec().addField("b").removeField("a").commit(); - table.newFastAppend().appendFile(DataFiles.builder(table.spec()) - .withPath("/path/to/data/b.parquet") - .withFileSizeInBytes(10) - .withPartitionPath("b=1") - .withRecordCount(1) - .build()).commit(); + table + .newFastAppend() + .appendFile( + DataFiles.builder(table.spec()) + .withPath("/path/to/data/b.parquet") + .withFileSizeInBytes(10) + .withPartitionPath("b=1") + .withRecordCount(1) + .build()) + .commit(); 
table.updateSpec().addField("a").commit(); - table.newFastAppend().appendFile(DataFiles.builder(table.spec()) - .withPath("/path/to/data/ab.parquet") - .withFileSizeInBytes(10) - .withPartitionPath("b=1/a=1") - .withRecordCount(1) - .build()).commit(); - - table.newFastAppend().appendFile(DataFiles.builder(table.spec()) - .withPath("/path/to/data/a2b.parquet") - .withFileSizeInBytes(10) - .withPartitionPath("b=1/a=2") - .withRecordCount(1) - .build()).commit(); + table + .newFastAppend() + .appendFile( + DataFiles.builder(table.spec()) + .withPath("/path/to/data/ab.parquet") + .withFileSizeInBytes(10) + .withPartitionPath("b=1/a=1") + .withRecordCount(1) + .build()) + .commit(); + + table + .newFastAppend() + .appendFile( + DataFiles.builder(table.spec()) + .withPath("/path/to/data/a2b.parquet") + .withFileSizeInBytes(10) + .withPartitionPath("b=1/a=2") + .withRecordCount(1) + .build()) + .commit(); TableScan scan1 = table.newScan().filter(Expressions.equal("b", "1")); try (CloseableIterable tasks = scan1.planTasks()) { Assert.assertTrue("There should be 1 combined task", Iterables.size(tasks) == 1); for (CombinedScanTask combinedScanTask : tasks) { - Assert.assertEquals("All 4 files should match b=1 filter", 4, combinedScanTask.files().size()); + Assert.assertEquals( + "All 4 files should match b=1 filter", 4, combinedScanTask.files().size()); } } @@ -190,7 +215,8 @@ public void testReAddingPartitionField() throws Exception { try (CloseableIterable tasks = scan2.planTasks()) { Assert.assertTrue("There should be 1 combined task", Iterables.size(tasks) == 1); for (CombinedScanTask combinedScanTask : tasks) { - Assert.assertEquals("a=2 and file without a in spec should match", 2, combinedScanTask.files().size()); + Assert.assertEquals( + "a=2 and file without a in spec should match", 2, combinedScanTask.files().size()); } } } diff --git a/core/src/test/java/org/apache/iceberg/TableMetadataParserCodecTest.java b/core/src/test/java/org/apache/iceberg/TableMetadataParserCodecTest.java index 0ffca02348f6..f225b364f4e7 100644 --- a/core/src/test/java/org/apache/iceberg/TableMetadataParserCodecTest.java +++ b/core/src/test/java/org/apache/iceberg/TableMetadataParserCodecTest.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg; import org.apache.iceberg.TableMetadataParser.Codec; @@ -27,8 +26,7 @@ public class TableMetadataParserCodecTest { - @Rule - public ExpectedException exceptionRule = ExpectedException.none(); + @Rule public ExpectedException exceptionRule = ExpectedException.none(); @Test public void testCompressionCodec() { diff --git a/core/src/test/java/org/apache/iceberg/TableMetadataParserTest.java b/core/src/test/java/org/apache/iceberg/TableMetadataParserTest.java index fde8343bc3cb..77a2e89a6fa8 100644 --- a/core/src/test/java/org/apache/iceberg/TableMetadataParserTest.java +++ b/core/src/test/java/org/apache/iceberg/TableMetadataParserTest.java @@ -16,9 +16,13 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg; +import static org.apache.iceberg.PartitionSpec.unpartitioned; +import static org.apache.iceberg.TableMetadata.newTableMetadata; +import static org.apache.iceberg.TableMetadataParser.getFileExtension; +import static org.apache.iceberg.types.Types.NestedField.optional; + import java.io.File; import java.io.FileInputStream; import java.io.IOException; @@ -39,11 +43,6 @@ import org.junit.runner.RunWith; import org.junit.runners.Parameterized; -import static org.apache.iceberg.PartitionSpec.unpartitioned; -import static org.apache.iceberg.TableMetadata.newTableMetadata; -import static org.apache.iceberg.TableMetadataParser.getFileExtension; -import static org.apache.iceberg.types.Types.NestedField.optional; - @RunWith(Parameterized.class) public class TableMetadataParserTest { @@ -51,7 +50,7 @@ public class TableMetadataParserTest { @Parameterized.Parameters(name = "codecName = {0}") public static Object[] parameters() { - return new Object[] { "none", "gzip" }; + return new Object[] {"none", "gzip"}; } private final String codecName; @@ -72,7 +71,8 @@ public void testCompressionProperty() throws IOException { TableMetadata metadata = newTableMetadata(SCHEMA, unpartitioned(), location, properties); TableMetadataParser.write(metadata, outputFile); Assert.assertEquals(codec == Codec.GZIP, isCompressed(fileName)); - TableMetadata actualMetadata = TableMetadataParser.read((FileIO) null, Files.localInput(new File(fileName))); + TableMetadata actualMetadata = + TableMetadataParser.read((FileIO) null, Files.localInput(new File(fileName))); verifyMetadata(metadata, actualMetadata); } diff --git a/core/src/test/java/org/apache/iceberg/TableTestBase.java b/core/src/test/java/org/apache/iceberg/TableTestBase.java index 3a3ab92b5186..44b6dd8395b0 100644 --- a/core/src/test/java/org/apache/iceberg/TableTestBase.java +++ b/core/src/test/java/org/apache/iceberg/TableTestBase.java @@ -16,9 +16,10 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg; +import static org.apache.iceberg.types.Types.NestedField.required; + import java.io.File; import java.io.IOException; import java.util.Arrays; @@ -45,115 +46,131 @@ import org.junit.Rule; import org.junit.rules.TemporaryFolder; -import static org.apache.iceberg.types.Types.NestedField.required; - public class TableTestBase { // Schema passed to create tables - public static final Schema SCHEMA = new Schema( - required(3, "id", Types.IntegerType.get()), - required(4, "data", Types.StringType.get()) - ); + public static final Schema SCHEMA = + new Schema( + required(3, "id", Types.IntegerType.get()), required(4, "data", Types.StringType.get())); protected static final int BUCKETS_NUMBER = 16; // Partition spec used to create tables - protected static final PartitionSpec SPEC = PartitionSpec.builderFor(SCHEMA) - .bucket("data", BUCKETS_NUMBER) - .build(); - - static final DataFile FILE_A = DataFiles.builder(SPEC) - .withPath("/path/to/data-a.parquet") - .withFileSizeInBytes(10) - .withPartitionPath("data_bucket=0") // easy way to set partition data for now - .withRecordCount(1) - .build(); - static final DataFile FILE_A2 = DataFiles.builder(SPEC) - .withPath("/path/to/data-a-2.parquet") - .withFileSizeInBytes(10) - .withPartitionPath("data_bucket=0") // easy way to set partition data for now - .withRecordCount(1) - .build(); - static final DeleteFile FILE_A_DELETES = FileMetadata.deleteFileBuilder(SPEC) - .ofPositionDeletes() - .withPath("/path/to/data-a-deletes.parquet") - .withFileSizeInBytes(10) - .withPartitionPath("data_bucket=0") // easy way to set partition data for now - .withRecordCount(1) - .build(); + protected static final PartitionSpec SPEC = + PartitionSpec.builderFor(SCHEMA).bucket("data", BUCKETS_NUMBER).build(); + + static final DataFile FILE_A = + DataFiles.builder(SPEC) + .withPath("/path/to/data-a.parquet") + .withFileSizeInBytes(10) + .withPartitionPath("data_bucket=0") // easy way to set partition data for now + .withRecordCount(1) + .build(); + static final DataFile FILE_A2 = + DataFiles.builder(SPEC) + .withPath("/path/to/data-a-2.parquet") + .withFileSizeInBytes(10) + .withPartitionPath("data_bucket=0") // easy way to set partition data for now + .withRecordCount(1) + .build(); + static final DeleteFile FILE_A_DELETES = + FileMetadata.deleteFileBuilder(SPEC) + .ofPositionDeletes() + .withPath("/path/to/data-a-deletes.parquet") + .withFileSizeInBytes(10) + .withPartitionPath("data_bucket=0") // easy way to set partition data for now + .withRecordCount(1) + .build(); // Equality delete files. 
- static final DeleteFile FILE_A2_DELETES = FileMetadata.deleteFileBuilder(SPEC) - .ofEqualityDeletes(1) - .withPath("/path/to/data-a2-deletes.parquet") - .withFileSizeInBytes(10) - .withPartitionPath("data_bucket=0") - .withRecordCount(1) - .build(); - static final DataFile FILE_B = DataFiles.builder(SPEC) - .withPath("/path/to/data-b.parquet") - .withFileSizeInBytes(10) - .withPartitionPath("data_bucket=1") // easy way to set partition data for now - .withRecordCount(1) - .build(); - static final DeleteFile FILE_B_DELETES = FileMetadata.deleteFileBuilder(SPEC) - .ofPositionDeletes() - .withPath("/path/to/data-b-deletes.parquet") - .withFileSizeInBytes(10) - .withPartitionPath("data_bucket=1") // easy way to set partition data for now - .withRecordCount(1) - .build(); - static final DataFile FILE_C = DataFiles.builder(SPEC) - .withPath("/path/to/data-c.parquet") - .withFileSizeInBytes(10) - .withPartitionPath("data_bucket=2") // easy way to set partition data for now - .withRecordCount(1) - .build(); - static final DeleteFile FILE_C2_DELETES = FileMetadata.deleteFileBuilder(SPEC) - .ofEqualityDeletes(1) - .withPath("/path/to/data-c-deletes.parquet") - .withFileSizeInBytes(10) - .withPartitionPath("data_bucket=2") // easy way to set partition data for now - .withRecordCount(1) - .build(); - static final DataFile FILE_D = DataFiles.builder(SPEC) - .withPath("/path/to/data-d.parquet") - .withFileSizeInBytes(10) - .withPartitionPath("data_bucket=3") // easy way to set partition data for now - .withRecordCount(1) - .build(); - static final DeleteFile FILE_D2_DELETES = FileMetadata.deleteFileBuilder(SPEC) - .ofEqualityDeletes(1) - .withPath("/path/to/data-d-deletes.parquet") - .withFileSizeInBytes(10) - .withPartitionPath("data_bucket=3") // easy way to set partition data for now - .withRecordCount(1) - .build(); - static final DataFile FILE_WITH_STATS = DataFiles.builder(SPEC) - .withPath("/path/to/data-with-stats.parquet") - .withMetrics(new Metrics(10L, - ImmutableMap.of(3, 100L, 4, 200L), // column sizes - ImmutableMap.of(3, 90L, 4, 180L), // value counts - ImmutableMap.of(3, 10L, 4, 20L), // null value counts - ImmutableMap.of(3, 0L, 4, 0L), // nan value counts - ImmutableMap.of(3, Conversions.toByteBuffer(Types.IntegerType.get(), 1), - 4, Conversions.toByteBuffer(Types.IntegerType.get(), 2)), // lower bounds - ImmutableMap.of(3, Conversions.toByteBuffer(Types.IntegerType.get(), 5), - 4, Conversions.toByteBuffer(Types.IntegerType.get(), 10)) // upperbounds - )) - .withFileSizeInBytes(350) - .build(); + static final DeleteFile FILE_A2_DELETES = + FileMetadata.deleteFileBuilder(SPEC) + .ofEqualityDeletes(1) + .withPath("/path/to/data-a2-deletes.parquet") + .withFileSizeInBytes(10) + .withPartitionPath("data_bucket=0") + .withRecordCount(1) + .build(); + static final DataFile FILE_B = + DataFiles.builder(SPEC) + .withPath("/path/to/data-b.parquet") + .withFileSizeInBytes(10) + .withPartitionPath("data_bucket=1") // easy way to set partition data for now + .withRecordCount(1) + .build(); + static final DeleteFile FILE_B_DELETES = + FileMetadata.deleteFileBuilder(SPEC) + .ofPositionDeletes() + .withPath("/path/to/data-b-deletes.parquet") + .withFileSizeInBytes(10) + .withPartitionPath("data_bucket=1") // easy way to set partition data for now + .withRecordCount(1) + .build(); + static final DataFile FILE_C = + DataFiles.builder(SPEC) + .withPath("/path/to/data-c.parquet") + .withFileSizeInBytes(10) + .withPartitionPath("data_bucket=2") // easy way to set partition data for now + 
.withRecordCount(1) + .build(); + static final DeleteFile FILE_C2_DELETES = + FileMetadata.deleteFileBuilder(SPEC) + .ofEqualityDeletes(1) + .withPath("/path/to/data-c-deletes.parquet") + .withFileSizeInBytes(10) + .withPartitionPath("data_bucket=2") // easy way to set partition data for now + .withRecordCount(1) + .build(); + static final DataFile FILE_D = + DataFiles.builder(SPEC) + .withPath("/path/to/data-d.parquet") + .withFileSizeInBytes(10) + .withPartitionPath("data_bucket=3") // easy way to set partition data for now + .withRecordCount(1) + .build(); + static final DeleteFile FILE_D2_DELETES = + FileMetadata.deleteFileBuilder(SPEC) + .ofEqualityDeletes(1) + .withPath("/path/to/data-d-deletes.parquet") + .withFileSizeInBytes(10) + .withPartitionPath("data_bucket=3") // easy way to set partition data for now + .withRecordCount(1) + .build(); + static final DataFile FILE_WITH_STATS = + DataFiles.builder(SPEC) + .withPath("/path/to/data-with-stats.parquet") + .withMetrics( + new Metrics( + 10L, + ImmutableMap.of(3, 100L, 4, 200L), // column sizes + ImmutableMap.of(3, 90L, 4, 180L), // value counts + ImmutableMap.of(3, 10L, 4, 20L), // null value counts + ImmutableMap.of(3, 0L, 4, 0L), // nan value counts + ImmutableMap.of( + 3, + Conversions.toByteBuffer(Types.IntegerType.get(), 1), + 4, + Conversions.toByteBuffer(Types.IntegerType.get(), 2)), // lower bounds + ImmutableMap.of( + 3, + Conversions.toByteBuffer(Types.IntegerType.get(), 5), + 4, + Conversions.toByteBuffer(Types.IntegerType.get(), 10)) // upperbounds + )) + .withFileSizeInBytes(350) + .build(); static final FileIO FILE_IO = new TestTables.LocalFileIO(); - @Rule - public TemporaryFolder temp = new TemporaryFolder(); + @Rule public TemporaryFolder temp = new TemporaryFolder(); protected File tableDir = null; protected File metadataDir = null; public TestTables.TestTable table = null; protected final int formatVersion; + @SuppressWarnings("checkstyle:MemberName") protected final Assertions V1Assert; + @SuppressWarnings("checkstyle:MemberName") protected final Assertions V2Assert; @@ -182,12 +199,18 @@ List listManifestFiles() { } List listManifestFiles(File tableDirToList) { - return Lists.newArrayList(new File(tableDirToList, "metadata").listFiles((dir, name) -> - !name.startsWith("snap") && Files.getFileExtension(name).equalsIgnoreCase("avro"))); + return Lists.newArrayList( + new File(tableDirToList, "metadata") + .listFiles( + (dir, name) -> + !name.startsWith("snap") + && Files.getFileExtension(name).equalsIgnoreCase("avro"))); } public static long countAllMetadataFiles(File tableDir) { - return Arrays.stream(new File(tableDir, "metadata").listFiles()).filter(f -> f.isFile()).count(); + return Arrays.stream(new File(tableDir, "metadata").listFiles()) + .filter(f -> f.isFile()) + .count(); } protected TestTables.TestTable create(Schema schema, PartitionSpec spec) { @@ -215,7 +238,8 @@ ManifestFile writeManifest(Long snapshotId, DataFile... files) throws IOExceptio Assert.assertTrue(manifestFile.delete()); OutputFile outputFile = table.ops().io().newOutputFile(manifestFile.getCanonicalPath()); - ManifestWriter writer = ManifestFiles.write(formatVersion, table.spec(), outputFile, snapshotId); + ManifestWriter writer = + ManifestFiles.write(formatVersion, table.spec(), outputFile, snapshotId); try { for (DataFile file : files) { writer.add(file); @@ -236,19 +260,22 @@ ManifestFile writeManifest(Long snapshotId, ManifestEntry... 
entries) throws } @SuppressWarnings("unchecked") - > ManifestFile writeManifest(Long snapshotId, String fileName, ManifestEntry... entries) - throws IOException { + > ManifestFile writeManifest( + Long snapshotId, String fileName, ManifestEntry... entries) throws IOException { File manifestFile = temp.newFile(fileName); Assert.assertTrue(manifestFile.delete()); OutputFile outputFile = table.ops().io().newOutputFile(manifestFile.getCanonicalPath()); ManifestWriter writer; if (entries[0].file() instanceof DataFile) { - writer = (ManifestWriter) ManifestFiles.write( - formatVersion, table.spec(), outputFile, snapshotId); + writer = + (ManifestWriter) + ManifestFiles.write(formatVersion, table.spec(), outputFile, snapshotId); } else { - writer = (ManifestWriter) ManifestFiles.writeDeleteManifest( - formatVersion, table.spec(), outputFile, snapshotId); + writer = + (ManifestWriter) + ManifestFiles.writeDeleteManifest( + formatVersion, table.spec(), outputFile, snapshotId); } try { for (ManifestEntry entry : entries) { @@ -263,10 +290,11 @@ > ManifestFile writeManifest(Long snapshotId, String fi ManifestFile writeDeleteManifest(int newFormatVersion, Long snapshotId, DeleteFile... deleteFiles) throws IOException { - OutputFile manifestFile = org.apache.iceberg.Files - .localOutput(FileFormat.AVRO.addExtension(temp.newFile().toString())); - ManifestWriter writer = ManifestFiles.writeDeleteManifest( - newFormatVersion, SPEC, manifestFile, snapshotId); + OutputFile manifestFile = + org.apache.iceberg.Files.localOutput( + FileFormat.AVRO.addExtension(temp.newFile().toString())); + ManifestWriter writer = + ManifestFiles.writeDeleteManifest(newFormatVersion, SPEC, manifestFile, snapshotId); try { for (DeleteFile deleteFile : deleteFiles) { writer.add(deleteFile); @@ -282,7 +310,8 @@ ManifestFile writeManifestWithName(String name, DataFile... files) throws IOExce Assert.assertTrue(manifestFile.delete()); OutputFile outputFile = table.ops().io().newOutputFile(manifestFile.getCanonicalPath()); - ManifestWriter writer = ManifestFiles.write(formatVersion, table.spec(), outputFile, null); + ManifestWriter writer = + ManifestFiles.write(formatVersion, table.spec(), outputFile, null); try { for (DataFile file : files) { writer.add(file); @@ -294,7 +323,8 @@ ManifestFile writeManifestWithName(String name, DataFile... files) throws IOExce return writer.toManifestFile(); } - ManifestEntry manifestEntry(ManifestEntry.Status status, Long snapshotId, DataFile file) { + ManifestEntry manifestEntry( + ManifestEntry.Status status, Long snapshotId, DataFile file) { GenericManifestEntry entry = new GenericManifestEntry<>(table.spec().partitionType()); switch (status) { case ADDED: @@ -317,7 +347,8 @@ void validateSnapshot(Snapshot old, Snapshot snap, long sequenceNumber, DataFile } void validateSnapshot(Snapshot old, Snapshot snap, Long sequenceNumber, DataFile... newFiles) { - Assert.assertEquals("Should not change delete manifests", + Assert.assertEquals( + "Should not change delete manifests", old != null ? Sets.newHashSet(old.deleteManifests(FILE_IO)) : ImmutableSet.of(), Sets.newHashSet(snap.deleteManifests(FILE_IO))); List oldManifests = old != null ? 
old.dataManifests(FILE_IO) : ImmutableList.of(); @@ -325,12 +356,12 @@ void validateSnapshot(Snapshot old, Snapshot snap, Long sequenceNumber, DataFile // copy the manifests to a modifiable list and remove the existing manifests List newManifests = Lists.newArrayList(snap.dataManifests(FILE_IO)); for (ManifestFile oldManifest : oldManifests) { - Assert.assertTrue("New snapshot should contain old manifests", - newManifests.remove(oldManifest)); + Assert.assertTrue( + "New snapshot should contain old manifests", newManifests.remove(oldManifest)); } - Assert.assertEquals("Should create 1 new manifest and reuse old manifests", - 1, newManifests.size()); + Assert.assertEquals( + "Should create 1 new manifest and reuse old manifests", 1, newManifests.size()); ManifestFile manifest = newManifests.get(0); long id = snap.snapshotId(); @@ -339,8 +370,10 @@ void validateSnapshot(Snapshot old, Snapshot snap, Long sequenceNumber, DataFile for (ManifestEntry entry : ManifestFiles.read(manifest, FILE_IO).entries()) { DataFile file = entry.file(); if (sequenceNumber != null) { - V1Assert.assertEquals("Sequence number should default to 0", 0, entry.sequenceNumber().longValue()); - V2Assert.assertEquals("Sequence number should match expected", sequenceNumber, entry.sequenceNumber()); + V1Assert.assertEquals( + "Sequence number should default to 0", 0, entry.sequenceNumber().longValue()); + V2Assert.assertEquals( + "Sequence number should match expected", sequenceNumber, entry.sequenceNumber()); } Assert.assertEquals("Path should match expected", newPaths.next(), file.path().toString()); Assert.assertEquals("File's snapshot ID should match", id, (long) entry.snapshotId()); @@ -385,62 +418,65 @@ List paths(DataFile... dataFiles) { return paths; } - void validateManifest(ManifestFile manifest, - Iterator ids, - Iterator expectedFiles) { + void validateManifest( + ManifestFile manifest, Iterator ids, Iterator expectedFiles) { validateManifest(manifest, null, ids, expectedFiles, null); } - void validateManifest(ManifestFile manifest, - Iterator seqs, - Iterator ids, - Iterator expectedFiles) { + void validateManifest( + ManifestFile manifest, + Iterator seqs, + Iterator ids, + Iterator expectedFiles) { validateManifest(manifest, seqs, ids, expectedFiles, null); } - void validateManifest(ManifestFile manifest, - Iterator seqs, - Iterator ids, - Iterator expectedFiles, - Iterator statuses) { + void validateManifest( + ManifestFile manifest, + Iterator seqs, + Iterator ids, + Iterator expectedFiles, + Iterator statuses) { for (ManifestEntry entry : ManifestFiles.read(manifest, FILE_IO).entries()) { DataFile file = entry.file(); DataFile expected = expectedFiles.next(); if (seqs != null) { - V1Assert.assertEquals("Sequence number should default to 0", 0, entry.sequenceNumber().longValue()); - V2Assert.assertEquals("Sequence number should match expected", seqs.next(), entry.sequenceNumber()); + V1Assert.assertEquals( + "Sequence number should default to 0", 0, entry.sequenceNumber().longValue()); + V2Assert.assertEquals( + "Sequence number should match expected", seqs.next(), entry.sequenceNumber()); } - Assert.assertEquals("Path should match expected", - expected.path().toString(), file.path().toString()); - Assert.assertEquals("Snapshot ID should match expected ID", - ids.next(), entry.snapshotId()); + Assert.assertEquals( + "Path should match expected", expected.path().toString(), file.path().toString()); + Assert.assertEquals("Snapshot ID should match expected ID", ids.next(), entry.snapshotId()); if (statuses != 
null) { - Assert.assertEquals("Status should match expected", - statuses.next(), entry.status()); + Assert.assertEquals("Status should match expected", statuses.next(), entry.status()); } } Assert.assertFalse("Should find all files in the manifest", expectedFiles.hasNext()); } - void validateDeleteManifest(ManifestFile manifest, - Iterator seqs, - Iterator ids, - Iterator expectedFiles, - Iterator statuses) { - for (ManifestEntry entry : ManifestFiles.readDeleteManifest(manifest, FILE_IO, null).entries()) { + void validateDeleteManifest( + ManifestFile manifest, + Iterator seqs, + Iterator ids, + Iterator expectedFiles, + Iterator statuses) { + for (ManifestEntry entry : + ManifestFiles.readDeleteManifest(manifest, FILE_IO, null).entries()) { DeleteFile file = entry.file(); DeleteFile expected = expectedFiles.next(); if (seqs != null) { - V1Assert.assertEquals("Sequence number should default to 0", 0, entry.sequenceNumber().longValue()); - V2Assert.assertEquals("Sequence number should match expected", seqs.next(), entry.sequenceNumber()); + V1Assert.assertEquals( + "Sequence number should default to 0", 0, entry.sequenceNumber().longValue()); + V2Assert.assertEquals( + "Sequence number should match expected", seqs.next(), entry.sequenceNumber()); } - Assert.assertEquals("Path should match expected", - expected.path().toString(), file.path().toString()); - Assert.assertEquals("Snapshot ID should match expected ID", - ids.next(), entry.snapshotId()); - Assert.assertEquals("Status should match expected", - statuses.next(), entry.status()); + Assert.assertEquals( + "Path should match expected", expected.path().toString(), file.path().toString()); + Assert.assertEquals("Snapshot ID should match expected ID", ids.next(), entry.snapshotId()); + Assert.assertEquals("Status should match expected", statuses.next(), entry.status()); } Assert.assertFalse("Should find all files in the manifest", expectedFiles.hasNext()); @@ -482,20 +518,19 @@ protected PositionDelete positionDelete(CharSequence path, long pos, T ro return positionDelete.set(path, pos, row); } - static void validateManifestEntries(ManifestFile manifest, - Iterator ids, - Iterator expectedFiles, - Iterator expectedStatuses) { + static void validateManifestEntries( + ManifestFile manifest, + Iterator ids, + Iterator expectedFiles, + Iterator expectedStatuses) { for (ManifestEntry entry : ManifestFiles.read(manifest, FILE_IO).entries()) { DataFile file = entry.file(); DataFile expected = expectedFiles.next(); final ManifestEntry.Status expectedStatus = expectedStatuses.next(); - Assert.assertEquals("Path should match expected", - expected.path().toString(), file.path().toString()); - Assert.assertEquals("Snapshot ID should match expected ID", - ids.next(), entry.snapshotId()); - Assert.assertEquals("Entry status should match expected ID", - expectedStatus, entry.status()); + Assert.assertEquals( + "Path should match expected", expected.path().toString(), file.path().toString()); + Assert.assertEquals("Snapshot ID should match expected ID", ids.next(), entry.snapshotId()); + Assert.assertEquals("Entry status should match expected ID", expectedStatus, entry.status()); } Assert.assertFalse("Should find all files in the manifest", expectedFiles.hasNext()); @@ -525,9 +560,7 @@ static Iterator files(ManifestFile manifest) { return ManifestFiles.read(manifest, FILE_IO).iterator(); } - /** - * Used for assertions that only apply if the table version is v2. - */ + /** Used for assertions that only apply if the table version is v2. 
*/ protected static class Assertions { private final boolean enabled; diff --git a/core/src/test/java/org/apache/iceberg/TestBaseIncrementalAppendScan.java b/core/src/test/java/org/apache/iceberg/TestBaseIncrementalAppendScan.java index 1fb764d2896f..b22e03ef0b8f 100644 --- a/core/src/test/java/org/apache/iceberg/TestBaseIncrementalAppendScan.java +++ b/core/src/test/java/org/apache/iceberg/TestBaseIncrementalAppendScan.java @@ -16,10 +16,8 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg; - import org.apache.iceberg.relocated.com.google.common.collect.Iterables; import org.junit.Assert; import org.junit.Test; @@ -43,13 +41,11 @@ public void testFromSnapshotInclusive() { table.newFastAppend().appendFile(FILE_C).commit(); long snapshotCId = table.currentSnapshot().snapshotId(); - IncrementalAppendScan scan = newScan() - .fromSnapshotInclusive(snapshotAId); + IncrementalAppendScan scan = newScan().fromSnapshotInclusive(snapshotAId); Assert.assertEquals(3, Iterables.size(scan.planFiles())); - IncrementalAppendScan scanWithToSnapshot = newScan() - .fromSnapshotInclusive(snapshotAId) - .toSnapshot(snapshotCId); + IncrementalAppendScan scanWithToSnapshot = + newScan().fromSnapshotInclusive(snapshotAId).toSnapshot(snapshotCId); Assert.assertEquals(3, Iterables.size(scanWithToSnapshot.planFiles())); } @@ -62,13 +58,11 @@ public void testFromSnapshotExclusive() { table.newFastAppend().appendFile(FILE_C).commit(); long snapshotCId = table.currentSnapshot().snapshotId(); - IncrementalAppendScan scan = newScan() - .fromSnapshotExclusive(snapshotAId); + IncrementalAppendScan scan = newScan().fromSnapshotExclusive(snapshotAId); Assert.assertEquals(2, Iterables.size(scan.planFiles())); - IncrementalAppendScan scanWithToSnapshot = newScan() - .fromSnapshotExclusive(snapshotAId) - .toSnapshot(snapshotBId); + IncrementalAppendScan scanWithToSnapshot = + newScan().fromSnapshotExclusive(snapshotAId).toSnapshot(snapshotBId); Assert.assertEquals(1, Iterables.size(scanWithToSnapshot.planFiles())); } @@ -76,20 +70,19 @@ public void testFromSnapshotExclusive() { public void testFromSnapshotExclusiveForExpiredParent() { table.newFastAppend().appendFile(FILE_A).commit(); long snapshotAId = table.currentSnapshot().snapshotId(); - long expireTimestampSnapshotA = TestHelpers.waitUntilAfter(table.currentSnapshot().timestampMillis()); + long expireTimestampSnapshotA = + TestHelpers.waitUntilAfter(table.currentSnapshot().timestampMillis()); table.newFastAppend().appendFile(FILE_B).commit(); long snapshotBId = table.currentSnapshot().snapshotId(); table.newFastAppend().appendFile(FILE_C).commit(); long snapshotCId = table.currentSnapshot().snapshotId(); table.expireSnapshots().expireOlderThan(expireTimestampSnapshotA).commit(); - IncrementalAppendScan scan = newScan() - .fromSnapshotExclusive(snapshotAId); + IncrementalAppendScan scan = newScan().fromSnapshotExclusive(snapshotAId); Assert.assertEquals(2, Iterables.size(scan.planFiles())); - IncrementalAppendScan scanWithToSnapshot = newScan() - .fromSnapshotExclusive(snapshotAId) - .toSnapshot(snapshotBId); + IncrementalAppendScan scanWithToSnapshot = + newScan().fromSnapshotExclusive(snapshotAId).toSnapshot(snapshotBId); Assert.assertEquals(1, Iterables.size(scanWithToSnapshot.planFiles())); } @@ -102,8 +95,7 @@ public void testToSnapshot() { table.newFastAppend().appendFile(FILE_C).commit(); long snapshotCId = table.currentSnapshot().snapshotId(); - IncrementalAppendScan scan = newScan() - 
.toSnapshot(snapshotBId); + IncrementalAppendScan scan = newScan().toSnapshot(snapshotBId); Assert.assertEquals(2, Iterables.size(scan.planFiles())); } @@ -111,7 +103,8 @@ public void testToSnapshot() { public void testMultipleRootSnapshots() throws Exception { table.newFastAppend().appendFile(FILE_A).commit(); long snapshotAId = table.currentSnapshot().snapshotId(); - long expireTimestampSnapshotA = TestHelpers.waitUntilAfter(table.currentSnapshot().timestampMillis()); + long expireTimestampSnapshotA = + TestHelpers.waitUntilAfter(table.currentSnapshot().timestampMillis()); // append file B in a staged branch AppendFiles appendFiles = table.newFastAppend().appendFile(FILE_B).stageOnly(); @@ -127,27 +120,28 @@ public void testMultipleRootSnapshots() throws Exception { table.expireSnapshots().expireOlderThan(expireTimestampSnapshotA).commit(); // scan should discover snapshot C and D - IncrementalAppendScan scan = newScan() - .toSnapshot(snapshotDId); + IncrementalAppendScan scan = newScan().toSnapshot(snapshotDId); Assert.assertEquals(2, Iterables.size(scan.planFiles())); // scan should fail because snapshot B is not an ancestor of snapshot D - IncrementalAppendScan scanShouldFail = newScan() - .fromSnapshotExclusive(snapshotBId) - .toSnapshot(snapshotDId); - AssertHelpers.assertThrows("Should throw exception", + IncrementalAppendScan scanShouldFail = + newScan().fromSnapshotExclusive(snapshotBId).toSnapshot(snapshotDId); + AssertHelpers.assertThrows( + "Should throw exception", IllegalArgumentException.class, - String.format("Starting snapshot (exclusive) %d is not a parent ancestor of end snapshot %d", + String.format( + "Starting snapshot (exclusive) %d is not a parent ancestor of end snapshot %d", snapshotBId, snapshotDId), () -> Iterables.size(scanShouldFail.planFiles())); // scan should fail because snapshot B is not an ancestor of snapshot D - IncrementalAppendScan scanShouldFailInclusive = newScan() - .fromSnapshotInclusive(snapshotBId) - .toSnapshot(snapshotDId); - AssertHelpers.assertThrows("Should throw exception", + IncrementalAppendScan scanShouldFailInclusive = + newScan().fromSnapshotInclusive(snapshotBId).toSnapshot(snapshotDId); + AssertHelpers.assertThrows( + "Should throw exception", IllegalArgumentException.class, - String.format("Starting snapshot (inclusive) %d is not an ancestor of end snapshot %d", + String.format( + "Starting snapshot (inclusive) %d is not an ancestor of end snapshot %d", snapshotBId, snapshotDId), () -> Iterables.size(scanShouldFailInclusive.planFiles())); } diff --git a/core/src/test/java/org/apache/iceberg/TestCatalogErrorConstructor.java b/core/src/test/java/org/apache/iceberg/TestCatalogErrorConstructor.java index ade2dd18dbc0..f1c768624636 100644 --- a/core/src/test/java/org/apache/iceberg/TestCatalogErrorConstructor.java +++ b/core/src/test/java/org/apache/iceberg/TestCatalogErrorConstructor.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg; import java.util.List; @@ -24,10 +23,7 @@ import org.apache.iceberg.catalog.Namespace; import org.apache.iceberg.catalog.TableIdentifier; - -/** - * Throws an error on initialization to simulate class not found error. - */ +/** Throws an error on initialization to simulate class not found error. 
*/ public class TestCatalogErrorConstructor extends BaseMetastoreCatalog { static { if (true) { @@ -35,9 +31,7 @@ public class TestCatalogErrorConstructor extends BaseMetastoreCatalog { } } - public TestCatalogErrorConstructor() { - - } + public TestCatalogErrorConstructor() {} @Override protected TableOperations newTableOps(TableIdentifier tableIdentifier) { @@ -60,11 +54,8 @@ public boolean dropTable(TableIdentifier identifier, boolean purge) { } @Override - public void renameTable(TableIdentifier from, TableIdentifier to) { - - } + public void renameTable(TableIdentifier from, TableIdentifier to) {} @Override - public void initialize(String name, Map properties) { - } + public void initialize(String name, Map properties) {} } diff --git a/core/src/test/java/org/apache/iceberg/TestCatalogUtil.java b/core/src/test/java/org/apache/iceberg/TestCatalogUtil.java index 2295b771dd28..a31cf51b685a 100644 --- a/core/src/test/java/org/apache/iceberg/TestCatalogUtil.java +++ b/core/src/test/java/org/apache/iceberg/TestCatalogUtil.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg; import java.util.List; @@ -43,7 +42,8 @@ public void loadCustomCatalog() { options.put("key", "val"); Configuration hadoopConf = new Configuration(); String name = "custom"; - Catalog catalog = CatalogUtil.loadCatalog(TestCatalog.class.getName(), name, options, hadoopConf); + Catalog catalog = + CatalogUtil.loadCatalog(TestCatalog.class.getName(), name, options, hadoopConf); Assertions.assertThat(catalog).isInstanceOf(TestCatalog.class); Assert.assertEquals(name, ((TestCatalog) catalog).catalogName); Assert.assertEquals(options, ((TestCatalog) catalog).flinkOptions); @@ -56,7 +56,8 @@ public void loadCustomCatalog_withHadoopConfig() { Configuration hadoopConf = new Configuration(); hadoopConf.set("key", "val"); String name = "custom"; - Catalog catalog = CatalogUtil.loadCatalog(TestCatalogConfigurable.class.getName(), name, options, hadoopConf); + Catalog catalog = + CatalogUtil.loadCatalog(TestCatalogConfigurable.class.getName(), name, options, hadoopConf); Assertions.assertThat(catalog).isInstanceOf(TestCatalogConfigurable.class); Assert.assertEquals(name, ((TestCatalogConfigurable) catalog).catalogName); Assert.assertEquals(options, ((TestCatalogConfigurable) catalog).flinkOptions); @@ -69,10 +70,13 @@ public void loadCustomCatalog_NoArgConstructorNotFound() { options.put("key", "val"); Configuration hadoopConf = new Configuration(); String name = "custom"; - AssertHelpers.assertThrows("must have no-arg constructor", + AssertHelpers.assertThrows( + "must have no-arg constructor", IllegalArgumentException.class, "NoSuchMethodException: org.apache.iceberg.TestCatalogUtil$TestCatalogBadConstructor.()", - () -> CatalogUtil.loadCatalog(TestCatalogBadConstructor.class.getName(), name, options, hadoopConf)); + () -> + CatalogUtil.loadCatalog( + TestCatalogBadConstructor.class.getName(), name, options, hadoopConf)); } @Test @@ -82,10 +86,13 @@ public void loadCustomCatalog_NotImplementCatalog() { Configuration hadoopConf = new Configuration(); String name = "custom"; - AssertHelpers.assertThrows("must implement catalog", + AssertHelpers.assertThrows( + "must implement catalog", IllegalArgumentException.class, "does not implement Catalog", - () -> CatalogUtil.loadCatalog(TestCatalogNoInterface.class.getName(), name, options, hadoopConf)); + () -> + CatalogUtil.loadCatalog( + TestCatalogNoInterface.class.getName(), name, options, hadoopConf)); } 
@Test @@ -96,7 +103,8 @@ public void loadCustomCatalog_ConstructorErrorCatalog() { String name = "custom"; String impl = TestCatalogErrorConstructor.class.getName(); - AssertHelpers.assertThrows("must be able to initialize catalog", + AssertHelpers.assertThrows( + "must be able to initialize catalog", IllegalArgumentException.class, "NoClassDefFoundError: Error while initializing class", () -> CatalogUtil.loadCatalog(impl, name, options, hadoopConf)); @@ -109,7 +117,8 @@ public void loadCustomCatalog_BadCatalogNameCatalog() { Configuration hadoopConf = new Configuration(); String name = "custom"; String impl = "CatalogDoesNotExist"; - AssertHelpers.assertThrows("catalog must exist", + AssertHelpers.assertThrows( + "catalog must exist", IllegalArgumentException.class, "java.lang.ClassNotFoundException: CatalogDoesNotExist", () -> CatalogUtil.loadCatalog(impl, name, options, hadoopConf)); @@ -128,7 +137,8 @@ public void loadCustomFileIO_noArg() { public void loadCustomFileIO_hadoopConfigConstructor() { Configuration configuration = new Configuration(); configuration.set("key", "val"); - FileIO fileIO = CatalogUtil.loadFileIO(HadoopFileIO.class.getName(), Maps.newHashMap(), configuration); + FileIO fileIO = + CatalogUtil.loadFileIO(HadoopFileIO.class.getName(), Maps.newHashMap(), configuration); Assertions.assertThat(fileIO).isInstanceOf(HadoopFileIO.class); Assert.assertEquals("val", ((HadoopFileIO) fileIO).conf().get("key")); } @@ -137,14 +147,17 @@ public void loadCustomFileIO_hadoopConfigConstructor() { public void loadCustomFileIO_configurable() { Configuration configuration = new Configuration(); configuration.set("key", "val"); - FileIO fileIO = CatalogUtil.loadFileIO(TestFileIOConfigurable.class.getName(), Maps.newHashMap(), configuration); + FileIO fileIO = + CatalogUtil.loadFileIO( + TestFileIOConfigurable.class.getName(), Maps.newHashMap(), configuration); Assertions.assertThat(fileIO).isInstanceOf(TestFileIOConfigurable.class); Assert.assertEquals(configuration, ((TestFileIOConfigurable) fileIO).configuration); } @Test public void loadCustomFileIO_badArg() { - AssertHelpers.assertThrows("cannot find constructor", + AssertHelpers.assertThrows( + "cannot find constructor", IllegalArgumentException.class, "missing no-arg constructor", () -> CatalogUtil.loadFileIO(TestFileIOBadArg.class.getName(), Maps.newHashMap(), null)); @@ -152,7 +165,8 @@ public void loadCustomFileIO_badArg() { @Test public void loadCustomFileIO_badClass() { - AssertHelpers.assertThrows("cannot cast", + AssertHelpers.assertThrows( + "cannot cast", IllegalArgumentException.class, "does not implement FileIO", () -> CatalogUtil.loadFileIO(TestFileIONotImpl.class.getName(), Maps.newHashMap(), null)); @@ -166,8 +180,11 @@ public void buildCustomCatalog_withTypeSet() { Configuration hadoopConf = new Configuration(); String name = "custom"; - AssertHelpers.assertThrows("Should complain about both configs being set", IllegalArgumentException.class, - "both type and catalog-impl are set", () -> CatalogUtil.buildIcebergCatalog(name, options, hadoopConf)); + AssertHelpers.assertThrows( + "Should complain about both configs being set", + IllegalArgumentException.class, + "both type and catalog-impl are set", + () -> CatalogUtil.buildIcebergCatalog(name, options, hadoopConf)); } public static class TestCatalog extends BaseMetastoreCatalog { @@ -175,8 +192,7 @@ public static class TestCatalog extends BaseMetastoreCatalog { private String catalogName; private Map flinkOptions; - public TestCatalog() { - } + public TestCatalog() 
{} @Override public void initialize(String name, Map properties) { @@ -205,9 +221,7 @@ public boolean dropTable(TableIdentifier identifier, boolean purge) { } @Override - public void renameTable(TableIdentifier from, TableIdentifier to) { - - } + public void renameTable(TableIdentifier from, TableIdentifier to) {} } public static class TestCatalogConfigurable extends BaseMetastoreCatalog implements Configurable { @@ -216,8 +230,7 @@ public static class TestCatalogConfigurable extends BaseMetastoreCatalog impleme private Map flinkOptions; private Configuration configuration; - public TestCatalogConfigurable() { - } + public TestCatalogConfigurable() {} @Override public void initialize(String name, Map properties) { @@ -256,15 +269,12 @@ public boolean dropTable(TableIdentifier identifier, boolean purge) { } @Override - public void renameTable(TableIdentifier from, TableIdentifier to) { - - } + public void renameTable(TableIdentifier from, TableIdentifier to) {} } public static class TestCatalogBadConstructor extends BaseMetastoreCatalog { - public TestCatalogBadConstructor(String arg) { - } + public TestCatalogBadConstructor(String arg) {} @Override protected TableOperations newTableOps(TableIdentifier tableIdentifier) { @@ -287,26 +297,21 @@ public boolean dropTable(TableIdentifier identifier, boolean purge) { } @Override - public void renameTable(TableIdentifier from, TableIdentifier to) { - - } + public void renameTable(TableIdentifier from, TableIdentifier to) {} @Override - public void initialize(String name, Map properties) { - } + public void initialize(String name, Map properties) {} } public static class TestCatalogNoInterface { - public TestCatalogNoInterface() { - } + public TestCatalogNoInterface() {} } public static class TestFileIOConfigurable implements FileIO, Configurable { private Configuration configuration; - public TestFileIOConfigurable() { - } + public TestFileIOConfigurable() {} @Override public void setConf(Configuration conf) { @@ -329,9 +334,7 @@ public OutputFile newOutputFile(String path) { } @Override - public void deleteFile(String path) { - - } + public void deleteFile(String path) {} public Configuration getConfiguration() { return configuration; @@ -342,8 +345,7 @@ public static class TestFileIONoArg implements FileIO { private Map map; - public TestFileIONoArg() { - } + public TestFileIONoArg() {} @Override public InputFile newInputFile(String path) { @@ -356,9 +358,7 @@ public OutputFile newOutputFile(String path) { } @Override - public void deleteFile(String path) { - - } + public void deleteFile(String path) {} public Map getMap() { return map; @@ -389,9 +389,7 @@ public OutputFile newOutputFile(String path) { } @Override - public void deleteFile(String path) { - - } + public void deleteFile(String path) {} public String getArg() { return arg; @@ -399,7 +397,6 @@ public String getArg() { } public static class TestFileIONotImpl { - public TestFileIONotImpl() { - } + public TestFileIONotImpl() {} } } diff --git a/core/src/test/java/org/apache/iceberg/TestCreateSnapshotEvent.java b/core/src/test/java/org/apache/iceberg/TestCreateSnapshotEvent.java index ff98e8cef48e..2c9580bb842c 100644 --- a/core/src/test/java/org/apache/iceberg/TestCreateSnapshotEvent.java +++ b/core/src/test/java/org/apache/iceberg/TestCreateSnapshotEvent.java @@ -16,21 +16,6 @@ * specific language governing permissions and limitations * under the License. 
*/ - -/* - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - package org.apache.iceberg; import org.apache.iceberg.events.CreateSnapshotEvent; @@ -61,25 +46,33 @@ public void testAppendCommitEvent() { table.newAppend().appendFile(FILE_A).commit(); Assert.assertNotNull(currentEvent); - Assert.assertEquals("Added records in the table should be 1", - "1", currentEvent.summary().get("added-records")); - Assert.assertEquals("Added files in the table should be 1", - "1", currentEvent.summary().get("added-data-files")); - Assert.assertEquals("Total records in the table should be 1", - "1", currentEvent.summary().get("total-records")); - Assert.assertEquals("Total data files in the table should be 1", - "1", currentEvent.summary().get("total-data-files")); + Assert.assertEquals( + "Added records in the table should be 1", "1", currentEvent.summary().get("added-records")); + Assert.assertEquals( + "Added files in the table should be 1", + "1", + currentEvent.summary().get("added-data-files")); + Assert.assertEquals( + "Total records in the table should be 1", "1", currentEvent.summary().get("total-records")); + Assert.assertEquals( + "Total data files in the table should be 1", + "1", + currentEvent.summary().get("total-data-files")); table.newAppend().appendFile(FILE_A).commit(); Assert.assertNotNull(currentEvent); - Assert.assertEquals("Added records in the table should be 1", - "1", currentEvent.summary().get("added-records")); - Assert.assertEquals("Added files in the table should be 1", - "1", currentEvent.summary().get("added-data-files")); - Assert.assertEquals("Total records in the table should be 2", - "2", currentEvent.summary().get("total-records")); - Assert.assertEquals("Total data files in the table should be 2", - "2", currentEvent.summary().get("total-data-files")); + Assert.assertEquals( + "Added records in the table should be 1", "1", currentEvent.summary().get("added-records")); + Assert.assertEquals( + "Added files in the table should be 1", + "1", + currentEvent.summary().get("added-data-files")); + Assert.assertEquals( + "Total records in the table should be 2", "2", currentEvent.summary().get("total-records")); + Assert.assertEquals( + "Total data files in the table should be 2", + "2", + currentEvent.summary().get("total-data-files")); } @Test @@ -88,25 +81,35 @@ public void testAppendAndDeleteCommitEvent() { table.newAppend().appendFile(FILE_A).appendFile(FILE_B).commit(); Assert.assertNotNull(currentEvent); - Assert.assertEquals("Added records in the table should be 2", - "2", currentEvent.summary().get("added-records")); - Assert.assertEquals("Added files in the table should be 2", - "2", currentEvent.summary().get("added-data-files")); - Assert.assertEquals("Total records in the table should be 2", - "2", currentEvent.summary().get("total-records")); - Assert.assertEquals("Total data files in the table should be 2", - "2", currentEvent.summary().get("total-data-files")); + Assert.assertEquals( + "Added records in the table should be 2", "2", 
currentEvent.summary().get("added-records")); + Assert.assertEquals( + "Added files in the table should be 2", + "2", + currentEvent.summary().get("added-data-files")); + Assert.assertEquals( + "Total records in the table should be 2", "2", currentEvent.summary().get("total-records")); + Assert.assertEquals( + "Total data files in the table should be 2", + "2", + currentEvent.summary().get("total-data-files")); table.newDelete().deleteFile(FILE_A).commit(); Assert.assertNotNull(currentEvent); - Assert.assertEquals("Deleted records in the table should be 1", - "1", currentEvent.summary().get("deleted-records")); - Assert.assertEquals("Deleted files in the table should be 1", - "1", currentEvent.summary().get("deleted-data-files")); - Assert.assertEquals("Total records in the table should be 1", - "1", currentEvent.summary().get("total-records")); - Assert.assertEquals("Total data files in the table should be 1", - "1", currentEvent.summary().get("total-data-files")); + Assert.assertEquals( + "Deleted records in the table should be 1", + "1", + currentEvent.summary().get("deleted-records")); + Assert.assertEquals( + "Deleted files in the table should be 1", + "1", + currentEvent.summary().get("deleted-data-files")); + Assert.assertEquals( + "Total records in the table should be 1", "1", currentEvent.summary().get("total-records")); + Assert.assertEquals( + "Total data files in the table should be 1", + "1", + currentEvent.summary().get("total-data-files")); } class MyListener implements Listener { @@ -115,5 +118,4 @@ public void notify(CreateSnapshotEvent event) { currentEvent = event; } } - } diff --git a/core/src/test/java/org/apache/iceberg/TestCreateTransaction.java b/core/src/test/java/org/apache/iceberg/TestCreateTransaction.java index 283f85470723..157e1233232b 100644 --- a/core/src/test/java/org/apache/iceberg/TestCreateTransaction.java +++ b/core/src/test/java/org/apache/iceberg/TestCreateTransaction.java @@ -16,9 +16,11 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg; +import static org.apache.iceberg.PartitionSpec.unpartitioned; +import static org.apache.iceberg.types.Types.NestedField.required; + import java.io.File; import java.io.IOException; import org.apache.iceberg.exceptions.CommitFailedException; @@ -31,14 +33,11 @@ import org.junit.runner.RunWith; import org.junit.runners.Parameterized; -import static org.apache.iceberg.PartitionSpec.unpartitioned; -import static org.apache.iceberg.types.Types.NestedField.required; - @RunWith(Parameterized.class) public class TestCreateTransaction extends TableTestBase { @Parameterized.Parameters(name = "formatVersion = {0}") public static Object[] parameters() { - return new Object[] { 1, 2 }; + return new Object[] {1, 2}; } public TestCreateTransaction(int formatVersion) { @@ -52,22 +51,23 @@ public void testCreateTransaction() throws IOException { Transaction txn = TestTables.beginCreate(tableDir, "test_create", SCHEMA, unpartitioned()); - Assert.assertNull("Starting a create transaction should not commit metadata", + Assert.assertNull( + "Starting a create transaction should not commit metadata", TestTables.readMetadata("test_create")); - Assert.assertNull("Should have no metadata version", - TestTables.metadataVersion("test_create")); + Assert.assertNull("Should have no metadata version", TestTables.metadataVersion("test_create")); txn.commitTransaction(); TableMetadata meta = TestTables.readMetadata("test_create"); Assert.assertNotNull("Table metadata should be created after transaction commits", meta); - Assert.assertEquals("Should have metadata version 0", - 0, (int) TestTables.metadataVersion("test_create")); - Assert.assertEquals("Should have 0 manifest files", - 0, listManifestFiles(tableDir).size()); - - Assert.assertEquals("Table schema should match with reassigned IDs", - TypeUtil.assignIncreasingFreshIds(SCHEMA).asStruct(), meta.schema().asStruct()); + Assert.assertEquals( + "Should have metadata version 0", 0, (int) TestTables.metadataVersion("test_create")); + Assert.assertEquals("Should have 0 manifest files", 0, listManifestFiles(tableDir).size()); + + Assert.assertEquals( + "Table schema should match with reassigned IDs", + TypeUtil.assignIncreasingFreshIds(SCHEMA).asStruct(), + meta.schema().asStruct()); Assert.assertEquals("Table spec should match", unpartitioned(), meta.spec()); Assert.assertEquals("Table should not have any snapshots", 0, meta.snapshots().size()); } @@ -79,10 +79,10 @@ public void testCreateTransactionAndUpdateSchema() throws IOException { Transaction txn = TestTables.beginCreate(tableDir, "test_create", SCHEMA, unpartitioned()); - Assert.assertNull("Starting a create transaction should not commit metadata", + Assert.assertNull( + "Starting a create transaction should not commit metadata", TestTables.readMetadata("test_create")); - Assert.assertNull("Should have no metadata version", - TestTables.metadataVersion("test_create")); + Assert.assertNull("Should have no metadata version", TestTables.metadataVersion("test_create")); txn.updateSchema() .allowIncompatibleChanges() @@ -94,23 +94,26 @@ public void testCreateTransactionAndUpdateSchema() throws IOException { TableMetadata meta = TestTables.readMetadata("test_create"); Assert.assertNotNull("Table metadata should be created after transaction commits", meta); - Assert.assertEquals("Should have metadata version 0", - 0, (int) TestTables.metadataVersion("test_create")); - Assert.assertEquals("Should have 0 manifest files", - 0, listManifestFiles(tableDir).size()); - - Schema resultSchema 
= new Schema( - Lists.newArrayList( - required(1, "id", Types.IntegerType.get()), - required(2, "data", Types.StringType.get()), - required(3, "col", Types.StringType.get())), - Sets.newHashSet(1, 3) - ); - - Assert.assertEquals("Table schema should match with reassigned IDs", - resultSchema.asStruct(), meta.schema().asStruct()); - Assert.assertEquals("Table schema identifier should match", - resultSchema.identifierFieldIds(), meta.schema().identifierFieldIds()); + Assert.assertEquals( + "Should have metadata version 0", 0, (int) TestTables.metadataVersion("test_create")); + Assert.assertEquals("Should have 0 manifest files", 0, listManifestFiles(tableDir).size()); + + Schema resultSchema = + new Schema( + Lists.newArrayList( + required(1, "id", Types.IntegerType.get()), + required(2, "data", Types.StringType.get()), + required(3, "col", Types.StringType.get())), + Sets.newHashSet(1, 3)); + + Assert.assertEquals( + "Table schema should match with reassigned IDs", + resultSchema.asStruct(), + meta.schema().asStruct()); + Assert.assertEquals( + "Table schema identifier should match", + resultSchema.identifierFieldIds(), + meta.schema().identifierFieldIds()); Assert.assertEquals("Table spec should match", unpartitioned(), meta.spec()); Assert.assertEquals("Table should not have any snapshots", 0, meta.snapshots().size()); } @@ -122,32 +125,30 @@ public void testCreateAndAppendWithTransaction() throws IOException { Transaction txn = TestTables.beginCreate(tableDir, "test_append", SCHEMA, unpartitioned()); - Assert.assertNull("Starting a create transaction should not commit metadata", + Assert.assertNull( + "Starting a create transaction should not commit metadata", TestTables.readMetadata("test_append")); - Assert.assertNull("Should have no metadata version", - TestTables.metadataVersion("test_append")); + Assert.assertNull("Should have no metadata version", TestTables.metadataVersion("test_append")); - txn.newAppend() - .appendFile(FILE_A) - .appendFile(FILE_B) - .commit(); + txn.newAppend().appendFile(FILE_A).appendFile(FILE_B).commit(); - Assert.assertNull("Appending in a transaction should not commit metadata", + Assert.assertNull( + "Appending in a transaction should not commit metadata", TestTables.readMetadata("test_append")); - Assert.assertNull("Should have no metadata version", - TestTables.metadataVersion("test_append")); + Assert.assertNull("Should have no metadata version", TestTables.metadataVersion("test_append")); txn.commitTransaction(); TableMetadata meta = TestTables.readMetadata("test_append"); Assert.assertNotNull("Table metadata should be created after transaction commits", meta); - Assert.assertEquals("Should have metadata version 0", - 0, (int) TestTables.metadataVersion("test_append")); - Assert.assertEquals("Should have 1 manifest file", - 1, listManifestFiles(tableDir).size()); - - Assert.assertEquals("Table schema should match with reassigned IDs", - TypeUtil.assignIncreasingFreshIds(SCHEMA).asStruct(), meta.schema().asStruct()); + Assert.assertEquals( + "Should have metadata version 0", 0, (int) TestTables.metadataVersion("test_append")); + Assert.assertEquals("Should have 1 manifest file", 1, listManifestFiles(tableDir).size()); + + Assert.assertEquals( + "Table schema should match with reassigned IDs", + TypeUtil.assignIncreasingFreshIds(SCHEMA).asStruct(), + meta.schema().asStruct()); Assert.assertEquals("Table spec should match", unpartitioned(), meta.spec()); Assert.assertEquals("Table should have one snapshot", 1, meta.snapshots().size()); @@ -161,35 +162,34 
@@ public void testCreateAndAppendWithTable() throws IOException { Transaction txn = TestTables.beginCreate(tableDir, "test_append", SCHEMA, unpartitioned()); - Assert.assertNull("Starting a create transaction should not commit metadata", + Assert.assertNull( + "Starting a create transaction should not commit metadata", TestTables.readMetadata("test_append")); - Assert.assertNull("Should have no metadata version", - TestTables.metadataVersion("test_append")); + Assert.assertNull("Should have no metadata version", TestTables.metadataVersion("test_append")); - Assert.assertTrue("Should return a transaction table", + Assert.assertTrue( + "Should return a transaction table", txn.table() instanceof BaseTransaction.TransactionTable); - txn.table().newAppend() - .appendFile(FILE_A) - .appendFile(FILE_B) - .commit(); + txn.table().newAppend().appendFile(FILE_A).appendFile(FILE_B).commit(); - Assert.assertNull("Appending in a transaction should not commit metadata", + Assert.assertNull( + "Appending in a transaction should not commit metadata", TestTables.readMetadata("test_append")); - Assert.assertNull("Should have no metadata version", - TestTables.metadataVersion("test_append")); + Assert.assertNull("Should have no metadata version", TestTables.metadataVersion("test_append")); txn.commitTransaction(); TableMetadata meta = TestTables.readMetadata("test_append"); Assert.assertNotNull("Table metadata should be created after transaction commits", meta); - Assert.assertEquals("Should have metadata version 0", - 0, (int) TestTables.metadataVersion("test_append")); - Assert.assertEquals("Should have 1 manifest file", - 1, listManifestFiles(tableDir).size()); - - Assert.assertEquals("Table schema should match with reassigned IDs", - TypeUtil.assignIncreasingFreshIds(SCHEMA).asStruct(), meta.schema().asStruct()); + Assert.assertEquals( + "Should have metadata version 0", 0, (int) TestTables.metadataVersion("test_append")); + Assert.assertEquals("Should have 1 manifest file", 1, listManifestFiles(tableDir).size()); + + Assert.assertEquals( + "Table schema should match with reassigned IDs", + TypeUtil.assignIncreasingFreshIds(SCHEMA).asStruct(), + meta.schema().asStruct()); Assert.assertEquals("Table spec should match", unpartitioned(), meta.spec()); Assert.assertEquals("Table should have one snapshot", 1, meta.snapshots().size()); @@ -203,36 +203,39 @@ public void testCreateAndUpdatePropertiesWithTransaction() throws IOException { Transaction txn = TestTables.beginCreate(tableDir, "test_properties", SCHEMA, unpartitioned()); - Assert.assertNull("Starting a create transaction should not commit metadata", + Assert.assertNull( + "Starting a create transaction should not commit metadata", TestTables.readMetadata("test_properties")); - Assert.assertNull("Should have no metadata version", - TestTables.metadataVersion("test_properties")); + Assert.assertNull( + "Should have no metadata version", TestTables.metadataVersion("test_properties")); - txn.updateProperties() - .set("test-property", "test-value") - .commit(); + txn.updateProperties().set("test-property", "test-value").commit(); - Assert.assertNull("Adding properties in a transaction should not commit metadata", + Assert.assertNull( + "Adding properties in a transaction should not commit metadata", TestTables.readMetadata("test_properties")); - Assert.assertNull("Should have no metadata version", - TestTables.metadataVersion("test_properties")); + Assert.assertNull( + "Should have no metadata version", TestTables.metadataVersion("test_properties")); 
txn.commitTransaction(); TableMetadata meta = TestTables.readMetadata("test_properties"); Assert.assertNotNull("Table metadata should be created after transaction commits", meta); - Assert.assertEquals("Should have metadata version 0", - 0, (int) TestTables.metadataVersion("test_properties")); - Assert.assertEquals("Should have 0 manifest files", - 0, listManifestFiles(tableDir).size()); - - Assert.assertEquals("Table schema should match with reassigned IDs", - TypeUtil.assignIncreasingFreshIds(SCHEMA).asStruct(), meta.schema().asStruct()); + Assert.assertEquals( + "Should have metadata version 0", 0, (int) TestTables.metadataVersion("test_properties")); + Assert.assertEquals("Should have 0 manifest files", 0, listManifestFiles(tableDir).size()); + + Assert.assertEquals( + "Table schema should match with reassigned IDs", + TypeUtil.assignIncreasingFreshIds(SCHEMA).asStruct(), + meta.schema().asStruct()); Assert.assertEquals("Table spec should match", unpartitioned(), meta.spec()); Assert.assertEquals("Table should not have any snapshots", 0, meta.snapshots().size()); Assert.assertEquals("Should have one table property", 1, meta.properties().size()); - Assert.assertEquals("Should have correct table property value", - "test-value", meta.properties().get("test-property")); + Assert.assertEquals( + "Should have correct table property value", + "test-value", + meta.properties().get("test-property")); } @Test @@ -242,39 +245,43 @@ public void testCreateAndUpdatePropertiesWithTable() throws IOException { Transaction txn = TestTables.beginCreate(tableDir, "test_properties", SCHEMA, unpartitioned()); - Assert.assertNull("Starting a create transaction should not commit metadata", + Assert.assertNull( + "Starting a create transaction should not commit metadata", TestTables.readMetadata("test_properties")); - Assert.assertNull("Should have no metadata version", - TestTables.metadataVersion("test_properties")); + Assert.assertNull( + "Should have no metadata version", TestTables.metadataVersion("test_properties")); - Assert.assertTrue("Should return a transaction table", + Assert.assertTrue( + "Should return a transaction table", txn.table() instanceof BaseTransaction.TransactionTable); - txn.table().updateProperties() - .set("test-property", "test-value") - .commit(); + txn.table().updateProperties().set("test-property", "test-value").commit(); - Assert.assertNull("Adding properties in a transaction should not commit metadata", + Assert.assertNull( + "Adding properties in a transaction should not commit metadata", TestTables.readMetadata("test_properties")); - Assert.assertNull("Should have no metadata version", - TestTables.metadataVersion("test_properties")); + Assert.assertNull( + "Should have no metadata version", TestTables.metadataVersion("test_properties")); txn.commitTransaction(); TableMetadata meta = TestTables.readMetadata("test_properties"); Assert.assertNotNull("Table metadata should be created after transaction commits", meta); - Assert.assertEquals("Should have metadata version 0", - 0, (int) TestTables.metadataVersion("test_properties")); - Assert.assertEquals("Should have 0 manifest files", - 0, listManifestFiles(tableDir).size()); - - Assert.assertEquals("Table schema should match with reassigned IDs", - TypeUtil.assignIncreasingFreshIds(SCHEMA).asStruct(), meta.schema().asStruct()); + Assert.assertEquals( + "Should have metadata version 0", 0, (int) TestTables.metadataVersion("test_properties")); + Assert.assertEquals("Should have 0 manifest files", 0, 
listManifestFiles(tableDir).size()); + + Assert.assertEquals( + "Table schema should match with reassigned IDs", + TypeUtil.assignIncreasingFreshIds(SCHEMA).asStruct(), + meta.schema().asStruct()); Assert.assertEquals("Table spec should match", unpartitioned(), meta.spec()); Assert.assertEquals("Table should not have any snapshots", 0, meta.snapshots().size()); Assert.assertEquals("Should have one table property", 1, meta.properties().size()); - Assert.assertEquals("Should have correct table property value", - "test-value", meta.properties().get("test-property")); + Assert.assertEquals( + "Should have correct table property value", + "test-value", + meta.properties().get("test-property")); } @Test @@ -282,16 +289,19 @@ public void testCreateDetectsUncommittedChange() throws IOException { File tableDir = temp.newFolder(); Assert.assertTrue(tableDir.delete()); - Transaction txn = TestTables.beginCreate(tableDir, "uncommitted_change", SCHEMA, unpartitioned()); + Transaction txn = + TestTables.beginCreate(tableDir, "uncommitted_change", SCHEMA, unpartitioned()); - Assert.assertNull("Starting a create transaction should not commit metadata", + Assert.assertNull( + "Starting a create transaction should not commit metadata", TestTables.readMetadata("uncommitted_change")); - Assert.assertNull("Should have no metadata version", - TestTables.metadataVersion("uncommitted_change")); + Assert.assertNull( + "Should have no metadata version", TestTables.metadataVersion("uncommitted_change")); txn.updateProperties().set("test-property", "test-value"); // not committed - AssertHelpers.assertThrows("Should reject commit when last operation has not committed", + AssertHelpers.assertThrows( + "Should reject commit when last operation has not committed", IllegalStateException.class, "Cannot create new DeleteFiles: last operation has not committed", txn::newDelete); @@ -302,17 +312,21 @@ public void testCreateDetectsUncommittedChangeOnCommit() throws IOException { File tableDir = temp.newFolder(); Assert.assertTrue(tableDir.delete()); - Transaction txn = TestTables.beginCreate(tableDir, "uncommitted_change", SCHEMA, unpartitioned()); + Transaction txn = + TestTables.beginCreate(tableDir, "uncommitted_change", SCHEMA, unpartitioned()); - Assert.assertNull("Starting a create transaction should not commit metadata", + Assert.assertNull( + "Starting a create transaction should not commit metadata", TestTables.readMetadata("uncommitted_change")); - Assert.assertNull("Should have no metadata version", - TestTables.metadataVersion("uncommitted_change")); + Assert.assertNull( + "Should have no metadata version", TestTables.metadataVersion("uncommitted_change")); txn.updateProperties().set("test-property", "test-value"); // not committed - AssertHelpers.assertThrows("Should reject commit when last operation has not committed", - IllegalStateException.class, "Cannot commit transaction: last operation has not committed", + AssertHelpers.assertThrows( + "Should reject commit when last operation has not committed", + IllegalStateException.class, + "Cannot commit transaction: last operation has not committed", txn::commitTransaction); } @@ -326,23 +340,35 @@ public void testCreateTransactionConflict() throws IOException { // append in the transaction to ensure a manifest file is created txn.newAppend().appendFile(FILE_A).commit(); - Assert.assertNull("Starting a create transaction should not commit metadata", + Assert.assertNull( + "Starting a create transaction should not commit metadata", 
TestTables.readMetadata("test_conflict")); - Assert.assertNull("Should have no metadata version", - TestTables.metadataVersion("test_conflict")); - - Table conflict = TestTables.create(tableDir, "test_conflict", SCHEMA, unpartitioned(), formatVersion); - - Assert.assertEquals("Table schema should match with reassigned IDs", - TypeUtil.assignIncreasingFreshIds(SCHEMA).asStruct(), conflict.schema().asStruct()); - Assert.assertEquals("Table spec should match conflict table, not transaction table", - unpartitioned(), conflict.spec()); - Assert.assertFalse("Table should not have any snapshots", - conflict.snapshots().iterator().hasNext()); - - AssertHelpers.assertThrows("Transaction commit should fail", - CommitFailedException.class, "Commit failed: table was updated", txn::commitTransaction); + Assert.assertNull( + "Should have no metadata version", TestTables.metadataVersion("test_conflict")); + + Table conflict = + TestTables.create(tableDir, "test_conflict", SCHEMA, unpartitioned(), formatVersion); + + Assert.assertEquals( + "Table schema should match with reassigned IDs", + TypeUtil.assignIncreasingFreshIds(SCHEMA).asStruct(), + conflict.schema().asStruct()); + Assert.assertEquals( + "Table spec should match conflict table, not transaction table", + unpartitioned(), + conflict.spec()); + Assert.assertFalse( + "Table should not have any snapshots", conflict.snapshots().iterator().hasNext()); + + AssertHelpers.assertThrows( + "Transaction commit should fail", + CommitFailedException.class, + "Commit failed: table was updated", + txn::commitTransaction); - Assert.assertEquals("Should clean up metadata", Sets.newHashSet(), Sets.newHashSet(listManifestFiles(tableDir))); + Assert.assertEquals( + "Should clean up metadata", + Sets.newHashSet(), + Sets.newHashSet(listManifestFiles(tableDir))); } } diff --git a/core/src/test/java/org/apache/iceberg/TestDataTableScan.java b/core/src/test/java/org/apache/iceberg/TestDataTableScan.java index 4bd72ca61840..a5e9f6f9ecda 100644 --- a/core/src/test/java/org/apache/iceberg/TestDataTableScan.java +++ b/core/src/test/java/org/apache/iceberg/TestDataTableScan.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg; public class TestDataTableScan extends ScanTestBase { diff --git a/core/src/test/java/org/apache/iceberg/TestDeleteFileIndex.java b/core/src/test/java/org/apache/iceberg/TestDeleteFileIndex.java index bff43e98a23a..95ce34ab0443 100644 --- a/core/src/test/java/org/apache/iceberg/TestDeleteFileIndex.java +++ b/core/src/test/java/org/apache/iceberg/TestDeleteFileIndex.java @@ -16,9 +16,11 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg; +import static org.apache.iceberg.expressions.Expressions.bucket; +import static org.apache.iceberg.expressions.Expressions.equal; + import java.io.File; import java.io.IOException; import java.util.Arrays; @@ -32,32 +34,32 @@ import org.junit.Assert; import org.junit.Test; -import static org.apache.iceberg.expressions.Expressions.bucket; -import static org.apache.iceberg.expressions.Expressions.equal; - public class TestDeleteFileIndex extends TableTestBase { public TestDeleteFileIndex() { super(2 /* table format version */); } - static final DeleteFile FILE_A_POS_1 = FileMetadata.deleteFileBuilder(SPEC) - .ofPositionDeletes() - .withPath("/path/to/data-a-pos-deletes.parquet") - .withFileSizeInBytes(10) - .withPartition(FILE_A.partition()) - .withRecordCount(1) - .build(); + static final DeleteFile FILE_A_POS_1 = + FileMetadata.deleteFileBuilder(SPEC) + .ofPositionDeletes() + .withPath("/path/to/data-a-pos-deletes.parquet") + .withFileSizeInBytes(10) + .withPartition(FILE_A.partition()) + .withRecordCount(1) + .build(); static final DeleteFile FILE_A_POS_2 = FILE_A_POS_1.copy(); - static final DeleteFile FILE_A_EQ_1 = FileMetadata.deleteFileBuilder(SPEC) - .ofEqualityDeletes() - .withPath("/path/to/data-a-eq-deletes.parquet") - .withFileSizeInBytes(10) - .withPartition(FILE_A.partition()) - .withRecordCount(1) - .build(); + static final DeleteFile FILE_A_EQ_1 = + FileMetadata.deleteFileBuilder(SPEC) + .ofEqualityDeletes() + .withPath("/path/to/data-a-eq-deletes.parquet") + .withFileSizeInBytes(10) + .withPartition(FILE_A.partition()) + .withRecordCount(1) + .build(); static final DeleteFile FILE_A_EQ_2 = FILE_A_EQ_1.copy(); - static final DeleteFile[] DELETE_FILES = new DeleteFile[] { FILE_A_POS_1, FILE_A_EQ_1, FILE_A_POS_2, FILE_A_EQ_2 }; + static final DeleteFile[] DELETE_FILES = + new DeleteFile[] {FILE_A_POS_1, FILE_A_EQ_1, FILE_A_POS_2, FILE_A_EQ_2}; private static DataFile unpartitionedFile(PartitionSpec spec) { return DataFiles.builder(spec) @@ -88,72 +90,102 @@ private static DeleteFile unpartitionedEqDeletes(PartitionSpec spec) { @Test public void testUnpartitionedDeletes() { PartitionSpec partSpec = PartitionSpec.unpartitioned(); - DeleteFileIndex index = new DeleteFileIndex( - ImmutableMap.of( - partSpec.specId(), partSpec, - 1, SPEC), - new long[] { 3, 5, 5, 6 }, DELETE_FILES, ImmutableMap.of()); + DeleteFileIndex index = + new DeleteFileIndex( + ImmutableMap.of(partSpec.specId(), partSpec, 1, SPEC), + new long[] {3, 5, 5, 6}, + DELETE_FILES, + ImmutableMap.of()); DataFile unpartitionedFile = unpartitionedFile(partSpec); - Assert.assertArrayEquals("All deletes should apply to seq 0", - DELETE_FILES, index.forDataFile(0, unpartitionedFile)); - Assert.assertArrayEquals("All deletes should apply to seq 3", - DELETE_FILES, index.forDataFile(3, unpartitionedFile)); - Assert.assertArrayEquals("Last 3 deletes should apply to seq 4", - Arrays.copyOfRange(DELETE_FILES, 1, 4), index.forDataFile(4, unpartitionedFile)); - Assert.assertArrayEquals("Last 3 deletes should apply to seq 5", - Arrays.copyOfRange(DELETE_FILES, 1, 4), index.forDataFile(5, unpartitionedFile)); - Assert.assertArrayEquals("Last delete should apply to seq 6", - Arrays.copyOfRange(DELETE_FILES, 3, 4), index.forDataFile(6, unpartitionedFile)); - Assert.assertArrayEquals("No deletes should apply to seq 7", - new DataFile[0], index.forDataFile(7, unpartitionedFile)); - Assert.assertArrayEquals("No deletes should apply to seq 10", - new DataFile[0], index.forDataFile(10, 
unpartitionedFile)); + Assert.assertArrayEquals( + "All deletes should apply to seq 0", DELETE_FILES, index.forDataFile(0, unpartitionedFile)); + Assert.assertArrayEquals( + "All deletes should apply to seq 3", DELETE_FILES, index.forDataFile(3, unpartitionedFile)); + Assert.assertArrayEquals( + "Last 3 deletes should apply to seq 4", + Arrays.copyOfRange(DELETE_FILES, 1, 4), + index.forDataFile(4, unpartitionedFile)); + Assert.assertArrayEquals( + "Last 3 deletes should apply to seq 5", + Arrays.copyOfRange(DELETE_FILES, 1, 4), + index.forDataFile(5, unpartitionedFile)); + Assert.assertArrayEquals( + "Last delete should apply to seq 6", + Arrays.copyOfRange(DELETE_FILES, 3, 4), + index.forDataFile(6, unpartitionedFile)); + Assert.assertArrayEquals( + "No deletes should apply to seq 7", + new DataFile[0], + index.forDataFile(7, unpartitionedFile)); + Assert.assertArrayEquals( + "No deletes should apply to seq 10", + new DataFile[0], + index.forDataFile(10, unpartitionedFile)); // copy file A with a different spec ID DataFile partitionedFileA = FILE_A.copy(); ((BaseFile) partitionedFileA).setSpecId(1); - Assert.assertArrayEquals("All global deletes should apply to a partitioned file", - DELETE_FILES, index.forDataFile(0, partitionedFileA)); + Assert.assertArrayEquals( + "All global deletes should apply to a partitioned file", + DELETE_FILES, + index.forDataFile(0, partitionedFileA)); } @Test public void testPartitionedDeleteIndex() { - DeleteFileIndex index = new DeleteFileIndex( - ImmutableMap.of( - SPEC.specId(), SPEC, - 1, PartitionSpec.unpartitioned()), - null, null, ImmutableMap.of( - Pair.of(SPEC.specId(), StructLikeWrapper.forType(SPEC.partitionType()).set(FILE_A.partition())), - Pair.of(new long[] { 3, 5, 5, 6 }, DELETE_FILES), - Pair.of(SPEC.specId(), StructLikeWrapper.forType(SPEC.partitionType()).set(FILE_C.partition())), - Pair.of(new long[0], new DeleteFile[0]))); - - Assert.assertArrayEquals("All deletes should apply to seq 0", - DELETE_FILES, index.forDataFile(0, FILE_A)); - Assert.assertArrayEquals("All deletes should apply to seq 3", - DELETE_FILES, index.forDataFile(3, FILE_A)); - Assert.assertArrayEquals("Last 3 deletes should apply to seq 4", - Arrays.copyOfRange(DELETE_FILES, 1, 4), index.forDataFile(4, FILE_A)); - Assert.assertArrayEquals("Last 3 deletes should apply to seq 5", - Arrays.copyOfRange(DELETE_FILES, 1, 4), index.forDataFile(5, FILE_A)); - Assert.assertArrayEquals("Last delete should apply to seq 6", - Arrays.copyOfRange(DELETE_FILES, 3, 4), index.forDataFile(6, FILE_A)); - Assert.assertArrayEquals("No deletes should apply to seq 7", - new DataFile[0], index.forDataFile(7, FILE_A)); - Assert.assertArrayEquals("No deletes should apply to seq 10", - new DataFile[0], index.forDataFile(10, FILE_A)); - - Assert.assertEquals("No deletes should apply to FILE_B, partition not in index", - 0, index.forDataFile(0, FILE_B).length); - - Assert.assertEquals("No deletes should apply to FILE_C, no indexed delete files", - 0, index.forDataFile(0, FILE_C).length); + DeleteFileIndex index = + new DeleteFileIndex( + ImmutableMap.of(SPEC.specId(), SPEC, 1, PartitionSpec.unpartitioned()), + null, + null, + ImmutableMap.of( + Pair.of( + SPEC.specId(), + StructLikeWrapper.forType(SPEC.partitionType()).set(FILE_A.partition())), + Pair.of(new long[] {3, 5, 5, 6}, DELETE_FILES), + Pair.of( + SPEC.specId(), + StructLikeWrapper.forType(SPEC.partitionType()).set(FILE_C.partition())), + Pair.of(new long[0], new DeleteFile[0]))); + + Assert.assertArrayEquals( + "All deletes should apply 
to seq 0", DELETE_FILES, index.forDataFile(0, FILE_A)); + Assert.assertArrayEquals( + "All deletes should apply to seq 3", DELETE_FILES, index.forDataFile(3, FILE_A)); + Assert.assertArrayEquals( + "Last 3 deletes should apply to seq 4", + Arrays.copyOfRange(DELETE_FILES, 1, 4), + index.forDataFile(4, FILE_A)); + Assert.assertArrayEquals( + "Last 3 deletes should apply to seq 5", + Arrays.copyOfRange(DELETE_FILES, 1, 4), + index.forDataFile(5, FILE_A)); + Assert.assertArrayEquals( + "Last delete should apply to seq 6", + Arrays.copyOfRange(DELETE_FILES, 3, 4), + index.forDataFile(6, FILE_A)); + Assert.assertArrayEquals( + "No deletes should apply to seq 7", new DataFile[0], index.forDataFile(7, FILE_A)); + Assert.assertArrayEquals( + "No deletes should apply to seq 10", new DataFile[0], index.forDataFile(10, FILE_A)); + + Assert.assertEquals( + "No deletes should apply to FILE_B, partition not in index", + 0, + index.forDataFile(0, FILE_B).length); + + Assert.assertEquals( + "No deletes should apply to FILE_C, no indexed delete files", + 0, + index.forDataFile(0, FILE_C).length); DataFile unpartitionedFileA = FILE_A.copy(); ((BaseFile) unpartitionedFileA).setSpecId(1); - Assert.assertEquals("No deletes should apply to FILE_A with a different specId", - 0, index.forDataFile(0, unpartitionedFileA).length); + Assert.assertEquals( + "No deletes should apply to FILE_A with a different specId", + 0, + index.forDataFile(0, unpartitionedFileA).length); } @Test @@ -161,145 +193,118 @@ public void testUnpartitionedTableScan() throws IOException { File location = temp.newFolder(); Assert.assertTrue(location.delete()); - Table unpartitioned = TestTables.create(location, "unpartitioned", SCHEMA, PartitionSpec.unpartitioned(), 2); + Table unpartitioned = + TestTables.create(location, "unpartitioned", SCHEMA, PartitionSpec.unpartitioned(), 2); DataFile unpartitionedFile = unpartitionedFile(unpartitioned.spec()); - unpartitioned.newAppend() - .appendFile(unpartitionedFile) - .commit(); + unpartitioned.newAppend().appendFile(unpartitionedFile).commit(); // add a delete file DeleteFile unpartitionedPosDeletes = unpartitionedPosDeletes(unpartitioned.spec()); - unpartitioned.newRowDelta() - .addDeletes(unpartitionedPosDeletes) - .commit(); + unpartitioned.newRowDelta().addDeletes(unpartitionedPosDeletes).commit(); List tasks = Lists.newArrayList(unpartitioned.newScan().planFiles().iterator()); Assert.assertEquals("Should have one task", 1, tasks.size()); FileScanTask task = tasks.get(0); - Assert.assertEquals("Should have the correct data file path", - unpartitionedFile.path(), task.file().path()); - Assert.assertEquals("Should have one associated delete file", - 1, task.deletes().size()); - Assert.assertEquals("Should have expected delete file", - unpartitionedPosDeletes.path(), task.deletes().get(0).path()); + Assert.assertEquals( + "Should have the correct data file path", unpartitionedFile.path(), task.file().path()); + Assert.assertEquals("Should have one associated delete file", 1, task.deletes().size()); + Assert.assertEquals( + "Should have expected delete file", + unpartitionedPosDeletes.path(), + task.deletes().get(0).path()); // add a second delete file DeleteFile unpartitionedEqDeletes = unpartitionedEqDeletes(unpartitioned.spec()); - unpartitioned.newRowDelta() - .addDeletes(unpartitionedEqDeletes) - .commit(); + unpartitioned.newRowDelta().addDeletes(unpartitionedEqDeletes).commit(); tasks = Lists.newArrayList(unpartitioned.newScan().planFiles().iterator()); task = tasks.get(0); - 
Assert.assertEquals("Should have the correct data file path", - unpartitionedFile.path(), task.file().path()); - Assert.assertEquals("Should have two associated delete files", - 2, task.deletes().size()); - Assert.assertEquals("Should have expected delete files", + Assert.assertEquals( + "Should have the correct data file path", unpartitionedFile.path(), task.file().path()); + Assert.assertEquals("Should have two associated delete files", 2, task.deletes().size()); + Assert.assertEquals( + "Should have expected delete files", Sets.newHashSet(unpartitionedPosDeletes.path(), unpartitionedEqDeletes.path()), Sets.newHashSet(Iterables.transform(task.deletes(), ContentFile::path))); } @Test public void testPartitionedTableWithPartitionPosDeletes() { - table.newAppend() - .appendFile(FILE_A) - .commit(); + table.newAppend().appendFile(FILE_A).commit(); - table.newRowDelta() - .addDeletes(FILE_A_POS_1) - .commit(); + table.newRowDelta().addDeletes(FILE_A_POS_1).commit(); List tasks = Lists.newArrayList(table.newScan().planFiles().iterator()); Assert.assertEquals("Should have one task", 1, tasks.size()); FileScanTask task = tasks.get(0); - Assert.assertEquals("Should have the correct data file path", - FILE_A.path(), task.file().path()); - Assert.assertEquals("Should have one associated delete file", - 1, task.deletes().size()); - Assert.assertEquals("Should have only pos delete file", - FILE_A_POS_1.path(), task.deletes().get(0).path()); + Assert.assertEquals( + "Should have the correct data file path", FILE_A.path(), task.file().path()); + Assert.assertEquals("Should have one associated delete file", 1, task.deletes().size()); + Assert.assertEquals( + "Should have only pos delete file", FILE_A_POS_1.path(), task.deletes().get(0).path()); } @Test public void testPartitionedTableWithPartitionEqDeletes() { - table.newAppend() - .appendFile(FILE_A) - .commit(); + table.newAppend().appendFile(FILE_A).commit(); - table.newRowDelta() - .addDeletes(FILE_A_EQ_1) - .commit(); + table.newRowDelta().addDeletes(FILE_A_EQ_1).commit(); List tasks = Lists.newArrayList(table.newScan().planFiles().iterator()); Assert.assertEquals("Should have one task", 1, tasks.size()); FileScanTask task = tasks.get(0); - Assert.assertEquals("Should have the correct data file path", - FILE_A.path(), task.file().path()); - Assert.assertEquals("Should have one associated delete file", - 1, task.deletes().size()); - Assert.assertEquals("Should have only pos delete file", - FILE_A_EQ_1.path(), task.deletes().get(0).path()); + Assert.assertEquals( + "Should have the correct data file path", FILE_A.path(), task.file().path()); + Assert.assertEquals("Should have one associated delete file", 1, task.deletes().size()); + Assert.assertEquals( + "Should have only pos delete file", FILE_A_EQ_1.path(), task.deletes().get(0).path()); } @Test public void testPartitionedTableWithUnrelatedPartitionDeletes() { - table.newAppend() - .appendFile(FILE_B) - .commit(); + table.newAppend().appendFile(FILE_B).commit(); - table.newRowDelta() - .addDeletes(FILE_A_POS_1) - .addDeletes(FILE_A_EQ_1) - .commit(); + table.newRowDelta().addDeletes(FILE_A_POS_1).addDeletes(FILE_A_EQ_1).commit(); List tasks = Lists.newArrayList(table.newScan().planFiles().iterator()); Assert.assertEquals("Should have one task", 1, tasks.size()); FileScanTask task = tasks.get(0); - Assert.assertEquals("Should have the correct data file path", - FILE_B.path(), task.file().path()); - Assert.assertEquals("Should have no delete files to apply", - 0, task.deletes().size()); + 
Assert.assertEquals( + "Should have the correct data file path", FILE_B.path(), task.file().path()); + Assert.assertEquals("Should have no delete files to apply", 0, task.deletes().size()); } @Test public void testPartitionedTableWithOlderPartitionDeletes() { - table.newRowDelta() - .addDeletes(FILE_A_POS_1) - .addDeletes(FILE_A_EQ_1) - .commit(); + table.newRowDelta().addDeletes(FILE_A_POS_1).addDeletes(FILE_A_EQ_1).commit(); - table.newAppend() - .appendFile(FILE_A) - .commit(); + table.newAppend().appendFile(FILE_A).commit(); List tasks = Lists.newArrayList(table.newScan().planFiles().iterator()); Assert.assertEquals("Should have one task", 1, tasks.size()); FileScanTask task = tasks.get(0); - Assert.assertEquals("Should have the correct data file path", - FILE_A.path(), task.file().path()); - Assert.assertEquals("Should have no delete files to apply", - 0, task.deletes().size()); + Assert.assertEquals( + "Should have the correct data file path", FILE_A.path(), task.file().path()); + Assert.assertEquals("Should have no delete files to apply", 0, task.deletes().size()); } @Test public void testPartitionedTableScanWithGlobalDeletes() { - table.newAppend() - .appendFile(FILE_A) - .commit(); + table.newAppend().appendFile(FILE_A).commit(); TableMetadata base = table.ops().current(); table.ops().commit(base, base.updatePartitionSpec(PartitionSpec.unpartitioned())); // add unpartitioned equality and position deletes, but only equality deletes are global DeleteFile unpartitionedEqDeletes = unpartitionedEqDeletes(table.spec()); - table.newRowDelta() + table + .newRowDelta() .addDeletes(unpartitionedPosDeletes(table.spec())) .addDeletes(unpartitionedEqDeletes) .commit(); @@ -308,30 +313,28 @@ public void testPartitionedTableScanWithGlobalDeletes() { Assert.assertEquals("Should have one task", 1, tasks.size()); FileScanTask task = tasks.get(0); - Assert.assertEquals("Should have the correct data file path", - FILE_A.path(), task.file().path()); - Assert.assertEquals("Should have one associated delete file", - 1, task.deletes().size()); - Assert.assertEquals("Should have expected delete file", - unpartitionedEqDeletes.path(), task.deletes().get(0).path()); + Assert.assertEquals( + "Should have the correct data file path", FILE_A.path(), task.file().path()); + Assert.assertEquals("Should have one associated delete file", 1, task.deletes().size()); + Assert.assertEquals( + "Should have expected delete file", + unpartitionedEqDeletes.path(), + task.deletes().get(0).path()); } @Test public void testPartitionedTableScanWithGlobalAndPartitionDeletes() { - table.newAppend() - .appendFile(FILE_A) - .commit(); + table.newAppend().appendFile(FILE_A).commit(); - table.newRowDelta() - .addDeletes(FILE_A_EQ_1) - .commit(); + table.newRowDelta().addDeletes(FILE_A_EQ_1).commit(); TableMetadata base = table.ops().current(); table.ops().commit(base, base.updatePartitionSpec(PartitionSpec.unpartitioned())); // add unpartitioned equality and position deletes, but only equality deletes are global DeleteFile unpartitionedEqDeletes = unpartitionedEqDeletes(table.spec()); - table.newRowDelta() + table + .newRowDelta() .addDeletes(unpartitionedPosDeletes(table.spec())) .addDeletes(unpartitionedEqDeletes) .commit(); @@ -340,33 +343,28 @@ public void testPartitionedTableScanWithGlobalAndPartitionDeletes() { Assert.assertEquals("Should have one task", 1, tasks.size()); FileScanTask task = tasks.get(0); - Assert.assertEquals("Should have the correct data file path", - FILE_A.path(), task.file().path()); - 
Assert.assertEquals("Should have two associated delete files", - 2, task.deletes().size()); - Assert.assertEquals("Should have expected delete files", + Assert.assertEquals( + "Should have the correct data file path", FILE_A.path(), task.file().path()); + Assert.assertEquals("Should have two associated delete files", 2, task.deletes().size()); + Assert.assertEquals( + "Should have expected delete files", Sets.newHashSet(unpartitionedEqDeletes.path(), FILE_A_EQ_1.path()), Sets.newHashSet(Iterables.transform(task.deletes(), ContentFile::path))); } @Test public void testPartitionedTableSequenceNumbers() { - table.newRowDelta() - .addRows(FILE_A) - .addDeletes(FILE_A_EQ_1) - .addDeletes(FILE_A_POS_1) - .commit(); + table.newRowDelta().addRows(FILE_A).addDeletes(FILE_A_EQ_1).addDeletes(FILE_A_POS_1).commit(); List tasks = Lists.newArrayList(table.newScan().planFiles().iterator()); Assert.assertEquals("Should have one task", 1, tasks.size()); FileScanTask task = tasks.get(0); - Assert.assertEquals("Should have the correct data file path", - FILE_A.path(), task.file().path()); - Assert.assertEquals("Should have one associated delete file", - 1, task.deletes().size()); - Assert.assertEquals("Should have only pos delete file", - FILE_A_POS_1.path(), task.deletes().get(0).path()); + Assert.assertEquals( + "Should have the correct data file path", FILE_A.path(), task.file().path()); + Assert.assertEquals("Should have one associated delete file", 1, task.deletes().size()); + Assert.assertEquals( + "Should have only pos delete file", FILE_A_POS_1.path(), task.deletes().get(0).path()); } @Test @@ -374,84 +372,99 @@ public void testUnpartitionedTableSequenceNumbers() throws IOException { File location = temp.newFolder(); Assert.assertTrue(location.delete()); - Table unpartitioned = TestTables.create(location, "unpartitioned", SCHEMA, PartitionSpec.unpartitioned(), 2); + Table unpartitioned = + TestTables.create(location, "unpartitioned", SCHEMA, PartitionSpec.unpartitioned(), 2); // add data, pos deletes, and eq deletes in the same sequence number // the position deletes will be applied to the data file, but the equality deletes will not DataFile unpartitionedFile = unpartitionedFile(unpartitioned.spec()); DeleteFile unpartitionedPosDeleteFile = unpartitionedPosDeletes(unpartitioned.spec()); - unpartitioned.newRowDelta() + unpartitioned + .newRowDelta() .addRows(unpartitionedFile) .addDeletes(unpartitionedPosDeleteFile) .addDeletes(unpartitionedEqDeletes(unpartitioned.spec())) .commit(); - Assert.assertEquals("Table should contain 2 delete files", - 2, (long) unpartitioned.currentSnapshot().deleteManifests(unpartitioned.io()).get(0).addedFilesCount()); + Assert.assertEquals( + "Table should contain 2 delete files", + 2, + (long) + unpartitioned + .currentSnapshot() + .deleteManifests(unpartitioned.io()) + .get(0) + .addedFilesCount()); List tasks = Lists.newArrayList(unpartitioned.newScan().planFiles().iterator()); Assert.assertEquals("Should have one task", 1, tasks.size()); FileScanTask task = tasks.get(0); - Assert.assertEquals("Should have the correct data file path", - unpartitionedFile.path(), task.file().path()); - Assert.assertEquals("Should have one associated delete file", - 1, task.deletes().size()); - Assert.assertEquals("Should have only pos delete file", - unpartitionedPosDeleteFile.path(), task.deletes().get(0).path()); + Assert.assertEquals( + "Should have the correct data file path", unpartitionedFile.path(), task.file().path()); + Assert.assertEquals("Should have one associated delete 
file", 1, task.deletes().size()); + Assert.assertEquals( + "Should have only pos delete file", + unpartitionedPosDeleteFile.path(), + task.deletes().get(0).path()); } @Test public void testPartitionedTableWithExistingDeleteFile() { - table.updateProperties() - .set(TableProperties.MANIFEST_MERGE_ENABLED, "false") - .commit(); + table.updateProperties().set(TableProperties.MANIFEST_MERGE_ENABLED, "false").commit(); - table.newAppend() - .appendFile(FILE_A) - .commit(); + table.newAppend().appendFile(FILE_A).commit(); - table.newRowDelta() - .addDeletes(FILE_A_EQ_1) - .commit(); + table.newRowDelta().addDeletes(FILE_A_EQ_1).commit(); - table.newRowDelta() - .addDeletes(FILE_A_POS_1) - .commit(); + table.newRowDelta().addDeletes(FILE_A_POS_1).commit(); - table.updateProperties() + table + .updateProperties() .set(TableProperties.MANIFEST_MIN_MERGE_COUNT, "1") .set(TableProperties.MANIFEST_MERGE_ENABLED, "true") .commit(); - Assert.assertEquals("Should have two delete manifests", - 2, table.currentSnapshot().deleteManifests(table.io()).size()); + Assert.assertEquals( + "Should have two delete manifests", + 2, + table.currentSnapshot().deleteManifests(table.io()).size()); // merge delete manifests - table.newAppend() - .appendFile(FILE_B) - .commit(); - - Assert.assertEquals("Should have one delete manifest", - 1, table.currentSnapshot().deleteManifests(table.io()).size()); - Assert.assertEquals("Should have zero added delete file", - 0, table.currentSnapshot().deleteManifests(table.io()).get(0).addedFilesCount().intValue()); - Assert.assertEquals("Should have zero deleted delete file", - 0, table.currentSnapshot().deleteManifests(table.io()).get(0).deletedFilesCount().intValue()); - Assert.assertEquals("Should have two existing delete files", - 2, table.currentSnapshot().deleteManifests(table.io()).get(0).existingFilesCount().intValue()); + table.newAppend().appendFile(FILE_B).commit(); + + Assert.assertEquals( + "Should have one delete manifest", + 1, + table.currentSnapshot().deleteManifests(table.io()).size()); + Assert.assertEquals( + "Should have zero added delete file", + 0, + table.currentSnapshot().deleteManifests(table.io()).get(0).addedFilesCount().intValue()); + Assert.assertEquals( + "Should have zero deleted delete file", + 0, + table.currentSnapshot().deleteManifests(table.io()).get(0).deletedFilesCount().intValue()); + Assert.assertEquals( + "Should have two existing delete files", + 2, + table.currentSnapshot().deleteManifests(table.io()).get(0).existingFilesCount().intValue()); List tasks = - Lists.newArrayList(table.newScan().filter(equal(bucket("data", BUCKETS_NUMBER), 0)) - .planFiles().iterator()); + Lists.newArrayList( + table + .newScan() + .filter(equal(bucket("data", BUCKETS_NUMBER), 0)) + .planFiles() + .iterator()); Assert.assertEquals("Should have one task", 1, tasks.size()); FileScanTask task = tasks.get(0); - Assert.assertEquals("Should have the correct data file path", - FILE_A.path(), task.file().path()); - Assert.assertEquals("Should have two associated delete files", - 2, task.deletes().size()); - Assert.assertEquals("Should have expected delete files", + Assert.assertEquals( + "Should have the correct data file path", FILE_A.path(), task.file().path()); + Assert.assertEquals("Should have two associated delete files", 2, task.deletes().size()); + Assert.assertEquals( + "Should have expected delete files", Sets.newHashSet(FILE_A_EQ_1.path(), FILE_A_POS_1.path()), Sets.newHashSet(Iterables.transform(task.deletes(), ContentFile::path))); } diff --git 
a/core/src/test/java/org/apache/iceberg/TestDeleteFiles.java b/core/src/test/java/org/apache/iceberg/TestDeleteFiles.java index cc0e84cb72dd..58d4352626a5 100644 --- a/core/src/test/java/org/apache/iceberg/TestDeleteFiles.java +++ b/core/src/test/java/org/apache/iceberg/TestDeleteFiles.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg; import java.nio.ByteBuffer; @@ -35,37 +34,43 @@ @RunWith(Parameterized.class) public class TestDeleteFiles extends TableTestBase { - private static final DataFile DATA_FILE_BUCKET_0_IDS_0_2 = DataFiles.builder(SPEC) - .withPath("/path/to/data-1.parquet") - .withFileSizeInBytes(10) - .withPartitionPath("data_bucket=0") - .withMetrics(new Metrics(5L, - null, // no column sizes - ImmutableMap.of(1, 5L, 2, 5L), // value count - ImmutableMap.of(1, 0L, 2, 0L), // null count - null, // no nan value counts - ImmutableMap.of(1, longToBuffer(0L)), // lower bounds - ImmutableMap.of(1, longToBuffer(2L)) // upper bounds - )) - .build(); - - private static final DataFile DATA_FILE_BUCKET_0_IDS_8_10 = DataFiles.builder(SPEC) - .withPath("/path/to/data-2.parquet") - .withFileSizeInBytes(10) - .withPartitionPath("data_bucket=0") - .withMetrics(new Metrics(5L, - null, // no column sizes - ImmutableMap.of(1, 5L, 2, 5L), // value count - ImmutableMap.of(1, 0L, 2, 0L), // null count - null, // no nan value counts - ImmutableMap.of(1, longToBuffer(8L)), // lower bounds - ImmutableMap.of(1, longToBuffer(10L)) // upper bounds - )) - .build(); + private static final DataFile DATA_FILE_BUCKET_0_IDS_0_2 = + DataFiles.builder(SPEC) + .withPath("/path/to/data-1.parquet") + .withFileSizeInBytes(10) + .withPartitionPath("data_bucket=0") + .withMetrics( + new Metrics( + 5L, + null, // no column sizes + ImmutableMap.of(1, 5L, 2, 5L), // value count + ImmutableMap.of(1, 0L, 2, 0L), // null count + null, // no nan value counts + ImmutableMap.of(1, longToBuffer(0L)), // lower bounds + ImmutableMap.of(1, longToBuffer(2L)) // upper bounds + )) + .build(); + + private static final DataFile DATA_FILE_BUCKET_0_IDS_8_10 = + DataFiles.builder(SPEC) + .withPath("/path/to/data-2.parquet") + .withFileSizeInBytes(10) + .withPartitionPath("data_bucket=0") + .withMetrics( + new Metrics( + 5L, + null, // no column sizes + ImmutableMap.of(1, 5L, 2, 5L), // value count + ImmutableMap.of(1, 0L, 2, 0L), // null count + null, // no nan value counts + ImmutableMap.of(1, longToBuffer(8L)), // lower bounds + ImmutableMap.of(1, longToBuffer(10L)) // upper bounds + )) + .build(); @Parameterized.Parameters(name = "formatVersion = {0}") public static Object[] parameters() { - return new Object[] { 1, 2 }; + return new Object[] {1, 2}; } public TestDeleteFiles(int formatVersion) { @@ -74,36 +79,30 @@ public TestDeleteFiles(int formatVersion) { @Test public void testMultipleDeletes() { - table.newAppend() - .appendFile(FILE_A) - .appendFile(FILE_B) - .appendFile(FILE_C) - .commit(); + table.newAppend().appendFile(FILE_A).appendFile(FILE_B).appendFile(FILE_C).commit(); Assert.assertEquals("Metadata should be at version 1", 1L, (long) version()); Snapshot append = readMetadata().currentSnapshot(); validateSnapshot(null, append, FILE_A, FILE_B, FILE_C); - table.newDelete() - .deleteFile(FILE_A) - .commit(); + table.newDelete().deleteFile(FILE_A).commit(); Assert.assertEquals("Metadata should be at version 2", 2L, (long) version()); Snapshot delete = readMetadata().currentSnapshot(); Assert.assertEquals("Should have 1 manifest", 1, 
delete.allManifests(FILE_IO).size()); - validateManifestEntries(delete.allManifests(table.io()).get(0), + validateManifestEntries( + delete.allManifests(table.io()).get(0), ids(delete.snapshotId(), append.snapshotId(), append.snapshotId()), files(FILE_A, FILE_B, FILE_C), statuses(Status.DELETED, Status.EXISTING, Status.EXISTING)); - table.newDelete() - .deleteFile(FILE_B) - .commit(); + table.newDelete().deleteFile(FILE_B).commit(); Assert.assertEquals("Metadata should be at version 3", 3L, (long) version()); Snapshot delete2 = readMetadata().currentSnapshot(); Assert.assertEquals("Should have 1 manifest", 1, delete2.allManifests(FILE_IO).size()); - validateManifestEntries(delete2.allManifests(FILE_IO).get(0), + validateManifestEntries( + delete2.allManifests(FILE_IO).get(0), ids(delete2.snapshotId(), append.snapshotId()), files(FILE_B, FILE_C), statuses(Status.DELETED, Status.EXISTING)); @@ -113,68 +112,70 @@ public void testMultipleDeletes() { public void testAlreadyDeletedFilesAreIgnoredDuringDeletesByRowFilter() { PartitionSpec spec = table.spec(); - DataFile firstDataFile = DataFiles.builder(spec) - .withPath("/path/to/data-2.parquet") - .withFileSizeInBytes(10) - .withPartitionPath("data_bucket=0") - .withMetrics(new Metrics(5L, - null, // no column sizes - ImmutableMap.of(1, 5L, 2, 5L), // value count - ImmutableMap.of(1, 0L, 2, 0L), // null count - null, // no nan value counts - ImmutableMap.of(1, longToBuffer(0L)), // lower bounds - ImmutableMap.of(1, longToBuffer(10L)) // upper bounds - )) - .build(); - - DataFile secondDataFile = DataFiles.builder(spec) - .withPath("/path/to/data-1.parquet") - .withFileSizeInBytes(10) - .withPartitionPath("data_bucket=0") - .withMetrics(new Metrics(5L, - null, // no column sizes - ImmutableMap.of(1, 5L, 2, 5L), // value count - ImmutableMap.of(1, 0L, 2, 0L), // null count - null, // no nan value counts - ImmutableMap.of(1, longToBuffer(0L)), // lower bounds - ImmutableMap.of(1, longToBuffer(4L)) // upper bounds - )) - .build(); + DataFile firstDataFile = + DataFiles.builder(spec) + .withPath("/path/to/data-2.parquet") + .withFileSizeInBytes(10) + .withPartitionPath("data_bucket=0") + .withMetrics( + new Metrics( + 5L, + null, // no column sizes + ImmutableMap.of(1, 5L, 2, 5L), // value count + ImmutableMap.of(1, 0L, 2, 0L), // null count + null, // no nan value counts + ImmutableMap.of(1, longToBuffer(0L)), // lower bounds + ImmutableMap.of(1, longToBuffer(10L)) // upper bounds + )) + .build(); + + DataFile secondDataFile = + DataFiles.builder(spec) + .withPath("/path/to/data-1.parquet") + .withFileSizeInBytes(10) + .withPartitionPath("data_bucket=0") + .withMetrics( + new Metrics( + 5L, + null, // no column sizes + ImmutableMap.of(1, 5L, 2, 5L), // value count + ImmutableMap.of(1, 0L, 2, 0L), // null count + null, // no nan value counts + ImmutableMap.of(1, longToBuffer(0L)), // lower bounds + ImmutableMap.of(1, longToBuffer(4L)) // upper bounds + )) + .build(); // add both data files - table.newFastAppend() - .appendFile(firstDataFile) - .appendFile(secondDataFile) - .commit(); + table.newFastAppend().appendFile(firstDataFile).appendFile(secondDataFile).commit(); Snapshot initialSnapshot = table.currentSnapshot(); Assert.assertEquals("Should have 1 manifest", 1, initialSnapshot.allManifests(FILE_IO).size()); - validateManifestEntries(initialSnapshot.allManifests(FILE_IO).get(0), + validateManifestEntries( + initialSnapshot.allManifests(FILE_IO).get(0), ids(initialSnapshot.snapshotId(), initialSnapshot.snapshotId()), files(firstDataFile, 
secondDataFile), statuses(Status.ADDED, Status.ADDED)); // delete the first data file - table.newDelete() - .deleteFile(firstDataFile) - .commit(); + table.newDelete().deleteFile(firstDataFile).commit(); Snapshot deleteSnapshot = table.currentSnapshot(); Assert.assertEquals("Should have 1 manifest", 1, deleteSnapshot.allManifests(FILE_IO).size()); - validateManifestEntries(deleteSnapshot.allManifests(FILE_IO).get(0), + validateManifestEntries( + deleteSnapshot.allManifests(FILE_IO).get(0), ids(deleteSnapshot.snapshotId(), initialSnapshot.snapshotId()), files(firstDataFile, secondDataFile), statuses(Status.DELETED, Status.EXISTING)); // delete the second data file using a row filter // the commit should succeed as there is only one live data file - table.newDelete() - .deleteFromRowFilter(Expressions.lessThan("id", 7)) - .commit(); + table.newDelete().deleteFromRowFilter(Expressions.lessThan("id", 7)).commit(); Snapshot finalSnapshot = table.currentSnapshot(); Assert.assertEquals("Should have 1 manifest", 1, finalSnapshot.allManifests(FILE_IO).size()); - validateManifestEntries(finalSnapshot.allManifests(FILE_IO).get(0), + validateManifestEntries( + finalSnapshot.allManifests(FILE_IO).get(0), ids(finalSnapshot.snapshotId()), files(secondDataFile), statuses(Status.DELETED)); @@ -183,26 +184,27 @@ public void testAlreadyDeletedFilesAreIgnoredDuringDeletesByRowFilter() { @Test public void testDeleteSomeFilesByRowFilterWithoutPartitionPredicates() { // add both data files - table.newFastAppend() + table + .newFastAppend() .appendFile(DATA_FILE_BUCKET_0_IDS_0_2) .appendFile(DATA_FILE_BUCKET_0_IDS_8_10) .commit(); Snapshot initialSnapshot = table.currentSnapshot(); Assert.assertEquals("Should have 1 manifest", 1, initialSnapshot.allManifests(FILE_IO).size()); - validateManifestEntries(initialSnapshot.allManifests(FILE_IO).get(0), + validateManifestEntries( + initialSnapshot.allManifests(FILE_IO).get(0), ids(initialSnapshot.snapshotId(), initialSnapshot.snapshotId()), files(DATA_FILE_BUCKET_0_IDS_0_2, DATA_FILE_BUCKET_0_IDS_8_10), statuses(Status.ADDED, Status.ADDED)); // delete the second one using a metrics filter (no partition filter) - table.newDelete() - .deleteFromRowFilter(Expressions.greaterThan("id", 5)) - .commit(); + table.newDelete().deleteFromRowFilter(Expressions.greaterThan("id", 5)).commit(); Snapshot deleteSnapshot = table.currentSnapshot(); Assert.assertEquals("Should have 1 manifest", 1, deleteSnapshot.allManifests(FILE_IO).size()); - validateManifestEntries(deleteSnapshot.allManifests(FILE_IO).get(0), + validateManifestEntries( + deleteSnapshot.allManifests(FILE_IO).get(0), ids(initialSnapshot.snapshotId(), deleteSnapshot.snapshotId()), files(DATA_FILE_BUCKET_0_IDS_0_2, DATA_FILE_BUCKET_0_IDS_8_10), statuses(Status.EXISTING, Status.DELETED)); @@ -211,14 +213,16 @@ public void testDeleteSomeFilesByRowFilterWithoutPartitionPredicates() { @Test public void testDeleteSomeFilesByRowFilterWithCombinedPredicates() { // add both data files - table.newFastAppend() + table + .newFastAppend() .appendFile(DATA_FILE_BUCKET_0_IDS_0_2) .appendFile(DATA_FILE_BUCKET_0_IDS_8_10) .commit(); Snapshot initialSnapshot = table.currentSnapshot(); Assert.assertEquals("Should have 1 manifest", 1, initialSnapshot.allManifests(FILE_IO).size()); - validateManifestEntries(initialSnapshot.allManifests(FILE_IO).get(0), + validateManifestEntries( + initialSnapshot.allManifests(FILE_IO).get(0), ids(initialSnapshot.snapshotId(), initialSnapshot.snapshotId()), files(DATA_FILE_BUCKET_0_IDS_0_2, 
DATA_FILE_BUCKET_0_IDS_8_10), statuses(Status.ADDED, Status.ADDED)); @@ -227,13 +231,12 @@ public void testDeleteSomeFilesByRowFilterWithCombinedPredicates() { Expression partPredicate = Expressions.equal(Expressions.bucket("data", 16), 0); Expression rowPredicate = Expressions.greaterThan("id", 5); Expression predicate = Expressions.and(partPredicate, rowPredicate); - table.newDelete() - .deleteFromRowFilter(predicate) - .commit(); + table.newDelete().deleteFromRowFilter(predicate).commit(); Snapshot deleteSnapshot = table.currentSnapshot(); Assert.assertEquals("Should have 1 manifest", 1, deleteSnapshot.allManifests(FILE_IO).size()); - validateManifestEntries(deleteSnapshot.allManifests(FILE_IO).get(0), + validateManifestEntries( + deleteSnapshot.allManifests(FILE_IO).get(0), ids(initialSnapshot.snapshotId(), deleteSnapshot.snapshotId()), files(DATA_FILE_BUCKET_0_IDS_0_2, DATA_FILE_BUCKET_0_IDS_8_10), statuses(Status.EXISTING, Status.DELETED)); @@ -243,60 +246,55 @@ public void testDeleteSomeFilesByRowFilterWithCombinedPredicates() { public void testCannotDeleteFileWhereNotAllRowsMatchPartitionFilter() { Assume.assumeTrue(formatVersion == 2); - table.updateSpec() + table + .updateSpec() .removeField(Expressions.bucket("data", 16)) .addField(Expressions.truncate("data", 2)) .commit(); PartitionSpec spec = table.spec(); - DataFile dataFile = DataFiles.builder(spec) - .withPath("/path/to/data-1.parquet") - .withRecordCount(10) - .withFileSizeInBytes(10) - .withPartitionPath("data_trunc_2=aa") - .build(); - - table.newFastAppend() - .appendFile(dataFile) - .commit(); - - AssertHelpers.assertThrows("Should reject as not all rows match filter", - ValidationException.class, "Cannot delete file where some, but not all, rows match filter", - () -> table.newDelete() - .deleteFromRowFilter(Expressions.equal("data", "aa")) - .commit()); + DataFile dataFile = + DataFiles.builder(spec) + .withPath("/path/to/data-1.parquet") + .withRecordCount(10) + .withFileSizeInBytes(10) + .withPartitionPath("data_trunc_2=aa") + .build(); + + table.newFastAppend().appendFile(dataFile).commit(); + + AssertHelpers.assertThrows( + "Should reject as not all rows match filter", + ValidationException.class, + "Cannot delete file where some, but not all, rows match filter", + () -> table.newDelete().deleteFromRowFilter(Expressions.equal("data", "aa")).commit()); } @Test public void testDeleteCaseSensitivity() { - table.newFastAppend() - .appendFile(DATA_FILE_BUCKET_0_IDS_0_2) - .commit(); + table.newFastAppend().appendFile(DATA_FILE_BUCKET_0_IDS_0_2).commit(); Expression rowFilter = Expressions.lessThan("iD", 5); - AssertHelpers.assertThrows("Should use case sensitive binding by default", - ValidationException.class, "Cannot find field 'iD'", - () -> table.newDelete() - .deleteFromRowFilter(rowFilter) - .commit()); - - AssertHelpers.assertThrows("Should fail with case sensitive binding", - ValidationException.class, "Cannot find field 'iD'", - () -> table.newDelete() - .deleteFromRowFilter(rowFilter) - .caseSensitive(true) - .commit()); - - table.newDelete() - .deleteFromRowFilter(rowFilter) - .caseSensitive(false) - .commit(); + AssertHelpers.assertThrows( + "Should use case sensitive binding by default", + ValidationException.class, + "Cannot find field 'iD'", + () -> table.newDelete().deleteFromRowFilter(rowFilter).commit()); + + AssertHelpers.assertThrows( + "Should fail with case sensitive binding", + ValidationException.class, + "Cannot find field 'iD'", + () -> 
table.newDelete().deleteFromRowFilter(rowFilter).caseSensitive(true).commit()); + + table.newDelete().deleteFromRowFilter(rowFilter).caseSensitive(false).commit(); Snapshot deleteSnapshot = table.currentSnapshot(); Assert.assertEquals("Should have 1 manifest", 1, deleteSnapshot.allManifests(FILE_IO).size()); - validateManifestEntries(deleteSnapshot.allManifests(FILE_IO).get(0), + validateManifestEntries( + deleteSnapshot.allManifests(FILE_IO).get(0), ids(deleteSnapshot.snapshotId()), files(DATA_FILE_BUCKET_0_IDS_0_2), statuses(Status.DELETED)); diff --git a/core/src/test/java/org/apache/iceberg/TestEntriesMetadataTable.java b/core/src/test/java/org/apache/iceberg/TestEntriesMetadataTable.java index 0db8e6c6bc78..b976ce72e2e1 100644 --- a/core/src/test/java/org/apache/iceberg/TestEntriesMetadataTable.java +++ b/core/src/test/java/org/apache/iceberg/TestEntriesMetadataTable.java @@ -16,9 +16,10 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg; +import static org.junit.Assert.assertEquals; + import java.util.List; import org.apache.iceberg.relocated.com.google.common.collect.ImmutableList; import org.apache.iceberg.relocated.com.google.common.collect.Iterables; @@ -28,13 +29,11 @@ import org.junit.runner.RunWith; import org.junit.runners.Parameterized; -import static org.junit.Assert.assertEquals; - @RunWith(Parameterized.class) public class TestEntriesMetadataTable extends TableTestBase { @Parameterized.Parameters(name = "formatVersion = {0}") public static Object[] parameters() { - return new Object[] { 1, 2 }; + return new Object[] {1, 2}; } public TestEntriesMetadataTable(int formatVersion) { @@ -43,56 +42,49 @@ public TestEntriesMetadataTable(int formatVersion) { @Test public void testEntriesTable() { - table.newAppend() - .appendFile(FILE_A) - .appendFile(FILE_B) - .commit(); + table.newAppend().appendFile(FILE_A).appendFile(FILE_B).commit(); Table entriesTable = new ManifestEntriesTable(table.ops(), table); Schema expectedSchema = ManifestEntry.getSchema(table.spec().partitionType()); - assertEquals("A tableScan.select() should prune the schema", + assertEquals( + "A tableScan.select() should prune the schema", expectedSchema.asStruct(), entriesTable.schema().asStruct()); } @Test public void testEntriesTableScan() { - table.newAppend() - .appendFile(FILE_A) - .appendFile(FILE_B) - .commit(); + table.newAppend().appendFile(FILE_A).appendFile(FILE_B).commit(); Table entriesTable = new ManifestEntriesTable(table.ops(), table); TableScan scan = entriesTable.newScan(); Schema expectedSchema = ManifestEntry.getSchema(table.spec().partitionType()); - assertEquals("A tableScan.select() should prune the schema", + assertEquals( + "A tableScan.select() should prune the schema", expectedSchema.asStruct(), scan.schema().asStruct()); FileScanTask file = Iterables.getOnlyElement(scan.planFiles()); - Assert.assertEquals("Data file should be the table's manifest", - Iterables.getOnlyElement(table.currentSnapshot().allManifests(table.io())).path(), file.file().path()); + Assert.assertEquals( + "Data file should be the table's manifest", + Iterables.getOnlyElement(table.currentSnapshot().allManifests(table.io())).path(), + file.file().path()); Assert.assertEquals("Should contain 2 data file records", 2, file.file().recordCount()); } @Test public void testSplitPlanningWithMetadataSplitSizeProperty() { - table.newAppend() - .appendFile(FILE_A) - .appendFile(FILE_B) - .commit(); + 
table.newAppend().appendFile(FILE_A).appendFile(FILE_B).commit(); - table.newAppend() - .appendFile(FILE_C) - .appendFile(FILE_D) - .commit(); + table.newAppend().appendFile(FILE_C).appendFile(FILE_D).commit(); // set the split size to a large value so that both manifests are in 1 split - table.updateProperties() + table + .updateProperties() .set(TableProperties.METADATA_SPLIT_SIZE, String.valueOf(128 * 1024 * 1024)) .commit(); @@ -101,33 +93,34 @@ public void testSplitPlanningWithMetadataSplitSizeProperty() { Assert.assertEquals(1, Iterables.size(entriesTable.newScan().planTasks())); // set the split size to a small value so that manifests end up in different splits - table.updateProperties() - .set(TableProperties.METADATA_SPLIT_SIZE, String.valueOf(1)) - .commit(); + table.updateProperties().set(TableProperties.METADATA_SPLIT_SIZE, String.valueOf(1)).commit(); Assert.assertEquals(2, Iterables.size(entriesTable.newScan().planTasks())); // override the table property with a large value so that both manifests are in 1 split - TableScan scan = entriesTable.newScan() - .option(TableProperties.SPLIT_SIZE, String.valueOf(128 * 1024 * 1024)); + TableScan scan = + entriesTable + .newScan() + .option(TableProperties.SPLIT_SIZE, String.valueOf(128 * 1024 * 1024)); Assert.assertEquals(1, Iterables.size(scan.planTasks())); } @Test public void testSplitPlanningWithDefaultMetadataSplitSize() { - table.newAppend() - .appendFile(FILE_A) - .appendFile(FILE_B) - .commit(); + table.newAppend().appendFile(FILE_A).appendFile(FILE_B).commit(); - int splitSize = (int) TableProperties.METADATA_SPLIT_SIZE_DEFAULT; // default split size is 32 MB + int splitSize = + (int) TableProperties.METADATA_SPLIT_SIZE_DEFAULT; // default split size is 32 MB Table entriesTable = new ManifestEntriesTable(table.ops(), table); Assert.assertEquals(1, entriesTable.currentSnapshot().allManifests(table.io()).size()); int expectedSplits = - ((int) entriesTable.currentSnapshot().allManifests(table.io()).get(0).length() + splitSize - 1) / splitSize; + ((int) entriesTable.currentSnapshot().allManifests(table.io()).get(0).length() + + splitSize + - 1) + / splitSize; TableScan scan = entriesTable.newScan(); @@ -137,32 +130,31 @@ public void testSplitPlanningWithDefaultMetadataSplitSize() { @Test public void testEntriesTableWithDeleteManifests() throws Exception { Assume.assumeTrue("Only V2 Tables Support Deletes", formatVersion >= 2); - table.newAppend() - .appendFile(FILE_A) - .appendFile(FILE_B) - .commit(); + table.newAppend().appendFile(FILE_A).appendFile(FILE_B).commit(); - table.newRowDelta() - .addDeletes(FILE_A_DELETES) - .commit(); + table.newRowDelta().addDeletes(FILE_A_DELETES).commit(); Table entriesTable = new ManifestEntriesTable(table.ops(), table); TableScan scan = entriesTable.newScan(); Schema expectedSchema = ManifestEntry.getSchema(table.spec().partitionType()); - assertEquals("A tableScan.select() should prune the schema", + assertEquals( + "A tableScan.select() should prune the schema", expectedSchema.asStruct(), scan.schema().asStruct()); List files = ImmutableList.copyOf(scan.planFiles()); - Assert.assertEquals("Data file should be the table's manifest", - Iterables.getOnlyElement(table.currentSnapshot().dataManifests(table.io())).path(), files.get(0).file().path()); + Assert.assertEquals( + "Data file should be the table's manifest", + Iterables.getOnlyElement(table.currentSnapshot().dataManifests(table.io())).path(), + files.get(0).file().path()); Assert.assertEquals("Should contain 2 data file records", 2, 
files.get(0).file().recordCount()); - Assert.assertEquals("Delete file should be in the table manifest", + Assert.assertEquals( + "Delete file should be in the table manifest", Iterables.getOnlyElement(table.currentSnapshot().deleteManifests(table.io())).path(), files.get(1).file().path()); - Assert.assertEquals("Should contain 1 delete file record", 1, files.get(1).file().recordCount()); + Assert.assertEquals( + "Should contain 1 delete file record", 1, files.get(1).file().recordCount()); } - } diff --git a/core/src/test/java/org/apache/iceberg/TestFastAppend.java b/core/src/test/java/org/apache/iceberg/TestFastAppend.java index e0ffacd73778..c04a20b98bd5 100644 --- a/core/src/test/java/org/apache/iceberg/TestFastAppend.java +++ b/core/src/test/java/org/apache/iceberg/TestFastAppend.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg; import java.io.File; @@ -37,7 +36,7 @@ public class TestFastAppend extends TableTestBase { @Parameterized.Parameters(name = "formatVersion = {0}") public static Object[] parameters() { - return new Object[] { 1, 2 }; + return new Object[] {1, 2}; } public TestFastAppend(int formatVersion) { @@ -50,21 +49,21 @@ public void testEmptyTableAppend() { TableMetadata base = readMetadata(); Assert.assertNull("Should not have a current snapshot", base.currentSnapshot()); - Assert.assertEquals("Table should start with last-sequence-number 0", 0, base.lastSequenceNumber()); + Assert.assertEquals( + "Table should start with last-sequence-number 0", 0, base.lastSequenceNumber()); - table.newFastAppend() - .appendFile(FILE_A) - .appendFile(FILE_B) - .commit(); + table.newFastAppend().appendFile(FILE_A).appendFile(FILE_B).commit(); Snapshot snap = table.currentSnapshot(); validateSnapshot(base.currentSnapshot(), snap, 1, FILE_A, FILE_B); V2Assert.assertEquals("Snapshot sequence number should be 1", 1, snap.sequenceNumber()); - V2Assert.assertEquals("Last sequence number should be 1", 1, readMetadata().lastSequenceNumber()); + V2Assert.assertEquals( + "Last sequence number should be 1", 1, readMetadata().lastSequenceNumber()); - V1Assert.assertEquals("Table should end with last-sequence-number 0", 0, base.lastSequenceNumber()); + V1Assert.assertEquals( + "Table should end with last-sequence-number 0", 0, base.lastSequenceNumber()); } @Test @@ -73,25 +72,28 @@ public void testEmptyTableAppendManifest() throws IOException { TableMetadata base = readMetadata(); Assert.assertNull("Should not have a current snapshot", base.currentSnapshot()); - Assert.assertEquals("Table should start with last-sequence-number 0", 0, base.lastSequenceNumber()); + Assert.assertEquals( + "Table should start with last-sequence-number 0", 0, base.lastSequenceNumber()); ManifestFile manifest = writeManifest(FILE_A, FILE_B); - table.newFastAppend() - .appendManifest(manifest) - .commit(); + table.newFastAppend().appendManifest(manifest).commit(); Snapshot snap = table.currentSnapshot(); validateSnapshot(base.currentSnapshot(), snap, 1, FILE_A, FILE_B); // validate that the metadata summary is correct when using appendManifest - Assert.assertEquals("Summary metadata should include 2 added files", - "2", snap.summary().get("added-data-files")); + Assert.assertEquals( + "Summary metadata should include 2 added files", + "2", + snap.summary().get("added-data-files")); V2Assert.assertEquals("Snapshot sequence number should be 1", 1, snap.sequenceNumber()); - V2Assert.assertEquals("Last sequence number should be 1", 1, 
readMetadata().lastSequenceNumber()); + V2Assert.assertEquals( + "Last sequence number should be 1", 1, readMetadata().lastSequenceNumber()); - V1Assert.assertEquals("Table should end with last-sequence-number 0", 0, base.lastSequenceNumber()); + V1Assert.assertEquals( + "Table should end with last-sequence-number 0", 0, base.lastSequenceNumber()); } @Test @@ -100,40 +102,38 @@ public void testEmptyTableAppendFilesAndManifest() throws IOException { TableMetadata base = readMetadata(); Assert.assertNull("Should not have a current snapshot", base.currentSnapshot()); - Assert.assertEquals("Table should start with last-sequence-number 0", 0, base.lastSequenceNumber()); + Assert.assertEquals( + "Table should start with last-sequence-number 0", 0, base.lastSequenceNumber()); ManifestFile manifest = writeManifest(FILE_A, FILE_B); - table.newFastAppend() - .appendFile(FILE_C) - .appendFile(FILE_D) - .appendManifest(manifest) - .commit(); + table.newFastAppend().appendFile(FILE_C).appendFile(FILE_D).appendManifest(manifest).commit(); Snapshot snap = table.currentSnapshot(); long commitId = snap.snapshotId(); - validateManifest(snap.allManifests(FILE_IO).get(0), + validateManifest( + snap.allManifests(FILE_IO).get(0), seqs(1, 1), ids(commitId, commitId), files(FILE_C, FILE_D)); - validateManifest(snap.allManifests(FILE_IO).get(1), + validateManifest( + snap.allManifests(FILE_IO).get(1), seqs(1, 1), ids(commitId, commitId), files(FILE_A, FILE_B)); V2Assert.assertEquals("Snapshot sequence number should be 1", 1, snap.sequenceNumber()); - V2Assert.assertEquals("Last sequence number should be 1", 1, readMetadata().lastSequenceNumber()); + V2Assert.assertEquals( + "Last sequence number should be 1", 1, readMetadata().lastSequenceNumber()); - V1Assert.assertEquals("Table should end with last-sequence-number 0", 0, base.lastSequenceNumber()); + V1Assert.assertEquals( + "Table should end with last-sequence-number 0", 0, base.lastSequenceNumber()); } @Test public void testNonEmptyTableAppend() { - table.newAppend() - .appendFile(FILE_A) - .appendFile(FILE_B) - .commit(); + table.newAppend().appendFile(FILE_A).appendFile(FILE_B).commit(); TableMetadata base = readMetadata(); Assert.assertNotNull("Should have a current snapshot", base.currentSnapshot()); @@ -141,25 +141,20 @@ public void testNonEmptyTableAppend() { Assert.assertEquals("Should have one existing manifest", 1, v2manifests.size()); // prepare a new append - Snapshot pending = table.newFastAppend() - .appendFile(FILE_C) - .appendFile(FILE_D) - .apply(); + Snapshot pending = table.newFastAppend().appendFile(FILE_C).appendFile(FILE_D).apply(); - Assert.assertNotEquals("Snapshots should have unique IDs", - base.currentSnapshot().snapshotId(), pending.snapshotId()); + Assert.assertNotEquals( + "Snapshots should have unique IDs", + base.currentSnapshot().snapshotId(), + pending.snapshotId()); validateSnapshot(base.currentSnapshot(), pending, FILE_C, FILE_D); } @Test public void testNoMerge() { - table.newAppend() - .appendFile(FILE_A) - .commit(); + table.newAppend().appendFile(FILE_A).commit(); - table.newFastAppend() - .appendFile(FILE_B) - .commit(); + table.newFastAppend().appendFile(FILE_B).commit(); TableMetadata base = readMetadata(); Assert.assertNotNull("Should have a current snapshot", base.currentSnapshot()); @@ -167,10 +162,7 @@ public void testNoMerge() { Assert.assertEquals("Should have 2 existing manifests", 2, v3manifests.size()); // prepare a new append - Snapshot pending = table.newFastAppend() - .appendFile(FILE_C) - 
.appendFile(FILE_D) - .apply(); + Snapshot pending = table.newFastAppend().appendFile(FILE_C).appendFile(FILE_D).apply(); Set ids = Sets.newHashSet(); for (Snapshot snapshot : base.snapshots()) { @@ -187,9 +179,7 @@ public void testRefreshBeforeApply() { // load a new copy of the table that will not be refreshed by the commit Table stale = load(); - table.newAppend() - .appendFile(FILE_A) - .commit(); + table.newAppend().appendFile(FILE_A).commit(); TableMetadata base = readMetadata(); Assert.assertNotNull("Should have a current snapshot", base.currentSnapshot()); @@ -197,8 +187,7 @@ public void testRefreshBeforeApply() { Assert.assertEquals("Should have 1 existing manifest", 1, v2manifests.size()); // commit from the stale table - AppendFiles append = stale.newFastAppend() - .appendFile(FILE_D); + AppendFiles append = stale.newFastAppend().appendFile(FILE_D); Snapshot pending = append.apply(); // table should have been refreshed before applying the changes @@ -208,15 +197,12 @@ public void testRefreshBeforeApply() { @Test public void testRefreshBeforeCommit() { // commit from the stale table - AppendFiles append = table.newFastAppend() - .appendFile(FILE_D); + AppendFiles append = table.newFastAppend().appendFile(FILE_D); Snapshot pending = append.apply(); validateSnapshot(null, pending, FILE_D); - table.newAppend() - .appendFile(FILE_A) - .commit(); + table.newAppend().appendFile(FILE_A).commit(); TableMetadata base = readMetadata(); Assert.assertNotNull("Should have a current snapshot", base.currentSnapshot()); @@ -230,10 +216,13 @@ public void testRefreshBeforeCommit() { // apply was called before the conflicting commit, but the commit was still consistent validateSnapshot(base.currentSnapshot(), committed.currentSnapshot(), FILE_D); - List committedManifests = Lists.newArrayList(committed.currentSnapshot().allManifests(FILE_IO)); + List committedManifests = + Lists.newArrayList(committed.currentSnapshot().allManifests(FILE_IO)); committedManifests.removeAll(base.currentSnapshot().allManifests(FILE_IO)); - Assert.assertEquals("Should reused manifest created by apply", - pending.allManifests(FILE_IO).get(0), committedManifests.get(0)); + Assert.assertEquals( + "Should reused manifest created by apply", + pending.allManifests(FILE_IO).get(0), + committedManifests.get(0)); } @Test @@ -247,8 +236,11 @@ public void testFailure() { ManifestFile newManifest = pending.allManifests(FILE_IO).get(0); Assert.assertTrue("Should create new manifest", new File(newManifest.path()).exists()); - AssertHelpers.assertThrows("Should retry 4 times and throw last failure", - CommitFailedException.class, "Injected failure", append::commit); + AssertHelpers.assertThrows( + "Should retry 4 times and throw last failure", + CommitFailedException.class, + "Injected failure", + append::commit); Assert.assertFalse("Should clean up new manifest", new File(newManifest.path()).exists()); } @@ -265,8 +257,11 @@ public void testAppendManifestCleanup() throws IOException { ManifestFile newManifest = pending.allManifests(FILE_IO).get(0); Assert.assertTrue("Should create new manifest", new File(newManifest.path()).exists()); - AssertHelpers.assertThrows("Should retry 4 times and throw last failure", - CommitFailedException.class, "Injected failure", append::commit); + AssertHelpers.assertThrows( + "Should retry 4 times and throw last failure", + CommitFailedException.class, + "Injected failure", + append::commit); Assert.assertFalse("Should clean up new manifest", new File(newManifest.path()).exists()); } @@ -290,7 +285,8 
@@ public void testRecoveryWithManifestList() { validateSnapshot(null, metadata.currentSnapshot(), FILE_B); Assert.assertTrue("Should commit same new manifest", new File(newManifest.path()).exists()); - Assert.assertTrue("Should commit the same new manifest", + Assert.assertTrue( + "Should commit the same new manifest", metadata.currentSnapshot().allManifests(FILE_IO).contains(newManifest)); } @@ -313,15 +309,14 @@ public void testRecoveryWithoutManifestList() { validateSnapshot(null, metadata.currentSnapshot(), FILE_B); Assert.assertTrue("Should commit same new manifest", new File(newManifest.path()).exists()); - Assert.assertTrue("Should commit the same new manifest", + Assert.assertTrue( + "Should commit the same new manifest", metadata.currentSnapshot().allManifests(FILE_IO).contains(newManifest)); } @Test public void testAppendManifestWithSnapshotIdInheritance() throws IOException { - table.updateProperties() - .set(TableProperties.SNAPSHOT_ID_INHERITANCE_ENABLED, "true") - .commit(); + table.updateProperties().set(TableProperties.SNAPSHOT_ID_INHERITANCE_ENABLED, "true").commit(); Assert.assertEquals("Table should start empty", 0, listManifestFiles().size()); @@ -329,44 +324,47 @@ public void testAppendManifestWithSnapshotIdInheritance() throws IOException { Assert.assertNull("Should not have a current snapshot", base.currentSnapshot()); ManifestFile manifest = writeManifest(FILE_A, FILE_B); - table.newFastAppend() - .appendManifest(manifest) - .commit(); + table.newFastAppend().appendManifest(manifest).commit(); Snapshot snapshot = table.currentSnapshot(); List manifests = table.currentSnapshot().allManifests(FILE_IO); Assert.assertEquals("Should have 1 committed manifest", 1, manifests.size()); - validateManifestEntries(manifests.get(0), + validateManifestEntries( + manifests.get(0), ids(snapshot.snapshotId(), snapshot.snapshotId()), files(FILE_A, FILE_B), statuses(Status.ADDED, Status.ADDED)); // validate that the metadata summary is correct when using appendManifest - Assert.assertEquals("Summary metadata should include 2 added files", - "2", snapshot.summary().get("added-data-files")); - Assert.assertEquals("Summary metadata should include 2 added records", - "2", snapshot.summary().get("added-records")); - Assert.assertEquals("Summary metadata should include 2 files in total", - "2", snapshot.summary().get("total-data-files")); - Assert.assertEquals("Summary metadata should include 2 records in total", - "2", snapshot.summary().get("total-records")); + Assert.assertEquals( + "Summary metadata should include 2 added files", + "2", + snapshot.summary().get("added-data-files")); + Assert.assertEquals( + "Summary metadata should include 2 added records", + "2", + snapshot.summary().get("added-records")); + Assert.assertEquals( + "Summary metadata should include 2 files in total", + "2", + snapshot.summary().get("total-data-files")); + Assert.assertEquals( + "Summary metadata should include 2 records in total", + "2", + snapshot.summary().get("total-records")); } @Test public void testAppendManifestFailureWithSnapshotIdInheritance() throws IOException { - table.updateProperties() - .set(TableProperties.SNAPSHOT_ID_INHERITANCE_ENABLED, "true") - .commit(); + table.updateProperties().set(TableProperties.SNAPSHOT_ID_INHERITANCE_ENABLED, "true").commit(); Assert.assertEquals("Table should start empty", 0, listManifestFiles().size()); TableMetadata base = readMetadata(); Assert.assertNull("Should not have a current snapshot", base.currentSnapshot()); - table.updateProperties() - 
.set(TableProperties.COMMIT_NUM_RETRIES, "1") - .commit(); + table.updateProperties().set(TableProperties.COMMIT_NUM_RETRIES, "1").commit(); table.ops().failCommits(5); @@ -375,9 +373,8 @@ public void testAppendManifestFailureWithSnapshotIdInheritance() throws IOExcept AppendFiles append = table.newAppend(); append.appendManifest(manifest); - AssertHelpers.assertThrows("Should reject commit", - CommitFailedException.class, "Injected failure", - append::commit); + AssertHelpers.assertThrows( + "Should reject commit", CommitFailedException.class, "Injected failure", append::commit); Assert.assertTrue("Append manifest should not be deleted", new File(manifest.path()).exists()); } @@ -389,93 +386,105 @@ public void testInvalidAppendManifest() throws IOException { TableMetadata base = readMetadata(); Assert.assertNull("Should not have a current snapshot", base.currentSnapshot()); - ManifestFile manifestWithExistingFiles = writeManifest( - "manifest-file-1.avro", - manifestEntry(Status.EXISTING, null, FILE_A)); - AssertHelpers.assertThrows("Should reject commit", - IllegalArgumentException.class, "Cannot append manifest with existing files", - () -> table.newFastAppend() - .appendManifest(manifestWithExistingFiles) - .commit()); - - ManifestFile manifestWithDeletedFiles = writeManifest( - "manifest-file-2.avro", - manifestEntry(Status.DELETED, null, FILE_A)); - AssertHelpers.assertThrows("Should reject commit", - IllegalArgumentException.class, "Cannot append manifest with deleted files", - () -> table.newFastAppend() - .appendManifest(manifestWithDeletedFiles) - .commit()); + ManifestFile manifestWithExistingFiles = + writeManifest("manifest-file-1.avro", manifestEntry(Status.EXISTING, null, FILE_A)); + AssertHelpers.assertThrows( + "Should reject commit", + IllegalArgumentException.class, + "Cannot append manifest with existing files", + () -> table.newFastAppend().appendManifest(manifestWithExistingFiles).commit()); + + ManifestFile manifestWithDeletedFiles = + writeManifest("manifest-file-2.avro", manifestEntry(Status.DELETED, null, FILE_A)); + AssertHelpers.assertThrows( + "Should reject commit", + IllegalArgumentException.class, + "Cannot append manifest with deleted files", + () -> table.newFastAppend().appendManifest(manifestWithDeletedFiles).commit()); } @Test public void testDefaultPartitionSummaries() { - table.newFastAppend() - .appendFile(FILE_A) - .commit(); - - Set partitionSummaryKeys = table.currentSnapshot().summary().keySet().stream() - .filter(key -> key.startsWith(SnapshotSummary.CHANGED_PARTITION_PREFIX)) - .collect(Collectors.toSet()); - Assert.assertEquals("Should include no partition summaries by default", 0, partitionSummaryKeys.size()); - - String summariesIncluded = table.currentSnapshot().summary() - .getOrDefault(SnapshotSummary.PARTITION_SUMMARY_PROP, "false"); - Assert.assertEquals("Should not set partition-summaries-included to true", "false", summariesIncluded); - - String changedPartitions = table.currentSnapshot().summary().get(SnapshotSummary.CHANGED_PARTITION_COUNT_PROP); + table.newFastAppend().appendFile(FILE_A).commit(); + + Set partitionSummaryKeys = + table.currentSnapshot().summary().keySet().stream() + .filter(key -> key.startsWith(SnapshotSummary.CHANGED_PARTITION_PREFIX)) + .collect(Collectors.toSet()); + Assert.assertEquals( + "Should include no partition summaries by default", 0, partitionSummaryKeys.size()); + + String summariesIncluded = + table + .currentSnapshot() + .summary() + .getOrDefault(SnapshotSummary.PARTITION_SUMMARY_PROP, "false"); + 
Assert.assertEquals( + "Should not set partition-summaries-included to true", "false", summariesIncluded); + + String changedPartitions = + table.currentSnapshot().summary().get(SnapshotSummary.CHANGED_PARTITION_COUNT_PROP); Assert.assertEquals("Should set changed partition count", "1", changedPartitions); } @Test public void testIncludedPartitionSummaries() { - table.updateProperties() - .set(TableProperties.WRITE_PARTITION_SUMMARY_LIMIT, "1") - .commit(); + table.updateProperties().set(TableProperties.WRITE_PARTITION_SUMMARY_LIMIT, "1").commit(); - table.newFastAppend() - .appendFile(FILE_A) - .commit(); + table.newFastAppend().appendFile(FILE_A).commit(); - Set partitionSummaryKeys = table.currentSnapshot().summary().keySet().stream() - .filter(key -> key.startsWith(SnapshotSummary.CHANGED_PARTITION_PREFIX)) - .collect(Collectors.toSet()); + Set partitionSummaryKeys = + table.currentSnapshot().summary().keySet().stream() + .filter(key -> key.startsWith(SnapshotSummary.CHANGED_PARTITION_PREFIX)) + .collect(Collectors.toSet()); Assert.assertEquals("Should include a partition summary", 1, partitionSummaryKeys.size()); - String summariesIncluded = table.currentSnapshot().summary() - .getOrDefault(SnapshotSummary.PARTITION_SUMMARY_PROP, "false"); - Assert.assertEquals("Should set partition-summaries-included to true", "true", summariesIncluded); + String summariesIncluded = + table + .currentSnapshot() + .summary() + .getOrDefault(SnapshotSummary.PARTITION_SUMMARY_PROP, "false"); + Assert.assertEquals( + "Should set partition-summaries-included to true", "true", summariesIncluded); - String changedPartitions = table.currentSnapshot().summary().get(SnapshotSummary.CHANGED_PARTITION_COUNT_PROP); + String changedPartitions = + table.currentSnapshot().summary().get(SnapshotSummary.CHANGED_PARTITION_COUNT_PROP); Assert.assertEquals("Should set changed partition count", "1", changedPartitions); - String partitionSummary = table.currentSnapshot().summary() - .get(SnapshotSummary.CHANGED_PARTITION_PREFIX + "data_bucket=0"); - Assert.assertEquals("Summary should include 1 file with 1 record that is 10 bytes", - "added-data-files=1,added-records=1,added-files-size=10", partitionSummary); + String partitionSummary = + table + .currentSnapshot() + .summary() + .get(SnapshotSummary.CHANGED_PARTITION_PREFIX + "data_bucket=0"); + Assert.assertEquals( + "Summary should include 1 file with 1 record that is 10 bytes", + "added-data-files=1,added-records=1,added-files-size=10", + partitionSummary); } @Test public void testIncludedPartitionSummaryLimit() { - table.updateProperties() - .set(TableProperties.WRITE_PARTITION_SUMMARY_LIMIT, "1") - .commit(); - - table.newFastAppend() - .appendFile(FILE_A) - .appendFile(FILE_B) - .commit(); - - Set partitionSummaryKeys = table.currentSnapshot().summary().keySet().stream() - .filter(key -> key.startsWith(SnapshotSummary.CHANGED_PARTITION_PREFIX)) - .collect(Collectors.toSet()); - Assert.assertEquals("Should include no partition summaries, over limit", 0, partitionSummaryKeys.size()); - - String summariesIncluded = table.currentSnapshot().summary() - .getOrDefault(SnapshotSummary.PARTITION_SUMMARY_PROP, "false"); - Assert.assertEquals("Should not set partition-summaries-included to true", "false", summariesIncluded); - - String changedPartitions = table.currentSnapshot().summary().get(SnapshotSummary.CHANGED_PARTITION_COUNT_PROP); + table.updateProperties().set(TableProperties.WRITE_PARTITION_SUMMARY_LIMIT, "1").commit(); + + 
table.newFastAppend().appendFile(FILE_A).appendFile(FILE_B).commit(); + + Set partitionSummaryKeys = + table.currentSnapshot().summary().keySet().stream() + .filter(key -> key.startsWith(SnapshotSummary.CHANGED_PARTITION_PREFIX)) + .collect(Collectors.toSet()); + Assert.assertEquals( + "Should include no partition summaries, over limit", 0, partitionSummaryKeys.size()); + + String summariesIncluded = + table + .currentSnapshot() + .summary() + .getOrDefault(SnapshotSummary.PARTITION_SUMMARY_PROP, "false"); + Assert.assertEquals( + "Should not set partition-summaries-included to true", "false", summariesIncluded); + + String changedPartitions = + table.currentSnapshot().summary().get(SnapshotSummary.CHANGED_PARTITION_COUNT_PROP); Assert.assertEquals("Should set changed partition count", "2", changedPartitions); } } diff --git a/core/src/test/java/org/apache/iceberg/TestFilterFiles.java b/core/src/test/java/org/apache/iceberg/TestFilterFiles.java index ddda5a146942..34b18e18b838 100644 --- a/core/src/test/java/org/apache/iceberg/TestFilterFiles.java +++ b/core/src/test/java/org/apache/iceberg/TestFilterFiles.java @@ -16,9 +16,11 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg; +import static org.apache.iceberg.types.Types.NestedField.required; +import static org.junit.Assert.assertEquals; + import java.io.File; import java.io.IOException; import java.nio.ByteBuffer; @@ -36,14 +38,11 @@ import org.junit.runner.RunWith; import org.junit.runners.Parameterized; -import static org.apache.iceberg.types.Types.NestedField.required; -import static org.junit.Assert.assertEquals; - @RunWith(Parameterized.class) public class TestFilterFiles { @Parameterized.Parameters(name = "formatVersion = {0}") public static Object[] parameters() { - return new Object[] { 1, 2 }; + return new Object[] {1, 2}; } public final int formatVersion; @@ -52,11 +51,10 @@ public TestFilterFiles(int formatVersion) { this.formatVersion = formatVersion; } - @Rule - public TemporaryFolder temp = new TemporaryFolder(); - private final Schema schema = new Schema( - required(1, "id", Types.IntegerType.get()), - required(2, "data", Types.StringType.get())); + @Rule public TemporaryFolder temp = new TemporaryFolder(); + private final Schema schema = + new Schema( + required(1, "id", Types.IntegerType.get()), required(2, "data", Types.StringType.get())); private File tableDir = null; @Before @@ -103,14 +101,22 @@ private void testFilterFiles(Table table) { lowerBounds.put(1, Conversions.toByteBuffer(Types.IntegerType.get(), 1)); upperBounds.put(1, Conversions.toByteBuffer(Types.IntegerType.get(), 2)); - Metrics metrics = new Metrics(2L, Maps.newHashMap(), Maps.newHashMap(), - Maps.newHashMap(), null, lowerBounds, upperBounds); - - DataFile file = DataFiles.builder(table.spec()) - .withPath("/path/to/file.parquet") - .withFileSizeInBytes(0) - .withMetrics(metrics) - .build(); + Metrics metrics = + new Metrics( + 2L, + Maps.newHashMap(), + Maps.newHashMap(), + Maps.newHashMap(), + null, + lowerBounds, + upperBounds); + + DataFile file = + DataFiles.builder(table.spec()) + .withPath("/path/to/file.parquet") + .withFileSizeInBytes(0) + .withMetrics(metrics) + .build(); table.newAppend().appendFile(file).commit(); @@ -129,14 +135,22 @@ private void testCaseInsensitiveFilterFiles(Table table) { lowerBounds.put(1, Conversions.toByteBuffer(Types.IntegerType.get(), 1)); upperBounds.put(1, Conversions.toByteBuffer(Types.IntegerType.get(), 2)); - Metrics metrics = new Metrics(2L, 
Maps.newHashMap(), Maps.newHashMap(), - Maps.newHashMap(), null, lowerBounds, upperBounds); - - DataFile file = DataFiles.builder(table.spec()) - .withPath("/path/to/file.parquet") - .withFileSizeInBytes(0) - .withMetrics(metrics) - .build(); + Metrics metrics = + new Metrics( + 2L, + Maps.newHashMap(), + Maps.newHashMap(), + Maps.newHashMap(), + null, + lowerBounds, + upperBounds); + + DataFile file = + DataFiles.builder(table.spec()) + .withPath("/path/to/file.parquet") + .withFileSizeInBytes(0) + .withMetrics(metrics) + .build(); table.newAppend().appendFile(file).commit(); @@ -145,7 +159,8 @@ private void testCaseInsensitiveFilterFiles(Table table) { TableScan emptyScan = table.newScan().caseSensitive(false).filter(Expressions.equal("ID", 5)); assertEquals(0, Iterables.size(emptyScan.planFiles())); - TableScan nonEmptyScan = table.newScan().caseSensitive(false).filter(Expressions.equal("ID", 1)); + TableScan nonEmptyScan = + table.newScan().caseSensitive(false).filter(Expressions.equal("ID", 1)); assertEquals(1, Iterables.size(nonEmptyScan.planFiles())); } } diff --git a/core/src/test/java/org/apache/iceberg/TestFindFiles.java b/core/src/test/java/org/apache/iceberg/TestFindFiles.java index eb45c8979d03..2f5080ec2a56 100644 --- a/core/src/test/java/org/apache/iceberg/TestFindFiles.java +++ b/core/src/test/java/org/apache/iceberg/TestFindFiles.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg; import java.util.Arrays; @@ -36,7 +35,7 @@ public class TestFindFiles extends TableTestBase { @Parameterized.Parameters(name = "formatVersion = {0}") public static Object[] parameters() { - return new Object[] { 1, 2 }; + return new Object[] {1, 2}; } public TestFindFiles(int formatVersion) { @@ -45,10 +44,7 @@ public TestFindFiles(int formatVersion) { @Test public void testBasicBehavior() { - table.newAppend() - .appendFile(FILE_A) - .appendFile(FILE_B) - .commit(); + table.newAppend().appendFile(FILE_A).appendFile(FILE_B).commit(); Iterable files = FindFiles.in(table).collect(); @@ -57,95 +53,100 @@ public void testBasicBehavior() { @Test public void testWithMetadataMatching() { - table.newAppend() + table + .newAppend() .appendFile(FILE_A) .appendFile(FILE_B) .appendFile(FILE_C) .appendFile(FILE_D) .commit(); - Iterable files = FindFiles.in(table) - .withMetadataMatching(Expressions.startsWith("file_path", "/path/to/data-a")) - .collect(); + Iterable files = + FindFiles.in(table) + .withMetadataMatching(Expressions.startsWith("file_path", "/path/to/data-a")) + .collect(); Assert.assertEquals(pathSet(FILE_A), pathSet(files)); } @Test public void testWithRecordsMatching() { - table.newAppend() - .appendFile(DataFiles.builder(SPEC) - .withInputFile(Files.localInput("/path/to/data-e.parquet")) - .withPartitionPath("data_bucket=4") - .withMetrics(new Metrics(3L, - null, // no column sizes - ImmutableMap.of(1, 3L), // value count - ImmutableMap.of(1, 0L), // null count - null, - ImmutableMap.of(1, Conversions.toByteBuffer(Types.IntegerType.get(), 1)), // lower bounds - ImmutableMap.of(1, Conversions.toByteBuffer(Types.IntegerType.get(), 5)))) // lower bounds - .build()) + table + .newAppend() + .appendFile( + DataFiles.builder(SPEC) + .withInputFile(Files.localInput("/path/to/data-e.parquet")) + .withPartitionPath("data_bucket=4") + .withMetrics( + new Metrics( + 3L, + null, // no column sizes + ImmutableMap.of(1, 3L), // value count + ImmutableMap.of(1, 0L), // null count + null, + ImmutableMap.of( + 1, + 
Conversions.toByteBuffer(Types.IntegerType.get(), 1)), // lower bounds + ImmutableMap.of( + 1, + Conversions.toByteBuffer(Types.IntegerType.get(), 5)))) // lower bounds + .build()) .commit(); - final Iterable files = FindFiles.in(table) - .withRecordsMatching(Expressions.equal("id", 1)) - .collect(); + final Iterable files = + FindFiles.in(table).withRecordsMatching(Expressions.equal("id", 1)).collect(); Assert.assertEquals(Sets.newHashSet("/path/to/data-e.parquet"), pathSet(files)); } @Test public void testInPartition() { - table.newAppend() + table + .newAppend() .appendFile(FILE_A) // bucket 0 .appendFile(FILE_B) // bucket 1 .appendFile(FILE_C) // bucket 2 .appendFile(FILE_D) // bucket 3 .commit(); - Iterable files = FindFiles.in(table) - .inPartition(table.spec(), StaticDataTask.Row.of(1)) - .inPartition(table.spec(), StaticDataTask.Row.of(2)) - .collect(); + Iterable files = + FindFiles.in(table) + .inPartition(table.spec(), StaticDataTask.Row.of(1)) + .inPartition(table.spec(), StaticDataTask.Row.of(2)) + .collect(); Assert.assertEquals(pathSet(FILE_B, FILE_C), pathSet(files)); } @Test public void testInPartitions() { - table.newAppend() + table + .newAppend() .appendFile(FILE_A) // bucket 0 .appendFile(FILE_B) // bucket 1 .appendFile(FILE_C) // bucket 2 .appendFile(FILE_D) // bucket 3 .commit(); - Iterable files = FindFiles.in(table) - .inPartitions(table.spec(), StaticDataTask.Row.of(1), StaticDataTask.Row.of(2)) - .collect(); + Iterable files = + FindFiles.in(table) + .inPartitions(table.spec(), StaticDataTask.Row.of(1), StaticDataTask.Row.of(2)) + .collect(); Assert.assertEquals(pathSet(FILE_B, FILE_C), pathSet(files)); } @Test public void testAsOfTimestamp() { - table.newAppend() - .appendFile(FILE_A) - .commit(); + table.newAppend().appendFile(FILE_A).commit(); - table.newAppend() - .appendFile(FILE_B) - .commit(); + table.newAppend().appendFile(FILE_B).commit(); long timestamp = System.currentTimeMillis(); - table.newAppend() - .appendFile(FILE_C) - .commit(); + table.newAppend().appendFile(FILE_C).commit(); - table.newAppend() - .appendFile(FILE_D) - .commit(); + table.newAppend().appendFile(FILE_D).commit(); Iterable files = FindFiles.in(table).asOfTime(timestamp).collect(); @@ -154,20 +155,13 @@ public void testAsOfTimestamp() { @Test public void testSnapshotId() { - table.newAppend() - .appendFile(FILE_A) - .appendFile(FILE_B) - .commit(); + table.newAppend().appendFile(FILE_A).appendFile(FILE_B).commit(); - table.newAppend() - .appendFile(FILE_C) - .commit(); + table.newAppend().appendFile(FILE_C).commit(); long snapshotId = table.currentSnapshot().snapshotId(); - table.newAppend() - .appendFile(FILE_D) - .commit(); + table.newAppend().appendFile(FILE_D).commit(); Iterable files = FindFiles.in(table).inSnapshot(snapshotId).collect(); @@ -176,30 +170,28 @@ public void testSnapshotId() { @Test public void testCaseSensitivity() { - table.newAppend() + table + .newAppend() .appendFile(FILE_A) .appendFile(FILE_B) .appendFile(FILE_C) .appendFile(FILE_D) .commit(); - Iterable files = FindFiles.in(table) - .caseInsensitive() - .withMetadataMatching(Expressions.startsWith("FILE_PATH", "/path/to/data-a")) - .collect(); + Iterable files = + FindFiles.in(table) + .caseInsensitive() + .withMetadataMatching(Expressions.startsWith("FILE_PATH", "/path/to/data-a")) + .collect(); Assert.assertEquals(pathSet(FILE_A), pathSet(files)); } @Test public void testIncludeColumnStats() { - table.newAppend() - .appendFile(FILE_WITH_STATS) - .commit(); + 
table.newAppend().appendFile(FILE_WITH_STATS).commit(); - Iterable files = FindFiles.in(table) - .includeColumnStats() - .collect(); + Iterable files = FindFiles.in(table).includeColumnStats().collect(); final DataFile file = files.iterator().next(); Assert.assertEquals(FILE_WITH_STATS.columnSizes(), file.columnSizes()); @@ -222,7 +214,8 @@ public void testNoSnapshot() { } private Set pathSet(DataFile... files) { - return Sets.newHashSet(Iterables.transform(Arrays.asList(files), file -> file.path().toString())); + return Sets.newHashSet( + Iterables.transform(Arrays.asList(files), file -> file.path().toString())); } private Set pathSet(Iterable files) { diff --git a/core/src/test/java/org/apache/iceberg/TestFixedSizeSplitScanTaskIterator.java b/core/src/test/java/org/apache/iceberg/TestFixedSizeSplitScanTaskIterator.java index 76844553c679..b08449e3a7d3 100644 --- a/core/src/test/java/org/apache/iceberg/TestFixedSizeSplitScanTaskIterator.java +++ b/core/src/test/java/org/apache/iceberg/TestFixedSizeSplitScanTaskIterator.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg; import java.util.List; @@ -28,18 +27,29 @@ public class TestFixedSizeSplitScanTaskIterator { @Test public void testSplits() { - verify(15L, 100L, asList( - asList(0L, 15L), asList(15L, 15L), asList(30L, 15L), asList(45L, 15L), asList(60L, 15L), - asList(75L, 15L), asList(90L, 10L))); + verify( + 15L, + 100L, + asList( + asList(0L, 15L), + asList(15L, 15L), + asList(30L, 15L), + asList(45L, 15L), + asList(60L, 15L), + asList(75L, 15L), + asList(90L, 10L))); verify(10L, 10L, asList(asList(0L, 10L))); verify(20L, 10L, asList(asList(0L, 10L))); } private static void verify(long splitSize, long fileLen, List> offsetLenPairs) { FileScanTask mockFileScanTask = new MockFileScanTask(fileLen); - SplitScanTaskIterator splitTaskIterator = new FixedSizeSplitScanTaskIterator<>( - mockFileScanTask, mockFileScanTask.length(), - splitSize, TestFixedSizeSplitScanTaskIterator::createSplitTask); + SplitScanTaskIterator splitTaskIterator = + new FixedSizeSplitScanTaskIterator<>( + mockFileScanTask, + mockFileScanTask.length(), + splitSize, + TestFixedSizeSplitScanTaskIterator::createSplitTask); List tasks = Lists.newArrayList(splitTaskIterator); for (int i = 0; i < tasks.size(); i++) { diff --git a/core/src/test/java/org/apache/iceberg/TestFormatVersions.java b/core/src/test/java/org/apache/iceberg/TestFormatVersions.java index 1d04834c2882..f3d31e4db33e 100644 --- a/core/src/test/java/org/apache/iceberg/TestFormatVersions.java +++ b/core/src/test/java/org/apache/iceberg/TestFormatVersions.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg; import org.junit.Assert; @@ -49,8 +48,10 @@ public void testFormatVersionDowngrade() { Assert.assertEquals("Should report v2", 2, ops.current().formatVersion()); - AssertHelpers.assertThrows("Should reject a version downgrade", - IllegalArgumentException.class, "Cannot downgrade", + AssertHelpers.assertThrows( + "Should reject a version downgrade", + IllegalArgumentException.class, + "Cannot downgrade", () -> ops.current().upgradeToFormatVersion(1)); Assert.assertEquals("Should report v2", 2, ops.current().formatVersion()); @@ -60,9 +61,14 @@ public void testFormatVersionDowngrade() { public void testFormatVersionUpgradeNotSupported() { TableOperations ops = table.ops(); TableMetadata base = ops.current(); - AssertHelpers.assertThrows("Should reject an unsupported version upgrade", - IllegalArgumentException.class, "Cannot upgrade table to unsupported format version", - () -> ops.commit(base, base.upgradeToFormatVersion(TableMetadata.SUPPORTED_TABLE_FORMAT_VERSION + 1))); + AssertHelpers.assertThrows( + "Should reject an unsupported version upgrade", + IllegalArgumentException.class, + "Cannot upgrade table to unsupported format version", + () -> + ops.commit( + base, + base.upgradeToFormatVersion(TableMetadata.SUPPORTED_TABLE_FORMAT_VERSION + 1))); Assert.assertEquals("Should report v1", 1, ops.current().formatVersion()); } diff --git a/core/src/test/java/org/apache/iceberg/TestIncrementalDataTableScan.java b/core/src/test/java/org/apache/iceberg/TestIncrementalDataTableScan.java index 32c68812b2ad..1492070edeea 100644 --- a/core/src/test/java/org/apache/iceberg/TestIncrementalDataTableScan.java +++ b/core/src/test/java/org/apache/iceberg/TestIncrementalDataTableScan.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg; import java.io.IOException; @@ -42,7 +41,7 @@ public class TestIncrementalDataTableScan extends TableTestBase { @Parameterized.Parameters(name = "formatVersion = {0}") public static Object[] parameters() { - return new Object[] { 1, 2 }; + return new Object[] {1, 2}; } public TestIncrementalDataTableScan(int formatVersion) { @@ -59,7 +58,8 @@ public void testInvalidScans() { add(table.newAppend(), files("A")); AssertHelpers.assertThrows( "from and to snapshots cannot be the same, since from snapshot is exclusive and not part of the scan", - IllegalArgumentException.class, "from and to snapshot ids cannot be the same", + IllegalArgumentException.class, + "from and to snapshot ids cannot be the same", () -> appendsBetweenScan(1, 1)); add(table.newAppend(), files("B")); @@ -68,11 +68,13 @@ public void testInvalidScans() { add(table.newAppend(), files("E")); AssertHelpers.assertThrows( "Check refinement api", - IllegalArgumentException.class, "from snapshot id 1 not in existing snapshot ids range (2, 4]", + IllegalArgumentException.class, + "from snapshot id 1 not in existing snapshot ids range (2, 4]", () -> table.newScan().appendsBetween(2, 5).appendsBetween(1, 4)); AssertHelpers.assertThrows( "Check refinement api", - IllegalArgumentException.class, "to snapshot id 3 not in existing snapshot ids range (1, 2]", + IllegalArgumentException.class, + "to snapshot id 3 not in existing snapshot ids range (1, 2]", () -> table.newScan().appendsBetween(1, 2).appendsBetween(1, 3)); } @@ -139,7 +141,8 @@ public void testReplaceOverwritesDeletes() { overwrite(table.newOverwrite(), files("H"), files("E")); // 9 AssertHelpers.assertThrows( - "Overwrites are not supported for Incremental scan", UnsupportedOperationException.class, + "Overwrites are not supported for Incremental scan", + UnsupportedOperationException.class, "Found overwrite operation, cannot support incremental data in snapshots (8, 9]", () -> appendsBetweenScan(8, 9)); } @@ -210,29 +213,29 @@ public void testIgnoreResiduals() throws IOException { add(table.newAppend(), files("B")); add(table.newAppend(), files("C")); - TableScan scan1 = table.newScan() - .filter(Expressions.equal("id", 5)) - .appendsBetween(1, 3); + TableScan scan1 = table.newScan().filter(Expressions.equal("id", 5)).appendsBetween(1, 3); try (CloseableIterable tasks = scan1.planTasks()) { - Assert.assertTrue("Tasks should not be empty", com.google.common.collect.Iterables.size(tasks) > 0); + Assert.assertTrue( + "Tasks should not be empty", com.google.common.collect.Iterables.size(tasks) > 0); for (CombinedScanTask combinedScanTask : tasks) { for (FileScanTask fileScanTask : combinedScanTask.files()) { - Assert.assertNotEquals("Residuals must be preserved", Expressions.alwaysTrue(), fileScanTask.residual()); + Assert.assertNotEquals( + "Residuals must be preserved", Expressions.alwaysTrue(), fileScanTask.residual()); } } } - TableScan scan2 = table.newScan() - .filter(Expressions.equal("id", 5)) - .appendsBetween(1, 3) - .ignoreResiduals(); + TableScan scan2 = + table.newScan().filter(Expressions.equal("id", 5)).appendsBetween(1, 3).ignoreResiduals(); try (CloseableIterable tasks = scan2.planTasks()) { - Assert.assertTrue("Tasks should not be empty", com.google.common.collect.Iterables.size(tasks) > 0); + Assert.assertTrue( + "Tasks should not be empty", com.google.common.collect.Iterables.size(tasks) > 0); for (CombinedScanTask combinedScanTask : tasks) { for (FileScanTask fileScanTask : combinedScanTask.files()) { - 
Assert.assertEquals("Residuals must be ignored", Expressions.alwaysTrue(), fileScanTask.residual()); + Assert.assertEquals( + "Residuals must be ignored", Expressions.alwaysTrue(), fileScanTask.residual()); } } } @@ -245,25 +248,31 @@ public void testPlanWithExecutor() throws IOException { add(table.newAppend(), files("C")); AtomicInteger planThreadsIndex = new AtomicInteger(0); - TableScan scan = table.newScan() - .appendsAfter(1) - .planWith(Executors.newFixedThreadPool(1, runnable -> { - Thread thread = new Thread(runnable); - thread.setName("plan-" + planThreadsIndex.getAndIncrement()); - thread.setDaemon(true); // daemon threads will be terminated abruptly when the JVM exits - return thread; - })); + TableScan scan = + table + .newScan() + .appendsAfter(1) + .planWith( + Executors.newFixedThreadPool( + 1, + runnable -> { + Thread thread = new Thread(runnable); + thread.setName("plan-" + planThreadsIndex.getAndIncrement()); + thread.setDaemon( + true); // daemon threads will be terminated abruptly when the JVM exits + return thread; + })); Assert.assertEquals(2, Iterables.size(scan.planFiles())); Assert.assertTrue("Thread should be created in provided pool", planThreadsIndex.get() > 0); } private static DataFile file(String name) { return DataFiles.builder(SPEC) - .withPath(name + ".parquet") - .withFileSizeInBytes(10) - .withPartitionPath("data_bucket=0") // easy way to set partition data for now - .withRecordCount(1) - .build(); + .withPath(name + ".parquet") + .withFileSizeInBytes(10) + .withPartitionPath("data_bucket=0") // easy way to set partition data for now + .withRecordCount(1) + .build(); } private static void add(AppendFiles appendFiles, List adds) { @@ -280,12 +289,14 @@ private static void delete(DeleteFiles deleteFiles, List deletes) { deleteFiles.commit(); } - private static void replace(RewriteFiles rewriteFiles, List deletes, List adds) { + private static void replace( + RewriteFiles rewriteFiles, List deletes, List adds) { rewriteFiles.rewriteFiles(Sets.newHashSet(deletes), Sets.newHashSet(adds)); rewriteFiles.commit(); } - private static void overwrite(OverwriteFiles overwriteFiles, List adds, List deletes) { + private static void overwrite( + OverwriteFiles overwriteFiles, List adds, List deletes) { for (DataFile f : adds) { overwriteFiles.addFile(f); } @@ -312,10 +323,13 @@ private List appendsBetweenScan(long fromSnapshotId, long toSnapshotId) } private static List filesToScan(TableScan tableScan) { - Iterable filesToRead = Iterables.transform(tableScan.planFiles(), t -> { - String path = t.file().path().toString(); - return path.split("\\.")[0]; - }); + Iterable filesToRead = + Iterables.transform( + tableScan.planFiles(), + t -> { + String path = t.file().path().toString(); + return path.split("\\.")[0]; + }); return Lists.newArrayList(filesToRead); } diff --git a/core/src/test/java/org/apache/iceberg/TestLocationProvider.java b/core/src/test/java/org/apache/iceberg/TestLocationProvider.java index bdb8d6c5e5fb..2b9cc47e9389 100644 --- a/core/src/test/java/org/apache/iceberg/TestLocationProvider.java +++ b/core/src/test/java/org/apache/iceberg/TestLocationProvider.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg; import java.util.List; @@ -33,8 +32,7 @@ public class TestLocationProvider extends TableTestBase { @Parameterized.Parameters public static Object[][] parameters() { return new Object[][] { - new Object[] { 1 }, - new Object[] { 2 }, + new Object[] {1}, new Object[] {2}, }; } @@ -47,7 +45,8 @@ public static class TwoArgDynamicallyLoadedLocationProvider implements LocationP String tableLocation; Map properties; - public TwoArgDynamicallyLoadedLocationProvider(String tableLocation, Map properties) { + public TwoArgDynamicallyLoadedLocationProvider( + String tableLocation, Map properties) { this.tableLocation = tableLocation; this.properties = properties; } @@ -81,8 +80,7 @@ public String newDataLocation(PartitionSpec spec, StructLike partitionData, Stri // publicly visible for testing to be dynamically loaded public static class InvalidArgTypesDynamicallyLoadedLocationProvider implements LocationProvider { - public InvalidArgTypesDynamicallyLoadedLocationProvider(Integer bogusArg1, String bogusArg2) { - } + public InvalidArgTypesDynamicallyLoadedLocationProvider(Integer bogusArg1, String bogusArg2) {} @Override public String newDataLocation(String filename) { @@ -102,184 +100,191 @@ public static class InvalidNoInterfaceDynamicallyLoadedLocationProvider { @Test public void testDefaultLocationProvider() { - this.table.updateProperties() - .commit(); + this.table.updateProperties().commit(); this.table.locationProvider().newDataLocation("my_file"); Assert.assertEquals( "Default data path should have table location as root", String.format("%s/data/%s", this.table.location(), "my_file"), - this.table.locationProvider().newDataLocation("my_file") - ); + this.table.locationProvider().newDataLocation("my_file")); } @Test public void testDefaultLocationProviderWithCustomDataLocation() { - this.table.updateProperties() - .set(TableProperties.WRITE_DATA_LOCATION, "new_location") - .commit(); + this.table.updateProperties().set(TableProperties.WRITE_DATA_LOCATION, "new_location").commit(); this.table.locationProvider().newDataLocation("my_file"); Assert.assertEquals( "Default location provider should allow custom path location", "new_location/my_file", - this.table.locationProvider().newDataLocation("my_file") - ); + this.table.locationProvider().newDataLocation("my_file")); } @Test public void testNoArgDynamicallyLoadedLocationProvider() { - String invalidImpl = String.format("%s$%s", - this.getClass().getCanonicalName(), - NoArgDynamicallyLoadedLocationProvider.class.getSimpleName()); - this.table.updateProperties() + String invalidImpl = + String.format( + "%s$%s", + this.getClass().getCanonicalName(), + NoArgDynamicallyLoadedLocationProvider.class.getSimpleName()); + this.table + .updateProperties() .set(TableProperties.WRITE_LOCATION_PROVIDER_IMPL, invalidImpl) .commit(); Assert.assertEquals( "Custom provider should take base table location", "test_no_arg_provider/my_file", - this.table.locationProvider().newDataLocation("my_file") - ); + this.table.locationProvider().newDataLocation("my_file")); } @Test public void testTwoArgDynamicallyLoadedLocationProvider() { - this.table.updateProperties() - .set(TableProperties.WRITE_LOCATION_PROVIDER_IMPL, - String.format("%s$%s", + this.table + .updateProperties() + .set( + TableProperties.WRITE_LOCATION_PROVIDER_IMPL, + String.format( + "%s$%s", this.getClass().getCanonicalName(), TwoArgDynamicallyLoadedLocationProvider.class.getSimpleName())) .commit(); - Assert.assertTrue(String.format("Table should load impl defined in 
its properties"), - this.table.locationProvider() instanceof TwoArgDynamicallyLoadedLocationProvider - ); + Assert.assertTrue( + String.format("Table should load impl defined in its properties"), + this.table.locationProvider() instanceof TwoArgDynamicallyLoadedLocationProvider); Assert.assertEquals( "Custom provider should take base table location", String.format("%s/test_custom_provider/%s", this.table.location(), "my_file"), - this.table.locationProvider().newDataLocation("my_file") - ); + this.table.locationProvider().newDataLocation("my_file")); } @Test public void testDynamicallyLoadedLocationProviderNotFound() { - String nonExistentImpl = String.format("%s$NonExistent%s", - this.getClass().getCanonicalName(), - TwoArgDynamicallyLoadedLocationProvider.class.getSimpleName()); - this.table.updateProperties() + String nonExistentImpl = + String.format( + "%s$NonExistent%s", + this.getClass().getCanonicalName(), + TwoArgDynamicallyLoadedLocationProvider.class.getSimpleName()); + this.table + .updateProperties() .set(TableProperties.WRITE_LOCATION_PROVIDER_IMPL, nonExistentImpl) .commit(); - AssertHelpers.assertThrows("Non-existent implementation should fail on finding constructor", + AssertHelpers.assertThrows( + "Non-existent implementation should fail on finding constructor", IllegalArgumentException.class, - String.format("Unable to find a constructor for implementation %s of %s. ", + String.format( + "Unable to find a constructor for implementation %s of %s. ", nonExistentImpl, LocationProvider.class), - () -> table.locationProvider() - ); + () -> table.locationProvider()); } @Test public void testInvalidNoInterfaceDynamicallyLoadedLocationProvider() { - String invalidImpl = String.format("%s$%s", - this.getClass().getCanonicalName(), - InvalidNoInterfaceDynamicallyLoadedLocationProvider.class.getSimpleName()); - this.table.updateProperties() + String invalidImpl = + String.format( + "%s$%s", + this.getClass().getCanonicalName(), + InvalidNoInterfaceDynamicallyLoadedLocationProvider.class.getSimpleName()); + this.table + .updateProperties() .set(TableProperties.WRITE_LOCATION_PROVIDER_IMPL, invalidImpl) .commit(); AssertHelpers.assertThrows( "Class with missing interface implementation should fail on instantiation.", IllegalArgumentException.class, - String.format("Provided implementation for dynamic instantiation should implement %s", + String.format( + "Provided implementation for dynamic instantiation should implement %s", LocationProvider.class), - () -> table.locationProvider() - ); + () -> table.locationProvider()); } @Test public void testInvalidArgTypesDynamicallyLoadedLocationProvider() { - String invalidImpl = String.format("%s$%s", - this.getClass().getCanonicalName(), - InvalidArgTypesDynamicallyLoadedLocationProvider.class.getSimpleName()); - this.table.updateProperties() + String invalidImpl = + String.format( + "%s$%s", + this.getClass().getCanonicalName(), + InvalidArgTypesDynamicallyLoadedLocationProvider.class.getSimpleName()); + this.table + .updateProperties() .set(TableProperties.WRITE_LOCATION_PROVIDER_IMPL, invalidImpl) .commit(); - AssertHelpers.assertThrows("Implementation with invalid arg types should fail on finding constructor", + AssertHelpers.assertThrows( + "Implementation with invalid arg types should fail on finding constructor", IllegalArgumentException.class, - String.format("Unable to find a constructor for implementation %s of %s. ", + String.format( + "Unable to find a constructor for implementation %s of %s. 
", invalidImpl, LocationProvider.class), - () -> table.locationProvider() - ); + () -> table.locationProvider()); } @Test public void testObjectStorageLocationProviderPathResolution() { - table.updateProperties() - .set(TableProperties.OBJECT_STORE_ENABLED, "true") - .commit(); + table.updateProperties().set(TableProperties.OBJECT_STORE_ENABLED, "true").commit(); - Assert.assertTrue("default data location should be used when object storage path not set", + Assert.assertTrue( + "default data location should be used when object storage path not set", table.locationProvider().newDataLocation("file").contains(table.location() + "/data")); String folderPath = "s3://random/folder/location"; - table.updateProperties() + table + .updateProperties() .set(TableProperties.WRITE_FOLDER_STORAGE_LOCATION, folderPath) .commit(); - Assert.assertTrue("folder storage path should be used when set", + Assert.assertTrue( + "folder storage path should be used when set", table.locationProvider().newDataLocation("file").contains(folderPath)); String objectPath = "s3://random/object/location"; - table.updateProperties() - .set(TableProperties.OBJECT_STORE_PATH, objectPath) - .commit(); + table.updateProperties().set(TableProperties.OBJECT_STORE_PATH, objectPath).commit(); - Assert.assertTrue("object storage path should be used when set", + Assert.assertTrue( + "object storage path should be used when set", table.locationProvider().newDataLocation("file").contains(objectPath)); String dataPath = "s3://random/data/location"; - table.updateProperties() - .set(TableProperties.WRITE_DATA_LOCATION, dataPath) - .commit(); + table.updateProperties().set(TableProperties.WRITE_DATA_LOCATION, dataPath).commit(); - Assert.assertTrue("write data path should be used when set", + Assert.assertTrue( + "write data path should be used when set", table.locationProvider().newDataLocation("file").contains(dataPath)); } @Test public void testDefaultStorageLocationProviderPathResolution() { - table.updateProperties() - .set(TableProperties.OBJECT_STORE_ENABLED, "false") - .commit(); + table.updateProperties().set(TableProperties.OBJECT_STORE_ENABLED, "false").commit(); - Assert.assertTrue("default data location should be used when object storage path not set", + Assert.assertTrue( + "default data location should be used when object storage path not set", table.locationProvider().newDataLocation("file").contains(table.location() + "/data")); String folderPath = "s3://random/folder/location"; - table.updateProperties() + table + .updateProperties() .set(TableProperties.WRITE_FOLDER_STORAGE_LOCATION, folderPath) .commit(); - Assert.assertTrue("folder storage path should be used when set", + Assert.assertTrue( + "folder storage path should be used when set", table.locationProvider().newDataLocation("file").contains(folderPath)); String dataPath = "s3://random/data/location"; - table.updateProperties() - .set(TableProperties.WRITE_DATA_LOCATION, dataPath) - .commit(); + table.updateProperties().set(TableProperties.WRITE_DATA_LOCATION, dataPath).commit(); - Assert.assertTrue("write data path should be used when set", + Assert.assertTrue( + "write data path should be used when set", table.locationProvider().newDataLocation("file").contains(dataPath)); } @Test public void testObjectStorageWithinTableLocation() { - table.updateProperties() - .set(TableProperties.OBJECT_STORE_ENABLED, "true") - .commit(); + table.updateProperties().set(TableProperties.OBJECT_STORE_ENABLED, "true").commit(); String fileLocation = 
table.locationProvider().newDataLocation("test.parquet"); String relativeLocation = fileLocation.replaceFirst(table.location(), ""); @@ -289,6 +294,7 @@ public void testObjectStorageWithinTableLocation() { Assert.assertTrue("First part should be empty", parts.get(0).isEmpty()); Assert.assertEquals("Second part should be data", "data", parts.get(1)); Assert.assertFalse("Third part should be a hash value", parts.get(2).isEmpty()); - Assert.assertEquals("Fourth part should be the file name passed in", "test.parquet", parts.get(3)); + Assert.assertEquals( + "Fourth part should be the file name passed in", "test.parquet", parts.get(3)); } } diff --git a/core/src/test/java/org/apache/iceberg/TestManifestCleanup.java b/core/src/test/java/org/apache/iceberg/TestManifestCleanup.java index a04ddc30061f..f7b989065be9 100644 --- a/core/src/test/java/org/apache/iceberg/TestManifestCleanup.java +++ b/core/src/test/java/org/apache/iceberg/TestManifestCleanup.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg; import org.apache.iceberg.expressions.Expressions; @@ -29,7 +28,7 @@ public class TestManifestCleanup extends TableTestBase { @Parameterized.Parameters(name = "formatVersion = {0}") public static Object[] parameters() { - return new Object[] { 1, 2 }; + return new Object[] {1, 2}; } public TestManifestCleanup(int formatVersion) { @@ -38,88 +37,88 @@ public TestManifestCleanup(int formatVersion) { @Test public void testDelete() { - Assert.assertEquals("Table should start with no manifests", - 0, listManifestFiles().size()); + Assert.assertEquals("Table should start with no manifests", 0, listManifestFiles().size()); - table.newAppend() - .appendFile(FILE_A) - .appendFile(FILE_B) - .commit(); + table.newAppend().appendFile(FILE_A).appendFile(FILE_B).commit(); - Assert.assertEquals("Table should have one append manifest", - 1, table.currentSnapshot().allManifests(table.io()).size()); + Assert.assertEquals( + "Table should have one append manifest", + 1, + table.currentSnapshot().allManifests(table.io()).size()); - table.newDelete() - .deleteFromRowFilter(Expressions.alwaysTrue()) - .commit(); + table.newDelete().deleteFromRowFilter(Expressions.alwaysTrue()).commit(); - Assert.assertEquals("Table should have one delete manifest", - 1, table.currentSnapshot().allManifests(table.io()).size()); + Assert.assertEquals( + "Table should have one delete manifest", + 1, + table.currentSnapshot().allManifests(table.io()).size()); table.newAppend().commit(); - Assert.assertEquals("Table should have no manifests", - 0, table.currentSnapshot().allManifests(table.io()).size()); + Assert.assertEquals( + "Table should have no manifests", + 0, + table.currentSnapshot().allManifests(table.io()).size()); } @Test public void testPartialDelete() { - Assert.assertEquals("Table should start with no manifests", - 0, listManifestFiles().size()); + Assert.assertEquals("Table should start with no manifests", 0, listManifestFiles().size()); - table.newAppend() - .appendFile(FILE_A) - .appendFile(FILE_B) - .commit(); + table.newAppend().appendFile(FILE_A).appendFile(FILE_B).commit(); Snapshot s1 = table.currentSnapshot(); - Assert.assertEquals("Table should have one append manifest", - 1, s1.allManifests(table.io()).size()); + Assert.assertEquals( + "Table should have one append manifest", 1, s1.allManifests(table.io()).size()); - table.newDelete() - .deleteFile(FILE_B) - .commit(); + table.newDelete().deleteFile(FILE_B).commit(); Snapshot s2 = 
table.currentSnapshot(); - Assert.assertEquals("Table should have one mixed manifest", - 1, s2.allManifests(table.io()).size()); + Assert.assertEquals( + "Table should have one mixed manifest", 1, s2.allManifests(table.io()).size()); table.newAppend().commit(); Snapshot s3 = table.currentSnapshot(); - Assert.assertEquals("Table should have the same manifests", - s2.allManifests(table.io()), s3.allManifests(table.io())); + Assert.assertEquals( + "Table should have the same manifests", + s2.allManifests(table.io()), + s3.allManifests(table.io())); } @Test public void testOverwrite() { - Assert.assertEquals("Table should start with no manifests", - 0, listManifestFiles().size()); + Assert.assertEquals("Table should start with no manifests", 0, listManifestFiles().size()); - table.newAppend() - .appendFile(FILE_A) - .appendFile(FILE_B) - .commit(); + table.newAppend().appendFile(FILE_A).appendFile(FILE_B).commit(); - Assert.assertEquals("Table should have one append manifest", - 1, table.currentSnapshot().allManifests(table.io()).size()); + Assert.assertEquals( + "Table should have one append manifest", + 1, + table.currentSnapshot().allManifests(table.io()).size()); - table.newOverwrite() + table + .newOverwrite() .overwriteByRowFilter(Expressions.alwaysTrue()) .addFile(FILE_C) .addFile(FILE_D) .commit(); - Assert.assertEquals("Table should have one delete manifest and one append manifest", - 2, table.currentSnapshot().allManifests(table.io()).size()); + Assert.assertEquals( + "Table should have one delete manifest and one append manifest", + 2, + table.currentSnapshot().allManifests(table.io()).size()); - table.newOverwrite() + table + .newOverwrite() .overwriteByRowFilter(Expressions.alwaysTrue()) .addFile(FILE_A) .addFile(FILE_B) .commit(); - Assert.assertEquals("Table should have one delete manifest and one append manifest", - 2, table.currentSnapshot().allManifests(table.io()).size()); + Assert.assertEquals( + "Table should have one delete manifest and one append manifest", + 2, + table.currentSnapshot().allManifests(table.io()).size()); } } diff --git a/core/src/test/java/org/apache/iceberg/TestManifestListVersions.java b/core/src/test/java/org/apache/iceberg/TestManifestListVersions.java index 8cc64f0787a4..fe026a0a5c65 100644 --- a/core/src/test/java/org/apache/iceberg/TestManifestListVersions.java +++ b/core/src/test/java/org/apache/iceberg/TestManifestListVersions.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg; import java.io.File; @@ -56,26 +55,54 @@ public class TestManifestListVersions { private static final long EXISTING_ROWS = 857273L; private static final int DELETED_FILES = 1; private static final long DELETED_ROWS = 22910L; - private static final List PARTITION_SUMMARIES = ImmutableList.of(); + private static final List PARTITION_SUMMARIES = + ImmutableList.of(); private static final ByteBuffer KEY_METADATA = null; - private static final ManifestFile TEST_MANIFEST = new GenericManifestFile( - PATH, LENGTH, SPEC_ID, ManifestContent.DATA, SEQ_NUM, MIN_SEQ_NUM, SNAPSHOT_ID, - ADDED_FILES, ADDED_ROWS, EXISTING_FILES, EXISTING_ROWS, DELETED_FILES, DELETED_ROWS, - PARTITION_SUMMARIES, KEY_METADATA); - - private static final ManifestFile TEST_DELETE_MANIFEST = new GenericManifestFile( - PATH, LENGTH, SPEC_ID, ManifestContent.DELETES, SEQ_NUM, MIN_SEQ_NUM, SNAPSHOT_ID, - ADDED_FILES, ADDED_ROWS, EXISTING_FILES, EXISTING_ROWS, DELETED_FILES, DELETED_ROWS, - PARTITION_SUMMARIES, KEY_METADATA); - - @Rule - public TemporaryFolder temp = new TemporaryFolder(); + private static final ManifestFile TEST_MANIFEST = + new GenericManifestFile( + PATH, + LENGTH, + SPEC_ID, + ManifestContent.DATA, + SEQ_NUM, + MIN_SEQ_NUM, + SNAPSHOT_ID, + ADDED_FILES, + ADDED_ROWS, + EXISTING_FILES, + EXISTING_ROWS, + DELETED_FILES, + DELETED_ROWS, + PARTITION_SUMMARIES, + KEY_METADATA); + + private static final ManifestFile TEST_DELETE_MANIFEST = + new GenericManifestFile( + PATH, + LENGTH, + SPEC_ID, + ManifestContent.DELETES, + SEQ_NUM, + MIN_SEQ_NUM, + SNAPSHOT_ID, + ADDED_FILES, + ADDED_ROWS, + EXISTING_FILES, + EXISTING_ROWS, + DELETED_FILES, + DELETED_ROWS, + PARTITION_SUMMARIES, + KEY_METADATA); + + @Rule public TemporaryFolder temp = new TemporaryFolder(); @Test public void testV1WriteDeleteManifest() { - AssertHelpers.assertThrows("Should fail to write a DELETE manifest to v1", - IllegalArgumentException.class, "Cannot store delete manifests in a v1 table", + AssertHelpers.assertThrows( + "Should fail to write a DELETE manifest to v1", + IllegalArgumentException.class, + "Cannot store delete manifests in a v1 table", () -> writeManifestList(TEST_DELETE_MANIFEST, 1)); } @@ -84,8 +111,10 @@ public void testV1Write() throws IOException { ManifestFile manifest = writeAndReadManifestList(1); // v2 fields are not written and are defaulted - Assert.assertEquals("Should not contain sequence number, default to 0", 0, manifest.sequenceNumber()); - Assert.assertEquals("Should not contain min sequence number, default to 0", 0, manifest.minSequenceNumber()); + Assert.assertEquals( + "Should not contain sequence number, default to 0", 0, manifest.sequenceNumber()); + Assert.assertEquals( + "Should not contain min sequence number, default to 0", 0, manifest.minSequenceNumber()); // v1 fields are read correctly, even though order changed Assert.assertEquals("Path", PATH, manifest.path()); @@ -94,7 +123,8 @@ public void testV1Write() throws IOException { Assert.assertEquals("Content", ManifestContent.DATA, manifest.content()); Assert.assertEquals("Snapshot id", SNAPSHOT_ID, (long) manifest.snapshotId()); Assert.assertEquals("Added files count", ADDED_FILES, (int) manifest.addedFilesCount()); - Assert.assertEquals("Existing files count", EXISTING_FILES, (int) manifest.existingFilesCount()); + Assert.assertEquals( + "Existing files count", EXISTING_FILES, (int) manifest.existingFilesCount()); Assert.assertEquals("Deleted files count", DELETED_FILES, (int) manifest.deletedFilesCount()); 
Assert.assertEquals("Added rows count", ADDED_ROWS, (long) manifest.addedRowsCount()); Assert.assertEquals("Existing rows count", EXISTING_ROWS, (long) manifest.existingRowsCount()); @@ -115,7 +145,8 @@ public void testV2Write() throws IOException { Assert.assertEquals("Snapshot id", SNAPSHOT_ID, (long) manifest.snapshotId()); Assert.assertEquals("Added files count", ADDED_FILES, (int) manifest.addedFilesCount()); Assert.assertEquals("Added rows count", ADDED_ROWS, (long) manifest.addedRowsCount()); - Assert.assertEquals("Existing files count", EXISTING_FILES, (int) manifest.existingFilesCount()); + Assert.assertEquals( + "Existing files count", EXISTING_FILES, (int) manifest.existingFilesCount()); Assert.assertEquals("Existing rows count", EXISTING_ROWS, (long) manifest.existingRowsCount()); Assert.assertEquals("Deleted files count", DELETED_FILES, (int) manifest.deletedFilesCount()); Assert.assertEquals("Deleted rows count", DELETED_ROWS, (long) manifest.deletedRowsCount()); @@ -131,12 +162,17 @@ public void testV1ForwardCompatibility() throws IOException { Assert.assertEquals("Length", LENGTH, generic.get("manifest_length")); Assert.assertEquals("Spec id", SPEC_ID, generic.get("partition_spec_id")); Assert.assertEquals("Snapshot id", SNAPSHOT_ID, (long) generic.get("added_snapshot_id")); - Assert.assertEquals("Added files count", ADDED_FILES, (int) generic.get("added_data_files_count")); - Assert.assertEquals("Existing files count", EXISTING_FILES, (int) generic.get("existing_data_files_count")); - Assert.assertEquals("Deleted files count", DELETED_FILES, (int) generic.get("deleted_data_files_count")); + Assert.assertEquals( + "Added files count", ADDED_FILES, (int) generic.get("added_data_files_count")); + Assert.assertEquals( + "Existing files count", EXISTING_FILES, (int) generic.get("existing_data_files_count")); + Assert.assertEquals( + "Deleted files count", DELETED_FILES, (int) generic.get("deleted_data_files_count")); Assert.assertEquals("Added rows count", ADDED_ROWS, (long) generic.get("added_rows_count")); - Assert.assertEquals("Existing rows count", EXISTING_ROWS, (long) generic.get("existing_rows_count")); - Assert.assertEquals("Deleted rows count", DELETED_ROWS, (long) generic.get("deleted_rows_count")); + Assert.assertEquals( + "Existing rows count", EXISTING_ROWS, (long) generic.get("existing_rows_count")); + Assert.assertEquals( + "Deleted rows count", DELETED_ROWS, (long) generic.get("deleted_rows_count")); AssertHelpers.assertEmptyAvroField(generic, ManifestFile.MANIFEST_CONTENT.name()); AssertHelpers.assertEmptyAvroField(generic, ManifestFile.SEQUENCE_NUMBER.name()); AssertHelpers.assertEmptyAvroField(generic, ManifestFile.MIN_SEQUENCE_NUMBER.name()); @@ -144,7 +180,8 @@ public void testV1ForwardCompatibility() throws IOException { @Test public void testV2ForwardCompatibility() throws IOException { - // v2 manifest list files can be read by v1 readers, but the sequence numbers and content will be ignored. + // v2 manifest list files can be read by v1 readers, but the sequence numbers and content will + // be ignored. 
InputFile manifestList = writeManifestList(TEST_MANIFEST, 2); GenericData.Record generic = readGeneric(manifestList, V1Metadata.MANIFEST_LIST_SCHEMA); @@ -153,12 +190,17 @@ public void testV2ForwardCompatibility() throws IOException { Assert.assertEquals("Length", LENGTH, generic.get("manifest_length")); Assert.assertEquals("Spec id", SPEC_ID, generic.get("partition_spec_id")); Assert.assertEquals("Snapshot id", SNAPSHOT_ID, (long) generic.get("added_snapshot_id")); - Assert.assertEquals("Added files count", ADDED_FILES, (int) generic.get("added_data_files_count")); - Assert.assertEquals("Existing files count", EXISTING_FILES, (int) generic.get("existing_data_files_count")); - Assert.assertEquals("Deleted files count", DELETED_FILES, (int) generic.get("deleted_data_files_count")); + Assert.assertEquals( + "Added files count", ADDED_FILES, (int) generic.get("added_data_files_count")); + Assert.assertEquals( + "Existing files count", EXISTING_FILES, (int) generic.get("existing_data_files_count")); + Assert.assertEquals( + "Deleted files count", DELETED_FILES, (int) generic.get("deleted_data_files_count")); Assert.assertEquals("Added rows count", ADDED_ROWS, (long) generic.get("added_rows_count")); - Assert.assertEquals("Existing rows count", EXISTING_ROWS, (long) generic.get("existing_rows_count")); - Assert.assertEquals("Deleted rows count", DELETED_ROWS, (long) generic.get("deleted_rows_count")); + Assert.assertEquals( + "Existing rows count", EXISTING_ROWS, (long) generic.get("existing_rows_count")); + Assert.assertEquals( + "Deleted rows count", DELETED_ROWS, (long) generic.get("deleted_rows_count")); AssertHelpers.assertEmptyAvroField(generic, ManifestFile.MANIFEST_CONTENT.name()); AssertHelpers.assertEmptyAvroField(generic, ManifestFile.SEQUENCE_NUMBER.name()); AssertHelpers.assertEmptyAvroField(generic, ManifestFile.MIN_SEQUENCE_NUMBER.name()); @@ -169,30 +211,40 @@ public void testManifestsWithoutRowStats() throws IOException { File manifestListFile = temp.newFile("manifest-list.avro"); Assert.assertTrue(manifestListFile.delete()); - Collection columnNamesWithoutRowStats = ImmutableList.of( - "manifest_path", "manifest_length", "partition_spec_id", "added_snapshot_id", - "added_data_files_count", "existing_data_files_count", "deleted_data_files_count", - "partitions"); - Schema schemaWithoutRowStats = V1Metadata.MANIFEST_LIST_SCHEMA.select(columnNamesWithoutRowStats); + Collection columnNamesWithoutRowStats = + ImmutableList.of( + "manifest_path", + "manifest_length", + "partition_spec_id", + "added_snapshot_id", + "added_data_files_count", + "existing_data_files_count", + "deleted_data_files_count", + "partitions"); + Schema schemaWithoutRowStats = + V1Metadata.MANIFEST_LIST_SCHEMA.select(columnNamesWithoutRowStats); OutputFile outputFile = Files.localOutput(manifestListFile); - try (FileAppender appender = Avro.write(outputFile) - .schema(schemaWithoutRowStats) - .named("manifest_file") - .overwrite() - .build()) { - - org.apache.avro.Schema avroSchema = AvroSchemaUtil.convert(schemaWithoutRowStats, "manifest_file"); - GenericData.Record withoutRowStats = new GenericRecordBuilder(avroSchema) - .set("manifest_path", "path/to/manifest.avro") - .set("manifest_length", 1024L) - .set("partition_spec_id", 1) - .set("added_snapshot_id", 100L) - .set("added_data_files_count", 2) - .set("existing_data_files_count", 3) - .set("deleted_data_files_count", 4) - .set("partitions", null) - .build(); + try (FileAppender appender = + Avro.write(outputFile) + .schema(schemaWithoutRowStats) + 
.named("manifest_file") + .overwrite() + .build()) { + + org.apache.avro.Schema avroSchema = + AvroSchemaUtil.convert(schemaWithoutRowStats, "manifest_file"); + GenericData.Record withoutRowStats = + new GenericRecordBuilder(avroSchema) + .set("manifest_path", "path/to/manifest.avro") + .set("manifest_length", 1024L) + .set("partition_spec_id", 1) + .set("added_snapshot_id", 100L) + .set("added_data_files_count", 2) + .set("existing_data_files_count", 3) + .set("deleted_data_files_count", 4) + .set("partitions", null) + .build(); appender.add(withoutRowStats); } @@ -204,7 +256,8 @@ public void testManifestsWithoutRowStats() throws IOException { Assert.assertNull("Added rows count should be null", manifest.addedRowsCount()); Assert.assertTrue("Existing files should be present", manifest.hasExistingFiles()); - Assert.assertEquals("Existing files count should match", 3, (int) manifest.existingFilesCount()); + Assert.assertEquals( + "Existing files count should match", 3, (int) manifest.existingFilesCount()); Assert.assertNull("Existing rows count should be null", manifest.existingRowsCount()); Assert.assertTrue("Deleted files should be present", manifest.hasDeletedFiles()); @@ -219,52 +272,82 @@ public void testManifestsPartitionSummary() throws IOException { ByteBuffer secondSummaryLowerBound = Conversions.toByteBuffer(Types.IntegerType.get(), 20); ByteBuffer secondSummaryUpperBound = Conversions.toByteBuffer(Types.IntegerType.get(), 200); - List partitionFieldSummaries = Lists.newArrayList( - new GenericPartitionFieldSummary(false, firstSummaryLowerBound, firstSummaryUpperBound), - new GenericPartitionFieldSummary(true, false, secondSummaryLowerBound, secondSummaryUpperBound)); - ManifestFile manifest = new GenericManifestFile( - PATH, LENGTH, SPEC_ID, ManifestContent.DATA, SEQ_NUM, MIN_SEQ_NUM, SNAPSHOT_ID, - ADDED_FILES, ADDED_ROWS, EXISTING_FILES, EXISTING_ROWS, DELETED_FILES, DELETED_ROWS, - partitionFieldSummaries, KEY_METADATA); + List partitionFieldSummaries = + Lists.newArrayList( + new GenericPartitionFieldSummary(false, firstSummaryLowerBound, firstSummaryUpperBound), + new GenericPartitionFieldSummary( + true, false, secondSummaryLowerBound, secondSummaryUpperBound)); + ManifestFile manifest = + new GenericManifestFile( + PATH, + LENGTH, + SPEC_ID, + ManifestContent.DATA, + SEQ_NUM, + MIN_SEQ_NUM, + SNAPSHOT_ID, + ADDED_FILES, + ADDED_ROWS, + EXISTING_FILES, + EXISTING_ROWS, + DELETED_FILES, + DELETED_ROWS, + partitionFieldSummaries, + KEY_METADATA); InputFile manifestList = writeManifestList(manifest, 2); List files = ManifestLists.read(manifestList); ManifestFile returnedManifest = Iterables.getOnlyElement(files); - Assert.assertEquals("Number of partition field summaries should match", - 2, returnedManifest.partitions().size()); + Assert.assertEquals( + "Number of partition field summaries should match", + 2, + returnedManifest.partitions().size()); ManifestFile.PartitionFieldSummary first = returnedManifest.partitions().get(0); - Assert.assertFalse("First partition field summary should not contain null", first.containsNull()); + Assert.assertFalse( + "First partition field summary should not contain null", first.containsNull()); Assert.assertNull("First partition field summary has unknown NaN", first.containsNaN()); - Assert.assertEquals("Lower bound for first partition field summary should match", - firstSummaryLowerBound, first.lowerBound()); - Assert.assertEquals("Upper bound for first partition field summary should match", - firstSummaryUpperBound, first.upperBound()); + 
Assert.assertEquals( + "Lower bound for first partition field summary should match", + firstSummaryLowerBound, + first.lowerBound()); + Assert.assertEquals( + "Upper bound for first partition field summary should match", + firstSummaryUpperBound, + first.upperBound()); ManifestFile.PartitionFieldSummary second = returnedManifest.partitions().get(1); Assert.assertTrue("Second partition field summary should contain null", second.containsNull()); - Assert.assertFalse("Second partition field summary should not contain NaN", second.containsNaN()); - Assert.assertEquals("Lower bound for second partition field summary should match", - secondSummaryLowerBound, second.lowerBound()); - Assert.assertEquals("Upper bound for second partition field summary should match", - secondSummaryUpperBound, second.upperBound()); + Assert.assertFalse( + "Second partition field summary should not contain NaN", second.containsNaN()); + Assert.assertEquals( + "Lower bound for second partition field summary should match", + secondSummaryLowerBound, + second.lowerBound()); + Assert.assertEquals( + "Upper bound for second partition field summary should match", + secondSummaryUpperBound, + second.upperBound()); } private InputFile writeManifestList(ManifestFile manifest, int formatVersion) throws IOException { OutputFile manifestList = new InMemoryOutputFile(); - try (FileAppender writer = ManifestLists.write( - formatVersion, manifestList, SNAPSHOT_ID, SNAPSHOT_ID - 1, formatVersion > 1 ? SEQ_NUM : 0)) { + try (FileAppender writer = + ManifestLists.write( + formatVersion, + manifestList, + SNAPSHOT_ID, + SNAPSHOT_ID - 1, + formatVersion > 1 ? SEQ_NUM : 0)) { writer.add(manifest); } return manifestList.toInputFile(); } private GenericData.Record readGeneric(InputFile manifestList, Schema schema) throws IOException { - try (CloseableIterable files = Avro.read(manifestList) - .project(schema) - .reuseContainers(false) - .build()) { + try (CloseableIterable files = + Avro.read(manifestList).project(schema).reuseContainers(false).build()) { List records = Lists.newLinkedList(files); Assert.assertEquals("Should contain one manifest", 1, records.size()); return records.get(0); @@ -272,7 +355,8 @@ private GenericData.Record readGeneric(InputFile manifestList, Schema schema) th } private ManifestFile writeAndReadManifestList(int formatVersion) throws IOException { - List manifests = ManifestLists.read(writeManifestList(TEST_MANIFEST, formatVersion)); + List manifests = + ManifestLists.read(writeManifestList(TEST_MANIFEST, formatVersion)); Assert.assertEquals("Should contain one manifest", 1, manifests.size()); return manifests.get(0); } diff --git a/core/src/test/java/org/apache/iceberg/TestManifestReader.java b/core/src/test/java/org/apache/iceberg/TestManifestReader.java index 8061e33f5efa..dfc84200fdb2 100644 --- a/core/src/test/java/org/apache/iceberg/TestManifestReader.java +++ b/core/src/test/java/org/apache/iceberg/TestManifestReader.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg; import java.io.IOException; @@ -38,7 +37,7 @@ public class TestManifestReader extends TableTestBase { @Parameterized.Parameters(name = "formatVersion = {0}") public static Object[] parameters() { - return new Object[] { 1, 2 }; + return new Object[] {1, 2}; } public TestManifestReader(int formatVersion) { @@ -59,16 +58,18 @@ public void testManifestReaderWithEmptyInheritableMetadata() throws IOException @Test public void testReaderWithFilterWithoutSelect() throws IOException { ManifestFile manifest = writeManifest(1000L, FILE_A, FILE_B, FILE_C); - try (ManifestReader reader = ManifestFiles.read(manifest, FILE_IO) - .filterRows(Expressions.equal("id", 0))) { - List files = Streams.stream(reader) - .map(file -> file.path().toString()) - .collect(Collectors.toList()); + try (ManifestReader reader = + ManifestFiles.read(manifest, FILE_IO).filterRows(Expressions.equal("id", 0))) { + List files = + Streams.stream(reader).map(file -> file.path().toString()).collect(Collectors.toList()); - // note that all files are returned because the reader returns data files that may match, and the partition is + // note that all files are returned because the reader returns data files that may match, and + // the partition is // bucketing by data, which doesn't help filter files - Assert.assertEquals("Should read the expected files", - Lists.newArrayList(FILE_A.path(), FILE_B.path(), FILE_C.path()), files); + Assert.assertEquals( + "Should read the expected files", + Lists.newArrayList(FILE_A.path(), FILE_B.path(), FILE_C.path()), + files); } } @@ -77,7 +78,8 @@ public void testInvalidUsage() throws IOException { ManifestFile manifest = writeManifest(FILE_A, FILE_B); AssertHelpers.assertThrows( "Should not be possible to read manifest without explicit snapshot ids and inheritable metadata", - IllegalArgumentException.class, "Cannot read from ManifestFile with null (unassigned) snapshot ID", + IllegalArgumentException.class, + "Cannot read from ManifestFile with null (unassigned) snapshot ID", () -> ManifestFiles.read(manifest, FILE_IO)); } @@ -88,7 +90,8 @@ public void testManifestReaderWithPartitionMetadata() throws IOException { ManifestEntry entry = Iterables.getOnlyElement(reader.entries()); Assert.assertEquals(123L, (long) entry.snapshotId()); - List fields = ((PartitionData) entry.file().partition()).getPartitionType().fields(); + List fields = + ((PartitionData) entry.file().partition()).getPartitionType().fields(); Assert.assertEquals(1, fields.size()); Assert.assertEquals(1000, fields.get(0).fieldId()); Assert.assertEquals("data_bucket", fields.get(0).name()); @@ -98,10 +101,8 @@ public void testManifestReaderWithPartitionMetadata() throws IOException { @Test public void testManifestReaderWithUpdatedPartitionMetadataForV1Table() throws IOException { - PartitionSpec spec = PartitionSpec.builderFor(table.schema()) - .bucket("id", 8) - .bucket("data", 16) - .build(); + PartitionSpec spec = + PartitionSpec.builderFor(table.schema()).bucket("id", 8).bucket("data", 16).build(); table.ops().commit(table.ops().current(), table.ops().current().updatePartitionSpec(spec)); ManifestFile manifest = writeManifest(1000L, manifestEntry(Status.EXISTING, 123L, FILE_A)); @@ -109,7 +110,8 @@ public void testManifestReaderWithUpdatedPartitionMetadataForV1Table() throws IO ManifestEntry entry = Iterables.getOnlyElement(reader.entries()); Assert.assertEquals(123L, (long) entry.snapshotId()); - List fields = ((PartitionData) entry.file().partition()).getPartitionType().fields(); + List 
fields = + ((PartitionData) entry.file().partition()).getPartitionType().fields(); Assert.assertEquals(2, fields.size()); Assert.assertEquals(1000, fields.get(0).fieldId()); Assert.assertEquals("id_bucket", fields.get(0).name()); @@ -128,7 +130,8 @@ public void testDataFilePositions() throws IOException { long expectedPos = 0L; for (DataFile file : reader) { Assert.assertEquals("Position should match", (Long) expectedPos, file.pos()); - Assert.assertEquals("Position from field index should match", expectedPos, ((BaseFile) file).get(17)); + Assert.assertEquals( + "Position from field index should match", expectedPos, ((BaseFile) file).get(17)); expectedPos += 1; } } @@ -137,12 +140,15 @@ public void testDataFilePositions() throws IOException { @Test public void testDeleteFilePositions() throws IOException { Assume.assumeTrue("Delete files only work for format version 2", formatVersion == 2); - ManifestFile manifest = writeDeleteManifest(formatVersion, 1000L, FILE_A_DELETES, FILE_B_DELETES); - try (ManifestReader reader = ManifestFiles.readDeleteManifest(manifest, FILE_IO, null)) { + ManifestFile manifest = + writeDeleteManifest(formatVersion, 1000L, FILE_A_DELETES, FILE_B_DELETES); + try (ManifestReader reader = + ManifestFiles.readDeleteManifest(manifest, FILE_IO, null)) { long expectedPos = 0L; for (DeleteFile file : reader) { Assert.assertEquals("Position should match", (Long) expectedPos, file.pos()); - Assert.assertEquals("Position from field index should match", expectedPos, ((BaseFile) file).get(17)); + Assert.assertEquals( + "Position from field index should match", expectedPos, ((BaseFile) file).get(17)); expectedPos += 1; } } diff --git a/core/src/test/java/org/apache/iceberg/TestManifestReaderStats.java b/core/src/test/java/org/apache/iceberg/TestManifestReaderStats.java index a3c4adb6cdb9..530d9544a210 100644 --- a/core/src/test/java/org/apache/iceberg/TestManifestReaderStats.java +++ b/core/src/test/java/org/apache/iceberg/TestManifestReaderStats.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg; import java.io.IOException; @@ -37,7 +36,7 @@ public class TestManifestReaderStats extends TableTestBase { @Parameterized.Parameters(name = "formatVersion = {0}") public static Object[] parameters() { - return new Object[] { 1, 2 }; + return new Object[] {1, 2}; } public TestManifestReaderStats(int formatVersion) { @@ -52,16 +51,18 @@ public TestManifestReaderStats(int formatVersion) { private static final Map UPPER_BOUNDS = ImmutableMap.of(3, Conversions.toByteBuffer(Types.IntegerType.get(), 4)); - private static final Metrics METRICS = new Metrics(3L, null, - VALUE_COUNT, NULL_VALUE_COUNTS, NAN_VALUE_COUNTS, LOWER_BOUNDS, UPPER_BOUNDS); + private static final Metrics METRICS = + new Metrics( + 3L, null, VALUE_COUNT, NULL_VALUE_COUNTS, NAN_VALUE_COUNTS, LOWER_BOUNDS, UPPER_BOUNDS); private static final String FILE_PATH = "/path/to/data-a.parquet"; - private static final DataFile FILE = DataFiles.builder(SPEC) - .withPath(FILE_PATH) - .withFileSizeInBytes(10) - .withPartitionPath("data_bucket=0") // easy way to set partition data for now - .withRecordCount(3) - .withMetrics(METRICS) - .build(); + private static final DataFile FILE = + DataFiles.builder(SPEC) + .withPath(FILE_PATH) + .withFileSizeInBytes(10) + .withPartitionPath("data_bucket=0") // easy way to set partition data for now + .withRecordCount(3) + .withMetrics(METRICS) + .build(); @Test public void testReadIncludesFullStats() throws IOException { @@ -76,8 +77,8 @@ public void testReadIncludesFullStats() throws IOException { @Test public void testReadEntriesWithFilterIncludesFullStats() throws IOException { ManifestFile manifest = writeManifest(1000L, FILE); - try (ManifestReader reader = ManifestFiles.read(manifest, FILE_IO) - .filterRows(Expressions.equal("id", 3))) { + try (ManifestReader reader = + ManifestFiles.read(manifest, FILE_IO).filterRows(Expressions.equal("id", 3))) { CloseableIterable> entries = reader.entries(); ManifestEntry entry = entries.iterator().next(); assertFullStats(entry.file()); @@ -87,8 +88,8 @@ public void testReadEntriesWithFilterIncludesFullStats() throws IOException { @Test public void testReadIteratorWithFilterIncludesFullStats() throws IOException { ManifestFile manifest = writeManifest(1000L, FILE); - try (ManifestReader reader = ManifestFiles.read(manifest, FILE_IO) - .filterRows(Expressions.equal("id", 3))) { + try (ManifestReader reader = + ManifestFiles.read(manifest, FILE_IO).filterRows(Expressions.equal("id", 3))) { DataFile entry = reader.iterator().next(); assertFullStats(entry); } @@ -97,9 +98,10 @@ public void testReadIteratorWithFilterIncludesFullStats() throws IOException { @Test public void testReadEntriesWithFilterAndSelectIncludesFullStats() throws IOException { ManifestFile manifest = writeManifest(1000L, FILE); - try (ManifestReader reader = ManifestFiles.read(manifest, FILE_IO) - .select(ImmutableList.of("file_path")) - .filterRows(Expressions.equal("id", 3))) { + try (ManifestReader reader = + ManifestFiles.read(manifest, FILE_IO) + .select(ImmutableList.of("file_path")) + .filterRows(Expressions.equal("id", 3))) { CloseableIterable> entries = reader.entries(); ManifestEntry entry = entries.iterator().next(); assertFullStats(entry.file()); @@ -109,9 +111,10 @@ public void testReadEntriesWithFilterAndSelectIncludesFullStats() throws IOExcep @Test public void testReadIteratorWithFilterAndSelectDropsStats() throws IOException { ManifestFile manifest = writeManifest(1000L, FILE); - try (ManifestReader reader = ManifestFiles.read(manifest, FILE_IO) 
- .select(ImmutableList.of("file_path")) - .filterRows(Expressions.equal("id", 3))) { + try (ManifestReader reader = + ManifestFiles.read(manifest, FILE_IO) + .select(ImmutableList.of("file_path")) + .filterRows(Expressions.equal("id", 3))) { DataFile entry = reader.iterator().next(); assertStatsDropped(entry); } @@ -120,9 +123,10 @@ public void testReadIteratorWithFilterAndSelectDropsStats() throws IOException { @Test public void testReadIteratorWithFilterAndSelectRecordCountDropsStats() throws IOException { ManifestFile manifest = writeManifest(1000L, FILE); - try (ManifestReader reader = ManifestFiles.read(manifest, FILE_IO) - .select(ImmutableList.of("file_path", "record_count")) - .filterRows(Expressions.equal("id", 3))) { + try (ManifestReader reader = + ManifestFiles.read(manifest, FILE_IO) + .select(ImmutableList.of("file_path", "record_count")) + .filterRows(Expressions.equal("id", 3))) { DataFile entry = reader.iterator().next(); assertStatsDropped(entry); } @@ -131,9 +135,10 @@ public void testReadIteratorWithFilterAndSelectRecordCountDropsStats() throws IO @Test public void testReadIteratorWithFilterAndSelectStatsIncludesFullStats() throws IOException { ManifestFile manifest = writeManifest(1000L, FILE); - try (ManifestReader reader = ManifestFiles.read(manifest, FILE_IO) - .select(ImmutableList.of("file_path", "value_counts")) - .filterRows(Expressions.equal("id", 3))) { + try (ManifestReader reader = + ManifestFiles.read(manifest, FILE_IO) + .select(ImmutableList.of("file_path", "value_counts")) + .filterRows(Expressions.equal("id", 3))) { DataFile entry = reader.iterator().next(); assertFullStats(entry); @@ -145,8 +150,8 @@ public void testReadIteratorWithFilterAndSelectStatsIncludesFullStats() throws I @Test public void testReadEntriesWithSelectNotProjectStats() throws IOException { ManifestFile manifest = writeManifest(1000L, FILE); - try (ManifestReader reader = ManifestFiles.read(manifest, FILE_IO) - .select(ImmutableList.of("file_path"))) { + try (ManifestReader reader = + ManifestFiles.read(manifest, FILE_IO).select(ImmutableList.of("file_path"))) { CloseableIterable> entries = reader.entries(); ManifestEntry entry = entries.iterator().next(); DataFile dataFile = entry.file(); @@ -168,8 +173,9 @@ public void testReadEntriesWithSelectNotProjectStats() throws IOException { @Test public void testReadEntriesWithSelectCertainStatNotProjectStats() throws IOException { ManifestFile manifest = writeManifest(1000L, FILE); - try (ManifestReader reader = ManifestFiles.read(manifest, FILE_IO) - .select(ImmutableList.of("file_path", "value_counts"))) { + try (ManifestReader reader = + ManifestFiles.read(manifest, FILE_IO) + .select(ImmutableList.of("file_path", "value_counts"))) { DataFile dataFile = reader.iterator().next(); // selected fields are populated @@ -199,7 +205,8 @@ private void assertFullStats(DataFile dataFile) { } private void assertStatsDropped(DataFile dataFile) { - Assert.assertEquals(3, dataFile.recordCount()); // record count is not considered as droppable stats + Assert.assertEquals( + 3, dataFile.recordCount()); // record count is not considered as droppable stats Assert.assertNull(dataFile.columnSizes()); Assert.assertNull(dataFile.valueCounts()); Assert.assertNull(dataFile.nullValueCounts()); @@ -217,5 +224,4 @@ private void assertNullRecordCount(DataFile dataFile) { NullPointerException.class, dataFile::recordCount); } - } diff --git a/core/src/test/java/org/apache/iceberg/TestManifestWriter.java 
b/core/src/test/java/org/apache/iceberg/TestManifestWriter.java index a6866465e6a8..4b7f4d3b8f92 100644 --- a/core/src/test/java/org/apache/iceberg/TestManifestWriter.java +++ b/core/src/test/java/org/apache/iceberg/TestManifestWriter.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg; import java.io.File; @@ -37,7 +36,7 @@ public class TestManifestWriter extends TableTestBase { @Parameterized.Parameters(name = "formatVersion = {0}") public static Object[] parameters() { - return new Object[] { 1, 2 }; + return new Object[] {1, 2}; } public TestManifestWriter(int formatVersion) { @@ -46,25 +45,28 @@ public TestManifestWriter(int formatVersion) { @Test public void testManifestStats() throws IOException { - ManifestFile manifest = writeManifest( - "manifest.avro", - manifestEntry(Status.ADDED, null, newFile(10)), - manifestEntry(Status.ADDED, null, newFile(20)), - manifestEntry(Status.ADDED, null, newFile(5)), - manifestEntry(Status.ADDED, null, newFile(5)), - manifestEntry(Status.EXISTING, null, newFile(15)), - manifestEntry(Status.EXISTING, null, newFile(10)), - manifestEntry(Status.EXISTING, null, newFile(1)), - manifestEntry(Status.DELETED, null, newFile(5)), - manifestEntry(Status.DELETED, null, newFile(2))); + ManifestFile manifest = + writeManifest( + "manifest.avro", + manifestEntry(Status.ADDED, null, newFile(10)), + manifestEntry(Status.ADDED, null, newFile(20)), + manifestEntry(Status.ADDED, null, newFile(5)), + manifestEntry(Status.ADDED, null, newFile(5)), + manifestEntry(Status.EXISTING, null, newFile(15)), + manifestEntry(Status.EXISTING, null, newFile(10)), + manifestEntry(Status.EXISTING, null, newFile(1)), + manifestEntry(Status.DELETED, null, newFile(5)), + manifestEntry(Status.DELETED, null, newFile(2))); Assert.assertTrue("Added files should be present", manifest.hasAddedFiles()); Assert.assertEquals("Added files count should match", 4, (int) manifest.addedFilesCount()); Assert.assertEquals("Added rows count should match", 40L, (long) manifest.addedRowsCount()); Assert.assertTrue("Existing files should be present", manifest.hasExistingFiles()); - Assert.assertEquals("Existing files count should match", 3, (int) manifest.existingFilesCount()); - Assert.assertEquals("Existing rows count should match", 26L, (long) manifest.existingRowsCount()); + Assert.assertEquals( + "Existing files count should match", 3, (int) manifest.existingFilesCount()); + Assert.assertEquals( + "Existing rows count should match", 26L, (long) manifest.existingRowsCount()); Assert.assertTrue("Deleted files should be present", manifest.hasDeletedFiles()); Assert.assertEquals("Deleted files count should match", 2, (int) manifest.deletedFilesCount()); @@ -73,20 +75,25 @@ public void testManifestStats() throws IOException { @Test public void testManifestPartitionStats() throws IOException { - ManifestFile manifest = writeManifest( - "manifest.avro", - manifestEntry(Status.ADDED, null, newFile(10, TestHelpers.Row.of(1))), - manifestEntry(Status.EXISTING, null, newFile(15, TestHelpers.Row.of(2))), - manifestEntry(Status.DELETED, null, newFile(2, TestHelpers.Row.of(3)))); + ManifestFile manifest = + writeManifest( + "manifest.avro", + manifestEntry(Status.ADDED, null, newFile(10, TestHelpers.Row.of(1))), + manifestEntry(Status.EXISTING, null, newFile(15, TestHelpers.Row.of(2))), + manifestEntry(Status.DELETED, null, newFile(2, TestHelpers.Row.of(3)))); List partitions = manifest.partitions(); Assert.assertEquals("Partition 
field summaries count should match", 1, partitions.size()); ManifestFile.PartitionFieldSummary partitionFieldSummary = partitions.get(0); Assert.assertFalse("contains_null should be false", partitionFieldSummary.containsNull()); Assert.assertFalse("contains_nan should be false", partitionFieldSummary.containsNaN()); - Assert.assertEquals("Lower bound should match", Integer.valueOf(1), + Assert.assertEquals( + "Lower bound should match", + Integer.valueOf(1), Conversions.fromByteBuffer(Types.IntegerType.get(), partitionFieldSummary.lowerBound())); - Assert.assertEquals("Upper bound should match", Integer.valueOf(3), + Assert.assertEquals( + "Upper bound should match", + Integer.valueOf(3), Conversions.fromByteBuffer(Types.IntegerType.get(), partitionFieldSummary.upperBound())); } @@ -96,15 +103,18 @@ public void testWriteManifestWithSequenceNumber() throws IOException { File manifestFile = temp.newFile("manifest.avro"); Assert.assertTrue(manifestFile.delete()); OutputFile outputFile = table.ops().io().newOutputFile(manifestFile.getCanonicalPath()); - ManifestWriter writer = ManifestFiles.write(formatVersion, table.spec(), outputFile, 1L); + ManifestWriter writer = + ManifestFiles.write(formatVersion, table.spec(), outputFile, 1L); writer.add(newFile(10, TestHelpers.Row.of(1)), 1000L); writer.close(); ManifestFile manifest = writer.toManifestFile(); Assert.assertEquals("Manifest should have no sequence number", -1L, manifest.sequenceNumber()); ManifestReader manifestReader = ManifestFiles.read(manifest, table.io()); for (ManifestEntry entry : manifestReader.entries()) { - Assert.assertEquals("Custom sequence number should be used for all manifest entries", - 1000L, (long) entry.sequenceNumber()); + Assert.assertEquals( + "Custom sequence number should be used for all manifest entries", + 1000L, + (long) entry.sequenceNumber()); } } @@ -114,10 +124,11 @@ private DataFile newFile(long recordCount) { private DataFile newFile(long recordCount, StructLike partition) { String fileName = UUID.randomUUID().toString(); - DataFiles.Builder builder = DataFiles.builder(SPEC) - .withPath("data_bucket=0/" + fileName + ".parquet") - .withFileSizeInBytes(1024) - .withRecordCount(recordCount); + DataFiles.Builder builder = + DataFiles.builder(SPEC) + .withPath("data_bucket=0/" + fileName + ".parquet") + .withFileSizeInBytes(1024) + .withRecordCount(recordCount); if (partition != null) { builder.withPartition(partition); } diff --git a/core/src/test/java/org/apache/iceberg/TestManifestWriterVersions.java b/core/src/test/java/org/apache/iceberg/TestManifestWriterVersions.java index 79aa35db974c..9c606f5e7e60 100644 --- a/core/src/test/java/org/apache/iceberg/TestManifestWriterVersions.java +++ b/core/src/test/java/org/apache/iceberg/TestManifestWriterVersions.java @@ -16,9 +16,10 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg; +import static org.apache.iceberg.types.Types.NestedField.required; + import java.io.IOException; import java.util.List; import org.apache.iceberg.io.CloseableIterable; @@ -37,51 +38,64 @@ import org.junit.Test; import org.junit.rules.TemporaryFolder; -import static org.apache.iceberg.types.Types.NestedField.required; - public class TestManifestWriterVersions { private static final FileIO FILE_IO = new TestTables.LocalFileIO(); - private static final Schema SCHEMA = new Schema( - required(1, "id", Types.LongType.get()), - required(2, "timestamp", Types.TimestampType.withZone()), - required(3, "category", Types.StringType.get()), - required(4, "data", Types.StringType.get()), - required(5, "double", Types.DoubleType.get())); - - private static final PartitionSpec SPEC = PartitionSpec.builderFor(SCHEMA) - .identity("category") - .hour("timestamp") - .bucket("id", 16) - .build(); + private static final Schema SCHEMA = + new Schema( + required(1, "id", Types.LongType.get()), + required(2, "timestamp", Types.TimestampType.withZone()), + required(3, "category", Types.StringType.get()), + required(4, "data", Types.StringType.get()), + required(5, "double", Types.DoubleType.get())); + + private static final PartitionSpec SPEC = + PartitionSpec.builderFor(SCHEMA) + .identity("category") + .hour("timestamp") + .bucket("id", 16) + .build(); private static final long SEQUENCE_NUMBER = 34L; private static final long SNAPSHOT_ID = 987134631982734L; - private static final String PATH = "s3://bucket/table/category=cheesy/timestamp_hour=10/id_bucket=3/file.avro"; + private static final String PATH = + "s3://bucket/table/category=cheesy/timestamp_hour=10/id_bucket=3/file.avro"; private static final FileFormat FORMAT = FileFormat.AVRO; - private static final PartitionData PARTITION = DataFiles.data(SPEC, "category=cheesy/timestamp_hour=10/id_bucket=3"); - private static final Metrics METRICS = new Metrics( - 1587L, - ImmutableMap.of(1, 15L, 2, 122L, 3, 4021L, 4, 9411L, 5, 15L), // sizes - ImmutableMap.of(1, 100L, 2, 100L, 3, 100L, 4, 100L, 5, 100L), // value counts - ImmutableMap.of(1, 0L, 2, 0L, 3, 0L, 4, 0L, 5, 0L), // null value counts - ImmutableMap.of(5, 10L), // nan value counts - ImmutableMap.of(1, Conversions.toByteBuffer(Types.IntegerType.get(), 1)), // lower bounds - ImmutableMap.of(1, Conversions.toByteBuffer(Types.IntegerType.get(), 1))); // upper bounds + private static final PartitionData PARTITION = + DataFiles.data(SPEC, "category=cheesy/timestamp_hour=10/id_bucket=3"); + private static final Metrics METRICS = + new Metrics( + 1587L, + ImmutableMap.of(1, 15L, 2, 122L, 3, 4021L, 4, 9411L, 5, 15L), // sizes + ImmutableMap.of(1, 100L, 2, 100L, 3, 100L, 4, 100L, 5, 100L), // value counts + ImmutableMap.of(1, 0L, 2, 0L, 3, 0L, 4, 0L, 5, 0L), // null value counts + ImmutableMap.of(5, 10L), // nan value counts + ImmutableMap.of(1, Conversions.toByteBuffer(Types.IntegerType.get(), 1)), // lower bounds + ImmutableMap.of(1, Conversions.toByteBuffer(Types.IntegerType.get(), 1))); // upper bounds private static final List OFFSETS = ImmutableList.of(4L); private static final Integer SORT_ORDER_ID = 2; - private static final DataFile DATA_FILE = new GenericDataFile( - 0, PATH, FORMAT, PARTITION, 150972L, METRICS, null, OFFSETS, SORT_ORDER_ID); + private static final DataFile DATA_FILE = + new GenericDataFile( + 0, PATH, FORMAT, PARTITION, 150972L, METRICS, null, OFFSETS, SORT_ORDER_ID); private static final List EQUALITY_IDS = ImmutableList.of(1); - private static final 
int[] EQUALITY_ID_ARR = new int[] { 1 }; - - private static final DeleteFile DELETE_FILE = new GenericDeleteFile( - 0, FileContent.EQUALITY_DELETES, PATH, FORMAT, PARTITION, 22905L, METRICS, EQUALITY_ID_ARR, SORT_ORDER_ID, null); - - @Rule - public TemporaryFolder temp = new TemporaryFolder(); + private static final int[] EQUALITY_ID_ARR = new int[] {1}; + + private static final DeleteFile DELETE_FILE = + new GenericDeleteFile( + 0, + FileContent.EQUALITY_DELETES, + PATH, + FORMAT, + PARTITION, + 22905L, + METRICS, + EQUALITY_ID_ARR, + SORT_ORDER_ID, + null); + + @Rule public TemporaryFolder temp = new TemporaryFolder(); @Test public void testV1Write() throws IOException { @@ -92,8 +106,10 @@ public void testV1Write() throws IOException { @Test public void testV1WriteDelete() { - AssertHelpers.assertThrows("Should fail to write a delete manifest for v1", - IllegalArgumentException.class, "Cannot write delete files in a v1 table", + AssertHelpers.assertThrows( + "Should fail to write a delete manifest for v1", + IllegalArgumentException.class, + "Cannot write delete files in a v1 table", () -> writeDeleteManifest(1)); } @@ -129,7 +145,8 @@ public void testV2WriteDelete() throws IOException { ManifestFile manifest = writeDeleteManifest(2); checkManifest(manifest, ManifestWriter.UNASSIGNED_SEQ); Assert.assertEquals("Content", ManifestContent.DELETES, manifest.content()); - checkEntry(readDeleteManifest(manifest), ManifestWriter.UNASSIGNED_SEQ, FileContent.EQUALITY_DELETES); + checkEntry( + readDeleteManifest(manifest), ManifestWriter.UNASSIGNED_SEQ, FileContent.EQUALITY_DELETES); } @Test @@ -169,7 +186,8 @@ public void testV2ManifestRewriteWithInheritance() throws IOException { // add the v2 manifest to a v2 manifest list, with a sequence number ManifestFile manifest2 = writeAndReadManifestList(rewritten, 2); - // the ManifestFile is new so it has a sequence number, but the min sequence number 0 is from the entry + // the ManifestFile is new so it has a sequence number, but the min sequence number 0 is from + // the entry checkRewrittenManifest(manifest2, SEQUENCE_NUMBER, 0L); // should not inherit the v2 sequence number because it was written into the v2 manifest @@ -183,7 +201,8 @@ void checkEntry(ManifestEntry entry, Long expectedSequenceNumber, FileContent checkDataFile(entry.file(), content); } - void checkRewrittenEntry(ManifestEntry entry, Long expectedSequenceNumber, FileContent content) { + void checkRewrittenEntry( + ManifestEntry entry, Long expectedSequenceNumber, FileContent content) { Assert.assertEquals("Status", ManifestEntry.Status.EXISTING, entry.status()); Assert.assertEquals("Snapshot ID", (Long) SNAPSHOT_ID, entry.snapshotId()); Assert.assertEquals("Sequence number", expectedSequenceNumber, entry.sequenceNumber()); @@ -214,7 +233,8 @@ void checkDataFile(ContentFile dataFile, FileContent content) { void checkManifest(ManifestFile manifest, long expectedSequenceNumber) { Assert.assertEquals("Snapshot ID", (Long) SNAPSHOT_ID, manifest.snapshotId()); Assert.assertEquals("Sequence number", expectedSequenceNumber, manifest.sequenceNumber()); - Assert.assertEquals("Min sequence number", expectedSequenceNumber, manifest.minSequenceNumber()); + Assert.assertEquals( + "Min sequence number", expectedSequenceNumber, manifest.minSequenceNumber()); Assert.assertEquals("Added files count", (Integer) 1, manifest.addedFilesCount()); Assert.assertEquals("Existing files count", (Integer) 0, manifest.existingFilesCount()); Assert.assertEquals("Deleted files count", (Integer) 0, 
manifest.deletedFilesCount()); @@ -223,10 +243,12 @@ void checkManifest(ManifestFile manifest, long expectedSequenceNumber) { Assert.assertEquals("Deleted rows count", (Long) 0L, manifest.deletedRowsCount()); } - void checkRewrittenManifest(ManifestFile manifest, long expectedSequenceNumber, long expectedMinSequenceNumber) { + void checkRewrittenManifest( + ManifestFile manifest, long expectedSequenceNumber, long expectedMinSequenceNumber) { Assert.assertEquals("Snapshot ID", (Long) SNAPSHOT_ID, manifest.snapshotId()); Assert.assertEquals("Sequence number", expectedSequenceNumber, manifest.sequenceNumber()); - Assert.assertEquals("Min sequence number", expectedMinSequenceNumber, manifest.minSequenceNumber()); + Assert.assertEquals( + "Min sequence number", expectedMinSequenceNumber, manifest.minSequenceNumber()); Assert.assertEquals("Added files count", (Integer) 0, manifest.addedFilesCount()); Assert.assertEquals("Existing files count", (Integer) 1, manifest.existingFilesCount()); Assert.assertEquals("Deleted files count", (Integer) 0, manifest.deletedFilesCount()); @@ -237,22 +259,31 @@ void checkRewrittenManifest(ManifestFile manifest, long expectedSequenceNumber, private InputFile writeManifestList(ManifestFile manifest, int formatVersion) throws IOException { OutputFile manifestList = new InMemoryOutputFile(); - try (FileAppender writer = ManifestLists.write( - formatVersion, manifestList, SNAPSHOT_ID, SNAPSHOT_ID - 1, formatVersion > 1 ? SEQUENCE_NUMBER : 0)) { + try (FileAppender writer = + ManifestLists.write( + formatVersion, + manifestList, + SNAPSHOT_ID, + SNAPSHOT_ID - 1, + formatVersion > 1 ? SEQUENCE_NUMBER : 0)) { writer.add(manifest); } return manifestList.toInputFile(); } - private ManifestFile writeAndReadManifestList(ManifestFile manifest, int formatVersion) throws IOException { + private ManifestFile writeAndReadManifestList(ManifestFile manifest, int formatVersion) + throws IOException { List manifests = ManifestLists.read(writeManifestList(manifest, formatVersion)); Assert.assertEquals("Should contain one manifest", 1, manifests.size()); return manifests.get(0); } - private ManifestFile rewriteManifest(ManifestFile manifest, int formatVersion) throws IOException { - OutputFile manifestFile = Files.localOutput(FileFormat.AVRO.addExtension(temp.newFile().toString())); - ManifestWriter writer = ManifestFiles.write(formatVersion, SPEC, manifestFile, SNAPSHOT_ID); + private ManifestFile rewriteManifest(ManifestFile manifest, int formatVersion) + throws IOException { + OutputFile manifestFile = + Files.localOutput(FileFormat.AVRO.addExtension(temp.newFile().toString())); + ManifestWriter writer = + ManifestFiles.write(formatVersion, SPEC, manifestFile, SNAPSHOT_ID); try { writer.existing(readManifest(manifest)); } finally { @@ -266,8 +297,10 @@ private ManifestFile writeManifest(int formatVersion) throws IOException { } private ManifestFile writeManifest(DataFile file, int formatVersion) throws IOException { - OutputFile manifestFile = Files.localOutput(FileFormat.AVRO.addExtension(temp.newFile().toString())); - ManifestWriter writer = ManifestFiles.write(formatVersion, SPEC, manifestFile, SNAPSHOT_ID); + OutputFile manifestFile = + Files.localOutput(FileFormat.AVRO.addExtension(temp.newFile().toString())); + ManifestWriter writer = + ManifestFiles.write(formatVersion, SPEC, manifestFile, SNAPSHOT_ID); try { writer.add(file); } finally { @@ -277,7 +310,8 @@ private ManifestFile writeManifest(DataFile file, int formatVersion) throws IOEx } private ManifestEntry 
readManifest(ManifestFile manifest) throws IOException { - try (CloseableIterable> reader = ManifestFiles.read(manifest, FILE_IO).entries()) { + try (CloseableIterable> reader = + ManifestFiles.read(manifest, FILE_IO).entries()) { List> files = Lists.newArrayList(reader); Assert.assertEquals("Should contain only one data file", 1, files.size()); return files.get(0); @@ -285,9 +319,10 @@ private ManifestEntry readManifest(ManifestFile manifest) throws IOExc } private ManifestFile writeDeleteManifest(int formatVersion) throws IOException { - OutputFile manifestFile = Files.localOutput(FileFormat.AVRO.addExtension(temp.newFile().toString())); - ManifestWriter writer = ManifestFiles.writeDeleteManifest( - formatVersion, SPEC, manifestFile, SNAPSHOT_ID); + OutputFile manifestFile = + Files.localOutput(FileFormat.AVRO.addExtension(temp.newFile().toString())); + ManifestWriter writer = + ManifestFiles.writeDeleteManifest(formatVersion, SPEC, manifestFile, SNAPSHOT_ID); try { writer.add(DELETE_FILE); } finally { @@ -298,7 +333,7 @@ private ManifestFile writeDeleteManifest(int formatVersion) throws IOException { private ManifestEntry readDeleteManifest(ManifestFile manifest) throws IOException { try (CloseableIterable> reader = - ManifestFiles.readDeleteManifest(manifest, FILE_IO, null).entries()) { + ManifestFiles.readDeleteManifest(manifest, FILE_IO, null).entries()) { List> entries = Lists.newArrayList(reader); Assert.assertEquals("Should contain only one data file", 1, entries.size()); return entries.get(0); diff --git a/core/src/test/java/org/apache/iceberg/TestMergeAppend.java b/core/src/test/java/org/apache/iceberg/TestMergeAppend.java index 2060bf572cdd..f0de418ab23b 100644 --- a/core/src/test/java/org/apache/iceberg/TestMergeAppend.java +++ b/core/src/test/java/org/apache/iceberg/TestMergeAppend.java @@ -16,9 +16,10 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg; +import static org.apache.iceberg.relocated.com.google.common.collect.Iterators.concat; + import java.io.File; import java.io.IOException; import java.util.List; @@ -36,13 +37,11 @@ import org.junit.runner.RunWith; import org.junit.runners.Parameterized; -import static org.apache.iceberg.relocated.com.google.common.collect.Iterators.concat; - @RunWith(Parameterized.class) public class TestMergeAppend extends TableTestBase { @Parameterized.Parameters(name = "formatVersion = {0}") public static Object[] parameters() { - return new Object[] { 1, 2 }; + return new Object[] {1, 2}; } public TestMergeAppend(int formatVersion) { @@ -57,22 +56,24 @@ public void testEmptyTableAppend() { Assert.assertNull("Should not have a current snapshot", base.currentSnapshot()); Assert.assertEquals("Last sequence number should be 0", 0, base.lastSequenceNumber()); - table.newAppend() - .appendFile(FILE_A) - .appendFile(FILE_B) - .commit(); + table.newAppend().appendFile(FILE_A).appendFile(FILE_B).commit(); Snapshot committedSnapshot = table.currentSnapshot(); Assert.assertNotNull("Should create a snapshot", table.currentSnapshot()); - V1Assert.assertEquals("Last sequence number should be 0", 0, table.ops().current().lastSequenceNumber()); - V2Assert.assertEquals("Last sequence number should be 1", 1, table.ops().current().lastSequenceNumber()); + V1Assert.assertEquals( + "Last sequence number should be 0", 0, table.ops().current().lastSequenceNumber()); + V2Assert.assertEquals( + "Last sequence number should be 1", 1, table.ops().current().lastSequenceNumber()); - Assert.assertEquals("Should create 1 manifest for initial write", - 1, committedSnapshot.allManifests(table.io()).size()); + Assert.assertEquals( + "Should create 1 manifest for initial write", + 1, + committedSnapshot.allManifests(table.io()).size()); long snapshotId = committedSnapshot.snapshotId(); - validateManifest(committedSnapshot.allManifests(table.io()).get(0), + validateManifest( + committedSnapshot.allManifests(table.io()).get(0), seqs(1, 1), ids(snapshotId, snapshotId), files(FILE_A, FILE_B), @@ -88,27 +89,32 @@ public void testEmptyTableAppendManifest() throws IOException { Assert.assertEquals("Last sequence number should be 0", 0, base.lastSequenceNumber()); ManifestFile manifest = writeManifest(FILE_A, FILE_B); - table.newAppend() - .appendManifest(manifest) - .commit(); + table.newAppend().appendManifest(manifest).commit(); Snapshot committedSnapshot = table.currentSnapshot(); Assert.assertNotNull("Should create a snapshot", table.currentSnapshot()); - V1Assert.assertEquals("Last sequence number should be 0", 0, table.ops().current().lastSequenceNumber()); - V2Assert.assertEquals("Last sequence number should be 1", 1, table.ops().current().lastSequenceNumber()); - Assert.assertEquals("Should create 1 manifest for initial write", - 1, committedSnapshot.allManifests(table.io()).size()); + V1Assert.assertEquals( + "Last sequence number should be 0", 0, table.ops().current().lastSequenceNumber()); + V2Assert.assertEquals( + "Last sequence number should be 1", 1, table.ops().current().lastSequenceNumber()); + Assert.assertEquals( + "Should create 1 manifest for initial write", + 1, + committedSnapshot.allManifests(table.io()).size()); long snapshotId = committedSnapshot.snapshotId(); - validateManifest(committedSnapshot.allManifests(table.io()).get(0), + validateManifest( + committedSnapshot.allManifests(table.io()).get(0), seqs(1, 1), ids(snapshotId, snapshotId), files(FILE_A, FILE_B), 
statuses(Status.ADDED, Status.ADDED)); // validate that the metadata summary is correct when using appendManifest - Assert.assertEquals("Summary metadata should include 2 added files", - "2", committedSnapshot.summary().get("added-data-files")); + Assert.assertEquals( + "Summary metadata should include 2 added files", + "2", + committedSnapshot.summary().get("added-data-files")); } @Test @@ -120,28 +126,30 @@ public void testEmptyTableAppendFilesAndManifest() throws IOException { Assert.assertEquals("Last sequence number should be 0", 0, base.lastSequenceNumber()); ManifestFile manifest = writeManifest(FILE_A, FILE_B); - table.newAppend() - .appendFile(FILE_C) - .appendFile(FILE_D) - .appendManifest(manifest) - .commit(); + table.newAppend().appendFile(FILE_C).appendFile(FILE_D).appendManifest(manifest).commit(); Snapshot committedSnapshot = table.currentSnapshot(); Assert.assertNotNull("Should create a snapshot", table.currentSnapshot()); - V1Assert.assertEquals("Last sequence number should be 0", 0, table.ops().current().lastSequenceNumber()); - V2Assert.assertEquals("Last sequence number should be 1", 1, table.ops().current().lastSequenceNumber()); - Assert.assertEquals("Should create 2 manifests for initial write", - 2, committedSnapshot.allManifests(table.io()).size()); + V1Assert.assertEquals( + "Last sequence number should be 0", 0, table.ops().current().lastSequenceNumber()); + V2Assert.assertEquals( + "Last sequence number should be 1", 1, table.ops().current().lastSequenceNumber()); + Assert.assertEquals( + "Should create 2 manifests for initial write", + 2, + committedSnapshot.allManifests(table.io()).size()); long snapshotId = committedSnapshot.snapshotId(); - validateManifest(committedSnapshot.allManifests(table.io()).get(0), + validateManifest( + committedSnapshot.allManifests(table.io()).get(0), seqs(1, 1), ids(snapshotId, snapshotId), files(FILE_C, FILE_D), statuses(Status.ADDED, Status.ADDED)); - validateManifest(committedSnapshot.allManifests(table.io()).get(1), + validateManifest( + committedSnapshot.allManifests(table.io()).get(1), seqs(1, 1), ids(snapshotId, snapshotId), files(FILE_A, FILE_B), @@ -156,15 +164,20 @@ public void testAppendWithManifestScanExecutor() { Assert.assertNull("Should not have a current snapshot", base.currentSnapshot()); Assert.assertEquals("Last sequence number should be 0", 0, base.lastSequenceNumber()); AtomicInteger scanThreadsIndex = new AtomicInteger(0); - table.newAppend() + table + .newAppend() .appendFile(FILE_A) .appendFile(FILE_B) - .scanManifestsWith(Executors.newFixedThreadPool(1, runnable -> { - Thread thread = new Thread(runnable); - thread.setName("scan-" + scanThreadsIndex.getAndIncrement()); - thread.setDaemon(true); // daemon threads will be terminated abruptly when the JVM exits - return thread; - })) + .scanManifestsWith( + Executors.newFixedThreadPool( + 1, + runnable -> { + Thread thread = new Thread(runnable); + thread.setName("scan-" + scanThreadsIndex.getAndIncrement()); + thread.setDaemon( + true); // daemon threads will be terminated abruptly when the JVM exits + return thread; + })) .commit(); Assert.assertTrue("Thread should be created in provided pool", scanThreadsIndex.get() > 0); Assert.assertNotNull("Should create a snapshot", table.currentSnapshot()); @@ -182,27 +195,26 @@ public void testMergeWithAppendFilesAndManifest() throws IOException { Assert.assertEquals("Last sequence number should be 0", 0, base.lastSequenceNumber()); ManifestFile manifest = writeManifest(FILE_A, FILE_B); - table.newAppend() - 
.appendFile(FILE_C) - .appendFile(FILE_D) - .appendManifest(manifest) - .commit(); + table.newAppend().appendFile(FILE_C).appendFile(FILE_D).appendManifest(manifest).commit(); Snapshot committedSnapshot = table.currentSnapshot(); Assert.assertNotNull("Should create a snapshot", table.currentSnapshot()); - V1Assert.assertEquals("Last sequence number should be 0", 0, table.ops().current().lastSequenceNumber()); - V2Assert.assertEquals("Last sequence number should be 1", 1, table.ops().current().lastSequenceNumber()); + V1Assert.assertEquals( + "Last sequence number should be 0", 0, table.ops().current().lastSequenceNumber()); + V2Assert.assertEquals( + "Last sequence number should be 1", 1, table.ops().current().lastSequenceNumber()); long snapshotId = committedSnapshot.snapshotId(); - Assert.assertEquals("Should create 1 merged manifest", 1, committedSnapshot.allManifests(table.io()).size()); + Assert.assertEquals( + "Should create 1 merged manifest", 1, committedSnapshot.allManifests(table.io()).size()); - validateManifest(committedSnapshot.allManifests(table.io()).get(0), + validateManifest( + committedSnapshot.allManifests(table.io()).get(0), seqs(1, 1, 1, 1), ids(snapshotId, snapshotId, snapshotId, snapshotId), files(FILE_C, FILE_D, FILE_A, FILE_B), - statuses(Status.ADDED, Status.ADDED, Status.ADDED, Status.ADDED) - ); + statuses(Status.ADDED, Status.ADDED, Status.ADDED, Status.ADDED)); } @Test @@ -212,58 +224,63 @@ public void testMergeWithExistingManifest() { Assert.assertEquals("Last sequence number should be 0", 0, readMetadata().lastSequenceNumber()); Assert.assertEquals("Table should start empty", 0, listManifestFiles().size()); - table.newAppend() - .appendFile(FILE_A) - .appendFile(FILE_B) - .commit(); + table.newAppend().appendFile(FILE_A).appendFile(FILE_B).commit(); Assert.assertNotNull("Should create a snapshot", table.currentSnapshot()); - V1Assert.assertEquals("Last sequence number should be 0", 0, table.ops().current().lastSequenceNumber()); - V2Assert.assertEquals("Last sequence number should be 1", 1, table.ops().current().lastSequenceNumber()); + V1Assert.assertEquals( + "Last sequence number should be 0", 0, table.ops().current().lastSequenceNumber()); + V2Assert.assertEquals( + "Last sequence number should be 1", 1, table.ops().current().lastSequenceNumber()); TableMetadata base = readMetadata(); Snapshot commitBefore = table.currentSnapshot(); long baseId = commitBefore.snapshotId(); validateSnapshot(null, commitBefore, 1, FILE_A, FILE_B); - Assert.assertEquals("Should create 1 manifest for initial write", - 1, commitBefore.allManifests(table.io()).size()); + Assert.assertEquals( + "Should create 1 manifest for initial write", + 1, + commitBefore.allManifests(table.io()).size()); ManifestFile initialManifest = base.currentSnapshot().allManifests(table.io()).get(0); - validateManifest(initialManifest, + validateManifest( + initialManifest, seqs(1, 1), ids(baseId, baseId), files(FILE_A, FILE_B), statuses(Status.ADDED, Status.ADDED)); - table.newAppend() - .appendFile(FILE_C) - .appendFile(FILE_D) - .commit(); - V1Assert.assertEquals("Last sequence number should be 0", 0, table.ops().current().lastSequenceNumber()); - V2Assert.assertEquals("Last sequence number should be 2", 2, table.ops().current().lastSequenceNumber()); + table.newAppend().appendFile(FILE_C).appendFile(FILE_D).commit(); + V1Assert.assertEquals( + "Last sequence number should be 0", 0, table.ops().current().lastSequenceNumber()); + V2Assert.assertEquals( + "Last sequence number should be 2", 2, 
table.ops().current().lastSequenceNumber()); Snapshot committedAfter = table.currentSnapshot(); - Assert.assertEquals("Should contain 1 merged manifest for second write", - 1, committedAfter.allManifests(table.io()).size()); + Assert.assertEquals( + "Should contain 1 merged manifest for second write", + 1, + committedAfter.allManifests(table.io()).size()); ManifestFile newManifest = committedAfter.allManifests(table.io()).get(0); - Assert.assertNotEquals("Should not contain manifest from initial write", - initialManifest, newManifest); + Assert.assertNotEquals( + "Should not contain manifest from initial write", initialManifest, newManifest); long snapshotId = committedAfter.snapshotId(); - validateManifest(newManifest, + validateManifest( + newManifest, seqs(2, 2, 1, 1), ids(snapshotId, snapshotId, baseId, baseId), concat(files(FILE_C, FILE_D), files(initialManifest)), - statuses(Status.ADDED, Status.ADDED, Status.EXISTING, Status.EXISTING) - ); + statuses(Status.ADDED, Status.ADDED, Status.EXISTING, Status.EXISTING)); } @Test public void testManifestMergeMinCount() throws IOException { Assert.assertEquals("Table should start empty", 0, listManifestFiles().size()); - table.updateProperties().set(TableProperties.MANIFEST_MIN_MERGE_COUNT, "2") + table + .updateProperties() + .set(TableProperties.MANIFEST_MIN_MERGE_COUNT, "2") // Each initial v1/v2 ManifestFile is 5661/6397 bytes respectively. Merging two of the given // manifests make one v1/v2 ManifestFile of 5672/6408 bytes respectively, so 15000 bytes // limit will give us two bins with three manifest/data files. @@ -277,7 +294,8 @@ public void testManifestMergeMinCount() throws IOException { ManifestFile manifest = writeManifest(FILE_A); ManifestFile manifest2 = writeManifestWithName("FILE_C", FILE_C); ManifestFile manifest3 = writeManifestWithName("FILE_D", FILE_D); - table.newAppend() + table + .newAppend() .appendManifest(manifest) .appendManifest(manifest2) .appendManifest(manifest3) @@ -288,22 +306,28 @@ public void testManifestMergeMinCount() throws IOException { base = readMetadata(); V2Assert.assertEquals("Snapshot sequence number should be 1", 1, snap1.sequenceNumber()); V2Assert.assertEquals("Last sequence number should be 1", 1, base.lastSequenceNumber()); - V1Assert.assertEquals("Table should end with last-sequence-number 0", 0, base.lastSequenceNumber()); - - Assert.assertEquals("Should contain 2 merged manifest for first write", - 2, readMetadata().currentSnapshot().allManifests(table.io()).size()); - validateManifest(snap1.allManifests(table.io()).get(0), + V1Assert.assertEquals( + "Table should end with last-sequence-number 0", 0, base.lastSequenceNumber()); + + Assert.assertEquals( + "Should contain 2 merged manifest for first write", + 2, + readMetadata().currentSnapshot().allManifests(table.io()).size()); + validateManifest( + snap1.allManifests(table.io()).get(0), seqs(1), ids(commitId1), files(FILE_A), statuses(Status.ADDED)); - validateManifest(snap1.allManifests(table.io()).get(1), + validateManifest( + snap1.allManifests(table.io()).get(1), seqs(1, 1), ids(commitId1, commitId1), files(FILE_C, FILE_D), statuses(Status.ADDED, Status.ADDED)); - table.newAppend() + table + .newAppend() .appendManifest(manifest) .appendManifest(manifest2) .appendManifest(manifest3) @@ -313,29 +337,37 @@ public void testManifestMergeMinCount() throws IOException { base = readMetadata(); V2Assert.assertEquals("Snapshot sequence number should be 2", 2, snap2.sequenceNumber()); V2Assert.assertEquals("Last sequence number should be 2", 2, 
base.lastSequenceNumber()); - V1Assert.assertEquals("Table should end with last-sequence-number 0", 0, base.lastSequenceNumber()); - - Assert.assertEquals("Should contain 3 merged manifest for second write", - 3, readMetadata().currentSnapshot().allManifests(table.io()).size()); - validateManifest(snap2.allManifests(table.io()).get(0), + V1Assert.assertEquals( + "Table should end with last-sequence-number 0", 0, base.lastSequenceNumber()); + + Assert.assertEquals( + "Should contain 3 merged manifest for second write", + 3, + readMetadata().currentSnapshot().allManifests(table.io()).size()); + validateManifest( + snap2.allManifests(table.io()).get(0), seqs(2), ids(commitId2), files(FILE_A), statuses(Status.ADDED)); - validateManifest(snap2.allManifests(table.io()).get(1), + validateManifest( + snap2.allManifests(table.io()).get(1), seqs(2, 2), ids(commitId2, commitId2), files(FILE_C, FILE_D), statuses(Status.ADDED, Status.ADDED)); - validateManifest(snap2.allManifests(table.io()).get(2), + validateManifest( + snap2.allManifests(table.io()).get(2), seqs(1, 1, 1), ids(commitId1, commitId1, commitId1), files(FILE_A, FILE_C, FILE_D), statuses(Status.EXISTING, Status.EXISTING, Status.EXISTING)); // validate that the metadata summary is correct when using appendManifest - Assert.assertEquals("Summary metadata should include 3 added files", - "3", readMetadata().currentSnapshot().summary().get("added-data-files")); + Assert.assertEquals( + "Summary metadata should include 3 added files", + "3", + readMetadata().currentSnapshot().summary().get("added-data-files")); } @Test @@ -346,11 +378,13 @@ public void testManifestsMergeIntoOne() throws IOException { TableMetadata base = readMetadata(); V2Assert.assertEquals("Snapshot sequence number should be 1", 1, snap1.sequenceNumber()); V2Assert.assertEquals("Last sequence number should be 1", 1, base.lastSequenceNumber()); - V1Assert.assertEquals("Table should end with last-sequence-number 0", 0, base.lastSequenceNumber()); + V1Assert.assertEquals( + "Table should end with last-sequence-number 0", 0, base.lastSequenceNumber()); long commitId1 = snap1.snapshotId(); Assert.assertEquals("Should contain 1 manifest", 1, snap1.allManifests(table.io()).size()); - validateManifest(snap1.allManifests(table.io()).get(0), + validateManifest( + snap1.allManifests(table.io()).get(0), seqs(1), ids(commitId1), files(FILE_A), @@ -362,67 +396,77 @@ public void testManifestsMergeIntoOne() throws IOException { base = readMetadata(); V2Assert.assertEquals("Snapshot sequence number should be 2", 2, snap2.sequenceNumber()); V2Assert.assertEquals("Last sequence number should be 2", 2, base.lastSequenceNumber()); - V1Assert.assertEquals("Table should end with last-sequence-number 0", 0, base.lastSequenceNumber()); + V1Assert.assertEquals( + "Table should end with last-sequence-number 0", 0, base.lastSequenceNumber()); Assert.assertEquals("Should contain 2 manifests", 2, snap2.allManifests(table.io()).size()); - validateManifest(snap2.allManifests(table.io()).get(0), + validateManifest( + snap2.allManifests(table.io()).get(0), seqs(2), ids(commitId2), files(FILE_B), statuses(Status.ADDED)); - validateManifest(snap2.allManifests(table.io()).get(1), + validateManifest( + snap2.allManifests(table.io()).get(1), seqs(1), ids(commitId1), files(FILE_A), statuses(Status.ADDED)); - table.newAppend() - .appendManifest(writeManifest("input-m0.avro", - manifestEntry(ManifestEntry.Status.ADDED, null, FILE_C))) + table + .newAppend() + .appendManifest( + writeManifest("input-m0.avro", 
manifestEntry(ManifestEntry.Status.ADDED, null, FILE_C))) .commit(); Snapshot snap3 = table.currentSnapshot(); base = readMetadata(); V2Assert.assertEquals("Snapshot sequence number should be 3", 3, snap3.sequenceNumber()); V2Assert.assertEquals("Last sequence number should be 3", 3, base.lastSequenceNumber()); - V1Assert.assertEquals("Table should end with last-sequence-number 0", 0, base.lastSequenceNumber()); + V1Assert.assertEquals( + "Table should end with last-sequence-number 0", 0, base.lastSequenceNumber()); Assert.assertEquals("Should contain 3 manifests", 3, snap3.allManifests(table.io()).size()); long commitId3 = snap3.snapshotId(); - validateManifest(snap3.allManifests(table.io()).get(0), + validateManifest( + snap3.allManifests(table.io()).get(0), seqs(3), ids(commitId3), files(FILE_C), statuses(Status.ADDED)); - validateManifest(snap3.allManifests(table.io()).get(1), + validateManifest( + snap3.allManifests(table.io()).get(1), seqs(2), ids(commitId2), files(FILE_B), statuses(Status.ADDED)); - validateManifest(snap3.allManifests(table.io()).get(2), + validateManifest( + snap3.allManifests(table.io()).get(2), seqs(1), ids(commitId1), files(FILE_A), statuses(Status.ADDED)); - table.updateProperties() - .set(TableProperties.MANIFEST_MIN_MERGE_COUNT, "1") - .commit(); + table.updateProperties().set(TableProperties.MANIFEST_MIN_MERGE_COUNT, "1").commit(); - table.newAppend() - .appendManifest(writeManifest("input-m1.avro", - manifestEntry(ManifestEntry.Status.ADDED, null, FILE_D))) + table + .newAppend() + .appendManifest( + writeManifest("input-m1.avro", manifestEntry(ManifestEntry.Status.ADDED, null, FILE_D))) .commit(); Snapshot snap4 = table.currentSnapshot(); base = readMetadata(); V2Assert.assertEquals("Snapshot sequence number should be 4", 4, snap4.sequenceNumber()); V2Assert.assertEquals("Last sequence number should be 4", 4, base.lastSequenceNumber()); - V1Assert.assertEquals("Table should end with last-sequence-number 0", 0, base.lastSequenceNumber()); + V1Assert.assertEquals( + "Table should end with last-sequence-number 0", 0, base.lastSequenceNumber()); long commitId4 = snap4.snapshotId(); - Assert.assertEquals("Should only contains 1 merged manifest", 1, snap4.allManifests(table.io()).size()); - validateManifest(snap4.allManifests(table.io()).get(0), + Assert.assertEquals( + "Should only contains 1 merged manifest", 1, snap4.allManifests(table.io()).size()); + validateManifest( + snap4.allManifests(table.io()).get(0), seqs(4, 3, 2, 1), ids(commitId4, commitId3, commitId2, commitId1), files(FILE_D, FILE_C, FILE_B, FILE_A), @@ -441,45 +485,52 @@ public void testManifestDoNotMergeMinCount() throws IOException { ManifestFile manifest = writeManifest(FILE_A, FILE_B); ManifestFile manifest2 = writeManifestWithName("FILE_C", FILE_C); ManifestFile manifest3 = writeManifestWithName("FILE_D", FILE_D); - table.newAppend() + table + .newAppend() .appendManifest(manifest) .appendManifest(manifest2) .appendManifest(manifest3) .commit(); Assert.assertNotNull("Should create a snapshot", table.currentSnapshot()); - V1Assert.assertEquals("Last sequence number should be 0", 0, table.ops().current().lastSequenceNumber()); - V2Assert.assertEquals("Last sequence number should be 1", 1, table.ops().current().lastSequenceNumber()); + V1Assert.assertEquals( + "Last sequence number should be 0", 0, table.ops().current().lastSequenceNumber()); + V2Assert.assertEquals( + "Last sequence number should be 1", 1, table.ops().current().lastSequenceNumber()); Snapshot committed = 
table.currentSnapshot(); - Assert.assertEquals("Should contain 3 merged manifest after 1st write write", - 3, committed.allManifests(table.io()).size()); + Assert.assertEquals( + "Should contain 3 merged manifest after 1st write write", + 3, + committed.allManifests(table.io()).size()); long snapshotId = table.currentSnapshot().snapshotId(); - validateManifest(committed.allManifests(table.io()).get(0), + validateManifest( + committed.allManifests(table.io()).get(0), seqs(1, 1), ids(snapshotId, snapshotId), files(FILE_A, FILE_B), - statuses(Status.ADDED, Status.ADDED) - ); - validateManifest(committed.allManifests(table.io()).get(1), + statuses(Status.ADDED, Status.ADDED)); + validateManifest( + committed.allManifests(table.io()).get(1), seqs(1), ids(snapshotId), files(FILE_C), - statuses(Status.ADDED) - ); - validateManifest(committed.allManifests(table.io()).get(2), + statuses(Status.ADDED)); + validateManifest( + committed.allManifests(table.io()).get(2), seqs(1), ids(snapshotId), files(FILE_D), - statuses(Status.ADDED) - ); + statuses(Status.ADDED)); // validate that the metadata summary is correct when using appendManifest - Assert.assertEquals("Summary metadata should include 4 added files", - "4", committed.summary().get("added-data-files")); + Assert.assertEquals( + "Summary metadata should include 4 added files", + "4", + committed.summary().get("added-data-files")); } @Test @@ -490,66 +541,73 @@ public void testMergeWithExistingManifestAfterDelete() { Assert.assertEquals("Table should start empty", 0, listManifestFiles().size()); Assert.assertEquals("Last sequence number should be 0", 0, readMetadata().lastSequenceNumber()); - table.newAppend() - .appendFile(FILE_A) - .appendFile(FILE_B) - .commit(); + table.newAppend().appendFile(FILE_A).appendFile(FILE_B).commit(); Snapshot snap = table.currentSnapshot(); validateSnapshot(null, snap, 1, FILE_A, FILE_B); TableMetadata base = readMetadata(); long baseId = base.currentSnapshot().snapshotId(); - Assert.assertEquals("Should create 1 manifest for initial write", - 1, base.currentSnapshot().allManifests(table.io()).size()); + Assert.assertEquals( + "Should create 1 manifest for initial write", + 1, + base.currentSnapshot().allManifests(table.io()).size()); ManifestFile initialManifest = base.currentSnapshot().allManifests(table.io()).get(0); - validateManifest(initialManifest, + validateManifest( + initialManifest, seqs(1, 1), ids(baseId, baseId), files(FILE_A, FILE_B), statuses(Status.ADDED, Status.ADDED)); - table.newDelete() - .deleteFile(FILE_A) - .commit(); + table.newDelete().deleteFile(FILE_A).commit(); Snapshot deleteSnapshot = table.currentSnapshot(); - V2Assert.assertEquals("Snapshot sequence number should be 2", 2, deleteSnapshot.sequenceNumber()); - V2Assert.assertEquals("Last sequence number should be 2", 2, readMetadata().lastSequenceNumber()); - V1Assert.assertEquals("Table should end with last-sequence-number 0", 0, readMetadata().lastSequenceNumber()); + V2Assert.assertEquals( + "Snapshot sequence number should be 2", 2, deleteSnapshot.sequenceNumber()); + V2Assert.assertEquals( + "Last sequence number should be 2", 2, readMetadata().lastSequenceNumber()); + V1Assert.assertEquals( + "Table should end with last-sequence-number 0", 0, readMetadata().lastSequenceNumber()); TableMetadata delete = readMetadata(); long deleteId = delete.currentSnapshot().snapshotId(); - Assert.assertEquals("Should create 1 filtered manifest for delete", - 1, delete.currentSnapshot().allManifests(table.io()).size()); + Assert.assertEquals( + 
"Should create 1 filtered manifest for delete", + 1, + delete.currentSnapshot().allManifests(table.io()).size()); ManifestFile deleteManifest = delete.currentSnapshot().allManifests(table.io()).get(0); - validateManifest(deleteManifest, + validateManifest( + deleteManifest, seqs(2, 1), ids(deleteId, baseId), files(FILE_A, FILE_B), statuses(Status.DELETED, Status.EXISTING)); - table.newAppend() - .appendFile(FILE_C) - .appendFile(FILE_D) - .commit(); + table.newAppend().appendFile(FILE_C).appendFile(FILE_D).commit(); Snapshot committedSnapshot = table.currentSnapshot(); - V2Assert.assertEquals("Snapshot sequence number should be 3", 3, committedSnapshot.sequenceNumber()); - V2Assert.assertEquals("Last sequence number should be 3", 3, readMetadata().lastSequenceNumber()); - V1Assert.assertEquals("Table should end with last-sequence-number 0", 0, readMetadata().lastSequenceNumber()); - - Assert.assertEquals("Should contain 1 merged manifest for second write", - 1, committedSnapshot.allManifests(table.io()).size()); + V2Assert.assertEquals( + "Snapshot sequence number should be 3", 3, committedSnapshot.sequenceNumber()); + V2Assert.assertEquals( + "Last sequence number should be 3", 3, readMetadata().lastSequenceNumber()); + V1Assert.assertEquals( + "Table should end with last-sequence-number 0", 0, readMetadata().lastSequenceNumber()); + + Assert.assertEquals( + "Should contain 1 merged manifest for second write", + 1, + committedSnapshot.allManifests(table.io()).size()); ManifestFile newManifest = committedSnapshot.allManifests(table.io()).get(0); - Assert.assertNotEquals("Should not contain manifest from initial write", - initialManifest, newManifest); + Assert.assertNotEquals( + "Should not contain manifest from initial write", initialManifest, newManifest); long snapshotId = committedSnapshot.snapshotId(); // the deleted entry from the previous manifest should be removed - validateManifestEntries(newManifest, + validateManifestEntries( + newManifest, ids(snapshotId, snapshotId, baseId), files(FILE_C, FILE_D, FILE_B), statuses(Status.ADDED, Status.ADDED, Status.EXISTING)); @@ -563,253 +621,253 @@ public void testMinMergeCount() { Assert.assertEquals("Last sequence number should be 0", 0, readMetadata().lastSequenceNumber()); Assert.assertEquals("Table should start empty", 0, listManifestFiles().size()); - table.newFastAppend() - .appendFile(FILE_A) - .commit(); + table.newFastAppend().appendFile(FILE_A).commit(); Snapshot snap1 = table.currentSnapshot(); long idFileA = snap1.snapshotId(); validateSnapshot(null, snap1, 1, FILE_A); - table.newFastAppend() - .appendFile(FILE_B) - .commit(); + table.newFastAppend().appendFile(FILE_B).commit(); Snapshot snap2 = table.currentSnapshot(); long idFileB = snap2.snapshotId(); validateSnapshot(snap1, snap2, 2, FILE_B); - Assert.assertEquals("Should have 2 manifests from setup writes", - 2, readMetadata().currentSnapshot().allManifests(table.io()).size()); + Assert.assertEquals( + "Should have 2 manifests from setup writes", + 2, + readMetadata().currentSnapshot().allManifests(table.io()).size()); - table.newAppend() - .appendFile(FILE_C) - .commit(); + table.newAppend().appendFile(FILE_C).commit(); Snapshot snap3 = table.currentSnapshot(); long idFileC = snap3.snapshotId(); validateSnapshot(snap2, snap3, 3, FILE_C); TableMetadata base = readMetadata(); - Assert.assertEquals("Should have 3 unmerged manifests", - 3, base.currentSnapshot().allManifests(table.io()).size()); + Assert.assertEquals( + "Should have 3 unmerged manifests", + 3, + 
base.currentSnapshot().allManifests(table.io()).size()); Set unmerged = Sets.newHashSet(base.currentSnapshot().allManifests(table.io())); - table.newAppend() - .appendFile(FILE_D) - .commit(); + table.newAppend().appendFile(FILE_D).commit(); Snapshot committed = table.currentSnapshot(); V2Assert.assertEquals("Snapshot sequence number should be 4", 4, committed.sequenceNumber()); - V2Assert.assertEquals("Last sequence number should be 4", 4, readMetadata().lastSequenceNumber()); - V1Assert.assertEquals("Table should end with last-sequence-number 0", 0, readMetadata().lastSequenceNumber()); - - Assert.assertEquals("Should contain 1 merged manifest after the 4th write", - 1, committed.allManifests(table.io()).size()); + V2Assert.assertEquals( + "Last sequence number should be 4", 4, readMetadata().lastSequenceNumber()); + V1Assert.assertEquals( + "Table should end with last-sequence-number 0", 0, readMetadata().lastSequenceNumber()); + + Assert.assertEquals( + "Should contain 1 merged manifest after the 4th write", + 1, + committed.allManifests(table.io()).size()); ManifestFile newManifest = committed.allManifests(table.io()).get(0); Assert.assertFalse("Should not contain previous manifests", unmerged.contains(newManifest)); long lastSnapshotId = committed.snapshotId(); - validateManifest(newManifest, + validateManifest( + newManifest, seqs(4, 3, 2, 1), ids(lastSnapshotId, idFileC, idFileB, idFileA), files(FILE_D, FILE_C, FILE_B, FILE_A), - statuses(Status.ADDED, Status.EXISTING, Status.EXISTING, Status.EXISTING) - ); + statuses(Status.ADDED, Status.EXISTING, Status.EXISTING, Status.EXISTING)); } @Test public void testMergeSizeTargetWithExistingManifest() { // use a small limit on manifest size to prevent merging - table.updateProperties() - .set(TableProperties.MANIFEST_TARGET_SIZE_BYTES, "10") - .commit(); + table.updateProperties().set(TableProperties.MANIFEST_TARGET_SIZE_BYTES, "10").commit(); Assert.assertEquals("Last sequence number should be 0", 0, readMetadata().lastSequenceNumber()); Assert.assertEquals("Table should start empty", 0, listManifestFiles().size()); - table.newAppend() - .appendFile(FILE_A) - .appendFile(FILE_B) - .commit(); + table.newAppend().appendFile(FILE_A).appendFile(FILE_B).commit(); Snapshot snap = table.currentSnapshot(); validateSnapshot(null, snap, 1, FILE_A, FILE_B); TableMetadata base = readMetadata(); long baseId = base.currentSnapshot().snapshotId(); - Assert.assertEquals("Should create 1 manifest for initial write", - 1, base.currentSnapshot().allManifests(table.io()).size()); + Assert.assertEquals( + "Should create 1 manifest for initial write", + 1, + base.currentSnapshot().allManifests(table.io()).size()); ManifestFile initialManifest = base.currentSnapshot().allManifests(table.io()).get(0); - validateManifest(initialManifest, + validateManifest( + initialManifest, seqs(1, 1), ids(baseId, baseId), files(FILE_A, FILE_B), statuses(Status.ADDED, Status.ADDED)); - table.newAppend() - .appendFile(FILE_C) - .appendFile(FILE_D) - .commit(); + table.newAppend().appendFile(FILE_C).appendFile(FILE_D).commit(); Snapshot committed = table.currentSnapshot(); V2Assert.assertEquals("Snapshot sequence number should be 2", 2, committed.sequenceNumber()); - V2Assert.assertEquals("Last sequence number should be 2", 2, readMetadata().lastSequenceNumber()); - V1Assert.assertEquals("Table should end with last-sequence-number 0", 0, readMetadata().lastSequenceNumber()); - - Assert.assertEquals("Should contain 2 unmerged manifests after second write", - 2, 
committed.allManifests(table.io()).size()); + V2Assert.assertEquals( + "Last sequence number should be 2", 2, readMetadata().lastSequenceNumber()); + V1Assert.assertEquals( + "Table should end with last-sequence-number 0", 0, readMetadata().lastSequenceNumber()); + + Assert.assertEquals( + "Should contain 2 unmerged manifests after second write", + 2, + committed.allManifests(table.io()).size()); ManifestFile newManifest = committed.allManifests(table.io()).get(0); - Assert.assertNotEquals("Should not contain manifest from initial write", - initialManifest, newManifest); + Assert.assertNotEquals( + "Should not contain manifest from initial write", initialManifest, newManifest); long pendingId = committed.snapshotId(); - validateManifest(newManifest, + validateManifest( + newManifest, seqs(2, 2), ids(pendingId, pendingId), files(FILE_C, FILE_D), - statuses(Status.ADDED, Status.ADDED) - ); + statuses(Status.ADDED, Status.ADDED)); - validateManifest(committed.allManifests(table.io()).get(1), + validateManifest( + committed.allManifests(table.io()).get(1), seqs(1, 1), ids(baseId, baseId), files(initialManifest), - statuses(Status.ADDED, Status.ADDED) - ); + statuses(Status.ADDED, Status.ADDED)); } @Test public void testChangedPartitionSpec() { - table.newAppend() - .appendFile(FILE_A) - .appendFile(FILE_B) - .commit(); + table.newAppend().appendFile(FILE_A).appendFile(FILE_B).commit(); Snapshot snap = table.currentSnapshot(); long commitId = snap.snapshotId(); validateSnapshot(null, snap, 1, FILE_A, FILE_B); TableMetadata base = readMetadata(); - Assert.assertEquals("Should create 1 manifest for initial write", - 1, base.currentSnapshot().allManifests(table.io()).size()); + Assert.assertEquals( + "Should create 1 manifest for initial write", + 1, + base.currentSnapshot().allManifests(table.io()).size()); ManifestFile initialManifest = base.currentSnapshot().allManifests(table.io()).get(0); - validateManifest(initialManifest, + validateManifest( + initialManifest, seqs(1, 1), ids(commitId, commitId), files(FILE_A, FILE_B), statuses(Status.ADDED, Status.ADDED)); // build the new spec using the table's schema, which uses fresh IDs - PartitionSpec newSpec = PartitionSpec.builderFor(base.schema()) - .bucket("data", 16) - .bucket("id", 4) - .build(); + PartitionSpec newSpec = + PartitionSpec.builderFor(base.schema()).bucket("data", 16).bucket("id", 4).build(); // commit the new partition spec to the table manually table.ops().commit(base, base.updatePartitionSpec(newSpec)); Snapshot snap2 = table.currentSnapshot(); V2Assert.assertEquals("Snapshot sequence number should be 1", 1, snap2.sequenceNumber()); - V2Assert.assertEquals("Last sequence number should be 1", 1, readMetadata().lastSequenceNumber()); - V1Assert.assertEquals("Table should end with last-sequence-number 0", 0, readMetadata().lastSequenceNumber()); - - DataFile newFileY = DataFiles.builder(newSpec) - .withPath("/path/to/data-y.parquet") - .withFileSizeInBytes(10) - .withPartitionPath("data_bucket=2/id_bucket=3") - .withRecordCount(1) - .build(); - - table.newAppend() - .appendFile(newFileY) - .commit(); + V2Assert.assertEquals( + "Last sequence number should be 1", 1, readMetadata().lastSequenceNumber()); + V1Assert.assertEquals( + "Table should end with last-sequence-number 0", 0, readMetadata().lastSequenceNumber()); + + DataFile newFileY = + DataFiles.builder(newSpec) + .withPath("/path/to/data-y.parquet") + .withFileSizeInBytes(10) + .withPartitionPath("data_bucket=2/id_bucket=3") + .withRecordCount(1) + .build(); + + 
table.newAppend().appendFile(newFileY).commit(); Snapshot lastSnapshot = table.currentSnapshot(); V2Assert.assertEquals("Snapshot sequence number should be 2", 2, lastSnapshot.sequenceNumber()); - V2Assert.assertEquals("Last sequence number should be 2", 2, readMetadata().lastSequenceNumber()); - V1Assert.assertEquals("Table should end with last-sequence-number 0", 0, readMetadata().lastSequenceNumber()); + V2Assert.assertEquals( + "Last sequence number should be 2", 2, readMetadata().lastSequenceNumber()); + V1Assert.assertEquals( + "Table should end with last-sequence-number 0", 0, readMetadata().lastSequenceNumber()); - Assert.assertEquals("Should use 2 manifest files", - 2, lastSnapshot.allManifests(table.io()).size()); + Assert.assertEquals( + "Should use 2 manifest files", 2, lastSnapshot.allManifests(table.io()).size()); // new manifest comes first - validateManifest(lastSnapshot.allManifests(table.io()).get(0), + validateManifest( + lastSnapshot.allManifests(table.io()).get(0), seqs(2), ids(lastSnapshot.snapshotId()), files(newFileY), - statuses(Status.ADDED) - ); + statuses(Status.ADDED)); - Assert.assertEquals("Second manifest should be the initial manifest with the old spec", - initialManifest, lastSnapshot.allManifests(table.io()).get(1)); + Assert.assertEquals( + "Second manifest should be the initial manifest with the old spec", + initialManifest, + lastSnapshot.allManifests(table.io()).get(1)); } @Test public void testChangedPartitionSpecMergeExisting() { - table.newAppend() - .appendFile(FILE_A) - .commit(); + table.newAppend().appendFile(FILE_A).commit(); Snapshot snap1 = table.currentSnapshot(); long id1 = snap1.snapshotId(); validateSnapshot(null, snap1, 1, FILE_A); // create a second compatible manifest - table.newFastAppend() - .appendFile(FILE_B) - .commit(); + table.newFastAppend().appendFile(FILE_B).commit(); Snapshot snap2 = table.currentSnapshot(); long id2 = snap2.snapshotId(); validateSnapshot(snap1, snap2, 2, FILE_B); TableMetadata base = readMetadata(); - Assert.assertEquals("Should contain 2 manifests", - 2, base.currentSnapshot().allManifests(table.io()).size()); + Assert.assertEquals( + "Should contain 2 manifests", 2, base.currentSnapshot().allManifests(table.io()).size()); ManifestFile manifest = base.currentSnapshot().allManifests(table.io()).get(0); // build the new spec using the table's schema, which uses fresh IDs - PartitionSpec newSpec = PartitionSpec.builderFor(base.schema()) - .bucket("data", 16) - .bucket("id", 4) - .build(); + PartitionSpec newSpec = + PartitionSpec.builderFor(base.schema()).bucket("data", 16).bucket("id", 4).build(); // commit the new partition spec to the table manually table.ops().commit(base, base.updatePartitionSpec(newSpec)); Snapshot snap3 = table.currentSnapshot(); V2Assert.assertEquals("Snapshot sequence number should be 2", 2, snap3.sequenceNumber()); - V2Assert.assertEquals("Last sequence number should be 2", 2, readMetadata().lastSequenceNumber()); - V1Assert.assertEquals("Table should end with last-sequence-number 0", 0, readMetadata().lastSequenceNumber()); - - DataFile newFileY = DataFiles.builder(table.spec()) - .withPath("/path/to/data-y.parquet") - .withFileSizeInBytes(10) - .withPartitionPath("data_bucket=2/id_bucket=3") - .withRecordCount(1) - .build(); - - table.newAppend() - .appendFile(newFileY) - .commit(); + V2Assert.assertEquals( + "Last sequence number should be 2", 2, readMetadata().lastSequenceNumber()); + V1Assert.assertEquals( + "Table should end with last-sequence-number 0", 0, 
readMetadata().lastSequenceNumber()); + + DataFile newFileY = + DataFiles.builder(table.spec()) + .withPath("/path/to/data-y.parquet") + .withFileSizeInBytes(10) + .withPartitionPath("data_bucket=2/id_bucket=3") + .withRecordCount(1) + .build(); + + table.newAppend().appendFile(newFileY).commit(); Snapshot lastSnapshot = table.currentSnapshot(); V2Assert.assertEquals("Snapshot sequence number should be 3", 3, lastSnapshot.sequenceNumber()); - V2Assert.assertEquals("Last sequence number should be 3", 3, readMetadata().lastSequenceNumber()); - V1Assert.assertEquals("Table should end with last-sequence-number 0", 0, readMetadata().lastSequenceNumber()); - - Assert.assertEquals("Should use 2 manifest files", - 2, lastSnapshot.allManifests(table.io()).size()); - Assert.assertFalse("First manifest should not be in the new snapshot", + V2Assert.assertEquals( + "Last sequence number should be 3", 3, readMetadata().lastSequenceNumber()); + V1Assert.assertEquals( + "Table should end with last-sequence-number 0", 0, readMetadata().lastSequenceNumber()); + + Assert.assertEquals( + "Should use 2 manifest files", 2, lastSnapshot.allManifests(table.io()).size()); + Assert.assertFalse( + "First manifest should not be in the new snapshot", lastSnapshot.allManifests(table.io()).contains(manifest)); - validateManifest(lastSnapshot.allManifests(table.io()).get(0), + validateManifest( + lastSnapshot.allManifests(table.io()).get(0), seqs(3), ids(lastSnapshot.snapshotId()), files(newFileY), - statuses(Status.ADDED) - ); - validateManifest(lastSnapshot.allManifests(table.io()).get(1), + statuses(Status.ADDED)); + validateManifest( + lastSnapshot.allManifests(table.io()).get(1), seqs(2, 1), ids(id2, id1), files(FILE_B, FILE_A), - statuses(Status.EXISTING, Status.EXISTING) - ); + statuses(Status.EXISTING, Status.EXISTING)); } @Test @@ -818,14 +876,13 @@ public void testFailure() { table.updateProperties().set("commit.manifest.min-count-to-merge", "1").commit(); Assert.assertEquals("Last sequence number should be 0", 0, readMetadata().lastSequenceNumber()); - table.newAppend() - .appendFile(FILE_A) - .commit(); + table.newAppend().appendFile(FILE_A).commit(); TableMetadata base = readMetadata(); long baseId = base.currentSnapshot().snapshotId(); V2Assert.assertEquals("Last sequence number should be 1", 1, base.lastSequenceNumber()); - V1Assert.assertEquals("Table should end with last-sequence-number 0", 0, base.lastSequenceNumber()); + V1Assert.assertEquals( + "Table should end with last-sequence-number 0", 0, base.lastSequenceNumber()); ManifestFile initialManifest = base.currentSnapshot().allManifests(table.io()).get(0); validateManifest(initialManifest, seqs(1), ids(baseId), files(FILE_A), statuses(Status.ADDED)); @@ -839,24 +896,32 @@ public void testFailure() { ManifestFile newManifest = pending.allManifests(table.io()).get(0); Assert.assertTrue("Should create new manifest", new File(newManifest.path()).exists()); - validateManifest(newManifest, + validateManifest( + newManifest, ids(pending.snapshotId(), baseId), concat(files(FILE_B), files(initialManifest))); - AssertHelpers.assertThrows("Should retry 4 times and throw last failure", - CommitFailedException.class, "Injected failure", append::commit); - - V2Assert.assertEquals("Last sequence number should be 1", 1, readMetadata().lastSequenceNumber()); - V1Assert.assertEquals("Table should end with last-sequence-number 0", 0, readMetadata().lastSequenceNumber()); - Assert.assertEquals("Should only contain 1 manifest file", - 1, 
table.currentSnapshot().allManifests(table.io()).size()); + AssertHelpers.assertThrows( + "Should retry 4 times and throw last failure", + CommitFailedException.class, + "Injected failure", + append::commit); - validateManifest(table.currentSnapshot().allManifests(table.io()).get(0), + V2Assert.assertEquals( + "Last sequence number should be 1", 1, readMetadata().lastSequenceNumber()); + V1Assert.assertEquals( + "Table should end with last-sequence-number 0", 0, readMetadata().lastSequenceNumber()); + Assert.assertEquals( + "Should only contain 1 manifest file", + 1, + table.currentSnapshot().allManifests(table.io()).size()); + + validateManifest( + table.currentSnapshot().allManifests(table.io()).get(0), seqs(1), ids(baseId), files(initialManifest), - statuses(Status.ADDED) - ); + statuses(Status.ADDED)); Assert.assertFalse("Should clean up new manifest", new File(newManifest.path()).exists()); } @@ -873,10 +938,15 @@ public void testAppendManifestCleanup() throws IOException { ManifestFile newManifest = pending.allManifests(table.io()).get(0); Assert.assertTrue("Should create new manifest", new File(newManifest.path()).exists()); - AssertHelpers.assertThrows("Should retry 4 times and throw last failure", - CommitFailedException.class, "Injected failure", append::commit); - V2Assert.assertEquals("Last sequence number should be 0", 0, readMetadata().lastSequenceNumber()); - V1Assert.assertEquals("Table should end with last-sequence-number 0", 0, readMetadata().lastSequenceNumber()); + AssertHelpers.assertThrows( + "Should retry 4 times and throw last failure", + CommitFailedException.class, + "Injected failure", + append::commit); + V2Assert.assertEquals( + "Last sequence number should be 0", 0, readMetadata().lastSequenceNumber()); + V1Assert.assertEquals( + "Table should end with last-sequence-number 0", 0, readMetadata().lastSequenceNumber()); Assert.assertFalse("Should clean up new manifest", new File(newManifest.path()).exists()); } @@ -888,14 +958,14 @@ public void testRecovery() { Assert.assertEquals("Last sequence number should be 0", 0, readMetadata().lastSequenceNumber()); - table.newAppend() - .appendFile(FILE_A) - .commit(); + table.newAppend().appendFile(FILE_A).commit(); TableMetadata base = readMetadata(); long baseId = base.currentSnapshot().snapshotId(); - V2Assert.assertEquals("Last sequence number should be 1", 1, readMetadata().lastSequenceNumber()); - V1Assert.assertEquals("Table should end with last-sequence-number 0", 0, readMetadata().lastSequenceNumber()); + V2Assert.assertEquals( + "Last sequence number should be 1", 1, readMetadata().lastSequenceNumber()); + V1Assert.assertEquals( + "Table should end with last-sequence-number 0", 0, readMetadata().lastSequenceNumber()); ManifestFile initialManifest = base.currentSnapshot().allManifests(table.io()).get(0); validateManifest(initialManifest, seqs(1), ids(baseId), files(FILE_A), statuses(Status.ADDED)); @@ -908,30 +978,42 @@ public void testRecovery() { ManifestFile newManifest = pending.allManifests(table.io()).get(0); Assert.assertTrue("Should create new manifest", new File(newManifest.path()).exists()); - validateManifest(newManifest, + validateManifest( + newManifest, ids(pending.snapshotId(), baseId), concat(files(FILE_B), files(initialManifest))); - V2Assert.assertEquals("Snapshot sequence number should be 1", 1, table.currentSnapshot().sequenceNumber()); - V2Assert.assertEquals("Last sequence number should be 1", 1, readMetadata().lastSequenceNumber()); - V1Assert.assertEquals("Table should end with 
last-sequence-number 0", 0, readMetadata().lastSequenceNumber()); + V2Assert.assertEquals( + "Snapshot sequence number should be 1", 1, table.currentSnapshot().sequenceNumber()); + V2Assert.assertEquals( + "Last sequence number should be 1", 1, readMetadata().lastSequenceNumber()); + V1Assert.assertEquals( + "Table should end with last-sequence-number 0", 0, readMetadata().lastSequenceNumber()); append.commit(); Snapshot snapshot = table.currentSnapshot(); long snapshotId = snapshot.snapshotId(); - V2Assert.assertEquals("Snapshot sequence number should be 2", 2, table.currentSnapshot().sequenceNumber()); - V2Assert.assertEquals("Last sequence number should be 2", 2, readMetadata().lastSequenceNumber()); - V1Assert.assertEquals("Table should end with last-sequence-number 0", 0, readMetadata().lastSequenceNumber()); + V2Assert.assertEquals( + "Snapshot sequence number should be 2", 2, table.currentSnapshot().sequenceNumber()); + V2Assert.assertEquals( + "Last sequence number should be 2", 2, readMetadata().lastSequenceNumber()); + V1Assert.assertEquals( + "Table should end with last-sequence-number 0", 0, readMetadata().lastSequenceNumber()); TableMetadata metadata = readMetadata(); Assert.assertTrue("Should reuse the new manifest", new File(newManifest.path()).exists()); - Assert.assertEquals("Should commit the same new manifest during retry", - Lists.newArrayList(newManifest), metadata.currentSnapshot().allManifests(table.io())); - - Assert.assertEquals("Should only contain 1 merged manifest file", - 1, table.currentSnapshot().allManifests(table.io()).size()); + Assert.assertEquals( + "Should commit the same new manifest during retry", + Lists.newArrayList(newManifest), + metadata.currentSnapshot().allManifests(table.io())); + + Assert.assertEquals( + "Should only contain 1 merged manifest file", + 1, + table.currentSnapshot().allManifests(table.io()).size()); ManifestFile manifestFile = snapshot.allManifests(table.io()).get(0); - validateManifest(manifestFile, + validateManifest( + manifestFile, seqs(2, 1), ids(snapshotId, baseId), files(FILE_B, FILE_A), @@ -940,9 +1022,7 @@ public void testRecovery() { @Test public void testAppendManifestWithSnapshotIdInheritance() throws IOException { - table.updateProperties() - .set(TableProperties.SNAPSHOT_ID_INHERITANCE_ENABLED, "true") - .commit(); + table.updateProperties().set(TableProperties.SNAPSHOT_ID_INHERITANCE_ENABLED, "true").commit(); Assert.assertEquals("Last sequence number should be 0", 0, readMetadata().lastSequenceNumber()); Assert.assertEquals("Table should start empty", 0, listManifestFiles().size()); @@ -951,9 +1031,7 @@ public void testAppendManifestWithSnapshotIdInheritance() throws IOException { Assert.assertNull("Should not have a current snapshot", base.currentSnapshot()); ManifestFile manifest = writeManifest(FILE_A, FILE_B); - table.newAppend() - .appendManifest(manifest) - .commit(); + table.newAppend().appendManifest(manifest).commit(); Snapshot snapshot = table.currentSnapshot(); long snapshotId = snapshot.snapshotId(); @@ -962,28 +1040,35 @@ public void testAppendManifestWithSnapshotIdInheritance() throws IOException { List manifests = table.currentSnapshot().allManifests(table.io()); Assert.assertEquals("Should have 1 committed manifest", 1, manifests.size()); ManifestFile manifestFile = snapshot.allManifests(table.io()).get(0); - validateManifest(manifestFile, + validateManifest( + manifestFile, seqs(1, 1), ids(snapshotId, snapshotId), files(FILE_A, FILE_B), statuses(Status.ADDED, Status.ADDED)); // validate that the 
metadata summary is correct when using appendManifest - Assert.assertEquals("Summary metadata should include 2 added files", - "2", snapshot.summary().get("added-data-files")); - Assert.assertEquals("Summary metadata should include 2 added records", - "2", snapshot.summary().get("added-records")); - Assert.assertEquals("Summary metadata should include 2 files in total", - "2", snapshot.summary().get("total-data-files")); - Assert.assertEquals("Summary metadata should include 2 records in total", - "2", snapshot.summary().get("total-records")); + Assert.assertEquals( + "Summary metadata should include 2 added files", + "2", + snapshot.summary().get("added-data-files")); + Assert.assertEquals( + "Summary metadata should include 2 added records", + "2", + snapshot.summary().get("added-records")); + Assert.assertEquals( + "Summary metadata should include 2 files in total", + "2", + snapshot.summary().get("total-data-files")); + Assert.assertEquals( + "Summary metadata should include 2 records in total", + "2", + snapshot.summary().get("total-records")); } @Test public void testMergedAppendManifestCleanupWithSnapshotIdInheritance() throws IOException { - table.updateProperties() - .set(TableProperties.SNAPSHOT_ID_INHERITANCE_ENABLED, "true") - .commit(); + table.updateProperties().set(TableProperties.SNAPSHOT_ID_INHERITANCE_ENABLED, "true").commit(); Assert.assertEquals("Last sequence number should be 0", 0, readMetadata().lastSequenceNumber()); Assert.assertEquals("Table should start empty", 0, listManifestFiles().size()); @@ -991,53 +1076,53 @@ public void testMergedAppendManifestCleanupWithSnapshotIdInheritance() throws IO TableMetadata base = readMetadata(); Assert.assertNull("Should not have a current snapshot", base.currentSnapshot()); - table.updateProperties() - .set(TableProperties.MANIFEST_MIN_MERGE_COUNT, "1") - .commit(); + table.updateProperties().set(TableProperties.MANIFEST_MIN_MERGE_COUNT, "1").commit(); ManifestFile manifest1 = writeManifestWithName("manifest-file-1.avro", FILE_A, FILE_B); - table.newAppend() - .appendManifest(manifest1) - .commit(); + table.newAppend().appendManifest(manifest1).commit(); Snapshot snap1 = table.currentSnapshot(); long commitId1 = snap1.snapshotId(); validateSnapshot(null, snap1, 1, FILE_A, FILE_B); Assert.assertEquals("Should have only 1 manifest", 1, snap1.allManifests(table.io()).size()); - validateManifest(table.currentSnapshot().allManifests(table.io()).get(0), + validateManifest( + table.currentSnapshot().allManifests(table.io()).get(0), seqs(1, 1), ids(commitId1, commitId1), files(FILE_A, FILE_B), statuses(Status.ADDED, Status.ADDED)); - Assert.assertTrue("Unmerged append manifest should not be deleted", new File(manifest1.path()).exists()); + Assert.assertTrue( + "Unmerged append manifest should not be deleted", new File(manifest1.path()).exists()); ManifestFile manifest2 = writeManifestWithName("manifest-file-2.avro", FILE_C, FILE_D); - table.newAppend() - .appendManifest(manifest2) - .commit(); + table.newAppend().appendManifest(manifest2).commit(); Snapshot snap2 = table.currentSnapshot(); long commitId2 = snap2.snapshotId(); - V2Assert.assertEquals("Snapshot sequence number should be 2", 2, table.currentSnapshot().sequenceNumber()); - V2Assert.assertEquals("Last sequence number should be 2", 2, readMetadata().lastSequenceNumber()); - V1Assert.assertEquals("Table should end with last-sequence-number 0", 0, readMetadata().lastSequenceNumber()); - - Assert.assertEquals("Manifests should be merged into 1", 1, 
snap2.allManifests(table.io()).size()); - validateManifest(table.currentSnapshot().allManifests(table.io()).get(0), + V2Assert.assertEquals( + "Snapshot sequence number should be 2", 2, table.currentSnapshot().sequenceNumber()); + V2Assert.assertEquals( + "Last sequence number should be 2", 2, readMetadata().lastSequenceNumber()); + V1Assert.assertEquals( + "Table should end with last-sequence-number 0", 0, readMetadata().lastSequenceNumber()); + + Assert.assertEquals( + "Manifests should be merged into 1", 1, snap2.allManifests(table.io()).size()); + validateManifest( + table.currentSnapshot().allManifests(table.io()).get(0), seqs(2, 2, 1, 1), ids(commitId2, commitId2, commitId1, commitId1), files(FILE_C, FILE_D, FILE_A, FILE_B), statuses(Status.ADDED, Status.ADDED, Status.EXISTING, Status.EXISTING)); - Assert.assertFalse("Merged append manifest should be deleted", new File(manifest2.path()).exists()); + Assert.assertFalse( + "Merged append manifest should be deleted", new File(manifest2.path()).exists()); } @Test public void testAppendManifestFailureWithSnapshotIdInheritance() throws IOException { - table.updateProperties() - .set(TableProperties.SNAPSHOT_ID_INHERITANCE_ENABLED, "true") - .commit(); + table.updateProperties().set(TableProperties.SNAPSHOT_ID_INHERITANCE_ENABLED, "true").commit(); Assert.assertEquals("Last sequence number should be 0", 0, readMetadata().lastSequenceNumber()); Assert.assertEquals("Table should start empty", 0, listManifestFiles().size()); @@ -1045,9 +1130,7 @@ public void testAppendManifestFailureWithSnapshotIdInheritance() throws IOExcept TableMetadata base = readMetadata(); Assert.assertNull("Should not have a current snapshot", base.currentSnapshot()); - table.updateProperties() - .set(TableProperties.COMMIT_NUM_RETRIES, "1") - .commit(); + table.updateProperties().set(TableProperties.COMMIT_NUM_RETRIES, "1").commit(); table.ops().failCommits(5); @@ -1056,9 +1139,8 @@ public void testAppendManifestFailureWithSnapshotIdInheritance() throws IOExcept AppendFiles append = table.newAppend(); append.appendManifest(manifest); - AssertHelpers.assertThrows("Should reject commit", - CommitFailedException.class, "Injected failure", - append::commit); + AssertHelpers.assertThrows( + "Should reject commit", CommitFailedException.class, "Injected failure", append::commit); Assert.assertEquals("Last sequence number should be 0", 0, readMetadata().lastSequenceNumber()); Assert.assertTrue("Append manifest should not be deleted", new File(manifest.path()).exists()); @@ -1071,39 +1153,37 @@ public void testInvalidAppendManifest() throws IOException { TableMetadata base = readMetadata(); Assert.assertNull("Should not have a current snapshot", base.currentSnapshot()); - ManifestFile manifestWithExistingFiles = writeManifest( - "manifest-file-1.avro", - manifestEntry(Status.EXISTING, null, FILE_A)); - AssertHelpers.assertThrows("Should reject commit", - IllegalArgumentException.class, "Cannot append manifest with existing files", - () -> table.newAppend() - .appendManifest(manifestWithExistingFiles) - .commit()); + ManifestFile manifestWithExistingFiles = + writeManifest("manifest-file-1.avro", manifestEntry(Status.EXISTING, null, FILE_A)); + AssertHelpers.assertThrows( + "Should reject commit", + IllegalArgumentException.class, + "Cannot append manifest with existing files", + () -> table.newAppend().appendManifest(manifestWithExistingFiles).commit()); Assert.assertEquals("Last sequence number should be 0", 0, readMetadata().lastSequenceNumber()); - ManifestFile 
manifestWithDeletedFiles = writeManifest( - "manifest-file-2.avro", - manifestEntry(Status.DELETED, null, FILE_A)); - AssertHelpers.assertThrows("Should reject commit", - IllegalArgumentException.class, "Cannot append manifest with deleted files", - () -> table.newAppend() - .appendManifest(manifestWithDeletedFiles) - .commit()); + ManifestFile manifestWithDeletedFiles = + writeManifest("manifest-file-2.avro", manifestEntry(Status.DELETED, null, FILE_A)); + AssertHelpers.assertThrows( + "Should reject commit", + IllegalArgumentException.class, + "Cannot append manifest with deleted files", + () -> table.newAppend().appendManifest(manifestWithDeletedFiles).commit()); Assert.assertEquals("Last sequence number should be 0", 0, readMetadata().lastSequenceNumber()); } - @Test public void testUpdatePartitionSpecFieldIdsForV1Table() { TableMetadata base = readMetadata(); // build the new spec using the table's schema, which uses fresh IDs - PartitionSpec newSpec = PartitionSpec.builderFor(base.schema()) - .bucket("id", 16) - .identity("data") - .bucket("data", 4) - .bucket("data", 16, "data_partition") // reuse field id although different target name - .build(); + PartitionSpec newSpec = + PartitionSpec.builderFor(base.schema()) + .bucket("id", 16) + .identity("data") + .bucket("data", 4) + .bucket("data", 16, "data_partition") // reuse field id although different target name + .build(); // commit the new partition spec to the table manually table.ops().commit(base, base.updatePartitionSpec(newSpec)); @@ -1137,72 +1217,87 @@ public void testUpdatePartitionSpecFieldIdsForV1Table() { @Test public void testManifestEntryFieldIdsForChangedPartitionSpecForV1Table() { - table.newAppend() - .appendFile(FILE_A) - .commit(); + table.newAppend().appendFile(FILE_A).commit(); Snapshot snap = table.currentSnapshot(); long commitId = snap.snapshotId(); validateSnapshot(null, snap, 1, FILE_A); TableMetadata base = readMetadata(); - Assert.assertEquals("Should create 1 manifest for initial write", - 1, base.currentSnapshot().allManifests(table.io()).size()); + Assert.assertEquals( + "Should create 1 manifest for initial write", + 1, + base.currentSnapshot().allManifests(table.io()).size()); ManifestFile initialManifest = base.currentSnapshot().allManifests(table.io()).get(0); - validateManifest(initialManifest, seqs(1), ids(commitId), files(FILE_A), statuses(Status.ADDED)); + validateManifest( + initialManifest, seqs(1), ids(commitId), files(FILE_A), statuses(Status.ADDED)); // build the new spec using the table's schema, which uses fresh IDs - PartitionSpec newSpec = PartitionSpec.builderFor(base.schema()) - .bucket("id", 8) - .bucket("data", 8) - .build(); + PartitionSpec newSpec = + PartitionSpec.builderFor(base.schema()).bucket("id", 8).bucket("data", 8).build(); // commit the new partition spec to the table manually table.ops().commit(base, base.updatePartitionSpec(newSpec)); - V2Assert.assertEquals("Last sequence number should be 1", 1, readMetadata().lastSequenceNumber()); - V1Assert.assertEquals("Table should end with last-sequence-number 0", 0, readMetadata().lastSequenceNumber()); + V2Assert.assertEquals( + "Last sequence number should be 1", 1, readMetadata().lastSequenceNumber()); + V1Assert.assertEquals( + "Table should end with last-sequence-number 0", 0, readMetadata().lastSequenceNumber()); // create a new with the table's current spec - DataFile newFile = DataFiles.builder(table.spec()) - .withPath("/path/to/data-x.parquet") - .withFileSizeInBytes(10) - 
.withPartitionPath("id_bucket=1/data_bucket=1") - .withRecordCount(1) - .build(); - - table.newAppend() - .appendFile(newFile) - .commit(); + DataFile newFile = + DataFiles.builder(table.spec()) + .withPath("/path/to/data-x.parquet") + .withFileSizeInBytes(10) + .withPartitionPath("id_bucket=1/data_bucket=1") + .withRecordCount(1) + .build(); + + table.newAppend().appendFile(newFile).commit(); Snapshot committedSnapshot = table.currentSnapshot(); - V2Assert.assertEquals("Snapshot sequence number should be 2", 2, committedSnapshot.sequenceNumber()); - V2Assert.assertEquals("Last sequence number should be 2", 2, readMetadata().lastSequenceNumber()); - V1Assert.assertEquals("Table should end with last-sequence-number 0", 0, readMetadata().lastSequenceNumber()); + V2Assert.assertEquals( + "Snapshot sequence number should be 2", 2, committedSnapshot.sequenceNumber()); + V2Assert.assertEquals( + "Last sequence number should be 2", 2, readMetadata().lastSequenceNumber()); + V1Assert.assertEquals( + "Table should end with last-sequence-number 0", 0, readMetadata().lastSequenceNumber()); - Assert.assertEquals("Should use 2 manifest files", - 2, committedSnapshot.allManifests(table.io()).size()); + Assert.assertEquals( + "Should use 2 manifest files", 2, committedSnapshot.allManifests(table.io()).size()); // new manifest comes first - validateManifest(committedSnapshot.allManifests(table.io()).get(0), + validateManifest( + committedSnapshot.allManifests(table.io()).get(0), seqs(2), - ids(committedSnapshot.snapshotId()), files(newFile), - statuses(Status.ADDED) - ); - - Assert.assertEquals("Second manifest should be the initial manifest with the old spec", - initialManifest, committedSnapshot.allManifests(table.io()).get(1)); + ids(committedSnapshot.snapshotId()), + files(newFile), + statuses(Status.ADDED)); - // field ids of manifest entries in two manifests with different specs of the same source field should be different - ManifestEntry entry = ManifestFiles.read(committedSnapshot.allManifests(table.io()).get(0), FILE_IO) - .entries().iterator().next(); - Types.NestedField field = ((PartitionData) entry.file().partition()).getPartitionType().fields().get(0); + Assert.assertEquals( + "Second manifest should be the initial manifest with the old spec", + initialManifest, + committedSnapshot.allManifests(table.io()).get(1)); + + // field ids of manifest entries in two manifests with different specs of the same source field + // should be different + ManifestEntry entry = + ManifestFiles.read(committedSnapshot.allManifests(table.io()).get(0), FILE_IO) + .entries() + .iterator() + .next(); + Types.NestedField field = + ((PartitionData) entry.file().partition()).getPartitionType().fields().get(0); Assert.assertEquals(1000, field.fieldId()); Assert.assertEquals("id_bucket", field.name()); field = ((PartitionData) entry.file().partition()).getPartitionType().fields().get(1); Assert.assertEquals(1001, field.fieldId()); Assert.assertEquals("data_bucket", field.name()); - entry = ManifestFiles.read(committedSnapshot.allManifests(table.io()).get(1), FILE_IO).entries().iterator().next(); + entry = + ManifestFiles.read(committedSnapshot.allManifests(table.io()).get(1), FILE_IO) + .entries() + .iterator() + .next(); field = ((PartitionData) entry.file().partition()).getPartitionType().fields().get(0); Assert.assertEquals(1000, field.fieldId()); Assert.assertEquals("data_bucket", field.name()); @@ -1210,72 +1305,86 @@ public void testManifestEntryFieldIdsForChangedPartitionSpecForV1Table() { @Test public void 
testDefaultPartitionSummaries() { - table.newFastAppend() - .appendFile(FILE_A) - .commit(); - - Set partitionSummaryKeys = table.currentSnapshot().summary().keySet().stream() - .filter(key -> key.startsWith(SnapshotSummary.CHANGED_PARTITION_PREFIX)) - .collect(Collectors.toSet()); - Assert.assertEquals("Should include no partition summaries by default", 0, partitionSummaryKeys.size()); - - String summariesIncluded = table.currentSnapshot().summary() - .getOrDefault(SnapshotSummary.PARTITION_SUMMARY_PROP, "false"); - Assert.assertEquals("Should not set partition-summaries-included to true", "false", summariesIncluded); - - String changedPartitions = table.currentSnapshot().summary().get(SnapshotSummary.CHANGED_PARTITION_COUNT_PROP); + table.newFastAppend().appendFile(FILE_A).commit(); + + Set partitionSummaryKeys = + table.currentSnapshot().summary().keySet().stream() + .filter(key -> key.startsWith(SnapshotSummary.CHANGED_PARTITION_PREFIX)) + .collect(Collectors.toSet()); + Assert.assertEquals( + "Should include no partition summaries by default", 0, partitionSummaryKeys.size()); + + String summariesIncluded = + table + .currentSnapshot() + .summary() + .getOrDefault(SnapshotSummary.PARTITION_SUMMARY_PROP, "false"); + Assert.assertEquals( + "Should not set partition-summaries-included to true", "false", summariesIncluded); + + String changedPartitions = + table.currentSnapshot().summary().get(SnapshotSummary.CHANGED_PARTITION_COUNT_PROP); Assert.assertEquals("Should set changed partition count", "1", changedPartitions); } @Test public void testIncludedPartitionSummaries() { - table.updateProperties() - .set(TableProperties.WRITE_PARTITION_SUMMARY_LIMIT, "1") - .commit(); + table.updateProperties().set(TableProperties.WRITE_PARTITION_SUMMARY_LIMIT, "1").commit(); - table.newFastAppend() - .appendFile(FILE_A) - .commit(); + table.newFastAppend().appendFile(FILE_A).commit(); - Set partitionSummaryKeys = table.currentSnapshot().summary().keySet().stream() - .filter(key -> key.startsWith(SnapshotSummary.CHANGED_PARTITION_PREFIX)) - .collect(Collectors.toSet()); + Set partitionSummaryKeys = + table.currentSnapshot().summary().keySet().stream() + .filter(key -> key.startsWith(SnapshotSummary.CHANGED_PARTITION_PREFIX)) + .collect(Collectors.toSet()); Assert.assertEquals("Should include a partition summary", 1, partitionSummaryKeys.size()); - String summariesIncluded = table.currentSnapshot().summary() - .getOrDefault(SnapshotSummary.PARTITION_SUMMARY_PROP, "false"); - Assert.assertEquals("Should set partition-summaries-included to true", "true", summariesIncluded); + String summariesIncluded = + table + .currentSnapshot() + .summary() + .getOrDefault(SnapshotSummary.PARTITION_SUMMARY_PROP, "false"); + Assert.assertEquals( + "Should set partition-summaries-included to true", "true", summariesIncluded); - String changedPartitions = table.currentSnapshot().summary().get(SnapshotSummary.CHANGED_PARTITION_COUNT_PROP); + String changedPartitions = + table.currentSnapshot().summary().get(SnapshotSummary.CHANGED_PARTITION_COUNT_PROP); Assert.assertEquals("Should set changed partition count", "1", changedPartitions); - String partitionSummary = table.currentSnapshot().summary() - .get(SnapshotSummary.CHANGED_PARTITION_PREFIX + "data_bucket=0"); - Assert.assertEquals("Summary should include 1 file with 1 record that is 10 bytes", - "added-data-files=1,added-records=1,added-files-size=10", partitionSummary); + String partitionSummary = + table + .currentSnapshot() + .summary() + 
.get(SnapshotSummary.CHANGED_PARTITION_PREFIX + "data_bucket=0"); + Assert.assertEquals( + "Summary should include 1 file with 1 record that is 10 bytes", + "added-data-files=1,added-records=1,added-files-size=10", + partitionSummary); } @Test public void testIncludedPartitionSummaryLimit() { - table.updateProperties() - .set(TableProperties.WRITE_PARTITION_SUMMARY_LIMIT, "1") - .commit(); - - table.newFastAppend() - .appendFile(FILE_A) - .appendFile(FILE_B) - .commit(); - - Set partitionSummaryKeys = table.currentSnapshot().summary().keySet().stream() - .filter(key -> key.startsWith(SnapshotSummary.CHANGED_PARTITION_PREFIX)) - .collect(Collectors.toSet()); - Assert.assertEquals("Should include no partition summaries, over limit", 0, partitionSummaryKeys.size()); - - String summariesIncluded = table.currentSnapshot().summary() - .getOrDefault(SnapshotSummary.PARTITION_SUMMARY_PROP, "false"); - Assert.assertEquals("Should not set partition-summaries-included to true", "false", summariesIncluded); - - String changedPartitions = table.currentSnapshot().summary().get(SnapshotSummary.CHANGED_PARTITION_COUNT_PROP); + table.updateProperties().set(TableProperties.WRITE_PARTITION_SUMMARY_LIMIT, "1").commit(); + + table.newFastAppend().appendFile(FILE_A).appendFile(FILE_B).commit(); + + Set partitionSummaryKeys = + table.currentSnapshot().summary().keySet().stream() + .filter(key -> key.startsWith(SnapshotSummary.CHANGED_PARTITION_PREFIX)) + .collect(Collectors.toSet()); + Assert.assertEquals( + "Should include no partition summaries, over limit", 0, partitionSummaryKeys.size()); + + String summariesIncluded = + table + .currentSnapshot() + .summary() + .getOrDefault(SnapshotSummary.PARTITION_SUMMARY_PROP, "false"); + Assert.assertEquals( + "Should not set partition-summaries-included to true", "false", summariesIncluded); + + String changedPartitions = + table.currentSnapshot().summary().get(SnapshotSummary.CHANGED_PARTITION_COUNT_PROP); Assert.assertEquals("Should set changed partition count", "2", changedPartitions); } } diff --git a/core/src/test/java/org/apache/iceberg/TestMetadataTableFilters.java b/core/src/test/java/org/apache/iceberg/TestMetadataTableFilters.java index 12ffd34bc3ed..ecaa07450e90 100644 --- a/core/src/test/java/org/apache/iceberg/TestMetadataTableFilters.java +++ b/core/src/test/java/org/apache/iceberg/TestMetadataTableFilters.java @@ -16,9 +16,11 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg; +import static org.apache.iceberg.types.Types.NestedField.optional; +import static org.apache.iceberg.types.Types.NestedField.required; + import java.util.Set; import java.util.stream.StreamSupport; import org.apache.iceberg.BaseFilesTable.ManifestReadTask; @@ -36,31 +38,30 @@ import org.junit.runner.RunWith; import org.junit.runners.Parameterized; -import static org.apache.iceberg.types.Types.NestedField.optional; -import static org.apache.iceberg.types.Types.NestedField.required; - @RunWith(Parameterized.class) public class TestMetadataTableFilters extends TableTestBase { - private static final Set aggFileTables = Sets.newHashSet(MetadataTableType.ALL_DATA_FILES, - MetadataTableType.ALL_DATA_FILES, - MetadataTableType.ALL_FILES); + private static final Set aggFileTables = + Sets.newHashSet( + MetadataTableType.ALL_DATA_FILES, + MetadataTableType.ALL_DATA_FILES, + MetadataTableType.ALL_FILES); private final MetadataTableType type; @Parameterized.Parameters(name = "table_type = {0}, format = {1}") public static Object[][] parameters() { return new Object[][] { - { MetadataTableType.DATA_FILES, 1 }, - { MetadataTableType.DATA_FILES, 2 }, - { MetadataTableType.DELETE_FILES, 2 }, - { MetadataTableType.FILES, 1 }, - { MetadataTableType.FILES, 2 }, - { MetadataTableType.ALL_DATA_FILES, 1 }, - { MetadataTableType.ALL_DATA_FILES, 2 }, - { MetadataTableType.ALL_DELETE_FILES, 2 }, - { MetadataTableType.ALL_FILES, 1 }, - { MetadataTableType.ALL_FILES, 2 } + {MetadataTableType.DATA_FILES, 1}, + {MetadataTableType.DATA_FILES, 2}, + {MetadataTableType.DELETE_FILES, 2}, + {MetadataTableType.FILES, 1}, + {MetadataTableType.FILES, 2}, + {MetadataTableType.ALL_DATA_FILES, 1}, + {MetadataTableType.ALL_DATA_FILES, 2}, + {MetadataTableType.ALL_DELETE_FILES, 2}, + {MetadataTableType.ALL_FILES, 1}, + {MetadataTableType.ALL_FILES, 2} }; } @@ -73,43 +74,34 @@ public TestMetadataTableFilters(MetadataTableType type, int formatVersion) { @Override public void setupTable() throws Exception { super.setupTable(); - table.updateProperties() - .set(TableProperties.MANIFEST_MERGE_ENABLED, "false") - .commit(); - table.newFastAppend() - .appendFile(FILE_A) - .commit(); - table.newFastAppend() - .appendFile(FILE_C) - .commit(); - table.newFastAppend() - .appendFile(FILE_D) - .commit(); - table.newFastAppend() - .appendFile(FILE_B) - .commit(); + table.updateProperties().set(TableProperties.MANIFEST_MERGE_ENABLED, "false").commit(); + table.newFastAppend().appendFile(FILE_A).commit(); + table.newFastAppend().appendFile(FILE_C).commit(); + table.newFastAppend().appendFile(FILE_D).commit(); + table.newFastAppend().appendFile(FILE_B).commit(); if (formatVersion == 2) { - table.newRowDelta() - .addDeletes(FILE_A_DELETES) - .commit(); - table.newRowDelta() - .addDeletes(FILE_B_DELETES) - .commit(); - table.newRowDelta() - .addDeletes(FILE_C2_DELETES) - .commit(); - table.newRowDelta() - .addDeletes(FILE_D2_DELETES) - .commit(); + table.newRowDelta().addDeletes(FILE_A_DELETES).commit(); + table.newRowDelta().addDeletes(FILE_B_DELETES).commit(); + table.newRowDelta().addDeletes(FILE_C2_DELETES).commit(); + table.newRowDelta().addDeletes(FILE_D2_DELETES).commit(); } if (isAggFileTable(type)) { - // Clear all files from current snapshot to test whether 'all' Files tables scans previous files - table.newDelete().deleteFromRowFilter(Expressions.alwaysTrue()).commit(); // Moves file entries to DELETED state - table.newDelete().deleteFromRowFilter(Expressions.alwaysTrue()).commit(); // Removes all entries 
- Assert.assertEquals("Current snapshot should be made empty", - 0, table.currentSnapshot().allManifests(table.io()).size()); + // Clear all files from current snapshot to test whether 'all' Files tables scans previous + // files + table + .newDelete() + .deleteFromRowFilter(Expressions.alwaysTrue()) + .commit(); // Moves file entries to DELETED state + table + .newDelete() + .deleteFromRowFilter(Expressions.alwaysTrue()) + .commit(); // Removes all entries + Assert.assertEquals( + "Current snapshot should be made empty", + 0, + table.currentSnapshot().allManifests(table.io()).size()); } } @@ -164,10 +156,14 @@ private boolean isAggFileTable(MetadataTableType tableType) { @Test public void testNoFilter() { Table metadataTable = createMetadataTable(); - Types.StructType expected = new Schema( - required(102, "partition", Types.StructType.of( - optional(1000, "data_bucket", Types.IntegerType.get())), - "Partition data tuple, schema based on the partition spec")).asStruct(); + Types.StructType expected = + new Schema( + required( + 102, + "partition", + Types.StructType.of(optional(1000, "data_bucket", Types.IntegerType.get())), + "Partition data tuple, schema based on the partition spec")) + .asStruct(); TableScan scan = metadataTable.newScan().select("partition.data_bucket"); Assert.assertEquals(expected, scan.schema().asStruct()); @@ -184,9 +180,10 @@ public void testNoFilter() { public void testAnd() { Table metadataTable = createMetadataTable(); - Expression and = Expressions.and( - Expressions.equal("partition.data_bucket", 0), - Expressions.greaterThan("record_count", 0)); + Expression and = + Expressions.and( + Expressions.equal("partition.data_bucket", 0), + Expressions.greaterThan("record_count", 0)); TableScan scan = metadataTable.newScan().filter(and); CloseableIterable tasks = scan.planFiles(); @@ -210,9 +207,10 @@ public void testLt() { public void testOr() { Table metadataTable = createMetadataTable(); - Expression or = Expressions.or( - Expressions.equal("partition.data_bucket", 2), - Expressions.greaterThan("record_count", 0)); + Expression or = + Expressions.or( + Expressions.equal("partition.data_bucket", 2), + Expressions.greaterThan("record_count", 0)); TableScan scan = metadataTable.newScan().filter(or); CloseableIterable tasks = scan.planFiles(); @@ -270,9 +268,10 @@ public void testNotNull() { public void testPlanTasks() { Table metadataTable = createMetadataTable(); - Expression and = Expressions.and( - Expressions.equal("partition.data_bucket", 0), - Expressions.greaterThan("record_count", 0)); + Expression and = + Expressions.and( + Expressions.equal("partition.data_bucket", 0), + Expressions.greaterThan("record_count", 0)); TableScan scan = metadataTable.newScan().filter(and); CloseableIterable tasks = scan.planTasks(); @@ -285,58 +284,68 @@ public void testPartitionSpecEvolutionRemovalV1() { Assume.assumeTrue(formatVersion == 1); // Change spec and add two data files - table.updateSpec() - .removeField(Expressions.bucket("data", 16)) - .addField("id") - .commit(); + table.updateSpec().removeField(Expressions.bucket("data", 16)).addField("id").commit(); PartitionSpec newSpec = table.spec(); // Add two data files with new spec PartitionKey data10Key = new PartitionKey(newSpec, table.schema()); data10Key.set(1, 10); - DataFile data10 = DataFiles.builder(newSpec) - .withPath("/path/to/data-10.parquet") - .withRecordCount(10) - .withFileSizeInBytes(10) - .withPartition(data10Key) - .build(); + DataFile data10 = + DataFiles.builder(newSpec) + 
.withPath("/path/to/data-10.parquet") + .withRecordCount(10) + .withFileSizeInBytes(10) + .withPartition(data10Key) + .build(); PartitionKey data11Key = new PartitionKey(newSpec, table.schema()); data10Key.set(1, 11); - DataFile data11 = DataFiles.builder(newSpec) - .withPath("/path/to/data-11.parquet") - .withRecordCount(10) - .withFileSizeInBytes(10) - .withPartition(data11Key) - .build(); + DataFile data11 = + DataFiles.builder(newSpec) + .withPath("/path/to/data-11.parquet") + .withRecordCount(10) + .withFileSizeInBytes(10) + .withPartition(data11Key) + .build(); table.newFastAppend().appendFile(data10).commit(); table.newFastAppend().appendFile(data11).commit(); if (isAggFileTable(type)) { - // Clear all files from current snapshot to test whether 'all' Files tables scans previous files - table.newDelete().deleteFromRowFilter(Expressions.alwaysTrue()).commit(); // Moves file entries to DELETED state - table.newDelete().deleteFromRowFilter(Expressions.alwaysTrue()).commit(); // Removes all entries - Assert.assertEquals("Current snapshot should be made empty", - 0, table.currentSnapshot().allManifests(table.io()).size()); + // Clear all files from current snapshot to test whether 'all' Files tables scans previous + // files + table + .newDelete() + .deleteFromRowFilter(Expressions.alwaysTrue()) + .commit(); // Moves file entries to DELETED state + table + .newDelete() + .deleteFromRowFilter(Expressions.alwaysTrue()) + .commit(); // Removes all entries + Assert.assertEquals( + "Current snapshot should be made empty", + 0, + table.currentSnapshot().allManifests(table.io()).size()); } Table metadataTable = createMetadataTable(); - Expression filter = Expressions.and( - Expressions.equal("partition.id", 10), - Expressions.greaterThan("record_count", 0)); + Expression filter = + Expressions.and( + Expressions.equal("partition.id", 10), Expressions.greaterThan("record_count", 0)); TableScan scan = metadataTable.newScan().filter(filter); CloseableIterable tasks = scan.planFiles(); // All 4 original data files written by old spec, plus one data file written by new spec Assert.assertEquals(expectedScanTaskCount(5), Iterables.size(tasks)); - filter = Expressions.and( - Expressions.equal("partition.data_bucket", 0), - Expressions.greaterThan("record_count", 0)); + filter = + Expressions.and( + Expressions.equal("partition.data_bucket", 0), + Expressions.greaterThan("record_count", 0)); scan = metadataTable.newScan().filter(filter); tasks = scan.planFiles(); - // 1 original data file written by old spec (V1 filters out new specs which don't have this value) + // 1 original data file written by old spec (V1 filters out new specs which don't have this + // value) Assert.assertEquals(expectedScanTaskCount(1), Iterables.size(tasks)); } @@ -345,39 +354,41 @@ public void testPartitionSpecEvolutionRemovalV2() { Assume.assumeTrue(formatVersion == 2); // Change spec and add two data and delete files each - table.updateSpec() - .removeField(Expressions.bucket("data", 16)) - .addField("id").commit(); + table.updateSpec().removeField(Expressions.bucket("data", 16)).addField("id").commit(); PartitionSpec newSpec = table.spec(); // Add two data files and two delete files with new spec - DataFile data10 = DataFiles.builder(newSpec) - .withPath("/path/to/data-10.parquet") - .withRecordCount(10) - .withFileSizeInBytes(10) - .withPartitionPath("id=10") - .build(); - DataFile data11 = DataFiles.builder(newSpec) - .withPath("/path/to/data-11.parquet") - .withRecordCount(10) - .withFileSizeInBytes(10) - 
.withPartitionPath("id=11") - .build(); - - DeleteFile delete10 = FileMetadata.deleteFileBuilder(newSpec) - .ofPositionDeletes() - .withPath("/path/to/data-10-deletes.parquet") - .withFileSizeInBytes(10) - .withPartitionPath("id=10") - .withRecordCount(1) - .build(); - DeleteFile delete11 = FileMetadata.deleteFileBuilder(newSpec) - .ofPositionDeletes() - .withPath("/path/to/data-11-deletes.parquet") - .withFileSizeInBytes(10) - .withPartitionPath("id=11") - .withRecordCount(1) - .build(); + DataFile data10 = + DataFiles.builder(newSpec) + .withPath("/path/to/data-10.parquet") + .withRecordCount(10) + .withFileSizeInBytes(10) + .withPartitionPath("id=10") + .build(); + DataFile data11 = + DataFiles.builder(newSpec) + .withPath("/path/to/data-11.parquet") + .withRecordCount(10) + .withFileSizeInBytes(10) + .withPartitionPath("id=11") + .build(); + + DeleteFile delete10 = + FileMetadata.deleteFileBuilder(newSpec) + .ofPositionDeletes() + .withPath("/path/to/data-10-deletes.parquet") + .withFileSizeInBytes(10) + .withPartitionPath("id=10") + .withRecordCount(1) + .build(); + DeleteFile delete11 = + FileMetadata.deleteFileBuilder(newSpec) + .ofPositionDeletes() + .withPath("/path/to/data-11-deletes.parquet") + .withFileSizeInBytes(10) + .withPartitionPath("id=11") + .withRecordCount(1) + .build(); table.newFastAppend().appendFile(data10).commit(); table.newFastAppend().appendFile(data11).commit(); @@ -388,30 +399,42 @@ public void testPartitionSpecEvolutionRemovalV2() { } if (isAggFileTable(type)) { - // Clear all files from current snapshot to test whether 'all' Files tables scans previous files - table.newDelete().deleteFromRowFilter(Expressions.alwaysTrue()).commit(); // Moves file entries to DELETED state - table.newDelete().deleteFromRowFilter(Expressions.alwaysTrue()).commit(); // Removes all entries - Assert.assertEquals("Current snapshot should be made empty", - 0, table.currentSnapshot().allManifests(table.io()).size()); + // Clear all files from current snapshot to test whether 'all' Files tables scans previous + // files + table + .newDelete() + .deleteFromRowFilter(Expressions.alwaysTrue()) + .commit(); // Moves file entries to DELETED state + table + .newDelete() + .deleteFromRowFilter(Expressions.alwaysTrue()) + .commit(); // Removes all entries + Assert.assertEquals( + "Current snapshot should be made empty", + 0, + table.currentSnapshot().allManifests(table.io()).size()); } Table metadataTable = createMetadataTable(); - Expression filter = Expressions.and( - Expressions.equal("partition.id", 10), - Expressions.greaterThan("record_count", 0)); + Expression filter = + Expressions.and( + Expressions.equal("partition.id", 10), Expressions.greaterThan("record_count", 0)); TableScan scan = metadataTable.newScan().filter(filter); CloseableIterable tasks = scan.planFiles(); - // All 4 original data/delete files written by old spec, plus one new data file/delete file written by new spec + // All 4 original data/delete files written by old spec, plus one new data file/delete file + // written by new spec Assert.assertEquals(expectedScanTaskCount(5), Iterables.size(tasks)); - filter = Expressions.and( - Expressions.equal("partition.data_bucket", 0), - Expressions.greaterThan("record_count", 0)); + filter = + Expressions.and( + Expressions.equal("partition.data_bucket", 0), + Expressions.greaterThan("record_count", 0)); scan = metadataTable.newScan().filter(filter); tasks = scan.planFiles(); - // 1 original data/delete files written by old spec, plus both of new data file/delete file 
written by new spec + // 1 original data/delete files written by old spec, plus both of new data file/delete file + // written by new spec Assert.assertEquals(expectedScanTaskCount(3), Iterables.size(tasks)); } @@ -420,55 +443,66 @@ public void testPartitionSpecEvolutionAdditiveV1() { Assume.assumeTrue(formatVersion == 1); // Change spec and add two data files - table.updateSpec() - .addField("id") - .commit(); + table.updateSpec().addField("id").commit(); PartitionSpec newSpec = table.spec(); // Add two data files with new spec PartitionKey data10Key = new PartitionKey(newSpec, table.schema()); data10Key.set(0, 0); // data=0 data10Key.set(1, 10); // id=10 - DataFile data10 = DataFiles.builder(newSpec) - .withPath("/path/to/data-10.parquet") - .withRecordCount(10) - .withFileSizeInBytes(10) - .withPartition(data10Key) - .build(); + DataFile data10 = + DataFiles.builder(newSpec) + .withPath("/path/to/data-10.parquet") + .withRecordCount(10) + .withFileSizeInBytes(10) + .withPartition(data10Key) + .build(); PartitionKey data11Key = new PartitionKey(newSpec, table.schema()); data11Key.set(0, 1); // data=0 data10Key.set(1, 11); // id=11 - DataFile data11 = DataFiles.builder(newSpec) - .withPath("/path/to/data-11.parquet") - .withRecordCount(10) - .withFileSizeInBytes(10) - .withPartition(data11Key) - .build(); + DataFile data11 = + DataFiles.builder(newSpec) + .withPath("/path/to/data-11.parquet") + .withRecordCount(10) + .withFileSizeInBytes(10) + .withPartition(data11Key) + .build(); table.newFastAppend().appendFile(data10).commit(); table.newFastAppend().appendFile(data11).commit(); if (isAggFileTable(type)) { - // Clear all files from current snapshot to test whether 'all' Files tables scans previous files - table.newDelete().deleteFromRowFilter(Expressions.alwaysTrue()).commit(); // Moves file entries to DELETED state - table.newDelete().deleteFromRowFilter(Expressions.alwaysTrue()).commit(); // Removes all entries - Assert.assertEquals("Current snapshot should be made empty", - 0, table.currentSnapshot().allManifests(table.io()).size()); + // Clear all files from current snapshot to test whether 'all' Files tables scans previous + // files + table + .newDelete() + .deleteFromRowFilter(Expressions.alwaysTrue()) + .commit(); // Moves file entries to DELETED state + table + .newDelete() + .deleteFromRowFilter(Expressions.alwaysTrue()) + .commit(); // Removes all entries + Assert.assertEquals( + "Current snapshot should be made empty", + 0, + table.currentSnapshot().allManifests(table.io()).size()); } Table metadataTable = createMetadataTable(); - Expression filter = Expressions.and( - Expressions.equal("partition.id", 10), - Expressions.greaterThan("record_count", 0)); + Expression filter = + Expressions.and( + Expressions.equal("partition.id", 10), Expressions.greaterThan("record_count", 0)); TableScan scan = metadataTable.newScan().filter(filter); CloseableIterable tasks = scan.planFiles(); - // All 4 original data/delete files written by old spec, plus one new data file written by new spec + // All 4 original data/delete files written by old spec, plus one new data file written by new + // spec Assert.assertEquals(expectedScanTaskCount(5), Iterables.size(tasks)); - filter = Expressions.and( - Expressions.equal("partition.data_bucket", 0), - Expressions.greaterThan("record_count", 0)); + filter = + Expressions.and( + Expressions.equal("partition.data_bucket", 0), + Expressions.greaterThan("record_count", 0)); scan = metadataTable.newScan().filter(filter); tasks = scan.planFiles(); @@ 
-481,39 +515,41 @@ public void testPartitionSpecEvolutionAdditiveV2() { Assume.assumeTrue(formatVersion == 2); // Change spec and add two data and delete files each - table.updateSpec() - .addField("id") - .commit(); + table.updateSpec().addField("id").commit(); PartitionSpec newSpec = table.spec(); // Add two data files and two delete files with new spec - DataFile data10 = DataFiles.builder(newSpec) - .withPath("/path/to/data-10.parquet") - .withRecordCount(10) - .withFileSizeInBytes(10) - .withPartitionPath("data_bucket=0/id=10") - .build(); - DataFile data11 = DataFiles.builder(newSpec) - .withPath("/path/to/data-11.parquet") - .withRecordCount(10) - .withFileSizeInBytes(10) - .withPartitionPath("data_bucket=1/id=11") - .build(); - - DeleteFile delete10 = FileMetadata.deleteFileBuilder(newSpec) - .ofPositionDeletes() - .withPath("/path/to/data-10-deletes.parquet") - .withFileSizeInBytes(10) - .withPartitionPath("data_bucket=0/id=10") - .withRecordCount(1) - .build(); - DeleteFile delete11 = FileMetadata.deleteFileBuilder(newSpec) - .ofPositionDeletes() - .withPath("/path/to/data-11-deletes.parquet") - .withFileSizeInBytes(10) - .withPartitionPath("data_bucket=1/id=11") - .withRecordCount(1) - .build(); + DataFile data10 = + DataFiles.builder(newSpec) + .withPath("/path/to/data-10.parquet") + .withRecordCount(10) + .withFileSizeInBytes(10) + .withPartitionPath("data_bucket=0/id=10") + .build(); + DataFile data11 = + DataFiles.builder(newSpec) + .withPath("/path/to/data-11.parquet") + .withRecordCount(10) + .withFileSizeInBytes(10) + .withPartitionPath("data_bucket=1/id=11") + .build(); + + DeleteFile delete10 = + FileMetadata.deleteFileBuilder(newSpec) + .ofPositionDeletes() + .withPath("/path/to/data-10-deletes.parquet") + .withFileSizeInBytes(10) + .withPartitionPath("data_bucket=0/id=10") + .withRecordCount(1) + .build(); + DeleteFile delete11 = + FileMetadata.deleteFileBuilder(newSpec) + .ofPositionDeletes() + .withPath("/path/to/data-11-deletes.parquet") + .withFileSizeInBytes(10) + .withPartitionPath("data_bucket=1/id=11") + .withRecordCount(1) + .build(); table.newFastAppend().appendFile(data10).commit(); table.newFastAppend().appendFile(data11).commit(); @@ -524,39 +560,54 @@ public void testPartitionSpecEvolutionAdditiveV2() { } if (isAggFileTable(type)) { - // Clear all files from current snapshot to test whether 'all' Files tables scans previous files - table.newDelete().deleteFromRowFilter(Expressions.alwaysTrue()).commit(); // Moves file entries to DELETED state - table.newDelete().deleteFromRowFilter(Expressions.alwaysTrue()).commit(); // Removes all entries - Assert.assertEquals("Current snapshot should be made empty", - 0, table.currentSnapshot().allManifests(table.io()).size()); + // Clear all files from current snapshot to test whether 'all' Files tables scans previous + // files + table + .newDelete() + .deleteFromRowFilter(Expressions.alwaysTrue()) + .commit(); // Moves file entries to DELETED state + table + .newDelete() + .deleteFromRowFilter(Expressions.alwaysTrue()) + .commit(); // Removes all entries + Assert.assertEquals( + "Current snapshot should be made empty", + 0, + table.currentSnapshot().allManifests(table.io()).size()); } Table metadataTable = createMetadataTable(); - Expression filter = Expressions.and( - Expressions.equal("partition.id", 10), - Expressions.greaterThan("record_count", 0)); + Expression filter = + Expressions.and( + Expressions.equal("partition.id", 10), Expressions.greaterThan("record_count", 0)); TableScan scan = 
metadataTable.newScan().filter(filter); CloseableIterable tasks = scan.planFiles(); - // All 4 original data/delete files written by old spec, plus one new data file/delete file written by new spec + // All 4 original data/delete files written by old spec, plus one new data file/delete file + // written by new spec Assert.assertEquals(expectedScanTaskCount(5), Iterables.size(tasks)); - filter = Expressions.and( - Expressions.equal("partition.data_bucket", 0), - Expressions.greaterThan("record_count", 0)); + filter = + Expressions.and( + Expressions.equal("partition.data_bucket", 0), + Expressions.greaterThan("record_count", 0)); scan = metadataTable.newScan().filter(filter); tasks = scan.planFiles(); - // 1 original data/delete files written by old spec, plus 1 of new data file/delete file written by new spec + // 1 original data/delete files written by old spec, plus 1 of new data file/delete file written + // by new spec Assert.assertEquals(expectedScanTaskCount(2), Iterables.size(tasks)); } private void validateFileScanTasks(CloseableIterable fileScanTasks, int partValue) { - Assert.assertTrue("File scan tasks do not include correct file", - StreamSupport.stream(fileScanTasks.spliterator(), false).anyMatch(t -> { - ManifestFile mf = ((ManifestReadTask) t).manifest(); - return manifestHasPartition(mf, partValue); - })); + Assert.assertTrue( + "File scan tasks do not include correct file", + StreamSupport.stream(fileScanTasks.spliterator(), false) + .anyMatch( + t -> { + ManifestFile mf = ((ManifestReadTask) t).manifest(); + return manifestHasPartition(mf, partValue); + })); } private void validateCombinedScanTasks(CloseableIterable tasks, int partValue) { @@ -566,8 +617,10 @@ private void validateCombinedScanTasks(CloseableIterable tasks } private boolean manifestHasPartition(ManifestFile mf, int partValue) { - int lower = Conversions.fromByteBuffer(Types.IntegerType.get(), mf.partitions().get(0).lowerBound()); - int upper = Conversions.fromByteBuffer(Types.IntegerType.get(), mf.partitions().get(0).upperBound()); + int lower = + Conversions.fromByteBuffer(Types.IntegerType.get(), mf.partitions().get(0).lowerBound()); + int upper = + Conversions.fromByteBuffer(Types.IntegerType.get(), mf.partitions().get(0).upperBound()); return (lower <= partValue) && (upper >= partValue); } } diff --git a/core/src/test/java/org/apache/iceberg/TestMetadataTableScans.java b/core/src/test/java/org/apache/iceberg/TestMetadataTableScans.java index db78d1cde851..bb1958fb6766 100644 --- a/core/src/test/java/org/apache/iceberg/TestMetadataTableScans.java +++ b/core/src/test/java/org/apache/iceberg/TestMetadataTableScans.java @@ -16,9 +16,11 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg; +import static org.apache.iceberg.types.Types.NestedField.optional; +import static org.apache.iceberg.types.Types.NestedField.required; + import java.io.File; import java.io.IOException; import java.util.Set; @@ -41,15 +43,12 @@ import org.junit.runner.RunWith; import org.junit.runners.Parameterized; -import static org.apache.iceberg.types.Types.NestedField.optional; -import static org.apache.iceberg.types.Types.NestedField.required; - @RunWith(Parameterized.class) public class TestMetadataTableScans extends TableTestBase { @Parameterized.Parameters(name = "formatVersion = {0}") public static Object[] parameters() { - return new Object[] { 1, 2 }; + return new Object[] {1, 2}; } public TestMetadataTableScans(int formatVersion) { @@ -60,42 +59,23 @@ private void preparePartitionedTable() { preparePartitionedTableData(); if (formatVersion == 2) { - table.newRowDelta() - .addDeletes(FILE_A_DELETES) - .commit(); - table.newRowDelta() - .addDeletes(FILE_B_DELETES) - .commit(); - table.newRowDelta() - .addDeletes(FILE_C2_DELETES) - .commit(); - table.newRowDelta() - .addDeletes(FILE_D2_DELETES) - .commit(); + table.newRowDelta().addDeletes(FILE_A_DELETES).commit(); + table.newRowDelta().addDeletes(FILE_B_DELETES).commit(); + table.newRowDelta().addDeletes(FILE_C2_DELETES).commit(); + table.newRowDelta().addDeletes(FILE_D2_DELETES).commit(); } } private void preparePartitionedTableData() { - table.newFastAppend() - .appendFile(FILE_A) - .commit(); - table.newFastAppend() - .appendFile(FILE_C) - .commit(); - table.newFastAppend() - .appendFile(FILE_D) - .commit(); - table.newFastAppend() - .appendFile(FILE_B) - .commit(); + table.newFastAppend().appendFile(FILE_A).commit(); + table.newFastAppend().appendFile(FILE_C).commit(); + table.newFastAppend().appendFile(FILE_D).commit(); + table.newFastAppend().appendFile(FILE_B).commit(); } @Test public void testManifestsTableWithDroppedPartition() throws IOException { - table.newFastAppend() - .appendFile(FILE_A) - .appendFile(FILE_B) - .commit(); + table.newFastAppend().appendFile(FILE_A).appendFile(FILE_B).commit(); table.updateSpec().removeField(Expressions.bucket("data", 16)).commit(); table.refresh(); @@ -118,15 +98,11 @@ public void testManifestsTableWithDroppedPartition() throws IOException { @Test public void testManifestsTableAlwaysIgnoresResiduals() throws IOException { - table.newFastAppend() - .appendFile(FILE_A) - .appendFile(FILE_B) - .commit(); + table.newFastAppend().appendFile(FILE_A).appendFile(FILE_B).commit(); Table manifestsTable = new ManifestsTable(table.ops(), table); - TableScan scan = manifestsTable.newScan() - .filter(Expressions.lessThan("length", 10000L)); + TableScan scan = manifestsTable.newScan().filter(Expressions.lessThan("length", 10000L)); try (CloseableIterable tasks = scan.planFiles()) { Assert.assertTrue("Tasks should not be empty", Iterables.size(tasks) > 0); @@ -138,10 +114,7 @@ public void testManifestsTableAlwaysIgnoresResiduals() throws IOException { @Test public void testDataFilesTableWithDroppedPartition() throws IOException { - table.newFastAppend() - .appendFile(FILE_A) - .appendFile(FILE_B) - .commit(); + table.newFastAppend().appendFile(FILE_A).appendFile(FILE_B).commit(); table.updateSpec().removeField(Expressions.bucket("data", 16)).commit(); table.refresh(); @@ -164,48 +137,38 @@ public void testDataFilesTableWithDroppedPartition() throws IOException { @Test public void testDataFilesTableHonorsIgnoreResiduals() throws IOException { - table.newFastAppend() - 
.appendFile(FILE_A) - .appendFile(FILE_B) - .commit(); + table.newFastAppend().appendFile(FILE_A).appendFile(FILE_B).commit(); Table dataFilesTable = new DataFilesTable(table.ops(), table); - TableScan scan1 = dataFilesTable.newScan() - .filter(Expressions.equal("record_count", 1)); + TableScan scan1 = dataFilesTable.newScan().filter(Expressions.equal("record_count", 1)); validateTaskScanResiduals(scan1, false); - TableScan scan2 = dataFilesTable.newScan() - .filter(Expressions.equal("record_count", 1)) - .ignoreResiduals(); + TableScan scan2 = + dataFilesTable.newScan().filter(Expressions.equal("record_count", 1)).ignoreResiduals(); validateTaskScanResiduals(scan2, true); } @Test public void testManifestEntriesTableHonorsIgnoreResiduals() throws IOException { - table.newFastAppend() - .appendFile(FILE_A) - .appendFile(FILE_B) - .commit(); + table.newFastAppend().appendFile(FILE_A).appendFile(FILE_B).commit(); Table manifestEntriesTable = new ManifestEntriesTable(table.ops(), table); - TableScan scan1 = manifestEntriesTable.newScan() - .filter(Expressions.equal("snapshot_id", 1L)); + TableScan scan1 = manifestEntriesTable.newScan().filter(Expressions.equal("snapshot_id", 1L)); validateTaskScanResiduals(scan1, false); - TableScan scan2 = manifestEntriesTable.newScan() - .filter(Expressions.equal("snapshot_id", 1L)) - .ignoreResiduals(); + TableScan scan2 = + manifestEntriesTable + .newScan() + .filter(Expressions.equal("snapshot_id", 1L)) + .ignoreResiduals(); validateTaskScanResiduals(scan2, true); } @Test public void testManifestEntriesTableWithDroppedPartition() throws IOException { - table.newFastAppend() - .appendFile(FILE_A) - .appendFile(FILE_B) - .commit(); + table.newFastAppend().appendFile(FILE_A).appendFile(FILE_B).commit(); table.updateSpec().removeField(Expressions.bucket("data", 16)).commit(); table.refresh(); @@ -228,29 +191,21 @@ public void testManifestEntriesTableWithDroppedPartition() throws IOException { @Test public void testAllDataFilesTableHonorsIgnoreResiduals() throws IOException { - table.newFastAppend() - .appendFile(FILE_A) - .appendFile(FILE_B) - .commit(); + table.newFastAppend().appendFile(FILE_A).appendFile(FILE_B).commit(); Table allDataFilesTable = new AllDataFilesTable(table.ops(), table); - TableScan scan1 = allDataFilesTable.newScan() - .filter(Expressions.equal("record_count", 1)); + TableScan scan1 = allDataFilesTable.newScan().filter(Expressions.equal("record_count", 1)); validateTaskScanResiduals(scan1, false); - TableScan scan2 = allDataFilesTable.newScan() - .filter(Expressions.equal("record_count", 1)) - .ignoreResiduals(); + TableScan scan2 = + allDataFilesTable.newScan().filter(Expressions.equal("record_count", 1)).ignoreResiduals(); validateTaskScanResiduals(scan2, true); } @Test public void testAllDataFilesTableWithDroppedPartition() throws IOException { - table.newFastAppend() - .appendFile(FILE_A) - .appendFile(FILE_B) - .commit(); + table.newFastAppend().appendFile(FILE_A).appendFile(FILE_B).commit(); table.updateSpec().removeField(Expressions.bucket("data", 16)).commit(); table.refresh(); @@ -273,29 +228,21 @@ public void testAllDataFilesTableWithDroppedPartition() throws IOException { @Test public void testAllEntriesTableHonorsIgnoreResiduals() throws IOException { - table.newFastAppend() - .appendFile(FILE_A) - .appendFile(FILE_B) - .commit(); + table.newFastAppend().appendFile(FILE_A).appendFile(FILE_B).commit(); Table allEntriesTable = new AllEntriesTable(table.ops(), table); - TableScan scan1 = allEntriesTable.newScan() - 
.filter(Expressions.equal("snapshot_id", 1L)); + TableScan scan1 = allEntriesTable.newScan().filter(Expressions.equal("snapshot_id", 1L)); validateTaskScanResiduals(scan1, false); - TableScan scan2 = allEntriesTable.newScan() - .filter(Expressions.equal("snapshot_id", 1L)) - .ignoreResiduals(); + TableScan scan2 = + allEntriesTable.newScan().filter(Expressions.equal("snapshot_id", 1L)).ignoreResiduals(); validateTaskScanResiduals(scan2, true); } @Test public void testAllEntriesTableWithDroppedPartition() throws IOException { - table.newFastAppend() - .appendFile(FILE_A) - .appendFile(FILE_B) - .commit(); + table.newFastAppend().appendFile(FILE_A).appendFile(FILE_B).commit(); table.updateSpec().removeField(Expressions.bucket("data", 16)).commit(); table.refresh(); @@ -318,10 +265,7 @@ public void testAllEntriesTableWithDroppedPartition() throws IOException { @Test public void testAllManifestsTableWithDroppedPartition() throws IOException { - table.newFastAppend() - .appendFile(FILE_A) - .appendFile(FILE_B) - .commit(); + table.newFastAppend().appendFile(FILE_A).appendFile(FILE_B).commit(); table.updateSpec().removeField(Expressions.bucket("data", 16)).commit(); table.refresh(); @@ -345,20 +289,18 @@ public void testAllManifestsTableWithDroppedPartition() throws IOException { @Test public void testAllManifestsTableHonorsIgnoreResiduals() throws IOException { - table.newFastAppend() - .appendFile(FILE_A) - .appendFile(FILE_B) - .commit(); + table.newFastAppend().appendFile(FILE_A).appendFile(FILE_B).commit(); Table allManifestsTable = new AllManifestsTable(table.ops(), table); - TableScan scan1 = allManifestsTable.newScan() - .filter(Expressions.lessThan("length", 10000L)); + TableScan scan1 = allManifestsTable.newScan().filter(Expressions.lessThan("length", 10000L)); validateTaskScanResiduals(scan1, false); - TableScan scan2 = allManifestsTable.newScan() - .filter(Expressions.lessThan("length", 10000L)) - .ignoreResiduals(); + TableScan scan2 = + allManifestsTable + .newScan() + .filter(Expressions.lessThan("length", 10000L)) + .ignoreResiduals(); validateTaskScanResiduals(scan2, true); } @@ -367,13 +309,18 @@ public void testPartitionsTableScanNoFilter() { preparePartitionedTable(); Table partitionsTable = new PartitionsTable(table.ops(), table); - Types.StructType expected = new Schema( - required(1, "partition", Types.StructType.of( - optional(1000, "data_bucket", Types.IntegerType.get())))).asStruct(); + Types.StructType expected = + new Schema( + required( + 1, + "partition", + Types.StructType.of(optional(1000, "data_bucket", Types.IntegerType.get())))) + .asStruct(); TableScan scanNoFilter = partitionsTable.newScan().select("partition.data_bucket"); Assert.assertEquals(expected, scanNoFilter.schema().asStruct()); - CloseableIterable tasksNoFilter = PartitionsTable.planFiles((StaticTableScan) scanNoFilter); + CloseableIterable tasksNoFilter = + PartitionsTable.planFiles((StaticTableScan) scanNoFilter); Assert.assertEquals(4, Iterators.size(tasksNoFilter.iterator())); validateIncludesPartitionScan(tasksNoFilter, 0); validateIncludesPartitionScan(tasksNoFilter, 1); @@ -386,9 +333,8 @@ public void testPartitionsTableScanWithProjection() { preparePartitionedTable(); Table partitionsTable = new PartitionsTable(table.ops(), table); - Types.StructType expected = new Schema( - required(3, "file_count", Types.IntegerType.get()) - ).asStruct(); + Types.StructType expected = + new Schema(required(3, "file_count", Types.IntegerType.get())).asStruct(); TableScan scanWithProjection = 
partitionsTable.newScan().select("file_count"); Assert.assertEquals(expected, scanWithProjection.schema().asStruct()); @@ -403,12 +349,11 @@ public void testPartitionsTableScanWithProjection() { @Test public void testPartitionsTableScanNoStats() { - table.newFastAppend() - .appendFile(FILE_WITH_STATS) - .commit(); + table.newFastAppend().appendFile(FILE_WITH_STATS).commit(); Table partitionsTable = new PartitionsTable(table.ops(), table); - CloseableIterable tasksAndEq = PartitionsTable.planFiles((StaticTableScan) partitionsTable.newScan()); + CloseableIterable tasksAndEq = + PartitionsTable.planFiles((StaticTableScan) partitionsTable.newScan()); for (FileScanTask fileTask : tasksAndEq) { Assert.assertNull(fileTask.file().columnSizes()); Assert.assertNull(fileTask.file().valueCounts()); @@ -424,11 +369,13 @@ public void testPartitionsTableScanAndFilter() { Table partitionsTable = new PartitionsTable(table.ops(), table); - Expression andEquals = Expressions.and( - Expressions.equal("partition.data_bucket", 0), - Expressions.greaterThan("record_count", 0)); + Expression andEquals = + Expressions.and( + Expressions.equal("partition.data_bucket", 0), + Expressions.greaterThan("record_count", 0)); TableScan scanAndEq = partitionsTable.newScan().filter(andEquals); - CloseableIterable tasksAndEq = PartitionsTable.planFiles((StaticTableScan) scanAndEq); + CloseableIterable tasksAndEq = + PartitionsTable.planFiles((StaticTableScan) scanAndEq); Assert.assertEquals(1, Iterators.size(tasksAndEq.iterator())); validateIncludesPartitionScan(tasksAndEq, 0); } @@ -439,11 +386,13 @@ public void testPartitionsTableScanLtFilter() { Table partitionsTable = new PartitionsTable(table.ops(), table); - Expression ltAnd = Expressions.and( - Expressions.lessThan("partition.data_bucket", 2), - Expressions.greaterThan("record_count", 0)); + Expression ltAnd = + Expressions.and( + Expressions.lessThan("partition.data_bucket", 2), + Expressions.greaterThan("record_count", 0)); TableScan scanLtAnd = partitionsTable.newScan().filter(ltAnd); - CloseableIterable tasksLtAnd = PartitionsTable.planFiles((StaticTableScan) scanLtAnd); + CloseableIterable tasksLtAnd = + PartitionsTable.planFiles((StaticTableScan) scanLtAnd); Assert.assertEquals(2, Iterators.size(tasksLtAnd.iterator())); validateIncludesPartitionScan(tasksLtAnd, 0); validateIncludesPartitionScan(tasksLtAnd, 1); @@ -455,9 +404,10 @@ public void testPartitionsTableScanOrFilter() { Table partitionsTable = new PartitionsTable(table.ops(), table); - Expression or = Expressions.or( - Expressions.equal("partition.data_bucket", 2), - Expressions.greaterThan("record_count", 0)); + Expression or = + Expressions.or( + Expressions.equal("partition.data_bucket", 2), + Expressions.greaterThan("record_count", 0)); TableScan scanOr = partitionsTable.newScan().filter(or); CloseableIterable tasksOr = PartitionsTable.planFiles((StaticTableScan) scanOr); Assert.assertEquals(4, Iterators.size(tasksOr.iterator())); @@ -467,7 +417,6 @@ public void testPartitionsTableScanOrFilter() { validateIncludesPartitionScan(tasksOr, 3); } - @Test public void testPartitionsScanNotFilter() { preparePartitionedTable(); @@ -503,7 +452,8 @@ public void testPartitionsTableScanNotNullFilter() { Expression unary = Expressions.notNull("partition.data_bucket"); TableScan scanUnary = partitionsTable.newScan().filter(unary); - CloseableIterable tasksUnary = PartitionsTable.planFiles((StaticTableScan) scanUnary); + CloseableIterable tasksUnary = + PartitionsTable.planFiles((StaticTableScan) scanUnary); 
Assert.assertEquals(4, Iterators.size(tasksUnary.iterator())); validateIncludesPartitionScan(tasksUnary, 0); validateIncludesPartitionScan(tasksUnary, 1); @@ -518,9 +468,12 @@ public void testFilesTableScanWithDroppedPartition() throws IOException { table.updateSpec().removeField(Expressions.bucket("data", 16)).commit(); table.refresh(); - // Here we need to specify target name as 'data_bucket_16'. If unspecified a default name will be generated. As per - // https://github.com/apache/iceberg/pull/4868 there's an inconsistency of doing this: in V2, the above removed - // data_bucket would be recycled in favor of data_bucket_16. By specifying the target name, we explicitly require + // Here we need to specify target name as 'data_bucket_16'. If unspecified a default name will + // be generated. As per + // https://github.com/apache/iceberg/pull/4868 there's an inconsistency of doing this: in V2, + // the above removed + // data_bucket would be recycled in favor of data_bucket_16. By specifying the target name, we + // explicitly require // data_bucket not to be recycled. table.updateSpec().addField("data_bucket_16", Expressions.bucket("data", 16)).commit(); table.refresh(); @@ -535,18 +488,21 @@ public void testFilesTableScanWithDroppedPartition() throws IOException { Schema schema = dataFilesTable.schema(); Types.StructType actualType = schema.findField(DataFile.PARTITION_ID).type().asStructType(); - Types.StructType expectedType = Types.StructType.of( - Types.NestedField.optional(1000, "data_bucket", Types.IntegerType.get()), - Types.NestedField.optional(1001, "data_bucket_16", Types.IntegerType.get()), - Types.NestedField.optional(1002, "data_trunc_2", Types.StringType.get()) - ); + Types.StructType expectedType = + Types.StructType.of( + Types.NestedField.optional(1000, "data_bucket", Types.IntegerType.get()), + Types.NestedField.optional(1001, "data_bucket_16", Types.IntegerType.get()), + Types.NestedField.optional(1002, "data_trunc_2", Types.StringType.get())); Assert.assertEquals("Partition type must match", expectedType, actualType); Accessor accessor = schema.accessorForField(1000); try (CloseableIterable tasks = scan.planFiles()) { - Set results = StreamSupport.stream(tasks.spliterator(), false) - .flatMap(fileScanTask -> Streams.stream(fileScanTask.asDataTask().rows())) - .map(accessor::get).map(i -> (Integer) i).collect(Collectors.toSet()); + Set results = + StreamSupport.stream(tasks.spliterator(), false) + .flatMap(fileScanTask -> Streams.stream(fileScanTask.asDataTask().rows())) + .map(accessor::get) + .map(i -> (Integer) i) + .collect(Collectors.toSet()); Assert.assertEquals("Partition value must match", Sets.newHashSet(0, 1, 2, 3), results); } } @@ -555,26 +511,28 @@ public void testFilesTableScanWithDroppedPartition() throws IOException { public void testDeleteFilesTableSelection() throws IOException { Assume.assumeTrue("Only V2 Tables Support Deletes", formatVersion >= 2); - table.newFastAppend() - .appendFile(FILE_A) - .commit(); + table.newFastAppend().appendFile(FILE_A).commit(); - table.newRowDelta() - .addDeletes(FILE_A_DELETES) - .addDeletes(FILE_A2_DELETES) - .commit(); + table.newRowDelta().addDeletes(FILE_A_DELETES).addDeletes(FILE_A2_DELETES).commit(); Table deleteFilesTable = new DeleteFilesTable(table.ops(), table); - TableScan scan = deleteFilesTable.newScan() - .filter(Expressions.equal("record_count", 1)) - .select("content", "record_count"); + TableScan scan = + deleteFilesTable + .newScan() + .filter(Expressions.equal("record_count", 1)) + 
.select("content", "record_count"); validateTaskScanResiduals(scan, false); - Types.StructType expected = new Schema( - optional(134, "content", Types.IntegerType.get(), - "Contents of the file: 0=data, 1=position deletes, 2=equality deletes"), - required(103, "record_count", Types.LongType.get(), "Number of records in the file") - ).asStruct(); + Types.StructType expected = + new Schema( + optional( + 134, + "content", + Types.IntegerType.get(), + "Contents of the file: 0=data, 1=position deletes, 2=equality deletes"), + required( + 103, "record_count", Types.LongType.get(), "Number of records in the file")) + .asStruct(); Assert.assertEquals(expected, scan.schema().asStruct()); } @@ -583,47 +541,48 @@ public void testPartitionSpecEvolutionAdditive() { preparePartitionedTable(); // Change spec and add two data files - table.updateSpec() - .addField("id") - .commit(); + table.updateSpec().addField("id").commit(); PartitionSpec newSpec = table.spec(); // Add two data files with new spec PartitionKey data10Key = new PartitionKey(newSpec, table.schema()); data10Key.set(0, 0); // data=0 data10Key.set(1, 10); // id=10 - DataFile data10 = DataFiles.builder(newSpec) - .withPath("/path/to/data-10.parquet") - .withRecordCount(10) - .withFileSizeInBytes(10) - .withPartition(data10Key) - .build(); + DataFile data10 = + DataFiles.builder(newSpec) + .withPath("/path/to/data-10.parquet") + .withRecordCount(10) + .withFileSizeInBytes(10) + .withPartition(data10Key) + .build(); PartitionKey data11Key = new PartitionKey(newSpec, table.schema()); data11Key.set(0, 1); // data=0 data10Key.set(1, 11); // id=11 - DataFile data11 = DataFiles.builder(newSpec) - .withPath("/path/to/data-11.parquet") - .withRecordCount(10) - .withFileSizeInBytes(10) - .withPartition(data11Key) - .build(); + DataFile data11 = + DataFiles.builder(newSpec) + .withPath("/path/to/data-11.parquet") + .withRecordCount(10) + .withFileSizeInBytes(10) + .withPartition(data11Key) + .build(); table.newFastAppend().appendFile(data10).commit(); table.newFastAppend().appendFile(data11).commit(); Table metadataTable = new PartitionsTable(table.ops(), table); - Expression filter = Expressions.and( - Expressions.equal("partition.id", 10), - Expressions.greaterThan("record_count", 0)); + Expression filter = + Expressions.and( + Expressions.equal("partition.id", 10), Expressions.greaterThan("record_count", 0)); TableScan scan = metadataTable.newScan().filter(filter); CloseableIterable tasks = PartitionsTable.planFiles((StaticTableScan) scan); // Four data files of old spec, one new data file of new spec Assert.assertEquals(5, Iterables.size(tasks)); - filter = Expressions.and( - Expressions.equal("partition.data_bucket", 0), - Expressions.greaterThan("record_count", 0)); + filter = + Expressions.and( + Expressions.equal("partition.data_bucket", 0), + Expressions.greaterThan("record_count", 0)); scan = metadataTable.newScan().filter(filter); tasks = PartitionsTable.planFiles((StaticTableScan) scan); @@ -636,10 +595,7 @@ public void testPartitionSpecEvolutionRemoval() { preparePartitionedTable(); // Remove partition field - table.updateSpec() - .removeField(Expressions.bucket("data", 16)) - .addField("id") - .commit(); + table.updateSpec().removeField(Expressions.bucket("data", 16)).addField("id").commit(); PartitionSpec newSpec = table.spec(); // Add two data files with new spec @@ -647,38 +603,42 @@ public void testPartitionSpecEvolutionRemoval() { int partIndex = (formatVersion == 1) ? 
1 : 0; PartitionKey data10Key = new PartitionKey(newSpec, table.schema()); data10Key.set(partIndex, 10); - DataFile data10 = DataFiles.builder(newSpec) - .withPath("/path/to/data-10.parquet") - .withRecordCount(10) - .withFileSizeInBytes(10) - .withPartition(data10Key) - .build(); + DataFile data10 = + DataFiles.builder(newSpec) + .withPath("/path/to/data-10.parquet") + .withRecordCount(10) + .withFileSizeInBytes(10) + .withPartition(data10Key) + .build(); PartitionKey data11Key = new PartitionKey(newSpec, table.schema()); data11Key.set(partIndex, 11); - DataFile data11 = DataFiles.builder(newSpec) - .withPath("/path/to/data-11.parquet") - .withRecordCount(10) - .withFileSizeInBytes(10) - .withPartition(data11Key) - .build(); + DataFile data11 = + DataFiles.builder(newSpec) + .withPath("/path/to/data-11.parquet") + .withRecordCount(10) + .withFileSizeInBytes(10) + .withPartition(data11Key) + .build(); table.newFastAppend().appendFile(data10).commit(); table.newFastAppend().appendFile(data11).commit(); Table metadataTable = new PartitionsTable(table.ops(), table); - Expression filter = Expressions.and( - Expressions.equal("partition.id", 10), - Expressions.greaterThan("record_count", 0)); + Expression filter = + Expressions.and( + Expressions.equal("partition.id", 10), Expressions.greaterThan("record_count", 0)); TableScan scan = metadataTable.newScan().filter(filter); CloseableIterable tasks = PartitionsTable.planFiles((StaticTableScan) scan); // Four original files of original spec, one data file written by new spec Assert.assertEquals(5, Iterables.size(tasks)); - // Filter for a dropped partition spec field. Correct behavior is that only old partitions are returned. - filter = Expressions.and( - Expressions.equal("partition.data_bucket", 0), - Expressions.greaterThan("record_count", 0)); + // Filter for a dropped partition spec field. Correct behavior is that only old partitions are + // returned. + filter = + Expressions.and( + Expressions.equal("partition.data_bucket", 0), + Expressions.greaterThan("record_count", 0)); scan = metadataTable.newScan().filter(filter); tasks = PartitionsTable.planFiles((StaticTableScan) scan); @@ -686,14 +646,19 @@ public void testPartitionSpecEvolutionRemoval() { // 1 original data file written by old spec Assert.assertEquals(1, Iterables.size(tasks)); } else { - // 1 original data/delete files written by old spec, plus both of new data file/delete file written by new spec + // 1 original data/delete files written by old spec, plus both of new data file/delete file + // written by new spec // - // Unlike in V1, V2 does not write (data=null) on newer files' partition data, so these cannot be filtered out + // Unlike in V1, V2 does not write (data=null) on newer files' partition data, so these cannot + // be filtered out // early in scan planning here. // - // However, these partition rows are filtered out later in Spark data filtering, as the newer partitions - // will have 'data=null' field added as part of normalization to the Partitions table final schema. - // The Partitions table final schema is a union of fields of all specs, including dropped fields. + // However, these partition rows are filtered out later in Spark data filtering, as the newer + // partitions + // will have 'data=null' field added as part of normalization to the Partitions table final + // schema. + // The Partitions table final schema is a union of fields of all specs, including dropped + // fields. 
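      // A minimal sketch (assumption, not part of this diff) of the union-of-specs partition
      // type described in the comment above: after removing bucket("data", 16) and adding "id",
      // the Partitions table exposes both fields, and rows written under the new spec surface
      // with data_bucket=null. Field ids below are illustrative only.
      Types.StructType unionPartitionType =
          Types.StructType.of(
              Types.NestedField.optional(1000, "data_bucket", Types.IntegerType.get()),
              Types.NestedField.optional(1001, "id", Types.IntegerType.get()));
      // A filter such as Expressions.equal("partition.data_bucket", 0) can therefore only be
      // applied against this normalized row shape, after manifest-level scan planning.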
Assert.assertEquals(3, Iterables.size(tasks)); } } @@ -704,93 +669,95 @@ public void testPartitionColumnNamedPartition() throws Exception { this.tableDir = temp.newFolder(); tableDir.delete(); - Schema schema = new Schema( - required(1, "id", Types.IntegerType.get()), - required(2, "partition", Types.IntegerType.get()) - ); + Schema schema = + new Schema( + required(1, "id", Types.IntegerType.get()), + required(2, "partition", Types.IntegerType.get())); this.metadataDir = new File(tableDir, "metadata"); - PartitionSpec spec = PartitionSpec.builderFor(schema) - .identity("partition") - .build(); - - DataFile par0 = DataFiles.builder(spec) - .withPath("/path/to/data-0.parquet") - .withFileSizeInBytes(10) - .withPartition(TestHelpers.Row.of(0)) - .withRecordCount(1) - .build(); - DataFile par1 = DataFiles.builder(spec) - .withPath("/path/to/data-0.parquet") - .withFileSizeInBytes(10) - .withPartition(TestHelpers.Row.of(1)) - .withRecordCount(1) - .build(); - DataFile par2 = DataFiles.builder(spec) - .withPath("/path/to/data-0.parquet") - .withFileSizeInBytes(10) - .withPartition(TestHelpers.Row.of(2)) - .withRecordCount(1) - .build(); + PartitionSpec spec = PartitionSpec.builderFor(schema).identity("partition").build(); + + DataFile par0 = + DataFiles.builder(spec) + .withPath("/path/to/data-0.parquet") + .withFileSizeInBytes(10) + .withPartition(TestHelpers.Row.of(0)) + .withRecordCount(1) + .build(); + DataFile par1 = + DataFiles.builder(spec) + .withPath("/path/to/data-0.parquet") + .withFileSizeInBytes(10) + .withPartition(TestHelpers.Row.of(1)) + .withRecordCount(1) + .build(); + DataFile par2 = + DataFiles.builder(spec) + .withPath("/path/to/data-0.parquet") + .withFileSizeInBytes(10) + .withPartition(TestHelpers.Row.of(2)) + .withRecordCount(1) + .build(); this.table = create(schema, spec); - table.newFastAppend() - .appendFile(par0) - .commit(); - table.newFastAppend() - .appendFile(par1) - .commit(); - table.newFastAppend() - .appendFile(par2) - .commit(); + table.newFastAppend().appendFile(par0).commit(); + table.newFastAppend().appendFile(par1).commit(); + table.newFastAppend().appendFile(par2).commit(); Table partitionsTable = new PartitionsTable(table.ops(), table); - Expression andEquals = Expressions.and( - Expressions.equal("partition.partition", 0), - Expressions.greaterThan("record_count", 0)); + Expression andEquals = + Expressions.and( + Expressions.equal("partition.partition", 0), + Expressions.greaterThan("record_count", 0)); TableScan scanAndEq = partitionsTable.newScan().filter(andEquals); - CloseableIterable tasksAndEq = PartitionsTable.planFiles((StaticTableScan) scanAndEq); + CloseableIterable tasksAndEq = + PartitionsTable.planFiles((StaticTableScan) scanAndEq); Assert.assertEquals(1, Iterators.size(tasksAndEq.iterator())); validateIncludesPartitionScan(tasksAndEq, 0); } - @Test public void testAllDataFilesTableScanWithPlanExecutor() throws IOException { - table.newFastAppend() - .appendFile(FILE_A) - .appendFile(FILE_B) - .commit(); + table.newFastAppend().appendFile(FILE_A).appendFile(FILE_B).commit(); Table allDataFilesTable = new AllDataFilesTable(table.ops(), table); AtomicInteger planThreadsIndex = new AtomicInteger(0); - TableScan scan = allDataFilesTable.newScan() - .planWith(Executors.newFixedThreadPool(1, runnable -> { - Thread thread = new Thread(runnable); - thread.setName("plan-" + planThreadsIndex.getAndIncrement()); - thread.setDaemon(true); // daemon threads will be terminated abruptly when the JVM exits - return thread; - })); + TableScan scan = + 
allDataFilesTable + .newScan() + .planWith( + Executors.newFixedThreadPool( + 1, + runnable -> { + Thread thread = new Thread(runnable); + thread.setName("plan-" + planThreadsIndex.getAndIncrement()); + thread.setDaemon( + true); // daemon threads will be terminated abruptly when the JVM exits + return thread; + })); Assert.assertEquals(1, Iterables.size(scan.planFiles())); Assert.assertTrue("Thread should be created in provided pool", planThreadsIndex.get() > 0); } @Test public void testAllEntriesTableScanWithPlanExecutor() throws IOException { - table.newFastAppend() - .appendFile(FILE_A) - .appendFile(FILE_B) - .commit(); + table.newFastAppend().appendFile(FILE_A).appendFile(FILE_B).commit(); Table allEntriesTable = new AllEntriesTable(table.ops(), table); AtomicInteger planThreadsIndex = new AtomicInteger(0); - TableScan scan = allEntriesTable.newScan() - .planWith(Executors.newFixedThreadPool(1, runnable -> { - Thread thread = new Thread(runnable); - thread.setName("plan-" + planThreadsIndex.getAndIncrement()); - thread.setDaemon(true); // daemon threads will be terminated abruptly when the JVM exits - return thread; - })); + TableScan scan = + allEntriesTable + .newScan() + .planWith( + Executors.newFixedThreadPool( + 1, + runnable -> { + Thread thread = new Thread(runnable); + thread.setName("plan-" + planThreadsIndex.getAndIncrement()); + thread.setDaemon( + true); // daemon threads will be terminated abruptly when the JVM exits + return thread; + })); Assert.assertEquals(1, Iterables.size(scan.planFiles())); Assert.assertTrue("Thread should be created in provided pool", planThreadsIndex.get() > 0); } @@ -801,13 +768,19 @@ public void testPartitionsTableScanWithPlanExecutor() { Table partitionsTable = new PartitionsTable(table.ops(), table); AtomicInteger planThreadsIndex = new AtomicInteger(0); - TableScan scan = partitionsTable.newScan() - .planWith(Executors.newFixedThreadPool(1, runnable -> { - Thread thread = new Thread(runnable); - thread.setName("plan-" + planThreadsIndex.getAndIncrement()); - thread.setDaemon(true); // daemon threads will be terminated abruptly when the JVM exits - return thread; - })); + TableScan scan = + partitionsTable + .newScan() + .planWith( + Executors.newFixedThreadPool( + 1, + runnable -> { + Thread thread = new Thread(runnable); + thread.setName("plan-" + planThreadsIndex.getAndIncrement()); + thread.setDaemon( + true); // daemon threads will be terminated abruptly when the JVM exits + return thread; + })); CloseableIterable tasks = PartitionsTable.planFiles((StaticTableScan) scan); Assert.assertEquals(4, Iterables.size(tasks)); Assert.assertTrue("Thread should be created in provided pool", planThreadsIndex.get() > 0); @@ -819,10 +792,11 @@ public void testAllManifestsTableSnapshotGt() { preparePartitionedTableData(); Table manifestsTable = new AllManifestsTable(table.ops(), table); - TableScan manifestsTableScan = manifestsTable.newScan() - .filter(Expressions.greaterThan("reference_snapshot_id", 2)); + TableScan manifestsTableScan = + manifestsTable.newScan().filter(Expressions.greaterThan("reference_snapshot_id", 2)); - Assert.assertEquals("Expected snapshots do not match", + Assert.assertEquals( + "Expected snapshots do not match", expectedManifestListPaths(table.snapshots(), 3L, 4L), actualManifestListPaths(manifestsTableScan)); } @@ -833,10 +807,11 @@ public void testAllManifestsTableSnapshotGte() { preparePartitionedTableData(); Table manifestsTable = new AllManifestsTable(table.ops(), table); - TableScan manifestsTableScan = 
manifestsTable.newScan() - .filter(Expressions.greaterThanOrEqual("reference_snapshot_id", 3)); + TableScan manifestsTableScan = + manifestsTable.newScan().filter(Expressions.greaterThanOrEqual("reference_snapshot_id", 3)); - Assert.assertEquals("Expected snapshots do not match", + Assert.assertEquals( + "Expected snapshots do not match", expectedManifestListPaths(table.snapshots(), 3L, 4L), actualManifestListPaths(manifestsTableScan)); } @@ -847,10 +822,11 @@ public void testAllManifestsTableSnapshotLt() { preparePartitionedTableData(); Table manifestsTable = new AllManifestsTable(table.ops(), table); - TableScan manifestsTableScan = manifestsTable.newScan() - .filter(Expressions.lessThan("reference_snapshot_id", 3)); + TableScan manifestsTableScan = + manifestsTable.newScan().filter(Expressions.lessThan("reference_snapshot_id", 3)); - Assert.assertEquals("Expected snapshots do not match", + Assert.assertEquals( + "Expected snapshots do not match", expectedManifestListPaths(table.snapshots(), 1L, 2L), actualManifestListPaths(manifestsTableScan)); } @@ -861,10 +837,11 @@ public void testAllManifestsTableSnapshotLte() { preparePartitionedTableData(); Table manifestsTable = new AllManifestsTable(table.ops(), table); - TableScan manifestsTableScan = manifestsTable.newScan() - .filter(Expressions.lessThanOrEqual("reference_snapshot_id", 2)); + TableScan manifestsTableScan = + manifestsTable.newScan().filter(Expressions.lessThanOrEqual("reference_snapshot_id", 2)); - Assert.assertEquals("Expected snapshots do not match", + Assert.assertEquals( + "Expected snapshots do not match", expectedManifestListPaths(table.snapshots(), 1L, 2L), actualManifestListPaths(manifestsTableScan)); } @@ -875,10 +852,11 @@ public void testAllManifestsTableSnapshotEq() { preparePartitionedTableData(); Table manifestsTable = new AllManifestsTable(table.ops(), table); - TableScan manifestsTableScan = manifestsTable.newScan() - .filter(Expressions.equal("reference_snapshot_id", 2)); + TableScan manifestsTableScan = + manifestsTable.newScan().filter(Expressions.equal("reference_snapshot_id", 2)); - Assert.assertEquals("Expected snapshots do not match", + Assert.assertEquals( + "Expected snapshots do not match", expectedManifestListPaths(table.snapshots(), 2L), actualManifestListPaths(manifestsTableScan)); } @@ -889,12 +867,13 @@ public void testAllManifestsTableSnapshotNotEq() { preparePartitionedTableData(); Table manifestsTable = new AllManifestsTable(table.ops(), table); - TableScan manifestsTableScan = manifestsTable.newScan() - .filter(Expressions.notEqual("reference_snapshot_id", 2)); + TableScan manifestsTableScan = + manifestsTable.newScan().filter(Expressions.notEqual("reference_snapshot_id", 2)); - Assert.assertEquals("Expected snapshots do not match", - expectedManifestListPaths(table.snapshots(), 1L, 3L, 4L), - actualManifestListPaths(manifestsTableScan)); + Assert.assertEquals( + "Expected snapshots do not match", + expectedManifestListPaths(table.snapshots(), 1L, 3L, 4L), + actualManifestListPaths(manifestsTableScan)); } @Test @@ -904,9 +883,10 @@ public void testAllManifestsTableSnapshotIn() { Table manifestsTable = new AllManifestsTable(table.ops(), table); - TableScan manifestsTableScan = manifestsTable.newScan() - .filter(Expressions.in("reference_snapshot_id", 1, 3)); - Assert.assertEquals("Expected snapshots do not match", + TableScan manifestsTableScan = + manifestsTable.newScan().filter(Expressions.in("reference_snapshot_id", 1, 3)); + Assert.assertEquals( + "Expected snapshots do not match", 
expectedManifestListPaths(table.snapshots(), 1L, 3L), actualManifestListPaths(manifestsTableScan)); } @@ -917,10 +897,11 @@ public void testAllManifestsTableSnapshotNotIn() { preparePartitionedTableData(); Table manifestsTable = new AllManifestsTable(table.ops(), table); - TableScan manifestsTableScan = manifestsTable.newScan() - .filter(Expressions.notIn("reference_snapshot_id", 1, 3)); + TableScan manifestsTableScan = + manifestsTable.newScan().filter(Expressions.notIn("reference_snapshot_id", 1, 3)); - Assert.assertEquals("Expected snapshots do not match", + Assert.assertEquals( + "Expected snapshots do not match", expectedManifestListPaths(table.snapshots(), 2L, 4L), actualManifestListPaths(manifestsTableScan)); } @@ -932,13 +913,17 @@ public void testAllManifestsTableSnapshotAnd() { Table manifestsTable = new AllManifestsTable(table.ops(), table); - TableScan manifestsTableScan = manifestsTable.newScan() - .filter(Expressions.and( - Expressions.equal("reference_snapshot_id", 2), - Expressions.greaterThan("length", 0))); - Assert.assertEquals("Expected snapshots do not match", - expectedManifestListPaths(table.snapshots(), 2L), - actualManifestListPaths(manifestsTableScan)); + TableScan manifestsTableScan = + manifestsTable + .newScan() + .filter( + Expressions.and( + Expressions.equal("reference_snapshot_id", 2), + Expressions.greaterThan("length", 0))); + Assert.assertEquals( + "Expected snapshots do not match", + expectedManifestListPaths(table.snapshots(), 2L), + actualManifestListPaths(manifestsTableScan)); } @Test @@ -948,11 +933,15 @@ public void testAllManifestsTableSnapshotOr() { Table manifestsTable = new AllManifestsTable(table.ops(), table); - TableScan manifestsTableScan = manifestsTable.newScan() - .filter(Expressions.or( - Expressions.equal("reference_snapshot_id", 2), - Expressions.equal("reference_snapshot_id", 4))); - Assert.assertEquals("Expected snapshots do not match", + TableScan manifestsTableScan = + manifestsTable + .newScan() + .filter( + Expressions.or( + Expressions.equal("reference_snapshot_id", 2), + Expressions.equal("reference_snapshot_id", 4))); + Assert.assertEquals( + "Expected snapshots do not match", expectedManifestListPaths(table.snapshots(), 2L, 4L), actualManifestListPaths(manifestsTableScan)); } @@ -963,11 +952,13 @@ public void testAllManifestsTableSnapshotNot() { preparePartitionedTableData(); Table manifestsTable = new AllManifestsTable(table.ops(), table); - TableScan manifestsTableScan = manifestsTable.newScan() - .filter(Expressions.not( - Expressions.equal("reference_snapshot_id", 2))); + TableScan manifestsTableScan = + manifestsTable + .newScan() + .filter(Expressions.not(Expressions.equal("reference_snapshot_id", 2))); - Assert.assertEquals("Expected snapshots do not match", + Assert.assertEquals( + "Expected snapshots do not match", expectedManifestListPaths(table.snapshots(), 1L, 3L, 4L), actualManifestListPaths(manifestsTableScan)); } @@ -987,15 +978,18 @@ private Set expectedManifestListPaths(Iterable snapshots, Long .collect(Collectors.toSet()); } - private void validateTaskScanResiduals(TableScan scan, boolean ignoreResiduals) throws IOException { + private void validateTaskScanResiduals(TableScan scan, boolean ignoreResiduals) + throws IOException { try (CloseableIterable tasks = scan.planTasks()) { Assert.assertTrue("Tasks should not be empty", Iterables.size(tasks) > 0); for (CombinedScanTask combinedScanTask : tasks) { for (FileScanTask fileScanTask : combinedScanTask.files()) { if (ignoreResiduals) { - 
Assert.assertEquals("Residuals must be ignored", Expressions.alwaysTrue(), fileScanTask.residual()); + Assert.assertEquals( + "Residuals must be ignored", Expressions.alwaysTrue(), fileScanTask.residual()); } else { - Assert.assertNotEquals("Residuals must be preserved", Expressions.alwaysTrue(), fileScanTask.residual()); + Assert.assertNotEquals( + "Residuals must be preserved", Expressions.alwaysTrue(), fileScanTask.residual()); } } } @@ -1003,14 +997,17 @@ private void validateTaskScanResiduals(TableScan scan, boolean ignoreResiduals) } private void validateIncludesPartitionScan(CloseableIterable tasks, int partValue) { - Assert.assertTrue("File scan tasks do not include correct file", - StreamSupport.stream(tasks.spliterator(), false).anyMatch( - a -> a.file().partition().get(0, Object.class).equals(partValue))); + Assert.assertTrue( + "File scan tasks do not include correct file", + StreamSupport.stream(tasks.spliterator(), false) + .anyMatch(a -> a.file().partition().get(0, Object.class).equals(partValue))); } private boolean manifestHasPartition(ManifestFile mf, int partValue) { - int lower = Conversions.fromByteBuffer(Types.IntegerType.get(), mf.partitions().get(0).lowerBound()); - int upper = Conversions.fromByteBuffer(Types.IntegerType.get(), mf.partitions().get(0).upperBound()); + int lower = + Conversions.fromByteBuffer(Types.IntegerType.get(), mf.partitions().get(0).lowerBound()); + int upper = + Conversions.fromByteBuffer(Types.IntegerType.get(), mf.partitions().get(0).upperBound()); return (lower <= partValue) && (upper >= partValue); } } diff --git a/core/src/test/java/org/apache/iceberg/TestMetadataUpdateParser.java b/core/src/test/java/org/apache/iceberg/TestMetadataUpdateParser.java index e60b8131f507..f215b04a8af3 100644 --- a/core/src/test/java/org/apache/iceberg/TestMetadataUpdateParser.java +++ b/core/src/test/java/org/apache/iceberg/TestMetadataUpdateParser.java @@ -16,9 +16,10 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg; +import static org.apache.iceberg.Files.localInput; + import java.util.List; import java.util.Map; import java.util.Objects; @@ -33,20 +34,17 @@ import org.junit.Assert; import org.junit.Test; -import static org.apache.iceberg.Files.localInput; - public class TestMetadataUpdateParser { - private static final Schema ID_DATA_SCHEMA = new Schema( - Types.NestedField.required(1, "id", Types.IntegerType.get()), - Types.NestedField.optional(2, "data", Types.StringType.get())); + private static final Schema ID_DATA_SCHEMA = + new Schema( + Types.NestedField.required(1, "id", Types.IntegerType.get()), + Types.NestedField.optional(2, "data", Types.StringType.get())); @Test public void testMetadataUpdateWithoutActionCannotDeserialize() { - List invalidJson = ImmutableList.of( - "{\"action\":null,\"format-version\":2}", - "{\"format-version\":2}" - ); + List invalidJson = + ImmutableList.of("{\"action\":null,\"format-version\":2}", "{\"format-version\":2}"); for (String json : invalidJson) { AssertHelpers.assertThrows( @@ -57,8 +55,7 @@ public void testMetadataUpdateWithoutActionCannotDeserialize() { } } - /** AssignUUID **/ - + /** AssignUUID * */ @Test public void testAssignUUIDToJson() { String action = MetadataUpdateParser.ASSIGN_UUID; @@ -71,20 +68,23 @@ public void testAssignUUIDToJson() { @Test public void testAssignUUIDFromJson() { String uuid = "9510c070-5e6d-4b40-bf40-a8915bb76e5d"; - String expected = "{\"action\":\"assign-uuid\",\"uuid\":\"9510c070-5e6d-4b40-bf40-a8915bb76e5d\"}"; + String expected = + "{\"action\":\"assign-uuid\",\"uuid\":\"9510c070-5e6d-4b40-bf40-a8915bb76e5d\"}"; MetadataUpdate actual = new MetadataUpdate.AssignUUID(uuid); - Assert.assertEquals("Assign UUID should convert to the correct JSON value", - expected, MetadataUpdateParser.toJson(actual)); + Assert.assertEquals( + "Assign UUID should convert to the correct JSON value", + expected, + MetadataUpdateParser.toJson(actual)); } - /** UpgradeFormatVersion **/ - + /** UpgradeFormatVersion * */ @Test public void testUpgradeFormatVersionToJson() { int formatVersion = 2; String action = MetadataUpdateParser.UPGRADE_FORMAT_VERSION; String json = "{\"action\":\"upgrade-format-version\",\"format-version\":2}"; - MetadataUpdate.UpgradeFormatVersion expected = new MetadataUpdate.UpgradeFormatVersion(formatVersion); + MetadataUpdate.UpgradeFormatVersion expected = + new MetadataUpdate.UpgradeFormatVersion(formatVersion); assertEquals(action, expected, MetadataUpdateParser.fromJson(json)); } @@ -92,20 +92,24 @@ public void testUpgradeFormatVersionToJson() { public void testUpgradeFormatVersionFromJson() { int formatVersion = 2; String expected = "{\"action\":\"upgrade-format-version\",\"format-version\":2}"; - MetadataUpdate.UpgradeFormatVersion actual = new MetadataUpdate.UpgradeFormatVersion(formatVersion); - Assert.assertEquals("Upgrade format version should convert to the correct JSON value", - expected, MetadataUpdateParser.toJson(actual)); + MetadataUpdate.UpgradeFormatVersion actual = + new MetadataUpdate.UpgradeFormatVersion(formatVersion); + Assert.assertEquals( + "Upgrade format version should convert to the correct JSON value", + expected, + MetadataUpdateParser.toJson(actual)); } - /** AddSchema **/ - + /** AddSchema * */ @Test public void testAddSchemaFromJson() { String action = MetadataUpdateParser.ADD_SCHEMA; Schema schema = ID_DATA_SCHEMA; int lastColumnId = schema.highestFieldId(); - String json = String.format("{\"action\":\"add-schema\",\"schema\":%s,\"last-column-id\":%d}", 
- SchemaParser.toJson(schema), lastColumnId); + String json = + String.format( + "{\"action\":\"add-schema\",\"schema\":%s,\"last-column-id\":%d}", + SchemaParser.toJson(schema), lastColumnId); MetadataUpdate actualUpdate = new MetadataUpdate.AddSchema(schema, lastColumnId); assertEquals(action, actualUpdate, MetadataUpdateParser.fromJson(json)); } @@ -114,15 +118,16 @@ public void testAddSchemaFromJson() { public void testAddSchemaToJson() { Schema schema = ID_DATA_SCHEMA; int lastColumnId = schema.highestFieldId(); - String expected = String.format("{\"action\":\"add-schema\",\"schema\":%s,\"last-column-id\":%d}", - SchemaParser.toJson(schema), lastColumnId); + String expected = + String.format( + "{\"action\":\"add-schema\",\"schema\":%s,\"last-column-id\":%d}", + SchemaParser.toJson(schema), lastColumnId); MetadataUpdate update = new MetadataUpdate.AddSchema(schema, lastColumnId); String actual = MetadataUpdateParser.toJson(update); Assert.assertEquals("Add schema should convert to the correct JSON value", expected, actual); } - /** SetCurrentSchema **/ - + /** SetCurrentSchema * */ @Test public void testSetCurrentSchemaFromJson() { String action = MetadataUpdateParser.SET_CURRENT_SCHEMA; @@ -139,67 +144,79 @@ public void testSetCurrentSchemaToJson() { String expected = String.format("{\"action\":\"%s\",\"schema-id\":%d}", action, schemaId); MetadataUpdate update = new MetadataUpdate.SetCurrentSchema(schemaId); String actual = MetadataUpdateParser.toJson(update); - Assert.assertEquals("Set current schema should convert to the correct JSON value", expected, actual); + Assert.assertEquals( + "Set current schema should convert to the correct JSON value", expected, actual); } - /** AddPartitionSpec **/ - + /** AddPartitionSpec * */ @Test public void testAddPartitionSpecFromJsonWithFieldId() { String action = MetadataUpdateParser.ADD_PARTITION_SPEC; - String specString = "{" + - "\"spec-id\":1," + - "\"fields\":[{" + - "\"name\":\"id_bucket\"," + - "\"transform\":\"bucket[8]\"," + - "\"source-id\":1," + - "\"field-id\":1000" + - "},{" + - "\"name\":\"data_bucket\"," + - "\"transform\":\"bucket[16]\"," + - "\"source-id\":2," + - "\"field-id\":1001" + - "}]" + - "}"; - - UnboundPartitionSpec actualSpec = PartitionSpecParser.fromJson(ID_DATA_SCHEMA, specString).toUnbound(); - String json = String.format("{\"action\":\"%s\",\"spec\":%s}", action, PartitionSpecParser.toJson(actualSpec)); - - // Partition spec order declaration needs to match declaration in spec string to be assigned the same field ids. - PartitionSpec expectedSpec = PartitionSpec.builderFor(ID_DATA_SCHEMA) - .bucket("id", 8) - .bucket("data", 16) - .withSpecId(1) - .build(); + String specString = + "{" + + "\"spec-id\":1," + + "\"fields\":[{" + + "\"name\":\"id_bucket\"," + + "\"transform\":\"bucket[8]\"," + + "\"source-id\":1," + + "\"field-id\":1000" + + "},{" + + "\"name\":\"data_bucket\"," + + "\"transform\":\"bucket[16]\"," + + "\"source-id\":2," + + "\"field-id\":1001" + + "}]" + + "}"; + + UnboundPartitionSpec actualSpec = + PartitionSpecParser.fromJson(ID_DATA_SCHEMA, specString).toUnbound(); + String json = + String.format( + "{\"action\":\"%s\",\"spec\":%s}", action, PartitionSpecParser.toJson(actualSpec)); + + // Partition spec order declaration needs to match declaration in spec string to be assigned the + // same field ids. 
+ PartitionSpec expectedSpec = + PartitionSpec.builderFor(ID_DATA_SCHEMA) + .bucket("id", 8) + .bucket("data", 16) + .withSpecId(1) + .build(); MetadataUpdate expected = new MetadataUpdate.AddPartitionSpec(expectedSpec); assertEquals(action, expected, MetadataUpdateParser.fromJson(json)); } @Test public void testAddPartitionSpecFromJsonWithoutFieldId() { - // partition field ids are missing in old PartitionSpec, they always auto-increment from 1000 in declared order + // partition field ids are missing in old PartitionSpec, they always auto-increment from 1000 in + // declared order String action = MetadataUpdateParser.ADD_PARTITION_SPEC; - String specString = "{" + - "\"spec-id\":1," + - "\"fields\":[{" + - "\"name\":\"id_bucket\"," + - "\"transform\":\"bucket[8]\"," + - "\"source-id\":1" + - "},{" + - "\"name\": \"data_bucket\"," + - "\"transform\":\"bucket[16]\"," + - "\"source-id\":2" + - "}]" + - "}"; - - UnboundPartitionSpec actualSpec = PartitionSpecParser.fromJson(ID_DATA_SCHEMA, specString).toUnbound(); - String json = String.format("{\"action\":\"%s\",\"spec\":%s}", action, PartitionSpecParser.toJson(actualSpec)); - - PartitionSpec expectedSpec = PartitionSpec.builderFor(ID_DATA_SCHEMA) - .bucket("id", 8) - .bucket("data", 16) - .withSpecId(1) - .build(); + String specString = + "{" + + "\"spec-id\":1," + + "\"fields\":[{" + + "\"name\":\"id_bucket\"," + + "\"transform\":\"bucket[8]\"," + + "\"source-id\":1" + + "},{" + + "\"name\": \"data_bucket\"," + + "\"transform\":\"bucket[16]\"," + + "\"source-id\":2" + + "}]" + + "}"; + + UnboundPartitionSpec actualSpec = + PartitionSpecParser.fromJson(ID_DATA_SCHEMA, specString).toUnbound(); + String json = + String.format( + "{\"action\":\"%s\",\"spec\":%s}", action, PartitionSpecParser.toJson(actualSpec)); + + PartitionSpec expectedSpec = + PartitionSpec.builderFor(ID_DATA_SCHEMA) + .bucket("id", 8) + .bucket("data", 16) + .withSpecId(1) + .build(); MetadataUpdate expected = new MetadataUpdate.AddPartitionSpec(expectedSpec.toUnbound()); assertEquals(action, expected, MetadataUpdateParser.fromJson(json)); } @@ -207,37 +224,43 @@ public void testAddPartitionSpecFromJsonWithoutFieldId() { @Test public void testAddPartitionSpecToJson() { String action = MetadataUpdateParser.ADD_PARTITION_SPEC; - String specString = "{" + - "\"spec-id\":1," + - "\"fields\":[{" + - "\"name\":\"id_bucket\"," + - "\"transform\":\"bucket[8]\"," + - "\"source-id\":1," + - "\"field-id\":1000" + - "},{" + - "\"name\":\"data_bucket\"," + - "\"transform\":\"bucket[16]\"," + - "\"source-id\":2," + - "\"field-id\":1001" + - "}]" + - "}"; - - UnboundPartitionSpec actualSpec = PartitionSpecParser.fromJson(ID_DATA_SCHEMA, specString).toUnbound(); - String expected = String.format("{\"action\":\"%s\",\"spec\":%s}", action, PartitionSpecParser.toJson(actualSpec)); - - // Partition spec order declaration needs to match declaration in spec string to be assigned the same field ids. 
- PartitionSpec expectedSpec = PartitionSpec.builderFor(ID_DATA_SCHEMA) - .bucket("id", 8) - .bucket("data", 16) - .withSpecId(1) - .build(); + String specString = + "{" + + "\"spec-id\":1," + + "\"fields\":[{" + + "\"name\":\"id_bucket\"," + + "\"transform\":\"bucket[8]\"," + + "\"source-id\":1," + + "\"field-id\":1000" + + "},{" + + "\"name\":\"data_bucket\"," + + "\"transform\":\"bucket[16]\"," + + "\"source-id\":2," + + "\"field-id\":1001" + + "}]" + + "}"; + + UnboundPartitionSpec actualSpec = + PartitionSpecParser.fromJson(ID_DATA_SCHEMA, specString).toUnbound(); + String expected = + String.format( + "{\"action\":\"%s\",\"spec\":%s}", action, PartitionSpecParser.toJson(actualSpec)); + + // Partition spec order declaration needs to match declaration in spec string to be assigned the + // same field ids. + PartitionSpec expectedSpec = + PartitionSpec.builderFor(ID_DATA_SCHEMA) + .bucket("id", 8) + .bucket("data", 16) + .withSpecId(1) + .build(); MetadataUpdate update = new MetadataUpdate.AddPartitionSpec(expectedSpec); String actual = MetadataUpdateParser.toJson(update); - Assert.assertEquals("Add partition spec should convert to the correct JSON value", expected, actual); + Assert.assertEquals( + "Add partition spec should convert to the correct JSON value", expected, actual); } - /** SetDefaultPartitionSpec **/ - + /** SetDefaultPartitionSpec * */ @Test public void testSetDefaultPartitionSpecToJson() { String action = MetadataUpdateParser.SET_DEFAULT_PARTITION_SPEC; @@ -245,7 +268,8 @@ public void testSetDefaultPartitionSpecToJson() { String expected = String.format("{\"action\":\"%s\",\"spec-id\":%d}", action, specId); MetadataUpdate update = new MetadataUpdate.SetDefaultPartitionSpec(specId); String actual = MetadataUpdateParser.toJson(update); - Assert.assertEquals("Set default partition spec should serialize to the correct JSON value", expected, actual); + Assert.assertEquals( + "Set default partition spec should serialize to the correct JSON value", expected, actual); } @Test @@ -253,53 +277,62 @@ public void testSetDefaultPartitionSpecFromJson() { String action = MetadataUpdateParser.SET_DEFAULT_PARTITION_SPEC; int specId = 4; String json = String.format("{\"action\":\"%s\",\"spec-id\":%d}", action, specId); - MetadataUpdate.SetDefaultPartitionSpec expected = new MetadataUpdate.SetDefaultPartitionSpec(specId); + MetadataUpdate.SetDefaultPartitionSpec expected = + new MetadataUpdate.SetDefaultPartitionSpec(specId); assertEquals(action, expected, MetadataUpdateParser.fromJson(json)); } - /** AddSortOrder **/ - + /** AddSortOrder * */ @Test public void testAddSortOrderToJson() { String action = MetadataUpdateParser.ADD_SORT_ORDER; - UnboundSortOrder sortOrder = SortOrder.builderFor(ID_DATA_SCHEMA) - .withOrderId(3) - .asc("id", NullOrder.NULLS_FIRST) - .desc("data") - .build() - .toUnbound(); - - String expected = String.format("{\"action\":\"%s\",\"sort-order\":%s}", action, SortOrderParser.toJson(sortOrder)); + UnboundSortOrder sortOrder = + SortOrder.builderFor(ID_DATA_SCHEMA) + .withOrderId(3) + .asc("id", NullOrder.NULLS_FIRST) + .desc("data") + .build() + .toUnbound(); + + String expected = + String.format( + "{\"action\":\"%s\",\"sort-order\":%s}", action, SortOrderParser.toJson(sortOrder)); MetadataUpdate update = new MetadataUpdate.AddSortOrder(sortOrder); - Assert.assertEquals("Add sort order should serialize to the correct JSON value", - expected, MetadataUpdateParser.toJson(update)); + Assert.assertEquals( + "Add sort order should serialize to the correct JSON value", 
+ expected, + MetadataUpdateParser.toJson(update)); } @Test public void testAddSortOrderFromJson() { String action = MetadataUpdateParser.ADD_SORT_ORDER; - UnboundSortOrder sortOrder = SortOrder.builderFor(ID_DATA_SCHEMA) - .withOrderId(3) - .asc("id", NullOrder.NULLS_FIRST) - .desc("data") - .build() - .toUnbound(); - - String json = String.format("{\"action\":\"%s\",\"sort-order\":%s}", action, SortOrderParser.toJson(sortOrder)); + UnboundSortOrder sortOrder = + SortOrder.builderFor(ID_DATA_SCHEMA) + .withOrderId(3) + .asc("id", NullOrder.NULLS_FIRST) + .desc("data") + .build() + .toUnbound(); + + String json = + String.format( + "{\"action\":\"%s\",\"sort-order\":%s}", action, SortOrderParser.toJson(sortOrder)); MetadataUpdate.AddSortOrder expected = new MetadataUpdate.AddSortOrder(sortOrder); assertEquals(action, expected, MetadataUpdateParser.fromJson(json)); } - /** SetDefaultSortOrder **/ - + /** SetDefaultSortOrder * */ @Test public void testSetDefaultSortOrderToJson() { String action = MetadataUpdateParser.SET_DEFAULT_SORT_ORDER; int sortOrderId = 2; - String expected = String.format("{\"action\":\"%s\",\"sort-order-id\":%d}", action, sortOrderId); + String expected = + String.format("{\"action\":\"%s\",\"sort-order-id\":%d}", action, sortOrderId); MetadataUpdate update = new MetadataUpdate.SetDefaultSortOrder(sortOrderId); String actual = MetadataUpdateParser.toJson(update); - Assert.assertEquals("Set default sort order should serialize to the correct JSON value", expected, actual); + Assert.assertEquals( + "Set default sort order should serialize to the correct JSON value", expected, actual); } @Test @@ -311,26 +344,34 @@ public void testSetDefaultSortOrderFromJson() { assertEquals(action, expected, MetadataUpdateParser.fromJson(json)); } - /** AddSnapshot **/ - + /** AddSnapshot * */ @Test public void testAddSnapshotToJson() { String action = MetadataUpdateParser.ADD_SNAPSHOT; long parentId = 1; long snapshotId = 2; int schemaId = 3; - List manifests = ImmutableList.of( - new GenericManifestFile(localInput("file:/tmp/manifest1.avro"), 0), - new GenericManifestFile(localInput("file:/tmp/manifest2.avro"), 0)); - - Snapshot snapshot = new BaseSnapshot(null, snapshotId, parentId, System.currentTimeMillis(), - DataOperations.REPLACE, ImmutableMap.of("files-added", "4", "files-deleted", "100"), - schemaId, manifests); + List manifests = + ImmutableList.of( + new GenericManifestFile(localInput("file:/tmp/manifest1.avro"), 0), + new GenericManifestFile(localInput("file:/tmp/manifest2.avro"), 0)); + + Snapshot snapshot = + new BaseSnapshot( + null, + snapshotId, + parentId, + System.currentTimeMillis(), + DataOperations.REPLACE, + ImmutableMap.of("files-added", "4", "files-deleted", "100"), + schemaId, + manifests); String snapshotJson = SnapshotParser.toJson(snapshot, /* pretty */ false); String expected = String.format("{\"action\":\"%s\",\"snapshot\":%s}", action, snapshotJson); MetadataUpdate update = new MetadataUpdate.AddSnapshot(snapshot); String actual = MetadataUpdateParser.toJson(update); - Assert.assertEquals("Add snapshot should serialize to the correct JSON value", expected, actual); + Assert.assertEquals( + "Add snapshot should serialize to the correct JSON value", expected, actual); } @Test @@ -339,20 +380,28 @@ public void testAddSnapshotFromJson() { long parentId = 1; long snapshotId = 2; int schemaId = 3; - List manifests = ImmutableList.of( - new GenericManifestFile(localInput("file:/tmp/manifest1.avro"), 0), - new 
GenericManifestFile(localInput("file:/tmp/manifest2.avro"), 0)); + List manifests = + ImmutableList.of( + new GenericManifestFile(localInput("file:/tmp/manifest1.avro"), 0), + new GenericManifestFile(localInput("file:/tmp/manifest2.avro"), 0)); Map summary = ImmutableMap.of("files-added", "4", "files-deleted", "100"); - Snapshot snapshot = new BaseSnapshot(null, snapshotId, parentId, System.currentTimeMillis(), - DataOperations.REPLACE, summary, schemaId, manifests); + Snapshot snapshot = + new BaseSnapshot( + null, + snapshotId, + parentId, + System.currentTimeMillis(), + DataOperations.REPLACE, + summary, + schemaId, + manifests); String snapshotJson = SnapshotParser.toJson(snapshot, /* pretty */ false); String json = String.format("{\"action\":\"%s\",\"snapshot\":%s}", action, snapshotJson); MetadataUpdate expected = new MetadataUpdate.AddSnapshot(snapshot); assertEquals(action, expected, MetadataUpdateParser.fromJson(json)); } - /** RemoveSnapshots **/ - + /** RemoveSnapshots * */ @Test public void testRemoveSnapshotsFromJson() { String action = MetadataUpdateParser.REMOVE_SNAPSHOTS; @@ -369,11 +418,11 @@ public void testRemoveSnapshotsToJson() { String expected = String.format("{\"action\":\"%s\",\"snapshot-ids\":[2]}", action); MetadataUpdate update = new MetadataUpdate.RemoveSnapshot(snapshotId); String actual = MetadataUpdateParser.toJson(update); - Assert.assertEquals("Remove snapshots should serialize to the correct JSON value", expected, actual); + Assert.assertEquals( + "Remove snapshots should serialize to the correct JSON value", expected, actual); } - /** RemoveSnapshotRef **/ - + /** RemoveSnapshotRef * */ @Test public void testRemoveSnapshotRefFromJson() { String action = MetadataUpdateParser.REMOVE_SNAPSHOT_REF; @@ -388,12 +437,13 @@ public void testRemoveSnapshotRefToJson() { String snapshotRef = "snapshot-ref"; String expected = "{\"action\":\"remove-snapshot-ref\",\"ref-name\":\"snapshot-ref\"}"; MetadataUpdate actual = new MetadataUpdate.RemoveSnapshotRef(snapshotRef); - Assert.assertEquals("RemoveSnapshotRef should convert to the correct JSON value", - expected, MetadataUpdateParser.toJson(actual)); + Assert.assertEquals( + "RemoveSnapshotRef should convert to the correct JSON value", + expected, + MetadataUpdateParser.toJson(actual)); } - /** SetSnapshotRef **/ - + /** SetSnapshotRef * */ @Test public void testSetSnapshotRefTagFromJsonDefault_NullValuesMissing() { String action = MetadataUpdateParser.SET_SNAPSHOT_REF; @@ -403,9 +453,11 @@ public void testSetSnapshotRefTagFromJsonDefault_NullValuesMissing() { Integer minSnapshotsToKeep = null; Long maxSnapshotAgeMs = null; Long maxRefAgeMs = null; - String json = "{\"action\":\"set-snapshot-ref\",\"ref-name\":\"hank\",\"snapshot-id\":1,\"type\":\"tag\"}"; - MetadataUpdate expected = new MetadataUpdate.SetSnapshotRef( - refName, snapshotId, type, minSnapshotsToKeep, maxSnapshotAgeMs, maxRefAgeMs); + String json = + "{\"action\":\"set-snapshot-ref\",\"ref-name\":\"hank\",\"snapshot-id\":1,\"type\":\"tag\"}"; + MetadataUpdate expected = + new MetadataUpdate.SetSnapshotRef( + refName, snapshotId, type, minSnapshotsToKeep, maxSnapshotAgeMs, maxRefAgeMs); assertEquals(action, expected, MetadataUpdateParser.fromJson(json)); } @@ -418,10 +470,12 @@ public void testSetSnapshotRefTagFromJsonDefault_ExplicitNullValues() { Integer minSnapshotsToKeep = null; Long maxSnapshotAgeMs = null; Long maxRefAgeMs = null; - String json = "{\"action\":\"set-snapshot-ref\",\"ref-name\":\"hank\",\"snapshot-id\":1,\"type\":\"tag\"," + - 
"\"min-snapshots-to-keep\":null,\"max-snapshot-age-ms\":null,\"max-ref-age-ms\":null}"; - MetadataUpdate expected = new MetadataUpdate.SetSnapshotRef( - refName, snapshotId, type, minSnapshotsToKeep, maxSnapshotAgeMs, maxRefAgeMs); + String json = + "{\"action\":\"set-snapshot-ref\",\"ref-name\":\"hank\",\"snapshot-id\":1,\"type\":\"tag\"," + + "\"min-snapshots-to-keep\":null,\"max-snapshot-age-ms\":null,\"max-ref-age-ms\":null}"; + MetadataUpdate expected = + new MetadataUpdate.SetSnapshotRef( + refName, snapshotId, type, minSnapshotsToKeep, maxSnapshotAgeMs, maxRefAgeMs); assertEquals(action, expected, MetadataUpdateParser.fromJson(json)); } @@ -434,10 +488,12 @@ public void testSetSnapshotRefTagFromJsonAllFields_NullValuesMissing() { Integer minSnapshotsToKeep = null; Long maxSnapshotAgeMs = null; Long maxRefAgeMs = 1L; - String json = "{\"action\":\"set-snapshot-ref\",\"ref-name\":\"hank\"," + - "\"snapshot-id\":1,\"type\":\"tag\",\"max-ref-age-ms\":1}"; - MetadataUpdate expected = new MetadataUpdate.SetSnapshotRef( - refName, snapshotId, type, minSnapshotsToKeep, maxSnapshotAgeMs, maxRefAgeMs); + String json = + "{\"action\":\"set-snapshot-ref\",\"ref-name\":\"hank\"," + + "\"snapshot-id\":1,\"type\":\"tag\",\"max-ref-age-ms\":1}"; + MetadataUpdate expected = + new MetadataUpdate.SetSnapshotRef( + refName, snapshotId, type, minSnapshotsToKeep, maxSnapshotAgeMs, maxRefAgeMs); assertEquals(action, expected, MetadataUpdateParser.fromJson(json)); } @@ -450,10 +506,12 @@ public void testSetSnapshotRefTagFromJsonAllFields_ExplicitNullValues() { Integer minSnapshotsToKeep = null; Long maxSnapshotAgeMs = null; Long maxRefAgeMs = 1L; - String json = "{\"action\":\"set-snapshot-ref\",\"ref-name\":\"hank\",\"snapshot-id\":1,\"type\":\"tag\"," + - "\"max-ref-age-ms\":1,\"min-snapshots-to-keep\":null,\"max-snapshot-age-ms\":null}"; - MetadataUpdate expected = new MetadataUpdate.SetSnapshotRef( - refName, snapshotId, type, minSnapshotsToKeep, maxSnapshotAgeMs, maxRefAgeMs); + String json = + "{\"action\":\"set-snapshot-ref\",\"ref-name\":\"hank\",\"snapshot-id\":1,\"type\":\"tag\"," + + "\"max-ref-age-ms\":1,\"min-snapshots-to-keep\":null,\"max-snapshot-age-ms\":null}"; + MetadataUpdate expected = + new MetadataUpdate.SetSnapshotRef( + refName, snapshotId, type, minSnapshotsToKeep, maxSnapshotAgeMs, maxRefAgeMs); assertEquals(action, expected, MetadataUpdateParser.fromJson(json)); } @@ -466,9 +524,11 @@ public void testSetSnapshotRefBranchFromJsonDefault_NullValuesMissing() { Integer minSnapshotsToKeep = null; Long maxSnapshotAgeMs = null; Long maxRefAgeMs = null; - String json = "{\"action\":\"set-snapshot-ref\",\"ref-name\":\"hank\",\"snapshot-id\":1,\"type\":\"bRaNch\"}"; - MetadataUpdate expected = new MetadataUpdate.SetSnapshotRef( - refName, snapshotId, type, minSnapshotsToKeep, maxSnapshotAgeMs, maxRefAgeMs); + String json = + "{\"action\":\"set-snapshot-ref\",\"ref-name\":\"hank\",\"snapshot-id\":1,\"type\":\"bRaNch\"}"; + MetadataUpdate expected = + new MetadataUpdate.SetSnapshotRef( + refName, snapshotId, type, minSnapshotsToKeep, maxSnapshotAgeMs, maxRefAgeMs); assertEquals(action, expected, MetadataUpdateParser.fromJson(json)); } @@ -481,10 +541,12 @@ public void testSetSnapshotRefBranchFromJsonDefault_ExplicitNullValues() { Integer minSnapshotsToKeep = null; Long maxSnapshotAgeMs = null; Long maxRefAgeMs = null; - String json = "{\"action\":\"set-snapshot-ref\",\"ref-name\":\"hank\",\"snapshot-id\":1,\"type\":\"bRaNch\"," + - 
"\"max-ref-age-ms\":null,\"min-snapshots-to-keep\":null,\"max-snapshot-age-ms\":null}"; - MetadataUpdate expected = new MetadataUpdate.SetSnapshotRef( - refName, snapshotId, type, minSnapshotsToKeep, maxSnapshotAgeMs, maxRefAgeMs); + String json = + "{\"action\":\"set-snapshot-ref\",\"ref-name\":\"hank\",\"snapshot-id\":1,\"type\":\"bRaNch\"," + + "\"max-ref-age-ms\":null,\"min-snapshots-to-keep\":null,\"max-snapshot-age-ms\":null}"; + MetadataUpdate expected = + new MetadataUpdate.SetSnapshotRef( + refName, snapshotId, type, minSnapshotsToKeep, maxSnapshotAgeMs, maxRefAgeMs); assertEquals(action, expected, MetadataUpdateParser.fromJson(json)); } @@ -497,10 +559,12 @@ public void testBranchFromJsonAllFields() { Integer minSnapshotsToKeep = 2; Long maxSnapshotAgeMs = 3L; Long maxRefAgeMs = 4L; - String json = "{\"action\":\"set-snapshot-ref\",\"ref-name\":\"hank\",\"snapshot-id\":1,\"type\":\"branch\"," + - "\"min-snapshots-to-keep\":2,\"max-snapshot-age-ms\":3,\"max-ref-age-ms\":4}"; - MetadataUpdate expected = new MetadataUpdate.SetSnapshotRef( - refName, snapshotId, type, minSnapshotsToKeep, maxSnapshotAgeMs, maxRefAgeMs); + String json = + "{\"action\":\"set-snapshot-ref\",\"ref-name\":\"hank\",\"snapshot-id\":1,\"type\":\"branch\"," + + "\"min-snapshots-to-keep\":2,\"max-snapshot-age-ms\":3,\"max-ref-age-ms\":4}"; + MetadataUpdate expected = + new MetadataUpdate.SetSnapshotRef( + refName, snapshotId, type, minSnapshotsToKeep, maxSnapshotAgeMs, maxRefAgeMs); assertEquals(action, expected, MetadataUpdateParser.fromJson(json)); } @@ -512,12 +576,16 @@ public void testSetSnapshotRefTagToJsonDefault() { Integer minSnapshotsToKeep = null; Long maxSnapshotAgeMs = null; Long maxRefAgeMs = null; - String expected = "{\"action\":\"set-snapshot-ref\",\"ref-name\":\"hank\",\"snapshot-id\":1,\"type\":\"tag\"}"; - MetadataUpdate update = new MetadataUpdate.SetSnapshotRef( - refName, snapshotId, type, minSnapshotsToKeep, maxSnapshotAgeMs, maxRefAgeMs); + String expected = + "{\"action\":\"set-snapshot-ref\",\"ref-name\":\"hank\",\"snapshot-id\":1,\"type\":\"tag\"}"; + MetadataUpdate update = + new MetadataUpdate.SetSnapshotRef( + refName, snapshotId, type, minSnapshotsToKeep, maxSnapshotAgeMs, maxRefAgeMs); String actual = MetadataUpdateParser.toJson(update); - Assert.assertEquals("Set snapshot ref should serialize to the correct JSON value for tag with default fields", - expected, actual); + Assert.assertEquals( + "Set snapshot ref should serialize to the correct JSON value for tag with default fields", + expected, + actual); } @Test @@ -528,13 +596,17 @@ public void testSetSnapshotRefTagToJsonAllFields() { Integer minSnapshotsToKeep = null; Long maxSnapshotAgeMs = null; Long maxRefAgeMs = 1L; - String expected = "{\"action\":\"set-snapshot-ref\",\"ref-name\":\"hank\"," + - "\"snapshot-id\":1,\"type\":\"tag\",\"max-ref-age-ms\":1}"; - MetadataUpdate update = new MetadataUpdate.SetSnapshotRef( - refName, snapshotId, type, minSnapshotsToKeep, maxSnapshotAgeMs, maxRefAgeMs); + String expected = + "{\"action\":\"set-snapshot-ref\",\"ref-name\":\"hank\"," + + "\"snapshot-id\":1,\"type\":\"tag\",\"max-ref-age-ms\":1}"; + MetadataUpdate update = + new MetadataUpdate.SetSnapshotRef( + refName, snapshotId, type, minSnapshotsToKeep, maxSnapshotAgeMs, maxRefAgeMs); String actual = MetadataUpdateParser.toJson(update); - Assert.assertEquals("Set snapshot ref should serialize to the correct JSON value for tag with all fields", - expected, actual); + Assert.assertEquals( + "Set snapshot ref should serialize to the 
correct JSON value for tag with all fields", + expected, + actual); } @Test @@ -547,11 +619,14 @@ public void testSetSnapshotRefBranchToJsonDefault() { Long maxRefAgeMs = null; String expected = "{\"action\":\"set-snapshot-ref\",\"ref-name\":\"hank\",\"snapshot-id\":1,\"type\":\"branch\"}"; - MetadataUpdate update = new MetadataUpdate.SetSnapshotRef( - refName, snapshotId, type, minSnapshotsToKeep, maxSnapshotAgeMs, maxRefAgeMs); + MetadataUpdate update = + new MetadataUpdate.SetSnapshotRef( + refName, snapshotId, type, minSnapshotsToKeep, maxSnapshotAgeMs, maxRefAgeMs); String actual = MetadataUpdateParser.toJson(update); - Assert.assertEquals("Set snapshot ref should serialize to the correct JSON value for branch with default fields", - expected, actual); + Assert.assertEquals( + "Set snapshot ref should serialize to the correct JSON value for branch with default fields", + expected, + actual); } @Test @@ -562,24 +637,27 @@ public void testSetSnapshotRefBranchToJsonAllFields() { Integer minSnapshotsToKeep = 2; Long maxSnapshotAgeMs = 3L; Long maxRefAgeMs = 4L; - String expected = "{\"action\":\"set-snapshot-ref\",\"ref-name\":\"hank\",\"snapshot-id\":1,\"type\":\"branch\"," + - "\"min-snapshots-to-keep\":2,\"max-snapshot-age-ms\":3,\"max-ref-age-ms\":4}"; - MetadataUpdate update = new MetadataUpdate.SetSnapshotRef( - refName, snapshotId, type, minSnapshotsToKeep, maxSnapshotAgeMs, maxRefAgeMs); + String expected = + "{\"action\":\"set-snapshot-ref\",\"ref-name\":\"hank\",\"snapshot-id\":1,\"type\":\"branch\"," + + "\"min-snapshots-to-keep\":2,\"max-snapshot-age-ms\":3,\"max-ref-age-ms\":4}"; + MetadataUpdate update = + new MetadataUpdate.SetSnapshotRef( + refName, snapshotId, type, minSnapshotsToKeep, maxSnapshotAgeMs, maxRefAgeMs); String actual = MetadataUpdateParser.toJson(update); - Assert.assertEquals("Set snapshot ref should serialize to the correct JSON value for branch with all fields", - expected, actual); + Assert.assertEquals( + "Set snapshot ref should serialize to the correct JSON value for branch with all fields", + expected, + actual); } /** SetProperties */ - @Test public void testSetPropertiesFromJson() { String action = MetadataUpdateParser.SET_PROPERTIES; - Map props = ImmutableMap.of( - "prop1", "val1", - "prop2", "val2" - ); + Map props = + ImmutableMap.of( + "prop1", "val1", + "prop2", "val2"); String propsMap = "{\"prop1\":\"val1\",\"prop2\":\"val2\"}"; String json = String.format("{\"action\":\"%s\",\"updated\":%s}", action, propsMap); MetadataUpdate expected = new MetadataUpdate.SetProperties(props); @@ -598,26 +676,25 @@ public void testSetPropertiesFromJsonFailsWhenDeserializingNullValues() { "Parsing updates from SetProperties with a property set to null should throw", IllegalArgumentException.class, "Cannot parse prop2 to a string value: null", - () -> MetadataUpdateParser.fromJson(json) - ); + () -> MetadataUpdateParser.fromJson(json)); } @Test public void testSetPropertiesToJson() { String action = MetadataUpdateParser.SET_PROPERTIES; - Map props = ImmutableMap.of( - "prop1", "val1", - "prop2", "val2" - ); + Map props = + ImmutableMap.of( + "prop1", "val1", + "prop2", "val2"); String propsMap = "{\"prop1\":\"val1\",\"prop2\":\"val2\"}"; String expected = String.format("{\"action\":\"%s\",\"updated\":%s}", action, propsMap); MetadataUpdate update = new MetadataUpdate.SetProperties(props); String actual = MetadataUpdateParser.toJson(update); - Assert.assertEquals("Set properties should serialize to the correct JSON value", expected, actual); + 
Assert.assertEquals( + "Set properties should serialize to the correct JSON value", expected, actual); } /** RemoveProperties */ - @Test public void testRemovePropertiesFromJson() { String action = MetadataUpdateParser.REMOVE_PROPERTIES; @@ -636,11 +713,11 @@ public void testRemovePropertiesToJson() { String expected = String.format("{\"action\":\"%s\",\"removed\":%s}", action, toRemoveAsJSON); MetadataUpdate update = new MetadataUpdate.RemoveProperties(toRemove); String actual = MetadataUpdateParser.toJson(update); - Assert.assertEquals("Remove properties should serialize to the correct JSON value", expected, actual); + Assert.assertEquals( + "Remove properties should serialize to the correct JSON value", expected, actual); } /** SetLocation */ - @Test public void testSetLocationFromJson() { String action = MetadataUpdateParser.SET_LOCATION; @@ -657,85 +734,105 @@ public void testSetLocationToJson() { String expected = String.format("{\"action\":\"%s\",\"location\":\"%s\"}", action, location); MetadataUpdate update = new MetadataUpdate.SetLocation(location); String actual = MetadataUpdateParser.toJson(update); - Assert.assertEquals("Remove properties should serialize to the correct JSON value", expected, actual); + Assert.assertEquals( + "Remove properties should serialize to the correct JSON value", expected, actual); } - public void assertEquals(String action, MetadataUpdate expectedUpdate, MetadataUpdate actualUpdate) { + public void assertEquals( + String action, MetadataUpdate expectedUpdate, MetadataUpdate actualUpdate) { switch (action) { case MetadataUpdateParser.ASSIGN_UUID: - assertEqualsAssignUUID((MetadataUpdate.AssignUUID) expectedUpdate, (MetadataUpdate.AssignUUID) actualUpdate); + assertEqualsAssignUUID( + (MetadataUpdate.AssignUUID) expectedUpdate, (MetadataUpdate.AssignUUID) actualUpdate); break; case MetadataUpdateParser.UPGRADE_FORMAT_VERSION: - assertEqualsUpgradeFormatVersion((MetadataUpdate.UpgradeFormatVersion) expectedUpdate, + assertEqualsUpgradeFormatVersion( + (MetadataUpdate.UpgradeFormatVersion) expectedUpdate, (MetadataUpdate.UpgradeFormatVersion) actualUpdate); break; case MetadataUpdateParser.ADD_SCHEMA: - assertEqualsAddSchema((MetadataUpdate.AddSchema) expectedUpdate, (MetadataUpdate.AddSchema) actualUpdate); + assertEqualsAddSchema( + (MetadataUpdate.AddSchema) expectedUpdate, (MetadataUpdate.AddSchema) actualUpdate); break; case MetadataUpdateParser.SET_CURRENT_SCHEMA: - assertEqualsSetCurrentSchema((MetadataUpdate.SetCurrentSchema) expectedUpdate, + assertEqualsSetCurrentSchema( + (MetadataUpdate.SetCurrentSchema) expectedUpdate, (MetadataUpdate.SetCurrentSchema) actualUpdate); break; case MetadataUpdateParser.ADD_PARTITION_SPEC: - assertEqualsAddPartitionSpec((MetadataUpdate.AddPartitionSpec) expectedUpdate, + assertEqualsAddPartitionSpec( + (MetadataUpdate.AddPartitionSpec) expectedUpdate, (MetadataUpdate.AddPartitionSpec) actualUpdate); break; case MetadataUpdateParser.SET_DEFAULT_PARTITION_SPEC: - assertEqualsSetDefaultPartitionSpec((MetadataUpdate.SetDefaultPartitionSpec) expectedUpdate, + assertEqualsSetDefaultPartitionSpec( + (MetadataUpdate.SetDefaultPartitionSpec) expectedUpdate, (MetadataUpdate.SetDefaultPartitionSpec) actualUpdate); break; case MetadataUpdateParser.ADD_SORT_ORDER: - assertEqualsAddSortOrder((MetadataUpdate.AddSortOrder) expectedUpdate, + assertEqualsAddSortOrder( + (MetadataUpdate.AddSortOrder) expectedUpdate, (MetadataUpdate.AddSortOrder) actualUpdate); break; case MetadataUpdateParser.SET_DEFAULT_SORT_ORDER: - 
assertEqualsSetDefaultSortOrder((MetadataUpdate.SetDefaultSortOrder) expectedUpdate, + assertEqualsSetDefaultSortOrder( + (MetadataUpdate.SetDefaultSortOrder) expectedUpdate, (MetadataUpdate.SetDefaultSortOrder) actualUpdate); break; case MetadataUpdateParser.ADD_SNAPSHOT: - assertEqualsAddSnapshot((MetadataUpdate.AddSnapshot) expectedUpdate, (MetadataUpdate.AddSnapshot) actualUpdate); + assertEqualsAddSnapshot( + (MetadataUpdate.AddSnapshot) expectedUpdate, (MetadataUpdate.AddSnapshot) actualUpdate); break; case MetadataUpdateParser.REMOVE_SNAPSHOTS: - assertEqualsRemoveSnapshots((MetadataUpdate.RemoveSnapshot) expectedUpdate, + assertEqualsRemoveSnapshots( + (MetadataUpdate.RemoveSnapshot) expectedUpdate, (MetadataUpdate.RemoveSnapshot) actualUpdate); break; case MetadataUpdateParser.REMOVE_SNAPSHOT_REF: - assertEqualsRemoveSnapshotRef((MetadataUpdate.RemoveSnapshotRef) expectedUpdate, + assertEqualsRemoveSnapshotRef( + (MetadataUpdate.RemoveSnapshotRef) expectedUpdate, (MetadataUpdate.RemoveSnapshotRef) actualUpdate); break; case MetadataUpdateParser.SET_SNAPSHOT_REF: - assertEqualsSetSnapshotRef((MetadataUpdate.SetSnapshotRef) expectedUpdate, + assertEqualsSetSnapshotRef( + (MetadataUpdate.SetSnapshotRef) expectedUpdate, (MetadataUpdate.SetSnapshotRef) actualUpdate); break; case MetadataUpdateParser.SET_PROPERTIES: - assertEqualsSetProperties((MetadataUpdate.SetProperties) expectedUpdate, + assertEqualsSetProperties( + (MetadataUpdate.SetProperties) expectedUpdate, (MetadataUpdate.SetProperties) actualUpdate); break; case MetadataUpdateParser.REMOVE_PROPERTIES: - assertEqualsRemoveProperties((MetadataUpdate.RemoveProperties) expectedUpdate, + assertEqualsRemoveProperties( + (MetadataUpdate.RemoveProperties) expectedUpdate, (MetadataUpdate.RemoveProperties) actualUpdate); break; case MetadataUpdateParser.SET_LOCATION: - assertEqualsSetLocation((MetadataUpdate.SetLocation) expectedUpdate, - (MetadataUpdate.SetLocation) actualUpdate); + assertEqualsSetLocation( + (MetadataUpdate.SetLocation) expectedUpdate, (MetadataUpdate.SetLocation) actualUpdate); break; default: Assert.fail("Unrecognized metadata update action: " + action); } } - private static void assertEqualsAssignUUID(MetadataUpdate.AssignUUID expected, MetadataUpdate.AssignUUID actual) { + private static void assertEqualsAssignUUID( + MetadataUpdate.AssignUUID expected, MetadataUpdate.AssignUUID actual) { Assert.assertEquals("UUIDs should be equal", expected.uuid(), actual.uuid()); } private static void assertEqualsUpgradeFormatVersion( MetadataUpdate.UpgradeFormatVersion expected, MetadataUpdate.UpgradeFormatVersion actual) { - Assert.assertEquals("Format version should be equal", expected.formatVersion(), actual.formatVersion()); + Assert.assertEquals( + "Format version should be equal", expected.formatVersion(), actual.formatVersion()); } - private static void assertEqualsAddSchema(MetadataUpdate.AddSchema expected, MetadataUpdate.AddSchema actual) { + private static void assertEqualsAddSchema( + MetadataUpdate.AddSchema expected, MetadataUpdate.AddSchema actual) { Assert.assertTrue("Schemas should be the same", expected.schema().sameSchema(actual.schema())); - Assert.assertEquals("Last column id should be equal", expected.lastColumnId(), actual.lastColumnId()); + Assert.assertEquals( + "Last column id should be equal", expected.lastColumnId(), actual.lastColumnId()); } private static void assertEqualsSetCurrentSchema( @@ -744,75 +841,104 @@ private static void assertEqualsSetCurrentSchema( } private static void 
assertEqualsSetDefaultPartitionSpec( - MetadataUpdate.SetDefaultPartitionSpec expected, MetadataUpdate.SetDefaultPartitionSpec actual) { + MetadataUpdate.SetDefaultPartitionSpec expected, + MetadataUpdate.SetDefaultPartitionSpec actual) { Assertions.assertThat(actual.specId()).isEqualTo(expected.specId()); } private static void assertEqualsAddPartitionSpec( MetadataUpdate.AddPartitionSpec expected, MetadataUpdate.AddPartitionSpec actual) { - Assert.assertEquals("Unbound partition specs should have the same spec id", - expected.spec().specId(), actual.spec().specId()); - Assert.assertEquals("Unbound partition specs should have the same number of fields", - expected.spec().fields().size(), actual.spec().fields().size()); + Assert.assertEquals( + "Unbound partition specs should have the same spec id", + expected.spec().specId(), + actual.spec().specId()); + Assert.assertEquals( + "Unbound partition specs should have the same number of fields", + expected.spec().fields().size(), + actual.spec().fields().size()); IntStream.range(0, expected.spec().fields().size()) - .forEachOrdered(i -> { - UnboundPartitionSpec.UnboundPartitionField expectedField = expected.spec().fields().get(i); - UnboundPartitionSpec.UnboundPartitionField actualField = actual.spec().fields().get(i); - Assert.assertTrue( - "Fields of the unbound partition spec should be the same", - Objects.equals(expectedField.partitionId(), actualField.partitionId()) && - expectedField.name().equals(actualField.name()) && - Objects.equals(expectedField.transformAsString(), actualField.transformAsString()) && - expectedField.sourceId() == actualField.sourceId()); - }); + .forEachOrdered( + i -> { + UnboundPartitionSpec.UnboundPartitionField expectedField = + expected.spec().fields().get(i); + UnboundPartitionSpec.UnboundPartitionField actualField = + actual.spec().fields().get(i); + Assert.assertTrue( + "Fields of the unbound partition spec should be the same", + Objects.equals(expectedField.partitionId(), actualField.partitionId()) + && expectedField.name().equals(actualField.name()) + && Objects.equals( + expectedField.transformAsString(), actualField.transformAsString()) + && expectedField.sourceId() == actualField.sourceId()); + }); } private static void assertEqualsAddSortOrder( MetadataUpdate.AddSortOrder expected, MetadataUpdate.AddSortOrder actual) { - Assert.assertEquals("Order id of the sort order should be the same", - expected.sortOrder().orderId(), actual.sortOrder().orderId()); + Assert.assertEquals( + "Order id of the sort order should be the same", + expected.sortOrder().orderId(), + actual.sortOrder().orderId()); - Assert.assertEquals("Sort orders should have the same number of fields", - expected.sortOrder().fields().size(), actual.sortOrder().fields().size()); + Assert.assertEquals( + "Sort orders should have the same number of fields", + expected.sortOrder().fields().size(), + actual.sortOrder().fields().size()); IntStream.range(0, expected.sortOrder().fields().size()) - .forEachOrdered(i -> { - UnboundSortOrder.UnboundSortField expectedField = expected.sortOrder().fields().get(i); - UnboundSortOrder.UnboundSortField actualField = actual.sortOrder().fields().get(i); - Assert.assertTrue("Fields of the sort order should be the same", - expectedField.sourceId() == actualField.sourceId() && - expectedField.nullOrder().equals(actualField.nullOrder()) && - expectedField.direction().equals(actualField.direction()) && - Objects.equals(expectedField.transformAsString(), actualField.transformAsString())); - }); + .forEachOrdered( + 
i -> { + UnboundSortOrder.UnboundSortField expectedField = + expected.sortOrder().fields().get(i); + UnboundSortOrder.UnboundSortField actualField = actual.sortOrder().fields().get(i); + Assert.assertTrue( + "Fields of the sort order should be the same", + expectedField.sourceId() == actualField.sourceId() + && expectedField.nullOrder().equals(actualField.nullOrder()) + && expectedField.direction().equals(actualField.direction()) + && Objects.equals( + expectedField.transformAsString(), actualField.transformAsString())); + }); } private static void assertEqualsSetDefaultSortOrder( MetadataUpdate.SetDefaultSortOrder expected, MetadataUpdate.SetDefaultSortOrder actual) { - Assert.assertEquals("Sort order id should be the same", expected.sortOrderId(), actual.sortOrderId()); + Assert.assertEquals( + "Sort order id should be the same", expected.sortOrderId(), actual.sortOrderId()); } private static void assertEqualsAddSnapshot( MetadataUpdate.AddSnapshot expected, MetadataUpdate.AddSnapshot actual) { - Assert.assertEquals("Snapshot ID should be equal", - expected.snapshot().snapshotId(), actual.snapshot().snapshotId()); - Assert.assertEquals("Manifest list location should be equal", - expected.snapshot().manifestListLocation(), actual.snapshot().manifestListLocation()); + Assert.assertEquals( + "Snapshot ID should be equal", + expected.snapshot().snapshotId(), + actual.snapshot().snapshotId()); + Assert.assertEquals( + "Manifest list location should be equal", + expected.snapshot().manifestListLocation(), + actual.snapshot().manifestListLocation()); Assertions.assertThat(actual.snapshot().summary()) .as("Snapshot summary should be equivalent") .containsExactlyEntriesOf(expected.snapshot().summary()); - Assert.assertEquals("Snapshot Parent ID should be equal", - expected.snapshot().parentId(), actual.snapshot().parentId()); - Assert.assertEquals("Snapshot timestamp should be equal", expected.snapshot().timestampMillis(), + Assert.assertEquals( + "Snapshot Parent ID should be equal", + expected.snapshot().parentId(), + actual.snapshot().parentId()); + Assert.assertEquals( + "Snapshot timestamp should be equal", + expected.snapshot().timestampMillis(), actual.snapshot().timestampMillis()); - Assert.assertEquals("Snapshot schema id should be equal", expected.snapshot().schemaId(), + Assert.assertEquals( + "Snapshot schema id should be equal", + expected.snapshot().schemaId(), actual.snapshot().schemaId()); } private static void assertEqualsRemoveSnapshots( MetadataUpdate.RemoveSnapshot expected, MetadataUpdate.RemoveSnapshot actual) { - Assert.assertEquals("Snapshots to remove should be the same", expected.snapshotId(), actual.snapshotId()); + Assert.assertEquals( + "Snapshots to remove should be the same", expected.snapshotId(), actual.snapshotId()); } private static void assertEqualsSetSnapshotRef( @@ -825,12 +951,18 @@ private static void assertEqualsSetSnapshotRef( Assert.assertEquals("Snapshot reference type should be equal", expected.type(), actual.type()); // Nullable fields - Assert.assertEquals("Min snapshots to keep should be equal when present and null when missing or explicitly null", - expected.minSnapshotsToKeep(), actual.minSnapshotsToKeep()); - Assert.assertEquals("Max snapshot age ms should be equal when present and null when missing or explicitly null", - expected.maxSnapshotAgeMs(), actual.maxSnapshotAgeMs()); - Assert.assertEquals("Max ref age ms should be equal when present and null when missing or explicitly null", - expected.maxRefAgeMs(), actual.maxRefAgeMs()); + 
Assert.assertEquals( + "Min snapshots to keep should be equal when present and null when missing or explicitly null", + expected.minSnapshotsToKeep(), + actual.minSnapshotsToKeep()); + Assert.assertEquals( + "Max snapshot age ms should be equal when present and null when missing or explicitly null", + expected.maxSnapshotAgeMs(), + actual.maxSnapshotAgeMs()); + Assert.assertEquals( + "Max ref age ms should be equal when present and null when missing or explicitly null", + expected.maxRefAgeMs(), + actual.maxRefAgeMs()); } private static void assertEqualsRemoveSnapshotRef( diff --git a/core/src/test/java/org/apache/iceberg/TestMetrics.java b/core/src/test/java/org/apache/iceberg/TestMetrics.java index 74fb02e0c29a..7eb4d3344326 100644 --- a/core/src/test/java/org/apache/iceberg/TestMetrics.java +++ b/core/src/test/java/org/apache/iceberg/TestMetrics.java @@ -16,9 +16,12 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg; +import static org.apache.iceberg.types.Conversions.fromByteBuffer; +import static org.apache.iceberg.types.Types.NestedField.optional; +import static org.apache.iceberg.types.Types.NestedField.required; + import java.io.File; import java.io.IOException; import java.math.BigDecimal; @@ -60,62 +63,55 @@ import org.junit.Test; import org.junit.rules.TemporaryFolder; -import static org.apache.iceberg.types.Conversions.fromByteBuffer; -import static org.apache.iceberg.types.Types.NestedField.optional; -import static org.apache.iceberg.types.Types.NestedField.required; - -/** - * Tests for Metrics. - */ +/** Tests for Metrics. */ public abstract class TestMetrics { protected TestMetrics(int formatVersion) { this.formatVersion = formatVersion; } - @Rule - public TemporaryFolder temp = new TemporaryFolder(); - - private static final StructType LEAF_STRUCT_TYPE = StructType.of( - optional(5, "leafLongCol", LongType.get()), - optional(6, "leafBinaryCol", BinaryType.get()) - ); - - private static final StructType NESTED_STRUCT_TYPE = StructType.of( - required(3, "longCol", LongType.get()), - required(4, "leafStructCol", LEAF_STRUCT_TYPE), - required(7, "doubleCol", DoubleType.get()) - ); - - private static final Schema NESTED_SCHEMA = new Schema( - required(1, "intCol", IntegerType.get()), - required(2, "nestedStructCol", NESTED_STRUCT_TYPE) - ); - - private static final Schema SIMPLE_SCHEMA = new Schema( - optional(1, "booleanCol", BooleanType.get()), - required(2, "intCol", IntegerType.get()), - optional(3, "longCol", LongType.get()), - required(4, "floatCol", FloatType.get()), - optional(5, "doubleCol", DoubleType.get()), - optional(6, "decimalCol", DecimalType.of(10, 2)), - required(7, "stringCol", StringType.get()), - optional(8, "dateCol", DateType.get()), - required(9, "timeCol", TimeType.get()), - required(10, "timestampColAboveEpoch", TimestampType.withoutZone()), - required(11, "fixedCol", FixedType.ofLength(4)), - required(12, "binaryCol", BinaryType.get()), - required(13, "timestampColBelowEpoch", TimestampType.withoutZone()) - ); - - private static final Schema FLOAT_DOUBLE_ONLY_SCHEMA = new Schema( - optional(1, "floatCol", FloatType.get()), - optional(2, "doubleCol", DoubleType.get()) - ); + @Rule public TemporaryFolder temp = new TemporaryFolder(); + + private static final StructType LEAF_STRUCT_TYPE = + StructType.of( + optional(5, "leafLongCol", LongType.get()), + optional(6, "leafBinaryCol", BinaryType.get())); + + private static final StructType NESTED_STRUCT_TYPE = + StructType.of( + required(3, 
"longCol", LongType.get()), + required(4, "leafStructCol", LEAF_STRUCT_TYPE), + required(7, "doubleCol", DoubleType.get())); + + private static final Schema NESTED_SCHEMA = + new Schema( + required(1, "intCol", IntegerType.get()), + required(2, "nestedStructCol", NESTED_STRUCT_TYPE)); + + private static final Schema SIMPLE_SCHEMA = + new Schema( + optional(1, "booleanCol", BooleanType.get()), + required(2, "intCol", IntegerType.get()), + optional(3, "longCol", LongType.get()), + required(4, "floatCol", FloatType.get()), + optional(5, "doubleCol", DoubleType.get()), + optional(6, "decimalCol", DecimalType.of(10, 2)), + required(7, "stringCol", StringType.get()), + optional(8, "dateCol", DateType.get()), + required(9, "timeCol", TimeType.get()), + required(10, "timestampColAboveEpoch", TimestampType.withoutZone()), + required(11, "fixedCol", FixedType.ofLength(4)), + required(12, "binaryCol", BinaryType.get()), + required(13, "timestampColBelowEpoch", TimestampType.withoutZone())); + + private static final Schema FLOAT_DOUBLE_ONLY_SCHEMA = + new Schema( + optional(1, "floatCol", FloatType.get()), optional(2, "doubleCol", DoubleType.get())); private static final Record FLOAT_DOUBLE_RECORD_1 = createRecordWithFloatAndDouble(1.2F, 3.4D); private static final Record FLOAT_DOUBLE_RECORD_2 = createRecordWithFloatAndDouble(5.6F, 7.8D); - private static final Record NAN_ONLY_RECORD = createRecordWithFloatAndDouble(Float.NaN, Double.NaN); + private static final Record NAN_ONLY_RECORD = + createRecordWithFloatAndDouble(Float.NaN, Double.NaN); private final int formatVersion; private final byte[] fixed = "abcd".getBytes(StandardCharsets.UTF_8); @@ -134,12 +130,13 @@ private static Record createRecordWithFloatAndDouble(float floatValue, double do public abstract FileFormat fileFormat(); - public abstract Metrics getMetrics(Schema schema, MetricsConfig metricsConfig, Record... records) throws IOException; + public abstract Metrics getMetrics(Schema schema, MetricsConfig metricsConfig, Record... records) + throws IOException; public abstract Metrics getMetrics(Schema schema, Record... records) throws IOException; - protected abstract Metrics getMetricsForRecordsWithSmallRowGroups(Schema schema, OutputFile outputFile, - Record... records) throws IOException; + protected abstract Metrics getMetricsForRecordsWithSmallRowGroups( + Schema schema, OutputFile outputFile, Record... 
records) throws IOException; public abstract int splitCount(InputFile inputFile) throws IOException; @@ -237,15 +234,20 @@ public void testMetricsForTopLevelFields() throws IOException { assertCounts(10, 2L, 0L, metrics); assertBounds(10, TimestampType.withoutZone(), 0L, 900L, metrics); assertCounts(11, 2L, 0L, metrics); - assertBounds(11, FixedType.ofLength(4), - ByteBuffer.wrap(fixed), ByteBuffer.wrap(fixed), metrics); + assertBounds( + 11, FixedType.ofLength(4), ByteBuffer.wrap(fixed), ByteBuffer.wrap(fixed), metrics); assertCounts(12, 2L, 0L, metrics); - assertBounds(12, BinaryType.get(), - ByteBuffer.wrap("S".getBytes()), ByteBuffer.wrap("W".getBytes()), metrics); + assertBounds( + 12, + BinaryType.get(), + ByteBuffer.wrap("S".getBytes()), + ByteBuffer.wrap("W".getBytes()), + metrics); if (fileFormat() == FileFormat.ORC) { // TODO: The special condition for ORC can be removed when ORC-342 is fixed // ORC-342: ORC writer creates inaccurate timestamp data and stats 1 sec below epoch - // Values in the range `[1969-12-31 23:59:59.000,1969-12-31 23:59:59.999]` will have 1 sec added to them + // Values in the range `[1969-12-31 23:59:59.000,1969-12-31 23:59:59.999]` will have 1 sec + // added to them // So the upper bound value of -7_000 micros becomes 993_000 micros assertBounds(13, TimestampType.withoutZone(), -1_900_300L, 993_000L, metrics); } else { @@ -255,11 +257,11 @@ public void testMetricsForTopLevelFields() throws IOException { @Test public void testMetricsForDecimals() throws IOException { - Schema schema = new Schema( - required(1, "decimalAsInt32", DecimalType.of(4, 2)), - required(2, "decimalAsInt64", DecimalType.of(14, 2)), - required(3, "decimalAsFixed", DecimalType.of(22, 2)) - ); + Schema schema = + new Schema( + required(1, "decimalAsInt32", DecimalType.of(4, 2)), + required(2, "decimalAsInt64", DecimalType.of(14, 2)), + required(3, "decimalAsFixed", DecimalType.of(22, 2))); Record record = GenericRecord.create(schema); record.setField("decimalAsInt32", new BigDecimal("2.55")); @@ -287,19 +289,24 @@ public void testMetricsForNestedStructFields() throws IOException { assertCounts(5, 1L, 0L, metrics); assertBounds(5, LongType.get(), 20L, 20L, metrics); assertCounts(6, 1L, 0L, metrics); - assertBounds(6, BinaryType.get(), - ByteBuffer.wrap("A".getBytes()), ByteBuffer.wrap("A".getBytes()), metrics); + assertBounds( + 6, + BinaryType.get(), + ByteBuffer.wrap("A".getBytes()), + ByteBuffer.wrap("A".getBytes()), + metrics); assertCounts(7, 1L, 0L, 1L, metrics); assertBounds(7, DoubleType.get(), null, null, metrics); } @Test public void testMetricsModeForNestedStructFields() throws IOException { - Map properties = ImmutableMap.of( - TableProperties.DEFAULT_WRITE_METRICS_MODE, - MetricsModes.None.get().toString(), - TableProperties.METRICS_MODE_COLUMN_CONF_PREFIX + "nestedStructCol.longCol", - MetricsModes.Full.get().toString()); + Map properties = + ImmutableMap.of( + TableProperties.DEFAULT_WRITE_METRICS_MODE, + MetricsModes.None.get().toString(), + TableProperties.METRICS_MODE_COLUMN_CONF_PREFIX + "nestedStructCol.longCol", + MetricsModes.Full.get().toString()); MetricsConfig config = MetricsConfig.fromProperties(properties); Metrics metrics = getMetrics(NESTED_SCHEMA, config, buildNestedTestRecord()); @@ -326,14 +333,14 @@ private Record buildNestedTestRecord() { @Test public void testMetricsForListAndMapElements() throws IOException { - StructType structType = StructType.of( - required(1, "leafIntCol", IntegerType.get()), - optional(2, "leafStringCol", StringType.get()) - ); 
- Schema schema = new Schema( - optional(3, "intListCol", ListType.ofRequired(4, IntegerType.get())), - optional(5, "mapCol", MapType.ofRequired(6, 7, StringType.get(), structType)) - ); + StructType structType = + StructType.of( + required(1, "leafIntCol", IntegerType.get()), + optional(2, "leafStringCol", StringType.get())); + Schema schema = + new Schema( + optional(3, "intListCol", ListType.ofRequired(4, IntegerType.get())), + optional(5, "mapCol", MapType.ofRequired(6, 7, StringType.get(), structType))); Record record = GenericRecord.create(schema); record.setField("intListCol", Lists.newArrayList(10, 11, 12)); @@ -366,9 +373,7 @@ public void testMetricsForListAndMapElements() throws IOException { @Test public void testMetricsForNullColumns() throws IOException { - Schema schema = new Schema( - optional(1, "intCol", IntegerType.get()) - ); + Schema schema = new Schema(optional(1, "intCol", IntegerType.get())); Record firstRecord = GenericRecord.create(schema); firstRecord.setField("intCol", null); Record secondRecord = GenericRecord.create(schema); @@ -393,8 +398,12 @@ public void testMetricsForNaNColumns() throws IOException { @Test public void testColumnBoundsWithNaNValueAtFront() throws IOException { - Metrics metrics = getMetrics(FLOAT_DOUBLE_ONLY_SCHEMA, - NAN_ONLY_RECORD, FLOAT_DOUBLE_RECORD_1, FLOAT_DOUBLE_RECORD_2); + Metrics metrics = + getMetrics( + FLOAT_DOUBLE_ONLY_SCHEMA, + NAN_ONLY_RECORD, + FLOAT_DOUBLE_RECORD_1, + FLOAT_DOUBLE_RECORD_2); Assert.assertEquals(3L, (long) metrics.recordCount()); assertCounts(1, 3L, 0L, 1L, metrics); assertCounts(2, 3L, 0L, 1L, metrics); @@ -405,8 +414,12 @@ public void testColumnBoundsWithNaNValueAtFront() throws IOException { @Test public void testColumnBoundsWithNaNValueInMiddle() throws IOException { - Metrics metrics = getMetrics(FLOAT_DOUBLE_ONLY_SCHEMA, - FLOAT_DOUBLE_RECORD_1, NAN_ONLY_RECORD, FLOAT_DOUBLE_RECORD_2); + Metrics metrics = + getMetrics( + FLOAT_DOUBLE_ONLY_SCHEMA, + FLOAT_DOUBLE_RECORD_1, + NAN_ONLY_RECORD, + FLOAT_DOUBLE_RECORD_2); Assert.assertEquals(3L, (long) metrics.recordCount()); assertCounts(1, 3L, 0L, 1L, metrics); assertCounts(2, 3L, 0L, 1L, metrics); @@ -417,8 +430,12 @@ public void testColumnBoundsWithNaNValueInMiddle() throws IOException { @Test public void testColumnBoundsWithNaNValueAtEnd() throws IOException { - Metrics metrics = getMetrics(FLOAT_DOUBLE_ONLY_SCHEMA, - FLOAT_DOUBLE_RECORD_1, FLOAT_DOUBLE_RECORD_2, NAN_ONLY_RECORD); + Metrics metrics = + getMetrics( + FLOAT_DOUBLE_ONLY_SCHEMA, + FLOAT_DOUBLE_RECORD_1, + FLOAT_DOUBLE_RECORD_2, + NAN_ONLY_RECORD); Assert.assertEquals(3L, (long) metrics.recordCount()); assertCounts(1, 3L, 0L, 1L, metrics); assertCounts(2, 3L, 0L, 1L, metrics); @@ -429,7 +446,8 @@ public void testColumnBoundsWithNaNValueAtEnd() throws IOException { @Test public void testMetricsForTopLevelWithMultipleRowGroup() throws Exception { - Assume.assumeTrue("Skip test for formats that do not support small row groups", supportsSmallRowGroups()); + Assume.assumeTrue( + "Skip test for formats that do not support small row groups", supportsSmallRowGroups()); int recordCount = 201; List records = Lists.newArrayListWithExpectedSize(recordCount); @@ -441,20 +459,24 @@ public void testMetricsForTopLevelWithMultipleRowGroup() throws Exception { newRecord.setField("longCol", i == 0 ? null : i + 1L); newRecord.setField("floatCol", i + 1.0F); newRecord.setField("doubleCol", i == 0 ? null : i + 1.0D); - newRecord.setField("decimalCol", i == 0 ? 
null : new BigDecimal(i + "").add(new BigDecimal("1.00"))); + newRecord.setField( + "decimalCol", i == 0 ? null : new BigDecimal(i + "").add(new BigDecimal("1.00"))); newRecord.setField("stringCol", "AAA"); newRecord.setField("dateCol", DateTimeUtil.dateFromDays(i + 1)); newRecord.setField("timeCol", DateTimeUtil.timeFromMicros(i + 1L)); newRecord.setField("timestampColAboveEpoch", DateTimeUtil.timestampFromMicros(i + 1L)); newRecord.setField("fixedCol", fixed); newRecord.setField("binaryCol", ByteBuffer.wrap("S".getBytes())); - newRecord.setField("timestampColBelowEpoch", DateTimeUtil.timestampFromMicros((i + 1L) * -1L)); + newRecord.setField( + "timestampColBelowEpoch", DateTimeUtil.timestampFromMicros((i + 1L) * -1L)); records.add(newRecord); } // create file with multiple row groups. by using smaller number of bytes OutputFile outputFile = createOutputFile(); - Metrics metrics = getMetricsForRecordsWithSmallRowGroups(SIMPLE_SCHEMA, outputFile, records.toArray(new Record[0])); + Metrics metrics = + getMetricsForRecordsWithSmallRowGroups( + SIMPLE_SCHEMA, outputFile, records.toArray(new Record[0])); InputFile recordsFile = outputFile.toInputFile(); Assert.assertNotNull(recordsFile); @@ -472,13 +494,14 @@ public void testMetricsForTopLevelWithMultipleRowGroup() throws Exception { assertCounts(5, 201L, 1L, 0L, metrics); assertBounds(5, Types.DoubleType.get(), 2.0D, 201.0D, metrics); assertCounts(6, 201L, 1L, metrics); - assertBounds(6, Types.DecimalType.of(10, 2), new BigDecimal("2.00"), - new BigDecimal("201.00"), metrics); + assertBounds( + 6, Types.DecimalType.of(10, 2), new BigDecimal("2.00"), new BigDecimal("201.00"), metrics); } @Test public void testMetricsForNestedStructFieldsWithMultipleRowGroup() throws IOException { - Assume.assumeTrue("Skip test for formats that do not support small row groups", supportsSmallRowGroups()); + Assume.assumeTrue( + "Skip test for formats that do not support small row groups", supportsSmallRowGroups()); int recordCount = 201; List records = Lists.newArrayListWithExpectedSize(recordCount); @@ -499,7 +522,9 @@ public void testMetricsForNestedStructFieldsWithMultipleRowGroup() throws IOExce // create file with multiple row groups. 
by using smaller number of bytes OutputFile outputFile = createOutputFile(); - Metrics metrics = getMetricsForRecordsWithSmallRowGroups(NESTED_SCHEMA, outputFile, records.toArray(new Record[0])); + Metrics metrics = + getMetricsForRecordsWithSmallRowGroups( + NESTED_SCHEMA, outputFile, records.toArray(new Record[0])); InputFile recordsFile = outputFile.toInputFile(); Assert.assertNotNull(recordsFile); @@ -514,18 +539,23 @@ public void testMetricsForNestedStructFieldsWithMultipleRowGroup() throws IOExce assertCounts(5, 201L, 0L, metrics); assertBounds(5, LongType.get(), 1L, 201L, metrics); assertCounts(6, 201L, 0L, metrics); - assertBounds(6, BinaryType.get(), - ByteBuffer.wrap("A".getBytes()), ByteBuffer.wrap("A".getBytes()), metrics); + assertBounds( + 6, + BinaryType.get(), + ByteBuffer.wrap("A".getBytes()), + ByteBuffer.wrap("A".getBytes()), + metrics); assertCounts(7, 201L, 0L, 201L, metrics); assertBounds(7, DoubleType.get(), null, null, metrics); } @Test public void testNoneMetricsMode() throws IOException { - Metrics metrics = getMetrics( - NESTED_SCHEMA, - MetricsConfig.fromProperties(ImmutableMap.of("write.metadata.metrics.default", "none")), - buildNestedTestRecord()); + Metrics metrics = + getMetrics( + NESTED_SCHEMA, + MetricsConfig.fromProperties(ImmutableMap.of("write.metadata.metrics.default", "none")), + buildNestedTestRecord()); Assert.assertEquals(1L, (long) metrics.recordCount()); Assert.assertTrue(metrics.columnSizes().values().stream().allMatch(Objects::nonNull)); assertCounts(1, null, null, metrics); @@ -542,10 +572,12 @@ public void testNoneMetricsMode() throws IOException { @Test public void testCountsMetricsMode() throws IOException { - Metrics metrics = getMetrics( - NESTED_SCHEMA, - MetricsConfig.fromProperties(ImmutableMap.of("write.metadata.metrics.default", "counts")), - buildNestedTestRecord()); + Metrics metrics = + getMetrics( + NESTED_SCHEMA, + MetricsConfig.fromProperties( + ImmutableMap.of("write.metadata.metrics.default", "counts")), + buildNestedTestRecord()); Assert.assertEquals(1L, (long) metrics.recordCount()); Assert.assertTrue(metrics.columnSizes().values().stream().allMatch(Objects::nonNull)); assertCounts(1, 1L, 0L, metrics); @@ -562,10 +594,11 @@ public void testCountsMetricsMode() throws IOException { @Test public void testFullMetricsMode() throws IOException { - Metrics metrics = getMetrics( - NESTED_SCHEMA, - MetricsConfig.fromProperties(ImmutableMap.of("write.metadata.metrics.default", "full")), - buildNestedTestRecord()); + Metrics metrics = + getMetrics( + NESTED_SCHEMA, + MetricsConfig.fromProperties(ImmutableMap.of("write.metadata.metrics.default", "full")), + buildNestedTestRecord()); Assert.assertEquals(1L, (long) metrics.recordCount()); Assert.assertTrue(metrics.columnSizes().values().stream().allMatch(Objects::nonNull)); assertCounts(1, 1L, 0L, metrics); @@ -575,8 +608,12 @@ public void testFullMetricsMode() throws IOException { assertCounts(5, 1L, 0L, metrics); assertBounds(5, Types.LongType.get(), 20L, 20L, metrics); assertCounts(6, 1L, 0L, metrics); - assertBounds(6, Types.BinaryType.get(), - ByteBuffer.wrap("A".getBytes()), ByteBuffer.wrap("A".getBytes()), metrics); + assertBounds( + 6, + Types.BinaryType.get(), + ByteBuffer.wrap("A".getBytes()), + ByteBuffer.wrap("A".getBytes()), + metrics); assertCounts(7, 1L, 0L, 1L, metrics); assertBounds(7, Types.DoubleType.get(), null, null, metrics); } @@ -584,18 +621,18 @@ public void testFullMetricsMode() throws IOException { @Test public void testTruncateStringMetricsMode() throws 
IOException { String colName = "str_to_truncate"; - Schema singleStringColSchema = new Schema( - required(1, colName, Types.StringType.get()) - ); + Schema singleStringColSchema = new Schema(required(1, colName, Types.StringType.get())); String value = "Lorem ipsum dolor sit amet"; Record record = GenericRecord.create(singleStringColSchema); record.setField(colName, value); - Metrics metrics = getMetrics( - singleStringColSchema, - MetricsConfig.fromProperties(ImmutableMap.of("write.metadata.metrics.default", "truncate(10)")), - record); + Metrics metrics = + getMetrics( + singleStringColSchema, + MetricsConfig.fromProperties( + ImmutableMap.of("write.metadata.metrics.default", "truncate(10)")), + record); CharBuffer expectedMinBound = CharBuffer.wrap("Lorem ipsu"); CharBuffer expectedMaxBound = CharBuffer.wrap("Lorem ipsv"); @@ -608,21 +645,21 @@ public void testTruncateStringMetricsMode() throws IOException { @Test public void testTruncateBinaryMetricsMode() throws IOException { String colName = "bin_to_truncate"; - Schema singleBinaryColSchema = new Schema( - required(1, colName, Types.BinaryType.get()) - ); + Schema singleBinaryColSchema = new Schema(required(1, colName, Types.BinaryType.get())); - byte[] value = new byte[]{ 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0x10, 0xA, 0xB}; + byte[] value = new byte[] {0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0x10, 0xA, 0xB}; Record record = GenericRecord.create(singleBinaryColSchema); record.setField(colName, ByteBuffer.wrap(value)); - Metrics metrics = getMetrics( - singleBinaryColSchema, - MetricsConfig.fromProperties(ImmutableMap.of("write.metadata.metrics.default", "truncate(5)")), - record); + Metrics metrics = + getMetrics( + singleBinaryColSchema, + MetricsConfig.fromProperties( + ImmutableMap.of("write.metadata.metrics.default", "truncate(5)")), + record); - ByteBuffer expectedMinBounds = ByteBuffer.wrap(new byte[]{ 0x1, 0x2, 0x3, 0x4, 0x5 }); - ByteBuffer expectedMaxBounds = ByteBuffer.wrap(new byte[]{ 0x1, 0x2, 0x3, 0x4, 0x6 }); + ByteBuffer expectedMinBounds = ByteBuffer.wrap(new byte[] {0x1, 0x2, 0x3, 0x4, 0x5}); + ByteBuffer expectedMaxBounds = ByteBuffer.wrap(new byte[] {0x1, 0x2, 0x3, 0x4, 0x6}); Assert.assertEquals(1L, (long) metrics.recordCount()); Assert.assertTrue(metrics.columnSizes().values().stream().allMatch(Objects::nonNull)); assertCounts(1, 1L, 0L, metrics); @@ -634,15 +671,18 @@ public void testSortedColumnMetrics() throws IOException { File tableDir = temp.newFolder(); tableDir.delete(); // created by table create - SortOrder sortOrder = SortOrder.builderFor(SIMPLE_SCHEMA) - .asc("booleanCol") - .asc("intCol") - .asc("longCol") - .asc("decimalCol") - .asc("stringCol") - .asc("dateCol").build(); + SortOrder sortOrder = + SortOrder.builderFor(SIMPLE_SCHEMA) + .asc("booleanCol") + .asc("intCol") + .asc("longCol") + .asc("decimalCol") + .asc("stringCol") + .asc("dateCol") + .build(); PartitionSpec spec = PartitionSpec.unpartitioned(); - Table table = TestTables.create(tableDir, "test", SIMPLE_SCHEMA, spec, sortOrder, formatVersion); + Table table = + TestTables.create(tableDir, "test", SIMPLE_SCHEMA, spec, sortOrder, formatVersion); table.updateProperties().set(TableProperties.DEFAULT_WRITE_METRICS_MODE, "none").commit(); Record firstRecord = GenericRecord.create(SIMPLE_SCHEMA); @@ -676,19 +716,16 @@ public void testSortedColumnMetrics() throws IOException { secondRecord.setField("binaryCol", ByteBuffer.wrap("S".getBytes())); secondRecord.setField("timestampColBelowEpoch", DateTimeUtil.timestampFromMicros(0L)); 
- Metrics metrics = getMetrics( - SIMPLE_SCHEMA, - MetricsConfig.forTable(table), - firstRecord, secondRecord); + Metrics metrics = + getMetrics(SIMPLE_SCHEMA, MetricsConfig.forTable(table), firstRecord, secondRecord); Assert.assertEquals(2L, (long) metrics.recordCount()); assertBounds(1, BooleanType.get(), false, true, metrics); assertBounds(2, IntegerType.get(), Integer.MIN_VALUE, Integer.MAX_VALUE, metrics); assertBounds(3, LongType.get(), Long.MIN_VALUE, Long.MAX_VALUE, metrics); - assertBounds(6, DecimalType.of(10, 2), - new BigDecimal("0.00"), new BigDecimal("10.00"), metrics); - assertBounds(7, StringType.get(), - CharBuffer.wrap("AAA"), CharBuffer.wrap("ZZZ"), metrics); + assertBounds( + 6, DecimalType.of(10, 2), new BigDecimal("0.00"), new BigDecimal("10.00"), metrics); + assertBounds(7, StringType.get(), CharBuffer.wrap("AAA"), CharBuffer.wrap("ZZZ"), metrics); assertBounds(8, DateType.get(), 1500, 3000, metrics); } @@ -697,11 +734,14 @@ public void testMetricsForSortedNestedStructFields() throws IOException { File tableDir = temp.newFolder(); tableDir.delete(); // created by table create - SortOrder sortOrder = SortOrder.builderFor(NESTED_SCHEMA) - .asc("nestedStructCol.longCol") - .asc("nestedStructCol.leafStructCol.leafLongCol").build(); + SortOrder sortOrder = + SortOrder.builderFor(NESTED_SCHEMA) + .asc("nestedStructCol.longCol") + .asc("nestedStructCol.leafStructCol.leafLongCol") + .build(); PartitionSpec spec = PartitionSpec.unpartitioned(); - Table table = TestTables.create(tableDir, "nested", NESTED_SCHEMA, spec, sortOrder, formatVersion); + Table table = + TestTables.create(tableDir, "nested", NESTED_SCHEMA, spec, sortOrder, formatVersion); Record leafStruct = GenericRecord.create(LEAF_STRUCT_TYPE); leafStruct.setField("leafLongCol", Long.MAX_VALUE); @@ -714,9 +754,7 @@ public void testMetricsForSortedNestedStructFields() throws IOException { record.setField("intCol", Integer.MAX_VALUE); record.setField("nestedStructCol", nestedStruct); - Metrics metrics = getMetrics(NESTED_SCHEMA, - MetricsConfig.forTable(table), - record); + Metrics metrics = getMetrics(NESTED_SCHEMA, MetricsConfig.forTable(table), record); assertBounds(3, LongType.get(), Long.MAX_VALUE, Long.MAX_VALUE, metrics); assertBounds(5, LongType.get(), Long.MAX_VALUE, Long.MAX_VALUE, metrics); @@ -726,7 +764,8 @@ protected void assertCounts(int fieldId, Long valueCount, Long nullValueCount, M assertCounts(fieldId, valueCount, nullValueCount, null, metrics); } - protected void assertCounts(int fieldId, Long valueCount, Long nullValueCount, Long nanValueCount, Metrics metrics) { + protected void assertCounts( + int fieldId, Long valueCount, Long nullValueCount, Long nanValueCount, Metrics metrics) { Map valueCounts = metrics.valueCounts(); Map nullValueCounts = metrics.nullValueCounts(); Map nanValueCounts = metrics.nanValueCounts(); @@ -735,7 +774,8 @@ protected void assertCounts(int fieldId, Long valueCount, Long nullValueCount, L Assert.assertEquals(nanValueCount, nanValueCounts.get(fieldId)); } - protected void assertBounds(int fieldId, Type type, T lowerBound, T upperBound, Metrics metrics) { + protected void assertBounds( + int fieldId, Type type, T lowerBound, T upperBound, Metrics metrics) { Map lowerBounds = metrics.lowerBounds(); Map upperBounds = metrics.upperBounds(); @@ -746,5 +786,4 @@ protected void assertBounds(int fieldId, Type type, T lowerBound, T upperBou upperBound, upperBounds.containsKey(fieldId) ? 
fromByteBuffer(type, upperBounds.get(fieldId)) : null); } - } diff --git a/core/src/test/java/org/apache/iceberg/TestMetricsModes.java b/core/src/test/java/org/apache/iceberg/TestMetricsModes.java index 939811067619..c6c85ff2f98d 100644 --- a/core/src/test/java/org/apache/iceberg/TestMetricsModes.java +++ b/core/src/test/java/org/apache/iceberg/TestMetricsModes.java @@ -16,9 +16,10 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg; +import static org.apache.iceberg.types.Types.NestedField.required; + import java.io.File; import java.io.IOException; import java.util.Map; @@ -37,8 +38,6 @@ import org.junit.runner.RunWith; import org.junit.runners.Parameterized; -import static org.apache.iceberg.types.Types.NestedField.required; - @RunWith(Parameterized.class) public class TestMetricsModes { @@ -46,18 +45,16 @@ public class TestMetricsModes { @Parameterized.Parameters(name = "formatVersion = {0}") public static Object[] parameters() { - return new Object[] { 1, 2 }; + return new Object[] {1, 2}; } public TestMetricsModes(int formatVersion) { this.formatVersion = formatVersion; } - @Rule - public ExpectedException exceptionRule = ExpectedException.none(); + @Rule public ExpectedException exceptionRule = ExpectedException.none(); - @Rule - public TemporaryFolder temp = new TemporaryFolder(); + @Rule public TemporaryFolder temp = new TemporaryFolder(); @After public void after() { @@ -85,24 +82,34 @@ public void testInvalidTruncationLength() { @Test public void testInvalidColumnModeValue() { - Map properties = ImmutableMap.of( - TableProperties.DEFAULT_WRITE_METRICS_MODE, "full", - TableProperties.METRICS_MODE_COLUMN_CONF_PREFIX + "col", "troncate(5)"); + Map properties = + ImmutableMap.of( + TableProperties.DEFAULT_WRITE_METRICS_MODE, + "full", + TableProperties.METRICS_MODE_COLUMN_CONF_PREFIX + "col", + "troncate(5)"); MetricsConfig config = MetricsConfig.fromProperties(properties); - Assert.assertEquals("Invalid mode should be defaulted to table default (full)", - MetricsModes.Full.get(), config.columnMode("col")); + Assert.assertEquals( + "Invalid mode should be defaulted to table default (full)", + MetricsModes.Full.get(), + config.columnMode("col")); } @Test public void testInvalidDefaultColumnModeValue() { - Map properties = ImmutableMap.of( - TableProperties.DEFAULT_WRITE_METRICS_MODE, "fuull", - TableProperties.METRICS_MODE_COLUMN_CONF_PREFIX + "col", "troncate(5)"); + Map properties = + ImmutableMap.of( + TableProperties.DEFAULT_WRITE_METRICS_MODE, + "fuull", + TableProperties.METRICS_MODE_COLUMN_CONF_PREFIX + "col", + "troncate(5)"); MetricsConfig config = MetricsConfig.fromProperties(properties); - Assert.assertEquals("Invalid mode should be defaulted to library default (truncate(16))", - MetricsModes.Truncate.withLength(16), config.columnMode("col")); + Assert.assertEquals( + "Invalid mode should be defaulted to library default (truncate(16))", + MetricsModes.Truncate.withLength(16), + config.columnMode("col")); } @Test @@ -110,30 +117,38 @@ public void testMetricsConfigSortedColsDefault() throws Exception { File tableDir = temp.newFolder(); tableDir.delete(); // created by table create - Schema schema = new Schema( - required(1, "col1", Types.IntegerType.get()), - required(2, "col2", Types.IntegerType.get()), - required(3, "col3", Types.IntegerType.get()), - required(4, "col4", Types.IntegerType.get()) - ); + Schema schema = + new Schema( + required(1, "col1", Types.IntegerType.get()), + required(2, "col2", 
Types.IntegerType.get()), + required(3, "col3", Types.IntegerType.get()), + required(4, "col4", Types.IntegerType.get())); SortOrder sortOrder = SortOrder.builderFor(schema).asc("col2").asc("col3").build(); - Table testTable = TestTables.create(tableDir, "test", schema, PartitionSpec.unpartitioned(), - sortOrder, formatVersion); - testTable.updateProperties() + Table testTable = + TestTables.create( + tableDir, "test", schema, PartitionSpec.unpartitioned(), sortOrder, formatVersion); + testTable + .updateProperties() .set(TableProperties.DEFAULT_WRITE_METRICS_MODE, "counts") .set(TableProperties.METRICS_MODE_COLUMN_CONF_PREFIX + "col1", "counts") .set(TableProperties.METRICS_MODE_COLUMN_CONF_PREFIX + "col2", "none") .commit(); MetricsConfig config = MetricsConfig.forTable(testTable); - Assert.assertEquals("Non-sorted existing column should not be overridden", - Counts.get(), config.columnMode("col1")); - Assert.assertEquals("Sorted column defaults should not override user specified config", - None.get(), config.columnMode("col2")); - Assert.assertEquals("Unspecified sorted column should use default", - Truncate.withLength(16), config.columnMode("col3")); - Assert.assertEquals("Unspecified normal column should use default", - Counts.get(), config.columnMode("col4")); + Assert.assertEquals( + "Non-sorted existing column should not be overridden", + Counts.get(), + config.columnMode("col1")); + Assert.assertEquals( + "Sorted column defaults should not override user specified config", + None.get(), + config.columnMode("col2")); + Assert.assertEquals( + "Unspecified sorted column should use default", + Truncate.withLength(16), + config.columnMode("col3")); + Assert.assertEquals( + "Unspecified normal column should use default", Counts.get(), config.columnMode("col4")); } @Test @@ -141,48 +156,65 @@ public void testMetricsConfigSortedColsDefaultByInvalid() throws Exception { File tableDir = temp.newFolder(); tableDir.delete(); // created by table create - Schema schema = new Schema( - required(1, "col1", Types.IntegerType.get()), - required(2, "col2", Types.IntegerType.get()), - required(3, "col3", Types.IntegerType.get()) - ); + Schema schema = + new Schema( + required(1, "col1", Types.IntegerType.get()), + required(2, "col2", Types.IntegerType.get()), + required(3, "col3", Types.IntegerType.get())); SortOrder sortOrder = SortOrder.builderFor(schema).asc("col2").asc("col3").build(); - Table testTable = TestTables.create(tableDir, "test", schema, PartitionSpec.unpartitioned(), - sortOrder, formatVersion); - testTable.updateProperties() + Table testTable = + TestTables.create( + tableDir, "test", schema, PartitionSpec.unpartitioned(), sortOrder, formatVersion); + testTable + .updateProperties() .set(TableProperties.DEFAULT_WRITE_METRICS_MODE, "counts") .set(TableProperties.METRICS_MODE_COLUMN_CONF_PREFIX + "col1", "full") .set(TableProperties.METRICS_MODE_COLUMN_CONF_PREFIX + "col2", "invalid") .commit(); MetricsConfig config = MetricsConfig.forTable(testTable); - Assert.assertEquals("Non-sorted existing column should not be overridden by sorted column", - Full.get(), config.columnMode("col1")); - Assert.assertEquals("Original default applies as user entered invalid mode for sorted column", - Counts.get(), config.columnMode("col2")); + Assert.assertEquals( + "Non-sorted existing column should not be overridden by sorted column", + Full.get(), + config.columnMode("col1")); + Assert.assertEquals( + "Original default applies as user entered invalid mode for sorted column", + Counts.get(), + 
config.columnMode("col2")); } @Test public void testMetricsConfigInferredDefaultModeLimit() throws IOException { - Schema schema = new Schema( - required(1, "col1", Types.IntegerType.get()), - required(2, "col2", Types.IntegerType.get()), - required(3, "col3", Types.IntegerType.get()) - ); + Schema schema = + new Schema( + required(1, "col1", Types.IntegerType.get()), + required(2, "col2", Types.IntegerType.get()), + required(3, "col3", Types.IntegerType.get())); File tableDir = temp.newFolder(); Assert.assertTrue(tableDir.delete()); - Table table = TestTables.create( - tableDir, "test", schema, PartitionSpec.unpartitioned(), SortOrder.unsorted(), formatVersion); + Table table = + TestTables.create( + tableDir, + "test", + schema, + PartitionSpec.unpartitioned(), + SortOrder.unsorted(), + formatVersion); // only infer a default for the first two columns - table.updateProperties().set(TableProperties.METRICS_MAX_INFERRED_COLUMN_DEFAULTS, "2").commit(); + table + .updateProperties() + .set(TableProperties.METRICS_MAX_INFERRED_COLUMN_DEFAULTS, "2") + .commit(); MetricsConfig config = MetricsConfig.forTable(table); - Assert.assertEquals("Should use default mode for col1", Truncate.withLength(16), config.columnMode("col1")); - Assert.assertEquals("Should use default mode for col2", Truncate.withLength(16), config.columnMode("col2")); + Assert.assertEquals( + "Should use default mode for col1", Truncate.withLength(16), config.columnMode("col1")); + Assert.assertEquals( + "Should use default mode for col2", Truncate.withLength(16), config.columnMode("col2")); Assert.assertEquals("Should use None for col3", None.get(), config.columnMode("col3")); } } diff --git a/core/src/test/java/org/apache/iceberg/TestMetricsTruncation.java b/core/src/test/java/org/apache/iceberg/TestMetricsTruncation.java index f006b1d848f2..e314385f7e4f 100644 --- a/core/src/test/java/org/apache/iceberg/TestMetricsTruncation.java +++ b/core/src/test/java/org/apache/iceberg/TestMetricsTruncation.java @@ -16,43 +16,52 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg; -import java.nio.ByteBuffer; -import java.util.Comparator; -import org.apache.iceberg.expressions.Literal; -import org.junit.Assert; -import org.junit.Test; - import static org.apache.iceberg.util.BinaryUtil.truncateBinary; import static org.apache.iceberg.util.BinaryUtil.truncateBinaryMax; import static org.apache.iceberg.util.BinaryUtil.truncateBinaryMin; import static org.apache.iceberg.util.UnicodeUtil.truncateStringMax; import static org.apache.iceberg.util.UnicodeUtil.truncateStringMin; +import java.nio.ByteBuffer; +import java.util.Comparator; +import org.apache.iceberg.expressions.Literal; +import org.junit.Assert; +import org.junit.Test; + @SuppressWarnings("checkstyle:LocalVariableName") public class TestMetricsTruncation { @Test public void testTruncateBinary() { - ByteBuffer original = ByteBuffer.wrap(new byte[]{1, 1, (byte) 0xFF, 2}); - ByteBuffer emptyByteBuffer = ByteBuffer.allocate(0); + ByteBuffer original = ByteBuffer.wrap(new byte[] {1, 1, (byte) 0xFF, 2}); + ByteBuffer emptyByteBuffer = ByteBuffer.allocate(0); Comparator cmp = Literal.of(original).comparator(); - Assert.assertEquals("Truncating to a length of zero should return an empty ByteBuffer", - 0, cmp.compare(truncateBinary(original, 0), emptyByteBuffer)); - Assert.assertEquals("Truncating to the original buffer's remaining size should return the original buffer", - original, truncateBinary(original, original.remaining())); - Assert.assertEquals("Truncating with a length greater than the input's remaining size should return the input", - original, truncateBinary(original, 16)); + Assert.assertEquals( + "Truncating to a length of zero should return an empty ByteBuffer", + 0, + cmp.compare(truncateBinary(original, 0), emptyByteBuffer)); + Assert.assertEquals( + "Truncating to the original buffer's remaining size should return the original buffer", + original, + truncateBinary(original, original.remaining())); + Assert.assertEquals( + "Truncating with a length greater than the input's remaining size should return the input", + original, + truncateBinary(original, 16)); ByteBuffer truncated = truncateBinary(original, 2); - Assert.assertTrue("Truncating with a length less than the input's remaining size should truncate properly", + Assert.assertTrue( + "Truncating with a length less than the input's remaining size should truncate properly", truncated.remaining() == 2 && truncated.position() == 0); - Assert.assertTrue("Truncating should not modify the input buffer", + Assert.assertTrue( + "Truncating should not modify the input buffer", original.remaining() == 4 && original.position() == 0); - AssertHelpers.assertThrows("Should not allow a negative truncation length", - IllegalArgumentException.class, "length should be non-negative", + AssertHelpers.assertThrows( + "Should not allow a negative truncation length", + IllegalArgumentException.class, + "length should be non-negative", () -> truncateBinary(original, -1)); } @@ -65,16 +74,21 @@ public void testTruncateBinaryMin() { ByteBuffer test2_2 = ByteBuffer.wrap(new byte[] {(byte) 0xFF, (byte) 0xFF}); Comparator cmp = Literal.of(test1).comparator(); - Assert.assertTrue("Truncated lower bound should be lower than or equal to the actual lower bound", + Assert.assertTrue( + "Truncated lower bound should be lower than or equal to the actual lower bound", cmp.compare(truncateBinaryMin(Literal.of(test1), 2).value(), test1) <= 0); - Assert.assertTrue("Output must have the first two bytes of the input", + Assert.assertTrue( + "Output must have 
the first two bytes of the input", cmp.compare(truncateBinaryMin(Literal.of(test1), 2).value(), test1_2_expected) == 0); - Assert.assertTrue("No truncation required as truncate length is greater than the input size", + Assert.assertTrue( + "No truncation required as truncate length is greater than the input size", cmp.compare(truncateBinaryMin(Literal.of(test1), 5).value(), test1) == 0); - Assert.assertTrue("Truncated lower bound should be lower than or equal to the actual lower bound", + Assert.assertTrue( + "Truncated lower bound should be lower than or equal to the actual lower bound", cmp.compare(truncateBinaryMin(Literal.of(test2), 2).value(), test2) <= 0); - Assert.assertTrue("Output must have the first two bytes of the input. A lower bound exists " + - "even though the first two bytes are the max value", + Assert.assertTrue( + "Output must have the first two bytes of the input. A lower bound exists " + + "even though the first two bytes are the max value", cmp.compare(truncateBinaryMin(Literal.of(test2), 2).value(), test2_2) == 0); } @@ -87,23 +101,31 @@ public void testTruncateBinaryMax() { ByteBuffer expectedOutput = ByteBuffer.wrap(new byte[] {1, 2}); Comparator cmp = Literal.of(test1).comparator(); - Assert.assertTrue("Truncated upper bound should be greater than or equal to the actual upper bound", + Assert.assertTrue( + "Truncated upper bound should be greater than or equal to the actual upper bound", cmp.compare(truncateBinaryMax(Literal.of(test1), 2).value(), test1) >= 0); - Assert.assertTrue("Output must have two bytes and the second byte of the input must be incremented", + Assert.assertTrue( + "Output must have two bytes and the second byte of the input must be incremented", cmp.compare(truncateBinaryMax(Literal.of(test1), 2).value(), expectedOutput) == 0); - Assert.assertTrue("Truncated upper bound should be greater than or equal to the actual upper bound", + Assert.assertTrue( + "Truncated upper bound should be greater than or equal to the actual upper bound", cmp.compare(truncateBinaryMax(Literal.of(test2), 2).value(), test2) >= 0); - Assert.assertTrue("Since the third byte is already the max value, output must have two bytes " + - "with the second byte incremented ", cmp.compare( - truncateBinaryMax(Literal.of(test2), 3).value(), expectedOutput) == 0); - Assert.assertTrue("No truncation required as truncate length is greater than the input size", + Assert.assertTrue( + "Since the third byte is already the max value, output must have two bytes " + + "with the second byte incremented ", + cmp.compare(truncateBinaryMax(Literal.of(test2), 3).value(), expectedOutput) == 0); + Assert.assertTrue( + "No truncation required as truncate length is greater than the input size", cmp.compare(truncateBinaryMax(Literal.of(test3), 5).value(), test3) == 0); - Assert.assertNull("An upper bound doesn't exist since the first two bytes are the max value", + Assert.assertNull( + "An upper bound doesn't exist since the first two bytes are the max value", truncateBinaryMax(Literal.of(test3), 2)); - Assert.assertTrue("Truncated upper bound should be greater than or equal to the actual upper bound", + Assert.assertTrue( + "Truncated upper bound should be greater than or equal to the actual upper bound", cmp.compare(truncateBinaryMax(Literal.of(test4), 2).value(), test4) >= 0); - Assert.assertTrue("Since a shorter sequence is considered smaller, output must have two bytes " + - "and the second byte of the input must be incremented", + Assert.assertTrue( + "Since a shorter sequence is considered 
smaller, output must have two bytes " + + "and the second byte of the input must be incremented", cmp.compare(truncateBinaryMax(Literal.of(test4), 2).value(), expectedOutput) == 0); } @@ -122,25 +144,35 @@ public void testTruncateStringMin() { String test4 = "\uD800\uDC00\uD800\uDC00"; String test4_1_expected = "\uD800\uDC00"; Comparator cmp = Literal.of(test1).comparator(); - Assert.assertTrue("Truncated lower bound should be lower than or equal to the actual lower bound", + Assert.assertTrue( + "Truncated lower bound should be lower than or equal to the actual lower bound", cmp.compare(truncateStringMin(Literal.of(test1), 3).value(), test1) <= 0); - Assert.assertTrue("No truncation required as truncate length is greater than the input size", + Assert.assertTrue( + "No truncation required as truncate length is greater than the input size", cmp.compare(truncateStringMin(Literal.of(test1), 8).value(), test1) == 0); - Assert.assertTrue("Output must have the first two characters of the input", + Assert.assertTrue( + "Output must have the first two characters of the input", cmp.compare(truncateStringMin(Literal.of(test1), 2).value(), test1_2_expected) == 0); - Assert.assertTrue("Output must have the first three characters of the input", + Assert.assertTrue( + "Output must have the first three characters of the input", cmp.compare(truncateStringMin(Literal.of(test1), 3).value(), test1_3_expected) == 0); - Assert.assertTrue("Truncated lower bound should be lower than or equal to the actual lower bound", + Assert.assertTrue( + "Truncated lower bound should be lower than or equal to the actual lower bound", cmp.compare(truncateStringMin(Literal.of(test2), 16).value(), test2) <= 0); - Assert.assertTrue("Output must have the first seven characters of the input", + Assert.assertTrue( + "Output must have the first seven characters of the input", cmp.compare(truncateStringMin(Literal.of(test2), 7).value(), test2_7_expected) == 0); - Assert.assertTrue("Truncated lower bound should be lower than or equal to the actual lower bound", + Assert.assertTrue( + "Truncated lower bound should be lower than or equal to the actual lower bound", cmp.compare(truncateStringMin(Literal.of(test3), 2).value(), test3) <= 0); - Assert.assertTrue("No truncation required as truncate length is equal to the input size", + Assert.assertTrue( + "No truncation required as truncate length is equal to the input size", cmp.compare(truncateStringMin(Literal.of(test3), 2).value(), test3) == 0); - Assert.assertTrue("Truncated lower bound should be lower than or equal to the actual lower bound", + Assert.assertTrue( + "Truncated lower bound should be lower than or equal to the actual lower bound", cmp.compare(truncateStringMin(Literal.of(test4), 1).value(), test4) <= 0); - Assert.assertTrue("Output must have the first 4 byte UTF-8 character of the input", + Assert.assertTrue( + "Output must have the first 4 byte UTF-8 character of the input", cmp.compare(truncateStringMin(Literal.of(test4), 1).value(), test4_1_expected) == 0); } @@ -168,48 +200,65 @@ public void testTruncateStringMax() { String test7_1_expected = "\uD83D\uDE03"; Comparator cmp = Literal.of(test1).comparator(); - Assert.assertTrue("Truncated upper bound should be greater than or equal to the actual upper bound", + Assert.assertTrue( + "Truncated upper bound should be greater than or equal to the actual upper bound", cmp.compare(truncateStringMax(Literal.of(test1), 4).value(), test1) >= 0); - Assert.assertTrue("No truncation required as truncate length is equal to the 
input size", + Assert.assertTrue( + "No truncation required as truncate length is equal to the input size", cmp.compare(truncateStringMax(Literal.of(test1), 7).value(), test1) == 0); - Assert.assertTrue("Output must have two characters and the second character of the input must " + - "be incremented", cmp.compare( - truncateStringMax(Literal.of(test1), 2).value(), test1_2_expected) == 0); - Assert.assertTrue("Output must have three characters and the third character of the input must " + - "be incremented", cmp.compare( - truncateStringMax(Literal.of(test1), 3).value(), test1_3_expected) == 0); - Assert.assertTrue("No truncation required as truncate length is greater than the input size", + Assert.assertTrue( + "Output must have two characters and the second character of the input must " + + "be incremented", + cmp.compare(truncateStringMax(Literal.of(test1), 2).value(), test1_2_expected) == 0); + Assert.assertTrue( + "Output must have three characters and the third character of the input must " + + "be incremented", + cmp.compare(truncateStringMax(Literal.of(test1), 3).value(), test1_3_expected) == 0); + Assert.assertTrue( + "No truncation required as truncate length is greater than the input size", cmp.compare(truncateStringMax(Literal.of(test1), 8).value(), test1) == 0); - Assert.assertTrue("Truncated upper bound should be greater than or equal to the actual upper " + - "bound", cmp.compare(truncateStringMax(Literal.of(test2), 8).value(), test2) >= 0); - Assert.assertTrue("Output must have seven characters and the seventh character of the input " + - "must be incremented", cmp.compare( - truncateStringMax(Literal.of(test2), 7).value(), test2_7_expected) == 0); - Assert.assertTrue("Truncated upper bound should be greater than or equal to the actual upper " + - "bound", cmp.compare(truncateStringMax(Literal.of(test3), 3).value(), test3) >= 0); - Assert.assertTrue("Output must have three characters and the third character of the input must " + - "be incremented. The second perceivable character in this string is actually a glyph. It consists of " + - "two unicode characters", cmp.compare( - truncateStringMax(Literal.of(test3), 3).value(), test3_3_expected) == 0); - Assert.assertTrue("Truncated upper bound should be greater than or equal to the actual upper bound", + Assert.assertTrue( + "Truncated upper bound should be greater than or equal to the actual upper " + "bound", + cmp.compare(truncateStringMax(Literal.of(test2), 8).value(), test2) >= 0); + Assert.assertTrue( + "Output must have seven characters and the seventh character of the input " + + "must be incremented", + cmp.compare(truncateStringMax(Literal.of(test2), 7).value(), test2_7_expected) == 0); + Assert.assertTrue( + "Truncated upper bound should be greater than or equal to the actual upper " + "bound", + cmp.compare(truncateStringMax(Literal.of(test3), 3).value(), test3) >= 0); + Assert.assertTrue( + "Output must have three characters and the third character of the input must " + + "be incremented. The second perceivable character in this string is actually a glyph. It consists of " + + "two unicode characters", + cmp.compare(truncateStringMax(Literal.of(test3), 3).value(), test3_3_expected) == 0); + Assert.assertTrue( + "Truncated upper bound should be greater than or equal to the actual upper bound", cmp.compare(truncateStringMax(Literal.of(test4), 1).value(), test4) >= 0); - Assert.assertTrue("Output must have one character. 
Since the first character is the max 3 byte " + - "UTF-8 character, it should be incremented to the lowest 4 byte UTF-8 character", + Assert.assertTrue( + "Output must have one character. Since the first character is the max 3 byte " + + "UTF-8 character, it should be incremented to the lowest 4 byte UTF-8 character", cmp.compare(truncateStringMax(Literal.of(test4), 1).value(), test4_1_expected) == 0); - Assert.assertNull("An upper bound doesn't exist since the first two characters are max UTF-8 " + - "characters", truncateStringMax(Literal.of(test5), 1)); - Assert.assertTrue("Truncated upper bound should be greater than or equal to the actual upper bound", + Assert.assertNull( + "An upper bound doesn't exist since the first two characters are max UTF-8 " + "characters", + truncateStringMax(Literal.of(test5), 1)); + Assert.assertTrue( + "Truncated upper bound should be greater than or equal to the actual upper bound", cmp.compare(truncateStringMax(Literal.of(test6), 2).value(), test6) >= 0); - Assert.assertTrue("Test 4 byte UTF-8 character increment. Output must have one character with " + - "the first character incremented", cmp.compare( - truncateStringMax(Literal.of(test6), 1).value(), test6_2_expected) == 0); - Assert.assertTrue("Truncated upper bound should be greater than or equal to the actual upper bound", + Assert.assertTrue( + "Test 4 byte UTF-8 character increment. Output must have one character with " + + "the first character incremented", + cmp.compare(truncateStringMax(Literal.of(test6), 1).value(), test6_2_expected) == 0); + Assert.assertTrue( + "Truncated upper bound should be greater than or equal to the actual upper bound", cmp.compare(truncateStringMax(Literal.of(test7), 2).value(), test7) >= 0); - Assert.assertTrue("Test input with multiple 4 byte UTF-8 character where the second unicode " + - "character should be incremented", cmp.compare( - truncateStringMax(Literal.of(test7), 2).value(), test7_2_expected) == 0); - Assert.assertTrue("Test input with multiple 4 byte UTF-8 character where the first unicode " + - "character should be incremented", cmp.compare( - truncateStringMax(Literal.of(test7), 1).value(), test7_1_expected) == 0); + Assert.assertTrue( + "Test input with multiple 4 byte UTF-8 character where the second unicode " + + "character should be incremented", + cmp.compare(truncateStringMax(Literal.of(test7), 2).value(), test7_2_expected) == 0); + Assert.assertTrue( + "Test input with multiple 4 byte UTF-8 character where the first unicode " + + "character should be incremented", + cmp.compare(truncateStringMax(Literal.of(test7), 1).value(), test7_1_expected) == 0); } } diff --git a/core/src/test/java/org/apache/iceberg/TestMicroBatchBuilder.java b/core/src/test/java/org/apache/iceberg/TestMicroBatchBuilder.java index e30126a8bb06..b907a5031964 100644 --- a/core/src/test/java/org/apache/iceberg/TestMicroBatchBuilder.java +++ b/core/src/test/java/org/apache/iceberg/TestMicroBatchBuilder.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg; import java.util.Collections; @@ -34,7 +33,7 @@ public class TestMicroBatchBuilder extends TableTestBase { @Parameterized.Parameters(name = "formatVersion = {0}") public static Object[] parameters() { - return new Object[] { 1, 2 }; + return new Object[] {1, 2}; } public TestMicroBatchBuilder(int formatVersion) { @@ -50,9 +49,10 @@ public void setupTableProperties() { public void testGenerateMicroBatch() { add(table.newAppend(), files("A", "B", "C", "D", "E")); - MicroBatch batch = MicroBatches.from(table.snapshot(1L), table.io()) - .specsById(table.specs()) - .generate(0, Long.MAX_VALUE, true); + MicroBatch batch = + MicroBatches.from(table.snapshot(1L), table.io()) + .specsById(table.specs()) + .generate(0, Long.MAX_VALUE, true); Assert.assertEquals(batch.snapshotId(), 1L); Assert.assertEquals(batch.startFileIndex(), 0); Assert.assertEquals(batch.endFileIndex(), 5); @@ -60,25 +60,28 @@ public void testGenerateMicroBatch() { Assert.assertTrue(batch.lastIndexOfSnapshot()); filesMatch(Lists.newArrayList("A", "B", "C", "D", "E"), filesToScan(batch.tasks())); - MicroBatch batch1 = MicroBatches.from(table.snapshot(1L), table.io()) - .specsById(table.specs()) - .generate(0, 15L, true); + MicroBatch batch1 = + MicroBatches.from(table.snapshot(1L), table.io()) + .specsById(table.specs()) + .generate(0, 15L, true); Assert.assertEquals(batch1.endFileIndex(), 1); Assert.assertEquals(batch1.sizeInBytes(), 10); Assert.assertFalse(batch1.lastIndexOfSnapshot()); filesMatch(Lists.newArrayList("A"), filesToScan(batch1.tasks())); - MicroBatch batch2 = MicroBatches.from(table.snapshot(1L), table.io()) - .specsById(table.specs()) - .generate(batch1.endFileIndex(), 30L, true); + MicroBatch batch2 = + MicroBatches.from(table.snapshot(1L), table.io()) + .specsById(table.specs()) + .generate(batch1.endFileIndex(), 30L, true); Assert.assertEquals(batch2.endFileIndex(), 4); Assert.assertEquals(batch2.sizeInBytes(), 30); Assert.assertFalse(batch2.lastIndexOfSnapshot()); filesMatch(Lists.newArrayList("B", "C", "D"), filesToScan(batch2.tasks())); - MicroBatch batch3 = MicroBatches.from(table.snapshot(1L), table.io()) - .specsById(table.specs()) - .generate(batch2.endFileIndex(), 50L, true); + MicroBatch batch3 = + MicroBatches.from(table.snapshot(1L), table.io()) + .specsById(table.specs()) + .generate(batch2.endFileIndex(), 50L, true); Assert.assertEquals(batch3.endFileIndex(), 5); Assert.assertEquals(batch3.sizeInBytes(), 10); Assert.assertTrue(batch3.lastIndexOfSnapshot()); @@ -89,9 +92,10 @@ public void testGenerateMicroBatch() { public void testGenerateMicroBatchWithSmallTargetSize() { add(table.newAppend(), files("A", "B", "C", "D", "E")); - MicroBatch batch = MicroBatches.from(table.snapshot(1L), table.io()) - .specsById(table.specs()) - .generate(0, 10L, true); + MicroBatch batch = + MicroBatches.from(table.snapshot(1L), table.io()) + .specsById(table.specs()) + .generate(0, 10L, true); Assert.assertEquals(batch.snapshotId(), 1L); Assert.assertEquals(batch.startFileIndex(), 0); Assert.assertEquals(batch.endFileIndex(), 1); @@ -99,41 +103,46 @@ public void testGenerateMicroBatchWithSmallTargetSize() { Assert.assertFalse(batch.lastIndexOfSnapshot()); filesMatch(Lists.newArrayList("A"), filesToScan(batch.tasks())); - MicroBatch batch1 = MicroBatches.from(table.snapshot(1L), table.io()) - .specsById(table.specs()) - .generate(batch.endFileIndex(), 5L, true); + MicroBatch batch1 = + MicroBatches.from(table.snapshot(1L), table.io()) + .specsById(table.specs()) + 
.generate(batch.endFileIndex(), 5L, true); Assert.assertEquals(batch1.endFileIndex(), 2); Assert.assertEquals(batch1.sizeInBytes(), 10); filesMatch(Lists.newArrayList("B"), filesToScan(batch1.tasks())); Assert.assertFalse(batch1.lastIndexOfSnapshot()); - MicroBatch batch2 = MicroBatches.from(table.snapshot(1L), table.io()) - .specsById(table.specs()) - .generate(batch1.endFileIndex(), 10L, true); + MicroBatch batch2 = + MicroBatches.from(table.snapshot(1L), table.io()) + .specsById(table.specs()) + .generate(batch1.endFileIndex(), 10L, true); Assert.assertEquals(batch2.endFileIndex(), 3); Assert.assertEquals(batch2.sizeInBytes(), 10); filesMatch(Lists.newArrayList("C"), filesToScan(batch2.tasks())); Assert.assertFalse(batch2.lastIndexOfSnapshot()); - MicroBatch batch3 = MicroBatches.from(table.snapshot(1L), table.io()) - .specsById(table.specs()) - .generate(batch2.endFileIndex(), 10L, true); + MicroBatch batch3 = + MicroBatches.from(table.snapshot(1L), table.io()) + .specsById(table.specs()) + .generate(batch2.endFileIndex(), 10L, true); Assert.assertEquals(batch3.endFileIndex(), 4); Assert.assertEquals(batch3.sizeInBytes(), 10); filesMatch(Lists.newArrayList("D"), filesToScan(batch3.tasks())); Assert.assertFalse(batch3.lastIndexOfSnapshot()); - MicroBatch batch4 = MicroBatches.from(table.snapshot(1L), table.io()) - .specsById(table.specs()) - .generate(batch3.endFileIndex(), 5L, true); + MicroBatch batch4 = + MicroBatches.from(table.snapshot(1L), table.io()) + .specsById(table.specs()) + .generate(batch3.endFileIndex(), 5L, true); Assert.assertEquals(batch4.endFileIndex(), 5); Assert.assertEquals(batch4.sizeInBytes(), 10); filesMatch(Lists.newArrayList("E"), filesToScan(batch4.tasks())); Assert.assertTrue(batch4.lastIndexOfSnapshot()); - MicroBatch batch5 = MicroBatches.from(table.snapshot(1L), table.io()) - .specsById(table.specs()) - .generate(batch4.endFileIndex(), 5L, true); + MicroBatch batch5 = + MicroBatches.from(table.snapshot(1L), table.io()) + .specsById(table.specs()) + .generate(batch4.endFileIndex(), 5L, true); Assert.assertEquals(batch5.endFileIndex(), 5); Assert.assertEquals(batch5.sizeInBytes(), 0); Assert.assertTrue(Iterables.isEmpty(batch5.tasks())); @@ -168,10 +177,13 @@ private static List files(String... names) { } private static List filesToScan(Iterable tasks) { - Iterable filesToRead = Iterables.transform(tasks, t -> { - String path = t.file().path().toString(); - return path.split("\\.")[0]; - }); + Iterable filesToRead = + Iterables.transform( + tasks, + t -> { + String path = t.file().path().toString(); + return path.split("\\.")[0]; + }); return Lists.newArrayList(filesToRead); } diff --git a/core/src/test/java/org/apache/iceberg/TestOffsetsBasedSplitScanTaskIterator.java b/core/src/test/java/org/apache/iceberg/TestOffsetsBasedSplitScanTaskIterator.java index 9f3b5bea3b95..7ca14a185a05 100644 --- a/core/src/test/java/org/apache/iceberg/TestOffsetsBasedSplitScanTaskIterator.java +++ b/core/src/test/java/org/apache/iceberg/TestOffsetsBasedSplitScanTaskIterator.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg; import java.util.List; @@ -29,21 +28,39 @@ public class TestOffsetsBasedSplitScanTaskIterator { @Test public void testSplits() { // case when the last row group has more than one byte - verify(asList(4L, 10L, 15L, 18L, 30L, 45L), 48L, asList( - asList(4L, 6L), asList(10L, 5L), asList(15L, 3L), - asList(18L, 12L), asList(30L, 15L), asList(45L, 3L))); + verify( + asList(4L, 10L, 15L, 18L, 30L, 45L), + 48L, + asList( + asList(4L, 6L), + asList(10L, 5L), + asList(15L, 3L), + asList(18L, 12L), + asList(30L, 15L), + asList(45L, 3L))); // case when the last row group has one byte - verify(asList(4L, 10L, 15L, 18L, 30L, 47L), 48L, asList( - asList(4L, 6L), asList(10L, 5L), asList(15L, 3L), - asList(18L, 12L), asList(30L, 17L), asList(47L, 1L))); + verify( + asList(4L, 10L, 15L, 18L, 30L, 47L), + 48L, + asList( + asList(4L, 6L), + asList(10L, 5L), + asList(15L, 3L), + asList(18L, 12L), + asList(30L, 17L), + asList(47L, 1L))); } - private static void verify(List offsetRanges, long fileLen, List> offsetLenPairs) { + private static void verify( + List offsetRanges, long fileLen, List> offsetLenPairs) { FileScanTask mockFileScanTask = new MockFileScanTask(fileLen); - SplitScanTaskIterator splitTaskIterator = new OffsetsAwareSplitScanTaskIterator<>( - mockFileScanTask, mockFileScanTask.length(), - offsetRanges, TestOffsetsBasedSplitScanTaskIterator::createSplitTask); + SplitScanTaskIterator splitTaskIterator = + new OffsetsAwareSplitScanTaskIterator<>( + mockFileScanTask, + mockFileScanTask.length(), + offsetRanges, + TestOffsetsBasedSplitScanTaskIterator::createSplitTask); List tasks = Lists.newArrayList(splitTaskIterator); Assert.assertEquals("Number of tasks don't match", offsetLenPairs.size(), tasks.size()); diff --git a/core/src/test/java/org/apache/iceberg/TestOverwrite.java b/core/src/test/java/org/apache/iceberg/TestOverwrite.java index b21e78c47b8d..f7788fbe3221 100644 --- a/core/src/test/java/org/apache/iceberg/TestOverwrite.java +++ b/core/src/test/java/org/apache/iceberg/TestOverwrite.java @@ -16,9 +16,14 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg; +import static org.apache.iceberg.expressions.Expressions.and; +import static org.apache.iceberg.expressions.Expressions.equal; +import static org.apache.iceberg.expressions.Expressions.lessThan; +import static org.apache.iceberg.types.Types.NestedField.optional; +import static org.apache.iceberg.types.Types.NestedField.required; + import java.io.File; import java.io.IOException; import java.nio.ByteBuffer; @@ -33,71 +38,73 @@ import org.junit.runner.RunWith; import org.junit.runners.Parameterized; -import static org.apache.iceberg.expressions.Expressions.and; -import static org.apache.iceberg.expressions.Expressions.equal; -import static org.apache.iceberg.expressions.Expressions.lessThan; -import static org.apache.iceberg.types.Types.NestedField.optional; -import static org.apache.iceberg.types.Types.NestedField.required; - @RunWith(Parameterized.class) public class TestOverwrite extends TableTestBase { - private static final Schema DATE_SCHEMA = new Schema( - required(1, "id", Types.LongType.get()), - optional(2, "data", Types.StringType.get()), - required(3, "date", Types.StringType.get())); + private static final Schema DATE_SCHEMA = + new Schema( + required(1, "id", Types.LongType.get()), + optional(2, "data", Types.StringType.get()), + required(3, "date", Types.StringType.get())); - private static final PartitionSpec PARTITION_BY_DATE = PartitionSpec - .builderFor(DATE_SCHEMA) - .identity("date") - .build(); + private static final PartitionSpec PARTITION_BY_DATE = + PartitionSpec.builderFor(DATE_SCHEMA).identity("date").build(); private static final String TABLE_NAME = "overwrite_table"; - private static final DataFile FILE_0_TO_4 = DataFiles.builder(PARTITION_BY_DATE) - .withPath("/path/to/data-1.parquet") - .withFileSizeInBytes(0) - .withPartitionPath("date=2018-06-08") - .withMetrics(new Metrics(5L, - null, // no column sizes - ImmutableMap.of(1, 5L, 2, 3L), // value count - ImmutableMap.of(1, 0L, 2, 2L), // null count - null, - ImmutableMap.of(1, longToBuffer(0L)), // lower bounds - ImmutableMap.of(1, longToBuffer(4L)) // upper bounds - )) - .build(); - - private static final DataFile FILE_5_TO_9 = DataFiles.builder(PARTITION_BY_DATE) - .withPath("/path/to/data-2.parquet") - .withFileSizeInBytes(0) - .withPartitionPath("date=2018-06-09") - .withMetrics(new Metrics(5L, - null, // no column sizes - ImmutableMap.of(1, 5L, 2, 3L), // value count - ImmutableMap.of(1, 0L, 2, 2L), // null count - null, - ImmutableMap.of(1, longToBuffer(5L)), // lower bounds - ImmutableMap.of(1, longToBuffer(9L)) // upper bounds - )) - .build(); - - private static final DataFile FILE_10_TO_14 = DataFiles.builder(PARTITION_BY_DATE) - .withPath("/path/to/data-2.parquet") - .withFileSizeInBytes(0) - .withPartitionPath("date=2018-06-09") - .withMetrics(new Metrics(5L, - null, // no column sizes - ImmutableMap.of(1, 5L, 2, 3L), // value count - ImmutableMap.of(1, 0L, 2, 2L), // null count - null, - ImmutableMap.of(1, longToBuffer(5L)), // lower bounds - ImmutableMap.of(1, longToBuffer(9L)) // upper bounds - )) - .build(); + private static final DataFile FILE_0_TO_4 = + DataFiles.builder(PARTITION_BY_DATE) + .withPath("/path/to/data-1.parquet") + .withFileSizeInBytes(0) + .withPartitionPath("date=2018-06-08") + .withMetrics( + new Metrics( + 5L, + null, // no column sizes + ImmutableMap.of(1, 5L, 2, 3L), // value count + ImmutableMap.of(1, 0L, 2, 2L), // null count + null, + ImmutableMap.of(1, longToBuffer(0L)), // lower bounds + ImmutableMap.of(1, longToBuffer(4L)) // upper 
bounds + )) + .build(); + + private static final DataFile FILE_5_TO_9 = + DataFiles.builder(PARTITION_BY_DATE) + .withPath("/path/to/data-2.parquet") + .withFileSizeInBytes(0) + .withPartitionPath("date=2018-06-09") + .withMetrics( + new Metrics( + 5L, + null, // no column sizes + ImmutableMap.of(1, 5L, 2, 3L), // value count + ImmutableMap.of(1, 0L, 2, 2L), // null count + null, + ImmutableMap.of(1, longToBuffer(5L)), // lower bounds + ImmutableMap.of(1, longToBuffer(9L)) // upper bounds + )) + .build(); + + private static final DataFile FILE_10_TO_14 = + DataFiles.builder(PARTITION_BY_DATE) + .withPath("/path/to/data-2.parquet") + .withFileSizeInBytes(0) + .withPartitionPath("date=2018-06-09") + .withMetrics( + new Metrics( + 5L, + null, // no column sizes + ImmutableMap.of(1, 5L, 2, 3L), // value count + ImmutableMap.of(1, 0L, 2, 2L), // null count + null, + ImmutableMap.of(1, longToBuffer(5L)), // lower bounds + ImmutableMap.of(1, longToBuffer(9L)) // upper bounds + )) + .build(); @Parameterized.Parameters(name = "formatVersion = {0}") public static Object[] parameters() { - return new Object[] { 1, 2 }; + return new Object[] {1, 2}; } public TestOverwrite(int formatVersion) { @@ -115,12 +122,10 @@ public void createTestTable() throws IOException { File tableDir = temp.newFolder(); Assert.assertTrue(tableDir.delete()); - this.table = TestTables.create(tableDir, TABLE_NAME, DATE_SCHEMA, PARTITION_BY_DATE, formatVersion); + this.table = + TestTables.create(tableDir, TABLE_NAME, DATE_SCHEMA, PARTITION_BY_DATE, formatVersion); - table.newAppend() - .appendFile(FILE_0_TO_4) - .appendFile(FILE_5_TO_9) - .commit(); + table.newAppend().appendFile(FILE_0_TO_4).appendFile(FILE_5_TO_9).commit(); } @Test @@ -128,17 +133,18 @@ public void testOverwriteWithoutAppend() { TableMetadata base = TestTables.readMetadata(TABLE_NAME); long baseId = base.currentSnapshot().snapshotId(); - table.newOverwrite() - .overwriteByRowFilter(equal("date", "2018-06-08")) - .commit(); + table.newOverwrite().overwriteByRowFilter(equal("date", "2018-06-08")).commit(); long overwriteId = table.currentSnapshot().snapshotId(); Assert.assertNotEquals("Should create a new snapshot", baseId, overwriteId); - Assert.assertEquals("Table should have one manifest", - 1, table.currentSnapshot().allManifests(table.io()).size()); + Assert.assertEquals( + "Table should have one manifest", + 1, + table.currentSnapshot().allManifests(table.io()).size()); - validateManifestEntries(table.currentSnapshot().allManifests(table.io()).get(0), + validateManifestEntries( + table.currentSnapshot().allManifests(table.io()).get(0), ids(overwriteId, baseId), files(FILE_0_TO_4, FILE_5_TO_9), statuses(Status.DELETED, Status.EXISTING)); @@ -149,15 +155,19 @@ public void testOverwriteFailsDelete() { TableMetadata base = TestTables.readMetadata(TABLE_NAME); long baseId = base.currentSnapshot().snapshotId(); - OverwriteFiles overwrite = table.newOverwrite() - .overwriteByRowFilter(and(equal("date", "2018-06-09"), lessThan("id", 9))); + OverwriteFiles overwrite = + table + .newOverwrite() + .overwriteByRowFilter(and(equal("date", "2018-06-09"), lessThan("id", 9))); - AssertHelpers.assertThrows("Should reject commit with file not matching delete expression", - ValidationException.class, "Cannot delete file where some, but not all, rows match filter", + AssertHelpers.assertThrows( + "Should reject commit with file not matching delete expression", + ValidationException.class, + "Cannot delete file where some, but not all, rows match filter", overwrite::commit); 
- Assert.assertEquals("Should not create a new snapshot", - baseId, table.currentSnapshot().snapshotId()); + Assert.assertEquals( + "Should not create a new snapshot", baseId, table.currentSnapshot().snapshotId()); } @Test @@ -165,7 +175,8 @@ public void testOverwriteWithAppendOutsideOfDelete() { TableMetadata base = TestTables.readMetadata(TABLE_NAME); long baseId = base.currentSnapshot().snapshotId(); - table.newOverwrite() + table + .newOverwrite() .overwriteByRowFilter(equal("date", "2018-06-08")) .addFile(FILE_10_TO_14) // in 2018-06-09, NOT in 2018-06-08 .commit(); @@ -173,16 +184,20 @@ public void testOverwriteWithAppendOutsideOfDelete() { long overwriteId = table.currentSnapshot().snapshotId(); Assert.assertNotEquals("Should create a new snapshot", baseId, overwriteId); - Assert.assertEquals("Table should have 2 manifests", - 2, table.currentSnapshot().allManifests(table.io()).size()); + Assert.assertEquals( + "Table should have 2 manifests", + 2, + table.currentSnapshot().allManifests(table.io()).size()); // manifest is not merged because it is less than the minimum - validateManifestEntries(table.currentSnapshot().allManifests(table.io()).get(0), + validateManifestEntries( + table.currentSnapshot().allManifests(table.io()).get(0), ids(overwriteId), files(FILE_10_TO_14), statuses(Status.ADDED)); - validateManifestEntries(table.currentSnapshot().allManifests(table.io()).get(1), + validateManifestEntries( + table.currentSnapshot().allManifests(table.io()).get(1), ids(overwriteId, baseId), files(FILE_0_TO_4, FILE_5_TO_9), statuses(Status.DELETED, Status.EXISTING)); @@ -196,7 +211,8 @@ public void testOverwriteWithMergedAppendOutsideOfDelete() { TableMetadata base = TestTables.readMetadata(TABLE_NAME); long baseId = base.currentSnapshot().snapshotId(); - table.newOverwrite() + table + .newOverwrite() .overwriteByRowFilter(equal("date", "2018-06-08")) .addFile(FILE_10_TO_14) // in 2018-06-09, NOT in 2018-06-08 .commit(); @@ -204,10 +220,13 @@ public void testOverwriteWithMergedAppendOutsideOfDelete() { long overwriteId = table.currentSnapshot().snapshotId(); Assert.assertNotEquals("Should create a new snapshot", baseId, overwriteId); - Assert.assertEquals("Table should have one merged manifest", - 1, table.currentSnapshot().allManifests(table.io()).size()); + Assert.assertEquals( + "Table should have one merged manifest", + 1, + table.currentSnapshot().allManifests(table.io()).size()); - validateManifestEntries(table.currentSnapshot().allManifests(table.io()).get(0), + validateManifestEntries( + table.currentSnapshot().allManifests(table.io()).get(0), ids(overwriteId, overwriteId, baseId), files(FILE_10_TO_14, FILE_0_TO_4, FILE_5_TO_9), statuses(Status.ADDED, Status.DELETED, Status.EXISTING)); @@ -221,17 +240,21 @@ public void testValidatedOverwriteWithAppendOutsideOfDelete() { TableMetadata base = TestTables.readMetadata(TABLE_NAME); long baseId = base.currentSnapshot().snapshotId(); - OverwriteFiles overwrite = table.newOverwrite() - .overwriteByRowFilter(equal("date", "2018-06-08")) - .addFile(FILE_10_TO_14) // in 2018-06-09, NOT in 2018-06-08 - .validateAddedFilesMatchOverwriteFilter(); - - AssertHelpers.assertThrows("Should reject commit with file not matching delete expression", - ValidationException.class, "Cannot append file with rows that do not match filter", + OverwriteFiles overwrite = + table + .newOverwrite() + .overwriteByRowFilter(equal("date", "2018-06-08")) + .addFile(FILE_10_TO_14) // in 2018-06-09, NOT in 2018-06-08 + .validateAddedFilesMatchOverwriteFilter(); + + 
AssertHelpers.assertThrows( + "Should reject commit with file not matching delete expression", + ValidationException.class, + "Cannot append file with rows that do not match filter", overwrite::commit); - Assert.assertEquals("Should not create a new snapshot", - baseId, table.currentSnapshot().snapshotId()); + Assert.assertEquals( + "Should not create a new snapshot", baseId, table.currentSnapshot().snapshotId()); } @Test @@ -239,17 +262,21 @@ public void testValidatedOverwriteWithAppendOutsideOfDeleteMetrics() { TableMetadata base = TestTables.readMetadata(TABLE_NAME); long baseId = base.currentSnapshot().snapshotId(); - OverwriteFiles overwrite = table.newOverwrite() - .overwriteByRowFilter(and(equal("date", "2018-06-09"), lessThan("id", 10))) - .addFile(FILE_10_TO_14) // in 2018-06-09 matches, but IDs are outside range - .validateAddedFilesMatchOverwriteFilter(); - - AssertHelpers.assertThrows("Should reject commit with file not matching delete expression", - ValidationException.class, "Cannot append file with rows that do not match filter", + OverwriteFiles overwrite = + table + .newOverwrite() + .overwriteByRowFilter(and(equal("date", "2018-06-09"), lessThan("id", 10))) + .addFile(FILE_10_TO_14) // in 2018-06-09 matches, but IDs are outside range + .validateAddedFilesMatchOverwriteFilter(); + + AssertHelpers.assertThrows( + "Should reject commit with file not matching delete expression", + ValidationException.class, + "Cannot append file with rows that do not match filter", overwrite::commit); - Assert.assertEquals("Should not create a new snapshot", - baseId, table.currentSnapshot().snapshotId()); + Assert.assertEquals( + "Should not create a new snapshot", baseId, table.currentSnapshot().snapshotId()); } @Test @@ -257,16 +284,20 @@ public void testValidatedOverwriteWithAppendSuccess() { TableMetadata base = TestTables.readMetadata(TABLE_NAME); long baseId = base.currentSnapshot().snapshotId(); - OverwriteFiles overwrite = table.newOverwrite() - .overwriteByRowFilter(and(equal("date", "2018-06-09"), lessThan("id", 20))) - .addFile(FILE_10_TO_14) // in 2018-06-09 matches and IDs are inside range - .validateAddedFilesMatchOverwriteFilter(); - - AssertHelpers.assertThrows("Should reject commit with file not matching delete expression", - ValidationException.class, "Cannot append file with rows that do not match filter", + OverwriteFiles overwrite = + table + .newOverwrite() + .overwriteByRowFilter(and(equal("date", "2018-06-09"), lessThan("id", 20))) + .addFile(FILE_10_TO_14) // in 2018-06-09 matches and IDs are inside range + .validateAddedFilesMatchOverwriteFilter(); + + AssertHelpers.assertThrows( + "Should reject commit with file not matching delete expression", + ValidationException.class, + "Cannot append file with rows that do not match filter", overwrite::commit); - Assert.assertEquals("Should not create a new snapshot", - baseId, table.currentSnapshot().snapshotId()); + Assert.assertEquals( + "Should not create a new snapshot", baseId, table.currentSnapshot().snapshotId()); } } diff --git a/core/src/test/java/org/apache/iceberg/TestOverwriteWithValidation.java b/core/src/test/java/org/apache/iceberg/TestOverwriteWithValidation.java index 899d4b6169d5..acc11b6d8e82 100644 --- a/core/src/test/java/org/apache/iceberg/TestOverwriteWithValidation.java +++ b/core/src/test/java/org/apache/iceberg/TestOverwriteWithValidation.java @@ -16,9 +16,16 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg; +import static org.apache.iceberg.expressions.Expressions.alwaysTrue; +import static org.apache.iceberg.expressions.Expressions.and; +import static org.apache.iceberg.expressions.Expressions.equal; +import static org.apache.iceberg.expressions.Expressions.greaterThanOrEqual; +import static org.apache.iceberg.expressions.Expressions.lessThanOrEqual; +import static org.apache.iceberg.types.Types.NestedField.optional; +import static org.apache.iceberg.types.Types.NestedField.required; + import java.io.File; import java.io.IOException; import java.nio.ByteBuffer; @@ -35,140 +42,144 @@ import org.junit.runner.RunWith; import org.junit.runners.Parameterized; -import static org.apache.iceberg.expressions.Expressions.alwaysTrue; -import static org.apache.iceberg.expressions.Expressions.and; -import static org.apache.iceberg.expressions.Expressions.equal; -import static org.apache.iceberg.expressions.Expressions.greaterThanOrEqual; -import static org.apache.iceberg.expressions.Expressions.lessThanOrEqual; -import static org.apache.iceberg.types.Types.NestedField.optional; -import static org.apache.iceberg.types.Types.NestedField.required; - @RunWith(Parameterized.class) public class TestOverwriteWithValidation extends TableTestBase { private static final String TABLE_NAME = "overwrite_table"; - private static final Schema DATE_SCHEMA = new Schema( - required(1, "id", Types.LongType.get()), - optional(2, "data", Types.StringType.get()), - required(3, "date", Types.StringType.get())); - - private static final PartitionSpec PARTITION_SPEC = PartitionSpec - .builderFor(DATE_SCHEMA) - .identity("date") - .build(); - - private static final DataFile FILE_DAY_1 = DataFiles - .builder(PARTITION_SPEC) - .withPath("/path/to/data-1.parquet") - .withFileSizeInBytes(0) - .withPartitionPath("date=2018-06-08") - .withMetrics(new Metrics(5L, - null, // no column sizes - ImmutableMap.of(1, 5L, 2, 3L), // value count - ImmutableMap.of(1, 0L, 2, 2L), // null count - null, - ImmutableMap.of(1, longToBuffer(0L)), // lower bounds - ImmutableMap.of(1, longToBuffer(4L)) // upper bounds - )) - .build(); - - private static final DeleteFile FILE_DAY_1_POS_DELETES = FileMetadata.deleteFileBuilder(PARTITION_SPEC) - .ofPositionDeletes() - .withPath("/path/to/data-1-deletes.parquet") - .withFileSizeInBytes(10) - .withPartitionPath("date=2018-06-08") - .withRecordCount(1) - .build(); - - private static final DataFile FILE_DAY_2 = DataFiles - .builder(PARTITION_SPEC) - .withPath("/path/to/data-2.parquet") - .withFileSizeInBytes(0) - .withPartitionPath("date=2018-06-09") - .withMetrics(new Metrics(5L, - null, // no column sizes - ImmutableMap.of(1, 5L, 2, 3L), // value count - ImmutableMap.of(1, 0L, 2, 2L), // null count - null, - ImmutableMap.of(1, longToBuffer(5L)), // lower bounds - ImmutableMap.of(1, longToBuffer(9L)) // upper bounds - )) - .build(); - - private static final DeleteFile FILE_DAY_2_EQ_DELETES = FileMetadata.deleteFileBuilder(PARTITION_SPEC) - .ofEqualityDeletes() - .withPath("/path/to/data-2-eq-deletes.parquet") - .withFileSizeInBytes(10) - .withPartitionPath("date=2018-06-09") - .withRecordCount(1) - .build(); - - private static final DeleteFile FILE_DAY_2_POS_DELETES = FileMetadata.deleteFileBuilder(PARTITION_SPEC) - .ofPositionDeletes() - .withPath("/path/to/data-2-deletes.parquet") - .withFileSizeInBytes(10) - .withPartitionPath("date=2018-06-09") - .withRecordCount(1) - .build(); - - private static final DataFile FILE_DAY_2_MODIFIED = DataFiles - .builder(PARTITION_SPEC) - 
.withPath("/path/to/data-3.parquet") - .withFileSizeInBytes(0) - .withPartitionPath("date=2018-06-09") - .withMetrics(new Metrics(5L, - null, // no column sizes - ImmutableMap.of(1, 5L, 2, 3L), // value count - ImmutableMap.of(1, 0L, 2, 2L), // null count - null, - ImmutableMap.of(1, longToBuffer(5L)), // lower bounds - ImmutableMap.of(1, longToBuffer(9L)) // upper bounds - )) - .build(); - - private static final DataFile FILE_DAY_2_ANOTHER_RANGE = DataFiles - .builder(PARTITION_SPEC) - .withPath("/path/to/data-3.parquet") - .withFileSizeInBytes(0) - .withPartitionPath("date=2018-06-09") - .withMetrics(new Metrics(5L, - null, // no column sizes - ImmutableMap.of(1, 5L, 2, 3L), // value count - ImmutableMap.of(1, 0L, 2, 2L), // null count - null, - ImmutableMap.of(1, longToBuffer(10L)), // lower bounds - ImmutableMap.of(1, longToBuffer(14L)) // upper bounds - )) - .build(); - - private static final DeleteFile FILE_DAY_2_ANOTHER_RANGE_EQ_DELETES = FileMetadata.deleteFileBuilder(PARTITION_SPEC) - .ofEqualityDeletes() - .withPath("/path/to/data-3-eq-deletes.parquet") - .withFileSizeInBytes(10) - .withPartitionPath("date=2018-06-09") - .withRecordCount(1) - .withMetrics(new Metrics(1L, - null, // no column sizes - ImmutableMap.of(1, 1L, 2, 1L), // value count - ImmutableMap.of(1, 0L, 2, 0L), // null count - null, - ImmutableMap.of(1, longToBuffer(10L)), // lower bounds - ImmutableMap.of(1, longToBuffer(10L)) // upper bounds - )) - .build(); + private static final Schema DATE_SCHEMA = + new Schema( + required(1, "id", Types.LongType.get()), + optional(2, "data", Types.StringType.get()), + required(3, "date", Types.StringType.get())); + + private static final PartitionSpec PARTITION_SPEC = + PartitionSpec.builderFor(DATE_SCHEMA).identity("date").build(); + + private static final DataFile FILE_DAY_1 = + DataFiles.builder(PARTITION_SPEC) + .withPath("/path/to/data-1.parquet") + .withFileSizeInBytes(0) + .withPartitionPath("date=2018-06-08") + .withMetrics( + new Metrics( + 5L, + null, // no column sizes + ImmutableMap.of(1, 5L, 2, 3L), // value count + ImmutableMap.of(1, 0L, 2, 2L), // null count + null, + ImmutableMap.of(1, longToBuffer(0L)), // lower bounds + ImmutableMap.of(1, longToBuffer(4L)) // upper bounds + )) + .build(); + + private static final DeleteFile FILE_DAY_1_POS_DELETES = + FileMetadata.deleteFileBuilder(PARTITION_SPEC) + .ofPositionDeletes() + .withPath("/path/to/data-1-deletes.parquet") + .withFileSizeInBytes(10) + .withPartitionPath("date=2018-06-08") + .withRecordCount(1) + .build(); + + private static final DataFile FILE_DAY_2 = + DataFiles.builder(PARTITION_SPEC) + .withPath("/path/to/data-2.parquet") + .withFileSizeInBytes(0) + .withPartitionPath("date=2018-06-09") + .withMetrics( + new Metrics( + 5L, + null, // no column sizes + ImmutableMap.of(1, 5L, 2, 3L), // value count + ImmutableMap.of(1, 0L, 2, 2L), // null count + null, + ImmutableMap.of(1, longToBuffer(5L)), // lower bounds + ImmutableMap.of(1, longToBuffer(9L)) // upper bounds + )) + .build(); + + private static final DeleteFile FILE_DAY_2_EQ_DELETES = + FileMetadata.deleteFileBuilder(PARTITION_SPEC) + .ofEqualityDeletes() + .withPath("/path/to/data-2-eq-deletes.parquet") + .withFileSizeInBytes(10) + .withPartitionPath("date=2018-06-09") + .withRecordCount(1) + .build(); + + private static final DeleteFile FILE_DAY_2_POS_DELETES = + FileMetadata.deleteFileBuilder(PARTITION_SPEC) + .ofPositionDeletes() + .withPath("/path/to/data-2-deletes.parquet") + .withFileSizeInBytes(10) + .withPartitionPath("date=2018-06-09") 
+ .withRecordCount(1) + .build(); + + private static final DataFile FILE_DAY_2_MODIFIED = + DataFiles.builder(PARTITION_SPEC) + .withPath("/path/to/data-3.parquet") + .withFileSizeInBytes(0) + .withPartitionPath("date=2018-06-09") + .withMetrics( + new Metrics( + 5L, + null, // no column sizes + ImmutableMap.of(1, 5L, 2, 3L), // value count + ImmutableMap.of(1, 0L, 2, 2L), // null count + null, + ImmutableMap.of(1, longToBuffer(5L)), // lower bounds + ImmutableMap.of(1, longToBuffer(9L)) // upper bounds + )) + .build(); + + private static final DataFile FILE_DAY_2_ANOTHER_RANGE = + DataFiles.builder(PARTITION_SPEC) + .withPath("/path/to/data-3.parquet") + .withFileSizeInBytes(0) + .withPartitionPath("date=2018-06-09") + .withMetrics( + new Metrics( + 5L, + null, // no column sizes + ImmutableMap.of(1, 5L, 2, 3L), // value count + ImmutableMap.of(1, 0L, 2, 2L), // null count + null, + ImmutableMap.of(1, longToBuffer(10L)), // lower bounds + ImmutableMap.of(1, longToBuffer(14L)) // upper bounds + )) + .build(); + + private static final DeleteFile FILE_DAY_2_ANOTHER_RANGE_EQ_DELETES = + FileMetadata.deleteFileBuilder(PARTITION_SPEC) + .ofEqualityDeletes() + .withPath("/path/to/data-3-eq-deletes.parquet") + .withFileSizeInBytes(10) + .withPartitionPath("date=2018-06-09") + .withRecordCount(1) + .withMetrics( + new Metrics( + 1L, + null, // no column sizes + ImmutableMap.of(1, 1L, 2, 1L), // value count + ImmutableMap.of(1, 0L, 2, 0L), // null count + null, + ImmutableMap.of(1, longToBuffer(10L)), // lower bounds + ImmutableMap.of(1, longToBuffer(10L)) // upper bounds + )) + .build(); private static final Expression EXPRESSION_DAY_2 = equal("date", "2018-06-09"); - private static final Expression EXPRESSION_DAY_2_ID_RANGE = and( - greaterThanOrEqual("id", 5L), - lessThanOrEqual("id", 9L)); + private static final Expression EXPRESSION_DAY_2_ID_RANGE = + and(greaterThanOrEqual("id", 5L), lessThanOrEqual("id", 9L)); private static final Expression EXPRESSION_DAY_2_ANOTHER_ID_RANGE = greaterThanOrEqual("id", 10L); @Parameterized.Parameters(name = "formatVersion = {0}") public static Object[] parameters() { - return new Object[] { 1, 2 }; + return new Object[] {1, 2}; } public TestOverwriteWithValidation(int formatVersion) { @@ -185,16 +196,15 @@ private static ByteBuffer longToBuffer(long value) { public void before() throws IOException { File tableDir = temp.newFolder(); Assert.assertTrue(tableDir.delete()); - this.table = TestTables.create(tableDir, TABLE_NAME, DATE_SCHEMA, PARTITION_SPEC, formatVersion); + this.table = + TestTables.create(tableDir, TABLE_NAME, DATE_SCHEMA, PARTITION_SPEC, formatVersion); } @Test public void testOverwriteEmptyTableNotValidated() { Assert.assertNull("Should be empty table", table.currentSnapshot()); - table.newOverwrite() - .addFile(FILE_DAY_2_MODIFIED) - .commit(); + table.newOverwrite().addFile(FILE_DAY_2_MODIFIED).commit(); validateTableFiles(table, FILE_DAY_2_MODIFIED); } @@ -203,7 +213,8 @@ public void testOverwriteEmptyTableNotValidated() { public void testOverwriteEmptyTableStrictValidated() { Assert.assertNull("Should be empty table", table.currentSnapshot()); - table.newOverwrite() + table + .newOverwrite() .addFile(FILE_DAY_2_MODIFIED) .conflictDetectionFilter(alwaysTrue()) .validateNoConflictingData() @@ -216,7 +227,8 @@ public void testOverwriteEmptyTableStrictValidated() { public void testOverwriteEmptyTableValidated() { Assert.assertNull("Should be empty table", table.currentSnapshot()); - table.newOverwrite() + table + .newOverwrite() 
.addFile(FILE_DAY_2_MODIFIED) .conflictDetectionFilter(EXPRESSION_DAY_2) .validateNoConflictingData() @@ -227,33 +239,25 @@ public void testOverwriteEmptyTableValidated() { @Test public void testOverwriteTableNotValidated() { - table.newAppend() - .appendFile(FILE_DAY_1) - .appendFile(FILE_DAY_2) - .commit(); + table.newAppend().appendFile(FILE_DAY_1).appendFile(FILE_DAY_2).commit(); Snapshot baseSnapshot = table.currentSnapshot(); validateSnapshot(null, baseSnapshot, FILE_DAY_1, FILE_DAY_2); - table.newOverwrite() - .deleteFile(FILE_DAY_2) - .addFile(FILE_DAY_2_MODIFIED) - .commit(); + table.newOverwrite().deleteFile(FILE_DAY_2).addFile(FILE_DAY_2_MODIFIED).commit(); validateTableFiles(table, FILE_DAY_1, FILE_DAY_2_MODIFIED); } @Test public void testOverwriteTableStrictValidated() { - table.newAppend() - .appendFile(FILE_DAY_1) - .appendFile(FILE_DAY_2) - .commit(); + table.newAppend().appendFile(FILE_DAY_1).appendFile(FILE_DAY_2).commit(); Snapshot baseSnapshot = table.currentSnapshot(); validateSnapshot(null, baseSnapshot, FILE_DAY_1, FILE_DAY_2); - table.newOverwrite() + table + .newOverwrite() .deleteFile(FILE_DAY_2) .addFile(FILE_DAY_2_MODIFIED) .validateFromSnapshot(baseSnapshot.snapshotId()) @@ -266,15 +270,13 @@ public void testOverwriteTableStrictValidated() { @Test public void testOverwriteTableValidated() { - table.newAppend() - .appendFile(FILE_DAY_1) - .appendFile(FILE_DAY_2) - .commit(); + table.newAppend().appendFile(FILE_DAY_1).appendFile(FILE_DAY_2).commit(); Snapshot baseSnapshot = table.currentSnapshot(); validateSnapshot(null, baseSnapshot, FILE_DAY_1, FILE_DAY_2); - table.newOverwrite() + table + .newOverwrite() .deleteFile(FILE_DAY_2) .addFile(FILE_DAY_2_MODIFIED) .validateFromSnapshot(baseSnapshot.snapshotId()) @@ -287,19 +289,14 @@ public void testOverwriteTableValidated() { @Test public void testOverwriteCompatibleAdditionNotValidated() { - table.newAppend() - .appendFile(FILE_DAY_2) - .commit(); + table.newAppend().appendFile(FILE_DAY_2).commit(); validateSnapshot(null, table.currentSnapshot(), FILE_DAY_2); - OverwriteFiles overwrite = table.newOverwrite() - .deleteFile(FILE_DAY_2) - .addFile(FILE_DAY_2_MODIFIED); + OverwriteFiles overwrite = + table.newOverwrite().deleteFile(FILE_DAY_2).addFile(FILE_DAY_2_MODIFIED); - table.newAppend() - .appendFile(FILE_DAY_1) - .commit(); + table.newAppend().appendFile(FILE_DAY_1).commit(); overwrite.commit(); @@ -308,52 +305,52 @@ public void testOverwriteCompatibleAdditionNotValidated() { @Test public void testOverwriteCompatibleAdditionStrictValidated() { - table.newAppend() - .appendFile(FILE_DAY_2) - .commit(); + table.newAppend().appendFile(FILE_DAY_2).commit(); Snapshot baseSnapshot = table.currentSnapshot(); validateSnapshot(null, baseSnapshot, FILE_DAY_2); - OverwriteFiles overwrite = table.newOverwrite() - .deleteFile(FILE_DAY_2) - .addFile(FILE_DAY_2_MODIFIED) - .validateFromSnapshot(baseSnapshot.snapshotId()) - .conflictDetectionFilter(alwaysTrue()) - .validateNoConflictingData(); + OverwriteFiles overwrite = + table + .newOverwrite() + .deleteFile(FILE_DAY_2) + .addFile(FILE_DAY_2_MODIFIED) + .validateFromSnapshot(baseSnapshot.snapshotId()) + .conflictDetectionFilter(alwaysTrue()) + .validateNoConflictingData(); - table.newAppend() - .appendFile(FILE_DAY_1) - .commit(); + table.newAppend().appendFile(FILE_DAY_1).commit(); long committedSnapshotId = table.currentSnapshot().snapshotId(); - AssertHelpers.assertThrows("Should reject commit", - ValidationException.class, "Found conflicting files", + 
AssertHelpers.assertThrows( + "Should reject commit", + ValidationException.class, + "Found conflicting files", overwrite::commit); - Assert.assertEquals("Should not create a new snapshot", - committedSnapshotId, table.currentSnapshot().snapshotId()); + Assert.assertEquals( + "Should not create a new snapshot", + committedSnapshotId, + table.currentSnapshot().snapshotId()); } @Test public void testOverwriteCompatibleAdditionValidated() { - table.newAppend() - .appendFile(FILE_DAY_2) - .commit(); + table.newAppend().appendFile(FILE_DAY_2).commit(); Snapshot baseSnapshot = table.currentSnapshot(); validateSnapshot(null, baseSnapshot, FILE_DAY_2); - OverwriteFiles overwrite = table.newOverwrite() - .deleteFile(FILE_DAY_2) - .addFile(FILE_DAY_2_MODIFIED) - .validateFromSnapshot(baseSnapshot.snapshotId()) - .conflictDetectionFilter(EXPRESSION_DAY_2) - .validateNoConflictingData(); + OverwriteFiles overwrite = + table + .newOverwrite() + .deleteFile(FILE_DAY_2) + .addFile(FILE_DAY_2_MODIFIED) + .validateFromSnapshot(baseSnapshot.snapshotId()) + .conflictDetectionFilter(EXPRESSION_DAY_2) + .validateNoConflictingData(); - table.newAppend() - .appendFile(FILE_DAY_1) - .commit(); + table.newAppend().appendFile(FILE_DAY_1).commit(); overwrite.commit(); @@ -362,24 +359,21 @@ public void testOverwriteCompatibleAdditionValidated() { @Test public void testOverwriteCompatibleDeletionValidated() { - table.newAppend() - .appendFile(FILE_DAY_1) - .appendFile(FILE_DAY_2) - .commit(); + table.newAppend().appendFile(FILE_DAY_1).appendFile(FILE_DAY_2).commit(); Snapshot baseSnapshot = table.currentSnapshot(); validateSnapshot(null, baseSnapshot, FILE_DAY_1, FILE_DAY_2); - OverwriteFiles overwrite = table.newOverwrite() - .deleteFile(FILE_DAY_2) - .addFile(FILE_DAY_2_MODIFIED) - .validateFromSnapshot(baseSnapshot.snapshotId()) - .conflictDetectionFilter(EXPRESSION_DAY_2) - .validateNoConflictingData(); + OverwriteFiles overwrite = + table + .newOverwrite() + .deleteFile(FILE_DAY_2) + .addFile(FILE_DAY_2_MODIFIED) + .validateFromSnapshot(baseSnapshot.snapshotId()) + .conflictDetectionFilter(EXPRESSION_DAY_2) + .validateNoConflictingData(); - table.newDelete() - .deleteFile(FILE_DAY_1) - .commit(); + table.newDelete().deleteFile(FILE_DAY_1).commit(); overwrite.commit(); @@ -388,112 +382,112 @@ public void testOverwriteCompatibleDeletionValidated() { @Test public void testOverwriteIncompatibleAdditionValidated() { - table.newAppend() - .appendFile(FILE_DAY_1) - .commit(); + table.newAppend().appendFile(FILE_DAY_1).commit(); Snapshot baseSnapshot = table.currentSnapshot(); validateSnapshot(null, baseSnapshot, FILE_DAY_1); - OverwriteFiles overwrite = table.newOverwrite() - .addFile(FILE_DAY_2_MODIFIED) - .validateFromSnapshot(baseSnapshot.snapshotId()) - .conflictDetectionFilter(EXPRESSION_DAY_2) - .validateNoConflictingData(); + OverwriteFiles overwrite = + table + .newOverwrite() + .addFile(FILE_DAY_2_MODIFIED) + .validateFromSnapshot(baseSnapshot.snapshotId()) + .conflictDetectionFilter(EXPRESSION_DAY_2) + .validateNoConflictingData(); - table.newAppend() - .appendFile(FILE_DAY_2) - .commit(); + table.newAppend().appendFile(FILE_DAY_2).commit(); long committedSnapshotId = table.currentSnapshot().snapshotId(); - AssertHelpers.assertThrows("Should reject commit", - ValidationException.class, "Found conflicting files", + AssertHelpers.assertThrows( + "Should reject commit", + ValidationException.class, + "Found conflicting files", overwrite::commit); - Assert.assertEquals("Should not create a new snapshot", - 
committedSnapshotId, table.currentSnapshot().snapshotId()); + Assert.assertEquals( + "Should not create a new snapshot", + committedSnapshotId, + table.currentSnapshot().snapshotId()); } @Test public void testOverwriteIncompatibleDeletionValidated() { - table.newAppend() - .appendFile(FILE_DAY_1) - .appendFile(FILE_DAY_2) - .commit(); + table.newAppend().appendFile(FILE_DAY_1).appendFile(FILE_DAY_2).commit(); Snapshot baseSnapshot = table.currentSnapshot(); validateSnapshot(null, baseSnapshot, FILE_DAY_1, FILE_DAY_2); - OverwriteFiles overwrite = table.newOverwrite() - .deleteFile(FILE_DAY_2) - .addFile(FILE_DAY_2_MODIFIED) - .validateFromSnapshot(baseSnapshot.snapshotId()) - .conflictDetectionFilter(EXPRESSION_DAY_2) - .validateNoConflictingData(); + OverwriteFiles overwrite = + table + .newOverwrite() + .deleteFile(FILE_DAY_2) + .addFile(FILE_DAY_2_MODIFIED) + .validateFromSnapshot(baseSnapshot.snapshotId()) + .conflictDetectionFilter(EXPRESSION_DAY_2) + .validateNoConflictingData(); - table.newDelete() - .deleteFile(FILE_DAY_2) - .commit(); + table.newDelete().deleteFile(FILE_DAY_2).commit(); long committedSnapshotId = table.currentSnapshot().snapshotId(); - AssertHelpers.assertThrows("Should reject commit", - ValidationException.class, "Missing required files to delete:", + AssertHelpers.assertThrows( + "Should reject commit", + ValidationException.class, + "Missing required files to delete:", overwrite::commit); - Assert.assertEquals("Should not create a new snapshot", - committedSnapshotId, table.currentSnapshot().snapshotId()); + Assert.assertEquals( + "Should not create a new snapshot", + committedSnapshotId, + table.currentSnapshot().snapshotId()); } @Test public void testOverwriteCompatibleRewriteAllowed() { - table.newAppend() - .appendFile(FILE_DAY_1) - .appendFile(FILE_DAY_2) - .commit(); + table.newAppend().appendFile(FILE_DAY_1).appendFile(FILE_DAY_2).commit(); Snapshot baseSnapshot = table.currentSnapshot(); validateSnapshot(null, baseSnapshot, FILE_DAY_1, FILE_DAY_2); - OverwriteFiles overwrite = table.newOverwrite() - .deleteFile(FILE_DAY_2) - .addFile(FILE_DAY_2_MODIFIED) - .validateFromSnapshot(baseSnapshot.snapshotId()) - .conflictDetectionFilter(EXPRESSION_DAY_2) - .validateNoConflictingData(); + OverwriteFiles overwrite = + table + .newOverwrite() + .deleteFile(FILE_DAY_2) + .addFile(FILE_DAY_2_MODIFIED) + .validateFromSnapshot(baseSnapshot.snapshotId()) + .conflictDetectionFilter(EXPRESSION_DAY_2) + .validateNoConflictingData(); - table.newRewrite() + table + .newRewrite() .rewriteFiles(ImmutableSet.of(FILE_DAY_2), ImmutableSet.of(FILE_DAY_2)) .commit(); long committedSnapshotId = table.currentSnapshot().snapshotId(); overwrite.commit(); - Assert.assertNotEquals("Should successfully commit", committedSnapshotId, table.currentSnapshot().snapshotId()); + Assert.assertNotEquals( + "Should successfully commit", committedSnapshotId, table.currentSnapshot().snapshotId()); } @Test public void testOverwriteCompatibleExpirationAdditionValidated() { - table.newAppend() - .appendFile(FILE_DAY_2) - .commit(); // id 1 + table.newAppend().appendFile(FILE_DAY_2).commit(); // id 1 Snapshot baseSnapshot = table.currentSnapshot(); validateSnapshot(null, baseSnapshot, FILE_DAY_2); - OverwriteFiles overwrite = table.newOverwrite() - .deleteFile(FILE_DAY_2) - .addFile(FILE_DAY_2_MODIFIED) - .validateFromSnapshot(baseSnapshot.snapshotId()) - .conflictDetectionFilter(EXPRESSION_DAY_2) - .validateNoConflictingData(); + OverwriteFiles overwrite = + table + .newOverwrite() + 
.deleteFile(FILE_DAY_2) + .addFile(FILE_DAY_2_MODIFIED) + .validateFromSnapshot(baseSnapshot.snapshotId()) + .conflictDetectionFilter(EXPRESSION_DAY_2) + .validateNoConflictingData(); - table.newAppend() - .appendFile(FILE_DAY_1) - .commit(); // id 2 + table.newAppend().appendFile(FILE_DAY_1).commit(); // id 2 - table.expireSnapshots() - .expireSnapshotId(1L) - .commit(); + table.expireSnapshots().expireSnapshotId(1L).commit(); overwrite.commit(); @@ -502,28 +496,23 @@ public void testOverwriteCompatibleExpirationAdditionValidated() { @Test public void testOverwriteCompatibleExpirationDeletionValidated() { - table.newAppend() - .appendFile(FILE_DAY_1) - .appendFile(FILE_DAY_2) - .commit(); // id 1 + table.newAppend().appendFile(FILE_DAY_1).appendFile(FILE_DAY_2).commit(); // id 1 Snapshot baseSnapshot = table.currentSnapshot(); validateSnapshot(null, baseSnapshot, FILE_DAY_1, FILE_DAY_2); - OverwriteFiles overwrite = table.newOverwrite() - .deleteFile(FILE_DAY_2) - .addFile(FILE_DAY_2_MODIFIED) - .validateFromSnapshot(baseSnapshot.snapshotId()) - .conflictDetectionFilter(EXPRESSION_DAY_2) - .validateNoConflictingData(); + OverwriteFiles overwrite = + table + .newOverwrite() + .deleteFile(FILE_DAY_2) + .addFile(FILE_DAY_2_MODIFIED) + .validateFromSnapshot(baseSnapshot.snapshotId()) + .conflictDetectionFilter(EXPRESSION_DAY_2) + .validateNoConflictingData(); - table.newDelete() - .deleteFile(FILE_DAY_1) - .commit(); // id 2 + table.newDelete().deleteFile(FILE_DAY_1).commit(); // id 2 - table.expireSnapshots() - .expireSnapshotId(1L) - .commit(); + table.expireSnapshots().expireSnapshotId(1L).commit(); overwrite.commit(); @@ -532,80 +521,78 @@ public void testOverwriteCompatibleExpirationDeletionValidated() { @Test public void testOverwriteIncompatibleExpirationValidated() { - table.newAppend() - .appendFile(FILE_DAY_1) - .commit(); // id 1 + table.newAppend().appendFile(FILE_DAY_1).commit(); // id 1 Snapshot baseSnapshot = table.currentSnapshot(); - OverwriteFiles overwrite = table.newOverwrite() - .addFile(FILE_DAY_2_MODIFIED) - .validateFromSnapshot(baseSnapshot.snapshotId()) - .conflictDetectionFilter(EXPRESSION_DAY_2) - .validateNoConflictingData(); + OverwriteFiles overwrite = + table + .newOverwrite() + .addFile(FILE_DAY_2_MODIFIED) + .validateFromSnapshot(baseSnapshot.snapshotId()) + .conflictDetectionFilter(EXPRESSION_DAY_2) + .validateNoConflictingData(); - table.newAppend() - .appendFile(FILE_DAY_2) - .commit(); // id 2 + table.newAppend().appendFile(FILE_DAY_2).commit(); // id 2 - table.newDelete() - .deleteFile(FILE_DAY_1) - .commit(); // id 3 + table.newDelete().deleteFile(FILE_DAY_1).commit(); // id 3 - table.expireSnapshots() - .expireSnapshotId(2L) - .commit(); + table.expireSnapshots().expireSnapshotId(2L).commit(); long committedSnapshotId = table.currentSnapshot().snapshotId(); - AssertHelpers.assertThrows("Should reject commit", - ValidationException.class, "Cannot determine history", + AssertHelpers.assertThrows( + "Should reject commit", + ValidationException.class, + "Cannot determine history", overwrite::commit); - Assert.assertEquals("Should not create a new snapshot", - committedSnapshotId, table.currentSnapshot().snapshotId()); + Assert.assertEquals( + "Should not create a new snapshot", + committedSnapshotId, + table.currentSnapshot().snapshotId()); } @Test public void testOverwriteIncompatibleBaseExpirationEmptyTableValidated() { Assert.assertNull("Should be empty table", table.currentSnapshot()); - OverwriteFiles overwrite = table.newOverwrite() - 
.addFile(FILE_DAY_2_MODIFIED) - .conflictDetectionFilter(EXPRESSION_DAY_2) - .validateNoConflictingData(); + OverwriteFiles overwrite = + table + .newOverwrite() + .addFile(FILE_DAY_2_MODIFIED) + .conflictDetectionFilter(EXPRESSION_DAY_2) + .validateNoConflictingData(); - table.newAppend() - .appendFile(FILE_DAY_2) - .commit(); // id 1 + table.newAppend().appendFile(FILE_DAY_2).commit(); // id 1 - table.newDelete() - .deleteFile(FILE_DAY_1) - .commit(); // id 2 + table.newDelete().deleteFile(FILE_DAY_1).commit(); // id 2 - table.expireSnapshots() - .expireSnapshotId(1L) - .commit(); + table.expireSnapshots().expireSnapshotId(1L).commit(); long committedSnapshotId = table.currentSnapshot().snapshotId(); - AssertHelpers.assertThrows("Should reject commit", - ValidationException.class, "Cannot determine history", + AssertHelpers.assertThrows( + "Should reject commit", + ValidationException.class, + "Cannot determine history", overwrite::commit); - Assert.assertEquals("Should not create a new snapshot", - committedSnapshotId, table.currentSnapshot().snapshotId()); + Assert.assertEquals( + "Should not create a new snapshot", + committedSnapshotId, + table.currentSnapshot().snapshotId()); } @Test public void testOverwriteAnotherRangeValidated() { Assert.assertNull("Should be empty table", table.currentSnapshot()); - OverwriteFiles overwrite = table.newOverwrite() - .addFile(FILE_DAY_2_MODIFIED) - .conflictDetectionFilter(EXPRESSION_DAY_2_ID_RANGE) - .validateNoConflictingData(); + OverwriteFiles overwrite = + table + .newOverwrite() + .addFile(FILE_DAY_2_MODIFIED) + .conflictDetectionFilter(EXPRESSION_DAY_2_ID_RANGE) + .validateNoConflictingData(); - table.newAppend() - .appendFile(FILE_DAY_1) - .commit(); + table.newAppend().appendFile(FILE_DAY_1).commit(); overwrite.commit(); @@ -617,14 +604,14 @@ public void testOverwriteAnotherRangeWithinPartitionValidated() { Assert.assertNull("Should be empty table", table.currentSnapshot()); Expression conflictDetectionFilter = and(EXPRESSION_DAY_2, EXPRESSION_DAY_2_ID_RANGE); - OverwriteFiles overwrite = table.newOverwrite() - .addFile(FILE_DAY_2_MODIFIED) - .conflictDetectionFilter(conflictDetectionFilter) - .validateNoConflictingData(); + OverwriteFiles overwrite = + table + .newOverwrite() + .addFile(FILE_DAY_2_MODIFIED) + .conflictDetectionFilter(conflictDetectionFilter) + .validateNoConflictingData(); - table.newAppend() - .appendFile(FILE_DAY_2_ANOTHER_RANGE) - .commit(); + table.newAppend().appendFile(FILE_DAY_2_ANOTHER_RANGE).commit(); overwrite.commit(); @@ -635,23 +622,20 @@ public void testOverwriteAnotherRangeWithinPartitionValidated() { public void testTransactionCompatibleAdditionValidated() { Assert.assertNull("Should be empty table", table.currentSnapshot()); - table.newAppend() - .appendFile(FILE_DAY_2) - .commit(); + table.newAppend().appendFile(FILE_DAY_2).commit(); Snapshot baseSnapshot = table.currentSnapshot(); Transaction txn = table.newTransaction(); - OverwriteFiles overwrite = txn.newOverwrite() - .deleteFile(FILE_DAY_2) - .addFile(FILE_DAY_2_MODIFIED) - .validateFromSnapshot(baseSnapshot.snapshotId()) - .conflictDetectionFilter(EXPRESSION_DAY_2) - .validateNoConflictingData(); + OverwriteFiles overwrite = + txn.newOverwrite() + .deleteFile(FILE_DAY_2) + .addFile(FILE_DAY_2_MODIFIED) + .validateFromSnapshot(baseSnapshot.snapshotId()) + .conflictDetectionFilter(EXPRESSION_DAY_2) + .validateNoConflictingData(); - table.newAppend() - .appendFile(FILE_DAY_1) - .commit(); + table.newAppend().appendFile(FILE_DAY_1).commit(); 
overwrite.commit(); txn.commitTransaction(); @@ -665,28 +649,29 @@ public void testTransactionIncompatibleAdditionValidated() { Transaction txn = table.newTransaction(); - txn.newAppend() - .appendFile(FILE_DAY_1) - .commit(); + txn.newAppend().appendFile(FILE_DAY_1).commit(); - OverwriteFiles overwrite = txn.newOverwrite() - .addFile(FILE_DAY_2_MODIFIED) - .conflictDetectionFilter(EXPRESSION_DAY_2) - .validateNoConflictingData(); + OverwriteFiles overwrite = + txn.newOverwrite() + .addFile(FILE_DAY_2_MODIFIED) + .conflictDetectionFilter(EXPRESSION_DAY_2) + .validateNoConflictingData(); - table.newAppend() - .appendFile(FILE_DAY_2) - .commit(); + table.newAppend().appendFile(FILE_DAY_2).commit(); long committedSnapshotId = table.currentSnapshot().snapshotId(); overwrite.commit(); - AssertHelpers.assertThrows("Should reject commit", - ValidationException.class, "Found conflicting files", + AssertHelpers.assertThrows( + "Should reject commit", + ValidationException.class, + "Found conflicting files", txn::commitTransaction); - Assert.assertEquals("Should not create a new snapshot", - committedSnapshotId, table.currentSnapshot().snapshotId()); + Assert.assertEquals( + "Should not create a new snapshot", + committedSnapshotId, + table.currentSnapshot().snapshotId()); } @Test @@ -695,28 +680,24 @@ public void testConcurrentConflictingPositionDeletes() { Assert.assertNull("Should be empty table", table.currentSnapshot()); - table.newAppend() - .appendFile(FILE_DAY_1) - .appendFile(FILE_DAY_2) - .commit(); + table.newAppend().appendFile(FILE_DAY_1).appendFile(FILE_DAY_2).commit(); Snapshot firstSnapshot = table.currentSnapshot(); - OverwriteFiles overwrite = table.newOverwrite() - .deleteFile(FILE_DAY_2) - .addFile(FILE_DAY_2_MODIFIED) - .validateFromSnapshot(firstSnapshot.snapshotId()) - .conflictDetectionFilter(EXPRESSION_DAY_2) - .validateNoConflictingData() - .validateNoConflictingDeletes(); + OverwriteFiles overwrite = + table + .newOverwrite() + .deleteFile(FILE_DAY_2) + .addFile(FILE_DAY_2_MODIFIED) + .validateFromSnapshot(firstSnapshot.snapshotId()) + .conflictDetectionFilter(EXPRESSION_DAY_2) + .validateNoConflictingData() + .validateNoConflictingDeletes(); - table.newRowDelta() - .addDeletes(FILE_DAY_2_POS_DELETES) - .commit(); + table.newRowDelta().addDeletes(FILE_DAY_2_POS_DELETES).commit(); - AssertHelpers.assertThrows("Should reject commit", - ValidationException.class, "found new delete", - overwrite::commit); + AssertHelpers.assertThrows( + "Should reject commit", ValidationException.class, "found new delete", overwrite::commit); } @Test @@ -725,27 +706,26 @@ public void testConcurrentConflictingPositionDeletesOverwriteByFilter() { Assert.assertNull("Should be empty table", table.currentSnapshot()); - table.newAppend() - .appendFile(FILE_DAY_1) - .appendFile(FILE_DAY_2) - .commit(); + table.newAppend().appendFile(FILE_DAY_1).appendFile(FILE_DAY_2).commit(); Snapshot firstSnapshot = table.currentSnapshot(); - OverwriteFiles overwrite = table.newOverwrite() - .overwriteByRowFilter(EXPRESSION_DAY_2) - .addFile(FILE_DAY_2_MODIFIED) - .validateFromSnapshot(firstSnapshot.snapshotId()) - .conflictDetectionFilter(EXPRESSION_DAY_2) - .validateNoConflictingData() - .validateNoConflictingDeletes(); + OverwriteFiles overwrite = + table + .newOverwrite() + .overwriteByRowFilter(EXPRESSION_DAY_2) + .addFile(FILE_DAY_2_MODIFIED) + .validateFromSnapshot(firstSnapshot.snapshotId()) + .conflictDetectionFilter(EXPRESSION_DAY_2) + .validateNoConflictingData() + .validateNoConflictingDeletes(); - 
table.newRowDelta() - .addDeletes(FILE_DAY_2_POS_DELETES) - .commit(); + table.newRowDelta().addDeletes(FILE_DAY_2_POS_DELETES).commit(); - AssertHelpers.assertThrows("Should reject commit", - ValidationException.class, "Found new conflicting delete", + AssertHelpers.assertThrows( + "Should reject commit", + ValidationException.class, + "Found new conflicting delete", overwrite::commit); } @@ -753,26 +733,25 @@ public void testConcurrentConflictingPositionDeletesOverwriteByFilter() { public void testConcurrentConflictingDataFileDeleteOverwriteByFilter() { Assert.assertNull("Should be empty table", table.currentSnapshot()); - table.newAppend() - .appendFile(FILE_DAY_1) - .appendFile(FILE_DAY_2) - .commit(); + table.newAppend().appendFile(FILE_DAY_1).appendFile(FILE_DAY_2).commit(); Snapshot firstSnapshot = table.currentSnapshot(); - OverwriteFiles overwrite = table.newOverwrite() - .overwriteByRowFilter(EXPRESSION_DAY_2) - .addFile(FILE_DAY_2_MODIFIED) - .validateFromSnapshot(firstSnapshot.snapshotId()) - .validateNoConflictingData() - .validateNoConflictingDeletes(); + OverwriteFiles overwrite = + table + .newOverwrite() + .overwriteByRowFilter(EXPRESSION_DAY_2) + .addFile(FILE_DAY_2_MODIFIED) + .validateFromSnapshot(firstSnapshot.snapshotId()) + .validateNoConflictingData() + .validateNoConflictingDeletes(); - table.newOverwrite() - .deleteFile(FILE_DAY_2) - .commit(); + table.newOverwrite().deleteFile(FILE_DAY_2).commit(); - AssertHelpers.assertThrows("Should reject commit", - ValidationException.class, "Found conflicting deleted files", + AssertHelpers.assertThrows( + "Should reject commit", + ValidationException.class, + "Found conflicting deleted files", overwrite::commit); } @@ -780,23 +759,20 @@ public void testConcurrentConflictingDataFileDeleteOverwriteByFilter() { public void testConcurrentNonConflictingDataFileDeleteOverwriteByFilter() { Assert.assertNull("Should be empty table", table.currentSnapshot()); - table.newAppend() - .appendFile(FILE_DAY_1) - .appendFile(FILE_DAY_2) - .commit(); + table.newAppend().appendFile(FILE_DAY_1).appendFile(FILE_DAY_2).commit(); Snapshot firstSnapshot = table.currentSnapshot(); - OverwriteFiles overwrite = table.newOverwrite() - .overwriteByRowFilter(EXPRESSION_DAY_2) - .addFile(FILE_DAY_2_MODIFIED) - .validateFromSnapshot(firstSnapshot.snapshotId()) - .validateNoConflictingData() - .validateNoConflictingDeletes(); + OverwriteFiles overwrite = + table + .newOverwrite() + .overwriteByRowFilter(EXPRESSION_DAY_2) + .addFile(FILE_DAY_2_MODIFIED) + .validateFromSnapshot(firstSnapshot.snapshotId()) + .validateNoConflictingData() + .validateNoConflictingDeletes(); - table.newOverwrite() - .deleteFile(FILE_DAY_1) - .commit(); + table.newOverwrite().deleteFile(FILE_DAY_1).commit(); overwrite.commit(); @@ -809,24 +785,21 @@ public void testConcurrentNonConflictingPositionDeletes() { Assert.assertNull("Should be empty table", table.currentSnapshot()); - table.newAppend() - .appendFile(FILE_DAY_1) - .appendFile(FILE_DAY_2) - .commit(); + table.newAppend().appendFile(FILE_DAY_1).appendFile(FILE_DAY_2).commit(); Snapshot firstSnapshot = table.currentSnapshot(); - OverwriteFiles overwrite = table.newOverwrite() - .deleteFile(FILE_DAY_2) - .addFile(FILE_DAY_2_MODIFIED) - .validateFromSnapshot(firstSnapshot.snapshotId()) - .conflictDetectionFilter(EXPRESSION_DAY_2) - .validateNoConflictingData() - .validateNoConflictingDeletes(); + OverwriteFiles overwrite = + table + .newOverwrite() + .deleteFile(FILE_DAY_2) + .addFile(FILE_DAY_2_MODIFIED) + 
.validateFromSnapshot(firstSnapshot.snapshotId()) + .conflictDetectionFilter(EXPRESSION_DAY_2) + .validateNoConflictingData() + .validateNoConflictingDeletes(); - table.newRowDelta() - .addDeletes(FILE_DAY_1_POS_DELETES) - .commit(); + table.newRowDelta().addDeletes(FILE_DAY_1_POS_DELETES).commit(); overwrite.commit(); @@ -840,24 +813,21 @@ public void testConcurrentNonConflictingPositionDeletesOverwriteByFilter() { Assert.assertNull("Should be empty table", table.currentSnapshot()); - table.newAppend() - .appendFile(FILE_DAY_1) - .appendFile(FILE_DAY_2) - .commit(); + table.newAppend().appendFile(FILE_DAY_1).appendFile(FILE_DAY_2).commit(); Snapshot firstSnapshot = table.currentSnapshot(); - OverwriteFiles overwrite = table.newOverwrite() - .overwriteByRowFilter(EXPRESSION_DAY_2) - .addFile(FILE_DAY_2_MODIFIED) - .validateFromSnapshot(firstSnapshot.snapshotId()) - .conflictDetectionFilter(EXPRESSION_DAY_2) - .validateNoConflictingData() - .validateNoConflictingDeletes(); + OverwriteFiles overwrite = + table + .newOverwrite() + .overwriteByRowFilter(EXPRESSION_DAY_2) + .addFile(FILE_DAY_2_MODIFIED) + .validateFromSnapshot(firstSnapshot.snapshotId()) + .conflictDetectionFilter(EXPRESSION_DAY_2) + .validateNoConflictingData() + .validateNoConflictingDeletes(); - table.newRowDelta() - .addDeletes(FILE_DAY_1_POS_DELETES) - .commit(); + table.newRowDelta().addDeletes(FILE_DAY_1_POS_DELETES).commit(); overwrite.commit(); @@ -871,28 +841,24 @@ public void testConcurrentConflictingEqualityDeletes() { Assert.assertNull("Should be empty table", table.currentSnapshot()); - table.newAppend() - .appendFile(FILE_DAY_1) - .appendFile(FILE_DAY_2) - .commit(); + table.newAppend().appendFile(FILE_DAY_1).appendFile(FILE_DAY_2).commit(); Snapshot firstSnapshot = table.currentSnapshot(); - OverwriteFiles overwrite = table.newOverwrite() - .deleteFile(FILE_DAY_2) - .addFile(FILE_DAY_2_MODIFIED) - .validateFromSnapshot(firstSnapshot.snapshotId()) - .conflictDetectionFilter(EXPRESSION_DAY_2) - .validateNoConflictingData() - .validateNoConflictingDeletes(); + OverwriteFiles overwrite = + table + .newOverwrite() + .deleteFile(FILE_DAY_2) + .addFile(FILE_DAY_2_MODIFIED) + .validateFromSnapshot(firstSnapshot.snapshotId()) + .conflictDetectionFilter(EXPRESSION_DAY_2) + .validateNoConflictingData() + .validateNoConflictingDeletes(); - table.newRowDelta() - .addDeletes(FILE_DAY_2_EQ_DELETES) - .commit(); + table.newRowDelta().addDeletes(FILE_DAY_2_EQ_DELETES).commit(); - AssertHelpers.assertThrows("Should reject commit", - ValidationException.class, "found new delete", - overwrite::commit); + AssertHelpers.assertThrows( + "Should reject commit", ValidationException.class, "found new delete", overwrite::commit); } @Test @@ -901,24 +867,21 @@ public void testConcurrentNonConflictingEqualityDeletes() { Assert.assertNull("Should be empty table", table.currentSnapshot()); - table.newAppend() - .appendFile(FILE_DAY_2) - .appendFile(FILE_DAY_2_ANOTHER_RANGE) - .commit(); + table.newAppend().appendFile(FILE_DAY_2).appendFile(FILE_DAY_2_ANOTHER_RANGE).commit(); Snapshot firstSnapshot = table.currentSnapshot(); - OverwriteFiles overwrite = table.newOverwrite() - .deleteFile(FILE_DAY_2) - .addFile(FILE_DAY_2_MODIFIED) - .validateFromSnapshot(firstSnapshot.snapshotId()) - .conflictDetectionFilter(EXPRESSION_DAY_2_ID_RANGE) - .validateNoConflictingData() - .validateNoConflictingDeletes(); + OverwriteFiles overwrite = + table + .newOverwrite() + .deleteFile(FILE_DAY_2) + .addFile(FILE_DAY_2_MODIFIED) + 
.validateFromSnapshot(firstSnapshot.snapshotId()) + .conflictDetectionFilter(EXPRESSION_DAY_2_ID_RANGE) + .validateNoConflictingData() + .validateNoConflictingDeletes(); - table.newRowDelta() - .addDeletes(FILE_DAY_2_ANOTHER_RANGE_EQ_DELETES) - .commit(); + table.newRowDelta().addDeletes(FILE_DAY_2_ANOTHER_RANGE_EQ_DELETES).commit(); overwrite.commit(); @@ -932,24 +895,21 @@ public void testOverwriteByFilterInheritsConflictDetectionFilter() { Assert.assertNull("Should be empty table", table.currentSnapshot()); - table.newAppend() - .appendFile(FILE_DAY_1) - .appendFile(FILE_DAY_2) - .commit(); + table.newAppend().appendFile(FILE_DAY_1).appendFile(FILE_DAY_2).commit(); Snapshot firstSnapshot = table.currentSnapshot(); - OverwriteFiles overwrite = table.newOverwrite() - .overwriteByRowFilter(EXPRESSION_DAY_2) - .validateAddedFilesMatchOverwriteFilter() - .addFile(FILE_DAY_2_MODIFIED) - .validateFromSnapshot(firstSnapshot.snapshotId()) - .validateNoConflictingData() - .validateNoConflictingDeletes(); + OverwriteFiles overwrite = + table + .newOverwrite() + .overwriteByRowFilter(EXPRESSION_DAY_2) + .validateAddedFilesMatchOverwriteFilter() + .addFile(FILE_DAY_2_MODIFIED) + .validateFromSnapshot(firstSnapshot.snapshotId()) + .validateNoConflictingData() + .validateNoConflictingDeletes(); - table.newRowDelta() - .addDeletes(FILE_DAY_1_POS_DELETES) - .commit(); + table.newRowDelta().addDeletes(FILE_DAY_1_POS_DELETES).commit(); overwrite.commit(); @@ -959,43 +919,50 @@ public void testOverwriteByFilterInheritsConflictDetectionFilter() { @Test public void testOverwriteCaseSensitivity() { - table.newAppend() - .appendFile(FILE_DAY_1) - .appendFile(FILE_DAY_2) - .commit(); + table.newAppend().appendFile(FILE_DAY_1).appendFile(FILE_DAY_2).commit(); - table.newAppend() - .appendFile(FILE_DAY_1) - .commit(); + table.newAppend().appendFile(FILE_DAY_1).commit(); Expression rowFilter = equal("dAtE", "2018-06-09"); - AssertHelpers.assertThrows("Should use case sensitive binding by default", - ValidationException.class, "Cannot find field 'dAtE'", - () -> table.newOverwrite() - .addFile(FILE_DAY_2_MODIFIED) - .conflictDetectionFilter(rowFilter) - .validateNoConflictingData() - .commit()); - - AssertHelpers.assertThrows("Should fail with case sensitive binding", - ValidationException.class, "Cannot find field 'dAtE'", - () -> table.newOverwrite() - .caseSensitive(true) - .addFile(FILE_DAY_2_MODIFIED) - .conflictDetectionFilter(rowFilter) - .validateNoConflictingData() - .commit()); + AssertHelpers.assertThrows( + "Should use case sensitive binding by default", + ValidationException.class, + "Cannot find field 'dAtE'", + () -> + table + .newOverwrite() + .addFile(FILE_DAY_2_MODIFIED) + .conflictDetectionFilter(rowFilter) + .validateNoConflictingData() + .commit()); + + AssertHelpers.assertThrows( + "Should fail with case sensitive binding", + ValidationException.class, + "Cannot find field 'dAtE'", + () -> + table + .newOverwrite() + .caseSensitive(true) + .addFile(FILE_DAY_2_MODIFIED) + .conflictDetectionFilter(rowFilter) + .validateNoConflictingData() + .commit()); // binding should succeed and trigger the validation - AssertHelpers.assertThrows("Should trigger the validation", - ValidationException.class, "Found conflicting files", - () -> table.newOverwrite() - .caseSensitive(false) - .addFile(FILE_DAY_2_MODIFIED) - .conflictDetectionFilter(rowFilter) - .validateNoConflictingData() - .commit()); + AssertHelpers.assertThrows( + "Should trigger the validation", + ValidationException.class, + "Found 
conflicting files", + () -> + table + .newOverwrite() + .caseSensitive(false) + .addFile(FILE_DAY_2_MODIFIED) + .conflictDetectionFilter(rowFilter) + .validateNoConflictingData() + .commit()); } @Test @@ -1004,17 +971,16 @@ public void testMetadataOnlyDeleteWithPositionDeletes() { Assert.assertNull("Should be empty table", table.currentSnapshot()); - table.newAppend() - .appendFile(FILE_DAY_2) - .appendFile(FILE_DAY_2_ANOTHER_RANGE) - .commit(); + table.newAppend().appendFile(FILE_DAY_2).appendFile(FILE_DAY_2_ANOTHER_RANGE).commit(); - table.newRowDelta() + table + .newRowDelta() .addDeletes(FILE_DAY_2_POS_DELETES) .addDeletes(FILE_DAY_2_ANOTHER_RANGE_EQ_DELETES) .commit(); - table.newOverwrite() + table + .newOverwrite() .overwriteByRowFilter(EXPRESSION_DAY_2_ANOTHER_ID_RANGE) .addFile(FILE_DAY_2_MODIFIED) .commit(); diff --git a/core/src/test/java/org/apache/iceberg/TestPartitionSpecInfo.java b/core/src/test/java/org/apache/iceberg/TestPartitionSpecInfo.java index 322ae508128b..b0dc2f551a8e 100644 --- a/core/src/test/java/org/apache/iceberg/TestPartitionSpecInfo.java +++ b/core/src/test/java/org/apache/iceberg/TestPartitionSpecInfo.java @@ -16,9 +16,10 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg; +import static org.apache.iceberg.types.Types.NestedField.required; + import java.io.File; import java.io.IOException; import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap; @@ -32,21 +33,18 @@ import org.junit.runner.RunWith; import org.junit.runners.Parameterized; -import static org.apache.iceberg.types.Types.NestedField.required; - @RunWith(Parameterized.class) public class TestPartitionSpecInfo { - @Rule - public TemporaryFolder temp = new TemporaryFolder(); - private final Schema schema = new Schema( - required(1, "id", Types.IntegerType.get()), - required(2, "data", Types.StringType.get())); + @Rule public TemporaryFolder temp = new TemporaryFolder(); + private final Schema schema = + new Schema( + required(1, "id", Types.IntegerType.get()), required(2, "data", Types.StringType.get())); private File tableDir = null; @Parameterized.Parameters(name = "formatVersion = {0}") public static Object[] parameters() { - return new Object[] { 1, 2 }; + return new Object[] {1, 2}; } private final int formatVersion; @@ -89,24 +87,21 @@ public void testSpecInfoPartitionedTable() { @Test public void testSpecInfoPartitionSpecEvolutionForV1Table() { - PartitionSpec spec = PartitionSpec.builderFor(schema) - .bucket("data", 4) - .build(); + PartitionSpec spec = PartitionSpec.builderFor(schema).bucket("data", 4).build(); TestTables.TestTable table = TestTables.create(tableDir, "test", schema, spec, formatVersion); Assert.assertEquals(spec, table.spec()); TableMetadata base = TestTables.readMetadata("test"); - PartitionSpec newSpec = PartitionSpec.builderFor(table.schema()) - .bucket("data", 10) - .withSpecId(1) - .build(); + PartitionSpec newSpec = + PartitionSpec.builderFor(table.schema()).bucket("data", 10).withSpecId(1).build(); table.ops().commit(base, base.updatePartitionSpec(newSpec)); Assert.assertEquals(newSpec, table.spec()); Assert.assertEquals(newSpec, table.specs().get(newSpec.specId())); Assert.assertEquals(spec, table.specs().get(spec.specId())); - Assert.assertEquals(ImmutableMap.of(spec.specId(), spec, newSpec.specId(), newSpec), table.specs()); + Assert.assertEquals( + ImmutableMap.of(spec.specId(), spec, newSpec.specId(), newSpec), table.specs()); 
Assert.assertNull(table.specs().get(Integer.MAX_VALUE)); } } diff --git a/core/src/test/java/org/apache/iceberg/TestPartitionSpecParser.java b/core/src/test/java/org/apache/iceberg/TestPartitionSpecParser.java index 847ff4283ab1..1d88e97f9925 100644 --- a/core/src/test/java/org/apache/iceberg/TestPartitionSpecParser.java +++ b/core/src/test/java/org/apache/iceberg/TestPartitionSpecParser.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg; import org.junit.Assert; @@ -29,57 +28,58 @@ public TestPartitionSpecParser() { @Test public void testToJsonForV1Table() { - String expected = "{\n" + - " \"spec-id\" : 0,\n" + - " \"fields\" : [ {\n" + - " \"name\" : \"data_bucket\",\n" + - " \"transform\" : \"bucket[16]\",\n" + - " \"source-id\" : 2,\n" + - " \"field-id\" : 1000\n" + - " } ]\n" + - "}"; + String expected = + "{\n" + + " \"spec-id\" : 0,\n" + + " \"fields\" : [ {\n" + + " \"name\" : \"data_bucket\",\n" + + " \"transform\" : \"bucket[16]\",\n" + + " \"source-id\" : 2,\n" + + " \"field-id\" : 1000\n" + + " } ]\n" + + "}"; Assert.assertEquals(expected, PartitionSpecParser.toJson(table.spec(), true)); - PartitionSpec spec = PartitionSpec.builderFor(table.schema()) - .bucket("id", 8) - .bucket("data", 16) - .build(); + PartitionSpec spec = + PartitionSpec.builderFor(table.schema()).bucket("id", 8).bucket("data", 16).build(); table.ops().commit(table.ops().current(), table.ops().current().updatePartitionSpec(spec)); - expected = "{\n" + - " \"spec-id\" : 1,\n" + - " \"fields\" : [ {\n" + - " \"name\" : \"id_bucket\",\n" + - " \"transform\" : \"bucket[8]\",\n" + - " \"source-id\" : 1,\n" + - " \"field-id\" : 1000\n" + - " }, {\n" + - " \"name\" : \"data_bucket\",\n" + - " \"transform\" : \"bucket[16]\",\n" + - " \"source-id\" : 2,\n" + - " \"field-id\" : 1001\n" + - " } ]\n" + - "}"; + expected = + "{\n" + + " \"spec-id\" : 1,\n" + + " \"fields\" : [ {\n" + + " \"name\" : \"id_bucket\",\n" + + " \"transform\" : \"bucket[8]\",\n" + + " \"source-id\" : 1,\n" + + " \"field-id\" : 1000\n" + + " }, {\n" + + " \"name\" : \"data_bucket\",\n" + + " \"transform\" : \"bucket[16]\",\n" + + " \"source-id\" : 2,\n" + + " \"field-id\" : 1001\n" + + " } ]\n" + + "}"; Assert.assertEquals(expected, PartitionSpecParser.toJson(table.spec(), true)); } @Test public void testFromJsonWithFieldId() { - String specString = "{\n" + - " \"spec-id\" : 1,\n" + - " \"fields\" : [ {\n" + - " \"name\" : \"id_bucket\",\n" + - " \"transform\" : \"bucket[8]\",\n" + - " \"source-id\" : 1,\n" + - " \"field-id\" : 1001\n" + - " }, {\n" + - " \"name\" : \"data_bucket\",\n" + - " \"transform\" : \"bucket[16]\",\n" + - " \"source-id\" : 2,\n" + - " \"field-id\" : 1000\n" + - " } ]\n" + - "}"; + String specString = + "{\n" + + " \"spec-id\" : 1,\n" + + " \"fields\" : [ {\n" + + " \"name\" : \"id_bucket\",\n" + + " \"transform\" : \"bucket[8]\",\n" + + " \"source-id\" : 1,\n" + + " \"field-id\" : 1001\n" + + " }, {\n" + + " \"name\" : \"data_bucket\",\n" + + " \"transform\" : \"bucket[16]\",\n" + + " \"source-id\" : 2,\n" + + " \"field-id\" : 1000\n" + + " } ]\n" + + "}"; PartitionSpec spec = PartitionSpecParser.fromJson(table.schema(), specString); @@ -91,18 +91,19 @@ public void testFromJsonWithFieldId() { @Test public void testFromJsonWithoutFieldId() { - String specString = "{\n" + - " \"spec-id\" : 1,\n" + - " \"fields\" : [ {\n" + - " \"name\" : \"id_bucket\",\n" + - " \"transform\" : \"bucket[8]\",\n" + - " \"source-id\" : 1\n" + - " }, {\n" + - " 
\"name\" : \"data_bucket\",\n" + - " \"transform\" : \"bucket[16]\",\n" + - " \"source-id\" : 2\n" + - " } ]\n" + - "}"; + String specString = + "{\n" + + " \"spec-id\" : 1,\n" + + " \"fields\" : [ {\n" + + " \"name\" : \"id_bucket\",\n" + + " \"transform\" : \"bucket[8]\",\n" + + " \"source-id\" : 1\n" + + " }, {\n" + + " \"name\" : \"data_bucket\",\n" + + " \"transform\" : \"bucket[16]\",\n" + + " \"source-id\" : 2\n" + + " } ]\n" + + "}"; PartitionSpec spec = PartitionSpecParser.fromJson(table.schema(), specString); @@ -115,12 +116,13 @@ public void testFromJsonWithoutFieldId() { @Test public void testTransforms() { for (PartitionSpec spec : PartitionSpecTestBase.SPECS) { - Assert.assertEquals("To/from JSON should produce equal partition spec", - spec, roundTripJSON(spec)); + Assert.assertEquals( + "To/from JSON should produce equal partition spec", spec, roundTripJSON(spec)); } } private static PartitionSpec roundTripJSON(PartitionSpec spec) { - return PartitionSpecParser.fromJson(PartitionSpecTestBase.SCHEMA, PartitionSpecParser.toJson(spec)); + return PartitionSpecParser.fromJson( + PartitionSpecTestBase.SCHEMA, PartitionSpecParser.toJson(spec)); } } diff --git a/core/src/test/java/org/apache/iceberg/TestPartitioning.java b/core/src/test/java/org/apache/iceberg/TestPartitioning.java index 2610ad5c01cd..482ec9cd8f5b 100644 --- a/core/src/test/java/org/apache/iceberg/TestPartitioning.java +++ b/core/src/test/java/org/apache/iceberg/TestPartitioning.java @@ -16,9 +16,10 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg; +import static org.apache.iceberg.types.Types.NestedField.required; + import java.io.File; import java.io.IOException; import org.apache.iceberg.exceptions.ValidationException; @@ -33,20 +34,17 @@ import org.junit.Test; import org.junit.rules.TemporaryFolder; -import static org.apache.iceberg.types.Types.NestedField.required; - public class TestPartitioning { private static final int V1_FORMAT_VERSION = 1; private static final int V2_FORMAT_VERSION = 2; - private static final Schema SCHEMA = new Schema( - required(1, "id", Types.IntegerType.get()), - required(2, "data", Types.StringType.get()), - required(3, "category", Types.StringType.get()) - ); - - @Rule - public TemporaryFolder temp = new TemporaryFolder(); + private static final Schema SCHEMA = + new Schema( + required(1, "id", Types.IntegerType.get()), + required(2, "data", Types.StringType.get()), + required(3, "category", Types.StringType.get())); + + @Rule public TemporaryFolder temp = new TemporaryFolder(); private File tableDir = null; @Before @@ -61,127 +59,101 @@ public void cleanupTables() { @Test public void testPartitionTypeWithSpecEvolutionInV1Tables() { - PartitionSpec initialSpec = PartitionSpec.builderFor(SCHEMA) - .identity("data") - .build(); - TestTables.TestTable table = TestTables.create(tableDir, "test", SCHEMA, initialSpec, V1_FORMAT_VERSION); + PartitionSpec initialSpec = PartitionSpec.builderFor(SCHEMA).identity("data").build(); + TestTables.TestTable table = + TestTables.create(tableDir, "test", SCHEMA, initialSpec, V1_FORMAT_VERSION); - table.updateSpec() - .addField(Expressions.bucket("category", 8)) - .commit(); + table.updateSpec().addField(Expressions.bucket("category", 8)).commit(); Assert.assertEquals("Should have 2 specs", 2, table.specs().size()); - StructType expectedType = StructType.of( - NestedField.optional(1000, "data", Types.StringType.get()), - NestedField.optional(1001, "category_bucket_8", 
Types.IntegerType.get()) - ); + StructType expectedType = + StructType.of( + NestedField.optional(1000, "data", Types.StringType.get()), + NestedField.optional(1001, "category_bucket_8", Types.IntegerType.get())); StructType actualType = Partitioning.partitionType(table); Assert.assertEquals("Types must match", expectedType, actualType); } @Test public void testPartitionTypeWithSpecEvolutionInV2Tables() { - PartitionSpec initialSpec = PartitionSpec.builderFor(SCHEMA) - .identity("data") - .build(); - TestTables.TestTable table = TestTables.create(tableDir, "test", SCHEMA, initialSpec, V2_FORMAT_VERSION); + PartitionSpec initialSpec = PartitionSpec.builderFor(SCHEMA).identity("data").build(); + TestTables.TestTable table = + TestTables.create(tableDir, "test", SCHEMA, initialSpec, V2_FORMAT_VERSION); - table.updateSpec() - .removeField("data") - .addField("category") - .commit(); + table.updateSpec().removeField("data").addField("category").commit(); Assert.assertEquals("Should have 2 specs", 2, table.specs().size()); - StructType expectedType = StructType.of( - NestedField.optional(1000, "data", Types.StringType.get()), - NestedField.optional(1001, "category", Types.StringType.get()) - ); + StructType expectedType = + StructType.of( + NestedField.optional(1000, "data", Types.StringType.get()), + NestedField.optional(1001, "category", Types.StringType.get())); StructType actualType = Partitioning.partitionType(table); Assert.assertEquals("Types must match", expectedType, actualType); } @Test public void testPartitionTypeWithRenamesInV1Table() { - PartitionSpec initialSpec = PartitionSpec.builderFor(SCHEMA) - .identity("data", "p1") - .build(); - TestTables.TestTable table = TestTables.create(tableDir, "test", SCHEMA, initialSpec, V1_FORMAT_VERSION); - - table.updateSpec() - .addField("category") - .commit(); - - table.updateSpec() - .renameField("p1", "p2") - .commit(); - - StructType expectedType = StructType.of( - NestedField.optional(1000, "p2", Types.StringType.get()), - NestedField.optional(1001, "category", Types.StringType.get()) - ); + PartitionSpec initialSpec = PartitionSpec.builderFor(SCHEMA).identity("data", "p1").build(); + TestTables.TestTable table = + TestTables.create(tableDir, "test", SCHEMA, initialSpec, V1_FORMAT_VERSION); + + table.updateSpec().addField("category").commit(); + + table.updateSpec().renameField("p1", "p2").commit(); + + StructType expectedType = + StructType.of( + NestedField.optional(1000, "p2", Types.StringType.get()), + NestedField.optional(1001, "category", Types.StringType.get())); StructType actualType = Partitioning.partitionType(table); Assert.assertEquals("Types must match", expectedType, actualType); } @Test public void testPartitionTypeWithAddingBackSamePartitionFieldInV1Table() { - PartitionSpec initialSpec = PartitionSpec.builderFor(SCHEMA) - .identity("data") - .build(); - TestTables.TestTable table = TestTables.create(tableDir, "test", SCHEMA, initialSpec, V1_FORMAT_VERSION); + PartitionSpec initialSpec = PartitionSpec.builderFor(SCHEMA).identity("data").build(); + TestTables.TestTable table = + TestTables.create(tableDir, "test", SCHEMA, initialSpec, V1_FORMAT_VERSION); - table.updateSpec() - .removeField("data") - .commit(); + table.updateSpec().removeField("data").commit(); - table.updateSpec() - .addField("data") - .commit(); + table.updateSpec().addField("data").commit(); // in v1, we use void transforms instead of dropping partition fields - StructType expectedType = StructType.of( - NestedField.optional(1000, "data_1000", 
Types.StringType.get()), - NestedField.optional(1001, "data", Types.StringType.get()) - ); + StructType expectedType = + StructType.of( + NestedField.optional(1000, "data_1000", Types.StringType.get()), + NestedField.optional(1001, "data", Types.StringType.get())); StructType actualType = Partitioning.partitionType(table); Assert.assertEquals("Types must match", expectedType, actualType); } @Test public void testPartitionTypeWithAddingBackSamePartitionFieldInV2Table() { - PartitionSpec initialSpec = PartitionSpec.builderFor(SCHEMA) - .identity("data") - .build(); - TestTables.TestTable table = TestTables.create(tableDir, "test", SCHEMA, initialSpec, V2_FORMAT_VERSION); + PartitionSpec initialSpec = PartitionSpec.builderFor(SCHEMA).identity("data").build(); + TestTables.TestTable table = + TestTables.create(tableDir, "test", SCHEMA, initialSpec, V2_FORMAT_VERSION); - table.updateSpec() - .removeField("data") - .commit(); + table.updateSpec().removeField("data").commit(); - table.updateSpec() - .addField("data") - .commit(); + table.updateSpec().addField("data").commit(); // in v2, we should be able to reuse the original partition spec - StructType expectedType = StructType.of( - NestedField.optional(1000, "data", Types.StringType.get()) - ); + StructType expectedType = + StructType.of(NestedField.optional(1000, "data", Types.StringType.get())); StructType actualType = Partitioning.partitionType(table); Assert.assertEquals("Types must match", expectedType, actualType); } @Test public void testPartitionTypeWithIncompatibleSpecEvolution() { - PartitionSpec initialSpec = PartitionSpec.builderFor(SCHEMA) - .identity("data") - .build(); - TestTables.TestTable table = TestTables.create(tableDir, "test", SCHEMA, initialSpec, V1_FORMAT_VERSION); + PartitionSpec initialSpec = PartitionSpec.builderFor(SCHEMA).identity("data").build(); + TestTables.TestTable table = + TestTables.create(tableDir, "test", SCHEMA, initialSpec, V1_FORMAT_VERSION); - PartitionSpec newSpec = PartitionSpec.builderFor(table.schema()) - .identity("category") - .build(); + PartitionSpec newSpec = PartitionSpec.builderFor(table.schema()).identity("category").build(); TableOperations ops = ((HasTableOperations) table).operations(); TableMetadata current = ops.current(); @@ -189,8 +161,10 @@ public void testPartitionTypeWithIncompatibleSpecEvolution() { Assert.assertEquals("Should have 2 specs", 2, table.specs().size()); - AssertHelpers.assertThrows("Should complain about incompatible specs", - ValidationException.class, "Conflicting partition fields", + AssertHelpers.assertThrows( + "Should complain about incompatible specs", + ValidationException.class, + "Conflicting partition fields", () -> Partitioning.partitionType(table)); } } diff --git a/core/src/test/java/org/apache/iceberg/TestRemoveSnapshots.java b/core/src/test/java/org/apache/iceberg/TestRemoveSnapshots.java index 9d364bc24225..8174ed16285a 100644 --- a/core/src/test/java/org/apache/iceberg/TestRemoveSnapshots.java +++ b/core/src/test/java/org/apache/iceberg/TestRemoveSnapshots.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg; import java.io.IOException; @@ -42,7 +41,7 @@ public class TestRemoveSnapshots extends TableTestBase { @Parameterized.Parameters(name = "formatVersion = {0}") public static Object[] parameters() { - return new Object[] { 1, 2 }; + return new Object[] {1, 2}; } public TestRemoveSnapshots(int formatVersion) { @@ -59,17 +58,13 @@ private long waitUntilAfter(long timestampMillis) { @Test public void testExpireOlderThan() { - table.newAppend() - .appendFile(FILE_A) - .commit(); + table.newAppend().appendFile(FILE_A).commit(); Snapshot firstSnapshot = table.currentSnapshot(); waitUntilAfter(table.currentSnapshot().timestampMillis()); - table.newAppend() - .appendFile(FILE_B) - .commit(); + table.newAppend().appendFile(FILE_B).commit(); long snapshotId = table.currentSnapshot().snapshotId(); @@ -77,40 +72,39 @@ public void testExpireOlderThan() { Set deletedFiles = Sets.newHashSet(); - table.expireSnapshots() - .expireOlderThan(tAfterCommits) - .deleteWith(deletedFiles::add) - .commit(); - - Assert.assertEquals("Expire should not change current snapshot", snapshotId, table.currentSnapshot().snapshotId()); - Assert.assertNull("Expire should remove the oldest snapshot", table.snapshot(firstSnapshot.snapshotId())); - Assert.assertEquals("Should remove only the expired manifest list location", - Sets.newHashSet(firstSnapshot.manifestListLocation()), deletedFiles); + table.expireSnapshots().expireOlderThan(tAfterCommits).deleteWith(deletedFiles::add).commit(); + + Assert.assertEquals( + "Expire should not change current snapshot", + snapshotId, + table.currentSnapshot().snapshotId()); + Assert.assertNull( + "Expire should remove the oldest snapshot", table.snapshot(firstSnapshot.snapshotId())); + Assert.assertEquals( + "Should remove only the expired manifest list location", + Sets.newHashSet(firstSnapshot.manifestListLocation()), + deletedFiles); } @Test public void testExpireOlderThanWithDelete() { - table.newAppend() - .appendFile(FILE_A) - .commit(); + table.newAppend().appendFile(FILE_A).commit(); Snapshot firstSnapshot = table.currentSnapshot(); - Assert.assertEquals("Should create one manifest", - 1, firstSnapshot.allManifests(table.io()).size()); + Assert.assertEquals( + "Should create one manifest", 1, firstSnapshot.allManifests(table.io()).size()); waitUntilAfter(table.currentSnapshot().timestampMillis()); - table.newDelete() - .deleteFile(FILE_A) - .commit(); + table.newDelete().deleteFile(FILE_A).commit(); Snapshot secondSnapshot = table.currentSnapshot(); - Assert.assertEquals("Should create replace manifest with a rewritten manifest", - 1, secondSnapshot.allManifests(table.io()).size()); + Assert.assertEquals( + "Should create replace manifest with a rewritten manifest", + 1, + secondSnapshot.allManifests(table.io()).size()); - table.newAppend() - .appendFile(FILE_B) - .commit(); + table.newAppend().appendFile(FILE_B).commit(); waitUntilAfter(table.currentSnapshot().timestampMillis()); @@ -120,21 +114,31 @@ public void testExpireOlderThanWithDelete() { Set deletedFiles = Sets.newHashSet(); - table.expireSnapshots() - .expireOlderThan(tAfterCommits) - .deleteWith(deletedFiles::add) - .commit(); + table.expireSnapshots().expireOlderThan(tAfterCommits).deleteWith(deletedFiles::add).commit(); - Assert.assertEquals("Expire should not change current snapshot", snapshotId, table.currentSnapshot().snapshotId()); - Assert.assertNull("Expire should remove the oldest snapshot", table.snapshot(firstSnapshot.snapshotId())); - Assert.assertNull("Expire should remove 
the second oldest snapshot", table.snapshot(secondSnapshot.snapshotId())); + Assert.assertEquals( + "Expire should not change current snapshot", + snapshotId, + table.currentSnapshot().snapshotId()); + Assert.assertNull( + "Expire should remove the oldest snapshot", table.snapshot(firstSnapshot.snapshotId())); + Assert.assertNull( + "Expire should remove the second oldest snapshot", + table.snapshot(secondSnapshot.snapshotId())); - Assert.assertEquals("Should remove expired manifest lists and deleted data file", + Assert.assertEquals( + "Should remove expired manifest lists and deleted data file", Sets.newHashSet( firstSnapshot.manifestListLocation(), // snapshot expired - firstSnapshot.allManifests(table.io()).get(0).path(), // manifest was rewritten for delete + firstSnapshot + .allManifests(table.io()) + .get(0) + .path(), // manifest was rewritten for delete secondSnapshot.manifestListLocation(), // snapshot expired - secondSnapshot.allManifests(table.io()).get(0).path(), // manifest contained only deletes, was dropped + secondSnapshot + .allManifests(table.io()) + .get(0) + .path(), // manifest contained only deletes, was dropped FILE_A.path()), // deleted deletedFiles); } @@ -142,30 +146,29 @@ public void testExpireOlderThanWithDelete() { @Test public void testExpireOlderThanWithDeleteInMergedManifests() { // merge every commit - table.updateProperties() - .set(TableProperties.MANIFEST_MIN_MERGE_COUNT, "0") - .commit(); + table.updateProperties().set(TableProperties.MANIFEST_MIN_MERGE_COUNT, "0").commit(); - table.newAppend() - .appendFile(FILE_A) - .appendFile(FILE_B) - .commit(); + table.newAppend().appendFile(FILE_A).appendFile(FILE_B).commit(); Snapshot firstSnapshot = table.currentSnapshot(); - Assert.assertEquals("Should create one manifest", - 1, firstSnapshot.allManifests(table.io()).size()); + Assert.assertEquals( + "Should create one manifest", 1, firstSnapshot.allManifests(table.io()).size()); waitUntilAfter(table.currentSnapshot().timestampMillis()); - table.newDelete() + table + .newDelete() .deleteFile(FILE_A) // FILE_B is still in the dataset .commit(); Snapshot secondSnapshot = table.currentSnapshot(); - Assert.assertEquals("Should replace manifest with a rewritten manifest", - 1, secondSnapshot.allManifests(table.io()).size()); + Assert.assertEquals( + "Should replace manifest with a rewritten manifest", + 1, + secondSnapshot.allManifests(table.io()).size()); - table.newFastAppend() // do not merge to keep the last snapshot's manifest valid + table + .newFastAppend() // do not merge to keep the last snapshot's manifest valid .appendFile(FILE_C) .commit(); @@ -177,19 +180,26 @@ public void testExpireOlderThanWithDeleteInMergedManifests() { Set deletedFiles = Sets.newHashSet(); - table.expireSnapshots() - .expireOlderThan(tAfterCommits) - .deleteWith(deletedFiles::add) - .commit(); + table.expireSnapshots().expireOlderThan(tAfterCommits).deleteWith(deletedFiles::add).commit(); - Assert.assertEquals("Expire should not change current snapshot", snapshotId, table.currentSnapshot().snapshotId()); - Assert.assertNull("Expire should remove the oldest snapshot", table.snapshot(firstSnapshot.snapshotId())); - Assert.assertNull("Expire should remove the second oldest snapshot", table.snapshot(secondSnapshot.snapshotId())); + Assert.assertEquals( + "Expire should not change current snapshot", + snapshotId, + table.currentSnapshot().snapshotId()); + Assert.assertNull( + "Expire should remove the oldest snapshot", table.snapshot(firstSnapshot.snapshotId())); + Assert.assertNull( + 
"Expire should remove the second oldest snapshot", + table.snapshot(secondSnapshot.snapshotId())); - Assert.assertEquals("Should remove expired manifest lists and deleted data file", + Assert.assertEquals( + "Should remove expired manifest lists and deleted data file", Sets.newHashSet( firstSnapshot.manifestListLocation(), // snapshot expired - firstSnapshot.allManifests(table.io()).get(0).path(), // manifest was rewritten for delete + firstSnapshot + .allManifests(table.io()) + .get(0) + .path(), // manifest was rewritten for delete secondSnapshot.manifestListLocation(), // snapshot expired FILE_A.path()), // deleted deletedFiles); @@ -198,33 +208,26 @@ public void testExpireOlderThanWithDeleteInMergedManifests() { @Test public void testExpireOlderThanWithRollback() { // merge every commit - table.updateProperties() - .set(TableProperties.MANIFEST_MIN_MERGE_COUNT, "0") - .commit(); + table.updateProperties().set(TableProperties.MANIFEST_MIN_MERGE_COUNT, "0").commit(); - table.newAppend() - .appendFile(FILE_A) - .appendFile(FILE_B) - .commit(); + table.newAppend().appendFile(FILE_A).appendFile(FILE_B).commit(); Snapshot firstSnapshot = table.currentSnapshot(); - Assert.assertEquals("Should create one manifest", - 1, firstSnapshot.allManifests(table.io()).size()); + Assert.assertEquals( + "Should create one manifest", 1, firstSnapshot.allManifests(table.io()).size()); waitUntilAfter(table.currentSnapshot().timestampMillis()); - table.newDelete() - .deleteFile(FILE_B) - .commit(); + table.newDelete().deleteFile(FILE_B).commit(); Snapshot secondSnapshot = table.currentSnapshot(); - Set secondSnapshotManifests = Sets.newHashSet(secondSnapshot.allManifests(table.io())); + Set secondSnapshotManifests = + Sets.newHashSet(secondSnapshot.allManifests(table.io())); secondSnapshotManifests.removeAll(firstSnapshot.allManifests(table.io())); - Assert.assertEquals("Should add one new manifest for append", 1, secondSnapshotManifests.size()); + Assert.assertEquals( + "Should add one new manifest for append", 1, secondSnapshotManifests.size()); - table.manageSnapshots() - .rollbackTo(firstSnapshot.snapshotId()) - .commit(); + table.manageSnapshots().rollbackTo(firstSnapshot.snapshotId()).commit(); long tAfterCommits = waitUntilAfter(secondSnapshot.timestampMillis()); @@ -232,46 +235,47 @@ public void testExpireOlderThanWithRollback() { Set deletedFiles = Sets.newHashSet(); - table.expireSnapshots() - .expireOlderThan(tAfterCommits) - .deleteWith(deletedFiles::add) - .commit(); + table.expireSnapshots().expireOlderThan(tAfterCommits).deleteWith(deletedFiles::add).commit(); - Assert.assertEquals("Expire should not change current snapshot", snapshotId, table.currentSnapshot().snapshotId()); - Assert.assertNotNull("Expire should keep the oldest snapshot, current", table.snapshot(firstSnapshot.snapshotId())); - Assert.assertNull("Expire should remove the orphaned snapshot", table.snapshot(secondSnapshot.snapshotId())); + Assert.assertEquals( + "Expire should not change current snapshot", + snapshotId, + table.currentSnapshot().snapshotId()); + Assert.assertNotNull( + "Expire should keep the oldest snapshot, current", + table.snapshot(firstSnapshot.snapshotId())); + Assert.assertNull( + "Expire should remove the orphaned snapshot", table.snapshot(secondSnapshot.snapshotId())); - Assert.assertEquals("Should remove expired manifest lists and reverted appended data file", + Assert.assertEquals( + "Should remove expired manifest lists and reverted appended data file", Sets.newHashSet( 
secondSnapshot.manifestListLocation(), // snapshot expired - Iterables.getOnlyElement(secondSnapshotManifests).path()), // manifest is no longer referenced + Iterables.getOnlyElement(secondSnapshotManifests) + .path()), // manifest is no longer referenced deletedFiles); } @Test public void testExpireOlderThanWithRollbackAndMergedManifests() { - table.newAppend() - .appendFile(FILE_A) - .commit(); + table.newAppend().appendFile(FILE_A).commit(); Snapshot firstSnapshot = table.currentSnapshot(); - Assert.assertEquals("Should create one manifest", - 1, firstSnapshot.allManifests(table.io()).size()); + Assert.assertEquals( + "Should create one manifest", 1, firstSnapshot.allManifests(table.io()).size()); waitUntilAfter(table.currentSnapshot().timestampMillis()); - table.newAppend() - .appendFile(FILE_B) - .commit(); + table.newAppend().appendFile(FILE_B).commit(); Snapshot secondSnapshot = table.currentSnapshot(); - Set secondSnapshotManifests = Sets.newHashSet(secondSnapshot.allManifests(table.io())); + Set secondSnapshotManifests = + Sets.newHashSet(secondSnapshot.allManifests(table.io())); secondSnapshotManifests.removeAll(firstSnapshot.allManifests(table.io())); - Assert.assertEquals("Should add one new manifest for append", 1, secondSnapshotManifests.size()); + Assert.assertEquals( + "Should add one new manifest for append", 1, secondSnapshotManifests.size()); - table.manageSnapshots() - .rollbackTo(firstSnapshot.snapshotId()) - .commit(); + table.manageSnapshots().rollbackTo(firstSnapshot.snapshotId()).commit(); long tAfterCommits = waitUntilAfter(secondSnapshot.timestampMillis()); @@ -279,19 +283,24 @@ public void testExpireOlderThanWithRollbackAndMergedManifests() { Set deletedFiles = Sets.newHashSet(); - table.expireSnapshots() - .expireOlderThan(tAfterCommits) - .deleteWith(deletedFiles::add) - .commit(); + table.expireSnapshots().expireOlderThan(tAfterCommits).deleteWith(deletedFiles::add).commit(); - Assert.assertEquals("Expire should not change current snapshot", snapshotId, table.currentSnapshot().snapshotId()); - Assert.assertNotNull("Expire should keep the oldest snapshot, current", table.snapshot(firstSnapshot.snapshotId())); - Assert.assertNull("Expire should remove the orphaned snapshot", table.snapshot(secondSnapshot.snapshotId())); + Assert.assertEquals( + "Expire should not change current snapshot", + snapshotId, + table.currentSnapshot().snapshotId()); + Assert.assertNotNull( + "Expire should keep the oldest snapshot, current", + table.snapshot(firstSnapshot.snapshotId())); + Assert.assertNull( + "Expire should remove the orphaned snapshot", table.snapshot(secondSnapshot.snapshotId())); - Assert.assertEquals("Should remove expired manifest lists and reverted appended data file", + Assert.assertEquals( + "Should remove expired manifest lists and reverted appended data file", Sets.newHashSet( secondSnapshot.manifestListLocation(), // snapshot expired - Iterables.getOnlyElement(secondSnapshotManifests).path(), // manifest is no longer referenced + Iterables.getOnlyElement(secondSnapshotManifests) + .path(), // manifest is no longer referenced FILE_B.path()), // added, but rolled back deletedFiles); } @@ -299,7 +308,8 @@ public void testExpireOlderThanWithRollbackAndMergedManifests() { @Test public void testRetainLastWithExpireOlderThan() { long t0 = System.currentTimeMillis(); - table.newAppend() + table + .newAppend() .appendFile(FILE_A) // data_bucket=0 .commit(); long firstSnapshotId = table.currentSnapshot().snapshotId(); @@ -308,7 +318,8 @@ public void 
testRetainLastWithExpireOlderThan() { t1 = System.currentTimeMillis(); } - table.newAppend() + table + .newAppend() .appendFile(FILE_B) // data_bucket=1 .commit(); @@ -317,7 +328,8 @@ public void testRetainLastWithExpireOlderThan() { t2 = System.currentTimeMillis(); } - table.newAppend() + table + .newAppend() .appendFile(FILE_C) // data_bucket=2 .commit(); @@ -327,21 +339,19 @@ public void testRetainLastWithExpireOlderThan() { } // Retain last 2 snapshots - table.expireSnapshots() - .expireOlderThan(t3) - .retainLast(2) - .commit(); + table.expireSnapshots().expireOlderThan(t3).retainLast(2).commit(); - Assert.assertEquals("Should have two snapshots.", - 2, Lists.newArrayList(table.snapshots()).size()); - Assert.assertEquals("First snapshot should not present.", - null, table.snapshot(firstSnapshotId)); + Assert.assertEquals( + "Should have two snapshots.", 2, Lists.newArrayList(table.snapshots()).size()); + Assert.assertEquals( + "First snapshot should not present.", null, table.snapshot(firstSnapshotId)); } @Test public void testRetainLastWithExpireById() { long t0 = System.currentTimeMillis(); - table.newAppend() + table + .newAppend() .appendFile(FILE_A) // data_bucket=0 .commit(); long firstSnapshotId = table.currentSnapshot().snapshotId(); @@ -350,7 +360,8 @@ public void testRetainLastWithExpireById() { t1 = System.currentTimeMillis(); } - table.newAppend() + table + .newAppend() .appendFile(FILE_B) // data_bucket=1 .commit(); @@ -359,7 +370,8 @@ public void testRetainLastWithExpireById() { t2 = System.currentTimeMillis(); } - table.newAppend() + table + .newAppend() .appendFile(FILE_C) // data_bucket=2 .commit(); @@ -369,21 +381,19 @@ public void testRetainLastWithExpireById() { } // Retain last 3 snapshots, but explicitly remove the first snapshot - table.expireSnapshots() - .expireSnapshotId(firstSnapshotId) - .retainLast(3) - .commit(); + table.expireSnapshots().expireSnapshotId(firstSnapshotId).retainLast(3).commit(); - Assert.assertEquals("Should have two snapshots.", - 2, Lists.newArrayList(table.snapshots()).size()); - Assert.assertEquals("First snapshot should not present.", - null, table.snapshot(firstSnapshotId)); + Assert.assertEquals( + "Should have two snapshots.", 2, Lists.newArrayList(table.snapshots()).size()); + Assert.assertEquals( + "First snapshot should not present.", null, table.snapshot(firstSnapshotId)); } @Test public void testRetainNAvailableSnapshotsWithTransaction() { long t0 = System.currentTimeMillis(); - table.newAppend() + table + .newAppend() .appendFile(FILE_A) // data_bucket=0 .commit(); long firstSnapshotId = table.currentSnapshot().snapshotId(); @@ -392,7 +402,8 @@ public void testRetainNAvailableSnapshotsWithTransaction() { t1 = System.currentTimeMillis(); } - table.newAppend() + table + .newAppend() .appendFile(FILE_B) // data_bucket=1 .commit(); @@ -401,7 +412,8 @@ public void testRetainNAvailableSnapshotsWithTransaction() { t2 = System.currentTimeMillis(); } - table.newAppend() + table + .newAppend() .appendFile(FILE_C) // data_bucket=2 .commit(); @@ -412,22 +424,20 @@ public void testRetainNAvailableSnapshotsWithTransaction() { // Retain last 2 snapshots Transaction tx = table.newTransaction(); - tx.expireSnapshots() - .expireOlderThan(t3) - .retainLast(2) - .commit(); + tx.expireSnapshots().expireOlderThan(t3).retainLast(2).commit(); tx.commitTransaction(); - Assert.assertEquals("Should have two snapshots.", - 2, Lists.newArrayList(table.snapshots()).size()); - Assert.assertEquals("First snapshot should not present.", - null, 
table.snapshot(firstSnapshotId)); + Assert.assertEquals( + "Should have two snapshots.", 2, Lists.newArrayList(table.snapshots()).size()); + Assert.assertEquals( + "First snapshot should not present.", null, table.snapshot(firstSnapshotId)); } @Test public void testRetainLastWithTooFewSnapshots() { long t0 = System.currentTimeMillis(); - table.newAppend() + table + .newAppend() .appendFile(FILE_A) // data_bucket=0 .appendFile(FILE_B) // data_bucket=1 .commit(); @@ -438,7 +448,8 @@ public void testRetainLastWithTooFewSnapshots() { t1 = System.currentTimeMillis(); } - table.newAppend() + table + .newAppend() .appendFile(FILE_C) // data_bucket=2 .commit(); @@ -448,21 +459,21 @@ public void testRetainLastWithTooFewSnapshots() { } // Retain last 3 snapshots - table.expireSnapshots() - .expireOlderThan(t2) - .retainLast(3) - .commit(); - - Assert.assertEquals("Should have two snapshots", - 2, Lists.newArrayList(table.snapshots()).size()); - Assert.assertEquals("First snapshot should still present", - firstSnapshotId, table.snapshot(firstSnapshotId).snapshotId()); + table.expireSnapshots().expireOlderThan(t2).retainLast(3).commit(); + + Assert.assertEquals( + "Should have two snapshots", 2, Lists.newArrayList(table.snapshots()).size()); + Assert.assertEquals( + "First snapshot should still present", + firstSnapshotId, + table.snapshot(firstSnapshotId).snapshotId()); } @Test public void testRetainNLargerThanCurrentSnapshots() { // Append 3 files - table.newAppend() + table + .newAppend() .appendFile(FILE_A) // data_bucket=0 .commit(); long firstSnapshotId = table.currentSnapshot().snapshotId(); @@ -471,7 +482,8 @@ public void testRetainNLargerThanCurrentSnapshots() { t1 = System.currentTimeMillis(); } - table.newAppend() + table + .newAppend() .appendFile(FILE_B) // data_bucket=1 .commit(); @@ -480,7 +492,8 @@ public void testRetainNLargerThanCurrentSnapshots() { t2 = System.currentTimeMillis(); } - table.newAppend() + table + .newAppend() .appendFile(FILE_C) // data_bucket=2 .commit(); @@ -491,20 +504,18 @@ public void testRetainNLargerThanCurrentSnapshots() { // Retain last 4 snapshots Transaction tx = table.newTransaction(); - tx.expireSnapshots() - .expireOlderThan(t3) - .retainLast(4) - .commit(); + tx.expireSnapshots().expireOlderThan(t3).retainLast(4).commit(); tx.commitTransaction(); - Assert.assertEquals("Should have three snapshots.", - 3, Lists.newArrayList(table.snapshots()).size()); + Assert.assertEquals( + "Should have three snapshots.", 3, Lists.newArrayList(table.snapshots()).size()); } @Test public void testRetainLastKeepsExpiringSnapshot() { long t0 = System.currentTimeMillis(); - table.newAppend() + table + .newAppend() .appendFile(FILE_A) // data_bucket=0 .commit(); long t1 = System.currentTimeMillis(); @@ -512,7 +523,8 @@ public void testRetainLastKeepsExpiringSnapshot() { t1 = System.currentTimeMillis(); } - table.newAppend() + table + .newAppend() .appendFile(FILE_B) // data_bucket=1 .commit(); @@ -522,7 +534,8 @@ public void testRetainLastKeepsExpiringSnapshot() { t2 = System.currentTimeMillis(); } - table.newAppend() + table + .newAppend() .appendFile(FILE_C) // data_bucket=2 .commit(); @@ -531,7 +544,8 @@ public void testRetainLastKeepsExpiringSnapshot() { t3 = System.currentTimeMillis(); } - table.newAppend() + table + .newAppend() .appendFile(FILE_D) // data_bucket=3 .commit(); @@ -541,21 +555,23 @@ public void testRetainLastKeepsExpiringSnapshot() { } // Retain last 2 snapshots and expire older than t3 - table.expireSnapshots() + table + .expireSnapshots() 
.expireOlderThan(secondSnapshot.timestampMillis()) .retainLast(2) .commit(); - Assert.assertEquals("Should have three snapshots.", - 3, Lists.newArrayList(table.snapshots()).size()); - Assert.assertNotNull("Second snapshot should present.", - table.snapshot(secondSnapshot.snapshotId())); + Assert.assertEquals( + "Should have three snapshots.", 3, Lists.newArrayList(table.snapshots()).size()); + Assert.assertNotNull( + "Second snapshot should present.", table.snapshot(secondSnapshot.snapshotId())); } @Test public void testExpireOlderThanMultipleCalls() { long t0 = System.currentTimeMillis(); - table.newAppend() + table + .newAppend() .appendFile(FILE_A) // data_bucket=0 .commit(); long t1 = System.currentTimeMillis(); @@ -563,7 +579,8 @@ public void testExpireOlderThanMultipleCalls() { t1 = System.currentTimeMillis(); } - table.newAppend() + table + .newAppend() .appendFile(FILE_B) // data_bucket=1 .commit(); @@ -573,7 +590,8 @@ public void testExpireOlderThanMultipleCalls() { t2 = System.currentTimeMillis(); } - table.newAppend() + table + .newAppend() .appendFile(FILE_C) // data_bucket=2 .commit(); @@ -584,21 +602,23 @@ public void testExpireOlderThanMultipleCalls() { } // Retain last 2 snapshots and expire older than t3 - table.expireSnapshots() + table + .expireSnapshots() .expireOlderThan(secondSnapshot.timestampMillis()) .expireOlderThan(thirdSnapshot.timestampMillis()) .commit(); - Assert.assertEquals("Should have one snapshots.", - 1, Lists.newArrayList(table.snapshots()).size()); - Assert.assertNull("Second snapshot should not present.", - table.snapshot(secondSnapshot.snapshotId())); + Assert.assertEquals( + "Should have one snapshots.", 1, Lists.newArrayList(table.snapshots()).size()); + Assert.assertNull( + "Second snapshot should not present.", table.snapshot(secondSnapshot.snapshotId())); } @Test public void testRetainLastMultipleCalls() { long t0 = System.currentTimeMillis(); - table.newAppend() + table + .newAppend() .appendFile(FILE_A) // data_bucket=0 .commit(); long t1 = System.currentTimeMillis(); @@ -606,7 +626,8 @@ public void testRetainLastMultipleCalls() { t1 = System.currentTimeMillis(); } - table.newAppend() + table + .newAppend() .appendFile(FILE_B) // data_bucket=1 .commit(); @@ -616,7 +637,8 @@ public void testRetainLastMultipleCalls() { t2 = System.currentTimeMillis(); } - table.newAppend() + table + .newAppend() .appendFile(FILE_C) // data_bucket=2 .commit(); @@ -626,22 +648,18 @@ public void testRetainLastMultipleCalls() { } // Retain last 2 snapshots and expire older than t3 - table.expireSnapshots() - .expireOlderThan(t3) - .retainLast(2) - .retainLast(1) - .commit(); + table.expireSnapshots().expireOlderThan(t3).retainLast(2).retainLast(1).commit(); - Assert.assertEquals("Should have one snapshots.", - 1, Lists.newArrayList(table.snapshots()).size()); - Assert.assertNull("Second snapshot should not present.", - table.snapshot(secondSnapshot.snapshotId())); + Assert.assertEquals( + "Should have one snapshots.", 1, Lists.newArrayList(table.snapshots()).size()); + Assert.assertNull( + "Second snapshot should not present.", table.snapshot(secondSnapshot.snapshotId())); } @Test public void testRetainZeroSnapshots() { - AssertHelpers.assertThrows("Should fail retain 0 snapshots " + - "because number of snapshots to retain cannot be zero", + AssertHelpers.assertThrows( + "Should fail retain 0 snapshots " + "because number of snapshots to retain cannot be zero", IllegalArgumentException.class, "Number of snapshots to retain must be at least 1, cannot be: 0", () -> 
table.expireSnapshots().retainLast(0).commit()); @@ -649,19 +667,11 @@ public void testRetainZeroSnapshots() { @Test public void testScanExpiredManifestInValidSnapshotAppend() { - table.newAppend() - .appendFile(FILE_A) - .appendFile(FILE_B) - .commit(); + table.newAppend().appendFile(FILE_A).appendFile(FILE_B).commit(); - table.newOverwrite() - .addFile(FILE_C) - .deleteFile(FILE_A) - .commit(); + table.newOverwrite().addFile(FILE_C).deleteFile(FILE_A).commit(); - table.newAppend() - .appendFile(FILE_D) - .commit(); + table.newAppend().appendFile(FILE_D).commit(); long t3 = System.currentTimeMillis(); while (t3 <= table.currentSnapshot().timestampMillis()) { @@ -670,35 +680,24 @@ public void testScanExpiredManifestInValidSnapshotAppend() { Set deletedFiles = Sets.newHashSet(); - table.expireSnapshots() - .expireOlderThan(t3) - .deleteWith(deletedFiles::add) - .commit(); + table.expireSnapshots().expireOlderThan(t3).deleteWith(deletedFiles::add).commit(); Assert.assertTrue("FILE_A should be deleted", deletedFiles.contains(FILE_A.path().toString())); - } @Test public void testScanExpiredManifestInValidSnapshotFastAppend() { - table.updateProperties() + table + .updateProperties() .set(TableProperties.MANIFEST_MERGE_ENABLED, "true") .set(TableProperties.MANIFEST_MIN_MERGE_COUNT, "1") .commit(); - table.newAppend() - .appendFile(FILE_A) - .appendFile(FILE_B) - .commit(); + table.newAppend().appendFile(FILE_A).appendFile(FILE_B).commit(); - table.newOverwrite() - .addFile(FILE_C) - .deleteFile(FILE_A) - .commit(); + table.newOverwrite().addFile(FILE_C).deleteFile(FILE_A).commit(); - table.newFastAppend() - .appendFile(FILE_D) - .commit(); + table.newFastAppend().appendFile(FILE_D).commit(); long t3 = System.currentTimeMillis(); while (t3 <= table.currentSnapshot().timestampMillis()) { @@ -707,32 +706,21 @@ public void testScanExpiredManifestInValidSnapshotFastAppend() { Set deletedFiles = Sets.newHashSet(); - table.expireSnapshots() - .expireOlderThan(t3) - .deleteWith(deletedFiles::add) - .commit(); + table.expireSnapshots().expireOlderThan(t3).deleteWith(deletedFiles::add).commit(); Assert.assertTrue("FILE_A should be deleted", deletedFiles.contains(FILE_A.path().toString())); } @Test public void dataFilesCleanup() throws IOException { - table.newFastAppend() - .appendFile(FILE_A) - .commit(); + table.newFastAppend().appendFile(FILE_A).commit(); - table.newFastAppend() - .appendFile(FILE_B) - .commit(); + table.newFastAppend().appendFile(FILE_B).commit(); - table.newRewrite() - .rewriteFiles(ImmutableSet.of(FILE_B), ImmutableSet.of(FILE_D)) - .commit(); + table.newRewrite().rewriteFiles(ImmutableSet.of(FILE_B), ImmutableSet.of(FILE_D)).commit(); long thirdSnapshotId = table.currentSnapshot().snapshotId(); - table.newRewrite() - .rewriteFiles(ImmutableSet.of(FILE_A), ImmutableSet.of(FILE_C)) - .commit(); + table.newRewrite().rewriteFiles(ImmutableSet.of(FILE_A), ImmutableSet.of(FILE_C)).commit(); long fourthSnapshotId = table.currentSnapshot().snapshotId(); long t4 = System.currentTimeMillis(); @@ -742,10 +730,11 @@ public void dataFilesCleanup() throws IOException { List manifests = table.currentSnapshot().dataManifests(table.io()); - ManifestFile newManifest = writeManifest( - "manifest-file-1.avro", - manifestEntry(Status.EXISTING, thirdSnapshotId, FILE_C), - manifestEntry(Status.EXISTING, fourthSnapshotId, FILE_D)); + ManifestFile newManifest = + writeManifest( + "manifest-file-1.avro", + manifestEntry(Status.EXISTING, thirdSnapshotId, FILE_C), + manifestEntry(Status.EXISTING, 
fourthSnapshotId, FILE_D)); RewriteManifests rewriteManifests = table.rewriteManifests(); manifests.forEach(rewriteManifests::deleteManifest); @@ -754,10 +743,7 @@ public void dataFilesCleanup() throws IOException { Set deletedFiles = Sets.newHashSet(); - table.expireSnapshots() - .expireOlderThan(t4) - .deleteWith(deletedFiles::add) - .commit(); + table.expireSnapshots().expireOlderThan(t4).deleteWith(deletedFiles::add).commit(); Assert.assertTrue("FILE_A should be deleted", deletedFiles.contains(FILE_A.path().toString())); Assert.assertTrue("FILE_B should be deleted", deletedFiles.contains(FILE_B.path().toString())); @@ -765,22 +751,14 @@ public void dataFilesCleanup() throws IOException { @Test public void dataFilesCleanupWithParallelTasks() throws IOException { - table.newFastAppend() - .appendFile(FILE_A) - .commit(); + table.newFastAppend().appendFile(FILE_A).commit(); - table.newFastAppend() - .appendFile(FILE_B) - .commit(); + table.newFastAppend().appendFile(FILE_B).commit(); - table.newRewrite() - .rewriteFiles(ImmutableSet.of(FILE_B), ImmutableSet.of(FILE_D)) - .commit(); + table.newRewrite().rewriteFiles(ImmutableSet.of(FILE_B), ImmutableSet.of(FILE_D)).commit(); long thirdSnapshotId = table.currentSnapshot().snapshotId(); - table.newRewrite() - .rewriteFiles(ImmutableSet.of(FILE_A), ImmutableSet.of(FILE_C)) - .commit(); + table.newRewrite().rewriteFiles(ImmutableSet.of(FILE_A), ImmutableSet.of(FILE_C)).commit(); long fourthSnapshotId = table.currentSnapshot().snapshotId(); long t4 = System.currentTimeMillis(); @@ -790,10 +768,11 @@ public void dataFilesCleanupWithParallelTasks() throws IOException { List manifests = table.currentSnapshot().dataManifests(table.io()); - ManifestFile newManifest = writeManifest( - "manifest-file-1.avro", - manifestEntry(Status.EXISTING, thirdSnapshotId, FILE_C), - manifestEntry(Status.EXISTING, fourthSnapshotId, FILE_D)); + ManifestFile newManifest = + writeManifest( + "manifest-file-1.avro", + manifestEntry(Status.EXISTING, thirdSnapshotId, FILE_C), + manifestEntry(Status.EXISTING, fourthSnapshotId, FILE_D)); RewriteManifests rewriteManifests = table.rewriteManifests(); manifests.forEach(rewriteManifests::deleteManifest); @@ -805,29 +784,42 @@ public void dataFilesCleanupWithParallelTasks() throws IOException { AtomicInteger deleteThreadsIndex = new AtomicInteger(0); AtomicInteger planThreadsIndex = new AtomicInteger(0); - table.expireSnapshots() - .executeDeleteWith(Executors.newFixedThreadPool(4, runnable -> { - Thread thread = new Thread(runnable); - thread.setName("remove-snapshot-" + deleteThreadsIndex.getAndIncrement()); - thread.setDaemon(true); // daemon threads will be terminated abruptly when the JVM exits - return thread; - })) - .planWith(Executors.newFixedThreadPool(1, runnable -> { - Thread thread = new Thread(runnable); - thread.setName("plan-" + planThreadsIndex.getAndIncrement()); - thread.setDaemon(true); // daemon threads will be terminated abruptly when the JVM exits - return thread; - })) + table + .expireSnapshots() + .executeDeleteWith( + Executors.newFixedThreadPool( + 4, + runnable -> { + Thread thread = new Thread(runnable); + thread.setName("remove-snapshot-" + deleteThreadsIndex.getAndIncrement()); + thread.setDaemon( + true); // daemon threads will be terminated abruptly when the JVM exits + return thread; + })) + .planWith( + Executors.newFixedThreadPool( + 1, + runnable -> { + Thread thread = new Thread(runnable); + thread.setName("plan-" + planThreadsIndex.getAndIncrement()); + thread.setDaemon( + true); // daemon 
threads will be terminated abruptly when the JVM exits + return thread; + })) .expireOlderThan(t4) - .deleteWith(s -> { - deleteThreads.add(Thread.currentThread().getName()); - deletedFiles.add(s); - }) + .deleteWith( + s -> { + deleteThreads.add(Thread.currentThread().getName()); + deletedFiles.add(s); + }) .commit(); - // Verifies that the delete methods ran in the threads created by the provided ExecutorService ThreadFactory - Assert.assertEquals(deleteThreads, - Sets.newHashSet("remove-snapshot-0", "remove-snapshot-1", "remove-snapshot-2", "remove-snapshot-3")); + // Verifies that the delete methods ran in the threads created by the provided ExecutorService + // ThreadFactory + Assert.assertEquals( + deleteThreads, + Sets.newHashSet( + "remove-snapshot-0", "remove-snapshot-1", "remove-snapshot-2", "remove-snapshot-3")); Assert.assertTrue("FILE_A should be deleted", deletedFiles.contains(FILE_A.path().toString())); Assert.assertTrue("FILE_B should be deleted", deletedFiles.contains(FILE_B.path().toString())); @@ -836,21 +828,13 @@ public void dataFilesCleanupWithParallelTasks() throws IOException { @Test public void noDataFileCleanup() throws IOException { - table.newFastAppend() - .appendFile(FILE_A) - .commit(); + table.newFastAppend().appendFile(FILE_A).commit(); - table.newFastAppend() - .appendFile(FILE_B) - .commit(); + table.newFastAppend().appendFile(FILE_B).commit(); - table.newRewrite() - .rewriteFiles(ImmutableSet.of(FILE_B), ImmutableSet.of(FILE_D)) - .commit(); + table.newRewrite().rewriteFiles(ImmutableSet.of(FILE_B), ImmutableSet.of(FILE_D)).commit(); - table.newRewrite() - .rewriteFiles(ImmutableSet.of(FILE_A), ImmutableSet.of(FILE_C)) - .commit(); + table.newRewrite().rewriteFiles(ImmutableSet.of(FILE_A), ImmutableSet.of(FILE_C)).commit(); long t4 = System.currentTimeMillis(); while (t4 <= table.currentSnapshot().timestampMillis()) { @@ -859,7 +843,8 @@ public void noDataFileCleanup() throws IOException { Set deletedFiles = Sets.newHashSet(); - table.expireSnapshots() + table + .expireSnapshots() .cleanExpiredFiles(false) .expireOlderThan(t4) .deleteWith(deletedFiles::add) @@ -869,36 +854,29 @@ public void noDataFileCleanup() throws IOException { } /** - * Test on table below, and expiring the staged commit `B` using `expireOlderThan` API. - * Table: A - C - * ` B (staged) + * Test on table below, and expiring the staged commit `B` using `expireOlderThan` API. Table: A - + * C ` B (staged) */ @Test public void testWithExpiringDanglingStageCommit() { // `A` commit - table.newAppend() - .appendFile(FILE_A) - .commit(); + table.newAppend().appendFile(FILE_A).commit(); // `B` staged commit - table.newAppend() - .appendFile(FILE_B) - .stageOnly() - .commit(); + table.newAppend().appendFile(FILE_B).stageOnly().commit(); TableMetadata base = readMetadata(); Snapshot snapshotA = base.snapshots().get(0); Snapshot snapshotB = base.snapshots().get(1); // `C` commit - table.newAppend() - .appendFile(FILE_C) - .commit(); + table.newAppend().appendFile(FILE_C).commit(); Set deletedFiles = Sets.newHashSet(); // Expire all commits including dangling staged snapshot. 
- table.expireSnapshots() + table + .expireSnapshots() .deleteWith(deletedFiles::add) .expireOlderThan(snapshotB.timestampMillis() + 1) .commit(); @@ -907,120 +885,104 @@ public void testWithExpiringDanglingStageCommit() { expectedDeletes.add(snapshotA.manifestListLocation()); // Files should be deleted of dangling staged snapshot - snapshotB.addedDataFiles(table.io()).forEach(i -> { - expectedDeletes.add(i.path().toString()); - }); + snapshotB + .addedDataFiles(table.io()) + .forEach( + i -> { + expectedDeletes.add(i.path().toString()); + }); // ManifestList should be deleted too expectedDeletes.add(snapshotB.manifestListLocation()); - snapshotB.dataManifests(table.io()).forEach(file -> { - // Only the manifest of B should be deleted. - if (file.snapshotId() == snapshotB.snapshotId()) { - expectedDeletes.add(file.path()); - } - }); - Assert.assertSame("Files deleted count should be expected", expectedDeletes.size(), deletedFiles.size()); + snapshotB + .dataManifests(table.io()) + .forEach( + file -> { + // Only the manifest of B should be deleted. + if (file.snapshotId() == snapshotB.snapshotId()) { + expectedDeletes.add(file.path()); + } + }); + Assert.assertSame( + "Files deleted count should be expected", expectedDeletes.size(), deletedFiles.size()); // Take the diff expectedDeletes.removeAll(deletedFiles); Assert.assertTrue("Exactly same files should be deleted", expectedDeletes.isEmpty()); } /** - * Expire cherry-pick the commit as shown below, when `B` is in table's current state - * Table: - * A - B - C <--current snapshot - * `- D (source=B) + * Expire cherry-pick the commit as shown below, when `B` is in table's current state Table: A - B + * - C <--current snapshot `- D (source=B) */ @Test public void testWithCherryPickTableSnapshot() { // `A` commit - table.newAppend() - .appendFile(FILE_A) - .commit(); + table.newAppend().appendFile(FILE_A).commit(); Snapshot snapshotA = table.currentSnapshot(); // `B` commit Set deletedAFiles = Sets.newHashSet(); - table.newOverwrite() - .addFile(FILE_B) - .deleteFile(FILE_A) - .deleteWith(deletedAFiles::add) - .commit(); + table.newOverwrite().addFile(FILE_B).deleteFile(FILE_A).deleteWith(deletedAFiles::add).commit(); Assert.assertTrue("No files should be physically deleted", deletedAFiles.isEmpty()); // pick the snapshot 'B` Snapshot snapshotB = readMetadata().currentSnapshot(); // `C` commit to let cherry-pick take effect, and avoid fast-forward of `B` with cherry-pick - table.newAppend() - .appendFile(FILE_C) - .commit(); + table.newAppend().appendFile(FILE_C).commit(); Snapshot snapshotC = readMetadata().currentSnapshot(); // Move the table back to `A` - table.manageSnapshots() - .setCurrentSnapshot(snapshotA.snapshotId()) - .commit(); + table.manageSnapshots().setCurrentSnapshot(snapshotA.snapshotId()).commit(); // Generate A -> `D (B)` - table.manageSnapshots() - .cherrypick(snapshotB.snapshotId()) - .commit(); + table.manageSnapshots().cherrypick(snapshotB.snapshotId()).commit(); Snapshot snapshotD = readMetadata().currentSnapshot(); // Move the table back to `C` - table.manageSnapshots() - .setCurrentSnapshot(snapshotC.snapshotId()) - .commit(); + table.manageSnapshots().setCurrentSnapshot(snapshotC.snapshotId()).commit(); List deletedFiles = Lists.newArrayList(); // Expire `C` - table.expireSnapshots() + table + .expireSnapshots() .deleteWith(deletedFiles::add) .expireOlderThan(snapshotC.timestampMillis() + 1) .commit(); // Make sure no dataFiles are deleted for the B, C, D snapshot - Lists.newArrayList(snapshotB, snapshotC, 
snapshotD).forEach(i -> { - i.addedDataFiles(table.io()).forEach(item -> { - Assert.assertFalse(deletedFiles.contains(item.path().toString())); - }); - }); + Lists.newArrayList(snapshotB, snapshotC, snapshotD) + .forEach( + i -> { + i.addedDataFiles(table.io()) + .forEach( + item -> { + Assert.assertFalse(deletedFiles.contains(item.path().toString())); + }); + }); } /** - * Test on table below, and expiring `B` which is not in current table state. - * 1) Expire `B` - * 2) All commit - * Table: A - C - D (B) - * ` B (staged) + * Test on table below, and expiring `B` which is not in current table state. 1) Expire `B` 2) All + * commit Table: A - C - D (B) ` B (staged) */ @Test public void testWithExpiringStagedThenCherrypick() { // `A` commit - table.newAppend() - .appendFile(FILE_A) - .commit(); + table.newAppend().appendFile(FILE_A).commit(); // `B` commit - table.newAppend() - .appendFile(FILE_B) - .stageOnly() - .commit(); + table.newAppend().appendFile(FILE_B).stageOnly().commit(); // pick the snapshot that's staged but not committed TableMetadata base = readMetadata(); Snapshot snapshotB = base.snapshots().get(1); // `C` commit to let cherry-pick take effect, and avoid fast-forward of `B` with cherry-pick - table.newAppend() - .appendFile(FILE_C) - .commit(); + table.newAppend().appendFile(FILE_C).commit(); // `D (B)` cherry-pick commit - table.manageSnapshots() - .cherrypick(snapshotB.snapshotId()) - .commit(); + table.manageSnapshots().cherrypick(snapshotB.snapshotId()).commit(); base = readMetadata(); Snapshot snapshotD = base.snapshots().get(3); @@ -1028,121 +990,116 @@ public void testWithExpiringStagedThenCherrypick() { List deletedFiles = Lists.newArrayList(); // Expire `B` commit. - table.expireSnapshots() + table + .expireSnapshots() .deleteWith(deletedFiles::add) .expireSnapshotId(snapshotB.snapshotId()) .commit(); // Make sure no dataFiles are deleted for the staged snapshot - Lists.newArrayList(snapshotB).forEach(i -> { - i.addedDataFiles(table.io()).forEach(item -> { - Assert.assertFalse(deletedFiles.contains(item.path().toString())); - }); - }); + Lists.newArrayList(snapshotB) + .forEach( + i -> { + i.addedDataFiles(table.io()) + .forEach( + item -> { + Assert.assertFalse(deletedFiles.contains(item.path().toString())); + }); + }); // Expire all snapshots including cherry-pick - table.expireSnapshots() + table + .expireSnapshots() .deleteWith(deletedFiles::add) .expireOlderThan(table.currentSnapshot().timestampMillis() + 1) .commit(); // Make sure no dataFiles are deleted for the staged and cherry-pick - Lists.newArrayList(snapshotB, snapshotD).forEach(i -> { - i.addedDataFiles(table.io()).forEach(item -> { - Assert.assertFalse(deletedFiles.contains(item.path().toString())); - }); - }); + Lists.newArrayList(snapshotB, snapshotD) + .forEach( + i -> { + i.addedDataFiles(table.io()) + .forEach( + item -> { + Assert.assertFalse(deletedFiles.contains(item.path().toString())); + }); + }); } @Test public void testExpireSnapshotsWhenGarbageCollectionDisabled() { - table.updateProperties() - .set(TableProperties.GC_ENABLED, "false") - .commit(); + table.updateProperties().set(TableProperties.GC_ENABLED, "false").commit(); - table.newAppend() - .appendFile(FILE_A) - .commit(); + table.newAppend().appendFile(FILE_A).commit(); - AssertHelpers.assertThrows("Should complain about expiring snapshots", - ValidationException.class, "Cannot expire snapshots: GC is disabled", + AssertHelpers.assertThrows( + "Should complain about expiring snapshots", + ValidationException.class, + "Cannot 
expire snapshots: GC is disabled", () -> table.expireSnapshots()); } @Test public void testExpireWithDefaultRetainLast() { - table.newAppend() - .appendFile(FILE_A) - .commit(); + table.newAppend().appendFile(FILE_A).commit(); - table.newAppend() - .appendFile(FILE_B) - .commit(); + table.newAppend().appendFile(FILE_B).commit(); - table.newAppend() - .appendFile(FILE_C) - .commit(); + table.newAppend().appendFile(FILE_C).commit(); Assert.assertEquals("Expected 3 snapshots", 3, Iterables.size(table.snapshots())); - table.updateProperties() - .set(TableProperties.MIN_SNAPSHOTS_TO_KEEP, "3") - .commit(); + table.updateProperties().set(TableProperties.MIN_SNAPSHOTS_TO_KEEP, "3").commit(); Set deletedFiles = Sets.newHashSet(); Snapshot snapshotBeforeExpiration = table.currentSnapshot(); - table.expireSnapshots() + table + .expireSnapshots() .expireOlderThan(System.currentTimeMillis()) .deleteWith(deletedFiles::add) .commit(); - Assert.assertEquals("Should not change current snapshot", snapshotBeforeExpiration, table.currentSnapshot()); + Assert.assertEquals( + "Should not change current snapshot", snapshotBeforeExpiration, table.currentSnapshot()); Assert.assertEquals("Should keep 3 snapshots", 3, Iterables.size(table.snapshots())); Assert.assertTrue("Should not delete data", deletedFiles.isEmpty()); } @Test public void testExpireWithDefaultSnapshotAge() { - table.newAppend() - .appendFile(FILE_A) - .commit(); + table.newAppend().appendFile(FILE_A).commit(); Snapshot firstSnapshot = table.currentSnapshot(); waitUntilAfter(firstSnapshot.timestampMillis()); - table.newAppend() - .appendFile(FILE_B) - .commit(); + table.newAppend().appendFile(FILE_B).commit(); Snapshot secondSnapshot = table.currentSnapshot(); waitUntilAfter(secondSnapshot.timestampMillis()); - table.newAppend() - .appendFile(FILE_C) - .commit(); + table.newAppend().appendFile(FILE_C).commit(); Snapshot thirdSnapshot = table.currentSnapshot(); waitUntilAfter(thirdSnapshot.timestampMillis()); Assert.assertEquals("Expected 3 snapshots", 3, Iterables.size(table.snapshots())); - table.updateProperties() - .set(TableProperties.MAX_SNAPSHOT_AGE_MS, "1") - .commit(); + table.updateProperties().set(TableProperties.MAX_SNAPSHOT_AGE_MS, "1").commit(); Set deletedFiles = Sets.newHashSet(); // rely solely on default configs - table.expireSnapshots() - .deleteWith(deletedFiles::add) - .commit(); + table.expireSnapshots().deleteWith(deletedFiles::add).commit(); - Assert.assertEquals("Should not change current snapshot", thirdSnapshot, table.currentSnapshot()); + Assert.assertEquals( + "Should not change current snapshot", thirdSnapshot, table.currentSnapshot()); Assert.assertEquals("Should keep 1 snapshot", 1, Iterables.size(table.snapshots())); - Assert.assertEquals("Should remove expired manifest lists", - Sets.newHashSet(firstSnapshot.manifestListLocation(), secondSnapshot.manifestListLocation()), + Assert.assertEquals( + "Should remove expired manifest lists", + Sets.newHashSet( + firstSnapshot.manifestListLocation(), secondSnapshot.manifestListLocation()), deletedFiles); } @@ -1151,82 +1108,85 @@ public void testExpireWithDeleteFiles() { Assume.assumeTrue("Delete files only supported in V2 spec", formatVersion == 2); // Data Manifest => File_A - table.newAppend() - .appendFile(FILE_A) - .commit(); + table.newAppend().appendFile(FILE_A).commit(); Snapshot firstSnapshot = table.currentSnapshot(); // Data Manifest => FILE_A // Delete Manifest => FILE_A_DELETES - table.newRowDelta() - .addDeletes(FILE_A_DELETES) - .commit(); + 
table.newRowDelta().addDeletes(FILE_A_DELETES).commit(); Snapshot secondSnapshot = table.currentSnapshot(); - Assert.assertEquals("Should have 1 data manifest", 1, secondSnapshot.dataManifests(table.io()).size()); - Assert.assertEquals("Should have 1 delete manifest", 1, secondSnapshot.deleteManifests(table.io()).size()); + Assert.assertEquals( + "Should have 1 data manifest", 1, secondSnapshot.dataManifests(table.io()).size()); + Assert.assertEquals( + "Should have 1 delete manifest", 1, secondSnapshot.deleteManifests(table.io()).size()); // FILE_A and FILE_A_DELETES move into "DELETED" state - table.newRewrite() + table + .newRewrite() .rewriteFiles( ImmutableSet.of(FILE_A), ImmutableSet.of(FILE_A_DELETES), // deleted ImmutableSet.of(FILE_B), ImmutableSet.of(FILE_B_DELETES)) // added .validateFromSnapshot(secondSnapshot.snapshotId()) .commit(); Snapshot thirdSnapshot = table.currentSnapshot(); - Set manifestOfDeletedFiles = thirdSnapshot.allManifests(table.io()).stream().filter( - ManifestFile::hasDeletedFiles).collect(Collectors.toSet()); - Assert.assertEquals("Should have two manifests of deleted files", 2, - manifestOfDeletedFiles.size()); + Set manifestOfDeletedFiles = + thirdSnapshot.allManifests(table.io()).stream() + .filter(ManifestFile::hasDeletedFiles) + .collect(Collectors.toSet()); + Assert.assertEquals( + "Should have two manifests of deleted files", 2, manifestOfDeletedFiles.size()); - // Need one more commit before manifests of files of DELETED state get cleared from current snapshot. - table.newAppend() - .appendFile(FILE_C) - .commit(); + // Need one more commit before manifests of files of DELETED state get cleared from current + // snapshot. + table.newAppend().appendFile(FILE_C).commit(); Snapshot fourthSnapshot = table.currentSnapshot(); long fourthSnapshotTs = waitUntilAfter(fourthSnapshot.timestampMillis()); Set deletedFiles = Sets.newHashSet(); - table.expireSnapshots() + table + .expireSnapshots() .expireOlderThan(fourthSnapshotTs) .deleteWith(deletedFiles::add) .commit(); - Assert.assertEquals("Should remove old delete files and delete file manifests", + Assert.assertEquals( + "Should remove old delete files and delete file manifests", ImmutableSet.builder() .add(FILE_A.path()) .add(FILE_A_DELETES.path()) .add(firstSnapshot.manifestListLocation()) .add(secondSnapshot.manifestListLocation()) .add(thirdSnapshot.manifestListLocation()) - .addAll(secondSnapshot.allManifests(FILE_IO).stream().map(ManifestFile::path).collect(Collectors.toList())) - .addAll(manifestOfDeletedFiles.stream().map(ManifestFile::path).collect(Collectors.toList())) + .addAll( + secondSnapshot.allManifests(FILE_IO).stream() + .map(ManifestFile::path) + .collect(Collectors.toList())) + .addAll( + manifestOfDeletedFiles.stream() + .map(ManifestFile::path) + .collect(Collectors.toList())) .build(), deletedFiles); } @Test public void testTagExpiration() { - table.newAppend() - .appendFile(FILE_A) - .commit(); + table.newAppend().appendFile(FILE_A).commit(); long now = System.currentTimeMillis(); long maxAgeMs = 100; long expirationTime = now + maxAgeMs; - table.manageSnapshots() + table + .manageSnapshots() .createTag("tag", table.currentSnapshot().snapshotId()) .setMaxRefAgeMs("tag", maxAgeMs) .commit(); - table.newAppend() - .appendFile(FILE_B) - .commit(); + table.newAppend().appendFile(FILE_B).commit(); - table.manageSnapshots() - .createBranch("branch", table.currentSnapshot().snapshotId()) - .commit(); + table.manageSnapshots().createBranch("branch", 
table.currentSnapshot().snapshotId()).commit(); waitUntilAfter(expirationTime); @@ -1239,26 +1199,21 @@ public void testTagExpiration() { @Test public void testBranchExpiration() { - table.newAppend() - .appendFile(FILE_A) - .commit(); + table.newAppend().appendFile(FILE_A).commit(); long now = System.currentTimeMillis(); long maxAgeMs = 100; long expirationTime = now + maxAgeMs; - table.manageSnapshots() + table + .manageSnapshots() .createBranch("branch", table.currentSnapshot().snapshotId()) .setMaxRefAgeMs("branch", maxAgeMs) .commit(); - table.newAppend() - .appendFile(FILE_B) - .commit(); + table.newAppend().appendFile(FILE_B).commit(); - table.manageSnapshots() - .createTag("tag", table.currentSnapshot().snapshotId()) - .commit(); + table.manageSnapshots().createTag("tag", table.currentSnapshot().snapshotId()).commit(); waitUntilAfter(expirationTime); @@ -1271,13 +1226,9 @@ public void testBranchExpiration() { @Test public void testMultipleRefsAndCleanExpiredFilesFails() { - table.newAppend() - .appendFile(FILE_A) - .commit(); + table.newAppend().appendFile(FILE_A).commit(); - table.manageSnapshots() - .createTag("TagA", table.currentSnapshot().snapshotId()) - .commit(); + table.manageSnapshots().createTag("TagA", table.currentSnapshot().snapshotId()).commit(); AssertHelpers.assertThrows( "Should fail removing snapshots and files when there is more than 1 ref", @@ -1288,21 +1239,15 @@ public void testMultipleRefsAndCleanExpiredFilesFails() { @Test public void testFailRemovingSnapshotWhenStillReferencedByBranch() { - table.newAppend() - .appendFile(FILE_A) - .commit(); + table.newAppend().appendFile(FILE_A).commit(); - AppendFiles append = table.newAppend() - .appendFile(FILE_B) - .stageOnly(); + AppendFiles append = table.newAppend().appendFile(FILE_B).stageOnly(); long snapshotId = append.apply().snapshotId(); append.commit(); - table.manageSnapshots() - .createBranch("branch", snapshotId) - .commit(); + table.manageSnapshots().createBranch("branch", snapshotId).commit(); AssertHelpers.assertThrows( "Should fail removing snapshot when it is still referenced", @@ -1313,121 +1258,102 @@ public void testFailRemovingSnapshotWhenStillReferencedByBranch() { @Test public void testFailRemovingSnapshotWhenStillReferencedByTag() { - table.newAppend() - .appendFile(FILE_A) - .commit(); + table.newAppend().appendFile(FILE_A).commit(); long snapshotId = table.currentSnapshot().snapshotId(); - table.manageSnapshots() - .createTag("tag", snapshotId) - .commit(); + table.manageSnapshots().createTag("tag", snapshotId).commit(); // commit another snapshot so the first one isn't referenced by main - table.newAppend() - .appendFile(FILE_B) - .commit(); + table.newAppend().appendFile(FILE_B).commit(); AssertHelpers.assertThrows( "Should fail removing snapshot when it is still referenced", - IllegalArgumentException.class, + IllegalArgumentException.class, "Cannot expire 1. 
Still referenced by refs: [tag]", () -> table.expireSnapshots().expireSnapshotId(snapshotId).commit()); } @Test public void testRetainUnreferencedSnapshotsWithinExpirationAge() { - table.newAppend() - .appendFile(FILE_A) - .commit(); + table.newAppend().appendFile(FILE_A).commit(); long expireTimestampSnapshotA = waitUntilAfter(table.currentSnapshot().timestampMillis()); waitUntilAfter(expireTimestampSnapshotA); - table.newAppend() - .appendFile(FILE_B) - .stageOnly() - .commit(); + table.newAppend().appendFile(FILE_B).stageOnly().commit(); - table.newAppend() - .appendFile(FILE_C) - .commit(); + table.newAppend().appendFile(FILE_C).commit(); - table.expireSnapshots() - .expireOlderThan(expireTimestampSnapshotA) - .commit(); + table.expireSnapshots().expireOlderThan(expireTimestampSnapshotA).commit(); Assert.assertEquals(2, table.ops().current().snapshots().size()); } @Test public void testUnreferencedSnapshotParentOfTag() { - table.newAppend() - .appendFile(FILE_A) - .commit(); + table.newAppend().appendFile(FILE_A).commit(); long initialSnapshotId = table.currentSnapshot().snapshotId(); // this will be expired because it is still unreferenced with a tag on its child snapshot - table.newAppend() - .appendFile(FILE_B) - .commit(); + table.newAppend().appendFile(FILE_B).commit(); long expiredSnapshotId = table.currentSnapshot().snapshotId(); long expireTimestampSnapshotB = waitUntilAfter(table.currentSnapshot().timestampMillis()); waitUntilAfter(expireTimestampSnapshotB); - table.newAppend() - .appendFile(FILE_C) - .commit(); + table.newAppend().appendFile(FILE_C).commit(); - // create a tag that references the current history and rewrite main to point to the initial snapshot - table.manageSnapshots() + // create a tag that references the current history and rewrite main to point to the initial + // snapshot + table + .manageSnapshots() .createTag("tag", table.currentSnapshot().snapshotId()) .replaceBranch("main", initialSnapshotId) .commit(); - table.expireSnapshots() + table + .expireSnapshots() .expireOlderThan(expireTimestampSnapshotB) .cleanExpiredFiles(false) .commit(); - Assert.assertNull("Should remove unreferenced snapshot beneath a tag", table.snapshot(expiredSnapshotId)); + Assert.assertNull( + "Should remove unreferenced snapshot beneath a tag", table.snapshot(expiredSnapshotId)); Assert.assertEquals(2, table.ops().current().snapshots().size()); } @Test public void testSnapshotParentOfBranchNotUnreferenced() { - // similar to testUnreferencedSnapshotParentOfTag, but checks that branch history is not considered unreferenced - table.newAppend() - .appendFile(FILE_A) - .commit(); + // similar to testUnreferencedSnapshotParentOfTag, but checks that branch history is not + // considered unreferenced + table.newAppend().appendFile(FILE_A).commit(); long initialSnapshotId = table.currentSnapshot().snapshotId(); // this will be expired because it is still unreferenced with a tag on its child snapshot - table.newAppend() - .appendFile(FILE_B) - .commit(); + table.newAppend().appendFile(FILE_B).commit(); long snapshotId = table.currentSnapshot().snapshotId(); long expireTimestampSnapshotB = waitUntilAfter(table.currentSnapshot().timestampMillis()); waitUntilAfter(expireTimestampSnapshotB); - table.newAppend() - .appendFile(FILE_C) - .commit(); + table.newAppend().appendFile(FILE_C).commit(); - // create a branch that references the current history and rewrite main to point to the initial snapshot - table.manageSnapshots() + // create a branch that references the current history and 
rewrite main to point to the initial + // snapshot + table + .manageSnapshots() .createBranch("branch", table.currentSnapshot().snapshotId()) .setMaxSnapshotAgeMs("branch", Long.MAX_VALUE) .replaceBranch("main", initialSnapshotId) .commit(); - table.expireSnapshots() + table + .expireSnapshots() .expireOlderThan(expireTimestampSnapshotB) .cleanExpiredFiles(false) .commit(); @@ -1455,13 +1381,15 @@ public void testMinSnapshotsToKeepMultipleBranches() { long expirationTime = System.currentTimeMillis() + maxSnapshotAgeMs; // configure main so that the initial snapshot will expire - table.manageSnapshots() + table + .manageSnapshots() .setMinSnapshotsToKeep(SnapshotRef.MAIN_BRANCH, 1) .setMaxSnapshotAgeMs(SnapshotRef.MAIN_BRANCH, 1) .commit(); // retain 3 snapshots on branch (including the initial snapshot) - table.manageSnapshots() + table + .manageSnapshots() .createBranch("branch", branchSnapshotId) .setMinSnapshotsToKeep("branch", 3) .setMaxSnapshotAgeMs("branch", maxSnapshotAgeMs) @@ -1470,16 +1398,16 @@ public void testMinSnapshotsToKeepMultipleBranches() { waitUntilAfter(expirationTime); table.expireSnapshots().cleanExpiredFiles(false).commit(); - Assert.assertEquals("Should have 3 snapshots (none removed)", 3, Iterables.size(table.snapshots())); + Assert.assertEquals( + "Should have 3 snapshots (none removed)", 3, Iterables.size(table.snapshots())); // stop retaining snapshots from the branch - table.manageSnapshots() - .setMinSnapshotsToKeep("branch", 1) - .commit(); + table.manageSnapshots().setMinSnapshotsToKeep("branch", 1).commit(); table.expireSnapshots().cleanExpiredFiles(false).commit(); - Assert.assertEquals("Should have 2 snapshots (initial removed)", 2, Iterables.size(table.snapshots())); + Assert.assertEquals( + "Should have 2 snapshots (initial removed)", 2, Iterables.size(table.snapshots())); Assert.assertNull(table.ops().current().snapshot(initialSnapshotId)); } @@ -1496,7 +1424,8 @@ public void testMaxSnapshotAgeMultipleBranches() { table.newAppend().appendFile(FILE_B).commit(); // configure main so that the initial snapshot will expire - table.manageSnapshots() + table + .manageSnapshots() .setMaxSnapshotAgeMs(SnapshotRef.MAIN_BRANCH, ageMs) .setMinSnapshotsToKeep(SnapshotRef.MAIN_BRANCH, 1) .commit(); @@ -1509,7 +1438,8 @@ public void testMaxSnapshotAgeMultipleBranches() { Assert.assertEquals("Should have 3 snapshots", 3, Iterables.size(table.snapshots())); // retain all snapshots on branch (including the initial snapshot) - table.manageSnapshots() + table + .manageSnapshots() .createBranch("branch", branchSnapshotId) .setMinSnapshotsToKeep("branch", 1) .setMaxSnapshotAgeMs("branch", Long.MAX_VALUE) @@ -1517,16 +1447,16 @@ public void testMaxSnapshotAgeMultipleBranches() { table.expireSnapshots().cleanExpiredFiles(false).commit(); - Assert.assertEquals("Should have 3 snapshots (none removed)", 3, Iterables.size(table.snapshots())); + Assert.assertEquals( + "Should have 3 snapshots (none removed)", 3, Iterables.size(table.snapshots())); // allow the initial snapshot to age off from branch - table.manageSnapshots() - .setMaxSnapshotAgeMs("branch", ageMs) - .commit(); + table.manageSnapshots().setMaxSnapshotAgeMs("branch", ageMs).commit(); table.expireSnapshots().cleanExpiredFiles(false).commit(); - Assert.assertEquals("Should have 2 snapshots (initial removed)", 2, Iterables.size(table.snapshots())); + Assert.assertEquals( + "Should have 2 snapshots (initial removed)", 2, Iterables.size(table.snapshots())); 
Assert.assertNull(table.ops().current().snapshot(initialSnapshotId)); } } diff --git a/core/src/test/java/org/apache/iceberg/TestReplacePartitions.java b/core/src/test/java/org/apache/iceberg/TestReplacePartitions.java index 776c465ca676..9c73d2d9576b 100644 --- a/core/src/test/java/org/apache/iceberg/TestReplacePartitions.java +++ b/core/src/test/java/org/apache/iceberg/TestReplacePartitions.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg; import java.io.File; @@ -32,43 +31,48 @@ @RunWith(Parameterized.class) public class TestReplacePartitions extends TableTestBase { - static final DataFile FILE_E = DataFiles.builder(SPEC) - .withPath("/path/to/data-e.parquet") - .withFileSizeInBytes(0) - .withPartitionPath("data_bucket=0") // same partition as FILE_A - .withRecordCount(0) - .build(); - - static final DataFile FILE_F = DataFiles.builder(SPEC) - .withPath("/path/to/data-f.parquet") - .withFileSizeInBytes(0) - .withPartitionPath("data_bucket=1") // same partition as FILE_B - .withRecordCount(0) - .build(); - - static final DataFile FILE_G = DataFiles.builder(SPEC) - .withPath("/path/to/data-g.parquet") - .withFileSizeInBytes(0) - .withPartitionPath("data_bucket=10") // no other partition - .withRecordCount(0) - .build(); - - static final DataFile FILE_UNPARTITIONED_A = DataFiles.builder(PartitionSpec.unpartitioned()) - .withPath("/path/to/data-unpartitioned-a.parquet") - .withFileSizeInBytes(10) - .withRecordCount(1) - .build(); - - static final DeleteFile FILE_UNPARTITIONED_A_DELETES = FileMetadata.deleteFileBuilder(PartitionSpec.unpartitioned()) - .ofPositionDeletes() - .withPath("/path/to/data-unpartitioned-a-deletes.parquet") - .withFileSizeInBytes(10) - .withRecordCount(1) - .build(); + static final DataFile FILE_E = + DataFiles.builder(SPEC) + .withPath("/path/to/data-e.parquet") + .withFileSizeInBytes(0) + .withPartitionPath("data_bucket=0") // same partition as FILE_A + .withRecordCount(0) + .build(); + + static final DataFile FILE_F = + DataFiles.builder(SPEC) + .withPath("/path/to/data-f.parquet") + .withFileSizeInBytes(0) + .withPartitionPath("data_bucket=1") // same partition as FILE_B + .withRecordCount(0) + .build(); + + static final DataFile FILE_G = + DataFiles.builder(SPEC) + .withPath("/path/to/data-g.parquet") + .withFileSizeInBytes(0) + .withPartitionPath("data_bucket=10") // no other partition + .withRecordCount(0) + .build(); + + static final DataFile FILE_UNPARTITIONED_A = + DataFiles.builder(PartitionSpec.unpartitioned()) + .withPath("/path/to/data-unpartitioned-a.parquet") + .withFileSizeInBytes(10) + .withRecordCount(1) + .build(); + + static final DeleteFile FILE_UNPARTITIONED_A_DELETES = + FileMetadata.deleteFileBuilder(PartitionSpec.unpartitioned()) + .ofPositionDeletes() + .withPath("/path/to/data-unpartitioned-a-deletes.parquet") + .withFileSizeInBytes(10) + .withRecordCount(1) + .build(); @Parameterized.Parameters(name = "formatVersion = {0}") public static Object[] parameters() { - return new Object[] { 1, 2 }; + return new Object[] {1, 2}; } public TestReplacePartitions(int formatVersion) { @@ -77,30 +81,29 @@ public TestReplacePartitions(int formatVersion) { @Test public void testReplaceOnePartition() { - table.newFastAppend() - .appendFile(FILE_A) - .appendFile(FILE_B) - .commit(); + table.newFastAppend().appendFile(FILE_A).appendFile(FILE_B).commit(); TableMetadata base = readMetadata(); long baseId = base.currentSnapshot().snapshotId(); - table.newReplacePartitions() - 
.addFile(FILE_E) - .commit(); + table.newReplacePartitions().addFile(FILE_E).commit(); long replaceId = readMetadata().currentSnapshot().snapshotId(); Assert.assertNotEquals("Should create a new snapshot", baseId, replaceId); - Assert.assertEquals("Table should have 2 manifests", - 2, table.currentSnapshot().allManifests(table.io()).size()); + Assert.assertEquals( + "Table should have 2 manifests", + 2, + table.currentSnapshot().allManifests(table.io()).size()); // manifest is not merged because it is less than the minimum - validateManifestEntries(table.currentSnapshot().allManifests(table.io()).get(0), + validateManifestEntries( + table.currentSnapshot().allManifests(table.io()).get(0), ids(replaceId), files(FILE_E), statuses(Status.ADDED)); - validateManifestEntries(table.currentSnapshot().allManifests(table.io()).get(1), + validateManifestEntries( + table.currentSnapshot().allManifests(table.io()).get(1), ids(replaceId, baseId), files(FILE_A, FILE_B), statuses(Status.DELETED, Status.EXISTING)); @@ -111,24 +114,20 @@ public void testReplaceAndMergeOnePartition() { // ensure the overwrite results in a merge table.updateProperties().set(TableProperties.MANIFEST_MIN_MERGE_COUNT, "1").commit(); - table.newFastAppend() - .appendFile(FILE_A) - .appendFile(FILE_B) - .commit(); + table.newFastAppend().appendFile(FILE_A).appendFile(FILE_B).commit(); TableMetadata base = readMetadata(); long baseId = base.currentSnapshot().snapshotId(); - table.newReplacePartitions() - .addFile(FILE_E) - .commit(); + table.newReplacePartitions().addFile(FILE_E).commit(); long replaceId = readMetadata().currentSnapshot().snapshotId(); Assert.assertNotEquals("Should create a new snapshot", baseId, replaceId); - Assert.assertEquals("Table should have 1 manifest", - 1, table.currentSnapshot().allManifests(table.io()).size()); + Assert.assertEquals( + "Table should have 1 manifest", 1, table.currentSnapshot().allManifests(table.io()).size()); - validateManifestEntries(table.currentSnapshot().allManifests(table.io()).get(0), + validateManifestEntries( + table.currentSnapshot().allManifests(table.io()).get(0), ids(replaceId, replaceId, baseId), files(FILE_E, FILE_A, FILE_B), statuses(Status.ADDED, Status.DELETED, Status.EXISTING)); @@ -139,38 +138,43 @@ public void testReplaceWithUnpartitionedTable() throws IOException { File tableDir = temp.newFolder(); Assert.assertTrue(tableDir.delete()); - Table unpartitioned = TestTables.create( - tableDir, "unpartitioned", SCHEMA, PartitionSpec.unpartitioned(), formatVersion); + Table unpartitioned = + TestTables.create( + tableDir, "unpartitioned", SCHEMA, PartitionSpec.unpartitioned(), formatVersion); - Assert.assertEquals("Table version should be 0", - 0, (long) TestTables.metadataVersion("unpartitioned")); + Assert.assertEquals( + "Table version should be 0", 0, (long) TestTables.metadataVersion("unpartitioned")); - unpartitioned.newAppend() - .appendFile(FILE_A) - .commit(); + unpartitioned.newAppend().appendFile(FILE_A).commit(); // make sure the data was successfully added - Assert.assertEquals("Table version should be 1", - 1, (long) TestTables.metadataVersion("unpartitioned")); + Assert.assertEquals( + "Table version should be 1", 1, (long) TestTables.metadataVersion("unpartitioned")); validateSnapshot(null, TestTables.readMetadata("unpartitioned").currentSnapshot(), FILE_A); - unpartitioned.newReplacePartitions() - .addFile(FILE_B) - .commit(); + unpartitioned.newReplacePartitions().addFile(FILE_B).commit(); - Assert.assertEquals("Table version should be 2", - 2, (long) 
TestTables.metadataVersion("unpartitioned")); + Assert.assertEquals( + "Table version should be 2", 2, (long) TestTables.metadataVersion("unpartitioned")); TableMetadata replaceMetadata = TestTables.readMetadata("unpartitioned"); long replaceId = replaceMetadata.currentSnapshot().snapshotId(); - Assert.assertEquals("Table should have 2 manifests", - 2, replaceMetadata.currentSnapshot().allManifests(unpartitioned.io()).size()); + Assert.assertEquals( + "Table should have 2 manifests", + 2, + replaceMetadata.currentSnapshot().allManifests(unpartitioned.io()).size()); - validateManifestEntries(replaceMetadata.currentSnapshot().allManifests(unpartitioned.io()).get(0), - ids(replaceId), files(FILE_B), statuses(Status.ADDED)); + validateManifestEntries( + replaceMetadata.currentSnapshot().allManifests(unpartitioned.io()).get(0), + ids(replaceId), + files(FILE_B), + statuses(Status.ADDED)); - validateManifestEntries(replaceMetadata.currentSnapshot().allManifests(unpartitioned.io()).get(1), - ids(replaceId), files(FILE_A), statuses(Status.DELETED)); + validateManifestEntries( + replaceMetadata.currentSnapshot().allManifests(unpartitioned.io()).get(1), + ids(replaceId), + files(FILE_A), + statuses(Status.DELETED)); } @Test @@ -178,90 +182,87 @@ public void testReplaceAndMergeWithUnpartitionedTable() throws IOException { File tableDir = temp.newFolder(); Assert.assertTrue(tableDir.delete()); - Table unpartitioned = TestTables.create( - tableDir, "unpartitioned", SCHEMA, PartitionSpec.unpartitioned(), formatVersion); + Table unpartitioned = + TestTables.create( + tableDir, "unpartitioned", SCHEMA, PartitionSpec.unpartitioned(), formatVersion); // ensure the overwrite results in a merge unpartitioned.updateProperties().set(TableProperties.MANIFEST_MIN_MERGE_COUNT, "1").commit(); - Assert.assertEquals("Table version should be 1", - 1, (long) TestTables.metadataVersion("unpartitioned")); + Assert.assertEquals( + "Table version should be 1", 1, (long) TestTables.metadataVersion("unpartitioned")); - unpartitioned.newAppend() - .appendFile(FILE_A) - .commit(); + unpartitioned.newAppend().appendFile(FILE_A).commit(); // make sure the data was successfully added - Assert.assertEquals("Table version should be 2", - 2, (long) TestTables.metadataVersion("unpartitioned")); + Assert.assertEquals( + "Table version should be 2", 2, (long) TestTables.metadataVersion("unpartitioned")); validateSnapshot(null, TestTables.readMetadata("unpartitioned").currentSnapshot(), FILE_A); - unpartitioned.newReplacePartitions() - .addFile(FILE_B) - .commit(); + unpartitioned.newReplacePartitions().addFile(FILE_B).commit(); - Assert.assertEquals("Table version should be 3", - 3, (long) TestTables.metadataVersion("unpartitioned")); + Assert.assertEquals( + "Table version should be 3", 3, (long) TestTables.metadataVersion("unpartitioned")); TableMetadata replaceMetadata = TestTables.readMetadata("unpartitioned"); long replaceId = replaceMetadata.currentSnapshot().snapshotId(); - Assert.assertEquals("Table should have 1 manifest", - 1, replaceMetadata.currentSnapshot().allManifests(unpartitioned.io()).size()); + Assert.assertEquals( + "Table should have 1 manifest", + 1, + replaceMetadata.currentSnapshot().allManifests(unpartitioned.io()).size()); - validateManifestEntries(replaceMetadata.currentSnapshot().allManifests(unpartitioned.io()).get(0), - ids(replaceId, replaceId), files(FILE_B, FILE_A), statuses(Status.ADDED, Status.DELETED)); + validateManifestEntries( + 
replaceMetadata.currentSnapshot().allManifests(unpartitioned.io()).get(0), + ids(replaceId, replaceId), + files(FILE_B, FILE_A), + statuses(Status.ADDED, Status.DELETED)); } @Test public void testValidationFailure() { - table.newFastAppend() - .appendFile(FILE_A) - .appendFile(FILE_B) - .commit(); + table.newFastAppend().appendFile(FILE_A).appendFile(FILE_B).commit(); TableMetadata base = readMetadata(); long baseId = base.currentSnapshot().snapshotId(); - ReplacePartitions replace = table.newReplacePartitions() - .addFile(FILE_F) - .addFile(FILE_G) - .validateAppendOnly(); + ReplacePartitions replace = + table.newReplacePartitions().addFile(FILE_F).addFile(FILE_G).validateAppendOnly(); - AssertHelpers.assertThrows("Should reject commit with file not matching delete expression", - ValidationException.class, "Cannot commit file that conflicts with existing partition", + AssertHelpers.assertThrows( + "Should reject commit with file not matching delete expression", + ValidationException.class, + "Cannot commit file that conflicts with existing partition", replace::commit); - Assert.assertEquals("Should not create a new snapshot", - baseId, readMetadata().currentSnapshot().snapshotId()); + Assert.assertEquals( + "Should not create a new snapshot", baseId, readMetadata().currentSnapshot().snapshotId()); } @Test public void testValidationSuccess() { - table.newFastAppend() - .appendFile(FILE_A) - .appendFile(FILE_B) - .commit(); + table.newFastAppend().appendFile(FILE_A).appendFile(FILE_B).commit(); TableMetadata base = readMetadata(); long baseId = base.currentSnapshot().snapshotId(); - table.newReplacePartitions() - .addFile(FILE_G) - .validateAppendOnly() - .commit(); + table.newReplacePartitions().addFile(FILE_G).validateAppendOnly().commit(); long replaceId = readMetadata().currentSnapshot().snapshotId(); Assert.assertNotEquals("Should create a new snapshot", baseId, replaceId); - Assert.assertEquals("Table should have 2 manifests", - 2, table.currentSnapshot().allManifests(table.io()).size()); + Assert.assertEquals( + "Table should have 2 manifests", + 2, + table.currentSnapshot().allManifests(table.io()).size()); // manifest is not merged because it is less than the minimum - validateManifestEntries(table.currentSnapshot().allManifests(table.io()).get(0), + validateManifestEntries( + table.currentSnapshot().allManifests(table.io()).get(0), ids(replaceId), files(FILE_G), statuses(Status.ADDED)); - validateManifestEntries(table.currentSnapshot().allManifests(table.io()).get(1), + validateManifestEntries( + table.currentSnapshot().allManifests(table.io()).get(1), ids(baseId, baseId), files(FILE_A, FILE_B), statuses(Status.ADDED, Status.ADDED)); @@ -269,31 +270,33 @@ public void testValidationSuccess() { @Test public void testValidationNotInvoked() { - table.newFastAppend() - .appendFile(FILE_A) - .commit(); + table.newFastAppend().appendFile(FILE_A).commit(); TableMetadata base = readMetadata(); // Two concurrent ReplacePartitions with No Validation Enabled - table.newReplacePartitions() + table + .newReplacePartitions() .addFile(FILE_E) .validateFromSnapshot(base.currentSnapshot().snapshotId()) .commit(); - table.newReplacePartitions() + table + .newReplacePartitions() .addFile(FILE_A) // Replaces FILE_E which becomes Deleted .addFile(FILE_B) .validateFromSnapshot(base.currentSnapshot().snapshotId()) .commit(); long replaceId = readMetadata().currentSnapshot().snapshotId(); - Assert.assertEquals("Table should have 2 manifest", - 2, table.currentSnapshot().allManifests(table.io()).size()); - 
validateManifestEntries(table.currentSnapshot().allManifests(table.io()).get(0), + Assert.assertEquals( + "Table should have 2 manifest", 2, table.currentSnapshot().allManifests(table.io()).size()); + validateManifestEntries( + table.currentSnapshot().allManifests(table.io()).get(0), ids(replaceId, replaceId), files(FILE_A, FILE_B), statuses(Status.ADDED, Status.ADDED)); - validateManifestEntries(table.currentSnapshot().allManifests(table.io()).get(1), + validateManifestEntries( + table.currentSnapshot().allManifests(table.io()).get(1), ids(replaceId), files(FILE_E), statuses(Status.DELETED)); @@ -301,16 +304,15 @@ public void testValidationNotInvoked() { @Test public void testValidateWithDefaultSnapshotId() { - table.newReplacePartitions() - .addFile(FILE_A) - .commit(); + table.newReplacePartitions().addFile(FILE_A).commit(); // Concurrent Replace Partitions should fail with ValidationException ReplacePartitions replace = table.newReplacePartitions(); - AssertHelpers.assertThrows("Should reject commit with file matching partitions replaced", + AssertHelpers.assertThrows( + "Should reject commit with file matching partitions replaced", ValidationException.class, - "Found conflicting files that can contain records matching partitions " + - "[data_bucket=0, data_bucket=1]: [/path/to/data-a.parquet]", + "Found conflicting files that can contain records matching partitions " + + "[data_bucket=0, data_bucket=1]: [/path/to/data-a.parquet]", () -> replace .addFile(FILE_A) @@ -322,25 +324,22 @@ public void testValidateWithDefaultSnapshotId() { @Test public void testConcurrentReplaceConflict() { - table.newFastAppend() - .appendFile(FILE_A) - .appendFile(FILE_B) - .commit(); + table.newFastAppend().appendFile(FILE_A).appendFile(FILE_B).commit(); TableMetadata base = readMetadata(); long baseId = base.currentSnapshot().snapshotId(); // Concurrent Replace Partitions should fail with ValidationException - table.newReplacePartitions() - .addFile(FILE_A) - .commit(); + table.newReplacePartitions().addFile(FILE_A).commit(); - AssertHelpers.assertThrows("Should reject commit with file matching partitions replaced", + AssertHelpers.assertThrows( + "Should reject commit with file matching partitions replaced", ValidationException.class, - "Found conflicting files that can contain records matching partitions " + - "[data_bucket=0, data_bucket=1]: [/path/to/data-a.parquet]", + "Found conflicting files that can contain records matching partitions " + + "[data_bucket=0, data_bucket=1]: [/path/to/data-a.parquet]", () -> - table.newReplacePartitions() + table + .newReplacePartitions() .validateFromSnapshot(baseId) .addFile(FILE_A) .addFile(FILE_B) @@ -351,20 +350,17 @@ public void testConcurrentReplaceConflict() { @Test public void testConcurrentReplaceNoConflict() { - table.newFastAppend() - .appendFile(FILE_A) - .commit(); + table.newFastAppend().appendFile(FILE_A).commit(); TableMetadata base = readMetadata(); long id1 = base.currentSnapshot().snapshotId(); // Concurrent Replace Partitions should not fail if concerning different partitions - table.newReplacePartitions() - .addFile(FILE_A) - .commit(); + table.newReplacePartitions().addFile(FILE_A).commit(); long id2 = readMetadata().currentSnapshot().snapshotId(); - table.newReplacePartitions() + table + .newReplacePartitions() .validateFromSnapshot(id1) .validateNoConflictingData() .validateNoConflictingDeletes() @@ -372,13 +368,17 @@ public void testConcurrentReplaceNoConflict() { .commit(); long id3 = readMetadata().currentSnapshot().snapshotId(); - 
Assert.assertEquals("Table should have 2 manifests", - 2, table.currentSnapshot().allManifests(table.io()).size()); - validateManifestEntries(table.currentSnapshot().allManifests(table.io()).get(0), + Assert.assertEquals( + "Table should have 2 manifests", + 2, + table.currentSnapshot().allManifests(table.io()).size()); + validateManifestEntries( + table.currentSnapshot().allManifests(table.io()).get(0), ids(id3), files(FILE_B), statuses(Status.ADDED)); - validateManifestEntries(table.currentSnapshot().allManifests(table.io()).get(1), + validateManifestEntries( + table.currentSnapshot().allManifests(table.io()).get(1), ids(id2), files(FILE_A), statuses(Status.ADDED)); @@ -386,26 +386,25 @@ public void testConcurrentReplaceNoConflict() { @Test public void testConcurrentReplaceConflictNonPartitioned() { - Table unpartitioned = TestTables.create( - tableDir, "unpartitioned", SCHEMA, PartitionSpec.unpartitioned(), formatVersion); - unpartitioned.newAppend() - .appendFile(FILE_UNPARTITIONED_A) - .commit(); + Table unpartitioned = + TestTables.create( + tableDir, "unpartitioned", SCHEMA, PartitionSpec.unpartitioned(), formatVersion); + unpartitioned.newAppend().appendFile(FILE_UNPARTITIONED_A).commit(); TableMetadata replaceMetadata = TestTables.readMetadata("unpartitioned"); long replaceBaseId = replaceMetadata.currentSnapshot().snapshotId(); // Concurrent ReplacePartitions should fail with ValidationException - unpartitioned.newReplacePartitions() - .addFile(FILE_UNPARTITIONED_A) - .commit(); + unpartitioned.newReplacePartitions().addFile(FILE_UNPARTITIONED_A).commit(); - AssertHelpers.assertThrows("Should reject commit with file matching partitions replaced", + AssertHelpers.assertThrows( + "Should reject commit with file matching partitions replaced", ValidationException.class, - "Found conflicting files that can contain records matching true: " + - "[/path/to/data-unpartitioned-a.parquet]", + "Found conflicting files that can contain records matching true: " + + "[/path/to/data-unpartitioned-a.parquet]", () -> - unpartitioned.newReplacePartitions() + unpartitioned + .newReplacePartitions() .validateFromSnapshot(replaceBaseId) .validateNoConflictingData() .validateNoConflictingDeletes() @@ -415,24 +414,22 @@ public void testConcurrentReplaceConflictNonPartitioned() { @Test public void testAppendReplaceConflict() { - table.newFastAppend() - .appendFile(FILE_A) - .commit(); + table.newFastAppend().appendFile(FILE_A).commit(); TableMetadata base = readMetadata(); long baseId = base.currentSnapshot().snapshotId(); // Concurrent Append and ReplacePartition should fail with ValidationException - table.newFastAppend() - .appendFile(FILE_B) - .commit(); + table.newFastAppend().appendFile(FILE_B).commit(); - AssertHelpers.assertThrows("Should reject commit with file matching partitions replaced", + AssertHelpers.assertThrows( + "Should reject commit with file matching partitions replaced", ValidationException.class, - "Found conflicting files that can contain records matching partitions " + - "[data_bucket=0, data_bucket=1]: [/path/to/data-b.parquet]", + "Found conflicting files that can contain records matching partitions " + + "[data_bucket=0, data_bucket=1]: [/path/to/data-b.parquet]", () -> - table.newReplacePartitions() + table + .newReplacePartitions() .validateFromSnapshot(baseId) .validateNoConflictingData() .validateNoConflictingDeletes() @@ -443,21 +440,18 @@ public void testAppendReplaceConflict() { @Test public void testAppendReplaceNoConflict() { - table.newFastAppend() - 
.appendFile(FILE_A) - .commit(); + table.newFastAppend().appendFile(FILE_A).commit(); TableMetadata base = readMetadata(); long id1 = base.currentSnapshot().snapshotId(); // Concurrent Append and ReplacePartition should not conflict if concerning different partitions - table.newFastAppend() - .appendFile(FILE_B) - .commit(); + table.newFastAppend().appendFile(FILE_B).commit(); long id2 = readMetadata().currentSnapshot().snapshotId(); - table.newReplacePartitions() + table + .newReplacePartitions() .validateFromSnapshot(id1) .validateNoConflictingData() .validateNoConflictingDeletes() @@ -465,17 +459,22 @@ public void testAppendReplaceNoConflict() { .commit(); long id3 = readMetadata().currentSnapshot().snapshotId(); - Assert.assertEquals("Table should have 3 manifests", - 3, table.currentSnapshot().allManifests(table.io()).size()); - validateManifestEntries(table.currentSnapshot().allManifests(table.io()).get(0), + Assert.assertEquals( + "Table should have 3 manifests", + 3, + table.currentSnapshot().allManifests(table.io()).size()); + validateManifestEntries( + table.currentSnapshot().allManifests(table.io()).get(0), ids(id3), files(FILE_A), statuses(Status.ADDED)); - validateManifestEntries(table.currentSnapshot().allManifests(table.io()).get(1), + validateManifestEntries( + table.currentSnapshot().allManifests(table.io()).get(1), ids(id2), files(FILE_B), statuses(Status.ADDED)); - validateManifestEntries(table.currentSnapshot().allManifests(table.io()).get(2), + validateManifestEntries( + table.currentSnapshot().allManifests(table.io()).get(2), ids(id3), files(FILE_A), statuses(Status.DELETED)); @@ -483,26 +482,25 @@ public void testAppendReplaceNoConflict() { @Test public void testAppendReplaceConflictNonPartitioned() { - Table unpartitioned = TestTables.create( - tableDir, "unpartitioned", SCHEMA, PartitionSpec.unpartitioned(), formatVersion); - unpartitioned.newAppend() - .appendFile(FILE_UNPARTITIONED_A) - .commit(); + Table unpartitioned = + TestTables.create( + tableDir, "unpartitioned", SCHEMA, PartitionSpec.unpartitioned(), formatVersion); + unpartitioned.newAppend().appendFile(FILE_UNPARTITIONED_A).commit(); TableMetadata replaceMetadata = TestTables.readMetadata("unpartitioned"); long replaceBaseId = replaceMetadata.currentSnapshot().snapshotId(); // Concurrent Append and ReplacePartitions should fail with ValidationException - unpartitioned.newAppend() - .appendFile(FILE_UNPARTITIONED_A) - .commit(); + unpartitioned.newAppend().appendFile(FILE_UNPARTITIONED_A).commit(); - AssertHelpers.assertThrows("Should reject commit with file matching partitions replaced", + AssertHelpers.assertThrows( + "Should reject commit with file matching partitions replaced", ValidationException.class, - "Found conflicting files that can contain records matching true: " + - "[/path/to/data-unpartitioned-a.parquet]", + "Found conflicting files that can contain records matching true: " + + "[/path/to/data-unpartitioned-a.parquet]", () -> - unpartitioned.newReplacePartitions() + unpartitioned + .newReplacePartitions() .validateFromSnapshot(replaceBaseId) .validateNoConflictingData() .validateNoConflictingDeletes() @@ -513,25 +511,22 @@ public void testAppendReplaceConflictNonPartitioned() { @Test public void testDeleteReplaceConflict() { Assume.assumeTrue(formatVersion == 2); - table.newFastAppend() - .appendFile(FILE_A) - .commit(); + table.newFastAppend().appendFile(FILE_A).commit(); TableMetadata base = readMetadata(); long baseId = base.currentSnapshot().snapshotId(); // Concurrent Delete and 
ReplacePartition should fail with ValidationException - table.newRowDelta() - .addDeletes(FILE_A_DELETES) - .validateFromSnapshot(baseId) - .commit(); + table.newRowDelta().addDeletes(FILE_A_DELETES).validateFromSnapshot(baseId).commit(); - AssertHelpers.assertThrows("Should reject commit with file matching partitions replaced", + AssertHelpers.assertThrows( + "Should reject commit with file matching partitions replaced", ValidationException.class, - "Found new conflicting delete files that can apply to records matching " + - "[data_bucket=0]: [/path/to/data-a-deletes.parquet]", + "Found new conflicting delete files that can apply to records matching " + + "[data_bucket=0]: [/path/to/data-a-deletes.parquet]", () -> - table.newReplacePartitions() + table + .newReplacePartitions() .validateFromSnapshot(baseId) .validateNoConflictingData() .validateNoConflictingDeletes() @@ -543,26 +538,25 @@ public void testDeleteReplaceConflict() { public void testDeleteReplaceConflictNonPartitioned() { Assume.assumeTrue(formatVersion == 2); - Table unpartitioned = TestTables.create( - tableDir, "unpartitioned", SCHEMA, PartitionSpec.unpartitioned(), formatVersion); - unpartitioned.newAppend() - .appendFile(FILE_A) - .commit(); + Table unpartitioned = + TestTables.create( + tableDir, "unpartitioned", SCHEMA, PartitionSpec.unpartitioned(), formatVersion); + unpartitioned.newAppend().appendFile(FILE_A).commit(); TableMetadata replaceMetadata = TestTables.readMetadata("unpartitioned"); long replaceBaseId = replaceMetadata.currentSnapshot().snapshotId(); // Concurrent Delete and ReplacePartitions should fail with ValidationException - unpartitioned.newRowDelta() - .addDeletes(FILE_UNPARTITIONED_A_DELETES) - .commit(); + unpartitioned.newRowDelta().addDeletes(FILE_UNPARTITIONED_A_DELETES).commit(); - AssertHelpers.assertThrows("Should reject commit with file matching partitions replaced", + AssertHelpers.assertThrows( + "Should reject commit with file matching partitions replaced", ValidationException.class, - "Found new conflicting delete files that can apply to records matching true: " + - "[/path/to/data-unpartitioned-a-deletes.parquet]", + "Found new conflicting delete files that can apply to records matching true: " + + "[/path/to/data-unpartitioned-a-deletes.parquet]", () -> - unpartitioned.newReplacePartitions() + unpartitioned + .newReplacePartitions() .validateFromSnapshot(replaceBaseId) .validateNoConflictingData() .validateNoConflictingDeletes() @@ -573,13 +567,12 @@ public void testDeleteReplaceConflictNonPartitioned() { @Test public void testDeleteReplaceNoConflict() { Assume.assumeTrue(formatVersion == 2); - table.newFastAppend() - .appendFile(FILE_A) - .commit(); + table.newFastAppend().appendFile(FILE_A).commit(); long id1 = readMetadata().currentSnapshot().snapshotId(); // Concurrent Delta and ReplacePartition should not conflict if concerning different partitions - table.newRowDelta() + table + .newRowDelta() .addDeletes(FILE_A_DELETES) .validateFromSnapshot(id1) .validateNoConflictingDataFiles() @@ -588,7 +581,8 @@ public void testDeleteReplaceNoConflict() { .commit(); long id2 = readMetadata().currentSnapshot().snapshotId(); - table.newReplacePartitions() + table + .newReplacePartitions() .validateNoConflictingData() .validateNoConflictingDeletes() .validateFromSnapshot(id1) @@ -596,17 +590,20 @@ public void testDeleteReplaceNoConflict() { .commit(); long id3 = readMetadata().currentSnapshot().snapshotId(); - Assert.assertEquals("Table should have 3 manifest", - 3, 
table.currentSnapshot().allManifests(table.io()).size()); - validateManifestEntries(table.currentSnapshot().allManifests(table.io()).get(0), + Assert.assertEquals( + "Table should have 3 manifest", 3, table.currentSnapshot().allManifests(table.io()).size()); + validateManifestEntries( + table.currentSnapshot().allManifests(table.io()).get(0), ids(id3), files(FILE_B), statuses(Status.ADDED)); - validateManifestEntries(table.currentSnapshot().allManifests(table.io()).get(1), + validateManifestEntries( + table.currentSnapshot().allManifests(table.io()).get(1), ids(id1), files(FILE_A), statuses(Status.ADDED)); - validateDeleteManifest(table.currentSnapshot().allManifests(table.io()).get(2), + validateDeleteManifest( + table.currentSnapshot().allManifests(table.io()).get(2), seqs(2), ids(id2), files(FILE_A_DELETES), @@ -616,24 +613,22 @@ public void testDeleteReplaceNoConflict() { @Test public void testOverwriteReplaceConflict() { Assume.assumeTrue(formatVersion == 2); - table.newFastAppend() - .appendFile(FILE_A) - .commit(); + table.newFastAppend().appendFile(FILE_A).commit(); TableMetadata base = readMetadata(); long baseId = base.currentSnapshot().snapshotId(); // Concurrent Overwrite and ReplacePartition should fail with ValidationException - table.newOverwrite() - .deleteFile(FILE_A) - .commit(); + table.newOverwrite().deleteFile(FILE_A).commit(); - AssertHelpers.assertThrows("Should reject commit with file matching partitions replaced", + AssertHelpers.assertThrows( + "Should reject commit with file matching partitions replaced", ValidationException.class, - "Found conflicting deleted files that can apply to records matching " + - "[data_bucket=0]: [/path/to/data-a.parquet]", + "Found conflicting deleted files that can apply to records matching " + + "[data_bucket=0]: [/path/to/data-a.parquet]", () -> - table.newReplacePartitions() + table + .newReplacePartitions() .validateFromSnapshot(baseId) .validateNoConflictingData() .validateNoConflictingDeletes() @@ -644,20 +639,17 @@ public void testOverwriteReplaceConflict() { @Test public void testOverwriteReplaceNoConflict() { Assume.assumeTrue(formatVersion == 2); - table.newFastAppend() - .appendFile(FILE_A) - .appendFile(FILE_B) - .commit(); + table.newFastAppend().appendFile(FILE_A).appendFile(FILE_B).commit(); TableMetadata base = readMetadata(); long baseId = base.currentSnapshot().snapshotId(); - // Concurrent Overwrite and ReplacePartition should not fail with if concerning different partitions - table.newOverwrite() - .deleteFile(FILE_A) - .commit(); + // Concurrent Overwrite and ReplacePartition should not fail with if concerning different + // partitions + table.newOverwrite().deleteFile(FILE_A).commit(); - table.newReplacePartitions() + table + .newReplacePartitions() .validateNoConflictingData() .validateNoConflictingDeletes() .validateFromSnapshot(baseId) @@ -666,13 +658,15 @@ public void testOverwriteReplaceNoConflict() { long finalId = readMetadata().currentSnapshot().snapshotId(); - Assert.assertEquals("Table should have 2 manifest", - 2, table.currentSnapshot().allManifests(table.io()).size()); - validateManifestEntries(table.currentSnapshot().allManifests(table.io()).get(0), + Assert.assertEquals( + "Table should have 2 manifest", 2, table.currentSnapshot().allManifests(table.io()).size()); + validateManifestEntries( + table.currentSnapshot().allManifests(table.io()).get(0), ids(finalId), files(FILE_B), statuses(Status.ADDED)); - validateManifestEntries(table.currentSnapshot().allManifests(table.io()).get(1), + 
validateManifestEntries( + table.currentSnapshot().allManifests(table.io()).get(1), ids(finalId), files(FILE_B), statuses(Status.DELETED)); @@ -682,27 +676,26 @@ public void testOverwriteReplaceNoConflict() { public void testOverwriteReplaceConflictNonPartitioned() { Assume.assumeTrue(formatVersion == 2); - Table unpartitioned = TestTables.create( - tableDir, "unpartitioned", SCHEMA, PartitionSpec.unpartitioned(), formatVersion); + Table unpartitioned = + TestTables.create( + tableDir, "unpartitioned", SCHEMA, PartitionSpec.unpartitioned(), formatVersion); - unpartitioned.newAppend() - .appendFile(FILE_UNPARTITIONED_A) - .commit(); + unpartitioned.newAppend().appendFile(FILE_UNPARTITIONED_A).commit(); TableMetadata replaceMetadata = TestTables.readMetadata("unpartitioned"); long replaceBaseId = replaceMetadata.currentSnapshot().snapshotId(); // Concurrent Overwrite and ReplacePartitions should fail with ValidationException - unpartitioned.newOverwrite() - .deleteFile(FILE_UNPARTITIONED_A) - .commit(); + unpartitioned.newOverwrite().deleteFile(FILE_UNPARTITIONED_A).commit(); - AssertHelpers.assertThrows("Should reject commit with file matching partitions replaced", + AssertHelpers.assertThrows( + "Should reject commit with file matching partitions replaced", ValidationException.class, - "Found conflicting deleted files that can contain records matching true: " + - "[/path/to/data-unpartitioned-a.parquet]", + "Found conflicting deleted files that can contain records matching true: " + + "[/path/to/data-unpartitioned-a.parquet]", () -> - unpartitioned.newReplacePartitions() + unpartitioned + .newReplacePartitions() .validateFromSnapshot(replaceBaseId) .validateNoConflictingData() .validateNoConflictingDeletes() @@ -712,34 +705,34 @@ public void testOverwriteReplaceConflictNonPartitioned() { @Test public void testValidateOnlyDeletes() { - table.newAppend() - .appendFile(FILE_A) - .commit(); + table.newAppend().appendFile(FILE_A).commit(); long baseId = readMetadata().currentSnapshot().snapshotId(); // Snapshot Isolation mode: appends do not conflict with replace - table.newAppend() - .appendFile(FILE_B) - .commit(); + table.newAppend().appendFile(FILE_B).commit(); - table.newReplacePartitions() + table + .newReplacePartitions() .validateFromSnapshot(baseId) .validateNoConflictingDeletes() .addFile(FILE_B) .commit(); long finalId = readMetadata().currentSnapshot().snapshotId(); - Assert.assertEquals("Table should have 3 manifest", - 3, table.currentSnapshot().allManifests(table.io()).size()); - validateManifestEntries(table.currentSnapshot().allManifests(table.io()).get(0), + Assert.assertEquals( + "Table should have 3 manifest", 3, table.currentSnapshot().allManifests(table.io()).size()); + validateManifestEntries( + table.currentSnapshot().allManifests(table.io()).get(0), ids(finalId), files(FILE_B), statuses(Status.ADDED)); - validateManifestEntries(table.currentSnapshot().allManifests(table.io()).get(1), + validateManifestEntries( + table.currentSnapshot().allManifests(table.io()).get(1), ids(finalId), files(FILE_B), statuses(Status.DELETED)); - validateManifestEntries(table.currentSnapshot().allManifests(table.io()).get(2), + validateManifestEntries( + table.currentSnapshot().allManifests(table.io()).get(2), ids(baseId), files(FILE_A), statuses(Status.ADDED)); @@ -747,7 +740,6 @@ public void testValidateOnlyDeletes() { @Test public void testEmptyPartitionPathWithUnpartitionedTable() { - DataFiles.builder(PartitionSpec.unpartitioned()) - .withPartitionPath(""); + 
DataFiles.builder(PartitionSpec.unpartitioned()).withPartitionPath(""); } } diff --git a/core/src/test/java/org/apache/iceberg/TestReplaceTransaction.java b/core/src/test/java/org/apache/iceberg/TestReplaceTransaction.java index a27124d5c986..db131134861f 100644 --- a/core/src/test/java/org/apache/iceberg/TestReplaceTransaction.java +++ b/core/src/test/java/org/apache/iceberg/TestReplaceTransaction.java @@ -16,9 +16,13 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg; +import static org.apache.iceberg.NullOrder.NULLS_FIRST; +import static org.apache.iceberg.PartitionSpec.unpartitioned; +import static org.apache.iceberg.SortDirection.ASC; +import static org.apache.iceberg.types.Types.NestedField.required; + import java.io.File; import java.io.IOException; import java.util.Map; @@ -39,16 +43,11 @@ import org.junit.runner.RunWith; import org.junit.runners.Parameterized; -import static org.apache.iceberg.NullOrder.NULLS_FIRST; -import static org.apache.iceberg.PartitionSpec.unpartitioned; -import static org.apache.iceberg.SortDirection.ASC; -import static org.apache.iceberg.types.Types.NestedField.required; - @RunWith(Parameterized.class) public class TestReplaceTransaction extends TableTestBase { @Parameterized.Parameters(name = "formatVersion = {0}") public static Object[] parameters() { - return new Object[] { 1, 2 }; + return new Object[] {1, 2}; } public TestReplaceTransaction(int formatVersion) { @@ -60,62 +59,58 @@ public void testReplaceTransactionWithCustomSortOrder() { Snapshot start = table.currentSnapshot(); Schema schema = table.schema(); - table.newAppend() - .appendFile(FILE_A) - .commit(); + table.newAppend().appendFile(FILE_A).commit(); Assert.assertEquals("Version should be 1", 1L, (long) version()); validateSnapshot(start, table.currentSnapshot(), FILE_A); - SortOrder newSortOrder = SortOrder.builderFor(schema) - .asc("id", NULLS_FIRST) - .build(); + SortOrder newSortOrder = SortOrder.builderFor(schema).asc("id", NULLS_FIRST).build(); Map props = Maps.newHashMap(); - Transaction replace = TestTables.beginReplace(tableDir, "test", schema, unpartitioned(), newSortOrder, props); + Transaction replace = + TestTables.beginReplace(tableDir, "test", schema, unpartitioned(), newSortOrder, props); replace.commitTransaction(); table.refresh(); Assert.assertEquals("Version should be 2", 2L, (long) version()); Assert.assertNull("Table should not have a current snapshot", table.currentSnapshot()); - Assert.assertEquals("Schema should match previous schema", - schema.asStruct(), table.schema().asStruct()); + Assert.assertEquals( + "Schema should match previous schema", schema.asStruct(), table.schema().asStruct()); PartitionSpec v2Expected = PartitionSpec.builderFor(table.schema()).withSpecId(1).build(); - V2Assert.assertEquals("Table should have an unpartitioned spec", - v2Expected, table.spec()); + V2Assert.assertEquals("Table should have an unpartitioned spec", v2Expected, table.spec()); - PartitionSpec v1Expected = PartitionSpec.builderFor(table.schema()) - .alwaysNull("data", "data_bucket") - .withSpecId(1) - .build(); - V1Assert.assertEquals("Table should have a spec with one void field", - v1Expected, table.spec()); + PartitionSpec v1Expected = + PartitionSpec.builderFor(table.schema()) + .alwaysNull("data", "data_bucket") + .withSpecId(1) + .build(); + V1Assert.assertEquals("Table should have a spec with one void field", v1Expected, table.spec()); Assert.assertEquals("Table should have 2 orders", 2, 
table.sortOrders().size()); SortOrder sortOrder = table.sortOrder(); Assert.assertEquals("Order ID must match", 1, sortOrder.orderId()); Assert.assertEquals("Order must have 1 field", 1, sortOrder.fields().size()); Assert.assertEquals("Direction must match ", ASC, sortOrder.fields().get(0).direction()); - Assert.assertEquals("Null order must match ", NULLS_FIRST, sortOrder.fields().get(0).nullOrder()); + Assert.assertEquals( + "Null order must match ", NULLS_FIRST, sortOrder.fields().get(0).nullOrder()); Transform transform = Transforms.identity(Types.IntegerType.get()); Assert.assertEquals("Transform must match", transform, sortOrder.fields().get(0).transform()); } @Test public void testReplaceTransaction() { - Schema newSchema = new Schema( - required(4, "id", Types.IntegerType.get()), - required(5, "data", Types.StringType.get())); + Schema newSchema = + new Schema( + required(4, "id", Types.IntegerType.get()), + required(5, "data", Types.StringType.get())); Snapshot start = table.currentSnapshot(); Schema schema = table.schema(); - table.newAppend() - .appendFile(FILE_A) - .commit(); + table.newAppend().appendFile(FILE_A).commit(); Assert.assertEquals("Version should be 1", 1L, (long) version()); @@ -128,19 +123,18 @@ public void testReplaceTransaction() { Assert.assertEquals("Version should be 2", 2L, (long) version()); Assert.assertNull("Table should not have a current snapshot", table.currentSnapshot()); - Assert.assertEquals("Schema should match previous schema", - schema.asStruct(), table.schema().asStruct()); + Assert.assertEquals( + "Schema should match previous schema", schema.asStruct(), table.schema().asStruct()); PartitionSpec v2Expected = PartitionSpec.builderFor(table.schema()).withSpecId(1).build(); - V2Assert.assertEquals("Table should have an unpartitioned spec", - v2Expected, table.spec()); + V2Assert.assertEquals("Table should have an unpartitioned spec", v2Expected, table.spec()); - PartitionSpec v1Expected = PartitionSpec.builderFor(table.schema()) - .alwaysNull("data", "data_bucket") - .withSpecId(1) - .build(); - V1Assert.assertEquals("Table should have a spec with one void field", - v1Expected, table.spec()); + PartitionSpec v1Expected = + PartitionSpec.builderFor(table.schema()) + .alwaysNull("data", "data_bucket") + .withSpecId(1) + .build(); + V1Assert.assertEquals("Table should have a spec with one void field", v1Expected, table.spec()); Assert.assertEquals("Table should have 1 order", 1, table.sortOrders().size()); Assert.assertEquals("Table order ID should match", 0, table.sortOrder().orderId()); @@ -149,16 +143,14 @@ public void testReplaceTransaction() { @Test public void testReplaceWithIncompatibleSchemaUpdate() { - Assume.assumeTrue("Fails early for v1 tables because partition spec cannot drop a field", formatVersion == 2); + Assume.assumeTrue( + "Fails early for v1 tables because partition spec cannot drop a field", formatVersion == 2); - Schema newSchema = new Schema( - required(4, "obj_id", Types.IntegerType.get())); + Schema newSchema = new Schema(required(4, "obj_id", Types.IntegerType.get())); Snapshot start = table.currentSnapshot(); - table.newAppend() - .appendFile(FILE_A) - .commit(); + table.newAppend().appendFile(FILE_A).commit(); Assert.assertEquals("Version should be 1", 1L, (long) version()); @@ -171,7 +163,8 @@ public void testReplaceWithIncompatibleSchemaUpdate() { Assert.assertEquals("Version should be 2", 2L, (long) version()); Assert.assertNull("Table should not have a current snapshot", table.currentSnapshot()); - 
Assert.assertEquals("Schema should use new schema, not compatible with previous", + Assert.assertEquals( + "Schema should use new schema, not compatible with previous", new Schema(required(3, "obj_id", Types.IntegerType.get())).asStruct(), table.schema().asStruct()); } @@ -183,9 +176,7 @@ public void testReplaceWithNewPartitionSpec() { Snapshot start = table.currentSnapshot(); Schema schema = table.schema(); - table.newAppend() - .appendFile(FILE_A) - .commit(); + table.newAppend().appendFile(FILE_A).commit(); Assert.assertEquals("Version should be 1", 1L, (long) version()); @@ -198,19 +189,20 @@ public void testReplaceWithNewPartitionSpec() { Assert.assertEquals("Version should be 2", 2L, (long) version()); Assert.assertNull("Table should not have a current snapshot", table.currentSnapshot()); - Assert.assertEquals("Schema should use new schema, not compatible with previous", - schema.asStruct(), table.schema().asStruct()); + Assert.assertEquals( + "Schema should use new schema, not compatible with previous", + schema.asStruct(), + table.schema().asStruct()); PartitionSpec v2Expected = PartitionSpec.builderFor(table.schema()).withSpecId(1).build(); - V2Assert.assertEquals("Table should have an unpartitioned spec", - v2Expected, table.spec()); - - PartitionSpec v1Expected = PartitionSpec.builderFor(table.schema()) - .alwaysNull("data", "data_bucket") - .withSpecId(1) - .build(); - V1Assert.assertEquals("Table should have a spec with one void field", - v1Expected, table.spec()); + V2Assert.assertEquals("Table should have an unpartitioned spec", v2Expected, table.spec()); + + PartitionSpec v1Expected = + PartitionSpec.builderFor(table.schema()) + .alwaysNull("data", "data_bucket") + .withSpecId(1) + .build(); + V1Assert.assertEquals("Table should have a spec with one void field", v1Expected, table.spec()); } @Test @@ -218,9 +210,7 @@ public void testReplaceWithNewData() { Snapshot start = table.currentSnapshot(); Schema schema = table.schema(); - table.newAppend() - .appendFile(FILE_A) - .commit(); + table.newAppend().appendFile(FILE_A).commit(); Assert.assertEquals("Version should be 1", 1L, (long) version()); @@ -228,11 +218,7 @@ public void testReplaceWithNewData() { Transaction replace = TestTables.beginReplace(tableDir, "test", table.schema(), table.spec()); - replace.newAppend() - .appendFile(FILE_B) - .appendFile(FILE_C) - .appendFile(FILE_D) - .commit(); + replace.newAppend().appendFile(FILE_B).appendFile(FILE_C).appendFile(FILE_D).commit(); replace.commitTransaction(); @@ -240,8 +226,10 @@ public void testReplaceWithNewData() { Assert.assertEquals("Version should be 2", 2L, (long) version()); Assert.assertNotNull("Table should have a current snapshot", table.currentSnapshot()); - Assert.assertEquals("Schema should use new schema, not compatible with previous", - schema.asStruct(), table.schema().asStruct()); + Assert.assertEquals( + "Schema should use new schema, not compatible with previous", + schema.asStruct(), + table.schema().asStruct()); validateSnapshot(null, table.currentSnapshot(), FILE_B, FILE_C, FILE_D); } @@ -252,13 +240,16 @@ public void testReplaceDetectsUncommittedChangeOnCommit() { Transaction replace = TestTables.beginReplace(tableDir, "test", table.schema(), table.spec()); - replace.newAppend() // not committed + replace + .newAppend() // not committed .appendFile(FILE_B) .appendFile(FILE_C) .appendFile(FILE_D); - AssertHelpers.assertThrows("Should reject commit when last operation has not committed", - IllegalStateException.class, "Cannot commit transaction: last 
operation has not committed", + AssertHelpers.assertThrows( + "Should reject commit when last operation has not committed", + IllegalStateException.class, + "Cannot commit transaction: last operation has not committed", replace::commitTransaction); Assert.assertEquals("Version should be 0", 0L, (long) version()); @@ -270,13 +261,17 @@ public void testReplaceDetectsUncommittedChangeOnTableCommit() { Transaction replace = TestTables.beginReplace(tableDir, "test", table.schema(), table.spec()); - replace.table().newAppend() // not committed + replace + .table() + .newAppend() // not committed .appendFile(FILE_B) .appendFile(FILE_C) .appendFile(FILE_D); - AssertHelpers.assertThrows("Should reject commit when last operation has not committed", - IllegalStateException.class, "Cannot commit transaction: last operation has not committed", + AssertHelpers.assertThrows( + "Should reject commit when last operation has not committed", + IllegalStateException.class, + "Cannot commit transaction: last operation has not committed", replace::commitTransaction); Assert.assertEquals("Version should be 0", 0L, (long) version()); @@ -287,9 +282,7 @@ public void testReplaceTransactionRetry() { Snapshot start = table.currentSnapshot(); Schema schema = table.schema(); - table.newAppend() - .appendFile(FILE_A) - .commit(); + table.newAppend().appendFile(FILE_A).commit(); Assert.assertEquals("Version should be 1", 1L, (long) version()); @@ -297,11 +290,7 @@ public void testReplaceTransactionRetry() { Transaction replace = TestTables.beginReplace(tableDir, "test", table.schema(), table.spec()); - replace.newAppend() - .appendFile(FILE_B) - .appendFile(FILE_C) - .appendFile(FILE_D) - .commit(); + replace.newAppend().appendFile(FILE_B).appendFile(FILE_C).appendFile(FILE_D).commit(); // trigger eventual transaction retry ((TestTables.TestTableOperations) ((BaseTransaction) replace).ops()).failCommits(1); @@ -312,8 +301,10 @@ public void testReplaceTransactionRetry() { Assert.assertEquals("Version should be 2", 2L, (long) version()); Assert.assertNotNull("Table should have a current snapshot", table.currentSnapshot()); - Assert.assertEquals("Schema should use new schema, not compatible with previous", - schema.asStruct(), table.schema().asStruct()); + Assert.assertEquals( + "Schema should use new schema, not compatible with previous", + schema.asStruct(), + table.schema().asStruct()); validateSnapshot(null, table.currentSnapshot(), FILE_B, FILE_C, FILE_D); } @@ -322,9 +313,7 @@ public void testReplaceTransactionRetry() { public void testReplaceTransactionConflict() { Snapshot start = table.currentSnapshot(); - table.newAppend() - .appendFile(FILE_A) - .commit(); + table.newAppend().appendFile(FILE_A).commit(); Assert.assertEquals("Version should be 1", 1L, (long) version()); @@ -333,17 +322,15 @@ public void testReplaceTransactionConflict() { Transaction replace = TestTables.beginReplace(tableDir, "test", table.schema(), table.spec()); - replace.newAppend() - .appendFile(FILE_B) - .appendFile(FILE_C) - .appendFile(FILE_D) - .commit(); + replace.newAppend().appendFile(FILE_B).appendFile(FILE_C).appendFile(FILE_D).commit(); // keep failing to trigger eventual transaction failure ((TestTables.TestTableOperations) ((BaseTransaction) replace).ops()).failCommits(100); - AssertHelpers.assertThrows("Should reject commit when retries are exhausted", - CommitFailedException.class, "Injected failure", + AssertHelpers.assertThrows( + "Should reject commit when retries are exhausted", + CommitFailedException.class, + "Injected 
failure", replace::commitTransaction); Assert.assertEquals("Version should be 1", 1L, (long) version()); @@ -352,7 +339,8 @@ public void testReplaceTransactionConflict() { validateSnapshot(start, table.currentSnapshot(), FILE_A); - Assert.assertEquals("Should clean up replace manifests", manifests, Sets.newHashSet(listManifestFiles())); + Assert.assertEquals( + "Should clean up replace manifests", manifests, Sets.newHashSet(listManifestFiles())); } @Test @@ -363,35 +351,34 @@ public void testReplaceToCreateAndAppend() throws IOException { // this table doesn't exist. Transaction replace = TestTables.beginReplace(tableDir, "test_append", SCHEMA, unpartitioned()); - Assert.assertNull("Starting a create transaction should not commit metadata", + Assert.assertNull( + "Starting a create transaction should not commit metadata", TestTables.readMetadata("test_append")); - Assert.assertNull("Should have no metadata version", - TestTables.metadataVersion("test_append")); + Assert.assertNull("Should have no metadata version", TestTables.metadataVersion("test_append")); - Assert.assertTrue("Should return a transaction table", + Assert.assertTrue( + "Should return a transaction table", replace.table() instanceof BaseTransaction.TransactionTable); - replace.newAppend() - .appendFile(FILE_A) - .appendFile(FILE_B) - .commit(); + replace.newAppend().appendFile(FILE_A).appendFile(FILE_B).commit(); - Assert.assertNull("Appending in a transaction should not commit metadata", + Assert.assertNull( + "Appending in a transaction should not commit metadata", TestTables.readMetadata("test_append")); - Assert.assertNull("Should have no metadata version", - TestTables.metadataVersion("test_append")); + Assert.assertNull("Should have no metadata version", TestTables.metadataVersion("test_append")); replace.commitTransaction(); TableMetadata meta = TestTables.readMetadata("test_append"); Assert.assertNotNull("Table metadata should be created after transaction commits", meta); - Assert.assertEquals("Should have metadata version 0", - 0, (int) TestTables.metadataVersion("test_append")); - Assert.assertEquals("Should have 1 manifest file", - 1, listManifestFiles(tableDir).size()); + Assert.assertEquals( + "Should have metadata version 0", 0, (int) TestTables.metadataVersion("test_append")); + Assert.assertEquals("Should have 1 manifest file", 1, listManifestFiles(tableDir).size()); - Assert.assertEquals("Table schema should match with reassigned IDs", - assignFreshIds(SCHEMA).asStruct(), meta.schema().asStruct()); + Assert.assertEquals( + "Table schema should match with reassigned IDs", + assignFreshIds(SCHEMA).asStruct(), + meta.schema().asStruct()); Assert.assertEquals("Table spec should match", unpartitioned(), meta.spec()); Assert.assertEquals("Table should have one snapshot", 1, meta.snapshots().size()); @@ -400,37 +387,47 @@ public void testReplaceToCreateAndAppend() throws IOException { @Test public void testReplaceTransactionWithUnknownState() { - Schema newSchema = new Schema( - required(4, "id", Types.IntegerType.get()), - required(5, "data", Types.StringType.get())); + Schema newSchema = + new Schema( + required(4, "id", Types.IntegerType.get()), + required(5, "data", Types.StringType.get())); Snapshot start = table.currentSnapshot(); Schema schema = table.schema(); - table.newAppend() - .appendFile(FILE_A) - .commit(); + table.newAppend().appendFile(FILE_A).commit(); Assert.assertEquals("Version should be 1", 1L, (long) version()); validateSnapshot(start, table.currentSnapshot(), FILE_A); - 
TestTables.TestTableOperations ops = TestTables.opsWithCommitSucceedButStateUnknown(tableDir, "test"); - Transaction replace = TestTables.beginReplace(tableDir, "test", newSchema, unpartitioned(), - SortOrder.unsorted(), ImmutableMap.of(), ops); - - replace.newAppend() - .appendFile(FILE_B) - .commit(); - - AssertHelpers.assertThrows("Transaction commit should fail with CommitStateUnknownException", - CommitStateUnknownException.class, "datacenter on fire", () -> replace.commitTransaction()); + TestTables.TestTableOperations ops = + TestTables.opsWithCommitSucceedButStateUnknown(tableDir, "test"); + Transaction replace = + TestTables.beginReplace( + tableDir, + "test", + newSchema, + unpartitioned(), + SortOrder.unsorted(), + ImmutableMap.of(), + ops); + + replace.newAppend().appendFile(FILE_B).commit(); + + AssertHelpers.assertThrows( + "Transaction commit should fail with CommitStateUnknownException", + CommitStateUnknownException.class, + "datacenter on fire", + () -> replace.commitTransaction()); table.refresh(); Assert.assertEquals("Version should be 2", 2L, (long) version()); Assert.assertNotNull("Table should have a current snapshot", table.currentSnapshot()); - Assert.assertEquals("Schema should use new schema, not compatible with previous", - schema.asStruct(), table.schema().asStruct()); + Assert.assertEquals( + "Schema should use new schema, not compatible with previous", + schema.asStruct(), + table.schema().asStruct()); Assert.assertEquals("Should have 4 files in metadata", 4, countAllMetadataFiles(tableDir)); validateSnapshot(null, table.currentSnapshot(), FILE_B); } @@ -441,35 +438,49 @@ public void testCreateTransactionWithUnknownState() throws IOException { Assert.assertTrue(tableDir.delete()); // this table doesn't exist. - TestTables.TestTableOperations ops = TestTables.opsWithCommitSucceedButStateUnknown(tableDir, "test_append"); - Transaction replace = TestTables.beginReplace(tableDir, "test_append", SCHEMA, unpartitioned(), - SortOrder.unsorted(), ImmutableMap.of(), ops); - - Assert.assertNull("Starting a create transaction should not commit metadata", + TestTables.TestTableOperations ops = + TestTables.opsWithCommitSucceedButStateUnknown(tableDir, "test_append"); + Transaction replace = + TestTables.beginReplace( + tableDir, + "test_append", + SCHEMA, + unpartitioned(), + SortOrder.unsorted(), + ImmutableMap.of(), + ops); + + Assert.assertNull( + "Starting a create transaction should not commit metadata", TestTables.readMetadata("test_append")); Assert.assertNull("Should have no metadata version", TestTables.metadataVersion("test_append")); - Assert.assertTrue("Should return a transaction table", replace.table() instanceof BaseTransaction.TransactionTable); + Assert.assertTrue( + "Should return a transaction table", + replace.table() instanceof BaseTransaction.TransactionTable); - replace.newAppend() - .appendFile(FILE_A) - .appendFile(FILE_B) - .commit(); + replace.newAppend().appendFile(FILE_A).appendFile(FILE_B).commit(); - Assert.assertNull("Appending in a transaction should not commit metadata", - TestTables.readMetadata("test_append")); - Assert.assertNull("Should have no metadata version", - TestTables.metadataVersion("test_append")); + Assert.assertNull( + "Appending in a transaction should not commit metadata", + TestTables.readMetadata("test_append")); + Assert.assertNull("Should have no metadata version", TestTables.metadataVersion("test_append")); - AssertHelpers.assertThrows("Transaction commit should fail with CommitStateUnknownException", - 
CommitStateUnknownException.class, "datacenter on fire", () -> replace.commitTransaction()); + AssertHelpers.assertThrows( + "Transaction commit should fail with CommitStateUnknownException", + CommitStateUnknownException.class, + "datacenter on fire", + () -> replace.commitTransaction()); TableMetadata meta = TestTables.readMetadata("test_append"); Assert.assertNotNull("Table metadata should be created after transaction commits", meta); - Assert.assertEquals("Should have metadata version 0", 0, (int) TestTables.metadataVersion("test_append")); + Assert.assertEquals( + "Should have metadata version 0", 0, (int) TestTables.metadataVersion("test_append")); Assert.assertEquals("Should have 1 manifest file", 1, listManifestFiles(tableDir).size()); Assert.assertEquals("Should have 2 files in metadata", 2, countAllMetadataFiles(tableDir)); - Assert.assertEquals("Table schema should match with reassigned IDs", assignFreshIds(SCHEMA).asStruct(), + Assert.assertEquals( + "Table schema should match with reassigned IDs", + assignFreshIds(SCHEMA).asStruct(), meta.schema().asStruct()); Assert.assertEquals("Table spec should match", unpartitioned(), meta.spec()); Assert.assertEquals("Table should have one snapshot", 1, meta.snapshots().size()); diff --git a/core/src/test/java/org/apache/iceberg/TestRewriteFiles.java b/core/src/test/java/org/apache/iceberg/TestRewriteFiles.java index 4bd52da70e1b..d361fdf63435 100644 --- a/core/src/test/java/org/apache/iceberg/TestRewriteFiles.java +++ b/core/src/test/java/org/apache/iceberg/TestRewriteFiles.java @@ -16,9 +16,12 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg; +import static org.apache.iceberg.ManifestEntry.Status.ADDED; +import static org.apache.iceberg.ManifestEntry.Status.DELETED; +import static org.apache.iceberg.ManifestEntry.Status.EXISTING; + import java.io.File; import java.util.Collections; import java.util.List; @@ -33,10 +36,6 @@ import org.junit.runners.Parameterized; import org.mockito.internal.util.collections.Sets; -import static org.apache.iceberg.ManifestEntry.Status.ADDED; -import static org.apache.iceberg.ManifestEntry.Status.DELETED; -import static org.apache.iceberg.ManifestEntry.Status.EXISTING; - @RunWith(Parameterized.class) public class TestRewriteFiles extends TableTestBase { @Parameterized.Parameters(name = "formatVersion = {0}") @@ -55,110 +54,134 @@ public void testEmptyTable() { TableMetadata base = readMetadata(); Assert.assertNull("Should not have a current snapshot", base.currentSnapshot()); - AssertHelpers.assertThrows("Expected an exception", + AssertHelpers.assertThrows( + "Expected an exception", ValidationException.class, "Missing required files to delete: /path/to/data-a.parquet", - () -> table.newRewrite() - .rewriteFiles(Sets.newSet(FILE_A), Sets.newSet(FILE_B)) - .commit()); + () -> table.newRewrite().rewriteFiles(Sets.newSet(FILE_A), Sets.newSet(FILE_B)).commit()); - AssertHelpers.assertThrows("Expected an exception", + AssertHelpers.assertThrows( + "Expected an exception", ValidationException.class, "Missing required files to delete: /path/to/data-a-deletes.parquet", - () -> table.newRewrite() - .rewriteFiles(ImmutableSet.of(), ImmutableSet.of(FILE_A_DELETES), - ImmutableSet.of(FILE_A), ImmutableSet.of(FILE_B_DELETES)) - .commit()); + () -> + table + .newRewrite() + .rewriteFiles( + ImmutableSet.of(), + ImmutableSet.of(FILE_A_DELETES), + ImmutableSet.of(FILE_A), + ImmutableSet.of(FILE_B_DELETES)) + .commit()); } @Test public void testAddOnly() { 
Assert.assertEquals("Table should start empty", 0, listManifestFiles().size()); - AssertHelpers.assertThrows("Expected an exception", + AssertHelpers.assertThrows( + "Expected an exception", ValidationException.class, "Missing required files to delete: /path/to/data-a.parquet", - () -> table.newRewrite() - .rewriteFiles(Sets.newSet(FILE_A), Collections.emptySet()) - .apply()); + () -> table.newRewrite().rewriteFiles(Sets.newSet(FILE_A), Collections.emptySet()).apply()); - AssertHelpers.assertThrows("Expected an exception", + AssertHelpers.assertThrows( + "Expected an exception", IllegalArgumentException.class, "Delete files to add must be empty because there's no delete file to be rewritten", - () -> table.newRewrite() - .rewriteFiles(ImmutableSet.of(FILE_A), ImmutableSet.of(), - ImmutableSet.of(), ImmutableSet.of(FILE_A_DELETES)) - .apply()); - - AssertHelpers.assertThrows("Expected an exception", + () -> + table + .newRewrite() + .rewriteFiles( + ImmutableSet.of(FILE_A), + ImmutableSet.of(), + ImmutableSet.of(), + ImmutableSet.of(FILE_A_DELETES)) + .apply()); + + AssertHelpers.assertThrows( + "Expected an exception", IllegalArgumentException.class, "Delete files to add must be empty because there's no delete file to be rewritten", - () -> table.newRewrite() - .rewriteFiles(ImmutableSet.of(FILE_A), ImmutableSet.of(), - ImmutableSet.of(FILE_B), ImmutableSet.of(FILE_B_DELETES)) - .apply()); + () -> + table + .newRewrite() + .rewriteFiles( + ImmutableSet.of(FILE_A), + ImmutableSet.of(), + ImmutableSet.of(FILE_B), + ImmutableSet.of(FILE_B_DELETES)) + .apply()); } @Test public void testDeleteOnly() { Assert.assertEquals("Table should start empty", 0, listManifestFiles().size()); - AssertHelpers.assertThrows("Expected an exception", + AssertHelpers.assertThrows( + "Expected an exception", IllegalArgumentException.class, "Files to delete cannot be null or empty", - () -> table.newRewrite() - .rewriteFiles(Collections.emptySet(), Sets.newSet(FILE_A)) - .apply()); + () -> table.newRewrite().rewriteFiles(Collections.emptySet(), Sets.newSet(FILE_A)).apply()); - AssertHelpers.assertThrows("Expected an exception", + AssertHelpers.assertThrows( + "Expected an exception", IllegalArgumentException.class, "Files to delete cannot be null or empty", - () -> table.newRewrite() - .rewriteFiles(ImmutableSet.of(), ImmutableSet.of(), ImmutableSet.of(), ImmutableSet.of(FILE_A_DELETES)) - .apply()); - - AssertHelpers.assertThrows("Expected an exception", + () -> + table + .newRewrite() + .rewriteFiles( + ImmutableSet.of(), + ImmutableSet.of(), + ImmutableSet.of(), + ImmutableSet.of(FILE_A_DELETES)) + .apply()); + + AssertHelpers.assertThrows( + "Expected an exception", IllegalArgumentException.class, "Files to delete cannot be null or empty", - () -> table.newRewrite() - .rewriteFiles(ImmutableSet.of(), ImmutableSet.of(), - ImmutableSet.of(FILE_A), ImmutableSet.of(FILE_A_DELETES)) - .apply()); + () -> + table + .newRewrite() + .rewriteFiles( + ImmutableSet.of(), + ImmutableSet.of(), + ImmutableSet.of(FILE_A), + ImmutableSet.of(FILE_A_DELETES)) + .apply()); } @Test public void testDeleteWithDuplicateEntriesInManifest() { Assert.assertEquals("Table should start empty", 0, listManifestFiles().size()); - table.newAppend() - .appendFile(FILE_A) - .appendFile(FILE_A) - .appendFile(FILE_B) - .commit(); + table.newAppend().appendFile(FILE_A).appendFile(FILE_A).appendFile(FILE_B).commit(); TableMetadata base = readMetadata(); long baseSnapshotId = base.currentSnapshot().snapshotId(); - Assert.assertEquals("Should create 1 
manifest for initial write", - 1, base.currentSnapshot().allManifests(table.io()).size()); + Assert.assertEquals( + "Should create 1 manifest for initial write", + 1, + base.currentSnapshot().allManifests(table.io()).size()); ManifestFile initialManifest = base.currentSnapshot().allManifests(table.io()).get(0); - Snapshot pending = table.newRewrite() - .rewriteFiles(Sets.newSet(FILE_A), Sets.newSet(FILE_C)) - .apply(); + Snapshot pending = + table.newRewrite().rewriteFiles(Sets.newSet(FILE_A), Sets.newSet(FILE_C)).apply(); - Assert.assertEquals("Should contain 2 manifest", - 2, pending.allManifests(table.io()).size()); - Assert.assertFalse("Should not contain manifest from initial write", + Assert.assertEquals("Should contain 2 manifest", 2, pending.allManifests(table.io()).size()); + Assert.assertFalse( + "Should not contain manifest from initial write", pending.allManifests(table.io()).contains(initialManifest)); long pendingId = pending.snapshotId(); - validateManifestEntries(pending.allManifests(table.io()).get(0), - ids(pendingId), - files(FILE_C), - statuses(ADDED)); + validateManifestEntries( + pending.allManifests(table.io()).get(0), ids(pendingId), files(FILE_C), statuses(ADDED)); - validateManifestEntries(pending.allManifests(table.io()).get(1), + validateManifestEntries( + pending.allManifests(table.io()).get(1), ids(pendingId, pendingId, baseSnapshotId), files(FILE_A, FILE_A, FILE_B), statuses(DELETED, DELETED, EXISTING)); @@ -171,34 +194,31 @@ public void testDeleteWithDuplicateEntriesInManifest() { public void testAddAndDelete() { Assert.assertEquals("Table should start empty", 0, listManifestFiles().size()); - table.newAppend() - .appendFile(FILE_A) - .appendFile(FILE_B) - .commit(); + table.newAppend().appendFile(FILE_A).appendFile(FILE_B).commit(); TableMetadata base = readMetadata(); long baseSnapshotId = base.currentSnapshot().snapshotId(); - Assert.assertEquals("Should create 1 manifest for initial write", - 1, base.currentSnapshot().allManifests(table.io()).size()); + Assert.assertEquals( + "Should create 1 manifest for initial write", + 1, + base.currentSnapshot().allManifests(table.io()).size()); ManifestFile initialManifest = base.currentSnapshot().allManifests(table.io()).get(0); - Snapshot pending = table.newRewrite() - .rewriteFiles(Sets.newSet(FILE_A), Sets.newSet(FILE_C)) - .apply(); + Snapshot pending = + table.newRewrite().rewriteFiles(Sets.newSet(FILE_A), Sets.newSet(FILE_C)).apply(); - Assert.assertEquals("Should contain 2 manifest", - 2, pending.allManifests(table.io()).size()); - Assert.assertFalse("Should not contain manifest from initial write", + Assert.assertEquals("Should contain 2 manifest", 2, pending.allManifests(table.io()).size()); + Assert.assertFalse( + "Should not contain manifest from initial write", pending.allManifests(table.io()).contains(initialManifest)); long pendingId = pending.snapshotId(); - validateManifestEntries(pending.allManifests(table.io()).get(0), - ids(pendingId), - files(FILE_C), - statuses(ADDED)); + validateManifestEntries( + pending.allManifests(table.io()).get(0), ids(pendingId), files(FILE_C), statuses(ADDED)); - validateManifestEntries(pending.allManifests(table.io()).get(1), + validateManifestEntries( + pending.allManifests(table.io()).get(1), ids(pendingId, baseSnapshotId), files(FILE_A, FILE_B), statuses(DELETED, EXISTING)); @@ -209,10 +229,12 @@ public void testAddAndDelete() { @Test public void testRewriteDataAndDeleteFiles() { - Assume.assumeTrue("Rewriting delete files is only supported in iceberg format v2. 
", formatVersion > 1); + Assume.assumeTrue( + "Rewriting delete files is only supported in iceberg format v2. ", formatVersion > 1); Assert.assertEquals("Table should start empty", 0, listManifestFiles().size()); - table.newRowDelta() + table + .newRowDelta() .addRows(FILE_A) .addRows(FILE_B) .addRows(FILE_C) @@ -223,42 +245,51 @@ public void testRewriteDataAndDeleteFiles() { TableMetadata base = readMetadata(); Snapshot baseSnap = base.currentSnapshot(); long baseSnapshotId = baseSnap.snapshotId(); - Assert.assertEquals("Should create 2 manifests for initial write", 2, baseSnap.allManifests(table.io()).size()); + Assert.assertEquals( + "Should create 2 manifests for initial write", 2, baseSnap.allManifests(table.io()).size()); List initialManifests = baseSnap.allManifests(table.io()); - validateManifestEntries(initialManifests.get(0), + validateManifestEntries( + initialManifests.get(0), ids(baseSnapshotId, baseSnapshotId, baseSnapshotId), files(FILE_A, FILE_B, FILE_C), statuses(ADDED, ADDED, ADDED)); - validateDeleteManifest(initialManifests.get(1), + validateDeleteManifest( + initialManifests.get(1), seqs(1, 1), ids(baseSnapshotId, baseSnapshotId), files(FILE_A_DELETES, FILE_B_DELETES), statuses(ADDED, ADDED)); // Rewrite the files. - Snapshot pending = table.newRewrite() - .validateFromSnapshot(table.currentSnapshot().snapshotId()) - .rewriteFiles(ImmutableSet.of(FILE_A), ImmutableSet.of(FILE_A_DELETES), - ImmutableSet.of(FILE_D), ImmutableSet.of()) - .apply(); + Snapshot pending = + table + .newRewrite() + .validateFromSnapshot(table.currentSnapshot().snapshotId()) + .rewriteFiles( + ImmutableSet.of(FILE_A), + ImmutableSet.of(FILE_A_DELETES), + ImmutableSet.of(FILE_D), + ImmutableSet.of()) + .apply(); Assert.assertEquals("Should contain 3 manifest", 3, pending.allManifests(table.io()).size()); - Assert.assertFalse("Should not contain manifest from initial write", + Assert.assertFalse( + "Should not contain manifest from initial write", pending.allManifests(table.io()).stream().anyMatch(initialManifests::contains)); long pendingId = pending.snapshotId(); - validateManifestEntries(pending.allManifests(table.io()).get(0), - ids(pendingId), - files(FILE_D), - statuses(ADDED)); + validateManifestEntries( + pending.allManifests(table.io()).get(0), ids(pendingId), files(FILE_D), statuses(ADDED)); - validateManifestEntries(pending.allManifests(table.io()).get(1), + validateManifestEntries( + pending.allManifests(table.io()).get(1), ids(pendingId, baseSnapshotId, baseSnapshotId), files(FILE_A, FILE_B, FILE_C), statuses(DELETED, EXISTING, EXISTING)); - validateDeleteManifest(pending.allManifests(table.io()).get(2), + validateDeleteManifest( + pending.allManifests(table.io()).get(2), seqs(2, 1), ids(pendingId, baseSnapshotId), files(FILE_A_DELETES, FILE_B_DELETES), @@ -270,10 +301,12 @@ public void testRewriteDataAndDeleteFiles() { @Test public void testRewriteDataAndAssignOldSequenceNumber() { - Assume.assumeTrue("Sequence number is only supported in iceberg format v2. ", formatVersion > 1); + Assume.assumeTrue( + "Sequence number is only supported in iceberg format v2. 
", formatVersion > 1); Assert.assertEquals("Table should start empty", 0, listManifestFiles().size()); - table.newRowDelta() + table + .newRowDelta() .addRows(FILE_A) .addRows(FILE_B) .addRows(FILE_C) @@ -284,14 +317,17 @@ public void testRewriteDataAndAssignOldSequenceNumber() { TableMetadata base = readMetadata(); Snapshot baseSnap = base.currentSnapshot(); long baseSnapshotId = baseSnap.snapshotId(); - Assert.assertEquals("Should create 2 manifests for initial write", 2, baseSnap.allManifests(table.io()).size()); + Assert.assertEquals( + "Should create 2 manifests for initial write", 2, baseSnap.allManifests(table.io()).size()); List initialManifests = baseSnap.allManifests(table.io()); - validateManifestEntries(initialManifests.get(0), + validateManifestEntries( + initialManifests.get(0), ids(baseSnapshotId, baseSnapshotId, baseSnapshotId), files(FILE_A, FILE_B, FILE_C), statuses(ADDED, ADDED, ADDED)); - validateDeleteManifest(initialManifests.get(1), + validateDeleteManifest( + initialManifests.get(1), seqs(1, 1), ids(baseSnapshotId, baseSnapshotId), files(FILE_A_DELETES, FILE_B_DELETES), @@ -299,31 +335,40 @@ public void testRewriteDataAndAssignOldSequenceNumber() { // Rewrite the files. long oldSequenceNumber = table.currentSnapshot().sequenceNumber(); - Snapshot pending = table.newRewrite() - .validateFromSnapshot(table.currentSnapshot().snapshotId()) - .rewriteFiles(ImmutableSet.of(FILE_A), ImmutableSet.of(FILE_D), oldSequenceNumber) - .apply(); + Snapshot pending = + table + .newRewrite() + .validateFromSnapshot(table.currentSnapshot().snapshotId()) + .rewriteFiles(ImmutableSet.of(FILE_A), ImmutableSet.of(FILE_D), oldSequenceNumber) + .apply(); Assert.assertEquals("Should contain 3 manifest", 3, pending.allManifests(table.io()).size()); - Assert.assertFalse("Should not contain data manifest from initial write", + Assert.assertFalse( + "Should not contain data manifest from initial write", pending.dataManifests(table.io()).stream().anyMatch(initialManifests::contains)); long pendingId = pending.snapshotId(); ManifestFile newManifest = pending.allManifests(table.io()).get(0); validateManifestEntries(newManifest, ids(pendingId), files(FILE_D), statuses(ADDED)); for (ManifestEntry entry : ManifestFiles.read(newManifest, FILE_IO).entries()) { - Assert.assertEquals("Should have old sequence number for manifest entries", - oldSequenceNumber, (long) entry.sequenceNumber()); + Assert.assertEquals( + "Should have old sequence number for manifest entries", + oldSequenceNumber, + (long) entry.sequenceNumber()); } - Assert.assertEquals("Should use new sequence number for the manifest file", - oldSequenceNumber + 1, newManifest.sequenceNumber()); + Assert.assertEquals( + "Should use new sequence number for the manifest file", + oldSequenceNumber + 1, + newManifest.sequenceNumber()); - validateManifestEntries(pending.allManifests(table.io()).get(1), + validateManifestEntries( + pending.allManifests(table.io()).get(1), ids(pendingId, baseSnapshotId, baseSnapshotId), files(FILE_A, FILE_B, FILE_C), statuses(DELETED, EXISTING, EXISTING)); - validateDeleteManifest(pending.allManifests(table.io()).get(2), + validateDeleteManifest( + pending.allManifests(table.io()).get(2), seqs(1, 1), ids(baseSnapshotId, baseSnapshotId), files(FILE_A_DELETES, FILE_B_DELETES), @@ -335,27 +380,26 @@ public void testRewriteDataAndAssignOldSequenceNumber() { @Test public void testFailure() { - table.newAppend() - .appendFile(FILE_A) - .commit(); + table.newAppend().appendFile(FILE_A).commit(); table.ops().failCommits(5); 
- RewriteFiles rewrite = table.newRewrite() - .rewriteFiles(Sets.newSet(FILE_A), Sets.newSet(FILE_B)); + RewriteFiles rewrite = + table.newRewrite().rewriteFiles(Sets.newSet(FILE_A), Sets.newSet(FILE_B)); Snapshot pending = rewrite.apply(); Assert.assertEquals("Should produce 2 manifests", 2, pending.allManifests(table.io()).size()); ManifestFile manifest1 = pending.allManifests(table.io()).get(0); ManifestFile manifest2 = pending.allManifests(table.io()).get(1); - validateManifestEntries(manifest1, - ids(pending.snapshotId()), files(FILE_B), statuses(ADDED)); - validateManifestEntries(manifest2, - ids(pending.snapshotId()), files(FILE_A), statuses(DELETED)); + validateManifestEntries(manifest1, ids(pending.snapshotId()), files(FILE_B), statuses(ADDED)); + validateManifestEntries(manifest2, ids(pending.snapshotId()), files(FILE_A), statuses(DELETED)); - AssertHelpers.assertThrows("Should retry 4 times and throw last failure", - CommitFailedException.class, "Injected failure", rewrite::commit); + AssertHelpers.assertThrows( + "Should retry 4 times and throw last failure", + CommitFailedException.class, + "Injected failure", + rewrite::commit); Assert.assertFalse("Should clean up new manifest", new File(manifest1.path()).exists()); Assert.assertFalse("Should clean up new manifest", new File(manifest2.path()).exists()); @@ -366,9 +410,11 @@ public void testFailure() { @Test public void testFailureWhenRewriteBothDataAndDeleteFiles() { - Assume.assumeTrue("Rewriting delete files is only supported in iceberg format v2. ", formatVersion > 1); + Assume.assumeTrue( + "Rewriting delete files is only supported in iceberg format v2. ", formatVersion > 1); - table.newRowDelta() + table + .newRowDelta() .addRows(FILE_A) .addRows(FILE_B) .addRows(FILE_C) @@ -379,10 +425,15 @@ public void testFailureWhenRewriteBothDataAndDeleteFiles() { long baseSnapshotId = readMetadata().currentSnapshot().snapshotId(); table.ops().failCommits(5); - RewriteFiles rewrite = table.newRewrite() - .validateFromSnapshot(table.currentSnapshot().snapshotId()) - .rewriteFiles(ImmutableSet.of(FILE_A), ImmutableSet.of(FILE_A_DELETES, FILE_B_DELETES), - ImmutableSet.of(FILE_D), ImmutableSet.of()); + RewriteFiles rewrite = + table + .newRewrite() + .validateFromSnapshot(table.currentSnapshot().snapshotId()) + .rewriteFiles( + ImmutableSet.of(FILE_A), + ImmutableSet.of(FILE_A_DELETES, FILE_B_DELETES), + ImmutableSet.of(FILE_D), + ImmutableSet.of()); Snapshot pending = rewrite.apply(); Assert.assertEquals("Should produce 3 manifests", 3, pending.allManifests(table.io()).size()); @@ -390,24 +441,30 @@ public void testFailureWhenRewriteBothDataAndDeleteFiles() { ManifestFile manifest2 = pending.allManifests(table.io()).get(1); ManifestFile manifest3 = pending.allManifests(table.io()).get(2); - validateManifestEntries(pending.allManifests(table.io()).get(0), + validateManifestEntries( + pending.allManifests(table.io()).get(0), ids(pending.snapshotId()), files(FILE_D), statuses(ADDED)); - validateManifestEntries(pending.allManifests(table.io()).get(1), + validateManifestEntries( + pending.allManifests(table.io()).get(1), ids(pending.snapshotId(), baseSnapshotId, baseSnapshotId), files(FILE_A, FILE_B, FILE_C), statuses(DELETED, EXISTING, EXISTING)); - validateDeleteManifest(pending.allManifests(table.io()).get(2), + validateDeleteManifest( + pending.allManifests(table.io()).get(2), seqs(2, 2), ids(pending.snapshotId(), pending.snapshotId()), files(FILE_A_DELETES, FILE_B_DELETES), statuses(DELETED, DELETED)); - 
AssertHelpers.assertThrows("Should retry 4 times and throw last failure", - CommitFailedException.class, "Injected failure", rewrite::commit); + AssertHelpers.assertThrows( + "Should retry 4 times and throw last failure", + CommitFailedException.class, + "Injected failure", + rewrite::commit); Assert.assertFalse("Should clean up new manifest", new File(manifest1.path()).exists()); Assert.assertFalse("Should clean up new manifest", new File(manifest2.path()).exists()); @@ -419,31 +476,30 @@ public void testFailureWhenRewriteBothDataAndDeleteFiles() { @Test public void testRecovery() { - table.newAppend() - .appendFile(FILE_A) - .commit(); + table.newAppend().appendFile(FILE_A).commit(); table.ops().failCommits(3); - RewriteFiles rewrite = table.newRewrite().rewriteFiles(Sets.newSet(FILE_A), Sets.newSet(FILE_B)); + RewriteFiles rewrite = + table.newRewrite().rewriteFiles(Sets.newSet(FILE_A), Sets.newSet(FILE_B)); Snapshot pending = rewrite.apply(); Assert.assertEquals("Should produce 2 manifests", 2, pending.allManifests(table.io()).size()); ManifestFile manifest1 = pending.allManifests(table.io()).get(0); ManifestFile manifest2 = pending.allManifests(table.io()).get(1); - validateManifestEntries(manifest1, - ids(pending.snapshotId()), files(FILE_B), statuses(ADDED)); - validateManifestEntries(manifest2, - ids(pending.snapshotId()), files(FILE_A), statuses(DELETED)); + validateManifestEntries(manifest1, ids(pending.snapshotId()), files(FILE_B), statuses(ADDED)); + validateManifestEntries(manifest2, ids(pending.snapshotId()), files(FILE_A), statuses(DELETED)); rewrite.commit(); Assert.assertTrue("Should reuse the manifest for appends", new File(manifest1.path()).exists()); - Assert.assertTrue("Should reuse the manifest with deletes", new File(manifest2.path()).exists()); + Assert.assertTrue( + "Should reuse the manifest with deletes", new File(manifest2.path()).exists()); TableMetadata metadata = readMetadata(); - Assert.assertTrue("Should commit the manifest for append", + Assert.assertTrue( + "Should commit the manifest for append", metadata.currentSnapshot().allManifests(table.io()).contains(manifest2)); // 2 manifests added by rewrite and 1 original manifest should be found. @@ -452,9 +508,11 @@ public void testRecovery() { @Test public void testRecoverWhenRewriteBothDataAndDeleteFiles() { - Assume.assumeTrue("Rewriting delete files is only supported in iceberg format v2. ", formatVersion > 1); + Assume.assumeTrue( + "Rewriting delete files is only supported in iceberg format v2. 
", formatVersion > 1); - table.newRowDelta() + table + .newRowDelta() .addRows(FILE_A) .addRows(FILE_B) .addRows(FILE_C) @@ -465,10 +523,15 @@ public void testRecoverWhenRewriteBothDataAndDeleteFiles() { long baseSnapshotId = readMetadata().currentSnapshot().snapshotId(); table.ops().failCommits(3); - RewriteFiles rewrite = table.newRewrite() - .validateFromSnapshot(table.currentSnapshot().snapshotId()) - .rewriteFiles(ImmutableSet.of(FILE_A), ImmutableSet.of(FILE_A_DELETES, FILE_B_DELETES), - ImmutableSet.of(FILE_D), ImmutableSet.of()); + RewriteFiles rewrite = + table + .newRewrite() + .validateFromSnapshot(table.currentSnapshot().snapshotId()) + .rewriteFiles( + ImmutableSet.of(FILE_A), + ImmutableSet.of(FILE_A_DELETES, FILE_B_DELETES), + ImmutableSet.of(FILE_D), + ImmutableSet.of()); Snapshot pending = rewrite.apply(); Assert.assertEquals("Should produce 3 manifests", 3, pending.allManifests(table.io()).size()); @@ -476,17 +539,16 @@ public void testRecoverWhenRewriteBothDataAndDeleteFiles() { ManifestFile manifest2 = pending.allManifests(table.io()).get(1); ManifestFile manifest3 = pending.allManifests(table.io()).get(2); - validateManifestEntries(manifest1, - ids(pending.snapshotId()), - files(FILE_D), - statuses(ADDED)); + validateManifestEntries(manifest1, ids(pending.snapshotId()), files(FILE_D), statuses(ADDED)); - validateManifestEntries(manifest2, + validateManifestEntries( + manifest2, ids(pending.snapshotId(), baseSnapshotId, baseSnapshotId), files(FILE_A, FILE_B, FILE_C), statuses(DELETED, EXISTING, EXISTING)); - validateDeleteManifest(manifest3, + validateDeleteManifest( + manifest3, seqs(2, 2), ids(pending.snapshotId(), pending.snapshotId()), files(FILE_A_DELETES, FILE_B_DELETES), @@ -500,8 +562,10 @@ public void testRecoverWhenRewriteBothDataAndDeleteFiles() { TableMetadata metadata = readMetadata(); List committedManifests = Lists.newArrayList(manifest1, manifest2, manifest3); - Assert.assertEquals("Should committed the manifests", - metadata.currentSnapshot().allManifests(table.io()), committedManifests); + Assert.assertEquals( + "Should committed the manifests", + metadata.currentSnapshot().allManifests(table.io()), + committedManifests); // As commit success all the manifests added with rewrite should be available. Assert.assertEquals("Only 5 manifest should exist", 5, listManifestFiles().size()); @@ -509,21 +573,21 @@ public void testRecoverWhenRewriteBothDataAndDeleteFiles() { @Test public void testReplaceEqualityDeletesWithPositionDeletes() { - Assume.assumeTrue("Rewriting delete files is only supported in iceberg format v2. ", formatVersion > 1); + Assume.assumeTrue( + "Rewriting delete files is only supported in iceberg format v2. ", formatVersion > 1); - table.newRowDelta() - .addRows(FILE_A2) - .addDeletes(FILE_A2_DELETES) - .commit(); + table.newRowDelta().addRows(FILE_A2).addDeletes(FILE_A2_DELETES).commit(); TableMetadata metadata = readMetadata(); long baseSnapshotId = metadata.currentSnapshot().snapshotId(); // Apply and commit the rewrite transaction. 
- RewriteFiles rewrite = table.newRewrite().rewriteFiles( - ImmutableSet.of(), ImmutableSet.of(FILE_A2_DELETES), - ImmutableSet.of(), ImmutableSet.of(FILE_B_DELETES) - ); + RewriteFiles rewrite = + table + .newRewrite() + .rewriteFiles( + ImmutableSet.of(), ImmutableSet.of(FILE_A2_DELETES), + ImmutableSet.of(), ImmutableSet.of(FILE_B_DELETES)); Snapshot pending = rewrite.apply(); Assert.assertEquals("Should produce 3 manifests", 3, pending.allManifests(table.io()).size()); @@ -531,22 +595,13 @@ public void testReplaceEqualityDeletesWithPositionDeletes() { ManifestFile manifest2 = pending.allManifests(table.io()).get(1); ManifestFile manifest3 = pending.allManifests(table.io()).get(2); - validateManifestEntries(manifest1, - ids(baseSnapshotId), - files(FILE_A2), - statuses(ADDED)); + validateManifestEntries(manifest1, ids(baseSnapshotId), files(FILE_A2), statuses(ADDED)); - validateDeleteManifest(manifest2, - seqs(2), - ids(pending.snapshotId()), - files(FILE_B_DELETES), - statuses(ADDED)); + validateDeleteManifest( + manifest2, seqs(2), ids(pending.snapshotId()), files(FILE_B_DELETES), statuses(ADDED)); - validateDeleteManifest(manifest3, - seqs(2), - ids(pending.snapshotId()), - files(FILE_A2_DELETES), - statuses(DELETED)); + validateDeleteManifest( + manifest3, seqs(2), ids(pending.snapshotId()), files(FILE_A2_DELETES), statuses(DELETED)); rewrite.commit(); @@ -556,8 +611,10 @@ public void testReplaceEqualityDeletesWithPositionDeletes() { metadata = readMetadata(); List committedManifests = Lists.newArrayList(manifest1, manifest2, manifest3); - Assert.assertEquals("Should committed the manifests", - metadata.currentSnapshot().allManifests(table.io()), committedManifests); + Assert.assertEquals( + "Should committed the manifests", + metadata.currentSnapshot().allManifests(table.io()), + committedManifests); // As commit success all the manifests added with rewrite should be available. Assert.assertEquals("4 manifests should exist", 4, listManifestFiles().size()); @@ -565,36 +622,29 @@ public void testReplaceEqualityDeletesWithPositionDeletes() { @Test public void testRemoveAllDeletes() { - Assume.assumeTrue("Rewriting delete files is only supported in iceberg format v2. ", formatVersion > 1); + Assume.assumeTrue( + "Rewriting delete files is only supported in iceberg format v2. ", formatVersion > 1); - table.newRowDelta() - .addRows(FILE_A) - .addDeletes(FILE_A_DELETES) - .commit(); + table.newRowDelta().addRows(FILE_A).addDeletes(FILE_A_DELETES).commit(); // Apply and commit the rewrite transaction. 
- RewriteFiles rewrite = table.newRewrite() - .validateFromSnapshot(table.currentSnapshot().snapshotId()) - .rewriteFiles( - ImmutableSet.of(FILE_A), ImmutableSet.of(FILE_A_DELETES), - ImmutableSet.of(), ImmutableSet.of() - ); + RewriteFiles rewrite = + table + .newRewrite() + .validateFromSnapshot(table.currentSnapshot().snapshotId()) + .rewriteFiles( + ImmutableSet.of(FILE_A), ImmutableSet.of(FILE_A_DELETES), + ImmutableSet.of(), ImmutableSet.of()); Snapshot pending = rewrite.apply(); Assert.assertEquals("Should produce 2 manifests", 2, pending.allManifests(table.io()).size()); ManifestFile manifest1 = pending.allManifests(table.io()).get(0); ManifestFile manifest2 = pending.allManifests(table.io()).get(1); - validateManifestEntries(manifest1, - ids(pending.snapshotId()), - files(FILE_A), - statuses(DELETED)); + validateManifestEntries(manifest1, ids(pending.snapshotId()), files(FILE_A), statuses(DELETED)); - validateDeleteManifest(manifest2, - seqs(2), - ids(pending.snapshotId()), - files(FILE_A_DELETES), - statuses(DELETED)); + validateDeleteManifest( + manifest2, seqs(2), ids(pending.snapshotId()), files(FILE_A_DELETES), statuses(DELETED)); rewrite.commit(); @@ -603,7 +653,8 @@ public void testRemoveAllDeletes() { TableMetadata metadata = readMetadata(); List committedManifests = Lists.newArrayList(manifest1, manifest2); - Assert.assertTrue("Should committed the manifests", + Assert.assertTrue( + "Should committed the manifests", metadata.currentSnapshot().allManifests(table.io()).containsAll(committedManifests)); // As commit success all the manifests added with rewrite should be available. @@ -614,21 +665,19 @@ public void testRemoveAllDeletes() { public void testDeleteNonExistentFile() { Assert.assertEquals("Table should start empty", 0, listManifestFiles().size()); - table.newAppend() - .appendFile(FILE_A) - .appendFile(FILE_B) - .commit(); + table.newAppend().appendFile(FILE_A).appendFile(FILE_B).commit(); TableMetadata base = readMetadata(); - Assert.assertEquals("Should create 1 manifest for initial write", - 1, base.currentSnapshot().allManifests(table.io()).size()); + Assert.assertEquals( + "Should create 1 manifest for initial write", + 1, + base.currentSnapshot().allManifests(table.io()).size()); - AssertHelpers.assertThrows("Expected an exception", + AssertHelpers.assertThrows( + "Expected an exception", ValidationException.class, "Missing required files to delete: /path/to/data-c.parquet", - () -> table.newRewrite() - .rewriteFiles(Sets.newSet(FILE_C), Sets.newSet(FILE_D)) - .commit()); + () -> table.newRewrite().rewriteFiles(Sets.newSet(FILE_C), Sets.newSet(FILE_D)).commit()); Assert.assertEquals("Only 1 manifests should exist", 1, listManifestFiles().size()); } @@ -637,42 +686,37 @@ public void testDeleteNonExistentFile() { public void testAlreadyDeletedFile() { Assert.assertEquals("Table should start empty", 0, listManifestFiles().size()); - table.newAppend() - .appendFile(FILE_A) - .commit(); + table.newAppend().appendFile(FILE_A).commit(); TableMetadata base = readMetadata(); - Assert.assertEquals("Should create 1 manifest for initial write", - 1, base.currentSnapshot().allManifests(table.io()).size()); + Assert.assertEquals( + "Should create 1 manifest for initial write", + 1, + base.currentSnapshot().allManifests(table.io()).size()); RewriteFiles rewrite = table.newRewrite(); - Snapshot pending = rewrite - .rewriteFiles(Sets.newSet(FILE_A), Sets.newSet(FILE_B)) - .apply(); + Snapshot pending = rewrite.rewriteFiles(Sets.newSet(FILE_A), Sets.newSet(FILE_B)).apply(); 
- Assert.assertEquals("Should contain 2 manifest", - 2, pending.allManifests(table.io()).size()); + Assert.assertEquals("Should contain 2 manifest", 2, pending.allManifests(table.io()).size()); long pendingId = pending.snapshotId(); - validateManifestEntries(pending.allManifests(table.io()).get(0), - ids(pendingId), - files(FILE_B), - statuses(ADDED)); + validateManifestEntries( + pending.allManifests(table.io()).get(0), ids(pendingId), files(FILE_B), statuses(ADDED)); - validateManifestEntries(pending.allManifests(table.io()).get(1), + validateManifestEntries( + pending.allManifests(table.io()).get(1), ids(pendingId, base.currentSnapshot().snapshotId()), files(FILE_A), statuses(DELETED)); rewrite.commit(); - AssertHelpers.assertThrows("Expected an exception", + AssertHelpers.assertThrows( + "Expected an exception", ValidationException.class, "Missing required files to delete: /path/to/data-a.parquet", - () -> table.newRewrite() - .rewriteFiles(Sets.newSet(FILE_A), Sets.newSet(FILE_D)) - .commit()); + () -> table.newRewrite().rewriteFiles(Sets.newSet(FILE_A), Sets.newSet(FILE_D)).commit()); Assert.assertEquals("Only 3 manifests should exist", 3, listManifestFiles().size()); } @@ -681,27 +725,28 @@ public void testAlreadyDeletedFile() { public void testNewDeleteFile() { Assume.assumeTrue("Delete files are only supported in v2", formatVersion > 1); - table.newAppend() - .appendFile(FILE_A) - .commit(); + table.newAppend().appendFile(FILE_A).commit(); long snapshotBeforeDeletes = table.currentSnapshot().snapshotId(); - table.newRowDelta() - .addDeletes(FILE_A_DELETES) - .commit(); + table.newRowDelta().addDeletes(FILE_A_DELETES).commit(); long snapshotAfterDeletes = table.currentSnapshot().snapshotId(); - AssertHelpers.assertThrows("Should fail because deletes were added after the starting snapshot", - ValidationException.class, "Cannot commit, found new delete for replaced data file", - () -> table.newRewrite() - .validateFromSnapshot(snapshotBeforeDeletes) - .rewriteFiles(Sets.newSet(FILE_A), Sets.newSet(FILE_A2)) - .apply()); + AssertHelpers.assertThrows( + "Should fail because deletes were added after the starting snapshot", + ValidationException.class, + "Cannot commit, found new delete for replaced data file", + () -> + table + .newRewrite() + .validateFromSnapshot(snapshotBeforeDeletes) + .rewriteFiles(Sets.newSet(FILE_A), Sets.newSet(FILE_A2)) + .apply()); // the rewrite should be valid when validating from the snapshot after the deletes - table.newRewrite() + table + .newRewrite() .validateFromSnapshot(snapshotAfterDeletes) .rewriteFiles(Sets.newSet(FILE_A), Sets.newSet(FILE_A2)) .apply(); diff --git a/core/src/test/java/org/apache/iceberg/TestRewriteManifests.java b/core/src/test/java/org/apache/iceberg/TestRewriteManifests.java index ebcd45a24aaf..175c80c4d1e0 100644 --- a/core/src/test/java/org/apache/iceberg/TestRewriteManifests.java +++ b/core/src/test/java/org/apache/iceberg/TestRewriteManifests.java @@ -16,9 +16,13 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg; +import static org.apache.iceberg.TableProperties.MANIFEST_MERGE_ENABLED; +import static org.apache.iceberg.TableProperties.SNAPSHOT_ID_INHERITANCE_ENABLED; +import static org.mockito.Mockito.spy; +import static org.mockito.Mockito.when; + import java.io.File; import java.io.IOException; import java.util.Arrays; @@ -37,16 +41,11 @@ import org.junit.runner.RunWith; import org.junit.runners.Parameterized; -import static org.apache.iceberg.TableProperties.MANIFEST_MERGE_ENABLED; -import static org.apache.iceberg.TableProperties.SNAPSHOT_ID_INHERITANCE_ENABLED; -import static org.mockito.Mockito.spy; -import static org.mockito.Mockito.when; - @RunWith(Parameterized.class) public class TestRewriteManifests extends TableTestBase { @Parameterized.Parameters(name = "formatVersion = {0}") public static Object[] parameters() { - return new Object[] { 1, 2 }; + return new Object[] {1, 2}; } public TestRewriteManifests(int formatVersion) { @@ -59,28 +58,22 @@ public void testRewriteManifestsAppendedDirectly() throws IOException { table.updateProperties().set(SNAPSHOT_ID_INHERITANCE_ENABLED, "true").commit(); - ManifestFile newManifest = writeManifest( - "manifest-file-1.avro", - manifestEntry(ManifestEntry.Status.ADDED, null, FILE_A)); + ManifestFile newManifest = + writeManifest( + "manifest-file-1.avro", manifestEntry(ManifestEntry.Status.ADDED, null, FILE_A)); - table.newFastAppend() - .appendManifest(newManifest) - .commit(); + table.newFastAppend().appendManifest(newManifest).commit(); long appendId = table.currentSnapshot().snapshotId(); Assert.assertEquals(1, table.currentSnapshot().allManifests(table.io()).size()); - table.rewriteManifests() - .clusterBy(file -> "") - .commit(); + table.rewriteManifests().clusterBy(file -> "").commit(); List manifests = table.currentSnapshot().allManifests(table.io()); Assert.assertEquals(1, manifests.size()); - validateManifestEntries(manifests.get(0), - ids(appendId), - files(FILE_A), - statuses(ManifestEntry.Status.EXISTING)); + validateManifestEntries( + manifests.get(0), ids(appendId), files(FILE_A), statuses(ManifestEntry.Status.EXISTING)); } @Test @@ -89,24 +82,27 @@ public void testRewriteManifestsWithScanExecutor() throws IOException { table.updateProperties().set(SNAPSHOT_ID_INHERITANCE_ENABLED, "true").commit(); - ManifestFile newManifest = writeManifest( - "manifest-file-1.avro", - manifestEntry(ManifestEntry.Status.ADDED, null, FILE_A)); + ManifestFile newManifest = + writeManifest( + "manifest-file-1.avro", manifestEntry(ManifestEntry.Status.ADDED, null, FILE_A)); - table.newFastAppend() - .appendManifest(newManifest) - .commit(); + table.newFastAppend().appendManifest(newManifest).commit(); Assert.assertEquals(1, table.currentSnapshot().allManifests(table.io()).size()); AtomicInteger scanThreadsIndex = new AtomicInteger(0); - table.rewriteManifests() + table + .rewriteManifests() .clusterBy(file -> "") - .scanManifestsWith(Executors.newFixedThreadPool(1, runnable -> { - Thread thread = new Thread(runnable); - thread.setName("scan-" + scanThreadsIndex.getAndIncrement()); - thread.setDaemon(true); // daemon threads will be terminated abruptly when the JVM exits - return thread; - })) + .scanManifestsWith( + Executors.newFixedThreadPool( + 1, + runnable -> { + Thread thread = new Thread(runnable); + thread.setName("scan-" + scanThreadsIndex.getAndIncrement()); + thread.setDaemon( + true); // daemon threads will be terminated abruptly when the JVM exits + return thread; + })) .commit(); List manifests = 
table.currentSnapshot().allManifests(table.io()); @@ -120,25 +116,19 @@ public void testRewriteManifestsGeneratedAndAppendedDirectly() throws IOExceptio table.updateProperties().set(SNAPSHOT_ID_INHERITANCE_ENABLED, "true").commit(); - ManifestFile newManifest = writeManifest( - "manifest-file-1.avro", - manifestEntry(ManifestEntry.Status.ADDED, null, FILE_A)); + ManifestFile newManifest = + writeManifest( + "manifest-file-1.avro", manifestEntry(ManifestEntry.Status.ADDED, null, FILE_A)); - table.newFastAppend() - .appendManifest(newManifest) - .commit(); + table.newFastAppend().appendManifest(newManifest).commit(); long manifestAppendId = table.currentSnapshot().snapshotId(); - table.newFastAppend() - .appendFile(FILE_B) - .commit(); + table.newFastAppend().appendFile(FILE_B).commit(); long fileAppendId = table.currentSnapshot().snapshotId(); Assert.assertEquals(2, table.currentSnapshot().allManifests(table.io()).size()); - table.rewriteManifests() - .clusterBy(file -> "") - .commit(); + table.rewriteManifests().clusterBy(file -> "").commit(); List manifests = table.currentSnapshot().allManifests(table.io()); Assert.assertEquals("Manifests must be merged into 1", 1, manifests.size()); @@ -156,63 +146,49 @@ public void testRewriteManifestsGeneratedAndAppendedDirectly() throws IOExceptio } } - validateManifestEntries(manifests.get(0), - ids.iterator(), - files.iterator(), - statuses(ManifestEntry.Status.EXISTING, ManifestEntry.Status.EXISTING)); + validateManifestEntries( + manifests.get(0), + ids.iterator(), + files.iterator(), + statuses(ManifestEntry.Status.EXISTING, ManifestEntry.Status.EXISTING)); } @Test public void testReplaceManifestsSeparate() { Table table = load(); - table.newFastAppend() - .appendFile(FILE_A) - .appendFile(FILE_B) - .commit(); + table.newFastAppend().appendFile(FILE_A).appendFile(FILE_B).commit(); long appendId = table.currentSnapshot().snapshotId(); Assert.assertEquals(1, table.currentSnapshot().allManifests(table.io()).size()); // cluster by path will split the manifest into two - table.rewriteManifests() - .clusterBy(file -> file.path()) - .commit(); + table.rewriteManifests().clusterBy(file -> file.path()).commit(); List manifests = table.currentSnapshot().allManifests(table.io()); Assert.assertEquals(2, manifests.size()); manifests.sort(Comparator.comparing(ManifestFile::path)); - validateManifestEntries(manifests.get(0), - ids(appendId), - files(FILE_A), - statuses(ManifestEntry.Status.EXISTING)); - validateManifestEntries(manifests.get(1), - ids(appendId), - files(FILE_B), - statuses(ManifestEntry.Status.EXISTING)); + validateManifestEntries( + manifests.get(0), ids(appendId), files(FILE_A), statuses(ManifestEntry.Status.EXISTING)); + validateManifestEntries( + manifests.get(1), ids(appendId), files(FILE_B), statuses(ManifestEntry.Status.EXISTING)); } @Test public void testReplaceManifestsConsolidate() throws IOException { Table table = load(); - table.newFastAppend() - .appendFile(FILE_A) - .commit(); + table.newFastAppend().appendFile(FILE_A).commit(); long appendIdA = table.currentSnapshot().snapshotId(); - table.newFastAppend() - .appendFile(FILE_B) - .commit(); + table.newFastAppend().appendFile(FILE_B).commit(); long appendIdB = table.currentSnapshot().snapshotId(); Assert.assertEquals(2, table.currentSnapshot().allManifests(table.io()).size()); // cluster by constant will combine manifests into one - table.rewriteManifests() - .clusterBy(file -> "file") - .commit(); + table.rewriteManifests().clusterBy(file -> "file").commit(); List manifests = 
table.currentSnapshot().allManifests(table.io()); Assert.assertEquals(1, manifests.size()); @@ -230,44 +206,41 @@ public void testReplaceManifestsConsolidate() throws IOException { } } - validateManifestEntries(manifests.get(0), - ids.iterator(), - files.iterator(), - statuses(ManifestEntry.Status.EXISTING, ManifestEntry.Status.EXISTING)); + validateManifestEntries( + manifests.get(0), + ids.iterator(), + files.iterator(), + statuses(ManifestEntry.Status.EXISTING, ManifestEntry.Status.EXISTING)); } @Test public void testReplaceManifestsWithFilter() throws IOException { Table table = load(); - table.newFastAppend() - .appendFile(FILE_A) - .commit(); + table.newFastAppend().appendFile(FILE_A).commit(); long appendIdA = table.currentSnapshot().snapshotId(); - table.newFastAppend() - .appendFile(FILE_B) - .commit(); + table.newFastAppend().appendFile(FILE_B).commit(); long appendIdB = table.currentSnapshot().snapshotId(); - table.newFastAppend() - .appendFile(FILE_C) - .commit(); + table.newFastAppend().appendFile(FILE_C).commit(); long appendIdC = table.currentSnapshot().snapshotId(); Assert.assertEquals(3, table.currentSnapshot().allManifests(table.io()).size()); // keep the file A manifest, combine the other two - table.rewriteManifests() + table + .rewriteManifests() .clusterBy(file -> "file") - .rewriteIf(manifest -> { - try (ManifestReader reader = ManifestFiles.read(manifest, table.io())) { - return !reader.iterator().next().path().equals(FILE_A.path()); - } catch (IOException x) { - throw new RuntimeIOException(x); - } - }) + .rewriteIf( + manifest -> { + try (ManifestReader reader = ManifestFiles.read(manifest, table.io())) { + return !reader.iterator().next().path().equals(FILE_A.path()); + } catch (IOException x) { + throw new RuntimeIOException(x); + } + }) .commit(); List manifests = table.currentSnapshot().allManifests(table.io()); @@ -286,28 +259,25 @@ public void testReplaceManifestsWithFilter() throws IOException { } } - validateManifestEntries(manifests.get(0), - ids.iterator(), - files.iterator(), - statuses(ManifestEntry.Status.EXISTING, ManifestEntry.Status.EXISTING)); - validateManifestEntries(manifests.get(1), - ids(appendIdA), - files(FILE_A), - statuses(ManifestEntry.Status.ADDED)); + validateManifestEntries( + manifests.get(0), + ids.iterator(), + files.iterator(), + statuses(ManifestEntry.Status.EXISTING, ManifestEntry.Status.EXISTING)); + validateManifestEntries( + manifests.get(1), ids(appendIdA), files(FILE_A), statuses(ManifestEntry.Status.ADDED)); } @Test public void testReplaceManifestsMaxSize() { Table table = load(); - table.newFastAppend() - .appendFile(FILE_A) - .appendFile(FILE_B) - .commit(); + table.newFastAppend().appendFile(FILE_A).appendFile(FILE_B).commit(); long appendId = table.currentSnapshot().snapshotId(); Assert.assertEquals(1, table.currentSnapshot().allManifests(table.io()).size()); - // cluster by constant will combine manifests into one but small target size will create one per entry + // cluster by constant will combine manifests into one but small target size will create one per + // entry BaseRewriteManifests rewriteManifests = spy((BaseRewriteManifests) table.rewriteManifests()); when(rewriteManifests.getManifestTargetSizeBytes()).thenReturn(1L); rewriteManifests.clusterBy(file -> "file").commit(); @@ -316,26 +286,18 @@ public void testReplaceManifestsMaxSize() { Assert.assertEquals(2, manifests.size()); manifests.sort(Comparator.comparing(ManifestFile::path)); - validateManifestEntries(manifests.get(0), - ids(appendId), - 
files(FILE_A), - statuses(ManifestEntry.Status.EXISTING)); - validateManifestEntries(manifests.get(1), - ids(appendId), - files(FILE_B), - statuses(ManifestEntry.Status.EXISTING)); + validateManifestEntries( + manifests.get(0), ids(appendId), files(FILE_A), statuses(ManifestEntry.Status.EXISTING)); + validateManifestEntries( + manifests.get(1), ids(appendId), files(FILE_B), statuses(ManifestEntry.Status.EXISTING)); } @Test public void testConcurrentRewriteManifest() throws IOException { Table table = load(); - table.newFastAppend() - .appendFile(FILE_A) - .commit(); + table.newFastAppend().appendFile(FILE_A).commit(); long appendIdA = table.currentSnapshot().snapshotId(); - table.newFastAppend() - .appendFile(FILE_B) - .commit(); + table.newFastAppend().appendFile(FILE_B).commit(); long appendIdB = table.currentSnapshot().snapshotId(); // start a rewrite manifests that involves both manifests @@ -343,15 +305,17 @@ public void testConcurrentRewriteManifest() throws IOException { rewrite.clusterBy(file -> "file").apply(); // commit a rewrite manifests that only involves one manifest - table.rewriteManifests() + table + .rewriteManifests() .clusterBy(file -> "file") - .rewriteIf(manifest -> { - try (ManifestReader reader = ManifestFiles.read(manifest, table.io())) { - return !reader.iterator().next().path().equals(FILE_A.path()); - } catch (IOException x) { - throw new RuntimeIOException(x); - } - }) + .rewriteIf( + manifest -> { + try (ManifestReader reader = ManifestFiles.read(manifest, table.io())) { + return !reader.iterator().next().path().equals(FILE_A.path()); + } catch (IOException x) { + throw new RuntimeIOException(x); + } + }) .commit(); Assert.assertEquals(2, table.currentSnapshot().allManifests(table.io()).size()); @@ -376,18 +340,17 @@ public void testConcurrentRewriteManifest() throws IOException { } } - validateManifestEntries(manifests.get(0), - ids.iterator(), - files.iterator(), - statuses(ManifestEntry.Status.EXISTING, ManifestEntry.Status.EXISTING)); + validateManifestEntries( + manifests.get(0), + ids.iterator(), + files.iterator(), + statuses(ManifestEntry.Status.EXISTING, ManifestEntry.Status.EXISTING)); } @Test public void testAppendDuringRewriteManifest() { Table table = load(); - table.newFastAppend() - .appendFile(FILE_A) - .commit(); + table.newFastAppend().appendFile(FILE_A).commit(); long appendIdA = table.currentSnapshot().snapshotId(); // start the rewrite manifests @@ -395,9 +358,7 @@ public void testAppendDuringRewriteManifest() { rewrite.clusterBy(file -> "file").apply(); // append a file - table.newFastAppend() - .appendFile(FILE_B) - .commit(); + table.newFastAppend().appendFile(FILE_B).commit(); long appendIdB = table.currentSnapshot().snapshotId(); Assert.assertEquals(2, table.currentSnapshot().allManifests(table.io()).size()); @@ -405,28 +366,23 @@ public void testAppendDuringRewriteManifest() { // commit the rewrite manifests in progress rewrite.commit(); - // the rewrite should only affect the first manifest, so we will end up with 2 manifests even though we + // the rewrite should only affect the first manifest, so we will end up with 2 manifests even + // though we // have a single cluster key, rewritten one should be the first in the list List manifests = table.currentSnapshot().allManifests(table.io()); Assert.assertEquals(2, manifests.size()); - validateManifestEntries(manifests.get(0), - ids(appendIdA), - files(FILE_A), - statuses(ManifestEntry.Status.EXISTING)); - validateManifestEntries(manifests.get(1), - ids(appendIdB), - files(FILE_B), - 
statuses(ManifestEntry.Status.ADDED)); + validateManifestEntries( + manifests.get(0), ids(appendIdA), files(FILE_A), statuses(ManifestEntry.Status.EXISTING)); + validateManifestEntries( + manifests.get(1), ids(appendIdB), files(FILE_B), statuses(ManifestEntry.Status.ADDED)); } @Test public void testRewriteManifestDuringAppend() { Table table = load(); - table.newFastAppend() - .appendFile(FILE_A) - .commit(); + table.newFastAppend().appendFile(FILE_A).commit(); long appendIdA = table.currentSnapshot().snapshotId(); // start an append @@ -434,9 +390,7 @@ public void testRewriteManifestDuringAppend() { append.appendFile(FILE_B).apply(); // rewrite the manifests - only affects the first - table.rewriteManifests() - .clusterBy(file -> "file") - .commit(); + table.rewriteManifests().clusterBy(file -> "file").commit(); Assert.assertEquals(1, table.currentSnapshot().allManifests(table.io()).size()); @@ -449,42 +403,34 @@ public void testRewriteManifestDuringAppend() { // last append should be the first in the list - validateManifestEntries(manifests.get(0), - ids(appendIdB), - files(FILE_B), - statuses(ManifestEntry.Status.ADDED)); - validateManifestEntries(manifests.get(1), - ids(appendIdA), - files(FILE_A), - statuses(ManifestEntry.Status.EXISTING)); + validateManifestEntries( + manifests.get(0), ids(appendIdB), files(FILE_B), statuses(ManifestEntry.Status.ADDED)); + validateManifestEntries( + manifests.get(1), ids(appendIdA), files(FILE_A), statuses(ManifestEntry.Status.EXISTING)); } @Test public void testBasicManifestReplacement() throws IOException { Assert.assertNull("Table should be empty", table.currentSnapshot()); - table.newFastAppend() - .appendFile(FILE_A) - .appendFile(FILE_B) - .commit(); + table.newFastAppend().appendFile(FILE_A).appendFile(FILE_B).commit(); Snapshot firstSnapshot = table.currentSnapshot(); List firstSnapshotManifests = firstSnapshot.allManifests(table.io()); Assert.assertEquals(1, firstSnapshotManifests.size()); ManifestFile firstSnapshotManifest = firstSnapshotManifests.get(0); - table.newFastAppend() - .appendFile(FILE_C) - .appendFile(FILE_D) - .commit(); + table.newFastAppend().appendFile(FILE_C).appendFile(FILE_D).commit(); Snapshot secondSnapshot = table.currentSnapshot(); - ManifestFile firstNewManifest = writeManifest( - "manifest-file-1.avro", - manifestEntry(ManifestEntry.Status.EXISTING, firstSnapshot.snapshotId(), FILE_A)); - ManifestFile secondNewManifest = writeManifest( - "manifest-file-2.avro", - manifestEntry(ManifestEntry.Status.EXISTING, firstSnapshot.snapshotId(), FILE_B)); + ManifestFile firstNewManifest = + writeManifest( + "manifest-file-1.avro", + manifestEntry(ManifestEntry.Status.EXISTING, firstSnapshot.snapshotId(), FILE_A)); + ManifestFile secondNewManifest = + writeManifest( + "manifest-file-2.avro", + manifestEntry(ManifestEntry.Status.EXISTING, firstSnapshot.snapshotId(), FILE_B)); RewriteManifests rewriteManifests = table.rewriteManifests(); rewriteManifests.deleteManifest(firstSnapshotManifest); @@ -521,32 +467,26 @@ public void testBasicManifestReplacement() throws IOException { public void testBasicManifestReplacementWithSnapshotIdInheritance() throws IOException { Assert.assertNull("Table should be empty", table.currentSnapshot()); - table.updateProperties() - .set(SNAPSHOT_ID_INHERITANCE_ENABLED, "true") - .commit(); + table.updateProperties().set(SNAPSHOT_ID_INHERITANCE_ENABLED, "true").commit(); - table.newFastAppend() - .appendFile(FILE_A) - .appendFile(FILE_B) - .commit(); + 
table.newFastAppend().appendFile(FILE_A).appendFile(FILE_B).commit(); Snapshot firstSnapshot = table.currentSnapshot(); List firstSnapshotManifests = firstSnapshot.allManifests(table.io()); Assert.assertEquals(1, firstSnapshotManifests.size()); ManifestFile firstSnapshotManifest = firstSnapshotManifests.get(0); - table.newFastAppend() - .appendFile(FILE_C) - .appendFile(FILE_D) - .commit(); + table.newFastAppend().appendFile(FILE_C).appendFile(FILE_D).commit(); Snapshot secondSnapshot = table.currentSnapshot(); - ManifestFile firstNewManifest = writeManifest( - "manifest-file-1.avro", - manifestEntry(ManifestEntry.Status.EXISTING, firstSnapshot.snapshotId(), FILE_A)); - ManifestFile secondNewManifest = writeManifest( - "manifest-file-2.avro", - manifestEntry(ManifestEntry.Status.EXISTING, firstSnapshot.snapshotId(), FILE_B)); + ManifestFile firstNewManifest = + writeManifest( + "manifest-file-1.avro", + manifestEntry(ManifestEntry.Status.EXISTING, firstSnapshot.snapshotId(), FILE_A)); + ManifestFile secondNewManifest = + writeManifest( + "manifest-file-2.avro", + manifestEntry(ManifestEntry.Status.EXISTING, firstSnapshot.snapshotId(), FILE_B)); RewriteManifests rewriteManifests = table.rewriteManifests(); rewriteManifests.deleteManifest(firstSnapshotManifest); @@ -579,60 +519,53 @@ public void testBasicManifestReplacementWithSnapshotIdInheritance() throws IOExc statuses(ManifestEntry.Status.ADDED, ManifestEntry.Status.ADDED)); // validate that any subsequent operation does not fail - table.newDelete() - .deleteFromRowFilter(Expressions.alwaysTrue()) - .commit(); + table.newDelete().deleteFromRowFilter(Expressions.alwaysTrue()).commit(); } @Test public void testWithMultiplePartitionSpec() throws IOException { Assert.assertNull("Table should be empty", table.currentSnapshot()); - table.newAppend() - .appendFile(FILE_A) - .appendFile(FILE_B) - .commit(); + table.newAppend().appendFile(FILE_A).appendFile(FILE_B).commit(); TableMetadata base = readMetadata(); - Assert.assertEquals("Should create 1 manifest for initial write", - 1, base.currentSnapshot().allManifests(table.io()).size()); + Assert.assertEquals( + "Should create 1 manifest for initial write", + 1, + base.currentSnapshot().allManifests(table.io()).size()); ManifestFile initialManifest = base.currentSnapshot().allManifests(table.io()).get(0); int initialPartitionSpecId = initialManifest.partitionSpecId(); // build the new spec using the table's schema, which uses fresh IDs - PartitionSpec newSpec = PartitionSpec.builderFor(base.schema()) - .bucket("data", 16) - .bucket("id", 4) - .build(); + PartitionSpec newSpec = + PartitionSpec.builderFor(base.schema()).bucket("data", 16).bucket("id", 4).build(); // commit the new partition spec to the table manually table.ops().commit(base, base.updatePartitionSpec(newSpec)); - DataFile newFileY = DataFiles.builder(table.spec()) - .withPath("/path/to/data-y.parquet") - .withFileSizeInBytes(10) - .withPartitionPath("data_bucket=2/id_bucket=3") - .withRecordCount(1) - .build(); + DataFile newFileY = + DataFiles.builder(table.spec()) + .withPath("/path/to/data-y.parquet") + .withFileSizeInBytes(10) + .withPartitionPath("data_bucket=2/id_bucket=3") + .withRecordCount(1) + .build(); - table.newAppend() - .appendFile(newFileY) - .commit(); + table.newAppend().appendFile(newFileY).commit(); - DataFile newFileZ = DataFiles.builder(table.spec()) - .withPath("/path/to/data-z.parquet") - .withFileSizeInBytes(10) - .withPartitionPath("data_bucket=2/id_bucket=4") - .withRecordCount(1) - .build(); + DataFile 
newFileZ = + DataFiles.builder(table.spec()) + .withPath("/path/to/data-z.parquet") + .withFileSizeInBytes(10) + .withPartitionPath("data_bucket=2/id_bucket=4") + .withRecordCount(1) + .build(); - table.newAppend() - .appendFile(newFileZ) - .commit(); + table.newAppend().appendFile(newFileZ).commit(); - Assert.assertEquals("Should use 3 manifest files", - 3, table.currentSnapshot().allManifests(table.io()).size()); + Assert.assertEquals( + "Should use 3 manifest files", 3, table.currentSnapshot().allManifests(table.io()).size()); RewriteManifests rewriteManifests = table.rewriteManifests(); // try to cluster in 1 manifest file, but because of 2 partition specs @@ -640,74 +573,77 @@ public void testWithMultiplePartitionSpec() throws IOException { rewriteManifests.clusterBy(dataFile -> "file").commit(); List manifestFiles = table.currentSnapshot().allManifests(table.io()); - Assert.assertEquals("Rewrite manifest should produce 2 manifest files", - 2, manifestFiles.size()); + Assert.assertEquals( + "Rewrite manifest should produce 2 manifest files", 2, manifestFiles.size()); - Assert.assertEquals("2 manifest files should have different partitionSpecId", - true, manifestFiles.get(0).partitionSpecId() != manifestFiles.get(1).partitionSpecId()); + Assert.assertEquals( + "2 manifest files should have different partitionSpecId", + true, + manifestFiles.get(0).partitionSpecId() != manifestFiles.get(1).partitionSpecId()); matchNumberOfManifestFileWithSpecId(manifestFiles, initialPartitionSpecId, 1); matchNumberOfManifestFileWithSpecId(manifestFiles, table.ops().current().spec().specId(), 1); - Assert.assertEquals("first manifest file should have 2 data files", - Integer.valueOf(2), manifestFiles.get(0).existingFilesCount()); - - Assert.assertEquals("second manifest file should have 2 data files", - Integer.valueOf(2), manifestFiles.get(1).existingFilesCount()); + Assert.assertEquals( + "first manifest file should have 2 data files", + Integer.valueOf(2), + manifestFiles.get(0).existingFilesCount()); + Assert.assertEquals( + "second manifest file should have 2 data files", + Integer.valueOf(2), + manifestFiles.get(1).existingFilesCount()); } @Test public void testManifestSizeWithMultiplePartitionSpec() throws IOException { Assert.assertNull("Table should be empty", table.currentSnapshot()); - table.newAppend() - .appendFile(FILE_A) - .appendFile(FILE_B) - .commit(); + table.newAppend().appendFile(FILE_A).appendFile(FILE_B).commit(); TableMetadata base = readMetadata(); - Assert.assertEquals("Should create 1 manifest for initial write", - 1, base.currentSnapshot().allManifests(table.io()).size()); + Assert.assertEquals( + "Should create 1 manifest for initial write", + 1, + base.currentSnapshot().allManifests(table.io()).size()); ManifestFile initialManifest = base.currentSnapshot().allManifests(table.io()).get(0); int initialPartitionSpecId = initialManifest.partitionSpecId(); // build the new spec using the table's schema, which uses fresh IDs - PartitionSpec newSpec = PartitionSpec.builderFor(base.schema()) - .bucket("data", 16) - .bucket("id", 4) - .build(); + PartitionSpec newSpec = + PartitionSpec.builderFor(base.schema()).bucket("data", 16).bucket("id", 4).build(); // commit the new partition spec to the table manually table.ops().commit(base, base.updatePartitionSpec(newSpec)); - DataFile newFileY = DataFiles.builder(table.spec()) - .withPath("/path/to/data-y.parquet") - .withFileSizeInBytes(10) - .withPartitionPath("data_bucket=2/id_bucket=3") - .withRecordCount(1) - .build(); + DataFile 
newFileY = + DataFiles.builder(table.spec()) + .withPath("/path/to/data-y.parquet") + .withFileSizeInBytes(10) + .withPartitionPath("data_bucket=2/id_bucket=3") + .withRecordCount(1) + .build(); - table.newAppend() - .appendFile(newFileY) - .commit(); + table.newAppend().appendFile(newFileY).commit(); - DataFile newFileZ = DataFiles.builder(table.spec()) - .withPath("/path/to/data-z.parquet") - .withFileSizeInBytes(10) - .withPartitionPath("data_bucket=2/id_bucket=4") - .withRecordCount(1) - .build(); + DataFile newFileZ = + DataFiles.builder(table.spec()) + .withPath("/path/to/data-z.parquet") + .withFileSizeInBytes(10) + .withPartitionPath("data_bucket=2/id_bucket=4") + .withRecordCount(1) + .build(); - table.newAppend() - .appendFile(newFileZ) - .commit(); + table.newAppend().appendFile(newFileZ).commit(); - Assert.assertEquals("Rewrite manifests should produce 3 manifest files", - 3, table.currentSnapshot().allManifests(table.io()).size()); + Assert.assertEquals( + "Rewrite manifests should produce 3 manifest files", + 3, + table.currentSnapshot().allManifests(table.io()).size()); - // cluster by constant will combine manifests into one but small target size will create one per entry + // cluster by constant will combine manifests into one but small target size will create one per + // entry BaseRewriteManifests rewriteManifests = spy((BaseRewriteManifests) table.rewriteManifests()); when(rewriteManifests.getManifestTargetSizeBytes()).thenReturn(1L); @@ -715,56 +651,59 @@ public void testManifestSizeWithMultiplePartitionSpec() throws IOException { rewriteManifests.clusterBy(dataFile -> "file").commit(); List manifestFiles = table.currentSnapshot().allManifests(table.io()); - Assert.assertEquals("Should use 4 manifest files", - 4, manifestFiles.size()); + Assert.assertEquals("Should use 4 manifest files", 4, manifestFiles.size()); matchNumberOfManifestFileWithSpecId(manifestFiles, initialPartitionSpecId, 2); matchNumberOfManifestFileWithSpecId(manifestFiles, table.ops().current().spec().specId(), 2); - Assert.assertEquals("first manifest file should have 1 data files", - Integer.valueOf(1), manifestFiles.get(0).existingFilesCount()); + Assert.assertEquals( + "first manifest file should have 1 data files", + Integer.valueOf(1), + manifestFiles.get(0).existingFilesCount()); - Assert.assertEquals("second manifest file should have 1 data files", - Integer.valueOf(1), manifestFiles.get(1).existingFilesCount()); + Assert.assertEquals( + "second manifest file should have 1 data files", + Integer.valueOf(1), + manifestFiles.get(1).existingFilesCount()); - Assert.assertEquals("third manifest file should have 1 data files", - Integer.valueOf(1), manifestFiles.get(2).existingFilesCount()); + Assert.assertEquals( + "third manifest file should have 1 data files", + Integer.valueOf(1), + manifestFiles.get(2).existingFilesCount()); - Assert.assertEquals("fourth manifest file should have 1 data files", - Integer.valueOf(1), manifestFiles.get(3).existingFilesCount()); + Assert.assertEquals( + "fourth manifest file should have 1 data files", + Integer.valueOf(1), + manifestFiles.get(3).existingFilesCount()); } @Test public void testManifestReplacementConcurrentAppend() throws IOException { Assert.assertNull("Table should be empty", table.currentSnapshot()); - table.newFastAppend() - .appendFile(FILE_A) - .appendFile(FILE_B) - .commit(); + table.newFastAppend().appendFile(FILE_A).appendFile(FILE_B).commit(); Snapshot firstSnapshot = table.currentSnapshot(); List firstSnapshotManifests = 
firstSnapshot.allManifests(table.io()); Assert.assertEquals(1, firstSnapshotManifests.size()); ManifestFile firstSnapshotManifest = firstSnapshotManifests.get(0); - ManifestFile firstNewManifest = writeManifest( - "manifest-file-1.avro", - manifestEntry(ManifestEntry.Status.EXISTING, firstSnapshot.snapshotId(), FILE_A)); - ManifestFile secondNewManifest = writeManifest( - "manifest-file-2.avro", - manifestEntry(ManifestEntry.Status.EXISTING, firstSnapshot.snapshotId(), FILE_B)); + ManifestFile firstNewManifest = + writeManifest( + "manifest-file-1.avro", + manifestEntry(ManifestEntry.Status.EXISTING, firstSnapshot.snapshotId(), FILE_A)); + ManifestFile secondNewManifest = + writeManifest( + "manifest-file-2.avro", + manifestEntry(ManifestEntry.Status.EXISTING, firstSnapshot.snapshotId(), FILE_B)); RewriteManifests rewriteManifests = table.rewriteManifests(); rewriteManifests.deleteManifest(firstSnapshotManifest); rewriteManifests.addManifest(firstNewManifest); rewriteManifests.addManifest(secondNewManifest); - table.newFastAppend() - .appendFile(FILE_C) - .appendFile(FILE_D) - .commit(); + table.newFastAppend().appendFile(FILE_C).appendFile(FILE_D).commit(); Snapshot secondSnapshot = table.currentSnapshot(); Assert.assertEquals(2, table.currentSnapshot().allManifests(table.io()).size()); @@ -800,41 +739,33 @@ public void testManifestReplacementConcurrentAppend() throws IOException { public void testManifestReplacementConcurrentDelete() throws IOException { Assert.assertNull("Table should be empty", table.currentSnapshot()); - table.updateProperties() - .set(MANIFEST_MERGE_ENABLED, "false") - .commit(); + table.updateProperties().set(MANIFEST_MERGE_ENABLED, "false").commit(); - table.newFastAppend() - .appendFile(FILE_A) - .appendFile(FILE_B) - .commit(); + table.newFastAppend().appendFile(FILE_A).appendFile(FILE_B).commit(); Snapshot firstSnapshot = table.currentSnapshot(); List firstSnapshotManifests = firstSnapshot.allManifests(table.io()); Assert.assertEquals(1, firstSnapshotManifests.size()); ManifestFile firstSnapshotManifest = firstSnapshotManifests.get(0); - table.newFastAppend() - .appendFile(FILE_C) - .appendFile(FILE_D) - .commit(); + table.newFastAppend().appendFile(FILE_C).appendFile(FILE_D).commit(); long secondSnapshotId = table.currentSnapshot().snapshotId(); - ManifestFile firstNewManifest = writeManifest( - "manifest-file-1.avro", - manifestEntry(ManifestEntry.Status.EXISTING, firstSnapshot.snapshotId(), FILE_A)); - ManifestFile secondNewManifest = writeManifest( - "manifest-file-2.avro", - manifestEntry(ManifestEntry.Status.EXISTING, firstSnapshot.snapshotId(), FILE_B)); + ManifestFile firstNewManifest = + writeManifest( + "manifest-file-1.avro", + manifestEntry(ManifestEntry.Status.EXISTING, firstSnapshot.snapshotId(), FILE_A)); + ManifestFile secondNewManifest = + writeManifest( + "manifest-file-2.avro", + manifestEntry(ManifestEntry.Status.EXISTING, firstSnapshot.snapshotId(), FILE_B)); RewriteManifests rewriteManifests = table.rewriteManifests(); rewriteManifests.deleteManifest(firstSnapshotManifest); rewriteManifests.addManifest(firstNewManifest); rewriteManifests.addManifest(secondNewManifest); - table.newDelete() - .deleteFile(FILE_C) - .commit(); + table.newDelete().deleteFile(FILE_C).commit(); long thirdSnapshotId = table.currentSnapshot().snapshotId(); rewriteManifests.commit(); @@ -868,34 +799,33 @@ public void testManifestReplacementConcurrentDelete() throws IOException { public void testManifestReplacementConcurrentConflictingDelete() throws IOException { 
Assert.assertNull("Table should be empty", table.currentSnapshot()); - table.newFastAppend() - .appendFile(FILE_A) - .appendFile(FILE_B) - .commit(); + table.newFastAppend().appendFile(FILE_A).appendFile(FILE_B).commit(); Snapshot firstSnapshot = table.currentSnapshot(); List firstSnapshotManifests = firstSnapshot.allManifests(table.io()); Assert.assertEquals(1, firstSnapshotManifests.size()); ManifestFile firstSnapshotManifest = firstSnapshotManifests.get(0); - ManifestFile firstNewManifest = writeManifest( - "manifest-file-1.avro", - manifestEntry(ManifestEntry.Status.EXISTING, firstSnapshot.snapshotId(), FILE_A)); - ManifestFile secondNewManifest = writeManifest( - "manifest-file-2.avro", - manifestEntry(ManifestEntry.Status.EXISTING, firstSnapshot.snapshotId(), FILE_B)); + ManifestFile firstNewManifest = + writeManifest( + "manifest-file-1.avro", + manifestEntry(ManifestEntry.Status.EXISTING, firstSnapshot.snapshotId(), FILE_A)); + ManifestFile secondNewManifest = + writeManifest( + "manifest-file-2.avro", + manifestEntry(ManifestEntry.Status.EXISTING, firstSnapshot.snapshotId(), FILE_B)); RewriteManifests rewriteManifests = table.rewriteManifests(); rewriteManifests.deleteManifest(firstSnapshotManifest); rewriteManifests.addManifest(firstNewManifest); rewriteManifests.addManifest(secondNewManifest); - table.newDelete() - .deleteFile(FILE_A) - .commit(); + table.newDelete().deleteFile(FILE_A).commit(); - AssertHelpers.assertThrows("Should reject commit", - ValidationException.class, "Manifest is missing", + AssertHelpers.assertThrows( + "Should reject commit", + ValidationException.class, + "Manifest is missing", rewriteManifests::commit); } @@ -903,46 +833,41 @@ public void testManifestReplacementConcurrentConflictingDelete() throws IOExcept public void testManifestReplacementCombinedWithRewrite() throws IOException { Assert.assertNull("Table should be empty", table.currentSnapshot()); - table.newFastAppend() - .appendFile(FILE_A) - .commit(); + table.newFastAppend().appendFile(FILE_A).commit(); Snapshot firstSnapshot = table.currentSnapshot(); List firstSnapshotManifests = firstSnapshot.allManifests(table.io()); Assert.assertEquals(1, firstSnapshotManifests.size()); ManifestFile firstSnapshotManifest = firstSnapshotManifests.get(0); - table.newFastAppend() - .appendFile(FILE_B) - .commit(); + table.newFastAppend().appendFile(FILE_B).commit(); Snapshot secondSnapshot = table.currentSnapshot(); - table.newFastAppend() - .appendFile(FILE_C) - .commit(); + table.newFastAppend().appendFile(FILE_C).commit(); - table.newFastAppend() - .appendFile(FILE_D) - .commit(); + table.newFastAppend().appendFile(FILE_D).commit(); Assert.assertEquals(4, Iterables.size(table.snapshots())); - ManifestFile newManifest = writeManifest( - "manifest-file-1.avro", - manifestEntry(ManifestEntry.Status.EXISTING, firstSnapshot.snapshotId(), FILE_A)); + ManifestFile newManifest = + writeManifest( + "manifest-file-1.avro", + manifestEntry(ManifestEntry.Status.EXISTING, firstSnapshot.snapshotId(), FILE_A)); - table.rewriteManifests() + table + .rewriteManifests() .deleteManifest(firstSnapshotManifest) .addManifest(newManifest) .clusterBy(dataFile -> "const-value") - .rewriteIf(manifest -> { - try (ManifestReader reader = ManifestFiles.read(manifest, table.io())) { - return !reader.iterator().next().path().equals(FILE_B.path()); - } catch (IOException x) { - throw new RuntimeIOException(x); - } - }) + .rewriteIf( + manifest -> { + try (ManifestReader reader = ManifestFiles.read(manifest, table.io())) { + return 
!reader.iterator().next().path().equals(FILE_B.path()); + } catch (IOException x) { + throw new RuntimeIOException(x); + } + }) .commit(); Snapshot snapshot = table.currentSnapshot(); @@ -968,46 +893,39 @@ public void testManifestReplacementCombinedWithRewrite() throws IOException { public void testManifestReplacementCombinedWithRewriteConcurrentDelete() throws IOException { Assert.assertNull("Table should be empty", table.currentSnapshot()); - table.updateProperties() - .set(MANIFEST_MERGE_ENABLED, "false") - .commit(); + table.updateProperties().set(MANIFEST_MERGE_ENABLED, "false").commit(); - table.newFastAppend() - .appendFile(FILE_A) - .commit(); + table.newFastAppend().appendFile(FILE_A).commit(); Snapshot firstSnapshot = table.currentSnapshot(); List firstSnapshotManifests = firstSnapshot.allManifests(table.io()); Assert.assertEquals(1, firstSnapshotManifests.size()); ManifestFile firstSnapshotManifest = firstSnapshotManifests.get(0); - table.newFastAppend() - .appendFile(FILE_B) - .commit(); + table.newFastAppend().appendFile(FILE_B).commit(); Snapshot secondSnapshot = table.currentSnapshot(); - table.newFastAppend() - .appendFile(FILE_C) - .commit(); + table.newFastAppend().appendFile(FILE_C).commit(); Assert.assertEquals(3, Iterables.size(table.snapshots())); - ManifestEntry entry = manifestEntry(ManifestEntry.Status.EXISTING, firstSnapshot.snapshotId(), FILE_A); + ManifestEntry entry = + manifestEntry(ManifestEntry.Status.EXISTING, firstSnapshot.snapshotId(), FILE_A); // update the entry's sequence number or else it will be rejected by the writer entry.setSequenceNumber(firstSnapshot.sequenceNumber()); ManifestFile newManifest = writeManifest("manifest-file-1.avro", entry); - RewriteManifests rewriteManifests = table.rewriteManifests() - .deleteManifest(firstSnapshotManifest) - .addManifest(newManifest) - .clusterBy(dataFile -> "const-value"); + RewriteManifests rewriteManifests = + table + .rewriteManifests() + .deleteManifest(firstSnapshotManifest) + .addManifest(newManifest) + .clusterBy(dataFile -> "const-value"); rewriteManifests.apply(); - table.newDelete() - .deleteFile(FILE_C) - .commit(); + table.newDelete().deleteFile(FILE_C).commit(); rewriteManifests.commit(); @@ -1034,78 +952,81 @@ public void testManifestReplacementCombinedWithRewriteConcurrentDelete() throws public void testInvalidUsage() throws IOException { Assert.assertNull("Table should be empty", table.currentSnapshot()); - table.newFastAppend() - .appendFile(FILE_A) - .commit(); + table.newFastAppend().appendFile(FILE_A).commit(); Snapshot snapshot = table.currentSnapshot(); List manifests = snapshot.allManifests(table.io()); Assert.assertEquals(1, manifests.size()); ManifestFile manifest = manifests.get(0); - ManifestEntry appendEntry = manifestEntry(ManifestEntry.Status.ADDED, snapshot.snapshotId(), FILE_A); + ManifestEntry appendEntry = + manifestEntry(ManifestEntry.Status.ADDED, snapshot.snapshotId(), FILE_A); // update the entry's sequence number or else it will be rejected by the writer appendEntry.setSequenceNumber(snapshot.sequenceNumber()); ManifestFile invalidAddedFileManifest = writeManifest("manifest-file-2.avro", appendEntry); - AssertHelpers.assertThrows("Should reject commit", - IllegalArgumentException.class, "Cannot add manifest with added files", - () -> table.rewriteManifests() - .deleteManifest(manifest) - .addManifest(invalidAddedFileManifest) - .commit()); - - ManifestEntry deleteEntry = manifestEntry(ManifestEntry.Status.DELETED, snapshot.snapshotId(), FILE_A); + 
AssertHelpers.assertThrows( + "Should reject commit", + IllegalArgumentException.class, + "Cannot add manifest with added files", + () -> + table + .rewriteManifests() + .deleteManifest(manifest) + .addManifest(invalidAddedFileManifest) + .commit()); + + ManifestEntry deleteEntry = + manifestEntry(ManifestEntry.Status.DELETED, snapshot.snapshotId(), FILE_A); // update the entry's sequence number or else it will be rejected by the writer deleteEntry.setSequenceNumber(snapshot.sequenceNumber()); ManifestFile invalidDeletedFileManifest = writeManifest("manifest-file-3.avro", deleteEntry); - AssertHelpers.assertThrows("Should reject commit", - IllegalArgumentException.class, "Cannot add manifest with deleted files", - () -> table.rewriteManifests() - .deleteManifest(manifest) - .addManifest(invalidDeletedFileManifest) - .commit()); - - AssertHelpers.assertThrows("Should reject commit", - ValidationException.class, "must have the same number of active files", - () -> table.rewriteManifests() - .deleteManifest(manifest) - .commit()); + AssertHelpers.assertThrows( + "Should reject commit", + IllegalArgumentException.class, + "Cannot add manifest with deleted files", + () -> + table + .rewriteManifests() + .deleteManifest(manifest) + .addManifest(invalidDeletedFileManifest) + .commit()); + + AssertHelpers.assertThrows( + "Should reject commit", + ValidationException.class, + "must have the same number of active files", + () -> table.rewriteManifests().deleteManifest(manifest).commit()); } @Test public void testManifestReplacementFailure() throws IOException { Assert.assertNull("Table should be empty", table.currentSnapshot()); - table.newFastAppend() - .appendFile(FILE_A) - .commit(); + table.newFastAppend().appendFile(FILE_A).commit(); Snapshot firstSnapshot = table.currentSnapshot(); List firstSnapshotManifests = firstSnapshot.allManifests(table.io()); Assert.assertEquals(1, firstSnapshotManifests.size()); ManifestFile firstSnapshotManifest = firstSnapshotManifests.get(0); - table.newFastAppend() - .appendFile(FILE_B) - .commit(); + table.newFastAppend().appendFile(FILE_B).commit(); Snapshot secondSnapshot = table.currentSnapshot(); List secondSnapshotManifests = secondSnapshot.allManifests(table.io()); Assert.assertEquals(2, secondSnapshotManifests.size()); ManifestFile secondSnapshotManifest = secondSnapshotManifests.get(0); - ManifestFile newManifest = writeManifest( - "manifest-file.avro", - manifestEntry(ManifestEntry.Status.EXISTING, firstSnapshot.snapshotId(), FILE_A), - manifestEntry(ManifestEntry.Status.EXISTING, secondSnapshot.snapshotId(), FILE_B)); + ManifestFile newManifest = + writeManifest( + "manifest-file.avro", + manifestEntry(ManifestEntry.Status.EXISTING, firstSnapshot.snapshotId(), FILE_A), + manifestEntry(ManifestEntry.Status.EXISTING, secondSnapshot.snapshotId(), FILE_B)); - table.updateProperties() - .set(TableProperties.COMMIT_NUM_RETRIES, "1") - .commit(); + table.updateProperties().set(TableProperties.COMMIT_NUM_RETRIES, "1").commit(); table.ops().failCommits(5); @@ -1114,8 +1035,10 @@ public void testManifestReplacementFailure() throws IOException { rewriteManifests.deleteManifest(secondSnapshotManifest); rewriteManifests.addManifest(newManifest); - AssertHelpers.assertThrows("Should reject commit", - CommitFailedException.class, "Injected failure", + AssertHelpers.assertThrows( + "Should reject commit", + CommitFailedException.class, + "Injected failure", rewriteManifests::commit); Assert.assertTrue("New manifest should not be deleted", new 
File(newManifest.path()).exists()); @@ -1127,32 +1050,27 @@ public void testManifestReplacementFailureWithSnapshotIdInheritance() throws IOE table.updateProperties().set(SNAPSHOT_ID_INHERITANCE_ENABLED, "true").commit(); - table.newFastAppend() - .appendFile(FILE_A) - .commit(); + table.newFastAppend().appendFile(FILE_A).commit(); Snapshot firstSnapshot = table.currentSnapshot(); List firstSnapshotManifests = firstSnapshot.allManifests(table.io()); Assert.assertEquals(1, firstSnapshotManifests.size()); ManifestFile firstSnapshotManifest = firstSnapshotManifests.get(0); - table.newFastAppend() - .appendFile(FILE_B) - .commit(); + table.newFastAppend().appendFile(FILE_B).commit(); Snapshot secondSnapshot = table.currentSnapshot(); List secondSnapshotManifests = secondSnapshot.allManifests(table.io()); Assert.assertEquals(2, secondSnapshotManifests.size()); ManifestFile secondSnapshotManifest = secondSnapshotManifests.get(0); - ManifestFile newManifest = writeManifest( - "manifest-file.avro", - manifestEntry(ManifestEntry.Status.EXISTING, firstSnapshot.snapshotId(), FILE_A), - manifestEntry(ManifestEntry.Status.EXISTING, secondSnapshot.snapshotId(), FILE_B)); + ManifestFile newManifest = + writeManifest( + "manifest-file.avro", + manifestEntry(ManifestEntry.Status.EXISTING, firstSnapshot.snapshotId(), FILE_A), + manifestEntry(ManifestEntry.Status.EXISTING, secondSnapshot.snapshotId(), FILE_B)); - table.updateProperties() - .set(TableProperties.COMMIT_NUM_RETRIES, "1") - .commit(); + table.updateProperties().set(TableProperties.COMMIT_NUM_RETRIES, "1").commit(); table.ops().failCommits(5); @@ -1161,36 +1079,47 @@ public void testManifestReplacementFailureWithSnapshotIdInheritance() throws IOE rewriteManifests.deleteManifest(secondSnapshotManifest); rewriteManifests.addManifest(newManifest); - AssertHelpers.assertThrows("Should reject commit", - CommitFailedException.class, "Injected failure", + AssertHelpers.assertThrows( + "Should reject commit", + CommitFailedException.class, + "Injected failure", rewriteManifests::commit); Assert.assertTrue("New manifest should not be deleted", new File(newManifest.path()).exists()); } - private void validateSummary(Snapshot snapshot, int replaced, int kept, int created, int entryCount) { + private void validateSummary( + Snapshot snapshot, int replaced, int kept, int created, int entryCount) { Map summary = snapshot.summary(); Assert.assertEquals( "Replaced manifest count should match", - replaced, Integer.parseInt(summary.get("manifests-replaced"))); + replaced, + Integer.parseInt(summary.get("manifests-replaced"))); Assert.assertEquals( - "Kept manifest count should match", - kept, Integer.parseInt(summary.get("manifests-kept"))); + "Kept manifest count should match", kept, Integer.parseInt(summary.get("manifests-kept"))); Assert.assertEquals( "Created manifest count should match", - created, Integer.parseInt(summary.get("manifests-created"))); + created, + Integer.parseInt(summary.get("manifests-created"))); Assert.assertEquals( - "Entry count should match", - entryCount, Integer.parseInt(summary.get("entries-processed"))); + "Entry count should match", entryCount, Integer.parseInt(summary.get("entries-processed"))); } - private void matchNumberOfManifestFileWithSpecId(List manifestFiles, int toBeMatchedPartitionSpecId, - int numberOfManifestWithPartitionSpecID) { - long matchedManifestsCounter = manifestFiles.stream() - .filter(m -> m.partitionSpecId() == toBeMatchedPartitionSpecId).count(); + private void matchNumberOfManifestFileWithSpecId( + List 
manifestFiles, + int toBeMatchedPartitionSpecId, + int numberOfManifestWithPartitionSpecID) { + long matchedManifestsCounter = + manifestFiles.stream() + .filter(m -> m.partitionSpecId() == toBeMatchedPartitionSpecId) + .count(); - Assert.assertEquals("manifest list should have " + numberOfManifestWithPartitionSpecID + - " manifests matching this partitionSpecId " + toBeMatchedPartitionSpecId, - numberOfManifestWithPartitionSpecID, matchedManifestsCounter); + Assert.assertEquals( + "manifest list should have " + + numberOfManifestWithPartitionSpecID + + " manifests matching this partitionSpecId " + + toBeMatchedPartitionSpecId, + numberOfManifestWithPartitionSpecID, + matchedManifestsCounter); } } diff --git a/core/src/test/java/org/apache/iceberg/TestRowDelta.java b/core/src/test/java/org/apache/iceberg/TestRowDelta.java index 4e317b18891a..6ebb92eb865a 100644 --- a/core/src/test/java/org/apache/iceberg/TestRowDelta.java +++ b/core/src/test/java/org/apache/iceberg/TestRowDelta.java @@ -16,9 +16,17 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg; +import static org.apache.iceberg.SnapshotSummary.ADDED_DELETE_FILES_PROP; +import static org.apache.iceberg.SnapshotSummary.ADDED_FILES_PROP; +import static org.apache.iceberg.SnapshotSummary.ADDED_POS_DELETES_PROP; +import static org.apache.iceberg.SnapshotSummary.CHANGED_PARTITION_COUNT_PROP; +import static org.apache.iceberg.SnapshotSummary.CHANGED_PARTITION_PREFIX; +import static org.apache.iceberg.SnapshotSummary.TOTAL_DATA_FILES_PROP; +import static org.apache.iceberg.SnapshotSummary.TOTAL_DELETE_FILES_PROP; +import static org.apache.iceberg.SnapshotSummary.TOTAL_POS_DELETES_PROP; + import java.util.List; import java.util.Map; import java.util.Set; @@ -32,19 +40,11 @@ import org.junit.Assert; import org.junit.Test; -import static org.apache.iceberg.SnapshotSummary.ADDED_DELETE_FILES_PROP; -import static org.apache.iceberg.SnapshotSummary.ADDED_FILES_PROP; -import static org.apache.iceberg.SnapshotSummary.ADDED_POS_DELETES_PROP; -import static org.apache.iceberg.SnapshotSummary.CHANGED_PARTITION_COUNT_PROP; -import static org.apache.iceberg.SnapshotSummary.CHANGED_PARTITION_PREFIX; -import static org.apache.iceberg.SnapshotSummary.TOTAL_DATA_FILES_PROP; -import static org.apache.iceberg.SnapshotSummary.TOTAL_DELETE_FILES_PROP; -import static org.apache.iceberg.SnapshotSummary.TOTAL_POS_DELETES_PROP; - public class TestRowDelta extends V2TableTestBase { @Test public void testAddDeleteFile() { - table.newRowDelta() + table + .newRowDelta() .addRows(FILE_A) .addDeletes(FILE_A_DELETES) .addDeletes(FILE_B_DELETES) @@ -52,8 +52,12 @@ public void testAddDeleteFile() { Snapshot snap = table.currentSnapshot(); Assert.assertEquals("Commit should produce sequence number 1", 1, snap.sequenceNumber()); - Assert.assertEquals("Last sequence number should be 1", 1, table.ops().current().lastSequenceNumber()); - Assert.assertEquals("Delta commit should use operation 'overwrite'", DataOperations.OVERWRITE, snap.operation()); + Assert.assertEquals( + "Last sequence number should be 1", 1, table.ops().current().lastSequenceNumber()); + Assert.assertEquals( + "Delta commit should use operation 'overwrite'", + DataOperations.OVERWRITE, + snap.operation()); Assert.assertEquals("Should produce 1 data manifest", 1, snap.dataManifests(table.io()).size()); validateManifest( @@ -63,7 +67,8 @@ public void testAddDeleteFile() { files(FILE_A), statuses(Status.ADDED)); - Assert.assertEquals("Should produce 1 
delete manifest", 1, snap.deleteManifests(table.io()).size()); + Assert.assertEquals( + "Should produce 1 delete manifest", 1, snap.deleteManifests(table.io()).size()); validateDeleteManifest( snap.deleteManifests(table.io()).get(0), seqs(1, 1), @@ -74,51 +79,55 @@ public void testAddDeleteFile() { @Test public void testValidateDataFilesExistDefaults() { - table.newAppend() - .appendFile(FILE_A) - .appendFile(FILE_B) - .commit(); + table.newAppend().appendFile(FILE_A).appendFile(FILE_B).commit(); // test changes to the table back to the snapshot where FILE_A and FILE_B existed long validateFromSnapshotId = table.currentSnapshot().snapshotId(); // overwrite FILE_A - table.newOverwrite() - .deleteFile(FILE_A) - .addFile(FILE_A2) - .commit(); + table.newOverwrite().deleteFile(FILE_A).addFile(FILE_A2).commit(); // delete FILE_B - table.newDelete() - .deleteFile(FILE_B) - .commit(); + table.newDelete().deleteFile(FILE_B).commit(); long deleteSnapshotId = table.currentSnapshot().snapshotId(); - AssertHelpers.assertThrows("Should fail to add FILE_A_DELETES because FILE_A is missing", - ValidationException.class, "Cannot commit, missing data files", - () -> table.newRowDelta() - .addDeletes(FILE_A_DELETES) - .validateFromSnapshot(validateFromSnapshotId) - .validateDataFilesExist(ImmutableList.of(FILE_A.path())) - .commit()); - - Assert.assertEquals("Table state should not be modified by failed RowDelta operation", - deleteSnapshotId, table.currentSnapshot().snapshotId()); - - Assert.assertEquals("Table should not have any delete manifests", - 0, table.currentSnapshot().deleteManifests(table.io()).size()); - - table.newRowDelta() + AssertHelpers.assertThrows( + "Should fail to add FILE_A_DELETES because FILE_A is missing", + ValidationException.class, + "Cannot commit, missing data files", + () -> + table + .newRowDelta() + .addDeletes(FILE_A_DELETES) + .validateFromSnapshot(validateFromSnapshotId) + .validateDataFilesExist(ImmutableList.of(FILE_A.path())) + .commit()); + + Assert.assertEquals( + "Table state should not be modified by failed RowDelta operation", + deleteSnapshotId, + table.currentSnapshot().snapshotId()); + + Assert.assertEquals( + "Table should not have any delete manifests", + 0, + table.currentSnapshot().deleteManifests(table.io()).size()); + + table + .newRowDelta() .addDeletes(FILE_B_DELETES) .validateDataFilesExist(ImmutableList.of(FILE_B.path())) .validateFromSnapshot(validateFromSnapshotId) .commit(); - Assert.assertEquals("Table should have one new delete manifest", - 1, table.currentSnapshot().deleteManifests(table.io()).size()); + Assert.assertEquals( + "Table should have one new delete manifest", + 1, + table.currentSnapshot().deleteManifests(table.io()).size()); ManifestFile deletes = table.currentSnapshot().deleteManifests(table.io()).get(0); - validateDeleteManifest(deletes, + validateDeleteManifest( + deletes, seqs(4), ids(table.currentSnapshot().snapshotId()), files(FILE_B_DELETES), @@ -127,89 +136,91 @@ public void testValidateDataFilesExistDefaults() { @Test public void testValidateDataFilesExistOverwrite() { - table.newAppend() - .appendFile(FILE_A) - .appendFile(FILE_B) - .commit(); + table.newAppend().appendFile(FILE_A).appendFile(FILE_B).commit(); // test changes to the table back to the snapshot where FILE_A and FILE_B existed long validateFromSnapshotId = table.currentSnapshot().snapshotId(); // overwrite FILE_A - table.newOverwrite() - .deleteFile(FILE_A) - .addFile(FILE_A2) - .commit(); + table.newOverwrite().deleteFile(FILE_A).addFile(FILE_A2).commit(); 
long deleteSnapshotId = table.currentSnapshot().snapshotId(); - AssertHelpers.assertThrows("Should fail to add FILE_A_DELETES because FILE_A is missing", - ValidationException.class, "Cannot commit, missing data files", - () -> table.newRowDelta() - .addDeletes(FILE_A_DELETES) - .validateFromSnapshot(validateFromSnapshotId) - .validateDataFilesExist(ImmutableList.of(FILE_A.path())) - .commit()); - - Assert.assertEquals("Table state should not be modified by failed RowDelta operation", - deleteSnapshotId, table.currentSnapshot().snapshotId()); - - Assert.assertEquals("Table should not have any delete manifests", - 0, table.currentSnapshot().deleteManifests(table.io()).size()); + AssertHelpers.assertThrows( + "Should fail to add FILE_A_DELETES because FILE_A is missing", + ValidationException.class, + "Cannot commit, missing data files", + () -> + table + .newRowDelta() + .addDeletes(FILE_A_DELETES) + .validateFromSnapshot(validateFromSnapshotId) + .validateDataFilesExist(ImmutableList.of(FILE_A.path())) + .commit()); + + Assert.assertEquals( + "Table state should not be modified by failed RowDelta operation", + deleteSnapshotId, + table.currentSnapshot().snapshotId()); + + Assert.assertEquals( + "Table should not have any delete manifests", + 0, + table.currentSnapshot().deleteManifests(table.io()).size()); } @Test public void testValidateDataFilesExistReplacePartitions() { - table.newAppend() - .appendFile(FILE_A) - .appendFile(FILE_B) - .commit(); + table.newAppend().appendFile(FILE_A).appendFile(FILE_B).commit(); // test changes to the table back to the snapshot where FILE_A and FILE_B existed long validateFromSnapshotId = table.currentSnapshot().snapshotId(); // overwrite FILE_A's partition - table.newReplacePartitions() - .addFile(FILE_A2) - .commit(); + table.newReplacePartitions().addFile(FILE_A2).commit(); long deleteSnapshotId = table.currentSnapshot().snapshotId(); - AssertHelpers.assertThrows("Should fail to add FILE_A_DELETES because FILE_A is missing", - ValidationException.class, "Cannot commit, missing data files", - () -> table.newRowDelta() - .addDeletes(FILE_A_DELETES) - .validateFromSnapshot(validateFromSnapshotId) - .validateDataFilesExist(ImmutableList.of(FILE_A.path())) - .commit()); - - Assert.assertEquals("Table state should not be modified by failed RowDelta operation", - deleteSnapshotId, table.currentSnapshot().snapshotId()); - - Assert.assertEquals("Table should not have any delete manifests", - 0, table.currentSnapshot().deleteManifests(table.io()).size()); + AssertHelpers.assertThrows( + "Should fail to add FILE_A_DELETES because FILE_A is missing", + ValidationException.class, + "Cannot commit, missing data files", + () -> + table + .newRowDelta() + .addDeletes(FILE_A_DELETES) + .validateFromSnapshot(validateFromSnapshotId) + .validateDataFilesExist(ImmutableList.of(FILE_A.path())) + .commit()); + + Assert.assertEquals( + "Table state should not be modified by failed RowDelta operation", + deleteSnapshotId, + table.currentSnapshot().snapshotId()); + + Assert.assertEquals( + "Table should not have any delete manifests", + 0, + table.currentSnapshot().deleteManifests(table.io()).size()); } @Test public void testValidateDataFilesExistFromSnapshot() { - table.newAppend() - .appendFile(FILE_A) - .appendFile(FILE_B) - .commit(); + table.newAppend().appendFile(FILE_A).appendFile(FILE_B).commit(); long appendSnapshotId = table.currentSnapshot().snapshotId(); // overwrite FILE_A's partition - table.newReplacePartitions() - .addFile(FILE_A2) - .commit(); + 
table.newReplacePartitions().addFile(FILE_A2).commit(); // test changes to the table back to the snapshot where FILE_A was overwritten long validateFromSnapshotId = table.currentSnapshot().snapshotId(); long replaceSnapshotId = table.currentSnapshot().snapshotId(); - // even though FILE_A was deleted, it happened before the "from" snapshot, so the validation allows it - table.newRowDelta() + // even though FILE_A was deleted, it happened before the "from" snapshot, so the validation + // allows it + table + .newRowDelta() .addDeletes(FILE_A_DELETES) .validateFromSnapshot(validateFromSnapshotId) .validateDataFilesExist(ImmutableList.of(FILE_A.path())) @@ -217,7 +228,8 @@ public void testValidateDataFilesExistFromSnapshot() { Snapshot snap = table.currentSnapshot(); Assert.assertEquals("Commit should produce sequence number 2", 3, snap.sequenceNumber()); - Assert.assertEquals("Last sequence number should be 3", 3, table.ops().current().lastSequenceNumber()); + Assert.assertEquals( + "Last sequence number should be 3", 3, table.ops().current().lastSequenceNumber()); Assert.assertEquals("Should have 2 data manifests", 2, snap.dataManifests(table.io()).size()); // manifest with FILE_A2 added @@ -236,7 +248,8 @@ public void testValidateDataFilesExistFromSnapshot() { files(FILE_A, FILE_B), statuses(Status.DELETED, Status.EXISTING)); - Assert.assertEquals("Should have 1 delete manifest", 1, snap.deleteManifests(table.io()).size()); + Assert.assertEquals( + "Should have 1 delete manifest", 1, snap.deleteManifests(table.io()).size()); validateDeleteManifest( snap.deleteManifests(table.io()).get(0), seqs(3), @@ -247,117 +260,125 @@ public void testValidateDataFilesExistFromSnapshot() { @Test public void testValidateDataFilesExistRewrite() { - table.newAppend() - .appendFile(FILE_A) - .appendFile(FILE_B) - .commit(); + table.newAppend().appendFile(FILE_A).appendFile(FILE_B).commit(); // test changes to the table back to the snapshot where FILE_A and FILE_B existed long validateFromSnapshotId = table.currentSnapshot().snapshotId(); // rewrite FILE_A - table.newRewrite() - .rewriteFiles(Sets.newHashSet(FILE_A), Sets.newHashSet(FILE_A2)) - .commit(); + table.newRewrite().rewriteFiles(Sets.newHashSet(FILE_A), Sets.newHashSet(FILE_A2)).commit(); long deleteSnapshotId = table.currentSnapshot().snapshotId(); - AssertHelpers.assertThrows("Should fail to add FILE_A_DELETES because FILE_A is missing", - ValidationException.class, "Cannot commit, missing data files", - () -> table.newRowDelta() - .addDeletes(FILE_A_DELETES) - .validateFromSnapshot(validateFromSnapshotId) - .validateDataFilesExist(ImmutableList.of(FILE_A.path())) - .commit()); - - Assert.assertEquals("Table state should not be modified by failed RowDelta operation", - deleteSnapshotId, table.currentSnapshot().snapshotId()); - - Assert.assertEquals("Table should not have any delete manifests", - 0, table.currentSnapshot().deleteManifests(table.io()).size()); + AssertHelpers.assertThrows( + "Should fail to add FILE_A_DELETES because FILE_A is missing", + ValidationException.class, + "Cannot commit, missing data files", + () -> + table + .newRowDelta() + .addDeletes(FILE_A_DELETES) + .validateFromSnapshot(validateFromSnapshotId) + .validateDataFilesExist(ImmutableList.of(FILE_A.path())) + .commit()); + + Assert.assertEquals( + "Table state should not be modified by failed RowDelta operation", + deleteSnapshotId, + table.currentSnapshot().snapshotId()); + + Assert.assertEquals( + "Table should not have any delete manifests", + 0, + 
table.currentSnapshot().deleteManifests(table.io()).size()); } @Test public void testValidateDataFilesExistValidateDeletes() { - table.newAppend() - .appendFile(FILE_A) - .appendFile(FILE_B) - .commit(); + table.newAppend().appendFile(FILE_A).appendFile(FILE_B).commit(); // test changes to the table back to the snapshot where FILE_A and FILE_B existed long validateFromSnapshotId = table.currentSnapshot().snapshotId(); // delete FILE_A - table.newDelete() - .deleteFile(FILE_A) - .commit(); + table.newDelete().deleteFile(FILE_A).commit(); long deleteSnapshotId = table.currentSnapshot().snapshotId(); - AssertHelpers.assertThrows("Should fail to add FILE_A_DELETES because FILE_A is missing", - ValidationException.class, "Cannot commit, missing data files", - () -> table.newRowDelta() - .addDeletes(FILE_A_DELETES) - .validateDeletedFiles() - .validateFromSnapshot(validateFromSnapshotId) - .validateDataFilesExist(ImmutableList.of(FILE_A.path())) - .commit()); - - Assert.assertEquals("Table state should not be modified by failed RowDelta operation", - deleteSnapshotId, table.currentSnapshot().snapshotId()); - - Assert.assertEquals("Table should not have any delete manifests", - 0, table.currentSnapshot().deleteManifests(table.io()).size()); + AssertHelpers.assertThrows( + "Should fail to add FILE_A_DELETES because FILE_A is missing", + ValidationException.class, + "Cannot commit, missing data files", + () -> + table + .newRowDelta() + .addDeletes(FILE_A_DELETES) + .validateDeletedFiles() + .validateFromSnapshot(validateFromSnapshotId) + .validateDataFilesExist(ImmutableList.of(FILE_A.path())) + .commit()); + + Assert.assertEquals( + "Table state should not be modified by failed RowDelta operation", + deleteSnapshotId, + table.currentSnapshot().snapshotId()); + + Assert.assertEquals( + "Table should not have any delete manifests", + 0, + table.currentSnapshot().deleteManifests(table.io()).size()); } @Test public void testValidateNoConflicts() { - table.newAppend() - .appendFile(FILE_A) - .commit(); + table.newAppend().appendFile(FILE_A).commit(); // test changes to the table back to the snapshot where FILE_A and FILE_B existed long validateFromSnapshotId = table.currentSnapshot().snapshotId(); // delete FILE_A - table.newAppend() - .appendFile(FILE_A2) - .commit(); + table.newAppend().appendFile(FILE_A2).commit(); long appendSnapshotId = table.currentSnapshot().snapshotId(); - AssertHelpers.assertThrows("Should fail to add FILE_A_DELETES because FILE_A2 was added", - ValidationException.class, "Found conflicting files", - () -> table.newRowDelta() - .addDeletes(FILE_A_DELETES) - .validateFromSnapshot(validateFromSnapshotId) - .validateNoConflictingAppends(Expressions.equal("data", "u")) // bucket16("u") -> 0 - .commit()); - - Assert.assertEquals("Table state should not be modified by failed RowDelta operation", - appendSnapshotId, table.currentSnapshot().snapshotId()); - - Assert.assertEquals("Table should not have any delete manifests", - 0, table.currentSnapshot().deleteManifests(table.io()).size()); + AssertHelpers.assertThrows( + "Should fail to add FILE_A_DELETES because FILE_A2 was added", + ValidationException.class, + "Found conflicting files", + () -> + table + .newRowDelta() + .addDeletes(FILE_A_DELETES) + .validateFromSnapshot(validateFromSnapshotId) + .validateNoConflictingAppends(Expressions.equal("data", "u")) // bucket16("u") -> 0 + .commit()); + + Assert.assertEquals( + "Table state should not be modified by failed RowDelta operation", + appendSnapshotId, + 
table.currentSnapshot().snapshotId()); + + Assert.assertEquals( + "Table should not have any delete manifests", + 0, + table.currentSnapshot().deleteManifests(table.io()).size()); } @Test public void testValidateNoConflictsFromSnapshot() { - table.newAppend() - .appendFile(FILE_A) - .commit(); + table.newAppend().appendFile(FILE_A).commit(); long appendSnapshotId = table.currentSnapshot().snapshotId(); // delete FILE_A - table.newAppend() - .appendFile(FILE_A2) - .commit(); + table.newAppend().appendFile(FILE_A2).commit(); - // even though FILE_A2 was added, it happened before the "from" snapshot, so the validation allows it + // even though FILE_A2 was added, it happened before the "from" snapshot, so the validation + // allows it long validateFromSnapshotId = table.currentSnapshot().snapshotId(); - table.newRowDelta() + table + .newRowDelta() .addDeletes(FILE_A_DELETES) .validateDeletedFiles() .validateFromSnapshot(validateFromSnapshotId) @@ -367,7 +388,8 @@ public void testValidateNoConflictsFromSnapshot() { Snapshot snap = table.currentSnapshot(); Assert.assertEquals("Commit should produce sequence number 2", 3, snap.sequenceNumber()); - Assert.assertEquals("Last sequence number should be 3", 3, table.ops().current().lastSequenceNumber()); + Assert.assertEquals( + "Last sequence number should be 3", 3, table.ops().current().lastSequenceNumber()); Assert.assertEquals("Should have 2 data manifests", 2, snap.dataManifests(table.io()).size()); // manifest with FILE_A2 added @@ -386,7 +408,8 @@ public void testValidateNoConflictsFromSnapshot() { files(FILE_A), statuses(Status.ADDED)); - Assert.assertEquals("Should have 1 delete manifest", 1, snap.deleteManifests(table.io()).size()); + Assert.assertEquals( + "Should have 1 delete manifest", 1, snap.deleteManifests(table.io()).size()); validateDeleteManifest( snap.deleteManifests(table.io()).get(0), seqs(3), @@ -397,24 +420,30 @@ public void testValidateNoConflictsFromSnapshot() { @Test public void testOverwriteWithDeleteFile() { - table.newRowDelta() + table + .newRowDelta() .addRows(FILE_A) .addDeletes(FILE_A_DELETES) .addDeletes(FILE_B_DELETES) .commit(); long deltaSnapshotId = table.currentSnapshot().snapshotId(); - Assert.assertEquals("Commit should produce sequence number 1", 1, table.currentSnapshot().sequenceNumber()); - Assert.assertEquals("Last sequence number should be 1", 1, table.ops().current().lastSequenceNumber()); - - // overwriting by a filter will also remove delete files that match because all matching data files are removed. - table.newOverwrite() + Assert.assertEquals( + "Commit should produce sequence number 1", 1, table.currentSnapshot().sequenceNumber()); + Assert.assertEquals( + "Last sequence number should be 1", 1, table.ops().current().lastSequenceNumber()); + + // overwriting by a filter will also remove delete files that match because all matching data + // files are removed. 
+ table + .newOverwrite() .overwriteByRowFilter(Expressions.equal(Expressions.bucket("data", 16), 0)) .commit(); Snapshot snap = table.currentSnapshot(); Assert.assertEquals("Commit should produce sequence number 2", 2, snap.sequenceNumber()); - Assert.assertEquals("Last sequence number should be 2", 2, table.ops().current().lastSequenceNumber()); + Assert.assertEquals( + "Last sequence number should be 2", 2, table.ops().current().lastSequenceNumber()); Assert.assertEquals("Should produce 1 data manifest", 1, snap.dataManifests(table.io()).size()); validateManifest( @@ -424,7 +453,8 @@ public void testOverwriteWithDeleteFile() { files(FILE_A), statuses(Status.DELETED)); - Assert.assertEquals("Should produce 1 delete manifest", 1, snap.deleteManifests(table.io()).size()); + Assert.assertEquals( + "Should produce 1 delete manifest", 1, snap.deleteManifests(table.io()).size()); validateDeleteManifest( snap.deleteManifests(table.io()).get(0), seqs(2, 1), @@ -435,26 +465,30 @@ public void testOverwriteWithDeleteFile() { @Test public void testReplacePartitionsWithDeleteFile() { - table.newRowDelta() + table + .newRowDelta() .addRows(FILE_A) .addDeletes(FILE_A_DELETES) .addDeletes(FILE_B_DELETES) .commit(); long deltaSnapshotId = table.currentSnapshot().snapshotId(); - Assert.assertEquals("Commit should produce sequence number 1", 1, table.currentSnapshot().sequenceNumber()); - Assert.assertEquals("Last sequence number should be 1", 1, table.ops().current().lastSequenceNumber()); + Assert.assertEquals( + "Commit should produce sequence number 1", 1, table.currentSnapshot().sequenceNumber()); + Assert.assertEquals( + "Last sequence number should be 1", 1, table.ops().current().lastSequenceNumber()); - // overwriting the partition will also remove delete files that match because all matching data files are removed. - table.newReplacePartitions() - .addFile(FILE_A2) - .commit(); + // overwriting the partition will also remove delete files that match because all matching data + // files are removed. + table.newReplacePartitions().addFile(FILE_A2).commit(); Snapshot snap = table.currentSnapshot(); Assert.assertEquals("Commit should produce sequence number 2", 2, snap.sequenceNumber()); - Assert.assertEquals("Last sequence number should be 2", 2, table.ops().current().lastSequenceNumber()); + Assert.assertEquals( + "Last sequence number should be 2", 2, table.ops().current().lastSequenceNumber()); - Assert.assertEquals("Should produce 2 data manifests", 2, snap.dataManifests(table.io()).size()); + Assert.assertEquals( + "Should produce 2 data manifests", 2, snap.dataManifests(table.io()).size()); int deleteManifestPos = snap.dataManifests(table.io()).get(0).deletedFilesCount() > 0 ? 
0 : 1; validateManifest( snap.dataManifests(table.io()).get(deleteManifestPos), @@ -470,7 +504,8 @@ public void testReplacePartitionsWithDeleteFile() { files(FILE_A2), statuses(Status.ADDED)); - Assert.assertEquals("Should produce 1 delete manifest", 1, snap.deleteManifests(table.io()).size()); + Assert.assertEquals( + "Should produce 1 delete manifest", 1, snap.deleteManifests(table.io()).size()); validateDeleteManifest( snap.deleteManifests(table.io()).get(0), seqs(2, 1), @@ -481,24 +516,27 @@ public void testReplacePartitionsWithDeleteFile() { @Test public void testDeleteByExpressionWithDeleteFile() { - table.newRowDelta() + table + .newRowDelta() .addRows(FILE_A) .addDeletes(FILE_A_DELETES) .addDeletes(FILE_B_DELETES) .commit(); long deltaSnapshotId = table.currentSnapshot().snapshotId(); - Assert.assertEquals("Commit should produce sequence number 1", 1, table.currentSnapshot().sequenceNumber()); - Assert.assertEquals("Last sequence number should be 1", 1, table.ops().current().lastSequenceNumber()); + Assert.assertEquals( + "Commit should produce sequence number 1", 1, table.currentSnapshot().sequenceNumber()); + Assert.assertEquals( + "Last sequence number should be 1", 1, table.ops().current().lastSequenceNumber()); - // deleting with a filter will also remove delete files that match because all matching data files are removed. - table.newDelete() - .deleteFromRowFilter(Expressions.alwaysTrue()) - .commit(); + // deleting with a filter will also remove delete files that match because all matching data + // files are removed. + table.newDelete().deleteFromRowFilter(Expressions.alwaysTrue()).commit(); Snapshot snap = table.currentSnapshot(); Assert.assertEquals("Commit should produce sequence number 2", 2, snap.sequenceNumber()); - Assert.assertEquals("Last sequence number should be 2", 2, table.ops().current().lastSequenceNumber()); + Assert.assertEquals( + "Last sequence number should be 2", 2, table.ops().current().lastSequenceNumber()); Assert.assertEquals("Should produce 1 data manifest", 1, snap.dataManifests(table.io()).size()); validateManifest( @@ -508,7 +546,8 @@ public void testDeleteByExpressionWithDeleteFile() { files(FILE_A), statuses(Status.DELETED)); - Assert.assertEquals("Should produce 1 delete manifest", 1, snap.deleteManifests(table.io()).size()); + Assert.assertEquals( + "Should produce 1 delete manifest", 1, snap.deleteManifests(table.io()).size()); validateDeleteManifest( snap.deleteManifests(table.io()).get(0), seqs(2, 2), @@ -519,25 +558,24 @@ public void testDeleteByExpressionWithDeleteFile() { @Test public void testDeleteDataFileWithDeleteFile() { - table.newRowDelta() - .addRows(FILE_A) - .addDeletes(FILE_A_DELETES) - .commit(); + table.newRowDelta().addRows(FILE_A).addDeletes(FILE_A_DELETES).commit(); long deltaSnapshotId = table.currentSnapshot().snapshotId(); - Assert.assertEquals("Commit should produce sequence number 1", 1, table.currentSnapshot().sequenceNumber()); - Assert.assertEquals("Last sequence number should be 1", 1, table.ops().current().lastSequenceNumber()); + Assert.assertEquals( + "Commit should produce sequence number 1", 1, table.currentSnapshot().sequenceNumber()); + Assert.assertEquals( + "Last sequence number should be 1", 1, table.ops().current().lastSequenceNumber()); // deleting a specific data file will not affect a delete file - table.newDelete() - .deleteFile(FILE_A) - .commit(); + table.newDelete().deleteFile(FILE_A).commit(); Snapshot deleteSnap = table.currentSnapshot(); Assert.assertEquals("Commit should produce sequence 
number 2", 2, deleteSnap.sequenceNumber()); - Assert.assertEquals("Last sequence number should be 2", 2, table.ops().current().lastSequenceNumber()); + Assert.assertEquals( + "Last sequence number should be 2", 2, table.ops().current().lastSequenceNumber()); - Assert.assertEquals("Should produce 1 data manifest", 1, deleteSnap.dataManifests(table.io()).size()); + Assert.assertEquals( + "Should produce 1 data manifest", 1, deleteSnap.dataManifests(table.io()).size()); validateManifest( deleteSnap.dataManifests(table.io()).get(0), seqs(2), @@ -545,7 +583,8 @@ public void testDeleteDataFileWithDeleteFile() { files(FILE_A), statuses(Status.DELETED)); - Assert.assertEquals("Should produce 1 delete manifest", 1, deleteSnap.deleteManifests(table.io()).size()); + Assert.assertEquals( + "Should produce 1 delete manifest", 1, deleteSnap.deleteManifests(table.io()).size()); validateDeleteManifest( deleteSnap.deleteManifests(table.io()).get(0), seqs(1), @@ -553,19 +592,22 @@ public void testDeleteDataFileWithDeleteFile() { files(FILE_A_DELETES), statuses(Status.ADDED)); - // the manifest that removed FILE_A will be dropped next commit, causing the min sequence number of all data files - // to be 2, the largest known sequence number. this will cause FILE_A_DELETES to be removed because it is too old + // the manifest that removed FILE_A will be dropped next commit, causing the min sequence number + // of all data files + // to be 2, the largest known sequence number. this will cause FILE_A_DELETES to be removed + // because it is too old // to apply to any data files. - table.newDelete() - .deleteFile("no-such-file") - .commit(); + table.newDelete().deleteFile("no-such-file").commit(); Snapshot nextSnap = table.currentSnapshot(); Assert.assertEquals("Append should produce sequence number 3", 3, nextSnap.sequenceNumber()); - Assert.assertEquals("Last sequence number should be 3", 3, table.ops().current().lastSequenceNumber()); + Assert.assertEquals( + "Last sequence number should be 3", 3, table.ops().current().lastSequenceNumber()); - Assert.assertEquals("Should have 0 data manifests", 0, nextSnap.dataManifests(table.io()).size()); - Assert.assertEquals("Should produce 1 delete manifest", 1, nextSnap.deleteManifests(table.io()).size()); + Assert.assertEquals( + "Should have 0 data manifests", 0, nextSnap.dataManifests(table.io()).size()); + Assert.assertEquals( + "Should produce 1 delete manifest", 1, nextSnap.deleteManifests(table.io()).size()); validateDeleteManifest( nextSnap.deleteManifests(table.io()).get(0), seqs(3), @@ -576,25 +618,24 @@ public void testDeleteDataFileWithDeleteFile() { @Test public void testFastAppendDoesNotRemoveStaleDeleteFiles() { - table.newRowDelta() - .addRows(FILE_A) - .addDeletes(FILE_A_DELETES) - .commit(); + table.newRowDelta().addRows(FILE_A).addDeletes(FILE_A_DELETES).commit(); long deltaSnapshotId = table.currentSnapshot().snapshotId(); - Assert.assertEquals("Commit should produce sequence number 1", 1, table.currentSnapshot().sequenceNumber()); - Assert.assertEquals("Last sequence number should be 1", 1, table.ops().current().lastSequenceNumber()); + Assert.assertEquals( + "Commit should produce sequence number 1", 1, table.currentSnapshot().sequenceNumber()); + Assert.assertEquals( + "Last sequence number should be 1", 1, table.ops().current().lastSequenceNumber()); // deleting a specific data file will not affect a delete file - table.newDelete() - .deleteFile(FILE_A) - .commit(); + table.newDelete().deleteFile(FILE_A).commit(); Snapshot deleteSnap = 
table.currentSnapshot(); Assert.assertEquals("Commit should produce sequence number 2", 2, deleteSnap.sequenceNumber()); - Assert.assertEquals("Last sequence number should be 2", 2, table.ops().current().lastSequenceNumber()); + Assert.assertEquals( + "Last sequence number should be 2", 2, table.ops().current().lastSequenceNumber()); - Assert.assertEquals("Should produce 1 data manifest", 1, deleteSnap.dataManifests(table.io()).size()); + Assert.assertEquals( + "Should produce 1 data manifest", 1, deleteSnap.dataManifests(table.io()).size()); validateManifest( deleteSnap.dataManifests(table.io()).get(0), seqs(2), @@ -602,7 +643,8 @@ public void testFastAppendDoesNotRemoveStaleDeleteFiles() { files(FILE_A), statuses(Status.DELETED)); - Assert.assertEquals("Should produce 1 delete manifest", 1, deleteSnap.deleteManifests(table.io()).size()); + Assert.assertEquals( + "Should produce 1 delete manifest", 1, deleteSnap.deleteManifests(table.io()).size()); validateDeleteManifest( deleteSnap.deleteManifests(table.io()).get(0), seqs(1), @@ -610,17 +652,19 @@ public void testFastAppendDoesNotRemoveStaleDeleteFiles() { files(FILE_A_DELETES), statuses(Status.ADDED)); - // the manifest that removed FILE_A will be dropped next merging commit, but FastAppend will not remove it - table.newFastAppend() - .appendFile(FILE_B) - .commit(); + // the manifest that removed FILE_A will be dropped next merging commit, but FastAppend will not + // remove it + table.newFastAppend().appendFile(FILE_B).commit(); Snapshot nextSnap = table.currentSnapshot(); Assert.assertEquals("Append should produce sequence number 3", 3, nextSnap.sequenceNumber()); - Assert.assertEquals("Last sequence number should be 3", 3, table.ops().current().lastSequenceNumber()); + Assert.assertEquals( + "Last sequence number should be 3", 3, table.ops().current().lastSequenceNumber()); - Assert.assertEquals("Should have 2 data manifests", 2, nextSnap.dataManifests(table.io()).size()); - int deleteManifestPos = nextSnap.dataManifests(table.io()).get(0).deletedFilesCount() > 0 ? 0 : 1; + Assert.assertEquals( + "Should have 2 data manifests", 2, nextSnap.dataManifests(table.io()).size()); + int deleteManifestPos = + nextSnap.dataManifests(table.io()).get(0).deletedFilesCount() > 0 ? 
0 : 1; validateManifest( nextSnap.dataManifests(table.io()).get(deleteManifestPos), seqs(2), @@ -635,7 +679,8 @@ public void testFastAppendDoesNotRemoveStaleDeleteFiles() { files(FILE_B), statuses(Status.ADDED)); - Assert.assertEquals("Should produce 1 delete manifest", 1, nextSnap.deleteManifests(table.io()).size()); + Assert.assertEquals( + "Should produce 1 delete manifest", 1, nextSnap.deleteManifests(table.io()).size()); validateDeleteManifest( nextSnap.deleteManifests(table.io()).get(0), seqs(1), @@ -647,67 +692,70 @@ public void testFastAppendDoesNotRemoveStaleDeleteFiles() { @Test public void testValidateDataFilesExistWithConflictDetectionFilter() { // change the spec to be partitioned by data - table.updateSpec() + table + .updateSpec() .removeField(Expressions.bucket("data", 16)) .addField(Expressions.ref("data")) .commit(); // add a data file to partition A - DataFile dataFile1 = DataFiles.builder(table.spec()) - .withPath("/path/to/data-a.parquet") - .withFileSizeInBytes(10) - .withPartitionPath("data=a") - .withRecordCount(1) - .build(); - - table.newAppend() - .appendFile(dataFile1) - .commit(); + DataFile dataFile1 = + DataFiles.builder(table.spec()) + .withPath("/path/to/data-a.parquet") + .withFileSizeInBytes(10) + .withPartitionPath("data=a") + .withRecordCount(1) + .build(); + + table.newAppend().appendFile(dataFile1).commit(); // add a data file to partition B - DataFile dataFile2 = DataFiles.builder(table.spec()) - .withPath("/path/to/data-b.parquet") - .withFileSizeInBytes(10) - .withPartitionPath("data=b") - .withRecordCount(1) - .build(); - - table.newAppend() - .appendFile(dataFile2) - .commit(); + DataFile dataFile2 = + DataFiles.builder(table.spec()) + .withPath("/path/to/data-b.parquet") + .withFileSizeInBytes(10) + .withPartitionPath("data=b") + .withRecordCount(1) + .build(); + + table.newAppend().appendFile(dataFile2).commit(); // use this snapshot as the starting snapshot in rowDelta Snapshot baseSnapshot = table.currentSnapshot(); // add a delete file for partition A - DeleteFile deleteFile = FileMetadata.deleteFileBuilder(table.spec()) - .ofPositionDeletes() - .withPath("/path/to/data-a-deletes.parquet") - .withFileSizeInBytes(10) - .withPartitionPath("data=a") - .withRecordCount(1) - .build(); + DeleteFile deleteFile = + FileMetadata.deleteFileBuilder(table.spec()) + .ofPositionDeletes() + .withPath("/path/to/data-a-deletes.parquet") + .withFileSizeInBytes(10) + .withPartitionPath("data=a") + .withRecordCount(1) + .build(); Expression conflictDetectionFilter = Expressions.equal("data", "a"); - RowDelta rowDelta = table.newRowDelta() - .addDeletes(deleteFile) - .validateDataFilesExist(ImmutableList.of(dataFile1.path())) - .validateDeletedFiles() - .validateFromSnapshot(baseSnapshot.snapshotId()) - .validateNoConflictingAppends(conflictDetectionFilter); + RowDelta rowDelta = + table + .newRowDelta() + .addDeletes(deleteFile) + .validateDataFilesExist(ImmutableList.of(dataFile1.path())) + .validateDeletedFiles() + .validateFromSnapshot(baseSnapshot.snapshotId()) + .validateNoConflictingAppends(conflictDetectionFilter); // concurrently delete the file for partition B - table.newDelete() - .deleteFile(dataFile2) - .commit(); + table.newDelete().deleteFile(dataFile2).commit(); // commit the delta for partition A rowDelta.commit(); - Assert.assertEquals("Table should have one new delete manifest", - 1, table.currentSnapshot().deleteManifests(table.io()).size()); + Assert.assertEquals( + "Table should have one new delete manifest", + 1, + 
table.currentSnapshot().deleteManifests(table.io()).size()); ManifestFile deletes = table.currentSnapshot().deleteManifests(table.io()).get(0); - validateDeleteManifest(deletes, + validateDeleteManifest( + deletes, seqs(4), ids(table.currentSnapshot().snapshotId()), files(deleteFile), @@ -717,99 +765,92 @@ public void testValidateDataFilesExistWithConflictDetectionFilter() { @Test public void testValidateDataFilesDoNotExistWithConflictDetectionFilter() { // change the spec to be partitioned by data - table.updateSpec() + table + .updateSpec() .removeField(Expressions.bucket("data", 16)) .addField(Expressions.ref("data")) .commit(); // add a data file to partition A - DataFile dataFile1 = DataFiles.builder(table.spec()) - .withPath("/path/to/data-a.parquet") - .withFileSizeInBytes(10) - .withPartitionPath("data=a") - .withRecordCount(1) - .build(); - - table.newAppend() - .appendFile(dataFile1) - .commit(); + DataFile dataFile1 = + DataFiles.builder(table.spec()) + .withPath("/path/to/data-a.parquet") + .withFileSizeInBytes(10) + .withPartitionPath("data=a") + .withRecordCount(1) + .build(); + + table.newAppend().appendFile(dataFile1).commit(); // use this snapshot as the starting snapshot in rowDelta Snapshot baseSnapshot = table.currentSnapshot(); // add a delete file for partition A - DeleteFile deleteFile = FileMetadata.deleteFileBuilder(table.spec()) - .ofPositionDeletes() - .withPath("/path/to/data-a-deletes.parquet") - .withFileSizeInBytes(10) - .withPartitionPath("data=a") - .withRecordCount(1) - .build(); + DeleteFile deleteFile = + FileMetadata.deleteFileBuilder(table.spec()) + .ofPositionDeletes() + .withPath("/path/to/data-a-deletes.parquet") + .withFileSizeInBytes(10) + .withPartitionPath("data=a") + .withRecordCount(1) + .build(); Expression conflictDetectionFilter = Expressions.equal("data", "a"); - RowDelta rowDelta = table.newRowDelta() - .addDeletes(deleteFile) - .validateDataFilesExist(ImmutableList.of(dataFile1.path())) - .validateDeletedFiles() - .validateFromSnapshot(baseSnapshot.snapshotId()) - .validateNoConflictingAppends(conflictDetectionFilter); + RowDelta rowDelta = + table + .newRowDelta() + .addDeletes(deleteFile) + .validateDataFilesExist(ImmutableList.of(dataFile1.path())) + .validateDeletedFiles() + .validateFromSnapshot(baseSnapshot.snapshotId()) + .validateNoConflictingAppends(conflictDetectionFilter); // concurrently delete the file for partition A - table.newDelete() - .deleteFile(dataFile1) - .commit(); + table.newDelete().deleteFile(dataFile1).commit(); - AssertHelpers.assertThrows("Should fail to add deletes because data file is missing", - ValidationException.class, "Cannot commit, missing data files", + AssertHelpers.assertThrows( + "Should fail to add deletes because data file is missing", + ValidationException.class, + "Cannot commit, missing data files", rowDelta::commit); } @Test public void testAddDeleteFilesMultipleSpecs() { // enable partition summaries - table.updateProperties() - .set(TableProperties.WRITE_PARTITION_SUMMARY_LIMIT, "10") - .commit(); + table.updateProperties().set(TableProperties.WRITE_PARTITION_SUMMARY_LIMIT, "10").commit(); // append a partitioned data file DataFile firstSnapshotDataFile = newDataFile("data_bucket=0"); - table.newAppend() - .appendFile(firstSnapshotDataFile) - .commit(); + table.newAppend().appendFile(firstSnapshotDataFile).commit(); // remove the only partition field to make the spec unpartitioned - table.updateSpec() - .removeField(Expressions.bucket("data", 16)) - .commit(); + 
table.updateSpec().removeField(Expressions.bucket("data", 16)).commit(); Assert.assertTrue("Spec must be unpartitioned", table.spec().isUnpartitioned()); // append an unpartitioned data file DataFile secondSnapshotDataFile = newDataFile(""); - table.newAppend() - .appendFile(secondSnapshotDataFile) - .commit(); + table.newAppend().appendFile(secondSnapshotDataFile).commit(); // evolve the spec and add a new partition field - table.updateSpec() - .addField("data") - .commit(); + table.updateSpec().addField("data").commit(); // append a data file with the new spec DataFile thirdSnapshotDataFile = newDataFile("data=abc"); - table.newAppend() - .appendFile(thirdSnapshotDataFile) - .commit(); + table.newAppend().appendFile(thirdSnapshotDataFile).commit(); Assert.assertEquals("Should have 3 specs", 3, table.specs().size()); - // commit a row delta with 1 data file and 3 delete files where delete files have different specs + // commit a row delta with 1 data file and 3 delete files where delete files have different + // specs DataFile dataFile = newDataFile("data=xyz"); DeleteFile firstDeleteFile = newDeleteFile(firstSnapshotDataFile.specId(), "data_bucket=0"); DeleteFile secondDeleteFile = newDeleteFile(secondSnapshotDataFile.specId(), ""); DeleteFile thirdDeleteFile = newDeleteFile(thirdSnapshotDataFile.specId(), "data=abc"); - table.newRowDelta() + table + .newRowDelta() .addRows(dataFile) .addDeletes(firstDeleteFile) .addDeletes(secondDeleteFile) @@ -818,12 +859,15 @@ public void testAddDeleteFilesMultipleSpecs() { Snapshot snapshot = table.currentSnapshot(); Assert.assertEquals("Commit should produce sequence number 4", 4, snapshot.sequenceNumber()); - Assert.assertEquals("Last sequence number should be 4", 4, table.ops().current().lastSequenceNumber()); - Assert.assertEquals("Delta commit should be 'overwrite'", DataOperations.OVERWRITE, snapshot.operation()); + Assert.assertEquals( + "Last sequence number should be 4", 4, table.ops().current().lastSequenceNumber()); + Assert.assertEquals( + "Delta commit should be 'overwrite'", DataOperations.OVERWRITE, snapshot.operation()); Map summary = snapshot.summary(); - Assert.assertEquals("Should change 4 partitions", "4", summary.get(CHANGED_PARTITION_COUNT_PROP)); + Assert.assertEquals( + "Should change 4 partitions", "4", summary.get(CHANGED_PARTITION_COUNT_PROP)); Assert.assertEquals("Should add 1 data file", "1", summary.get(ADDED_FILES_PROP)); Assert.assertEquals("Should have 4 data files", "4", summary.get(TOTAL_DATA_FILES_PROP)); Assert.assertEquals("Should add 3 delete files", "3", summary.get(ADDED_DELETE_FILES_PROP)); @@ -831,17 +875,26 @@ public void testAddDeleteFilesMultipleSpecs() { Assert.assertEquals("Should add 3 position deletes", "3", summary.get(ADDED_POS_DELETES_PROP)); Assert.assertEquals("Should have 3 position deletes", "3", summary.get(TOTAL_POS_DELETES_PROP)); - Assert.assertTrue("Partition metrics must be correct", + Assert.assertTrue( + "Partition metrics must be correct", summary.get(CHANGED_PARTITION_PREFIX).contains(ADDED_DELETE_FILES_PROP + "=1")); - Assert.assertTrue("Partition metrics must be correct", - summary.get(CHANGED_PARTITION_PREFIX + "data_bucket=0").contains(ADDED_DELETE_FILES_PROP + "=1")); - Assert.assertTrue("Partition metrics must be correct", - summary.get(CHANGED_PARTITION_PREFIX + "data=abc").contains(ADDED_DELETE_FILES_PROP + "=1")); - Assert.assertTrue("Partition metrics must be correct", + Assert.assertTrue( + "Partition metrics must be correct", + summary + .get(CHANGED_PARTITION_PREFIX + 
"data_bucket=0") + .contains(ADDED_DELETE_FILES_PROP + "=1")); + Assert.assertTrue( + "Partition metrics must be correct", + summary + .get(CHANGED_PARTITION_PREFIX + "data=abc") + .contains(ADDED_DELETE_FILES_PROP + "=1")); + Assert.assertTrue( + "Partition metrics must be correct", summary.get(CHANGED_PARTITION_PREFIX + "data=xyz").contains(ADDED_FILES_PROP + "=1")); // 3 appends + 1 row delta - Assert.assertEquals("Should have 4 data manifest", 4, snapshot.dataManifests(table.io()).size()); + Assert.assertEquals( + "Should have 4 data manifest", 4, snapshot.dataManifests(table.io()).size()); validateManifest( snapshot.dataManifests(table.io()).get(0), seqs(4), @@ -850,10 +903,12 @@ public void testAddDeleteFilesMultipleSpecs() { statuses(Status.ADDED)); // each delete file goes into a separate manifest as the specs are different - Assert.assertEquals("Should produce 3 delete manifest", 3, snapshot.deleteManifests(table.io()).size()); + Assert.assertEquals( + "Should produce 3 delete manifest", 3, snapshot.deleteManifests(table.io()).size()); ManifestFile firstDeleteManifest = snapshot.deleteManifests(table.io()).get(2); - Assert.assertEquals("Spec must match", firstSnapshotDataFile.specId(), firstDeleteManifest.partitionSpecId()); + Assert.assertEquals( + "Spec must match", firstSnapshotDataFile.specId(), firstDeleteManifest.partitionSpecId()); validateDeleteManifest( firstDeleteManifest, seqs(4), @@ -862,7 +917,8 @@ public void testAddDeleteFilesMultipleSpecs() { statuses(Status.ADDED)); ManifestFile secondDeleteManifest = snapshot.deleteManifests(table.io()).get(1); - Assert.assertEquals("Spec must match", secondSnapshotDataFile.specId(), secondDeleteManifest.partitionSpecId()); + Assert.assertEquals( + "Spec must match", secondSnapshotDataFile.specId(), secondDeleteManifest.partitionSpecId()); validateDeleteManifest( secondDeleteManifest, seqs(4), @@ -871,7 +927,8 @@ public void testAddDeleteFilesMultipleSpecs() { statuses(Status.ADDED)); ManifestFile thirdDeleteManifest = snapshot.deleteManifests(table.io()).get(0); - Assert.assertEquals("Spec must match", thirdSnapshotDataFile.specId(), thirdDeleteManifest.partitionSpecId()); + Assert.assertEquals( + "Spec must match", thirdSnapshotDataFile.specId(), thirdDeleteManifest.partitionSpecId()); validateDeleteManifest( thirdDeleteManifest, seqs(4), @@ -883,62 +940,56 @@ public void testAddDeleteFilesMultipleSpecs() { @Test public void testManifestMergingMultipleSpecs() { // make sure we enable manifest merging - table.updateProperties() + table + .updateProperties() .set(TableProperties.MANIFEST_MERGE_ENABLED, "true") .set(TableProperties.MANIFEST_MIN_MERGE_COUNT, "2") .commit(); // append a partitioned data file DataFile firstSnapshotDataFile = newDataFile("data_bucket=0"); - table.newAppend() - .appendFile(firstSnapshotDataFile) - .commit(); + table.newAppend().appendFile(firstSnapshotDataFile).commit(); // remove the only partition field to make the spec unpartitioned - table.updateSpec() - .removeField(Expressions.bucket("data", 16)) - .commit(); + table.updateSpec().removeField(Expressions.bucket("data", 16)).commit(); Assert.assertTrue("Spec must be unpartitioned", table.spec().isUnpartitioned()); // append an unpartitioned data file DataFile secondSnapshotDataFile = newDataFile(""); - table.newAppend() - .appendFile(secondSnapshotDataFile) - .commit(); + table.newAppend().appendFile(secondSnapshotDataFile).commit(); // commit two delete files to two specs in a single operation DeleteFile firstDeleteFile = 
newDeleteFile(firstSnapshotDataFile.specId(), "data_bucket=0"); DeleteFile secondDeleteFile = newDeleteFile(secondSnapshotDataFile.specId(), ""); - table.newRowDelta() - .addDeletes(firstDeleteFile) - .addDeletes(secondDeleteFile) - .commit(); + table.newRowDelta().addDeletes(firstDeleteFile).addDeletes(secondDeleteFile).commit(); Snapshot thirdSnapshot = table.currentSnapshot(); // 2 appends and 1 row delta where delete files belong to different specs - Assert.assertEquals("Should have 2 data manifest", 2, thirdSnapshot.dataManifests(table.io()).size()); - Assert.assertEquals("Should have 2 delete manifest", 2, thirdSnapshot.deleteManifests(table.io()).size()); + Assert.assertEquals( + "Should have 2 data manifest", 2, thirdSnapshot.dataManifests(table.io()).size()); + Assert.assertEquals( + "Should have 2 delete manifest", 2, thirdSnapshot.deleteManifests(table.io()).size()); // commit two more delete files to the same specs to trigger merging DeleteFile thirdDeleteFile = newDeleteFile(firstSnapshotDataFile.specId(), "data_bucket=0"); DeleteFile fourthDeleteFile = newDeleteFile(secondSnapshotDataFile.specId(), ""); - table.newRowDelta() - .addDeletes(thirdDeleteFile) - .addDeletes(fourthDeleteFile) - .commit(); + table.newRowDelta().addDeletes(thirdDeleteFile).addDeletes(fourthDeleteFile).commit(); Snapshot fourthSnapshot = table.currentSnapshot(); // make sure merging respects spec boundaries - Assert.assertEquals("Should have 2 data manifest", 2, fourthSnapshot.dataManifests(table.io()).size()); - Assert.assertEquals("Should have 2 delete manifest", 2, fourthSnapshot.deleteManifests(table.io()).size()); + Assert.assertEquals( + "Should have 2 data manifest", 2, fourthSnapshot.dataManifests(table.io()).size()); + Assert.assertEquals( + "Should have 2 delete manifest", 2, fourthSnapshot.deleteManifests(table.io()).size()); ManifestFile firstDeleteManifest = fourthSnapshot.deleteManifests(table.io()).get(1); - Assert.assertEquals("Spec must match", firstSnapshotDataFile.specId(), firstDeleteManifest.partitionSpecId()); + Assert.assertEquals( + "Spec must match", firstSnapshotDataFile.specId(), firstDeleteManifest.partitionSpecId()); validateDeleteManifest( firstDeleteManifest, seqs(4, 3), @@ -947,7 +998,8 @@ public void testManifestMergingMultipleSpecs() { statuses(Status.ADDED, Status.EXISTING)); ManifestFile secondDeleteManifest = fourthSnapshot.deleteManifests(table.io()).get(0); - Assert.assertEquals("Spec must match", secondSnapshotDataFile.specId(), secondDeleteManifest.partitionSpecId()); + Assert.assertEquals( + "Spec must match", secondSnapshotDataFile.specId(), secondDeleteManifest.partitionSpecId()); validateDeleteManifest( secondDeleteManifest, seqs(4, 3), @@ -960,22 +1012,16 @@ public void testManifestMergingMultipleSpecs() { public void testAbortMultipleSpecs() { // append a partitioned data file DataFile firstSnapshotDataFile = newDataFile("data_bucket=0"); - table.newAppend() - .appendFile(firstSnapshotDataFile) - .commit(); + table.newAppend().appendFile(firstSnapshotDataFile).commit(); // remove the only partition field to make the spec unpartitioned - table.updateSpec() - .removeField(Expressions.bucket("data", 16)) - .commit(); + table.updateSpec().removeField(Expressions.bucket("data", 16)).commit(); Assert.assertTrue("Spec must be unpartitioned", table.spec().isUnpartitioned()); // append an unpartitioned data file DataFile secondSnapshotDataFile = newDataFile(""); - table.newAppend() - .appendFile(secondSnapshotDataFile) - .commit(); + 
table.newAppend().appendFile(secondSnapshotDataFile).commit(); // prepare two delete files that belong to different specs DeleteFile firstDeleteFile = newDeleteFile(firstSnapshotDataFile.specId(), "data_bucket=0"); @@ -984,22 +1030,24 @@ public void testAbortMultipleSpecs() { // capture all deletes Set deletedFiles = Sets.newHashSet(); - RowDelta rowDelta = table.newRowDelta() - .addDeletes(firstDeleteFile) - .addDeletes(secondDeleteFile) - .deleteWith(deletedFiles::add) - .validateDeletedFiles() - .validateDataFilesExist(ImmutableList.of(firstSnapshotDataFile.path())); + RowDelta rowDelta = + table + .newRowDelta() + .addDeletes(firstDeleteFile) + .addDeletes(secondDeleteFile) + .deleteWith(deletedFiles::add) + .validateDeletedFiles() + .validateDataFilesExist(ImmutableList.of(firstSnapshotDataFile.path())); rowDelta.apply(); // perform a conflicting concurrent operation - table.newDelete() - .deleteFile(firstSnapshotDataFile) - .commit(); + table.newDelete().deleteFile(firstSnapshotDataFile).commit(); - AssertHelpers.assertThrows("Should fail to commit row delta", - ValidationException.class, "Cannot commit, missing data files", + AssertHelpers.assertThrows( + "Should fail to commit row delta", + ValidationException.class, + "Cannot commit, missing data files", rowDelta::commit); // we should clean up 1 manifest list and 2 delete manifests @@ -1008,126 +1056,136 @@ public void testAbortMultipleSpecs() { @Test public void testConcurrentConflictingRowDelta() { - table.newAppend() - .appendFile(FILE_A) - .commit(); + table.newAppend().appendFile(FILE_A).commit(); Snapshot firstSnapshot = table.currentSnapshot(); Expression conflictDetectionFilter = Expressions.alwaysTrue(); // mock a MERGE operation with serializable isolation - RowDelta rowDelta = table.newRowDelta() - .addRows(FILE_B) - .addDeletes(FILE_A_DELETES) - .validateFromSnapshot(firstSnapshot.snapshotId()) - .conflictDetectionFilter(conflictDetectionFilter) - .validateNoConflictingDataFiles() - .validateNoConflictingDeleteFiles(); + RowDelta rowDelta = + table + .newRowDelta() + .addRows(FILE_B) + .addDeletes(FILE_A_DELETES) + .validateFromSnapshot(firstSnapshot.snapshotId()) + .conflictDetectionFilter(conflictDetectionFilter) + .validateNoConflictingDataFiles() + .validateNoConflictingDeleteFiles(); - table.newRowDelta() + table + .newRowDelta() .addDeletes(FILE_A_DELETES) .validateFromSnapshot(firstSnapshot.snapshotId()) .validateNoConflictingAppends(conflictDetectionFilter) .commit(); - AssertHelpers.assertThrows("Should reject commit", - ValidationException.class, "Found new conflicting delete files", + AssertHelpers.assertThrows( + "Should reject commit", + ValidationException.class, + "Found new conflicting delete files", rowDelta::commit); } @Test public void testConcurrentConflictingRowDeltaWithoutAppendValidation() { - table.newAppend() - .appendFile(FILE_A) - .commit(); + table.newAppend().appendFile(FILE_A).commit(); Snapshot firstSnapshot = table.currentSnapshot(); Expression conflictDetectionFilter = Expressions.alwaysTrue(); // mock a MERGE operation with snapshot isolation (i.e. 
no append validation) - RowDelta rowDelta = table.newRowDelta() - .addDeletes(FILE_A_DELETES) - .validateFromSnapshot(firstSnapshot.snapshotId()) - .conflictDetectionFilter(conflictDetectionFilter) - .validateNoConflictingDeleteFiles(); + RowDelta rowDelta = + table + .newRowDelta() + .addDeletes(FILE_A_DELETES) + .validateFromSnapshot(firstSnapshot.snapshotId()) + .conflictDetectionFilter(conflictDetectionFilter) + .validateNoConflictingDeleteFiles(); - table.newRowDelta() + table + .newRowDelta() .addDeletes(FILE_A_DELETES) .validateFromSnapshot(firstSnapshot.snapshotId()) .conflictDetectionFilter(conflictDetectionFilter) .validateNoConflictingDataFiles() .commit(); - AssertHelpers.assertThrows("Should reject commit", - ValidationException.class, "Found new conflicting delete files", + AssertHelpers.assertThrows( + "Should reject commit", + ValidationException.class, + "Found new conflicting delete files", rowDelta::commit); } @Test public void testConcurrentNonConflictingRowDelta() { // change the spec to be partitioned by data - table.updateSpec() + table + .updateSpec() .removeField(Expressions.bucket("data", 16)) .addField(Expressions.ref("data")) .commit(); // add a data file to partition A - DataFile dataFile1 = DataFiles.builder(table.spec()) - .withPath("/path/to/data-a.parquet") - .withFileSizeInBytes(10) - .withPartitionPath("data=a") - .withRecordCount(1) - .build(); - - table.newAppend() - .appendFile(dataFile1) - .commit(); + DataFile dataFile1 = + DataFiles.builder(table.spec()) + .withPath("/path/to/data-a.parquet") + .withFileSizeInBytes(10) + .withPartitionPath("data=a") + .withRecordCount(1) + .build(); + + table.newAppend().appendFile(dataFile1).commit(); // add a data file to partition B - DataFile dataFile2 = DataFiles.builder(table.spec()) - .withPath("/path/to/data-b.parquet") - .withFileSizeInBytes(10) - .withPartitionPath("data=b") - .withRecordCount(1) - .build(); - - table.newAppend() - .appendFile(dataFile2) - .commit(); + DataFile dataFile2 = + DataFiles.builder(table.spec()) + .withPath("/path/to/data-b.parquet") + .withFileSizeInBytes(10) + .withPartitionPath("data=b") + .withRecordCount(1) + .build(); + + table.newAppend().appendFile(dataFile2).commit(); Snapshot baseSnapshot = table.currentSnapshot(); Expression conflictDetectionFilter = Expressions.equal("data", "a"); // add a delete file for partition A - DeleteFile deleteFile1 = FileMetadata.deleteFileBuilder(table.spec()) - .ofPositionDeletes() - .withPath("/path/to/data-a-deletes.parquet") - .withFileSizeInBytes(10) - .withPartitionPath("data=a") - .withRecordCount(1) - .build(); + DeleteFile deleteFile1 = + FileMetadata.deleteFileBuilder(table.spec()) + .ofPositionDeletes() + .withPath("/path/to/data-a-deletes.parquet") + .withFileSizeInBytes(10) + .withPartitionPath("data=a") + .withRecordCount(1) + .build(); // mock a DELETE operation with serializable isolation - RowDelta rowDelta = table.newRowDelta() - .addDeletes(deleteFile1) - .validateFromSnapshot(baseSnapshot.snapshotId()) - .conflictDetectionFilter(conflictDetectionFilter) - .validateNoConflictingDataFiles() - .validateNoConflictingDeleteFiles(); + RowDelta rowDelta = + table + .newRowDelta() + .addDeletes(deleteFile1) + .validateFromSnapshot(baseSnapshot.snapshotId()) + .conflictDetectionFilter(conflictDetectionFilter) + .validateNoConflictingDataFiles() + .validateNoConflictingDeleteFiles(); // add a delete file for partition B - DeleteFile deleteFile2 = FileMetadata.deleteFileBuilder(table.spec()) - .ofPositionDeletes() - 
.withPath("/path/to/data-b-deletes.parquet") - .withFileSizeInBytes(10) - .withPartitionPath("data=b") - .withRecordCount(1) - .build(); - - table.newRowDelta() + DeleteFile deleteFile2 = + FileMetadata.deleteFileBuilder(table.spec()) + .ofPositionDeletes() + .withPath("/path/to/data-b-deletes.parquet") + .withFileSizeInBytes(10) + .withPartitionPath("data=b") + .withRecordCount(1) + .build(); + + table + .newRowDelta() .addDeletes(deleteFile2) .validateFromSnapshot(baseSnapshot.snapshotId()) .commit(); @@ -1140,7 +1198,8 @@ public void testConcurrentNonConflictingRowDelta() { @Test public void testConcurrentNonConflictingRowDeltaAndRewriteFilesWithSequenceNumber() { // change the spec to be partitioned by data - table.updateSpec() + table + .updateSpec() .removeField(Expressions.bucket("data", 16)) .addField(Expressions.ref("data")) .commit(); @@ -1148,29 +1207,35 @@ public void testConcurrentNonConflictingRowDeltaAndRewriteFilesWithSequenceNumbe // add a data file to partition A DataFile dataFile1 = newDataFile("data=a"); - table.newAppend() - .appendFile(dataFile1) - .commit(); + table.newAppend().appendFile(dataFile1).commit(); Snapshot baseSnapshot = table.currentSnapshot(); // add an equality delete file - DeleteFile deleteFile1 = newEqualityDeleteFile(table.spec().specId(), "data=a", - table.schema().asStruct().fields().get(0).fieldId()); + DeleteFile deleteFile1 = + newEqualityDeleteFile( + table.spec().specId(), "data=a", table.schema().asStruct().fields().get(0).fieldId()); // mock a DELETE operation with serializable isolation - RowDelta rowDelta = table.newRowDelta() - .addDeletes(deleteFile1) - .validateFromSnapshot(baseSnapshot.snapshotId()) - .validateNoConflictingDataFiles() - .validateNoConflictingDeleteFiles(); + RowDelta rowDelta = + table + .newRowDelta() + .addDeletes(deleteFile1) + .validateFromSnapshot(baseSnapshot.snapshotId()) + .validateNoConflictingDataFiles() + .validateNoConflictingDeleteFiles(); // mock a REWRITE operation with serializable isolation DataFile dataFile2 = newDataFile("data=a"); - RewriteFiles rewriteFiles = table.newRewrite() - .rewriteFiles(ImmutableSet.of(dataFile1), ImmutableSet.of(dataFile2), baseSnapshot.sequenceNumber()) - .validateFromSnapshot(baseSnapshot.snapshotId()); + RewriteFiles rewriteFiles = + table + .newRewrite() + .rewriteFiles( + ImmutableSet.of(dataFile1), + ImmutableSet.of(dataFile2), + baseSnapshot.sequenceNumber()) + .validateFromSnapshot(baseSnapshot.snapshotId()); rowDelta.commit(); rewriteFiles.commit(); @@ -1183,7 +1248,8 @@ public void testConcurrentNonConflictingRowDeltaAndRewriteFilesWithSequenceNumbe public void testRowDeltaAndRewriteFilesMergeManifestsWithSequenceNumber() { table.updateProperties().set(TableProperties.MANIFEST_MIN_MERGE_COUNT, "1").commit(); // change the spec to be partitioned by data - table.updateSpec() + table + .updateSpec() .removeField(Expressions.bucket("data", 16)) .addField(Expressions.ref("data")) .commit(); @@ -1191,29 +1257,35 @@ public void testRowDeltaAndRewriteFilesMergeManifestsWithSequenceNumber() { // add a data file to partition A DataFile dataFile1 = newDataFile("data=a"); - table.newAppend() - .appendFile(dataFile1) - .commit(); + table.newAppend().appendFile(dataFile1).commit(); Snapshot baseSnapshot = table.currentSnapshot(); // add an equality delete file - DeleteFile deleteFile1 = newEqualityDeleteFile(table.spec().specId(), "data=a", - table.schema().asStruct().fields().get(0).fieldId()); + DeleteFile deleteFile1 = + newEqualityDeleteFile( + table.spec().specId(), 
"data=a", table.schema().asStruct().fields().get(0).fieldId()); // mock a DELETE operation with serializable isolation - RowDelta rowDelta = table.newRowDelta() - .addDeletes(deleteFile1) - .validateFromSnapshot(baseSnapshot.snapshotId()) - .validateNoConflictingDataFiles() - .validateNoConflictingDeleteFiles(); + RowDelta rowDelta = + table + .newRowDelta() + .addDeletes(deleteFile1) + .validateFromSnapshot(baseSnapshot.snapshotId()) + .validateNoConflictingDataFiles() + .validateNoConflictingDeleteFiles(); // mock a REWRITE operation with serializable isolation DataFile dataFile2 = newDataFile("data=a"); - RewriteFiles rewriteFiles = table.newRewrite() - .rewriteFiles(ImmutableSet.of(dataFile1), ImmutableSet.of(dataFile2), baseSnapshot.sequenceNumber()) - .validateFromSnapshot(baseSnapshot.snapshotId()); + RewriteFiles rewriteFiles = + table + .newRewrite() + .rewriteFiles( + ImmutableSet.of(dataFile1), + ImmutableSet.of(dataFile2), + baseSnapshot.sequenceNumber()) + .validateFromSnapshot(baseSnapshot.snapshotId()); rowDelta.commit(); rewriteFiles.commit(); @@ -1225,15 +1297,19 @@ public void testRowDeltaAndRewriteFilesMergeManifestsWithSequenceNumber() { long currentSnapshotId = table.currentSnapshot().snapshotId(); - validateManifest(mergedDataManifest, seqs(1, 3), - ids(currentSnapshotId, currentSnapshotId), files(dataFile2, dataFile1), + validateManifest( + mergedDataManifest, + seqs(1, 3), + ids(currentSnapshotId, currentSnapshotId), + files(dataFile2, dataFile1), statuses(Status.ADDED, Status.DELETED)); } @Test public void testConcurrentConflictingRowDeltaAndRewriteFilesWithSequenceNumber() { // change the spec to be partitioned by data - table.updateSpec() + table + .updateSpec() .removeField(Expressions.bucket("data", 16)) .addField(Expressions.ref("data")) .commit(); @@ -1241,9 +1317,7 @@ public void testConcurrentConflictingRowDeltaAndRewriteFilesWithSequenceNumber() // add a data file to partition A DataFile dataFile1 = newDataFile("data=a"); - table.newAppend() - .appendFile(dataFile1) - .commit(); + table.newAppend().appendFile(dataFile1).commit(); Snapshot baseSnapshot = table.currentSnapshot(); @@ -1251,22 +1325,30 @@ public void testConcurrentConflictingRowDeltaAndRewriteFilesWithSequenceNumber() DeleteFile deleteFile1 = newDeleteFile(table.spec().specId(), "data=a"); // mock a DELETE operation with serializable isolation - RowDelta rowDelta = table.newRowDelta() - .addDeletes(deleteFile1) - .validateFromSnapshot(baseSnapshot.snapshotId()) - .validateNoConflictingDataFiles() - .validateNoConflictingDeleteFiles(); + RowDelta rowDelta = + table + .newRowDelta() + .addDeletes(deleteFile1) + .validateFromSnapshot(baseSnapshot.snapshotId()) + .validateNoConflictingDataFiles() + .validateNoConflictingDeleteFiles(); // mock a REWRITE operation with serializable isolation DataFile dataFile2 = newDataFile("data=a"); - RewriteFiles rewriteFiles = table.newRewrite() - .rewriteFiles(ImmutableSet.of(dataFile1), ImmutableSet.of(dataFile2), baseSnapshot.sequenceNumber()) - .validateFromSnapshot(baseSnapshot.snapshotId()); + RewriteFiles rewriteFiles = + table + .newRewrite() + .rewriteFiles( + ImmutableSet.of(dataFile1), + ImmutableSet.of(dataFile2), + baseSnapshot.sequenceNumber()) + .validateFromSnapshot(baseSnapshot.snapshotId()); rowDelta.commit(); - AssertHelpers.assertThrows("Should not allow any new position delete associated with the data file", + AssertHelpers.assertThrows( + "Should not allow any new position delete associated with the data file", ValidationException.class, 
"Cannot commit, found new position delete for replaced data file", rewriteFiles::commit); @@ -1274,53 +1356,60 @@ public void testConcurrentConflictingRowDeltaAndRewriteFilesWithSequenceNumber() @Test public void testRowDeltaCaseSensitivity() { - table.newAppend() - .appendFile(FILE_A) - .appendFile(FILE_A2) - .commit(); + table.newAppend().appendFile(FILE_A).appendFile(FILE_A2).commit(); Snapshot firstSnapshot = table.currentSnapshot(); - table.newRowDelta() - .addDeletes(FILE_A_DELETES) - .commit(); + table.newRowDelta().addDeletes(FILE_A_DELETES).commit(); Expression conflictDetectionFilter = Expressions.equal(Expressions.bucket("dAtA", 16), 0); - AssertHelpers.assertThrows("Should use case sensitive binding by default", - ValidationException.class, "Cannot find field 'dAtA'", - () -> table.newRowDelta() - .addRows(FILE_B) - .addDeletes(FILE_A2_DELETES) - .validateFromSnapshot(firstSnapshot.snapshotId()) - .conflictDetectionFilter(conflictDetectionFilter) - .validateNoConflictingDataFiles() - .validateNoConflictingDeleteFiles() - .commit()); - - AssertHelpers.assertThrows("Should fail with case sensitive binding", - ValidationException.class, "Cannot find field 'dAtA'", - () -> table.newRowDelta() - .caseSensitive(true) - .addRows(FILE_B) - .addDeletes(FILE_A2_DELETES) - .validateFromSnapshot(firstSnapshot.snapshotId()) - .conflictDetectionFilter(conflictDetectionFilter) - .validateNoConflictingDataFiles() - .validateNoConflictingDeleteFiles() - .commit()); + AssertHelpers.assertThrows( + "Should use case sensitive binding by default", + ValidationException.class, + "Cannot find field 'dAtA'", + () -> + table + .newRowDelta() + .addRows(FILE_B) + .addDeletes(FILE_A2_DELETES) + .validateFromSnapshot(firstSnapshot.snapshotId()) + .conflictDetectionFilter(conflictDetectionFilter) + .validateNoConflictingDataFiles() + .validateNoConflictingDeleteFiles() + .commit()); + + AssertHelpers.assertThrows( + "Should fail with case sensitive binding", + ValidationException.class, + "Cannot find field 'dAtA'", + () -> + table + .newRowDelta() + .caseSensitive(true) + .addRows(FILE_B) + .addDeletes(FILE_A2_DELETES) + .validateFromSnapshot(firstSnapshot.snapshotId()) + .conflictDetectionFilter(conflictDetectionFilter) + .validateNoConflictingDataFiles() + .validateNoConflictingDeleteFiles() + .commit()); // binding should succeed and trigger the validation - AssertHelpers.assertThrows("Should reject case sensitive binding", - ValidationException.class, "Found new conflicting delete files", - () -> table.newRowDelta() - .caseSensitive(false) - .addRows(FILE_B) - .addDeletes(FILE_A2_DELETES) - .validateFromSnapshot(firstSnapshot.snapshotId()) - .conflictDetectionFilter(conflictDetectionFilter) - .validateNoConflictingDataFiles() - .validateNoConflictingDeleteFiles() - .commit()); + AssertHelpers.assertThrows( + "Should reject case sensitive binding", + ValidationException.class, + "Found new conflicting delete files", + () -> + table + .newRowDelta() + .caseSensitive(false) + .addRows(FILE_B) + .addDeletes(FILE_A2_DELETES) + .validateFromSnapshot(firstSnapshot.snapshotId()) + .conflictDetectionFilter(conflictDetectionFilter) + .validateNoConflictingDataFiles() + .validateNoConflictingDeleteFiles() + .commit()); } } diff --git a/core/src/test/java/org/apache/iceberg/TestScanDataFileColumns.java b/core/src/test/java/org/apache/iceberg/TestScanDataFileColumns.java index bb021f463a28..cfe5796c2695 100644 --- a/core/src/test/java/org/apache/iceberg/TestScanDataFileColumns.java +++ 
b/core/src/test/java/org/apache/iceberg/TestScanDataFileColumns.java @@ -16,9 +16,11 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg; +import static org.apache.iceberg.types.Types.NestedField.optional; +import static org.apache.iceberg.types.Types.NestedField.required; + import java.io.File; import java.io.IOException; import java.nio.ByteBuffer; @@ -33,19 +35,15 @@ import org.junit.Test; import org.junit.rules.TemporaryFolder; -import static org.apache.iceberg.types.Types.NestedField.optional; -import static org.apache.iceberg.types.Types.NestedField.required; - public class TestScanDataFileColumns { - private static final Schema SCHEMA = new Schema( - required(1, "id", Types.LongType.get()), - optional(2, "data", Types.StringType.get())); + private static final Schema SCHEMA = + new Schema( + required(1, "id", Types.LongType.get()), optional(2, "data", Types.StringType.get())); private static final Configuration CONF = new Configuration(); private static final Tables TABLES = new HadoopTables(CONF); - @Rule - public final TemporaryFolder temp = new TemporaryFolder(); + @Rule public final TemporaryFolder temp = new TemporaryFolder(); private String tableLocation = null; private Table table = null; @@ -55,46 +53,58 @@ public void createTables() throws IOException { File location = temp.newFolder("shared"); Assert.assertTrue(location.delete()); this.tableLocation = location.toString(); - this.table = TABLES.create( - SCHEMA, PartitionSpec.unpartitioned(), - ImmutableMap.of(TableProperties.DEFAULT_FILE_FORMAT, FileFormat.PARQUET.name()), - tableLocation); + this.table = + TABLES.create( + SCHEMA, + PartitionSpec.unpartitioned(), + ImmutableMap.of(TableProperties.DEFAULT_FILE_FORMAT, FileFormat.PARQUET.name()), + tableLocation); // commit the test data - table.newAppend() - .appendFile(DataFiles.builder(PartitionSpec.unpartitioned()) - .withPath("file1.parquet") - .withFileSizeInBytes(100) - .withMetrics(new Metrics(3L, - ImmutableMap.of(1, 50L), // column size - ImmutableMap.of(1, 3L), // value count - ImmutableMap.of(1, 0L), // null count - null, - ImmutableMap.of(1, longToBuffer(0L)), // lower bounds - ImmutableMap.of(1, longToBuffer(2L)))) // upper bounds) - .build()) - .appendFile(DataFiles.builder(PartitionSpec.unpartitioned()) - .withPath("file2.parquet") - .withFileSizeInBytes(100) - .withMetrics(new Metrics(3L, - ImmutableMap.of(1, 60L), // column size - ImmutableMap.of(1, 3L), // value count - ImmutableMap.of(1, 0L), // null count - null, - ImmutableMap.of(1, longToBuffer(10L)), // lower bounds - ImmutableMap.of(1, longToBuffer(12L)))) // upper bounds) - .build()) - .appendFile(DataFiles.builder(PartitionSpec.unpartitioned()) - .withPath("file3.parquet") - .withFileSizeInBytes(100) - .withMetrics(new Metrics(3L, - ImmutableMap.of(1, 70L), // column size - ImmutableMap.of(1, 3L), // value count - ImmutableMap.of(1, 0L), // null count - null, - ImmutableMap.of(1, longToBuffer(20L)), // lower bounds - ImmutableMap.of(1, longToBuffer(22L)))) // upper bounds) - .build()) + table + .newAppend() + .appendFile( + DataFiles.builder(PartitionSpec.unpartitioned()) + .withPath("file1.parquet") + .withFileSizeInBytes(100) + .withMetrics( + new Metrics( + 3L, + ImmutableMap.of(1, 50L), // column size + ImmutableMap.of(1, 3L), // value count + ImmutableMap.of(1, 0L), // null count + null, + ImmutableMap.of(1, longToBuffer(0L)), // lower bounds + ImmutableMap.of(1, longToBuffer(2L)))) // upper bounds) + .build()) + .appendFile( + 
DataFiles.builder(PartitionSpec.unpartitioned()) + .withPath("file2.parquet") + .withFileSizeInBytes(100) + .withMetrics( + new Metrics( + 3L, + ImmutableMap.of(1, 60L), // column size + ImmutableMap.of(1, 3L), // value count + ImmutableMap.of(1, 0L), // null count + null, + ImmutableMap.of(1, longToBuffer(10L)), // lower bounds + ImmutableMap.of(1, longToBuffer(12L)))) // upper bounds) + .build()) + .appendFile( + DataFiles.builder(PartitionSpec.unpartitioned()) + .withPath("file3.parquet") + .withFileSizeInBytes(100) + .withMetrics( + new Metrics( + 3L, + ImmutableMap.of(1, 70L), // column size + ImmutableMap.of(1, 3L), // value count + ImmutableMap.of(1, 0L), // null count + null, + ImmutableMap.of(1, longToBuffer(20L)), // lower bounds + ImmutableMap.of(1, longToBuffer(22L)))) // upper bounds) + .build()) .commit(); } diff --git a/core/src/test/java/org/apache/iceberg/TestScanSummary.java b/core/src/test/java/org/apache/iceberg/TestScanSummary.java index 8441556bf634..5f54eecc2638 100644 --- a/core/src/test/java/org/apache/iceberg/TestScanSummary.java +++ b/core/src/test/java/org/apache/iceberg/TestScanSummary.java @@ -16,17 +16,8 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg; -import org.apache.iceberg.relocated.com.google.common.collect.ImmutableList; -import org.apache.iceberg.relocated.com.google.common.collect.Lists; -import org.apache.iceberg.util.Pair; -import org.junit.Assert; -import org.junit.Test; -import org.junit.runner.RunWith; -import org.junit.runners.Parameterized; - import static org.apache.iceberg.ScanSummary.timestampRange; import static org.apache.iceberg.ScanSummary.toMillis; import static org.apache.iceberg.expressions.Expressions.equal; @@ -35,11 +26,19 @@ import static org.apache.iceberg.expressions.Expressions.lessThan; import static org.apache.iceberg.expressions.Expressions.lessThanOrEqual; +import org.apache.iceberg.relocated.com.google.common.collect.ImmutableList; +import org.apache.iceberg.relocated.com.google.common.collect.Lists; +import org.apache.iceberg.util.Pair; +import org.junit.Assert; +import org.junit.Test; +import org.junit.runner.RunWith; +import org.junit.runners.Parameterized; + @RunWith(Parameterized.class) public class TestScanSummary extends TableTestBase { @Parameterized.Parameters(name = "formatVersion = {0}") public static Object[] parameters() { - return new Object[] { 1, 2 }; + return new Object[] {1, 2}; } public TestScanSummary(int formatVersion) { @@ -50,7 +49,8 @@ public TestScanSummary(int formatVersion) { public void testSnapshotTimeRangeValidation() { long t0 = System.currentTimeMillis(); - table.newAppend() + table + .newAppend() .appendFile(FILE_A) // data_bucket=0 .appendFile(FILE_B) // data_bucket=1 .commit(); @@ -60,7 +60,8 @@ public void testSnapshotTimeRangeValidation() { t1 = System.currentTimeMillis(); } - table.newAppend() + table + .newAppend() .appendFile(FILE_C) // data_bucket=2 .commit(); @@ -72,22 +73,26 @@ public void testSnapshotTimeRangeValidation() { } // expire the first snapshot - table.expireSnapshots() - .expireOlderThan(t1) - .commit(); + table.expireSnapshots().expireOlderThan(t1).commit(); - Assert.assertEquals("Should have one snapshot", - 1, Lists.newArrayList(table.snapshots()).size()); - Assert.assertEquals("Snapshot should be the second snapshot created", - secondSnapshotId, table.currentSnapshot().snapshotId()); + Assert.assertEquals( + "Should have one snapshot", 1, Lists.newArrayList(table.snapshots()).size()); + 
Assert.assertEquals( + "Snapshot should be the second snapshot created", + secondSnapshotId, + table.currentSnapshot().snapshotId()); // this should include the first snapshot, but it was removed from the dataset - TableScan scan = table.newScan() - .filter(greaterThanOrEqual("dateCreated", t0)) - .filter(lessThan("dateCreated", t2)); - - AssertHelpers.assertThrows("Should fail summary because range may include expired snapshots", - IllegalArgumentException.class, "may include expired snapshots", + TableScan scan = + table + .newScan() + .filter(greaterThanOrEqual("dateCreated", t0)) + .filter(lessThan("dateCreated", t2)); + + AssertHelpers.assertThrows( + "Should fail summary because range may include expired snapshots", + IllegalArgumentException.class, + "may include expired snapshots", () -> new ScanSummary.Builder(scan).build()); } @@ -96,50 +101,59 @@ public void testTimestampRanges() { long lower = 1542750188523L; long upper = 1542750695131L; - Assert.assertEquals("Should use inclusive bound", + Assert.assertEquals( + "Should use inclusive bound", Pair.of(Long.MIN_VALUE, upper), timestampRange(ImmutableList.of(lessThanOrEqual("ts_ms", upper)))); - Assert.assertEquals("Should use lower value for upper bound", + Assert.assertEquals( + "Should use lower value for upper bound", Pair.of(Long.MIN_VALUE, upper), - timestampRange(ImmutableList.of( - lessThanOrEqual("ts_ms", upper + 918234), - lessThanOrEqual("ts_ms", upper)))); + timestampRange( + ImmutableList.of( + lessThanOrEqual("ts_ms", upper + 918234), lessThanOrEqual("ts_ms", upper)))); - Assert.assertEquals("Should make upper bound inclusive", + Assert.assertEquals( + "Should make upper bound inclusive", Pair.of(Long.MIN_VALUE, upper - 1), timestampRange(ImmutableList.of(lessThan("ts_ms", upper)))); - Assert.assertEquals("Should use inclusive bound", + Assert.assertEquals( + "Should use inclusive bound", Pair.of(lower, Long.MAX_VALUE), timestampRange(ImmutableList.of(greaterThanOrEqual("ts_ms", lower)))); - Assert.assertEquals("Should use upper value for lower bound", + Assert.assertEquals( + "Should use upper value for lower bound", Pair.of(lower, Long.MAX_VALUE), - timestampRange(ImmutableList.of( - greaterThanOrEqual("ts_ms", lower - 918234), - greaterThanOrEqual("ts_ms", lower)))); + timestampRange( + ImmutableList.of( + greaterThanOrEqual("ts_ms", lower - 918234), greaterThanOrEqual("ts_ms", lower)))); - Assert.assertEquals("Should make lower bound inclusive", + Assert.assertEquals( + "Should make lower bound inclusive", Pair.of(lower + 1, Long.MAX_VALUE), timestampRange(ImmutableList.of(greaterThan("ts_ms", lower)))); - Assert.assertEquals("Should set both bounds for equals", + Assert.assertEquals( + "Should set both bounds for equals", Pair.of(lower, lower), timestampRange(ImmutableList.of(equal("ts_ms", lower)))); - Assert.assertEquals("Should set both bounds", + Assert.assertEquals( + "Should set both bounds", Pair.of(lower, upper - 1), - timestampRange(ImmutableList.of( - greaterThanOrEqual("ts_ms", lower), - lessThan("ts_ms", upper)))); + timestampRange( + ImmutableList.of(greaterThanOrEqual("ts_ms", lower), lessThan("ts_ms", upper)))); // >= lower and < lower is an empty range - AssertHelpers.assertThrows("Should reject empty ranges", - IllegalArgumentException.class, "No timestamps can match filters", - () -> timestampRange(ImmutableList.of( - greaterThanOrEqual("ts_ms", lower), - lessThan("ts_ms", lower)))); + AssertHelpers.assertThrows( + "Should reject empty ranges", + IllegalArgumentException.class, + "No timestamps 
can match filters", + () -> + timestampRange( + ImmutableList.of(greaterThanOrEqual("ts_ms", lower), lessThan("ts_ms", lower)))); } @Test diff --git a/core/src/test/java/org/apache/iceberg/TestScansAndSchemaEvolution.java b/core/src/test/java/org/apache/iceberg/TestScansAndSchemaEvolution.java index 5a60ece7b0a3..00b18f1f244e 100644 --- a/core/src/test/java/org/apache/iceberg/TestScansAndSchemaEvolution.java +++ b/core/src/test/java/org/apache/iceberg/TestScansAndSchemaEvolution.java @@ -16,9 +16,10 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg; +import static org.apache.iceberg.types.Types.NestedField.required; + import java.io.File; import java.io.IOException; import java.util.List; @@ -40,22 +41,20 @@ import org.junit.runner.RunWith; import org.junit.runners.Parameterized; -import static org.apache.iceberg.types.Types.NestedField.required; - @RunWith(Parameterized.class) public class TestScansAndSchemaEvolution { - private static final Schema SCHEMA = new Schema( - required(1, "id", Types.LongType.get()), - required(2, "data", Types.StringType.get()), - required(3, "part", Types.StringType.get())); + private static final Schema SCHEMA = + new Schema( + required(1, "id", Types.LongType.get()), + required(2, "data", Types.StringType.get()), + required(3, "part", Types.StringType.get())); - private static final PartitionSpec SPEC = PartitionSpec.builderFor(SCHEMA) - .identity("part") - .build(); + private static final PartitionSpec SPEC = + PartitionSpec.builderFor(SCHEMA).identity("part").build(); @Parameterized.Parameters(name = "formatVersion = {0}") public static Object[] parameters() { - return new Object[] { 1, 2 }; + return new Object[] {1, 2}; } public final int formatVersion; @@ -64,17 +63,15 @@ public TestScansAndSchemaEvolution(int formatVersion) { this.formatVersion = formatVersion; } - @Rule - public TemporaryFolder temp = new TemporaryFolder(); + @Rule public TemporaryFolder temp = new TemporaryFolder(); private DataFile createDataFile(String partValue) throws IOException { List expected = RandomAvroData.generate(SCHEMA, 100, 0L); - OutputFile dataFile = new InMemoryOutputFile(FileFormat.AVRO.addExtension(UUID.randomUUID().toString())); - try (FileAppender writer = Avro.write(dataFile) - .schema(SCHEMA) - .named("test") - .build()) { + OutputFile dataFile = + new InMemoryOutputFile(FileFormat.AVRO.addExtension(UUID.randomUUID().toString())); + try (FileAppender writer = + Avro.write(dataFile).schema(SCHEMA).named("test").build()) { for (GenericData.Record rec : expected) { rec.put("part", partValue); // create just one partition writer.add(rec); @@ -105,23 +102,17 @@ public void testPartitionSourceRename() throws IOException { DataFile fileOne = createDataFile("one"); DataFile fileTwo = createDataFile("two"); - table.newAppend() - .appendFile(fileOne) - .appendFile(fileTwo) - .commit(); + table.newAppend().appendFile(fileOne).appendFile(fileTwo).commit(); - List tasks = Lists.newArrayList( - table.newScan().filter(Expressions.equal("part", "one")).planFiles()); + List tasks = + Lists.newArrayList(table.newScan().filter(Expressions.equal("part", "one")).planFiles()); Assert.assertEquals("Should produce 1 matching file task", 1, tasks.size()); - table.updateSchema() - .renameColumn("part", "p") - .commit(); + table.updateSchema().renameColumn("part", "p").commit(); // plan the scan using the new name in a filter - tasks = Lists.newArrayList( - table.newScan().filter(Expressions.equal("p", "one")).planFiles()); 
+ tasks = Lists.newArrayList(table.newScan().filter(Expressions.equal("p", "one")).planFiles()); Assert.assertEquals("Should produce 1 matching file task", 1, tasks.size()); } diff --git a/core/src/test/java/org/apache/iceberg/TestSchemaAndMappingUpdate.java b/core/src/test/java/org/apache/iceberg/TestSchemaAndMappingUpdate.java index eec1c9bf6fda..541de1b51d00 100644 --- a/core/src/test/java/org/apache/iceberg/TestSchemaAndMappingUpdate.java +++ b/core/src/test/java/org/apache/iceberg/TestSchemaAndMappingUpdate.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg; import java.util.Objects; @@ -40,7 +39,7 @@ public class TestSchemaAndMappingUpdate extends TableTestBase { @Parameterized.Parameters(name = "formatVersion = {0}") public static Object[] parameters() { - return new Object[] { 1, 2 }; + return new Object[] {1, 2}; } public TestSchemaAndMappingUpdate(int formatVersion) { @@ -52,13 +51,9 @@ public void testAddPrimitiveColumn() { NameMapping mapping = MappingUtil.create(table.schema()); String mappingJson = NameMappingParser.toJson(mapping); - table.updateProperties() - .set(TableProperties.DEFAULT_NAME_MAPPING, mappingJson) - .commit(); + table.updateProperties().set(TableProperties.DEFAULT_NAME_MAPPING, mappingJson).commit(); - table.updateSchema() - .addColumn("count", Types.LongType.get()) - .commit(); + table.updateSchema().addColumn("count", Types.LongType.get()).commit(); String updatedJson = table.properties().get(TableProperties.DEFAULT_NAME_MAPPING); NameMapping updated = NameMappingParser.fromJson(updatedJson); @@ -67,8 +62,10 @@ public void testAddPrimitiveColumn() { MappedField newMapping = updated.find("count"); Assert.assertNotNull("Mapping for new column should be added", newMapping); - Assert.assertEquals("Mapping should use the assigned field ID", - (Integer) table.schema().findField("count").fieldId(), updated.find("count").id()); + Assert.assertEquals( + "Mapping should use the assigned field ID", + (Integer) table.schema().findField("count").fieldId(), + updated.find("count").id()); Assert.assertNull("Should not contain a nested mapping", updated.find("count").nestedMapping()); } @@ -77,14 +74,15 @@ public void testAddStructColumn() { NameMapping mapping = MappingUtil.create(table.schema()); String mappingJson = NameMappingParser.toJson(mapping); - table.updateProperties() - .set(TableProperties.DEFAULT_NAME_MAPPING, mappingJson) - .commit(); + table.updateProperties().set(TableProperties.DEFAULT_NAME_MAPPING, mappingJson).commit(); - table.updateSchema() - .addColumn("location", Types.StructType.of( - Types.NestedField.optional(1, "lat", Types.DoubleType.get()), - Types.NestedField.optional(2, "long", Types.DoubleType.get()))) + table + .updateSchema() + .addColumn( + "location", + Types.StructType.of( + Types.NestedField.optional(1, "lat", Types.DoubleType.get()), + Types.NestedField.optional(2, "long", Types.DoubleType.get()))) .commit(); String updatedJson = table.properties().get(TableProperties.DEFAULT_NAME_MAPPING); @@ -95,17 +93,26 @@ public void testAddStructColumn() { MappedField newMapping = updated.find("location"); Assert.assertNotNull("Mapping for new column should be added", newMapping); - Assert.assertEquals("Mapping should use the assigned field ID", - (Integer) table.schema().findField("location").fieldId(), updated.find("location").id()); - Assert.assertNotNull("Should contain a nested mapping", updated.find("location").nestedMapping()); + Assert.assertEquals( + 
"Mapping should use the assigned field ID", + (Integer) table.schema().findField("location").fieldId(), + updated.find("location").id()); + Assert.assertNotNull( + "Should contain a nested mapping", updated.find("location").nestedMapping()); - Assert.assertEquals("Mapping should use the assigned field ID", - (Integer) table.schema().findField("location.lat").fieldId(), updated.find("location.lat").id()); - Assert.assertNull("Should not contain a nested mapping", updated.find("location.lat").nestedMapping()); + Assert.assertEquals( + "Mapping should use the assigned field ID", + (Integer) table.schema().findField("location.lat").fieldId(), + updated.find("location.lat").id()); + Assert.assertNull( + "Should not contain a nested mapping", updated.find("location.lat").nestedMapping()); - Assert.assertEquals("Mapping should use the assigned field ID", - (Integer) table.schema().findField("location.long").fieldId(), updated.find("location.long").id()); - Assert.assertNull("Should not contain a nested mapping", updated.find("location.long").nestedMapping()); + Assert.assertEquals( + "Mapping should use the assigned field ID", + (Integer) table.schema().findField("location.long").fieldId(), + updated.find("location.long").id()); + Assert.assertNull( + "Should not contain a nested mapping", updated.find("location.long").nestedMapping()); } @Test @@ -113,25 +120,23 @@ public void testRenameColumn() { NameMapping mapping = MappingUtil.create(table.schema()); String mappingJson = NameMappingParser.toJson(mapping); - table.updateProperties() - .set(TableProperties.DEFAULT_NAME_MAPPING, mappingJson) - .commit(); + table.updateProperties().set(TableProperties.DEFAULT_NAME_MAPPING, mappingJson).commit(); - table.updateSchema() - .renameColumn("id", "object_id") - .commit(); + table.updateSchema().renameColumn("id", "object_id").commit(); String updatedJson = table.properties().get(TableProperties.DEFAULT_NAME_MAPPING); NameMapping updated = NameMappingParser.fromJson(updatedJson); int idColumnId = table.schema().findField("object_id").fieldId(); validateUnchanged( - Iterables.filter(mapping.asMappedFields().fields(), field -> !Objects.equals(idColumnId, field.id())), + Iterables.filter( + mapping.asMappedFields().fields(), field -> !Objects.equals(idColumnId, field.id())), updated); MappedField updatedMapping = updated.find(idColumnId); Assert.assertNotNull("Mapping for id column should exist", updatedMapping); - Assert.assertEquals("Should add the new column name to the existing mapping", + Assert.assertEquals( + "Should add the new column name to the existing mapping", MappedField.of(idColumnId, ImmutableList.of("id", "object_id")), updatedMapping); } @@ -141,13 +146,9 @@ public void testDeleteColumn() { NameMapping mapping = MappingUtil.create(table.schema()); String mappingJson = NameMappingParser.toJson(mapping); - table.updateProperties() - .set(TableProperties.DEFAULT_NAME_MAPPING, mappingJson) - .commit(); + table.updateProperties().set(TableProperties.DEFAULT_NAME_MAPPING, mappingJson).commit(); - table.updateSchema() - .deleteColumn("id") - .commit(); + table.updateSchema().deleteColumn("id").commit(); String updatedJson = table.properties().get(TableProperties.DEFAULT_NAME_MAPPING); NameMapping updated = NameMappingParser.fromJson(updatedJson); @@ -161,7 +162,8 @@ public void testModificationWithMetricsMetrics() { NameMapping mapping = MappingUtil.create(table.schema()); String mappingJson = NameMappingParser.toJson(mapping); - table.updateProperties() + table + .updateProperties() 
.set(TableProperties.DEFAULT_NAME_MAPPING, mappingJson) .set("write.metadata.metrics.column.id", "full") .commit(); @@ -171,10 +173,11 @@ public void testModificationWithMetricsMetrics() { ValidationException.class, null, () -> - table.updateProperties() - .set(TableProperties.DEFAULT_NAME_MAPPING, mappingJson) - .set("write.metadata.metrics.column.ids", "full") - .commit()); + table + .updateProperties() + .set(TableProperties.DEFAULT_NAME_MAPPING, mappingJson) + .set("write.metadata.metrics.column.ids", "full") + .commit()); // Re-naming a column with metrics succeeds; table.updateSchema().renameColumn("id", "bloop").commit(); @@ -201,20 +204,14 @@ public void testDeleteAndAddColumnReassign() { NameMapping mapping = MappingUtil.create(table.schema()); String mappingJson = NameMappingParser.toJson(mapping); - table.updateProperties() - .set(TableProperties.DEFAULT_NAME_MAPPING, mappingJson) - .commit(); + table.updateProperties().set(TableProperties.DEFAULT_NAME_MAPPING, mappingJson).commit(); int startIdColumnId = table.schema().findField("id").fieldId(); // the original field ID - table.updateSchema() - .deleteColumn("id") - .commit(); + table.updateSchema().deleteColumn("id").commit(); // add the same column name back to the table with a different field ID - table.updateSchema() - .addColumn("id", Types.StringType.get()) - .commit(); + table.updateSchema().addColumn("id", Types.StringType.get()).commit(); String updatedJson = table.properties().get(TableProperties.DEFAULT_NAME_MAPPING); NameMapping updated = NameMappingParser.fromJson(updatedJson); @@ -222,17 +219,20 @@ public void testDeleteAndAddColumnReassign() { int idColumnId = table.schema().findField("id").fieldId(); // the new field ID Set changedIds = Sets.newHashSet(startIdColumnId, idColumnId); validateUnchanged( - Iterables.filter(mapping.asMappedFields().fields(), field -> !changedIds.contains(field.id())), + Iterables.filter( + mapping.asMappedFields().fields(), field -> !changedIds.contains(field.id())), updated); MappedField newMapping = updated.find("id"); Assert.assertNotNull("Mapping for id column should exist", newMapping); - Assert.assertEquals("Mapping should use the new field ID", (Integer) idColumnId, newMapping.id()); + Assert.assertEquals( + "Mapping should use the new field ID", (Integer) idColumnId, newMapping.id()); Assert.assertNull("Should not contain a nested mapping", newMapping.nestedMapping()); MappedField updatedMapping = updated.find(startIdColumnId); Assert.assertNotNull("Mapping for original id column should exist", updatedMapping); - Assert.assertEquals("Mapping should use the original field ID", (Integer) startIdColumnId, updatedMapping.id()); + Assert.assertEquals( + "Mapping should use the original field ID", (Integer) startIdColumnId, updatedMapping.id()); Assert.assertFalse("Should not use id as a name", updatedMapping.names().contains("id")); Assert.assertNull("Should not contain a nested mapping", updatedMapping.nestedMapping()); } @@ -242,20 +242,14 @@ public void testDeleteAndRenameColumnReassign() { NameMapping mapping = MappingUtil.create(table.schema()); String mappingJson = NameMappingParser.toJson(mapping); - table.updateProperties() - .set(TableProperties.DEFAULT_NAME_MAPPING, mappingJson) - .commit(); + table.updateProperties().set(TableProperties.DEFAULT_NAME_MAPPING, mappingJson).commit(); int startIdColumnId = table.schema().findField("id").fieldId(); // the original field ID - table.updateSchema() - .deleteColumn("id") - .commit(); + 
table.updateSchema().deleteColumn("id").commit(); // rename the data column to id - table.updateSchema() - .renameColumn("data", "id") - .commit(); + table.updateSchema().renameColumn("data", "id").commit(); String updatedJson = table.properties().get(TableProperties.DEFAULT_NAME_MAPPING); NameMapping updated = NameMappingParser.fromJson(updatedJson); @@ -263,18 +257,22 @@ public void testDeleteAndRenameColumnReassign() { int idColumnId = table.schema().findField("id").fieldId(); // the new field ID Set changedIds = Sets.newHashSet(startIdColumnId, idColumnId); validateUnchanged( - Iterables.filter(mapping.asMappedFields().fields(), field -> !changedIds.contains(field.id())), + Iterables.filter( + mapping.asMappedFields().fields(), field -> !changedIds.contains(field.id())), updated); MappedField newMapping = updated.find("id"); Assert.assertNotNull("Mapping for id column should exist", newMapping); - Assert.assertEquals("Mapping should use the new field ID", (Integer) idColumnId, newMapping.id()); - Assert.assertEquals("Should have both names", Sets.newHashSet("id", "data"), newMapping.names()); + Assert.assertEquals( + "Mapping should use the new field ID", (Integer) idColumnId, newMapping.id()); + Assert.assertEquals( + "Should have both names", Sets.newHashSet("id", "data"), newMapping.names()); Assert.assertNull("Should not contain a nested mapping", newMapping.nestedMapping()); MappedField updatedMapping = updated.find(startIdColumnId); Assert.assertNotNull("Mapping for original id column should exist", updatedMapping); - Assert.assertEquals("Mapping should use the original field ID", (Integer) startIdColumnId, updatedMapping.id()); + Assert.assertEquals( + "Mapping should use the original field ID", (Integer) startIdColumnId, updatedMapping.id()); Assert.assertFalse("Should not use id as a name", updatedMapping.names().contains("id")); Assert.assertNull("Should not contain a nested mapping", updatedMapping.nestedMapping()); } @@ -284,23 +282,23 @@ public void testRenameAndAddColumnReassign() { NameMapping mapping = MappingUtil.create(table.schema()); String mappingJson = NameMappingParser.toJson(mapping); - table.updateProperties() - .set(TableProperties.DEFAULT_NAME_MAPPING, mappingJson) - .commit(); + table.updateProperties().set(TableProperties.DEFAULT_NAME_MAPPING, mappingJson).commit(); int startIdColumnId = table.schema().findField("id").fieldId(); // the original field ID - table.updateSchema() - .renameColumn("id", "object_id") - .commit(); + table.updateSchema().renameColumn("id", "object_id").commit(); - NameMapping afterRename = NameMappingParser.fromJson(table.properties().get(TableProperties.DEFAULT_NAME_MAPPING)); - Assert.assertEquals("Renamed column should have both names", - Sets.newHashSet("id", "object_id"), afterRename.find(startIdColumnId).names()); + NameMapping afterRename = + NameMappingParser.fromJson(table.properties().get(TableProperties.DEFAULT_NAME_MAPPING)); + Assert.assertEquals( + "Renamed column should have both names", + Sets.newHashSet("id", "object_id"), + afterRename.find(startIdColumnId).names()); // add a new column with the renamed column's old name // also, rename the original column again to ensure its names are handled correctly - table.updateSchema() + table + .updateSchema() .renameColumn("object_id", "oid") .addColumn("id", Types.StringType.get()) .commit(); @@ -311,18 +309,22 @@ public void testRenameAndAddColumnReassign() { int idColumnId = table.schema().findField("id").fieldId(); // the new field ID Set changedIds = 
Sets.newHashSet(startIdColumnId, idColumnId); validateUnchanged( - Iterables.filter(afterRename.asMappedFields().fields(), field -> !changedIds.contains(field.id())), + Iterables.filter( + afterRename.asMappedFields().fields(), field -> !changedIds.contains(field.id())), updated); MappedField newMapping = updated.find("id"); Assert.assertNotNull("Mapping for id column should exist", newMapping); - Assert.assertEquals("Mapping should use the new field ID", (Integer) idColumnId, newMapping.id()); + Assert.assertEquals( + "Mapping should use the new field ID", (Integer) idColumnId, newMapping.id()); Assert.assertNull("Should not contain a nested mapping", newMapping.nestedMapping()); MappedField updatedMapping = updated.find(startIdColumnId); Assert.assertNotNull("Mapping for original id column should exist", updatedMapping); - Assert.assertEquals("Mapping should use the original field ID", (Integer) startIdColumnId, updatedMapping.id()); - Assert.assertEquals("Should not use id as a name", Sets.newHashSet("object_id", "oid"), updatedMapping.names()); + Assert.assertEquals( + "Mapping should use the original field ID", (Integer) startIdColumnId, updatedMapping.id()); + Assert.assertEquals( + "Should not use id as a name", Sets.newHashSet("object_id", "oid"), updatedMapping.names()); Assert.assertNull("Should not contain a nested mapping", updatedMapping.nestedMapping()); } @@ -331,26 +333,22 @@ public void testRenameAndRenameColumnReassign() { NameMapping mapping = MappingUtil.create(table.schema()); String mappingJson = NameMappingParser.toJson(mapping); - table.updateProperties() - .set(TableProperties.DEFAULT_NAME_MAPPING, mappingJson) - .commit(); + table.updateProperties().set(TableProperties.DEFAULT_NAME_MAPPING, mappingJson).commit(); int startIdColumnId = table.schema().findField("id").fieldId(); // the original field ID - table.updateSchema() - .renameColumn("id", "object_id") - .commit(); + table.updateSchema().renameColumn("id", "object_id").commit(); - NameMapping afterRename = NameMappingParser.fromJson(table.properties().get(TableProperties.DEFAULT_NAME_MAPPING)); - Assert.assertEquals("Renamed column should have both names", - Sets.newHashSet("id", "object_id"), afterRename.find(startIdColumnId).names()); + NameMapping afterRename = + NameMappingParser.fromJson(table.properties().get(TableProperties.DEFAULT_NAME_MAPPING)); + Assert.assertEquals( + "Renamed column should have both names", + Sets.newHashSet("id", "object_id"), + afterRename.find(startIdColumnId).names()); // rename the data column to the renamed column's old name // also, rename the original column again to ensure its names are handled correctly - table.updateSchema() - .renameColumn("object_id", "oid") - .renameColumn("data", "id") - .commit(); + table.updateSchema().renameColumn("object_id", "oid").renameColumn("data", "id").commit(); String updatedJson = table.properties().get(TableProperties.DEFAULT_NAME_MAPPING); NameMapping updated = NameMappingParser.fromJson(updatedJson); @@ -358,40 +356,42 @@ public void testRenameAndRenameColumnReassign() { int idColumnId = table.schema().findField("id").fieldId(); // the new field ID Set changedIds = Sets.newHashSet(startIdColumnId, idColumnId); validateUnchanged( - Iterables.filter(afterRename.asMappedFields().fields(), field -> !changedIds.contains(field.id())), + Iterables.filter( + afterRename.asMappedFields().fields(), field -> !changedIds.contains(field.id())), updated); MappedField newMapping = updated.find("id"); Assert.assertNotNull("Mapping for id column 
should exist", newMapping); - Assert.assertEquals("Renamed column should have both names", - Sets.newHashSet("id", "data"), newMapping.names()); - Assert.assertEquals("Mapping should use the new field ID", (Integer) idColumnId, newMapping.id()); + Assert.assertEquals( + "Renamed column should have both names", Sets.newHashSet("id", "data"), newMapping.names()); + Assert.assertEquals( + "Mapping should use the new field ID", (Integer) idColumnId, newMapping.id()); Assert.assertNull("Should not contain a nested mapping", newMapping.nestedMapping()); MappedField updatedMapping = updated.find(startIdColumnId); Assert.assertNotNull("Mapping for original id column should exist", updatedMapping); - Assert.assertEquals("Mapping should use the original field ID", (Integer) startIdColumnId, updatedMapping.id()); - Assert.assertEquals("Should not use id as a name", Sets.newHashSet("object_id", "oid"), updatedMapping.names()); + Assert.assertEquals( + "Mapping should use the original field ID", (Integer) startIdColumnId, updatedMapping.id()); + Assert.assertEquals( + "Should not use id as a name", Sets.newHashSet("object_id", "oid"), updatedMapping.names()); Assert.assertNull("Should not contain a nested mapping", updatedMapping.nestedMapping()); } - /** - * Asserts that the fields in the original mapping are unchanged in the updated mapping. - */ + /** Asserts that the fields in the original mapping are unchanged in the updated mapping. */ private void validateUnchanged(NameMapping original, NameMapping updated) { MappedFields updatedFields = updated.asMappedFields(); for (MappedField field : original.asMappedFields().fields()) { - Assert.assertEquals("Existing fields should not change", field, updatedFields.field(field.id())); + Assert.assertEquals( + "Existing fields should not change", field, updatedFields.field(field.id())); } } - /** - * Asserts that the fields in the original mapping are unchanged in the updated mapping. - */ + /** Asserts that the fields in the original mapping are unchanged in the updated mapping. */ private void validateUnchanged(Iterable fields, NameMapping updated) { MappedFields updatedFields = updated.asMappedFields(); for (MappedField field : fields) { - Assert.assertEquals("Existing fields should not change", field, updatedFields.field(field.id())); + Assert.assertEquals( + "Existing fields should not change", field, updatedFields.field(field.id())); } } } diff --git a/core/src/test/java/org/apache/iceberg/TestSchemaID.java b/core/src/test/java/org/apache/iceberg/TestSchemaID.java index f6c49798619d..f27fd92fa3c1 100644 --- a/core/src/test/java/org/apache/iceberg/TestSchemaID.java +++ b/core/src/test/java/org/apache/iceberg/TestSchemaID.java @@ -16,9 +16,11 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg; +import static org.apache.iceberg.types.Types.NestedField.optional; +import static org.apache.iceberg.types.Types.NestedField.required; + import java.util.Arrays; import java.util.Map; import java.util.function.Function; @@ -31,15 +33,12 @@ import org.junit.runner.RunWith; import org.junit.runners.Parameterized; -import static org.apache.iceberg.types.Types.NestedField.optional; -import static org.apache.iceberg.types.Types.NestedField.required; - @RunWith(Parameterized.class) public class TestSchemaID extends TableTestBase { @Parameterized.Parameters(name = "formatVersion = {0}") public static Object[] parameters() { - return new Object[] { 1, 2 }; + return new Object[] {1, 2}; } public TestSchemaID(int formatVersion) { @@ -55,10 +54,13 @@ public void testNoChange() { table.newAppend().appendFile(FILE_A).appendFile(FILE_B).commit(); TestHelpers.assertSameSchemaMap(onlySchemaMap, table.schemas()); - Assert.assertEquals("Current snapshot's schemaId should be the current", - table.schema().schemaId(), (int) table.currentSnapshot().schemaId()); + Assert.assertEquals( + "Current snapshot's schemaId should be the current", + table.schema().schemaId(), + (int) table.currentSnapshot().schemaId()); - Assert.assertEquals("Schema ids should be correct in snapshots", + Assert.assertEquals( + "Schema ids should be correct in snapshots", ImmutableList.of(onlyId), Lists.transform(Lists.newArrayList(table.snapshots()), Snapshot::schemaId)); @@ -66,10 +68,13 @@ public void testNoChange() { table.newDelete().deleteFile(FILE_A).commit(); TestHelpers.assertSameSchemaMap(onlySchemaMap, table.schemas()); - Assert.assertEquals("Current snapshot's schemaId should be the current", - table.schema().schemaId(), (int) table.currentSnapshot().schemaId()); + Assert.assertEquals( + "Current snapshot's schemaId should be the current", + table.schema().schemaId(), + (int) table.currentSnapshot().schemaId()); - Assert.assertEquals("Schema ids should be correct in snapshots", + Assert.assertEquals( + "Schema ids should be correct in snapshots", ImmutableList.of(onlyId, onlyId), Lists.transform(Lists.newArrayList(table.snapshots()), Snapshot::schemaId)); @@ -77,10 +82,13 @@ public void testNoChange() { table.newFastAppend().appendFile(FILE_A2).commit(); TestHelpers.assertSameSchemaMap(onlySchemaMap, table.schemas()); - Assert.assertEquals("Current snapshot's schemaId should be the current", - table.schema().schemaId(), (int) table.currentSnapshot().schemaId()); + Assert.assertEquals( + "Current snapshot's schemaId should be the current", + table.schema().schemaId(), + (int) table.currentSnapshot().schemaId()); - Assert.assertEquals("Schema ids should be correct in snapshots", + Assert.assertEquals( + "Schema ids should be correct in snapshots", ImmutableList.of(onlyId, onlyId, onlyId), Lists.transform(Lists.newArrayList(table.snapshots()), Snapshot::schemaId)); } @@ -93,28 +101,36 @@ public void testSchemaIdChangeInSchemaUpdate() { table.newAppend().appendFile(FILE_A).appendFile(FILE_B).commit(); TestHelpers.assertSameSchemaMap(schemaMap(table.schema()), table.schemas()); - Assert.assertEquals("Current snapshot's schemaId should be the current", - table.schema().schemaId(), (int) table.currentSnapshot().schemaId()); + Assert.assertEquals( + "Current snapshot's schemaId should be the current", + table.schema().schemaId(), + (int) table.currentSnapshot().schemaId()); - Assert.assertEquals("Schema ids should be correct in snapshots", + Assert.assertEquals( + "Schema ids should be correct in 
snapshots", ImmutableList.of(originalSchema.schemaId()), Lists.transform(Lists.newArrayList(table.snapshots()), Snapshot::schemaId)); // update schema table.updateSchema().addColumn("data2", Types.StringType.get()).commit(); - Schema updatedSchema = new Schema(1, - required(1, "id", Types.IntegerType.get()), - required(2, "data", Types.StringType.get()), - optional(3, "data2", Types.StringType.get()) - ); + Schema updatedSchema = + new Schema( + 1, + required(1, "id", Types.IntegerType.get()), + required(2, "data", Types.StringType.get()), + optional(3, "data2", Types.StringType.get())); TestHelpers.assertSameSchemaMap(schemaMap(originalSchema, updatedSchema), table.schemas()); - Assert.assertEquals("Current snapshot's schemaId should be old since update schema doesn't create new snapshot", - originalSchema.schemaId(), (int) table.currentSnapshot().schemaId()); - Assert.assertEquals("Current schema should match", updatedSchema.asStruct(), table.schema().asStruct()); - - Assert.assertEquals("Schema ids should be correct in snapshots", + Assert.assertEquals( + "Current snapshot's schemaId should be old since update schema doesn't create new snapshot", + originalSchema.schemaId(), + (int) table.currentSnapshot().schemaId()); + Assert.assertEquals( + "Current schema should match", updatedSchema.asStruct(), table.schema().asStruct()); + + Assert.assertEquals( + "Schema ids should be correct in snapshots", ImmutableList.of(originalSchema.schemaId()), Lists.transform(Lists.newArrayList(table.snapshots()), Snapshot::schemaId)); @@ -122,11 +138,15 @@ public void testSchemaIdChangeInSchemaUpdate() { table.newDelete().deleteFile(FILE_A).commit(); TestHelpers.assertSameSchemaMap(schemaMap(originalSchema, updatedSchema), table.schemas()); - Assert.assertEquals("Current snapshot's schemaId should be the current", - updatedSchema.schemaId(), (int) table.currentSnapshot().schemaId()); - Assert.assertEquals("Current schema should match", updatedSchema.asStruct(), table.schema().asStruct()); - - Assert.assertEquals("Schema ids should be correct in snapshots", + Assert.assertEquals( + "Current snapshot's schemaId should be the current", + updatedSchema.schemaId(), + (int) table.currentSnapshot().schemaId()); + Assert.assertEquals( + "Current schema should match", updatedSchema.asStruct(), table.schema().asStruct()); + + Assert.assertEquals( + "Schema ids should be correct in snapshots", ImmutableList.of(originalSchema.schemaId(), updatedSchema.schemaId()), Lists.transform(Lists.newArrayList(table.snapshots()), Snapshot::schemaId)); @@ -134,12 +154,17 @@ public void testSchemaIdChangeInSchemaUpdate() { table.newAppend().appendFile(FILE_A2).commit(); TestHelpers.assertSameSchemaMap(schemaMap(originalSchema, updatedSchema), table.schemas()); - Assert.assertEquals("Current snapshot's schemaId should be the current", - updatedSchema.schemaId(), (int) table.currentSnapshot().schemaId()); - Assert.assertEquals("Current schema should match", updatedSchema.asStruct(), table.schema().asStruct()); - - Assert.assertEquals("Schema ids should be correct in snapshots", - ImmutableList.of(originalSchema.schemaId(), updatedSchema.schemaId(), updatedSchema.schemaId()), + Assert.assertEquals( + "Current snapshot's schemaId should be the current", + updatedSchema.schemaId(), + (int) table.currentSnapshot().schemaId()); + Assert.assertEquals( + "Current schema should match", updatedSchema.asStruct(), table.schema().asStruct()); + + Assert.assertEquals( + "Schema ids should be correct in snapshots", + ImmutableList.of( + 
originalSchema.schemaId(), updatedSchema.schemaId(), updatedSchema.schemaId()), Lists.transform(Lists.newArrayList(table.snapshots()), Snapshot::schemaId)); } diff --git a/core/src/test/java/org/apache/iceberg/TestSchemaUnionByFieldName.java b/core/src/test/java/org/apache/iceberg/TestSchemaUnionByFieldName.java index fd7875c5312d..3f19fd734354 100644 --- a/core/src/test/java/org/apache/iceberg/TestSchemaUnionByFieldName.java +++ b/core/src/test/java/org/apache/iceberg/TestSchemaUnionByFieldName.java @@ -16,9 +16,11 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg; +import static org.apache.iceberg.types.Types.NestedField.optional; +import static org.apache.iceberg.types.Types.NestedField.required; + import java.util.List; import java.util.concurrent.atomic.AtomicInteger; import org.apache.iceberg.relocated.com.google.common.collect.Lists; @@ -45,16 +47,13 @@ import org.junit.Test; import org.junit.rules.ExpectedException; -import static org.apache.iceberg.types.Types.NestedField.optional; -import static org.apache.iceberg.types.Types.NestedField.required; - public class TestSchemaUnionByFieldName { - @Rule - public ExpectedException thrown = ExpectedException.none(); + @Rule public ExpectedException thrown = ExpectedException.none(); private static List primitiveTypes() { - return Lists.newArrayList(StringType.get(), + return Lists.newArrayList( + StringType.get(), TimeType.get(), TimestampType.withoutZone(), TimestampType.withZone(), @@ -67,15 +66,20 @@ private static List primitiveTypes() { FixedType.ofLength(10), DecimalType.of(10, 2), LongType.get(), - FloatType.get() - ); + FloatType.get()); } - private static NestedField[] primitiveFields(Integer initialValue, List primitiveTypes) { + private static NestedField[] primitiveFields( + Integer initialValue, List primitiveTypes) { AtomicInteger atomicInteger = new AtomicInteger(initialValue); return primitiveTypes.stream() - .map(type -> optional(atomicInteger.incrementAndGet(), type.toString(), - Types.fromPrimitiveString(type.toString()))).toArray(NestedField[]::new); + .map( + type -> + optional( + atomicInteger.incrementAndGet(), + type.toString(), + Types.fromPrimitiveString(type.toString()))) + .toArray(NestedField[]::new); } @Test @@ -88,7 +92,8 @@ public void testAddTopLevelPrimitives() { @Test public void testAddTopLevelListOfPrimitives() { for (PrimitiveType primitiveType : primitiveTypes()) { - Schema newSchema = new Schema(optional(1, "aList", Types.ListType.ofOptional(2, primitiveType))); + Schema newSchema = + new Schema(optional(1, "aList", Types.ListType.ofOptional(2, primitiveType))); Schema applied = new SchemaUpdate(new Schema(), 0).unionByNameWith(newSchema).apply(); Assert.assertEquals(newSchema.asStruct(), applied.asStruct()); } @@ -97,7 +102,9 @@ public void testAddTopLevelListOfPrimitives() { @Test public void testAddTopLevelMapOfPrimitives() { for (PrimitiveType primitiveType : primitiveTypes()) { - Schema newSchema = new Schema(optional(1, "aMap", Types.MapType.ofOptional(2, 3, primitiveType, primitiveType))); + Schema newSchema = + new Schema( + optional(1, "aMap", Types.MapType.ofOptional(2, 3, primitiveType, primitiveType))); Schema applied = new SchemaUpdate(new Schema(), 0).unionByNameWith(newSchema).apply(); Assert.assertEquals(newSchema.asStruct(), applied.asStruct()); } @@ -106,8 +113,9 @@ public void testAddTopLevelMapOfPrimitives() { @Test public void testAddTopLevelStructOfPrimitives() { for (PrimitiveType primitiveType : 
primitiveTypes()) { - Schema currentSchema = new Schema(optional(1, "aStruct", Types.StructType.of( - optional(2, "primitive", primitiveType)))); + Schema currentSchema = + new Schema( + optional(1, "aStruct", Types.StructType.of(optional(2, "primitive", primitiveType)))); Schema applied = new SchemaUpdate(new Schema(), 0).unionByNameWith(currentSchema).apply(); Assert.assertEquals(currentSchema.asStruct(), applied.asStruct()); } @@ -117,8 +125,9 @@ public void testAddTopLevelStructOfPrimitives() { public void testAddNestedPrimitive() { for (PrimitiveType primitiveType : primitiveTypes()) { Schema currentSchema = new Schema(optional(1, "aStruct", Types.StructType.of())); - Schema newSchema = new Schema(optional(1, "aStruct", Types.StructType.of( - optional(2, "primitive", primitiveType)))); + Schema newSchema = + new Schema( + optional(1, "aStruct", Types.StructType.of(optional(2, "primitive", primitiveType)))); Schema applied = new SchemaUpdate(currentSchema, 1).unionByNameWith(newSchema).apply(); Assert.assertEquals(newSchema.asStruct(), applied.asStruct()); } @@ -127,51 +136,107 @@ public void testAddNestedPrimitive() { @Test public void testAddNestedPrimitives() { Schema currentSchema = new Schema(optional(1, "aStruct", Types.StructType.of())); - Schema newSchema = new Schema(optional(1, "aStruct", Types.StructType.of( - primitiveFields(1, primitiveTypes())))); + Schema newSchema = + new Schema( + optional(1, "aStruct", Types.StructType.of(primitiveFields(1, primitiveTypes())))); Schema applied = new SchemaUpdate(currentSchema, 1).unionByNameWith(newSchema).apply(); Assert.assertEquals(newSchema.asStruct(), applied.asStruct()); } @Test public void testAddNestedLists() { - Schema newSchema = new Schema(optional(1, "aList", - Types.ListType.ofOptional(2, - Types.ListType.ofOptional(3, - Types.ListType.ofOptional(4, - Types.ListType.ofOptional(5, - Types.ListType.ofOptional(6, - Types.ListType.ofOptional(7, - Types.ListType.ofOptional(8, - Types.ListType.ofOptional(9, - Types.ListType.ofOptional(10, - DecimalType.of(11, 20)))))))))))); + Schema newSchema = + new Schema( + optional( + 1, + "aList", + Types.ListType.ofOptional( + 2, + Types.ListType.ofOptional( + 3, + Types.ListType.ofOptional( + 4, + Types.ListType.ofOptional( + 5, + Types.ListType.ofOptional( + 6, + Types.ListType.ofOptional( + 7, + Types.ListType.ofOptional( + 8, + Types.ListType.ofOptional( + 9, + Types.ListType.ofOptional( + 10, DecimalType.of(11, 20)))))))))))); Schema applied = new SchemaUpdate(new Schema(), 0).unionByNameWith(newSchema).apply(); Assert.assertEquals(newSchema.asStruct(), applied.asStruct()); } @Test public void testAddNestedStruct() { - Schema newSchema = new Schema(optional(1, "struct1", Types.StructType.of( - optional(2, "struct2", Types.StructType.of( - optional(3, "struct3", Types.StructType.of( - optional(4, "struct4", Types.StructType.of( - optional(5, "struct5", Types.StructType.of( - optional(6, "struct6", Types.StructType.of( - optional(7, "aString", StringType.get())))))))))))))); + Schema newSchema = + new Schema( + optional( + 1, + "struct1", + Types.StructType.of( + optional( + 2, + "struct2", + Types.StructType.of( + optional( + 3, + "struct3", + Types.StructType.of( + optional( + 4, + "struct4", + Types.StructType.of( + optional( + 5, + "struct5", + Types.StructType.of( + optional( + 6, + "struct6", + Types.StructType.of( + optional( + 7, + "aString", + StringType.get())))))))))))))); Schema applied = new SchemaUpdate(new Schema(), 0).unionByNameWith(newSchema).apply(); 
Assert.assertEquals(newSchema.asStruct(), applied.asStruct()); } @Test public void testAddNestedMaps() { - Schema newSchema = new Schema(optional(1, "struct", Types.MapType.ofOptional( - 2, 3, StringType.get(), Types.MapType.ofOptional( - 4, 5, StringType.get(), Types.MapType.ofOptional( - 6, 7, StringType.get(), Types.MapType.ofOptional( - 8, 9, StringType.get(), Types.MapType.ofOptional( - 10, 11, StringType.get(), Types.MapType.ofOptional( - 12, 13, StringType.get(), StringType.get())))))))); + Schema newSchema = + new Schema( + optional( + 1, + "struct", + Types.MapType.ofOptional( + 2, + 3, + StringType.get(), + Types.MapType.ofOptional( + 4, + 5, + StringType.get(), + Types.MapType.ofOptional( + 6, + 7, + StringType.get(), + Types.MapType.ofOptional( + 8, + 9, + StringType.get(), + Types.MapType.ofOptional( + 10, + 11, + StringType.get(), + Types.MapType.ofOptional( + 12, 13, StringType.get(), StringType.get())))))))); Schema applied = new SchemaUpdate(new Schema(), 0).unionByNameWith(newSchema).apply(); Assert.assertEquals(newSchema.asStruct(), applied.asStruct()); } @@ -181,10 +246,10 @@ public void testDetectInvalidTopLevelList() { thrown.expect(IllegalArgumentException.class); thrown.expectMessage("Cannot change column type: aList.element: string -> long"); - Schema currentSchema = new Schema(optional(1, "aList", - Types.ListType.ofOptional(2, StringType.get()))); - Schema newSchema = new Schema(optional(1, "aList", - Types.ListType.ofOptional(2, LongType.get()))); + Schema currentSchema = + new Schema(optional(1, "aList", Types.ListType.ofOptional(2, StringType.get()))); + Schema newSchema = + new Schema(optional(1, "aList", Types.ListType.ofOptional(2, LongType.get()))); new SchemaUpdate(currentSchema, 2).unionByNameWith(newSchema).apply(); } @@ -193,10 +258,13 @@ public void testDetectInvalidTopLevelMapValue() { thrown.expect(IllegalArgumentException.class); thrown.expectMessage("Cannot change column type: aMap.value: string -> long"); - Schema currentSchema = new Schema(optional(1, "aMap", - Types.MapType.ofOptional(2, 3, StringType.get(), StringType.get()))); - Schema newSchema = new Schema(optional(1, "aMap", - Types.MapType.ofOptional(2, 3, StringType.get(), LongType.get()))); + Schema currentSchema = + new Schema( + optional( + 1, "aMap", Types.MapType.ofOptional(2, 3, StringType.get(), StringType.get()))); + Schema newSchema = + new Schema( + optional(1, "aMap", Types.MapType.ofOptional(2, 3, StringType.get(), LongType.get()))); Schema apply = new SchemaUpdate(currentSchema, 3).unionByNameWith(newSchema).apply(); } @@ -205,10 +273,13 @@ public void testDetectInvalidTopLevelMapKey() { thrown.expect(IllegalArgumentException.class); thrown.expectMessage("Cannot change column type: aMap.key: string -> uuid"); - Schema currentSchema = new Schema(optional(1, "aMap", - Types.MapType.ofOptional(2, 3, StringType.get(), StringType.get()))); - Schema newSchema = new Schema(optional(1, "aMap", - Types.MapType.ofOptional(2, 3, UUIDType.get(), StringType.get()))); + Schema currentSchema = + new Schema( + optional( + 1, "aMap", Types.MapType.ofOptional(2, 3, StringType.get(), StringType.get()))); + Schema newSchema = + new Schema( + optional(1, "aMap", Types.MapType.ofOptional(2, 3, UUIDType.get(), StringType.get()))); new SchemaUpdate(currentSchema, 3).unionByNameWith(newSchema).apply(); } @@ -250,7 +321,8 @@ public void testInvalidTypePromoteDoubleToFloat() { } @Test - // decimal(P,S) Fixed-point decimal; precision P, scale S -> Scale is fixed [1], precision must be 38 or less + // 
decimal(P,S) Fixed-point decimal; precision P, scale S -> Scale is fixed [1], precision must be + // 38 or less public void testTypePromoteDecimalToFixedScaleWithWiderPrecision() { Schema currentSchema = new Schema(required(1, "aCol", DecimalType.of(20, 1))); Schema newSchema = new Schema(required(1, "aCol", DecimalType.of(22, 1))); @@ -261,29 +333,61 @@ public void testTypePromoteDecimalToFixedScaleWithWiderPrecision() { @Test public void testAddPrimitiveToNestedStruct() { - Schema schema = new Schema( - required(1, "struct1", Types.StructType.of( - optional(2, "struct2", Types.StructType.of( - optional(3, "list", Types.ListType.ofOptional( - 4, Types.StructType.of( - optional(5, "value", StringType.get()))))))))); - - Schema newSchema = new Schema( - required(1, "struct1", Types.StructType.of( - optional(2, "struct2", Types.StructType.of( - optional(3, "list", Types.ListType.ofOptional( - 4, Types.StructType.of( - optional(5, "time", TimeType.get()))))))))); + Schema schema = + new Schema( + required( + 1, + "struct1", + Types.StructType.of( + optional( + 2, + "struct2", + Types.StructType.of( + optional( + 3, + "list", + Types.ListType.ofOptional( + 4, + Types.StructType.of( + optional(5, "value", StringType.get()))))))))); + + Schema newSchema = + new Schema( + required( + 1, + "struct1", + Types.StructType.of( + optional( + 2, + "struct2", + Types.StructType.of( + optional( + 3, + "list", + Types.ListType.ofOptional( + 4, + Types.StructType.of(optional(5, "time", TimeType.get()))))))))); Schema applied = new SchemaUpdate(schema, 5).unionByNameWith(newSchema).apply(); - Schema expected = new Schema( - required(1, "struct1", Types.StructType.of( - optional(2, "struct2", Types.StructType.of( - optional(3, "list", Types.ListType.ofOptional( - 4, Types.StructType.of( - optional(5, "value", StringType.get()), - optional(6, "time", TimeType.get()))))))))); + Schema expected = + new Schema( + required( + 1, + "struct1", + Types.StructType.of( + optional( + 2, + "struct2", + Types.StructType.of( + optional( + 3, + "list", + Types.ListType.ofOptional( + 4, + Types.StructType.of( + optional(5, "value", StringType.get()), + optional(6, "time", TimeType.get()))))))))); Assert.assertEquals(expected.asStruct(), applied.asStruct()); } @@ -293,15 +397,17 @@ public void testReplaceListWithPrimitive() { thrown.expect(IllegalArgumentException.class); thrown.expectMessage("Cannot change column type: aColumn: list -> string"); - Schema currentSchema = new Schema(optional(1, "aColumn", - Types.ListType.ofOptional(2, StringType.get()))); + Schema currentSchema = + new Schema(optional(1, "aColumn", Types.ListType.ofOptional(2, StringType.get()))); Schema newSchema = new Schema(optional(1, "aColumn", StringType.get())); new SchemaUpdate(currentSchema, 2).unionByNameWith(newSchema).apply(); } @Test public void testMirroredSchemas() { - Schema aSchema = new Schema(optional(9, "struct1", Types.StructType.of(optional(8, "string1", StringType.get()))), + Schema aSchema = + new Schema( + optional(9, "struct1", Types.StructType.of(optional(8, "string1", StringType.get()))), optional(6, "list1", Types.ListType.ofOptional(7, StringType.get())), optional(5, "string2", StringType.get()), optional(4, "string3", StringType.get()), @@ -310,7 +416,9 @@ public void testMirroredSchemas() { optional(1, "string6", StringType.get())); // Same schema but the field indices are in reverse order, from lowest to highest - Schema mirrored = new Schema(optional(1, "struct1", Types.StructType.of(optional(2, "string1", 
StringType.get()))), + Schema mirrored = + new Schema( + optional(1, "struct1", Types.StructType.of(optional(2, "string1", StringType.get()))), optional(3, "list1", Types.ListType.ofOptional(4, StringType.get())), optional(5, "string2", StringType.get()), optional(6, "string3", StringType.get()), @@ -325,20 +433,35 @@ public void testMirroredSchemas() { @Test public void addNewTopLevelStruct() { - Schema schema = new Schema(optional(1, "map1", Types.MapType.ofOptional(2, 3, - Types.StringType.get(), - Types.ListType.ofOptional(4, - Types.StructType.of(optional(5, "string1", Types.StringType.get())))))); - - Schema observed = new Schema(optional(1, "map1", Types.MapType.ofOptional(2, 3, - Types.StringType.get(), - Types.ListType.ofOptional(4, - Types.StructType.of(optional(5, "string1", Types.StringType.get()))))), - optional(6, "struct1", Types.StructType.of( - optional(7, "d1", Types.StructType.of( - optional(8, "d2", Types.StringType.get()) - )) - ))); + Schema schema = + new Schema( + optional( + 1, + "map1", + Types.MapType.ofOptional( + 2, + 3, + Types.StringType.get(), + Types.ListType.ofOptional( + 4, Types.StructType.of(optional(5, "string1", Types.StringType.get())))))); + + Schema observed = + new Schema( + optional( + 1, + "map1", + Types.MapType.ofOptional( + 2, + 3, + Types.StringType.get(), + Types.ListType.ofOptional( + 4, Types.StructType.of(optional(5, "string1", Types.StringType.get()))))), + optional( + 6, + "struct1", + Types.StructType.of( + optional( + 7, "d1", Types.StructType.of(optional(8, "d2", Types.StringType.get())))))); Schema union = new SchemaUpdate(schema, 5).unionByNameWith(observed).apply(); Assert.assertEquals(observed.asStruct(), union.asStruct()); @@ -346,20 +469,50 @@ public void addNewTopLevelStruct() { @Test public void testAppendNestedStruct() { - Schema schema = new Schema(required(1, "s1", StructType.of( - optional(2, "s2", StructType.of( - optional(3, "s3", StructType.of( - optional(4, "s4", StringType.get())))))))); - - Schema observed = new Schema(required(1, "s1", StructType.of( - optional(2, "s2", StructType.of( - optional(3, "s3", StructType.of( - optional(4, "s4", StringType.get()))), - optional(5, "repeat", StructType.of( - optional(6, "s1", StructType.of( - optional(7, "s2", StructType.of( - optional(8, "s3", StructType.of( - optional(9, "s4", StringType.get())))))))))))))); + Schema schema = + new Schema( + required( + 1, + "s1", + StructType.of( + optional( + 2, + "s2", + StructType.of( + optional( + 3, "s3", StructType.of(optional(4, "s4", StringType.get())))))))); + + Schema observed = + new Schema( + required( + 1, + "s1", + StructType.of( + optional( + 2, + "s2", + StructType.of( + optional(3, "s3", StructType.of(optional(4, "s4", StringType.get()))), + optional( + 5, + "repeat", + StructType.of( + optional( + 6, + "s1", + StructType.of( + optional( + 7, + "s2", + StructType.of( + optional( + 8, + "s3", + StructType.of( + optional( + 9, + "s4", + StringType.get())))))))))))))); Schema applied = new SchemaUpdate(schema, 4).unionByNameWith(observed).apply(); Assert.assertEquals(observed.asStruct(), applied.asStruct()); @@ -367,25 +520,66 @@ public void testAppendNestedStruct() { @Test public void testAppendNestedLists() { - Schema schema = new Schema(required(1, "s1", StructType.of( - optional(2, "s2", StructType.of( - optional(3, "s3", StructType.of( - optional(4, "list1", ListType.ofOptional(5, StringType.get()))))))))); - - Schema observed = new Schema(required(1, "s1", StructType.of( - optional(2, "s2", StructType.of( - 
optional(3, "s3", StructType.of( - optional(4, "list2", ListType.ofOptional(5, StringType.get()))))))))); + Schema schema = + new Schema( + required( + 1, + "s1", + StructType.of( + optional( + 2, + "s2", + StructType.of( + optional( + 3, + "s3", + StructType.of( + optional( + 4, + "list1", + ListType.ofOptional(5, StringType.get()))))))))); + + Schema observed = + new Schema( + required( + 1, + "s1", + StructType.of( + optional( + 2, + "s2", + StructType.of( + optional( + 3, + "s3", + StructType.of( + optional( + 4, + "list2", + ListType.ofOptional(5, StringType.get()))))))))); Schema union = new SchemaUpdate(schema, 5).unionByNameWith(observed).apply(); - Schema expected = new Schema(required(1, "s1", StructType.of( - optional(2, "s2", StructType.of( - optional(3, "s3", StructType.of( - optional(4, "list1", ListType.ofOptional(5, StringType.get())), - optional(6, "list2", ListType.ofOptional(7, StringType.get()))))))))); + Schema expected = + new Schema( + required( + 1, + "s1", + StructType.of( + optional( + 2, + "s2", + StructType.of( + optional( + 3, + "s3", + StructType.of( + optional(4, "list1", ListType.ofOptional(5, StringType.get())), + optional( + 6, + "list2", + ListType.ofOptional(7, StringType.get()))))))))); Assert.assertEquals(expected.asStruct(), union.asStruct()); } - } diff --git a/core/src/test/java/org/apache/iceberg/TestSchemaUpdate.java b/core/src/test/java/org/apache/iceberg/TestSchemaUpdate.java index 29976ded4884..b2c79446f258 100644 --- a/core/src/test/java/org/apache/iceberg/TestSchemaUpdate.java +++ b/core/src/test/java/org/apache/iceberg/TestSchemaUpdate.java @@ -16,9 +16,11 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg; +import static org.apache.iceberg.types.Types.NestedField.optional; +import static org.apache.iceberg.types.Types.NestedField.required; + import java.util.List; import java.util.Set; import org.apache.iceberg.relocated.com.google.common.collect.ImmutableSet; @@ -31,41 +33,48 @@ import org.junit.Assert; import org.junit.Test; -import static org.apache.iceberg.types.Types.NestedField.optional; -import static org.apache.iceberg.types.Types.NestedField.required; - public class TestSchemaUpdate { - private static final Schema SCHEMA = new Schema( - required(1, "id", Types.IntegerType.get()), - optional(2, "data", Types.StringType.get()), - optional(3, "preferences", Types.StructType.of( - required(8, "feature1", Types.BooleanType.get()), - optional(9, "feature2", Types.BooleanType.get()) - ), "struct of named boolean options"), - required(4, "locations", Types.MapType.ofRequired(10, 11, - Types.StructType.of( - required(20, "address", Types.StringType.get()), - required(21, "city", Types.StringType.get()), - required(22, "state", Types.StringType.get()), - required(23, "zip", Types.IntegerType.get()) - ), - Types.StructType.of( - required(12, "lat", Types.FloatType.get()), - required(13, "long", Types.FloatType.get()) - )), "map of address to coordinate"), - optional(5, "points", Types.ListType.ofOptional(14, - Types.StructType.of( - required(15, "x", Types.LongType.get()), - required(16, "y", Types.LongType.get()) - )), "2-D cartesian points"), - required(6, "doubles", Types.ListType.ofRequired(17, - Types.DoubleType.get() - )), - optional(7, "properties", Types.MapType.ofOptional(18, 19, - Types.StringType.get(), - Types.StringType.get() - ), "string map of properties") - ); + private static final Schema SCHEMA = + new Schema( + required(1, "id", Types.IntegerType.get()), + 
optional(2, "data", Types.StringType.get()), + optional( + 3, + "preferences", + Types.StructType.of( + required(8, "feature1", Types.BooleanType.get()), + optional(9, "feature2", Types.BooleanType.get())), + "struct of named boolean options"), + required( + 4, + "locations", + Types.MapType.ofRequired( + 10, + 11, + Types.StructType.of( + required(20, "address", Types.StringType.get()), + required(21, "city", Types.StringType.get()), + required(22, "state", Types.StringType.get()), + required(23, "zip", Types.IntegerType.get())), + Types.StructType.of( + required(12, "lat", Types.FloatType.get()), + required(13, "long", Types.FloatType.get()))), + "map of address to coordinate"), + optional( + 5, + "points", + Types.ListType.ofOptional( + 14, + Types.StructType.of( + required(15, "x", Types.LongType.get()), + required(16, "y", Types.LongType.get()))), + "2-D cartesian points"), + required(6, "doubles", Types.ListType.ofRequired(17, Types.DoubleType.get())), + optional( + 7, + "properties", + Types.MapType.ofOptional(18, 19, Types.StringType.get(), Types.StringType.get()), + "string map of properties")); private static final Set ALL_IDS = ImmutableSet.copyOf(TypeUtil.getProjectedIds(SCHEMA)); @@ -80,9 +89,21 @@ public void testNoChanges() { @Test public void testDeleteFields() { // use schema projection to test column deletes - List columns = Lists.newArrayList("id", "data", "preferences", "preferences.feature1", - "preferences.feature2", "locations", "locations.lat", "locations.long", "points", - "points.x", "points.y", "doubles", "properties"); + List columns = + Lists.newArrayList( + "id", + "data", + "preferences", + "preferences.feature1", + "preferences.feature2", + "locations", + "locations.lat", + "locations.long", + "points", + "points.x", + "points.y", + "doubles", + "properties"); for (String name : columns) { Set selected = Sets.newHashSet(ALL_IDS); // remove the id and any nested fields from the projection @@ -92,78 +113,100 @@ public void testDeleteFields() { Schema del = new SchemaUpdate(SCHEMA, 19).deleteColumn(name).apply(); - Assert.assertEquals("Should match projection with '" + name + "' removed", - TypeUtil.project(SCHEMA, selected).asStruct(), del.asStruct()); + Assert.assertEquals( + "Should match projection with '" + name + "' removed", + TypeUtil.project(SCHEMA, selected).asStruct(), + del.asStruct()); } } @Test public void testUpdateTypes() { - Types.StructType expected = Types.StructType.of( - required(1, "id", Types.LongType.get()), - optional(2, "data", Types.StringType.get()), - optional(3, "preferences", Types.StructType.of( - required(8, "feature1", Types.BooleanType.get()), - optional(9, "feature2", Types.BooleanType.get()) - ), "struct of named boolean options"), - required(4, "locations", Types.MapType.ofRequired(10, 11, - Types.StructType.of( - required(20, "address", Types.StringType.get()), - required(21, "city", Types.StringType.get()), - required(22, "state", Types.StringType.get()), - required(23, "zip", Types.IntegerType.get()) - ), - Types.StructType.of( - required(12, "lat", Types.DoubleType.get()), - required(13, "long", Types.DoubleType.get()) - )), "map of address to coordinate"), - optional(5, "points", Types.ListType.ofOptional(14, - Types.StructType.of( - required(15, "x", Types.LongType.get()), - required(16, "y", Types.LongType.get()) - )), "2-D cartesian points"), - required(6, "doubles", Types.ListType.ofRequired(17, - Types.DoubleType.get() - )), - optional(7, "properties", Types.MapType.ofOptional(18, 19, - Types.StringType.get(), - 
Types.StringType.get() - ), "string map of properties") - ); - - Schema updated = new SchemaUpdate(SCHEMA, SCHEMA_LAST_COLUMN_ID) - .updateColumn("id", Types.LongType.get()) - .updateColumn("locations.lat", Types.DoubleType.get()) - .updateColumn("locations.long", Types.DoubleType.get()) - .apply(); + Types.StructType expected = + Types.StructType.of( + required(1, "id", Types.LongType.get()), + optional(2, "data", Types.StringType.get()), + optional( + 3, + "preferences", + Types.StructType.of( + required(8, "feature1", Types.BooleanType.get()), + optional(9, "feature2", Types.BooleanType.get())), + "struct of named boolean options"), + required( + 4, + "locations", + Types.MapType.ofRequired( + 10, + 11, + Types.StructType.of( + required(20, "address", Types.StringType.get()), + required(21, "city", Types.StringType.get()), + required(22, "state", Types.StringType.get()), + required(23, "zip", Types.IntegerType.get())), + Types.StructType.of( + required(12, "lat", Types.DoubleType.get()), + required(13, "long", Types.DoubleType.get()))), + "map of address to coordinate"), + optional( + 5, + "points", + Types.ListType.ofOptional( + 14, + Types.StructType.of( + required(15, "x", Types.LongType.get()), + required(16, "y", Types.LongType.get()))), + "2-D cartesian points"), + required(6, "doubles", Types.ListType.ofRequired(17, Types.DoubleType.get())), + optional( + 7, + "properties", + Types.MapType.ofOptional(18, 19, Types.StringType.get(), Types.StringType.get()), + "string map of properties")); + + Schema updated = + new SchemaUpdate(SCHEMA, SCHEMA_LAST_COLUMN_ID) + .updateColumn("id", Types.LongType.get()) + .updateColumn("locations.lat", Types.DoubleType.get()) + .updateColumn("locations.long", Types.DoubleType.get()) + .apply(); Assert.assertEquals("Should convert types", expected, updated.asStruct()); } @Test public void testUpdateFailure() { - Set> allowedUpdates = Sets.newHashSet( - Pair.of(Types.IntegerType.get(), Types.LongType.get()), - Pair.of(Types.FloatType.get(), Types.DoubleType.get()), - Pair.of(Types.DecimalType.of(9, 2), Types.DecimalType.of(18, 2)) - ); - - List primitives = Lists.newArrayList( - Types.BooleanType.get(), Types.IntegerType.get(), Types.LongType.get(), - Types.FloatType.get(), Types.DoubleType.get(), Types.DateType.get(), Types.TimeType.get(), - Types.TimestampType.withZone(), Types.TimestampType.withoutZone(), - Types.StringType.get(), Types.UUIDType.get(), Types.BinaryType.get(), - Types.FixedType.ofLength(3), Types.FixedType.ofLength(4), - Types.DecimalType.of(9, 2), Types.DecimalType.of(9, 3), - Types.DecimalType.of(18, 2) - ); + Set> allowedUpdates = + Sets.newHashSet( + Pair.of(Types.IntegerType.get(), Types.LongType.get()), + Pair.of(Types.FloatType.get(), Types.DoubleType.get()), + Pair.of(Types.DecimalType.of(9, 2), Types.DecimalType.of(18, 2))); + + List primitives = + Lists.newArrayList( + Types.BooleanType.get(), + Types.IntegerType.get(), + Types.LongType.get(), + Types.FloatType.get(), + Types.DoubleType.get(), + Types.DateType.get(), + Types.TimeType.get(), + Types.TimestampType.withZone(), + Types.TimestampType.withoutZone(), + Types.StringType.get(), + Types.UUIDType.get(), + Types.BinaryType.get(), + Types.FixedType.ofLength(3), + Types.FixedType.ofLength(4), + Types.DecimalType.of(9, 2), + Types.DecimalType.of(9, 3), + Types.DecimalType.of(18, 2)); for (Type.PrimitiveType fromType : primitives) { for (Type.PrimitiveType toType : primitives) { Schema fromSchema = new Schema(required(1, "col", fromType)); - if (fromType.equals(toType) || 
- allowedUpdates.contains(Pair.of(fromType, toType))) { + if (fromType.equals(toType) || allowedUpdates.contains(Pair.of(fromType, toType))) { Schema expected = new Schema(required(1, "col", toType)); Schema result = new SchemaUpdate(fromSchema, 1).updateColumn("col", toType).apply(); Assert.assertEquals("Should allow update", expected.asStruct(), result.asStruct()); @@ -171,8 +214,10 @@ public void testUpdateFailure() { } String typeChange = fromType.toString() + " -> " + toType.toString(); - AssertHelpers.assertThrows("Should reject update: " + typeChange, - IllegalArgumentException.class, "change column type: col: " + typeChange, + AssertHelpers.assertThrows( + "Should reject update: " + typeChange, + IllegalArgumentException.class, + "change column type: col: " + typeChange, () -> new SchemaUpdate(fromSchema, 1).updateColumn("col", toType)); } } @@ -180,94 +225,116 @@ public void testUpdateFailure() { @Test public void testRename() { - Types.StructType expected = Types.StructType.of( - required(1, "id", Types.IntegerType.get()), - optional(2, "json", Types.StringType.get()), - optional(3, "options", Types.StructType.of( - required(8, "feature1", Types.BooleanType.get()), - optional(9, "newfeature", Types.BooleanType.get()) - ), "struct of named boolean options"), - required(4, "locations", Types.MapType.ofRequired(10, 11, - Types.StructType.of( - required(20, "address", Types.StringType.get()), - required(21, "city", Types.StringType.get()), - required(22, "state", Types.StringType.get()), - required(23, "zip", Types.IntegerType.get()) - ), - Types.StructType.of( - required(12, "latitude", Types.FloatType.get()), - required(13, "long", Types.FloatType.get()) - )), "map of address to coordinate"), - optional(5, "points", Types.ListType.ofOptional(14, - Types.StructType.of( - required(15, "X", Types.LongType.get()), - required(16, "y.y", Types.LongType.get()) - )), "2-D cartesian points"), - required(6, "doubles", Types.ListType.ofRequired(17, - Types.DoubleType.get() - )), - optional(7, "properties", Types.MapType.ofOptional(18, 19, - Types.StringType.get(), - Types.StringType.get() - ), "string map of properties") - ); - - Schema renamed = new SchemaUpdate(SCHEMA, SCHEMA_LAST_COLUMN_ID) - .renameColumn("data", "json") - .renameColumn("preferences", "options") - .renameColumn("preferences.feature2", "newfeature") // inside a renamed column - .renameColumn("locations.lat", "latitude") - .renameColumn("points.x", "X") - .renameColumn("points.y", "y.y") // has a '.' 
in the field name - .apply(); + Types.StructType expected = + Types.StructType.of( + required(1, "id", Types.IntegerType.get()), + optional(2, "json", Types.StringType.get()), + optional( + 3, + "options", + Types.StructType.of( + required(8, "feature1", Types.BooleanType.get()), + optional(9, "newfeature", Types.BooleanType.get())), + "struct of named boolean options"), + required( + 4, + "locations", + Types.MapType.ofRequired( + 10, + 11, + Types.StructType.of( + required(20, "address", Types.StringType.get()), + required(21, "city", Types.StringType.get()), + required(22, "state", Types.StringType.get()), + required(23, "zip", Types.IntegerType.get())), + Types.StructType.of( + required(12, "latitude", Types.FloatType.get()), + required(13, "long", Types.FloatType.get()))), + "map of address to coordinate"), + optional( + 5, + "points", + Types.ListType.ofOptional( + 14, + Types.StructType.of( + required(15, "X", Types.LongType.get()), + required(16, "y.y", Types.LongType.get()))), + "2-D cartesian points"), + required(6, "doubles", Types.ListType.ofRequired(17, Types.DoubleType.get())), + optional( + 7, + "properties", + Types.MapType.ofOptional(18, 19, Types.StringType.get(), Types.StringType.get()), + "string map of properties")); + + Schema renamed = + new SchemaUpdate(SCHEMA, SCHEMA_LAST_COLUMN_ID) + .renameColumn("data", "json") + .renameColumn("preferences", "options") + .renameColumn("preferences.feature2", "newfeature") // inside a renamed column + .renameColumn("locations.lat", "latitude") + .renameColumn("points.x", "X") + .renameColumn("points.y", "y.y") // has a '.' in the field name + .apply(); Assert.assertEquals("Should rename all fields", expected, renamed.asStruct()); } @Test public void testAddFields() { - Schema expected = new Schema( - required(1, "id", Types.IntegerType.get()), - optional(2, "data", Types.StringType.get()), - optional(3, "preferences", Types.StructType.of( - required(8, "feature1", Types.BooleanType.get()), - optional(9, "feature2", Types.BooleanType.get()) - ), "struct of named boolean options"), - required(4, "locations", Types.MapType.ofRequired(10, 11, - Types.StructType.of( - required(20, "address", Types.StringType.get()), - required(21, "city", Types.StringType.get()), - required(22, "state", Types.StringType.get()), - required(23, "zip", Types.IntegerType.get()) - ), - Types.StructType.of( - required(12, "lat", Types.FloatType.get()), - required(13, "long", Types.FloatType.get()), - optional(25, "alt", Types.FloatType.get()) - )), "map of address to coordinate"), - optional(5, "points", Types.ListType.ofOptional(14, - Types.StructType.of( - required(15, "x", Types.LongType.get()), - required(16, "y", Types.LongType.get()), - optional(26, "z", Types.LongType.get()), - optional(27, "t.t", Types.LongType.get()) - )), "2-D cartesian points"), - required(6, "doubles", Types.ListType.ofRequired(17, - Types.DoubleType.get() - )), - optional(7, "properties", Types.MapType.ofOptional(18, 19, - Types.StringType.get(), - Types.StringType.get() - ), "string map of properties"), - optional(24, "toplevel", Types.DecimalType.of(9, 2)) - ); - - Schema added = new SchemaUpdate(SCHEMA, SCHEMA_LAST_COLUMN_ID) - .addColumn("toplevel", Types.DecimalType.of(9, 2)) - .addColumn("locations", "alt", Types.FloatType.get()) // map of structs - .addColumn("points", "z", Types.LongType.get()) // list of structs - .addColumn("points", "t.t", Types.LongType.get()) // name with '.' 
- .apply(); + Schema expected = + new Schema( + required(1, "id", Types.IntegerType.get()), + optional(2, "data", Types.StringType.get()), + optional( + 3, + "preferences", + Types.StructType.of( + required(8, "feature1", Types.BooleanType.get()), + optional(9, "feature2", Types.BooleanType.get())), + "struct of named boolean options"), + required( + 4, + "locations", + Types.MapType.ofRequired( + 10, + 11, + Types.StructType.of( + required(20, "address", Types.StringType.get()), + required(21, "city", Types.StringType.get()), + required(22, "state", Types.StringType.get()), + required(23, "zip", Types.IntegerType.get())), + Types.StructType.of( + required(12, "lat", Types.FloatType.get()), + required(13, "long", Types.FloatType.get()), + optional(25, "alt", Types.FloatType.get()))), + "map of address to coordinate"), + optional( + 5, + "points", + Types.ListType.ofOptional( + 14, + Types.StructType.of( + required(15, "x", Types.LongType.get()), + required(16, "y", Types.LongType.get()), + optional(26, "z", Types.LongType.get()), + optional(27, "t.t", Types.LongType.get()))), + "2-D cartesian points"), + required(6, "doubles", Types.ListType.ofRequired(17, Types.DoubleType.get())), + optional( + 7, + "properties", + Types.MapType.ofOptional(18, 19, Types.StringType.get(), Types.StringType.get()), + "string map of properties"), + optional(24, "toplevel", Types.DecimalType.of(9, 2))); + + Schema added = + new SchemaUpdate(SCHEMA, SCHEMA_LAST_COLUMN_ID) + .addColumn("toplevel", Types.DecimalType.of(9, 2)) + .addColumn("locations", "alt", Types.FloatType.get()) // map of structs + .addColumn("points", "z", Types.LongType.get()) // list of structs + .addColumn("points", "t.t", Types.LongType.get()) // name with '.' + .apply(); Assert.assertEquals("Should match with added fields", expected.asStruct(), added.asStruct()); } @@ -275,110 +342,114 @@ public void testAddFields() { @Test public void testAddNestedStruct() { Schema schema = new Schema(required(1, "id", Types.IntegerType.get())); - Types.StructType struct = Types.StructType.of( - required(1, "lat", Types.IntegerType.get()), // conflicts with id - optional(2, "long", Types.IntegerType.get()) - ); + Types.StructType struct = + Types.StructType.of( + required(1, "lat", Types.IntegerType.get()), // conflicts with id + optional(2, "long", Types.IntegerType.get())); - Schema expected = new Schema( - required(1, "id", Types.IntegerType.get()), - optional(2, "location", Types.StructType.of( - required(3, "lat", Types.IntegerType.get()), - optional(4, "long", Types.IntegerType.get()) - )) - ); + Schema expected = + new Schema( + required(1, "id", Types.IntegerType.get()), + optional( + 2, + "location", + Types.StructType.of( + required(3, "lat", Types.IntegerType.get()), + optional(4, "long", Types.IntegerType.get())))); - Schema result = new SchemaUpdate(schema, 1) - .addColumn("location", struct) - .apply(); + Schema result = new SchemaUpdate(schema, 1).addColumn("location", struct).apply(); - Assert.assertEquals("Should add struct and reassign column IDs", - expected.asStruct(), result.asStruct()); + Assert.assertEquals( + "Should add struct and reassign column IDs", expected.asStruct(), result.asStruct()); } @Test public void testAddNestedMapOfStructs() { Schema schema = new Schema(required(1, "id", Types.IntegerType.get())); - Types.MapType map = Types.MapType.ofOptional(1, 2, - Types.StructType.of( - required(20, "address", Types.StringType.get()), - required(21, "city", Types.StringType.get()), - required(22, "state", 
Types.StringType.get()), - required(23, "zip", Types.IntegerType.get()) - ), - Types.StructType.of( - required(9, "lat", Types.IntegerType.get()), - optional(8, "long", Types.IntegerType.get()) - ) - ); - - Schema expected = new Schema( - required(1, "id", Types.IntegerType.get()), - optional(2, "locations", Types.MapType.ofOptional(3, 4, + Types.MapType map = + Types.MapType.ofOptional( + 1, + 2, Types.StructType.of( - required(5, "address", Types.StringType.get()), - required(6, "city", Types.StringType.get()), - required(7, "state", Types.StringType.get()), - required(8, "zip", Types.IntegerType.get()) - ), + required(20, "address", Types.StringType.get()), + required(21, "city", Types.StringType.get()), + required(22, "state", Types.StringType.get()), + required(23, "zip", Types.IntegerType.get())), Types.StructType.of( required(9, "lat", Types.IntegerType.get()), - optional(10, "long", Types.IntegerType.get()) - ) - )) - ); + optional(8, "long", Types.IntegerType.get()))); + + Schema expected = + new Schema( + required(1, "id", Types.IntegerType.get()), + optional( + 2, + "locations", + Types.MapType.ofOptional( + 3, + 4, + Types.StructType.of( + required(5, "address", Types.StringType.get()), + required(6, "city", Types.StringType.get()), + required(7, "state", Types.StringType.get()), + required(8, "zip", Types.IntegerType.get())), + Types.StructType.of( + required(9, "lat", Types.IntegerType.get()), + optional(10, "long", Types.IntegerType.get()))))); - Schema result = new SchemaUpdate(schema, 1) - .addColumn("locations", map) - .apply(); + Schema result = new SchemaUpdate(schema, 1).addColumn("locations", map).apply(); - Assert.assertEquals("Should add map and reassign column IDs", - expected.asStruct(), result.asStruct()); + Assert.assertEquals( + "Should add map and reassign column IDs", expected.asStruct(), result.asStruct()); } @Test public void testAddNestedListOfStructs() { Schema schema = new Schema(required(1, "id", Types.IntegerType.get())); - Types.ListType list = Types.ListType.ofOptional(1, - Types.StructType.of( - required(9, "lat", Types.IntegerType.get()), - optional(8, "long", Types.IntegerType.get()) - ) - ); - - Schema expected = new Schema( - required(1, "id", Types.IntegerType.get()), - optional(2, "locations", Types.ListType.ofOptional(3, + Types.ListType list = + Types.ListType.ofOptional( + 1, Types.StructType.of( - required(4, "lat", Types.IntegerType.get()), - optional(5, "long", Types.IntegerType.get()) - ) - )) - ); + required(9, "lat", Types.IntegerType.get()), + optional(8, "long", Types.IntegerType.get()))); - Schema result = new SchemaUpdate(schema, 1) - .addColumn("locations", list) - .apply(); + Schema expected = + new Schema( + required(1, "id", Types.IntegerType.get()), + optional( + 2, + "locations", + Types.ListType.ofOptional( + 3, + Types.StructType.of( + required(4, "lat", Types.IntegerType.get()), + optional(5, "long", Types.IntegerType.get()))))); - Assert.assertEquals("Should add map and reassign column IDs", - expected.asStruct(), result.asStruct()); + Schema result = new SchemaUpdate(schema, 1).addColumn("locations", list).apply(); + + Assert.assertEquals( + "Should add map and reassign column IDs", expected.asStruct(), result.asStruct()); } @Test public void testAddRequiredColumn() { Schema schema = new Schema(required(1, "id", Types.IntegerType.get())); - Schema expected = new Schema( - required(1, "id", Types.IntegerType.get()), - required(2, "data", Types.StringType.get())); + Schema expected = + new Schema( + required(1, "id", 
Types.IntegerType.get()), + required(2, "data", Types.StringType.get())); - AssertHelpers.assertThrows("Should reject add required column if incompatible changes are not allowed", - IllegalArgumentException.class, "Incompatible change: cannot add required column: data", + AssertHelpers.assertThrows( + "Should reject add required column if incompatible changes are not allowed", + IllegalArgumentException.class, + "Incompatible change: cannot add required column: data", () -> new SchemaUpdate(schema, 1).addRequiredColumn("data", Types.StringType.get())); - Schema result = new SchemaUpdate(schema, 1) - .allowIncompatibleChanges() - .addRequiredColumn("data", Types.StringType.get()) - .apply(); + Schema result = + new SchemaUpdate(schema, 1) + .allowIncompatibleChanges() + .addRequiredColumn("data", Types.StringType.get()) + .apply(); Assert.assertEquals("Should add required column", expected.asStruct(), result.asStruct()); } @@ -388,11 +459,10 @@ public void testMakeColumnOptional() { Schema schema = new Schema(required(1, "id", Types.IntegerType.get())); Schema expected = new Schema(optional(1, "id", Types.IntegerType.get())); - Schema result = new SchemaUpdate(schema, 1) - .makeColumnOptional("id") - .apply(); + Schema result = new SchemaUpdate(schema, 1).makeColumnOptional("id").apply(); - Assert.assertEquals("Should update column to be optional", expected.asStruct(), result.asStruct()); + Assert.assertEquals( + "Should update column to be optional", expected.asStruct(), result.asStruct()); } @Test @@ -400,76 +470,89 @@ public void testRequireColumn() { Schema schema = new Schema(optional(1, "id", Types.IntegerType.get())); Schema expected = new Schema(required(1, "id", Types.IntegerType.get())); - AssertHelpers.assertThrows("Should reject change to required if incompatible changes are not allowed", - IllegalArgumentException.class, "Cannot change column nullability: id: optional -> required", + AssertHelpers.assertThrows( + "Should reject change to required if incompatible changes are not allowed", + IllegalArgumentException.class, + "Cannot change column nullability: id: optional -> required", () -> new SchemaUpdate(schema, 1).requireColumn("id")); // required to required is not an incompatible change new SchemaUpdate(expected, 1).requireColumn("id").apply(); - Schema result = new SchemaUpdate(schema, 1) - .allowIncompatibleChanges() - .requireColumn("id") - .apply(); + Schema result = + new SchemaUpdate(schema, 1).allowIncompatibleChanges().requireColumn("id").apply(); - Assert.assertEquals("Should update column to be required", expected.asStruct(), result.asStruct()); + Assert.assertEquals( + "Should update column to be required", expected.asStruct(), result.asStruct()); } @Test public void testMixedChanges() { - Schema expected = new Schema( - required(1, "id", Types.LongType.get(), "unique id"), - required(2, "json", Types.StringType.get()), - optional(3, "options", Types.StructType.of( - required(8, "feature1", Types.BooleanType.get()), - optional(9, "newfeature", Types.BooleanType.get()) - ), "struct of named boolean options"), - required(4, "locations", Types.MapType.ofRequired(10, 11, - Types.StructType.of( - required(20, "address", Types.StringType.get()), - required(21, "city", Types.StringType.get()), - required(22, "state", Types.StringType.get()), - required(23, "zip", Types.IntegerType.get()) - ), - Types.StructType.of( - required(12, "latitude", Types.DoubleType.get(), "latitude"), - optional(25, "alt", Types.FloatType.get()), - required(28, "description", 
Types.StringType.get(), "Location description") - )), "map of address to coordinate"), - optional(5, "points", Types.ListType.ofOptional(14, - Types.StructType.of( - optional(15, "X", Types.LongType.get()), - required(16, "y.y", Types.LongType.get()), - optional(26, "z", Types.LongType.get()), - optional(27, "t.t", Types.LongType.get(), "name with '.'") - )), "2-D cartesian points"), - required(6, "doubles", Types.ListType.ofRequired(17, - Types.DoubleType.get() - )), - optional(24, "toplevel", Types.DecimalType.of(9, 2)) - ); - - Schema updated = new SchemaUpdate(SCHEMA, SCHEMA_LAST_COLUMN_ID) - .addColumn("toplevel", Types.DecimalType.of(9, 2)) - .addColumn("locations", "alt", Types.FloatType.get()) // map of structs - .addColumn("points", "z", Types.LongType.get()) // list of structs - .addColumn("points", "t.t", Types.LongType.get(), "name with '.'") - .renameColumn("data", "json") - .renameColumn("preferences", "options") - .renameColumn("preferences.feature2", "newfeature") // inside a renamed column - .renameColumn("locations.lat", "latitude") - .renameColumn("points.x", "X") - .renameColumn("points.y", "y.y") // has a '.' in the field name - .updateColumn("id", Types.LongType.get(), "unique id") - .updateColumn("locations.lat", Types.DoubleType.get()) // use the original name - .updateColumnDoc("locations.lat", "latitude") - .deleteColumn("locations.long") - .deleteColumn("properties") - .makeColumnOptional("points.x") - .allowIncompatibleChanges() - .requireColumn("data") - .addRequiredColumn("locations", "description", Types.StringType.get(), "Location description") - .apply(); + Schema expected = + new Schema( + required(1, "id", Types.LongType.get(), "unique id"), + required(2, "json", Types.StringType.get()), + optional( + 3, + "options", + Types.StructType.of( + required(8, "feature1", Types.BooleanType.get()), + optional(9, "newfeature", Types.BooleanType.get())), + "struct of named boolean options"), + required( + 4, + "locations", + Types.MapType.ofRequired( + 10, + 11, + Types.StructType.of( + required(20, "address", Types.StringType.get()), + required(21, "city", Types.StringType.get()), + required(22, "state", Types.StringType.get()), + required(23, "zip", Types.IntegerType.get())), + Types.StructType.of( + required(12, "latitude", Types.DoubleType.get(), "latitude"), + optional(25, "alt", Types.FloatType.get()), + required( + 28, "description", Types.StringType.get(), "Location description"))), + "map of address to coordinate"), + optional( + 5, + "points", + Types.ListType.ofOptional( + 14, + Types.StructType.of( + optional(15, "X", Types.LongType.get()), + required(16, "y.y", Types.LongType.get()), + optional(26, "z", Types.LongType.get()), + optional(27, "t.t", Types.LongType.get(), "name with '.'"))), + "2-D cartesian points"), + required(6, "doubles", Types.ListType.ofRequired(17, Types.DoubleType.get())), + optional(24, "toplevel", Types.DecimalType.of(9, 2))); + + Schema updated = + new SchemaUpdate(SCHEMA, SCHEMA_LAST_COLUMN_ID) + .addColumn("toplevel", Types.DecimalType.of(9, 2)) + .addColumn("locations", "alt", Types.FloatType.get()) // map of structs + .addColumn("points", "z", Types.LongType.get()) // list of structs + .addColumn("points", "t.t", Types.LongType.get(), "name with '.'") + .renameColumn("data", "json") + .renameColumn("preferences", "options") + .renameColumn("preferences.feature2", "newfeature") // inside a renamed column + .renameColumn("locations.lat", "latitude") + .renameColumn("points.x", "X") + .renameColumn("points.y", "y.y") // has a 
'.' in the field name + .updateColumn("id", Types.LongType.get(), "unique id") + .updateColumn("locations.lat", Types.DoubleType.get()) // use the original name + .updateColumnDoc("locations.lat", "latitude") + .deleteColumn("locations.long") + .deleteColumn("properties") + .makeColumnOptional("points.x") + .allowIncompatibleChanges() + .requireColumn("data") + .addRequiredColumn( + "locations", "description", Types.StringType.get(), "Location description") + .apply(); Assert.assertEquals("Should match with added fields", expected.asStruct(), updated.asStruct()); } @@ -477,28 +560,34 @@ public void testMixedChanges() { @Test public void testAmbiguousAdd() { // preferences.booleans could be top-level or a field of preferences - AssertHelpers.assertThrows("Should reject ambiguous column name", - IllegalArgumentException.class, "ambiguous name: preferences.booleans", () -> { + AssertHelpers.assertThrows( + "Should reject ambiguous column name", + IllegalArgumentException.class, + "ambiguous name: preferences.booleans", + () -> { UpdateSchema update = new SchemaUpdate(SCHEMA, SCHEMA_LAST_COLUMN_ID); update.addColumn("preferences.booleans", Types.BooleanType.get()); - } - ); + }); } @Test public void testAddAlreadyExists() { - AssertHelpers.assertThrows("Should reject column name that already exists", - IllegalArgumentException.class, "already exists: preferences.feature1", () -> { + AssertHelpers.assertThrows( + "Should reject column name that already exists", + IllegalArgumentException.class, + "already exists: preferences.feature1", + () -> { UpdateSchema update = new SchemaUpdate(SCHEMA, SCHEMA_LAST_COLUMN_ID); update.addColumn("preferences", "feature1", Types.BooleanType.get()); - } - ); - AssertHelpers.assertThrows("Should reject column name that already exists", - IllegalArgumentException.class, "already exists: preferences", () -> { + }); + AssertHelpers.assertThrows( + "Should reject column name that already exists", + IllegalArgumentException.class, + "already exists: preferences", + () -> { UpdateSchema update = new SchemaUpdate(SCHEMA, SCHEMA_LAST_COLUMN_ID); update.addColumn("preferences", Types.BooleanType.get()); - } - ); + }); } @Test @@ -506,624 +595,755 @@ public void testDeleteThenAdd() { Schema schema = new Schema(required(1, "id", Types.IntegerType.get())); Schema expected = new Schema(optional(2, "id", Types.IntegerType.get())); - Schema updated = new SchemaUpdate(schema, 1) - .deleteColumn("id") - .addColumn("id", optional(2, "id", Types.IntegerType.get()).type()) - .apply(); + Schema updated = + new SchemaUpdate(schema, 1) + .deleteColumn("id") + .addColumn("id", optional(2, "id", Types.IntegerType.get()).type()) + .apply(); Assert.assertEquals("Should match with added fields", expected.asStruct(), updated.asStruct()); } @Test public void testDeleteThenAddNested() { - Schema expectedNested = new Schema( - required(1, "id", Types.IntegerType.get()), - optional(2, "data", Types.StringType.get()), - optional(3, "preferences", Types.StructType.of( - optional(9, "feature2", Types.BooleanType.get()), - optional(24, "feature1", Types.BooleanType.get()) - ), "struct of named boolean options"), - required(4, "locations", Types.MapType.ofRequired(10, 11, - Types.StructType.of( - required(20, "address", Types.StringType.get()), - required(21, "city", Types.StringType.get()), - required(22, "state", Types.StringType.get()), - required(23, "zip", Types.IntegerType.get()) - ), - Types.StructType.of( - required(12, "lat", Types.FloatType.get()), - required(13, "long", 
Types.FloatType.get()) - )), "map of address to coordinate"), - optional(5, "points", Types.ListType.ofOptional(14, - Types.StructType.of( - required(15, "x", Types.LongType.get()), - required(16, "y", Types.LongType.get()) - )), "2-D cartesian points"), - required(6, "doubles", Types.ListType.ofRequired(17, - Types.DoubleType.get() - )), - optional(7, "properties", Types.MapType.ofOptional(18, 19, - Types.StringType.get(), - Types.StringType.get() - ), "string map of properties") - ); - - Schema updatedNested = new SchemaUpdate(SCHEMA, SCHEMA_LAST_COLUMN_ID) - .deleteColumn("preferences.feature1") - .addColumn("preferences", "feature1", Types.BooleanType.get()) - .apply(); - - Assert.assertEquals("Should match with added fields", expectedNested.asStruct(), updatedNested.asStruct()); + Schema expectedNested = + new Schema( + required(1, "id", Types.IntegerType.get()), + optional(2, "data", Types.StringType.get()), + optional( + 3, + "preferences", + Types.StructType.of( + optional(9, "feature2", Types.BooleanType.get()), + optional(24, "feature1", Types.BooleanType.get())), + "struct of named boolean options"), + required( + 4, + "locations", + Types.MapType.ofRequired( + 10, + 11, + Types.StructType.of( + required(20, "address", Types.StringType.get()), + required(21, "city", Types.StringType.get()), + required(22, "state", Types.StringType.get()), + required(23, "zip", Types.IntegerType.get())), + Types.StructType.of( + required(12, "lat", Types.FloatType.get()), + required(13, "long", Types.FloatType.get()))), + "map of address to coordinate"), + optional( + 5, + "points", + Types.ListType.ofOptional( + 14, + Types.StructType.of( + required(15, "x", Types.LongType.get()), + required(16, "y", Types.LongType.get()))), + "2-D cartesian points"), + required(6, "doubles", Types.ListType.ofRequired(17, Types.DoubleType.get())), + optional( + 7, + "properties", + Types.MapType.ofOptional(18, 19, Types.StringType.get(), Types.StringType.get()), + "string map of properties")); + + Schema updatedNested = + new SchemaUpdate(SCHEMA, SCHEMA_LAST_COLUMN_ID) + .deleteColumn("preferences.feature1") + .addColumn("preferences", "feature1", Types.BooleanType.get()) + .apply(); + + Assert.assertEquals( + "Should match with added fields", expectedNested.asStruct(), updatedNested.asStruct()); } @Test public void testDeleteMissingColumn() { - AssertHelpers.assertThrows("Should reject delete missing column", - IllegalArgumentException.class, "missing column: col", () -> { + AssertHelpers.assertThrows( + "Should reject delete missing column", + IllegalArgumentException.class, + "missing column: col", + () -> { UpdateSchema update = new SchemaUpdate(SCHEMA, SCHEMA_LAST_COLUMN_ID); update.deleteColumn("col"); - } - ); + }); } @Test public void testAddDeleteConflict() { - AssertHelpers.assertThrows("Should reject add then delete", - IllegalArgumentException.class, "missing column: col", () -> { + AssertHelpers.assertThrows( + "Should reject add then delete", + IllegalArgumentException.class, + "missing column: col", + () -> { UpdateSchema update = new SchemaUpdate(SCHEMA, SCHEMA_LAST_COLUMN_ID); update.addColumn("col", Types.IntegerType.get()).deleteColumn("col"); - } - ); - AssertHelpers.assertThrows("Should reject add then delete", - IllegalArgumentException.class, "column that has additions: preferences", () -> { + }); + AssertHelpers.assertThrows( + "Should reject add then delete", + IllegalArgumentException.class, + "column that has additions: preferences", + () -> { UpdateSchema update = new 
SchemaUpdate(SCHEMA, SCHEMA_LAST_COLUMN_ID); - update.addColumn("preferences", "feature3", Types.IntegerType.get()) + update + .addColumn("preferences", "feature3", Types.IntegerType.get()) .deleteColumn("preferences"); - } - ); + }); } @Test public void testRenameMissingColumn() { - AssertHelpers.assertThrows("Should reject rename missing column", - IllegalArgumentException.class, "missing column: col", () -> { + AssertHelpers.assertThrows( + "Should reject rename missing column", + IllegalArgumentException.class, + "missing column: col", + () -> { UpdateSchema update = new SchemaUpdate(SCHEMA, SCHEMA_LAST_COLUMN_ID); update.renameColumn("col", "fail"); - } - ); + }); } @Test public void testRenameDeleteConflict() { - AssertHelpers.assertThrows("Should reject rename then delete", - IllegalArgumentException.class, "column that has updates: id", () -> { + AssertHelpers.assertThrows( + "Should reject rename then delete", + IllegalArgumentException.class, + "column that has updates: id", + () -> { UpdateSchema update = new SchemaUpdate(SCHEMA, SCHEMA_LAST_COLUMN_ID); update.renameColumn("id", "col").deleteColumn("id"); - } - ); - AssertHelpers.assertThrows("Should reject rename then delete", - IllegalArgumentException.class, "missing column: col", () -> { + }); + AssertHelpers.assertThrows( + "Should reject rename then delete", + IllegalArgumentException.class, + "missing column: col", + () -> { UpdateSchema update = new SchemaUpdate(SCHEMA, SCHEMA_LAST_COLUMN_ID); update.renameColumn("id", "col").deleteColumn("col"); - } - ); + }); } @Test public void testDeleteRenameConflict() { - AssertHelpers.assertThrows("Should reject delete then rename", - IllegalArgumentException.class, "column that will be deleted: id", () -> { + AssertHelpers.assertThrows( + "Should reject delete then rename", + IllegalArgumentException.class, + "column that will be deleted: id", + () -> { UpdateSchema update = new SchemaUpdate(SCHEMA, SCHEMA_LAST_COLUMN_ID); update.deleteColumn("id").renameColumn("id", "identifier"); - } - ); + }); } @Test public void testUpdateMissingColumn() { - AssertHelpers.assertThrows("Should reject rename missing column", - IllegalArgumentException.class, "missing column: col", () -> { + AssertHelpers.assertThrows( + "Should reject rename missing column", + IllegalArgumentException.class, + "missing column: col", + () -> { UpdateSchema update = new SchemaUpdate(SCHEMA, SCHEMA_LAST_COLUMN_ID); update.updateColumn("col", Types.DateType.get()); - } - ); + }); } @Test public void testUpdateDeleteConflict() { - AssertHelpers.assertThrows("Should reject update then delete", - IllegalArgumentException.class, "column that has updates: id", () -> { + AssertHelpers.assertThrows( + "Should reject update then delete", + IllegalArgumentException.class, + "column that has updates: id", + () -> { UpdateSchema update = new SchemaUpdate(SCHEMA, SCHEMA_LAST_COLUMN_ID); update.updateColumn("id", Types.LongType.get()).deleteColumn("id"); - } - ); + }); } @Test public void testDeleteUpdateConflict() { - AssertHelpers.assertThrows("Should reject delete then update", - IllegalArgumentException.class, "column that will be deleted: id", () -> { + AssertHelpers.assertThrows( + "Should reject delete then update", + IllegalArgumentException.class, + "column that will be deleted: id", + () -> { UpdateSchema update = new SchemaUpdate(SCHEMA, SCHEMA_LAST_COLUMN_ID); update.deleteColumn("id").updateColumn("id", Types.LongType.get()); - } - ); + }); } @Test public void testDeleteMapKey() { - 
AssertHelpers.assertThrows("Should reject delete map key", - IllegalArgumentException.class, "Cannot delete map keys", () -> { + AssertHelpers.assertThrows( + "Should reject delete map key", + IllegalArgumentException.class, + "Cannot delete map keys", + () -> { new SchemaUpdate(SCHEMA, SCHEMA_LAST_COLUMN_ID).deleteColumn("locations.key").apply(); - } - ); + }); } @Test public void testAddFieldToMapKey() { - AssertHelpers.assertThrows("Should reject add sub-field to map key", - IllegalArgumentException.class, "Cannot add fields to map keys", () -> { + AssertHelpers.assertThrows( + "Should reject add sub-field to map key", + IllegalArgumentException.class, + "Cannot add fields to map keys", + () -> { new SchemaUpdate(SCHEMA, SCHEMA_LAST_COLUMN_ID) - .addColumn("locations.key", "address_line_2", Types.StringType.get()).apply(); - } - ); + .addColumn("locations.key", "address_line_2", Types.StringType.get()) + .apply(); + }); } @Test public void testAlterMapKey() { - AssertHelpers.assertThrows("Should reject alter sub-field of map key", - IllegalArgumentException.class, "Cannot alter map keys", () -> { + AssertHelpers.assertThrows( + "Should reject alter sub-field of map key", + IllegalArgumentException.class, + "Cannot alter map keys", + () -> { new SchemaUpdate(SCHEMA, SCHEMA_LAST_COLUMN_ID) - .updateColumn("locations.key.zip", Types.LongType.get()).apply(); - } - ); + .updateColumn("locations.key.zip", Types.LongType.get()) + .apply(); + }); } @Test public void testUpdateMapKey() { - Schema schema = new Schema(required(1, "m", Types.MapType.ofOptional(2, 3, - Types.IntegerType.get(), Types.DoubleType.get()))); - AssertHelpers.assertThrows("Should reject update map key", - IllegalArgumentException.class, "Cannot update map keys", () -> { + Schema schema = + new Schema( + required( + 1, + "m", + Types.MapType.ofOptional(2, 3, Types.IntegerType.get(), Types.DoubleType.get()))); + AssertHelpers.assertThrows( + "Should reject update map key", + IllegalArgumentException.class, + "Cannot update map keys", + () -> { new SchemaUpdate(schema, 3).updateColumn("m.key", Types.LongType.get()).apply(); - } - ); + }); } @Test public void testUpdateAddedColumnDoc() { Schema schema = new Schema(required(1, "i", Types.IntegerType.get())); - AssertHelpers.assertThrows("Should reject add and update doc", - IllegalArgumentException.class, "Cannot update missing column", () -> { + AssertHelpers.assertThrows( + "Should reject add and update doc", + IllegalArgumentException.class, + "Cannot update missing column", + () -> { new SchemaUpdate(schema, 3) .addColumn("value", Types.LongType.get()) .updateColumnDoc("value", "a value") .apply(); - } - ); + }); } @Test public void testUpdateDeletedColumnDoc() { Schema schema = new Schema(required(1, "i", Types.IntegerType.get())); - AssertHelpers.assertThrows("Should reject add and update doc", - IllegalArgumentException.class, "Cannot update a column that will be deleted", () -> { - new SchemaUpdate(schema, 3) - .deleteColumn("i") - .updateColumnDoc("i", "a value") - .apply(); - } - ); + AssertHelpers.assertThrows( + "Should reject add and update doc", + IllegalArgumentException.class, + "Cannot update a column that will be deleted", + () -> { + new SchemaUpdate(schema, 3).deleteColumn("i").updateColumnDoc("i", "a value").apply(); + }); } @Test public void testMultipleMoves() { - Schema schema = new Schema( - required(1, "a", Types.IntegerType.get()), - required(2, "b", Types.IntegerType.get()), - required(3, "c", Types.IntegerType.get()), - required(4, "d", 
Types.IntegerType.get())); - - Schema expected = new Schema( - required(3, "c", Types.IntegerType.get()), - required(2, "b", Types.IntegerType.get()), - required(4, "d", Types.IntegerType.get()), - required(1, "a", Types.IntegerType.get())); + Schema schema = + new Schema( + required(1, "a", Types.IntegerType.get()), + required(2, "b", Types.IntegerType.get()), + required(3, "c", Types.IntegerType.get()), + required(4, "d", Types.IntegerType.get())); + + Schema expected = + new Schema( + required(3, "c", Types.IntegerType.get()), + required(2, "b", Types.IntegerType.get()), + required(4, "d", Types.IntegerType.get()), + required(1, "a", Types.IntegerType.get())); // moves are applied in order - Schema actual = new SchemaUpdate(schema, 4) - .moveFirst("d") - .moveFirst("c") - .moveAfter("b", "d") - .moveBefore("d", "a") - .apply(); + Schema actual = + new SchemaUpdate(schema, 4) + .moveFirst("d") + .moveFirst("c") + .moveAfter("b", "d") + .moveBefore("d", "a") + .apply(); Assert.assertEquals("Schema should match", expected.asStruct(), actual.asStruct()); } @Test public void testMoveTopLevelColumnFirst() { - Schema schema = new Schema( - required(1, "id", Types.LongType.get()), - required(2, "data", Types.StringType.get())); - Schema expected = new Schema( - required(2, "data", Types.StringType.get()), - required(1, "id", Types.LongType.get())); + Schema schema = + new Schema( + required(1, "id", Types.LongType.get()), required(2, "data", Types.StringType.get())); + Schema expected = + new Schema( + required(2, "data", Types.StringType.get()), required(1, "id", Types.LongType.get())); - Schema actual = new SchemaUpdate(schema, 2) - .moveFirst("data") - .apply(); + Schema actual = new SchemaUpdate(schema, 2).moveFirst("data").apply(); Assert.assertEquals("Should move data first", expected.asStruct(), actual.asStruct()); } @Test public void testMoveTopLevelColumnBeforeFirst() { - Schema schema = new Schema( - required(1, "id", Types.LongType.get()), - required(2, "data", Types.StringType.get())); - Schema expected = new Schema( - required(2, "data", Types.StringType.get()), - required(1, "id", Types.LongType.get())); + Schema schema = + new Schema( + required(1, "id", Types.LongType.get()), required(2, "data", Types.StringType.get())); + Schema expected = + new Schema( + required(2, "data", Types.StringType.get()), required(1, "id", Types.LongType.get())); - Schema actual = new SchemaUpdate(schema, 2) - .moveBefore("data", "id") - .apply(); + Schema actual = new SchemaUpdate(schema, 2).moveBefore("data", "id").apply(); Assert.assertEquals("Should move data first", expected.asStruct(), actual.asStruct()); } @Test public void testMoveTopLevelColumnAfterLast() { - Schema schema = new Schema( - required(1, "id", Types.LongType.get()), - required(2, "data", Types.StringType.get())); - Schema expected = new Schema( - required(2, "data", Types.StringType.get()), - required(1, "id", Types.LongType.get())); + Schema schema = + new Schema( + required(1, "id", Types.LongType.get()), required(2, "data", Types.StringType.get())); + Schema expected = + new Schema( + required(2, "data", Types.StringType.get()), required(1, "id", Types.LongType.get())); - Schema actual = new SchemaUpdate(schema, 2) - .moveAfter("id", "data") - .apply(); + Schema actual = new SchemaUpdate(schema, 2).moveAfter("id", "data").apply(); Assert.assertEquals("Should move data first", expected.asStruct(), actual.asStruct()); } @Test public void testMoveTopLevelColumnAfter() { - Schema schema = new Schema( - required(1, "id", 
Types.LongType.get()), - required(2, "data", Types.StringType.get()), - optional(3, "ts", Types.TimestampType.withZone())); - Schema expected = new Schema( - required(1, "id", Types.LongType.get()), - optional(3, "ts", Types.TimestampType.withZone()), - required(2, "data", Types.StringType.get())); - - Schema actual = new SchemaUpdate(schema, 3) - .moveAfter("ts", "id") - .apply(); + Schema schema = + new Schema( + required(1, "id", Types.LongType.get()), + required(2, "data", Types.StringType.get()), + optional(3, "ts", Types.TimestampType.withZone())); + Schema expected = + new Schema( + required(1, "id", Types.LongType.get()), + optional(3, "ts", Types.TimestampType.withZone()), + required(2, "data", Types.StringType.get())); + + Schema actual = new SchemaUpdate(schema, 3).moveAfter("ts", "id").apply(); Assert.assertEquals("Should move data first", expected.asStruct(), actual.asStruct()); } @Test public void testMoveTopLevelColumnBefore() { - Schema schema = new Schema( - optional(3, "ts", Types.TimestampType.withZone()), - required(1, "id", Types.LongType.get()), - required(2, "data", Types.StringType.get())); - Schema expected = new Schema( - required(1, "id", Types.LongType.get()), - optional(3, "ts", Types.TimestampType.withZone()), - required(2, "data", Types.StringType.get())); - - Schema actual = new SchemaUpdate(schema, 3) - .moveBefore("ts", "data") - .apply(); + Schema schema = + new Schema( + optional(3, "ts", Types.TimestampType.withZone()), + required(1, "id", Types.LongType.get()), + required(2, "data", Types.StringType.get())); + Schema expected = + new Schema( + required(1, "id", Types.LongType.get()), + optional(3, "ts", Types.TimestampType.withZone()), + required(2, "data", Types.StringType.get())); + + Schema actual = new SchemaUpdate(schema, 3).moveBefore("ts", "data").apply(); Assert.assertEquals("Should move data first", expected.asStruct(), actual.asStruct()); } @Test public void testMoveNestedFieldFirst() { - Schema schema = new Schema( - required(1, "id", Types.LongType.get()), - required(2, "struct", Types.StructType.of( - required(3, "count", Types.LongType.get()), - required(4, "data", Types.StringType.get())))); - Schema expected = new Schema( - required(1, "id", Types.LongType.get()), - required(2, "struct", Types.StructType.of( - required(4, "data", Types.StringType.get()), - required(3, "count", Types.LongType.get())))); - - Schema actual = new SchemaUpdate(schema, 4) - .moveFirst("struct.data") - .apply(); + Schema schema = + new Schema( + required(1, "id", Types.LongType.get()), + required( + 2, + "struct", + Types.StructType.of( + required(3, "count", Types.LongType.get()), + required(4, "data", Types.StringType.get())))); + Schema expected = + new Schema( + required(1, "id", Types.LongType.get()), + required( + 2, + "struct", + Types.StructType.of( + required(4, "data", Types.StringType.get()), + required(3, "count", Types.LongType.get())))); + + Schema actual = new SchemaUpdate(schema, 4).moveFirst("struct.data").apply(); Assert.assertEquals("Should move data first", expected.asStruct(), actual.asStruct()); } @Test public void testMoveNestedFieldBeforeFirst() { - Schema schema = new Schema( - required(1, "id", Types.LongType.get()), - required(2, "struct", Types.StructType.of( - required(3, "count", Types.LongType.get()), - required(4, "data", Types.StringType.get())))); - Schema expected = new Schema( - required(1, "id", Types.LongType.get()), - required(2, "struct", Types.StructType.of( - required(4, "data", Types.StringType.get()), - required(3, 
"count", Types.LongType.get())))); - - Schema actual = new SchemaUpdate(schema, 4) - .moveBefore("struct.data", "struct.count") - .apply(); + Schema schema = + new Schema( + required(1, "id", Types.LongType.get()), + required( + 2, + "struct", + Types.StructType.of( + required(3, "count", Types.LongType.get()), + required(4, "data", Types.StringType.get())))); + Schema expected = + new Schema( + required(1, "id", Types.LongType.get()), + required( + 2, + "struct", + Types.StructType.of( + required(4, "data", Types.StringType.get()), + required(3, "count", Types.LongType.get())))); + + Schema actual = new SchemaUpdate(schema, 4).moveBefore("struct.data", "struct.count").apply(); Assert.assertEquals("Should move data first", expected.asStruct(), actual.asStruct()); } @Test public void testMoveNestedFieldAfterLast() { - Schema schema = new Schema( - required(1, "id", Types.LongType.get()), - required(2, "struct", Types.StructType.of( - required(3, "count", Types.LongType.get()), - required(4, "data", Types.StringType.get())))); - Schema expected = new Schema( - required(1, "id", Types.LongType.get()), - required(2, "struct", Types.StructType.of( - required(4, "data", Types.StringType.get()), - required(3, "count", Types.LongType.get())))); - - Schema actual = new SchemaUpdate(schema, 4) - .moveAfter("struct.count", "struct.data") - .apply(); + Schema schema = + new Schema( + required(1, "id", Types.LongType.get()), + required( + 2, + "struct", + Types.StructType.of( + required(3, "count", Types.LongType.get()), + required(4, "data", Types.StringType.get())))); + Schema expected = + new Schema( + required(1, "id", Types.LongType.get()), + required( + 2, + "struct", + Types.StructType.of( + required(4, "data", Types.StringType.get()), + required(3, "count", Types.LongType.get())))); + + Schema actual = new SchemaUpdate(schema, 4).moveAfter("struct.count", "struct.data").apply(); Assert.assertEquals("Should move data first", expected.asStruct(), actual.asStruct()); } @Test public void testMoveNestedFieldAfter() { - Schema schema = new Schema( - required(1, "id", Types.LongType.get()), - required(2, "struct", Types.StructType.of( - required(3, "count", Types.LongType.get()), - required(4, "data", Types.StringType.get()), - optional(5, "ts", Types.TimestampType.withZone())))); - Schema expected = new Schema( - required(1, "id", Types.LongType.get()), - required(2, "struct", Types.StructType.of( - required(3, "count", Types.LongType.get()), - optional(5, "ts", Types.TimestampType.withZone()), - required(4, "data", Types.StringType.get())))); - - Schema actual = new SchemaUpdate(schema, 5) - .moveAfter("struct.ts", "struct.count") - .apply(); + Schema schema = + new Schema( + required(1, "id", Types.LongType.get()), + required( + 2, + "struct", + Types.StructType.of( + required(3, "count", Types.LongType.get()), + required(4, "data", Types.StringType.get()), + optional(5, "ts", Types.TimestampType.withZone())))); + Schema expected = + new Schema( + required(1, "id", Types.LongType.get()), + required( + 2, + "struct", + Types.StructType.of( + required(3, "count", Types.LongType.get()), + optional(5, "ts", Types.TimestampType.withZone()), + required(4, "data", Types.StringType.get())))); + + Schema actual = new SchemaUpdate(schema, 5).moveAfter("struct.ts", "struct.count").apply(); Assert.assertEquals("Should move data first", expected.asStruct(), actual.asStruct()); } @Test public void testMoveNestedFieldBefore() { - Schema schema = new Schema( - required(1, "id", Types.LongType.get()), - required(2, 
"struct", Types.StructType.of( - optional(5, "ts", Types.TimestampType.withZone()), - required(3, "count", Types.LongType.get()), - required(4, "data", Types.StringType.get())))); - Schema expected = new Schema( - required(1, "id", Types.LongType.get()), - required(2, "struct", Types.StructType.of( - required(3, "count", Types.LongType.get()), - optional(5, "ts", Types.TimestampType.withZone()), - required(4, "data", Types.StringType.get())))); - - Schema actual = new SchemaUpdate(schema, 5) - .moveBefore("struct.ts", "struct.data") - .apply(); + Schema schema = + new Schema( + required(1, "id", Types.LongType.get()), + required( + 2, + "struct", + Types.StructType.of( + optional(5, "ts", Types.TimestampType.withZone()), + required(3, "count", Types.LongType.get()), + required(4, "data", Types.StringType.get())))); + Schema expected = + new Schema( + required(1, "id", Types.LongType.get()), + required( + 2, + "struct", + Types.StructType.of( + required(3, "count", Types.LongType.get()), + optional(5, "ts", Types.TimestampType.withZone()), + required(4, "data", Types.StringType.get())))); + + Schema actual = new SchemaUpdate(schema, 5).moveBefore("struct.ts", "struct.data").apply(); Assert.assertEquals("Should move data first", expected.asStruct(), actual.asStruct()); } @Test public void testMoveListElementField() { - Schema schema = new Schema( - required(1, "id", Types.LongType.get()), - required(2, "list", Types.ListType.ofOptional(6, Types.StructType.of( - optional(5, "ts", Types.TimestampType.withZone()), - required(3, "count", Types.LongType.get()), - required(4, "data", Types.StringType.get()))))); - Schema expected = new Schema( - required(1, "id", Types.LongType.get()), - required(2, "list", Types.ListType.ofOptional(6, Types.StructType.of( - required(3, "count", Types.LongType.get()), - optional(5, "ts", Types.TimestampType.withZone()), - required(4, "data", Types.StringType.get()))))); - - Schema actual = new SchemaUpdate(schema, 6) - .moveBefore("list.ts", "list.data") - .apply(); + Schema schema = + new Schema( + required(1, "id", Types.LongType.get()), + required( + 2, + "list", + Types.ListType.ofOptional( + 6, + Types.StructType.of( + optional(5, "ts", Types.TimestampType.withZone()), + required(3, "count", Types.LongType.get()), + required(4, "data", Types.StringType.get()))))); + Schema expected = + new Schema( + required(1, "id", Types.LongType.get()), + required( + 2, + "list", + Types.ListType.ofOptional( + 6, + Types.StructType.of( + required(3, "count", Types.LongType.get()), + optional(5, "ts", Types.TimestampType.withZone()), + required(4, "data", Types.StringType.get()))))); + + Schema actual = new SchemaUpdate(schema, 6).moveBefore("list.ts", "list.data").apply(); Assert.assertEquals("Should move data first", expected.asStruct(), actual.asStruct()); } @Test public void testMoveMapValueStructField() { - Schema schema = new Schema( - required(1, "id", Types.LongType.get()), - required(2, "map", Types.MapType.ofOptional(6, 7, Types.StringType.get(), Types.StructType.of( - optional(5, "ts", Types.TimestampType.withZone()), - required(3, "count", Types.LongType.get()), - required(4, "data", Types.StringType.get()))))); - Schema expected = new Schema( - required(1, "id", Types.LongType.get()), - required(2, "map", Types.MapType.ofOptional(6, 7, Types.StringType.get(), Types.StructType.of( - required(3, "count", Types.LongType.get()), - optional(5, "ts", Types.TimestampType.withZone()), - required(4, "data", Types.StringType.get()))))); - - Schema actual = new 
SchemaUpdate(schema, 7) - .moveBefore("map.ts", "map.data") - .apply(); + Schema schema = + new Schema( + required(1, "id", Types.LongType.get()), + required( + 2, + "map", + Types.MapType.ofOptional( + 6, + 7, + Types.StringType.get(), + Types.StructType.of( + optional(5, "ts", Types.TimestampType.withZone()), + required(3, "count", Types.LongType.get()), + required(4, "data", Types.StringType.get()))))); + Schema expected = + new Schema( + required(1, "id", Types.LongType.get()), + required( + 2, + "map", + Types.MapType.ofOptional( + 6, + 7, + Types.StringType.get(), + Types.StructType.of( + required(3, "count", Types.LongType.get()), + optional(5, "ts", Types.TimestampType.withZone()), + required(4, "data", Types.StringType.get()))))); + + Schema actual = new SchemaUpdate(schema, 7).moveBefore("map.ts", "map.data").apply(); Assert.assertEquals("Should move data first", expected.asStruct(), actual.asStruct()); } @Test public void testMoveAddedTopLevelColumn() { - Schema schema = new Schema( - required(1, "id", Types.LongType.get()), - required(2, "data", Types.StringType.get())); - Schema expected = new Schema( - required(1, "id", Types.LongType.get()), - optional(3, "ts", Types.TimestampType.withZone()), - required(2, "data", Types.StringType.get())); - - Schema actual = new SchemaUpdate(schema, 2) - .addColumn("ts", Types.TimestampType.withZone()) - .moveAfter("ts", "id") - .apply(); + Schema schema = + new Schema( + required(1, "id", Types.LongType.get()), required(2, "data", Types.StringType.get())); + Schema expected = + new Schema( + required(1, "id", Types.LongType.get()), + optional(3, "ts", Types.TimestampType.withZone()), + required(2, "data", Types.StringType.get())); + + Schema actual = + new SchemaUpdate(schema, 2) + .addColumn("ts", Types.TimestampType.withZone()) + .moveAfter("ts", "id") + .apply(); Assert.assertEquals("Should move data first", expected.asStruct(), actual.asStruct()); } @Test public void testMoveAddedTopLevelColumnAfterAddedColumn() { - Schema schema = new Schema( - required(1, "id", Types.LongType.get()), - required(2, "data", Types.StringType.get())); - Schema expected = new Schema( - required(1, "id", Types.LongType.get()), - optional(3, "ts", Types.TimestampType.withZone()), - optional(4, "count", Types.LongType.get()), - required(2, "data", Types.StringType.get())); - - Schema actual = new SchemaUpdate(schema, 2) - .addColumn("ts", Types.TimestampType.withZone()) - .addColumn("count", Types.LongType.get()) - .moveAfter("ts", "id") - .moveAfter("count", "ts") - .apply(); + Schema schema = + new Schema( + required(1, "id", Types.LongType.get()), required(2, "data", Types.StringType.get())); + Schema expected = + new Schema( + required(1, "id", Types.LongType.get()), + optional(3, "ts", Types.TimestampType.withZone()), + optional(4, "count", Types.LongType.get()), + required(2, "data", Types.StringType.get())); + + Schema actual = + new SchemaUpdate(schema, 2) + .addColumn("ts", Types.TimestampType.withZone()) + .addColumn("count", Types.LongType.get()) + .moveAfter("ts", "id") + .moveAfter("count", "ts") + .apply(); Assert.assertEquals("Should move data first", expected.asStruct(), actual.asStruct()); } @Test public void testMoveAddedNestedStructField() { - Schema schema = new Schema( - required(1, "id", Types.LongType.get()), - required(2, "struct", Types.StructType.of( - required(3, "count", Types.LongType.get()), - required(4, "data", Types.StringType.get())))); - Schema expected = new Schema( - required(1, "id", Types.LongType.get()), - required(2, 
"struct", Types.StructType.of( - optional(5, "ts", Types.TimestampType.withZone()), - required(3, "count", Types.LongType.get()), - required(4, "data", Types.StringType.get())))); - - Schema actual = new SchemaUpdate(schema, 4) - .addColumn("struct", "ts", Types.TimestampType.withZone()) - .moveBefore("struct.ts", "struct.count") - .apply(); + Schema schema = + new Schema( + required(1, "id", Types.LongType.get()), + required( + 2, + "struct", + Types.StructType.of( + required(3, "count", Types.LongType.get()), + required(4, "data", Types.StringType.get())))); + Schema expected = + new Schema( + required(1, "id", Types.LongType.get()), + required( + 2, + "struct", + Types.StructType.of( + optional(5, "ts", Types.TimestampType.withZone()), + required(3, "count", Types.LongType.get()), + required(4, "data", Types.StringType.get())))); + + Schema actual = + new SchemaUpdate(schema, 4) + .addColumn("struct", "ts", Types.TimestampType.withZone()) + .moveBefore("struct.ts", "struct.count") + .apply(); Assert.assertEquals("Should move data first", expected.asStruct(), actual.asStruct()); } @Test public void testMoveAddedNestedStructFieldBeforeAddedColumn() { - Schema schema = new Schema( - required(1, "id", Types.LongType.get()), - required(2, "struct", Types.StructType.of( - required(3, "count", Types.LongType.get()), - required(4, "data", Types.StringType.get())))); - Schema expected = new Schema( - required(1, "id", Types.LongType.get()), - required(2, "struct", Types.StructType.of( - optional(6, "size", Types.LongType.get()), - optional(5, "ts", Types.TimestampType.withZone()), - required(3, "count", Types.LongType.get()), - required(4, "data", Types.StringType.get())))); - - Schema actual = new SchemaUpdate(schema, 4) - .addColumn("struct", "ts", Types.TimestampType.withZone()) - .addColumn("struct", "size", Types.LongType.get()) - .moveBefore("struct.ts", "struct.count") - .moveBefore("struct.size", "struct.ts") - .apply(); + Schema schema = + new Schema( + required(1, "id", Types.LongType.get()), + required( + 2, + "struct", + Types.StructType.of( + required(3, "count", Types.LongType.get()), + required(4, "data", Types.StringType.get())))); + Schema expected = + new Schema( + required(1, "id", Types.LongType.get()), + required( + 2, + "struct", + Types.StructType.of( + optional(6, "size", Types.LongType.get()), + optional(5, "ts", Types.TimestampType.withZone()), + required(3, "count", Types.LongType.get()), + required(4, "data", Types.StringType.get())))); + + Schema actual = + new SchemaUpdate(schema, 4) + .addColumn("struct", "ts", Types.TimestampType.withZone()) + .addColumn("struct", "size", Types.LongType.get()) + .moveBefore("struct.ts", "struct.count") + .moveBefore("struct.size", "struct.ts") + .apply(); Assert.assertEquals("Should move data first", expected.asStruct(), actual.asStruct()); } @Test public void testMoveSelfReferenceFails() { - Schema schema = new Schema( - required(1, "id", Types.LongType.get()), - required(2, "data", Types.StringType.get())); + Schema schema = + new Schema( + required(1, "id", Types.LongType.get()), required(2, "data", Types.StringType.get())); - AssertHelpers.assertThrows("Should fail move for a field that is not in the schema", - IllegalArgumentException.class, "Cannot move id before itself", () -> - new SchemaUpdate(schema, 2) - .moveBefore("id", "id") - .apply()); + AssertHelpers.assertThrows( + "Should fail move for a field that is not in the schema", + IllegalArgumentException.class, + "Cannot move id before itself", + () -> new 
SchemaUpdate(schema, 2).moveBefore("id", "id").apply()); - AssertHelpers.assertThrows("Should fail move for a field that is not in the schema", - IllegalArgumentException.class, "Cannot move id after itself", () -> - new SchemaUpdate(schema, 2) - .moveAfter("id", "id") - .apply()); + AssertHelpers.assertThrows( + "Should fail move for a field that is not in the schema", + IllegalArgumentException.class, + "Cannot move id after itself", + () -> new SchemaUpdate(schema, 2).moveAfter("id", "id").apply()); } @Test public void testMoveMissingColumnFails() { - Schema schema = new Schema( - required(1, "id", Types.LongType.get()), - required(2, "data", Types.StringType.get())); + Schema schema = + new Schema( + required(1, "id", Types.LongType.get()), required(2, "data", Types.StringType.get())); - AssertHelpers.assertThrows("Should fail move for a field that is not in the schema", - IllegalArgumentException.class, "Cannot move missing column", () -> - new SchemaUpdate(schema, 2) - .moveFirst("items") - .apply()); + AssertHelpers.assertThrows( + "Should fail move for a field that is not in the schema", + IllegalArgumentException.class, + "Cannot move missing column", + () -> new SchemaUpdate(schema, 2).moveFirst("items").apply()); - AssertHelpers.assertThrows("Should fail move for a field that is not in the schema", - IllegalArgumentException.class, "Cannot move missing column", () -> - new SchemaUpdate(schema, 2) - .moveBefore("items", "id") - .apply()); + AssertHelpers.assertThrows( + "Should fail move for a field that is not in the schema", + IllegalArgumentException.class, + "Cannot move missing column", + () -> new SchemaUpdate(schema, 2).moveBefore("items", "id").apply()); - AssertHelpers.assertThrows("Should fail move for a field that is not in the schema", - IllegalArgumentException.class, "Cannot move missing column", () -> - new SchemaUpdate(schema, 2) - .moveAfter("items", "data") - .apply()); + AssertHelpers.assertThrows( + "Should fail move for a field that is not in the schema", + IllegalArgumentException.class, + "Cannot move missing column", + () -> new SchemaUpdate(schema, 2).moveAfter("items", "data").apply()); } @Test public void testMoveBeforeAddFails() { - Schema schema = new Schema( - required(1, "id", Types.LongType.get()), - required(2, "data", Types.StringType.get())); + Schema schema = + new Schema( + required(1, "id", Types.LongType.get()), required(2, "data", Types.StringType.get())); - AssertHelpers.assertThrows("Should fail move for a field that has not been added yet", - IllegalArgumentException.class, "Cannot move missing column", () -> + AssertHelpers.assertThrows( + "Should fail move for a field that has not been added yet", + IllegalArgumentException.class, + "Cannot move missing column", + () -> new SchemaUpdate(schema, 2) .moveFirst("ts") .addColumn("ts", Types.TimestampType.withZone()) .apply()); - AssertHelpers.assertThrows("Should fail move for a field that has not been added yet", - IllegalArgumentException.class, "Cannot move missing column", () -> + AssertHelpers.assertThrows( + "Should fail move for a field that has not been added yet", + IllegalArgumentException.class, + "Cannot move missing column", + () -> new SchemaUpdate(schema, 2) .moveBefore("ts", "id") .addColumn("ts", Types.TimestampType.withZone()) .apply()); - AssertHelpers.assertThrows("Should fail move for a field that has not been added yet", - IllegalArgumentException.class, "Cannot move missing column", () -> + AssertHelpers.assertThrows( + "Should fail move for a field that has not been 
added yet", + IllegalArgumentException.class, + "Cannot move missing column", + () -> new SchemaUpdate(schema, 2) .moveAfter("ts", "data") .addColumn("ts", Types.TimestampType.withZone()) @@ -1132,209 +1352,262 @@ public void testMoveBeforeAddFails() { @Test public void testMoveMissingReferenceColumnFails() { - Schema schema = new Schema( - required(1, "id", Types.LongType.get()), - required(2, "data", Types.StringType.get())); + Schema schema = + new Schema( + required(1, "id", Types.LongType.get()), required(2, "data", Types.StringType.get())); - AssertHelpers.assertThrows("Should fail move before a field that is not in the schema", - IllegalArgumentException.class, "Cannot move id before missing column", () -> - new SchemaUpdate(schema, 2) - .moveBefore("id", "items") - .apply()); + AssertHelpers.assertThrows( + "Should fail move before a field that is not in the schema", + IllegalArgumentException.class, + "Cannot move id before missing column", + () -> new SchemaUpdate(schema, 2).moveBefore("id", "items").apply()); - AssertHelpers.assertThrows("Should fail move after for a field that is not in the schema", - IllegalArgumentException.class, "Cannot move data after missing column", () -> - new SchemaUpdate(schema, 2) - .moveAfter("data", "items") - .apply()); + AssertHelpers.assertThrows( + "Should fail move after for a field that is not in the schema", + IllegalArgumentException.class, + "Cannot move data after missing column", + () -> new SchemaUpdate(schema, 2).moveAfter("data", "items").apply()); } @Test public void testMovePrimitiveMapKeyFails() { - Schema schema = new Schema( - required(1, "id", Types.LongType.get()), - required(2, "data", Types.StringType.get()), - optional(3, "map", Types.MapType.ofRequired(4, 5, Types.StringType.get(), Types.StringType.get()))); - - AssertHelpers.assertThrows("Should fail move for map key", - IllegalArgumentException.class, "Cannot move fields in non-struct type", () -> - new SchemaUpdate(schema, 5) - .moveBefore("map.key", "map.value") - .apply()); + Schema schema = + new Schema( + required(1, "id", Types.LongType.get()), + required(2, "data", Types.StringType.get()), + optional( + 3, + "map", + Types.MapType.ofRequired(4, 5, Types.StringType.get(), Types.StringType.get()))); + + AssertHelpers.assertThrows( + "Should fail move for map key", + IllegalArgumentException.class, + "Cannot move fields in non-struct type", + () -> new SchemaUpdate(schema, 5).moveBefore("map.key", "map.value").apply()); } @Test public void testMovePrimitiveMapValueFails() { - Schema schema = new Schema( - required(1, "id", Types.LongType.get()), - required(2, "data", Types.StringType.get()), - optional(3, "map", Types.MapType.ofRequired(4, 5, Types.StringType.get(), Types.StructType.of()))); - - AssertHelpers.assertThrows("Should fail move for map value", - IllegalArgumentException.class, "Cannot move fields in non-struct type", () -> - new SchemaUpdate(schema, 5) - .moveBefore("map.value", "map.key") - .apply()); + Schema schema = + new Schema( + required(1, "id", Types.LongType.get()), + required(2, "data", Types.StringType.get()), + optional( + 3, + "map", + Types.MapType.ofRequired(4, 5, Types.StringType.get(), Types.StructType.of()))); + + AssertHelpers.assertThrows( + "Should fail move for map value", + IllegalArgumentException.class, + "Cannot move fields in non-struct type", + () -> new SchemaUpdate(schema, 5).moveBefore("map.value", "map.key").apply()); } @Test public void testMovePrimitiveListElementFails() { - Schema schema = new Schema( - required(1, "id", 
Types.LongType.get()), - required(2, "data", Types.StringType.get()), - optional(3, "list", Types.ListType.ofRequired(4, Types.StringType.get()))); - - AssertHelpers.assertThrows("Should fail move for list element", - IllegalArgumentException.class, "Cannot move fields in non-struct type", () -> - new SchemaUpdate(schema, 4) - .moveBefore("list.element", "list") - .apply()); + Schema schema = + new Schema( + required(1, "id", Types.LongType.get()), + required(2, "data", Types.StringType.get()), + optional(3, "list", Types.ListType.ofRequired(4, Types.StringType.get()))); + + AssertHelpers.assertThrows( + "Should fail move for list element", + IllegalArgumentException.class, + "Cannot move fields in non-struct type", + () -> new SchemaUpdate(schema, 4).moveBefore("list.element", "list").apply()); } @Test public void testMoveTopLevelBetweenStructsFails() { - Schema schema = new Schema( - required(1, "a", Types.IntegerType.get()), - required(2, "b", Types.IntegerType.get()), - required(3, "struct", Types.StructType.of( - required(4, "x", Types.IntegerType.get()), - required(5, "y", Types.IntegerType.get())))); - - AssertHelpers.assertThrows("Should fail move between separate structs", - IllegalArgumentException.class, "Cannot move field a to a different struct", () -> - new SchemaUpdate(schema, 5) - .moveBefore("a", "struct.x") - .apply()); + Schema schema = + new Schema( + required(1, "a", Types.IntegerType.get()), + required(2, "b", Types.IntegerType.get()), + required( + 3, + "struct", + Types.StructType.of( + required(4, "x", Types.IntegerType.get()), + required(5, "y", Types.IntegerType.get())))); + + AssertHelpers.assertThrows( + "Should fail move between separate structs", + IllegalArgumentException.class, + "Cannot move field a to a different struct", + () -> new SchemaUpdate(schema, 5).moveBefore("a", "struct.x").apply()); } @Test public void testMoveBetweenStructsFails() { - Schema schema = new Schema( - required(1, "s1", Types.StructType.of( - required(3, "a", Types.IntegerType.get()), - required(4, "b", Types.IntegerType.get()))), - required(2, "s2", Types.StructType.of( - required(5, "x", Types.IntegerType.get()), - required(6, "y", Types.IntegerType.get())))); - - AssertHelpers.assertThrows("Should fail move between separate structs", - IllegalArgumentException.class, "Cannot move field s2.x to a different struct", () -> - new SchemaUpdate(schema, 6) - .moveBefore("s2.x", "s1.a") - .apply()); + Schema schema = + new Schema( + required( + 1, + "s1", + Types.StructType.of( + required(3, "a", Types.IntegerType.get()), + required(4, "b", Types.IntegerType.get()))), + required( + 2, + "s2", + Types.StructType.of( + required(5, "x", Types.IntegerType.get()), + required(6, "y", Types.IntegerType.get())))); + + AssertHelpers.assertThrows( + "Should fail move between separate structs", + IllegalArgumentException.class, + "Cannot move field s2.x to a different struct", + () -> new SchemaUpdate(schema, 6).moveBefore("s2.x", "s1.a").apply()); } @Test public void testAddExistingIdentifierFields() { - Schema newSchema = new SchemaUpdate(SCHEMA, SCHEMA_LAST_COLUMN_ID) - .setIdentifierFields("id") - .apply(); + Schema newSchema = + new SchemaUpdate(SCHEMA, SCHEMA_LAST_COLUMN_ID).setIdentifierFields("id").apply(); - Assert.assertEquals("add an existing field as identifier field should succeed", + Assert.assertEquals( + "add an existing field as identifier field should succeed", Sets.newHashSet(newSchema.findField("id").fieldId()), newSchema.identifierFieldIds()); } @Test public void 
testAddNewIdentifierFieldColumns() { - Schema newSchema = new SchemaUpdate(SCHEMA, SCHEMA_LAST_COLUMN_ID) - .allowIncompatibleChanges() - .addRequiredColumn("new_field", Types.StringType.get()) - .setIdentifierFields("id", "new_field") - .apply(); - - Assert.assertEquals("add column then set as identifier should succeed", - Sets.newHashSet(newSchema.findField("id").fieldId(), newSchema.findField("new_field").fieldId()), + Schema newSchema = + new SchemaUpdate(SCHEMA, SCHEMA_LAST_COLUMN_ID) + .allowIncompatibleChanges() + .addRequiredColumn("new_field", Types.StringType.get()) + .setIdentifierFields("id", "new_field") + .apply(); + + Assert.assertEquals( + "add column then set as identifier should succeed", + Sets.newHashSet( + newSchema.findField("id").fieldId(), newSchema.findField("new_field").fieldId()), newSchema.identifierFieldIds()); - newSchema = new SchemaUpdate(SCHEMA, SCHEMA_LAST_COLUMN_ID) - .allowIncompatibleChanges() - .setIdentifierFields("id", "new_field") - .addRequiredColumn("new_field", Types.StringType.get()) - .apply(); - - Assert.assertEquals("set identifier then add column should succeed", - Sets.newHashSet(newSchema.findField("id").fieldId(), newSchema.findField("new_field").fieldId()), + newSchema = + new SchemaUpdate(SCHEMA, SCHEMA_LAST_COLUMN_ID) + .allowIncompatibleChanges() + .setIdentifierFields("id", "new_field") + .addRequiredColumn("new_field", Types.StringType.get()) + .apply(); + + Assert.assertEquals( + "set identifier then add column should succeed", + Sets.newHashSet( + newSchema.findField("id").fieldId(), newSchema.findField("new_field").fieldId()), newSchema.identifierFieldIds()); } @Test public void testAddNestedIdentifierFieldColumns() { - Schema newSchema = new SchemaUpdate(SCHEMA, SCHEMA_LAST_COLUMN_ID) - .allowIncompatibleChanges() - .addRequiredColumn("required_struct", Types.StructType.of( - Types.NestedField.required(SCHEMA_LAST_COLUMN_ID + 2, "field", Types.StringType.get()) - )) - .apply(); - - newSchema = new SchemaUpdate(newSchema, SCHEMA_LAST_COLUMN_ID + 2) - .setIdentifierFields("required_struct.field") - .apply(); - - Assert.assertEquals("set existing nested field as identifier should succeed", + Schema newSchema = + new SchemaUpdate(SCHEMA, SCHEMA_LAST_COLUMN_ID) + .allowIncompatibleChanges() + .addRequiredColumn( + "required_struct", + Types.StructType.of( + Types.NestedField.required( + SCHEMA_LAST_COLUMN_ID + 2, "field", Types.StringType.get()))) + .apply(); + + newSchema = + new SchemaUpdate(newSchema, SCHEMA_LAST_COLUMN_ID + 2) + .setIdentifierFields("required_struct.field") + .apply(); + + Assert.assertEquals( + "set existing nested field as identifier should succeed", Sets.newHashSet(newSchema.findField("required_struct.field").fieldId()), newSchema.identifierFieldIds()); - newSchema = new SchemaUpdate(SCHEMA, SCHEMA_LAST_COLUMN_ID) - .allowIncompatibleChanges() - .addRequiredColumn("new", Types.StructType.of( - Types.NestedField.required(SCHEMA_LAST_COLUMN_ID + 2, "field", Types.StringType.get()) - )) - .setIdentifierFields("new.field") - .apply(); - - Assert.assertEquals("set newly added nested field as identifier should succeed", + newSchema = + new SchemaUpdate(SCHEMA, SCHEMA_LAST_COLUMN_ID) + .allowIncompatibleChanges() + .addRequiredColumn( + "new", + Types.StructType.of( + Types.NestedField.required( + SCHEMA_LAST_COLUMN_ID + 2, "field", Types.StringType.get()))) + .setIdentifierFields("new.field") + .apply(); + + Assert.assertEquals( + "set newly added nested field as identifier should succeed", 
Sets.newHashSet(newSchema.findField("new.field").fieldId()), newSchema.identifierFieldIds()); - newSchema = new SchemaUpdate(SCHEMA, SCHEMA_LAST_COLUMN_ID) - .allowIncompatibleChanges() - .addRequiredColumn("new", Types.StructType.of( - Types.NestedField.required(SCHEMA_LAST_COLUMN_ID + 2, "field", Types.StructType.of( - Types.NestedField.required(SCHEMA_LAST_COLUMN_ID + 3, "nested", Types.StringType.get()))))) - .setIdentifierFields("new.field.nested") - .apply(); - - Assert.assertEquals("set newly added multi-layer nested field as identifier should succeed", + newSchema = + new SchemaUpdate(SCHEMA, SCHEMA_LAST_COLUMN_ID) + .allowIncompatibleChanges() + .addRequiredColumn( + "new", + Types.StructType.of( + Types.NestedField.required( + SCHEMA_LAST_COLUMN_ID + 2, + "field", + Types.StructType.of( + Types.NestedField.required( + SCHEMA_LAST_COLUMN_ID + 3, "nested", Types.StringType.get()))))) + .setIdentifierFields("new.field.nested") + .apply(); + + Assert.assertEquals( + "set newly added multi-layer nested field as identifier should succeed", Sets.newHashSet(newSchema.findField("new.field.nested").fieldId()), newSchema.identifierFieldIds()); } @Test public void testAddDottedIdentifierFieldColumns() { - Schema newSchema = new SchemaUpdate(SCHEMA, SCHEMA_LAST_COLUMN_ID) - .allowIncompatibleChanges() - .addRequiredColumn(null, "dot.field", Types.StringType.get()) - .setIdentifierFields("id", "dot.field") - .apply(); - - Assert.assertEquals("add a field with dot as identifier should succeed", - Sets.newHashSet(newSchema.findField("id").fieldId(), newSchema.findField("dot.field").fieldId()), + Schema newSchema = + new SchemaUpdate(SCHEMA, SCHEMA_LAST_COLUMN_ID) + .allowIncompatibleChanges() + .addRequiredColumn(null, "dot.field", Types.StringType.get()) + .setIdentifierFields("id", "dot.field") + .apply(); + + Assert.assertEquals( + "add a field with dot as identifier should succeed", + Sets.newHashSet( + newSchema.findField("id").fieldId(), newSchema.findField("dot.field").fieldId()), newSchema.identifierFieldIds()); } @Test public void testRemoveIdentifierFields() { - Schema newSchema = new SchemaUpdate(SCHEMA, SCHEMA_LAST_COLUMN_ID) - .allowIncompatibleChanges() - .addRequiredColumn("new_field", Types.StringType.get()) - .addRequiredColumn("new_field2", Types.StringType.get()) - .setIdentifierFields("id", "new_field", "new_field2") - .apply(); - - newSchema = new SchemaUpdate(newSchema, SCHEMA_LAST_COLUMN_ID) - .setIdentifierFields("new_field", "new_field2") - .apply(); - - Assert.assertEquals("remove an identifier field should succeed", - Sets.newHashSet(newSchema.findField("new_field").fieldId(), newSchema.findField("new_field2").fieldId()), + Schema newSchema = + new SchemaUpdate(SCHEMA, SCHEMA_LAST_COLUMN_ID) + .allowIncompatibleChanges() + .addRequiredColumn("new_field", Types.StringType.get()) + .addRequiredColumn("new_field2", Types.StringType.get()) + .setIdentifierFields("id", "new_field", "new_field2") + .apply(); + + newSchema = + new SchemaUpdate(newSchema, SCHEMA_LAST_COLUMN_ID) + .setIdentifierFields("new_field", "new_field2") + .apply(); + + Assert.assertEquals( + "remove an identifier field should succeed", + Sets.newHashSet( + newSchema.findField("new_field").fieldId(), + newSchema.findField("new_field2").fieldId()), newSchema.identifierFieldIds()); - newSchema = new SchemaUpdate(newSchema, SCHEMA_LAST_COLUMN_ID) - .setIdentifierFields(Sets.newHashSet()) - .apply(); + newSchema = + new SchemaUpdate(newSchema, SCHEMA_LAST_COLUMN_ID) + .setIdentifierFields(Sets.newHashSet()) 
+ .apply(); - Assert.assertEquals("remove all identifier fields should succeed", + Assert.assertEquals( + "remove all identifier fields should succeed", Sets.newHashSet(), newSchema.identifierFieldIds()); } @@ -1342,229 +1615,271 @@ public void testRemoveIdentifierFields() { @SuppressWarnings("MethodLength") @Test public void testSetIdentifierFieldsFails() { - Schema testSchema = new Schema( - optional(1, "id", Types.IntegerType.get()), - required(2, "float", Types.FloatType.get()), - required(3, "double", Types.DoubleType.get()) - ); + Schema testSchema = + new Schema( + optional(1, "id", Types.IntegerType.get()), + required(2, "float", Types.FloatType.get()), + required(3, "double", Types.DoubleType.get())); - AssertHelpers.assertThrows("Creating schema with nonexistent identifier fieldId should fail", + AssertHelpers.assertThrows( + "Creating schema with nonexistent identifier fieldId should fail", IllegalArgumentException.class, "Cannot add fieldId 999 as an identifier field: field does not exist", () -> new Schema(testSchema.asStruct().fields(), ImmutableSet.of(999))); - AssertHelpers.assertThrows("Creating schema with optional identifier field should fail", + AssertHelpers.assertThrows( + "Creating schema with optional identifier field should fail", IllegalArgumentException.class, "Cannot add field id as an identifier field: not a required field", () -> new Schema(testSchema.asStruct().fields(), ImmutableSet.of(1))); - AssertHelpers.assertThrows("Creating schema with float identifier field should fail", + AssertHelpers.assertThrows( + "Creating schema with float identifier field should fail", IllegalArgumentException.class, "Cannot add field float as an identifier field: must not be float or double field", () -> new Schema(testSchema.asStruct().fields(), ImmutableSet.of(2))); - AssertHelpers.assertThrows("Creating schema with double identifier field should fail", + AssertHelpers.assertThrows( + "Creating schema with double identifier field should fail", IllegalArgumentException.class, "Cannot add field double as an identifier field: must not be float or double field", () -> new Schema(testSchema.asStruct().fields(), ImmutableSet.of(3))); - AssertHelpers.assertThrows("add a field with name not exist should fail", + AssertHelpers.assertThrows( + "add a field with name not exist should fail", IllegalArgumentException.class, "not found in current schema or added columns", - () -> new SchemaUpdate(SCHEMA, SCHEMA_LAST_COLUMN_ID) - .setIdentifierFields("unknown") - .apply()); + () -> + new SchemaUpdate(SCHEMA, SCHEMA_LAST_COLUMN_ID).setIdentifierFields("unknown").apply()); - AssertHelpers.assertThrows("add a field of non-primitive type should fail", + AssertHelpers.assertThrows( + "add a field of non-primitive type should fail", IllegalArgumentException.class, "not a primitive type field", - () -> new SchemaUpdate(SCHEMA, SCHEMA_LAST_COLUMN_ID) - .setIdentifierFields("locations") - .apply()); + () -> + new SchemaUpdate(SCHEMA, SCHEMA_LAST_COLUMN_ID) + .setIdentifierFields("locations") + .apply()); - AssertHelpers.assertThrows("add an optional field should fail", + AssertHelpers.assertThrows( + "add an optional field should fail", IllegalArgumentException.class, "not a required field", - () -> new SchemaUpdate(SCHEMA, SCHEMA_LAST_COLUMN_ID) - .setIdentifierFields("data") - .apply()); + () -> new SchemaUpdate(SCHEMA, SCHEMA_LAST_COLUMN_ID).setIdentifierFields("data").apply()); - AssertHelpers.assertThrows("add a map key nested field should fail", + AssertHelpers.assertThrows( + "add a map key 
nested field should fail", IllegalArgumentException.class, "must not be nested in " + SCHEMA.findField("locations"), - () -> new SchemaUpdate(SCHEMA, SCHEMA_LAST_COLUMN_ID) - .setIdentifierFields("locations.key.zip") - .apply()); + () -> + new SchemaUpdate(SCHEMA, SCHEMA_LAST_COLUMN_ID) + .setIdentifierFields("locations.key.zip") + .apply()); - AssertHelpers.assertThrows("add a nested field in list should fail", + AssertHelpers.assertThrows( + "add a nested field in list should fail", IllegalArgumentException.class, "must not be nested in " + SCHEMA.findField("points"), - () -> new SchemaUpdate(SCHEMA, SCHEMA_LAST_COLUMN_ID) - .setIdentifierFields("points.element.x") - .apply()); - - Schema newSchema = new SchemaUpdate(SCHEMA, SCHEMA_LAST_COLUMN_ID) - .allowIncompatibleChanges() - .addRequiredColumn("col_float", Types.FloatType.get()) - .addRequiredColumn("col_double", Types.DoubleType.get()) - .addRequiredColumn("new", Types.StructType.of( - Types.NestedField.required(SCHEMA_LAST_COLUMN_ID + 4, "fields", Types.ListType.ofRequired( - SCHEMA_LAST_COLUMN_ID + 5, Types.StructType.of( - Types.NestedField.required(SCHEMA_LAST_COLUMN_ID + 6, "nested", Types.StringType.get()) - )) - ) - )) - .addRequiredColumn("new_map", Types.MapType.ofRequired(SCHEMA_LAST_COLUMN_ID + 8, SCHEMA_LAST_COLUMN_ID + 9, - Types.StructType.of( - required(SCHEMA_LAST_COLUMN_ID + 10, "key_col", Types.StringType.get()) - ), - Types.StructType.of( - required(SCHEMA_LAST_COLUMN_ID + 11, "val_col", Types.StringType.get()) - )), "map of address to coordinate") - .addRequiredColumn("required_list", Types.ListType.ofRequired(SCHEMA_LAST_COLUMN_ID + 13, - Types.StructType.of( - required(SCHEMA_LAST_COLUMN_ID + 14, "x", Types.LongType.get()), - required(SCHEMA_LAST_COLUMN_ID + 15, "y", Types.LongType.get()) - ))) - .apply(); + () -> + new SchemaUpdate(SCHEMA, SCHEMA_LAST_COLUMN_ID) + .setIdentifierFields("points.element.x") + .apply()); + + Schema newSchema = + new SchemaUpdate(SCHEMA, SCHEMA_LAST_COLUMN_ID) + .allowIncompatibleChanges() + .addRequiredColumn("col_float", Types.FloatType.get()) + .addRequiredColumn("col_double", Types.DoubleType.get()) + .addRequiredColumn( + "new", + Types.StructType.of( + Types.NestedField.required( + SCHEMA_LAST_COLUMN_ID + 4, + "fields", + Types.ListType.ofRequired( + SCHEMA_LAST_COLUMN_ID + 5, + Types.StructType.of( + Types.NestedField.required( + SCHEMA_LAST_COLUMN_ID + 6, + "nested", + Types.StringType.get())))))) + .addRequiredColumn( + "new_map", + Types.MapType.ofRequired( + SCHEMA_LAST_COLUMN_ID + 8, + SCHEMA_LAST_COLUMN_ID + 9, + Types.StructType.of( + required(SCHEMA_LAST_COLUMN_ID + 10, "key_col", Types.StringType.get())), + Types.StructType.of( + required(SCHEMA_LAST_COLUMN_ID + 11, "val_col", Types.StringType.get()))), + "map of address to coordinate") + .addRequiredColumn( + "required_list", + Types.ListType.ofRequired( + SCHEMA_LAST_COLUMN_ID + 13, + Types.StructType.of( + required(SCHEMA_LAST_COLUMN_ID + 14, "x", Types.LongType.get()), + required(SCHEMA_LAST_COLUMN_ID + 15, "y", Types.LongType.get())))) + .apply(); int lastColId = SCHEMA_LAST_COLUMN_ID + 15; - AssertHelpers.assertThrows("add a nested field in list should fail", + AssertHelpers.assertThrows( + "add a nested field in list should fail", IllegalArgumentException.class, "must not be nested in " + newSchema.findField("required_list"), - () -> new SchemaUpdate(newSchema, lastColId) - .setIdentifierFields("required_list.element.x") - .apply()); + () -> + new SchemaUpdate(newSchema, lastColId) + 
.setIdentifierFields("required_list.element.x") + .apply()); - AssertHelpers.assertThrows("add a double field should fail", + AssertHelpers.assertThrows( + "add a double field should fail", IllegalArgumentException.class, "must not be float or double field", - () -> new SchemaUpdate(newSchema, lastColId) - .setIdentifierFields("col_double") - .apply()); + () -> new SchemaUpdate(newSchema, lastColId).setIdentifierFields("col_double").apply()); - AssertHelpers.assertThrows("add a float field should fail", + AssertHelpers.assertThrows( + "add a float field should fail", IllegalArgumentException.class, "must not be float or double field", - () -> new SchemaUpdate(newSchema, lastColId) - .setIdentifierFields("col_float") - .apply()); + () -> new SchemaUpdate(newSchema, lastColId).setIdentifierFields("col_float").apply()); - AssertHelpers.assertThrows("add a map value nested field should fail", + AssertHelpers.assertThrows( + "add a map value nested field should fail", IllegalArgumentException.class, "must not be nested in " + newSchema.findField("new_map"), - () -> new SchemaUpdate(newSchema, lastColId) - .setIdentifierFields("new_map.value.val_col") - .apply()); + () -> + new SchemaUpdate(newSchema, lastColId) + .setIdentifierFields("new_map.value.val_col") + .apply()); - AssertHelpers.assertThrows("add a nested field in struct of a list should fail", + AssertHelpers.assertThrows( + "add a nested field in struct of a list should fail", IllegalArgumentException.class, "must not be nested in " + newSchema.findField("new.fields"), - () -> new SchemaUpdate(newSchema, lastColId) - .setIdentifierFields("new.fields.element.nested") - .apply()); + () -> + new SchemaUpdate(newSchema, lastColId) + .setIdentifierFields("new.fields.element.nested") + .apply()); - AssertHelpers.assertThrows("add a nested field in an optional struct should fail", + AssertHelpers.assertThrows( + "add a nested field in an optional struct should fail", IllegalArgumentException.class, "must not be nested in an optional field " + newSchema.findField("preferences"), - () -> new SchemaUpdate(newSchema, lastColId) - .setIdentifierFields("preferences.feature1") - .apply()); + () -> + new SchemaUpdate(newSchema, lastColId) + .setIdentifierFields("preferences.feature1") + .apply()); } @Test public void testDeleteIdentifierFieldColumns() { - Schema schemaWithIdentifierFields = new SchemaUpdate(SCHEMA, SCHEMA_LAST_COLUMN_ID) - .setIdentifierFields("id") - .apply(); + Schema schemaWithIdentifierFields = + new SchemaUpdate(SCHEMA, SCHEMA_LAST_COLUMN_ID).setIdentifierFields("id").apply(); - Assert.assertEquals("delete column and then reset identifier field should succeed", + Assert.assertEquals( + "delete column and then reset identifier field should succeed", Sets.newHashSet(), new SchemaUpdate(schemaWithIdentifierFields, SCHEMA_LAST_COLUMN_ID) - .deleteColumn("id").setIdentifierFields(Sets.newHashSet()).apply() + .deleteColumn("id") + .setIdentifierFields(Sets.newHashSet()) + .apply() .identifierFieldIds()); - Assert.assertEquals("delete reset identifier field and then delete column should succeed", + Assert.assertEquals( + "delete reset identifier field and then delete column should succeed", Sets.newHashSet(), new SchemaUpdate(schemaWithIdentifierFields, SCHEMA_LAST_COLUMN_ID) - .setIdentifierFields(Sets.newHashSet()).deleteColumn("id").apply() + .setIdentifierFields(Sets.newHashSet()) + .deleteColumn("id") + .apply() .identifierFieldIds()); } @Test public void testDeleteIdentifierFieldColumnsFails() { - Schema 
schemaWithIdentifierFields = new SchemaUpdate(SCHEMA, SCHEMA_LAST_COLUMN_ID) - .setIdentifierFields("id") - .apply(); + Schema schemaWithIdentifierFields = + new SchemaUpdate(SCHEMA, SCHEMA_LAST_COLUMN_ID).setIdentifierFields("id").apply(); - AssertHelpers.assertThrows("delete an identifier column without setting identifier fields should fail", + AssertHelpers.assertThrows( + "delete an identifier column without setting identifier fields should fail", IllegalArgumentException.class, - "Cannot delete identifier field 1: id: required int. To force deletion, " + - "also call setIdentifierFields to update identifier fields.", - () -> new SchemaUpdate(schemaWithIdentifierFields, SCHEMA_LAST_COLUMN_ID).deleteColumn("id").apply()); + "Cannot delete identifier field 1: id: required int. To force deletion, " + + "also call setIdentifierFields to update identifier fields.", + () -> + new SchemaUpdate(schemaWithIdentifierFields, SCHEMA_LAST_COLUMN_ID) + .deleteColumn("id") + .apply()); } @Test public void testDeleteContainingNestedIdentifierFieldColumnsFails() { - Schema newSchema = new SchemaUpdate(SCHEMA, SCHEMA_LAST_COLUMN_ID) - .allowIncompatibleChanges() - .addRequiredColumn("out", Types.StructType.of( - Types.NestedField.required(SCHEMA_LAST_COLUMN_ID + 2, "nested", Types.StringType.get()) - )) - .setIdentifierFields("out.nested") - .apply(); + Schema newSchema = + new SchemaUpdate(SCHEMA, SCHEMA_LAST_COLUMN_ID) + .allowIncompatibleChanges() + .addRequiredColumn( + "out", + Types.StructType.of( + Types.NestedField.required( + SCHEMA_LAST_COLUMN_ID + 2, "nested", Types.StringType.get()))) + .setIdentifierFields("out.nested") + .apply(); AssertHelpers.assertThrows( "delete a struct with a nested identifier column without setting identifier fields should fail", IllegalArgumentException.class, - "Cannot delete field 24: out: required struct<25: nested: required string> " + - "as it will delete nested identifier field 25: nested: required string", + "Cannot delete field 24: out: required struct<25: nested: required string> " + + "as it will delete nested identifier field 25: nested: required string", () -> new SchemaUpdate(newSchema, SCHEMA_LAST_COLUMN_ID + 2).deleteColumn("out").apply()); } @Test public void testRenameIdentifierFields() { - Schema schemaWithIdentifierFields = new SchemaUpdate(SCHEMA, SCHEMA_LAST_COLUMN_ID) - .setIdentifierFields("id") - .apply(); + Schema schemaWithIdentifierFields = + new SchemaUpdate(SCHEMA, SCHEMA_LAST_COLUMN_ID).setIdentifierFields("id").apply(); - Schema newSchema = new SchemaUpdate(schemaWithIdentifierFields, SCHEMA_LAST_COLUMN_ID) - .renameColumn("id", "id2") - .apply(); + Schema newSchema = + new SchemaUpdate(schemaWithIdentifierFields, SCHEMA_LAST_COLUMN_ID) + .renameColumn("id", "id2") + .apply(); - Assert.assertEquals("rename should not affect identifier fields", + Assert.assertEquals( + "rename should not affect identifier fields", Sets.newHashSet(SCHEMA.findField("id").fieldId()), newSchema.identifierFieldIds()); } @Test public void testMoveIdentifierFields() { - Schema schemaWithIdentifierFields = new SchemaUpdate(SCHEMA, SCHEMA_LAST_COLUMN_ID) - .setIdentifierFields("id") - .apply(); + Schema schemaWithIdentifierFields = + new SchemaUpdate(SCHEMA, SCHEMA_LAST_COLUMN_ID).setIdentifierFields("id").apply(); - Schema newSchema = new SchemaUpdate(schemaWithIdentifierFields, SCHEMA_LAST_COLUMN_ID) - .moveAfter("id", "locations") - .apply(); + Schema newSchema = + new SchemaUpdate(schemaWithIdentifierFields, SCHEMA_LAST_COLUMN_ID) + .moveAfter("id", 
"locations") + .apply(); - Assert.assertEquals("move after should not affect identifier fields", + Assert.assertEquals( + "move after should not affect identifier fields", Sets.newHashSet(SCHEMA.findField("id").fieldId()), newSchema.identifierFieldIds()); - newSchema = new SchemaUpdate(schemaWithIdentifierFields, SCHEMA_LAST_COLUMN_ID) - .moveBefore("id", "locations") - .apply(); + newSchema = + new SchemaUpdate(schemaWithIdentifierFields, SCHEMA_LAST_COLUMN_ID) + .moveBefore("id", "locations") + .apply(); - Assert.assertEquals("move before should not affect identifier fields", + Assert.assertEquals( + "move before should not affect identifier fields", Sets.newHashSet(SCHEMA.findField("id").fieldId()), newSchema.identifierFieldIds()); - newSchema = new SchemaUpdate(schemaWithIdentifierFields, SCHEMA_LAST_COLUMN_ID) - .moveFirst("id") - .apply(); + newSchema = + new SchemaUpdate(schemaWithIdentifierFields, SCHEMA_LAST_COLUMN_ID).moveFirst("id").apply(); - Assert.assertEquals("move first should not affect identifier fields", + Assert.assertEquals( + "move first should not affect identifier fields", Sets.newHashSet(SCHEMA.findField("id").fieldId()), newSchema.identifierFieldIds()); } diff --git a/core/src/test/java/org/apache/iceberg/TestSequenceNumberForV2Table.java b/core/src/test/java/org/apache/iceberg/TestSequenceNumberForV2Table.java index d596dce16a60..8eed4513abd0 100644 --- a/core/src/test/java/org/apache/iceberg/TestSequenceNumberForV2Table.java +++ b/core/src/test/java/org/apache/iceberg/TestSequenceNumberForV2Table.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg; import java.util.Set; @@ -41,7 +40,8 @@ public void testRewrite() { validateSnapshot(null, snap1, 1, FILE_A); validateManifest(manifestFile, seqs(1), ids(commitId1), files(FILE_A)); V2Assert.assertEquals("Snapshot sequence number should be 1", 1, snap1.sequenceNumber()); - V2Assert.assertEquals("Last sequence number should be 1", 1, readMetadata().lastSequenceNumber()); + V2Assert.assertEquals( + "Last sequence number should be 1", 1, readMetadata().lastSequenceNumber()); table.newFastAppend().appendFile(FILE_B).commit(); Snapshot snap2 = table.currentSnapshot(); @@ -50,28 +50,33 @@ public void testRewrite() { validateSnapshot(snap1, snap2, 2, FILE_B); validateManifest(manifestFile, seqs(2), ids(commitId2), files(FILE_B)); V2Assert.assertEquals("Snapshot sequence number should be 2", 2, snap2.sequenceNumber()); - V2Assert.assertEquals("Last sequence number should be 2", 2, readMetadata().lastSequenceNumber()); + V2Assert.assertEquals( + "Last sequence number should be 2", 2, readMetadata().lastSequenceNumber()); table.rewriteManifests().clusterBy(file -> "").commit(); Snapshot snap3 = table.currentSnapshot(); - ManifestFile newManifest = snap3.allManifests(table.io()).stream() - .filter(manifest -> manifest.snapshotId() == snap3.snapshotId()) - .collect(Collectors.toList()).get(0); + ManifestFile newManifest = + snap3.allManifests(table.io()).stream() + .filter(manifest -> manifest.snapshotId() == snap3.snapshotId()) + .collect(Collectors.toList()) + .get(0); V2Assert.assertEquals("Snapshot sequence number should be 3", 3, snap3.sequenceNumber()); - V2Assert.assertEquals("Last sequence number should be 3", 3, readMetadata().lastSequenceNumber()); + V2Assert.assertEquals( + "Last sequence number should be 3", 3, readMetadata().lastSequenceNumber()); // FILE_A and FILE_B in manifest may reorder for (ManifestEntry entry : 
ManifestFiles.read(newManifest, FILE_IO).entries()) { if (entry.file().path().equals(FILE_A.path())) { - V2Assert.assertEquals("FILE_A sequence number should be 1", 1, entry.sequenceNumber().longValue()); + V2Assert.assertEquals( + "FILE_A sequence number should be 1", 1, entry.sequenceNumber().longValue()); } if (entry.file().path().equals(FILE_B.path())) { - V2Assert.assertEquals("FILE_b sequence number should be 2", 2, entry.sequenceNumber().longValue()); + V2Assert.assertEquals( + "FILE_b sequence number should be 2", 2, entry.sequenceNumber().longValue()); } } - } @Test @@ -79,19 +84,17 @@ public void testCommitConflict() { AppendFiles appendA = table.newFastAppend(); appendA.appendFile(FILE_A).apply(); - table.updateProperties() - .set(TableProperties.COMMIT_NUM_RETRIES, "0") - .commit(); + table.updateProperties().set(TableProperties.COMMIT_NUM_RETRIES, "0").commit(); table.ops().failCommits(1); - AssertHelpers.assertThrows("Should reject commit", - CommitFailedException.class, "Injected failure", + AssertHelpers.assertThrows( + "Should reject commit", + CommitFailedException.class, + "Injected failure", () -> table.newFastAppend().appendFile(FILE_B).commit()); - table.updateProperties() - .set(TableProperties.COMMIT_NUM_RETRIES, "5") - .commit(); + table.updateProperties().set(TableProperties.COMMIT_NUM_RETRIES, "5").commit(); appendA.commit(); Snapshot snap1 = table.currentSnapshot(); @@ -100,7 +103,8 @@ public void testCommitConflict() { validateSnapshot(null, snap1, 1, FILE_A); validateManifest(manifestFile, seqs(1), ids(commitId1), files(FILE_A)); V2Assert.assertEquals("Snapshot sequence number should be 1", 1, snap1.sequenceNumber()); - V2Assert.assertEquals("Last sequence number should be 1", 1, readMetadata().lastSequenceNumber()); + V2Assert.assertEquals( + "Last sequence number should be 1", 1, readMetadata().lastSequenceNumber()); AppendFiles appendFiles = table.newFastAppend().appendFile(FILE_C); appendFiles.apply(); @@ -111,7 +115,8 @@ public void testCommitConflict() { validateSnapshot(snap1, snap2, 2, FILE_D); validateManifest(manifestFile, seqs(2), ids(commitId2), files(FILE_D)); V2Assert.assertEquals("Snapshot sequence number should be 2", 2, snap2.sequenceNumber()); - V2Assert.assertEquals("Last sequence number should be 2", 2, readMetadata().lastSequenceNumber()); + V2Assert.assertEquals( + "Last sequence number should be 2", 2, readMetadata().lastSequenceNumber()); appendFiles.commit(); Snapshot snap3 = table.currentSnapshot(); @@ -120,7 +125,8 @@ public void testCommitConflict() { validateManifest(manifestFile, seqs(3), ids(commitId3), files(FILE_C)); validateSnapshot(snap2, snap3, 3, FILE_C); V2Assert.assertEquals("Snapshot sequence number should be 3", 3, snap3.sequenceNumber()); - V2Assert.assertEquals("Last sequence number should be 3", 3, readMetadata().lastSequenceNumber()); + V2Assert.assertEquals( + "Last sequence number should be 3", 3, readMetadata().lastSequenceNumber()); } @Test @@ -132,7 +138,8 @@ public void testRollBack() { validateSnapshot(null, snap1, 1, FILE_A); validateManifest(manifestFile, seqs(1), ids(commitId1), files(FILE_A)); V2Assert.assertEquals("Snapshot sequence number should be 1", 1, snap1.sequenceNumber()); - V2Assert.assertEquals("Last sequence number should be 1", 1, readMetadata().lastSequenceNumber()); + V2Assert.assertEquals( + "Last sequence number should be 1", 1, readMetadata().lastSequenceNumber()); table.newFastAppend().appendFile(FILE_B).commit(); Snapshot snap2 = table.currentSnapshot(); @@ -141,12 +148,14 @@ public 
void testRollBack() { validateSnapshot(snap1, snap2, 2, FILE_B); validateManifest(manifestFile, seqs(2), ids(commitId2), files(FILE_B)); V2Assert.assertEquals("Snapshot sequence number should be 2", 2, snap2.sequenceNumber()); - V2Assert.assertEquals("Last sequence number should be 2", 2, readMetadata().lastSequenceNumber()); + V2Assert.assertEquals( + "Last sequence number should be 2", 2, readMetadata().lastSequenceNumber()); table.manageSnapshots().rollbackTo(commitId1).commit(); Snapshot snap3 = table.currentSnapshot(); V2Assert.assertEquals("Snapshot sequence number should be 1", 1, snap3.sequenceNumber()); - V2Assert.assertEquals("Last sequence number should be 2", 2, readMetadata().lastSequenceNumber()); + V2Assert.assertEquals( + "Last sequence number should be 2", 2, readMetadata().lastSequenceNumber()); table.newFastAppend().appendFile(FILE_C).commit(); Snapshot snap4 = table.currentSnapshot(); @@ -155,7 +164,8 @@ public void testRollBack() { validateSnapshot(snap3, snap4, 3, FILE_C); validateManifest(manifestFile, seqs(3), ids(commitId4), files(FILE_C)); V2Assert.assertEquals("Snapshot sequence number should be 1", 3, snap4.sequenceNumber()); - V2Assert.assertEquals("Last sequence number should be 3", 3, readMetadata().lastSequenceNumber()); + V2Assert.assertEquals( + "Last sequence number should be 3", 3, readMetadata().lastSequenceNumber()); } @Test @@ -169,7 +179,8 @@ public void testSingleTransaction() { validateSnapshot(null, snap, 1, FILE_A); validateManifest(manifestFile, seqs(1), ids(commitId), files(FILE_A)); V2Assert.assertEquals("Snapshot sequence number should be 1", 1, snap.sequenceNumber()); - V2Assert.assertEquals("Last sequence number should be 1", 1, readMetadata().lastSequenceNumber()); + V2Assert.assertEquals( + "Last sequence number should be 1", 1, readMetadata().lastSequenceNumber()); } @Test @@ -191,7 +202,8 @@ public void testConcurrentTransaction() { validateSnapshot(null, snap1, 1, FILE_A); validateManifest(manifestFile1, seqs(1), ids(commitId1), files(FILE_A)); V2Assert.assertEquals("Snapshot sequence number should be 1", 1, snap1.sequenceNumber()); - V2Assert.assertEquals("Last sequence number should be 1", 1, readMetadata().lastSequenceNumber()); + V2Assert.assertEquals( + "Last sequence number should be 1", 1, readMetadata().lastSequenceNumber()); txn2.commitTransaction(); Snapshot snap2 = table.currentSnapshot(); @@ -200,28 +212,36 @@ public void testConcurrentTransaction() { validateSnapshot(snap1, snap2, 2, FILE_B); validateManifest(manifestFile, seqs(2), ids(commitId2), files(FILE_B)); V2Assert.assertEquals("Snapshot sequence number should be 2", 2, snap2.sequenceNumber()); - V2Assert.assertEquals("Last sequence number should be 2", 2, readMetadata().lastSequenceNumber()); + V2Assert.assertEquals( + "Last sequence number should be 2", 2, readMetadata().lastSequenceNumber()); txn3.commitTransaction(); Snapshot snap3 = table.currentSnapshot(); long commitId3 = snap3.snapshotId(); - manifestFile = table.currentSnapshot().allManifests(table.io()).stream() - .filter(manifest -> manifest.snapshotId() == commitId3) - .collect(Collectors.toList()).get(0); + manifestFile = + table.currentSnapshot().allManifests(table.io()).stream() + .filter(manifest -> manifest.snapshotId() == commitId3) + .collect(Collectors.toList()) + .get(0); validateManifest(manifestFile, seqs(3), ids(commitId3), files(FILE_C)); validateSnapshot(snap2, snap3, 3, FILE_C); V2Assert.assertEquals("Snapshot sequence number should be 3", 3, snap3.sequenceNumber()); - 
V2Assert.assertEquals("Last sequence number should be 3", 3, readMetadata().lastSequenceNumber()); + V2Assert.assertEquals( + "Last sequence number should be 3", 3, readMetadata().lastSequenceNumber()); txn4.commitTransaction(); Snapshot snap4 = table.currentSnapshot(); long commitId4 = snap4.snapshotId(); - manifestFile = table.currentSnapshot().allManifests(table.io()).stream() - .filter(manifest -> manifest.snapshotId() == commitId4) - .collect(Collectors.toList()).get(0); - validateManifest(manifestFile, seqs(4), ids(commitId4), files(FILE_A), statuses(Status.DELETED)); + manifestFile = + table.currentSnapshot().allManifests(table.io()).stream() + .filter(manifest -> manifest.snapshotId() == commitId4) + .collect(Collectors.toList()) + .get(0); + validateManifest( + manifestFile, seqs(4), ids(commitId4), files(FILE_A), statuses(Status.DELETED)); V2Assert.assertEquals("Snapshot sequence number should be 4", 4, snap4.sequenceNumber()); - V2Assert.assertEquals("Last sequence number should be 4", 4, readMetadata().lastSequenceNumber()); + V2Assert.assertEquals( + "Last sequence number should be 4", 4, readMetadata().lastSequenceNumber()); } @Test @@ -234,7 +254,8 @@ public void testMultipleOperationsTransaction() { validateSnapshot(null, snap1, 1, FILE_A); validateManifest(manifestFile, seqs(1), ids(commitId1), files(FILE_A)); V2Assert.assertEquals("Snapshot sequence number should be 1", 1, snap1.sequenceNumber()); - V2Assert.assertEquals("Last sequence number should be 0", 0, readMetadata().lastSequenceNumber()); + V2Assert.assertEquals( + "Last sequence number should be 0", 0, readMetadata().lastSequenceNumber()); Set toAddFiles = Sets.newHashSet(); Set toDeleteFiles = Sets.newHashSet(); @@ -245,13 +266,16 @@ public void testMultipleOperationsTransaction() { Snapshot snap2 = table.currentSnapshot(); long commitId2 = snap2.snapshotId(); - manifestFile = snap2.allManifests(table.io()).stream() - .filter(manifest -> manifest.snapshotId() == commitId2) - .collect(Collectors.toList()).get(0); + manifestFile = + snap2.allManifests(table.io()).stream() + .filter(manifest -> manifest.snapshotId() == commitId2) + .collect(Collectors.toList()) + .get(0); validateManifest(manifestFile, seqs(2), ids(commitId2), files(FILE_B)); V2Assert.assertEquals("Snapshot sequence number should be 2", 2, snap2.sequenceNumber()); - V2Assert.assertEquals("Last sequence number should be 2", 2, readMetadata().lastSequenceNumber()); + V2Assert.assertEquals( + "Last sequence number should be 2", 2, readMetadata().lastSequenceNumber()); } @Test @@ -263,7 +287,8 @@ public void testExpirationInTransaction() { validateSnapshot(null, snap1, 1, FILE_A); validateManifest(manifestFile, seqs(1), ids(commitId1), files(FILE_A)); V2Assert.assertEquals("Snapshot sequence number should be 1", 1, snap1.sequenceNumber()); - V2Assert.assertEquals("Last sequence number should be 1", 1, readMetadata().lastSequenceNumber()); + V2Assert.assertEquals( + "Last sequence number should be 1", 1, readMetadata().lastSequenceNumber()); table.newAppend().appendFile(FILE_B).commit(); Snapshot snap2 = table.currentSnapshot(); @@ -272,73 +297,71 @@ public void testExpirationInTransaction() { validateSnapshot(snap1, snap2, 2, FILE_B); validateManifest(manifestFile, seqs(2), ids(commitId2), files(FILE_B)); V2Assert.assertEquals("Snapshot sequence number should be 2", 2, snap2.sequenceNumber()); - V2Assert.assertEquals("Last sequence number should be 2", 2, readMetadata().lastSequenceNumber()); + V2Assert.assertEquals( + "Last sequence number should be 
2", 2, readMetadata().lastSequenceNumber()); Transaction txn = table.newTransaction(); txn.expireSnapshots().expireSnapshotId(commitId1).commit(); txn.commitTransaction(); - V2Assert.assertEquals("Last sequence number should be 2", 2, readMetadata().lastSequenceNumber()); + V2Assert.assertEquals( + "Last sequence number should be 2", 2, readMetadata().lastSequenceNumber()); } @Test public void testTransactionFailure() { - table.newAppend() - .appendFile(FILE_A) - .appendFile(FILE_B) - .commit(); + table.newAppend().appendFile(FILE_A).appendFile(FILE_B).commit(); Snapshot snap1 = table.currentSnapshot(); long commitId1 = snap1.snapshotId(); ManifestFile manifestFile = table.currentSnapshot().allManifests(table.io()).get(0); validateSnapshot(null, snap1, 1, FILE_A, FILE_B); validateManifest(manifestFile, seqs(1, 1), ids(commitId1, commitId1), files(FILE_A, FILE_B)); V2Assert.assertEquals("Snapshot sequence number should be 1", 1, snap1.sequenceNumber()); - V2Assert.assertEquals("Last sequence number should be 1", 1, readMetadata().lastSequenceNumber()); + V2Assert.assertEquals( + "Last sequence number should be 1", 1, readMetadata().lastSequenceNumber()); - table.updateProperties() - .set(TableProperties.COMMIT_NUM_RETRIES, "0") - .commit(); + table.updateProperties().set(TableProperties.COMMIT_NUM_RETRIES, "0").commit(); table.ops().failCommits(1); Transaction txn = table.newTransaction(); txn.newAppend().appendFile(FILE_C).commit(); - AssertHelpers.assertThrows("Transaction commit should fail", - CommitFailedException.class, "Injected failure", txn::commitTransaction); + AssertHelpers.assertThrows( + "Transaction commit should fail", + CommitFailedException.class, + "Injected failure", + txn::commitTransaction); - V2Assert.assertEquals("Last sequence number should be 1", 1, readMetadata().lastSequenceNumber()); + V2Assert.assertEquals( + "Last sequence number should be 1", 1, readMetadata().lastSequenceNumber()); } @Test public void testCherryPicking() { - table.newAppend() - .appendFile(FILE_A) - .commit(); + table.newAppend().appendFile(FILE_A).commit(); Snapshot snap1 = table.currentSnapshot(); long commitId1 = snap1.snapshotId(); ManifestFile manifestFile = snap1.allManifests(table.io()).get(0); validateSnapshot(null, snap1, 1, FILE_A); validateManifest(manifestFile, seqs(1), ids(commitId1), files(FILE_A)); V2Assert.assertEquals("Snapshot sequence number should be 1", 1, snap1.sequenceNumber()); - V2Assert.assertEquals("Last sequence number should be 1", 1, readMetadata().lastSequenceNumber()); + V2Assert.assertEquals( + "Last sequence number should be 1", 1, readMetadata().lastSequenceNumber()); - table.newAppend() - .appendFile(FILE_B) - .stageOnly() - .commit(); + table.newAppend().appendFile(FILE_B).stageOnly().commit(); Snapshot snap2 = table.currentSnapshot(); V2Assert.assertEquals("Snapshot sequence number should be 1", 1, snap2.sequenceNumber()); - V2Assert.assertEquals("Last sequence number should be 2", 2, readMetadata().lastSequenceNumber()); + V2Assert.assertEquals( + "Last sequence number should be 2", 2, readMetadata().lastSequenceNumber()); // pick the snapshot that's staged but not committed Snapshot stagedSnapshot = readMetadata().snapshots().get(1); - V2Assert.assertEquals("Snapshot sequence number should be 2", 2, stagedSnapshot.sequenceNumber()); + V2Assert.assertEquals( + "Snapshot sequence number should be 2", 2, stagedSnapshot.sequenceNumber()); // table has new commit - table.newAppend() - .appendFile(FILE_C) - .commit(); + 
table.newAppend().appendFile(FILE_C).commit(); Snapshot snap3 = table.currentSnapshot(); long commitId3 = snap3.snapshotId(); @@ -346,7 +369,8 @@ public void testCherryPicking() { validateManifest(manifestFile, seqs(3), ids(commitId3), files(FILE_C)); validateSnapshot(snap2, snap3, 3, FILE_C); V2Assert.assertEquals("Snapshot sequence number should be 3", 3, snap3.sequenceNumber()); - V2Assert.assertEquals("Last sequence number should be 3", 3, readMetadata().lastSequenceNumber()); + V2Assert.assertEquals( + "Last sequence number should be 3", 3, readMetadata().lastSequenceNumber()); // cherry-pick snapshot table.manageSnapshots().cherrypick(stagedSnapshot.snapshotId()).commit(); @@ -356,33 +380,32 @@ public void testCherryPicking() { validateManifest(manifestFile, seqs(4), ids(commitId4), files(FILE_B)); validateSnapshot(snap3, snap4, 4, FILE_B); V2Assert.assertEquals("Snapshot sequence number should be 4", 4, snap4.sequenceNumber()); - V2Assert.assertEquals("Last sequence number should be 4", 4, readMetadata().lastSequenceNumber()); + V2Assert.assertEquals( + "Last sequence number should be 4", 4, readMetadata().lastSequenceNumber()); } @Test public void testCherryPickFastForward() { - table.newAppend() - .appendFile(FILE_A) - .commit(); + table.newAppend().appendFile(FILE_A).commit(); Snapshot snap1 = table.currentSnapshot(); long commitId1 = snap1.snapshotId(); ManifestFile manifestFile = snap1.allManifests(table.io()).get(0); validateSnapshot(null, snap1, 1, FILE_A); validateManifest(manifestFile, seqs(1), ids(commitId1), files(FILE_A)); V2Assert.assertEquals("Snapshot sequence number should be 1", 1, snap1.sequenceNumber()); - V2Assert.assertEquals("Last sequence number should be 1", 1, readMetadata().lastSequenceNumber()); + V2Assert.assertEquals( + "Last sequence number should be 1", 1, readMetadata().lastSequenceNumber()); - table.newAppend() - .appendFile(FILE_B) - .stageOnly() - .commit(); + table.newAppend().appendFile(FILE_B).stageOnly().commit(); Snapshot snap2 = table.currentSnapshot(); V2Assert.assertEquals("Snapshot sequence number should be 1", 1, snap2.sequenceNumber()); - V2Assert.assertEquals("Last sequence number should be 2", 2, readMetadata().lastSequenceNumber()); + V2Assert.assertEquals( + "Last sequence number should be 2", 2, readMetadata().lastSequenceNumber()); // pick the snapshot that's staged but not committed Snapshot stagedSnapshot = readMetadata().snapshots().get(1); - V2Assert.assertEquals("Snapshot sequence number should be 2", 2, stagedSnapshot.sequenceNumber()); + V2Assert.assertEquals( + "Snapshot sequence number should be 2", 2, stagedSnapshot.sequenceNumber()); // cherry-pick snapshot, this will fast forward table.manageSnapshots().cherrypick(stagedSnapshot.snapshotId()).commit(); @@ -392,7 +415,7 @@ public void testCherryPickFastForward() { validateManifest(manifestFile, seqs(2), ids(commitId3), files(FILE_B)); validateSnapshot(snap2, snap3, 2, FILE_B); V2Assert.assertEquals("Snapshot sequence number should be 2", 2, snap3.sequenceNumber()); - V2Assert.assertEquals("Last sequence number should be 2", 2, readMetadata().lastSequenceNumber()); + V2Assert.assertEquals( + "Last sequence number should be 2", 2, readMetadata().lastSequenceNumber()); } - } diff --git a/core/src/test/java/org/apache/iceberg/TestSnapshot.java b/core/src/test/java/org/apache/iceberg/TestSnapshot.java index 6bb6384ac4db..1b1e930afe93 100644 --- a/core/src/test/java/org/apache/iceberg/TestSnapshot.java +++ b/core/src/test/java/org/apache/iceberg/TestSnapshot.java @@ -16,7 
+16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg; import org.apache.iceberg.expressions.Expressions; @@ -32,7 +31,7 @@ public class TestSnapshot extends TableTestBase { @Parameterized.Parameters(name = "formatVersion = {0}") public static Object[] parameters() { - return new Object[] { 1, 2 }; + return new Object[] {1, 2}; } public TestSnapshot(int formatVersion) { @@ -41,10 +40,7 @@ public TestSnapshot(int formatVersion) { @Test public void testAppendFilesFromTable() { - table.newFastAppend() - .appendFile(FILE_A) - .appendFile(FILE_B) - .commit(); + table.newFastAppend().appendFile(FILE_A).appendFile(FILE_B).commit(); // collect data files from deserialization Iterable filesToAdd = table.currentSnapshot().addedDataFiles(table.io()); @@ -64,15 +60,13 @@ public void testAppendFilesFromTable() { @Test public void testAppendFoundFiles() { - table.newFastAppend() - .appendFile(FILE_A) - .appendFile(FILE_B) - .commit(); + table.newFastAppend().appendFile(FILE_A).appendFile(FILE_B).commit(); - Iterable filesToAdd = FindFiles.in(table) - .inPartition(table.spec(), StaticDataTask.Row.of(0)) - .inPartition(table.spec(), StaticDataTask.Row.of(1)) - .collect(); + Iterable filesToAdd = + FindFiles.in(table) + .inPartition(table.spec(), StaticDataTask.Row.of(0)) + .inPartition(table.spec(), StaticDataTask.Row.of(1)) + .collect(); table.newDelete().deleteFile(FILE_A).deleteFile(FILE_B).commit(); @@ -89,27 +83,17 @@ public void testAppendFoundFiles() { @Test public void testCachedDataFiles() { - table.newFastAppend() - .appendFile(FILE_A) - .appendFile(FILE_B) - .commit(); + table.newFastAppend().appendFile(FILE_A).appendFile(FILE_B).commit(); - table.updateSpec() - .addField(Expressions.truncate("data", 2)) - .commit(); + table.updateSpec().addField(Expressions.truncate("data", 2)).commit(); DataFile secondSnapshotDataFile = newDataFile("data_bucket=8/data_trunc_2=aa"); - table.newFastAppend() - .appendFile(secondSnapshotDataFile) - .commit(); + table.newFastAppend().appendFile(secondSnapshotDataFile).commit(); DataFile thirdSnapshotDataFile = newDataFile("data_bucket=8/data_trunc_2=bb"); - table.newOverwrite() - .deleteFile(FILE_A) - .addFile(thirdSnapshotDataFile) - .commit(); + table.newOverwrite().deleteFile(FILE_A).addFile(thirdSnapshotDataFile).commit(); Snapshot thirdSnapshot = table.currentSnapshot(); @@ -126,29 +110,27 @@ public void testCachedDataFiles() { DataFile addedDataFile = Iterables.getOnlyElement(addedDataFiles); Assert.assertEquals("Path must match", thirdSnapshotDataFile.path(), addedDataFile.path()); - Assert.assertEquals("Spec ID must match", thirdSnapshotDataFile.specId(), addedDataFile.specId()); - Assert.assertEquals("Partition must match", thirdSnapshotDataFile.partition(), addedDataFile.partition()); + Assert.assertEquals( + "Spec ID must match", thirdSnapshotDataFile.specId(), addedDataFile.specId()); + Assert.assertEquals( + "Partition must match", thirdSnapshotDataFile.partition(), addedDataFile.partition()); } @Test public void testCachedDeleteFiles() { Assume.assumeTrue("Delete files only supported in V2", formatVersion >= 2); - table.newFastAppend() - .appendFile(FILE_A) - .appendFile(FILE_B) - .commit(); + table.newFastAppend().appendFile(FILE_A).appendFile(FILE_B).commit(); - table.updateSpec() - .addField(Expressions.truncate("data", 2)) - .commit(); + table.updateSpec().addField(Expressions.truncate("data", 2)).commit(); int specId = table.spec().specId(); DataFile secondSnapshotDataFile = 
newDataFile("data_bucket=8/data_trunc_2=aa"); DeleteFile secondSnapshotDeleteFile = newDeleteFile(specId, "data_bucket=8/data_trunc_2=aa"); - table.newRowDelta() + table + .newRowDelta() .addRows(secondSnapshotDataFile) .addDeletes(secondSnapshotDeleteFile) .commit(); @@ -158,7 +140,8 @@ public void testCachedDeleteFiles() { ImmutableSet replacedDeleteFiles = ImmutableSet.of(secondSnapshotDeleteFile); ImmutableSet newDeleteFiles = ImmutableSet.of(thirdSnapshotDeleteFile); - table.newRewrite() + table + .newRewrite() .rewriteFiles(ImmutableSet.of(), replacedDeleteFiles, ImmutableSet.of(), newDeleteFiles) .commit(); @@ -168,16 +151,23 @@ public void testCachedDeleteFiles() { Assert.assertEquals("Must have 1 removed delete file", 1, Iterables.size(removedDeleteFiles)); DeleteFile removedDeleteFile = Iterables.getOnlyElement(removedDeleteFiles); - Assert.assertEquals("Path must match", secondSnapshotDeleteFile.path(), removedDeleteFile.path()); - Assert.assertEquals("Spec ID must match", secondSnapshotDeleteFile.specId(), removedDeleteFile.specId()); - Assert.assertEquals("Partition must match", secondSnapshotDeleteFile.partition(), removedDeleteFile.partition()); + Assert.assertEquals( + "Path must match", secondSnapshotDeleteFile.path(), removedDeleteFile.path()); + Assert.assertEquals( + "Spec ID must match", secondSnapshotDeleteFile.specId(), removedDeleteFile.specId()); + Assert.assertEquals( + "Partition must match", + secondSnapshotDeleteFile.partition(), + removedDeleteFile.partition()); Iterable addedDeleteFiles = thirdSnapshot.addedDeleteFiles(FILE_IO); Assert.assertEquals("Must have 1 added delete file", 1, Iterables.size(addedDeleteFiles)); DeleteFile addedDeleteFile = Iterables.getOnlyElement(addedDeleteFiles); Assert.assertEquals("Path must match", thirdSnapshotDeleteFile.path(), addedDeleteFile.path()); - Assert.assertEquals("Spec ID must match", thirdSnapshotDeleteFile.specId(), addedDeleteFile.specId()); - Assert.assertEquals("Partition must match", thirdSnapshotDeleteFile.partition(), addedDeleteFile.partition()); + Assert.assertEquals( + "Spec ID must match", thirdSnapshotDeleteFile.specId(), addedDeleteFile.specId()); + Assert.assertEquals( + "Partition must match", thirdSnapshotDeleteFile.partition(), addedDeleteFile.partition()); } } diff --git a/core/src/test/java/org/apache/iceberg/TestSnapshotJson.java b/core/src/test/java/org/apache/iceberg/TestSnapshotJson.java index 130129ab3477..8170c7b6f636 100644 --- a/core/src/test/java/org/apache/iceberg/TestSnapshotJson.java +++ b/core/src/test/java/org/apache/iceberg/TestSnapshotJson.java @@ -16,9 +16,10 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg; +import static org.apache.iceberg.Files.localInput; + import java.io.File; import java.io.IOException; import java.util.List; @@ -29,25 +30,26 @@ import org.junit.Test; import org.junit.rules.TemporaryFolder; -import static org.apache.iceberg.Files.localInput; - public class TestSnapshotJson { - @Rule - public TemporaryFolder temp = new TemporaryFolder(); + @Rule public TemporaryFolder temp = new TemporaryFolder(); public TableOperations ops = new LocalTableOperations(temp); @Test public void testJsonConversion() { - Snapshot expected = new BaseSnapshot(ops.io(), System.currentTimeMillis(), 1, - "file:/tmp/manifest1.avro", "file:/tmp/manifest2.avro"); + Snapshot expected = + new BaseSnapshot( + ops.io(), + System.currentTimeMillis(), + 1, + "file:/tmp/manifest1.avro", + "file:/tmp/manifest2.avro"); String json = SnapshotParser.toJson(expected); Snapshot snapshot = SnapshotParser.fromJson(ops.io(), json); - Assert.assertEquals("Snapshot ID should match", - expected.snapshotId(), snapshot.snapshotId()); - Assert.assertEquals("Files should match", - expected.allManifests(ops.io()), snapshot.allManifests(ops.io())); + Assert.assertEquals("Snapshot ID should match", expected.snapshotId(), snapshot.snapshotId()); + Assert.assertEquals( + "Files should match", expected.allManifests(ops.io()), snapshot.allManifests(ops.io())); Assert.assertNull("Operation should be null", snapshot.operation()); Assert.assertNull("Summary should be null", snapshot.summary()); Assert.assertEquals("Schema ID should match", Integer.valueOf(1), snapshot.schemaId()); @@ -55,15 +57,19 @@ public void testJsonConversion() { @Test public void testJsonConversionWithoutSchemaId() { - Snapshot expected = new BaseSnapshot(ops.io(), System.currentTimeMillis(), null, - "file:/tmp/manifest1.avro", "file:/tmp/manifest2.avro"); + Snapshot expected = + new BaseSnapshot( + ops.io(), + System.currentTimeMillis(), + null, + "file:/tmp/manifest1.avro", + "file:/tmp/manifest2.avro"); String json = SnapshotParser.toJson(expected); Snapshot snapshot = SnapshotParser.fromJson(ops.io(), json); - Assert.assertEquals("Snapshot ID should match", - expected.snapshotId(), snapshot.snapshotId()); - Assert.assertEquals("Files should match", - expected.allManifests(ops.io()), snapshot.allManifests(ops.io())); + Assert.assertEquals("Snapshot ID should match", expected.snapshotId(), snapshot.snapshotId()); + Assert.assertEquals( + "Files should match", expected.allManifests(ops.io()), snapshot.allManifests(ops.io())); Assert.assertNull("Operation should be null", snapshot.operation()); Assert.assertNull("Summary should be null", snapshot.summary()); Assert.assertNull("Schema ID should be null", snapshot.schemaId()); @@ -73,77 +79,96 @@ public void testJsonConversionWithoutSchemaId() { public void testJsonConversionWithOperation() { long parentId = 1; long id = 2; - List manifests = ImmutableList.of( - new GenericManifestFile(localInput("file:/tmp/manifest1.avro"), 0), - new GenericManifestFile(localInput("file:/tmp/manifest2.avro"), 0)); - - Snapshot expected = new BaseSnapshot(ops.io(), id, parentId, System.currentTimeMillis(), - DataOperations.REPLACE, ImmutableMap.of("files-added", "4", "files-deleted", "100"), - 3, manifests); + List manifests = + ImmutableList.of( + new GenericManifestFile(localInput("file:/tmp/manifest1.avro"), 0), + new GenericManifestFile(localInput("file:/tmp/manifest2.avro"), 0)); + + Snapshot expected = + new BaseSnapshot( + ops.io(), + id, + parentId, + System.currentTimeMillis(), + 
DataOperations.REPLACE, + ImmutableMap.of("files-added", "4", "files-deleted", "100"), + 3, + manifests); String json = SnapshotParser.toJson(expected); Snapshot snapshot = SnapshotParser.fromJson(ops.io(), json); - Assert.assertEquals("Sequence number should default to 0 for v1", - 0, snapshot.sequenceNumber()); - Assert.assertEquals("Snapshot ID should match", - expected.snapshotId(), snapshot.snapshotId()); - Assert.assertEquals("Timestamp should match", - expected.timestampMillis(), snapshot.timestampMillis()); - Assert.assertEquals("Parent ID should match", - expected.parentId(), snapshot.parentId()); - Assert.assertEquals("Manifest list should match", - expected.manifestListLocation(), snapshot.manifestListLocation()); - Assert.assertEquals("Files should match", - expected.allManifests(ops.io()), snapshot.allManifests(ops.io())); - Assert.assertEquals("Operation should match", - expected.operation(), snapshot.operation()); - Assert.assertEquals("Summary should match", - expected.summary(), snapshot.summary()); - Assert.assertEquals("Schema ID should match", - expected.schemaId(), snapshot.schemaId()); + Assert.assertEquals("Sequence number should default to 0 for v1", 0, snapshot.sequenceNumber()); + Assert.assertEquals("Snapshot ID should match", expected.snapshotId(), snapshot.snapshotId()); + Assert.assertEquals( + "Timestamp should match", expected.timestampMillis(), snapshot.timestampMillis()); + Assert.assertEquals("Parent ID should match", expected.parentId(), snapshot.parentId()); + Assert.assertEquals( + "Manifest list should match", + expected.manifestListLocation(), + snapshot.manifestListLocation()); + Assert.assertEquals( + "Files should match", expected.allManifests(ops.io()), snapshot.allManifests(ops.io())); + Assert.assertEquals("Operation should match", expected.operation(), snapshot.operation()); + Assert.assertEquals("Summary should match", expected.summary(), snapshot.summary()); + Assert.assertEquals("Schema ID should match", expected.schemaId(), snapshot.schemaId()); } @Test public void testJsonConversionWithManifestList() throws IOException { long parentId = 1; long id = 2; - List manifests = ImmutableList.of( - new GenericManifestFile(localInput("file:/tmp/manifest1.avro"), 0), - new GenericManifestFile(localInput("file:/tmp/manifest2.avro"), 0)); + List manifests = + ImmutableList.of( + new GenericManifestFile(localInput("file:/tmp/manifest1.avro"), 0), + new GenericManifestFile(localInput("file:/tmp/manifest2.avro"), 0)); File manifestList = temp.newFile("manifests"); Assert.assertTrue(manifestList.delete()); manifestList.deleteOnExit(); - try (ManifestListWriter writer = ManifestLists.write(1, Files.localOutput(manifestList), id, parentId, 0)) { + try (ManifestListWriter writer = + ManifestLists.write(1, Files.localOutput(manifestList), id, parentId, 0)) { writer.addAll(manifests); } - Snapshot expected = new BaseSnapshot( - ops.io(), id, 34, parentId, System.currentTimeMillis(), - null, null, 4, localInput(manifestList).location()); - Snapshot inMemory = new BaseSnapshot( - ops.io(), id, parentId, expected.timestampMillis(), null, null, 4, manifests); - - Assert.assertEquals("Files should match in memory list", - inMemory.allManifests(ops.io()), expected.allManifests(ops.io())); + Snapshot expected = + new BaseSnapshot( + ops.io(), + id, + 34, + parentId, + System.currentTimeMillis(), + null, + null, + 4, + localInput(manifestList).location()); + Snapshot inMemory = + new BaseSnapshot( + ops.io(), id, parentId, expected.timestampMillis(), null, null, 4, 
manifests); + + Assert.assertEquals( + "Files should match in memory list", + inMemory.allManifests(ops.io()), + expected.allManifests(ops.io())); String json = SnapshotParser.toJson(expected); Snapshot snapshot = SnapshotParser.fromJson(ops.io(), json); - Assert.assertEquals("Sequence number should default to 0", - expected.sequenceNumber(), snapshot.sequenceNumber()); - Assert.assertEquals("Snapshot ID should match", - expected.snapshotId(), snapshot.snapshotId()); - Assert.assertEquals("Timestamp should match", - expected.timestampMillis(), snapshot.timestampMillis()); - Assert.assertEquals("Parent ID should match", - expected.parentId(), snapshot.parentId()); - Assert.assertEquals("Manifest list should match", - expected.manifestListLocation(), snapshot.manifestListLocation()); - Assert.assertEquals("Files should match", - expected.allManifests(ops.io()), snapshot.allManifests(ops.io())); + Assert.assertEquals( + "Sequence number should default to 0", + expected.sequenceNumber(), + snapshot.sequenceNumber()); + Assert.assertEquals("Snapshot ID should match", expected.snapshotId(), snapshot.snapshotId()); + Assert.assertEquals( + "Timestamp should match", expected.timestampMillis(), snapshot.timestampMillis()); + Assert.assertEquals("Parent ID should match", expected.parentId(), snapshot.parentId()); + Assert.assertEquals( + "Manifest list should match", + expected.manifestListLocation(), + snapshot.manifestListLocation()); + Assert.assertEquals( + "Files should match", expected.allManifests(ops.io()), snapshot.allManifests(ops.io())); Assert.assertNull("Operation should be null", snapshot.operation()); Assert.assertNull("Summary should be null", snapshot.summary()); Assert.assertEquals("Schema ID should match", expected.schemaId(), snapshot.schemaId()); diff --git a/core/src/test/java/org/apache/iceberg/TestSnapshotManager.java b/core/src/test/java/org/apache/iceberg/TestSnapshotManager.java index a9b231f36bee..2b8b3cae3e39 100644 --- a/core/src/test/java/org/apache/iceberg/TestSnapshotManager.java +++ b/core/src/test/java/org/apache/iceberg/TestSnapshotManager.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg; import org.apache.iceberg.exceptions.ValidationException; @@ -30,24 +29,26 @@ public class TestSnapshotManager extends TableTestBase { // replacement for FILE_A - static final DataFile REPLACEMENT_FILE_A = DataFiles.builder(SPEC) - .withPath("/path/to/data-a-replacement.parquet") - .withFileSizeInBytes(0) - .withPartitionPath("data_bucket=0") // easy way to set partition data for now - .withRecordCount(1) - .build(); + static final DataFile REPLACEMENT_FILE_A = + DataFiles.builder(SPEC) + .withPath("/path/to/data-a-replacement.parquet") + .withFileSizeInBytes(0) + .withPartitionPath("data_bucket=0") // easy way to set partition data for now + .withRecordCount(1) + .build(); // conflict in the same partition as FILE_A - static final DataFile CONFLICT_FILE_A = DataFiles.builder(SPEC) - .withPath("/path/to/data-a-conflict.parquet") - .withFileSizeInBytes(0) - .withPartitionPath("data_bucket=0") // easy way to set partition data for now - .withRecordCount(1) - .build(); + static final DataFile CONFLICT_FILE_A = + DataFiles.builder(SPEC) + .withPath("/path/to/data-a-conflict.parquet") + .withFileSizeInBytes(0) + .withPartitionPath("data_bucket=0") // easy way to set partition data for now + .withRecordCount(1) + .build(); @Parameterized.Parameters(name = "formatVersion = {0}") public static Object[] parameters() { - return new Object[] { 1, 2 }; + return new Object[] {1, 2}; } public TestSnapshotManager(int formatVersion) { @@ -56,30 +57,23 @@ public TestSnapshotManager(int formatVersion) { @Test public void testCherryPickDynamicOverwrite() { - table.newAppend() - .appendFile(FILE_A) - .commit(); + table.newAppend().appendFile(FILE_A).commit(); // stage an overwrite that replaces FILE_A - table.newReplacePartitions() - .addFile(REPLACEMENT_FILE_A) - .stageOnly() - .commit(); + table.newReplacePartitions().addFile(REPLACEMENT_FILE_A).stageOnly().commit(); Snapshot staged = Iterables.getLast(table.snapshots()); - Assert.assertEquals("Should find the staged overwrite snapshot", DataOperations.OVERWRITE, staged.operation()); + Assert.assertEquals( + "Should find the staged overwrite snapshot", DataOperations.OVERWRITE, staged.operation()); // add another append so that the original commit can't be fast-forwarded - table.newAppend() - .appendFile(FILE_B) - .commit(); + table.newAppend().appendFile(FILE_B).commit(); // pick the snapshot into the current state - table.manageSnapshots() - .cherrypick(staged.snapshotId()) - .commit(); + table.manageSnapshots().cherrypick(staged.snapshotId()).commit(); - Assert.assertNotEquals("Should not fast-forward", staged.snapshotId(), table.currentSnapshot().snapshotId()); + Assert.assertNotEquals( + "Should not fast-forward", staged.snapshotId(), table.currentSnapshot().snapshotId()); validateTableFiles(table, FILE_B, REPLACEMENT_FILE_A); } @@ -88,417 +82,361 @@ public void testCherryPickDynamicOverwriteWithoutParent() { Assert.assertNull("Table should not have a current snapshot", table.currentSnapshot()); // stage an overwrite that replaces FILE_A - table.newReplacePartitions() - .addFile(REPLACEMENT_FILE_A) - .stageOnly() - .commit(); + table.newReplacePartitions().addFile(REPLACEMENT_FILE_A).stageOnly().commit(); Snapshot staged = Iterables.getLast(table.snapshots()); - Assert.assertEquals("Should find the staged overwrite snapshot", DataOperations.OVERWRITE, staged.operation()); + Assert.assertEquals( + "Should find the staged overwrite snapshot", DataOperations.OVERWRITE, staged.operation()); // add another append so that the 
original commit can't be fast-forwarded - table.newAppend() - .appendFile(FILE_B) - .commit(); + table.newAppend().appendFile(FILE_B).commit(); // pick the snapshot into the current state - table.manageSnapshots() - .cherrypick(staged.snapshotId()) - .commit(); + table.manageSnapshots().cherrypick(staged.snapshotId()).commit(); - Assert.assertNotEquals("Should not fast-forward", staged.snapshotId(), table.currentSnapshot().snapshotId()); + Assert.assertNotEquals( + "Should not fast-forward", staged.snapshotId(), table.currentSnapshot().snapshotId()); validateTableFiles(table, FILE_B, REPLACEMENT_FILE_A); } @Test public void testCherryPickDynamicOverwriteConflict() { - table.newAppend() - .appendFile(FILE_A) - .commit(); + table.newAppend().appendFile(FILE_A).commit(); // stage an overwrite that replaces FILE_A - table.newReplacePartitions() - .addFile(REPLACEMENT_FILE_A) - .stageOnly() - .commit(); + table.newReplacePartitions().addFile(REPLACEMENT_FILE_A).stageOnly().commit(); Snapshot staged = Iterables.getLast(table.snapshots()); - Assert.assertEquals("Should find the staged overwrite snapshot", DataOperations.OVERWRITE, staged.operation()); + Assert.assertEquals( + "Should find the staged overwrite snapshot", DataOperations.OVERWRITE, staged.operation()); // add another append so that the original commit can't be fast-forwarded - table.newAppend() - .appendFile(CONFLICT_FILE_A) - .commit(); + table.newAppend().appendFile(CONFLICT_FILE_A).commit(); long lastSnapshotId = table.currentSnapshot().snapshotId(); // pick the snapshot into the current state - AssertHelpers.assertThrows("Should reject partition replacement when a partition has been modified", - ValidationException.class, "Cannot cherry-pick replace partitions with changed partition", - () -> table.manageSnapshots() - .cherrypick(staged.snapshotId()) - .commit()); - - Assert.assertEquals("Failed cherry-pick should not change the table state", - lastSnapshotId, table.currentSnapshot().snapshotId()); + AssertHelpers.assertThrows( + "Should reject partition replacement when a partition has been modified", + ValidationException.class, + "Cannot cherry-pick replace partitions with changed partition", + () -> table.manageSnapshots().cherrypick(staged.snapshotId()).commit()); + + Assert.assertEquals( + "Failed cherry-pick should not change the table state", + lastSnapshotId, + table.currentSnapshot().snapshotId()); validateTableFiles(table, FILE_A, CONFLICT_FILE_A); } @Test public void testCherryPickDynamicOverwriteDeleteConflict() { - table.newAppend() - .appendFile(FILE_A) - .commit(); + table.newAppend().appendFile(FILE_A).commit(); // stage an overwrite that replaces FILE_A - table.newReplacePartitions() - .addFile(REPLACEMENT_FILE_A) - .stageOnly() - .commit(); + table.newReplacePartitions().addFile(REPLACEMENT_FILE_A).stageOnly().commit(); Snapshot staged = Iterables.getLast(table.snapshots()); - Assert.assertEquals("Should find the staged overwrite snapshot", DataOperations.OVERWRITE, staged.operation()); + Assert.assertEquals( + "Should find the staged overwrite snapshot", DataOperations.OVERWRITE, staged.operation()); // add FILE_B s - table.newAppend() - .appendFile(FILE_B) - .commit(); + table.newAppend().appendFile(FILE_B).commit(); // delete FILE_A so the replace operation is no longer valid - table.newDelete() - .deleteFile(FILE_A) - .commit(); + table.newDelete().deleteFile(FILE_A).commit(); long lastSnapshotId = table.currentSnapshot().snapshotId(); // pick the snapshot into the current state - 
AssertHelpers.assertThrows("Should reject partition replacement when a partition has been modified", - ValidationException.class, "Missing required files to delete", - () -> table.manageSnapshots() - .cherrypick(staged.snapshotId()) - .commit()); - - Assert.assertEquals("Failed cherry-pick should not change the table state", - lastSnapshotId, table.currentSnapshot().snapshotId()); + AssertHelpers.assertThrows( + "Should reject partition replacement when a partition has been modified", + ValidationException.class, + "Missing required files to delete", + () -> table.manageSnapshots().cherrypick(staged.snapshotId()).commit()); + + Assert.assertEquals( + "Failed cherry-pick should not change the table state", + lastSnapshotId, + table.currentSnapshot().snapshotId()); validateTableFiles(table, FILE_B); } @Test public void testCherryPickFromBranch() { - table.newAppend() - .appendFile(FILE_A) - .commit(); + table.newAppend().appendFile(FILE_A).commit(); long branchSnapshotId = table.currentSnapshot().snapshotId(); // add a second commit before replacing FILE_A - table.newAppend() - .appendFile(FILE_B) - .commit(); + table.newAppend().appendFile(FILE_B).commit(); // replace FILE_A - table.newReplacePartitions() - .addFile(REPLACEMENT_FILE_A) - .commit(); + table.newReplacePartitions().addFile(REPLACEMENT_FILE_A).commit(); long replaceSnapshotId = table.currentSnapshot().snapshotId(); - // rewrite history so the replacement is in a branch, not base directly on an ancestor of the current state - table.manageSnapshots() - .rollbackTo(branchSnapshotId) - .commit(); + // rewrite history so the replacement is in a branch, not base directly on an ancestor of the + // current state + table.manageSnapshots().rollbackTo(branchSnapshotId).commit(); long lastSnapshotId = table.currentSnapshot().snapshotId(); // pick the snapshot into the current state - AssertHelpers.assertThrows("Should reject partition replacement when a partition has been modified", - ValidationException.class, "Cannot cherry-pick overwrite not based on an ancestor of the current state", + AssertHelpers.assertThrows( + "Should reject partition replacement when a partition has been modified", + ValidationException.class, + "Cannot cherry-pick overwrite not based on an ancestor of the current state", () -> table.manageSnapshots().cherrypick(replaceSnapshotId).commit()); - Assert.assertEquals("Failed cherry-pick should not change the table state", - lastSnapshotId, table.currentSnapshot().snapshotId()); + Assert.assertEquals( + "Failed cherry-pick should not change the table state", + lastSnapshotId, + table.currentSnapshot().snapshotId()); validateTableFiles(table, FILE_A); } @Test public void testCherryPickOverwrite() { - table.newAppend() - .appendFile(FILE_A) - .commit(); + table.newAppend().appendFile(FILE_A).commit(); // stage an overwrite to replace FILE_A - table.newOverwrite() - .deleteFile(FILE_A) - .addFile(REPLACEMENT_FILE_A) - .stageOnly() - .commit(); + table.newOverwrite().deleteFile(FILE_A).addFile(REPLACEMENT_FILE_A).stageOnly().commit(); Snapshot staged = Iterables.getLast(table.snapshots()); - Assert.assertEquals("Should find the staged overwrite snapshot", DataOperations.OVERWRITE, staged.operation()); + Assert.assertEquals( + "Should find the staged overwrite snapshot", DataOperations.OVERWRITE, staged.operation()); // add another append so that the original commit can't be fast-forwarded - table.newAppend() - .appendFile(FILE_B) - .commit(); + table.newAppend().appendFile(FILE_B).commit(); long lastSnapshotId = 
table.currentSnapshot().snapshotId(); // pick the snapshot into the current state - AssertHelpers.assertThrows("Should reject partition replacement when a partition has been modified", - ValidationException.class, "not append, dynamic overwrite, or fast-forward", - () -> table.manageSnapshots() - .cherrypick(staged.snapshotId()) - .commit()); - - Assert.assertEquals("Failed cherry-pick should not change the table state", - lastSnapshotId, table.currentSnapshot().snapshotId()); + AssertHelpers.assertThrows( + "Should reject partition replacement when a partition has been modified", + ValidationException.class, + "not append, dynamic overwrite, or fast-forward", + () -> table.manageSnapshots().cherrypick(staged.snapshotId()).commit()); + + Assert.assertEquals( + "Failed cherry-pick should not change the table state", + lastSnapshotId, + table.currentSnapshot().snapshotId()); validateTableFiles(table, FILE_A, FILE_B); } @Test public void testCreateBranch() { - table.newAppend() - .appendFile(FILE_A) - .commit(); + table.newAppend().appendFile(FILE_A).commit(); long snapshotId = table.currentSnapshot().snapshotId(); // Test a basic case of creating a branch - table.manageSnapshots() - .createBranch("branch1", snapshotId) - .commit(); + table.manageSnapshots().createBranch("branch1", snapshotId).commit(); SnapshotRef expectedBranch = table.ops().refresh().ref("branch1"); - Assert.assertTrue(expectedBranch != null && - expectedBranch.equals(SnapshotRef.branchBuilder(snapshotId).build())); + Assert.assertTrue( + expectedBranch != null + && expectedBranch.equals(SnapshotRef.branchBuilder(snapshotId).build())); } @Test public void testCreateBranchFailsWhenRefAlreadyExists() { - table.newAppend() - .appendFile(FILE_A) - .commit(); + table.newAppend().appendFile(FILE_A).commit(); long snapshotId = table.currentSnapshot().snapshotId(); - table.manageSnapshots() - .createBranch("branch1", snapshotId) - .commit(); + table.manageSnapshots().createBranch("branch1", snapshotId).commit(); // Trying to create a branch with an existing name should fail - AssertHelpers.assertThrows("Creating branch which already exists should fail", - IllegalArgumentException.class, "Ref branch1 already exists", + AssertHelpers.assertThrows( + "Creating branch which already exists should fail", + IllegalArgumentException.class, + "Ref branch1 already exists", () -> table.manageSnapshots().createBranch("branch1", snapshotId).commit()); // Trying to create another branch within the same chain - AssertHelpers.assertThrows("Creating branch which already exists should fail", - IllegalArgumentException.class, "Ref branch2 already exists", - () -> table.manageSnapshots().createBranch("branch2", snapshotId).createBranch("branch2", snapshotId).commit()); + AssertHelpers.assertThrows( + "Creating branch which already exists should fail", + IllegalArgumentException.class, + "Ref branch2 already exists", + () -> + table + .manageSnapshots() + .createBranch("branch2", snapshotId) + .createBranch("branch2", snapshotId) + .commit()); } - @Test public void testCreateTag() { - table.newAppend() - .appendFile(FILE_A) - .commit(); + table.newAppend().appendFile(FILE_A).commit(); long snapshotId = table.currentSnapshot().snapshotId(); // Test a basic case of creating a tag - table.manageSnapshots() - .createTag("tag1", snapshotId) - .commit(); + table.manageSnapshots().createTag("tag1", snapshotId).commit(); SnapshotRef expectedTag = table.ops().refresh().ref("tag1"); - Assert.assertTrue(expectedTag != null && - 
expectedTag.equals(SnapshotRef.tagBuilder(snapshotId).build())); + Assert.assertTrue( + expectedTag != null && expectedTag.equals(SnapshotRef.tagBuilder(snapshotId).build())); } @Test public void testCreateTagFailsWhenRefAlreadyExists() { - table.newAppend() - .appendFile(FILE_A) - .commit(); + table.newAppend().appendFile(FILE_A).commit(); long snapshotId = table.currentSnapshot().snapshotId(); - table.manageSnapshots() - .createTag("tag1", snapshotId) - .commit(); + table.manageSnapshots().createTag("tag1", snapshotId).commit(); // Trying to create a tag with an existing name should fail - AssertHelpers.assertThrows("Creating tag which already exists should fail", - IllegalArgumentException.class, "Ref tag1 already exists", + AssertHelpers.assertThrows( + "Creating tag which already exists should fail", + IllegalArgumentException.class, + "Ref tag1 already exists", () -> table.manageSnapshots().createTag("tag1", snapshotId).commit()); // Trying to create another tag within the same chain - AssertHelpers.assertThrows("Creating branch which already exists should fail", - IllegalArgumentException.class, "Ref tag2 already exists", - () -> table.manageSnapshots() - .createTag("tag2", snapshotId) - .createTag("tag2", snapshotId).commit()); + AssertHelpers.assertThrows( + "Creating branch which already exists should fail", + IllegalArgumentException.class, + "Ref tag2 already exists", + () -> + table + .manageSnapshots() + .createTag("tag2", snapshotId) + .createTag("tag2", snapshotId) + .commit()); } @Test public void testRemoveBranch() { - table.newAppend() - .appendFile(FILE_A) - .commit(); + table.newAppend().appendFile(FILE_A).commit(); long snapshotId = table.currentSnapshot().snapshotId(); // Test a basic case of creating and then removing a branch and tag - table.manageSnapshots() - .createBranch("branch1", snapshotId) - .commit(); - table.manageSnapshots() - .removeBranch("branch1") - .commit(); + table.manageSnapshots().createBranch("branch1", snapshotId).commit(); + table.manageSnapshots().removeBranch("branch1").commit(); TableMetadata updated = table.ops().refresh(); SnapshotRef expectedBranch = updated.ref("branch1"); Assert.assertNull(expectedBranch); // Test chained creating and removal of branch and tag - table.manageSnapshots() - .createBranch("branch2", snapshotId) - .removeBranch("branch2") - .commit(); + table.manageSnapshots().createBranch("branch2", snapshotId).removeBranch("branch2").commit(); updated = table.ops().refresh(); Assert.assertNull(updated.ref("branch2")); } @Test public void testRemovingNonExistingBranchFails() { - AssertHelpers.assertThrows("Trying to remove non-existent branch should fail", - IllegalArgumentException.class, "Branch does not exist: non-existing", + AssertHelpers.assertThrows( + "Trying to remove non-existent branch should fail", + IllegalArgumentException.class, + "Branch does not exist: non-existing", () -> table.manageSnapshots().removeBranch("non-existing").commit()); } @Test public void testRemovingMainBranchFails() { - AssertHelpers.assertThrows("Removing main should fail", - IllegalArgumentException.class, "Cannot remove main branch", + AssertHelpers.assertThrows( + "Removing main should fail", + IllegalArgumentException.class, + "Cannot remove main branch", () -> table.manageSnapshots().removeBranch(SnapshotRef.MAIN_BRANCH).commit()); } @Test public void testRemoveTag() { - table.newAppend() - .appendFile(FILE_A) - .commit(); + table.newAppend().appendFile(FILE_A).commit(); long snapshotId = table.currentSnapshot().snapshotId(); // 
Test a basic case of creating and then removing a branch and tag - table.manageSnapshots() - .createTag("tag1", snapshotId) - .commit(); - table.manageSnapshots() - .removeTag("tag1") - .commit(); + table.manageSnapshots().createTag("tag1", snapshotId).commit(); + table.manageSnapshots().removeTag("tag1").commit(); TableMetadata updated = table.ops().refresh(); SnapshotRef expectedTag = updated.ref("tag1"); Assert.assertNull(expectedTag); // Test chained creating and removal of a tag - table.manageSnapshots() - .createTag("tag2", snapshotId) - .removeTag("tag2") - .commit(); + table.manageSnapshots().createTag("tag2", snapshotId).removeTag("tag2").commit(); Assert.assertEquals(updated, table.ops().refresh()); Assert.assertNull(updated.ref("tag2")); } @Test public void testRemovingNonExistingTagFails() { - AssertHelpers.assertThrows("Removing a non-existing tag should fail", - IllegalArgumentException.class, "Tag does not exist: non-existing", + AssertHelpers.assertThrows( + "Removing a non-existing tag should fail", + IllegalArgumentException.class, + "Tag does not exist: non-existing", () -> table.manageSnapshots().removeTag("non-existing").commit()); } @Test public void testReplaceBranch() { - table.newAppend() - .appendFile(FILE_A) - .set("wap.id", "123") - .stageOnly() - .commit(); + table.newAppend().appendFile(FILE_A).set("wap.id", "123").stageOnly().commit(); Snapshot firstSnapshot = Iterables.getOnlyElement(table.snapshots()); table.manageSnapshots().createBranch("branch1", firstSnapshot.snapshotId()).commit(); - table.newAppend() - .appendFile(FILE_B) - .set("wap.id", "456") - .stageOnly() - .commit(); + table.newAppend().appendFile(FILE_B).set("wap.id", "456").stageOnly().commit(); Snapshot secondSnapshot = Iterables.get(table.snapshots(), 1); table.manageSnapshots().createBranch("branch2", secondSnapshot.snapshotId()).commit(); table.manageSnapshots().replaceBranch("branch1", "branch2").commit(); - Assert.assertEquals(table.ops().refresh().ref("branch1").snapshotId(), secondSnapshot.snapshotId()); + Assert.assertEquals( + table.ops().refresh().ref("branch1").snapshotId(), secondSnapshot.snapshotId()); } @Test public void testReplaceBranchNonExistingTargetBranchFails() { - AssertHelpers.assertThrows("Replacing a non-existing branch should fail", - IllegalArgumentException.class, "Target branch does not exist: non-existing", + AssertHelpers.assertThrows( + "Replacing a non-existing branch should fail", + IllegalArgumentException.class, + "Target branch does not exist: non-existing", () -> table.manageSnapshots().replaceBranch("non-existing", "other-branch").commit()); } @Test public void testReplaceBranchNonExistingSourceFails() { - table.newAppend() - .appendFile(FILE_A) - .commit(); + table.newAppend().appendFile(FILE_A).commit(); long snapshotId = table.currentSnapshot().snapshotId(); - table.manageSnapshots() - .createBranch("branch1", snapshotId) - .commit(); - AssertHelpers.assertThrows("Replacing where the source ref does not exist should fail", - IllegalArgumentException.class, "Ref does not exist: non-existing", + table.manageSnapshots().createBranch("branch1", snapshotId).commit(); + AssertHelpers.assertThrows( + "Replacing where the source ref does not exist should fail", + IllegalArgumentException.class, + "Ref does not exist: non-existing", () -> table.manageSnapshots().replaceBranch("branch1", "non-existing").commit()); } @Test public void testFastForward() { - table.newAppend() - .appendFile(FILE_A) - .commit(); + table.newAppend().appendFile(FILE_A).commit(); - 
table.newAppend() - .appendFile(FILE_B) - .set("wap.id", "123456789") - .stageOnly() - .commit(); + table.newAppend().appendFile(FILE_B).set("wap.id", "123456789").stageOnly().commit(); Assert.assertEquals(table.currentSnapshot().snapshotId(), 1); table.manageSnapshots().createBranch("new-branch-at-staged-snapshot", 2).commit(); - table.manageSnapshots().fastForwardBranch(SnapshotRef.MAIN_BRANCH, "new-branch-at-staged-snapshot").commit(); + table + .manageSnapshots() + .fastForwardBranch(SnapshotRef.MAIN_BRANCH, "new-branch-at-staged-snapshot") + .commit(); Assert.assertEquals(table.currentSnapshot().snapshotId(), 2); } @Test public void testFastForwardWhenTargetIsNotAncestorFails() { - table.newAppend() - .appendFile(FILE_A) - .commit(); + table.newAppend().appendFile(FILE_A).commit(); - table.newAppend() - .appendFile(FILE_B) - .set("wap.id", "123456789") - .stageOnly() - .commit(); + table.newAppend().appendFile(FILE_B).set("wap.id", "123456789").stageOnly().commit(); long snapshot = table.currentSnapshot().snapshotId(); // Commit a snapshot on main to deviate the branches - table.newAppend() - .appendFile(FILE_C) - .commit(); + table.newAppend().appendFile(FILE_C).commit(); final String newBranch = "new-branch-at-staged-snapshot"; table.manageSnapshots().createBranch(newBranch, snapshot).commit(); - AssertHelpers.assertThrows("Fast-forward should fail if target is not an ancestor of the source", - IllegalArgumentException.class, "Cannot fast-forward: main is not an ancestor of new-branch-at-staged-snapshot", - () -> table.manageSnapshots().fastForwardBranch(SnapshotRef.MAIN_BRANCH, newBranch).commit()); + AssertHelpers.assertThrows( + "Fast-forward should fail if target is not an ancestor of the source", + IllegalArgumentException.class, + "Cannot fast-forward: main is not an ancestor of new-branch-at-staged-snapshot", + () -> + table.manageSnapshots().fastForwardBranch(SnapshotRef.MAIN_BRANCH, newBranch).commit()); } @Test public void testReplaceTag() { - table.newAppend() - .appendFile(FILE_A) - .commit(); + table.newAppend().appendFile(FILE_A).commit(); long snapshotId = table.currentSnapshot().snapshotId(); - table.manageSnapshots() - .createTag("tag1", snapshotId) - .commit(); + table.manageSnapshots().createTag("tag1", snapshotId).commit(); // Create a new snapshot and replace the tip of branch1 to be the new snapshot - table.newAppend() - .appendFile(FILE_B) - .commit(); + table.newAppend().appendFile(FILE_B).commit(); long currentSnapshot = table.ops().refresh().currentSnapshot().snapshotId(); table.manageSnapshots().replaceTag("tag1", currentSnapshot).commit(); Assert.assertEquals(table.ops().refresh().ref("tag1").snapshotId(), currentSnapshot); @@ -506,15 +444,12 @@ public void testReplaceTag() { @Test public void testUpdatingBranchRetention() { - table.newAppend() - .appendFile(FILE_A) - .commit(); + table.newAppend().appendFile(FILE_A).commit(); long snapshotId = table.currentSnapshot().snapshotId(); // Test creating and updating independently - table.manageSnapshots() - .createBranch("branch1", snapshotId) - .commit(); - table.manageSnapshots() + table.manageSnapshots().createBranch("branch1", snapshotId).commit(); + table + .manageSnapshots() .setMinSnapshotsToKeep("branch1", 10) .setMaxSnapshotAgeMs("branch1", 20000) .commit(); @@ -522,7 +457,8 @@ public void testUpdatingBranchRetention() { Assert.assertEquals(20000, (long) updated.ref("branch1").maxSnapshotAgeMs()); Assert.assertEquals(10, (long) updated.ref("branch1").minSnapshotsToKeep()); // Test creating and 
updating in a chain - table.manageSnapshots() + table + .manageSnapshots() .createBranch("branch2", snapshotId) .setMinSnapshotsToKeep("branch2", 10) .setMaxSnapshotAgeMs("branch2", 20000) @@ -534,34 +470,40 @@ public void testUpdatingBranchRetention() { @Test public void testSettingBranchRetentionOnTagFails() { - table.newAppend() - .appendFile(FILE_A) - .commit(); + table.newAppend().appendFile(FILE_A).commit(); long snapshotId = table.currentSnapshot().snapshotId(); - AssertHelpers.assertThrows("Setting minSnapshotsToKeep should fail for tags", - IllegalArgumentException.class, "Tags do not support setting minSnapshotsToKeep", - () -> table.manageSnapshots().createTag("tag1", snapshotId).setMinSnapshotsToKeep("tag1", 10).commit()); - AssertHelpers.assertThrows("Setting maxSnapshotAgeMs should fail for tags", - IllegalArgumentException.class, "Tags do not support setting maxSnapshotAgeMs", - () -> table.manageSnapshots().createTag("tag1", snapshotId).setMaxSnapshotAgeMs("tag1", 10).commit()); + AssertHelpers.assertThrows( + "Setting minSnapshotsToKeep should fail for tags", + IllegalArgumentException.class, + "Tags do not support setting minSnapshotsToKeep", + () -> + table + .manageSnapshots() + .createTag("tag1", snapshotId) + .setMinSnapshotsToKeep("tag1", 10) + .commit()); + AssertHelpers.assertThrows( + "Setting maxSnapshotAgeMs should fail for tags", + IllegalArgumentException.class, + "Tags do not support setting maxSnapshotAgeMs", + () -> + table + .manageSnapshots() + .createTag("tag1", snapshotId) + .setMaxSnapshotAgeMs("tag1", 10) + .commit()); } @Test public void testUpdatingBranchMaxRefAge() { - table.newAppend() - .appendFile(FILE_A) - .commit(); + table.newAppend().appendFile(FILE_A).commit(); long snapshotId = table.currentSnapshot().snapshotId(); final long maxRefAgeMs = 10000; // Test creating and updating independently - table.manageSnapshots() - .createBranch("branch1", snapshotId) - .commit(); - table.manageSnapshots() - .setMaxRefAgeMs("branch1", 10000) - .commit(); + table.manageSnapshots().createBranch("branch1", snapshotId).commit(); + table.manageSnapshots().setMaxRefAgeMs("branch1", 10000).commit(); TableMetadata updated = table.ops().refresh(); Assert.assertEquals(maxRefAgeMs, (long) updated.ref("branch1").maxRefAgeMs()); Assert.assertEquals(maxRefAgeMs, (long) updated.ref("branch1").maxRefAgeMs()); @@ -569,25 +511,20 @@ public void testUpdatingBranchMaxRefAge() { @Test public void testUpdatingTagMaxRefAge() { - table.newAppend() - .appendFile(FILE_A) - .commit(); + table.newAppend().appendFile(FILE_A).commit(); long snapshotId = table.currentSnapshot().snapshotId(); final long maxRefAgeMs = 10000; // Test creating and updating independently - table.manageSnapshots() - .createTag("tag1", snapshotId) - .commit(); - table.manageSnapshots() - .setMaxRefAgeMs("tag1", maxRefAgeMs) - .commit(); + table.manageSnapshots().createTag("tag1", snapshotId).commit(); + table.manageSnapshots().setMaxRefAgeMs("tag1", maxRefAgeMs).commit(); TableMetadata updated = table.ops().refresh(); Assert.assertEquals(maxRefAgeMs, (long) updated.ref("tag1").maxRefAgeMs()); // Test creating and updating in a chain - table.manageSnapshots() + table + .manageSnapshots() .createTag("tag2", snapshotId) .setMaxRefAgeMs("tag2", maxRefAgeMs) .commit(); @@ -597,26 +534,19 @@ public void testUpdatingTagMaxRefAge() { @Test public void testRenameBranch() { - table.newAppend() - .appendFile(FILE_A) - .commit(); - table.newAppend() - .appendFile(FILE_A) - .commit(); + 
table.newAppend().appendFile(FILE_A).commit(); + table.newAppend().appendFile(FILE_A).commit(); long snapshotId = table.currentSnapshot().snapshotId(); // Test creating and renaming independently - table.manageSnapshots() - .createBranch("branch1", snapshotId) - .commit(); - table.manageSnapshots() - .renameBranch("branch1", "branch2") - .commit(); + table.manageSnapshots().createBranch("branch1", snapshotId).commit(); + table.manageSnapshots().renameBranch("branch1", "branch2").commit(); TableMetadata updated = table.ops().refresh(); Assert.assertNull(updated.ref("branch1")); Assert.assertEquals(updated.ref("branch2"), SnapshotRef.branchBuilder(snapshotId).build()); - table.manageSnapshots() + table + .manageSnapshots() .createBranch("branch3", snapshotId) .renameBranch("branch3", "branch4") .commit(); @@ -628,29 +558,31 @@ public void testRenameBranch() { @Test public void testFailRenamingMainBranch() { - AssertHelpers.assertThrows("Renaming main branch should fail", - IllegalArgumentException.class, "Cannot rename main branch", - () -> table.manageSnapshots().renameBranch(SnapshotRef.MAIN_BRANCH, "some-branch").commit()); + AssertHelpers.assertThrows( + "Renaming main branch should fail", + IllegalArgumentException.class, + "Cannot rename main branch", + () -> + table.manageSnapshots().renameBranch(SnapshotRef.MAIN_BRANCH, "some-branch").commit()); } @Test public void testRenamingNonExistingBranchFails() { - AssertHelpers.assertThrows("Renaming non-existent branch should fail", - IllegalArgumentException.class, "Branch does not exist: some-missing-branch", + AssertHelpers.assertThrows( + "Renaming non-existent branch should fail", + IllegalArgumentException.class, + "Branch does not exist: some-missing-branch", () -> table.manageSnapshots().renameBranch("some-missing-branch", "some-branch").commit()); } @Test public void testCreateReferencesAndRollback() { - table.newAppend() - .appendFile(FILE_A) - .commit(); - table.newAppend() - .appendFile(FILE_A) - .commit(); + table.newAppend().appendFile(FILE_A).commit(); + table.newAppend().appendFile(FILE_A).commit(); long snapshotPriorToRollback = table.currentSnapshot().snapshotId(); - table.manageSnapshots() + table + .manageSnapshots() .createBranch("branch1", snapshotPriorToRollback) .createTag("tag1", snapshotPriorToRollback) .rollbackTo(1) @@ -667,19 +599,15 @@ public void testCreateReferencesAndRollback() { @Test public void testCreateReferencesAndCherrypick() { - table.newAppend() - .appendFile(FILE_A) - .commit(); + table.newAppend().appendFile(FILE_A).commit(); long currentSnapshot = table.currentSnapshot().snapshotId(); // stage an overwrite that replaces FILE_A - table.newReplacePartitions() - .addFile(REPLACEMENT_FILE_A) - .stageOnly() - .commit(); + table.newReplacePartitions().addFile(REPLACEMENT_FILE_A).stageOnly().commit(); Snapshot staged = Iterables.getLast(table.snapshots()); - table.manageSnapshots() + table + .manageSnapshots() .createBranch("branch1", currentSnapshot) .createTag("tag1", currentSnapshot) .cherrypick(staged.snapshotId()) diff --git a/core/src/test/java/org/apache/iceberg/TestSnapshotRefParser.java b/core/src/test/java/org/apache/iceberg/TestSnapshotRefParser.java index e5ade0eb7083..bc13fd7d1303 100644 --- a/core/src/test/java/org/apache/iceberg/TestSnapshotRefParser.java +++ b/core/src/test/java/org/apache/iceberg/TestSnapshotRefParser.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg; import org.junit.Assert; @@ -28,131 +27,156 @@ public class TestSnapshotRefParser { public void testTagToJsonDefault() { String json = "{\"snapshot-id\":1,\"type\":\"tag\"}"; SnapshotRef ref = SnapshotRef.tagBuilder(1L).build(); - Assert.assertEquals("Should be able to serialize default tag", - json, SnapshotRefParser.toJson(ref)); + Assert.assertEquals( + "Should be able to serialize default tag", json, SnapshotRefParser.toJson(ref)); } @Test public void testTagToJsonAllFields() { String json = "{\"snapshot-id\":1,\"type\":\"tag\",\"max-ref-age-ms\":1}"; - SnapshotRef ref = SnapshotRef.tagBuilder(1L) - .maxRefAgeMs(1L) - .build(); - Assert.assertEquals("Should be able to serialize tag with all fields", - json, SnapshotRefParser.toJson(ref)); + SnapshotRef ref = SnapshotRef.tagBuilder(1L).maxRefAgeMs(1L).build(); + Assert.assertEquals( + "Should be able to serialize tag with all fields", json, SnapshotRefParser.toJson(ref)); } @Test public void testBranchToJsonDefault() { String json = "{\"snapshot-id\":1,\"type\":\"branch\"}"; SnapshotRef ref = SnapshotRef.branchBuilder(1L).build(); - Assert.assertEquals("Should be able to serialize default branch", - json, SnapshotRefParser.toJson(ref)); + Assert.assertEquals( + "Should be able to serialize default branch", json, SnapshotRefParser.toJson(ref)); } @Test public void testBranchToJsonAllFields() { - String json = "{\"snapshot-id\":1,\"type\":\"branch\",\"min-snapshots-to-keep\":2," + - "\"max-snapshot-age-ms\":3,\"max-ref-age-ms\":4}"; - SnapshotRef ref = SnapshotRef.branchBuilder(1L) - .minSnapshotsToKeep(2) - .maxSnapshotAgeMs(3L) - .maxRefAgeMs(4L) - .build(); - Assert.assertEquals("Should be able to serialize branch with all fields", - json, SnapshotRefParser.toJson(ref)); + String json = + "{\"snapshot-id\":1,\"type\":\"branch\",\"min-snapshots-to-keep\":2," + + "\"max-snapshot-age-ms\":3,\"max-ref-age-ms\":4}"; + SnapshotRef ref = + SnapshotRef.branchBuilder(1L) + .minSnapshotsToKeep(2) + .maxSnapshotAgeMs(3L) + .maxRefAgeMs(4L) + .build(); + Assert.assertEquals( + "Should be able to serialize branch with all fields", json, SnapshotRefParser.toJson(ref)); } @Test public void testTagFromJsonDefault() { String json = "{\"snapshot-id\":1,\"type\":\"tag\"}"; SnapshotRef ref = SnapshotRef.tagBuilder(1L).build(); - Assert.assertEquals("Should be able to deserialize default tag", ref, SnapshotRefParser.fromJson(json)); + Assert.assertEquals( + "Should be able to deserialize default tag", ref, SnapshotRefParser.fromJson(json)); } @Test public void testTagFromJsonAllFields() { String json = "{\"snapshot-id\":1,\"type\":\"tag\",\"max-ref-age-ms\":1}"; - SnapshotRef ref = SnapshotRef.tagBuilder(1L) - .maxRefAgeMs(1L) - .build(); - Assert.assertEquals("Should be able to deserialize tag with all fields", ref, SnapshotRefParser.fromJson(json)); + SnapshotRef ref = SnapshotRef.tagBuilder(1L).maxRefAgeMs(1L).build(); + Assert.assertEquals( + "Should be able to deserialize tag with all fields", ref, SnapshotRefParser.fromJson(json)); } @Test public void testBranchFromJsonDefault() { String json = "{\"snapshot-id\":1,\"type\":\"branch\"}"; SnapshotRef ref = SnapshotRef.branchBuilder(1L).build(); - Assert.assertEquals("Should be able to deserialize default branch", ref, SnapshotRefParser.fromJson(json)); + Assert.assertEquals( + "Should be able to deserialize default branch", ref, SnapshotRefParser.fromJson(json)); } @Test public void testBranchFromJsonAllFields() { - String json = 
"{\"snapshot-id\":1,\"type\":\"branch\",\"min-snapshots-to-keep\":2," + - "\"max-snapshot-age-ms\":3,\"max-ref-age-ms\":4}"; - SnapshotRef ref = SnapshotRef.branchBuilder(1L) - .minSnapshotsToKeep(2) - .maxSnapshotAgeMs(3L) - .maxRefAgeMs(4L) - .build(); - Assert.assertEquals("Should be able to deserialize branch with all fields", ref, SnapshotRefParser.fromJson(json)); + String json = + "{\"snapshot-id\":1,\"type\":\"branch\",\"min-snapshots-to-keep\":2," + + "\"max-snapshot-age-ms\":3,\"max-ref-age-ms\":4}"; + SnapshotRef ref = + SnapshotRef.branchBuilder(1L) + .minSnapshotsToKeep(2) + .maxSnapshotAgeMs(3L) + .maxRefAgeMs(4L) + .build(); + Assert.assertEquals( + "Should be able to deserialize branch with all fields", + ref, + SnapshotRefParser.fromJson(json)); } @Test public void testFailParsingWhenNullOrEmptyJson() { String nullJson = null; - AssertHelpers.assertThrows("SnapshotRefParser should fail to deserialize null JSON string", - IllegalArgumentException.class, "Cannot parse snapshot ref from invalid JSON", - () -> SnapshotRefParser.fromJson(nullJson)); + AssertHelpers.assertThrows( + "SnapshotRefParser should fail to deserialize null JSON string", + IllegalArgumentException.class, + "Cannot parse snapshot ref from invalid JSON", + () -> SnapshotRefParser.fromJson(nullJson)); String emptyJson = ""; - AssertHelpers.assertThrows("SnapshotRefParser should fail to deserialize empty JSON string", - IllegalArgumentException.class, "Cannot parse snapshot ref from invalid JSON", - () -> SnapshotRefParser.fromJson(emptyJson)); + AssertHelpers.assertThrows( + "SnapshotRefParser should fail to deserialize empty JSON string", + IllegalArgumentException.class, + "Cannot parse snapshot ref from invalid JSON", + () -> SnapshotRefParser.fromJson(emptyJson)); } @Test public void testFailParsingWhenMissingRequiredFields() { String refMissingType = "{\"snapshot-id\":1}"; - AssertHelpers.assertThrows("SnapshotRefParser should fail to deserialize ref with missing type", - IllegalArgumentException.class, "Cannot parse missing string", - () -> SnapshotRefParser.fromJson(refMissingType)); + AssertHelpers.assertThrows( + "SnapshotRefParser should fail to deserialize ref with missing type", + IllegalArgumentException.class, + "Cannot parse missing string", + () -> SnapshotRefParser.fromJson(refMissingType)); String refMissingSnapshotId = "{\"type\":\"branch\"}"; - AssertHelpers.assertThrows("SnapshotRefParser should fail to deserialize ref with missing snapshot id", - IllegalArgumentException.class, "Cannot parse missing long", - () -> SnapshotRefParser.fromJson(refMissingSnapshotId)); + AssertHelpers.assertThrows( + "SnapshotRefParser should fail to deserialize ref with missing snapshot id", + IllegalArgumentException.class, + "Cannot parse missing long", + () -> SnapshotRefParser.fromJson(refMissingSnapshotId)); } @Test public void testFailWhenFieldsHaveInvalidValues() { - String invalidSnapshotId = "{\"snapshot-id\":\"invalid-snapshot-id\",\"type\":\"not-a-valid-tag-type\"}"; - AssertHelpers.assertThrows("SnapshotRefParser should fail to deserialize ref with invalid snapshot id", + String invalidSnapshotId = + "{\"snapshot-id\":\"invalid-snapshot-id\",\"type\":\"not-a-valid-tag-type\"}"; + AssertHelpers.assertThrows( + "SnapshotRefParser should fail to deserialize ref with invalid snapshot id", IllegalArgumentException.class, "Cannot parse snapshot-id to a long value", - () -> SnapshotRefParser.fromJson(invalidSnapshotId)); + () -> SnapshotRefParser.fromJson(invalidSnapshotId)); String invalidTagType = 
"{\"snapshot-id\":1,\"type\":\"not-a-valid-tag-type\"}"; - AssertHelpers.assertThrows("SnapshotRefParser should fail to deserialize ref with invalid tag", + AssertHelpers.assertThrows( + "SnapshotRefParser should fail to deserialize ref with invalid tag", IllegalArgumentException.class, "No enum constant", - () -> SnapshotRefParser.fromJson(invalidTagType)); - - String invalidRefAge = "{\"snapshot-id\":1,\"type\":\"tag\",\"max-ref-age-ms\":\"not-a-valid-value\"}"; - AssertHelpers.assertThrows("SnapshotRefParser should fail to deserialize ref with invalid ref age", - IllegalArgumentException.class, "Cannot parse max-ref-age-ms to a long", - () -> SnapshotRefParser.fromJson(invalidRefAge)); - - String invalidSnapshotsToKeep = "{\"snapshot-id\":1,\"type\":\"branch\", " + - "\"min-snapshots-to-keep\":\"invalid-number\"}"; - AssertHelpers.assertThrows("SnapshotRefParser should fail to deserialize ref with missing snapshot id", - IllegalArgumentException.class, "Cannot parse min-snapshots-to-keep to an integer value", - () -> SnapshotRefParser.fromJson(invalidSnapshotsToKeep)); - - String invalidMaxSnapshotAge = "{\"snapshot-id\":1,\"type\":\"branch\", " + - "\"max-snapshot-age-ms\":\"invalid-age\"}"; - AssertHelpers.assertThrows("SnapshotRefParser should fail to deserialize ref with missing snapshot id", - IllegalArgumentException.class, "Cannot parse max-snapshot-age-ms to a long value", - () -> SnapshotRefParser.fromJson(invalidMaxSnapshotAge)); + () -> SnapshotRefParser.fromJson(invalidTagType)); + + String invalidRefAge = + "{\"snapshot-id\":1,\"type\":\"tag\",\"max-ref-age-ms\":\"not-a-valid-value\"}"; + AssertHelpers.assertThrows( + "SnapshotRefParser should fail to deserialize ref with invalid ref age", + IllegalArgumentException.class, + "Cannot parse max-ref-age-ms to a long", + () -> SnapshotRefParser.fromJson(invalidRefAge)); + + String invalidSnapshotsToKeep = + "{\"snapshot-id\":1,\"type\":\"branch\", " + + "\"min-snapshots-to-keep\":\"invalid-number\"}"; + AssertHelpers.assertThrows( + "SnapshotRefParser should fail to deserialize ref with missing snapshot id", + IllegalArgumentException.class, + "Cannot parse min-snapshots-to-keep to an integer value", + () -> SnapshotRefParser.fromJson(invalidSnapshotsToKeep)); + + String invalidMaxSnapshotAge = + "{\"snapshot-id\":1,\"type\":\"branch\", " + "\"max-snapshot-age-ms\":\"invalid-age\"}"; + AssertHelpers.assertThrows( + "SnapshotRefParser should fail to deserialize ref with missing snapshot id", + IllegalArgumentException.class, + "Cannot parse max-snapshot-age-ms to a long value", + () -> SnapshotRefParser.fromJson(invalidMaxSnapshotAge)); } } diff --git a/core/src/test/java/org/apache/iceberg/TestSnapshotSelection.java b/core/src/test/java/org/apache/iceberg/TestSnapshotSelection.java index 8002ecb78c66..1a9f4646e81a 100644 --- a/core/src/test/java/org/apache/iceberg/TestSnapshotSelection.java +++ b/core/src/test/java/org/apache/iceberg/TestSnapshotSelection.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg; import java.nio.ByteBuffer; @@ -32,7 +31,7 @@ public class TestSnapshotSelection extends TableTestBase { @Parameterized.Parameters(name = "formatVersion = {0}") public static Object[] parameters() { - return new Object[] { 1, 2 }; + return new Object[] {1, 2}; } public TestSnapshotSelection(int formatVersion) { @@ -43,14 +42,10 @@ public TestSnapshotSelection(int formatVersion) { public void testSnapshotSelectionById() { Assert.assertEquals("Table should start empty", 0, listManifestFiles().size()); - table.newFastAppend() - .appendFile(FILE_A) - .commit(); + table.newFastAppend().appendFile(FILE_A).commit(); Snapshot firstSnapshot = table.currentSnapshot(); - table.newFastAppend() - .appendFile(FILE_B) - .commit(); + table.newFastAppend().appendFile(FILE_B).commit(); Snapshot secondSnapshot = table.currentSnapshot(); Assert.assertEquals("Table should have two snapshots", 2, Iterables.size(table.snapshots())); @@ -60,23 +55,24 @@ public void testSnapshotSelectionById() { @Test public void testSnapshotStatsForAddedFiles() { - DataFile fileWithStats = DataFiles.builder(SPEC) - .withPath("/path/to/data-with-stats.parquet") - .withFileSizeInBytes(10) - .withPartitionPath("data_bucket=0") - .withRecordCount(10) - .withMetrics(new Metrics(3L, - null, // no column sizes - ImmutableMap.of(1, 3L), // value count - ImmutableMap.of(1, 0L), // null count - null, - ImmutableMap.of(1, longToBuffer(20L)), // lower bounds - ImmutableMap.of(1, longToBuffer(22L)))) // upper bounds - .build(); + DataFile fileWithStats = + DataFiles.builder(SPEC) + .withPath("/path/to/data-with-stats.parquet") + .withFileSizeInBytes(10) + .withPartitionPath("data_bucket=0") + .withRecordCount(10) + .withMetrics( + new Metrics( + 3L, + null, // no column sizes + ImmutableMap.of(1, 3L), // value count + ImmutableMap.of(1, 0L), // null count + null, + ImmutableMap.of(1, longToBuffer(20L)), // lower bounds + ImmutableMap.of(1, longToBuffer(22L)))) // upper bounds + .build(); - table.newFastAppend() - .appendFile(fileWithStats) - .commit(); + table.newFastAppend().appendFile(fileWithStats).commit(); Snapshot snapshot = table.currentSnapshot(); Iterable addedFiles = snapshot.addedDataFiles(table.io()); diff --git a/core/src/test/java/org/apache/iceberg/TestSnapshotSummary.java b/core/src/test/java/org/apache/iceberg/TestSnapshotSummary.java index 0149ecbe8268..e5653b4c65d7 100644 --- a/core/src/test/java/org/apache/iceberg/TestSnapshotSummary.java +++ b/core/src/test/java/org/apache/iceberg/TestSnapshotSummary.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg; import java.util.Map; @@ -33,7 +32,7 @@ public TestSnapshotSummary(int formatVersion) { @Parameterized.Parameters(name = "formatVersion = {0}") public static Object[] parameters() { - return new Object[] { 1, 2 }; + return new Object[] {1, 2}; } @Test @@ -41,24 +40,21 @@ public void testFileSizeSummary() { Assert.assertEquals("Table should start empty", 0, listManifestFiles().size()); // fast append - table.newFastAppend() - .appendFile(FILE_A) - .commit(); + table.newFastAppend().appendFile(FILE_A).commit(); Map summary = table.currentSnapshot().summary(); Assert.assertEquals("10", summary.get(SnapshotSummary.ADDED_FILE_SIZE_PROP)); Assert.assertNull(summary.get(SnapshotSummary.REMOVED_FILE_SIZE_PROP)); Assert.assertEquals("10", summary.get(SnapshotSummary.TOTAL_FILE_SIZE_PROP)); // merge append - table.newAppend() - .appendFile(FILE_B) - .commit(); + table.newAppend().appendFile(FILE_B).commit(); summary = table.currentSnapshot().summary(); Assert.assertEquals("10", summary.get(SnapshotSummary.ADDED_FILE_SIZE_PROP)); Assert.assertNull(summary.get(SnapshotSummary.REMOVED_FILE_SIZE_PROP)); Assert.assertEquals("20", summary.get(SnapshotSummary.TOTAL_FILE_SIZE_PROP)); - table.newOverwrite() + table + .newOverwrite() .deleteFile(FILE_A) .deleteFile(FILE_B) .addFile(FILE_C) @@ -70,10 +66,7 @@ public void testFileSizeSummary() { Assert.assertEquals("20", summary.get(SnapshotSummary.REMOVED_FILE_SIZE_PROP)); Assert.assertEquals("30", summary.get(SnapshotSummary.TOTAL_FILE_SIZE_PROP)); - table.newDelete() - .deleteFile(FILE_C) - .deleteFile(FILE_D) - .commit(); + table.newDelete().deleteFile(FILE_C).deleteFile(FILE_D).commit(); summary = table.currentSnapshot().summary(); Assert.assertNull(summary.get(SnapshotSummary.ADDED_FILE_SIZE_PROP)); Assert.assertEquals("20", summary.get(SnapshotSummary.REMOVED_FILE_SIZE_PROP)); @@ -86,10 +79,7 @@ public void testFileSizeSummaryWithDeletes() { return; } - table.newRowDelta() - .addDeletes(FILE_A_DELETES) - .addDeletes(FILE_A2_DELETES) - .commit(); + table.newRowDelta().addDeletes(FILE_A_DELETES).addDeletes(FILE_A2_DELETES).commit(); table.refresh(); Map summary = table.currentSnapshot().summary(); diff --git a/core/src/test/java/org/apache/iceberg/TestSortOrder.java b/core/src/test/java/org/apache/iceberg/TestSortOrder.java index fe99e7aaca03..6411818059e3 100644 --- a/core/src/test/java/org/apache/iceberg/TestSortOrder.java +++ b/core/src/test/java/org/apache/iceberg/TestSortOrder.java @@ -16,9 +16,15 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg; +import static org.apache.iceberg.NullOrder.NULLS_FIRST; +import static org.apache.iceberg.NullOrder.NULLS_LAST; +import static org.apache.iceberg.expressions.Expressions.bucket; +import static org.apache.iceberg.expressions.Expressions.truncate; +import static org.apache.iceberg.types.Types.NestedField.optional; +import static org.apache.iceberg.types.Types.NestedField.required; + import java.io.File; import java.io.IOException; import java.util.Set; @@ -36,38 +42,37 @@ import org.junit.runner.RunWith; import org.junit.runners.Parameterized; -import static org.apache.iceberg.NullOrder.NULLS_FIRST; -import static org.apache.iceberg.NullOrder.NULLS_LAST; -import static org.apache.iceberg.expressions.Expressions.bucket; -import static org.apache.iceberg.expressions.Expressions.truncate; -import static org.apache.iceberg.types.Types.NestedField.optional; -import static org.apache.iceberg.types.Types.NestedField.required; - @RunWith(Parameterized.class) public class TestSortOrder { // column ids will be reassigned during table creation - private static final Schema SCHEMA = new Schema( - required(10, "id", Types.IntegerType.get()), - required(11, "data", Types.StringType.get()), - required(40, "d", Types.DateType.get()), - required(41, "ts", Types.TimestampType.withZone()), - optional(12, "s", Types.StructType.of( - required(17, "id", Types.IntegerType.get()), - optional(18, "b", Types.ListType.ofOptional(3, Types.StructType.of( - optional(19, "i", Types.IntegerType.get()), - optional(20, "s", Types.StringType.get()) - ))) - )), - required(30, "ext", Types.StringType.get())); - - @Rule - public TemporaryFolder temp = new TemporaryFolder(); + private static final Schema SCHEMA = + new Schema( + required(10, "id", Types.IntegerType.get()), + required(11, "data", Types.StringType.get()), + required(40, "d", Types.DateType.get()), + required(41, "ts", Types.TimestampType.withZone()), + optional( + 12, + "s", + Types.StructType.of( + required(17, "id", Types.IntegerType.get()), + optional( + 18, + "b", + Types.ListType.ofOptional( + 3, + Types.StructType.of( + optional(19, "i", Types.IntegerType.get()), + optional(20, "s", Types.StringType.get())))))), + required(30, "ext", Types.StringType.get())); + + @Rule public TemporaryFolder temp = new TemporaryFolder(); private File tableDir = null; @Parameterized.Parameters(name = "formatVersion = {0}") public static Object[] parameters() { - return new Object[] { 1, 2 }; + return new Object[] {1, 2}; } private final int formatVersion; @@ -88,15 +93,20 @@ public void cleanupTables() { @Test public void testSortOrderBuilder() { - Assert.assertEquals("Should be able to build unsorted order", + Assert.assertEquals( + "Should be able to build unsorted order", SortOrder.unsorted(), SortOrder.builderFor(SCHEMA).withOrderId(0).build()); - AssertHelpers.assertThrows("Should not allow sort orders ID 0", - IllegalArgumentException.class, "order ID 0 is reserved for unsorted order", + AssertHelpers.assertThrows( + "Should not allow sort orders ID 0", + IllegalArgumentException.class, + "order ID 0 is reserved for unsorted order", () -> SortOrder.builderFor(SCHEMA).asc("data").withOrderId(0).build()); - AssertHelpers.assertThrows("Should not allow unsorted orders with arbitrary IDs", - IllegalArgumentException.class, "order ID must be 0", + AssertHelpers.assertThrows( + "Should not allow unsorted orders with arbitrary IDs", + IllegalArgumentException.class, + "order ID must be 0", () -> 
SortOrder.builderFor(SCHEMA).withOrderId(1).build()); } @@ -113,22 +123,24 @@ public void testDefaultOrder() { @Test public void testFreshIds() { - PartitionSpec spec = PartitionSpec.builderFor(SCHEMA) - .withSpecId(5) - .identity("data") - .build(); - SortOrder order = SortOrder.builderFor(SCHEMA) - .withOrderId(10) - .asc("s.id", NULLS_LAST) - .desc(truncate("data", 10), NULLS_FIRST) - .build(); - TestTables.TestTable table = TestTables.create(tableDir, "test", SCHEMA, spec, order, formatVersion); + PartitionSpec spec = PartitionSpec.builderFor(SCHEMA).withSpecId(5).identity("data").build(); + SortOrder order = + SortOrder.builderFor(SCHEMA) + .withOrderId(10) + .asc("s.id", NULLS_LAST) + .desc(truncate("data", 10), NULLS_FIRST) + .build(); + TestTables.TestTable table = + TestTables.create(tableDir, "test", SCHEMA, spec, order, formatVersion); Assert.assertEquals("Expected 1 sort order", 1, table.sortOrders().size()); - Assert.assertTrue("Order ID must be fresh", table.sortOrders().containsKey(TableMetadata.INITIAL_SORT_ORDER_ID)); + Assert.assertTrue( + "Order ID must be fresh", + table.sortOrders().containsKey(TableMetadata.INITIAL_SORT_ORDER_ID)); SortOrder actualOrder = table.sortOrder(); - Assert.assertEquals("Order ID must be fresh", TableMetadata.INITIAL_SORT_ORDER_ID, actualOrder.orderId()); + Assert.assertEquals( + "Order ID must be fresh", TableMetadata.INITIAL_SORT_ORDER_ID, actualOrder.orderId()); Assert.assertEquals("Order must have 2 fields", 2, actualOrder.fields().size()); Assert.assertEquals("Field id must be fresh", 7, actualOrder.fields().get(0).sourceId()); Assert.assertEquals("Field id must be fresh", 2, actualOrder.fields().get(1).sourceId()); @@ -136,33 +148,31 @@ public void testFreshIds() { @Test public void testCompatibleOrders() { - SortOrder order1 = SortOrder.builderFor(SCHEMA) - .withOrderId(9) - .asc("s.id", NULLS_LAST) - .build(); - - SortOrder order2 = SortOrder.builderFor(SCHEMA) - .withOrderId(10) - .asc("s.id", NULLS_LAST) - .desc(truncate("data", 10), NULLS_FIRST) - .build(); - - SortOrder order3 = SortOrder.builderFor(SCHEMA) - .withOrderId(11) - .asc("s.id", NULLS_LAST) - .desc(truncate("data", 10), NULLS_LAST) - .build(); - - SortOrder order4 = SortOrder.builderFor(SCHEMA) - .withOrderId(11) - .asc("s.id", NULLS_LAST) - .asc(truncate("data", 10), NULLS_FIRST) - .build(); - - SortOrder order5 = SortOrder.builderFor(SCHEMA) - .withOrderId(11) - .desc("s.id", NULLS_LAST) - .build(); + SortOrder order1 = SortOrder.builderFor(SCHEMA).withOrderId(9).asc("s.id", NULLS_LAST).build(); + + SortOrder order2 = + SortOrder.builderFor(SCHEMA) + .withOrderId(10) + .asc("s.id", NULLS_LAST) + .desc(truncate("data", 10), NULLS_FIRST) + .build(); + + SortOrder order3 = + SortOrder.builderFor(SCHEMA) + .withOrderId(11) + .asc("s.id", NULLS_LAST) + .desc(truncate("data", 10), NULLS_LAST) + .build(); + + SortOrder order4 = + SortOrder.builderFor(SCHEMA) + .withOrderId(11) + .asc("s.id", NULLS_LAST) + .asc(truncate("data", 10), NULLS_FIRST) + .build(); + + SortOrder order5 = + SortOrder.builderFor(SCHEMA).withOrderId(11).desc("s.id", NULLS_LAST).build(); // an unsorted order satisfies only itself Assert.assertTrue(SortOrder.unsorted().satisfies(SortOrder.unsorted())); @@ -198,8 +208,10 @@ public void testCompatibleOrders() { @Test public void testSatisfiesTruncateFieldOrder() { SortOrder id = SortOrder.builderFor(SCHEMA).asc("data", NULLS_LAST).build(); - SortOrder truncate4 = SortOrder.builderFor(SCHEMA).asc(Expressions.truncate("data", 4), NULLS_LAST).build(); - 
SortOrder truncate2 = SortOrder.builderFor(SCHEMA).asc(Expressions.truncate("data", 2), NULLS_LAST).build(); + SortOrder truncate4 = + SortOrder.builderFor(SCHEMA).asc(Expressions.truncate("data", 4), NULLS_LAST).build(); + SortOrder truncate2 = + SortOrder.builderFor(SCHEMA).asc(Expressions.truncate("data", 2), NULLS_LAST).build(); Assert.assertTrue(id.satisfies(truncate2)); Assert.assertTrue(id.satisfies(truncate4)); @@ -262,15 +274,9 @@ public void testSatisfiesTimestampFieldOrder() { @Test public void testSameOrder() { - SortOrder order1 = SortOrder.builderFor(SCHEMA) - .withOrderId(9) - .asc("s.id", NULLS_LAST) - .build(); + SortOrder order1 = SortOrder.builderFor(SCHEMA).withOrderId(9).asc("s.id", NULLS_LAST).build(); - SortOrder order2 = SortOrder.builderFor(SCHEMA) - .withOrderId(10) - .asc("s.id", NULLS_LAST) - .build(); + SortOrder order2 = SortOrder.builderFor(SCHEMA).withOrderId(10).asc("s.id", NULLS_LAST).build(); // orders have different ids but are logically the same Assert.assertNotEquals("Orders must not be equal", order1, order2); @@ -281,19 +287,16 @@ public void testSameOrder() { @Test public void testSchemaEvolutionWithSortOrder() { PartitionSpec spec = PartitionSpec.unpartitioned(); - SortOrder order = SortOrder.builderFor(SCHEMA) - .withOrderId(10) - .asc("s.id") - .desc(truncate("data", 10)) - .build(); - TestTables.TestTable table = TestTables.create(tableDir, "test", SCHEMA, spec, order, formatVersion); + SortOrder order = + SortOrder.builderFor(SCHEMA).withOrderId(10).asc("s.id").desc(truncate("data", 10)).build(); + TestTables.TestTable table = + TestTables.create(tableDir, "test", SCHEMA, spec, order, formatVersion); - table.updateSchema() - .renameColumn("s.id", "s.id2") - .commit(); + table.updateSchema().renameColumn("s.id", "s.id2").commit(); SortOrder actualOrder = table.sortOrder(); - Assert.assertEquals("Order ID must match", TableMetadata.INITIAL_SORT_ORDER_ID, actualOrder.orderId()); + Assert.assertEquals( + "Order ID must match", TableMetadata.INITIAL_SORT_ORDER_ID, actualOrder.orderId()); Assert.assertEquals("Order must have 2 fields", 2, actualOrder.fields().size()); Assert.assertEquals("Field id must match", 7, actualOrder.fields().get(0).sourceId()); Assert.assertEquals("Field id must match", 2, actualOrder.fields().get(1).sourceId()); @@ -302,15 +305,15 @@ public void testSchemaEvolutionWithSortOrder() { @Test public void testIncompatibleSchemaEvolutionWithSortOrder() { PartitionSpec spec = PartitionSpec.unpartitioned(); - SortOrder order = SortOrder.builderFor(SCHEMA) - .withOrderId(10) - .asc("s.id") - .desc(truncate("data", 10)) - .build(); - TestTables.TestTable table = TestTables.create(tableDir, "test", SCHEMA, spec, order, formatVersion); - - AssertHelpers.assertThrows("Should reject deletion of sort columns", - ValidationException.class, "Cannot find source column", + SortOrder order = + SortOrder.builderFor(SCHEMA).withOrderId(10).asc("s.id").desc(truncate("data", 10)).build(); + TestTables.TestTable table = + TestTables.create(tableDir, "test", SCHEMA, spec, order, formatVersion); + + AssertHelpers.assertThrows( + "Should reject deletion of sort columns", + ValidationException.class, + "Cannot find source column", () -> table.updateSchema().deleteColumn("s.id").commit()); } @@ -322,22 +325,20 @@ public void testEmptySortOrder() { @Test public void testSortedColumnNames() { - SortOrder order = SortOrder.builderFor(SCHEMA) - .withOrderId(10) - .asc("s.id") - .desc(truncate("data", 10)) - .build(); + SortOrder order = + 
SortOrder.builderFor(SCHEMA).withOrderId(10).asc("s.id").desc(truncate("data", 10)).build(); Set sortedCols = SortOrderUtil.orderPreservingSortedColumns(order); Assert.assertEquals(ImmutableSet.of("s.id", "data"), sortedCols); } @Test public void testPreservingOrderSortedColumnNames() { - SortOrder order = SortOrder.builderFor(SCHEMA) - .withOrderId(10) - .asc(bucket("s.id", 5)) - .desc(truncate("data", 10)) - .build(); + SortOrder order = + SortOrder.builderFor(SCHEMA) + .withOrderId(10) + .asc(bucket("s.id", 5)) + .desc(truncate("data", 10)) + .build(); Set sortedCols = SortOrderUtil.orderPreservingSortedColumns(order); Assert.assertEquals(ImmutableSet.of("data"), sortedCols); } diff --git a/core/src/test/java/org/apache/iceberg/TestSortOrderParser.java b/core/src/test/java/org/apache/iceberg/TestSortOrderParser.java index 3acf998782d1..2505bcedce15 100644 --- a/core/src/test/java/org/apache/iceberg/TestSortOrderParser.java +++ b/core/src/test/java/org/apache/iceberg/TestSortOrderParser.java @@ -16,16 +16,15 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg; +import static org.apache.iceberg.NullOrder.NULLS_FIRST; +import static org.apache.iceberg.SortDirection.DESC; + import org.apache.iceberg.transforms.UnknownTransform; import org.junit.Assert; import org.junit.Test; -import static org.apache.iceberg.NullOrder.NULLS_FIRST; -import static org.apache.iceberg.SortDirection.DESC; - public class TestSortOrderParser extends TableTestBase { public TestSortOrderParser() { super(1); @@ -33,21 +32,23 @@ public TestSortOrderParser() { @Test public void testUnknownTransforms() { - String jsonString = "{\n" + - " \"order-id\" : 10,\n" + - " \"fields\" : [ {\n" + - " \"transform\" : \"custom_transform\",\n" + - " \"source-id\" : 2,\n" + - " \"direction\" : \"desc\",\n" + - " \"null-order\" : \"nulls-first\"\n" + - " } ]\n" + - "}"; + String jsonString = + "{\n" + + " \"order-id\" : 10,\n" + + " \"fields\" : [ {\n" + + " \"transform\" : \"custom_transform\",\n" + + " \"source-id\" : 2,\n" + + " \"direction\" : \"desc\",\n" + + " \"null-order\" : \"nulls-first\"\n" + + " } ]\n" + + "}"; SortOrder order = SortOrderParser.fromJson(table.schema(), jsonString); Assert.assertEquals(10, order.orderId()); Assert.assertEquals(1, order.fields().size()); - org.assertj.core.api.Assertions.assertThat(order.fields().get(0).transform()).isInstanceOf(UnknownTransform.class); + org.assertj.core.api.Assertions.assertThat(order.fields().get(0).transform()) + .isInstanceOf(UnknownTransform.class); Assert.assertEquals("custom_transform", order.fields().get(0).transform().toString()); Assert.assertEquals(2, order.fields().get(0).sourceId()); Assert.assertEquals(DESC, order.fields().get(0).direction()); diff --git a/core/src/test/java/org/apache/iceberg/TestSplitPlanning.java b/core/src/test/java/org/apache/iceberg/TestSplitPlanning.java index 4f71a49e891b..89beb352c1c7 100644 --- a/core/src/test/java/org/apache/iceberg/TestSplitPlanning.java +++ b/core/src/test/java/org/apache/iceberg/TestSplitPlanning.java @@ -16,9 +16,10 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg; +import static org.apache.iceberg.types.Types.NestedField.optional; + import java.io.File; import java.io.IOException; import java.util.List; @@ -40,25 +41,21 @@ import org.junit.runner.RunWith; import org.junit.runners.Parameterized; -import static org.apache.iceberg.types.Types.NestedField.optional; - @RunWith(Parameterized.class) public class TestSplitPlanning extends TableTestBase { private static final Configuration CONF = new Configuration(); private static final HadoopTables TABLES = new HadoopTables(CONF); - private static final Schema SCHEMA = new Schema( - optional(1, "id", Types.IntegerType.get()), - optional(2, "data", Types.StringType.get()) - ); + private static final Schema SCHEMA = + new Schema( + optional(1, "id", Types.IntegerType.get()), optional(2, "data", Types.StringType.get())); - @Rule - public TemporaryFolder temp = new TemporaryFolder(); + @Rule public TemporaryFolder temp = new TemporaryFolder(); private Table table = null; @Parameterized.Parameters(name = "formatVersion = {0}") public static Object[] parameters() { - return new Object[] { 1, 2 }; + return new Object[] {1, 2}; } public TestSplitPlanning(int formatVersion) { @@ -71,7 +68,8 @@ public void setupTable() throws IOException { File tableDir = temp.newFolder(); String tableLocation = tableDir.toURI().toString(); table = TABLES.create(SCHEMA, tableLocation); - table.updateProperties() + table + .updateProperties() .set(TableProperties.SPLIT_SIZE, String.valueOf(128 * 1024 * 1024)) .set(TableProperties.SPLIT_OPEN_FILE_COST, String.valueOf(4 * 1024 * 1024)) .set(TableProperties.SPLIT_LOOKBACK, String.valueOf(Integer.MAX_VALUE)) @@ -107,9 +105,7 @@ public void testSplitPlanningWithSmallFiles() { @Test public void testSplitPlanningWithNoMinWeight() { - table.updateProperties() - .set(TableProperties.SPLIT_OPEN_FILE_COST, "0") - .commit(); + table.updateProperties().set(TableProperties.SPLIT_OPEN_FILE_COST, "0").commit(); List files60Mb = newFiles(2, 60 * 1024 * 1024); List files5Kb = newFiles(100, 5 * 1024); Iterable files = Iterables.concat(files60Mb, files5Kb); @@ -123,8 +119,8 @@ public void testSplitPlanningWithOverridenSize() { List files128Mb = newFiles(4, 128 * 1024 * 1024); appendFiles(files128Mb); // we expect 2 bins since we are overriding split size in scan with 256MB - TableScan scan = table.newScan() - .option(TableProperties.SPLIT_SIZE, String.valueOf(256L * 1024 * 1024)); + TableScan scan = + table.newScan().option(TableProperties.SPLIT_SIZE, String.valueOf(256L * 1024 * 1024)); Assert.assertEquals(2, Iterables.size(scan.planTasks())); } @@ -133,8 +129,8 @@ public void testSplitPlanningWithOverriddenSizeForMetadataJsonFile() { List files8Mb = newFiles(32, 8 * 1024 * 1024, FileFormat.METADATA); appendFiles(files8Mb); // we expect 16 bins since we are overriding split size in scan with 16MB - TableScan scan = table.newScan() - .option(TableProperties.SPLIT_SIZE, String.valueOf(16L * 1024 * 1024)); + TableScan scan = + table.newScan().option(TableProperties.SPLIT_SIZE, String.valueOf(16L * 1024 * 1024)); Assert.assertEquals(16, Iterables.size(scan.planTasks())); } @@ -142,9 +138,10 @@ public void testSplitPlanningWithOverriddenSizeForMetadataJsonFile() { public void testSplitPlanningWithOverriddenSizeForLargeMetadataJsonFile() { List files128Mb = newFiles(4, 128 * 1024 * 1024, FileFormat.METADATA); appendFiles(files128Mb); - // although overriding split size in scan with 8MB, we expect 4 bins since metadata file is not splittable - TableScan scan = table.newScan() 
- .option(TableProperties.SPLIT_SIZE, String.valueOf(8L * 1024 * 1024)); + // although overriding split size in scan with 8MB, we expect 4 bins since metadata file is not + // splittable + TableScan scan = + table.newScan().option(TableProperties.SPLIT_SIZE, String.valueOf(8L * 1024 * 1024)); Assert.assertEquals(4, Iterables.size(scan.planTasks())); } @@ -155,8 +152,7 @@ public void testSplitPlanningWithOverridenLookback() { Iterable files = Iterables.concat(files120Mb, file128Mb); appendFiles(files); // we expect 2 bins from non-overriden table properties - TableScan scan = table.newScan() - .option(TableProperties.SPLIT_LOOKBACK, "1"); + TableScan scan = table.newScan().option(TableProperties.SPLIT_LOOKBACK, "1"); CloseableIterable tasks = scan.planTasks(); Assert.assertEquals(2, Iterables.size(tasks)); @@ -172,8 +168,10 @@ public void testSplitPlanningWithOverridenOpenCostSize() { appendFiles(files16Mb); // we expect 4 bins since we are overriding open file cost in scan with a cost of 32MB // we can fit at most 128Mb/32Mb = 4 files per bin - TableScan scan = table.newScan() - .option(TableProperties.SPLIT_OPEN_FILE_COST, String.valueOf(32L * 1024 * 1024)); + TableScan scan = + table + .newScan() + .option(TableProperties.SPLIT_OPEN_FILE_COST, String.valueOf(32L * 1024 * 1024)); Assert.assertEquals(4, Iterables.size(scan.planTasks())); } @@ -184,9 +182,7 @@ public void testSplitPlanningWithNegativeValues() { IllegalArgumentException.class, "Invalid split size (negative or 0): -10", () -> { - table.newScan() - .option(TableProperties.SPLIT_SIZE, String.valueOf(-10)) - .planTasks(); + table.newScan().option(TableProperties.SPLIT_SIZE, String.valueOf(-10)).planTasks(); }); AssertHelpers.assertThrows( @@ -194,9 +190,7 @@ public void testSplitPlanningWithNegativeValues() { IllegalArgumentException.class, "Invalid split planning lookback (negative or 0): -10", () -> { - table.newScan() - .option(TableProperties.SPLIT_LOOKBACK, String.valueOf(-10)) - .planTasks(); + table.newScan().option(TableProperties.SPLIT_LOOKBACK, String.valueOf(-10)).planTasks(); }); AssertHelpers.assertThrows( @@ -204,7 +198,8 @@ public void testSplitPlanningWithNegativeValues() { IllegalArgumentException.class, "Invalid file open cost (negative): -10", () -> { - table.newScan() + table + .newScan() .option(TableProperties.SPLIT_OPEN_FILE_COST, String.valueOf(-10)) .planTasks(); }); @@ -217,9 +212,10 @@ public void testSplitPlanningWithOffsets() { // Split Size is slightly larger than rowGroup Size, but we should still end up with // 1 split per row group - TableScan scan = table.newScan() - .option(TableProperties.SPLIT_SIZE, String.valueOf(10L * 1024 * 1024)); - Assert.assertEquals("We should get one task per row group", 32, Iterables.size(scan.planTasks())); + TableScan scan = + table.newScan().option(TableProperties.SPLIT_SIZE, String.valueOf(10L * 1024 * 1024)); + Assert.assertEquals( + "We should get one task per row group", 32, Iterables.size(scan.planTasks())); } @Test @@ -227,11 +223,15 @@ public void testSplitPlanningWithOffsetsUnableToSplit() { List files16Mb = newFiles(16, 16 * 1024 * 1024, 2); appendFiles(files16Mb); - // Split Size does not match up with offsets, so even though we want 4 cuts per file we still only get 2 - TableScan scan = table.newScan() - .option(TableProperties.SPLIT_OPEN_FILE_COST, String.valueOf(0)) - .option(TableProperties.SPLIT_SIZE, String.valueOf(4L * 1024 * 1024)); - Assert.assertEquals("We should still only get 2 tasks per file", 32, Iterables.size(scan.planTasks())); + // 
Split Size does not match up with offsets, so even though we want 4 cuts per file we still + // only get 2 + TableScan scan = + table + .newScan() + .option(TableProperties.SPLIT_OPEN_FILE_COST, String.valueOf(0)) + .option(TableProperties.SPLIT_SIZE, String.valueOf(4L * 1024 * 1024)); + Assert.assertEquals( + "We should still only get 2 tasks per file", 32, Iterables.size(scan.planTasks())); } private void appendFiles(Iterable files) { @@ -252,7 +252,8 @@ private List newFiles(int numFiles, long sizeInBytes, FileFormat fileF return newFiles(numFiles, sizeInBytes, fileFormat, 1); } - private List newFiles(int numFiles, long sizeInBytes, FileFormat fileFormat, int numOffset) { + private List newFiles( + int numFiles, long sizeInBytes, FileFormat fileFormat, int numOffset) { List files = Lists.newArrayList(); for (int fileNum = 0; fileNum < numFiles; fileNum++) { files.add(newFile(sizeInBytes, fileFormat, numOffset)); @@ -262,14 +263,19 @@ private List newFiles(int numFiles, long sizeInBytes, FileFormat fileF private DataFile newFile(long sizeInBytes, FileFormat fileFormat, int numOffsets) { String fileName = UUID.randomUUID().toString(); - Builder builder = DataFiles.builder(PartitionSpec.unpartitioned()) - .withPath(fileFormat.addExtension(fileName)) - .withFileSizeInBytes(sizeInBytes) - .withRecordCount(2); + Builder builder = + DataFiles.builder(PartitionSpec.unpartitioned()) + .withPath(fileFormat.addExtension(fileName)) + .withFileSizeInBytes(sizeInBytes) + .withRecordCount(2); if (numOffsets > 1) { long stepSize = sizeInBytes / numOffsets; - List offsets = LongStream.range(0, numOffsets).map(i -> i * stepSize).boxed().collect(Collectors.toList()); + List offsets = + LongStream.range(0, numOffsets) + .map(i -> i * stepSize) + .boxed() + .collect(Collectors.toList()); builder.withSplitOffsets(offsets); } diff --git a/core/src/test/java/org/apache/iceberg/TestTableMetadata.java b/core/src/test/java/org/apache/iceberg/TestTableMetadata.java index 7d9ec39a669b..e3c69f82993a 100644 --- a/core/src/test/java/org/apache/iceberg/TestTableMetadata.java +++ b/core/src/test/java/org/apache/iceberg/TestTableMetadata.java @@ -16,9 +16,20 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg; +import static org.apache.iceberg.Files.localInput; +import static org.apache.iceberg.TableMetadataParser.CURRENT_SNAPSHOT_ID; +import static org.apache.iceberg.TableMetadataParser.FORMAT_VERSION; +import static org.apache.iceberg.TableMetadataParser.LAST_COLUMN_ID; +import static org.apache.iceberg.TableMetadataParser.LAST_UPDATED_MILLIS; +import static org.apache.iceberg.TableMetadataParser.LOCATION; +import static org.apache.iceberg.TableMetadataParser.PARTITION_SPEC; +import static org.apache.iceberg.TableMetadataParser.PROPERTIES; +import static org.apache.iceberg.TableMetadataParser.SCHEMA; +import static org.apache.iceberg.TableMetadataParser.SNAPSHOTS; +import static org.apache.iceberg.TestHelpers.assertSameSchemaList; + import com.fasterxml.jackson.core.JsonGenerator; import java.io.IOException; import java.io.StringWriter; @@ -48,126 +59,164 @@ import org.junit.Test; import org.junit.rules.TemporaryFolder; -import static org.apache.iceberg.Files.localInput; -import static org.apache.iceberg.TableMetadataParser.CURRENT_SNAPSHOT_ID; -import static org.apache.iceberg.TableMetadataParser.FORMAT_VERSION; -import static org.apache.iceberg.TableMetadataParser.LAST_COLUMN_ID; -import static org.apache.iceberg.TableMetadataParser.LAST_UPDATED_MILLIS; -import static org.apache.iceberg.TableMetadataParser.LOCATION; -import static org.apache.iceberg.TableMetadataParser.PARTITION_SPEC; -import static org.apache.iceberg.TableMetadataParser.PROPERTIES; -import static org.apache.iceberg.TableMetadataParser.SCHEMA; -import static org.apache.iceberg.TableMetadataParser.SNAPSHOTS; -import static org.apache.iceberg.TestHelpers.assertSameSchemaList; - public class TestTableMetadata { private static final String TEST_LOCATION = "s3://bucket/test/location"; - private static final Schema TEST_SCHEMA = new Schema(7, - Types.NestedField.required(1, "x", Types.LongType.get()), - Types.NestedField.required(2, "y", Types.LongType.get(), "comment"), - Types.NestedField.required(3, "z", Types.LongType.get()) - ); + private static final Schema TEST_SCHEMA = + new Schema( + 7, + Types.NestedField.required(1, "x", Types.LongType.get()), + Types.NestedField.required(2, "y", Types.LongType.get(), "comment"), + Types.NestedField.required(3, "z", Types.LongType.get())); private static final long SEQ_NO = 34; private static final int LAST_ASSIGNED_COLUMN_ID = 3; - private static final PartitionSpec SPEC_5 = PartitionSpec.builderFor(TEST_SCHEMA).withSpecId(5).build(); - private static final SortOrder SORT_ORDER_3 = SortOrder.builderFor(TEST_SCHEMA) - .withOrderId(3) - .asc("y", NullOrder.NULLS_FIRST) - .desc(Expressions.bucket("z", 4), NullOrder.NULLS_LAST) - .build(); + private static final PartitionSpec SPEC_5 = + PartitionSpec.builderFor(TEST_SCHEMA).withSpecId(5).build(); + private static final SortOrder SORT_ORDER_3 = + SortOrder.builderFor(TEST_SCHEMA) + .withOrderId(3) + .asc("y", NullOrder.NULLS_FIRST) + .desc(Expressions.bucket("z", 4), NullOrder.NULLS_LAST) + .build(); - @Rule - public TemporaryFolder temp = new TemporaryFolder(); + @Rule public TemporaryFolder temp = new TemporaryFolder(); public TableOperations ops = new LocalTableOperations(temp); @Test public void testJsonConversion() throws Exception { long previousSnapshotId = System.currentTimeMillis() - new Random(1234).nextInt(3600); - Snapshot previousSnapshot = new BaseSnapshot( - ops.io(), previousSnapshotId, null, previousSnapshotId, null, null, null, ImmutableList.of( - new 
GenericManifestFile(localInput("file:/tmp/manfiest.1.avro"), SPEC_5.specId()))); + Snapshot previousSnapshot = + new BaseSnapshot( + ops.io(), + previousSnapshotId, + null, + previousSnapshotId, + null, + null, + null, + ImmutableList.of( + new GenericManifestFile(localInput("file:/tmp/manfiest.1.avro"), SPEC_5.specId()))); long currentSnapshotId = System.currentTimeMillis(); - Snapshot currentSnapshot = new BaseSnapshot( - ops.io(), currentSnapshotId, previousSnapshotId, currentSnapshotId, null, null, 7, ImmutableList.of( - new GenericManifestFile(localInput("file:/tmp/manfiest.2.avro"), SPEC_5.specId()))); - - List snapshotLog = ImmutableList.builder() - .add(new SnapshotLogEntry(previousSnapshot.timestampMillis(), previousSnapshot.snapshotId())) - .add(new SnapshotLogEntry(currentSnapshot.timestampMillis(), currentSnapshot.snapshotId())) - .build(); - - Schema schema = new Schema(6, - Types.NestedField.required(10, "x", Types.StringType.get())); - - Map refs = ImmutableMap.of( - "main", SnapshotRef.branchBuilder(currentSnapshotId).build(), - "previous", SnapshotRef.tagBuilder(previousSnapshotId).build(), - "test", SnapshotRef.branchBuilder(previousSnapshotId).build() - ); - - TableMetadata expected = new TableMetadata(null, 2, UUID.randomUUID().toString(), TEST_LOCATION, - SEQ_NO, System.currentTimeMillis(), 3, - 7, ImmutableList.of(TEST_SCHEMA, schema), - 5, ImmutableList.of(SPEC_5), SPEC_5.lastAssignedFieldId(), - 3, ImmutableList.of(SORT_ORDER_3), ImmutableMap.of("property", "value"), currentSnapshotId, - Arrays.asList(previousSnapshot, currentSnapshot), snapshotLog, ImmutableList.of(), refs, - ImmutableList.of()); + Snapshot currentSnapshot = + new BaseSnapshot( + ops.io(), + currentSnapshotId, + previousSnapshotId, + currentSnapshotId, + null, + null, + 7, + ImmutableList.of( + new GenericManifestFile(localInput("file:/tmp/manfiest.2.avro"), SPEC_5.specId()))); + + List snapshotLog = + ImmutableList.builder() + .add( + new SnapshotLogEntry( + previousSnapshot.timestampMillis(), previousSnapshot.snapshotId())) + .add( + new SnapshotLogEntry( + currentSnapshot.timestampMillis(), currentSnapshot.snapshotId())) + .build(); + + Schema schema = new Schema(6, Types.NestedField.required(10, "x", Types.StringType.get())); + + Map refs = + ImmutableMap.of( + "main", SnapshotRef.branchBuilder(currentSnapshotId).build(), + "previous", SnapshotRef.tagBuilder(previousSnapshotId).build(), + "test", SnapshotRef.branchBuilder(previousSnapshotId).build()); + + TableMetadata expected = + new TableMetadata( + null, + 2, + UUID.randomUUID().toString(), + TEST_LOCATION, + SEQ_NO, + System.currentTimeMillis(), + 3, + 7, + ImmutableList.of(TEST_SCHEMA, schema), + 5, + ImmutableList.of(SPEC_5), + SPEC_5.lastAssignedFieldId(), + 3, + ImmutableList.of(SORT_ORDER_3), + ImmutableMap.of("property", "value"), + currentSnapshotId, + Arrays.asList(previousSnapshot, currentSnapshot), + snapshotLog, + ImmutableList.of(), + refs, + ImmutableList.of()); String asJson = TableMetadataParser.toJson(expected); TableMetadata metadata = TableMetadataParser.fromJson(ops.io(), asJson); - Assert.assertEquals("Format version should match", - expected.formatVersion(), metadata.formatVersion()); - Assert.assertEquals("Table UUID should match", - expected.uuid(), metadata.uuid()); - Assert.assertEquals("Table location should match", - expected.location(), metadata.location()); - Assert.assertEquals("Last sequence number should match", - expected.lastSequenceNumber(), metadata.lastSequenceNumber()); - Assert.assertEquals("Last 
column ID should match", - expected.lastColumnId(), metadata.lastColumnId()); - Assert.assertEquals("Current schema id should match", - expected.currentSchemaId(), metadata.currentSchemaId()); + Assert.assertEquals( + "Format version should match", expected.formatVersion(), metadata.formatVersion()); + Assert.assertEquals("Table UUID should match", expected.uuid(), metadata.uuid()); + Assert.assertEquals("Table location should match", expected.location(), metadata.location()); + Assert.assertEquals( + "Last sequence number should match", + expected.lastSequenceNumber(), + metadata.lastSequenceNumber()); + Assert.assertEquals( + "Last column ID should match", expected.lastColumnId(), metadata.lastColumnId()); + Assert.assertEquals( + "Current schema id should match", expected.currentSchemaId(), metadata.currentSchemaId()); assertSameSchemaList(expected.schemas(), metadata.schemas()); - Assert.assertEquals("Partition spec should match", - expected.spec().toString(), metadata.spec().toString()); - Assert.assertEquals("Default spec ID should match", - expected.defaultSpecId(), metadata.defaultSpecId()); - Assert.assertEquals("PartitionSpec map should match", - expected.specs(), metadata.specs()); - Assert.assertEquals("lastAssignedFieldId across all PartitionSpecs should match", - expected.spec().lastAssignedFieldId(), metadata.lastAssignedPartitionId()); - Assert.assertEquals("Default sort ID should match", - expected.defaultSortOrderId(), metadata.defaultSortOrderId()); - Assert.assertEquals("Sort order should match", - expected.sortOrder(), metadata.sortOrder()); - Assert.assertEquals("Sort order map should match", - expected.sortOrders(), metadata.sortOrders()); - Assert.assertEquals("Properties should match", - expected.properties(), metadata.properties()); - Assert.assertEquals("Snapshot logs should match", - expected.snapshotLog(), metadata.snapshotLog()); - Assert.assertEquals("Current snapshot ID should match", - currentSnapshotId, metadata.currentSnapshot().snapshotId()); - Assert.assertEquals("Parent snapshot ID should match", - (Long) previousSnapshotId, metadata.currentSnapshot().parentId()); - Assert.assertEquals("Current snapshot files should match", - currentSnapshot.allManifests(ops.io()), metadata.currentSnapshot().allManifests(ops.io())); - Assert.assertEquals("Schema ID for current snapshot should match", - (Integer) 7, metadata.currentSnapshot().schemaId()); - Assert.assertEquals("Previous snapshot ID should match", - previousSnapshotId, metadata.snapshot(previousSnapshotId).snapshotId()); - Assert.assertEquals("Previous snapshot files should match", + Assert.assertEquals( + "Partition spec should match", expected.spec().toString(), metadata.spec().toString()); + Assert.assertEquals( + "Default spec ID should match", expected.defaultSpecId(), metadata.defaultSpecId()); + Assert.assertEquals("PartitionSpec map should match", expected.specs(), metadata.specs()); + Assert.assertEquals( + "lastAssignedFieldId across all PartitionSpecs should match", + expected.spec().lastAssignedFieldId(), + metadata.lastAssignedPartitionId()); + Assert.assertEquals( + "Default sort ID should match", + expected.defaultSortOrderId(), + metadata.defaultSortOrderId()); + Assert.assertEquals("Sort order should match", expected.sortOrder(), metadata.sortOrder()); + Assert.assertEquals( + "Sort order map should match", expected.sortOrders(), metadata.sortOrders()); + Assert.assertEquals("Properties should match", expected.properties(), metadata.properties()); + Assert.assertEquals( + "Snapshot logs 
should match", expected.snapshotLog(), metadata.snapshotLog()); + Assert.assertEquals( + "Current snapshot ID should match", + currentSnapshotId, + metadata.currentSnapshot().snapshotId()); + Assert.assertEquals( + "Parent snapshot ID should match", + (Long) previousSnapshotId, + metadata.currentSnapshot().parentId()); + Assert.assertEquals( + "Current snapshot files should match", + currentSnapshot.allManifests(ops.io()), + metadata.currentSnapshot().allManifests(ops.io())); + Assert.assertEquals( + "Schema ID for current snapshot should match", + (Integer) 7, + metadata.currentSnapshot().schemaId()); + Assert.assertEquals( + "Previous snapshot ID should match", + previousSnapshotId, + metadata.snapshot(previousSnapshotId).snapshotId()); + Assert.assertEquals( + "Previous snapshot files should match", previousSnapshot.allManifests(ops.io()), metadata.snapshot(previousSnapshotId).allManifests(ops.io())); - Assert.assertNull("Previous snapshot's schema ID should be null", + Assert.assertNull( + "Previous snapshot's schema ID should be null", metadata.snapshot(previousSnapshotId).schemaId()); - Assert.assertEquals("Refs map should match", - refs, metadata.refs()); + Assert.assertEquals("Refs map should match", refs, metadata.refs()); } @Test @@ -177,153 +226,283 @@ public void testBackwardCompat() throws Exception { Schema schema = new Schema(TableMetadata.INITIAL_SCHEMA_ID, TEST_SCHEMA.columns()); long previousSnapshotId = System.currentTimeMillis() - new Random(1234).nextInt(3600); - Snapshot previousSnapshot = new BaseSnapshot( - ops.io(), previousSnapshotId, null, previousSnapshotId, null, null, null, ImmutableList.of( - new GenericManifestFile(localInput("file:/tmp/manfiest.1.avro"), spec.specId()))); + Snapshot previousSnapshot = + new BaseSnapshot( + ops.io(), + previousSnapshotId, + null, + previousSnapshotId, + null, + null, + null, + ImmutableList.of( + new GenericManifestFile(localInput("file:/tmp/manfiest.1.avro"), spec.specId()))); long currentSnapshotId = System.currentTimeMillis(); - Snapshot currentSnapshot = new BaseSnapshot( - ops.io(), currentSnapshotId, previousSnapshotId, currentSnapshotId, null, null, null, ImmutableList.of( - new GenericManifestFile(localInput("file:/tmp/manfiest.2.avro"), spec.specId()))); - - TableMetadata expected = new TableMetadata(null, 1, null, TEST_LOCATION, - 0, System.currentTimeMillis(), 3, TableMetadata.INITIAL_SCHEMA_ID, - ImmutableList.of(schema), 6, ImmutableList.of(spec), spec.lastAssignedFieldId(), - TableMetadata.INITIAL_SORT_ORDER_ID, ImmutableList.of(sortOrder), ImmutableMap.of("property", "value"), - currentSnapshotId, Arrays.asList(previousSnapshot, currentSnapshot), ImmutableList.of(), ImmutableList.of(), - ImmutableMap.of(), ImmutableList.of()); + Snapshot currentSnapshot = + new BaseSnapshot( + ops.io(), + currentSnapshotId, + previousSnapshotId, + currentSnapshotId, + null, + null, + null, + ImmutableList.of( + new GenericManifestFile(localInput("file:/tmp/manfiest.2.avro"), spec.specId()))); + + TableMetadata expected = + new TableMetadata( + null, + 1, + null, + TEST_LOCATION, + 0, + System.currentTimeMillis(), + 3, + TableMetadata.INITIAL_SCHEMA_ID, + ImmutableList.of(schema), + 6, + ImmutableList.of(spec), + spec.lastAssignedFieldId(), + TableMetadata.INITIAL_SORT_ORDER_ID, + ImmutableList.of(sortOrder), + ImmutableMap.of("property", "value"), + currentSnapshotId, + Arrays.asList(previousSnapshot, currentSnapshot), + ImmutableList.of(), + ImmutableList.of(), + ImmutableMap.of(), + ImmutableList.of()); String asJson = 
toJsonWithoutSpecAndSchemaList(expected); TableMetadata metadata = TableMetadataParser.fromJson(ops.io(), asJson); - Assert.assertEquals("Format version should match", - expected.formatVersion(), metadata.formatVersion()); + Assert.assertEquals( + "Format version should match", expected.formatVersion(), metadata.formatVersion()); Assert.assertNull("Table UUID should not be assigned", metadata.uuid()); - Assert.assertEquals("Table location should match", - expected.location(), metadata.location()); - Assert.assertEquals("Last sequence number should default to 0", - expected.lastSequenceNumber(), metadata.lastSequenceNumber()); - Assert.assertEquals("Last column ID should match", - expected.lastColumnId(), metadata.lastColumnId()); - Assert.assertEquals("Current schema ID should be default to TableMetadata.INITIAL_SCHEMA_ID", - TableMetadata.INITIAL_SCHEMA_ID, metadata.currentSchemaId()); - Assert.assertEquals("Schemas size should match", - 1, metadata.schemas().size()); - Assert.assertEquals("Schemas should contain the schema", - metadata.schemas().get(0).asStruct(), schema.asStruct()); - Assert.assertEquals("Partition spec should be the default", - expected.spec().toString(), metadata.spec().toString()); - Assert.assertEquals("Default spec ID should default to TableMetadata.INITIAL_SPEC_ID", - TableMetadata.INITIAL_SPEC_ID, metadata.defaultSpecId()); - Assert.assertEquals("PartitionSpec should contain the spec", - 1, metadata.specs().size()); - Assert.assertTrue("PartitionSpec should contain the spec", - metadata.specs().get(0).compatibleWith(spec)); - Assert.assertEquals("PartitionSpec should have ID TableMetadata.INITIAL_SPEC_ID", - TableMetadata.INITIAL_SPEC_ID, metadata.specs().get(0).specId()); - Assert.assertEquals("lastAssignedFieldId across all PartitionSpecs should match", - expected.spec().lastAssignedFieldId(), metadata.lastAssignedPartitionId()); - Assert.assertEquals("Properties should match", - expected.properties(), metadata.properties()); - Assert.assertEquals("Snapshot logs should match", - expected.snapshotLog(), metadata.snapshotLog()); - Assert.assertEquals("Current snapshot ID should match", - currentSnapshotId, metadata.currentSnapshot().snapshotId()); - Assert.assertEquals("Parent snapshot ID should match", - (Long) previousSnapshotId, metadata.currentSnapshot().parentId()); - Assert.assertEquals("Current snapshot files should match", - currentSnapshot.allManifests(ops.io()), metadata.currentSnapshot().allManifests(ops.io())); - Assert.assertNull("Current snapshot's schema ID should be null", - metadata.currentSnapshot().schemaId()); - Assert.assertEquals("Previous snapshot ID should match", - previousSnapshotId, metadata.snapshot(previousSnapshotId).snapshotId()); - Assert.assertEquals("Previous snapshot files should match", + Assert.assertEquals("Table location should match", expected.location(), metadata.location()); + Assert.assertEquals( + "Last sequence number should default to 0", + expected.lastSequenceNumber(), + metadata.lastSequenceNumber()); + Assert.assertEquals( + "Last column ID should match", expected.lastColumnId(), metadata.lastColumnId()); + Assert.assertEquals( + "Current schema ID should be default to TableMetadata.INITIAL_SCHEMA_ID", + TableMetadata.INITIAL_SCHEMA_ID, + metadata.currentSchemaId()); + Assert.assertEquals("Schemas size should match", 1, metadata.schemas().size()); + Assert.assertEquals( + "Schemas should contain the schema", + metadata.schemas().get(0).asStruct(), + schema.asStruct()); + Assert.assertEquals( + "Partition spec 
should be the default", + expected.spec().toString(), + metadata.spec().toString()); + Assert.assertEquals( + "Default spec ID should default to TableMetadata.INITIAL_SPEC_ID", + TableMetadata.INITIAL_SPEC_ID, + metadata.defaultSpecId()); + Assert.assertEquals("PartitionSpec should contain the spec", 1, metadata.specs().size()); + Assert.assertTrue( + "PartitionSpec should contain the spec", metadata.specs().get(0).compatibleWith(spec)); + Assert.assertEquals( + "PartitionSpec should have ID TableMetadata.INITIAL_SPEC_ID", + TableMetadata.INITIAL_SPEC_ID, + metadata.specs().get(0).specId()); + Assert.assertEquals( + "lastAssignedFieldId across all PartitionSpecs should match", + expected.spec().lastAssignedFieldId(), + metadata.lastAssignedPartitionId()); + Assert.assertEquals("Properties should match", expected.properties(), metadata.properties()); + Assert.assertEquals( + "Snapshot logs should match", expected.snapshotLog(), metadata.snapshotLog()); + Assert.assertEquals( + "Current snapshot ID should match", + currentSnapshotId, + metadata.currentSnapshot().snapshotId()); + Assert.assertEquals( + "Parent snapshot ID should match", + (Long) previousSnapshotId, + metadata.currentSnapshot().parentId()); + Assert.assertEquals( + "Current snapshot files should match", + currentSnapshot.allManifests(ops.io()), + metadata.currentSnapshot().allManifests(ops.io())); + Assert.assertNull( + "Current snapshot's schema ID should be null", metadata.currentSnapshot().schemaId()); + Assert.assertEquals( + "Previous snapshot ID should match", + previousSnapshotId, + metadata.snapshot(previousSnapshotId).snapshotId()); + Assert.assertEquals( + "Previous snapshot files should match", previousSnapshot.allManifests(ops.io()), metadata.snapshot(previousSnapshotId).allManifests(ops.io())); - Assert.assertEquals("Snapshot logs should match", - expected.previousFiles(), metadata.previousFiles()); - Assert.assertNull("Previous snapshot's schema ID should be null", + Assert.assertEquals( + "Snapshot logs should match", expected.previousFiles(), metadata.previousFiles()); + Assert.assertNull( + "Previous snapshot's schema ID should be null", metadata.snapshot(previousSnapshotId).schemaId()); } @Test public void testInvalidMainBranch() { long previousSnapshotId = System.currentTimeMillis() - new Random(1234).nextInt(3600); - Snapshot previousSnapshot = new BaseSnapshot( - ops.io(), previousSnapshotId, null, previousSnapshotId, null, null, null, - ImmutableList.of(new GenericManifestFile(localInput("file:/tmp/manfiest.1.avro"), SPEC_5.specId()))); + Snapshot previousSnapshot = + new BaseSnapshot( + ops.io(), + previousSnapshotId, + null, + previousSnapshotId, + null, + null, + null, + ImmutableList.of( + new GenericManifestFile(localInput("file:/tmp/manfiest.1.avro"), SPEC_5.specId()))); long currentSnapshotId = System.currentTimeMillis(); - Snapshot currentSnapshot = new BaseSnapshot( - ops.io(), currentSnapshotId, previousSnapshotId, currentSnapshotId, null, null, 7, - ImmutableList.of(new GenericManifestFile(localInput("file:/tmp/manfiest.2.avro"), SPEC_5.specId()))); - - List snapshotLog = ImmutableList.builder() - .add(new SnapshotLogEntry(previousSnapshot.timestampMillis(), previousSnapshot.snapshotId())) - .add(new SnapshotLogEntry(currentSnapshot.timestampMillis(), currentSnapshot.snapshotId())) - .build(); - - Schema schema = new Schema(6, - Types.NestedField.required(10, "x", Types.StringType.get())); - - Map refs = ImmutableMap.of( - "main", SnapshotRef.branchBuilder(previousSnapshotId).build() - ); - - 
AssertHelpers.assertThrows("Should fail if main branch snapshot ID does not match currentSnapshotId", - IllegalArgumentException.class, "Current snapshot ID does not match main branch", - () -> new TableMetadata(null, 2, UUID.randomUUID().toString(), TEST_LOCATION, - SEQ_NO, System.currentTimeMillis(), 3, - 7, ImmutableList.of(TEST_SCHEMA, schema), - 5, ImmutableList.of(SPEC_5), SPEC_5.lastAssignedFieldId(), - 3, ImmutableList.of(SORT_ORDER_3), ImmutableMap.of("property", "value"), currentSnapshotId, - Arrays.asList(previousSnapshot, currentSnapshot), snapshotLog, ImmutableList.of(), refs, - ImmutableList.of())); + Snapshot currentSnapshot = + new BaseSnapshot( + ops.io(), + currentSnapshotId, + previousSnapshotId, + currentSnapshotId, + null, + null, + 7, + ImmutableList.of( + new GenericManifestFile(localInput("file:/tmp/manfiest.2.avro"), SPEC_5.specId()))); + + List snapshotLog = + ImmutableList.builder() + .add( + new SnapshotLogEntry( + previousSnapshot.timestampMillis(), previousSnapshot.snapshotId())) + .add( + new SnapshotLogEntry( + currentSnapshot.timestampMillis(), currentSnapshot.snapshotId())) + .build(); + + Schema schema = new Schema(6, Types.NestedField.required(10, "x", Types.StringType.get())); + + Map refs = + ImmutableMap.of("main", SnapshotRef.branchBuilder(previousSnapshotId).build()); + + AssertHelpers.assertThrows( + "Should fail if main branch snapshot ID does not match currentSnapshotId", + IllegalArgumentException.class, + "Current snapshot ID does not match main branch", + () -> + new TableMetadata( + null, + 2, + UUID.randomUUID().toString(), + TEST_LOCATION, + SEQ_NO, + System.currentTimeMillis(), + 3, + 7, + ImmutableList.of(TEST_SCHEMA, schema), + 5, + ImmutableList.of(SPEC_5), + SPEC_5.lastAssignedFieldId(), + 3, + ImmutableList.of(SORT_ORDER_3), + ImmutableMap.of("property", "value"), + currentSnapshotId, + Arrays.asList(previousSnapshot, currentSnapshot), + snapshotLog, + ImmutableList.of(), + refs, + ImmutableList.of())); } @Test public void testMainWithoutCurrent() { long snapshotId = System.currentTimeMillis() - new Random(1234).nextInt(3600); - Snapshot snapshot = new BaseSnapshot( - ops.io(), snapshotId, null, snapshotId, null, null, null, - ImmutableList.of(new GenericManifestFile(localInput("file:/tmp/manfiest.1.avro"), SPEC_5.specId()))); - - Schema schema = new Schema(6, - Types.NestedField.required(10, "x", Types.StringType.get())); - - Map refs = ImmutableMap.of( - "main", SnapshotRef.branchBuilder(snapshotId).build() - ); - - AssertHelpers.assertThrows("Should fail if main branch snapshot ID does not match currentSnapshotId", - IllegalArgumentException.class, "Current snapshot is not set, but main branch exists", - () -> new TableMetadata(null, 2, UUID.randomUUID().toString(), TEST_LOCATION, - SEQ_NO, System.currentTimeMillis(), 3, - 7, ImmutableList.of(TEST_SCHEMA, schema), - 5, ImmutableList.of(SPEC_5), SPEC_5.lastAssignedFieldId(), - 3, ImmutableList.of(SORT_ORDER_3), ImmutableMap.of("property", "value"), -1, - ImmutableList.of(snapshot), ImmutableList.of(), ImmutableList.of(), refs, - ImmutableList.of())); + Snapshot snapshot = + new BaseSnapshot( + ops.io(), + snapshotId, + null, + snapshotId, + null, + null, + null, + ImmutableList.of( + new GenericManifestFile(localInput("file:/tmp/manfiest.1.avro"), SPEC_5.specId()))); + + Schema schema = new Schema(6, Types.NestedField.required(10, "x", Types.StringType.get())); + + Map refs = + ImmutableMap.of("main", SnapshotRef.branchBuilder(snapshotId).build()); + + AssertHelpers.assertThrows( + 
"Should fail if main branch snapshot ID does not match currentSnapshotId", + IllegalArgumentException.class, + "Current snapshot is not set, but main branch exists", + () -> + new TableMetadata( + null, + 2, + UUID.randomUUID().toString(), + TEST_LOCATION, + SEQ_NO, + System.currentTimeMillis(), + 3, + 7, + ImmutableList.of(TEST_SCHEMA, schema), + 5, + ImmutableList.of(SPEC_5), + SPEC_5.lastAssignedFieldId(), + 3, + ImmutableList.of(SORT_ORDER_3), + ImmutableMap.of("property", "value"), + -1, + ImmutableList.of(snapshot), + ImmutableList.of(), + ImmutableList.of(), + refs, + ImmutableList.of())); } @Test public void testBranchSnapshotMissing() { long snapshotId = System.currentTimeMillis() - new Random(1234).nextInt(3600); - Schema schema = new Schema(6, - Types.NestedField.required(10, "x", Types.StringType.get())); + Schema schema = new Schema(6, Types.NestedField.required(10, "x", Types.StringType.get())); - Map refs = ImmutableMap.of( - "main", SnapshotRef.branchBuilder(snapshotId).build() - ); + Map refs = + ImmutableMap.of("main", SnapshotRef.branchBuilder(snapshotId).build()); - AssertHelpers.assertThrows("Should fail if main branch snapshot ID does not match currentSnapshotId", - IllegalArgumentException.class, "does not exist in the existing snapshots list", - () -> new TableMetadata(null, 2, UUID.randomUUID().toString(), TEST_LOCATION, - SEQ_NO, System.currentTimeMillis(), 3, - 7, ImmutableList.of(TEST_SCHEMA, schema), - 5, ImmutableList.of(SPEC_5), SPEC_5.lastAssignedFieldId(), - 3, ImmutableList.of(SORT_ORDER_3), ImmutableMap.of("property", "value"), -1, - ImmutableList.of(), ImmutableList.of(), ImmutableList.of(), refs, - ImmutableList.of())); + AssertHelpers.assertThrows( + "Should fail if main branch snapshot ID does not match currentSnapshotId", + IllegalArgumentException.class, + "does not exist in the existing snapshots list", + () -> + new TableMetadata( + null, + 2, + UUID.randomUUID().toString(), + TEST_LOCATION, + SEQ_NO, + System.currentTimeMillis(), + 3, + 7, + ImmutableList.of(TEST_SCHEMA, schema), + 5, + ImmutableList.of(SPEC_5), + SPEC_5.lastAssignedFieldId(), + 3, + ImmutableList.of(SORT_ORDER_3), + ImmutableMap.of("property", "value"), + -1, + ImmutableList.of(), + ImmutableList.of(), + ImmutableList.of(), + refs, + ImmutableList.of())); } private static String toJsonWithoutSpecAndSchemaList(TableMetadata metadata) { @@ -352,7 +531,8 @@ private static String toJsonWithoutSpecAndSchemaList(TableMetadata metadata) { } generator.writeEndObject(); - generator.writeNumberField(CURRENT_SNAPSHOT_ID, + generator.writeNumberField( + CURRENT_SNAPSHOT_ID, metadata.currentSnapshot() != null ? 
metadata.currentSnapshot().snapshotId() : -1); generator.writeArrayFieldStart(SNAPSHOTS); @@ -374,206 +554,420 @@ private static String toJsonWithoutSpecAndSchemaList(TableMetadata metadata) { @Test public void testJsonWithPreviousMetadataLog() throws Exception { long previousSnapshotId = System.currentTimeMillis() - new Random(1234).nextInt(3600); - Snapshot previousSnapshot = new BaseSnapshot( - ops.io(), previousSnapshotId, null, previousSnapshotId, null, null, null, ImmutableList.of( - new GenericManifestFile(localInput("file:/tmp/manfiest.1.avro"), SPEC_5.specId()))); + Snapshot previousSnapshot = + new BaseSnapshot( + ops.io(), + previousSnapshotId, + null, + previousSnapshotId, + null, + null, + null, + ImmutableList.of( + new GenericManifestFile(localInput("file:/tmp/manfiest.1.avro"), SPEC_5.specId()))); long currentSnapshotId = System.currentTimeMillis(); - Snapshot currentSnapshot = new BaseSnapshot( - ops.io(), currentSnapshotId, previousSnapshotId, currentSnapshotId, null, null, null, ImmutableList.of( - new GenericManifestFile(localInput("file:/tmp/manfiest.2.avro"), SPEC_5.specId()))); + Snapshot currentSnapshot = + new BaseSnapshot( + ops.io(), + currentSnapshotId, + previousSnapshotId, + currentSnapshotId, + null, + null, + null, + ImmutableList.of( + new GenericManifestFile(localInput("file:/tmp/manfiest.2.avro"), SPEC_5.specId()))); List reversedSnapshotLog = Lists.newArrayList(); long currentTimestamp = System.currentTimeMillis(); List previousMetadataLog = Lists.newArrayList(); - previousMetadataLog.add(new MetadataLogEntry(currentTimestamp, - "/tmp/000001-" + UUID.randomUUID().toString() + ".metadata.json")); - - TableMetadata base = new TableMetadata(null, 1, UUID.randomUUID().toString(), TEST_LOCATION, - 0, System.currentTimeMillis(), 3, - 7, ImmutableList.of(TEST_SCHEMA), 5, ImmutableList.of(SPEC_5), SPEC_5.lastAssignedFieldId(), - 3, ImmutableList.of(SORT_ORDER_3), ImmutableMap.of("property", "value"), currentSnapshotId, - Arrays.asList(previousSnapshot, currentSnapshot), reversedSnapshotLog, - ImmutableList.copyOf(previousMetadataLog), ImmutableMap.of(), ImmutableList.of()); + previousMetadataLog.add( + new MetadataLogEntry( + currentTimestamp, "/tmp/000001-" + UUID.randomUUID().toString() + ".metadata.json")); + + TableMetadata base = + new TableMetadata( + null, + 1, + UUID.randomUUID().toString(), + TEST_LOCATION, + 0, + System.currentTimeMillis(), + 3, + 7, + ImmutableList.of(TEST_SCHEMA), + 5, + ImmutableList.of(SPEC_5), + SPEC_5.lastAssignedFieldId(), + 3, + ImmutableList.of(SORT_ORDER_3), + ImmutableMap.of("property", "value"), + currentSnapshotId, + Arrays.asList(previousSnapshot, currentSnapshot), + reversedSnapshotLog, + ImmutableList.copyOf(previousMetadataLog), + ImmutableMap.of(), + ImmutableList.of()); String asJson = TableMetadataParser.toJson(base); TableMetadata metadataFromJson = TableMetadataParser.fromJson(ops.io(), asJson); - Assert.assertEquals("Metadata logs should match", previousMetadataLog, metadataFromJson.previousFiles()); + Assert.assertEquals( + "Metadata logs should match", previousMetadataLog, metadataFromJson.previousFiles()); } @Test public void testAddPreviousMetadataRemoveNone() { long previousSnapshotId = System.currentTimeMillis() - new Random(1234).nextInt(3600); - Snapshot previousSnapshot = new BaseSnapshot( - ops.io(), previousSnapshotId, null, previousSnapshotId, null, null, null, ImmutableList.of( - new GenericManifestFile(localInput("file:/tmp/manfiest.1.avro"), SPEC_5.specId()))); + Snapshot previousSnapshot = + new 
BaseSnapshot( + ops.io(), + previousSnapshotId, + null, + previousSnapshotId, + null, + null, + null, + ImmutableList.of( + new GenericManifestFile(localInput("file:/tmp/manfiest.1.avro"), SPEC_5.specId()))); long currentSnapshotId = System.currentTimeMillis(); - Snapshot currentSnapshot = new BaseSnapshot( - ops.io(), currentSnapshotId, previousSnapshotId, currentSnapshotId, null, null, null, ImmutableList.of( - new GenericManifestFile(localInput("file:/tmp/manfiest.2.avro"), SPEC_5.specId()))); + Snapshot currentSnapshot = + new BaseSnapshot( + ops.io(), + currentSnapshotId, + previousSnapshotId, + currentSnapshotId, + null, + null, + null, + ImmutableList.of( + new GenericManifestFile(localInput("file:/tmp/manfiest.2.avro"), SPEC_5.specId()))); List reversedSnapshotLog = Lists.newArrayList(); - reversedSnapshotLog.add(new SnapshotLogEntry(previousSnapshot.timestampMillis(), previousSnapshotId)); - reversedSnapshotLog.add(new SnapshotLogEntry(currentSnapshot.timestampMillis(), currentSnapshotId)); + reversedSnapshotLog.add( + new SnapshotLogEntry(previousSnapshot.timestampMillis(), previousSnapshotId)); + reversedSnapshotLog.add( + new SnapshotLogEntry(currentSnapshot.timestampMillis(), currentSnapshotId)); long currentTimestamp = System.currentTimeMillis(); List previousMetadataLog = Lists.newArrayList(); - previousMetadataLog.add(new MetadataLogEntry(currentTimestamp - 100, - "/tmp/000001-" + UUID.randomUUID().toString() + ".metadata.json")); - previousMetadataLog.add(new MetadataLogEntry(currentTimestamp - 90, - "/tmp/000002-" + UUID.randomUUID().toString() + ".metadata.json")); - - MetadataLogEntry latestPreviousMetadata = new MetadataLogEntry(currentTimestamp - 80, - "/tmp/000003-" + UUID.randomUUID().toString() + ".metadata.json"); - - TableMetadata base = new TableMetadata(latestPreviousMetadata.file(), 1, UUID.randomUUID().toString(), - TEST_LOCATION, 0, currentTimestamp - 80, 3, - 7, ImmutableList.of(TEST_SCHEMA), 5, ImmutableList.of(SPEC_5), SPEC_5.lastAssignedFieldId(), - 3, ImmutableList.of(SORT_ORDER_3), ImmutableMap.of("property", "value"), currentSnapshotId, - Arrays.asList(previousSnapshot, currentSnapshot), reversedSnapshotLog, - ImmutableList.copyOf(previousMetadataLog), ImmutableMap.of(), ImmutableList.of()); + previousMetadataLog.add( + new MetadataLogEntry( + currentTimestamp - 100, + "/tmp/000001-" + UUID.randomUUID().toString() + ".metadata.json")); + previousMetadataLog.add( + new MetadataLogEntry( + currentTimestamp - 90, + "/tmp/000002-" + UUID.randomUUID().toString() + ".metadata.json")); + + MetadataLogEntry latestPreviousMetadata = + new MetadataLogEntry( + currentTimestamp - 80, + "/tmp/000003-" + UUID.randomUUID().toString() + ".metadata.json"); + + TableMetadata base = + new TableMetadata( + latestPreviousMetadata.file(), + 1, + UUID.randomUUID().toString(), + TEST_LOCATION, + 0, + currentTimestamp - 80, + 3, + 7, + ImmutableList.of(TEST_SCHEMA), + 5, + ImmutableList.of(SPEC_5), + SPEC_5.lastAssignedFieldId(), + 3, + ImmutableList.of(SORT_ORDER_3), + ImmutableMap.of("property", "value"), + currentSnapshotId, + Arrays.asList(previousSnapshot, currentSnapshot), + reversedSnapshotLog, + ImmutableList.copyOf(previousMetadataLog), + ImmutableMap.of(), + ImmutableList.of()); previousMetadataLog.add(latestPreviousMetadata); - TableMetadata metadata = base.replaceProperties( - ImmutableMap.of(TableProperties.METADATA_PREVIOUS_VERSIONS_MAX, "5")); + TableMetadata metadata = + base.replaceProperties( + ImmutableMap.of(TableProperties.METADATA_PREVIOUS_VERSIONS_MAX, 
"5")); Set removedPreviousMetadata = Sets.newHashSet(base.previousFiles()); removedPreviousMetadata.removeAll(metadata.previousFiles()); - Assert.assertEquals("Metadata logs should match", previousMetadataLog, metadata.previousFiles()); + Assert.assertEquals( + "Metadata logs should match", previousMetadataLog, metadata.previousFiles()); Assert.assertEquals("Removed Metadata logs should be empty", 0, removedPreviousMetadata.size()); } @Test public void testAddPreviousMetadataRemoveOne() { long previousSnapshotId = System.currentTimeMillis() - new Random(1234).nextInt(3600); - Snapshot previousSnapshot = new BaseSnapshot( - ops.io(), previousSnapshotId, null, previousSnapshotId, null, null, null, ImmutableList.of( - new GenericManifestFile(localInput("file:/tmp/manfiest.1.avro"), SPEC_5.specId()))); + Snapshot previousSnapshot = + new BaseSnapshot( + ops.io(), + previousSnapshotId, + null, + previousSnapshotId, + null, + null, + null, + ImmutableList.of( + new GenericManifestFile(localInput("file:/tmp/manfiest.1.avro"), SPEC_5.specId()))); long currentSnapshotId = System.currentTimeMillis(); - Snapshot currentSnapshot = new BaseSnapshot( - ops.io(), currentSnapshotId, previousSnapshotId, currentSnapshotId, null, null, null, ImmutableList.of( - new GenericManifestFile(localInput("file:/tmp/manfiest.2.avro"), SPEC_5.specId()))); + Snapshot currentSnapshot = + new BaseSnapshot( + ops.io(), + currentSnapshotId, + previousSnapshotId, + currentSnapshotId, + null, + null, + null, + ImmutableList.of( + new GenericManifestFile(localInput("file:/tmp/manfiest.2.avro"), SPEC_5.specId()))); List reversedSnapshotLog = Lists.newArrayList(); - reversedSnapshotLog.add(new SnapshotLogEntry(previousSnapshot.timestampMillis(), previousSnapshotId)); - reversedSnapshotLog.add(new SnapshotLogEntry(currentSnapshot.timestampMillis(), currentSnapshotId)); + reversedSnapshotLog.add( + new SnapshotLogEntry(previousSnapshot.timestampMillis(), previousSnapshotId)); + reversedSnapshotLog.add( + new SnapshotLogEntry(currentSnapshot.timestampMillis(), currentSnapshotId)); long currentTimestamp = System.currentTimeMillis(); List previousMetadataLog = Lists.newArrayList(); - previousMetadataLog.add(new MetadataLogEntry(currentTimestamp - 100, - "/tmp/000001-" + UUID.randomUUID().toString() + ".metadata.json")); - previousMetadataLog.add(new MetadataLogEntry(currentTimestamp - 90, - "/tmp/000002-" + UUID.randomUUID().toString() + ".metadata.json")); - previousMetadataLog.add(new MetadataLogEntry(currentTimestamp - 80, - "/tmp/000003-" + UUID.randomUUID().toString() + ".metadata.json")); - previousMetadataLog.add(new MetadataLogEntry(currentTimestamp - 70, - "/tmp/000004-" + UUID.randomUUID().toString() + ".metadata.json")); - previousMetadataLog.add(new MetadataLogEntry(currentTimestamp - 60, - "/tmp/000005-" + UUID.randomUUID().toString() + ".metadata.json")); - - MetadataLogEntry latestPreviousMetadata = new MetadataLogEntry(currentTimestamp - 50, - "/tmp/000006-" + UUID.randomUUID().toString() + ".metadata.json"); - - TableMetadata base = new TableMetadata(latestPreviousMetadata.file(), 1, UUID.randomUUID().toString(), - TEST_LOCATION, 0, currentTimestamp - 50, 3, - 7, ImmutableList.of(TEST_SCHEMA), 5, - ImmutableList.of(SPEC_5), SPEC_5.lastAssignedFieldId(), 3, ImmutableList.of(SORT_ORDER_3), - ImmutableMap.of("property", "value"), currentSnapshotId, - Arrays.asList(previousSnapshot, currentSnapshot), reversedSnapshotLog, - ImmutableList.copyOf(previousMetadataLog), ImmutableMap.of(), ImmutableList.of()); + 
previousMetadataLog.add( + new MetadataLogEntry( + currentTimestamp - 100, + "/tmp/000001-" + UUID.randomUUID().toString() + ".metadata.json")); + previousMetadataLog.add( + new MetadataLogEntry( + currentTimestamp - 90, + "/tmp/000002-" + UUID.randomUUID().toString() + ".metadata.json")); + previousMetadataLog.add( + new MetadataLogEntry( + currentTimestamp - 80, + "/tmp/000003-" + UUID.randomUUID().toString() + ".metadata.json")); + previousMetadataLog.add( + new MetadataLogEntry( + currentTimestamp - 70, + "/tmp/000004-" + UUID.randomUUID().toString() + ".metadata.json")); + previousMetadataLog.add( + new MetadataLogEntry( + currentTimestamp - 60, + "/tmp/000005-" + UUID.randomUUID().toString() + ".metadata.json")); + + MetadataLogEntry latestPreviousMetadata = + new MetadataLogEntry( + currentTimestamp - 50, + "/tmp/000006-" + UUID.randomUUID().toString() + ".metadata.json"); + + TableMetadata base = + new TableMetadata( + latestPreviousMetadata.file(), + 1, + UUID.randomUUID().toString(), + TEST_LOCATION, + 0, + currentTimestamp - 50, + 3, + 7, + ImmutableList.of(TEST_SCHEMA), + 5, + ImmutableList.of(SPEC_5), + SPEC_5.lastAssignedFieldId(), + 3, + ImmutableList.of(SORT_ORDER_3), + ImmutableMap.of("property", "value"), + currentSnapshotId, + Arrays.asList(previousSnapshot, currentSnapshot), + reversedSnapshotLog, + ImmutableList.copyOf(previousMetadataLog), + ImmutableMap.of(), + ImmutableList.of()); previousMetadataLog.add(latestPreviousMetadata); - TableMetadata metadata = base.replaceProperties( - ImmutableMap.of(TableProperties.METADATA_PREVIOUS_VERSIONS_MAX, "5")); + TableMetadata metadata = + base.replaceProperties( + ImmutableMap.of(TableProperties.METADATA_PREVIOUS_VERSIONS_MAX, "5")); SortedSet removedPreviousMetadata = Sets.newTreeSet(Comparator.comparingLong(MetadataLogEntry::timestampMillis)); removedPreviousMetadata.addAll(base.previousFiles()); removedPreviousMetadata.removeAll(metadata.previousFiles()); - Assert.assertEquals("Metadata logs should match", previousMetadataLog.subList(1, 6), - metadata.previousFiles()); - Assert.assertEquals("Removed Metadata logs should contain 1", previousMetadataLog.subList(0, 1), + Assert.assertEquals( + "Metadata logs should match", previousMetadataLog.subList(1, 6), metadata.previousFiles()); + Assert.assertEquals( + "Removed Metadata logs should contain 1", + previousMetadataLog.subList(0, 1), ImmutableList.copyOf(removedPreviousMetadata)); } @Test public void testAddPreviousMetadataRemoveMultiple() { long previousSnapshotId = System.currentTimeMillis() - new Random(1234).nextInt(3600); - Snapshot previousSnapshot = new BaseSnapshot( - ops.io(), previousSnapshotId, null, previousSnapshotId, null, null, null, ImmutableList.of( - new GenericManifestFile(localInput("file:/tmp/manfiest.1.avro"), SPEC_5.specId()))); + Snapshot previousSnapshot = + new BaseSnapshot( + ops.io(), + previousSnapshotId, + null, + previousSnapshotId, + null, + null, + null, + ImmutableList.of( + new GenericManifestFile(localInput("file:/tmp/manfiest.1.avro"), SPEC_5.specId()))); long currentSnapshotId = System.currentTimeMillis(); - Snapshot currentSnapshot = new BaseSnapshot( - ops.io(), currentSnapshotId, previousSnapshotId, currentSnapshotId, null, null, null, ImmutableList.of( - new GenericManifestFile(localInput("file:/tmp/manfiest.2.avro"), SPEC_5.specId()))); + Snapshot currentSnapshot = + new BaseSnapshot( + ops.io(), + currentSnapshotId, + previousSnapshotId, + currentSnapshotId, + null, + null, + null, + ImmutableList.of( + new 
GenericManifestFile(localInput("file:/tmp/manfiest.2.avro"), SPEC_5.specId()))); List reversedSnapshotLog = Lists.newArrayList(); - reversedSnapshotLog.add(new SnapshotLogEntry(previousSnapshot.timestampMillis(), previousSnapshotId)); - reversedSnapshotLog.add(new SnapshotLogEntry(currentSnapshot.timestampMillis(), currentSnapshotId)); + reversedSnapshotLog.add( + new SnapshotLogEntry(previousSnapshot.timestampMillis(), previousSnapshotId)); + reversedSnapshotLog.add( + new SnapshotLogEntry(currentSnapshot.timestampMillis(), currentSnapshotId)); long currentTimestamp = System.currentTimeMillis(); List previousMetadataLog = Lists.newArrayList(); - previousMetadataLog.add(new MetadataLogEntry(currentTimestamp - 100, - "/tmp/000001-" + UUID.randomUUID().toString() + ".metadata.json")); - previousMetadataLog.add(new MetadataLogEntry(currentTimestamp - 90, - "/tmp/000002-" + UUID.randomUUID().toString() + ".metadata.json")); - previousMetadataLog.add(new MetadataLogEntry(currentTimestamp - 80, - "/tmp/000003-" + UUID.randomUUID().toString() + ".metadata.json")); - previousMetadataLog.add(new MetadataLogEntry(currentTimestamp - 70, - "/tmp/000004-" + UUID.randomUUID().toString() + ".metadata.json")); - previousMetadataLog.add(new MetadataLogEntry(currentTimestamp - 60, - "/tmp/000005-" + UUID.randomUUID().toString() + ".metadata.json")); - - MetadataLogEntry latestPreviousMetadata = new MetadataLogEntry(currentTimestamp - 50, - "/tmp/000006-" + UUID.randomUUID().toString() + ".metadata.json"); - - TableMetadata base = new TableMetadata(latestPreviousMetadata.file(), 1, UUID.randomUUID().toString(), - TEST_LOCATION, 0, currentTimestamp - 50, 3, 7, ImmutableList.of(TEST_SCHEMA), SPEC_5.specId(), - ImmutableList.of(SPEC_5), SPEC_5.lastAssignedFieldId(), - SortOrder.unsorted().orderId(), ImmutableList.of(SortOrder.unsorted()), - ImmutableMap.of("property", "value"), currentSnapshotId, - Arrays.asList(previousSnapshot, currentSnapshot), reversedSnapshotLog, - ImmutableList.copyOf(previousMetadataLog), ImmutableMap.of(), ImmutableList.of()); + previousMetadataLog.add( + new MetadataLogEntry( + currentTimestamp - 100, + "/tmp/000001-" + UUID.randomUUID().toString() + ".metadata.json")); + previousMetadataLog.add( + new MetadataLogEntry( + currentTimestamp - 90, + "/tmp/000002-" + UUID.randomUUID().toString() + ".metadata.json")); + previousMetadataLog.add( + new MetadataLogEntry( + currentTimestamp - 80, + "/tmp/000003-" + UUID.randomUUID().toString() + ".metadata.json")); + previousMetadataLog.add( + new MetadataLogEntry( + currentTimestamp - 70, + "/tmp/000004-" + UUID.randomUUID().toString() + ".metadata.json")); + previousMetadataLog.add( + new MetadataLogEntry( + currentTimestamp - 60, + "/tmp/000005-" + UUID.randomUUID().toString() + ".metadata.json")); + + MetadataLogEntry latestPreviousMetadata = + new MetadataLogEntry( + currentTimestamp - 50, + "/tmp/000006-" + UUID.randomUUID().toString() + ".metadata.json"); + + TableMetadata base = + new TableMetadata( + latestPreviousMetadata.file(), + 1, + UUID.randomUUID().toString(), + TEST_LOCATION, + 0, + currentTimestamp - 50, + 3, + 7, + ImmutableList.of(TEST_SCHEMA), + SPEC_5.specId(), + ImmutableList.of(SPEC_5), + SPEC_5.lastAssignedFieldId(), + SortOrder.unsorted().orderId(), + ImmutableList.of(SortOrder.unsorted()), + ImmutableMap.of("property", "value"), + currentSnapshotId, + Arrays.asList(previousSnapshot, currentSnapshot), + reversedSnapshotLog, + ImmutableList.copyOf(previousMetadataLog), + ImmutableMap.of(), + ImmutableList.of()); 
previousMetadataLog.add(latestPreviousMetadata); - TableMetadata metadata = base.replaceProperties( - ImmutableMap.of(TableProperties.METADATA_PREVIOUS_VERSIONS_MAX, "2")); + TableMetadata metadata = + base.replaceProperties( + ImmutableMap.of(TableProperties.METADATA_PREVIOUS_VERSIONS_MAX, "2")); SortedSet removedPreviousMetadata = Sets.newTreeSet(Comparator.comparingLong(MetadataLogEntry::timestampMillis)); removedPreviousMetadata.addAll(base.previousFiles()); removedPreviousMetadata.removeAll(metadata.previousFiles()); - Assert.assertEquals("Metadata logs should match", previousMetadataLog.subList(4, 6), - metadata.previousFiles()); - Assert.assertEquals("Removed Metadata logs should contain 4", previousMetadataLog.subList(0, 4), + Assert.assertEquals( + "Metadata logs should match", previousMetadataLog.subList(4, 6), metadata.previousFiles()); + Assert.assertEquals( + "Removed Metadata logs should contain 4", + previousMetadataLog.subList(0, 4), ImmutableList.copyOf(removedPreviousMetadata)); } @Test public void testV2UUIDValidation() { - AssertHelpers.assertThrows("Should reject v2 metadata without a UUID", - IllegalArgumentException.class, "UUID is required in format v2", - () -> new TableMetadata(null, 2, null, TEST_LOCATION, SEQ_NO, System.currentTimeMillis(), - LAST_ASSIGNED_COLUMN_ID, 7, ImmutableList.of(TEST_SCHEMA), - SPEC_5.specId(), ImmutableList.of(SPEC_5), SPEC_5.lastAssignedFieldId(), - 3, ImmutableList.of(SORT_ORDER_3), ImmutableMap.of(), -1L, - ImmutableList.of(), ImmutableList.of(), ImmutableList.of(), ImmutableMap.of(), ImmutableList.of()) - ); + AssertHelpers.assertThrows( + "Should reject v2 metadata without a UUID", + IllegalArgumentException.class, + "UUID is required in format v2", + () -> + new TableMetadata( + null, + 2, + null, + TEST_LOCATION, + SEQ_NO, + System.currentTimeMillis(), + LAST_ASSIGNED_COLUMN_ID, + 7, + ImmutableList.of(TEST_SCHEMA), + SPEC_5.specId(), + ImmutableList.of(SPEC_5), + SPEC_5.lastAssignedFieldId(), + 3, + ImmutableList.of(SORT_ORDER_3), + ImmutableMap.of(), + -1L, + ImmutableList.of(), + ImmutableList.of(), + ImmutableList.of(), + ImmutableMap.of(), + ImmutableList.of())); } @Test public void testVersionValidation() { int unsupportedVersion = TableMetadata.SUPPORTED_TABLE_FORMAT_VERSION + 1; - AssertHelpers.assertThrows("Should reject unsupported metadata", - IllegalArgumentException.class, "Unsupported format version: v" + unsupportedVersion, - () -> new TableMetadata(null, unsupportedVersion, null, TEST_LOCATION, SEQ_NO, - System.currentTimeMillis(), LAST_ASSIGNED_COLUMN_ID, - 7, ImmutableList.of(TEST_SCHEMA), SPEC_5.specId(), ImmutableList.of(SPEC_5), - SPEC_5.lastAssignedFieldId(), 3, ImmutableList.of(SORT_ORDER_3), ImmutableMap.of(), -1L, - ImmutableList.of(), ImmutableList.of(), ImmutableList.of(), ImmutableMap.of(), ImmutableList.of()) - ); + AssertHelpers.assertThrows( + "Should reject unsupported metadata", + IllegalArgumentException.class, + "Unsupported format version: v" + unsupportedVersion, + () -> + new TableMetadata( + null, + unsupportedVersion, + null, + TEST_LOCATION, + SEQ_NO, + System.currentTimeMillis(), + LAST_ASSIGNED_COLUMN_ID, + 7, + ImmutableList.of(TEST_SCHEMA), + SPEC_5.specId(), + ImmutableList.of(SPEC_5), + SPEC_5.lastAssignedFieldId(), + 3, + ImmutableList.of(SORT_ORDER_3), + ImmutableMap.of(), + -1L, + ImmutableList.of(), + ImmutableList.of(), + ImmutableList.of(), + ImmutableMap.of(), + ImmutableList.of())); } @Test @@ -587,56 +981,63 @@ public void testParserVersionValidation() throws Exception { 
Assert.assertNotNull("Should successfully read supported metadata version", parsed2); String unsupportedVersion = readTableMetadataInputFile("TableMetadataUnsupportedVersion.json"); - AssertHelpers.assertThrows("Should not read unsupported metadata", - IllegalArgumentException.class, "Cannot read unsupported version", - () -> TableMetadataParser.fromJson(ops.io(), unsupportedVersion) - ); + AssertHelpers.assertThrows( + "Should not read unsupported metadata", + IllegalArgumentException.class, + "Cannot read unsupported version", + () -> TableMetadataParser.fromJson(ops.io(), unsupportedVersion)); } - @Test public void testParserV2PartitionSpecsValidation() throws Exception { - String unsupportedVersion = readTableMetadataInputFile("TableMetadataV2MissingPartitionSpecs.json"); - AssertHelpers.assertThrows("Should reject v2 metadata without partition specs", - IllegalArgumentException.class, "partition-specs must exist in format v2", - () -> TableMetadataParser.fromJson(ops.io(), unsupportedVersion) - ); + String unsupportedVersion = + readTableMetadataInputFile("TableMetadataV2MissingPartitionSpecs.json"); + AssertHelpers.assertThrows( + "Should reject v2 metadata without partition specs", + IllegalArgumentException.class, + "partition-specs must exist in format v2", + () -> TableMetadataParser.fromJson(ops.io(), unsupportedVersion)); } @Test public void testParserV2LastAssignedFieldIdValidation() throws Exception { - String unsupportedVersion = readTableMetadataInputFile("TableMetadataV2MissingLastPartitionId.json"); - AssertHelpers.assertThrows("Should reject v2 metadata without last assigned partition field id", - IllegalArgumentException.class, "last-partition-id must exist in format v2", - () -> TableMetadataParser.fromJson(ops.io(), unsupportedVersion) - ); + String unsupportedVersion = + readTableMetadataInputFile("TableMetadataV2MissingLastPartitionId.json"); + AssertHelpers.assertThrows( + "Should reject v2 metadata without last assigned partition field id", + IllegalArgumentException.class, + "last-partition-id must exist in format v2", + () -> TableMetadataParser.fromJson(ops.io(), unsupportedVersion)); } @Test public void testParserV2SortOrderValidation() throws Exception { String unsupportedVersion = readTableMetadataInputFile("TableMetadataV2MissingSortOrder.json"); - AssertHelpers.assertThrows("Should reject v2 metadata without sort order", - IllegalArgumentException.class, "sort-orders must exist in format v2", - () -> TableMetadataParser.fromJson(ops.io(), unsupportedVersion) - ); + AssertHelpers.assertThrows( + "Should reject v2 metadata without sort order", + IllegalArgumentException.class, + "sort-orders must exist in format v2", + () -> TableMetadataParser.fromJson(ops.io(), unsupportedVersion)); } @Test public void testParserV2CurrentSchemaIdValidation() throws Exception { String unsupported = readTableMetadataInputFile("TableMetadataV2CurrentSchemaNotFound.json"); - AssertHelpers.assertThrows("Should reject v2 metadata without valid schema id", - IllegalArgumentException.class, "Cannot find schema with current-schema-id=2 from schemas", - () -> TableMetadataParser.fromJson(ops.io(), unsupported) - ); + AssertHelpers.assertThrows( + "Should reject v2 metadata without valid schema id", + IllegalArgumentException.class, + "Cannot find schema with current-schema-id=2 from schemas", + () -> TableMetadataParser.fromJson(ops.io(), unsupported)); } @Test public void testParserV2SchemasValidation() throws Exception { String unsupported = 
readTableMetadataInputFile("TableMetadataV2MissingSchemas.json"); - AssertHelpers.assertThrows("Should reject v2 metadata without schemas", - IllegalArgumentException.class, "schemas must exist in format v2", - () -> TableMetadataParser.fromJson(ops.io(), unsupported) - ); + AssertHelpers.assertThrows( + "Should reject v2 metadata without schemas", + IllegalArgumentException.class, + "schemas must exist in format v2", + () -> TableMetadataParser.fromJson(ops.io(), unsupported)); } private String readTableMetadataInputFile(String fileName) throws Exception { @@ -646,146 +1047,162 @@ private String readTableMetadataInputFile(String fileName) throws Exception { @Test public void testNewTableMetadataReassignmentAllIds() throws Exception { - Schema schema = new Schema( - Types.NestedField.required(3, "x", Types.LongType.get()), - Types.NestedField.required(4, "y", Types.LongType.get()), - Types.NestedField.required(5, "z", Types.LongType.get()) - ); - - PartitionSpec spec = PartitionSpec.builderFor(schema).withSpecId(5) - .add(3, 1005, "x_partition", "bucket[4]") - .add(5, 1003, "z_partition", "bucket[8]") - .build(); + Schema schema = + new Schema( + Types.NestedField.required(3, "x", Types.LongType.get()), + Types.NestedField.required(4, "y", Types.LongType.get()), + Types.NestedField.required(5, "z", Types.LongType.get())); + + PartitionSpec spec = + PartitionSpec.builderFor(schema) + .withSpecId(5) + .add(3, 1005, "x_partition", "bucket[4]") + .add(5, 1003, "z_partition", "bucket[8]") + .build(); String location = "file://tmp/db/table"; - TableMetadata metadata = TableMetadata.newTableMetadata(schema, spec, location, ImmutableMap.of()); + TableMetadata metadata = + TableMetadata.newTableMetadata(schema, spec, location, ImmutableMap.of()); // newTableMetadata should reassign column ids and partition field ids. 
- PartitionSpec expected = PartitionSpec.builderFor(metadata.schema()).withSpecId(0) - .add(1, 1000, "x_partition", "bucket[4]") - .add(3, 1001, "z_partition", "bucket[8]") - .build(); + PartitionSpec expected = + PartitionSpec.builderFor(metadata.schema()) + .withSpecId(0) + .add(1, 1000, "x_partition", "bucket[4]") + .add(3, 1001, "z_partition", "bucket[8]") + .build(); Assert.assertEquals(expected, metadata.spec()); } @Test public void testInvalidUpdatePartitionSpecForV1Table() throws Exception { - Schema schema = new Schema( - Types.NestedField.required(1, "x", Types.LongType.get()) - ); + Schema schema = new Schema(Types.NestedField.required(1, "x", Types.LongType.get())); - PartitionSpec spec = PartitionSpec.builderFor(schema).withSpecId(5) - .add(1, 1005, "x_partition", "bucket[4]") - .build(); + PartitionSpec spec = + PartitionSpec.builderFor(schema) + .withSpecId(5) + .add(1, 1005, "x_partition", "bucket[4]") + .build(); String location = "file://tmp/db/table"; - TableMetadata metadata = TableMetadata.newTableMetadata( - schema, PartitionSpec.unpartitioned(), location, ImmutableMap.of()); - - AssertHelpers.assertThrows("Should fail to update an invalid partition spec", - ValidationException.class, "Spec does not use sequential IDs that are required in v1", + TableMetadata metadata = + TableMetadata.newTableMetadata( + schema, PartitionSpec.unpartitioned(), location, ImmutableMap.of()); + + AssertHelpers.assertThrows( + "Should fail to update an invalid partition spec", + ValidationException.class, + "Spec does not use sequential IDs that are required in v1", () -> metadata.updatePartitionSpec(spec)); } @Test public void testBuildReplacementForV1Table() { - Schema schema = new Schema( - Types.NestedField.required(1, "x", Types.LongType.get()), - Types.NestedField.required(2, "y", Types.LongType.get()) - ); - PartitionSpec spec = PartitionSpec.builderFor(schema).withSpecId(0) - .identity("x") - .identity("y") - .build(); + Schema schema = + new Schema( + Types.NestedField.required(1, "x", Types.LongType.get()), + Types.NestedField.required(2, "y", Types.LongType.get())); + PartitionSpec spec = + PartitionSpec.builderFor(schema).withSpecId(0).identity("x").identity("y").build(); String location = "file://tmp/db/table"; - TableMetadata metadata = TableMetadata.newTableMetadata( - schema, spec, SortOrder.unsorted(), location, ImmutableMap.of(), 1); + TableMetadata metadata = + TableMetadata.newTableMetadata( + schema, spec, SortOrder.unsorted(), location, ImmutableMap.of(), 1); Assert.assertEquals(spec, metadata.spec()); - Schema updatedSchema = new Schema( - Types.NestedField.required(1, "x", Types.LongType.get()), - Types.NestedField.required(2, "z", Types.StringType.get()), - Types.NestedField.required(3, "y", Types.LongType.get()) - ); - PartitionSpec updatedSpec = PartitionSpec.builderFor(updatedSchema).withSpecId(0) - .bucket("z", 8) - .identity("x") - .build(); - TableMetadata updated = metadata.buildReplacement( - updatedSchema, updatedSpec, SortOrder.unsorted(), location, ImmutableMap.of()); - PartitionSpec expected = PartitionSpec.builderFor(updated.schema()).withSpecId(1) - .add(1, 1000, "x", "identity") - .add(2, 1001, "y", "void") - .add(3, 1002, "z_bucket", "bucket[8]") - .build(); + Schema updatedSchema = + new Schema( + Types.NestedField.required(1, "x", Types.LongType.get()), + Types.NestedField.required(2, "z", Types.StringType.get()), + Types.NestedField.required(3, "y", Types.LongType.get())); + PartitionSpec updatedSpec = + 
PartitionSpec.builderFor(updatedSchema).withSpecId(0).bucket("z", 8).identity("x").build(); + TableMetadata updated = + metadata.buildReplacement( + updatedSchema, updatedSpec, SortOrder.unsorted(), location, ImmutableMap.of()); + PartitionSpec expected = + PartitionSpec.builderFor(updated.schema()) + .withSpecId(1) + .add(1, 1000, "x", "identity") + .add(2, 1001, "y", "void") + .add(3, 1002, "z_bucket", "bucket[8]") + .build(); Assert.assertEquals( "Should reassign the partition field IDs and reuse any existing IDs for equivalent fields", - expected, updated.spec()); + expected, + updated.spec()); } @Test public void testBuildReplacementForV2Table() { - Schema schema = new Schema( - Types.NestedField.required(1, "x", Types.LongType.get()), - Types.NestedField.required(2, "y", Types.LongType.get()) - ); - PartitionSpec spec = PartitionSpec.builderFor(schema).withSpecId(0) - .identity("x") - .identity("y") - .build(); + Schema schema = + new Schema( + Types.NestedField.required(1, "x", Types.LongType.get()), + Types.NestedField.required(2, "y", Types.LongType.get())); + PartitionSpec spec = + PartitionSpec.builderFor(schema).withSpecId(0).identity("x").identity("y").build(); String location = "file://tmp/db/table"; - TableMetadata metadata = TableMetadata.newTableMetadata( - schema, spec, SortOrder.unsorted(), location, ImmutableMap.of(), 2); + TableMetadata metadata = + TableMetadata.newTableMetadata( + schema, spec, SortOrder.unsorted(), location, ImmutableMap.of(), 2); Assert.assertEquals(spec, metadata.spec()); - Schema updatedSchema = new Schema( - Types.NestedField.required(1, "x", Types.LongType.get()), - Types.NestedField.required(2, "z", Types.StringType.get()) - ); - PartitionSpec updatedSpec = PartitionSpec.builderFor(updatedSchema).withSpecId(0) - .bucket("z", 8) - .identity("x") - .build(); - TableMetadata updated = metadata.buildReplacement( - updatedSchema, updatedSpec, SortOrder.unsorted(), location, ImmutableMap.of()); - PartitionSpec expected = PartitionSpec.builderFor(updated.schema()).withSpecId(1) - .add(3, 1002, "z_bucket", "bucket[8]") - .add(1, 1000, "x", "identity") - .build(); + Schema updatedSchema = + new Schema( + Types.NestedField.required(1, "x", Types.LongType.get()), + Types.NestedField.required(2, "z", Types.StringType.get())); + PartitionSpec updatedSpec = + PartitionSpec.builderFor(updatedSchema).withSpecId(0).bucket("z", 8).identity("x").build(); + TableMetadata updated = + metadata.buildReplacement( + updatedSchema, updatedSpec, SortOrder.unsorted(), location, ImmutableMap.of()); + PartitionSpec expected = + PartitionSpec.builderFor(updated.schema()) + .withSpecId(1) + .add(3, 1002, "z_bucket", "bucket[8]") + .add(1, 1000, "x", "identity") + .build(); Assert.assertEquals( "Should reassign the partition field IDs and reuse any existing IDs for equivalent fields", - expected, updated.spec()); + expected, + updated.spec()); } @Test public void testSortOrder() { - Schema schema = new Schema( - Types.NestedField.required(10, "x", Types.StringType.get()) - ); + Schema schema = new Schema(Types.NestedField.required(10, "x", Types.StringType.get())); - TableMetadata meta = TableMetadata.newTableMetadata( - schema, PartitionSpec.unpartitioned(), null, ImmutableMap.of()); + TableMetadata meta = + TableMetadata.newTableMetadata( + schema, PartitionSpec.unpartitioned(), null, ImmutableMap.of()); Assert.assertTrue("Should default to unsorted order", meta.sortOrder().isUnsorted()); - Assert.assertSame("Should detect identical unsorted order", meta, 
meta.replaceSortOrder(SortOrder.unsorted())); + Assert.assertSame( + "Should detect identical unsorted order", + meta, + meta.replaceSortOrder(SortOrder.unsorted())); } @Test public void testUpdateSortOrder() { - Schema schema = new Schema( - Types.NestedField.required(10, "x", Types.StringType.get()) - ); + Schema schema = new Schema(Types.NestedField.required(10, "x", Types.StringType.get())); SortOrder order = SortOrder.builderFor(schema).asc("x").build(); - TableMetadata sortedByX = TableMetadata.newTableMetadata( - schema, PartitionSpec.unpartitioned(), order, null, ImmutableMap.of()); + TableMetadata sortedByX = + TableMetadata.newTableMetadata( + schema, PartitionSpec.unpartitioned(), order, null, ImmutableMap.of()); Assert.assertEquals("Should have 1 sort order", 1, sortedByX.sortOrders().size()); Assert.assertEquals("Should use orderId 1", 1, sortedByX.sortOrder().orderId()); Assert.assertEquals("Should be sorted by one field", 1, sortedByX.sortOrder().fields().size()); - Assert.assertEquals("Should use the table's field ids", 1, sortedByX.sortOrder().fields().get(0).sourceId()); - Assert.assertEquals("Should be ascending", - SortDirection.ASC, sortedByX.sortOrder().fields().get(0).direction()); - Assert.assertEquals("Should be nulls first", - NullOrder.NULLS_FIRST, sortedByX.sortOrder().fields().get(0).nullOrder()); + Assert.assertEquals( + "Should use the table's field ids", 1, sortedByX.sortOrder().fields().get(0).sourceId()); + Assert.assertEquals( + "Should be ascending", + SortDirection.ASC, + sortedByX.sortOrder().fields().get(0).direction()); + Assert.assertEquals( + "Should be nulls first", + NullOrder.NULLS_FIRST, + sortedByX.sortOrder().fields().get(0).nullOrder()); // build an equivalent order with the correct schema SortOrder newOrder = SortOrder.builderFor(sortedByX.schema()).asc("x").build(); @@ -798,15 +1215,24 @@ public void testUpdateSortOrder() { Assert.assertEquals("Should use orderId 0", 0, unsorted.sortOrder().orderId()); Assert.assertTrue("Should be unsorted", unsorted.sortOrder().isUnsorted()); - TableMetadata sortedByXDesc = unsorted.replaceSortOrder(SortOrder.builderFor(unsorted.schema()).desc("x").build()); + TableMetadata sortedByXDesc = + unsorted.replaceSortOrder(SortOrder.builderFor(unsorted.schema()).desc("x").build()); Assert.assertEquals("Should have 3 sort orders", 3, sortedByXDesc.sortOrders().size()); Assert.assertEquals("Should use orderId 2", 2, sortedByXDesc.sortOrder().orderId()); - Assert.assertEquals("Should be sorted by one field", 1, sortedByXDesc.sortOrder().fields().size()); - Assert.assertEquals("Should use the table's field ids", 1, sortedByXDesc.sortOrder().fields().get(0).sourceId()); - Assert.assertEquals("Should be ascending", - SortDirection.DESC, sortedByXDesc.sortOrder().fields().get(0).direction()); - Assert.assertEquals("Should be nulls first", - NullOrder.NULLS_FIRST, sortedByX.sortOrder().fields().get(0).nullOrder()); + Assert.assertEquals( + "Should be sorted by one field", 1, sortedByXDesc.sortOrder().fields().size()); + Assert.assertEquals( + "Should use the table's field ids", + 1, + sortedByXDesc.sortOrder().fields().get(0).sourceId()); + Assert.assertEquals( + "Should be ascending", + SortDirection.DESC, + sortedByXDesc.sortOrder().fields().get(0).direction()); + Assert.assertEquals( + "Should be nulls first", + NullOrder.NULLS_FIRST, + sortedByX.sortOrder().fields().get(0).nullOrder()); } @Test @@ -819,17 +1245,16 @@ public void testParseSchemaIdentifierFields() throws Exception { @Test public void 
testUpdateSchemaIdentifierFields() { - Schema schema = new Schema( - Types.NestedField.required(10, "x", Types.StringType.get()) - ); + Schema schema = new Schema(Types.NestedField.required(10, "x", Types.StringType.get())); - TableMetadata meta = TableMetadata.newTableMetadata( - schema, PartitionSpec.unpartitioned(), null, ImmutableMap.of()); + TableMetadata meta = + TableMetadata.newTableMetadata( + schema, PartitionSpec.unpartitioned(), null, ImmutableMap.of()); - Schema newSchema = new Schema( - Lists.newArrayList(Types.NestedField.required(1, "x", Types.StringType.get())), - Sets.newHashSet(1) - ); + Schema newSchema = + new Schema( + Lists.newArrayList(Types.NestedField.required(1, "x", Types.StringType.get())), + Sets.newHashSet(1)); TableMetadata newMeta = meta.updateSchema(newSchema, 1); Assert.assertEquals(2, newMeta.schemas().size()); Assert.assertEquals(Sets.newHashSet(1), newMeta.schema().identifierFieldIds()); @@ -837,148 +1262,197 @@ public void testUpdateSchemaIdentifierFields() { @Test public void testUpdateSchema() { - Schema schema = new Schema(0, - Types.NestedField.required(1, "y", Types.LongType.get(), "comment") - ); - TableMetadata freshTable = TableMetadata.newTableMetadata( - schema, PartitionSpec.unpartitioned(), null, ImmutableMap.of()); - Assert.assertEquals("Should use TableMetadata.INITIAL_SCHEMA_ID for current schema id", - TableMetadata.INITIAL_SCHEMA_ID, freshTable.currentSchemaId()); + Schema schema = + new Schema(0, Types.NestedField.required(1, "y", Types.LongType.get(), "comment")); + TableMetadata freshTable = + TableMetadata.newTableMetadata( + schema, PartitionSpec.unpartitioned(), null, ImmutableMap.of()); + Assert.assertEquals( + "Should use TableMetadata.INITIAL_SCHEMA_ID for current schema id", + TableMetadata.INITIAL_SCHEMA_ID, + freshTable.currentSchemaId()); assertSameSchemaList(ImmutableList.of(schema), freshTable.schemas()); - Assert.assertEquals("Should have expected schema upon return", - schema.asStruct(), freshTable.schema().asStruct()); + Assert.assertEquals( + "Should have expected schema upon return", + schema.asStruct(), + freshTable.schema().asStruct()); Assert.assertEquals("Should return expected last column id", 1, freshTable.lastColumnId()); // update schema - Schema schema2 = new Schema( - Types.NestedField.required(1, "y", Types.LongType.get(), "comment"), - Types.NestedField.required(2, "x", Types.StringType.get()) - ); + Schema schema2 = + new Schema( + Types.NestedField.required(1, "y", Types.LongType.get(), "comment"), + Types.NestedField.required(2, "x", Types.StringType.get())); TableMetadata twoSchemasTable = freshTable.updateSchema(schema2, 2); - Assert.assertEquals("Should have current schema id as 1", - 1, twoSchemasTable.currentSchemaId()); - assertSameSchemaList(ImmutableList.of(schema, new Schema(1, schema2.columns())), - twoSchemasTable.schemas()); - Assert.assertEquals("Should have expected schema upon return", - schema2.asStruct(), twoSchemasTable.schema().asStruct()); + Assert.assertEquals("Should have current schema id as 1", 1, twoSchemasTable.currentSchemaId()); + assertSameSchemaList( + ImmutableList.of(schema, new Schema(1, schema2.columns())), twoSchemasTable.schemas()); + Assert.assertEquals( + "Should have expected schema upon return", + schema2.asStruct(), + twoSchemasTable.schema().asStruct()); Assert.assertEquals("Should return expected last column id", 2, twoSchemasTable.lastColumnId()); // update schema with the the same schema and last column ID as current shouldn't cause change - Schema 
sameSchema2 = new Schema( - Types.NestedField.required(1, "y", Types.LongType.get(), "comment"), - Types.NestedField.required(2, "x", Types.StringType.get()) - ); + Schema sameSchema2 = + new Schema( + Types.NestedField.required(1, "y", Types.LongType.get(), "comment"), + Types.NestedField.required(2, "x", Types.StringType.get())); TableMetadata sameSchemaTable = twoSchemasTable.updateSchema(sameSchema2, 2); - Assert.assertSame("Should return same table metadata", - twoSchemasTable, sameSchemaTable); + Assert.assertSame("Should return same table metadata", twoSchemasTable, sameSchemaTable); - // update schema with the the same schema and different last column ID as current should create a new table + // update schema with the the same schema and different last column ID as current should create + // a new table TableMetadata differentColumnIdTable = sameSchemaTable.updateSchema(sameSchema2, 3); - Assert.assertEquals("Should have current schema id as 1", - 1, differentColumnIdTable.currentSchemaId()); - assertSameSchemaList(ImmutableList.of(schema, new Schema(1, schema2.columns())), + Assert.assertEquals( + "Should have current schema id as 1", 1, differentColumnIdTable.currentSchemaId()); + assertSameSchemaList( + ImmutableList.of(schema, new Schema(1, schema2.columns())), differentColumnIdTable.schemas()); - Assert.assertEquals("Should have expected schema upon return", - schema2.asStruct(), differentColumnIdTable.schema().asStruct()); - Assert.assertEquals("Should return expected last column id", - 3, differentColumnIdTable.lastColumnId()); + Assert.assertEquals( + "Should have expected schema upon return", + schema2.asStruct(), + differentColumnIdTable.schema().asStruct()); + Assert.assertEquals( + "Should return expected last column id", 3, differentColumnIdTable.lastColumnId()); // update schema with old schema does not change schemas TableMetadata revertSchemaTable = differentColumnIdTable.updateSchema(schema, 3); - Assert.assertEquals("Should have current schema id as 0", - 0, revertSchemaTable.currentSchemaId()); - assertSameSchemaList(ImmutableList.of(schema, new Schema(1, schema2.columns())), - revertSchemaTable.schemas()); - Assert.assertEquals("Should have expected schema upon return", - schema.asStruct(), revertSchemaTable.schema().asStruct()); - Assert.assertEquals("Should return expected last column id", - 3, revertSchemaTable.lastColumnId()); + Assert.assertEquals( + "Should have current schema id as 0", 0, revertSchemaTable.currentSchemaId()); + assertSameSchemaList( + ImmutableList.of(schema, new Schema(1, schema2.columns())), revertSchemaTable.schemas()); + Assert.assertEquals( + "Should have expected schema upon return", + schema.asStruct(), + revertSchemaTable.schema().asStruct()); + Assert.assertEquals( + "Should return expected last column id", 3, revertSchemaTable.lastColumnId()); // create new schema will use the largest schema id + 1 - Schema schema3 = new Schema( - Types.NestedField.required(2, "y", Types.LongType.get(), "comment"), - Types.NestedField.required(4, "x", Types.StringType.get()), - Types.NestedField.required(6, "z", Types.IntegerType.get()) - ); + Schema schema3 = + new Schema( + Types.NestedField.required(2, "y", Types.LongType.get(), "comment"), + Types.NestedField.required(4, "x", Types.StringType.get()), + Types.NestedField.required(6, "z", Types.IntegerType.get())); TableMetadata threeSchemaTable = revertSchemaTable.updateSchema(schema3, 6); - Assert.assertEquals("Should have current schema id as 2", - 2, threeSchemaTable.currentSchemaId()); - 
assertSameSchemaList(ImmutableList.of(schema, - new Schema(1, schema2.columns()), - new Schema(2, schema3.columns())), threeSchemaTable.schemas()); - Assert.assertEquals("Should have expected schema upon return", - schema3.asStruct(), threeSchemaTable.schema().asStruct()); - Assert.assertEquals("Should return expected last column id", - 6, threeSchemaTable.lastColumnId()); + Assert.assertEquals( + "Should have current schema id as 2", 2, threeSchemaTable.currentSchemaId()); + assertSameSchemaList( + ImmutableList.of( + schema, new Schema(1, schema2.columns()), new Schema(2, schema3.columns())), + threeSchemaTable.schemas()); + Assert.assertEquals( + "Should have expected schema upon return", + schema3.asStruct(), + threeSchemaTable.schema().asStruct()); + Assert.assertEquals( + "Should return expected last column id", 6, threeSchemaTable.lastColumnId()); } @Test public void testCreateV2MetadataThroughTableProperty() { - Schema schema = new Schema( - Types.NestedField.required(10, "x", Types.StringType.get()) - ); + Schema schema = new Schema(Types.NestedField.required(10, "x", Types.StringType.get())); - TableMetadata meta = TableMetadata.newTableMetadata(schema, PartitionSpec.unpartitioned(), null, - ImmutableMap.of(TableProperties.FORMAT_VERSION, "2", "key", "val")); + TableMetadata meta = + TableMetadata.newTableMetadata( + schema, + PartitionSpec.unpartitioned(), + null, + ImmutableMap.of(TableProperties.FORMAT_VERSION, "2", "key", "val")); - Assert.assertEquals("format version should be configured based on the format-version key", - 2, meta.formatVersion()); - Assert.assertEquals("should not contain format-version in properties", - ImmutableMap.of("key", "val"), meta.properties()); + Assert.assertEquals( + "format version should be configured based on the format-version key", + 2, + meta.formatVersion()); + Assert.assertEquals( + "should not contain format-version in properties", + ImmutableMap.of("key", "val"), + meta.properties()); } @Test public void testReplaceV1MetadataToV2ThroughTableProperty() { - Schema schema = new Schema( - Types.NestedField.required(10, "x", Types.StringType.get()) - ); + Schema schema = new Schema(Types.NestedField.required(10, "x", Types.StringType.get())); + + TableMetadata meta = + TableMetadata.newTableMetadata( + schema, + PartitionSpec.unpartitioned(), + null, + ImmutableMap.of(TableProperties.FORMAT_VERSION, "1", "key", "val")); + + meta = + meta.buildReplacement( + meta.schema(), + meta.spec(), + meta.sortOrder(), + meta.location(), + ImmutableMap.of(TableProperties.FORMAT_VERSION, "2", "key2", "val2")); - TableMetadata meta = TableMetadata.newTableMetadata(schema, PartitionSpec.unpartitioned(), null, - ImmutableMap.of(TableProperties.FORMAT_VERSION, "1", "key", "val")); - - meta = meta.buildReplacement(meta.schema(), meta.spec(), meta.sortOrder(), meta.location(), - ImmutableMap.of(TableProperties.FORMAT_VERSION, "2", "key2", "val2")); - - Assert.assertEquals("format version should be configured based on the format-version key", - 2, meta.formatVersion()); - Assert.assertEquals("should not contain format-version but should contain old and new properties", - ImmutableMap.of("key", "val", "key2", "val2"), meta.properties()); + Assert.assertEquals( + "format version should be configured based on the format-version key", + 2, + meta.formatVersion()); + Assert.assertEquals( + "should not contain format-version but should contain old and new properties", + ImmutableMap.of("key", "val", "key2", "val2"), + meta.properties()); } @Test public void 
testUpgradeV1MetadataToV2ThroughTableProperty() { - Schema schema = new Schema( - Types.NestedField.required(10, "x", Types.StringType.get()) - ); + Schema schema = new Schema(Types.NestedField.required(10, "x", Types.StringType.get())); - TableMetadata meta = TableMetadata.newTableMetadata(schema, PartitionSpec.unpartitioned(), null, - ImmutableMap.of(TableProperties.FORMAT_VERSION, "1", "key", "val")); + TableMetadata meta = + TableMetadata.newTableMetadata( + schema, + PartitionSpec.unpartitioned(), + null, + ImmutableMap.of(TableProperties.FORMAT_VERSION, "1", "key", "val")); - meta = meta.replaceProperties(ImmutableMap.of(TableProperties.FORMAT_VERSION, - "2", "key2", "val2")); + meta = + meta.replaceProperties( + ImmutableMap.of(TableProperties.FORMAT_VERSION, "2", "key2", "val2")); - Assert.assertEquals("format version should be configured based on the format-version key", - 2, meta.formatVersion()); - Assert.assertEquals("should not contain format-version but should contain new properties", - ImmutableMap.of("key2", "val2"), meta.properties()); + Assert.assertEquals( + "format version should be configured based on the format-version key", + 2, + meta.formatVersion()); + Assert.assertEquals( + "should not contain format-version but should contain new properties", + ImmutableMap.of("key2", "val2"), + meta.properties()); } @Test public void testNoReservedPropertyForTableMetadataCreation() { - Schema schema = new Schema( - Types.NestedField.required(10, "x", Types.StringType.get()) - ); + Schema schema = new Schema(Types.NestedField.required(10, "x", Types.StringType.get())); - AssertHelpers.assertThrows("should not allow reserved table property when creating table metadata", + AssertHelpers.assertThrows( + "should not allow reserved table property when creating table metadata", IllegalArgumentException.class, "Table properties should not contain reserved properties, but got {format-version=1}", - () -> TableMetadata.newTableMetadata(schema, PartitionSpec.unpartitioned(), null, "/tmp", - ImmutableMap.of(TableProperties.FORMAT_VERSION, "1"), 1)); - - AssertHelpers.assertThrows("should not allow reserved table property when creating table metadata", + () -> + TableMetadata.newTableMetadata( + schema, + PartitionSpec.unpartitioned(), + null, + "/tmp", + ImmutableMap.of(TableProperties.FORMAT_VERSION, "1"), + 1)); + + AssertHelpers.assertThrows( + "should not allow reserved table property when creating table metadata", IllegalArgumentException.class, "Table properties should not contain reserved properties, but got {uuid=uuid}", - () -> TableMetadata.newTableMetadata(schema, PartitionSpec.unpartitioned(), null, "/tmp", - ImmutableMap.of(TableProperties.UUID, "uuid"), 1)); + () -> + TableMetadata.newTableMetadata( + schema, + PartitionSpec.unpartitioned(), + null, + "/tmp", + ImmutableMap.of(TableProperties.UUID, "uuid"), + 1)); } } diff --git a/core/src/test/java/org/apache/iceberg/TestTableMetadataSerialization.java b/core/src/test/java/org/apache/iceberg/TestTableMetadataSerialization.java index e140b469d0a7..d343586ef778 100644 --- a/core/src/test/java/org/apache/iceberg/TestTableMetadataSerialization.java +++ b/core/src/test/java/org/apache/iceberg/TestTableMetadataSerialization.java @@ -16,9 +16,10 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg; +import static org.apache.iceberg.TestHelpers.assertSameSchemaList; + import java.io.ByteArrayInputStream; import java.io.ByteArrayOutputStream; import java.io.ObjectInputStream; @@ -29,13 +30,11 @@ import org.junit.runner.RunWith; import org.junit.runners.Parameterized; -import static org.apache.iceberg.TestHelpers.assertSameSchemaList; - @RunWith(Parameterized.class) public class TestTableMetadataSerialization extends TableTestBase { @Parameterized.Parameters(name = "formatVersion = {0}") public static Object[] parameters() { - return new Object[] { 1, 2 }; + return new Object[] {1, 2}; } public TestTableMetadataSerialization(int formatVersion) { @@ -45,10 +44,7 @@ public TestTableMetadataSerialization(int formatVersion) { @Test public void testSerialization() throws Exception { // add a commit to the metadata so there is at least one snapshot, and history - table.newAppend() - .appendFile(FILE_A) - .appendFile(FILE_B) - .commit(); + table.newAppend().appendFile(FILE_A).appendFile(FILE_B).commit(); TableMetadata meta = table.ops().current(); @@ -63,21 +59,29 @@ public void testSerialization() throws Exception { result = (TableMetadata) reader.readObject(); } - Assert.assertEquals("Metadata file location should match", - meta.metadataFileLocation(), result.metadataFileLocation()); + Assert.assertEquals( + "Metadata file location should match", + meta.metadataFileLocation(), + result.metadataFileLocation()); Assert.assertEquals("UUID should match", meta.uuid(), result.uuid()); Assert.assertEquals("Location should match", meta.location(), result.location()); - Assert.assertEquals("Last updated should match", meta.lastUpdatedMillis(), result.lastUpdatedMillis()); + Assert.assertEquals( + "Last updated should match", meta.lastUpdatedMillis(), result.lastUpdatedMillis()); Assert.assertEquals("Last column id", meta.lastColumnId(), result.lastColumnId()); - Assert.assertEquals("Schema should match", meta.schema().asStruct(), result.schema().asStruct()); + Assert.assertEquals( + "Schema should match", meta.schema().asStruct(), result.schema().asStruct()); assertSameSchemaList(meta.schemas(), result.schemas()); - Assert.assertEquals("Current schema id should match", meta.currentSchemaId(), result.currentSchemaId()); + Assert.assertEquals( + "Current schema id should match", meta.currentSchemaId(), result.currentSchemaId()); Assert.assertEquals("Spec should match", meta.defaultSpecId(), result.defaultSpecId()); Assert.assertEquals("Spec list should match", meta.specs(), result.specs()); Assert.assertEquals("Properties should match", meta.properties(), result.properties()); - Assert.assertEquals("Current snapshot ID should match", - meta.currentSnapshot().snapshotId(), result.currentSnapshot().snapshotId()); - Assert.assertEquals("Snapshots should match", + Assert.assertEquals( + "Current snapshot ID should match", + meta.currentSnapshot().snapshotId(), + result.currentSnapshot().snapshotId()); + Assert.assertEquals( + "Snapshots should match", Lists.transform(meta.snapshots(), Snapshot::snapshotId), Lists.transform(result.snapshots(), Snapshot::snapshotId)); Assert.assertEquals("History should match", meta.snapshotLog(), result.snapshotLog()); diff --git a/core/src/test/java/org/apache/iceberg/TestTableUpdatePartitionSpec.java b/core/src/test/java/org/apache/iceberg/TestTableUpdatePartitionSpec.java index 1e5bd41af0a0..6c3414748a68 100644 --- a/core/src/test/java/org/apache/iceberg/TestTableUpdatePartitionSpec.java +++
b/core/src/test/java/org/apache/iceberg/TestTableUpdatePartitionSpec.java @@ -16,26 +16,24 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg; +import static org.apache.iceberg.expressions.Expressions.bucket; +import static org.apache.iceberg.expressions.Expressions.truncate; + import org.junit.Assert; import org.junit.Before; import org.junit.Test; import org.junit.runner.RunWith; import org.junit.runners.Parameterized; -import static org.apache.iceberg.expressions.Expressions.bucket; -import static org.apache.iceberg.expressions.Expressions.truncate; - @RunWith(Parameterized.class) public class TestTableUpdatePartitionSpec extends TableTestBase { @Parameterized.Parameters public static Object[][] parameters() { return new Object[][] { - new Object[] { 1 }, - new Object[] { 2 }, + new Object[] {1}, new Object[] {2}, }; } @@ -53,35 +51,41 @@ public void verifyInitialSpec() { @Test public void testCommitUpdatedSpec() { - table.updateSpec() - .addField(bucket("id", 8)) - .commit(); - - PartitionSpec evolvedSpec = PartitionSpec.builderFor(table.schema()) - .withSpecId(1) - .bucket("data", 16) - .bucket("id", 8, "id_bucket_8") - .build(); + table.updateSpec().addField(bucket("id", 8)).commit(); + + PartitionSpec evolvedSpec = + PartitionSpec.builderFor(table.schema()) + .withSpecId(1) + .bucket("data", 16) + .bucket("id", 8, "id_bucket_8") + .build(); Assert.assertEquals("Should append a partition field to the spec", evolvedSpec, table.spec()); Assert.assertEquals(1001, table.spec().lastAssignedFieldId()); - table.updateSpec() + table + .updateSpec() .removeField("id_bucket_8") .removeField("data_bucket") .addField(truncate("data", 8)) .commit(); - V1Assert.assertEquals("Should soft delete id and data buckets", PartitionSpec.builderFor(table.schema()) - .withSpecId(2) - .alwaysNull("data", "data_bucket") - .alwaysNull("id", "id_bucket_8") - .truncate("data", 8, "data_trunc_8") - .build(), table.spec()); - - V2Assert.assertEquals("Should hard delete id and data buckets", PartitionSpec.builderFor(table.schema()) - .withSpecId(2) - .add(2, 1002, "data_trunc_8", "truncate[8]") - .build(), table.spec()); + V1Assert.assertEquals( + "Should soft delete id and data buckets", + PartitionSpec.builderFor(table.schema()) + .withSpecId(2) + .alwaysNull("data", "data_bucket") + .alwaysNull("id", "id_bucket_8") + .truncate("data", 8, "data_trunc_8") + .build(), + table.spec()); + + V2Assert.assertEquals( + "Should hard delete id and data buckets", + PartitionSpec.builderFor(table.schema()) + .withSpecId(2) + .add(2, 1002, "data_trunc_8", "truncate[8]") + .build(), + table.spec()); Assert.assertEquals(1002, table.spec().lastAssignedFieldId()); } @@ -100,9 +104,7 @@ public void testNoopCommit() { Assert.assertEquals(currentVersion, updatedVersion.intValue()); // no-op commit due to no-op rename - table.updateSpec() - .renameField("data_bucket", "data_bucket") - .commit(); + table.updateSpec().renameField("data_bucket", "data_bucket").commit(); updated = table.ops().current(); updatedVersion = TestTables.metadataVersion("test"); Assert.assertEquals(current, updated); @@ -112,31 +114,35 @@ public void testNoopCommit() { @Test public void testRenameField() { - table.updateSpec() + table + .updateSpec() .renameField("data_bucket", "data_partition") .addField(bucket("id", 8)) .commit(); - PartitionSpec evolvedSpec = PartitionSpec.builderFor(table.schema()) - .withSpecId(1) - .bucket("data", 16, "data_partition") - .bucket("id", 8, "id_bucket_8") - 
.build(); + PartitionSpec evolvedSpec = + PartitionSpec.builderFor(table.schema()) + .withSpecId(1) + .bucket("data", 16, "data_partition") + .bucket("id", 8, "id_bucket_8") + .build(); Assert.assertEquals("should match evolved spec", evolvedSpec, table.spec()); Assert.assertEquals(1001, table.spec().lastAssignedFieldId()); - table.updateSpec() + table + .updateSpec() .addField(truncate("id", 4)) .renameField("data_partition", "data_bucket") .commit(); - evolvedSpec = PartitionSpec.builderFor(table.schema()) - .withSpecId(2) - .bucket("data", 16, "data_bucket") - .bucket("id", 8, "id_bucket_8") - .truncate("id", 4, "id_trunc_4") - .build(); + evolvedSpec = + PartitionSpec.builderFor(table.schema()) + .withSpecId(2) + .bucket("data", 16, "data_bucket") + .bucket("id", 8, "id_bucket_8") + .truncate("id", 4, "id_trunc_4") + .build(); Assert.assertEquals("should match evolved spec", evolvedSpec, table.spec()); Assert.assertEquals(1002, table.spec().lastAssignedFieldId()); @@ -144,14 +150,13 @@ public void testRenameField() { @Test public void testRenameOnlyEvolution() { - table.updateSpec() - .renameField("data_bucket", "data_partition") - .commit(); + table.updateSpec().renameField("data_bucket", "data_partition").commit(); - PartitionSpec evolvedSpec = PartitionSpec.builderFor(table.schema()) - .withSpecId(1) - .bucket("data", 16, "data_partition") - .build(); + PartitionSpec evolvedSpec = + PartitionSpec.builderFor(table.schema()) + .withSpecId(1) + .bucket("data", 16, "data_partition") + .build(); Assert.assertEquals("should match evolved spec", evolvedSpec, table.spec()); Assert.assertEquals(1000, table.spec().lastAssignedFieldId()); @@ -159,76 +164,88 @@ public void testRemoveAndAddField() { - table.updateSpec() - .removeField("data_bucket") - .addField(bucket("id", 8)) - .commit(); - - V1Assert.assertEquals("Should soft delete data bucket", PartitionSpec.builderFor(table.schema()) - .withSpecId(1) - .alwaysNull("data", "data_bucket") - .bucket("id", 8, "id_bucket_8") - .build(), table.spec()); - - V2Assert.assertEquals("Should hard delete data bucket", PartitionSpec.builderFor(table.schema()) - .withSpecId(1) - .add(1, 1001, "id_bucket_8", "bucket[8]") - .build(), table.spec()); + table.updateSpec().removeField("data_bucket").addField(bucket("id", 8)).commit(); + + V1Assert.assertEquals( + "Should soft delete data bucket", + PartitionSpec.builderFor(table.schema()) + .withSpecId(1) + .alwaysNull("data", "data_bucket") + .bucket("id", 8, "id_bucket_8") + .build(), + table.spec()); + + V2Assert.assertEquals( + "Should hard delete data bucket", + PartitionSpec.builderFor(table.schema()) + .withSpecId(1) + .add(1, 1001, "id_bucket_8", "bucket[8]") + .build(), + table.spec()); Assert.assertEquals(1001, table.spec().lastAssignedFieldId()); } @Test public void testAddAndRemoveField() { - table.updateSpec() - .addField(bucket("data", 6)) - .removeField("data_bucket") - .commit(); - - V1Assert.assertEquals("Should remove and then add a bucket field", PartitionSpec.builderFor(table.schema()) - .withSpecId(1) - .alwaysNull("data", "data_bucket") - .bucket("data", 6, "data_bucket_6") - .build(), table.spec()); - V2Assert.assertEquals("Should remove and then add a bucket field", PartitionSpec.builderFor(table.schema()) - .withSpecId(1) - .add(2, 1001, "data_bucket_6", "bucket[6]") - .build(), table.spec()); + table.updateSpec().addField(bucket("data", 6)).removeField("data_bucket").commit(); + + V1Assert.assertEquals( + "Should remove and then add a
bucket field", + PartitionSpec.builderFor(table.schema()) + .withSpecId(1) + .alwaysNull("data", "data_bucket") + .bucket("data", 6, "data_bucket_6") + .build(), + table.spec()); + V2Assert.assertEquals( + "Should remove and then add a bucket field", + PartitionSpec.builderFor(table.schema()) + .withSpecId(1) + .add(2, 1001, "data_bucket_6", "bucket[6]") + .build(), + table.spec()); Assert.assertEquals(1001, table.spec().lastAssignedFieldId()); } @Test public void testAddAfterLastFieldRemoved() { - table.updateSpec() - .removeField("data_bucket") - .commit(); - - V1Assert.assertEquals("Should add a new id bucket", PartitionSpec.builderFor(table.schema()) - .withSpecId(1) - .alwaysNull("data", "data_bucket") - .build(), table.spec()); - V1Assert.assertEquals("Should match the last assigned field id", - 1000, table.spec().lastAssignedFieldId()); - V2Assert.assertEquals("Should add a new id bucket", PartitionSpec.builderFor(table.schema()) - .withSpecId(1) - .build(), table.spec()); - V2Assert.assertEquals("Should match the last assigned field id", - 999, table.spec().lastAssignedFieldId()); + table.updateSpec().removeField("data_bucket").commit(); + + V1Assert.assertEquals( + "Should add a new id bucket", + PartitionSpec.builderFor(table.schema()) + .withSpecId(1) + .alwaysNull("data", "data_bucket") + .build(), + table.spec()); + V1Assert.assertEquals( + "Should match the last assigned field id", 1000, table.spec().lastAssignedFieldId()); + V2Assert.assertEquals( + "Should add a new id bucket", + PartitionSpec.builderFor(table.schema()).withSpecId(1).build(), + table.spec()); + V2Assert.assertEquals( + "Should match the last assigned field id", 999, table.spec().lastAssignedFieldId()); Assert.assertEquals(1000, table.ops().current().lastAssignedPartitionId()); - table.updateSpec() - .addField(bucket("id", 8)) - .commit(); - - V1Assert.assertEquals("Should add a new id bucket", PartitionSpec.builderFor(table.schema()) - .withSpecId(2) - .alwaysNull("data", "data_bucket") - .bucket("id", 8, "id_bucket_8") - .build(), table.spec()); - V2Assert.assertEquals("Should add a new id bucket", PartitionSpec.builderFor(table.schema()) - .withSpecId(2) - .add(1, 1001, "id_bucket_8", "bucket[8]") - .build(), table.spec()); + table.updateSpec().addField(bucket("id", 8)).commit(); + + V1Assert.assertEquals( + "Should add a new id bucket", + PartitionSpec.builderFor(table.schema()) + .withSpecId(2) + .alwaysNull("data", "data_bucket") + .bucket("id", 8, "id_bucket_8") + .build(), + table.spec()); + V2Assert.assertEquals( + "Should add a new id bucket", + PartitionSpec.builderFor(table.schema()) + .withSpecId(2) + .add(1, 1001, "id_bucket_8", "bucket[8]") + .build(), + table.spec()); Assert.assertEquals(1001, table.spec().lastAssignedFieldId()); Assert.assertEquals(1001, table.ops().current().lastAssignedPartitionId()); } diff --git a/core/src/test/java/org/apache/iceberg/TestTables.java b/core/src/test/java/org/apache/iceberg/TestTables.java index 89bff30c4eb4..15ad6b900ef7 100644 --- a/core/src/test/java/org/apache/iceberg/TestTables.java +++ b/core/src/test/java/org/apache/iceberg/TestTables.java @@ -16,9 +16,10 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg; +import static org.apache.iceberg.TableMetadata.newTableMetadata; + import java.io.File; import java.util.Map; import org.apache.iceberg.exceptions.AlreadyExistsException; @@ -33,12 +34,9 @@ import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap; import org.apache.iceberg.relocated.com.google.common.collect.Maps; -import static org.apache.iceberg.TableMetadata.newTableMetadata; - public class TestTables { - private TestTables() { - } + private TestTables() {} private static TestTable upgrade(File temp, String name, int newFormatVersion) { TestTable table = load(temp, name); @@ -48,18 +46,27 @@ private static TestTable upgrade(File temp, String name, int newFormatVersion) { return table; } - public static TestTable create(File temp, String name, Schema schema, PartitionSpec spec, int formatVersion) { + public static TestTable create( + File temp, String name, Schema schema, PartitionSpec spec, int formatVersion) { return create(temp, name, schema, spec, SortOrder.unsorted(), formatVersion); } - public static TestTable create(File temp, String name, Schema schema, PartitionSpec spec, - SortOrder sortOrder, int formatVersion) { + public static TestTable create( + File temp, + String name, + Schema schema, + PartitionSpec spec, + SortOrder sortOrder, + int formatVersion) { TestTableOperations ops = new TestTableOperations(name, temp); if (ops.current() != null) { throw new AlreadyExistsException("Table %s already exists at location: %s", name, temp); } - ops.commit(null, newTableMetadata(schema, spec, sortOrder, temp.toString(), ImmutableMap.of(), formatVersion)); + ops.commit( + null, + newTableMetadata( + schema, spec, sortOrder, temp.toString(), ImmutableMap.of(), formatVersion)); return new TestTable(ops, name); } @@ -68,30 +75,50 @@ public static Transaction beginCreate(File temp, String name, Schema schema, Par return beginCreate(temp, name, schema, spec, SortOrder.unsorted()); } - public static Transaction beginCreate(File temp, String name, Schema schema, - PartitionSpec spec, SortOrder sortOrder) { + public static Transaction beginCreate( + File temp, String name, Schema schema, PartitionSpec spec, SortOrder sortOrder) { TableOperations ops = new TestTableOperations(name, temp); if (ops.current() != null) { throw new AlreadyExistsException("Table %s already exists at location: %s", name, temp); } - TableMetadata metadata = newTableMetadata(schema, spec, sortOrder, temp.toString(), ImmutableMap.of(), 1); + TableMetadata metadata = + newTableMetadata(schema, spec, sortOrder, temp.toString(), ImmutableMap.of(), 1); return Transactions.createTableTransaction(name, ops, metadata); } - public static Transaction beginReplace(File temp, String name, Schema schema, PartitionSpec spec) { - return beginReplace(temp, name, schema, spec, SortOrder.unsorted(), ImmutableMap.of(), - new TestTableOperations(name, temp)); + public static Transaction beginReplace( + File temp, String name, Schema schema, PartitionSpec spec) { + return beginReplace( + temp, + name, + schema, + spec, + SortOrder.unsorted(), + ImmutableMap.of(), + new TestTableOperations(name, temp)); } - public static Transaction beginReplace(File temp, String name, Schema schema, PartitionSpec spec, - SortOrder sortOrder, Map properties) { - return beginReplace(temp, name, schema, spec, sortOrder, properties, new TestTableOperations(name, temp)); + public static Transaction beginReplace( + File temp, + String name, + Schema schema, + PartitionSpec spec, + SortOrder sortOrder, + Map
properties) { + return beginReplace( + temp, name, schema, spec, sortOrder, properties, new TestTableOperations(name, temp)); } - public static Transaction beginReplace(File temp, String name, Schema schema, PartitionSpec spec, - SortOrder sortOrder, Map properties, TestTableOperations ops) { + public static Transaction beginReplace( + File temp, + String name, + Schema schema, + PartitionSpec spec, + SortOrder sortOrder, + Map properties, + TestTableOperations ops) { TableMetadata current = ops.current(); TableMetadata metadata; if (current != null) { @@ -228,8 +255,8 @@ public FileIO io() { @Override public LocationProvider locationProvider() { - Preconditions.checkNotNull(current, - "Current metadata should not be null when locationProvider is called"); + Preconditions.checkNotNull( + current, "Current metadata should not be null when locationProvider is called"); return LocationProviders.locationsFor(current.location(), current.properties()); } diff --git a/core/src/test/java/org/apache/iceberg/TestTimestampPartitions.java b/core/src/test/java/org/apache/iceberg/TestTimestampPartitions.java index c6817622506b..7cf993307e3d 100644 --- a/core/src/test/java/org/apache/iceberg/TestTimestampPartitions.java +++ b/core/src/test/java/org/apache/iceberg/TestTimestampPartitions.java @@ -16,9 +16,11 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg; +import static org.apache.iceberg.types.Types.NestedField.optional; +import static org.apache.iceberg.types.Types.NestedField.required; + import java.io.File; import java.io.IOException; import org.apache.iceberg.types.Types; @@ -27,14 +29,11 @@ import org.junit.runner.RunWith; import org.junit.runners.Parameterized; -import static org.apache.iceberg.types.Types.NestedField.optional; -import static org.apache.iceberg.types.Types.NestedField.required; - @RunWith(Parameterized.class) public class TestTimestampPartitions extends TableTestBase { @Parameterized.Parameters(name = "formatVersion = {0}") public static Object[] parameters() { - return new Object[] { 1, 2 }; + return new Object[] {1, 2}; } public TestTimestampPartitions(int formatVersion) { @@ -43,34 +42,34 @@ public TestTimestampPartitions(int formatVersion) { @Test public void testPartitionAppend() throws IOException { - Schema dateSchema = new Schema( - required(1, "id", Types.LongType.get()), - optional(2, "timestamp", Types.TimestampType.withoutZone()) - ); + Schema dateSchema = + new Schema( + required(1, "id", Types.LongType.get()), + optional(2, "timestamp", Types.TimestampType.withoutZone())); - PartitionSpec partitionSpec = PartitionSpec - .builderFor(dateSchema) - .day("timestamp", "date") - .build(); + PartitionSpec partitionSpec = + PartitionSpec.builderFor(dateSchema).day("timestamp", "date").build(); - DataFile dataFile = DataFiles.builder(partitionSpec) - .withPath("/path/to/data-1.parquet") - .withFileSizeInBytes(0) - .withRecordCount(0) - .withPartitionPath("date=2018-06-08") - .build(); + DataFile dataFile = + DataFiles.builder(partitionSpec) + .withPath("/path/to/data-1.parquet") + .withFileSizeInBytes(0) + .withRecordCount(0) + .withPartitionPath("date=2018-06-08") + .build(); File tableDir = temp.newFolder(); Assert.assertTrue(tableDir.delete()); - this.table = TestTables.create(tableDir, "test_date_partition", dateSchema, partitionSpec, formatVersion); + this.table = + TestTables.create( + tableDir, "test_date_partition", dateSchema, partitionSpec, formatVersion); - table.newAppend() - .appendFile(dataFile) - 
.commit(); + table.newAppend().appendFile(dataFile).commit(); long id = table.currentSnapshot().snapshotId(); Assert.assertEquals(table.currentSnapshot().allManifests(table.io()).size(), 1); - validateManifestEntries(table.currentSnapshot().allManifests(table.io()).get(0), + validateManifestEntries( + table.currentSnapshot().allManifests(table.io()).get(0), ids(id), files(dataFile), statuses(ManifestEntry.Status.ADDED)); diff --git a/core/src/test/java/org/apache/iceberg/TestTransaction.java b/core/src/test/java/org/apache/iceberg/TestTransaction.java index 6c0fd69312bf..0e1a4f7a26db 100644 --- a/core/src/test/java/org/apache/iceberg/TestTransaction.java +++ b/core/src/test/java/org/apache/iceberg/TestTransaction.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg; import java.io.File; @@ -40,7 +39,7 @@ public class TestTransaction extends TableTestBase { @Parameterized.Parameters(name = "formatVersion = {0}") public static Object[] parameters() { - return new Object[] { 1, 2 }; + return new Object[] {1, 2}; } public TestTransaction(int formatVersion) { @@ -56,8 +55,8 @@ public void testEmptyTransaction() { Transaction txn = table.newTransaction(); txn.commitTransaction(); - Assert.assertSame("Base metadata should not change when commit is created", - base, readMetadata()); + Assert.assertSame( + "Base metadata should not change when commit is created", base, readMetadata()); Assert.assertEquals("Table should be on version 0", 0, (int) version()); } @@ -69,17 +68,14 @@ public void testSingleOperationTransaction() { Transaction txn = table.newTransaction(); - Assert.assertSame("Base metadata should not change when commit is created", - base, readMetadata()); + Assert.assertSame( + "Base metadata should not change when commit is created", base, readMetadata()); Assert.assertEquals("Table should be on version 0 after txn create", 0, (int) version()); - txn.newAppend() - .appendFile(FILE_A) - .appendFile(FILE_B) - .commit(); + txn.newAppend().appendFile(FILE_A).appendFile(FILE_B).commit(); - Assert.assertSame("Base metadata should not change when an append is committed", - base, readMetadata()); + Assert.assertSame( + "Base metadata should not change when an append is committed", base, readMetadata()); Assert.assertEquals("Table should be on version 0 after append", 0, (int) version()); txn.commitTransaction(); @@ -96,47 +92,50 @@ public void testMultipleOperationTransaction() { Transaction txn = table.newTransaction(); - Assert.assertSame("Base metadata should not change when commit is created", - base, readMetadata()); + Assert.assertSame( + "Base metadata should not change when commit is created", base, readMetadata()); Assert.assertEquals("Table should be on version 0 after txn create", 0, (int) version()); - txn.newAppend() - .appendFile(FILE_A) - .appendFile(FILE_B) - .commit(); + txn.newAppend().appendFile(FILE_A).appendFile(FILE_B).commit(); - Assert.assertSame("Base metadata should not change when commit is created", - base, readMetadata()); + Assert.assertSame( + "Base metadata should not change when commit is created", base, readMetadata()); Assert.assertEquals("Table should be on version 0 after txn create", 0, (int) version()); Snapshot appendSnapshot = txn.table().currentSnapshot(); - txn.newDelete() - .deleteFile(FILE_A) - .commit(); + txn.newDelete().deleteFile(FILE_A).commit(); Snapshot deleteSnapshot = txn.table().currentSnapshot(); - Assert.assertSame("Base metadata should not change when an 
append is committed", - base, readMetadata()); + Assert.assertSame( + "Base metadata should not change when an append is committed", base, readMetadata()); Assert.assertEquals("Table should be on version 0 after append", 0, (int) version()); txn.commitTransaction(); Assert.assertEquals("Table should be on version 1 after commit", 1, (int) version()); - Assert.assertEquals("Table should have one manifest after commit", - 1, readMetadata().currentSnapshot().allManifests(table.io()).size()); - Assert.assertEquals("Table snapshot should be the delete snapshot", - deleteSnapshot, readMetadata().currentSnapshot()); - validateManifestEntries(readMetadata().currentSnapshot().allManifests(table.io()).get(0), + Assert.assertEquals( + "Table should have one manifest after commit", + 1, + readMetadata().currentSnapshot().allManifests(table.io()).size()); + Assert.assertEquals( + "Table snapshot should be the delete snapshot", + deleteSnapshot, + readMetadata().currentSnapshot()); + validateManifestEntries( + readMetadata().currentSnapshot().allManifests(table.io()).get(0), ids(deleteSnapshot.snapshotId(), appendSnapshot.snapshotId()), - files(FILE_A, FILE_B), statuses(Status.DELETED, Status.EXISTING)); + files(FILE_A, FILE_B), + statuses(Status.DELETED, Status.EXISTING)); - Assert.assertEquals("Table should have a snapshot for each operation", - 2, readMetadata().snapshots().size()); - validateManifestEntries(readMetadata().snapshots().get(0).allManifests(table.io()).get(0), + Assert.assertEquals( + "Table should have a snapshot for each operation", 2, readMetadata().snapshots().size()); + validateManifestEntries( + readMetadata().snapshots().get(0).allManifests(table.io()).get(0), ids(appendSnapshot.snapshotId(), appendSnapshot.snapshotId()), - files(FILE_A, FILE_B), statuses(Status.ADDED, Status.ADDED)); + files(FILE_A, FILE_B), + statuses(Status.ADDED, Status.ADDED)); } @Test @@ -147,47 +146,50 @@ public void testMultipleOperationTransactionFromTable() { Transaction txn = table.newTransaction(); - Assert.assertSame("Base metadata should not change when commit is created", - base, readMetadata()); + Assert.assertSame( + "Base metadata should not change when commit is created", base, readMetadata()); Assert.assertEquals("Table should be on version 0 after txn create", 0, (int) version()); - txn.newAppend() - .appendFile(FILE_A) - .appendFile(FILE_B) - .commit(); + txn.newAppend().appendFile(FILE_A).appendFile(FILE_B).commit(); - Assert.assertSame("Base metadata should not change when commit is created", - base, readMetadata()); + Assert.assertSame( + "Base metadata should not change when commit is created", base, readMetadata()); Assert.assertEquals("Table should be on version 0 after txn create", 0, (int) version()); Snapshot appendSnapshot = txn.table().currentSnapshot(); - txn.table().newDelete() - .deleteFile(FILE_A) - .commit(); + txn.table().newDelete().deleteFile(FILE_A).commit(); Snapshot deleteSnapshot = txn.table().currentSnapshot(); - Assert.assertSame("Base metadata should not change when an append is committed", - base, readMetadata()); + Assert.assertSame( + "Base metadata should not change when an append is committed", base, readMetadata()); Assert.assertEquals("Table should be on version 0 after append", 0, (int) version()); txn.commitTransaction(); Assert.assertEquals("Table should be on version 1 after commit", 1, (int) version()); - Assert.assertEquals("Table should have one manifest after commit", - 1, readMetadata().currentSnapshot().allManifests(table.io()).size()); - 
Assert.assertEquals("Table snapshot should be the delete snapshot", - deleteSnapshot, readMetadata().currentSnapshot()); - validateManifestEntries(readMetadata().currentSnapshot().allManifests(table.io()).get(0), + Assert.assertEquals( + "Table should have one manifest after commit", + 1, + readMetadata().currentSnapshot().allManifests(table.io()).size()); + Assert.assertEquals( + "Table snapshot should be the delete snapshot", + deleteSnapshot, + readMetadata().currentSnapshot()); + validateManifestEntries( + readMetadata().currentSnapshot().allManifests(table.io()).get(0), ids(deleteSnapshot.snapshotId(), appendSnapshot.snapshotId()), - files(FILE_A, FILE_B), statuses(Status.DELETED, Status.EXISTING)); + files(FILE_A, FILE_B), + statuses(Status.DELETED, Status.EXISTING)); - Assert.assertEquals("Table should have a snapshot for each operation", - 2, readMetadata().snapshots().size()); - validateManifestEntries(readMetadata().snapshots().get(0).allManifests(table.io()).get(0), + Assert.assertEquals( + "Table should have a snapshot for each operation", 2, readMetadata().snapshots().size()); + validateManifestEntries( + readMetadata().snapshots().get(0).allManifests(table.io()).get(0), ids(appendSnapshot.snapshotId(), appendSnapshot.snapshotId()), - files(FILE_A, FILE_B), statuses(Status.ADDED, Status.ADDED)); + files(FILE_A, FILE_B), + statuses(Status.ADDED, Status.ADDED)); } @Test @@ -198,17 +200,18 @@ public void testDetectsUncommittedChange() { Transaction txn = table.newTransaction(); - Assert.assertSame("Base metadata should not change when commit is created", - base, readMetadata()); + Assert.assertSame( + "Base metadata should not change when commit is created", base, readMetadata()); Assert.assertEquals("Table should be on version 0 after txn create", 0, (int) version()); txn.newAppend().appendFile(FILE_A).appendFile(FILE_B); // not committed - Assert.assertSame("Base metadata should not change when commit is created", - base, readMetadata()); + Assert.assertSame( + "Base metadata should not change when commit is created", base, readMetadata()); Assert.assertEquals("Table should be on version 0 after txn create", 0, (int) version()); - AssertHelpers.assertThrows("Should reject commit when last operation has not committed", + AssertHelpers.assertThrows( + "Should reject commit when last operation has not committed", IllegalStateException.class, "Cannot create new DeleteFiles: last operation has not committed", txn::newDelete); @@ -222,17 +225,18 @@ public void testDetectsUncommittedChangeOnCommit() { Transaction txn = table.newTransaction(); - Assert.assertSame("Base metadata should not change when commit is created", - base, readMetadata()); + Assert.assertSame( + "Base metadata should not change when commit is created", base, readMetadata()); Assert.assertEquals("Table should be on version 0 after txn create", 0, (int) version()); txn.newAppend().appendFile(FILE_A).appendFile(FILE_B); // not committed - Assert.assertSame("Base metadata should not change when commit is created", - base, readMetadata()); + Assert.assertSame( + "Base metadata should not change when commit is created", base, readMetadata()); Assert.assertEquals("Table should be on version 0 after txn create", 0, (int) version()); - AssertHelpers.assertThrows("Should reject commit when last operation has not committed", + AssertHelpers.assertThrows( + "Should reject commit when last operation has not committed", IllegalStateException.class, "Cannot commit transaction: last operation has not committed", 
txn::commitTransaction); @@ -241,9 +245,7 @@ public void testDetectsUncommittedChangeOnCommit() { @Test public void testTransactionConflict() { // set retries to 0 to catch the failure - table.updateProperties() - .set(TableProperties.COMMIT_NUM_RETRIES, "0") - .commit(); + table.updateProperties().set(TableProperties.COMMIT_NUM_RETRIES, "0").commit(); Assert.assertEquals("Table should be on version 1", 1, (int) version()); @@ -251,32 +253,30 @@ public void testTransactionConflict() { Transaction txn = table.newTransaction(); - Assert.assertSame("Base metadata should not change when commit is created", - base, readMetadata()); + Assert.assertSame( + "Base metadata should not change when commit is created", base, readMetadata()); Assert.assertEquals("Table should be on version 1 after txn create", 1, (int) version()); - txn.newAppend() - .appendFile(FILE_A) - .appendFile(FILE_B) - .commit(); + txn.newAppend().appendFile(FILE_A).appendFile(FILE_B).commit(); - Assert.assertSame("Base metadata should not change when commit is created", - base, readMetadata()); + Assert.assertSame( + "Base metadata should not change when commit is created", base, readMetadata()); Assert.assertEquals("Table should be on version 1 after append", 1, (int) version()); // cause the transaction commit to fail table.ops().failCommits(1); - AssertHelpers.assertThrows("Transaction commit should fail", - CommitFailedException.class, "Injected failure", txn::commitTransaction); + AssertHelpers.assertThrows( + "Transaction commit should fail", + CommitFailedException.class, + "Injected failure", + txn::commitTransaction); } @Test public void testTransactionRetry() { // use only one retry - table.updateProperties() - .set(TableProperties.COMMIT_NUM_RETRIES, "1") - .commit(); + table.updateProperties().set(TableProperties.COMMIT_NUM_RETRIES, "1").commit(); Assert.assertEquals("Table should be on version 1", 1, (int) version()); @@ -284,19 +284,17 @@ public void testTransactionRetry() { Transaction txn = table.newTransaction(); - Assert.assertSame("Base metadata should not change when commit is created", - base, readMetadata()); + Assert.assertSame( + "Base metadata should not change when commit is created", base, readMetadata()); Assert.assertEquals("Table should be on version 1 after txn create", 1, (int) version()); - txn.newAppend() - .appendFile(FILE_A) - .appendFile(FILE_B) - .commit(); + txn.newAppend().appendFile(FILE_A).appendFile(FILE_B).commit(); - Set appendManifests = Sets.newHashSet(txn.table().currentSnapshot().allManifests(table.io())); + Set appendManifests = + Sets.newHashSet(txn.table().currentSnapshot().allManifests(table.io())); - Assert.assertSame("Base metadata should not change when commit is created", - base, readMetadata()); + Assert.assertSame( + "Base metadata should not change when commit is created", base, readMetadata()); Assert.assertEquals("Table should be on version 1 after append", 1, (int) version()); // cause the transaction commit to fail @@ -306,16 +304,16 @@ public void testTransactionRetry() { Assert.assertEquals("Table should be on version 2 after commit", 2, (int) version()); - Assert.assertEquals("Should reuse manifests from initial append commit", - appendManifests, Sets.newHashSet(table.currentSnapshot().allManifests(table.io()))); + Assert.assertEquals( + "Should reuse manifests from initial append commit", + appendManifests, + Sets.newHashSet(table.currentSnapshot().allManifests(table.io()))); } @Test public void testTransactionRetryMergeAppend() { // use only one retry - 
table.updateProperties() - .set(TableProperties.COMMIT_NUM_RETRIES, "1") - .commit(); + table.updateProperties().set(TableProperties.COMMIT_NUM_RETRIES, "1").commit(); Assert.assertEquals("Table should be on version 1", 1, (int) version()); @@ -323,30 +321,26 @@ public void testTransactionRetryMergeAppend() { Transaction txn = table.newTransaction(); - Assert.assertSame("Base metadata should not change when commit is created", - base, readMetadata()); + Assert.assertSame( + "Base metadata should not change when commit is created", base, readMetadata()); Assert.assertEquals("Table should be on version 1 after txn create", 1, (int) version()); - txn.newAppend() - .appendFile(FILE_A) - .appendFile(FILE_B) - .commit(); + txn.newAppend().appendFile(FILE_A).appendFile(FILE_B).commit(); - Set appendManifests = Sets.newHashSet(txn.table().currentSnapshot().allManifests(table.io())); + Set appendManifests = + Sets.newHashSet(txn.table().currentSnapshot().allManifests(table.io())); - Assert.assertSame("Base metadata should not change when commit is created", - base, readMetadata()); + Assert.assertSame( + "Base metadata should not change when commit is created", base, readMetadata()); Assert.assertEquals("Table should be on version 1 after append", 1, (int) version()); // cause the transaction commit to fail - table.newAppend() - .appendFile(FILE_C) - .appendFile(FILE_D) - .commit(); + table.newAppend().appendFile(FILE_C).appendFile(FILE_D).commit(); Assert.assertEquals("Table should be on version 2 after real append", 2, (int) version()); - Set conflictAppendManifests = Sets.newHashSet(table.currentSnapshot().allManifests(table.io())); + Set conflictAppendManifests = + Sets.newHashSet(table.currentSnapshot().allManifests(table.io())); txn.commitTransaction(); @@ -356,14 +350,17 @@ public void testTransactionRetryMergeAppend() { expectedManifests.addAll(appendManifests); expectedManifests.addAll(conflictAppendManifests); - Assert.assertEquals("Should reuse manifests from initial append commit and conflicting append", - expectedManifests, Sets.newHashSet(table.currentSnapshot().allManifests(table.io()))); + Assert.assertEquals( + "Should reuse manifests from initial append commit and conflicting append", + expectedManifests, + Sets.newHashSet(table.currentSnapshot().allManifests(table.io()))); } @Test public void testMultipleUpdateTransactionRetryMergeCleanup() { // use only one retry and aggressively merge manifests - table.updateProperties() + table + .updateProperties() .set(TableProperties.COMMIT_NUM_RETRIES, "1") .set(TableProperties.MANIFEST_MIN_MERGE_COUNT, "0") .commit(); @@ -374,36 +371,31 @@ public void testMultipleUpdateTransactionRetryMergeCleanup() { Transaction txn = table.newTransaction(); - Assert.assertSame("Base metadata should not change when commit is created", - base, readMetadata()); + Assert.assertSame( + "Base metadata should not change when commit is created", base, readMetadata()); Assert.assertEquals("Table should be on version 1 after txn create", 1, (int) version()); - txn.updateProperties() - .set("test-property", "test-value") - .commit(); + txn.updateProperties().set("test-property", "test-value").commit(); - txn.newAppend() - .appendFile(FILE_A) - .appendFile(FILE_B) - .commit(); + txn.newAppend().appendFile(FILE_A).appendFile(FILE_B).commit(); - Assert.assertEquals("Append should create one manifest", - 1, txn.table().currentSnapshot().allManifests(table.io()).size()); + Assert.assertEquals( + "Append should create one manifest", + 1, + 
txn.table().currentSnapshot().allManifests(table.io()).size()); ManifestFile appendManifest = txn.table().currentSnapshot().allManifests(table.io()).get(0); - Assert.assertSame("Base metadata should not change when commit is created", - base, readMetadata()); + Assert.assertSame( + "Base metadata should not change when commit is created", base, readMetadata()); Assert.assertEquals("Table should be on version 1 after append", 1, (int) version()); // cause the transaction commit to fail - table.newAppend() - .appendFile(FILE_C) - .appendFile(FILE_D) - .commit(); + table.newAppend().appendFile(FILE_C).appendFile(FILE_D).commit(); Assert.assertEquals("Table should be on version 2 after real append", 2, (int) version()); - Set conflictAppendManifests = Sets.newHashSet(table.currentSnapshot().allManifests(table.io())); + Set conflictAppendManifests = + Sets.newHashSet(table.currentSnapshot().allManifests(table.io())); txn.commitTransaction(); @@ -413,49 +405,52 @@ public void testMultipleUpdateTransactionRetryMergeCleanup() { previousManifests.add(appendManifest); previousManifests.addAll(conflictAppendManifests); - Assert.assertEquals("Should merge both commit manifests into a single manifest", - 1, table.currentSnapshot().allManifests(table.io()).size()); - Assert.assertFalse("Should merge both commit manifests into a new manifest", + Assert.assertEquals( + "Should merge both commit manifests into a single manifest", + 1, + table.currentSnapshot().allManifests(table.io()).size()); + Assert.assertFalse( + "Should merge both commit manifests into a new manifest", previousManifests.contains(table.currentSnapshot().allManifests(table.io()).get(0))); - Assert.assertFalse("Append manifest should be deleted", new File(appendManifest.path()).exists()); + Assert.assertFalse( + "Append manifest should be deleted", new File(appendManifest.path()).exists()); } @Test public void testTransactionRetrySchemaUpdate() { // use only one retry - table.updateProperties() - .set(TableProperties.COMMIT_NUM_RETRIES, "1") - .commit(); + table.updateProperties().set(TableProperties.COMMIT_NUM_RETRIES, "1").commit(); // start a transaction Transaction txn = table.newTransaction(); // add column "new-column" - txn.updateSchema() - .addColumn("new-column", Types.IntegerType.get()) - .commit(); + txn.updateSchema().addColumn("new-column", Types.IntegerType.get()).commit(); int schemaId = txn.table().schema().schemaId(); - // directly update the table for adding "another-column" (which causes in-progress txn commit fail) - table.updateSchema() - .addColumn("another-column", Types.IntegerType.get()) - .commit(); + // directly update the table for adding "another-column" (which causes in-progress txn commit + // fail) + table.updateSchema().addColumn("another-column", Types.IntegerType.get()).commit(); int conflictingSchemaId = table.schema().schemaId(); - Assert.assertEquals("Both schema IDs should be the same in order to cause a conflict", + Assert.assertEquals( + "Both schema IDs should be the same in order to cause a conflict", conflictingSchemaId, schemaId); // commit the transaction for adding "new-column" - AssertHelpers.assertThrows("Should fail due to conflicting transaction even after retry", - CommitFailedException.class, "Table metadata refresh is required", + AssertHelpers.assertThrows( + "Should fail due to conflicting transaction even after retry", + CommitFailedException.class, + "Table metadata refresh is required", txn::commitTransaction); } @Test public void testTransactionRetryMergeCleanup() { // use only 
one retry and aggressively merge manifests - table.updateProperties() + table + .updateProperties() .set(TableProperties.COMMIT_NUM_RETRIES, "1") .set(TableProperties.MANIFEST_MIN_MERGE_COUNT, "0") .commit(); @@ -466,32 +461,29 @@ public void testTransactionRetryMergeCleanup() { Transaction txn = table.newTransaction(); - Assert.assertSame("Base metadata should not change when commit is created", - base, readMetadata()); + Assert.assertSame( + "Base metadata should not change when commit is created", base, readMetadata()); Assert.assertEquals("Table should be on version 1 after txn create", 1, (int) version()); - txn.newAppend() - .appendFile(FILE_A) - .appendFile(FILE_B) - .commit(); + txn.newAppend().appendFile(FILE_A).appendFile(FILE_B).commit(); - Assert.assertEquals("Append should create one manifest", - 1, txn.table().currentSnapshot().allManifests(table.io()).size()); + Assert.assertEquals( + "Append should create one manifest", + 1, + txn.table().currentSnapshot().allManifests(table.io()).size()); ManifestFile appendManifest = txn.table().currentSnapshot().allManifests(table.io()).get(0); - Assert.assertSame("Base metadata should not change when commit is created", - base, readMetadata()); + Assert.assertSame( + "Base metadata should not change when commit is created", base, readMetadata()); Assert.assertEquals("Table should be on version 1 after append", 1, (int) version()); // cause the transaction commit to fail - table.newAppend() - .appendFile(FILE_C) - .appendFile(FILE_D) - .commit(); + table.newAppend().appendFile(FILE_C).appendFile(FILE_D).commit(); Assert.assertEquals("Table should be on version 2 after real append", 2, (int) version()); - Set conflictAppendManifests = Sets.newHashSet(table.currentSnapshot().allManifests(table.io())); + Set conflictAppendManifests = + Sets.newHashSet(table.currentSnapshot().allManifests(table.io())); txn.commitTransaction(); @@ -501,38 +493,43 @@ public void testTransactionRetryMergeCleanup() { previousManifests.add(appendManifest); previousManifests.addAll(conflictAppendManifests); - Assert.assertEquals("Should merge both commit manifests into a single manifest", - 1, table.currentSnapshot().allManifests(table.io()).size()); - Assert.assertFalse("Should merge both commit manifests into a new manifest", + Assert.assertEquals( + "Should merge both commit manifests into a single manifest", + 1, + table.currentSnapshot().allManifests(table.io()).size()); + Assert.assertFalse( + "Should merge both commit manifests into a new manifest", previousManifests.contains(table.currentSnapshot().allManifests(table.io()).get(0))); - Assert.assertFalse("Append manifest should be deleted", new File(appendManifest.path()).exists()); + Assert.assertFalse( + "Append manifest should be deleted", new File(appendManifest.path()).exists()); } @Test public void testTransactionRetryAndAppendManifests() throws Exception { // use only one retry and aggressively merge manifests - table.updateProperties() + table + .updateProperties() .set(TableProperties.COMMIT_NUM_RETRIES, "1") .set(TableProperties.MANIFEST_MIN_MERGE_COUNT, "0") .commit(); Assert.assertEquals("Table should be on version 1", 1, (int) version()); - table.newAppend() - .appendFile(FILE_A) - .appendFile(FILE_B) - .commit(); + table.newAppend().appendFile(FILE_A).appendFile(FILE_B).commit(); Assert.assertEquals("Table should be on version 2 after append", 2, (int) version()); - Assert.assertEquals("Append should create one manifest", - 1, table.currentSnapshot().allManifests(table.io()).size()); + 
Assert.assertEquals( + "Append should create one manifest", + 1, + table.currentSnapshot().allManifests(table.io()).size()); ManifestFile v1manifest = table.currentSnapshot().allManifests(table.io()).get(0); TableMetadata base = readMetadata(); // create a manifest append - OutputFile manifestLocation = Files.localOutput("/tmp/" + UUID.randomUUID().toString() + ".avro"); + OutputFile manifestLocation = + Files.localOutput("/tmp/" + UUID.randomUUID().toString() + ".avro"); ManifestWriter writer = ManifestFiles.write(table.spec(), manifestLocation); try { writer.add(FILE_D); @@ -542,30 +539,35 @@ public void testTransactionRetryAndAppendManifests() throws Exception { Transaction txn = table.newTransaction(); - txn.newAppend() - .appendManifest(writer.toManifestFile()) - .commit(); + txn.newAppend().appendManifest(writer.toManifestFile()).commit(); - Assert.assertSame("Base metadata should not change when commit is created", base, readMetadata()); + Assert.assertSame( + "Base metadata should not change when commit is created", base, readMetadata()); Assert.assertEquals("Table should be on version 2 after txn create", 2, (int) version()); - Assert.assertEquals("Append should have one merged manifest", - 1, txn.table().currentSnapshot().allManifests(table.io()).size()); + Assert.assertEquals( + "Append should have one merged manifest", + 1, + txn.table().currentSnapshot().allManifests(table.io()).size()); ManifestFile mergedManifest = txn.table().currentSnapshot().allManifests(table.io()).get(0); // find the initial copy of the appended manifest - String copiedAppendManifest = Iterables.getOnlyElement(Iterables.filter( - Iterables.transform(listManifestFiles(), File::getPath), - path -> !v1manifest.path().contains(path) && !mergedManifest.path().contains(path))); - - Assert.assertTrue("Transaction should hijack the delete of the original copied manifest", + String copiedAppendManifest = + Iterables.getOnlyElement( + Iterables.filter( + Iterables.transform(listManifestFiles(), File::getPath), + path -> + !v1manifest.path().contains(path) && !mergedManifest.path().contains(path))); + + Assert.assertTrue( + "Transaction should hijack the delete of the original copied manifest", ((BaseTransaction) txn).deletedFiles().contains(copiedAppendManifest)); - Assert.assertTrue("Copied append manifest should not be deleted yet", new File(copiedAppendManifest).exists()); + Assert.assertTrue( + "Copied append manifest should not be deleted yet", + new File(copiedAppendManifest).exists()); // cause the transaction commit to fail and retry - table.newAppend() - .appendFile(FILE_C) - .commit(); + table.newAppend().appendFile(FILE_C).commit(); Assert.assertEquals("Table should be on version 3 after real append", 3, (int) version()); @@ -573,21 +575,28 @@ public void testTransactionRetryAndAppendManifests() throws Exception { Assert.assertEquals("Table should be on version 4 after commit", 4, (int) version()); - Assert.assertTrue("Transaction should hijack the delete of the original copied manifest", + Assert.assertTrue( + "Transaction should hijack the delete of the original copied manifest", ((BaseTransaction) txn).deletedFiles().contains(copiedAppendManifest)); - Assert.assertFalse("Append manifest should be deleted", new File(copiedAppendManifest).exists()); - Assert.assertTrue("Transaction should hijack the delete of the first merged manifest", + Assert.assertFalse( + "Append manifest should be deleted", new File(copiedAppendManifest).exists()); + Assert.assertTrue( + "Transaction should hijack the delete of 
the first merged manifest", ((BaseTransaction) txn).deletedFiles().contains(mergedManifest.path())); - Assert.assertFalse("Append manifest should be deleted", new File(mergedManifest.path()).exists()); + Assert.assertFalse( + "Append manifest should be deleted", new File(mergedManifest.path()).exists()); - Assert.assertEquals("Should merge all commit manifests into a single manifest", - 1, table.currentSnapshot().allManifests(table.io()).size()); + Assert.assertEquals( + "Should merge all commit manifests into a single manifest", + 1, + table.currentSnapshot().allManifests(table.io()).size()); } @Test public void testTransactionRetryAndAppendManifestsWithSnapshotIdInheritance() throws Exception { // use only one retry and aggressively merge manifests - table.updateProperties() + table + .updateProperties() .set(TableProperties.COMMIT_NUM_RETRIES, "1") .set(TableProperties.MANIFEST_MIN_MERGE_COUNT, "0") .set(TableProperties.SNAPSHOT_ID_INHERITANCE_ENABLED, "true") @@ -595,35 +604,33 @@ public void testTransactionRetryAndAppendManifestsWithSnapshotIdInheritance() th Assert.assertEquals("Table should be on version 1", 1, (int) version()); - table.newAppend() - .appendFile(FILE_A) - .appendFile(FILE_B) - .commit(); + table.newAppend().appendFile(FILE_A).appendFile(FILE_B).commit(); Assert.assertEquals("Table should be on version 2 after append", 2, (int) version()); - Assert.assertEquals("Append should create one manifest", - 1, table.currentSnapshot().allManifests(table.io()).size()); + Assert.assertEquals( + "Append should create one manifest", + 1, + table.currentSnapshot().allManifests(table.io()).size()); TableMetadata base = readMetadata(); Transaction txn = table.newTransaction(); ManifestFile appendManifest = writeManifestWithName("input.m0", FILE_D); - txn.newAppend() - .appendManifest(appendManifest) - .commit(); + txn.newAppend().appendManifest(appendManifest).commit(); - Assert.assertSame("Base metadata should not change when commit is created", base, readMetadata()); + Assert.assertSame( + "Base metadata should not change when commit is created", base, readMetadata()); Assert.assertEquals("Table should be on version 2 after txn create", 2, (int) version()); - Assert.assertEquals("Append should have one merged manifest", - 1, txn.table().currentSnapshot().allManifests(table.io()).size()); + Assert.assertEquals( + "Append should have one merged manifest", + 1, + txn.table().currentSnapshot().allManifests(table.io()).size()); ManifestFile mergedManifest = txn.table().currentSnapshot().allManifests(table.io()).get(0); // cause the transaction commit to fail and retry - table.newAppend() - .appendFile(FILE_C) - .commit(); + table.newAppend().appendFile(FILE_C).commit(); Assert.assertEquals("Table should be on version 3 after real append", 3, (int) version()); @@ -631,44 +638,48 @@ public void testTransactionRetryAndAppendManifestsWithSnapshotIdInheritance() th Assert.assertEquals("Table should be on version 4 after commit", 4, (int) version()); - Assert.assertTrue("Transaction should hijack the delete of the original append manifest", + Assert.assertTrue( + "Transaction should hijack the delete of the original append manifest", ((BaseTransaction) txn).deletedFiles().contains(appendManifest.path())); - Assert.assertFalse("Append manifest should be deleted", new File(appendManifest.path()).exists()); + Assert.assertFalse( + "Append manifest should be deleted", new File(appendManifest.path()).exists()); - Assert.assertTrue("Transaction should hijack the delete of the first merged 
manifest", + Assert.assertTrue( + "Transaction should hijack the delete of the first merged manifest", ((BaseTransaction) txn).deletedFiles().contains(mergedManifest.path())); - Assert.assertFalse("Merged append manifest should be deleted", new File(mergedManifest.path()).exists()); + Assert.assertFalse( + "Merged append manifest should be deleted", new File(mergedManifest.path()).exists()); - Assert.assertEquals("Should merge all commit manifests into a single manifest", - 1, table.currentSnapshot().allManifests(table.io()).size()); + Assert.assertEquals( + "Should merge all commit manifests into a single manifest", + 1, + table.currentSnapshot().allManifests(table.io()).size()); } @Test public void testTransactionNoCustomDeleteFunc() { - AssertHelpers.assertThrows("Should fail setting a custom delete function with a transaction", - IllegalArgumentException.class, "Cannot set delete callback more than once", - () -> table.newTransaction() - .newAppend() - .appendFile(FILE_A) - .appendFile(FILE_B) - .deleteWith(file -> { })); + AssertHelpers.assertThrows( + "Should fail setting a custom delete function with a transaction", + IllegalArgumentException.class, + "Cannot set delete callback more than once", + () -> + table + .newTransaction() + .newAppend() + .appendFile(FILE_A) + .appendFile(FILE_B) + .deleteWith(file -> {})); } @Test public void testTransactionFastAppends() { - table.updateProperties() - .set(TableProperties.MANIFEST_MIN_MERGE_COUNT, "0") - .commit(); + table.updateProperties().set(TableProperties.MANIFEST_MIN_MERGE_COUNT, "0").commit(); Transaction txn = table.newTransaction(); - txn.newFastAppend() - .appendFile(FILE_A) - .commit(); + txn.newFastAppend().appendFile(FILE_A).commit(); - txn.newFastAppend() - .appendFile(FILE_B) - .commit(); + txn.newFastAppend().appendFile(FILE_B).commit(); txn.commitTransaction(); @@ -680,28 +691,26 @@ public void testTransactionFastAppends() { public void testTransactionRewriteManifestsAppendedDirectly() throws IOException { Table table = load(); - table.updateProperties() + table + .updateProperties() .set(TableProperties.SNAPSHOT_ID_INHERITANCE_ENABLED, "true") .set(TableProperties.MANIFEST_MIN_MERGE_COUNT, "0") .commit(); - table.newFastAppend() - .appendFile(FILE_A) - .commit(); + table.newFastAppend().appendFile(FILE_A).commit(); long firstSnapshotId = table.currentSnapshot().snapshotId(); - table.newFastAppend() - .appendFile(FILE_B) - .commit(); + table.newFastAppend().appendFile(FILE_B).commit(); long secondSnapshotId = table.currentSnapshot().snapshotId(); List manifests = table.currentSnapshot().allManifests(table.io()); Assert.assertEquals("Should have 2 manifests after 2 appends", 2, manifests.size()); - ManifestFile newManifest = writeManifest( - "manifest-file-1.avro", - manifestEntry(ManifestEntry.Status.EXISTING, firstSnapshotId, FILE_A), - manifestEntry(ManifestEntry.Status.EXISTING, secondSnapshotId, FILE_B)); + ManifestFile newManifest = + writeManifest( + "manifest-file-1.avro", + manifestEntry(ManifestEntry.Status.EXISTING, firstSnapshotId, FILE_A), + manifestEntry(ManifestEntry.Status.EXISTING, secondSnapshotId, FILE_B)); Transaction txn = table.newTransaction(); txn.rewriteManifests() @@ -709,30 +718,31 @@ public void testTransactionRewriteManifestsAppendedDirectly() throws IOException .deleteManifest(manifests.get(1)) .addManifest(newManifest) .commit(); - txn.newAppend() - .appendFile(FILE_C) - .commit(); + txn.newAppend().appendFile(FILE_C).commit(); txn.commitTransaction(); long finalSnapshotId = 
table.currentSnapshot().snapshotId(); long finalSnapshotTimestamp = System.currentTimeMillis(); - Assert.assertTrue("Append manifest should not be deleted", new File(newManifest.path()).exists()); + Assert.assertTrue( + "Append manifest should not be deleted", new File(newManifest.path()).exists()); List finalManifests = table.currentSnapshot().allManifests(table.io()); Assert.assertEquals("Should have 1 final manifest", 1, finalManifests.size()); - validateManifestEntries(finalManifests.get(0), + validateManifestEntries( + finalManifests.get(0), ids(finalSnapshotId, firstSnapshotId, secondSnapshotId), files(FILE_C, FILE_A, FILE_B), - statuses(ManifestEntry.Status.ADDED, ManifestEntry.Status.EXISTING, ManifestEntry.Status.EXISTING)); + statuses( + ManifestEntry.Status.ADDED, + ManifestEntry.Status.EXISTING, + ManifestEntry.Status.EXISTING)); - table.expireSnapshots() - .expireOlderThan(finalSnapshotTimestamp + 1) - .retainLast(1) - .commit(); + table.expireSnapshots().expireOlderThan(finalSnapshotTimestamp + 1).retainLast(1).commit(); - Assert.assertFalse("Append manifest should be deleted on expiry", new File(newManifest.path()).exists()); + Assert.assertFalse( + "Append manifest should be deleted on expiry", new File(newManifest.path()).exists()); } @Test @@ -740,13 +750,12 @@ public void testSimpleTransactionNotDeletingMetadataOnUnknownSate() throws IOExc Table table = TestTables.tableWithCommitSucceedButStateUnknown(tableDir, "test"); Transaction transaction = table.newTransaction(); - transaction.newAppend() - .appendFile(FILE_A) - .commit(); + transaction.newAppend().appendFile(FILE_A).commit(); AssertHelpers.assertThrows( "Transaction commit should fail with CommitStateUnknownException", - CommitStateUnknownException.class, "datacenter on fire", + CommitStateUnknownException.class, + "datacenter on fire", () -> transaction.commitTransaction()); // Make sure metadata files still exist diff --git a/core/src/test/java/org/apache/iceberg/TestUpdatePartitionSpec.java b/core/src/test/java/org/apache/iceberg/TestUpdatePartitionSpec.java index 1c080e7fd1b3..a4f6f8518ccf 100644 --- a/core/src/test/java/org/apache/iceberg/TestUpdatePartitionSpec.java +++ b/core/src/test/java/org/apache/iceberg/TestUpdatePartitionSpec.java @@ -16,17 +16,8 @@ * specific language governing permissions and limitations * under the License. 
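The transaction cases above all exercise the same commit/retry pattern. A minimal sketch of that pattern, using only API calls that appear in those tests, is shown below; the table, fileA and fileB names are illustrative placeholders, not part of this diff.

    import org.apache.iceberg.DataFile;
    import org.apache.iceberg.Table;
    import org.apache.iceberg.TableProperties;
    import org.apache.iceberg.Transaction;

    public class TransactionRetrySketch {
      // Illustrative only: append two files in a single transaction, allowing one retry
      // if the final commit races with a concurrent table update.
      static void appendInTransaction(Table table, DataFile fileA, DataFile fileB) {
        table.updateProperties().set(TableProperties.COMMIT_NUM_RETRIES, "1").commit();

        Transaction txn = table.newTransaction();

        // Pending changes are staged against the transaction's table view; the base table
        // metadata does not change until commitTransaction() succeeds.
        txn.newAppend().appendFile(fileA).appendFile(fileB).commit();

        // On conflict, the transaction re-applies its pending changes against refreshed
        // metadata up to COMMIT_NUM_RETRIES times, then throws CommitFailedException.
        txn.commitTransaction();
      }
    }
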
*/ - package org.apache.iceberg; -import org.apache.iceberg.relocated.com.google.common.collect.ImmutableList; -import org.apache.iceberg.transforms.Transforms; -import org.apache.iceberg.types.Types; -import org.junit.Assert; -import org.junit.Test; -import org.junit.runner.RunWith; -import org.junit.runners.Parameterized; - import static org.apache.iceberg.expressions.Expressions.bucket; import static org.apache.iceberg.expressions.Expressions.day; import static org.apache.iceberg.expressions.Expressions.hour; @@ -35,25 +26,34 @@ import static org.apache.iceberg.expressions.Expressions.truncate; import static org.apache.iceberg.expressions.Expressions.year; +import org.apache.iceberg.relocated.com.google.common.collect.ImmutableList; +import org.apache.iceberg.transforms.Transforms; +import org.apache.iceberg.types.Types; +import org.junit.Assert; +import org.junit.Test; +import org.junit.runner.RunWith; +import org.junit.runners.Parameterized; + @RunWith(Parameterized.class) public class TestUpdatePartitionSpec extends TableTestBase { - private static final Schema SCHEMA = new Schema( - Types.NestedField.required(1, "id", Types.LongType.get()), - Types.NestedField.required(2, "ts", Types.TimestampType.withZone()), - Types.NestedField.required(3, "category", Types.StringType.get()), - Types.NestedField.optional(4, "data", Types.StringType.get()) - ); + private static final Schema SCHEMA = + new Schema( + Types.NestedField.required(1, "id", Types.LongType.get()), + Types.NestedField.required(2, "ts", Types.TimestampType.withZone()), + Types.NestedField.required(3, "category", Types.StringType.get()), + Types.NestedField.optional(4, "data", Types.StringType.get())); private static final PartitionSpec UNPARTITIONED = PartitionSpec.builderFor(SCHEMA).build(); - private static final PartitionSpec PARTITIONED = PartitionSpec.builderFor(SCHEMA) - .identity("category") - .day("ts") - .bucket("id", 16, "shard") - .build(); + private static final PartitionSpec PARTITIONED = + PartitionSpec.builderFor(SCHEMA) + .identity("category") + .day("ts") + .bucket("id", 16, "shard") + .build(); @Parameterized.Parameters(name = "formatVersion = {0}") public static Object[] parameters() { - return new Object[] { 1, 2 }; + return new Object[] {1, 2}; } public TestUpdatePartitionSpec(int formatVersion) { @@ -62,611 +62,665 @@ public TestUpdatePartitionSpec(int formatVersion) { @Test public void testAddIdentityByName() { - PartitionSpec updated = new BaseUpdatePartitionSpec(formatVersion, UNPARTITIONED) - .addField("category") - .apply(); + PartitionSpec updated = + new BaseUpdatePartitionSpec(formatVersion, UNPARTITIONED).addField("category").apply(); - PartitionSpec expected = PartitionSpec.builderFor(SCHEMA) - .identity("category") - .build(); + PartitionSpec expected = PartitionSpec.builderFor(SCHEMA).identity("category").build(); Assert.assertEquals("Should match expected spec", expected, updated); } @Test public void testAddIdentityByTerm() { - PartitionSpec updated = new BaseUpdatePartitionSpec(formatVersion, UNPARTITIONED) - .addField(ref("category")) - .apply(); + PartitionSpec updated = + new BaseUpdatePartitionSpec(formatVersion, UNPARTITIONED).addField(ref("category")).apply(); - PartitionSpec expected = PartitionSpec.builderFor(SCHEMA) - .identity("category") - .build(); + PartitionSpec expected = PartitionSpec.builderFor(SCHEMA).identity("category").build(); Assert.assertEquals("Should match expected spec", expected, updated); } @Test public void testAddYear() { - PartitionSpec updated = new 
BaseUpdatePartitionSpec(formatVersion, UNPARTITIONED) - .addField(year("ts")) - .apply(); + PartitionSpec updated = + new BaseUpdatePartitionSpec(formatVersion, UNPARTITIONED).addField(year("ts")).apply(); - PartitionSpec expected = PartitionSpec.builderFor(SCHEMA) - .year("ts") - .build(); + PartitionSpec expected = PartitionSpec.builderFor(SCHEMA).year("ts").build(); Assert.assertEquals("Should match expected spec", expected, updated); } @Test public void testAddMonth() { - PartitionSpec updated = new BaseUpdatePartitionSpec(formatVersion, UNPARTITIONED) - .addField(month("ts")) - .apply(); + PartitionSpec updated = + new BaseUpdatePartitionSpec(formatVersion, UNPARTITIONED).addField(month("ts")).apply(); - PartitionSpec expected = PartitionSpec.builderFor(SCHEMA) - .month("ts") - .build(); + PartitionSpec expected = PartitionSpec.builderFor(SCHEMA).month("ts").build(); Assert.assertEquals("Should match expected spec", expected, updated); } @Test public void testAddDay() { - PartitionSpec updated = new BaseUpdatePartitionSpec(formatVersion, UNPARTITIONED) - .addField(day("ts")) - .apply(); + PartitionSpec updated = + new BaseUpdatePartitionSpec(formatVersion, UNPARTITIONED).addField(day("ts")).apply(); - PartitionSpec expected = PartitionSpec.builderFor(SCHEMA) - .day("ts") - .build(); + PartitionSpec expected = PartitionSpec.builderFor(SCHEMA).day("ts").build(); Assert.assertEquals("Should match expected spec", expected, updated); } @Test public void testAddHour() { - PartitionSpec updated = new BaseUpdatePartitionSpec(formatVersion, UNPARTITIONED) - .addField(hour("ts")) - .apply(); + PartitionSpec updated = + new BaseUpdatePartitionSpec(formatVersion, UNPARTITIONED).addField(hour("ts")).apply(); - PartitionSpec expected = PartitionSpec.builderFor(SCHEMA) - .hour("ts") - .build(); + PartitionSpec expected = PartitionSpec.builderFor(SCHEMA).hour("ts").build(); Assert.assertEquals("Should match expected spec", expected, updated); } @Test public void testAddBucket() { - PartitionSpec updated = new BaseUpdatePartitionSpec(formatVersion, UNPARTITIONED) - .addField(bucket("id", 16)) - .apply(); + PartitionSpec updated = + new BaseUpdatePartitionSpec(formatVersion, UNPARTITIONED) + .addField(bucket("id", 16)) + .apply(); // added fields have different default names to avoid conflicts - PartitionSpec expected = PartitionSpec.builderFor(SCHEMA) - .bucket("id", 16, "id_bucket_16") - .build(); + PartitionSpec expected = + PartitionSpec.builderFor(SCHEMA).bucket("id", 16, "id_bucket_16").build(); Assert.assertEquals("Should match expected spec", expected, updated); } @Test public void testAddTruncate() { - PartitionSpec updated = new BaseUpdatePartitionSpec(formatVersion, UNPARTITIONED) - .addField(truncate("data", 4)) - .apply(); + PartitionSpec updated = + new BaseUpdatePartitionSpec(formatVersion, UNPARTITIONED) + .addField(truncate("data", 4)) + .apply(); // added fields have different default names to avoid conflicts - PartitionSpec expected = PartitionSpec.builderFor(SCHEMA) - .truncate("data", 4, "data_trunc_4") - .build(); + PartitionSpec expected = + PartitionSpec.builderFor(SCHEMA).truncate("data", 4, "data_trunc_4").build(); Assert.assertEquals("Should match expected spec", expected, updated); } @Test public void testAddNamedPartition() { - PartitionSpec updated = new BaseUpdatePartitionSpec(formatVersion, UNPARTITIONED) - .addField("shard", bucket("id", 16)) - .apply(); + PartitionSpec updated = + new BaseUpdatePartitionSpec(formatVersion, UNPARTITIONED) + .addField("shard", 
bucket("id", 16)) + .apply(); - PartitionSpec expected = PartitionSpec.builderFor(SCHEMA) - .bucket("id", 16, "shard") - .build(); + PartitionSpec expected = PartitionSpec.builderFor(SCHEMA).bucket("id", 16, "shard").build(); Assert.assertEquals("Should match expected spec", expected, updated); } @Test public void testAddToExisting() { - PartitionSpec updated = new BaseUpdatePartitionSpec(formatVersion, PARTITIONED) - .addField(truncate("data", 4)) - .apply(); + PartitionSpec updated = + new BaseUpdatePartitionSpec(formatVersion, PARTITIONED) + .addField(truncate("data", 4)) + .apply(); - PartitionSpec expected = PartitionSpec.builderFor(SCHEMA) - .identity("category") - .day("ts") - .bucket("id", 16, "shard") - .truncate("data", 4, "data_trunc_4") - .build(); + PartitionSpec expected = + PartitionSpec.builderFor(SCHEMA) + .identity("category") + .day("ts") + .bucket("id", 16, "shard") + .truncate("data", 4, "data_trunc_4") + .build(); Assert.assertEquals("Should match expected spec", expected, updated); } @Test public void testMultipleAdds() { - PartitionSpec updated = new BaseUpdatePartitionSpec(formatVersion, UNPARTITIONED) - .addField("category") - .addField(day("ts")) - .addField("shard", bucket("id", 16)) - .addField("prefix", truncate("data", 4)) - .apply(); - - PartitionSpec expected = PartitionSpec.builderFor(SCHEMA) - .identity("category") - .day("ts") - .bucket("id", 16, "shard") - .truncate("data", 4, "prefix") - .build(); + PartitionSpec updated = + new BaseUpdatePartitionSpec(formatVersion, UNPARTITIONED) + .addField("category") + .addField(day("ts")) + .addField("shard", bucket("id", 16)) + .addField("prefix", truncate("data", 4)) + .apply(); + + PartitionSpec expected = + PartitionSpec.builderFor(SCHEMA) + .identity("category") + .day("ts") + .bucket("id", 16, "shard") + .truncate("data", 4, "prefix") + .build(); Assert.assertEquals("Should match expected spec", expected, updated); } @Test public void testAddHourToDay() { - // multiple partitions for the same source with different time granularity is not allowed by the builder, but is + // multiple partitions for the same source with different time granularity is not allowed by the + // builder, but is // allowed when updating a spec so that existing columns in metadata continue to work. 
- PartitionSpec byDay = new BaseUpdatePartitionSpec(formatVersion, UNPARTITIONED) - .addField(day("ts")) - .apply(); + PartitionSpec byDay = + new BaseUpdatePartitionSpec(formatVersion, UNPARTITIONED).addField(day("ts")).apply(); - PartitionSpec byHour = new BaseUpdatePartitionSpec(formatVersion, byDay) - .addField(hour("ts")) - .apply(); + PartitionSpec byHour = + new BaseUpdatePartitionSpec(formatVersion, byDay).addField(hour("ts")).apply(); - Assert.assertEquals("Should have a day and an hour time field", + Assert.assertEquals( + "Should have a day and an hour time field", ImmutableList.of( new PartitionField(2, 1000, "ts_day", Transforms.day(Types.TimestampType.withZone())), - new PartitionField(2, 1001, "ts_hour", Transforms.hour(Types.TimestampType.withZone()))), + new PartitionField( + 2, 1001, "ts_hour", Transforms.hour(Types.TimestampType.withZone()))), byHour.fields()); } @Test public void testAddMultipleBuckets() { - PartitionSpec bucket16 = new BaseUpdatePartitionSpec(formatVersion, UNPARTITIONED) - .addField(bucket("id", 16)) - .apply(); + PartitionSpec bucket16 = + new BaseUpdatePartitionSpec(formatVersion, UNPARTITIONED) + .addField(bucket("id", 16)) + .apply(); - PartitionSpec bucket8 = new BaseUpdatePartitionSpec(formatVersion, bucket16) - .addField(bucket("id", 8)) - .apply(); + PartitionSpec bucket8 = + new BaseUpdatePartitionSpec(formatVersion, bucket16).addField(bucket("id", 8)).apply(); - PartitionSpec expected = PartitionSpec.builderFor(SCHEMA) - .bucket("id", 16, "id_bucket_16") - .bucket("id", 8, "id_bucket_8") - .build(); + PartitionSpec expected = + PartitionSpec.builderFor(SCHEMA) + .bucket("id", 16, "id_bucket_16") + .bucket("id", 8, "id_bucket_8") + .build(); Assert.assertEquals("Should have a day and an hour time field", expected, bucket8); } @Test public void testRemoveIdentityByName() { - PartitionSpec updated = new BaseUpdatePartitionSpec(formatVersion, PARTITIONED) - .removeField("category") - .apply(); - - PartitionSpec v1Expected = PartitionSpec.builderFor(SCHEMA) - .alwaysNull("category", "category") - .day("ts") - .bucket("id", 16, "shard") - .build(); + PartitionSpec updated = + new BaseUpdatePartitionSpec(formatVersion, PARTITIONED).removeField("category").apply(); + + PartitionSpec v1Expected = + PartitionSpec.builderFor(SCHEMA) + .alwaysNull("category", "category") + .day("ts") + .bucket("id", 16, "shard") + .build(); V1Assert.assertEquals("Should match expected spec", v1Expected, updated); - PartitionSpec v2Expected = PartitionSpec.builderFor(SCHEMA) - .add(id("ts"), 1001, "ts_day", Transforms.day(Types.TimestampType.withZone())) - .add(id("id"), 1002, "shard", Transforms.bucket(Types.LongType.get(), 16)) - .build(); + PartitionSpec v2Expected = + PartitionSpec.builderFor(SCHEMA) + .add(id("ts"), 1001, "ts_day", Transforms.day(Types.TimestampType.withZone())) + .add(id("id"), 1002, "shard", Transforms.bucket(Types.LongType.get(), 16)) + .build(); V2Assert.assertEquals("Should match expected spec", v2Expected, updated); } @Test public void testRemoveBucketByName() { - PartitionSpec updated = new BaseUpdatePartitionSpec(formatVersion, PARTITIONED) - .removeField("shard") - .apply(); - - PartitionSpec v1Expected = PartitionSpec.builderFor(SCHEMA) - .identity("category") - .day("ts") - .alwaysNull("id", "shard") - .build(); + PartitionSpec updated = + new BaseUpdatePartitionSpec(formatVersion, PARTITIONED).removeField("shard").apply(); + + PartitionSpec v1Expected = + PartitionSpec.builderFor(SCHEMA) + .identity("category") + .day("ts") + 
.alwaysNull("id", "shard") + .build(); V1Assert.assertEquals("Should match expected spec", v1Expected, updated); - PartitionSpec v2Expected = PartitionSpec.builderFor(SCHEMA) - .add(id("category"), 1000, "category", Transforms.identity(Types.StringType.get())) - .add(id("ts"), 1001, "ts_day", Transforms.day(Types.TimestampType.withZone())) - .build(); + PartitionSpec v2Expected = + PartitionSpec.builderFor(SCHEMA) + .add(id("category"), 1000, "category", Transforms.identity(Types.StringType.get())) + .add(id("ts"), 1001, "ts_day", Transforms.day(Types.TimestampType.withZone())) + .build(); V2Assert.assertEquals("Should match expected spec", v2Expected, updated); } @Test public void testRemoveIdentityByEquivalent() { - PartitionSpec updated = new BaseUpdatePartitionSpec(formatVersion, PARTITIONED) - .removeField(ref("category")) - .apply(); + PartitionSpec updated = + new BaseUpdatePartitionSpec(formatVersion, PARTITIONED) + .removeField(ref("category")) + .apply(); - PartitionSpec v1Expected = PartitionSpec.builderFor(SCHEMA) - .alwaysNull("category", "category") - .day("ts") - .bucket("id", 16, "shard") - .build(); + PartitionSpec v1Expected = + PartitionSpec.builderFor(SCHEMA) + .alwaysNull("category", "category") + .day("ts") + .bucket("id", 16, "shard") + .build(); V1Assert.assertEquals("Should match expected spec", v1Expected, updated); - PartitionSpec v2Expected = PartitionSpec.builderFor(SCHEMA) - .add(id("ts"), 1001, "ts_day", Transforms.day(Types.TimestampType.withZone())) - .add(id("id"), 1002, "shard", Transforms.bucket(Types.LongType.get(), 16)) - .build(); + PartitionSpec v2Expected = + PartitionSpec.builderFor(SCHEMA) + .add(id("ts"), 1001, "ts_day", Transforms.day(Types.TimestampType.withZone())) + .add(id("id"), 1002, "shard", Transforms.bucket(Types.LongType.get(), 16)) + .build(); V2Assert.assertEquals("Should match expected spec", v2Expected, updated); } @Test public void testRemoveDayByEquivalent() { - PartitionSpec updated = new BaseUpdatePartitionSpec(formatVersion, PARTITIONED) - .removeField(day("ts")) - .apply(); - - PartitionSpec v1Expected = PartitionSpec.builderFor(SCHEMA) - .identity("category") - .alwaysNull("ts", "ts_day") - .bucket("id", 16, "shard") - .build(); + PartitionSpec updated = + new BaseUpdatePartitionSpec(formatVersion, PARTITIONED).removeField(day("ts")).apply(); + + PartitionSpec v1Expected = + PartitionSpec.builderFor(SCHEMA) + .identity("category") + .alwaysNull("ts", "ts_day") + .bucket("id", 16, "shard") + .build(); V1Assert.assertEquals("Should match expected spec", v1Expected, updated); - PartitionSpec v2Expected = PartitionSpec.builderFor(SCHEMA) - .add(id("category"), 1000, "category", Transforms.identity(Types.StringType.get())) - .add(id("id"), 1002, "shard", Transforms.bucket(Types.LongType.get(), 16)) - .build(); + PartitionSpec v2Expected = + PartitionSpec.builderFor(SCHEMA) + .add(id("category"), 1000, "category", Transforms.identity(Types.StringType.get())) + .add(id("id"), 1002, "shard", Transforms.bucket(Types.LongType.get(), 16)) + .build(); V2Assert.assertEquals("Should match expected spec", v2Expected, updated); } @Test public void testRemoveBucketByEquivalent() { - PartitionSpec updated = new BaseUpdatePartitionSpec(formatVersion, PARTITIONED) - .removeField(bucket("id", 16)) - .apply(); + PartitionSpec updated = + new BaseUpdatePartitionSpec(formatVersion, PARTITIONED) + .removeField(bucket("id", 16)) + .apply(); - PartitionSpec v1Expected = PartitionSpec.builderFor(SCHEMA) - .identity("category") - .day("ts") - 
.alwaysNull("id", "shard") - .build(); + PartitionSpec v1Expected = + PartitionSpec.builderFor(SCHEMA) + .identity("category") + .day("ts") + .alwaysNull("id", "shard") + .build(); V1Assert.assertEquals("Should match expected spec", v1Expected, updated); - PartitionSpec v2Expected = PartitionSpec.builderFor(SCHEMA) - .identity("category") - .day("ts") - .build(); + PartitionSpec v2Expected = + PartitionSpec.builderFor(SCHEMA).identity("category").day("ts").build(); V2Assert.assertEquals("Should match expected spec", v2Expected, updated); } @Test public void testRename() { - PartitionSpec updated = new BaseUpdatePartitionSpec(formatVersion, PARTITIONED) - .renameField("shard", "id_bucket") // rename back to default - .apply(); + PartitionSpec updated = + new BaseUpdatePartitionSpec(formatVersion, PARTITIONED) + .renameField("shard", "id_bucket") // rename back to default + .apply(); - PartitionSpec expected = PartitionSpec.builderFor(SCHEMA) - .identity("category") - .day("ts") - .bucket("id", 16) - .build(); + PartitionSpec expected = + PartitionSpec.builderFor(SCHEMA).identity("category").day("ts").bucket("id", 16).build(); Assert.assertEquals("Should match expected spec", expected, updated); } @Test public void testMultipleChanges() { - PartitionSpec updated = new BaseUpdatePartitionSpec(formatVersion, PARTITIONED) - .renameField("shard", "id_bucket") // rename back to default - .removeField(day("ts")) - .addField("prefix", truncate("data", 4)) - .apply(); - - PartitionSpec v1Expected = PartitionSpec.builderFor(SCHEMA) - .identity("category") - .alwaysNull("ts", "ts_day") - .bucket("id", 16) - .truncate("data", 4, "prefix") - .build(); + PartitionSpec updated = + new BaseUpdatePartitionSpec(formatVersion, PARTITIONED) + .renameField("shard", "id_bucket") // rename back to default + .removeField(day("ts")) + .addField("prefix", truncate("data", 4)) + .apply(); + + PartitionSpec v1Expected = + PartitionSpec.builderFor(SCHEMA) + .identity("category") + .alwaysNull("ts", "ts_day") + .bucket("id", 16) + .truncate("data", 4, "prefix") + .build(); V1Assert.assertEquals("Should match expected spec", v1Expected, updated); - PartitionSpec v2Expected = PartitionSpec.builderFor(SCHEMA) - .add(id("category"), 1000, "category", Transforms.identity(Types.StringType.get())) - .add(id("id"), 1002, "id_bucket", Transforms.bucket(Types.LongType.get(), 16)) - .add(id("data"), 1003, "prefix", Transforms.truncate(Types.StringType.get(), 4)) - .build(); + PartitionSpec v2Expected = + PartitionSpec.builderFor(SCHEMA) + .add(id("category"), 1000, "category", Transforms.identity(Types.StringType.get())) + .add(id("id"), 1002, "id_bucket", Transforms.bucket(Types.LongType.get(), 16)) + .add(id("data"), 1003, "prefix", Transforms.truncate(Types.StringType.get(), 4)) + .build(); V2Assert.assertEquals("Should match expected spec", v2Expected, updated); } @Test public void testAddDeletedName() { - PartitionSpec updated = new BaseUpdatePartitionSpec(formatVersion, PARTITIONED) - .removeField(bucket("id", 16)) - .apply(); + PartitionSpec updated = + new BaseUpdatePartitionSpec(formatVersion, PARTITIONED) + .removeField(bucket("id", 16)) + .apply(); - PartitionSpec v1Expected = PartitionSpec.builderFor(SCHEMA) - .identity("category") - .day("ts") - .alwaysNull("id", "shard") - .build(); + PartitionSpec v1Expected = + PartitionSpec.builderFor(SCHEMA) + .identity("category") + .day("ts") + .alwaysNull("id", "shard") + .build(); V1Assert.assertEquals("Should match expected spec", v1Expected, updated); - PartitionSpec 
v2Expected = PartitionSpec.builderFor(SCHEMA) - .identity("category") - .day("ts") - .build(); + PartitionSpec v2Expected = + PartitionSpec.builderFor(SCHEMA).identity("category").day("ts").build(); V2Assert.assertEquals("Should match expected spec", v2Expected, updated); } @Test public void testRemoveNewlyAddedFieldByName() { - AssertHelpers.assertThrows("Should fail trying to remove unknown field", - IllegalArgumentException.class, "Cannot delete newly added field", - () -> new BaseUpdatePartitionSpec(formatVersion, PARTITIONED) - .addField("prefix", truncate("data", 4)) - .removeField("prefix") - ); + AssertHelpers.assertThrows( + "Should fail trying to remove unknown field", + IllegalArgumentException.class, + "Cannot delete newly added field", + () -> + new BaseUpdatePartitionSpec(formatVersion, PARTITIONED) + .addField("prefix", truncate("data", 4)) + .removeField("prefix")); } @Test public void testRemoveNewlyAddedFieldByTransform() { - AssertHelpers.assertThrows("Should fail adding a duplicate field", - IllegalArgumentException.class, "Cannot delete newly added field", - () -> new BaseUpdatePartitionSpec(formatVersion, PARTITIONED) - .addField("prefix", truncate("data", 4)) - .removeField(truncate("data", 4))); + AssertHelpers.assertThrows( + "Should fail adding a duplicate field", + IllegalArgumentException.class, + "Cannot delete newly added field", + () -> + new BaseUpdatePartitionSpec(formatVersion, PARTITIONED) + .addField("prefix", truncate("data", 4)) + .removeField(truncate("data", 4))); } @Test public void testAddAlreadyAddedFieldByTransform() { - AssertHelpers.assertThrows("Should fail adding a duplicate field", - IllegalArgumentException.class, "Cannot add duplicate partition field", - () -> new BaseUpdatePartitionSpec(formatVersion, PARTITIONED) - .addField("prefix", truncate("data", 4)) - .addField(truncate("data", 4))); + AssertHelpers.assertThrows( + "Should fail adding a duplicate field", + IllegalArgumentException.class, + "Cannot add duplicate partition field", + () -> + new BaseUpdatePartitionSpec(formatVersion, PARTITIONED) + .addField("prefix", truncate("data", 4)) + .addField(truncate("data", 4))); } @Test public void testAddAlreadyAddedFieldByName() { - AssertHelpers.assertThrows("Should fail adding a duplicate field", - IllegalArgumentException.class, "Cannot add duplicate partition field", - () -> new BaseUpdatePartitionSpec(formatVersion, PARTITIONED) - .addField("prefix", truncate("data", 4)) - .addField("prefix", truncate("data", 6))); + AssertHelpers.assertThrows( + "Should fail adding a duplicate field", + IllegalArgumentException.class, + "Cannot add duplicate partition field", + () -> + new BaseUpdatePartitionSpec(formatVersion, PARTITIONED) + .addField("prefix", truncate("data", 4)) + .addField("prefix", truncate("data", 6))); } @Test public void testAddRedundantTimePartition() { - AssertHelpers.assertThrows("Should fail adding a duplicate field", - IllegalArgumentException.class, "Cannot add redundant partition field", - () -> new BaseUpdatePartitionSpec(formatVersion, UNPARTITIONED) - .addField(day("ts")) - .addField(hour("ts"))); // conflicts with hour - - AssertHelpers.assertThrows("Should fail adding a duplicate field", - IllegalArgumentException.class, "Cannot add redundant partition", - () -> new BaseUpdatePartitionSpec(formatVersion, PARTITIONED) - .addField(hour("ts")) // does not conflict with day because day already exists - .addField(month("ts"))); // conflicts with hour + AssertHelpers.assertThrows( + "Should fail adding a duplicate 
field", + IllegalArgumentException.class, + "Cannot add redundant partition field", + () -> + new BaseUpdatePartitionSpec(formatVersion, UNPARTITIONED) + .addField(day("ts")) + .addField(hour("ts"))); // conflicts with hour + + AssertHelpers.assertThrows( + "Should fail adding a duplicate field", + IllegalArgumentException.class, + "Cannot add redundant partition", + () -> + new BaseUpdatePartitionSpec(formatVersion, PARTITIONED) + .addField(hour("ts")) // does not conflict with day because day already exists + .addField(month("ts"))); // conflicts with hour } @Test public void testNoEffectAddDeletedSameFieldWithSameName() { - PartitionSpec updated = new BaseUpdatePartitionSpec(formatVersion, PARTITIONED) - .removeField("shard") - .addField("shard", bucket("id", 16)) - .apply(); + PartitionSpec updated = + new BaseUpdatePartitionSpec(formatVersion, PARTITIONED) + .removeField("shard") + .addField("shard", bucket("id", 16)) + .apply(); Assert.assertEquals(PARTITIONED, updated); - updated = new BaseUpdatePartitionSpec(formatVersion, PARTITIONED) - .removeField("shard") - .addField(bucket("id", 16)) - .apply(); + updated = + new BaseUpdatePartitionSpec(formatVersion, PARTITIONED) + .removeField("shard") + .addField(bucket("id", 16)) + .apply(); Assert.assertEquals(PARTITIONED, updated); } @Test public void testGenerateNewSpecAddDeletedSameFieldWithDifferentName() { - PartitionSpec updated = new BaseUpdatePartitionSpec(formatVersion, PARTITIONED) - .removeField("shard") - .addField("new_shard", bucket("id", 16)) - .apply(); + PartitionSpec updated = + new BaseUpdatePartitionSpec(formatVersion, PARTITIONED) + .removeField("shard") + .addField("new_shard", bucket("id", 16)) + .apply(); Assert.assertEquals("Should match expected field size", 3, updated.fields().size()); - Assert.assertEquals("Should match expected field name", "category", - updated.fields().get(0).name()); - Assert.assertEquals("Should match expected field name", "ts_day", - updated.fields().get(1).name()); - Assert.assertEquals("Should match expected field name", "new_shard", - updated.fields().get(2).name()); - Assert.assertEquals("Should match expected field transform", "identity", + Assert.assertEquals( + "Should match expected field name", "category", updated.fields().get(0).name()); + Assert.assertEquals( + "Should match expected field name", "ts_day", updated.fields().get(1).name()); + Assert.assertEquals( + "Should match expected field name", "new_shard", updated.fields().get(2).name()); + Assert.assertEquals( + "Should match expected field transform", + "identity", updated.fields().get(0).transform().toString()); - Assert.assertEquals("Should match expected field transform", "day", + Assert.assertEquals( + "Should match expected field transform", + "day", updated.fields().get(1).transform().toString()); - Assert.assertEquals("Should match expected field transform", "bucket[16]", + Assert.assertEquals( + "Should match expected field transform", + "bucket[16]", updated.fields().get(2).transform().toString()); } @Test public void testAddDuplicateByName() { - AssertHelpers.assertThrows("Should fail adding a duplicate field", - IllegalArgumentException.class, "Cannot add duplicate partition field", - () -> new BaseUpdatePartitionSpec(formatVersion, PARTITIONED) - .addField("category")); + AssertHelpers.assertThrows( + "Should fail adding a duplicate field", + IllegalArgumentException.class, + "Cannot add duplicate partition field", + () -> new BaseUpdatePartitionSpec(formatVersion, PARTITIONED).addField("category")); } @Test 
public void testAddDuplicateByRef() { - AssertHelpers.assertThrows("Should fail adding a duplicate field", - IllegalArgumentException.class, "Cannot add duplicate partition field", - () -> new BaseUpdatePartitionSpec(formatVersion, PARTITIONED) - .addField(ref("category"))); + AssertHelpers.assertThrows( + "Should fail adding a duplicate field", + IllegalArgumentException.class, + "Cannot add duplicate partition field", + () -> new BaseUpdatePartitionSpec(formatVersion, PARTITIONED).addField(ref("category"))); } @Test public void testAddDuplicateTransform() { - AssertHelpers.assertThrows("Should fail adding a duplicate field", - IllegalArgumentException.class, "Cannot add duplicate partition field", - () -> new BaseUpdatePartitionSpec(formatVersion, PARTITIONED) - .addField(bucket("id", 16))); + AssertHelpers.assertThrows( + "Should fail adding a duplicate field", + IllegalArgumentException.class, + "Cannot add duplicate partition field", + () -> new BaseUpdatePartitionSpec(formatVersion, PARTITIONED).addField(bucket("id", 16))); } @Test public void testAddNamedDuplicate() { - AssertHelpers.assertThrows("Should fail adding a duplicate field", - IllegalArgumentException.class, "Cannot add duplicate partition field", - () -> new BaseUpdatePartitionSpec(formatVersion, PARTITIONED) - .addField("b16", bucket("id", 16))); + AssertHelpers.assertThrows( + "Should fail adding a duplicate field", + IllegalArgumentException.class, + "Cannot add duplicate partition field", + () -> + new BaseUpdatePartitionSpec(formatVersion, PARTITIONED) + .addField("b16", bucket("id", 16))); } @Test public void testRemoveUnknownFieldByName() { - AssertHelpers.assertThrows("Should fail trying to remove unknown field", - IllegalArgumentException.class, "Cannot find partition field to remove", - () -> new BaseUpdatePartitionSpec(formatVersion, PARTITIONED).removeField("moon") - ); + AssertHelpers.assertThrows( + "Should fail trying to remove unknown field", + IllegalArgumentException.class, + "Cannot find partition field to remove", + () -> new BaseUpdatePartitionSpec(formatVersion, PARTITIONED).removeField("moon")); } @Test public void testRemoveUnknownFieldByEquivalent() { - AssertHelpers.assertThrows("Should fail trying to remove unknown field", - IllegalArgumentException.class, "Cannot find partition field to remove", - () -> new BaseUpdatePartitionSpec(formatVersion, PARTITIONED).removeField(hour("ts")) // day(ts) exists - ); + AssertHelpers.assertThrows( + "Should fail trying to remove unknown field", + IllegalArgumentException.class, + "Cannot find partition field to remove", + () -> + new BaseUpdatePartitionSpec(formatVersion, PARTITIONED) + .removeField(hour("ts")) // day(ts) exists + ); } @Test public void testRenameUnknownField() { - AssertHelpers.assertThrows("Should fail trying to rename an unknown field", - IllegalArgumentException.class, "Cannot find partition field to rename", - () -> new BaseUpdatePartitionSpec(formatVersion, PARTITIONED).renameField("shake", "seal") - ); + AssertHelpers.assertThrows( + "Should fail trying to rename an unknown field", + IllegalArgumentException.class, + "Cannot find partition field to rename", + () -> new BaseUpdatePartitionSpec(formatVersion, PARTITIONED).renameField("shake", "seal")); } @Test public void testRenameAfterAdd() { - AssertHelpers.assertThrows("Should fail trying to rename an added field", - IllegalArgumentException.class, "Cannot rename newly added partition field", - () -> new BaseUpdatePartitionSpec(formatVersion, PARTITIONED) - .addField("data_trunc", 
truncate("data", 4)) - .renameField("data_trunc", "prefix") - ); + AssertHelpers.assertThrows( + "Should fail trying to rename an added field", + IllegalArgumentException.class, + "Cannot rename newly added partition field", + () -> + new BaseUpdatePartitionSpec(formatVersion, PARTITIONED) + .addField("data_trunc", truncate("data", 4)) + .renameField("data_trunc", "prefix")); } @Test public void testDeleteAndRename() { - AssertHelpers.assertThrows("Should fail trying to rename a deleted field", - IllegalArgumentException.class, "Cannot rename and delete partition field", - () -> new BaseUpdatePartitionSpec(formatVersion, PARTITIONED) - .renameField("shard", "id_bucket") - .removeField(bucket("id", 16))); + AssertHelpers.assertThrows( + "Should fail trying to rename a deleted field", + IllegalArgumentException.class, + "Cannot rename and delete partition field", + () -> + new BaseUpdatePartitionSpec(formatVersion, PARTITIONED) + .renameField("shard", "id_bucket") + .removeField(bucket("id", 16))); } @Test public void testRenameAndDelete() { - AssertHelpers.assertThrows("Should fail trying to delete a renamed field", - IllegalArgumentException.class, "Cannot delete and rename partition field", - () -> new BaseUpdatePartitionSpec(formatVersion, PARTITIONED) - .removeField(bucket("id", 16)) - .renameField("shard", "id_bucket")); + AssertHelpers.assertThrows( + "Should fail trying to delete a renamed field", + IllegalArgumentException.class, + "Cannot delete and rename partition field", + () -> + new BaseUpdatePartitionSpec(formatVersion, PARTITIONED) + .removeField(bucket("id", 16)) + .renameField("shard", "id_bucket")); } @Test public void testRemoveAndAddMultiTimes() { - PartitionSpec addFirstTime = new BaseUpdatePartitionSpec(formatVersion, UNPARTITIONED) + PartitionSpec addFirstTime = + new BaseUpdatePartitionSpec(formatVersion, UNPARTITIONED) .addField("ts_date", day("ts")) .apply(); - PartitionSpec removeFirstTime = new BaseUpdatePartitionSpec(formatVersion, addFirstTime) - .removeField(day("ts")) - .apply(); - PartitionSpec addSecondTime = new BaseUpdatePartitionSpec(formatVersion, removeFirstTime) + PartitionSpec removeFirstTime = + new BaseUpdatePartitionSpec(formatVersion, addFirstTime).removeField(day("ts")).apply(); + PartitionSpec addSecondTime = + new BaseUpdatePartitionSpec(formatVersion, removeFirstTime) .addField("ts_date", day("ts")) .apply(); - PartitionSpec removeSecondTime = new BaseUpdatePartitionSpec(formatVersion, addSecondTime) - .removeField(day("ts")) - .apply(); - PartitionSpec addThirdTime = new BaseUpdatePartitionSpec(formatVersion, removeSecondTime) - .addField(month("ts")) - .apply(); - PartitionSpec updated = new BaseUpdatePartitionSpec(formatVersion, addThirdTime) + PartitionSpec removeSecondTime = + new BaseUpdatePartitionSpec(formatVersion, addSecondTime).removeField(day("ts")).apply(); + PartitionSpec addThirdTime = + new BaseUpdatePartitionSpec(formatVersion, removeSecondTime).addField(month("ts")).apply(); + PartitionSpec updated = + new BaseUpdatePartitionSpec(formatVersion, addThirdTime) .renameField("ts_month", "ts_date") .apply(); if (formatVersion == 1) { Assert.assertEquals("Should match expected spec field size", 3, updated.fields().size()); - Assert.assertTrue("Should match expected field name", - updated.fields().get(0).name().matches("^ts_date(?:_\\d+)+$")); - Assert.assertTrue("Should match expected field name", - updated.fields().get(1).name().matches("^ts_date_(?:\\d+)+$")); - Assert.assertEquals("Should match expected field name", "ts_date", 
updated.fields().get(2).name()); - - Assert.assertEquals("Should match expected field transform", "void", - updated.fields().get(0).transform().toString()); - Assert.assertEquals("Should match expected field transform", "void", - updated.fields().get(1).transform().toString()); - Assert.assertEquals("Should match expected field transform", "month", - updated.fields().get(2).transform().toString()); + Assert.assertTrue( + "Should match expected field name", + updated.fields().get(0).name().matches("^ts_date(?:_\\d+)+$")); + Assert.assertTrue( + "Should match expected field name", + updated.fields().get(1).name().matches("^ts_date_(?:\\d+)+$")); + Assert.assertEquals( + "Should match expected field name", "ts_date", updated.fields().get(2).name()); + + Assert.assertEquals( + "Should match expected field transform", + "void", + updated.fields().get(0).transform().toString()); + Assert.assertEquals( + "Should match expected field transform", + "void", + updated.fields().get(1).transform().toString()); + Assert.assertEquals( + "Should match expected field transform", + "month", + updated.fields().get(2).transform().toString()); } - PartitionSpec v2Expected = PartitionSpec.builderFor(SCHEMA) - .month("ts", "ts_date") - .build(); + PartitionSpec v2Expected = PartitionSpec.builderFor(SCHEMA).month("ts", "ts_date").build(); V2Assert.assertEquals("Should match expected spec", v2Expected, updated); } @Test public void testRemoveAndUpdateWithDifferentTransformation() { - PartitionSpec expected = PartitionSpec.builderFor(SCHEMA) - .month("ts", "ts_transformed") - .build(); - PartitionSpec updated = new BaseUpdatePartitionSpec(formatVersion, expected) - .removeField("ts_transformed") - .addField("ts_transformed", day("ts")) - .apply(); + PartitionSpec expected = PartitionSpec.builderFor(SCHEMA).month("ts", "ts_transformed").build(); + PartitionSpec updated = + new BaseUpdatePartitionSpec(formatVersion, expected) + .removeField("ts_transformed") + .addField("ts_transformed", day("ts")) + .apply(); if (formatVersion == 1) { Assert.assertEquals("Should match expected spec field size", 2, updated.fields().size()); - Assert.assertEquals("Should match expected field name", "ts_transformed_1000", - updated.fields().get(0).name()); - Assert.assertEquals("Should match expected field name", "ts_transformed", - updated.fields().get(1).name()); - - Assert.assertEquals("Should match expected field transform", "void", - updated.fields().get(0).transform().toString()); - Assert.assertEquals("Should match expected field transform", "day", - updated.fields().get(1).transform().toString()); + Assert.assertEquals( + "Should match expected field name", + "ts_transformed_1000", + updated.fields().get(0).name()); + Assert.assertEquals( + "Should match expected field name", "ts_transformed", updated.fields().get(1).name()); + + Assert.assertEquals( + "Should match expected field transform", + "void", + updated.fields().get(0).transform().toString()); + Assert.assertEquals( + "Should match expected field transform", + "day", + updated.fields().get(1).transform().toString()); } else { Assert.assertEquals("Should match expected spec field size", 1, updated.fields().size()); - Assert.assertEquals("Should match expected field name", "ts_transformed", - updated.fields().get(0).name()); - Assert.assertEquals("Should match expected field transform", "day", - updated.fields().get(0).transform().toString()); + Assert.assertEquals( + "Should match expected field name", "ts_transformed", updated.fields().get(0).name()); + 
Assert.assertEquals( + "Should match expected field transform", + "day", + updated.fields().get(0).transform().toString()); } } diff --git a/core/src/test/java/org/apache/iceberg/TestV1ToV2RowDeltaDelete.java b/core/src/test/java/org/apache/iceberg/TestV1ToV2RowDeltaDelete.java index 8d9c52b7bd48..f2d0fa40cd3c 100644 --- a/core/src/test/java/org/apache/iceberg/TestV1ToV2RowDeltaDelete.java +++ b/core/src/test/java/org/apache/iceberg/TestV1ToV2RowDeltaDelete.java @@ -16,9 +16,11 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg; +import static org.apache.iceberg.expressions.Expressions.bucket; +import static org.apache.iceberg.expressions.Expressions.equal; + import java.util.List; import java.util.Optional; import org.apache.iceberg.relocated.com.google.common.collect.Iterables; @@ -27,52 +29,46 @@ import org.junit.Assert; import org.junit.Test; -import static org.apache.iceberg.expressions.Expressions.bucket; -import static org.apache.iceberg.expressions.Expressions.equal; - public class TestV1ToV2RowDeltaDelete extends TableTestBase { public TestV1ToV2RowDeltaDelete() { super(1 /* table format version */); } - static final DeleteFile FILE_A_POS_1 = FileMetadata.deleteFileBuilder(SPEC) - .ofPositionDeletes() - .withPath("/path/to/data-a-pos-deletes.parquet") - .withFileSizeInBytes(10) - .withPartition(FILE_A.partition()) - .withRecordCount(1) - .build(); - - static final DeleteFile FILE_A_EQ_1 = FileMetadata.deleteFileBuilder(SPEC) - .ofEqualityDeletes() - .withPath("/path/to/data-a-eq-deletes.parquet") - .withFileSizeInBytes(10) - .withPartition(FILE_A.partition()) - .withRecordCount(1) - .build(); - - private void verifyManifestSequenceNumber(ManifestFile mf, long sequenceNum, long minSequenceNum) { - Assert.assertEquals("sequence number should be " + sequenceNum, - mf.sequenceNumber(), sequenceNum); - Assert.assertEquals("min sequence number should be " + minSequenceNum, - mf.minSequenceNumber(), minSequenceNum); + static final DeleteFile FILE_A_POS_1 = + FileMetadata.deleteFileBuilder(SPEC) + .ofPositionDeletes() + .withPath("/path/to/data-a-pos-deletes.parquet") + .withFileSizeInBytes(10) + .withPartition(FILE_A.partition()) + .withRecordCount(1) + .build(); + + static final DeleteFile FILE_A_EQ_1 = + FileMetadata.deleteFileBuilder(SPEC) + .ofEqualityDeletes() + .withPath("/path/to/data-a-eq-deletes.parquet") + .withFileSizeInBytes(10) + .withPartition(FILE_A.partition()) + .withRecordCount(1) + .build(); + + private void verifyManifestSequenceNumber( + ManifestFile mf, long sequenceNum, long minSequenceNum) { + Assert.assertEquals( + "sequence number should be " + sequenceNum, mf.sequenceNumber(), sequenceNum); + Assert.assertEquals( + "min sequence number should be " + minSequenceNum, mf.minSequenceNumber(), minSequenceNum); } @Test public void testPartitionedTableWithPartitionEqDeletes() { - table.newAppend() - .appendFile(FILE_A) - .appendFile(FILE_B) - .appendFile(FILE_C) - .commit(); + table.newAppend().appendFile(FILE_A).appendFile(FILE_B).appendFile(FILE_C).commit(); List dataManifests = table.currentSnapshot().dataManifests(table.io()); List deleteManifests = table.currentSnapshot().deleteManifests(table.io()); - Assert.assertEquals("Should have one data manifest file", - 1, dataManifests.size()); - Assert.assertEquals("Should have zero delete manifest file", - 0, deleteManifests.size()); + Assert.assertEquals("Should have one data manifest file", 1, dataManifests.size()); + Assert.assertEquals("Should have zero 
delete manifest file", 0, deleteManifests.size()); ManifestFile dataManifest = dataManifests.get(0); verifyManifestSequenceNumber(dataManifest, 0, 0); @@ -81,166 +77,152 @@ public void testPartitionedTableWithPartitionEqDeletes() { TableMetadata base = ops.current(); ops.commit(base, base.upgradeToFormatVersion(2)); - table.newRowDelta() - .addDeletes(FILE_A_EQ_1) - .commit(); + table.newRowDelta().addDeletes(FILE_A_EQ_1).commit(); dataManifests = table.currentSnapshot().dataManifests(ops.io()); deleteManifests = table.currentSnapshot().deleteManifests(ops.io()); - Assert.assertEquals("Should have one data manifest file", - 1, dataManifests.size()); - Assert.assertEquals("Should have one delete manifest file", - 1, deleteManifests.size()); + Assert.assertEquals("Should have one data manifest file", 1, dataManifests.size()); + Assert.assertEquals("Should have one delete manifest file", 1, deleteManifests.size()); Assert.assertEquals(dataManifest, dataManifests.get(0)); // data manifest not changed ManifestFile deleteManifest = deleteManifests.get(0); verifyManifestSequenceNumber(deleteManifest, 1, 1); List tasks = Lists.newArrayList(table.newScan().planFiles().iterator()); Assert.assertEquals("Should have three task", 3, tasks.size()); - Optional task = tasks.stream().filter(t -> t.file().path().equals(FILE_A.path())).findFirst(); + Optional task = + tasks.stream().filter(t -> t.file().path().equals(FILE_A.path())).findFirst(); Assert.assertTrue(task.isPresent()); - Assert.assertEquals("Should have one associated delete file", - 1, task.get().deletes().size()); - Assert.assertEquals("Should have only pos delete file", - FILE_A_EQ_1.path(), task.get().deletes().get(0).path()); + Assert.assertEquals("Should have one associated delete file", 1, task.get().deletes().size()); + Assert.assertEquals( + "Should have only pos delete file", FILE_A_EQ_1.path(), task.get().deletes().get(0).path()); // first commit after row-delta changes table.newDelete().deleteFile(FILE_B).commit(); dataManifests = table.currentSnapshot().dataManifests(ops.io()); deleteManifests = table.currentSnapshot().deleteManifests(ops.io()); - Assert.assertEquals("Should have one data manifest file", - 1, dataManifests.size()); - Assert.assertEquals("Should have one delete manifest file", - 1, deleteManifests.size()); + Assert.assertEquals("Should have one data manifest file", 1, dataManifests.size()); + Assert.assertEquals("Should have one delete manifest file", 1, deleteManifests.size()); ManifestFile dataManifest2 = dataManifests.get(0); verifyManifestSequenceNumber(dataManifest2, 2, 0); Assert.assertNotEquals(dataManifest, dataManifest2); - Assert.assertEquals(deleteManifest, deleteManifests.get(0)); // delete manifest not changed + Assert.assertEquals(deleteManifest, deleteManifests.get(0)); // delete manifest not changed tasks = Lists.newArrayList(table.newScan().planFiles().iterator()); Assert.assertEquals("Should have two task", 2, tasks.size()); task = tasks.stream().filter(t -> t.file().path().equals(FILE_A.path())).findFirst(); Assert.assertTrue(task.isPresent()); - Assert.assertEquals("Should have one associated delete file", - 1, task.get().deletes().size()); + Assert.assertEquals("Should have one associated delete file", 1, task.get().deletes().size()); // second commit after row-delta changes table.newDelete().deleteFile(FILE_C).commit(); dataManifests = table.currentSnapshot().dataManifests(ops.io()); deleteManifests = table.currentSnapshot().deleteManifests(ops.io()); - Assert.assertEquals("Should have one data 
manifest file", - 1, dataManifests.size()); - Assert.assertEquals("Should have one delete manifest file", - 1, deleteManifests.size()); + Assert.assertEquals("Should have one data manifest file", 1, dataManifests.size()); + Assert.assertEquals("Should have one delete manifest file", 1, deleteManifests.size()); ManifestFile dataManifest3 = dataManifests.get(0); verifyManifestSequenceNumber(dataManifest3, 3, 0); Assert.assertNotEquals(dataManifest2, dataManifest3); - Assert.assertEquals(deleteManifest, deleteManifests.get(0)); // delete manifest not changed + Assert.assertEquals(deleteManifest, deleteManifests.get(0)); // delete manifest not changed tasks = Lists.newArrayList(table.newScan().planFiles().iterator()); Assert.assertEquals("Should have one task", 1, tasks.size()); task = tasks.stream().filter(t -> t.file().path().equals(FILE_A.path())).findFirst(); Assert.assertTrue(task.isPresent()); - Assert.assertEquals("Should have one associated delete file", - 1, task.get().deletes().size()); + Assert.assertEquals("Should have one associated delete file", 1, task.get().deletes().size()); } @Test public void testPartitionedTableWithUnrelatedPartitionDeletes() { - table.newAppend() - .appendFile(FILE_B) - .appendFile(FILE_C) - .appendFile(FILE_D) - .commit(); + table.newAppend().appendFile(FILE_B).appendFile(FILE_C).appendFile(FILE_D).commit(); // update table version to 2 TableOperations ops = ((BaseTable) table).operations(); TableMetadata base = ops.current(); ops.commit(base, base.upgradeToFormatVersion(2)); - table.newRowDelta() - .addDeletes(FILE_A_POS_1) - .addDeletes(FILE_A_EQ_1) - .commit(); + table.newRowDelta().addDeletes(FILE_A_POS_1).addDeletes(FILE_A_EQ_1).commit(); List tasks = Lists.newArrayList(table.newScan().planFiles().iterator()); Assert.assertEquals("Should have three task", 3, tasks.size()); - Assert.assertEquals("Should have the correct data file path", - FILE_B.path(), tasks.get(0).file().path()); - Assert.assertEquals("Should have zero associated delete file", - 0, tasks.get(0).deletes().size()); + Assert.assertEquals( + "Should have the correct data file path", FILE_B.path(), tasks.get(0).file().path()); + Assert.assertEquals( + "Should have zero associated delete file", 0, tasks.get(0).deletes().size()); table.newDelete().deleteFile(FILE_B).commit(); tasks = Lists.newArrayList(table.newScan().planFiles().iterator()); Assert.assertEquals("Should have two task", 2, tasks.size()); - Assert.assertEquals("Should have zero associated delete file", - 0, tasks.get(0).deletes().size()); + Assert.assertEquals( + "Should have zero associated delete file", 0, tasks.get(0).deletes().size()); table.newDelete().deleteFile(FILE_C).commit(); tasks = Lists.newArrayList(table.newScan().planFiles().iterator()); Assert.assertEquals("Should have one task", 1, tasks.size()); - Assert.assertEquals("Should have zero associated delete file", - 0, tasks.get(0).deletes().size()); + Assert.assertEquals( + "Should have zero associated delete file", 0, tasks.get(0).deletes().size()); } - @Test public void testPartitionedTableWithExistingDeleteFile() { - table.updateProperties() - .set(TableProperties.MANIFEST_MERGE_ENABLED, "false") - .commit(); + table.updateProperties().set(TableProperties.MANIFEST_MERGE_ENABLED, "false").commit(); - table.newAppend() - .appendFile(FILE_A) - .commit(); + table.newAppend().appendFile(FILE_A).commit(); // update table version to 2 TableOperations ops = ((BaseTable) table).operations(); TableMetadata base = ops.current(); ops.commit(base, 
base.upgradeToFormatVersion(2)); - table.newRowDelta() - .addDeletes(FILE_A_EQ_1) - .commit(); + table.newRowDelta().addDeletes(FILE_A_EQ_1).commit(); - table.newRowDelta() - .addDeletes(FILE_A_POS_1) - .commit(); + table.newRowDelta().addDeletes(FILE_A_POS_1).commit(); - table.updateProperties() + table + .updateProperties() .set(TableProperties.MANIFEST_MIN_MERGE_COUNT, "1") .set(TableProperties.MANIFEST_MERGE_ENABLED, "true") .commit(); - Assert.assertEquals("Should have two delete manifests", - 2, table.currentSnapshot().deleteManifests(table.io()).size()); + Assert.assertEquals( + "Should have two delete manifests", + 2, + table.currentSnapshot().deleteManifests(table.io()).size()); // merge delete manifests - table.newAppend() - .appendFile(FILE_B) - .commit(); - - Assert.assertEquals("Should have one delete manifest", - 1, table.currentSnapshot().deleteManifests(table.io()).size()); - Assert.assertEquals("Should have zero added delete file", - 0, table.currentSnapshot().deleteManifests(table.io()).get(0).addedFilesCount().intValue()); - Assert.assertEquals("Should have zero deleted delete file", - 0, table.currentSnapshot().deleteManifests(table.io()).get(0).deletedFilesCount().intValue()); - Assert.assertEquals("Should have two existing delete files", - 2, table.currentSnapshot().deleteManifests(table.io()).get(0).existingFilesCount().intValue()); + table.newAppend().appendFile(FILE_B).commit(); + + Assert.assertEquals( + "Should have one delete manifest", + 1, + table.currentSnapshot().deleteManifests(table.io()).size()); + Assert.assertEquals( + "Should have zero added delete file", + 0, + table.currentSnapshot().deleteManifests(table.io()).get(0).addedFilesCount().intValue()); + Assert.assertEquals( + "Should have zero deleted delete file", + 0, + table.currentSnapshot().deleteManifests(table.io()).get(0).deletedFilesCount().intValue()); + Assert.assertEquals( + "Should have two existing delete files", + 2, + table.currentSnapshot().deleteManifests(table.io()).get(0).existingFilesCount().intValue()); List tasks = - Lists.newArrayList(table.newScan().filter(equal(bucket("data", BUCKETS_NUMBER), 0)) - .planFiles().iterator()); + Lists.newArrayList( + table + .newScan() + .filter(equal(bucket("data", BUCKETS_NUMBER), 0)) + .planFiles() + .iterator()); Assert.assertEquals("Should have one task", 1, tasks.size()); FileScanTask task = tasks.get(0); - Assert.assertEquals("Should have the correct data file path", - FILE_A.path(), task.file().path()); - Assert.assertEquals("Should have two associated delete files", - 2, task.deletes().size()); - Assert.assertEquals("Should have expected delete files", + Assert.assertEquals( + "Should have the correct data file path", FILE_A.path(), task.file().path()); + Assert.assertEquals("Should have two associated delete files", 2, task.deletes().size()); + Assert.assertEquals( + "Should have expected delete files", Sets.newHashSet(FILE_A_EQ_1.path(), FILE_A_POS_1.path()), Sets.newHashSet(Iterables.transform(task.deletes(), ContentFile::path))); } - } diff --git a/core/src/test/java/org/apache/iceberg/TestWapWorkflow.java b/core/src/test/java/org/apache/iceberg/TestWapWorkflow.java index 3d78fc0faf0f..978027b81434 100644 --- a/core/src/test/java/org/apache/iceberg/TestWapWorkflow.java +++ b/core/src/test/java/org/apache/iceberg/TestWapWorkflow.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg; import org.apache.iceberg.exceptions.CherrypickAncestorCommitException; @@ -34,7 +33,7 @@ public class TestWapWorkflow extends TableTestBase { @Parameterized.Parameters(name = "formatVersion = {0}") public static Object[] parameters() { - return new Object[] { 1, 2 }; + return new Object[] {1, 2}; } public TestWapWorkflow(int formatVersion) { @@ -48,23 +47,18 @@ public void setupTableProperties() { @Test public void testCherryPickOverwrite() { - table.newAppend() - .appendFile(FILE_A) - .commit(); + table.newAppend().appendFile(FILE_A).commit(); - table.newOverwrite() - .deleteFile(FILE_A) - .addFile(FILE_B) - .stageOnly() - .commit(); + table.newOverwrite().deleteFile(FILE_A).addFile(FILE_B).stageOnly().commit(); // the overwrite should only be staged validateTableFiles(table, FILE_A); - Snapshot overwrite = Streams.stream(table.snapshots()) - .filter(snap -> DataOperations.OVERWRITE.equals(snap.operation())) - .findFirst() - .get(); + Snapshot overwrite = + Streams.stream(table.snapshots()) + .filter(snap -> DataOperations.OVERWRITE.equals(snap.operation())) + .findFirst() + .get(); // cherry-pick the overwrite; this works because it is a fast-forward commit table.manageSnapshots().cherrypick(overwrite.snapshotId()).commit(); @@ -75,32 +69,27 @@ public void testCherryPickOverwrite() { @Test public void testCherryPickOverwriteFailsIfCurrentHasChanged() { - table.newAppend() - .appendFile(FILE_A) - .commit(); + table.newAppend().appendFile(FILE_A).commit(); - table.newOverwrite() - .deleteFile(FILE_A) - .addFile(FILE_B) - .stageOnly() - .commit(); + table.newOverwrite().deleteFile(FILE_A).addFile(FILE_B).stageOnly().commit(); // add a commit after the overwrite that prevents it from being a fast-forward operation - table.newAppend() - .appendFile(FILE_C) - .commit(); + table.newAppend().appendFile(FILE_C).commit(); // the overwrite should only be staged validateTableFiles(table, FILE_A, FILE_C); - Snapshot overwrite = Streams.stream(table.snapshots()) - .filter(snap -> DataOperations.OVERWRITE.equals(snap.operation())) - .findFirst() - .get(); + Snapshot overwrite = + Streams.stream(table.snapshots()) + .filter(snap -> DataOperations.OVERWRITE.equals(snap.operation())) + .findFirst() + .get(); // try to cherry-pick, which should fail because the overwrite's parent is no longer current - AssertHelpers.assertThrows("Should reject overwrite that is not a fast-forward commit", - ValidationException.class, "not append, dynamic overwrite, or fast-forward", + AssertHelpers.assertThrows( + "Should reject overwrite that is not a fast-forward commit", + ValidationException.class, + "not append, dynamic overwrite, or fast-forward", () -> table.manageSnapshots().cherrypick(overwrite.snapshotId()).commit()); // the table state should not have changed @@ -110,101 +99,103 @@ public void testCherryPickOverwriteFailsIfCurrentHasChanged() { @Test public void testCurrentSnapshotOperation() { - table.newAppend() - .appendFile(FILE_A) - .commit(); + table.newAppend().appendFile(FILE_A).commit(); TableMetadata base = readMetadata(); long firstSnapshotId = base.currentSnapshot().snapshotId(); - table.newAppend() - .appendFile(FILE_B) - .set("wap.id", "123456789") - .stageOnly() - .commit(); + table.newAppend().appendFile(FILE_B).set("wap.id", "123456789").stageOnly().commit(); base = readMetadata(); Snapshot wapSnapshot = base.snapshots().get(1); Assert.assertEquals("Metadata should have both snapshots", 2, base.snapshots().size()); - Assert.assertEquals("Snapshot should have wap 
id in summary", "123456789", - wapSnapshot.summary().get("wap.id")); - Assert.assertEquals("Current snapshot should be first commit's snapshot", - firstSnapshotId, base.currentSnapshot().snapshotId()); - Assert.assertEquals("Snapshot log should indicate number of snapshots committed", 1, - base.snapshotLog().size()); + Assert.assertEquals( + "Snapshot should have wap id in summary", "123456789", wapSnapshot.summary().get("wap.id")); + Assert.assertEquals( + "Current snapshot should be first commit's snapshot", + firstSnapshotId, + base.currentSnapshot().snapshotId()); + Assert.assertEquals( + "Snapshot log should indicate number of snapshots committed", 1, base.snapshotLog().size()); // do setCurrentSnapshot table.manageSnapshots().setCurrentSnapshot(wapSnapshot.snapshotId()).commit(); base = readMetadata(); - Assert.assertEquals("Current snapshot should be what we rolled back to", - wapSnapshot.snapshotId(), base.currentSnapshot().snapshotId()); + Assert.assertEquals( + "Current snapshot should be what we rolled back to", + wapSnapshot.snapshotId(), + base.currentSnapshot().snapshotId()); Assert.assertEquals("Metadata should have both snapshots", 2, base.snapshots().size()); - Assert.assertEquals("Should contain manifests for both files", - 2, base.currentSnapshot().allManifests(table.io()).size()); - Assert.assertEquals("Should contain append from last commit", 1, + Assert.assertEquals( + "Should contain manifests for both files", + 2, + base.currentSnapshot().allManifests(table.io()).size()); + Assert.assertEquals( + "Should contain append from last commit", + 1, Iterables.size(base.currentSnapshot().addedDataFiles(table.io()))); - Assert.assertEquals("Snapshot log should indicate number of snapshots committed", 2, - base.snapshotLog().size()); + Assert.assertEquals( + "Snapshot log should indicate number of snapshots committed", 2, base.snapshotLog().size()); } @Test public void testSetCurrentSnapshotNoWAP() { - table.newAppend() - .appendFile(FILE_A) - .commit(); + table.newAppend().appendFile(FILE_A).commit(); TableMetadata base = readMetadata(); Snapshot firstSnapshot = base.currentSnapshot(); long firstSnapshotId = firstSnapshot.snapshotId(); - table.newAppend() - .appendFile(FILE_B) - .commit(); + table.newAppend().appendFile(FILE_B).commit(); // do setCurrentSnapshot table.manageSnapshots().setCurrentSnapshot(firstSnapshotId).commit(); base = readMetadata(); - Assert.assertEquals("Current snapshot should be what we rolled back to", - firstSnapshotId, base.currentSnapshot().snapshotId()); + Assert.assertEquals( + "Current snapshot should be what we rolled back to", + firstSnapshotId, + base.currentSnapshot().snapshotId()); Assert.assertEquals("Metadata should have both snapshots", 2, base.snapshots().size()); - Assert.assertEquals("Should contain manifests for both files", - 1, base.currentSnapshot().allManifests(table.io()).size()); - Assert.assertEquals("Should contain append from last commit", 1, + Assert.assertEquals( + "Should contain manifests for both files", + 1, + base.currentSnapshot().allManifests(table.io()).size()); + Assert.assertEquals( + "Should contain append from last commit", + 1, Iterables.size(base.currentSnapshot().addedDataFiles(table.io()))); - Assert.assertEquals("Snapshot log should indicate number of snapshots committed", 3, - base.snapshotLog().size()); + Assert.assertEquals( + "Snapshot log should indicate number of snapshots committed", 3, base.snapshotLog().size()); } @Test public void testRollbackOnInvalidNonAncestor() { - table.newAppend() - 
.appendFile(FILE_A) - .commit(); + table.newAppend().appendFile(FILE_A).commit(); TableMetadata base = readMetadata(); long firstSnapshotId = base.currentSnapshot().snapshotId(); - table.newAppend() - .appendFile(FILE_B) - .set("wap.id", "123456789") - .stageOnly() - .commit(); + table.newAppend().appendFile(FILE_B).set("wap.id", "123456789").stageOnly().commit(); base = readMetadata(); Snapshot wapSnapshot = base.snapshots().get(1); Assert.assertEquals("Metadata should have both snapshots", 2, base.snapshots().size()); - Assert.assertEquals("Snapshot should have wap id in summary", "123456789", - wapSnapshot.summary().get("wap.id")); - Assert.assertEquals("Current snapshot should be first commit's snapshot", - firstSnapshotId, base.currentSnapshot().snapshotId()); - Assert.assertEquals("Snapshot log should indicate number of snapshots committed", 1, - base.snapshotLog().size()); + Assert.assertEquals( + "Snapshot should have wap id in summary", "123456789", wapSnapshot.summary().get("wap.id")); + Assert.assertEquals( + "Current snapshot should be first commit's snapshot", + firstSnapshotId, + base.currentSnapshot().snapshotId()); + Assert.assertEquals( + "Snapshot log should indicate number of snapshots committed", 1, base.snapshotLog().size()); // do rollback - AssertHelpers.assertThrows("should fail on invalid snapshot", ValidationException.class, + AssertHelpers.assertThrows( + "should fail on invalid snapshot", + ValidationException.class, "Cannot roll back to snapshot, not an ancestor of the current state: 2", () -> { // rollback to snapshot that is not an ancestor @@ -212,120 +203,113 @@ public void testRollbackOnInvalidNonAncestor() { }); base = readMetadata(); - Assert.assertEquals("Current snapshot should be what we rolled back to", firstSnapshotId, + Assert.assertEquals( + "Current snapshot should be what we rolled back to", + firstSnapshotId, base.currentSnapshot().snapshotId()); Assert.assertEquals("Metadata should have both snapshots", 2, base.snapshots().size()); - Assert.assertEquals("Should contain manifests for one snapshot", 1, + Assert.assertEquals( + "Should contain manifests for one snapshot", + 1, base.currentSnapshot().allManifests(table.io()).size()); - Assert.assertEquals("Should contain append from last commit", 1, + Assert.assertEquals( + "Should contain append from last commit", + 1, Iterables.size(base.currentSnapshot().addedDataFiles(table.io()))); - Assert.assertEquals("Snapshot log should indicate number of snapshots committed", 1, - base.snapshotLog().size()); + Assert.assertEquals( + "Snapshot log should indicate number of snapshots committed", 1, base.snapshotLog().size()); } @Test public void testRollbackAndCherrypick() { // first snapshot - table.newAppend() - .appendFile(FILE_A) - .commit(); + table.newAppend().appendFile(FILE_A).commit(); TableMetadata base = readMetadata(); Snapshot firstSnapshot = base.currentSnapshot(); long firstSnapshotId = firstSnapshot.snapshotId(); // second snapshot - table.newAppend() - .appendFile(FILE_B) - .commit(); + table.newAppend().appendFile(FILE_B).commit(); base = readMetadata(); Snapshot secondSnapshot = base.currentSnapshot(); // third snapshot - table.newAppend() - .appendFile(FILE_C) - .commit(); + table.newAppend().appendFile(FILE_C).commit(); base = readMetadata(); Snapshot thirdSnapshot = base.currentSnapshot(); // rollback to first snapshot table.manageSnapshots().rollbackTo(firstSnapshotId).commit(); base = readMetadata(); - Assert.assertEquals("Should be at first snapshot", firstSnapshotId, 
base.currentSnapshot().snapshotId()); - Assert.assertEquals("Should have all three snapshots in the system", 3, base.snapshots().size()); + Assert.assertEquals( + "Should be at first snapshot", firstSnapshotId, base.currentSnapshot().snapshotId()); + Assert.assertEquals( + "Should have all three snapshots in the system", 3, base.snapshots().size()); // fast forward to third snapshot table.manageSnapshots().cherrypick(thirdSnapshot.snapshotId()).commit(); base = readMetadata(); - Assert.assertEquals("Current state should be at third snapshot", 4, - base.currentSnapshot().snapshotId()); + Assert.assertEquals( + "Current state should be at third snapshot", 4, base.currentSnapshot().snapshotId()); // fast forward to 2nd snapshot table.manageSnapshots().cherrypick(secondSnapshot.snapshotId()).commit(); base = readMetadata(); - Assert.assertEquals("Current state should be at second snapshot", 5, - base.currentSnapshot().snapshotId()); - Assert.assertEquals("Count all snapshots", 5, - base.snapshots().size()); + Assert.assertEquals( + "Current state should be at second snapshot", 5, base.currentSnapshot().snapshotId()); + Assert.assertEquals("Count all snapshots", 5, base.snapshots().size()); } @Test public void testRollbackToTime() { // first snapshot - table.newAppend() - .appendFile(FILE_A) - .commit(); + table.newAppend().appendFile(FILE_A).commit(); TableMetadata base = readMetadata(); Snapshot firstSnapshot = base.currentSnapshot(); long firstSnapshotId = firstSnapshot.snapshotId(); // second snapshot - table.newAppend() - .appendFile(FILE_B) - .commit(); + table.newAppend().appendFile(FILE_B).commit(); base = readMetadata(); Snapshot secondSnapshot = base.currentSnapshot(); // third snapshot - table.newAppend() - .appendFile(FILE_C) - .commit(); + table.newAppend().appendFile(FILE_C).commit(); // rollback to before the second snapshot's time table.manageSnapshots().rollbackToTime(secondSnapshot.timestampMillis()).commit(); base = readMetadata(); - Assert.assertEquals("Should be at first snapshot", firstSnapshotId, base.currentSnapshot().snapshotId()); - Assert.assertEquals("Should have all three snapshots in the system", 3, base.snapshots().size()); + Assert.assertEquals( + "Should be at first snapshot", firstSnapshotId, base.currentSnapshot().snapshotId()); + Assert.assertEquals( + "Should have all three snapshots in the system", 3, base.snapshots().size()); } @Test public void testWithCherryPicking() { - table.newAppend() - .appendFile(FILE_A) - .commit(); + table.newAppend().appendFile(FILE_A).commit(); TableMetadata base = readMetadata(); long firstSnapshotId = base.currentSnapshot().snapshotId(); // first WAP commit - table.newAppend() - .appendFile(FILE_B) - .set("wap.id", "123456789") - .stageOnly() - .commit(); + table.newAppend().appendFile(FILE_B).set("wap.id", "123456789").stageOnly().commit(); base = readMetadata(); // pick the snapshot that's staged but not committed Snapshot wapSnapshot = base.snapshots().get(1); Assert.assertEquals("Should have both snapshots", 2, base.snapshots().size()); - Assert.assertEquals("Should have first wap id in summary", "123456789", - wapSnapshot.summary().get("wap.id")); - Assert.assertEquals("Current snapshot should be first commit's snapshot", - firstSnapshotId, base.currentSnapshot().snapshotId()); - Assert.assertEquals("Snapshot log should indicate number of snapshots committed", 1, - base.snapshotLog().size()); + Assert.assertEquals( + "Should have first wap id in summary", "123456789", wapSnapshot.summary().get("wap.id")); + 
Assert.assertEquals( + "Current snapshot should be first commit's snapshot", + firstSnapshotId, + base.currentSnapshot().snapshotId()); + Assert.assertEquals( + "Snapshot log should indicate number of snapshots committed", 1, base.snapshotLog().size()); // cherry-pick snapshot table.manageSnapshots().cherrypick(wapSnapshot.snapshotId()).commit(); @@ -333,41 +317,37 @@ public void testWithCherryPicking() { // check if the effective current snapshot is set to the new snapshot created // as a result of the cherry-pick operation - Assert.assertEquals("Current snapshot should be fast-forwarded to wap snapshot", - wapSnapshot.snapshotId(), base.currentSnapshot().snapshotId()); + Assert.assertEquals( + "Current snapshot should be fast-forwarded to wap snapshot", + wapSnapshot.snapshotId(), + base.currentSnapshot().snapshotId()); Assert.assertEquals("Should have two snapshots", 2, base.snapshots().size()); - Assert.assertEquals("Should contain manifests for both files", - 2, base.currentSnapshot().allManifests(table.io()).size()); - Assert.assertEquals("Should contain append from last commit", 1, + Assert.assertEquals( + "Should contain manifests for both files", + 2, + base.currentSnapshot().allManifests(table.io()).size()); + Assert.assertEquals( + "Should contain append from last commit", + 1, Iterables.size(base.currentSnapshot().addedDataFiles(table.io()))); - Assert.assertEquals("Snapshot log should indicate number of snapshots committed", 2, - base.snapshotLog().size()); + Assert.assertEquals( + "Snapshot log should indicate number of snapshots committed", 2, base.snapshotLog().size()); } @Test public void testWithTwoPhaseCherryPicking() { - table.newAppend() - .appendFile(FILE_A) - .commit(); + table.newAppend().appendFile(FILE_A).commit(); TableMetadata base = readMetadata(); // load current snapshot Snapshot parentSnapshot = base.currentSnapshot(); long firstSnapshotId = parentSnapshot.snapshotId(); // first WAP commit - table.newAppend() - .appendFile(FILE_B) - .set("wap.id", "123456789") - .stageOnly() - .commit(); + table.newAppend().appendFile(FILE_B).set("wap.id", "123456789").stageOnly().commit(); // second WAP commit - table.newAppend() - .appendFile(FILE_C) - .set("wap.id", "987654321") - .stageOnly() - .commit(); + table.newAppend().appendFile(FILE_C).set("wap.id", "987654321").stageOnly().commit(); base = readMetadata(); // pick the snapshot that's staged but not committed @@ -375,18 +355,24 @@ public void testWithTwoPhaseCherryPicking() { Snapshot wap2Snapshot = base.snapshots().get(2); Assert.assertEquals("Should have three snapshots", 3, base.snapshots().size()); - Assert.assertEquals("Should have first wap id in summary", "123456789", - wap1Snapshot.summary().get("wap.id")); - Assert.assertEquals("Should have second wap id in summary", "987654321", - wap2Snapshot.summary().get("wap.id")); - Assert.assertEquals("Current snapshot should be first commit's snapshot", - firstSnapshotId, base.currentSnapshot().snapshotId()); - Assert.assertEquals("Parent snapshot id should be same for first WAP snapshot", - firstSnapshotId, wap1Snapshot.parentId().longValue()); - Assert.assertEquals("Parent snapshot id should be same for second WAP snapshot", - firstSnapshotId, wap2Snapshot.parentId().longValue()); - Assert.assertEquals("Snapshot log should indicate number of snapshots committed", 1, - base.snapshotLog().size()); + Assert.assertEquals( + "Should have first wap id in summary", "123456789", wap1Snapshot.summary().get("wap.id")); + Assert.assertEquals( + "Should have second wap id 
in summary", "987654321", wap2Snapshot.summary().get("wap.id")); + Assert.assertEquals( + "Current snapshot should be first commit's snapshot", + firstSnapshotId, + base.currentSnapshot().snapshotId()); + Assert.assertEquals( + "Parent snapshot id should be same for first WAP snapshot", + firstSnapshotId, + wap1Snapshot.parentId().longValue()); + Assert.assertEquals( + "Parent snapshot id should be same for second WAP snapshot", + firstSnapshotId, + wap2Snapshot.parentId().longValue()); + Assert.assertEquals( + "Snapshot log should indicate number of snapshots committed", 1, base.snapshotLog().size()); // load current snapshot parentSnapshot = base.currentSnapshot(); @@ -400,14 +386,20 @@ public void testWithTwoPhaseCherryPicking() { "Current snapshot should be set to one after wap snapshot", parentSnapshot.snapshotId() + 1, base.currentSnapshot().snapshotId()); - Assert.assertEquals("Should contain manifests for both files", 2, + Assert.assertEquals( + "Should contain manifests for both files", + 2, base.currentSnapshot().allManifests(table.io()).size()); - Assert.assertEquals("Should contain append from last commit", 1, + Assert.assertEquals( + "Should contain append from last commit", + 1, Iterables.size(base.currentSnapshot().addedDataFiles(table.io()))); - Assert.assertEquals("Parent snapshot id should change to latest snapshot before commit", - parentSnapshot.snapshotId(), base.currentSnapshot().parentId().longValue()); - Assert.assertEquals("Snapshot log should indicate number of snapshots committed", 2, - base.snapshotLog().size()); + Assert.assertEquals( + "Parent snapshot id should change to latest snapshot before commit", + parentSnapshot.snapshotId(), + base.currentSnapshot().parentId().longValue()); + Assert.assertEquals( + "Snapshot log should indicate number of snapshots committed", 2, base.snapshotLog().size()); // load current snapshot parentSnapshot = base.currentSnapshot(); @@ -417,42 +409,39 @@ public void testWithTwoPhaseCherryPicking() { // check if the effective current snapshot is set to the new snapshot created // as a result of the cherry-pick operation - Assert.assertEquals("Current snapshot should be set to one after wap snapshot", + Assert.assertEquals( + "Current snapshot should be set to one after wap snapshot", parentSnapshot.snapshotId() + 1 /* one fast-forwarded snapshot */ + 1, base.currentSnapshot().snapshotId()); - Assert.assertEquals("Should contain manifests for both files", - 3, base.currentSnapshot().allManifests(table.io()).size()); - Assert.assertEquals("Should contain append from last commit", 1, + Assert.assertEquals( + "Should contain manifests for both files", + 3, + base.currentSnapshot().allManifests(table.io()).size()); + Assert.assertEquals( + "Should contain append from last commit", + 1, Iterables.size(base.currentSnapshot().addedDataFiles(table.io()))); - Assert.assertEquals("Parent snapshot id should change to latest snapshot before commit", - parentSnapshot.snapshotId(), base.currentSnapshot().parentId().longValue()); - Assert.assertEquals("Snapshot log should indicate number of snapshots committed", 3, - base.snapshotLog().size()); + Assert.assertEquals( + "Parent snapshot id should change to latest snapshot before commit", + parentSnapshot.snapshotId(), + base.currentSnapshot().parentId().longValue()); + Assert.assertEquals( + "Snapshot log should indicate number of snapshots committed", 3, base.snapshotLog().size()); } @Test public void testWithCommitsBetweenCherryPicking() { - table.newAppend() - .appendFile(FILE_A) - .commit(); + 
table.newAppend().appendFile(FILE_A).commit(); TableMetadata base = readMetadata(); // load current snapshot Snapshot parentSnapshot = base.currentSnapshot(); long firstSnapshotId = parentSnapshot.snapshotId(); // first WAP commit - table.newAppend() - .appendFile(FILE_B) - .set("wap.id", "123456789") - .stageOnly() - .commit(); + table.newAppend().appendFile(FILE_B).set("wap.id", "123456789").stageOnly().commit(); // second WAP commit - table.newAppend() - .appendFile(FILE_C) - .set("wap.id", "987654321") - .stageOnly() - .commit(); + table.newAppend().appendFile(FILE_C).set("wap.id", "987654321").stageOnly().commit(); base = readMetadata(); // pick the snapshot that's staged but not committed @@ -460,35 +449,43 @@ public void testWithCommitsBetweenCherryPicking() { Snapshot wap2Snapshot = base.snapshots().get(2); Assert.assertEquals("Should have three snapshots", 3, base.snapshots().size()); - Assert.assertEquals("Should have first wap id in summary", "123456789", - wap1Snapshot.summary().get("wap.id")); - Assert.assertEquals("Should have second wap id in summary", "987654321", - wap2Snapshot.summary().get("wap.id")); - Assert.assertEquals("Current snapshot should be first commit's snapshot", - firstSnapshotId, base.currentSnapshot().snapshotId()); - Assert.assertEquals("Parent snapshot id should be same for first WAP snapshot", - firstSnapshotId, wap1Snapshot.parentId().longValue()); - Assert.assertEquals("Parent snapshot id should be same for second WAP snapshot", - firstSnapshotId, wap2Snapshot.parentId().longValue()); - Assert.assertEquals("Snapshot log should indicate number of snapshots committed", 1, - base.snapshotLog().size()); + Assert.assertEquals( + "Should have first wap id in summary", "123456789", wap1Snapshot.summary().get("wap.id")); + Assert.assertEquals( + "Should have second wap id in summary", "987654321", wap2Snapshot.summary().get("wap.id")); + Assert.assertEquals( + "Current snapshot should be first commit's snapshot", + firstSnapshotId, + base.currentSnapshot().snapshotId()); + Assert.assertEquals( + "Parent snapshot id should be same for first WAP snapshot", + firstSnapshotId, + wap1Snapshot.parentId().longValue()); + Assert.assertEquals( + "Parent snapshot id should be same for second WAP snapshot", + firstSnapshotId, + wap2Snapshot.parentId().longValue()); + Assert.assertEquals( + "Snapshot log should indicate number of snapshots committed", 1, base.snapshotLog().size()); // load current snapshot parentSnapshot = base.currentSnapshot(); // table has new commit - table.newAppend() - .appendFile(FILE_D) - .commit(); + table.newAppend().appendFile(FILE_D).commit(); base = readMetadata(); Assert.assertEquals("Should have four snapshots", 4, base.snapshots().size()); - Assert.assertEquals("Current snapshot should carry over the parent snapshot", - parentSnapshot.snapshotId(), base.currentSnapshot().parentId().longValue()); - Assert.assertEquals("Should contain manifests for two files", 2, + Assert.assertEquals( + "Current snapshot should carry over the parent snapshot", + parentSnapshot.snapshotId(), + base.currentSnapshot().parentId().longValue()); + Assert.assertEquals( + "Should contain manifests for two files", + 2, base.currentSnapshot().allManifests(table.io()).size()); - Assert.assertEquals("Snapshot log should indicate number of snapshots committed", 2, - base.snapshotLog().size()); + Assert.assertEquals( + "Snapshot log should indicate number of snapshots committed", 2, base.snapshotLog().size()); // load current snapshot parentSnapshot = 
base.currentSnapshot(); @@ -499,16 +496,24 @@ public void testWithCommitsBetweenCherryPicking() { // check if the effective current snapshot is set to the new snapshot created // as a result of the cherry-pick operation Assert.assertEquals("Should have five snapshots", 5, base.snapshots().size()); - Assert.assertEquals("Current snapshot should be set to one after wap snapshot", - parentSnapshot.snapshotId() + 1, base.currentSnapshot().snapshotId()); - Assert.assertEquals("Should contain manifests for three files", 3, + Assert.assertEquals( + "Current snapshot should be set to one after wap snapshot", + parentSnapshot.snapshotId() + 1, + base.currentSnapshot().snapshotId()); + Assert.assertEquals( + "Should contain manifests for three files", + 3, base.currentSnapshot().allManifests(table.io()).size()); - Assert.assertEquals("Should contain append from last commit", 1, + Assert.assertEquals( + "Should contain append from last commit", + 1, Iterables.size(base.currentSnapshot().addedDataFiles(table.io()))); - Assert.assertEquals("Parent snapshot id should point to same snapshot", - parentSnapshot.snapshotId(), base.currentSnapshot().parentId().longValue()); - Assert.assertEquals("Snapshot log should indicate number of snapshots committed", 3, - base.snapshotLog().size()); + Assert.assertEquals( + "Parent snapshot id should point to same snapshot", + parentSnapshot.snapshotId(), + base.currentSnapshot().parentId().longValue()); + Assert.assertEquals( + "Snapshot log should indicate number of snapshots committed", 3, base.snapshotLog().size()); // load current snapshot parentSnapshot = base.currentSnapshot(); @@ -519,35 +524,37 @@ public void testWithCommitsBetweenCherryPicking() { // check if the effective current snapshot is set to the new snapshot created // as a result of the cherry-pick operation Assert.assertEquals("Should have all the snapshots", 6, base.snapshots().size()); - Assert.assertEquals("Current snapshot should be set to one after wap snapshot", - parentSnapshot.snapshotId() + 1, base.currentSnapshot().snapshotId()); - Assert.assertEquals("Should contain manifests for four files", 4, + Assert.assertEquals( + "Current snapshot should be set to one after wap snapshot", + parentSnapshot.snapshotId() + 1, + base.currentSnapshot().snapshotId()); + Assert.assertEquals( + "Should contain manifests for four files", + 4, base.currentSnapshot().allManifests(table.io()).size()); - Assert.assertEquals("Should contain append from last commit", 1, + Assert.assertEquals( + "Should contain append from last commit", + 1, Iterables.size(base.currentSnapshot().addedDataFiles(table.io()))); - Assert.assertEquals("Parent snapshot id should point to same snapshot", - parentSnapshot.snapshotId(), base.currentSnapshot().parentId().longValue()); - Assert.assertEquals("Snapshot log should indicate number of snapshots committed", 4, - base.snapshotLog().size()); + Assert.assertEquals( + "Parent snapshot id should point to same snapshot", + parentSnapshot.snapshotId(), + base.currentSnapshot().parentId().longValue()); + Assert.assertEquals( + "Snapshot log should indicate number of snapshots committed", 4, base.snapshotLog().size()); } @Test public void testWithCherryPickingWithCommitRetry() { - table.newAppend() - .appendFile(FILE_A) - .commit(); + table.newAppend().appendFile(FILE_A).commit(); TableMetadata base = readMetadata(); // load current snapshot Snapshot parentSnapshot = base.currentSnapshot(); long firstSnapshotId = parentSnapshot.snapshotId(); // first WAP commit - table.newAppend() - 
.appendFile(FILE_B) - .set("wap.id", "123456789") - .stageOnly() - .commit(); + table.newAppend().appendFile(FILE_B).set("wap.id", "123456789").stageOnly().commit(); base = readMetadata(); @@ -555,14 +562,18 @@ public void testWithCherryPickingWithCommitRetry() { Snapshot wap1Snapshot = base.snapshots().get(1); Assert.assertEquals("Should have two snapshots", 2, base.snapshots().size()); - Assert.assertEquals("Should have first wap id in summary", "123456789", - wap1Snapshot.summary().get("wap.id")); - Assert.assertEquals("Current snapshot should be first commit's snapshot", - firstSnapshotId, base.currentSnapshot().snapshotId()); - Assert.assertEquals("Parent snapshot id should be same for first WAP snapshot", - firstSnapshotId, wap1Snapshot.parentId().longValue()); - Assert.assertEquals("Snapshot log should indicate number of snapshots committed", 1, - base.snapshotLog().size()); + Assert.assertEquals( + "Should have first wap id in summary", "123456789", wap1Snapshot.summary().get("wap.id")); + Assert.assertEquals( + "Current snapshot should be first commit's snapshot", + firstSnapshotId, + base.currentSnapshot().snapshotId()); + Assert.assertEquals( + "Parent snapshot id should be same for first WAP snapshot", + firstSnapshotId, + wap1Snapshot.parentId().longValue()); + Assert.assertEquals( + "Snapshot log should indicate number of snapshots committed", 1, base.snapshotLog().size()); // load current snapshot base = readMetadata(); @@ -578,43 +589,45 @@ public void testWithCherryPickingWithCommitRetry() { "Current snapshot should be set to one after wap snapshot", parentSnapshot.snapshotId() + 1, base.currentSnapshot().snapshotId()); - Assert.assertEquals("Should contain manifests for both files", 2, + Assert.assertEquals( + "Should contain manifests for both files", + 2, base.currentSnapshot().allManifests(table.io()).size()); - Assert.assertEquals("Should not contain redundant append due to retry", 1, + Assert.assertEquals( + "Should not contain redundant append due to retry", + 1, Iterables.size(base.currentSnapshot().addedDataFiles(table.io()))); - Assert.assertEquals("Parent snapshot id should change to latest snapshot before commit", - parentSnapshot.snapshotId(), base.currentSnapshot().parentId().longValue()); - Assert.assertEquals("Snapshot log should indicate number of snapshots committed", 2, - base.snapshotLog().size()); + Assert.assertEquals( + "Parent snapshot id should change to latest snapshot before commit", + parentSnapshot.snapshotId(), + base.currentSnapshot().parentId().longValue()); + Assert.assertEquals( + "Snapshot log should indicate number of snapshots committed", 2, base.snapshotLog().size()); } @Test public void testCherrypickingAncestor() { - table.newAppend() - .appendFile(FILE_A) - .commit(); + table.newAppend().appendFile(FILE_A).commit(); TableMetadata base = readMetadata(); long firstSnapshotId = base.currentSnapshot().snapshotId(); // first WAP commit - table.newAppend() - .appendFile(FILE_B) - .set("wap.id", "123456789") - .stageOnly() - .commit(); + table.newAppend().appendFile(FILE_B).set("wap.id", "123456789").stageOnly().commit(); base = readMetadata(); // pick the snapshot that's staged but not committed Snapshot wapSnapshot = base.snapshots().get(1); Assert.assertEquals("Should have both snapshots", 2, base.snapshots().size()); - Assert.assertEquals("Should have first wap id in summary", "123456789", - wapSnapshot.summary().get("wap.id")); - Assert.assertEquals("Current snapshot should be first commit's snapshot", - firstSnapshotId, 
base.currentSnapshot().snapshotId()); - Assert.assertEquals("Snapshot log should indicate number of snapshots committed", 1, - base.snapshotLog().size()); + Assert.assertEquals( + "Should have first wap id in summary", "123456789", wapSnapshot.summary().get("wap.id")); + Assert.assertEquals( + "Current snapshot should be first commit's snapshot", + firstSnapshotId, + base.currentSnapshot().snapshotId()); + Assert.assertEquals( + "Snapshot log should indicate number of snapshots committed", 1, base.snapshotLog().size()); // cherry-pick snapshot table.manageSnapshots().cherrypick(wapSnapshot.snapshotId()).commit(); @@ -623,44 +636,42 @@ public void testCherrypickingAncestor() { // check if the effective current snapshot is set to the new snapshot created // as a result of the cherry-pick operation - Assert.assertEquals("Current snapshot should be fast-forwarded to wap snapshot", - wapSnapshot.snapshotId(), base.currentSnapshot().snapshotId()); + Assert.assertEquals( + "Current snapshot should be fast-forwarded to wap snapshot", + wapSnapshot.snapshotId(), + base.currentSnapshot().snapshotId()); Assert.assertEquals("Should have two snapshots", 2, base.snapshots().size()); - Assert.assertEquals("Should contain manifests for both files", - 2, base.currentSnapshot().allManifests(table.io()).size()); - Assert.assertEquals("Should contain append from last commit", 1, + Assert.assertEquals( + "Should contain manifests for both files", + 2, + base.currentSnapshot().allManifests(table.io()).size()); + Assert.assertEquals( + "Should contain append from last commit", + 1, Iterables.size(base.currentSnapshot().addedDataFiles(table.io()))); - Assert.assertEquals("Snapshot log should indicate number of snapshots committed", 2, - base.snapshotLog().size()); + Assert.assertEquals( + "Snapshot log should indicate number of snapshots committed", 2, base.snapshotLog().size()); - AssertHelpers.assertThrows("should throw exception", CherrypickAncestorCommitException.class, - String.format("Cannot cherrypick snapshot %s: already an ancestor", 1), () -> { + AssertHelpers.assertThrows( + "should throw exception", + CherrypickAncestorCommitException.class, + String.format("Cannot cherrypick snapshot %s: already an ancestor", 1), + () -> { // duplicate cherry-pick snapshot table.manageSnapshots().cherrypick(firstSnapshotId).commit(); - } - ); + }); } @Test public void testDuplicateCherrypick() { - table.newAppend() - .appendFile(FILE_A) - .commit(); + table.newAppend().appendFile(FILE_A).commit(); TableMetadata base = readMetadata(); long firstSnapshotId = base.currentSnapshot().snapshotId(); // stage first WAP commit - table.newAppend() - .appendFile(FILE_B) - .set("wap.id", "123456789") - .stageOnly() - .commit(); + table.newAppend().appendFile(FILE_B).set("wap.id", "123456789").stageOnly().commit(); // stage second WAP commit with same wap.id - table.newAppend() - .appendFile(FILE_C) - .set("wap.id", "123456789") - .stageOnly() - .commit(); + table.newAppend().appendFile(FILE_C).set("wap.id", "123456789").stageOnly().commit(); base = readMetadata(); // pick the snapshot that's staged but not committed @@ -668,83 +679,104 @@ public void testDuplicateCherrypick() { Snapshot wapSnapshot2 = base.snapshots().get(2); Assert.assertEquals("Should have both snapshots", 3, base.snapshots().size()); - Assert.assertEquals("Should have wap id in first wap snapshot summary", "123456789", + Assert.assertEquals( + "Should have wap id in first wap snapshot summary", + "123456789", wapSnapshot1.summary().get("wap.id")); - 
Assert.assertEquals("Should have wap id in second wap snapshot summary", "123456789", + Assert.assertEquals( + "Should have wap id in second wap snapshot summary", + "123456789", wapSnapshot2.summary().get("wap.id")); - Assert.assertEquals("Current snapshot should be first commit's snapshot", - firstSnapshotId, base.currentSnapshot().snapshotId()); - Assert.assertEquals("Snapshot log should indicate number of snapshots committed", 1, - base.snapshotLog().size()); + Assert.assertEquals( + "Current snapshot should be first commit's snapshot", + firstSnapshotId, + base.currentSnapshot().snapshotId()); + Assert.assertEquals( + "Snapshot log should indicate number of snapshots committed", 1, base.snapshotLog().size()); // cherry-pick snapshot table.manageSnapshots().cherrypick(wapSnapshot1.snapshotId()).commit(); base = readMetadata(); Assert.assertEquals("Should have three snapshots", 3, base.snapshots().size()); - Assert.assertEquals("Should contain manifests for both files", - 2, base.currentSnapshot().allManifests(table.io()).size()); - Assert.assertEquals("Should contain append from last commit", 1, + Assert.assertEquals( + "Should contain manifests for both files", + 2, + base.currentSnapshot().allManifests(table.io()).size()); + Assert.assertEquals( + "Should contain append from last commit", + 1, Iterables.size(base.currentSnapshot().addedDataFiles(table.io()))); - Assert.assertEquals("Snapshot log should indicate number of snapshots committed", 2, - base.snapshotLog().size()); + Assert.assertEquals( + "Snapshot log should indicate number of snapshots committed", 2, base.snapshotLog().size()); - AssertHelpers.assertThrows("should throw exception", DuplicateWAPCommitException.class, - String.format("Duplicate request to cherry pick wap id that was published already: %s", 123456789), () -> { + AssertHelpers.assertThrows( + "should throw exception", + DuplicateWAPCommitException.class, + String.format( + "Duplicate request to cherry pick wap id that was published already: %s", 123456789), + () -> { // duplicate cherry-pick snapshot table.manageSnapshots().cherrypick(wapSnapshot2.snapshotId()).commit(); - } - ); + }); } @Test public void testNonWapCherrypick() { - table.newAppend() - .appendFile(FILE_A) - .commit(); + table.newAppend().appendFile(FILE_A).commit(); TableMetadata base = readMetadata(); long firstSnapshotId = base.currentSnapshot().snapshotId(); - table.newAppend() - .appendFile(FILE_B) - .commit(); + table.newAppend().appendFile(FILE_B).commit(); base = readMetadata(); long secondSnapshotId = base.currentSnapshot().snapshotId(); - table.newAppend() - .appendFile(FILE_C) - .commit(); + table.newAppend().appendFile(FILE_C).commit(); base = readMetadata(); long thirdSnapshotId = base.currentSnapshot().snapshotId(); - Assert.assertEquals("Should be pointing to third snapshot", thirdSnapshotId, + Assert.assertEquals( + "Should be pointing to third snapshot", + thirdSnapshotId, table.currentSnapshot().snapshotId()); // NOOP commit table.manageSnapshots().commit(); - Assert.assertEquals("Should still be pointing to third snapshot", thirdSnapshotId, + Assert.assertEquals( + "Should still be pointing to third snapshot", + thirdSnapshotId, table.currentSnapshot().snapshotId()); // Rollback to second snapshot table.manageSnapshots().rollbackTo(secondSnapshotId).commit(); - Assert.assertEquals("Should be pointing to second snapshot", secondSnapshotId, + Assert.assertEquals( + "Should be pointing to second snapshot", + secondSnapshotId, table.currentSnapshot().snapshotId()); // Cherrypick 
down to third table.manageSnapshots().cherrypick(thirdSnapshotId).commit(); - Assert.assertEquals("Should be re-using wap snapshot after cherrypick", 3, + Assert.assertEquals( + "Should be re-using wap snapshot after cherrypick", + 3, table.currentSnapshot().snapshotId()); // try double cherrypicking of the third snapshot - AssertHelpers.assertThrows("should not allow cherrypicking ancestor", CherrypickAncestorCommitException.class, - String.format("Cannot cherrypick snapshot %s: already an ancestor", 3), () -> { - // double cherrypicking of second snapshot + AssertHelpers.assertThrows( + "should not allow cherrypicking ancestor", + CherrypickAncestorCommitException.class, + String.format("Cannot cherrypick snapshot %s: already an ancestor", 3), + () -> { + // double cherrypicking of second snapshot table.manageSnapshots().cherrypick(thirdSnapshotId).commit(); }); // try cherrypicking an ancestor - AssertHelpers.assertThrows("should not allow double cherrypick", CherrypickAncestorCommitException.class, - String.format("Cannot cherrypick snapshot %s: already an ancestor", firstSnapshotId), () -> { + AssertHelpers.assertThrows( + "should not allow double cherrypick", + CherrypickAncestorCommitException.class, + String.format("Cannot cherrypick snapshot %s: already an ancestor", firstSnapshotId), + () -> { // double cherrypicking of second snapshot table.manageSnapshots().cherrypick(firstSnapshotId).commit(); }); diff --git a/core/src/test/java/org/apache/iceberg/TestableCachingCatalog.java b/core/src/test/java/org/apache/iceberg/TestableCachingCatalog.java index ed28e108cdac..db3d526e8d6d 100644 --- a/core/src/test/java/org/apache/iceberg/TestableCachingCatalog.java +++ b/core/src/test/java/org/apache/iceberg/TestableCachingCatalog.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg; import com.github.benmanes.caffeine.cache.Cache; @@ -32,26 +31,31 @@ */ public class TestableCachingCatalog extends CachingCatalog { - public static TestableCachingCatalog wrap(Catalog catalog, Duration expirationInterval, Ticker ticker) { - return new TestableCachingCatalog(catalog, true /* caseSensitive */, expirationInterval, ticker); + public static TestableCachingCatalog wrap( + Catalog catalog, Duration expirationInterval, Ticker ticker) { + return new TestableCachingCatalog( + catalog, true /* caseSensitive */, expirationInterval, ticker); } private final Duration cacheExpirationInterval; - TestableCachingCatalog(Catalog catalog, boolean caseSensitive, Duration expirationInterval, Ticker ticker) { + TestableCachingCatalog( + Catalog catalog, boolean caseSensitive, Duration expirationInterval, Ticker ticker) { super(catalog, caseSensitive, expirationInterval.toMillis(), ticker); this.cacheExpirationInterval = expirationInterval; } public Cache cache() { - // cleanUp must be called as tests apply assertions directly on the underlying map, but metadata table + // cleanUp must be called as tests apply assertions directly on the underlying map, but metadata + // table // map entries are cleaned up asynchronously. tableCache.cleanUp(); return tableCache; } public boolean isCacheExpirationEnabled() { - return tableCache.policy().expireAfterAccess().isPresent() || tableCache.policy().expireAfterWrite().isPresent(); + return tableCache.policy().expireAfterAccess().isPresent() + || tableCache.policy().expireAfterWrite().isPresent(); } // Throws a NoSuchElementException if this entry is not in the cache (has already been TTL'd). 
diff --git a/core/src/test/java/org/apache/iceberg/V2TableTestBase.java b/core/src/test/java/org/apache/iceberg/V2TableTestBase.java index 5ca960716e36..5e46f927f545 100644 --- a/core/src/test/java/org/apache/iceberg/V2TableTestBase.java +++ b/core/src/test/java/org/apache/iceberg/V2TableTestBase.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg; public class V2TableTestBase extends TableTestBase { diff --git a/core/src/test/java/org/apache/iceberg/actions/TestBinPackStrategy.java b/core/src/test/java/org/apache/iceberg/actions/TestBinPackStrategy.java index d1aafea2881d..5a0baf9aff04 100644 --- a/core/src/test/java/org/apache/iceberg/actions/TestBinPackStrategy.java +++ b/core/src/test/java/org/apache/iceberg/actions/TestBinPackStrategy.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.actions; import java.util.Arrays; @@ -67,7 +66,9 @@ public Set rewriteFiles(List filesToRewrite) { } private List filesOfSize(long... sizes) { - return Arrays.stream(sizes).mapToObj(size -> new MockFileScanTask(size * MB)).collect(Collectors.toList()); + return Arrays.stream(sizes) + .mapToObj(size -> new MockFileScanTask(size * MB)) + .collect(Collectors.toList()); } private RewriteStrategy defaultBinPack() { @@ -79,7 +80,8 @@ public void testFilteringAllValid() { RewriteStrategy strategy = defaultBinPack(); Iterable testFiles = filesOfSize(100, 100, 100, 100, 1000); - Iterable filtered = ImmutableList.copyOf(strategy.selectFilesToRewrite(testFiles)); + Iterable filtered = + ImmutableList.copyOf(strategy.selectFilesToRewrite(testFiles)); Assert.assertEquals("No files should be removed from the set", testFiles, filtered); } @@ -89,227 +91,286 @@ public void testFilteringRemoveInvalid() { RewriteStrategy strategy = defaultBinPack(); Iterable testFiles = filesOfSize(500, 500, 500, 600, 600); - Iterable filtered = ImmutableList.copyOf(strategy.selectFilesToRewrite(testFiles)); + Iterable filtered = + ImmutableList.copyOf(strategy.selectFilesToRewrite(testFiles)); - Assert.assertEquals("All files should be removed from the set", Collections.emptyList(), filtered); + Assert.assertEquals( + "All files should be removed from the set", Collections.emptyList(), filtered); } @Test public void testFilteringCustomMinMaxFileSize() { - RewriteStrategy strategy = defaultBinPack().options(ImmutableMap.of( - BinPackStrategy.MAX_FILE_SIZE_BYTES, Long.toString(550 * MB), - BinPackStrategy.MIN_FILE_SIZE_BYTES, Long.toString(490 * MB) - )); + RewriteStrategy strategy = + defaultBinPack() + .options( + ImmutableMap.of( + BinPackStrategy.MAX_FILE_SIZE_BYTES, Long.toString(550 * MB), + BinPackStrategy.MIN_FILE_SIZE_BYTES, Long.toString(490 * MB))); Iterable testFiles = filesOfSize(500, 500, 480, 480, 560, 520); Iterable expectedFiles = filesOfSize(480, 480, 560); - Iterable filtered = ImmutableList.copyOf(strategy.selectFilesToRewrite(testFiles)); + Iterable filtered = + ImmutableList.copyOf(strategy.selectFilesToRewrite(testFiles)); - Assert.assertEquals("Should remove files that exceed or are smaller than new bounds", expectedFiles, filtered); + Assert.assertEquals( + "Should remove files that exceed or are smaller than new bounds", expectedFiles, filtered); } @Test public void testFilteringWithDeletes() { - RewriteStrategy strategy = defaultBinPack().options(ImmutableMap.of( - BinPackStrategy.MAX_FILE_SIZE_BYTES, Long.toString(550 * MB), - 
BinPackStrategy.MIN_FILE_SIZE_BYTES, Long.toString(490 * MB), - BinPackStrategy.DELETE_FILE_THRESHOLD, Integer.toString(2) - )); + RewriteStrategy strategy = + defaultBinPack() + .options( + ImmutableMap.of( + BinPackStrategy.MAX_FILE_SIZE_BYTES, Long.toString(550 * MB), + BinPackStrategy.MIN_FILE_SIZE_BYTES, Long.toString(490 * MB), + BinPackStrategy.DELETE_FILE_THRESHOLD, Integer.toString(2))); List testFiles = filesOfSize(500, 500, 480, 480, 560, 520); testFiles.add(MockFileScanTask.mockTaskWithDeletes(500 * MB, 2)); Iterable expectedFiles = filesOfSize(480, 480, 560, 500); - Iterable filtered = ImmutableList.copyOf(strategy.selectFilesToRewrite(testFiles)); + Iterable filtered = + ImmutableList.copyOf(strategy.selectFilesToRewrite(testFiles)); Assert.assertEquals("Should include file with deletes", expectedFiles, filtered); } @Test public void testGroupingMinInputFilesInvalid() { - RewriteStrategy strategy = defaultBinPack().options(ImmutableMap.of( - BinPackStrategy.MIN_INPUT_FILES, Integer.toString(5) - )); + RewriteStrategy strategy = + defaultBinPack() + .options(ImmutableMap.of(BinPackStrategy.MIN_INPUT_FILES, Integer.toString(5))); Iterable testFiles = filesOfSize(1, 1, 1, 1); Iterable> grouped = strategy.planFileGroups(testFiles); - Assert.assertEquals("Should plan 0 groups, not enough input files", - 0, Iterables.size(grouped)); + Assert.assertEquals("Should plan 0 groups, not enough input files", 0, Iterables.size(grouped)); } @Test public void testGroupingMinInputFilesAsOne() { - RewriteStrategy strategy = defaultBinPack().options(ImmutableMap.of( - BinPackStrategy.MIN_INPUT_FILES, Integer.toString(1), - BinPackStrategy.MAX_FILE_SIZE_BYTES, Long.toString(3 * MB), - RewriteDataFiles.TARGET_FILE_SIZE_BYTES, Long.toString(2 * MB), - BinPackStrategy.MIN_FILE_SIZE_BYTES, Long.toString(MB), - BinPackStrategy.DELETE_FILE_THRESHOLD, Integer.toString(2) - )); + RewriteStrategy strategy = + defaultBinPack() + .options( + ImmutableMap.of( + BinPackStrategy.MIN_INPUT_FILES, Integer.toString(1), + BinPackStrategy.MAX_FILE_SIZE_BYTES, Long.toString(3 * MB), + RewriteDataFiles.TARGET_FILE_SIZE_BYTES, Long.toString(2 * MB), + BinPackStrategy.MIN_FILE_SIZE_BYTES, Long.toString(MB), + BinPackStrategy.DELETE_FILE_THRESHOLD, Integer.toString(2))); Iterable testFiles1 = filesOfSize(1); Iterable> grouped1 = strategy.planFileGroups(testFiles1); - Assert.assertEquals("Should plan 0 groups, 1 file is too small but no deletes are present so rewriting is " + - "a NOOP", - 0, Iterables.size(grouped1)); + Assert.assertEquals( + "Should plan 0 groups, 1 file is too small but no deletes are present so rewriting is " + + "a NOOP", + 0, + Iterables.size(grouped1)); Iterable testFiles2 = filesOfSize(4); Iterable> grouped2 = strategy.planFileGroups(testFiles2); - Assert.assertEquals("Should plan 1 group because the file present is larger than maxFileSize and can be " + - "split", 1, Iterables.size(grouped2)); + Assert.assertEquals( + "Should plan 1 group because the file present is larger than maxFileSize and can be " + + "split", + 1, + Iterables.size(grouped2)); List testFiles3 = Lists.newArrayList(); testFiles3.add(MockFileScanTask.mockTaskWithDeletes(MB, 2)); Iterable> grouped3 = strategy.planFileGroups(testFiles3); - Assert.assertEquals("Should plan 1 group, the data file has delete files and can be re-written without " + - "deleted row", 1, Iterables.size(grouped3)); + Assert.assertEquals( + "Should plan 1 group, the data file has delete files and can be re-written without " + + "deleted row", + 1, + 
Iterables.size(grouped3)); } @Test public void testGroupWithLargeFileMinInputFiles() { - RewriteStrategy strategy = defaultBinPack().options(ImmutableMap.of( - BinPackStrategy.MIN_INPUT_FILES, Integer.toString(5) - )); + RewriteStrategy strategy = + defaultBinPack() + .options(ImmutableMap.of(BinPackStrategy.MIN_INPUT_FILES, Integer.toString(5))); Iterable testFiles1 = filesOfSize(2000); Iterable> grouped1 = strategy.planFileGroups(testFiles1); - Assert.assertEquals("Should plan 1 group, not enough input files but the input file exceeds our max" + - "and can be written into at least one new target-file-size files", - ImmutableList.of(testFiles1), grouped1); + Assert.assertEquals( + "Should plan 1 group, not enough input files but the input file exceeds our max" + + "and can be written into at least one new target-file-size files", + ImmutableList.of(testFiles1), + grouped1); Iterable testFiles2 = filesOfSize(500, 500, 500); Iterable> grouped2 = strategy.planFileGroups(testFiles2); - Assert.assertEquals("Should plan 1 group, not enough input files but the sum of file sizes exceeds " + - "target-file-size and files within the group is greater than 1", - ImmutableList.of(testFiles2), grouped2); + Assert.assertEquals( + "Should plan 1 group, not enough input files but the sum of file sizes exceeds " + + "target-file-size and files within the group is greater than 1", + ImmutableList.of(testFiles2), + grouped2); Iterable testFiles3 = filesOfSize(10, 10, 10); Iterable> grouped3 = strategy.planFileGroups(testFiles3); - Assert.assertEquals("Should plan 0 groups, not enough input files and the sum of file sizes does not " + - "exceeds target-file-size and files within the group is greater than 1", - ImmutableList.of(), grouped3); + Assert.assertEquals( + "Should plan 0 groups, not enough input files and the sum of file sizes does not " + + "exceeds target-file-size and files within the group is greater than 1", + ImmutableList.of(), + grouped3); } @Test public void testGroupingMinInputFilesValid() { - RewriteStrategy strategy = defaultBinPack().options(ImmutableMap.of( - BinPackStrategy.MIN_INPUT_FILES, Integer.toString(5) - )); + RewriteStrategy strategy = + defaultBinPack() + .options(ImmutableMap.of(BinPackStrategy.MIN_INPUT_FILES, Integer.toString(5))); Iterable testFiles = filesOfSize(1, 1, 1, 1, 1); Iterable> grouped = strategy.planFileGroups(testFiles); - Assert.assertEquals("Should plan 1 groups since there are enough input files", - ImmutableList.of(testFiles), grouped); + Assert.assertEquals( + "Should plan 1 groups since there are enough input files", + ImmutableList.of(testFiles), + grouped); } @Test public void testGroupingWithDeletes() { - RewriteStrategy strategy = defaultBinPack().options(ImmutableMap.of( - BinPackStrategy.MIN_INPUT_FILES, Integer.toString(5), - BinPackStrategy.MAX_FILE_SIZE_BYTES, Long.toString(550 * MB), - BinPackStrategy.MIN_FILE_SIZE_BYTES, Long.toString(490 * MB), - BinPackStrategy.DELETE_FILE_THRESHOLD, Integer.toString(2) - )); + RewriteStrategy strategy = + defaultBinPack() + .options( + ImmutableMap.of( + BinPackStrategy.MIN_INPUT_FILES, Integer.toString(5), + BinPackStrategy.MAX_FILE_SIZE_BYTES, Long.toString(550 * MB), + BinPackStrategy.MIN_FILE_SIZE_BYTES, Long.toString(490 * MB), + BinPackStrategy.DELETE_FILE_THRESHOLD, Integer.toString(2))); List testFiles = Lists.newArrayList(); testFiles.add(MockFileScanTask.mockTaskWithDeletes(500 * MB, 2)); Iterable> grouped = strategy.planFileGroups(testFiles); - Assert.assertEquals("Should plan 1 groups since 
there are enough input files", - ImmutableList.of(testFiles), grouped); + Assert.assertEquals( + "Should plan 1 groups since there are enough input files", + ImmutableList.of(testFiles), + grouped); } @Test public void testMaxGroupSize() { - RewriteStrategy strategy = defaultBinPack().options(ImmutableMap.of( - RewriteDataFiles.MAX_FILE_GROUP_SIZE_BYTES, Long.toString(1000 * MB) - )); + RewriteStrategy strategy = + defaultBinPack() + .options( + ImmutableMap.of( + RewriteDataFiles.MAX_FILE_GROUP_SIZE_BYTES, Long.toString(1000 * MB))); Iterable testFiles = filesOfSize(300, 300, 300, 300, 300, 300); Iterable> grouped = strategy.planFileGroups(testFiles); - Assert.assertEquals("Should plan 2 groups since there is enough data for two groups", - 2, Iterables.size(grouped)); + Assert.assertEquals( + "Should plan 2 groups since there is enough data for two groups", + 2, + Iterables.size(grouped)); } @Test public void testNumOuputFiles() { BinPackStrategy strategy = (BinPackStrategy) defaultBinPack(); long targetFileSize = strategy.targetFileSize(); - Assert.assertEquals("Should keep remainder if the remainder is a valid size", - 2, strategy.numOutputFiles(targetFileSize + 450 * MB)); - Assert.assertEquals("Should discard remainder file if the remainder is very small", - 1, strategy.numOutputFiles(targetFileSize + 40 * MB)); - Assert.assertEquals("Should keep remainder file if it would change average file size greatly", - 2, strategy.numOutputFiles((long) (targetFileSize + 0.40 * targetFileSize))); - Assert.assertEquals("Should discard remainder if file is small and wouldn't change average that much", - 200, strategy.numOutputFiles(200 * targetFileSize + 13 * MB)); - Assert.assertEquals("Should keep remainder if it's a valid size", - 201, strategy.numOutputFiles(200 * targetFileSize + 499 * MB)); - Assert.assertEquals("Should not return 0 even for very small files", - 1, strategy.numOutputFiles(1)); + Assert.assertEquals( + "Should keep remainder if the remainder is a valid size", + 2, + strategy.numOutputFiles(targetFileSize + 450 * MB)); + Assert.assertEquals( + "Should discard remainder file if the remainder is very small", + 1, + strategy.numOutputFiles(targetFileSize + 40 * MB)); + Assert.assertEquals( + "Should keep remainder file if it would change average file size greatly", + 2, + strategy.numOutputFiles((long) (targetFileSize + 0.40 * targetFileSize))); + Assert.assertEquals( + "Should discard remainder if file is small and wouldn't change average that much", + 200, + strategy.numOutputFiles(200 * targetFileSize + 13 * MB)); + Assert.assertEquals( + "Should keep remainder if it's a valid size", + 201, + strategy.numOutputFiles(200 * targetFileSize + 499 * MB)); + Assert.assertEquals( + "Should not return 0 even for very small files", 1, strategy.numOutputFiles(1)); } @Test public void testInvalidOptions() { - AssertHelpers.assertThrows("Should not allow max size smaller than target", - IllegalArgumentException.class, () -> { - defaultBinPack().options(ImmutableMap.of( - BinPackStrategy.MAX_FILE_SIZE_BYTES, Long.toString(1 * MB))); + AssertHelpers.assertThrows( + "Should not allow max size smaller than target", + IllegalArgumentException.class, + () -> { + defaultBinPack() + .options(ImmutableMap.of(BinPackStrategy.MAX_FILE_SIZE_BYTES, Long.toString(1 * MB))); }); - AssertHelpers.assertThrows("Should not allow min size larger than target", - IllegalArgumentException.class, () -> { - defaultBinPack().options(ImmutableMap.of( - BinPackStrategy.MIN_FILE_SIZE_BYTES, Long.toString(1000 * 
MB))); + AssertHelpers.assertThrows( + "Should not allow min size larger than target", + IllegalArgumentException.class, + () -> { + defaultBinPack() + .options( + ImmutableMap.of(BinPackStrategy.MIN_FILE_SIZE_BYTES, Long.toString(1000 * MB))); }); - AssertHelpers.assertThrows("Should not allow min input size smaller than 1", - IllegalArgumentException.class, () -> { - defaultBinPack().options(ImmutableMap.of( - BinPackStrategy.MIN_INPUT_FILES, Long.toString(-5))); + AssertHelpers.assertThrows( + "Should not allow min input size smaller than 1", + IllegalArgumentException.class, + () -> { + defaultBinPack() + .options(ImmutableMap.of(BinPackStrategy.MIN_INPUT_FILES, Long.toString(-5))); }); - AssertHelpers.assertThrows("Should not allow min deletes per file smaller than 1", - IllegalArgumentException.class, () -> { - defaultBinPack().options(ImmutableMap.of( - BinPackStrategy.DELETE_FILE_THRESHOLD, Long.toString(-5))); - }); + AssertHelpers.assertThrows( + "Should not allow min deletes per file smaller than 1", + IllegalArgumentException.class, + () -> { + defaultBinPack() + .options(ImmutableMap.of(BinPackStrategy.DELETE_FILE_THRESHOLD, Long.toString(-5))); + }); - AssertHelpers.assertThrows("Should not allow negative target size", - IllegalArgumentException.class, () -> { - defaultBinPack().options(ImmutableMap.of( - RewriteDataFiles.TARGET_FILE_SIZE_BYTES, Long.toString(-5))); + AssertHelpers.assertThrows( + "Should not allow negative target size", + IllegalArgumentException.class, + () -> { + defaultBinPack() + .options(ImmutableMap.of(RewriteDataFiles.TARGET_FILE_SIZE_BYTES, Long.toString(-5))); }); } @Test public void testRewriteAllSelectFilesToRewrite() { - RewriteStrategy strategy = defaultBinPack().options(ImmutableMap.of( - BinPackStrategy.REWRITE_ALL, "true" - )); + RewriteStrategy strategy = + defaultBinPack().options(ImmutableMap.of(BinPackStrategy.REWRITE_ALL, "true")); Iterable testFiles = filesOfSize(500, 500, 480, 480, 560, 520); Iterable expectedFiles = filesOfSize(500, 500, 480, 480, 560, 520); - Iterable filtered = ImmutableList.copyOf(strategy.selectFilesToRewrite(testFiles)); + Iterable filtered = + ImmutableList.copyOf(strategy.selectFilesToRewrite(testFiles)); Assert.assertEquals("Should rewrite all files", expectedFiles, filtered); } @Test public void testRewriteAllPlanFileGroups() { - RewriteStrategy strategy = defaultBinPack().options(ImmutableMap.of( - BinPackStrategy.MIN_INPUT_FILES, Integer.toString(5), - BinPackStrategy.REWRITE_ALL, "true" - )); + RewriteStrategy strategy = + defaultBinPack() + .options( + ImmutableMap.of( + BinPackStrategy.MIN_INPUT_FILES, + Integer.toString(5), + BinPackStrategy.REWRITE_ALL, + "true")); Iterable testFiles = filesOfSize(1, 1, 1, 1); Iterable> grouped = strategy.planFileGroups(testFiles); diff --git a/core/src/test/java/org/apache/iceberg/actions/TestSortStrategy.java b/core/src/test/java/org/apache/iceberg/actions/TestSortStrategy.java index 373728fcb3b1..8d199512bec3 100644 --- a/core/src/test/java/org/apache/iceberg/actions/TestSortStrategy.java +++ b/core/src/test/java/org/apache/iceberg/actions/TestSortStrategy.java @@ -16,10 +16,8 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.actions; - import java.util.Collections; import java.util.List; import java.util.Set; @@ -80,19 +78,26 @@ private SortStrategy defaultSort() { private List tasksForSortOrder(int sortOrderId, int... 
fileSizesMB) { ImmutableList.Builder files = ImmutableList.builder(); - IntStream.of(fileSizesMB).forEach(length -> files.add(MockFileScanTask.mockTask(length * MB, sortOrderId))); + IntStream.of(fileSizesMB) + .forEach(length -> files.add(MockFileScanTask.mockTask(length * MB, sortOrderId))); return files.build(); } @Test public void testInvalidSortOrder() { - AssertHelpers.assertThrows("Should not allow an unsorted Sort order", IllegalArgumentException.class, + AssertHelpers.assertThrows( + "Should not allow an unsorted Sort order", + IllegalArgumentException.class, () -> defaultSort().sortOrder(SortOrder.unsorted()).options(Collections.emptyMap())); - AssertHelpers.assertThrows("Should not allow a Sort order with bad columns", ValidationException.class, + AssertHelpers.assertThrows( + "Should not allow a Sort order with bad columns", + ValidationException.class, () -> { - Schema badSchema = new Schema( - ImmutableList.of(Types.NestedField.required(0, "nonexistant", Types.IntegerType.get()))); + Schema badSchema = + new Schema( + ImmutableList.of( + Types.NestedField.required(0, "nonexistant", Types.IntegerType.get()))); defaultSort() .sortOrder(SortOrder.builderFor(badSchema).asc("nonexistant").build()) @@ -102,41 +107,48 @@ public void testInvalidSortOrder() { @Test public void testSelectAll() { - List invalid = ImmutableList.builder() - .addAll(tasksForSortOrder(-1, 500, 500, 500, 500)) - .addAll(tasksForSortOrder(table.sortOrder().orderId(), 10, 10, 2000, 10)) - .build(); - - List expected = ImmutableList.builder() - .addAll(invalid) - .addAll(tasksForSortOrder(table.sortOrder().orderId(), 500, 490, 520)) - .build(); - - RewriteStrategy strategy = defaultSort().options(ImmutableMap.of(SortStrategy.REWRITE_ALL, "true")); + List invalid = + ImmutableList.builder() + .addAll(tasksForSortOrder(-1, 500, 500, 500, 500)) + .addAll(tasksForSortOrder(table.sortOrder().orderId(), 10, 10, 2000, 10)) + .build(); + + List expected = + ImmutableList.builder() + .addAll(invalid) + .addAll(tasksForSortOrder(table.sortOrder().orderId(), 500, 490, 520)) + .build(); + + RewriteStrategy strategy = + defaultSort().options(ImmutableMap.of(SortStrategy.REWRITE_ALL, "true")); List actual = ImmutableList.copyOf(strategy.selectFilesToRewrite(expected)); - Assert.assertEquals("Should mark all files for rewrite", - expected, actual); + Assert.assertEquals("Should mark all files for rewrite", expected, actual); } @Test public void testUseSizeOptions() { - List expected = ImmutableList.builder() - .addAll(tasksForSortOrder(table.sortOrder().orderId(), 498, 551)) - .build(); - - List fileScanTasks = ImmutableList.builder() - .addAll(expected) - .addAll(tasksForSortOrder(table.sortOrder().orderId(), 500, 500)) - .build(); - - RewriteStrategy strategy = defaultSort().options(ImmutableMap.of( - SortStrategy.MAX_FILE_SIZE_BYTES, Long.toString(550 * MB), - SortStrategy.MIN_FILE_SIZE_BYTES, Long.toString(499 * MB))); + List expected = + ImmutableList.builder() + .addAll(tasksForSortOrder(table.sortOrder().orderId(), 498, 551)) + .build(); + + List fileScanTasks = + ImmutableList.builder() + .addAll(expected) + .addAll(tasksForSortOrder(table.sortOrder().orderId(), 500, 500)) + .build(); + + RewriteStrategy strategy = + defaultSort() + .options( + ImmutableMap.of( + SortStrategy.MAX_FILE_SIZE_BYTES, Long.toString(550 * MB), + SortStrategy.MIN_FILE_SIZE_BYTES, Long.toString(499 * MB))); List actual = ImmutableList.copyOf(strategy.selectFilesToRewrite(fileScanTasks)); - Assert.assertEquals("Should mark files for rewrite 
with adjusted min and max size", - expected, actual); + Assert.assertEquals( + "Should mark files for rewrite with adjusted min and max size", expected, actual); } } diff --git a/core/src/test/java/org/apache/iceberg/avro/AvroDataTest.java b/core/src/test/java/org/apache/iceberg/avro/AvroDataTest.java index 59a8150c4ed8..8015d65572db 100644 --- a/core/src/test/java/org/apache/iceberg/avro/AvroDataTest.java +++ b/core/src/test/java/org/apache/iceberg/avro/AvroDataTest.java @@ -16,9 +16,11 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.avro; +import static org.apache.iceberg.types.Types.NestedField.optional; +import static org.apache.iceberg.types.Types.NestedField.required; + import java.io.IOException; import java.util.concurrent.atomic.AtomicInteger; import org.apache.iceberg.Schema; @@ -32,35 +34,31 @@ import org.junit.Test; import org.junit.rules.TemporaryFolder; -import static org.apache.iceberg.types.Types.NestedField.optional; -import static org.apache.iceberg.types.Types.NestedField.required; - public abstract class AvroDataTest { protected abstract void writeAndValidate(Schema schema) throws IOException; - private static final StructType SUPPORTED_PRIMITIVES = StructType.of( - required(100, "id", LongType.get()), - optional(101, "data", Types.StringType.get()), - required(102, "b", Types.BooleanType.get()), - optional(103, "i", Types.IntegerType.get()), - required(104, "l", LongType.get()), - optional(105, "f", Types.FloatType.get()), - required(106, "d", Types.DoubleType.get()), - optional(107, "date", Types.DateType.get()), - required(108, "ts", Types.TimestampType.withZone()), - required(110, "s", Types.StringType.get()), - required(111, "uuid", Types.UUIDType.get()), - required(112, "fixed", Types.FixedType.ofLength(7)), - optional(113, "bytes", Types.BinaryType.get()), - required(114, "dec_9_0", Types.DecimalType.of(9, 0)), - required(115, "dec_11_2", Types.DecimalType.of(11, 2)), - required(116, "dec_38_10", Types.DecimalType.of(38, 10)), // maximum precision - required(117, "time", Types.TimeType.get()) - ); - - @Rule - public TemporaryFolder temp = new TemporaryFolder(); + private static final StructType SUPPORTED_PRIMITIVES = + StructType.of( + required(100, "id", LongType.get()), + optional(101, "data", Types.StringType.get()), + required(102, "b", Types.BooleanType.get()), + optional(103, "i", Types.IntegerType.get()), + required(104, "l", LongType.get()), + optional(105, "f", Types.FloatType.get()), + required(106, "d", Types.DoubleType.get()), + optional(107, "date", Types.DateType.get()), + required(108, "ts", Types.TimestampType.withZone()), + required(110, "s", Types.StringType.get()), + required(111, "uuid", Types.UUIDType.get()), + required(112, "fixed", Types.FixedType.ofLength(7)), + optional(113, "bytes", Types.BinaryType.get()), + required(114, "dec_9_0", Types.DecimalType.of(9, 0)), + required(115, "dec_11_2", Types.DecimalType.of(11, 2)), + required(116, "dec_38_10", Types.DecimalType.of(38, 10)), // maximum precision + required(117, "time", Types.TimeType.get())); + + @Rule public TemporaryFolder temp = new TemporaryFolder(); @Test public void testSimpleStruct() throws IOException { @@ -69,102 +67,131 @@ public void testSimpleStruct() throws IOException { @Test public void testArray() throws IOException { - Schema schema = new Schema( - required(0, "id", LongType.get()), - optional(1, "data", ListType.ofOptional(2, Types.StringType.get()))); + Schema schema = + new Schema( + required(0, 
"id", LongType.get()), + optional(1, "data", ListType.ofOptional(2, Types.StringType.get()))); writeAndValidate(schema); } @Test public void testArrayOfStructs() throws IOException { - Schema schema = new Schema( - required(0, "id", LongType.get()), - optional(1, "data", ListType.ofOptional(2, SUPPORTED_PRIMITIVES))); + Schema schema = + new Schema( + required(0, "id", LongType.get()), + optional(1, "data", ListType.ofOptional(2, SUPPORTED_PRIMITIVES))); writeAndValidate(schema); } @Test public void testMap() throws IOException { - Schema schema = new Schema( - required(0, "id", LongType.get()), - optional(1, "data", MapType.ofOptional(2, 3, - Types.StringType.get(), - Types.StringType.get()))); + Schema schema = + new Schema( + required(0, "id", LongType.get()), + optional( + 1, + "data", + MapType.ofOptional(2, 3, Types.StringType.get(), Types.StringType.get()))); writeAndValidate(schema); } @Test public void testNumericMapKey() throws IOException { - Schema schema = new Schema( - required(0, "id", LongType.get()), - optional(1, "data", MapType.ofOptional(2, 3, - Types.LongType.get(), - Types.StringType.get()))); + Schema schema = + new Schema( + required(0, "id", LongType.get()), + optional( + 1, "data", MapType.ofOptional(2, 3, Types.LongType.get(), Types.StringType.get()))); writeAndValidate(schema); } @Test public void testComplexMapKey() throws IOException { - Schema schema = new Schema( - required(0, "id", LongType.get()), - optional(1, "data", MapType.ofOptional(2, 3, - Types.StructType.of( - required(4, "i", Types.IntegerType.get()), - optional(5, "s", Types.StringType.get())), - Types.StringType.get()))); + Schema schema = + new Schema( + required(0, "id", LongType.get()), + optional( + 1, + "data", + MapType.ofOptional( + 2, + 3, + Types.StructType.of( + required(4, "i", Types.IntegerType.get()), + optional(5, "s", Types.StringType.get())), + Types.StringType.get()))); writeAndValidate(schema); } @Test public void testMapOfStructs() throws IOException { - Schema schema = new Schema( - required(0, "id", LongType.get()), - optional(1, "data", MapType.ofOptional(2, 3, - Types.StringType.get(), - SUPPORTED_PRIMITIVES))); + Schema schema = + new Schema( + required(0, "id", LongType.get()), + optional( + 1, "data", MapType.ofOptional(2, 3, Types.StringType.get(), SUPPORTED_PRIMITIVES))); writeAndValidate(schema); } @Test public void testMixedTypes() throws IOException { - StructType structType = StructType.of( - required(0, "id", LongType.get()), - optional(1, "list_of_maps", - ListType.ofOptional(2, MapType.ofOptional(3, 4, - Types.StringType.get(), - SUPPORTED_PRIMITIVES))), - optional(5, "map_of_lists", - MapType.ofOptional(6, 7, - Types.StringType.get(), - ListType.ofOptional(8, SUPPORTED_PRIMITIVES))), - required(9, "list_of_lists", - ListType.ofOptional(10, ListType.ofOptional(11, SUPPORTED_PRIMITIVES))), - required(12, "map_of_maps", - MapType.ofOptional(13, 14, - Types.StringType.get(), - MapType.ofOptional(15, 16, + StructType structType = + StructType.of( + required(0, "id", LongType.get()), + optional( + 1, + "list_of_maps", + ListType.ofOptional( + 2, MapType.ofOptional(3, 4, Types.StringType.get(), SUPPORTED_PRIMITIVES))), + optional( + 5, + "map_of_lists", + MapType.ofOptional( + 6, 7, Types.StringType.get(), ListType.ofOptional(8, SUPPORTED_PRIMITIVES))), + required( + 9, + "list_of_lists", + ListType.ofOptional(10, ListType.ofOptional(11, SUPPORTED_PRIMITIVES))), + required( + 12, + "map_of_maps", + MapType.ofOptional( + 13, + 14, Types.StringType.get(), - 
SUPPORTED_PRIMITIVES))), - required(17, "list_of_struct_of_nested_types", ListType.ofOptional(19, StructType.of( - Types.NestedField.required(20, "m1", MapType.ofOptional(21, 22, - Types.StringType.get(), - SUPPORTED_PRIMITIVES)), - Types.NestedField.optional(23, "l1", ListType.ofRequired(24, SUPPORTED_PRIMITIVES)), - Types.NestedField.required(25, "l2", ListType.ofRequired(26, SUPPORTED_PRIMITIVES)), - Types.NestedField.optional(27, "m2", MapType.ofOptional(28, 29, - Types.StringType.get(), - SUPPORTED_PRIMITIVES)) - ))) - ); - - Schema schema = new Schema(TypeUtil.assignFreshIds(structType, new AtomicInteger(0)::incrementAndGet) - .asStructType().fields()); + MapType.ofOptional(15, 16, Types.StringType.get(), SUPPORTED_PRIMITIVES))), + required( + 17, + "list_of_struct_of_nested_types", + ListType.ofOptional( + 19, + StructType.of( + Types.NestedField.required( + 20, + "m1", + MapType.ofOptional( + 21, 22, Types.StringType.get(), SUPPORTED_PRIMITIVES)), + Types.NestedField.optional( + 23, "l1", ListType.ofRequired(24, SUPPORTED_PRIMITIVES)), + Types.NestedField.required( + 25, "l2", ListType.ofRequired(26, SUPPORTED_PRIMITIVES)), + Types.NestedField.optional( + 27, + "m2", + MapType.ofOptional( + 28, 29, Types.StringType.get(), SUPPORTED_PRIMITIVES)))))); + + Schema schema = + new Schema( + TypeUtil.assignFreshIds(structType, new AtomicInteger(0)::incrementAndGet) + .asStructType() + .fields()); writeAndValidate(schema); } diff --git a/core/src/test/java/org/apache/iceberg/avro/AvroTestHelpers.java b/core/src/test/java/org/apache/iceberg/avro/AvroTestHelpers.java index 8b70499c70be..974c29c14f86 100644 --- a/core/src/test/java/org/apache/iceberg/avro/AvroTestHelpers.java +++ b/core/src/test/java/org/apache/iceberg/avro/AvroTestHelpers.java @@ -16,9 +16,10 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.avro; +import static org.apache.iceberg.avro.AvroSchemaUtil.toOption; + import java.util.Arrays; import java.util.List; import java.util.Map; @@ -30,16 +31,12 @@ import org.assertj.core.api.Assertions; import org.junit.Assert; -import static org.apache.iceberg.avro.AvroSchemaUtil.toOption; - class AvroTestHelpers { - private AvroTestHelpers() { - } + private AvroTestHelpers() {} static Schema.Field optionalField(int id, String name, Schema schema) { return addId(id, new Schema.Field(name, toOption(schema), null, JsonProperties.NULL_VALUE)); - } static Schema.Field requiredField(int id, String name, Schema schema) { @@ -130,7 +127,9 @@ private static void assertEquals(Type type, Object expected, Object actual) { Assert.assertEquals("Primitive value should be equal to expected", expected, actual); break; case STRUCT: - Assertions.assertThat(expected).as("Expected should be a Record").isInstanceOf(Record.class); + Assertions.assertThat(expected) + .as("Expected should be a Record") + .isInstanceOf(Record.class); Assertions.assertThat(actual).as("Actual should be a Record").isInstanceOf(Record.class); assertEquals(type.asStructType(), (Record) expected, (Record) actual); break; diff --git a/core/src/test/java/org/apache/iceberg/avro/RandomAvroData.java b/core/src/test/java/org/apache/iceberg/avro/RandomAvroData.java index 1a461e101559..473a468f7e94 100644 --- a/core/src/test/java/org/apache/iceberg/avro/RandomAvroData.java +++ b/core/src/test/java/org/apache/iceberg/avro/RandomAvroData.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.avro; import java.nio.ByteBuffer; @@ -40,8 +39,7 @@ public class RandomAvroData { - private RandomAvroData() { - } + private RandomAvroData() {} public static List generate(Schema schema, int numRecords, long seed) { RandomDataGenerator generator = new RandomDataGenerator(schema, seed); @@ -147,8 +145,7 @@ public Object primitive(Type.PrimitiveType primitive) { case STRING: return new Utf8((String) result); case FIXED: - return new GenericData.Fixed(typeToSchema.get(primitive), - (byte[]) result); + return new GenericData.Fixed(typeToSchema.get(primitive), (byte[]) result); case BINARY: return ByteBuffer.wrap((byte[]) result); case UUID: diff --git a/core/src/test/java/org/apache/iceberg/avro/TestAvroDataWriter.java b/core/src/test/java/org/apache/iceberg/avro/TestAvroDataWriter.java index 1d12fffe3734..c7a6a4da9386 100644 --- a/core/src/test/java/org/apache/iceberg/avro/TestAvroDataWriter.java +++ b/core/src/test/java/org/apache/iceberg/avro/TestAvroDataWriter.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.avro; import java.io.IOException; @@ -43,14 +42,14 @@ import org.junit.rules.TemporaryFolder; public class TestAvroDataWriter { - private static final Schema SCHEMA = new Schema( - Types.NestedField.required(1, "id", Types.LongType.get()), - Types.NestedField.optional(2, "data", Types.StringType.get())); + private static final Schema SCHEMA = + new Schema( + Types.NestedField.required(1, "id", Types.LongType.get()), + Types.NestedField.optional(2, "data", Types.StringType.get())); private List records; - @Rule - public TemporaryFolder temp = new TemporaryFolder(); + @Rule public TemporaryFolder temp = new TemporaryFolder(); @Before public void createRecords() { @@ -70,18 +69,16 @@ public void createRecords() { public void testDataWriter() throws IOException { OutputFile file = Files.localOutput(temp.newFile()); - SortOrder sortOrder = SortOrder.builderFor(SCHEMA) - .withOrderId(10) - .asc("id") - .build(); + SortOrder sortOrder = SortOrder.builderFor(SCHEMA).withOrderId(10).asc("id").build(); - DataWriter dataWriter = Avro.writeData(file) - .schema(SCHEMA) - .createWriterFunc(org.apache.iceberg.data.avro.DataWriter::create) - .overwrite() - .withSpec(PartitionSpec.unpartitioned()) - .withSortOrder(sortOrder) - .build(); + DataWriter dataWriter = + Avro.writeData(file) + .schema(SCHEMA) + .createWriterFunc(org.apache.iceberg.data.avro.DataWriter::create) + .overwrite() + .withSpec(PartitionSpec.unpartitioned()) + .withSortOrder(sortOrder) + .build(); try { for (Record record : records) { @@ -97,14 +94,16 @@ public void testDataWriter() throws IOException { Assert.assertEquals("Should be data file", FileContent.DATA, dataFile.content()); Assert.assertEquals("Record count should match", records.size(), dataFile.recordCount()); Assert.assertEquals("Partition should be empty", 0, dataFile.partition().size()); - Assert.assertEquals("Sort order should match", sortOrder.orderId(), (int) dataFile.sortOrderId()); + Assert.assertEquals( + "Sort order should match", sortOrder.orderId(), (int) dataFile.sortOrderId()); Assert.assertNull("Key metadata should be null", dataFile.keyMetadata()); List writtenRecords; - try (AvroIterable reader = Avro.read(file.toInputFile()) - .project(SCHEMA) - .createReaderFunc(org.apache.iceberg.data.avro.DataReader::create) - .build()) { + try (AvroIterable reader = + Avro.read(file.toInputFile()) + .project(SCHEMA) + 
.createReaderFunc(org.apache.iceberg.data.avro.DataReader::create) + .build()) { writtenRecords = Lists.newArrayList(reader); } diff --git a/core/src/test/java/org/apache/iceberg/avro/TestAvroDeleteWriters.java b/core/src/test/java/org/apache/iceberg/avro/TestAvroDeleteWriters.java index d772f54dd626..160fd2f81c8a 100644 --- a/core/src/test/java/org/apache/iceberg/avro/TestAvroDeleteWriters.java +++ b/core/src/test/java/org/apache/iceberg/avro/TestAvroDeleteWriters.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.avro; import java.io.File; @@ -49,14 +48,14 @@ import org.junit.rules.TemporaryFolder; public class TestAvroDeleteWriters { - private static final Schema SCHEMA = new Schema( - NestedField.required(1, "id", Types.LongType.get()), - NestedField.optional(2, "data", Types.StringType.get())); + private static final Schema SCHEMA = + new Schema( + NestedField.required(1, "id", Types.LongType.get()), + NestedField.optional(2, "data", Types.StringType.get())); private List records; - @Rule - public TemporaryFolder temp = new TemporaryFolder(); + @Rule public TemporaryFolder temp = new TemporaryFolder(); @Before public void createDeleteRecords() { @@ -75,13 +74,14 @@ public void createDeleteRecords() { @Test public void testEqualityDeleteWriter() throws IOException { OutputFile out = new InMemoryOutputFile(); - EqualityDeleteWriter deleteWriter = Avro.writeDeletes(out) - .createWriterFunc(DataWriter::create) - .overwrite() - .rowSchema(SCHEMA) - .withSpec(PartitionSpec.unpartitioned()) - .equalityFieldIds(1) - .buildEqualityWriter(); + EqualityDeleteWriter deleteWriter = + Avro.writeDeletes(out) + .createWriterFunc(DataWriter::create) + .overwrite() + .rowSchema(SCHEMA) + .withSpec(PartitionSpec.unpartitioned()) + .equalityFieldIds(1) + .buildEqualityWriter(); try (EqualityDeleteWriter writer = deleteWriter) { writer.deleteAll(records); @@ -89,16 +89,15 @@ public void testEqualityDeleteWriter() throws IOException { DeleteFile metadata = deleteWriter.toDeleteFile(); Assert.assertEquals("Format should be Avro", FileFormat.AVRO, metadata.format()); - Assert.assertEquals("Should be equality deletes", FileContent.EQUALITY_DELETES, metadata.content()); + Assert.assertEquals( + "Should be equality deletes", FileContent.EQUALITY_DELETES, metadata.content()); Assert.assertEquals("Record count should be correct", records.size(), metadata.recordCount()); Assert.assertEquals("Partition should be empty", 0, metadata.partition().size()); Assert.assertNull("Key metadata should be null", metadata.keyMetadata()); List deletedRecords; - try (AvroIterable reader = Avro.read(out.toInputFile()) - .project(SCHEMA) - .createReaderFunc(DataReader::create) - .build()) { + try (AvroIterable reader = + Avro.read(out.toInputFile()).project(SCHEMA).createReaderFunc(DataReader::create).build()) { deletedRecords = Lists.newArrayList(reader); } @@ -107,96 +106,104 @@ public void testEqualityDeleteWriter() throws IOException { @Test public void testPositionDeleteWriter() throws IOException { - Schema deleteSchema = new Schema( - MetadataColumns.DELETE_FILE_PATH, - MetadataColumns.DELETE_FILE_POS, - NestedField.optional(MetadataColumns.DELETE_FILE_ROW_FIELD_ID, "row", SCHEMA.asStruct())); + Schema deleteSchema = + new Schema( + MetadataColumns.DELETE_FILE_PATH, + MetadataColumns.DELETE_FILE_POS, + NestedField.optional( + MetadataColumns.DELETE_FILE_ROW_FIELD_ID, "row", SCHEMA.asStruct())); String deletePath = 
"s3://bucket/path/file.parquet"; GenericRecord posDelete = GenericRecord.create(deleteSchema); List expectedDeleteRecords = Lists.newArrayList(); OutputFile out = new InMemoryOutputFile(); - PositionDeleteWriter deleteWriter = Avro.writeDeletes(out) - .createWriterFunc(DataWriter::create) - .overwrite() - .rowSchema(SCHEMA) - .withSpec(PartitionSpec.unpartitioned()) - .buildPositionWriter(); + PositionDeleteWriter deleteWriter = + Avro.writeDeletes(out) + .createWriterFunc(DataWriter::create) + .overwrite() + .rowSchema(SCHEMA) + .withSpec(PartitionSpec.unpartitioned()) + .buildPositionWriter(); try (PositionDeleteWriter writer = deleteWriter) { for (int i = 0; i < records.size(); i += 1) { int pos = i * 3 + 2; writer.delete(deletePath, pos, records.get(i)); - expectedDeleteRecords.add(posDelete.copy(ImmutableMap.of( - "file_path", deletePath, - "pos", (long) pos, - "row", records.get(i)))); + expectedDeleteRecords.add( + posDelete.copy( + ImmutableMap.of( + "file_path", deletePath, "pos", (long) pos, "row", records.get(i)))); } } DeleteFile metadata = deleteWriter.toDeleteFile(); Assert.assertEquals("Format should be Avro", FileFormat.AVRO, metadata.format()); - Assert.assertEquals("Should be position deletes", FileContent.POSITION_DELETES, metadata.content()); + Assert.assertEquals( + "Should be position deletes", FileContent.POSITION_DELETES, metadata.content()); Assert.assertEquals("Record count should be correct", records.size(), metadata.recordCount()); Assert.assertEquals("Partition should be empty", 0, metadata.partition().size()); Assert.assertNull("Key metadata should be null", metadata.keyMetadata()); List deletedRecords; - try (AvroIterable reader = Avro.read(out.toInputFile()) - .project(deleteSchema) - .createReaderFunc(DataReader::create) - .build()) { + try (AvroIterable reader = + Avro.read(out.toInputFile()) + .project(deleteSchema) + .createReaderFunc(DataReader::create) + .build()) { deletedRecords = Lists.newArrayList(reader); } - Assert.assertEquals("Deleted records should match expected", expectedDeleteRecords, deletedRecords); + Assert.assertEquals( + "Deleted records should match expected", expectedDeleteRecords, deletedRecords); } @Test public void testPositionDeleteWriterWithEmptyRow() throws IOException { File deleteFile = temp.newFile(); - Schema deleteSchema = new Schema( - MetadataColumns.DELETE_FILE_PATH, - MetadataColumns.DELETE_FILE_POS); + Schema deleteSchema = + new Schema(MetadataColumns.DELETE_FILE_PATH, MetadataColumns.DELETE_FILE_POS); String deletePath = "s3://bucket/path/file.parquet"; GenericRecord posDelete = GenericRecord.create(deleteSchema); List expectedDeleteRecords = Lists.newArrayList(); OutputFile out = Files.localOutput(deleteFile); - PositionDeleteWriter deleteWriter = Avro.writeDeletes(out) - .createWriterFunc(DataWriter::create) - .overwrite() - .withSpec(PartitionSpec.unpartitioned()) - .buildPositionWriter(); + PositionDeleteWriter deleteWriter = + Avro.writeDeletes(out) + .createWriterFunc(DataWriter::create) + .overwrite() + .withSpec(PartitionSpec.unpartitioned()) + .buildPositionWriter(); try (PositionDeleteWriter writer = deleteWriter) { for (int i = 0; i < records.size(); i += 1) { int pos = i * 3 + 2; writer.delete(deletePath, pos, null); - expectedDeleteRecords.add(posDelete.copy(ImmutableMap.of( - "file_path", deletePath, - "pos", (long) pos))); + expectedDeleteRecords.add( + posDelete.copy(ImmutableMap.of("file_path", deletePath, "pos", (long) pos))); } } DeleteFile metadata = deleteWriter.toDeleteFile(); 
Assert.assertEquals("Format should be Avro", FileFormat.AVRO, metadata.format()); - Assert.assertEquals("Should be position deletes", FileContent.POSITION_DELETES, metadata.content()); + Assert.assertEquals( + "Should be position deletes", FileContent.POSITION_DELETES, metadata.content()); Assert.assertEquals("Record count should be correct", records.size(), metadata.recordCount()); Assert.assertEquals("Partition should be empty", 0, metadata.partition().size()); Assert.assertNull("Key metadata should be null", metadata.keyMetadata()); List deletedRecords; - try (AvroIterable reader = Avro.read(out.toInputFile()) - .project(deleteSchema) - .createReaderFunc(DataReader::create) - .build()) { + try (AvroIterable reader = + Avro.read(out.toInputFile()) + .project(deleteSchema) + .createReaderFunc(DataReader::create) + .build()) { deletedRecords = Lists.newArrayList(reader); } - Assert.assertEquals("Deleted records should match expected", expectedDeleteRecords, deletedRecords); + Assert.assertEquals( + "Deleted records should match expected", expectedDeleteRecords, deletedRecords); } } diff --git a/core/src/test/java/org/apache/iceberg/avro/TestAvroEncoderUtil.java b/core/src/test/java/org/apache/iceberg/avro/TestAvroEncoderUtil.java index 483493f77992..20accda172d2 100644 --- a/core/src/test/java/org/apache/iceberg/avro/TestAvroEncoderUtil.java +++ b/core/src/test/java/org/apache/iceberg/avro/TestAvroEncoderUtil.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.avro; import java.io.IOException; @@ -39,8 +38,10 @@ protected void writeAndValidate(org.apache.iceberg.Schema schema) throws IOExcep byte[] serializedData = AvroEncoderUtil.encode(record, avroSchema); GenericData.Record expectedRecord = AvroEncoderUtil.decode(serializedData); - // Fallback to compare the record's string, because its equals implementation will depend on the avro schema. - // While the avro schema will convert the 'map' type to be a list of key/value pairs for non-string keys, it + // Fallback to compare the record's string, because its equals implementation will depend on + // the avro schema. + // While the avro schema will convert the 'map' type to be a list of key/value pairs for + // non-string keys, it // would be failing to read the 'array' from a 'map'. Assert.assertEquals(expectedRecord.toString(), record.toString()); diff --git a/core/src/test/java/org/apache/iceberg/avro/TestAvroEnums.java b/core/src/test/java/org/apache/iceberg/avro/TestAvroEnums.java index 5afd2a6e5de1..4ce5f9094d5d 100644 --- a/core/src/test/java/org/apache/iceberg/avro/TestAvroEnums.java +++ b/core/src/test/java/org/apache/iceberg/avro/TestAvroEnums.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.avro; import java.io.File; @@ -38,20 +37,20 @@ public class TestAvroEnums { - @Rule - public TemporaryFolder temp = new TemporaryFolder(); + @Rule public TemporaryFolder temp = new TemporaryFolder(); @Test public void writeAndValidateEnums() throws IOException { - org.apache.avro.Schema avroSchema = SchemaBuilder.record("root") - .fields() - .name("enumCol") - .type() - .nullable() - .enumeration("testEnum") - .symbols("SYMB1", "SYMB2") - .enumDefault("SYMB2") - .endRecord(); + org.apache.avro.Schema avroSchema = + SchemaBuilder.record("root") + .fields() + .name("enumCol") + .type() + .nullable() + .enumeration("testEnum") + .symbols("SYMB1", "SYMB2") + .enumDefault("SYMB2") + .endRecord(); org.apache.avro.Schema enumSchema = avroSchema.getField("enumCol").schema().getTypes().get(0); Record enumRecord1 = new GenericData.Record(avroSchema); @@ -73,11 +72,13 @@ public void writeAndValidateEnums() throws IOException { Schema schema = new Schema(AvroSchemaUtil.convert(avroSchema).asStructType().fields()); List rows; - try (AvroIterable reader = Avro.read(Files.localInput(testFile)).project(schema).build()) { + try (AvroIterable reader = + Avro.read(Files.localInput(testFile)).project(schema).build()) { rows = Lists.newArrayList(reader); } - // Iceberg will return enums as strings, so compare String value of enum field instead of comparing Record objects + // Iceberg will return enums as strings, so compare String value of enum field instead of + // comparing Record objects for (int i = 0; i < expected.size(); i += 1) { String expectedEnumString = expected.get(i).get("enumCol") == null ? null : expected.get(i).get("enumCol").toString(); diff --git a/core/src/test/java/org/apache/iceberg/avro/TestAvroFileSplit.java b/core/src/test/java/org/apache/iceberg/avro/TestAvroFileSplit.java index 135fe9fb7a15..99e4e69a119e 100644 --- a/core/src/test/java/org/apache/iceberg/avro/TestAvroFileSplit.java +++ b/core/src/test/java/org/apache/iceberg/avro/TestAvroFileSplit.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.avro; import java.io.IOException; @@ -44,14 +43,14 @@ import org.junit.rules.TemporaryFolder; public class TestAvroFileSplit { - private static final Schema SCHEMA = new Schema( - NestedField.required(1, "id", Types.LongType.get()), - NestedField.required(2, "data", Types.StringType.get())); + private static final Schema SCHEMA = + new Schema( + NestedField.required(1, "id", Types.LongType.get()), + NestedField.required(2, "data", Types.StringType.get())); private static final int NUM_RECORDS = 100_000; - @Rule - public TemporaryFolder temp = new TemporaryFolder(); + @Rule public TemporaryFolder temp = new TemporaryFolder(); public List expected = null; public InputFile file = null; @@ -62,18 +61,17 @@ public void writeDataFile() throws IOException { OutputFile out = Files.localOutput(temp.newFile()); - try (FileAppender writer = Avro.write(out) - .set(TableProperties.AVRO_COMPRESSION, "uncompressed") - .createWriterFunc(DataWriter::create) - .schema(SCHEMA) - .overwrite() - .build()) { + try (FileAppender writer = + Avro.write(out) + .set(TableProperties.AVRO_COMPRESSION, "uncompressed") + .createWriterFunc(DataWriter::create) + .schema(SCHEMA) + .overwrite() + .build()) { Record record = GenericRecord.create(SCHEMA); for (long i = 0; i < NUM_RECORDS; i += 1) { - Record next = record.copy(ImmutableMap.of( - "id", i, - "data", UUID.randomUUID().toString())); + Record next = record.copy(ImmutableMap.of("id", i, "data", UUID.randomUUID().toString())); expected.add(next); writer.add(next); } @@ -93,8 +91,10 @@ public void testSplitDataSkipping() throws IOException { List secondHalf = readAvro(file, SCHEMA, splitLocation + 1, end - splitLocation - 1); Assert.assertNotEquals("Second split should not be empty", 0, secondHalf.size()); - Assert.assertEquals("Total records should match expected", - expected.size(), firstHalf.size() + secondHalf.size()); + Assert.assertEquals( + "Total records should match expected", + expected.size(), + firstHalf.size() + secondHalf.size()); for (int i = 0; i < firstHalf.size(); i += 1) { Assert.assertEquals(expected.get(i), firstHalf.get(i)); @@ -107,67 +107,78 @@ public void testSplitDataSkipping() throws IOException { @Test public void testPosField() throws IOException { - Schema projection = new Schema( - SCHEMA.columns().get(0), - MetadataColumns.ROW_POSITION, - SCHEMA.columns().get(1)); + Schema projection = + new Schema(SCHEMA.columns().get(0), MetadataColumns.ROW_POSITION, SCHEMA.columns().get(1)); List records = readAvro(file, projection, 0, file.getLength()); for (int i = 0; i < expected.size(); i += 1) { - Assert.assertEquals("Field _pos should match", - (long) i, records.get(i).getField(MetadataColumns.ROW_POSITION.name())); - Assert.assertEquals("Field id should match", - expected.get(i).getField("id"), records.get(i).getField("id")); - Assert.assertEquals("Field data should match", - expected.get(i).getField("data"), records.get(i).getField("data")); + Assert.assertEquals( + "Field _pos should match", + (long) i, + records.get(i).getField(MetadataColumns.ROW_POSITION.name())); + Assert.assertEquals( + "Field id should match", expected.get(i).getField("id"), records.get(i).getField("id")); + Assert.assertEquals( + "Field data should match", + expected.get(i).getField("data"), + records.get(i).getField("data")); } } @Test public void testPosFieldWithSplits() throws IOException { - Schema projection = new Schema( - SCHEMA.columns().get(0), - MetadataColumns.ROW_POSITION, - SCHEMA.columns().get(1)); + Schema projection = + 
new Schema(SCHEMA.columns().get(0), MetadataColumns.ROW_POSITION, SCHEMA.columns().get(1)); long end = file.getLength(); long splitLocation = end / 2; - List secondHalf = readAvro(file, projection, splitLocation + 1, end - splitLocation - 1); + List secondHalf = + readAvro(file, projection, splitLocation + 1, end - splitLocation - 1); Assert.assertNotEquals("Second split should not be empty", 0, secondHalf.size()); List firstHalf = readAvro(file, projection, 0, splitLocation); Assert.assertNotEquals("First split should not be empty", 0, firstHalf.size()); - Assert.assertEquals("Total records should match expected", - expected.size(), firstHalf.size() + secondHalf.size()); + Assert.assertEquals( + "Total records should match expected", + expected.size(), + firstHalf.size() + secondHalf.size()); for (int i = 0; i < firstHalf.size(); i += 1) { - Assert.assertEquals("Field _pos should match", - (long) i, firstHalf.get(i).getField(MetadataColumns.ROW_POSITION.name())); - Assert.assertEquals("Field id should match", - expected.get(i).getField("id"), firstHalf.get(i).getField("id")); - Assert.assertEquals("Field data should match", - expected.get(i).getField("data"), firstHalf.get(i).getField("data")); + Assert.assertEquals( + "Field _pos should match", + (long) i, + firstHalf.get(i).getField(MetadataColumns.ROW_POSITION.name())); + Assert.assertEquals( + "Field id should match", expected.get(i).getField("id"), firstHalf.get(i).getField("id")); + Assert.assertEquals( + "Field data should match", + expected.get(i).getField("data"), + firstHalf.get(i).getField("data")); } for (int i = 0; i < secondHalf.size(); i += 1) { - Assert.assertEquals("Field _pos should match", - (long) (firstHalf.size() + i), secondHalf.get(i).getField(MetadataColumns.ROW_POSITION.name())); - Assert.assertEquals("Field id should match", - expected.get(firstHalf.size() + i).getField("id"), secondHalf.get(i).getField("id")); - Assert.assertEquals("Field data should match", - expected.get(firstHalf.size() + i).getField("data"), secondHalf.get(i).getField("data")); + Assert.assertEquals( + "Field _pos should match", + (long) (firstHalf.size() + i), + secondHalf.get(i).getField(MetadataColumns.ROW_POSITION.name())); + Assert.assertEquals( + "Field id should match", + expected.get(firstHalf.size() + i).getField("id"), + secondHalf.get(i).getField("id")); + Assert.assertEquals( + "Field data should match", + expected.get(firstHalf.size() + i).getField("data"), + secondHalf.get(i).getField("data")); } } @Test public void testPosWithEOFSplit() throws IOException { - Schema projection = new Schema( - SCHEMA.columns().get(0), - MetadataColumns.ROW_POSITION, - SCHEMA.columns().get(1)); + Schema projection = + new Schema(SCHEMA.columns().get(0), MetadataColumns.ROW_POSITION, SCHEMA.columns().get(1)); long end = file.getLength(); @@ -175,12 +186,14 @@ public void testPosWithEOFSplit() throws IOException { Assert.assertEquals("Should not read any records", 0, records.size()); } - public List readAvro(InputFile in, Schema projection, long start, long length) throws IOException { - try (AvroIterable reader = Avro.read(in) - .createReaderFunc(DataReader::create) - .split(start, length) - .project(projection) - .build()) { + public List readAvro(InputFile in, Schema projection, long start, long length) + throws IOException { + try (AvroIterable reader = + Avro.read(in) + .createReaderFunc(DataReader::create) + .split(start, length) + .project(projection) + .build()) { return Lists.newArrayList(reader); } } diff --git 
a/core/src/test/java/org/apache/iceberg/avro/TestAvroNameMapping.java b/core/src/test/java/org/apache/iceberg/avro/TestAvroNameMapping.java index f3e75e1f2a92..26f8625ea1f1 100644 --- a/core/src/test/java/org/apache/iceberg/avro/TestAvroNameMapping.java +++ b/core/src/test/java/org/apache/iceberg/avro/TestAvroNameMapping.java @@ -16,9 +16,10 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.avro; +import static org.apache.avro.generic.GenericData.Record; + import java.io.File; import java.io.IOException; import java.util.List; @@ -43,69 +44,86 @@ import org.junit.Assert; import org.junit.Test; -import static org.apache.avro.generic.GenericData.Record; - public class TestAvroNameMapping extends TestAvroReadProjection { @Test public void testMapProjections() throws IOException { - Schema writeSchema = new Schema( - Types.NestedField.required(0, "id", Types.LongType.get()), - Types.NestedField.optional(5, "location", Types.MapType.ofOptional(6, 7, - Types.StringType.get(), - Types.StructType.of( - Types.NestedField.required(1, "lat", Types.FloatType.get()), - Types.NestedField.optional(2, "long", Types.FloatType.get()) - ) - ))); + Schema writeSchema = + new Schema( + Types.NestedField.required(0, "id", Types.LongType.get()), + Types.NestedField.optional( + 5, + "location", + Types.MapType.ofOptional( + 6, + 7, + Types.StringType.get(), + Types.StructType.of( + Types.NestedField.required(1, "lat", Types.FloatType.get()), + Types.NestedField.optional(2, "long", Types.FloatType.get()))))); Record record = new Record(AvroSchemaUtil.convert(writeSchema, "table")); record.put("id", 34L); - Record location = new Record(AvroSchemaUtil.fromOption( - AvroSchemaUtil.fromOption(record.getSchema().getField("location").schema()) - .getValueType())); + Record location = + new Record( + AvroSchemaUtil.fromOption( + AvroSchemaUtil.fromOption(record.getSchema().getField("location").schema()) + .getValueType())); location.put("lat", 52.995143f); location.put("long", -1.539054f); record.put("location", ImmutableMap.of("l1", location)); // Table mapping does not project `location` map - NameMapping nameMapping = MappingUtil.create(new Schema( - Types.NestedField.required(0, "id", Types.LongType.get()))); + NameMapping nameMapping = + MappingUtil.create(new Schema(Types.NestedField.required(0, "id", Types.LongType.get()))); Schema readSchema = writeSchema; Record projected = writeAndRead(writeSchema, readSchema, record, nameMapping); // field id 5 comes from read schema - Assert.assertNotNull("Field missing from table mapping is renamed", projected.getSchema().getField("location_r5")); + Assert.assertNotNull( + "Field missing from table mapping is renamed", + projected.getSchema().getField("location_r5")); Assert.assertNull("location field should not be read", projected.get("location_r5")); Assert.assertEquals(34L, projected.get("id")); // Table mapping partially project `location` map value - nameMapping = MappingUtil.create(new Schema( - Types.NestedField.required(0, "id", Types.LongType.get()), - Types.NestedField.optional(5, "location", Types.MapType.ofOptional(6, 7, - Types.StringType.get(), - Types.StructType.of( - Types.NestedField.required(1, "lat", Types.FloatType.get())))))); + nameMapping = + MappingUtil.create( + new Schema( + Types.NestedField.required(0, "id", Types.LongType.get()), + Types.NestedField.optional( + 5, + "location", + Types.MapType.ofOptional( + 6, + 7, + Types.StringType.get(), + Types.StructType.of( + 
Types.NestedField.required(1, "lat", Types.FloatType.get())))))); projected = writeAndRead(writeSchema, readSchema, record, nameMapping); Record projectedL1 = ((Map) projected.get("location")).get("l1"); - Assert.assertNotNull("Field missing from table mapping is renamed", projectedL1.getSchema().getField("long_r2")); + Assert.assertNotNull( + "Field missing from table mapping is renamed", projectedL1.getSchema().getField("long_r2")); Assert.assertNull("location.value.long, should not be read", projectedL1.get("long_r2")); } @Test public void testComplexMapKeys() throws IOException { - Schema writeSchema = new Schema( - Types.NestedField.required(5, "location", Types.MapType.ofRequired(6, 7, - Types.StructType.of( - Types.NestedField.required(3, "k1", Types.StringType.get()), - Types.NestedField.required(4, "k2", Types.StringType.get()) - ), - Types.StructType.of( - Types.NestedField.required(1, "lat", Types.FloatType.get()), - Types.NestedField.optional(2, "long", Types.FloatType.get()) - ) - ))); + Schema writeSchema = + new Schema( + Types.NestedField.required( + 5, + "location", + Types.MapType.ofRequired( + 6, + 7, + Types.StructType.of( + Types.NestedField.required(3, "k1", Types.StringType.get()), + Types.NestedField.required(4, "k2", Types.StringType.get())), + Types.StructType.of( + Types.NestedField.required(1, "lat", Types.FloatType.get()), + Types.NestedField.optional(2, "long", Types.FloatType.get()))))); Record record = new Record(AvroSchemaUtil.convert(writeSchema, "table")); org.apache.avro.Schema locationSchema = record.getSchema().getField("location").schema(); @@ -122,36 +140,45 @@ public void testComplexMapKeys() throws IOException { record.put("location", ImmutableList.of(locationElement)); // project a subset of the map's value columns in NameMapping - NameMapping nameMapping = MappingUtil.create(new Schema( - Types.NestedField.required(5, "location", Types.MapType.ofOptional(6, 7, - Types.StructType.of( - Types.NestedField.required(3, "k1", Types.StringType.get()), - Types.NestedField.optional(4, "k2", Types.StringType.get()) - ), - Types.StructType.of( - Types.NestedField.required(1, "lat", Types.FloatType.get()) - ) - )))); - - Schema readSchema = new Schema( - Types.NestedField.required(5, "location", Types.MapType.ofOptional(6, 7, - Types.StructType.of( - Types.NestedField.required(3, "k1", Types.StringType.get()), - Types.NestedField.optional(4, "k2", Types.StringType.get()) - ), - Types.StructType.of( - Types.NestedField.required(1, "lat", Types.FloatType.get()), - Types.NestedField.optional(2, "long", Types.FloatType.get()) - ) - ))); + NameMapping nameMapping = + MappingUtil.create( + new Schema( + Types.NestedField.required( + 5, + "location", + Types.MapType.ofOptional( + 6, + 7, + Types.StructType.of( + Types.NestedField.required(3, "k1", Types.StringType.get()), + Types.NestedField.optional(4, "k2", Types.StringType.get())), + Types.StructType.of( + Types.NestedField.required(1, "lat", Types.FloatType.get())))))); + + Schema readSchema = + new Schema( + Types.NestedField.required( + 5, + "location", + Types.MapType.ofOptional( + 6, + 7, + Types.StructType.of( + Types.NestedField.required(3, "k1", Types.StringType.get()), + Types.NestedField.optional(4, "k2", Types.StringType.get())), + Types.StructType.of( + Types.NestedField.required(1, "lat", Types.FloatType.get()), + Types.NestedField.optional(2, "long", Types.FloatType.get()))))); Record projected = writeAndRead(writeSchema, readSchema, record, nameMapping); // The data is read back as a map Map 
projectedLocation = (Map) projected.get("location"); Record projectedKey = projectedLocation.keySet().iterator().next(); Record projectedValue = projectedLocation.values().iterator().next(); - Assert.assertEquals(0, Comparators.charSequences().compare("k1", (CharSequence) projectedKey.get("k1"))); - Assert.assertEquals(0, Comparators.charSequences().compare("k2", (CharSequence) projectedKey.get("k2"))); + Assert.assertEquals( + 0, Comparators.charSequences().compare("k1", (CharSequence) projectedKey.get("k1"))); + Assert.assertEquals( + 0, Comparators.charSequences().compare("k2", (CharSequence) projectedKey.get("k2"))); Assert.assertEquals(52.995143f, projectedValue.get("lat")); Assert.assertNotNull(projectedValue.getSchema().getField("long_r2")); Assert.assertNull(projectedValue.get("long_r2")); @@ -159,69 +186,86 @@ public void testComplexMapKeys() throws IOException { @Test public void testMissingRequiredFields() { - Schema writeSchema = new Schema( - Types.NestedField.required(19, "x", Types.IntegerType.get()), - Types.NestedField.optional(18, "y", Types.IntegerType.get())); + Schema writeSchema = + new Schema( + Types.NestedField.required(19, "x", Types.IntegerType.get()), + Types.NestedField.optional(18, "y", Types.IntegerType.get())); Record record = new Record(AvroSchemaUtil.convert(writeSchema, "table")); record.put("x", 1); record.put("y", 2); // table mapping not projecting a required field 'x' - NameMapping nameMapping = MappingUtil.create(new Schema( - Types.NestedField.optional(18, "y", Types.IntegerType.get()))); + NameMapping nameMapping = + MappingUtil.create( + new Schema(Types.NestedField.optional(18, "y", Types.IntegerType.get()))); Schema readSchema = writeSchema; - AssertHelpers.assertThrows("Missing required field in nameMapping", - IllegalArgumentException.class, "Missing required field: x", + AssertHelpers.assertThrows( + "Missing required field in nameMapping", + IllegalArgumentException.class, + "Missing required field: x", // In this case, pruneColumns result is an empty record () -> writeAndRead(writeSchema, readSchema, record, nameMapping)); } @Test public void testArrayProjections() throws Exception { - Schema writeSchema = new Schema( - Types.NestedField.required(0, "id", Types.LongType.get()), - Types.NestedField.optional(22, "point", - Types.ListType.ofOptional(21, Types.StructType.of( - Types.NestedField.required(19, "x", Types.IntegerType.get()), - Types.NestedField.optional(18, "y", Types.IntegerType.get()) - )) - ) - ); + Schema writeSchema = + new Schema( + Types.NestedField.required(0, "id", Types.LongType.get()), + Types.NestedField.optional( + 22, + "point", + Types.ListType.ofOptional( + 21, + Types.StructType.of( + Types.NestedField.required(19, "x", Types.IntegerType.get()), + Types.NestedField.optional(18, "y", Types.IntegerType.get()))))); Record record = new Record(AvroSchemaUtil.convert(writeSchema, "table")); record.put("id", 34L); - Record pointRecord = new Record(AvroSchemaUtil.fromOption( - AvroSchemaUtil.fromOption(record.getSchema().getField("point").schema()).getElementType())); + Record pointRecord = + new Record( + AvroSchemaUtil.fromOption( + AvroSchemaUtil.fromOption(record.getSchema().getField("point").schema()) + .getElementType())); pointRecord.put("x", 1); pointRecord.put("y", 2); record.put("point", ImmutableList.of(pointRecord)); - NameMapping nameMapping = MappingUtil.create(new Schema( - // Optional array field missing. 
- Types.NestedField.required(0, "id", Types.LongType.get()))); + NameMapping nameMapping = + MappingUtil.create( + new Schema( + // Optional array field missing. + Types.NestedField.required(0, "id", Types.LongType.get()))); Schema readSchema = writeSchema; Record projected = writeAndRead(writeSchema, readSchema, record, nameMapping); - Assert.assertNotNull("Field missing from table mapping is renamed", projected.getSchema().getField("point_r22")); + Assert.assertNotNull( + "Field missing from table mapping is renamed", projected.getSchema().getField("point_r22")); Assert.assertNull("point field is not projected", projected.get("point_r22")); Assert.assertEquals(34L, projected.get("id")); // point array is partially projected - nameMapping = MappingUtil.create(new Schema( - Types.NestedField.required(0, "id", Types.LongType.get()), - Types.NestedField.optional(22, "point", - Types.ListType.ofOptional(21, Types.StructType.of( - Types.NestedField.required(19, "x", Types.IntegerType.get()))) - ) - )); + nameMapping = + MappingUtil.create( + new Schema( + Types.NestedField.required(0, "id", Types.LongType.get()), + Types.NestedField.optional( + 22, + "point", + Types.ListType.ofOptional( + 21, + Types.StructType.of( + Types.NestedField.required(19, "x", Types.IntegerType.get())))))); projected = writeAndRead(writeSchema, readSchema, record, nameMapping); Record point = ((List) projected.get("point")).get(0); - Assert.assertNotNull("Field missing from table mapping is renamed", point.getSchema().getField("y_r18")); + Assert.assertNotNull( + "Field missing from table mapping is renamed", point.getSchema().getField("y_r18")); Assert.assertEquals("point.x is projected", 1, point.get("x")); Assert.assertNull("point.y is not projected", point.get("y_r18")); Assert.assertEquals(34L, projected.get("id")); @@ -229,47 +273,74 @@ public void testArrayProjections() throws Exception { @Test public void testAliases() throws IOException { - Schema writeSchema = new Schema( - Types.NestedField.optional(22, "points", - Types.ListType.ofOptional(21, Types.StructType.of( - Types.NestedField.required(19, "x", Types.IntegerType.get()))))); + Schema writeSchema = + new Schema( + Types.NestedField.optional( + 22, + "points", + Types.ListType.ofOptional( + 21, + Types.StructType.of( + Types.NestedField.required(19, "x", Types.IntegerType.get()))))); Record record = new Record(AvroSchemaUtil.convert(writeSchema, "table")); - Record pointRecord = new Record(AvroSchemaUtil.fromOption( - AvroSchemaUtil.fromOption(record.getSchema().getField("points").schema()).getElementType())); + Record pointRecord = + new Record( + AvroSchemaUtil.fromOption( + AvroSchemaUtil.fromOption(record.getSchema().getField("points").schema()) + .getElementType())); pointRecord.put("x", 1); record.put("points", ImmutableList.of(pointRecord)); - NameMapping nameMapping = NameMapping.of( - MappedFields.of( - MappedField.of(22, "points", MappedFields.of( - MappedField.of(21, "element", MappedFields.of( - MappedField.of(19, Lists.newArrayList("x")))))))); - - Schema readSchema = new Schema( - Types.NestedField.optional(22, "points", - Types.ListType.ofOptional(21, Types.StructType.of( - // x renamed to y - Types.NestedField.required(19, "y", Types.IntegerType.get()))))); + NameMapping nameMapping = + NameMapping.of( + MappedFields.of( + MappedField.of( + 22, + "points", + MappedFields.of( + MappedField.of( + 21, + "element", + MappedFields.of(MappedField.of(19, Lists.newArrayList("x")))))))); + + Schema readSchema = + new Schema( + 
Types.NestedField.optional( + 22, + "points", + Types.ListType.ofOptional( + 21, + Types.StructType.of( + // x renamed to y + Types.NestedField.required(19, "y", Types.IntegerType.get()))))); Record projected = writeAndRead(writeSchema, readSchema, record, nameMapping); - Assert.assertEquals("x is read as y", 1, ((List) projected.get("points")).get(0).get("y")); - - readSchema = new Schema( - Types.NestedField.optional(22, "points", - Types.ListType.ofOptional(21, Types.StructType.of( - // x renamed to z - Types.NestedField.required(19, "z", Types.IntegerType.get()))))); + Assert.assertEquals( + "x is read as y", 1, ((List) projected.get("points")).get(0).get("y")); + + readSchema = + new Schema( + Types.NestedField.optional( + 22, + "points", + Types.ListType.ofOptional( + 21, + Types.StructType.of( + // x renamed to z + Types.NestedField.required(19, "z", Types.IntegerType.get()))))); projected = writeAndRead(writeSchema, readSchema, record, nameMapping); - Assert.assertEquals("x is read as z", 1, ((List) projected.get("points")).get(0).get("z")); + Assert.assertEquals( + "x is read as z", 1, ((List) projected.get("points")).get(0).get("z")); } @Test public void testInferredMapping() throws IOException { - Schema writeSchema = new Schema( - Types.NestedField.required(0, "id", Types.LongType.get()), - Types.NestedField.optional(1, "data", Types.StringType.get())); + Schema writeSchema = + new Schema( + Types.NestedField.required(0, "id", Types.LongType.get()), + Types.NestedField.optional(1, "data", Types.StringType.get())); Record record = new Record(AvroSchemaUtil.convert(writeSchema, "table")); record.put("id", 34L); @@ -288,27 +359,23 @@ public void testAvroArrayAsLogicalMap() { } @Override - protected Record writeAndRead(String desc, - Schema writeSchema, - Schema readSchema, - Record inputRecord) throws IOException { + protected Record writeAndRead( + String desc, Schema writeSchema, Schema readSchema, Record inputRecord) throws IOException { // Use all existing TestAvroReadProjection tests to verify that // we get the same projected Avro record whether we use // NameMapping together with file schema without field-ids or we // use a file schema having field-ids Record record = super.writeAndRead(desc, writeSchema, readSchema, inputRecord); - Record projectedWithNameMapping = writeAndRead( - writeSchema, readSchema, inputRecord, MappingUtil.create(writeSchema)); + Record projectedWithNameMapping = + writeAndRead(writeSchema, readSchema, inputRecord, MappingUtil.create(writeSchema)); Assert.assertEquals(record, projectedWithNameMapping); return record; } - - private Record writeAndRead(Schema writeSchema, - Schema readSchema, - Record record, - NameMapping nameMapping) throws IOException { + private Record writeAndRead( + Schema writeSchema, Schema readSchema, Record record, NameMapping nameMapping) + throws IOException { File file = temp.newFile(); // Write without file ids @@ -319,10 +386,8 @@ private Record writeAndRead(Schema writeSchema, dataFileWriter.append(record); } - Iterable records = Avro.read(Files.localInput(file)) - .project(readSchema) - .withNameMapping(nameMapping) - .build(); + Iterable records = + Avro.read(Files.localInput(file)).project(readSchema).withNameMapping(nameMapping).build(); return Iterables.getOnlyElement(records); } diff --git a/core/src/test/java/org/apache/iceberg/avro/TestAvroOptionsWithNonNullDefaults.java b/core/src/test/java/org/apache/iceberg/avro/TestAvroOptionsWithNonNullDefaults.java index 13efc18a7727..a6f0c393a98a 100644 --- 
a/core/src/test/java/org/apache/iceberg/avro/TestAvroOptionsWithNonNullDefaults.java +++ b/core/src/test/java/org/apache/iceberg/avro/TestAvroOptionsWithNonNullDefaults.java @@ -16,9 +16,12 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.avro; +import static org.apache.avro.Schema.Type.INT; +import static org.apache.avro.Schema.Type.LONG; +import static org.apache.avro.Schema.Type.NULL; + import java.io.File; import java.io.IOException; import java.util.List; @@ -34,23 +37,24 @@ import org.junit.Test; import org.junit.rules.TemporaryFolder; -import static org.apache.avro.Schema.Type.INT; -import static org.apache.avro.Schema.Type.LONG; -import static org.apache.avro.Schema.Type.NULL; - public class TestAvroOptionsWithNonNullDefaults { - @Rule - public TemporaryFolder temp = new TemporaryFolder(); + @Rule public TemporaryFolder temp = new TemporaryFolder(); @Test public void writeAndValidateOptionWithNonNullDefaultsPruning() throws IOException { - Schema writeSchema = Schema.createRecord("root", null, null, false, - ImmutableList.of( - new Schema.Field("field", Schema.createUnion(Schema.createArray(Schema.create(INT)), Schema.create(NULL)), - null, ImmutableList.of()) - ) - ); + Schema writeSchema = + Schema.createRecord( + "root", + null, + null, + false, + ImmutableList.of( + new Schema.Field( + "field", + Schema.createUnion(Schema.createArray(Schema.create(INT)), Schema.create(NULL)), + null, + ImmutableList.of()))); GenericData.Record record1 = new GenericData.Record(writeSchema); record1.put("field", ImmutableList.of(1, 2, 3)); @@ -60,7 +64,8 @@ public void writeAndValidateOptionWithNonNullDefaultsPruning() throws IOExceptio File testFile = temp.newFile(); Assert.assertTrue("Delete should succeed", testFile.delete()); - try (DataFileWriter writer = new DataFileWriter<>(new GenericDatumWriter<>())) { + try (DataFileWriter writer = + new DataFileWriter<>(new GenericDatumWriter<>())) { writer.create(writeSchema, testFile); writer.append(record1); writer.append(record2); @@ -70,8 +75,8 @@ public void writeAndValidateOptionWithNonNullDefaultsPruning() throws IOExceptio org.apache.iceberg.Schema readIcebergSchema = AvroSchemaUtil.toIceberg(writeSchema); List rows; - try (AvroIterable reader = Avro.read(Files.localInput(testFile)) - .project(readIcebergSchema).build()) { + try (AvroIterable reader = + Avro.read(Files.localInput(testFile)).project(readIcebergSchema).build()) { rows = Lists.newArrayList(reader); } @@ -82,11 +87,18 @@ public void writeAndValidateOptionWithNonNullDefaultsPruning() throws IOExceptio @Test public void writeAndValidateOptionWithNonNullDefaultsEvolution() throws IOException { - Schema writeSchema = Schema.createRecord("root", null, null, false, - ImmutableList.of( - new Schema.Field("field", Schema.createUnion(Schema.create(INT), Schema.create(NULL)), null, -1) - ) - ); + Schema writeSchema = + Schema.createRecord( + "root", + null, + null, + false, + ImmutableList.of( + new Schema.Field( + "field", + Schema.createUnion(Schema.create(INT), Schema.create(NULL)), + null, + -1))); GenericData.Record record1 = new GenericData.Record(writeSchema); record1.put("field", 1); @@ -96,17 +108,25 @@ public void writeAndValidateOptionWithNonNullDefaultsEvolution() throws IOExcept File testFile = temp.newFile(); Assert.assertTrue("Delete should succeed", testFile.delete()); - try (DataFileWriter writer = new DataFileWriter<>(new GenericDatumWriter<>())) { + try (DataFileWriter writer = + new DataFileWriter<>(new 
GenericDatumWriter<>())) { writer.create(writeSchema, testFile); writer.append(record1); writer.append(record2); } - Schema readSchema = Schema.createRecord("root", null, null, false, - ImmutableList.of( - new Schema.Field("field", Schema.createUnion(Schema.create(LONG), Schema.create(NULL)), null, -1L) - ) - ); + Schema readSchema = + Schema.createRecord( + "root", + null, + null, + false, + ImmutableList.of( + new Schema.Field( + "field", + Schema.createUnion(Schema.create(LONG), Schema.create(NULL)), + null, + -1L))); GenericData.Record expectedRecord1 = new GenericData.Record(readSchema); expectedRecord1.put("field", 1L); @@ -116,8 +136,8 @@ public void writeAndValidateOptionWithNonNullDefaultsEvolution() throws IOExcept org.apache.iceberg.Schema readIcebergSchema = AvroSchemaUtil.toIceberg(readSchema); List rows; - try (AvroIterable reader = Avro.read(Files.localInput(testFile)) - .project(readIcebergSchema).build()) { + try (AvroIterable reader = + Avro.read(Files.localInput(testFile)).project(readIcebergSchema).build()) { rows = Lists.newArrayList(reader); } diff --git a/core/src/test/java/org/apache/iceberg/avro/TestAvroReadProjection.java b/core/src/test/java/org/apache/iceberg/avro/TestAvroReadProjection.java index 65edefaf60da..0049357def77 100644 --- a/core/src/test/java/org/apache/iceberg/avro/TestAvroReadProjection.java +++ b/core/src/test/java/org/apache/iceberg/avro/TestAvroReadProjection.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.avro; import java.io.File; @@ -36,46 +35,50 @@ public class TestAvroReadProjection extends TestReadProjection { @Override - protected GenericData.Record writeAndRead(String desc, - Schema writeSchema, - Schema readSchema, - GenericData.Record record) + protected GenericData.Record writeAndRead( + String desc, Schema writeSchema, Schema readSchema, GenericData.Record record) throws IOException { File file = temp.newFile(desc + ".avro"); file.delete(); - try (FileAppender appender = Avro.write(Files.localOutput(file)) - .schema(writeSchema) - .build()) { + try (FileAppender appender = + Avro.write(Files.localOutput(file)).schema(writeSchema).build()) { appender.add(record); } - Iterable records = Avro.read(Files.localInput(file)) - .project(readSchema) - .build(); + Iterable records = + Avro.read(Files.localInput(file)).project(readSchema).build(); return Iterables.getOnlyElement(records); } @Test public void testAvroArrayAsLogicalMap() throws IOException { - Schema writeSchema = new Schema( - Types.NestedField.optional(0, "map", Types.MapType.ofOptional(2, 3, - Types.LongType.get(), - Types.ListType.ofRequired(1, Types.LongType.get()) - )) - ); + Schema writeSchema = + new Schema( + Types.NestedField.optional( + 0, + "map", + Types.MapType.ofOptional( + 2, + 3, + Types.LongType.get(), + Types.ListType.ofRequired(1, Types.LongType.get())))); List values1 = ImmutableList.of(101L, 102L); List values2 = ImmutableList.of(201L, 202L, 203L); - GenericData.Record record = new GenericData.Record(AvroSchemaUtil.convert(writeSchema, "table")); + GenericData.Record record = + new GenericData.Record(AvroSchemaUtil.convert(writeSchema, "table")); record.put("map", ImmutableMap.of(100L, values1, 200L, values2)); - GenericData.Record projected = writeAndRead("full_projection", writeSchema, writeSchema, record); - Assert.assertEquals("Should contain correct value list", + GenericData.Record projected = + writeAndRead("full_projection", writeSchema, writeSchema, record); + 
Assert.assertEquals( + "Should contain correct value list", values1, ((Map>) projected.get("map")).get(100L)); - Assert.assertEquals("Should contain correct value list", + Assert.assertEquals( + "Should contain correct value list", values2, ((Map>) projected.get("map")).get(200L)); } diff --git a/core/src/test/java/org/apache/iceberg/avro/TestAvroSchemaProjection.java b/core/src/test/java/org/apache/iceberg/avro/TestAvroSchemaProjection.java index 71dc9a6cb17e..37731d7a9f18 100644 --- a/core/src/test/java/org/apache/iceberg/avro/TestAvroSchemaProjection.java +++ b/core/src/test/java/org/apache/iceberg/avro/TestAvroSchemaProjection.java @@ -16,33 +16,63 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.avro; +import static org.junit.Assert.assertFalse; + import java.util.Collections; import org.apache.avro.SchemaBuilder; import org.apache.iceberg.Schema; import org.junit.Test; -import static org.junit.Assert.assertFalse; - public class TestAvroSchemaProjection { @Test public void projectWithListSchemaChanged() { - final org.apache.avro.Schema currentAvroSchema = SchemaBuilder.record("myrecord").namespace("unit.test").fields() - .name("f1").type().nullable().array().items( - SchemaBuilder.record("elem").fields() - .name("f11").type().nullable().intType().noDefault().endRecord()) - .noDefault().endRecord(); - - final org.apache.avro.Schema updatedAvroSchema = SchemaBuilder.record("myrecord").namespace("unit.test").fields() - .name("f1").type().nullable().array().items( - SchemaBuilder.record("elem").fields() - .name("f11").type().nullable().intType().noDefault() - .name("f12").type().nullable().stringType().noDefault() - .endRecord()) - .noDefault().endRecord(); + final org.apache.avro.Schema currentAvroSchema = + SchemaBuilder.record("myrecord") + .namespace("unit.test") + .fields() + .name("f1") + .type() + .nullable() + .array() + .items( + SchemaBuilder.record("elem") + .fields() + .name("f11") + .type() + .nullable() + .intType() + .noDefault() + .endRecord()) + .noDefault() + .endRecord(); + + final org.apache.avro.Schema updatedAvroSchema = + SchemaBuilder.record("myrecord") + .namespace("unit.test") + .fields() + .name("f1") + .type() + .nullable() + .array() + .items( + SchemaBuilder.record("elem") + .fields() + .name("f11") + .type() + .nullable() + .intType() + .noDefault() + .name("f12") + .type() + .nullable() + .stringType() + .noDefault() + .endRecord()) + .noDefault() + .endRecord(); final Schema currentIcebergSchema = AvroSchemaUtil.toIceberg(currentAvroSchema); @@ -51,28 +81,60 @@ public void projectWithListSchemaChanged() { AvroSchemaUtil.convert(AvroSchemaUtil.toIceberg(updatedAvroSchema).asStruct()); final org.apache.avro.Schema projectedAvroSchema = - AvroSchemaUtil.buildAvroProjection(idAllocatedUpdatedAvroSchema, currentIcebergSchema, Collections.emptyMap()); + AvroSchemaUtil.buildAvroProjection( + idAllocatedUpdatedAvroSchema, currentIcebergSchema, Collections.emptyMap()); - assertFalse("Result of buildAvroProjection is missing some IDs", + assertFalse( + "Result of buildAvroProjection is missing some IDs", AvroSchemaUtil.missingIds(projectedAvroSchema)); } - @Test public void projectWithMapSchemaChanged() { - final org.apache.avro.Schema currentAvroSchema = SchemaBuilder.record("myrecord").namespace("unit.test").fields() - .name("f1").type().nullable().map().values( - SchemaBuilder.record("elem").fields() - .name("f11").type().nullable().intType().noDefault().endRecord()) - .noDefault().endRecord(); - - 
final org.apache.avro.Schema updatedAvroSchema = SchemaBuilder.record("myrecord").namespace("unit.test").fields() - .name("f1").type().nullable().map().values( - SchemaBuilder.record("elem").fields() - .name("f11").type().nullable().intType().noDefault() - .name("f12").type().nullable().stringType().noDefault() - .endRecord()) - .noDefault().endRecord(); + final org.apache.avro.Schema currentAvroSchema = + SchemaBuilder.record("myrecord") + .namespace("unit.test") + .fields() + .name("f1") + .type() + .nullable() + .map() + .values( + SchemaBuilder.record("elem") + .fields() + .name("f11") + .type() + .nullable() + .intType() + .noDefault() + .endRecord()) + .noDefault() + .endRecord(); + + final org.apache.avro.Schema updatedAvroSchema = + SchemaBuilder.record("myrecord") + .namespace("unit.test") + .fields() + .name("f1") + .type() + .nullable() + .map() + .values( + SchemaBuilder.record("elem") + .fields() + .name("f11") + .type() + .nullable() + .intType() + .noDefault() + .name("f12") + .type() + .nullable() + .stringType() + .noDefault() + .endRecord()) + .noDefault() + .endRecord(); final Schema currentIcebergSchema = AvroSchemaUtil.toIceberg(currentAvroSchema); @@ -81,10 +143,11 @@ public void projectWithMapSchemaChanged() { AvroSchemaUtil.convert(AvroSchemaUtil.toIceberg(updatedAvroSchema).asStruct()); final org.apache.avro.Schema projectedAvroSchema = - AvroSchemaUtil.buildAvroProjection(idAllocatedUpdatedAvroSchema, currentIcebergSchema, Collections.emptyMap()); + AvroSchemaUtil.buildAvroProjection( + idAllocatedUpdatedAvroSchema, currentIcebergSchema, Collections.emptyMap()); - assertFalse("Result of buildAvroProjection is missing some IDs", + assertFalse( + "Result of buildAvroProjection is missing some IDs", AvroSchemaUtil.missingIds(projectedAvroSchema)); } - } diff --git a/core/src/test/java/org/apache/iceberg/avro/TestBuildAvroProjection.java b/core/src/test/java/org/apache/iceberg/avro/TestBuildAvroProjection.java index 79a54f886c1d..edee46685e32 100644 --- a/core/src/test/java/org/apache/iceberg/avro/TestBuildAvroProjection.java +++ b/core/src/test/java/org/apache/iceberg/avro/TestBuildAvroProjection.java @@ -16,9 +16,11 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.avro; +import static org.apache.iceberg.types.Types.NestedField.optional; +import static org.junit.Assert.assertEquals; + import java.util.Collections; import java.util.function.Supplier; import org.apache.avro.SchemaBuilder; @@ -26,234 +28,386 @@ import org.apache.iceberg.types.Types; import org.junit.Test; -import static org.apache.iceberg.types.Types.NestedField.optional; -import static org.junit.Assert.assertEquals; - public class TestBuildAvroProjection { @Test public void projectArrayWithElementSchemaUnchanged() { - final Type icebergType = Types.ListType.ofRequired(0, - Types.StructType.of( - optional(1, "int1", Types.IntegerType.get()), - optional(2, "string1", Types.StringType.get()) - ) - ); - - final org.apache.avro.Schema expected = SchemaBuilder.array().prop(AvroSchemaUtil.ELEMENT_ID_PROP, "0") - .items( - SchemaBuilder.record("elem").namespace("unit.test").fields() - .name("int1").prop(AvroSchemaUtil.FIELD_ID_PROP, "1").type().nullable().intType().noDefault() - .name("string1").prop(AvroSchemaUtil.FIELD_ID_PROP, "2").type().nullable().stringType().noDefault() - .endRecord()); - - final BuildAvroProjection testSubject = new BuildAvroProjection(icebergType, Collections.emptyMap()); + final Type icebergType = + Types.ListType.ofRequired( + 0, + Types.StructType.of( + optional(1, "int1", Types.IntegerType.get()), + optional(2, "string1", Types.StringType.get()))); + + final org.apache.avro.Schema expected = + SchemaBuilder.array() + .prop(AvroSchemaUtil.ELEMENT_ID_PROP, "0") + .items( + SchemaBuilder.record("elem") + .namespace("unit.test") + .fields() + .name("int1") + .prop(AvroSchemaUtil.FIELD_ID_PROP, "1") + .type() + .nullable() + .intType() + .noDefault() + .name("string1") + .prop(AvroSchemaUtil.FIELD_ID_PROP, "2") + .type() + .nullable() + .stringType() + .noDefault() + .endRecord()); + + final BuildAvroProjection testSubject = + new BuildAvroProjection(icebergType, Collections.emptyMap()); final Supplier supplier = expected::getElementType; final org.apache.avro.Schema actual = testSubject.array(expected, supplier); - assertEquals("Array projection produced undesired array schema", - expected, actual); - assertEquals("Unexpected element ID discovered on the projected array schema", - 0, Integer.valueOf(actual.getProp(AvroSchemaUtil.ELEMENT_ID_PROP)).intValue()); + assertEquals("Array projection produced undesired array schema", expected, actual); + assertEquals( + "Unexpected element ID discovered on the projected array schema", + 0, + Integer.valueOf(actual.getProp(AvroSchemaUtil.ELEMENT_ID_PROP)).intValue()); } @Test public void projectArrayWithExtraFieldInElementSchema() { - final Type icebergType = Types.ListType.ofRequired(0, - Types.StructType.of( - optional(1, "int1", Types.IntegerType.get()), - optional(2, "string1", Types.StringType.get()) - ) - ); - - final org.apache.avro.Schema extraField = SchemaBuilder.array().prop(AvroSchemaUtil.ELEMENT_ID_PROP, "0") - .items( - SchemaBuilder.record("elem").namespace("unit.test").fields() - .name("int1").prop(AvroSchemaUtil.FIELD_ID_PROP, "1").type().nullable().intType().noDefault() - .name("string1").prop(AvroSchemaUtil.FIELD_ID_PROP, "2").type().nullable().stringType().noDefault() - .name("float1").prop(AvroSchemaUtil.FIELD_ID_PROP, "3").type().nullable().floatType().noDefault() - .endRecord()); + final Type icebergType = + Types.ListType.ofRequired( + 0, + Types.StructType.of( + optional(1, "int1", Types.IntegerType.get()), + optional(2, "string1", Types.StringType.get()))); + + final 
org.apache.avro.Schema extraField = + SchemaBuilder.array() + .prop(AvroSchemaUtil.ELEMENT_ID_PROP, "0") + .items( + SchemaBuilder.record("elem") + .namespace("unit.test") + .fields() + .name("int1") + .prop(AvroSchemaUtil.FIELD_ID_PROP, "1") + .type() + .nullable() + .intType() + .noDefault() + .name("string1") + .prop(AvroSchemaUtil.FIELD_ID_PROP, "2") + .type() + .nullable() + .stringType() + .noDefault() + .name("float1") + .prop(AvroSchemaUtil.FIELD_ID_PROP, "3") + .type() + .nullable() + .floatType() + .noDefault() + .endRecord()); // once projected onto iceberg schema, the avro schema will lose the extra float field - final org.apache.avro.Schema expected = SchemaBuilder.array().prop(AvroSchemaUtil.ELEMENT_ID_PROP, "0") - .items( - SchemaBuilder.record("elem").namespace("unit.test").fields() - .name("int1").prop(AvroSchemaUtil.FIELD_ID_PROP, "1").type().nullable().intType().noDefault() - .name("string1").prop(AvroSchemaUtil.FIELD_ID_PROP, "2").type().nullable().stringType().noDefault() - .endRecord()); - - final BuildAvroProjection testSubject = new BuildAvroProjection(icebergType, Collections.emptyMap()); + final org.apache.avro.Schema expected = + SchemaBuilder.array() + .prop(AvroSchemaUtil.ELEMENT_ID_PROP, "0") + .items( + SchemaBuilder.record("elem") + .namespace("unit.test") + .fields() + .name("int1") + .prop(AvroSchemaUtil.FIELD_ID_PROP, "1") + .type() + .nullable() + .intType() + .noDefault() + .name("string1") + .prop(AvroSchemaUtil.FIELD_ID_PROP, "2") + .type() + .nullable() + .stringType() + .noDefault() + .endRecord()); + + final BuildAvroProjection testSubject = + new BuildAvroProjection(icebergType, Collections.emptyMap()); final Supplier supplier = expected::getElementType; final org.apache.avro.Schema actual = testSubject.array(extraField, supplier); - assertEquals("Array projection produced undesired array schema", - expected, actual); - assertEquals("Unexpected element ID discovered on the projected array schema", - 0, Integer.valueOf(actual.getProp(AvroSchemaUtil.ELEMENT_ID_PROP)).intValue()); + assertEquals("Array projection produced undesired array schema", expected, actual); + assertEquals( + "Unexpected element ID discovered on the projected array schema", + 0, + Integer.valueOf(actual.getProp(AvroSchemaUtil.ELEMENT_ID_PROP)).intValue()); } @Test public void projectArrayWithLessFieldInElementSchema() { - final Type icebergType = Types.ListType.ofRequired(0, - Types.StructType.of( - optional(1, "int1", Types.IntegerType.get()), - optional(2, "string1", Types.StringType.get()) - ) - ); - - final org.apache.avro.Schema lessField = SchemaBuilder.array().prop(AvroSchemaUtil.ELEMENT_ID_PROP, "0") - .items( - SchemaBuilder.record("elem").namespace("unit.test").fields() - .name("int1").prop(AvroSchemaUtil.FIELD_ID_PROP, "1").type().nullable().intType().noDefault() - .endRecord()); + final Type icebergType = + Types.ListType.ofRequired( + 0, + Types.StructType.of( + optional(1, "int1", Types.IntegerType.get()), + optional(2, "string1", Types.StringType.get()))); + + final org.apache.avro.Schema lessField = + SchemaBuilder.array() + .prop(AvroSchemaUtil.ELEMENT_ID_PROP, "0") + .items( + SchemaBuilder.record("elem") + .namespace("unit.test") + .fields() + .name("int1") + .prop(AvroSchemaUtil.FIELD_ID_PROP, "1") + .type() + .nullable() + .intType() + .noDefault() + .endRecord()); // once projected onto iceberg schema, the avro schema will have an extra string column - final org.apache.avro.Schema expected = SchemaBuilder.array().prop(AvroSchemaUtil.ELEMENT_ID_PROP, "0") 
- .items( - SchemaBuilder.record("elem").namespace("unit.test").fields() - .name("int1").prop(AvroSchemaUtil.FIELD_ID_PROP, "1").type().nullable().intType().noDefault() - .name("string1_r").prop(AvroSchemaUtil.FIELD_ID_PROP, "2").type().nullable().stringType().noDefault() - .endRecord()); - - final BuildAvroProjection testSubject = new BuildAvroProjection(icebergType, Collections.emptyMap()); + final org.apache.avro.Schema expected = + SchemaBuilder.array() + .prop(AvroSchemaUtil.ELEMENT_ID_PROP, "0") + .items( + SchemaBuilder.record("elem") + .namespace("unit.test") + .fields() + .name("int1") + .prop(AvroSchemaUtil.FIELD_ID_PROP, "1") + .type() + .nullable() + .intType() + .noDefault() + .name("string1_r") + .prop(AvroSchemaUtil.FIELD_ID_PROP, "2") + .type() + .nullable() + .stringType() + .noDefault() + .endRecord()); + + final BuildAvroProjection testSubject = + new BuildAvroProjection(icebergType, Collections.emptyMap()); final Supplier supplier = expected::getElementType; final org.apache.avro.Schema actual = testSubject.array(lessField, supplier); - assertEquals("Array projection produced undesired array schema", - expected, actual); - assertEquals("Unexpected element ID discovered on the projected array schema", - 0, Integer.valueOf(actual.getProp(AvroSchemaUtil.ELEMENT_ID_PROP)).intValue()); + assertEquals("Array projection produced undesired array schema", expected, actual); + assertEquals( + "Unexpected element ID discovered on the projected array schema", + 0, + Integer.valueOf(actual.getProp(AvroSchemaUtil.ELEMENT_ID_PROP)).intValue()); } @Test public void projectMapWithValueSchemaUnchanged() { - final Type icebergType = Types.MapType.ofRequired(0, 1, - Types.StringType.get(), - Types.StructType.of( - optional(2, "int1", Types.IntegerType.get()), - optional(3, "string1", Types.StringType.get()) - ) - ); - - final org.apache.avro.Schema expected = SchemaBuilder.map() - .prop(AvroSchemaUtil.KEY_ID_PROP, "0") - .prop(AvroSchemaUtil.VALUE_ID_PROP, "1") - .values( - SchemaBuilder.record("value").namespace("unit.test").fields() - .name("int1").prop(AvroSchemaUtil.FIELD_ID_PROP, "1").type().nullable().intType().noDefault() - .name("string1").prop(AvroSchemaUtil.FIELD_ID_PROP, "2").type().nullable().stringType().noDefault() - .endRecord()); - - final BuildAvroProjection testSubject = new BuildAvroProjection(icebergType, Collections.emptyMap()); + final Type icebergType = + Types.MapType.ofRequired( + 0, + 1, + Types.StringType.get(), + Types.StructType.of( + optional(2, "int1", Types.IntegerType.get()), + optional(3, "string1", Types.StringType.get()))); + + final org.apache.avro.Schema expected = + SchemaBuilder.map() + .prop(AvroSchemaUtil.KEY_ID_PROP, "0") + .prop(AvroSchemaUtil.VALUE_ID_PROP, "1") + .values( + SchemaBuilder.record("value") + .namespace("unit.test") + .fields() + .name("int1") + .prop(AvroSchemaUtil.FIELD_ID_PROP, "1") + .type() + .nullable() + .intType() + .noDefault() + .name("string1") + .prop(AvroSchemaUtil.FIELD_ID_PROP, "2") + .type() + .nullable() + .stringType() + .noDefault() + .endRecord()); + + final BuildAvroProjection testSubject = + new BuildAvroProjection(icebergType, Collections.emptyMap()); final Supplier supplier = expected::getValueType; final org.apache.avro.Schema actual = testSubject.map(expected, supplier); - assertEquals("Map projection produced undesired map schema", - expected, actual); - assertEquals("Unexpected key ID discovered on the projected map schema", - 0, Integer.valueOf(actual.getProp(AvroSchemaUtil.KEY_ID_PROP)).intValue()); - 
assertEquals("Unexpected value ID discovered on the projected map schema", - 1, Integer.valueOf(actual.getProp(AvroSchemaUtil.VALUE_ID_PROP)).intValue()); + assertEquals("Map projection produced undesired map schema", expected, actual); + assertEquals( + "Unexpected key ID discovered on the projected map schema", + 0, + Integer.valueOf(actual.getProp(AvroSchemaUtil.KEY_ID_PROP)).intValue()); + assertEquals( + "Unexpected value ID discovered on the projected map schema", + 1, + Integer.valueOf(actual.getProp(AvroSchemaUtil.VALUE_ID_PROP)).intValue()); } @Test public void projectMapWithExtraFieldInValueSchema() { - final Type icebergType = Types.MapType.ofRequired(0, 1, - Types.StringType.get(), - Types.StructType.of( - optional(2, "int1", Types.IntegerType.get()), - optional(3, "string1", Types.StringType.get()) - ) - ); - - final org.apache.avro.Schema extraField = SchemaBuilder.map() - .prop(AvroSchemaUtil.KEY_ID_PROP, "0") - .prop(AvroSchemaUtil.VALUE_ID_PROP, "1") - .values( - SchemaBuilder.record("value").namespace("unit.test").fields() - .name("int1").prop(AvroSchemaUtil.FIELD_ID_PROP, "1").type().nullable().intType().noDefault() - .name("string1").prop(AvroSchemaUtil.FIELD_ID_PROP, "2").type().nullable().stringType().noDefault() - .name("float1").prop(AvroSchemaUtil.FIELD_ID_PROP, "3").type().nullable().floatType().noDefault() - .endRecord()); + final Type icebergType = + Types.MapType.ofRequired( + 0, + 1, + Types.StringType.get(), + Types.StructType.of( + optional(2, "int1", Types.IntegerType.get()), + optional(3, "string1", Types.StringType.get()))); + + final org.apache.avro.Schema extraField = + SchemaBuilder.map() + .prop(AvroSchemaUtil.KEY_ID_PROP, "0") + .prop(AvroSchemaUtil.VALUE_ID_PROP, "1") + .values( + SchemaBuilder.record("value") + .namespace("unit.test") + .fields() + .name("int1") + .prop(AvroSchemaUtil.FIELD_ID_PROP, "1") + .type() + .nullable() + .intType() + .noDefault() + .name("string1") + .prop(AvroSchemaUtil.FIELD_ID_PROP, "2") + .type() + .nullable() + .stringType() + .noDefault() + .name("float1") + .prop(AvroSchemaUtil.FIELD_ID_PROP, "3") + .type() + .nullable() + .floatType() + .noDefault() + .endRecord()); // once projected onto iceberg schema, the avro schema will lose the extra float field - final org.apache.avro.Schema expected = SchemaBuilder.map() - .prop(AvroSchemaUtil.KEY_ID_PROP, "0") - .prop(AvroSchemaUtil.VALUE_ID_PROP, "1") - .values( - SchemaBuilder.record("value").namespace("unit.test").fields() - .name("int1").prop(AvroSchemaUtil.FIELD_ID_PROP, "1").type().nullable().intType().noDefault() - .name("string1").prop(AvroSchemaUtil.FIELD_ID_PROP, "2").type().nullable().stringType().noDefault() - .endRecord()); - - final BuildAvroProjection testSubject = new BuildAvroProjection(icebergType, Collections.emptyMap()); + final org.apache.avro.Schema expected = + SchemaBuilder.map() + .prop(AvroSchemaUtil.KEY_ID_PROP, "0") + .prop(AvroSchemaUtil.VALUE_ID_PROP, "1") + .values( + SchemaBuilder.record("value") + .namespace("unit.test") + .fields() + .name("int1") + .prop(AvroSchemaUtil.FIELD_ID_PROP, "1") + .type() + .nullable() + .intType() + .noDefault() + .name("string1") + .prop(AvroSchemaUtil.FIELD_ID_PROP, "2") + .type() + .nullable() + .stringType() + .noDefault() + .endRecord()); + + final BuildAvroProjection testSubject = + new BuildAvroProjection(icebergType, Collections.emptyMap()); final Supplier supplier = expected::getValueType; final org.apache.avro.Schema actual = testSubject.map(extraField, supplier); - assertEquals("Map projection 
produced undesired map schema", - expected, actual); - assertEquals("Unexpected key ID discovered on the projected map schema", - 0, Integer.valueOf(actual.getProp(AvroSchemaUtil.KEY_ID_PROP)).intValue()); - assertEquals("Unexpected value ID discovered on the projected map schema", - 1, Integer.valueOf(actual.getProp(AvroSchemaUtil.VALUE_ID_PROP)).intValue()); + assertEquals("Map projection produced undesired map schema", expected, actual); + assertEquals( + "Unexpected key ID discovered on the projected map schema", + 0, + Integer.valueOf(actual.getProp(AvroSchemaUtil.KEY_ID_PROP)).intValue()); + assertEquals( + "Unexpected value ID discovered on the projected map schema", + 1, + Integer.valueOf(actual.getProp(AvroSchemaUtil.VALUE_ID_PROP)).intValue()); } - @Test public void projectMapWithLessFieldInValueSchema() { - final Type icebergType = Types.MapType.ofRequired(0, 1, - Types.StringType.get(), - Types.StructType.of( - optional(2, "int1", Types.IntegerType.get()), - optional(3, "string1", Types.StringType.get()) - ) - ); - - final org.apache.avro.Schema lessField = SchemaBuilder.map() - .prop(AvroSchemaUtil.KEY_ID_PROP, "0") - .prop(AvroSchemaUtil.VALUE_ID_PROP, "1") - .values( - SchemaBuilder.record("value").namespace("unit.test").fields() - .name("int1").prop(AvroSchemaUtil.FIELD_ID_PROP, "1").type().nullable().intType().noDefault() - .endRecord()); + final Type icebergType = + Types.MapType.ofRequired( + 0, + 1, + Types.StringType.get(), + Types.StructType.of( + optional(2, "int1", Types.IntegerType.get()), + optional(3, "string1", Types.StringType.get()))); + + final org.apache.avro.Schema lessField = + SchemaBuilder.map() + .prop(AvroSchemaUtil.KEY_ID_PROP, "0") + .prop(AvroSchemaUtil.VALUE_ID_PROP, "1") + .values( + SchemaBuilder.record("value") + .namespace("unit.test") + .fields() + .name("int1") + .prop(AvroSchemaUtil.FIELD_ID_PROP, "1") + .type() + .nullable() + .intType() + .noDefault() + .endRecord()); // once projected onto iceberg schema, the avro schema will have an extra string column - final org.apache.avro.Schema expected = SchemaBuilder.map() - .prop(AvroSchemaUtil.KEY_ID_PROP, "0") - .prop(AvroSchemaUtil.VALUE_ID_PROP, "1") - .values( - SchemaBuilder.record("value").namespace("unit.test").fields() - .name("int1").prop(AvroSchemaUtil.FIELD_ID_PROP, "1").type().nullable().intType().noDefault() - .name("string1_r2").prop(AvroSchemaUtil.FIELD_ID_PROP, "2").type().nullable().stringType().noDefault() - .endRecord()); - - final BuildAvroProjection testSubject = new BuildAvroProjection(icebergType, Collections.emptyMap()); + final org.apache.avro.Schema expected = + SchemaBuilder.map() + .prop(AvroSchemaUtil.KEY_ID_PROP, "0") + .prop(AvroSchemaUtil.VALUE_ID_PROP, "1") + .values( + SchemaBuilder.record("value") + .namespace("unit.test") + .fields() + .name("int1") + .prop(AvroSchemaUtil.FIELD_ID_PROP, "1") + .type() + .nullable() + .intType() + .noDefault() + .name("string1_r2") + .prop(AvroSchemaUtil.FIELD_ID_PROP, "2") + .type() + .nullable() + .stringType() + .noDefault() + .endRecord()); + + final BuildAvroProjection testSubject = + new BuildAvroProjection(icebergType, Collections.emptyMap()); final Supplier supplier = expected::getValueType; final org.apache.avro.Schema actual = testSubject.map(lessField, supplier); - assertEquals("Map projection produced undesired map schema", - expected, actual); - assertEquals("Unexpected key ID discovered on the projected map schema", - 0, Integer.valueOf(actual.getProp(AvroSchemaUtil.KEY_ID_PROP)).intValue()); - 
assertEquals("Unexpected value ID discovered on the projected map schema", - 1, Integer.valueOf(actual.getProp(AvroSchemaUtil.VALUE_ID_PROP)).intValue()); + assertEquals("Map projection produced undesired map schema", expected, actual); + assertEquals( + "Unexpected key ID discovered on the projected map schema", + 0, + Integer.valueOf(actual.getProp(AvroSchemaUtil.KEY_ID_PROP)).intValue()); + assertEquals( + "Unexpected value ID discovered on the projected map schema", + 1, + Integer.valueOf(actual.getProp(AvroSchemaUtil.VALUE_ID_PROP)).intValue()); } } diff --git a/core/src/test/java/org/apache/iceberg/avro/TestGenericAvro.java b/core/src/test/java/org/apache/iceberg/avro/TestGenericAvro.java index f88c89127c88..db4e29943cb7 100644 --- a/core/src/test/java/org/apache/iceberg/avro/TestGenericAvro.java +++ b/core/src/test/java/org/apache/iceberg/avro/TestGenericAvro.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.avro; import java.io.IOException; @@ -34,19 +33,16 @@ protected void writeAndValidate(Schema schema) throws IOException { List expected = RandomAvroData.generate(schema, 100, 0L); OutputFile outputFile = new InMemoryOutputFile(); - try (FileAppender writer = Avro.write(outputFile) - .schema(schema) - .named("test") - .build()) { + try (FileAppender writer = + Avro.write(outputFile).schema(schema).named("test").build()) { for (Record rec : expected) { writer.add(rec); } } List rows; - try (AvroIterable reader = Avro.read(outputFile.toInputFile()) - .project(schema) - .build()) { + try (AvroIterable reader = + Avro.read(outputFile.toInputFile()).project(schema).build()) { rows = Lists.newArrayList(reader); } diff --git a/core/src/test/java/org/apache/iceberg/avro/TestHasIds.java b/core/src/test/java/org/apache/iceberg/avro/TestHasIds.java index a9b84db3ae71..dfbab7e37dc3 100644 --- a/core/src/test/java/org/apache/iceberg/avro/TestHasIds.java +++ b/core/src/test/java/org/apache/iceberg/avro/TestHasIds.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.avro; import org.apache.iceberg.Schema; @@ -27,14 +26,19 @@ public class TestHasIds { @Test public void test() { - Schema schema = new Schema( - Types.NestedField.required(0, "id", Types.LongType.get()), - Types.NestedField.optional(5, "location", Types.MapType.ofOptional(6, 7, - Types.StringType.get(), - Types.StructType.of( - Types.NestedField.required(1, "lat", Types.FloatType.get()), - Types.NestedField.optional(2, "long", Types.FloatType.get()) - )))); + Schema schema = + new Schema( + Types.NestedField.required(0, "id", Types.LongType.get()), + Types.NestedField.optional( + 5, + "location", + Types.MapType.ofOptional( + 6, + 7, + Types.StringType.get(), + Types.StructType.of( + Types.NestedField.required(1, "lat", Types.FloatType.get()), + Types.NestedField.optional(2, "long", Types.FloatType.get()))))); org.apache.avro.Schema avroSchema = RemoveIds.removeIds(schema); Assert.assertFalse(AvroSchemaUtil.hasIds(avroSchema)); @@ -45,9 +49,16 @@ public void test() { // Create a fresh copy avroSchema = RemoveIds.removeIds(schema); avroSchema - .getFields().get(1).schema() - .getTypes().get(1).getValueType() - .getTypes().get(1).getFields().get(1) + .getFields() + .get(1) + .schema() + .getTypes() + .get(1) + .getValueType() + .getTypes() + .get(1) + .getFields() + .get(1) .addProp("field-id", 1); Assert.assertTrue(AvroSchemaUtil.hasIds(avroSchema)); } diff --git a/core/src/test/java/org/apache/iceberg/avro/TestReadProjection.java b/core/src/test/java/org/apache/iceberg/avro/TestReadProjection.java index 6c86155a0acd..6b8ba00891b0 100644 --- a/core/src/test/java/org/apache/iceberg/avro/TestReadProjection.java +++ b/core/src/test/java/org/apache/iceberg/avro/TestReadProjection.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.avro; import java.io.IOException; @@ -37,20 +36,17 @@ import org.junit.rules.TemporaryFolder; public abstract class TestReadProjection { - protected abstract Record writeAndRead(String desc, - Schema writeSchema, - Schema readSchema, - Record record) throws IOException; + protected abstract Record writeAndRead( + String desc, Schema writeSchema, Schema readSchema, Record record) throws IOException; - @Rule - public TemporaryFolder temp = new TemporaryFolder(); + @Rule public TemporaryFolder temp = new TemporaryFolder(); @Test public void testFullProjection() throws Exception { - Schema schema = new Schema( - Types.NestedField.required(0, "id", Types.LongType.get()), - Types.NestedField.optional(1, "data", Types.StringType.get()) - ); + Schema schema = + new Schema( + Types.NestedField.required(0, "id", Types.LongType.get()), + Types.NestedField.optional(1, "data", Types.StringType.get())); Record record = new Record(AvroSchemaUtil.convert(schema, "table")); record.put("id", 34L); @@ -60,26 +56,25 @@ public void testFullProjection() throws Exception { Assert.assertEquals("Should contain the correct id value", 34L, (long) projected.get("id")); - int cmp = Comparators.charSequences() - .compare("test", (CharSequence) projected.get("data")); + int cmp = Comparators.charSequences().compare("test", (CharSequence) projected.get("data")); Assert.assertTrue("Should contain the correct data value", cmp == 0); } @Test public void testReorderedFullProjection() throws Exception { - Schema schema = new Schema( - Types.NestedField.required(0, "id", Types.LongType.get()), - Types.NestedField.optional(1, "data", Types.StringType.get()) - ); + Schema schema = + new Schema( + Types.NestedField.required(0, "id", Types.LongType.get()), + Types.NestedField.optional(1, "data", Types.StringType.get())); Record record = new Record(AvroSchemaUtil.convert(schema, "table")); record.put("id", 34L); record.put("data", "test"); - Schema reordered = new Schema( - Types.NestedField.optional(1, "data", Types.StringType.get()), - Types.NestedField.required(0, "id", Types.LongType.get()) - ); + Schema reordered = + new Schema( + Types.NestedField.optional(1, "data", Types.StringType.get()), + Types.NestedField.required(0, "id", Types.LongType.get())); Record projected = writeAndRead("full_projection", schema, reordered, record); @@ -89,20 +84,20 @@ public void testReorderedFullProjection() throws Exception { @Test public void testReorderedProjection() throws Exception { - Schema schema = new Schema( - Types.NestedField.required(0, "id", Types.LongType.get()), - Types.NestedField.optional(1, "data", Types.StringType.get()) - ); + Schema schema = + new Schema( + Types.NestedField.required(0, "id", Types.LongType.get()), + Types.NestedField.optional(1, "data", Types.StringType.get())); Record record = new Record(AvroSchemaUtil.convert(schema, "table")); record.put("id", 34L); record.put("data", "test"); - Schema reordered = new Schema( - Types.NestedField.optional(2, "missing_1", Types.StringType.get()), - Types.NestedField.optional(1, "data", Types.StringType.get()), - Types.NestedField.optional(3, "missing_2", Types.LongType.get()) - ); + Schema reordered = + new Schema( + Types.NestedField.optional(2, "missing_1", Types.StringType.get()), + Types.NestedField.optional(1, "data", Types.StringType.get()), + Types.NestedField.optional(3, "missing_2", Types.LongType.get())); Record projected = writeAndRead("full_projection", schema, reordered, record); @@ -113,10 +108,10 @@ public void 
testReorderedProjection() throws Exception { @Test public void testEmptyProjection() throws Exception { - Schema schema = new Schema( - Types.NestedField.required(0, "id", Types.LongType.get()), - Types.NestedField.optional(1, "data", Types.StringType.get()) - ); + Schema schema = + new Schema( + Types.NestedField.required(0, "id", Types.LongType.get()), + Types.NestedField.optional(1, "data", Types.StringType.get())); Record record = new Record(AvroSchemaUtil.convert(schema, "table")); record.put("id", 34L); @@ -135,131 +130,129 @@ public void testEmptyProjection() throws Exception { @Test public void testBasicProjection() throws Exception { - Schema writeSchema = new Schema( - Types.NestedField.required(0, "id", Types.LongType.get()), - Types.NestedField.optional(1, "data", Types.StringType.get()) - ); + Schema writeSchema = + new Schema( + Types.NestedField.required(0, "id", Types.LongType.get()), + Types.NestedField.optional(1, "data", Types.StringType.get())); Record record = new Record(AvroSchemaUtil.convert(writeSchema, "table")); record.put("id", 34L); record.put("data", "test"); - Schema idOnly = new Schema( - Types.NestedField.required(0, "id", Types.LongType.get()) - ); + Schema idOnly = new Schema(Types.NestedField.required(0, "id", Types.LongType.get())); Record projected = writeAndRead("basic_projection_id", writeSchema, idOnly, record); AssertHelpers.assertEmptyAvroField(projected, "data"); Assert.assertEquals("Should contain the correct id value", 34L, (long) projected.get("id")); - Schema dataOnly = new Schema( - Types.NestedField.optional(1, "data", Types.StringType.get()) - ); + Schema dataOnly = new Schema(Types.NestedField.optional(1, "data", Types.StringType.get())); projected = writeAndRead("basic_projection_data", writeSchema, dataOnly, record); AssertHelpers.assertEmptyAvroField(projected, "id"); - int cmp = Comparators.charSequences() - .compare("test", (CharSequence) projected.get("data")); + int cmp = Comparators.charSequences().compare("test", (CharSequence) projected.get("data")); Assert.assertEquals("Should contain the correct data value", 0, cmp); } @Test public void testRename() throws Exception { - Schema writeSchema = new Schema( - Types.NestedField.required(0, "id", Types.LongType.get()), - Types.NestedField.optional(1, "data", Types.StringType.get()) - ); + Schema writeSchema = + new Schema( + Types.NestedField.required(0, "id", Types.LongType.get()), + Types.NestedField.optional(1, "data", Types.StringType.get())); Record record = new Record(AvroSchemaUtil.convert(writeSchema, "table")); record.put("id", 34L); record.put("data", "test"); - Schema readSchema = new Schema( - Types.NestedField.required(0, "id", Types.LongType.get()), - Types.NestedField.optional(1, "renamed", Types.StringType.get()) - ); + Schema readSchema = + new Schema( + Types.NestedField.required(0, "id", Types.LongType.get()), + Types.NestedField.optional(1, "renamed", Types.StringType.get())); Record projected = writeAndRead("project_and_rename", writeSchema, readSchema, record); Assert.assertEquals("Should contain the correct id value", 34L, (long) projected.get("id")); - int cmp = Comparators.charSequences() - .compare("test", (CharSequence) projected.get("renamed")); + int cmp = Comparators.charSequences().compare("test", (CharSequence) projected.get("renamed")); Assert.assertEquals("Should contain the correct data/renamed value", 0, cmp); } @Test public void testNestedStructProjection() throws Exception { - Schema writeSchema = new Schema( - Types.NestedField.required(0, "id", 
Types.LongType.get()), - Types.NestedField.optional(3, "location", Types.StructType.of( - Types.NestedField.required(1, "lat", Types.FloatType.get()), - Types.NestedField.required(2, "long", Types.FloatType.get()) - )) - ); + Schema writeSchema = + new Schema( + Types.NestedField.required(0, "id", Types.LongType.get()), + Types.NestedField.optional( + 3, + "location", + Types.StructType.of( + Types.NestedField.required(1, "lat", Types.FloatType.get()), + Types.NestedField.required(2, "long", Types.FloatType.get())))); Record record = new Record(AvroSchemaUtil.convert(writeSchema, "table")); record.put("id", 34L); - Record location = new Record( - AvroSchemaUtil.fromOption(record.getSchema().getField("location").schema())); + Record location = + new Record(AvroSchemaUtil.fromOption(record.getSchema().getField("location").schema())); location.put("lat", 52.995143f); location.put("long", -1.539054f); record.put("location", location); - Schema idOnly = new Schema( - Types.NestedField.required(0, "id", Types.LongType.get()) - ); + Schema idOnly = new Schema(Types.NestedField.required(0, "id", Types.LongType.get())); Record projected = writeAndRead("id_only", writeSchema, idOnly, record); AssertHelpers.assertEmptyAvroField(projected, "location"); Assert.assertEquals("Should contain the correct id value", 34L, (long) projected.get("id")); - Schema latOnly = new Schema( - Types.NestedField.optional(3, "location", Types.StructType.of( - Types.NestedField.required(1, "lat", Types.FloatType.get()) - )) - ); + Schema latOnly = + new Schema( + Types.NestedField.optional( + 3, + "location", + Types.StructType.of(Types.NestedField.required(1, "lat", Types.FloatType.get())))); projected = writeAndRead("latitude_only", writeSchema, latOnly, record); Record projectedLocation = (Record) projected.get("location"); AssertHelpers.assertEmptyAvroField(projected, "id"); Assert.assertNotNull("Should project location", projected.get("location")); AssertHelpers.assertEmptyAvroField(projectedLocation, "long"); - Assert.assertEquals("Should project latitude", - 52.995143f, (float) projectedLocation.get("lat"), 0.000001f); + Assert.assertEquals( + "Should project latitude", 52.995143f, (float) projectedLocation.get("lat"), 0.000001f); - Schema longOnly = new Schema( - Types.NestedField.optional(3, "location", Types.StructType.of( - Types.NestedField.required(2, "long", Types.FloatType.get()) - )) - ); + Schema longOnly = + new Schema( + Types.NestedField.optional( + 3, + "location", + Types.StructType.of(Types.NestedField.required(2, "long", Types.FloatType.get())))); projected = writeAndRead("longitude_only", writeSchema, longOnly, record); projectedLocation = (Record) projected.get("location"); AssertHelpers.assertEmptyAvroField(projected, "id"); Assert.assertNotNull("Should project location", projected.get("location")); AssertHelpers.assertEmptyAvroField(projectedLocation, "lat"); - Assert.assertEquals("Should project longitude", - -1.539054f, (float) projectedLocation.get("long"), 0.000001f); + Assert.assertEquals( + "Should project longitude", -1.539054f, (float) projectedLocation.get("long"), 0.000001f); Schema locationOnly = writeSchema.select("location"); projected = writeAndRead("location_only", writeSchema, locationOnly, record); projectedLocation = (Record) projected.get("location"); AssertHelpers.assertEmptyAvroField(projected, "id"); Assert.assertNotNull("Should project location", projected.get("location")); - Assert.assertEquals("Should project latitude", - 52.995143f, (float) 
projectedLocation.get("lat"), 0.000001f); - Assert.assertEquals("Should project longitude", - -1.539054f, (float) projectedLocation.get("long"), 0.000001f); + Assert.assertEquals( + "Should project latitude", 52.995143f, (float) projectedLocation.get("lat"), 0.000001f); + Assert.assertEquals( + "Should project longitude", -1.539054f, (float) projectedLocation.get("long"), 0.000001f); } @Test public void testMapProjection() throws IOException { - Schema writeSchema = new Schema( - Types.NestedField.required(0, "id", Types.LongType.get()), - Types.NestedField.optional(5, "properties", - Types.MapType.ofOptional(6, 7, Types.StringType.get(), Types.StringType.get())) - ); + Schema writeSchema = + new Schema( + Types.NestedField.required(0, "id", Types.LongType.get()), + Types.NestedField.optional( + 5, + "properties", + Types.MapType.ofOptional(6, 7, Types.StringType.get(), Types.StringType.get()))); Map properties = ImmutableMap.of("a", "A", "b", "B"); @@ -267,9 +260,7 @@ public void testMapProjection() throws IOException { record.put("id", 34L); record.put("properties", properties); - Schema idOnly = new Schema( - Types.NestedField.required(0, "id", Types.LongType.get()) - ); + Schema idOnly = new Schema(Types.NestedField.required(0, "id", Types.LongType.get())); Record projected = writeAndRead("id_only", writeSchema, idOnly, record); Assert.assertEquals("Should contain the correct id value", 34L, (long) projected.get("id")); @@ -278,20 +269,20 @@ public void testMapProjection() throws IOException { Schema keyOnly = writeSchema.select("properties.key"); projected = writeAndRead("key_only", writeSchema, keyOnly, record); AssertHelpers.assertEmptyAvroField(projected, "id"); - Assert.assertEquals("Should project entire map", - properties, toStringMap((Map) projected.get("properties"))); + Assert.assertEquals( + "Should project entire map", properties, toStringMap((Map) projected.get("properties"))); Schema valueOnly = writeSchema.select("properties.value"); projected = writeAndRead("value_only", writeSchema, valueOnly, record); AssertHelpers.assertEmptyAvroField(projected, "id"); - Assert.assertEquals("Should project entire map", - properties, toStringMap((Map) projected.get("properties"))); + Assert.assertEquals( + "Should project entire map", properties, toStringMap((Map) projected.get("properties"))); Schema mapOnly = writeSchema.select("properties"); projected = writeAndRead("map_only", writeSchema, mapOnly, record); AssertHelpers.assertEmptyAvroField(projected, "id"); - Assert.assertEquals("Should project entire map", - properties, toStringMap((Map) projected.get("properties"))); + Assert.assertEquals( + "Should project entire map", properties, toStringMap((Map) projected.get("properties"))); } private Map toStringMap(Map map) { @@ -308,22 +299,27 @@ public void testMapProjection() throws IOException { @Test public void testMapOfStructsProjection() throws IOException { - Schema writeSchema = new Schema( - Types.NestedField.required(0, "id", Types.LongType.get()), - Types.NestedField.optional(5, "locations", Types.MapType.ofOptional(6, 7, - Types.StringType.get(), - Types.StructType.of( - Types.NestedField.required(1, "lat", Types.FloatType.get()), - Types.NestedField.required(2, "long", Types.FloatType.get()) - ) - )) - ); + Schema writeSchema = + new Schema( + Types.NestedField.required(0, "id", Types.LongType.get()), + Types.NestedField.optional( + 5, + "locations", + Types.MapType.ofOptional( + 6, + 7, + Types.StringType.get(), + Types.StructType.of( + Types.NestedField.required(1, 
"lat", Types.FloatType.get()), + Types.NestedField.required(2, "long", Types.FloatType.get()))))); Record record = new Record(AvroSchemaUtil.convert(writeSchema, "table")); record.put("id", 34L); - Record l1 = new Record(AvroSchemaUtil.fromOption( - AvroSchemaUtil.fromOption(record.getSchema().getField("locations").schema()) - .getValueType())); + Record l1 = + new Record( + AvroSchemaUtil.fromOption( + AvroSchemaUtil.fromOption(record.getSchema().getField("locations").schema()) + .getValueType())); l1.put("lat", 53.992811f); l1.put("long", -1.542616f); Record l2 = new Record(l1.getSchema()); @@ -331,9 +327,7 @@ public void testMapOfStructsProjection() throws IOException { l2.put("long", -1.539054f); record.put("locations", ImmutableMap.of("L1", l1, "L2", l2)); - Schema idOnly = new Schema( - Types.NestedField.required(0, "id", Types.LongType.get()) - ); + Schema idOnly = new Schema(Types.NestedField.required(0, "id", Types.LongType.get())); Record projected = writeAndRead("id_only", writeSchema, idOnly, record); Assert.assertEquals("Should contain the correct id value", 34L, (long) projected.get("id")); @@ -341,81 +335,85 @@ public void testMapOfStructsProjection() throws IOException { projected = writeAndRead("all_locations", writeSchema, writeSchema.select("locations"), record); AssertHelpers.assertEmptyAvroField(projected, "id"); - Assert.assertEquals("Should project locations map", - record.get("locations"), toStringMap((Map) projected.get("locations"))); + Assert.assertEquals( + "Should project locations map", + record.get("locations"), + toStringMap((Map) projected.get("locations"))); - projected = writeAndRead("lat_only", - writeSchema, writeSchema.select("locations.lat"), record); + projected = writeAndRead("lat_only", writeSchema, writeSchema.select("locations.lat"), record); AssertHelpers.assertEmptyAvroField(projected, "id"); Map locations = toStringMap((Map) projected.get("locations")); Assert.assertNotNull("Should project locations map", locations); - Assert.assertEquals("Should contain L1 and L2", - Sets.newHashSet("L1", "L2"), locations.keySet()); + Assert.assertEquals( + "Should contain L1 and L2", Sets.newHashSet("L1", "L2"), locations.keySet()); Record projectedL1 = (Record) locations.get("L1"); Assert.assertNotNull("L1 should not be null", projectedL1); - Assert.assertEquals("L1 should contain lat", - 53.992811f, (float) projectedL1.get("lat"), 0.000001); + Assert.assertEquals( + "L1 should contain lat", 53.992811f, (float) projectedL1.get("lat"), 0.000001); AssertHelpers.assertEmptyAvroField(projectedL1, "long"); Record projectedL2 = (Record) locations.get("L2"); Assert.assertNotNull("L2 should not be null", projectedL2); - Assert.assertEquals("L2 should contain lat", - 52.995143f, (float) projectedL2.get("lat"), 0.000001); + Assert.assertEquals( + "L2 should contain lat", 52.995143f, (float) projectedL2.get("lat"), 0.000001); AssertHelpers.assertEmptyAvroField(projectedL2, "y"); - projected = writeAndRead("long_only", - writeSchema, writeSchema.select("locations.long"), record); + projected = + writeAndRead("long_only", writeSchema, writeSchema.select("locations.long"), record); AssertHelpers.assertEmptyAvroField(projected, "id"); locations = toStringMap((Map) projected.get("locations")); Assert.assertNotNull("Should project locations map", locations); - Assert.assertEquals("Should contain L1 and L2", - Sets.newHashSet("L1", "L2"), locations.keySet()); + Assert.assertEquals( + "Should contain L1 and L2", Sets.newHashSet("L1", "L2"), locations.keySet()); projectedL1 = 
(Record) locations.get("L1"); Assert.assertNotNull("L1 should not be null", projectedL1); AssertHelpers.assertEmptyAvroField(projectedL1, "lat"); - Assert.assertEquals("L1 should contain long", - -1.542616f, (float) projectedL1.get("long"), 0.000001); + Assert.assertEquals( + "L1 should contain long", -1.542616f, (float) projectedL1.get("long"), 0.000001); projectedL2 = (Record) locations.get("L2"); Assert.assertNotNull("L2 should not be null", projectedL2); AssertHelpers.assertEmptyAvroField(projectedL2, "lat"); - Assert.assertEquals("L2 should contain long", - -1.539054f, (float) projectedL2.get("long"), 0.000001); - - Schema latitiudeRenamed = new Schema( - Types.NestedField.optional(5, "locations", Types.MapType.ofOptional(6, 7, - Types.StringType.get(), - Types.StructType.of( - Types.NestedField.required(1, "latitude", Types.FloatType.get()) - ) - )) - ); + Assert.assertEquals( + "L2 should contain long", -1.539054f, (float) projectedL2.get("long"), 0.000001); + + Schema latitiudeRenamed = + new Schema( + Types.NestedField.optional( + 5, + "locations", + Types.MapType.ofOptional( + 6, + 7, + Types.StringType.get(), + Types.StructType.of( + Types.NestedField.required(1, "latitude", Types.FloatType.get()))))); projected = writeAndRead("latitude_renamed", writeSchema, latitiudeRenamed, record); AssertHelpers.assertEmptyAvroField(projected, "id"); locations = toStringMap((Map) projected.get("locations")); Assert.assertNotNull("Should project locations map", locations); - Assert.assertEquals("Should contain L1 and L2", - Sets.newHashSet("L1", "L2"), locations.keySet()); + Assert.assertEquals( + "Should contain L1 and L2", Sets.newHashSet("L1", "L2"), locations.keySet()); projectedL1 = (Record) locations.get("L1"); Assert.assertNotNull("L1 should not be null", projectedL1); - Assert.assertEquals("L1 should contain latitude", - 53.992811f, (float) projectedL1.get("latitude"), 0.000001); + Assert.assertEquals( + "L1 should contain latitude", 53.992811f, (float) projectedL1.get("latitude"), 0.000001); AssertHelpers.assertEmptyAvroField(projectedL1, "lat"); AssertHelpers.assertEmptyAvroField(projectedL1, "long"); projectedL2 = (Record) locations.get("L2"); Assert.assertNotNull("L2 should not be null", projectedL2); - Assert.assertEquals("L2 should contain latitude", - 52.995143f, (float) projectedL2.get("latitude"), 0.000001); + Assert.assertEquals( + "L2 should contain latitude", 52.995143f, (float) projectedL2.get("latitude"), 0.000001); AssertHelpers.assertEmptyAvroField(projectedL2, "lat"); AssertHelpers.assertEmptyAvroField(projectedL2, "long"); } @Test public void testListProjection() throws IOException { - Schema writeSchema = new Schema( - Types.NestedField.required(0, "id", Types.LongType.get()), - Types.NestedField.optional(10, "values", - Types.ListType.ofOptional(11, Types.LongType.get())) - ); + Schema writeSchema = + new Schema( + Types.NestedField.required(0, "id", Types.LongType.get()), + Types.NestedField.optional( + 10, "values", Types.ListType.ofOptional(11, Types.LongType.get()))); List values = ImmutableList.of(56L, 57L, 58L); @@ -423,9 +421,7 @@ public void testListProjection() throws IOException { record.put("id", 34L); record.put("values", values); - Schema idOnly = new Schema( - Types.NestedField.required(0, "id", Types.LongType.get()) - ); + Schema idOnly = new Schema(Types.NestedField.required(0, "id", Types.LongType.get())); Record projected = writeAndRead("id_only", writeSchema, idOnly, record); Assert.assertEquals("Should contain the correct id value", 34L, (long) 
projected.get("id")); @@ -445,21 +441,25 @@ public void testListProjection() throws IOException { @Test @SuppressWarnings("unchecked") public void testListOfStructsProjection() throws IOException { - Schema writeSchema = new Schema( - Types.NestedField.required(0, "id", Types.LongType.get()), - Types.NestedField.optional(22, "points", - Types.ListType.ofOptional(21, Types.StructType.of( - Types.NestedField.required(19, "x", Types.IntegerType.get()), - Types.NestedField.optional(18, "y", Types.IntegerType.get()) - )) - ) - ); + Schema writeSchema = + new Schema( + Types.NestedField.required(0, "id", Types.LongType.get()), + Types.NestedField.optional( + 22, + "points", + Types.ListType.ofOptional( + 21, + Types.StructType.of( + Types.NestedField.required(19, "x", Types.IntegerType.get()), + Types.NestedField.optional(18, "y", Types.IntegerType.get()))))); Record record = new Record(AvroSchemaUtil.convert(writeSchema, "table")); record.put("id", 34L); - Record p1 = new Record(AvroSchemaUtil.fromOption( - AvroSchemaUtil.fromOption(record.getSchema().getField("points").schema()) - .getElementType())); + Record p1 = + new Record( + AvroSchemaUtil.fromOption( + AvroSchemaUtil.fromOption(record.getSchema().getField("points").schema()) + .getElementType())); p1.put("x", 1); p1.put("y", 2); Record p2 = new Record(p1.getSchema()); @@ -467,9 +467,7 @@ public void testListOfStructsProjection() throws IOException { p2.put("y", null); record.put("points", ImmutableList.of(p1, p2)); - Schema idOnly = new Schema( - Types.NestedField.required(0, "id", Types.LongType.get()) - ); + Schema idOnly = new Schema(Types.NestedField.required(0, "id", Types.LongType.get())); Record projected = writeAndRead("id_only", writeSchema, idOnly, record); Assert.assertEquals("Should contain the correct id value", 34L, (long) projected.get("id")); @@ -477,8 +475,8 @@ public void testListOfStructsProjection() throws IOException { projected = writeAndRead("all_points", writeSchema, writeSchema.select("points"), record); AssertHelpers.assertEmptyAvroField(projected, "id"); - Assert.assertEquals("Should project points list", - record.get("points"), projected.get("points")); + Assert.assertEquals( + "Should project points list", record.get("points"), projected.get("points")); projected = writeAndRead("x_only", writeSchema, writeSchema.select("points.x"), record); AssertHelpers.assertEmptyAvroField(projected, "id"); @@ -504,13 +502,15 @@ public void testListOfStructsProjection() throws IOException { AssertHelpers.assertEmptyAvroField(projectedP2, "x"); Assert.assertEquals("Should project null y", null, projectedP2.get("y")); - Schema yRenamed = new Schema( - Types.NestedField.optional(22, "points", - Types.ListType.ofOptional(21, Types.StructType.of( - Types.NestedField.optional(18, "z", Types.IntegerType.get()) - )) - ) - ); + Schema yRenamed = + new Schema( + Types.NestedField.optional( + 22, + "points", + Types.ListType.ofOptional( + 21, + Types.StructType.of( + Types.NestedField.optional(18, "z", Types.IntegerType.get()))))); projected = writeAndRead("y_renamed", writeSchema, yRenamed, record); AssertHelpers.assertEmptyAvroField(projected, "id"); @@ -529,25 +529,26 @@ public void testListOfStructsProjection() throws IOException { @Test public void testEmptyStructProjection() throws Exception { - Schema writeSchema = new Schema( - Types.NestedField.required(0, "id", Types.LongType.get()), - Types.NestedField.optional(3, "location", Types.StructType.of( - Types.NestedField.required(1, "lat", Types.FloatType.get()), - 
Types.NestedField.required(2, "long", Types.FloatType.get()) - )) - ); + Schema writeSchema = + new Schema( + Types.NestedField.required(0, "id", Types.LongType.get()), + Types.NestedField.optional( + 3, + "location", + Types.StructType.of( + Types.NestedField.required(1, "lat", Types.FloatType.get()), + Types.NestedField.required(2, "long", Types.FloatType.get())))); Record record = new Record(AvroSchemaUtil.convert(writeSchema, "table")); record.put("id", 34L); - Record location = new Record( - AvroSchemaUtil.fromOption(record.getSchema().getField("location").schema())); + Record location = + new Record(AvroSchemaUtil.fromOption(record.getSchema().getField("location").schema())); location.put("lat", 52.995143f); location.put("long", -1.539054f); record.put("location", location); - Schema emptyStruct = new Schema( - Types.NestedField.required(3, "location", Types.StructType.of()) - ); + Schema emptyStruct = + new Schema(Types.NestedField.required(3, "location", Types.StructType.of())); Record projected = writeAndRead("empty_proj", writeSchema, emptyStruct, record); AssertHelpers.assertEmptyAvroField(projected, "id"); @@ -561,13 +562,15 @@ public void testEmptyStructProjection() throws Exception { @Test public void testEmptyStructRequiredProjection() throws Exception { - Schema writeSchema = new Schema( - Types.NestedField.required(0, "id", Types.LongType.get()), - Types.NestedField.required(3, "location", Types.StructType.of( - Types.NestedField.required(1, "lat", Types.FloatType.get()), - Types.NestedField.required(2, "long", Types.FloatType.get()) - )) - ); + Schema writeSchema = + new Schema( + Types.NestedField.required(0, "id", Types.LongType.get()), + Types.NestedField.required( + 3, + "location", + Types.StructType.of( + Types.NestedField.required(1, "lat", Types.FloatType.get()), + Types.NestedField.required(2, "long", Types.FloatType.get())))); Record record = new Record(AvroSchemaUtil.convert(writeSchema, "table")); record.put("id", 34L); @@ -576,9 +579,8 @@ public void testEmptyStructRequiredProjection() throws Exception { location.put("long", -1.539054f); record.put("location", location); - Schema emptyStruct = new Schema( - Types.NestedField.required(3, "location", Types.StructType.of()) - ); + Schema emptyStruct = + new Schema(Types.NestedField.required(3, "location", Types.StructType.of())); Record projected = writeAndRead("empty_req_proj", writeSchema, emptyStruct, record); AssertHelpers.assertEmptyAvroField(projected, "id"); @@ -591,14 +593,16 @@ public void testEmptyStructRequiredProjection() throws Exception { @Test public void testRequiredEmptyStructInRequiredStruct() throws Exception { - Schema writeSchema = new Schema( - Types.NestedField.required(0, "id", Types.LongType.get()), - Types.NestedField.required(3, "location", Types.StructType.of( - Types.NestedField.required(1, "lat", Types.FloatType.get()), - Types.NestedField.required(2, "long", Types.FloatType.get()), - Types.NestedField.required(4, "empty", Types.StructType.of()) - )) - ); + Schema writeSchema = + new Schema( + Types.NestedField.required(0, "id", Types.LongType.get()), + Types.NestedField.required( + 3, + "location", + Types.StructType.of( + Types.NestedField.required(1, "lat", Types.FloatType.get()), + Types.NestedField.required(2, "long", Types.FloatType.get()), + Types.NestedField.required(4, "empty", Types.StructType.of())))); Record record = new Record(AvroSchemaUtil.convert(writeSchema, "table")); record.put("id", 34L); @@ -607,12 +611,14 @@ public void testRequiredEmptyStructInRequiredStruct() 
throws Exception { location.put("long", -1.539054f); record.put("location", location); - Schema emptyStruct = new Schema( - Types.NestedField.required(0, "id", Types.LongType.get()), - Types.NestedField.required(3, "location", Types.StructType.of( - Types.NestedField.required(4, "empty", Types.StructType.of()) - )) - ); + Schema emptyStruct = + new Schema( + Types.NestedField.required(0, "id", Types.LongType.get()), + Types.NestedField.required( + 3, + "location", + Types.StructType.of( + Types.NestedField.required(4, "empty", Types.StructType.of())))); Record projected = writeAndRead("req_empty_req_proj", writeSchema, emptyStruct, record); Assert.assertEquals("Should project id", 34L, projected.get("id")); @@ -623,37 +629,44 @@ public void testRequiredEmptyStructInRequiredStruct() throws Exception { AssertHelpers.assertEmptyAvroField(result, "long"); Assert.assertNotNull("Should project empty", result.getSchema().getField("empty")); Assert.assertNotNull("Empty should not be null", result.get("empty")); - Assert.assertEquals("Empty should be empty", 0, - ((Record) result.get("empty")).getSchema().getFields().size()); + Assert.assertEquals( + "Empty should be empty", 0, ((Record) result.get("empty")).getSchema().getFields().size()); } @Test public void testEmptyNestedStructProjection() throws Exception { - Schema writeSchema = new Schema( - Types.NestedField.required(0, "id", Types.LongType.get()), - Types.NestedField.optional(3, "outer", Types.StructType.of( - Types.NestedField.required(1, "lat", Types.FloatType.get()), - Types.NestedField.optional(2, "inner", Types.StructType.of( - Types.NestedField.required(5, "lon", Types.FloatType.get()) - ) - ) - )) - ); + Schema writeSchema = + new Schema( + Types.NestedField.required(0, "id", Types.LongType.get()), + Types.NestedField.optional( + 3, + "outer", + Types.StructType.of( + Types.NestedField.required(1, "lat", Types.FloatType.get()), + Types.NestedField.optional( + 2, + "inner", + Types.StructType.of( + Types.NestedField.required(5, "lon", Types.FloatType.get())))))); Record record = new Record(AvroSchemaUtil.convert(writeSchema, "table")); record.put("id", 34L); - Record outer = new Record( - AvroSchemaUtil.fromOption(record.getSchema().getField("outer").schema())); - Record inner = new Record(AvroSchemaUtil.fromOption(outer.getSchema().getField("inner").schema())); + Record outer = + new Record(AvroSchemaUtil.fromOption(record.getSchema().getField("outer").schema())); + Record inner = + new Record(AvroSchemaUtil.fromOption(outer.getSchema().getField("inner").schema())); inner.put("lon", 32.14f); outer.put("lat", 52.995143f); outer.put("inner", inner); record.put("outer", outer); - Schema emptyStruct = new Schema( - Types.NestedField.required(3, "outer", Types.StructType.of( - Types.NestedField.required(2, "inner", Types.StructType.of()) - ))); + Schema emptyStruct = + new Schema( + Types.NestedField.required( + 3, + "outer", + Types.StructType.of( + Types.NestedField.required(2, "inner", Types.StructType.of())))); Record projected = writeAndRead("nested_empty_proj", writeSchema, emptyStruct, record); AssertHelpers.assertEmptyAvroField(projected, "id"); @@ -669,16 +682,19 @@ public void testEmptyNestedStructProjection() throws Exception { @Test public void testEmptyNestedStructRequiredProjection() throws Exception { - Schema writeSchema = new Schema( - Types.NestedField.required(0, "id", Types.LongType.get()), - Types.NestedField.required(3, "outer", Types.StructType.of( - Types.NestedField.required(1, "lat", Types.FloatType.get()), - 
Types.NestedField.required(2, "inner", Types.StructType.of( - Types.NestedField.required(5, "lon", Types.FloatType.get()) - ) - ) - )) - ); + Schema writeSchema = + new Schema( + Types.NestedField.required(0, "id", Types.LongType.get()), + Types.NestedField.required( + 3, + "outer", + Types.StructType.of( + Types.NestedField.required(1, "lat", Types.FloatType.get()), + Types.NestedField.required( + 2, + "inner", + Types.StructType.of( + Types.NestedField.required(5, "lon", Types.FloatType.get())))))); Record record = new Record(AvroSchemaUtil.convert(writeSchema, "table")); record.put("id", 34L); @@ -689,10 +705,13 @@ public void testEmptyNestedStructRequiredProjection() throws Exception { outer.put("inner", inner); record.put("outer", outer); - Schema emptyStruct = new Schema( - Types.NestedField.required(3, "outer", Types.StructType.of( - Types.NestedField.required(2, "inner", Types.StructType.of()) - ))); + Schema emptyStruct = + new Schema( + Types.NestedField.required( + 3, + "outer", + Types.StructType.of( + Types.NestedField.required(2, "inner", Types.StructType.of())))); Record projected = writeAndRead("nested_empty_req_proj", writeSchema, emptyStruct, record); AssertHelpers.assertEmptyAvroField(projected, "id"); diff --git a/core/src/test/java/org/apache/iceberg/avro/TestSchemaConversions.java b/core/src/test/java/org/apache/iceberg/avro/TestSchemaConversions.java index 4d649dcde49f..8d57aac29c96 100644 --- a/core/src/test/java/org/apache/iceberg/avro/TestSchemaConversions.java +++ b/core/src/test/java/org/apache/iceberg/avro/TestSchemaConversions.java @@ -16,9 +16,17 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.avro; +import static org.apache.iceberg.avro.AvroTestHelpers.addElementId; +import static org.apache.iceberg.avro.AvroTestHelpers.addKeyId; +import static org.apache.iceberg.avro.AvroTestHelpers.addValueId; +import static org.apache.iceberg.avro.AvroTestHelpers.optionalField; +import static org.apache.iceberg.avro.AvroTestHelpers.record; +import static org.apache.iceberg.avro.AvroTestHelpers.requiredField; +import static org.apache.iceberg.types.Types.NestedField.optional; +import static org.apache.iceberg.types.Types.NestedField.required; + import java.util.List; import org.apache.avro.LogicalTypes; import org.apache.avro.Schema; @@ -30,66 +38,60 @@ import org.junit.Assert; import org.junit.Test; -import static org.apache.iceberg.avro.AvroTestHelpers.addElementId; -import static org.apache.iceberg.avro.AvroTestHelpers.addKeyId; -import static org.apache.iceberg.avro.AvroTestHelpers.addValueId; -import static org.apache.iceberg.avro.AvroTestHelpers.optionalField; -import static org.apache.iceberg.avro.AvroTestHelpers.record; -import static org.apache.iceberg.avro.AvroTestHelpers.requiredField; -import static org.apache.iceberg.types.Types.NestedField.optional; -import static org.apache.iceberg.types.Types.NestedField.required; - public class TestSchemaConversions { @Test public void testPrimitiveTypes() { - List primitives = Lists.newArrayList( - Types.BooleanType.get(), - Types.IntegerType.get(), - Types.LongType.get(), - Types.FloatType.get(), - Types.DoubleType.get(), - Types.DateType.get(), - Types.TimeType.get(), - Types.TimestampType.withZone(), - Types.TimestampType.withoutZone(), - Types.StringType.get(), - Types.UUIDType.get(), - Types.FixedType.ofLength(12), - Types.BinaryType.get(), - Types.DecimalType.of(9, 4) - ); - - List avroPrimitives = Lists.newArrayList( - 
Schema.create(Schema.Type.BOOLEAN), - Schema.create(Schema.Type.INT), - Schema.create(Schema.Type.LONG), - Schema.create(Schema.Type.FLOAT), - Schema.create(Schema.Type.DOUBLE), - LogicalTypes.date().addToSchema(Schema.create(Schema.Type.INT)), - LogicalTypes.timeMicros().addToSchema(Schema.create(Schema.Type.LONG)), - addAdjustToUtc(LogicalTypes.timestampMicros().addToSchema(Schema.create(Schema.Type.LONG)), true), - addAdjustToUtc(LogicalTypes.timestampMicros().addToSchema(Schema.create(Schema.Type.LONG)), false), - Schema.create(Schema.Type.STRING), - LogicalTypes.uuid().addToSchema(Schema.createFixed("uuid_fixed", null, null, 16)), - Schema.createFixed("fixed_12", null, null, 12), - Schema.create(Schema.Type.BYTES), - LogicalTypes.decimal(9, 4).addToSchema(Schema.createFixed("decimal_9_4", null, null, 4)) - ); + List primitives = + Lists.newArrayList( + Types.BooleanType.get(), + Types.IntegerType.get(), + Types.LongType.get(), + Types.FloatType.get(), + Types.DoubleType.get(), + Types.DateType.get(), + Types.TimeType.get(), + Types.TimestampType.withZone(), + Types.TimestampType.withoutZone(), + Types.StringType.get(), + Types.UUIDType.get(), + Types.FixedType.ofLength(12), + Types.BinaryType.get(), + Types.DecimalType.of(9, 4)); + + List avroPrimitives = + Lists.newArrayList( + Schema.create(Schema.Type.BOOLEAN), + Schema.create(Schema.Type.INT), + Schema.create(Schema.Type.LONG), + Schema.create(Schema.Type.FLOAT), + Schema.create(Schema.Type.DOUBLE), + LogicalTypes.date().addToSchema(Schema.create(Schema.Type.INT)), + LogicalTypes.timeMicros().addToSchema(Schema.create(Schema.Type.LONG)), + addAdjustToUtc( + LogicalTypes.timestampMicros().addToSchema(Schema.create(Schema.Type.LONG)), true), + addAdjustToUtc( + LogicalTypes.timestampMicros().addToSchema(Schema.create(Schema.Type.LONG)), false), + Schema.create(Schema.Type.STRING), + LogicalTypes.uuid().addToSchema(Schema.createFixed("uuid_fixed", null, null, 16)), + Schema.createFixed("fixed_12", null, null, 12), + Schema.create(Schema.Type.BYTES), + LogicalTypes.decimal(9, 4) + .addToSchema(Schema.createFixed("decimal_9_4", null, null, 4))); for (int i = 0; i < primitives.size(); i += 1) { Type type = primitives.get(i); Schema avro = avroPrimitives.get(i); - Assert.assertEquals("Avro schema to primitive: " + avro, - type, AvroSchemaUtil.convert(avro)); - Assert.assertEquals("Primitive to avro schema: " + type, - avro, AvroSchemaUtil.convert(type)); + Assert.assertEquals("Avro schema to primitive: " + avro, type, AvroSchemaUtil.convert(avro)); + Assert.assertEquals("Primitive to avro schema: " + type, avro, AvroSchemaUtil.convert(type)); } } @Test public void testAvroToIcebergTimestampTypeWithoutAdjustToUTC() { - // Not included in the primitives test because there is not a way to round trip the avro<->iceberg conversion - // This is because iceberg types can only can encode adjust-to-utc=true|false but not a missing adjust-to-utc + // Not included in the primitives test because there is not a way to round trip the + // avro<->iceberg conversion + // This is because iceberg types can only can encode adjust-to-utc=true|false but not a missing + // adjust-to-utc Type expectedIcebergType = Types.TimestampType.withoutZone(); Schema avroType = LogicalTypes.timestampMicros().addToSchema(Schema.create(Schema.Type.LONG)); @@ -103,187 +105,213 @@ private Schema addAdjustToUtc(Schema schema, boolean adjustToUTC) { @Test public void testStructAndPrimitiveTypes() { - Types.StructType struct = Types.StructType.of( - optional(20, "bool", 
Types.BooleanType.get()), - optional(21, "int", Types.IntegerType.get()), - optional(22, "long", Types.LongType.get()), - optional(23, "float", Types.FloatType.get()), - optional(24, "double", Types.DoubleType.get()), - optional(25, "date", Types.DateType.get()), - optional(27, "time", Types.TimeType.get()), - optional(28, "timestamptz", Types.TimestampType.withZone()), - optional(29, "timestamp", Types.TimestampType.withoutZone()), - optional(30, "string", Types.StringType.get()), - optional(31, "uuid", Types.UUIDType.get()), - optional(32, "fixed", Types.FixedType.ofLength(16)), - optional(33, "binary", Types.BinaryType.get()), - optional(34, "decimal", Types.DecimalType.of(14, 2)) - ); - - Schema schema = record("primitives", - optionalField(20, "bool", Schema.create(Schema.Type.BOOLEAN)), - optionalField(21, "int", Schema.create(Schema.Type.INT)), - optionalField(22, "long", Schema.create(Schema.Type.LONG)), - optionalField(23, "float", Schema.create(Schema.Type.FLOAT)), - optionalField(24, "double", Schema.create(Schema.Type.DOUBLE)), - optionalField(25, "date", LogicalTypes.date().addToSchema(Schema.create(Schema.Type.INT))), - optionalField(27, "time", LogicalTypes.timeMicros().addToSchema(Schema.create(Schema.Type.LONG))), - optionalField( - 28, - "timestamptz", - addAdjustToUtc(LogicalTypes.timestampMicros().addToSchema(Schema.create(Schema.Type.LONG)), true)), - optionalField( - 29, - "timestamp", - addAdjustToUtc(LogicalTypes.timestampMicros().addToSchema(Schema.create(Schema.Type.LONG)), false)), - optionalField(30, "string", Schema.create(Schema.Type.STRING)), - optionalField(31, "uuid", LogicalTypes.uuid().addToSchema(Schema.createFixed("uuid_fixed", null, null, 16))), - optionalField(32, "fixed", Schema.createFixed("fixed_16", null, null, 16)), - optionalField(33, "binary", Schema.create(Schema.Type.BYTES)), - optionalField( - 34, - "decimal", - LogicalTypes.decimal(14, 2).addToSchema(Schema.createFixed("decimal_14_2", null, null, 6))) - ); - - Assert.assertEquals("Test conversion from Avro schema", - struct, AvroSchemaUtil.convert(schema)); - Assert.assertEquals("Test conversion to Avro schema", - schema, AvroSchemaUtil.convert(struct, "primitives")); + Types.StructType struct = + Types.StructType.of( + optional(20, "bool", Types.BooleanType.get()), + optional(21, "int", Types.IntegerType.get()), + optional(22, "long", Types.LongType.get()), + optional(23, "float", Types.FloatType.get()), + optional(24, "double", Types.DoubleType.get()), + optional(25, "date", Types.DateType.get()), + optional(27, "time", Types.TimeType.get()), + optional(28, "timestamptz", Types.TimestampType.withZone()), + optional(29, "timestamp", Types.TimestampType.withoutZone()), + optional(30, "string", Types.StringType.get()), + optional(31, "uuid", Types.UUIDType.get()), + optional(32, "fixed", Types.FixedType.ofLength(16)), + optional(33, "binary", Types.BinaryType.get()), + optional(34, "decimal", Types.DecimalType.of(14, 2))); + + Schema schema = + record( + "primitives", + optionalField(20, "bool", Schema.create(Schema.Type.BOOLEAN)), + optionalField(21, "int", Schema.create(Schema.Type.INT)), + optionalField(22, "long", Schema.create(Schema.Type.LONG)), + optionalField(23, "float", Schema.create(Schema.Type.FLOAT)), + optionalField(24, "double", Schema.create(Schema.Type.DOUBLE)), + optionalField( + 25, "date", LogicalTypes.date().addToSchema(Schema.create(Schema.Type.INT))), + optionalField( + 27, "time", LogicalTypes.timeMicros().addToSchema(Schema.create(Schema.Type.LONG))), + 
optionalField( + 28, + "timestamptz", + addAdjustToUtc( + LogicalTypes.timestampMicros().addToSchema(Schema.create(Schema.Type.LONG)), + true)), + optionalField( + 29, + "timestamp", + addAdjustToUtc( + LogicalTypes.timestampMicros().addToSchema(Schema.create(Schema.Type.LONG)), + false)), + optionalField(30, "string", Schema.create(Schema.Type.STRING)), + optionalField( + 31, + "uuid", + LogicalTypes.uuid().addToSchema(Schema.createFixed("uuid_fixed", null, null, 16))), + optionalField(32, "fixed", Schema.createFixed("fixed_16", null, null, 16)), + optionalField(33, "binary", Schema.create(Schema.Type.BYTES)), + optionalField( + 34, + "decimal", + LogicalTypes.decimal(14, 2) + .addToSchema(Schema.createFixed("decimal_14_2", null, null, 6)))); + + Assert.assertEquals("Test conversion from Avro schema", struct, AvroSchemaUtil.convert(schema)); + Assert.assertEquals( + "Test conversion to Avro schema", schema, AvroSchemaUtil.convert(struct, "primitives")); } @Test public void testList() { Type list = Types.ListType.ofRequired(34, Types.UUIDType.get()); - Schema schema = addElementId(34, SchemaBuilder.array().items( - LogicalTypes.uuid().addToSchema(Schema.createFixed("uuid_fixed", null, null, 16)))); + Schema schema = + addElementId( + 34, + SchemaBuilder.array() + .items( + LogicalTypes.uuid() + .addToSchema(Schema.createFixed("uuid_fixed", null, null, 16)))); - Assert.assertEquals("Avro schema to list", - list, AvroSchemaUtil.convert(schema)); - Assert.assertEquals("List to Avro schema", - schema, AvroSchemaUtil.convert(list)); + Assert.assertEquals("Avro schema to list", list, AvroSchemaUtil.convert(schema)); + Assert.assertEquals("List to Avro schema", schema, AvroSchemaUtil.convert(list)); } @Test public void testListOfStructs() { - Type list = Types.ListType.ofRequired(34, Types.StructType.of( - required(35, "lat", Types.FloatType.get()), - required(36, "long", Types.FloatType.get()) - )); - - Schema schema = addElementId(34, SchemaBuilder.array().items( - record("r34", - requiredField(35, "lat", Schema.create(Schema.Type.FLOAT)), - requiredField(36, "long", Schema.create(Schema.Type.FLOAT))) - )); - - Assert.assertEquals("Avro schema to list", - list, AvroSchemaUtil.convert(schema)); - Assert.assertEquals("List to Avro schema", - schema, AvroSchemaUtil.convert(list)); + Type list = + Types.ListType.ofRequired( + 34, + Types.StructType.of( + required(35, "lat", Types.FloatType.get()), + required(36, "long", Types.FloatType.get()))); + + Schema schema = + addElementId( + 34, + SchemaBuilder.array() + .items( + record( + "r34", + requiredField(35, "lat", Schema.create(Schema.Type.FLOAT)), + requiredField(36, "long", Schema.create(Schema.Type.FLOAT))))); + + Assert.assertEquals("Avro schema to list", list, AvroSchemaUtil.convert(schema)); + Assert.assertEquals("List to Avro schema", schema, AvroSchemaUtil.convert(list)); } @Test public void testMapOfLongToBytes() { Type map = Types.MapType.ofRequired(33, 34, Types.LongType.get(), Types.BinaryType.get()); - Schema schema = AvroSchemaUtil.createMap( - 33, Schema.create(Schema.Type.LONG), - 34, Schema.create(Schema.Type.BYTES)); - - Assert.assertEquals("Avro schema to map", - map, AvroSchemaUtil.convert(schema)); - Assert.assertEquals("Map to Avro schema", - schema, AvroSchemaUtil.convert(map)); + Schema schema = + AvroSchemaUtil.createMap( + 33, Schema.create(Schema.Type.LONG), + 34, Schema.create(Schema.Type.BYTES)); + + Assert.assertEquals("Avro schema to map", map, AvroSchemaUtil.convert(schema)); + Assert.assertEquals("Map to Avro 
schema", schema, AvroSchemaUtil.convert(map)); } @Test public void testMapOfStringToBytes() { Type map = Types.MapType.ofRequired(33, 34, Types.StringType.get(), Types.BinaryType.get()); - Schema schema = addKeyId(33, addValueId(34, SchemaBuilder.map().values( - Schema.create(Schema.Type.BYTES)))); + Schema schema = + addKeyId(33, addValueId(34, SchemaBuilder.map().values(Schema.create(Schema.Type.BYTES)))); - Assert.assertEquals("Avro schema to map", - map, AvroSchemaUtil.convert(schema)); - Assert.assertEquals("Map to Avro schema", - schema, AvroSchemaUtil.convert(map)); + Assert.assertEquals("Avro schema to map", map, AvroSchemaUtil.convert(schema)); + Assert.assertEquals("Map to Avro schema", schema, AvroSchemaUtil.convert(map)); } @Test public void testMapOfListToStructs() { - Type map = Types.MapType.ofRequired(33, 34, - Types.ListType.ofRequired(35, Types.IntegerType.get()), - Types.StructType.of( - required(36, "a", Types.IntegerType.get()), - optional(37, "b", Types.IntegerType.get()) - )); - Schema schema = AvroSchemaUtil.createMap( - 33, addElementId(35, Schema.createArray(Schema.create(Schema.Type.INT))), - 34, record("r34", - requiredField(36, "a", Schema.create(Schema.Type.INT)), - optionalField(37, "b", Schema.create(Schema.Type.INT)))); - - Assert.assertEquals("Avro schema to map", - map, AvroSchemaUtil.convert(schema)); - Assert.assertEquals("Map to Avro schema", - schema, AvroSchemaUtil.convert(map)); + Type map = + Types.MapType.ofRequired( + 33, + 34, + Types.ListType.ofRequired(35, Types.IntegerType.get()), + Types.StructType.of( + required(36, "a", Types.IntegerType.get()), + optional(37, "b", Types.IntegerType.get()))); + Schema schema = + AvroSchemaUtil.createMap( + 33, addElementId(35, Schema.createArray(Schema.create(Schema.Type.INT))), + 34, + record( + "r34", + requiredField(36, "a", Schema.create(Schema.Type.INT)), + optionalField(37, "b", Schema.create(Schema.Type.INT)))); + + Assert.assertEquals("Avro schema to map", map, AvroSchemaUtil.convert(schema)); + Assert.assertEquals("Map to Avro schema", schema, AvroSchemaUtil.convert(map)); } @Test public void testMapOfStringToStructs() { - Type map = Types.MapType.ofRequired(33, 34, Types.StringType.get(), Types.StructType.of( - required(35, "a", Types.IntegerType.get()), - optional(36, "b", Types.IntegerType.get()) - )); - Schema schema = addKeyId(33, addValueId(34, SchemaBuilder.map().values( - record("r34", - requiredField(35, "a", Schema.create(Schema.Type.INT)), - optionalField(36, "b", Schema.create(Schema.Type.INT)))))); - - Assert.assertEquals("Avro schema to map", - map, AvroSchemaUtil.convert(schema)); - Assert.assertEquals("Map to Avro schema", - schema, AvroSchemaUtil.convert(map)); + Type map = + Types.MapType.ofRequired( + 33, + 34, + Types.StringType.get(), + Types.StructType.of( + required(35, "a", Types.IntegerType.get()), + optional(36, "b", Types.IntegerType.get()))); + Schema schema = + addKeyId( + 33, + addValueId( + 34, + SchemaBuilder.map() + .values( + record( + "r34", + requiredField(35, "a", Schema.create(Schema.Type.INT)), + optionalField(36, "b", Schema.create(Schema.Type.INT)))))); + + Assert.assertEquals("Avro schema to map", map, AvroSchemaUtil.convert(schema)); + Assert.assertEquals("Map to Avro schema", schema, AvroSchemaUtil.convert(map)); } @Test public void testComplexSchema() { - org.apache.iceberg.Schema schema = new org.apache.iceberg.Schema( - required(1, "id", Types.IntegerType.get()), - optional(2, "data", Types.StringType.get()), - optional( - 3, - "preferences", - 
Types.StructType - .of(required( - 8, - "feature1", - Types.BooleanType.get()), optional(9, "feature2", Types.BooleanType.get()))), - required( - 4, - "locations", - Types.MapType.ofRequired( - 10, - 11, + org.apache.iceberg.Schema schema = + new org.apache.iceberg.Schema( + required(1, "id", Types.IntegerType.get()), + optional(2, "data", Types.StringType.get()), + optional( + 3, + "preferences", Types.StructType.of( - required(20, "address", Types.StringType.get()), - required(21, "city", Types.StringType.get()), - required(22, "state", Types.StringType.get()), - required(23, "zip", Types.IntegerType.get()) - ), - Types.StructType.of(required( - 12, "lat", Types.FloatType.get()), required(13, "long", Types.FloatType.get())) - ) - ), - optional( - 5, - "points", - Types.ListType.ofOptional( - 14, - Types.StructType.of(required( - 15, "x", Types.LongType.get()), required(16, "y", Types.LongType.get())))), - required(6, "doubles", Types.ListType.ofRequired(17, Types.DoubleType.get())), - optional(7, "properties", Types.MapType.ofOptional( - 18, 19, Types.StringType.get(), Types.StringType.get()))); + required(8, "feature1", Types.BooleanType.get()), + optional(9, "feature2", Types.BooleanType.get()))), + required( + 4, + "locations", + Types.MapType.ofRequired( + 10, + 11, + Types.StructType.of( + required(20, "address", Types.StringType.get()), + required(21, "city", Types.StringType.get()), + required(22, "state", Types.StringType.get()), + required(23, "zip", Types.IntegerType.get())), + Types.StructType.of( + required(12, "lat", Types.FloatType.get()), + required(13, "long", Types.FloatType.get())))), + optional( + 5, + "points", + Types.ListType.ofOptional( + 14, + Types.StructType.of( + required(15, "x", Types.LongType.get()), + required(16, "y", Types.LongType.get())))), + required(6, "doubles", Types.ListType.ofRequired(17, Types.DoubleType.get())), + optional( + 7, + "properties", + Types.MapType.ofOptional(18, 19, Types.StringType.get(), Types.StringType.get()))); AvroSchemaUtil.convert(schema, "newTableName").toString(true); } @@ -291,38 +319,46 @@ public void testComplexSchema() { @Test public void testSpecialChars() { List names = Lists.newArrayList("9x", "x_", "a.b", "☃", "a#b"); - org.apache.iceberg.Schema schema = new org.apache.iceberg.Schema( - required(1, names.get(0), Types.IntegerType.get()), - required(2, names.get(1), Types.StringType.get()), - required(3, names.get(2), Types.IntegerType.get()), - required(4, names.get(3), Types.IntegerType.get()), - required(5, names.get(4), Types.IntegerType.get())); + org.apache.iceberg.Schema schema = + new org.apache.iceberg.Schema( + required(1, names.get(0), Types.IntegerType.get()), + required(2, names.get(1), Types.StringType.get()), + required(3, names.get(2), Types.IntegerType.get()), + required(4, names.get(3), Types.IntegerType.get()), + required(5, names.get(4), Types.IntegerType.get())); Schema avroSchema = AvroSchemaUtil.convert(schema.asStruct()); - List sanitizedNames = Lists.newArrayList(Iterables.transform(avroSchema.getFields(), Schema.Field::name)); - List expectedSanitizedNames = Lists.newArrayList("_9x", "x_", "a_x2Eb", "_x2603", "a_x23b"); + List sanitizedNames = + Lists.newArrayList(Iterables.transform(avroSchema.getFields(), Schema.Field::name)); + List expectedSanitizedNames = + Lists.newArrayList("_9x", "x_", "a_x2Eb", "_x2603", "a_x23b"); Assert.assertEquals(expectedSanitizedNames, sanitizedNames); - List origNames = Lists.newArrayList( - Iterables.transform(avroSchema.getFields(), f -> 
f.getProp(AvroSchemaUtil.ICEBERG_FIELD_NAME_PROP))); + List origNames = + Lists.newArrayList( + Iterables.transform( + avroSchema.getFields(), f -> f.getProp(AvroSchemaUtil.ICEBERG_FIELD_NAME_PROP))); List expectedOrigNames = Lists.newArrayList(names); - expectedOrigNames.set(1, null); // Name at pos 1 is valid so ICEBERG_FIELD_NAME_PROP is not set + expectedOrigNames.set(1, null); // Name at pos 1 is valid so ICEBERG_FIELD_NAME_PROP is not set Assert.assertEquals(expectedOrigNames, origNames); } @Test public void testFieldDocsArePreserved() { List fieldDocs = Lists.newArrayList(null, "iceberg originating field doc"); - org.apache.iceberg.Schema icebergSchema = new org.apache.iceberg.Schema( - required(1, "id", Types.IntegerType.get(), fieldDocs.get(0)), - optional(2, "data", Types.StringType.get(), fieldDocs.get(1))); + org.apache.iceberg.Schema icebergSchema = + new org.apache.iceberg.Schema( + required(1, "id", Types.IntegerType.get(), fieldDocs.get(0)), + optional(2, "data", Types.StringType.get(), fieldDocs.get(1))); Schema avroSchema = AvroSchemaUtil.convert(icebergSchema.asStruct()); - List avroFieldDocs = Lists.newArrayList(Iterables.transform(avroSchema.getFields(), Schema.Field::doc)); + List avroFieldDocs = + Lists.newArrayList(Iterables.transform(avroSchema.getFields(), Schema.Field::doc)); Assert.assertEquals(avroFieldDocs, fieldDocs); org.apache.iceberg.Schema origSchema = AvroSchemaUtil.toIceberg(avroSchema); - List origFieldDocs = Lists.newArrayList(Iterables.transform(origSchema.columns(), Types.NestedField::doc)); + List origFieldDocs = + Lists.newArrayList(Iterables.transform(origSchema.columns(), Types.NestedField::doc)); Assert.assertEquals(origFieldDocs, fieldDocs); } } diff --git a/core/src/test/java/org/apache/iceberg/catalog/CatalogTests.java b/core/src/test/java/org/apache/iceberg/catalog/CatalogTests.java index 135bef28c58a..0130dbdb6e43 100644 --- a/core/src/test/java/org/apache/iceberg/catalog/CatalogTests.java +++ b/core/src/test/java/org/apache/iceberg/catalog/CatalogTests.java @@ -16,9 +16,10 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.catalog; +import static org.apache.iceberg.types.Types.NestedField.required; + import java.io.IOException; import java.io.UncheckedIOException; import java.util.Arrays; @@ -63,86 +64,76 @@ import org.junit.Assume; import org.junit.jupiter.api.Test; -import static org.apache.iceberg.types.Types.NestedField.required; - public abstract class CatalogTests { private static final Namespace NS = Namespace.of("newdb"); protected static final TableIdentifier TABLE = TableIdentifier.of(NS, "table"); private static final TableIdentifier RENAMED_TABLE = TableIdentifier.of(NS, "table_renamed"); // Schema passed to create tables - protected static final Schema SCHEMA = new Schema( - required(3, "id", Types.IntegerType.get(), "unique ID"), - required(4, "data", Types.StringType.get()) - ); + protected static final Schema SCHEMA = + new Schema( + required(3, "id", Types.IntegerType.get(), "unique ID"), + required(4, "data", Types.StringType.get())); // This is the actual schema for the table, with column IDs reassigned - private static final Schema TABLE_SCHEMA = new Schema( - required(1, "id", Types.IntegerType.get(), "unique ID"), - required(2, "data", Types.StringType.get()) - ); + private static final Schema TABLE_SCHEMA = + new Schema( + required(1, "id", Types.IntegerType.get(), "unique ID"), + required(2, "data", Types.StringType.get())); // This is the actual schema for the table, with column IDs reassigned - private static final Schema REPLACE_SCHEMA = new Schema( - required(2, "id", Types.IntegerType.get(), "unique ID"), - required(3, "data", Types.StringType.get()) - ); + private static final Schema REPLACE_SCHEMA = + new Schema( + required(2, "id", Types.IntegerType.get(), "unique ID"), + required(3, "data", Types.StringType.get())); // another schema that is not the same - private static final Schema OTHER_SCHEMA = new Schema( - required(1, "some_id", Types.IntegerType.get()) - ); + private static final Schema OTHER_SCHEMA = + new Schema(required(1, "some_id", Types.IntegerType.get())); // Partition spec used to create tables - private static final PartitionSpec SPEC = PartitionSpec.builderFor(SCHEMA) - .bucket("id", 16) - .build(); + private static final PartitionSpec SPEC = + PartitionSpec.builderFor(SCHEMA).bucket("id", 16).build(); - private static final PartitionSpec TABLE_SPEC = PartitionSpec.builderFor(TABLE_SCHEMA) - .bucket("id", 16) - .build(); + private static final PartitionSpec TABLE_SPEC = + PartitionSpec.builderFor(TABLE_SCHEMA).bucket("id", 16).build(); - private static final PartitionSpec REPLACE_SPEC = PartitionSpec.builderFor(REPLACE_SCHEMA) - .bucket("id", 16) - .withSpecId(1) - .build(); + private static final PartitionSpec REPLACE_SPEC = + PartitionSpec.builderFor(REPLACE_SCHEMA).bucket("id", 16).withSpecId(1).build(); // Partition spec used to create tables - static final SortOrder WRITE_ORDER = SortOrder.builderFor(SCHEMA) - .asc(Expressions.bucket("id", 16)) - .asc("id") - .build(); - - static final SortOrder TABLE_WRITE_ORDER = SortOrder.builderFor(TABLE_SCHEMA) - .asc(Expressions.bucket("id", 16)) - .asc("id") - .build(); - - static final SortOrder REPLACE_WRITE_ORDER = SortOrder.builderFor(REPLACE_SCHEMA) - .asc(Expressions.bucket("id", 16)) - .asc("id") - .build(); - - static final DataFile FILE_A = DataFiles.builder(SPEC) - .withPath("/path/to/data-a.parquet") - .withFileSizeInBytes(10) - .withPartitionPath("id_bucket=0") // easy way to set partition data for now - .withRecordCount(2) // needs at least one record or else metrics 
will filter it out - .build(); - - static final DataFile FILE_B = DataFiles.builder(SPEC) - .withPath("/path/to/data-b.parquet") - .withFileSizeInBytes(10) - .withPartitionPath("id_bucket=1") // easy way to set partition data for now - .withRecordCount(2) // needs at least one record or else metrics will filter it out - .build(); - - static final DataFile FILE_C = DataFiles.builder(SPEC) - .withPath("/path/to/data-c.parquet") - .withFileSizeInBytes(10) - .withPartitionPath("id_bucket=2") // easy way to set partition data for now - .withRecordCount(2) // needs at least one record or else metrics will filter it out - .build(); + static final SortOrder WRITE_ORDER = + SortOrder.builderFor(SCHEMA).asc(Expressions.bucket("id", 16)).asc("id").build(); + + static final SortOrder TABLE_WRITE_ORDER = + SortOrder.builderFor(TABLE_SCHEMA).asc(Expressions.bucket("id", 16)).asc("id").build(); + + static final SortOrder REPLACE_WRITE_ORDER = + SortOrder.builderFor(REPLACE_SCHEMA).asc(Expressions.bucket("id", 16)).asc("id").build(); + + static final DataFile FILE_A = + DataFiles.builder(SPEC) + .withPath("/path/to/data-a.parquet") + .withFileSizeInBytes(10) + .withPartitionPath("id_bucket=0") // easy way to set partition data for now + .withRecordCount(2) // needs at least one record or else metrics will filter it out + .build(); + + static final DataFile FILE_B = + DataFiles.builder(SPEC) + .withPath("/path/to/data-b.parquet") + .withFileSizeInBytes(10) + .withPartitionPath("id_bucket=1") // easy way to set partition data for now + .withRecordCount(2) // needs at least one record or else metrics will filter it out + .build(); + + static final DataFile FILE_C = + DataFiles.builder(SPEC) + .withPath("/path/to/data-c.parquet") + .withFileSizeInBytes(10) + .withPartitionPath("id_bucket=2") // easy way to set partition data for now + .withRecordCount(2) // needs at least one record or else metrics will filter it out + .build(); protected abstract C catalog(); @@ -177,7 +168,8 @@ public void testCreateNamespace() { Assert.assertFalse("Namespace should not exist", catalog.namespaceExists(NS)); catalog.createNamespace(NS); - Assert.assertTrue("Catalog should have the created namespace", catalog.listNamespaces().contains(NS)); + Assert.assertTrue( + "Catalog should have the created namespace", catalog.listNamespaces().contains(NS)); Assert.assertTrue("Namespace should exist", catalog.namespaceExists(NS)); } @@ -190,8 +182,11 @@ public void testCreateExistingNamespace() { catalog.createNamespace(NS); Assert.assertTrue("Namespace should exist", catalog.namespaceExists(NS)); - AssertHelpers.assertThrows("Should fail to create an existing database", - AlreadyExistsException.class, "newdb", () -> catalog.createNamespace(NS)); + AssertHelpers.assertThrows( + "Should fail to create an existing database", + AlreadyExistsException.class, + "newdb", + () -> catalog.createNamespace(NS)); Assert.assertTrue("Namespace should still exist", catalog.namespaceExists(NS)); } @@ -208,7 +203,8 @@ public void testCreateNamespaceWithProperties() { Assert.assertTrue("Namespace should exist", catalog.namespaceExists(NS)); Map props = catalog.loadNamespaceMetadata(NS); - Assert.assertEquals("Create properties should be a subset of returned properties", + Assert.assertEquals( + "Create properties should be a subset of returned properties", createProps.entrySet(), Sets.intersection(createProps.entrySet(), props.entrySet())); } @@ -219,8 +215,10 @@ public void testLoadNamespaceMetadata() { Assert.assertFalse("Namespace should not exist", 
catalog.namespaceExists(NS)); - AssertHelpers.assertThrows("Should fail to load nonexistent namespace metadata", - NoSuchNamespaceException.class, "newdb", + AssertHelpers.assertThrows( + "Should fail to load nonexistent namespace metadata", + NoSuchNamespaceException.class, + "newdb", () -> catalog.loadNamespaceMetadata(NS)); catalog.createNamespace(NS); @@ -243,7 +241,8 @@ public void testSetNamespaceProperties() { catalog.setProperties(NS, properties); Map actualProperties = catalog.loadNamespaceMetadata(NS); - Assert.assertEquals("Set properties should be a subset of returned properties", + Assert.assertEquals( + "Set properties should be a subset of returned properties", properties.entrySet(), Sets.intersection(properties.entrySet(), actualProperties.entrySet())); } @@ -260,7 +259,8 @@ public void testUpdateNamespaceProperties() { catalog.setProperties(NS, initialProperties); Map actualProperties = catalog.loadNamespaceMetadata(NS); - Assert.assertEquals("Set properties should be a subset of returned properties", + Assert.assertEquals( + "Set properties should be a subset of returned properties", initialProperties.entrySet(), Sets.intersection(initialProperties.entrySet(), actualProperties.entrySet())); @@ -269,7 +269,8 @@ public void testUpdateNamespaceProperties() { catalog.setProperties(NS, updatedProperties); Map finalProperties = catalog.loadNamespaceMetadata(NS); - Assert.assertEquals("Updated properties should be a subset of returned properties", + Assert.assertEquals( + "Updated properties should be a subset of returned properties", updatedProperties.entrySet(), Sets.intersection(updatedProperties.entrySet(), finalProperties.entrySet())); } @@ -286,16 +287,19 @@ public void testUpdateAndSetNamespaceProperties() { catalog.setProperties(NS, initialProperties); Map actualProperties = catalog.loadNamespaceMetadata(NS); - Assert.assertEquals("Set properties should be a subset of returned properties", + Assert.assertEquals( + "Set properties should be a subset of returned properties", initialProperties.entrySet(), Sets.intersection(initialProperties.entrySet(), actualProperties.entrySet())); - Map updatedProperties = ImmutableMap.of("owner", "newuser", "last-modified-at", "now"); + Map updatedProperties = + ImmutableMap.of("owner", "newuser", "last-modified-at", "now"); catalog.setProperties(NS, updatedProperties); Map finalProperties = catalog.loadNamespaceMetadata(NS); - Assert.assertEquals("Updated properties should be a subset of returned properties", + Assert.assertEquals( + "Updated properties should be a subset of returned properties", updatedProperties.entrySet(), Sets.intersection(updatedProperties.entrySet(), finalProperties.entrySet())); } @@ -306,8 +310,10 @@ public void testSetNamespacePropertiesNamespaceDoesNotExist() { C catalog = catalog(); - AssertHelpers.assertThrows("setProperties should fail if the namespace does not exist", - NoSuchNamespaceException.class, "does not exist", + AssertHelpers.assertThrows( + "setProperties should fail if the namespace does not exist", + NoSuchNamespaceException.class, + "does not exist", () -> catalog.setProperties(NS, ImmutableMap.of("test", "value"))); } @@ -324,8 +330,10 @@ public void testRemoveNamespaceProperties() { catalog.removeProperties(NS, ImmutableSet.of("created-at")); Map actualProperties = catalog.loadNamespaceMetadata(NS); - Assert.assertFalse("Should not contain deleted property key", actualProperties.containsKey("created-at")); - Assert.assertEquals("Expected properties should be a subset of returned properties", 
+ Assert.assertFalse( + "Should not contain deleted property key", actualProperties.containsKey("created-at")); + Assert.assertEquals( + "Expected properties should be a subset of returned properties", ImmutableMap.of("owner", "user").entrySet(), Sets.intersection(properties.entrySet(), actualProperties.entrySet())); } @@ -336,8 +344,10 @@ public void testRemoveNamespacePropertiesNamespaceDoesNotExist() { C catalog = catalog(); - AssertHelpers.assertThrows("setProperties should fail if the namespace does not exist", - NoSuchNamespaceException.class, "does not exist", + AssertHelpers.assertThrows( + "setProperties should fail if the namespace does not exist", + NoSuchNamespaceException.class, + "does not exist", () -> catalog.removeProperties(NS, ImmutableSet.of("a", "b"))); } @@ -350,7 +360,8 @@ public void testDropNamespace() { catalog.createNamespace(NS); Assert.assertTrue("Namespace should exist", catalog.namespaceExists(NS)); - Assert.assertTrue("Dropping an existing namespace should return true", catalog.dropNamespace(NS)); + Assert.assertTrue( + "Dropping an existing namespace should return true", catalog.dropNamespace(NS)); Assert.assertFalse("Namespace should not exist", catalog.namespaceExists(NS)); } @@ -358,7 +369,8 @@ public void testDropNamespace() { public void testDropNonexistentNamespace() { C catalog = catalog(); - Assert.assertFalse("Dropping a nonexistent namespace should return false", catalog.dropNamespace(NS)); + Assert.assertFalse( + "Dropping a nonexistent namespace should return false", catalog.dropNamespace(NS)); } @Test @@ -372,26 +384,28 @@ public void testListNamespaces() { catalog.createNamespace(ns1); Assertions.assertThat(catalog.listNamespaces()) - .withFailMessage("Should include newdb_1") - .hasSameElementsAs(concat(starting, ns1)); + .withFailMessage("Should include newdb_1") + .hasSameElementsAs(concat(starting, ns1)); catalog.createNamespace(ns2); Assertions.assertThat(catalog.listNamespaces()) - .withFailMessage("Should include newdb_1 and newdb_2") - .hasSameElementsAs(concat(starting, ns1, ns2)); + .withFailMessage("Should include newdb_1 and newdb_2") + .hasSameElementsAs(concat(starting, ns1, ns2)); catalog.dropNamespace(ns1); Assertions.assertThat(catalog.listNamespaces()) - .withFailMessage("Should include newdb_2, not newdb_1") - .hasSameElementsAs(concat(starting, ns2)); + .withFailMessage("Should include newdb_2, not newdb_1") + .hasSameElementsAs(concat(starting, ns2)); catalog.dropNamespace(ns2); - Assert.assertTrue("Should include only starting namespaces", catalog.listNamespaces().containsAll(starting)); + Assert.assertTrue( + "Should include only starting namespaces", catalog.listNamespaces().containsAll(starting)); } @Test public void testListNestedNamespaces() { - Assume.assumeTrue("Only valid when the catalog supports nested namespaces", supportsNestedNamespaces()); + Assume.assumeTrue( + "Only valid when the catalog supports nested namespaces", supportsNestedNamespaces()); C catalog = catalog(); @@ -486,9 +500,12 @@ public void testBasicCreateTable() { Assert.assertTrue("Table should exist", catalog.tableExists(ident)); // validate table settings - Assert.assertEquals("Table name should report its full name", catalog.name() + "." + ident, table.name()); - Assert.assertEquals("Schema should match expected ID assignment", - TABLE_SCHEMA.asStruct(), table.schema().asStruct()); + Assert.assertEquals( + "Table name should report its full name", catalog.name() + "." 
+ ident, table.name()); + Assert.assertEquals( + "Schema should match expected ID assignment", + TABLE_SCHEMA.asStruct(), + table.schema().asStruct()); Assert.assertNotNull("Should have a location", table.location()); Assert.assertTrue("Should be unpartitioned", table.spec().isUnpartitioned()); Assert.assertTrue("Should be unsorted", table.sortOrder().isUnsorted()); @@ -512,8 +529,10 @@ public void testTableNameWithSlash() { Assert.assertTrue("Table should exist", catalog.tableExists(ident)); Table loaded = catalog.loadTable(ident); - Assert.assertEquals("Schema should match expected ID assignment", - TABLE_SCHEMA.asStruct(), loaded.schema().asStruct()); + Assert.assertEquals( + "Schema should match expected ID assignment", + TABLE_SCHEMA.asStruct(), + loaded.schema().asStruct()); catalog.dropTable(ident); @@ -535,8 +554,10 @@ public void testTableNameWithDot() { Assert.assertTrue("Table should exist", catalog.tableExists(ident)); Table loaded = catalog.loadTable(ident); - Assert.assertEquals("Schema should match expected ID assignment", - TABLE_SCHEMA.asStruct(), loaded.schema().asStruct()); + Assert.assertEquals( + "Schema should match expected ID assignment", + TABLE_SCHEMA.asStruct(), + loaded.schema().asStruct()); catalog.dropTable(ident); @@ -554,13 +575,17 @@ public void testBasicCreateTableThatAlreadyExists() { catalog.buildTable(ident, SCHEMA).create(); Assert.assertTrue("Table should exist", catalog.tableExists(ident)); - AssertHelpers.assertThrows("Should fail to create a table that already exists", - AlreadyExistsException.class, "ns.table", + AssertHelpers.assertThrows( + "Should fail to create a table that already exists", + AlreadyExistsException.class, + "ns.table", () -> catalog.buildTable(ident, OTHER_SCHEMA).create()); Table table = catalog.loadTable(ident); - Assert.assertEquals("Schema should match original table schema", - TABLE_SCHEMA.asStruct(), table.schema().asStruct()); + Assert.assertEquals( + "Schema should match original table schema", + TABLE_SCHEMA.asStruct(), + table.schema().asStruct()); } @Test @@ -571,23 +596,30 @@ public void testCompleteCreateTable() { Assert.assertFalse("Table should not exist", catalog.tableExists(ident)); - Map properties = ImmutableMap.of("user", "someone", "created-at", "2022-02-25T00:38:19"); - Table table = catalog.buildTable(ident, SCHEMA) - .withLocation("file:/tmp/ns/table") - .withPartitionSpec(SPEC) - .withSortOrder(WRITE_ORDER) - .withProperties(properties) - .create(); + Map properties = + ImmutableMap.of("user", "someone", "created-at", "2022-02-25T00:38:19"); + Table table = + catalog + .buildTable(ident, SCHEMA) + .withLocation("file:/tmp/ns/table") + .withPartitionSpec(SPEC) + .withSortOrder(WRITE_ORDER) + .withProperties(properties) + .create(); // validate table settings - Assert.assertEquals("Table name should report its full name", catalog.name() + "." + ident, table.name()); + Assert.assertEquals( + "Table name should report its full name", catalog.name() + "." 
+ ident, table.name()); Assert.assertTrue("Table should exist", catalog.tableExists(ident)); - Assert.assertEquals("Schema should match expected ID assignment", - TABLE_SCHEMA.asStruct(), table.schema().asStruct()); + Assert.assertEquals( + "Schema should match expected ID assignment", + TABLE_SCHEMA.asStruct(), + table.schema().asStruct()); Assert.assertNotNull("Should have a location", table.location()); Assert.assertEquals("Should use requested partition spec", TABLE_SPEC, table.spec()); Assert.assertEquals("Should use requested write order", TABLE_WRITE_ORDER, table.sortOrder()); - Assert.assertEquals("Table properties should be a superset of the requested properties", + Assert.assertEquals( + "Table properties should be a superset of the requested properties", properties.entrySet(), Sets.intersection(properties.entrySet(), table.properties().entrySet())); } @@ -600,8 +632,10 @@ public void testLoadTable() { Assert.assertFalse("Table should not exist", catalog.tableExists(ident)); - Map properties = ImmutableMap.of("user", "someone", "created-at", "2022-02-25T00:38:19"); - catalog.buildTable(ident, SCHEMA) + Map properties = + ImmutableMap.of("user", "someone", "created-at", "2022-02-25T00:38:19"); + catalog + .buildTable(ident, SCHEMA) .withLocation("file:/tmp/ns/table") .withPartitionSpec(SPEC) .withSortOrder(WRITE_ORDER) @@ -611,14 +645,18 @@ public void testLoadTable() { Table table = catalog.loadTable(ident); // validate table settings - Assert.assertEquals("Table name should report its full name", catalog.name() + "." + ident, table.name()); + Assert.assertEquals( + "Table name should report its full name", catalog.name() + "." + ident, table.name()); Assert.assertTrue("Table should exist", catalog.tableExists(ident)); - Assert.assertEquals("Schema should match expected ID assignment", - TABLE_SCHEMA.asStruct(), table.schema().asStruct()); + Assert.assertEquals( + "Schema should match expected ID assignment", + TABLE_SCHEMA.asStruct(), + table.schema().asStruct()); Assert.assertNotNull("Should have a location", table.location()); Assert.assertEquals("Should use requested partition spec", TABLE_SPEC, table.spec()); Assert.assertEquals("Should use requested write order", TABLE_WRITE_ORDER, table.sortOrder()); - Assert.assertEquals("Table properties should be a superset of the requested properties", + Assert.assertEquals( + "Table properties should be a superset of the requested properties", properties.entrySet(), Sets.intersection(properties.entrySet(), table.properties().entrySet())); } @@ -649,8 +687,10 @@ public void testLoadMissingTable() { TableIdentifier ident = TableIdentifier.of("ns", "table"); Assert.assertFalse("Table should not exist", catalog.tableExists(ident)); - AssertHelpers.assertThrows("Should fail to load a nonexistent table", - NoSuchTableException.class, ident.toString(), + AssertHelpers.assertThrows( + "Should fail to load a nonexistent table", + NoSuchTableException.class, + ident.toString(), () -> catalog.loadTable(ident)); } @@ -667,7 +707,8 @@ public void testRenameTable() { catalog.buildTable(TABLE, SCHEMA).create(); Assert.assertTrue("Table should exist after create", catalog.tableExists(TABLE)); - Assert.assertFalse("Destination table should not exist before rename", catalog.tableExists(RENAMED_TABLE)); + Assert.assertFalse( + "Destination table should not exist before rename", catalog.tableExists(RENAMED_TABLE)); catalog.renameTable(TABLE, RENAMED_TABLE); Assert.assertTrue("Table should exist with new name", catalog.tableExists(RENAMED_TABLE)); @@ 
-686,14 +727,18 @@ public void testRenameTableMissingSourceTable() { } Assert.assertFalse("Source table should not exist before rename", catalog.tableExists(TABLE)); - Assert.assertFalse("Destination table should not exist before rename", catalog.tableExists(RENAMED_TABLE)); + Assert.assertFalse( + "Destination table should not exist before rename", catalog.tableExists(RENAMED_TABLE)); - AssertHelpers.assertThrows("Should reject renaming a table that does not exist", + AssertHelpers.assertThrows( + "Should reject renaming a table that does not exist", NoSuchTableException.class, "Table does not exist", () -> catalog.renameTable(TABLE, RENAMED_TABLE)); - Assert.assertFalse("Destination table should not exist after failed rename", catalog.tableExists(RENAMED_TABLE)); + Assert.assertFalse( + "Destination table should not exist after failed rename", + catalog.tableExists(RENAMED_TABLE)); } @Test @@ -708,22 +753,32 @@ public void testRenameTableDestinationTableAlreadyExists() { catalog.buildTable(TABLE, SCHEMA).create(); Assert.assertTrue("Source table should exist after create", catalog.tableExists(TABLE)); - Assert.assertFalse("Destination table should not exist before create", catalog.tableExists(RENAMED_TABLE)); + Assert.assertFalse( + "Destination table should not exist before create", catalog.tableExists(RENAMED_TABLE)); catalog.buildTable(RENAMED_TABLE, SCHEMA).create(); - Assert.assertTrue("Destination table should exist after create", catalog.tableExists(RENAMED_TABLE)); + Assert.assertTrue( + "Destination table should exist after create", catalog.tableExists(RENAMED_TABLE)); - AssertHelpers.assertThrows("Should reject renaming a table if the new name already exists", + AssertHelpers.assertThrows( + "Should reject renaming a table if the new name already exists", AlreadyExistsException.class, "Table already exists", () -> catalog.renameTable(TABLE, RENAMED_TABLE)); - Assert.assertTrue("Source table should still exist after failed rename", catalog.tableExists(TABLE)); - Assert.assertTrue("Destination table should still exist after failed rename", catalog.tableExists(RENAMED_TABLE)); + Assert.assertTrue( + "Source table should still exist after failed rename", catalog.tableExists(TABLE)); + Assert.assertTrue( + "Destination table should still exist after failed rename", + catalog.tableExists(RENAMED_TABLE)); - String sourceTableUUID = ((HasTableOperations) catalog.loadTable(TABLE)).operations().current().uuid(); - String destinationTableUUID = ((HasTableOperations) catalog.loadTable(RENAMED_TABLE)).operations().current().uuid(); - Assert.assertNotEquals("Source and destination table should remain distinct after failed rename", - sourceTableUUID, destinationTableUUID); + String sourceTableUUID = + ((HasTableOperations) catalog.loadTable(TABLE)).operations().current().uuid(); + String destinationTableUUID = + ((HasTableOperations) catalog.loadTable(RENAMED_TABLE)).operations().current().uuid(); + Assert.assertNotEquals( + "Source and destination table should remain distinct after failed rename", + sourceTableUUID, + destinationTableUUID); } @Test @@ -754,7 +809,8 @@ public void testDropMissingTable() { TableIdentifier noSuchTableIdent = TableIdentifier.of(NS, "notable"); Assert.assertFalse("Table should not exist", catalog.tableExists(noSuchTableIdent)); - Assert.assertFalse("Should not drop a table that does not exist", catalog.dropTable(noSuchTableIdent)); + Assert.assertFalse( + "Should not drop a table that does not exist", catalog.dropTable(noSuchTableIdent)); } @Test @@ -778,34 
+834,50 @@ public void testListTables() { catalog.buildTable(ns1Table1, SCHEMA).create(); - Assert.assertEquals("Should contain ns_1.table_1 after create", - ImmutableSet.of(ns1Table1), Sets.newHashSet(catalog.listTables(ns1))); + Assert.assertEquals( + "Should contain ns_1.table_1 after create", + ImmutableSet.of(ns1Table1), + Sets.newHashSet(catalog.listTables(ns1))); catalog.buildTable(ns2Table1, SCHEMA).create(); - Assert.assertEquals("Should contain ns_2.table_1 after create", - ImmutableSet.of(ns2Table1), Sets.newHashSet(catalog.listTables(ns2))); - Assert.assertEquals("Should not show changes to ns_2 in ns_1", - ImmutableSet.of(ns1Table1), Sets.newHashSet(catalog.listTables(ns1))); + Assert.assertEquals( + "Should contain ns_2.table_1 after create", + ImmutableSet.of(ns2Table1), + Sets.newHashSet(catalog.listTables(ns2))); + Assert.assertEquals( + "Should not show changes to ns_2 in ns_1", + ImmutableSet.of(ns1Table1), + Sets.newHashSet(catalog.listTables(ns1))); catalog.buildTable(ns1Table2, SCHEMA).create(); - Assert.assertEquals("Should not show changes to ns_1 in ns_2", - ImmutableSet.of(ns2Table1), Sets.newHashSet(catalog.listTables(ns2))); - Assert.assertEquals("Should contain ns_1.table_2 after create", - ImmutableSet.of(ns1Table1, ns1Table2), Sets.newHashSet(catalog.listTables(ns1))); + Assert.assertEquals( + "Should not show changes to ns_1 in ns_2", + ImmutableSet.of(ns2Table1), + Sets.newHashSet(catalog.listTables(ns2))); + Assert.assertEquals( + "Should contain ns_1.table_2 after create", + ImmutableSet.of(ns1Table1, ns1Table2), + Sets.newHashSet(catalog.listTables(ns1))); catalog.dropTable(ns1Table1); - Assert.assertEquals("Should not show changes to ns_1 in ns_2", - ImmutableSet.of(ns2Table1), Sets.newHashSet(catalog.listTables(ns2))); - Assert.assertEquals("Should not contain ns_1.table_1 after drop", - ImmutableSet.of(ns1Table2), Sets.newHashSet(catalog.listTables(ns1))); + Assert.assertEquals( + "Should not show changes to ns_1 in ns_2", + ImmutableSet.of(ns2Table1), + Sets.newHashSet(catalog.listTables(ns2))); + Assert.assertEquals( + "Should not contain ns_1.table_1 after drop", + ImmutableSet.of(ns1Table2), + Sets.newHashSet(catalog.listTables(ns1))); catalog.dropTable(ns1Table2); - Assert.assertEquals("Should not show changes to ns_1 in ns_2", - ImmutableSet.of(ns2Table1), Sets.newHashSet(catalog.listTables(ns2))); + Assert.assertEquals( + "Should not show changes to ns_1 in ns_2", + ImmutableSet.of(ns2Table1), + Sets.newHashSet(catalog.listTables(ns2))); assertEmpty("Should not contain ns_1.table_2 after drop", catalog, ns1); catalog.dropTable(ns2Table1); @@ -817,8 +889,7 @@ public void testUpdateTableSchema() { C catalog = catalog(); Table table = catalog.buildTable(TABLE, SCHEMA).create(); - UpdateSchema update = table.updateSchema() - .addColumn("new_col", Types.LongType.get()); + UpdateSchema update = table.updateSchema().addColumn("new_col", Types.LongType.get()); Schema expected = update.apply(); @@ -826,7 +897,10 @@ public void testUpdateTableSchema() { Table loaded = catalog.loadTable(TABLE); - Assert.assertEquals("Loaded table should have expected schema", expected.asStruct(), loaded.schema().asStruct()); + Assert.assertEquals( + "Loaded table should have expected schema", + expected.asStruct(), + loaded.schema().asStruct()); } @Test @@ -834,42 +908,49 @@ public void testUUIDValidation() { C catalog = catalog(); Table table = catalog.buildTable(TABLE, SCHEMA).create(); - UpdateSchema update = table.updateSchema() - .addColumn("new_col", 
Types.LongType.get()); + UpdateSchema update = table.updateSchema().addColumn("new_col", Types.LongType.get()); Assert.assertTrue("Should successfully drop table", catalog.dropTable(TABLE)); catalog.buildTable(TABLE, OTHER_SCHEMA).create(); - String expectedMessage = supportsServerSideRetry() ? "Requirement failed: UUID does not match" : "Cannot commit"; - AssertHelpers.assertThrows("Should reject changes to tables that have been dropped and recreated", - CommitFailedException.class, expectedMessage, update::commit); + String expectedMessage = + supportsServerSideRetry() ? "Requirement failed: UUID does not match" : "Cannot commit"; + AssertHelpers.assertThrows( + "Should reject changes to tables that have been dropped and recreated", + CommitFailedException.class, + expectedMessage, + update::commit); Table loaded = catalog.loadTable(TABLE); - Assert.assertEquals("Loaded table should have expected schema", - OTHER_SCHEMA.asStruct(), loaded.schema().asStruct()); + Assert.assertEquals( + "Loaded table should have expected schema", + OTHER_SCHEMA.asStruct(), + loaded.schema().asStruct()); } @Test public void testUpdateTableSchemaServerSideRetry() { - Assume.assumeTrue("Schema update recovery is only supported with server-side retry", supportsServerSideRetry()); + Assume.assumeTrue( + "Schema update recovery is only supported with server-side retry", + supportsServerSideRetry()); C catalog = catalog(); Table table = catalog.buildTable(TABLE, SCHEMA).create(); - UpdateSchema update = table.updateSchema() - .addColumn("new_col", Types.LongType.get()); + UpdateSchema update = table.updateSchema().addColumn("new_col", Types.LongType.get()); Schema expected = update.apply(); // update the spec concurrently so that the first update fails, but can succeed on retry - catalog.loadTable(TABLE).updateSpec() - .addField("shard", Expressions.bucket("id", 16)) - .commit(); + catalog.loadTable(TABLE).updateSpec().addField("shard", Expressions.bucket("id", 16)).commit(); // commit the original update update.commit(); Table loaded = catalog.loadTable(TABLE); - Assert.assertEquals("Loaded table should have expected schema", expected.asStruct(), loaded.schema().asStruct()); + Assert.assertEquals( + "Loaded table should have expected schema", + expected.asStruct(), + loaded.schema().asStruct()); } @Test @@ -878,23 +959,27 @@ public void testUpdateTableSchemaConflict() { Table table = catalog.buildTable(TABLE, SCHEMA).create(); - UpdateSchema update = table.updateSchema() - .addColumn("new_col", Types.LongType.get()); + UpdateSchema update = table.updateSchema().addColumn("new_col", Types.LongType.get()); // update the schema concurrently so that the original update fails - UpdateSchema concurrent = catalog.loadTable(TABLE).updateSchema() - .deleteColumn("data"); + UpdateSchema concurrent = catalog.loadTable(TABLE).updateSchema().deleteColumn("data"); Schema expected = concurrent.apply(); concurrent.commit(); // attempt to commit the original update - String expectedMessage = supportsServerSideRetry() ? - "Requirement failed: current schema changed" : "Cannot commit"; - AssertHelpers.assertThrows("Second schema update commit should fail because of a conflict", - CommitFailedException.class, expectedMessage, update::commit); + String expectedMessage = + supportsServerSideRetry() ? 
"Requirement failed: current schema changed" : "Cannot commit"; + AssertHelpers.assertThrows( + "Second schema update commit should fail because of a conflict", + CommitFailedException.class, + expectedMessage, + update::commit); Table loaded = catalog.loadTable(TABLE); - Assert.assertEquals("Loaded table should have expected schema", expected.asStruct(), loaded.schema().asStruct()); + Assert.assertEquals( + "Loaded table should have expected schema", + expected.asStruct(), + loaded.schema().asStruct()); } @Test @@ -903,23 +988,30 @@ public void testUpdateTableSchemaAssignmentConflict() { Table table = catalog.buildTable(TABLE, SCHEMA).create(); - UpdateSchema update = table.updateSchema() - .addColumn("new_col", Types.LongType.get()); + UpdateSchema update = table.updateSchema().addColumn("new_col", Types.LongType.get()); // update the schema concurrently so that the original update fails - UpdateSchema concurrent = catalog.loadTable(TABLE).updateSchema() - .addColumn("another_col", Types.StringType.get()); + UpdateSchema concurrent = + catalog.loadTable(TABLE).updateSchema().addColumn("another_col", Types.StringType.get()); Schema expected = concurrent.apply(); concurrent.commit(); // attempt to commit the original update - String expectedMessage = supportsServerSideRetry() ? - "Requirement failed: last assigned field id changed" : "Cannot commit"; - AssertHelpers.assertThrows("Second schema update commit should fail because of a conflict", - CommitFailedException.class, expectedMessage, update::commit); + String expectedMessage = + supportsServerSideRetry() + ? "Requirement failed: last assigned field id changed" + : "Cannot commit"; + AssertHelpers.assertThrows( + "Second schema update commit should fail because of a conflict", + CommitFailedException.class, + expectedMessage, + update::commit); Table loaded = catalog.loadTable(TABLE); - Assert.assertEquals("Loaded table should have expected schema", expected.asStruct(), loaded.schema().asStruct()); + Assert.assertEquals( + "Loaded table should have expected schema", + expected.asStruct(), + loaded.schema().asStruct()); } @Test @@ -928,19 +1020,19 @@ public void testUpdateTableSchemaThenRevert() { Table table = catalog.buildTable(TABLE, SCHEMA).create(); - table.updateSchema() + table + .updateSchema() .addColumn("col1", Types.StringType.get()) .addColumn("col2", Types.StringType.get()) .addColumn("col3", Types.StringType.get()) .commit(); - table.updateSchema() - .deleteColumn("col1") - .deleteColumn("col2") - .deleteColumn("col3") - .commit(); + table.updateSchema().deleteColumn("col1").deleteColumn("col2").deleteColumn("col3").commit(); - Assert.assertEquals("Loaded table should have expected schema", TABLE_SCHEMA.asStruct(), table.schema().asStruct()); + Assert.assertEquals( + "Loaded table should have expected schema", + TABLE_SCHEMA.asStruct(), + table.schema().asStruct()); } @Test @@ -948,8 +1040,7 @@ public void testUpdateTableSpec() { C catalog = catalog(); Table table = catalog.buildTable(TABLE, SCHEMA).create(); - UpdatePartitionSpec update = table.updateSpec() - .addField("shard", Expressions.bucket("id", 16)); + UpdatePartitionSpec update = table.updateSpec().addField("shard", Expressions.bucket("id", 16)); PartitionSpec expected = update.apply(); @@ -958,22 +1049,25 @@ public void testUpdateTableSpec() { Table loaded = catalog.loadTable(TABLE); // the spec ID may not match, so check equality of the fields - Assert.assertEquals("Loaded table should have expected spec", expected.fields(), loaded.spec().fields()); + 
Assert.assertEquals( + "Loaded table should have expected spec", expected.fields(), loaded.spec().fields()); } @Test public void testUpdateTableSpecServerSideRetry() { - Assume.assumeTrue("Spec update recovery is only supported with server-side retry", supportsServerSideRetry()); + Assume.assumeTrue( + "Spec update recovery is only supported with server-side retry", supportsServerSideRetry()); C catalog = catalog(); Table table = catalog.buildTable(TABLE, SCHEMA).create(); - UpdatePartitionSpec update = table.updateSpec() - .addField("shard", Expressions.bucket("id", 16)); + UpdatePartitionSpec update = table.updateSpec().addField("shard", Expressions.bucket("id", 16)); PartitionSpec expected = update.apply(); // update the schema concurrently so that the first update fails, but can succeed on retry - catalog.loadTable(TABLE).updateSchema() + catalog + .loadTable(TABLE) + .updateSchema() .addColumn("another_col", Types.StringType.get()) .commit(); @@ -983,7 +1077,8 @@ public void testUpdateTableSpecServerSideRetry() { Table loaded = catalog.loadTable(TABLE); // the spec ID may not match, so check equality of the fields - Assert.assertEquals("Loaded table should have expected spec", expected.fields(), loaded.spec().fields()); + Assert.assertEquals( + "Loaded table should have expected spec", expected.fields(), loaded.spec().fields()); } @Test @@ -992,25 +1087,31 @@ public void testUpdateTableSpecConflict() { Table table = catalog.buildTable(TABLE, SCHEMA).withPartitionSpec(SPEC).create(); - UpdatePartitionSpec update = table.updateSpec() - .addField("shard", Expressions.bucket("data", 16)); + UpdatePartitionSpec update = + table.updateSpec().addField("shard", Expressions.bucket("data", 16)); // update the spec concurrently so that the original update fails - UpdatePartitionSpec concurrent = catalog.loadTable(TABLE).updateSpec() - .removeField(Expressions.bucket("id", 16)); + UpdatePartitionSpec concurrent = + catalog.loadTable(TABLE).updateSpec().removeField(Expressions.bucket("id", 16)); PartitionSpec expected = concurrent.apply(); concurrent.commit(); // attempt to commit the original update - String expectedMessage = supportsServerSideRetry() ? - "Requirement failed: default partition spec changed" : "Cannot commit"; - AssertHelpers.assertThrows("Second partition spec update commit should fail because of a conflict", - CommitFailedException.class, expectedMessage, update::commit); + String expectedMessage = + supportsServerSideRetry() + ? 
"Requirement failed: default partition spec changed" + : "Cannot commit"; + AssertHelpers.assertThrows( + "Second partition spec update commit should fail because of a conflict", + CommitFailedException.class, + expectedMessage, + update::commit); Table loaded = catalog.loadTable(TABLE); // the spec ID may not match, so check equality of the fields - Assert.assertEquals("Loaded table should have expected spec", expected.fields(), loaded.spec().fields()); + Assert.assertEquals( + "Loaded table should have expected spec", expected.fields(), loaded.spec().fields()); } @Test @@ -1019,45 +1120,50 @@ public void testUpdateTableAssignmentSpecConflict() { Table table = catalog.buildTable(TABLE, SCHEMA).create(); - UpdatePartitionSpec update = table.updateSpec() - .addField("shard", Expressions.bucket("id", 16)); + UpdatePartitionSpec update = table.updateSpec().addField("shard", Expressions.bucket("id", 16)); // update the spec concurrently so that the original update fails - UpdatePartitionSpec concurrent = catalog.loadTable(TABLE).updateSpec() - .addField("shard", Expressions.truncate("id", 100)); + UpdatePartitionSpec concurrent = + catalog.loadTable(TABLE).updateSpec().addField("shard", Expressions.truncate("id", 100)); PartitionSpec expected = concurrent.apply(); concurrent.commit(); // attempt to commit the original update - String expectedMessage = supportsServerSideRetry() ? - "Requirement failed: last assigned partition id changed" : "Cannot commit"; - AssertHelpers.assertThrows("Second partition spec update commit should fail because of a conflict", - CommitFailedException.class, expectedMessage, update::commit); + String expectedMessage = + supportsServerSideRetry() + ? "Requirement failed: last assigned partition id changed" + : "Cannot commit"; + AssertHelpers.assertThrows( + "Second partition spec update commit should fail because of a conflict", + CommitFailedException.class, + expectedMessage, + update::commit); Table loaded = catalog.loadTable(TABLE); // the spec ID may not match, so check equality of the fields - Assert.assertEquals("Loaded table should have expected spec", expected.fields(), loaded.spec().fields()); + Assert.assertEquals( + "Loaded table should have expected spec", expected.fields(), loaded.spec().fields()); } @Test public void testUpdateTableSpecThenRevert() { C catalog = catalog(); - // create a v2 table. otherwise the spec update would produce a different spec with a void partition field - Table table = catalog.buildTable(TABLE, SCHEMA) - .withPartitionSpec(SPEC) - .withProperty("format-version", "2") - .create(); - Assert.assertEquals("Should be a v2 table", 2, ((BaseTable) table).operations().current().formatVersion()); + // create a v2 table. 
otherwise the spec update would produce a different spec with a void + // partition field + Table table = + catalog + .buildTable(TABLE, SCHEMA) + .withPartitionSpec(SPEC) + .withProperty("format-version", "2") + .create(); + Assert.assertEquals( + "Should be a v2 table", 2, ((BaseTable) table).operations().current().formatVersion()); - table.updateSpec() - .addField("id") - .commit(); + table.updateSpec().addField("id").commit(); - table.updateSpec() - .removeField("id") - .commit(); + table.updateSpec().removeField("id").commit(); Assert.assertEquals("Loaded table should have expected spec", TABLE_SPEC, table.spec()); } @@ -1067,9 +1173,7 @@ public void testUpdateTableSortOrder() { C catalog = catalog(); Table table = catalog.buildTable(TABLE, SCHEMA).create(); - ReplaceSortOrder update = table.replaceSortOrder() - .asc(Expressions.bucket("id", 16)) - .asc("id"); + ReplaceSortOrder update = table.replaceSortOrder().asc(Expressions.bucket("id", 16)).asc("id"); SortOrder expected = update.apply(); @@ -1078,23 +1182,26 @@ public void testUpdateTableSortOrder() { Table loaded = catalog.loadTable(TABLE); // the sort order ID may not match, so check equality of the fields - Assert.assertEquals("Loaded table should have expected order", expected.fields(), loaded.sortOrder().fields()); + Assert.assertEquals( + "Loaded table should have expected order", expected.fields(), loaded.sortOrder().fields()); } @Test public void testUpdateTableSortOrderServerSideRetry() { - Assume.assumeTrue("Sort order update recovery is only supported with server-side retry", supportsServerSideRetry()); + Assume.assumeTrue( + "Sort order update recovery is only supported with server-side retry", + supportsServerSideRetry()); C catalog = catalog(); Table table = catalog.buildTable(TABLE, SCHEMA).create(); - ReplaceSortOrder update = table.replaceSortOrder() - .asc(Expressions.bucket("id", 16)) - .asc("id"); + ReplaceSortOrder update = table.replaceSortOrder().asc(Expressions.bucket("id", 16)).asc("id"); SortOrder expected = update.apply(); // update the schema concurrently so that the first update fails, but can succeed on retry - catalog.loadTable(TABLE).updateSchema() + catalog + .loadTable(TABLE) + .updateSchema() .addColumn("another_col", Types.StringType.get()) .commit(); @@ -1104,36 +1211,29 @@ public void testUpdateTableSortOrderServerSideRetry() { Table loaded = catalog.loadTable(TABLE); // the sort order ID may not match, so check equality of the fields - Assert.assertEquals("Loaded table should have expected order", expected.fields(), loaded.sortOrder().fields()); + Assert.assertEquals( + "Loaded table should have expected order", expected.fields(), loaded.sortOrder().fields()); } @Test public void testUpdateTableOrderThenRevert() { C catalog = catalog(); - Table table = catalog.buildTable(TABLE, SCHEMA) - .withSortOrder(WRITE_ORDER) - .create(); + Table table = catalog.buildTable(TABLE, SCHEMA).withSortOrder(WRITE_ORDER).create(); - table.replaceSortOrder() - .asc("id") - .commit(); + table.replaceSortOrder().asc("id").commit(); - table.replaceSortOrder() - .asc(Expressions.bucket("id", 16)) - .asc("id") - .commit(); + table.replaceSortOrder().asc(Expressions.bucket("id", 16)).asc("id").commit(); - Assert.assertEquals("Loaded table should have expected order", TABLE_WRITE_ORDER, table.sortOrder()); + Assert.assertEquals( + "Loaded table should have expected order", TABLE_WRITE_ORDER, table.sortOrder()); } @Test public void testAppend() throws IOException { C catalog = catalog(); - Table table = 
catalog.buildTable(TABLE, SCHEMA) - .withPartitionSpec(SPEC) - .create(); + Table table = catalog.buildTable(TABLE, SCHEMA).withPartitionSpec(SPEC).create(); try (CloseableIterable tasks = table.newScan().planFiles()) { Assert.assertFalse("Should contain no files", tasks.iterator().hasNext()); @@ -1148,9 +1248,7 @@ public void testAppend() throws IOException { public void testConcurrentAppendEmptyTable() { C catalog = catalog(); - Table table = catalog.buildTable(TABLE, SCHEMA) - .withPartitionSpec(SPEC) - .create(); + Table table = catalog.buildTable(TABLE, SCHEMA).withPartitionSpec(SPEC).create(); assertNoFiles(table); @@ -1170,9 +1268,7 @@ public void testConcurrentAppendEmptyTable() { public void testConcurrentAppendNonEmptyTable() { C catalog = catalog(); - Table table = catalog.buildTable(TABLE, SCHEMA) - .withPartitionSpec(SPEC) - .create(); + Table table = catalog.buildTable(TABLE, SCHEMA).withPartitionSpec(SPEC).create(); assertNoFiles(table); @@ -1201,13 +1297,13 @@ public void testUpdateTransaction() { Transaction transaction = table.newTransaction(); - UpdateSchema updateSchema = transaction.updateSchema() - .addColumn("new_col", Types.LongType.get()); + UpdateSchema updateSchema = + transaction.updateSchema().addColumn("new_col", Types.LongType.get()); Schema expectedSchema = updateSchema.apply(); updateSchema.commit(); - UpdatePartitionSpec updateSpec = transaction.updateSpec() - .addField("shard", Expressions.bucket("id", 16)); + UpdatePartitionSpec updateSpec = + transaction.updateSpec().addField("shard", Expressions.bucket("id", 16)); PartitionSpec expectedSpec = updateSpec.apply(); updateSpec.commit(); @@ -1215,10 +1311,12 @@ public void testUpdateTransaction() { Table loaded = catalog.loadTable(TABLE); - Assert.assertEquals("Loaded table should have expected schema", - expectedSchema.asStruct(), loaded.schema().asStruct()); - Assert.assertEquals("Loaded table should have expected spec", - expectedSpec.fields(), loaded.spec().fields()); + Assert.assertEquals( + "Loaded table should have expected schema", + expectedSchema.asStruct(), + loaded.schema().asStruct()); + Assert.assertEquals( + "Loaded table should have expected spec", expectedSpec.fields(), loaded.spec().fields()); assertPreviousMetadataFileCount(loaded, 1); } @@ -1229,11 +1327,10 @@ public void testCreateTransaction() { Transaction create = catalog.buildTable(TABLE, SCHEMA).createTransaction(); - Assert.assertFalse("Table should not exist after createTransaction", catalog.tableExists(TABLE)); + Assert.assertFalse( + "Table should not exist after createTransaction", catalog.tableExists(TABLE)); - create.newFastAppend() - .appendFile(FILE_A) - .commit(); + create.newFastAppend().appendFile(FILE_A).commit(); Assert.assertFalse("Table should not exist after append commit", catalog.tableExists(TABLE)); @@ -1249,19 +1346,21 @@ public void testCreateTransaction() { public void testCompleteCreateTransaction() { C catalog = catalog(); - Map properties = ImmutableMap.of("user", "someone", "created-at", "2022-02-25T00:38:19"); - Transaction create = catalog.buildTable(TABLE, SCHEMA) - .withLocation("file:/tmp/ns/table") - .withPartitionSpec(SPEC) - .withSortOrder(WRITE_ORDER) - .withProperties(properties) - .createTransaction(); + Map properties = + ImmutableMap.of("user", "someone", "created-at", "2022-02-25T00:38:19"); + Transaction create = + catalog + .buildTable(TABLE, SCHEMA) + .withLocation("file:/tmp/ns/table") + .withPartitionSpec(SPEC) + .withSortOrder(WRITE_ORDER) + .withProperties(properties) + 
.createTransaction(); - Assert.assertFalse("Table should not exist after createTransaction", catalog.tableExists(TABLE)); + Assert.assertFalse( + "Table should not exist after createTransaction", catalog.tableExists(TABLE)); - create.newFastAppend() - .appendFile(FILE_A) - .commit(); + create.newFastAppend().appendFile(FILE_A).commit(); Assert.assertFalse("Table should not exist after append commit", catalog.tableExists(TABLE)); @@ -1270,15 +1369,21 @@ public void testCompleteCreateTransaction() { Assert.assertTrue("Table should exist after append commit", catalog.tableExists(TABLE)); Table table = catalog.loadTable(TABLE); - Assert.assertEquals("Table schema should match the new schema", - TABLE_SCHEMA.asStruct(), table.schema().asStruct()); - Assert.assertEquals("Table should have create partition spec", TABLE_SPEC.fields(), table.spec().fields()); - Assert.assertEquals("Table should have create sort order", TABLE_WRITE_ORDER, table.sortOrder()); - Assert.assertEquals("Table properties should be a superset of the requested properties", + Assert.assertEquals( + "Table schema should match the new schema", + TABLE_SCHEMA.asStruct(), + table.schema().asStruct()); + Assert.assertEquals( + "Table should have create partition spec", TABLE_SPEC.fields(), table.spec().fields()); + Assert.assertEquals( + "Table should have create sort order", TABLE_WRITE_ORDER, table.sortOrder()); + Assert.assertEquals( + "Table properties should be a superset of the requested properties", properties.entrySet(), Sets.intersection(properties.entrySet(), table.properties().entrySet())); if (!overridesRequestedLocation()) { - Assert.assertEquals("Table location should match requested", "file:/tmp/ns/table", table.location()); + Assert.assertEquals( + "Table location should match requested", "file:/tmp/ns/table", table.location()); } assertFiles(table, FILE_A); assertFilesPartitionSpec(table); @@ -1289,15 +1394,19 @@ public void testCompleteCreateTransaction() { public void testCompleteCreateTransactionMultipleSchemas() { C catalog = catalog(); - Map properties = ImmutableMap.of("user", "someone", "created-at", "2022-02-25T00:38:19"); - Transaction create = catalog.buildTable(TABLE, SCHEMA) - .withLocation("file:/tmp/ns/table") - .withPartitionSpec(SPEC) - .withSortOrder(WRITE_ORDER) - .withProperties(properties) - .createTransaction(); + Map properties = + ImmutableMap.of("user", "someone", "created-at", "2022-02-25T00:38:19"); + Transaction create = + catalog + .buildTable(TABLE, SCHEMA) + .withLocation("file:/tmp/ns/table") + .withPartitionSpec(SPEC) + .withSortOrder(WRITE_ORDER) + .withProperties(properties) + .createTransaction(); - Assert.assertFalse("Table should not exist after createTransaction", catalog.tableExists(TABLE)); + Assert.assertFalse( + "Table should not exist after createTransaction", catalog.tableExists(TABLE)); create.newFastAppend().appendFile(FILE_A).commit(); @@ -1313,12 +1422,13 @@ public void testCompleteCreateTransactionMultipleSchemas() { SortOrder newSortOrder = replaceSortOrder.apply(); replaceSortOrder.commit(); - DataFile anotherFile = DataFiles.builder(newSpec) - .withPath("/path/to/data-b.parquet") - .withFileSizeInBytes(10) - .withPartitionPath("id_bucket=0/new_col=0") // easy way to set partition data for now - .withRecordCount(2) // needs at least one record or else metrics will filter it out - .build(); + DataFile anotherFile = + DataFiles.builder(newSpec) + .withPath("/path/to/data-b.parquet") + .withFileSizeInBytes(10) + .withPartitionPath("id_bucket=0/new_col=0") // easy way 
to set partition data for now + .withRecordCount(2) // needs at least one record or else metrics will filter it out + .build(); create.newFastAppend().appendFile(anotherFile).commit(); @@ -1337,19 +1447,27 @@ public void testCompleteCreateTransactionMultipleSchemas() { final int updateSpecId = initialSpecId + 1; final int updateOrderId = initialOrderId + 1; - Assert.assertEquals("Table schema should match the new schema", - newSchema.asStruct(), table.schema().asStruct()); - Assert.assertEquals("Table schema should match the new schema ID", - updateSchemaId, table.schema().schemaId()); - Assert.assertEquals("Table should have updated partition spec", newSpec.fields(), table.spec().fields()); - Assert.assertEquals("Table should have updated partition spec ID", updateSpecId, table.spec().specId()); - Assert.assertEquals("Table should have updated sort order", newSortOrder.fields(), table.sortOrder().fields()); - Assert.assertEquals("Table should have updated sort order ID", updateOrderId, table.sortOrder().orderId()); - Assert.assertEquals("Table properties should be a superset of the requested properties", + Assert.assertEquals( + "Table schema should match the new schema", + newSchema.asStruct(), + table.schema().asStruct()); + Assert.assertEquals( + "Table schema should match the new schema ID", updateSchemaId, table.schema().schemaId()); + Assert.assertEquals( + "Table should have updated partition spec", newSpec.fields(), table.spec().fields()); + Assert.assertEquals( + "Table should have updated partition spec ID", updateSpecId, table.spec().specId()); + Assert.assertEquals( + "Table should have updated sort order", newSortOrder.fields(), table.sortOrder().fields()); + Assert.assertEquals( + "Table should have updated sort order ID", updateOrderId, table.sortOrder().orderId()); + Assert.assertEquals( + "Table properties should be a superset of the requested properties", properties.entrySet(), Sets.intersection(properties.entrySet(), table.properties().entrySet())); if (!overridesRequestedLocation()) { - Assert.assertEquals("Table location should match requested", "file:/tmp/ns/table", table.location()); + Assert.assertEquals( + "Table location should match requested", "file:/tmp/ns/table", table.location()); } assertFiles(table, FILE_A, anotherFile); assertFilePartitionSpec(table, FILE_A, initialSpecId); @@ -1362,20 +1480,22 @@ public void testCompleteCreateTransactionV2() { C catalog = catalog(); Map properties = - ImmutableMap.of("user", "someone", "created-at", "2022-02-25T00:38:19", "format-version", "2"); + ImmutableMap.of( + "user", "someone", "created-at", "2022-02-25T00:38:19", "format-version", "2"); - Transaction create = catalog.buildTable(TABLE, SCHEMA) + Transaction create = + catalog + .buildTable(TABLE, SCHEMA) .withLocation("file:/tmp/ns/table") .withPartitionSpec(SPEC) .withSortOrder(WRITE_ORDER) .withProperties(properties) .createTransaction(); - Assert.assertFalse("Table should not exist after createTransaction", catalog.tableExists(TABLE)); + Assert.assertFalse( + "Table should not exist after createTransaction", catalog.tableExists(TABLE)); - create.newFastAppend() - .appendFile(FILE_A) - .commit(); + create.newFastAppend().appendFile(FILE_A).commit(); Assert.assertFalse("Table should not exist after append commit", catalog.tableExists(TABLE)); @@ -1387,17 +1507,25 @@ public void testCompleteCreateTransactionV2() { Map expectedProps = Maps.newHashMap(properties); expectedProps.remove("format-version"); - Assert.assertEquals("Table schema should match the new 
schema", - TABLE_SCHEMA.asStruct(), table.schema().asStruct()); - Assert.assertEquals("Table should have create partition spec", TABLE_SPEC.fields(), table.spec().fields()); - Assert.assertEquals("Table should have create sort order", TABLE_WRITE_ORDER, table.sortOrder()); - Assert.assertEquals("Table properties should be a superset of the requested properties", - expectedProps.entrySet(), - Sets.intersection(properties.entrySet(), table.properties().entrySet())); Assert.assertEquals( - "Sequence number should start at 1 for v2 format", 1, table.currentSnapshot().sequenceNumber()); + "Table schema should match the new schema", + TABLE_SCHEMA.asStruct(), + table.schema().asStruct()); + Assert.assertEquals( + "Table should have create partition spec", TABLE_SPEC.fields(), table.spec().fields()); + Assert.assertEquals( + "Table should have create sort order", TABLE_WRITE_ORDER, table.sortOrder()); + Assert.assertEquals( + "Table properties should be a superset of the requested properties", + expectedProps.entrySet(), + Sets.intersection(properties.entrySet(), table.properties().entrySet())); + Assert.assertEquals( + "Sequence number should start at 1 for v2 format", + 1, + table.currentSnapshot().sequenceNumber()); if (!overridesRequestedLocation()) { - Assert.assertEquals("Table location should match requested", "file:/tmp/ns/table", table.location()); + Assert.assertEquals( + "Table location should match requested", "file:/tmp/ns/table", table.location()); } assertFiles(table, FILE_A); assertFilesPartitionSpec(table); @@ -1410,11 +1538,10 @@ public void testConcurrentCreateTransaction() { Transaction create = catalog.buildTable(TABLE, SCHEMA).createTransaction(); - Assert.assertFalse("Table should not exist after createTransaction", catalog.tableExists(TABLE)); + Assert.assertFalse( + "Table should not exist after createTransaction", catalog.tableExists(TABLE)); - create.newFastAppend() - .appendFile(FILE_A) - .commit(); + create.newFastAppend().appendFile(FILE_A).commit(); Assert.assertFalse("Table should not exist after append commit", catalog.tableExists(TABLE)); @@ -1422,14 +1549,21 @@ public void testConcurrentCreateTransaction() { Assertions.setMaxStackTraceElementsDisplayed(Integer.MAX_VALUE); String expectedMessage = - supportsServerSideRetry() ? "Requirement failed: table already exists" : "Table already exists"; - AssertHelpers.assertThrows("Should fail because table was created concurrently", - AlreadyExistsException.class, expectedMessage, create::commitTransaction); + supportsServerSideRetry() + ? 
"Requirement failed: table already exists" + : "Table already exists"; + AssertHelpers.assertThrows( + "Should fail because table was created concurrently", + AlreadyExistsException.class, + expectedMessage, + create::commitTransaction); // validate the concurrently created table is unmodified Table table = catalog.loadTable(TABLE); - Assert.assertEquals("Table schema should match concurrent create", - OTHER_SCHEMA.asStruct(), table.schema().asStruct()); + Assert.assertEquals( + "Table schema should match concurrent create", + OTHER_SCHEMA.asStruct(), + table.schema().asStruct()); assertNoFiles(table); } @@ -1439,11 +1573,10 @@ public void testCreateOrReplaceTransactionCreate() { Transaction create = catalog.buildTable(TABLE, SCHEMA).createOrReplaceTransaction(); - Assert.assertFalse("Table should not exist after createTransaction", catalog.tableExists(TABLE)); + Assert.assertFalse( + "Table should not exist after createTransaction", catalog.tableExists(TABLE)); - create.newFastAppend() - .appendFile(FILE_A) - .commit(); + create.newFastAppend().appendFile(FILE_A).commit(); Assert.assertFalse("Table should not exist after append commit", catalog.tableExists(TABLE)); @@ -1459,19 +1592,21 @@ public void testCreateOrReplaceTransactionCreate() { public void testCompleteCreateOrReplaceTransactionCreate() { C catalog = catalog(); - Map properties = ImmutableMap.of("user", "someone", "created-at", "2022-02-25T00:38:19"); - Transaction createOrReplace = catalog.buildTable(TABLE, SCHEMA) - .withLocation("file:/tmp/ns/table") - .withPartitionSpec(SPEC) - .withSortOrder(WRITE_ORDER) - .withProperties(properties) - .createOrReplaceTransaction(); + Map properties = + ImmutableMap.of("user", "someone", "created-at", "2022-02-25T00:38:19"); + Transaction createOrReplace = + catalog + .buildTable(TABLE, SCHEMA) + .withLocation("file:/tmp/ns/table") + .withPartitionSpec(SPEC) + .withSortOrder(WRITE_ORDER) + .withProperties(properties) + .createOrReplaceTransaction(); - Assert.assertFalse("Table should not exist after createTransaction", catalog.tableExists(TABLE)); + Assert.assertFalse( + "Table should not exist after createTransaction", catalog.tableExists(TABLE)); - createOrReplace.newFastAppend() - .appendFile(FILE_A) - .commit(); + createOrReplace.newFastAppend().appendFile(FILE_A).commit(); Assert.assertFalse("Table should not exist after append commit", catalog.tableExists(TABLE)); @@ -1480,15 +1615,21 @@ public void testCompleteCreateOrReplaceTransactionCreate() { Assert.assertTrue("Table should exist after append commit", catalog.tableExists(TABLE)); Table table = catalog.loadTable(TABLE); - Assert.assertEquals("Table schema should match the new schema", - TABLE_SCHEMA.asStruct(), table.schema().asStruct()); - Assert.assertEquals("Table should have create partition spec", TABLE_SPEC.fields(), table.spec().fields()); - Assert.assertEquals("Table should have create sort order", TABLE_WRITE_ORDER, table.sortOrder()); - Assert.assertEquals("Table properties should be a superset of the requested properties", + Assert.assertEquals( + "Table schema should match the new schema", + TABLE_SCHEMA.asStruct(), + table.schema().asStruct()); + Assert.assertEquals( + "Table should have create partition spec", TABLE_SPEC.fields(), table.spec().fields()); + Assert.assertEquals( + "Table should have create sort order", TABLE_WRITE_ORDER, table.sortOrder()); + Assert.assertEquals( + "Table properties should be a superset of the requested properties", properties.entrySet(), Sets.intersection(properties.entrySet(), 
table.properties().entrySet())); if (!overridesRequestedLocation()) { - Assert.assertEquals("Table location should match requested", "file:/tmp/ns/table", table.location()); + Assert.assertEquals( + "Table location should match requested", "file:/tmp/ns/table", table.location()); } assertFiles(table, FILE_A); assertFilesPartitionSpec(table); @@ -1505,16 +1646,17 @@ public void testCreateOrReplaceReplaceTransactionReplace() { Transaction createOrReplace = catalog.buildTable(TABLE, SCHEMA).createOrReplaceTransaction(); - Assert.assertTrue("Table should still exist after replaceTransaction", catalog.tableExists(TABLE)); + Assert.assertTrue( + "Table should still exist after replaceTransaction", catalog.tableExists(TABLE)); - createOrReplace.newFastAppend() - .appendFile(FILE_A) - .commit(); + createOrReplace.newFastAppend().appendFile(FILE_A).commit(); // validate table has not changed Table table = catalog.loadTable(TABLE); - Assert.assertEquals("Table schema should match concurrent create", - OTHER_SCHEMA.asStruct(), table.schema().asStruct()); + Assert.assertEquals( + "Table schema should match concurrent create", + OTHER_SCHEMA.asStruct(), + table.schema().asStruct()); assertUUIDsMatch(original, table); assertNoFiles(table); @@ -1526,8 +1668,10 @@ public void testCreateOrReplaceReplaceTransactionReplace() { Table loaded = catalog.loadTable(TABLE); - Assert.assertEquals("Table schema should match the new schema", - REPLACE_SCHEMA.asStruct(), loaded.schema().asStruct()); + Assert.assertEquals( + "Table schema should match the new schema", + REPLACE_SCHEMA.asStruct(), + loaded.schema().asStruct()); assertUUIDsMatch(original, loaded); assertFiles(loaded, FILE_A); assertPreviousMetadataFileCount(loaded, 1); @@ -1541,29 +1685,32 @@ public void testCompleteCreateOrReplaceTransactionReplace() { Assert.assertTrue("Table should exist before replaceTransaction", catalog.tableExists(TABLE)); - Map properties = ImmutableMap.of("user", "someone", "created-at", "2022-02-25T00:38:19"); - Transaction createOrReplace = catalog.buildTable(TABLE, SCHEMA) - .withLocation("file:/tmp/ns/table") - .withPartitionSpec(SPEC) - .withSortOrder(WRITE_ORDER) - .withProperties(properties) - .createOrReplaceTransaction(); + Map properties = + ImmutableMap.of("user", "someone", "created-at", "2022-02-25T00:38:19"); + Transaction createOrReplace = + catalog + .buildTable(TABLE, SCHEMA) + .withLocation("file:/tmp/ns/table") + .withPartitionSpec(SPEC) + .withSortOrder(WRITE_ORDER) + .withProperties(properties) + .createOrReplaceTransaction(); - Assert.assertTrue("Table should still exist after replaceTransaction", catalog.tableExists(TABLE)); + Assert.assertTrue( + "Table should still exist after replaceTransaction", catalog.tableExists(TABLE)); - createOrReplace.newFastAppend() - .appendFile(FILE_A) - .commit(); + createOrReplace.newFastAppend().appendFile(FILE_A).commit(); // validate table has not changed Table table = catalog.loadTable(TABLE); - Assert.assertEquals("Table schema should match concurrent create", - OTHER_SCHEMA.asStruct(), table.schema().asStruct()); + Assert.assertEquals( + "Table schema should match concurrent create", + OTHER_SCHEMA.asStruct(), + table.schema().asStruct()); Assert.assertTrue("Table should be unpartitioned", table.spec().isUnpartitioned()); Assert.assertTrue("Table should be unsorted", table.sortOrder().isUnsorted()); - Assert.assertNotEquals("Created at should not match", - table.properties().get("created-at"), - "2022-02-25T00:38:19"); + Assert.assertNotEquals( + "Created at should not 
match", table.properties().get("created-at"), "2022-02-25T00:38:19"); assertUUIDsMatch(original, table); assertNoFiles(table); @@ -1575,15 +1722,20 @@ public void testCompleteCreateOrReplaceTransactionReplace() { Table loaded = catalog.loadTable(TABLE); - Assert.assertEquals("Table schema should match the new schema", - REPLACE_SCHEMA.asStruct(), loaded.schema().asStruct()); + Assert.assertEquals( + "Table schema should match the new schema", + REPLACE_SCHEMA.asStruct(), + loaded.schema().asStruct()); Assert.assertEquals("Table should have replace partition spec", REPLACE_SPEC, loaded.spec()); - Assert.assertEquals("Table should have replace sort order", REPLACE_WRITE_ORDER, loaded.sortOrder()); - Assert.assertEquals("Table properties should be a superset of the requested properties", + Assert.assertEquals( + "Table should have replace sort order", REPLACE_WRITE_ORDER, loaded.sortOrder()); + Assert.assertEquals( + "Table properties should be a superset of the requested properties", properties.entrySet(), Sets.intersection(properties.entrySet(), loaded.properties().entrySet())); if (!overridesRequestedLocation()) { - Assert.assertEquals("Table location should be replaced", "file:/tmp/ns/table", table.location()); + Assert.assertEquals( + "Table location should be replaced", "file:/tmp/ns/table", table.location()); } assertUUIDsMatch(original, loaded); assertFiles(loaded, FILE_A); @@ -1592,31 +1744,39 @@ public void testCompleteCreateOrReplaceTransactionReplace() { @Test public void testCreateOrReplaceTransactionConcurrentCreate() { - Assume.assumeTrue("Conversion to replace transaction is not supported by REST catalog", supportsServerSideRetry()); + Assume.assumeTrue( + "Conversion to replace transaction is not supported by REST catalog", + supportsServerSideRetry()); C catalog = catalog(); Transaction createOrReplace = catalog.buildTable(TABLE, SCHEMA).createOrReplaceTransaction(); - Assert.assertFalse("Table should not exist after createTransaction", catalog.tableExists(TABLE)); + Assert.assertFalse( + "Table should not exist after createTransaction", catalog.tableExists(TABLE)); - createOrReplace.newFastAppend() - .appendFile(FILE_A) - .commit(); + createOrReplace.newFastAppend().appendFile(FILE_A).commit(); Assert.assertFalse("Table should not exist after append commit", catalog.tableExists(TABLE)); catalog.buildTable(TABLE, OTHER_SCHEMA).create(); String expectedMessage = - supportsServerSideRetry() ? "Requirement failed: table already exists" : "Table already exists"; - AssertHelpers.assertThrows("Should fail because table was created concurrently", - AlreadyExistsException.class, expectedMessage, createOrReplace::commitTransaction); + supportsServerSideRetry() + ? 
"Requirement failed: table already exists" + : "Table already exists"; + AssertHelpers.assertThrows( + "Should fail because table was created concurrently", + AlreadyExistsException.class, + expectedMessage, + createOrReplace::commitTransaction); // validate the concurrently created table is unmodified Table table = catalog.loadTable(TABLE); - Assert.assertEquals("Table schema should match concurrent create", - OTHER_SCHEMA.asStruct(), table.schema().asStruct()); + Assert.assertEquals( + "Table schema should match concurrent create", + OTHER_SCHEMA.asStruct(), + table.schema().asStruct()); assertNoFiles(table); } @@ -1630,16 +1790,17 @@ public void testReplaceTransaction() { Transaction replace = catalog.buildTable(TABLE, SCHEMA).replaceTransaction(); - Assert.assertTrue("Table should still exist after replaceTransaction", catalog.tableExists(TABLE)); + Assert.assertTrue( + "Table should still exist after replaceTransaction", catalog.tableExists(TABLE)); - replace.newFastAppend() - .appendFile(FILE_A) - .commit(); + replace.newFastAppend().appendFile(FILE_A).commit(); // validate table has not changed Table table = catalog.loadTable(TABLE); - Assert.assertEquals("Table schema should match concurrent create", - OTHER_SCHEMA.asStruct(), table.schema().asStruct()); + Assert.assertEquals( + "Table schema should match concurrent create", + OTHER_SCHEMA.asStruct(), + table.schema().asStruct()); assertUUIDsMatch(original, table); assertNoFiles(table); @@ -1651,8 +1812,10 @@ public void testReplaceTransaction() { Table loaded = catalog.loadTable(TABLE); - Assert.assertEquals("Table schema should match the new schema", - REPLACE_SCHEMA.asStruct(), loaded.schema().asStruct()); + Assert.assertEquals( + "Table schema should match the new schema", + REPLACE_SCHEMA.asStruct(), + loaded.schema().asStruct()); assertUUIDsMatch(original, loaded); assertFiles(loaded, FILE_A); assertPreviousMetadataFileCount(loaded, 1); @@ -1666,29 +1829,32 @@ public void testCompleteReplaceTransaction() { Assert.assertTrue("Table should exist before replaceTransaction", catalog.tableExists(TABLE)); - Map properties = ImmutableMap.of("user", "someone", "created-at", "2022-02-25T00:38:19"); - Transaction replace = catalog.buildTable(TABLE, SCHEMA) - .withLocation("file:/tmp/ns/table") - .withPartitionSpec(SPEC) - .withSortOrder(WRITE_ORDER) - .withProperties(properties) - .replaceTransaction(); + Map properties = + ImmutableMap.of("user", "someone", "created-at", "2022-02-25T00:38:19"); + Transaction replace = + catalog + .buildTable(TABLE, SCHEMA) + .withLocation("file:/tmp/ns/table") + .withPartitionSpec(SPEC) + .withSortOrder(WRITE_ORDER) + .withProperties(properties) + .replaceTransaction(); - Assert.assertTrue("Table should still exist after replaceTransaction", catalog.tableExists(TABLE)); + Assert.assertTrue( + "Table should still exist after replaceTransaction", catalog.tableExists(TABLE)); - replace.newFastAppend() - .appendFile(FILE_A) - .commit(); + replace.newFastAppend().appendFile(FILE_A).commit(); // validate table has not changed Table table = catalog.loadTable(TABLE); - Assert.assertEquals("Table schema should match concurrent create", - OTHER_SCHEMA.asStruct(), table.schema().asStruct()); + Assert.assertEquals( + "Table schema should match concurrent create", + OTHER_SCHEMA.asStruct(), + table.schema().asStruct()); Assert.assertTrue("Table should be unpartitioned", table.spec().isUnpartitioned()); Assert.assertTrue("Table should be unsorted", table.sortOrder().isUnsorted()); - Assert.assertNotEquals("Created 
at should not match", - table.properties().get("created-at"), - "2022-02-25T00:38:19"); + Assert.assertNotEquals( + "Created at should not match", table.properties().get("created-at"), "2022-02-25T00:38:19"); assertUUIDsMatch(original, table); assertNoFiles(table); @@ -1700,15 +1866,20 @@ public void testCompleteReplaceTransaction() { Table loaded = catalog.loadTable(TABLE); - Assert.assertEquals("Table schema should match the new schema", - REPLACE_SCHEMA.asStruct(), loaded.schema().asStruct()); + Assert.assertEquals( + "Table schema should match the new schema", + REPLACE_SCHEMA.asStruct(), + loaded.schema().asStruct()); Assert.assertEquals("Table should have replace partition spec", REPLACE_SPEC, loaded.spec()); - Assert.assertEquals("Table should have replace sort order", REPLACE_WRITE_ORDER, loaded.sortOrder()); - Assert.assertEquals("Table properties should be a superset of the requested properties", + Assert.assertEquals( + "Table should have replace sort order", REPLACE_WRITE_ORDER, loaded.sortOrder()); + Assert.assertEquals( + "Table properties should be a superset of the requested properties", properties.entrySet(), Sets.intersection(properties.entrySet(), loaded.properties().entrySet())); if (!overridesRequestedLocation()) { - Assert.assertEquals("Table location should be replaced", "file:/tmp/ns/table", table.location()); + Assert.assertEquals( + "Table location should be replaced", "file:/tmp/ns/table", table.location()); } assertUUIDsMatch(original, loaded); assertFiles(loaded, FILE_A); @@ -1719,8 +1890,10 @@ public void testCompleteReplaceTransaction() { public void testReplaceTransactionRequiresTableExists() { C catalog = catalog(); - AssertHelpers.assertThrows("Should fail to create replace transaction with a missing table", - NoSuchTableException.class, "Table does not exist", + AssertHelpers.assertThrows( + "Should fail to create replace transaction with a missing table", + NoSuchTableException.class, + "Table does not exist", () -> catalog.buildTable(TABLE, SCHEMA).replaceTransaction()); } @@ -1729,46 +1902,38 @@ public void testConcurrentReplaceTransactions() { C catalog = catalog(); Transaction transaction = catalog.buildTable(TABLE, SCHEMA).createTransaction(); - transaction.newFastAppend() - .appendFile(FILE_A) - .commit(); + transaction.newFastAppend().appendFile(FILE_A).commit(); transaction.commitTransaction(); Table original = catalog.loadTable(TABLE); assertFiles(original, FILE_A); - Transaction secondReplace = catalog.buildTable(TABLE, SCHEMA) - .replaceTransaction(); - secondReplace.newFastAppend() - .appendFile(FILE_C) - .commit(); + Transaction secondReplace = catalog.buildTable(TABLE, SCHEMA).replaceTransaction(); + secondReplace.newFastAppend().appendFile(FILE_C).commit(); - Transaction firstReplace = catalog.buildTable(TABLE, SCHEMA) - .replaceTransaction(); - firstReplace.newFastAppend() - .appendFile(FILE_B) - .commit(); + Transaction firstReplace = catalog.buildTable(TABLE, SCHEMA).replaceTransaction(); + firstReplace.newFastAppend().appendFile(FILE_B).commit(); firstReplace.commitTransaction(); Table afterFirstReplace = catalog.loadTable(TABLE); - Assert.assertEquals("Table schema should match the original schema", - original.schema().asStruct(), afterFirstReplace.schema().asStruct()); - Assert.assertTrue("Table should be unpartitioned", - afterFirstReplace.spec().isUnpartitioned()); - Assert.assertTrue("Table should be unsorted", - afterFirstReplace.sortOrder().isUnsorted()); + Assert.assertEquals( + "Table schema should match the original schema", 
+ original.schema().asStruct(), + afterFirstReplace.schema().asStruct()); + Assert.assertTrue("Table should be unpartitioned", afterFirstReplace.spec().isUnpartitioned()); + Assert.assertTrue("Table should be unsorted", afterFirstReplace.sortOrder().isUnsorted()); assertUUIDsMatch(original, afterFirstReplace); assertFiles(afterFirstReplace, FILE_B); secondReplace.commitTransaction(); Table afterSecondReplace = catalog.loadTable(TABLE); - Assert.assertEquals("Table schema should match the original schema", - original.schema().asStruct(), afterSecondReplace.schema().asStruct()); - Assert.assertTrue("Table should be unpartitioned", - afterSecondReplace.spec().isUnpartitioned()); - Assert.assertTrue("Table should be unsorted", - afterSecondReplace.sortOrder().isUnsorted()); + Assert.assertEquals( + "Table schema should match the original schema", + original.schema().asStruct(), + afterSecondReplace.schema().asStruct()); + Assert.assertTrue("Table should be unpartitioned", afterSecondReplace.spec().isUnpartitioned()); + Assert.assertTrue("Table should be unsorted", afterSecondReplace.sortOrder().isUnsorted()); assertUUIDsMatch(original, afterSecondReplace); assertFiles(afterSecondReplace, FILE_C); } @@ -1778,38 +1943,34 @@ public void testConcurrentReplaceTransactionSchema() { C catalog = catalog(); Transaction transaction = catalog.buildTable(TABLE, OTHER_SCHEMA).createTransaction(); - transaction.newFastAppend() - .appendFile(FILE_A) - .commit(); + transaction.newFastAppend().appendFile(FILE_A).commit(); transaction.commitTransaction(); Table original = catalog.loadTable(TABLE); assertFiles(original, FILE_A); - Transaction secondReplace = catalog.buildTable(TABLE, OTHER_SCHEMA) - .replaceTransaction(); - secondReplace.newFastAppend() - .appendFile(FILE_C) - .commit(); + Transaction secondReplace = catalog.buildTable(TABLE, OTHER_SCHEMA).replaceTransaction(); + secondReplace.newFastAppend().appendFile(FILE_C).commit(); - Transaction firstReplace = catalog.buildTable(TABLE, SCHEMA) - .replaceTransaction(); - firstReplace.newFastAppend() - .appendFile(FILE_B) - .commit(); + Transaction firstReplace = catalog.buildTable(TABLE, SCHEMA).replaceTransaction(); + firstReplace.newFastAppend().appendFile(FILE_B).commit(); firstReplace.commitTransaction(); Table afterFirstReplace = catalog.loadTable(TABLE); - Assert.assertEquals("Table schema should match the new schema", - REPLACE_SCHEMA.asStruct(), afterFirstReplace.schema().asStruct()); + Assert.assertEquals( + "Table schema should match the new schema", + REPLACE_SCHEMA.asStruct(), + afterFirstReplace.schema().asStruct()); assertUUIDsMatch(original, afterFirstReplace); assertFiles(afterFirstReplace, FILE_B); secondReplace.commitTransaction(); Table afterSecondReplace = catalog.loadTable(TABLE); - Assert.assertEquals("Table schema should match the original schema", - original.schema().asStruct(), afterSecondReplace.schema().asStruct()); + Assert.assertEquals( + "Table schema should match the original schema", + original.schema().asStruct(), + afterSecondReplace.schema().asStruct()); assertUUIDsMatch(original, afterSecondReplace); assertFiles(afterSecondReplace, FILE_C); } @@ -1819,38 +1980,34 @@ public void testConcurrentReplaceTransactionSchema2() { C catalog = catalog(); Transaction transaction = catalog.buildTable(TABLE, OTHER_SCHEMA).createTransaction(); - transaction.newFastAppend() - .appendFile(FILE_A) - .commit(); + transaction.newFastAppend().appendFile(FILE_A).commit(); transaction.commitTransaction(); Table original = 
catalog.loadTable(TABLE); assertFiles(original, FILE_A); - Transaction secondReplace = catalog.buildTable(TABLE, SCHEMA) - .replaceTransaction(); - secondReplace.newFastAppend() - .appendFile(FILE_C) - .commit(); + Transaction secondReplace = catalog.buildTable(TABLE, SCHEMA).replaceTransaction(); + secondReplace.newFastAppend().appendFile(FILE_C).commit(); - Transaction firstReplace = catalog.buildTable(TABLE, OTHER_SCHEMA) - .replaceTransaction(); - firstReplace.newFastAppend() - .appendFile(FILE_B) - .commit(); + Transaction firstReplace = catalog.buildTable(TABLE, OTHER_SCHEMA).replaceTransaction(); + firstReplace.newFastAppend().appendFile(FILE_B).commit(); firstReplace.commitTransaction(); Table afterFirstReplace = catalog.loadTable(TABLE); - Assert.assertEquals("Table schema should match the original schema", - original.schema().asStruct(), afterFirstReplace.schema().asStruct()); + Assert.assertEquals( + "Table schema should match the original schema", + original.schema().asStruct(), + afterFirstReplace.schema().asStruct()); assertUUIDsMatch(original, afterFirstReplace); assertFiles(afterFirstReplace, FILE_B); secondReplace.commitTransaction(); Table afterSecondReplace = catalog.loadTable(TABLE); - Assert.assertEquals("Table schema should match the new schema", - REPLACE_SCHEMA.asStruct(), afterSecondReplace.schema().asStruct()); + Assert.assertEquals( + "Table schema should match the new schema", + REPLACE_SCHEMA.asStruct(), + afterSecondReplace.schema().asStruct()); assertUUIDsMatch(original, afterSecondReplace); assertFiles(afterSecondReplace, FILE_C); } @@ -1862,36 +2019,34 @@ public void testConcurrentReplaceTransactionSchemaConflict() { C catalog = catalog(); Transaction transaction = catalog.buildTable(TABLE, OTHER_SCHEMA).createTransaction(); - transaction.newFastAppend() - .appendFile(FILE_A) - .commit(); + transaction.newFastAppend().appendFile(FILE_A).commit(); transaction.commitTransaction(); Table original = catalog.loadTable(TABLE); assertFiles(original, FILE_A); - Transaction secondReplace = catalog.buildTable(TABLE, SCHEMA) - .replaceTransaction(); - secondReplace.newFastAppend() - .appendFile(FILE_C) - .commit(); + Transaction secondReplace = catalog.buildTable(TABLE, SCHEMA).replaceTransaction(); + secondReplace.newFastAppend().appendFile(FILE_C).commit(); - Transaction firstReplace = catalog.buildTable(TABLE, SCHEMA) - .replaceTransaction(); - firstReplace.newFastAppend() - .appendFile(FILE_B) - .commit(); + Transaction firstReplace = catalog.buildTable(TABLE, SCHEMA).replaceTransaction(); + firstReplace.newFastAppend().appendFile(FILE_B).commit(); firstReplace.commitTransaction(); Table afterFirstReplace = catalog.loadTable(TABLE); - Assert.assertEquals("Table schema should match the original schema", - REPLACE_SCHEMA.asStruct(), afterFirstReplace.schema().asStruct()); + Assert.assertEquals( + "Table schema should match the original schema", + REPLACE_SCHEMA.asStruct(), + afterFirstReplace.schema().asStruct()); assertUUIDsMatch(original, afterFirstReplace); assertFiles(afterFirstReplace, FILE_B); - // even though the new schema is identical, the assertion that the last assigned id has not changed will fail - AssertHelpers.assertThrows("Should reject concurrent schema update", - CommitFailedException.class, "last assigned field id changed", secondReplace::commitTransaction); + // even though the new schema is identical, the assertion that the last assigned id has not + // changed will fail + AssertHelpers.assertThrows( + "Should reject concurrent schema update", + 
CommitFailedException.class, + "last assigned field id changed", + secondReplace::commitTransaction); } @Test @@ -1899,39 +2054,32 @@ public void testConcurrentReplaceTransactionPartitionSpec() { C catalog = catalog(); Transaction transaction = catalog.buildTable(TABLE, SCHEMA).createTransaction(); - transaction.newFastAppend() - .appendFile(FILE_A) - .commit(); + transaction.newFastAppend().appendFile(FILE_A).commit(); transaction.commitTransaction(); Table original = catalog.loadTable(TABLE); assertFiles(original, FILE_A); - Transaction secondReplace = catalog.buildTable(TABLE, SCHEMA) - .replaceTransaction(); - secondReplace.newFastAppend() - .appendFile(FILE_C) - .commit(); + Transaction secondReplace = catalog.buildTable(TABLE, SCHEMA).replaceTransaction(); + secondReplace.newFastAppend().appendFile(FILE_C).commit(); - Transaction firstReplace = catalog.buildTable(TABLE, SCHEMA) - .withPartitionSpec(SPEC) - .replaceTransaction(); - firstReplace.newFastAppend() - .appendFile(FILE_B) - .commit(); + Transaction firstReplace = + catalog.buildTable(TABLE, SCHEMA).withPartitionSpec(SPEC).replaceTransaction(); + firstReplace.newFastAppend().appendFile(FILE_B).commit(); firstReplace.commitTransaction(); Table afterFirstReplace = catalog.loadTable(TABLE); - Assert.assertEquals("Table spec should match the new spec", - TABLE_SPEC.fields(), afterFirstReplace.spec().fields()); + Assert.assertEquals( + "Table spec should match the new spec", + TABLE_SPEC.fields(), + afterFirstReplace.spec().fields()); assertUUIDsMatch(original, afterFirstReplace); assertFiles(afterFirstReplace, FILE_B); secondReplace.commitTransaction(); Table afterSecondReplace = catalog.loadTable(TABLE); - Assert.assertTrue("Table should be unpartitioned", - afterSecondReplace.spec().isUnpartitioned()); + Assert.assertTrue("Table should be unpartitioned", afterSecondReplace.spec().isUnpartitioned()); assertUUIDsMatch(original, afterSecondReplace); assertFiles(afterSecondReplace, FILE_C); } @@ -1941,39 +2089,32 @@ public void testConcurrentReplaceTransactionPartitionSpec2() { C catalog = catalog(); Transaction transaction = catalog.buildTable(TABLE, SCHEMA).createTransaction(); - transaction.newFastAppend() - .appendFile(FILE_A) - .commit(); + transaction.newFastAppend().appendFile(FILE_A).commit(); transaction.commitTransaction(); Table original = catalog.loadTable(TABLE); assertFiles(original, FILE_A); - Transaction secondReplace = catalog.buildTable(TABLE, SCHEMA) - .withPartitionSpec(SPEC) - .replaceTransaction(); - secondReplace.newFastAppend() - .appendFile(FILE_C) - .commit(); + Transaction secondReplace = + catalog.buildTable(TABLE, SCHEMA).withPartitionSpec(SPEC).replaceTransaction(); + secondReplace.newFastAppend().appendFile(FILE_C).commit(); - Transaction firstReplace = catalog.buildTable(TABLE, SCHEMA) - .replaceTransaction(); - firstReplace.newFastAppend() - .appendFile(FILE_B) - .commit(); + Transaction firstReplace = catalog.buildTable(TABLE, SCHEMA).replaceTransaction(); + firstReplace.newFastAppend().appendFile(FILE_B).commit(); firstReplace.commitTransaction(); Table afterFirstReplace = catalog.loadTable(TABLE); - Assert.assertTrue("Table should be unpartitioned", - afterFirstReplace.spec().isUnpartitioned()); + Assert.assertTrue("Table should be unpartitioned", afterFirstReplace.spec().isUnpartitioned()); assertUUIDsMatch(original, afterFirstReplace); assertFiles(afterFirstReplace, FILE_B); secondReplace.commitTransaction(); Table afterSecondReplace = catalog.loadTable(TABLE); - Assert.assertEquals("Table spec 
should match the new spec", - TABLE_SPEC.fields(), afterSecondReplace.spec().fields()); + Assert.assertEquals( + "Table spec should match the new spec", + TABLE_SPEC.fields(), + afterSecondReplace.spec().fields()); assertUUIDsMatch(original, afterSecondReplace); assertFiles(afterSecondReplace, FILE_C); } @@ -1984,38 +2125,36 @@ public void testConcurrentReplaceTransactionPartitionSpecConflict() { C catalog = catalog(); Transaction transaction = catalog.buildTable(TABLE, SCHEMA).createTransaction(); - transaction.newFastAppend() - .appendFile(FILE_A) - .commit(); + transaction.newFastAppend().appendFile(FILE_A).commit(); transaction.commitTransaction(); Table original = catalog.loadTable(TABLE); assertFiles(original, FILE_A); - Transaction secondReplace = catalog.buildTable(TABLE, SCHEMA) - .withPartitionSpec(SPEC) - .replaceTransaction(); - secondReplace.newFastAppend() - .appendFile(FILE_C) - .commit(); + Transaction secondReplace = + catalog.buildTable(TABLE, SCHEMA).withPartitionSpec(SPEC).replaceTransaction(); + secondReplace.newFastAppend().appendFile(FILE_C).commit(); - Transaction firstReplace = catalog.buildTable(TABLE, SCHEMA) - .withPartitionSpec(SPEC) - .replaceTransaction(); - firstReplace.newFastAppend() - .appendFile(FILE_B) - .commit(); + Transaction firstReplace = + catalog.buildTable(TABLE, SCHEMA).withPartitionSpec(SPEC).replaceTransaction(); + firstReplace.newFastAppend().appendFile(FILE_B).commit(); firstReplace.commitTransaction(); Table afterFirstReplace = catalog.loadTable(TABLE); - Assert.assertEquals("Table spec should match the new spec", - TABLE_SPEC.fields(), afterFirstReplace.spec().fields()); + Assert.assertEquals( + "Table spec should match the new spec", + TABLE_SPEC.fields(), + afterFirstReplace.spec().fields()); assertUUIDsMatch(original, afterFirstReplace); assertFiles(afterFirstReplace, FILE_B); - // even though the new spec is identical, the assertion that the last assigned id has not changed will fail - AssertHelpers.assertThrows("Should reject concurrent spec update", - CommitFailedException.class, "last assigned partition id changed", secondReplace::commitTransaction); + // even though the new spec is identical, the assertion that the last assigned id has not + // changed will fail + AssertHelpers.assertThrows( + "Should reject concurrent spec update", + CommitFailedException.class, + "last assigned partition id changed", + secondReplace::commitTransaction); } @Test @@ -2023,39 +2162,30 @@ public void testConcurrentReplaceTransactionSortOrder() { C catalog = catalog(); Transaction transaction = catalog.buildTable(TABLE, SCHEMA).createTransaction(); - transaction.newFastAppend() - .appendFile(FILE_A) - .commit(); + transaction.newFastAppend().appendFile(FILE_A).commit(); transaction.commitTransaction(); Table original = catalog.loadTable(TABLE); assertFiles(original, FILE_A); - Transaction secondReplace = catalog.buildTable(TABLE, SCHEMA) - .replaceTransaction(); - secondReplace.newFastAppend() - .appendFile(FILE_C) - .commit(); + Transaction secondReplace = catalog.buildTable(TABLE, SCHEMA).replaceTransaction(); + secondReplace.newFastAppend().appendFile(FILE_C).commit(); - Transaction firstReplace = catalog.buildTable(TABLE, SCHEMA) - .withSortOrder(WRITE_ORDER) - .replaceTransaction(); - firstReplace.newFastAppend() - .appendFile(FILE_B) - .commit(); + Transaction firstReplace = + catalog.buildTable(TABLE, SCHEMA).withSortOrder(WRITE_ORDER).replaceTransaction(); + firstReplace.newFastAppend().appendFile(FILE_B).commit(); 
firstReplace.commitTransaction(); Table afterFirstReplace = catalog.loadTable(TABLE); - Assert.assertEquals("Table order should match the new order", - TABLE_WRITE_ORDER, afterFirstReplace.sortOrder()); + Assert.assertEquals( + "Table order should match the new order", TABLE_WRITE_ORDER, afterFirstReplace.sortOrder()); assertUUIDsMatch(original, afterFirstReplace); assertFiles(afterFirstReplace, FILE_B); secondReplace.commitTransaction(); Table afterSecondReplace = catalog.loadTable(TABLE); - Assert.assertTrue("Table should be unsorted", - afterSecondReplace.sortOrder().isUnsorted()); + Assert.assertTrue("Table should be unsorted", afterSecondReplace.sortOrder().isUnsorted()); assertUUIDsMatch(original, afterSecondReplace); assertFiles(afterSecondReplace, FILE_C); } @@ -2065,43 +2195,37 @@ public void testConcurrentReplaceTransactionSortOrderConflict() { C catalog = catalog(); Transaction transaction = catalog.buildTable(TABLE, SCHEMA).createTransaction(); - transaction.newFastAppend() - .appendFile(FILE_A) - .commit(); + transaction.newFastAppend().appendFile(FILE_A).commit(); transaction.commitTransaction(); Table original = catalog.loadTable(TABLE); assertFiles(original, FILE_A); - Transaction secondReplace = catalog.buildTable(TABLE, SCHEMA) - .withSortOrder(WRITE_ORDER) - .replaceTransaction(); - secondReplace.newFastAppend() - .appendFile(FILE_C) - .commit(); - - Transaction firstReplace = catalog.buildTable(TABLE, SCHEMA) - .withSortOrder(SortOrder.builderFor(SCHEMA) - .desc(Expressions.bucket("id", 16)) - .desc("id") - .build()) - .replaceTransaction(); - firstReplace.newFastAppend() - .appendFile(FILE_B) - .commit(); + Transaction secondReplace = + catalog.buildTable(TABLE, SCHEMA).withSortOrder(WRITE_ORDER).replaceTransaction(); + secondReplace.newFastAppend().appendFile(FILE_C).commit(); + + Transaction firstReplace = + catalog + .buildTable(TABLE, SCHEMA) + .withSortOrder( + SortOrder.builderFor(SCHEMA).desc(Expressions.bucket("id", 16)).desc("id").build()) + .replaceTransaction(); + firstReplace.newFastAppend().appendFile(FILE_B).commit(); firstReplace.commitTransaction(); Table afterFirstReplace = catalog.loadTable(TABLE); - Assert.assertTrue("Table order should be set", - afterFirstReplace.sortOrder().isSorted()); + Assert.assertTrue("Table order should be set", afterFirstReplace.sortOrder().isSorted()); assertUUIDsMatch(original, afterFirstReplace); assertFiles(afterFirstReplace, FILE_B); secondReplace.commitTransaction(); Table afterSecondReplace = catalog.loadTable(TABLE); - Assert.assertEquals("Table order should match the new order", - TABLE_WRITE_ORDER.fields(), afterSecondReplace.sortOrder().fields()); + Assert.assertEquals( + "Table order should match the new order", + TABLE_WRITE_ORDER.fields(), + afterSecondReplace.sortOrder().fields()); assertUUIDsMatch(original, afterSecondReplace); assertFiles(afterSecondReplace, FILE_C); } @@ -2115,15 +2239,18 @@ private static void assertEmpty(String context, Catalog catalog, Namespace ns) { } public void assertUUIDsMatch(Table expected, Table actual) { - Assert.assertEquals("Table UUID should not change", + Assert.assertEquals( + "Table UUID should not change", ((BaseTable) expected).operations().current().uuid(), ((BaseTable) actual).operations().current().uuid()); } public void assertPreviousMetadataFileCount(Table table, int metadataFileCount) { TableOperations ops = ((BaseTable) table).operations(); - Assert.assertEquals("Table should have correct number of previous metadata locations", - metadataFileCount, 
ops.current().previousFiles().size()); + Assert.assertEquals( + "Table should have correct number of previous metadata locations", + metadataFileCount, + ops.current().previousFiles().size()); } public void assertNoFiles(Table table) { @@ -2136,12 +2263,15 @@ public void assertNoFiles(Table table) { public void assertFiles(Table table, DataFile... files) { try (CloseableIterable tasks = table.newScan().planFiles()) { - List paths = Streams.stream(tasks) - .map(FileScanTask::file) - .map(DataFile::path) - .collect(Collectors.toList()); - Assert.assertEquals("Should contain expected number of data files", files.length, paths.size()); - Assert.assertEquals("Should contain correct file paths", + List paths = + Streams.stream(tasks) + .map(FileScanTask::file) + .map(DataFile::path) + .collect(Collectors.toList()); + Assert.assertEquals( + "Should contain expected number of data files", files.length, paths.size()); + Assert.assertEquals( + "Should contain correct file paths", CharSequenceSet.of(Iterables.transform(Arrays.asList(files), DataFile::path)), CharSequenceSet.of(paths)); } catch (IOException e) { diff --git a/core/src/test/java/org/apache/iceberg/catalog/TestTableIdentifierParser.java b/core/src/test/java/org/apache/iceberg/catalog/TestTableIdentifierParser.java index 81831bb8ccd4..235ea1d1c4e5 100644 --- a/core/src/test/java/org/apache/iceberg/catalog/TestTableIdentifierParser.java +++ b/core/src/test/java/org/apache/iceberg/catalog/TestTableIdentifierParser.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.catalog; import org.apache.iceberg.AssertHelpers; @@ -29,74 +28,97 @@ public class TestTableIdentifierParser { public void testTableIdentifierToJson() { String json = "{\"namespace\":[\"accounting\",\"tax\"],\"name\":\"paid\"}"; TableIdentifier identifier = TableIdentifier.of(Namespace.of("accounting", "tax"), "paid"); - Assert.assertEquals("Should be able to serialize a table identifier with both namespace and name", - json, TableIdentifierParser.toJson(identifier)); + Assert.assertEquals( + "Should be able to serialize a table identifier with both namespace and name", + json, + TableIdentifierParser.toJson(identifier)); TableIdentifier identifierWithEmptyNamespace = TableIdentifier.of(Namespace.empty(), "paid"); String jsonWithEmptyNamespace = "{\"namespace\":[],\"name\":\"paid\"}"; - Assert.assertEquals("Should be able to serialize a table identifier that uses the empty namespace", - jsonWithEmptyNamespace, TableIdentifierParser.toJson(identifierWithEmptyNamespace)); + Assert.assertEquals( + "Should be able to serialize a table identifier that uses the empty namespace", + jsonWithEmptyNamespace, + TableIdentifierParser.toJson(identifierWithEmptyNamespace)); } @Test public void testTableIdentifierFromJson() { String json = "{\"namespace\":[\"accounting\",\"tax\"],\"name\":\"paid\"}"; TableIdentifier identifier = TableIdentifier.of(Namespace.of("accounting", "tax"), "paid"); - Assert.assertEquals("Should be able to deserialize a valid table identifier", - identifier, TableIdentifierParser.fromJson(json)); + Assert.assertEquals( + "Should be able to deserialize a valid table identifier", + identifier, + TableIdentifierParser.fromJson(json)); TableIdentifier identifierWithEmptyNamespace = TableIdentifier.of(Namespace.empty(), "paid"); String jsonWithEmptyNamespace = "{\"namespace\":[],\"name\":\"paid\"}"; - Assert.assertEquals("Should be able to deserialize a valid multi-level table identifier", - 
identifierWithEmptyNamespace, TableIdentifierParser.fromJson(jsonWithEmptyNamespace)); + Assert.assertEquals( + "Should be able to deserialize a valid multi-level table identifier", + identifierWithEmptyNamespace, + TableIdentifierParser.fromJson(jsonWithEmptyNamespace)); String identifierMissingNamespace = "{\"name\":\"paid\"}"; - Assert.assertEquals("Should implicitly convert a missing namespace into the the empty namespace when parsing", - identifierWithEmptyNamespace, TableIdentifierParser.fromJson(identifierMissingNamespace)); + Assert.assertEquals( + "Should implicitly convert a missing namespace into the the empty namespace when parsing", + identifierWithEmptyNamespace, + TableIdentifierParser.fromJson(identifierMissingNamespace)); } @Test public void testFailParsingWhenNullOrEmptyJson() { String nullJson = null; - AssertHelpers.assertThrows("TableIdentifierParser should fail to deserialize null JSON string", - IllegalArgumentException.class, "Cannot parse table identifier from invalid JSON: null", - () -> TableIdentifierParser.fromJson(nullJson)); + AssertHelpers.assertThrows( + "TableIdentifierParser should fail to deserialize null JSON string", + IllegalArgumentException.class, + "Cannot parse table identifier from invalid JSON: null", + () -> TableIdentifierParser.fromJson(nullJson)); String emptyString = ""; - AssertHelpers.assertThrows("TableIdentifierParser should fail to deserialize an empty string", - IllegalArgumentException.class, "Cannot parse table identifier from invalid JSON: ''", - () -> TableIdentifierParser.fromJson(emptyString)); + AssertHelpers.assertThrows( + "TableIdentifierParser should fail to deserialize an empty string", + IllegalArgumentException.class, + "Cannot parse table identifier from invalid JSON: ''", + () -> TableIdentifierParser.fromJson(emptyString)); String emptyJson = "{}"; - AssertHelpers.assertThrows("TableIdentifierParser should fail to deserialize an empty JSON string", - IllegalArgumentException.class, "Cannot parse missing string name", - () -> TableIdentifierParser.fromJson(emptyJson)); + AssertHelpers.assertThrows( + "TableIdentifierParser should fail to deserialize an empty JSON string", + IllegalArgumentException.class, + "Cannot parse missing string name", + () -> TableIdentifierParser.fromJson(emptyJson)); String emptyJsonArray = "[]"; - AssertHelpers.assertThrows("TableIdentifierParser should fail to deserialize an empty JSON array", - IllegalArgumentException.class, "Cannot parse missing or non-object table identifier: []", - () -> TableIdentifierParser.fromJson(emptyJsonArray)); + AssertHelpers.assertThrows( + "TableIdentifierParser should fail to deserialize an empty JSON array", + IllegalArgumentException.class, + "Cannot parse missing or non-object table identifier: []", + () -> TableIdentifierParser.fromJson(emptyJsonArray)); } @Test public void testFailParsingWhenMissingRequiredFields() { String identifierMissingName = "{\"namespace\":[\"accounting\",\"tax\"]}"; - AssertHelpers.assertThrows("TableIdentifierParser should fail to deserialize table with missing name", - IllegalArgumentException.class, "Cannot parse missing string name", - () -> TableIdentifierParser.fromJson(identifierMissingName)); + AssertHelpers.assertThrows( + "TableIdentifierParser should fail to deserialize table with missing name", + IllegalArgumentException.class, + "Cannot parse missing string name", + () -> TableIdentifierParser.fromJson(identifierMissingName)); } @Test public void testFailWhenFieldsHaveInvalidValues() { String invalidNamespace = 
"{\"namespace\":\"accounting.tax\",\"name\":\"paid\"}"; - AssertHelpers.assertThrows("TableIdentifierParser should fail to deserialize table with invalid namespace", + AssertHelpers.assertThrows( + "TableIdentifierParser should fail to deserialize table with invalid namespace", IllegalArgumentException.class, "Cannot parse namespace from non-array value: \"accounting.tax\"", - () -> TableIdentifierParser.fromJson(invalidNamespace)); + () -> TableIdentifierParser.fromJson(invalidNamespace)); String invalidName = "{\"namespace\":[\"accounting\",\"tax\"],\"name\":1234}"; - AssertHelpers.assertThrows("TableIdentifierParser should fail to deserialize table with invalid name", - IllegalArgumentException.class, "Cannot parse name to a string value: 1234", - () -> TableIdentifierParser.fromJson(invalidName)); + AssertHelpers.assertThrows( + "TableIdentifierParser should fail to deserialize table with invalid name", + IllegalArgumentException.class, + "Cannot parse name to a string value: 1234", + () -> TableIdentifierParser.fromJson(invalidName)); } } diff --git a/core/src/test/java/org/apache/iceberg/deletes/TestEqualityFilter.java b/core/src/test/java/org/apache/iceberg/deletes/TestEqualityFilter.java index 7e66d6369391..12dd72b7e474 100644 --- a/core/src/test/java/org/apache/iceberg/deletes/TestEqualityFilter.java +++ b/core/src/test/java/org/apache/iceberg/deletes/TestEqualityFilter.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.deletes; import java.util.List; @@ -32,113 +31,119 @@ import org.junit.Test; public class TestEqualityFilter { - private static final Schema ROW_SCHEMA = new Schema( - NestedField.required(1, "id", Types.LongType.get()), - NestedField.required(2, "name", Types.StringType.get()), - NestedField.optional(3, "description", Types.StringType.get())); - - private static final CloseableIterable ROWS = CloseableIterable.withNoopClose(Lists.newArrayList( - Row.of(0L, "a", "panda"), - Row.of(1L, "b", "koala"), - Row.of(2L, "c", new Utf8("kodiak")), - Row.of(4L, new Utf8("d"), "gummy"), - Row.of(5L, "e", "brown"), - Row.of(6L, "f", new Utf8("teddy")), - Row.of(7L, "g", "grizzly"), - Row.of(8L, "h", null) - )); + private static final Schema ROW_SCHEMA = + new Schema( + NestedField.required(1, "id", Types.LongType.get()), + NestedField.required(2, "name", Types.StringType.get()), + NestedField.optional(3, "description", Types.StringType.get())); + + private static final CloseableIterable ROWS = + CloseableIterable.withNoopClose( + Lists.newArrayList( + Row.of(0L, "a", "panda"), + Row.of(1L, "b", "koala"), + Row.of(2L, "c", new Utf8("kodiak")), + Row.of(4L, new Utf8("d"), "gummy"), + Row.of(5L, "e", "brown"), + Row.of(6L, "f", new Utf8("teddy")), + Row.of(7L, "g", "grizzly"), + Row.of(8L, "h", null))); @Test public void testEqualitySetFilterLongColumn() { - CloseableIterable deletes = CloseableIterable.withNoopClose(Lists.newArrayList( - Row.of(4L), - Row.of(3L), - Row.of(6L) - )); - - List expected = Lists.newArrayList( - Row.of(0L, "a", "panda"), - Row.of(1L, "b", "koala"), - Row.of(2L, "c", new Utf8("kodiak")), - Row.of(5L, "e", "brown"), - Row.of(7L, "g", "grizzly"), - Row.of(8L, "h", null) - ); - - Assert.assertEquals("Filter should produce expected rows", + CloseableIterable deletes = + CloseableIterable.withNoopClose(Lists.newArrayList(Row.of(4L), Row.of(3L), Row.of(6L))); + + List expected = + Lists.newArrayList( + Row.of(0L, "a", "panda"), + Row.of(1L, "b", "koala"), + Row.of(2L, "c", new 
Utf8("kodiak")), + Row.of(5L, "e", "brown"), + Row.of(7L, "g", "grizzly"), + Row.of(8L, "h", null)); + + Assert.assertEquals( + "Filter should produce expected rows", expected, - Lists.newArrayList(Deletes.filter(ROWS, - row -> Row.of(row.get(0, Long.class)), - Deletes.toEqualitySet(deletes, ROW_SCHEMA.select("id").asStruct())))); + Lists.newArrayList( + Deletes.filter( + ROWS, + row -> Row.of(row.get(0, Long.class)), + Deletes.toEqualitySet(deletes, ROW_SCHEMA.select("id").asStruct())))); } @Test public void testEqualitySetFilterStringColumn() { - CloseableIterable deletes = CloseableIterable.withNoopClose(Lists.newArrayList( - Row.of("a"), - Row.of("d"), - Row.of("h") - )); - - List expected = Lists.newArrayList( - Row.of(1L, "b", "koala"), - Row.of(2L, "c", new Utf8("kodiak")), - Row.of(5L, "e", "brown"), - Row.of(6L, "f", new Utf8("teddy")), - Row.of(7L, "g", "grizzly") - ); - - Assert.assertEquals("Filter should produce expected rows", + CloseableIterable deletes = + CloseableIterable.withNoopClose(Lists.newArrayList(Row.of("a"), Row.of("d"), Row.of("h"))); + + List expected = + Lists.newArrayList( + Row.of(1L, "b", "koala"), + Row.of(2L, "c", new Utf8("kodiak")), + Row.of(5L, "e", "brown"), + Row.of(6L, "f", new Utf8("teddy")), + Row.of(7L, "g", "grizzly")); + + Assert.assertEquals( + "Filter should produce expected rows", expected, - Lists.newArrayList(Deletes.filter(ROWS, - row -> Row.of(row.get(1, CharSequence.class)), - Deletes.toEqualitySet(deletes, ROW_SCHEMA.select("name").asStruct())))); + Lists.newArrayList( + Deletes.filter( + ROWS, + row -> Row.of(row.get(1, CharSequence.class)), + Deletes.toEqualitySet(deletes, ROW_SCHEMA.select("name").asStruct())))); } @Test public void testEqualitySetFilterStringColumnWithNull() { - CloseableIterable deletes = CloseableIterable.withNoopClose(Lists.newArrayList( - Row.of(new Object[] { null }) - )); - - List expected = Lists.newArrayList( - Row.of(0L, "a", "panda"), - Row.of(1L, "b", "koala"), - Row.of(2L, "c", new Utf8("kodiak")), - Row.of(4L, new Utf8("d"), "gummy"), - Row.of(5L, "e", "brown"), - Row.of(6L, "f", new Utf8("teddy")), - Row.of(7L, "g", "grizzly") - ); - - Assert.assertEquals("Filter should produce expected rows", + CloseableIterable deletes = + CloseableIterable.withNoopClose(Lists.newArrayList(Row.of(new Object[] {null}))); + + List expected = + Lists.newArrayList( + Row.of(0L, "a", "panda"), + Row.of(1L, "b", "koala"), + Row.of(2L, "c", new Utf8("kodiak")), + Row.of(4L, new Utf8("d"), "gummy"), + Row.of(5L, "e", "brown"), + Row.of(6L, "f", new Utf8("teddy")), + Row.of(7L, "g", "grizzly")); + + Assert.assertEquals( + "Filter should produce expected rows", expected, - Lists.newArrayList(Deletes.filter(ROWS, - row -> Row.of(row.get(2, CharSequence.class)), - Deletes.toEqualitySet(deletes, ROW_SCHEMA.select("description").asStruct())))); + Lists.newArrayList( + Deletes.filter( + ROWS, + row -> Row.of(row.get(2, CharSequence.class)), + Deletes.toEqualitySet(deletes, ROW_SCHEMA.select("description").asStruct())))); } @Test public void testEqualitySetFilterMultipleColumns() { - CloseableIterable deletes = CloseableIterable.withNoopClose(Lists.newArrayList( - Row.of(2L, "kodiak"), - Row.of(3L, "care"), - Row.of(8L, null) - )); - - List expected = Lists.newArrayList( - Row.of(0L, "a", "panda"), - Row.of(1L, "b", "koala"), - Row.of(4L, new Utf8("d"), "gummy"), - Row.of(5L, "e", "brown"), - Row.of(6L, "f", new Utf8("teddy")), - Row.of(7L, "g", "grizzly") - ); - - Assert.assertEquals("Filter should produce expected rows", + 
CloseableIterable deletes = + CloseableIterable.withNoopClose( + Lists.newArrayList(Row.of(2L, "kodiak"), Row.of(3L, "care"), Row.of(8L, null))); + + List expected = + Lists.newArrayList( + Row.of(0L, "a", "panda"), + Row.of(1L, "b", "koala"), + Row.of(4L, new Utf8("d"), "gummy"), + Row.of(5L, "e", "brown"), + Row.of(6L, "f", new Utf8("teddy")), + Row.of(7L, "g", "grizzly")); + + Assert.assertEquals( + "Filter should produce expected rows", expected, - Lists.newArrayList(Deletes.filter(ROWS, - row -> Row.of(row.get(0, Long.class), row.get(2, CharSequence.class)), - Deletes.toEqualitySet(deletes, ROW_SCHEMA.select("id", "description").asStruct())))); + Lists.newArrayList( + Deletes.filter( + ROWS, + row -> Row.of(row.get(0, Long.class), row.get(2, CharSequence.class)), + Deletes.toEqualitySet( + deletes, ROW_SCHEMA.select("id", "description").asStruct())))); } } diff --git a/core/src/test/java/org/apache/iceberg/deletes/TestPositionFilter.java b/core/src/test/java/org/apache/iceberg/deletes/TestPositionFilter.java index dc255c7c4ab0..e195f61515f5 100644 --- a/core/src/test/java/org/apache/iceberg/deletes/TestPositionFilter.java +++ b/core/src/test/java/org/apache/iceberg/deletes/TestPositionFilter.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.deletes; import java.util.List; @@ -34,137 +33,156 @@ public class TestPositionFilter { @Test public void testPositionFileFilter() { - List positionDeletes = Lists.newArrayList( - Row.of("file_a.avro", 0L), - Row.of("file_a.avro", 3L), - Row.of(new Utf8("file_a.avro"), 9L), - Row.of("file_a.avro", 22L), - Row.of("file_a.avro", 56L), - Row.of(new Utf8("file_b.avro"), 16L), - Row.of("file_b.avro", 19L), - Row.of("file_b.avro", 63L), - Row.of("file_b.avro", 70L), - Row.of("file_b.avro", 91L) - ); - - Assert.assertEquals("Should contain only file_a positions", + List positionDeletes = + Lists.newArrayList( + Row.of("file_a.avro", 0L), + Row.of("file_a.avro", 3L), + Row.of(new Utf8("file_a.avro"), 9L), + Row.of("file_a.avro", 22L), + Row.of("file_a.avro", 56L), + Row.of(new Utf8("file_b.avro"), 16L), + Row.of("file_b.avro", 19L), + Row.of("file_b.avro", 63L), + Row.of("file_b.avro", 70L), + Row.of("file_b.avro", 91L)); + + Assert.assertEquals( + "Should contain only file_a positions", Lists.newArrayList(0L, 3L, 9L, 22L, 56L), - Lists.newArrayList(Deletes.deletePositions("file_a.avro", CloseableIterable.withNoopClose(positionDeletes)))); + Lists.newArrayList( + Deletes.deletePositions( + "file_a.avro", CloseableIterable.withNoopClose(positionDeletes)))); - Assert.assertEquals("Should contain only file_b positions", + Assert.assertEquals( + "Should contain only file_b positions", Lists.newArrayList(16L, 19L, 63L, 70L, 91L), - Lists.newArrayList(Deletes.deletePositions("file_b.avro", CloseableIterable.withNoopClose(positionDeletes)))); + Lists.newArrayList( + Deletes.deletePositions( + "file_b.avro", CloseableIterable.withNoopClose(positionDeletes)))); - Assert.assertEquals("Should contain no positions for file_c", + Assert.assertEquals( + "Should contain no positions for file_c", Lists.newArrayList(), - Lists.newArrayList(Deletes.deletePositions("file_c.avro", CloseableIterable.withNoopClose(positionDeletes)))); + Lists.newArrayList( + Deletes.deletePositions( + "file_c.avro", CloseableIterable.withNoopClose(positionDeletes)))); } @Test public void testPositionMerging() { - List positionDeletes1 = Lists.newArrayList( - Row.of("file_a.avro", 0L), - Row.of("file_a.avro", 
3L), - Row.of("file_a.avro", 9L), - Row.of("file_a.avro", 22L), - Row.of("file_a.avro", 56L) - ); - - List positionDeletes2 = Lists.newArrayList( - Row.of("file_a.avro", 16L), - Row.of("file_a.avro", 19L), - Row.of("file_a.avro", 63L), - Row.of("file_a.avro", 70L), - Row.of("file_a.avro", 91L) - ); - - List positionDeletes3 = Lists.newArrayList( - Row.of("file_a.avro", 3L), - Row.of("file_a.avro", 19L), - Row.of("file_a.avro", 22L) - ); - - List> deletes = Lists.newArrayList( - CloseableIterable.withNoopClose(positionDeletes1), - CloseableIterable.withNoopClose(positionDeletes2), - CloseableIterable.withNoopClose(positionDeletes3) - ); - - Assert.assertEquals("Should merge deletes in order, with duplicates", + List positionDeletes1 = + Lists.newArrayList( + Row.of("file_a.avro", 0L), + Row.of("file_a.avro", 3L), + Row.of("file_a.avro", 9L), + Row.of("file_a.avro", 22L), + Row.of("file_a.avro", 56L)); + + List positionDeletes2 = + Lists.newArrayList( + Row.of("file_a.avro", 16L), + Row.of("file_a.avro", 19L), + Row.of("file_a.avro", 63L), + Row.of("file_a.avro", 70L), + Row.of("file_a.avro", 91L)); + + List positionDeletes3 = + Lists.newArrayList( + Row.of("file_a.avro", 3L), Row.of("file_a.avro", 19L), Row.of("file_a.avro", 22L)); + + List> deletes = + Lists.newArrayList( + CloseableIterable.withNoopClose(positionDeletes1), + CloseableIterable.withNoopClose(positionDeletes2), + CloseableIterable.withNoopClose(positionDeletes3)); + + Assert.assertEquals( + "Should merge deletes in order, with duplicates", Lists.newArrayList(0L, 3L, 3L, 9L, 16L, 19L, 19L, 22L, 22L, 56L, 63L, 70L, 91L), Lists.newArrayList(Deletes.deletePositions("file_a.avro", deletes))); } @Test public void testPositionStreamRowFilter() { - CloseableIterable rows = CloseableIterable.withNoopClose(Lists.newArrayList( - Row.of(0L, "a"), - Row.of(1L, "b"), - Row.of(2L, "c"), - Row.of(3L, "d"), - Row.of(4L, "e"), - Row.of(5L, "f"), - Row.of(6L, "g"), - Row.of(7L, "h"), - Row.of(8L, "i"), - Row.of(9L, "j") - )); - - CloseableIterable deletes = CloseableIterable.withNoopClose(Lists.newArrayList(0L, 3L, 4L, 7L, 9L)); - - CloseableIterable actual = Deletes.streamingFilter(rows, row -> row.get(0, Long.class), deletes); - Assert.assertEquals("Filter should produce expected rows", + CloseableIterable rows = + CloseableIterable.withNoopClose( + Lists.newArrayList( + Row.of(0L, "a"), + Row.of(1L, "b"), + Row.of(2L, "c"), + Row.of(3L, "d"), + Row.of(4L, "e"), + Row.of(5L, "f"), + Row.of(6L, "g"), + Row.of(7L, "h"), + Row.of(8L, "i"), + Row.of(9L, "j"))); + + CloseableIterable deletes = + CloseableIterable.withNoopClose(Lists.newArrayList(0L, 3L, 4L, 7L, 9L)); + + CloseableIterable actual = + Deletes.streamingFilter(rows, row -> row.get(0, Long.class), deletes); + Assert.assertEquals( + "Filter should produce expected rows", Lists.newArrayList(1L, 2L, 5L, 6L, 8L), Lists.newArrayList(Iterables.transform(actual, row -> row.get(0, Long.class)))); } @Test public void testPositionStreamRowDeleteMarker() { - CloseableIterable rows = CloseableIterable.withNoopClose(Lists.newArrayList( - Row.of(0L, "a", false), - Row.of(1L, "b", false), - Row.of(2L, "c", false), - Row.of(3L, "d", false), - Row.of(4L, "e", false), - Row.of(5L, "f", false), - Row.of(6L, "g", false), - Row.of(7L, "h", false), - Row.of(8L, "i", false), - Row.of(9L, "j", false) - )); - - CloseableIterable deletes = CloseableIterable.withNoopClose(Lists.newArrayList(0L, 3L, 4L, 7L, 9L)); - - CloseableIterable actual = Deletes.streamingMarker(rows, - row -> row.get(0, Long.class), /* row 
to position */ - deletes, - row -> row.set(2, true) /* delete marker */ - ); - Assert.assertEquals("Filter should produce expected rows", + CloseableIterable rows = + CloseableIterable.withNoopClose( + Lists.newArrayList( + Row.of(0L, "a", false), + Row.of(1L, "b", false), + Row.of(2L, "c", false), + Row.of(3L, "d", false), + Row.of(4L, "e", false), + Row.of(5L, "f", false), + Row.of(6L, "g", false), + Row.of(7L, "h", false), + Row.of(8L, "i", false), + Row.of(9L, "j", false))); + + CloseableIterable deletes = + CloseableIterable.withNoopClose(Lists.newArrayList(0L, 3L, 4L, 7L, 9L)); + + CloseableIterable actual = + Deletes.streamingMarker( + rows, + row -> row.get(0, Long.class), /* row to position */ + deletes, + row -> row.set(2, true) /* delete marker */); + Assert.assertEquals( + "Filter should produce expected rows", Lists.newArrayList(true, false, false, true, true, false, false, true, false, true), Lists.newArrayList(Iterables.transform(actual, row -> row.get(2, Boolean.class)))); } @Test public void testPositionStreamRowFilterWithDuplicates() { - CloseableIterable rows = CloseableIterable.withNoopClose(Lists.newArrayList( - Row.of(0L, "a"), - Row.of(1L, "b"), - Row.of(2L, "c"), - Row.of(3L, "d"), - Row.of(4L, "e"), - Row.of(5L, "f"), - Row.of(6L, "g"), - Row.of(7L, "h"), - Row.of(8L, "i"), - Row.of(9L, "j") - )); - - CloseableIterable deletes = CloseableIterable.withNoopClose( - Lists.newArrayList(0L, 0L, 0L, 3L, 4L, 7L, 7L, 9L, 9L, 9L)); - - CloseableIterable actual = Deletes.streamingFilter(rows, row -> row.get(0, Long.class), deletes); - Assert.assertEquals("Filter should produce expected rows", + CloseableIterable rows = + CloseableIterable.withNoopClose( + Lists.newArrayList( + Row.of(0L, "a"), + Row.of(1L, "b"), + Row.of(2L, "c"), + Row.of(3L, "d"), + Row.of(4L, "e"), + Row.of(5L, "f"), + Row.of(6L, "g"), + Row.of(7L, "h"), + Row.of(8L, "i"), + Row.of(9L, "j"))); + + CloseableIterable deletes = + CloseableIterable.withNoopClose(Lists.newArrayList(0L, 0L, 0L, 3L, 4L, 7L, 7L, 9L, 9L, 9L)); + + CloseableIterable actual = + Deletes.streamingFilter(rows, row -> row.get(0, Long.class), deletes); + Assert.assertEquals( + "Filter should produce expected rows", Lists.newArrayList(1L, 2L, 5L, 6L, 8L), Lists.newArrayList(Iterables.transform(actual, row -> row.get(0, Long.class)))); } @@ -172,122 +190,138 @@ public void testPositionStreamRowFilterWithDuplicates() { @Test public void testPositionStreamRowFilterWithRowGaps() { // test the case where row position is greater than the delete position - CloseableIterable rows = CloseableIterable.withNoopClose(Lists.newArrayList( - Row.of(2L, "c"), - Row.of(3L, "d"), - Row.of(5L, "f"), - Row.of(6L, "g") - )); + CloseableIterable rows = + CloseableIterable.withNoopClose( + Lists.newArrayList(Row.of(2L, "c"), Row.of(3L, "d"), Row.of(5L, "f"), Row.of(6L, "g"))); - CloseableIterable deletes = CloseableIterable.withNoopClose(Lists.newArrayList(0L, 2L, 3L, 4L, 7L, 9L)); + CloseableIterable deletes = + CloseableIterable.withNoopClose(Lists.newArrayList(0L, 2L, 3L, 4L, 7L, 9L)); - CloseableIterable actual = Deletes.streamingFilter(rows, row -> row.get(0, Long.class), deletes); - Assert.assertEquals("Filter should produce expected rows", + CloseableIterable actual = + Deletes.streamingFilter(rows, row -> row.get(0, Long.class), deletes); + Assert.assertEquals( + "Filter should produce expected rows", Lists.newArrayList(5L, 6L), Lists.newArrayList(Iterables.transform(actual, row -> row.get(0, Long.class)))); } @Test public void 
testCombinedPositionStreamRowFilter() { - CloseableIterable positionDeletes1 = CloseableIterable.withNoopClose(Lists.newArrayList( - Row.of("file_a.avro", 0L), - Row.of("file_a.avro", 3L), - Row.of("file_a.avro", 9L), - Row.of("file_b.avro", 5L), - Row.of("file_b.avro", 6L) - )); - - CloseableIterable positionDeletes2 = CloseableIterable.withNoopClose(Lists.newArrayList( - Row.of("file_a.avro", 3L), - Row.of("file_a.avro", 4L), - Row.of("file_a.avro", 7L), - Row.of("file_b.avro", 2L) - )); - - CloseableIterable rows = CloseableIterable.withNoopClose(Lists.newArrayList( - Row.of(0L, "a"), - Row.of(1L, "b"), - Row.of(2L, "c"), - Row.of(3L, "d"), - Row.of(4L, "e"), - Row.of(5L, "f"), - Row.of(6L, "g"), - Row.of(7L, "h"), - Row.of(8L, "i"), - Row.of(9L, "j") - )); - - CloseableIterable actual = Deletes.streamingFilter( - rows, - row -> row.get(0, Long.class), - Deletes.deletePositions("file_a.avro", ImmutableList.of(positionDeletes1, positionDeletes2))); - - Assert.assertEquals("Filter should produce expected rows", + CloseableIterable positionDeletes1 = + CloseableIterable.withNoopClose( + Lists.newArrayList( + Row.of("file_a.avro", 0L), + Row.of("file_a.avro", 3L), + Row.of("file_a.avro", 9L), + Row.of("file_b.avro", 5L), + Row.of("file_b.avro", 6L))); + + CloseableIterable positionDeletes2 = + CloseableIterable.withNoopClose( + Lists.newArrayList( + Row.of("file_a.avro", 3L), + Row.of("file_a.avro", 4L), + Row.of("file_a.avro", 7L), + Row.of("file_b.avro", 2L))); + + CloseableIterable rows = + CloseableIterable.withNoopClose( + Lists.newArrayList( + Row.of(0L, "a"), + Row.of(1L, "b"), + Row.of(2L, "c"), + Row.of(3L, "d"), + Row.of(4L, "e"), + Row.of(5L, "f"), + Row.of(6L, "g"), + Row.of(7L, "h"), + Row.of(8L, "i"), + Row.of(9L, "j"))); + + CloseableIterable actual = + Deletes.streamingFilter( + rows, + row -> row.get(0, Long.class), + Deletes.deletePositions( + "file_a.avro", ImmutableList.of(positionDeletes1, positionDeletes2))); + + Assert.assertEquals( + "Filter should produce expected rows", Lists.newArrayList(1L, 2L, 5L, 6L, 8L), Lists.newArrayList(Iterables.transform(actual, row -> row.get(0, Long.class)))); } @Test public void testPositionSetRowFilter() { - CloseableIterable rows = CloseableIterable.withNoopClose(Lists.newArrayList( - Row.of(0L, "a"), - Row.of(1L, "b"), - Row.of(2L, "c"), - Row.of(3L, "d"), - Row.of(4L, "e"), - Row.of(5L, "f"), - Row.of(6L, "g"), - Row.of(7L, "h"), - Row.of(8L, "i"), - Row.of(9L, "j") - )); - - CloseableIterable deletes = CloseableIterable.withNoopClose(Lists.newArrayList(0L, 3L, 4L, 7L, 9L)); - - Predicate shouldKeep = row -> !Deletes.toPositionIndex(deletes).isDeleted(row.get(0, Long.class)); + CloseableIterable rows = + CloseableIterable.withNoopClose( + Lists.newArrayList( + Row.of(0L, "a"), + Row.of(1L, "b"), + Row.of(2L, "c"), + Row.of(3L, "d"), + Row.of(4L, "e"), + Row.of(5L, "f"), + Row.of(6L, "g"), + Row.of(7L, "h"), + Row.of(8L, "i"), + Row.of(9L, "j"))); + + CloseableIterable deletes = + CloseableIterable.withNoopClose(Lists.newArrayList(0L, 3L, 4L, 7L, 9L)); + + Predicate shouldKeep = + row -> !Deletes.toPositionIndex(deletes).isDeleted(row.get(0, Long.class)); CloseableIterable actual = CloseableIterable.filter(rows, shouldKeep); - Assert.assertEquals("Filter should produce expected rows", + Assert.assertEquals( + "Filter should produce expected rows", Lists.newArrayList(1L, 2L, 5L, 6L, 8L), Lists.newArrayList(Iterables.transform(actual, row -> row.get(0, Long.class)))); } @Test public void testCombinedPositionSetRowFilter() { - 
CloseableIterable positionDeletes1 = CloseableIterable.withNoopClose(Lists.newArrayList( - Row.of("file_a.avro", 0L), - Row.of("file_a.avro", 3L), - Row.of("file_a.avro", 9L), - Row.of("file_b.avro", 5L), - Row.of("file_b.avro", 6L) - )); - - CloseableIterable positionDeletes2 = CloseableIterable.withNoopClose(Lists.newArrayList( - Row.of("file_a.avro", 3L), - Row.of("file_a.avro", 4L), - Row.of("file_a.avro", 7L), - Row.of("file_b.avro", 2L) - )); - - CloseableIterable rows = CloseableIterable.withNoopClose(Lists.newArrayList( - Row.of(0L, "a"), - Row.of(1L, "b"), - Row.of(2L, "c"), - Row.of(3L, "d"), - Row.of(4L, "e"), - Row.of(5L, "f"), - Row.of(6L, "g"), - Row.of(7L, "h"), - Row.of(8L, "i"), - Row.of(9L, "j") - )); - - Predicate isDeleted = row -> Deletes - .toPositionIndex("file_a.avro", ImmutableList.of(positionDeletes1, positionDeletes2)) - .isDeleted(row.get(0, Long.class)); + CloseableIterable positionDeletes1 = + CloseableIterable.withNoopClose( + Lists.newArrayList( + Row.of("file_a.avro", 0L), + Row.of("file_a.avro", 3L), + Row.of("file_a.avro", 9L), + Row.of("file_b.avro", 5L), + Row.of("file_b.avro", 6L))); + + CloseableIterable positionDeletes2 = + CloseableIterable.withNoopClose( + Lists.newArrayList( + Row.of("file_a.avro", 3L), + Row.of("file_a.avro", 4L), + Row.of("file_a.avro", 7L), + Row.of("file_b.avro", 2L))); + + CloseableIterable rows = + CloseableIterable.withNoopClose( + Lists.newArrayList( + Row.of(0L, "a"), + Row.of(1L, "b"), + Row.of(2L, "c"), + Row.of(3L, "d"), + Row.of(4L, "e"), + Row.of(5L, "f"), + Row.of(6L, "g"), + Row.of(7L, "h"), + Row.of(8L, "i"), + Row.of(9L, "j"))); + + Predicate isDeleted = + row -> + Deletes.toPositionIndex( + "file_a.avro", ImmutableList.of(positionDeletes1, positionDeletes2)) + .isDeleted(row.get(0, Long.class)); CloseableIterable actual = CloseableIterable.filter(rows, isDeleted.negate()); - Assert.assertEquals("Filter should produce expected rows", + Assert.assertEquals( + "Filter should produce expected rows", Lists.newArrayList(1L, 2L, 5L, 6L, 8L), Lists.newArrayList(Iterables.transform(actual, row -> row.get(0, Long.class)))); } diff --git a/core/src/test/java/org/apache/iceberg/encryption/TestCiphers.java b/core/src/test/java/org/apache/iceberg/encryption/TestCiphers.java index 26aaeae0d486..e5b7d7245a64 100644 --- a/core/src/test/java/org/apache/iceberg/encryption/TestCiphers.java +++ b/core/src/test/java/org/apache/iceberg/encryption/TestCiphers.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.encryption; import java.nio.charset.StandardCharsets; diff --git a/core/src/test/java/org/apache/iceberg/encryption/kms/KeyStoreKmsClient.java b/core/src/test/java/org/apache/iceberg/encryption/kms/KeyStoreKmsClient.java index 07b60e209645..1cd049b85542 100644 --- a/core/src/test/java/org/apache/iceberg/encryption/kms/KeyStoreKmsClient.java +++ b/core/src/test/java/org/apache/iceberg/encryption/kms/KeyStoreKmsClient.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.encryption.kms; import java.io.FileInputStream; @@ -35,12 +34,13 @@ import org.apache.iceberg.relocated.com.google.common.collect.Maps; /** - * KMS client demo class, based on the Java KeyStore API that reads keys from standard PKCS12 keystore files. - * Not for use in production. + * KMS client demo class, based on the Java KeyStore API that reads keys from standard PKCS12 + * keystore files. 
Not for use in production. */ public class KeyStoreKmsClient extends MemoryMockKMS { - // Path to keystore file. Preferably kept in volatile storage, such as ramdisk. Don't store with data. + // Path to keystore file. Preferably kept in volatile storage, such as ramdisk. Don't store with + // data. public static final String KEYSTORE_FILE_PATH_PROP = "keystore.kms.client.file.path"; // Credentials (such as keystore password) must never be kept in a persistent storage. @@ -62,12 +62,12 @@ public ByteBuffer unwrapKey(String wrappedKey, String wrappingKeyId) { @Override public void initialize(Map properties) { String keystorePath = properties.get(KEYSTORE_FILE_PATH_PROP); - Preconditions.checkNotNull(keystorePath, KEYSTORE_FILE_PATH_PROP + " must be set in hadoop or table " + - "properties"); + Preconditions.checkNotNull( + keystorePath, KEYSTORE_FILE_PATH_PROP + " must be set in hadoop or table " + "properties"); String keystorePassword = System.getenv(KEYSTORE_PASSWORD_ENV_VAR); - Preconditions.checkNotNull(keystorePassword, KEYSTORE_PASSWORD_ENV_VAR + " environment variable " + - "must be set"); + Preconditions.checkNotNull( + keystorePassword, KEYSTORE_PASSWORD_ENV_VAR + " environment variable " + "must be set"); KeyStore keyStore; try { @@ -93,7 +93,7 @@ public void initialize(Map properties) { Enumeration keyAliases; try { - keyAliases = keyStore.aliases(); + keyAliases = keyStore.aliases(); } catch (KeyStoreException e) { throw new RuntimeException("Failed to get key aliases in keystore file " + keystorePath, e); } diff --git a/core/src/test/java/org/apache/iceberg/encryption/kms/MemoryMockKMS.java b/core/src/test/java/org/apache/iceberg/encryption/kms/MemoryMockKMS.java index 0485e770bff2..af61ef000d89 100644 --- a/core/src/test/java/org/apache/iceberg/encryption/kms/MemoryMockKMS.java +++ b/core/src/test/java/org/apache/iceberg/encryption/kms/MemoryMockKMS.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.encryption.kms; import java.nio.ByteBuffer; @@ -25,9 +24,7 @@ import org.apache.iceberg.encryption.Ciphers; import org.apache.iceberg.encryption.KmsClient; -/** - * For testing and demonstrations; not for use in production. - */ +/** For testing and demonstrations; not for use in production. 
*/ public abstract class MemoryMockKMS implements KmsClient { protected Map masterKeys; @@ -36,7 +33,8 @@ public abstract class MemoryMockKMS implements KmsClient { public String wrapKey(ByteBuffer key, String wrappingKeyId) { byte[] wrappingKey = masterKeys.get(wrappingKeyId); if (null == wrappingKey) { - throw new RuntimeException("Cannot wrap, because wrapping key " + wrappingKeyId + " is not found"); + throw new RuntimeException( + "Cannot wrap, because wrapping key " + wrappingKeyId + " is not found"); } Ciphers.AesGcmEncryptor keyEncryptor = new Ciphers.AesGcmEncryptor(wrappingKey); byte[] encryptedKey = keyEncryptor.encrypt(key.array(), null); @@ -48,7 +46,8 @@ public ByteBuffer unwrapKey(String wrappedKey, String wrappingKeyId) { byte[] encryptedKey = Base64.getDecoder().decode(wrappedKey); byte[] wrappingKey = masterKeys.get(wrappingKeyId); if (null == wrappingKey) { - throw new RuntimeException("Cannot unwrap, because wrapping key " + wrappingKeyId + " is not found"); + throw new RuntimeException( + "Cannot unwrap, because wrapping key " + wrappingKeyId + " is not found"); } Ciphers.AesGcmDecryptor keyDecryptor = new Ciphers.AesGcmDecryptor(wrappingKey); byte[] key = keyDecryptor.decrypt(encryptedKey, null); diff --git a/core/src/test/java/org/apache/iceberg/hadoop/HadoopFileIOTest.java b/core/src/test/java/org/apache/iceberg/hadoop/HadoopFileIOTest.java index 0721d69997c6..40320aa61a27 100644 --- a/core/src/test/java/org/apache/iceberg/hadoop/HadoopFileIOTest.java +++ b/core/src/test/java/org/apache/iceberg/hadoop/HadoopFileIOTest.java @@ -16,9 +16,11 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.hadoop; +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertThrows; + import java.io.File; import java.io.IOException; import java.io.UncheckedIOException; @@ -33,17 +35,13 @@ import org.junit.jupiter.api.Test; import org.junit.jupiter.api.io.TempDir; -import static org.junit.jupiter.api.Assertions.assertEquals; -import static org.junit.jupiter.api.Assertions.assertThrows; - public class HadoopFileIOTest { private final Random random = new Random(1); private FileSystem fs; private HadoopFileIO hadoopFileIO; - @TempDir - static File tempDir; + @TempDir static File tempDir; @BeforeEach public void before() throws Exception { @@ -59,15 +57,21 @@ public void testListPrefix() { List scaleSizes = Lists.newArrayList(1, 1000, 2500); - scaleSizes.parallelStream().forEach(scale -> { - Path scalePath = new Path(parent, Integer.toString(scale)); + scaleSizes + .parallelStream() + .forEach( + scale -> { + Path scalePath = new Path(parent, Integer.toString(scale)); - createRandomFiles(scalePath, scale); - assertEquals((long) scale, Streams.stream(hadoopFileIO.listPrefix(scalePath.toUri().toString())).count()); - }); + createRandomFiles(scalePath, scale); + assertEquals( + (long) scale, + Streams.stream(hadoopFileIO.listPrefix(scalePath.toUri().toString())).count()); + }); long totalFiles = scaleSizes.stream().mapToLong(Integer::longValue).sum(); - assertEquals(totalFiles, Streams.stream(hadoopFileIO.listPrefix(parent.toUri().toString())).count()); + assertEquals( + totalFiles, Streams.stream(hadoopFileIO.listPrefix(parent.toUri().toString())).count()); } @Test @@ -76,29 +80,39 @@ public void testDeletePrefix() { List scaleSizes = Lists.newArrayList(1, 1000, 2500); - scaleSizes.parallelStream().forEach(scale -> { - Path scalePath = new Path(parent, 
Integer.toString(scale)); + scaleSizes + .parallelStream() + .forEach( + scale -> { + Path scalePath = new Path(parent, Integer.toString(scale)); - createRandomFiles(scalePath, scale); - hadoopFileIO.deletePrefix(scalePath.toUri().toString()); + createRandomFiles(scalePath, scale); + hadoopFileIO.deletePrefix(scalePath.toUri().toString()); - // Hadoop filesystem will throw if the path does not exist - assertThrows(UncheckedIOException.class, () -> hadoopFileIO.listPrefix(scalePath.toUri().toString()).iterator()); - }); + // Hadoop filesystem will throw if the path does not exist + assertThrows( + UncheckedIOException.class, + () -> hadoopFileIO.listPrefix(scalePath.toUri().toString()).iterator()); + }); hadoopFileIO.deletePrefix(parent.toUri().toString()); // Hadoop filesystem will throw if the path does not exist - assertThrows(UncheckedIOException.class, () -> hadoopFileIO.listPrefix(parent.toUri().toString()).iterator()); + assertThrows( + UncheckedIOException.class, + () -> hadoopFileIO.listPrefix(parent.toUri().toString()).iterator()); } private void createRandomFiles(Path parent, int count) { - random.ints(count).parallel().forEach(i -> { - try { - fs.createNewFile(new Path(parent, "file-" + i)); - } catch (IOException e) { - throw new UncheckedIOException(e); - } - } - ); + random + .ints(count) + .parallel() + .forEach( + i -> { + try { + fs.createNewFile(new Path(parent, "file-" + i)); + } catch (IOException e) { + throw new UncheckedIOException(e); + } + }); } } diff --git a/core/src/test/java/org/apache/iceberg/hadoop/HadoopTableTestBase.java b/core/src/test/java/org/apache/iceberg/hadoop/HadoopTableTestBase.java index 4cf0033bc5e8..0c2493856431 100644 --- a/core/src/test/java/org/apache/iceberg/hadoop/HadoopTableTestBase.java +++ b/core/src/test/java/org/apache/iceberg/hadoop/HadoopTableTestBase.java @@ -16,9 +16,13 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.hadoop; +import static org.apache.iceberg.Files.localInput; +import static org.apache.iceberg.TableMetadataParser.getFileExtension; +import static org.apache.iceberg.types.Types.NestedField.optional; +import static org.apache.iceberg.types.Types.NestedField.required; + import java.io.File; import java.io.FileInputStream; import java.io.FileOutputStream; @@ -48,71 +52,68 @@ import org.junit.Rule; import org.junit.rules.TemporaryFolder; -import static org.apache.iceberg.Files.localInput; -import static org.apache.iceberg.TableMetadataParser.getFileExtension; -import static org.apache.iceberg.types.Types.NestedField.optional; -import static org.apache.iceberg.types.Types.NestedField.required; - public class HadoopTableTestBase { // Schema passed to create tables - static final Schema SCHEMA = new Schema( - required(3, "id", Types.IntegerType.get(), "unique ID"), - required(4, "data", Types.StringType.get()) - ); + static final Schema SCHEMA = + new Schema( + required(3, "id", Types.IntegerType.get(), "unique ID"), + required(4, "data", Types.StringType.get())); // This is the actual schema for the table, with column IDs reassigned - static final Schema TABLE_SCHEMA = new Schema( - required(1, "id", Types.IntegerType.get(), "unique ID"), - required(2, "data", Types.StringType.get()) - ); + static final Schema TABLE_SCHEMA = + new Schema( + required(1, "id", Types.IntegerType.get(), "unique ID"), + required(2, "data", Types.StringType.get())); - static final Schema UPDATED_SCHEMA = new Schema( - required(1, "id", Types.IntegerType.get(), "unique ID"), - required(2, "data", Types.StringType.get()), - optional(3, "n", Types.IntegerType.get()) - ); + static final Schema UPDATED_SCHEMA = + new Schema( + required(1, "id", Types.IntegerType.get(), "unique ID"), + required(2, "data", Types.StringType.get()), + optional(3, "n", Types.IntegerType.get())); // Partition spec used to create tables - static final PartitionSpec SPEC = PartitionSpec.builderFor(SCHEMA) - .bucket("data", 16) - .build(); + static final PartitionSpec SPEC = PartitionSpec.builderFor(SCHEMA).bucket("data", 16).build(); static final HadoopTables TABLES = new HadoopTables(new Configuration()); - static final DataFile FILE_A = DataFiles.builder(SPEC) - .withPath("/path/to/data-a.parquet") - .withFileSizeInBytes(0) - .withPartitionPath("data_bucket=0") // easy way to set partition data for now - .withRecordCount(2) // needs at least one record or else metrics will filter it out - .build(); - static final DataFile FILE_B = DataFiles.builder(SPEC) - .withPath("/path/to/data-b.parquet") - .withFileSizeInBytes(0) - .withPartitionPath("data_bucket=1") // easy way to set partition data for now - .withRecordCount(2) // needs at least one record or else metrics will filter it out - .build(); - static final DeleteFile FILE_B_DELETES = FileMetadata.deleteFileBuilder(SPEC) - .ofPositionDeletes() - .withPath("/path/to/data-b-deletes.parquet") - .withFileSizeInBytes(0) - .withPartitionPath("data_bucket=1") - .withRecordCount(1) - .build(); - static final DataFile FILE_C = DataFiles.builder(SPEC) - .withPath("/path/to/data-a.parquet") - .withFileSizeInBytes(0) - .withPartitionPath("data_bucket=2") // easy way to set partition data for now - .withRecordCount(2) // needs at least one record or else metrics will filter it out - .build(); - static final DataFile FILE_D = DataFiles.builder(SPEC) - .withPath("/path/to/data-a.parquet") - .withFileSizeInBytes(0) - .withPartitionPath("data_bucket=3") // easy way to set 
partition data for now - .withRecordCount(2) // needs at least one record or else metrics will filter it out - .build(); - - @Rule - public TemporaryFolder temp = new TemporaryFolder(); + static final DataFile FILE_A = + DataFiles.builder(SPEC) + .withPath("/path/to/data-a.parquet") + .withFileSizeInBytes(0) + .withPartitionPath("data_bucket=0") // easy way to set partition data for now + .withRecordCount(2) // needs at least one record or else metrics will filter it out + .build(); + static final DataFile FILE_B = + DataFiles.builder(SPEC) + .withPath("/path/to/data-b.parquet") + .withFileSizeInBytes(0) + .withPartitionPath("data_bucket=1") // easy way to set partition data for now + .withRecordCount(2) // needs at least one record or else metrics will filter it out + .build(); + static final DeleteFile FILE_B_DELETES = + FileMetadata.deleteFileBuilder(SPEC) + .ofPositionDeletes() + .withPath("/path/to/data-b-deletes.parquet") + .withFileSizeInBytes(0) + .withPartitionPath("data_bucket=1") + .withRecordCount(1) + .build(); + static final DataFile FILE_C = + DataFiles.builder(SPEC) + .withPath("/path/to/data-a.parquet") + .withFileSizeInBytes(0) + .withPartitionPath("data_bucket=2") // easy way to set partition data for now + .withRecordCount(2) // needs at least one record or else metrics will filter it out + .build(); + static final DataFile FILE_D = + DataFiles.builder(SPEC) + .withPath("/path/to/data-a.parquet") + .withFileSizeInBytes(0) + .withPartitionPath("data_bucket=3") // easy way to set partition data for now + .withRecordCount(2) // needs at least one record or else metrics will filter it out + .build(); + + @Rule public TemporaryFolder temp = new TemporaryFolder(); File tableDir = null; String tableLocation = null; @@ -132,22 +133,26 @@ public void setupTable() throws Exception { } List listManifestFiles() { - return Lists.newArrayList(metadataDir.listFiles((dir, name) -> - !name.startsWith("snap") && Files.getFileExtension(name).equalsIgnoreCase("avro"))); + return Lists.newArrayList( + metadataDir.listFiles( + (dir, name) -> + !name.startsWith("snap") && Files.getFileExtension(name).equalsIgnoreCase("avro"))); } List listMetadataJsonFiles() { - return Lists.newArrayList(metadataDir.listFiles((dir, name) -> - name.endsWith(".metadata.json") || name.endsWith(".metadata.json.gz"))); + return Lists.newArrayList( + metadataDir.listFiles( + (dir, name) -> name.endsWith(".metadata.json") || name.endsWith(".metadata.json.gz"))); } File version(int versionNumber) { - return new File(metadataDir, "v" + versionNumber + getFileExtension(TableMetadataParser.Codec.NONE)); + return new File( + metadataDir, "v" + versionNumber + getFileExtension(TableMetadataParser.Codec.NONE)); } TableMetadata readMetadataVersion(int version) { - return TableMetadataParser.read(new TestTables.TestTableOperations("table", tableDir).io(), - localInput(version(version))); + return TableMetadataParser.read( + new TestTables.TestTableOperations("table", tableDir).io(), localInput(version(version))); } int readVersionHint() throws IOException { @@ -168,7 +173,8 @@ void rewriteMetadataAsGzipWithOldExtension() throws IOException { List metadataJsonFiles = listMetadataJsonFiles(); for (File file : metadataJsonFiles) { try (FileInputStream input = new FileInputStream(file)) { - try (GZIPOutputStream gzOutput = new GZIPOutputStream(new FileOutputStream(file.getAbsolutePath() + ".gz"))) { + try (GZIPOutputStream gzOutput = + new GZIPOutputStream(new FileOutputStream(file.getAbsolutePath() + ".gz"))) { int bb; while 
((bb = input.read()) != -1) { gzOutput.write(bb); diff --git a/core/src/test/java/org/apache/iceberg/hadoop/TestCachingCatalog.java b/core/src/test/java/org/apache/iceberg/hadoop/TestCachingCatalog.java index 7c8236957795..7cee683aa982 100644 --- a/core/src/test/java/org/apache/iceberg/hadoop/TestCachingCatalog.java +++ b/core/src/test/java/org/apache/iceberg/hadoop/TestCachingCatalog.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.hadoop; import com.github.benmanes.caffeine.cache.Cache; @@ -79,7 +78,8 @@ public void testInvalidateMetadataTablesIfBaseTableIsModified() throws Exception TableIdentifier filesMetaTableIdent = TableIdentifier.of("db", "ns1", "ns2", "tbl", "files"); Table filesMetaTable = catalog.loadTable(filesMetaTableIdent); - TableIdentifier manifestsMetaTableIdent = TableIdentifier.of("db", "ns1", "ns2", "tbl", "manifests"); + TableIdentifier manifestsMetaTableIdent = + TableIdentifier.of("db", "ns1", "ns2", "tbl", "manifests"); Table manifestsMetaTable = catalog.loadTable(manifestsMetaTableIdent); table.newAppend().appendFile(FILE_B).commit(); @@ -115,7 +115,8 @@ public void testInvalidateMetadataTablesIfBaseTableIsDropped() throws IOExceptio // populate the cache with metadata tables for (MetadataTableType type : MetadataTableType.values()) { catalog.loadTable(TableIdentifier.parse(tableIdent + "." + type.name())); - catalog.loadTable(TableIdentifier.parse(tableIdent + "." + type.name().toLowerCase(Locale.ROOT))); + catalog.loadTable( + TableIdentifier.parse(tableIdent + "." + type.name().toLowerCase(Locale.ROOT))); } // drop the original table @@ -137,7 +138,8 @@ public void testInvalidateMetadataTablesIfBaseTableIsDropped() throws IOExceptio Table metadataTable1 = catalog.loadTable(metadataIdent1); Assert.assertEquals("Snapshot must be new", newSnapshot, metadataTable1.currentSnapshot()); - TableIdentifier metadataIdent2 = TableIdentifier.parse(tableIdent + "." + type.name().toLowerCase(Locale.ROOT)); + TableIdentifier metadataIdent2 = + TableIdentifier.parse(tableIdent + "." 
+ type.name().toLowerCase(Locale.ROOT)); Table metadataTable2 = catalog.loadTable(metadataIdent2); Assert.assertEquals("Snapshot must be new", newSnapshot, metadataTable2.currentSnapshot()); } @@ -152,14 +154,17 @@ public void testTableName() throws Exception { Table table = catalog.loadTable(tableIdent); Assert.assertEquals("Name must match", "hadoop.db.ns1.ns2.tbl", table.name()); - TableIdentifier snapshotsTableIdent = TableIdentifier.of("db", "ns1", "ns2", "tbl", "snapshots"); + TableIdentifier snapshotsTableIdent = + TableIdentifier.of("db", "ns1", "ns2", "tbl", "snapshots"); Table snapshotsTable = catalog.loadTable(snapshotsTableIdent); - Assert.assertEquals("Name must match", "hadoop.db.ns1.ns2.tbl.snapshots", snapshotsTable.name()); + Assert.assertEquals( + "Name must match", "hadoop.db.ns1.ns2.tbl.snapshots", snapshotsTable.name()); } @Test public void testTableExpiresAfterInterval() throws IOException { - TestableCachingCatalog catalog = TestableCachingCatalog.wrap(hadoopCatalog(), EXPIRATION_TTL, ticker); + TestableCachingCatalog catalog = + TestableCachingCatalog.wrap(hadoopCatalog(), EXPIRATION_TTL, ticker); Namespace namespace = Namespace.of("db", "ns1", "ns2"); TableIdentifier tableIdent = TableIdentifier.of(namespace, "tbl"); @@ -167,21 +172,30 @@ public void testTableExpiresAfterInterval() throws IOException { // Ensure table is cached with full ttl remaining upon creation Assertions.assertThat(catalog.cache().asMap()).containsKey(tableIdent); - Assertions.assertThat(catalog.remainingAgeFor(tableIdent)).isPresent().get().isEqualTo(EXPIRATION_TTL); + Assertions.assertThat(catalog.remainingAgeFor(tableIdent)) + .isPresent() + .get() + .isEqualTo(EXPIRATION_TTL); ticker.advance(HALF_OF_EXPIRATION); Assertions.assertThat(catalog.cache().asMap()).containsKey(tableIdent); - Assertions.assertThat(catalog.ageOf(tableIdent)).isPresent().get().isEqualTo(HALF_OF_EXPIRATION); + Assertions.assertThat(catalog.ageOf(tableIdent)) + .isPresent() + .get() + .isEqualTo(HALF_OF_EXPIRATION); ticker.advance(HALF_OF_EXPIRATION.plus(Duration.ofSeconds(10))); Assertions.assertThat(catalog.cache().asMap()).doesNotContainKey(tableIdent); - Assert.assertNotSame("CachingCatalog should return a new instance after expiration", - table, catalog.loadTable(tableIdent)); + Assert.assertNotSame( + "CachingCatalog should return a new instance after expiration", + table, + catalog.loadTable(tableIdent)); } @Test public void testCatalogExpirationTtlRefreshesAfterAccessViaCatalog() throws IOException { - TestableCachingCatalog catalog = TestableCachingCatalog.wrap(hadoopCatalog(), EXPIRATION_TTL, ticker); + TestableCachingCatalog catalog = + TestableCachingCatalog.wrap(hadoopCatalog(), EXPIRATION_TTL, ticker); Namespace namespace = Namespace.of("db", "ns1", "ns2"); TableIdentifier tableIdent = TableIdentifier.of(namespace, "tbl"); @@ -191,14 +205,25 @@ public void testCatalogExpirationTtlRefreshesAfterAccessViaCatalog() throws IOEx ticker.advance(HALF_OF_EXPIRATION); Assertions.assertThat(catalog.cache().asMap()).containsKey(tableIdent); - Assertions.assertThat(catalog.ageOf(tableIdent)).isPresent().get().isEqualTo(HALF_OF_EXPIRATION); - Assertions.assertThat(catalog.remainingAgeFor(tableIdent)).isPresent().get().isEqualTo(HALF_OF_EXPIRATION); + Assertions.assertThat(catalog.ageOf(tableIdent)) + .isPresent() + .get() + .isEqualTo(HALF_OF_EXPIRATION); + Assertions.assertThat(catalog.remainingAgeFor(tableIdent)) + .isPresent() + .get() + .isEqualTo(HALF_OF_EXPIRATION); Duration oneMinute = Duration.ofMinutes(1L); 
ticker.advance(oneMinute); Assertions.assertThat(catalog.cache().asMap()).containsKey(tableIdent); - Assertions.assertThat(catalog.ageOf(tableIdent)).isPresent().get().isEqualTo(HALF_OF_EXPIRATION.plus(oneMinute)); - Assertions.assertThat(catalog.remainingAgeFor(tableIdent)).get().isEqualTo(HALF_OF_EXPIRATION.minus(oneMinute)); + Assertions.assertThat(catalog.ageOf(tableIdent)) + .isPresent() + .get() + .isEqualTo(HALF_OF_EXPIRATION.plus(oneMinute)); + Assertions.assertThat(catalog.remainingAgeFor(tableIdent)) + .get() + .isEqualTo(HALF_OF_EXPIRATION.minus(oneMinute)); // Access the table via the catalog, which should refresh the TTL Table table = catalog.loadTable(tableIdent); @@ -221,7 +246,8 @@ public void testCatalogExpirationTtlRefreshesAfterAccessViaCatalog() throws IOEx @Test public void testCacheExpirationEagerlyRemovesMetadataTables() throws IOException { - TestableCachingCatalog catalog = TestableCachingCatalog.wrap(hadoopCatalog(), EXPIRATION_TTL, ticker); + TestableCachingCatalog catalog = + TestableCachingCatalog.wrap(hadoopCatalog(), EXPIRATION_TTL, ticker); Namespace namespace = Namespace.of("db", "ns1", "ns2"); TableIdentifier tableIdent = TableIdentifier.of(namespace, "tbl"); Table table = catalog.createTable(tableIdent, SCHEMA, SPEC, ImmutableMap.of("key2", "value2")); @@ -242,8 +268,10 @@ public void testCacheExpirationEagerlyRemovesMetadataTables() throws IOException .isNotEmpty() .allMatch(age -> age.isPresent() && age.get().equals(Duration.ZERO)); - Assert.assertEquals("Loading a non-cached metadata table should refresh the main table's age", - Optional.of(EXPIRATION_TTL), catalog.remainingAgeFor(tableIdent)); + Assert.assertEquals( + "Loading a non-cached metadata table should refresh the main table's age", + Optional.of(EXPIRATION_TTL), + catalog.remainingAgeFor(tableIdent)); // Move time forward and access already cached metadata tables. ticker.advance(HALF_OF_EXPIRATION); @@ -252,22 +280,28 @@ public void testCacheExpirationEagerlyRemovesMetadataTables() throws IOException .isNotEmpty() .allMatch(age -> age.isPresent() && age.get().equals(Duration.ZERO)); - Assert.assertEquals("Accessing a cached metadata table should not affect the main table's age", - Optional.of(HALF_OF_EXPIRATION), catalog.remainingAgeFor(tableIdent)); + Assert.assertEquals( + "Accessing a cached metadata table should not affect the main table's age", + Optional.of(HALF_OF_EXPIRATION), + catalog.remainingAgeFor(tableIdent)); // Move time forward so the data table drops. 
ticker.advance(HALF_OF_EXPIRATION); Assertions.assertThat(catalog.cache().asMap()).doesNotContainKey(tableIdent); - Arrays.stream(metadataTables(tableIdent)).forEach(metadataTable -> - Assert.assertFalse("When a data table expires, its metadata tables should expire regardless of age", - catalog.cache().asMap().containsKey(metadataTable))); + Arrays.stream(metadataTables(tableIdent)) + .forEach( + metadataTable -> + Assert.assertFalse( + "When a data table expires, its metadata tables should expire regardless of age", + catalog.cache().asMap().containsKey(metadataTable))); } @Test public void testDeadlock() throws IOException, InterruptedException { HadoopCatalog underlyingCatalog = hadoopCatalog(); - TestableCachingCatalog catalog = TestableCachingCatalog.wrap(underlyingCatalog, Duration.ofSeconds(1), ticker); + TestableCachingCatalog catalog = + TestableCachingCatalog.wrap(underlyingCatalog, Duration.ofSeconds(1), ticker); Namespace namespace = Namespace.of("db", "ns1", "ns2"); int numThreads = 20; List createdTables = Lists.newArrayList(); @@ -284,17 +318,19 @@ public void testDeadlock() throws IOException, InterruptedException { for (int i = 0; i < numThreads; i++) { if (i % 2 == 0) { String table = "tbl" + i; - executor.submit(() -> { - ticker.advance(Duration.ofSeconds(2)); - cache.get(TableIdentifier.of(namespace, table), underlyingCatalog::loadTable); - cacheGetCount.incrementAndGet(); - }); + executor.submit( + () -> { + ticker.advance(Duration.ofSeconds(2)); + cache.get(TableIdentifier.of(namespace, table), underlyingCatalog::loadTable); + cacheGetCount.incrementAndGet(); + }); } else { - executor.submit(() -> { - ticker.advance(Duration.ofSeconds(2)); - cache.cleanUp(); - cacheCleanupCount.incrementAndGet(); - }); + executor.submit( + () -> { + ticker.advance(Duration.ofSeconds(2)); + cache.cleanUp(); + cacheCleanupCount.incrementAndGet(); + }); } } executor.awaitTermination(2, TimeUnit.SECONDS); @@ -307,17 +343,19 @@ public void testDeadlock() throws IOException, InterruptedException { @Test public void testCachingCatalogRejectsExpirationIntervalOfZero() { - AssertHelpers - .assertThrows( + AssertHelpers.assertThrows( "Caching catalog should disallow an expiration interval of zero, as zero signifies not to cache at all", - IllegalArgumentException.class, - () -> TestableCachingCatalog.wrap(hadoopCatalog(), Duration.ZERO, ticker)); + IllegalArgumentException.class, + () -> TestableCachingCatalog.wrap(hadoopCatalog(), Duration.ZERO, ticker)); } @Test public void testCacheExpirationIsDisabledByANegativeValue() throws IOException { - TestableCachingCatalog catalog = TestableCachingCatalog - .wrap(hadoopCatalog(), Duration.ofMillis(CatalogProperties.CACHE_EXPIRATION_INTERVAL_MS_OFF), ticker); + TestableCachingCatalog catalog = + TestableCachingCatalog.wrap( + hadoopCatalog(), + Duration.ofMillis(CatalogProperties.CACHE_EXPIRATION_INTERVAL_MS_OFF), + ticker); Assert.assertFalse( "When a negative value is used as the expiration interval, the cache should not expire entries based on a TTL", @@ -326,8 +364,10 @@ public void testCacheExpirationIsDisabledByANegativeValue() throws IOException { @Test public void testInvalidateTableForChainedCachingCatalogs() throws Exception { - TestableCachingCatalog wrappedCatalog = TestableCachingCatalog.wrap(hadoopCatalog(), EXPIRATION_TTL, ticker); - TestableCachingCatalog catalog = TestableCachingCatalog.wrap(wrappedCatalog, EXPIRATION_TTL, ticker); + TestableCachingCatalog wrappedCatalog = + TestableCachingCatalog.wrap(hadoopCatalog(), 
EXPIRATION_TTL, ticker); + TestableCachingCatalog catalog = + TestableCachingCatalog.wrap(wrappedCatalog, EXPIRATION_TTL, ticker); Namespace namespace = Namespace.of("db", "ns1", "ns2"); TableIdentifier tableIdent = TableIdentifier.of(namespace, "tbl"); catalog.createTable(tableIdent, SCHEMA, SPEC, ImmutableMap.of("key2", "value2")); diff --git a/core/src/test/java/org/apache/iceberg/hadoop/TestCatalogUtilDropTable.java b/core/src/test/java/org/apache/iceberg/hadoop/TestCatalogUtilDropTable.java index 449445a91833..c61bb3da6383 100644 --- a/core/src/test/java/org/apache/iceberg/hadoop/TestCatalogUtilDropTable.java +++ b/core/src/test/java/org/apache/iceberg/hadoop/TestCatalogUtilDropTable.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.hadoop; import java.util.List; @@ -40,12 +39,8 @@ public class TestCatalogUtilDropTable extends HadoopTableTestBase { @Test public void dropTableDataDeletesExpectedFiles() { - table.newFastAppend() - .appendFile(FILE_A) - .commit(); - table.newAppend() - .appendFile(FILE_B) - .commit(); + table.newFastAppend().appendFile(FILE_A).commit(); + table.newAppend().appendFile(FILE_B).commit(); TableMetadata tableMetadata = readMetadataVersion(3); Set snapshotSet = Sets.newHashSet(table.snapshots()); @@ -61,30 +56,38 @@ public void dropTableDataDeletesExpectedFiles() { Mockito.when(fileIO.newInputFile(Mockito.anyString())) .thenAnswer(invocation -> table.io().newInputFile(invocation.getArgument(0))); Mockito.when(fileIO.newInputFile(Mockito.anyString(), Mockito.anyLong())) - .thenAnswer(invocation -> table.io().newInputFile(invocation.getArgument(0), invocation.getArgument(1))); + .thenAnswer( + invocation -> + table.io().newInputFile(invocation.getArgument(0), invocation.getArgument(1))); CatalogUtil.dropTableData(fileIO, tableMetadata); ArgumentCaptor argumentCaptor = ArgumentCaptor.forClass(String.class); - Mockito.verify(fileIO, Mockito.times(manifestListLocations.size() + - manifestLocations.size() + dataLocations.size() + metadataLocations.size())) + Mockito.verify( + fileIO, + Mockito.times( + manifestListLocations.size() + + manifestLocations.size() + + dataLocations.size() + + metadataLocations.size())) .deleteFile(argumentCaptor.capture()); List deletedPaths = argumentCaptor.getAllValues(); - Assert.assertTrue("should contain all created manifest lists", deletedPaths.containsAll(manifestListLocations)); - Assert.assertTrue("should contain all created manifests", deletedPaths.containsAll(manifestLocations)); + Assert.assertTrue( + "should contain all created manifest lists", + deletedPaths.containsAll(manifestListLocations)); + Assert.assertTrue( + "should contain all created manifests", deletedPaths.containsAll(manifestLocations)); Assert.assertTrue("should contain all created data", deletedPaths.containsAll(dataLocations)); - Assert.assertTrue("should contain all created metadata locations", deletedPaths.containsAll(metadataLocations)); + Assert.assertTrue( + "should contain all created metadata locations", + deletedPaths.containsAll(metadataLocations)); } @Test public void dropTableDataDoNotThrowWhenDeletesFail() { - table.newFastAppend() - .appendFile(FILE_A) - .commit(); - table.newAppend() - .appendFile(FILE_B) - .commit(); + table.newFastAppend().appendFile(FILE_A).commit(); + table.newAppend().appendFile(FILE_B).commit(); TableMetadata tableMetadata = readMetadataVersion(3); Set snapshotSet = Sets.newHashSet(table.snapshots()); @@ -93,25 +96,27 @@ public void 
dropTableDataDoNotThrowWhenDeletesFail() { Mockito.when(fileIO.newInputFile(Mockito.anyString())) .thenAnswer(invocation -> table.io().newInputFile(invocation.getArgument(0))); Mockito.when(fileIO.newInputFile(Mockito.anyString(), Mockito.anyLong())) - .thenAnswer(invocation -> table.io().newInputFile(invocation.getArgument(0), invocation.getArgument(1))); + .thenAnswer( + invocation -> + table.io().newInputFile(invocation.getArgument(0), invocation.getArgument(1))); Mockito.doThrow(new RuntimeException()).when(fileIO).deleteFile(ArgumentMatchers.anyString()); CatalogUtil.dropTableData(fileIO, tableMetadata); - Mockito.verify(fileIO, Mockito.times( - manifestListLocations(snapshotSet).size() + manifestLocations(snapshotSet, fileIO).size() + - dataLocations(snapshotSet, table.io()).size() + metadataLocations(tableMetadata).size())) + Mockito.verify( + fileIO, + Mockito.times( + manifestListLocations(snapshotSet).size() + + manifestLocations(snapshotSet, fileIO).size() + + dataLocations(snapshotSet, table.io()).size() + + metadataLocations(tableMetadata).size())) .deleteFile(ArgumentMatchers.anyString()); } @Test public void shouldNotDropDataFilesIfGcNotEnabled() { table.updateProperties().set(TableProperties.GC_ENABLED, "false").commit(); - table.newFastAppend() - .appendFile(FILE_A) - .commit(); - table.newAppend() - .appendFile(FILE_B) - .commit(); + table.newFastAppend().appendFile(FILE_A).commit(); + table.newAppend().appendFile(FILE_B).commit(); TableMetadata tableMetadata = readMetadataVersion(4); Set snapshotSet = Sets.newHashSet(table.snapshots()); @@ -129,20 +134,25 @@ public void shouldNotDropDataFilesIfGcNotEnabled() { CatalogUtil.dropTableData(fileIO, tableMetadata); ArgumentCaptor argumentCaptor = ArgumentCaptor.forClass(String.class); - Mockito.verify(fileIO, Mockito.times(manifestListLocations.size() + manifestLocations.size() + - metadataLocations.size())).deleteFile(argumentCaptor.capture()); + Mockito.verify( + fileIO, + Mockito.times( + manifestListLocations.size() + manifestLocations.size() + metadataLocations.size())) + .deleteFile(argumentCaptor.capture()); List deletedPaths = argumentCaptor.getAllValues(); - Assert.assertTrue("should contain all created manifest lists", deletedPaths.containsAll(manifestListLocations)); - Assert.assertTrue("should contain all created manifests", deletedPaths.containsAll(manifestLocations)); - Assert.assertTrue("should contain all created metadata locations", deletedPaths.containsAll(metadataLocations)); - + Assert.assertTrue( + "should contain all created manifest lists", + deletedPaths.containsAll(manifestListLocations)); + Assert.assertTrue( + "should contain all created manifests", deletedPaths.containsAll(manifestLocations)); + Assert.assertTrue( + "should contain all created metadata locations", + deletedPaths.containsAll(metadataLocations)); } private Set manifestListLocations(Set snapshotSet) { - return snapshotSet.stream() - .map(Snapshot::manifestListLocation) - .collect(Collectors.toSet()); + return snapshotSet.stream().map(Snapshot::manifestListLocation).collect(Collectors.toSet()); } private Set manifestLocations(Set snapshotSet, FileIO io) { @@ -160,9 +170,10 @@ private Set dataLocations(Set snapshotSet, FileIO io) { } private Set metadataLocations(TableMetadata tableMetadata) { - Set metadataLocations = tableMetadata.previousFiles().stream() - .map(TableMetadata.MetadataLogEntry::file) - .collect(Collectors.toSet()); + Set metadataLocations = + tableMetadata.previousFiles().stream() + 
.map(TableMetadata.MetadataLogEntry::file) + .collect(Collectors.toSet()); metadataLocations.add(tableMetadata.metadataFileLocation()); return metadataLocations; } diff --git a/core/src/test/java/org/apache/iceberg/hadoop/TestHadoopCatalog.java b/core/src/test/java/org/apache/iceberg/hadoop/TestHadoopCatalog.java index ad696bb5a2d0..7d5599919d21 100644 --- a/core/src/test/java/org/apache/iceberg/hadoop/TestHadoopCatalog.java +++ b/core/src/test/java/org/apache/iceberg/hadoop/TestHadoopCatalog.java @@ -16,9 +16,11 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.hadoop; +import static org.apache.iceberg.NullOrder.NULLS_FIRST; +import static org.apache.iceberg.SortDirection.ASC; + import java.io.IOException; import java.nio.charset.StandardCharsets; import java.util.List; @@ -54,21 +56,20 @@ import org.junit.Assert; import org.junit.Test; -import static org.apache.iceberg.NullOrder.NULLS_FIRST; -import static org.apache.iceberg.SortDirection.ASC; - public class TestHadoopCatalog extends HadoopTableTestBase { private static ImmutableMap meta = ImmutableMap.of(); @Test public void testCreateTableBuilder() throws Exception { TableIdentifier tableIdent = TableIdentifier.of("db", "ns1", "ns2", "tbl"); - Table table = hadoopCatalog().buildTable(tableIdent, SCHEMA) - .withPartitionSpec(SPEC) - .withProperties(null) - .withProperty("key1", "value1") - .withProperties(ImmutableMap.of("key2", "value2")) - .create(); + Table table = + hadoopCatalog() + .buildTable(tableIdent, SCHEMA) + .withPartitionSpec(SPEC) + .withProperties(null) + .withProperty("key1", "value1") + .withProperties(ImmutableMap.of("key2", "value2")) + .create(); Assert.assertEquals(TABLE_SCHEMA.toString(), table.schema().toString()); Assert.assertEquals(1, table.spec().fields().size()); @@ -80,9 +81,8 @@ public void testCreateTableBuilder() throws Exception { public void testCreateTableTxnBuilder() throws Exception { HadoopCatalog catalog = hadoopCatalog(); TableIdentifier tableIdent = TableIdentifier.of("db", "ns1", "ns2", "tbl"); - Transaction txn = catalog.buildTable(tableIdent, SCHEMA) - .withPartitionSpec(null) - .createTransaction(); + Transaction txn = + catalog.buildTable(tableIdent, SCHEMA).withPartitionSpec(null).createTransaction(); txn.commitTransaction(); Table table = catalog.loadTable(tableIdent); @@ -95,33 +95,32 @@ public void testReplaceTxnBuilder() throws Exception { HadoopCatalog catalog = hadoopCatalog(); TableIdentifier tableIdent = TableIdentifier.of("db", "ns1", "ns2", "tbl"); - Transaction createTxn = catalog.buildTable(tableIdent, SCHEMA) - .withPartitionSpec(SPEC) - .withProperty("key1", "value1") - .createOrReplaceTransaction(); + Transaction createTxn = + catalog + .buildTable(tableIdent, SCHEMA) + .withPartitionSpec(SPEC) + .withProperty("key1", "value1") + .createOrReplaceTransaction(); - createTxn.newAppend() - .appendFile(FILE_A) - .commit(); + createTxn.newAppend().appendFile(FILE_A).commit(); createTxn.commitTransaction(); Table table = catalog.loadTable(tableIdent); Assert.assertNotNull(table.currentSnapshot()); - Transaction replaceTxn = catalog.buildTable(tableIdent, SCHEMA) - .withProperty("key2", "value2") - .replaceTransaction(); + Transaction replaceTxn = + catalog.buildTable(tableIdent, SCHEMA).withProperty("key2", "value2").replaceTransaction(); replaceTxn.commitTransaction(); table = catalog.loadTable(tableIdent); Assert.assertNull(table.currentSnapshot()); - PartitionSpec v1Expected = 
PartitionSpec.builderFor(table.schema()) - .alwaysNull("data", "data_bucket") - .withSpecId(1) - .build(); - Assert.assertEquals("Table should have a spec with one void field", - v1Expected, table.spec()); + PartitionSpec v1Expected = + PartitionSpec.builderFor(table.schema()) + .alwaysNull("data", "data_bucket") + .withSpecId(1) + .build(); + Assert.assertEquals("Table should have a spec with one void field", v1Expected, table.spec()); Assert.assertEquals("value1", table.properties().get("key1")); Assert.assertEquals("value2", table.properties().get("key2")); @@ -132,17 +131,27 @@ public void testTableBuilderWithLocation() throws Exception { HadoopCatalog catalog = hadoopCatalog(); TableIdentifier tableIdent = TableIdentifier.of("db", "ns1", "ns2", "tbl"); - AssertHelpers.assertThrows("Should reject a custom location", - IllegalArgumentException.class, "Cannot set a custom location for a path-based table", + AssertHelpers.assertThrows( + "Should reject a custom location", + IllegalArgumentException.class, + "Cannot set a custom location for a path-based table", () -> catalog.buildTable(tableIdent, SCHEMA).withLocation("custom").create()); - AssertHelpers.assertThrows("Should reject a custom location", - IllegalArgumentException.class, "Cannot set a custom location for a path-based table", + AssertHelpers.assertThrows( + "Should reject a custom location", + IllegalArgumentException.class, + "Cannot set a custom location for a path-based table", () -> catalog.buildTable(tableIdent, SCHEMA).withLocation("custom").createTransaction()); - AssertHelpers.assertThrows("Should reject a custom location", - IllegalArgumentException.class, "Cannot set a custom location for a path-based table", - () -> catalog.buildTable(tableIdent, SCHEMA).withLocation("custom").createOrReplaceTransaction()); + AssertHelpers.assertThrows( + "Should reject a custom location", + IllegalArgumentException.class, + "Cannot set a custom location for a path-based table", + () -> + catalog + .buildTable(tableIdent, SCHEMA) + .withLocation("custom") + .createOrReplaceTransaction()); } @Test @@ -158,19 +167,20 @@ public void testCreateTableDefaultSortOrder() throws Exception { @Test public void testCreateTableCustomSortOrder() throws Exception { TableIdentifier tableIdent = TableIdentifier.of("db", "ns1", "ns2", "tbl"); - SortOrder order = SortOrder.builderFor(SCHEMA) - .asc("id", NULLS_FIRST) - .build(); - Table table = hadoopCatalog().buildTable(tableIdent, SCHEMA) - .withPartitionSpec(SPEC) - .withSortOrder(order) - .create(); + SortOrder order = SortOrder.builderFor(SCHEMA).asc("id", NULLS_FIRST).build(); + Table table = + hadoopCatalog() + .buildTable(tableIdent, SCHEMA) + .withPartitionSpec(SPEC) + .withSortOrder(order) + .create(); SortOrder sortOrder = table.sortOrder(); Assert.assertEquals("Order ID must match", 1, sortOrder.orderId()); Assert.assertEquals("Order must have 1 field", 1, sortOrder.fields().size()); Assert.assertEquals("Direction must match ", ASC, sortOrder.fields().get(0).direction()); - Assert.assertEquals("Null order must match ", NULLS_FIRST, sortOrder.fields().get(0).nullOrder()); + Assert.assertEquals( + "Null order must match ", NULLS_FIRST, sortOrder.fields().get(0).nullOrder()); Transform transform = Transforms.identity(Types.IntegerType.get()); Assert.assertEquals("Transform must match", transform, sortOrder.fields().get(0).transform()); } @@ -242,11 +252,13 @@ public void testRenameTable() throws Exception { HadoopCatalog catalog = hadoopCatalog(); TableIdentifier testTable = 
TableIdentifier.of("db", "tbl1"); catalog.createTable(testTable, SCHEMA, PartitionSpec.unpartitioned()); - AssertHelpers.assertThrows("should throw exception", UnsupportedOperationException.class, - "Cannot rename Hadoop tables", () -> { + AssertHelpers.assertThrows( + "should throw exception", + UnsupportedOperationException.class, + "Cannot rename Hadoop tables", + () -> { catalog.renameTable(testTable, TableIdentifier.of("db", "tbl2")); - } - ); + }); } @Test @@ -258,9 +270,8 @@ public void testListTables() throws Exception { TableIdentifier tbl3 = TableIdentifier.of("db", "ns1", "tbl3"); TableIdentifier tbl4 = TableIdentifier.of("db", "metadata", "metadata"); - Lists.newArrayList(tbl1, tbl2, tbl3, tbl4).forEach(t -> - catalog.createTable(t, SCHEMA, PartitionSpec.unpartitioned()) - ); + Lists.newArrayList(tbl1, tbl2, tbl3, tbl4) + .forEach(t -> catalog.createTable(t, SCHEMA, PartitionSpec.unpartitioned())); List tbls1 = catalog.listTables(Namespace.of("db")); Set tblSet = Sets.newHashSet(tbls1.stream().map(t -> t.name()).iterator()); @@ -272,10 +283,13 @@ public void testListTables() throws Exception { Assert.assertEquals("table identifiers", 1, tbls2.size()); Assert.assertEquals("table name", "tbl3", tbls2.get(0).name()); - AssertHelpers.assertThrows("should throw exception", NoSuchNamespaceException.class, - "Namespace does not exist: ", () -> { - catalog.listTables(Namespace.of("db", "ns1", "ns2")); - }); + AssertHelpers.assertThrows( + "should throw exception", + NoSuchNamespaceException.class, + "Namespace does not exist: ", + () -> { + catalog.listTables(Namespace.of("db", "ns1", "ns2")); + }); } @Test @@ -284,10 +298,11 @@ public void testCallingLocationProviderWhenNoCurrentMetadata() throws IOExceptio TableIdentifier tableIdent = TableIdentifier.of("ns1", "ns2", "table1"); Transaction create = catalog.newCreateTableTransaction(tableIdent, SCHEMA); - create.table().locationProvider(); // NPE triggered if not handled appropriately + create.table().locationProvider(); // NPE triggered if not handled appropriately create.commitTransaction(); - Assert.assertEquals("1 table expected", 1, catalog.listTables(Namespace.of("ns1", "ns2")).size()); + Assert.assertEquals( + "1 table expected", 1, catalog.listTables(Namespace.of("ns1", "ns2")).size()); catalog.dropTable(tableIdent, true); } @@ -296,14 +311,13 @@ public void testCreateNamespace() throws Exception { String warehouseLocation = temp.newFolder().getAbsolutePath(); HadoopCatalog catalog = new HadoopCatalog(); catalog.setConf(new Configuration()); - catalog.initialize("hadoop", ImmutableMap.of(CatalogProperties.WAREHOUSE_LOCATION, warehouseLocation)); + catalog.initialize( + "hadoop", ImmutableMap.of(CatalogProperties.WAREHOUSE_LOCATION, warehouseLocation)); TableIdentifier tbl1 = TableIdentifier.of("db", "ns1", "ns2", "metadata"); TableIdentifier tbl2 = TableIdentifier.of("db", "ns2", "ns3", "tbl2"); - Lists.newArrayList(tbl1, tbl2).forEach(t -> - catalog.createNamespace(t.namespace(), meta) - ); + Lists.newArrayList(tbl1, tbl2).forEach(t -> catalog.createNamespace(t.namespace(), meta)); String metaLocation1 = warehouseLocation + "/" + "db/ns1/ns2"; FileSystem fs1 = Util.getFs(new Path(metaLocation1), catalog.getConf()); @@ -313,9 +327,11 @@ public void testCreateNamespace() throws Exception { FileSystem fs2 = Util.getFs(new Path(metaLocation2), catalog.getConf()); Assert.assertTrue(fs2.isDirectory(new Path(metaLocation2))); - AssertHelpers.assertThrows("Should fail to create when namespace already exist: " + tbl1.namespace(), + 
AssertHelpers.assertThrows( + "Should fail to create when namespace already exist: " + tbl1.namespace(), org.apache.iceberg.exceptions.AlreadyExistsException.class, - "Namespace already exists: " + tbl1.namespace(), () -> { + "Namespace already exists: " + tbl1.namespace(), + () -> { catalog.createNamespace(tbl1.namespace()); }); } @@ -330,9 +346,8 @@ public void testListNamespace() throws Exception { TableIdentifier tbl4 = TableIdentifier.of("db", "metadata"); TableIdentifier tbl5 = TableIdentifier.of("db2", "metadata"); - Lists.newArrayList(tbl1, tbl2, tbl3, tbl4, tbl5).forEach(t -> - catalog.createTable(t, SCHEMA, PartitionSpec.unpartitioned()) - ); + Lists.newArrayList(tbl1, tbl2, tbl3, tbl4, tbl5) + .forEach(t -> catalog.createTable(t, SCHEMA, PartitionSpec.unpartitioned())); List nsp1 = catalog.listNamespaces(Namespace.of("db")); Set tblSet = Sets.newHashSet(nsp1.stream().map(t -> t.toString()).iterator()); @@ -357,8 +372,11 @@ public void testListNamespace() throws Exception { Assert.assertTrue(tblSet3.contains("db")); Assert.assertTrue(tblSet3.contains("db2")); - AssertHelpers.assertThrows("Should fail to list namespace doesn't exist", NoSuchNamespaceException.class, - "Namespace does not exist: ", () -> { + AssertHelpers.assertThrows( + "Should fail to list namespace doesn't exist", + NoSuchNamespaceException.class, + "Namespace does not exist: ", + () -> { catalog.listNamespaces(Namespace.of("db", "db2", "ns2")); }); } @@ -372,13 +390,15 @@ public void testLoadNamespaceMeta() throws IOException { TableIdentifier tbl3 = TableIdentifier.of("db", "ns3", "tbl4"); TableIdentifier tbl4 = TableIdentifier.of("db", "metadata"); - Lists.newArrayList(tbl1, tbl2, tbl3, tbl4).forEach(t -> - catalog.createTable(t, SCHEMA, PartitionSpec.unpartitioned()) - ); + Lists.newArrayList(tbl1, tbl2, tbl3, tbl4) + .forEach(t -> catalog.createTable(t, SCHEMA, PartitionSpec.unpartitioned())); catalog.loadNamespaceMetadata(Namespace.of("db")); - AssertHelpers.assertThrows("Should fail to load namespace doesn't exist", NoSuchNamespaceException.class, - "Namespace does not exist: ", () -> { + AssertHelpers.assertThrows( + "Should fail to load namespace doesn't exist", + NoSuchNamespaceException.class, + "Namespace does not exist: ", + () -> { catalog.loadNamespaceMetadata(Namespace.of("db", "db2", "ns2")); }); } @@ -392,21 +412,26 @@ public void testNamespaceExists() throws IOException { TableIdentifier tbl3 = TableIdentifier.of("db", "ns3", "tbl4"); TableIdentifier tbl4 = TableIdentifier.of("db", "metadata"); - Lists.newArrayList(tbl1, tbl2, tbl3, tbl4).forEach(t -> - catalog.createTable(t, SCHEMA, PartitionSpec.unpartitioned()) - ); - Assert.assertTrue("Should true to namespace exist", + Lists.newArrayList(tbl1, tbl2, tbl3, tbl4) + .forEach(t -> catalog.createTable(t, SCHEMA, PartitionSpec.unpartitioned())); + Assert.assertTrue( + "Should true to namespace exist", catalog.namespaceExists(Namespace.of("db", "ns1", "ns2"))); - Assert.assertTrue("Should false to namespace doesn't exist", + Assert.assertTrue( + "Should false to namespace doesn't exist", !catalog.namespaceExists(Namespace.of("db", "db2", "ns2"))); } @Test public void testAlterNamespaceMeta() throws IOException { HadoopCatalog catalog = hadoopCatalog(); - AssertHelpers.assertThrows("Should fail to change namespace", UnsupportedOperationException.class, - "Cannot set namespace properties db.db2.ns2 : setProperties is not supported", () -> { - catalog.setProperties(Namespace.of("db", "db2", "ns2"), ImmutableMap.of("property", "test")); + 
AssertHelpers.assertThrows( + "Should fail to change namespace", + UnsupportedOperationException.class, + "Cannot set namespace properties db.db2.ns2 : setProperties is not supported", + () -> { + catalog.setProperties( + Namespace.of("db", "db2", "ns2"), ImmutableMap.of("property", "test")); }); } @@ -415,24 +440,26 @@ public void testDropNamespace() throws IOException { String warehouseLocation = temp.newFolder().getAbsolutePath(); HadoopCatalog catalog = new HadoopCatalog(); catalog.setConf(new Configuration()); - catalog.initialize("hadoop", ImmutableMap.of(CatalogProperties.WAREHOUSE_LOCATION, warehouseLocation)); + catalog.initialize( + "hadoop", ImmutableMap.of(CatalogProperties.WAREHOUSE_LOCATION, warehouseLocation)); Namespace namespace1 = Namespace.of("db"); Namespace namespace2 = Namespace.of("db", "ns1"); TableIdentifier tbl1 = TableIdentifier.of(namespace1, "tbl1"); TableIdentifier tbl2 = TableIdentifier.of(namespace2, "tbl1"); - Lists.newArrayList(tbl1, tbl2).forEach(t -> - catalog.createTable(t, SCHEMA, PartitionSpec.unpartitioned()) - ); + Lists.newArrayList(tbl1, tbl2) + .forEach(t -> catalog.createTable(t, SCHEMA, PartitionSpec.unpartitioned())); - AssertHelpers.assertThrows("Should fail to drop namespace is not empty " + namespace1, + AssertHelpers.assertThrows( + "Should fail to drop namespace is not empty " + namespace1, NamespaceNotEmptyException.class, - "Namespace " + namespace1 + " is not empty.", () -> { + "Namespace " + namespace1 + " is not empty.", + () -> { catalog.dropNamespace(Namespace.of("db")); }); - Assert.assertFalse("Should fail to drop namespace doesn't exist", - catalog.dropNamespace(Namespace.of("db2"))); + Assert.assertFalse( + "Should fail to drop namespace doesn't exist", catalog.dropNamespace(Namespace.of("db2"))); Assert.assertTrue(catalog.dropTable(tbl1)); Assert.assertTrue(catalog.dropTable(tbl2)); Assert.assertTrue(catalog.dropNamespace(namespace2)); @@ -446,7 +473,8 @@ public void testDropNamespace() throws IOException { public void testVersionHintFileErrorWithFile() throws Exception { addVersionsToTable(table); - HadoopTableOperations tableOperations = (HadoopTableOperations) TABLES.newTableOps(tableLocation); + HadoopTableOperations tableOperations = + (HadoopTableOperations) TABLES.newTableOps(tableLocation); long secondSnapshotId = table.currentSnapshot().snapshotId(); @@ -459,7 +487,8 @@ public void testVersionHintFileErrorWithFile() throws Exception { // Check the result of the findVersion(), and load the table and check the current snapshotId Assert.assertEquals(1, tableOperations.findVersion()); - Assert.assertEquals(secondSnapshotId, TABLES.load(tableLocation).currentSnapshot().snapshotId()); + Assert.assertEquals( + secondSnapshotId, TABLES.load(tableLocation).currentSnapshot().snapshotId()); // Write newer data to confirm that we are writing the correct file io.deleteFile(versionHintFile.getPath()); @@ -469,7 +498,8 @@ public void testVersionHintFileErrorWithFile() throws Exception { // Check the result of the findVersion(), and load the table and check the current snapshotId Assert.assertEquals(3, tableOperations.findVersion()); - Assert.assertEquals(secondSnapshotId, TABLES.load(tableLocation).currentSnapshot().snapshotId()); + Assert.assertEquals( + secondSnapshotId, TABLES.load(tableLocation).currentSnapshot().snapshotId()); // Write an empty version hint file io.deleteFile(versionHintFile.getPath()); @@ -477,21 +507,24 @@ public void testVersionHintFileErrorWithFile() throws Exception { // Check the result of the 
findVersion(), and load the table and check the current snapshotId Assert.assertEquals(3, tableOperations.findVersion()); - Assert.assertEquals(secondSnapshotId, TABLES.load(tableLocation).currentSnapshot().snapshotId()); + Assert.assertEquals( + secondSnapshotId, TABLES.load(tableLocation).currentSnapshot().snapshotId()); // Just delete the file io.deleteFile(versionHintFile.getPath()); // Check the result of the versionHint(), and load the table and check the current snapshotId Assert.assertEquals(3, tableOperations.findVersion()); - Assert.assertEquals(secondSnapshotId, TABLES.load(tableLocation).currentSnapshot().snapshotId()); + Assert.assertEquals( + secondSnapshotId, TABLES.load(tableLocation).currentSnapshot().snapshotId()); } @Test public void testVersionHintFileMissingMetadata() throws Exception { addVersionsToTable(table); - HadoopTableOperations tableOperations = (HadoopTableOperations) TABLES.newTableOps(tableLocation); + HadoopTableOperations tableOperations = + (HadoopTableOperations) TABLES.newTableOps(tableLocation); long secondSnapshotId = table.currentSnapshot().snapshotId(); @@ -504,13 +537,15 @@ public void testVersionHintFileMissingMetadata() throws Exception { // Check the result of the findVersion(), and load the table and check the current snapshotId Assert.assertEquals(3, tableOperations.findVersion()); - Assert.assertEquals(secondSnapshotId, TABLES.load(tableLocation).currentSnapshot().snapshotId()); + Assert.assertEquals( + secondSnapshotId, TABLES.load(tableLocation).currentSnapshot().snapshotId()); // Remove all the version files, and see if we can recover. Hint... not :) io.deleteFile(tableOperations.getMetadataFile(2).toString()); io.deleteFile(tableOperations.getMetadataFile(3).toString()); - // Check that we got 0 findVersion, and a NoSuchTableException is thrown when trying to load the table + // Check that we got 0 findVersion, and a NoSuchTableException is thrown when trying to load the + // table Assert.assertEquals(0, tableOperations.findVersion()); AssertHelpers.assertThrows( "Should not be able to find the table", @@ -523,30 +558,32 @@ public void testVersionHintFileMissingMetadata() throws Exception { public void testTableName() throws Exception { HadoopCatalog catalog = hadoopCatalog(); TableIdentifier tableIdent = TableIdentifier.of("db", "ns1", "ns2", "tbl"); - catalog.buildTable(tableIdent, SCHEMA) - .withPartitionSpec(SPEC) - .create(); + catalog.buildTable(tableIdent, SCHEMA).withPartitionSpec(SPEC).create(); Table table = catalog.loadTable(tableIdent); Assert.assertEquals("Name must match", "hadoop.db.ns1.ns2.tbl", table.name()); - TableIdentifier snapshotsTableIdent = TableIdentifier.of("db", "ns1", "ns2", "tbl", "snapshots"); + TableIdentifier snapshotsTableIdent = + TableIdentifier.of("db", "ns1", "ns2", "tbl", "snapshots"); Table snapshotsTable = catalog.loadTable(snapshotsTableIdent); - Assert.assertEquals("Name must match", "hadoop.db.ns1.ns2.tbl.snapshots", snapshotsTable.name()); + Assert.assertEquals( + "Name must match", "hadoop.db.ns1.ns2.tbl.snapshots", snapshotsTable.name()); } private static void addVersionsToTable(Table table) { - DataFile dataFile1 = DataFiles.builder(SPEC) - .withPath("/a.parquet") - .withFileSizeInBytes(10) - .withRecordCount(1) - .build(); - - DataFile dataFile2 = DataFiles.builder(SPEC) - .withPath("/b.parquet") - .withFileSizeInBytes(10) - .withRecordCount(1) - .build(); + DataFile dataFile1 = + DataFiles.builder(SPEC) + .withPath("/a.parquet") + .withFileSizeInBytes(10) + .withRecordCount(1) + .build(); 
+ + DataFile dataFile2 = + DataFiles.builder(SPEC) + .withPath("/b.parquet") + .withFileSizeInBytes(10) + .withRecordCount(1) + .build(); table.newAppend().appendFile(dataFile1).commit(); table.newAppend().appendFile(dataFile2).commit(); @@ -555,20 +592,23 @@ private static void addVersionsToTable(Table table) { @Test public void testTablePropsDefinedAtCatalogLevel() throws IOException { TableIdentifier tableIdent = TableIdentifier.of("db", "ns1", "ns2", "tbl"); - ImmutableMap catalogProps = ImmutableMap.of( - "table-default.key1", "catalog-default-key1", - "table-default.key2", "catalog-default-key2", - "table-default.key3", "catalog-default-key3", - "table-override.key3", "catalog-override-key3", - "table-override.key4", "catalog-override-key4"); - - Table table = hadoopCatalog(catalogProps).buildTable(tableIdent, SCHEMA) - .withPartitionSpec(SPEC) - .withProperties(null) - .withProperty("key2", "table-key2") - .withProperty("key3", "table-key3") - .withProperty("key5", "table-key5") - .create(); + ImmutableMap catalogProps = + ImmutableMap.of( + "table-default.key1", "catalog-default-key1", + "table-default.key2", "catalog-default-key2", + "table-default.key3", "catalog-default-key3", + "table-override.key3", "catalog-override-key3", + "table-override.key4", "catalog-override-key4"); + + Table table = + hadoopCatalog(catalogProps) + .buildTable(tableIdent, SCHEMA) + .withPartitionSpec(SPEC) + .withProperties(null) + .withProperty("key2", "table-key2") + .withProperty("key3", "table-key3") + .withProperty("key5", "table-key5") + .create(); Assert.assertEquals( "Table defaults set for the catalog must be added to the table properties.", @@ -579,8 +619,8 @@ public void testTablePropsDefinedAtCatalogLevel() throws IOException { "table-key2", table.properties().get("key2")); Assert.assertEquals( - "Table property override set at catalog level must override table default" + - " properties set at catalog level and table property specified.", + "Table property override set at catalog level must override table default" + + " properties set at catalog level and table property specified.", "catalog-override-key3", table.properties().get("key3")); Assert.assertEquals( @@ -588,8 +628,8 @@ public void testTablePropsDefinedAtCatalogLevel() throws IOException { "catalog-override-key4", table.properties().get("key4")); Assert.assertEquals( - "Table properties without any catalog level default or override should be added to table" + - " properties.", + "Table properties without any catalog level default or override should be added to table" + + " properties.", "table-key5", table.properties().get("key5")); } diff --git a/core/src/test/java/org/apache/iceberg/hadoop/TestHadoopCommits.java b/core/src/test/java/org/apache/iceberg/hadoop/TestHadoopCommits.java index 614734ba2d1d..db7d4f719323 100644 --- a/core/src/test/java/org/apache/iceberg/hadoop/TestHadoopCommits.java +++ b/core/src/test/java/org/apache/iceberg/hadoop/TestHadoopCommits.java @@ -16,9 +16,20 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.hadoop; +import static org.apache.iceberg.TableProperties.COMMIT_NUM_RETRIES; +import static org.apache.iceberg.types.Types.NestedField.optional; +import static org.apache.iceberg.types.Types.NestedField.required; +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertFalse; +import static org.junit.Assert.assertTrue; +import static org.junit.Assert.fail; +import static org.mockito.Mockito.any; +import static org.mockito.Mockito.doReturn; +import static org.mockito.Mockito.mock; +import static org.mockito.Mockito.when; + import java.io.File; import java.io.IOException; import java.util.List; @@ -51,46 +62,29 @@ import org.junit.Test; import org.mockito.Mockito; -import static org.apache.iceberg.TableProperties.COMMIT_NUM_RETRIES; -import static org.apache.iceberg.types.Types.NestedField.optional; -import static org.apache.iceberg.types.Types.NestedField.required; -import static org.junit.Assert.assertEquals; -import static org.junit.Assert.assertFalse; -import static org.junit.Assert.assertTrue; -import static org.junit.Assert.fail; -import static org.mockito.Mockito.any; -import static org.mockito.Mockito.doReturn; -import static org.mockito.Mockito.mock; -import static org.mockito.Mockito.when; - public class TestHadoopCommits extends HadoopTableTestBase { @Test public void testCreateTable() throws Exception { - PartitionSpec expectedSpec = PartitionSpec.builderFor(TABLE_SCHEMA) - .bucket("data", 16) - .build(); + PartitionSpec expectedSpec = PartitionSpec.builderFor(TABLE_SCHEMA).bucket("data", 16).build(); - Assert.assertEquals("Table schema should match schema with reassigned ids", - TABLE_SCHEMA.asStruct(), table.schema().asStruct()); - Assert.assertEquals("Table partition spec should match with reassigned ids", - expectedSpec, table.spec()); + Assert.assertEquals( + "Table schema should match schema with reassigned ids", + TABLE_SCHEMA.asStruct(), + table.schema().asStruct()); + Assert.assertEquals( + "Table partition spec should match with reassigned ids", expectedSpec, table.spec()); List tasks = Lists.newArrayList(table.newScan().planFiles()); Assert.assertEquals("Should not create any scan tasks", 0, tasks.size()); - Assert.assertTrue("Table location should exist", - tableDir.exists()); - Assert.assertTrue("Should create metadata folder", - metadataDir.exists() && metadataDir.isDirectory()); - Assert.assertTrue("Should create v1 metadata", - version(1).exists() && version(1).isFile()); - Assert.assertFalse("Should not create v2 or newer versions", - version(2).exists()); - Assert.assertTrue("Should create version hint file", - versionHintFile.exists()); - Assert.assertEquals("Should write the current version to the hint file", - 1, readVersionHint()); + Assert.assertTrue("Table location should exist", tableDir.exists()); + Assert.assertTrue( + "Should create metadata folder", metadataDir.exists() && metadataDir.isDirectory()); + Assert.assertTrue("Should create v1 metadata", version(1).exists() && version(1).isFile()); + Assert.assertFalse("Should not create v2 or newer versions", version(2).exists()); + Assert.assertTrue("Should create version hint file", versionHintFile.exists()); + Assert.assertEquals("Should write the current version to the hint file", 1, readVersionHint()); List manifests = listManifestFiles(); Assert.assertEquals("Should contain 0 Avro manifest files", 0, manifests.size()); @@ -98,22 +92,19 @@ public void testCreateTable() throws Exception { @Test public void testSchemaUpdate() throws Exception { 
- Assert.assertTrue("Should create v1 metadata", - version(1).exists() && version(1).isFile()); - Assert.assertFalse("Should not create v2 or newer versions", - version(2).exists()); + Assert.assertTrue("Should create v1 metadata", version(1).exists() && version(1).isFile()); + Assert.assertFalse("Should not create v2 or newer versions", version(2).exists()); - table.updateSchema() - .addColumn("n", Types.IntegerType.get()) - .commit(); + table.updateSchema().addColumn("n", Types.IntegerType.get()).commit(); - Assert.assertTrue("Should create v2 for the update", - version(2).exists() && version(2).isFile()); - Assert.assertEquals("Should write the current version to the hint file", - 2, readVersionHint()); + Assert.assertTrue( + "Should create v2 for the update", version(2).exists() && version(2).isFile()); + Assert.assertEquals("Should write the current version to the hint file", 2, readVersionHint()); - Assert.assertEquals("Table schema should match schema with reassigned ids", - UPDATED_SCHEMA.asStruct(), table.schema().asStruct()); + Assert.assertEquals( + "Table schema should match schema with reassigned ids", + UPDATED_SCHEMA.asStruct(), + table.schema().asStruct()); List tasks = Lists.newArrayList(table.newScan().planFiles()); Assert.assertEquals("Should not create any scan tasks", 0, tasks.size()); @@ -124,42 +115,44 @@ public void testSchemaUpdate() throws Exception { @Test public void testSchemaUpdateComplexType() throws Exception { - Assert.assertTrue("Should create v1 metadata", - version(1).exists() && version(1).isFile()); - Assert.assertFalse("Should not create v2 or newer versions", - version(2).exists()); - - Types.StructType complexColumn = Types.StructType.of( - required(0, "w", Types.IntegerType.get()), - required(1, "x", Types.StringType.get()), - required(2, "y", Types.BooleanType.get()), - optional(3, "z", Types.MapType.ofOptional( - 0, 1, Types.IntegerType.get(), Types.StringType.get() - )) - ); - Schema updatedSchema = new Schema( - required(1, "id", Types.IntegerType.get(), "unique ID"), - required(2, "data", Types.StringType.get()), - optional(3, "complex", Types.StructType.of( - required(4, "w", Types.IntegerType.get()), - required(5, "x", Types.StringType.get()), - required(6, "y", Types.BooleanType.get()), - optional(7, "z", Types.MapType.ofOptional( - 8, 9, Types.IntegerType.get(), Types.StringType.get() - )) - )) - ); - - table.updateSchema() - .addColumn("complex", complexColumn) - .commit(); - - Assert.assertTrue("Should create v2 for the update", - version(2).exists() && version(2).isFile()); - Assert.assertEquals("Should write the current version to the hint file", - 2, readVersionHint()); - Assert.assertEquals("Table schema should match schema with reassigned ids", - updatedSchema.asStruct(), table.schema().asStruct()); + Assert.assertTrue("Should create v1 metadata", version(1).exists() && version(1).isFile()); + Assert.assertFalse("Should not create v2 or newer versions", version(2).exists()); + + Types.StructType complexColumn = + Types.StructType.of( + required(0, "w", Types.IntegerType.get()), + required(1, "x", Types.StringType.get()), + required(2, "y", Types.BooleanType.get()), + optional( + 3, + "z", + Types.MapType.ofOptional(0, 1, Types.IntegerType.get(), Types.StringType.get()))); + Schema updatedSchema = + new Schema( + required(1, "id", Types.IntegerType.get(), "unique ID"), + required(2, "data", Types.StringType.get()), + optional( + 3, + "complex", + Types.StructType.of( + required(4, "w", Types.IntegerType.get()), + required(5, "x", 
Types.StringType.get()), + required(6, "y", Types.BooleanType.get()), + optional( + 7, + "z", + Types.MapType.ofOptional( + 8, 9, Types.IntegerType.get(), Types.StringType.get()))))); + + table.updateSchema().addColumn("complex", complexColumn).commit(); + + Assert.assertTrue( + "Should create v2 for the update", version(2).exists() && version(2).isFile()); + Assert.assertEquals("Should write the current version to the hint file", 2, readVersionHint()); + Assert.assertEquals( + "Table schema should match schema with reassigned ids", + updatedSchema.asStruct(), + table.schema().asStruct()); List tasks = Lists.newArrayList(table.newScan().planFiles()); Assert.assertEquals("Should not create any scan tasks", 0, tasks.size()); @@ -170,28 +163,29 @@ public void testSchemaUpdateComplexType() throws Exception { @Test public void testSchemaUpdateIdentifierFields() throws Exception { - Assert.assertTrue("Should create v1 metadata", - version(1).exists() && version(1).isFile()); - Assert.assertFalse("Should not create v2 or newer versions", - version(2).exists()); - - Schema updatedSchema = new Schema(Lists.newArrayList( - required(1, "id", Types.IntegerType.get(), "unique ID"), - required(2, "data", Types.StringType.get()) - ), Sets.newHashSet(1)); - - table.updateSchema() - .setIdentifierFields("id") - .commit(); - - Assert.assertTrue("Should create v2 for the update", - version(2).exists() && version(2).isFile()); - Assert.assertEquals("Should write the current version to the hint file", - 2, readVersionHint()); - Assert.assertEquals("Table schema should match schema with reassigned ids", - updatedSchema.asStruct(), table.schema().asStruct()); - Assert.assertEquals("Identifier fields should match schema with reassigned ids", - updatedSchema.identifierFieldIds(), table.schema().identifierFieldIds()); + Assert.assertTrue("Should create v1 metadata", version(1).exists() && version(1).isFile()); + Assert.assertFalse("Should not create v2 or newer versions", version(2).exists()); + + Schema updatedSchema = + new Schema( + Lists.newArrayList( + required(1, "id", Types.IntegerType.get(), "unique ID"), + required(2, "data", Types.StringType.get())), + Sets.newHashSet(1)); + + table.updateSchema().setIdentifierFields("id").commit(); + + Assert.assertTrue( + "Should create v2 for the update", version(2).exists() && version(2).isFile()); + Assert.assertEquals("Should write the current version to the hint file", 2, readVersionHint()); + Assert.assertEquals( + "Table schema should match schema with reassigned ids", + updatedSchema.asStruct(), + table.schema().asStruct()); + Assert.assertEquals( + "Identifier fields should match schema with reassigned ids", + updatedSchema.identifierFieldIds(), + table.schema().identifierFieldIds()); } @Test @@ -200,15 +194,16 @@ public void testFailedCommit() throws Exception { UpdateSchema update = table.updateSchema().addColumn("n", Types.IntegerType.get()); update.apply(); - Assert.assertTrue("Should create v1 metadata", - version(1).exists() && version(1).isFile()); - Assert.assertFalse("Should not create v2 or newer versions", - version(2).exists()); + Assert.assertTrue("Should create v1 metadata", version(1).exists() && version(1).isFile()); + Assert.assertFalse("Should not create v2 or newer versions", version(2).exists()); version(2).createNewFile(); - AssertHelpers.assertThrows("Should fail to commit change based on v1 when v2 exists", - CommitFailedException.class, "Version 2 already exists", update::commit); + AssertHelpers.assertThrows( + "Should fail to commit 
change based on v1 when v2 exists", + CommitFailedException.class, + "Version 2 already exists", + update::commit); List manifests = listManifestFiles(); Assert.assertEquals("Should contain 0 Avro manifest files", 0, manifests.size()); @@ -218,33 +213,33 @@ public void testFailedCommit() throws Exception { public void testStaleMetadata() throws Exception { Table tableCopy = TABLES.load(tableLocation); - Assert.assertTrue("Should create v1 metadata", - version(1).exists() && version(1).isFile()); - Assert.assertFalse("Should not create v2 or newer versions", - version(2).exists()); + Assert.assertTrue("Should create v1 metadata", version(1).exists() && version(1).isFile()); + Assert.assertFalse("Should not create v2 or newer versions", version(2).exists()); // prepare changes on the copy without committing - UpdateSchema updateCopy = tableCopy.updateSchema() - .addColumn("m", Types.IntegerType.get()); + UpdateSchema updateCopy = tableCopy.updateSchema().addColumn("m", Types.IntegerType.get()); updateCopy.apply(); - table.updateSchema() - .addColumn("n", Types.IntegerType.get()) - .commit(); + table.updateSchema().addColumn("n", Types.IntegerType.get()).commit(); - Assert.assertTrue("Should create v2 for the update", - version(2).exists() && version(2).isFile()); - Assert.assertNotEquals("Unmodified copy should be out of date after update", - table.schema().asStruct(), tableCopy.schema().asStruct()); + Assert.assertTrue( + "Should create v2 for the update", version(2).exists() && version(2).isFile()); + Assert.assertNotEquals( + "Unmodified copy should be out of date after update", + table.schema().asStruct(), + tableCopy.schema().asStruct()); // update the table tableCopy.refresh(); - Assert.assertEquals("Copy should be back in sync", - table.schema().asStruct(), tableCopy.schema().asStruct()); + Assert.assertEquals( + "Copy should be back in sync", table.schema().asStruct(), tableCopy.schema().asStruct()); - AssertHelpers.assertThrows("Should fail with stale base metadata", - CommitFailedException.class, "based on stale table metadata", updateCopy::commit); + AssertHelpers.assertThrows( + "Should fail with stale base metadata", + CommitFailedException.class, + "based on stale table metadata", + updateCopy::commit); List manifests = listManifestFiles(); Assert.assertEquals("Should contain 0 Avro manifest files", 0, manifests.size()); @@ -254,46 +249,44 @@ public void testStaleMetadata() throws Exception { public void testStaleVersionHint() throws Exception { Table stale = TABLES.load(tableLocation); - Assert.assertTrue("Should create v1 metadata", - version(1).exists() && version(1).isFile()); - Assert.assertFalse("Should not create v2 or newer versions", - version(2).exists()); + Assert.assertTrue("Should create v1 metadata", version(1).exists() && version(1).isFile()); + Assert.assertFalse("Should not create v2 or newer versions", version(2).exists()); - table.updateSchema() - .addColumn("n", Types.IntegerType.get()) - .commit(); + table.updateSchema().addColumn("n", Types.IntegerType.get()).commit(); - Assert.assertTrue("Should create v2 for the update", - version(2).exists() && version(2).isFile()); - Assert.assertEquals("Should write the current version to the hint file", - 2, readVersionHint()); + Assert.assertTrue( + "Should create v2 for the update", version(2).exists() && version(2).isFile()); + Assert.assertEquals("Should write the current version to the hint file", 2, readVersionHint()); - Assert.assertNotEquals("Stable table schema should not match", - UPDATED_SCHEMA.asStruct(), 
stale.schema().asStruct()); + Assert.assertNotEquals( + "Stable table schema should not match", + UPDATED_SCHEMA.asStruct(), + stale.schema().asStruct()); // roll the version hint back to 1 replaceVersionHint(1); Table reloaded = TABLES.load(tableLocation); - Assert.assertEquals("Updated schema for newly loaded table should match", - UPDATED_SCHEMA.asStruct(), reloaded.schema().asStruct()); + Assert.assertEquals( + "Updated schema for newly loaded table should match", + UPDATED_SCHEMA.asStruct(), + reloaded.schema().asStruct()); stale.refresh(); - Assert.assertEquals("Refreshed schema for stale table should match", - UPDATED_SCHEMA.asStruct(), reloaded.schema().asStruct()); + Assert.assertEquals( + "Refreshed schema for stale table should match", + UPDATED_SCHEMA.asStruct(), + reloaded.schema().asStruct()); } @Test public void testFastAppend() throws Exception { // first append - table.newFastAppend() - .appendFile(FILE_A) - .commit(); + table.newFastAppend().appendFile(FILE_A).commit(); - Assert.assertTrue("Should create v2 for the update", - version(2).exists() && version(2).isFile()); - Assert.assertEquals("Should write the current version to the hint file", - 2, readVersionHint()); + Assert.assertTrue( + "Should create v2 for the update", version(2).exists() && version(2).isFile()); + Assert.assertEquals("Should write the current version to the hint file", 2, readVersionHint()); List tasks = Lists.newArrayList(table.newScan().planFiles()); Assert.assertEquals("Should scan 1 file", 1, tasks.size()); @@ -302,24 +295,22 @@ public void testFastAppend() throws Exception { Assert.assertEquals("Should contain only one Avro manifest file", 1, manifests.size()); // second append - table.newFastAppend() - .appendFile(FILE_B) - .commit(); + table.newFastAppend().appendFile(FILE_B).commit(); - Assert.assertTrue("Should create v3 for the update", - version(3).exists() && version(3).isFile()); - Assert.assertEquals("Should write the current version to the hint file", - 3, readVersionHint()); + Assert.assertTrue( + "Should create v3 for the update", version(3).exists() && version(3).isFile()); + Assert.assertEquals("Should write the current version to the hint file", 3, readVersionHint()); tasks = Lists.newArrayList(table.newScan().planFiles()); Assert.assertEquals("Should scan 2 files", 2, tasks.size()); - Assert.assertEquals("Should contain 2 Avro manifest files", - 2, listManifestFiles().size()); + Assert.assertEquals("Should contain 2 Avro manifest files", 2, listManifestFiles().size()); TableMetadata metadata = readMetadataVersion(3); - Assert.assertEquals("Current snapshot should contain 2 manifests", - 2, metadata.currentSnapshot().allManifests(table.io()).size()); + Assert.assertEquals( + "Current snapshot should contain 2 manifests", + 2, + metadata.currentSnapshot().allManifests(table.io()).size()); } @Test @@ -330,19 +321,18 @@ public void testMergeAppend() throws Exception { table.updateProperties().set("commit.manifest.min-count-to-merge", "1").commit(); // third append - table.newAppend() - .appendFile(FILE_C) - .commit(); + table.newAppend().appendFile(FILE_C).commit(); List tasks = Lists.newArrayList(table.newScan().planFiles()); Assert.assertEquals("Should scan 3 files", 3, tasks.size()); - Assert.assertEquals("Should contain 3 Avro manifest files", - 3, listManifestFiles().size()); + Assert.assertEquals("Should contain 3 Avro manifest files", 3, listManifestFiles().size()); TableMetadata metadata = readMetadataVersion(5); - Assert.assertEquals("Current snapshot should contain 1 
merged manifest", - 1, metadata.currentSnapshot().allManifests(table.io()).size()); + Assert.assertEquals( + "Current snapshot should contain 1 merged manifest", + 1, + metadata.currentSnapshot().allManifests(table.io()).size()); } @Test @@ -362,28 +352,23 @@ public void testRenameThrow() throws Exception { } /** - * Test rename during {@link HadoopTableOperations#commit(TableMetadata, TableMetadata)} with the provided - * {@link FileSystem} object. The provided FileSystem will be injected for commit call. + * Test rename during {@link HadoopTableOperations#commit(TableMetadata, TableMetadata)} with the + * provided {@link FileSystem} object. The provided FileSystem will be injected for commit call. */ private void testRenameWithFileSystem(FileSystem mockFs) throws Exception { - assertTrue("Should create v1 metadata", - version(1).exists() && version(1).isFile()); - assertFalse("Should not create v2 or newer versions", - version(2).exists()); + assertTrue("Should create v1 metadata", version(1).exists() && version(1).isFile()); + assertFalse("Should not create v2 or newer versions", version(2).exists()); assertTrue(table instanceof BaseTable); BaseTable baseTable = (BaseTable) table; // use v1 metafile as the test rename destination. TableMetadata meta1 = baseTable.operations().current(); // create v2 metafile as base. This is solely for the convenience of rename testing later - // (so that we have 2 valid and different metadata files, which will reach the rename part during commit) - table.updateSchema() - .addColumn("n", Types.IntegerType.get()) - .commit(); - assertTrue("Should create v2 for the update", - version(2).exists() && version(2).isFile()); - assertEquals("Should write the current version to the hint file", - 2, readVersionHint()); + // (so that we have 2 valid and different metadata files, which will reach the rename part + // during commit) + table.updateSchema().addColumn("n", Types.IntegerType.get()).commit(); + assertTrue("Should create v2 for the update", version(2).exists() && version(2).isFile()); + assertEquals("Should write the current version to the hint file", 2, readVersionHint()); // mock / spy the classes for testing TableOperations tops = baseTable.operations(); @@ -399,29 +384,29 @@ private void testRenameWithFileSystem(FileSystem mockFs) throws Exception { } // Verifies that there is no temporary metadata.json files left on rename failures. 
- Set actual = listMetadataJsonFiles().stream().map(File::getName).collect(Collectors.toSet()); + Set actual = + listMetadataJsonFiles().stream().map(File::getName).collect(Collectors.toSet()); Set expected = Sets.newHashSet("v1.metadata.json", "v2.metadata.json"); assertEquals("only v1 and v2 metadata.json should exist.", expected, actual); } @Test public void testCanReadOldCompressedManifestFiles() throws Exception { - assertTrue("Should create v1 metadata", - version(1).exists() && version(1).isFile()); + assertTrue("Should create v1 metadata", version(1).exists() && version(1).isFile()); // do a file append - table.newAppend() - .appendFile(FILE_A) - .commit(); + table.newAppend().appendFile(FILE_A).commit(); - // since we don't generate old file extensions anymore, let's convert existing metadata to old .metadata.json.gz + // since we don't generate old file extensions anymore, let's convert existing metadata to old + // .metadata.json.gz // to test backwards compatibility rewriteMetadataAsGzipWithOldExtension(); List metadataFiles = listMetadataJsonFiles(); assertEquals("Should have two versions", 2, metadataFiles.size()); - assertTrue("Metadata should be compressed with old format.", + assertTrue( + "Metadata should be compressed with old format.", metadataFiles.stream().allMatch(f -> f.getName().endsWith(".metadata.json.gz"))); Table reloaded = TABLES.load(tableLocation); @@ -437,39 +422,47 @@ public void testConcurrentFastAppends() throws Exception { dir.delete(); int threadsCount = 5; int numberOfCommitedFilesPerThread = 10; - Table tableWithHighRetries = TABLES.create(SCHEMA, SPEC, - ImmutableMap.of(COMMIT_NUM_RETRIES, String.valueOf(threadsCount)), dir.toURI().toString()); + Table tableWithHighRetries = + TABLES.create( + SCHEMA, + SPEC, + ImmutableMap.of(COMMIT_NUM_RETRIES, String.valueOf(threadsCount)), + dir.toURI().toString()); String fileName = UUID.randomUUID().toString(); - DataFile file = DataFiles.builder(tableWithHighRetries.spec()) - .withPath(FileFormat.PARQUET.addExtension(fileName)) - .withRecordCount(2) - .withFileSizeInBytes(0) - .build(); + DataFile file = + DataFiles.builder(tableWithHighRetries.spec()) + .withPath(FileFormat.PARQUET.addExtension(fileName)) + .withRecordCount(2) + .withFileSizeInBytes(0) + .build(); ExecutorService executorService = Executors.newFixedThreadPool(threadsCount); AtomicInteger barrier = new AtomicInteger(0); - Tasks - .range(threadsCount) + Tasks.range(threadsCount) .stopOnFailure() .throwFailureWhenFinished() .executeWith(executorService) - .run(index -> { - for (int numCommittedFiles = 0; numCommittedFiles < numberOfCommitedFilesPerThread; numCommittedFiles++) { - while (barrier.get() < numCommittedFiles * threadsCount) { - try { - Thread.sleep(10); - } catch (InterruptedException e) { - throw new RuntimeException(e); + .run( + index -> { + for (int numCommittedFiles = 0; + numCommittedFiles < numberOfCommitedFilesPerThread; + numCommittedFiles++) { + while (barrier.get() < numCommittedFiles * threadsCount) { + try { + Thread.sleep(10); + } catch (InterruptedException e) { + throw new RuntimeException(e); + } + } + tableWithHighRetries.newFastAppend().appendFile(file).commit(); + barrier.incrementAndGet(); } - } - tableWithHighRetries.newFastAppend().appendFile(file).commit(); - barrier.incrementAndGet(); - } - }); + }); tableWithHighRetries.refresh(); - assertEquals(threadsCount * numberOfCommitedFilesPerThread, + assertEquals( + threadsCount * numberOfCommitedFilesPerThread, 
Lists.newArrayList(tableWithHighRetries.snapshots()).size()); } } diff --git a/core/src/test/java/org/apache/iceberg/hadoop/TestHadoopTables.java b/core/src/test/java/org/apache/iceberg/hadoop/TestHadoopTables.java index 56145bb0f798..ef7c8252a6f5 100644 --- a/core/src/test/java/org/apache/iceberg/hadoop/TestHadoopTables.java +++ b/core/src/test/java/org/apache/iceberg/hadoop/TestHadoopTables.java @@ -16,9 +16,12 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.hadoop; +import static org.apache.iceberg.NullOrder.NULLS_FIRST; +import static org.apache.iceberg.SortDirection.ASC; +import static org.apache.iceberg.types.Types.NestedField.required; + import java.io.File; import java.io.IOException; import java.nio.charset.StandardCharsets; @@ -44,20 +47,15 @@ import org.junit.Test; import org.junit.rules.TemporaryFolder; -import static org.apache.iceberg.NullOrder.NULLS_FIRST; -import static org.apache.iceberg.SortDirection.ASC; -import static org.apache.iceberg.types.Types.NestedField.required; - public class TestHadoopTables { private static final HadoopTables TABLES = new HadoopTables(); - private static final Schema SCHEMA = new Schema( - required(1, "id", Types.IntegerType.get(), "unique ID"), - required(2, "data", Types.StringType.get()) - ); + private static final Schema SCHEMA = + new Schema( + required(1, "id", Types.IntegerType.get(), "unique ID"), + required(2, "data", Types.StringType.get())); - @Rule - public TemporaryFolder temp = new TemporaryFolder(); + @Rule public TemporaryFolder temp = new TemporaryFolder(); private File tableDir = null; @Before @@ -68,9 +66,7 @@ public void setupTableLocation() throws Exception { @Test public void testTableExists() { Assert.assertFalse(TABLES.exists(tableDir.toURI().toString())); - PartitionSpec spec = PartitionSpec.builderFor(SCHEMA) - .bucket("data", 16) - .build(); + PartitionSpec spec = PartitionSpec.builderFor(SCHEMA).bucket("data", 16).build(); TABLES.create(SCHEMA, spec, tableDir.toURI().toString()); Assert.assertTrue(TABLES.exists(tableDir.toURI().toString())); } @@ -80,8 +76,10 @@ public void testDropTable() { TABLES.create(SCHEMA, tableDir.toURI().toString()); TABLES.dropTable(tableDir.toURI().toString()); AssertHelpers.assertThrows( - "Should complain about missing table", NoSuchTableException.class, - "Table does not exist", () -> TABLES.load(tableDir.toURI().toString())); + "Should complain about missing table", + NoSuchTableException.class, + "Table does not exist", + () -> TABLES.load(tableDir.toURI().toString())); } @Test @@ -92,8 +90,10 @@ public void testDropTableWithPurge() throws IOException { TABLES.dropTable(tableDir.toURI().toString(), true); AssertHelpers.assertThrows( - "Should complain about missing table", NoSuchTableException.class, - "Table does not exist", () -> TABLES.load(tableDir.toURI().toString())); + "Should complain about missing table", + NoSuchTableException.class, + "Table does not exist", + () -> TABLES.load(tableDir.toURI().toString())); Assert.assertEquals(0, dataDir.listFiles().length); Assert.assertFalse(tableDir.exists()); @@ -109,8 +109,10 @@ public void testDropTableWithoutPurge() throws IOException { TABLES.dropTable(tableDir.toURI().toString(), false); AssertHelpers.assertThrows( - "Should complain about missing table", NoSuchTableException.class, - "Table does not exist", () -> TABLES.load(tableDir.toURI().toString())); + "Should complain about missing table", + NoSuchTableException.class, + "Table does not exist", + () -> 
TABLES.load(tableDir.toURI().toString())); Assert.assertEquals(1, dataDir.listFiles().length); Assert.assertFalse(tableDir.exists()); @@ -120,9 +122,7 @@ public void testDropTableWithoutPurge() throws IOException { @Test public void testDefaultSortOrder() { - PartitionSpec spec = PartitionSpec.builderFor(SCHEMA) - .bucket("data", 16) - .build(); + PartitionSpec spec = PartitionSpec.builderFor(SCHEMA).bucket("data", 16).build(); Table table = TABLES.create(SCHEMA, spec, tableDir.toURI().toString()); SortOrder sortOrder = table.sortOrder(); @@ -132,28 +132,24 @@ public void testDefaultSortOrder() { @Test public void testCustomSortOrder() { - PartitionSpec spec = PartitionSpec.builderFor(SCHEMA) - .bucket("data", 16) - .build(); - SortOrder order = SortOrder.builderFor(SCHEMA) - .asc("id", NULLS_FIRST) - .build(); - Table table = TABLES.create(SCHEMA, spec, order, Maps.newHashMap(), tableDir.toURI().toString()); + PartitionSpec spec = PartitionSpec.builderFor(SCHEMA).bucket("data", 16).build(); + SortOrder order = SortOrder.builderFor(SCHEMA).asc("id", NULLS_FIRST).build(); + Table table = + TABLES.create(SCHEMA, spec, order, Maps.newHashMap(), tableDir.toURI().toString()); SortOrder sortOrder = table.sortOrder(); Assert.assertEquals("Order ID must match", 1, sortOrder.orderId()); Assert.assertEquals("Order must have 1 field", 1, sortOrder.fields().size()); Assert.assertEquals("Direction must match ", ASC, sortOrder.fields().get(0).direction()); - Assert.assertEquals("Null order must match ", NULLS_FIRST, sortOrder.fields().get(0).nullOrder()); + Assert.assertEquals( + "Null order must match ", NULLS_FIRST, sortOrder.fields().get(0).nullOrder()); Transform transform = Transforms.identity(Types.IntegerType.get()); Assert.assertEquals("Transform must match", transform, sortOrder.fields().get(0).transform()); } @Test public void testTableName() { - PartitionSpec spec = PartitionSpec.builderFor(SCHEMA) - .bucket("data", 16) - .build(); + PartitionSpec spec = PartitionSpec.builderFor(SCHEMA).bucket("data", 16).build(); String location = tableDir.toURI().toString(); TABLES.create(SCHEMA, spec, location); @@ -169,11 +165,12 @@ private static void createDummyTable(File tableDir, File dataDir) throws IOExcep AppendFiles append = table.newAppend(); String data = dataDir.getPath() + "/data.parquet"; Files.write(Paths.get(data), Lists.newArrayList(), StandardCharsets.UTF_8); - DataFile dataFile = DataFiles.builder(PartitionSpec.unpartitioned()) - .withPath(data) - .withFileSizeInBytes(10) - .withRecordCount(1) - .build(); + DataFile dataFile = + DataFiles.builder(PartitionSpec.unpartitioned()) + .withPath(data) + .withFileSizeInBytes(10) + .withRecordCount(1) + .build(); append.appendFile(dataFile); append.commit(); diff --git a/core/src/test/java/org/apache/iceberg/hadoop/TestStaticTable.java b/core/src/test/java/org/apache/iceberg/hadoop/TestStaticTable.java index 4569951458f3..9a73ef9e1500 100644 --- a/core/src/test/java/org/apache/iceberg/hadoop/TestStaticTable.java +++ b/core/src/test/java/org/apache/iceberg/hadoop/TestStaticTable.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.hadoop; import org.apache.iceberg.AssertHelpers; @@ -35,20 +34,24 @@ private Table getStaticTable() { } private Table getStaticTable(MetadataTableType type) { - return TABLES.load(((HasTableOperations) table).operations().current().metadataFileLocation() + "#" + type); + return TABLES.load( + ((HasTableOperations) table).operations().current().metadataFileLocation() + "#" + type); } @Test public void testLoadFromMetadata() { Table staticTable = getStaticTable(); - Assert.assertTrue("Loading a metadata file based table should return StaticTableOperations", + Assert.assertTrue( + "Loading a metadata file based table should return StaticTableOperations", ((HasTableOperations) staticTable).operations() instanceof StaticTableOperations); } @Test public void testCannotBeAddedTo() { Table staticTable = getStaticTable(); - AssertHelpers.assertThrows("Cannot modify a static table", UnsupportedOperationException.class, + AssertHelpers.assertThrows( + "Cannot modify a static table", + UnsupportedOperationException.class, () -> staticTable.newOverwrite().addFile(FILE_A).commit()); } @@ -56,7 +59,9 @@ public void testCannotBeAddedTo() { public void testCannotBeDeletedFrom() { table.newAppend().appendFile(FILE_A).commit(); Table staticTable = getStaticTable(); - AssertHelpers.assertThrows("Cannot modify a static table", UnsupportedOperationException.class, + AssertHelpers.assertThrows( + "Cannot modify a static table", + UnsupportedOperationException.class, () -> staticTable.newDelete().deleteFile(FILE_A).commit()); } @@ -66,7 +71,8 @@ public void testCannotDoIncrementalScanOnMetadataTable() { for (MetadataTableType type : MetadataTableType.values()) { Table staticTable = getStaticTable(type); - AssertHelpers.assertThrows("Static tables do not support incremental scans", + AssertHelpers.assertThrows( + "Static tables do not support incremental scans", UnsupportedOperationException.class, String.format("Cannot incrementally scan table of type %s", type), () -> staticTable.newScan().appendsAfter(1)); @@ -79,11 +85,12 @@ public void testHasSameProperties() { table.newAppend().appendFile(FILE_B).commit(); table.newOverwrite().deleteFile(FILE_B).addFile(FILE_C).commit(); Table staticTable = getStaticTable(); - Assert.assertTrue("Same history?", - table.history().containsAll(staticTable.history())); - Assert.assertTrue("Same snapshot?", - table.currentSnapshot().snapshotId() == staticTable.currentSnapshot().snapshotId()); - Assert.assertTrue("Same properties?", + Assert.assertTrue("Same history?", table.history().containsAll(staticTable.history())); + Assert.assertTrue( + "Same snapshot?", + table.currentSnapshot().snapshotId() == staticTable.currentSnapshot().snapshotId()); + Assert.assertTrue( + "Same properties?", Maps.difference(table.properties(), staticTable.properties()).areEqual()); } @@ -97,18 +104,19 @@ public void testImmutable() { table.newOverwrite().deleteFile(FILE_B).addFile(FILE_C).commit(); staticTable.refresh(); - Assert.assertEquals("Snapshot unchanged after table modified", - staticTable.currentSnapshot().snapshotId(), originalSnapshot); + Assert.assertEquals( + "Snapshot unchanged after table modified", + staticTable.currentSnapshot().snapshotId(), + originalSnapshot); } @Test public void testMetadataTables() { for (MetadataTableType type : MetadataTableType.values()) { String enumName = type.name().replace("_", "").toLowerCase(); - Assert.assertTrue("Should be able to get MetadataTable of type : " + type, + Assert.assertTrue( + "Should be able to get 
MetadataTable of type : " + type, getStaticTable(type).getClass().getName().toLowerCase().contains(enumName)); } } - - } diff --git a/core/src/test/java/org/apache/iceberg/hadoop/TestTableSerialization.java b/core/src/test/java/org/apache/iceberg/hadoop/TestTableSerialization.java index 4159ce6cd25c..6fa0a901afb5 100644 --- a/core/src/test/java/org/apache/iceberg/hadoop/TestTableSerialization.java +++ b/core/src/test/java/org/apache/iceberg/hadoop/TestTableSerialization.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.hadoop; import java.io.ByteArrayInputStream; @@ -45,42 +44,26 @@ public class TestTableSerialization extends HadoopTableTestBase { @Test public void testSerializableTable() throws IOException, ClassNotFoundException { - table.replaceSortOrder() - .asc("id") - .commit(); + table.replaceSortOrder().asc("id").commit(); - table.updateProperties() - .set("k1", "v1") - .set("k2", "v2") - .commit(); + table.updateProperties().set("k1", "v1").set("k2", "v2").commit(); - table.updateSchema() - .addColumn("new_col", Types.IntegerType.get()) - .commit(); + table.updateSchema().addColumn("new_col", Types.IntegerType.get()).commit(); TestHelpers.assertSerializedAndLoadedMetadata(table, TestHelpers.roundTripSerialize(table)); } @Test public void testSerializableTxnTable() throws IOException, ClassNotFoundException { - table.replaceSortOrder() - .asc("id") - .commit(); + table.replaceSortOrder().asc("id").commit(); - table.updateProperties() - .set("k1", "v1") - .set("k2", "v2") - .commit(); + table.updateProperties().set("k1", "v1").set("k2", "v2").commit(); - table.updateSchema() - .addColumn("new_col", Types.IntegerType.get()) - .commit(); + table.updateSchema().addColumn("new_col", Types.IntegerType.get()).commit(); Transaction txn = table.newTransaction(); - txn.updateProperties() - .set("k3", "v3") - .commit(); + txn.updateProperties().set("k3", "v3").commit(); // txn tables have metadata locations as null so we check only serialized metadata TestHelpers.assertSerializedMetadata(txn.table(), TestHelpers.roundTripSerialize(txn.table())); @@ -90,23 +73,20 @@ public void testSerializableTxnTable() throws IOException, ClassNotFoundExceptio public void testSerializableMetadataTable() throws IOException, ClassNotFoundException { for (MetadataTableType type : MetadataTableType.values()) { Table metadataTable = getMetaDataTable(table, type); - TestHelpers.assertSerializedAndLoadedMetadata(metadataTable, TestHelpers.roundTripSerialize(metadataTable)); + TestHelpers.assertSerializedAndLoadedMetadata( + metadataTable, TestHelpers.roundTripSerialize(metadataTable)); } } @Test public void testSerializableTablePlanning() throws IOException { - table.newAppend() - .appendFile(FILE_A) - .commit(); + table.newAppend().appendFile(FILE_A).commit(); byte[] serialized = serializeToBytes(table); Set expected = getFiles(table); - table.newAppend() - .appendFile(FILE_B) - .commit(); + table.newAppend().appendFile(FILE_B).commit(); Table deserialized = deserializeFromBytes(serialized); @@ -121,13 +101,9 @@ public void testSerializableTablePlanning() throws IOException { @Test public void testSerializableMetadataTablesPlanning() throws IOException { - table.updateProperties() - .set(TableProperties.FORMAT_VERSION, "2") - .commit(); + table.updateProperties().set(TableProperties.FORMAT_VERSION, "2").commit(); - table.newAppend() - .appendFile(FILE_A) - .commit(); + table.newAppend().appendFile(FILE_A).commit(); Map serialized = 
Maps.newHashMap(); Map> expected = Maps.newHashMap(); @@ -140,12 +116,8 @@ public void testSerializableMetadataTablesPlanning() throws IOException { expected.put(type, getFiles(metaTable)); } - table.newAppend() - .appendFile(FILE_B) - .commit(); - table.newRowDelta() - .addDeletes(FILE_B_DELETES) - .commit(); + table.newAppend().appendFile(FILE_B).commit(); + table.newRowDelta().addDeletes(FILE_B_DELETES).commit(); for (MetadataTableType type : MetadataTableType.values()) { // Collect the deserialized data @@ -163,7 +135,8 @@ public void testSerializableMetadataTablesPlanning() throws IOException { } private static Table getMetaDataTable(Table table, MetadataTableType type) { - return TABLES.load(((HasTableOperations) table).operations().current().metadataFileLocation() + "#" + type); + return TABLES.load( + ((HasTableOperations) table).operations().current().metadataFileLocation() + "#" + type); } private static Set getFiles(Table table) throws IOException { @@ -179,7 +152,7 @@ private static Set getFiles(Table table) throws IOException { private static byte[] serializeToBytes(Object obj) { try (ByteArrayOutputStream baos = new ByteArrayOutputStream(); - ObjectOutputStream oos = new ObjectOutputStream(baos)) { + ObjectOutputStream oos = new ObjectOutputStream(baos)) { oos.writeObject(obj); return baos.toByteArray(); } catch (IOException e) { @@ -194,7 +167,7 @@ private static T deserializeFromBytes(byte[] bytes) { } try (ByteArrayInputStream bais = new ByteArrayInputStream(bytes); - ObjectInputStream ois = new ObjectInputStream(bais)) { + ObjectInputStream ois = new ObjectInputStream(bais)) { return (T) ois.readObject(); } catch (IOException e) { throw new UncheckedIOException("Failed to deserialize object", e); diff --git a/core/src/test/java/org/apache/iceberg/io/InMemoryInputFile.java b/core/src/test/java/org/apache/iceberg/io/InMemoryInputFile.java index fb4d52a52115..dc1311c1b952 100644 --- a/core/src/test/java/org/apache/iceberg/io/InMemoryInputFile.java +++ b/core/src/test/java/org/apache/iceberg/io/InMemoryInputFile.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.io; import java.io.ByteArrayInputStream; @@ -81,8 +80,11 @@ public long getPos() throws IOException { public void seek(long newPos) throws IOException { checkOpen(); delegate.reset(); // resets to a marked position - Preconditions.checkState(delegate.skip(newPos) == newPos, - "Invalid position %s within stream of length %s", newPos, length); + Preconditions.checkState( + delegate.skip(newPos) == newPos, + "Invalid position %s within stream of length %s", + newPos, + length); } @Override @@ -139,7 +141,8 @@ public void close() throws IOException { } private void checkOpen() { - // ByteArrayInputStream can be used even after close, so for test purposes disallow such use explicitly + // ByteArrayInputStream can be used even after close, so for test purposes disallow such use + // explicitly Preconditions.checkState(!closed, "Stream is closed"); } } diff --git a/core/src/test/java/org/apache/iceberg/io/InMemoryOutputFile.java b/core/src/test/java/org/apache/iceberg/io/InMemoryOutputFile.java index 3f03470d64ab..e8740b125fa3 100644 --- a/core/src/test/java/org/apache/iceberg/io/InMemoryOutputFile.java +++ b/core/src/test/java/org/apache/iceberg/io/InMemoryOutputFile.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.io; import java.io.ByteArrayOutputStream; @@ -116,7 +115,8 @@ public void close() throws IOException { } private void checkOpen() { - // ByteArrayOutputStream can be used even after close, so for test purposes disallow such use explicitly + // ByteArrayOutputStream can be used even after close, so for test purposes disallow such use + // explicitly Preconditions.checkState(!closed, "Stream is closed"); } } diff --git a/core/src/test/java/org/apache/iceberg/io/MockInputStream.java b/core/src/test/java/org/apache/iceberg/io/MockInputStream.java index 7cc251ebd641..17a6d537b267 100644 --- a/core/src/test/java/org/apache/iceberg/io/MockInputStream.java +++ b/core/src/test/java/org/apache/iceberg/io/MockInputStream.java @@ -16,14 +16,13 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.io; import java.io.ByteArrayInputStream; class MockInputStream extends ByteArrayInputStream { - static final byte[] TEST_ARRAY = new byte[] { 1, 2, 3, 4, 5, 6, 7, 8, 9, 10 }; + static final byte[] TEST_ARRAY = new byte[] {1, 2, 3, 4, 5, 6, 7, 8, 9, 10}; private int[] lengths; private int current = 0; @@ -55,4 +54,3 @@ public long getPos() { return this.pos; } } - diff --git a/core/src/test/java/org/apache/iceberg/io/TestByteBufferInputStreams.java b/core/src/test/java/org/apache/iceberg/io/TestByteBufferInputStreams.java index 3bab66b81575..59a23865ba5c 100644 --- a/core/src/test/java/org/apache/iceberg/io/TestByteBufferInputStreams.java +++ b/core/src/test/java/org/apache/iceberg/io/TestByteBufferInputStreams.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.io; import java.io.EOFException; @@ -33,6 +32,7 @@ public abstract class TestByteBufferInputStreams { protected abstract ByteBufferInputStream newStream(); + protected abstract void checkOriginalData(); @Test @@ -46,8 +46,7 @@ public void testRead0() throws Exception { int bytesRead = stream.read(new byte[100]); Assert.assertTrue("Should read to end of stream", bytesRead < 100); - Assert.assertEquals("Should read 0 bytes at end of stream", - 0, stream.read(bytes)); + Assert.assertEquals("Should read 0 bytes at end of stream", 0, stream.read(bytes)); } @Test @@ -57,22 +56,18 @@ public void testReadAll() throws Exception { ByteBufferInputStream stream = newStream(); int bytesRead = stream.read(bytes); - Assert.assertEquals("Should read the entire buffer", - bytes.length, bytesRead); + Assert.assertEquals("Should read the entire buffer", bytes.length, bytesRead); for (int i = 0; i < bytes.length; i += 1) { Assert.assertEquals("Byte i should be i", i, bytes[i]); Assert.assertEquals("Should advance position", 35, stream.getPos()); } - Assert.assertEquals("Should have no more remaining content", - 0, stream.available()); + Assert.assertEquals("Should have no more remaining content", 0, stream.available()); - Assert.assertEquals("Should return -1 at end of stream", - -1, stream.read(bytes)); + Assert.assertEquals("Should return -1 at end of stream", -1, stream.read(bytes)); - Assert.assertEquals("Should have no more remaining content", - 0, stream.available()); + Assert.assertEquals("Should have no more remaining content", 0, stream.available()); checkOriginalData(); } @@ -87,13 +82,11 @@ public void testSmallReads() throws Exception { int lastBytesRead = bytes.length; for (int offset = 0; offset < length; offset += bytes.length) { - Assert.assertEquals("Should read requested len", - 
bytes.length, lastBytesRead); + Assert.assertEquals("Should read requested len", bytes.length, lastBytesRead); lastBytesRead = stream.read(bytes, 0, bytes.length); - Assert.assertEquals("Should advance position", - offset + lastBytesRead, stream.getPos()); + Assert.assertEquals("Should advance position", offset + lastBytesRead, stream.getPos()); // validate the bytes that were read for (int i = 0; i < lastBytesRead; i += 1) { @@ -101,17 +94,16 @@ public void testSmallReads() throws Exception { } } - Assert.assertEquals("Should read fewer bytes at end of buffer", - length % bytes.length, lastBytesRead % bytes.length); + Assert.assertEquals( + "Should read fewer bytes at end of buffer", + length % bytes.length, + lastBytesRead % bytes.length); - Assert.assertEquals("Should have no more remaining content", - 0, stream.available()); + Assert.assertEquals("Should have no more remaining content", 0, stream.available()); - Assert.assertEquals("Should return -1 at end of stream", - -1, stream.read(bytes)); + Assert.assertEquals("Should return -1 at end of stream", -1, stream.read(bytes)); - Assert.assertEquals("Should have no more remaining content", - 0, stream.available()); + Assert.assertEquals("Should have no more remaining content", 0, stream.available()); } checkOriginalData(); @@ -128,35 +120,30 @@ public void testPartialBufferReads() throws Exception { for (int offset = 0; offset < bytes.length; offset += size) { Assert.assertEquals("Should read requested len", size, lastBytesRead); - lastBytesRead = stream.read( - bytes, offset, Math.min(size, bytes.length - offset)); + lastBytesRead = stream.read(bytes, offset, Math.min(size, bytes.length - offset)); - Assert.assertEquals("Should advance position", + Assert.assertEquals( + "Should advance position", lastBytesRead > 0 ? 
offset + lastBytesRead : offset, stream.getPos()); } - Assert.assertEquals("Should read fewer bytes at end of buffer", - bytes.length % size, lastBytesRead % size); + Assert.assertEquals( + "Should read fewer bytes at end of buffer", bytes.length % size, lastBytesRead % size); for (int i = 0; i < bytes.length; i += 1) { Assert.assertEquals("Byte i should be i", i, bytes[i]); } - Assert.assertEquals("Should have no more remaining content", - 2, stream.available()); + Assert.assertEquals("Should have no more remaining content", 2, stream.available()); - Assert.assertEquals("Should return 2 more bytes", - 2, stream.read(bytes)); + Assert.assertEquals("Should return 2 more bytes", 2, stream.read(bytes)); - Assert.assertEquals("Should have no more remaining content", - 0, stream.available()); + Assert.assertEquals("Should have no more remaining content", 0, stream.available()); - Assert.assertEquals("Should return -1 at end of stream", - -1, stream.read(bytes)); + Assert.assertEquals("Should return -1 at end of stream", -1, stream.read(bytes)); - Assert.assertEquals("Should have no more remaining content", - 0, stream.available()); + Assert.assertEquals("Should have no more remaining content", 0, stream.available()); } checkOriginalData(); @@ -172,8 +159,10 @@ public void testReadByte() throws Exception { Assert.assertEquals(i, stream.read()); } - AssertHelpers.assertThrows("Should throw EOFException at end of stream", - EOFException.class, (Callable) stream::read); + AssertHelpers.assertThrows( + "Should throw EOFException at end of stream", + EOFException.class, + (Callable) stream::read); checkOriginalData(); } @@ -186,8 +175,7 @@ public void testSlice() throws Exception { ByteBuffer empty = stream.slice(0); Assert.assertNotNull("slice(0) should produce a non-null buffer", empty); - Assert.assertEquals("slice(0) should produce an empty buffer", - 0, empty.remaining()); + Assert.assertEquals("slice(0) should produce an empty buffer", 0, empty.remaining()); Assert.assertEquals("Position should be at start", 0, stream.getPos()); @@ -212,8 +200,8 @@ public void testSlice() throws Exception { public void testSliceBuffers0() throws Exception { ByteBufferInputStream stream = newStream(); - Assert.assertEquals("Should return an empty list", - Collections.emptyList(), stream.sliceBuffers(0)); + Assert.assertEquals( + "Should return an empty list", Collections.emptyList(), stream.sliceBuffers(0)); } @Test @@ -225,8 +213,10 @@ public void testWholeSliceBuffers() throws Exception { Assert.assertEquals("Should consume all buffers", length, stream.getPos()); - AssertHelpers.assertThrows("Should throw EOFException when empty", - EOFException.class, () -> stream.sliceBuffers(length)); + AssertHelpers.assertThrows( + "Should throw EOFException when empty", + EOFException.class, + () -> stream.sliceBuffers(length)); ByteBufferInputStream copy = ByteBufferInputStream.wrap(buffers); for (int i = 0; i < length; i += 1) { @@ -247,8 +237,7 @@ public void testSliceBuffersCoverage() throws Exception { buffers.addAll(stream.sliceBuffers(Math.min(size, stream.available()))); } - Assert.assertEquals("Should consume all content", - length, stream.getPos()); + Assert.assertEquals("Should consume all content", length, stream.getPos()); ByteBufferInputStream newStream = new MultiBufferInputStream(buffers); @@ -267,17 +256,15 @@ public void testSliceBuffersModification() throws Exception { int sliceLength = 5; List buffers = stream.sliceBuffers(sliceLength); - Assert.assertEquals("Should advance the original stream", - 
length - sliceLength, stream.available()); - Assert.assertEquals("Should advance the original stream position", - sliceLength, stream.getPos()); + Assert.assertEquals( + "Should advance the original stream", length - sliceLength, stream.available()); + Assert.assertEquals( + "Should advance the original stream position", sliceLength, stream.getPos()); - Assert.assertEquals("Should return a slice of the first buffer", - 1, buffers.size()); + Assert.assertEquals("Should return a slice of the first buffer", 1, buffers.size()); ByteBuffer buffer = buffers.get(0); - Assert.assertEquals("Should have requested bytes", - sliceLength, buffer.remaining()); + Assert.assertEquals("Should have requested bytes", sliceLength, buffer.remaining()); // read the buffer one past the returned limit. this should not change the // next value in the original stream @@ -286,10 +273,10 @@ public void testSliceBuffersModification() throws Exception { Assert.assertEquals("Should have correct data", i, buffer.get()); } - Assert.assertEquals("Reading a slice shouldn't advance the original stream", - sliceLength, stream.getPos()); - Assert.assertEquals("Reading a slice shouldn't change the underlying data", - sliceLength, stream.read()); + Assert.assertEquals( + "Reading a slice shouldn't advance the original stream", sliceLength, stream.getPos()); + Assert.assertEquals( + "Reading a slice shouldn't change the underlying data", sliceLength, stream.read()); // change the underlying data buffer buffer.limit(sliceLength + 2); @@ -301,10 +288,10 @@ public void testSliceBuffersModification() throws Exception { Assert.assertEquals( "Writing to a slice shouldn't advance the original stream", - sliceLength + 1, stream.getPos()); + sliceLength + 1, + stream.getPos()); Assert.assertEquals( - "Writing to a slice should change the underlying data", - 255, stream.read()); + "Writing to a slice should change the underlying data", 255, stream.read()); } finally { undoBuffer.put((byte) originalValue); @@ -317,18 +304,16 @@ public void testSkip() throws Exception { while (stream.available() > 0) { int bytesToSkip = Math.min(stream.available(), 10); - Assert.assertEquals("Should skip all, regardless of backing buffers", - bytesToSkip, stream.skip(bytesToSkip)); + Assert.assertEquals( + "Should skip all, regardless of backing buffers", bytesToSkip, stream.skip(bytesToSkip)); } stream = newStream(); Assert.assertEquals(0, stream.skip(0)); int length = stream.available(); - Assert.assertEquals("Should stop at end when out of bytes", - length, stream.skip(length + 10)); - Assert.assertEquals("Should return -1 when at end", - -1, stream.skip(10)); + Assert.assertEquals("Should stop at end when out of bytes", length, stream.skip(length + 10)); + Assert.assertEquals("Should return -1 when at end", -1, stream.skip(10)); } @Test @@ -341,8 +326,10 @@ public void testSkipFully() throws Exception { stream.skipFully(bytesToSkip); - Assert.assertEquals("Should skip all, regardless of backing buffers", - bytesToSkip, stream.getPos() - lastPosition); + Assert.assertEquals( + "Should skip all, regardless of backing buffers", + bytesToSkip, + stream.getPos() - lastPosition); lastPosition = stream.getPos(); } @@ -352,8 +339,10 @@ public void testSkipFully() throws Exception { Assert.assertEquals(0, stream2.getPos()); int length = stream2.available(); - AssertHelpers.assertThrows("Should throw when out of bytes", - EOFException.class, () -> { + AssertHelpers.assertThrows( + "Should throw when out of bytes", + EOFException.class, + () -> { 
stream2.skipFully(length + 10); return null; }); @@ -375,17 +364,15 @@ public void testMark() throws Exception { stream.reset(); - Assert.assertEquals("Position should return to the mark", - mark, stream.getPos()); + Assert.assertEquals("Position should return to the mark", mark, stream.getPos()); byte[] afterReset = new byte[100]; int bytesReadAfterReset = stream.read(afterReset); - Assert.assertEquals("Should read the same number of bytes", - expectedBytesRead, bytesReadAfterReset); + Assert.assertEquals( + "Should read the same number of bytes", expectedBytesRead, bytesReadAfterReset); - Assert.assertEquals("Read should end at the same position", - end, stream.getPos()); + Assert.assertEquals("Read should end at the same position", end, stream.getPos()); Assert.assertArrayEquals("Content should be equal", expected, afterReset); } @@ -407,17 +394,15 @@ public void testMarkTwice() throws Exception { stream.reset(); - Assert.assertEquals("Position should return to the mark", - mark, stream.getPos()); + Assert.assertEquals("Position should return to the mark", mark, stream.getPos()); byte[] afterReset = new byte[100]; int bytesReadAfterReset = stream.read(afterReset); - Assert.assertEquals("Should read the same number of bytes", - expectedBytesRead, bytesReadAfterReset); + Assert.assertEquals( + "Should read the same number of bytes", expectedBytesRead, bytesReadAfterReset); - Assert.assertEquals("Read should end at the same position", - end, stream.getPos()); + Assert.assertEquals("Read should end at the same position", end, stream.getPos()); Assert.assertArrayEquals("Content should be equal", expected, afterReset); } @@ -437,14 +422,12 @@ public void testMarkAtStart() throws Exception { stream.reset(); - Assert.assertEquals("Position should return to the mark", - mark, stream.getPos()); + Assert.assertEquals("Position should return to the mark", mark, stream.getPos()); byte[] afterReset = new byte[10]; Assert.assertEquals("Should read 10 bytes", 10, stream.read(afterReset)); - Assert.assertEquals("Read should end at the same position", - end, stream.getPos()); + Assert.assertEquals("Read should end at the same position", end, stream.getPos()); Assert.assertArrayEquals("Content should be equal", expected, afterReset); } @@ -467,14 +450,12 @@ public void testMarkAtEnd() throws Exception { stream.reset(); - Assert.assertEquals("Position should return to the mark", - mark, stream.getPos()); + Assert.assertEquals("Position should return to the mark", mark, stream.getPos()); byte[] afterReset = new byte[10]; Assert.assertEquals("Should read 0 bytes", -1, stream.read(afterReset)); - Assert.assertEquals("Read should end at the same position", - end, stream.getPos()); + Assert.assertEquals("Read should end at the same position", end, stream.getPos()); Assert.assertArrayEquals("Content should be equal", expected, afterReset); } @@ -483,8 +464,10 @@ public void testMarkAtEnd() throws Exception { public void testMarkUnset() { ByteBufferInputStream stream = newStream(); - AssertHelpers.assertThrows("Should throw an error for reset() without mark()", - IOException.class, () -> { + AssertHelpers.assertThrows( + "Should throw an error for reset() without mark()", + IOException.class, + () -> { stream.reset(); return null; }); @@ -496,27 +479,22 @@ public void testMarkAndResetTwiceOverSameRange() throws Exception { byte[] expected = new byte[6]; stream.mark(10); - Assert.assertEquals("Should read expected bytes", - expected.length, stream.read(expected)); + Assert.assertEquals("Should read expected 
bytes", expected.length, stream.read(expected)); stream.reset(); stream.mark(10); byte[] firstRead = new byte[6]; - Assert.assertEquals("Should read firstRead bytes", - firstRead.length, stream.read(firstRead)); + Assert.assertEquals("Should read firstRead bytes", firstRead.length, stream.read(firstRead)); stream.reset(); byte[] secondRead = new byte[6]; - Assert.assertEquals("Should read secondRead bytes", - secondRead.length, stream.read(secondRead)); + Assert.assertEquals("Should read secondRead bytes", secondRead.length, stream.read(secondRead)); - Assert.assertArrayEquals("First read should be correct", - expected, firstRead); + Assert.assertArrayEquals("First read should be correct", expected, firstRead); - Assert.assertArrayEquals("Second read should be correct", - expected, secondRead); + Assert.assertArrayEquals("Second read should be correct", expected, secondRead); } @Test @@ -530,8 +508,10 @@ public void testMarkLimit() throws Exception { Assert.assertEquals("Should read 6 bytes", 6, stream.read(new byte[6])); - AssertHelpers.assertThrows("Should throw an error for reset() after limit", - IOException.class, () -> { + AssertHelpers.assertThrows( + "Should throw an error for reset() after limit", + IOException.class, + () -> { stream.reset(); return null; }); @@ -546,8 +526,10 @@ public void testMarkDoubleReset() throws Exception { stream.reset(); - AssertHelpers.assertThrows("Should throw an error for double reset()", - IOException.class, () -> { + AssertHelpers.assertThrows( + "Should throw an error for double reset()", + IOException.class, + () -> { stream.reset(); return null; }); diff --git a/core/src/test/java/org/apache/iceberg/io/TestIOUtil.java b/core/src/test/java/org/apache/iceberg/io/TestIOUtil.java index 708eda4827ab..8bafca09ecdd 100644 --- a/core/src/test/java/org/apache/iceberg/io/TestIOUtil.java +++ b/core/src/test/java/org/apache/iceberg/io/TestIOUtil.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.io; import java.io.EOFException; @@ -38,8 +37,10 @@ public void testReadFully() throws Exception { MockInputStream stream = new MockInputStream(); IOUtil.readFully(stream, buffer, 0, buffer.length); - Assert.assertArrayEquals("Byte array contents should match", - Arrays.copyOfRange(MockInputStream.TEST_ARRAY, 0, 5), buffer); + Assert.assertArrayEquals( + "Byte array contents should match", + Arrays.copyOfRange(MockInputStream.TEST_ARRAY, 0, 5), + buffer); Assert.assertEquals("Stream position should reflect bytes read", 5, stream.getPos()); } @@ -50,8 +51,10 @@ public void testReadFullySmallReads() throws Exception { MockInputStream stream = new MockInputStream(2, 3, 3); IOUtil.readFully(stream, buffer, 0, buffer.length); - Assert.assertArrayEquals("Byte array contents should match", - Arrays.copyOfRange(MockInputStream.TEST_ARRAY, 0, 5), buffer); + Assert.assertArrayEquals( + "Byte array contents should match", + Arrays.copyOfRange(MockInputStream.TEST_ARRAY, 0, 5), + buffer); Assert.assertEquals("Stream position should reflect bytes read", 5, stream.getPos()); } @@ -62,11 +65,14 @@ public void testReadFullyJustRight() throws Exception { final MockInputStream stream = new MockInputStream(2, 3, 3); IOUtil.readFully(stream, buffer, 0, buffer.length); - Assert.assertArrayEquals("Byte array contents should match", MockInputStream.TEST_ARRAY, buffer); + Assert.assertArrayEquals( + "Byte array contents should match", MockInputStream.TEST_ARRAY, buffer); Assert.assertEquals("Stream position should reflect bytes read", 10, stream.getPos()); - AssertHelpers.assertThrows("Should throw EOFException if no more bytes left", - EOFException.class, () -> { + AssertHelpers.assertThrows( + "Should throw EOFException if no more bytes left", + EOFException.class, + () -> { IOUtil.readFully(stream, buffer, 0, 1); return null; }); @@ -78,14 +84,18 @@ public void testReadFullyUnderflow() { final MockInputStream stream = new MockInputStream(2, 3, 3); - AssertHelpers.assertThrows("Should throw EOFException if no more bytes left", - EOFException.class, () -> { + AssertHelpers.assertThrows( + "Should throw EOFException if no more bytes left", + EOFException.class, + () -> { IOUtil.readFully(stream, buffer, 0, buffer.length); return null; }); - Assert.assertArrayEquals("Should have consumed bytes", - MockInputStream.TEST_ARRAY, Arrays.copyOfRange(buffer, 0, 10)); + Assert.assertArrayEquals( + "Should have consumed bytes", + MockInputStream.TEST_ARRAY, + Arrays.copyOfRange(buffer, 0, 10)); Assert.assertEquals("Stream position should reflect bytes read", 10, stream.getPos()); } @@ -96,8 +106,10 @@ public void testReadFullyStartAndLength() throws IOException { MockInputStream stream = new MockInputStream(); IOUtil.readFully(stream, buffer, 2, 5); - Assert.assertArrayEquals("Byte array contents should match", - Arrays.copyOfRange(MockInputStream.TEST_ARRAY, 0, 5), Arrays.copyOfRange(buffer, 2, 7)); + Assert.assertArrayEquals( + "Byte array contents should match", + Arrays.copyOfRange(MockInputStream.TEST_ARRAY, 0, 5), + Arrays.copyOfRange(buffer, 2, 7)); Assert.assertEquals("Stream position should reflect bytes read", 5, stream.getPos()); } @@ -118,20 +130,20 @@ public void testReadFullySmallReadsWithStartAndLength() throws IOException { MockInputStream stream = new MockInputStream(2, 2, 3); IOUtil.readFully(stream, buffer, 2, 5); - Assert.assertArrayEquals("Byte array contents should match", - Arrays.copyOfRange(MockInputStream.TEST_ARRAY, 0, 5), Arrays.copyOfRange(buffer, 2, 7)); + 
Assert.assertArrayEquals( + "Byte array contents should match", + Arrays.copyOfRange(MockInputStream.TEST_ARRAY, 0, 5), + Arrays.copyOfRange(buffer, 2, 7)); Assert.assertEquals("Stream position should reflect bytes read", 5, stream.getPos()); } @Test public void testWriteFully() throws Exception { - byte[] input = Strings.repeat("Welcome to Warsaw!\n", 12345) - .getBytes(StandardCharsets.UTF_8); + byte[] input = Strings.repeat("Welcome to Warsaw!\n", 12345).getBytes(StandardCharsets.UTF_8); InMemoryOutputFile outputFile = new InMemoryOutputFile(); try (PositionOutputStream outputStream = outputFile.create()) { IOUtil.writeFully(outputStream, ByteBuffer.wrap(input.clone())); } - Assertions.assertThat(outputFile.toByteArray()) - .isEqualTo(input); + Assertions.assertThat(outputFile.toByteArray()).isEqualTo(input); } } diff --git a/core/src/test/java/org/apache/iceberg/io/TestInMemoryInputFile.java b/core/src/test/java/org/apache/iceberg/io/TestInMemoryInputFile.java index 95a10fcf4bb8..66f097d34651 100644 --- a/core/src/test/java/org/apache/iceberg/io/TestInMemoryInputFile.java +++ b/core/src/test/java/org/apache/iceberg/io/TestInMemoryInputFile.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.io; import java.io.IOException; @@ -29,11 +28,11 @@ public class TestInMemoryInputFile { @Test public void testReadAfterClose() throws IOException { - InMemoryInputFile inputFile = new InMemoryInputFile("abc".getBytes(StandardCharsets.ISO_8859_1)); + InMemoryInputFile inputFile = + new InMemoryInputFile("abc".getBytes(StandardCharsets.ISO_8859_1)); InputStream inputStream = inputFile.newStream(); Assert.assertEquals('a', inputStream.read()); inputStream.close(); - Assertions.assertThatThrownBy(inputStream::read) - .hasMessage("Stream is closed"); + Assertions.assertThatThrownBy(inputStream::read).hasMessage("Stream is closed"); } } diff --git a/core/src/test/java/org/apache/iceberg/io/TestInMemoryOutputFile.java b/core/src/test/java/org/apache/iceberg/io/TestInMemoryOutputFile.java index 1c9c4b8e4c39..d5dba8d6cdbc 100644 --- a/core/src/test/java/org/apache/iceberg/io/TestInMemoryOutputFile.java +++ b/core/src/test/java/org/apache/iceberg/io/TestInMemoryOutputFile.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.io; import java.io.IOException; @@ -33,8 +32,7 @@ public void testWriteAfterClose() throws IOException { outputStream.write('a'); outputStream.write('b'); outputStream.close(); - Assertions.assertThatThrownBy(() -> outputStream.write('c')) - .hasMessage("Stream is closed"); + Assertions.assertThatThrownBy(() -> outputStream.write('c')).hasMessage("Stream is closed"); Assertions.assertThat(outputFile.toByteArray()) .isEqualTo("ab".getBytes(StandardCharsets.ISO_8859_1)); } diff --git a/core/src/test/java/org/apache/iceberg/io/TestMultiBufferInputStream.java b/core/src/test/java/org/apache/iceberg/io/TestMultiBufferInputStream.java index eca929e040df..c4b244d01adb 100644 --- a/core/src/test/java/org/apache/iceberg/io/TestMultiBufferInputStream.java +++ b/core/src/test/java/org/apache/iceberg/io/TestMultiBufferInputStream.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.io; import java.nio.ByteBuffer; @@ -27,15 +26,15 @@ import org.junit.Test; public class TestMultiBufferInputStream extends TestByteBufferInputStreams { - private static final List DATA = Arrays.asList( - ByteBuffer.wrap(new byte[] { 0, 1, 2, 3, 4, 5, 6, 7, 8 }), - ByteBuffer.wrap(new byte[] { 9, 10, 11, 12 }), - ByteBuffer.wrap(new byte[] { }), - ByteBuffer.wrap(new byte[] { 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24 }), - ByteBuffer.wrap(new byte[] { 25 }), - ByteBuffer.wrap(new byte[] { 26, 27, 28, 29, 30, 31, 32 }), - ByteBuffer.wrap(new byte[] { 33, 34 }) - ); + private static final List DATA = + Arrays.asList( + ByteBuffer.wrap(new byte[] {0, 1, 2, 3, 4, 5, 6, 7, 8}), + ByteBuffer.wrap(new byte[] {9, 10, 11, 12}), + ByteBuffer.wrap(new byte[] {}), + ByteBuffer.wrap(new byte[] {13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24}), + ByteBuffer.wrap(new byte[] {25}), + ByteBuffer.wrap(new byte[] {26, 27, 28, 29, 30, 31, 32}), + ByteBuffer.wrap(new byte[] {33, 34})); @Override protected ByteBufferInputStream newStream() { @@ -46,8 +45,7 @@ protected ByteBufferInputStream newStream() { protected void checkOriginalData() { for (ByteBuffer buffer : DATA) { Assert.assertEquals("Position should not change", 0, buffer.position()); - Assert.assertEquals("Limit should not change", - buffer.array().length, buffer.limit()); + Assert.assertEquals("Limit should not change", buffer.array().length, buffer.limit()); } } @@ -71,8 +69,7 @@ public void testSliceData() throws Exception { // one is a view of the first buffer because it is smaller ByteBuffer one = buffers.get(0); - Assert.assertSame("Should be a duplicate of the first array", - one.array(), DATA.get(0).array()); + Assert.assertSame("Should be a duplicate of the first array", one.array(), DATA.get(0).array()); Assert.assertEquals(8, one.remaining()); Assert.assertEquals(0, one.position()); Assert.assertEquals(8, one.limit()); @@ -93,8 +90,8 @@ public void testSliceData() throws Exception { // three is a copy of part of the 4th buffer ByteBuffer three = buffers.get(2); - Assert.assertSame("Should be a duplicate of the fourth array", - three.array(), DATA.get(3).array()); + Assert.assertSame( + "Should be a duplicate of the fourth array", three.array(), DATA.get(3).array()); Assert.assertEquals(8, three.remaining()); Assert.assertEquals(3, three.position()); Assert.assertEquals(11, three.limit()); @@ -136,7 +133,7 @@ public void testSliceBuffersData() throws Exception { } } - Assert.assertEquals("Should return duplicates of all non-empty buffers", - nonEmptyBuffers, buffers); + Assert.assertEquals( + "Should return duplicates of all non-empty buffers", nonEmptyBuffers, buffers); } } diff --git a/core/src/test/java/org/apache/iceberg/io/TestOutputFileFactory.java b/core/src/test/java/org/apache/iceberg/io/TestOutputFileFactory.java index 10210831b37e..616a42791dd6 100644 --- a/core/src/test/java/org/apache/iceberg/io/TestOutputFileFactory.java +++ b/core/src/test/java/org/apache/iceberg/io/TestOutputFileFactory.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.io; import org.apache.iceberg.FileFormat; @@ -37,7 +36,7 @@ public class TestOutputFileFactory extends TableTestBase { @Parameterized.Parameters(name = "formatVersion = {0}") public static Object[] parameters() { - return new Object[] { 1, 2 }; + return new Object[] {1, 2}; } private static final int PARTITION_ID = 1; @@ -49,25 +48,23 @@ public TestOutputFileFactory(int formatVersion) { @Test public void testOutputFileFactoryWithCustomFormat() { - table.updateProperties() - .defaultFormat(FileFormat.ORC) - .commit(); + table.updateProperties().defaultFormat(FileFormat.ORC).commit(); - OutputFileFactory fileFactory = OutputFileFactory.builderFor(table, PARTITION_ID, TASK_ID) - .format(FileFormat.AVRO) - .build(); + OutputFileFactory fileFactory = + OutputFileFactory.builderFor(table, PARTITION_ID, TASK_ID).format(FileFormat.AVRO).build(); String location = fileFactory.newOutputFile().encryptingOutputFile().location(); - Assert.assertEquals("File format should be correct", FileFormat.AVRO, FileFormat.fromFileName(location)); + Assert.assertEquals( + "File format should be correct", FileFormat.AVRO, FileFormat.fromFileName(location)); } @Test public void testOutputFileFactoryWithMultipleSpecs() { - OutputFileFactory fileFactory = OutputFileFactory.builderFor(table, PARTITION_ID, TASK_ID) - .operationId("append") - .build(); + OutputFileFactory fileFactory = + OutputFileFactory.builderFor(table, PARTITION_ID, TASK_ID).operationId("append").build(); - EncryptedOutputFile unpartitionedFile = fileFactory.newOutputFile(PartitionSpec.unpartitioned(), null); + EncryptedOutputFile unpartitionedFile = + fileFactory.newOutputFile(PartitionSpec.unpartitioned(), null); String unpartitionedFileLocation = unpartitionedFile.encryptingOutputFile().location(); Assert.assertTrue(unpartitionedFileLocation.endsWith("data/00001-100-append-00001.parquet")); @@ -76,6 +73,7 @@ public void testOutputFileFactoryWithMultipleSpecs() { partitionKey.partition(record); EncryptedOutputFile partitionedFile = fileFactory.newOutputFile(table.spec(), partitionKey); String partitionedFileLocation = partitionedFile.encryptingOutputFile().location(); - Assert.assertTrue(partitionedFileLocation.endsWith("data_bucket=7/00001-100-append-00002.parquet")); + Assert.assertTrue( + partitionedFileLocation.endsWith("data_bucket=7/00001-100-append-00002.parquet")); } } diff --git a/core/src/test/java/org/apache/iceberg/io/TestSingleBufferInputStream.java b/core/src/test/java/org/apache/iceberg/io/TestSingleBufferInputStream.java index 512eaf2afcd4..5715f3f74e29 100644 --- a/core/src/test/java/org/apache/iceberg/io/TestSingleBufferInputStream.java +++ b/core/src/test/java/org/apache/iceberg/io/TestSingleBufferInputStream.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.io; import java.nio.ByteBuffer; @@ -27,9 +26,12 @@ import org.junit.Test; public class TestSingleBufferInputStream extends TestByteBufferInputStreams { - private static final ByteBuffer DATA = ByteBuffer.wrap(new byte[] { - 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, - 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34 }); + private static final ByteBuffer DATA = + ByteBuffer.wrap( + new byte[] { + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, + 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34 + }); @Override protected ByteBufferInputStream newStream() { @@ -39,8 +41,7 @@ protected ByteBufferInputStream newStream() { @Override protected void checkOriginalData() { Assert.assertEquals("Position should not change", 0, DATA.position()); - Assert.assertEquals("Limit should not change", - DATA.array().length, DATA.limit()); + Assert.assertEquals("Limit should not change", DATA.array().length, DATA.limit()); } @Test @@ -62,8 +63,7 @@ public void testSliceData() throws Exception { int i = 0; ByteBuffer one = buffers.get(0); - Assert.assertSame("Should use the same backing array", - one.array(), DATA.array()); + Assert.assertSame("Should use the same backing array", one.array(), DATA.array()); Assert.assertEquals(8, one.remaining()); Assert.assertEquals(0, one.position()); Assert.assertEquals(8, one.limit()); @@ -73,8 +73,7 @@ public void testSliceData() throws Exception { } ByteBuffer two = buffers.get(1); - Assert.assertSame("Should use the same backing array", - two.array(), DATA.array()); + Assert.assertSame("Should use the same backing array", two.array(), DATA.array()); Assert.assertEquals(8, two.remaining()); Assert.assertEquals(8, two.position()); Assert.assertEquals(16, two.limit()); @@ -85,8 +84,7 @@ public void testSliceData() throws Exception { // three is a copy of part of the 4th buffer ByteBuffer three = buffers.get(2); - Assert.assertSame("Should use the same backing array", - three.array(), DATA.array()); + Assert.assertSame("Should use the same backing array", three.array(), DATA.array()); Assert.assertEquals(8, three.remaining()); Assert.assertEquals(16, three.position()); Assert.assertEquals(24, three.limit()); @@ -97,8 +95,7 @@ public void testSliceData() throws Exception { // four should be a copy of the next 8 bytes ByteBuffer four = buffers.get(3); - Assert.assertSame("Should use the same backing array", - four.array(), DATA.array()); + Assert.assertSame("Should use the same backing array", four.array(), DATA.array()); Assert.assertEquals(8, four.remaining()); Assert.assertEquals(24, four.position()); Assert.assertEquals(32, four.limit()); @@ -109,8 +106,7 @@ public void testSliceData() throws Exception { // five should be a copy of the next 8 bytes ByteBuffer five = buffers.get(4); - Assert.assertSame("Should use the same backing array", - five.array(), DATA.array()); + Assert.assertSame("Should use the same backing array", five.array(), DATA.array()); Assert.assertEquals(3, five.remaining()); Assert.assertEquals(32, five.position()); Assert.assertEquals(35, five.limit()); @@ -125,7 +121,9 @@ public void testWholeSliceBuffersData() throws Exception { ByteBufferInputStream stream = newStream(); List buffers = stream.sliceBuffers(stream.available()); - Assert.assertEquals("Should return duplicates of all non-empty buffers", - Collections.singletonList(DATA), buffers); + Assert.assertEquals( + "Should return duplicates of all non-empty buffers", + 
Collections.singletonList(DATA), + buffers); } } diff --git a/core/src/test/java/org/apache/iceberg/jdbc/TestJdbcCatalog.java b/core/src/test/java/org/apache/iceberg/jdbc/TestJdbcCatalog.java index 42fd769d2090..4ca8cfd5af33 100644 --- a/core/src/test/java/org/apache/iceberg/jdbc/TestJdbcCatalog.java +++ b/core/src/test/java/org/apache/iceberg/jdbc/TestJdbcCatalog.java @@ -16,9 +16,12 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.jdbc; +import static org.apache.iceberg.NullOrder.NULLS_FIRST; +import static org.apache.iceberg.SortDirection.ASC; +import static org.apache.iceberg.types.Types.NestedField.required; + import java.io.File; import java.io.IOException; import java.nio.charset.StandardCharsets; @@ -66,26 +69,20 @@ import org.junit.jupiter.api.Test; import org.junit.jupiter.api.io.TempDir; -import static org.apache.iceberg.NullOrder.NULLS_FIRST; -import static org.apache.iceberg.SortDirection.ASC; -import static org.apache.iceberg.types.Types.NestedField.required; - public class TestJdbcCatalog extends CatalogTests { - static final Schema SCHEMA = new Schema( - required(1, "id", Types.IntegerType.get(), "unique ID"), - required(2, "data", Types.StringType.get()) - ); - static final PartitionSpec PARTITION_SPEC = PartitionSpec.builderFor(SCHEMA) - .bucket("data", 16) - .build(); + static final Schema SCHEMA = + new Schema( + required(1, "id", Types.IntegerType.get(), "unique ID"), + required(2, "data", Types.StringType.get())); + static final PartitionSpec PARTITION_SPEC = + PartitionSpec.builderFor(SCHEMA).bucket("data", 16).build(); static Configuration conf = new Configuration(); private static JdbcCatalog catalog; private static String warehouseLocation; - @TempDir - java.nio.file.Path tableDir; + @TempDir java.nio.file.Path tableDir; @Override protected JdbcCatalog catalog() { @@ -107,8 +104,7 @@ protected List metadataVersionFiles(String location) { .filter(file -> !file.isDirectory()) .map(File::getName) .filter(fileName -> fileName.endsWith("metadata.json")) - .collect(Collectors.toList()) - ; + .collect(Collectors.toList()); } protected List manifestFiles(String location) { @@ -116,14 +112,14 @@ protected List manifestFiles(String location) { .filter(file -> !file.isDirectory()) .map(File::getName) .filter(fileName -> fileName.endsWith(".avro")) - .collect(Collectors.toList()) - ; + .collect(Collectors.toList()); } @BeforeEach public void setupTable() throws Exception { Map properties = Maps.newHashMap(); - properties.put(CatalogProperties.URI, + properties.put( + CatalogProperties.URI, "jdbc:sqlite:file::memory:?ic" + UUID.randomUUID().toString().replace("-", "")); properties.put(JdbcCatalog.PROPERTY_PREFIX + "username", "user"); @@ -151,12 +147,14 @@ public void testInitialize() { @Test public void testCreateTableBuilder() { TableIdentifier tableIdent = TableIdentifier.of("db", "ns1", "ns2", "tbl"); - Table table = catalog.buildTable(tableIdent, SCHEMA) - .withPartitionSpec(PARTITION_SPEC) - .withProperties(null) - .withProperty("key1", "value1") - .withProperties(ImmutableMap.of("key2", "value2")) - .create(); + Table table = + catalog + .buildTable(tableIdent, SCHEMA) + .withPartitionSpec(PARTITION_SPEC) + .withProperties(null) + .withProperty("key1", "value1") + .withProperties(ImmutableMap.of("key2", "value2")) + .create(); Assert.assertEquals(SCHEMA.toString(), table.schema().toString()); Assert.assertEquals(1, table.spec().fields().size()); @@ -167,10 +165,12 @@ public void testCreateTableBuilder() { 
@Test public void testCreateTableTxnBuilder() { TableIdentifier tableIdent = TableIdentifier.of("db", "ns1", "ns2", "tbl"); - Transaction txn = catalog.buildTable(tableIdent, SCHEMA) - .withPartitionSpec(null) - .withProperty("key1", "testval1") - .createTransaction(); + Transaction txn = + catalog + .buildTable(tableIdent, SCHEMA) + .withPartitionSpec(null) + .withProperty("key1", "testval1") + .createTransaction(); txn.commitTransaction(); Table table = catalog.loadTable(tableIdent); @@ -183,40 +183,40 @@ public void testCreateTableTxnBuilder() { public void testReplaceTxnBuilder() { TableIdentifier tableIdent = TableIdentifier.of("db", "ns1", "ns2", "tbl"); - final DataFile fileA = DataFiles.builder(PARTITION_SPEC) - .withPath("/path/to/data-a.parquet") - .withFileSizeInBytes(0) - .withPartitionPath("data_bucket=0") // easy way to set partition data for now - .withRecordCount(2) // needs at least one record or else metrics will filter it out - .build(); + final DataFile fileA = + DataFiles.builder(PARTITION_SPEC) + .withPath("/path/to/data-a.parquet") + .withFileSizeInBytes(0) + .withPartitionPath("data_bucket=0") // easy way to set partition data for now + .withRecordCount(2) // needs at least one record or else metrics will filter it out + .build(); - Transaction createTxn = catalog.buildTable(tableIdent, SCHEMA) - .withPartitionSpec(PARTITION_SPEC) - .withProperty("key1", "value1") - .createOrReplaceTransaction(); + Transaction createTxn = + catalog + .buildTable(tableIdent, SCHEMA) + .withPartitionSpec(PARTITION_SPEC) + .withProperty("key1", "value1") + .createOrReplaceTransaction(); - createTxn.newAppend() - .appendFile(fileA) - .commit(); + createTxn.newAppend().appendFile(fileA).commit(); createTxn.commitTransaction(); Table table = catalog.loadTable(tableIdent); Assert.assertNotNull(table.currentSnapshot()); - Transaction replaceTxn = catalog.buildTable(tableIdent, SCHEMA) - .withProperty("key2", "value2") - .replaceTransaction(); + Transaction replaceTxn = + catalog.buildTable(tableIdent, SCHEMA).withProperty("key2", "value2").replaceTransaction(); replaceTxn.commitTransaction(); table = catalog.loadTable(tableIdent); Assert.assertNull(table.currentSnapshot()); - PartitionSpec v1Expected = PartitionSpec.builderFor(table.schema()) - .alwaysNull("data", "data_bucket") - .withSpecId(1) - .build(); - Assert.assertEquals("Table should have a spec with one void field", - v1Expected, table.spec()); + PartitionSpec v1Expected = + PartitionSpec.builderFor(table.schema()) + .alwaysNull("data", "data_bucket") + .withSpecId(1) + .build(); + Assert.assertEquals("Table should have a spec with one void field", v1Expected, table.spec()); Assert.assertEquals("value1", table.properties().get("key1")); Assert.assertEquals("value2", table.properties().get("key2")); @@ -235,19 +235,20 @@ public void testCreateTableDefaultSortOrder() { @Test public void testCreateTableCustomSortOrder() { TableIdentifier tableIdent = TableIdentifier.of("db", "ns1", "ns2", "tbl"); - SortOrder order = SortOrder.builderFor(SCHEMA) - .asc("id", NULLS_FIRST) - .build(); - Table table = catalog.buildTable(tableIdent, SCHEMA) - .withPartitionSpec(PARTITION_SPEC) - .withSortOrder(order) - .create(); + SortOrder order = SortOrder.builderFor(SCHEMA).asc("id", NULLS_FIRST).build(); + Table table = + catalog + .buildTable(tableIdent, SCHEMA) + .withPartitionSpec(PARTITION_SPEC) + .withSortOrder(order) + .create(); SortOrder sortOrder = table.sortOrder(); Assert.assertEquals("Order ID must match", 1, sortOrder.orderId()); 
Assert.assertEquals("Order must have 1 field", 1, sortOrder.fields().size()); Assert.assertEquals("Direction must match ", ASC, sortOrder.fields().get(0).direction()); - Assert.assertEquals("Null order must match ", NULLS_FIRST, sortOrder.fields().get(0).nullOrder()); + Assert.assertEquals( + "Null order must match ", NULLS_FIRST, sortOrder.fields().get(0).nullOrder()); Transform transform = Transforms.identity(Types.IntegerType.get()); Assert.assertEquals("Transform must match", transform, sortOrder.fields().get(0).transform()); } @@ -261,10 +262,11 @@ public void testBasicCatalog() throws Exception { FileSystem fs = Util.getFs(new Path(metaLocation), conf); Assert.assertTrue(fs.isDirectory(new Path(metaLocation))); - AssertHelpers.assertThrows("should throw exception", AlreadyExistsException.class, - "already exists", () -> - catalog.createTable(testTable, SCHEMA, PartitionSpec.unpartitioned()) - ); + AssertHelpers.assertThrows( + "should throw exception", + AlreadyExistsException.class, + "already exists", + () -> catalog.createTable(testTable, SCHEMA, PartitionSpec.unpartitioned())); catalog.dropTable(testTable); } @@ -288,9 +290,10 @@ public void testCreateAndDropTableWithoutNamespace() throws Exception { public void testDefaultWarehouseLocation() throws Exception { TableIdentifier testTable = TableIdentifier.of("tbl"); TableIdentifier testTable2 = TableIdentifier.of(Namespace.of("ns"), "tbl"); - Assert.assertEquals(catalog.defaultWarehouseLocation(testTable), - warehouseLocation + "/" + testTable.name()); - Assert.assertEquals(catalog.defaultWarehouseLocation(testTable2), + Assert.assertEquals( + catalog.defaultWarehouseLocation(testTable), warehouseLocation + "/" + testTable.name()); + Assert.assertEquals( + catalog.defaultWarehouseLocation(testTable2), warehouseLocation + "/" + testTable2.namespace() + "/" + testTable2.name()); } @@ -301,25 +304,29 @@ public void testConcurrentCommit() throws IOException { // append file and commit! 
String data = tableDir.resolve("data.parquet").toAbsolutePath().toString(); Files.write(Paths.get(data), Lists.newArrayList(), StandardCharsets.UTF_8); - DataFile dataFile = DataFiles.builder(PartitionSpec.unpartitioned()) - .withPath(data) - .withFileSizeInBytes(10) - .withRecordCount(1) - .build(); + DataFile dataFile = + DataFiles.builder(PartitionSpec.unpartitioned()) + .withPath(data) + .withFileSizeInBytes(10) + .withRecordCount(1) + .build(); table.newAppend().appendFile(dataFile).commit(); Assert.assertEquals(1, table.history().size()); catalog.dropTable(tableIdentifier); data = tableDir.resolve("data2.parquet").toAbsolutePath().toString(); Files.write(Paths.get(data), Lists.newArrayList(), StandardCharsets.UTF_8); - DataFile dataFile2 = DataFiles.builder(PartitionSpec.unpartitioned()) - .withPath(data) - .withFileSizeInBytes(10) - .withRecordCount(1) - .build(); + DataFile dataFile2 = + DataFiles.builder(PartitionSpec.unpartitioned()) + .withPath(data) + .withFileSizeInBytes(10) + .withRecordCount(1) + .build(); - AssertHelpers.assertThrows("Should fail", NoSuchTableException.class, - "Failed to load table", () -> table.newAppend().appendFile(dataFile2).commit() - ); + AssertHelpers.assertThrows( + "Should fail", + NoSuchTableException.class, + "Failed to load table", + () -> table.newAppend().appendFile(dataFile2).commit()); } @Test @@ -330,31 +337,34 @@ public void testCommitHistory() throws IOException { String data = tableDir.resolve("data.parquet").toAbsolutePath().toString(); Files.write(Paths.get(data), Lists.newArrayList(), StandardCharsets.UTF_8); - DataFile dataFile = DataFiles.builder(PartitionSpec.unpartitioned()) - .withPath(data) - .withFileSizeInBytes(10) - .withRecordCount(1) - .build(); + DataFile dataFile = + DataFiles.builder(PartitionSpec.unpartitioned()) + .withPath(data) + .withFileSizeInBytes(10) + .withRecordCount(1) + .build(); table.newAppend().appendFile(dataFile).commit(); Assert.assertEquals(1, table.history().size()); data = tableDir.resolve("data2.parquet").toAbsolutePath().toString(); Files.write(Paths.get(data), Lists.newArrayList(), StandardCharsets.UTF_8); - dataFile = DataFiles.builder(PartitionSpec.unpartitioned()) - .withPath(data) - .withFileSizeInBytes(10) - .withRecordCount(1) - .build(); + dataFile = + DataFiles.builder(PartitionSpec.unpartitioned()) + .withPath(data) + .withFileSizeInBytes(10) + .withRecordCount(1) + .build(); table.newAppend().appendFile(dataFile).commit(); Assert.assertEquals(2, table.history().size()); data = tableDir.resolve("data3.parquet").toAbsolutePath().toString(); Files.write(Paths.get(data), Lists.newArrayList(), StandardCharsets.UTF_8); - dataFile = DataFiles.builder(PartitionSpec.unpartitioned()) - .withPath(data) - .withFileSizeInBytes(10) - .withRecordCount(1) - .build(); + dataFile = + DataFiles.builder(PartitionSpec.unpartitioned()) + .withPath(data) + .withFileSizeInBytes(10) + .withRecordCount(1) + .build(); table.newAppend().appendFile(dataFile).commit(); Assert.assertEquals(3, table.history().size()); } @@ -368,9 +378,11 @@ public void testDropTable() { catalog.dropTable(testTable); Assert.assertFalse(catalog.listTables(testTable.namespace()).contains(testTable)); catalog.dropTable(testTable2); - AssertHelpers.assertThrows("should throw exception", NoSuchNamespaceException.class, - "not exist", () -> catalog.listTables(testTable2.namespace()) - ); + AssertHelpers.assertThrows( + "should throw exception", + NoSuchNamespaceException.class, + "not exist", + () -> 
catalog.listTables(testTable2.namespace())); Assert.assertFalse(catalog.dropTable(TableIdentifier.of("db", "tbl-not-exists"))); } @@ -385,17 +397,20 @@ public void testRenameTable() { Assert.assertFalse(catalog.listTables(to.namespace()).contains(from)); Assert.assertTrue(catalog.loadTable(to).name().endsWith(to.name())); - AssertHelpers.assertThrows("should throw exception", NoSuchTableException.class, - "Table does not exist", () -> - catalog.renameTable(TableIdentifier.of("db", "tbl-not-exists"), to) - ); + AssertHelpers.assertThrows( + "should throw exception", + NoSuchTableException.class, + "Table does not exist", + () -> catalog.renameTable(TableIdentifier.of("db", "tbl-not-exists"), to)); // rename table to existing table name! TableIdentifier from2 = TableIdentifier.of("db", "tbl2"); catalog.createTable(from2, SCHEMA, PartitionSpec.unpartitioned()); - AssertHelpers.assertThrows("should throw exception", AlreadyExistsException.class, - "Table already exists", () -> catalog.renameTable(from2, to) - ); + AssertHelpers.assertThrows( + "should throw exception", + AlreadyExistsException.class, + "Table already exists", + () -> catalog.renameTable(from2, to)); } @Test @@ -406,9 +421,8 @@ public void testListTables() { TableIdentifier tbl4 = TableIdentifier.of("db", "ns1", "tbl3"); TableIdentifier tbl5 = TableIdentifier.of("db", "metadata", "metadata"); - Lists.newArrayList(tbl1, tbl2, tbl3, tbl4, tbl5).forEach(t -> - catalog.createTable(t, SCHEMA, PartitionSpec.unpartitioned()) - ); + Lists.newArrayList(tbl1, tbl2, tbl3, tbl4, tbl5) + .forEach(t -> catalog.createTable(t, SCHEMA, PartitionSpec.unpartitioned())); List tbls1 = catalog.listTables(Namespace.of("db")); Set tblSet = Sets.newHashSet(tbls1.stream().map(TableIdentifier::name).iterator()); @@ -420,18 +434,22 @@ public void testListTables() { Assert.assertEquals(tbls2.size(), 1); Assert.assertEquals("tbl3", tbls2.get(0).name()); - AssertHelpers.assertThrows("should throw exception", NoSuchNamespaceException.class, - "does not exist", () -> catalog.listTables(Namespace.of("db", "ns1", "ns2"))); + AssertHelpers.assertThrows( + "should throw exception", + NoSuchNamespaceException.class, + "does not exist", + () -> catalog.listTables(Namespace.of("db", "ns1", "ns2"))); } @Test public void testCallingLocationProviderWhenNoCurrentMetadata() { TableIdentifier tableIdent = TableIdentifier.of("ns1", "ns2", "table1"); Transaction create = catalog.newCreateTableTransaction(tableIdent, SCHEMA); - create.table().locationProvider(); // NPE triggered if not handled appropriately + create.table().locationProvider(); // NPE triggered if not handled appropriately create.commitTransaction(); - Assert.assertEquals("1 table expected", 1, catalog.listTables(Namespace.of("ns1", "ns2")).size()); + Assert.assertEquals( + "1 table expected", 1, catalog.listTables(Namespace.of("ns1", "ns2")).size()); catalog.dropTable(tableIdent, true); } @@ -439,7 +457,7 @@ public void testCallingLocationProviderWhenNoCurrentMetadata() { public void testExistingTableUpdate() { TableIdentifier tableIdent = TableIdentifier.of("ns1", "ns2", "table1"); Transaction create = catalog.newCreateTableTransaction(tableIdent, SCHEMA); - create.table().locationProvider(); // NPE triggered if not handled appropriately + create.table().locationProvider(); // NPE triggered if not handled appropriately create.commitTransaction(); Table icebergTable = catalog.loadTable(tableIdent); // add a column @@ -455,13 +473,12 @@ public void testExistingTableUpdate() { @Test public void testTableName() 
{ TableIdentifier tableIdent = TableIdentifier.of("db", "ns1", "ns2", "tbl"); - catalog.buildTable(tableIdent, SCHEMA) - .withPartitionSpec(PARTITION_SPEC) - .create(); + catalog.buildTable(tableIdent, SCHEMA).withPartitionSpec(PARTITION_SPEC).create(); Table table = catalog.loadTable(tableIdent); Assert.assertEquals("Name must match", catalog.name() + ".db.ns1.ns2.tbl", table.name()); - TableIdentifier snapshotsTableIdent = TableIdentifier.of("db", "ns1", "ns2", "tbl", "snapshots"); + TableIdentifier snapshotsTableIdent = + TableIdentifier.of("db", "ns1", "ns2", "tbl", "snapshots"); Table snapshotsTable = catalog.loadTable(snapshotsTableIdent); Assert.assertEquals( "Name must match", catalog.name() + ".db.ns1.ns2.tbl.snapshots", snapshotsTable.name()); @@ -476,9 +493,8 @@ public void testListNamespace() { TableIdentifier tbl5 = TableIdentifier.of("db2", "metadata"); TableIdentifier tbl6 = TableIdentifier.of("tbl6"); - Lists.newArrayList(tbl1, tbl2, tbl3, tbl4, tbl5, tbl6).forEach(t -> - catalog.createTable(t, SCHEMA, PartitionSpec.unpartitioned()) - ); + Lists.newArrayList(tbl1, tbl2, tbl3, tbl4, tbl5, tbl6) + .forEach(t -> catalog.createTable(t, SCHEMA, PartitionSpec.unpartitioned())); List nsp1 = catalog.listNamespaces(Namespace.of("db")); Assert.assertEquals(nsp1.size(), 3); @@ -506,9 +522,11 @@ public void testListNamespace() { Assert.assertTrue(tblSet3.contains("db2")); Assert.assertTrue(tblSet3.contains("")); - AssertHelpers.assertThrows("Should fail to list namespace doesn't exist", NoSuchNamespaceException.class, - "Namespace does not exist", () -> catalog.listNamespaces(Namespace.of("db", "db2", "ns2") - )); + AssertHelpers.assertThrows( + "Should fail to list namespace doesn't exist", + NoSuchNamespaceException.class, + "Namespace does not exist", + () -> catalog.listNamespaces(Namespace.of("db", "db2", "ns2"))); } @Test @@ -518,15 +536,16 @@ public void testLoadNamespaceMeta() { TableIdentifier tbl3 = TableIdentifier.of("db", "ns3", "tbl4"); TableIdentifier tbl4 = TableIdentifier.of("db", "metadata"); - Lists.newArrayList(tbl1, tbl2, tbl3, tbl4).forEach(t -> - catalog.createTable(t, SCHEMA, PartitionSpec.unpartitioned()) - ); + Lists.newArrayList(tbl1, tbl2, tbl3, tbl4) + .forEach(t -> catalog.createTable(t, SCHEMA, PartitionSpec.unpartitioned())); Assert.assertTrue(catalog.loadNamespaceMetadata(Namespace.of("db")).containsKey("location")); - AssertHelpers.assertThrows("Should fail to load namespace doesn't exist", - NoSuchNamespaceException.class, "Namespace does not exist", () -> - catalog.loadNamespaceMetadata(Namespace.of("db", "db2", "ns2"))); + AssertHelpers.assertThrows( + "Should fail to load namespace doesn't exist", + NoSuchNamespaceException.class, + "Namespace does not exist", + () -> catalog.loadNamespaceMetadata(Namespace.of("db", "db2", "ns2"))); } @Test @@ -536,19 +555,20 @@ public void testNamespaceExists() { TableIdentifier tbl3 = TableIdentifier.of("db", "ns3", "tbl4"); TableIdentifier tbl4 = TableIdentifier.of("db", "metadata"); - Lists.newArrayList(tbl1, tbl2, tbl3, tbl4).forEach(t -> - catalog.createTable(t, SCHEMA, PartitionSpec.unpartitioned()) - ); - Assert.assertTrue("Should true to namespace exist", + Lists.newArrayList(tbl1, tbl2, tbl3, tbl4) + .forEach(t -> catalog.createTable(t, SCHEMA, PartitionSpec.unpartitioned())); + Assert.assertTrue( + "Should true to namespace exist", catalog.namespaceExists(Namespace.of("db", "ns1", "ns2"))); - Assert.assertFalse("Should false to namespace doesn't exist", + Assert.assertFalse( + "Should false to namespace 
doesn't exist", catalog.namespaceExists(Namespace.of("db", "db2", "not_exist"))); } - @Test public void testDropNamespace() { - Assert.assertFalse("Should return false if drop does not modify state", + Assert.assertFalse( + "Should return false if drop does not modify state", catalog.dropNamespace(Namespace.of("db", "ns1_not_exitss"))); TableIdentifier tbl0 = TableIdentifier.of("db", "ns1", "ns2", "tbl2"); @@ -557,16 +577,24 @@ public void testDropNamespace() { TableIdentifier tbl3 = TableIdentifier.of("db", "ns3", "tbl4"); TableIdentifier tbl4 = TableIdentifier.of("db", "tbl"); - Lists.newArrayList(tbl0, tbl1, tbl2, tbl3, tbl4).forEach(t -> - catalog.createTable(t, SCHEMA, PartitionSpec.unpartitioned()) - ); - - AssertHelpers.assertThrows("Should fail to drop namespace has tables", NamespaceNotEmptyException.class, - "is not empty. 2 tables exist.", () -> catalog.dropNamespace(tbl1.namespace())); - AssertHelpers.assertThrows("Should fail to drop namespace has tables", NamespaceNotEmptyException.class, - "is not empty. 1 tables exist.", () -> catalog.dropNamespace(tbl2.namespace())); - AssertHelpers.assertThrows("Should fail to drop namespace has tables", NamespaceNotEmptyException.class, - "is not empty. 1 tables exist.", () -> catalog.dropNamespace(tbl4.namespace())); + Lists.newArrayList(tbl0, tbl1, tbl2, tbl3, tbl4) + .forEach(t -> catalog.createTable(t, SCHEMA, PartitionSpec.unpartitioned())); + + AssertHelpers.assertThrows( + "Should fail to drop namespace has tables", + NamespaceNotEmptyException.class, + "is not empty. 2 tables exist.", + () -> catalog.dropNamespace(tbl1.namespace())); + AssertHelpers.assertThrows( + "Should fail to drop namespace has tables", + NamespaceNotEmptyException.class, + "is not empty. 1 tables exist.", + () -> catalog.dropNamespace(tbl2.namespace())); + AssertHelpers.assertThrows( + "Should fail to drop namespace has tables", + NamespaceNotEmptyException.class, + "is not empty. 
1 tables exist.", + () -> catalog.dropNamespace(tbl4.namespace())); } @Test @@ -582,7 +610,8 @@ public void testCreateNamespaceWithMetadata() { Namespace testNamespace = Namespace.of("testDb", "ns1", "ns2"); // Test with metadata - Map testMetadata = ImmutableMap.of("key_1", "value_1", "key_2", "value_2", "key_3", "value_3"); + Map testMetadata = + ImmutableMap.of("key_1", "value_1", "key_2", "value_2", "key_3", "value_3"); catalog.createNamespace(testNamespace, testMetadata); Assert.assertTrue(catalog.namespaceExists(testNamespace)); } @@ -590,13 +619,23 @@ public void testCreateNamespaceWithMetadata() { @Test public void testSetProperties() { Namespace testNamespace = Namespace.of("testDb", "ns1", "ns2"); - Map testMetadata = ImmutableMap.of("key_1", "value_1", "key_2", "value_2", - "key_3", "value_3"); + Map testMetadata = + ImmutableMap.of("key_1", "value_1", "key_2", "value_2", "key_3", "value_3"); catalog.createNamespace(testNamespace, testMetadata); // Add more properties to set to test insert and update - Map propertiesToSet = ImmutableMap.of("key_5", "value_5", "key_3", "new_value_3", - "key_1", "new_value_1", "key_4", "value_4", "key_2", "new_value_2"); + Map propertiesToSet = + ImmutableMap.of( + "key_5", + "value_5", + "key_3", + "new_value_3", + "key_1", + "new_value_1", + "key_4", + "value_4", + "key_2", + "new_value_2"); Assert.assertTrue(catalog.namespaceExists(testNamespace)); Assert.assertTrue(catalog.setProperties(testNamespace, propertiesToSet)); @@ -604,20 +643,25 @@ public void testSetProperties() { Assert.assertEquals(6, allProperties.size()); Map namespaceProperties = catalog.loadNamespaceMetadata(testNamespace); - Assert.assertEquals("All new keys should be in the namespace properties", - propertiesToSet.keySet(), Sets.intersection(propertiesToSet.keySet(), namespaceProperties.keySet())); + Assert.assertEquals( + "All new keys should be in the namespace properties", + propertiesToSet.keySet(), + Sets.intersection(propertiesToSet.keySet(), namespaceProperties.keySet())); // values should match for (Map.Entry keyValue : propertiesToSet.entrySet()) { - Assert.assertEquals("Value for key " + keyValue.getKey() + " should match", - keyValue.getValue(), namespaceProperties.get(keyValue.getKey())); + Assert.assertEquals( + "Value for key " + keyValue.getKey() + " should match", + keyValue.getValue(), + namespaceProperties.get(keyValue.getKey())); } } @Test public void testRemoveProperties() { Namespace testNamespace = Namespace.of("testDb", "ns1", "ns2"); - Map testMetadata = ImmutableMap.of("key_1", "value_1", "key_2", "value_2", - "key_3", "value_3", "key_4", "value_4"); + Map testMetadata = + ImmutableMap.of( + "key_1", "value_1", "key_2", "value_2", "key_3", "value_3", "key_4", "value_4"); catalog.createNamespace(testNamespace, testMetadata); Set propertiesToRemove = ImmutableSet.of("key_2", "key_4"); diff --git a/core/src/test/java/org/apache/iceberg/jdbc/TestJdbcTableConcurrency.java b/core/src/test/java/org/apache/iceberg/jdbc/TestJdbcTableConcurrency.java index 2467497eb3a2..7b59e0a4cec8 100644 --- a/core/src/test/java/org/apache/iceberg/jdbc/TestJdbcTableConcurrency.java +++ b/core/src/test/java/org/apache/iceberg/jdbc/TestJdbcTableConcurrency.java @@ -16,9 +16,13 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.jdbc; +import static org.apache.iceberg.TableProperties.COMMIT_MAX_RETRY_WAIT_MS; +import static org.apache.iceberg.TableProperties.COMMIT_MIN_RETRY_WAIT_MS; +import static org.apache.iceberg.TableProperties.COMMIT_NUM_RETRIES; +import static org.apache.iceberg.types.Types.NestedField.required; + import java.io.File; import java.io.IOException; import java.util.Map; @@ -46,20 +50,14 @@ import org.junit.Test; import org.junit.rules.TemporaryFolder; -import static org.apache.iceberg.TableProperties.COMMIT_MAX_RETRY_WAIT_MS; -import static org.apache.iceberg.TableProperties.COMMIT_MIN_RETRY_WAIT_MS; -import static org.apache.iceberg.TableProperties.COMMIT_NUM_RETRIES; -import static org.apache.iceberg.types.Types.NestedField.required; - public class TestJdbcTableConcurrency { static final TableIdentifier TABLE_IDENTIFIER = TableIdentifier.of("db", "test_table"); - static final Schema SCHEMA = new Schema( - required(1, "id", Types.IntegerType.get(), "unique ID"), - required(2, "data", Types.StringType.get()) - ); - @Rule - public TemporaryFolder temp = new TemporaryFolder(); + static final Schema SCHEMA = + new Schema( + required(1, "id", Types.IntegerType.get(), "unique ID"), + required(2, "data", Types.StringType.get())); + @Rule public TemporaryFolder temp = new TemporaryFolder(); File tableDir; @Test @@ -77,33 +75,37 @@ public synchronized void testConcurrentFastAppends() throws IOException { Table icebergTable = catalog.loadTable(TABLE_IDENTIFIER); String fileName = UUID.randomUUID().toString(); - DataFile file = DataFiles.builder(icebergTable.spec()) - .withPath(FileFormat.PARQUET.addExtension(fileName)) - .withRecordCount(2) - .withFileSizeInBytes(0) - .build(); + DataFile file = + DataFiles.builder(icebergTable.spec()) + .withPath(FileFormat.PARQUET.addExtension(fileName)) + .withRecordCount(2) + .withFileSizeInBytes(0) + .build(); - ExecutorService executorService = MoreExecutors.getExitingExecutorService( - (ThreadPoolExecutor) Executors.newFixedThreadPool(2)); + ExecutorService executorService = + MoreExecutors.getExitingExecutorService( + (ThreadPoolExecutor) Executors.newFixedThreadPool(2)); AtomicInteger barrier = new AtomicInteger(0); Tasks.range(2) - .stopOnFailure().throwFailureWhenFinished() + .stopOnFailure() + .throwFailureWhenFinished() .executeWith(executorService) - .run(index -> { - for (int numCommittedFiles = 0; numCommittedFiles < 10; numCommittedFiles++) { - while (barrier.get() < numCommittedFiles * 2) { - try { - Thread.sleep(10); - } catch (InterruptedException e) { - throw new RuntimeException(e); + .run( + index -> { + for (int numCommittedFiles = 0; numCommittedFiles < 10; numCommittedFiles++) { + while (barrier.get() < numCommittedFiles * 2) { + try { + Thread.sleep(10); + } catch (InterruptedException e) { + throw new RuntimeException(e); + } + } + + icebergTable.newFastAppend().appendFile(file).commit(); + barrier.incrementAndGet(); } - } - - icebergTable.newFastAppend().appendFile(file).commit(); - barrier.incrementAndGet(); - } - }); + }); icebergTable.refresh(); Assert.assertEquals(20, icebergTable.currentSnapshot().allManifests(icebergTable.io()).size()); @@ -123,21 +125,24 @@ public synchronized void testConcurrentConnections() throws InterruptedException Table icebergTable = catalog.loadTable(TABLE_IDENTIFIER); - icebergTable.updateProperties() + icebergTable + .updateProperties() .set(COMMIT_NUM_RETRIES, "20") .set(COMMIT_MIN_RETRY_WAIT_MS, "25") .set(COMMIT_MAX_RETRY_WAIT_MS, "25") .commit(); String fileName = 
UUID.randomUUID().toString(); - DataFile file = DataFiles.builder(icebergTable.spec()) - .withPath(FileFormat.PARQUET.addExtension(fileName)) - .withRecordCount(2) - .withFileSizeInBytes(0) - .build(); - - ExecutorService executorService = MoreExecutors.getExitingExecutorService( - (ThreadPoolExecutor) Executors.newFixedThreadPool(7)); + DataFile file = + DataFiles.builder(icebergTable.spec()) + .withPath(FileFormat.PARQUET.addExtension(fileName)) + .withRecordCount(2) + .withFileSizeInBytes(0) + .build(); + + ExecutorService executorService = + MoreExecutors.getExitingExecutorService( + (ThreadPoolExecutor) Executors.newFixedThreadPool(7)); for (int i = 0; i < 7; i++) { executorService.submit(() -> icebergTable.newAppend().appendFile(file).commit()); diff --git a/core/src/test/java/org/apache/iceberg/jdbc/TestJdbcUtil.java b/core/src/test/java/org/apache/iceberg/jdbc/TestJdbcUtil.java index 38c1f915761a..3577aba3ebf7 100644 --- a/core/src/test/java/org/apache/iceberg/jdbc/TestJdbcUtil.java +++ b/core/src/test/java/org/apache/iceberg/jdbc/TestJdbcUtil.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.jdbc; import java.util.Map; diff --git a/core/src/test/java/org/apache/iceberg/mapping/TestMappingUpdates.java b/core/src/test/java/org/apache/iceberg/mapping/TestMappingUpdates.java index eeee39c4f119..6b59095225d8 100644 --- a/core/src/test/java/org/apache/iceberg/mapping/TestMappingUpdates.java +++ b/core/src/test/java/org/apache/iceberg/mapping/TestMappingUpdates.java @@ -16,9 +16,10 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.mapping; +import static org.apache.iceberg.types.Types.NestedField.required; + import org.apache.iceberg.TableProperties; import org.apache.iceberg.TableTestBase; import org.apache.iceberg.relocated.com.google.common.collect.ImmutableList; @@ -28,13 +29,11 @@ import org.junit.runner.RunWith; import org.junit.runners.Parameterized; -import static org.apache.iceberg.types.Types.NestedField.required; - @RunWith(Parameterized.class) public class TestMappingUpdates extends TableTestBase { @Parameterized.Parameters(name = "formatVersion = {0}") public static Object[] parameters() { - return new Object[] { 1, 2 }; + return new Object[] {1, 2}; } public TestMappingUpdates(int formatVersion) { @@ -44,188 +43,182 @@ public TestMappingUpdates(int formatVersion) { @Test public void testAddColumnMappingUpdate() { NameMapping mapping = MappingUtil.create(table.schema()); - table.updateProperties() + table + .updateProperties() .set(TableProperties.DEFAULT_NAME_MAPPING, NameMappingParser.toJson(mapping)) .commit(); Assert.assertEquals( - MappedFields.of( - MappedField.of(1, "id"), - MappedField.of(2, "data")), + MappedFields.of(MappedField.of(1, "id"), MappedField.of(2, "data")), mapping.asMappedFields()); - table.updateSchema() - .addColumn("ts", Types.TimestampType.withZone()) - .commit(); + table.updateSchema().addColumn("ts", Types.TimestampType.withZone()).commit(); - NameMapping updated = NameMappingParser.fromJson(table.properties().get(TableProperties.DEFAULT_NAME_MAPPING)); + NameMapping updated = + NameMappingParser.fromJson(table.properties().get(TableProperties.DEFAULT_NAME_MAPPING)); Assert.assertEquals( MappedFields.of( - MappedField.of(1, "id"), - MappedField.of(2, "data"), - MappedField.of(3, "ts")), + MappedField.of(1, "id"), MappedField.of(2, "data"), MappedField.of(3, "ts")), updated.asMappedFields()); } @Test 
public void testAddNestedColumnMappingUpdate() { NameMapping mapping = MappingUtil.create(table.schema()); - table.updateProperties() + table + .updateProperties() .set(TableProperties.DEFAULT_NAME_MAPPING, NameMappingParser.toJson(mapping)) .commit(); Assert.assertEquals( - MappedFields.of( - MappedField.of(1, "id"), - MappedField.of(2, "data")), + MappedFields.of(MappedField.of(1, "id"), MappedField.of(2, "data")), mapping.asMappedFields()); - table.updateSchema() - .addColumn("point", Types.StructType.of( - required(1, "x", Types.DoubleType.get()), - required(2, "y", Types.DoubleType.get()))) + table + .updateSchema() + .addColumn( + "point", + Types.StructType.of( + required(1, "x", Types.DoubleType.get()), required(2, "y", Types.DoubleType.get()))) .commit(); - NameMapping updated = NameMappingParser.fromJson(table.properties().get(TableProperties.DEFAULT_NAME_MAPPING)); + NameMapping updated = + NameMappingParser.fromJson(table.properties().get(TableProperties.DEFAULT_NAME_MAPPING)); Assert.assertEquals( MappedFields.of( MappedField.of(1, "id"), MappedField.of(2, "data"), - MappedField.of(3, "point", MappedFields.of( - MappedField.of(4, "x"), - MappedField.of(5, "y") - ))), + MappedField.of( + 3, "point", MappedFields.of(MappedField.of(4, "x"), MappedField.of(5, "y")))), updated.asMappedFields()); - table.updateSchema() - .addColumn("point", "z", Types.DoubleType.get()) - .commit(); + table.updateSchema().addColumn("point", "z", Types.DoubleType.get()).commit(); - NameMapping pointUpdated = NameMappingParser.fromJson(table.properties().get(TableProperties.DEFAULT_NAME_MAPPING)); + NameMapping pointUpdated = + NameMappingParser.fromJson(table.properties().get(TableProperties.DEFAULT_NAME_MAPPING)); Assert.assertEquals( MappedFields.of( MappedField.of(1, "id"), MappedField.of(2, "data"), - MappedField.of(3, "point", MappedFields.of( - MappedField.of(4, "x"), - MappedField.of(5, "y"), - MappedField.of(6, "z") - ))), + MappedField.of( + 3, + "point", + MappedFields.of( + MappedField.of(4, "x"), MappedField.of(5, "y"), MappedField.of(6, "z")))), pointUpdated.asMappedFields()); } @Test public void testRenameMappingUpdate() { NameMapping mapping = MappingUtil.create(table.schema()); - table.updateProperties() + table + .updateProperties() .set(TableProperties.DEFAULT_NAME_MAPPING, NameMappingParser.toJson(mapping)) .commit(); Assert.assertEquals( - MappedFields.of( - MappedField.of(1, "id"), - MappedField.of(2, "data")), + MappedFields.of(MappedField.of(1, "id"), MappedField.of(2, "data")), mapping.asMappedFields()); - table.updateSchema() - .renameColumn("id", "object_id") - .commit(); + table.updateSchema().renameColumn("id", "object_id").commit(); - NameMapping updated = NameMappingParser.fromJson(table.properties().get(TableProperties.DEFAULT_NAME_MAPPING)); + NameMapping updated = + NameMappingParser.fromJson(table.properties().get(TableProperties.DEFAULT_NAME_MAPPING)); Assert.assertEquals( MappedFields.of( - MappedField.of(1, ImmutableList.of("id", "object_id")), - MappedField.of(2, "data")), + MappedField.of(1, ImmutableList.of("id", "object_id")), MappedField.of(2, "data")), updated.asMappedFields()); } @Test public void testRenameNestedFieldMappingUpdate() { NameMapping mapping = MappingUtil.create(table.schema()); - table.updateProperties() + table + .updateProperties() .set(TableProperties.DEFAULT_NAME_MAPPING, NameMappingParser.toJson(mapping)) .commit(); - table.updateSchema() - .addColumn("point", Types.StructType.of( - required(1, "x", Types.DoubleType.get()), - required(2, 
"y", Types.DoubleType.get()))) + table + .updateSchema() + .addColumn( + "point", + Types.StructType.of( + required(1, "x", Types.DoubleType.get()), required(2, "y", Types.DoubleType.get()))) .commit(); - NameMapping updated = NameMappingParser.fromJson(table.properties().get(TableProperties.DEFAULT_NAME_MAPPING)); + NameMapping updated = + NameMappingParser.fromJson(table.properties().get(TableProperties.DEFAULT_NAME_MAPPING)); Assert.assertEquals( MappedFields.of( MappedField.of(1, "id"), MappedField.of(2, "data"), - MappedField.of(3, "point", MappedFields.of( - MappedField.of(4, "x"), - MappedField.of(5, "y") - ))), + MappedField.of( + 3, "point", MappedFields.of(MappedField.of(4, "x"), MappedField.of(5, "y")))), updated.asMappedFields()); - table.updateSchema() - .renameColumn("point.x", "X") - .renameColumn("point.y", "Y") - .commit(); + table.updateSchema().renameColumn("point.x", "X").renameColumn("point.y", "Y").commit(); - NameMapping pointUpdated = NameMappingParser.fromJson(table.properties().get(TableProperties.DEFAULT_NAME_MAPPING)); + NameMapping pointUpdated = + NameMappingParser.fromJson(table.properties().get(TableProperties.DEFAULT_NAME_MAPPING)); Assert.assertEquals( MappedFields.of( MappedField.of(1, "id"), MappedField.of(2, "data"), - MappedField.of(3, "point", MappedFields.of( - MappedField.of(4, ImmutableList.of("x", "X")), - MappedField.of(5, ImmutableList.of("y", "Y")) - ))), + MappedField.of( + 3, + "point", + MappedFields.of( + MappedField.of(4, ImmutableList.of("x", "X")), + MappedField.of(5, ImmutableList.of("y", "Y"))))), pointUpdated.asMappedFields()); } - @Test public void testRenameComplexFieldMappingUpdate() { NameMapping mapping = MappingUtil.create(table.schema()); - table.updateProperties() + table + .updateProperties() .set(TableProperties.DEFAULT_NAME_MAPPING, NameMappingParser.toJson(mapping)) .commit(); - table.updateSchema() - .addColumn("point", Types.StructType.of( - required(1, "x", Types.DoubleType.get()), - required(2, "y", Types.DoubleType.get()))) + table + .updateSchema() + .addColumn( + "point", + Types.StructType.of( + required(1, "x", Types.DoubleType.get()), required(2, "y", Types.DoubleType.get()))) .commit(); - NameMapping updated = NameMappingParser.fromJson(table.properties().get(TableProperties.DEFAULT_NAME_MAPPING)); + NameMapping updated = + NameMappingParser.fromJson(table.properties().get(TableProperties.DEFAULT_NAME_MAPPING)); Assert.assertEquals( MappedFields.of( MappedField.of(1, "id"), MappedField.of(2, "data"), - MappedField.of(3, "point", MappedFields.of( - MappedField.of(4, "x"), - MappedField.of(5, "y") - ))), + MappedField.of( + 3, "point", MappedFields.of(MappedField.of(4, "x"), MappedField.of(5, "y")))), updated.asMappedFields()); - table.updateSchema() - .renameColumn("point", "p2") - .commit(); + table.updateSchema().renameColumn("point", "p2").commit(); - NameMapping pointUpdated = NameMappingParser.fromJson(table.properties().get(TableProperties.DEFAULT_NAME_MAPPING)); + NameMapping pointUpdated = + NameMappingParser.fromJson(table.properties().get(TableProperties.DEFAULT_NAME_MAPPING)); Assert.assertEquals( MappedFields.of( MappedField.of(1, "id"), MappedField.of(2, "data"), - MappedField.of(3, ImmutableList.of("point", "p2"), MappedFields.of( - MappedField.of(4, "x"), - MappedField.of(5, "y") - ))), + MappedField.of( + 3, + ImmutableList.of("point", "p2"), + MappedFields.of(MappedField.of(4, "x"), MappedField.of(5, "y")))), pointUpdated.asMappedFields()); } } diff --git 
a/core/src/test/java/org/apache/iceberg/mapping/TestNameMapping.java b/core/src/test/java/org/apache/iceberg/mapping/TestNameMapping.java index c311cec30465..30cda4886015 100644 --- a/core/src/test/java/org/apache/iceberg/mapping/TestNameMapping.java +++ b/core/src/test/java/org/apache/iceberg/mapping/TestNameMapping.java @@ -16,27 +16,24 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.mapping; +import static org.apache.iceberg.types.Types.NestedField.required; + import org.apache.iceberg.AssertHelpers; import org.apache.iceberg.Schema; import org.apache.iceberg.types.Types; import org.junit.Assert; import org.junit.Test; -import static org.apache.iceberg.types.Types.NestedField.required; - public class TestNameMapping { @Test public void testFlatSchemaToMapping() { - Schema schema = new Schema( - required(1, "id", Types.LongType.get()), - required(2, "data", Types.StringType.get())); + Schema schema = + new Schema( + required(1, "id", Types.LongType.get()), required(2, "data", Types.StringType.get())); - MappedFields expected = MappedFields.of( - MappedField.of(1, "id"), - MappedField.of(2, "data")); + MappedFields expected = MappedFields.of(MappedField.of(1, "id"), MappedField.of(2, "data")); NameMapping mapping = MappingUtil.create(schema); Assert.assertEquals(expected, mapping.asMappedFields()); @@ -44,21 +41,25 @@ public void testFlatSchemaToMapping() { @Test public void testNestedStructSchemaToMapping() { - Schema schema = new Schema( - required(1, "id", Types.LongType.get()), - required(2, "data", Types.StringType.get()), - required(3, "location", Types.StructType.of( - required(4, "latitude", Types.FloatType.get()), - required(5, "longitude", Types.FloatType.get()) - ))); - - MappedFields expected = MappedFields.of( - MappedField.of(1, "id"), - MappedField.of(2, "data"), - MappedField.of(3, "location", MappedFields.of( - MappedField.of(4, "latitude"), - MappedField.of(5, "longitude") - ))); + Schema schema = + new Schema( + required(1, "id", Types.LongType.get()), + required(2, "data", Types.StringType.get()), + required( + 3, + "location", + Types.StructType.of( + required(4, "latitude", Types.FloatType.get()), + required(5, "longitude", Types.FloatType.get())))); + + MappedFields expected = + MappedFields.of( + MappedField.of(1, "id"), + MappedField.of(2, "data"), + MappedField.of( + 3, + "location", + MappedFields.of(MappedField.of(4, "latitude"), MappedField.of(5, "longitude")))); NameMapping mapping = MappingUtil.create(schema); Assert.assertEquals(expected, mapping.asMappedFields()); @@ -66,20 +67,21 @@ public void testNestedStructSchemaToMapping() { @Test public void testMapSchemaToMapping() { - Schema schema = new Schema( - required(1, "id", Types.LongType.get()), - required(2, "data", Types.StringType.get()), - required(3, "map", Types.MapType.ofRequired(4, 5, - Types.StringType.get(), - Types.DoubleType.get()))); - - MappedFields expected = MappedFields.of( - MappedField.of(1, "id"), - MappedField.of(2, "data"), - MappedField.of(3, "map", MappedFields.of( - MappedField.of(4, "key"), - MappedField.of(5, "value") - ))); + Schema schema = + new Schema( + required(1, "id", Types.LongType.get()), + required(2, "data", Types.StringType.get()), + required( + 3, + "map", + Types.MapType.ofRequired(4, 5, Types.StringType.get(), Types.DoubleType.get()))); + + MappedFields expected = + MappedFields.of( + MappedField.of(1, "id"), + MappedField.of(2, "data"), + MappedField.of( + 3, "map", 
MappedFields.of(MappedField.of(4, "key"), MappedField.of(5, "value")))); NameMapping mapping = MappingUtil.create(schema); Assert.assertEquals(expected, mapping.asMappedFields()); @@ -87,25 +89,32 @@ public void testMapSchemaToMapping() { @Test public void testComplexKeyMapSchemaToMapping() { - Schema schema = new Schema( - required(1, "id", Types.LongType.get()), - required(2, "data", Types.StringType.get()), - required(3, "map", Types.MapType.ofRequired(4, 5, - Types.StructType.of( - required(6, "x", Types.DoubleType.get()), - required(7, "y", Types.DoubleType.get())), - Types.DoubleType.get()))); - - MappedFields expected = MappedFields.of( - MappedField.of(1, "id"), - MappedField.of(2, "data"), - MappedField.of(3, "map", MappedFields.of( - MappedField.of(4, "key", MappedFields.of( - MappedField.of(6, "x"), - MappedField.of(7, "y") - )), - MappedField.of(5, "value") - ))); + Schema schema = + new Schema( + required(1, "id", Types.LongType.get()), + required(2, "data", Types.StringType.get()), + required( + 3, + "map", + Types.MapType.ofRequired( + 4, + 5, + Types.StructType.of( + required(6, "x", Types.DoubleType.get()), + required(7, "y", Types.DoubleType.get())), + Types.DoubleType.get()))); + + MappedFields expected = + MappedFields.of( + MappedField.of(1, "id"), + MappedField.of(2, "data"), + MappedField.of( + 3, + "map", + MappedFields.of( + MappedField.of( + 4, "key", MappedFields.of(MappedField.of(6, "x"), MappedField.of(7, "y"))), + MappedField.of(5, "value")))); NameMapping mapping = MappingUtil.create(schema); Assert.assertEquals(expected, mapping.asMappedFields()); @@ -113,26 +122,34 @@ public void testComplexKeyMapSchemaToMapping() { @Test public void testComplexValueMapSchemaToMapping() { - Schema schema = new Schema( - required(1, "id", Types.LongType.get()), - required(2, "data", Types.StringType.get()), - required(3, "map", Types.MapType.ofRequired(4, 5, - Types.DoubleType.get(), - Types.StructType.of( - required(6, "x", Types.DoubleType.get()), - required(7, "y", Types.DoubleType.get())) - ))); - - MappedFields expected = MappedFields.of( - MappedField.of(1, "id"), - MappedField.of(2, "data"), - MappedField.of(3, "map", MappedFields.of( - MappedField.of(4, "key"), - MappedField.of(5, "value", MappedFields.of( - MappedField.of(6, "x"), - MappedField.of(7, "y") - )) - ))); + Schema schema = + new Schema( + required(1, "id", Types.LongType.get()), + required(2, "data", Types.StringType.get()), + required( + 3, + "map", + Types.MapType.ofRequired( + 4, + 5, + Types.DoubleType.get(), + Types.StructType.of( + required(6, "x", Types.DoubleType.get()), + required(7, "y", Types.DoubleType.get()))))); + + MappedFields expected = + MappedFields.of( + MappedField.of(1, "id"), + MappedField.of(2, "data"), + MappedField.of( + 3, + "map", + MappedFields.of( + MappedField.of(4, "key"), + MappedField.of( + 5, + "value", + MappedFields.of(MappedField.of(6, "x"), MappedField.of(7, "y")))))); NameMapping mapping = MappingUtil.create(schema); Assert.assertEquals(expected, mapping.asMappedFields()); @@ -140,17 +157,17 @@ public void testComplexValueMapSchemaToMapping() { @Test public void testListSchemaToMapping() { - Schema schema = new Schema( - required(1, "id", Types.LongType.get()), - required(2, "data", Types.StringType.get()), - required(3, "list", Types.ListType.ofRequired(4, Types.StringType.get()))); - - MappedFields expected = MappedFields.of( - MappedField.of(1, "id"), - MappedField.of(2, "data"), - MappedField.of(3, "list", MappedFields.of( - MappedField.of(4, "element") - ))); 
+ Schema schema = + new Schema( + required(1, "id", Types.LongType.get()), + required(2, "data", Types.StringType.get()), + required(3, "list", Types.ListType.ofRequired(4, Types.StringType.get()))); + + MappedFields expected = + MappedFields.of( + MappedField.of(1, "id"), + MappedField.of(2, "data"), + MappedField.of(3, "list", MappedFields.of(MappedField.of(4, "element")))); NameMapping mapping = MappingUtil.create(schema); Assert.assertEquals(expected, mapping.asMappedFields()); @@ -159,44 +176,56 @@ public void testListSchemaToMapping() { @Test public void testFailsDuplicateId() { // the schema can be created because ID indexing is lazy - AssertHelpers.assertThrows("Should fail if IDs are reused", - IllegalArgumentException.class, "Multiple entries with same", - () -> new Schema( - required(1, "id", Types.LongType.get()), - required(1, "data", Types.StringType.get()))); + AssertHelpers.assertThrows( + "Should fail if IDs are reused", + IllegalArgumentException.class, + "Multiple entries with same", + () -> + new Schema( + required(1, "id", Types.LongType.get()), + required(1, "data", Types.StringType.get()))); } @Test public void testFailsDuplicateName() { - AssertHelpers.assertThrows("Should fail if names are reused", - IllegalArgumentException.class, "Multiple entries with same key", + AssertHelpers.assertThrows( + "Should fail if names are reused", + IllegalArgumentException.class, + "Multiple entries with same key", () -> new NameMapping(MappedFields.of(MappedField.of(1, "x"), MappedField.of(2, "x")))); } @Test public void testAllowsDuplicateNamesInSeparateContexts() { - new NameMapping(MappedFields.of( - MappedField.of(1, "x", MappedFields.of(MappedField.of(3, "x"))), - MappedField.of(2, "y", MappedFields.of(MappedField.of(4, "x"))) - )); + new NameMapping( + MappedFields.of( + MappedField.of(1, "x", MappedFields.of(MappedField.of(3, "x"))), + MappedField.of(2, "y", MappedFields.of(MappedField.of(4, "x"))))); } @Test public void testMappingFindById() { - Schema schema = new Schema( - required(1, "id", Types.LongType.get()), - required(2, "data", Types.StringType.get()), - required(3, "map", Types.MapType.ofRequired(4, 5, - Types.DoubleType.get(), - Types.StructType.of( - required(6, "x", Types.DoubleType.get()), - required(7, "y", Types.DoubleType.get())))), - required(8, "list", Types.ListType.ofRequired(9, - Types.StringType.get())), - required(10, "location", Types.StructType.of( - required(11, "latitude", Types.FloatType.get()), - required(12, "longitude", Types.FloatType.get()) - ))); + Schema schema = + new Schema( + required(1, "id", Types.LongType.get()), + required(2, "data", Types.StringType.get()), + required( + 3, + "map", + Types.MapType.ofRequired( + 4, + 5, + Types.DoubleType.get(), + Types.StructType.of( + required(6, "x", Types.DoubleType.get()), + required(7, "y", Types.DoubleType.get())))), + required(8, "list", Types.ListType.ofRequired(9, Types.StringType.get())), + required( + 10, + "location", + Types.StructType.of( + required(11, "latitude", Types.FloatType.get()), + required(12, "longitude", Types.FloatType.get())))); NameMapping mapping = MappingUtil.create(schema); @@ -206,32 +235,41 @@ public void testMappingFindById() { Assert.assertEquals(MappedField.of(9, "element"), mapping.find(9)); Assert.assertEquals(MappedField.of(11, "latitude"), mapping.find(11)); Assert.assertEquals( - MappedField.of(10, "location", MappedFields.of( - MappedField.of(11, "latitude"), - MappedField.of(12, "longitude"))), + MappedField.of( + 10, + "location", + 
MappedFields.of(MappedField.of(11, "latitude"), MappedField.of(12, "longitude"))), mapping.find(10)); } @Test public void testMappingFindByName() { - Schema schema = new Schema( - required(1, "id", Types.LongType.get()), - required(2, "data", Types.StringType.get()), - required(3, "map", Types.MapType.ofRequired(4, 5, - Types.DoubleType.get(), - Types.StructType.of( - required(6, "x", Types.DoubleType.get()), - required(7, "y", Types.DoubleType.get())))), - required(8, "list", Types.ListType.ofRequired(9, - Types.StringType.get())), - required(10, "location", Types.StructType.of( - required(11, "latitude", Types.FloatType.get()), - required(12, "longitude", Types.FloatType.get()) - ))); + Schema schema = + new Schema( + required(1, "id", Types.LongType.get()), + required(2, "data", Types.StringType.get()), + required( + 3, + "map", + Types.MapType.ofRequired( + 4, + 5, + Types.DoubleType.get(), + Types.StructType.of( + required(6, "x", Types.DoubleType.get()), + required(7, "y", Types.DoubleType.get())))), + required(8, "list", Types.ListType.ofRequired(9, Types.StringType.get())), + required( + 10, + "location", + Types.StructType.of( + required(11, "latitude", Types.FloatType.get()), + required(12, "longitude", Types.FloatType.get())))); NameMapping mapping = MappingUtil.create(schema); - Assert.assertNull("Should not return a field mapping for a nested name", mapping.find("element")); + Assert.assertNull( + "Should not return a field mapping for a nested name", mapping.find("element")); Assert.assertNull("Should not return a field mapping for a nested name", mapping.find("x")); Assert.assertNull("Should not return a field mapping for a nested name", mapping.find("key")); Assert.assertNull("Should not return a field mapping for a nested name", mapping.find("value")); @@ -240,9 +278,10 @@ public void testMappingFindByName() { Assert.assertEquals(MappedField.of(9, "element"), mapping.find("list", "element")); Assert.assertEquals(MappedField.of(11, "latitude"), mapping.find("location", "latitude")); Assert.assertEquals( - MappedField.of(10, "location", MappedFields.of( - MappedField.of(11, "latitude"), - MappedField.of(12, "longitude"))), + MappedField.of( + 10, + "location", + MappedFields.of(MappedField.of(11, "latitude"), MappedField.of(12, "longitude"))), mapping.find("location")); } } diff --git a/core/src/test/java/org/apache/iceberg/puffin/PuffinFormatTestUtil.java b/core/src/test/java/org/apache/iceberg/puffin/PuffinFormatTestUtil.java index 62a67c27c308..d12810a4eb15 100644 --- a/core/src/test/java/org/apache/iceberg/puffin/PuffinFormatTestUtil.java +++ b/core/src/test/java/org/apache/iceberg/puffin/PuffinFormatTestUtil.java @@ -16,14 +16,12 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.puffin; import org.apache.iceberg.relocated.com.google.common.io.Resources; public final class PuffinFormatTestUtil { - private PuffinFormatTestUtil() { - } + private PuffinFormatTestUtil() {} // footer size for v1/empty-puffin-uncompressed.bin public static final long EMPTY_PUFFIN_UNCOMPRESSED_FOOTER_SIZE = 28; diff --git a/core/src/test/java/org/apache/iceberg/puffin/TestFileMetadataParser.java b/core/src/test/java/org/apache/iceberg/puffin/TestFileMetadataParser.java index 3922d227389b..c16b23237e1d 100644 --- a/core/src/test/java/org/apache/iceberg/puffin/TestFileMetadataParser.java +++ b/core/src/test/java/org/apache/iceberg/puffin/TestFileMetadataParser.java @@ -16,17 +16,16 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.puffin; +import static org.assertj.core.api.Assertions.assertThat; +import static org.assertj.core.api.Assertions.assertThatThrownBy; + import java.io.UncheckedIOException; import org.apache.iceberg.relocated.com.google.common.collect.ImmutableList; import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap; import org.junit.Test; -import static org.assertj.core.api.Assertions.assertThat; -import static org.assertj.core.api.Assertions.assertThatThrownBy; - public class TestFileMetadataParser { @Test public void testInvalidJson() { @@ -51,33 +50,31 @@ public void testInvalidJson() { public void testMinimalFileMetadata() { testJsonSerialization( new FileMetadata(ImmutableList.of(), ImmutableMap.of()), - "{\n" + - " \"blobs\" : [ ]\n" + - "}"); + "{\n" + " \"blobs\" : [ ]\n" + "}"); } @Test public void testFileProperties() { testJsonSerialization( new FileMetadata(ImmutableList.of(), ImmutableMap.of("a property", "a property value")), - "{\n" + - " \"blobs\" : [ ],\n" + - " \"properties\" : {\n" + - " \"a property\" : \"a property value\"\n" + - " }\n" + - "}"); + "{\n" + + " \"blobs\" : [ ],\n" + + " \"properties\" : {\n" + + " \"a property\" : \"a property value\"\n" + + " }\n" + + "}"); testJsonSerialization( new FileMetadata( ImmutableList.of(), ImmutableMap.of("a property", "a property value", "another one", "also with value")), - "{\n" + - " \"blobs\" : [ ],\n" + - " \"properties\" : {\n" + - " \"a property\" : \"a property value\",\n" + - " \"another one\" : \"also with value\"\n" + - " }\n" + - "}"); + "{\n" + + " \"blobs\" : [ ],\n" + + " \"properties\" : {\n" + + " \"a property\" : \"a property value\",\n" + + " \"another one\" : \"also with value\"\n" + + " }\n" + + "}"); } @Test @@ -99,27 +96,35 @@ public void testBlobMetadata() { testJsonSerialization( new FileMetadata( ImmutableList.of( - new BlobMetadata("type-a", ImmutableList.of(1), 14, 3, 4, 16, null, ImmutableMap.of()), - new BlobMetadata("type-bbb", ImmutableList.of(2, 3, 4), 77, 4, Integer.MAX_VALUE * 10000L, 79834, null, + new BlobMetadata( + "type-a", ImmutableList.of(1), 14, 3, 4, 16, null, ImmutableMap.of()), + new BlobMetadata( + "type-bbb", + ImmutableList.of(2, 3, 4), + 77, + 4, + Integer.MAX_VALUE * 10000L, + 79834, + null, ImmutableMap.of())), ImmutableMap.of()), - "{\n" + - " \"blobs\" : [ {\n" + - " \"type\" : \"type-a\",\n" + - " \"fields\" : [ 1 ],\n" + - " \"snapshot-id\" : 14,\n" + - " \"sequence-number\" : 3,\n" + - " \"offset\" : 4,\n" + - " \"length\" : 16\n" + - " }, {\n" + - " \"type\" : \"type-bbb\",\n" + - " \"fields\" : [ 2, 3, 4 ],\n" + - " \"snapshot-id\" : 77,\n" + - " \"sequence-number\" : 4,\n" + - " \"offset\" : 21474836470000,\n" + - " \"length\" : 79834\n" 
+ - " } ]\n" + - "}"); + "{\n" + + " \"blobs\" : [ {\n" + + " \"type\" : \"type-a\",\n" + + " \"fields\" : [ 1 ],\n" + + " \"snapshot-id\" : 14,\n" + + " \"sequence-number\" : 3,\n" + + " \"offset\" : 4,\n" + + " \"length\" : 16\n" + + " }, {\n" + + " \"type\" : \"type-bbb\",\n" + + " \"fields\" : [ 2, 3, 4 ],\n" + + " \"snapshot-id\" : 77,\n" + + " \"sequence-number\" : 4,\n" + + " \"offset\" : 21474836470000,\n" + + " \"length\" : 79834\n" + + " } ]\n" + + "}"); } @Test @@ -128,46 +133,54 @@ public void testBlobProperties() { new FileMetadata( ImmutableList.of( new BlobMetadata( - "type-a", ImmutableList.of(1), 14, 3, 4, 16, null, + "type-a", + ImmutableList.of(1), + 14, + 3, + 4, + 16, + null, ImmutableMap.of("some key", "some value"))), ImmutableMap.of()), - "{\n" + - " \"blobs\" : [ {\n" + - " \"type\" : \"type-a\",\n" + - " \"fields\" : [ 1 ],\n" + - " \"snapshot-id\" : 14,\n" + - " \"sequence-number\" : 3,\n" + - " \"offset\" : 4,\n" + - " \"length\" : 16,\n" + - " \"properties\" : {\n" + - " \"some key\" : \"some value\"\n" + - " }\n" + - " } ]\n" + - "}"); + "{\n" + + " \"blobs\" : [ {\n" + + " \"type\" : \"type-a\",\n" + + " \"fields\" : [ 1 ],\n" + + " \"snapshot-id\" : 14,\n" + + " \"sequence-number\" : 3,\n" + + " \"offset\" : 4,\n" + + " \"length\" : 16,\n" + + " \"properties\" : {\n" + + " \"some key\" : \"some value\"\n" + + " }\n" + + " } ]\n" + + "}"); } @Test public void testFieldNumberOutOfRange() { - assertThatThrownBy(() -> FileMetadataParser.fromJson( - "{\n" + - " \"blobs\" : [ {\n" + - " \"type\" : \"type-a\",\n" + - " \"fields\" : [ " + (Integer.MAX_VALUE + 1L) + " ],\n" + - " \"offset\" : 4,\n" + - " \"length\" : 16\n" + - " } ]\n" + - "}")) + assertThatThrownBy( + () -> + FileMetadataParser.fromJson( + "{\n" + + " \"blobs\" : [ {\n" + + " \"type\" : \"type-a\",\n" + + " \"fields\" : [ " + + (Integer.MAX_VALUE + 1L) + + " ],\n" + + " \"offset\" : 4,\n" + + " \"length\" : 16\n" + + " } ]\n" + + "}")) .isInstanceOf(IllegalArgumentException.class) .hasMessage("Cannot parse integer from non-int value: 2147483648"); } private void testJsonSerialization(FileMetadata fileMetadata, String json) { - assertThat(FileMetadataParser.toJson(fileMetadata, true)) - .isEqualTo(json); + assertThat(FileMetadataParser.toJson(fileMetadata, true)).isEqualTo(json); // Test round-trip. Note that FileMetadata doesn't implement equals() FileMetadata parsed = FileMetadataParser.fromJson(json); - assertThat(FileMetadataParser.toJson(parsed, true)) - .isEqualTo(json); + assertThat(FileMetadataParser.toJson(parsed, true)).isEqualTo(json); } } diff --git a/core/src/test/java/org/apache/iceberg/puffin/TestPuffinFormat.java b/core/src/test/java/org/apache/iceberg/puffin/TestPuffinFormat.java index b54a81cc7661..191658072911 100644 --- a/core/src/test/java/org/apache/iceberg/puffin/TestPuffinFormat.java +++ b/core/src/test/java/org/apache/iceberg/puffin/TestPuffinFormat.java @@ -16,9 +16,13 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.puffin; +import static org.apache.iceberg.puffin.PuffinFormat.readIntegerLittleEndian; +import static org.apache.iceberg.puffin.PuffinFormat.writeIntegerLittleEndian; +import static org.apache.iceberg.relocated.com.google.common.base.Preconditions.checkArgument; +import static org.assertj.core.api.Assertions.assertThat; + import java.io.ByteArrayOutputStream; import java.nio.ByteBuffer; import java.nio.ByteOrder; @@ -26,11 +30,6 @@ import org.apache.iceberg.relocated.com.google.common.base.Preconditions; import org.junit.Test; -import static org.apache.iceberg.puffin.PuffinFormat.readIntegerLittleEndian; -import static org.apache.iceberg.puffin.PuffinFormat.writeIntegerLittleEndian; -import static org.apache.iceberg.relocated.com.google.common.base.Preconditions.checkArgument; -import static org.assertj.core.api.Assertions.assertThat; - public class TestPuffinFormat { @Test public void testWriteIntegerLittleEndian() throws Exception { @@ -66,9 +65,10 @@ public void testReadIntegerLittleEndian() { private void testReadIntegerLittleEndian(byte[] input, int offset, int expected) { // Sanity check: validate the expectation Preconditions.checkArgument( - expected == ByteBuffer.wrap(input.clone(), offset, input.length - offset) - .order(ByteOrder.LITTLE_ENDIAN) - .getInt(), + expected + == ByteBuffer.wrap(input.clone(), offset, input.length - offset) + .order(ByteOrder.LITTLE_ENDIAN) + .getInt(), "Invalid expected value"); // actual test assertThat(readIntegerLittleEndian(input, offset)).isEqualTo(expected); diff --git a/core/src/test/java/org/apache/iceberg/puffin/TestPuffinReader.java b/core/src/test/java/org/apache/iceberg/puffin/TestPuffinReader.java index 867c8bcd1dcb..d3be12d81811 100644 --- a/core/src/test/java/org/apache/iceberg/puffin/TestPuffinReader.java +++ b/core/src/test/java/org/apache/iceberg/puffin/TestPuffinReader.java @@ -16,19 +16,8 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.puffin; -import java.util.Map; -import javax.annotation.Nullable; -import org.apache.iceberg.io.InMemoryInputFile; -import org.apache.iceberg.relocated.com.google.common.collect.ImmutableList; -import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap; -import org.apache.iceberg.relocated.com.google.common.collect.Streams; -import org.apache.iceberg.util.ByteBuffers; -import org.apache.iceberg.util.Pair; -import org.junit.Test; - import static java.nio.charset.StandardCharsets.UTF_8; import static org.apache.iceberg.puffin.PuffinCompressionCodec.NONE; import static org.apache.iceberg.puffin.PuffinCompressionCodec.ZSTD; @@ -39,6 +28,16 @@ import static org.assertj.core.api.Assertions.assertThat; import static org.assertj.core.api.Assertions.assertThatThrownBy; +import java.util.Map; +import javax.annotation.Nullable; +import org.apache.iceberg.io.InMemoryInputFile; +import org.apache.iceberg.relocated.com.google.common.collect.ImmutableList; +import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap; +import org.apache.iceberg.relocated.com.google.common.collect.Streams; +import org.apache.iceberg.util.ByteBuffers; +import org.apache.iceberg.util.Pair; +import org.junit.Test; + public class TestPuffinReader { @Test public void testEmptyFooterUncompressed() throws Exception { @@ -52,17 +51,14 @@ public void testEmptyWithUnknownFooterSize() throws Exception { private void testEmpty(String resourceName, @Nullable Long footerSize) throws Exception { InMemoryInputFile inputFile = new InMemoryInputFile(readTestResource(resourceName)); - Puffin.ReadBuilder readBuilder = Puffin.read(inputFile) - .withFileSize(inputFile.getLength()); + Puffin.ReadBuilder readBuilder = Puffin.read(inputFile).withFileSize(inputFile.getLength()); if (footerSize != null) { readBuilder = readBuilder.withFooterSize(footerSize); } try (PuffinReader reader = readBuilder.build()) { FileMetadata fileMetadata = reader.fileMetadata(); - assertThat(fileMetadata.properties()).as("file properties") - .isEqualTo(ImmutableMap.of()); - assertThat(fileMetadata.blobs()).as("blob list") - .isEmpty(); + assertThat(fileMetadata.properties()).as("file properties").isEqualTo(ImmutableMap.of()); + assertThat(fileMetadata.blobs()).as("blob list").isEmpty(); } } @@ -78,18 +74,17 @@ public void testWrongFooterSize() throws Exception { testWrongFooterSize(resourceName, footerSize + 10000, "Invalid footer size"); } - private void testWrongFooterSize(String resourceName, long wrongFooterSize, String expectedMessagePrefix) - throws Exception { + private void testWrongFooterSize( + String resourceName, long wrongFooterSize, String expectedMessagePrefix) throws Exception { InMemoryInputFile inputFile = new InMemoryInputFile(readTestResource(resourceName)); - Puffin.ReadBuilder builder = Puffin.read(inputFile) - .withFileSize(inputFile.getLength()) - .withFooterSize(wrongFooterSize); + Puffin.ReadBuilder builder = + Puffin.read(inputFile).withFileSize(inputFile.getLength()).withFooterSize(wrongFooterSize); assertThatThrownBy( - () -> { - try (PuffinReader reader = builder.build()) { - reader.fileMetadata(); - } - }) + () -> { + try (PuffinReader reader = builder.build()) { + reader.fileMetadata(); + } + }) .hasMessageStartingWith(expectedMessagePrefix); } @@ -103,49 +98,58 @@ public void testReadMetricDataCompressedZstd() throws Exception { testReadMetricData("v1/sample-metric-data-compressed-zstd.bin", ZSTD); } - private void testReadMetricData(String resourceName, 
PuffinCompressionCodec expectedCodec) throws Exception { + private void testReadMetricData(String resourceName, PuffinCompressionCodec expectedCodec) + throws Exception { InMemoryInputFile inputFile = new InMemoryInputFile(readTestResource(resourceName)); try (PuffinReader reader = Puffin.read(inputFile).build()) { FileMetadata fileMetadata = reader.fileMetadata(); - assertThat(fileMetadata.properties()).as("file properties") + assertThat(fileMetadata.properties()) + .as("file properties") .isEqualTo(ImmutableMap.of("created-by", "Test 1234")); - assertThat(fileMetadata.blobs()).as("blob list") - .hasSize(2); + assertThat(fileMetadata.blobs()).as("blob list").hasSize(2); BlobMetadata firstBlob = fileMetadata.blobs().get(0); assertThat(firstBlob.type()).as("type").isEqualTo("some-blob"); assertThat(firstBlob.inputFields()).as("columns").isEqualTo(ImmutableList.of(1)); assertThat(firstBlob.offset()).as("offset").isEqualTo(4); - assertThat(firstBlob.compressionCodec()).as("compression codec") + assertThat(firstBlob.compressionCodec()) + .as("compression codec") .isEqualTo(expectedCodec.codecName()); BlobMetadata secondBlob = fileMetadata.blobs().get(1); assertThat(secondBlob.type()).as("type").isEqualTo("some-other-blob"); assertThat(secondBlob.inputFields()).as("columns").isEqualTo(ImmutableList.of(2)); - assertThat(secondBlob.offset()).as("offset") + assertThat(secondBlob.offset()) + .as("offset") .isEqualTo(firstBlob.offset() + firstBlob.length()); - assertThat(secondBlob.compressionCodec()).as("compression codec") + assertThat(secondBlob.compressionCodec()) + .as("compression codec") .isEqualTo(expectedCodec.codecName()); - Map read = Streams.stream(reader.readAll(ImmutableList.of(firstBlob, secondBlob))) - .collect(toImmutableMap(Pair::first, pair -> ByteBuffers.toByteArray(pair.second()))); + Map read = + Streams.stream(reader.readAll(ImmutableList.of(firstBlob, secondBlob))) + .collect(toImmutableMap(Pair::first, pair -> ByteBuffers.toByteArray(pair.second()))); - assertThat(read).as("read") + assertThat(read) + .as("read") .containsOnlyKeys(firstBlob, secondBlob) .containsEntry(firstBlob, "abcdefghi".getBytes(UTF_8)) .containsEntry( secondBlob, - "some blob \u0000 binary data 🤯 that is not very very very very very very long, is it?".getBytes(UTF_8)); + "some blob \u0000 binary data 🤯 that is not very very very very very very long, is it?" 
+ .getBytes(UTF_8)); } } @Test public void testValidateFooterSizeValue() throws Exception { // Ensure the definition of SAMPLE_METRIC_DATA_COMPRESSED_ZSTD_FOOTER_SIZE remains accurate - InMemoryInputFile inputFile = new InMemoryInputFile(readTestResource("v1/sample-metric-data-compressed-zstd.bin")); - try (PuffinReader reader = Puffin.read(inputFile) - .withFooterSize(SAMPLE_METRIC_DATA_COMPRESSED_ZSTD_FOOTER_SIZE) - .build()) { + InMemoryInputFile inputFile = + new InMemoryInputFile(readTestResource("v1/sample-metric-data-compressed-zstd.bin")); + try (PuffinReader reader = + Puffin.read(inputFile) + .withFooterSize(SAMPLE_METRIC_DATA_COMPRESSED_ZSTD_FOOTER_SIZE) + .build()) { assertThat(reader.fileMetadata().properties()) .isEqualTo(ImmutableMap.of("created-by", "Test 1234")); } diff --git a/core/src/test/java/org/apache/iceberg/puffin/TestPuffinWriter.java b/core/src/test/java/org/apache/iceberg/puffin/TestPuffinWriter.java index f45229c03049..461e76eb0a2c 100644 --- a/core/src/test/java/org/apache/iceberg/puffin/TestPuffinWriter.java +++ b/core/src/test/java/org/apache/iceberg/puffin/TestPuffinWriter.java @@ -16,15 +16,8 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.puffin; -import java.nio.ByteBuffer; -import org.apache.iceberg.io.InMemoryOutputFile; -import org.apache.iceberg.relocated.com.google.common.collect.ImmutableList; -import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap; -import org.junit.Test; - import static java.nio.charset.StandardCharsets.UTF_8; import static org.apache.iceberg.puffin.PuffinCompressionCodec.NONE; import static org.apache.iceberg.puffin.PuffinCompressionCodec.ZSTD; @@ -33,14 +26,18 @@ import static org.assertj.core.api.Assertions.assertThat; import static org.assertj.core.api.Assertions.assertThatThrownBy; +import java.nio.ByteBuffer; +import org.apache.iceberg.io.InMemoryOutputFile; +import org.apache.iceberg.relocated.com.google.common.collect.ImmutableList; +import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap; +import org.junit.Test; + public class TestPuffinWriter { @Test public void testEmptyFooterCompressed() { InMemoryOutputFile outputFile = new InMemoryOutputFile(); - PuffinWriter writer = Puffin.write(outputFile) - .compressFooter() - .build(); + PuffinWriter writer = Puffin.write(outputFile).compressFooter().build(); assertThatThrownBy(writer::footerSize) .isInstanceOf(IllegalStateException.class) .hasMessage("Footer not written yet"); @@ -55,8 +52,7 @@ public void testEmptyFooterCompressed() { @Test public void testEmptyFooterUncompressed() throws Exception { InMemoryOutputFile outputFile = new InMemoryOutputFile(); - PuffinWriter writer = Puffin.write(outputFile) - .build(); + PuffinWriter writer = Puffin.write(outputFile).build(); assertThatThrownBy(writer::footerSize) .isInstanceOf(IllegalStateException.class) .hasMessage("Footer not written yet"); @@ -73,8 +69,7 @@ public void testEmptyFooterUncompressed() throws Exception { @Test public void testImplicitFinish() throws Exception { InMemoryOutputFile outputFile = new InMemoryOutputFile(); - PuffinWriter writer = Puffin.write(outputFile) - .build(); + PuffinWriter writer = Puffin.write(outputFile).build(); writer.close(); assertThat(outputFile.toByteArray()) .isEqualTo(readTestResource("v1/empty-puffin-uncompressed.bin")); @@ -91,20 +86,33 @@ public void testWriteMetricDataCompressedZstd() throws Exception { testWriteMetric(ZSTD, "v1/sample-metric-data-compressed-zstd.bin"); } - 
private void testWriteMetric(PuffinCompressionCodec compression, String expectedResource) throws Exception { + private void testWriteMetric(PuffinCompressionCodec compression, String expectedResource) + throws Exception { InMemoryOutputFile outputFile = new InMemoryOutputFile(); - try (PuffinWriter writer = Puffin.write(outputFile) - .createdBy("Test 1234") - .build()) { - writer.add(new Blob("some-blob", ImmutableList.of(1), 2, 1, ByteBuffer.wrap("abcdefghi".getBytes(UTF_8)), - compression, ImmutableMap.of())); + try (PuffinWriter writer = Puffin.write(outputFile).createdBy("Test 1234").build()) { + writer.add( + new Blob( + "some-blob", + ImmutableList.of(1), + 2, + 1, + ByteBuffer.wrap("abcdefghi".getBytes(UTF_8)), + compression, + ImmutableMap.of())); // "xxx"s are stripped away by data offsets byte[] bytes = - "xxx some blob \u0000 binary data 🤯 that is not very very very very very very long, is it? xxx".getBytes( - UTF_8); - writer.add(new Blob("some-other-blob", ImmutableList.of(2), 2, 1, ByteBuffer.wrap(bytes, 4, bytes.length - 8), - compression, ImmutableMap.of())); + "xxx some blob \u0000 binary data 🤯 that is not very very very very very very long, is it? xxx" + .getBytes(UTF_8); + writer.add( + new Blob( + "some-other-blob", + ImmutableList.of(2), + 2, + 1, + ByteBuffer.wrap(bytes, 4, bytes.length - 8), + compression, + ImmutableMap.of())); assertThat(writer.writtenBlobsMetadata()).hasSize(2); BlobMetadata firstMetadata = writer.writtenBlobsMetadata().get(0); @@ -118,7 +126,6 @@ private void testWriteMetric(PuffinCompressionCodec compression, String expected } byte[] expected = readTestResource(expectedResource); - assertThat(outputFile.toByteArray()) - .isEqualTo(expected); + assertThat(outputFile.toByteArray()).isEqualTo(expected); } } diff --git a/core/src/test/java/org/apache/iceberg/rest/HttpMethod.java b/core/src/test/java/org/apache/iceberg/rest/HttpMethod.java index cd659fdde860..a6d08b0c9d22 100644 --- a/core/src/test/java/org/apache/iceberg/rest/HttpMethod.java +++ b/core/src/test/java/org/apache/iceberg/rest/HttpMethod.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.rest; public enum HttpMethod { diff --git a/core/src/test/java/org/apache/iceberg/rest/RESTCatalogAdapter.java b/core/src/test/java/org/apache/iceberg/rest/RESTCatalogAdapter.java index 5837129f9a79..d5af4d913b03 100644 --- a/core/src/test/java/org/apache/iceberg/rest/RESTCatalogAdapter.java +++ b/core/src/test/java/org/apache/iceberg/rest/RESTCatalogAdapter.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.rest; import java.io.IOException; @@ -51,35 +50,34 @@ import org.apache.iceberg.rest.responses.OAuthTokenResponse; import org.apache.iceberg.util.Pair; -/** - * Adaptor class to translate REST requests into {@link Catalog} API calls. - */ +/** Adaptor class to translate REST requests into {@link Catalog} API calls. */ public class RESTCatalogAdapter implements RESTClient { private static final Splitter SLASH = Splitter.on('/'); - private static final Map, Integer> EXCEPTION_ERROR_CODES = ImmutableMap - ., Integer>builder() - .put(IllegalArgumentException.class, 400) - .put(ValidationException.class, 400) - .put(NamespaceNotEmptyException.class, 400) // TODO: should this be more specific? 
- .put(NotAuthorizedException.class, 401) - .put(ForbiddenException.class, 403) - .put(NoSuchNamespaceException.class, 404) - .put(NoSuchTableException.class, 404) - .put(NoSuchIcebergTableException.class, 404) - .put(UnsupportedOperationException.class, 406) - .put(AlreadyExistsException.class, 409) - .put(CommitFailedException.class, 409) - .put(UnprocessableEntityException.class, 422) - .put(CommitStateUnknownException.class, 500) - .build(); + private static final Map, Integer> EXCEPTION_ERROR_CODES = + ImmutableMap., Integer>builder() + .put(IllegalArgumentException.class, 400) + .put(ValidationException.class, 400) + .put(NamespaceNotEmptyException.class, 400) // TODO: should this be more specific? + .put(NotAuthorizedException.class, 401) + .put(ForbiddenException.class, 403) + .put(NoSuchNamespaceException.class, 404) + .put(NoSuchTableException.class, 404) + .put(NoSuchIcebergTableException.class, 404) + .put(UnsupportedOperationException.class, 406) + .put(AlreadyExistsException.class, 409) + .put(CommitFailedException.class, 409) + .put(UnprocessableEntityException.class, 422) + .put(CommitStateUnknownException.class, 500) + .build(); private final Catalog catalog; private final SupportsNamespaces asNamespaceCatalog; public RESTCatalogAdapter(Catalog catalog) { this.catalog = catalog; - this.asNamespaceCatalog = catalog instanceof SupportsNamespaces ? (SupportsNamespaces) catalog : null; + this.asNamespaceCatalog = + catalog instanceof SupportsNamespaces ? (SupportsNamespaces) catalog : null; } enum HTTPMethod { @@ -131,9 +129,14 @@ private enum Route { } private boolean matches(HTTPMethod requestMethod, List requestPath) { - return method == requestMethod && requriedLength == requestPath.size() && - requirements.entrySet().stream().allMatch( - requirement -> requirement.getValue().equalsIgnoreCase(requestPath.get(requirement.getKey()))); + return method == requestMethod + && requriedLength == requestPath.size() + && requirements.entrySet().stream() + .allMatch( + requirement -> + requirement + .getValue() + .equalsIgnoreCase(requestPath.get(requirement.getKey()))); } private Map variables(List requestPath) { @@ -154,37 +157,42 @@ public static Pair> from(HTTPMethod method, String pa } } - public T handleRequest(Route route, Map vars, - Object body, Class responseType) { + public T handleRequest( + Route route, Map vars, Object body, Class responseType) { switch (route) { - case TOKENS: { - @SuppressWarnings("unchecked") - Map request = (Map) castRequest(Map.class, body); - String grantType = request.get("grant_type"); - switch (grantType) { - case "client_credentials": - return castResponse(responseType, OAuthTokenResponse.builder() - .withToken("client-credentials-token:sub=" + request.get("client_id")) - .withIssuedTokenType("urn:ietf:params:oauth:token-type:access_token") - .withTokenType("Bearer") - .build()); - - case "urn:ietf:params:oauth:grant-type:token-exchange": - String actor = request.get("actor_token"); - String token = String.format( - "token-exchange-token:sub=%s%s", - request.get("subject_token"), - actor != null ? 
",act=" + actor : ""); - return castResponse(responseType, OAuthTokenResponse.builder() - .withToken(token) - .withIssuedTokenType("urn:ietf:params:oauth:token-type:access_token") - .withTokenType("Bearer") - .build()); - - default: - throw new UnsupportedOperationException("Unsupported grant_type: " + grantType); + case TOKENS: + { + @SuppressWarnings("unchecked") + Map request = (Map) castRequest(Map.class, body); + String grantType = request.get("grant_type"); + switch (grantType) { + case "client_credentials": + return castResponse( + responseType, + OAuthTokenResponse.builder() + .withToken("client-credentials-token:sub=" + request.get("client_id")) + .withIssuedTokenType("urn:ietf:params:oauth:token-type:access_token") + .withTokenType("Bearer") + .build()); + + case "urn:ietf:params:oauth:grant-type:token-exchange": + String actor = request.get("actor_token"); + String token = + String.format( + "token-exchange-token:sub=%s%s", + request.get("subject_token"), actor != null ? ",act=" + actor : ""); + return castResponse( + responseType, + OAuthTokenResponse.builder() + .withToken(token) + .withIssuedTokenType("urn:ietf:params:oauth:token-type:access_token") + .withTokenType("Bearer") + .build()); + + default: + throw new UnsupportedOperationException("Unsupported grant_type: " + grantType); + } } - } case CONFIG: return castResponse(responseType, ConfigResponse.builder().build()); @@ -193,8 +201,11 @@ public T handleRequest(Route route, Map if (asNamespaceCatalog != null) { Namespace ns; if (vars.containsKey("parent")) { - ns = Namespace.of( - RESTUtil.NAMESPACE_SPLITTER.splitToStream(vars.get("parent")).toArray(String[]::new)); + ns = + Namespace.of( + RESTUtil.NAMESPACE_SPLITTER + .splitToStream(vars.get("parent")) + .toArray(String[]::new)); } else { ns = Namespace.empty(); } @@ -206,14 +217,16 @@ public T handleRequest(Route route, Map case CREATE_NAMESPACE: if (asNamespaceCatalog != null) { CreateNamespaceRequest request = castRequest(CreateNamespaceRequest.class, body); - return castResponse(responseType, CatalogHandlers.createNamespace(asNamespaceCatalog, request)); + return castResponse( + responseType, CatalogHandlers.createNamespace(asNamespaceCatalog, request)); } break; case LOAD_NAMESPACE: if (asNamespaceCatalog != null) { Namespace namespace = namespaceFromPathVars(vars); - return castResponse(responseType, CatalogHandlers.loadNamespace(asNamespaceCatalog, namespace)); + return castResponse( + responseType, CatalogHandlers.loadNamespace(asNamespaceCatalog, namespace)); } break; @@ -227,49 +240,59 @@ public T handleRequest(Route route, Map case UPDATE_NAMESPACE: if (asNamespaceCatalog != null) { Namespace namespace = namespaceFromPathVars(vars); - UpdateNamespacePropertiesRequest request = castRequest(UpdateNamespacePropertiesRequest.class, body); - return castResponse(responseType, + UpdateNamespacePropertiesRequest request = + castRequest(UpdateNamespacePropertiesRequest.class, body); + return castResponse( + responseType, CatalogHandlers.updateNamespaceProperties(asNamespaceCatalog, namespace, request)); } break; - case LIST_TABLES: { - Namespace namespace = namespaceFromPathVars(vars); - return castResponse(responseType, CatalogHandlers.listTables(catalog, namespace)); - } + case LIST_TABLES: + { + Namespace namespace = namespaceFromPathVars(vars); + return castResponse(responseType, CatalogHandlers.listTables(catalog, namespace)); + } - case CREATE_TABLE: { - Namespace namespace = namespaceFromPathVars(vars); - CreateTableRequest request = 
castRequest(CreateTableRequest.class, body); - request.validate(); - if (request.stageCreate()) { - return castResponse(responseType, CatalogHandlers.stageTableCreate(catalog, namespace, request)); - } else { - return castResponse(responseType, CatalogHandlers.createTable(catalog, namespace, request)); + case CREATE_TABLE: + { + Namespace namespace = namespaceFromPathVars(vars); + CreateTableRequest request = castRequest(CreateTableRequest.class, body); + request.validate(); + if (request.stageCreate()) { + return castResponse( + responseType, CatalogHandlers.stageTableCreate(catalog, namespace, request)); + } else { + return castResponse( + responseType, CatalogHandlers.createTable(catalog, namespace, request)); + } } - } - case DROP_TABLE: { - CatalogHandlers.dropTable(catalog, identFromPathVars(vars)); - return null; - } + case DROP_TABLE: + { + CatalogHandlers.dropTable(catalog, identFromPathVars(vars)); + return null; + } - case LOAD_TABLE: { - TableIdentifier ident = identFromPathVars(vars); - return castResponse(responseType, CatalogHandlers.loadTable(catalog, ident)); - } + case LOAD_TABLE: + { + TableIdentifier ident = identFromPathVars(vars); + return castResponse(responseType, CatalogHandlers.loadTable(catalog, ident)); + } - case UPDATE_TABLE: { - TableIdentifier ident = identFromPathVars(vars); - UpdateTableRequest request = castRequest(UpdateTableRequest.class, body); - return castResponse(responseType, CatalogHandlers.updateTable(catalog, ident, request)); - } + case UPDATE_TABLE: + { + TableIdentifier ident = identFromPathVars(vars); + UpdateTableRequest request = castRequest(UpdateTableRequest.class, body); + return castResponse(responseType, CatalogHandlers.updateTable(catalog, ident, request)); + } - case RENAME_TABLE: { - RenameTableRequest request = castRequest(RenameTableRequest.class, body); - CatalogHandlers.renameTable(catalog, request); - return null; - } + case RENAME_TABLE: + { + RenameTableRequest request = castRequest(RenameTableRequest.class, body); + CatalogHandlers.renameTable(catalog, request); + return null; + } default: } @@ -277,9 +300,14 @@ public T handleRequest(Route route, Map return null; } - public T execute(HTTPMethod method, String path, Map queryParams, - Object body, Class responseType, Map headers, - Consumer errorHandler) { + public T execute( + HTTPMethod method, + String path, + Map queryParams, + Object body, + Class responseType, + Map headers, + Consumer errorHandler) { ErrorResponse.Builder errorBuilder = ErrorResponse.builder(); Pair> routeAndVars = Route.from(method, path); if (routeAndVars != null) { @@ -311,20 +339,31 @@ public T execute(HTTPMethod method, String path, Map T delete(String path, Class responseType, Map headers, - Consumer errorHandler) { + public T delete( + String path, + Class responseType, + Map headers, + Consumer errorHandler) { return execute(HTTPMethod.DELETE, path, null, null, responseType, headers, errorHandler); } @Override - public T post(String path, RESTRequest body, Class responseType, - Map headers, Consumer errorHandler) { + public T post( + String path, + RESTRequest body, + Class responseType, + Map headers, + Consumer errorHandler) { return execute(HTTPMethod.POST, path, null, body, responseType, headers, errorHandler); } @Override - public T get(String path, Map queryParams, Class responseType, - Map headers, Consumer errorHandler) { + public T get( + String path, + Map queryParams, + Class responseType, + Map headers, + Consumer errorHandler) { return execute(HTTPMethod.GET, path, queryParams, 
null, responseType, headers, errorHandler); } @@ -334,8 +373,12 @@ public void head(String path, Map headers, Consumer T postForm(String path, Map formData, Class responseType, - Map headers, Consumer errorHandler) { + public T postForm( + String path, + Map formData, + Class responseType, + Map headers, + Consumer errorHandler) { return execute(HTTPMethod.POST, path, null, formData, responseType, headers, errorHandler); } @@ -348,7 +391,8 @@ public void close() throws IOException { private static class BadResponseType extends RuntimeException { private BadResponseType(Class responseType, Object response) { - super(String.format("Invalid response object, not a %s: %s", responseType.getName(), response)); + super( + String.format("Invalid response object, not a %s: %s", responseType.getName(), response)); } } @@ -374,7 +418,8 @@ public static T castResponse(Class responseType, Obj throw new BadResponseType(responseType, response); } - public static void configureResponseFromException(Exception exc, ErrorResponse.Builder errorBuilder) { + public static void configureResponseFromException( + Exception exc, ErrorResponse.Builder errorBuilder) { errorBuilder .responseCode(EXCEPTION_ERROR_CODES.getOrDefault(exc.getClass(), 500)) .withType(exc.getClass().getSimpleName()) @@ -387,6 +432,7 @@ private static Namespace namespaceFromPathVars(Map pathVars) { } private static TableIdentifier identFromPathVars(Map pathVars) { - return TableIdentifier.of(namespaceFromPathVars(pathVars), RESTUtil.decodeString(pathVars.get("table"))); + return TableIdentifier.of( + namespaceFromPathVars(pathVars), RESTUtil.decodeString(pathVars.get("table"))); } } diff --git a/core/src/test/java/org/apache/iceberg/rest/RequestResponseTestBase.java b/core/src/test/java/org/apache/iceberg/rest/RequestResponseTestBase.java index 4a0abaecffab..cf5879f8ea29 100644 --- a/core/src/test/java/org/apache/iceberg/rest/RequestResponseTestBase.java +++ b/core/src/test/java/org/apache/iceberg/rest/RequestResponseTestBase.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.rest; import com.fasterxml.jackson.core.JsonProcessingException; @@ -37,45 +36,38 @@ public static ObjectMapper mapper() { return MAPPER; } - /** - * Return a list of all the fields used in this class, as defined in the spec. - */ + /** Return a list of all the fields used in this class, as defined in the spec. */ public abstract String[] allFieldsFromSpec(); - /** - * Return a valid instance of the request / response object. Used when validating fields. - */ + /** Return a valid instance of the request / response object. Used when validating fields. */ public abstract T createExampleInstance(); /** * Compare if two request / response objects are equivalent. - *

- * This helper method is used as opposed to implementing equals so that fields that deserialize into - * null can be compared to the fields of instances created via the corresponding Builder, which typically - * have a default value (such as an empty collection) for those fields. + * + *

This helper method is used as opposed to implementing equals so that fields that deserialize + * into null can be compared to the fields of instances created via the corresponding Builder, + * which typically have a default value (such as an empty collection) for those fields. + * * @param actual - request / response object to validate - * @param expected - the corresponding object to check that {@code actual} is semantically equivalent to. + * @param expected - the corresponding object to check that {@code actual} is semantically + * equivalent to. */ public abstract void assertEquals(T actual, T expected); - /** - * Parse and return the input json into a value of type T. - */ + /** Parse and return the input json into a value of type T. */ public abstract T deserialize(String json) throws JsonProcessingException; - /** - * Serialize T to a String. - */ + /** Serialize T to a String. */ public String serialize(T object) throws JsonProcessingException { return MAPPER.writeValueAsString(object); } /** - * This test ensures that the serialized JSON of each class has only fields that are expected from the spec. - * Only top level fields are checked presently, as nested fields generally come from some existing type that is - * tested elsewhere. - * The fields from the spec should be populated into each subclass's - * {@link RequestResponseTestBase#allFieldsFromSpec()}. + * This test ensures that the serialized JSON of each class has only fields that are expected from + * the spec. Only top level fields are checked presently, as nested fields generally come from + * some existing type that is tested elsewhere. The fields from the spec should be populated into + * each subclass's {@link RequestResponseTestBase#allFieldsFromSpec()}. */ @Test public void testHasOnlyKnownFields() { @@ -99,7 +91,8 @@ public void testHasOnlyKnownFields() { * Test that the input JSON can be parsed into an equivalent object as {@code expected}, and then * re-serialized into the same JSON. */ - protected void assertRoundTripSerializesEquallyFrom(String json, T expected) throws JsonProcessingException { + protected void assertRoundTripSerializesEquallyFrom(String json, T expected) + throws JsonProcessingException { // Check that the JSON deserializes into the expected value; T actual = deserialize(json); assertEquals(actual, expected); diff --git a/core/src/test/java/org/apache/iceberg/rest/TestHTTPClient.java b/core/src/test/java/org/apache/iceberg/rest/TestHTTPClient.java index dcd28002ce60..6cf9fed73f32 100644 --- a/core/src/test/java/org/apache/iceberg/rest/TestHTTPClient.java +++ b/core/src/test/java/org/apache/iceberg/rest/TestHTTPClient.java @@ -16,9 +16,12 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.rest; +import static org.mockserver.integration.ClientAndServer.startClientAndServer; +import static org.mockserver.model.HttpRequest.request; +import static org.mockserver.model.HttpResponse.response; + import com.fasterxml.jackson.core.JsonProcessingException; import com.fasterxml.jackson.databind.ObjectMapper; import java.io.IOException; @@ -41,12 +44,9 @@ import org.mockserver.model.HttpRequest; import org.mockserver.model.HttpResponse; -import static org.mockserver.integration.ClientAndServer.startClientAndServer; -import static org.mockserver.model.HttpRequest.request; -import static org.mockserver.model.HttpResponse.response; - -/*** - * Exercises the RESTClient interface, specifically over a mocked-server using the actual HttpRESTClient code. +/** + * * Exercises the RESTClient interface, specifically over a mocked-server using the actual + * HttpRESTClient code. */ public class TestHTTPClient { @@ -118,47 +118,59 @@ public static void testHttpMethodOnSuccess(HttpMethod method) throws JsonProcess Item body = new Item(0L, "hank"); int statusCode = 200; AtomicInteger errorCounter = new AtomicInteger(0); - Consumer onError = (error) -> { - errorCounter.incrementAndGet(); - throw new RuntimeException("Failure response"); - }; + Consumer onError = + (error) -> { + errorCounter.incrementAndGet(); + throw new RuntimeException("Failure response"); + }; String path = addRequestTestCaseAndGetPath(method, body, statusCode); Item successResponse = doExecuteRequest(method, path, body, onError); if (method.usesRequestBody()) { - Assert.assertEquals("On a successful " + method + ", the correct response body should be returned", - successResponse, body); + Assert.assertEquals( + "On a successful " + method + ", the correct response body should be returned", + successResponse, + body); } - Assert.assertEquals("On a successful " + method + ", the error handler should not be called", - 0, errorCounter.get()); + Assert.assertEquals( + "On a successful " + method + ", the error handler should not be called", + 0, + errorCounter.get()); } public static void testHttpMethodOnFailure(HttpMethod method) throws JsonProcessingException { Item body = new Item(0L, "hank"); int statusCode = 404; AtomicInteger errorCounter = new AtomicInteger(0); - Consumer onError = error -> { - errorCounter.incrementAndGet(); - throw new RuntimeException( - String.format("Called error handler for method %s due to status code: %d", method, statusCode)); - }; + Consumer onError = + error -> { + errorCounter.incrementAndGet(); + throw new RuntimeException( + String.format( + "Called error handler for method %s due to status code: %d", method, statusCode)); + }; String path = addRequestTestCaseAndGetPath(method, body, statusCode); AssertHelpers.assertThrows( "A response indicating a failed request should throw", RuntimeException.class, - String.format("Called error handler for method %s due to status code: %d", method, statusCode), + String.format( + "Called error handler for method %s due to status code: %d", method, statusCode), () -> doExecuteRequest(method, path, body, onError)); - Assert.assertEquals("On an unsuccessful " + method + ", the error handler should be called", - 1, errorCounter.get()); + Assert.assertEquals( + "On an unsuccessful " + method + ", the error handler should be called", + 1, + errorCounter.get()); } - // Adds a request that the mock-server can match against, based on the method, path, body, and headers. 
- // Return the path generated for the test case, so that the client can call that path to exercise it. + // Adds a request that the mock-server can match against, based on the method, path, body, and + // headers. + // Return the path generated for the test case, so that the client can call that path to exercise + // it. private static String addRequestTestCaseAndGetPath(HttpMethod method, Item body, int statusCode) throws JsonProcessingException { @@ -170,11 +182,13 @@ private static String addRequestTestCaseAndGetPath(HttpMethod method, Item body, // Build the expected request String asJson = body != null ? MAPPER.writeValueAsString(body) : null; - HttpRequest mockRequest = request("/" + path) - .withMethod(method.name().toUpperCase(Locale.ROOT)) - .withHeader("Authorization", "Bearer " + BEARER_AUTH_TOKEN) - .withHeader(HTTPClientFactory.CLIENT_VERSION_HEADER, icebergBuildFullVersion) - .withHeader(HTTPClientFactory.CLIENT_GIT_COMMIT_SHORT_HEADER, icebergBuildGitCommitShort); + HttpRequest mockRequest = + request("/" + path) + .withMethod(method.name().toUpperCase(Locale.ROOT)) + .withHeader("Authorization", "Bearer " + BEARER_AUTH_TOKEN) + .withHeader(HTTPClientFactory.CLIENT_VERSION_HEADER, icebergBuildFullVersion) + .withHeader( + HTTPClientFactory.CLIENT_GIT_COMMIT_SHORT_HEADER, icebergBuildGitCommitShort); if (method.usesRequestBody()) { mockRequest = mockRequest.withBody(asJson); @@ -188,20 +202,19 @@ private static String addRequestTestCaseAndGetPath(HttpMethod method, Item body, // Simply return the passed in item in the success case. mockResponse = mockResponse.withBody(asJson); } else { - ErrorResponse response = ErrorResponse.builder() - .responseCode(statusCode).withMessage("Not found").build(); + ErrorResponse response = + ErrorResponse.builder().responseCode(statusCode).withMessage("Not found").build(); mockResponse = mockResponse.withBody(ErrorResponseParser.toJson(response)); } } - mockServer - .when(mockRequest) - .respond(mockResponse); + mockServer.when(mockRequest).respond(mockResponse); return path; } - private static Item doExecuteRequest(HttpMethod method, String path, Item body, Consumer onError) { + private static Item doExecuteRequest( + HttpMethod method, String path, Item body, Consumer onError) { Map headers = ImmutableMap.of("Authorization", "Bearer " + BEARER_AUTH_TOKEN); switch (method) { case POST: @@ -224,8 +237,7 @@ public static class Item implements RESTRequest, RESTResponse { // Required for Jackson deserialization @SuppressWarnings("unused") - public Item() { - } + public Item() {} public Item(Long id, String data) { this.id = id; @@ -233,8 +245,7 @@ public Item(Long id, String data) { } @Override - public void validate() { - } + public void validate() {} @Override public int hashCode() { diff --git a/core/src/test/java/org/apache/iceberg/rest/TestRESTCatalog.java b/core/src/test/java/org/apache/iceberg/rest/TestRESTCatalog.java index 6a681b45efda..2159c1c4ada4 100644 --- a/core/src/test/java/org/apache/iceberg/rest/TestRESTCatalog.java +++ b/core/src/test/java/org/apache/iceberg/rest/TestRESTCatalog.java @@ -16,9 +16,13 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.rest; +import static org.apache.iceberg.types.Types.NestedField.required; +import static org.mockito.ArgumentMatchers.any; +import static org.mockito.ArgumentMatchers.eq; +import static org.mockito.Mockito.times; + import com.fasterxml.jackson.core.JsonProcessingException; import com.fasterxml.jackson.databind.ObjectMapper; import java.io.File; @@ -52,16 +56,10 @@ import org.mockito.Mockito; import org.mockito.stubbing.Answer; -import static org.apache.iceberg.types.Types.NestedField.required; -import static org.mockito.ArgumentMatchers.any; -import static org.mockito.ArgumentMatchers.eq; -import static org.mockito.Mockito.times; - public class TestRESTCatalog extends CatalogTests { private static final ObjectMapper MAPPER = RESTObjectMapper.mapper(); - @TempDir - public Path temp; + @TempDir public Path temp; private RESTCatalog restCatalog; private JdbcCatalog backendCatalog; @@ -73,44 +71,65 @@ public void createCatalog() { this.backendCatalog = new JdbcCatalog(); backendCatalog.setConf(conf); - Map backendCatalogProperties = ImmutableMap.of( - CatalogProperties.WAREHOUSE_LOCATION, warehouse.getAbsolutePath(), - CatalogProperties.URI, "jdbc:sqlite:file::memory:?ic" + UUID.randomUUID().toString().replace("-", ""), - JdbcCatalog.PROPERTY_PREFIX + "username", "user", - JdbcCatalog.PROPERTY_PREFIX + "password", "password"); + Map backendCatalogProperties = + ImmutableMap.of( + CatalogProperties.WAREHOUSE_LOCATION, + warehouse.getAbsolutePath(), + CatalogProperties.URI, + "jdbc:sqlite:file::memory:?ic" + UUID.randomUUID().toString().replace("-", ""), + JdbcCatalog.PROPERTY_PREFIX + "username", + "user", + JdbcCatalog.PROPERTY_PREFIX + "password", + "password"); backendCatalog.initialize("backend", backendCatalogProperties); - Map catalogHeaders = ImmutableMap.of( - "Authorization", "Bearer client-credentials-token:sub=catalog"); - Map contextHeaders = ImmutableMap.of( - "Authorization", "Bearer client-credentials-token:sub=user"); - - RESTCatalogAdapter adaptor = new RESTCatalogAdapter(backendCatalog) { - @Override - public T execute(RESTCatalogAdapter.HTTPMethod method, String path, - Map queryParams, Object body, Class responseType, - Map headers, Consumer errorHandler) { - // this doesn't use a Mockito spy because this is used for catalog tests, which have different method calls - if (!"v1/oauth/tokens".equals(path)) { - if ("v1/config".equals(path)) { - Assertions.assertEquals(catalogHeaders, headers, "Headers did not match for path: " + path); - } else { - Assertions.assertEquals(contextHeaders, headers, "Headers did not match for path: " + path); + Map catalogHeaders = + ImmutableMap.of("Authorization", "Bearer client-credentials-token:sub=catalog"); + Map contextHeaders = + ImmutableMap.of("Authorization", "Bearer client-credentials-token:sub=user"); + + RESTCatalogAdapter adaptor = + new RESTCatalogAdapter(backendCatalog) { + @Override + public T execute( + RESTCatalogAdapter.HTTPMethod method, + String path, + Map queryParams, + Object body, + Class responseType, + Map headers, + Consumer errorHandler) { + // this doesn't use a Mockito spy because this is used for catalog tests, which have + // different method calls + if (!"v1/oauth/tokens".equals(path)) { + if ("v1/config".equals(path)) { + Assertions.assertEquals( + catalogHeaders, headers, "Headers did not match for path: " + path); + } else { + Assertions.assertEquals( + contextHeaders, headers, "Headers did not match for path: " + path); + } + } + Object request = roundTripSerialize(body, 
"request"); + T response = + super.execute( + method, path, queryParams, request, responseType, headers, errorHandler); + T responseAfterSerialization = roundTripSerialize(response, "response"); + return responseAfterSerialization; } - } - Object request = roundTripSerialize(body, "request"); - T response = super.execute(method, path, queryParams, request, responseType, headers, errorHandler); - T responseAfterSerialization = roundTripSerialize(response, "response"); - return responseAfterSerialization; - } - }; + }; - SessionCatalog.SessionContext context = new SessionCatalog.SessionContext( - UUID.randomUUID().toString(), "user", ImmutableMap.of("credential", "user:12345"), ImmutableMap.of()); + SessionCatalog.SessionContext context = + new SessionCatalog.SessionContext( + UUID.randomUUID().toString(), + "user", + ImmutableMap.of("credential", "user:12345"), + ImmutableMap.of()); this.restCatalog = new RESTCatalog(context, (config) -> adaptor); restCatalog.setConf(conf); - restCatalog.initialize("prod", ImmutableMap.of(CatalogProperties.URI, "ignored", "credential", "catalog:12345")); + restCatalog.initialize( + "prod", ImmutableMap.of(CatalogProperties.URI, "ignored", "credential", "catalog:12345")); } @SuppressWarnings("unchecked") @@ -166,46 +185,59 @@ protected boolean supportsNestedNamespaces() { @Test public void testConfigRoute() throws IOException { - RESTClient testClient = new RESTCatalogAdapter(backendCatalog) { - @Override - public T get(String path, Map queryParams, Class responseType, - Map headers, Consumer errorHandler) { - if ("v1/config".equals(path)) { - return castResponse(responseType, ConfigResponse - .builder() - .withDefaults(ImmutableMap.of(CatalogProperties.CLIENT_POOL_SIZE, "1")) - .withOverrides(ImmutableMap.of(CatalogProperties.CACHE_ENABLED, "false")) - .build()); - } - return super.get(path, queryParams, responseType, headers, errorHandler); - } - }; + RESTClient testClient = + new RESTCatalogAdapter(backendCatalog) { + @Override + public T get( + String path, + Map queryParams, + Class responseType, + Map headers, + Consumer errorHandler) { + if ("v1/config".equals(path)) { + return castResponse( + responseType, + ConfigResponse.builder() + .withDefaults(ImmutableMap.of(CatalogProperties.CLIENT_POOL_SIZE, "1")) + .withOverrides(ImmutableMap.of(CatalogProperties.CACHE_ENABLED, "false")) + .build()); + } + return super.get(path, queryParams, responseType, headers, errorHandler); + } + }; RESTCatalog restCat = new RESTCatalog((config) -> testClient); - Map initialConfig = ImmutableMap.of( - CatalogProperties.URI, "http://localhost:8080", - CatalogProperties.CACHE_ENABLED, "true"); + Map initialConfig = + ImmutableMap.of( + CatalogProperties.URI, "http://localhost:8080", + CatalogProperties.CACHE_ENABLED, "true"); restCat.setConf(new Configuration()); restCat.initialize("prod", initialConfig); - Assert.assertEquals("Catalog properties after initialize should use the server's override properties", - "false", restCat.properties().get(CatalogProperties.CACHE_ENABLED)); + Assert.assertEquals( + "Catalog properties after initialize should use the server's override properties", + "false", + restCat.properties().get(CatalogProperties.CACHE_ENABLED)); - Assert.assertEquals("Catalog after initialize should use the server's default properties if not specified", - "1", restCat.properties().get(CatalogProperties.CLIENT_POOL_SIZE)); + Assert.assertEquals( + "Catalog after initialize should use the server's default properties if not specified", + "1", + 
restCat.properties().get(CatalogProperties.CLIENT_POOL_SIZE)); restCat.close(); } @Test public void testInitializeWithBadArguments() throws IOException { RESTCatalog restCat = new RESTCatalog(); - AssertHelpers.assertThrows("Configuration passed to initialize cannot be null", + AssertHelpers.assertThrows( + "Configuration passed to initialize cannot be null", IllegalArgumentException.class, "Invalid configuration: null", () -> restCat.initialize("prod", null)); - AssertHelpers.assertThrows("Configuration passed to initialize must have uri", + AssertHelpers.assertThrows( + "Configuration passed to initialize must have uri", IllegalArgumentException.class, "REST Catalog server URI is required", () -> restCat.initialize("prod", ImmutableMap.of())); @@ -215,214 +247,279 @@ public void testInitializeWithBadArguments() throws IOException { @Test public void testCatalogBasicBearerToken() { - Map catalogHeaders = ImmutableMap.of( - "Authorization", "Bearer bearer-token"); + Map catalogHeaders = ImmutableMap.of("Authorization", "Bearer bearer-token"); RESTCatalogAdapter adapter = Mockito.spy(new RESTCatalogAdapter(backendCatalog)); - RESTCatalog catalog = new RESTCatalog(SessionCatalog.SessionContext.createEmpty(), (config) -> adapter); - catalog.initialize("prod", ImmutableMap.of( - CatalogProperties.URI, "ignored", - "token", "bearer-token")); + RESTCatalog catalog = + new RESTCatalog(SessionCatalog.SessionContext.createEmpty(), (config) -> adapter); + catalog.initialize( + "prod", ImmutableMap.of(CatalogProperties.URI, "ignored", "token", "bearer-token")); Assertions.assertFalse(catalog.tableExists(TableIdentifier.of("ns", "table"))); // the bearer token should be used for all interactions - Mockito.verify(adapter).execute( - eq(HTTPMethod.GET), eq("v1/config"), any(), any(), eq(ConfigResponse.class), eq(catalogHeaders), any()); - Mockito.verify(adapter).execute( - eq(HTTPMethod.GET), - eq("v1/namespaces/ns/tables/table"), - any(), - any(), - eq(LoadTableResponse.class), - eq(catalogHeaders), - any()); + Mockito.verify(adapter) + .execute( + eq(HTTPMethod.GET), + eq("v1/config"), + any(), + any(), + eq(ConfigResponse.class), + eq(catalogHeaders), + any()); + Mockito.verify(adapter) + .execute( + eq(HTTPMethod.GET), + eq("v1/namespaces/ns/tables/table"), + any(), + any(), + eq(LoadTableResponse.class), + eq(catalogHeaders), + any()); } @Test public void testCatalogCredential() { Map emptyHeaders = ImmutableMap.of(); - Map catalogHeaders = ImmutableMap.of( - "Authorization", "Bearer client-credentials-token:sub=catalog"); + Map catalogHeaders = + ImmutableMap.of("Authorization", "Bearer client-credentials-token:sub=catalog"); RESTCatalogAdapter adapter = Mockito.spy(new RESTCatalogAdapter(backendCatalog)); - RESTCatalog catalog = new RESTCatalog(SessionCatalog.SessionContext.createEmpty(), (config) -> adapter); - catalog.initialize("prod", ImmutableMap.of( - CatalogProperties.URI, "ignored", - "credential", "catalog:secret")); + RESTCatalog catalog = + new RESTCatalog(SessionCatalog.SessionContext.createEmpty(), (config) -> adapter); + catalog.initialize( + "prod", ImmutableMap.of(CatalogProperties.URI, "ignored", "credential", "catalog:secret")); Assertions.assertFalse(catalog.tableExists(TableIdentifier.of("ns", "table"))); // no token or credential for catalog token exchange - Mockito.verify(adapter).execute( - eq(HTTPMethod.POST), - eq("v1/oauth/tokens"), - any(), - any(), - eq(OAuthTokenResponse.class), - eq(emptyHeaders), - any()); + Mockito.verify(adapter) + .execute( + eq(HTTPMethod.POST), 
+ eq("v1/oauth/tokens"), + any(), + any(), + eq(OAuthTokenResponse.class), + eq(emptyHeaders), + any()); // no token or credential for config - Mockito.verify(adapter).execute( - eq(HTTPMethod.GET), eq("v1/config"), any(), any(), eq(ConfigResponse.class), eq(catalogHeaders), any()); + Mockito.verify(adapter) + .execute( + eq(HTTPMethod.GET), + eq("v1/config"), + any(), + any(), + eq(ConfigResponse.class), + eq(catalogHeaders), + any()); // use the catalog token for all interactions - Mockito.verify(adapter).execute( - eq(HTTPMethod.GET), - eq("v1/namespaces/ns/tables/table"), - any(), - any(), - eq(LoadTableResponse.class), - eq(catalogHeaders), - any()); + Mockito.verify(adapter) + .execute( + eq(HTTPMethod.GET), + eq("v1/namespaces/ns/tables/table"), + any(), + any(), + eq(LoadTableResponse.class), + eq(catalogHeaders), + any()); } @Test public void testCatalogBearerTokenWithClientCredential() { - Map contextHeaders = ImmutableMap.of( - "Authorization", "Bearer client-credentials-token:sub=user"); - Map catalogHeaders = ImmutableMap.of( - "Authorization", "Bearer bearer-token"); + Map contextHeaders = + ImmutableMap.of("Authorization", "Bearer client-credentials-token:sub=user"); + Map catalogHeaders = ImmutableMap.of("Authorization", "Bearer bearer-token"); RESTCatalogAdapter adapter = Mockito.spy(new RESTCatalogAdapter(backendCatalog)); - SessionCatalog.SessionContext context = new SessionCatalog.SessionContext( - UUID.randomUUID().toString(), "user", ImmutableMap.of("credential", "user:secret"), ImmutableMap.of()); + SessionCatalog.SessionContext context = + new SessionCatalog.SessionContext( + UUID.randomUUID().toString(), + "user", + ImmutableMap.of("credential", "user:secret"), + ImmutableMap.of()); RESTCatalog catalog = new RESTCatalog(context, (config) -> adapter); - catalog.initialize("prod", ImmutableMap.of( - CatalogProperties.URI, "ignored", - "token", "bearer-token")); + catalog.initialize( + "prod", ImmutableMap.of(CatalogProperties.URI, "ignored", "token", "bearer-token")); Assertions.assertFalse(catalog.tableExists(TableIdentifier.of("ns", "table"))); // use the bearer token for config - Mockito.verify(adapter).execute( - eq(HTTPMethod.GET), eq("v1/config"), any(), any(), eq(ConfigResponse.class), eq(catalogHeaders), any()); + Mockito.verify(adapter) + .execute( + eq(HTTPMethod.GET), + eq("v1/config"), + any(), + any(), + eq(ConfigResponse.class), + eq(catalogHeaders), + any()); // use the bearer token to fetch the context token - Mockito.verify(adapter).execute( - eq(HTTPMethod.POST), - eq("v1/oauth/tokens"), - any(), - any(), - eq(OAuthTokenResponse.class), - eq(catalogHeaders), - any()); + Mockito.verify(adapter) + .execute( + eq(HTTPMethod.POST), + eq("v1/oauth/tokens"), + any(), + any(), + eq(OAuthTokenResponse.class), + eq(catalogHeaders), + any()); // use the context token for table load - Mockito.verify(adapter).execute( - eq(HTTPMethod.GET), - eq("v1/namespaces/ns/tables/table"), - any(), - any(), - eq(LoadTableResponse.class), - eq(contextHeaders), - any()); + Mockito.verify(adapter) + .execute( + eq(HTTPMethod.GET), + eq("v1/namespaces/ns/tables/table"), + any(), + any(), + eq(LoadTableResponse.class), + eq(contextHeaders), + any()); } @Test public void testCatalogCredentialWithClientCredential() { Map emptyHeaders = ImmutableMap.of(); - Map contextHeaders = ImmutableMap.of( - "Authorization", "Bearer client-credentials-token:sub=user"); - Map catalogHeaders = ImmutableMap.of( - "Authorization", "Bearer client-credentials-token:sub=catalog"); + Map 
contextHeaders = + ImmutableMap.of("Authorization", "Bearer client-credentials-token:sub=user"); + Map catalogHeaders = + ImmutableMap.of("Authorization", "Bearer client-credentials-token:sub=catalog"); RESTCatalogAdapter adapter = Mockito.spy(new RESTCatalogAdapter(backendCatalog)); - SessionCatalog.SessionContext context = new SessionCatalog.SessionContext( - UUID.randomUUID().toString(), "user", ImmutableMap.of("credential", "user:secret"), ImmutableMap.of()); + SessionCatalog.SessionContext context = + new SessionCatalog.SessionContext( + UUID.randomUUID().toString(), + "user", + ImmutableMap.of("credential", "user:secret"), + ImmutableMap.of()); RESTCatalog catalog = new RESTCatalog(context, (config) -> adapter); - catalog.initialize("prod", ImmutableMap.of( - CatalogProperties.URI, "ignored", - "credential", "catalog:secret")); + catalog.initialize( + "prod", ImmutableMap.of(CatalogProperties.URI, "ignored", "credential", "catalog:secret")); Assertions.assertFalse(catalog.tableExists(TableIdentifier.of("ns", "table"))); // call client credentials with no initial auth - Mockito.verify(adapter).execute( - eq(HTTPMethod.POST), - eq("v1/oauth/tokens"), - any(), - any(), - eq(OAuthTokenResponse.class), - eq(emptyHeaders), - any()); + Mockito.verify(adapter) + .execute( + eq(HTTPMethod.POST), + eq("v1/oauth/tokens"), + any(), + any(), + eq(OAuthTokenResponse.class), + eq(emptyHeaders), + any()); // use the client credential token for config - Mockito.verify(adapter).execute( - eq(HTTPMethod.GET), eq("v1/config"), any(), any(), eq(ConfigResponse.class), eq(catalogHeaders), any()); + Mockito.verify(adapter) + .execute( + eq(HTTPMethod.GET), + eq("v1/config"), + any(), + any(), + eq(ConfigResponse.class), + eq(catalogHeaders), + any()); // use the client credential to fetch the context token - Mockito.verify(adapter).execute( - eq(HTTPMethod.POST), - eq("v1/oauth/tokens"), - any(), - any(), - eq(OAuthTokenResponse.class), - eq(catalogHeaders), - any()); + Mockito.verify(adapter) + .execute( + eq(HTTPMethod.POST), + eq("v1/oauth/tokens"), + any(), + any(), + eq(OAuthTokenResponse.class), + eq(catalogHeaders), + any()); // use the context token for table load - Mockito.verify(adapter).execute( - eq(HTTPMethod.GET), - eq("v1/namespaces/ns/tables/table"), - any(), - any(), - eq(LoadTableResponse.class), - eq(contextHeaders), - any()); + Mockito.verify(adapter) + .execute( + eq(HTTPMethod.GET), + eq("v1/namespaces/ns/tables/table"), + any(), + any(), + eq(LoadTableResponse.class), + eq(contextHeaders), + any()); } @Test public void testCatalogBearerTokenAndCredentialWithClientCredential() { - Map contextHeaders = ImmutableMap.of( - "Authorization", "Bearer client-credentials-token:sub=user"); - Map initHeaders = ImmutableMap.of( - "Authorization", "Bearer bearer-token"); - Map catalogHeaders = ImmutableMap.of( - "Authorization", "Bearer client-credentials-token:sub=catalog"); + Map contextHeaders = + ImmutableMap.of("Authorization", "Bearer client-credentials-token:sub=user"); + Map initHeaders = ImmutableMap.of("Authorization", "Bearer bearer-token"); + Map catalogHeaders = + ImmutableMap.of("Authorization", "Bearer client-credentials-token:sub=catalog"); RESTCatalogAdapter adapter = Mockito.spy(new RESTCatalogAdapter(backendCatalog)); - SessionCatalog.SessionContext context = new SessionCatalog.SessionContext( - UUID.randomUUID().toString(), "user", ImmutableMap.of("credential", "user:secret"), ImmutableMap.of()); + SessionCatalog.SessionContext context = + new SessionCatalog.SessionContext( + 
UUID.randomUUID().toString(), + "user", + ImmutableMap.of("credential", "user:secret"), + ImmutableMap.of()); RESTCatalog catalog = new RESTCatalog(context, (config) -> adapter); - catalog.initialize("prod", ImmutableMap.of( - CatalogProperties.URI, "ignored", - "credential", "catalog:secret", - "token", "bearer-token")); + catalog.initialize( + "prod", + ImmutableMap.of( + CatalogProperties.URI, + "ignored", + "credential", + "catalog:secret", + "token", + "bearer-token")); Assertions.assertFalse(catalog.tableExists(TableIdentifier.of("ns", "table"))); // use the bearer token for client credentials - Mockito.verify(adapter).execute( - eq(HTTPMethod.POST), eq("v1/oauth/tokens"), any(), any(), eq(OAuthTokenResponse.class), eq(initHeaders), any()); + Mockito.verify(adapter) + .execute( + eq(HTTPMethod.POST), + eq("v1/oauth/tokens"), + any(), + any(), + eq(OAuthTokenResponse.class), + eq(initHeaders), + any()); // use the client credential token for config - Mockito.verify(adapter).execute( - eq(HTTPMethod.GET), eq("v1/config"), any(), any(), eq(ConfigResponse.class), eq(catalogHeaders), any()); + Mockito.verify(adapter) + .execute( + eq(HTTPMethod.GET), + eq("v1/config"), + any(), + any(), + eq(ConfigResponse.class), + eq(catalogHeaders), + any()); // use the client credential to fetch the context token - Mockito.verify(adapter).execute( - eq(HTTPMethod.POST), - eq("v1/oauth/tokens"), - any(), - any(), - eq(OAuthTokenResponse.class), - eq(catalogHeaders), - any()); + Mockito.verify(adapter) + .execute( + eq(HTTPMethod.POST), + eq("v1/oauth/tokens"), + any(), + any(), + eq(OAuthTokenResponse.class), + eq(catalogHeaders), + any()); // use the context token for table load - Mockito.verify(adapter).execute( - eq(HTTPMethod.GET), - eq("v1/namespaces/ns/tables/table"), - any(), - any(), - eq(LoadTableResponse.class), - eq(contextHeaders), - any()); + Mockito.verify(adapter) + .execute( + eq(HTTPMethod.GET), + eq("v1/namespaces/ns/tables/table"), + any(), + any(), + eq(LoadTableResponse.class), + eq(contextHeaders), + any()); } @Test public void testClientBearerToken() { - testClientAuth("bearer-token", + testClientAuth( + "bearer-token", ImmutableMap.of( "token", "client-bearer-token", "credential", "user:secret", @@ -436,7 +533,8 @@ public void testClientBearerToken() { @Test public void testClientCredential() { - testClientAuth("bearer-token", + testClientAuth( + "bearer-token", ImmutableMap.of( "credential", "user:secret", "urn:ietf:params:oauth:token-type:id_token", "id-token", @@ -449,96 +547,118 @@ public void testClientCredential() { @Test public void testClientIDToken() { - testClientAuth("bearer-token", + testClientAuth( + "bearer-token", ImmutableMap.of( "urn:ietf:params:oauth:token-type:id_token", "id-token", "urn:ietf:params:oauth:token-type:access_token", "access-token", "urn:ietf:params:oauth:token-type:jwt", "jwt-token", "urn:ietf:params:oauth:token-type:saml2", "saml2-token", "urn:ietf:params:oauth:token-type:saml1", "saml1-token"), - ImmutableMap.of("Authorization", "Bearer token-exchange-token:sub=id-token,act=bearer-token")); + ImmutableMap.of( + "Authorization", "Bearer token-exchange-token:sub=id-token,act=bearer-token")); } @Test public void testClientAccessToken() { - testClientAuth("bearer-token", + testClientAuth( + "bearer-token", ImmutableMap.of( "urn:ietf:params:oauth:token-type:access_token", "access-token", "urn:ietf:params:oauth:token-type:jwt", "jwt-token", "urn:ietf:params:oauth:token-type:saml2", "saml2-token", "urn:ietf:params:oauth:token-type:saml1", 
"saml1-token"), - ImmutableMap.of("Authorization", "Bearer token-exchange-token:sub=access-token,act=bearer-token")); + ImmutableMap.of( + "Authorization", "Bearer token-exchange-token:sub=access-token,act=bearer-token")); } @Test public void testClientJWTToken() { - testClientAuth("bearer-token", + testClientAuth( + "bearer-token", ImmutableMap.of( "urn:ietf:params:oauth:token-type:jwt", "jwt-token", "urn:ietf:params:oauth:token-type:saml2", "saml2-token", "urn:ietf:params:oauth:token-type:saml1", "saml1-token"), - ImmutableMap.of("Authorization", "Bearer token-exchange-token:sub=jwt-token,act=bearer-token")); + ImmutableMap.of( + "Authorization", "Bearer token-exchange-token:sub=jwt-token,act=bearer-token")); } @Test public void testClientSAML2Token() { - testClientAuth("bearer-token", + testClientAuth( + "bearer-token", ImmutableMap.of( "urn:ietf:params:oauth:token-type:saml2", "saml2-token", "urn:ietf:params:oauth:token-type:saml1", "saml1-token"), - ImmutableMap.of("Authorization", "Bearer token-exchange-token:sub=saml2-token,act=bearer-token")); + ImmutableMap.of( + "Authorization", "Bearer token-exchange-token:sub=saml2-token,act=bearer-token")); } @Test public void testClientSAML1Token() { - testClientAuth("bearer-token", + testClientAuth( + "bearer-token", ImmutableMap.of("urn:ietf:params:oauth:token-type:saml1", "saml1-token"), - ImmutableMap.of("Authorization", "Bearer token-exchange-token:sub=saml1-token,act=bearer-token")); + ImmutableMap.of( + "Authorization", "Bearer token-exchange-token:sub=saml1-token,act=bearer-token")); } - private void testClientAuth(String catalogToken, Map credentials, - Map expectedHeaders) { - Map catalogHeaders = ImmutableMap.of( - "Authorization", "Bearer " + catalogToken); + private void testClientAuth( + String catalogToken, Map credentials, Map expectedHeaders) { + Map catalogHeaders = ImmutableMap.of("Authorization", "Bearer " + catalogToken); RESTCatalogAdapter adapter = Mockito.spy(new RESTCatalogAdapter(backendCatalog)); - SessionCatalog.SessionContext context = new SessionCatalog.SessionContext( - UUID.randomUUID().toString(), "user", credentials, ImmutableMap.of()); + SessionCatalog.SessionContext context = + new SessionCatalog.SessionContext( + UUID.randomUUID().toString(), "user", credentials, ImmutableMap.of()); RESTCatalog catalog = new RESTCatalog(context, (config) -> adapter); - catalog.initialize("prod", ImmutableMap.of(CatalogProperties.URI, "ignored", "token", catalogToken)); + catalog.initialize( + "prod", ImmutableMap.of(CatalogProperties.URI, "ignored", "token", catalogToken)); Assertions.assertFalse(catalog.tableExists(TableIdentifier.of("ns", "table"))); - Mockito.verify(adapter).execute( - eq(HTTPMethod.GET), eq("v1/config"), any(), any(), eq(ConfigResponse.class), eq(catalogHeaders), any()); - - // token passes a static token. otherwise, validate a client credentials or token exchange request + Mockito.verify(adapter) + .execute( + eq(HTTPMethod.GET), + eq("v1/config"), + any(), + any(), + eq(ConfigResponse.class), + eq(catalogHeaders), + any()); + + // token passes a static token. 
otherwise, validate a client credentials or token exchange + // request if (!credentials.containsKey("token")) { - Mockito.verify(adapter).execute( - eq(HTTPMethod.POST), - eq("v1/oauth/tokens"), - any(), - any(), - eq(OAuthTokenResponse.class), - eq(catalogHeaders), - any()); + Mockito.verify(adapter) + .execute( + eq(HTTPMethod.POST), + eq("v1/oauth/tokens"), + any(), + any(), + eq(OAuthTokenResponse.class), + eq(catalogHeaders), + any()); } - Mockito.verify(adapter).execute( - eq(HTTPMethod.GET), - eq("v1/namespaces/ns/tables/table"), - any(), - any(), - eq(LoadTableResponse.class), - eq(expectedHeaders), - any()); + Mockito.verify(adapter) + .execute( + eq(HTTPMethod.GET), + eq("v1/namespaces/ns/tables/table"), + any(), + any(), + eq(LoadTableResponse.class), + eq(expectedHeaders), + any()); } @Test public void testTableBearerToken() { - testTableAuth("catalog", + testTableAuth( + "catalog", ImmutableMap.of("urn:ietf:params:oauth:token-type:id_token", "id-token"), ImmutableMap.of("token", "table-bearer-token"), ImmutableMap.of("Authorization", "Bearer token-exchange-token:sub=id-token,act=catalog"), @@ -547,265 +667,314 @@ public void testTableBearerToken() { @Test public void testTableIDToken() { - testTableAuth("catalog", + testTableAuth( + "catalog", ImmutableMap.of("urn:ietf:params:oauth:token-type:id_token", "id-token"), ImmutableMap.of("urn:ietf:params:oauth:token-type:id_token", "table-id-token"), ImmutableMap.of("Authorization", "Bearer token-exchange-token:sub=id-token,act=catalog"), - ImmutableMap.of("Authorization", + ImmutableMap.of( + "Authorization", "Bearer token-exchange-token:sub=table-id-token,act=token-exchange-token:sub=id-token,act=catalog")); } @Test public void testTableCredential() { - testTableAuth("catalog", + testTableAuth( + "catalog", ImmutableMap.of("urn:ietf:params:oauth:token-type:id_token", "id-token"), ImmutableMap.of("credential", "table-user:secret"), ImmutableMap.of("Authorization", "Bearer token-exchange-token:sub=id-token,act=catalog"), ImmutableMap.of("Authorization", "Bearer client-credentials-token:sub=table-user")); } - public void testTableAuth(String catalogToken, Map credentials, Map tableConfig, - Map expectedContextHeaders, Map expectedTableHeaders) { + public void testTableAuth( + String catalogToken, + Map credentials, + Map tableConfig, + Map expectedContextHeaders, + Map expectedTableHeaders) { TableIdentifier ident = TableIdentifier.of("ns", "table"); - Map catalogHeaders = ImmutableMap.of( - "Authorization", "Bearer " + catalogToken); + Map catalogHeaders = ImmutableMap.of("Authorization", "Bearer " + catalogToken); RESTCatalogAdapter adapter = Mockito.spy(new RESTCatalogAdapter(backendCatalog)); // inject the expected table config - Answer addTableConfig = invocation -> { - LoadTableResponse loadTable = (LoadTableResponse) invocation.callRealMethod(); - return LoadTableResponse.builder() - .withTableMetadata(loadTable.tableMetadata()) - .addAllConfig(loadTable.config()) - .addAllConfig(tableConfig) - .build(); - }; - - Mockito.doAnswer(addTableConfig).when(adapter).execute( - eq(HTTPMethod.POST), - eq("v1/namespaces/ns/tables"), - any(), - any(), - eq(LoadTableResponse.class), - eq(expectedContextHeaders), - any()); - - Mockito.doAnswer(addTableConfig).when(adapter).execute( - eq(HTTPMethod.GET), - eq("v1/namespaces/ns/tables/table"), - any(), - any(), - eq(LoadTableResponse.class), - eq(expectedContextHeaders), - any()); - - SessionCatalog.SessionContext context = new SessionCatalog.SessionContext( - UUID.randomUUID().toString(), 
"user", credentials, ImmutableMap.of()); + Answer addTableConfig = + invocation -> { + LoadTableResponse loadTable = (LoadTableResponse) invocation.callRealMethod(); + return LoadTableResponse.builder() + .withTableMetadata(loadTable.tableMetadata()) + .addAllConfig(loadTable.config()) + .addAllConfig(tableConfig) + .build(); + }; + + Mockito.doAnswer(addTableConfig) + .when(adapter) + .execute( + eq(HTTPMethod.POST), + eq("v1/namespaces/ns/tables"), + any(), + any(), + eq(LoadTableResponse.class), + eq(expectedContextHeaders), + any()); + + Mockito.doAnswer(addTableConfig) + .when(adapter) + .execute( + eq(HTTPMethod.GET), + eq("v1/namespaces/ns/tables/table"), + any(), + any(), + eq(LoadTableResponse.class), + eq(expectedContextHeaders), + any()); + + SessionCatalog.SessionContext context = + new SessionCatalog.SessionContext( + UUID.randomUUID().toString(), "user", credentials, ImmutableMap.of()); RESTCatalog catalog = new RESTCatalog(context, (config) -> adapter); - catalog.initialize("prod", ImmutableMap.of(CatalogProperties.URI, "ignored", "token", catalogToken)); + catalog.initialize( + "prod", ImmutableMap.of(CatalogProperties.URI, "ignored", "token", catalogToken)); - Schema expectedSchema = new Schema( - required(1, "id", Types.IntegerType.get(), "unique ID"), - required(2, "data", Types.StringType.get()) - ); + Schema expectedSchema = + new Schema( + required(1, "id", Types.IntegerType.get(), "unique ID"), + required(2, "data", Types.StringType.get())); Table table = catalog.createTable(ident, expectedSchema); - Assertions.assertEquals(expectedSchema.asStruct(), table.schema().asStruct(), "Schema should match"); + Assertions.assertEquals( + expectedSchema.asStruct(), table.schema().asStruct(), "Schema should match"); Table loaded = catalog.loadTable(ident); // the first load will send the token - Assertions.assertEquals(expectedSchema.asStruct(), loaded.schema().asStruct(), "Schema should match"); + Assertions.assertEquals( + expectedSchema.asStruct(), loaded.schema().asStruct(), "Schema should match"); loaded.refresh(); // refresh to force reload - Mockito.verify(adapter).execute( - eq(HTTPMethod.GET), eq("v1/config"), any(), any(), eq(ConfigResponse.class), eq(catalogHeaders), any()); + Mockito.verify(adapter) + .execute( + eq(HTTPMethod.GET), + eq("v1/config"), + any(), + any(), + eq(ConfigResponse.class), + eq(catalogHeaders), + any()); // session client credentials flow - Mockito.verify(adapter).execute( - eq(HTTPMethod.POST), - eq("v1/oauth/tokens"), - any(), - any(), - eq(OAuthTokenResponse.class), - eq(catalogHeaders), - any()); + Mockito.verify(adapter) + .execute( + eq(HTTPMethod.POST), + eq("v1/oauth/tokens"), + any(), + any(), + eq(OAuthTokenResponse.class), + eq(catalogHeaders), + any()); // create table request - Mockito.verify(adapter).execute( - eq(HTTPMethod.POST), - eq("v1/namespaces/ns/tables"), - any(), - any(), - eq(LoadTableResponse.class), - eq(expectedContextHeaders), - any()); + Mockito.verify(adapter) + .execute( + eq(HTTPMethod.POST), + eq("v1/namespaces/ns/tables"), + any(), + any(), + eq(LoadTableResponse.class), + eq(expectedContextHeaders), + any()); // if the table returned a bearer token, there will be no token request if (!tableConfig.containsKey("token")) { // client credentials or token exchange to get a table token - Mockito.verify(adapter, times(2)).execute( - eq(HTTPMethod.POST), - eq("v1/oauth/tokens"), - any(), - any(), - eq(OAuthTokenResponse.class), - eq(expectedContextHeaders), - any()); + Mockito.verify(adapter, times(2)) + .execute( 
+ eq(HTTPMethod.POST), + eq("v1/oauth/tokens"), + any(), + any(), + eq(OAuthTokenResponse.class), + eq(expectedContextHeaders), + any()); } // automatic refresh when metadata is accessed after commit - Mockito.verify(adapter).execute( - eq(HTTPMethod.GET), - eq("v1/namespaces/ns/tables/table"), - any(), - any(), - eq(LoadTableResponse.class), - eq(expectedTableHeaders), - any()); + Mockito.verify(adapter) + .execute( + eq(HTTPMethod.GET), + eq("v1/namespaces/ns/tables/table"), + any(), + any(), + eq(LoadTableResponse.class), + eq(expectedTableHeaders), + any()); // load table from catalog - Mockito.verify(adapter).execute( - eq(HTTPMethod.GET), - eq("v1/namespaces/ns/tables/table"), - any(), - any(), - eq(LoadTableResponse.class), - eq(expectedContextHeaders), - any()); + Mockito.verify(adapter) + .execute( + eq(HTTPMethod.GET), + eq("v1/namespaces/ns/tables/table"), + any(), + any(), + eq(LoadTableResponse.class), + eq(expectedContextHeaders), + any()); // refresh loaded table - Mockito.verify(adapter).execute( - eq(HTTPMethod.GET), - eq("v1/namespaces/ns/tables/table"), - any(), - any(), - eq(LoadTableResponse.class), - eq(expectedTableHeaders), - any()); + Mockito.verify(adapter) + .execute( + eq(HTTPMethod.GET), + eq("v1/namespaces/ns/tables/table"), + any(), + any(), + eq(LoadTableResponse.class), + eq(expectedTableHeaders), + any()); } @Test public void testCatalogTokenRefresh() throws Exception { Map emptyHeaders = ImmutableMap.of(); - Map catalogHeaders = ImmutableMap.of( - "Authorization", "Bearer client-credentials-token:sub=catalog"); + Map catalogHeaders = + ImmutableMap.of("Authorization", "Bearer client-credentials-token:sub=catalog"); RESTCatalogAdapter adapter = Mockito.spy(new RESTCatalogAdapter(backendCatalog)); - Answer addOneSecondExpiration = invocation -> { - OAuthTokenResponse response = (OAuthTokenResponse) invocation.callRealMethod(); - return OAuthTokenResponse.builder() - .withToken(response.token()) - .withTokenType(response.tokenType()) - .withIssuedTokenType(response.issuedTokenType()) - .addScopes(response.scopes()) - .setExpirationInSeconds(1) - .build(); - }; - - Mockito.doAnswer(addOneSecondExpiration).when(adapter).execute( - eq(HTTPMethod.POST), - eq("v1/oauth/tokens"), - any(), - any(), - eq(OAuthTokenResponse.class), - any(), - any()); + Answer addOneSecondExpiration = + invocation -> { + OAuthTokenResponse response = (OAuthTokenResponse) invocation.callRealMethod(); + return OAuthTokenResponse.builder() + .withToken(response.token()) + .withTokenType(response.tokenType()) + .withIssuedTokenType(response.issuedTokenType()) + .addScopes(response.scopes()) + .setExpirationInSeconds(1) + .build(); + }; + + Mockito.doAnswer(addOneSecondExpiration) + .when(adapter) + .execute( + eq(HTTPMethod.POST), + eq("v1/oauth/tokens"), + any(), + any(), + eq(OAuthTokenResponse.class), + any(), + any()); Map contextCredentials = ImmutableMap.of(); - SessionCatalog.SessionContext context = new SessionCatalog.SessionContext( - UUID.randomUUID().toString(), "user", contextCredentials, ImmutableMap.of()); + SessionCatalog.SessionContext context = + new SessionCatalog.SessionContext( + UUID.randomUUID().toString(), "user", contextCredentials, ImmutableMap.of()); RESTCatalog catalog = new RESTCatalog(context, (config) -> adapter); - catalog.initialize("prod", ImmutableMap.of(CatalogProperties.URI, "ignored", "credential", "catalog:secret")); + catalog.initialize( + "prod", ImmutableMap.of(CatalogProperties.URI, "ignored", "credential", "catalog:secret")); Thread.sleep(3_000); 
// sleep until after 2 refresh calls // call client credentials with no initial auth - Mockito.verify(adapter).execute( - eq(HTTPMethod.POST), - eq("v1/oauth/tokens"), - any(), - any(), - eq(OAuthTokenResponse.class), - eq(emptyHeaders), - any()); + Mockito.verify(adapter) + .execute( + eq(HTTPMethod.POST), + eq("v1/oauth/tokens"), + any(), + any(), + eq(OAuthTokenResponse.class), + eq(emptyHeaders), + any()); // use the client credential token for config - Mockito.verify(adapter).execute( - eq(HTTPMethod.GET), eq("v1/config"), any(), any(), eq(ConfigResponse.class), eq(catalogHeaders), any()); + Mockito.verify(adapter) + .execute( + eq(HTTPMethod.GET), + eq("v1/config"), + any(), + any(), + eq(ConfigResponse.class), + eq(catalogHeaders), + any()); // verify the first token exchange - Map firstRefreshRequest = ImmutableMap.of( - "grant_type", "urn:ietf:params:oauth:grant-type:token-exchange", - "subject_token", "client-credentials-token:sub=catalog", - "subject_token_type", "urn:ietf:params:oauth:token-type:access_token", - "scope", "catalog" - ); - Mockito.verify(adapter).execute( - eq(HTTPMethod.POST), - eq("v1/oauth/tokens"), - any(), - Mockito.argThat(firstRefreshRequest::equals), - eq(OAuthTokenResponse.class), - eq(catalogHeaders), - any()); + Map firstRefreshRequest = + ImmutableMap.of( + "grant_type", "urn:ietf:params:oauth:grant-type:token-exchange", + "subject_token", "client-credentials-token:sub=catalog", + "subject_token_type", "urn:ietf:params:oauth:token-type:access_token", + "scope", "catalog"); + Mockito.verify(adapter) + .execute( + eq(HTTPMethod.POST), + eq("v1/oauth/tokens"), + any(), + Mockito.argThat(firstRefreshRequest::equals), + eq(OAuthTokenResponse.class), + eq(catalogHeaders), + any()); // verify that a second exchange occurs - Map secondRefreshRequest = ImmutableMap.of( - "grant_type", "urn:ietf:params:oauth:grant-type:token-exchange", - "subject_token", "token-exchange-token:sub=client-credentials-token:sub=catalog", - "subject_token_type", "urn:ietf:params:oauth:token-type:access_token", - "scope", "catalog" - ); - Map secondRefreshHeaders = ImmutableMap.of( - "Authorization", "Bearer token-exchange-token:sub=client-credentials-token:sub=catalog" - ); - Mockito.verify(adapter).execute( - eq(HTTPMethod.POST), - eq("v1/oauth/tokens"), - any(), - Mockito.argThat(secondRefreshRequest::equals), - eq(OAuthTokenResponse.class), - eq(secondRefreshHeaders), - any()); + Map secondRefreshRequest = + ImmutableMap.of( + "grant_type", "urn:ietf:params:oauth:grant-type:token-exchange", + "subject_token", "token-exchange-token:sub=client-credentials-token:sub=catalog", + "subject_token_type", "urn:ietf:params:oauth:token-type:access_token", + "scope", "catalog"); + Map secondRefreshHeaders = + ImmutableMap.of( + "Authorization", + "Bearer token-exchange-token:sub=client-credentials-token:sub=catalog"); + Mockito.verify(adapter) + .execute( + eq(HTTPMethod.POST), + eq("v1/oauth/tokens"), + any(), + Mockito.argThat(secondRefreshRequest::equals), + eq(OAuthTokenResponse.class), + eq(secondRefreshHeaders), + any()); } @Test public void testCatalogRefreshedTokenIsUsed() throws Exception { Map emptyHeaders = ImmutableMap.of(); - Map catalogHeaders = ImmutableMap.of( - "Authorization", "Bearer client-credentials-token:sub=catalog"); + Map catalogHeaders = + ImmutableMap.of("Authorization", "Bearer client-credentials-token:sub=catalog"); RESTCatalogAdapter adapter = Mockito.spy(new RESTCatalogAdapter(backendCatalog)); - Answer addOneSecondExpiration = invocation -> { - 
OAuthTokenResponse response = (OAuthTokenResponse) invocation.callRealMethod(); - return OAuthTokenResponse.builder() - .withToken(response.token()) - .withTokenType(response.tokenType()) - .withIssuedTokenType(response.issuedTokenType()) - .addScopes(response.scopes()) - .setExpirationInSeconds(1) - .build(); - }; - - Mockito.doAnswer(addOneSecondExpiration).when(adapter).execute( - eq(HTTPMethod.POST), - eq("v1/oauth/tokens"), - any(), - any(), - eq(OAuthTokenResponse.class), - any(), - any()); + Answer addOneSecondExpiration = + invocation -> { + OAuthTokenResponse response = (OAuthTokenResponse) invocation.callRealMethod(); + return OAuthTokenResponse.builder() + .withToken(response.token()) + .withTokenType(response.tokenType()) + .withIssuedTokenType(response.issuedTokenType()) + .addScopes(response.scopes()) + .setExpirationInSeconds(1) + .build(); + }; + + Mockito.doAnswer(addOneSecondExpiration) + .when(adapter) + .execute( + eq(HTTPMethod.POST), + eq("v1/oauth/tokens"), + any(), + any(), + eq(OAuthTokenResponse.class), + any(), + any()); Map contextCredentials = ImmutableMap.of(); - SessionCatalog.SessionContext context = new SessionCatalog.SessionContext( - UUID.randomUUID().toString(), "user", contextCredentials, ImmutableMap.of()); + SessionCatalog.SessionContext context = + new SessionCatalog.SessionContext( + UUID.randomUUID().toString(), "user", contextCredentials, ImmutableMap.of()); RESTCatalog catalog = new RESTCatalog(context, (config) -> adapter); - catalog.initialize("prod", ImmutableMap.of(CatalogProperties.URI, "ignored", "credential", "catalog:secret")); + catalog.initialize( + "prod", ImmutableMap.of(CatalogProperties.URI, "ignored", "credential", "catalog:secret")); Thread.sleep(1_100); // sleep until after 2 refresh calls @@ -813,46 +982,57 @@ public void testCatalogRefreshedTokenIsUsed() throws Exception { Assertions.assertFalse(catalog.tableExists(TableIdentifier.of("ns", "table"))); // call client credentials with no initial auth - Mockito.verify(adapter).execute( - eq(HTTPMethod.POST), - eq("v1/oauth/tokens"), - any(), - any(), - eq(OAuthTokenResponse.class), - eq(emptyHeaders), - any()); + Mockito.verify(adapter) + .execute( + eq(HTTPMethod.POST), + eq("v1/oauth/tokens"), + any(), + any(), + eq(OAuthTokenResponse.class), + eq(emptyHeaders), + any()); // use the client credential token for config - Mockito.verify(adapter).execute( - eq(HTTPMethod.GET), eq("v1/config"), any(), any(), eq(ConfigResponse.class), eq(catalogHeaders), any()); + Mockito.verify(adapter) + .execute( + eq(HTTPMethod.GET), + eq("v1/config"), + any(), + any(), + eq(ConfigResponse.class), + eq(catalogHeaders), + any()); // verify the first token exchange - Map firstRefreshRequest = ImmutableMap.of( - "grant_type", "urn:ietf:params:oauth:grant-type:token-exchange", - "subject_token", "client-credentials-token:sub=catalog", - "subject_token_type", "urn:ietf:params:oauth:token-type:access_token", - "scope", "catalog" - ); - Mockito.verify(adapter).execute( - eq(HTTPMethod.POST), - eq("v1/oauth/tokens"), - any(), - Mockito.argThat(firstRefreshRequest::equals), - eq(OAuthTokenResponse.class), - eq(catalogHeaders), - any()); + Map firstRefreshRequest = + ImmutableMap.of( + "grant_type", "urn:ietf:params:oauth:grant-type:token-exchange", + "subject_token", "client-credentials-token:sub=catalog", + "subject_token_type", "urn:ietf:params:oauth:token-type:access_token", + "scope", "catalog"); + Mockito.verify(adapter) + .execute( + eq(HTTPMethod.POST), + eq("v1/oauth/tokens"), + any(), + 
Mockito.argThat(firstRefreshRequest::equals), + eq(OAuthTokenResponse.class), + eq(catalogHeaders), + any()); // use the refreshed context token for table load - Map refreshedCatalogHeader = ImmutableMap.of( - "Authorization", "Bearer token-exchange-token:sub=client-credentials-token:sub=catalog" - ); - Mockito.verify(adapter).execute( - eq(HTTPMethod.GET), - eq("v1/namespaces/ns/tables/table"), - any(), - any(), - eq(LoadTableResponse.class), - eq(refreshedCatalogHeader), - any()); + Map refreshedCatalogHeader = + ImmutableMap.of( + "Authorization", + "Bearer token-exchange-token:sub=client-credentials-token:sub=catalog"); + Mockito.verify(adapter) + .execute( + eq(HTTPMethod.GET), + eq("v1/namespaces/ns/tables/table"), + any(), + any(), + eq(LoadTableResponse.class), + eq(refreshedCatalogHeader), + any()); } } diff --git a/core/src/test/java/org/apache/iceberg/rest/TestRESTUtil.java b/core/src/test/java/org/apache/iceberg/rest/TestRESTUtil.java index ddb8eef87e48..57c25d4a5553 100644 --- a/core/src/test/java/org/apache/iceberg/rest/TestRESTUtil.java +++ b/core/src/test/java/org/apache/iceberg/rest/TestRESTUtil.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.rest; import java.util.Map; @@ -29,19 +28,20 @@ public class TestRESTUtil { @Test public void testExtractPrefixMap() { - Map input = ImmutableMap.of( - "warehouse", "/tmp/warehouse", - "rest.prefix", "/ws/ralphs_catalog", - "rest.token", "YnVybiBhZnRlciByZWFkaW5nIC0gYWxzbyBoYW5rIGFuZCByYXVsIDQgZXZlcgo=", - "rest.rest.uri", "https://localhost:1080/", - "doesnt_start_with_prefix.rest", "", - "", ""); - - Map expected = ImmutableMap.of( - "prefix", "/ws/ralphs_catalog", - "token", "YnVybiBhZnRlciByZWFkaW5nIC0gYWxzbyBoYW5rIGFuZCByYXVsIDQgZXZlcgo=", - "rest.uri", "https://localhost:1080/" - ); + Map input = + ImmutableMap.of( + "warehouse", "/tmp/warehouse", + "rest.prefix", "/ws/ralphs_catalog", + "rest.token", "YnVybiBhZnRlciByZWFkaW5nIC0gYWxzbyBoYW5rIGFuZCByYXVsIDQgZXZlcgo=", + "rest.rest.uri", "https://localhost:1080/", + "doesnt_start_with_prefix.rest", "", + "", ""); + + Map expected = + ImmutableMap.of( + "prefix", "/ws/ralphs_catalog", + "token", "YnVybiBhZnRlciByZWFkaW5nIC0gYWxzbyBoYW5rIGFuZCByYXVsIDQgZXZlcgo=", + "rest.uri", "https://localhost:1080/"); Map actual = RESTUtil.extractPrefixMap(input, "rest."); @@ -50,12 +50,13 @@ public void testExtractPrefixMap() { @Test public void testStripTrailingSlash() { - String[][] testCases = new String[][] { - new String[] {"https://foo", "https://foo"}, - new String[] {"https://foo/", "https://foo"}, - new String[] {"https://foo////", "https://foo"}, - new String[] {null, null} - }; + String[][] testCases = + new String[][] { + new String[] {"https://foo", "https://foo"}, + new String[] {"https://foo/", "https://foo"}, + new String[] {"https://foo////", "https://foo"}, + new String[] {null, null} + }; for (String[] testCase : testCases) { String input = testCase[0]; @@ -67,16 +68,17 @@ public void testStripTrailingSlash() { @Test public void testRoundTripUrlEncodeDecodeNamespace() { // Namespace levels and their expected url encoded form - Object[][] testCases = new Object[][] { - new Object[] {new String[] {"dogs"}, "dogs"}, - new Object[] {new String[] {"dogs.named.hank"}, "dogs.named.hank"}, - new Object[] {new String[] {"dogs/named/hank"}, "dogs%2Fnamed%2Fhank"}, - new Object[] {new String[] {"dogs", "named", "hank"}, "dogs%1Fnamed%1Fhank"}, - new Object[] { + Object[][] testCases = + new Object[][] 
{ + new Object[] {new String[] {"dogs"}, "dogs"}, + new Object[] {new String[] {"dogs.named.hank"}, "dogs.named.hank"}, + new Object[] {new String[] {"dogs/named/hank"}, "dogs%2Fnamed%2Fhank"}, + new Object[] {new String[] {"dogs", "named", "hank"}, "dogs%1Fnamed%1Fhank"}, + new Object[] { new String[] {"dogs.and.cats", "named", "hank.or.james-westfall"}, "dogs.and.cats%1Fnamed%1Fhank.or.james-westfall" - } - }; + } + }; for (Object[] namespaceWithEncoding : testCases) { String[] levels = (String[]) namespaceWithEncoding[0]; @@ -85,8 +87,7 @@ public void testRoundTripUrlEncodeDecodeNamespace() { Namespace namespace = Namespace.of(levels); // To be placed into a URL path as query parameter or path parameter - Assertions.assertThat(RESTUtil.encodeNamespace(namespace)) - .isEqualTo(encodedNs); + Assertions.assertThat(RESTUtil.encodeNamespace(namespace)).isEqualTo(encodedNs); // Decoded (after pulling as String) from URL Namespace asNamespace = RESTUtil.decodeNamespace(encodedNs); diff --git a/core/src/test/java/org/apache/iceberg/rest/TestResourcePaths.java b/core/src/test/java/org/apache/iceberg/rest/TestResourcePaths.java index 65667cff0263..dc1fbb0c9c95 100644 --- a/core/src/test/java/org/apache/iceberg/rest/TestResourcePaths.java +++ b/core/src/test/java/org/apache/iceberg/rest/TestResourcePaths.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.rest; import org.apache.iceberg.catalog.Namespace; @@ -27,7 +26,8 @@ public class TestResourcePaths { private final String prefix = "ws/catalog"; - private final ResourcePaths withPrefix = ResourcePaths.forCatalogProperties(ImmutableMap.of("prefix", prefix)); + private final ResourcePaths withPrefix = + ResourcePaths.forCatalogProperties(ImmutableMap.of("prefix", prefix)); private final ResourcePaths withoutPrefix = ResourcePaths.forCatalogProperties(ImmutableMap.of()); @Test @@ -66,21 +66,24 @@ public void testNamespaceWithMultipartNamespace() { @Test public void testNamespaceProperties() { Namespace ns = Namespace.of("ns"); - Assert.assertEquals("v1/ws/catalog/namespaces/ns/properties", withPrefix.namespaceProperties(ns)); + Assert.assertEquals( + "v1/ws/catalog/namespaces/ns/properties", withPrefix.namespaceProperties(ns)); Assert.assertEquals("v1/namespaces/ns/properties", withoutPrefix.namespaceProperties(ns)); } @Test public void testNamespacePropertiesWithSlash() { Namespace ns = Namespace.of("n/s"); - Assert.assertEquals("v1/ws/catalog/namespaces/n%2Fs/properties", withPrefix.namespaceProperties(ns)); + Assert.assertEquals( + "v1/ws/catalog/namespaces/n%2Fs/properties", withPrefix.namespaceProperties(ns)); Assert.assertEquals("v1/namespaces/n%2Fs/properties", withoutPrefix.namespaceProperties(ns)); } @Test public void testNamespacePropertiesWithMultipartNamespace() { Namespace ns = Namespace.of("n", "s"); - Assert.assertEquals("v1/ws/catalog/namespaces/n%1Fs/properties", withPrefix.namespaceProperties(ns)); + Assert.assertEquals( + "v1/ws/catalog/namespaces/n%1Fs/properties", withPrefix.namespaceProperties(ns)); Assert.assertEquals("v1/namespaces/n%1Fs/properties", withoutPrefix.namespaceProperties(ns)); } diff --git a/core/src/test/java/org/apache/iceberg/rest/auth/TestOAuth2Util.java b/core/src/test/java/org/apache/iceberg/rest/auth/TestOAuth2Util.java index 3a1d59061cc6..c85131cda2c7 100644 --- a/core/src/test/java/org/apache/iceberg/rest/auth/TestOAuth2Util.java +++ b/core/src/test/java/org/apache/iceberg/rest/auth/TestOAuth2Util.java @@ -16,7 +16,6 @@ * 
specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.rest.auth; import org.junit.Assert; @@ -30,16 +29,25 @@ public void testOAuthScopeTokenValidation() { Assert.assertFalse("Should reject scope token with \\", OAuth2Util.isValidScopeToken("a\\b")); Assert.assertFalse("Should reject scope token with space", OAuth2Util.isValidScopeToken("a b")); Assert.assertFalse("Should reject scope token with \"", OAuth2Util.isValidScopeToken("a\"b")); - Assert.assertFalse("Should reject scope token with DEL", OAuth2Util.isValidScopeToken("\u007F")); + Assert.assertFalse( + "Should reject scope token with DEL", OAuth2Util.isValidScopeToken("\u007F")); // test all characters that are inside of the ! to ~ range and are not excluded - Assert.assertTrue("Should accept scope token with !-/", OAuth2Util.isValidScopeToken("!#$%&'()*+,-./")); - Assert.assertTrue("Should accept scope token with 0-9", OAuth2Util.isValidScopeToken("0123456789")); - Assert.assertTrue("Should accept scope token with :-@", OAuth2Util.isValidScopeToken(":;<=>?@")); - Assert.assertTrue("Should accept scope token with A-M", OAuth2Util.isValidScopeToken("ABCDEFGHIJKLM")); - Assert.assertTrue("Should accept scope token with N-Z", OAuth2Util.isValidScopeToken("NOPQRSTUVWXYZ")); - Assert.assertTrue("Should accept scope token with [-`, not \\", OAuth2Util.isValidScopeToken("[]^_`")); - Assert.assertTrue("Should accept scope token with a-m", OAuth2Util.isValidScopeToken("abcdefghijklm")); - Assert.assertTrue("Should accept scope token with n-z", OAuth2Util.isValidScopeToken("nopqrstuvwxyz")); + Assert.assertTrue( + "Should accept scope token with !-/", OAuth2Util.isValidScopeToken("!#$%&'()*+,-./")); + Assert.assertTrue( + "Should accept scope token with 0-9", OAuth2Util.isValidScopeToken("0123456789")); + Assert.assertTrue( + "Should accept scope token with :-@", OAuth2Util.isValidScopeToken(":;<=>?@")); + Assert.assertTrue( + "Should accept scope token with A-M", OAuth2Util.isValidScopeToken("ABCDEFGHIJKLM")); + Assert.assertTrue( + "Should accept scope token with N-Z", OAuth2Util.isValidScopeToken("NOPQRSTUVWXYZ")); + Assert.assertTrue( + "Should accept scope token with [-`, not \\", OAuth2Util.isValidScopeToken("[]^_`")); + Assert.assertTrue( + "Should accept scope token with a-m", OAuth2Util.isValidScopeToken("abcdefghijklm")); + Assert.assertTrue( + "Should accept scope token with n-z", OAuth2Util.isValidScopeToken("nopqrstuvwxyz")); Assert.assertTrue("Should accept scope token with {-~", OAuth2Util.isValidScopeToken("{|}~")); } } diff --git a/core/src/test/java/org/apache/iceberg/rest/requests/TestCreateNamespaceRequest.java b/core/src/test/java/org/apache/iceberg/rest/requests/TestCreateNamespaceRequest.java index cadbb9b07739..2ea7e75aaf51 100644 --- a/core/src/test/java/org/apache/iceberg/rest/requests/TestCreateNamespaceRequest.java +++ b/core/src/test/java/org/apache/iceberg/rest/requests/TestCreateNamespaceRequest.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.rest.requests; import com.fasterxml.jackson.core.JsonProcessingException; @@ -39,23 +38,27 @@ public class TestCreateNamespaceRequest extends RequestResponseTestBase deserialize(jsonIncorrectTypeForNamespace) - ); + () -> deserialize(jsonIncorrectTypeForNamespace)); - String jsonIncorrectTypeForProperties = "{\"namespace\":[\"accounting\",\"tax\"],\"properties\":[]}"; + String jsonIncorrectTypeForProperties = + "{\"namespace\":[\"accounting\",\"tax\"],\"properties\":[]}"; AssertHelpers.assertThrows( "A JSON request with incorrect types for fields should fail to parse and validate", JsonProcessingException.class, - () -> deserialize(jsonIncorrectTypeForProperties) - ); + () -> deserialize(jsonIncorrectTypeForProperties)); - String jsonMisspelledKeys = "{\"namepsace\":[\"accounting\",\"tax\"],\"propertiezzzz\":{\"owner\":\"Hank\"}}"; + String jsonMisspelledKeys = + "{\"namepsace\":[\"accounting\",\"tax\"],\"propertiezzzz\":{\"owner\":\"Hank\"}}"; AssertHelpers.assertThrows( "A JSON request with the keys spelled incorrectly should fail to deserialize and validate", IllegalArgumentException.class, "Invalid namespace: null", - () -> deserialize(jsonMisspelledKeys) - ); + () -> deserialize(jsonMisspelledKeys)); String emptyJson = "{}"; AssertHelpers.assertThrows( "An empty JSON object should not parse into a CreateNamespaceRequest instance that passes validation", IllegalArgumentException.class, "Invalid namespace: null", - () -> deserialize(emptyJson) - ); + () -> deserialize(emptyJson)); AssertHelpers.assertThrows( "An empty JSON request should fail to deserialize", IllegalArgumentException.class, - () -> deserialize(null) - ); + () -> deserialize(null)); } @Test @@ -116,15 +117,13 @@ public void testBuilderDoesNotBuildInvalidRequests() { "The builder should not allow using null for the namespace", NullPointerException.class, "Invalid namespace: null", - () -> CreateNamespaceRequest.builder().withNamespace(null).build() - ); + () -> CreateNamespaceRequest.builder().withNamespace(null).build()); AssertHelpers.assertThrows( "The builder should not allow passing a null collection of properties", NullPointerException.class, "Invalid collection of properties: null", - () -> CreateNamespaceRequest.builder().setProperties(null).build() - ); + () -> CreateNamespaceRequest.builder().setProperties(null).build()); Map mapWithNullKey = Maps.newHashMap(); mapWithNullKey.put(null, "hello"); @@ -132,8 +131,7 @@ public void testBuilderDoesNotBuildInvalidRequests() { "The builder should not allow using null as a key in the properties to set", IllegalArgumentException.class, "Invalid property: null", - () -> CreateNamespaceRequest.builder().setProperties(mapWithNullKey).build() - ); + () -> CreateNamespaceRequest.builder().setProperties(mapWithNullKey).build()); Map mapWithMultipleNullValues = Maps.newHashMap(); mapWithMultipleNullValues.put("a", null); @@ -142,8 +140,7 @@ public void testBuilderDoesNotBuildInvalidRequests() { "The builder should not allow using null as a value in the properties to set", IllegalArgumentException.class, "Invalid value for properties [a]: null", - () -> CreateNamespaceRequest.builder().setProperties(mapWithMultipleNullValues).build() - ); + () -> CreateNamespaceRequest.builder().setProperties(mapWithMultipleNullValues).build()); } @Override diff --git a/core/src/test/java/org/apache/iceberg/rest/requests/TestCreateTableRequest.java b/core/src/test/java/org/apache/iceberg/rest/requests/TestCreateTableRequest.java index 4ef9dedaeadb..9176e8ad4d24 
100644 --- a/core/src/test/java/org/apache/iceberg/rest/requests/TestCreateTableRequest.java +++ b/core/src/test/java/org/apache/iceberg/rest/requests/TestCreateTableRequest.java @@ -16,9 +16,11 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.rest.requests; +import static org.apache.iceberg.types.Types.NestedField.optional; +import static org.apache.iceberg.types.Types.NestedField.required; + import com.fasterxml.jackson.core.JsonProcessingException; import java.util.Map; import org.apache.iceberg.AssertHelpers; @@ -36,9 +38,6 @@ import org.junit.Assert; import org.junit.Test; -import static org.apache.iceberg.types.Types.NestedField.optional; -import static org.apache.iceberg.types.Types.NestedField.required; - public class TestCreateTableRequest extends RequestResponseTestBase { /* Values used to fill in request fields */ @@ -46,92 +45,104 @@ public class TestCreateTableRequest extends RequestResponseTestBase EMPTY_PROPERTIES = ImmutableMap.of(); private static final String SAMPLE_NAME = "test_tbl"; private static final String SAMPLE_LOCATION = "file://tmp/location/"; - private static final Schema SAMPLE_SCHEMA = new Schema( - required(1, "id", Types.IntegerType.get()), - optional(2, "data", Types.StringType.get())); + private static final Schema SAMPLE_SCHEMA = + new Schema( + required(1, "id", Types.IntegerType.get()), optional(2, "data", Types.StringType.get())); private static final String SAMPLE_SCHEMA_JSON = SchemaParser.toJson(SAMPLE_SCHEMA); - private static final PartitionSpec SAMPLE_SPEC = PartitionSpec.builderFor(SAMPLE_SCHEMA) - .bucket("id", 16) - .build(); - private static final SortOrder SAMPLE_WRITE_ORDER = SortOrder.builderFor(SAMPLE_SCHEMA) - .asc("data", NullOrder.NULLS_LAST) - .build(); + private static final PartitionSpec SAMPLE_SPEC = + PartitionSpec.builderFor(SAMPLE_SCHEMA).bucket("id", 16).build(); + private static final SortOrder SAMPLE_WRITE_ORDER = + SortOrder.builderFor(SAMPLE_SCHEMA).asc("data", NullOrder.NULLS_LAST).build(); @Test // Test cases that are JSON that can be created via the Builder public void testRoundTripSerDe() throws JsonProcessingException { String fullJsonRaw = - "{\"name\":\"test_tbl\",\"location\":\"file://tmp/location/\",\"schema\":{\"type\":\"struct\"," + - "\"schema-id\":0,\"fields\":[{\"id\":1,\"name\":\"id\",\"required\":true,\"type\":\"int\"}," + - "{\"id\":2,\"name\":\"data\",\"required\":false,\"type\":\"string\"}]},\"partition-spec\":{\"spec-id\":0," + - "\"fields\":[{\"name\":\"id_bucket\",\"transform\":\"bucket[16]\",\"source-id\":1,\"field-id\":1000}]}," + - "\"write-order\":{\"order-id\":1,\"fields\":" + - "[{\"transform\":\"identity\",\"source-id\":2,\"direction\":\"asc\",\"null-order\":\"nulls-last\"}]}," + - "\"properties\":{\"owner\":\"Hank\"},\"stage-create\":false}"; - - CreateTableRequest req = CreateTableRequest.builder() - .withName(SAMPLE_NAME) - .withLocation(SAMPLE_LOCATION) - .withSchema(SAMPLE_SCHEMA) - .setProperties(SAMPLE_PROPERTIES) - .withPartitionSpec(SAMPLE_SPEC) - .withWriteOrder(SAMPLE_WRITE_ORDER) - .build(); + "{\"name\":\"test_tbl\",\"location\":\"file://tmp/location/\",\"schema\":{\"type\":\"struct\"," + + "\"schema-id\":0,\"fields\":[{\"id\":1,\"name\":\"id\",\"required\":true,\"type\":\"int\"}," + + "{\"id\":2,\"name\":\"data\",\"required\":false,\"type\":\"string\"}]},\"partition-spec\":{\"spec-id\":0," + + "\"fields\":[{\"name\":\"id_bucket\",\"transform\":\"bucket[16]\",\"source-id\":1,\"field-id\":1000}]}," + + 
"\"write-order\":{\"order-id\":1,\"fields\":" + + "[{\"transform\":\"identity\",\"source-id\":2,\"direction\":\"asc\",\"null-order\":\"nulls-last\"}]}," + + "\"properties\":{\"owner\":\"Hank\"},\"stage-create\":false}"; + + CreateTableRequest req = + CreateTableRequest.builder() + .withName(SAMPLE_NAME) + .withLocation(SAMPLE_LOCATION) + .withSchema(SAMPLE_SCHEMA) + .setProperties(SAMPLE_PROPERTIES) + .withPartitionSpec(SAMPLE_SPEC) + .withWriteOrder(SAMPLE_WRITE_ORDER) + .build(); assertRoundTripSerializesEquallyFrom(fullJsonRaw, req); - // The same JSON but using existing parsers for clarity and staging the request instead of committing - String jsonStagedReq = String.format( - "{\"name\":\"%s\",\"location\":\"%s\",\"schema\":%s,\"partition-spec\":%s," + - "\"write-order\":%s,\"properties\":%s,\"stage-create\":%b}", - SAMPLE_NAME, SAMPLE_LOCATION, SchemaParser.toJson(SAMPLE_SCHEMA), - PartitionSpecParser.toJson(SAMPLE_SPEC.toUnbound()), SortOrderParser.toJson(SAMPLE_WRITE_ORDER.toUnbound()), - mapper().writeValueAsString(SAMPLE_PROPERTIES), true); - - CreateTableRequest stagedReq = CreateTableRequest.builder() - .withName(SAMPLE_NAME) - .withLocation(SAMPLE_LOCATION) - .withSchema(SAMPLE_SCHEMA) - .setProperty("owner", "Hank") - .withPartitionSpec(SAMPLE_SPEC) - .withWriteOrder(SAMPLE_WRITE_ORDER) - .stageCreate() - .build(); + // The same JSON but using existing parsers for clarity and staging the request instead of + // committing + String jsonStagedReq = + String.format( + "{\"name\":\"%s\",\"location\":\"%s\",\"schema\":%s,\"partition-spec\":%s," + + "\"write-order\":%s,\"properties\":%s,\"stage-create\":%b}", + SAMPLE_NAME, + SAMPLE_LOCATION, + SchemaParser.toJson(SAMPLE_SCHEMA), + PartitionSpecParser.toJson(SAMPLE_SPEC.toUnbound()), + SortOrderParser.toJson(SAMPLE_WRITE_ORDER.toUnbound()), + mapper().writeValueAsString(SAMPLE_PROPERTIES), + true); + + CreateTableRequest stagedReq = + CreateTableRequest.builder() + .withName(SAMPLE_NAME) + .withLocation(SAMPLE_LOCATION) + .withSchema(SAMPLE_SCHEMA) + .setProperty("owner", "Hank") + .withPartitionSpec(SAMPLE_SPEC) + .withWriteOrder(SAMPLE_WRITE_ORDER) + .stageCreate() + .build(); assertRoundTripSerializesEquallyFrom(jsonStagedReq, stagedReq); - // Partition spec and write order can be null or use PartitionSpec.unpartitioned() and SortOrder.unsorted() - String jsonWithExplicitUnsortedUnordered = String.format( - "{\"name\":\"%s\",\"location\":null,\"schema\":%s,\"partition-spec\":%s," + - "\"write-order\":%s,\"properties\":{},\"stage-create\":%b}", - SAMPLE_NAME, SchemaParser.toJson(SAMPLE_SCHEMA), - PartitionSpecParser.toJson(PartitionSpec.unpartitioned()), - SortOrderParser.toJson(SortOrder.unsorted().toUnbound()), - /* stageCreate */ false); - - CreateTableRequest reqOnlyRequiredFieldsExplicitDefaults = CreateTableRequest.builder() - .withName(SAMPLE_NAME) - .withLocation(null) - .withSchema(SAMPLE_SCHEMA) - .setProperties(EMPTY_PROPERTIES) - .withPartitionSpec(PartitionSpec.unpartitioned()) - .withWriteOrder(SortOrder.unsorted()) - .build(); + // Partition spec and write order can be null or use PartitionSpec.unpartitioned() and + // SortOrder.unsorted() + String jsonWithExplicitUnsortedUnordered = + String.format( + "{\"name\":\"%s\",\"location\":null,\"schema\":%s,\"partition-spec\":%s," + + "\"write-order\":%s,\"properties\":{},\"stage-create\":%b}", + SAMPLE_NAME, + SchemaParser.toJson(SAMPLE_SCHEMA), + PartitionSpecParser.toJson(PartitionSpec.unpartitioned()), + SortOrderParser.toJson(SortOrder.unsorted().toUnbound()), + /* 
stageCreate */ false); + + CreateTableRequest reqOnlyRequiredFieldsExplicitDefaults = + CreateTableRequest.builder() + .withName(SAMPLE_NAME) + .withLocation(null) + .withSchema(SAMPLE_SCHEMA) + .setProperties(EMPTY_PROPERTIES) + .withPartitionSpec(PartitionSpec.unpartitioned()) + .withWriteOrder(SortOrder.unsorted()) + .build(); assertRoundTripSerializesEquallyFrom( jsonWithExplicitUnsortedUnordered, reqOnlyRequiredFieldsExplicitDefaults); - String jsonOnlyRequiredFieldsNullAsDefault = String.format( - "{\"name\":\"%s\",\"location\":null,\"schema\":%s,\"partition-spec\":null,\"write-order\":null," + - "\"properties\":{},\"stage-create\":false}", - SAMPLE_NAME, SchemaParser.toJson(SAMPLE_SCHEMA)); + String jsonOnlyRequiredFieldsNullAsDefault = + String.format( + "{\"name\":\"%s\",\"location\":null,\"schema\":%s,\"partition-spec\":null,\"write-order\":null," + + "\"properties\":{},\"stage-create\":false}", + SAMPLE_NAME, SchemaParser.toJson(SAMPLE_SCHEMA)); - CreateTableRequest reqOnlyRequiredFieldsMissingDefaults = CreateTableRequest.builder() - .withName(SAMPLE_NAME) - .withSchema(SAMPLE_SCHEMA) - .withPartitionSpec(null) - .withWriteOrder(null) - .build(); + CreateTableRequest reqOnlyRequiredFieldsMissingDefaults = + CreateTableRequest.builder() + .withName(SAMPLE_NAME) + .withSchema(SAMPLE_SCHEMA) + .withPartitionSpec(null) + .withWriteOrder(null) + .build(); assertRoundTripSerializesEquallyFrom( jsonOnlyRequiredFieldsNullAsDefault, reqOnlyRequiredFieldsMissingDefaults); @@ -141,60 +152,59 @@ public void testRoundTripSerDe() throws JsonProcessingException { // Test cases that can't be constructed with our Builder class but that will parse correctly public void testCanDeserializeWithoutDefaultValues() throws JsonProcessingException { // Name and schema are only two required fields - String jsonOnlyRequiredFieldsMissingDefaults = String.format( - "{\"name\":\"%s\",\"schema\":%s}", SAMPLE_NAME, SchemaParser.toJson(SAMPLE_SCHEMA)); + String jsonOnlyRequiredFieldsMissingDefaults = + String.format( + "{\"name\":\"%s\",\"schema\":%s}", SAMPLE_NAME, SchemaParser.toJson(SAMPLE_SCHEMA)); - CreateTableRequest reqOnlyRequiredFieldsMissingDefaults = CreateTableRequest.builder() - .withName(SAMPLE_NAME) - .withSchema(SAMPLE_SCHEMA) - .build(); + CreateTableRequest reqOnlyRequiredFieldsMissingDefaults = + CreateTableRequest.builder().withName(SAMPLE_NAME).withSchema(SAMPLE_SCHEMA).build(); - assertEquals(deserialize(jsonOnlyRequiredFieldsMissingDefaults), reqOnlyRequiredFieldsMissingDefaults); + assertEquals( + deserialize(jsonOnlyRequiredFieldsMissingDefaults), reqOnlyRequiredFieldsMissingDefaults); } @Test public void testDeserializeInvalidRequest() { String jsonMissingSchema = - "{\"name\":\"foo\",\"location\":null,\"partition-spec\":null,\"write-order\":null,\"properties\":{}," + - "\"stage-create\":false}"; + "{\"name\":\"foo\",\"location\":null,\"partition-spec\":null,\"write-order\":null,\"properties\":{}," + + "\"stage-create\":false}"; AssertHelpers.assertThrows( "A JSON request with the keys spelled incorrectly should fail to deserialize and validate", IllegalArgumentException.class, "Invalid schema: null", - () -> deserialize(jsonMissingSchema) - ); + () -> deserialize(jsonMissingSchema)); - String jsonMissingName = String.format( - "{\"location\":null,\"schema\":%s,\"spec\":null,\"write-order\":null,\"properties\":{}," + - "\"stage-create\":false}", SAMPLE_SCHEMA_JSON); + String jsonMissingName = + String.format( + 
"{\"location\":null,\"schema\":%s,\"spec\":null,\"write-order\":null,\"properties\":{}," + + "\"stage-create\":false}", + SAMPLE_SCHEMA_JSON); AssertHelpers.assertThrows( "A JSON request with the keys spelled incorrectly should fail to deserialize and validate", IllegalArgumentException.class, "Invalid table name: null", - () -> deserialize(jsonMissingName) - ); + () -> deserialize(jsonMissingName)); - String jsonIncorrectTypeForProperties = String.format( - "{\"name\":\"foo\",\"location\":null,\"schema\":%s,\"partition-spec\":null,\"write-order\":null," + - "\"properties\":[],\"stage-create\":false}", SAMPLE_SCHEMA_JSON); + String jsonIncorrectTypeForProperties = + String.format( + "{\"name\":\"foo\",\"location\":null,\"schema\":%s,\"partition-spec\":null,\"write-order\":null," + + "\"properties\":[],\"stage-create\":false}", + SAMPLE_SCHEMA_JSON); AssertHelpers.assertThrows( "A JSON request with incorrect types for fields should fail to parse and validate", JsonProcessingException.class, - () -> deserialize(jsonIncorrectTypeForProperties) - ); + () -> deserialize(jsonIncorrectTypeForProperties)); AssertHelpers.assertThrows( "An empty JSON object should not parse into a CreateNamespaceRequest instance that passes validation", IllegalArgumentException.class, "Invalid table name: null", - () -> deserialize("{}") - ); + () -> deserialize("{}")); AssertHelpers.assertThrows( "An empty JSON request should fail to deserialize", IllegalArgumentException.class, - () -> deserialize(null) - ); + () -> deserialize(null)); } @Test @@ -203,22 +213,19 @@ public void testBuilderDoesNotBuildInvalidRequests() { "The builder should not allow using null for the namespace", NullPointerException.class, "Invalid name: null", - () -> CreateTableRequest.builder().withName(null) - ); + () -> CreateTableRequest.builder().withName(null)); AssertHelpers.assertThrows( "The builder should not allow using null for the schema", NullPointerException.class, "Invalid schema: null", - () -> CreateTableRequest.builder().withSchema(null) - ); + () -> CreateTableRequest.builder().withSchema(null)); AssertHelpers.assertThrows( "The builder should not allow passing a null collection of properties", NullPointerException.class, "Invalid collection of properties: null", - () -> CreateTableRequest.builder().setProperties(null) - ); + () -> CreateTableRequest.builder().setProperties(null)); Map mapWithNullKey = Maps.newHashMap(); mapWithNullKey.put(null, "hello"); @@ -226,8 +233,7 @@ public void testBuilderDoesNotBuildInvalidRequests() { "The builder should not allow using null as a key in the properties to set", IllegalArgumentException.class, "Invalid property: null", - () -> CreateTableRequest.builder().setProperties(mapWithNullKey) - ); + () -> CreateTableRequest.builder().setProperties(mapWithNullKey)); Map mapWithNullValue = Maps.newHashMap(); mapWithNullValue.put("a", null); @@ -236,27 +242,26 @@ public void testBuilderDoesNotBuildInvalidRequests() { "The builder should not allow using null as a value in the properties to set", IllegalArgumentException.class, "Invalid value for properties [a]: null", - () -> CreateTableRequest.builder().setProperties(mapWithNullValue).build() - ); + () -> CreateTableRequest.builder().setProperties(mapWithNullValue).build()); AssertHelpers.assertThrows( "The builder should not allow using null as a value when setting a single property", IllegalArgumentException.class, "Invalid value for property foo: null", - () -> CreateTableRequest.builder().setProperty("foo", null) - ); + () -> 
CreateTableRequest.builder().setProperty("foo", null)); AssertHelpers.assertThrows( "The builder should not allow using null as a key when setting a single property", IllegalArgumentException.class, "Invalid property: null", - () -> CreateTableRequest.builder().setProperty(null, "foo") - ); + () -> CreateTableRequest.builder().setProperty(null, "foo")); } @Override public String[] allFieldsFromSpec() { - return new String[] {"name", "location", "schema", "partition-spec", "write-order", "stage-create", "properties"}; + return new String[] { + "name", "location", "schema", "partition-spec", "write-order", "stage-create", "properties" + }; } @Override @@ -275,13 +280,19 @@ public CreateTableRequest createExampleInstance() { @Override public void assertEquals(CreateTableRequest actual, CreateTableRequest expected) { Assert.assertEquals("Name should be the same", expected.name(), actual.name()); - Assert.assertEquals("Location should be the same if provided", expected.location(), actual.location()); - Assert.assertTrue("Schemas should be equivalent and have same schema id", - expected.schema().sameSchema(actual.schema()) && expected.schema().schemaId() == actual.schema().schemaId()); + Assert.assertEquals( + "Location should be the same if provided", expected.location(), actual.location()); + Assert.assertTrue( + "Schemas should be equivalent and have same schema id", + expected.schema().sameSchema(actual.schema()) + && expected.schema().schemaId() == actual.schema().schemaId()); Assert.assertEquals("Partition spec should be equal", expected.spec(), actual.spec()); - Assert.assertEquals("Write [sort] order should be the same", expected.writeOrder(), actual.writeOrder()); - Assert.assertEquals("Properties should be the same", expected.properties(), actual.properties()); - Assert.assertEquals("Stage create should be equal", expected.stageCreate(), actual.stageCreate()); + Assert.assertEquals( + "Write [sort] order should be the same", expected.writeOrder(), actual.writeOrder()); + Assert.assertEquals( + "Properties should be the same", expected.properties(), actual.properties()); + Assert.assertEquals( + "Stage create should be equal", expected.stageCreate(), actual.stageCreate()); } @Override diff --git a/core/src/test/java/org/apache/iceberg/rest/requests/TestRenameTableRequest.java b/core/src/test/java/org/apache/iceberg/rest/requests/TestRenameTableRequest.java index 7bdbc59bc08e..13c0a77edc44 100644 --- a/core/src/test/java/org/apache/iceberg/rest/requests/TestRenameTableRequest.java +++ b/core/src/test/java/org/apache/iceberg/rest/requests/TestRenameTableRequest.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.rest.requests; import com.fasterxml.jackson.core.JsonProcessingException; @@ -33,15 +32,17 @@ public class TestRenameTableRequest extends RequestResponseTestBase deserialize(jsonSourceNullName) - ); + () -> deserialize(jsonSourceNullName)); String jsonDestinationNullName = - "{\"source\":{\"namespace\":[\"accounting\",\"tax\"],\"name\":\"paid\"}," + - "\"destination\":{\"namespace\":[\"accounting\",\"tax\"],\"name\":null}}"; + "{\"source\":{\"namespace\":[\"accounting\",\"tax\"],\"name\":\"paid\"}," + + "\"destination\":{\"namespace\":[\"accounting\",\"tax\"],\"name\":null}}"; AssertHelpers.assertThrows( "A JSON request with an invalid destination table, with null for the name, should fail to deserialize", JsonProcessingException.class, "Cannot parse name to a string value: null", - () -> deserialize(jsonDestinationNullName) - ); + () -> deserialize(jsonDestinationNullName)); String jsonSourceMissingName = - "{\"source\":{\"namespace\":[\"accounting\",\"tax\"]}," + - "\"destination\":{\"name\":\"paid_2022\"}}"; + "{\"source\":{\"namespace\":[\"accounting\",\"tax\"]}," + + "\"destination\":{\"name\":\"paid_2022\"}}"; AssertHelpers.assertThrows( "A JSON request with an invalid source table identifier, with no name, should fail to deserialize", JsonProcessingException.class, "Cannot parse missing string name", - () -> deserialize(jsonSourceMissingName) - ); + () -> deserialize(jsonSourceMissingName)); String jsonDestinationMissingName = - "{\"source\":{\"namespace\":[\"accounting\",\"tax\"],\"name\":\"paid\"}," + - "\"destination\":{\"namespace\":[\"accounting\",\"tax\"]}}"; + "{\"source\":{\"namespace\":[\"accounting\",\"tax\"],\"name\":\"paid\"}," + + "\"destination\":{\"namespace\":[\"accounting\",\"tax\"]}}"; AssertHelpers.assertThrows( "A JSON request with an invalid destination table identifier, with no name, should fail to deserialize", JsonProcessingException.class, "Cannot parse missing string name", - () -> deserialize(jsonDestinationMissingName) - ); + () -> deserialize(jsonDestinationMissingName)); String emptyJson = "{}"; AssertHelpers.assertThrows( "An empty JSON object should not parse into a valid RenameTableRequest instance", IllegalArgumentException.class, "Invalid source table: null", - () -> deserialize(emptyJson) - ); + () -> deserialize(emptyJson)); AssertHelpers.assertThrows( "An empty JSON request should fail to deserialize", IllegalArgumentException.class, - () -> deserialize(null) - ); + () -> deserialize(null)); } @Test @@ -109,15 +104,17 @@ public void testBuilderDoesNotBuildInvalidRequests() { "The builder should not allow using null for the source table", NullPointerException.class, "Invalid source table identifier: null", - () -> RenameTableRequest.builder().withSource(null).withDestination(TAX_PAID_RENAMED).build() - ); + () -> + RenameTableRequest.builder() + .withSource(null) + .withDestination(TAX_PAID_RENAMED) + .build()); AssertHelpers.assertThrows( "The builder should not allow using null for the destination table", NullPointerException.class, "Invalid destination table identifier: null", - () -> RenameTableRequest.builder().withSource(TAX_PAID).withDestination(null).build() - ); + () -> RenameTableRequest.builder().withSource(TAX_PAID).withDestination(null).build()); } @Override @@ -135,8 +132,12 @@ public RenameTableRequest createExampleInstance() { @Override public void assertEquals(RenameTableRequest actual, RenameTableRequest expected) { - Assert.assertEquals("Source table identifier should be equal", 
expected.source(), actual.source()); - Assert.assertEquals("Destination table identifier should be equal", expected.destination(), actual.destination()); + Assert.assertEquals( + "Source table identifier should be equal", expected.source(), actual.source()); + Assert.assertEquals( + "Destination table identifier should be equal", + expected.destination(), + actual.destination()); } @Override diff --git a/core/src/test/java/org/apache/iceberg/rest/requests/TestUpdateNamespacePropertiesRequest.java b/core/src/test/java/org/apache/iceberg/rest/requests/TestUpdateNamespacePropertiesRequest.java index 4aad6e469e1a..2d6b65807c6c 100644 --- a/core/src/test/java/org/apache/iceberg/rest/requests/TestUpdateNamespacePropertiesRequest.java +++ b/core/src/test/java/org/apache/iceberg/rest/requests/TestUpdateNamespacePropertiesRequest.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.rest.requests; import com.fasterxml.jackson.core.JsonProcessingException; @@ -32,7 +31,8 @@ import org.junit.Assert; import org.junit.Test; -public class TestUpdateNamespacePropertiesRequest extends RequestResponseTestBase { +public class TestUpdateNamespacePropertiesRequest + extends RequestResponseTestBase { /* Values used to fill in request fields */ private static final Map UPDATES = ImmutableMap.of("owner", "Hank"); @@ -45,12 +45,17 @@ public void testRoundTripSerDe() throws JsonProcessingException { // Full request String fullJson = "{\"removals\":[\"foo\",\"bar\"],\"updates\":{\"owner\":\"Hank\"}}"; assertRoundTripSerializesEquallyFrom( - fullJson, UpdateNamespacePropertiesRequest.builder().updateAll(UPDATES).removeAll(REMOVALS).build()); + fullJson, + UpdateNamespacePropertiesRequest.builder().updateAll(UPDATES).removeAll(REMOVALS).build()); // Only updates String emptyRemoval = "{\"removals\":[],\"updates\":{\"owner\":\"Hank\"}}"; assertRoundTripSerializesEquallyFrom( - emptyRemoval, UpdateNamespacePropertiesRequest.builder().updateAll(UPDATES).removeAll(EMPTY_REMOVALS).build()); + emptyRemoval, + UpdateNamespacePropertiesRequest.builder() + .updateAll(UPDATES) + .removeAll(EMPTY_REMOVALS) + .build()); assertRoundTripSerializesEquallyFrom( emptyRemoval, UpdateNamespacePropertiesRequest.builder().update("owner", "Hank").build()); @@ -58,10 +63,15 @@ public void testRoundTripSerDe() throws JsonProcessingException { // Only removals String emptyUpdates = "{\"removals\":[\"foo\",\"bar\"],\"updates\":{}}"; assertRoundTripSerializesEquallyFrom( - emptyUpdates, UpdateNamespacePropertiesRequest.builder().removeAll(REMOVALS).updateAll(EMPTY_UPDATES).build()); + emptyUpdates, + UpdateNamespacePropertiesRequest.builder() + .removeAll(REMOVALS) + .updateAll(EMPTY_UPDATES) + .build()); assertRoundTripSerializesEquallyFrom( - emptyUpdates, UpdateNamespacePropertiesRequest.builder().remove("foo").remove("bar").build()); + emptyUpdates, + UpdateNamespacePropertiesRequest.builder().remove("foo").remove("bar").build()); // All empty String jsonAllFieldsEmpty = "{\"removals\":[],\"updates\":{}}"; @@ -73,7 +83,8 @@ public void testRoundTripSerDe() throws JsonProcessingException { // Test cases that can't be constructed with our Builder class e2e but that will parse correctly public void testCanDeserializeWithoutDefaultValues() throws JsonProcessingException { // `removals` is null - UpdateNamespacePropertiesRequest noRemovals = UpdateNamespacePropertiesRequest.builder().updateAll(UPDATES).build(); + UpdateNamespacePropertiesRequest noRemovals = + 
UpdateNamespacePropertiesRequest.builder().updateAll(UPDATES).build(); String jsonWithNullRemovals = "{\"removals\":null,\"updates\":{\"owner\":\"Hank\"}}"; UpdateNamespacePropertiesRequest parsed = deserialize(jsonWithNullRemovals); assertEquals(parsed, noRemovals); @@ -83,7 +94,8 @@ public void testCanDeserializeWithoutDefaultValues() throws JsonProcessingExcept assertEquals(deserialize(jsonWithMissingRemovals), noRemovals); // `updates` is null - UpdateNamespacePropertiesRequest noUpdates = UpdateNamespacePropertiesRequest.builder().removeAll(REMOVALS).build(); + UpdateNamespacePropertiesRequest noUpdates = + UpdateNamespacePropertiesRequest.builder().removeAll(REMOVALS).build(); String jsonWithNullUpdates = "{\"removals\":[\"foo\",\"bar\"],\"updates\":null}"; assertEquals(deserialize(jsonWithNullUpdates), noUpdates); @@ -92,7 +104,8 @@ public void testCanDeserializeWithoutDefaultValues() throws JsonProcessingExcept assertEquals(deserialize(jsonWithMissingUpdates), noUpdates); // all null / no values set - UpdateNamespacePropertiesRequest allMissing = UpdateNamespacePropertiesRequest.builder().build(); + UpdateNamespacePropertiesRequest allMissing = + UpdateNamespacePropertiesRequest.builder().build(); String jsonAllNull = "{\"removals\":null,\"updates\":null}"; assertEquals(deserialize(jsonAllNull), allMissing); @@ -108,16 +121,14 @@ public void testParseInvalidJson() { AssertHelpers.assertThrows( "A JSON request with an invalid type for one of the fields should fail to parse", JsonProcessingException.class, - () -> deserialize(jsonInvalidTypeOnRemovalField) - ); + () -> deserialize(jsonInvalidTypeOnRemovalField)); String jsonInvalidTypeOnUpdatesField = "{\"removals\":[\"foo\":\"bar\"],\"updates\":[\"owner\"]}"; AssertHelpers.assertThrows( "A JSON value with an invalid type for one of the fields should fail to parse", JsonProcessingException.class, - () -> deserialize(jsonInvalidTypeOnUpdatesField) - ); + () -> deserialize(jsonInvalidTypeOnUpdatesField)); // Valid top-level (array) types, but at least one entry in the list is not the expected type // NOTE: non-string values that are integral types will still parse into a string list. 
@@ -127,16 +138,14 @@ public void testParseInvalidJson() { AssertHelpers.assertThrows( "A JSON value with an invalid type inside one of the list fields should fail to parse", JsonProcessingException.class, - () -> deserialize(invalidJsonWrongTypeInRemovalsList) - ); + () -> deserialize(invalidJsonWrongTypeInRemovalsList)); String nullJson = null; AssertHelpers.assertThrows( "A null JSON should fail to parse", IllegalArgumentException.class, "argument \"content\" is null", - () -> deserialize(nullJson) - ); + () -> deserialize(nullJson)); } @Test @@ -145,44 +154,38 @@ public void testBuilderDoesNotCreateInvalidObjects() { AssertHelpers.assertThrows( "The builder should not allow using null as a key to remove", NullPointerException.class, "Invalid property to remove: null", - () -> UpdateNamespacePropertiesRequest.builder().remove(null).build() - ); + () -> UpdateNamespacePropertiesRequest.builder().remove(null).build()); AssertHelpers.assertThrows( "The builder should not allow passing a null list of properties to remove", NullPointerException.class, "Invalid list of properties to remove: null", - () -> UpdateNamespacePropertiesRequest.builder().removeAll(null).build() - ); + () -> UpdateNamespacePropertiesRequest.builder().removeAll(null).build()); List<String> listWithNull = Lists.newArrayList("a", null, null); AssertHelpers.assertThrows( "The builder should not allow passing a list of properties to remove with a null element", IllegalArgumentException.class, "Invalid property to remove: null", - () -> UpdateNamespacePropertiesRequest.builder().removeAll(listWithNull).build() - ); + () -> UpdateNamespacePropertiesRequest.builder().removeAll(listWithNull).build()); AssertHelpers.assertThrows( "The builder should not allow using null as a key to update", NullPointerException.class, "Invalid property to update: null", - () -> UpdateNamespacePropertiesRequest.builder().update(null, "100").build() - ); + () -> UpdateNamespacePropertiesRequest.builder().update(null, "100").build()); AssertHelpers.assertThrows( "The builder should not allow using null as a value to update", NullPointerException.class, "Invalid value to update for key [owner]: null. Use remove instead", - () -> UpdateNamespacePropertiesRequest.builder().update("owner", null).build() - ); + () -> UpdateNamespacePropertiesRequest.builder().update("owner", null).build()); AssertHelpers.assertThrows( "The builder should not allow passing a null collection of properties to update", NullPointerException.class, "Invalid collection of properties to update: null", - () -> UpdateNamespacePropertiesRequest.builder().updateAll(null).build() - ); + () -> UpdateNamespacePropertiesRequest.builder().updateAll(null).build()); Map<String, String> mapWithNullKey = Maps.newHashMap(); mapWithNullKey.put(null, "hello"); @@ -190,8 +193,7 @@ public void testBuilderDoesNotCreateInvalidObjects() { "The builder should not allow using null as a key to update from a collection of updates", IllegalArgumentException.class, "Invalid property to update: null", - () -> UpdateNamespacePropertiesRequest.builder().updateAll(mapWithNullKey).build() - ); + () -> UpdateNamespacePropertiesRequest.builder().updateAll(mapWithNullKey).build()); Map<String, String> mapWithMultipleNullValues = Maps.newHashMap(); mapWithMultipleNullValues.put("a", null); @@ -200,13 +202,15 @@ public void testBuilderDoesNotCreateInvalidObjects() { "The builder should not allow using null as a value to update from a collection of updates", IllegalArgumentException.class, "Invalid value to update for properties [a]: null.
Use remove instead", - () -> UpdateNamespacePropertiesRequest.builder().updateAll(mapWithMultipleNullValues).build() - ); + () -> + UpdateNamespacePropertiesRequest.builder() + .updateAll(mapWithMultipleNullValues) + .build()); } @Override public String[] allFieldsFromSpec() { - return new String[] { "updates", "removals" }; + return new String[] {"updates", "removals"}; } @Override @@ -218,15 +222,20 @@ public UpdateNamespacePropertiesRequest createExampleInstance() { } @Override - public void assertEquals(UpdateNamespacePropertiesRequest actual, UpdateNamespacePropertiesRequest expected) { - Assert.assertEquals("Properties to update should be equal", actual.updates(), expected.updates()); - Assert.assertEquals("Properties to remove should be equal", - Sets.newHashSet(actual.removals()), Sets.newHashSet(expected.removals())); + public void assertEquals( + UpdateNamespacePropertiesRequest actual, UpdateNamespacePropertiesRequest expected) { + Assert.assertEquals( + "Properties to update should be equal", actual.updates(), expected.updates()); + Assert.assertEquals( + "Properties to remove should be equal", + Sets.newHashSet(actual.removals()), + Sets.newHashSet(expected.removals())); } @Override public UpdateNamespacePropertiesRequest deserialize(String json) throws JsonProcessingException { - UpdateNamespacePropertiesRequest request = mapper().readValue(json, UpdateNamespacePropertiesRequest.class); + UpdateNamespacePropertiesRequest request = + mapper().readValue(json, UpdateNamespacePropertiesRequest.class); request.validate(); return request; } diff --git a/core/src/test/java/org/apache/iceberg/rest/requests/TestUpdateRequirementParser.java b/core/src/test/java/org/apache/iceberg/rest/requests/TestUpdateRequirementParser.java index 65f1226b888b..d47294c61067 100644 --- a/core/src/test/java/org/apache/iceberg/rest/requests/TestUpdateRequirementParser.java +++ b/core/src/test/java/org/apache/iceberg/rest/requests/TestUpdateRequirementParser.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.rest.requests; import java.util.List; @@ -31,10 +30,10 @@ public class TestUpdateRequirementParser { @Test public void testUpdateRequirementWithoutRequirementTypeCannotParse() { - List invalidJson = ImmutableList.of( - "{\"type\":null,\"uuid\":\"2cc52516-5e73-41f2-b139-545d41a4e151\"}", - "{\"uuid\":\"2cc52516-5e73-41f2-b139-545d41a4e151\"}" - ); + List invalidJson = + ImmutableList.of( + "{\"type\":null,\"uuid\":\"2cc52516-5e73-41f2-b139-545d41a4e151\"}", + "{\"uuid\":\"2cc52516-5e73-41f2-b139-545d41a4e151\"}"); for (String json : invalidJson) { AssertHelpers.assertThrows( @@ -59,8 +58,10 @@ public void testAssertUUIDToJson() { String uuid = "2cc52516-5e73-41f2-b139-545d41a4e151"; String expected = String.format("{\"type\":\"assert-table-uuid\",\"uuid\":\"%s\"}", uuid); UpdateRequirement actual = new UpdateRequirement.AssertTableUUID(uuid); - Assert.assertEquals("AssertTableUUID should convert to the correct JSON value", - expected, UpdateRequirementParser.toJson(actual)); + Assert.assertEquals( + "AssertTableUUID should convert to the correct JSON value", + expected, + UpdateRequirementParser.toJson(actual)); } @Test @@ -73,10 +74,12 @@ public void testAssertTableDoesNotExistFromJson() { @Test public void testAssertTableDoesNotExistToJson() { - String expected = "{\"type\":\"assert-create\"}"; + String expected = "{\"type\":\"assert-create\"}"; UpdateRequirement actual = new UpdateRequirement.AssertTableDoesNotExist(); - Assert.assertEquals("AssertTableDoesNotExist should convert to the correct JSON value", - expected, UpdateRequirementParser.toJson(actual)); + Assert.assertEquals( + "AssertTableDoesNotExist should convert to the correct JSON value", + expected, + UpdateRequirementParser.toJson(actual)); } @Test @@ -84,8 +87,10 @@ public void testAssertRefSnapshotIdToJson() { String requirementType = UpdateRequirementParser.ASSERT_REF_SNAPSHOT_ID; String refName = "snapshot-name"; Long snapshotId = 1L; - String json = String.format("{\"type\":\"%s\",\"ref\":\"%s\",\"snapshot-id\":%d}", - requirementType, refName, snapshotId); + String json = + String.format( + "{\"type\":\"%s\",\"ref\":\"%s\",\"snapshot-id\":%d}", + requirementType, refName, snapshotId); UpdateRequirement expected = new UpdateRequirement.AssertRefSnapshotID(refName, snapshotId); assertEquals(requirementType, expected, UpdateRequirementParser.fromJson(json)); } @@ -95,8 +100,10 @@ public void testAssertRefSnapshotIdToJsonWithNullSnapshotId() { String requirementType = UpdateRequirementParser.ASSERT_REF_SNAPSHOT_ID; String refName = "snapshot-name"; Long snapshotId = null; - String json = String.format("{\"type\":\"%s\",\"ref\":\"%s\",\"snapshot-id\":%d}", - requirementType, refName, snapshotId); + String json = + String.format( + "{\"type\":\"%s\",\"ref\":\"%s\",\"snapshot-id\":%d}", + requirementType, refName, snapshotId); UpdateRequirement expected = new UpdateRequirement.AssertRefSnapshotID(refName, snapshotId); assertEquals(requirementType, expected, UpdateRequirementParser.fromJson(json)); } @@ -106,11 +113,15 @@ public void testAssertRefSnapshotIdFromJson() { String requirementType = UpdateRequirementParser.ASSERT_REF_SNAPSHOT_ID; String refName = "snapshot-name"; Long snapshotId = 1L; - String expected = String.format("{\"type\":\"%s\",\"ref\":\"%s\",\"snapshot-id\":%d}", - requirementType, refName, snapshotId); + String expected = + String.format( + "{\"type\":\"%s\",\"ref\":\"%s\",\"snapshot-id\":%d}", + requirementType, refName, snapshotId); UpdateRequirement actual = new 
UpdateRequirement.AssertRefSnapshotID(refName, snapshotId); - Assert.assertEquals("AssertRefSnapshotId should convert to the correct JSON value", - expected, UpdateRequirementParser.toJson(actual)); + Assert.assertEquals( + "AssertRefSnapshotId should convert to the correct JSON value", + expected, + UpdateRequirementParser.toJson(actual)); } @Test @@ -118,20 +129,27 @@ public void testAssertRefSnapshotIdFromJsonWithNullSnapshotId() { String requirementType = UpdateRequirementParser.ASSERT_REF_SNAPSHOT_ID; String refName = "snapshot-name"; Long snapshotId = null; - String expected = String.format("{\"type\":\"%s\",\"ref\":\"%s\",\"snapshot-id\":%d}", - requirementType, refName, snapshotId); + String expected = + String.format( + "{\"type\":\"%s\",\"ref\":\"%s\",\"snapshot-id\":%d}", + requirementType, refName, snapshotId); UpdateRequirement actual = new UpdateRequirement.AssertRefSnapshotID(refName, snapshotId); - Assert.assertEquals("AssertRefSnapshotId should convert to the correct JSON value", - expected, UpdateRequirementParser.toJson(actual)); + Assert.assertEquals( + "AssertRefSnapshotId should convert to the correct JSON value", + expected, + UpdateRequirementParser.toJson(actual)); } @Test public void testAssertLastAssignedFieldIdFromJson() { String requirementType = UpdateRequirementParser.ASSERT_LAST_ASSIGNED_FIELD_ID; int lastAssignedFieldId = 12; - String json = String.format("{\"type\":\"%s\",\"last-assigned-field-id\":%d}", - requirementType, lastAssignedFieldId); - UpdateRequirement expected = new UpdateRequirement.AssertLastAssignedFieldId(lastAssignedFieldId); + String json = + String.format( + "{\"type\":\"%s\",\"last-assigned-field-id\":%d}", + requirementType, lastAssignedFieldId); + UpdateRequirement expected = + new UpdateRequirement.AssertLastAssignedFieldId(lastAssignedFieldId); assertEquals(requirementType, expected, UpdateRequirementParser.fromJson(json)); } @@ -139,18 +157,23 @@ public void testAssertLastAssignedFieldIdFromJson() { public void testAssertLastAssignedFieldIdToJson() { String requirementType = UpdateRequirementParser.ASSERT_LAST_ASSIGNED_FIELD_ID; int lastAssignedFieldId = 12; - String expected = String.format("{\"type\":\"%s\",\"last-assigned-field-id\":%d}", - requirementType, lastAssignedFieldId); + String expected = + String.format( + "{\"type\":\"%s\",\"last-assigned-field-id\":%d}", + requirementType, lastAssignedFieldId); UpdateRequirement actual = new UpdateRequirement.AssertLastAssignedFieldId(lastAssignedFieldId); - Assert.assertEquals("AssertLastAssignedFieldId should convert to the correct JSON value", - expected, UpdateRequirementParser.toJson(actual)); + Assert.assertEquals( + "AssertLastAssignedFieldId should convert to the correct JSON value", + expected, + UpdateRequirementParser.toJson(actual)); } @Test public void testAssertCurrentSchemaIdFromJson() { String requirementType = UpdateRequirementParser.ASSERT_CURRENT_SCHEMA_ID; int schemaId = 4; - String json = String.format("{\"type\":\"%s\",\"current-schema-id\":%d}", requirementType, schemaId); + String json = + String.format("{\"type\":\"%s\",\"current-schema-id\":%d}", requirementType, schemaId); UpdateRequirement expected = new UpdateRequirement.AssertCurrentSchemaID(schemaId); assertEquals(requirementType, expected, UpdateRequirementParser.fromJson(json)); } @@ -159,19 +182,25 @@ public void testAssertCurrentSchemaIdFromJson() { public void testAssertCurrentSchemaIdToJson() { String requirementType = UpdateRequirementParser.ASSERT_CURRENT_SCHEMA_ID; int schemaId = 4; - String 
expected = String.format("{\"type\":\"%s\",\"current-schema-id\":%d}", requirementType, schemaId); + String expected = + String.format("{\"type\":\"%s\",\"current-schema-id\":%d}", requirementType, schemaId); UpdateRequirement actual = new UpdateRequirement.AssertCurrentSchemaID(schemaId); - Assert.assertEquals("AssertCurrentSchemaId should convert to the correct JSON value", - expected, UpdateRequirementParser.toJson(actual)); + Assert.assertEquals( + "AssertCurrentSchemaId should convert to the correct JSON value", + expected, + UpdateRequirementParser.toJson(actual)); } @Test public void testAssertLastAssignedPartitionIdFromJson() { String requirementType = UpdateRequirementParser.ASSERT_LAST_ASSIGNED_PARTITION_ID; int lastAssignedPartitionId = 1004; - String json = String.format("{\"type\":\"%s\",\"last-assigned-partition-id\":%d}", - requirementType, lastAssignedPartitionId); - UpdateRequirement expected = new UpdateRequirement.AssertLastAssignedPartitionId(lastAssignedPartitionId); + String json = + String.format( + "{\"type\":\"%s\",\"last-assigned-partition-id\":%d}", + requirementType, lastAssignedPartitionId); + UpdateRequirement expected = + new UpdateRequirement.AssertLastAssignedPartitionId(lastAssignedPartitionId); assertEquals(requirementType, expected, UpdateRequirementParser.fromJson(json)); } @@ -179,18 +208,24 @@ public void testAssertLastAssignedPartitionIdFromJson() { public void testAssertLastAssignedPartitionIdToJson() { String requirementType = UpdateRequirementParser.ASSERT_LAST_ASSIGNED_PARTITION_ID; int lastAssignedPartitionId = 1004; - String expected = String.format("{\"type\":\"%s\",\"last-assigned-partition-id\":%d}", - requirementType, lastAssignedPartitionId); - UpdateRequirement actual = new UpdateRequirement.AssertLastAssignedPartitionId(lastAssignedPartitionId); - Assert.assertEquals("AssertLastAssignedPartitionId should convert to the correct JSON value", - expected, UpdateRequirementParser.toJson(actual)); + String expected = + String.format( + "{\"type\":\"%s\",\"last-assigned-partition-id\":%d}", + requirementType, lastAssignedPartitionId); + UpdateRequirement actual = + new UpdateRequirement.AssertLastAssignedPartitionId(lastAssignedPartitionId); + Assert.assertEquals( + "AssertLastAssignedPartitionId should convert to the correct JSON value", + expected, + UpdateRequirementParser.toJson(actual)); } @Test public void testAssertDefaultSpecIdFromJson() { String requirementType = UpdateRequirementParser.ASSERT_DEFAULT_SPEC_ID; int specId = 5; - String json = String.format("{\"type\":\"%s\",\"default-spec-id\":%d}", requirementType, specId); + String json = + String.format("{\"type\":\"%s\",\"default-spec-id\":%d}", requirementType, specId); UpdateRequirement expected = new UpdateRequirement.AssertDefaultSpecID(specId); assertEquals(requirementType, expected, UpdateRequirementParser.fromJson(json)); } @@ -199,17 +234,22 @@ public void testAssertDefaultSpecIdFromJson() { public void testAssertDefaultSpecIdToJson() { String requirementType = UpdateRequirementParser.ASSERT_DEFAULT_SPEC_ID; int specId = 5; - String expected = String.format("{\"type\":\"%s\",\"default-spec-id\":%d}", requirementType, specId); + String expected = + String.format("{\"type\":\"%s\",\"default-spec-id\":%d}", requirementType, specId); UpdateRequirement actual = new UpdateRequirement.AssertDefaultSpecID(specId); - Assert.assertEquals("AssertDefaultSpecId should convert to the correct JSON value", - expected, UpdateRequirementParser.toJson(actual)); + Assert.assertEquals( + 
"AssertDefaultSpecId should convert to the correct JSON value", + expected, + UpdateRequirementParser.toJson(actual)); } @Test public void testAssertDefaultSortOrderIdFromJson() { String requirementType = UpdateRequirementParser.ASSERT_DEFAULT_SORT_ORDER_ID; int sortOrderId = 10; - String json = String.format("{\"type\":\"%s\",\"default-sort-order-id\":%d}", requirementType, sortOrderId); + String json = + String.format( + "{\"type\":\"%s\",\"default-sort-order-id\":%d}", requirementType, sortOrderId); UpdateRequirement expected = new UpdateRequirement.AssertDefaultSortOrderID(sortOrderId); assertEquals(requirementType, expected, UpdateRequirementParser.fromJson(json)); } @@ -218,37 +258,48 @@ public void testAssertDefaultSortOrderIdFromJson() { public void testAssertDefaultSortOrderIdToJson() { String requirementType = UpdateRequirementParser.ASSERT_DEFAULT_SORT_ORDER_ID; int sortOrderId = 10; - String expected = String.format("{\"type\":\"%s\",\"default-sort-order-id\":%d}", requirementType, sortOrderId); + String expected = + String.format( + "{\"type\":\"%s\",\"default-sort-order-id\":%d}", requirementType, sortOrderId); UpdateRequirement actual = new UpdateRequirement.AssertDefaultSortOrderID(sortOrderId); - Assert.assertEquals("AssertDefaultSortOrderId should convert to the correct JSON value", - expected, UpdateRequirementParser.toJson(actual)); + Assert.assertEquals( + "AssertDefaultSortOrderId should convert to the correct JSON value", + expected, + UpdateRequirementParser.toJson(actual)); } - public void assertEquals(String requirementType, UpdateRequirement expected, UpdateRequirement actual) { + public void assertEquals( + String requirementType, UpdateRequirement expected, UpdateRequirement actual) { switch (requirementType) { case UpdateRequirementParser.ASSERT_TABLE_UUID: - compareAssertTableUUID((UpdateRequirement.AssertTableUUID) expected, + compareAssertTableUUID( + (UpdateRequirement.AssertTableUUID) expected, (UpdateRequirement.AssertTableUUID) actual); break; case UpdateRequirementParser.ASSERT_TABLE_DOES_NOT_EXIST: - // Don't cast here as the function explicitly tests that the types are correct, given that the generated JSON + // Don't cast here as the function explicitly tests that the types are correct, given that + // the generated JSON // for ASSERT_TABLE_DOES_NOT_EXIST does not have any fields other than the requirement type. 
compareAssertTableDoesNotExist(expected, actual); break; case UpdateRequirementParser.ASSERT_REF_SNAPSHOT_ID: - compareAssertRefSnapshotId((UpdateRequirement.AssertRefSnapshotID) expected, + compareAssertRefSnapshotId( + (UpdateRequirement.AssertRefSnapshotID) expected, (UpdateRequirement.AssertRefSnapshotID) actual); break; case UpdateRequirementParser.ASSERT_LAST_ASSIGNED_FIELD_ID: - compareAssertLastAssignedFieldId((UpdateRequirement.AssertLastAssignedFieldId) expected, + compareAssertLastAssignedFieldId( + (UpdateRequirement.AssertLastAssignedFieldId) expected, (UpdateRequirement.AssertLastAssignedFieldId) actual); break; case UpdateRequirementParser.ASSERT_CURRENT_SCHEMA_ID: - compareAssertCurrentSchemaId((UpdateRequirement.AssertCurrentSchemaID) expected, + compareAssertCurrentSchemaId( + (UpdateRequirement.AssertCurrentSchemaID) expected, (UpdateRequirement.AssertCurrentSchemaID) actual); break; case UpdateRequirementParser.ASSERT_LAST_ASSIGNED_PARTITION_ID: - compareAssertLastAssignedPartitionId((UpdateRequirement.AssertLastAssignedPartitionId) expected, + compareAssertLastAssignedPartitionId( + (UpdateRequirement.AssertLastAssignedPartitionId) expected, (UpdateRequirement.AssertLastAssignedPartitionId) actual); break; case UpdateRequirementParser.ASSERT_DEFAULT_SPEC_ID: @@ -267,16 +318,19 @@ public void assertEquals(String requirementType, UpdateRequirement expected, Upd } private static void compareAssertTableUUID( - UpdateRequirement.AssertTableUUID expected, - UpdateRequirement.AssertTableUUID actual) { + UpdateRequirement.AssertTableUUID expected, UpdateRequirement.AssertTableUUID actual) { Assertions.assertThat(actual.uuid()) - .as("UUID from JSON should not be null").isNotNull() - .as("UUID should parse correctly from JSON").isEqualTo(expected.uuid()); + .as("UUID from JSON should not be null") + .isNotNull() + .as("UUID should parse correctly from JSON") + .isEqualTo(expected.uuid()); } - // AssertTableDoesNotExist does not have any fields beyond the requirement type, so just check that the classes + // AssertTableDoesNotExist does not have any fields beyond the requirement type, so just check + // that the classes // are the same and as expected. - private static void compareAssertTableDoesNotExist(UpdateRequirement expected, UpdateRequirement actual) { + private static void compareAssertTableDoesNotExist( + UpdateRequirement expected, UpdateRequirement actual) { Assertions.assertThat(actual) .isOfAnyClassIn(UpdateRequirement.AssertTableDoesNotExist.class) .hasSameClassAs(expected); diff --git a/core/src/test/java/org/apache/iceberg/rest/responses/TestConfigResponse.java b/core/src/test/java/org/apache/iceberg/rest/responses/TestConfigResponse.java index 49f53195ac92..223ee360cf34 100644 --- a/core/src/test/java/org/apache/iceberg/rest/responses/TestConfigResponse.java +++ b/core/src/test/java/org/apache/iceberg/rest/responses/TestConfigResponse.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.rest.responses; import com.fasterxml.jackson.core.JsonProcessingException; @@ -31,7 +30,8 @@ public class TestConfigResponse extends RequestResponseTestBase<ConfigResponse> { - private static final Map<String, String> DEFAULTS = ImmutableMap.of("warehouse", "s3://bucket/warehouse"); + private static final Map<String, String> DEFAULTS = + ImmutableMap.of("warehouse", "s3://bucket/warehouse"); private static final Map<String, String> OVERRIDES = ImmutableMap.of("clients", "5"); private static final Map<String, String> DEFAULTS_WITH_NULL_VALUE = Maps.newHashMap(); @@ -47,33 +47,32 @@ public static void beforeAllForRestCatalogConfig() { // Test cases that are JSON that can be created via the Builder public void testRoundTripSerDe() throws JsonProcessingException { // Both fields have values without nulls - String fullJson = "{\"defaults\":{\"warehouse\":\"s3://bucket/warehouse\"},\"overrides\":{\"clients\":\"5\"}}"; + String fullJson = + "{\"defaults\":{\"warehouse\":\"s3://bucket/warehouse\"},\"overrides\":{\"clients\":\"5\"}}"; assertRoundTripSerializesEquallyFrom( - fullJson, - ConfigResponse.builder() - .withOverrides(OVERRIDES).withDefaults(DEFAULTS).build()); + fullJson, ConfigResponse.builder().withOverrides(OVERRIDES).withDefaults(DEFAULTS).build()); assertRoundTripSerializesEquallyFrom( fullJson, ConfigResponse.builder() - .withOverride("clients", "5").withDefault("warehouse", "s3://bucket/warehouse").build()); + .withOverride("clients", "5") + .withDefault("warehouse", "s3://bucket/warehouse") + .build()); // `defaults` is empty String jsonEmptyDefaults = "{\"defaults\":{},\"overrides\":{\"clients\":\"5\"}}"; assertRoundTripSerializesEquallyFrom( - jsonEmptyDefaults, - ConfigResponse.builder().withOverrides(OVERRIDES).build()); + jsonEmptyDefaults, ConfigResponse.builder().withOverrides(OVERRIDES).build()); assertRoundTripSerializesEquallyFrom( jsonEmptyDefaults, ConfigResponse.builder().withOverrides(OVERRIDES).withDefaults(ImmutableMap.of()).build()); assertRoundTripSerializesEquallyFrom( - jsonEmptyDefaults, - ConfigResponse.builder().withOverride("clients", "5").build()); + jsonEmptyDefaults, ConfigResponse.builder().withOverride("clients", "5").build()); // `overrides` is empty - String jsonEmptyOverrides = "{\"defaults\":{\"warehouse\":\"s3://bucket/warehouse\"},\"overrides\":{}}"; + String jsonEmptyOverrides = + "{\"defaults\":{\"warehouse\":\"s3://bucket/warehouse\"},\"overrides\":{}}"; assertRoundTripSerializesEquallyFrom( - jsonEmptyOverrides, - ConfigResponse.builder().withDefaults(DEFAULTS).build()); + jsonEmptyOverrides, ConfigResponse.builder().withDefaults(DEFAULTS).build()); assertRoundTripSerializesEquallyFrom( jsonEmptyOverrides, ConfigResponse.builder().withDefault("warehouse", "s3://bucket/warehouse").build()); @@ -83,12 +82,13 @@ public void testRoundTripSerDe() throws JsonProcessingException { // Both are empty String emptyJson = "{\"defaults\":{},\"overrides\":{}}"; + assertRoundTripSerializesEquallyFrom(emptyJson, ConfigResponse.builder().build()); assertRoundTripSerializesEquallyFrom( emptyJson, - ConfigResponse.builder().build()); - assertRoundTripSerializesEquallyFrom( - emptyJson, - ConfigResponse.builder().withOverrides(ImmutableMap.of()).withDefaults(ImmutableMap.of()).build()); + ConfigResponse.builder() + .withOverrides(ImmutableMap.of()) + .withDefaults(ImmutableMap.of()) + .build()); } @Test @@ -97,7 +97,8 @@ public void testCanDeserializeWithoutDefaultValues() throws JsonProcessingExcept ConfigResponse noOverrides = ConfigResponse.builder().withDefaults(DEFAULTS).build(); String
jsonMissingOverrides = "{\"defaults\":{\"warehouse\":\"s3://bucket/warehouse\"}}"; assertEquals(deserialize(jsonMissingOverrides), noOverrides); - String jsonNullOverrides = "{\"defaults\":{\"warehouse\":\"s3://bucket/warehouse\"},\"overrides\":null}"; + String jsonNullOverrides = + "{\"defaults\":{\"warehouse\":\"s3://bucket/warehouse\"},\"overrides\":null}"; assertEquals(deserialize(jsonNullOverrides), noOverrides); ConfigResponse noDefaults = ConfigResponse.builder().withOverrides(OVERRIDES).build(); @@ -120,22 +121,24 @@ public void testCanUseNullAsPropertyValue() throws JsonProcessingException { assertRoundTripSerializesEquallyFrom( jsonNullValueInDefaults, ConfigResponse.builder() - .withDefaults(DEFAULTS_WITH_NULL_VALUE).withOverrides(OVERRIDES).build()); + .withDefaults(DEFAULTS_WITH_NULL_VALUE) + .withOverrides(OVERRIDES) + .build()); assertRoundTripSerializesEquallyFrom( jsonNullValueInDefaults, - ConfigResponse.builder() - .withDefault("warehouse", null).withOverrides(OVERRIDES).build()); + ConfigResponse.builder().withDefault("warehouse", null).withOverrides(OVERRIDES).build()); String jsonNullValueInOverrides = "{\"defaults\":{\"warehouse\":\"s3://bucket/warehouse\"},\"overrides\":{\"clients\":null}}"; assertRoundTripSerializesEquallyFrom( jsonNullValueInOverrides, ConfigResponse.builder() - .withDefaults(DEFAULTS).withOverrides(OVERRIDES_WITH_NULL_VALUE).build()); + .withDefaults(DEFAULTS) + .withOverrides(OVERRIDES_WITH_NULL_VALUE) + .build()); assertRoundTripSerializesEquallyFrom( jsonNullValueInOverrides, - ConfigResponse.builder() - .withDefaults(DEFAULTS).withOverride("clients", null).build()); + ConfigResponse.builder().withDefaults(DEFAULTS).withOverride("clients", null).build()); } @Test @@ -145,22 +148,19 @@ public void testDeserializeInvalidResponse() { AssertHelpers.assertThrows( "A JSON response with the wrong type for the defaults field should fail to deserialize", JsonProcessingException.class, - () -> deserialize(jsonDefaultsHasWrongType) - ); + () -> deserialize(jsonDefaultsHasWrongType)); String jsonOverridesHasWrongType = "{\"defaults\":{\"warehouse\":\"s3://bucket/warehouse\"},\"overrides\":\"clients\"}"; AssertHelpers.assertThrows( "A JSON response with the wrong type for the overrides field should fail to deserialize", JsonProcessingException.class, - () -> deserialize(jsonOverridesHasWrongType) - ); + () -> deserialize(jsonOverridesHasWrongType)); AssertHelpers.assertThrows( "A null JSON response body should fail to deserialize", IllegalArgumentException.class, - () -> deserialize(null) - ); + () -> deserialize(null)); } @Test @@ -169,29 +169,25 @@ public void testBuilderDoesNotCreateInvalidObjects() { "The builder should not allow using null as a key in the properties to override", NullPointerException.class, "Invalid override property: null", - () -> ConfigResponse.builder().withOverride(null, "100").build() - ); + () -> ConfigResponse.builder().withOverride(null, "100").build()); AssertHelpers.assertThrows( "The builder should not allow using null as a key in the default properties", NullPointerException.class, "Invalid default property: null", - () -> ConfigResponse.builder().withDefault(null, "100").build() - ); + () -> ConfigResponse.builder().withDefault(null, "100").build()); AssertHelpers.assertThrows( "The builder should not allow passing a null map of config properties to override", NullPointerException.class, "Invalid override properties map: null", - () -> ConfigResponse.builder().withOverrides(null).build() - ); + () -> 
ConfigResponse.builder().withOverrides(null).build()); AssertHelpers.assertThrows( "The builder should not allow passing a null map of default config properties", NullPointerException.class, "Invalid default properties map: null", - () -> ConfigResponse.builder().withDefaults(null).build() - ); + () -> ConfigResponse.builder().withDefaults(null).build()); Map<String, String> mapWithNullKey = Maps.newHashMap(); mapWithNullKey.put(null, "a"); @@ -200,15 +196,13 @@ public void testBuilderDoesNotCreateInvalidObjects() { "The builder should not allow passing a map of default config properties with a null key", IllegalArgumentException.class, "Invalid default property: null", - () -> ConfigResponse.builder().withDefaults(mapWithNullKey).build() - ); + () -> ConfigResponse.builder().withDefaults(mapWithNullKey).build()); AssertHelpers.assertThrows( "The builder should not allow passing a map of properties to override with a null key", IllegalArgumentException.class, "Invalid override property: null", - () -> ConfigResponse.builder().withOverrides(mapWithNullKey).build() - ); + () -> ConfigResponse.builder().withOverrides(mapWithNullKey).build()); } @Test @@ -221,20 +215,23 @@ public void testMergeStripsNullValuedEntries() { Map<String, String> defaults = ImmutableMap.of("a", "from_defaults"); Map<String, String> clientConfig = ImmutableMap.of("a", "from_client", "c", "from_client"); - ConfigResponse resp = ConfigResponse.builder() - .withOverrides(overrides).withDefaults(defaults).build(); + ConfigResponse resp = + ConfigResponse.builder().withOverrides(overrides).withDefaults(defaults).build(); - // "a" isn't present as it was marked as `null` in the overrides, so the provided client configuration is discarded + // "a" isn't present as it was marked as `null` in the overrides, so the provided client + // configuration is discarded Map<String, String> merged = resp.merge(clientConfig); - Map<String, String> expected = ImmutableMap.of( + Map<String, String> expected = + ImmutableMap.of( "b", "from_overrides", - "c", "from_client" - ); + "c", "from_client"); Assert.assertEquals( "The merged properties map should use values from defaults, then client config, and finally overrides", - expected, merged); - Assert.assertFalse("The merged properties map should omit keys with null values", merged.containsValue(null)); + expected, + merged); + Assert.assertFalse( + "The merged properties map should omit keys with null values", merged.containsValue(null)); } @Override @@ -244,18 +241,19 @@ public String[] allFieldsFromSpec() { @Override public ConfigResponse createExampleInstance() { - return ConfigResponse.builder() - .withDefaults(DEFAULTS) - .withOverrides(OVERRIDES) - .build(); + return ConfigResponse.builder().withDefaults(DEFAULTS).withOverrides(OVERRIDES).build(); } @Override public void assertEquals(ConfigResponse actual, ConfigResponse expected) { - Assert.assertEquals("Config properties to use as defaults should be equal", - actual.defaults(), expected.defaults()); - Assert.assertEquals("Config properties to use as overrides should be equal", - actual.overrides(), expected.overrides()); + Assert.assertEquals( + "Config properties to use as defaults should be equal", + actual.defaults(), + expected.defaults()); + Assert.assertEquals( + "Config properties to use as overrides should be equal", + actual.overrides(), + expected.overrides()); } @Override diff --git a/core/src/test/java/org/apache/iceberg/rest/responses/TestCreateNamespaceResponse.java b/core/src/test/java/org/apache/iceberg/rest/responses/TestCreateNamespaceResponse.java index 7c9bf1f773d7..df392563065a 100644 ---
a/core/src/test/java/org/apache/iceberg/rest/responses/TestCreateNamespaceResponse.java +++ b/core/src/test/java/org/apache/iceberg/rest/responses/TestCreateNamespaceResponse.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.rest.responses; import com.fasterxml.jackson.core.JsonProcessingException; @@ -39,29 +38,37 @@ public class TestCreateNamespaceResponse extends RequestResponseTestBase deserialize(jsonResponseMalformedNamespaceValue) - ); + () -> deserialize(jsonResponseMalformedNamespaceValue)); - String jsonResponsePropertiesHasWrongType = "{\"namespace\":[\"accounting\",\"tax\"],\"properties\":[]}"; + String jsonResponsePropertiesHasWrongType = + "{\"namespace\":[\"accounting\",\"tax\"],\"properties\":[]}"; AssertHelpers.assertThrows( "A JSON response with the wrong type for the properties field should fail to deserialize", JsonProcessingException.class, - () -> deserialize(jsonResponsePropertiesHasWrongType) - ); + () -> deserialize(jsonResponsePropertiesHasWrongType)); AssertHelpers.assertThrows( "An empty JSON response should fail to deserialize", IllegalArgumentException.class, "Invalid namespace: null", - () -> deserialize("{}") - ); + () -> deserialize("{}")); - String jsonMisspelledKeys = "{\"namepsace\":[\"accounting\",\"tax\"],\"propertiezzzz\":{\"owner\":\"Hank\"}}"; + String jsonMisspelledKeys = + "{\"namepsace\":[\"accounting\",\"tax\"],\"propertiezzzz\":{\"owner\":\"Hank\"}}"; AssertHelpers.assertThrows( "A JSON response with the keys spelled incorrectly should fail to deserialize and validate", IllegalArgumentException.class, "Invalid namespace: null", - () -> deserialize(jsonMisspelledKeys) - ); + () -> deserialize(jsonMisspelledKeys)); AssertHelpers.assertThrows( "A null JSON response body should fail to deserialize", IllegalArgumentException.class, - () -> deserialize(null) - ); + () -> deserialize(null)); } @Test @@ -119,15 +124,13 @@ public void testBuilderDoesNotBuildInvalidRequests() { "The builder should not allow using null for the namespace", NullPointerException.class, "Invalid namespace: null", - () -> CreateNamespaceResponse.builder().withNamespace(null).build() - ); + () -> CreateNamespaceResponse.builder().withNamespace(null).build()); AssertHelpers.assertThrows( "The builder should not allow passing a null collection of properties", NullPointerException.class, "Invalid collection of properties: null", - () -> CreateNamespaceResponse.builder().setProperties(null).build() - ); + () -> CreateNamespaceResponse.builder().setProperties(null).build()); Map mapWithNullKey = Maps.newHashMap(); mapWithNullKey.put(null, "hello"); @@ -135,8 +138,7 @@ public void testBuilderDoesNotBuildInvalidRequests() { "The builder should not allow using null as a key in the properties to set", IllegalArgumentException.class, "Invalid property to set: null", - () -> CreateNamespaceResponse.builder().setProperties(mapWithNullKey).build() - ); + () -> CreateNamespaceResponse.builder().setProperties(mapWithNullKey).build()); Map mapWithMultipleNullValues = Maps.newHashMap(); mapWithMultipleNullValues.put("a", null); @@ -145,13 +147,12 @@ public void testBuilderDoesNotBuildInvalidRequests() { "The builder should not allow using null as a value in the properties to set", IllegalArgumentException.class, "Invalid value to set for properties [a]: null", - () -> CreateNamespaceResponse.builder().setProperties(mapWithMultipleNullValues).build() - ); + () -> 
CreateNamespaceResponse.builder().setProperties(mapWithMultipleNullValues).build()); } @Override public String[] allFieldsFromSpec() { - return new String[] { "namespace", "properties" }; + return new String[] {"namespace", "properties"}; } @Override @@ -175,4 +176,3 @@ public CreateNamespaceResponse deserialize(String json) throws JsonProcessingExc return response; } } - diff --git a/core/src/test/java/org/apache/iceberg/rest/responses/TestErrorResponseParser.java b/core/src/test/java/org/apache/iceberg/rest/responses/TestErrorResponseParser.java index 08c757729bfe..98b764b67852 100644 --- a/core/src/test/java/org/apache/iceberg/rest/responses/TestErrorResponseParser.java +++ b/core/src/test/java/org/apache/iceberg/rest/responses/TestErrorResponseParser.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.rest.responses; import java.util.Arrays; @@ -31,11 +30,15 @@ public void testErrorResponseToJson() { String message = "The given namespace does not exist"; String type = "NoSuchNamespaceException"; Integer code = 404; - String errorModelJson = String.format("{\"message\":\"%s\",\"type\":\"%s\",\"code\":%d}", message, type, code); + String errorModelJson = + String.format("{\"message\":\"%s\",\"type\":\"%s\",\"code\":%d}", message, type, code); String json = "{\"error\":" + errorModelJson + "}"; - ErrorResponse response = ErrorResponse.builder().withMessage(message).withType(type).responseCode(code).build(); - Assert.assertEquals("Should be able to serialize an error response as json", - ErrorResponseParser.toJson(response), json); + ErrorResponse response = + ErrorResponse.builder().withMessage(message).withType(type).responseCode(code).build(); + Assert.assertEquals( + "Should be able to serialize an error response as json", + ErrorResponseParser.toJson(response), + json); } @Test @@ -44,17 +47,22 @@ public void testErrorResponseToJsonWithStack() { String type = "NoSuchNamespaceException"; Integer code = 404; List stack = Arrays.asList("a", "b"); - String errorModelJson = String.format( - "{\"message\":\"%s\",\"type\":\"%s\",\"code\":%d,\"stack\":[\"a\",\"b\"]}", message, type, code); + String errorModelJson = + String.format( + "{\"message\":\"%s\",\"type\":\"%s\",\"code\":%d,\"stack\":[\"a\",\"b\"]}", + message, type, code); String json = "{\"error\":" + errorModelJson + "}"; - ErrorResponse response = ErrorResponse.builder() - .withMessage(message) - .withType(type) - .responseCode(code) - .withStackTrace(stack) - .build(); - Assert.assertEquals("Should be able to serialize an error response as json", - json, ErrorResponseParser.toJson(response)); + ErrorResponse response = + ErrorResponse.builder() + .withMessage(message) + .withType(type) + .responseCode(code) + .withStackTrace(stack) + .build(); + Assert.assertEquals( + "Should be able to serialize an error response as json", + json, + ErrorResponseParser.toJson(response)); } @Test @@ -62,10 +70,12 @@ public void testErrorResponseFromJson() { String message = "The given namespace does not exist"; String type = "NoSuchNamespaceException"; Integer code = 404; - String errorModelJson = String.format("{\"message\":\"%s\",\"type\":\"%s\",\"code\":%d}", message, type, code); + String errorModelJson = + String.format("{\"message\":\"%s\",\"type\":\"%s\",\"code\":%d}", message, type, code); String json = "{\"error\":" + errorModelJson + "}"; - ErrorResponse expected = ErrorResponse.builder().withMessage(message).withType(type).responseCode(code).build(); + 
ErrorResponse expected = + ErrorResponse.builder().withMessage(message).withType(type).responseCode(code).build(); assertEquals(expected, ErrorResponseParser.fromJson(json)); } @@ -75,16 +85,19 @@ public void testErrorResponseFromJsonWithStack() { String type = "NoSuchNamespaceException"; Integer code = 404; List stack = Arrays.asList("a", "b"); - String errorModelJson = String.format( - "{\"message\":\"%s\",\"type\":\"%s\",\"code\":%d,\"stack\":[\"a\",\"b\"]}", message, type, code); + String errorModelJson = + String.format( + "{\"message\":\"%s\",\"type\":\"%s\",\"code\":%d,\"stack\":[\"a\",\"b\"]}", + message, type, code); String json = "{\"error\":" + errorModelJson + "}"; - ErrorResponse expected = ErrorResponse.builder() - .withMessage(message) - .withType(type) - .responseCode(code) - .withStackTrace(stack) - .build(); + ErrorResponse expected = + ErrorResponse.builder() + .withMessage(message) + .withType(type) + .responseCode(code) + .withStackTrace(stack) + .build(); assertEquals(expected, ErrorResponseParser.fromJson(json)); } @@ -94,16 +107,18 @@ public void testErrorResponseFromJsonWithExplicitNullStack() { String type = "NoSuchNamespaceException"; Integer code = 404; List stack = null; - String errorModelJson = String.format( - "{\"message\":\"%s\",\"type\":\"%s\",\"code\":%d,\"stack\":null}", message, type, code); + String errorModelJson = + String.format( + "{\"message\":\"%s\",\"type\":\"%s\",\"code\":%d,\"stack\":null}", message, type, code); String json = "{\"error\":" + errorModelJson + "}"; - ErrorResponse expected = ErrorResponse.builder() - .withMessage(message) - .withType(type) - .responseCode(code) - .withStackTrace(stack) - .build(); + ErrorResponse expected = + ErrorResponse.builder() + .withMessage(message) + .withType(type) + .responseCode(code) + .withStackTrace(stack) + .build(); assertEquals(expected, ErrorResponseParser.fromJson(json)); } diff --git a/core/src/test/java/org/apache/iceberg/rest/responses/TestGetNamespaceResponse.java b/core/src/test/java/org/apache/iceberg/rest/responses/TestGetNamespaceResponse.java index 60856020e2a4..2095ceb9a9f7 100644 --- a/core/src/test/java/org/apache/iceberg/rest/responses/TestGetNamespaceResponse.java +++ b/core/src/test/java/org/apache/iceberg/rest/responses/TestGetNamespaceResponse.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.rest.responses; import com.fasterxml.jackson.core.JsonProcessingException; @@ -39,22 +38,28 @@ public class TestGetNamespaceResponse extends RequestResponseTestBase deserialize(jsonNamespaceHasWrongType) - ); + () -> deserialize(jsonNamespaceHasWrongType)); - String jsonPropertiesHasWrongType = "{\"namespace\":[\"accounting\",\"tax\"],\"properties\":[]}"; + String jsonPropertiesHasWrongType = + "{\"namespace\":[\"accounting\",\"tax\"],\"properties\":[]}"; AssertHelpers.assertThrows( "A JSON response with the wrong type for a field should fail to deserialize", JsonProcessingException.class, - () -> deserialize(jsonPropertiesHasWrongType) - ); + () -> deserialize(jsonPropertiesHasWrongType)); String emptyJson = "{}"; AssertHelpers.assertThrows( "An empty JSON request should fail to deserialize after validation", IllegalArgumentException.class, "Invalid namespace: null", - () -> deserialize(emptyJson) - ); + () -> deserialize(emptyJson)); String jsonWithKeysSpelledIncorrectly = "{\"namepsace\":[\"accounting\",\"tax\"],\"propertiezzzz\":{\"owner\":\"Hank\"}}"; @@ -90,15 +93,13 @@ public void testDeserializeInvalidResponse() { "A JSON response with the keys spelled incorrectly should fail to deserialize", IllegalArgumentException.class, "Invalid namespace: null", - () -> deserialize(jsonWithKeysSpelledIncorrectly) - ); + () -> deserialize(jsonWithKeysSpelledIncorrectly)); String nullJson = null; AssertHelpers.assertThrows( "An empty JSON request should fail to deserialize", IllegalArgumentException.class, - () -> deserialize(nullJson) - ); + () -> deserialize(nullJson)); } @Test @@ -107,15 +108,13 @@ public void testBuilderDoesNotBuildInvalidRequests() { "The builder should not allow using null for the namespace", NullPointerException.class, "Invalid namespace: null", - () -> GetNamespaceResponse.builder().withNamespace(null).build() - ); + () -> GetNamespaceResponse.builder().withNamespace(null).build()); AssertHelpers.assertThrows( "The builder should not allow passing a null collection of properties", NullPointerException.class, "Invalid properties map: null", - () -> GetNamespaceResponse.builder().setProperties(null).build() - ); + () -> GetNamespaceResponse.builder().setProperties(null).build()); Map mapWithNullKey = Maps.newHashMap(); mapWithNullKey.put(null, "hello"); @@ -123,8 +122,7 @@ public void testBuilderDoesNotBuildInvalidRequests() { "The builder should not allow using null as a key in the properties to set", IllegalArgumentException.class, "Invalid property: null", - () -> GetNamespaceResponse.builder().setProperties(mapWithNullKey).build() - ); + () -> GetNamespaceResponse.builder().setProperties(mapWithNullKey).build()); Map mapWithMultipleNullValues = Maps.newHashMap(); mapWithMultipleNullValues.put("a", null); @@ -133,13 +131,12 @@ public void testBuilderDoesNotBuildInvalidRequests() { "The builder should not allow using null as a value in the properties to set", IllegalArgumentException.class, "Invalid value for properties [a]: null", - () -> GetNamespaceResponse.builder().setProperties(mapWithMultipleNullValues).build() - ); + () -> GetNamespaceResponse.builder().setProperties(mapWithMultipleNullValues).build()); } @Override public String[] allFieldsFromSpec() { - return new String[] { "namespace", "properties" }; + return new String[] {"namespace", "properties"}; } @Override diff --git a/core/src/test/java/org/apache/iceberg/rest/responses/TestListNamespacesResponse.java 
b/core/src/test/java/org/apache/iceberg/rest/responses/TestListNamespacesResponse.java index 2918f1ee3d00..2c2c7bbfeb0f 100644 --- a/core/src/test/java/org/apache/iceberg/rest/responses/TestListNamespacesResponse.java +++ b/core/src/test/java/org/apache/iceberg/rest/responses/TestListNamespacesResponse.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.rest.responses; import com.fasterxml.jackson.core.JsonProcessingException; @@ -32,9 +31,8 @@ public class TestListNamespacesResponse extends RequestResponseTestBase<ListNamespacesResponse> { - private static final List<Namespace> NAMESPACES = ImmutableList.of( - Namespace.of("accounting"), - Namespace.of("tax")); + private static final List<Namespace> NAMESPACES = + ImmutableList.of(Namespace.of("accounting"), Namespace.of("tax")); @Test public void testRoundTripSerDe() throws JsonProcessingException { @@ -52,32 +50,27 @@ public void testDeserializeInvalidResponseThrows() { AssertHelpers.assertThrows( "A malformed JSON response with the wrong type for a field should fail to deserialize", JsonProcessingException.class, - () -> deserialize(jsonNamespacesHasWrongType) - ); + () -> deserialize(jsonNamespacesHasWrongType)); String emptyJson = "{}"; AssertHelpers.assertThrows( "An empty JSON response will deserialize, but not into a valid object", IllegalArgumentException.class, "Invalid namespace: null", - () -> deserialize(emptyJson) - ); + () -> deserialize(emptyJson)); - String jsonWithKeysSpelledIncorrectly = - "{\"namepsacezz\":[\"accounting\",\"tax\"]}"; + String jsonWithKeysSpelledIncorrectly = "{\"namepsacezz\":[\"accounting\",\"tax\"]}"; AssertHelpers.assertThrows( "A JSON response with the keys spelled incorrectly should fail to deserialize", IllegalArgumentException.class, "Invalid namespace: null", - () -> deserialize(jsonWithKeysSpelledIncorrectly) - ); + () -> deserialize(jsonWithKeysSpelledIncorrectly)); String nullJson = null; AssertHelpers.assertThrows( "A null JSON response should fail to deserialize", IllegalArgumentException.class, - () -> deserialize(nullJson) - ); + () -> deserialize(nullJson)); } @Test @@ -86,43 +79,38 @@ public void testBuilderDoesNotCreateInvalidObjects() { AssertHelpers.assertThrows( "The builder should not allow using null as a namespace to add to the list", NullPointerException.class, "Invalid namespace: null", - () -> ListNamespacesResponse.builder().add(null).build() - ); + () -> ListNamespacesResponse.builder().add(null).build()); AssertHelpers.assertThrows( "The builder should not allow passing a null list of namespaces to add", NullPointerException.class, "Invalid namespace list: null", - () -> ListNamespacesResponse.builder().addAll(null).build() - ); + () -> ListNamespacesResponse.builder().addAll(null).build()); List<Namespace> listWithNullElement = Lists.newArrayList(Namespace.of("a"), null); AssertHelpers.assertThrows( "The builder should not allow passing a collection of namespaces with a null element in it", IllegalArgumentException.class, "Invalid namespace: null", - () -> ListNamespacesResponse.builder().addAll(listWithNullElement).build() - ); + () -> ListNamespacesResponse.builder().addAll(listWithNullElement).build()); } - @Override public String[] allFieldsFromSpec() { - return new String[] { "namespaces" }; + return new String[] {"namespaces"}; } @Override public ListNamespacesResponse createExampleInstance() { - return ListNamespacesResponse.builder() - .addAll(NAMESPACES) - .build(); + return ListNamespacesResponse.builder().addAll(NAMESPACES).build(); } @Override public void
assertEquals(ListNamespacesResponse actual, ListNamespacesResponse expected) { - Assert.assertTrue("Namespaces list should be equal", - actual.namespaces().size() == expected.namespaces().size() && - Sets.newHashSet(actual.namespaces()).equals(Sets.newHashSet(expected.namespaces()))); + Assert.assertTrue( + "Namespaces list should be equal", + actual.namespaces().size() == expected.namespaces().size() + && Sets.newHashSet(actual.namespaces()).equals(Sets.newHashSet(expected.namespaces()))); } @Override @@ -132,4 +120,3 @@ public ListNamespacesResponse deserialize(String json) throws JsonProcessingExce return resp; } } - diff --git a/core/src/test/java/org/apache/iceberg/rest/responses/TestListTablesResponse.java b/core/src/test/java/org/apache/iceberg/rest/responses/TestListTablesResponse.java index 9c868c051ab8..144a967d3070 100644 --- a/core/src/test/java/org/apache/iceberg/rest/responses/TestListTablesResponse.java +++ b/core/src/test/java/org/apache/iceberg/rest/responses/TestListTablesResponse.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.rest.responses; import com.fasterxml.jackson.core.JsonProcessingException; @@ -33,13 +32,15 @@ public class TestListTablesResponse extends RequestResponseTestBase { - private static final List IDENTIFIERS = ImmutableList.of( - TableIdentifier.of(Namespace.of("accounting", "tax"), "paid")); + private static final List IDENTIFIERS = + ImmutableList.of(TableIdentifier.of(Namespace.of("accounting", "tax"), "paid")); @Test public void testRoundTripSerDe() throws JsonProcessingException { - String fullJson = "{\"identifiers\":[{\"namespace\":[\"accounting\",\"tax\"],\"name\":\"paid\"}]}"; - assertRoundTripSerializesEquallyFrom(fullJson, ListTablesResponse.builder().addAll(IDENTIFIERS).build()); + String fullJson = + "{\"identifiers\":[{\"namespace\":[\"accounting\",\"tax\"],\"name\":\"paid\"}]}"; + assertRoundTripSerializesEquallyFrom( + fullJson, ListTablesResponse.builder().addAll(IDENTIFIERS).build()); String emptyIdentifiers = "{\"identifiers\":[]}"; assertRoundTripSerializesEquallyFrom(emptyIdentifiers, ListTablesResponse.builder().build()); @@ -73,31 +74,27 @@ public void testDeserializeInvalidResponsesThrows() { AssertHelpers.assertThrows( "A JSON response with an invalid identifier in the list of identifiers should fail to parse", JsonProcessingException.class, - () -> deserialize(jsonWithInvalidIdentifiersInList) - ); + () -> deserialize(jsonWithInvalidIdentifiersInList)); String jsonWithInvalidIdentifiersInList2 = "{\"identifiers\":[{\"namespace\":[\"accounting\",\"tax\"],\"name\":\"paid\"},\"accounting.tax.paid\"]}"; AssertHelpers.assertThrows( "A JSON response with an invalid identifier in the list of identifiers should fail to parse", JsonProcessingException.class, - () -> deserialize(jsonWithInvalidIdentifiersInList2) - ); + () -> deserialize(jsonWithInvalidIdentifiersInList2)); String jsonWithInvalidTypeForNamePartOfIdentifier = "{\"identifiers\":[{\"namespace\":[\"accounting\",\"tax\"],\"name\":true}]}"; AssertHelpers.assertThrows( "A JSON response with an invalid identifier in the list of identifiers should fail to parse", JsonProcessingException.class, - () -> deserialize(jsonWithInvalidTypeForNamePartOfIdentifier) - ); + () -> deserialize(jsonWithInvalidTypeForNamePartOfIdentifier)); String nullJson = null; AssertHelpers.assertThrows( "A null JSON response should fail to deserialize", IllegalArgumentException.class, - () -> deserialize(nullJson) - ); 
+ () -> deserialize(nullJson)); } @Test @@ -106,43 +103,40 @@ public void testBuilderDoesNotCreateInvalidObjects() { "The builder should not allow using null as a table identifier to add to the list", NullPointerException.class, "Invalid table identifier: null", - () -> ListTablesResponse.builder().add(null).build() - ); + () -> ListTablesResponse.builder().add(null).build()); AssertHelpers.assertThrows( "The builder should not allow passing a null list of table identifiers to add", NullPointerException.class, "Invalid table identifier list: null", - () -> ListTablesResponse.builder().addAll(null).build() - ); + () -> ListTablesResponse.builder().addAll(null).build()); - List listWithNullElement = Lists.newArrayList( - TableIdentifier.of(Namespace.of("foo"), "bar"), null); + List listWithNullElement = + Lists.newArrayList(TableIdentifier.of(Namespace.of("foo"), "bar"), null); AssertHelpers.assertThrows( "The builder should not allow passing a collection of table identifiers with a null element in it", IllegalArgumentException.class, "Invalid table identifier: null", - () -> ListTablesResponse.builder().addAll(listWithNullElement).build() - ); + () -> ListTablesResponse.builder().addAll(listWithNullElement).build()); } @Override public String[] allFieldsFromSpec() { - return new String[] { "identifiers" }; + return new String[] {"identifiers"}; } @Override public ListTablesResponse createExampleInstance() { - return ListTablesResponse.builder() - .addAll(IDENTIFIERS) - .build(); + return ListTablesResponse.builder().addAll(IDENTIFIERS).build(); } @Override public void assertEquals(ListTablesResponse actual, ListTablesResponse expected) { - Assert.assertTrue("Identifiers should be equal", - actual.identifiers().size() == expected.identifiers().size() && - Sets.newHashSet(actual.identifiers()).equals(Sets.newHashSet(expected.identifiers()))); + Assert.assertTrue( + "Identifiers should be equal", + actual.identifiers().size() == expected.identifiers().size() + && Sets.newHashSet(actual.identifiers()) + .equals(Sets.newHashSet(expected.identifiers()))); } @Override diff --git a/core/src/test/java/org/apache/iceberg/rest/responses/TestLoadTableResponse.java b/core/src/test/java/org/apache/iceberg/rest/responses/TestLoadTableResponse.java index 6cb2ef2e0a08..de07fc51816c 100644 --- a/core/src/test/java/org/apache/iceberg/rest/responses/TestLoadTableResponse.java +++ b/core/src/test/java/org/apache/iceberg/rest/responses/TestLoadTableResponse.java @@ -16,9 +16,10 @@ * specific language governing permissions and limitations * under the License. 
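[editor's note] The response tests above (GetNamespaceResponse, ListNamespacesResponse, ListTablesResponse) all follow the same round-trip pattern from RequestResponseTestBase: serialize the expected object, compare it against the spec's JSON, parse that JSON back, validate, and compare field by field. The sketch below shows that loop with a stand-in POJO and a plain Jackson ObjectMapper; ExampleResponse and its validate() are hypothetical, since the real tests run the loop against the Iceberg response classes through the shared mapper().

import com.fasterxml.jackson.databind.ObjectMapper;
import java.util.Arrays;
import java.util.List;
import org.junit.Assert;

public class RoundTripSketch {
  // Stand-in for a REST response type; a public field keeps the Jackson wiring trivial.
  public static class ExampleResponse {
    public List<String> namespaces;

    void validate() {
      if (namespaces == null) {
        throw new IllegalArgumentException("Invalid namespace list: null");
      }
    }
  }

  public static void main(String[] args) throws Exception {
    ObjectMapper mapper = new ObjectMapper();

    ExampleResponse expected = new ExampleResponse();
    expected.namespaces = Arrays.asList("accounting", "tax");

    // 1) serialize and compare against the canonical JSON
    String json = mapper.writeValueAsString(expected);
    Assert.assertEquals("{\"namespaces\":[\"accounting\",\"tax\"]}", json);

    // 2) parse the JSON back, validate, and compare field by field
    ExampleResponse parsed = mapper.readValue(json, ExampleResponse.class);
    parsed.validate();
    Assert.assertEquals(expected.namespaces, parsed.namespaces);
  }
}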
*/ - package org.apache.iceberg.rest.responses; +import static org.apache.iceberg.TestHelpers.assertSameSchemaList; + import com.fasterxml.jackson.core.JsonProcessingException; import java.nio.file.Path; import java.nio.file.Paths; @@ -39,53 +40,53 @@ import org.junit.Assert; import org.junit.Test; -import static org.apache.iceberg.TestHelpers.assertSameSchemaList; - public class TestLoadTableResponse extends RequestResponseTestBase { - private static final String TEST_METADATA_LOCATION = "s3://bucket/test/location/metadata/v1.metadata.json"; + private static final String TEST_METADATA_LOCATION = + "s3://bucket/test/location/metadata/v1.metadata.json"; private static final String TEST_TABLE_LOCATION = "s3://bucket/test/location"; - private static final Schema SCHEMA_7 = new Schema(7, - Types.NestedField.required(1, "x", Types.LongType.get()), - Types.NestedField.required(2, "y", Types.LongType.get(), "comment"), - Types.NestedField.required(3, "z", Types.LongType.get()) - ); + private static final Schema SCHEMA_7 = + new Schema( + 7, + Types.NestedField.required(1, "x", Types.LongType.get()), + Types.NestedField.required(2, "y", Types.LongType.get(), "comment"), + Types.NestedField.required(3, "z", Types.LongType.get())); - private static final PartitionSpec SPEC_5 = PartitionSpec.builderFor(SCHEMA_7).withSpecId(5).build(); + private static final PartitionSpec SPEC_5 = + PartitionSpec.builderFor(SCHEMA_7).withSpecId(5).build(); - private static final SortOrder SORT_ORDER_3 = SortOrder.builderFor(SCHEMA_7) - .withOrderId(3) - .asc("y", NullOrder.NULLS_FIRST) - .desc(Expressions.bucket("z", 4), NullOrder.NULLS_LAST) - .build(); + private static final SortOrder SORT_ORDER_3 = + SortOrder.builderFor(SCHEMA_7) + .withOrderId(3) + .asc("y", NullOrder.NULLS_FIRST) + .desc(Expressions.bucket("z", 4), NullOrder.NULLS_LAST) + .build(); - private static final Map TABLE_PROPS = ImmutableMap.of( - "format-version", "1", - "owner", "hank"); + private static final Map TABLE_PROPS = + ImmutableMap.of( + "format-version", "1", + "owner", "hank"); private static final Map CONFIG = ImmutableMap.of("foo", "bar"); @Override public String[] allFieldsFromSpec() { - return new String[] { "metadata-location", "metadata", "config" }; + return new String[] {"metadata-location", "metadata", "config"}; } @Override public LoadTableResponse createExampleInstance() { TableMetadata metadata = - TableMetadata - .buildFrom( - TableMetadata.newTableMetadata(SCHEMA_7, SPEC_5, SORT_ORDER_3, TEST_TABLE_LOCATION, TABLE_PROPS)) + TableMetadata.buildFrom( + TableMetadata.newTableMetadata( + SCHEMA_7, SPEC_5, SORT_ORDER_3, TEST_TABLE_LOCATION, TABLE_PROPS)) .discardChanges() .withMetadataLocation(TEST_METADATA_LOCATION) .build(); - return LoadTableResponse.builder() - .withTableMetadata(metadata) - .addAllConfig(CONFIG) - .build(); + return LoadTableResponse.builder().withTableMetadata(metadata).addAllConfig(CONFIG).build(); } @Override @@ -107,16 +108,17 @@ public void testFailures() { @Test public void testRoundTripSerdeWithV1TableMetadata() throws Exception { String tableMetadataJson = readTableMetadataInputFile("TableMetadataV1Valid.json"); - TableMetadata v1Metadata = TableMetadataParser.fromJson(null, TEST_METADATA_LOCATION, tableMetadataJson); - // Convert the TableMetadata JSON from the file to an object and then back to JSON so that missing fields + TableMetadata v1Metadata = + TableMetadataParser.fromJson(null, TEST_METADATA_LOCATION, tableMetadataJson); + // Convert the TableMetadata JSON from the file to an object and 
then back to JSON so that + // missing fields // are filled in with their default values. - String json = String.format( - "{\"metadata-location\":\"%s\",\"metadata\":%s,\"config\":{\"foo\":\"bar\"}}", - TEST_METADATA_LOCATION, TableMetadataParser.toJson(v1Metadata)); - LoadTableResponse resp = LoadTableResponse.builder() - .withTableMetadata(v1Metadata) - .addAllConfig(CONFIG) - .build(); + String json = + String.format( + "{\"metadata-location\":\"%s\",\"metadata\":%s,\"config\":{\"foo\":\"bar\"}}", + TEST_METADATA_LOCATION, TableMetadataParser.toJson(v1Metadata)); + LoadTableResponse resp = + LoadTableResponse.builder().withTableMetadata(v1Metadata).addAllConfig(CONFIG).build(); assertRoundTripSerializesEquallyFrom(json, resp); } @@ -128,23 +130,23 @@ public void testMissingSchemaType() throws Exception { "Cannot parse type from json when there is no type", IllegalArgumentException.class, "Cannot parse type from json:", - () -> TableMetadataParser.fromJson(null, TEST_METADATA_LOCATION, tableMetadataJson) - ); + () -> TableMetadataParser.fromJson(null, TEST_METADATA_LOCATION, tableMetadataJson)); } @Test public void testRoundTripSerdeWithV2TableMetadata() throws Exception { String tableMetadataJson = readTableMetadataInputFile("TableMetadataV2Valid.json"); - TableMetadata v2Metadata = TableMetadataParser.fromJson(null, TEST_METADATA_LOCATION, tableMetadataJson); - // Convert the TableMetadata JSON from the file to an object and then back to JSON so that missing fields + TableMetadata v2Metadata = + TableMetadataParser.fromJson(null, TEST_METADATA_LOCATION, tableMetadataJson); + // Convert the TableMetadata JSON from the file to an object and then back to JSON so that + // missing fields // are filled in with their default values. - String json = String.format( - "{\"metadata-location\":\"%s\",\"metadata\":%s,\"config\":{\"foo\":\"bar\"}}", - TEST_METADATA_LOCATION, TableMetadataParser.toJson(v2Metadata)); - LoadTableResponse resp = LoadTableResponse.builder() - .withTableMetadata(v2Metadata) - .addAllConfig(CONFIG) - .build(); + String json = + String.format( + "{\"metadata-location\":\"%s\",\"metadata\":%s,\"config\":{\"foo\":\"bar\"}}", + TEST_METADATA_LOCATION, TableMetadataParser.toJson(v2Metadata)); + LoadTableResponse resp = + LoadTableResponse.builder().withTableMetadata(v2Metadata).addAllConfig(CONFIG).build(); assertRoundTripSerializesEquallyFrom(json, resp); } @@ -152,14 +154,17 @@ public void testRoundTripSerdeWithV2TableMetadata() throws Exception { public void testCanDeserializeWithoutDefaultValues() throws Exception { String metadataJson = readTableMetadataInputFile("TableMetadataV1Valid.json"); // `config` is missing in the JSON - String json = String.format("{\"metadata-location\":\"%s\",\"metadata\":%s}", TEST_METADATA_LOCATION, metadataJson); - TableMetadata metadata = TableMetadataParser.fromJson(null, TEST_METADATA_LOCATION, metadataJson); + String json = + String.format( + "{\"metadata-location\":\"%s\",\"metadata\":%s}", TEST_METADATA_LOCATION, metadataJson); + TableMetadata metadata = + TableMetadataParser.fromJson(null, TEST_METADATA_LOCATION, metadataJson); LoadTableResponse actual = deserialize(json); - LoadTableResponse expected = LoadTableResponse.builder() - .withTableMetadata(metadata) - .build(); + LoadTableResponse expected = LoadTableResponse.builder().withTableMetadata(metadata).build(); assertEquals(actual, expected); - Assert.assertEquals("Deserialized JSON with missing fields should have the default values", ImmutableMap.of(), + Assert.assertEquals( + 
"Deserialized JSON with missing fields should have the default values", + ImmutableMap.of(), actual.config()); } @@ -167,63 +172,76 @@ public void testCanDeserializeWithoutDefaultValues() throws Exception { public void assertEquals(LoadTableResponse actual, LoadTableResponse expected) { Assert.assertEquals("Should have the same configuration", expected.config(), actual.config()); assertEqualTableMetadata(actual.tableMetadata(), expected.tableMetadata()); - Assert.assertEquals("Should have the same metadata location", - expected.metadataLocation(), actual.metadataLocation()); + Assert.assertEquals( + "Should have the same metadata location", + expected.metadataLocation(), + actual.metadataLocation()); } private void assertEqualTableMetadata(TableMetadata actual, TableMetadata expected) { - Assert.assertEquals("Format version should match", - expected.formatVersion(), actual.formatVersion()); - Assert.assertEquals("Table UUID should match", - expected.uuid(), actual.uuid()); - Assert.assertEquals("Table location should match", - expected.location(), actual.location()); + Assert.assertEquals( + "Format version should match", expected.formatVersion(), actual.formatVersion()); + Assert.assertEquals("Table UUID should match", expected.uuid(), actual.uuid()); + Assert.assertEquals("Table location should match", expected.location(), actual.location()); Assert.assertEquals("Last column id", expected.lastColumnId(), actual.lastColumnId()); - Assert.assertEquals("Schema should match", expected.schema().asStruct(), actual.schema().asStruct()); + Assert.assertEquals( + "Schema should match", expected.schema().asStruct(), actual.schema().asStruct()); assertSameSchemaList(expected.schemas(), actual.schemas()); - Assert.assertEquals("Current schema id should match", expected.currentSchemaId(), actual.currentSchemaId()); - Assert.assertEquals("Schema should match", expected.schema().asStruct(), actual.schema().asStruct()); - Assert.assertEquals("Last sequence number should match", - expected.lastSequenceNumber(), actual.lastSequenceNumber()); - Assert.assertEquals("Partition spec should match", - expected.spec().toString(), actual.spec().toString()); - Assert.assertEquals("Default spec ID should match", - expected.defaultSpecId(), actual.defaultSpecId()); - Assert.assertEquals("PartitionSpec map should match", - expected.specs(), actual.specs()); - Assert.assertEquals("Default Sort ID should match", - expected.defaultSortOrderId(), actual.defaultSortOrderId()); - Assert.assertEquals("Sort order should match", - expected.sortOrder(), actual.sortOrder()); - Assert.assertEquals("Sort order map should match", - expected.sortOrders(), actual.sortOrders()); - Assert.assertEquals("Properties should match", - expected.properties(), actual.properties()); - Assert.assertEquals("Snapshots should match", + Assert.assertEquals( + "Current schema id should match", expected.currentSchemaId(), actual.currentSchemaId()); + Assert.assertEquals( + "Schema should match", expected.schema().asStruct(), actual.schema().asStruct()); + Assert.assertEquals( + "Last sequence number should match", + expected.lastSequenceNumber(), + actual.lastSequenceNumber()); + Assert.assertEquals( + "Partition spec should match", expected.spec().toString(), actual.spec().toString()); + Assert.assertEquals( + "Default spec ID should match", expected.defaultSpecId(), actual.defaultSpecId()); + Assert.assertEquals("PartitionSpec map should match", expected.specs(), actual.specs()); + Assert.assertEquals( + "Default Sort ID should match", 
expected.defaultSortOrderId(), actual.defaultSortOrderId()); + Assert.assertEquals("Sort order should match", expected.sortOrder(), actual.sortOrder()); + Assert.assertEquals("Sort order map should match", expected.sortOrders(), actual.sortOrders()); + Assert.assertEquals("Properties should match", expected.properties(), actual.properties()); + Assert.assertEquals( + "Snapshots should match", Lists.transform(expected.snapshots(), Snapshot::snapshotId), Lists.transform(actual.snapshots(), Snapshot::snapshotId)); Assert.assertEquals("History should match", expected.snapshotLog(), actual.snapshotLog()); Snapshot expectedCurrentSnapshot = expected.currentSnapshot(); Snapshot actualCurrentSnapshot = actual.currentSnapshot(); - Assert.assertTrue("Both expected and actual current snapshot should either be null or non-null", - (expectedCurrentSnapshot != null && actualCurrentSnapshot != null) || - (expectedCurrentSnapshot == null && actualCurrentSnapshot == null)); + Assert.assertTrue( + "Both expected and actual current snapshot should either be null or non-null", + (expectedCurrentSnapshot != null && actualCurrentSnapshot != null) + || (expectedCurrentSnapshot == null && actualCurrentSnapshot == null)); if (expectedCurrentSnapshot != null) { - Assert.assertEquals("Current snapshot ID should match", - expected.currentSnapshot().snapshotId(), actual.currentSnapshot().snapshotId()); - Assert.assertEquals("Parent snapshot ID should match", - expected.currentSnapshot().parentId(), actual.currentSnapshot().parentId()); - Assert.assertEquals("Schema ID for current snapshot should match", - expected.currentSnapshot().schemaId(), actual.currentSnapshot().schemaId()); + Assert.assertEquals( + "Current snapshot ID should match", + expected.currentSnapshot().snapshotId(), + actual.currentSnapshot().snapshotId()); + Assert.assertEquals( + "Parent snapshot ID should match", + expected.currentSnapshot().parentId(), + actual.currentSnapshot().parentId()); + Assert.assertEquals( + "Schema ID for current snapshot should match", + expected.currentSnapshot().schemaId(), + actual.currentSnapshot().schemaId()); } - Assert.assertEquals("Metadata file location should match", - expected.metadataFileLocation(), actual.metadataFileLocation()); - Assert.assertEquals("Last column id should match", expected.lastColumnId(), actual.lastColumnId()); - Assert.assertEquals("Schema should match", expected.schema().asStruct(), actual.schema().asStruct()); + Assert.assertEquals( + "Metadata file location should match", + expected.metadataFileLocation(), + actual.metadataFileLocation()); + Assert.assertEquals( + "Last column id should match", expected.lastColumnId(), actual.lastColumnId()); + Assert.assertEquals( + "Schema should match", expected.schema().asStruct(), actual.schema().asStruct()); assertSameSchemaList(expected.schemas(), actual.schemas()); - Assert.assertEquals("Current schema id should match", expected.currentSchemaId(), actual.currentSchemaId()); - Assert.assertEquals("Refs map should match", - expected.refs(), actual.refs()); + Assert.assertEquals( + "Current schema id should match", expected.currentSchemaId(), actual.currentSchemaId()); + Assert.assertEquals("Refs map should match", expected.refs(), actual.refs()); } private String readTableMetadataInputFile(String fileName) throws Exception { diff --git a/core/src/test/java/org/apache/iceberg/rest/responses/TestOAuthTokenResponse.java b/core/src/test/java/org/apache/iceberg/rest/responses/TestOAuthTokenResponse.java index e9fe379e3242..6377b40489c4 100644 --- 
a/core/src/test/java/org/apache/iceberg/rest/responses/TestOAuthTokenResponse.java +++ b/core/src/test/java/org/apache/iceberg/rest/responses/TestOAuthTokenResponse.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.rest.responses; import com.fasterxml.jackson.core.JsonProcessingException; @@ -29,7 +28,7 @@ public class TestOAuthTokenResponse extends RequestResponseTestBase { @Override public String[] allFieldsFromSpec() { - return new String[] { "access_token", "token_type", "issued_token_type", "expires_in", "scope" }; + return new String[] {"access_token", "token_type", "issued_token_type", "expires_in", "scope"}; } @Override @@ -47,8 +46,10 @@ public OAuthTokenResponse createExampleInstance() { public void assertEquals(OAuthTokenResponse actual, OAuthTokenResponse expected) { Assert.assertEquals("Token should match", expected.token(), actual.token()); Assert.assertEquals("Token type should match", expected.tokenType(), actual.tokenType()); - Assert.assertEquals("Issued token type should match", expected.issuedTokenType(), actual.issuedTokenType()); - Assert.assertEquals("Expiration should match", expected.expiresInSeconds(), actual.expiresInSeconds()); + Assert.assertEquals( + "Issued token type should match", expected.issuedTokenType(), actual.issuedTokenType()); + Assert.assertEquals( + "Expiration should match", expected.expiresInSeconds(), actual.expiresInSeconds()); Assert.assertEquals("Scope should match", expected.scopes(), actual.scopes()); } @@ -69,8 +70,8 @@ public void testRoundTrip() throws Exception { OAuthTokenResponse.builder().withToken("bearer-token").withTokenType("bearer").build()); assertRoundTripSerializesEquallyFrom( - "{\"access_token\":\"bearer-token\",\"token_type\":\"bearer\"," + - "\"issued_token_type\":\"urn:ietf:params:oauth:token-type:access_token\"}", + "{\"access_token\":\"bearer-token\",\"token_type\":\"bearer\"," + + "\"issued_token_type\":\"urn:ietf:params:oauth:token-type:access_token\"}", OAuthTokenResponse.builder() .withToken("bearer-token") .withTokenType("bearer") @@ -95,9 +96,9 @@ public void testRoundTrip() throws Exception { .build()); assertRoundTripSerializesEquallyFrom( - "{\"access_token\":\"bearer-token\",\"token_type\":\"bearer\"," + - "\"issued_token_type\":\"urn:ietf:params:oauth:token-type:access_token\"," + - "\"expires_in\":600,\"scope\":\"a b\"}", + "{\"access_token\":\"bearer-token\",\"token_type\":\"bearer\"," + + "\"issued_token_type\":\"urn:ietf:params:oauth:token-type:access_token\"," + + "\"expires_in\":600,\"scope\":\"a b\"}", OAuthTokenResponse.builder() .withToken("bearer-token") .withTokenType("bearer") diff --git a/core/src/test/java/org/apache/iceberg/rest/responses/TestUpdateNamespacePropertiesResponse.java b/core/src/test/java/org/apache/iceberg/rest/responses/TestUpdateNamespacePropertiesResponse.java index 9adf2be9b861..4fbf386c02a0 100644 --- a/core/src/test/java/org/apache/iceberg/rest/responses/TestUpdateNamespacePropertiesResponse.java +++ b/core/src/test/java/org/apache/iceberg/rest/responses/TestUpdateNamespacePropertiesResponse.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
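[editor's note] For context, the OAuth test just above pairs the standard token-endpoint JSON fields (access_token, token_type, issued_token_type, expires_in, scope) with OAuthTokenResponse's builder and accessors. A minimal usage sketch limited to the builder calls that appear in this patch; anything beyond withToken/withTokenType is left out rather than guessed at.

import org.apache.iceberg.rest.responses.OAuthTokenResponse;

public class OAuthTokenSketch {
  public static void main(String[] args) {
    // corresponds to {"access_token":"bearer-token","token_type":"bearer"} in the round-trip test
    OAuthTokenResponse response =
        OAuthTokenResponse.builder().withToken("bearer-token").withTokenType("bearer").build();

    System.out.println(response.token());     // bearer-token
    System.out.println(response.tokenType()); // bearer
    // issuedTokenType(), expiresInSeconds(), and scopes() cover the optional OAuth fields
    // exercised by the longer JSON payloads in the test above.
  }
}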
*/ - package org.apache.iceberg.rest.responses; import com.fasterxml.jackson.core.JsonProcessingException; @@ -29,7 +28,8 @@ import org.junit.Assert; import org.junit.Test; -public class TestUpdateNamespacePropertiesResponse extends RequestResponseTestBase { +public class TestUpdateNamespacePropertiesResponse + extends RequestResponseTestBase { /* Values used to fill in response fields */ private static final List UPDATED = ImmutableList.of("owner"); @@ -44,7 +44,10 @@ public void testRoundTripSerDe() throws JsonProcessingException { assertRoundTripSerializesEquallyFrom( fullJson, UpdateNamespacePropertiesResponse.builder() - .addUpdated(UPDATED).addRemoved(REMOVED).addMissing(MISSING).build()); + .addUpdated(UPDATED) + .addRemoved(REMOVED) + .addMissing(MISSING) + .build()); // Only updated String jsonOnlyUpdated = "{\"removed\":[],\"updated\":[\"owner\"],\"missing\":[]}"; @@ -56,7 +59,10 @@ public void testRoundTripSerDe() throws JsonProcessingException { assertRoundTripSerializesEquallyFrom( jsonOnlyUpdated, UpdateNamespacePropertiesResponse.builder() - .addUpdated(UPDATED).addMissing(EMPTY_LIST).addRemoved(EMPTY_LIST).build()); + .addUpdated(UPDATED) + .addMissing(EMPTY_LIST) + .addRemoved(EMPTY_LIST) + .build()); // Only removed String jsonOnlyRemoved = "{\"removed\":[\"foo\"],\"updated\":[],\"missing\":[]}"; @@ -68,7 +74,10 @@ public void testRoundTripSerDe() throws JsonProcessingException { assertRoundTripSerializesEquallyFrom( jsonOnlyRemoved, UpdateNamespacePropertiesResponse.builder() - .addRemoved(REMOVED).addUpdated(EMPTY_LIST).addMissing(EMPTY_LIST).build()); + .addRemoved(REMOVED) + .addUpdated(EMPTY_LIST) + .addMissing(EMPTY_LIST) + .build()); // Only missing String jsonOnlyMissing = "{\"removed\":[],\"updated\":[],\"missing\":[\"bar\"]}"; @@ -81,11 +90,13 @@ public void testRoundTripSerDe() throws JsonProcessingException { assertRoundTripSerializesEquallyFrom( jsonOnlyMissing, UpdateNamespacePropertiesResponse.builder() - .addMissing(MISSING).addUpdated(EMPTY_LIST).addRemoved(EMPTY_LIST).build()); + .addMissing(MISSING) + .addUpdated(EMPTY_LIST) + .addRemoved(EMPTY_LIST) + .build()); // All fields are empty - String jsonWithAllFieldsAsEmptyList = - "{\"removed\":[],\"updated\":[],\"missing\":[]}"; + String jsonWithAllFieldsAsEmptyList = "{\"removed\":[],\"updated\":[],\"missing\":[]}"; assertRoundTripSerializesEquallyFrom( jsonWithAllFieldsAsEmptyList, UpdateNamespacePropertiesResponse.builder().build()); } @@ -94,37 +105,39 @@ public void testRoundTripSerDe() throws JsonProcessingException { // Test cases that can't be constructed with our Builder class e2e but that will parse correctly public void testCanDeserializeWithoutDefaultValues() throws JsonProcessingException { // only updated - UpdateNamespacePropertiesResponse onlyUpdated = UpdateNamespacePropertiesResponse.builder() - .addUpdated(UPDATED).build(); - String jsonOnlyUpdatedOthersNull = "{\"removed\":null,\"updated\":[\"owner\"],\"missing\":null}"; + UpdateNamespacePropertiesResponse onlyUpdated = + UpdateNamespacePropertiesResponse.builder().addUpdated(UPDATED).build(); + String jsonOnlyUpdatedOthersNull = + "{\"removed\":null,\"updated\":[\"owner\"],\"missing\":null}"; assertEquals(deserialize(jsonOnlyUpdatedOthersNull), onlyUpdated); String jsonOnlyUpdatedOthersMissing = "{\"updated\":[\"owner\"]}"; assertEquals(deserialize(jsonOnlyUpdatedOthersMissing), onlyUpdated); // Only removed - UpdateNamespacePropertiesResponse onlyRemoved = UpdateNamespacePropertiesResponse.builder() - .addRemoved(REMOVED).build(); - 
String jsonOnlyRemovedOthersNull = "{\"removed\":[\"foo\"],\"updated\":null,\"missing\":null}"; + UpdateNamespacePropertiesResponse onlyRemoved = + UpdateNamespacePropertiesResponse.builder().addRemoved(REMOVED).build(); + String jsonOnlyRemovedOthersNull = "{\"removed\":[\"foo\"],\"updated\":null,\"missing\":null}"; assertEquals(deserialize(jsonOnlyRemovedOthersNull), onlyRemoved); - String jsonOnlyRemovedOthersMissing = "{\"removed\":[\"foo\"]}"; + String jsonOnlyRemovedOthersMissing = "{\"removed\":[\"foo\"]}"; assertEquals(deserialize(jsonOnlyRemovedOthersMissing), onlyRemoved); // Only missing - UpdateNamespacePropertiesResponse onlyMissing = UpdateNamespacePropertiesResponse.builder() - .addMissing(MISSING).build(); - String jsonOnlyMissingFieldOthersNull = "{\"removed\":null,\"updated\":null,\"missing\":[\"bar\"]}"; + UpdateNamespacePropertiesResponse onlyMissing = + UpdateNamespacePropertiesResponse.builder().addMissing(MISSING).build(); + String jsonOnlyMissingFieldOthersNull = + "{\"removed\":null,\"updated\":null,\"missing\":[\"bar\"]}"; assertEquals(deserialize(jsonOnlyMissingFieldOthersNull), onlyMissing); String jsonOnlyMissingFieldIsPresent = "{\"missing\":[\"bar\"]}"; assertEquals(deserialize(jsonOnlyMissingFieldIsPresent), onlyMissing); // all fields are missing - UpdateNamespacePropertiesResponse noValues = UpdateNamespacePropertiesResponse.builder().build(); + UpdateNamespacePropertiesResponse noValues = + UpdateNamespacePropertiesResponse.builder().build(); String emptyJson = "{}"; assertEquals(deserialize(emptyJson), noValues); - } @Test @@ -135,16 +148,13 @@ public void testDeserializeInvalidResponse() { AssertHelpers.assertThrows( "A JSON response with an invalid type for one of the fields should fail to parse", JsonProcessingException.class, - () -> deserialize(jsonInvalidTypeOnRemovedField) - ); + () -> deserialize(jsonInvalidTypeOnRemovedField)); - String jsonInvalidTypeOnUpdatedField = - "{\"updated\":\"owner\",\"missing\":[\"bar\"]}"; + String jsonInvalidTypeOnUpdatedField = "{\"updated\":\"owner\",\"missing\":[\"bar\"]}"; AssertHelpers.assertThrows( "A JSON response with an invalid type for one of the fields should fail to parse", JsonProcessingException.class, - () -> deserialize(jsonInvalidTypeOnUpdatedField) - ); + () -> deserialize(jsonInvalidTypeOnUpdatedField)); // Valid top-level (array) types, but at least one entry in the list is not the expected type String jsonInvalidValueOfTypeIntNestedInRemovedList = @@ -152,15 +162,13 @@ public void testDeserializeInvalidResponse() { AssertHelpers.assertThrows( "A JSON response with an invalid type inside one of the list fields should fail to deserialize", JsonProcessingException.class, - () -> deserialize(jsonInvalidValueOfTypeIntNestedInRemovedList) - ); + () -> deserialize(jsonInvalidValueOfTypeIntNestedInRemovedList)); // Exception comes from Jackson AssertHelpers.assertThrows( "A null JSON response body should fail to deserialize", IllegalArgumentException.class, - () -> deserialize(null) - ); + () -> deserialize(null)); } @Test @@ -172,71 +180,62 @@ public void testBuilderDoesNotCreateInvalidObjects() { "The builder should not allow using null as a property that was updated", NullPointerException.class, "Invalid updated property: null", - () -> UpdateNamespacePropertiesResponse.builder().addUpdated((String) null).build() - ); + () -> UpdateNamespacePropertiesResponse.builder().addUpdated((String) null).build()); AssertHelpers.assertThrows( "The builder should not allow passing a null list of properties that 
were removed", NullPointerException.class, "Invalid updated property list: null", - () -> UpdateNamespacePropertiesResponse.builder().addUpdated((List) null).build() - ); + () -> UpdateNamespacePropertiesResponse.builder().addUpdated((List) null).build()); AssertHelpers.assertThrows( "The builder should not allow passing a list of properties that were removed with a null element", IllegalArgumentException.class, "Invalid updated property: null", - () -> UpdateNamespacePropertiesResponse.builder().addUpdated(listContainingNull).build() - ); + () -> UpdateNamespacePropertiesResponse.builder().addUpdated(listContainingNull).build()); // removed AssertHelpers.assertThrows( "The builder should not allow using null as a property that was removed", NullPointerException.class, "Invalid removed property: null", - () -> UpdateNamespacePropertiesResponse.builder().addRemoved((String) null).build() - ); + () -> UpdateNamespacePropertiesResponse.builder().addRemoved((String) null).build()); AssertHelpers.assertThrows( "The builder should not allow passing a null list of properties that were removed", NullPointerException.class, "Invalid removed property list: null", - () -> UpdateNamespacePropertiesResponse.builder().addRemoved((List) null).build() - ); + () -> UpdateNamespacePropertiesResponse.builder().addRemoved((List) null).build()); AssertHelpers.assertThrows( "The builder should not allow passing a list of properties that were removed with a null element", IllegalArgumentException.class, "Invalid removed property: null", - () -> UpdateNamespacePropertiesResponse.builder().addRemoved(listContainingNull).build() - ); + () -> UpdateNamespacePropertiesResponse.builder().addRemoved(listContainingNull).build()); // missing AssertHelpers.assertThrows( "The builder should not allow using null as a property that was missing", NullPointerException.class, "Invalid missing property: null", - () -> UpdateNamespacePropertiesResponse.builder().addMissing((String) null).build() - ); + () -> UpdateNamespacePropertiesResponse.builder().addMissing((String) null).build()); AssertHelpers.assertThrows( "The builder should not allow passing a null list of properties that were missing", NullPointerException.class, "Invalid missing property list: null", - () -> UpdateNamespacePropertiesResponse.builder().addMissing((List) null).build() - ); + () -> UpdateNamespacePropertiesResponse.builder().addMissing((List) null).build()); AssertHelpers.assertThrows( "The builder should not allow passing a list of properties that were missing with a null element", IllegalArgumentException.class, "Invalid missing property: null", - () -> UpdateNamespacePropertiesResponse.builder().addMissing(listContainingNull).build() - ); + () -> UpdateNamespacePropertiesResponse.builder().addMissing(listContainingNull).build()); } @Override public String[] allFieldsFromSpec() { - return new String[] { "updated", "removed", "missing" }; + return new String[] {"updated", "removed", "missing"}; } @Override @@ -249,18 +248,26 @@ public UpdateNamespacePropertiesResponse createExampleInstance() { } @Override - public void assertEquals(UpdateNamespacePropertiesResponse actual, UpdateNamespacePropertiesResponse expected) { - Assert.assertEquals("Properties updated should be equal", - Sets.newHashSet(actual.updated()), Sets.newHashSet(expected.updated())); - Assert.assertEquals("Properties removed should be equal", - Sets.newHashSet(actual.removed()), Sets.newHashSet(expected.removed())); - Assert.assertEquals("Properties missing should be equal", - 
Sets.newHashSet(actual.missing()), Sets.newHashSet(expected.missing())); + public void assertEquals( + UpdateNamespacePropertiesResponse actual, UpdateNamespacePropertiesResponse expected) { + Assert.assertEquals( + "Properties updated should be equal", + Sets.newHashSet(actual.updated()), + Sets.newHashSet(expected.updated())); + Assert.assertEquals( + "Properties removed should be equal", + Sets.newHashSet(actual.removed()), + Sets.newHashSet(expected.removed())); + Assert.assertEquals( + "Properties missing should be equal", + Sets.newHashSet(actual.missing()), + Sets.newHashSet(expected.missing())); } @Override public UpdateNamespacePropertiesResponse deserialize(String json) throws JsonProcessingException { - UpdateNamespacePropertiesResponse resp = mapper().readValue(json, UpdateNamespacePropertiesResponse.class); + UpdateNamespacePropertiesResponse resp = + mapper().readValue(json, UpdateNamespacePropertiesResponse.class); resp.validate(); return resp; } diff --git a/core/src/test/java/org/apache/iceberg/util/FakeTicker.java b/core/src/test/java/org/apache/iceberg/util/FakeTicker.java index ddfa7b2e62a8..66316b6ab26d 100644 --- a/core/src/test/java/org/apache/iceberg/util/FakeTicker.java +++ b/core/src/test/java/org/apache/iceberg/util/FakeTicker.java @@ -16,22 +16,18 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.util; import com.github.benmanes.caffeine.cache.Ticker; import java.time.Duration; import java.util.concurrent.atomic.AtomicLong; -/** - * A {@code Ticker} whose value can be advanced programmatically in tests - */ +/** A {@code Ticker} whose value can be advanced programmatically in tests */ public class FakeTicker implements Ticker { private final AtomicLong nanos = new AtomicLong(); - public FakeTicker() { - } + public FakeTicker() {} public FakeTicker advance(Duration duration) { nanos.addAndGet(duration.toNanos()); diff --git a/core/src/test/java/org/apache/iceberg/util/TestBinPacking.java b/core/src/test/java/org/apache/iceberg/util/TestBinPacking.java index fd8267ff0c3a..59d64d542ba7 100644 --- a/core/src/test/java/org/apache/iceberg/util/TestBinPacking.java +++ b/core/src/test/java/org/apache/iceberg/util/TestBinPacking.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
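[editor's note] FakeTicker above exists so time-based tests can advance a cache's clock deterministically instead of sleeping. Below is a minimal sketch of how such a ticker is typically wired into a Caffeine cache; the specific caches Iceberg builds with it are not part of this hunk, so the cache here is purely illustrative.

import com.github.benmanes.caffeine.cache.Cache;
import com.github.benmanes.caffeine.cache.Caffeine;
import java.time.Duration;
import java.util.concurrent.TimeUnit;
import org.apache.iceberg.util.FakeTicker;

public class FakeTickerSketch {
  public static void main(String[] args) {
    FakeTicker ticker = new FakeTicker();

    Cache<String, String> cache =
        Caffeine.newBuilder()
            .executor(Runnable::run)   // run cache maintenance inline so results are deterministic
            .ticker(ticker)            // the cache reads time from the fake ticker, not System.nanoTime()
            .expireAfterWrite(10, TimeUnit.MINUTES)
            .build();

    cache.put("table", "v1.metadata.json");

    ticker.advance(Duration.ofMinutes(5));
    System.out.println(cache.getIfPresent("table")); // still present

    ticker.advance(Duration.ofMinutes(6));           // now past the 10 minute expiry
    System.out.println(cache.getIfPresent("table")); // null once the entry has expired
  }
}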
*/ - package org.apache.iceberg.util; import java.util.List; @@ -28,107 +27,169 @@ public class TestBinPacking { @Test public void testBasicBinPacking() { - Assert.assertEquals("Should pack the first 2 values", - list(list(1, 2), list(3), list(4), list(5)), pack(list(1, 2, 3, 4, 5), 3)); + Assert.assertEquals( + "Should pack the first 2 values", + list(list(1, 2), list(3), list(4), list(5)), + pack(list(1, 2, 3, 4, 5), 3)); - Assert.assertEquals("Should pack the first 2 values", - list(list(1, 2), list(3), list(4), list(5)), pack(list(1, 2, 3, 4, 5), 5)); + Assert.assertEquals( + "Should pack the first 2 values", + list(list(1, 2), list(3), list(4), list(5)), + pack(list(1, 2, 3, 4, 5), 5)); - Assert.assertEquals("Should pack the first 3 values", - list(list(1, 2, 3), list(4), list(5)), pack(list(1, 2, 3, 4, 5), 6)); + Assert.assertEquals( + "Should pack the first 3 values", + list(list(1, 2, 3), list(4), list(5)), + pack(list(1, 2, 3, 4, 5), 6)); - Assert.assertEquals("Should pack the first 3 values", - list(list(1, 2, 3), list(4), list(5)), pack(list(1, 2, 3, 4, 5), 8)); + Assert.assertEquals( + "Should pack the first 3 values", + list(list(1, 2, 3), list(4), list(5)), + pack(list(1, 2, 3, 4, 5), 8)); - Assert.assertEquals("Should pack the first 3 values, last 2 values", - list(list(1, 2, 3), list(4, 5)), pack(list(1, 2, 3, 4, 5), 9)); + Assert.assertEquals( + "Should pack the first 3 values, last 2 values", + list(list(1, 2, 3), list(4, 5)), + pack(list(1, 2, 3, 4, 5), 9)); - Assert.assertEquals("Should pack the first 4 values", - list(list(1, 2, 3, 4), list(5)), pack(list(1, 2, 3, 4, 5), 10)); + Assert.assertEquals( + "Should pack the first 4 values", + list(list(1, 2, 3, 4), list(5)), + pack(list(1, 2, 3, 4, 5), 10)); - Assert.assertEquals("Should pack the first 4 values", - list(list(1, 2, 3, 4), list(5)), pack(list(1, 2, 3, 4, 5), 14)); + Assert.assertEquals( + "Should pack the first 4 values", + list(list(1, 2, 3, 4), list(5)), + pack(list(1, 2, 3, 4, 5), 14)); - Assert.assertEquals("Should pack the first 5 values", - list(list(1, 2, 3, 4, 5)), pack(list(1, 2, 3, 4, 5), 15)); + Assert.assertEquals( + "Should pack the first 5 values", list(list(1, 2, 3, 4, 5)), pack(list(1, 2, 3, 4, 5), 15)); } @Test public void testReverseBinPackingSingleLookback() { - Assert.assertEquals("Should pack the first 2 values", - list(list(1, 2), list(3), list(4), list(5)), packEnd(list(1, 2, 3, 4, 5), 3, 1)); + Assert.assertEquals( + "Should pack the first 2 values", + list(list(1, 2), list(3), list(4), list(5)), + packEnd(list(1, 2, 3, 4, 5), 3, 1)); - Assert.assertEquals("Should pack the first 2 values", - list(list(1, 2), list(3), list(4), list(5)), packEnd(list(1, 2, 3, 4, 5), 4, 1)); + Assert.assertEquals( + "Should pack the first 2 values", + list(list(1, 2), list(3), list(4), list(5)), + packEnd(list(1, 2, 3, 4, 5), 4, 1)); - Assert.assertEquals("Should pack the second and third values", - list(list(1), list(2, 3), list(4), list(5)), packEnd(list(1, 2, 3, 4, 5), 5, 1)); + Assert.assertEquals( + "Should pack the second and third values", + list(list(1), list(2, 3), list(4), list(5)), + packEnd(list(1, 2, 3, 4, 5), 5, 1)); - Assert.assertEquals("Should pack the first 3 values", - list(list(1, 2, 3), list(4), list(5)), packEnd(list(1, 2, 3, 4, 5), 6, 1)); + Assert.assertEquals( + "Should pack the first 3 values", + list(list(1, 2, 3), list(4), list(5)), + packEnd(list(1, 2, 3, 4, 5), 6, 1)); - Assert.assertEquals("Should pack the first two pairs of values", - list(list(1, 2), list(3, 4), list(5)), 
packEnd(list(1, 2, 3, 4, 5), 7, 1)); + Assert.assertEquals( + "Should pack the first two pairs of values", + list(list(1, 2), list(3, 4), list(5)), + packEnd(list(1, 2, 3, 4, 5), 7, 1)); - Assert.assertEquals("Should pack the first two pairs of values", - list(list(1, 2), list(3, 4), list(5)), packEnd(list(1, 2, 3, 4, 5), 8, 1)); + Assert.assertEquals( + "Should pack the first two pairs of values", + list(list(1, 2), list(3, 4), list(5)), + packEnd(list(1, 2, 3, 4, 5), 8, 1)); - Assert.assertEquals("Should pack the first 3 values, last 2 values", - list(list(1, 2, 3), list(4, 5)), packEnd(list(1, 2, 3, 4, 5), 9, 1)); + Assert.assertEquals( + "Should pack the first 3 values, last 2 values", + list(list(1, 2, 3), list(4, 5)), + packEnd(list(1, 2, 3, 4, 5), 9, 1)); - Assert.assertEquals("Should pack the first 3 values, last 2 values", - list(list(1, 2, 3), list(4, 5)), packEnd(list(1, 2, 3, 4, 5), 11, 1)); + Assert.assertEquals( + "Should pack the first 3 values, last 2 values", + list(list(1, 2, 3), list(4, 5)), + packEnd(list(1, 2, 3, 4, 5), 11, 1)); - Assert.assertEquals("Should pack the first 3 values, last 2 values", - list(list(1, 2), list(3, 4, 5)), packEnd(list(1, 2, 3, 4, 5), 12, 1)); + Assert.assertEquals( + "Should pack the first 3 values, last 2 values", + list(list(1, 2), list(3, 4, 5)), + packEnd(list(1, 2, 3, 4, 5), 12, 1)); - Assert.assertEquals("Should pack the last 4 values", - list(list(1), list(2, 3, 4, 5)), packEnd(list(1, 2, 3, 4, 5), 14, 1)); + Assert.assertEquals( + "Should pack the last 4 values", + list(list(1), list(2, 3, 4, 5)), + packEnd(list(1, 2, 3, 4, 5), 14, 1)); - Assert.assertEquals("Should pack the first 5 values", - list(list(1, 2, 3, 4, 5)), packEnd(list(1, 2, 3, 4, 5), 15, 1)); + Assert.assertEquals( + "Should pack the first 5 values", + list(list(1, 2, 3, 4, 5)), + packEnd(list(1, 2, 3, 4, 5), 15, 1)); } @Test public void testReverseBinPackingUnlimitedLookback() { - Assert.assertEquals("Should pack the first 2 values", - list(list(1, 2), list(3), list(4), list(5)), packEnd(list(1, 2, 3, 4, 5), 3)); + Assert.assertEquals( + "Should pack the first 2 values", + list(list(1, 2), list(3), list(4), list(5)), + packEnd(list(1, 2, 3, 4, 5), 3)); - Assert.assertEquals("Should pack 1 with 3", - list(list(2), list(1, 3), list(4), list(5)), packEnd(list(1, 2, 3, 4, 5), 4)); + Assert.assertEquals( + "Should pack 1 with 3", + list(list(2), list(1, 3), list(4), list(5)), + packEnd(list(1, 2, 3, 4, 5), 4)); - Assert.assertEquals("Should pack 2,3 and 1,4", - list(list(2, 3), list(1, 4), list(5)), packEnd(list(1, 2, 3, 4, 5), 5)); + Assert.assertEquals( + "Should pack 2,3 and 1,4", + list(list(2, 3), list(1, 4), list(5)), + packEnd(list(1, 2, 3, 4, 5), 5)); - Assert.assertEquals("Should pack 2,4 and 1,5", - list(list(3), list(2, 4), list(1, 5)), packEnd(list(1, 2, 3, 4, 5), 6)); + Assert.assertEquals( + "Should pack 2,4 and 1,5", + list(list(3), list(2, 4), list(1, 5)), + packEnd(list(1, 2, 3, 4, 5), 6)); - Assert.assertEquals("Should pack 3,4 and 2,5", - list(list(1), list(3, 4), list(2, 5)), packEnd(list(1, 2, 3, 4, 5), 7)); + Assert.assertEquals( + "Should pack 3,4 and 2,5", + list(list(1), list(3, 4), list(2, 5)), + packEnd(list(1, 2, 3, 4, 5), 7)); - Assert.assertEquals("Should pack 1,2,3 and 3,5", - list(list(1, 2, 4), list(3, 5)), packEnd(list(1, 2, 3, 4, 5), 8)); + Assert.assertEquals( + "Should pack 1,2,3 and 3,5", + list(list(1, 2, 4), list(3, 5)), + packEnd(list(1, 2, 3, 4, 5), 8)); - Assert.assertEquals("Should pack the first 3 values, last 2 values", - 
list(list(1, 2, 3), list(4, 5)), packEnd(list(1, 2, 3, 4, 5), 9)); + Assert.assertEquals( + "Should pack the first 3 values, last 2 values", + list(list(1, 2, 3), list(4, 5)), + packEnd(list(1, 2, 3, 4, 5), 9)); - Assert.assertEquals("Should pack 2,3 and 1,4,5", - list(list(2, 3), list(1, 4, 5)), packEnd(list(1, 2, 3, 4, 5), 10)); + Assert.assertEquals( + "Should pack 2,3 and 1,4,5", + list(list(2, 3), list(1, 4, 5)), + packEnd(list(1, 2, 3, 4, 5), 10)); - Assert.assertEquals("Should pack 1,3 and 2,4,5", - list(list(1, 3), list(2, 4, 5)), packEnd(list(1, 2, 3, 4, 5), 11)); + Assert.assertEquals( + "Should pack 1,3 and 2,4,5", + list(list(1, 3), list(2, 4, 5)), + packEnd(list(1, 2, 3, 4, 5), 11)); - Assert.assertEquals("Should pack 1,2 and 3,4,5", - list(list(1, 2), list(3, 4, 5)), packEnd(list(1, 2, 3, 4, 5), 12)); + Assert.assertEquals( + "Should pack 1,2 and 3,4,5", + list(list(1, 2), list(3, 4, 5)), + packEnd(list(1, 2, 3, 4, 5), 12)); - Assert.assertEquals("Should pack 1,2 and 3,4,5", - list(list(2), list(1, 3, 4, 5)), packEnd(list(1, 2, 3, 4, 5), 13)); + Assert.assertEquals( + "Should pack 1,2 and 3,4,5", + list(list(2), list(1, 3, 4, 5)), + packEnd(list(1, 2, 3, 4, 5), 13)); - Assert.assertEquals("Should pack the last 4 values", - list(list(1), list(2, 3, 4, 5)), packEnd(list(1, 2, 3, 4, 5), 14)); + Assert.assertEquals( + "Should pack the last 4 values", + list(list(1), list(2, 3, 4, 5)), + packEnd(list(1, 2, 3, 4, 5), 14)); - Assert.assertEquals("Should pack the first 5 values", - list(list(1, 2, 3, 4, 5)), packEnd(list(1, 2, 3, 4, 5), 15)); + Assert.assertEquals( + "Should pack the first 5 values", + list(list(1, 2, 3, 4, 5)), + packEnd(list(1, 2, 3, 4, 5), 15)); } @Test @@ -140,8 +201,10 @@ public void testBinPackingLookBack() { // 4. [5, 1, 1], [5] // 5. [5, 1, 1], [5], [5] // 6. [5, 1, 1, 1], [5], [5] - Assert.assertEquals("Unlimited look-back: should merge ones into first bin", - list(list(5, 1, 1, 1), list(5), list(5)), pack(list(5, 1, 5, 1, 5, 1), 8)); + Assert.assertEquals( + "Unlimited look-back: should merge ones into first bin", + list(list(5, 1, 1, 1), list(5), list(5)), + pack(list(5, 1, 5, 1, 5, 1), 8)); // lookback state: // 1. [5] @@ -150,8 +213,10 @@ public void testBinPackingLookBack() { // 4. [5, 1, 1], [5] // 5. [5], [5] ([5, 1, 1] drops out of look-back) // 6. [5, 1], [5] - Assert.assertEquals("2 bin look-back: should merge two ones into first bin", - list(list(5, 1, 1), list(5, 1), list(5)), pack(list(5, 1, 5, 1, 5, 1), 8, 2)); + Assert.assertEquals( + "2 bin look-back: should merge two ones into first bin", + list(list(5, 1, 1), list(5, 1), list(5)), + pack(list(5, 1, 5, 1, 5, 1), 8, 2)); // lookback state: // 1. [5] @@ -160,10 +225,13 @@ public void testBinPackingLookBack() { // 4. [5, 1] // 5. [5] ([5, 1] #2 drops out of look-back) // 6. 
[5, 1] - Assert.assertEquals("1 bin look-back: should merge ones with fives", - list(list(5, 1), list(5, 1), list(5, 1)), pack(list(5, 1, 5, 1, 5, 1), 8, 1)); + Assert.assertEquals( + "1 bin look-back: should merge ones with fives", + list(list(5, 1), list(5, 1), list(5, 1)), + pack(list(5, 1, 5, 1, 5, 1), 8, 1)); - Assert.assertEquals("2 bin look-back: should merge until targetWeight when largestBinFirst is enabled", + Assert.assertEquals( + "2 bin look-back: should merge until targetWeight when largestBinFirst is enabled", list(list(36, 36, 36), list(128), list(36, 65), list(65)), pack(list(36, 36, 36, 36, 65, 65, 128), 128, 2, true)); @@ -181,7 +249,8 @@ private List> pack(List items, long targetWeight, int loo return pack(items, targetWeight, lookback, false); } - private List> pack(List items, long targetWeight, int lookback, boolean largestBinFirst) { + private List> pack( + List items, long targetWeight, int lookback, boolean largestBinFirst) { ListPacker packer = new ListPacker<>(targetWeight, lookback, largestBinFirst); return packer.pack(items, Integer::longValue); } diff --git a/core/src/test/java/org/apache/iceberg/util/TestDateTimeUtil.java b/core/src/test/java/org/apache/iceberg/util/TestDateTimeUtil.java index 4c27f69b69ef..3563e92118e7 100644 --- a/core/src/test/java/org/apache/iceberg/util/TestDateTimeUtil.java +++ b/core/src/test/java/org/apache/iceberg/util/TestDateTimeUtil.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.util; import java.time.ZonedDateTime; @@ -33,6 +32,7 @@ public void formatTimestampMillis() { timestamp = "1970-01-01T00:16:40+00:00"; Assertions.assertThat(DateTimeUtil.formatTimestampMillis(1000000L)).isEqualTo(timestamp); - Assertions.assertThat(ZonedDateTime.parse(timestamp).toInstant().toEpochMilli()).isEqualTo(1000000L); + Assertions.assertThat(ZonedDateTime.parse(timestamp).toInstant().toEpochMilli()) + .isEqualTo(1000000L); } } diff --git a/core/src/test/java/org/apache/iceberg/util/TestEnvironmentUtil.java b/core/src/test/java/org/apache/iceberg/util/TestEnvironmentUtil.java index 6ad52a1f969a..d7fb17b0e7eb 100644 --- a/core/src/test/java/org/apache/iceberg/util/TestEnvironmentUtil.java +++ b/core/src/test/java/org/apache/iceberg/util/TestEnvironmentUtil.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.util; import java.util.Map; @@ -35,9 +34,10 @@ public void testEnvironmentSubstitution() { @Test public void testMultipleEnvironmentSubstitutions() { - Map result = EnvironmentUtil.resolveAll( - ImmutableMap.of("USER", "u", "VAR", "value"), - ImmutableMap.of("user-test", "env:USER", "other", "left-alone", "var", "env:VAR")); + Map result = + EnvironmentUtil.resolveAll( + ImmutableMap.of("USER", "u", "VAR", "value"), + ImmutableMap.of("user-test", "env:USER", "other", "left-alone", "var", "env:VAR")); Assertions.assertEquals( ImmutableMap.of("user-test", "u", "other", "left-alone", "var", "value"), @@ -47,13 +47,10 @@ public void testMultipleEnvironmentSubstitutions() { @Test public void testEnvironmentSubstitutionWithMissingVar() { - Map result = EnvironmentUtil.resolveAll( - ImmutableMap.of(), - ImmutableMap.of("user-test", "env:USER")); + Map result = + EnvironmentUtil.resolveAll(ImmutableMap.of(), ImmutableMap.of("user-test", "env:USER")); Assertions.assertEquals( - ImmutableMap.of(), - result, - "Should not contain values with missing environment variables"); + ImmutableMap.of(), result, "Should not contain values with missing environment variables"); } } diff --git a/core/src/test/java/org/apache/iceberg/util/TestInMemoryLockManager.java b/core/src/test/java/org/apache/iceberg/util/TestInMemoryLockManager.java index e62f7ea8ca4c..4d56d1b2028c 100644 --- a/core/src/test/java/org/apache/iceberg/util/TestInMemoryLockManager.java +++ b/core/src/test/java/org/apache/iceberg/util/TestInMemoryLockManager.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.util; import java.util.List; @@ -42,8 +41,7 @@ public class TestInMemoryLockManager { private String lockEntityId; private String ownerId; - @Rule - public Timeout timeout = new Timeout(5, TimeUnit.SECONDS); + @Rule public Timeout timeout = new Timeout(5, TimeUnit.SECONDS); @Before public void before() { @@ -60,7 +58,8 @@ public void after() { @Test public void testAcquireOnceSingleProcess() { lockManager.acquireOnce(lockEntityId, ownerId); - AssertHelpers.assertThrows("should fail when acquire again", + AssertHelpers.assertThrows( + "should fail when acquire again", IllegalStateException.class, "currently held", () -> lockManager.acquireOnce(lockEntityId, ownerId)); @@ -68,99 +67,106 @@ public void testAcquireOnceSingleProcess() { @Test public void testAcquireOnceMultiProcesses() { - List results = IntStream.range(0, 10).parallel() - .mapToObj(i -> { - try { - lockManager.acquireOnce(lockEntityId, ownerId); - return true; - } catch (IllegalStateException e) { - return false; - } - }) - .collect(Collectors.toList()); + List results = + IntStream.range(0, 10) + .parallel() + .mapToObj( + i -> { + try { + lockManager.acquireOnce(lockEntityId, ownerId); + return true; + } catch (IllegalStateException e) { + return false; + } + }) + .collect(Collectors.toList()); Assert.assertEquals( - "only 1 thread should have acquired the lock", - 1, results.stream().filter(s -> s).count()); + "only 1 thread should have acquired the lock", 1, results.stream().filter(s -> s).count()); } @Test public void testReleaseAndAcquire() { Assert.assertTrue(lockManager.acquire(lockEntityId, ownerId)); Assert.assertTrue(lockManager.release(lockEntityId, ownerId)); - Assert.assertTrue("acquire after release should succeed", lockManager.acquire(lockEntityId, ownerId)); + Assert.assertTrue( + "acquire after release should succeed", 
lockManager.acquire(lockEntityId, ownerId)); } @Test public void testReleaseWithWrongOwner() { Assert.assertTrue(lockManager.acquire(lockEntityId, ownerId)); - Assert.assertFalse("should return false if ownerId is wrong", + Assert.assertFalse( + "should return false if ownerId is wrong", lockManager.release(lockEntityId, UUID.randomUUID().toString())); } @Test public void testAcquireSingleProcess() throws Exception { - lockManager.initialize(ImmutableMap.of( - CatalogProperties.LOCK_ACQUIRE_INTERVAL_MS, "500", - CatalogProperties.LOCK_ACQUIRE_TIMEOUT_MS, "2000" - )); + lockManager.initialize( + ImmutableMap.of( + CatalogProperties.LOCK_ACQUIRE_INTERVAL_MS, "500", + CatalogProperties.LOCK_ACQUIRE_TIMEOUT_MS, "2000")); Assert.assertTrue(lockManager.acquire(lockEntityId, ownerId)); String oldOwner = ownerId; - CompletableFuture.supplyAsync(() -> { - try { - Thread.sleep(200); - } catch (InterruptedException e) { - throw new RuntimeException(e); - } - Assert.assertTrue(lockManager.release(lockEntityId, oldOwner)); - return null; - }); + CompletableFuture.supplyAsync( + () -> { + try { + Thread.sleep(200); + } catch (InterruptedException e) { + throw new RuntimeException(e); + } + Assert.assertTrue(lockManager.release(lockEntityId, oldOwner)); + return null; + }); ownerId = UUID.randomUUID().toString(); long start = System.currentTimeMillis(); Assert.assertTrue(lockManager.acquire(lockEntityId, ownerId)); - Assert.assertTrue("should succeed after 200ms", - System.currentTimeMillis() - start >= 200); + Assert.assertTrue("should succeed after 200ms", System.currentTimeMillis() - start >= 200); } @Test public void testAcquireMultiProcessAllSucceed() { - lockManager.initialize(ImmutableMap.of( - CatalogProperties.LOCK_ACQUIRE_INTERVAL_MS, "500" - )); + lockManager.initialize(ImmutableMap.of(CatalogProperties.LOCK_ACQUIRE_INTERVAL_MS, "500")); long start = System.currentTimeMillis(); - List results = IntStream.range(0, 3).parallel() - .mapToObj(i -> { - String owner = UUID.randomUUID().toString(); - boolean succeeded = lockManager.acquire(lockEntityId, owner); - if (succeeded) { - try { - Thread.sleep(1000); - } catch (InterruptedException e) { - throw new RuntimeException(e); - } - Assert.assertTrue(lockManager.release(lockEntityId, owner)); - } - return succeeded; - }) - .collect(Collectors.toList()); - Assert.assertEquals("all lock acquire should succeed sequentially", - 3, results.stream().filter(s -> s).count()); + List results = + IntStream.range(0, 3) + .parallel() + .mapToObj( + i -> { + String owner = UUID.randomUUID().toString(); + boolean succeeded = lockManager.acquire(lockEntityId, owner); + if (succeeded) { + try { + Thread.sleep(1000); + } catch (InterruptedException e) { + throw new RuntimeException(e); + } + Assert.assertTrue(lockManager.release(lockEntityId, owner)); + } + return succeeded; + }) + .collect(Collectors.toList()); + Assert.assertEquals( + "all lock acquire should succeed sequentially", 3, results.stream().filter(s -> s).count()); Assert.assertTrue("must take more than 3 seconds", System.currentTimeMillis() - start >= 3000); } @Test public void testAcquireMultiProcessOnlyOneSucceed() { - lockManager.initialize(ImmutableMap.of( - CatalogProperties.LOCK_HEARTBEAT_INTERVAL_MS, "100", - CatalogProperties.LOCK_ACQUIRE_INTERVAL_MS, "500", - CatalogProperties.LOCK_ACQUIRE_TIMEOUT_MS, "2000" - )); - - List results = IntStream.range(0, 3).parallel() - .mapToObj(i -> lockManager.acquire(lockEntityId, ownerId)) - .collect(Collectors.toList()); - Assert.assertEquals("only 1 thread 
should have acquired the lock", - 1, results.stream().filter(s -> s).count()); + lockManager.initialize( + ImmutableMap.of( + CatalogProperties.LOCK_HEARTBEAT_INTERVAL_MS, "100", + CatalogProperties.LOCK_ACQUIRE_INTERVAL_MS, "500", + CatalogProperties.LOCK_ACQUIRE_TIMEOUT_MS, "2000")); + + List results = + IntStream.range(0, 3) + .parallel() + .mapToObj(i -> lockManager.acquire(lockEntityId, ownerId)) + .collect(Collectors.toList()); + Assert.assertEquals( + "only 1 thread should have acquired the lock", 1, results.stream().filter(s -> s).count()); } } diff --git a/core/src/test/java/org/apache/iceberg/util/TestLocationUtil.java b/core/src/test/java/org/apache/iceberg/util/TestLocationUtil.java index 69ba60d6d100..89d41371ffd4 100644 --- a/core/src/test/java/org/apache/iceberg/util/TestLocationUtil.java +++ b/core/src/test/java/org/apache/iceberg/util/TestLocationUtil.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.util; import org.apache.iceberg.AssertHelpers; @@ -28,30 +27,38 @@ public class TestLocationUtil { @Test public void testStripTrailingSlash() { String pathWithoutTrailingSlash = "s3://bucket/db/tbl"; - Assert.assertEquals("Should have no trailing slashes", pathWithoutTrailingSlash, + Assert.assertEquals( + "Should have no trailing slashes", + pathWithoutTrailingSlash, LocationUtil.stripTrailingSlash(pathWithoutTrailingSlash)); - String pathWithSingleTrailingSlash = pathWithoutTrailingSlash + "/"; - Assert.assertEquals("Should have no trailing slashes", pathWithoutTrailingSlash, + String pathWithSingleTrailingSlash = pathWithoutTrailingSlash + "/"; + Assert.assertEquals( + "Should have no trailing slashes", + pathWithoutTrailingSlash, LocationUtil.stripTrailingSlash(pathWithSingleTrailingSlash)); - String pathWithMultipleTrailingSlash = pathWithoutTrailingSlash + "////"; - Assert.assertEquals("Should have no trailing slashes", pathWithoutTrailingSlash, + String pathWithMultipleTrailingSlash = pathWithoutTrailingSlash + "////"; + Assert.assertEquals( + "Should have no trailing slashes", + pathWithoutTrailingSlash, LocationUtil.stripTrailingSlash(pathWithMultipleTrailingSlash)); String pathWithOnlySlash = "////"; - Assert.assertEquals("Should have no trailing slashes", "", - LocationUtil.stripTrailingSlash(pathWithOnlySlash)); + Assert.assertEquals( + "Should have no trailing slashes", "", LocationUtil.stripTrailingSlash(pathWithOnlySlash)); } @Test public void testStripTrailingSlashWithInvalidPath() { - String [] invalidPaths = new String[] {null, ""}; + String[] invalidPaths = new String[] {null, ""}; for (String invalidPath : invalidPaths) { - AssertHelpers.assertThrows("path must be valid", IllegalArgumentException.class, "path must not be null or empty", + AssertHelpers.assertThrows( + "path must be valid", + IllegalArgumentException.class, + "path must not be null or empty", () -> LocationUtil.stripTrailingSlash(invalidPath)); } - } } diff --git a/core/src/test/java/org/apache/iceberg/util/TestLockManagers.java b/core/src/test/java/org/apache/iceberg/util/TestLockManagers.java index 9bb80e025ea7..a1a5c0cb9166 100644 --- a/core/src/test/java/org/apache/iceberg/util/TestLockManagers.java +++ b/core/src/test/java/org/apache/iceberg/util/TestLockManagers.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.util; import java.util.Map; @@ -30,7 +29,8 @@ public class TestLockManagers { @Test public void testLoadDefaultLockManager() { - Assertions.assertThat(LockManagers.defaultLockManager()).isInstanceOf(LockManagers.InMemoryLockManager.class); + Assertions.assertThat(LockManagers.defaultLockManager()) + .isInstanceOf(LockManagers.InMemoryLockManager.class); } @Test @@ -53,13 +53,9 @@ public boolean release(String entityId, String ownerId) { } @Override - public void close() throws Exception { - - } + public void close() throws Exception {} @Override - public void initialize(Map properties) { - - } + public void initialize(Map properties) {} } } diff --git a/core/src/test/java/org/apache/iceberg/util/TestReachableFileUtil.java b/core/src/test/java/org/apache/iceberg/util/TestReachableFileUtil.java index 596becefb2f1..4ef213e320c8 100644 --- a/core/src/test/java/org/apache/iceberg/util/TestReachableFileUtil.java +++ b/core/src/test/java/org/apache/iceberg/util/TestReachableFileUtil.java @@ -16,9 +16,10 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.util; +import static org.apache.iceberg.types.Types.NestedField.optional; + import java.io.File; import java.util.List; import java.util.Set; @@ -45,30 +46,28 @@ import org.junit.Test; import org.junit.rules.TemporaryFolder; -import static org.apache.iceberg.types.Types.NestedField.optional; - public class TestReachableFileUtil { private static final HadoopTables TABLES = new HadoopTables(new Configuration()); - private static final Schema SCHEMA = new Schema( - optional(1, "c1", Types.IntegerType.get()), - optional(2, "c2", Types.StringType.get()) - ); + private static final Schema SCHEMA = + new Schema( + optional(1, "c1", Types.IntegerType.get()), optional(2, "c2", Types.StringType.get())); private static final PartitionSpec SPEC = PartitionSpec.builderFor(SCHEMA).identity("c1").build(); - private static final DataFile FILE_A = DataFiles.builder(SPEC) - .withPath("/path/to/data-a.parquet") - .withFileSizeInBytes(10) - .withRecordCount(1) - .build(); - private static final DataFile FILE_B = DataFiles.builder(SPEC) - .withPath("/path/to/data-b.parquet") - .withFileSizeInBytes(10) - .withRecordCount(1) - .build(); - - @Rule - public TemporaryFolder temp = new TemporaryFolder(); + private static final DataFile FILE_A = + DataFiles.builder(SPEC) + .withPath("/path/to/data-a.parquet") + .withFileSizeInBytes(10) + .withRecordCount(1) + .build(); + private static final DataFile FILE_B = + DataFiles.builder(SPEC) + .withPath("/path/to/data-b.parquet") + .withFileSizeInBytes(10) + .withRecordCount(1) + .build(); + + @Rule public TemporaryFolder temp = new TemporaryFolder(); private Table table; @@ -81,13 +80,9 @@ public void setupTableLocation() throws Exception { @Test public void testManifestListLocations() { - table.newAppend() - .appendFile(FILE_A) - .commit(); + table.newAppend().appendFile(FILE_A).commit(); - table.newAppend() - .appendFile(FILE_B) - .commit(); + table.newAppend().appendFile(FILE_B).commit(); List manifestListPaths = ReachableFileUtil.manifestListLocations(table); Assert.assertEquals(manifestListPaths.size(), 2); @@ -95,17 +90,11 @@ public void testManifestListLocations() { @Test public void testMetadataFileLocations() { - table.updateProperties() - .set(TableProperties.METADATA_PREVIOUS_VERSIONS_MAX, "1") - .commit(); + table.updateProperties().set(TableProperties.METADATA_PREVIOUS_VERSIONS_MAX, "1").commit(); - table.newAppend() - 
.appendFile(FILE_A) - .commit(); + table.newAppend().appendFile(FILE_A).commit(); - table.newAppend() - .appendFile(FILE_B) - .commit(); + table.newAppend().appendFile(FILE_B).commit(); Set metadataFileLocations = ReachableFileUtil.metadataFileLocations(table, true); Assert.assertEquals(metadataFileLocations.size(), 4); @@ -116,19 +105,13 @@ public void testMetadataFileLocations() { @Test public void testMetadataFileLocationsWithMissingFiles() { - table.updateProperties() - .set(TableProperties.METADATA_PREVIOUS_VERSIONS_MAX, "1") - .commit(); + table.updateProperties().set(TableProperties.METADATA_PREVIOUS_VERSIONS_MAX, "1").commit(); - table.newAppend() - .appendFile(FILE_A) - .commit(); + table.newAppend().appendFile(FILE_A).commit(); TableOperations operations = ((HasTableOperations) table).operations(); String location = operations.current().metadataFileLocation(); - table.newAppend() - .appendFile(FILE_B) - .commit(); + table.newAppend().appendFile(FILE_B).commit(); // delete v3.metadata.json making v2.metadata.json and v1.metadata.json inaccessible table.io().deleteFile(location); diff --git a/core/src/test/java/org/apache/iceberg/util/TestSortOrderUtil.java b/core/src/test/java/org/apache/iceberg/util/TestSortOrderUtil.java index 090b5b8aaf57..516279bc7f7b 100644 --- a/core/src/test/java/org/apache/iceberg/util/TestSortOrderUtil.java +++ b/core/src/test/java/org/apache/iceberg/util/TestSortOrderUtil.java @@ -16,9 +16,12 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.util; +import static org.apache.iceberg.NullOrder.NULLS_LAST; +import static org.apache.iceberg.SortDirection.ASC; +import static org.apache.iceberg.types.Types.NestedField.required; + import java.io.File; import java.io.IOException; import org.apache.iceberg.PartitionSpec; @@ -34,22 +37,17 @@ import org.junit.Test; import org.junit.rules.TemporaryFolder; -import static org.apache.iceberg.NullOrder.NULLS_LAST; -import static org.apache.iceberg.SortDirection.ASC; -import static org.apache.iceberg.types.Types.NestedField.required; - public class TestSortOrderUtil { // column ids will be reassigned during table creation - private static final Schema SCHEMA = new Schema( - required(10, "id", Types.IntegerType.get()), - required(11, "data", Types.StringType.get()), - required(12, "ts", Types.TimestampType.withZone()), - required(13, "category", Types.StringType.get()) - ); - - @Rule - public TemporaryFolder temp = new TemporaryFolder(); + private static final Schema SCHEMA = + new Schema( + required(10, "id", Types.IntegerType.get()), + required(11, "data", Types.StringType.get()), + required(12, "ts", Types.TimestampType.withZone()), + required(13, "category", Types.StringType.get())); + + @Rule public TemporaryFolder temp = new TemporaryFolder(); private File tableDir = null; @Before @@ -65,10 +63,7 @@ public void cleanupTables() { @Test public void testEmptySpecsV1() { PartitionSpec spec = PartitionSpec.unpartitioned(); - SortOrder order = SortOrder.builderFor(SCHEMA) - .withOrderId(1) - .asc("id", NULLS_LAST) - .build(); + SortOrder order = SortOrder.builderFor(SCHEMA).withOrderId(1).asc("id", NULLS_LAST).build(); TestTables.TestTable table = TestTables.create(tableDir, "test", SCHEMA, spec, order, 1); // pass PartitionSpec.unpartitioned() on purpose as it has an empty schema @@ -78,16 +73,14 @@ public void testEmptySpecsV1() { Assert.assertEquals("Order must have 1 field", 1, actualOrder.fields().size()); Assert.assertEquals("Field id must be fresh", 1, 
actualOrder.fields().get(0).sourceId()); Assert.assertEquals("Direction must match", ASC, actualOrder.fields().get(0).direction()); - Assert.assertEquals("Null order must match", NULLS_LAST, actualOrder.fields().get(0).nullOrder()); + Assert.assertEquals( + "Null order must match", NULLS_LAST, actualOrder.fields().get(0).nullOrder()); } @Test public void testEmptySpecsV2() { PartitionSpec spec = PartitionSpec.unpartitioned(); - SortOrder order = SortOrder.builderFor(SCHEMA) - .withOrderId(1) - .asc("id", NULLS_LAST) - .build(); + SortOrder order = SortOrder.builderFor(SCHEMA).withOrderId(1).asc("id", NULLS_LAST).build(); TestTables.TestTable table = TestTables.create(tableDir, "test", SCHEMA, spec, order, 2); // pass PartitionSpec.unpartitioned() on purpose as it has an empty schema @@ -97,221 +90,217 @@ public void testEmptySpecsV2() { Assert.assertEquals("Order must have 1 field", 1, actualOrder.fields().size()); Assert.assertEquals("Field id must be fresh", 1, actualOrder.fields().get(0).sourceId()); Assert.assertEquals("Direction must match", ASC, actualOrder.fields().get(0).direction()); - Assert.assertEquals("Null order must match", NULLS_LAST, actualOrder.fields().get(0).nullOrder()); + Assert.assertEquals( + "Null order must match", NULLS_LAST, actualOrder.fields().get(0).nullOrder()); } @Test public void testSortOrderClusteringNoPartitionFields() { - PartitionSpec spec = PartitionSpec.builderFor(SCHEMA) - .day("ts") - .identity("category") - .build(); - SortOrder order = SortOrder.builderFor(SCHEMA) - .withOrderId(1) - .desc("id") - .build(); - - SortOrder expected = SortOrder.builderFor(SCHEMA) - .withOrderId(1) - .asc(Expressions.day("ts")) - .asc("category") - .desc("id") - .build(); - - Assert.assertEquals("Should add spec fields as prefix", - expected, SortOrderUtil.buildSortOrder(SCHEMA, spec, order)); + PartitionSpec spec = PartitionSpec.builderFor(SCHEMA).day("ts").identity("category").build(); + SortOrder order = SortOrder.builderFor(SCHEMA).withOrderId(1).desc("id").build(); + + SortOrder expected = + SortOrder.builderFor(SCHEMA) + .withOrderId(1) + .asc(Expressions.day("ts")) + .asc("category") + .desc("id") + .build(); + + Assert.assertEquals( + "Should add spec fields as prefix", + expected, + SortOrderUtil.buildSortOrder(SCHEMA, spec, order)); } @Test public void testSortOrderClusteringAllPartitionFields() { - PartitionSpec spec = PartitionSpec.builderFor(SCHEMA) - .day("ts") - .identity("category") - .build(); - SortOrder order = SortOrder.builderFor(SCHEMA) - .withOrderId(1) - .asc(Expressions.day("ts")) - .asc("category") - .desc("id") - .build(); - - Assert.assertEquals("Should leave the order unchanged", - order, SortOrderUtil.buildSortOrder(SCHEMA, spec, order)); + PartitionSpec spec = PartitionSpec.builderFor(SCHEMA).day("ts").identity("category").build(); + SortOrder order = + SortOrder.builderFor(SCHEMA) + .withOrderId(1) + .asc(Expressions.day("ts")) + .asc("category") + .desc("id") + .build(); + + Assert.assertEquals( + "Should leave the order unchanged", + order, + SortOrderUtil.buildSortOrder(SCHEMA, spec, order)); } @Test public void testSortOrderClusteringAllPartitionFieldsReordered() { - PartitionSpec spec = PartitionSpec.builderFor(SCHEMA) - .identity("category") - .day("ts") - .build(); - SortOrder order = SortOrder.builderFor(SCHEMA) - .withOrderId(1) - .asc(Expressions.day("ts")) - .asc("category") - .desc("id") - .build(); - - Assert.assertEquals("Should leave the order unchanged", - order, SortOrderUtil.buildSortOrder(SCHEMA, spec, order)); + 
PartitionSpec spec = PartitionSpec.builderFor(SCHEMA).identity("category").day("ts").build(); + SortOrder order = + SortOrder.builderFor(SCHEMA) + .withOrderId(1) + .asc(Expressions.day("ts")) + .asc("category") + .desc("id") + .build(); + + Assert.assertEquals( + "Should leave the order unchanged", + order, + SortOrderUtil.buildSortOrder(SCHEMA, spec, order)); } @Test public void testSortOrderClusteringSomePartitionFields() { - PartitionSpec spec = PartitionSpec.builderFor(SCHEMA) - .identity("category") - .day("ts") - .build(); - SortOrder order = SortOrder.builderFor(SCHEMA) - .withOrderId(1) - .asc("category") - .desc("id") - .build(); - - SortOrder expected = SortOrder.builderFor(SCHEMA) - .withOrderId(1) - .asc(Expressions.day("ts")) - .asc("category") - .desc("id") - .build(); - - Assert.assertEquals("Should add spec fields as prefix", - expected, SortOrderUtil.buildSortOrder(SCHEMA, spec, order)); + PartitionSpec spec = PartitionSpec.builderFor(SCHEMA).identity("category").day("ts").build(); + SortOrder order = + SortOrder.builderFor(SCHEMA).withOrderId(1).asc("category").desc("id").build(); + + SortOrder expected = + SortOrder.builderFor(SCHEMA) + .withOrderId(1) + .asc(Expressions.day("ts")) + .asc("category") + .desc("id") + .build(); + + Assert.assertEquals( + "Should add spec fields as prefix", + expected, + SortOrderUtil.buildSortOrder(SCHEMA, spec, order)); } @Test public void testSortOrderClusteringSatisfiedPartitionLast() { - PartitionSpec spec = PartitionSpec.builderFor(SCHEMA) - .identity("category") - .day("ts") - .build(); - SortOrder order = SortOrder.builderFor(SCHEMA) - .withOrderId(1) - .asc("category") - .asc("ts") // satisfies the ordering of days(ts) - .desc("id") - .build(); - - SortOrder expected = SortOrder.builderFor(SCHEMA) - .withOrderId(1) - .asc("category") - .asc("ts") - .desc("id") - .build(); - - Assert.assertEquals("Should add spec fields as prefix", - expected, SortOrderUtil.buildSortOrder(SCHEMA, spec, order)); + PartitionSpec spec = PartitionSpec.builderFor(SCHEMA).identity("category").day("ts").build(); + SortOrder order = + SortOrder.builderFor(SCHEMA) + .withOrderId(1) + .asc("category") + .asc("ts") // satisfies the ordering of days(ts) + .desc("id") + .build(); + + SortOrder expected = + SortOrder.builderFor(SCHEMA).withOrderId(1).asc("category").asc("ts").desc("id").build(); + + Assert.assertEquals( + "Should add spec fields as prefix", + expected, + SortOrderUtil.buildSortOrder(SCHEMA, spec, order)); } @Test public void testSortOrderClusteringSatisfiedPartitionFirst() { - PartitionSpec spec = PartitionSpec.builderFor(SCHEMA) - .day("ts") - .identity("category") - .build(); - SortOrder order = SortOrder.builderFor(SCHEMA) - .withOrderId(1) - .asc("ts") // satisfies the ordering of days(ts) - .asc("category") - .desc("id") - .build(); - - SortOrder expected = SortOrder.builderFor(SCHEMA) - .withOrderId(1) - .asc("category") // prefix is added, the rest of the sort order stays the same - .asc("ts") - .asc("category") - .desc("id") - .build(); - - Assert.assertEquals("Should add spec fields as prefix", - expected, SortOrderUtil.buildSortOrder(SCHEMA, spec, order)); + PartitionSpec spec = PartitionSpec.builderFor(SCHEMA).day("ts").identity("category").build(); + SortOrder order = + SortOrder.builderFor(SCHEMA) + .withOrderId(1) + .asc("ts") // satisfies the ordering of days(ts) + .asc("category") + .desc("id") + .build(); + + SortOrder expected = + SortOrder.builderFor(SCHEMA) + .withOrderId(1) + .asc("category") // prefix is added, the rest of 
the sort order stays the same + .asc("ts") + .asc("category") + .desc("id") + .build(); + + Assert.assertEquals( + "Should add spec fields as prefix", + expected, + SortOrderUtil.buildSortOrder(SCHEMA, spec, order)); } @Test public void testSortOrderClusteringSatisfiedPartitionFields() { - PartitionSpec spec = PartitionSpec.builderFor(SCHEMA) - .day("ts") - .identity("category") - .build(); - - SortOrder order = SortOrder.builderFor(SCHEMA) - .withOrderId(1) - .asc("ts") // satisfies the ordering of days(ts) - .asc("category") - .desc("id") - .build(); - - SortOrder expected = SortOrder.builderFor(SCHEMA) - .withOrderId(1) - .asc("category") // prefix is added, the rest of the sort order stays the same - .asc("ts") - .asc("category") - .desc("id") - .build(); - - Assert.assertEquals("Should add spec fields as prefix", - expected, SortOrderUtil.buildSortOrder(SCHEMA, spec, order)); + PartitionSpec spec = PartitionSpec.builderFor(SCHEMA).day("ts").identity("category").build(); + + SortOrder order = + SortOrder.builderFor(SCHEMA) + .withOrderId(1) + .asc("ts") // satisfies the ordering of days(ts) + .asc("category") + .desc("id") + .build(); + + SortOrder expected = + SortOrder.builderFor(SCHEMA) + .withOrderId(1) + .asc("category") // prefix is added, the rest of the sort order stays the same + .asc("ts") + .asc("category") + .desc("id") + .build(); + + Assert.assertEquals( + "Should add spec fields as prefix", + expected, + SortOrderUtil.buildSortOrder(SCHEMA, spec, order)); } @Test public void testSortOrderClusteringWithRedundantPartitionFields() { - PartitionSpec spec = PartitionSpec.builderFor(SCHEMA) - .day("ts") - .identity("category") - .build(); + PartitionSpec spec = PartitionSpec.builderFor(SCHEMA).day("ts").identity("category").build(); - // Specs with redundant time fields can't be constructed directly and have to use UpdatePartitionSpec - TestTables.TestTable table = TestTables.create(tableDir, "test", SCHEMA, spec, SortOrder.unsorted(), 2); + // Specs with redundant time fields can't be constructed directly and have to use + // UpdatePartitionSpec + TestTables.TestTable table = + TestTables.create(tableDir, "test", SCHEMA, spec, SortOrder.unsorted(), 2); table.updateSpec().addField(Expressions.hour("ts")).commit(); PartitionSpec updatedSpec = table.spec(); - SortOrder order = SortOrder.builderFor(table.schema()) - .withOrderId(1) - .asc("category") - .asc("ts") // satisfies the ordering of days(ts) and hours(ts) - .desc("id") - .build(); - - SortOrder expected = SortOrder.builderFor(table.schema()) - .withOrderId(1) - .asc("category") - .asc("ts") - .desc("id") - .build(); - - Assert.assertEquals("Should add spec fields as prefix", - expected, SortOrderUtil.buildSortOrder(table.schema(), updatedSpec, order)); + SortOrder order = + SortOrder.builderFor(table.schema()) + .withOrderId(1) + .asc("category") + .asc("ts") // satisfies the ordering of days(ts) and hours(ts) + .desc("id") + .build(); + + SortOrder expected = + SortOrder.builderFor(table.schema()) + .withOrderId(1) + .asc("category") + .asc("ts") + .desc("id") + .build(); + + Assert.assertEquals( + "Should add spec fields as prefix", + expected, + SortOrderUtil.buildSortOrder(table.schema(), updatedSpec, order)); } @Test public void testSortOrderClusteringWithRedundantPartitionFieldsMissing() { - PartitionSpec spec = PartitionSpec.builderFor(SCHEMA) - .day("ts") - .identity("category") - .build(); - - // Specs with redundant time fields can't be constructed directly and have to use UpdatePartitionSpec - 
TestTables.TestTable table = TestTables.create(tableDir, "test", SCHEMA, spec, SortOrder.unsorted(), 1); - table.updateSpec() + PartitionSpec spec = PartitionSpec.builderFor(SCHEMA).day("ts").identity("category").build(); + + // Specs with redundant time fields can't be constructed directly and have to use + // UpdatePartitionSpec + TestTables.TestTable table = + TestTables.create(tableDir, "test", SCHEMA, spec, SortOrder.unsorted(), 1); + table + .updateSpec() .removeField("ts_day") // introduce a void transform .addField(Expressions.hour("ts")) .commit(); PartitionSpec updatedSpec = table.spec(); - SortOrder order = SortOrder.builderFor(table.schema()) - .withOrderId(1) - .desc("id") - .build(); + SortOrder order = SortOrder.builderFor(table.schema()).withOrderId(1).desc("id").build(); - SortOrder expected = SortOrder.builderFor(table.schema()) - .withOrderId(1) - .asc("category") - .asc(Expressions.hour("ts")) - .desc("id") - .build(); + SortOrder expected = + SortOrder.builderFor(table.schema()) + .withOrderId(1) + .asc("category") + .asc(Expressions.hour("ts")) + .desc("id") + .build(); - Assert.assertEquals("Should add spec fields as prefix", - expected, SortOrderUtil.buildSortOrder(table.schema(), updatedSpec, order)); + Assert.assertEquals( + "Should add spec fields as prefix", + expected, + SortOrderUtil.buildSortOrder(table.schema(), updatedSpec, order)); } } diff --git a/core/src/test/java/org/apache/iceberg/util/TestStructLikeMap.java b/core/src/test/java/org/apache/iceberg/util/TestStructLikeMap.java index 891a7782217a..184fa18fc265 100644 --- a/core/src/test/java/org/apache/iceberg/util/TestStructLikeMap.java +++ b/core/src/test/java/org/apache/iceberg/util/TestStructLikeMap.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.util; import java.util.Collection; @@ -33,10 +32,10 @@ import org.junit.Test; public class TestStructLikeMap { - private static final Types.StructType STRUCT_TYPE = Types.StructType.of( - Types.NestedField.required(1, "id", Types.IntegerType.get()), - Types.NestedField.optional(2, "data", Types.LongType.get()) - ); + private static final Types.StructType STRUCT_TYPE = + Types.StructType.of( + Types.NestedField.required(1, "id", Types.IntegerType.get()), + Types.NestedField.optional(2, "data", Types.LongType.get())); @Test public void testSingleRecord() { diff --git a/core/src/test/java/org/apache/iceberg/util/TestStructLikeSet.java b/core/src/test/java/org/apache/iceberg/util/TestStructLikeSet.java index 09606b951529..77726ee6359d 100644 --- a/core/src/test/java/org/apache/iceberg/util/TestStructLikeSet.java +++ b/core/src/test/java/org/apache/iceberg/util/TestStructLikeSet.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.util; import java.util.Set; @@ -28,10 +27,10 @@ import org.junit.Test; public class TestStructLikeSet { - private static final Types.StructType STRUCT_TYPE = Types.StructType.of( - Types.NestedField.required(1, "id", Types.IntegerType.get()), - Types.NestedField.optional(2, "data", Types.LongType.get()) - ); + private static final Types.StructType STRUCT_TYPE = + Types.StructType.of( + Types.NestedField.required(1, "id", Types.IntegerType.get()), + Types.NestedField.optional(2, "data", Types.LongType.get())); @Test public void testNullElements() { diff --git a/core/src/test/java/org/apache/iceberg/util/TestTableScanUtil.java b/core/src/test/java/org/apache/iceberg/util/TestTableScanUtil.java index 55c98cde3a7d..4eb635d168ce 100644 --- a/core/src/test/java/org/apache/iceberg/util/TestTableScanUtil.java +++ b/core/src/test/java/org/apache/iceberg/util/TestTableScanUtil.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.util; import java.util.Arrays; @@ -44,12 +43,16 @@ public class TestTableScanUtil { private List tasksWithDataAndDeleteSizes(List> sizePairs) { - return sizePairs.stream().map(sizePair -> { - DataFile dataFile = dataFileWithSize(sizePair.first()); - DeleteFile[] deleteFiles = deleteFilesWithSizes( - Arrays.stream(sizePair.second()).mapToLong(Long::longValue).toArray()); - return new MockFileScanTask(dataFile, deleteFiles); - }).collect(Collectors.toList()); + return sizePairs.stream() + .map( + sizePair -> { + DataFile dataFile = dataFileWithSize(sizePair.first()); + DeleteFile[] deleteFiles = + deleteFilesWithSizes( + Arrays.stream(sizePair.second()).mapToLong(Long::longValue).toArray()); + return new MockFileScanTask(dataFile, deleteFiles); + }) + .collect(Collectors.toList()); } private DataFile dataFileWithSize(long size) { @@ -59,77 +62,84 @@ private DataFile dataFileWithSize(long size) { } private DeleteFile[] deleteFilesWithSizes(long... 
sizes) { - return Arrays.stream(sizes).mapToObj(size -> { - DeleteFile mockDeleteFile = Mockito.mock(DeleteFile.class); - Mockito.when(mockDeleteFile.fileSizeInBytes()).thenReturn(size); - return mockDeleteFile; - }).toArray(DeleteFile[]::new); + return Arrays.stream(sizes) + .mapToObj( + size -> { + DeleteFile mockDeleteFile = Mockito.mock(DeleteFile.class); + Mockito.when(mockDeleteFile.fileSizeInBytes()).thenReturn(size); + return mockDeleteFile; + }) + .toArray(DeleteFile[]::new); } @Test public void testPlanTaskWithDeleteFiles() { - List testFiles = tasksWithDataAndDeleteSizes( + List testFiles = + tasksWithDataAndDeleteSizes( + Arrays.asList( + Pair.of(150L, new Long[] {50L, 100L}), + Pair.of(50L, new Long[] {1L, 50L}), + Pair.of(50L, new Long[] {100L}), + Pair.of(1L, new Long[] {1L, 1L}), + Pair.of(75L, new Long[] {75L}))); + + List combinedScanTasks = + Lists.newArrayList( + TableScanUtil.planTasks(CloseableIterable.withNoopClose(testFiles), 300L, 3, 50L)); + + List expectedCombinedTasks = Arrays.asList( - Pair.of(150L, new Long[] {50L, 100L}), - Pair.of(50L, new Long[] {1L, 50L}), - Pair.of(50L, new Long[] {100L}), - Pair.of(1L, new Long[] {1L, 1L}), - Pair.of(75L, new Long[] {75L}) - )); - - List combinedScanTasks = Lists.newArrayList( - TableScanUtil.planTasks(CloseableIterable.withNoopClose(testFiles), 300L, 3, 50L) - ); - - List expectedCombinedTasks = Arrays.asList( - new BaseCombinedScanTask(Collections.singletonList(testFiles.get(0))), - new BaseCombinedScanTask(Arrays.asList(testFiles.get(1), testFiles.get(2))), - new BaseCombinedScanTask(Arrays.asList(testFiles.get(3), testFiles.get(4))) - ); - - Assert.assertEquals("Should plan 3 Combined tasks since there is delete files to be considered", - 3, combinedScanTasks.size()); + new BaseCombinedScanTask(Collections.singletonList(testFiles.get(0))), + new BaseCombinedScanTask(Arrays.asList(testFiles.get(1), testFiles.get(2))), + new BaseCombinedScanTask(Arrays.asList(testFiles.get(3), testFiles.get(4)))); + + Assert.assertEquals( + "Should plan 3 Combined tasks since there is delete files to be considered", + 3, + combinedScanTasks.size()); for (int i = 0; i < expectedCombinedTasks.size(); ++i) { - Assert.assertEquals("Scan tasks detail in combined task check failed", - expectedCombinedTasks.get(i).files(), combinedScanTasks.get(i).files()); + Assert.assertEquals( + "Scan tasks detail in combined task check failed", + expectedCombinedTasks.get(i).files(), + combinedScanTasks.get(i).files()); } } @Test public void testTaskGroupPlanning() { - List tasks = ImmutableList.of( - new ChildTask1(64), - new ChildTask1(32), - new ChildTask3(64), - new ChildTask3(32), - new ChildTask2(128), - new ChildTask3(32), - new ChildTask3(32) - ); - - CloseableIterable> taskGroups = TableScanUtil.planTaskGroups( - CloseableIterable.withNoopClose(tasks), 128, 10, 4); + List tasks = + ImmutableList.of( + new ChildTask1(64), + new ChildTask1(32), + new ChildTask3(64), + new ChildTask3(32), + new ChildTask2(128), + new ChildTask3(32), + new ChildTask3(32)); + + CloseableIterable> taskGroups = + TableScanUtil.planTaskGroups(CloseableIterable.withNoopClose(tasks), 128, 10, 4); Assert.assertEquals("Must have 3 task groups", 3, Iterables.size(taskGroups)); } @Test public void testTaskMerging() { - List tasks = ImmutableList.of( - new ChildTask1(64), - new ChildTask1(64), - new ChildTask2(128), - new ChildTask3(32), - new ChildTask3(32) - ); + List tasks = + ImmutableList.of( + new ChildTask1(64), + new ChildTask1(64), + new ChildTask2(128), + new 
ChildTask3(32), + new ChildTask3(32)); List mergedTasks = TableScanUtil.mergeTasks(tasks); Assert.assertEquals("Appropriate tasks should be merged", 3, mergedTasks.size()); } - private interface ParentTask extends ScanTask { - } + private interface ParentTask extends ScanTask {} - private static class ChildTask1 implements ParentTask, SplittableScanTask, MergeableScanTask { + private static class ChildTask1 + implements ParentTask, SplittableScanTask, MergeableScanTask { private final long sizeBytes; ChildTask1(long sizeBytes) { diff --git a/core/src/test/java/org/apache/iceberg/util/TestZOrderByteUtil.java b/core/src/test/java/org/apache/iceberg/util/TestZOrderByteUtil.java index 1a2174b679ba..96c9126bbdb3 100644 --- a/core/src/test/java/org/apache/iceberg/util/TestZOrderByteUtil.java +++ b/core/src/test/java/org/apache/iceberg/util/TestZOrderByteUtil.java @@ -16,8 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - - package org.apache.iceberg.util; import java.nio.ByteBuffer; @@ -51,26 +49,20 @@ private String bytesToString(byte[] bytes) { return result.toString(); } - /** - * Returns a non-0 length byte array - */ - private byte[] generateRandomBytes() { + /** Returns a non-0 length byte array */ + private byte[] generateRandomBytes() { int length = Math.abs(random.nextInt(100) + 1); return generateRandomBytes(length); } - /** - * Returns a byte array of a specified length - */ - private byte[] generateRandomBytes(int length) { + /** Returns a byte array of a specified length */ + private byte[] generateRandomBytes(int length) { byte[] result = new byte[length]; random.nextBytes(result); return result; } - /** - * Test method to ensure correctness of byte interleaving code - */ + /** Test method to ensure correctness of byte interleaving code */ private String interleaveStrings(String[] strings) { StringBuilder result = new StringBuilder(); int totalLength = Arrays.stream(strings).mapToInt(String::length).sum(); @@ -89,17 +81,17 @@ private String interleaveStrings(String[] strings) { } /** - * Compares the result of a string based interleaving algorithm implemented above - * versus the binary bit-shifting algorithm used in ZOrderByteUtils. Either both - * algorithms are identically wrong or are both identically correct. + * Compares the result of a string based interleaving algorithm implemented above versus the + * binary bit-shifting algorithm used in ZOrderByteUtils. Either both algorithms are identically + * wrong or are both identically correct. 
*/ @Test public void testInterleaveRandomExamples() { for (int test = 0; test < NUM_INTERLEAVE_TESTS; test++) { int numByteArrays = Math.abs(random.nextInt(6)) + 1; - byte[][] testBytes = new byte[numByteArrays][]; + byte[][] testBytes = new byte[numByteArrays][]; String[] testStrings = new String[numByteArrays]; - for (int byteIndex = 0; byteIndex < numByteArrays; byteIndex++) { + for (int byteIndex = 0; byteIndex < numByteArrays; byteIndex++) { testBytes[byteIndex] = generateRandomBytes(); testStrings[byteIndex] = bytesToString(testBytes[byteIndex]); } @@ -110,7 +102,8 @@ public void testInterleaveRandomExamples() { String stringResult = interleaveStrings(testStrings); - Assert.assertEquals("String interleave didn't match byte interleave", stringResult, byteResultAsString); + Assert.assertEquals( + "String interleave didn't match byte interleave", stringResult, byteResultAsString); } } @@ -120,19 +113,21 @@ public void testReuseInterleaveBuffer() { int colLength = 16; ByteBuffer interleaveBuffer = ByteBuffer.allocate(numByteArrays * colLength); for (int test = 0; test < NUM_INTERLEAVE_TESTS; test++) { - byte[][] testBytes = new byte[numByteArrays][]; + byte[][] testBytes = new byte[numByteArrays][]; String[] testStrings = new String[numByteArrays]; - for (int byteIndex = 0; byteIndex < numByteArrays; byteIndex++) { + for (int byteIndex = 0; byteIndex < numByteArrays; byteIndex++) { testBytes[byteIndex] = generateRandomBytes(colLength); testStrings[byteIndex] = bytesToString(testBytes[byteIndex]); } - byte[] byteResult = ZOrderByteUtils.interleaveBits(testBytes, numByteArrays * colLength, interleaveBuffer); + byte[] byteResult = + ZOrderByteUtils.interleaveBits(testBytes, numByteArrays * colLength, interleaveBuffer); String byteResultAsString = bytesToString(byteResult); String stringResult = interleaveStrings(testStrings); - Assert.assertEquals("String interleave didn't match byte interleave", stringResult, byteResultAsString); + Assert.assertEquals( + "String interleave didn't match byte interleave", stringResult, byteResultAsString); } } @@ -141,37 +136,36 @@ public void testInterleaveEmptyBits() { byte[][] test = new byte[4][10]; byte[] expected = new byte[40]; - Assert.assertArrayEquals("Should combine empty arrays", - expected, ZOrderByteUtils.interleaveBits(test, 40)); + Assert.assertArrayEquals( + "Should combine empty arrays", expected, ZOrderByteUtils.interleaveBits(test, 40)); } @Test public void testInterleaveFullBits() { byte[][] test = new byte[4][]; - test[0] = new byte[]{IIIIIIII, IIIIIIII}; - test[1] = new byte[]{IIIIIIII}; + test[0] = new byte[] {IIIIIIII, IIIIIIII}; + test[1] = new byte[] {IIIIIIII}; test[2] = new byte[0]; - test[3] = new byte[]{IIIIIIII, IIIIIIII, IIIIIIII}; - byte[] expected = new byte[]{IIIIIIII, IIIIIIII, IIIIIIII, IIIIIIII, IIIIIIII, IIIIIIII}; + test[3] = new byte[] {IIIIIIII, IIIIIIII, IIIIIIII}; + byte[] expected = new byte[] {IIIIIIII, IIIIIIII, IIIIIIII, IIIIIIII, IIIIIIII, IIIIIIII}; - Assert.assertArrayEquals("Should combine full arrays", - expected, ZOrderByteUtils.interleaveBits(test, 6)); + Assert.assertArrayEquals( + "Should combine full arrays", expected, ZOrderByteUtils.interleaveBits(test, 6)); } @Test public void testInterleaveMixedBits() { byte[][] test = new byte[4][]; - test[0] = new byte[]{OOOOOOOI, IIIIIIII, OOOOOOOO, OOOOIIII}; - test[1] = new byte[]{OOOOOOOI, OOOOOOOO, IIIIIIII}; - test[2] = new byte[]{OOOOOOOI}; - test[3] = new byte[]{OOOOOOOI}; - byte[] expected = new byte[]{ - OOOOOOOO, OOOOOOOO, OOOOOOOO, OOOOIIII, - 
IOIOIOIO, IOIOIOIO, - OIOIOIOI, OIOIOIOI, - OOOOIIII}; - Assert.assertArrayEquals("Should combine mixed byte arrays", - expected, ZOrderByteUtils.interleaveBits(test, 9)); + test[0] = new byte[] {OOOOOOOI, IIIIIIII, OOOOOOOO, OOOOIIII}; + test[1] = new byte[] {OOOOOOOI, OOOOOOOO, IIIIIIII}; + test[2] = new byte[] {OOOOOOOI}; + test[3] = new byte[] {OOOOOOOI}; + byte[] expected = + new byte[] { + OOOOOOOO, OOOOOOOO, OOOOOOOO, OOOOIIII, IOIOIOIO, IOIOIOIO, OIOIOIOI, OIOIOIOI, OOOOIIII + }; + Assert.assertArrayEquals( + "Should combine mixed byte arrays", expected, ZOrderByteUtils.interleaveBits(test, 9)); } @Test @@ -184,12 +178,20 @@ public void testIntOrdering() { int intCompare = Integer.signum(Integer.compare(aInt, bInt)); byte[] aBytes = ZOrderByteUtils.intToOrderedBytes(aInt, aBuffer).array(); byte[] bBytes = ZOrderByteUtils.intToOrderedBytes(bInt, bBuffer).array(); - int byteCompare = Integer.signum(UnsignedBytes.lexicographicalComparator().compare(aBytes, bBytes)); - - Assert.assertEquals(String.format( - "Ordering of ints should match ordering of bytes, %s ~ %s -> %s != %s ~ %s -> %s ", - aInt, bInt, intCompare, Arrays.toString(aBytes), Arrays.toString(bBytes), byteCompare), - intCompare, byteCompare); + int byteCompare = + Integer.signum(UnsignedBytes.lexicographicalComparator().compare(aBytes, bBytes)); + + Assert.assertEquals( + String.format( + "Ordering of ints should match ordering of bytes, %s ~ %s -> %s != %s ~ %s -> %s ", + aInt, + bInt, + intCompare, + Arrays.toString(aBytes), + Arrays.toString(bBytes), + byteCompare), + intCompare, + byteCompare); } } @@ -203,12 +205,20 @@ public void testLongOrdering() { int longCompare = Integer.signum(Long.compare(aLong, bLong)); byte[] aBytes = ZOrderByteUtils.longToOrderedBytes(aLong, aBuffer).array(); byte[] bBytes = ZOrderByteUtils.longToOrderedBytes(bLong, bBuffer).array(); - int byteCompare = Integer.signum(UnsignedBytes.lexicographicalComparator().compare(aBytes, bBytes)); + int byteCompare = + Integer.signum(UnsignedBytes.lexicographicalComparator().compare(aBytes, bBytes)); - Assert.assertEquals(String.format( - "Ordering of longs should match ordering of bytes, %s ~ %s -> %s != %s ~ %s -> %s ", - aLong, bLong, longCompare, Arrays.toString(aBytes), Arrays.toString(bBytes), byteCompare), - longCompare, byteCompare); + Assert.assertEquals( + String.format( + "Ordering of longs should match ordering of bytes, %s ~ %s -> %s != %s ~ %s -> %s ", + aLong, + bLong, + longCompare, + Arrays.toString(aBytes), + Arrays.toString(bBytes), + byteCompare), + longCompare, + byteCompare); } } @@ -222,12 +232,20 @@ public void testShortOrdering() { int longCompare = Integer.signum(Long.compare(aShort, bShort)); byte[] aBytes = ZOrderByteUtils.shortToOrderedBytes(aShort, aBuffer).array(); byte[] bBytes = ZOrderByteUtils.shortToOrderedBytes(bShort, bBuffer).array(); - int byteCompare = Integer.signum(UnsignedBytes.lexicographicalComparator().compare(aBytes, bBytes)); + int byteCompare = + Integer.signum(UnsignedBytes.lexicographicalComparator().compare(aBytes, bBytes)); - Assert.assertEquals(String.format( + Assert.assertEquals( + String.format( "Ordering of longs should match ordering of bytes, %s ~ %s -> %s != %s ~ %s -> %s ", - aShort, bShort, longCompare, Arrays.toString(aBytes), Arrays.toString(bBytes), byteCompare), - longCompare, byteCompare); + aShort, + bShort, + longCompare, + Arrays.toString(aBytes), + Arrays.toString(bBytes), + byteCompare), + longCompare, + byteCompare); } } @@ -241,12 +259,20 @@ public void testTinyOrdering() { int 
longCompare = Integer.signum(Long.compare(aByte, bByte)); byte[] aBytes = ZOrderByteUtils.tinyintToOrderedBytes(aByte, aBuffer).array(); byte[] bBytes = ZOrderByteUtils.tinyintToOrderedBytes(bByte, bBuffer).array(); - int byteCompare = Integer.signum(UnsignedBytes.lexicographicalComparator().compare(aBytes, bBytes)); + int byteCompare = + Integer.signum(UnsignedBytes.lexicographicalComparator().compare(aBytes, bBytes)); - Assert.assertEquals(String.format( + Assert.assertEquals( + String.format( "Ordering of longs should match ordering of bytes, %s ~ %s -> %s != %s ~ %s -> %s ", - aByte, bByte, longCompare, Arrays.toString(aBytes), Arrays.toString(bBytes), byteCompare), - longCompare, byteCompare); + aByte, + bByte, + longCompare, + Arrays.toString(aBytes), + Arrays.toString(bBytes), + byteCompare), + longCompare, + byteCompare); } } @@ -260,12 +286,20 @@ public void testFloatOrdering() { int floatCompare = Integer.signum(Float.compare(aFloat, bFloat)); byte[] aBytes = ZOrderByteUtils.floatToOrderedBytes(aFloat, aBuffer).array(); byte[] bBytes = ZOrderByteUtils.floatToOrderedBytes(bFloat, bBuffer).array(); - int byteCompare = Integer.signum(UnsignedBytes.lexicographicalComparator().compare(aBytes, bBytes)); - - Assert.assertEquals(String.format( - "Ordering of floats should match ordering of bytes, %s ~ %s -> %s != %s ~ %s -> %s ", - aFloat, bFloat, floatCompare, Arrays.toString(aBytes), Arrays.toString(bBytes), byteCompare), - floatCompare, byteCompare); + int byteCompare = + Integer.signum(UnsignedBytes.lexicographicalComparator().compare(aBytes, bBytes)); + + Assert.assertEquals( + String.format( + "Ordering of floats should match ordering of bytes, %s ~ %s -> %s != %s ~ %s -> %s ", + aFloat, + bFloat, + floatCompare, + Arrays.toString(aBytes), + Arrays.toString(bBytes), + byteCompare), + floatCompare, + byteCompare); } } @@ -279,32 +313,48 @@ public void testDoubleOrdering() { int doubleCompare = Integer.signum(Double.compare(aDouble, bDouble)); byte[] aBytes = ZOrderByteUtils.doubleToOrderedBytes(aDouble, aBuffer).array(); byte[] bBytes = ZOrderByteUtils.doubleToOrderedBytes(bDouble, bBuffer).array(); - int byteCompare = Integer.signum(UnsignedBytes.lexicographicalComparator().compare(aBytes, bBytes)); - - Assert.assertEquals(String.format( - "Ordering of doubles should match ordering of bytes, %s ~ %s -> %s != %s ~ %s -> %s ", - aDouble, bDouble, doubleCompare, Arrays.toString(aBytes), Arrays.toString(bBytes), byteCompare), - doubleCompare, byteCompare); + int byteCompare = + Integer.signum(UnsignedBytes.lexicographicalComparator().compare(aBytes, bBytes)); + + Assert.assertEquals( + String.format( + "Ordering of doubles should match ordering of bytes, %s ~ %s -> %s != %s ~ %s -> %s ", + aDouble, + bDouble, + doubleCompare, + Arrays.toString(aBytes), + Arrays.toString(bBytes), + byteCompare), + doubleCompare, + byteCompare); } } @Test public void testStringOrdering() { - CharsetEncoder encoder = StandardCharsets.UTF_8.newEncoder(); + CharsetEncoder encoder = StandardCharsets.UTF_8.newEncoder(); ByteBuffer aBuffer = ByteBuffer.allocate(128); ByteBuffer bBuffer = ByteBuffer.allocate(128); for (int i = 0; i < NUM_TESTS; i++) { - String aString = (String) RandomUtil.generatePrimitive(Types.StringType.get(), random); - String bString = (String) RandomUtil.generatePrimitive(Types.StringType.get(), random); + String aString = (String) RandomUtil.generatePrimitive(Types.StringType.get(), random); + String bString = (String) RandomUtil.generatePrimitive(Types.StringType.get(), random); int 
stringCompare = Integer.signum(aString.compareTo(bString)); byte[] aBytes = ZOrderByteUtils.stringToOrderedBytes(aString, 128, aBuffer, encoder).array(); byte[] bBytes = ZOrderByteUtils.stringToOrderedBytes(bString, 128, bBuffer, encoder).array(); - int byteCompare = Integer.signum(UnsignedBytes.lexicographicalComparator().compare(aBytes, bBytes)); + int byteCompare = + Integer.signum(UnsignedBytes.lexicographicalComparator().compare(aBytes, bBytes)); - Assert.assertEquals(String.format( - "Ordering of strings should match ordering of bytes, %s ~ %s -> %s != %s ~ %s -> %s ", - aString, bString, stringCompare, Arrays.toString(aBytes), Arrays.toString(bBytes), byteCompare), - stringCompare, byteCompare); + Assert.assertEquals( + String.format( + "Ordering of strings should match ordering of bytes, %s ~ %s -> %s != %s ~ %s -> %s ", + aString, + bString, + stringCompare, + Arrays.toString(aBytes), + Arrays.toString(bBytes), + byteCompare), + stringCompare, + byteCompare); } } @@ -313,17 +363,26 @@ public void testByteTruncateOrFill() { ByteBuffer aBuffer = ByteBuffer.allocate(128); ByteBuffer bBuffer = ByteBuffer.allocate(128); for (int i = 0; i < NUM_TESTS; i++) { - byte[] aBytesRaw = (byte[]) RandomUtil.generatePrimitive(Types.BinaryType.get(), random); - byte[] bBytesRaw = (byte[]) RandomUtil.generatePrimitive(Types.BinaryType.get(), random); - int stringCompare = Integer.signum(UnsignedBytes.lexicographicalComparator().compare(aBytesRaw, bBytesRaw)); + byte[] aBytesRaw = (byte[]) RandomUtil.generatePrimitive(Types.BinaryType.get(), random); + byte[] bBytesRaw = (byte[]) RandomUtil.generatePrimitive(Types.BinaryType.get(), random); + int stringCompare = + Integer.signum(UnsignedBytes.lexicographicalComparator().compare(aBytesRaw, bBytesRaw)); byte[] aBytes = ZOrderByteUtils.byteTruncateOrFill(aBytesRaw, 128, aBuffer).array(); byte[] bBytes = ZOrderByteUtils.byteTruncateOrFill(bBytesRaw, 128, bBuffer).array(); - int byteCompare = Integer.signum(UnsignedBytes.lexicographicalComparator().compare(aBytes, bBytes)); + int byteCompare = + Integer.signum(UnsignedBytes.lexicographicalComparator().compare(aBytes, bBytes)); - Assert.assertEquals(String.format( + Assert.assertEquals( + String.format( "Ordering of strings should match ordering of bytes, %s ~ %s -> %s != %s ~ %s -> %s ", - aBytesRaw, bBytesRaw, stringCompare, Arrays.toString(aBytes), Arrays.toString(bBytes), byteCompare), - stringCompare, byteCompare); + aBytesRaw, + bBytesRaw, + stringCompare, + Arrays.toString(aBytes), + Arrays.toString(bBytes), + byteCompare), + stringCompare, + byteCompare); } } } diff --git a/data/src/jmh/java/org/apache/iceberg/GenericOrcReaderBenchmark.java b/data/src/jmh/java/org/apache/iceberg/GenericOrcReaderBenchmark.java index 023e11dd7ae0..1adfacb3ca65 100644 --- a/data/src/jmh/java/org/apache/iceberg/GenericOrcReaderBenchmark.java +++ b/data/src/jmh/java/org/apache/iceberg/GenericOrcReaderBenchmark.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg; import java.io.File; diff --git a/data/src/jmh/java/org/apache/iceberg/GenericParquetReaderBenchmark.java b/data/src/jmh/java/org/apache/iceberg/GenericParquetReaderBenchmark.java index 0be7729a8c81..2c5f1f82bf0a 100644 --- a/data/src/jmh/java/org/apache/iceberg/GenericParquetReaderBenchmark.java +++ b/data/src/jmh/java/org/apache/iceberg/GenericParquetReaderBenchmark.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg; import java.io.File; diff --git a/data/src/jmh/java/org/apache/iceberg/ReaderBenchmark.java b/data/src/jmh/java/org/apache/iceberg/ReaderBenchmark.java index 3aa34d3b23ad..556ced674ed0 100644 --- a/data/src/jmh/java/org/apache/iceberg/ReaderBenchmark.java +++ b/data/src/jmh/java/org/apache/iceberg/ReaderBenchmark.java @@ -16,9 +16,11 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg; +import static org.apache.iceberg.types.Types.NestedField.optional; +import static org.apache.iceberg.types.Types.NestedField.required; + import java.io.File; import java.io.IOException; import java.nio.file.Files; @@ -41,9 +43,6 @@ import org.slf4j.Logger; import org.slf4j.LoggerFactory; -import static org.apache.iceberg.types.Types.NestedField.optional; -import static org.apache.iceberg.types.Types.NestedField.required; - @Fork(1) @State(Scope.Benchmark) @Warmup(iterations = 3) @@ -52,15 +51,16 @@ public abstract class ReaderBenchmark { private static final Logger LOG = LoggerFactory.getLogger(ReaderBenchmark.class); - private static final Schema TEST_SCHEMA = new Schema( - required(1, "longCol", Types.LongType.get()), - required(2, "intCol", Types.IntegerType.get()), - required(3, "floatCol", Types.FloatType.get()), - optional(4, "doubleCol", Types.DoubleType.get()), - optional(5, "decimalCol", Types.DecimalType.of(20, 5)), - optional(6, "dateCol", Types.DateType.get()), - optional(7, "timestampCol", Types.TimestampType.withZone()), - optional(8, "stringCol", Types.StringType.get())); + private static final Schema TEST_SCHEMA = + new Schema( + required(1, "longCol", Types.LongType.get()), + required(2, "intCol", Types.IntegerType.get()), + required(3, "floatCol", Types.FloatType.get()), + optional(4, "doubleCol", Types.DoubleType.get()), + optional(5, "decimalCol", Types.DecimalType.of(20, 5)), + optional(6, "dateCol", Types.DateType.get()), + optional(7, "timestampCol", Types.TimestampType.withZone()), + optional(8, "stringCol", Types.StringType.get())); private static final int NUM_ROWS = 2500000; private static final int SEED = -1; diff --git a/data/src/main/java/org/apache/iceberg/data/BaseFileWriterFactory.java b/data/src/main/java/org/apache/iceberg/data/BaseFileWriterFactory.java index 3791d348a845..976b98b0a9fe 100644 --- a/data/src/main/java/org/apache/iceberg/data/BaseFileWriterFactory.java +++ b/data/src/main/java/org/apache/iceberg/data/BaseFileWriterFactory.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.data; import java.io.IOException; @@ -40,9 +39,7 @@ import org.apache.iceberg.orc.ORC; import org.apache.iceberg.parquet.Parquet; -/** - * A base writer factory to be extended by query engine integrations. - */ +/** A base writer factory to be extended by query engine integrations. 
*/ public abstract class BaseFileWriterFactory implements FileWriterFactory { private final Table table; private final FileFormat dataFileFormat; @@ -54,10 +51,16 @@ public abstract class BaseFileWriterFactory implements FileWriterFactory { private final SortOrder equalityDeleteSortOrder; private final Schema positionDeleteRowSchema; - protected BaseFileWriterFactory(Table table, FileFormat dataFileFormat, Schema dataSchema, - SortOrder dataSortOrder, FileFormat deleteFileFormat, - int[] equalityFieldIds, Schema equalityDeleteRowSchema, - SortOrder equalityDeleteSortOrder, Schema positionDeleteRowSchema) { + protected BaseFileWriterFactory( + Table table, + FileFormat dataFileFormat, + Schema dataSchema, + SortOrder dataSortOrder, + FileFormat deleteFileFormat, + int[] equalityFieldIds, + Schema equalityDeleteRowSchema, + SortOrder equalityDeleteSortOrder, + Schema positionDeleteRowSchema) { this.table = table; this.dataFileFormat = dataFileFormat; this.dataSchema = dataSchema; @@ -70,19 +73,26 @@ protected BaseFileWriterFactory(Table table, FileFormat dataFileFormat, Schema d } protected abstract void configureDataWrite(Avro.DataWriteBuilder builder); + protected abstract void configureEqualityDelete(Avro.DeleteWriteBuilder builder); + protected abstract void configurePositionDelete(Avro.DeleteWriteBuilder builder); protected abstract void configureDataWrite(Parquet.DataWriteBuilder builder); + protected abstract void configureEqualityDelete(Parquet.DeleteWriteBuilder builder); + protected abstract void configurePositionDelete(Parquet.DeleteWriteBuilder builder); protected abstract void configureDataWrite(ORC.DataWriteBuilder builder); + protected abstract void configureEqualityDelete(ORC.DeleteWriteBuilder builder); + protected abstract void configurePositionDelete(ORC.DeleteWriteBuilder builder); @Override - public DataWriter newDataWriter(EncryptedOutputFile file, PartitionSpec spec, StructLike partition) { + public DataWriter newDataWriter( + EncryptedOutputFile file, PartitionSpec spec, StructLike partition) { OutputFile outputFile = file.encryptingOutputFile(); EncryptionKeyMetadata keyMetadata = file.keyMetadata(); Map properties = table.properties(); @@ -91,52 +101,56 @@ public DataWriter newDataWriter(EncryptedOutputFile file, PartitionSpec spec, try { switch (dataFileFormat) { case AVRO: - Avro.DataWriteBuilder avroBuilder = Avro.writeData(outputFile) - .schema(dataSchema) - .setAll(properties) - .metricsConfig(metricsConfig) - .withSpec(spec) - .withPartition(partition) - .withKeyMetadata(keyMetadata) - .withSortOrder(dataSortOrder) - .overwrite(); + Avro.DataWriteBuilder avroBuilder = + Avro.writeData(outputFile) + .schema(dataSchema) + .setAll(properties) + .metricsConfig(metricsConfig) + .withSpec(spec) + .withPartition(partition) + .withKeyMetadata(keyMetadata) + .withSortOrder(dataSortOrder) + .overwrite(); configureDataWrite(avroBuilder); return avroBuilder.build(); case PARQUET: - Parquet.DataWriteBuilder parquetBuilder = Parquet.writeData(outputFile) - .schema(dataSchema) - .setAll(properties) - .metricsConfig(metricsConfig) - .withSpec(spec) - .withPartition(partition) - .withKeyMetadata(keyMetadata) - .withSortOrder(dataSortOrder) - .overwrite(); + Parquet.DataWriteBuilder parquetBuilder = + Parquet.writeData(outputFile) + .schema(dataSchema) + .setAll(properties) + .metricsConfig(metricsConfig) + .withSpec(spec) + .withPartition(partition) + .withKeyMetadata(keyMetadata) + .withSortOrder(dataSortOrder) + .overwrite(); configureDataWrite(parquetBuilder); return 
parquetBuilder.build(); case ORC: - ORC.DataWriteBuilder orcBuilder = ORC.writeData(outputFile) - .schema(dataSchema) - .setAll(properties) - .metricsConfig(metricsConfig) - .withSpec(spec) - .withPartition(partition) - .withKeyMetadata(keyMetadata) - .withSortOrder(dataSortOrder) - .overwrite(); + ORC.DataWriteBuilder orcBuilder = + ORC.writeData(outputFile) + .schema(dataSchema) + .setAll(properties) + .metricsConfig(metricsConfig) + .withSpec(spec) + .withPartition(partition) + .withKeyMetadata(keyMetadata) + .withSortOrder(dataSortOrder) + .overwrite(); configureDataWrite(orcBuilder); return orcBuilder.build(); default: - throw new UnsupportedOperationException("Unsupported data file format: " + dataFileFormat); + throw new UnsupportedOperationException( + "Unsupported data file format: " + dataFileFormat); } } catch (IOException e) { throw new UncheckedIOException(e); @@ -144,8 +158,8 @@ public DataWriter newDataWriter(EncryptedOutputFile file, PartitionSpec spec, } @Override - public EqualityDeleteWriter newEqualityDeleteWriter(EncryptedOutputFile file, PartitionSpec spec, - StructLike partition) { + public EqualityDeleteWriter newEqualityDeleteWriter( + EncryptedOutputFile file, PartitionSpec spec, StructLike partition) { OutputFile outputFile = file.encryptingOutputFile(); EncryptionKeyMetadata keyMetadata = file.keyMetadata(); Map properties = table.properties(); @@ -154,55 +168,59 @@ public EqualityDeleteWriter newEqualityDeleteWriter(EncryptedOutputFile file, try { switch (deleteFileFormat) { case AVRO: - Avro.DeleteWriteBuilder avroBuilder = Avro.writeDeletes(outputFile) - .setAll(properties) - .metricsConfig(metricsConfig) - .rowSchema(equalityDeleteRowSchema) - .equalityFieldIds(equalityFieldIds) - .withSpec(spec) - .withPartition(partition) - .withKeyMetadata(keyMetadata) - .withSortOrder(equalityDeleteSortOrder) - .overwrite(); + Avro.DeleteWriteBuilder avroBuilder = + Avro.writeDeletes(outputFile) + .setAll(properties) + .metricsConfig(metricsConfig) + .rowSchema(equalityDeleteRowSchema) + .equalityFieldIds(equalityFieldIds) + .withSpec(spec) + .withPartition(partition) + .withKeyMetadata(keyMetadata) + .withSortOrder(equalityDeleteSortOrder) + .overwrite(); configureEqualityDelete(avroBuilder); return avroBuilder.buildEqualityWriter(); case PARQUET: - Parquet.DeleteWriteBuilder parquetBuilder = Parquet.writeDeletes(outputFile) - .setAll(properties) - .metricsConfig(metricsConfig) - .rowSchema(equalityDeleteRowSchema) - .equalityFieldIds(equalityFieldIds) - .withSpec(spec) - .withPartition(partition) - .withKeyMetadata(keyMetadata) - .withSortOrder(equalityDeleteSortOrder) - .overwrite(); + Parquet.DeleteWriteBuilder parquetBuilder = + Parquet.writeDeletes(outputFile) + .setAll(properties) + .metricsConfig(metricsConfig) + .rowSchema(equalityDeleteRowSchema) + .equalityFieldIds(equalityFieldIds) + .withSpec(spec) + .withPartition(partition) + .withKeyMetadata(keyMetadata) + .withSortOrder(equalityDeleteSortOrder) + .overwrite(); configureEqualityDelete(parquetBuilder); return parquetBuilder.buildEqualityWriter(); case ORC: - ORC.DeleteWriteBuilder orcBuilder = ORC.writeDeletes(outputFile) - .setAll(properties) - .metricsConfig(metricsConfig) - .rowSchema(equalityDeleteRowSchema) - .equalityFieldIds(equalityFieldIds) - .withSpec(spec) - .withPartition(partition) - .withKeyMetadata(keyMetadata) - .withSortOrder(equalityDeleteSortOrder) - .overwrite(); + ORC.DeleteWriteBuilder orcBuilder = + ORC.writeDeletes(outputFile) + .setAll(properties) + .metricsConfig(metricsConfig) + 
.rowSchema(equalityDeleteRowSchema) + .equalityFieldIds(equalityFieldIds) + .withSpec(spec) + .withPartition(partition) + .withKeyMetadata(keyMetadata) + .withSortOrder(equalityDeleteSortOrder) + .overwrite(); configureEqualityDelete(orcBuilder); return orcBuilder.buildEqualityWriter(); default: - throw new UnsupportedOperationException("Unsupported format for equality deletes: " + deleteFileFormat); + throw new UnsupportedOperationException( + "Unsupported format for equality deletes: " + deleteFileFormat); } } catch (IOException e) { throw new UncheckedIOException("Failed to create new equality delete writer", e); @@ -210,8 +228,8 @@ public EqualityDeleteWriter newEqualityDeleteWriter(EncryptedOutputFile file, } @Override - public PositionDeleteWriter newPositionDeleteWriter(EncryptedOutputFile file, PartitionSpec spec, - StructLike partition) { + public PositionDeleteWriter newPositionDeleteWriter( + EncryptedOutputFile file, PartitionSpec spec, StructLike partition) { OutputFile outputFile = file.encryptingOutputFile(); EncryptionKeyMetadata keyMetadata = file.keyMetadata(); Map properties = table.properties(); @@ -220,49 +238,53 @@ public PositionDeleteWriter newPositionDeleteWriter(EncryptedOutputFile file, try { switch (deleteFileFormat) { case AVRO: - Avro.DeleteWriteBuilder avroBuilder = Avro.writeDeletes(outputFile) - .setAll(properties) - .metricsConfig(metricsConfig) - .rowSchema(positionDeleteRowSchema) - .withSpec(spec) - .withPartition(partition) - .withKeyMetadata(keyMetadata) - .overwrite(); + Avro.DeleteWriteBuilder avroBuilder = + Avro.writeDeletes(outputFile) + .setAll(properties) + .metricsConfig(metricsConfig) + .rowSchema(positionDeleteRowSchema) + .withSpec(spec) + .withPartition(partition) + .withKeyMetadata(keyMetadata) + .overwrite(); configurePositionDelete(avroBuilder); return avroBuilder.buildPositionWriter(); case PARQUET: - Parquet.DeleteWriteBuilder parquetBuilder = Parquet.writeDeletes(outputFile) - .setAll(properties) - .metricsConfig(metricsConfig) - .rowSchema(positionDeleteRowSchema) - .withSpec(spec) - .withPartition(partition) - .withKeyMetadata(keyMetadata) - .overwrite(); + Parquet.DeleteWriteBuilder parquetBuilder = + Parquet.writeDeletes(outputFile) + .setAll(properties) + .metricsConfig(metricsConfig) + .rowSchema(positionDeleteRowSchema) + .withSpec(spec) + .withPartition(partition) + .withKeyMetadata(keyMetadata) + .overwrite(); configurePositionDelete(parquetBuilder); return parquetBuilder.buildPositionWriter(); case ORC: - ORC.DeleteWriteBuilder orcBuilder = ORC.writeDeletes(outputFile) - .setAll(properties) - .metricsConfig(metricsConfig) - .rowSchema(positionDeleteRowSchema) - .withSpec(spec) - .withPartition(partition) - .withKeyMetadata(keyMetadata) - .overwrite(); + ORC.DeleteWriteBuilder orcBuilder = + ORC.writeDeletes(outputFile) + .setAll(properties) + .metricsConfig(metricsConfig) + .rowSchema(positionDeleteRowSchema) + .withSpec(spec) + .withPartition(partition) + .withKeyMetadata(keyMetadata) + .overwrite(); configurePositionDelete(orcBuilder); return orcBuilder.buildPositionWriter(); default: - throw new UnsupportedOperationException("Unsupported format for position deletes: " + deleteFileFormat); + throw new UnsupportedOperationException( + "Unsupported format for position deletes: " + deleteFileFormat); } } catch (IOException e) { diff --git a/data/src/main/java/org/apache/iceberg/data/DeleteFilter.java b/data/src/main/java/org/apache/iceberg/data/DeleteFilter.java index 90c7133cdc4e..f56748aeca04 100644 --- 
a/data/src/main/java/org/apache/iceberg/data/DeleteFilter.java +++ b/data/src/main/java/org/apache/iceberg/data/DeleteFilter.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.data; import java.util.Collection; @@ -56,9 +55,8 @@ public abstract class DeleteFilter { private static final long DEFAULT_SET_FILTER_THRESHOLD = 100_000L; - private static final Schema POS_DELETE_SCHEMA = new Schema( - MetadataColumns.DELETE_FILE_PATH, - MetadataColumns.DELETE_FILE_POS); + private static final Schema POS_DELETE_SCHEMA = + new Schema(MetadataColumns.DELETE_FILE_PATH, MetadataColumns.DELETE_FILE_POS); private final long setFilterThreshold; private final String filePath; @@ -73,7 +71,8 @@ public abstract class DeleteFilter { private List> isInDeleteSets = null; private Predicate eqDeleteRows = null; - protected DeleteFilter(String filePath, List deletes, Schema tableSchema, Schema requestedSchema) { + protected DeleteFilter( + String filePath, List deletes, Schema tableSchema, Schema requestedSchema) { this.setFilterThreshold = DEFAULT_SET_FILTER_THRESHOLD; this.filePath = filePath; @@ -88,7 +87,8 @@ protected DeleteFilter(String filePath, List deletes, Schema tableSc eqDeleteBuilder.add(delete); break; default: - throw new UnsupportedOperationException("Unknown delete file content: " + delete.content()); + throw new UnsupportedOperationException( + "Unknown delete file content: " + delete.content()); } } @@ -96,7 +96,8 @@ protected DeleteFilter(String filePath, List deletes, Schema tableSc this.eqDeletes = eqDeleteBuilder.build(); this.requiredSchema = fileProjection(tableSchema, requestedSchema, posDeletes, eqDeletes); this.posAccessor = requiredSchema.accessorForField(MetadataColumns.ROW_POSITION.fieldId()); - this.hasIsDeletedColumn = requiredSchema.findField(MetadataColumns.IS_DELETED.fieldId()) != null; + this.hasIsDeletedColumn = + requiredSchema.findField(MetadataColumns.IS_DELETED.fieldId()) != null; this.isDeletedColumnPosition = requiredSchema.columns().indexOf(MetadataColumns.IS_DELETED); } @@ -142,12 +143,14 @@ private List> applyEqDeletes() { return isInDeleteSets; } - Multimap, DeleteFile> filesByDeleteIds = Multimaps.newMultimap(Maps.newHashMap(), Lists::newArrayList); + Multimap, DeleteFile> filesByDeleteIds = + Multimaps.newMultimap(Maps.newHashMap(), Lists::newArrayList); for (DeleteFile delete : eqDeletes) { filesByDeleteIds.put(Sets.newHashSet(delete.equalityFieldIds()), delete); } - for (Map.Entry, Collection> entry : filesByDeleteIds.asMap().entrySet()) { + for (Map.Entry, Collection> entry : + filesByDeleteIds.asMap().entrySet()) { Set ids = entry.getKey(); Iterable deletes = entry.getValue(); @@ -157,17 +160,19 @@ private List> applyEqDeletes() { // a projection to select and reorder fields of the file schema to match the delete rows StructProjection projectRow = StructProjection.create(requiredSchema, deleteSchema); - Iterable> deleteRecords = Iterables.transform(deletes, - delete -> openDeletes(delete, deleteSchema)); + Iterable> deleteRecords = + Iterables.transform(deletes, delete -> openDeletes(delete, deleteSchema)); // copy the delete records because they will be held in a set - CloseableIterable records = CloseableIterable.transform( - CloseableIterable.concat(deleteRecords), Record::copy); + CloseableIterable records = + CloseableIterable.transform(CloseableIterable.concat(deleteRecords), Record::copy); - StructLikeSet deleteSet = Deletes.toEqualitySet( - 
CloseableIterable.transform(records, wrapper::copyFor), deleteSchema.asStruct()); + StructLikeSet deleteSet = + Deletes.toEqualitySet( + CloseableIterable.transform(records, wrapper::copyFor), deleteSchema.asStruct()); - Predicate isInDeleteSet = record -> deleteSet.contains(projectRow.wrap(asStructLike(record))); + Predicate isInDeleteSet = + record -> deleteSet.contains(projectRow.wrap(asStructLike(record))); isInDeleteSets.add(isInDeleteSet); } @@ -176,31 +181,26 @@ private List> applyEqDeletes() { public CloseableIterable findEqualityDeleteRows(CloseableIterable records) { // Predicate to test whether a row has been deleted by equality deletions. - Predicate deletedRows = applyEqDeletes().stream() - .reduce(Predicate::or) - .orElse(t -> false); + Predicate deletedRows = applyEqDeletes().stream().reduce(Predicate::or).orElse(t -> false); return CloseableIterable.filter(records, deletedRows); } private CloseableIterable applyEqDeletes(CloseableIterable records) { - Predicate isEqDeleted = applyEqDeletes().stream() - .reduce(Predicate::or) - .orElse(t -> false); + Predicate isEqDeleted = applyEqDeletes().stream().reduce(Predicate::or).orElse(t -> false); return createDeleteIterable(records, isEqDeleted); } protected void markRowDeleted(T item) { - throw new UnsupportedOperationException(this.getClass().getName() + " does not implement markRowDeleted"); + throw new UnsupportedOperationException( + this.getClass().getName() + " does not implement markRowDeleted"); } public Predicate eqDeletedRowFilter() { if (eqDeleteRows == null) { - eqDeleteRows = applyEqDeletes().stream() - .map(Predicate::negate) - .reduce(Predicate::and) - .orElse(t -> true); + eqDeleteRows = + applyEqDeletes().stream().map(Predicate::negate).reduce(Predicate::and).orElse(t -> true); } return eqDeleteRows; } @@ -231,15 +231,17 @@ private CloseableIterable applyPosDeletes(CloseableIterable records) { return createDeleteIterable(records, isDeleted); } - return hasIsDeletedColumn ? - Deletes.streamingMarker(records, this::pos, Deletes.deletePositions(filePath, deletes), this::markRowDeleted) : - Deletes.streamingFilter(records, this::pos, Deletes.deletePositions(filePath, deletes)); + return hasIsDeletedColumn + ? Deletes.streamingMarker( + records, this::pos, Deletes.deletePositions(filePath, deletes), this::markRowDeleted) + : Deletes.streamingFilter(records, this::pos, Deletes.deletePositions(filePath, deletes)); } - private CloseableIterable createDeleteIterable(CloseableIterable records, Predicate isDeleted) { - return hasIsDeletedColumn ? - Deletes.markDeleted(records, isDeleted, this::markRowDeleted) : - Deletes.filterDeleted(records, isDeleted); + private CloseableIterable createDeleteIterable( + CloseableIterable records, Predicate isDeleted) { + return hasIsDeletedColumn + ? 
Deletes.markDeleted(records, isDeleted, this::markRowDeleted) + : Deletes.filterDeleted(records, isDeleted); } private CloseableIterable openPosDeletes(DeleteFile file) { @@ -257,10 +259,12 @@ private CloseableIterable openDeletes(DeleteFile deleteFile, Schema dele .build(); case PARQUET: - Parquet.ReadBuilder builder = Parquet.read(input) - .project(deleteSchema) - .reuseContainers() - .createReaderFunc(fileSchema -> GenericParquetReaders.buildReader(deleteSchema, fileSchema)); + Parquet.ReadBuilder builder = + Parquet.read(input) + .project(deleteSchema) + .reuseContainers() + .createReaderFunc( + fileSchema -> GenericParquetReaders.buildReader(deleteSchema, fileSchema)); if (deleteFile.content() == FileContent.POSITION_DELETES) { builder.filter(Expressions.equal(MetadataColumns.DELETE_FILE_PATH.name(), filePath)); @@ -270,9 +274,11 @@ private CloseableIterable openDeletes(DeleteFile deleteFile, Schema dele case ORC: // Reusing containers is automatic for ORC. No need to set 'reuseContainers' here. - ORC.ReadBuilder orcBuilder = ORC.read(input) - .project(deleteSchema) - .createReaderFunc(fileSchema -> GenericOrcReader.buildReader(deleteSchema, fileSchema)); + ORC.ReadBuilder orcBuilder = + ORC.read(input) + .project(deleteSchema) + .createReaderFunc( + fileSchema -> GenericOrcReader.buildReader(deleteSchema, fileSchema)); if (deleteFile.content() == FileContent.POSITION_DELETES) { orcBuilder.filter(Expressions.equal(MetadataColumns.DELETE_FILE_PATH.name(), filePath)); @@ -280,13 +286,18 @@ private CloseableIterable openDeletes(DeleteFile deleteFile, Schema dele return orcBuilder.build(); default: - throw new UnsupportedOperationException(String.format( - "Cannot read deletes, %s is not a supported format: %s", deleteFile.format().name(), deleteFile.path())); + throw new UnsupportedOperationException( + String.format( + "Cannot read deletes, %s is not a supported format: %s", + deleteFile.format().name(), deleteFile.path())); } } - private static Schema fileProjection(Schema tableSchema, Schema requestedSchema, - List posDeletes, List eqDeletes) { + private static Schema fileProjection( + Schema tableSchema, + Schema requestedSchema, + List posDeletes, + List eqDeletes) { if (posDeletes.isEmpty() && eqDeletes.isEmpty()) { return requestedSchema; } @@ -300,17 +311,20 @@ private static Schema fileProjection(Schema tableSchema, Schema requestedSchema, requiredIds.addAll(eqDelete.equalityFieldIds()); } - Set missingIds = Sets.newLinkedHashSet( - Sets.difference(requiredIds, TypeUtil.getProjectedIds(requestedSchema))); + Set missingIds = + Sets.newLinkedHashSet( + Sets.difference(requiredIds, TypeUtil.getProjectedIds(requestedSchema))); if (missingIds.isEmpty()) { return requestedSchema; } - // TODO: support adding nested columns. this will currently fail when finding nested columns to add + // TODO: support adding nested columns. 
this will currently fail when finding nested columns to + // add List columns = Lists.newArrayList(requestedSchema.columns()); for (int fieldId : missingIds) { - if (fieldId == MetadataColumns.ROW_POSITION.fieldId() || fieldId == MetadataColumns.IS_DELETED.fieldId()) { + if (fieldId == MetadataColumns.ROW_POSITION.fieldId() + || fieldId == MetadataColumns.IS_DELETED.fieldId()) { continue; // add _pos and _deleted at the end } diff --git a/data/src/main/java/org/apache/iceberg/data/GenericAppenderFactory.java b/data/src/main/java/org/apache/iceberg/data/GenericAppenderFactory.java index 9caf2717661c..23a94ebc9944 100644 --- a/data/src/main/java/org/apache/iceberg/data/GenericAppenderFactory.java +++ b/data/src/main/java/org/apache/iceberg/data/GenericAppenderFactory.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.data; import java.io.IOException; @@ -42,9 +41,7 @@ import org.apache.iceberg.relocated.com.google.common.base.Preconditions; import org.apache.iceberg.relocated.com.google.common.collect.Maps; -/** - * Factory to create a new {@link FileAppender} to write {@link Record}s. - */ +/** Factory to create a new {@link FileAppender} to write {@link Record}s. */ public class GenericAppenderFactory implements FileAppenderFactory { private final Schema schema; @@ -62,10 +59,12 @@ public GenericAppenderFactory(Schema schema, PartitionSpec spec) { this(schema, spec, null, null, null); } - public GenericAppenderFactory(Schema schema, PartitionSpec spec, - int[] equalityFieldIds, - Schema eqDeleteRowSchema, - Schema posDeleteRowSchema) { + public GenericAppenderFactory( + Schema schema, + PartitionSpec spec, + int[] equalityFieldIds, + Schema eqDeleteRowSchema, + Schema posDeleteRowSchema) { this.schema = schema; this.spec = spec; this.equalityFieldIds = equalityFieldIds; @@ -116,7 +115,8 @@ public FileAppender newAppender(OutputFile outputFile, FileFormat fileFo .build(); default: - throw new UnsupportedOperationException("Cannot write unknown file format: " + fileFormat); + throw new UnsupportedOperationException( + "Cannot write unknown file format: " + fileFormat); } } catch (IOException e) { throw new UncheckedIOException(e); @@ -124,19 +124,25 @@ public FileAppender newAppender(OutputFile outputFile, FileFormat fileFo } @Override - public org.apache.iceberg.io.DataWriter newDataWriter(EncryptedOutputFile file, FileFormat format, - StructLike partition) { + public org.apache.iceberg.io.DataWriter newDataWriter( + EncryptedOutputFile file, FileFormat format, StructLike partition) { return new org.apache.iceberg.io.DataWriter<>( - newAppender(file.encryptingOutputFile(), format), format, - file.encryptingOutputFile().location(), spec, partition, file.keyMetadata()); + newAppender(file.encryptingOutputFile(), format), + format, + file.encryptingOutputFile().location(), + spec, + partition, + file.keyMetadata()); } @Override - public EqualityDeleteWriter newEqDeleteWriter(EncryptedOutputFile file, FileFormat format, - StructLike partition) { - Preconditions.checkState(equalityFieldIds != null && equalityFieldIds.length > 0, + public EqualityDeleteWriter newEqDeleteWriter( + EncryptedOutputFile file, FileFormat format, StructLike partition) { + Preconditions.checkState( + equalityFieldIds != null && equalityFieldIds.length > 0, "Equality field ids shouldn't be null or empty when creating equality-delete writer"); - Preconditions.checkNotNull(eqDeleteRowSchema, + Preconditions.checkNotNull( + eqDeleteRowSchema, 
"Equality delete row schema shouldn't be null when creating equality-delete writer"); MetricsConfig metricsConfig = MetricsConfig.fromProperties(config); @@ -190,8 +196,8 @@ public EqualityDeleteWriter newEqDeleteWriter(EncryptedOutputFile file, } @Override - public PositionDeleteWriter newPosDeleteWriter(EncryptedOutputFile file, FileFormat format, - StructLike partition) { + public PositionDeleteWriter newPosDeleteWriter( + EncryptedOutputFile file, FileFormat format, StructLike partition) { MetricsConfig metricsConfig = MetricsConfig.fromProperties(config); try { switch (format) { @@ -230,7 +236,8 @@ public PositionDeleteWriter newPosDeleteWriter(EncryptedOutputFile file, .buildPositionWriter(); default: - throw new UnsupportedOperationException("Cannot write pos-deletes for unsupported file format: " + format); + throw new UnsupportedOperationException( + "Cannot write pos-deletes for unsupported file format: " + format); } } catch (IOException e) { throw new UncheckedIOException(e); diff --git a/data/src/main/java/org/apache/iceberg/data/GenericDeleteFilter.java b/data/src/main/java/org/apache/iceberg/data/GenericDeleteFilter.java index cae426a93a7f..0779ed09ce1e 100644 --- a/data/src/main/java/org/apache/iceberg/data/GenericDeleteFilter.java +++ b/data/src/main/java/org/apache/iceberg/data/GenericDeleteFilter.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.data; import org.apache.iceberg.FileScanTask; @@ -29,7 +28,8 @@ public class GenericDeleteFilter extends DeleteFilter { private final FileIO io; private final InternalRecordWrapper asStructLike; - public GenericDeleteFilter(FileIO io, FileScanTask task, Schema tableSchema, Schema requestedSchema) { + public GenericDeleteFilter( + FileIO io, FileScanTask task, Schema tableSchema, Schema requestedSchema) { super(task.file().path().toString(), task.deletes(), tableSchema, requestedSchema); this.io = io; this.asStructLike = new InternalRecordWrapper(requiredSchema().asStruct()); diff --git a/data/src/main/java/org/apache/iceberg/data/GenericReader.java b/data/src/main/java/org/apache/iceberg/data/GenericReader.java index 913e7835f515..3637bf00bd58 100644 --- a/data/src/main/java/org/apache/iceberg/data/GenericReader.java +++ b/data/src/main/java/org/apache/iceberg/data/GenericReader.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.data; import java.io.Serializable; @@ -61,7 +60,8 @@ class GenericReader implements Serializable { } CloseableIterator open(CloseableIterable tasks) { - Iterable fileTasks = Iterables.concat(Iterables.transform(tasks, CombinedScanTask::files)); + Iterable fileTasks = + Iterables.concat(Iterables.transform(tasks, CombinedScanTask::files)); return CloseableIterable.concat(Iterables.transform(fileTasks, this::open)).iterator(); } @@ -80,8 +80,8 @@ public CloseableIterable open(FileScanTask task) { return records; } - private CloseableIterable applyResidual(CloseableIterable records, Schema recordSchema, - Expression residual) { + private CloseableIterable applyResidual( + CloseableIterable records, Schema recordSchema, Expression residual) { if (residual != null && residual != Expressions.alwaysTrue()) { InternalRecordWrapper wrapper = new InternalRecordWrapper(recordSchema.asStruct()); Evaluator filter = new Evaluator(recordSchema.asStruct(), residual, caseSensitive); @@ -91,18 +91,19 @@ private CloseableIterable applyResidual(CloseableIterable record return records; } - private CloseableIterable openFile(FileScanTask task, Schema fileProjection) { InputFile input = io.newInputFile(task.file().path().toString()); - Map partition = PartitionUtil.constantsMap(task, IdentityPartitionConverters::convertConstant); + Map partition = + PartitionUtil.constantsMap(task, IdentityPartitionConverters::convertConstant); switch (task.file().format()) { case AVRO: - Avro.ReadBuilder avro = Avro.read(input) - .project(fileProjection) - .createReaderFunc( - avroSchema -> DataReader.create(fileProjection, avroSchema, partition)) - .split(task.start(), task.length()); + Avro.ReadBuilder avro = + Avro.read(input) + .project(fileProjection) + .createReaderFunc( + avroSchema -> DataReader.create(fileProjection, avroSchema, partition)) + .split(task.start(), task.length()); if (reuseContainers) { avro.reuseContainers(); @@ -111,11 +112,14 @@ private CloseableIterable openFile(FileScanTask task, Schema fileProject return avro.build(); case PARQUET: - Parquet.ReadBuilder parquet = Parquet.read(input) - .project(fileProjection) - .createReaderFunc(fileSchema -> GenericParquetReaders.buildReader(fileProjection, fileSchema, partition)) - .split(task.start(), task.length()) - .filter(task.residual()); + Parquet.ReadBuilder parquet = + Parquet.read(input) + .project(fileProjection) + .createReaderFunc( + fileSchema -> + GenericParquetReaders.buildReader(fileProjection, fileSchema, partition)) + .split(task.start(), task.length()) + .filter(task.residual()); if (reuseContainers) { parquet.reuseContainers(); @@ -124,19 +128,24 @@ private CloseableIterable openFile(FileScanTask task, Schema fileProject return parquet.build(); case ORC: - Schema projectionWithoutConstantAndMetadataFields = TypeUtil.selectNot(fileProjection, - Sets.union(partition.keySet(), MetadataColumns.metadataFieldIds())); - ORC.ReadBuilder orc = ORC.read(input) - .project(projectionWithoutConstantAndMetadataFields) - .createReaderFunc(fileSchema -> GenericOrcReader.buildReader(fileProjection, fileSchema, partition)) - .split(task.start(), task.length()) - .filter(task.residual()); + Schema projectionWithoutConstantAndMetadataFields = + TypeUtil.selectNot( + fileProjection, Sets.union(partition.keySet(), MetadataColumns.metadataFieldIds())); + ORC.ReadBuilder orc = + ORC.read(input) + .project(projectionWithoutConstantAndMetadataFields) + .createReaderFunc( + fileSchema -> + GenericOrcReader.buildReader(fileProjection, 
fileSchema, partition)) + .split(task.start(), task.length()) + .filter(task.residual()); return orc.build(); default: - throw new UnsupportedOperationException(String.format("Cannot read %s file: %s", - task.file().format().name(), task.file().path())); + throw new UnsupportedOperationException( + String.format( + "Cannot read %s file: %s", task.file().format().name(), task.file().path())); } } @@ -149,8 +158,9 @@ private CombinedTaskIterable(CombinedScanTask task) { @Override public CloseableIterator iterator() { - CloseableIterator iter = CloseableIterable.concat( - Iterables.transform(task.files(), GenericReader.this::open)).iterator(); + CloseableIterator iter = + CloseableIterable.concat(Iterables.transform(task.files(), GenericReader.this::open)) + .iterator(); addCloseable(iter); return iter; } diff --git a/data/src/main/java/org/apache/iceberg/data/IcebergGenerics.java b/data/src/main/java/org/apache/iceberg/data/IcebergGenerics.java index 9cea82292384..966427855934 100644 --- a/data/src/main/java/org/apache/iceberg/data/IcebergGenerics.java +++ b/data/src/main/java/org/apache/iceberg/data/IcebergGenerics.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.data; import org.apache.iceberg.Schema; @@ -27,8 +26,7 @@ import org.apache.iceberg.relocated.com.google.common.collect.ImmutableList; public class IcebergGenerics { - private IcebergGenerics() { - } + private IcebergGenerics() {} /** * Returns a builder to configure a read of the given table that produces generic records. @@ -94,10 +92,7 @@ public ScanBuilder appendsAfter(long fromSnapshotId) { } public CloseableIterable build() { - return new TableScanIterable( - tableScan, - reuseContainers - ); + return new TableScanIterable(tableScan, reuseContainers); } } } diff --git a/data/src/main/java/org/apache/iceberg/data/InternalRecordWrapper.java b/data/src/main/java/org/apache/iceberg/data/InternalRecordWrapper.java index 6378e383f7c1..871000279848 100644 --- a/data/src/main/java/org/apache/iceberg/data/InternalRecordWrapper.java +++ b/data/src/main/java/org/apache/iceberg/data/InternalRecordWrapper.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.data; import java.lang.reflect.Array; @@ -37,9 +36,11 @@ public class InternalRecordWrapper implements StructLike { @SuppressWarnings("unchecked") public InternalRecordWrapper(Types.StructType struct) { - this(struct.fields().stream() - .map(field -> converter(field.type())) - .toArray(length -> (Function[]) Array.newInstance(Function.class, length))); + this( + struct.fields().stream() + .map(field -> converter(field.type())) + .toArray( + length -> (Function[]) Array.newInstance(Function.class, length))); } private InternalRecordWrapper(Function[] transforms) { diff --git a/data/src/main/java/org/apache/iceberg/data/TableMigrationUtil.java b/data/src/main/java/org/apache/iceberg/data/TableMigrationUtil.java index 8e967b200bea..aa1885a31e8c 100644 --- a/data/src/main/java/org/apache/iceberg/data/TableMigrationUtil.java +++ b/data/src/main/java/org/apache/iceberg/data/TableMigrationUtil.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.data; import java.io.IOException; @@ -53,52 +52,63 @@ public class TableMigrationUtil { private static final PathFilter HIDDEN_PATH_FILTER = p -> !p.getName().startsWith("_") && !p.getName().startsWith("."); - private TableMigrationUtil() { - } + private TableMigrationUtil() {} /** * Returns the data files in a partition by listing the partition location. - *
<p> - * For Parquet and ORC partitions, this will read metrics from the file footer. For Avro partitions, - * metrics are set to null. - * <p> - * Note: certain metrics, like NaN counts, that are only supported by iceberg file writers but not file footers, - * will not be populated. + * + * <p>For Parquet and ORC partitions, this will read metrics from the file footer. For Avro + * partitions, metrics are set to null. + * + * <p>
Note: certain metrics, like NaN counts, that are only supported by iceberg file writers but + * not file footers, will not be populated. * * @param partition partition key, e.g., "a=1/b=2" - * @param uri partition location URI - * @param format partition format, avro, parquet or orc - * @param spec a partition spec - * @param conf a Hadoop conf + * @param uri partition location URI + * @param format partition format, avro, parquet or orc + * @param spec a partition spec + * @param conf a Hadoop conf * @param metricsConfig a metrics conf - * @param mapping a name mapping + * @param mapping a name mapping * @return a List of DataFile */ - public static List listPartition(Map partition, String uri, String format, - PartitionSpec spec, Configuration conf, MetricsConfig metricsConfig, - NameMapping mapping) { + public static List listPartition( + Map partition, + String uri, + String format, + PartitionSpec spec, + Configuration conf, + MetricsConfig metricsConfig, + NameMapping mapping) { return listPartition(partition, uri, format, spec, conf, metricsConfig, mapping, 1); } - public static List listPartition(Map partitionPath, String partitionUri, String format, - PartitionSpec spec, Configuration conf, MetricsConfig metricsSpec, - NameMapping mapping, int parallelism) { + public static List listPartition( + Map partitionPath, + String partitionUri, + String format, + PartitionSpec spec, + Configuration conf, + MetricsConfig metricsSpec, + NameMapping mapping, + int parallelism) { ExecutorService service = null; try { - String partitionKey = spec.fields().stream() + String partitionKey = + spec.fields().stream() .map(PartitionField::name) .map(name -> String.format("%s=%s", name, partitionPath.get(name))) .collect(Collectors.joining("/")); Path partition = new Path(partitionUri); FileSystem fs = partition.getFileSystem(conf); - List fileStatus = Arrays.stream(fs.listStatus(partition, HIDDEN_PATH_FILTER)) + List fileStatus = + Arrays.stream(fs.listStatus(partition, HIDDEN_PATH_FILTER)) .filter(FileStatus::isFile) .collect(Collectors.toList()); DataFile[] datafiles = new DataFile[fileStatus.size()]; - Tasks.Builder task = Tasks.range(fileStatus.size()) - .stopOnFailure() - .throwFailureWhenFinished(); + Tasks.Builder task = + Tasks.range(fileStatus.size()).stopOnFailure().throwFailureWhenFinished(); if (parallelism > 1) { service = migrationService(parallelism); @@ -106,20 +116,28 @@ public static List listPartition(Map partitionPath, St } if (format.contains("avro")) { - task.run(index -> { - Metrics metrics = getAvroMetrics(fileStatus.get(index).getPath(), conf); - datafiles[index] = buildDataFile(fileStatus.get(index), partitionKey, spec, metrics, "avro"); - }); + task.run( + index -> { + Metrics metrics = getAvroMetrics(fileStatus.get(index).getPath(), conf); + datafiles[index] = + buildDataFile(fileStatus.get(index), partitionKey, spec, metrics, "avro"); + }); } else if (format.contains("parquet")) { - task.run(index -> { - Metrics metrics = getParquetMetrics(fileStatus.get(index).getPath(), conf, metricsSpec, mapping); - datafiles[index] = buildDataFile(fileStatus.get(index), partitionKey, spec, metrics, "parquet"); - }); + task.run( + index -> { + Metrics metrics = + getParquetMetrics(fileStatus.get(index).getPath(), conf, metricsSpec, mapping); + datafiles[index] = + buildDataFile(fileStatus.get(index), partitionKey, spec, metrics, "parquet"); + }); } else if (format.contains("orc")) { - task.run(index -> { - Metrics metrics = getOrcMetrics(fileStatus.get(index).getPath(), conf, 
metricsSpec, mapping); - datafiles[index] = buildDataFile(fileStatus.get(index), partitionKey, spec, metrics, "orc"); - }); + task.run( + index -> { + Metrics metrics = + getOrcMetrics(fileStatus.get(index).getPath(), conf, metricsSpec, mapping); + datafiles[index] = + buildDataFile(fileStatus.get(index), partitionKey, spec, metrics, "orc"); + }); } else { throw new UnsupportedOperationException("Unknown partition format: " + format); } @@ -133,56 +151,51 @@ public static List listPartition(Map partitionPath, St } } - private static Metrics getAvroMetrics(Path path, Configuration conf) { + private static Metrics getAvroMetrics(Path path, Configuration conf) { try { InputFile file = HadoopInputFile.fromPath(path, conf); long rowCount = Avro.rowCount(file); return new Metrics(rowCount, null, null, null, null); } catch (UncheckedIOException e) { - throw new RuntimeException("Unable to read Avro file: " + - path, e); + throw new RuntimeException("Unable to read Avro file: " + path, e); } } - private static Metrics getParquetMetrics(Path path, Configuration conf, - MetricsConfig metricsSpec, NameMapping mapping) { + private static Metrics getParquetMetrics( + Path path, Configuration conf, MetricsConfig metricsSpec, NameMapping mapping) { try { InputFile file = HadoopInputFile.fromPath(path, conf); return ParquetUtil.fileMetrics(file, metricsSpec, mapping); } catch (UncheckedIOException e) { - throw new RuntimeException("Unable to read the metrics of the Parquet file: " + - path, e); + throw new RuntimeException("Unable to read the metrics of the Parquet file: " + path, e); } } - private static Metrics getOrcMetrics(Path path, Configuration conf, - MetricsConfig metricsSpec, NameMapping mapping) { + private static Metrics getOrcMetrics( + Path path, Configuration conf, MetricsConfig metricsSpec, NameMapping mapping) { try { - return OrcMetrics.fromInputFile(HadoopInputFile.fromPath(path, conf), - metricsSpec, mapping); + return OrcMetrics.fromInputFile(HadoopInputFile.fromPath(path, conf), metricsSpec, mapping); } catch (UncheckedIOException e) { - throw new RuntimeException("Unable to read the metrics of the Orc file: " + - path, e); + throw new RuntimeException("Unable to read the metrics of the Orc file: " + path, e); } } - private static DataFile buildDataFile(FileStatus stat, String partitionKey, - PartitionSpec spec, Metrics metrics, String format) { - return DataFiles.builder(spec) - .withPath(stat.getPath().toString()) - .withFormat(format) - .withFileSizeInBytes(stat.getLen()) - .withMetrics(metrics) - .withPartitionPath(partitionKey) - .build(); + private static DataFile buildDataFile( + FileStatus stat, String partitionKey, PartitionSpec spec, Metrics metrics, String format) { + return DataFiles.builder(spec) + .withPath(stat.getPath().toString()) + .withFormat(format) + .withFileSizeInBytes(stat.getLen()) + .withMetrics(metrics) + .withPartitionPath(partitionKey) + .build(); } private static ExecutorService migrationService(int concurrentDeletes) { return MoreExecutors.getExitingExecutorService( - (ThreadPoolExecutor) Executors.newFixedThreadPool( - concurrentDeletes, - new ThreadFactoryBuilder() - .setNameFormat("table-migration-%d") - .build())); + (ThreadPoolExecutor) + Executors.newFixedThreadPool( + concurrentDeletes, + new ThreadFactoryBuilder().setNameFormat("table-migration-%d").build())); } } diff --git a/data/src/main/java/org/apache/iceberg/data/TableScanIterable.java b/data/src/main/java/org/apache/iceberg/data/TableScanIterable.java index fdbfaab27c80..baf610b21679 
100644 --- a/data/src/main/java/org/apache/iceberg/data/TableScanIterable.java +++ b/data/src/main/java/org/apache/iceberg/data/TableScanIterable.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.data; import java.io.IOException; diff --git a/data/src/test/java/org/apache/iceberg/RecordWrapperTest.java b/data/src/test/java/org/apache/iceberg/RecordWrapperTest.java index 06814728869e..1084958f528b 100644 --- a/data/src/test/java/org/apache/iceberg/RecordWrapperTest.java +++ b/data/src/test/java/org/apache/iceberg/RecordWrapperTest.java @@ -16,46 +16,46 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg; +import static org.apache.iceberg.types.Types.NestedField.optional; +import static org.apache.iceberg.types.Types.NestedField.required; + import org.apache.iceberg.types.Types; import org.apache.iceberg.util.StructLikeWrapper; import org.junit.Assert; import org.junit.Test; -import static org.apache.iceberg.types.Types.NestedField.optional; -import static org.apache.iceberg.types.Types.NestedField.required; - public abstract class RecordWrapperTest { - private static final Types.StructType PRIMITIVE_WITHOUT_TIME = Types.StructType.of( - required(100, "id", Types.LongType.get()), - optional(101, "data", Types.StringType.get()), - required(102, "b", Types.BooleanType.get()), - optional(103, "i", Types.IntegerType.get()), - required(104, "l", Types.LongType.get()), - optional(105, "f", Types.FloatType.get()), - required(106, "d", Types.DoubleType.get()), - optional(107, "date", Types.DateType.get()), - required(108, "ts_tz", Types.TimestampType.withZone()), - required(110, "s", Types.StringType.get()), - required(112, "fixed", Types.FixedType.ofLength(7)), - optional(113, "bytes", Types.BinaryType.get()), - required(114, "dec_9_0", Types.DecimalType.of(9, 0)), - required(115, "dec_11_2", Types.DecimalType.of(11, 2)), - required(116, "dec_38_10", Types.DecimalType.of(38, 10))// maximum precision - ); + private static final Types.StructType PRIMITIVE_WITHOUT_TIME = + Types.StructType.of( + required(100, "id", Types.LongType.get()), + optional(101, "data", Types.StringType.get()), + required(102, "b", Types.BooleanType.get()), + optional(103, "i", Types.IntegerType.get()), + required(104, "l", Types.LongType.get()), + optional(105, "f", Types.FloatType.get()), + required(106, "d", Types.DoubleType.get()), + optional(107, "date", Types.DateType.get()), + required(108, "ts_tz", Types.TimestampType.withZone()), + required(110, "s", Types.StringType.get()), + required(112, "fixed", Types.FixedType.ofLength(7)), + optional(113, "bytes", Types.BinaryType.get()), + required(114, "dec_9_0", Types.DecimalType.of(9, 0)), + required(115, "dec_11_2", Types.DecimalType.of(11, 2)), + required(116, "dec_38_10", Types.DecimalType.of(38, 10)) // maximum precision + ); - private static final Types.StructType TIMESTAMP_WITHOUT_ZONE = Types.StructType.of( - required(101, "ts0", Types.TimestampType.withoutZone()), - required(102, "ts1", Types.TimestampType.withoutZone()) - ); + private static final Types.StructType TIMESTAMP_WITHOUT_ZONE = + Types.StructType.of( + required(101, "ts0", Types.TimestampType.withoutZone()), + required(102, "ts1", Types.TimestampType.withoutZone())); - protected static final Types.StructType TIME = Types.StructType.of( - required(100, "time0", Types.TimeType.get()), - optional(101, "time1", Types.TimeType.get()) - ); + protected static final 
Types.StructType TIME = + Types.StructType.of( + required(100, "time0", Types.TimeType.get()), + optional(101, "time1", Types.TimeType.get())); @Test public void testSimpleStructWithoutTime() { @@ -74,20 +74,30 @@ public void testTime() { @Test public void testNestedSchema() { - Types.StructType structType = Types.StructType.of( - required(0, "id", Types.LongType.get()), - required(1, "level1", Types.StructType.of( - optional(2, "level2", Types.StructType.of( - required(3, "level3", Types.StructType.of( - optional(4, "level4", Types.StructType.of( - required(5, "level5", Types.StructType.of( - PRIMITIVE_WITHOUT_TIME.fields() - )) - )) - )) - )) - )) - ); + Types.StructType structType = + Types.StructType.of( + required(0, "id", Types.LongType.get()), + required( + 1, + "level1", + Types.StructType.of( + optional( + 2, + "level2", + Types.StructType.of( + required( + 3, + "level3", + Types.StructType.of( + optional( + 4, + "level4", + Types.StructType.of( + required( + 5, + "level5", + Types.StructType.of( + PRIMITIVE_WITHOUT_TIME.fields()))))))))))); generateAndValidate(new Schema(structType.fields())); } diff --git a/data/src/test/java/org/apache/iceberg/TestGenericAppenderFactory.java b/data/src/test/java/org/apache/iceberg/TestGenericAppenderFactory.java index 60dcd633eda4..8f27e734b42c 100644 --- a/data/src/test/java/org/apache/iceberg/TestGenericAppenderFactory.java +++ b/data/src/test/java/org/apache/iceberg/TestGenericAppenderFactory.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg; import java.util.List; @@ -39,11 +38,14 @@ public TestGenericAppenderFactory(String fileFormat, boolean partitioned) { } @Override - protected FileAppenderFactory createAppenderFactory(List equalityFieldIds, - Schema eqDeleteSchema, - Schema posDeleteRowSchema) { - return new GenericAppenderFactory(table.schema(), table.spec(), ArrayUtil.toIntArray(equalityFieldIds), - eqDeleteSchema, posDeleteRowSchema); + protected FileAppenderFactory createAppenderFactory( + List equalityFieldIds, Schema eqDeleteSchema, Schema posDeleteRowSchema) { + return new GenericAppenderFactory( + table.schema(), + table.spec(), + ArrayUtil.toIntArray(equalityFieldIds), + eqDeleteSchema, + posDeleteRowSchema); } @Override diff --git a/data/src/test/java/org/apache/iceberg/TestMergingMetrics.java b/data/src/test/java/org/apache/iceberg/TestMergingMetrics.java index acbb33ff51bc..0e8464c51275 100644 --- a/data/src/test/java/org/apache/iceberg/TestMergingMetrics.java +++ b/data/src/test/java/org/apache/iceberg/TestMergingMetrics.java @@ -16,9 +16,11 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg; +import static org.apache.iceberg.types.Types.NestedField.optional; +import static org.apache.iceberg.types.Types.NestedField.required; + import java.nio.ByteBuffer; import java.util.Collection; import java.util.List; @@ -46,62 +48,73 @@ import org.junit.runner.RunWith; import org.junit.runners.Parameterized; -import static org.apache.iceberg.types.Types.NestedField.optional; -import static org.apache.iceberg.types.Types.NestedField.required; - @RunWith(Parameterized.class) public abstract class TestMergingMetrics { - // all supported fields, except for UUID which is on deprecation path: see https://github.com/apache/iceberg/pull/1611 - // as well as Types.TimeType and Types.TimestampType.withoutZone as both are not supported by Spark + // all supported fields, except for UUID which is on deprecation path: see + // https://github.com/apache/iceberg/pull/1611 + // as well as Types.TimeType and Types.TimestampType.withoutZone as both are not supported by + // Spark protected static final Types.NestedField ID_FIELD = required(1, "id", Types.IntegerType.get()); protected static final Types.NestedField DATA_FIELD = optional(2, "data", Types.StringType.get()); - protected static final Types.NestedField FLOAT_FIELD = required(3, "float", Types.FloatType.get()); - protected static final Types.NestedField DOUBLE_FIELD = optional(4, "double", Types.DoubleType.get()); - protected static final Types.NestedField DECIMAL_FIELD = optional(5, "decimal", Types.DecimalType.of(5, 3)); - protected static final Types.NestedField FIXED_FIELD = optional(7, "fixed", Types.FixedType.ofLength(4)); - protected static final Types.NestedField BINARY_FIELD = optional(8, "binary", Types.BinaryType.get()); - protected static final Types.NestedField FLOAT_LIST = optional(9, "floatlist", - Types.ListType.ofRequired(10, Types.FloatType.get())); + protected static final Types.NestedField FLOAT_FIELD = + required(3, "float", Types.FloatType.get()); + protected static final Types.NestedField DOUBLE_FIELD = + optional(4, "double", Types.DoubleType.get()); + protected static final Types.NestedField DECIMAL_FIELD = + optional(5, "decimal", Types.DecimalType.of(5, 3)); + protected static final Types.NestedField FIXED_FIELD = + optional(7, "fixed", Types.FixedType.ofLength(4)); + protected static final Types.NestedField BINARY_FIELD = + optional(8, "binary", Types.BinaryType.get()); + protected static final Types.NestedField FLOAT_LIST = + optional(9, "floatlist", Types.ListType.ofRequired(10, Types.FloatType.get())); protected static final Types.NestedField LONG_FIELD = optional(11, "long", Types.LongType.get()); - protected static final Types.NestedField MAP_FIELD_1 = optional(17, "map1", - Types.MapType.ofOptional(18, 19, Types.FloatType.get(), Types.StringType.get()) - ); - protected static final Types.NestedField MAP_FIELD_2 = optional(20, "map2", - Types.MapType.ofOptional(21, 22, Types.IntegerType.get(), Types.DoubleType.get()) - ); - protected static final Types.NestedField STRUCT_FIELD = optional(23, "structField", Types.StructType.of( - required(24, "booleanField", Types.BooleanType.get()), - optional(25, "date", Types.DateType.get()), - optional(27, "timestamp", Types.TimestampType.withZone()) - )); - - private static final Map FIELDS_WITH_NAN_COUNT_TO_ID = ImmutableMap.of( - FLOAT_FIELD, 3, DOUBLE_FIELD, 4, FLOAT_LIST, 10, MAP_FIELD_1, 18, MAP_FIELD_2, 22 - ); + protected static final Types.NestedField MAP_FIELD_1 = + optional( + 17, + "map1", + Types.MapType.ofOptional(18, 19, 
Types.FloatType.get(), Types.StringType.get())); + protected static final Types.NestedField MAP_FIELD_2 = + optional( + 20, + "map2", + Types.MapType.ofOptional(21, 22, Types.IntegerType.get(), Types.DoubleType.get())); + protected static final Types.NestedField STRUCT_FIELD = + optional( + 23, + "structField", + Types.StructType.of( + required(24, "booleanField", Types.BooleanType.get()), + optional(25, "date", Types.DateType.get()), + optional(27, "timestamp", Types.TimestampType.withZone()))); + + private static final Map FIELDS_WITH_NAN_COUNT_TO_ID = + ImmutableMap.of( + FLOAT_FIELD, 3, DOUBLE_FIELD, 4, FLOAT_LIST, 10, MAP_FIELD_1, 18, MAP_FIELD_2, 22); // create a schema with all supported fields - protected static final Schema SCHEMA = new Schema( - ID_FIELD, - DATA_FIELD, - FLOAT_FIELD, - DOUBLE_FIELD, - DECIMAL_FIELD, - FIXED_FIELD, - BINARY_FIELD, - FLOAT_LIST, - LONG_FIELD, - MAP_FIELD_1, - MAP_FIELD_2, - STRUCT_FIELD - ); + protected static final Schema SCHEMA = + new Schema( + ID_FIELD, + DATA_FIELD, + FLOAT_FIELD, + DOUBLE_FIELD, + DECIMAL_FIELD, + FIXED_FIELD, + BINARY_FIELD, + FLOAT_LIST, + LONG_FIELD, + MAP_FIELD_1, + MAP_FIELD_2, + STRUCT_FIELD); protected final FileFormat fileFormat; @Parameterized.Parameters(name = "fileFormat = {0}") public static Object[] parameters() { - return new Object[] { FileFormat.PARQUET, FileFormat.ORC }; + return new Object[] {FileFormat.PARQUET, FileFormat.ORC}; } public TestMergingMetrics(FileFormat fileFormat) { @@ -110,8 +123,7 @@ public TestMergingMetrics(FileFormat fileFormat) { protected abstract FileAppender writeAndGetAppender(List records) throws Exception; - @Rule - public TemporaryFolder temp = new TemporaryFolder(); + @Rule public TemporaryFolder temp = new TemporaryFolder(); @Test public void verifySingleRecordMetric() throws Exception { @@ -119,10 +131,15 @@ public void verifySingleRecordMetric() throws Exception { record.setField(ID_FIELD.name(), 3); record.setField(FLOAT_FIELD.name(), Float.NaN); // FLOAT_FIELD - 1 record.setField(DOUBLE_FIELD.name(), Double.NaN); // DOUBLE_FIELD - 1 - record.setField(FLOAT_LIST.name(), ImmutableList.of(3.3F, 2.8F, Float.NaN, -25.1F, Float.NaN)); // FLOAT_LIST - 2 - record.setField(MAP_FIELD_1.name(), ImmutableMap.of(Float.NaN, "a", 0F, "b")); // MAP_FIELD_1 - 1 - record.setField(MAP_FIELD_2.name(), ImmutableMap.of( - 0, 0D, 1, Double.NaN, 2, 2D, 3, Double.NaN, 4, Double.NaN)); // MAP_FIELD_2 - 3 + record.setField( + FLOAT_LIST.name(), + ImmutableList.of(3.3F, 2.8F, Float.NaN, -25.1F, Float.NaN)); // FLOAT_LIST - 2 + record.setField( + MAP_FIELD_1.name(), ImmutableMap.of(Float.NaN, "a", 0F, "b")); // MAP_FIELD_1 - 1 + record.setField( + MAP_FIELD_2.name(), + ImmutableMap.of( + 0, 0D, 1, Double.NaN, 2, 2D, 3, Double.NaN, 4, Double.NaN)); // MAP_FIELD_2 - 3 FileAppender appender = writeAndGetAppender(ImmutableList.of(record)); Metrics metrics = appender.metrics(); @@ -163,24 +180,32 @@ public void verifyRandomlyGeneratedRecordsMetric() throws Exception { populateExpectedValues(recordList, expectedUpperBounds, expectedLowerBounds, expectedNaNCount); Metrics metrics = appender.metrics(); - expectedUpperBounds.forEach((key, value) -> assertBoundValueMatch(value.get(), metrics.upperBounds(), key)); - expectedLowerBounds.forEach((key, value) -> assertBoundValueMatch(value.get(), metrics.lowerBounds(), key)); - expectedNaNCount.forEach((key, value) -> assertNaNCountMatch(value.get(), metrics.nanValueCounts(), key)); + expectedUpperBounds.forEach( + (key, value) -> assertBoundValueMatch(value.get(), 
metrics.upperBounds(), key)); + expectedLowerBounds.forEach( + (key, value) -> assertBoundValueMatch(value.get(), metrics.lowerBounds(), key)); + expectedNaNCount.forEach( + (key, value) -> assertNaNCountMatch(value.get(), metrics.nanValueCounts(), key)); SCHEMA.columns().stream() .filter(column -> !FIELDS_WITH_NAN_COUNT_TO_ID.containsKey(column)) .map(Types.NestedField::fieldId) - .forEach(id -> Assert.assertNull("NaN count for field %s should be null", - metrics.nanValueCounts().get(id))); + .forEach( + id -> + Assert.assertNull( + "NaN count for field %s should be null", metrics.nanValueCounts().get(id))); } - private void assertNaNCountMatch(Long expected, Map nanValueCount, Types.NestedField field) { + private void assertNaNCountMatch( + Long expected, Map nanValueCount, Types.NestedField field) { Assert.assertEquals( String.format("NaN count for field %s does not match expected", field.name()), - expected, nanValueCount.get(FIELDS_WITH_NAN_COUNT_TO_ID.get(field))); + expected, + nanValueCount.get(FIELDS_WITH_NAN_COUNT_TO_ID.get(field))); } - private void assertBoundValueMatch(Number expected, Map boundMap, Types.NestedField field) { + private void assertBoundValueMatch( + Number expected, Map boundMap, Types.NestedField field) { if (field.type().isNestedType() && fileFormat == FileFormat.ORC) { // we don't update floating column bounds values within ORC nested columns return; @@ -189,37 +214,51 @@ private void assertBoundValueMatch(Number expected, Map bou int actualFieldId = FIELDS_WITH_NAN_COUNT_TO_ID.get(field); ByteBuffer byteBuffer = boundMap.get(actualFieldId); Type type = SCHEMA.findType(actualFieldId); - Assert.assertEquals(String.format("Bound value for field %s must match", field.name()), - expected, byteBuffer == null ? null : Conversions.fromByteBuffer(type, byteBuffer)); + Assert.assertEquals( + String.format("Bound value for field %s must match", field.name()), + expected, + byteBuffer == null ? 
null : Conversions.fromByteBuffer(type, byteBuffer)); } - private void populateExpectedValues(List records, - Map> upperBounds, - Map> lowerBounds, - Map expectedNaNCount) { + private void populateExpectedValues( + List records, + Map> upperBounds, + Map> lowerBounds, + Map expectedNaNCount) { for (Types.NestedField field : FIELDS_WITH_NAN_COUNT_TO_ID.keySet()) { expectedNaNCount.put(field, new AtomicLong(0)); } for (Record record : records) { - updateExpectedValuePerRecord(upperBounds, lowerBounds, expectedNaNCount, - FLOAT_FIELD, (Float) record.getField(FLOAT_FIELD.name())); - updateExpectedValuePerRecord(upperBounds, lowerBounds, expectedNaNCount, - DOUBLE_FIELD, (Double) record.getField(DOUBLE_FIELD.name())); + updateExpectedValuePerRecord( + upperBounds, + lowerBounds, + expectedNaNCount, + FLOAT_FIELD, + (Float) record.getField(FLOAT_FIELD.name())); + updateExpectedValuePerRecord( + upperBounds, + lowerBounds, + expectedNaNCount, + DOUBLE_FIELD, + (Double) record.getField(DOUBLE_FIELD.name())); List floatList = (List) record.getField(FLOAT_LIST.name()); if (floatList != null) { - updateExpectedValueFromRecords(upperBounds, lowerBounds, expectedNaNCount, FLOAT_LIST, floatList); + updateExpectedValueFromRecords( + upperBounds, lowerBounds, expectedNaNCount, FLOAT_LIST, floatList); } Map map1 = (Map) record.getField(MAP_FIELD_1.name()); if (map1 != null) { - updateExpectedValueFromRecords(upperBounds, lowerBounds, expectedNaNCount, MAP_FIELD_1, map1.keySet()); + updateExpectedValueFromRecords( + upperBounds, lowerBounds, expectedNaNCount, MAP_FIELD_1, map1.keySet()); } Map map2 = (Map) record.getField(MAP_FIELD_2.name()); if (map2 != null) { - updateExpectedValueFromRecords(upperBounds, lowerBounds, expectedNaNCount, MAP_FIELD_2, map2.values()); + updateExpectedValueFromRecords( + upperBounds, lowerBounds, expectedNaNCount, MAP_FIELD_2, map2.values()); } } } @@ -228,22 +267,30 @@ private void updateExpectedValueFromRecords( Map> upperBounds, Map> lowerBounds, Map expectedNaNCount, - Types.NestedField key, Collection vals) { - List nonNullNumbers = vals.stream().filter(v -> !NaNUtil.isNaN(v)).collect(Collectors.toList()); - Optional maxOptional = nonNullNumbers.stream().filter(Objects::nonNull) - .reduce((v1, v2) -> getMinOrMax(v1, v2, true)); - Optional minOptional = nonNullNumbers.stream().filter(Objects::nonNull) - .reduce((v1, v2) -> getMinOrMax(v1, v2, false)); + Types.NestedField key, + Collection vals) { + List nonNullNumbers = + vals.stream().filter(v -> !NaNUtil.isNaN(v)).collect(Collectors.toList()); + Optional maxOptional = + nonNullNumbers.stream() + .filter(Objects::nonNull) + .reduce((v1, v2) -> getMinOrMax(v1, v2, true)); + Optional minOptional = + nonNullNumbers.stream() + .filter(Objects::nonNull) + .reduce((v1, v2) -> getMinOrMax(v1, v2, false)); expectedNaNCount.get(key).addAndGet(vals.size() - nonNullNumbers.size()); maxOptional.ifPresent(max -> updateBound(key, max, upperBounds, true)); minOptional.ifPresent(min -> updateBound(key, min, lowerBounds, false)); } - private void updateExpectedValuePerRecord(Map> upperBounds, - Map> lowerBounds, - Map expectedNaNCount, - Types.NestedField key, Number val) { + private void updateExpectedValuePerRecord( + Map> upperBounds, + Map> lowerBounds, + Map expectedNaNCount, + Types.NestedField key, + Number val) { if (NaNUtil.isNaN(val)) { expectedNaNCount.get(key).incrementAndGet(); } else if (val != null) { @@ -253,13 +300,20 @@ private void updateExpectedValuePerRecord(Map> bounds, boolean isMax) { - bounds.computeIfAbsent(key, 
k -> new AtomicReference<>(val)).updateAndGet(old -> getMinOrMax(old, val, isMax)); + Types.NestedField key, + Number val, + Map> bounds, + boolean isMax) { + bounds + .computeIfAbsent(key, k -> new AtomicReference<>(val)) + .updateAndGet(old -> getMinOrMax(old, val, isMax)); } private Number getMinOrMax(Number val1, Number val2, boolean isMax) { if (val1 instanceof Double) { - return isMax ? Double.max((Double) val1, (Double) val2) : Double.min((Double) val1, (Double) val2); + return isMax + ? Double.max((Double) val1, (Double) val2) + : Double.min((Double) val1, (Double) val2); } else { return isMax ? Float.max((Float) val1, (Float) val2) : Float.min((Float) val1, (Float) val2); } diff --git a/data/src/test/java/org/apache/iceberg/TestSplitScan.java b/data/src/test/java/org/apache/iceberg/TestSplitScan.java index 2f191f6160d7..34cacffdeac6 100644 --- a/data/src/test/java/org/apache/iceberg/TestSplitScan.java +++ b/data/src/test/java/org/apache/iceberg/TestSplitScan.java @@ -16,9 +16,10 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg; +import static org.apache.iceberg.types.Types.NestedField.required; + import java.io.File; import java.io.IOException; import java.util.List; @@ -40,8 +41,6 @@ import org.junit.runner.RunWith; import org.junit.runners.Parameterized; -import static org.apache.iceberg.types.Types.NestedField.required; - @RunWith(Parameterized.class) public class TestSplitScan { private static final Configuration CONF = new Configuration(); @@ -49,21 +48,19 @@ public class TestSplitScan { private static final long SPLIT_SIZE = 16 * 1024 * 1024; - private static final Schema SCHEMA = new Schema( - required(1, "id", Types.IntegerType.get()), - required(2, "data", Types.StringType.get()) - ); + private static final Schema SCHEMA = + new Schema( + required(1, "id", Types.IntegerType.get()), required(2, "data", Types.StringType.get())); private Table table; private File tableLocation; - @Rule - public TemporaryFolder temp = new TemporaryFolder(); + @Rule public TemporaryFolder temp = new TemporaryFolder(); private List expectedRecords; @Parameterized.Parameters(name = "format = {0}") public static Object[] parameters() { - return new Object[] { "parquet", "avro" }; + return new Object[] {"parquet", "avro"}; } private final FileFormat format; @@ -82,7 +79,8 @@ public void before() throws IOException { public void test() { Assert.assertEquals( "There should be 4 tasks created since file size is approximately close to 64MB and split size 16MB", - 4, Lists.newArrayList(table.newScan().planTasks()).size()); + 4, + Lists.newArrayList(table.newScan().planTasks()).size()); List records = Lists.newArrayList(IcebergGenerics.read(table).build()); Assert.assertEquals(expectedRecords.size(), records.size()); for (int i = 0; i < expectedRecords.size(); i++) { @@ -92,9 +90,7 @@ public void test() { private void setupTable() throws IOException { table = TABLES.create(SCHEMA, tableLocation.toString()); - table.updateProperties() - .set(TableProperties.SPLIT_SIZE, String.valueOf(SPLIT_SIZE)) - .commit(); + table.updateProperties().set(TableProperties.SPLIT_SIZE, String.valueOf(SPLIT_SIZE)).commit(); // With these number of records and the given SCHEMA // we can effectively write a file of approximate size 64 MB @@ -102,12 +98,13 @@ private void setupTable() throws IOException { expectedRecords = RandomGenericData.generate(SCHEMA, numRecords, 0L); File file = writeToFile(expectedRecords, format); - DataFile dataFile = 
DataFiles.builder(PartitionSpec.unpartitioned()) - .withRecordCount(expectedRecords.size()) - .withFileSizeInBytes(file.length()) - .withPath(file.toString()) - .withFormat(format) - .build(); + DataFile dataFile = + DataFiles.builder(PartitionSpec.unpartitioned()) + .withRecordCount(expectedRecords.size()) + .withFileSizeInBytes(file.length()) + .withPath(file.toString()) + .withFormat(format) + .build(); table.newAppend().appendFile(dataFile).commit(); } @@ -116,8 +113,9 @@ private File writeToFile(List records, FileFormat fileFormat) throws IOE File file = temp.newFile(); Assert.assertTrue(file.delete()); - GenericAppenderFactory factory = new GenericAppenderFactory(SCHEMA).set( - TableProperties.PARQUET_ROW_GROUP_SIZE_BYTES, String.valueOf(SPLIT_SIZE)); + GenericAppenderFactory factory = + new GenericAppenderFactory(SCHEMA) + .set(TableProperties.PARQUET_ROW_GROUP_SIZE_BYTES, String.valueOf(SPLIT_SIZE)); try (FileAppender appender = factory.newAppender(Files.localOutput(file), fileFormat)) { appender.addAll(records); } diff --git a/data/src/test/java/org/apache/iceberg/data/DataTest.java b/data/src/test/java/org/apache/iceberg/data/DataTest.java index 5566a75309d4..7e32da4c6edf 100644 --- a/data/src/test/java/org/apache/iceberg/data/DataTest.java +++ b/data/src/test/java/org/apache/iceberg/data/DataTest.java @@ -16,9 +16,11 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.data; +import static org.apache.iceberg.types.Types.NestedField.optional; +import static org.apache.iceberg.types.Types.NestedField.required; + import java.io.IOException; import java.util.concurrent.atomic.AtomicInteger; import org.apache.iceberg.Schema; @@ -32,35 +34,31 @@ import org.junit.Test; import org.junit.rules.TemporaryFolder; -import static org.apache.iceberg.types.Types.NestedField.optional; -import static org.apache.iceberg.types.Types.NestedField.required; - public abstract class DataTest { protected abstract void writeAndValidate(Schema schema) throws IOException; - private static final StructType SUPPORTED_PRIMITIVES = StructType.of( - required(100, "id", LongType.get()), - optional(101, "data", Types.StringType.get()), - required(102, "b", Types.BooleanType.get()), - optional(103, "i", Types.IntegerType.get()), - required(104, "l", LongType.get()), - optional(105, "f", Types.FloatType.get()), - required(106, "d", Types.DoubleType.get()), - optional(107, "date", Types.DateType.get()), - required(108, "ts_tz", Types.TimestampType.withZone()), - required(109, "ts", Types.TimestampType.withoutZone()), - required(110, "s", Types.StringType.get()), - required(112, "fixed", Types.FixedType.ofLength(7)), - optional(113, "bytes", Types.BinaryType.get()), - required(114, "dec_9_0", Types.DecimalType.of(9, 0)), - required(115, "dec_11_2", Types.DecimalType.of(11, 2)), - required(116, "dec_38_10", Types.DecimalType.of(38, 10)), // maximum precision - required(117, "time", Types.TimeType.get()) - ); - - @Rule - public TemporaryFolder temp = new TemporaryFolder(); + private static final StructType SUPPORTED_PRIMITIVES = + StructType.of( + required(100, "id", LongType.get()), + optional(101, "data", Types.StringType.get()), + required(102, "b", Types.BooleanType.get()), + optional(103, "i", Types.IntegerType.get()), + required(104, "l", LongType.get()), + optional(105, "f", Types.FloatType.get()), + required(106, "d", Types.DoubleType.get()), + optional(107, "date", Types.DateType.get()), + required(108, "ts_tz", Types.TimestampType.withZone()), + 
required(109, "ts", Types.TimestampType.withoutZone()), + required(110, "s", Types.StringType.get()), + required(112, "fixed", Types.FixedType.ofLength(7)), + optional(113, "bytes", Types.BinaryType.get()), + required(114, "dec_9_0", Types.DecimalType.of(9, 0)), + required(115, "dec_11_2", Types.DecimalType.of(11, 2)), + required(116, "dec_38_10", Types.DecimalType.of(38, 10)), // maximum precision + required(117, "time", Types.TimeType.get())); + + @Rule public TemporaryFolder temp = new TemporaryFolder(); @Test public void testSimpleStruct() throws IOException { @@ -69,102 +67,130 @@ public void testSimpleStruct() throws IOException { @Test public void testArray() throws IOException { - Schema schema = new Schema( - required(0, "id", LongType.get()), - optional(1, "data", ListType.ofOptional(2, Types.StringType.get()))); + Schema schema = + new Schema( + required(0, "id", LongType.get()), + optional(1, "data", ListType.ofOptional(2, Types.StringType.get()))); writeAndValidate(schema); } @Test public void testArrayOfStructs() throws IOException { - Schema schema = new Schema( - required(0, "id", LongType.get()), - optional(1, "data", ListType.ofOptional(2, SUPPORTED_PRIMITIVES))); + Schema schema = + new Schema( + required(0, "id", LongType.get()), + optional(1, "data", ListType.ofOptional(2, SUPPORTED_PRIMITIVES))); writeAndValidate(schema); } @Test public void testMap() throws IOException { - Schema schema = new Schema( - required(0, "id", LongType.get()), - optional(1, "data", MapType.ofOptional(2, 3, - Types.StringType.get(), - Types.StringType.get()))); + Schema schema = + new Schema( + required(0, "id", LongType.get()), + optional( + 1, + "data", + MapType.ofOptional(2, 3, Types.StringType.get(), Types.StringType.get()))); writeAndValidate(schema); } @Test public void testNumericMapKey() throws IOException { - Schema schema = new Schema( - required(0, "id", LongType.get()), - optional(1, "data", MapType.ofOptional(2, 3, - LongType.get(), - Types.StringType.get()))); + Schema schema = + new Schema( + required(0, "id", LongType.get()), + optional(1, "data", MapType.ofOptional(2, 3, LongType.get(), Types.StringType.get()))); writeAndValidate(schema); } @Test public void testComplexMapKey() throws IOException { - Schema schema = new Schema( - required(0, "id", LongType.get()), - optional(1, "data", MapType.ofOptional(2, 3, - StructType.of( - required(4, "i", Types.IntegerType.get()), - optional(5, "s", Types.StringType.get())), - Types.StringType.get()))); + Schema schema = + new Schema( + required(0, "id", LongType.get()), + optional( + 1, + "data", + MapType.ofOptional( + 2, + 3, + StructType.of( + required(4, "i", Types.IntegerType.get()), + optional(5, "s", Types.StringType.get())), + Types.StringType.get()))); writeAndValidate(schema); } @Test public void testMapOfStructs() throws IOException { - Schema schema = new Schema( - required(0, "id", LongType.get()), - optional(1, "data", MapType.ofOptional(2, 3, - Types.StringType.get(), - SUPPORTED_PRIMITIVES))); + Schema schema = + new Schema( + required(0, "id", LongType.get()), + optional( + 1, "data", MapType.ofOptional(2, 3, Types.StringType.get(), SUPPORTED_PRIMITIVES))); writeAndValidate(schema); } @Test public void testMixedTypes() throws IOException { - StructType structType = StructType.of( - required(0, "id", LongType.get()), - optional(1, "list_of_maps", - ListType.ofOptional(2, MapType.ofOptional(3, 4, - Types.StringType.get(), - SUPPORTED_PRIMITIVES))), - optional(5, "map_of_lists", - MapType.ofOptional(6, 7, - 
Types.StringType.get(), - ListType.ofOptional(8, SUPPORTED_PRIMITIVES))), - required(9, "list_of_lists", - ListType.ofOptional(10, ListType.ofOptional(11, SUPPORTED_PRIMITIVES))), - required(12, "map_of_maps", - MapType.ofOptional(13, 14, - Types.StringType.get(), - MapType.ofOptional(15, 16, + StructType structType = + StructType.of( + required(0, "id", LongType.get()), + optional( + 1, + "list_of_maps", + ListType.ofOptional( + 2, MapType.ofOptional(3, 4, Types.StringType.get(), SUPPORTED_PRIMITIVES))), + optional( + 5, + "map_of_lists", + MapType.ofOptional( + 6, 7, Types.StringType.get(), ListType.ofOptional(8, SUPPORTED_PRIMITIVES))), + required( + 9, + "list_of_lists", + ListType.ofOptional(10, ListType.ofOptional(11, SUPPORTED_PRIMITIVES))), + required( + 12, + "map_of_maps", + MapType.ofOptional( + 13, + 14, Types.StringType.get(), - SUPPORTED_PRIMITIVES))), - required(17, "list_of_struct_of_nested_types", ListType.ofOptional(19, StructType.of( - Types.NestedField.required(20, "m1", MapType.ofOptional(21, 22, - Types.StringType.get(), - SUPPORTED_PRIMITIVES)), - Types.NestedField.optional(23, "l1", ListType.ofRequired(24, SUPPORTED_PRIMITIVES)), - Types.NestedField.required(25, "l2", ListType.ofRequired(26, SUPPORTED_PRIMITIVES)), - Types.NestedField.optional(27, "m2", MapType.ofOptional(28, 29, - Types.StringType.get(), - SUPPORTED_PRIMITIVES)) - ))) - ); - - Schema schema = new Schema(TypeUtil.assignFreshIds(structType, new AtomicInteger(0)::incrementAndGet) - .asStructType().fields()); + MapType.ofOptional(15, 16, Types.StringType.get(), SUPPORTED_PRIMITIVES))), + required( + 17, + "list_of_struct_of_nested_types", + ListType.ofOptional( + 19, + StructType.of( + Types.NestedField.required( + 20, + "m1", + MapType.ofOptional( + 21, 22, Types.StringType.get(), SUPPORTED_PRIMITIVES)), + Types.NestedField.optional( + 23, "l1", ListType.ofRequired(24, SUPPORTED_PRIMITIVES)), + Types.NestedField.required( + 25, "l2", ListType.ofRequired(26, SUPPORTED_PRIMITIVES)), + Types.NestedField.optional( + 27, + "m2", + MapType.ofOptional( + 28, 29, Types.StringType.get(), SUPPORTED_PRIMITIVES)))))); + + Schema schema = + new Schema( + TypeUtil.assignFreshIds(structType, new AtomicInteger(0)::incrementAndGet) + .asStructType() + .fields()); writeAndValidate(schema); } diff --git a/data/src/test/java/org/apache/iceberg/data/DataTestHelpers.java b/data/src/test/java/org/apache/iceberg/data/DataTestHelpers.java index 949716ee9165..d4813ca7bc77 100644 --- a/data/src/test/java/org/apache/iceberg/data/DataTestHelpers.java +++ b/data/src/test/java/org/apache/iceberg/data/DataTestHelpers.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.data; import java.util.List; @@ -27,8 +26,7 @@ import org.junit.Assert; public class DataTestHelpers { - private DataTestHelpers() { - } + private DataTestHelpers() {} public static void assertEquals(Types.StructType struct, Record expected, Record actual) { List fields = struct.fields(); @@ -86,16 +84,21 @@ private static void assertEquals(Type type, Object expected, Object actual) { case UUID: case BINARY: case DECIMAL: - Assert.assertEquals("Primitive value should be equal to expected for type " + type, expected, actual); + Assert.assertEquals( + "Primitive value should be equal to expected for type " + type, expected, actual); break; case FIXED: - Assertions.assertThat(expected).as("Expected should be a byte[]").isInstanceOf(byte[].class); + Assertions.assertThat(expected) + .as("Expected should be a byte[]") + .isInstanceOf(byte[].class); Assertions.assertThat(expected).as("Actual should be a byte[]").isInstanceOf(byte[].class); - Assert.assertArrayEquals("Array contents should be equal", - (byte[]) expected, (byte[]) actual); + Assert.assertArrayEquals( + "Array contents should be equal", (byte[]) expected, (byte[]) actual); break; case STRUCT: - Assertions.assertThat(expected).as("Expected should be a Record").isInstanceOf(Record.class); + Assertions.assertThat(expected) + .as("Expected should be a Record") + .isInstanceOf(Record.class); Assertions.assertThat(actual).as("Actual should be a Record").isInstanceOf(Record.class); assertEquals(type.asStructType(), (Record) expected, (Record) actual); break; diff --git a/data/src/test/java/org/apache/iceberg/data/DeleteReadTests.java b/data/src/test/java/org/apache/iceberg/data/DeleteReadTests.java index 75e1657d54e6..659ac895ff8e 100644 --- a/data/src/test/java/org/apache/iceberg/data/DeleteReadTests.java +++ b/data/src/test/java/org/apache/iceberg/data/DeleteReadTests.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.data; import java.io.IOException; @@ -48,28 +47,25 @@ public abstract class DeleteReadTests { // Schema passed to create tables - public static final Schema SCHEMA = new Schema( - Types.NestedField.required(1, "id", Types.IntegerType.get()), - Types.NestedField.required(2, "data", Types.StringType.get()) - ); + public static final Schema SCHEMA = + new Schema( + Types.NestedField.required(1, "id", Types.IntegerType.get()), + Types.NestedField.required(2, "data", Types.StringType.get())); - public static final Schema DATE_SCHEMA = new Schema( - Types.NestedField.required(1, "dt", Types.DateType.get()), - Types.NestedField.required(2, "data", Types.StringType.get()), - Types.NestedField.required(3, "id", Types.IntegerType.get()) - ); + public static final Schema DATE_SCHEMA = + new Schema( + Types.NestedField.required(1, "dt", Types.DateType.get()), + Types.NestedField.required(2, "data", Types.StringType.get()), + Types.NestedField.required(3, "id", Types.IntegerType.get())); // Partition spec used to create tables - public static final PartitionSpec SPEC = PartitionSpec.builderFor(SCHEMA) - .bucket("data", 16) - .build(); + public static final PartitionSpec SPEC = + PartitionSpec.builderFor(SCHEMA).bucket("data", 16).build(); - public static final PartitionSpec DATE_SPEC = PartitionSpec.builderFor(DATE_SCHEMA) - .day("dt") - .build(); + public static final PartitionSpec DATE_SPEC = + PartitionSpec.builderFor(DATE_SCHEMA).day("dt").build(); - @Rule - public TemporaryFolder temp = new TemporaryFolder(); + @Rule public TemporaryFolder temp = new TemporaryFolder(); protected String tableName = null; protected String dateTableName = null; @@ -95,11 +91,10 @@ public void writeTestDataFile() throws IOException { records.add(record.copy("id", 121, "data", "f")); records.add(record.copy("id", 122, "data", "g")); - this.dataFile = FileHelpers.writeDataFile(table, Files.localOutput(temp.newFile()), Row.of(0), records); + this.dataFile = + FileHelpers.writeDataFile(table, Files.localOutput(temp.newFile()), Row.of(0), records); - table.newAppend() - .appendFile(dataFile) - .commit(); + table.newAppend().appendFile(dataFile).commit(); } @After @@ -115,30 +110,47 @@ private void initDateTable() throws IOException { GenericRecord record = GenericRecord.create(dateTable.schema()); - this.dateRecords = Lists.newArrayList( - record.copy("dt", LocalDate.parse("2021-09-01"), "data", "a", "id", 1), - record.copy("dt", LocalDate.parse("2021-09-02"), "data", "b", "id", 2), - record.copy("dt", LocalDate.parse("2021-09-03"), "data", "c", "id", 3), - record.copy("dt", LocalDate.parse("2021-09-04"), "data", "d", "id", 4), - record.copy("dt", LocalDate.parse("2021-09-05"), "data", "e", "id", 5)); - - DataFile dataFile1 = FileHelpers.writeDataFile( - dateTable, Files.localOutput(temp.newFile()), - Row.of(DateTimeUtil.daysFromDate(LocalDate.parse("2021-09-01"))), dateRecords.subList(0, 1)); - DataFile dataFile2 = FileHelpers.writeDataFile( - dateTable, Files.localOutput(temp.newFile()), - Row.of(DateTimeUtil.daysFromDate(LocalDate.parse("2021-09-02"))), dateRecords.subList(1, 2)); - DataFile dataFile3 = FileHelpers.writeDataFile( - dateTable, Files.localOutput(temp.newFile()), - Row.of(DateTimeUtil.daysFromDate(LocalDate.parse("2021-09-03"))), dateRecords.subList(2, 3)); - DataFile dataFile4 = FileHelpers.writeDataFile( - dateTable, Files.localOutput(temp.newFile()), - Row.of(DateTimeUtil.daysFromDate(LocalDate.parse("2021-09-04"))), dateRecords.subList(3, 4)); - DataFile dataFile5 = 
FileHelpers.writeDataFile( - dateTable, Files.localOutput(temp.newFile()), - Row.of(DateTimeUtil.daysFromDate(LocalDate.parse("2021-09-05"))), dateRecords.subList(4, 5)); - - dateTable.newAppend() + this.dateRecords = + Lists.newArrayList( + record.copy("dt", LocalDate.parse("2021-09-01"), "data", "a", "id", 1), + record.copy("dt", LocalDate.parse("2021-09-02"), "data", "b", "id", 2), + record.copy("dt", LocalDate.parse("2021-09-03"), "data", "c", "id", 3), + record.copy("dt", LocalDate.parse("2021-09-04"), "data", "d", "id", 4), + record.copy("dt", LocalDate.parse("2021-09-05"), "data", "e", "id", 5)); + + DataFile dataFile1 = + FileHelpers.writeDataFile( + dateTable, + Files.localOutput(temp.newFile()), + Row.of(DateTimeUtil.daysFromDate(LocalDate.parse("2021-09-01"))), + dateRecords.subList(0, 1)); + DataFile dataFile2 = + FileHelpers.writeDataFile( + dateTable, + Files.localOutput(temp.newFile()), + Row.of(DateTimeUtil.daysFromDate(LocalDate.parse("2021-09-02"))), + dateRecords.subList(1, 2)); + DataFile dataFile3 = + FileHelpers.writeDataFile( + dateTable, + Files.localOutput(temp.newFile()), + Row.of(DateTimeUtil.daysFromDate(LocalDate.parse("2021-09-03"))), + dateRecords.subList(2, 3)); + DataFile dataFile4 = + FileHelpers.writeDataFile( + dateTable, + Files.localOutput(temp.newFile()), + Row.of(DateTimeUtil.daysFromDate(LocalDate.parse("2021-09-04"))), + dateRecords.subList(3, 4)); + DataFile dataFile5 = + FileHelpers.writeDataFile( + dateTable, + Files.localOutput(temp.newFile()), + Row.of(DateTimeUtil.daysFromDate(LocalDate.parse("2021-09-05"))), + dateRecords.subList(4, 5)); + + dateTable + .newAppend() .appendFile(dataFile1) .appendFile(dataFile2) .appendFile(dataFile3) @@ -147,11 +159,13 @@ private void initDateTable() throws IOException { .commit(); } - protected abstract Table createTable(String name, Schema schema, PartitionSpec spec) throws IOException; + protected abstract Table createTable(String name, Schema schema, PartitionSpec spec) + throws IOException; protected abstract void dropTable(String name) throws IOException; - protected abstract StructLikeSet rowSet(String name, Table testTable, String... columns) throws IOException; + protected abstract StructLikeSet rowSet(String name, Table testTable, String... 
columns) + throws IOException; protected boolean expectPruned() { return true; @@ -161,18 +175,18 @@ protected boolean expectPruned() { public void testEqualityDeletes() throws IOException { Schema deleteRowSchema = table.schema().select("data"); Record dataDelete = GenericRecord.create(deleteRowSchema); - List dataDeletes = Lists.newArrayList( - dataDelete.copy("data", "a"), // id = 29 - dataDelete.copy("data", "d"), // id = 89 - dataDelete.copy("data", "g") // id = 122 - ); + List dataDeletes = + Lists.newArrayList( + dataDelete.copy("data", "a"), // id = 29 + dataDelete.copy("data", "d"), // id = 89 + dataDelete.copy("data", "g") // id = 122 + ); - DeleteFile eqDeletes = FileHelpers.writeDeleteFile( - table, Files.localOutput(temp.newFile()), Row.of(0), dataDeletes, deleteRowSchema); + DeleteFile eqDeletes = + FileHelpers.writeDeleteFile( + table, Files.localOutput(temp.newFile()), Row.of(0), dataDeletes, deleteRowSchema); - table.newRowDelta() - .addDeletes(eqDeletes) - .commit(); + table.newRowDelta().addDeletes(eqDeletes).commit(); StructLikeSet expected = rowSetWithoutIds(table, records, 29, 89, 122); StructLikeSet actual = rowSet(tableName, table, "*"); @@ -186,23 +200,36 @@ public void testEqualityDateDeletes() throws IOException { Schema deleteRowSchema = dateTable.schema().select("*"); Record dataDelete = GenericRecord.create(deleteRowSchema); - List dataDeletes = Lists.newArrayList( - dataDelete.copy("dt", LocalDate.parse("2021-09-01"), "data", "a", "id", 1), - dataDelete.copy("dt", LocalDate.parse("2021-09-02"), "data", "b", "id", 2), - dataDelete.copy("dt", LocalDate.parse("2021-09-03"), "data", "c", "id", 3) - ); - - DeleteFile eqDeletes1 = FileHelpers.writeDeleteFile( - dateTable, Files.localOutput(temp.newFile()), - Row.of(DateTimeUtil.daysFromDate(LocalDate.parse("2021-09-01"))), dataDeletes.subList(0, 1), deleteRowSchema); - DeleteFile eqDeletes2 = FileHelpers.writeDeleteFile( - dateTable, Files.localOutput(temp.newFile()), - Row.of(DateTimeUtil.daysFromDate(LocalDate.parse("2021-09-02"))), dataDeletes.subList(1, 2), deleteRowSchema); - DeleteFile eqDeletes3 = FileHelpers.writeDeleteFile( - dateTable, Files.localOutput(temp.newFile()), - Row.of(DateTimeUtil.daysFromDate(LocalDate.parse("2021-09-03"))), dataDeletes.subList(2, 3), deleteRowSchema); - - dateTable.newRowDelta() + List dataDeletes = + Lists.newArrayList( + dataDelete.copy("dt", LocalDate.parse("2021-09-01"), "data", "a", "id", 1), + dataDelete.copy("dt", LocalDate.parse("2021-09-02"), "data", "b", "id", 2), + dataDelete.copy("dt", LocalDate.parse("2021-09-03"), "data", "c", "id", 3)); + + DeleteFile eqDeletes1 = + FileHelpers.writeDeleteFile( + dateTable, + Files.localOutput(temp.newFile()), + Row.of(DateTimeUtil.daysFromDate(LocalDate.parse("2021-09-01"))), + dataDeletes.subList(0, 1), + deleteRowSchema); + DeleteFile eqDeletes2 = + FileHelpers.writeDeleteFile( + dateTable, + Files.localOutput(temp.newFile()), + Row.of(DateTimeUtil.daysFromDate(LocalDate.parse("2021-09-02"))), + dataDeletes.subList(1, 2), + deleteRowSchema); + DeleteFile eqDeletes3 = + FileHelpers.writeDeleteFile( + dateTable, + Files.localOutput(temp.newFile()), + Row.of(DateTimeUtil.daysFromDate(LocalDate.parse("2021-09-03"))), + dataDeletes.subList(2, 3), + deleteRowSchema); + + dateTable + .newRowDelta() .addDeletes(eqDeletes1) .addDeletes(eqDeletes2) .addDeletes(eqDeletes3) @@ -219,18 +246,18 @@ public void testEqualityDateDeletes() throws IOException { public void testEqualityDeletesWithRequiredEqColumn() throws IOException { Schema 
deleteRowSchema = table.schema().select("data"); Record dataDelete = GenericRecord.create(deleteRowSchema); - List dataDeletes = Lists.newArrayList( - dataDelete.copy("data", "a"), // id = 29 - dataDelete.copy("data", "d"), // id = 89 - dataDelete.copy("data", "g") // id = 122 - ); + List dataDeletes = + Lists.newArrayList( + dataDelete.copy("data", "a"), // id = 29 + dataDelete.copy("data", "d"), // id = 89 + dataDelete.copy("data", "g") // id = 122 + ); - DeleteFile eqDeletes = FileHelpers.writeDeleteFile( - table, Files.localOutput(temp.newFile()), Row.of(0), dataDeletes, deleteRowSchema); + DeleteFile eqDeletes = + FileHelpers.writeDeleteFile( + table, Files.localOutput(temp.newFile()), Row.of(0), dataDeletes, deleteRowSchema); - table.newRowDelta() - .addDeletes(eqDeletes) - .commit(); + table.newRowDelta().addDeletes(eqDeletes).commit(); StructLikeSet expected = selectColumns(rowSetWithoutIds(table, records, 29, 89, 122), "id"); StructLikeSet actual = rowSet(tableName, table, "id"); @@ -238,8 +265,10 @@ public void testEqualityDeletesWithRequiredEqColumn() throws IOException { if (expectPruned()) { Assert.assertEquals("Table should contain expected rows", expected, actual); } else { - // data is added by the reader to apply the eq deletes, use StructProjection to remove it from comparison - Assert.assertEquals("Table should contain expected rows", expected, selectColumns(actual, "id")); + // data is added by the reader to apply the eq deletes, use StructProjection to remove it from + // comparison + Assert.assertEquals( + "Table should contain expected rows", expected, selectColumns(actual, "id")); } } @@ -249,26 +278,25 @@ public void testEqualityDeletesSpanningMultipleDataFiles() throws IOException { GenericRecord record = GenericRecord.create(table.schema()); records.add(record.copy("id", 144, "data", "a")); - this.dataFile = FileHelpers.writeDataFile(table, Files.localOutput(temp.newFile()), Row.of(0), records); + this.dataFile = + FileHelpers.writeDataFile(table, Files.localOutput(temp.newFile()), Row.of(0), records); - table.newAppend() - .appendFile(dataFile) - .commit(); + table.newAppend().appendFile(dataFile).commit(); Schema deleteRowSchema = table.schema().select("data"); Record dataDelete = GenericRecord.create(deleteRowSchema); - List dataDeletes = Lists.newArrayList( - dataDelete.copy("data", "a"), // id = 29, 144 - dataDelete.copy("data", "d"), // id = 89 - dataDelete.copy("data", "g") // id = 122 - ); + List dataDeletes = + Lists.newArrayList( + dataDelete.copy("data", "a"), // id = 29, 144 + dataDelete.copy("data", "d"), // id = 89 + dataDelete.copy("data", "g") // id = 122 + ); - DeleteFile eqDeletes = FileHelpers.writeDeleteFile( - table, Files.localOutput(temp.newFile()), Row.of(0), dataDeletes, deleteRowSchema); + DeleteFile eqDeletes = + FileHelpers.writeDeleteFile( + table, Files.localOutput(temp.newFile()), Row.of(0), dataDeletes, deleteRowSchema); - table.newRowDelta() - .addDeletes(eqDeletes) - .commit(); + table.newRowDelta().addDeletes(eqDeletes).commit(); StructLikeSet expected = rowSetWithoutIds(table, records, 29, 89, 122, 144); StructLikeSet actual = rowSet(tableName, table, "*"); @@ -278,16 +306,18 @@ public void testEqualityDeletesSpanningMultipleDataFiles() throws IOException { @Test public void testPositionDeletes() throws IOException { - List> deletes = Lists.newArrayList( - Pair.of(dataFile.path(), 0L), // id = 29 - Pair.of(dataFile.path(), 3L), // id = 89 - Pair.of(dataFile.path(), 6L) // id = 122 - ); - - Pair posDeletes = 
FileHelpers.writeDeleteFile( - table, Files.localOutput(temp.newFile()), Row.of(0), deletes); - - table.newRowDelta() + List> deletes = + Lists.newArrayList( + Pair.of(dataFile.path(), 0L), // id = 29 + Pair.of(dataFile.path(), 3L), // id = 89 + Pair.of(dataFile.path(), 6L) // id = 122 + ); + + Pair posDeletes = + FileHelpers.writeDeleteFile(table, Files.localOutput(temp.newFile()), Row.of(0), deletes); + + table + .newRowDelta() .addDeletes(posDeletes.first()) .validateDataFilesExist(posDeletes.second()) .commit(); @@ -300,27 +330,31 @@ public void testPositionDeletes() throws IOException { @Test public void testMultiplePosDeleteFiles() throws IOException { - List> deletes = Lists.newArrayList( - Pair.of(dataFile.path(), 0L), // id = 29 - Pair.of(dataFile.path(), 3L) // id = 89 - ); + List> deletes = + Lists.newArrayList( + Pair.of(dataFile.path(), 0L), // id = 29 + Pair.of(dataFile.path(), 3L) // id = 89 + ); - Pair posDeletes = FileHelpers.writeDeleteFile( - table, Files.localOutput(temp.newFile()), Row.of(0), deletes); + Pair posDeletes = + FileHelpers.writeDeleteFile(table, Files.localOutput(temp.newFile()), Row.of(0), deletes); - table.newRowDelta() + table + .newRowDelta() .addDeletes(posDeletes.first()) .validateDataFilesExist(posDeletes.second()) .commit(); - deletes = Lists.newArrayList( - Pair.of(dataFile.path(), 6L) // id = 122 - ); + deletes = + Lists.newArrayList( + Pair.of(dataFile.path(), 6L) // id = 122 + ); - posDeletes = FileHelpers.writeDeleteFile( - table, Files.localOutput(temp.newFile()), Row.of(0), deletes); + posDeletes = + FileHelpers.writeDeleteFile(table, Files.localOutput(temp.newFile()), Row.of(0), deletes); - table.newRowDelta() + table + .newRowDelta() .addDeletes(posDeletes.first()) .validateDataFilesExist(posDeletes.second()) .commit(); @@ -335,24 +369,28 @@ public void testMultiplePosDeleteFiles() throws IOException { public void testMixedPositionAndEqualityDeletes() throws IOException { Schema dataSchema = table.schema().select("data"); Record dataDelete = GenericRecord.create(dataSchema); - List dataDeletes = Lists.newArrayList( - dataDelete.copy("data", "a"), // id = 29 - dataDelete.copy("data", "d"), // id = 89 - dataDelete.copy("data", "g") // id = 122 - ); - - DeleteFile eqDeletes = FileHelpers.writeDeleteFile( - table, Files.localOutput(temp.newFile()), Row.of(0), dataDeletes, dataSchema); - - List> deletes = Lists.newArrayList( - Pair.of(dataFile.path(), 3L), // id = 89 - Pair.of(dataFile.path(), 5L) // id = 121 - ); - - Pair posDeletes = FileHelpers.writeDeleteFile( - table, Files.localOutput(temp.newFile()), Row.of(0), deletes); - - table.newRowDelta() + List dataDeletes = + Lists.newArrayList( + dataDelete.copy("data", "a"), // id = 29 + dataDelete.copy("data", "d"), // id = 89 + dataDelete.copy("data", "g") // id = 122 + ); + + DeleteFile eqDeletes = + FileHelpers.writeDeleteFile( + table, Files.localOutput(temp.newFile()), Row.of(0), dataDeletes, dataSchema); + + List> deletes = + Lists.newArrayList( + Pair.of(dataFile.path(), 3L), // id = 89 + Pair.of(dataFile.path(), 5L) // id = 121 + ); + + Pair posDeletes = + FileHelpers.writeDeleteFile(table, Files.localOutput(temp.newFile()), Row.of(0), deletes); + + table + .newRowDelta() .addDeletes(eqDeletes) .addDeletes(posDeletes.first()) .validateDataFilesExist(posDeletes.second()) @@ -368,29 +406,30 @@ public void testMixedPositionAndEqualityDeletes() throws IOException { public void testMultipleEqualityDeleteSchemas() throws IOException { Schema dataSchema = table.schema().select("data"); Record 
dataDelete = GenericRecord.create(dataSchema); - List dataDeletes = Lists.newArrayList( - dataDelete.copy("data", "a"), // id = 29 - dataDelete.copy("data", "d"), // id = 89 - dataDelete.copy("data", "g") // id = 122 - ); + List dataDeletes = + Lists.newArrayList( + dataDelete.copy("data", "a"), // id = 29 + dataDelete.copy("data", "d"), // id = 89 + dataDelete.copy("data", "g") // id = 122 + ); - DeleteFile dataEqDeletes = FileHelpers.writeDeleteFile( - table, Files.localOutput(temp.newFile()), Row.of(0), dataDeletes, dataSchema); + DeleteFile dataEqDeletes = + FileHelpers.writeDeleteFile( + table, Files.localOutput(temp.newFile()), Row.of(0), dataDeletes, dataSchema); Schema idSchema = table.schema().select("id"); Record idDelete = GenericRecord.create(idSchema); - List idDeletes = Lists.newArrayList( - idDelete.copy("id", 121), // id = 121 - idDelete.copy("id", 29) // id = 29 - ); + List idDeletes = + Lists.newArrayList( + idDelete.copy("id", 121), // id = 121 + idDelete.copy("id", 29) // id = 29 + ); - DeleteFile idEqDeletes = FileHelpers.writeDeleteFile( - table, Files.localOutput(temp.newFile()), Row.of(0), idDeletes, idSchema); + DeleteFile idEqDeletes = + FileHelpers.writeDeleteFile( + table, Files.localOutput(temp.newFile()), Row.of(0), idDeletes, idSchema); - table.newRowDelta() - .addDeletes(dataEqDeletes) - .addDeletes(idEqDeletes) - .commit(); + table.newRowDelta().addDeletes(dataEqDeletes).addDeletes(idEqDeletes).commit(); StructLikeSet expected = rowSetWithoutIds(table, records, 29, 89, 121, 122); StructLikeSet actual = rowSet(tableName, table, "*"); @@ -401,33 +440,32 @@ public void testMultipleEqualityDeleteSchemas() throws IOException { @Test public void testEqualityDeleteByNull() throws IOException { // data is required in the test table; make it optional for this test - table.updateSchema() - .makeColumnOptional("data") - .commit(); + table.updateSchema().makeColumnOptional("data").commit(); // add a new data file with a record where data is null Record record = GenericRecord.create(table.schema()); - DataFile dataFileWithNull = FileHelpers.writeDataFile( - table, Files.localOutput(temp.newFile()), Row.of(0), - Lists.newArrayList(record.copy("id", 131, "data", null))); + DataFile dataFileWithNull = + FileHelpers.writeDataFile( + table, + Files.localOutput(temp.newFile()), + Row.of(0), + Lists.newArrayList(record.copy("id", 131, "data", null))); - table.newAppend() - .appendFile(dataFileWithNull) - .commit(); + table.newAppend().appendFile(dataFileWithNull).commit(); // delete where data is null Schema dataSchema = table.schema().select("data"); Record dataDelete = GenericRecord.create(dataSchema); - List dataDeletes = Lists.newArrayList( - dataDelete.copy("data", null) // id = 131 - ); + List dataDeletes = + Lists.newArrayList( + dataDelete.copy("data", null) // id = 131 + ); - DeleteFile eqDeletes = FileHelpers.writeDeleteFile( - table, Files.localOutput(temp.newFile()), Row.of(0), dataDeletes, dataSchema); + DeleteFile eqDeletes = + FileHelpers.writeDeleteFile( + table, Files.localOutput(temp.newFile()), Row.of(0), dataDeletes, dataSchema); - table.newRowDelta() - .addDeletes(eqDeletes) - .commit(); + table.newRowDelta().addDeletes(eqDeletes).commit(); StructLikeSet expected = rowSetWithoutIds(table, records, 131); StructLikeSet actual = rowSet(tableName, table, "*"); @@ -444,7 +482,8 @@ private StructLikeSet selectColumns(StructLikeSet rows, String... columns) { return set; } - protected static StructLikeSet rowSetWithoutIds(Table table, List recordList, int... 
idsToRemove) { + protected static StructLikeSet rowSetWithoutIds( + Table table, List recordList, int... idsToRemove) { Set deletedIds = Sets.newHashSet(ArrayUtil.toIntList(idsToRemove)); StructLikeSet set = StructLikeSet.create(table.schema().asStruct()); recordList.stream() @@ -457,9 +496,7 @@ protected static StructLikeSet rowSetWithoutIds(Table table, List record protected StructLikeSet rowSetWithIds(int... idsToRetain) { Set deletedIds = Sets.newHashSet(ArrayUtil.toIntList(idsToRetain)); StructLikeSet set = StructLikeSet.create(table.schema().asStruct()); - records.stream() - .filter(row -> deletedIds.contains(row.getField("id"))) - .forEach(set::add); + records.stream().filter(row -> deletedIds.contains(row.getField("id"))).forEach(set::add); return set; } } diff --git a/data/src/test/java/org/apache/iceberg/data/FileHelpers.java b/data/src/test/java/org/apache/iceberg/data/FileHelpers.java index dfcb75b04b01..463349c82785 100644 --- a/data/src/test/java/org/apache/iceberg/data/FileHelpers.java +++ b/data/src/test/java/org/apache/iceberg/data/FileHelpers.java @@ -16,9 +16,11 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.data; +import static org.apache.iceberg.TableProperties.DEFAULT_FILE_FORMAT; +import static org.apache.iceberg.TableProperties.DEFAULT_FILE_FORMAT_DEFAULT; + import java.io.Closeable; import java.io.IOException; import java.util.List; @@ -43,21 +45,16 @@ import org.apache.iceberg.util.CharSequenceSet; import org.apache.iceberg.util.Pair; -import static org.apache.iceberg.TableProperties.DEFAULT_FILE_FORMAT; -import static org.apache.iceberg.TableProperties.DEFAULT_FILE_FORMAT_DEFAULT; - public class FileHelpers { - private FileHelpers() { - } + private FileHelpers() {} - public static Pair writeDeleteFile(Table table, OutputFile out, - List> deletes) - throws IOException { + public static Pair writeDeleteFile( + Table table, OutputFile out, List> deletes) throws IOException { return writeDeleteFile(table, out, null, deletes); } - public static Pair writeDeleteFile(Table table, OutputFile out, StructLike partition, - List> deletes) + public static Pair writeDeleteFile( + Table table, OutputFile out, StructLike partition, List> deletes) throws IOException { FileFormat format = defaultFormat(table.properties()); FileAppenderFactory factory = new GenericAppenderFactory(table.schema(), table.spec()); @@ -72,20 +69,28 @@ public static Pair writeDeleteFile(Table table, Out return Pair.of(writer.toDeleteFile(), writer.referencedDataFiles()); } - public static DeleteFile writeDeleteFile(Table table, OutputFile out, List deletes, Schema deleteRowSchema) + public static DeleteFile writeDeleteFile( + Table table, OutputFile out, List deletes, Schema deleteRowSchema) throws IOException { return writeDeleteFile(table, out, null, deletes, deleteRowSchema); } - public static DeleteFile writeDeleteFile(Table table, OutputFile out, StructLike partition, - List deletes, Schema deleteRowSchema) + public static DeleteFile writeDeleteFile( + Table table, + OutputFile out, + StructLike partition, + List deletes, + Schema deleteRowSchema) throws IOException { FileFormat format = defaultFormat(table.properties()); - int[] equalityFieldIds = deleteRowSchema.columns().stream().mapToInt(Types.NestedField::fieldId).toArray(); - FileAppenderFactory factory = new GenericAppenderFactory(table.schema(), table.spec(), - equalityFieldIds, deleteRowSchema, null); - - EqualityDeleteWriter writer = factory.newEqDeleteWriter(encrypt(out), 
format, partition); + int[] equalityFieldIds = + deleteRowSchema.columns().stream().mapToInt(Types.NestedField::fieldId).toArray(); + FileAppenderFactory factory = + new GenericAppenderFactory( + table.schema(), table.spec(), equalityFieldIds, deleteRowSchema, null); + + EqualityDeleteWriter writer = + factory.newEqDeleteWriter(encrypt(out), format, partition); try (Closeable toClose = writer) { writer.deleteAll(deletes); } @@ -93,7 +98,8 @@ public static DeleteFile writeDeleteFile(Table table, OutputFile out, StructLike return writer.toDeleteFile(); } - public static DataFile writeDataFile(Table table, OutputFile out, List rows) throws IOException { + public static DataFile writeDataFile(Table table, OutputFile out, List rows) + throws IOException { FileFormat format = defaultFormat(table.properties()); GenericAppenderFactory factory = new GenericAppenderFactory(table.schema()); @@ -111,8 +117,8 @@ public static DataFile writeDataFile(Table table, OutputFile out, List r .build(); } - public static DataFile writeDataFile(Table table, OutputFile out, StructLike partition, List rows) - throws IOException { + public static DataFile writeDataFile( + Table table, OutputFile out, StructLike partition, List rows) throws IOException { FileFormat format = defaultFormat(table.properties()); GenericAppenderFactory factory = new GenericAppenderFactory(table.schema(), table.spec()); diff --git a/data/src/test/java/org/apache/iceberg/data/GenericAppenderHelper.java b/data/src/test/java/org/apache/iceberg/data/GenericAppenderHelper.java index 7d9d7add5dd0..96d0a96c72bf 100644 --- a/data/src/test/java/org/apache/iceberg/data/GenericAppenderHelper.java +++ b/data/src/test/java/org/apache/iceberg/data/GenericAppenderHelper.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.data; import java.io.File; @@ -35,9 +34,7 @@ import org.junit.Assert; import org.junit.rules.TemporaryFolder; -/** - * Helper for appending {@link DataFile} to a table or appending {@link Record}s to a table. - */ +/** Helper for appending {@link DataFile} to a table or appending {@link Record}s to a table. 
*/ public class GenericAppenderHelper { private static final String ORC_CONFIG_PREFIX = "^orc.*"; @@ -47,7 +44,8 @@ public class GenericAppenderHelper { private final TemporaryFolder tmp; private final Configuration conf; - public GenericAppenderHelper(Table table, FileFormat fileFormat, TemporaryFolder tmp, Configuration conf) { + public GenericAppenderHelper( + Table table, FileFormat fileFormat, TemporaryFolder tmp, Configuration conf) { this.table = table; this.fileFormat = fileFormat; this.tmp = tmp; @@ -85,8 +83,13 @@ public DataFile writeFile(StructLike partition, List records) throws IOE return appendToLocalFile(table, file, fileFormat, partition, records, conf); } - private static DataFile appendToLocalFile(Table table, File file, FileFormat format, StructLike partition, - List records, Configuration conf) + private static DataFile appendToLocalFile( + Table table, + File file, + FileFormat format, + StructLike partition, + List records, + Configuration conf) throws IOException { GenericAppenderFactory appenderFactory = new GenericAppenderFactory(table.schema()); @@ -95,8 +98,7 @@ private static DataFile appendToLocalFile(Table table, File file, FileFormat for appenderFactory.setAll(conf.getValByRegex(ORC_CONFIG_PREFIX)); } - FileAppender appender = appenderFactory.newAppender( - Files.localOutput(file), format); + FileAppender appender = appenderFactory.newAppender(Files.localOutput(file), format); try (FileAppender fileAppender = appender) { fileAppender.addAll(records); } diff --git a/data/src/test/java/org/apache/iceberg/data/RandomGenericData.java b/data/src/test/java/org/apache/iceberg/data/RandomGenericData.java index 978c511f64ea..06cdd4d30bce 100644 --- a/data/src/test/java/org/apache/iceberg/data/RandomGenericData.java +++ b/data/src/test/java/org/apache/iceberg/data/RandomGenericData.java @@ -16,9 +16,10 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.data; +import static java.time.temporal.ChronoUnit.MICROS; + import java.nio.ByteBuffer; import java.time.Instant; import java.time.LocalDate; @@ -42,44 +43,46 @@ import org.apache.iceberg.types.Types; import org.apache.iceberg.util.RandomUtil; -import static java.time.temporal.ChronoUnit.MICROS; - public class RandomGenericData { - private RandomGenericData() { - } + private RandomGenericData() {} public static List generate(Schema schema, int numRecords, long seed) { - return Lists.newArrayList(generateIcebergGenerics(schema, numRecords, () -> new RandomRecordGenerator(seed))); + return Lists.newArrayList( + generateIcebergGenerics(schema, numRecords, () -> new RandomRecordGenerator(seed))); } - public static Iterable generateFallbackRecords(Schema schema, int numRecords, long seed, long numDictRows) { - return generateIcebergGenerics(schema, numRecords, () -> new FallbackGenerator(seed, numDictRows)); + public static Iterable generateFallbackRecords( + Schema schema, int numRecords, long seed, long numDictRows) { + return generateIcebergGenerics( + schema, numRecords, () -> new FallbackGenerator(seed, numDictRows)); } - public static Iterable generateDictionaryEncodableRecords(Schema schema, int numRecords, long seed) { + public static Iterable generateDictionaryEncodableRecords( + Schema schema, int numRecords, long seed) { return generateIcebergGenerics(schema, numRecords, () -> new DictionaryEncodedGenerator(seed)); } - private static Iterable generateIcebergGenerics(Schema schema, int numRecords, - Supplier> supplier) { - return () -> new Iterator() { - private final RandomDataGenerator generator = supplier.get(); - private int count = 0; + private static Iterable generateIcebergGenerics( + Schema schema, int numRecords, Supplier> supplier) { + return () -> + new Iterator() { + private final RandomDataGenerator generator = supplier.get(); + private int count = 0; - @Override - public boolean hasNext() { - return count < numRecords; - } + @Override + public boolean hasNext() { + return count < numRecords; + } - @Override - public Record next() { - if (!hasNext()) { - throw new NoSuchElementException(); - } - ++count; - return (Record) TypeUtil.visit(schema, generator); - } - }; + @Override + public Record next() { + if (!hasNext()) { + throw new NoSuchElementException(); + } + ++count; + return (Record) TypeUtil.visit(schema, generator); + } + }; } private static class RandomRecordGenerator extends RandomDataGenerator { @@ -105,16 +108,19 @@ public Record struct(Types.StructType struct, Iterable fieldResults) { } } - private static class DictionaryEncodedGenerator extends RandomRecordGenerator { + private static class DictionaryEncodedGenerator extends RandomRecordGenerator { DictionaryEncodedGenerator(long seed) { super(seed); } @Override protected int getMaxEntries() { - // Here we limited the max entries in LIST or MAP to be 3, because we have the mechanism to duplicate - // the keys in RandomDataGenerator#map while the dictionary encoder will generate a string with - // limited values("0","1","2"). It's impossible for us to request the generator to generate more than 3 keys, + // Here we limited the max entries in LIST or MAP to be 3, because we have the mechanism to + // duplicate + // the keys in RandomDataGenerator#map while the dictionary encoder will generate a string + // with + // limited values("0","1","2"). 
It's impossible for us to request the generator to generate + // more than 3 keys, // otherwise we will get in a infinite loop in RandomDataGenerator#map. return 3; } @@ -145,7 +151,8 @@ protected Object randomValue(Type.PrimitiveType primitive, Random rand) { } } - public abstract static class RandomDataGenerator extends TypeUtil.CustomOrderSchemaVisitor { + public abstract static class RandomDataGenerator + extends TypeUtil.CustomOrderSchemaVisitor { private final Random random; private static final int MAX_ENTRIES = 20; diff --git a/data/src/test/java/org/apache/iceberg/data/TestDataFileIndexStatsFilters.java b/data/src/test/java/org/apache/iceberg/data/TestDataFileIndexStatsFilters.java index 9a3a04255dc7..d87c2eeb4c98 100644 --- a/data/src/test/java/org/apache/iceberg/data/TestDataFileIndexStatsFilters.java +++ b/data/src/test/java/org/apache/iceberg/data/TestDataFileIndexStatsFilters.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.data; import java.io.File; @@ -44,13 +43,13 @@ import org.junit.rules.TemporaryFolder; public class TestDataFileIndexStatsFilters { - private static final Schema SCHEMA = new Schema( - Types.NestedField.optional(1, "id", Types.IntegerType.get()), - Types.NestedField.optional(2, "data", Types.StringType.get()), - Types.NestedField.required(3, "category", Types.StringType.get())); + private static final Schema SCHEMA = + new Schema( + Types.NestedField.optional(1, "id", Types.IntegerType.get()), + Types.NestedField.optional(2, "data", Types.StringType.get()), + Types.NestedField.required(3, "category", Types.StringType.get())); - @Rule - public TemporaryFolder temp = new TemporaryFolder(); + @Rule public TemporaryFolder temp = new TemporaryFolder(); private Table table; private List records = null; @@ -76,10 +75,20 @@ public void createTableAndData() throws IOException { records.add(record.copy("id", 8, "data", null, "category", "even")); this.dataFile = FileHelpers.writeDataFile(table, Files.localOutput(temp.newFile()), records); - this.dataFileWithoutNulls = FileHelpers.writeDataFile(table, Files.localOutput(temp.newFile()), - records.stream().filter(rec -> rec.getField("data") != null).collect(Collectors.toList())); - this.dataFileOnlyNulls = FileHelpers.writeDataFile(table, Files.localOutput(temp.newFile()), - records.stream().filter(rec -> rec.getField("data") == null).collect(Collectors.toList())); + this.dataFileWithoutNulls = + FileHelpers.writeDataFile( + table, + Files.localOutput(temp.newFile()), + records.stream() + .filter(rec -> rec.getField("data") != null) + .collect(Collectors.toList())); + this.dataFileOnlyNulls = + FileHelpers.writeDataFile( + table, + Files.localOutput(temp.newFile()), + records.stream() + .filter(rec -> rec.getField("data") == null) + .collect(Collectors.toList())); } @After @@ -89,17 +98,16 @@ public void dropTable() { @Test public void testPositionDeletePlanningPath() throws IOException { - table.newAppend() - .appendFile(dataFile) - .commit(); + table.newAppend().appendFile(dataFile).commit(); List> deletes = Lists.newArrayList(); deletes.add(Pair.of(dataFile.path(), 0L)); deletes.add(Pair.of(dataFile.path(), 1L)); - Pair posDeletes = FileHelpers.writeDeleteFile( - table, Files.localOutput(temp.newFile()), deletes); - table.newRowDelta() + Pair posDeletes = + FileHelpers.writeDeleteFile(table, Files.localOutput(temp.newFile()), deletes); + table + .newRowDelta() .addDeletes(posDeletes.first()) 
.validateDataFilesExist(posDeletes.second()) .commit(); @@ -116,17 +124,16 @@ public void testPositionDeletePlanningPath() throws IOException { @Test public void testPositionDeletePlanningPathFilter() throws IOException { - table.newAppend() - .appendFile(dataFile) - .commit(); + table.newAppend().appendFile(dataFile).commit(); List> deletes = Lists.newArrayList(); deletes.add(Pair.of("some-other-file.parquet", 0L)); deletes.add(Pair.of("some-other-file.parquet", 1L)); - Pair posDeletes = FileHelpers.writeDeleteFile( - table, Files.localOutput(temp.newFile()), deletes); - table.newRowDelta() + Pair posDeletes = + FileHelpers.writeDeleteFile(table, Files.localOutput(temp.newFile()), deletes); + table + .newRowDelta() .addDeletes(posDeletes.first()) .validateDataFilesExist(posDeletes.second()) .commit(); @@ -138,26 +145,24 @@ public void testPositionDeletePlanningPathFilter() throws IOException { Assert.assertEquals("Should produce one task", 1, tasks.size()); FileScanTask task = tasks.get(0); - Assert.assertEquals("Should not have delete file, filtered by file_path stats", 0, task.deletes().size()); + Assert.assertEquals( + "Should not have delete file, filtered by file_path stats", 0, task.deletes().size()); } @Test public void testEqualityDeletePlanningStats() throws IOException { - table.newAppend() - .appendFile(dataFile) - .commit(); + table.newAppend().appendFile(dataFile).commit(); List deletes = Lists.newArrayList(); Schema deleteRowSchema = SCHEMA.select("data"); Record delete = GenericRecord.create(deleteRowSchema); deletes.add(delete.copy("data", "d")); - DeleteFile posDeletes = FileHelpers.writeDeleteFile( - table, Files.localOutput(temp.newFile()), deletes, deleteRowSchema); + DeleteFile posDeletes = + FileHelpers.writeDeleteFile( + table, Files.localOutput(temp.newFile()), deletes, deleteRowSchema); - table.newRowDelta() - .addDeletes(posDeletes) - .commit(); + table.newRowDelta().addDeletes(posDeletes).commit(); List tasks; try (CloseableIterable tasksIterable = table.newScan().planFiles()) { @@ -166,14 +171,13 @@ public void testEqualityDeletePlanningStats() throws IOException { Assert.assertEquals("Should produce one task", 1, tasks.size()); FileScanTask task = tasks.get(0); - Assert.assertEquals("Should have one delete file, data contains a matching value", 1, task.deletes().size()); + Assert.assertEquals( + "Should have one delete file, data contains a matching value", 1, task.deletes().size()); } @Test public void testEqualityDeletePlanningStatsFilter() throws IOException { - table.newAppend() - .appendFile(dataFile) - .commit(); + table.newAppend().appendFile(dataFile).commit(); List deletes = Lists.newArrayList(); Schema deleteRowSchema = table.schema().select("data"); @@ -182,12 +186,11 @@ public void testEqualityDeletePlanningStatsFilter() throws IOException { deletes.add(delete.copy("data", "y")); deletes.add(delete.copy("data", "z")); - DeleteFile posDeletes = FileHelpers.writeDeleteFile( - table, Files.localOutput(temp.newFile()), deletes, deleteRowSchema); + DeleteFile posDeletes = + FileHelpers.writeDeleteFile( + table, Files.localOutput(temp.newFile()), deletes, deleteRowSchema); - table.newRowDelta() - .addDeletes(posDeletes) - .commit(); + table.newRowDelta().addDeletes(posDeletes).commit(); List tasks; try (CloseableIterable tasksIterable = table.newScan().planFiles()) { @@ -196,26 +199,24 @@ public void testEqualityDeletePlanningStatsFilter() throws IOException { Assert.assertEquals("Should produce one task", 1, tasks.size()); FileScanTask task = 
tasks.get(0); - Assert.assertEquals("Should not have delete file, filtered by data column stats", 0, task.deletes().size()); + Assert.assertEquals( + "Should not have delete file, filtered by data column stats", 0, task.deletes().size()); } @Test public void testEqualityDeletePlanningStatsNullValueWithAllNullDeletes() throws IOException { - table.newAppend() - .appendFile(dataFile) - .commit(); + table.newAppend().appendFile(dataFile).commit(); List deletes = Lists.newArrayList(); Schema deleteRowSchema = SCHEMA.select("data"); Record delete = GenericRecord.create(deleteRowSchema); deletes.add(delete.copy("data", null)); - DeleteFile posDeletes = FileHelpers.writeDeleteFile( - table, Files.localOutput(temp.newFile()), deletes, deleteRowSchema); + DeleteFile posDeletes = + FileHelpers.writeDeleteFile( + table, Files.localOutput(temp.newFile()), deletes, deleteRowSchema); - table.newRowDelta() - .addDeletes(posDeletes) - .commit(); + table.newRowDelta().addDeletes(posDeletes).commit(); List tasks; try (CloseableIterable tasksIterable = table.newScan().planFiles()) { @@ -224,12 +225,14 @@ public void testEqualityDeletePlanningStatsNullValueWithAllNullDeletes() throws Assert.assertEquals("Should produce one task", 1, tasks.size()); FileScanTask task = tasks.get(0); - Assert.assertEquals("Should have delete file, data contains a null value", 1, task.deletes().size()); + Assert.assertEquals( + "Should have delete file, data contains a null value", 1, task.deletes().size()); } @Test public void testEqualityDeletePlanningStatsNoNullValuesWithAllNullDeletes() throws IOException { - table.newAppend() + table + .newAppend() .appendFile(dataFileWithoutNulls) // note that there are no nulls in the data column .commit(); @@ -238,12 +241,11 @@ public void testEqualityDeletePlanningStatsNoNullValuesWithAllNullDeletes() thro Record delete = GenericRecord.create(deleteRowSchema); deletes.add(delete.copy("data", null)); - DeleteFile posDeletes = FileHelpers.writeDeleteFile( - table, Files.localOutput(temp.newFile()), deletes, deleteRowSchema); + DeleteFile posDeletes = + FileHelpers.writeDeleteFile( + table, Files.localOutput(temp.newFile()), deletes, deleteRowSchema); - table.newRowDelta() - .addDeletes(posDeletes) - .commit(); + table.newRowDelta().addDeletes(posDeletes).commit(); List tasks; try (CloseableIterable tasksIterable = table.newScan().planFiles()) { @@ -252,12 +254,14 @@ public void testEqualityDeletePlanningStatsNoNullValuesWithAllNullDeletes() thro Assert.assertEquals("Should produce one task", 1, tasks.size()); FileScanTask task = tasks.get(0); - Assert.assertEquals("Should have no delete files, data contains no null values", 0, task.deletes().size()); + Assert.assertEquals( + "Should have no delete files, data contains no null values", 0, task.deletes().size()); } @Test public void testEqualityDeletePlanningStatsAllNullValuesWithNoNullDeletes() throws IOException { - table.newAppend() + table + .newAppend() .appendFile(dataFileOnlyNulls) // note that there are only nulls in the data column .commit(); @@ -266,12 +270,11 @@ public void testEqualityDeletePlanningStatsAllNullValuesWithNoNullDeletes() thro Record delete = GenericRecord.create(deleteRowSchema); deletes.add(delete.copy("data", "d")); - DeleteFile posDeletes = FileHelpers.writeDeleteFile( - table, Files.localOutput(temp.newFile()), deletes, deleteRowSchema); + DeleteFile posDeletes = + FileHelpers.writeDeleteFile( + table, Files.localOutput(temp.newFile()), deletes, deleteRowSchema); - table.newRowDelta() - .addDeletes(posDeletes) 
- .commit(); + table.newRowDelta().addDeletes(posDeletes).commit(); List tasks; try (CloseableIterable tasksIterable = table.newScan().planFiles()) { @@ -280,12 +283,15 @@ public void testEqualityDeletePlanningStatsAllNullValuesWithNoNullDeletes() thro Assert.assertEquals("Should produce one task", 1, tasks.size()); FileScanTask task = tasks.get(0); - Assert.assertEquals("Should have no delete files, data contains no null values", 0, task.deletes().size()); + Assert.assertEquals( + "Should have no delete files, data contains no null values", 0, task.deletes().size()); } @Test - public void testEqualityDeletePlanningStatsSomeNullValuesWithSomeNullDeletes() throws IOException { - table.newAppend() + public void testEqualityDeletePlanningStatsSomeNullValuesWithSomeNullDeletes() + throws IOException { + table + .newAppend() .appendFile(dataFile) // note that there are some nulls in the data column .commit(); @@ -296,12 +302,11 @@ public void testEqualityDeletePlanningStatsSomeNullValuesWithSomeNullDeletes() t deletes.add(delete.copy("data", null)); deletes.add(delete.copy("data", "x")); - DeleteFile posDeletes = FileHelpers.writeDeleteFile( - table, Files.localOutput(temp.newFile()), deletes, deleteRowSchema); + DeleteFile posDeletes = + FileHelpers.writeDeleteFile( + table, Files.localOutput(temp.newFile()), deletes, deleteRowSchema); - table.newRowDelta() - .addDeletes(posDeletes) - .commit(); + table.newRowDelta().addDeletes(posDeletes).commit(); List tasks; try (CloseableIterable tasksIterable = table.newScan().planFiles()) { @@ -310,6 +315,7 @@ public void testEqualityDeletePlanningStatsSomeNullValuesWithSomeNullDeletes() t Assert.assertEquals("Should produce one task", 1, tasks.size()); FileScanTask task = tasks.get(0); - Assert.assertEquals("Should have one delete file, data and deletes have null values", 1, task.deletes().size()); + Assert.assertEquals( + "Should have one delete file, data and deletes have null values", 1, task.deletes().size()); } } diff --git a/data/src/test/java/org/apache/iceberg/data/TestGenericReaderDeletes.java b/data/src/test/java/org/apache/iceberg/data/TestGenericReaderDeletes.java index e92c0daec385..7a17c142eea2 100644 --- a/data/src/test/java/org/apache/iceberg/data/TestGenericReaderDeletes.java +++ b/data/src/test/java/org/apache/iceberg/data/TestGenericReaderDeletes.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.data; import java.io.File; @@ -49,8 +48,10 @@ protected void dropTable(String name) { public StructLikeSet rowSet(String name, Table table, String... columns) throws IOException { StructLikeSet set = StructLikeSet.create(table.schema().asStruct()); try (CloseableIterable reader = IcebergGenerics.read(table).select(columns).build()) { - Iterables.addAll(set, CloseableIterable.transform( - reader, record -> new InternalRecordWrapper(table.schema().asStruct()).wrap(record))); + Iterables.addAll( + set, + CloseableIterable.transform( + reader, record -> new InternalRecordWrapper(table.schema().asStruct()).wrap(record))); } return set; } diff --git a/data/src/test/java/org/apache/iceberg/data/TestGenericRecord.java b/data/src/test/java/org/apache/iceberg/data/TestGenericRecord.java index 690dff29f782..01d31ae117ba 100644 --- a/data/src/test/java/org/apache/iceberg/data/TestGenericRecord.java +++ b/data/src/test/java/org/apache/iceberg/data/TestGenericRecord.java @@ -7,25 +7,25 @@ * "License"); you may not use this file except in compliance * with the License. 
You may obtain a copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. */ - package org.apache.iceberg.data; +import static org.apache.iceberg.types.Types.NestedField.optional; + import org.apache.iceberg.AssertHelpers; import org.apache.iceberg.Schema; import org.apache.iceberg.types.Types; import org.junit.Assert; import org.junit.Test; -import static org.apache.iceberg.types.Types.NestedField.optional; - public class TestGenericRecord { @Test @@ -54,7 +54,8 @@ public void testGetIncorrectClassInstance() { GenericRecord record = GenericRecord.create(schema); record.set(0, 10L); - AssertHelpers.assertThrows("Should fail on incorrect class instance", + AssertHelpers.assertThrows( + "Should fail on incorrect class instance", IllegalStateException.class, "Not an instance of java.lang.CharSequence: 10", () -> record.get(0, CharSequence.class)); diff --git a/data/src/test/java/org/apache/iceberg/data/TestLocalScan.java b/data/src/test/java/org/apache/iceberg/data/TestLocalScan.java index 63dba7c58c2e..6fbf2be2c809 100644 --- a/data/src/test/java/org/apache/iceberg/data/TestLocalScan.java +++ b/data/src/test/java/org/apache/iceberg/data/TestLocalScan.java @@ -16,9 +16,18 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.data; +import static org.apache.iceberg.expressions.Expressions.equal; +import static org.apache.iceberg.expressions.Expressions.lessThan; +import static org.apache.iceberg.expressions.Expressions.lessThanOrEqual; +import static org.apache.iceberg.hadoop.HadoopOutputFile.fromPath; +import static org.apache.iceberg.relocated.com.google.common.collect.Iterables.concat; +import static org.apache.iceberg.relocated.com.google.common.collect.Iterables.filter; +import static org.apache.iceberg.relocated.com.google.common.collect.Iterables.transform; +import static org.apache.iceberg.types.Types.NestedField.optional; +import static org.apache.iceberg.types.Types.NestedField.required; + import java.io.File; import java.io.IOException; import java.nio.ByteBuffer; @@ -59,31 +68,20 @@ import org.junit.runner.RunWith; import org.junit.runners.Parameterized; -import static org.apache.iceberg.expressions.Expressions.equal; -import static org.apache.iceberg.expressions.Expressions.lessThan; -import static org.apache.iceberg.expressions.Expressions.lessThanOrEqual; -import static org.apache.iceberg.hadoop.HadoopOutputFile.fromPath; -import static org.apache.iceberg.relocated.com.google.common.collect.Iterables.concat; -import static org.apache.iceberg.relocated.com.google.common.collect.Iterables.filter; -import static org.apache.iceberg.relocated.com.google.common.collect.Iterables.transform; -import static org.apache.iceberg.types.Types.NestedField.optional; -import static org.apache.iceberg.types.Types.NestedField.required; - @RunWith(Parameterized.class) public class TestLocalScan { - private static final Schema SCHEMA = new Schema( - required(1, "id", Types.LongType.get()), - optional(2, "data", Types.StringType.get())); + private static final Schema SCHEMA = + new Schema( + required(1, "id", Types.LongType.get()), optional(2, "data", Types.StringType.get())); private static final Configuration CONF = new Configuration(); private static final Tables TABLES = new HadoopTables(CONF); - @Rule - public final TemporaryFolder temp = new TemporaryFolder(); + @Rule public final TemporaryFolder temp = new TemporaryFolder(); @Parameterized.Parameters(name = "format = {0}") public static Object[] parameters() { - return new Object[] { "parquet", "orc", "avro" }; + return new Object[] {"parquet", "orc", "avro"}; } private final FileFormat format; @@ -97,71 +95,79 @@ public TestLocalScan(String format) { private final Record genericRecord = GenericRecord.create(SCHEMA); - private final List file1FirstSnapshotRecords = ImmutableList.of( - genericRecord.copy(ImmutableMap.of("id", 0L, "data", "clarification")), - genericRecord.copy(ImmutableMap.of("id", 1L, "data", "risky")), - genericRecord.copy(ImmutableMap.of("id", 2L, "data", "falafel")) - ); - private final List file2FirstSnapshotRecords = ImmutableList.of( - genericRecord.copy(ImmutableMap.of("id", 10L, "data", "clammy")), - genericRecord.copy(ImmutableMap.of("id", 11L, "data", "evacuate")), - genericRecord.copy(ImmutableMap.of("id", 12L, "data", "tissue")) - ); - private final List file3FirstSnapshotRecords = ImmutableList.of( - genericRecord.copy(ImmutableMap.of("id", 20L, "data", "ocean")), - genericRecord.copy(ImmutableMap.of("id", 21L, "data", "holistic")), - genericRecord.copy(ImmutableMap.of("id", 22L, "data", "preventative")) - ); - - private final List file1SecondSnapshotRecords = ImmutableList.of( - genericRecord.copy(ImmutableMap.of("id", 4L, "data", "obscure")), - genericRecord.copy(ImmutableMap.of("id", 5L, "data", 
"secure")), - genericRecord.copy(ImmutableMap.of("id", 6L, "data", "fetta")) - ); - private final List file2SecondSnapshotRecords = ImmutableList.of( - genericRecord.copy(ImmutableMap.of("id", 14L, "data", "radical")), - genericRecord.copy(ImmutableMap.of("id", 15L, "data", "collocation")), - genericRecord.copy(ImmutableMap.of("id", 16L, "data", "book")) - ); - private final List file3SecondSnapshotRecords = ImmutableList.of( - genericRecord.copy(ImmutableMap.of("id", 24L, "data", "cloud")), - genericRecord.copy(ImmutableMap.of("id", 25L, "data", "zen")), - genericRecord.copy(ImmutableMap.of("id", 26L, "data", "sky")) - ); - - private final List file1ThirdSnapshotRecords = ImmutableList.of( - genericRecord.copy(ImmutableMap.of("id", 6L, "data", "brainy")), - genericRecord.copy(ImmutableMap.of("id", 7L, "data", "film")), - genericRecord.copy(ImmutableMap.of("id", 8L, "data", "fetta")) - ); - private final List file2ThirdSnapshotRecords = ImmutableList.of( - genericRecord.copy(ImmutableMap.of("id", 16L, "data", "cake")), - genericRecord.copy(ImmutableMap.of("id", 17L, "data", "intrinsic")), - genericRecord.copy(ImmutableMap.of("id", 18L, "data", "paper")) - ); - private final List file3ThirdSnapshotRecords = ImmutableList.of( - genericRecord.copy(ImmutableMap.of("id", 26L, "data", "belleview")), - genericRecord.copy(ImmutableMap.of("id", 27L, "data", "overview")), - genericRecord.copy(ImmutableMap.of("id", 28L, "data", "tender")) - ); + private final List file1FirstSnapshotRecords = + ImmutableList.of( + genericRecord.copy(ImmutableMap.of("id", 0L, "data", "clarification")), + genericRecord.copy(ImmutableMap.of("id", 1L, "data", "risky")), + genericRecord.copy(ImmutableMap.of("id", 2L, "data", "falafel"))); + private final List file2FirstSnapshotRecords = + ImmutableList.of( + genericRecord.copy(ImmutableMap.of("id", 10L, "data", "clammy")), + genericRecord.copy(ImmutableMap.of("id", 11L, "data", "evacuate")), + genericRecord.copy(ImmutableMap.of("id", 12L, "data", "tissue"))); + private final List file3FirstSnapshotRecords = + ImmutableList.of( + genericRecord.copy(ImmutableMap.of("id", 20L, "data", "ocean")), + genericRecord.copy(ImmutableMap.of("id", 21L, "data", "holistic")), + genericRecord.copy(ImmutableMap.of("id", 22L, "data", "preventative"))); + + private final List file1SecondSnapshotRecords = + ImmutableList.of( + genericRecord.copy(ImmutableMap.of("id", 4L, "data", "obscure")), + genericRecord.copy(ImmutableMap.of("id", 5L, "data", "secure")), + genericRecord.copy(ImmutableMap.of("id", 6L, "data", "fetta"))); + private final List file2SecondSnapshotRecords = + ImmutableList.of( + genericRecord.copy(ImmutableMap.of("id", 14L, "data", "radical")), + genericRecord.copy(ImmutableMap.of("id", 15L, "data", "collocation")), + genericRecord.copy(ImmutableMap.of("id", 16L, "data", "book"))); + private final List file3SecondSnapshotRecords = + ImmutableList.of( + genericRecord.copy(ImmutableMap.of("id", 24L, "data", "cloud")), + genericRecord.copy(ImmutableMap.of("id", 25L, "data", "zen")), + genericRecord.copy(ImmutableMap.of("id", 26L, "data", "sky"))); + + private final List file1ThirdSnapshotRecords = + ImmutableList.of( + genericRecord.copy(ImmutableMap.of("id", 6L, "data", "brainy")), + genericRecord.copy(ImmutableMap.of("id", 7L, "data", "film")), + genericRecord.copy(ImmutableMap.of("id", 8L, "data", "fetta"))); + private final List file2ThirdSnapshotRecords = + ImmutableList.of( + genericRecord.copy(ImmutableMap.of("id", 16L, "data", "cake")), + 
genericRecord.copy(ImmutableMap.of("id", 17L, "data", "intrinsic")), + genericRecord.copy(ImmutableMap.of("id", 18L, "data", "paper"))); + private final List file3ThirdSnapshotRecords = + ImmutableList.of( + genericRecord.copy(ImmutableMap.of("id", 26L, "data", "belleview")), + genericRecord.copy(ImmutableMap.of("id", 27L, "data", "overview")), + genericRecord.copy(ImmutableMap.of("id", 28L, "data", "tender"))); private void overwriteExistingData() throws IOException { - DataFile file12 = writeFile(sharedTableLocation, format.addExtension("file-12"), file1SecondSnapshotRecords); - DataFile file22 = writeFile(sharedTableLocation, format.addExtension("file-22"), file2SecondSnapshotRecords); - DataFile file32 = writeFile(sharedTableLocation, format.addExtension("file-32"), file3SecondSnapshotRecords); - - sharedTable.newOverwrite() + DataFile file12 = + writeFile(sharedTableLocation, format.addExtension("file-12"), file1SecondSnapshotRecords); + DataFile file22 = + writeFile(sharedTableLocation, format.addExtension("file-22"), file2SecondSnapshotRecords); + DataFile file32 = + writeFile(sharedTableLocation, format.addExtension("file-32"), file3SecondSnapshotRecords); + + sharedTable + .newOverwrite() .overwriteByRowFilter(Expressions.alwaysTrue()) .addFile(file12) .addFile(file22) .addFile(file32) .commit(); - DataFile file13 = writeFile(sharedTableLocation, format.addExtension("file-13"), file1ThirdSnapshotRecords); - DataFile file23 = writeFile(sharedTableLocation, format.addExtension("file-23"), file2ThirdSnapshotRecords); - DataFile file33 = writeFile(sharedTableLocation, format.addExtension("file-33"), file3ThirdSnapshotRecords); + DataFile file13 = + writeFile(sharedTableLocation, format.addExtension("file-13"), file1ThirdSnapshotRecords); + DataFile file23 = + writeFile(sharedTableLocation, format.addExtension("file-23"), file2ThirdSnapshotRecords); + DataFile file33 = + writeFile(sharedTableLocation, format.addExtension("file-33"), file3ThirdSnapshotRecords); - sharedTable.newOverwrite() + sharedTable + .newOverwrite() .overwriteByRowFilter(Expressions.alwaysTrue()) .addFile(file13) .addFile(file23) @@ -170,25 +176,23 @@ private void overwriteExistingData() throws IOException { } private void appendData() throws IOException { - DataFile file12 = writeFile(sharedTableLocation, format.addExtension("file-12"), file1SecondSnapshotRecords); - DataFile file22 = writeFile(sharedTableLocation, format.addExtension("file-22"), file2SecondSnapshotRecords); - DataFile file32 = writeFile(sharedTableLocation, format.addExtension("file-32"), file3SecondSnapshotRecords); - - sharedTable.newFastAppend() - .appendFile(file12) - .appendFile(file22) - .appendFile(file32) - .commit(); - - DataFile file13 = writeFile(sharedTableLocation, format.addExtension("file-13"), file1ThirdSnapshotRecords); - DataFile file23 = writeFile(sharedTableLocation, format.addExtension("file-23"), file2ThirdSnapshotRecords); - DataFile file33 = writeFile(sharedTableLocation, format.addExtension("file-33"), file3ThirdSnapshotRecords); - - sharedTable.newFastAppend() - .appendFile(file13) - .appendFile(file23) - .appendFile(file33) - .commit(); + DataFile file12 = + writeFile(sharedTableLocation, format.addExtension("file-12"), file1SecondSnapshotRecords); + DataFile file22 = + writeFile(sharedTableLocation, format.addExtension("file-22"), file2SecondSnapshotRecords); + DataFile file32 = + writeFile(sharedTableLocation, format.addExtension("file-32"), file3SecondSnapshotRecords); + + 
sharedTable.newFastAppend().appendFile(file12).appendFile(file22).appendFile(file32).commit(); + + DataFile file13 = + writeFile(sharedTableLocation, format.addExtension("file-13"), file1ThirdSnapshotRecords); + DataFile file23 = + writeFile(sharedTableLocation, format.addExtension("file-23"), file2ThirdSnapshotRecords); + DataFile file33 = + writeFile(sharedTableLocation, format.addExtension("file-33"), file3ThirdSnapshotRecords); + + sharedTable.newFastAppend().appendFile(file13).appendFile(file23).appendFile(file33).commit(); } @Before @@ -196,28 +200,29 @@ public void createTables() throws IOException { File location = temp.newFolder("shared"); Assert.assertTrue(location.delete()); this.sharedTableLocation = location.toString(); - this.sharedTable = TABLES.create( - SCHEMA, PartitionSpec.unpartitioned(), - ImmutableMap.of(TableProperties.DEFAULT_FILE_FORMAT, format.name()), - sharedTableLocation); + this.sharedTable = + TABLES.create( + SCHEMA, + PartitionSpec.unpartitioned(), + ImmutableMap.of(TableProperties.DEFAULT_FILE_FORMAT, format.name()), + sharedTableLocation); Record record = GenericRecord.create(SCHEMA); - DataFile file1 = writeFile(sharedTableLocation, format.addExtension("file-1"), file1FirstSnapshotRecords); + DataFile file1 = + writeFile(sharedTableLocation, format.addExtension("file-1"), file1FirstSnapshotRecords); Record nullData = record.copy(); nullData.setField("id", 11L); nullData.setField("data", null); - DataFile file2 = writeFile(sharedTableLocation, format.addExtension("file-2"), file2FirstSnapshotRecords); - DataFile file3 = writeFile(sharedTableLocation, format.addExtension("file-3"), file3FirstSnapshotRecords); + DataFile file2 = + writeFile(sharedTableLocation, format.addExtension("file-2"), file2FirstSnapshotRecords); + DataFile file3 = + writeFile(sharedTableLocation, format.addExtension("file-3"), file3FirstSnapshotRecords); // commit the test data - sharedTable.newAppend() - .appendFile(file1) - .appendFile(file2) - .appendFile(file3) - .commit(); + sharedTable.newAppend().appendFile(file1).appendFile(file2).appendFile(file3).commit(); } @Test @@ -226,9 +231,12 @@ public void testRandomData() throws IOException { File location = temp.newFolder(format.name()); Assert.assertTrue(location.delete()); - Table table = TABLES.create(SCHEMA, PartitionSpec.unpartitioned(), - ImmutableMap.of(TableProperties.DEFAULT_FILE_FORMAT, format.name()), - location.toString()); + Table table = + TABLES.create( + SCHEMA, + PartitionSpec.unpartitioned(), + ImmutableMap.of(TableProperties.DEFAULT_FILE_FORMAT, format.name()), + location.toString()); AppendFiles append = table.newAppend(); @@ -245,10 +253,11 @@ public void testRandomData() throws IOException { } writeFile(location.toString(), format.addExtension("file-" + fileNum), records); - DataFile file = DataFiles.builder(PartitionSpec.unpartitioned()) - .withRecordCount(numRecords) - .withInputFile(HadoopInputFile.fromPath(path, CONF)) - .build(); + DataFile file = + DataFiles.builder(PartitionSpec.unpartitioned()) + .withRecordCount(numRecords) + .withInputFile(HadoopInputFile.fromPath(path, CONF)) + .build(); append.appendFile(file); fileNum += 1; @@ -257,10 +266,9 @@ public void testRandomData() throws IOException { append.commit(); Set records = Sets.newHashSet(IcebergGenerics.read(table).build()); - Assert.assertEquals("Should produce correct number of records", - expected.size(), records.size()); - Assert.assertEquals("Random record set should match", - Sets.newHashSet(expected), records); + Assert.assertEquals( + 
"Should produce correct number of records", expected.size(), records.size()); + Assert.assertEquals("Random record set should match", Sets.newHashSet(expected), records); } @Test @@ -273,22 +281,24 @@ public void testFullScan() { expected.addAll(file3FirstSnapshotRecords); Set records = Sets.newHashSet(results); - Assert.assertEquals("Should produce correct number of records", - expected.size(), records.size()); - Assert.assertEquals("Random record set should match", - Sets.newHashSet(expected), records); + Assert.assertEquals( + "Should produce correct number of records", expected.size(), records.size()); + Assert.assertEquals("Random record set should match", Sets.newHashSet(expected), records); } @Test public void testFilter() { Iterable result = IcebergGenerics.read(sharedTable).where(lessThan("id", 3)).build(); - Assert.assertEquals("Records should match file 1", - Sets.newHashSet(file1FirstSnapshotRecords), Sets.newHashSet(result)); + Assert.assertEquals( + "Records should match file 1", + Sets.newHashSet(file1FirstSnapshotRecords), + Sets.newHashSet(result)); result = IcebergGenerics.read(sharedTable).where(lessThanOrEqual("id", 1)).build(); - Assert.assertEquals("Records should match file 1 without id 2", + Assert.assertEquals( + "Records should match file 1 without id 2", Sets.newHashSet(filter(file1FirstSnapshotRecords, r -> (Long) r.getField("id") <= 1)), Sets.newHashSet(result)); } @@ -300,15 +310,20 @@ public void testProject() { private void verifyProjectIdColumn(Iterable results) { Set expected = Sets.newHashSet(); - expected.addAll(Lists.transform(file1FirstSnapshotRecords, record -> (Long) record.getField("id"))); - expected.addAll(Lists.transform(file2FirstSnapshotRecords, record -> (Long) record.getField("id"))); - expected.addAll(Lists.transform(file3FirstSnapshotRecords, record -> (Long) record.getField("id"))); - - results.forEach(record -> - Assert.assertEquals("Record should have one projected field", 1, record.size())); - - Assert.assertEquals("Should project only id columns", - expected, Sets.newHashSet(transform(results, record -> (Long) record.getField("id")))); + expected.addAll( + Lists.transform(file1FirstSnapshotRecords, record -> (Long) record.getField("id"))); + expected.addAll( + Lists.transform(file2FirstSnapshotRecords, record -> (Long) record.getField("id"))); + expected.addAll( + Lists.transform(file3FirstSnapshotRecords, record -> (Long) record.getField("id"))); + + results.forEach( + record -> Assert.assertEquals("Record should have one projected field", 1, record.size())); + + Assert.assertEquals( + "Should project only id columns", + expected, + Sets.newHashSet(transform(results, record -> (Long) record.getField("id")))); } @Test @@ -330,25 +345,30 @@ public void testProjectWithSchema() { // Test with unknown field schema = new Schema(optional(999, "unknown", Types.LongType.get())); - IcebergGenerics.read(sharedTable).project(schema).build().forEach(r -> Assert.assertNull(r.get(0))); + IcebergGenerics.read(sharedTable) + .project(schema) + .build() + .forEach(r -> Assert.assertNull(r.get(0))); // Test with reading some metadata columns - schema = new Schema( - required(1, "id", Types.LongType.get()), - MetadataColumns.metadataColumn(sharedTable, MetadataColumns.PARTITION_COLUMN_NAME), - optional(2, "data", Types.StringType.get()), - MetadataColumns.SPEC_ID, - MetadataColumns.ROW_POSITION - ); + schema = + new Schema( + required(1, "id", Types.LongType.get()), + MetadataColumns.metadataColumn(sharedTable, MetadataColumns.PARTITION_COLUMN_NAME), + 
optional(2, "data", Types.StringType.get()), + MetadataColumns.SPEC_ID, + MetadataColumns.ROW_POSITION); Iterator iterator = - IcebergGenerics.read(sharedTable).project(schema).where(equal("data", "falafel")).build().iterator(); - - GenericRecord expectedRecord = GenericRecord.create(schema).copy( - ImmutableMap.of("id", 2L, - "data", "falafel", - "_spec_id", 0, - "_pos", 2L)); + IcebergGenerics.read(sharedTable) + .project(schema) + .where(equal("data", "falafel")) + .build() + .iterator(); + + GenericRecord expectedRecord = + GenericRecord.create(schema) + .copy(ImmutableMap.of("id", 2L, "data", "falafel", "_spec_id", 0, "_pos", 2L)); expectedRecord.setField("_partition", null); Assert.assertEquals(expectedRecord, iterator.next()); Assert.assertFalse(iterator.hasNext()); @@ -356,23 +376,27 @@ public void testProjectWithSchema() { @Test public void testProjectWithMissingFilterColumn() { - Iterable results = IcebergGenerics.read(sharedTable) - .where(Expressions.greaterThanOrEqual("id", 1)) - .where(Expressions.lessThan("id", 21)) - .select("data").build(); + Iterable results = + IcebergGenerics.read(sharedTable) + .where(Expressions.greaterThanOrEqual("id", 1)) + .where(Expressions.lessThan("id", 21)) + .select("data") + .build(); Set expected = Sets.newHashSet(); - for (Record record : concat(file1FirstSnapshotRecords, file2FirstSnapshotRecords, file3FirstSnapshotRecords)) { + for (Record record : + concat(file1FirstSnapshotRecords, file2FirstSnapshotRecords, file3FirstSnapshotRecords)) { Long id = (Long) record.getField("id"); if (id >= 1 && id < 21) { expected.add(record.getField("data").toString()); } } - results.forEach(record -> - Assert.assertEquals("Record should have two projected fields", 2, record.size())); + results.forEach( + record -> Assert.assertEquals("Record should have two projected fields", 2, record.size())); - Assert.assertEquals("Should project correct rows", + Assert.assertEquals( + "Should project correct rows", expected, Sets.newHashSet(transform(results, record -> record.getField("data").toString()))); } @@ -380,9 +404,10 @@ public void testProjectWithMissingFilterColumn() { @Test public void testUseSnapshot() throws IOException { overwriteExistingData(); - Iterable results = IcebergGenerics.read(sharedTable) - .useSnapshot(/* first snapshot */ sharedTable.history().get(1).snapshotId()) - .build(); + Iterable results = + IcebergGenerics.read(sharedTable) + .useSnapshot(/* first snapshot */ sharedTable.history().get(1).snapshotId()) + .build(); Set expected = Sets.newHashSet(); expected.addAll(file1SecondSnapshotRecords); @@ -390,10 +415,9 @@ public void testUseSnapshot() throws IOException { expected.addAll(file3SecondSnapshotRecords); Set records = Sets.newHashSet(results); - Assert.assertEquals("Should produce correct number of records", - expected.size(), records.size()); - Assert.assertEquals("Record set should match", - Sets.newHashSet(expected), records); + Assert.assertEquals( + "Should produce correct number of records", expected.size(), records.size()); + Assert.assertEquals("Record set should match", Sets.newHashSet(expected), records); Assert.assertNotNull(Iterables.get(records, 0).getField("id")); Assert.assertNotNull(Iterables.get(records, 0).getField("data")); } @@ -401,9 +425,10 @@ public void testUseSnapshot() throws IOException { @Test public void testAsOfTime() throws IOException { overwriteExistingData(); - Iterable results = IcebergGenerics.read(sharedTable) - .asOfTime(/* timestamp first snapshot */ 
sharedTable.history().get(2).timestampMillis()) - .build(); + Iterable results = + IcebergGenerics.read(sharedTable) + .asOfTime(/* timestamp first snapshot */ sharedTable.history().get(2).timestampMillis()) + .build(); Set expected = Sets.newHashSet(); expected.addAll(file1ThirdSnapshotRecords); @@ -411,10 +436,9 @@ public void testAsOfTime() throws IOException { expected.addAll(file3ThirdSnapshotRecords); Set records = Sets.newHashSet(results); - Assert.assertEquals("Should produce correct number of records", - expected.size(), records.size()); - Assert.assertEquals("Record set should match", - Sets.newHashSet(expected), records); + Assert.assertEquals( + "Should produce correct number of records", expected.size(), records.size()); + Assert.assertEquals("Record set should match", Sets.newHashSet(expected), records); Assert.assertNotNull(Iterables.get(records, 0).getField("id")); Assert.assertNotNull(Iterables.get(records, 0).getField("data")); } @@ -422,9 +446,12 @@ public void testAsOfTime() throws IOException { @Test public void testAppendsBetween() throws IOException { appendData(); - Iterable results = IcebergGenerics.read(sharedTable) - .appendsBetween(sharedTable.history().get(1).snapshotId(), sharedTable.currentSnapshot().snapshotId()) - .build(); + Iterable results = + IcebergGenerics.read(sharedTable) + .appendsBetween( + sharedTable.history().get(1).snapshotId(), + sharedTable.currentSnapshot().snapshotId()) + .build(); Set expected = Sets.newHashSet(); expected.addAll(file1ThirdSnapshotRecords); @@ -432,10 +459,9 @@ public void testAppendsBetween() throws IOException { expected.addAll(file3ThirdSnapshotRecords); Set records = Sets.newHashSet(results); - Assert.assertEquals("Should produce correct number of records", - expected.size(), records.size()); - Assert.assertEquals("Record set should match", - Sets.newHashSet(expected), records); + Assert.assertEquals( + "Should produce correct number of records", expected.size(), records.size()); + Assert.assertEquals("Record set should match", Sets.newHashSet(expected), records); Assert.assertNotNull(Iterables.get(records, 0).getField("id")); Assert.assertNotNull(Iterables.get(records, 0).getField("data")); } @@ -443,9 +469,10 @@ public void testAppendsBetween() throws IOException { @Test public void testAppendsAfter() throws IOException { appendData(); - Iterable results = IcebergGenerics.read(sharedTable) - .appendsAfter(sharedTable.history().get(0).snapshotId()) - .build(); + Iterable results = + IcebergGenerics.read(sharedTable) + .appendsAfter(sharedTable.history().get(0).snapshotId()) + .build(); Set expected = Sets.newHashSet(); expected.addAll(file1SecondSnapshotRecords); @@ -456,21 +483,22 @@ public void testAppendsAfter() throws IOException { expected.addAll(file3ThirdSnapshotRecords); Set records = Sets.newHashSet(results); - Assert.assertEquals("Should produce correct number of records", - expected.size(), records.size()); - Assert.assertEquals("Record set should match", - Sets.newHashSet(expected), records); + Assert.assertEquals( + "Should produce correct number of records", expected.size(), records.size()); + Assert.assertEquals("Record set should match", Sets.newHashSet(expected), records); Assert.assertNotNull(Iterables.get(records, 0).getField("id")); Assert.assertNotNull(Iterables.get(records, 0).getField("data")); } @Test public void testUnknownSnapshotId() { - Long minSnapshotId = sharedTable.history().stream().map(h -> h.snapshotId()).min(Long::compareTo).get(); + Long minSnapshotId = + 
sharedTable.history().stream().map(h -> h.snapshotId()).min(Long::compareTo).get(); IcebergGenerics.ScanBuilder scanBuilder = IcebergGenerics.read(sharedTable); - AssertHelpers.assertThrows("Should fail on unknown snapshot id", + AssertHelpers.assertThrows( + "Should fail on unknown snapshot id", IllegalArgumentException.class, "Cannot find snapshot with ID ", () -> scanBuilder.useSnapshot(/* unknown snapshot id */ minSnapshotId - 1)); @@ -480,23 +508,29 @@ public void testUnknownSnapshotId() { public void testAsOfTimeOlderThanFirstSnapshot() { IcebergGenerics.ScanBuilder scanBuilder = IcebergGenerics.read(sharedTable); - AssertHelpers.assertThrows("Should fail on timestamp sooner than first write", + AssertHelpers.assertThrows( + "Should fail on timestamp sooner than first write", IllegalArgumentException.class, "Cannot find a snapshot older than ", - () -> scanBuilder.asOfTime(/* older than first snapshot */ sharedTable.history().get(0).timestampMillis() - 1)); + () -> + scanBuilder.asOfTime( + /* older than first snapshot */ sharedTable.history().get(0).timestampMillis() + - 1)); } - private DataFile writeFile(String location, String filename, List records) throws IOException { + private DataFile writeFile(String location, String filename, List records) + throws IOException { return writeFile(location, filename, SCHEMA, records); } - private DataFile writeFile(String location, String filename, Schema schema, List records) throws IOException { + private DataFile writeFile(String location, String filename, Schema schema, List records) + throws IOException { Path path = new Path(location, filename); FileFormat fileFormat = FileFormat.fromFileName(filename); Preconditions.checkNotNull(fileFormat, "Cannot determine format for file: %s", filename); - FileAppender fileAppender = new GenericAppenderFactory(schema).newAppender( - fromPath(path, CONF), fileFormat); + FileAppender fileAppender = + new GenericAppenderFactory(schema).newAppender(fromPath(path, CONF), fileFormat); try (FileAppender appender = fileAppender) { appender.addAll(records); } @@ -509,38 +543,45 @@ private DataFile writeFile(String location, String filename, Schema schema, List @Test public void testFilterWithDateAndTimestamp() throws IOException { - // TODO: Add multiple timestamp tests - there's an issue with ORC caching TZ in ThreadLocal, so it's not possible + // TODO: Add multiple timestamp tests - there's an issue with ORC caching TZ in ThreadLocal, so + // it's not possible // to change TZ and test with ORC as they will produce incompatible values. 
- Schema schema = new Schema( - required(1, "timestamp_with_zone", Types.TimestampType.withZone()), - required(2, "timestamp_without_zone", Types.TimestampType.withoutZone()), - required(3, "date", Types.DateType.get()), - required(4, "time", Types.TimeType.get()) - ); + Schema schema = + new Schema( + required(1, "timestamp_with_zone", Types.TimestampType.withZone()), + required(2, "timestamp_without_zone", Types.TimestampType.withoutZone()), + required(3, "date", Types.DateType.get()), + required(4, "time", Types.TimeType.get())); File tableLocation = temp.newFolder("complex_filter_table"); Assert.assertTrue(tableLocation.delete()); - Table table = TABLES.create( - schema, PartitionSpec.unpartitioned(), - ImmutableMap.of(TableProperties.DEFAULT_FILE_FORMAT, format.name()), - tableLocation.getAbsolutePath()); + Table table = + TABLES.create( + schema, + PartitionSpec.unpartitioned(), + ImmutableMap.of(TableProperties.DEFAULT_FILE_FORMAT, format.name()), + tableLocation.getAbsolutePath()); List expected = RandomGenericData.generate(schema, 100, 435691832918L); - DataFile file = writeFile(tableLocation.toString(), format.addExtension("record-file"), schema, expected); + DataFile file = + writeFile(tableLocation.toString(), format.addExtension("record-file"), schema, expected); table.newFastAppend().appendFile(file).commit(); for (Record r : expected) { - Iterable filterResult = IcebergGenerics.read(table) - .where(equal("timestamp_with_zone", r.getField("timestamp_with_zone").toString())) - .where(equal("timestamp_without_zone", r.getField("timestamp_without_zone").toString())) - .where(equal("date", r.getField("date").toString())) - .where(equal("time", r.getField("time").toString())) - .build(); + Iterable filterResult = + IcebergGenerics.read(table) + .where(equal("timestamp_with_zone", r.getField("timestamp_with_zone").toString())) + .where( + equal("timestamp_without_zone", r.getField("timestamp_without_zone").toString())) + .where(equal("date", r.getField("date").toString())) + .where(equal("time", r.getField("time").toString())) + .build(); Assert.assertTrue(filterResult.iterator().hasNext()); Record readRecord = filterResult.iterator().next(); - Assert.assertEquals(r.getField("timestamp_with_zone"), readRecord.getField("timestamp_with_zone")); + Assert.assertEquals( + r.getField("timestamp_with_zone"), readRecord.getField("timestamp_with_zone")); } } diff --git a/data/src/test/java/org/apache/iceberg/data/TestMetricsRowGroupFilter.java b/data/src/test/java/org/apache/iceberg/data/TestMetricsRowGroupFilter.java index 9c065c26bd58..abaa6389eb89 100644 --- a/data/src/test/java/org/apache/iceberg/data/TestMetricsRowGroupFilter.java +++ b/data/src/test/java/org/apache/iceberg/data/TestMetricsRowGroupFilter.java @@ -16,9 +16,29 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.data; +import static org.apache.iceberg.avro.AvroSchemaUtil.convert; +import static org.apache.iceberg.expressions.Expressions.and; +import static org.apache.iceberg.expressions.Expressions.equal; +import static org.apache.iceberg.expressions.Expressions.greaterThan; +import static org.apache.iceberg.expressions.Expressions.greaterThanOrEqual; +import static org.apache.iceberg.expressions.Expressions.in; +import static org.apache.iceberg.expressions.Expressions.isNaN; +import static org.apache.iceberg.expressions.Expressions.isNull; +import static org.apache.iceberg.expressions.Expressions.lessThan; +import static org.apache.iceberg.expressions.Expressions.lessThanOrEqual; +import static org.apache.iceberg.expressions.Expressions.not; +import static org.apache.iceberg.expressions.Expressions.notEqual; +import static org.apache.iceberg.expressions.Expressions.notIn; +import static org.apache.iceberg.expressions.Expressions.notNaN; +import static org.apache.iceberg.expressions.Expressions.notNull; +import static org.apache.iceberg.expressions.Expressions.notStartsWith; +import static org.apache.iceberg.expressions.Expressions.or; +import static org.apache.iceberg.expressions.Expressions.startsWith; +import static org.apache.iceberg.types.Types.NestedField.optional; +import static org.apache.iceberg.types.Types.NestedField.required; + import java.io.File; import java.io.IOException; import java.io.UncheckedIOException; @@ -67,33 +87,12 @@ import org.junit.runner.RunWith; import org.junit.runners.Parameterized; -import static org.apache.iceberg.avro.AvroSchemaUtil.convert; -import static org.apache.iceberg.expressions.Expressions.and; -import static org.apache.iceberg.expressions.Expressions.equal; -import static org.apache.iceberg.expressions.Expressions.greaterThan; -import static org.apache.iceberg.expressions.Expressions.greaterThanOrEqual; -import static org.apache.iceberg.expressions.Expressions.in; -import static org.apache.iceberg.expressions.Expressions.isNaN; -import static org.apache.iceberg.expressions.Expressions.isNull; -import static org.apache.iceberg.expressions.Expressions.lessThan; -import static org.apache.iceberg.expressions.Expressions.lessThanOrEqual; -import static org.apache.iceberg.expressions.Expressions.not; -import static org.apache.iceberg.expressions.Expressions.notEqual; -import static org.apache.iceberg.expressions.Expressions.notIn; -import static org.apache.iceberg.expressions.Expressions.notNaN; -import static org.apache.iceberg.expressions.Expressions.notNull; -import static org.apache.iceberg.expressions.Expressions.notStartsWith; -import static org.apache.iceberg.expressions.Expressions.or; -import static org.apache.iceberg.expressions.Expressions.startsWith; -import static org.apache.iceberg.types.Types.NestedField.optional; -import static org.apache.iceberg.types.Types.NestedField.required; - @RunWith(Parameterized.class) public class TestMetricsRowGroupFilter { @Parameterized.Parameters(name = "format = {0}") public static Object[] parameters() { - return new Object[] { "parquet", "orc" }; + return new Object[] {"parquet", "orc"}; } private final FileFormat format; @@ -105,39 +104,41 @@ public TestMetricsRowGroupFilter(String format) { private static final Types.StructType structFieldType = Types.StructType.of(Types.NestedField.required(8, "int_field", IntegerType.get())); - private static final Schema SCHEMA = new Schema( - required(1, "id", IntegerType.get()), - optional(2, "no_stats_parquet", StringType.get()), - 
required(3, "required", StringType.get()), - optional(4, "all_nulls", DoubleType.get()), - optional(5, "some_nulls", StringType.get()), - optional(6, "no_nulls", StringType.get()), - optional(7, "struct_not_null", structFieldType), - optional(9, "not_in_file", FloatType.get()), - optional(10, "str", StringType.get()), - optional(11, "map_not_null", - Types.MapType.ofRequired(12, 13, StringType.get(), IntegerType.get())), - optional(14, "all_nans", DoubleType.get()), - optional(15, "some_nans", FloatType.get()), - optional(16, "no_nans", DoubleType.get()) - ); + private static final Schema SCHEMA = + new Schema( + required(1, "id", IntegerType.get()), + optional(2, "no_stats_parquet", StringType.get()), + required(3, "required", StringType.get()), + optional(4, "all_nulls", DoubleType.get()), + optional(5, "some_nulls", StringType.get()), + optional(6, "no_nulls", StringType.get()), + optional(7, "struct_not_null", structFieldType), + optional(9, "not_in_file", FloatType.get()), + optional(10, "str", StringType.get()), + optional( + 11, + "map_not_null", + Types.MapType.ofRequired(12, 13, StringType.get(), IntegerType.get())), + optional(14, "all_nans", DoubleType.get()), + optional(15, "some_nans", FloatType.get()), + optional(16, "no_nans", DoubleType.get())); private static final Types.StructType _structFieldType = Types.StructType.of(Types.NestedField.required(8, "_int_field", IntegerType.get())); - private static final Schema FILE_SCHEMA = new Schema( - required(1, "_id", IntegerType.get()), - optional(2, "_no_stats_parquet", StringType.get()), - required(3, "_required", StringType.get()), - optional(4, "_all_nulls", DoubleType.get()), - optional(5, "_some_nulls", StringType.get()), - optional(6, "_no_nulls", StringType.get()), - optional(7, "_struct_not_null", _structFieldType), - optional(10, "_str", StringType.get()), - optional(14, "_all_nans", Types.DoubleType.get()), - optional(15, "_some_nans", FloatType.get()), - optional(16, "_no_nans", Types.DoubleType.get()) - ); + private static final Schema FILE_SCHEMA = + new Schema( + required(1, "_id", IntegerType.get()), + optional(2, "_no_stats_parquet", StringType.get()), + required(3, "_required", StringType.get()), + optional(4, "_all_nulls", DoubleType.get()), + optional(5, "_some_nulls", StringType.get()), + optional(6, "_no_nulls", StringType.get()), + optional(7, "_struct_not_null", _structFieldType), + optional(10, "_str", StringType.get()), + optional(14, "_all_nans", Types.DoubleType.get()), + optional(15, "_some_nans", FloatType.get()), + optional(16, "_no_nans", Types.DoubleType.get())); private static final String TOO_LONG_FOR_STATS_PARQUET; @@ -156,8 +157,7 @@ public TestMetricsRowGroupFilter(String format) { private MessageType parquetSchema = null; private BlockMetaData rowGroupMetadata = null; - @Rule - public TemporaryFolder temp = new TemporaryFolder(); + @Rule public TemporaryFolder temp = new TemporaryFolder(); @Before public void createInputFile() throws IOException { @@ -169,7 +169,8 @@ public void createInputFile() throws IOException { createParquetInputFile(); break; default: - throw new UnsupportedOperationException("Row group filter tests not supported for " + format); + throw new UnsupportedOperationException( + "Row group filter tests not supported for " + format); } } @@ -178,16 +179,19 @@ public void createOrcInputFile() throws IOException { Assert.assertTrue(orcFile.delete()); OutputFile outFile = Files.localOutput(orcFile); - try (FileAppender appender = ORC.write(outFile) - .schema(FILE_SCHEMA) - 
.createWriterFunc(GenericOrcWriter::buildWriter) - .build()) { + try (FileAppender appender = + ORC.write(outFile) + .schema(FILE_SCHEMA) + .createWriterFunc(GenericOrcWriter::buildWriter) + .build()) { GenericRecord record = GenericRecord.create(FILE_SCHEMA); // create 50 records for (int i = 0; i < INT_MAX_VALUE - INT_MIN_VALUE + 1; i += 1) { record.setField("_id", INT_MIN_VALUE + i); // min=30, max=79, num-nulls=0 - record.setField("_no_stats_parquet", TOO_LONG_FOR_STATS_PARQUET); // value longer than 4k will produce no stats - // in Parquet, but will produce stats for ORC + record.setField( + "_no_stats_parquet", + TOO_LONG_FOR_STATS_PARQUET); // value longer than 4k will produce no stats + // in Parquet, but will produce stats for ORC record.setField("_required", "req"); // required, always non-null record.setField("_all_nulls", null); // never non-null record.setField("_some_nulls", (i % 10 == 0) ? null : "some"); // includes some null values @@ -206,8 +210,9 @@ public void createOrcInputFile() throws IOException { } InputFile inFile = Files.localInput(orcFile); - try (Reader reader = OrcFile.createReader(new Path(inFile.location()), - OrcFile.readerOptions(new Configuration()))) { + try (Reader reader = + OrcFile.createReader( + new Path(inFile.location()), OrcFile.readerOptions(new Configuration()))) { Assert.assertEquals("Should create only one stripe", 1, reader.getStripes().size()); } @@ -222,15 +227,15 @@ private void createParquetInputFile() throws IOException { org.apache.avro.Schema structSchema = AvroSchemaUtil.convert(_structFieldType); OutputFile outFile = Files.localOutput(parquetFile); - try (FileAppender appender = Parquet.write(outFile) - .schema(FILE_SCHEMA) - .build()) { + try (FileAppender appender = Parquet.write(outFile).schema(FILE_SCHEMA).build()) { GenericRecordBuilder builder = new GenericRecordBuilder(convert(FILE_SCHEMA, "table")); // create 50 records for (int i = 0; i < INT_MAX_VALUE - INT_MIN_VALUE + 1; i += 1) { builder.set("_id", INT_MIN_VALUE + i); // min=30, max=79, num-nulls=0 - builder.set("_no_stats_parquet", TOO_LONG_FOR_STATS_PARQUET); // value longer than 4k will produce no stats - // in Parquet + builder.set( + "_no_stats_parquet", + TOO_LONG_FOR_STATS_PARQUET); // value longer than 4k will produce no stats + // in Parquet builder.set("_required", "req"); // required, always non-null builder.set("_all_nulls", null); // never non-null builder.set("_some_nulls", (i % 10 == 0) ? 
null : "some"); // includes some null values @@ -307,13 +312,15 @@ public void testIsNaN() { shouldRead = shouldRead(isNaN("no_nans")); switch (format) { case ORC: - Assert.assertFalse("Should read 0 rows due to the ORC filter push-down feature", shouldRead); + Assert.assertFalse( + "Should read 0 rows due to the ORC filter push-down feature", shouldRead); break; case PARQUET: Assert.assertTrue("Should read: NaN counts are not tracked in Parquet metrics", shouldRead); break; default: - throw new UnsupportedOperationException("Row group filter tests not supported for " + format); + throw new UnsupportedOperationException( + "Row group filter tests not supported for " + format); } shouldRead = shouldRead(isNaN("all_nulls")); @@ -333,7 +340,6 @@ public void testNotNaN() { shouldRead = shouldRead(notNaN("all_nulls")); Assert.assertTrue("Should read: NaN counts are not tracked in Parquet metrics", shouldRead); - } @Test @@ -345,14 +351,14 @@ public void testRequiredColumn() { Assert.assertFalse("Should skip: required columns are always non-null", shouldRead); } - @Rule - public ExpectedException exceptionRule = ExpectedException.none(); + @Rule public ExpectedException exceptionRule = ExpectedException.none(); @Test public void testMissingColumn() { exceptionRule.expect(ValidationException.class); exceptionRule.expectMessage("Cannot find field 'missing'"); - exceptionRule.reportMissingExceptionWithMessage("Should complain about missing column in expression"); + exceptionRule.reportMissingExceptionWithMessage( + "Should complain about missing column in expression"); shouldRead(lessThan("missing", 5)); } @@ -361,20 +367,19 @@ public void testColumnNotInFile() { Assume.assumeFalse( "If a column is not in file, ORC does NOT try to apply predicates assuming null values for the column", format == FileFormat.ORC); - Expression[] cannotMatch = new Expression[] { - lessThan("not_in_file", 1.0f), lessThanOrEqual("not_in_file", 1.0f), - equal("not_in_file", 1.0f), greaterThan("not_in_file", 1.0f), - greaterThanOrEqual("not_in_file", 1.0f), notNull("not_in_file") - }; + Expression[] cannotMatch = + new Expression[] { + lessThan("not_in_file", 1.0f), lessThanOrEqual("not_in_file", 1.0f), + equal("not_in_file", 1.0f), greaterThan("not_in_file", 1.0f), + greaterThanOrEqual("not_in_file", 1.0f), notNull("not_in_file") + }; for (Expression expr : cannotMatch) { boolean shouldRead = shouldRead(expr); Assert.assertFalse("Should skip when column is not in file (all nulls): " + expr, shouldRead); } - Expression[] canMatch = new Expression[] { - isNull("not_in_file"), notEqual("not_in_file", 1.0f) - }; + Expression[] canMatch = new Expression[] {isNull("not_in_file"), notEqual("not_in_file", 1.0f)}; for (Expression expr : canMatch) { boolean shouldRead = shouldRead(expr); @@ -385,12 +390,19 @@ public void testColumnNotInFile() { @Test public void testMissingStatsParquet() { Assume.assumeTrue(format == FileFormat.PARQUET); - Expression[] exprs = new Expression[] { - lessThan("no_stats_parquet", "a"), lessThanOrEqual("no_stats_parquet", "b"), equal("no_stats_parquet", "c"), - greaterThan("no_stats_parquet", "d"), greaterThanOrEqual("no_stats_parquet", "e"), - notEqual("no_stats_parquet", "f"), isNull("no_stats_parquet"), notNull("no_stats_parquet"), - startsWith("no_stats_parquet", "a"), notStartsWith("no_stats_parquet", "a") - }; + Expression[] exprs = + new Expression[] { + lessThan("no_stats_parquet", "a"), + lessThanOrEqual("no_stats_parquet", "b"), + equal("no_stats_parquet", "c"), + 
greaterThan("no_stats_parquet", "d"), + greaterThanOrEqual("no_stats_parquet", "e"), + notEqual("no_stats_parquet", "f"), + isNull("no_stats_parquet"), + notNull("no_stats_parquet"), + startsWith("no_stats_parquet", "a"), + notStartsWith("no_stats_parquet", "a") + }; for (Expression expr : exprs) { boolean shouldRead = shouldRead(expr); @@ -404,11 +416,17 @@ public void testZeroRecordFileParquet() { BlockMetaData emptyBlock = new BlockMetaData(); emptyBlock.setRowCount(0); - Expression[] exprs = new Expression[] { - lessThan("id", 5), lessThanOrEqual("id", 30), equal("id", 70), greaterThan("id", 78), - greaterThanOrEqual("id", 90), notEqual("id", 101), isNull("some_nulls"), - notNull("some_nulls") - }; + Expression[] exprs = + new Expression[] { + lessThan("id", 5), + lessThanOrEqual("id", 30), + equal("id", 70), + greaterThan("id", 78), + greaterThanOrEqual("id", 90), + notEqual("id", 101), + isNull("some_nulls"), + notNull("some_nulls") + }; for (Expression expr : exprs) { boolean shouldRead = shouldReadParquet(expr, true, parquetSchema, emptyBlock); @@ -429,25 +447,33 @@ public void testNot() { @Test public void testAnd() { // this test case must use a real predicate, not alwaysTrue(), or binding will simplify it out - boolean shouldRead = shouldRead(and(lessThan("id", INT_MIN_VALUE - 25), - greaterThanOrEqual("id", INT_MIN_VALUE - 30))); + boolean shouldRead = + shouldRead( + and(lessThan("id", INT_MIN_VALUE - 25), greaterThanOrEqual("id", INT_MIN_VALUE - 30))); Assert.assertFalse("Should skip: and(false, true)", shouldRead); - shouldRead = shouldRead(and(lessThan("id", INT_MIN_VALUE - 25), greaterThanOrEqual("id", INT_MAX_VALUE + 1))); + shouldRead = + shouldRead( + and(lessThan("id", INT_MIN_VALUE - 25), greaterThanOrEqual("id", INT_MAX_VALUE + 1))); Assert.assertFalse("Should skip: and(false, false)", shouldRead); - shouldRead = shouldRead(and(greaterThan("id", INT_MIN_VALUE - 25), lessThanOrEqual("id", INT_MIN_VALUE))); + shouldRead = + shouldRead( + and(greaterThan("id", INT_MIN_VALUE - 25), lessThanOrEqual("id", INT_MIN_VALUE))); Assert.assertTrue("Should read: and(true, true)", shouldRead); } @Test public void testOr() { // this test case must use a real predicate, not alwaysTrue(), or binding will simplify it out - boolean shouldRead = shouldRead(or(lessThan("id", INT_MIN_VALUE - 25), - greaterThanOrEqual("id", INT_MAX_VALUE + 1))); + boolean shouldRead = + shouldRead( + or(lessThan("id", INT_MIN_VALUE - 25), greaterThanOrEqual("id", INT_MAX_VALUE + 1))); Assert.assertFalse("Should skip: or(false, false)", shouldRead); - shouldRead = shouldRead(or(lessThan("id", INT_MIN_VALUE - 25), greaterThanOrEqual("id", INT_MAX_VALUE - 19))); + shouldRead = + shouldRead( + or(lessThan("id", INT_MIN_VALUE - 25), greaterThanOrEqual("id", INT_MAX_VALUE - 19))); Assert.assertTrue("Should read: or(false, true)", shouldRead); } @@ -600,7 +626,8 @@ public void testStructFieldLt() { @Test public void testStructFieldLtEq() { - boolean shouldRead = shouldRead(lessThanOrEqual("struct_not_null.int_field", INT_MIN_VALUE - 25)); + boolean shouldRead = + shouldRead(lessThanOrEqual("struct_not_null.int_field", INT_MIN_VALUE - 25)); Assert.assertFalse("Should not read: id range below lower bound (5 < 30)", shouldRead); shouldRead = shouldRead(lessThanOrEqual("struct_not_null.int_field", INT_MIN_VALUE - 1)); @@ -630,7 +657,8 @@ public void testStructFieldGt() { @Test public void testStructFieldGtEq() { - boolean shouldRead = shouldRead(greaterThanOrEqual("struct_not_null.int_field", INT_MAX_VALUE + 6)); + 
boolean shouldRead = + shouldRead(greaterThanOrEqual("struct_not_null.int_field", INT_MAX_VALUE + 6)); Assert.assertFalse("Should not read: id range above upper bound (85 < 79)", shouldRead); shouldRead = shouldRead(greaterThanOrEqual("struct_not_null.int_field", INT_MAX_VALUE + 1)); @@ -699,7 +727,8 @@ public void testCaseInsensitive() { @Test public void testStringStartsWith() { - Assume.assumeFalse("ORC row group filter does not support StringStartsWith", format == FileFormat.ORC); + Assume.assumeFalse( + "ORC row group filter does not support StringStartsWith", format == FileFormat.ORC); boolean shouldRead = shouldRead(startsWith("str", "1")); Assert.assertTrue("Should read: range matches", shouldRead); @@ -730,7 +759,8 @@ public void testStringStartsWith() { @Test public void testStringNotStartsWith() { - Assume.assumeFalse("ORC row group filter does not support StringStartsWith", format == FileFormat.ORC); + Assume.assumeFalse( + "ORC row group filter does not support StringStartsWith", format == FileFormat.ORC); boolean shouldRead = shouldRead(notStartsWith("str", "1")); Assert.assertTrue("Should read: range matches", shouldRead); @@ -774,7 +804,8 @@ public void testIntegerIn() { Assert.assertTrue("Should read: id equal to lower bound (30 == 30)", shouldRead); shouldRead = shouldRead(in("id", INT_MAX_VALUE - 4, INT_MAX_VALUE - 3)); - Assert.assertTrue("Should read: id between lower and upper bounds (30 < 75 < 79, 30 < 76 < 79)", shouldRead); + Assert.assertTrue( + "Should read: id between lower and upper bounds (30 < 75 < 79, 30 < 76 < 79)", shouldRead); shouldRead = shouldRead(in("id", INT_MAX_VALUE, INT_MAX_VALUE + 1)); Assert.assertTrue("Should read: id equal to upper bound (79 == 79)", shouldRead); @@ -807,7 +838,8 @@ public void testIntegerNotIn() { Assert.assertTrue("Should read: id equal to lower bound (30 == 30)", shouldRead); shouldRead = shouldRead(notIn("id", INT_MAX_VALUE - 4, INT_MAX_VALUE - 3)); - Assert.assertTrue("Should read: id between lower and upper bounds (30 < 75 < 79, 30 < 76 < 79)", shouldRead); + Assert.assertTrue( + "Should read: id between lower and upper bounds (30 < 75 < 79, 30 < 76 < 79)", shouldRead); shouldRead = shouldRead(notIn("id", INT_MAX_VALUE, INT_MAX_VALUE + 1)); Assert.assertTrue("Should read: id equal to upper bound (79 == 79)", shouldRead); @@ -826,7 +858,8 @@ public void testIntegerNotIn() { shouldRead = shouldRead(notIn("no_nulls", "aaa", "")); if (format == FileFormat.PARQUET) { - // no_nulls column has all values == "", so notIn("no_nulls", "") should always be false and so should be skipped + // no_nulls column has all values == "", so notIn("no_nulls", "") should always be false and + // so should be skipped // However, the metrics evaluator in Parquets always reads row group for a notIn filter Assert.assertTrue("Should read: notIn on no nulls column", shouldRead); } else { @@ -860,8 +893,9 @@ public void testInLimitParquet() { public void testParquetTypePromotion() { Assume.assumeTrue("Only valid for Parquet", format == FileFormat.PARQUET); Schema promotedSchema = new Schema(required(1, "id", Types.LongType.get())); - boolean shouldRead = new ParquetMetricsRowGroupFilter(promotedSchema, equal("id", INT_MIN_VALUE + 1), true) - .shouldRead(parquetSchema, rowGroupMetadata); + boolean shouldRead = + new ParquetMetricsRowGroupFilter(promotedSchema, equal("id", INT_MIN_VALUE + 1), true) + .shouldRead(parquetSchema, rowGroupMetadata); Assert.assertTrue("Should succeed with promoted schema", shouldRead); } @@ -876,24 +910,29 @@ private 
boolean shouldRead(Expression expression, boolean caseSensitive) { case PARQUET: return shouldReadParquet(expression, caseSensitive, parquetSchema, rowGroupMetadata); default: - throw new UnsupportedOperationException("Row group filter tests not supported for " + format); + throw new UnsupportedOperationException( + "Row group filter tests not supported for " + format); } } private boolean shouldReadOrc(Expression expression, boolean caseSensitive) { - try (CloseableIterable reader = ORC.read(Files.localInput(orcFile)) - .project(SCHEMA) - .createReaderFunc(fileSchema -> GenericOrcReader.buildReader(SCHEMA, fileSchema)) - .filter(expression) - .caseSensitive(caseSensitive) - .build()) { + try (CloseableIterable reader = + ORC.read(Files.localInput(orcFile)) + .project(SCHEMA) + .createReaderFunc(fileSchema -> GenericOrcReader.buildReader(SCHEMA, fileSchema)) + .filter(expression) + .caseSensitive(caseSensitive) + .build()) { return Lists.newArrayList(reader).size() > 0; } catch (IOException e) { throw new UncheckedIOException(e); } } - private boolean shouldReadParquet(Expression expression, boolean caseSensitive, MessageType messageType, + private boolean shouldReadParquet( + Expression expression, + boolean caseSensitive, + MessageType messageType, BlockMetaData blockMetaData) { return new ParquetMetricsRowGroupFilter(SCHEMA, expression, caseSensitive) .shouldRead(messageType, blockMetaData); diff --git a/data/src/test/java/org/apache/iceberg/data/TestMetricsRowGroupFilterTypes.java b/data/src/test/java/org/apache/iceberg/data/TestMetricsRowGroupFilterTypes.java index e7a26f233f0b..be2d7c7c868b 100644 --- a/data/src/test/java/org/apache/iceberg/data/TestMetricsRowGroupFilterTypes.java +++ b/data/src/test/java/org/apache/iceberg/data/TestMetricsRowGroupFilterTypes.java @@ -16,9 +16,11 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.data; +import static org.apache.iceberg.expressions.Expressions.equal; +import static org.apache.iceberg.types.Types.NestedField.optional; + import java.io.File; import java.io.IOException; import java.io.UncheckedIOException; @@ -76,62 +78,62 @@ import org.junit.runner.RunWith; import org.junit.runners.Parameterized; -import static org.apache.iceberg.expressions.Expressions.equal; -import static org.apache.iceberg.types.Types.NestedField.optional; - @RunWith(Parameterized.class) public class TestMetricsRowGroupFilterTypes { - private static final Schema SCHEMA = new Schema( - optional(1, "boolean", BooleanType.get()), - optional(2, "int", IntegerType.get()), - optional(3, "long", LongType.get()), - optional(4, "float", FloatType.get()), - optional(5, "double", DoubleType.get()), - optional(6, "date", DateType.get()), - optional(7, "time", TimeType.get()), - optional(8, "timestamp", TimestampType.withoutZone()), - optional(9, "timestamptz", TimestampType.withZone()), - optional(10, "string", StringType.get()), - optional(11, "uuid", UUIDType.get()), - optional(12, "fixed", FixedType.ofLength(4)), - optional(13, "binary", BinaryType.get()), - optional(14, "int_decimal", Types.DecimalType.of(8, 2)), - optional(15, "long_decimal", Types.DecimalType.of(14, 2)), - optional(16, "fixed_decimal", Types.DecimalType.of(31, 2)) - ); + private static final Schema SCHEMA = + new Schema( + optional(1, "boolean", BooleanType.get()), + optional(2, "int", IntegerType.get()), + optional(3, "long", LongType.get()), + optional(4, "float", FloatType.get()), + optional(5, "double", DoubleType.get()), + optional(6, "date", DateType.get()), + optional(7, "time", TimeType.get()), + optional(8, "timestamp", TimestampType.withoutZone()), + optional(9, "timestamptz", TimestampType.withZone()), + optional(10, "string", StringType.get()), + optional(11, "uuid", UUIDType.get()), + optional(12, "fixed", FixedType.ofLength(4)), + optional(13, "binary", BinaryType.get()), + optional(14, "int_decimal", Types.DecimalType.of(8, 2)), + optional(15, "long_decimal", Types.DecimalType.of(14, 2)), + optional(16, "fixed_decimal", Types.DecimalType.of(31, 2))); - private static final Schema FILE_SCHEMA = new Schema( - optional(1, "_boolean", BooleanType.get()), - optional(2, "_int", IntegerType.get()), - optional(3, "_long", LongType.get()), - optional(4, "_float", FloatType.get()), - optional(5, "_double", DoubleType.get()), - optional(6, "_date", DateType.get()), - optional(7, "_time", TimeType.get()), - optional(8, "_timestamp", TimestampType.withoutZone()), - optional(9, "_timestamptz", TimestampType.withZone()), - optional(10, "_string", StringType.get()), - optional(11, "_uuid", UUIDType.get()), - optional(12, "_fixed", FixedType.ofLength(4)), - optional(13, "_binary", BinaryType.get()), - optional(14, "_int_decimal", Types.DecimalType.of(8, 2)), - optional(15, "_long_decimal", Types.DecimalType.of(14, 2)), - optional(16, "_fixed_decimal", Types.DecimalType.of(31, 2)) - ); + private static final Schema FILE_SCHEMA = + new Schema( + optional(1, "_boolean", BooleanType.get()), + optional(2, "_int", IntegerType.get()), + optional(3, "_long", LongType.get()), + optional(4, "_float", FloatType.get()), + optional(5, "_double", DoubleType.get()), + optional(6, "_date", DateType.get()), + optional(7, "_time", TimeType.get()), + optional(8, "_timestamp", TimestampType.withoutZone()), + optional(9, "_timestamptz", TimestampType.withZone()), + optional(10, "_string", StringType.get()), + optional(11, 
"_uuid", UUIDType.get()), + optional(12, "_fixed", FixedType.ofLength(4)), + optional(13, "_binary", BinaryType.get()), + optional(14, "_int_decimal", Types.DecimalType.of(8, 2)), + optional(15, "_long_decimal", Types.DecimalType.of(14, 2)), + optional(16, "_fixed_decimal", Types.DecimalType.of(31, 2))); private static final File ORC_FILE = new File("/tmp/stats-row-group-filter-types-test.orc"); - private static final File PARQUET_FILE = new File("/tmp/stats-row-group-filter-types-test.parquet"); + private static final File PARQUET_FILE = + new File("/tmp/stats-row-group-filter-types-test.parquet"); private static MessageType parquetSchema = null; private static BlockMetaData rowGroupMetadata = null; private static final UUID uuid = UUID.randomUUID(); - private static final LocalDate date = LocalDate.parse("2018-06-29", DateTimeFormatter.ISO_LOCAL_DATE); - private static final LocalTime time = LocalTime.parse("10:02:34.000000", DateTimeFormatter.ISO_LOCAL_TIME); - private static final OffsetDateTime timestamptz = OffsetDateTime.parse("2018-06-29T10:02:34.000000+00:00", - DateTimeFormatter.ISO_DATE_TIME); - private static final LocalDateTime timestamp = LocalDateTime.parse("2018-06-29T10:02:34.000000", - DateTimeFormatter.ISO_LOCAL_DATE_TIME); + private static final LocalDate date = + LocalDate.parse("2018-06-29", DateTimeFormatter.ISO_LOCAL_DATE); + private static final LocalTime time = + LocalTime.parse("10:02:34.000000", DateTimeFormatter.ISO_LOCAL_TIME); + private static final OffsetDateTime timestamptz = + OffsetDateTime.parse("2018-06-29T10:02:34.000000+00:00", DateTimeFormatter.ISO_DATE_TIME); + private static final LocalDateTime timestamp = + LocalDateTime.parse("2018-06-29T10:02:34.000000", DateTimeFormatter.ISO_LOCAL_DATE_TIME); private static final byte[] fixed = "abcd".getBytes(StandardCharsets.UTF_8); @Before @@ -150,8 +152,9 @@ public void createInputFile() throws IOException { record.setField("_timestamp", timestamp); record.setField("_timestamptz", timestamptz); record.setField("_string", "tapir"); - // record.setField("_uuid", uuid); // Disable writing UUID value as GenericParquetWriter does not handle UUID type - // correctly; Also UUID tests are disabled for both ORC and Parquet anyway + // record.setField("_uuid", uuid); // Disable writing UUID value as GenericParquetWriter does + // not handle UUID type + // correctly; Also UUID tests are disabled for both ORC and Parquet anyway record.setField("_fixed", fixed); record.setField("_binary", ByteBuffer.wrap("xyz".getBytes(StandardCharsets.UTF_8))); record.setField("_int_decimal", new BigDecimal("77.77")); @@ -167,7 +170,8 @@ public void createInputFile() throws IOException { createParquetInputFile(records); break; default: - throw new UnsupportedOperationException("Row group filter types tests not supported for " + format); + throw new UnsupportedOperationException( + "Row group filter types tests not supported for " + format); } } @@ -177,16 +181,18 @@ public void createOrcInputFile(List records) throws IOException { } OutputFile outFile = Files.localOutput(ORC_FILE); - try (FileAppender appender = ORC.write(outFile) - .schema(FILE_SCHEMA) - .createWriterFunc(GenericOrcWriter::buildWriter) - .build()) { + try (FileAppender appender = + ORC.write(outFile) + .schema(FILE_SCHEMA) + .createWriterFunc(GenericOrcWriter::buildWriter) + .build()) { appender.addAll(records); } InputFile inFile = Files.localInput(ORC_FILE); - try (Reader reader = OrcFile.createReader(new Path(inFile.location()), - OrcFile.readerOptions(new 
Configuration()))) { + try (Reader reader = + OrcFile.createReader( + new Path(inFile.location()), OrcFile.readerOptions(new Configuration()))) { Assert.assertEquals("Should create only one stripe", 1, reader.getStripes().size()); } @@ -199,10 +205,11 @@ public void createParquetInputFile(List records) throws IOException { } OutputFile outFile = Files.localOutput(PARQUET_FILE); - try (FileAppender appender = Parquet.write(outFile) - .schema(FILE_SCHEMA) - .createWriterFunc(GenericParquetWriter::buildWriter) - .build()) { + try (FileAppender appender = + Parquet.write(outFile) + .schema(FILE_SCHEMA) + .createWriterFunc(GenericParquetWriter::buildWriter) + .build()) { appender.addAll(records); } @@ -224,44 +231,52 @@ public void createParquetInputFile(List records) throws IOException { @Parameterized.Parameters(name = "format = {0} column = {1} readValue = {2} skipValue = {3}") public static Object[][] parameters() { return new Object[][] { - { "parquet", "boolean", false, true }, - { "parquet", "int", 5, 55 }, - { "parquet", "long", 5_000_000_049L, 5_000L }, - { "parquet", "float", 1.97f, 2.11f }, - { "parquet", "double", 2.11d, 1.97d }, - { "parquet", "date", "2018-06-29", "2018-05-03" }, - { "parquet", "time", "10:02:34.000000", "10:02:34.000001" }, - { "parquet", "timestamp", "2018-06-29T10:02:34.000000", "2018-06-29T15:02:34.000000" }, - { "parquet", "timestamptz", "2018-06-29T10:02:34.000000+00:00", "2018-06-29T10:02:34.000000-07:00" }, - { "parquet", "string", "tapir", "monthly" }, - // { "parquet", "uuid", uuid, UUID.randomUUID() }, // not supported yet - { "parquet", "fixed", "abcd".getBytes(StandardCharsets.UTF_8), new byte[] { 0, 1, 2, 3 } }, - { "parquet", "binary", "xyz".getBytes(StandardCharsets.UTF_8), new byte[] { 0, 1, 2, 3, 4, 5 } }, - { "parquet", "int_decimal", "77.77", "12.34" }, - { "parquet", "long_decimal", "88.88", "12.34" }, - { "parquet", "fixed_decimal", "99.99", "12.34" }, - - { "orc", "boolean", false, true }, - { "orc", "int", 5, 55 }, - { "orc", "long", 5_000_000_049L, 5_000L }, - { "orc", "float", 1.97f, 2.11f }, - { "orc", "double", 2.11d, 1.97d }, - { "orc", "date", "2018-06-29", "2018-05-03" }, - { "orc", "time", "10:02:34.000000", "10:02:34.000001" }, - { "orc", "timestamp", "2018-06-29T10:02:34.000000", "2018-06-29T15:02:34.000000" }, - { "orc", "timestamptz", "2018-06-29T10:02:34.000000+00:00", "2018-06-29T10:02:34.000000-07:00" }, - { "orc", "string", "tapir", "monthly" }, - // uuid, fixed and binary types not supported yet - // { "orc", "uuid", uuid, UUID.randomUUID() }, - // { "orc", "fixed", "abcd".getBytes(StandardCharsets.UTF_8), new byte[] { 0, 1, 2, 3 } }, - // { "orc", "binary", "xyz".getBytes(StandardCharsets.UTF_8), new byte[] { 0, 1, 2, 3, 4, 5 } }, - { "orc", "int_decimal", "77.77", "12.34" }, - { "orc", "long_decimal", "88.88", "12.34" }, - { "orc", "fixed_decimal", "99.99", "12.34" }, + {"parquet", "boolean", false, true}, + {"parquet", "int", 5, 55}, + {"parquet", "long", 5_000_000_049L, 5_000L}, + {"parquet", "float", 1.97f, 2.11f}, + {"parquet", "double", 2.11d, 1.97d}, + {"parquet", "date", "2018-06-29", "2018-05-03"}, + {"parquet", "time", "10:02:34.000000", "10:02:34.000001"}, + {"parquet", "timestamp", "2018-06-29T10:02:34.000000", "2018-06-29T15:02:34.000000"}, + { + "parquet", + "timestamptz", + "2018-06-29T10:02:34.000000+00:00", + "2018-06-29T10:02:34.000000-07:00" + }, + {"parquet", "string", "tapir", "monthly"}, + // { "parquet", "uuid", uuid, UUID.randomUUID() }, // not supported yet + {"parquet", "fixed", 
"abcd".getBytes(StandardCharsets.UTF_8), new byte[] {0, 1, 2, 3}}, + {"parquet", "binary", "xyz".getBytes(StandardCharsets.UTF_8), new byte[] {0, 1, 2, 3, 4, 5}}, + {"parquet", "int_decimal", "77.77", "12.34"}, + {"parquet", "long_decimal", "88.88", "12.34"}, + {"parquet", "fixed_decimal", "99.99", "12.34"}, + {"orc", "boolean", false, true}, + {"orc", "int", 5, 55}, + {"orc", "long", 5_000_000_049L, 5_000L}, + {"orc", "float", 1.97f, 2.11f}, + {"orc", "double", 2.11d, 1.97d}, + {"orc", "date", "2018-06-29", "2018-05-03"}, + {"orc", "time", "10:02:34.000000", "10:02:34.000001"}, + {"orc", "timestamp", "2018-06-29T10:02:34.000000", "2018-06-29T15:02:34.000000"}, + { + "orc", "timestamptz", "2018-06-29T10:02:34.000000+00:00", "2018-06-29T10:02:34.000000-07:00" + }, + {"orc", "string", "tapir", "monthly"}, + // uuid, fixed and binary types not supported yet + // { "orc", "uuid", uuid, UUID.randomUUID() }, + // { "orc", "fixed", "abcd".getBytes(StandardCharsets.UTF_8), new byte[] { 0, 1, 2, 3 } }, + // { "orc", "binary", "xyz".getBytes(StandardCharsets.UTF_8), new byte[] { 0, 1, 2, 3, 4, 5 } + // }, + {"orc", "int_decimal", "77.77", "12.34"}, + {"orc", "long_decimal", "88.88", "12.34"}, + {"orc", "fixed_decimal", "99.99", "12.34"}, }; } - public TestMetricsRowGroupFilterTypes(String format, String column, Object readValue, Object skipValue) { + public TestMetricsRowGroupFilterTypes( + String format, String column, Object readValue, Object skipValue) { this.format = FileFormat.valueOf(format.toUpperCase(Locale.ENGLISH)); this.column = column; this.readValue = readValue; @@ -284,16 +299,18 @@ private boolean shouldRead(Object value) { case PARQUET: return shouldReadParquet(value); default: - throw new UnsupportedOperationException("Row group filter types tests not supported for " + format); + throw new UnsupportedOperationException( + "Row group filter types tests not supported for " + format); } } private boolean shouldReadOrc(Object value) { - try (CloseableIterable reader = ORC.read(Files.localInput(ORC_FILE)) - .project(SCHEMA) - .createReaderFunc(fileSchema -> GenericOrcReader.buildReader(SCHEMA, fileSchema)) - .filter(Expressions.equal(column, value)) - .build()) { + try (CloseableIterable reader = + ORC.read(Files.localInput(ORC_FILE)) + .project(SCHEMA) + .createReaderFunc(fileSchema -> GenericOrcReader.buildReader(SCHEMA, fileSchema)) + .filter(Expressions.equal(column, value)) + .build()) { return Lists.newArrayList(reader).size() > 0; } catch (IOException e) { throw new UncheckedIOException(e); diff --git a/data/src/test/java/org/apache/iceberg/data/TestReadProjection.java b/data/src/test/java/org/apache/iceberg/data/TestReadProjection.java index 9271bce075f7..42b9d957dd62 100644 --- a/data/src/test/java/org/apache/iceberg/data/TestReadProjection.java +++ b/data/src/test/java/org/apache/iceberg/data/TestReadProjection.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.data; import java.io.IOException; @@ -37,20 +36,17 @@ import org.junit.rules.TemporaryFolder; public abstract class TestReadProjection { - protected abstract Record writeAndRead(String desc, - Schema writeSchema, - Schema readSchema, - Record record) throws IOException; + protected abstract Record writeAndRead( + String desc, Schema writeSchema, Schema readSchema, Record record) throws IOException; - @Rule - public TemporaryFolder temp = new TemporaryFolder(); + @Rule public TemporaryFolder temp = new TemporaryFolder(); @Test public void testFullProjection() throws Exception { - Schema schema = new Schema( - Types.NestedField.required(0, "id", Types.LongType.get()), - Types.NestedField.optional(1, "data", Types.StringType.get()) - ); + Schema schema = + new Schema( + Types.NestedField.required(0, "id", Types.LongType.get()), + Types.NestedField.optional(1, "data", Types.StringType.get())); Record record = GenericRecord.create(schema.asStruct()); record.setField("id", 34L); @@ -58,19 +54,20 @@ public void testFullProjection() throws Exception { Record projected = writeAndRead("full_projection", schema, schema, record); - Assert.assertEquals("Should contain the correct id value", 34L, (long) projected.getField("id")); + Assert.assertEquals( + "Should contain the correct id value", 34L, (long) projected.getField("id")); - int cmp = Comparators.charSequences() - .compare("test", (CharSequence) projected.getField("data")); + int cmp = + Comparators.charSequences().compare("test", (CharSequence) projected.getField("data")); Assert.assertTrue("Should contain the correct data value", cmp == 0); } @Test public void testSpecialCharacterProjection() throws Exception { - Schema schema = new Schema( - Types.NestedField.required(0, "user id", Types.LongType.get()), - Types.NestedField.optional(1, "data%0", Types.StringType.get()) - ); + Schema schema = + new Schema( + Types.NestedField.required(0, "user id", Types.LongType.get()), + Types.NestedField.optional(1, "data%0", Types.StringType.get())); Record record = GenericRecord.create(schema.asStruct()); record.setField("user id", 34L); @@ -78,34 +75,37 @@ public void testSpecialCharacterProjection() throws Exception { Record full = writeAndRead("special_chars", schema, schema, record); - Assert.assertEquals("Should contain the correct id value", 34L, (long) full.getField("user id")); - Assert.assertEquals("Should contain the correct data value", + Assert.assertEquals( + "Should contain the correct id value", 34L, (long) full.getField("user id")); + Assert.assertEquals( + "Should contain the correct data value", 0, Comparators.charSequences().compare("test", (CharSequence) full.getField("data%0"))); Record projected = writeAndRead("special_characters", schema, schema.select("data%0"), record); Assert.assertNull("Should not contain id value", projected.getField("user id")); - Assert.assertEquals("Should contain the correct data value", + Assert.assertEquals( + "Should contain the correct data value", 0, Comparators.charSequences().compare("test", (CharSequence) projected.getField("data%0"))); } @Test public void testReorderedFullProjection() throws Exception { - Schema schema = new Schema( - Types.NestedField.required(0, "id", Types.LongType.get()), - Types.NestedField.optional(1, "data", Types.StringType.get()) - ); + Schema schema = + new Schema( + Types.NestedField.required(0, "id", Types.LongType.get()), + Types.NestedField.optional(1, "data", Types.StringType.get())); Record record = 
GenericRecord.create(schema.asStruct()); record.setField("id", 34L); record.setField("data", "test"); - Schema reordered = new Schema( - Types.NestedField.optional(1, "data", Types.StringType.get()), - Types.NestedField.required(0, "id", Types.LongType.get()) - ); + Schema reordered = + new Schema( + Types.NestedField.optional(1, "data", Types.StringType.get()), + Types.NestedField.required(0, "id", Types.LongType.get())); Record projected = writeAndRead("full_projection", schema, reordered, record); @@ -115,20 +115,20 @@ public void testReorderedFullProjection() throws Exception { @Test public void testReorderedProjection() throws Exception { - Schema schema = new Schema( - Types.NestedField.required(0, "id", Types.LongType.get()), - Types.NestedField.optional(1, "data", Types.StringType.get()) - ); + Schema schema = + new Schema( + Types.NestedField.required(0, "id", Types.LongType.get()), + Types.NestedField.optional(1, "data", Types.StringType.get())); Record record = GenericRecord.create(schema.asStruct()); record.setField("id", 34L); record.setField("data", "test"); - Schema reordered = new Schema( - Types.NestedField.optional(2, "missing_1", Types.StringType.get()), - Types.NestedField.optional(1, "data", Types.StringType.get()), - Types.NestedField.optional(3, "missing_2", Types.LongType.get()) - ); + Schema reordered = + new Schema( + Types.NestedField.optional(2, "missing_1", Types.StringType.get()), + Types.NestedField.optional(1, "data", Types.StringType.get()), + Types.NestedField.optional(3, "missing_2", Types.LongType.get())); Record projected = writeAndRead("full_projection", schema, reordered, record); @@ -139,41 +139,45 @@ public void testReorderedProjection() throws Exception { @Test public void testRenamedAddedField() throws Exception { - Schema schema = new Schema( - Types.NestedField.required(1, "a", Types.LongType.get()), - Types.NestedField.required(2, "b", Types.LongType.get()), - Types.NestedField.required(3, "d", Types.LongType.get()) - ); + Schema schema = + new Schema( + Types.NestedField.required(1, "a", Types.LongType.get()), + Types.NestedField.required(2, "b", Types.LongType.get()), + Types.NestedField.required(3, "d", Types.LongType.get())); Record record = GenericRecord.create(schema.asStruct()); record.setField("a", 100L); record.setField("b", 200L); record.setField("d", 300L); - Schema renamedAdded = new Schema( - Types.NestedField.optional(1, "a", Types.LongType.get()), - Types.NestedField.optional(2, "b", Types.LongType.get()), - Types.NestedField.optional(3, "c", Types.LongType.get()), - Types.NestedField.optional(4, "d", Types.LongType.get()) - ); + Schema renamedAdded = + new Schema( + Types.NestedField.optional(1, "a", Types.LongType.get()), + Types.NestedField.optional(2, "b", Types.LongType.get()), + Types.NestedField.optional(3, "c", Types.LongType.get()), + Types.NestedField.optional(4, "d", Types.LongType.get())); - Record projected = writeAndRead("rename_and_add_column_projection", schema, renamedAdded, record); + Record projected = + writeAndRead("rename_and_add_column_projection", schema, renamedAdded, record); Assert.assertEquals("Should contain the correct value in column 1", projected.get(0), 100L); - Assert.assertEquals("Should contain the correct value in column a", projected.getField("a"), 100L); + Assert.assertEquals( + "Should contain the correct value in column a", projected.getField("a"), 100L); Assert.assertEquals("Should contain the correct value in column 2", projected.get(1), 200L); - Assert.assertEquals("Should contain the 
correct value in column b", projected.getField("b"), 200L); + Assert.assertEquals( + "Should contain the correct value in column b", projected.getField("b"), 200L); Assert.assertEquals("Should contain the correct value in column 3", projected.get(2), 300L); - Assert.assertEquals("Should contain the correct value in column c", projected.getField("c"), 300L); + Assert.assertEquals( + "Should contain the correct value in column c", projected.getField("c"), 300L); Assert.assertNull("Should contain empty value on new column 4", projected.get(3)); Assert.assertNull("Should contain the correct value in column d", projected.getField("d")); } @Test public void testEmptyProjection() throws Exception { - Schema schema = new Schema( - Types.NestedField.required(0, "id", Types.LongType.get()), - Types.NestedField.optional(1, "data", Types.StringType.get()) - ); + Schema schema = + new Schema( + Types.NestedField.required(0, "id", Types.LongType.get()), + Types.NestedField.optional(1, "data", Types.StringType.get())); Record record = GenericRecord.create(schema.asStruct()); record.setField("id", 34L); @@ -192,70 +196,70 @@ public void testEmptyProjection() throws Exception { @Test public void testBasicProjection() throws Exception { - Schema writeSchema = new Schema( - Types.NestedField.required(0, "id", Types.LongType.get()), - Types.NestedField.optional(1, "data", Types.StringType.get()), - Types.NestedField.optional(2, "time", Types.TimestampType.withZone()) - ); + Schema writeSchema = + new Schema( + Types.NestedField.required(0, "id", Types.LongType.get()), + Types.NestedField.optional(1, "data", Types.StringType.get()), + Types.NestedField.optional(2, "time", Types.TimestampType.withZone())); Record record = GenericRecord.create(writeSchema.asStruct()); record.setField("id", 34L); record.setField("data", "test"); record.setField("time", OffsetDateTime.now(ZoneOffset.UTC)); - Schema idOnly = new Schema( - Types.NestedField.required(0, "id", Types.LongType.get()) - ); + Schema idOnly = new Schema(Types.NestedField.required(0, "id", Types.LongType.get())); Record projected = writeAndRead("basic_projection_id", writeSchema, idOnly, record); Assert.assertNull("Should not project data", projected.getField("data")); - Assert.assertEquals("Should contain the correct id value", 34L, (long) projected.getField("id")); + Assert.assertEquals( + "Should contain the correct id value", 34L, (long) projected.getField("id")); - Schema dataOnly = new Schema( - Types.NestedField.optional(1, "data", Types.StringType.get()) - ); + Schema dataOnly = new Schema(Types.NestedField.optional(1, "data", Types.StringType.get())); projected = writeAndRead("basic_projection_data", writeSchema, dataOnly, record); Assert.assertNull("Should not project id", projected.getField("id")); - int cmp = Comparators.charSequences() - .compare("test", (CharSequence) projected.getField("data")); + int cmp = + Comparators.charSequences().compare("test", (CharSequence) projected.getField("data")); Assert.assertTrue("Should contain the correct data value", cmp == 0); } @Test public void testRename() throws Exception { - Schema writeSchema = new Schema( - Types.NestedField.required(0, "id", Types.LongType.get()), - Types.NestedField.optional(1, "data", Types.StringType.get()) - ); + Schema writeSchema = + new Schema( + Types.NestedField.required(0, "id", Types.LongType.get()), + Types.NestedField.optional(1, "data", Types.StringType.get())); Record record = GenericRecord.create(writeSchema.asStruct()); record.setField("id", 34L); 
record.setField("data", "test"); - Schema readSchema = new Schema( - Types.NestedField.required(0, "id", Types.LongType.get()), - Types.NestedField.optional(1, "renamed", Types.StringType.get()) - ); + Schema readSchema = + new Schema( + Types.NestedField.required(0, "id", Types.LongType.get()), + Types.NestedField.optional(1, "renamed", Types.StringType.get())); Record projected = writeAndRead("project_and_rename", writeSchema, readSchema, record); - Assert.assertEquals("Should contain the correct id value", 34L, (long) projected.getField("id")); - int cmp = Comparators.charSequences() - .compare("test", (CharSequence) projected.getField("renamed")); + Assert.assertEquals( + "Should contain the correct id value", 34L, (long) projected.getField("id")); + int cmp = + Comparators.charSequences().compare("test", (CharSequence) projected.getField("renamed")); Assert.assertTrue("Should contain the correct data/renamed value", cmp == 0); } @Test public void testNestedStructProjection() throws Exception { - Schema writeSchema = new Schema( - Types.NestedField.required(0, "id", Types.LongType.get()), - Types.NestedField.optional(3, "location", Types.StructType.of( - Types.NestedField.required(1, "lat", Types.FloatType.get()), - Types.NestedField.required(2, "long", Types.FloatType.get()) - )) - ); + Schema writeSchema = + new Schema( + Types.NestedField.required(0, "id", Types.LongType.get()), + Types.NestedField.optional( + 3, + "location", + Types.StructType.of( + Types.NestedField.required(1, "lat", Types.FloatType.get()), + Types.NestedField.required(2, "long", Types.FloatType.get())))); Record record = GenericRecord.create(writeSchema.asStruct()); record.setField("id", 34L); @@ -264,61 +268,76 @@ public void testNestedStructProjection() throws Exception { location.setField("long", -1.539054f); record.setField("location", location); - Schema idOnly = new Schema( - Types.NestedField.required(0, "id", Types.LongType.get()) - ); + Schema idOnly = new Schema(Types.NestedField.required(0, "id", Types.LongType.get())); Record projected = writeAndRead("id_only", writeSchema, idOnly, record); Record projectedLocation = (Record) projected.getField("location"); - Assert.assertEquals("Should contain the correct id value", 34L, (long) projected.getField("id")); + Assert.assertEquals( + "Should contain the correct id value", 34L, (long) projected.getField("id")); Assert.assertNull("Should not project location", projectedLocation); - Schema latOnly = new Schema( - Types.NestedField.optional(3, "location", Types.StructType.of( - Types.NestedField.required(1, "lat", Types.FloatType.get()) - )) - ); + Schema latOnly = + new Schema( + Types.NestedField.optional( + 3, + "location", + Types.StructType.of(Types.NestedField.required(1, "lat", Types.FloatType.get())))); projected = writeAndRead("latitude_only", writeSchema, latOnly, record); projectedLocation = (Record) projected.getField("location"); Assert.assertNull("Should not project id", projected.getField("id")); Assert.assertNotNull("Should project location", projected.getField("location")); Assert.assertNull("Should not project longitude", projectedLocation.getField("long")); - Assert.assertEquals("Should project latitude", - 52.995143f, (float) projectedLocation.getField("lat"), 0.000001f); - - Schema longOnly = new Schema( - Types.NestedField.optional(3, "location", Types.StructType.of( - Types.NestedField.required(2, "long", Types.FloatType.get()) - )) - ); + Assert.assertEquals( + "Should project latitude", + 52.995143f, + (float) 
projectedLocation.getField("lat"), + 0.000001f); + + Schema longOnly = + new Schema( + Types.NestedField.optional( + 3, + "location", + Types.StructType.of(Types.NestedField.required(2, "long", Types.FloatType.get())))); projected = writeAndRead("longitude_only", writeSchema, longOnly, record); projectedLocation = (Record) projected.getField("location"); Assert.assertNull("Should not project id", projected.getField("id")); Assert.assertNotNull("Should project location", projected.getField("location")); Assert.assertNull("Should not project latitutde", projectedLocation.getField("lat")); - Assert.assertEquals("Should project longitude", - -1.539054f, (float) projectedLocation.getField("long"), 0.000001f); + Assert.assertEquals( + "Should project longitude", + -1.539054f, + (float) projectedLocation.getField("long"), + 0.000001f); Schema locationOnly = writeSchema.select("location"); projected = writeAndRead("location_only", writeSchema, locationOnly, record); projectedLocation = (Record) projected.getField("location"); Assert.assertNull("Should not project id", projected.getField("id")); Assert.assertNotNull("Should project location", projected.getField("location")); - Assert.assertEquals("Should project latitude", - 52.995143f, (float) projectedLocation.getField("lat"), 0.000001f); - Assert.assertEquals("Should project longitude", - -1.539054f, (float) projectedLocation.getField("long"), 0.000001f); + Assert.assertEquals( + "Should project latitude", + 52.995143f, + (float) projectedLocation.getField("lat"), + 0.000001f); + Assert.assertEquals( + "Should project longitude", + -1.539054f, + (float) projectedLocation.getField("long"), + 0.000001f); } @Test public void testMapProjection() throws IOException { - Schema writeSchema = new Schema( - Types.NestedField.required(0, "id", Types.LongType.get()), - Types.NestedField.optional(5, "properties", - Types.MapType.ofOptional(6, 7, Types.StringType.get(), Types.StringType.get())) - ); + Schema writeSchema = + new Schema( + Types.NestedField.required(0, "id", Types.LongType.get()), + Types.NestedField.optional( + 5, + "properties", + Types.MapType.ofOptional(6, 7, Types.StringType.get(), Types.StringType.get()))); Map properties = ImmutableMap.of("a", "A", "b", "B"); @@ -326,31 +345,36 @@ public void testMapProjection() throws IOException { record.setField("id", 34L); record.setField("properties", properties); - Schema idOnly = new Schema( - Types.NestedField.required(0, "id", Types.LongType.get()) - ); + Schema idOnly = new Schema(Types.NestedField.required(0, "id", Types.LongType.get())); Record projected = writeAndRead("id_only", writeSchema, idOnly, record); - Assert.assertEquals("Should contain the correct id value", 34L, (long) projected.getField("id")); + Assert.assertEquals( + "Should contain the correct id value", 34L, (long) projected.getField("id")); Assert.assertNull("Should not project properties map", projected.getField("properties")); Schema keyOnly = writeSchema.select("properties.key"); projected = writeAndRead("key_only", writeSchema, keyOnly, record); Assert.assertNull("Should not project id", projected.getField("id")); - Assert.assertEquals("Should project entire map", - properties, toStringMap((Map) projected.getField("properties"))); + Assert.assertEquals( + "Should project entire map", + properties, + toStringMap((Map) projected.getField("properties"))); Schema valueOnly = writeSchema.select("properties.value"); projected = writeAndRead("value_only", writeSchema, valueOnly, record); Assert.assertNull("Should not project 
id", projected.getField("id")); - Assert.assertEquals("Should project entire map", - properties, toStringMap((Map) projected.getField("properties"))); + Assert.assertEquals( + "Should project entire map", + properties, + toStringMap((Map) projected.getField("properties"))); Schema mapOnly = writeSchema.select("properties"); projected = writeAndRead("map_only", writeSchema, mapOnly, record); Assert.assertNull("Should not project id", projected.getField("id")); - Assert.assertEquals("Should project entire map", - properties, toStringMap((Map) projected.getField("properties"))); + Assert.assertEquals( + "Should project entire map", + properties, + toStringMap((Map) projected.getField("properties"))); } private Map toStringMap(Map map) { @@ -367,112 +391,128 @@ public void testMapProjection() throws IOException { @Test public void testMapOfStructsProjection() throws IOException { - Schema writeSchema = new Schema( - Types.NestedField.required(0, "id", Types.LongType.get()), - Types.NestedField.optional(5, "locations", Types.MapType.ofOptional(6, 7, - Types.StringType.get(), - Types.StructType.of( - Types.NestedField.required(1, "lat", Types.FloatType.get()), - Types.NestedField.required(2, "long", Types.FloatType.get()) - ) - )) - ); + Schema writeSchema = + new Schema( + Types.NestedField.required(0, "id", Types.LongType.get()), + Types.NestedField.optional( + 5, + "locations", + Types.MapType.ofOptional( + 6, + 7, + Types.StringType.get(), + Types.StructType.of( + Types.NestedField.required(1, "lat", Types.FloatType.get()), + Types.NestedField.required(2, "long", Types.FloatType.get()))))); Record record = GenericRecord.create(writeSchema.asStruct()); record.setField("id", 34L); - Record l1 = GenericRecord.create(writeSchema.findType("locations").asMapType().valueType().asStructType()); + Record l1 = + GenericRecord.create( + writeSchema.findType("locations").asMapType().valueType().asStructType()); l1.setField("lat", 53.992811f); l1.setField("long", -1.542616f); - Record l2 = GenericRecord.create(writeSchema.findType("locations").asMapType().valueType().asStructType()); + Record l2 = + GenericRecord.create( + writeSchema.findType("locations").asMapType().valueType().asStructType()); l2.setField("lat", 52.995143f); l2.setField("long", -1.539054f); record.setField("locations", ImmutableMap.of("L1", l1, "L2", l2)); - Schema idOnly = new Schema( - Types.NestedField.required(0, "id", Types.LongType.get()) - ); + Schema idOnly = new Schema(Types.NestedField.required(0, "id", Types.LongType.get())); Record projected = writeAndRead("id_only", writeSchema, idOnly, record); - Assert.assertEquals("Should contain the correct id value", 34L, (long) projected.getField("id")); + Assert.assertEquals( + "Should contain the correct id value", 34L, (long) projected.getField("id")); Assert.assertNull("Should not project locations map", projected.getField("locations")); projected = writeAndRead("all_locations", writeSchema, writeSchema.select("locations"), record); Assert.assertNull("Should not project id", projected.getField("id")); - Assert.assertEquals("Should project locations map", - record.getField("locations"), toStringMap((Map) projected.getField("locations"))); + Assert.assertEquals( + "Should project locations map", + record.getField("locations"), + toStringMap((Map) projected.getField("locations"))); - projected = writeAndRead("lat_only", - writeSchema, writeSchema.select("locations.lat"), record); + projected = writeAndRead("lat_only", writeSchema, writeSchema.select("locations.lat"), record); 
Assert.assertNull("Should not project id", projected.getField("id")); Map locations = toStringMap((Map) projected.getField("locations")); Assert.assertNotNull("Should project locations map", locations); - Assert.assertEquals("Should contain L1 and L2", - Sets.newHashSet("L1", "L2"), locations.keySet()); + Assert.assertEquals( + "Should contain L1 and L2", Sets.newHashSet("L1", "L2"), locations.keySet()); Record projectedL1 = (Record) locations.get("L1"); Assert.assertNotNull("L1 should not be null", projectedL1); - Assert.assertEquals("L1 should contain lat", - 53.992811f, (float) projectedL1.getField("lat"), 0.000001); + Assert.assertEquals( + "L1 should contain lat", 53.992811f, (float) projectedL1.getField("lat"), 0.000001); Assert.assertNull("L1 should not contain long", projectedL1.getField("long")); Record projectedL2 = (Record) locations.get("L2"); Assert.assertNotNull("L2 should not be null", projectedL2); - Assert.assertEquals("L2 should contain lat", - 52.995143f, (float) projectedL2.getField("lat"), 0.000001); + Assert.assertEquals( + "L2 should contain lat", 52.995143f, (float) projectedL2.getField("lat"), 0.000001); Assert.assertNull("L2 should not contain long", projectedL2.getField("long")); - projected = writeAndRead("long_only", - writeSchema, writeSchema.select("locations.long"), record); + projected = + writeAndRead("long_only", writeSchema, writeSchema.select("locations.long"), record); Assert.assertNull("Should not project id", projected.getField("id")); locations = toStringMap((Map) projected.getField("locations")); Assert.assertNotNull("Should project locations map", locations); - Assert.assertEquals("Should contain L1 and L2", - Sets.newHashSet("L1", "L2"), locations.keySet()); + Assert.assertEquals( + "Should contain L1 and L2", Sets.newHashSet("L1", "L2"), locations.keySet()); projectedL1 = (Record) locations.get("L1"); Assert.assertNotNull("L1 should not be null", projectedL1); Assert.assertNull("L1 should not contain lat", projectedL1.getField("lat")); - Assert.assertEquals("L1 should contain long", - -1.542616f, (float) projectedL1.getField("long"), 0.000001); + Assert.assertEquals( + "L1 should contain long", -1.542616f, (float) projectedL1.getField("long"), 0.000001); projectedL2 = (Record) locations.get("L2"); Assert.assertNotNull("L2 should not be null", projectedL2); Assert.assertNull("L2 should not contain lat", projectedL2.getField("lat")); - Assert.assertEquals("L2 should contain long", - -1.539054f, (float) projectedL2.getField("long"), 0.000001); - - Schema latitiudeRenamed = new Schema( - Types.NestedField.optional(5, "locations", Types.MapType.ofOptional(6, 7, - Types.StringType.get(), - Types.StructType.of( - Types.NestedField.required(1, "latitude", Types.FloatType.get()) - ) - )) - ); + Assert.assertEquals( + "L2 should contain long", -1.539054f, (float) projectedL2.getField("long"), 0.000001); + + Schema latitiudeRenamed = + new Schema( + Types.NestedField.optional( + 5, + "locations", + Types.MapType.ofOptional( + 6, + 7, + Types.StringType.get(), + Types.StructType.of( + Types.NestedField.required(1, "latitude", Types.FloatType.get()))))); projected = writeAndRead("latitude_renamed", writeSchema, latitiudeRenamed, record); Assert.assertNull("Should not project id", projected.getField("id")); locations = toStringMap((Map) projected.getField("locations")); Assert.assertNotNull("Should project locations map", locations); - Assert.assertEquals("Should contain L1 and L2", - Sets.newHashSet("L1", "L2"), locations.keySet()); + Assert.assertEquals( + 
"Should contain L1 and L2", Sets.newHashSet("L1", "L2"), locations.keySet()); projectedL1 = (Record) locations.get("L1"); Assert.assertNotNull("L1 should not be null", projectedL1); - Assert.assertEquals("L1 should contain latitude", - 53.992811f, (float) projectedL1.getField("latitude"), 0.000001); + Assert.assertEquals( + "L1 should contain latitude", + 53.992811f, + (float) projectedL1.getField("latitude"), + 0.000001); Assert.assertNull("L1 should not contain lat", projectedL1.getField("lat")); Assert.assertNull("L1 should not contain long", projectedL1.getField("long")); projectedL2 = (Record) locations.get("L2"); Assert.assertNotNull("L2 should not be null", projectedL2); - Assert.assertEquals("L2 should contain latitude", - 52.995143f, (float) projectedL2.getField("latitude"), 0.000001); + Assert.assertEquals( + "L2 should contain latitude", + 52.995143f, + (float) projectedL2.getField("latitude"), + 0.000001); Assert.assertNull("L2 should not contain lat", projectedL2.getField("lat")); Assert.assertNull("L2 should not contain long", projectedL2.getField("long")); } @Test public void testListProjection() throws IOException { - Schema writeSchema = new Schema( - Types.NestedField.required(0, "id", Types.LongType.get()), - Types.NestedField.optional(10, "values", - Types.ListType.ofOptional(11, Types.LongType.get())) - ); + Schema writeSchema = + new Schema( + Types.NestedField.required(0, "id", Types.LongType.get()), + Types.NestedField.optional( + 10, "values", Types.ListType.ofOptional(11, Types.LongType.get()))); List values = ImmutableList.of(56L, 57L, 58L); @@ -480,12 +520,11 @@ public void testListProjection() throws IOException { record.setField("id", 34L); record.setField("values", values); - Schema idOnly = new Schema( - Types.NestedField.required(0, "id", Types.LongType.get()) - ); + Schema idOnly = new Schema(Types.NestedField.required(0, "id", Types.LongType.get())); Record projected = writeAndRead("id_only", writeSchema, idOnly, record); - Assert.assertEquals("Should contain the correct id value", 34L, (long) projected.getField("id")); + Assert.assertEquals( + "Should contain the correct id value", 34L, (long) projected.getField("id")); Assert.assertNull("Should not project values list", projected.getField("values")); Schema elementOnly = writeSchema.select("values.element"); @@ -502,38 +541,43 @@ public void testListProjection() throws IOException { @Test @SuppressWarnings("unchecked") public void testListOfStructsProjection() throws IOException { - Schema writeSchema = new Schema( - Types.NestedField.required(0, "id", Types.LongType.get()), - Types.NestedField.optional(22, "points", - Types.ListType.ofOptional(21, Types.StructType.of( - Types.NestedField.required(19, "x", Types.IntegerType.get()), - Types.NestedField.optional(18, "y", Types.IntegerType.get()) - )) - ) - ); + Schema writeSchema = + new Schema( + Types.NestedField.required(0, "id", Types.LongType.get()), + Types.NestedField.optional( + 22, + "points", + Types.ListType.ofOptional( + 21, + Types.StructType.of( + Types.NestedField.required(19, "x", Types.IntegerType.get()), + Types.NestedField.optional(18, "y", Types.IntegerType.get()))))); Record record = GenericRecord.create(writeSchema.asStruct()); record.setField("id", 34L); - Record p1 = GenericRecord.create(writeSchema.findType("points").asListType().elementType().asStructType()); + Record p1 = + GenericRecord.create( + writeSchema.findType("points").asListType().elementType().asStructType()); p1.setField("x", 1); p1.setField("y", 2); - Record p2 = 
GenericRecord.create(writeSchema.findType("points").asListType().elementType().asStructType()); + Record p2 = + GenericRecord.create( + writeSchema.findType("points").asListType().elementType().asStructType()); p2.setField("x", 3); p2.setField("y", null); record.setField("points", ImmutableList.of(p1, p2)); - Schema idOnly = new Schema( - Types.NestedField.required(0, "id", Types.LongType.get()) - ); + Schema idOnly = new Schema(Types.NestedField.required(0, "id", Types.LongType.get())); Record projected = writeAndRead("id_only", writeSchema, idOnly, record); - Assert.assertEquals("Should contain the correct id value", 34L, (long) projected.getField("id")); + Assert.assertEquals( + "Should contain the correct id value", 34L, (long) projected.getField("id")); Assert.assertNull("Should not project points list", projected.getField("points")); projected = writeAndRead("all_points", writeSchema, writeSchema.select("points"), record); Assert.assertNull("Should not project id", projected.getField("id")); - Assert.assertEquals("Should project points list", - record.getField("points"), projected.getField("points")); + Assert.assertEquals( + "Should project points list", record.getField("points"), projected.getField("points")); projected = writeAndRead("x_only", writeSchema, writeSchema.select("points.x"), record); Assert.assertNull("Should not project id", projected.getField("id")); @@ -559,13 +603,15 @@ public void testListOfStructsProjection() throws IOException { Assert.assertNull("Should not project x", projectedP2.getField("x")); Assert.assertEquals("Should project null y", null, projectedP2.getField("y")); - Schema yRenamed = new Schema( - Types.NestedField.optional(22, "points", - Types.ListType.ofOptional(21, Types.StructType.of( - Types.NestedField.optional(18, "z", Types.IntegerType.get()) - )) - ) - ); + Schema yRenamed = + new Schema( + Types.NestedField.optional( + 22, + "points", + Types.ListType.ofOptional( + 21, + Types.StructType.of( + Types.NestedField.optional(18, "z", Types.IntegerType.get()))))); projected = writeAndRead("y_renamed", writeSchema, yRenamed, record); Assert.assertNull("Should not project id", projected.getField("id")); @@ -584,25 +630,29 @@ public void testListOfStructsProjection() throws IOException { @Test public void testAddedFieldsWithRequiredChildren() throws Exception { - Schema schema = new Schema( - Types.NestedField.required(1, "a", Types.LongType.get()) - ); + Schema schema = new Schema(Types.NestedField.required(1, "a", Types.LongType.get())); Record record = GenericRecord.create(schema.asStruct()); record.setField("a", 100L); - Schema addedFields = new Schema( - Types.NestedField.optional(1, "a", Types.LongType.get()), - Types.NestedField.optional(2, "b", Types.StructType.of( - Types.NestedField.required(3, "c", Types.LongType.get()) - )), - Types.NestedField.optional(4, "d", Types.ListType.ofRequired(5, Types.LongType.get())), - Types.NestedField.optional(6, "e", Types.MapType.ofRequired(7, 8, Types.LongType.get(), Types.LongType.get())) - ); - - Record projected = writeAndRead("add_fields_with_required_children_projection", schema, addedFields, record); + Schema addedFields = + new Schema( + Types.NestedField.optional(1, "a", Types.LongType.get()), + Types.NestedField.optional( + 2, + "b", + Types.StructType.of(Types.NestedField.required(3, "c", Types.LongType.get()))), + Types.NestedField.optional(4, "d", Types.ListType.ofRequired(5, Types.LongType.get())), + Types.NestedField.optional( + 6, + "e", + Types.MapType.ofRequired(7, 8, 
Types.LongType.get(), Types.LongType.get()))); + + Record projected = + writeAndRead("add_fields_with_required_children_projection", schema, addedFields, record); Assert.assertEquals("Should contain the correct value in column 1", projected.get(0), 100L); - Assert.assertEquals("Should contain the correct value in column a", projected.getField("a"), 100L); + Assert.assertEquals( + "Should contain the correct value in column a", projected.getField("a"), 100L); Assert.assertNull("Should contain empty value in new column 2", projected.get(1)); Assert.assertNull("Should contain empty value in column b", projected.getField("b")); Assert.assertNull("Should contain empty value in new column 4", projected.get(2)); diff --git a/data/src/test/java/org/apache/iceberg/data/avro/TestGenericData.java b/data/src/test/java/org/apache/iceberg/data/avro/TestGenericData.java index 287e54c203ee..ef690919ae07 100644 --- a/data/src/test/java/org/apache/iceberg/data/avro/TestGenericData.java +++ b/data/src/test/java/org/apache/iceberg/data/avro/TestGenericData.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.data.avro; import java.io.File; @@ -42,21 +41,23 @@ protected void writeAndValidate(Schema schema) throws IOException { File testFile = temp.newFile(); Assert.assertTrue("Delete should succeed", testFile.delete()); - try (FileAppender writer = Avro.write(Files.localOutput(testFile)) - .schema(schema) - .createWriterFunc(DataWriter::create) - .named("test") - .build()) { + try (FileAppender writer = + Avro.write(Files.localOutput(testFile)) + .schema(schema) + .createWriterFunc(DataWriter::create) + .named("test") + .build()) { for (Record rec : expected) { writer.add(rec); } } List rows; - try (AvroIterable reader = Avro.read(Files.localInput(testFile)) - .project(schema) - .createReaderFunc(DataReader::create) - .build()) { + try (AvroIterable reader = + Avro.read(Files.localInput(testFile)) + .project(schema) + .createReaderFunc(DataReader::create) + .build()) { rows = Lists.newArrayList(reader); } diff --git a/data/src/test/java/org/apache/iceberg/data/avro/TestGenericReadProjection.java b/data/src/test/java/org/apache/iceberg/data/avro/TestGenericReadProjection.java index 3450348b402e..7b2f2933716a 100644 --- a/data/src/test/java/org/apache/iceberg/data/avro/TestGenericReadProjection.java +++ b/data/src/test/java/org/apache/iceberg/data/avro/TestGenericReadProjection.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.data.avro; import java.io.File; @@ -36,17 +35,19 @@ protected Record writeAndRead(String desc, Schema writeSchema, Schema readSchema File file = temp.newFile(desc + ".avro"); file.delete(); - try (FileAppender appender = Avro.write(Files.localOutput(file)) - .schema(writeSchema) - .createWriterFunc(DataWriter::create) - .build()) { + try (FileAppender appender = + Avro.write(Files.localOutput(file)) + .schema(writeSchema) + .createWriterFunc(DataWriter::create) + .build()) { appender.add(record); } - Iterable records = Avro.read(Files.localInput(file)) - .project(readSchema) - .createReaderFunc(DataReader::create) - .build(); + Iterable records = + Avro.read(Files.localInput(file)) + .project(readSchema) + .createReaderFunc(DataReader::create) + .build(); return Iterables.getOnlyElement(records); } diff --git a/data/src/test/java/org/apache/iceberg/data/avro/TestSingleMessageEncoding.java b/data/src/test/java/org/apache/iceberg/data/avro/TestSingleMessageEncoding.java index 84ac6062063b..ce85e8a90519 100644 --- a/data/src/test/java/org/apache/iceberg/data/avro/TestSingleMessageEncoding.java +++ b/data/src/test/java/org/apache/iceberg/data/avro/TestSingleMessageEncoding.java @@ -16,9 +16,11 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.data.avro; +import static org.apache.iceberg.types.Types.NestedField.optional; +import static org.apache.iceberg.types.Types.NestedField.required; + import java.nio.ByteBuffer; import java.util.Arrays; import java.util.List; @@ -42,14 +44,10 @@ import org.junit.Assert; import org.junit.Test; -import static org.apache.iceberg.types.Types.NestedField.optional; -import static org.apache.iceberg.types.Types.NestedField.required; - public class TestSingleMessageEncoding { - private static final Schema SCHEMA_V1 = new Schema( - required(0, "id", Types.IntegerType.get()), - optional(1, "msg", Types.StringType.get()) - ); + private static final Schema SCHEMA_V1 = + new Schema( + required(0, "id", Types.IntegerType.get()), optional(1, "msg", Types.StringType.get())); private static Record v1Record(int id, String msg) { Record rec = GenericRecord.create(SCHEMA_V1.asStruct()); @@ -58,18 +56,14 @@ private static Record v1Record(int id, String msg) { return rec; } - private static final List V1_RECORDS = Arrays.asList( - v1Record(1, "m-1"), - v1Record(2, "m-2"), - v1Record(4, "m-4"), - v1Record(6, "m-6") - ); + private static final List V1_RECORDS = + Arrays.asList(v1Record(1, "m-1"), v1Record(2, "m-2"), v1Record(4, "m-4"), v1Record(6, "m-6")); - private static final Schema SCHEMA_V2 = new Schema( - required(0, "id", Types.LongType.get()), - optional(1, "message", Types.StringType.get()), - optional(2, "data", Types.DoubleType.get()) - ); + private static final Schema SCHEMA_V2 = + new Schema( + required(0, "id", Types.LongType.get()), + optional(1, "message", Types.StringType.get()), + optional(2, "data", Types.DoubleType.get())); private static Record v2Record(long id, String message, Double data) { Record rec = GenericRecord.create(SCHEMA_V2.asStruct()); @@ -79,12 +73,12 @@ private static Record v2Record(long id, String message, Double data) { return rec; } - private static final List V2_RECORDS = Arrays.asList( - v2Record(3L, "m-3", 12.3), - v2Record(5L, "m-5", 23.4), - v2Record(7L, "m-7", 34.5), - v2Record(8L, "m-8", 35.6) - ); + private static final List V2_RECORDS = + Arrays.asList( + v2Record(3L, "m-3", 12.3), + v2Record(5L, "m-5", 23.4), + v2Record(7L, "m-7", 
34.5), + v2Record(8L, "m-8", 35.6)); @Test public void testByteBufferRoundTrip() throws Exception { @@ -93,17 +87,15 @@ public void testByteBufferRoundTrip() throws Exception { Record copy = decoder.decode(encoder.encode(V2_RECORDS.get(0))); - Assert.assertTrue("Copy should not be the same object", - copy != V2_RECORDS.get(0)); - Assert.assertEquals("Record should be identical after round-trip", - V2_RECORDS.get(0), copy); + Assert.assertTrue("Copy should not be the same object", copy != V2_RECORDS.get(0)); + Assert.assertEquals("Record should be identical after round-trip", V2_RECORDS.get(0), copy); } @Test public void testSchemaEvolution() throws Exception { List buffers = Lists.newArrayList(); - List records = Ordering.usingToString().sortedCopy( - Iterables.concat(V1_RECORDS, V2_RECORDS)); + List records = + Ordering.usingToString().sortedCopy(Iterables.concat(V1_RECORDS, V2_RECORDS)); MessageEncoder v1Encoder = new IcebergEncoder<>(SCHEMA_V1); MessageEncoder v2Encoder = new IcebergEncoder<>(SCHEMA_V2); @@ -141,8 +133,8 @@ public void testCompatibleReadFailsWithoutSchema() throws Exception { ByteBuffer v1Buffer = v1Encoder.encode(V1_RECORDS.get(3)); Assertions.assertThatThrownBy(() -> v2Decoder.decode(v1Buffer)) - .isInstanceOf(MissingSchemaException.class) - .hasMessageContaining("Cannot resolve schema for fingerprint"); + .isInstanceOf(MissingSchemaException.class) + .hasMessageContaining("Cannot resolve schema for fingerprint"); } @Test @@ -186,8 +178,8 @@ public void testBufferReuse() throws Exception { Assert.assertEquals(b0.array(), b1.array()); MessageDecoder decoder = new IcebergDecoder<>(SCHEMA_V1); - Assert.assertEquals("Buffer was reused, decode(b0) should be record 1", - V1_RECORDS.get(1), decoder.decode(b0)); + Assert.assertEquals( + "Buffer was reused, decode(b0) should be record 1", V1_RECORDS.get(1), decoder.decode(b0)); } @Test @@ -201,8 +193,8 @@ public void testBufferCopy() throws Exception { MessageDecoder decoder = new IcebergDecoder<>(SCHEMA_V1); // bytes are not changed by reusing the encoder - Assert.assertEquals("Buffer was copied, decode(b0) should be record 0", - V1_RECORDS.get(0), decoder.decode(b0)); + Assert.assertEquals( + "Buffer was copied, decode(b0) should be record 0", V1_RECORDS.get(0), decoder.decode(b0)); } @Test @@ -215,8 +207,8 @@ public void testByteBufferMissingPayload() throws Exception { buffer.limit(12); Assertions.assertThatThrownBy(() -> decoder.decode(buffer)) - .isInstanceOf(AvroRuntimeException.class) - .hasMessage("Decoding datum failed"); + .isInstanceOf(AvroRuntimeException.class) + .hasMessage("Decoding datum failed"); } @Test @@ -229,8 +221,8 @@ public void testByteBufferMissingFullHeader() throws Exception { buffer.limit(8); Assertions.assertThatThrownBy(() -> decoder.decode(buffer)) - .isInstanceOf(BadHeaderException.class) - .hasMessage("Not enough header bytes"); + .isInstanceOf(BadHeaderException.class) + .hasMessage("Not enough header bytes"); } @Test @@ -242,8 +234,8 @@ public void testByteBufferBadMarkerByte() throws Exception { buffer.array()[0] = 0x00; Assertions.assertThatThrownBy(() -> decoder.decode(buffer)) - .isInstanceOf(BadHeaderException.class) - .hasMessageContaining("Unrecognized header bytes"); + .isInstanceOf(BadHeaderException.class) + .hasMessageContaining("Unrecognized header bytes"); } @Test @@ -255,8 +247,8 @@ public void testByteBufferBadVersionByte() throws Exception { buffer.array()[1] = 0x00; Assertions.assertThatThrownBy(() -> decoder.decode(buffer)) - .isInstanceOf(BadHeaderException.class) - 
.hasMessageContaining("Unrecognized header bytes"); + .isInstanceOf(BadHeaderException.class) + .hasMessageContaining("Unrecognized header bytes"); } @Test @@ -268,7 +260,7 @@ public void testByteBufferUnknownSchema() throws Exception { buffer.array()[4] = 0x00; Assertions.assertThatThrownBy(() -> decoder.decode(buffer)) - .isInstanceOf(MissingSchemaException.class) - .hasMessageContaining("Cannot resolve schema for fingerprint"); + .isInstanceOf(MissingSchemaException.class) + .hasMessageContaining("Cannot resolve schema for fingerprint"); } } diff --git a/data/src/test/java/org/apache/iceberg/data/orc/TestGenericData.java b/data/src/test/java/org/apache/iceberg/data/orc/TestGenericData.java index f86db77dd36d..180f7a6ad0fc 100644 --- a/data/src/test/java/org/apache/iceberg/data/orc/TestGenericData.java +++ b/data/src/test/java/org/apache/iceberg/data/orc/TestGenericData.java @@ -16,9 +16,11 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.data.orc; +import static org.apache.iceberg.types.Types.NestedField.optional; +import static org.apache.iceberg.types.Types.NestedField.required; + import java.io.File; import java.io.IOException; import java.nio.charset.StandardCharsets; @@ -50,9 +52,6 @@ import org.junit.Assert; import org.junit.Test; -import static org.apache.iceberg.types.Types.NestedField.optional; -import static org.apache.iceberg.types.Types.NestedField.required; - public class TestGenericData extends DataTest { @Override @@ -64,11 +63,12 @@ protected void writeAndValidate(Schema schema) throws IOException { @Test public void writeAndValidateRepeatingRecords() throws IOException { - Schema structSchema = new Schema( - required(100, "id", Types.LongType.get()), - required(101, "data", Types.StringType.get()) - ); - List expectedRepeating = Collections.nCopies(100, RandomGenericData.generate(structSchema, 1, 0L).get(0)); + Schema structSchema = + new Schema( + required(100, "id", Types.LongType.get()), + required(101, "data", Types.StringType.get())); + List expectedRepeating = + Collections.nCopies(100, RandomGenericData.generate(structSchema, 1, 0L).get(0)); writeAndValidateRecords(structSchema, expectedRepeating); } @@ -77,10 +77,10 @@ public void writeAndValidateRepeatingRecords() throws IOException { public void writeAndValidateTimestamps() throws IOException { TimeZone currentTz = TimeZone.getDefault(); try { - Schema timestampSchema = new Schema( - required(1, "tsTzCol", Types.TimestampType.withZone()), - required(2, "tsCol", Types.TimestampType.withoutZone()) - ); + Schema timestampSchema = + new Schema( + required(1, "tsTzCol", Types.TimestampType.withZone()), + required(2, "tsCol", Types.TimestampType.withoutZone())); // Write using America/New_York timezone TimeZone.setDefault(TimeZone.getTimeZone("America/New_York")); @@ -100,10 +100,11 @@ public void writeAndValidateTimestamps() throws IOException { File testFile = temp.newFile(); Assert.assertTrue("Delete should succeed", testFile.delete()); - try (FileAppender writer = ORC.write(Files.localOutput(testFile)) - .schema(timestampSchema) - .createWriterFunc(GenericOrcWriter::buildWriter) - .build()) { + try (FileAppender writer = + ORC.write(Files.localOutput(testFile)) + .schema(timestampSchema) + .createWriterFunc(GenericOrcWriter::buildWriter) + .build()) { writer.add(record1); writer.add(record2); writer.add(record3); @@ -113,21 +114,31 @@ public void writeAndValidateTimestamps() throws IOException { // Read using Asia/Kolkata timezone 
TimeZone.setDefault(TimeZone.getTimeZone("Asia/Kolkata")); List rows; - try (CloseableIterable reader = ORC.read(Files.localInput(testFile)) - .project(timestampSchema) - .createReaderFunc(fileSchema -> GenericOrcReader.buildReader(timestampSchema, fileSchema)) - .build()) { + try (CloseableIterable reader = + ORC.read(Files.localInput(testFile)) + .project(timestampSchema) + .createReaderFunc( + fileSchema -> GenericOrcReader.buildReader(timestampSchema, fileSchema)) + .build()) { rows = Lists.newArrayList(reader); } - Assert.assertEquals(OffsetDateTime.parse("2017-01-17T01:10:34Z"), rows.get(0).getField("tsTzCol")); - Assert.assertEquals(LocalDateTime.parse("1970-01-01T00:01:00"), rows.get(0).getField("tsCol")); - Assert.assertEquals(OffsetDateTime.parse("2017-05-17T01:10:34Z"), rows.get(1).getField("tsTzCol")); - Assert.assertEquals(LocalDateTime.parse("1970-05-01T00:01:00"), rows.get(1).getField("tsCol")); - Assert.assertEquals(OffsetDateTime.parse("1935-01-17T01:10:34Z"), rows.get(2).getField("tsTzCol")); - Assert.assertEquals(LocalDateTime.parse("1935-01-01T00:01:00"), rows.get(2).getField("tsCol")); - Assert.assertEquals(OffsetDateTime.parse("1935-05-17T01:10:34Z"), rows.get(3).getField("tsTzCol")); - Assert.assertEquals(LocalDateTime.parse("1935-05-01T00:01:00"), rows.get(3).getField("tsCol")); + Assert.assertEquals( + OffsetDateTime.parse("2017-01-17T01:10:34Z"), rows.get(0).getField("tsTzCol")); + Assert.assertEquals( + LocalDateTime.parse("1970-01-01T00:01:00"), rows.get(0).getField("tsCol")); + Assert.assertEquals( + OffsetDateTime.parse("2017-05-17T01:10:34Z"), rows.get(1).getField("tsTzCol")); + Assert.assertEquals( + LocalDateTime.parse("1970-05-01T00:01:00"), rows.get(1).getField("tsCol")); + Assert.assertEquals( + OffsetDateTime.parse("1935-01-17T01:10:34Z"), rows.get(2).getField("tsTzCol")); + Assert.assertEquals( + LocalDateTime.parse("1935-01-01T00:01:00"), rows.get(2).getField("tsCol")); + Assert.assertEquals( + OffsetDateTime.parse("1935-05-17T01:10:34Z"), rows.get(3).getField("tsTzCol")); + Assert.assertEquals( + LocalDateTime.parse("1935-05-01T00:01:00"), rows.get(3).getField("tsCol")); } finally { TimeZone.setDefault(currentTz); } @@ -139,10 +150,11 @@ public void writeAndValidateExternalData() throws IOException { Assert.assertTrue("Delete should succeed", testFile.delete()); Configuration conf = new Configuration(); - TypeDescription writerSchema = TypeDescription.fromString("struct"); - Writer writer = OrcFile.createWriter(new Path(testFile.toString()), - OrcFile.writerOptions(conf) - .setSchema(writerSchema)); + TypeDescription writerSchema = + TypeDescription.fromString("struct"); + Writer writer = + OrcFile.createWriter( + new Path(testFile.toString()), OrcFile.writerOptions(conf).setSchema(writerSchema)); VectorizedRowBatch batch = writerSchema.createRowBatch(); batch.ensureSize(1); batch.size = 1; @@ -154,16 +166,17 @@ public void writeAndValidateExternalData() throws IOException { writer.close(); List rows; - Schema readSchema = new Schema( - optional(1, "a", Types.IntegerType.get()), - optional(2, "b", Types.IntegerType.get()), - optional(3, "c", Types.StringType.get()), - optional(4, "d", Types.StringType.get()) - ); - try (CloseableIterable reader = ORC.read(Files.localInput(testFile)) - .project(readSchema) - .createReaderFunc(fileSchema -> GenericOrcReader.buildReader(readSchema, fileSchema)) - .build()) { + Schema readSchema = + new Schema( + optional(1, "a", Types.IntegerType.get()), + optional(2, "b", Types.IntegerType.get()), + optional(3, "c", 
Types.StringType.get()), + optional(4, "d", Types.StringType.get())); + try (CloseableIterable reader = + ORC.read(Files.localInput(testFile)) + .project(readSchema) + .createReaderFunc(fileSchema -> GenericOrcReader.buildReader(readSchema, fileSchema)) + .build()) { rows = Lists.newArrayList(reader); } Assert.assertEquals(1, rows.get(0).getField("a")); @@ -176,20 +189,22 @@ private void writeAndValidateRecords(Schema schema, List expected) throw File testFile = temp.newFile(); Assert.assertTrue("Delete should succeed", testFile.delete()); - try (FileAppender writer = ORC.write(Files.localOutput(testFile)) - .schema(schema) - .createWriterFunc(GenericOrcWriter::buildWriter) - .build()) { + try (FileAppender writer = + ORC.write(Files.localOutput(testFile)) + .schema(schema) + .createWriterFunc(GenericOrcWriter::buildWriter) + .build()) { for (Record rec : expected) { writer.add(rec); } } List rows; - try (CloseableIterable reader = ORC.read(Files.localInput(testFile)) - .project(schema) - .createReaderFunc(fileSchema -> GenericOrcReader.buildReader(schema, fileSchema)) - .build()) { + try (CloseableIterable reader = + ORC.read(Files.localInput(testFile)) + .project(schema) + .createReaderFunc(fileSchema -> GenericOrcReader.buildReader(schema, fileSchema)) + .build()) { rows = Lists.newArrayList(reader); } diff --git a/data/src/test/java/org/apache/iceberg/data/orc/TestGenericReadProjection.java b/data/src/test/java/org/apache/iceberg/data/orc/TestGenericReadProjection.java index 1aab27dbedb4..0d8a58e4771b 100644 --- a/data/src/test/java/org/apache/iceberg/data/orc/TestGenericReadProjection.java +++ b/data/src/test/java/org/apache/iceberg/data/orc/TestGenericReadProjection.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.data.orc; import java.io.File; @@ -32,23 +31,24 @@ public class TestGenericReadProjection extends TestReadProjection { @Override - protected Record writeAndRead(String desc, - Schema writeSchema, Schema readSchema, - Record record) throws IOException { + protected Record writeAndRead(String desc, Schema writeSchema, Schema readSchema, Record record) + throws IOException { File file = temp.newFile(desc + ".orc"); file.delete(); - try (FileAppender appender = ORC.write(Files.localOutput(file)) - .schema(writeSchema) - .createWriterFunc(GenericOrcWriter::buildWriter) - .build()) { + try (FileAppender appender = + ORC.write(Files.localOutput(file)) + .schema(writeSchema) + .createWriterFunc(GenericOrcWriter::buildWriter) + .build()) { appender.add(record); } - Iterable records = ORC.read(Files.localInput(file)) - .project(readSchema) - .createReaderFunc(fileSchema -> GenericOrcReader.buildReader(readSchema, fileSchema)) - .build(); + Iterable records = + ORC.read(Files.localInput(file)) + .project(readSchema) + .createReaderFunc(fileSchema -> GenericOrcReader.buildReader(readSchema, fileSchema)) + .build(); return Iterables.getOnlyElement(records); } diff --git a/data/src/test/java/org/apache/iceberg/data/orc/TestOrcDataWriter.java b/data/src/test/java/org/apache/iceberg/data/orc/TestOrcDataWriter.java index e3fd87dc6278..f2e2f1d4f354 100644 --- a/data/src/test/java/org/apache/iceberg/data/orc/TestOrcDataWriter.java +++ b/data/src/test/java/org/apache/iceberg/data/orc/TestOrcDataWriter.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.data.orc; import java.io.IOException; @@ -45,14 +44,14 @@ import org.junit.rules.TemporaryFolder; public class TestOrcDataWriter { - private static final Schema SCHEMA = new Schema( - Types.NestedField.required(1, "id", Types.LongType.get()), - Types.NestedField.optional(2, "data", Types.StringType.get())); + private static final Schema SCHEMA = + new Schema( + Types.NestedField.required(1, "id", Types.LongType.get()), + Types.NestedField.optional(2, "data", Types.StringType.get())); private List records; - @Rule - public TemporaryFolder temp = new TemporaryFolder(); + @Rule public TemporaryFolder temp = new TemporaryFolder(); @Before public void createRecords() { @@ -72,18 +71,16 @@ public void createRecords() { public void testDataWriter() throws IOException { OutputFile file = Files.localOutput(temp.newFile()); - SortOrder sortOrder = SortOrder.builderFor(SCHEMA) - .withOrderId(10) - .asc("id") - .build(); + SortOrder sortOrder = SortOrder.builderFor(SCHEMA).withOrderId(10).asc("id").build(); - DataWriter dataWriter = ORC.writeData(file) - .schema(SCHEMA) - .createWriterFunc(GenericOrcWriter::buildWriter) - .overwrite() - .withSpec(PartitionSpec.unpartitioned()) - .withSortOrder(sortOrder) - .build(); + DataWriter dataWriter = + ORC.writeData(file) + .schema(SCHEMA) + .createWriterFunc(GenericOrcWriter::buildWriter) + .overwrite() + .withSpec(PartitionSpec.unpartitioned()) + .withSortOrder(sortOrder) + .build(); try { for (Record record : records) { @@ -99,14 +96,16 @@ public void testDataWriter() throws IOException { Assert.assertEquals("Should be data file", FileContent.DATA, dataFile.content()); Assert.assertEquals("Record count should match", records.size(), dataFile.recordCount()); Assert.assertEquals("Partition should be empty", 0, dataFile.partition().size()); - Assert.assertEquals("Sort order should match", sortOrder.orderId(), (int) dataFile.sortOrderId()); + Assert.assertEquals( + "Sort order should match", sortOrder.orderId(), (int) dataFile.sortOrderId()); Assert.assertNull("Key metadata should be null", dataFile.keyMetadata()); List writtenRecords; - try (CloseableIterable reader = ORC.read(file.toInputFile()) - .project(SCHEMA) - .createReaderFunc(fileSchema -> GenericOrcReader.buildReader(SCHEMA, fileSchema)) - .build()) { + try (CloseableIterable reader = + ORC.read(file.toInputFile()) + .project(SCHEMA) + .createReaderFunc(fileSchema -> GenericOrcReader.buildReader(SCHEMA, fileSchema)) + .build()) { writtenRecords = Lists.newArrayList(reader); } diff --git a/data/src/test/java/org/apache/iceberg/data/orc/TestOrcRowIterator.java b/data/src/test/java/org/apache/iceberg/data/orc/TestOrcRowIterator.java index ab646b22e69c..8dcf73776007 100644 --- a/data/src/test/java/org/apache/iceberg/data/orc/TestOrcRowIterator.java +++ b/data/src/test/java/org/apache/iceberg/data/orc/TestOrcRowIterator.java @@ -16,9 +16,10 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.data.orc; +import static org.apache.iceberg.types.Types.NestedField.required; + import java.io.File; import java.io.IOException; import java.util.List; @@ -42,13 +43,9 @@ import org.junit.Test; import org.junit.rules.TemporaryFolder; -import static org.apache.iceberg.types.Types.NestedField.required; - public class TestOrcRowIterator { - private static final Schema DATA_SCHEMA = new Schema( - required(100, "id", Types.LongType.get()) - ); + private static final Schema DATA_SCHEMA = new Schema(required(100, "id", Types.LongType.get())); private static final int NUM_ROWS = 8000; private static final List DATA_ROWS; @@ -62,8 +59,7 @@ public class TestOrcRowIterator { } } - @Rule - public TemporaryFolder temp = new TemporaryFolder(); + @Rule public TemporaryFolder temp = new TemporaryFolder(); private File testFile; @@ -72,15 +68,17 @@ public void writeFile() throws IOException { testFile = temp.newFile(); Assert.assertTrue("Delete should succeed", testFile.delete()); - try (FileAppender writer = ORC.write(Files.localOutput(testFile)) - .createWriterFunc(GenericOrcWriter::buildWriter) - .schema(DATA_SCHEMA) - // write in such a way that the file contains 2 stripes each with 4 row groups of 1000 rows - .set("iceberg.orc.vectorbatch.size", "1000") - .set(OrcConf.ROW_INDEX_STRIDE.getAttribute(), "1000") - .set(OrcConf.ROWS_BETWEEN_CHECKS.getAttribute(), "4000") - .set(OrcConf.STRIPE_SIZE.getAttribute(), "1") - .build()) { + try (FileAppender writer = + ORC.write(Files.localOutput(testFile)) + .createWriterFunc(GenericOrcWriter::buildWriter) + .schema(DATA_SCHEMA) + // write in such a way that the file contains 2 stripes each with 4 row groups of 1000 + // rows + .set("iceberg.orc.vectorbatch.size", "1000") + .set(OrcConf.ROW_INDEX_STRIDE.getAttribute(), "1000") + .set(OrcConf.ROWS_BETWEEN_CHECKS.getAttribute(), "4000") + .set(OrcConf.STRIPE_SIZE.getAttribute(), "1") + .build()) { writer.addAll(DATA_ROWS); } } @@ -98,17 +96,20 @@ public void testReadFilteredRowGroupInMiddle() throws IOException { // We skip the 2nd row group [1000, 2000] in Stripe 1 // With default batch size of 1024, will read the following batches // Stripe 1: 1000, 1024, 976 - readAndValidate(Expressions.in("id", 500, 2500, 3500), - Lists.newArrayList(Iterables.concat(DATA_ROWS.subList(0, 1000), DATA_ROWS.subList(2000, 4000)))); + readAndValidate( + Expressions.in("id", 500, 2500, 3500), + Lists.newArrayList( + Iterables.concat(DATA_ROWS.subList(0, 1000), DATA_ROWS.subList(2000, 4000)))); } private void readAndValidate(Expression filter, List expected) throws IOException { List rows; - try (CloseableIterable reader = ORC.read(Files.localInput(testFile)) - .project(DATA_SCHEMA) - .filter(filter) - .createReaderFunc(fileSchema -> GenericOrcReader.buildReader(DATA_SCHEMA, fileSchema)) - .build()) { + try (CloseableIterable reader = + ORC.read(Files.localInput(testFile)) + .project(DATA_SCHEMA) + .filter(filter) + .createReaderFunc(fileSchema -> GenericOrcReader.buildReader(DATA_SCHEMA, fileSchema)) + .build()) { rows = Lists.newArrayList(reader); } diff --git a/data/src/test/java/org/apache/iceberg/data/parquet/TestGenericData.java b/data/src/test/java/org/apache/iceberg/data/parquet/TestGenericData.java index 73839ef2131a..71dd16d584a2 100644 --- a/data/src/test/java/org/apache/iceberg/data/parquet/TestGenericData.java +++ b/data/src/test/java/org/apache/iceberg/data/parquet/TestGenericData.java @@ -16,9 +16,10 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.data.parquet; +import static org.apache.iceberg.types.Types.NestedField.optional; + import java.io.File; import java.io.IOException; import java.nio.ByteBuffer; @@ -46,8 +47,6 @@ import org.junit.Assert; import org.junit.Test; -import static org.apache.iceberg.types.Types.NestedField.optional; - public class TestGenericData extends DataTest { @Override protected void writeAndValidate(Schema schema) throws IOException { @@ -56,18 +55,20 @@ protected void writeAndValidate(Schema schema) throws IOException { File testFile = temp.newFile(); Assert.assertTrue("Delete should succeed", testFile.delete()); - try (FileAppender appender = Parquet.write(Files.localOutput(testFile)) - .schema(schema) - .createWriterFunc(GenericParquetWriter::buildWriter) - .build()) { + try (FileAppender appender = + Parquet.write(Files.localOutput(testFile)) + .schema(schema) + .createWriterFunc(GenericParquetWriter::buildWriter) + .build()) { appender.addAll(expected); } List rows; - try (CloseableIterable reader = Parquet.read(Files.localInput(testFile)) - .project(schema) - .createReaderFunc(fileSchema -> GenericParquetReaders.buildReader(schema, fileSchema)) - .build()) { + try (CloseableIterable reader = + Parquet.read(Files.localInput(testFile)) + .project(schema) + .createReaderFunc(fileSchema -> GenericParquetReaders.buildReader(schema, fileSchema)) + .build()) { rows = Lists.newArrayList(reader); } @@ -76,7 +77,8 @@ protected void writeAndValidate(Schema schema) throws IOException { } // test reuseContainers - try (CloseableIterable reader = Parquet.read(Files.localInput(testFile)) + try (CloseableIterable reader = + Parquet.read(Files.localInput(testFile)) .project(schema) .reuseContainers() .createReaderFunc(fileSchema -> GenericParquetReaders.buildReader(schema, fileSchema)) @@ -93,22 +95,22 @@ protected void writeAndValidate(Schema schema) throws IOException { @Test public void testTwoLevelList() throws IOException { - Schema schema = new Schema( - optional(1, "arraybytes", Types.ListType.ofRequired(3, Types.BinaryType.get())), - optional(2, "topbytes", Types.BinaryType.get()) - ); + Schema schema = + new Schema( + optional(1, "arraybytes", Types.ListType.ofRequired(3, Types.BinaryType.get())), + optional(2, "topbytes", Types.BinaryType.get())); org.apache.avro.Schema avroSchema = AvroSchemaUtil.convert(schema.asStruct()); File testFile = temp.newFile(); Assert.assertTrue(testFile.delete()); - ParquetWriter - writer = AvroParquetWriter.builder(new Path(testFile.toURI())) - .withDataModel(GenericData.get()) - .withSchema(avroSchema) - .config("parquet.avro.add-list-element-records", "true") - .config("parquet.avro.write-old-list-structure", "true") - .build(); + ParquetWriter writer = + AvroParquetWriter.builder(new Path(testFile.toURI())) + .withDataModel(GenericData.get()) + .withSchema(avroSchema) + .config("parquet.avro.add-list-element-records", "true") + .config("parquet.avro.write-old-list-structure", "true") + .build(); GenericRecordBuilder recordBuilder = new GenericRecordBuilder(avroSchema); List expectedByteList = new ArrayList(); @@ -123,11 +125,12 @@ public void testTwoLevelList() throws IOException { writer.close(); // test reuseContainers - try (CloseableIterable reader = Parquet.read(Files.localInput(testFile)) - .project(schema) - .reuseContainers() - .createReaderFunc(fileSchema -> GenericParquetReaders.buildReader(schema, fileSchema)) - .build()) { + try (CloseableIterable reader = + Parquet.read(Files.localInput(testFile)) + .project(schema) + 
.reuseContainers() + .createReaderFunc(fileSchema -> GenericParquetReaders.buildReader(schema, fileSchema)) + .build()) { CloseableIterator it = reader.iterator(); Assert.assertTrue("Should have at least one row", it.hasNext()); while (it.hasNext()) { diff --git a/data/src/test/java/org/apache/iceberg/data/parquet/TestGenericReadProjection.java b/data/src/test/java/org/apache/iceberg/data/parquet/TestGenericReadProjection.java index 9a0810dc16e3..87f770bfccfd 100644 --- a/data/src/test/java/org/apache/iceberg/data/parquet/TestGenericReadProjection.java +++ b/data/src/test/java/org/apache/iceberg/data/parquet/TestGenericReadProjection.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.data.parquet; import java.io.File; @@ -36,17 +35,20 @@ protected Record writeAndRead(String desc, Schema writeSchema, Schema readSchema File file = temp.newFile(desc + ".parquet"); file.delete(); - try (FileAppender appender = Parquet.write(Files.localOutput(file)) - .schema(writeSchema) - .createWriterFunc(GenericParquetWriter::buildWriter) - .build()) { + try (FileAppender appender = + Parquet.write(Files.localOutput(file)) + .schema(writeSchema) + .createWriterFunc(GenericParquetWriter::buildWriter) + .build()) { appender.add(record); } - Iterable records = Parquet.read(Files.localInput(file)) - .project(readSchema) - .createReaderFunc(fileSchema -> GenericParquetReaders.buildReader(readSchema, fileSchema)) - .build(); + Iterable records = + Parquet.read(Files.localInput(file)) + .project(readSchema) + .createReaderFunc( + fileSchema -> GenericParquetReaders.buildReader(readSchema, fileSchema)) + .build(); return Iterables.getOnlyElement(records); } diff --git a/data/src/test/java/org/apache/iceberg/io/TestAppenderFactory.java b/data/src/test/java/org/apache/iceberg/io/TestAppenderFactory.java index bea63e0d93fc..4e8de087d6cd 100644 --- a/data/src/test/java/org/apache/iceberg/io/TestAppenderFactory.java +++ b/data/src/test/java/org/apache/iceberg/io/TestAppenderFactory.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.io; import java.io.File; @@ -67,16 +66,15 @@ public abstract class TestAppenderFactory extends TableTestBase { @Parameterized.Parameters(name = "FileFormat={0}, Partitioned={1}") public static Object[] parameters() { return new Object[][] { - new Object[] {"avro", false}, - new Object[] {"avro", true}, - new Object[] {"orc", false}, - new Object[] {"orc", true}, - new Object[] {"parquet", false}, - new Object[] {"parquet", true} + new Object[] {"avro", false}, + new Object[] {"avro", true}, + new Object[] {"orc", false}, + new Object[] {"orc", true}, + new Object[] {"parquet", false}, + new Object[] {"parquet", true} }; } - public TestAppenderFactory(String fileFormat, boolean partitioned) { super(FORMAT_V2); this.format = FileFormat.valueOf(fileFormat.toUpperCase(Locale.ENGLISH)); @@ -99,14 +97,11 @@ public void setupTable() throws Exception { this.partition = createPartitionKey(); this.fileFactory = OutputFileFactory.builderFor(table, 1, 1).format(format).build(); - table.updateProperties() - .defaultFormat(format) - .commit(); + table.updateProperties().defaultFormat(format).commit(); } - protected abstract FileAppenderFactory createAppenderFactory(List equalityFieldIds, - Schema eqDeleteSchema, - Schema posDeleteRowSchema); + protected abstract FileAppenderFactory createAppenderFactory( + List equalityFieldIds, Schema eqDeleteSchema, Schema posDeleteRowSchema); protected abstract T createRow(Integer id, String data); @@ -147,12 +142,13 @@ private List testRowSet() { createRow(2, "bbb"), createRow(3, "ccc"), createRow(4, "ddd"), - createRow(5, "eee") - ); + createRow(5, "eee")); } - private DataFile prepareDataFile(List rowSet, FileAppenderFactory appenderFactory) throws IOException { - DataWriter writer = appenderFactory.newDataWriter(createEncryptedOutputFile(), format, partition); + private DataFile prepareDataFile(List rowSet, FileAppenderFactory appenderFactory) + throws IOException { + DataWriter writer = + appenderFactory.newDataWriter(createEncryptedOutputFile(), format, partition); try (DataWriter closeableWriter = writer) { for (T row : rowSet) { closeableWriter.write(row); @@ -169,58 +165,49 @@ public void testDataWriter() throws IOException { List rowSet = testRowSet(); DataFile dataFile = prepareDataFile(rowSet, appenderFactory); - table.newRowDelta() - .addRows(dataFile) - .commit(); + table.newRowDelta().addRows(dataFile).commit(); - Assert.assertEquals("Should have the expected records.", expectedRowSet(rowSet), actualRowSet("*")); + Assert.assertEquals( + "Should have the expected records.", expectedRowSet(rowSet), actualRowSet("*")); } @Test public void testEqDeleteWriter() throws IOException { List equalityFieldIds = Lists.newArrayList(table.schema().findField("id").fieldId()); Schema eqDeleteRowSchema = table.schema().select("id"); - FileAppenderFactory appenderFactory = createAppenderFactory(equalityFieldIds, eqDeleteRowSchema, null); + FileAppenderFactory appenderFactory = + createAppenderFactory(equalityFieldIds, eqDeleteRowSchema, null); List rowSet = testRowSet(); DataFile dataFile = prepareDataFile(rowSet, appenderFactory); - table.newRowDelta() - .addRows(dataFile) - .commit(); + table.newRowDelta().addRows(dataFile).commit(); - // The equality field is 'id'. No matter what the value of 'data' field is, we should delete the 1th, 3th, 5th + // The equality field is 'id'. No matter what the value of 'data' field is, we should delete + // the 1th, 3th, 5th // rows. 
- List deletes = Lists.newArrayList( - createRow(1, "aaa"), - createRow(3, "bbb"), - createRow(5, "ccc") - ); + List deletes = + Lists.newArrayList(createRow(1, "aaa"), createRow(3, "bbb"), createRow(5, "ccc")); EncryptedOutputFile out = createEncryptedOutputFile(); - EqualityDeleteWriter eqDeleteWriter = appenderFactory.newEqDeleteWriter(out, format, partition); + EqualityDeleteWriter eqDeleteWriter = + appenderFactory.newEqDeleteWriter(out, format, partition); try (EqualityDeleteWriter closeableWriter = eqDeleteWriter) { closeableWriter.deleteAll(deletes); } // Check that the delete equality file has the expected equality deletes. GenericRecord gRecord = GenericRecord.create(eqDeleteRowSchema); - Set expectedDeletes = Sets.newHashSet( - gRecord.copy("id", 1), - gRecord.copy("id", 3), - gRecord.copy("id", 5) - ); - Assert.assertEquals(expectedDeletes, + Set expectedDeletes = + Sets.newHashSet(gRecord.copy("id", 1), gRecord.copy("id", 3), gRecord.copy("id", 5)); + Assert.assertEquals( + expectedDeletes, Sets.newHashSet(createReader(eqDeleteRowSchema, out.encryptingOutputFile().toInputFile()))); - table.newRowDelta() - .addDeletes(eqDeleteWriter.toDeleteFile()) - .commit(); + table.newRowDelta().addDeletes(eqDeleteWriter.toDeleteFile()).commit(); - List expected = Lists.newArrayList( - createRow(2, "bbb"), - createRow(4, "ddd") - ); - Assert.assertEquals("Should have the expected records", expectedRowSet(expected), actualRowSet("*")); + List expected = Lists.newArrayList(createRow(2, "bbb"), createRow(4, "ddd")); + Assert.assertEquals( + "Should have the expected records", expectedRowSet(expected), actualRowSet("*")); } @Test @@ -231,14 +218,15 @@ public void testPosDeleteWriter() throws IOException { List rowSet = testRowSet(); DataFile dataFile = prepareDataFile(rowSet, appenderFactory); - List> deletes = Lists.newArrayList( - Pair.of(dataFile.path(), 0L), - Pair.of(dataFile.path(), 2L), - Pair.of(dataFile.path(), 4L) - ); + List> deletes = + Lists.newArrayList( + Pair.of(dataFile.path(), 0L), + Pair.of(dataFile.path(), 2L), + Pair.of(dataFile.path(), 4L)); EncryptedOutputFile out = createEncryptedOutputFile(); - PositionDeleteWriter eqDeleteWriter = appenderFactory.newPosDeleteWriter(out, format, partition); + PositionDeleteWriter eqDeleteWriter = + appenderFactory.newPosDeleteWriter(out, format, partition); try (PositionDeleteWriter closeableWriter = eqDeleteWriter) { for (Pair delete : deletes) { closeableWriter.delete(delete.first(), delete.second()); @@ -248,26 +236,26 @@ public void testPosDeleteWriter() throws IOException { // Check that the pos delete file has the expected pos deletes. 
Schema pathPosSchema = DeleteSchemaUtil.pathPosSchema(); GenericRecord gRecord = GenericRecord.create(pathPosSchema); - Set expectedDeletes = Sets.newHashSet( - gRecord.copy("file_path", dataFile.path(), "pos", 0L), - gRecord.copy("file_path", dataFile.path(), "pos", 2L), - gRecord.copy("file_path", dataFile.path(), "pos", 4L) - ); - Assert.assertEquals(expectedDeletes, + Set expectedDeletes = + Sets.newHashSet( + gRecord.copy("file_path", dataFile.path(), "pos", 0L), + gRecord.copy("file_path", dataFile.path(), "pos", 2L), + gRecord.copy("file_path", dataFile.path(), "pos", 4L)); + Assert.assertEquals( + expectedDeletes, Sets.newHashSet(createReader(pathPosSchema, out.encryptingOutputFile().toInputFile()))); - table.newRowDelta() + table + .newRowDelta() .addRows(dataFile) .addDeletes(eqDeleteWriter.toDeleteFile()) .validateDataFilesExist(eqDeleteWriter.referencedDataFiles()) .validateDeletedFiles() .commit(); - List expected = Lists.newArrayList( - createRow(2, "bbb"), - createRow(4, "ddd") - ); - Assert.assertEquals("Should have the expected records", expectedRowSet(expected), actualRowSet("*")); + List expected = Lists.newArrayList(createRow(2, "bbb"), createRow(4, "ddd")); + Assert.assertEquals( + "Should have the expected records", expectedRowSet(expected), actualRowSet("*")); } @Test @@ -277,14 +265,15 @@ public void testPosDeleteWriterWithRowSchema() throws IOException { List rowSet = testRowSet(); DataFile dataFile = prepareDataFile(rowSet, appenderFactory); - List> deletes = Lists.newArrayList( - positionDelete(dataFile.path(), 0, rowSet.get(0)), - positionDelete(dataFile.path(), 2, rowSet.get(2)), - positionDelete(dataFile.path(), 4, rowSet.get(4)) - ); + List> deletes = + Lists.newArrayList( + positionDelete(dataFile.path(), 0, rowSet.get(0)), + positionDelete(dataFile.path(), 2, rowSet.get(2)), + positionDelete(dataFile.path(), 4, rowSet.get(4))); EncryptedOutputFile out = createEncryptedOutputFile(); - PositionDeleteWriter eqDeleteWriter = appenderFactory.newPosDeleteWriter(out, format, partition); + PositionDeleteWriter eqDeleteWriter = + appenderFactory.newPosDeleteWriter(out, format, partition); try (PositionDeleteWriter closeableWriter = eqDeleteWriter) { for (PositionDelete delete : deletes) { closeableWriter.delete(delete.path(), delete.pos(), delete.row()); @@ -295,26 +284,44 @@ public void testPosDeleteWriterWithRowSchema() throws IOException { Schema pathPosRowSchema = DeleteSchemaUtil.posDeleteSchema(table.schema()); GenericRecord gRecord = GenericRecord.create(pathPosRowSchema); GenericRecord rowRecord = GenericRecord.create(table.schema()); - Set expectedDeletes = Sets.newHashSet( - gRecord.copy("file_path", dataFile.path(), "pos", 0L, "row", rowRecord.copy("id", 1, "data", "aaa")), - gRecord.copy("file_path", dataFile.path(), "pos", 2L, "row", rowRecord.copy("id", 3, "data", "ccc")), - gRecord.copy("file_path", dataFile.path(), "pos", 4L, "row", rowRecord.copy("id", 5, "data", "eee")) - ); - Assert.assertEquals(expectedDeletes, + Set expectedDeletes = + Sets.newHashSet( + gRecord.copy( + "file_path", + dataFile.path(), + "pos", + 0L, + "row", + rowRecord.copy("id", 1, "data", "aaa")), + gRecord.copy( + "file_path", + dataFile.path(), + "pos", + 2L, + "row", + rowRecord.copy("id", 3, "data", "ccc")), + gRecord.copy( + "file_path", + dataFile.path(), + "pos", + 4L, + "row", + rowRecord.copy("id", 5, "data", "eee"))); + Assert.assertEquals( + expectedDeletes, Sets.newHashSet(createReader(pathPosRowSchema, out.encryptingOutputFile().toInputFile()))); - 
table.newRowDelta() + table + .newRowDelta() .addRows(dataFile) .addDeletes(eqDeleteWriter.toDeleteFile()) .validateDataFilesExist(eqDeleteWriter.referencedDataFiles()) .validateDeletedFiles() .commit(); - List expected = Lists.newArrayList( - createRow(2, "bbb"), - createRow(4, "ddd") - ); - Assert.assertEquals("Should have the expected records", expectedRowSet(expected), actualRowSet("*")); + List expected = Lists.newArrayList(createRow(2, "bbb"), createRow(4, "ddd")); + Assert.assertEquals( + "Should have the expected records", expectedRowSet(expected), actualRowSet("*")); } private CloseableIterable createReader(Schema schema, InputFile inputFile) { @@ -326,10 +333,7 @@ private CloseableIterable createReader(Schema schema, InputFile inputFil .build(); case AVRO: - return Avro.read(inputFile) - .project(schema) - .createReaderFunc(DataReader::create) - .build(); + return Avro.read(inputFile).project(schema).createReaderFunc(DataReader::create).build(); case ORC: return ORC.read(inputFile) diff --git a/data/src/test/java/org/apache/iceberg/io/TestBaseTaskWriter.java b/data/src/test/java/org/apache/iceberg/io/TestBaseTaskWriter.java index 9bb762727ca0..b5a863bf85a5 100644 --- a/data/src/test/java/org/apache/iceberg/io/TestBaseTaskWriter.java +++ b/data/src/test/java/org/apache/iceberg/io/TestBaseTaskWriter.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.io; import java.io.File; @@ -56,11 +55,7 @@ public class TestBaseTaskWriter extends TableTestBase { @Parameterized.Parameters(name = "FileFormat = {0}") public static Object[][] parameters() { - return new Object[][] { - {"avro"}, - {"orc"}, - {"parquet"} - }; + return new Object[][] {{"avro"}, {"orc"}, {"parquet"}}; } public TestBaseTaskWriter(String fileFormat) { @@ -81,12 +76,15 @@ public void setupTable() throws IOException { int firstFieldId = table.schema().findField("id").fieldId(); int secondFieldId = table.schema().findField("data").fieldId(); - this.appenderFactory = new GenericAppenderFactory(table.schema(), table.spec(), - new int[] {firstFieldId, secondFieldId}, table.schema(), null); - - table.updateProperties() - .defaultFormat(format) - .commit(); + this.appenderFactory = + new GenericAppenderFactory( + table.schema(), + table.spec(), + new int[] {firstFieldId, secondFieldId}, + table.schema(), + null); + + table.updateProperties().defaultFormat(format).commit(); } private Record createRecord(Integer id, String data) { @@ -127,9 +125,10 @@ public void testAbort() throws IOException { taskWriter.close(); // Assert the current data file count. - files = Files.list(Paths.get(tableDir.getPath(), "data")) - .filter(p -> !p.toString().endsWith(".crc")) - .collect(Collectors.toList()); + files = + Files.list(Paths.get(tableDir.getPath(), "data")) + .filter(p -> !p.toString().endsWith(".crc")) + .collect(Collectors.toList()); Assert.assertEquals("Should have 4 files but the files are: " + files, 4, files.size()); // Abort to clean all delete files and data files. 
@@ -189,7 +188,8 @@ public void testRollIfExceedTargetFileSize() throws IOException { Arrays.stream(result.deleteFiles()).forEach(rowDelta::addDeletes); rowDelta.commit(); - Assert.assertEquals("Should have expected records", expectedRowSet(expected), actualRowSet("*")); + Assert.assertEquals( + "Should have expected records", expectedRowSet(expected), actualRowSet("*")); } private StructLikeSet expectedRowSet(Iterable records) { @@ -207,7 +207,8 @@ private StructLikeSet actualRowSet(String... columns) throws IOException { } private TestTaskWriter createTaskWriter(long targetFileSize) { - return new TestTaskWriter(table.spec(), format, appenderFactory, fileFactory, table.io(), targetFileSize); + return new TestTaskWriter( + table.spec(), format, appenderFactory, fileFactory, table.io(), targetFileSize); } private static class TestTaskWriter extends BaseTaskWriter { @@ -215,10 +216,13 @@ private static class TestTaskWriter extends BaseTaskWriter { private RollingFileWriter dataWriter; private RollingEqDeleteWriter deleteWriter; - private TestTaskWriter(PartitionSpec spec, FileFormat format, - FileAppenderFactory appenderFactory, - OutputFileFactory fileFactory, FileIO io, - long targetFileSize) { + private TestTaskWriter( + PartitionSpec spec, + FileFormat format, + FileAppenderFactory appenderFactory, + OutputFileFactory fileFactory, + FileIO io, + long targetFileSize) { super(spec, format, appenderFactory, fileFactory, io, targetFileSize); this.dataWriter = new RollingFileWriter(null); this.deleteWriter = new RollingEqDeleteWriter(null); diff --git a/data/src/test/java/org/apache/iceberg/io/TestFileWriterFactory.java b/data/src/test/java/org/apache/iceberg/io/TestFileWriterFactory.java index 94edd247d495..c15d981999ac 100644 --- a/data/src/test/java/org/apache/iceberg/io/TestFileWriterFactory.java +++ b/data/src/test/java/org/apache/iceberg/io/TestFileWriterFactory.java @@ -16,9 +16,12 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.io; +import static org.apache.iceberg.MetadataColumns.DELETE_FILE_PATH; +import static org.apache.iceberg.MetadataColumns.DELETE_FILE_POS; +import static org.apache.iceberg.MetadataColumns.DELETE_FILE_ROW_FIELD_NAME; + import java.io.File; import java.io.IOException; import java.util.List; @@ -53,21 +56,17 @@ import org.junit.runner.RunWith; import org.junit.runners.Parameterized; -import static org.apache.iceberg.MetadataColumns.DELETE_FILE_PATH; -import static org.apache.iceberg.MetadataColumns.DELETE_FILE_POS; -import static org.apache.iceberg.MetadataColumns.DELETE_FILE_ROW_FIELD_NAME; - @RunWith(Parameterized.class) public abstract class TestFileWriterFactory extends WriterTestBase { @Parameterized.Parameters(name = "FileFormat={0}, Partitioned={1}") public static Object[] parameters() { return new Object[][] { - new Object[]{FileFormat.AVRO, false}, - new Object[]{FileFormat.AVRO, true}, - new Object[]{FileFormat.PARQUET, false}, - new Object[]{FileFormat.PARQUET, true}, - new Object[]{FileFormat.ORC, false}, - new Object[]{FileFormat.ORC, true} + new Object[] {FileFormat.AVRO, false}, + new Object[] {FileFormat.AVRO, true}, + new Object[] {FileFormat.PARQUET, false}, + new Object[] {FileFormat.PARQUET, true}, + new Object[] {FileFormat.ORC, false}, + new Object[] {FileFormat.ORC, true} }; } @@ -85,13 +84,9 @@ public TestFileWriterFactory(FileFormat fileFormat, boolean partitioned) { super(TABLE_FORMAT_VERSION); this.fileFormat = fileFormat; this.partitioned = partitioned; - this.dataRows = ImmutableList.of( - toRow(1, "aaa"), - toRow(2, "aaa"), - toRow(3, "aaa"), - toRow(4, "aaa"), - toRow(5, "aaa") - ); + this.dataRows = + ImmutableList.of( + toRow(1, "aaa"), toRow(2, "aaa"), toRow(3, "aaa"), toRow(4, "aaa"), toRow(5, "aaa")); } protected abstract StructLikeSet toSet(Iterable records); @@ -125,9 +120,7 @@ public void testDataWriter() throws IOException { DataFile dataFile = writeData(writerFactory, dataRows, table.spec(), partition); - table.newRowDelta() - .addRows(dataFile) - .commit(); + table.newRowDelta().addRows(dataFile).commit(); Assert.assertEquals("Records should match", toSet(dataRows), actualRowSet("*")); } @@ -136,45 +129,33 @@ public void testDataWriter() throws IOException { public void testEqualityDeleteWriter() throws IOException { List equalityFieldIds = ImmutableList.of(table.schema().findField("id").fieldId()); Schema equalityDeleteRowSchema = table.schema().select("id"); - FileWriterFactory writerFactory = newWriterFactory(table.schema(), equalityFieldIds, equalityDeleteRowSchema); + FileWriterFactory writerFactory = + newWriterFactory(table.schema(), equalityFieldIds, equalityDeleteRowSchema); // write a data file DataFile dataFile = writeData(writerFactory, dataRows, table.spec(), partition); // commit the written data file - table.newRowDelta() - .addRows(dataFile) - .commit(); + table.newRowDelta().addRows(dataFile).commit(); // write an equality delete file - List deletes = ImmutableList.of( - toRow(1, "aaa"), - toRow(3, "bbb"), - toRow(5, "ccc") - ); + List deletes = ImmutableList.of(toRow(1, "aaa"), toRow(3, "bbb"), toRow(5, "ccc")); DeleteFile deleteFile = writeEqualityDeletes(writerFactory, deletes, table.spec(), partition); // verify the written delete file GenericRecord deleteRecord = GenericRecord.create(equalityDeleteRowSchema); - List expectedDeletes = ImmutableList.of( - deleteRecord.copy("id", 1), - deleteRecord.copy("id", 3), - deleteRecord.copy("id", 5) - ); + List expectedDeletes = + ImmutableList.of( + 
deleteRecord.copy("id", 1), deleteRecord.copy("id", 3), deleteRecord.copy("id", 5)); InputFile inputDeleteFile = table.io().newInputFile(deleteFile.path().toString()); List actualDeletes = readFile(equalityDeleteRowSchema, inputDeleteFile); Assert.assertEquals("Delete records must match", expectedDeletes, actualDeletes); // commit the written delete file - table.newRowDelta() - .addDeletes(deleteFile) - .commit(); + table.newRowDelta().addDeletes(deleteFile).commit(); // verify the delete file is applied correctly - List expectedRows = ImmutableList.of( - toRow(2, "aaa"), - toRow(4, "aaa") - ); + List expectedRows = ImmutableList.of(toRow(2, "aaa"), toRow(4, "aaa")); Assert.assertEquals("Records should match", toSet(expectedRows), actualRowSet("*")); } @@ -184,59 +165,49 @@ public void testEqualityDeleteWriterWithMultipleSpecs() throws IOException { List equalityFieldIds = ImmutableList.of(table.schema().findField("id").fieldId()); Schema equalityDeleteRowSchema = table.schema().select("id"); - FileWriterFactory writerFactory = newWriterFactory(table.schema(), equalityFieldIds, equalityDeleteRowSchema); + FileWriterFactory writerFactory = + newWriterFactory(table.schema(), equalityFieldIds, equalityDeleteRowSchema); // write an unpartitioned data file DataFile firstDataFile = writeData(writerFactory, dataRows, table.spec(), partition); - Assert.assertEquals("First data file must be unpartitioned", 0, firstDataFile.partition().size()); + Assert.assertEquals( + "First data file must be unpartitioned", 0, firstDataFile.partition().size()); - List deletes = ImmutableList.of( - toRow(1, "aaa"), - toRow(2, "aaa"), - toRow(3, "aaa"), - toRow(4, "aaa") - ); + List deletes = + ImmutableList.of(toRow(1, "aaa"), toRow(2, "aaa"), toRow(3, "aaa"), toRow(4, "aaa")); // write an unpartitioned delete file - DeleteFile firstDeleteFile = writeEqualityDeletes(writerFactory, deletes, table.spec(), partition); - Assert.assertEquals("First delete file must be unpartitioned", 0, firstDeleteFile.partition().size()); + DeleteFile firstDeleteFile = + writeEqualityDeletes(writerFactory, deletes, table.spec(), partition); + Assert.assertEquals( + "First delete file must be unpartitioned", 0, firstDeleteFile.partition().size()); // commit the first data and delete files - table.newAppend() - .appendFile(firstDataFile) - .commit(); - table.newRowDelta() - .addDeletes(firstDeleteFile) - .commit(); + table.newAppend().appendFile(firstDataFile).commit(); + table.newRowDelta().addDeletes(firstDeleteFile).commit(); // evolve the spec - table.updateSpec() - .addField("data") - .commit(); + table.updateSpec().addField("data").commit(); partition = partitionKey(table.spec(), PARTITION_VALUE); // write a partitioned data file DataFile secondDataFile = writeData(writerFactory, dataRows, table.spec(), partition); - Assert.assertEquals("Second data file must be partitioned", 1, secondDataFile.partition().size()); + Assert.assertEquals( + "Second data file must be partitioned", 1, secondDataFile.partition().size()); // write a partitioned delete file - DeleteFile secondDeleteFile = writeEqualityDeletes(writerFactory, deletes, table.spec(), partition); - Assert.assertEquals("Second delete file must be artitioned", 1, secondDeleteFile.partition().size()); + DeleteFile secondDeleteFile = + writeEqualityDeletes(writerFactory, deletes, table.spec(), partition); + Assert.assertEquals( + "Second delete file must be artitioned", 1, secondDeleteFile.partition().size()); // commit the second data and delete files - table.newAppend() - 
.appendFile(secondDataFile) - .commit(); - table.newRowDelta() - .addDeletes(secondDeleteFile) - .commit(); + table.newAppend().appendFile(secondDataFile).commit(); + table.newRowDelta().addDeletes(secondDeleteFile).commit(); // verify both delete files are applied correctly - List expectedRows = ImmutableList.of( - toRow(5, "aaa"), - toRow(5, "aaa") - ); + List expectedRows = ImmutableList.of(toRow(5, "aaa"), toRow(5, "aaa")); Assert.assertEquals("Records should match", toSet(expectedRows), actualRowSet("*")); } @@ -248,28 +219,31 @@ public void testPositionDeleteWriter() throws IOException { DataFile dataFile = writeData(writerFactory, dataRows, table.spec(), partition); // write a position delete file - List> deletes = ImmutableList.of( - positionDelete(dataFile.path(), 0L, null), - positionDelete(dataFile.path(), 2L, null), - positionDelete(dataFile.path(), 4L, null) - ); - Pair result = writePositionDeletes(writerFactory, deletes, table.spec(), partition); + List> deletes = + ImmutableList.of( + positionDelete(dataFile.path(), 0L, null), + positionDelete(dataFile.path(), 2L, null), + positionDelete(dataFile.path(), 4L, null)); + Pair result = + writePositionDeletes(writerFactory, deletes, table.spec(), partition); DeleteFile deleteFile = result.first(); CharSequenceSet referencedDataFiles = result.second(); // verify the written delete file GenericRecord deleteRecord = GenericRecord.create(DeleteSchemaUtil.pathPosSchema()); - List expectedDeletes = ImmutableList.of( - deleteRecord.copy(DELETE_FILE_PATH.name(), dataFile.path(), DELETE_FILE_POS.name(), 0L), - deleteRecord.copy(DELETE_FILE_PATH.name(), dataFile.path(), DELETE_FILE_POS.name(), 2L), - deleteRecord.copy(DELETE_FILE_PATH.name(), dataFile.path(), DELETE_FILE_POS.name(), 4L) - ); + List expectedDeletes = + ImmutableList.of( + deleteRecord.copy(DELETE_FILE_PATH.name(), dataFile.path(), DELETE_FILE_POS.name(), 0L), + deleteRecord.copy(DELETE_FILE_PATH.name(), dataFile.path(), DELETE_FILE_POS.name(), 2L), + deleteRecord.copy( + DELETE_FILE_PATH.name(), dataFile.path(), DELETE_FILE_POS.name(), 4L)); InputFile inputDeleteFile = table.io().newInputFile(deleteFile.path().toString()); List actualDeletes = readFile(DeleteSchemaUtil.pathPosSchema(), inputDeleteFile); Assert.assertEquals("Delete records must match", expectedDeletes, actualDeletes); // commit the data and delete files - table.newRowDelta() + table + .newRowDelta() .addRows(dataFile) .addDeletes(deleteFile) .validateDataFilesExist(referencedDataFiles) @@ -277,10 +251,7 @@ public void testPositionDeleteWriter() throws IOException { .commit(); // verify the delete file is applied correctly - List expectedRows = ImmutableList.of( - toRow(2, "aaa"), - toRow(4, "aaa") - ); + List expectedRows = ImmutableList.of(toRow(2, "aaa"), toRow(4, "aaa")); Assert.assertEquals("Records should match", toSet(expectedRows), actualRowSet("*")); } @@ -292,29 +263,33 @@ public void testPositionDeleteWriterWithRow() throws IOException { DataFile dataFile = writeData(writerFactory, dataRows, table.spec(), partition); // write a position delete file and persist the deleted row - List> deletes = ImmutableList.of( - positionDelete(dataFile.path(), 0, dataRows.get(0)) - ); - Pair result = writePositionDeletes(writerFactory, deletes, table.spec(), partition); + List> deletes = + ImmutableList.of(positionDelete(dataFile.path(), 0, dataRows.get(0))); + Pair result = + writePositionDeletes(writerFactory, deletes, table.spec(), partition); DeleteFile deleteFile = result.first(); CharSequenceSet 
referencedDataFiles = result.second(); - // verify the written delete file + // verify the written delete file GenericRecord deletedRow = GenericRecord.create(table.schema()); Schema positionDeleteSchema = DeleteSchemaUtil.posDeleteSchema(table.schema()); GenericRecord deleteRecord = GenericRecord.create(positionDeleteSchema); - Map deleteRecordColumns = ImmutableMap.of( - DELETE_FILE_PATH.name(), dataFile.path(), - DELETE_FILE_POS.name(), 0L, - DELETE_FILE_ROW_FIELD_NAME, deletedRow.copy("id", 1, "data", "aaa") - ); + Map deleteRecordColumns = + ImmutableMap.of( + DELETE_FILE_PATH.name(), + dataFile.path(), + DELETE_FILE_POS.name(), + 0L, + DELETE_FILE_ROW_FIELD_NAME, + deletedRow.copy("id", 1, "data", "aaa")); List expectedDeletes = ImmutableList.of(deleteRecord.copy(deleteRecordColumns)); InputFile inputDeleteFile = table.io().newInputFile(deleteFile.path().toString()); List actualDeletes = readFile(positionDeleteSchema, inputDeleteFile); Assert.assertEquals("Delete records must match", expectedDeletes, actualDeletes); // commit the data and delete files - table.newRowDelta() + table + .newRowDelta() .addRows(dataFile) .addDeletes(deleteFile) .validateDataFilesExist(referencedDataFiles) @@ -322,17 +297,14 @@ public void testPositionDeleteWriterWithRow() throws IOException { .commit(); // verify the delete file is applied correctly - List expectedRows = ImmutableList.of( - toRow(2, "aaa"), - toRow(3, "aaa"), - toRow(4, "aaa"), - toRow(5, "aaa") - ); + List expectedRows = + ImmutableList.of(toRow(2, "aaa"), toRow(3, "aaa"), toRow(4, "aaa"), toRow(5, "aaa")); Assert.assertEquals("Records should match", toSet(expectedRows), actualRowSet("*")); } - private DataFile writeData(FileWriterFactory writerFactory, List rows, - PartitionSpec spec, StructLike partitionKey) throws IOException { + private DataFile writeData( + FileWriterFactory writerFactory, List rows, PartitionSpec spec, StructLike partitionKey) + throws IOException { EncryptedOutputFile file = newOutputFile(spec, partitionKey); DataWriter writer = writerFactory.newDataWriter(file, spec, partitionKey); @@ -346,11 +318,16 @@ private DataFile writeData(FileWriterFactory writerFactory, List rows, return writer.toDataFile(); } - private DeleteFile writeEqualityDeletes(FileWriterFactory writerFactory, List deletes, - PartitionSpec spec, StructLike partitionKey) throws IOException { + private DeleteFile writeEqualityDeletes( + FileWriterFactory writerFactory, + List deletes, + PartitionSpec spec, + StructLike partitionKey) + throws IOException { EncryptedOutputFile file = newOutputFile(spec, partitionKey); - EqualityDeleteWriter writer = writerFactory.newEqualityDeleteWriter(file, spec, partitionKey); + EqualityDeleteWriter writer = + writerFactory.newEqualityDeleteWriter(file, spec, partitionKey); try (EqualityDeleteWriter closableWriter = writer) { closableWriter.deleteAll(deletes); @@ -359,13 +336,16 @@ private DeleteFile writeEqualityDeletes(FileWriterFactory writerFactory, List return writer.toDeleteFile(); } - private Pair writePositionDeletes(FileWriterFactory writerFactory, - List> deletes, - PartitionSpec spec, - StructLike partitionKey) throws IOException { + private Pair writePositionDeletes( + FileWriterFactory writerFactory, + List> deletes, + PartitionSpec spec, + StructLike partitionKey) + throws IOException { EncryptedOutputFile file = newOutputFile(spec, partitionKey); - PositionDeleteWriter writer = writerFactory.newPositionDeleteWriter(file, spec, partitionKey); + PositionDeleteWriter writer = + 
writerFactory.newPositionDeleteWriter(file, spec, partitionKey); try (PositionDeleteWriter closableWriter = writer) { for (PositionDelete delete : deletes) { @@ -379,28 +359,29 @@ private Pair writePositionDeletes(FileWriterFactory private List readFile(Schema schema, InputFile inputFile) throws IOException { switch (fileFormat) { case PARQUET: - try (CloseableIterable records = Parquet.read(inputFile) - .project(schema) - .createReaderFunc(fileSchema -> GenericParquetReaders.buildReader(schema, fileSchema)) - .build()) { + try (CloseableIterable records = + Parquet.read(inputFile) + .project(schema) + .createReaderFunc( + fileSchema -> GenericParquetReaders.buildReader(schema, fileSchema)) + .build()) { return ImmutableList.copyOf(records); } case AVRO: - try (CloseableIterable records = Avro.read(inputFile) - .project(schema) - .createReaderFunc(DataReader::create) - .build()) { + try (CloseableIterable records = + Avro.read(inputFile).project(schema).createReaderFunc(DataReader::create).build()) { return ImmutableList.copyOf(records); } case ORC: - try (CloseableIterable records = ORC.read(inputFile) - .project(schema) - .createReaderFunc(fileSchema -> GenericOrcReader.buildReader(schema, fileSchema)) - .build()) { + try (CloseableIterable records = + ORC.read(inputFile) + .project(schema) + .createReaderFunc(fileSchema -> GenericOrcReader.buildReader(schema, fileSchema)) + .build()) { return ImmutableList.copyOf(records); } diff --git a/data/src/test/java/org/apache/iceberg/io/TestGenericSortedPosDeleteWriter.java b/data/src/test/java/org/apache/iceberg/io/TestGenericSortedPosDeleteWriter.java index dbe105eff796..53b0ecbfa819 100644 --- a/data/src/test/java/org/apache/iceberg/io/TestGenericSortedPosDeleteWriter.java +++ b/data/src/test/java/org/apache/iceberg/io/TestGenericSortedPosDeleteWriter.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.io; import java.io.File; @@ -62,11 +61,7 @@ public class TestGenericSortedPosDeleteWriter extends TableTestBase { @Parameterized.Parameters(name = "FileFormat={0}") public static Object[] parameters() { - return new Object[][] { - new Object[] {"avro"}, - new Object[] {"orc"}, - new Object[] {"parquet"} - }; + return new Object[][] {new Object[] {"avro"}, new Object[] {"orc"}, new Object[] {"parquet"}}; } public TestGenericSortedPosDeleteWriter(String fileFormat) { @@ -86,9 +81,7 @@ public void setupTable() throws IOException { this.fileFactory = OutputFileFactory.builderFor(table, 1, 1).format(format).build(); - table.updateProperties() - .defaultFormat(format) - .commit(); + table.updateProperties().defaultFormat(format).commit(); } private EncryptedOutputFile createEncryptedOutputFile() { @@ -97,7 +90,8 @@ private EncryptedOutputFile createEncryptedOutputFile() { private DataFile prepareDataFile(FileAppenderFactory appenderFactory, List rowSet) throws IOException { - DataWriter writer = appenderFactory.newDataWriter(createEncryptedOutputFile(), format, null); + DataWriter writer = + appenderFactory.newDataWriter(createEncryptedOutputFile(), format, null); try (DataWriter closeableWriter = writer) { for (Record record : rowSet) { closeableWriter.write(record); @@ -130,19 +124,20 @@ private StructLikeSet actualRowSet(String... 
columns) throws IOException { @Test public void testSortedPosDelete() throws IOException { - List rowSet = Lists.newArrayList( - createRow(0, "aaa"), - createRow(1, "bbb"), - createRow(2, "ccc"), - createRow(3, "ddd"), - createRow(4, "eee") - ); - - FileAppenderFactory appenderFactory = new GenericAppenderFactory(table.schema(), table.spec(), - null, null, null); + List rowSet = + Lists.newArrayList( + createRow(0, "aaa"), + createRow(1, "bbb"), + createRow(2, "ccc"), + createRow(3, "ddd"), + createRow(4, "eee")); + + FileAppenderFactory appenderFactory = + new GenericAppenderFactory(table.schema(), table.spec(), null, null, null); DataFile dataFile = prepareDataFile(appenderFactory, rowSet); - SortedPosDeleteWriter writer = new SortedPosDeleteWriter<>(appenderFactory, fileFactory, format, null, 100); + SortedPosDeleteWriter writer = + new SortedPosDeleteWriter<>(appenderFactory, fileFactory, format, null, 100); try (SortedPosDeleteWriter closeableWriter = writer) { for (int index = rowSet.size() - 1; index >= 0; index -= 2) { closeableWriter.delete(dataFile.path(), index); @@ -156,68 +151,68 @@ public void testSortedPosDelete() throws IOException { // Check whether the path-pos pairs are sorted as expected. Schema pathPosSchema = DeleteSchemaUtil.pathPosSchema(); Record record = GenericRecord.create(pathPosSchema); - List expectedDeletes = Lists.newArrayList( - record.copy("file_path", dataFile.path(), "pos", 0L), - record.copy("file_path", dataFile.path(), "pos", 2L), - record.copy("file_path", dataFile.path(), "pos", 4L) - ); + List expectedDeletes = + Lists.newArrayList( + record.copy("file_path", dataFile.path(), "pos", 0L), + record.copy("file_path", dataFile.path(), "pos", 2L), + record.copy("file_path", dataFile.path(), "pos", 4L)); Assert.assertEquals(expectedDeletes, readRecordsAsList(pathPosSchema, deleteFile.path())); - table.newRowDelta() + table + .newRowDelta() .addRows(dataFile) .addDeletes(deleteFiles.get(0)) .validateDataFilesExist(writer.referencedDataFiles()) .validateDeletedFiles() .commit(); - List expectedData = Lists.newArrayList( - createRow(1, "bbb"), - createRow(3, "ddd") - ); - Assert.assertEquals("Should have the expected records", expectedRowSet(expectedData), actualRowSet("*")); + List expectedData = Lists.newArrayList(createRow(1, "bbb"), createRow(3, "ddd")); + Assert.assertEquals( + "Should have the expected records", expectedRowSet(expectedData), actualRowSet("*")); } @Test public void testSortedPosDeleteWithSchemaAndNullRow() throws IOException { - List rowSet = Lists.newArrayList( - createRow(0, "aaa"), - createRow(1, "bbb"), - createRow(2, "ccc") - ); + List rowSet = + Lists.newArrayList(createRow(0, "aaa"), createRow(1, "bbb"), createRow(2, "ccc")); // Create a FileAppenderFactory which requires pos-delete row schema. 
- FileAppenderFactory appenderFactory = new GenericAppenderFactory(table.schema(), table.spec(), - null, null, table.schema()); + FileAppenderFactory appenderFactory = + new GenericAppenderFactory(table.schema(), table.spec(), null, null, table.schema()); DataFile dataFile = prepareDataFile(appenderFactory, rowSet); - SortedPosDeleteWriter writer = new SortedPosDeleteWriter<>(appenderFactory, fileFactory, format, null, 1); + SortedPosDeleteWriter writer = + new SortedPosDeleteWriter<>(appenderFactory, fileFactory, format, null, 1); boolean caughtError = false; try { writer.delete(dataFile.path(), 0L); } catch (Exception e) { caughtError = true; } - Assert.assertTrue("Should fail because the appender are required non-null rows to write", caughtError); + Assert.assertTrue( + "Should fail because the appender requires non-null rows to write", caughtError); } @Test public void testSortedPosDeleteWithRow() throws IOException { - List rowSet = Lists.newArrayList( - createRow(0, "aaa"), - createRow(1, "bbb"), - createRow(2, "ccc"), - createRow(3, "ddd"), - createRow(4, "eee") - ); - - FileAppenderFactory appenderFactory = new GenericAppenderFactory(table.schema(), table.spec(), - null, null, table.schema()); + List rowSet = + Lists.newArrayList( + createRow(0, "aaa"), + createRow(1, "bbb"), + createRow(2, "ccc"), + createRow(3, "ddd"), + createRow(4, "eee")); + + FileAppenderFactory appenderFactory = + new GenericAppenderFactory(table.schema(), table.spec(), null, null, table.schema()); DataFile dataFile = prepareDataFile(appenderFactory, rowSet); - SortedPosDeleteWriter writer = new SortedPosDeleteWriter<>(appenderFactory, fileFactory, format, null, 100); + SortedPosDeleteWriter writer = + new SortedPosDeleteWriter<>(appenderFactory, fileFactory, format, null, 100); try (SortedPosDeleteWriter closeableWriter = writer) { for (int index = rowSet.size() - 1; index >= 0; index -= 2) { - closeableWriter.delete(dataFile.path(), index, rowSet.get(index)); // Write deletes with row. + closeableWriter.delete( + dataFile.path(), index, rowSet.get(index)); // Write deletes with row. } } @@ -228,31 +223,30 @@ public void testSortedPosDeleteWithRow() throws IOException { // Check whether the path-pos pairs are sorted as expected.
Schema pathPosSchema = DeleteSchemaUtil.posDeleteSchema(table.schema()); Record record = GenericRecord.create(pathPosSchema); - List expectedDeletes = Lists.newArrayList( - record.copy("file_path", dataFile.path(), "pos", 0L, "row", createRow(0, "aaa")), - record.copy("file_path", dataFile.path(), "pos", 2L, "row", createRow(2, "ccc")), - record.copy("file_path", dataFile.path(), "pos", 4L, "row", createRow(4, "eee")) - ); + List expectedDeletes = + Lists.newArrayList( + record.copy("file_path", dataFile.path(), "pos", 0L, "row", createRow(0, "aaa")), + record.copy("file_path", dataFile.path(), "pos", 2L, "row", createRow(2, "ccc")), + record.copy("file_path", dataFile.path(), "pos", 4L, "row", createRow(4, "eee"))); Assert.assertEquals(expectedDeletes, readRecordsAsList(pathPosSchema, deleteFile.path())); - table.newRowDelta() + table + .newRowDelta() .addRows(dataFile) .addDeletes(deleteFiles.get(0)) .validateDataFilesExist(writer.referencedDataFiles()) .validateDeletedFiles() .commit(); - List expectedData = Lists.newArrayList( - createRow(1, "bbb"), - createRow(3, "ddd") - ); - Assert.assertEquals("Should have the expected records", expectedRowSet(expectedData), actualRowSet("*")); + List expectedData = Lists.newArrayList(createRow(1, "bbb"), createRow(3, "ddd")); + Assert.assertEquals( + "Should have the expected records", expectedRowSet(expectedData), actualRowSet("*")); } @Test public void testMultipleFlush() throws IOException { - FileAppenderFactory appenderFactory = new GenericAppenderFactory(table.schema(), table.spec(), - null, null, null); + FileAppenderFactory appenderFactory = + new GenericAppenderFactory(table.schema(), table.spec(), null, null, null); // It will produce 5 record lists, each list will write into a separate data file: // The 1th file has: <0 , val-0> , <1 , val-1> , ... 
, <99 , val-99> @@ -277,7 +271,8 @@ public void testMultipleFlush() throws IOException { dataFiles.forEach(rowDelta::addRows); rowDelta.commit(); - SortedPosDeleteWriter writer = new SortedPosDeleteWriter<>(appenderFactory, fileFactory, format, null, 50); + SortedPosDeleteWriter writer = + new SortedPosDeleteWriter<>(appenderFactory, fileFactory, format, null, 50); try (SortedPosDeleteWriter closeableWriter = writer) { for (int pos = 0; pos < 100; pos++) { for (int fileIndex = 4; fileIndex >= 0; fileIndex--) { @@ -308,7 +303,8 @@ public void testMultipleFlush() throws IOException { deleteFiles.forEach(rowDelta::addDeletes); rowDelta.commit(); - Assert.assertEquals("Should have no record.", expectedRowSet(ImmutableList.of()), actualRowSet("*")); + Assert.assertEquals( + "Should have no record.", expectedRowSet(ImmutableList.of()), actualRowSet("*")); } private List readRecordsAsList(Schema schema, CharSequence path) throws IOException { @@ -317,24 +313,25 @@ private List readRecordsAsList(Schema schema, CharSequence path) throws InputFile inputFile = Files.localInput(path.toString()); switch (format) { case PARQUET: - iterable = Parquet.read(inputFile) - .project(schema) - .createReaderFunc(fileSchema -> GenericParquetReaders.buildReader(schema, fileSchema)) - .build(); + iterable = + Parquet.read(inputFile) + .project(schema) + .createReaderFunc( + fileSchema -> GenericParquetReaders.buildReader(schema, fileSchema)) + .build(); break; case AVRO: - iterable = Avro.read(inputFile) - .project(schema) - .createReaderFunc(DataReader::create) - .build(); + iterable = + Avro.read(inputFile).project(schema).createReaderFunc(DataReader::create).build(); break; case ORC: - iterable = ORC.read(inputFile) - .project(schema) - .createReaderFunc(fileSchema -> GenericOrcReader.buildReader(schema, fileSchema)) - .build(); + iterable = + ORC.read(inputFile) + .project(schema) + .createReaderFunc(fileSchema -> GenericOrcReader.buildReader(schema, fileSchema)) + .build(); break; default: diff --git a/data/src/test/java/org/apache/iceberg/io/TestPartitioningWriters.java b/data/src/test/java/org/apache/iceberg/io/TestPartitioningWriters.java index 7253730dbcf4..166592fd2250 100644 --- a/data/src/test/java/org/apache/iceberg/io/TestPartitioningWriters.java +++ b/data/src/test/java/org/apache/iceberg/io/TestPartitioningWriters.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.io; import java.io.File; @@ -44,9 +43,9 @@ public abstract class TestPartitioningWriters extends WriterTestBase { @Parameterized.Parameters(name = "FileFormat={0}") public static Object[] parameters() { return new Object[][] { - new Object[]{FileFormat.AVRO}, - new Object[]{FileFormat.PARQUET}, - new Object[]{FileFormat.ORC}, + new Object[] {FileFormat.AVRO}, + new Object[] {FileFormat.PARQUET}, + new Object[] {FileFormat.ORC}, }; } @@ -81,8 +80,8 @@ public void setupTable() throws Exception { @Test public void testClusteredDataWriterNoRecords() throws IOException { FileWriterFactory writerFactory = newWriterFactory(table.schema()); - ClusteredDataWriter writer = new ClusteredDataWriter<>( - writerFactory, fileFactory, table.io(), TARGET_FILE_SIZE); + ClusteredDataWriter writer = + new ClusteredDataWriter<>(writerFactory, fileFactory, table.io(), TARGET_FILE_SIZE); writer.close(); Assert.assertEquals("Must be no data files", 0, writer.result().dataFiles().size()); @@ -93,13 +92,11 @@ public void testClusteredDataWriterNoRecords() throws IOException { @Test public void testClusteredDataWriterMultiplePartitions() throws IOException { - table.updateSpec() - .addField(Expressions.ref("data")) - .commit(); + table.updateSpec().addField(Expressions.ref("data")).commit(); FileWriterFactory writerFactory = newWriterFactory(table.schema()); - ClusteredDataWriter writer = new ClusteredDataWriter<>( - writerFactory, fileFactory, table.io(), TARGET_FILE_SIZE); + ClusteredDataWriter writer = + new ClusteredDataWriter<>(writerFactory, fileFactory, table.io(), TARGET_FILE_SIZE); PartitionSpec spec = table.spec(); @@ -118,25 +115,19 @@ public void testClusteredDataWriterMultiplePartitions() throws IOException { result.dataFiles().forEach(rowDelta::addRows); rowDelta.commit(); - List expectedRows = ImmutableList.of( - toRow(1, "aaa"), - toRow(2, "aaa"), - toRow(3, "bbb"), - toRow(4, "bbb"), - toRow(5, "ccc") - ); + List expectedRows = + ImmutableList.of( + toRow(1, "aaa"), toRow(2, "aaa"), toRow(3, "bbb"), toRow(4, "bbb"), toRow(5, "ccc")); Assert.assertEquals("Records should match", toSet(expectedRows), actualRowSet("*")); } @Test public void testClusteredDataWriterOutOfOrderPartitions() throws IOException { - table.updateSpec() - .addField(Expressions.ref("data")) - .commit(); + table.updateSpec().addField(Expressions.ref("data")).commit(); FileWriterFactory writerFactory = newWriterFactory(table.schema()); - ClusteredDataWriter writer = new ClusteredDataWriter<>( - writerFactory, fileFactory, table.io(), TARGET_FILE_SIZE); + ClusteredDataWriter writer = + new ClusteredDataWriter<>(writerFactory, fileFactory, table.io(), TARGET_FILE_SIZE); PartitionSpec spec = table.spec(); @@ -146,8 +137,10 @@ public void testClusteredDataWriterOutOfOrderPartitions() throws IOException { writer.write(toRow(4, "bbb"), spec, partitionKey(spec, "bbb")); writer.write(toRow(5, "ccc"), spec, partitionKey(spec, "ccc")); - AssertHelpers.assertThrows("Should fail to write out of order partitions", - IllegalStateException.class, "Encountered records that belong to already closed files", + AssertHelpers.assertThrows( + "Should fail to write out of order partitions", + IllegalStateException.class, + "Encountered records that belong to already closed files", () -> writer.write(toRow(6, "aaa"), spec, partitionKey(spec, "aaa"))); writer.close(); @@ -157,9 +150,11 @@ public void testClusteredDataWriterOutOfOrderPartitions() throws IOException { public void testClusteredEqualityDeleteWriterNoRecords() 
throws IOException { List equalityFieldIds = ImmutableList.of(table.schema().findField("id").fieldId()); Schema equalityDeleteRowSchema = table.schema().select("id"); - FileWriterFactory writerFactory = newWriterFactory(table.schema(), equalityFieldIds, equalityDeleteRowSchema); - ClusteredEqualityDeleteWriter writer = new ClusteredEqualityDeleteWriter<>( - writerFactory, fileFactory, table.io(), TARGET_FILE_SIZE); + FileWriterFactory writerFactory = + newWriterFactory(table.schema(), equalityFieldIds, equalityDeleteRowSchema); + ClusteredEqualityDeleteWriter writer = + new ClusteredEqualityDeleteWriter<>( + writerFactory, fileFactory, table.io(), TARGET_FILE_SIZE); writer.close(); Assert.assertEquals(0, writer.result().deleteFiles().size()); @@ -176,53 +171,41 @@ public void testClusteredEqualityDeleteWriterNoRecords() throws IOException { public void testClusteredEqualityDeleteWriterMultipleSpecs() throws IOException { List equalityFieldIds = ImmutableList.of(table.schema().findField("id").fieldId()); Schema equalityDeleteRowSchema = table.schema().select("id"); - FileWriterFactory writerFactory = newWriterFactory(table.schema(), equalityFieldIds, equalityDeleteRowSchema); + FileWriterFactory writerFactory = + newWriterFactory(table.schema(), equalityFieldIds, equalityDeleteRowSchema); // add an unpartitioned data file - ImmutableList rows1 = ImmutableList.of( - toRow(1, "aaa"), - toRow(2, "aaa"), - toRow(11, "aaa") - ); + ImmutableList rows1 = ImmutableList.of(toRow(1, "aaa"), toRow(2, "aaa"), toRow(11, "aaa")); DataFile dataFile1 = writeData(writerFactory, fileFactory, rows1, table.spec(), null); - table.newFastAppend() - .appendFile(dataFile1) - .commit(); + table.newFastAppend().appendFile(dataFile1).commit(); // partition by bucket - table.updateSpec() - .addField(Expressions.bucket("data", 16)) - .commit(); + table.updateSpec().addField(Expressions.bucket("data", 16)).commit(); // add a data file partitioned by bucket - ImmutableList rows2 = ImmutableList.of( - toRow(3, "bbb"), - toRow(4, "bbb"), - toRow(12, "bbb") - ); - DataFile dataFile2 = writeData(writerFactory, fileFactory, rows2, table.spec(), partitionKey(table.spec(), "bbb")); - table.newFastAppend() - .appendFile(dataFile2) - .commit(); + ImmutableList rows2 = ImmutableList.of(toRow(3, "bbb"), toRow(4, "bbb"), toRow(12, "bbb")); + DataFile dataFile2 = + writeData( + writerFactory, fileFactory, rows2, table.spec(), partitionKey(table.spec(), "bbb")); + table.newFastAppend().appendFile(dataFile2).commit(); // partition by data - table.updateSpec() + table + .updateSpec() .removeField(Expressions.bucket("data", 16)) .addField(Expressions.ref("data")) .commit(); // add a data file partitioned by data - ImmutableList rows3 = ImmutableList.of( - toRow(5, "ccc"), - toRow(13, "ccc") - ); - DataFile dataFile3 = writeData(writerFactory, fileFactory, rows3, table.spec(), partitionKey(table.spec(), "ccc")); - table.newFastAppend() - .appendFile(dataFile3) - .commit(); + ImmutableList rows3 = ImmutableList.of(toRow(5, "ccc"), toRow(13, "ccc")); + DataFile dataFile3 = + writeData( + writerFactory, fileFactory, rows3, table.spec(), partitionKey(table.spec(), "ccc")); + table.newFastAppend().appendFile(dataFile3).commit(); - ClusteredEqualityDeleteWriter writer = new ClusteredEqualityDeleteWriter<>( - writerFactory, fileFactory, table.io(), TARGET_FILE_SIZE); + ClusteredEqualityDeleteWriter writer = + new ClusteredEqualityDeleteWriter<>( + writerFactory, fileFactory, table.io(), TARGET_FILE_SIZE); PartitionSpec unpartitionedSpec = 
table.specs().get(0); PartitionSpec bucketSpec = table.specs().get(1); @@ -238,18 +221,15 @@ public void testClusteredEqualityDeleteWriterMultipleSpecs() throws IOException DeleteWriteResult result = writer.result(); Assert.assertEquals("Must be 3 delete files", 3, result.deleteFiles().size()); - Assert.assertEquals("Must not reference data files", 0, writer.result().referencedDataFiles().size()); + Assert.assertEquals( + "Must not reference data files", 0, writer.result().referencedDataFiles().size()); Assert.assertFalse("Must not reference data files", writer.result().referencesDataFiles()); RowDelta rowDelta = table.newRowDelta(); result.deleteFiles().forEach(rowDelta::addDeletes); rowDelta.commit(); - List expectedRows = ImmutableList.of( - toRow(11, "aaa"), - toRow(12, "bbb"), - toRow(13, "ccc") - ); + List expectedRows = ImmutableList.of(toRow(11, "aaa"), toRow(12, "bbb"), toRow(13, "ccc")); Assert.assertEquals("Records should match", toSet(expectedRows), actualRowSet("*")); } @@ -257,19 +237,20 @@ public void testClusteredEqualityDeleteWriterMultipleSpecs() throws IOException public void testClusteredEqualityDeleteWriterOutOfOrderSpecsAndPartitions() throws IOException { List equalityFieldIds = ImmutableList.of(table.schema().findField("id").fieldId()); Schema equalityDeleteRowSchema = table.schema().select("id"); - FileWriterFactory writerFactory = newWriterFactory(table.schema(), equalityFieldIds, equalityDeleteRowSchema); + FileWriterFactory writerFactory = + newWriterFactory(table.schema(), equalityFieldIds, equalityDeleteRowSchema); - table.updateSpec() - .addField(Expressions.bucket("data", 16)) - .commit(); + table.updateSpec().addField(Expressions.bucket("data", 16)).commit(); - table.updateSpec() + table + .updateSpec() .removeField(Expressions.bucket("data", 16)) .addField(Expressions.ref("data")) .commit(); - ClusteredEqualityDeleteWriter writer = new ClusteredEqualityDeleteWriter<>( - writerFactory, fileFactory, table.io(), TARGET_FILE_SIZE); + ClusteredEqualityDeleteWriter writer = + new ClusteredEqualityDeleteWriter<>( + writerFactory, fileFactory, table.io(), TARGET_FILE_SIZE); PartitionSpec unpartitionedSpec = table.specs().get(0); PartitionSpec bucketSpec = table.specs().get(1); @@ -282,12 +263,16 @@ public void testClusteredEqualityDeleteWriterOutOfOrderSpecsAndPartitions() thro writer.write(toRow(5, "ccc"), identitySpec, partitionKey(identitySpec, "ccc")); writer.write(toRow(6, "ddd"), identitySpec, partitionKey(identitySpec, "ddd")); - AssertHelpers.assertThrows("Should fail to write out of order partitions", - IllegalStateException.class, "Encountered records that belong to already closed files", + AssertHelpers.assertThrows( + "Should fail to write out of order partitions", + IllegalStateException.class, + "Encountered records that belong to already closed files", () -> writer.write(toRow(7, "ccc"), identitySpec, partitionKey(identitySpec, "ccc"))); - AssertHelpers.assertThrows("Should fail to write out of order specs", - IllegalStateException.class, "Encountered records that belong to already closed files", + AssertHelpers.assertThrows( + "Should fail to write out of order specs", + IllegalStateException.class, + "Encountered records that belong to already closed files", () -> writer.write(toRow(7, "aaa"), unpartitionedSpec, null)); writer.close(); @@ -296,8 +281,9 @@ public void testClusteredEqualityDeleteWriterOutOfOrderSpecsAndPartitions() thro @Test public void testClusteredPositionDeleteWriterNoRecords() throws IOException { FileWriterFactory 
writerFactory = newWriterFactory(table.schema()); - ClusteredPositionDeleteWriter writer = new ClusteredPositionDeleteWriter<>( - writerFactory, fileFactory, table.io(), TARGET_FILE_SIZE); + ClusteredPositionDeleteWriter writer = + new ClusteredPositionDeleteWriter<>( + writerFactory, fileFactory, table.io(), TARGET_FILE_SIZE); writer.close(); Assert.assertEquals(0, writer.result().deleteFiles().size()); @@ -315,50 +301,37 @@ public void testClusteredPositionDeleteWriterMultipleSpecs() throws IOException FileWriterFactory writerFactory = newWriterFactory(table.schema()); // add an unpartitioned data file - ImmutableList rows1 = ImmutableList.of( - toRow(1, "aaa"), - toRow(2, "aaa"), - toRow(11, "aaa") - ); + ImmutableList rows1 = ImmutableList.of(toRow(1, "aaa"), toRow(2, "aaa"), toRow(11, "aaa")); DataFile dataFile1 = writeData(writerFactory, fileFactory, rows1, table.spec(), null); - table.newFastAppend() - .appendFile(dataFile1) - .commit(); + table.newFastAppend().appendFile(dataFile1).commit(); // partition by bucket - table.updateSpec() - .addField(Expressions.bucket("data", 16)) - .commit(); + table.updateSpec().addField(Expressions.bucket("data", 16)).commit(); // add a data file partitioned by bucket - ImmutableList rows2 = ImmutableList.of( - toRow(3, "bbb"), - toRow(4, "bbb"), - toRow(12, "bbb") - ); - DataFile dataFile2 = writeData(writerFactory, fileFactory, rows2, table.spec(), partitionKey(table.spec(), "bbb")); - table.newFastAppend() - .appendFile(dataFile2) - .commit(); + ImmutableList rows2 = ImmutableList.of(toRow(3, "bbb"), toRow(4, "bbb"), toRow(12, "bbb")); + DataFile dataFile2 = + writeData( + writerFactory, fileFactory, rows2, table.spec(), partitionKey(table.spec(), "bbb")); + table.newFastAppend().appendFile(dataFile2).commit(); // partition by data - table.updateSpec() + table + .updateSpec() .removeField(Expressions.bucket("data", 16)) .addField(Expressions.ref("data")) .commit(); // add a data file partitioned by data - ImmutableList rows3 = ImmutableList.of( - toRow(5, "ccc"), - toRow(13, "ccc") - ); - DataFile dataFile3 = writeData(writerFactory, fileFactory, rows3, table.spec(), partitionKey(table.spec(), "ccc")); - table.newFastAppend() - .appendFile(dataFile3) - .commit(); + ImmutableList rows3 = ImmutableList.of(toRow(5, "ccc"), toRow(13, "ccc")); + DataFile dataFile3 = + writeData( + writerFactory, fileFactory, rows3, table.spec(), partitionKey(table.spec(), "ccc")); + table.newFastAppend().appendFile(dataFile3).commit(); - ClusteredPositionDeleteWriter writer = new ClusteredPositionDeleteWriter<>( - writerFactory, fileFactory, table.io(), TARGET_FILE_SIZE); + ClusteredPositionDeleteWriter writer = + new ClusteredPositionDeleteWriter<>( + writerFactory, fileFactory, table.io(), TARGET_FILE_SIZE); PartitionSpec unpartitionedSpec = table.specs().get(0); PartitionSpec bucketSpec = table.specs().get(1); @@ -366,26 +339,28 @@ public void testClusteredPositionDeleteWriterMultipleSpecs() throws IOException writer.write(positionDelete(dataFile1.path(), 0L, null), unpartitionedSpec, null); writer.write(positionDelete(dataFile1.path(), 1L, null), unpartitionedSpec, null); - writer.write(positionDelete(dataFile2.path(), 0L, null), bucketSpec, partitionKey(bucketSpec, "bbb")); - writer.write(positionDelete(dataFile2.path(), 1L, null), bucketSpec, partitionKey(bucketSpec, "bbb")); - writer.write(positionDelete(dataFile3.path(), 0L, null), identitySpec, partitionKey(identitySpec, "ccc")); + writer.write( + positionDelete(dataFile2.path(), 0L, null), bucketSpec, 
partitionKey(bucketSpec, "bbb")); + writer.write( + positionDelete(dataFile2.path(), 1L, null), bucketSpec, partitionKey(bucketSpec, "bbb")); + writer.write( + positionDelete(dataFile3.path(), 0L, null), + identitySpec, + partitionKey(identitySpec, "ccc")); writer.close(); DeleteWriteResult result = writer.result(); Assert.assertEquals("Must be 3 delete files", 3, result.deleteFiles().size()); - Assert.assertEquals("Must reference 3 data files", 3, writer.result().referencedDataFiles().size()); + Assert.assertEquals( + "Must reference 3 data files", 3, writer.result().referencedDataFiles().size()); Assert.assertTrue("Must reference data files", writer.result().referencesDataFiles()); RowDelta rowDelta = table.newRowDelta(); result.deleteFiles().forEach(rowDelta::addDeletes); rowDelta.commit(); - List expectedRows = ImmutableList.of( - toRow(11, "aaa"), - toRow(12, "bbb"), - toRow(13, "ccc") - ); + List expectedRows = ImmutableList.of(toRow(11, "aaa"), toRow(12, "bbb"), toRow(13, "ccc")); Assert.assertEquals("Records should match", toSet(expectedRows), actualRowSet("*")); } @@ -393,17 +368,17 @@ public void testClusteredPositionDeleteWriterMultipleSpecs() throws IOException public void testClusteredPositionDeleteWriterOutOfOrderSpecsAndPartitions() throws IOException { FileWriterFactory writerFactory = newWriterFactory(table.schema()); - table.updateSpec() - .addField(Expressions.bucket("data", 16)) - .commit(); + table.updateSpec().addField(Expressions.bucket("data", 16)).commit(); - table.updateSpec() + table + .updateSpec() .removeField(Expressions.bucket("data", 16)) .addField(Expressions.ref("data")) .commit(); - ClusteredPositionDeleteWriter writer = new ClusteredPositionDeleteWriter<>( - writerFactory, fileFactory, table.io(), TARGET_FILE_SIZE); + ClusteredPositionDeleteWriter writer = + new ClusteredPositionDeleteWriter<>( + writerFactory, fileFactory, table.io(), TARGET_FILE_SIZE); PartitionSpec unpartitionedSpec = table.specs().get(0); PartitionSpec bucketSpec = table.specs().get(1); @@ -411,20 +386,32 @@ public void testClusteredPositionDeleteWriterOutOfOrderSpecsAndPartitions() thro writer.write(positionDelete("file-1.parquet", 0L, null), unpartitionedSpec, null); writer.write(positionDelete("file-1.parquet", 1L, null), unpartitionedSpec, null); - writer.write(positionDelete("file-2.parquet", 0L, null), bucketSpec, partitionKey(bucketSpec, "bbb")); - writer.write(positionDelete("file-2.parquet", 1L, null), bucketSpec, partitionKey(bucketSpec, "bbb")); - writer.write(positionDelete("file-3.parquet", 0L, null), identitySpec, partitionKey(identitySpec, "ccc")); - writer.write(positionDelete("file-4.parquet", 0L, null), identitySpec, partitionKey(identitySpec, "ddd")); - - AssertHelpers.assertThrows("Should fail to write out of order partitions", - IllegalStateException.class, "Encountered records that belong to already closed files", + writer.write( + positionDelete("file-2.parquet", 0L, null), bucketSpec, partitionKey(bucketSpec, "bbb")); + writer.write( + positionDelete("file-2.parquet", 1L, null), bucketSpec, partitionKey(bucketSpec, "bbb")); + writer.write( + positionDelete("file-3.parquet", 0L, null), + identitySpec, + partitionKey(identitySpec, "ccc")); + writer.write( + positionDelete("file-4.parquet", 0L, null), + identitySpec, + partitionKey(identitySpec, "ddd")); + + AssertHelpers.assertThrows( + "Should fail to write out of order partitions", + IllegalStateException.class, + "Encountered records that belong to already closed files", () -> { PositionDelete positionDelete 
= positionDelete("file-5.parquet", 1L, null); writer.write(positionDelete, identitySpec, partitionKey(identitySpec, "ccc")); }); - AssertHelpers.assertThrows("Should fail to write out of order specs", - IllegalStateException.class, "Encountered records that belong to already closed files", + AssertHelpers.assertThrows( + "Should fail to write out of order specs", + IllegalStateException.class, + "Encountered records that belong to already closed files", () -> { PositionDelete positionDelete = positionDelete("file-1.parquet", 3L, null); writer.write(positionDelete, unpartitionedSpec, null); @@ -436,8 +423,8 @@ public void testClusteredPositionDeleteWriterOutOfOrderSpecsAndPartitions() thro @Test public void testFanoutDataWriterNoRecords() throws IOException { FileWriterFactory writerFactory = newWriterFactory(table.schema()); - FanoutDataWriter writer = new FanoutDataWriter<>( - writerFactory, fileFactory, table.io(), TARGET_FILE_SIZE); + FanoutDataWriter writer = + new FanoutDataWriter<>(writerFactory, fileFactory, table.io(), TARGET_FILE_SIZE); writer.close(); Assert.assertEquals("Must be no data files", 0, writer.result().dataFiles().size()); @@ -448,13 +435,11 @@ public void testFanoutDataWriterNoRecords() throws IOException { @Test public void testFanoutDataWriterMultiplePartitions() throws IOException { - table.updateSpec() - .addField(Expressions.ref("data")) - .commit(); + table.updateSpec().addField(Expressions.ref("data")).commit(); FileWriterFactory writerFactory = newWriterFactory(table.schema()); - FanoutDataWriter writer = new FanoutDataWriter<>( - writerFactory, fileFactory, table.io(), TARGET_FILE_SIZE); + FanoutDataWriter writer = + new FanoutDataWriter<>(writerFactory, fileFactory, table.io(), TARGET_FILE_SIZE); PartitionSpec spec = table.spec(); @@ -473,13 +458,9 @@ public void testFanoutDataWriterMultiplePartitions() throws IOException { result.dataFiles().forEach(rowDelta::addRows); rowDelta.commit(); - List expectedRows = ImmutableList.of( - toRow(1, "aaa"), - toRow(2, "aaa"), - toRow(3, "bbb"), - toRow(4, "bbb"), - toRow(5, "ccc") - ); + List expectedRows = + ImmutableList.of( + toRow(1, "aaa"), toRow(2, "aaa"), toRow(3, "bbb"), toRow(4, "bbb"), toRow(5, "ccc")); Assert.assertEquals("Records should match", toSet(expectedRows), actualRowSet("*")); } } diff --git a/data/src/test/java/org/apache/iceberg/io/TestPositionDeltaWriters.java b/data/src/test/java/org/apache/iceberg/io/TestPositionDeltaWriters.java index 275ee5e0fc88..f1295296b92d 100644 --- a/data/src/test/java/org/apache/iceberg/io/TestPositionDeltaWriters.java +++ b/data/src/test/java/org/apache/iceberg/io/TestPositionDeltaWriters.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.io; import java.io.File; @@ -42,11 +41,12 @@ public abstract class TestPositionDeltaWriters extends WriterTestBase { @Parameterized.Parameters(name = "FileFormat={0}") public static Object[] parameters() { return new Object[][] { - new Object[]{FileFormat.AVRO}, - new Object[]{FileFormat.ORC}, - new Object[]{FileFormat.PARQUET} + new Object[] {FileFormat.AVRO}, + new Object[] {FileFormat.ORC}, + new Object[] {FileFormat.PARQUET} }; } + private static final int TABLE_FORMAT_VERSION = 2; private static final long TARGET_FILE_SIZE = 128L * 1024 * 1024; @@ -79,13 +79,15 @@ public void setupTable() throws Exception { public void testPositionDeltaInsertOnly() throws IOException { FileWriterFactory writerFactory = newWriterFactory(table.schema()); - ClusteredDataWriter insertWriter = new ClusteredDataWriter<>( - writerFactory, fileFactory, table.io(), TARGET_FILE_SIZE); - ClusteredDataWriter updateWriter = new ClusteredDataWriter<>( - writerFactory, fileFactory, table.io(), TARGET_FILE_SIZE); - ClusteredPositionDeleteWriter deleteWriter = new ClusteredPositionDeleteWriter<>( - writerFactory, fileFactory, table.io(), TARGET_FILE_SIZE); - PositionDeltaWriter deltaWriter = new BasePositionDeltaWriter<>(insertWriter, updateWriter, deleteWriter); + ClusteredDataWriter insertWriter = + new ClusteredDataWriter<>(writerFactory, fileFactory, table.io(), TARGET_FILE_SIZE); + ClusteredDataWriter updateWriter = + new ClusteredDataWriter<>(writerFactory, fileFactory, table.io(), TARGET_FILE_SIZE); + ClusteredPositionDeleteWriter deleteWriter = + new ClusteredPositionDeleteWriter<>( + writerFactory, fileFactory, table.io(), TARGET_FILE_SIZE); + PositionDeltaWriter deltaWriter = + new BasePositionDeltaWriter<>(insertWriter, updateWriter, deleteWriter); deltaWriter.insert(toRow(1, "aaa"), table.spec(), null); deltaWriter.close(); @@ -105,9 +107,7 @@ public void testPositionDeltaInsertOnly() throws IOException { } rowDelta.commit(); - List expectedRows = ImmutableList.of( - toRow(1, "aaa") - ); + List expectedRows = ImmutableList.of(toRow(1, "aaa")); Assert.assertEquals("Records should match", toSet(expectedRows), actualRowSet("*")); } @@ -116,41 +116,32 @@ public void testPositionDeltaDeleteOnly() throws IOException { FileWriterFactory writerFactory = newWriterFactory(table.schema()); // add an unpartitioned data file - ImmutableList rows1 = ImmutableList.of( - toRow(1, "aaa"), - toRow(2, "aaa"), - toRow(11, "aaa") - ); + ImmutableList rows1 = ImmutableList.of(toRow(1, "aaa"), toRow(2, "aaa"), toRow(11, "aaa")); DataFile dataFile1 = writeData(writerFactory, fileFactory, rows1, table.spec(), null); - table.newFastAppend() - .appendFile(dataFile1) - .commit(); + table.newFastAppend().appendFile(dataFile1).commit(); // partition by data - table.updateSpec() - .addField(Expressions.ref("data")) - .commit(); + table.updateSpec().addField(Expressions.ref("data")).commit(); // add a data file partitioned by data - ImmutableList rows2 = ImmutableList.of( - toRow(3, "bbb"), - toRow(4, "bbb") - ); - DataFile dataFile2 = writeData(writerFactory, fileFactory, rows2, table.spec(), partitionKey(table.spec(), "bbb")); - table.newFastAppend() - .appendFile(dataFile2) - .commit(); + ImmutableList rows2 = ImmutableList.of(toRow(3, "bbb"), toRow(4, "bbb")); + DataFile dataFile2 = + writeData( + writerFactory, fileFactory, rows2, table.spec(), partitionKey(table.spec(), "bbb")); + table.newFastAppend().appendFile(dataFile2).commit(); PartitionSpec unpartitionedSpec = table.specs().get(0); 
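// Editorial sketch (illustration only, not part of the diff): the delta-writer pattern these
// tests exercise, assuming the WriterTestBase<T> context (writerFactory, fileFactory,
// TARGET_FILE_SIZE, toRow(), partitionKey()) and a table whose spec 0 is unpartitioned. A
// BasePositionDeltaWriter composes separate clustered writers for inserts, updates and position
// deletes, as in the constructors shown above; its WriteResult (via result(), as in the
// surrounding test flow) is then committed atomically through a RowDelta.
PositionDeltaWriter<T> deltaWriter =
    new BasePositionDeltaWriter<>(
        new ClusteredDataWriter<>(writerFactory, fileFactory, table.io(), TARGET_FILE_SIZE), // inserts
        new ClusteredDataWriter<>(writerFactory, fileFactory, table.io(), TARGET_FILE_SIZE), // updates
        new ClusteredPositionDeleteWriter<>(
            writerFactory, fileFactory, table.io(), TARGET_FILE_SIZE)); // position deletes

deltaWriter.delete(dataFile1.path(), 2L, table.specs().get(0), null); // drop row 2 of an unpartitioned file
deltaWriter.insert(toRow(10, "ccc"), table.spec(), partitionKey(table.spec(), "ccc")); // add a row in the current spec
deltaWriter.close();

WriteResult result = deltaWriter.result();
RowDelta rowDelta = table.newRowDelta();
for (DataFile dataFile : result.dataFiles()) {
  rowDelta.addRows(dataFile);
}
for (DeleteFile deleteFile : result.deleteFiles()) {
  rowDelta.addDeletes(deleteFile);
}
rowDelta.commit();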
PartitionSpec partitionedSpec = table.specs().get(1); - ClusteredDataWriter insertWriter = new ClusteredDataWriter<>( - writerFactory, fileFactory, table.io(), TARGET_FILE_SIZE); - ClusteredDataWriter updateWriter = new ClusteredDataWriter<>( - writerFactory, fileFactory, table.io(), TARGET_FILE_SIZE); - ClusteredPositionDeleteWriter deleteWriter = new ClusteredPositionDeleteWriter<>( - writerFactory, fileFactory, table.io(), TARGET_FILE_SIZE); - PositionDeltaWriter deltaWriter = new BasePositionDeltaWriter<>(insertWriter, updateWriter, deleteWriter); + ClusteredDataWriter insertWriter = + new ClusteredDataWriter<>(writerFactory, fileFactory, table.io(), TARGET_FILE_SIZE); + ClusteredDataWriter updateWriter = + new ClusteredDataWriter<>(writerFactory, fileFactory, table.io(), TARGET_FILE_SIZE); + ClusteredPositionDeleteWriter deleteWriter = + new ClusteredPositionDeleteWriter<>( + writerFactory, fileFactory, table.io(), TARGET_FILE_SIZE); + PositionDeltaWriter deltaWriter = + new BasePositionDeltaWriter<>(insertWriter, updateWriter, deleteWriter); deltaWriter.delete(dataFile1.path(), 2L, unpartitionedSpec, null); deltaWriter.delete(dataFile2.path(), 1L, partitionedSpec, partitionKey(partitionedSpec, "bbb")); @@ -172,11 +163,7 @@ public void testPositionDeltaDeleteOnly() throws IOException { } rowDelta.commit(); - List expectedRows = ImmutableList.of( - toRow(1, "aaa"), - toRow(2, "aaa"), - toRow(3, "bbb") - ); + List expectedRows = ImmutableList.of(toRow(1, "aaa"), toRow(2, "aaa"), toRow(3, "bbb")); Assert.assertEquals("Records should match", toSet(expectedRows), actualRowSet("*")); } @@ -185,41 +172,32 @@ public void testPositionDeltaMultipleSpecs() throws IOException { FileWriterFactory writerFactory = newWriterFactory(table.schema()); // add an unpartitioned data file - ImmutableList rows1 = ImmutableList.of( - toRow(1, "aaa"), - toRow(2, "aaa"), - toRow(11, "aaa") - ); + ImmutableList rows1 = ImmutableList.of(toRow(1, "aaa"), toRow(2, "aaa"), toRow(11, "aaa")); DataFile dataFile1 = writeData(writerFactory, fileFactory, rows1, table.spec(), null); - table.newFastAppend() - .appendFile(dataFile1) - .commit(); + table.newFastAppend().appendFile(dataFile1).commit(); // partition by data - table.updateSpec() - .addField(Expressions.ref("data")) - .commit(); + table.updateSpec().addField(Expressions.ref("data")).commit(); // add a data file partitioned by data - ImmutableList rows2 = ImmutableList.of( - toRow(3, "bbb"), - toRow(4, "bbb") - ); - DataFile dataFile2 = writeData(writerFactory, fileFactory, rows2, table.spec(), partitionKey(table.spec(), "bbb")); - table.newFastAppend() - .appendFile(dataFile2) - .commit(); + ImmutableList rows2 = ImmutableList.of(toRow(3, "bbb"), toRow(4, "bbb")); + DataFile dataFile2 = + writeData( + writerFactory, fileFactory, rows2, table.spec(), partitionKey(table.spec(), "bbb")); + table.newFastAppend().appendFile(dataFile2).commit(); PartitionSpec unpartitionedSpec = table.specs().get(0); PartitionSpec partitionedSpec = table.specs().get(1); - ClusteredDataWriter insertWriter = new ClusteredDataWriter<>( - writerFactory, fileFactory, table.io(), TARGET_FILE_SIZE); - ClusteredDataWriter updateWriter = new ClusteredDataWriter<>( - writerFactory, fileFactory, table.io(), TARGET_FILE_SIZE); - ClusteredPositionDeleteWriter deleteWriter = new ClusteredPositionDeleteWriter<>( - writerFactory, fileFactory, table.io(), TARGET_FILE_SIZE); - PositionDeltaWriter deltaWriter = new BasePositionDeltaWriter<>(insertWriter, updateWriter, deleteWriter); + 
ClusteredDataWriter insertWriter = + new ClusteredDataWriter<>(writerFactory, fileFactory, table.io(), TARGET_FILE_SIZE); + ClusteredDataWriter updateWriter = + new ClusteredDataWriter<>(writerFactory, fileFactory, table.io(), TARGET_FILE_SIZE); + ClusteredPositionDeleteWriter deleteWriter = + new ClusteredPositionDeleteWriter<>( + writerFactory, fileFactory, table.io(), TARGET_FILE_SIZE); + PositionDeltaWriter deltaWriter = + new BasePositionDeltaWriter<>(insertWriter, updateWriter, deleteWriter); deltaWriter.delete(dataFile1.path(), 2L, unpartitionedSpec, null); deltaWriter.delete(dataFile2.path(), 1L, partitionedSpec, partitionKey(partitionedSpec, "bbb")); @@ -245,12 +223,8 @@ public void testPositionDeltaMultipleSpecs() throws IOException { } rowDelta.commit(); - List expectedRows = ImmutableList.of( - toRow(1, "aaa"), - toRow(2, "aaa"), - toRow(3, "bbb"), - toRow(10, "ccc") - ); + List expectedRows = + ImmutableList.of(toRow(1, "aaa"), toRow(2, "aaa"), toRow(3, "bbb"), toRow(10, "ccc")); Assert.assertEquals("Records should match", toSet(expectedRows), actualRowSet("*")); } } diff --git a/data/src/test/java/org/apache/iceberg/io/TestRollingFileWriters.java b/data/src/test/java/org/apache/iceberg/io/TestRollingFileWriters.java index 5c11d345e82f..b229d4871f55 100644 --- a/data/src/test/java/org/apache/iceberg/io/TestRollingFileWriters.java +++ b/data/src/test/java/org/apache/iceberg/io/TestRollingFileWriters.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.io; import java.io.File; @@ -41,12 +40,12 @@ public abstract class TestRollingFileWriters extends WriterTestBase { @Parameterized.Parameters(name = "FileFormat={0}, Partitioned={1}") public static Object[] parameters() { return new Object[][] { - new Object[]{FileFormat.AVRO, false}, - new Object[]{FileFormat.AVRO, true}, - new Object[]{FileFormat.PARQUET, false}, - new Object[]{FileFormat.PARQUET, true}, - new Object[]{FileFormat.ORC, false}, - new Object[]{FileFormat.ORC, true} + new Object[] {FileFormat.AVRO, false}, + new Object[] {FileFormat.AVRO, true}, + new Object[] {FileFormat.PARQUET, false}, + new Object[] {FileFormat.PARQUET, true}, + new Object[] {FileFormat.ORC, false}, + new Object[] {FileFormat.ORC, true} }; } @@ -93,9 +92,9 @@ public void setupTable() throws Exception { @Test public void testRollingDataWriterNoRecords() throws IOException { FileWriterFactory writerFactory = newWriterFactory(table.schema()); - RollingDataWriter writer = new RollingDataWriter<>( - writerFactory, fileFactory, table.io(), - DEFAULT_FILE_SIZE, table.spec(), partition); + RollingDataWriter writer = + new RollingDataWriter<>( + writerFactory, fileFactory, table.io(), DEFAULT_FILE_SIZE, table.spec(), partition); writer.close(); Assert.assertEquals("Must be no data files", 0, writer.result().dataFiles().size()); @@ -107,9 +106,9 @@ public void testRollingDataWriterNoRecords() throws IOException { @Test public void testRollingDataWriterSplitData() throws IOException { FileWriterFactory writerFactory = newWriterFactory(table.schema()); - RollingDataWriter writer = new RollingDataWriter<>( - writerFactory, fileFactory, table.io(), - SMALL_FILE_SIZE, table.spec(), partition); + RollingDataWriter writer = + new RollingDataWriter<>( + writerFactory, fileFactory, table.io(), SMALL_FILE_SIZE, table.spec(), partition); List rows = Lists.newArrayListWithExpectedSize(4 * FILE_SIZE_CHECK_ROWS_DIVISOR); for (int index = 0; index < 4 * FILE_SIZE_CHECK_ROWS_DIVISOR; index++) 
{ @@ -130,10 +129,11 @@ public void testRollingDataWriterSplitData() throws IOException { public void testRollingEqualityDeleteWriterNoRecords() throws IOException { List equalityFieldIds = ImmutableList.of(table.schema().findField("id").fieldId()); Schema equalityDeleteRowSchema = table.schema().select("id"); - FileWriterFactory writerFactory = newWriterFactory(table.schema(), equalityFieldIds, equalityDeleteRowSchema); - RollingEqualityDeleteWriter writer = new RollingEqualityDeleteWriter<>( - writerFactory, fileFactory, table.io(), - DEFAULT_FILE_SIZE, table.spec(), partition); + FileWriterFactory writerFactory = + newWriterFactory(table.schema(), equalityFieldIds, equalityDeleteRowSchema); + RollingEqualityDeleteWriter writer = + new RollingEqualityDeleteWriter<>( + writerFactory, fileFactory, table.io(), DEFAULT_FILE_SIZE, table.spec(), partition); writer.close(); Assert.assertEquals(0, writer.result().deleteFiles().size()); @@ -150,10 +150,11 @@ public void testRollingEqualityDeleteWriterNoRecords() throws IOException { public void testRollingEqualityDeleteWriterSplitDeletes() throws IOException { List equalityFieldIds = ImmutableList.of(table.schema().findField("id").fieldId()); Schema equalityDeleteRowSchema = table.schema().select("id"); - FileWriterFactory writerFactory = newWriterFactory(table.schema(), equalityFieldIds, equalityDeleteRowSchema); - RollingEqualityDeleteWriter writer = new RollingEqualityDeleteWriter<>( - writerFactory, fileFactory, table.io(), - SMALL_FILE_SIZE, table.spec(), partition); + FileWriterFactory writerFactory = + newWriterFactory(table.schema(), equalityFieldIds, equalityDeleteRowSchema); + RollingEqualityDeleteWriter writer = + new RollingEqualityDeleteWriter<>( + writerFactory, fileFactory, table.io(), SMALL_FILE_SIZE, table.spec(), partition); List deletes = Lists.newArrayListWithExpectedSize(4 * FILE_SIZE_CHECK_ROWS_DIVISOR); for (int index = 0; index < 4 * FILE_SIZE_CHECK_ROWS_DIVISOR; index++) { @@ -176,9 +177,9 @@ public void testRollingEqualityDeleteWriterSplitDeletes() throws IOException { @Test public void testRollingPositionDeleteWriterNoRecords() throws IOException { FileWriterFactory writerFactory = newWriterFactory(table.schema()); - RollingPositionDeleteWriter writer = new RollingPositionDeleteWriter<>( - writerFactory, fileFactory, table.io(), - DEFAULT_FILE_SIZE, table.spec(), partition); + RollingPositionDeleteWriter writer = + new RollingPositionDeleteWriter<>( + writerFactory, fileFactory, table.io(), DEFAULT_FILE_SIZE, table.spec(), partition); writer.close(); Assert.assertEquals(0, writer.result().deleteFiles().size()); @@ -194,11 +195,12 @@ public void testRollingPositionDeleteWriterNoRecords() throws IOException { @Test public void testRollingPositionDeleteWriterSplitDeletes() throws IOException { FileWriterFactory writerFactory = newWriterFactory(table.schema()); - RollingPositionDeleteWriter writer = new RollingPositionDeleteWriter<>( - writerFactory, fileFactory, table.io(), - SMALL_FILE_SIZE, table.spec(), partition); + RollingPositionDeleteWriter writer = + new RollingPositionDeleteWriter<>( + writerFactory, fileFactory, table.io(), SMALL_FILE_SIZE, table.spec(), partition); - List> deletes = Lists.newArrayListWithExpectedSize(4 * FILE_SIZE_CHECK_ROWS_DIVISOR); + List> deletes = + Lists.newArrayListWithExpectedSize(4 * FILE_SIZE_CHECK_ROWS_DIVISOR); for (int index = 0; index < 4 * FILE_SIZE_CHECK_ROWS_DIVISOR; index++) { deletes.add(positionDelete("path/to/data/file-1.parquet", index, null)); } diff --git 
a/data/src/test/java/org/apache/iceberg/io/TestTaskEqualityDeltaWriter.java b/data/src/test/java/org/apache/iceberg/io/TestTaskEqualityDeltaWriter.java index 82c194436b85..061a7f49a3aa 100644 --- a/data/src/test/java/org/apache/iceberg/io/TestTaskEqualityDeltaWriter.java +++ b/data/src/test/java/org/apache/iceberg/io/TestTaskEqualityDeltaWriter.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.io; import java.io.File; @@ -71,11 +70,7 @@ public class TestTaskEqualityDeltaWriter extends TableTestBase { @Parameterized.Parameters(name = "FileFormat = {0}") public static Object[][] parameters() { - return new Object[][] { - {"avro"}, - {"orc"}, - {"parquet"} - }; + return new Object[][] {{"avro"}, {"orc"}, {"parquet"}}; } public TestTaskEqualityDeltaWriter(String fileFormat) { @@ -97,9 +92,7 @@ public void setupTable() throws IOException { this.idFieldId = table.schema().findField("id").fieldId(); this.dataFieldId = table.schema().findField("data").fieldId(); - table.updateProperties() - .defaultFormat(format) - .commit(); + table.updateProperties().defaultFormat(format).commit(); } private Record createRecord(Integer id, String data) { @@ -124,7 +117,8 @@ public void testPureInsert() throws IOException { Assert.assertEquals("Should only have a data file.", 1, result.dataFiles().length); Assert.assertEquals("Should have no delete file", 0, result.deleteFiles().length); commitTransaction(result); - Assert.assertEquals("Should have expected records", expectedRowSet(expected), actualRowSet("*")); + Assert.assertEquals( + "Should have expected records", expectedRowSet(expected), actualRowSet("*")); deltaWriter = createTaskWriter(eqDeleteFieldIds, eqDeleteRowSchema); for (int i = 20; i < 30; i++) { @@ -137,7 +131,8 @@ public void testPureInsert() throws IOException { Assert.assertEquals("Should only have a data file.", 1, result.dataFiles().length); Assert.assertEquals("Should have no delete file", 0, result.deleteFiles().length); commitTransaction(result); - Assert.assertEquals("Should have expected records", expectedRowSet(expected), actualRowSet("*")); + Assert.assertEquals( + "Should have expected records", expectedRowSet(expected), actualRowSet("*")); } @Test @@ -161,36 +156,42 @@ public void testInsertDuplicatedKey() throws IOException { Assert.assertEquals("Should have a data file.", 1, result.dataFiles().length); Assert.assertEquals("Should have a pos-delete file", 1, result.deleteFiles().length); DeleteFile posDeleteFile = result.deleteFiles()[0]; - Assert.assertEquals("Should be a pos-delete file", FileContent.POSITION_DELETES, posDeleteFile.content()); + Assert.assertEquals( + "Should be a pos-delete file", FileContent.POSITION_DELETES, posDeleteFile.content()); Assert.assertEquals(1, result.referencedDataFiles().length); - Assert.assertEquals("Should have expected records", expectedRowSet(ImmutableList.of( - createRecord(4, "eee"), - createRecord(3, "fff"), - createRecord(2, "ggg"), - createRecord(1, "hhh") - )), actualRowSet("*")); + Assert.assertEquals( + "Should have expected records", + expectedRowSet( + ImmutableList.of( + createRecord(4, "eee"), + createRecord(3, "fff"), + createRecord(2, "ggg"), + createRecord(1, "hhh"))), + actualRowSet("*")); // Check records in the data file. 
DataFile dataFile = result.dataFiles()[0]; - Assert.assertEquals(ImmutableList.of( - createRecord(1, "aaa"), - createRecord(2, "bbb"), - createRecord(3, "ccc"), - createRecord(4, "ddd"), - createRecord(4, "eee"), - createRecord(3, "fff"), - createRecord(2, "ggg"), - createRecord(1, "hhh") - ), readRecordsAsList(table.schema(), dataFile.path())); + Assert.assertEquals( + ImmutableList.of( + createRecord(1, "aaa"), + createRecord(2, "bbb"), + createRecord(3, "ccc"), + createRecord(4, "ddd"), + createRecord(4, "eee"), + createRecord(3, "fff"), + createRecord(2, "ggg"), + createRecord(1, "hhh")), + readRecordsAsList(table.schema(), dataFile.path())); // Check records in the pos-delete file. Schema posDeleteSchema = DeleteSchemaUtil.pathPosSchema(); - Assert.assertEquals(ImmutableList.of( - posRecord.copy("file_path", dataFile.path(), "pos", 0L), - posRecord.copy("file_path", dataFile.path(), "pos", 1L), - posRecord.copy("file_path", dataFile.path(), "pos", 2L), - posRecord.copy("file_path", dataFile.path(), "pos", 3L) - ), readRecordsAsList(posDeleteSchema, posDeleteFile.path())); + Assert.assertEquals( + ImmutableList.of( + posRecord.copy("file_path", dataFile.path(), "pos", 0L), + posRecord.copy("file_path", dataFile.path(), "pos", 1L), + posRecord.copy("file_path", dataFile.path(), "pos", 2L), + posRecord.copy("file_path", dataFile.path(), "pos", 3L)), + readRecordsAsList(posDeleteSchema, posDeleteFile.path())); } @Test @@ -211,17 +212,21 @@ public void testUpsertSameRow() throws IOException { Assert.assertEquals("Should have a data file.", 1, result.dataFiles().length); Assert.assertEquals("Should have a pos-delete file.", 1, result.deleteFiles().length); commitTransaction(result); - Assert.assertEquals("Should have an expected record", expectedRowSet(ImmutableList.of(record)), actualRowSet("*")); + Assert.assertEquals( + "Should have an expected record", + expectedRowSet(ImmutableList.of(record)), + actualRowSet("*")); // Check records in the data file. DataFile dataFile = result.dataFiles()[0]; - Assert.assertEquals(ImmutableList.of(record, record), readRecordsAsList(table.schema(), dataFile.path())); + Assert.assertEquals( + ImmutableList.of(record, record), readRecordsAsList(table.schema(), dataFile.path())); // Check records in the pos-delete file. DeleteFile posDeleteFile = result.deleteFiles()[0]; - Assert.assertEquals(ImmutableList.of( - posRecord.copy("file_path", dataFile.path(), "pos", 0L) - ), readRecordsAsList(DeleteSchemaUtil.pathPosSchema(), posDeleteFile.path())); + Assert.assertEquals( + ImmutableList.of(posRecord.copy("file_path", dataFile.path(), "pos", 0L)), + readRecordsAsList(DeleteSchemaUtil.pathPosSchema(), posDeleteFile.path())); deltaWriter = createTaskWriter(eqDeleteFieldIds, eqDeleteRowSchema); deltaWriter.delete(record); @@ -229,7 +234,8 @@ public void testUpsertSameRow() throws IOException { Assert.assertEquals("Should have 0 data file.", 0, result.dataFiles().length); Assert.assertEquals("Should have 1 eq-delete file", 1, result.deleteFiles().length); commitTransaction(result); - Assert.assertEquals("Should have no record", expectedRowSet(ImmutableList.of()), actualRowSet("*")); + Assert.assertEquals( + "Should have no record", expectedRowSet(ImmutableList.of()), actualRowSet("*")); } @Test @@ -247,16 +253,21 @@ public void testUpsertData() throws IOException { // Commit the 1th transaction. 
WriteResult result = deltaWriter.complete(); Assert.assertEquals("Should have a data file", 1, result.dataFiles().length); - Assert.assertEquals("Should have a pos-delete file for deduplication purpose", 1, result.deleteFiles().length); - Assert.assertEquals("Should be pos-delete file", FileContent.POSITION_DELETES, result.deleteFiles()[0].content()); + Assert.assertEquals( + "Should have a pos-delete file for deduplication purpose", 1, result.deleteFiles().length); + Assert.assertEquals( + "Should be pos-delete file", + FileContent.POSITION_DELETES, + result.deleteFiles()[0].content()); Assert.assertEquals(1, result.referencedDataFiles().length); commitTransaction(result); - Assert.assertEquals("Should have expected records", expectedRowSet(ImmutableList.of( - createRecord(2, "bbb"), - createRecord(3, "aaa"), - createRecord(4, "ccc") - )), actualRowSet("*")); + Assert.assertEquals( + "Should have expected records", + expectedRowSet( + ImmutableList.of( + createRecord(2, "bbb"), createRecord(3, "aaa"), createRecord(4, "ccc"))), + actualRowSet("*")); // Start the 2nd transaction. deltaWriter = createTaskWriter(eqDeleteFieldIds, eqDeleteRowSchema); @@ -284,35 +295,31 @@ public void testUpsertData() throws IOException { Assert.assertEquals(2, result.deleteFiles().length); commitTransaction(result); - Assert.assertEquals("Should have expected records", expectedRowSet(ImmutableList.of( - createRecord(6, "aaa"), - createRecord(7, "ccc") - )), actualRowSet("*")); + Assert.assertEquals( + "Should have expected records", + expectedRowSet(ImmutableList.of(createRecord(6, "aaa"), createRecord(7, "ccc"))), + actualRowSet("*")); // Check records in the data file. DataFile dataFile = result.dataFiles()[0]; - Assert.assertEquals(ImmutableList.of( - createRecord(5, "aaa"), - createRecord(6, "aaa"), - createRecord(7, "ccc") - ), readRecordsAsList(table.schema(), dataFile.path())); + Assert.assertEquals( + ImmutableList.of(createRecord(5, "aaa"), createRecord(6, "aaa"), createRecord(7, "ccc")), + readRecordsAsList(table.schema(), dataFile.path())); // Check records in the eq-delete file. DeleteFile eqDeleteFile = result.deleteFiles()[0]; Assert.assertEquals(FileContent.EQUALITY_DELETES, eqDeleteFile.content()); - Assert.assertEquals(ImmutableList.of( - keyFunc.apply("aaa"), - keyFunc.apply("ccc"), - keyFunc.apply("bbb") - ), readRecordsAsList(eqDeleteRowSchema, eqDeleteFile.path())); + Assert.assertEquals( + ImmutableList.of(keyFunc.apply("aaa"), keyFunc.apply("ccc"), keyFunc.apply("bbb")), + readRecordsAsList(eqDeleteRowSchema, eqDeleteFile.path())); // Check records in the pos-delete file. DeleteFile posDeleteFile = result.deleteFiles()[1]; Schema posDeleteSchema = DeleteSchemaUtil.pathPosSchema(); Assert.assertEquals(FileContent.POSITION_DELETES, posDeleteFile.content()); - Assert.assertEquals(ImmutableList.of( - posRecord.copy("file_path", dataFile.path(), "pos", 0L) - ), readRecordsAsList(posDeleteSchema, posDeleteFile.path())); + Assert.assertEquals( + ImmutableList.of(posRecord.copy("file_path", dataFile.path(), "pos", 0L)), + readRecordsAsList(posDeleteSchema, posDeleteFile.path())); } @Test @@ -330,16 +337,21 @@ public void testUpsertDataWithFullRowSchema() throws IOException { // Commit the 1th transaction. 
WriteResult result = deltaWriter.complete(); Assert.assertEquals("Should have a data file", 1, result.dataFiles().length); - Assert.assertEquals("Should have a pos-delete file for deduplication purpose", 1, result.deleteFiles().length); - Assert.assertEquals("Should be pos-delete file", FileContent.POSITION_DELETES, result.deleteFiles()[0].content()); + Assert.assertEquals( + "Should have a pos-delete file for deduplication purpose", 1, result.deleteFiles().length); + Assert.assertEquals( + "Should be pos-delete file", + FileContent.POSITION_DELETES, + result.deleteFiles()[0].content()); Assert.assertEquals(1, result.referencedDataFiles().length); commitTransaction(result); - Assert.assertEquals("Should have expected records", expectedRowSet(ImmutableList.of( - createRecord(2, "bbb"), - createRecord(3, "aaa"), - createRecord(4, "ccc") - )), actualRowSet("*")); + Assert.assertEquals( + "Should have expected records", + expectedRowSet( + ImmutableList.of( + createRecord(2, "bbb"), createRecord(3, "aaa"), createRecord(4, "ccc"))), + actualRowSet("*")); // Start the 2nd transaction. deltaWriter = createTaskWriter(eqDeleteFieldIds, eqDeleteRowSchema); @@ -366,35 +378,31 @@ public void testUpsertDataWithFullRowSchema() throws IOException { Assert.assertEquals(1, result.referencedDataFiles().length); commitTransaction(result); - Assert.assertEquals("Should have expected records", expectedRowSet(ImmutableList.of( - createRecord(6, "aaa"), - createRecord(7, "ccc") - )), actualRowSet("*")); + Assert.assertEquals( + "Should have expected records", + expectedRowSet(ImmutableList.of(createRecord(6, "aaa"), createRecord(7, "ccc"))), + actualRowSet("*")); // Check records in the data file. DataFile dataFile = result.dataFiles()[0]; - Assert.assertEquals(ImmutableList.of( - createRecord(5, "aaa"), - createRecord(6, "aaa"), - createRecord(7, "ccc") - ), readRecordsAsList(table.schema(), dataFile.path())); + Assert.assertEquals( + ImmutableList.of(createRecord(5, "aaa"), createRecord(6, "aaa"), createRecord(7, "ccc")), + readRecordsAsList(table.schema(), dataFile.path())); // Check records in the eq-delete file. DeleteFile eqDeleteFile = result.deleteFiles()[0]; Assert.assertEquals(FileContent.EQUALITY_DELETES, eqDeleteFile.content()); - Assert.assertEquals(ImmutableList.of( - createRecord(3, "aaa"), - createRecord(4, "ccc"), - createRecord(2, "bbb") - ), readRecordsAsList(eqDeleteRowSchema, eqDeleteFile.path())); + Assert.assertEquals( + ImmutableList.of(createRecord(3, "aaa"), createRecord(4, "ccc"), createRecord(2, "bbb")), + readRecordsAsList(eqDeleteRowSchema, eqDeleteFile.path())); // Check records in the pos-delete file. 
DeleteFile posDeleteFile = result.deleteFiles()[1]; Schema posDeleteSchema = DeleteSchemaUtil.pathPosSchema(); Assert.assertEquals(FileContent.POSITION_DELETES, posDeleteFile.content()); - Assert.assertEquals(ImmutableList.of( - posRecord.copy("file_path", dataFile.path(), "pos", 0L) - ), readRecordsAsList(posDeleteSchema, posDeleteFile.path())); + Assert.assertEquals( + ImmutableList.of(posRecord.copy("file_path", dataFile.path(), "pos", 0L)), + readRecordsAsList(posDeleteSchema, posDeleteFile.path())); } private void commitTransaction(WriteResult result) { @@ -402,7 +410,8 @@ private void commitTransaction(WriteResult result) { Arrays.stream(result.dataFiles()).forEach(rowDelta::addRows); Arrays.stream(result.deleteFiles()).forEach(rowDelta::addDeletes); - rowDelta.validateDeletedFiles() + rowDelta + .validateDeletedFiles() .validateDataFilesExist(Lists.newArrayList(result.referencedDataFiles())) .commit(); } @@ -424,13 +433,19 @@ private StructLikeSet actualRowSet(String... columns) throws IOException { /** * Create a generic task equality delta writer. * - * @param equalityFieldIds defines the equality field ids. - * @param eqDeleteRowSchema defines the schema of rows that eq-delete writer will write, it could be the entire fields - * of the table schema. + * @param equalityFieldIds defines the equality field ids. + * @param eqDeleteRowSchema defines the schema of rows that eq-delete writer will write, it could + * be the entire fields of the table schema. */ - private GenericTaskDeltaWriter createTaskWriter(List equalityFieldIds, Schema eqDeleteRowSchema) { - FileAppenderFactory appenderFactory = new GenericAppenderFactory(table.schema(), table.spec(), - ArrayUtil.toIntArray(equalityFieldIds), eqDeleteRowSchema, null); + private GenericTaskDeltaWriter createTaskWriter( + List equalityFieldIds, Schema eqDeleteRowSchema) { + FileAppenderFactory appenderFactory = + new GenericAppenderFactory( + table.schema(), + table.spec(), + ArrayUtil.toIntArray(equalityFieldIds), + eqDeleteRowSchema, + null); List columns = Lists.newArrayList(); for (Integer fieldId : equalityFieldIds) { @@ -438,16 +453,29 @@ private GenericTaskDeltaWriter createTaskWriter(List equalityFieldIds, } Schema deleteSchema = table.schema().select(columns); - return new GenericTaskDeltaWriter(table.schema(), deleteSchema, table.spec(), format, appenderFactory, - fileFactory, table.io(), TARGET_FILE_SIZE); + return new GenericTaskDeltaWriter( + table.schema(), + deleteSchema, + table.spec(), + format, + appenderFactory, + fileFactory, + table.io(), + TARGET_FILE_SIZE); } private static class GenericTaskDeltaWriter extends BaseTaskWriter { private final GenericEqualityDeltaWriter deltaWriter; - private GenericTaskDeltaWriter(Schema schema, Schema deleteSchema, PartitionSpec spec, FileFormat format, - FileAppenderFactory appenderFactory, - OutputFileFactory fileFactory, FileIO io, long targetFileSize) { + private GenericTaskDeltaWriter( + Schema schema, + Schema deleteSchema, + PartitionSpec spec, + FileFormat format, + FileAppenderFactory appenderFactory, + OutputFileFactory fileFactory, + FileIO io, + long targetFileSize) { super(spec, format, appenderFactory, fileFactory, io, targetFileSize); this.deltaWriter = new GenericEqualityDeltaWriter(null, schema, deleteSchema); } @@ -472,7 +500,8 @@ public void close() throws IOException { } private class GenericEqualityDeltaWriter extends BaseEqualityDeltaWriter { - private GenericEqualityDeltaWriter(PartitionKey partition, Schema schema, Schema eqDeleteSchema) { + private 
GenericEqualityDeltaWriter( + PartitionKey partition, Schema schema, Schema eqDeleteSchema) { super(partition, schema, eqDeleteSchema); } @@ -494,24 +523,25 @@ private List readRecordsAsList(Schema schema, CharSequence path) throws InputFile inputFile = Files.localInput(path.toString()); switch (format) { case PARQUET: - iterable = Parquet.read(inputFile) - .project(schema) - .createReaderFunc(fileSchema -> GenericParquetReaders.buildReader(schema, fileSchema)) - .build(); + iterable = + Parquet.read(inputFile) + .project(schema) + .createReaderFunc( + fileSchema -> GenericParquetReaders.buildReader(schema, fileSchema)) + .build(); break; case AVRO: - iterable = Avro.read(inputFile) - .project(schema) - .createReaderFunc(DataReader::create) - .build(); + iterable = + Avro.read(inputFile).project(schema).createReaderFunc(DataReader::create).build(); break; case ORC: - iterable = ORC.read(inputFile) - .project(schema) - .createReaderFunc(fileSchema -> GenericOrcReader.buildReader(schema, fileSchema)) - .build(); + iterable = + ORC.read(inputFile) + .project(schema) + .createReaderFunc(fileSchema -> GenericOrcReader.buildReader(schema, fileSchema)) + .build(); break; default: diff --git a/data/src/test/java/org/apache/iceberg/io/TestWriterMetrics.java b/data/src/test/java/org/apache/iceberg/io/TestWriterMetrics.java index c1575b46743b..b559b976afa1 100644 --- a/data/src/test/java/org/apache/iceberg/io/TestWriterMetrics.java +++ b/data/src/test/java/org/apache/iceberg/io/TestWriterMetrics.java @@ -16,9 +16,11 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.io; +import static org.apache.iceberg.types.Types.NestedField.optional; +import static org.apache.iceberg.types.Types.NestedField.required; + import java.io.File; import java.io.IOException; import java.nio.ByteBuffer; @@ -51,9 +53,6 @@ import org.junit.runner.RunWith; import org.junit.runners.Parameterized; -import static org.apache.iceberg.types.Types.NestedField.optional; -import static org.apache.iceberg.types.Types.NestedField.required; - @RunWith(Parameterized.class) public abstract class TestWriterMetrics { @@ -62,18 +61,15 @@ public abstract class TestWriterMetrics { protected static final Types.NestedField ID_FIELD = required(1, "id", Types.IntegerType.get()); protected static final Types.NestedField DATA_FIELD = optional(2, "data", Types.StringType.get()); - protected static final Types.StructType NESTED_FIELDS = Types.StructType.of( - required(4, "booleanField", Types.BooleanType.get()), - optional(5, "longValue", Types.LongType.get())); + protected static final Types.StructType NESTED_FIELDS = + Types.StructType.of( + required(4, "booleanField", Types.BooleanType.get()), + optional(5, "longValue", Types.LongType.get())); protected static final Types.NestedField STRUCT_FIELD = optional(3, "structField", NESTED_FIELDS); // create a schema with all supported fields - protected static final Schema SCHEMA = new Schema( - ID_FIELD, - DATA_FIELD, - STRUCT_FIELD - ); + protected static final Schema SCHEMA = new Schema(ID_FIELD, DATA_FIELD, STRUCT_FIELD); protected static final SortOrder sortOrder = SortOrder.builderFor(SCHEMA).asc("id").asc("structField.longValue").build(); @@ -81,8 +77,7 @@ public abstract class TestWriterMetrics { protected static final Map properties = ImmutableMap.of(TableProperties.DEFAULT_WRITE_METRICS_MODE, "none"); - @Rule - public TemporaryFolder temp = new TemporaryFolder(); + @Rule public TemporaryFolder temp = new TemporaryFolder(); protected 
FileFormat fileFormat; protected TestTables.TestTable table = null; @@ -90,10 +85,7 @@ public abstract class TestWriterMetrics { @Parameterized.Parameters(name = "FileFormat = {0}") public static Object[][] parameters() { - return new Object[][] { - {FileFormat.ORC}, - {FileFormat.PARQUET} - }; + return new Object[][] {{FileFormat.ORC}, {FileFormat.PARQUET}}; } public TestWriterMetrics(FileFormat fileFormat) { @@ -111,13 +103,9 @@ public void setupTable() throws Exception { File tableDir = temp.newFolder(); tableDir.delete(); // created by table create - this.table = TestTables.create( - tableDir, - "test", - SCHEMA, - PartitionSpec.unpartitioned(), - sortOrder, - FORMAT_V2); + this.table = + TestTables.create( + tableDir, "test", SCHEMA, PartitionSpec.unpartitioned(), sortOrder, FORMAT_V2); table.updateProperties().set(TableProperties.DEFAULT_WRITE_METRICS_MODE, "none").commit(); this.fileFactory = OutputFileFactory.builderFor(table, 1, 1).format(fileFormat).build(); @@ -131,36 +119,39 @@ public void after() { @Test public void verifySortedColMetric() throws Exception { T row = toRow(3, "3", true, 3L); - DataWriter dataWriter = newWriterFactory(table).newDataWriter( - fileFactory.newOutputFile(), - PartitionSpec.unpartitioned(), - null - ); + DataWriter dataWriter = + newWriterFactory(table) + .newDataWriter(fileFactory.newOutputFile(), PartitionSpec.unpartitioned(), null); dataWriter.write(row); dataWriter.close(); DataFile dataFile = dataWriter.toDataFile(); // Only two sorted fields (id, structField.longValue) will have metrics Map lowerBounds = dataFile.lowerBounds(); - Assert.assertEquals(3, (int) Conversions.fromByteBuffer(Types.IntegerType.get(), lowerBounds.get(1))); + Assert.assertEquals( + 3, (int) Conversions.fromByteBuffer(Types.IntegerType.get(), lowerBounds.get(1))); Assert.assertFalse(lowerBounds.containsKey(2)); Assert.assertFalse(lowerBounds.containsKey(3)); Assert.assertFalse(lowerBounds.containsKey(4)); - Assert.assertEquals(3L, (long) Conversions.fromByteBuffer(Types.LongType.get(), lowerBounds.get(5))); + Assert.assertEquals( + 3L, (long) Conversions.fromByteBuffer(Types.LongType.get(), lowerBounds.get(5))); Map upperBounds = dataFile.upperBounds(); - Assert.assertEquals(3, (int) Conversions.fromByteBuffer(Types.IntegerType.get(), upperBounds.get(1))); + Assert.assertEquals( + 3, (int) Conversions.fromByteBuffer(Types.IntegerType.get(), upperBounds.get(1))); Assert.assertFalse(upperBounds.containsKey(2)); Assert.assertFalse(upperBounds.containsKey(3)); Assert.assertFalse(upperBounds.containsKey(4)); - Assert.assertEquals(3L, (long) Conversions.fromByteBuffer(Types.LongType.get(), upperBounds.get(5))); + Assert.assertEquals( + 3L, (long) Conversions.fromByteBuffer(Types.LongType.get(), upperBounds.get(5))); } @Test public void testPositionDeleteMetrics() throws IOException { FileWriterFactory writerFactory = newWriterFactory(table); EncryptedOutputFile outputFile = fileFactory.newOutputFile(); - PositionDeleteWriter deleteWriter = writerFactory.newPositionDeleteWriter(outputFile, table.spec(), null); + PositionDeleteWriter deleteWriter = + writerFactory.newPositionDeleteWriter(outputFile, table.spec(), null); try { T deletedRow = toRow(3, "3", true, 3L); @@ -176,33 +167,40 @@ public void testPositionDeleteMetrics() throws IOException { int pathFieldId = MetadataColumns.DELETE_FILE_PATH.fieldId(); int posFieldId = MetadataColumns.DELETE_FILE_POS.fieldId(); - // should have metrics for _file and _pos as well as two sorted fields (id, structField.longValue) + // should 
have metrics for _file and _pos as well as two sorted fields (id, + // structField.longValue) Map lowerBounds = deleteFile.lowerBounds(); Assert.assertEquals( CharBuffer.wrap("File A"), Conversions.fromByteBuffer(Types.StringType.get(), lowerBounds.get(pathFieldId))); - Assert.assertEquals(1L, (long) Conversions.fromByteBuffer(Types.LongType.get(), lowerBounds.get(posFieldId))); + Assert.assertEquals( + 1L, (long) Conversions.fromByteBuffer(Types.LongType.get(), lowerBounds.get(posFieldId))); - Assert.assertEquals(3, (int) Conversions.fromByteBuffer(Types.IntegerType.get(), lowerBounds.get(1))); + Assert.assertEquals( + 3, (int) Conversions.fromByteBuffer(Types.IntegerType.get(), lowerBounds.get(1))); Assert.assertFalse(lowerBounds.containsKey(2)); Assert.assertFalse(lowerBounds.containsKey(3)); Assert.assertFalse(lowerBounds.containsKey(4)); - Assert.assertEquals(3L, (long) Conversions.fromByteBuffer(Types.LongType.get(), lowerBounds.get(5))); + Assert.assertEquals( + 3L, (long) Conversions.fromByteBuffer(Types.LongType.get(), lowerBounds.get(5))); Map upperBounds = deleteFile.upperBounds(); Assert.assertEquals( CharBuffer.wrap("File A"), Conversions.fromByteBuffer(Types.StringType.get(), upperBounds.get(pathFieldId))); - Assert.assertEquals(1L, (long) Conversions.fromByteBuffer(Types.LongType.get(), upperBounds.get(posFieldId))); + Assert.assertEquals( + 1L, (long) Conversions.fromByteBuffer(Types.LongType.get(), upperBounds.get(posFieldId))); - Assert.assertEquals(3, (int) Conversions.fromByteBuffer(Types.IntegerType.get(), upperBounds.get(1))); + Assert.assertEquals( + 3, (int) Conversions.fromByteBuffer(Types.IntegerType.get(), upperBounds.get(1))); Assert.assertFalse(upperBounds.containsKey(2)); Assert.assertFalse(upperBounds.containsKey(3)); Assert.assertFalse(upperBounds.containsKey(4)); - Assert.assertEquals(3L, (long) Conversions.fromByteBuffer(Types.LongType.get(), upperBounds.get(5))); + Assert.assertEquals( + 3L, (long) Conversions.fromByteBuffer(Types.LongType.get(), upperBounds.get(5))); } @Test @@ -217,22 +215,21 @@ public void testMaxColumns() throws IOException { } Schema maxColSchema = new Schema(fields); - Table maxColumnTable = TestTables.create( - tableDir, - "max_col_table", - maxColSchema, - PartitionSpec.unpartitioned(), - SortOrder.unsorted(), - FORMAT_V2); - OutputFileFactory maxColFactory = OutputFileFactory.builderFor(maxColumnTable, 1, 1) - .format(fileFormat).build(); + Table maxColumnTable = + TestTables.create( + tableDir, + "max_col_table", + maxColSchema, + PartitionSpec.unpartitioned(), + SortOrder.unsorted(), + FORMAT_V2); + OutputFileFactory maxColFactory = + OutputFileFactory.builderFor(maxColumnTable, 1, 1).format(fileFormat).build(); T row = toGenericRow(1, numColumns); - DataWriter dataWriter = newWriterFactory(maxColumnTable).newDataWriter( - maxColFactory.newOutputFile(), - PartitionSpec.unpartitioned(), - null - ); + DataWriter dataWriter = + newWriterFactory(maxColumnTable) + .newDataWriter(maxColFactory.newOutputFile(), PartitionSpec.unpartitioned(), null); dataWriter.add(row); dataWriter.close(); DataFile dataFile = dataWriter.toDataFile(); @@ -242,7 +239,9 @@ public void testMaxColumns() throws IOException { for (; id <= 32; id += 1) { Assert.assertNotNull("Should have lower bound metrics", dataFile.lowerBounds().get(id)); Assert.assertNotNull("Should have upper bound metrics", dataFile.upperBounds().get(id)); - Assert.assertNull("Should not have nan value metrics (not floating point)", dataFile.nanValueCounts().get(id)); + 
Assert.assertNull( + "Should not have nan value metrics (not floating point)", + dataFile.nanValueCounts().get(id)); Assert.assertNotNull("Should have null value metrics", dataFile.nullValueCounts().get(id)); Assert.assertNotNull("Should have value metrics", dataFile.valueCounts().get(id)); } @@ -252,7 +251,8 @@ public void testMaxColumns() throws IOException { Assert.assertNull("Should not have any lower bound metrics", dataFile.lowerBounds().get(id)); Assert.assertNull("Should not have any upper bound metrics", dataFile.upperBounds().get(id)); Assert.assertNull("Should not have any nan value metrics", dataFile.nanValueCounts().get(id)); - Assert.assertNull("Should not have any null value metrics", dataFile.nullValueCounts().get(id)); + Assert.assertNull( + "Should not have any null value metrics", dataFile.nullValueCounts().get(id)); Assert.assertNull("Should not have any value metrics", dataFile.valueCounts().get(id)); } } @@ -269,24 +269,27 @@ public void testMaxColumnsWithDefaultOverride() throws IOException { } Schema maxColSchema = new Schema(fields); - Table maxColumnTable = TestTables.create( - tableDir, - "max_col_table", - maxColSchema, - PartitionSpec.unpartitioned(), - SortOrder.unsorted(), - FORMAT_V2); - maxColumnTable.updateProperties().set(TableProperties.DEFAULT_WRITE_METRICS_MODE, - TableProperties.DEFAULT_WRITE_METRICS_MODE_DEFAULT).commit(); - OutputFileFactory maxColFactory = OutputFileFactory.builderFor(maxColumnTable, 1, 1) - .format(fileFormat).build(); + Table maxColumnTable = + TestTables.create( + tableDir, + "max_col_table", + maxColSchema, + PartitionSpec.unpartitioned(), + SortOrder.unsorted(), + FORMAT_V2); + maxColumnTable + .updateProperties() + .set( + TableProperties.DEFAULT_WRITE_METRICS_MODE, + TableProperties.DEFAULT_WRITE_METRICS_MODE_DEFAULT) + .commit(); + OutputFileFactory maxColFactory = + OutputFileFactory.builderFor(maxColumnTable, 1, 1).format(fileFormat).build(); T row = toGenericRow(1, numColumns); - DataWriter dataWriter = newWriterFactory(maxColumnTable).newDataWriter( - maxColFactory.newOutputFile(), - PartitionSpec.unpartitioned(), - null - ); + DataWriter dataWriter = + newWriterFactory(maxColumnTable) + .newDataWriter(maxColFactory.newOutputFile(), PartitionSpec.unpartitioned(), null); dataWriter.add(row); dataWriter.close(); DataFile dataFile = dataWriter.toDataFile(); @@ -295,8 +298,10 @@ public void testMaxColumnsWithDefaultOverride() throws IOException { Map upperBounds = dataFile.upperBounds(); Map lowerBounds = dataFile.upperBounds(); for (int i = 0; i < numColumns; i++) { - Assert.assertEquals(1, (int) Conversions.fromByteBuffer(Types.IntegerType.get(), upperBounds.get(1))); - Assert.assertEquals(1, (int) Conversions.fromByteBuffer(Types.IntegerType.get(), lowerBounds.get(1))); + Assert.assertEquals( + 1, (int) Conversions.fromByteBuffer(Types.IntegerType.get(), upperBounds.get(1))); + Assert.assertEquals( + 1, (int) Conversions.fromByteBuffer(Types.IntegerType.get(), lowerBounds.get(1))); } } } diff --git a/data/src/test/java/org/apache/iceberg/io/WriterTestBase.java b/data/src/test/java/org/apache/iceberg/io/WriterTestBase.java index 661ab642f516..b8e99515598a 100644 --- a/data/src/test/java/org/apache/iceberg/io/WriterTestBase.java +++ b/data/src/test/java/org/apache/iceberg/io/WriterTestBase.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.io; import java.io.IOException; @@ -40,16 +39,19 @@ public WriterTestBase(int formatVersion) { super(formatVersion); } - protected abstract FileWriterFactory newWriterFactory(Schema dataSchema, List equalityFieldIds, - Schema equalityDeleteRowSchema, - Schema positionDeleteRowSchema); + protected abstract FileWriterFactory newWriterFactory( + Schema dataSchema, + List equalityFieldIds, + Schema equalityDeleteRowSchema, + Schema positionDeleteRowSchema); - protected FileWriterFactory newWriterFactory(Schema dataSchema, List equalityFieldIds, - Schema equalityDeleteRowSchema) { + protected FileWriterFactory newWriterFactory( + Schema dataSchema, List equalityFieldIds, Schema equalityDeleteRowSchema) { return newWriterFactory(dataSchema, equalityFieldIds, equalityDeleteRowSchema, null); } - protected FileWriterFactory newWriterFactory(Schema dataSchema, Schema positionDeleteRowSchema) { + protected FileWriterFactory newWriterFactory( + Schema dataSchema, Schema positionDeleteRowSchema) { return newWriterFactory(dataSchema, null, null, positionDeleteRowSchema); } @@ -76,8 +78,13 @@ protected StructLikeSet actualRowSet(String... columns) throws IOException { return set; } - protected DataFile writeData(FileWriterFactory writerFactory, OutputFileFactory fileFactory, - List rows, PartitionSpec spec, StructLike partitionKey) throws IOException { + protected DataFile writeData( + FileWriterFactory writerFactory, + OutputFileFactory fileFactory, + List rows, + PartitionSpec spec, + StructLike partitionKey) + throws IOException { EncryptedOutputFile file = fileFactory.newOutputFile(spec, partitionKey); DataWriter writer = writerFactory.newDataWriter(file, spec, partitionKey); diff --git a/data/src/test/java/org/apache/iceberg/orc/TestOrcMetrics.java b/data/src/test/java/org/apache/iceberg/orc/TestOrcMetrics.java index 975b695748aa..724970bc09ed 100644 --- a/data/src/test/java/org/apache/iceberg/orc/TestOrcMetrics.java +++ b/data/src/test/java/org/apache/iceberg/orc/TestOrcMetrics.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.orc; import java.io.File; @@ -42,18 +41,16 @@ import org.junit.runner.RunWith; import org.junit.runners.Parameterized; -/** - * Test Metrics for ORC. - */ +/** Test Metrics for ORC. */ @RunWith(Parameterized.class) public class TestOrcMetrics extends TestMetrics { - static final ImmutableSet BINARY_TYPES = ImmutableSet.of(Type.TypeID.BINARY, - Type.TypeID.FIXED, Type.TypeID.UUID); + static final ImmutableSet BINARY_TYPES = + ImmutableSet.of(Type.TypeID.BINARY, Type.TypeID.FIXED, Type.TypeID.UUID); @Parameterized.Parameters(name = "formatVersion = {0}") public static Object[] parameters() { - return new Object[] { 1, 2 }; + return new Object[] {1, 2}; } public TestOrcMetrics(int formatVersion) { @@ -78,23 +75,31 @@ public Metrics getMetrics(Schema schema, Record... records) throws IOException { } @Override - public Metrics getMetrics(Schema schema, MetricsConfig metricsConfig, Record... records) throws IOException { + public Metrics getMetrics(Schema schema, MetricsConfig metricsConfig, Record... records) + throws IOException { return getMetrics(schema, createOutputFile(), ImmutableMap.of(), metricsConfig, records); } @Override - protected Metrics getMetricsForRecordsWithSmallRowGroups(Schema schema, OutputFile outputFile, Record... records) { + protected Metrics getMetricsForRecordsWithSmallRowGroups( + Schema schema, OutputFile outputFile, Record... 
records) { throw new UnsupportedOperationException("supportsSmallRowGroups = " + supportsSmallRowGroups()); } - private Metrics getMetrics(Schema schema, OutputFile file, Map properties, - MetricsConfig metricsConfig, Record... records) throws IOException { - FileAppender writer = ORC.write(file) - .schema(schema) - .setAll(properties) - .createWriterFunc(GenericOrcWriter::buildWriter) - .metricsConfig(metricsConfig) - .build(); + private Metrics getMetrics( + Schema schema, + OutputFile file, + Map properties, + MetricsConfig metricsConfig, + Record... records) + throws IOException { + FileAppender writer = + ORC.write(file) + .schema(schema) + .setAll(properties) + .createWriterFunc(GenericOrcWriter::buildWriter) + .metricsConfig(metricsConfig) + .build(); try (FileAppender appender = writer) { appender.addAll(Lists.newArrayList(records)); } @@ -111,11 +116,14 @@ private boolean isBinaryType(Type type) { } @Override - protected void assertBounds(int fieldId, Type type, T lowerBound, T upperBound, Metrics metrics) { + protected void assertBounds( + int fieldId, Type type, T lowerBound, T upperBound, Metrics metrics) { if (isBinaryType(type)) { - Assert.assertFalse("ORC binary field should not have lower bounds.", + Assert.assertFalse( + "ORC binary field should not have lower bounds.", metrics.lowerBounds().containsKey(fieldId)); - Assert.assertFalse("ORC binary field should not have upper bounds.", + Assert.assertFalse( + "ORC binary field should not have upper bounds.", metrics.upperBounds().containsKey(fieldId)); return; } diff --git a/data/src/test/java/org/apache/iceberg/parquet/TestGenericMergingMetrics.java b/data/src/test/java/org/apache/iceberg/parquet/TestGenericMergingMetrics.java index 72db36a9908c..b1c03872981a 100644 --- a/data/src/test/java/org/apache/iceberg/parquet/TestGenericMergingMetrics.java +++ b/data/src/test/java/org/apache/iceberg/parquet/TestGenericMergingMetrics.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.parquet; import java.io.IOException; @@ -35,8 +34,9 @@ public TestGenericMergingMetrics(FileFormat fileFormat) { @Override protected FileAppender writeAndGetAppender(List records) throws IOException { - FileAppender appender = new GenericAppenderFactory(SCHEMA).newAppender( - org.apache.iceberg.Files.localOutput(temp.newFile()), fileFormat); + FileAppender appender = + new GenericAppenderFactory(SCHEMA) + .newAppender(org.apache.iceberg.Files.localOutput(temp.newFile()), fileFormat); try (FileAppender fileAppender = appender) { records.forEach(fileAppender::add); } diff --git a/data/src/test/java/org/apache/iceberg/parquet/TestParquetMetrics.java b/data/src/test/java/org/apache/iceberg/parquet/TestParquetMetrics.java index 457d9c3df2fc..f363e5d979d3 100644 --- a/data/src/test/java/org/apache/iceberg/parquet/TestParquetMetrics.java +++ b/data/src/test/java/org/apache/iceberg/parquet/TestParquetMetrics.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.parquet; import java.io.File; @@ -41,17 +40,15 @@ import org.junit.runner.RunWith; import org.junit.runners.Parameterized; -/** - * Test Metrics for Parquet. - */ +/** Test Metrics for Parquet. 
*/ @RunWith(Parameterized.class) public class TestParquetMetrics extends TestMetrics { - private static final Map SMALL_ROW_GROUP_CONFIG = ImmutableMap.of( - TableProperties.PARQUET_ROW_GROUP_SIZE_BYTES, "1600"); + private static final Map SMALL_ROW_GROUP_CONFIG = + ImmutableMap.of(TableProperties.PARQUET_ROW_GROUP_SIZE_BYTES, "1600"); @Parameterized.Parameters(name = "formatVersion = {0}") public static Object[] parameters() { - return new Object[] { 1, 2 }; + return new Object[] {1, 2}; } public TestParquetMetrics(int formatVersion) { @@ -76,18 +73,25 @@ public Metrics getMetrics(Schema schema, Record... records) throws IOException { } @Override - public Metrics getMetrics(Schema schema, MetricsConfig metricsConfig, Record... records) throws IOException { + public Metrics getMetrics(Schema schema, MetricsConfig metricsConfig, Record... records) + throws IOException { return getMetrics(schema, createOutputFile(), ImmutableMap.of(), metricsConfig, records); } - private Metrics getMetrics(Schema schema, OutputFile file, Map properties, - MetricsConfig metricsConfig, Record... records) throws IOException { - FileAppender writer = Parquet.write(file) - .schema(schema) - .setAll(properties) - .createWriterFunc(GenericParquetWriter::buildWriter) - .metricsConfig(metricsConfig) - .build(); + private Metrics getMetrics( + Schema schema, + OutputFile file, + Map properties, + MetricsConfig metricsConfig, + Record... records) + throws IOException { + FileAppender writer = + Parquet.write(file) + .schema(schema) + .setAll(properties) + .createWriterFunc(GenericParquetWriter::buildWriter) + .metricsConfig(metricsConfig) + .build(); try (FileAppender appender = writer) { appender.addAll(Lists.newArrayList(records)); } @@ -97,7 +101,8 @@ private Metrics getMetrics(Schema schema, OutputFile file, Map p @Override protected Metrics getMetricsForRecordsWithSmallRowGroups( Schema schema, OutputFile outputFile, Record... records) throws IOException { - return getMetrics(schema, outputFile, SMALL_ROW_GROUP_CONFIG, MetricsConfig.getDefault(), records); + return getMetrics( + schema, outputFile, SMALL_ROW_GROUP_CONFIG, MetricsConfig.getDefault(), records); } @Override diff --git a/dell/src/main/java/org/apache/iceberg/dell/DellClientFactories.java b/dell/src/main/java/org/apache/iceberg/dell/DellClientFactories.java index 1048b231ed8a..614fc017063f 100644 --- a/dell/src/main/java/org/apache/iceberg/dell/DellClientFactories.java +++ b/dell/src/main/java/org/apache/iceberg/dell/DellClientFactories.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.dell; import com.emc.object.s3.S3Client; @@ -29,12 +28,12 @@ public class DellClientFactories { - private DellClientFactories() { - } + private DellClientFactories() {} public static DellClientFactory from(Map properties) { - String factoryImpl = PropertyUtil.propertyAsString( - properties, DellProperties.CLIENT_FACTORY, DefaultDellClientFactory.class.getName()); + String factoryImpl = + PropertyUtil.propertyAsString( + properties, DellProperties.CLIENT_FACTORY, DefaultDellClientFactory.class.getName()); return loadClientFactory(factoryImpl, properties); } @@ -43,8 +42,10 @@ private static DellClientFactory loadClientFactory(String impl, Map properties) { this.ecsS3AccessKeyId = properties.get(DellProperties.ECS_S3_ACCESS_KEY_ID); diff --git a/dell/src/main/java/org/apache/iceberg/dell/ecs/BaseEcsFile.java b/dell/src/main/java/org/apache/iceberg/dell/ecs/BaseEcsFile.java index 6494ae7c4649..479b466e3927 100644 --- a/dell/src/main/java/org/apache/iceberg/dell/ecs/BaseEcsFile.java +++ b/dell/src/main/java/org/apache/iceberg/dell/ecs/BaseEcsFile.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.dell.ecs; import com.emc.object.s3.S3Client; diff --git a/dell/src/main/java/org/apache/iceberg/dell/ecs/EcsAppendOutputStream.java b/dell/src/main/java/org/apache/iceberg/dell/ecs/EcsAppendOutputStream.java index 779a9cf23b22..733cf448d328 100644 --- a/dell/src/main/java/org/apache/iceberg/dell/ecs/EcsAppendOutputStream.java +++ b/dell/src/main/java/org/apache/iceberg/dell/ecs/EcsAppendOutputStream.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.dell.ecs; import com.emc.object.s3.S3Client; @@ -29,9 +28,7 @@ import org.apache.iceberg.metrics.MetricsContext.Counter; import org.apache.iceberg.metrics.MetricsContext.Unit; -/** - * Use ECS append API to write data. - */ +/** Use ECS append API to write data. */ class EcsAppendOutputStream extends PositionOutputStream { private final S3Client client; @@ -40,49 +37,42 @@ class EcsAppendOutputStream extends PositionOutputStream { /** * Local bytes cache that avoid too many requests - *
<p>
- * Use {@link ByteBuffer} to maintain offset. + * + * <p>
Use {@link ByteBuffer} to maintain offset. */ private final ByteBuffer localCache; - /** - * A marker for data file to put first part instead of append first part. - */ + /** A marker for data file to put first part instead of append first part. */ private boolean firstPart = true; - /** - * Pos for {@link PositionOutputStream} - */ + /** Pos for {@link PositionOutputStream} */ private long pos; private final Counter writeBytes; private final Counter writeOperations; - private EcsAppendOutputStream(S3Client client, EcsURI uri, byte[] localCache, MetricsContext metrics) { + private EcsAppendOutputStream( + S3Client client, EcsURI uri, byte[] localCache, MetricsContext metrics) { this.client = client; this.uri = uri; this.localCache = ByteBuffer.wrap(localCache); this.writeBytes = metrics.counter(FileIOMetricsContext.WRITE_BYTES, Long.class, Unit.BYTES); - this.writeOperations = metrics.counter(FileIOMetricsContext.WRITE_OPERATIONS, Integer.class, Unit.COUNT); + this.writeOperations = + metrics.counter(FileIOMetricsContext.WRITE_OPERATIONS, Integer.class, Unit.COUNT); } - /** - * Use built-in 1 KiB byte buffer - */ + /** Use built-in 1 KiB byte buffer */ static EcsAppendOutputStream create(S3Client client, EcsURI uri, MetricsContext metrics) { return createWithBufferSize(client, uri, 1024, metrics); } - /** - * Create {@link PositionOutputStream} with specific buffer size. - */ - static EcsAppendOutputStream createWithBufferSize(S3Client client, EcsURI uri, int size, MetricsContext metrics) { + /** Create {@link PositionOutputStream} with specific buffer size. */ + static EcsAppendOutputStream createWithBufferSize( + S3Client client, EcsURI uri, int size, MetricsContext metrics) { return new EcsAppendOutputStream(client, uri, new byte[size], metrics); } - /** - * Write a byte. If buffer is full, upload the buffer. - */ + /** Write a byte. If buffer is full, upload the buffer. */ @Override public void write(int b) { if (!checkBuffer(1)) { @@ -96,9 +86,8 @@ public void write(int b) { } /** - * Write a byte. - * If buffer is full, upload the buffer. - * If buffer size < input bytes, upload input bytes. + * Write a byte. If buffer is full, upload the buffer. If buffer size < input bytes, upload + * input bytes. */ @Override public void write(byte[] b, int off, int len) { @@ -124,25 +113,23 @@ private boolean checkBuffer(int nextWrite) { private void flushBuffer(byte[] buffer, int offset, int length) { if (firstPart) { - client.putObject(new PutObjectRequest(uri.bucket(), uri.name(), - new ByteArrayInputStream(buffer, offset, length))); + client.putObject( + new PutObjectRequest( + uri.bucket(), uri.name(), new ByteArrayInputStream(buffer, offset, length))); firstPart = false; } else { - client.appendObject(uri.bucket(), uri.name(), new ByteArrayInputStream(buffer, offset, length)); + client.appendObject( + uri.bucket(), uri.name(), new ByteArrayInputStream(buffer, offset, length)); } } - /** - * Pos of the file - */ + /** Pos of the file */ @Override public long getPos() { return pos; } - /** - * Write cached bytes if present. - */ + /** Write cached bytes if present. */ @Override public void flush() { if (localCache.remaining() < localCache.capacity()) { @@ -152,9 +139,7 @@ public void flush() { } } - /** - * Trigger flush() when closing stream. - */ + /** Trigger flush() when closing stream. 
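A minimal usage sketch of the buffer/flush path shown in the hunks above (not part of the patch): it assumes package-local access to the package-private classes, an already-configured ECS S3Client, and illustrative bucket/object names. The first flush issues a put that creates the object; later flushes append to it.

    package org.apache.iceberg.dell.ecs;

    import com.emc.object.s3.S3Client;
    import java.io.IOException;
    import java.nio.charset.StandardCharsets;
    import org.apache.iceberg.metrics.MetricsContext;

    class AppendStreamSketch {
      // `client` must point at a reachable ECS endpoint; the URI below is hypothetical.
      static void writeDemo(S3Client client) throws IOException {
        EcsURI uri = new EcsURI("demo-bucket", "tmp/append-demo.bin");
        try (EcsAppendOutputStream out =
            EcsAppendOutputStream.createWithBufferSize(
                client, uri, 1024, MetricsContext.nullMetrics())) {
          out.write("first".getBytes(StandardCharsets.UTF_8)); // cached in the local ByteBuffer
          out.flush(); // first flush -> putObject, which creates (or replaces) the object
          out.write("second".getBytes(StandardCharsets.UTF_8));
        } // close() flushes the remainder -> appendObject
      }
    }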
*/ @Override public void close() { flush(); diff --git a/dell/src/main/java/org/apache/iceberg/dell/ecs/EcsCatalog.java b/dell/src/main/java/org/apache/iceberg/dell/ecs/EcsCatalog.java index 09f1f7445e50..20831f147316 100644 --- a/dell/src/main/java/org/apache/iceberg/dell/ecs/EcsCatalog.java +++ b/dell/src/main/java/org/apache/iceberg/dell/ecs/EcsCatalog.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.dell.ecs; import com.emc.object.s3.S3Client; @@ -65,19 +64,13 @@ public class EcsCatalog extends BaseMetastoreCatalog implements Closeable, SupportsNamespaces, Configurable { - /** - * Suffix of table metadata object - */ + /** Suffix of table metadata object */ private static final String TABLE_OBJECT_SUFFIX = ".table"; - /** - * Suffix of namespace metadata object - */ + /** Suffix of namespace metadata object */ private static final String NAMESPACE_OBJECT_SUFFIX = ".namespace"; - /** - * Key of properties version in ECS object user metadata. - */ + /** Key of properties version in ECS object user metadata. */ private static final String PROPERTIES_VERSION_USER_METADATA_KEY = "iceberg_properties_version"; private static final Logger LOG = LoggerFactory.getLogger(EcsCatalog.class); @@ -86,25 +79,24 @@ public class EcsCatalog extends BaseMetastoreCatalog private Object hadoopConf; private String catalogName; - /** - * Warehouse is unified with other catalog that without delimiter. - */ + /** Warehouse is unified with other catalog that without delimiter. */ private EcsURI warehouseLocation; + private FileIO fileIO; private CloseableGroup closeableGroup; /** * No-arg constructor to load the catalog dynamically. - *
<p>
- * All fields are initialized by calling {@link EcsCatalog#initialize(String, Map)} later. + * + * <p>
All fields are initialized by calling {@link EcsCatalog#initialize(String, Map)} later. */ - public EcsCatalog() { - } + public EcsCatalog() {} @Override public void initialize(String name, Map properties) { String inputWarehouseLocation = properties.get(CatalogProperties.WAREHOUSE_LOCATION); - Preconditions.checkArgument(inputWarehouseLocation != null && inputWarehouseLocation.length() > 0, + Preconditions.checkArgument( + inputWarehouseLocation != null && inputWarehouseLocation.length() > 0, "Cannot initialize EcsCatalog because warehousePath must not be null or empty"); this.catalogName = name; @@ -131,18 +123,20 @@ private FileIO initializeFileIO(Map properties) { @Override protected TableOperations newTableOps(TableIdentifier tableIdentifier) { - return new EcsTableOperations(String.format("%s.%s", catalogName, tableIdentifier), - tableURI(tableIdentifier), fileIO, this); + return new EcsTableOperations( + String.format("%s.%s", catalogName, tableIdentifier), + tableURI(tableIdentifier), + fileIO, + this); } @Override protected String defaultWarehouseLocation(TableIdentifier tableIdentifier) { - return String.format("%s/%s", namespacePrefix(tableIdentifier.namespace()), tableIdentifier.name()); + return String.format( + "%s/%s", namespacePrefix(tableIdentifier.namespace()), tableIdentifier.name()); } - /** - * Iterate all table objects with the namespace prefix. - */ + /** Iterate all table objects with the namespace prefix. */ @Override public List listTables(Namespace namespace) { if (!namespace.isEmpty() && !namespaceExists(namespace)) { @@ -154,49 +148,46 @@ public List listTables(Namespace namespace) { // Add the end slash when delimiter listing EcsURI prefix = new EcsURI(String.format("%s/", namespacePrefix(namespace))); do { - ListObjectsResult listObjectsResult = client.listObjects( - new ListObjectsRequest(prefix.bucket()) - .withDelimiter("/") - .withPrefix(prefix.name()) - .withMarker(marker)); + ListObjectsResult listObjectsResult = + client.listObjects( + new ListObjectsRequest(prefix.bucket()) + .withDelimiter("/") + .withPrefix(prefix.name()) + .withMarker(marker)); marker = listObjectsResult.getNextMarker(); - results.addAll(listObjectsResult.getObjects().stream() - .filter(s3Object -> s3Object.getKey().endsWith(TABLE_OBJECT_SUFFIX)) - .map(object -> parseTableId(namespace, prefix, object)) - .collect(Collectors.toList())); + results.addAll( + listObjectsResult.getObjects().stream() + .filter(s3Object -> s3Object.getKey().endsWith(TABLE_OBJECT_SUFFIX)) + .map(object -> parseTableId(namespace, prefix, object)) + .collect(Collectors.toList())); } while (marker != null); LOG.debug("Listing of namespace: {} resulted in the following tables: {}", namespace, results); return results; } - /** - * Get object prefix of namespace without the end slash. - */ + /** Get object prefix of namespace without the end slash. 
*/ private String namespacePrefix(Namespace namespace) { if (namespace.isEmpty()) { return warehouseLocation.location(); } else { // If the warehouseLocation.name is empty, the leading slash will be ignored - return String.format("%s/%s", warehouseLocation.location(), - String.join("/", namespace.levels())); + return String.format( + "%s/%s", warehouseLocation.location(), String.join("/", namespace.levels())); } } private TableIdentifier parseTableId(Namespace namespace, EcsURI prefix, S3Object s3Object) { String key = s3Object.getKey(); - Preconditions.checkArgument(key.startsWith(prefix.name()), - "List result should have same prefix", key, prefix); + Preconditions.checkArgument( + key.startsWith(prefix.name()), "List result should have same prefix", key, prefix); - String tableName = key.substring( - prefix.name().length(), - key.length() - TABLE_OBJECT_SUFFIX.length()); + String tableName = + key.substring(prefix.name().length(), key.length() - TABLE_OBJECT_SUFFIX.length()); return TableIdentifier.of(namespace, tableName); } - /** - * Remove table object. If the purge flag is set, remove all data objects. - */ + /** Remove table object. If the purge flag is set, remove all data objects. */ @Override public boolean dropTable(TableIdentifier identifier, boolean purge) { if (!tableExists(identifier)) { @@ -220,24 +211,26 @@ public boolean dropTable(TableIdentifier identifier, boolean purge) { } private EcsURI tableURI(TableIdentifier id) { - return new EcsURI(String.format("%s/%s%s", namespacePrefix(id.namespace()), id.name(), TABLE_OBJECT_SUFFIX)); + return new EcsURI( + String.format("%s/%s%s", namespacePrefix(id.namespace()), id.name(), TABLE_OBJECT_SUFFIX)); } /** * Table rename will only move table object, the data objects will still be in-place. 
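As a quick aid for the prefix logic above and the tableURI() helper it feeds, a sketch of the resulting object-key layout, assuming a hypothetical warehouse of ecs://demo-bucket/warehouse and package-local access to EcsURI (not part of the patch):

    package org.apache.iceberg.dell.ecs;

    class KeyLayoutSketch {
      static void show() {
        EcsURI warehouse = new EcsURI("ecs://demo-bucket/warehouse");
        // namespacePrefix(Namespace.of("db", "schema"))
        String nsPrefix =
            String.format("%s/%s", warehouse.location(), String.join("/", "db", "schema"));
        // namespace properties object: ecs://demo-bucket/warehouse/db/schema.namespace
        String namespaceObject = nsPrefix + ".namespace";
        // table metadata object:      ecs://demo-bucket/warehouse/db/schema/events.table
        String tableObject = String.format("%s/%s%s", nsPrefix, "events", ".table");
        System.out.println(namespaceObject + System.lineSeparator() + tableObject);
      }
    }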
* * @param from identifier of the table to rename - * @param to new table name + * @param to new table name */ @Override public void renameTable(TableIdentifier from, TableIdentifier to) { if (!namespaceExists(to.namespace())) { - throw new NoSuchNamespaceException("Cannot rename %s to %s because namespace %s does not exist", - from, to, to.namespace()); + throw new NoSuchNamespaceException( + "Cannot rename %s to %s because namespace %s does not exist", from, to, to.namespace()); } if (tableExists(to)) { - throw new AlreadyExistsException("Cannot rename %s because destination table %s exists", from, to); + throw new AlreadyExistsException( + "Cannot rename %s because destination table %s exists", from, to); } EcsURI fromURI = tableURI(from); @@ -249,7 +242,8 @@ public void renameTable(TableIdentifier from, TableIdentifier to) { EcsURI toURI = tableURI(to); if (!putNewProperties(toURI, properties.content())) { - throw new AlreadyExistsException("Cannot rename %s because destination table %s exists", from, to); + throw new AlreadyExistsException( + "Cannot rename %s because destination table %s exists", from, to); } client.deleteObject(fromURI.bucket(), fromURI.name()); @@ -260,7 +254,8 @@ public void renameTable(TableIdentifier from, TableIdentifier to) { public void createNamespace(Namespace namespace, Map properties) { EcsURI namespaceObject = namespaceURI(namespace); if (!putNewProperties(namespaceObject, properties)) { - throw new AlreadyExistsException("namespace %s(%s) has already existed", namespace, namespaceObject); + throw new AlreadyExistsException( + "namespace %s(%s) has already existed", namespace, namespaceObject); } } @@ -279,16 +274,18 @@ public List listNamespaces(Namespace namespace) throws NoSuchNamespac // Add the end slash when delimiter listing EcsURI prefix = new EcsURI(String.format("%s/", namespacePrefix(namespace))); do { - ListObjectsResult listObjectsResult = client.listObjects( - new ListObjectsRequest(prefix.bucket()) - .withDelimiter("/") - .withPrefix(prefix.name()) - .withMarker(marker)); + ListObjectsResult listObjectsResult = + client.listObjects( + new ListObjectsRequest(prefix.bucket()) + .withDelimiter("/") + .withPrefix(prefix.name()) + .withMarker(marker)); marker = listObjectsResult.getNextMarker(); - results.addAll(listObjectsResult.getObjects().stream() - .filter(s3Object -> s3Object.getKey().endsWith(NAMESPACE_OBJECT_SUFFIX)) - .map(object -> parseNamespace(namespace, prefix, object)) - .collect(Collectors.toList())); + results.addAll( + listObjectsResult.getObjects().stream() + .filter(s3Object -> s3Object.getKey().endsWith(NAMESPACE_OBJECT_SUFFIX)) + .map(object -> parseNamespace(namespace, prefix, object)) + .collect(Collectors.toList())); } while (marker != null); LOG.debug("Listing namespace {} returned namespaces: {}", namespace, results); @@ -297,25 +294,24 @@ public List listNamespaces(Namespace namespace) throws NoSuchNamespac private Namespace parseNamespace(Namespace parent, EcsURI prefix, S3Object s3Object) { String key = s3Object.getKey(); - Preconditions.checkArgument(key.startsWith(prefix.name()), - "List result should have same prefix", key, prefix); + Preconditions.checkArgument( + key.startsWith(prefix.name()), "List result should have same prefix", key, prefix); - String namespaceName = key.substring( - prefix.name().length(), - key.length() - NAMESPACE_OBJECT_SUFFIX.length()); + String namespaceName = + key.substring(prefix.name().length(), key.length() - NAMESPACE_OBJECT_SUFFIX.length()); String[] namespace = 
Arrays.copyOf(parent.levels(), parent.levels().length + 1); namespace[namespace.length - 1] = namespaceName; return Namespace.of(namespace); } - /** - * Load namespace properties. - */ + /** Load namespace properties. */ @Override - public Map loadNamespaceMetadata(Namespace namespace) throws NoSuchNamespaceException { + public Map loadNamespaceMetadata(Namespace namespace) + throws NoSuchNamespaceException { EcsURI namespaceObject = namespaceURI(namespace); if (!objectMetadata(namespaceObject).isPresent()) { - throw new NoSuchNamespaceException("Namespace %s(%s) properties object is absent", namespace, namespaceObject); + throw new NoSuchNamespaceException( + "Namespace %s(%s) properties object is absent", namespace, namespaceObject); } Map result = loadProperties(namespaceObject).content(); @@ -341,12 +337,14 @@ public boolean dropNamespace(Namespace namespace) throws NamespaceNotEmptyExcept } @Override - public boolean setProperties(Namespace namespace, Map properties) throws NoSuchNamespaceException { + public boolean setProperties(Namespace namespace, Map properties) + throws NoSuchNamespaceException { return updateProperties(namespace, r -> r.putAll(properties)); } @Override - public boolean removeProperties(Namespace namespace, Set properties) throws NoSuchNamespaceException { + public boolean removeProperties(Namespace namespace, Set properties) + throws NoSuchNamespaceException { return updateProperties(namespace, r -> r.keySet().removeAll(properties)); } @@ -374,17 +372,19 @@ public boolean tableExists(TableIdentifier identifier) { } private void checkURI(EcsURI uri) { - Preconditions.checkArgument(uri.bucket().equals(warehouseLocation.bucket()), + Preconditions.checkArgument( + uri.bucket().equals(warehouseLocation.bucket()), "Properties object %s should be in same bucket %s", - uri.location(), warehouseLocation.bucket()); - Preconditions.checkArgument(uri.name().startsWith(warehouseLocation.name()), + uri.location(), + warehouseLocation.bucket()); + Preconditions.checkArgument( + uri.name().startsWith(warehouseLocation.name()), "Properties object %s should have the expected prefix %s", - uri.location(), warehouseLocation.name()); + uri.location(), + warehouseLocation.name()); } - /** - * Get S3 object metadata which include E-Tag, user metadata and so on. - */ + /** Get S3 object metadata which include E-Tag, user metadata and so on. */ public Optional objectMetadata(EcsURI uri) { checkURI(uri); try { @@ -398,9 +398,7 @@ public Optional objectMetadata(EcsURI uri) { } } - /** - * Record class of properties content and E-Tag - */ + /** Record class of properties content and E-Tag */ static class Properties { private final String eTag; private final Map content; @@ -419,9 +417,7 @@ public Map content() { } } - /** - * Parse object content and metadata as properties. - */ + /** Parse object content and metadata as properties. */ Properties loadProperties(EcsURI uri) { checkURI(uri); GetObjectResult result = client.getObject(uri.bucket(), uri.name()); @@ -437,15 +433,15 @@ Properties loadProperties(EcsURI uri) { return new Properties(objectMetadata.getETag(), content); } - /** - * Create a new object to store properties. - */ + /** Create a new object to store properties. 
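The conditional-create helper that follows is easiest to read as a create-if-absent primitive. A hedged sketch of how the catalog uses it, with a placeholder catalog instance and an illustrative key (not from the patch):

    package org.apache.iceberg.dell.ecs;

    import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap;

    class CreateIfAbsentSketch {
      static void createNamespaceObject(EcsCatalog catalog) {
        EcsURI nsObject = new EcsURI("demo-bucket", "warehouse/db.namespace"); // hypothetical key
        boolean created = catalog.putNewProperties(nsObject, ImmutableMap.of("owner", "demo"));
        // created == false means the If-None-Match: "*" precondition failed because the object
        // already exists; createNamespace() maps that result to AlreadyExistsException.
        System.out.println("created: " + created);
      }
    }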
*/ boolean putNewProperties(EcsURI uri, Map properties) { checkURI(uri); - PutObjectRequest request = new PutObjectRequest(uri.bucket(), uri.name(), PropertiesSerDesUtil.toBytes(properties)); - request.setObjectMetadata(new S3ObjectMetadata().addUserMetadata( - PROPERTIES_VERSION_USER_METADATA_KEY, - PropertiesSerDesUtil.currentVersion())); + PutObjectRequest request = + new PutObjectRequest(uri.bucket(), uri.name(), PropertiesSerDesUtil.toBytes(properties)); + request.setObjectMetadata( + new S3ObjectMetadata() + .addUserMetadata( + PROPERTIES_VERSION_USER_METADATA_KEY, PropertiesSerDesUtil.currentVersion())); request.setIfNoneMatch("*"); try { client.putObject(request); @@ -459,20 +455,19 @@ boolean putNewProperties(EcsURI uri, Map properties) { } } - /** - * Update a exist object to store properties. - */ + /** Update a exist object to store properties. */ boolean updatePropertiesObject(EcsURI uri, String eTag, Map properties) { checkURI(uri); // Exclude some keys Map newProperties = new LinkedHashMap<>(properties); // Replace properties object - PutObjectRequest request = new PutObjectRequest(uri.bucket(), uri.name(), - PropertiesSerDesUtil.toBytes(newProperties)); - request.setObjectMetadata(new S3ObjectMetadata().addUserMetadata( - PROPERTIES_VERSION_USER_METADATA_KEY, - PropertiesSerDesUtil.currentVersion())); + PutObjectRequest request = + new PutObjectRequest(uri.bucket(), uri.name(), PropertiesSerDesUtil.toBytes(newProperties)); + request.setObjectMetadata( + new S3ObjectMetadata() + .addUserMetadata( + PROPERTIES_VERSION_USER_METADATA_KEY, PropertiesSerDesUtil.currentVersion())); request.setIfMatch(eTag); try { client.putObject(request); diff --git a/dell/src/main/java/org/apache/iceberg/dell/ecs/EcsFileIO.java b/dell/src/main/java/org/apache/iceberg/dell/ecs/EcsFileIO.java index a5e957bca60f..54b11e8b3790 100644 --- a/dell/src/main/java/org/apache/iceberg/dell/ecs/EcsFileIO.java +++ b/dell/src/main/java/org/apache/iceberg/dell/ecs/EcsFileIO.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.dell.ecs; import com.emc.object.s3.S3Client; @@ -36,15 +35,16 @@ /** * FileIO implementation backed by Dell EMC ECS. - *
<p>
- * Locations used must follow the conventions for ECS URIs (e.g. ecs://bucket/path...). - * URIs with schemes s3, s3a, s3n, https are also treated as ECS object paths. - * Using this FileIO with other schemes will result in {@link org.apache.iceberg.exceptions.ValidationException}. + * + * <p>
Locations used must follow the conventions for ECS URIs (e.g. ecs://bucket/path...). URIs with + * schemes s3, s3a, s3n, https are also treated as ECS object paths. Using this FileIO with other + * schemes will result in {@link org.apache.iceberg.exceptions.ValidationException}. */ public class EcsFileIO implements FileIO { private static final Logger LOG = LoggerFactory.getLogger(EcsFileIO.class); - private static final String DEFAULT_METRICS_IMPL = "org.apache.iceberg.hadoop.HadoopMetricsContext"; + private static final String DEFAULT_METRICS_IMPL = + "org.apache.iceberg.hadoop.HadoopMetricsContext"; private SerializableSupplier s3; private DellProperties dellProperties; @@ -90,12 +90,17 @@ public void initialize(Map properties) { // Report Hadoop metrics if Hadoop is available try { DynConstructors.Ctor ctor = - DynConstructors.builder(MetricsContext.class).hiddenImpl(DEFAULT_METRICS_IMPL, String.class).buildChecked(); + DynConstructors.builder(MetricsContext.class) + .hiddenImpl(DEFAULT_METRICS_IMPL, String.class) + .buildChecked(); MetricsContext context = ctor.newInstance("ecs"); context.initialize(properties); this.metrics = context; } catch (NoClassDefFoundError | NoSuchMethodException | ClassCastException e) { - LOG.warn("Unable to load metrics class: '{}', falling back to null metrics", DEFAULT_METRICS_IMPL, e); + LOG.warn( + "Unable to load metrics class: '{}', falling back to null metrics", + DEFAULT_METRICS_IMPL, + e); } } diff --git a/dell/src/main/java/org/apache/iceberg/dell/ecs/EcsInputFile.java b/dell/src/main/java/org/apache/iceberg/dell/ecs/EcsInputFile.java index 6bf064fb98cb..dfbef48ed038 100644 --- a/dell/src/main/java/org/apache/iceberg/dell/ecs/EcsInputFile.java +++ b/dell/src/main/java/org/apache/iceberg/dell/ecs/EcsInputFile.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.dell.ecs; import com.emc.object.s3.S3Client; @@ -28,15 +27,18 @@ class EcsInputFile extends BaseEcsFile implements InputFile { public static EcsInputFile fromLocation(String location, S3Client client) { - return new EcsInputFile(client, new EcsURI(location), new DellProperties(), MetricsContext.nullMetrics()); + return new EcsInputFile( + client, new EcsURI(location), new DellProperties(), MetricsContext.nullMetrics()); } - public static EcsInputFile fromLocation(String location, S3Client client, DellProperties dellProperties) { - return new EcsInputFile(client, new EcsURI(location), dellProperties, MetricsContext.nullMetrics()); + public static EcsInputFile fromLocation( + String location, S3Client client, DellProperties dellProperties) { + return new EcsInputFile( + client, new EcsURI(location), dellProperties, MetricsContext.nullMetrics()); } - static EcsInputFile fromLocation(String location, S3Client client, DellProperties dellProperties, - MetricsContext metrics) { + static EcsInputFile fromLocation( + String location, S3Client client, DellProperties dellProperties, MetricsContext metrics) { return new EcsInputFile(client, new EcsURI(location), dellProperties, metrics); } diff --git a/dell/src/main/java/org/apache/iceberg/dell/ecs/EcsOutputFile.java b/dell/src/main/java/org/apache/iceberg/dell/ecs/EcsOutputFile.java index dd5e4db279ec..b8d0d0739be9 100644 --- a/dell/src/main/java/org/apache/iceberg/dell/ecs/EcsOutputFile.java +++ b/dell/src/main/java/org/apache/iceberg/dell/ecs/EcsOutputFile.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.dell.ecs; import com.emc.object.s3.S3Client; @@ -30,25 +29,29 @@ class EcsOutputFile extends BaseEcsFile implements OutputFile { public static EcsOutputFile fromLocation(String location, S3Client client) { - return new EcsOutputFile(client, new EcsURI(location), new DellProperties(), MetricsContext.nullMetrics()); + return new EcsOutputFile( + client, new EcsURI(location), new DellProperties(), MetricsContext.nullMetrics()); } - public static EcsOutputFile fromLocation(String location, S3Client client, DellProperties dellProperties) { - return new EcsOutputFile(client, new EcsURI(location), dellProperties, MetricsContext.nullMetrics()); + public static EcsOutputFile fromLocation( + String location, S3Client client, DellProperties dellProperties) { + return new EcsOutputFile( + client, new EcsURI(location), dellProperties, MetricsContext.nullMetrics()); } - static EcsOutputFile fromLocation(String location, S3Client client, DellProperties dellProperties, - MetricsContext metrics) { + static EcsOutputFile fromLocation( + String location, S3Client client, DellProperties dellProperties, MetricsContext metrics) { return new EcsOutputFile(client, new EcsURI(location), dellProperties, metrics); } - EcsOutputFile(S3Client client, EcsURI uri, DellProperties dellProperties, MetricsContext metrics) { + EcsOutputFile( + S3Client client, EcsURI uri, DellProperties dellProperties, MetricsContext metrics) { super(client, uri, dellProperties, metrics); } /** - * Create an output stream for the specified location if the target object - * does not exist in ECS at the time of invocation. + * Create an output stream for the specified location if the target object does not exist in ECS + * at the time of invocation. * * @return output stream */ diff --git a/dell/src/main/java/org/apache/iceberg/dell/ecs/EcsSeekableInputStream.java b/dell/src/main/java/org/apache/iceberg/dell/ecs/EcsSeekableInputStream.java index e350f8e1c800..39698cd5fab3 100644 --- a/dell/src/main/java/org/apache/iceberg/dell/ecs/EcsSeekableInputStream.java +++ b/dell/src/main/java/org/apache/iceberg/dell/ecs/EcsSeekableInputStream.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.dell.ecs; import com.emc.object.Range; @@ -30,11 +29,14 @@ import org.apache.iceberg.metrics.MetricsContext.Unit; /** - * A {@link SeekableInputStream} implementation that warp {@link S3Client#readObjectStream(String, String, Range)} + * A {@link SeekableInputStream} implementation that warp {@link S3Client#readObjectStream(String, + * String, Range)} + * *
<p>
- * <ol>
- *   <li>The stream is only be loaded when start reading.</li>
- *   <li>This class won't cache any bytes of content. It only maintains pos of {@link SeekableInputStream}</li>
- *   <li>This class is not thread-safe.</li>
- * </ol>
+ * <ol>
+ *   <li>The stream is only be loaded when start reading.
+ *   <li>This class won't cache any bytes of content. It only maintains pos of {@link
+ *       SeekableInputStream}
+ *   <li>This class is not thread-safe.
+ * </ol>
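A short sketch of the lazy-open behavior listed above, assuming package-local access and a hypothetical object (not part of the patch): seek() only records the target position, and the object stream is opened at that position on the first read.

    package org.apache.iceberg.dell.ecs;

    import com.emc.object.s3.S3Client;
    import java.io.IOException;
    import org.apache.iceberg.metrics.MetricsContext;

    class SeekableReadSketch {
      static int readByteAt(S3Client client, long offset) throws IOException {
        try (EcsSeekableInputStream in =
            new EcsSeekableInputStream(
                client, new EcsURI("demo-bucket", "demo.txt"), MetricsContext.nullMetrics())) {
          in.seek(offset); // bookkeeping only; no request is sent yet
          return in.read(); // opens the object stream at `offset` and reads a single byte
        }
      }
    }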
*/ class EcsSeekableInputStream extends SeekableInputStream { @@ -42,14 +44,11 @@ class EcsSeekableInputStream extends SeekableInputStream { private final S3Client client; private final EcsURI uri; - /** - * Mutable pos set by {@link #seek(long)} - */ + /** Mutable pos set by {@link #seek(long)} */ private long newPos = 0; - /** - * Current pos of object content - */ + /** Current pos of object content */ private long pos = -1; + private InputStream internalStream; private final Counter readBytes; @@ -59,7 +58,8 @@ class EcsSeekableInputStream extends SeekableInputStream { this.client = client; this.uri = uri; this.readBytes = metrics.counter(FileIOMetricsContext.READ_BYTES, Long.class, Unit.BYTES); - this.readOperations = metrics.counter(FileIOMetricsContext.READ_OPERATIONS, Integer.class, Unit.COUNT); + this.readOperations = + metrics.counter(FileIOMetricsContext.READ_OPERATIONS, Integer.class, Unit.COUNT); } @Override diff --git a/dell/src/main/java/org/apache/iceberg/dell/ecs/EcsTableOperations.java b/dell/src/main/java/org/apache/iceberg/dell/ecs/EcsTableOperations.java index b1ac6b05dca5..9f2c24ac3e60 100644 --- a/dell/src/main/java/org/apache/iceberg/dell/ecs/EcsTableOperations.java +++ b/dell/src/main/java/org/apache/iceberg/dell/ecs/EcsTableOperations.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.dell.ecs; import java.util.Map; @@ -45,7 +44,8 @@ public class EcsTableOperations extends BaseMetastoreTableOperations { */ private String eTag; - public EcsTableOperations(String tableName, EcsURI tableObject, FileIO fileIO, EcsCatalog catalog) { + public EcsTableOperations( + String tableName, EcsURI tableObject, FileIO fileIO, EcsCatalog catalog) { this.tableName = tableName; this.tableObject = tableObject; this.fileIO = fileIO; @@ -67,8 +67,10 @@ protected void doRefresh() { String metadataLocation; if (!catalog.objectMetadata(tableObject).isPresent()) { if (currentMetadataLocation() != null) { - throw new NoSuchTableException("Metadata object %s is absent while refresh a loaded table. " + - "Maybe the table is deleted/moved.", tableObject); + throw new NoSuchTableException( + "Metadata object %s is absent while refresh a loaded table. 
" + + "Maybe the table is deleted/moved.", + tableObject); } else { metadataLocation = null; } @@ -76,8 +78,8 @@ protected void doRefresh() { EcsCatalog.Properties metadata = catalog.loadProperties(tableObject); this.eTag = metadata.eTag(); metadataLocation = metadata.content().get(ICEBERG_METADATA_LOCATION); - Preconditions.checkNotNull(metadataLocation, - "Can't find location from table metadata %s", tableObject); + Preconditions.checkNotNull( + metadataLocation, "Can't find location from table metadata %s", tableObject); } refreshFromMetadataLocation(metadataLocation); @@ -95,17 +97,17 @@ protected void doCommit(TableMetadata base, TableMetadata metadata) { String cachedETag = eTag; Preconditions.checkNotNull(cachedETag, "E-Tag must be not null when update table"); // replace to a new version, the E-Tag should be present and matched - boolean result = catalog.updatePropertiesObject( - tableObject, cachedETag, buildProperties(newMetadataLocation)); + boolean result = + catalog.updatePropertiesObject( + tableObject, cachedETag, buildProperties(newMetadataLocation)); if (!result) { - throw new CommitFailedException("Replace failed, E-Tag %s mismatch for table %s", cachedETag, tableName()); + throw new CommitFailedException( + "Replace failed, E-Tag %s mismatch for table %s", cachedETag, tableName()); } } } - /** - * Build properties for table - */ + /** Build properties for table */ private Map buildProperties(String metadataLocation) { return ImmutableMap.of(ICEBERG_METADATA_LOCATION, metadataLocation); } diff --git a/dell/src/main/java/org/apache/iceberg/dell/ecs/EcsURI.java b/dell/src/main/java/org/apache/iceberg/dell/ecs/EcsURI.java index 835f32d8121f..b782c6c76969 100644 --- a/dell/src/main/java/org/apache/iceberg/dell/ecs/EcsURI.java +++ b/dell/src/main/java/org/apache/iceberg/dell/ecs/EcsURI.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.dell.ecs; import java.net.URI; @@ -25,9 +24,7 @@ import org.apache.iceberg.relocated.com.google.common.base.Preconditions; import org.apache.iceberg.relocated.com.google.common.collect.ImmutableSet; -/** - * An immutable record class of ECS location - */ +/** An immutable record class of ECS location */ class EcsURI { private static final Set VALID_SCHEME = ImmutableSet.of("ecs", "s3", "s3a", "s3n"); @@ -43,39 +40,29 @@ class EcsURI { URI uri = URI.create(location); ValidationException.check( - VALID_SCHEME.contains(uri.getScheme().toLowerCase()), - "Invalid ecs location: %s", - location); + VALID_SCHEME.contains(uri.getScheme().toLowerCase()), "Invalid ecs location: %s", location); this.bucket = uri.getHost(); this.name = uri.getPath().replaceAll("^/*", ""); } - /** - * The leading slashes of name will be ignored. - */ + /** The leading slashes of name will be ignored. */ EcsURI(String bucket, String name) { this.bucket = bucket; this.name = name.replaceAll("^/*", ""); this.location = String.format("ecs://%s/%s", bucket, name); } - /** - * Returns ECS bucket name. - */ + /** Returns ECS bucket name. */ public String bucket() { return bucket; } - /** - * Returns ECS object name. - */ + /** Returns ECS object name. */ public String name() { return name; } - /** - * Returns original location. - */ + /** Returns original location. 
*/ public String location() { return location; } diff --git a/dell/src/main/java/org/apache/iceberg/dell/ecs/PropertiesSerDesUtil.java b/dell/src/main/java/org/apache/iceberg/dell/ecs/PropertiesSerDesUtil.java index 2139c608f748..0f018e1d3150 100644 --- a/dell/src/main/java/org/apache/iceberg/dell/ecs/PropertiesSerDesUtil.java +++ b/dell/src/main/java/org/apache/iceberg/dell/ecs/PropertiesSerDesUtil.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.dell.ecs; import java.io.ByteArrayInputStream; @@ -37,17 +36,12 @@ import org.slf4j.Logger; import org.slf4j.LoggerFactory; -/** - * Convert Map properties to bytes. - */ +/** Convert Map properties to bytes. */ public class PropertiesSerDesUtil { - private PropertiesSerDesUtil() { - } + private PropertiesSerDesUtil() {} - /** - * Version of current implementation. - */ + /** Version of current implementation. */ private static final String CURRENT_VERSION = "0"; private static final Logger LOG = LoggerFactory.getLogger(PropertiesSerDesUtil.class); @@ -58,10 +52,11 @@ private PropertiesSerDesUtil() { * @param version is the version of {@link PropertiesSerDesUtil} */ public static Map read(byte[] content, String version) { - Preconditions.checkArgument(CURRENT_VERSION.equals(version), - "Properties version is not match", version); + Preconditions.checkArgument( + CURRENT_VERSION.equals(version), "Properties version is not match", version); Properties jdkProperties = new Properties(); - try (Reader reader = new InputStreamReader(new ByteArrayInputStream(content), StandardCharsets.UTF_8)) { + try (Reader reader = + new InputStreamReader(new ByteArrayInputStream(content), StandardCharsets.UTF_8)) { jdkProperties.load(reader); } catch (IOException e) { LOG.error("Fail to read properties", e); @@ -77,9 +72,7 @@ public static Map read(byte[] content, String version) { return Collections.unmodifiableMap(properties); } - /** - * Write properties, the version is {@link #currentVersion()} - */ + /** Write properties, the version is {@link #currentVersion()} */ public static byte[] toBytes(Map value) { Properties jdkProperties = new Properties(); for (Map.Entry entry : value.entrySet()) { @@ -87,7 +80,7 @@ public static byte[] toBytes(Map value) { } try (ByteArrayOutputStream output = new ByteArrayOutputStream(); - Writer writer = new OutputStreamWriter(output, StandardCharsets.UTF_8)) { + Writer writer = new OutputStreamWriter(output, StandardCharsets.UTF_8)) { jdkProperties.store(writer, null); return output.toByteArray(); } catch (IOException e) { @@ -96,9 +89,7 @@ public static byte[] toBytes(Map value) { } } - /** - * Get version of current serializer implementation. - */ + /** Get version of current serializer implementation. */ public static String currentVersion() { return CURRENT_VERSION; } diff --git a/dell/src/test/java/org/apache/iceberg/dell/ecs/TestEcsAppendOutputStream.java b/dell/src/test/java/org/apache/iceberg/dell/ecs/TestEcsAppendOutputStream.java index f37c4f502ba0..18b5d621b80e 100644 --- a/dell/src/test/java/org/apache/iceberg/dell/ecs/TestEcsAppendOutputStream.java +++ b/dell/src/test/java/org/apache/iceberg/dell/ecs/TestEcsAppendOutputStream.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
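For the serializer above, a small round-trip sketch (not from the patch); it relies only on the public methods introduced in PropertiesSerDesUtil:

    import java.util.Map;
    import org.apache.iceberg.dell.ecs.PropertiesSerDesUtil;
    import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap;

    class PropertiesRoundTripSketch {
      static void roundTrip() {
        Map<String, String> props = ImmutableMap.of("a", "1", "b", "2");
        byte[] bytes = PropertiesSerDesUtil.toBytes(props); // java.util.Properties text format, UTF-8
        Map<String, String> back =
            PropertiesSerDesUtil.read(bytes, PropertiesSerDesUtil.currentVersion());
        // back equals props; the version argument guards against future format changes
        System.out.println(back.equals(props));
      }
    }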
*/ - package org.apache.iceberg.dell.ecs; import com.emc.object.Range; @@ -32,17 +31,17 @@ public class TestEcsAppendOutputStream { - @ClassRule - public static EcsS3MockRule rule = EcsS3MockRule.create(); + @ClassRule public static EcsS3MockRule rule = EcsS3MockRule.create(); @Test public void testBaseObjectWrite() throws IOException { String objectName = rule.randomObjectName(); - try (EcsAppendOutputStream output = EcsAppendOutputStream.createWithBufferSize( - rule.client(), - new EcsURI(rule.bucket(), objectName), - 10, - MetricsContext.nullMetrics())) { + try (EcsAppendOutputStream output = + EcsAppendOutputStream.createWithBufferSize( + rule.client(), + new EcsURI(rule.bucket(), objectName), + 10, + MetricsContext.nullMetrics())) { // write 1 byte output.write('1'); // write 3 bytes @@ -53,9 +52,11 @@ public void testBaseObjectWrite() throws IOException { output.write("12345678901".getBytes()); } - try (InputStream input = rule.client().readObjectStream(rule.bucket(), objectName, - Range.fromOffset(0))) { - Assert.assertEquals("Must write all the object content", "1" + "123" + "1234567" + "12345678901", + try (InputStream input = + rule.client().readObjectStream(rule.bucket(), objectName, Range.fromOffset(0))) { + Assert.assertEquals( + "Must write all the object content", + "1" + "123" + "1234567" + "12345678901", new String(ByteStreams.toByteArray(input), StandardCharsets.UTF_8)); } } @@ -63,28 +64,32 @@ public void testBaseObjectWrite() throws IOException { @Test public void testRewrite() throws IOException { String objectName = rule.randomObjectName(); - try (EcsAppendOutputStream output = EcsAppendOutputStream.createWithBufferSize( - rule.client(), - new EcsURI(rule.bucket(), objectName), - 10, - MetricsContext.nullMetrics())) { + try (EcsAppendOutputStream output = + EcsAppendOutputStream.createWithBufferSize( + rule.client(), + new EcsURI(rule.bucket(), objectName), + 10, + MetricsContext.nullMetrics())) { // write 7 bytes output.write("7654321".getBytes()); } - try (EcsAppendOutputStream output = EcsAppendOutputStream.createWithBufferSize( - rule.client(), - new EcsURI(rule.bucket(), objectName), - 10, - MetricsContext.nullMetrics())) { + try (EcsAppendOutputStream output = + EcsAppendOutputStream.createWithBufferSize( + rule.client(), + new EcsURI(rule.bucket(), objectName), + 10, + MetricsContext.nullMetrics())) { // write 14 bytes output.write("1234567".getBytes()); output.write("1234567".getBytes()); } - try (InputStream input = rule.client().readObjectStream(rule.bucket(), objectName, - Range.fromOffset(0))) { - Assert.assertEquals("Must replace the object content", "1234567" + "1234567", + try (InputStream input = + rule.client().readObjectStream(rule.bucket(), objectName, Range.fromOffset(0))) { + Assert.assertEquals( + "Must replace the object content", + "1234567" + "1234567", new String(ByteStreams.toByteArray(input), StandardCharsets.UTF_8)); } } diff --git a/dell/src/test/java/org/apache/iceberg/dell/ecs/TestEcsCatalog.java b/dell/src/test/java/org/apache/iceberg/dell/ecs/TestEcsCatalog.java index 28c9836d8e9c..8c667f6898c6 100644 --- a/dell/src/test/java/org/apache/iceberg/dell/ecs/TestEcsCatalog.java +++ b/dell/src/test/java/org/apache/iceberg/dell/ecs/TestEcsCatalog.java @@ -7,7 +7,7 @@ * "License"); you may not use this file except in compliance * with the License. 
You may obtain a copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, * software distributed under the License is distributed on an @@ -16,9 +16,10 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.dell.ecs; +import static org.apache.iceberg.types.Types.NestedField.required; + import java.io.IOException; import java.util.Map; import org.apache.iceberg.AssertHelpers; @@ -46,15 +47,11 @@ import org.junit.Rule; import org.junit.Test; -import static org.apache.iceberg.types.Types.NestedField.required; - public class TestEcsCatalog { - static final Schema SCHEMA = new Schema( - required(1, "id", Types.IntegerType.get())); + static final Schema SCHEMA = new Schema(required(1, "id", Types.IntegerType.get())); - @Rule - public EcsS3MockRule rule = EcsS3MockRule.create(); + @Rule public EcsS3MockRule rule = EcsS3MockRule.create(); private EcsCatalog ecsCatalog; @@ -105,17 +102,23 @@ public void testListTablesAndNamespaces() { public void testNamespaceProperties() { ecsCatalog.createNamespace(Namespace.of("a"), ImmutableMap.of("a", "a")); - Assert.assertEquals("The initial properties", ImmutableMap.of("a", "a"), + Assert.assertEquals( + "The initial properties", + ImmutableMap.of("a", "a"), ecsCatalog.loadNamespaceMetadata(Namespace.of("a"))); ecsCatalog.setProperties(Namespace.of("a"), ImmutableMap.of("b", "b")); - Assert.assertEquals("Update properties", ImmutableMap.of("a", "a", "b", "b"), + Assert.assertEquals( + "Update properties", + ImmutableMap.of("a", "a", "b", "b"), ecsCatalog.loadNamespaceMetadata(Namespace.of("a"))); ecsCatalog.removeProperties(Namespace.of("a"), ImmutableSet.of("a")); - Assert.assertEquals("Remove properties", ImmutableMap.of("b", "b"), + Assert.assertEquals( + "Remove properties", + ImmutableMap.of("b", "b"), ecsCatalog.loadNamespaceMetadata(Namespace.of("a"))); } @@ -137,7 +140,8 @@ public void testDropNamespace() { Assert.assertTrue("Drop namespace [a, b1]", ecsCatalog.dropNamespace(Namespace.of("a", "b1"))); - Assert.assertFalse("The [a, b1] is absent", ecsCatalog.namespaceExists(Namespace.of("a", "b1"))); + Assert.assertFalse( + "The [a, b1] is absent", ecsCatalog.namespaceExists(Namespace.of("a", "b1"))); Assert.assertTrue( "The [a, b1] is not in list result of [a]", ecsCatalog.listNamespaces(Namespace.of("a")).isEmpty()); @@ -148,8 +152,7 @@ public void testDropTable() { ecsCatalog.createTable(TableIdentifier.of("a"), SCHEMA); Assert.assertFalse( - "Drop an unknown table return false", - ecsCatalog.dropTable(TableIdentifier.of("unknown"))); + "Drop an unknown table return false", ecsCatalog.dropTable(TableIdentifier.of("unknown"))); Assert.assertTrue("Drop a table", ecsCatalog.dropTable(TableIdentifier.of("a"), true)); } @@ -168,11 +171,14 @@ public void testRenameTable() { AssertHelpers.assertThrows( "Rename to an unknown namespace should throw exception", NoSuchNamespaceException.class, - () -> ecsCatalog.renameTable(TableIdentifier.of("a", "t1"), TableIdentifier.of("unknown", "t2"))); + () -> + ecsCatalog.renameTable( + TableIdentifier.of("a", "t1"), TableIdentifier.of("unknown", "t2"))); ecsCatalog.renameTable(TableIdentifier.of("a", "t1"), TableIdentifier.of("b", "t2")); - Assert.assertFalse("Old table does not exist", ecsCatalog.tableExists(TableIdentifier.of("a", "t1"))); + Assert.assertFalse( + "Old table does not exist", 
ecsCatalog.tableExists(TableIdentifier.of("a", "t1"))); Assert.assertTrue("New table exists", ecsCatalog.tableExists(TableIdentifier.of("b", "t2"))); } diff --git a/dell/src/test/java/org/apache/iceberg/dell/ecs/TestEcsInputFile.java b/dell/src/test/java/org/apache/iceberg/dell/ecs/TestEcsInputFile.java index 57a947d0c224..e2bb907baccc 100644 --- a/dell/src/test/java/org/apache/iceberg/dell/ecs/TestEcsInputFile.java +++ b/dell/src/test/java/org/apache/iceberg/dell/ecs/TestEcsInputFile.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.dell.ecs; import com.emc.object.s3.request.PutObjectRequest; @@ -31,26 +30,24 @@ public class TestEcsInputFile { - @ClassRule - public static EcsS3MockRule rule = EcsS3MockRule.create(); + @ClassRule public static EcsS3MockRule rule = EcsS3MockRule.create(); @Test public void testAbsentFile() { String objectName = rule.randomObjectName(); - EcsInputFile inputFile = EcsInputFile.fromLocation( - new EcsURI(rule.bucket(), objectName).toString(), - rule.client()); + EcsInputFile inputFile = + EcsInputFile.fromLocation(new EcsURI(rule.bucket(), objectName).toString(), rule.client()); Assert.assertFalse("File is absent", inputFile.exists()); } @Test public void testFileRead() throws IOException { String objectName = rule.randomObjectName(); - EcsInputFile inputFile = EcsInputFile.fromLocation( - new EcsURI(rule.bucket(), objectName).toString(), - rule.client()); + EcsInputFile inputFile = + EcsInputFile.fromLocation(new EcsURI(rule.bucket(), objectName).toString(), rule.client()); - rule.client().putObject(new PutObjectRequest(rule.bucket(), objectName, "0123456789".getBytes())); + rule.client() + .putObject(new PutObjectRequest(rule.bucket(), objectName, "0123456789".getBytes())); Assert.assertTrue("File should exists", inputFile.exists()); Assert.assertEquals("File length should be 10", 10, inputFile.getLength()); diff --git a/dell/src/test/java/org/apache/iceberg/dell/ecs/TestEcsOutputFile.java b/dell/src/test/java/org/apache/iceberg/dell/ecs/TestEcsOutputFile.java index 5a1f6cd7bfa3..95c302bf3e18 100644 --- a/dell/src/test/java/org/apache/iceberg/dell/ecs/TestEcsOutputFile.java +++ b/dell/src/test/java/org/apache/iceberg/dell/ecs/TestEcsOutputFile.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.dell.ecs; import com.emc.object.Range; @@ -34,25 +33,24 @@ public class TestEcsOutputFile { - @ClassRule - public static EcsS3MockRule rule = EcsS3MockRule.create(); + @ClassRule public static EcsS3MockRule rule = EcsS3MockRule.create(); @Test public void testFileWrite() throws IOException { String objectName = rule.randomObjectName(); - EcsOutputFile outputFile = EcsOutputFile.fromLocation( - new EcsURI(rule.bucket(), objectName).toString(), - rule.client()); + EcsOutputFile outputFile = + EcsOutputFile.fromLocation(new EcsURI(rule.bucket(), objectName).toString(), rule.client()); // File write try (PositionOutputStream output = outputFile.create()) { output.write("1234567890".getBytes()); } - try (InputStream input = rule.client().readObjectStream( - rule.bucket(), objectName, - Range.fromOffset(0))) { - Assert.assertEquals("File content is expected", "1234567890", + try (InputStream input = + rule.client().readObjectStream(rule.bucket(), objectName, Range.fromOffset(0))) { + Assert.assertEquals( + "File content is expected", + "1234567890", new String(ByteStreams.toByteArray(input), StandardCharsets.UTF_8)); } } @@ -60,9 +58,8 @@ public void testFileWrite() throws IOException { @Test public void testFileOverwrite() throws IOException { String objectName = rule.randomObjectName(); - EcsOutputFile outputFile = EcsOutputFile.fromLocation( - new EcsURI(rule.bucket(), objectName).toString(), - rule.client()); + EcsOutputFile outputFile = + EcsOutputFile.fromLocation(new EcsURI(rule.bucket(), objectName).toString(), rule.client()); try (PositionOutputStream output = outputFile.create()) { output.write("1234567890".getBytes()); @@ -72,10 +69,11 @@ public void testFileOverwrite() throws IOException { output.write("abcdefghij".getBytes()); } - try (InputStream input = rule.client().readObjectStream( - rule.bucket(), objectName, - Range.fromOffset(0))) { - Assert.assertEquals("File content should be overwritten", "abcdefghij", + try (InputStream input = + rule.client().readObjectStream(rule.bucket(), objectName, Range.fromOffset(0))) { + Assert.assertEquals( + "File content should be overwritten", + "abcdefghij", new String(ByteStreams.toByteArray(input), StandardCharsets.UTF_8)); } } @@ -83,15 +81,16 @@ public void testFileOverwrite() throws IOException { @Test public void testFileAlreadyExists() throws IOException { String objectName = rule.randomObjectName(); - EcsOutputFile outputFile = EcsOutputFile.fromLocation( - new EcsURI(rule.bucket(), objectName).toString(), - rule.client()); + EcsOutputFile outputFile = + EcsOutputFile.fromLocation(new EcsURI(rule.bucket(), objectName).toString(), rule.client()); try (PositionOutputStream output = outputFile.create()) { output.write("1234567890".getBytes()); } - AssertHelpers.assertThrows("Create should throw exception", AlreadyExistsException.class, + AssertHelpers.assertThrows( + "Create should throw exception", + AlreadyExistsException.class, outputFile.location(), outputFile::create); } diff --git a/dell/src/test/java/org/apache/iceberg/dell/ecs/TestEcsSeekableInputStream.java b/dell/src/test/java/org/apache/iceberg/dell/ecs/TestEcsSeekableInputStream.java index 9f9164e7bafa..6e747f2ace7b 100644 --- a/dell/src/test/java/org/apache/iceberg/dell/ecs/TestEcsSeekableInputStream.java +++ b/dell/src/test/java/org/apache/iceberg/dell/ecs/TestEcsSeekableInputStream.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.dell.ecs; import com.emc.object.s3.request.PutObjectRequest; @@ -30,18 +29,17 @@ public class TestEcsSeekableInputStream { - @ClassRule - public static EcsS3MockRule rule = EcsS3MockRule.create(); + @ClassRule public static EcsS3MockRule rule = EcsS3MockRule.create(); @Test public void testSeekPosRead() throws IOException { String objectName = rule.randomObjectName(); - rule.client().putObject(new PutObjectRequest(rule.bucket(), objectName, "0123456789".getBytes())); + rule.client() + .putObject(new PutObjectRequest(rule.bucket(), objectName, "0123456789".getBytes())); - try (EcsSeekableInputStream input = new EcsSeekableInputStream( - rule.client(), - new EcsURI(rule.bucket(), objectName), - MetricsContext.nullMetrics())) { + try (EcsSeekableInputStream input = + new EcsSeekableInputStream( + rule.client(), new EcsURI(rule.bucket(), objectName), MetricsContext.nullMetrics())) { input.seek(2); Assert.assertEquals("Expect 2 when seek to 2", '2', input.read()); } @@ -50,12 +48,12 @@ public void testSeekPosRead() throws IOException { @Test public void testMultipleSeekPosRead() throws IOException { String objectName = rule.randomObjectName(); - rule.client().putObject(new PutObjectRequest(rule.bucket(), objectName, "0123456789".getBytes())); + rule.client() + .putObject(new PutObjectRequest(rule.bucket(), objectName, "0123456789".getBytes())); - try (EcsSeekableInputStream input = new EcsSeekableInputStream( - rule.client(), - new EcsURI(rule.bucket(), objectName), - MetricsContext.nullMetrics())) { + try (EcsSeekableInputStream input = + new EcsSeekableInputStream( + rule.client(), new EcsURI(rule.bucket(), objectName), MetricsContext.nullMetrics())) { input.seek(999); input.seek(3); Assert.assertEquals("Expect 3 when seek to 3 finally", '3', input.read()); @@ -65,12 +63,12 @@ public void testMultipleSeekPosRead() throws IOException { @Test public void testReadOneByte() throws IOException { String objectName = rule.randomObjectName(); - rule.client().putObject(new PutObjectRequest(rule.bucket(), objectName, "0123456789".getBytes())); + rule.client() + .putObject(new PutObjectRequest(rule.bucket(), objectName, "0123456789".getBytes())); - try (EcsSeekableInputStream input = new EcsSeekableInputStream( - rule.client(), - new EcsURI(rule.bucket(), objectName), - MetricsContext.nullMetrics())) { + try (EcsSeekableInputStream input = + new EcsSeekableInputStream( + rule.client(), new EcsURI(rule.bucket(), objectName), MetricsContext.nullMetrics())) { Assert.assertEquals("The first byte should be 0 ", '0', input.read()); } } @@ -78,16 +76,16 @@ public void testReadOneByte() throws IOException { @Test public void testReadBytes() throws IOException { String objectName = rule.randomObjectName(); - rule.client().putObject(new PutObjectRequest(rule.bucket(), objectName, "0123456789".getBytes())); + rule.client() + .putObject(new PutObjectRequest(rule.bucket(), objectName, "0123456789".getBytes())); - try (EcsSeekableInputStream input = new EcsSeekableInputStream( - rule.client(), - new EcsURI(rule.bucket(), objectName), - MetricsContext.nullMetrics())) { + try (EcsSeekableInputStream input = + new EcsSeekableInputStream( + rule.client(), new EcsURI(rule.bucket(), objectName), MetricsContext.nullMetrics())) { byte[] buffer = new byte[3]; Assert.assertEquals("The first read should be 3 bytes", 3, input.read(buffer)); - Assert.assertEquals("The first 3 bytes should be 012", "012", - new String(buffer, StandardCharsets.UTF_8)); + Assert.assertEquals( + "The first 3 bytes 
should be 012", "012", new String(buffer, StandardCharsets.UTF_8)); } } } diff --git a/dell/src/test/java/org/apache/iceberg/dell/ecs/TestEcsTableOperations.java b/dell/src/test/java/org/apache/iceberg/dell/ecs/TestEcsTableOperations.java index c4239c70fff3..0bddc2515bfe 100644 --- a/dell/src/test/java/org/apache/iceberg/dell/ecs/TestEcsTableOperations.java +++ b/dell/src/test/java/org/apache/iceberg/dell/ecs/TestEcsTableOperations.java @@ -7,7 +7,7 @@ * "License"); you may not use this file except in compliance * with the License. You may obtain a copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, * software distributed under the License is distributed on an @@ -16,9 +16,10 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.dell.ecs; +import static org.apache.iceberg.types.Types.NestedField.required; + import java.util.Map; import org.apache.iceberg.AssertHelpers; import org.apache.iceberg.CatalogProperties; @@ -36,15 +37,11 @@ import org.junit.Rule; import org.junit.Test; -import static org.apache.iceberg.types.Types.NestedField.required; - public class TestEcsTableOperations { - static final Schema SCHEMA = new Schema( - required(1, "id", Types.IntegerType.get())); + static final Schema SCHEMA = new Schema(required(1, "id", Types.IntegerType.get())); - @Rule - public EcsS3MockRule rule = EcsS3MockRule.create(); + @Rule public EcsS3MockRule rule = EcsS3MockRule.create(); @Test public void testConcurrentCommit() { @@ -55,9 +52,7 @@ public void testConcurrentCommit() { Table catalog2Table = catalog2.loadTable(TableIdentifier.of("t1")); // Generate a new version - catalog1Table.updateProperties() - .set("a", "a") - .commit(); + catalog1Table.updateProperties().set("a", "a").commit(); // Use the TableOperations to test the CommitFailedException // High level actions, such as Table#updateProperties(), may refresh metadata. @@ -65,9 +60,10 @@ public void testConcurrentCommit() { AssertHelpers.assertThrows( "Commit failed when use out-dated status", CommitFailedException.class, - () -> operations.commit( - operations.current(), - TableMetadata.buildFrom(operations.current()) + () -> + operations.commit( + operations.current(), + TableMetadata.buildFrom(operations.current()) .removeProperties(ImmutableSet.of("a")) .build())); } diff --git a/dell/src/test/java/org/apache/iceberg/dell/ecs/TestEcsURI.java b/dell/src/test/java/org/apache/iceberg/dell/ecs/TestEcsURI.java index 59140850df22..27ae70004f9f 100644 --- a/dell/src/test/java/org/apache/iceberg/dell/ecs/TestEcsURI.java +++ b/dell/src/test/java/org/apache/iceberg/dell/ecs/TestEcsURI.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.dell.ecs; import org.apache.iceberg.AssertHelpers; diff --git a/dell/src/test/java/org/apache/iceberg/dell/ecs/TestPropertiesSerDesUtil.java b/dell/src/test/java/org/apache/iceberg/dell/ecs/TestPropertiesSerDesUtil.java index 7e0b8e2ef82b..90587bf695cf 100644 --- a/dell/src/test/java/org/apache/iceberg/dell/ecs/TestPropertiesSerDesUtil.java +++ b/dell/src/test/java/org/apache/iceberg/dell/ecs/TestPropertiesSerDesUtil.java @@ -7,7 +7,7 @@ * "License"); you may not use this file except in compliance * with the License. 
You may obtain a copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, * software distributed under the License is distributed on an @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.dell.ecs; import java.util.Map; @@ -30,7 +29,8 @@ public class TestPropertiesSerDesUtil { public void testPropertiesSerDes() { Map properties = ImmutableMap.of("a", "a", "b", "b"); byte[] byteValue = PropertiesSerDesUtil.toBytes(properties); - Map result = PropertiesSerDesUtil.read(byteValue, PropertiesSerDesUtil.currentVersion()); + Map result = + PropertiesSerDesUtil.read(byteValue, PropertiesSerDesUtil.currentVersion()); Assert.assertEquals("Ser/Des will return the same content.", properties, result); } } diff --git a/dell/src/test/java/org/apache/iceberg/dell/mock/MockDellClientFactory.java b/dell/src/test/java/org/apache/iceberg/dell/mock/MockDellClientFactory.java index 42baae314770..a8a7ddcd9e4e 100644 --- a/dell/src/test/java/org/apache/iceberg/dell/mock/MockDellClientFactory.java +++ b/dell/src/test/java/org/apache/iceberg/dell/mock/MockDellClientFactory.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.dell.mock; import com.emc.object.s3.S3Client; @@ -24,16 +23,12 @@ import org.apache.iceberg.dell.DellClientFactory; import org.apache.iceberg.dell.mock.ecs.EcsS3MockRule; -/** - * Provide client which initialized by {@link EcsS3MockRule} - */ +/** Provide client which initialized by {@link EcsS3MockRule} */ public class MockDellClientFactory implements DellClientFactory { public static final String ID_KEY = "mock.dell.client.factory.id"; - /** - * Use ID to avoid using client in other instance. - */ + /** Use ID to avoid using client in other instance. */ private String id; @Override diff --git a/dell/src/test/java/org/apache/iceberg/dell/mock/ecs/EcsS3MockRule.java b/dell/src/test/java/org/apache/iceberg/dell/mock/ecs/EcsS3MockRule.java index d744260613a2..f02a0d13a6b0 100644 --- a/dell/src/test/java/org/apache/iceberg/dell/mock/ecs/EcsS3MockRule.java +++ b/dell/src/test/java/org/apache/iceberg/dell/mock/ecs/EcsS3MockRule.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.dell.mock.ecs; import com.emc.object.s3.S3Client; @@ -39,14 +38,12 @@ /** * Mock rule of ECS S3 mock. - *

- * Use environment parameter to specify use mock client or real client. + * + *

Use environment parameter to specify use mock client or real client. */ public class EcsS3MockRule implements TestRule { - /** - * Object ID generator - */ + /** Object ID generator */ private static final AtomicInteger ID = new AtomicInteger(0); // Config fields @@ -71,9 +68,7 @@ public static EcsS3MockRule manualCreateBucket() { private static final ThreadLocal TEST_RULE_FOR_MOCK_CLIENT = new ThreadLocal<>(); - /** - * Load rule from thread local and check bucket - */ + /** Load rule from thread local and check bucket */ public static EcsS3MockRule rule(String id) { EcsS3MockRule rule = TEST_RULE_FOR_MOCK_CLIENT.get(); Assert.assertTrue("Test Rule must match id", rule != null && rule.bucket().equals(id)); @@ -112,8 +107,11 @@ private void initialize() { } else { mock = false; Map properties = new LinkedHashMap<>(); - properties.put(DellProperties.ECS_S3_ACCESS_KEY_ID, System.getenv(DellProperties.ECS_S3_ACCESS_KEY_ID)); - properties.put(DellProperties.ECS_S3_SECRET_ACCESS_KEY, System.getenv(DellProperties.ECS_S3_SECRET_ACCESS_KEY)); + properties.put( + DellProperties.ECS_S3_ACCESS_KEY_ID, System.getenv(DellProperties.ECS_S3_ACCESS_KEY_ID)); + properties.put( + DellProperties.ECS_S3_SECRET_ACCESS_KEY, + System.getenv(DellProperties.ECS_S3_SECRET_ACCESS_KEY)); properties.put(DellProperties.ECS_S3_ENDPOINT, System.getenv(DellProperties.ECS_S3_ENDPOINT)); clientProperties = properties; client = DellClientFactories.from(properties).ecsS3(); @@ -154,10 +152,10 @@ private void deleteBucket() { break; } - List keys = result.getObjects() - .stream() - .map(it -> new ObjectKey(it.getKey())) - .collect(Collectors.toList()); + List keys = + result.getObjects().stream() + .map(it -> new ObjectKey(it.getKey())) + .collect(Collectors.toList()); client().deleteObjects(new DeleteObjectsRequest(bucket).withKeys(keys)); } diff --git a/dell/src/test/java/org/apache/iceberg/dell/mock/ecs/MockS3Client.java b/dell/src/test/java/org/apache/iceberg/dell/mock/ecs/MockS3Client.java index 2656993e382b..fc401c7c4257 100644 --- a/dell/src/test/java/org/apache/iceberg/dell/mock/ecs/MockS3Client.java +++ b/dell/src/test/java/org/apache/iceberg/dell/mock/ecs/MockS3Client.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.dell.mock.ecs; import com.emc.object.Protocol; @@ -94,26 +93,30 @@ import org.apache.iceberg.relocated.com.google.common.io.ByteStreams; import org.junit.Assert; -/** - * Memorized s3 client used in tests. - */ +/** Memorized s3 client used in tests. */ public class MockS3Client implements S3Client { /** * The object data of this client. - *

- * Current {@link S3ObjectMetadata} only store the user metadata. + * + *

Current {@link S3ObjectMetadata} only store the user metadata. */ private final Map objectData = Maps.newConcurrentMap(); @Override public PutObjectResult putObject(PutObjectRequest request) { ObjectId objectId = new ObjectId(request.getBucketName(), request.getKey()); - ObjectData data = ObjectData.create(convertContent(request.getEntity()), request.getObjectMetadata()); + ObjectData data = + ObjectData.create(convertContent(request.getEntity()), request.getObjectMetadata()); if (request.getIfMatch() != null) { // Compare and swap - if (this.objectData.computeIfPresent(objectId, (ignored, oldData) -> - oldData.createFullMetadata().getETag().equals(request.getIfMatch()) ? data : oldData) != data) { + if (this.objectData.computeIfPresent( + objectId, + (ignored, oldData) -> + oldData.createFullMetadata().getETag().equals(request.getIfMatch()) + ? data + : oldData) + != data) { throw new S3Exception("", 412, "PreconditionFailed", ""); } } else if (request.getIfNoneMatch() != null) { @@ -155,7 +158,8 @@ private byte[] convertContent(Object entity) { return ((byte[]) entity).clone(); } - throw new IllegalArgumentException(String.format("Invalid object entity type %s", entity.getClass())); + throw new IllegalArgumentException( + String.format("Invalid object entity type %s", entity.getClass())); } @Override @@ -190,12 +194,13 @@ public GetObjectResult getObject(String bucketName, String key) { throw new S3Exception("", 404, "NoSuchKey", ""); } - GetObjectResult result = new GetObjectResult() { - @Override - public S3ObjectMetadata getObjectMetadata() { - return data.createFullMetadata(); - } - }; + GetObjectResult result = + new GetObjectResult() { + @Override + public S3ObjectMetadata getObjectMetadata() { + return data.createFullMetadata(); + } + }; result.setObject(data.createInputStream(Range.fromOffset(0))); return result; } @@ -213,7 +218,8 @@ public ListObjectsResult listObjects(ListObjectsRequest request) { List objectResults = Lists.newArrayListWithCapacity(maxKeys); Set prefixResults = Sets.newHashSet(); String nextMarker = null; - for (Map.Entry entry : objectData.entrySet().stream() + for (Map.Entry entry : + objectData.entrySet().stream() .sorted(Map.Entry.comparingByKey()) .collect(Collectors.toList())) { ObjectId id = entry.getKey(); @@ -244,20 +250,20 @@ public ListObjectsResult listObjects(ListObjectsRequest request) { } } - ListObjectsResult result = new ListObjectsResult() { - @Override - public List getCommonPrefixes() { - return prefixResults.stream().sorted().collect(Collectors.toList()); - } - }; + ListObjectsResult result = + new ListObjectsResult() { + @Override + public List getCommonPrefixes() { + return prefixResults.stream().sorted().collect(Collectors.toList()); + } + }; result.setObjects(objectResults); result.setNextMarker(nextMarker); return result; } @Override - public void destroy() { - } + public void destroy() {} @Override @Deprecated @@ -479,8 +485,7 @@ public T readObject(String bucketName, String key, String versionId, Class GetObjectResult getObject( - GetObjectRequest request, Class objectType) { + public GetObjectResult getObject(GetObjectRequest request, Class objectType) { return wontImplement(); } @@ -565,7 +570,8 @@ public String initiateMultipartUpload(String bucketName, String key) { } @Override - public InitiateMultipartUploadResult initiateMultipartUpload(InitiateMultipartUploadRequest request) { + public InitiateMultipartUploadResult initiateMultipartUpload( + InitiateMultipartUploadRequest request) { return wontImplement(); } @@ -590,7 +596,8 
@@ public CopyPartResult copyPart(CopyPartRequest request) { } @Override - public CompleteMultipartUploadResult completeMultipartUpload(CompleteMultipartUploadRequest request) { + public CompleteMultipartUploadResult completeMultipartUpload( + CompleteMultipartUploadRequest request) { return wontImplement(); } diff --git a/dell/src/test/java/org/apache/iceberg/dell/mock/ecs/ObjectData.java b/dell/src/test/java/org/apache/iceberg/dell/mock/ecs/ObjectData.java index 480e50adacaa..6f16a2f73c0a 100644 --- a/dell/src/test/java/org/apache/iceberg/dell/mock/ecs/ObjectData.java +++ b/dell/src/test/java/org/apache/iceberg/dell/mock/ecs/ObjectData.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.dell.mock.ecs; import com.emc.object.Range; @@ -28,9 +27,7 @@ import java.util.LinkedHashMap; import java.util.Map; -/** - * Object data in memory. - */ +/** Object data in memory. */ public class ObjectData { public final byte[] content; public final Map userMetadata; diff --git a/dell/src/test/java/org/apache/iceberg/dell/mock/ecs/ObjectId.java b/dell/src/test/java/org/apache/iceberg/dell/mock/ecs/ObjectId.java index 3ed87c9966c5..0c91fde2ce85 100644 --- a/dell/src/test/java/org/apache/iceberg/dell/mock/ecs/ObjectId.java +++ b/dell/src/test/java/org/apache/iceberg/dell/mock/ecs/ObjectId.java @@ -7,7 +7,7 @@ * "License"); you may not use this file except in compliance * with the License. You may obtain a copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, * software distributed under the License is distributed on an @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.dell.mock.ecs; import java.util.Comparator; @@ -46,8 +45,7 @@ public boolean equals(Object o) { } ObjectId objectId = (ObjectId) o; - return Objects.equal(bucket, objectId.bucket) && - Objects.equal(name, objectId.name); + return Objects.equal(bucket, objectId.bucket) && Objects.equal(name, objectId.name); } @Override @@ -57,10 +55,7 @@ public int hashCode() { @Override public String toString() { - return MoreObjects.toStringHelper(this) - .add("bucket", bucket) - .add("name", name) - .toString(); + return MoreObjects.toStringHelper(this).add("bucket", bucket).add("name", name).toString(); } @Override diff --git a/dell/src/test/java/org/apache/iceberg/dell/mock/ecs/TestExceptionCode.java b/dell/src/test/java/org/apache/iceberg/dell/mock/ecs/TestExceptionCode.java index 4da6463dcc5f..57a75f61334c 100644 --- a/dell/src/test/java/org/apache/iceberg/dell/mock/ecs/TestExceptionCode.java +++ b/dell/src/test/java/org/apache/iceberg/dell/mock/ecs/TestExceptionCode.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.dell.mock.ecs; import com.emc.object.Range; @@ -26,30 +25,39 @@ import org.junit.Rule; import org.junit.Test; -/** - * Verify the error codes between real client and mock client. - */ +/** Verify the error codes between real client and mock client. 
*/ public class TestExceptionCode { - @Rule - public EcsS3MockRule rule = EcsS3MockRule.create(); + @Rule public EcsS3MockRule rule = EcsS3MockRule.create(); @Test public void testExceptionCode() { String object = "test"; - assertS3Exception("Append absent object", 404, "NoSuchKey", + assertS3Exception( + "Append absent object", + 404, + "NoSuchKey", () -> rule.client().appendObject(rule.bucket(), object, "abc".getBytes())); - assertS3Exception("Get object", 404, "NoSuchKey", + assertS3Exception( + "Get object", + 404, + "NoSuchKey", () -> rule.client().readObjectStream(rule.bucket(), object, Range.fromOffset(0))); rule.client().putObject(new PutObjectRequest(rule.bucket(), object, "abc".getBytes())); - assertS3Exception("Put object with unexpect E-Tag", 412, "PreconditionFailed", + assertS3Exception( + "Put object with unexpect E-Tag", + 412, + "PreconditionFailed", () -> { PutObjectRequest request = new PutObjectRequest(rule.bucket(), object, "def".getBytes()); request.setIfMatch("abc"); rule.client().putObject(request); }); - assertS3Exception("Put object if absent", 412, "PreconditionFailed", + assertS3Exception( + "Put object if absent", + 412, + "PreconditionFailed", () -> { PutObjectRequest request = new PutObjectRequest(rule.bucket(), object, "def".getBytes()); request.setIfNoneMatch("*"); diff --git a/flink/v1.13/flink/src/main/java/org/apache/iceberg/flink/CatalogLoader.java b/flink/v1.13/flink/src/main/java/org/apache/iceberg/flink/CatalogLoader.java index 1d53586a2db5..7c098cf20d03 100644 --- a/flink/v1.13/flink/src/main/java/org/apache/iceberg/flink/CatalogLoader.java +++ b/flink/v1.13/flink/src/main/java/org/apache/iceberg/flink/CatalogLoader.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.flink; import java.io.Serializable; @@ -32,21 +31,21 @@ import org.apache.iceberg.relocated.com.google.common.base.Preconditions; import org.apache.iceberg.relocated.com.google.common.collect.Maps; -/** - * Serializable loader to load an Iceberg {@link Catalog}. - */ +/** Serializable loader to load an Iceberg {@link Catalog}. */ public interface CatalogLoader extends Serializable { /** - * Create a new catalog with the provided properties. NOTICE: for flink, we may initialize the {@link CatalogLoader} - * at flink sql client side or job manager side, and then serialize this catalog loader to task manager, finally - * deserialize it and create a new catalog at task manager side. + * Create a new catalog with the provided properties. NOTICE: for flink, we may initialize the + * {@link CatalogLoader} at flink sql client side or job manager side, and then serialize this + * catalog loader to task manager, finally deserialize it and create a new catalog at task manager + * side. 
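One way this loader might be used, as a hedged sketch (the endpoint and warehouse values are placeholders, and the snippet assumes the imports already used by CatalogLoader.java):

// Built once on the Flink SQL client / job manager side.
Map<String, String> props = Maps.newHashMap();
props.put(CatalogProperties.URI, "thrift://metastore-host:9083");
props.put(CatalogProperties.WAREHOUSE_LOCATION, "hdfs://namenode:8020/warehouse");
CatalogLoader loader = CatalogLoader.hive("iceberg_hive", new Configuration(), props);

// The loader is Serializable, so it can travel with the job graph;
// each task manager then materializes its own catalog instance.
Catalog catalog = loader.loadCatalog();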
* * @return a newly created {@link Catalog} */ Catalog loadCatalog(); - static CatalogLoader hadoop(String name, Configuration hadoopConf, Map properties) { + static CatalogLoader hadoop( + String name, Configuration hadoopConf, Map properties) { return new HadoopCatalogLoader(name, hadoopConf, properties); } @@ -54,7 +53,8 @@ static CatalogLoader hive(String name, Configuration hadoopConf, Map properties, Configuration hadoopConf, String impl) { + static CatalogLoader custom( + String name, Map properties, Configuration hadoopConf, String impl) { return new CustomCatalogLoader(name, properties, hadoopConf, impl); } @@ -65,9 +65,7 @@ class HadoopCatalogLoader implements CatalogLoader { private final Map properties; private HadoopCatalogLoader( - String catalogName, - Configuration conf, - Map properties) { + String catalogName, Configuration conf, Map properties) { this.catalogName = catalogName; this.hadoopConf = new SerializableConfiguration(conf); this.warehouseLocation = properties.get(CatalogProperties.WAREHOUSE_LOCATION); @@ -76,7 +74,8 @@ private HadoopCatalogLoader( @Override public Catalog loadCatalog() { - return CatalogUtil.loadCatalog(HadoopCatalog.class.getName(), catalogName, properties, hadoopConf.get()); + return CatalogUtil.loadCatalog( + HadoopCatalog.class.getName(), catalogName, properties, hadoopConf.get()); } @Override @@ -96,20 +95,23 @@ class HiveCatalogLoader implements CatalogLoader { private final int clientPoolSize; private final Map properties; - private HiveCatalogLoader(String catalogName, Configuration conf, Map properties) { + private HiveCatalogLoader( + String catalogName, Configuration conf, Map properties) { this.catalogName = catalogName; this.hadoopConf = new SerializableConfiguration(conf); this.uri = properties.get(CatalogProperties.URI); this.warehouse = properties.get(CatalogProperties.WAREHOUSE_LOCATION); - this.clientPoolSize = properties.containsKey(CatalogProperties.CLIENT_POOL_SIZE) ? - Integer.parseInt(properties.get(CatalogProperties.CLIENT_POOL_SIZE)) : - CatalogProperties.CLIENT_POOL_SIZE_DEFAULT; + this.clientPoolSize = + properties.containsKey(CatalogProperties.CLIENT_POOL_SIZE) + ? 
Integer.parseInt(properties.get(CatalogProperties.CLIENT_POOL_SIZE)) + : CatalogProperties.CLIENT_POOL_SIZE_DEFAULT; this.properties = Maps.newHashMap(properties); } @Override public Catalog loadCatalog() { - return CatalogUtil.loadCatalog(HiveCatalog.class.getName(), catalogName, properties, hadoopConf.get()); + return CatalogUtil.loadCatalog( + HiveCatalog.class.getName(), catalogName, properties, hadoopConf.get()); } @Override @@ -131,14 +133,13 @@ class CustomCatalogLoader implements CatalogLoader { private final String impl; private CustomCatalogLoader( - String name, - Map properties, - Configuration conf, - String impl) { + String name, Map properties, Configuration conf, String impl) { this.hadoopConf = new SerializableConfiguration(conf); this.properties = Maps.newHashMap(properties); // wrap into a hashmap for serialization this.name = name; - this.impl = Preconditions.checkNotNull(impl, "Cannot initialize custom Catalog, impl class name is null"); + this.impl = + Preconditions.checkNotNull( + impl, "Cannot initialize custom Catalog, impl class name is null"); } @Override @@ -148,11 +149,7 @@ public Catalog loadCatalog() { @Override public String toString() { - return MoreObjects.toStringHelper(this) - .add("name", name) - .add("impl", impl) - .toString(); + return MoreObjects.toStringHelper(this).add("name", name).add("impl", impl).toString(); } } - } diff --git a/flink/v1.13/flink/src/main/java/org/apache/iceberg/flink/FlinkCatalog.java b/flink/v1.13/flink/src/main/java/org/apache/iceberg/flink/FlinkCatalog.java index 7690e027194f..75d09732189f 100644 --- a/flink/v1.13/flink/src/main/java/org/apache/iceberg/flink/FlinkCatalog.java +++ b/flink/v1.13/flink/src/main/java/org/apache/iceberg/flink/FlinkCatalog.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.flink; import java.io.Closeable; @@ -80,13 +79,14 @@ /** * A Flink Catalog implementation that wraps an Iceberg {@link Catalog}. - *

- * The mapping between Flink database and Iceberg namespace: - * Supplying a base namespace for a given catalog, so if you have a catalog that supports a 2-level namespace, you - * would supply the first level in the catalog configuration and the second level would be exposed as Flink databases. - *

- * The Iceberg table manages its partitions by itself. The partition of the Iceberg table is independent of the - * partition of Flink. + * + *

The mapping between Flink database and Iceberg namespace: Supplying a base namespace for a + * given catalog, so if you have a catalog that supports a 2-level namespace, you would supply the + * first level in the catalog configuration and the second level would be exposed as Flink + * databases. + * + *

The Iceberg table manages its partitions by itself. The partition of the Iceberg table is + * independent of the partition of Flink. */ public class FlinkCatalog extends AbstractCatalog { @@ -110,7 +110,8 @@ public FlinkCatalog( Catalog originalCatalog = catalogLoader.loadCatalog(); icebergCatalog = cacheEnabled ? CachingCatalog.wrap(originalCatalog) : originalCatalog; - asNamespaceCatalog = originalCatalog instanceof SupportsNamespaces ? (SupportsNamespaces) originalCatalog : null; + asNamespaceCatalog = + originalCatalog instanceof SupportsNamespaces ? (SupportsNamespaces) originalCatalog : null; closeable = originalCatalog instanceof Closeable ? (Closeable) originalCatalog : null; } @@ -162,7 +163,8 @@ public List listDatabases() throws CatalogException { } @Override - public CatalogDatabase getDatabase(String databaseName) throws DatabaseNotExistException, CatalogException { + public CatalogDatabase getDatabase(String databaseName) + throws DatabaseNotExistException, CatalogException { if (asNamespaceCatalog == null) { if (!getDefaultDatabase().equals(databaseName)) { throw new DatabaseNotExistException(getName(), databaseName); @@ -194,10 +196,12 @@ public boolean databaseExists(String databaseName) throws CatalogException { @Override public void createDatabase(String name, CatalogDatabase database, boolean ignoreIfExists) throws DatabaseAlreadyExistException, CatalogException { - createDatabase(name, mergeComment(database.getProperties(), database.getComment()), ignoreIfExists); + createDatabase( + name, mergeComment(database.getProperties(), database.getComment()), ignoreIfExists); } - private void createDatabase(String databaseName, Map metadata, boolean ignoreIfExists) + private void createDatabase( + String databaseName, Map metadata, boolean ignoreIfExists) throws DatabaseAlreadyExistException, CatalogException { if (asNamespaceCatalog != null) { try { @@ -208,7 +212,8 @@ private void createDatabase(String databaseName, Map metadata, b } } } else { - throw new UnsupportedOperationException("Namespaces are not supported by catalog: " + getName()); + throw new UnsupportedOperationException( + "Namespaces are not supported by catalog: " + getName()); } } @@ -257,7 +262,8 @@ public void alterDatabase(String name, CatalogDatabase newDatabase, boolean igno try { Map oldProperties = asNamespaceCatalog.loadNamespaceMetadata(namespace); - Map newProperties = mergeComment(newDatabase.getProperties(), newDatabase.getComment()); + Map newProperties = + mergeComment(newDatabase.getProperties(), newDatabase.getComment()); for (String key : oldProperties.keySet()) { if (!newProperties.containsKey(key)) { @@ -296,7 +302,8 @@ public void alterDatabase(String name, CatalogDatabase newDatabase, boolean igno } @Override - public List listTables(String databaseName) throws DatabaseNotExistException, CatalogException { + public List listTables(String databaseName) + throws DatabaseNotExistException, CatalogException { try { return icebergCatalog.listTables(toNamespace(databaseName)).stream() .map(TableIdentifier::name) @@ -307,7 +314,8 @@ public List listTables(String databaseName) throws DatabaseNotExistExcep } @Override - public CatalogTable getTable(ObjectPath tablePath) throws TableNotExistException, CatalogException { + public CatalogTable getTable(ObjectPath tablePath) + throws TableNotExistException, CatalogException { Table table = loadIcebergTable(tablePath); return toCatalogTable(table); } @@ -361,10 +369,12 @@ public void renameTable(ObjectPath tablePath, String newTableName, boolean ignor 
@Override public void createTable(ObjectPath tablePath, CatalogBaseTable table, boolean ignoreIfExists) throws CatalogException, TableAlreadyExistException { - if (Objects.equals(table.getOptions().get("connector"), FlinkDynamicTableFactory.FACTORY_IDENTIFIER)) { - throw new IllegalArgumentException("Cannot create the table with 'connector'='iceberg' table property in " + - "an iceberg catalog, Please create table with 'connector'='iceberg' property in a non-iceberg catalog or " + - "create table without 'connector'='iceberg' related properties in an iceberg table."); + if (Objects.equals( + table.getOptions().get("connector"), FlinkDynamicTableFactory.FACTORY_IDENTIFIER)) { + throw new IllegalArgumentException( + "Cannot create the table with 'connector'='iceberg' table property in " + + "an iceberg catalog, Please create table with 'connector'='iceberg' property in a non-iceberg catalog or " + + "create table without 'connector'='iceberg' related properties in an iceberg table."); } createIcebergTable(tablePath, table, ignoreIfExists); @@ -389,11 +399,7 @@ void createIcebergTable(ObjectPath tablePath, CatalogBaseTable table, boolean ig try { icebergCatalog.createTable( - toIdentifier(tablePath), - icebergSchema, - spec, - location, - properties.build()); + toIdentifier(tablePath), icebergSchema, spec, location, properties.build()); } catch (AlreadyExistsException e) { if (!ignoreIfExists) { throw new TableAlreadyExistException(getName(), tablePath, e); @@ -421,7 +427,8 @@ public void alterTable(ObjectPath tablePath, CatalogBaseTable newTable, boolean // Currently, Flink SQL only support altering table properties. - // For current Flink Catalog API, support for adding/removing/renaming columns cannot be done by comparing + // For current Flink Catalog API, support for adding/removing/renaming columns cannot be done by + // comparing // CatalogTable instances, unless the Flink schema contains Iceberg column IDs. 
if (!table.getSchema().equals(newTable.getSchema())) { throw new UnsupportedOperationException("Altering schema is not supported yet."); @@ -457,27 +464,36 @@ public void alterTable(ObjectPath tablePath, CatalogBaseTable newTable, boolean } } - oldProperties.keySet().forEach(k -> { - if (!newTable.getOptions().containsKey(k)) { - setProperties.put(k, null); - } - }); + oldProperties + .keySet() + .forEach( + k -> { + if (!newTable.getOptions().containsKey(k)) { + setProperties.put(k, null); + } + }); commitChanges(icebergTable, setLocation, setSnapshotId, pickSnapshotId, setProperties); } private static void validateFlinkTable(CatalogBaseTable table) { - Preconditions.checkArgument(table instanceof CatalogTable, "The Table should be a CatalogTable."); + Preconditions.checkArgument( + table instanceof CatalogTable, "The Table should be a CatalogTable."); TableSchema schema = table.getSchema(); - schema.getTableColumns().forEach(column -> { - if (!FlinkCompatibilityUtil.isPhysicalColumn(column)) { - throw new UnsupportedOperationException("Creating table with computed columns is not supported yet."); - } - }); + schema + .getTableColumns() + .forEach( + column -> { + if (!FlinkCompatibilityUtil.isPhysicalColumn(column)) { + throw new UnsupportedOperationException( + "Creating table with computed columns is not supported yet."); + } + }); if (!schema.getWatermarkSpecs().isEmpty()) { - throw new UnsupportedOperationException("Creating table with watermark specs is not supported yet."); + throw new UnsupportedOperationException( + "Creating table with watermark specs is not supported yet."); } } @@ -502,11 +518,17 @@ private static List toPartitionKeys(PartitionSpec spec, Schema icebergSc return partitionKeysBuilder.build(); } - private static void commitChanges(Table table, String setLocation, String setSnapshotId, - String pickSnapshotId, Map setProperties) { - // don't allow setting the snapshot and picking a commit at the same time because order is ambiguous and choosing + private static void commitChanges( + Table table, + String setLocation, + String setSnapshotId, + String pickSnapshotId, + Map setProperties) { + // don't allow setting the snapshot and picking a commit at the same time because order is + // ambiguous and choosing // one order leads to different results - Preconditions.checkArgument(setSnapshotId == null || pickSnapshotId == null, + Preconditions.checkArgument( + setSnapshotId == null || pickSnapshotId == null, "Cannot set the current snapshot ID and cherry-pick snapshot changes"); if (setSnapshotId != null) { @@ -523,20 +545,19 @@ private static void commitChanges(Table table, String setLocation, String setSna Transaction transaction = table.newTransaction(); if (setLocation != null) { - transaction.updateLocation() - .setLocation(setLocation) - .commit(); + transaction.updateLocation().setLocation(setLocation).commit(); } if (!setProperties.isEmpty()) { UpdateProperties updateProperties = transaction.updateProperties(); - setProperties.forEach((k, v) -> { - if (v == null) { - updateProperties.remove(k); - } else { - updateProperties.set(k, v); - } - }); + setProperties.forEach( + (k, v) -> { + if (v == null) { + updateProperties.remove(k); + } else { + updateProperties.set(k, v); + } + }); updateProperties.commit(); } @@ -547,7 +568,8 @@ static CatalogTable toCatalogTable(Table table) { TableSchema schema = FlinkSchemaUtil.toSchema(table.schema()); List partitionKeys = toPartitionKeys(table.spec(), table.schema()); - // NOTE: We can not create a IcebergCatalogTable 
extends CatalogTable, because Flink optimizer may use + // NOTE: We can not create a IcebergCatalogTable extends CatalogTable, because Flink optimizer + // may use // CatalogTableImpl to copy a new catalog table. // Let's re-loading table from Iceberg catalog when creating source/sink operators. // Iceberg does not have Table comment, so pass a null (Default comment value in Flink). @@ -563,7 +585,8 @@ CatalogLoader getCatalogLoader() { return catalogLoader; } - // ------------------------------ Unsupported methods --------------------------------------------- + // ------------------------------ Unsupported methods + // --------------------------------------------- @Override public List listViews(String databaseName) throws CatalogException { @@ -577,25 +600,35 @@ public CatalogPartition getPartition(ObjectPath tablePath, CatalogPartitionSpec } @Override - public boolean partitionExists(ObjectPath tablePath, CatalogPartitionSpec partitionSpec) throws CatalogException { + public boolean partitionExists(ObjectPath tablePath, CatalogPartitionSpec partitionSpec) + throws CatalogException { throw new UnsupportedOperationException(); } @Override - public void createPartition(ObjectPath tablePath, CatalogPartitionSpec partitionSpec, CatalogPartition partition, - boolean ignoreIfExists) throws CatalogException { + public void createPartition( + ObjectPath tablePath, + CatalogPartitionSpec partitionSpec, + CatalogPartition partition, + boolean ignoreIfExists) + throws CatalogException { throw new UnsupportedOperationException(); } @Override - public void dropPartition(ObjectPath tablePath, CatalogPartitionSpec partitionSpec, boolean ignoreIfNotExists) + public void dropPartition( + ObjectPath tablePath, CatalogPartitionSpec partitionSpec, boolean ignoreIfNotExists) throws CatalogException { throw new UnsupportedOperationException(); } @Override - public void alterPartition(ObjectPath tablePath, CatalogPartitionSpec partitionSpec, CatalogPartition newPartition, - boolean ignoreIfNotExists) throws CatalogException { + public void alterPartition( + ObjectPath tablePath, + CatalogPartitionSpec partitionSpec, + CatalogPartition newPartition, + boolean ignoreIfNotExists) + throws CatalogException { throw new UnsupportedOperationException(); } @@ -605,7 +638,8 @@ public List listFunctions(String dbName) throws CatalogException { } @Override - public CatalogFunction getFunction(ObjectPath functionPath) throws FunctionNotExistException, CatalogException { + public CatalogFunction getFunction(ObjectPath functionPath) + throws FunctionNotExistException, CatalogException { throw new FunctionNotExistException(getName(), functionPath); } @@ -615,13 +649,15 @@ public boolean functionExists(ObjectPath functionPath) throws CatalogException { } @Override - public void createFunction(ObjectPath functionPath, CatalogFunction function, boolean ignoreIfExists) + public void createFunction( + ObjectPath functionPath, CatalogFunction function, boolean ignoreIfExists) throws CatalogException { throw new UnsupportedOperationException(); } @Override - public void alterFunction(ObjectPath functionPath, CatalogFunction newFunction, boolean ignoreIfNotExists) + public void alterFunction( + ObjectPath functionPath, CatalogFunction newFunction, boolean ignoreIfNotExists) throws CatalogException { throw new UnsupportedOperationException(); } @@ -633,26 +669,36 @@ public void dropFunction(ObjectPath functionPath, boolean ignoreIfNotExists) } @Override - public void alterTableStatistics(ObjectPath tablePath, CatalogTableStatistics 
tableStatistics, - boolean ignoreIfNotExists) throws CatalogException { + public void alterTableStatistics( + ObjectPath tablePath, CatalogTableStatistics tableStatistics, boolean ignoreIfNotExists) + throws CatalogException { throw new UnsupportedOperationException(); } @Override - public void alterTableColumnStatistics(ObjectPath tablePath, CatalogColumnStatistics columnStatistics, - boolean ignoreIfNotExists) throws CatalogException { + public void alterTableColumnStatistics( + ObjectPath tablePath, CatalogColumnStatistics columnStatistics, boolean ignoreIfNotExists) + throws CatalogException { throw new UnsupportedOperationException(); } @Override - public void alterPartitionStatistics(ObjectPath tablePath, CatalogPartitionSpec partitionSpec, - CatalogTableStatistics partitionStatistics, boolean ignoreIfNotExists) throws CatalogException { + public void alterPartitionStatistics( + ObjectPath tablePath, + CatalogPartitionSpec partitionSpec, + CatalogTableStatistics partitionStatistics, + boolean ignoreIfNotExists) + throws CatalogException { throw new UnsupportedOperationException(); } @Override - public void alterPartitionColumnStatistics(ObjectPath tablePath, CatalogPartitionSpec partitionSpec, - CatalogColumnStatistics columnStatistics, boolean ignoreIfNotExists) throws CatalogException { + public void alterPartitionColumnStatistics( + ObjectPath tablePath, + CatalogPartitionSpec partitionSpec, + CatalogColumnStatistics columnStatistics, + boolean ignoreIfNotExists) + throws CatalogException { throw new UnsupportedOperationException(); } @@ -677,31 +723,32 @@ public List listPartitions(ObjectPath tablePath) set.add(new CatalogPartitionSpec(map)); } } catch (IOException e) { - throw new CatalogException(String.format("Failed to list partitions of table %s", tablePath), e); + throw new CatalogException( + String.format("Failed to list partitions of table %s", tablePath), e); } return Lists.newArrayList(set); } @Override - public List listPartitions(ObjectPath tablePath, CatalogPartitionSpec partitionSpec) - throws CatalogException { + public List listPartitions( + ObjectPath tablePath, CatalogPartitionSpec partitionSpec) throws CatalogException { throw new UnsupportedOperationException(); } @Override - public List listPartitionsByFilter(ObjectPath tablePath, List filters) - throws CatalogException { + public List listPartitionsByFilter( + ObjectPath tablePath, List filters) throws CatalogException { throw new UnsupportedOperationException(); } - // After partition pruning and filter push down, the statistics have become very inaccurate, so the statistics from + // After partition pruning and filter push down, the statistics have become very inaccurate, so + // the statistics from // here are of little significance. // Flink will support something like SupportsReportStatistics in future. 
@Override - public CatalogTableStatistics getTableStatistics(ObjectPath tablePath) - throws CatalogException { + public CatalogTableStatistics getTableStatistics(ObjectPath tablePath) throws CatalogException { return CatalogTableStatistics.UNKNOWN; } @@ -712,14 +759,14 @@ public CatalogColumnStatistics getTableColumnStatistics(ObjectPath tablePath) } @Override - public CatalogTableStatistics getPartitionStatistics(ObjectPath tablePath, CatalogPartitionSpec partitionSpec) - throws CatalogException { + public CatalogTableStatistics getPartitionStatistics( + ObjectPath tablePath, CatalogPartitionSpec partitionSpec) throws CatalogException { return CatalogTableStatistics.UNKNOWN; } @Override - public CatalogColumnStatistics getPartitionColumnStatistics(ObjectPath tablePath, CatalogPartitionSpec partitionSpec) - throws CatalogException { + public CatalogColumnStatistics getPartitionColumnStatistics( + ObjectPath tablePath, CatalogPartitionSpec partitionSpec) throws CatalogException { return CatalogColumnStatistics.UNKNOWN; } } diff --git a/flink/v1.13/flink/src/main/java/org/apache/iceberg/flink/FlinkCatalogFactory.java b/flink/v1.13/flink/src/main/java/org/apache/iceberg/flink/FlinkCatalogFactory.java index 12a3dc6b95aa..aee3cfb38daa 100644 --- a/flink/v1.13/flink/src/main/java/org/apache/iceberg/flink/FlinkCatalogFactory.java +++ b/flink/v1.13/flink/src/main/java/org/apache/iceberg/flink/FlinkCatalogFactory.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.flink; import java.net.URL; @@ -40,20 +39,22 @@ /** * A Flink Catalog factory implementation that creates {@link FlinkCatalog}. - *

- * This supports the following catalog configuration options: + * + *

This supports the following catalog configuration options: + * *

    - *
  • type - Flink catalog factory key, should be "iceberg"
  • - *
  • catalog-type - iceberg catalog type, "hive" or "hadoop"
  • - *
  • uri - the Hive Metastore URI (Hive catalog only)
  • - *
  • clients - the Hive Client Pool Size (Hive catalog only)
  • - *
  • warehouse - the warehouse path (Hadoop catalog only)
  • - *
  • default-database - a database name to use as the default
  • - *
  • base-namespace - a base namespace as the prefix for all databases (Hadoop catalog only)
  • - *
  • cache-enabled - whether to enable catalog cache
  • + *
  • type - Flink catalog factory key, should be "iceberg" + *
  • catalog-type - iceberg catalog type, "hive" or "hadoop" + *
  • uri - the Hive Metastore URI (Hive catalog only) + *
  • clients - the Hive Client Pool Size (Hive catalog only) + *
  • warehouse - the warehouse path (Hadoop catalog only) + *
  • default-database - a database name to use as the default + *
  • base-namespace - a base namespace as the prefix for all databases (Hadoop + * catalog only) + *
  • cache-enabled - whether to enable catalog cache *
- *

- * To use a custom catalog that is not a Hive or Hadoop catalog, extend this class and override + * + *

To use a custom catalog that is not a Hive or Hadoop catalog, extend this class and override * {@link #createCatalogLoader(String, Map, Configuration)}. */ public class FlinkCatalogFactory implements CatalogFactory { @@ -74,27 +75,33 @@ public class FlinkCatalogFactory implements CatalogFactory { public static final String PROPERTY_VERSION = "property-version"; /** - * Create an Iceberg {@link org.apache.iceberg.catalog.Catalog} loader to be used by this Flink catalog adapter. + * Create an Iceberg {@link org.apache.iceberg.catalog.Catalog} loader to be used by this Flink + * catalog adapter. * - * @param name Flink's catalog name + * @param name Flink's catalog name * @param properties Flink's catalog properties * @param hadoopConf Hadoop configuration for catalog * @return an Iceberg catalog loader */ - static CatalogLoader createCatalogLoader(String name, Map properties, Configuration hadoopConf) { + static CatalogLoader createCatalogLoader( + String name, Map properties, Configuration hadoopConf) { String catalogImpl = properties.get(CatalogProperties.CATALOG_IMPL); if (catalogImpl != null) { String catalogType = properties.get(ICEBERG_CATALOG_TYPE); - Preconditions.checkArgument(catalogType == null, + Preconditions.checkArgument( + catalogType == null, "Cannot create catalog %s, both catalog-type and catalog-impl are set: catalog-type=%s, catalog-impl=%s", - name, catalogType, catalogImpl); + name, + catalogType, + catalogImpl); return CatalogLoader.custom(name, properties, hadoopConf, catalogImpl); } String catalogType = properties.getOrDefault(ICEBERG_CATALOG_TYPE, ICEBERG_CATALOG_TYPE_HIVE); switch (catalogType.toLowerCase(Locale.ENGLISH)) { case ICEBERG_CATALOG_TYPE_HIVE: - // The values of properties 'uri', 'warehouse', 'hive-conf-dir' are allowed to be null, in that case it will + // The values of properties 'uri', 'warehouse', 'hive-conf-dir' are allowed to be null, in + // that case it will // fallback to parse those values from hadoop configuration which is loaded from classpath. 
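As a rough usage sketch (not taken from this diff), the options documented in the class javadoc above can be supplied as a property map; the metastore URI and warehouse values below are placeholders:

Map<String, String> props = Maps.newHashMap();
props.put("catalog-type", "hive");                         // option key from the javadoc list above
props.put("uri", "thrift://metastore-host:9083");          // placeholder Hive Metastore URI
props.put("warehouse", "hdfs://namenode:8020/warehouse");  // placeholder warehouse path

// createCatalog(String, Map) above fills in clusterHadoopConf() and returns a FlinkCatalog.
org.apache.flink.table.catalog.Catalog catalog =
    new FlinkCatalogFactory().createCatalog("iceberg_catalog", props);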
String hiveConfDir = properties.get(HIVE_CONF_DIR); String hadoopConfDir = properties.get(HADOOP_CONF_DIR); @@ -105,8 +112,8 @@ static CatalogLoader createCatalogLoader(String name, Map proper return CatalogLoader.hadoop(name, hadoopConf, properties); default: - throw new UnsupportedOperationException("Unknown catalog-type: " + catalogType + - " (Must be 'hive' or 'hadoop')"); + throw new UnsupportedOperationException( + "Unknown catalog-type: " + catalogType + " (Must be 'hive' or 'hadoop')"); } } @@ -128,7 +135,8 @@ public Catalog createCatalog(String name, Map properties) { return createCatalog(name, properties, clusterHadoopConf()); } - protected Catalog createCatalog(String name, Map properties, Configuration hadoopConf) { + protected Catalog createCatalog( + String name, Map properties, Configuration hadoopConf) { CatalogLoader catalogLoader = createCatalogLoader(name, properties, hadoopConf); String defaultDatabase = properties.getOrDefault(DEFAULT_DATABASE, DEFAULT_DATABASE_NAME); @@ -141,14 +149,18 @@ protected Catalog createCatalog(String name, Map properties, Con return new FlinkCatalog(name, defaultDatabase, baseNamespace, catalogLoader, cacheEnabled); } - private static Configuration mergeHiveConf(Configuration hadoopConf, String hiveConfDir, String hadoopConfDir) { + private static Configuration mergeHiveConf( + Configuration hadoopConf, String hiveConfDir, String hadoopConfDir) { Configuration newConf = new Configuration(hadoopConf); if (!Strings.isNullOrEmpty(hiveConfDir)) { - Preconditions.checkState(Files.exists(Paths.get(hiveConfDir, "hive-site.xml")), - "There should be a hive-site.xml file under the directory %s", hiveConfDir); + Preconditions.checkState( + Files.exists(Paths.get(hiveConfDir, "hive-site.xml")), + "There should be a hive-site.xml file under the directory %s", + hiveConfDir); newConf.addResource(new Path(hiveConfDir, "hive-site.xml")); } else { - // If don't provide the hive-site.xml path explicitly, it will try to load resource from classpath. If still + // If don't provide the hive-site.xml path explicitly, it will try to load resource from + // classpath. If still // couldn't load the configuration file, then it will throw exception in HiveCatalog. 
URL configFile = CatalogLoader.class.getClassLoader().getResource("hive-site.xml"); if (configFile != null) { @@ -157,11 +169,15 @@ private static Configuration mergeHiveConf(Configuration hadoopConf, String hive } if (!Strings.isNullOrEmpty(hadoopConfDir)) { - Preconditions.checkState(Files.exists(Paths.get(hadoopConfDir, "hdfs-site.xml")), - "Failed to load Hadoop configuration: missing %s", Paths.get(hadoopConfDir, "hdfs-site.xml")); + Preconditions.checkState( + Files.exists(Paths.get(hadoopConfDir, "hdfs-site.xml")), + "Failed to load Hadoop configuration: missing %s", + Paths.get(hadoopConfDir, "hdfs-site.xml")); newConf.addResource(new Path(hadoopConfDir, "hdfs-site.xml")); - Preconditions.checkState(Files.exists(Paths.get(hadoopConfDir, "core-site.xml")), - "Failed to load Hadoop configuration: missing %s", Paths.get(hadoopConfDir, "core-site.xml")); + Preconditions.checkState( + Files.exists(Paths.get(hadoopConfDir, "core-site.xml")), + "Failed to load Hadoop configuration: missing %s", + Paths.get(hadoopConfDir, "core-site.xml")); newConf.addResource(new Path(hadoopConfDir, "core-site.xml")); } return newConf; diff --git a/flink/v1.13/flink/src/main/java/org/apache/iceberg/flink/FlinkConfParser.java b/flink/v1.13/flink/src/main/java/org/apache/iceberg/flink/FlinkConfParser.java index e984f6875920..83fa09de544c 100644 --- a/flink/v1.13/flink/src/main/java/org/apache/iceberg/flink/FlinkConfParser.java +++ b/flink/v1.13/flink/src/main/java/org/apache/iceberg/flink/FlinkConfParser.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.flink; import java.util.List; diff --git a/flink/v1.13/flink/src/main/java/org/apache/iceberg/flink/FlinkConfigOptions.java b/flink/v1.13/flink/src/main/java/org/apache/iceberg/flink/FlinkConfigOptions.java index 49cef328e9ca..603cb3961c02 100644 --- a/flink/v1.13/flink/src/main/java/org/apache/iceberg/flink/FlinkConfigOptions.java +++ b/flink/v1.13/flink/src/main/java/org/apache/iceberg/flink/FlinkConfigOptions.java @@ -16,24 +16,22 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.flink; - import org.apache.flink.configuration.ConfigOption; import org.apache.flink.configuration.ConfigOptions; public class FlinkConfigOptions { - private FlinkConfigOptions() { - } + private FlinkConfigOptions() {} public static final ConfigOption TABLE_EXEC_ICEBERG_INFER_SOURCE_PARALLELISM = ConfigOptions.key("table.exec.iceberg.infer-source-parallelism") .booleanType() .defaultValue(true) - .withDescription("If is false, parallelism of source are set by config.\n" + - "If is true, source parallelism is inferred according to splits number.\n"); + .withDescription( + "If is false, parallelism of source are set by config.\n" + + "If is true, source parallelism is inferred according to splits number.\n"); public static final ConfigOption TABLE_EXEC_ICEBERG_INFER_SOURCE_PARALLELISM_MAX = ConfigOptions.key("table.exec.iceberg.infer-source-parallelism.max") @@ -45,5 +43,6 @@ private FlinkConfigOptions() { ConfigOptions.key("table.exec.iceberg.expose-split-locality-info") .booleanType() .noDefaultValue() - .withDescription("Expose split host information to use Flink's locality aware split assigner."); + .withDescription( + "Expose split host information to use Flink's locality aware split assigner."); } diff --git a/flink/v1.13/flink/src/main/java/org/apache/iceberg/flink/FlinkDynamicTableFactory.java b/flink/v1.13/flink/src/main/java/org/apache/iceberg/flink/FlinkDynamicTableFactory.java index f4730f8a16e9..6431409c83ff 100644 --- a/flink/v1.13/flink/src/main/java/org/apache/iceberg/flink/FlinkDynamicTableFactory.java +++ b/flink/v1.13/flink/src/main/java/org/apache/iceberg/flink/FlinkDynamicTableFactory.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.flink; import java.util.Map; @@ -43,7 +42,8 @@ import org.apache.iceberg.relocated.com.google.common.collect.Maps; import org.apache.iceberg.relocated.com.google.common.collect.Sets; -public class FlinkDynamicTableFactory implements DynamicTableSinkFactory, DynamicTableSourceFactory { +public class FlinkDynamicTableFactory + implements DynamicTableSinkFactory, DynamicTableSourceFactory { static final String FACTORY_IDENTIFIER = "iceberg"; private static final ConfigOption CATALOG_NAME = @@ -91,8 +91,12 @@ public DynamicTableSource createDynamicTableSource(Context context) { if (catalog != null) { tableLoader = createTableLoader(catalog, objectIdentifier.toObjectPath()); } else { - tableLoader = createTableLoader(catalogTable, tableProps, objectIdentifier.getDatabaseName(), - objectIdentifier.getObjectName()); + tableLoader = + createTableLoader( + catalogTable, + tableProps, + objectIdentifier.getDatabaseName(), + objectIdentifier.getObjectName()); } return new IcebergTableSource(tableLoader, tableSchema, tableProps, context.getConfiguration()); @@ -109,8 +113,9 @@ public DynamicTableSink createDynamicTableSink(Context context) { if (catalog != null) { tableLoader = createTableLoader(catalog, objectPath); } else { - tableLoader = createTableLoader(catalogTable, tableProps, objectPath.getDatabaseName(), - objectPath.getObjectName()); + tableLoader = + createTableLoader( + catalogTable, tableProps, objectPath.getDatabaseName(), objectPath.getObjectName()); } return new IcebergTableSink(tableLoader, tableSchema); @@ -137,15 +142,17 @@ public String factoryIdentifier() { return FACTORY_IDENTIFIER; } - private static TableLoader createTableLoader(CatalogBaseTable catalogBaseTable, - Map tableProps, - String databaseName, - String 
tableName) { + private static TableLoader createTableLoader( + CatalogBaseTable catalogBaseTable, + Map tableProps, + String databaseName, + String tableName) { Configuration flinkConf = new Configuration(); tableProps.forEach(flinkConf::setString); String catalogName = flinkConf.getString(CATALOG_NAME); - Preconditions.checkNotNull(catalogName, "Table property '%s' cannot be null", CATALOG_NAME.key()); + Preconditions.checkNotNull( + catalogName, "Table property '%s' cannot be null", CATALOG_NAME.key()); String catalogDatabase = flinkConf.getString(CATALOG_DATABASE, databaseName); Preconditions.checkNotNull(catalogDatabase, "The iceberg database name cannot be null"); @@ -155,15 +162,20 @@ private static TableLoader createTableLoader(CatalogBaseTable catalogBaseTable, org.apache.hadoop.conf.Configuration hadoopConf = FlinkCatalogFactory.clusterHadoopConf(); FlinkCatalogFactory factory = new FlinkCatalogFactory(); - FlinkCatalog flinkCatalog = (FlinkCatalog) factory.createCatalog(catalogName, tableProps, hadoopConf); + FlinkCatalog flinkCatalog = + (FlinkCatalog) factory.createCatalog(catalogName, tableProps, hadoopConf); ObjectPath objectPath = new ObjectPath(catalogDatabase, catalogTable); // Create database if not exists in the external catalog. if (!flinkCatalog.databaseExists(catalogDatabase)) { try { - flinkCatalog.createDatabase(catalogDatabase, new CatalogDatabaseImpl(Maps.newHashMap(), null), true); + flinkCatalog.createDatabase( + catalogDatabase, new CatalogDatabaseImpl(Maps.newHashMap(), null), true); } catch (DatabaseAlreadyExistException e) { - throw new AlreadyExistsException(e, "Database %s already exists in the iceberg catalog %s.", catalogName, + throw new AlreadyExistsException( + e, + "Database %s already exists in the iceberg catalog %s.", + catalogName, catalogDatabase); } } @@ -173,12 +185,17 @@ private static TableLoader createTableLoader(CatalogBaseTable catalogBaseTable, try { flinkCatalog.createIcebergTable(objectPath, catalogBaseTable, true); } catch (TableAlreadyExistException e) { - throw new AlreadyExistsException(e, "Table %s already exists in the database %s and catalog %s", - catalogTable, catalogDatabase, catalogName); + throw new AlreadyExistsException( + e, + "Table %s already exists in the database %s and catalog %s", + catalogTable, + catalogDatabase, + catalogName); } } - return TableLoader.fromCatalog(flinkCatalog.getCatalogLoader(), TableIdentifier.of(catalogDatabase, catalogTable)); + return TableLoader.fromCatalog( + flinkCatalog.getCatalogLoader(), TableIdentifier.of(catalogDatabase, catalogTable)); } private static TableLoader createTableLoader(FlinkCatalog catalog, ObjectPath objectPath) { diff --git a/flink/v1.13/flink/src/main/java/org/apache/iceberg/flink/FlinkFilters.java b/flink/v1.13/flink/src/main/java/org/apache/iceberg/flink/FlinkFilters.java index 5e5c9c1fe0fb..717de9ef5acc 100644 --- a/flink/v1.13/flink/src/main/java/org/apache/iceberg/flink/FlinkFilters.java +++ b/flink/v1.13/flink/src/main/java/org/apache/iceberg/flink/FlinkFilters.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.flink; import java.time.Instant; @@ -43,38 +42,38 @@ import org.apache.iceberg.util.NaNUtil; public class FlinkFilters { - private FlinkFilters() { - } + private FlinkFilters() {} private static final Pattern STARTS_WITH_PATTERN = Pattern.compile("([^%]+)%"); - private static final Map FILTERS = ImmutableMap - .builder() - .put(BuiltInFunctionDefinitions.EQUALS, Operation.EQ) - .put(BuiltInFunctionDefinitions.NOT_EQUALS, Operation.NOT_EQ) - .put(BuiltInFunctionDefinitions.GREATER_THAN, Operation.GT) - .put(BuiltInFunctionDefinitions.GREATER_THAN_OR_EQUAL, Operation.GT_EQ) - .put(BuiltInFunctionDefinitions.LESS_THAN, Operation.LT) - .put(BuiltInFunctionDefinitions.LESS_THAN_OR_EQUAL, Operation.LT_EQ) - .put(BuiltInFunctionDefinitions.IS_NULL, Operation.IS_NULL) - .put(BuiltInFunctionDefinitions.IS_NOT_NULL, Operation.NOT_NULL) - .put(BuiltInFunctionDefinitions.AND, Operation.AND) - .put(BuiltInFunctionDefinitions.OR, Operation.OR) - .put(BuiltInFunctionDefinitions.NOT, Operation.NOT) - .put(BuiltInFunctionDefinitions.LIKE, Operation.STARTS_WITH) - .build(); + private static final Map FILTERS = + ImmutableMap.builder() + .put(BuiltInFunctionDefinitions.EQUALS, Operation.EQ) + .put(BuiltInFunctionDefinitions.NOT_EQUALS, Operation.NOT_EQ) + .put(BuiltInFunctionDefinitions.GREATER_THAN, Operation.GT) + .put(BuiltInFunctionDefinitions.GREATER_THAN_OR_EQUAL, Operation.GT_EQ) + .put(BuiltInFunctionDefinitions.LESS_THAN, Operation.LT) + .put(BuiltInFunctionDefinitions.LESS_THAN_OR_EQUAL, Operation.LT_EQ) + .put(BuiltInFunctionDefinitions.IS_NULL, Operation.IS_NULL) + .put(BuiltInFunctionDefinitions.IS_NOT_NULL, Operation.NOT_NULL) + .put(BuiltInFunctionDefinitions.AND, Operation.AND) + .put(BuiltInFunctionDefinitions.OR, Operation.OR) + .put(BuiltInFunctionDefinitions.NOT, Operation.NOT) + .put(BuiltInFunctionDefinitions.LIKE, Operation.STARTS_WITH) + .build(); /** * Convert flink expression to iceberg expression. - *

- * the BETWEEN, NOT_BETWEEN, IN expression will be converted by flink automatically. the BETWEEN will be converted to - * (GT_EQ AND LT_EQ), the NOT_BETWEEN will be converted to (LT_EQ OR GT_EQ), the IN will be converted to OR, so we do - * not add the conversion here + * + *

the BETWEEN, NOT_BETWEEN, IN expression will be converted by flink automatically. the + * BETWEEN will be converted to (GT_EQ AND LT_EQ), the NOT_BETWEEN will be converted to (LT_EQ OR + * GT_EQ), the IN will be converted to OR, so we do not add the conversion here * * @param flinkExpression the flink expression * @return the iceberg expression */ - public static Optional convert(org.apache.flink.table.expressions.Expression flinkExpression) { + public static Optional convert( + org.apache.flink.table.expressions.Expression flinkExpression) { if (!(flinkExpression instanceof CallExpression)) { return Optional.empty(); } @@ -97,34 +96,42 @@ public static Optional convert(org.apache.flink.table.expressions.Ex return convertFieldAndLiteral(Expressions::lessThan, Expressions::greaterThan, call); case LT_EQ: - return convertFieldAndLiteral(Expressions::lessThanOrEqual, Expressions::greaterThanOrEqual, call); + return convertFieldAndLiteral( + Expressions::lessThanOrEqual, Expressions::greaterThanOrEqual, call); case GT: return convertFieldAndLiteral(Expressions::greaterThan, Expressions::lessThan, call); case GT_EQ: - return convertFieldAndLiteral(Expressions::greaterThanOrEqual, Expressions::lessThanOrEqual, call); + return convertFieldAndLiteral( + Expressions::greaterThanOrEqual, Expressions::lessThanOrEqual, call); case EQ: - return convertFieldAndLiteral((ref, lit) -> { - if (NaNUtil.isNaN(lit)) { - return Expressions.isNaN(ref); - } else { - return Expressions.equal(ref, lit); - } - }, call); + return convertFieldAndLiteral( + (ref, lit) -> { + if (NaNUtil.isNaN(lit)) { + return Expressions.isNaN(ref); + } else { + return Expressions.equal(ref, lit); + } + }, + call); case NOT_EQ: - return convertFieldAndLiteral((ref, lit) -> { - if (NaNUtil.isNaN(lit)) { - return Expressions.notNaN(ref); - } else { - return Expressions.notEqual(ref, lit); - } - }, call); + return convertFieldAndLiteral( + (ref, lit) -> { + if (NaNUtil.isNaN(lit)) { + return Expressions.notNaN(ref); + } else { + return Expressions.notEqual(ref, lit); + } + }, + call); case NOT: - return onlyChildAs(call, CallExpression.class).flatMap(FlinkFilters::convert).map(Expressions::not); + return onlyChildAs(call, CallExpression.class) + .flatMap(FlinkFilters::convert) + .map(Expressions::not); case AND: return convertLogicExpression(Expressions::and, call); @@ -140,8 +147,8 @@ public static Optional convert(org.apache.flink.table.expressions.Ex return Optional.empty(); } - private static Optional onlyChildAs(CallExpression call, - Class expectedChildClass) { + private static Optional onlyChildAs( + CallExpression call, Class expectedChildClass) { List children = call.getResolvedChildren(); if (children.size() != 1) { return Optional.empty(); @@ -166,26 +173,28 @@ private static Optional convertLike(CallExpression call) { if (left instanceof FieldReferenceExpression && right instanceof ValueLiteralExpression) { String name = ((FieldReferenceExpression) left).getName(); - return convertLiteral((ValueLiteralExpression) right).flatMap(lit -> { - if (lit instanceof String) { - String pattern = (String) lit; - Matcher matcher = STARTS_WITH_PATTERN.matcher(pattern); - // exclude special char of LIKE - // '_' is the wildcard of the SQL LIKE - if (!pattern.contains("_") && matcher.matches()) { - return Optional.of(Expressions.startsWith(name, matcher.group(1))); - } - } - - return Optional.empty(); - }); + return convertLiteral((ValueLiteralExpression) right) + .flatMap( + lit -> { + if (lit instanceof String) { + String pattern = 
(String) lit; + Matcher matcher = STARTS_WITH_PATTERN.matcher(pattern); + // exclude special char of LIKE + // '_' is the wildcard of the SQL LIKE + if (!pattern.contains("_") && matcher.matches()) { + return Optional.of(Expressions.startsWith(name, matcher.group(1))); + } + } + + return Optional.empty(); + }); } return Optional.empty(); } - private static Optional convertLogicExpression(BiFunction function, - CallExpression call) { + private static Optional convertLogicExpression( + BiFunction function, CallExpression call) { List args = call.getResolvedChildren(); if (args == null || args.size() != 2) { return Optional.empty(); @@ -201,29 +210,33 @@ private static Optional convertLogicExpression(BiFunction convertLiteral(ValueLiteralExpression expression) { - Optional value = expression.getValueAs(expression.getOutputDataType().getLogicalType().getDefaultConversion()); - return value.map(o -> { - if (o instanceof LocalDateTime) { - return DateTimeUtil.microsFromTimestamp((LocalDateTime) o); - } else if (o instanceof Instant) { - return DateTimeUtil.microsFromInstant((Instant) o); - } else if (o instanceof LocalTime) { - return DateTimeUtil.microsFromTime((LocalTime) o); - } else if (o instanceof LocalDate) { - return DateTimeUtil.daysFromDate((LocalDate) o); - } + Optional value = + expression.getValueAs( + expression.getOutputDataType().getLogicalType().getDefaultConversion()); + return value.map( + o -> { + if (o instanceof LocalDateTime) { + return DateTimeUtil.microsFromTimestamp((LocalDateTime) o); + } else if (o instanceof Instant) { + return DateTimeUtil.microsFromInstant((Instant) o); + } else if (o instanceof LocalTime) { + return DateTimeUtil.microsFromTime((LocalTime) o); + } else if (o instanceof LocalDate) { + return DateTimeUtil.daysFromDate((LocalDate) o); + } - return o; - }); + return o; + }); } - private static Optional convertFieldAndLiteral(BiFunction expr, - CallExpression call) { + private static Optional convertFieldAndLiteral( + BiFunction expr, CallExpression call) { return convertFieldAndLiteral(expr, expr, call); } private static Optional convertFieldAndLiteral( - BiFunction convertLR, BiFunction convertRL, + BiFunction convertLR, + BiFunction convertRL, CallExpression call) { List args = call.getResolvedChildren(); if (args.size() != 2) { @@ -239,7 +252,8 @@ private static Optional convertFieldAndLiteral( if (lit.isPresent()) { return Optional.of(convertLR.apply(name, lit.get())); } - } else if (left instanceof ValueLiteralExpression && right instanceof FieldReferenceExpression) { + } else if (left instanceof ValueLiteralExpression + && right instanceof FieldReferenceExpression) { Optional lit = convertLiteral((ValueLiteralExpression) left); String name = ((FieldReferenceExpression) right).getName(); if (lit.isPresent()) { diff --git a/flink/v1.13/flink/src/main/java/org/apache/iceberg/flink/FlinkFixupTypes.java b/flink/v1.13/flink/src/main/java/org/apache/iceberg/flink/FlinkFixupTypes.java index 6501c0226e44..767d4497ac91 100644 --- a/flink/v1.13/flink/src/main/java/org/apache/iceberg/flink/FlinkFixupTypes.java +++ b/flink/v1.13/flink/src/main/java/org/apache/iceberg/flink/FlinkFixupTypes.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.flink; import org.apache.iceberg.Schema; @@ -36,8 +35,8 @@ private FlinkFixupTypes(Schema referenceSchema) { } static Schema fixup(Schema schema, Schema referenceSchema) { - return new Schema(TypeUtil.visit(schema, - new FlinkFixupTypes(referenceSchema)).asStructType().fields()); + return new Schema( + TypeUtil.visit(schema, new FlinkFixupTypes(referenceSchema)).asStructType().fields()); } @Override diff --git a/flink/v1.13/flink/src/main/java/org/apache/iceberg/flink/FlinkSchemaUtil.java b/flink/v1.13/flink/src/main/java/org/apache/iceberg/flink/FlinkSchemaUtil.java index 0827b21786c1..97439b7bb0d6 100644 --- a/flink/v1.13/flink/src/main/java/org/apache/iceberg/flink/FlinkSchemaUtil.java +++ b/flink/v1.13/flink/src/main/java/org/apache/iceberg/flink/FlinkSchemaUtil.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.flink; import java.util.List; @@ -34,33 +33,33 @@ import org.apache.iceberg.types.Types; /** - * Converter between Flink types and Iceberg type. - * The conversion is not a 1:1 mapping that not allows back-and-forth conversion. So some information might get lost - * during the back-and-forth conversion. - *

<p>
- * This inconsistent types:
+ * Converter between Flink types and Iceberg type. The conversion is not a 1:1 mapping that not
+ * allows back-and-forth conversion. So some information might get lost during the back-and-forth
+ * conversion.
+ *
+ * <p>This inconsistent types:
+ *
  * <ul>
- *   <li>map Iceberg UUID type to Flink BinaryType(16)</li>
- *   <li>map Flink VarCharType(_) and CharType(_) to Iceberg String type</li>
- *   <li>map Flink VarBinaryType(_) to Iceberg Binary type</li>
- *   <li>map Flink TimeType(_) to Iceberg Time type (microseconds)</li>
- *   <li>map Flink TimestampType(_) to Iceberg Timestamp without zone type (microseconds)</li>
- *   <li>map Flink LocalZonedTimestampType(_) to Iceberg Timestamp with zone type (microseconds)</li>
- *   <li>map Flink MultiSetType to Iceberg Map type(element, int)</li>
+ *   <li>map Iceberg UUID type to Flink BinaryType(16)
+ *   <li>map Flink VarCharType(_) and CharType(_) to Iceberg String type
+ *   <li>map Flink VarBinaryType(_) to Iceberg Binary type
+ *   <li>map Flink TimeType(_) to Iceberg Time type (microseconds)
+ *   <li>map Flink TimestampType(_) to Iceberg Timestamp without zone type (microseconds)
+ *   <li>map Flink LocalZonedTimestampType(_) to Iceberg Timestamp with zone type (microseconds)
+ *   <li>map Flink MultiSetType to Iceberg Map type(element, int)
  * </ul>
+ *
  * <p>
*/ public class FlinkSchemaUtil { - private FlinkSchemaUtil() { - } + private FlinkSchemaUtil() {} - /** - * Convert the flink table schema to apache iceberg schema. - */ + /** Convert the flink table schema to apache iceberg schema. */ public static Schema convert(TableSchema schema) { LogicalType schemaType = schema.toRowDataType().getLogicalType(); - Preconditions.checkArgument(schemaType instanceof RowType, "Schema logical type should be RowType."); + Preconditions.checkArgument( + schemaType instanceof RowType, "Schema logical type should be RowType."); RowType root = (RowType) schemaType; Type converted = root.accept(new FlinkTypeToType(root)); @@ -75,8 +74,11 @@ private static Schema freshIdentifierFieldIds(Schema iSchema, TableSchema schema if (schema.getPrimaryKey().isPresent()) { for (String column : schema.getPrimaryKey().get().getColumns()) { Types.NestedField field = iSchema.findField(column); - Preconditions.checkNotNull(field, - "Cannot find field ID for the primary key column %s in schema %s", column, iSchema); + Preconditions.checkNotNull( + field, + "Cannot find field ID for the primary key column %s in schema %s", + column, + iSchema); identifierFieldIds.add(field.fieldId()); } } @@ -86,11 +88,11 @@ private static Schema freshIdentifierFieldIds(Schema iSchema, TableSchema schema /** * Convert a Flink {@link TableSchema} to a {@link Schema} based on the given schema. - *

<p>
- * This conversion does not assign new ids; it uses ids from the base schema.
- * <p>
- * Data types, field order, and nullability will match the Flink type. This conversion may return
- * a schema that is not compatible with base schema.
+ *
+ * <p>This conversion does not assign new ids; it uses ids from the base schema.
+ *
+ * <p>
Data types, field order, and nullability will match the Flink type. This conversion may + * return a schema that is not compatible with base schema. * * @param baseSchema a Schema on which conversion is based * @param flinkSchema a Flink TableSchema @@ -163,7 +165,8 @@ public static TableSchema toSchema(Schema schema) { List columns = Lists.newArrayListWithExpectedSize(identifierFieldIds.size()); for (Integer identifierFieldId : identifierFieldIds) { String columnName = schema.findColumnName(identifierFieldId); - Preconditions.checkNotNull(columnName, "Cannot find field with id %s in schema %s", identifierFieldId, schema); + Preconditions.checkNotNull( + columnName, "Cannot find field with id %s in schema %s", identifierFieldId, schema); columns.add(columnName); } diff --git a/flink/v1.13/flink/src/main/java/org/apache/iceberg/flink/FlinkTypeToType.java b/flink/v1.13/flink/src/main/java/org/apache/iceberg/flink/FlinkTypeToType.java index 88276d86d3df..6f8bfef2ef44 100644 --- a/flink/v1.13/flink/src/main/java/org/apache/iceberg/flink/FlinkTypeToType.java +++ b/flink/v1.13/flink/src/main/java/org/apache/iceberg/flink/FlinkTypeToType.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.flink; import java.util.List; @@ -176,9 +175,10 @@ public Type visit(RowType rowType) { List newFields = Lists.newArrayListWithExpectedSize(rowType.getFieldCount()); boolean isRoot = root == rowType; - List types = rowType.getFields().stream() - .map(f -> f.getType().accept(this)) - .collect(Collectors.toList()); + List types = + rowType.getFields().stream() + .map(f -> f.getType().accept(this)) + .collect(Collectors.toList()); for (int i = 0; i < rowType.getFieldCount(); i++) { int id = isRoot ? i : getNextId(); diff --git a/flink/v1.13/flink/src/main/java/org/apache/iceberg/flink/FlinkTypeVisitor.java b/flink/v1.13/flink/src/main/java/org/apache/iceberg/flink/FlinkTypeVisitor.java index 9d1a3c492cd7..f3de2416088c 100644 --- a/flink/v1.13/flink/src/main/java/org/apache/iceberg/flink/FlinkTypeVisitor.java +++ b/flink/v1.13/flink/src/main/java/org/apache/iceberg/flink/FlinkTypeVisitor.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.flink; import org.apache.flink.table.types.logical.DayTimeIntervalType; diff --git a/flink/v1.13/flink/src/main/java/org/apache/iceberg/flink/FlinkWriteConf.java b/flink/v1.13/flink/src/main/java/org/apache/iceberg/flink/FlinkWriteConf.java index 9c3b08139996..5f02293f6bfe 100644 --- a/flink/v1.13/flink/src/main/java/org/apache/iceberg/flink/FlinkWriteConf.java +++ b/flink/v1.13/flink/src/main/java/org/apache/iceberg/flink/FlinkWriteConf.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.flink; import java.util.Locale; @@ -29,29 +28,34 @@ /** * A class for common Iceberg configs for Flink writes. - *

<p>
- * If a config is set at multiple levels, the following order of precedence is used (top to bottom):
+ *
+ * <p>If a config is set at multiple levels, the following order of precedence is used (top to
+ * bottom):
+ *
  * <ol>
- *   <li>Write options</li>
- *   <li>flink ReadableConfig</li>
- *   <li>Table metadata</li>
+ *   <li>Write options
+ *   <li>flink ReadableConfig
+ *   <li>Table metadata
  * </ol>
- * The most specific value is set in write options and takes precedence over all other configs.
- * If no write option is provided, this class checks the flink configuration for any overrides.
- * If no applicable value is found in the write options, this class uses the table metadata.
- * <p>
- * Note this class is NOT meant to be serialized.
+ *
+ * The most specific value is set in write options and takes precedence over all other configs. If
+ * no write option is provided, this class checks the flink configuration for any overrides. If no
+ * applicable value is found in the write options, this class uses the table metadata.
+ *
+ * <p>
Note this class is NOT meant to be serialized. */ public class FlinkWriteConf { private final FlinkConfParser confParser; - public FlinkWriteConf(Table table, Map writeOptions, ReadableConfig readableConfig) { + public FlinkWriteConf( + Table table, Map writeOptions, ReadableConfig readableConfig) { this.confParser = new FlinkConfParser(table, writeOptions, readableConfig); } public boolean overwriteMode() { - return confParser.booleanConf() + return confParser + .booleanConf() .option(FlinkWriteOptions.OVERWRITE_MODE.key()) .flinkConfig(FlinkWriteOptions.OVERWRITE_MODE) .defaultValue(FlinkWriteOptions.OVERWRITE_MODE.defaultValue()) @@ -59,7 +63,8 @@ public boolean overwriteMode() { } public boolean upsertMode() { - return confParser.booleanConf() + return confParser + .booleanConf() .option(FlinkWriteOptions.WRITE_UPSERT_ENABLED.key()) .flinkConfig(FlinkWriteOptions.WRITE_UPSERT_ENABLED) .tableProperty(TableProperties.UPSERT_ENABLED) @@ -68,17 +73,20 @@ public boolean upsertMode() { } public FileFormat dataFileFormat() { - String valueAsString = confParser.stringConf() - .option(FlinkWriteOptions.WRITE_FORMAT.key()) - .flinkConfig(FlinkWriteOptions.WRITE_FORMAT) - .tableProperty(TableProperties.DEFAULT_FILE_FORMAT) - .defaultValue(TableProperties.DEFAULT_FILE_FORMAT_DEFAULT) - .parse(); + String valueAsString = + confParser + .stringConf() + .option(FlinkWriteOptions.WRITE_FORMAT.key()) + .flinkConfig(FlinkWriteOptions.WRITE_FORMAT) + .tableProperty(TableProperties.DEFAULT_FILE_FORMAT) + .defaultValue(TableProperties.DEFAULT_FILE_FORMAT_DEFAULT) + .parse(); return FileFormat.valueOf(valueAsString.toUpperCase(Locale.ENGLISH)); } public long targetDataFileSize() { - return confParser.longConf() + return confParser + .longConf() .option(FlinkWriteOptions.TARGET_FILE_SIZE_BYTES.key()) .flinkConfig(FlinkWriteOptions.TARGET_FILE_SIZE_BYTES) .tableProperty(TableProperties.WRITE_TARGET_FILE_SIZE_BYTES) @@ -87,12 +95,14 @@ public long targetDataFileSize() { } public DistributionMode distributionMode() { - String modeName = confParser.stringConf() - .option(FlinkWriteOptions.DISTRIBUTION_MODE.key()) - .flinkConfig(FlinkWriteOptions.DISTRIBUTION_MODE) - .tableProperty(TableProperties.WRITE_DISTRIBUTION_MODE) - .defaultValue(TableProperties.WRITE_DISTRIBUTION_MODE_NONE) - .parse(); + String modeName = + confParser + .stringConf() + .option(FlinkWriteOptions.DISTRIBUTION_MODE.key()) + .flinkConfig(FlinkWriteOptions.DISTRIBUTION_MODE) + .tableProperty(TableProperties.WRITE_DISTRIBUTION_MODE) + .defaultValue(TableProperties.WRITE_DISTRIBUTION_MODE_NONE) + .parse(); return DistributionMode.fromName(modeName); } } diff --git a/flink/v1.13/flink/src/main/java/org/apache/iceberg/flink/FlinkWriteOptions.java b/flink/v1.13/flink/src/main/java/org/apache/iceberg/flink/FlinkWriteOptions.java index d0dc9c7fdeb1..a3091d5779c7 100644 --- a/flink/v1.13/flink/src/main/java/org/apache/iceberg/flink/FlinkWriteOptions.java +++ b/flink/v1.13/flink/src/main/java/org/apache/iceberg/flink/FlinkWriteOptions.java @@ -16,42 +16,32 @@ * specific language governing permissions and limitations * under the License. 
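As a rough illustration of that precedence, here is a minimal sketch that only assumes the public FlinkWriteConf constructor and the "write-format" option key shown above: a per-sink write option wins over both the Flink configuration and the table's write.format.default property.

import java.util.Collections;
import java.util.Map;
import org.apache.flink.configuration.Configuration;
import org.apache.iceberg.FileFormat;
import org.apache.iceberg.Table;
import org.apache.iceberg.flink.FlinkWriteConf;

class WriteConfPrecedenceSketch {
  static FileFormat resolveFileFormat(Table table) {
    // Even if the table property write.format.default says "parquet",
    // the sink-level write option takes precedence, so this resolves to ORC.
    Map<String, String> writeOptions = Collections.singletonMap("write-format", "orc");
    Configuration flinkConf = new Configuration(); // no override set at the Flink config level
    FlinkWriteConf writeConf = new FlinkWriteConf(table, writeOptions, flinkConf);
    return writeConf.dataFileFormat();
  }
}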
*/ - package org.apache.iceberg.flink; import org.apache.flink.configuration.ConfigOption; import org.apache.flink.configuration.ConfigOptions; -/** - * Flink sink write options - */ +/** Flink sink write options */ public class FlinkWriteOptions { - private FlinkWriteOptions() { - } + private FlinkWriteOptions() {} // File format for write operations(default: Table write.format.default ) public static final ConfigOption WRITE_FORMAT = - ConfigOptions.key("write-format") - .stringType().noDefaultValue(); + ConfigOptions.key("write-format").stringType().noDefaultValue(); // Overrides this table's write.target-file-size-bytes public static final ConfigOption TARGET_FILE_SIZE_BYTES = - ConfigOptions.key("target-file-size-bytes") - .longType().noDefaultValue(); + ConfigOptions.key("target-file-size-bytes").longType().noDefaultValue(); // Overrides this table's write.upsert.enabled public static final ConfigOption WRITE_UPSERT_ENABLED = - ConfigOptions.key("upsert-enabled") - .booleanType().noDefaultValue(); + ConfigOptions.key("upsert-enabled").booleanType().noDefaultValue(); public static final ConfigOption OVERWRITE_MODE = - ConfigOptions.key("overwrite-enabled") - .booleanType().defaultValue(false); + ConfigOptions.key("overwrite-enabled").booleanType().defaultValue(false); // Overrides the table's write.distribution-mode public static final ConfigOption DISTRIBUTION_MODE = - ConfigOptions.key("distribution-mode") - .stringType().noDefaultValue(); - + ConfigOptions.key("distribution-mode").stringType().noDefaultValue(); } diff --git a/flink/v1.13/flink/src/main/java/org/apache/iceberg/flink/IcebergTableSink.java b/flink/v1.13/flink/src/main/java/org/apache/iceberg/flink/IcebergTableSink.java index 93cf5ada7840..21fb196d3d58 100644 --- a/flink/v1.13/flink/src/main/java/org/apache/iceberg/flink/IcebergTableSink.java +++ b/flink/v1.13/flink/src/main/java/org/apache/iceberg/flink/IcebergTableSink.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.flink; import java.util.List; @@ -52,24 +51,27 @@ public IcebergTableSink(TableLoader tableLoader, TableSchema tableSchema) { @Override public SinkRuntimeProvider getSinkRuntimeProvider(Context context) { - Preconditions.checkState(!overwrite || context.isBounded(), + Preconditions.checkState( + !overwrite || context.isBounded(), "Unbounded data stream doesn't support overwrite operation."); - List equalityColumns = tableSchema.getPrimaryKey() - .map(UniqueConstraint::getColumns) - .orElseGet(ImmutableList::of); + List equalityColumns = + tableSchema.getPrimaryKey().map(UniqueConstraint::getColumns).orElseGet(ImmutableList::of); - return (DataStreamSinkProvider) dataStream -> FlinkSink.forRowData(dataStream) - .tableLoader(tableLoader) - .tableSchema(tableSchema) - .equalityFieldColumns(equalityColumns) - .overwrite(overwrite) - .append(); + return (DataStreamSinkProvider) + dataStream -> + FlinkSink.forRowData(dataStream) + .tableLoader(tableLoader) + .tableSchema(tableSchema) + .equalityFieldColumns(equalityColumns) + .overwrite(overwrite) + .append(); } @Override public void applyStaticPartition(Map partition) { - // The flink's PartitionFanoutWriter will handle the static partition write policy automatically. + // The flink's PartitionFanoutWriter will handle the static partition write policy + // automatically. 
} @Override diff --git a/flink/v1.13/flink/src/main/java/org/apache/iceberg/flink/IcebergTableSource.java b/flink/v1.13/flink/src/main/java/org/apache/iceberg/flink/IcebergTableSource.java index dd8f6454ebc4..3bd7335f74c5 100644 --- a/flink/v1.13/flink/src/main/java/org/apache/iceberg/flink/IcebergTableSource.java +++ b/flink/v1.13/flink/src/main/java/org/apache/iceberg/flink/IcebergTableSource.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.flink; import java.util.Arrays; @@ -43,11 +42,12 @@ import org.apache.iceberg.relocated.com.google.common.collect.ImmutableList; import org.apache.iceberg.relocated.com.google.common.collect.Lists; -/** - * Flink Iceberg table source. - */ +/** Flink Iceberg table source. */ public class IcebergTableSource - implements ScanTableSource, SupportsProjectionPushDown, SupportsFilterPushDown, SupportsLimitPushDown { + implements ScanTableSource, + SupportsProjectionPushDown, + SupportsFilterPushDown, + SupportsLimitPushDown { private int[] projectedFields; private long limit; @@ -70,14 +70,23 @@ private IcebergTableSource(IcebergTableSource toCopy) { this.readableConfig = toCopy.readableConfig; } - public IcebergTableSource(TableLoader loader, TableSchema schema, Map properties, - ReadableConfig readableConfig) { + public IcebergTableSource( + TableLoader loader, + TableSchema schema, + Map properties, + ReadableConfig readableConfig) { this(loader, schema, properties, null, false, -1, ImmutableList.of(), readableConfig); } - private IcebergTableSource(TableLoader loader, TableSchema schema, Map properties, - int[] projectedFields, boolean isLimitPushDown, - long limit, List filters, ReadableConfig readableConfig) { + private IcebergTableSource( + TableLoader loader, + TableSchema schema, + Map properties, + int[] projectedFields, + boolean isLimitPushDown, + long limit, + List filters, + ReadableConfig readableConfig) { this.loader = loader; this.schema = schema; this.properties = properties; @@ -92,8 +101,8 @@ private IcebergTableSource(TableLoader loader, TableSchema schema, Map fullNames[i]).toArray(String[]::new), - Arrays.stream(projectedFields).mapToObj(i -> fullTypes[i]).toArray(DataType[]::new)).build(); + return TableSchema.builder() + .fields( + Arrays.stream(projectedFields).mapToObj(i -> fullNames[i]).toArray(String[]::new), + Arrays.stream(projectedFields).mapToObj(i -> fullTypes[i]).toArray(DataType[]::new)) + .build(); } } diff --git a/flink/v1.13/flink/src/main/java/org/apache/iceberg/flink/RowDataWrapper.java b/flink/v1.13/flink/src/main/java/org/apache/iceberg/flink/RowDataWrapper.java index 401e9db65992..d4cec7a3e80b 100644 --- a/flink/v1.13/flink/src/main/java/org/apache/iceberg/flink/RowDataWrapper.java +++ b/flink/v1.13/flink/src/main/java/org/apache/iceberg/flink/RowDataWrapper.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.flink; import java.lang.reflect.Array; @@ -77,7 +76,8 @@ public T get(int pos, Class javaClass) { @Override public void set(int pos, T value) { - throw new UnsupportedOperationException("Could not set a field in the RowDataWrapper because rowData is read-only"); + throw new UnsupportedOperationException( + "Could not set a field in the RowDataWrapper because rowData is read-only"); } private interface PositionalGetter { @@ -104,16 +104,19 @@ private static PositionalGetter buildGetter(LogicalType logicalType, Type typ case DECIMAL: DecimalType decimalType = (DecimalType) logicalType; - return (row, pos) -> row.getDecimal(pos, decimalType.getPrecision(), decimalType.getScale()).toBigDecimal(); + return (row, pos) -> + row.getDecimal(pos, decimalType.getPrecision(), decimalType.getScale()).toBigDecimal(); case TIME_WITHOUT_TIME_ZONE: - // Time in RowData is in milliseconds (Integer), while iceberg's time is microseconds (Long). + // Time in RowData is in milliseconds (Integer), while iceberg's time is microseconds + // (Long). return (row, pos) -> ((long) row.getInt(pos)) * 1_000; case TIMESTAMP_WITHOUT_TIME_ZONE: TimestampType timestampType = (TimestampType) logicalType; return (row, pos) -> { - LocalDateTime localDateTime = row.getTimestamp(pos, timestampType.getPrecision()).toLocalDateTime(); + LocalDateTime localDateTime = + row.getTimestamp(pos, timestampType.getPrecision()).toLocalDateTime(); return DateTimeUtil.microsFromTimestamp(localDateTime); }; @@ -121,7 +124,8 @@ private static PositionalGetter buildGetter(LogicalType logicalType, Type typ LocalZonedTimestampType lzTs = (LocalZonedTimestampType) logicalType; return (row, pos) -> { TimestampData timestampData = row.getTimestamp(pos, lzTs.getPrecision()); - return timestampData.getMillisecond() * 1000 + timestampData.getNanoOfMillisecond() / 1000; + return timestampData.getMillisecond() * 1000 + + timestampData.getNanoOfMillisecond() / 1000; }; case ROW: diff --git a/flink/v1.13/flink/src/main/java/org/apache/iceberg/flink/TableLoader.java b/flink/v1.13/flink/src/main/java/org/apache/iceberg/flink/TableLoader.java index ebcb1fb0b7b4..e128badb8461 100644 --- a/flink/v1.13/flink/src/main/java/org/apache/iceberg/flink/TableLoader.java +++ b/flink/v1.13/flink/src/main/java/org/apache/iceberg/flink/TableLoader.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.flink; import java.io.Closeable; @@ -31,9 +30,9 @@ import org.apache.iceberg.relocated.com.google.common.base.MoreObjects; /** - * Serializable loader to load an Iceberg {@link Table}. - * Flink needs to get {@link Table} objects in the cluster (for example, to get splits), not just on the client side. - * So we need an Iceberg table loader to get the {@link Table} object. + * Serializable loader to load an Iceberg {@link Table}. Flink needs to get {@link Table} objects in + * the cluster (for example, to get splits), not just on the client side. So we need an Iceberg + * table loader to get the {@link Table} object. 
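That is also why sources and sinks are handed a TableLoader rather than a Table: only the loader is serialized to the task managers, which open it and resolve the table locally. A minimal sketch, assuming the fromHadoopTable factory and the open()/loadTable() lifecycle of this interface (the path is hypothetical):

import org.apache.iceberg.Table;
import org.apache.iceberg.flink.TableLoader;

class TableLoaderSketch {
  static Table loadOnTaskManager() {
    // Only this small, serializable loader crosses the wire to the cluster.
    TableLoader loader = TableLoader.fromHadoopTable("hdfs://nn:8020/warehouse/db/tbl");
    loader.open(); // initialize the underlying table access on this node
    return loader.loadTable(); // resolve the Iceberg Table where it is actually needed
    // the caller closes the loader when the job is torn down
  }
}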
*/ public interface TableLoader extends Closeable, Serializable { @@ -78,14 +77,11 @@ public Table loadTable() { } @Override - public void close() { - } + public void close() {} @Override public String toString() { - return MoreObjects.toStringHelper(this) - .add("location", location) - .toString(); + return MoreObjects.toStringHelper(this).add("location", location).toString(); } } diff --git a/flink/v1.13/flink/src/main/java/org/apache/iceberg/flink/TypeToFlinkType.java b/flink/v1.13/flink/src/main/java/org/apache/iceberg/flink/TypeToFlinkType.java index cf594b364f5f..f8f1b74b1ceb 100644 --- a/flink/v1.13/flink/src/main/java/org/apache/iceberg/flink/TypeToFlinkType.java +++ b/flink/v1.13/flink/src/main/java/org/apache/iceberg/flink/TypeToFlinkType.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.flink; import java.util.List; @@ -44,8 +43,7 @@ import org.apache.iceberg.types.Types; class TypeToFlinkType extends TypeUtil.SchemaVisitor { - TypeToFlinkType() { - } + TypeToFlinkType() {} @Override public LogicalType schema(Schema schema, LogicalType structType) { @@ -60,8 +58,8 @@ public LogicalType struct(Types.StructType struct, List fieldResult for (int i = 0; i < fields.size(); i += 1) { Types.NestedField field = fields.get(i); LogicalType type = fieldResults.get(i); - RowType.RowField flinkField = new RowType.RowField( - field.name(), type.copy(field.isOptional()), field.doc()); + RowType.RowField flinkField = + new RowType.RowField(field.name(), type.copy(field.isOptional()), field.doc()); flinkFields.add(flinkField); } @@ -100,9 +98,11 @@ public LogicalType primitive(Type.PrimitiveType primitive) { case DATE: return new DateType(); case TIME: - // For the type: Flink only support TimeType with default precision (second) now. The precision of time is + // For the type: Flink only support TimeType with default precision (second) now. The + // precision of time is // not supported in Flink, so we can think of it as a simple time type directly. - // For the data: Flink uses int that support mills to represent time data, so it supports mills precision. + // For the data: Flink uses int that support mills to represent time data, so it supports + // mills precision. return new TimeType(); case TIMESTAMP: Types.TimestampType timestamp = (Types.TimestampType) primitive; diff --git a/flink/v1.13/flink/src/main/java/org/apache/iceberg/flink/actions/Actions.java b/flink/v1.13/flink/src/main/java/org/apache/iceberg/flink/actions/Actions.java index 98702ceb57f1..06ac54617ae6 100644 --- a/flink/v1.13/flink/src/main/java/org/apache/iceberg/flink/actions/Actions.java +++ b/flink/v1.13/flink/src/main/java/org/apache/iceberg/flink/actions/Actions.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.flink.actions; import org.apache.flink.configuration.Configuration; @@ -26,9 +25,10 @@ public class Actions { - public static final Configuration CONFIG = new Configuration() - // disable classloader check as Avro may cache class/object in the serializers. - .set(CoreOptions.CHECK_LEAKED_CLASSLOADER, false); + public static final Configuration CONFIG = + new Configuration() + // disable classloader check as Avro may cache class/object in the serializers. 
+ .set(CoreOptions.CHECK_LEAKED_CLASSLOADER, false); private StreamExecutionEnvironment env; private Table table; @@ -49,5 +49,4 @@ public static Actions forTable(Table table) { public RewriteDataFilesAction rewriteDataFiles() { return new RewriteDataFilesAction(env, table); } - } diff --git a/flink/v1.13/flink/src/main/java/org/apache/iceberg/flink/actions/RewriteDataFilesAction.java b/flink/v1.13/flink/src/main/java/org/apache/iceberg/flink/actions/RewriteDataFilesAction.java index cbd4aed73c8a..9876bb3861c4 100644 --- a/flink/v1.13/flink/src/main/java/org/apache/iceberg/flink/actions/RewriteDataFilesAction.java +++ b/flink/v1.13/flink/src/main/java/org/apache/iceberg/flink/actions/RewriteDataFilesAction.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.flink.actions; import java.util.List; @@ -51,7 +50,8 @@ protected List rewriteDataForTasks(List combinedScan int size = combinedScanTasks.size(); int parallelism = Math.min(size, maxParallelism); DataStream dataStream = env.fromCollection(combinedScanTasks); - RowDataRewriter rowDataRewriter = new RowDataRewriter(table(), caseSensitive(), fileIO(), encryptionManager()); + RowDataRewriter rowDataRewriter = + new RowDataRewriter(table(), caseSensitive(), fileIO(), encryptionManager()); try { return rowDataRewriter.rewriteDataForTasks(dataStream, parallelism); } catch (Exception e) { diff --git a/flink/v1.13/flink/src/main/java/org/apache/iceberg/flink/data/AvroWithFlinkSchemaVisitor.java b/flink/v1.13/flink/src/main/java/org/apache/iceberg/flink/data/AvroWithFlinkSchemaVisitor.java index 1ccc3b787e33..8103224a0b6c 100644 --- a/flink/v1.13/flink/src/main/java/org/apache/iceberg/flink/data/AvroWithFlinkSchemaVisitor.java +++ b/flink/v1.13/flink/src/main/java/org/apache/iceberg/flink/data/AvroWithFlinkSchemaVisitor.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.flink.data; import org.apache.flink.table.types.logical.ArrayType; @@ -29,7 +28,8 @@ import org.apache.iceberg.relocated.com.google.common.base.Preconditions; import org.apache.iceberg.util.Pair; -public abstract class AvroWithFlinkSchemaVisitor extends AvroWithPartnerByStructureVisitor { +public abstract class AvroWithFlinkSchemaVisitor + extends AvroWithPartnerByStructureVisitor { @Override protected boolean isStringType(LogicalType logicalType) { @@ -43,7 +43,8 @@ protected boolean isMapType(LogicalType logicalType) { @Override protected LogicalType arrayElementType(LogicalType arrayType) { - Preconditions.checkArgument(arrayType instanceof ArrayType, "Invalid array: %s is not an array", arrayType); + Preconditions.checkArgument( + arrayType instanceof ArrayType, "Invalid array: %s is not an array", arrayType); return ((ArrayType) arrayType).getElementType(); } @@ -61,7 +62,8 @@ protected LogicalType mapValueType(LogicalType mapType) { @Override protected Pair fieldNameAndType(LogicalType structType, int pos) { - Preconditions.checkArgument(structType instanceof RowType, "Invalid struct: %s is not a struct", structType); + Preconditions.checkArgument( + structType instanceof RowType, "Invalid struct: %s is not a struct", structType); RowType.RowField field = ((RowType) structType).getFields().get(pos); return Pair.of(field.getName(), field.getType()); } diff --git a/flink/v1.13/flink/src/main/java/org/apache/iceberg/flink/data/FlinkAvroReader.java b/flink/v1.13/flink/src/main/java/org/apache/iceberg/flink/data/FlinkAvroReader.java index 991ef6336297..86404959735a 100644 --- a/flink/v1.13/flink/src/main/java/org/apache/iceberg/flink/data/FlinkAvroReader.java +++ b/flink/v1.13/flink/src/main/java/org/apache/iceberg/flink/data/FlinkAvroReader.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.flink.data; import java.io.IOException; @@ -49,10 +48,12 @@ public FlinkAvroReader(org.apache.iceberg.Schema expectedSchema, Schema readSche } @SuppressWarnings("unchecked") - public FlinkAvroReader(org.apache.iceberg.Schema expectedSchema, Schema readSchema, Map constants) { + public FlinkAvroReader( + org.apache.iceberg.Schema expectedSchema, Schema readSchema, Map constants) { this.readSchema = readSchema; - this.reader = (ValueReader) AvroSchemaWithTypeVisitor - .visit(expectedSchema, readSchema, new ReadBuilder(constants)); + this.reader = + (ValueReader) + AvroSchemaWithTypeVisitor.visit(expectedSchema, readSchema, new ReadBuilder(constants)); } @Override @@ -80,8 +81,8 @@ private ReadBuilder(Map idToConstant) { } @Override - public ValueReader record(Types.StructType expected, Schema record, List names, - List> fields) { + public ValueReader record( + Types.StructType expected, Schema record, List names, List> fields) { return FlinkValueReaders.struct(fields, expected.asStructType(), idToConstant); } @@ -91,13 +92,14 @@ public ValueReader union(Type expected, Schema union, List> op } @Override - public ValueReader array(Types.ListType expected, Schema array, ValueReader elementReader) { + public ValueReader array( + Types.ListType expected, Schema array, ValueReader elementReader) { return FlinkValueReaders.array(elementReader); } @Override - public ValueReader map(Types.MapType expected, Schema map, - ValueReader keyReader, ValueReader valueReader) { + public ValueReader map( + Types.MapType expected, Schema map, ValueReader keyReader, ValueReader valueReader) { return FlinkValueReaders.arrayMap(keyReader, valueReader); } diff --git a/flink/v1.13/flink/src/main/java/org/apache/iceberg/flink/data/FlinkAvroWriter.java b/flink/v1.13/flink/src/main/java/org/apache/iceberg/flink/data/FlinkAvroWriter.java index b069a35d3bbb..873e65783119 100644 --- a/flink/v1.13/flink/src/main/java/org/apache/iceberg/flink/data/FlinkAvroWriter.java +++ b/flink/v1.13/flink/src/main/java/org/apache/iceberg/flink/data/FlinkAvroWriter.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.flink.data; import java.io.IOException; @@ -47,8 +46,9 @@ public FlinkAvroWriter(RowType rowType) { @Override @SuppressWarnings("unchecked") public void setSchema(Schema schema) { - this.writer = (ValueWriter) AvroWithFlinkSchemaVisitor - .visit(rowType, schema, new WriteBuilder()); + this.writer = + (ValueWriter) + AvroWithFlinkSchemaVisitor.visit(rowType, schema, new WriteBuilder()); } @Override @@ -63,17 +63,23 @@ public Stream metrics() { private static class WriteBuilder extends AvroWithFlinkSchemaVisitor> { @Override - public ValueWriter record(LogicalType struct, Schema record, List names, List> fields) { - return FlinkValueWriters.row(fields, IntStream.range(0, names.size()) - .mapToObj(i -> fieldNameAndType(struct, i).second()).collect(Collectors.toList())); + public ValueWriter record( + LogicalType struct, Schema record, List names, List> fields) { + return FlinkValueWriters.row( + fields, + IntStream.range(0, names.size()) + .mapToObj(i -> fieldNameAndType(struct, i).second()) + .collect(Collectors.toList())); } @Override public ValueWriter union(LogicalType type, Schema union, List> options) { - Preconditions.checkArgument(options.contains(ValueWriters.nulls()), - "Cannot create writer for non-option union: %s", union); - Preconditions.checkArgument(options.size() == 2, - "Cannot create writer for non-option union: %s", union); + Preconditions.checkArgument( + options.contains(ValueWriters.nulls()), + "Cannot create writer for non-option union: %s", + union); + Preconditions.checkArgument( + options.size() == 2, "Cannot create writer for non-option union: %s", union); if (union.getTypes().get(0).getType() == Schema.Type.NULL) { return ValueWriters.option(0, options.get(1)); } else { @@ -88,12 +94,15 @@ public ValueWriter array(LogicalType sArray, Schema array, ValueWriter ele @Override public ValueWriter map(LogicalType sMap, Schema map, ValueWriter valueReader) { - return FlinkValueWriters.map(FlinkValueWriters.strings(), mapKeyType(sMap), valueReader, mapValueType(sMap)); + return FlinkValueWriters.map( + FlinkValueWriters.strings(), mapKeyType(sMap), valueReader, mapValueType(sMap)); } @Override - public ValueWriter map(LogicalType sMap, Schema map, ValueWriter keyWriter, ValueWriter valueWriter) { - return FlinkValueWriters.arrayMap(keyWriter, mapKeyType(sMap), valueWriter, mapValueType(sMap)); + public ValueWriter map( + LogicalType sMap, Schema map, ValueWriter keyWriter, ValueWriter valueWriter) { + return FlinkValueWriters.arrayMap( + keyWriter, mapKeyType(sMap), valueWriter, mapValueType(sMap)); } @Override diff --git a/flink/v1.13/flink/src/main/java/org/apache/iceberg/flink/data/FlinkOrcReader.java b/flink/v1.13/flink/src/main/java/org/apache/iceberg/flink/data/FlinkOrcReader.java index 4c4e2050263b..65b9d44ad4b8 100644 --- a/flink/v1.13/flink/src/main/java/org/apache/iceberg/flink/data/FlinkOrcReader.java +++ b/flink/v1.13/flink/src/main/java/org/apache/iceberg/flink/data/FlinkOrcReader.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.flink.data; import java.util.List; @@ -44,7 +43,8 @@ public FlinkOrcReader(Schema iSchema, TypeDescription readSchema) { } public FlinkOrcReader(Schema iSchema, TypeDescription readSchema, Map idToConstant) { - this.reader = OrcSchemaWithTypeVisitor.visit(iSchema, readSchema, new ReadBuilder(idToConstant)); + this.reader = + OrcSchemaWithTypeVisitor.visit(iSchema, readSchema, new ReadBuilder(idToConstant)); } @Override @@ -65,21 +65,26 @@ private ReadBuilder(Map idToConstant) { } @Override - public OrcValueReader record(Types.StructType iStruct, TypeDescription record, List names, - List> fields) { + public OrcValueReader record( + Types.StructType iStruct, + TypeDescription record, + List names, + List> fields) { return FlinkOrcReaders.struct(fields, iStruct, idToConstant); } @Override - public OrcValueReader list(Types.ListType iList, TypeDescription array, - OrcValueReader elementReader) { + public OrcValueReader list( + Types.ListType iList, TypeDescription array, OrcValueReader elementReader) { return FlinkOrcReaders.array(elementReader); } @Override - public OrcValueReader map(Types.MapType iMap, TypeDescription map, - OrcValueReader keyReader, - OrcValueReader valueReader) { + public OrcValueReader map( + Types.MapType iMap, + TypeDescription map, + OrcValueReader keyReader, + OrcValueReader valueReader) { return FlinkOrcReaders.map(keyReader, valueReader); } @@ -117,8 +122,9 @@ public OrcValueReader primitive(Type.PrimitiveType iPrimitive, TypeDescriptio Types.DecimalType decimalType = (Types.DecimalType) iPrimitive; return FlinkOrcReaders.decimals(decimalType.precision(), decimalType.scale()); default: - throw new IllegalArgumentException(String.format("Invalid iceberg type %s corresponding to ORC type %s", - iPrimitive, primitive)); + throw new IllegalArgumentException( + String.format( + "Invalid iceberg type %s corresponding to ORC type %s", iPrimitive, primitive)); } } } diff --git a/flink/v1.13/flink/src/main/java/org/apache/iceberg/flink/data/FlinkOrcReaders.java b/flink/v1.13/flink/src/main/java/org/apache/iceberg/flink/data/FlinkOrcReaders.java index 744a05eb2d21..7a4a15c7e600 100644 --- a/flink/v1.13/flink/src/main/java/org/apache/iceberg/flink/data/FlinkOrcReaders.java +++ b/flink/v1.13/flink/src/main/java/org/apache/iceberg/flink/data/FlinkOrcReaders.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.flink.data; import java.math.BigDecimal; @@ -50,8 +49,7 @@ import org.apache.orc.storage.serde2.io.HiveDecimalWritable; class FlinkOrcReaders { - private FlinkOrcReaders() { - } + private FlinkOrcReaders() {} static OrcValueReader strings() { return StringReader.INSTANCE; @@ -87,13 +85,13 @@ static OrcValueReader array(OrcValueReader elementReader) { return new ArrayReader<>(elementReader); } - public static OrcValueReader map(OrcValueReader keyReader, OrcValueReader valueReader) { + public static OrcValueReader map( + OrcValueReader keyReader, OrcValueReader valueReader) { return new MapReader<>(keyReader, valueReader); } - public static OrcValueReader struct(List> readers, - Types.StructType struct, - Map idToConstant) { + public static OrcValueReader struct( + List> readers, Types.StructType struct, Map idToConstant) { return new StructReader(readers, struct, idToConstant); } @@ -103,7 +101,8 @@ private static class StringReader implements OrcValueReader { @Override public StringData nonNullRead(ColumnVector vector, int row) { BytesColumnVector bytesVector = (BytesColumnVector) vector; - return StringData.fromBytes(bytesVector.vector[row], bytesVector.start[row], bytesVector.length[row]); + return StringData.fromBytes( + bytesVector.vector[row], bytesVector.start[row], bytesVector.length[row]); } } @@ -130,8 +129,12 @@ public DecimalData nonNullRead(ColumnVector vector, int row) { HiveDecimalWritable value = ((DecimalColumnVector) vector).vector[row]; // The hive ORC writer may will adjust the scale of decimal data. - Preconditions.checkArgument(value.precision() <= precision, - "Cannot read value as decimal(%s,%s), too large: %s", precision, scale, value); + Preconditions.checkArgument( + value.precision() <= precision, + "Cannot read value as decimal(%s,%s), too large: %s", + precision, + scale, + value); return DecimalData.fromUnscaledLong(value.serialize64(scale), precision, scale); } @@ -148,10 +151,15 @@ private static class Decimal38Reader implements OrcValueReader { @Override public DecimalData nonNullRead(ColumnVector vector, int row) { - BigDecimal value = ((DecimalColumnVector) vector).vector[row].getHiveDecimal().bigDecimalValue(); + BigDecimal value = + ((DecimalColumnVector) vector).vector[row].getHiveDecimal().bigDecimalValue(); - Preconditions.checkArgument(value.precision() <= precision, - "Cannot read value as decimal(%s,%s), too large: %s", precision, scale, value); + Preconditions.checkArgument( + value.precision() <= precision, + "Cannot read value as decimal(%s,%s), too large: %s", + precision, + scale, + value); return DecimalData.fromBigDecimal(value, precision, scale); } @@ -174,9 +182,10 @@ private static class TimestampReader implements OrcValueReader { @Override public TimestampData nonNullRead(ColumnVector vector, int row) { TimestampColumnVector tcv = (TimestampColumnVector) vector; - LocalDateTime localDate = Instant.ofEpochSecond(Math.floorDiv(tcv.time[row], 1_000), tcv.nanos[row]) - .atOffset(ZoneOffset.UTC) - .toLocalDateTime(); + LocalDateTime localDate = + Instant.ofEpochSecond(Math.floorDiv(tcv.time[row], 1_000), tcv.nanos[row]) + .atOffset(ZoneOffset.UTC) + .toLocalDateTime(); return TimestampData.fromLocalDateTime(localDate); } } @@ -187,9 +196,10 @@ private static class TimestampTzReader implements OrcValueReader @Override public TimestampData nonNullRead(ColumnVector vector, int row) { TimestampColumnVector tcv = (TimestampColumnVector) vector; - Instant instant = Instant.ofEpochSecond(Math.floorDiv(tcv.time[row], 
1_000), tcv.nanos[row]) - .atOffset(ZoneOffset.UTC) - .toInstant(); + Instant instant = + Instant.ofEpochSecond(Math.floorDiv(tcv.time[row], 1_000), tcv.nanos[row]) + .atOffset(ZoneOffset.UTC) + .toInstant(); return TimestampData.fromInstant(instant); } } @@ -254,7 +264,8 @@ public void setBatchContext(long batchOffsetInFile) { private static class StructReader extends OrcValueReaders.StructReader { private final int numFields; - StructReader(List> readers, Types.StructType struct, Map idToConstant) { + StructReader( + List> readers, Types.StructType struct, Map idToConstant) { super(readers, struct, idToConstant); this.numFields = struct.fields().size(); } diff --git a/flink/v1.13/flink/src/main/java/org/apache/iceberg/flink/data/FlinkOrcWriter.java b/flink/v1.13/flink/src/main/java/org/apache/iceberg/flink/data/FlinkOrcWriter.java index 2eeb268095f5..6a31accffd22 100644 --- a/flink/v1.13/flink/src/main/java/org/apache/iceberg/flink/data/FlinkOrcWriter.java +++ b/flink/v1.13/flink/src/main/java/org/apache/iceberg/flink/data/FlinkOrcWriter.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.flink.data; import java.util.Deque; @@ -40,7 +39,9 @@ public class FlinkOrcWriter implements OrcRowWriter { private final FlinkOrcWriters.RowDataWriter writer; private FlinkOrcWriter(RowType rowType, Schema iSchema) { - this.writer = (FlinkOrcWriters.RowDataWriter) FlinkSchemaVisitor.visit(rowType, iSchema, new WriteBuilder()); + this.writer = + (FlinkOrcWriters.RowDataWriter) + FlinkSchemaVisitor.visit(rowType, iSchema, new WriteBuilder()); } public static OrcRowWriter buildWriter(RowType rowType, Schema iSchema) { @@ -66,8 +67,7 @@ public Stream> metrics() { private static class WriteBuilder extends FlinkSchemaVisitor> { private final Deque fieldIds = Lists.newLinkedList(); - private WriteBuilder() { - } + private WriteBuilder() {} @Override public void beforeField(Types.NestedField field) { @@ -80,20 +80,24 @@ public void afterField(Types.NestedField field) { } @Override - public OrcValueWriter record(Types.StructType iStruct, - List> results, - List fieldType) { + public OrcValueWriter record( + Types.StructType iStruct, List> results, List fieldType) { return FlinkOrcWriters.struct(results, fieldType); } @Override - public OrcValueWriter map(Types.MapType iMap, OrcValueWriter key, OrcValueWriter value, - LogicalType keyType, LogicalType valueType) { + public OrcValueWriter map( + Types.MapType iMap, + OrcValueWriter key, + OrcValueWriter value, + LogicalType keyType, + LogicalType valueType) { return FlinkOrcWriters.map(key, value, keyType, valueType); } @Override - public OrcValueWriter list(Types.ListType iList, OrcValueWriter element, LogicalType elementType) { + public OrcValueWriter list( + Types.ListType iList, OrcValueWriter element, LogicalType elementType) { return FlinkOrcWriters.list(element, elementType); } @@ -113,14 +117,20 @@ public OrcValueWriter primitive(Type.PrimitiveType iPrimitive, LogicalType fl case LONG: return GenericOrcWriters.longs(); case FLOAT: - Preconditions.checkArgument(fieldIds.peek() != null, - String.format("[BUG] Cannot find field id for primitive field with type %s. This is likely because id " + - "information is not properly pushed during schema visiting.", iPrimitive)); + Preconditions.checkArgument( + fieldIds.peek() != null, + String.format( + "[BUG] Cannot find field id for primitive field with type %s. 
This is likely because id " + + "information is not properly pushed during schema visiting.", + iPrimitive)); return GenericOrcWriters.floats(fieldIds.peek()); case DOUBLE: - Preconditions.checkArgument(fieldIds.peek() != null, - String.format("[BUG] Cannot find field id for primitive field with type %s. This is likely because id " + - "information is not properly pushed during schema visiting.", iPrimitive)); + Preconditions.checkArgument( + fieldIds.peek() != null, + String.format( + "[BUG] Cannot find field id for primitive field with type %s. This is likely because id " + + "information is not properly pushed during schema visiting.", + iPrimitive)); return GenericOrcWriters.doubles(fieldIds.peek()); case DATE: return FlinkOrcWriters.dates(); @@ -143,8 +153,10 @@ public OrcValueWriter primitive(Type.PrimitiveType iPrimitive, LogicalType fl Types.DecimalType decimalType = (Types.DecimalType) iPrimitive; return FlinkOrcWriters.decimals(decimalType.precision(), decimalType.scale()); default: - throw new IllegalArgumentException(String.format( - "Invalid iceberg type %s corresponding to Flink logical type %s", iPrimitive, flinkPrimitive)); + throw new IllegalArgumentException( + String.format( + "Invalid iceberg type %s corresponding to Flink logical type %s", + iPrimitive, flinkPrimitive)); } } } diff --git a/flink/v1.13/flink/src/main/java/org/apache/iceberg/flink/data/FlinkOrcWriters.java b/flink/v1.13/flink/src/main/java/org/apache/iceberg/flink/data/FlinkOrcWriters.java index 6b596ac2063c..9fe777a6f82d 100644 --- a/flink/v1.13/flink/src/main/java/org/apache/iceberg/flink/data/FlinkOrcWriters.java +++ b/flink/v1.13/flink/src/main/java/org/apache/iceberg/flink/data/FlinkOrcWriters.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.flink.data; import java.time.Instant; @@ -47,8 +46,7 @@ class FlinkOrcWriters { - private FlinkOrcWriters() { - } + private FlinkOrcWriters() {} static OrcValueWriter strings() { return StringWriter.INSTANCE; @@ -80,12 +78,16 @@ static OrcValueWriter decimals(int precision, int scale) { } } - static OrcValueWriter list(OrcValueWriter elementWriter, LogicalType elementType) { + static OrcValueWriter list( + OrcValueWriter elementWriter, LogicalType elementType) { return new ListWriter<>(elementWriter, elementType); } - static OrcValueWriter map(OrcValueWriter keyWriter, OrcValueWriter valueWriter, - LogicalType keyType, LogicalType valueType) { + static OrcValueWriter map( + OrcValueWriter keyWriter, + OrcValueWriter valueWriter, + LogicalType keyType, + LogicalType valueType) { return new MapWriter<>(keyWriter, valueWriter, keyType, valueType); } @@ -132,7 +134,8 @@ public void nonNullWrite(int rowId, TimestampData data, ColumnVector output) { cv.setIsUTC(true); // millis OffsetDateTime offsetDateTime = data.toInstant().atOffset(ZoneOffset.UTC); - cv.time[rowId] = offsetDateTime.toEpochSecond() * 1_000 + offsetDateTime.getNano() / 1_000_000; + cv.time[rowId] = + offsetDateTime.toEpochSecond() * 1_000 + offsetDateTime.getNano() / 1_000_000; // truncate nanos to only keep microsecond precision. 
cv.nanos[rowId] = (offsetDateTime.getNano() / 1_000) * 1_000; } @@ -163,12 +166,21 @@ private static class Decimal18Writer implements OrcValueWriter { @Override public void nonNullWrite(int rowId, DecimalData data, ColumnVector output) { - Preconditions.checkArgument(scale == data.scale(), - "Cannot write value as decimal(%s,%s), wrong scale: %s", precision, scale, data); - Preconditions.checkArgument(data.precision() <= precision, - "Cannot write value as decimal(%s,%s), too large: %s", precision, scale, data); - - ((DecimalColumnVector) output).vector[rowId].setFromLongAndScale(data.toUnscaledLong(), data.scale()); + Preconditions.checkArgument( + scale == data.scale(), + "Cannot write value as decimal(%s,%s), wrong scale: %s", + precision, + scale, + data); + Preconditions.checkArgument( + data.precision() <= precision, + "Cannot write value as decimal(%s,%s), too large: %s", + precision, + scale, + data); + + ((DecimalColumnVector) output) + .vector[rowId].setFromLongAndScale(data.toUnscaledLong(), data.scale()); } } @@ -183,12 +195,21 @@ private static class Decimal38Writer implements OrcValueWriter { @Override public void nonNullWrite(int rowId, DecimalData data, ColumnVector output) { - Preconditions.checkArgument(scale == data.scale(), - "Cannot write value as decimal(%s,%s), wrong scale: %s", precision, scale, data); - Preconditions.checkArgument(data.precision() <= precision, - "Cannot write value as decimal(%s,%s), too large: %s", precision, scale, data); - - ((DecimalColumnVector) output).vector[rowId].set(HiveDecimal.create(data.toBigDecimal(), false)); + Preconditions.checkArgument( + scale == data.scale(), + "Cannot write value as decimal(%s,%s), wrong scale: %s", + precision, + scale, + data); + Preconditions.checkArgument( + data.precision() <= precision, + "Cannot write value as decimal(%s,%s), too large: %s", + precision, + scale, + data); + + ((DecimalColumnVector) output) + .vector[rowId].set(HiveDecimal.create(data.toBigDecimal(), false)); } } @@ -221,7 +242,6 @@ public void nonNullWrite(int rowId, ArrayData data, ColumnVector output) { public Stream> metrics() { return elementWriter.metrics(); } - } static class MapWriter implements OrcValueWriter { @@ -230,8 +250,11 @@ static class MapWriter implements OrcValueWriter { private final ArrayData.ElementGetter keyGetter; private final ArrayData.ElementGetter valueGetter; - MapWriter(OrcValueWriter keyWriter, OrcValueWriter valueWriter, - LogicalType keyType, LogicalType valueType) { + MapWriter( + OrcValueWriter keyWriter, + OrcValueWriter valueWriter, + LogicalType keyType, + LogicalType valueType) { this.keyWriter = keyWriter; this.valueWriter = valueWriter; this.keyGetter = ArrayData.createElementGetter(keyType); @@ -282,7 +305,6 @@ static class RowDataWriter extends GenericOrcWriters.StructWriter { protected Object get(RowData struct, int index) { return fieldGetters.get(index).getFieldOrNull(struct); } - } private static void growColumnVector(ColumnVector cv, int requestedSize) { diff --git a/flink/v1.13/flink/src/main/java/org/apache/iceberg/flink/data/FlinkParquetReaders.java b/flink/v1.13/flink/src/main/java/org/apache/iceberg/flink/data/FlinkParquetReaders.java index 1c2e5e9c6a2b..2b21d77b70e0 100644 --- a/flink/v1.13/flink/src/main/java/org/apache/iceberg/flink/data/FlinkParquetReaders.java +++ b/flink/v1.13/flink/src/main/java/org/apache/iceberg/flink/data/FlinkParquetReaders.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.flink.data; import java.math.BigDecimal; @@ -56,20 +55,19 @@ import org.apache.parquet.schema.Type; public class FlinkParquetReaders { - private FlinkParquetReaders() { - } + private FlinkParquetReaders() {} - public static ParquetValueReader buildReader(Schema expectedSchema, MessageType fileSchema) { + public static ParquetValueReader buildReader( + Schema expectedSchema, MessageType fileSchema) { return buildReader(expectedSchema, fileSchema, ImmutableMap.of()); } @SuppressWarnings("unchecked") - public static ParquetValueReader buildReader(Schema expectedSchema, - MessageType fileSchema, - Map idToConstant) { - return (ParquetValueReader) TypeWithSchemaVisitor.visit(expectedSchema.asStruct(), fileSchema, - new ReadBuilder(fileSchema, idToConstant) - ); + public static ParquetValueReader buildReader( + Schema expectedSchema, MessageType fileSchema, Map idToConstant) { + return (ParquetValueReader) + TypeWithSchemaVisitor.visit( + expectedSchema.asStruct(), fileSchema, new ReadBuilder(fileSchema, idToConstant)); } private static class ReadBuilder extends TypeWithSchemaVisitor> { @@ -82,14 +80,14 @@ private static class ReadBuilder extends TypeWithSchemaVisitor message(Types.StructType expected, MessageType message, - List> fieldReaders) { + public ParquetValueReader message( + Types.StructType expected, MessageType message, List> fieldReaders) { return struct(expected, message.asGroupType(), fieldReaders); } @Override - public ParquetValueReader struct(Types.StructType expected, GroupType struct, - List> fieldReaders) { + public ParquetValueReader struct( + Types.StructType expected, GroupType struct, List> fieldReaders) { // match the expected struct's order Map> readersById = Maps.newHashMap(); Map typesById = Maps.newHashMap(); @@ -106,10 +104,10 @@ public ParquetValueReader struct(Types.StructType expected, GroupType s } } - List expectedFields = expected != null ? - expected.fields() : ImmutableList.of(); - List> reorderedFields = Lists.newArrayListWithExpectedSize( - expectedFields.size()); + List expectedFields = + expected != null ? 
expected.fields() : ImmutableList.of(); + List> reorderedFields = + Lists.newArrayListWithExpectedSize(expectedFields.size()); List types = Lists.newArrayListWithExpectedSize(expectedFields.size()); for (Types.NestedField field : expectedFields) { int id = field.fieldId(); @@ -139,8 +137,8 @@ public ParquetValueReader struct(Types.StructType expected, GroupType s } @Override - public ParquetValueReader list(Types.ListType expectedList, GroupType array, - ParquetValueReader elementReader) { + public ParquetValueReader list( + Types.ListType expectedList, GroupType array, ParquetValueReader elementReader) { if (expectedList == null) { return null; } @@ -154,13 +152,16 @@ public ParquetValueReader list(Types.ListType expectedList, GroupType array, Type elementType = repeated.getType(0); int elementD = type.getMaxDefinitionLevel(path(elementType.getName())) - 1; - return new ArrayReader<>(repeatedD, repeatedR, ParquetValueReaders.option(elementType, elementD, elementReader)); + return new ArrayReader<>( + repeatedD, repeatedR, ParquetValueReaders.option(elementType, elementD, elementReader)); } @Override - public ParquetValueReader map(Types.MapType expectedMap, GroupType map, - ParquetValueReader keyReader, - ParquetValueReader valueReader) { + public ParquetValueReader map( + Types.MapType expectedMap, + GroupType map, + ParquetValueReader keyReader, + ParquetValueReader valueReader) { if (expectedMap == null) { return null; } @@ -176,15 +177,17 @@ public ParquetValueReader map(Types.MapType expectedMap, GroupType map, Type valueType = repeatedKeyValue.getType(1); int valueD = type.getMaxDefinitionLevel(path(valueType.getName())) - 1; - return new MapReader<>(repeatedD, repeatedR, + return new MapReader<>( + repeatedD, + repeatedR, ParquetValueReaders.option(keyType, keyD, keyReader), ParquetValueReaders.option(valueType, valueD, valueReader)); } @Override @SuppressWarnings("CyclomaticComplexity") - public ParquetValueReader primitive(org.apache.iceberg.types.Type.PrimitiveType expected, - PrimitiveType primitive) { + public ParquetValueReader primitive( + org.apache.iceberg.types.Type.PrimitiveType expected, PrimitiveType primitive) { if (expected == null) { return null; } @@ -225,7 +228,8 @@ public ParquetValueReader primitive(org.apache.iceberg.types.Type.PrimitiveTy return new MillisToTimestampReader(desc); } case DECIMAL: - DecimalLogicalTypeAnnotation decimal = (DecimalLogicalTypeAnnotation) primitive.getLogicalTypeAnnotation(); + DecimalLogicalTypeAnnotation decimal = + (DecimalLogicalTypeAnnotation) primitive.getLogicalTypeAnnotation(); switch (primitive.getPrimitiveTypeName()) { case BINARY: case FIXED_LEN_BYTE_ARRAY: @@ -272,7 +276,8 @@ public ParquetValueReader primitive(org.apache.iceberg.types.Type.PrimitiveTy } } - private static class BinaryDecimalReader extends ParquetValueReaders.PrimitiveReader { + private static class BinaryDecimalReader + extends ParquetValueReaders.PrimitiveReader { private final int precision; private final int scale; @@ -291,7 +296,8 @@ public DecimalData read(DecimalData ignored) { } } - private static class IntegerDecimalReader extends ParquetValueReaders.PrimitiveReader { + private static class IntegerDecimalReader + extends ParquetValueReaders.PrimitiveReader { private final int precision; private final int scale; @@ -323,7 +329,8 @@ public DecimalData read(DecimalData ignored) { } } - private static class MicrosToTimestampTzReader extends ParquetValueReaders.UnboxedReader { + private static class MicrosToTimestampTzReader + extends 
ParquetValueReaders.UnboxedReader { MicrosToTimestampTzReader(ColumnDescriptor desc) { super(desc); } @@ -331,10 +338,11 @@ private static class MicrosToTimestampTzReader extends ParquetValueReaders.Unbox @Override public TimestampData read(TimestampData ignored) { long value = readLong(); - return TimestampData.fromLocalDateTime(Instant.ofEpochSecond(Math.floorDiv(value, 1000_000), - Math.floorMod(value, 1000_000) * 1000) - .atOffset(ZoneOffset.UTC) - .toLocalDateTime()); + return TimestampData.fromLocalDateTime( + Instant.ofEpochSecond( + Math.floorDiv(value, 1000_000), Math.floorMod(value, 1000_000) * 1000) + .atOffset(ZoneOffset.UTC) + .toLocalDateTime()); } @Override @@ -343,7 +351,8 @@ public long readLong() { } } - private static class MicrosToTimestampReader extends ParquetValueReaders.UnboxedReader { + private static class MicrosToTimestampReader + extends ParquetValueReaders.UnboxedReader { MicrosToTimestampReader(ColumnDescriptor desc) { super(desc); } @@ -351,8 +360,9 @@ private static class MicrosToTimestampReader extends ParquetValueReaders.Unboxed @Override public TimestampData read(TimestampData ignored) { long value = readLong(); - return TimestampData.fromInstant(Instant.ofEpochSecond(Math.floorDiv(value, 1000_000), - Math.floorMod(value, 1000_000) * 1000)); + return TimestampData.fromInstant( + Instant.ofEpochSecond( + Math.floorDiv(value, 1000_000), Math.floorMod(value, 1000_000) * 1000)); } @Override @@ -361,7 +371,8 @@ public long readLong() { } } - private static class MillisToTimestampReader extends ParquetValueReaders.UnboxedReader { + private static class MillisToTimestampReader + extends ParquetValueReaders.UnboxedReader { MillisToTimestampReader(ColumnDescriptor desc) { super(desc); } @@ -378,7 +389,8 @@ public long readLong() { } } - private static class MillisToTimestampTzReader extends ParquetValueReaders.UnboxedReader { + private static class MillisToTimestampTzReader + extends ParquetValueReaders.UnboxedReader { MillisToTimestampTzReader(ColumnDescriptor desc) { super(desc); } @@ -386,9 +398,8 @@ private static class MillisToTimestampTzReader extends ParquetValueReaders.Unbox @Override public TimestampData read(TimestampData ignored) { long millis = readLong(); - return TimestampData.fromLocalDateTime(Instant.ofEpochMilli(millis) - .atOffset(ZoneOffset.UTC) - .toLocalDateTime()); + return TimestampData.fromLocalDateTime( + Instant.ofEpochMilli(millis).atOffset(ZoneOffset.UTC).toLocalDateTime()); } @Override @@ -415,7 +426,8 @@ public StringData read(StringData ignored) { } } - private static class LossyMicrosToMillisTimeReader extends ParquetValueReaders.PrimitiveReader { + private static class LossyMicrosToMillisTimeReader + extends ParquetValueReaders.PrimitiveReader { LossyMicrosToMillisTimeReader(ColumnDescriptor desc) { super(desc); } @@ -438,7 +450,8 @@ public Integer read(Integer reuse) { } } - private static class ArrayReader extends ParquetValueReaders.RepeatedReader { + private static class ArrayReader + extends ParquetValueReaders.RepeatedReader { private int readPos = 0; private int writePos = 0; @@ -484,23 +497,29 @@ protected void addElement(ReusableArrayData reused, E element) { @Override protected ArrayData buildList(ReusableArrayData list) { - // Since ReusableArrayData is not accepted by Flink, use GenericArrayData temporarily to walk around it. + // Since ReusableArrayData is not accepted by Flink, use GenericArrayData temporarily to walk + // around it. // Revert this to use ReusableArrayData once it is fixed in Flink. 
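As an aside on the MicrosToTimestampTzReader and MicrosToTimestampReader hunks above (the diff comment resumes just below with the FLINK-25238 reference): the Math.floorDiv / Math.floorMod split is what keeps pre-epoch microsecond values correct, since plain / and % round toward zero for negative inputs. A minimal standalone sketch of the same conversion; the class and method names here are illustrative and not part of this patch:

import java.time.Instant;

class MicrosToInstantSketch {
  // Same split as the readers above: floorDiv rounds the epoch second toward negative
  // infinity, floorMod yields a non-negative microsecond remainder.
  static Instant microsToInstant(long micros) {
    return Instant.ofEpochSecond(Math.floorDiv(micros, 1_000_000L), Math.floorMod(micros, 1_000_000L) * 1000L);
  }

  public static void main(String[] args) {
    System.out.println(microsToInstant(1_000_001L)); // 1970-01-01T00:00:01.000001Z
    System.out.println(microsToInstant(-1L));        // 1969-12-31T23:59:59.999999Z
  }
}

In the hunks above, MicrosToTimestampReader wraps this Instant with TimestampData.fromInstant, while MicrosToTimestampTzReader first converts it to a LocalDateTime at UTC.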
// For your reference, https://issues.apache.org/jira/browse/FLINK-25238. return new GenericArrayData(Arrays.copyOf(list.values, writePos)); } } - private static class MapReader extends - ParquetValueReaders.RepeatedKeyValueReader { + private static class MapReader + extends ParquetValueReaders.RepeatedKeyValueReader { private int readPos = 0; private int writePos = 0; - private final ParquetValueReaders.ReusableEntry entry = new ParquetValueReaders.ReusableEntry<>(); - private final ParquetValueReaders.ReusableEntry nullEntry = new ParquetValueReaders.ReusableEntry<>(); + private final ParquetValueReaders.ReusableEntry entry = + new ParquetValueReaders.ReusableEntry<>(); + private final ParquetValueReaders.ReusableEntry nullEntry = + new ParquetValueReaders.ReusableEntry<>(); - MapReader(int definitionLevel, int repetitionLevel, - ParquetValueReader keyReader, ParquetValueReader valueReader) { + MapReader( + int definitionLevel, + int repetitionLevel, + ParquetValueReader keyReader, + ParquetValueReader valueReader) { super(definitionLevel, repetitionLevel, keyReader, valueReader); } @@ -549,7 +568,8 @@ protected MapData buildMap(ReusableMapData map) { } } - private static class RowDataReader extends ParquetValueReaders.StructReader { + private static class RowDataReader + extends ParquetValueReaders.StructReader { private final int numFields; RowDataReader(List types, List> readers) { diff --git a/flink/v1.13/flink/src/main/java/org/apache/iceberg/flink/data/FlinkParquetWriters.java b/flink/v1.13/flink/src/main/java/org/apache/iceberg/flink/data/FlinkParquetWriters.java index 6154ef1cfa2b..db4f1730a134 100644 --- a/flink/v1.13/flink/src/main/java/org/apache/iceberg/flink/data/FlinkParquetWriters.java +++ b/flink/v1.13/flink/src/main/java/org/apache/iceberg/flink/data/FlinkParquetWriters.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.flink.data; import java.util.Iterator; @@ -52,12 +51,12 @@ import org.apache.parquet.schema.Type; public class FlinkParquetWriters { - private FlinkParquetWriters() { - } + private FlinkParquetWriters() {} @SuppressWarnings("unchecked") public static ParquetValueWriter buildWriter(LogicalType schema, MessageType type) { - return (ParquetValueWriter) ParquetWithFlinkSchemaVisitor.visit(schema, type, new WriteBuilder(type)); + return (ParquetValueWriter) + ParquetWithFlinkSchemaVisitor.visit(schema, type, new WriteBuilder(type)); } private static class WriteBuilder extends ParquetWithFlinkSchemaVisitor> { @@ -68,13 +67,14 @@ private static class WriteBuilder extends ParquetWithFlinkSchemaVisitor message(RowType sStruct, MessageType message, List> fields) { + public ParquetValueWriter message( + RowType sStruct, MessageType message, List> fields) { return struct(sStruct, message.asGroupType(), fields); } @Override - public ParquetValueWriter struct(RowType sStruct, GroupType struct, - List> fieldWriters) { + public ParquetValueWriter struct( + RowType sStruct, GroupType struct, List> fieldWriters) { List fields = struct.getFields(); List flinkFields = sStruct.getFields(); List> writers = Lists.newArrayListWithExpectedSize(fieldWriters.size()); @@ -88,34 +88,42 @@ public ParquetValueWriter struct(RowType sStruct, GroupType struct, } @Override - public ParquetValueWriter list(ArrayType sArray, GroupType array, ParquetValueWriter elementWriter) { + public ParquetValueWriter list( + ArrayType sArray, GroupType array, ParquetValueWriter elementWriter) { GroupType repeated = array.getFields().get(0).asGroupType(); String[] repeatedPath = currentPath(); int repeatedD = type.getMaxDefinitionLevel(repeatedPath); int repeatedR = type.getMaxRepetitionLevel(repeatedPath); - return new ArrayDataWriter<>(repeatedD, repeatedR, + return new ArrayDataWriter<>( + repeatedD, + repeatedR, newOption(repeated.getType(0), elementWriter), sArray.getElementType()); } @Override - public ParquetValueWriter map(MapType sMap, GroupType map, - ParquetValueWriter keyWriter, ParquetValueWriter valueWriter) { + public ParquetValueWriter map( + MapType sMap, + GroupType map, + ParquetValueWriter keyWriter, + ParquetValueWriter valueWriter) { GroupType repeatedKeyValue = map.getFields().get(0).asGroupType(); String[] repeatedPath = currentPath(); int repeatedD = type.getMaxDefinitionLevel(repeatedPath); int repeatedR = type.getMaxRepetitionLevel(repeatedPath); - return new MapDataWriter<>(repeatedD, repeatedR, + return new MapDataWriter<>( + repeatedD, + repeatedR, newOption(repeatedKeyValue.getType(0), keyWriter), newOption(repeatedKeyValue.getType(1), valueWriter), - sMap.getKeyType(), sMap.getValueType()); + sMap.getKeyType(), + sMap.getValueType()); } - private ParquetValueWriter newOption(Type fieldType, ParquetValueWriter writer) { int maxD = type.getMaxDefinitionLevel(path(fieldType.getName())); return ParquetValueWriters.option(fieldType, maxD, writer); @@ -143,7 +151,8 @@ public ParquetValueWriter primitive(LogicalType fType, PrimitiveType primitiv case TIMESTAMP_MICROS: return timestamps(desc); case DECIMAL: - DecimalLogicalTypeAnnotation decimal = (DecimalLogicalTypeAnnotation) primitive.getLogicalTypeAnnotation(); + DecimalLogicalTypeAnnotation decimal = + (DecimalLogicalTypeAnnotation) primitive.getLogicalTypeAnnotation(); switch (primitive.getPrimitiveTypeName()) { case INT32: return decimalAsInteger(desc, decimal.getPrecision(), decimal.getScale()); @@ -184,7 +193,8 @@ public 
ParquetValueWriter primitive(LogicalType fType, PrimitiveType primitiv } } - private static ParquetValueWriters.PrimitiveWriter ints(LogicalType type, ColumnDescriptor desc) { + private static ParquetValueWriters.PrimitiveWriter ints( + LogicalType type, ColumnDescriptor desc) { if (type instanceof TinyIntType) { return ParquetValueWriters.tinyints(desc); } else if (type instanceof SmallIntType) { @@ -201,26 +211,33 @@ private static ParquetValueWriters.PrimitiveWriter timeMicros(ColumnDes return new TimeMicrosWriter(desc); } - private static ParquetValueWriters.PrimitiveWriter decimalAsInteger(ColumnDescriptor desc, - int precision, int scale) { - Preconditions.checkArgument(precision <= 9, "Cannot write decimal value as integer with precision larger than 9," + - " wrong precision %s", precision); + private static ParquetValueWriters.PrimitiveWriter decimalAsInteger( + ColumnDescriptor desc, int precision, int scale) { + Preconditions.checkArgument( + precision <= 9, + "Cannot write decimal value as integer with precision larger than 9," + + " wrong precision %s", + precision); return new IntegerDecimalWriter(desc, precision, scale); } - private static ParquetValueWriters.PrimitiveWriter decimalAsLong(ColumnDescriptor desc, - int precision, int scale) { - Preconditions.checkArgument(precision <= 18, "Cannot write decimal value as long with precision larger than 18, " + - " wrong precision %s", precision); + private static ParquetValueWriters.PrimitiveWriter decimalAsLong( + ColumnDescriptor desc, int precision, int scale) { + Preconditions.checkArgument( + precision <= 18, + "Cannot write decimal value as long with precision larger than 18, " + + " wrong precision %s", + precision); return new LongDecimalWriter(desc, precision, scale); } - private static ParquetValueWriters.PrimitiveWriter decimalAsFixed(ColumnDescriptor desc, - int precision, int scale) { + private static ParquetValueWriters.PrimitiveWriter decimalAsFixed( + ColumnDescriptor desc, int precision, int scale) { return new FixedDecimalWriter(desc, precision, scale); } - private static ParquetValueWriters.PrimitiveWriter timestamps(ColumnDescriptor desc) { + private static ParquetValueWriters.PrimitiveWriter timestamps( + ColumnDescriptor desc) { return new TimestampDataWriter(desc); } @@ -251,7 +268,8 @@ public void write(int repetitionLevel, Integer value) { } } - private static class IntegerDecimalWriter extends ParquetValueWriters.PrimitiveWriter { + private static class IntegerDecimalWriter + extends ParquetValueWriters.PrimitiveWriter { private final int precision; private final int scale; @@ -263,10 +281,18 @@ private IntegerDecimalWriter(ColumnDescriptor desc, int precision, int scale) { @Override public void write(int repetitionLevel, DecimalData decimal) { - Preconditions.checkArgument(decimal.scale() == scale, - "Cannot write value as decimal(%s,%s), wrong scale: %s", precision, scale, decimal); - Preconditions.checkArgument(decimal.precision() <= precision, - "Cannot write value as decimal(%s,%s), too large: %s", precision, scale, decimal); + Preconditions.checkArgument( + decimal.scale() == scale, + "Cannot write value as decimal(%s,%s), wrong scale: %s", + precision, + scale, + decimal); + Preconditions.checkArgument( + decimal.precision() <= precision, + "Cannot write value as decimal(%s,%s), too large: %s", + precision, + scale, + decimal); column.writeInteger(repetitionLevel, (int) decimal.toUnscaledLong()); } @@ -284,10 +310,18 @@ private LongDecimalWriter(ColumnDescriptor desc, int precision, int scale) { 
@Override public void write(int repetitionLevel, DecimalData decimal) { - Preconditions.checkArgument(decimal.scale() == scale, - "Cannot write value as decimal(%s,%s), wrong scale: %s", precision, scale, decimal); - Preconditions.checkArgument(decimal.precision() <= precision, - "Cannot write value as decimal(%s,%s), too large: %s", precision, scale, decimal); + Preconditions.checkArgument( + decimal.scale() == scale, + "Cannot write value as decimal(%s,%s), wrong scale: %s", + precision, + scale, + decimal); + Preconditions.checkArgument( + decimal.precision() <= precision, + "Cannot write value as decimal(%s,%s), too large: %s", + precision, + scale, + decimal); column.writeLong(repetitionLevel, decimal.toUnscaledLong()); } @@ -302,24 +336,28 @@ private FixedDecimalWriter(ColumnDescriptor desc, int precision, int scale) { super(desc); this.precision = precision; this.scale = scale; - this.bytes = ThreadLocal.withInitial(() -> new byte[TypeUtil.decimalRequiredBytes(precision)]); + this.bytes = + ThreadLocal.withInitial(() -> new byte[TypeUtil.decimalRequiredBytes(precision)]); } @Override public void write(int repetitionLevel, DecimalData decimal) { - byte[] binary = DecimalUtil.toReusedFixLengthBytes(precision, scale, decimal.toBigDecimal(), bytes.get()); + byte[] binary = + DecimalUtil.toReusedFixLengthBytes(precision, scale, decimal.toBigDecimal(), bytes.get()); column.writeBinary(repetitionLevel, Binary.fromReusedByteArray(binary)); } } - private static class TimestampDataWriter extends ParquetValueWriters.PrimitiveWriter { + private static class TimestampDataWriter + extends ParquetValueWriters.PrimitiveWriter { private TimestampDataWriter(ColumnDescriptor desc) { super(desc); } @Override public void write(int repetitionLevel, TimestampData value) { - column.writeLong(repetitionLevel, value.getMillisecond() * 1000 + value.getNanoOfMillisecond() / 1000); + column.writeLong( + repetitionLevel, value.getMillisecond() * 1000 + value.getNanoOfMillisecond() / 1000); } } @@ -337,8 +375,11 @@ public void write(int repetitionLevel, byte[] bytes) { private static class ArrayDataWriter extends ParquetValueWriters.RepeatedWriter { private final LogicalType elementType; - private ArrayDataWriter(int definitionLevel, int repetitionLevel, - ParquetValueWriter writer, LogicalType elementType) { + private ArrayDataWriter( + int definitionLevel, + int repetitionLevel, + ParquetValueWriter writer, + LogicalType elementType) { super(definitionLevel, repetitionLevel, writer); this.elementType = elementType; } @@ -381,13 +422,18 @@ public E next() { } } - private static class MapDataWriter extends ParquetValueWriters.RepeatedKeyValueWriter { + private static class MapDataWriter + extends ParquetValueWriters.RepeatedKeyValueWriter { private final LogicalType keyType; private final LogicalType valueType; - private MapDataWriter(int definitionLevel, int repetitionLevel, - ParquetValueWriter keyWriter, ParquetValueWriter valueWriter, - LogicalType keyType, LogicalType valueType) { + private MapDataWriter( + int definitionLevel, + int repetitionLevel, + ParquetValueWriter keyWriter, + ParquetValueWriter valueWriter, + LogicalType keyType, + LogicalType valueType) { super(definitionLevel, repetitionLevel, keyWriter, valueWriter); this.keyType = keyType; this.valueType = valueType; @@ -429,7 +475,9 @@ public Map.Entry next() { throw new NoSuchElementException(); } - entry.set((K) keyGetter.getElementOrNull(keys, index), (V) valueGetter.getElementOrNull(values, index)); + entry.set( + (K) 
keyGetter.getElementOrNull(keys, index), + (V) valueGetter.getElementOrNull(values, index)); index += 1; return entry; diff --git a/flink/v1.13/flink/src/main/java/org/apache/iceberg/flink/data/FlinkSchemaVisitor.java b/flink/v1.13/flink/src/main/java/org/apache/iceberg/flink/data/FlinkSchemaVisitor.java index 0909e1b53a85..ba4e1a7a7aec 100644 --- a/flink/v1.13/flink/src/main/java/org/apache/iceberg/flink/data/FlinkSchemaVisitor.java +++ b/flink/v1.13/flink/src/main/java/org/apache/iceberg/flink/data/FlinkSchemaVisitor.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.flink.data; import java.util.List; @@ -85,8 +84,8 @@ private static T visit(LogicalType flinkType, Type iType, FlinkSchemaVisitor } } - private static T visitRecord(LogicalType flinkType, Types.StructType struct, - FlinkSchemaVisitor visitor) { + private static T visitRecord( + LogicalType flinkType, Types.StructType struct, FlinkSchemaVisitor visitor) { Preconditions.checkArgument(flinkType instanceof RowType, "%s is not a RowType.", flinkType); RowType rowType = (RowType) flinkType; @@ -98,8 +97,8 @@ private static T visitRecord(LogicalType flinkType, Types.StructType struct, for (int i = 0; i < fieldSize; i++) { Types.NestedField iField = nestedFields.get(i); int fieldIndex = rowType.getFieldIndex(iField.name()); - Preconditions.checkArgument(fieldIndex >= 0, - "NestedField: %s is not found in flink RowType: %s", iField, rowType); + Preconditions.checkArgument( + fieldIndex >= 0, "NestedField: %s is not found in flink RowType: %s", iField, rowType); LogicalType fieldFlinkType = rowType.getTypeAt(fieldIndex); @@ -132,11 +131,9 @@ public T primitive(Type.PrimitiveType iPrimitive, LogicalType flinkPrimitive) { return null; } - public void beforeField(Types.NestedField field) { - } + public void beforeField(Types.NestedField field) {} - public void afterField(Types.NestedField field) { - } + public void afterField(Types.NestedField field) {} public void beforeListElement(Types.NestedField elementField) { beforeField(elementField); diff --git a/flink/v1.13/flink/src/main/java/org/apache/iceberg/flink/data/FlinkValueReaders.java b/flink/v1.13/flink/src/main/java/org/apache/iceberg/flink/data/FlinkValueReaders.java index 1b7a98f7dc8f..32f6c3a2ccfd 100644 --- a/flink/v1.13/flink/src/main/java/org/apache/iceberg/flink/data/FlinkValueReaders.java +++ b/flink/v1.13/flink/src/main/java/org/apache/iceberg/flink/data/FlinkValueReaders.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.flink.data; import java.io.IOException; @@ -44,8 +43,7 @@ public class FlinkValueReaders { - private FlinkValueReaders() { - } + private FlinkValueReaders() {} static ValueReader strings() { return StringReader.INSTANCE; @@ -71,7 +69,8 @@ static ValueReader timestampMicros() { return TimestampMicrosReader.INSTANCE; } - static ValueReader decimal(ValueReader unscaledReader, int precision, int scale) { + static ValueReader decimal( + ValueReader unscaledReader, int precision, int scale) { return new DecimalReader(unscaledReader, precision, scale); } @@ -79,8 +78,7 @@ static ValueReader array(ValueReader elementReader) { return new ArrayReader(elementReader); } - static ValueReader arrayMap(ValueReader keyReader, - ValueReader valueReader) { + static ValueReader arrayMap(ValueReader keyReader, ValueReader valueReader) { return new ArrayMapReader(keyReader, valueReader); } @@ -88,16 +86,15 @@ static ValueReader map(ValueReader keyReader, ValueReader valueRe return new MapReader(keyReader, valueReader); } - static ValueReader struct(List> readers, Types.StructType struct, - Map idToConstant) { + static ValueReader struct( + List> readers, Types.StructType struct, Map idToConstant) { return new StructReader(readers, struct, idToConstant); } private static class StringReader implements ValueReader { private static final StringReader INSTANCE = new StringReader(); - private StringReader() { - } + private StringReader() {} @Override public StringData read(Decoder decoder, Object reuse) throws IOException { @@ -143,7 +140,8 @@ private DecimalReader(ValueReader bytesReader, int precision, int scale) @Override public DecimalData read(Decoder decoder, Object reuse) throws IOException { byte[] bytes = bytesReader.read(decoder, null); - return DecimalData.fromBigDecimal(new BigDecimal(new BigInteger(bytes), scale), precision, scale); + return DecimalData.fromBigDecimal( + new BigDecimal(new BigInteger(bytes), scale), precision, scale); } } @@ -287,7 +285,8 @@ public MapData read(Decoder decoder, Object reuse) throws IOException { private static class StructReader extends ValueReaders.StructReader { private final int numFields; - private StructReader(List> readers, Types.StructType struct, Map idToConstant) { + private StructReader( + List> readers, Types.StructType struct, Map idToConstant) { super(readers, struct, idToConstant); this.numFields = readers.size(); } diff --git a/flink/v1.13/flink/src/main/java/org/apache/iceberg/flink/data/FlinkValueWriters.java b/flink/v1.13/flink/src/main/java/org/apache/iceberg/flink/data/FlinkValueWriters.java index 517d7d8e1527..4e86ecce28b5 100644 --- a/flink/v1.13/flink/src/main/java/org/apache/iceberg/flink/data/FlinkValueWriters.java +++ b/flink/v1.13/flink/src/main/java/org/apache/iceberg/flink/data/FlinkValueWriters.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.flink.data; import java.io.IOException; @@ -37,8 +36,7 @@ public class FlinkValueWriters { - private FlinkValueWriters() { - } + private FlinkValueWriters() {} static ValueWriter strings() { return StringWriter.INSTANCE; @@ -60,13 +58,19 @@ static ValueWriter array(ValueWriter elementWriter, LogicalTyp return new ArrayWriter<>(elementWriter, elementType); } - static ValueWriter arrayMap(ValueWriter keyWriter, LogicalType keyType, - ValueWriter valueWriter, LogicalType valueType) { + static ValueWriter arrayMap( + ValueWriter keyWriter, + LogicalType keyType, + ValueWriter valueWriter, + LogicalType valueType) { return new ArrayMapWriter<>(keyWriter, keyType, valueWriter, valueType); } - static ValueWriter map(ValueWriter keyWriter, LogicalType keyType, - ValueWriter valueWriter, LogicalType valueType) { + static ValueWriter map( + ValueWriter keyWriter, + LogicalType keyType, + ValueWriter valueWriter, + LogicalType valueType) { return new MapWriter<>(keyWriter, keyType, valueWriter, valueType); } @@ -77,8 +81,7 @@ static ValueWriter row(List> writers, List private static class StringWriter implements ValueWriter { private static final StringWriter INSTANCE = new StringWriter(); - private StringWriter() { - } + private StringWriter() {} @Override public void write(StringData s, Encoder encoder) throws IOException { @@ -95,12 +98,14 @@ private static class DecimalWriter implements ValueWriter { private DecimalWriter(int precision, int scale) { this.precision = precision; this.scale = scale; - this.bytes = ThreadLocal.withInitial(() -> new byte[TypeUtil.decimalRequiredBytes(precision)]); + this.bytes = + ThreadLocal.withInitial(() -> new byte[TypeUtil.decimalRequiredBytes(precision)]); } @Override public void write(DecimalData d, Encoder encoder) throws IOException { - encoder.writeFixed(DecimalUtil.toReusedFixLengthBytes(precision, scale, d.toBigDecimal(), bytes.get())); + encoder.writeFixed( + DecimalUtil.toReusedFixLengthBytes(precision, scale, d.toBigDecimal(), bytes.get())); } } @@ -118,7 +123,8 @@ private static class TimestampMicrosWriter implements ValueWriter @Override public void write(TimestampData timestampData, Encoder encoder) throws IOException { - long micros = timestampData.getMillisecond() * 1000 + timestampData.getNanoOfMillisecond() / 1000; + long micros = + timestampData.getMillisecond() * 1000 + timestampData.getNanoOfMillisecond() / 1000; encoder.writeLong(micros); } } @@ -152,8 +158,11 @@ private static class ArrayMapWriter implements ValueWriter { private final ArrayData.ElementGetter keyGetter; private final ArrayData.ElementGetter valueGetter; - private ArrayMapWriter(ValueWriter keyWriter, LogicalType keyType, - ValueWriter valueWriter, LogicalType valueType) { + private ArrayMapWriter( + ValueWriter keyWriter, + LogicalType keyType, + ValueWriter valueWriter, + LogicalType valueType) { this.keyWriter = keyWriter; this.keyGetter = ArrayData.createElementGetter(keyType); this.valueWriter = valueWriter; @@ -183,8 +192,11 @@ private static class MapWriter implements ValueWriter { private final ArrayData.ElementGetter keyGetter; private final ArrayData.ElementGetter valueGetter; - private MapWriter(ValueWriter keyWriter, LogicalType keyType, - ValueWriter valueWriter, LogicalType valueType) { + private MapWriter( + ValueWriter keyWriter, + LogicalType keyType, + ValueWriter valueWriter, + LogicalType valueType) { this.keyWriter = keyWriter; this.keyGetter = ArrayData.createElementGetter(keyType); this.valueWriter = valueWriter; diff 
--git a/flink/v1.13/flink/src/main/java/org/apache/iceberg/flink/data/ParquetWithFlinkSchemaVisitor.java b/flink/v1.13/flink/src/main/java/org/apache/iceberg/flink/data/ParquetWithFlinkSchemaVisitor.java index 541986f93889..33feb2e32118 100644 --- a/flink/v1.13/flink/src/main/java/org/apache/iceberg/flink/data/ParquetWithFlinkSchemaVisitor.java +++ b/flink/v1.13/flink/src/main/java/org/apache/iceberg/flink/data/ParquetWithFlinkSchemaVisitor.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.flink.data; import java.util.Deque; @@ -38,12 +37,15 @@ public class ParquetWithFlinkSchemaVisitor { private final Deque fieldNames = Lists.newLinkedList(); - public static T visit(LogicalType sType, Type type, ParquetWithFlinkSchemaVisitor visitor) { + public static T visit( + LogicalType sType, Type type, ParquetWithFlinkSchemaVisitor visitor) { Preconditions.checkArgument(sType != null, "Invalid DataType: null"); if (type instanceof MessageType) { - Preconditions.checkArgument(sType instanceof RowType, "Invalid struct: %s is not a struct", sType); + Preconditions.checkArgument( + sType instanceof RowType, "Invalid struct: %s is not a struct", sType); RowType struct = (RowType) sType; - return visitor.message(struct, (MessageType) type, visitFields(struct, type.asGroupType(), visitor)); + return visitor.message( + struct, (MessageType) type, visitFields(struct, type.asGroupType(), visitor)); } else if (type.isPrimitive()) { return visitor.primitive(sType, type.asPrimitiveType()); } else { @@ -53,21 +55,30 @@ public static T visit(LogicalType sType, Type type, ParquetWithFlinkSchemaVi if (annotation != null) { switch (annotation) { case LIST: - Preconditions.checkArgument(!group.isRepetition(Type.Repetition.REPEATED), - "Invalid list: top-level group is repeated: %s", group); - Preconditions.checkArgument(group.getFieldCount() == 1, - "Invalid list: does not contain single repeated field: %s", group); + Preconditions.checkArgument( + !group.isRepetition(Type.Repetition.REPEATED), + "Invalid list: top-level group is repeated: %s", + group); + Preconditions.checkArgument( + group.getFieldCount() == 1, + "Invalid list: does not contain single repeated field: %s", + group); GroupType repeatedElement = group.getFields().get(0).asGroupType(); - Preconditions.checkArgument(repeatedElement.isRepetition(Type.Repetition.REPEATED), + Preconditions.checkArgument( + repeatedElement.isRepetition(Type.Repetition.REPEATED), "Invalid list: inner group is not repeated"); - Preconditions.checkArgument(repeatedElement.getFieldCount() <= 1, - "Invalid list: repeated group is not a single field: %s", group); + Preconditions.checkArgument( + repeatedElement.getFieldCount() <= 1, + "Invalid list: repeated group is not a single field: %s", + group); - Preconditions.checkArgument(sType instanceof ArrayType, "Invalid list: %s is not an array", sType); + Preconditions.checkArgument( + sType instanceof ArrayType, "Invalid list: %s is not an array", sType); ArrayType array = (ArrayType) sType; - RowType.RowField element = new RowField( - "element", array.getElementType(), "element of " + array.asSummaryString()); + RowType.RowField element = + new RowField( + "element", array.getElementType(), "element of " + array.asSummaryString()); visitor.fieldNames.push(repeatedElement.getName()); try { @@ -83,22 +94,30 @@ public static T visit(LogicalType sType, Type type, ParquetWithFlinkSchemaVi } case MAP: - 
Preconditions.checkArgument(!group.isRepetition(Type.Repetition.REPEATED), - "Invalid map: top-level group is repeated: %s", group); - Preconditions.checkArgument(group.getFieldCount() == 1, - "Invalid map: does not contain single repeated field: %s", group); + Preconditions.checkArgument( + !group.isRepetition(Type.Repetition.REPEATED), + "Invalid map: top-level group is repeated: %s", + group); + Preconditions.checkArgument( + group.getFieldCount() == 1, + "Invalid map: does not contain single repeated field: %s", + group); GroupType repeatedKeyValue = group.getType(0).asGroupType(); - Preconditions.checkArgument(repeatedKeyValue.isRepetition(Type.Repetition.REPEATED), + Preconditions.checkArgument( + repeatedKeyValue.isRepetition(Type.Repetition.REPEATED), "Invalid map: inner group is not repeated"); - Preconditions.checkArgument(repeatedKeyValue.getFieldCount() <= 2, + Preconditions.checkArgument( + repeatedKeyValue.getFieldCount() <= 2, "Invalid map: repeated group does not have 2 fields"); - Preconditions.checkArgument(sType instanceof MapType, "Invalid map: %s is not a map", sType); + Preconditions.checkArgument( + sType instanceof MapType, "Invalid map: %s is not a map", sType); MapType map = (MapType) sType; - RowField keyField = new RowField("key", map.getKeyType(), "key of " + map.asSummaryString()); - RowField valueField = new RowField( - "value", map.getValueType(), "value of " + map.asSummaryString()); + RowField keyField = + new RowField("key", map.getKeyType(), "key of " + map.asSummaryString()); + RowField valueField = + new RowField("value", map.getValueType(), "value of " + map.asSummaryString()); visitor.fieldNames.push(repeatedKeyValue.getName()); try { @@ -134,13 +153,15 @@ public static T visit(LogicalType sType, Type type, ParquetWithFlinkSchemaVi default: } } - Preconditions.checkArgument(sType instanceof RowType, "Invalid struct: %s is not a struct", sType); + Preconditions.checkArgument( + sType instanceof RowType, "Invalid struct: %s is not a struct", sType); RowType struct = (RowType) sType; return visitor.struct(struct, group, visitFields(struct, group, visitor)); } } - private static T visitField(RowType.RowField sField, Type field, ParquetWithFlinkSchemaVisitor visitor) { + private static T visitField( + RowType.RowField sField, Type field, ParquetWithFlinkSchemaVisitor visitor) { visitor.fieldNames.push(field.getName()); try { return visit(sField.getType(), field, visitor); @@ -149,17 +170,20 @@ private static T visitField(RowType.RowField sField, Type field, ParquetWith } } - private static List visitFields(RowType struct, GroupType group, - ParquetWithFlinkSchemaVisitor visitor) { + private static List visitFields( + RowType struct, GroupType group, ParquetWithFlinkSchemaVisitor visitor) { List sFields = struct.getFields(); - Preconditions.checkArgument(sFields.size() == group.getFieldCount(), - "Structs do not match: %s and %s", struct, group); + Preconditions.checkArgument( + sFields.size() == group.getFieldCount(), "Structs do not match: %s and %s", struct, group); List results = Lists.newArrayListWithExpectedSize(group.getFieldCount()); for (int i = 0; i < sFields.size(); i += 1) { Type field = group.getFields().get(i); RowType.RowField sField = sFields.get(i); - Preconditions.checkArgument(field.getName().equals(AvroSchemaUtil.makeCompatibleName(sField.getName())), - "Structs do not match: field %s != %s", field.getName(), sField.getName()); + Preconditions.checkArgument( + field.getName().equals(AvroSchemaUtil.makeCompatibleName(sField.getName())), 
+ "Structs do not match: field %s != %s", + field.getName(), + sField.getName()); results.add(visitField(sField, field, visitor)); } @@ -195,5 +219,4 @@ protected String[] path(String name) { list.add(name); return list.toArray(new String[0]); } - } diff --git a/flink/v1.13/flink/src/main/java/org/apache/iceberg/flink/data/RowDataProjection.java b/flink/v1.13/flink/src/main/java/org/apache/iceberg/flink/data/RowDataProjection.java index 6334a00fd0d7..e41bae686d1e 100644 --- a/flink/v1.13/flink/src/main/java/org/apache/iceberg/flink/data/RowDataProjection.java +++ b/flink/v1.13/flink/src/main/java/org/apache/iceberg/flink/data/RowDataProjection.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.flink.data; import java.util.Map; @@ -38,35 +37,40 @@ public class RowDataProjection implements RowData { /** * Creates a projecting wrapper for {@link RowData} rows. - *
<p>
- * This projection will not project the nested children types of repeated types like lists and maps. + * + * <p>
This projection will not project the nested children types of repeated types like lists and + * maps. * * @param schema schema of rows wrapped by this projection * @param projectedSchema result schema of the projected rows * @return a wrapper to project rows */ public static RowDataProjection create(Schema schema, Schema projectedSchema) { - return RowDataProjection.create(FlinkSchemaUtil.convert(schema), schema.asStruct(), projectedSchema.asStruct()); + return RowDataProjection.create( + FlinkSchemaUtil.convert(schema), schema.asStruct(), projectedSchema.asStruct()); } /** * Creates a projecting wrapper for {@link RowData} rows. - *
<p>
- * This projection will not project the nested children types of repeated types like lists and maps. + * + * <p>
This projection will not project the nested children types of repeated types like lists and + * maps. * * @param rowType flink row type of rows wrapped by this projection * @param schema schema of rows wrapped by this projection * @param projectedSchema result schema of the projected rows * @return a wrapper to project rows */ - public static RowDataProjection create(RowType rowType, Types.StructType schema, Types.StructType projectedSchema) { + public static RowDataProjection create( + RowType rowType, Types.StructType schema, Types.StructType projectedSchema) { return new RowDataProjection(rowType, schema, projectedSchema); } private final RowData.FieldGetter[] getters; private RowData rowData; - private RowDataProjection(RowType rowType, Types.StructType rowStruct, Types.StructType projectType) { + private RowDataProjection( + RowType rowType, Types.StructType rowStruct, Types.StructType projectType) { Map fieldIdToPosition = Maps.newHashMap(); for (int i = 0; i < rowStruct.fields().size(); i++) { fieldIdToPosition.put(rowStruct.fields().get(i).fieldId(), i); @@ -77,27 +81,34 @@ private RowDataProjection(RowType rowType, Types.StructType rowStruct, Types.Str Types.NestedField projectField = projectType.fields().get(i); Types.NestedField rowField = rowStruct.field(projectField.fieldId()); - Preconditions.checkNotNull(rowField, - "Cannot locate the project field <%s> in the iceberg struct <%s>", projectField, rowStruct); + Preconditions.checkNotNull( + rowField, + "Cannot locate the project field <%s> in the iceberg struct <%s>", + projectField, + rowStruct); - getters[i] = createFieldGetter(rowType, fieldIdToPosition.get(projectField.fieldId()), rowField, projectField); + getters[i] = + createFieldGetter( + rowType, fieldIdToPosition.get(projectField.fieldId()), rowField, projectField); } } - private static RowData.FieldGetter createFieldGetter(RowType rowType, - int position, - Types.NestedField rowField, - Types.NestedField projectField) { - Preconditions.checkArgument(rowField.type().typeId() == projectField.type().typeId(), - "Different iceberg type between row field <%s> and project field <%s>", rowField, projectField); + private static RowData.FieldGetter createFieldGetter( + RowType rowType, int position, Types.NestedField rowField, Types.NestedField projectField) { + Preconditions.checkArgument( + rowField.type().typeId() == projectField.type().typeId(), + "Different iceberg type between row field <%s> and project field <%s>", + rowField, + projectField); switch (projectField.type().typeId()) { case STRUCT: RowType nestedRowType = (RowType) rowType.getTypeAt(position); return row -> { - RowData nestedRow = row.isNullAt(position) ? null : row.getRow(position, nestedRowType.getFieldCount()); - return RowDataProjection - .create(nestedRowType, rowField.type().asStructType(), projectField.type().asStructType()) + RowData nestedRow = + row.isNullAt(position) ? 
null : row.getRow(position, nestedRowType.getFieldCount()); + return RowDataProjection.create( + nestedRowType, rowField.type().asStructType(), projectField.type().asStructType()) .wrap(nestedRow); }; @@ -105,13 +116,17 @@ private static RowData.FieldGetter createFieldGetter(RowType rowType, Types.MapType projectedMap = projectField.type().asMapType(); Types.MapType originalMap = rowField.type().asMapType(); - boolean keyProjectable = !projectedMap.keyType().isNestedType() || - projectedMap.keyType().equals(originalMap.keyType()); - boolean valueProjectable = !projectedMap.valueType().isNestedType() || - projectedMap.valueType().equals(originalMap.valueType()); - Preconditions.checkArgument(keyProjectable && valueProjectable, + boolean keyProjectable = + !projectedMap.keyType().isNestedType() + || projectedMap.keyType().equals(originalMap.keyType()); + boolean valueProjectable = + !projectedMap.valueType().isNestedType() + || projectedMap.valueType().equals(originalMap.valueType()); + Preconditions.checkArgument( + keyProjectable && valueProjectable, "Cannot project a partial map key or value with non-primitive type. Trying to project <%s> out of <%s>", - projectField, rowField); + projectField, + rowField); return RowData.createFieldGetter(rowType.getTypeAt(position), position); @@ -119,11 +134,14 @@ private static RowData.FieldGetter createFieldGetter(RowType rowType, Types.ListType projectedList = projectField.type().asListType(); Types.ListType originalList = rowField.type().asListType(); - boolean elementProjectable = !projectedList.elementType().isNestedType() || - projectedList.elementType().equals(originalList.elementType()); - Preconditions.checkArgument(elementProjectable, + boolean elementProjectable = + !projectedList.elementType().isNestedType() + || projectedList.elementType().equals(originalList.elementType()); + Preconditions.checkArgument( + elementProjectable, "Cannot project a partial list element with non-primitive type. Trying to project <%s> out of <%s>", - projectField, rowField); + projectField, + rowField); return RowData.createFieldGetter(rowType.getTypeAt(position), position); diff --git a/flink/v1.13/flink/src/main/java/org/apache/iceberg/flink/data/RowDataUtil.java b/flink/v1.13/flink/src/main/java/org/apache/iceberg/flink/data/RowDataUtil.java index 931880fc360c..c5cb51b7eae4 100644 --- a/flink/v1.13/flink/src/main/java/org/apache/iceberg/flink/data/RowDataUtil.java +++ b/flink/v1.13/flink/src/main/java/org/apache/iceberg/flink/data/RowDataUtil.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.flink.data; import java.math.BigDecimal; @@ -38,9 +37,7 @@ public class RowDataUtil { - private RowDataUtil() { - - } + private RowDataUtil() {} public static Object convertConstant(Type type, Object value) { if (value == null) { @@ -76,12 +73,13 @@ public static Object convertConstant(Type type, Object value) { } /** - * Similar to the private {@link RowDataSerializer#copyRowData(RowData, RowData)} method. - * This skips the check the arity of rowType and from, - * because the from RowData may contains additional column for position deletes. - * Using {@link RowDataSerializer#copy(RowData, RowData)} will fail the arity check. + * Similar to the private {@link RowDataSerializer#copyRowData(RowData, RowData)} method. This + * skips the check the arity of rowType and from, because the from RowData may contains additional + * column for position deletes. 
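To make the arity point in this javadoc concrete: clone copies only as many fields as rowType declares, so a source row that carries one extra trailing column (as a position-delete row can) is still accepted. Below is a rough, self-contained sketch of that prefix copy, assuming Flink's GenericRowData and RowData.FieldGetter; unlike the real clone, which deep-copies each field through its TypeSerializer, this sketch only transfers references, and all names in it are illustrative:

import org.apache.flink.table.data.GenericRowData;
import org.apache.flink.table.data.RowData;
import org.apache.flink.table.types.logical.RowType;

final class PrefixRowCopySketch {
  private PrefixRowCopySketch() {}

  // Copies the first rowType.getFieldCount() fields of 'from' and ignores any
  // extra trailing columns the source row may carry.
  static RowData copyPrefix(RowData from, RowType rowType) {
    GenericRowData ret = new GenericRowData(rowType.getFieldCount());
    ret.setRowKind(from.getRowKind());
    for (int i = 0; i < rowType.getFieldCount(); i++) {
      ret.setField(i, RowData.createFieldGetter(rowType.getTypeAt(i), i).getFieldOrNull(from));
    }
    return ret;
  }
}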
Using {@link RowDataSerializer#copy(RowData, RowData)} will fail + * the arity check. */ - public static RowData clone(RowData from, RowData reuse, RowType rowType, TypeSerializer[] fieldSerializers) { + public static RowData clone( + RowData from, RowData reuse, RowType rowType, TypeSerializer[] fieldSerializers) { GenericRowData ret; if (reuse instanceof GenericRowData) { ret = (GenericRowData) reuse; @@ -99,5 +97,4 @@ public static RowData clone(RowData from, RowData reuse, RowType rowType, TypeSe } return ret; } - } diff --git a/flink/v1.13/flink/src/main/java/org/apache/iceberg/flink/sink/BaseDeltaTaskWriter.java b/flink/v1.13/flink/src/main/java/org/apache/iceberg/flink/sink/BaseDeltaTaskWriter.java index 5d9064cdb07e..9e0bc69bd54e 100644 --- a/flink/v1.13/flink/src/main/java/org/apache/iceberg/flink/sink/BaseDeltaTaskWriter.java +++ b/flink/v1.13/flink/src/main/java/org/apache/iceberg/flink/sink/BaseDeltaTaskWriter.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.flink.sink; import java.io.IOException; @@ -47,22 +46,24 @@ abstract class BaseDeltaTaskWriter extends BaseTaskWriter { private final RowDataProjection keyProjection; private final boolean upsert; - BaseDeltaTaskWriter(PartitionSpec spec, - FileFormat format, - FileAppenderFactory appenderFactory, - OutputFileFactory fileFactory, - FileIO io, - long targetFileSize, - Schema schema, - RowType flinkSchema, - List equalityFieldIds, - boolean upsert) { + BaseDeltaTaskWriter( + PartitionSpec spec, + FileFormat format, + FileAppenderFactory appenderFactory, + OutputFileFactory fileFactory, + FileIO io, + long targetFileSize, + Schema schema, + RowType flinkSchema, + List equalityFieldIds, + boolean upsert) { super(spec, format, appenderFactory, fileFactory, io, targetFileSize); this.schema = schema; this.deleteSchema = TypeUtil.select(schema, Sets.newHashSet(equalityFieldIds)); this.wrapper = new RowDataWrapper(flinkSchema, schema.asStruct()); this.upsert = upsert; - this.keyWrapper = new RowDataWrapper(FlinkSchemaUtil.convert(deleteSchema), deleteSchema.asStruct()); + this.keyWrapper = + new RowDataWrapper(FlinkSchemaUtil.convert(deleteSchema), deleteSchema.asStruct()); this.keyProjection = RowDataProjection.create(schema, deleteSchema); } @@ -87,7 +88,8 @@ public void write(RowData row) throws IOException { case UPDATE_BEFORE: if (upsert) { - break; // UPDATE_BEFORE is not necessary for UPSERT, we do nothing to prevent delete one row twice + break; // UPDATE_BEFORE is not necessary for UPSERT, we do nothing to prevent delete one + // row twice } writer.delete(row); break; diff --git a/flink/v1.13/flink/src/main/java/org/apache/iceberg/flink/sink/DeltaManifests.java b/flink/v1.13/flink/src/main/java/org/apache/iceberg/flink/sink/DeltaManifests.java index 866b785d7e1e..036970c06d5b 100644 --- a/flink/v1.13/flink/src/main/java/org/apache/iceberg/flink/sink/DeltaManifests.java +++ b/flink/v1.13/flink/src/main/java/org/apache/iceberg/flink/sink/DeltaManifests.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.flink.sink; import java.util.List; @@ -36,7 +35,8 @@ class DeltaManifests { this(dataManifest, deleteManifest, EMPTY_REF_DATA_FILES); } - DeltaManifests(ManifestFile dataManifest, ManifestFile deleteManifest, CharSequence[] referencedDataFiles) { + DeltaManifests( + ManifestFile dataManifest, ManifestFile deleteManifest, CharSequence[] referencedDataFiles) { Preconditions.checkNotNull(referencedDataFiles, "Referenced data files shouldn't be null."); this.dataManifest = dataManifest; diff --git a/flink/v1.13/flink/src/main/java/org/apache/iceberg/flink/sink/DeltaManifestsSerializer.java b/flink/v1.13/flink/src/main/java/org/apache/iceberg/flink/sink/DeltaManifestsSerializer.java index 859f97940116..c4d6e713bb73 100644 --- a/flink/v1.13/flink/src/main/java/org/apache/iceberg/flink/sink/DeltaManifestsSerializer.java +++ b/flink/v1.13/flink/src/main/java/org/apache/iceberg/flink/sink/DeltaManifestsSerializer.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.flink.sink; import java.io.ByteArrayInputStream; @@ -43,7 +42,8 @@ public int getVersion() { @Override public byte[] serialize(DeltaManifests deltaManifests) throws IOException { - Preconditions.checkNotNull(deltaManifests, "DeltaManifests to be serialized should not be null"); + Preconditions.checkNotNull( + deltaManifests, "DeltaManifests to be serialized should not be null"); ByteArrayOutputStream binaryOut = new ByteArrayOutputStream(); DataOutputStream out = new DataOutputStream(binaryOut); diff --git a/flink/v1.13/flink/src/main/java/org/apache/iceberg/flink/sink/EqualityFieldKeySelector.java b/flink/v1.13/flink/src/main/java/org/apache/iceberg/flink/sink/EqualityFieldKeySelector.java index 56689567a1d2..18b269d6c3e9 100644 --- a/flink/v1.13/flink/src/main/java/org/apache/iceberg/flink/sink/EqualityFieldKeySelector.java +++ b/flink/v1.13/flink/src/main/java/org/apache/iceberg/flink/sink/EqualityFieldKeySelector.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.flink.sink; import java.util.List; @@ -31,8 +30,8 @@ import org.apache.iceberg.util.StructProjection; /** - * Create a {@link KeySelector} to shuffle by equality fields, to ensure same equality fields record will be emitted to - * same writer in order. + * Create a {@link KeySelector} to shuffle by equality fields, to ensure same equality fields record + * will be emitted to same writer in order. */ class EqualityFieldKeySelector implements KeySelector { @@ -51,8 +50,8 @@ class EqualityFieldKeySelector implements KeySelector { } /** - * Construct the {@link RowDataWrapper} lazily here because few members in it are not serializable. In this way, we - * don't have to serialize them with forcing. + * Construct the {@link RowDataWrapper} lazily here because few members in it are not + * serializable. In this way, we don't have to serialize them with forcing. */ protected RowDataWrapper lazyRowDataWrapper() { if (rowDataWrapper == null) { @@ -61,9 +60,7 @@ protected RowDataWrapper lazyRowDataWrapper() { return rowDataWrapper; } - /** - * Construct the {@link StructProjection} lazily because it is not serializable. - */ + /** Construct the {@link StructProjection} lazily because it is not serializable. 
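The pattern these javadoc one-liners describe recurs throughout EqualityFieldKeySelector: the selector itself must be serializable so it can ship with the Flink job graph, so helpers that are not serializable are built on first use on the task side instead. A stripped-down sketch of that idiom, assuming Flink's KeySelector interface; the class, type parameter, and field names here are illustrative:

import org.apache.flink.api.java.functions.KeySelector;
import org.apache.flink.table.data.RowData;

abstract class LazyHelperKeySelector<H> implements KeySelector<RowData, Integer> {
  // Never shipped with the serialized selector; each task instance builds its own copy.
  private transient H helper;

  protected abstract H newHelper();

  protected H lazyHelper() {
    if (helper == null) {
      helper = newHelper();
    }
    return helper;
  }
}

EqualityFieldKeySelector applies the same idiom to its RowDataWrapper, StructProjection, and StructLikeWrapper helpers, as the surrounding hunks show.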
*/ protected StructProjection lazyStructProjection() { if (structProjection == null) { structProjection = StructProjection.create(schema, deleteSchema); @@ -71,9 +68,7 @@ protected StructProjection lazyStructProjection() { return structProjection; } - /** - * Construct the {@link StructLikeWrapper} lazily because it is not serializable. - */ + /** Construct the {@link StructLikeWrapper} lazily because it is not serializable. */ protected StructLikeWrapper lazyStructLikeWrapper() { if (structLikeWrapper == null) { structLikeWrapper = StructLikeWrapper.forType(deleteSchema.asStruct()); diff --git a/flink/v1.13/flink/src/main/java/org/apache/iceberg/flink/sink/FlinkAppenderFactory.java b/flink/v1.13/flink/src/main/java/org/apache/iceberg/flink/sink/FlinkAppenderFactory.java index ade5c28837ec..b5d08b46be58 100644 --- a/flink/v1.13/flink/src/main/java/org/apache/iceberg/flink/sink/FlinkAppenderFactory.java +++ b/flink/v1.13/flink/src/main/java/org/apache/iceberg/flink/sink/FlinkAppenderFactory.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.flink.sink; import java.io.IOException; @@ -60,13 +59,19 @@ public class FlinkAppenderFactory implements FileAppenderFactory, Seria private RowType eqDeleteFlinkSchema = null; private RowType posDeleteFlinkSchema = null; - public FlinkAppenderFactory(Schema schema, RowType flinkSchema, Map props, PartitionSpec spec) { + public FlinkAppenderFactory( + Schema schema, RowType flinkSchema, Map props, PartitionSpec spec) { this(schema, flinkSchema, props, spec, null, null, null); } - public FlinkAppenderFactory(Schema schema, RowType flinkSchema, Map props, - PartitionSpec spec, int[] equalityFieldIds, - Schema eqDeleteRowSchema, Schema posDeleteRowSchema) { + public FlinkAppenderFactory( + Schema schema, + RowType flinkSchema, + Map props, + PartitionSpec spec, + int[] equalityFieldIds, + Schema eqDeleteRowSchema, + Schema posDeleteRowSchema) { this.schema = schema; this.flinkSchema = flinkSchema; this.props = props; @@ -108,7 +113,8 @@ public FileAppender newAppender(OutputFile outputFile, FileFormat forma case ORC: return ORC.write(outputFile) - .createWriterFunc((iSchema, typDesc) -> FlinkOrcWriter.buildWriter(flinkSchema, iSchema)) + .createWriterFunc( + (iSchema, typDesc) -> FlinkOrcWriter.buildWriter(flinkSchema, iSchema)) .setAll(props) .metricsConfig(metricsConfig) .schema(schema) @@ -133,18 +139,25 @@ public FileAppender newAppender(OutputFile outputFile, FileFormat forma } @Override - public DataWriter newDataWriter(EncryptedOutputFile file, FileFormat format, StructLike partition) { + public DataWriter newDataWriter( + EncryptedOutputFile file, FileFormat format, StructLike partition) { return new DataWriter<>( - newAppender(file.encryptingOutputFile(), format), format, - file.encryptingOutputFile().location(), spec, partition, file.keyMetadata()); + newAppender(file.encryptingOutputFile(), format), + format, + file.encryptingOutputFile().location(), + spec, + partition, + file.keyMetadata()); } @Override - public EqualityDeleteWriter newEqDeleteWriter(EncryptedOutputFile outputFile, FileFormat format, - StructLike partition) { - Preconditions.checkState(equalityFieldIds != null && equalityFieldIds.length > 0, + public EqualityDeleteWriter newEqDeleteWriter( + EncryptedOutputFile outputFile, FileFormat format, StructLike partition) { + Preconditions.checkState( + equalityFieldIds != null && equalityFieldIds.length > 0, "Equality field ids shouldn't be null or empty 
when creating equality-delete writer"); - Preconditions.checkNotNull(eqDeleteRowSchema, + Preconditions.checkNotNull( + eqDeleteRowSchema, "Equality delete row schema shouldn't be null when creating equality-delete writer"); MetricsConfig metricsConfig = MetricsConfig.fromProperties(props); @@ -164,7 +177,8 @@ public EqualityDeleteWriter newEqDeleteWriter(EncryptedOutputFile outpu case ORC: return ORC.writeDeletes(outputFile.encryptingOutputFile()) - .createWriterFunc((iSchema, typDesc) -> FlinkOrcWriter.buildWriter(flinkSchema, iSchema)) + .createWriterFunc( + (iSchema, typDesc) -> FlinkOrcWriter.buildWriter(flinkSchema, iSchema)) .withPartition(partition) .overwrite() .setAll(props) @@ -176,7 +190,8 @@ public EqualityDeleteWriter newEqDeleteWriter(EncryptedOutputFile outpu case PARQUET: return Parquet.writeDeletes(outputFile.encryptingOutputFile()) - .createWriterFunc(msgType -> FlinkParquetWriters.buildWriter(lazyEqDeleteFlinkSchema(), msgType)) + .createWriterFunc( + msgType -> FlinkParquetWriters.buildWriter(lazyEqDeleteFlinkSchema(), msgType)) .withPartition(partition) .overwrite() .setAll(props) @@ -197,8 +212,8 @@ public EqualityDeleteWriter newEqDeleteWriter(EncryptedOutputFile outpu } @Override - public PositionDeleteWriter newPosDeleteWriter(EncryptedOutputFile outputFile, FileFormat format, - StructLike partition) { + public PositionDeleteWriter newPosDeleteWriter( + EncryptedOutputFile outputFile, FileFormat format, StructLike partition) { MetricsConfig metricsConfig = MetricsConfig.fromProperties(props); try { switch (format) { @@ -214,9 +229,11 @@ public PositionDeleteWriter newPosDeleteWriter(EncryptedOutputFile outp .buildPositionWriter(); case ORC: - RowType orcPosDeleteSchema = FlinkSchemaUtil.convert(DeleteSchemaUtil.posDeleteSchema(posDeleteRowSchema)); + RowType orcPosDeleteSchema = + FlinkSchemaUtil.convert(DeleteSchemaUtil.posDeleteSchema(posDeleteRowSchema)); return ORC.writeDeletes(outputFile.encryptingOutputFile()) - .createWriterFunc((iSchema, typDesc) -> FlinkOrcWriter.buildWriter(orcPosDeleteSchema, iSchema)) + .createWriterFunc( + (iSchema, typDesc) -> FlinkOrcWriter.buildWriter(orcPosDeleteSchema, iSchema)) .withPartition(partition) .overwrite() .setAll(props) @@ -228,9 +245,11 @@ public PositionDeleteWriter newPosDeleteWriter(EncryptedOutputFile outp .buildPositionWriter(); case PARQUET: - RowType flinkPosDeleteSchema = FlinkSchemaUtil.convert(DeleteSchemaUtil.posDeleteSchema(posDeleteRowSchema)); + RowType flinkPosDeleteSchema = + FlinkSchemaUtil.convert(DeleteSchemaUtil.posDeleteSchema(posDeleteRowSchema)); return Parquet.writeDeletes(outputFile.encryptingOutputFile()) - .createWriterFunc(msgType -> FlinkParquetWriters.buildWriter(flinkPosDeleteSchema, msgType)) + .createWriterFunc( + msgType -> FlinkParquetWriters.buildWriter(flinkPosDeleteSchema, msgType)) .withPartition(partition) .overwrite() .setAll(props) @@ -242,7 +261,8 @@ public PositionDeleteWriter newPosDeleteWriter(EncryptedOutputFile outp .buildPositionWriter(); default: - throw new UnsupportedOperationException("Cannot write pos-deletes for unsupported file format: " + format); + throw new UnsupportedOperationException( + "Cannot write pos-deletes for unsupported file format: " + format); } } catch (IOException e) { throw new UncheckedIOException(e); diff --git a/flink/v1.13/flink/src/main/java/org/apache/iceberg/flink/sink/FlinkFileWriterFactory.java b/flink/v1.13/flink/src/main/java/org/apache/iceberg/flink/sink/FlinkFileWriterFactory.java index 55a9539c78d1..5872fed36d65 100644 --- 
a/flink/v1.13/flink/src/main/java/org/apache/iceberg/flink/sink/FlinkFileWriterFactory.java +++ b/flink/v1.13/flink/src/main/java/org/apache/iceberg/flink/sink/FlinkFileWriterFactory.java @@ -16,9 +16,13 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.flink.sink; +import static org.apache.iceberg.MetadataColumns.DELETE_FILE_ROW_FIELD_NAME; +import static org.apache.iceberg.TableProperties.DEFAULT_FILE_FORMAT; +import static org.apache.iceberg.TableProperties.DEFAULT_FILE_FORMAT_DEFAULT; +import static org.apache.iceberg.TableProperties.DELETE_DEFAULT_FILE_FORMAT; + import java.io.Serializable; import java.util.Locale; import java.util.Map; @@ -40,24 +44,35 @@ import org.apache.iceberg.parquet.Parquet; import org.apache.iceberg.relocated.com.google.common.base.Preconditions; -import static org.apache.iceberg.MetadataColumns.DELETE_FILE_ROW_FIELD_NAME; -import static org.apache.iceberg.TableProperties.DEFAULT_FILE_FORMAT; -import static org.apache.iceberg.TableProperties.DEFAULT_FILE_FORMAT_DEFAULT; -import static org.apache.iceberg.TableProperties.DELETE_DEFAULT_FILE_FORMAT; - class FlinkFileWriterFactory extends BaseFileWriterFactory implements Serializable { private RowType dataFlinkType; private RowType equalityDeleteFlinkType; private RowType positionDeleteFlinkType; - FlinkFileWriterFactory(Table table, FileFormat dataFileFormat, Schema dataSchema, RowType dataFlinkType, - SortOrder dataSortOrder, FileFormat deleteFileFormat, - int[] equalityFieldIds, Schema equalityDeleteRowSchema, RowType equalityDeleteFlinkType, - SortOrder equalityDeleteSortOrder, Schema positionDeleteRowSchema, - RowType positionDeleteFlinkType) { - - super(table, dataFileFormat, dataSchema, dataSortOrder, deleteFileFormat, equalityFieldIds, - equalityDeleteRowSchema, equalityDeleteSortOrder, positionDeleteRowSchema); + FlinkFileWriterFactory( + Table table, + FileFormat dataFileFormat, + Schema dataSchema, + RowType dataFlinkType, + SortOrder dataSortOrder, + FileFormat deleteFileFormat, + int[] equalityFieldIds, + Schema equalityDeleteRowSchema, + RowType equalityDeleteFlinkType, + SortOrder equalityDeleteSortOrder, + Schema positionDeleteRowSchema, + RowType positionDeleteFlinkType) { + + super( + table, + dataFileFormat, + dataSchema, + dataSortOrder, + deleteFileFormat, + equalityFieldIds, + equalityDeleteRowSchema, + equalityDeleteSortOrder, + positionDeleteRowSchema); this.dataFlinkType = dataFlinkType; this.equalityDeleteFlinkType = equalityDeleteFlinkType; @@ -83,7 +98,8 @@ protected void configurePositionDelete(Avro.DeleteWriteBuilder builder) { int rowFieldIndex = positionDeleteFlinkType().getFieldIndex(DELETE_FILE_ROW_FIELD_NAME); if (rowFieldIndex >= 0) { // FlinkAvroWriter accepts just the Flink type of the row ignoring the path and pos - RowType positionDeleteRowFlinkType = (RowType) positionDeleteFlinkType().getTypeAt(rowFieldIndex); + RowType positionDeleteRowFlinkType = + (RowType) positionDeleteFlinkType().getTypeAt(rowFieldIndex); builder.createWriterFunc(ignored -> new FlinkAvroWriter(positionDeleteRowFlinkType)); } } @@ -95,28 +111,33 @@ protected void configureDataWrite(Parquet.DataWriteBuilder builder) { @Override protected void configureEqualityDelete(Parquet.DeleteWriteBuilder builder) { - builder.createWriterFunc(msgType -> FlinkParquetWriters.buildWriter(equalityDeleteFlinkType(), msgType)); + builder.createWriterFunc( + msgType -> FlinkParquetWriters.buildWriter(equalityDeleteFlinkType(), msgType)); } @Override protected 
void configurePositionDelete(Parquet.DeleteWriteBuilder builder) { - builder.createWriterFunc(msgType -> FlinkParquetWriters.buildWriter(positionDeleteFlinkType(), msgType)); + builder.createWriterFunc( + msgType -> FlinkParquetWriters.buildWriter(positionDeleteFlinkType(), msgType)); builder.transformPaths(path -> StringData.fromString(path.toString())); } @Override protected void configureDataWrite(ORC.DataWriteBuilder builder) { - builder.createWriterFunc((iSchema, typDesc) -> FlinkOrcWriter.buildWriter(dataFlinkType(), iSchema)); + builder.createWriterFunc( + (iSchema, typDesc) -> FlinkOrcWriter.buildWriter(dataFlinkType(), iSchema)); } @Override protected void configureEqualityDelete(ORC.DeleteWriteBuilder builder) { - builder.createWriterFunc((iSchema, typDesc) -> FlinkOrcWriter.buildWriter(equalityDeleteFlinkType(), iSchema)); + builder.createWriterFunc( + (iSchema, typDesc) -> FlinkOrcWriter.buildWriter(equalityDeleteFlinkType(), iSchema)); } @Override protected void configurePositionDelete(ORC.DeleteWriteBuilder builder) { - builder.createWriterFunc((iSchema, typDesc) -> FlinkOrcWriter.buildWriter(positionDeleteFlinkType(), iSchema)); + builder.createWriterFunc( + (iSchema, typDesc) -> FlinkOrcWriter.buildWriter(positionDeleteFlinkType(), iSchema)); builder.transformPaths(path -> StringData.fromString(path.toString())); } @@ -131,7 +152,8 @@ private RowType dataFlinkType() { private RowType equalityDeleteFlinkType() { if (equalityDeleteFlinkType == null) { - Preconditions.checkNotNull(equalityDeleteRowSchema(), "Equality delete schema must not be null"); + Preconditions.checkNotNull( + equalityDeleteRowSchema(), "Equality delete schema must not be null"); this.equalityDeleteFlinkType = FlinkSchemaUtil.convert(equalityDeleteRowSchema()); } @@ -140,7 +162,8 @@ private RowType equalityDeleteFlinkType() { private RowType positionDeleteFlinkType() { if (positionDeleteFlinkType == null) { - // wrap the optional row schema into the position delete schema that contains path and position + // wrap the optional row schema into the position delete schema that contains path and + // position Schema positionDeleteSchema = DeleteSchemaUtil.posDeleteSchema(positionDeleteRowSchema()); this.positionDeleteFlinkType = FlinkSchemaUtil.convert(positionDeleteSchema); } @@ -167,10 +190,12 @@ static class Builder { Map properties = table.properties(); - String dataFileFormatName = properties.getOrDefault(DEFAULT_FILE_FORMAT, DEFAULT_FILE_FORMAT_DEFAULT); + String dataFileFormatName = + properties.getOrDefault(DEFAULT_FILE_FORMAT, DEFAULT_FILE_FORMAT_DEFAULT); this.dataFileFormat = FileFormat.valueOf(dataFileFormatName.toUpperCase(Locale.ENGLISH)); - String deleteFileFormatName = properties.getOrDefault(DELETE_DEFAULT_FILE_FORMAT, dataFileFormatName); + String deleteFileFormatName = + properties.getOrDefault(DELETE_DEFAULT_FILE_FORMAT, dataFileFormatName); this.deleteFileFormat = FileFormat.valueOf(deleteFileFormatName.toUpperCase(Locale.ENGLISH)); } @@ -186,8 +211,8 @@ Builder dataSchema(Schema newDataSchema) { /** * Sets a Flink type for data. - *

- * If not set, the value is derived from the provided Iceberg schema. + * + *
<p>
If not set, the value is derived from the provided Iceberg schema. */ Builder dataFlinkType(RowType newDataFlinkType) { this.dataFlinkType = newDataFlinkType; @@ -216,8 +241,8 @@ Builder equalityDeleteRowSchema(Schema newEqualityDeleteRowSchema) { /** * Sets a Flink type for equality deletes. - *

- * If not set, the value is derived from the provided Iceberg schema. + * + *
<p>
If not set, the value is derived from the provided Iceberg schema. */ Builder equalityDeleteFlinkType(RowType newEqualityDeleteFlinkType) { this.equalityDeleteFlinkType = newEqualityDeleteFlinkType; @@ -236,8 +261,8 @@ Builder positionDeleteRowSchema(Schema newPositionDeleteRowSchema) { /** * Sets a Flink type for position deletes. - *

- * If not set, the value is derived from the provided Iceberg schema. + * + *
<p>
If not set, the value is derived from the provided Iceberg schema. */ Builder positionDeleteFlinkType(RowType newPositionDeleteFlinkType) { this.positionDeleteFlinkType = newPositionDeleteFlinkType; @@ -247,13 +272,23 @@ Builder positionDeleteFlinkType(RowType newPositionDeleteFlinkType) { FlinkFileWriterFactory build() { boolean noEqualityDeleteConf = equalityFieldIds == null && equalityDeleteRowSchema == null; boolean fullEqualityDeleteConf = equalityFieldIds != null && equalityDeleteRowSchema != null; - Preconditions.checkArgument(noEqualityDeleteConf || fullEqualityDeleteConf, + Preconditions.checkArgument( + noEqualityDeleteConf || fullEqualityDeleteConf, "Equality field IDs and equality delete row schema must be set together"); return new FlinkFileWriterFactory( - table, dataFileFormat, dataSchema, dataFlinkType, dataSortOrder, deleteFileFormat, - equalityFieldIds, equalityDeleteRowSchema, equalityDeleteFlinkType, equalityDeleteSortOrder, - positionDeleteRowSchema, positionDeleteFlinkType); + table, + dataFileFormat, + dataSchema, + dataFlinkType, + dataSortOrder, + deleteFileFormat, + equalityFieldIds, + equalityDeleteRowSchema, + equalityDeleteFlinkType, + equalityDeleteSortOrder, + positionDeleteRowSchema, + positionDeleteFlinkType); } } } diff --git a/flink/v1.13/flink/src/main/java/org/apache/iceberg/flink/sink/FlinkManifestUtil.java b/flink/v1.13/flink/src/main/java/org/apache/iceberg/flink/sink/FlinkManifestUtil.java index b00018b3b770..996e4bbb1b01 100644 --- a/flink/v1.13/flink/src/main/java/org/apache/iceberg/flink/sink/FlinkManifestUtil.java +++ b/flink/v1.13/flink/src/main/java/org/apache/iceberg/flink/sink/FlinkManifestUtil.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.flink.sink; import java.io.IOException; @@ -41,12 +40,12 @@ class FlinkManifestUtil { private static final int FORMAT_V2 = 2; private static final Long DUMMY_SNAPSHOT_ID = 0L; - private FlinkManifestUtil() { - } + private FlinkManifestUtil() {} - static ManifestFile writeDataFiles(OutputFile outputFile, PartitionSpec spec, List dataFiles) - throws IOException { - ManifestWriter writer = ManifestFiles.write(FORMAT_V2, spec, outputFile, DUMMY_SNAPSHOT_ID); + static ManifestFile writeDataFiles( + OutputFile outputFile, PartitionSpec spec, List dataFiles) throws IOException { + ManifestWriter writer = + ManifestFiles.write(FORMAT_V2, spec, outputFile, DUMMY_SNAPSHOT_ID); try (ManifestWriter closeableWriter = writer) { closeableWriter.addAll(dataFiles); @@ -61,30 +60,32 @@ static List readDataFiles(ManifestFile manifestFile, FileIO io) throws } } - static ManifestOutputFileFactory createOutputFileFactory(Table table, String flinkJobId, int subTaskId, - long attemptNumber) { + static ManifestOutputFileFactory createOutputFileFactory( + Table table, String flinkJobId, int subTaskId, long attemptNumber) { TableOperations ops = ((HasTableOperations) table).operations(); - return new ManifestOutputFileFactory(ops, table.io(), table.properties(), flinkJobId, subTaskId, attemptNumber); + return new ManifestOutputFileFactory( + ops, table.io(), table.properties(), flinkJobId, subTaskId, attemptNumber); } - static DeltaManifests writeCompletedFiles(WriteResult result, - Supplier outputFileSupplier, - PartitionSpec spec) throws IOException { + static DeltaManifests writeCompletedFiles( + WriteResult result, Supplier outputFileSupplier, PartitionSpec spec) + throws IOException { ManifestFile dataManifest = null; ManifestFile 
deleteManifest = null; // Write the completed data files into a newly created data manifest file. if (result.dataFiles() != null && result.dataFiles().length > 0) { - dataManifest = writeDataFiles(outputFileSupplier.get(), spec, Lists.newArrayList(result.dataFiles())); + dataManifest = + writeDataFiles(outputFileSupplier.get(), spec, Lists.newArrayList(result.dataFiles())); } // Write the completed delete files into a newly created delete manifest file. if (result.deleteFiles() != null && result.deleteFiles().length > 0) { OutputFile deleteManifestFile = outputFileSupplier.get(); - ManifestWriter deleteManifestWriter = ManifestFiles.writeDeleteManifest(FORMAT_V2, spec, - deleteManifestFile, DUMMY_SNAPSHOT_ID); + ManifestWriter deleteManifestWriter = + ManifestFiles.writeDeleteManifest(FORMAT_V2, spec, deleteManifestFile, DUMMY_SNAPSHOT_ID); try (ManifestWriter writer = deleteManifestWriter) { for (DeleteFile deleteFile : result.deleteFiles()) { writer.add(deleteFile); @@ -97,7 +98,8 @@ static DeltaManifests writeCompletedFiles(WriteResult result, return new DeltaManifests(dataManifest, deleteManifest, result.referencedDataFiles()); } - static WriteResult readCompletedFiles(DeltaManifests deltaManifests, FileIO io) throws IOException { + static WriteResult readCompletedFiles(DeltaManifests deltaManifests, FileIO io) + throws IOException { WriteResult.Builder builder = WriteResult.builder(); // Read the completed data files from persisted data manifest file. @@ -107,13 +109,12 @@ static WriteResult readCompletedFiles(DeltaManifests deltaManifests, FileIO io) // Read the completed delete files from persisted delete manifests file. if (deltaManifests.deleteManifest() != null) { - try (CloseableIterable deleteFiles = ManifestFiles - .readDeleteManifest(deltaManifests.deleteManifest(), io, null)) { + try (CloseableIterable deleteFiles = + ManifestFiles.readDeleteManifest(deltaManifests.deleteManifest(), io, null)) { builder.addDeleteFiles(deleteFiles); } } - return builder.addReferencedDataFiles(deltaManifests.referencedDataFiles()) - .build(); + return builder.addReferencedDataFiles(deltaManifests.referencedDataFiles()).build(); } } diff --git a/flink/v1.13/flink/src/main/java/org/apache/iceberg/flink/sink/FlinkSink.java b/flink/v1.13/flink/src/main/java/org/apache/iceberg/flink/sink/FlinkSink.java index 4f11a676dfe9..8846bb137fe7 100644 --- a/flink/v1.13/flink/src/main/java/org/apache/iceberg/flink/sink/FlinkSink.java +++ b/flink/v1.13/flink/src/main/java/org/apache/iceberg/flink/sink/FlinkSink.java @@ -16,9 +16,10 @@ * specific language governing permissions and limitations * under the License. 
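The FlinkManifestUtil hunks above are formatting-only, but they cover the core of how the sink stages files between a checkpoint and the Iceberg commit: writeCompletedFiles serializes one checkpoint's WriteResult into temporary data/delete manifests, and readCompletedFiles restores it before the commit. A minimal, same-package sketch of that round trip; the wrapper method and its parameters are illustrative and assume the imports already present in this file:

```java
// Hedged sketch (same package as FlinkManifestUtil, since the helpers are package-private).
// Only the two FlinkManifestUtil calls and the table accessors come from the code above.
static WriteResult stageAndRestore(
    Table table,
    ManifestOutputFileFactory outputFileFactory,
    long checkpointId,
    DataFile... dataFiles)
    throws IOException {
  // Stage the checkpoint's completed files as temporary manifests (what snapshotState persists).
  WriteResult result = WriteResult.builder().addDataFiles(dataFiles).build();
  DeltaManifests deltaManifests =
      FlinkManifestUtil.writeCompletedFiles(
          result, () -> outputFileFactory.create(checkpointId), table.spec());

  // Read the staged files back (what the committer does before building the Iceberg commit).
  return FlinkManifestUtil.readCompletedFiles(deltaManifests, table.io());
}
```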
*/ - package org.apache.iceberg.flink.sink; +import static org.apache.iceberg.TableProperties.WRITE_DISTRIBUTION_MODE; + import java.io.IOException; import java.io.UncheckedIOException; import java.util.List; @@ -62,40 +63,39 @@ import org.slf4j.Logger; import org.slf4j.LoggerFactory; -import static org.apache.iceberg.TableProperties.WRITE_DISTRIBUTION_MODE; - public class FlinkSink { private static final Logger LOG = LoggerFactory.getLogger(FlinkSink.class); - private static final String ICEBERG_STREAM_WRITER_NAME = IcebergStreamWriter.class.getSimpleName(); - private static final String ICEBERG_FILES_COMMITTER_NAME = IcebergFilesCommitter.class.getSimpleName(); + private static final String ICEBERG_STREAM_WRITER_NAME = + IcebergStreamWriter.class.getSimpleName(); + private static final String ICEBERG_FILES_COMMITTER_NAME = + IcebergFilesCommitter.class.getSimpleName(); - private FlinkSink() { - } + private FlinkSink() {} /** - * Initialize a {@link Builder} to export the data from generic input data stream into iceberg table. We use - * {@link RowData} inside the sink connector, so users need to provide a mapper function and a - * {@link TypeInformation} to convert those generic records to a RowData DataStream. + * Initialize a {@link Builder} to export the data from generic input data stream into iceberg + * table. We use {@link RowData} inside the sink connector, so users need to provide a mapper + * function and a {@link TypeInformation} to convert those generic records to a RowData + * DataStream. * - * @param input the generic source input data stream. - * @param mapper function to convert the generic data to {@link RowData} + * @param input the generic source input data stream. + * @param mapper function to convert the generic data to {@link RowData} * @param outputType to define the {@link TypeInformation} for the input data. - * @param the data type of records. + * @param the data type of records. * @return {@link Builder} to connect the iceberg table. */ - public static Builder builderFor(DataStream input, - MapFunction mapper, - TypeInformation outputType) { + public static Builder builderFor( + DataStream input, MapFunction mapper, TypeInformation outputType) { return new Builder().forMapperOutputType(input, mapper, outputType); } /** - * Initialize a {@link Builder} to export the data from input data stream with {@link Row}s into iceberg table. We use - * {@link RowData} inside the sink connector, so users need to provide a {@link TableSchema} for builder to convert - * those {@link Row}s to a {@link RowData} DataStream. + * Initialize a {@link Builder} to export the data from input data stream with {@link Row}s into + * iceberg table. We use {@link RowData} inside the sink connector, so users need to provide a + * {@link TableSchema} for builder to convert those {@link Row}s to a {@link RowData} DataStream. * - * @param input the source input data stream with {@link Row}s. + * @param input the source input data stream with {@link Row}s. * @param tableSchema defines the {@link TypeInformation} for input data. * @return {@link Builder} to connect the iceberg table. 
*/ @@ -103,13 +103,15 @@ public static Builder forRow(DataStream input, TableSchema tableSchema) { RowType rowType = (RowType) tableSchema.toRowDataType().getLogicalType(); DataType[] fieldDataTypes = tableSchema.getFieldDataTypes(); - DataFormatConverters.RowConverter rowConverter = new DataFormatConverters.RowConverter(fieldDataTypes); + DataFormatConverters.RowConverter rowConverter = + new DataFormatConverters.RowConverter(fieldDataTypes); return builderFor(input, rowConverter::toInternal, FlinkCompatibilityUtil.toTypeInfo(rowType)) .tableSchema(tableSchema); } /** - * Initialize a {@link Builder} to export the data from input data stream with {@link RowData}s into iceberg table. + * Initialize a {@link Builder} to export the data from input data stream with {@link RowData}s + * into iceberg table. * * @param input the source input data stream with {@link RowData}s. * @return {@link Builder} to connect the iceberg table. @@ -130,34 +132,35 @@ public static class Builder { private final Map writeOptions = Maps.newHashMap(); private FlinkWriteConf flinkWriteConf = null; - private Builder() { - } + private Builder() {} private Builder forRowData(DataStream newRowDataInput) { this.inputCreator = ignored -> newRowDataInput; return this; } - private Builder forMapperOutputType(DataStream input, - MapFunction mapper, - TypeInformation outputType) { - this.inputCreator = newUidPrefix -> { - // Input stream order is crucial for some situation(e.g. in cdc case). Therefore, we need to set the parallelism - // of map operator same as its input to keep map operator chaining its input, and avoid rebalanced by default. - SingleOutputStreamOperator inputStream = input.map(mapper, outputType) - .setParallelism(input.getParallelism()); - if (newUidPrefix != null) { - inputStream.name(operatorName(newUidPrefix)).uid(newUidPrefix + "-mapper"); - } - return inputStream; - }; + private Builder forMapperOutputType( + DataStream input, MapFunction mapper, TypeInformation outputType) { + this.inputCreator = + newUidPrefix -> { + // Input stream order is crucial for some situation(e.g. in cdc case). Therefore, we + // need to set the parallelism + // of map operator same as its input to keep map operator chaining its input, and avoid + // rebalanced by default. + SingleOutputStreamOperator inputStream = + input.map(mapper, outputType).setParallelism(input.getParallelism()); + if (newUidPrefix != null) { + inputStream.name(operatorName(newUidPrefix)).uid(newUidPrefix + "-mapper"); + } + return inputStream; + }; return this; } /** - * This iceberg {@link Table} instance is used for initializing {@link IcebergStreamWriter} which will write all - * the records into {@link DataFile}s and emit them to downstream operator. Providing a table would avoid so many - * table loading from each separate task. + * This iceberg {@link Table} instance is used for initializing {@link IcebergStreamWriter} + * which will write all the records into {@link DataFile}s and emit them to downstream operator. + * Providing a table would avoid so many table loading from each separate task. * * @param newTable the loaded iceberg table instance. * @return {@link Builder} to connect the iceberg table. @@ -168,9 +171,9 @@ public Builder table(Table newTable) { } /** - * The table loader is used for loading tables in {@link IcebergFilesCommitter} lazily, we need this loader because - * {@link Table} is not serializable and could not just use the loaded table from Builder#table in the remote task - * manager. 
+ * The table loader is used for loading tables in {@link IcebergFilesCommitter} lazily, we need + * this loader because {@link Table} is not serializable and could not just use the loaded table + * from Builder#table in the remote task manager. * * @param newTableLoader to load iceberg table inside tasks. * @return {@link Builder} to connect the iceberg table. @@ -181,8 +184,8 @@ public Builder tableLoader(TableLoader newTableLoader) { } /** - * Set the write properties for Flink sink. - * View the supported properties in {@link FlinkWriteOptions} + * Set the write properties for Flink sink. View the supported properties in {@link + * FlinkWriteOptions} */ public Builder set(String property, String value) { writeOptions.put(property, value); @@ -190,8 +193,8 @@ public Builder set(String property, String value) { } /** - * Set the write properties for Flink sink. - * View the supported properties in {@link FlinkWriteOptions} + * Set the write properties for Flink sink. View the supported properties in {@link + * FlinkWriteOptions} */ public Builder setAll(Map properties) { writeOptions.putAll(properties); @@ -214,14 +217,15 @@ public Builder flinkConf(ReadableConfig config) { } /** - * Configure the write {@link DistributionMode} that the flink sink will use. Currently, flink support - * {@link DistributionMode#NONE} and {@link DistributionMode#HASH}. + * Configure the write {@link DistributionMode} that the flink sink will use. Currently, flink + * support {@link DistributionMode#NONE} and {@link DistributionMode#HASH}. * * @param mode to specify the write distribution mode. * @return {@link Builder} to connect the iceberg table. */ public Builder distributionMode(DistributionMode mode) { - Preconditions.checkArgument(!DistributionMode.RANGE.equals(mode), + Preconditions.checkArgument( + !DistributionMode.RANGE.equals(mode), "Flink does not support 'range' write distribution mode now."); if (mode != null) { writeOptions.put(FlinkWriteOptions.DISTRIBUTION_MODE.key(), mode.modeName()); @@ -241,10 +245,10 @@ public Builder writeParallelism(int newWriteParallelism) { } /** - * All INSERT/UPDATE_AFTER events from input stream will be transformed to UPSERT events, which means it will - * DELETE the old records and then INSERT the new records. In partitioned table, the partition fields should be - * a subset of equality fields, otherwise the old row that located in partition-A could not be deleted by the - * new row that located in partition-B. + * All INSERT/UPDATE_AFTER events from input stream will be transformed to UPSERT events, which + * means it will DELETE the old records and then INSERT the new records. In partitioned table, + * the partition fields should be a subset of equality fields, otherwise the old row that + * located in partition-A could not be deleted by the new row that located in partition-B. * * @param enabled indicate whether it should transform all INSERT/UPDATE_AFTER events to UPSERT. * @return {@link Builder} to connect the iceberg table. @@ -266,22 +270,25 @@ public Builder equalityFieldColumns(List columns) { } /** - * Set the uid prefix for FlinkSink operators. Note that FlinkSink internally consists of multiple operators (like - * writer, committer, dummy sink etc.) Actually operator uid will be appended with a suffix like "uidPrefix-writer". - *

- * If provided, this prefix is also applied to operator names. - *
- * Flink auto generates operator uid if not set explicitly. It is a recommended - * - * best-practice to set uid for all operators before deploying to production. Flink has an option to {@code - * pipeline.auto-generate-uid=false} to disable auto-generation and force explicit setting of all operator uid. - *
- * Be careful with setting this for an existing job, because now we are changing the operator uid from an - * auto-generated one to this new value. When deploying the change with a checkpoint, Flink won't be able to restore - * the previous Flink sink operator state (more specifically the committer operator state). You need to use {@code - * --allowNonRestoredState} to ignore the previous sink state. During restore Flink sink state is used to check if - * last commit was actually successful or not. {@code --allowNonRestoredState} can lead to data loss if the - * Iceberg commit failed in the last completed checkpoint. + * Set the uid prefix for FlinkSink operators. Note that FlinkSink internally consists of + * multiple operators (like writer, committer, dummy sink etc.) Actually operator uid will be + * appended with a suffix like "uidPrefix-writer".
+ *
+ * If provided, this prefix is also applied to operator names.
+ *
+ * Flink auto generates operator uid if not set explicitly. It is a recommended + * best-practice to set uid for all operators before deploying to production. Flink has an + * option to {@code pipeline.auto-generate-uid=false} to disable auto-generation and force + * explicit setting of all operator uid.
+ *
+ * Be careful with setting this for an existing job, because now we are changing the operator + * uid from an auto-generated one to this new value. When deploying the change with a + * checkpoint, Flink won't be able to restore the previous Flink sink operator state (more + * specifically the committer operator state). You need to use {@code --allowNonRestoredState} + * to ignore the previous sink state. During restore Flink sink state is used to check if last + * commit was actually successful or not. {@code --allowNonRestoredState} can lead to data loss + * if the Iceberg commit failed in the last completed checkpoint. * * @param newPrefix prefix for Flink sink operator uid and name * @return {@link Builder} to connect the iceberg table. @@ -292,7 +299,8 @@ public Builder uidPrefix(String newPrefix) { } private DataStreamSink chainIcebergOperators() { - Preconditions.checkArgument(inputCreator != null, + Preconditions.checkArgument( + inputCreator != null, "Please use forRowData() or forMapperOutputType() to initialize the input DataStream."); Preconditions.checkNotNull(tableLoader, "Table loader shouldn't be null"); @@ -303,7 +311,8 @@ private DataStreamSink chainIcebergOperators() { try (TableLoader loader = tableLoader) { this.table = loader.loadTable(); } catch (IOException e) { - throw new UncheckedIOException("Failed to load iceberg table from table loader: " + tableLoader, e); + throw new UncheckedIOException( + "Failed to load iceberg table from table loader: " + tableLoader, e); } } @@ -315,13 +324,15 @@ private DataStreamSink chainIcebergOperators() { // Convert the requested flink table schema to flink row type. RowType flinkRowType = toFlinkRowType(table.schema(), tableSchema); - // Distribute the records from input data stream based on the write.distribution-mode and equality fields. - DataStream distributeStream = distributeDataStream( - rowDataInput, equalityFieldIds, table.spec(), table.schema(), flinkRowType); + // Distribute the records from input data stream based on the write.distribution-mode and + // equality fields. 
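The builder options reformatted above (equalityFieldColumns, upsert, distributionMode, uidPrefix) are what feed this distribution step. A hedged example of wiring them together; the stream, the loader and the "id" column are placeholders:

```java
// Hedged sketch; rowDataStream, tableLoader and the "id" column are illustrative.
FlinkSink.forRowData(rowDataStream)
    .tableLoader(tableLoader)
    .equalityFieldColumns(Arrays.asList("id")) // in upsert mode these must cover the partition fields
    .upsert(true)                              // INSERT/UPDATE_AFTER become UPSERT (delete + insert)
    .distributionMode(DistributionMode.HASH)   // RANGE is rejected by the precondition above
    .uidPrefix("iceberg-sink")                 // stable operator uids/names for state compatibility
    .append();
```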
+ DataStream distributeStream = + distributeDataStream( + rowDataInput, equalityFieldIds, table.spec(), table.schema(), flinkRowType); // Add parallel writers that append rows to files - SingleOutputStreamOperator writerStream = appendWriter(distributeStream, flinkRowType, - equalityFieldIds); + SingleOutputStreamOperator writerStream = + appendWriter(distributeStream, flinkRowType, equalityFieldIds); // Add single-parallelism committer that commits files // after successful checkpoint or end of input @@ -348,18 +359,24 @@ private String operatorName(String suffix) { List checkAndGetEqualityFieldIds() { List equalityFieldIds = Lists.newArrayList(table.schema().identifierFieldIds()); if (equalityFieldColumns != null && equalityFieldColumns.size() > 0) { - Set equalityFieldSet = Sets.newHashSetWithExpectedSize(equalityFieldColumns.size()); + Set equalityFieldSet = + Sets.newHashSetWithExpectedSize(equalityFieldColumns.size()); for (String column : equalityFieldColumns) { org.apache.iceberg.types.Types.NestedField field = table.schema().findField(column); - Preconditions.checkNotNull(field, "Missing required equality field column '%s' in table schema %s", - column, table.schema()); + Preconditions.checkNotNull( + field, + "Missing required equality field column '%s' in table schema %s", + column, + table.schema()); equalityFieldSet.add(field.fieldId()); } if (!equalityFieldSet.equals(table.schema().identifierFieldIds())) { - LOG.warn("The configured equality field column IDs {} are not matched with the schema identifier field IDs" + - " {}, use job specified equality field columns as the equality fields by default.", - equalityFieldSet, table.schema().identifierFieldIds()); + LOG.warn( + "The configured equality field column IDs {} are not matched with the schema identifier field IDs" + + " {}, use job specified equality field columns as the equality fields by default.", + equalityFieldSet, + table.schema().identifierFieldIds()); } equalityFieldIds = Lists.newArrayList(equalityFieldSet); } @@ -367,64 +384,78 @@ List checkAndGetEqualityFieldIds() { } @SuppressWarnings("unchecked") - private DataStreamSink appendDummySink(SingleOutputStreamOperator committerStream) { - DataStreamSink resultStream = committerStream - .addSink(new DiscardingSink()) - .name(operatorName(String.format("IcebergSink %s", this.table.name()))) - .setParallelism(1); + private DataStreamSink appendDummySink( + SingleOutputStreamOperator committerStream) { + DataStreamSink resultStream = + committerStream + .addSink(new DiscardingSink()) + .name(operatorName(String.format("IcebergSink %s", this.table.name()))) + .setParallelism(1); if (uidPrefix != null) { resultStream = resultStream.uid(uidPrefix + "-dummysink"); } return resultStream; } - private SingleOutputStreamOperator appendCommitter(SingleOutputStreamOperator writerStream) { - IcebergFilesCommitter filesCommitter = new IcebergFilesCommitter(tableLoader, flinkWriteConf.overwriteMode()); - SingleOutputStreamOperator committerStream = writerStream - .transform(operatorName(ICEBERG_FILES_COMMITTER_NAME), Types.VOID, filesCommitter) - .setParallelism(1) - .setMaxParallelism(1); + private SingleOutputStreamOperator appendCommitter( + SingleOutputStreamOperator writerStream) { + IcebergFilesCommitter filesCommitter = + new IcebergFilesCommitter(tableLoader, flinkWriteConf.overwriteMode()); + SingleOutputStreamOperator committerStream = + writerStream + .transform(operatorName(ICEBERG_FILES_COMMITTER_NAME), Types.VOID, filesCommitter) + .setParallelism(1) + 
.setMaxParallelism(1); if (uidPrefix != null) { committerStream = committerStream.uid(uidPrefix + "-committer"); } return committerStream; } - private SingleOutputStreamOperator appendWriter(DataStream input, RowType flinkRowType, - List equalityFieldIds) { + private SingleOutputStreamOperator appendWriter( + DataStream input, RowType flinkRowType, List equalityFieldIds) { // Validate the equality fields and partition fields if we enable the upsert mode. if (flinkWriteConf.upsertMode()) { - Preconditions.checkState(!flinkWriteConf.overwriteMode(), + Preconditions.checkState( + !flinkWriteConf.overwriteMode(), "OVERWRITE mode shouldn't be enable when configuring to use UPSERT data stream."); - Preconditions.checkState(!equalityFieldIds.isEmpty(), + Preconditions.checkState( + !equalityFieldIds.isEmpty(), "Equality field columns shouldn't be empty when configuring to use UPSERT data stream."); if (!table.spec().isUnpartitioned()) { for (PartitionField partitionField : table.spec().fields()) { - Preconditions.checkState(equalityFieldIds.contains(partitionField.sourceId()), + Preconditions.checkState( + equalityFieldIds.contains(partitionField.sourceId()), "In UPSERT mode, partition field '%s' should be included in equality fields: '%s'", - partitionField, equalityFieldColumns); + partitionField, + equalityFieldColumns); } } } - IcebergStreamWriter streamWriter = createStreamWriter(table, flinkWriteConf, - flinkRowType, equalityFieldIds); + IcebergStreamWriter streamWriter = + createStreamWriter(table, flinkWriteConf, flinkRowType, equalityFieldIds); int parallelism = writeParallelism == null ? input.getParallelism() : writeParallelism; - SingleOutputStreamOperator writerStream = input - .transform(operatorName(ICEBERG_STREAM_WRITER_NAME), TypeInformation.of(WriteResult.class), streamWriter) - .setParallelism(parallelism); + SingleOutputStreamOperator writerStream = + input + .transform( + operatorName(ICEBERG_STREAM_WRITER_NAME), + TypeInformation.of(WriteResult.class), + streamWriter) + .setParallelism(parallelism); if (uidPrefix != null) { writerStream = writerStream.uid(uidPrefix + "-writer"); } return writerStream; } - private DataStream distributeDataStream(DataStream input, - List equalityFieldIds, - PartitionSpec partitionSpec, - Schema iSchema, - RowType flinkRowType) { + private DataStream distributeDataStream( + DataStream input, + List equalityFieldIds, + PartitionSpec partitionSpec, + Schema iSchema, + RowType flinkRowType) { DistributionMode writeMode = flinkWriteConf.distributionMode(); LOG.info("Write distribution mode is '{}'", writeMode.modeName()); @@ -434,28 +465,35 @@ private DataStream distributeDataStream(DataStream input, return input; } else { LOG.info("Distribute rows by equality fields, because there are equality fields set"); - return input.keyBy(new EqualityFieldKeySelector(iSchema, flinkRowType, equalityFieldIds)); + return input.keyBy( + new EqualityFieldKeySelector(iSchema, flinkRowType, equalityFieldIds)); } case HASH: if (equalityFieldIds.isEmpty()) { if (partitionSpec.isUnpartitioned()) { - LOG.warn("Fallback to use 'none' distribution mode, because there are no equality fields set " + - "and table is unpartitioned"); + LOG.warn( + "Fallback to use 'none' distribution mode, because there are no equality fields set " + + "and table is unpartitioned"); return input; } else { return input.keyBy(new PartitionKeySelector(partitionSpec, iSchema, flinkRowType)); } } else { if (partitionSpec.isUnpartitioned()) { - LOG.info("Distribute rows by equality fields, because 
there are equality fields set " + - "and table is unpartitioned"); - return input.keyBy(new EqualityFieldKeySelector(iSchema, flinkRowType, equalityFieldIds)); + LOG.info( + "Distribute rows by equality fields, because there are equality fields set " + + "and table is unpartitioned"); + return input.keyBy( + new EqualityFieldKeySelector(iSchema, flinkRowType, equalityFieldIds)); } else { for (PartitionField partitionField : partitionSpec.fields()) { - Preconditions.checkState(equalityFieldIds.contains(partitionField.sourceId()), - "In 'hash' distribution mode with equality fields set, partition field '%s' " + - "should be included in equality fields: '%s'", partitionField, equalityFieldColumns); + Preconditions.checkState( + equalityFieldIds.contains(partitionField.sourceId()), + "In 'hash' distribution mode with equality fields set, partition field '%s' " + + "should be included in equality fields: '%s'", + partitionField, + equalityFieldColumns); } return input.keyBy(new PartitionKeySelector(partitionSpec, iSchema, flinkRowType)); } @@ -463,13 +501,18 @@ private DataStream distributeDataStream(DataStream input, case RANGE: if (equalityFieldIds.isEmpty()) { - LOG.warn("Fallback to use 'none' distribution mode, because there are no equality fields set " + - "and {}=range is not supported yet in flink", WRITE_DISTRIBUTION_MODE); + LOG.warn( + "Fallback to use 'none' distribution mode, because there are no equality fields set " + + "and {}=range is not supported yet in flink", + WRITE_DISTRIBUTION_MODE); return input; } else { - LOG.info("Distribute rows by equality fields, because there are equality fields set " + - "and{}=range is not supported yet in flink", WRITE_DISTRIBUTION_MODE); - return input.keyBy(new EqualityFieldKeySelector(iSchema, flinkRowType, equalityFieldIds)); + LOG.info( + "Distribute rows by equality fields, because there are equality fields set " + + "and{}=range is not supported yet in flink", + WRITE_DISTRIBUTION_MODE); + return input.keyBy( + new EqualityFieldKeySelector(iSchema, flinkRowType, equalityFieldIds)); } default: @@ -480,13 +523,17 @@ private DataStream distributeDataStream(DataStream input, static RowType toFlinkRowType(Schema schema, TableSchema requestedSchema) { if (requestedSchema != null) { - // Convert the flink schema to iceberg schema firstly, then reassign ids to match the existing iceberg schema. + // Convert the flink schema to iceberg schema firstly, then reassign ids to match the existing + // iceberg schema. Schema writeSchema = TypeUtil.reassignIds(FlinkSchemaUtil.convert(requestedSchema), schema); TypeUtil.validateWriteSchema(schema, writeSchema, true, true); - // We use this flink schema to read values from RowData. The flink's TINYINT and SMALLINT will be promoted to - // iceberg INTEGER, that means if we use iceberg's table schema to read TINYINT (backend by 1 'byte'), we will - // read 4 bytes rather than 1 byte, it will mess up the byte array in BinaryRowData. So here we must use flink + // We use this flink schema to read values from RowData. The flink's TINYINT and SMALLINT will + // be promoted to + // iceberg INTEGER, that means if we use iceberg's table schema to read TINYINT (backend by 1 + // 'byte'), we will + // read 4 bytes rather than 1 byte, it will mess up the byte array in BinaryRowData. So here + // we must use flink // schema. 
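The comment above is the reason toFlinkRowType keeps the caller-provided Flink row type instead of re-deriving it from the Iceberg schema. A hedged illustration of the case it protects against; the stream and loader are placeholders:

```java
// Hedged illustration. TINYINT/SMALLINT are promoted to Iceberg int in the table schema, but the
// sink must keep reading them through the narrower Flink types so BinaryRowData field widths line up.
TableSchema tableSchema =
    TableSchema.builder()
        .field("id", DataTypes.INT())
        .field("level", DataTypes.TINYINT()) // stored as Iceberg int, read as a 1-byte Flink field
        .build();

FlinkSink.forRow(rowStream, tableSchema) // rowStream: DataStream<Row>, assumed to exist
    .tableLoader(tableLoader)            // assumed TableLoader
    .append();
```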
return (RowType) requestedSchema.toRowDataType().getLogicalType(); } else { @@ -494,16 +541,22 @@ static RowType toFlinkRowType(Schema schema, TableSchema requestedSchema) { } } - static IcebergStreamWriter createStreamWriter(Table table, - FlinkWriteConf flinkWriteConf, - RowType flinkRowType, - List equalityFieldIds) { + static IcebergStreamWriter createStreamWriter( + Table table, + FlinkWriteConf flinkWriteConf, + RowType flinkRowType, + List equalityFieldIds) { Preconditions.checkArgument(table != null, "Iceberg table shouldn't be null"); Table serializableTable = SerializableTable.copyOf(table); - TaskWriterFactory taskWriterFactory = new RowDataTaskWriterFactory( - serializableTable, flinkRowType, flinkWriteConf.targetDataFileSize(), - flinkWriteConf.dataFileFormat(), equalityFieldIds, flinkWriteConf.upsertMode()); + TaskWriterFactory taskWriterFactory = + new RowDataTaskWriterFactory( + serializableTable, + flinkRowType, + flinkWriteConf.targetDataFileSize(), + flinkWriteConf.dataFileFormat(), + equalityFieldIds, + flinkWriteConf.upsertMode()); return new IcebergStreamWriter<>(table.name(), taskWriterFactory); } } diff --git a/flink/v1.13/flink/src/main/java/org/apache/iceberg/flink/sink/IcebergFilesCommitter.java b/flink/v1.13/flink/src/main/java/org/apache/iceberg/flink/sink/IcebergFilesCommitter.java index 010df8cf5da2..dd189b575ea8 100644 --- a/flink/v1.13/flink/src/main/java/org/apache/iceberg/flink/sink/IcebergFilesCommitter.java +++ b/flink/v1.13/flink/src/main/java/org/apache/iceberg/flink/sink/IcebergFilesCommitter.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.flink.sink; import java.io.IOException; @@ -68,9 +67,12 @@ class IcebergFilesCommitter extends AbstractStreamOperator private static final Logger LOG = LoggerFactory.getLogger(IcebergFilesCommitter.class); private static final String FLINK_JOB_ID = "flink.job-id"; - // The max checkpoint id we've committed to iceberg table. As the flink's checkpoint is always increasing, so we could - // correctly commit all the data files whose checkpoint id is greater than the max committed one to iceberg table, for - // avoiding committing the same data files twice. This id will be attached to iceberg's meta when committing the + // The max checkpoint id we've committed to iceberg table. As the flink's checkpoint is always + // increasing, so we could + // correctly commit all the data files whose checkpoint id is greater than the max committed one + // to iceberg table, for + // avoiding committing the same data files twice. This id will be attached to iceberg's meta when + // committing the // iceberg transaction. private static final String MAX_COMMITTED_CHECKPOINT_ID = "flink.max-committed-checkpoint-id"; static final String MAX_CONTINUOUS_EMPTY_COMMITS = "flink.max-continuous-empty-commits"; @@ -79,15 +81,21 @@ class IcebergFilesCommitter extends AbstractStreamOperator private final TableLoader tableLoader; private final boolean replacePartitions; - // A sorted map to maintain the completed data files for each pending checkpointId (which have not been committed - // to iceberg table). We need a sorted map here because there's possible that few checkpoints snapshot failed, for - // example: the 1st checkpoint have 2 data files <1, >, the 2st checkpoint have 1 data files - // <2, >. Snapshot for checkpoint#1 interrupted because of network/disk failure etc, while we don't expect - // any data loss in iceberg table. 
So we keep the finished files <1, > in memory and retry to commit + // A sorted map to maintain the completed data files for each pending checkpointId (which have not + // been committed + // to iceberg table). We need a sorted map here because there's possible that few checkpoints + // snapshot failed, for + // example: the 1st checkpoint have 2 data files <1, >, the 2st checkpoint have 1 + // data files + // <2, >. Snapshot for checkpoint#1 interrupted because of network/disk failure etc, while + // we don't expect + // any data loss in iceberg table. So we keep the finished files <1, > in memory and + // retry to commit // iceberg table when the next checkpoint happen. private final NavigableMap dataFilesPerCheckpoint = Maps.newTreeMap(); - // The completed files cache for current checkpoint. Once the snapshot barrier received, it will be flushed to the + // The completed files cache for current checkpoint. Once the snapshot barrier received, it will + // be flushed to the // 'dataFilesPerCheckpoint'. private final List writeResultsOfCurrentCkpt = Lists.newArrayList(); @@ -98,15 +106,19 @@ class IcebergFilesCommitter extends AbstractStreamOperator private transient long maxCommittedCheckpointId; private transient int continuousEmptyCheckpoints; private transient int maxContinuousEmptyCommits; - // There're two cases that we restore from flink checkpoints: the first case is restoring from snapshot created by the - // same flink job; another case is restoring from snapshot created by another different job. For the second case, we - // need to maintain the old flink job's id in flink state backend to find the max-committed-checkpoint-id when + // There're two cases that we restore from flink checkpoints: the first case is restoring from + // snapshot created by the + // same flink job; another case is restoring from snapshot created by another different job. For + // the second case, we + // need to maintain the old flink job's id in flink state backend to find the + // max-committed-checkpoint-id when // traversing iceberg table's snapshots. - private static final ListStateDescriptor JOB_ID_DESCRIPTOR = new ListStateDescriptor<>( - "iceberg-flink-job-id", BasicTypeInfo.STRING_TYPE_INFO); + private static final ListStateDescriptor JOB_ID_DESCRIPTOR = + new ListStateDescriptor<>("iceberg-flink-job-id", BasicTypeInfo.STRING_TYPE_INFO); private transient ListState jobIdState; // All pending checkpoints states for this function. 
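The comments above describe how the committer pairs the Flink job id kept in operator state with a max-committed-checkpoint-id written into each snapshot summary. A hedged sketch of how that id can be recovered from table metadata; the traversal is illustrative, not necessarily the exact implementation, and -1 stands in for INITIAL_CHECKPOINT_ID:

```java
// Hedged sketch; the summary keys match the FLINK_JOB_ID and MAX_COMMITTED_CHECKPOINT_ID
// constants above.
static long maxCommittedCheckpointId(Table table, String flinkJobId) {
  Snapshot snapshot = table.currentSnapshot();
  while (snapshot != null) {
    Map<String, String> summary = snapshot.summary();
    if (flinkJobId.equals(summary.get("flink.job-id"))) {
      String committed = summary.get("flink.max-committed-checkpoint-id");
      if (committed != null) {
        return Long.parseLong(committed);
      }
    }
    Long parentId = snapshot.parentId();
    snapshot = parentId == null ? null : table.snapshot(parentId);
  }
  return -1L; // assumed initial value when no matching snapshot exists
}
```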
- private static final ListStateDescriptor> STATE_DESCRIPTOR = buildStateDescriptor(); + private static final ListStateDescriptor> STATE_DESCRIPTOR = + buildStateDescriptor(); private transient ListState> checkpointsState; IcebergFilesCommitter(TableLoader tableLoader, boolean replacePartitions) { @@ -123,30 +135,35 @@ public void initializeState(StateInitializationContext context) throws Exception this.tableLoader.open(); this.table = tableLoader.loadTable(); - maxContinuousEmptyCommits = PropertyUtil.propertyAsInt(table.properties(), MAX_CONTINUOUS_EMPTY_COMMITS, 10); - Preconditions.checkArgument(maxContinuousEmptyCommits > 0, - MAX_CONTINUOUS_EMPTY_COMMITS + " must be positive"); + maxContinuousEmptyCommits = + PropertyUtil.propertyAsInt(table.properties(), MAX_CONTINUOUS_EMPTY_COMMITS, 10); + Preconditions.checkArgument( + maxContinuousEmptyCommits > 0, MAX_CONTINUOUS_EMPTY_COMMITS + " must be positive"); int subTaskId = getRuntimeContext().getIndexOfThisSubtask(); int attemptId = getRuntimeContext().getAttemptNumber(); - this.manifestOutputFileFactory = FlinkManifestUtil.createOutputFileFactory(table, flinkJobId, subTaskId, attemptId); + this.manifestOutputFileFactory = + FlinkManifestUtil.createOutputFileFactory(table, flinkJobId, subTaskId, attemptId); this.maxCommittedCheckpointId = INITIAL_CHECKPOINT_ID; this.checkpointsState = context.getOperatorStateStore().getListState(STATE_DESCRIPTOR); this.jobIdState = context.getOperatorStateStore().getListState(JOB_ID_DESCRIPTOR); if (context.isRestored()) { String restoredFlinkJobId = jobIdState.get().iterator().next(); - Preconditions.checkState(!Strings.isNullOrEmpty(restoredFlinkJobId), + Preconditions.checkState( + !Strings.isNullOrEmpty(restoredFlinkJobId), "Flink job id parsed from checkpoint snapshot shouldn't be null or empty"); - // Since flink's checkpoint id will start from the max-committed-checkpoint-id + 1 in the new flink job even if - // it's restored from a snapshot created by another different flink job, so it's safe to assign the max committed + // Since flink's checkpoint id will start from the max-committed-checkpoint-id + 1 in the new + // flink job even if + // it's restored from a snapshot created by another different flink job, so it's safe to + // assign the max committed // checkpoint id from restored flink job to the current flink job. this.maxCommittedCheckpointId = getMaxCommittedCheckpointId(table, restoredFlinkJobId); - NavigableMap uncommittedDataFiles = Maps - .newTreeMap(checkpointsState.get().iterator().next()) - .tailMap(maxCommittedCheckpointId, false); + NavigableMap uncommittedDataFiles = + Maps.newTreeMap(checkpointsState.get().iterator().next()) + .tailMap(maxCommittedCheckpointId, false); if (!uncommittedDataFiles.isEmpty()) { // Committed all uncommitted data files from the old flink job to iceberg table. long maxUncommittedCheckpointId = uncommittedDataFiles.lastKey(); @@ -159,7 +176,10 @@ public void initializeState(StateInitializationContext context) throws Exception public void snapshotState(StateSnapshotContext context) throws Exception { super.snapshotState(context); long checkpointId = context.getCheckpointId(); - LOG.info("Start to flush snapshot state to state backend, table: {}, checkpointId: {}", table, checkpointId); + LOG.info( + "Start to flush snapshot state to state backend, table: {}, checkpointId: {}", + table, + checkpointId); // Update the checkpoint state. 
dataFilesPerCheckpoint.put(checkpointId, writeToManifest(checkpointId)); @@ -182,7 +202,8 @@ public void notifyCheckpointComplete(long checkpointId) throws Exception { // 2. snapshotState(ckpId+1); // 3. notifyCheckpointComplete(ckpId+1); // 4. notifyCheckpointComplete(ckpId); - // For step#4, we don't need to commit iceberg table again because in step#3 we've committed all the files, + // For step#4, we don't need to commit iceberg table again because in step#3 we've committed all + // the files, // Besides, we need to maintain the max-committed-checkpoint-id to be increasing. if (checkpointId > maxCommittedCheckpointId) { commitUpToCheckpoint(dataFilesPerCheckpoint, flinkJobId, checkpointId); @@ -190,9 +211,9 @@ public void notifyCheckpointComplete(long checkpointId) throws Exception { } } - private void commitUpToCheckpoint(NavigableMap deltaManifestsMap, - String newFlinkJobId, - long checkpointId) throws IOException { + private void commitUpToCheckpoint( + NavigableMap deltaManifestsMap, String newFlinkJobId, long checkpointId) + throws IOException { NavigableMap pendingMap = deltaManifestsMap.headMap(checkpointId, true); List manifests = Lists.newArrayList(); NavigableMap pendingResults = Maps.newTreeMap(); @@ -202,14 +223,18 @@ private void commitUpToCheckpoint(NavigableMap deltaManifestsMap, continue; } - DeltaManifests deltaManifests = SimpleVersionedSerialization - .readVersionAndDeSerialize(DeltaManifestsSerializer.INSTANCE, e.getValue()); - pendingResults.put(e.getKey(), FlinkManifestUtil.readCompletedFiles(deltaManifests, table.io())); + DeltaManifests deltaManifests = + SimpleVersionedSerialization.readVersionAndDeSerialize( + DeltaManifestsSerializer.INSTANCE, e.getValue()); + pendingResults.put( + e.getKey(), FlinkManifestUtil.readCompletedFiles(deltaManifests, table.io())); manifests.addAll(deltaManifests.manifests()); } - int totalFiles = pendingResults.values().stream() - .mapToInt(r -> r.dataFiles().length + r.deleteFiles().length).sum(); + int totalFiles = + pendingResults.values().stream() + .mapToInt(r -> r.dataFiles().length + r.deleteFiles().length) + .sum(); continuousEmptyCheckpoints = totalFiles == 0 ? continuousEmptyCheckpoints + 1 : 0; if (totalFiles != 0 || continuousEmptyCheckpoints % maxContinuousEmptyCommits == 0) { if (replacePartitions) { @@ -227,21 +252,25 @@ private void commitUpToCheckpoint(NavigableMap deltaManifestsMap, table.io().deleteFile(manifest.path()); } catch (Exception e) { // The flink manifests cleaning failure shouldn't abort the completed checkpoint. - String details = MoreObjects.toStringHelper(this) - .add("flinkJobId", newFlinkJobId) - .add("checkpointId", checkpointId) - .add("manifestPath", manifest.path()) - .toString(); - LOG.warn("The iceberg transaction has been committed, but we failed to clean the temporary flink manifests: {}", - details, e); + String details = + MoreObjects.toStringHelper(this) + .add("flinkJobId", newFlinkJobId) + .add("checkpointId", checkpointId) + .add("manifestPath", manifest.path()) + .toString(); + LOG.warn( + "The iceberg transaction has been committed, but we failed to clean the temporary flink manifests: {}", + details, + e); } } } - private void replacePartitions(NavigableMap pendingResults, String newFlinkJobId, - long checkpointId) { + private void replacePartitions( + NavigableMap pendingResults, String newFlinkJobId, long checkpointId) { // Partition overwrite does not support delete files. 
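commitUpToCheckpoint above skips the Iceberg commit for empty checkpoints until flink.max-continuous-empty-commits is reached (default 10, as read in initializeState). If a different cadence is needed, it is a plain table property; a hedged example, with an arbitrary value of 50:

```java
// Hedged example; the key comes from MAX_COMMITTED_CHECKPOINT_ID's sibling constant
// MAX_CONTINUOUS_EMPTY_COMMITS above. Up to 49 consecutive empty checkpoints are then
// skipped before an empty commit is forced.
table.updateProperties()
    .set("flink.max-continuous-empty-commits", "50")
    .commit();
```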
- int deleteFilesNum = pendingResults.values().stream().mapToInt(r -> r.deleteFiles().length).sum(); + int deleteFilesNum = + pendingResults.values().stream().mapToInt(r -> r.deleteFiles().length).sum(); Preconditions.checkState(deleteFilesNum == 0, "Cannot overwrite partitions with delete files."); // Commit the overwrite transaction. @@ -249,17 +278,21 @@ private void replacePartitions(NavigableMap pendingResults, S int numFiles = 0; for (WriteResult result : pendingResults.values()) { - Preconditions.checkState(result.referencedDataFiles().length == 0, "Should have no referenced data files."); + Preconditions.checkState( + result.referencedDataFiles().length == 0, "Should have no referenced data files."); numFiles += result.dataFiles().length; Arrays.stream(result.dataFiles()).forEach(dynamicOverwrite::addFile); } - commitOperation(dynamicOverwrite, numFiles, 0, "dynamic partition overwrite", newFlinkJobId, checkpointId); + commitOperation( + dynamicOverwrite, numFiles, 0, "dynamic partition overwrite", newFlinkJobId, checkpointId); } - private void commitDeltaTxn(NavigableMap pendingResults, String newFlinkJobId, long checkpointId) { - int deleteFilesNum = pendingResults.values().stream().mapToInt(r -> r.deleteFiles().length).sum(); + private void commitDeltaTxn( + NavigableMap pendingResults, String newFlinkJobId, long checkpointId) { + int deleteFilesNum = + pendingResults.values().stream().mapToInt(r -> r.deleteFiles().length).sum(); if (deleteFilesNum == 0) { // To be compatible with iceberg format V1. @@ -267,7 +300,8 @@ private void commitDeltaTxn(NavigableMap pendingResults, Stri int numFiles = 0; for (WriteResult result : pendingResults.values()) { - Preconditions.checkState(result.referencedDataFiles().length == 0, "Should have no referenced data files."); + Preconditions.checkState( + result.referencedDataFiles().length == 0, "Should have no referenced data files."); numFiles += result.dataFiles().length; Arrays.stream(result.dataFiles()).forEach(appendFiles::appendFile); @@ -277,16 +311,23 @@ private void commitDeltaTxn(NavigableMap pendingResults, Stri } else { // To be compatible with iceberg format V2. for (Map.Entry e : pendingResults.entrySet()) { - // We don't commit the merged result into a single transaction because for the sequential transaction txn1 and - // txn2, the equality-delete files of txn2 are required to be applied to data files from txn1. Committing the + // We don't commit the merged result into a single transaction because for the sequential + // transaction txn1 and + // txn2, the equality-delete files of txn2 are required to be applied to data files from + // txn1. Committing the // merged one will lead to the incorrect delete semantic. WriteResult result = e.getValue(); - // Row delta validations are not needed for streaming changes that write equality deletes. Equality deletes - // are applied to data in all previous sequence numbers, so retries may push deletes further in the future, - // but do not affect correctness. Position deletes committed to the table in this path are used only to delete - // rows from data files that are being added in this commit. There is no way for data files added along with - // the delete files to be concurrently removed, so there is no need to validate the files referenced by the + // Row delta validations are not needed for streaming changes that write equality deletes. 
+ // Equality deletes + // are applied to data in all previous sequence numbers, so retries may push deletes further + // in the future, + // but do not affect correctness. Position deletes committed to the table in this path are + // used only to delete + // rows from data files that are being added in this commit. There is no way for data files + // added along with + // the delete files to be concurrently removed, so there is no need to validate the files + // referenced by the // position delete files that are being committed. RowDelta rowDelta = table.newRowDelta(); @@ -296,15 +337,25 @@ private void commitDeltaTxn(NavigableMap pendingResults, Stri int numDeleteFiles = result.deleteFiles().length; Arrays.stream(result.deleteFiles()).forEach(rowDelta::addDeletes); - commitOperation(rowDelta, numDataFiles, numDeleteFiles, "rowDelta", newFlinkJobId, e.getKey()); + commitOperation( + rowDelta, numDataFiles, numDeleteFiles, "rowDelta", newFlinkJobId, e.getKey()); } } } - private void commitOperation(SnapshotUpdate operation, int numDataFiles, int numDeleteFiles, String description, - String newFlinkJobId, long checkpointId) { - LOG.info("Committing {} with {} data files and {} delete files to table {}", description, numDataFiles, - numDeleteFiles, table); + private void commitOperation( + SnapshotUpdate operation, + int numDataFiles, + int numDeleteFiles, + String description, + String newFlinkJobId, + long checkpointId) { + LOG.info( + "Committing {} with {} data files and {} delete files to table {}", + description, + numDataFiles, + numDeleteFiles, + table); operation.set(MAX_COMMITTED_CHECKPOINT_ID, Long.toString(checkpointId)); operation.set(FLINK_JOB_ID, newFlinkJobId); @@ -330,7 +381,8 @@ public void endInput() throws IOException { } /** - * Write all the complete data files to a newly created manifest file and return the manifest's avro serialized bytes. + * Write all the complete data files to a newly created manifest file and return the manifest's + * avro serialized bytes. */ private byte[] writeToManifest(long checkpointId) throws IOException { if (writeResultsOfCurrentCkpt.isEmpty()) { @@ -338,10 +390,12 @@ private byte[] writeToManifest(long checkpointId) throws IOException { } WriteResult result = WriteResult.builder().addAll(writeResultsOfCurrentCkpt).build(); - DeltaManifests deltaManifests = FlinkManifestUtil.writeCompletedFiles(result, - () -> manifestOutputFileFactory.create(checkpointId), table.spec()); + DeltaManifests deltaManifests = + FlinkManifestUtil.writeCompletedFiles( + result, () -> manifestOutputFileFactory.create(checkpointId), table.spec()); - return SimpleVersionedSerialization.writeVersionAndSerialize(DeltaManifestsSerializer.INSTANCE, deltaManifests); + return SimpleVersionedSerialization.writeVersionAndSerialize( + DeltaManifestsSerializer.INSTANCE, deltaManifests); } @Override @@ -354,9 +408,11 @@ public void dispose() throws Exception { private static ListStateDescriptor> buildStateDescriptor() { Comparator longComparator = Comparators.forType(Types.LongType.get()); // Construct a SortedMapTypeInfo. 
- SortedMapTypeInfo sortedMapTypeInfo = new SortedMapTypeInfo<>( - BasicTypeInfo.LONG_TYPE_INFO, PrimitiveArrayTypeInfo.BYTE_PRIMITIVE_ARRAY_TYPE_INFO, longComparator - ); + SortedMapTypeInfo sortedMapTypeInfo = + new SortedMapTypeInfo<>( + BasicTypeInfo.LONG_TYPE_INFO, + PrimitiveArrayTypeInfo.BYTE_PRIMITIVE_ARRAY_TYPE_INFO, + longComparator); return new ListStateDescriptor<>("iceberg-files-committer-state", sortedMapTypeInfo); } diff --git a/flink/v1.13/flink/src/main/java/org/apache/iceberg/flink/sink/IcebergStreamWriter.java b/flink/v1.13/flink/src/main/java/org/apache/iceberg/flink/sink/IcebergStreamWriter.java index 6d12310dd1dc..34905c30cc2a 100644 --- a/flink/v1.13/flink/src/main/java/org/apache/iceberg/flink/sink/IcebergStreamWriter.java +++ b/flink/v1.13/flink/src/main/java/org/apache/iceberg/flink/sink/IcebergStreamWriter.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.flink.sink; import java.io.IOException; @@ -83,7 +82,8 @@ public void dispose() throws Exception { @Override public void endInput() throws IOException { - // For bounded stream, it may don't enable the checkpoint mechanism so we'd better to emit the remaining + // For bounded stream, it may don't enable the checkpoint mechanism so we'd better to emit the + // remaining // completed files to downstream before closing the writer so that we won't miss any of them. emit(writer.complete()); } diff --git a/flink/v1.13/flink/src/main/java/org/apache/iceberg/flink/sink/ManifestOutputFileFactory.java b/flink/v1.13/flink/src/main/java/org/apache/iceberg/flink/sink/ManifestOutputFileFactory.java index fca86080b11a..a5f5adef7dad 100644 --- a/flink/v1.13/flink/src/main/java/org/apache/iceberg/flink/sink/ManifestOutputFileFactory.java +++ b/flink/v1.13/flink/src/main/java/org/apache/iceberg/flink/sink/ManifestOutputFileFactory.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.flink.sink; import java.util.Map; @@ -28,7 +27,8 @@ import org.apache.iceberg.relocated.com.google.common.base.Strings; class ManifestOutputFileFactory { - // Users could define their own flink manifests directory by setting this value in table properties. + // Users could define their own flink manifests directory by setting this value in table + // properties. 
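As that comment says, the temporary Flink manifests default to the table's metadata location but can be redirected through a table property. A hedged example; the object-store path is a placeholder:

```java
// Hedged example; "flink.manifests.location" is the FLINK_MANIFEST_LOCATION key defined below.
table.updateProperties()
    .set("flink.manifests.location", "s3://my-bucket/warehouse/db/tbl/flink-manifests")
    .commit();
```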
static final String FLINK_MANIFEST_LOCATION = "flink.manifests.location"; private final TableOperations ops; @@ -39,8 +39,13 @@ class ManifestOutputFileFactory { private final long attemptNumber; private final AtomicInteger fileCount = new AtomicInteger(0); - ManifestOutputFileFactory(TableOperations ops, FileIO io, Map props, - String flinkJobId, int subTaskId, long attemptNumber) { + ManifestOutputFileFactory( + TableOperations ops, + FileIO io, + Map props, + String flinkJobId, + int subTaskId, + long attemptNumber) { this.ops = ops; this.io = io; this.props = props; @@ -50,8 +55,10 @@ class ManifestOutputFileFactory { } private String generatePath(long checkpointId) { - return FileFormat.AVRO.addExtension(String.format("%s-%05d-%d-%d-%05d", flinkJobId, subTaskId, - attemptNumber, checkpointId, fileCount.incrementAndGet())); + return FileFormat.AVRO.addExtension( + String.format( + "%s-%05d-%d-%d-%05d", + flinkJobId, subTaskId, attemptNumber, checkpointId, fileCount.incrementAndGet())); } OutputFile create(long checkpointId) { @@ -62,7 +69,8 @@ OutputFile create(long checkpointId) { // User don't specify any flink manifest directory, so just use the default metadata path. newManifestFullPath = ops.metadataFileLocation(generatePath(checkpointId)); } else { - newManifestFullPath = String.format("%s/%s", stripTrailingSlash(flinkManifestDir), generatePath(checkpointId)); + newManifestFullPath = + String.format("%s/%s", stripTrailingSlash(flinkManifestDir), generatePath(checkpointId)); } return io.newOutputFile(newManifestFullPath); diff --git a/flink/v1.13/flink/src/main/java/org/apache/iceberg/flink/sink/PartitionKeySelector.java b/flink/v1.13/flink/src/main/java/org/apache/iceberg/flink/sink/PartitionKeySelector.java index 598df09eee83..df951684b446 100644 --- a/flink/v1.13/flink/src/main/java/org/apache/iceberg/flink/sink/PartitionKeySelector.java +++ b/flink/v1.13/flink/src/main/java/org/apache/iceberg/flink/sink/PartitionKeySelector.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.flink.sink; import org.apache.flink.api.java.functions.KeySelector; @@ -28,8 +27,9 @@ import org.apache.iceberg.flink.RowDataWrapper; /** - * Create a {@link KeySelector} to shuffle by partition key, then each partition/bucket will be wrote by only one - * task. That will reduce lots of small files in partitioned fanout write policy for {@link FlinkSink}. + * Create a {@link KeySelector} to shuffle by partition key, then each partition/bucket will be + * wrote by only one task. That will reduce lots of small files in partitioned fanout write policy + * for {@link FlinkSink}. */ class PartitionKeySelector implements KeySelector { @@ -46,8 +46,8 @@ class PartitionKeySelector implements KeySelector { } /** - * Construct the {@link RowDataWrapper} lazily here because few members in it are not serializable. In this way, we - * don't have to serialize them with forcing. + * Construct the {@link RowDataWrapper} lazily here because few members in it are not + * serializable. In this way, we don't have to serialize them with forcing. 
*/ private RowDataWrapper lazyRowDataWrapper() { if (rowDataWrapper == null) { diff --git a/flink/v1.13/flink/src/main/java/org/apache/iceberg/flink/sink/PartitionedDeltaWriter.java b/flink/v1.13/flink/src/main/java/org/apache/iceberg/flink/sink/PartitionedDeltaWriter.java index 1eee6298e933..38062dd1a2c4 100644 --- a/flink/v1.13/flink/src/main/java/org/apache/iceberg/flink/sink/PartitionedDeltaWriter.java +++ b/flink/v1.13/flink/src/main/java/org/apache/iceberg/flink/sink/PartitionedDeltaWriter.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.flink.sink; import java.io.IOException; @@ -41,17 +40,27 @@ class PartitionedDeltaWriter extends BaseDeltaTaskWriter { private final Map writers = Maps.newHashMap(); - PartitionedDeltaWriter(PartitionSpec spec, - FileFormat format, - FileAppenderFactory appenderFactory, - OutputFileFactory fileFactory, - FileIO io, - long targetFileSize, - Schema schema, - RowType flinkSchema, - List equalityFieldIds, - boolean upsert) { - super(spec, format, appenderFactory, fileFactory, io, targetFileSize, schema, flinkSchema, equalityFieldIds, + PartitionedDeltaWriter( + PartitionSpec spec, + FileFormat format, + FileAppenderFactory appenderFactory, + OutputFileFactory fileFactory, + FileIO io, + long targetFileSize, + Schema schema, + RowType flinkSchema, + List equalityFieldIds, + boolean upsert) { + super( + spec, + format, + appenderFactory, + fileFactory, + io, + targetFileSize, + schema, + flinkSchema, + equalityFieldIds, upsert); this.partitionKey = new PartitionKey(spec, schema); } @@ -62,7 +71,8 @@ RowDataDeltaWriter route(RowData row) { RowDataDeltaWriter writer = writers.get(partitionKey); if (writer == null) { - // NOTICE: we need to copy a new partition key here, in case of messing up the keys in writers. + // NOTICE: we need to copy a new partition key here, in case of messing up the keys in + // writers. PartitionKey copiedKey = partitionKey.copy(); writer = new RowDataDeltaWriter(copiedKey); writers.put(copiedKey, writer); diff --git a/flink/v1.13/flink/src/main/java/org/apache/iceberg/flink/sink/RowDataTaskWriterFactory.java b/flink/v1.13/flink/src/main/java/org/apache/iceberg/flink/sink/RowDataTaskWriterFactory.java index f6ee976e637f..1c330434d019 100644 --- a/flink/v1.13/flink/src/main/java/org/apache/iceberg/flink/sink/RowDataTaskWriterFactory.java +++ b/flink/v1.13/flink/src/main/java/org/apache/iceberg/flink/sink/RowDataTaskWriterFactory.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.flink.sink; import java.util.List; @@ -53,12 +52,13 @@ public class RowDataTaskWriterFactory implements TaskWriterFactory { private transient OutputFileFactory outputFileFactory; - public RowDataTaskWriterFactory(Table table, - RowType flinkSchema, - long targetFileSizeBytes, - FileFormat format, - List equalityFieldIds, - boolean upsert) { + public RowDataTaskWriterFactory( + Table table, + RowType flinkSchema, + long targetFileSizeBytes, + FileFormat format, + List equalityFieldIds, + boolean upsert) { this.table = table; this.schema = table.schema(); this.flinkSchema = flinkSchema; @@ -70,47 +70,90 @@ public RowDataTaskWriterFactory(Table table, this.upsert = upsert; if (equalityFieldIds == null || equalityFieldIds.isEmpty()) { - this.appenderFactory = new FlinkAppenderFactory(schema, flinkSchema, table.properties(), spec); + this.appenderFactory = + new FlinkAppenderFactory(schema, flinkSchema, table.properties(), spec); } else if (upsert) { - // In upsert mode, only the new row is emitted using INSERT row kind. Therefore, any column of the inserted row - // may differ from the deleted row other than the primary key fields, and the delete file must contain values + // In upsert mode, only the new row is emitted using INSERT row kind. Therefore, any column of + // the inserted row + // may differ from the deleted row other than the primary key fields, and the delete file must + // contain values // that are correct for the deleted row. Therefore, only write the equality delete fields. - this.appenderFactory = new FlinkAppenderFactory(schema, flinkSchema, table.properties(), spec, - ArrayUtil.toIntArray(equalityFieldIds), TypeUtil.select(schema, Sets.newHashSet(equalityFieldIds)), null); + this.appenderFactory = + new FlinkAppenderFactory( + schema, + flinkSchema, + table.properties(), + spec, + ArrayUtil.toIntArray(equalityFieldIds), + TypeUtil.select(schema, Sets.newHashSet(equalityFieldIds)), + null); } else { - this.appenderFactory = new FlinkAppenderFactory(schema, flinkSchema, table.properties(), spec, - ArrayUtil.toIntArray(equalityFieldIds), schema, null); + this.appenderFactory = + new FlinkAppenderFactory( + schema, + flinkSchema, + table.properties(), + spec, + ArrayUtil.toIntArray(equalityFieldIds), + schema, + null); } } @Override public void initialize(int taskId, int attemptId) { - this.outputFileFactory = OutputFileFactory.builderFor(table, taskId, attemptId) - .format(format) - .build(); + this.outputFileFactory = + OutputFileFactory.builderFor(table, taskId, attemptId).format(format).build(); } @Override public TaskWriter create() { - Preconditions.checkNotNull(outputFileFactory, + Preconditions.checkNotNull( + outputFileFactory, "The outputFileFactory shouldn't be null if we have invoked the initialize()."); if (equalityFieldIds == null || equalityFieldIds.isEmpty()) { // Initialize a task writer to write INSERT only. 
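Both create() branches below assume the same calling contract: construct the factory, call initialize(taskId, attemptId) once for the task (the precondition above enforces this), then create() a writer, feed it rows, and complete() it. A compile-oriented sketch of that sequence, assuming an already-loaded Iceberg Table; the file size and format choices here are illustrative:

import java.io.IOException;
import java.util.List;
import org.apache.flink.table.data.RowData;
import org.apache.flink.table.types.logical.RowType;
import org.apache.iceberg.FileFormat;
import org.apache.iceberg.Table;
import org.apache.iceberg.flink.FlinkSchemaUtil;
import org.apache.iceberg.flink.sink.RowDataTaskWriterFactory;
import org.apache.iceberg.io.TaskWriter;
import org.apache.iceberg.io.WriteResult;

class TaskWriterUsageSketch {
  // Writes the given rows with an INSERT-only writer: no equality field ids, upsert disabled.
  // Passing equality field ids (plus upsert=true) would select the delta writers created below.
  static WriteResult writeRows(Table table, List<RowData> rows, int taskId, int attemptId)
      throws IOException {
    RowType flinkSchema = FlinkSchemaUtil.convert(table.schema());
    RowDataTaskWriterFactory factory =
        new RowDataTaskWriterFactory(
            table, flinkSchema, 128L * 1024 * 1024, FileFormat.PARQUET, null, false);
    factory.initialize(taskId, attemptId); // required before create(), see the check above

    TaskWriter<RowData> writer = factory.create();
    try {
      for (RowData row : rows) {
        writer.write(row);
      }
    } catch (IOException e) {
      writer.abort(); // clean up any files written so far
      throw e;
    }
    return writer.complete(); // closes the writer and returns the files it produced
  }
}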
if (spec.isUnpartitioned()) { - return new UnpartitionedWriter<>(spec, format, appenderFactory, outputFileFactory, io, targetFileSizeBytes); + return new UnpartitionedWriter<>( + spec, format, appenderFactory, outputFileFactory, io, targetFileSizeBytes); } else { - return new RowDataPartitionedFanoutWriter(spec, format, appenderFactory, outputFileFactory, - io, targetFileSizeBytes, schema, flinkSchema); + return new RowDataPartitionedFanoutWriter( + spec, + format, + appenderFactory, + outputFileFactory, + io, + targetFileSizeBytes, + schema, + flinkSchema); } } else { // Initialize a task writer to write both INSERT and equality DELETE. if (spec.isUnpartitioned()) { - return new UnpartitionedDeltaWriter(spec, format, appenderFactory, outputFileFactory, io, - targetFileSizeBytes, schema, flinkSchema, equalityFieldIds, upsert); + return new UnpartitionedDeltaWriter( + spec, + format, + appenderFactory, + outputFileFactory, + io, + targetFileSizeBytes, + schema, + flinkSchema, + equalityFieldIds, + upsert); } else { - return new PartitionedDeltaWriter(spec, format, appenderFactory, outputFileFactory, io, - targetFileSizeBytes, schema, flinkSchema, equalityFieldIds, upsert); + return new PartitionedDeltaWriter( + spec, + format, + appenderFactory, + outputFileFactory, + io, + targetFileSizeBytes, + schema, + flinkSchema, + equalityFieldIds, + upsert); } } } @@ -120,9 +163,15 @@ private static class RowDataPartitionedFanoutWriter extends PartitionedFanoutWri private final PartitionKey partitionKey; private final RowDataWrapper rowDataWrapper; - RowDataPartitionedFanoutWriter(PartitionSpec spec, FileFormat format, FileAppenderFactory appenderFactory, - OutputFileFactory fileFactory, FileIO io, long targetFileSize, Schema schema, - RowType flinkSchema) { + RowDataPartitionedFanoutWriter( + PartitionSpec spec, + FileFormat format, + FileAppenderFactory appenderFactory, + OutputFileFactory fileFactory, + FileIO io, + long targetFileSize, + Schema schema, + RowType flinkSchema) { super(spec, format, appenderFactory, fileFactory, io, targetFileSize); this.partitionKey = new PartitionKey(spec, schema); this.rowDataWrapper = new RowDataWrapper(flinkSchema, schema.asStruct()); diff --git a/flink/v1.13/flink/src/main/java/org/apache/iceberg/flink/sink/TaskWriterFactory.java b/flink/v1.13/flink/src/main/java/org/apache/iceberg/flink/sink/TaskWriterFactory.java index 9d56ec6a812a..e3a1245e8cbd 100644 --- a/flink/v1.13/flink/src/main/java/org/apache/iceberg/flink/sink/TaskWriterFactory.java +++ b/flink/v1.13/flink/src/main/java/org/apache/iceberg/flink/sink/TaskWriterFactory.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.flink.sink; import java.io.Serializable; @@ -32,7 +31,7 @@ public interface TaskWriterFactory extends Serializable { /** * Initialize the factory with a given taskId and attemptId. * - * @param taskId the identifier of task. + * @param taskId the identifier of task. * @param attemptId the attempt id of this task. 
*/ void initialize(int taskId, int attemptId); diff --git a/flink/v1.13/flink/src/main/java/org/apache/iceberg/flink/sink/UnpartitionedDeltaWriter.java b/flink/v1.13/flink/src/main/java/org/apache/iceberg/flink/sink/UnpartitionedDeltaWriter.java index 331ed7c78192..7680fb933b20 100644 --- a/flink/v1.13/flink/src/main/java/org/apache/iceberg/flink/sink/UnpartitionedDeltaWriter.java +++ b/flink/v1.13/flink/src/main/java/org/apache/iceberg/flink/sink/UnpartitionedDeltaWriter.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.flink.sink; import java.io.IOException; @@ -33,17 +32,27 @@ class UnpartitionedDeltaWriter extends BaseDeltaTaskWriter { private final RowDataDeltaWriter writer; - UnpartitionedDeltaWriter(PartitionSpec spec, - FileFormat format, - FileAppenderFactory appenderFactory, - OutputFileFactory fileFactory, - FileIO io, - long targetFileSize, - Schema schema, - RowType flinkSchema, - List equalityFieldIds, - boolean upsert) { - super(spec, format, appenderFactory, fileFactory, io, targetFileSize, schema, flinkSchema, equalityFieldIds, + UnpartitionedDeltaWriter( + PartitionSpec spec, + FileFormat format, + FileAppenderFactory appenderFactory, + OutputFileFactory fileFactory, + FileIO io, + long targetFileSize, + Schema schema, + RowType flinkSchema, + List equalityFieldIds, + boolean upsert) { + super( + spec, + format, + appenderFactory, + fileFactory, + io, + targetFileSize, + schema, + flinkSchema, + equalityFieldIds, upsert); this.writer = new RowDataDeltaWriter(null); } diff --git a/flink/v1.13/flink/src/main/java/org/apache/iceberg/flink/source/DataIterator.java b/flink/v1.13/flink/src/main/java/org/apache/iceberg/flink/source/DataIterator.java index d470b0752304..85c848b3d8ea 100644 --- a/flink/v1.13/flink/src/main/java/org/apache/iceberg/flink/source/DataIterator.java +++ b/flink/v1.13/flink/src/main/java/org/apache/iceberg/flink/source/DataIterator.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.flink.source; import java.io.IOException; @@ -44,8 +43,11 @@ public class DataIterator implements CloseableIterator { private Iterator tasks; private CloseableIterator currentIterator; - public DataIterator(FileScanTaskReader fileScanTaskReader, CombinedScanTask task, - FileIO io, EncryptionManager encryption) { + public DataIterator( + FileScanTaskReader fileScanTaskReader, + CombinedScanTask task, + FileIO io, + EncryptionManager encryption) { this.fileScanTaskReader = fileScanTaskReader; this.inputFilesDecryptor = new InputFilesDecryptor(task, io, encryption); @@ -65,10 +67,7 @@ public T next() { return currentIterator.next(); } - /** - * Updates the current iterator field to ensure that the current Iterator - * is not exhausted. - */ + /** Updates the current iterator field to ensure that the current Iterator is not exhausted. 
*/ private void updateCurrentIterator() { try { while (!currentIterator.hasNext() && tasks.hasNext()) { diff --git a/flink/v1.13/flink/src/main/java/org/apache/iceberg/flink/source/FileScanTaskReader.java b/flink/v1.13/flink/src/main/java/org/apache/iceberg/flink/source/FileScanTaskReader.java index 04273016ee2d..927a804a4792 100644 --- a/flink/v1.13/flink/src/main/java/org/apache/iceberg/flink/source/FileScanTaskReader.java +++ b/flink/v1.13/flink/src/main/java/org/apache/iceberg/flink/source/FileScanTaskReader.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.flink.source; import java.io.Serializable; diff --git a/flink/v1.13/flink/src/main/java/org/apache/iceberg/flink/source/FlinkInputFormat.java b/flink/v1.13/flink/src/main/java/org/apache/iceberg/flink/source/FlinkInputFormat.java index 8f544d7b1795..6f8d6e3461aa 100644 --- a/flink/v1.13/flink/src/main/java/org/apache/iceberg/flink/source/FlinkInputFormat.java +++ b/flink/v1.13/flink/src/main/java/org/apache/iceberg/flink/source/FlinkInputFormat.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.flink.source; import java.io.IOException; @@ -35,9 +34,7 @@ import org.apache.iceberg.io.FileIO; import org.apache.iceberg.relocated.com.google.common.annotations.VisibleForTesting; -/** - * Flink {@link InputFormat} for Iceberg. - */ +/** Flink {@link InputFormat} for Iceberg. */ public class FlinkInputFormat extends RichInputFormat { private static final long serialVersionUID = 1L; @@ -51,14 +48,19 @@ public class FlinkInputFormat extends RichInputFormat private transient DataIterator iterator; private transient long currentReadCount = 0L; - FlinkInputFormat(TableLoader tableLoader, Schema tableSchema, FileIO io, EncryptionManager encryption, - ScanContext context) { + FlinkInputFormat( + TableLoader tableLoader, + Schema tableSchema, + FileIO io, + EncryptionManager encryption, + ScanContext context) { this.tableLoader = tableLoader; this.io = io; this.encryption = encryption; this.context = context; - this.rowDataReader = new RowDataFileScanTaskReader(tableSchema, - context.project(), context.nameMapping(), context.caseSensitive()); + this.rowDataReader = + new RowDataFileScanTaskReader( + tableSchema, context.project(), context.nameMapping(), context.caseSensitive()); } @VisibleForTesting @@ -84,14 +86,13 @@ public FlinkInputSplit[] createInputSplits(int minNumSplits) throws IOException @Override public InputSplitAssigner getInputSplitAssigner(FlinkInputSplit[] inputSplits) { - return context.exposeLocality() ? - new LocatableInputSplitAssigner(inputSplits) : - new DefaultInputSplitAssigner(inputSplits); + return context.exposeLocality() + ? 
new LocatableInputSplitAssigner(inputSplits) + : new DefaultInputSplitAssigner(inputSplits); } @Override - public void configure(Configuration parameters) { - } + public void configure(Configuration parameters) {} @Override public void open(FlinkInputSplit split) { diff --git a/flink/v1.13/flink/src/main/java/org/apache/iceberg/flink/source/FlinkInputSplit.java b/flink/v1.13/flink/src/main/java/org/apache/iceberg/flink/source/FlinkInputSplit.java index 5bb85fe7162a..16fd4f39596c 100644 --- a/flink/v1.13/flink/src/main/java/org/apache/iceberg/flink/source/FlinkInputSplit.java +++ b/flink/v1.13/flink/src/main/java/org/apache/iceberg/flink/source/FlinkInputSplit.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.flink.source; import java.util.Arrays; diff --git a/flink/v1.13/flink/src/main/java/org/apache/iceberg/flink/source/FlinkSource.java b/flink/v1.13/flink/src/main/java/org/apache/iceberg/flink/source/FlinkSource.java index d1159ed90e52..7ad81a7e9e2b 100644 --- a/flink/v1.13/flink/src/main/java/org/apache/iceberg/flink/source/FlinkSource.java +++ b/flink/v1.13/flink/src/main/java/org/apache/iceberg/flink/source/FlinkSource.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.flink.source; import java.io.IOException; @@ -53,20 +52,21 @@ public class FlinkSource { private static final Logger LOG = LoggerFactory.getLogger(FlinkSource.class); - private FlinkSource() { - } + private FlinkSource() {} /** - * Initialize a {@link Builder} to read the data from iceberg table. Equivalent to {@link TableScan}. See more options - * in {@link ScanContext}. - *
<p> - * The Source can be read static data in bounded mode. It can also continuously check the arrival of new data and read - * records incrementally. + * Initialize a {@link Builder} to read the data from iceberg table. Equivalent to {@link + * TableScan}. See more options in {@link ScanContext}. + * + * <p>The Source can be read static data in bounded mode. It can also continuously check the + * arrival of new data and read records incrementally. + * * <ul> - *   <li>Without startSnapshotId: Bounded</li> - *   <li>With startSnapshotId and with endSnapshotId: Bounded</li> - *   <li>With startSnapshotId (-1 means unbounded preceding) and Without endSnapshotId: Unbounded</li> + *   <li>Without startSnapshotId: Bounded + *   <li>With startSnapshotId and with endSnapshotId: Bounded + *   <li>With startSnapshotId (-1 means unbounded preceding) and Without endSnapshotId: Unbounded * </ul> + * * <p>
* * @return {@link Builder} to connect the iceberg table. @@ -75,9 +75,7 @@ public static Builder forRowData() { return new Builder(); } - /** - * Source builder to build {@link DataStream}. - */ + /** Source builder to build {@link DataStream}. */ public static class Builder { private static final Set FILE_SYSTEM_SUPPORT_LOCALITY = ImmutableSet.of("hdfs"); @@ -224,7 +222,8 @@ public FlinkInputFormat buildFormat() { } contextBuilder.exposeLocality(localityEnabled()); - return new FlinkInputFormat(tableLoader, icebergSchema, io, encryption, contextBuilder.build()); + return new FlinkInputFormat( + tableLoader, icebergSchema, io, encryption, contextBuilder.build()); } public DataStream build() { @@ -232,7 +231,8 @@ public DataStream build() { FlinkInputFormat format = buildFormat(); ScanContext context = contextBuilder.build(); - TypeInformation typeInfo = FlinkCompatibilityUtil.toTypeInfo(FlinkSchemaUtil.convert(context.project())); + TypeInformation typeInfo = + FlinkCompatibilityUtil.toTypeInfo(FlinkSchemaUtil.convert(context.project())); if (!context.isStreaming()) { int parallelism = inferParallelism(format, context); @@ -252,25 +252,30 @@ public DataStream build() { } int inferParallelism(FlinkInputFormat format, ScanContext context) { - int parallelism = readableConfig.get(ExecutionConfigOptions.TABLE_EXEC_RESOURCE_DEFAULT_PARALLELISM); + int parallelism = + readableConfig.get(ExecutionConfigOptions.TABLE_EXEC_RESOURCE_DEFAULT_PARALLELISM); if (readableConfig.get(FlinkConfigOptions.TABLE_EXEC_ICEBERG_INFER_SOURCE_PARALLELISM)) { - int maxInferParallelism = readableConfig.get(FlinkConfigOptions - .TABLE_EXEC_ICEBERG_INFER_SOURCE_PARALLELISM_MAX); - Preconditions.checkState(maxInferParallelism >= 1, - FlinkConfigOptions.TABLE_EXEC_ICEBERG_INFER_SOURCE_PARALLELISM_MAX.key() + " cannot be less than 1"); + int maxInferParallelism = + readableConfig.get(FlinkConfigOptions.TABLE_EXEC_ICEBERG_INFER_SOURCE_PARALLELISM_MAX); + Preconditions.checkState( + maxInferParallelism >= 1, + FlinkConfigOptions.TABLE_EXEC_ICEBERG_INFER_SOURCE_PARALLELISM_MAX.key() + + " cannot be less than 1"); int splitNum; try { FlinkInputSplit[] splits = format.createInputSplits(0); splitNum = splits.length; } catch (IOException e) { - throw new UncheckedIOException("Failed to create iceberg input splits for table: " + table, e); + throw new UncheckedIOException( + "Failed to create iceberg input splits for table: " + table, e); } parallelism = Math.min(splitNum, maxInferParallelism); } if (context.limit() > 0) { - int limit = context.limit() >= Integer.MAX_VALUE ? Integer.MAX_VALUE : (int) context.limit(); + int limit = + context.limit() >= Integer.MAX_VALUE ? Integer.MAX_VALUE : (int) context.limit(); parallelism = Math.min(parallelism, limit); } @@ -281,8 +286,10 @@ int inferParallelism(FlinkInputFormat format, ScanContext context) { private boolean localityEnabled() { Boolean localityEnabled = - this.exposeLocality != null ? this.exposeLocality : - readableConfig.get(FlinkConfigOptions.TABLE_EXEC_ICEBERG_EXPOSE_SPLIT_LOCALITY_INFO); + this.exposeLocality != null + ? 
this.exposeLocality + : readableConfig.get( + FlinkConfigOptions.TABLE_EXEC_ICEBERG_EXPOSE_SPLIT_LOCALITY_INFO); if (localityEnabled != null && !localityEnabled) { return false; @@ -292,10 +299,14 @@ private boolean localityEnabled() { if (fileIO instanceof HadoopFileIO) { HadoopFileIO hadoopFileIO = (HadoopFileIO) fileIO; try { - String scheme = new Path(table.location()).getFileSystem(hadoopFileIO.getConf()).getScheme(); + String scheme = + new Path(table.location()).getFileSystem(hadoopFileIO.getConf()).getScheme(); return FILE_SYSTEM_SUPPORT_LOCALITY.contains(scheme); } catch (IOException e) { - LOG.warn("Failed to determine whether the locality information can be exposed for table: {}", table, e); + LOG.warn( + "Failed to determine whether the locality information can be exposed for table: {}", + table, + e); } } diff --git a/flink/v1.13/flink/src/main/java/org/apache/iceberg/flink/source/FlinkSplitGenerator.java b/flink/v1.13/flink/src/main/java/org/apache/iceberg/flink/source/FlinkSplitGenerator.java index f332e85f36c2..2473d167ff68 100644 --- a/flink/v1.13/flink/src/main/java/org/apache/iceberg/flink/source/FlinkSplitGenerator.java +++ b/flink/v1.13/flink/src/main/java/org/apache/iceberg/flink/source/FlinkSplitGenerator.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.flink.source; import java.io.IOException; @@ -34,8 +33,7 @@ import org.apache.iceberg.util.ThreadPools; class FlinkSplitGenerator { - private FlinkSplitGenerator() { - } + private FlinkSplitGenerator() {} static FlinkInputSplit[] createInputSplits(Table table, ScanContext context) { List tasks = tasks(table, context); @@ -45,22 +43,21 @@ static FlinkInputSplit[] createInputSplits(Table table, ScanContext context) { Tasks.range(tasks.size()) .stopOnFailure() .executeWith(exposeLocality ? 
ThreadPools.getWorkerPool() : null) - .run(index -> { - CombinedScanTask task = tasks.get(index); - String[] hostnames = null; - if (exposeLocality) { - hostnames = Util.blockLocations(table.io(), task); - } - splits[index] = new FlinkInputSplit(index, task, hostnames); - }); + .run( + index -> { + CombinedScanTask task = tasks.get(index); + String[] hostnames = null; + if (exposeLocality) { + hostnames = Util.blockLocations(table.io(), task); + } + splits[index] = new FlinkInputSplit(index, task, hostnames); + }); return splits; } private static List tasks(Table table, ScanContext context) { - TableScan scan = table - .newScan() - .caseSensitive(context.caseSensitive()) - .project(context.project()); + TableScan scan = + table.newScan().caseSensitive(context.caseSensitive()).project(context.project()); if (context.snapshotId() != null) { scan = scan.useSnapshot(context.snapshotId()); @@ -87,7 +84,8 @@ private static List tasks(Table table, ScanContext context) { } if (context.splitOpenFileCost() != null) { - scan = scan.option(TableProperties.SPLIT_OPEN_FILE_COST, context.splitOpenFileCost().toString()); + scan = + scan.option(TableProperties.SPLIT_OPEN_FILE_COST, context.splitOpenFileCost().toString()); } if (context.filters() != null) { diff --git a/flink/v1.13/flink/src/main/java/org/apache/iceberg/flink/source/RowDataFileScanTaskReader.java b/flink/v1.13/flink/src/main/java/org/apache/iceberg/flink/source/RowDataFileScanTaskReader.java index b71f2b0fafe5..5fada27d5471 100644 --- a/flink/v1.13/flink/src/main/java/org/apache/iceberg/flink/source/RowDataFileScanTaskReader.java +++ b/flink/v1.13/flink/src/main/java/org/apache/iceberg/flink/source/RowDataFileScanTaskReader.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.flink.source; import java.util.Map; @@ -56,8 +55,8 @@ public class RowDataFileScanTaskReader implements FileScanTaskReader { private final String nameMapping; private final boolean caseSensitive; - public RowDataFileScanTaskReader(Schema tableSchema, Schema projectedSchema, - String nameMapping, boolean caseSensitive) { + public RowDataFileScanTaskReader( + Schema tableSchema, Schema projectedSchema, String nameMapping, boolean caseSensitive) { this.tableSchema = tableSchema; this.projectedSchema = projectedSchema; this.nameMapping = nameMapping; @@ -65,21 +64,28 @@ public RowDataFileScanTaskReader(Schema tableSchema, Schema projectedSchema, } @Override - public CloseableIterator open(FileScanTask task, InputFilesDecryptor inputFilesDecryptor) { + public CloseableIterator open( + FileScanTask task, InputFilesDecryptor inputFilesDecryptor) { Schema partitionSchema = TypeUtil.select(projectedSchema, task.spec().identitySourceIds()); - Map idToConstant = partitionSchema.columns().isEmpty() ? ImmutableMap.of() : - PartitionUtil.constantsMap(task, RowDataUtil::convertConstant); + Map idToConstant = + partitionSchema.columns().isEmpty() + ? 
ImmutableMap.of() + : PartitionUtil.constantsMap(task, RowDataUtil::convertConstant); - FlinkDeleteFilter deletes = new FlinkDeleteFilter(task, tableSchema, projectedSchema, inputFilesDecryptor); - CloseableIterable iterable = deletes.filter( - newIterable(task, deletes.requiredSchema(), idToConstant, inputFilesDecryptor) - ); + FlinkDeleteFilter deletes = + new FlinkDeleteFilter(task, tableSchema, projectedSchema, inputFilesDecryptor); + CloseableIterable iterable = + deletes.filter( + newIterable(task, deletes.requiredSchema(), idToConstant, inputFilesDecryptor)); // Project the RowData to remove the extra meta columns. if (!projectedSchema.sameSchema(deletes.requiredSchema())) { - RowDataProjection rowDataProjection = RowDataProjection.create( - deletes.requiredRowType(), deletes.requiredSchema().asStruct(), projectedSchema.asStruct()); + RowDataProjection rowDataProjection = + RowDataProjection.create( + deletes.requiredRowType(), + deletes.requiredSchema().asStruct(), + projectedSchema.asStruct()); iterable = CloseableIterable.transform(iterable, rowDataProjection::wrap); } @@ -87,7 +93,10 @@ public CloseableIterator open(FileScanTask task, InputFilesDecryptor in } private CloseableIterable newIterable( - FileScanTask task, Schema schema, Map idToConstant, InputFilesDecryptor inputFilesDecryptor) { + FileScanTask task, + Schema schema, + Map idToConstant, + InputFilesDecryptor inputFilesDecryptor) { CloseableIterable iter; if (task.isDataTask()) { throw new UnsupportedOperationException("Cannot read data task."); @@ -115,12 +124,16 @@ private CloseableIterable newIterable( } private CloseableIterable newAvroIterable( - FileScanTask task, Schema schema, Map idToConstant, InputFilesDecryptor inputFilesDecryptor) { - Avro.ReadBuilder builder = Avro.read(inputFilesDecryptor.getInputFile(task)) - .reuseContainers() - .project(schema) - .split(task.start(), task.length()) - .createReaderFunc(readSchema -> new FlinkAvroReader(schema, readSchema, idToConstant)); + FileScanTask task, + Schema schema, + Map idToConstant, + InputFilesDecryptor inputFilesDecryptor) { + Avro.ReadBuilder builder = + Avro.read(inputFilesDecryptor.getInputFile(task)) + .reuseContainers() + .project(schema) + .split(task.start(), task.length()) + .createReaderFunc(readSchema -> new FlinkAvroReader(schema, readSchema, idToConstant)); if (nameMapping != null) { builder.withNameMapping(NameMappingParser.fromJson(nameMapping)); @@ -130,14 +143,19 @@ private CloseableIterable newAvroIterable( } private CloseableIterable newParquetIterable( - FileScanTask task, Schema schema, Map idToConstant, InputFilesDecryptor inputFilesDecryptor) { - Parquet.ReadBuilder builder = Parquet.read(inputFilesDecryptor.getInputFile(task)) - .split(task.start(), task.length()) - .project(schema) - .createReaderFunc(fileSchema -> FlinkParquetReaders.buildReader(schema, fileSchema, idToConstant)) - .filter(task.residual()) - .caseSensitive(caseSensitive) - .reuseContainers(); + FileScanTask task, + Schema schema, + Map idToConstant, + InputFilesDecryptor inputFilesDecryptor) { + Parquet.ReadBuilder builder = + Parquet.read(inputFilesDecryptor.getInputFile(task)) + .split(task.start(), task.length()) + .project(schema) + .createReaderFunc( + fileSchema -> FlinkParquetReaders.buildReader(schema, fileSchema, idToConstant)) + .filter(task.residual()) + .caseSensitive(caseSensitive) + .reuseContainers(); if (nameMapping != null) { builder.withNameMapping(NameMappingParser.fromJson(nameMapping)); @@ -147,16 +165,22 @@ private CloseableIterable 
newParquetIterable( } private CloseableIterable newOrcIterable( - FileScanTask task, Schema schema, Map idToConstant, InputFilesDecryptor inputFilesDecryptor) { - Schema readSchemaWithoutConstantAndMetadataFields = TypeUtil.selectNot(schema, - Sets.union(idToConstant.keySet(), MetadataColumns.metadataFieldIds())); - - ORC.ReadBuilder builder = ORC.read(inputFilesDecryptor.getInputFile(task)) - .project(readSchemaWithoutConstantAndMetadataFields) - .split(task.start(), task.length()) - .createReaderFunc(readOrcSchema -> new FlinkOrcReader(schema, readOrcSchema, idToConstant)) - .filter(task.residual()) - .caseSensitive(caseSensitive); + FileScanTask task, + Schema schema, + Map idToConstant, + InputFilesDecryptor inputFilesDecryptor) { + Schema readSchemaWithoutConstantAndMetadataFields = + TypeUtil.selectNot( + schema, Sets.union(idToConstant.keySet(), MetadataColumns.metadataFieldIds())); + + ORC.ReadBuilder builder = + ORC.read(inputFilesDecryptor.getInputFile(task)) + .project(readSchemaWithoutConstantAndMetadataFields) + .split(task.start(), task.length()) + .createReaderFunc( + readOrcSchema -> new FlinkOrcReader(schema, readOrcSchema, idToConstant)) + .filter(task.residual()) + .caseSensitive(caseSensitive); if (nameMapping != null) { builder.withNameMapping(NameMappingParser.fromJson(nameMapping)); @@ -170,8 +194,11 @@ private static class FlinkDeleteFilter extends DeleteFilter { private final RowDataWrapper asStructLike; private final InputFilesDecryptor inputFilesDecryptor; - FlinkDeleteFilter(FileScanTask task, Schema tableSchema, Schema requestedSchema, - InputFilesDecryptor inputFilesDecryptor) { + FlinkDeleteFilter( + FileScanTask task, + Schema tableSchema, + Schema requestedSchema, + InputFilesDecryptor inputFilesDecryptor) { super(task.file().path().toString(), task.deletes(), tableSchema, requestedSchema); this.requiredRowType = FlinkSchemaUtil.convert(requiredSchema()); this.asStructLike = new RowDataWrapper(requiredRowType, requiredSchema().asStruct()); diff --git a/flink/v1.13/flink/src/main/java/org/apache/iceberg/flink/source/RowDataRewriter.java b/flink/v1.13/flink/src/main/java/org/apache/iceberg/flink/source/RowDataRewriter.java index 5e8837c5d47b..a721755c276f 100644 --- a/flink/v1.13/flink/src/main/java/org/apache/iceberg/flink/source/RowDataRewriter.java +++ b/flink/v1.13/flink/src/main/java/org/apache/iceberg/flink/source/RowDataRewriter.java @@ -16,9 +16,10 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.flink.source; +import static org.apache.iceberg.TableProperties.DEFAULT_NAME_MAPPING; + import java.util.Collection; import java.util.List; import java.util.Locale; @@ -46,8 +47,6 @@ import org.slf4j.Logger; import org.slf4j.LoggerFactory; -import static org.apache.iceberg.TableProperties.DEFAULT_NAME_MAPPING; - public class RowDataRewriter { private static final Logger LOG = LoggerFactory.getLogger(RowDataRewriter.class); @@ -60,31 +59,36 @@ public class RowDataRewriter { private final TaskWriterFactory taskWriterFactory; private final String tableName; - public RowDataRewriter(Table table, boolean caseSensitive, FileIO io, EncryptionManager encryptionManager) { + public RowDataRewriter( + Table table, boolean caseSensitive, FileIO io, EncryptionManager encryptionManager) { this.schema = table.schema(); this.caseSensitive = caseSensitive; this.io = io; this.encryptionManager = encryptionManager; - this.nameMapping = PropertyUtil.propertyAsString(table.properties(), DEFAULT_NAME_MAPPING, null); + this.nameMapping = + PropertyUtil.propertyAsString(table.properties(), DEFAULT_NAME_MAPPING, null); this.tableName = table.name(); - String formatString = PropertyUtil.propertyAsString(table.properties(), TableProperties.DEFAULT_FILE_FORMAT, - TableProperties.DEFAULT_FILE_FORMAT_DEFAULT); + String formatString = + PropertyUtil.propertyAsString( + table.properties(), + TableProperties.DEFAULT_FILE_FORMAT, + TableProperties.DEFAULT_FILE_FORMAT_DEFAULT); FileFormat format = FileFormat.valueOf(formatString.toUpperCase(Locale.ENGLISH)); RowType flinkSchema = FlinkSchemaUtil.convert(table.schema()); - this.taskWriterFactory = new RowDataTaskWriterFactory( - SerializableTable.copyOf(table), - flinkSchema, - Long.MAX_VALUE, - format, - null, - false); + this.taskWriterFactory = + new RowDataTaskWriterFactory( + SerializableTable.copyOf(table), flinkSchema, Long.MAX_VALUE, format, null, false); } - public List rewriteDataForTasks(DataStream dataStream, int parallelism) throws Exception { - RewriteMap map = new RewriteMap(schema, nameMapping, io, caseSensitive, encryptionManager, taskWriterFactory); + public List rewriteDataForTasks( + DataStream dataStream, int parallelism) throws Exception { + RewriteMap map = + new RewriteMap( + schema, nameMapping, io, caseSensitive, encryptionManager, taskWriterFactory); DataStream> ds = dataStream.map(map).setParallelism(parallelism); - return Lists.newArrayList(ds.executeAndCollect("Rewrite table :" + tableName)).stream().flatMap(Collection::stream) + return Lists.newArrayList(ds.executeAndCollect("Rewrite table :" + tableName)).stream() + .flatMap(Collection::stream) .collect(Collectors.toList()); } @@ -102,15 +106,21 @@ public static class RewriteMap extends RichMapFunction taskWriterFactory; private final RowDataFileScanTaskReader rowDataReader; - public RewriteMap(Schema schema, String nameMapping, FileIO io, boolean caseSensitive, - EncryptionManager encryptionManager, TaskWriterFactory taskWriterFactory) { + public RewriteMap( + Schema schema, + String nameMapping, + FileIO io, + boolean caseSensitive, + EncryptionManager encryptionManager, + TaskWriterFactory taskWriterFactory) { this.schema = schema; this.nameMapping = nameMapping; this.io = io; this.caseSensitive = caseSensitive; this.encryptionManager = encryptionManager; this.taskWriterFactory = taskWriterFactory; - this.rowDataReader = new RowDataFileScanTaskReader(schema, schema, nameMapping, caseSensitive); + this.rowDataReader = + new RowDataFileScanTaskReader(schema, 
schema, nameMapping, caseSensitive); } @Override @@ -126,7 +136,7 @@ public List map(CombinedScanTask task) throws Exception { // Initialize the task writer. this.writer = taskWriterFactory.create(); try (DataIterator iterator = - new DataIterator<>(rowDataReader, task, io, encryptionManager)) { + new DataIterator<>(rowDataReader, task, io, encryptionManager)) { while (iterator.hasNext()) { RowData rowData = iterator.next(); writer.write(rowData); diff --git a/flink/v1.13/flink/src/main/java/org/apache/iceberg/flink/source/ScanContext.java b/flink/v1.13/flink/src/main/java/org/apache/iceberg/flink/source/ScanContext.java index ee49a517fd59..b78d0e643aa9 100644 --- a/flink/v1.13/flink/src/main/java/org/apache/iceberg/flink/source/ScanContext.java +++ b/flink/v1.13/flink/src/main/java/org/apache/iceberg/flink/source/ScanContext.java @@ -16,9 +16,10 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.flink.source; +import static org.apache.iceberg.TableProperties.DEFAULT_NAME_MAPPING; + import java.io.Serializable; import java.time.Duration; import java.util.List; @@ -29,11 +30,7 @@ import org.apache.iceberg.Schema; import org.apache.iceberg.expressions.Expression; -import static org.apache.iceberg.TableProperties.DEFAULT_NAME_MAPPING; - -/** - * Context object with optional arguments for a Flink Scan. - */ +/** Context object with optional arguments for a Flink Scan. */ class ScanContext implements Serializable { private static final long serialVersionUID = 1L; @@ -89,11 +86,23 @@ class ScanContext implements Serializable { private final List filters; private final long limit; - private ScanContext(boolean caseSensitive, Long snapshotId, Long startSnapshotId, Long endSnapshotId, - Long asOfTimestamp, Long splitSize, Integer splitLookback, Long splitOpenFileCost, - boolean isStreaming, Duration monitorInterval, String nameMapping, - Schema schema, List filters, long limit, boolean exposeLocality, - int maxPlanningSnapshotCount) { + private ScanContext( + boolean caseSensitive, + Long snapshotId, + Long startSnapshotId, + Long endSnapshotId, + Long asOfTimestamp, + Long splitSize, + Integer splitLookback, + Long splitOpenFileCost, + boolean isStreaming, + Duration monitorInterval, + String nameMapping, + Schema schema, + List filters, + long limit, + boolean exposeLocality, + int maxPlanningSnapshotCount) { this.caseSensitive = caseSensitive; this.snapshotId = snapshotId; this.startSnapshotId = startSnapshotId; @@ -241,8 +250,7 @@ static class Builder { private long limit = -1L; private boolean exposeLocality; - private Builder() { - } + private Builder() {} Builder caseSensitive(boolean newCaseSensitive) { this.caseSensitive = newCaseSensitive; @@ -343,10 +351,23 @@ Builder fromProperties(Map properties) { } public ScanContext build() { - return new ScanContext(caseSensitive, snapshotId, startSnapshotId, - endSnapshotId, asOfTimestamp, splitSize, splitLookback, - splitOpenFileCost, isStreaming, monitorInterval, nameMapping, projectedSchema, - filters, limit, exposeLocality, maxPlanningSnapshotCount); + return new ScanContext( + caseSensitive, + snapshotId, + startSnapshotId, + endSnapshotId, + asOfTimestamp, + splitSize, + splitLookback, + splitOpenFileCost, + isStreaming, + monitorInterval, + nameMapping, + projectedSchema, + filters, + limit, + exposeLocality, + maxPlanningSnapshotCount); } } } diff --git a/flink/v1.13/flink/src/main/java/org/apache/iceberg/flink/source/StreamingMonitorFunction.java 
b/flink/v1.13/flink/src/main/java/org/apache/iceberg/flink/source/StreamingMonitorFunction.java index e16f115a94f6..59eb1366d136 100644 --- a/flink/v1.13/flink/src/main/java/org/apache/iceberg/flink/source/StreamingMonitorFunction.java +++ b/flink/v1.13/flink/src/main/java/org/apache/iceberg/flink/source/StreamingMonitorFunction.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.flink.source; import java.io.IOException; @@ -39,19 +38,20 @@ import org.slf4j.LoggerFactory; /** - * This is the single (non-parallel) monitoring task which takes a {@link FlinkInputFormat}, - * it is responsible for: + * This is the single (non-parallel) monitoring task which takes a {@link FlinkInputFormat}, it is + * responsible for: * *
<ol> - *   <li>Monitoring snapshots of the Iceberg table.</li> - *   <li>Creating the {@link FlinkInputSplit splits} corresponding to the incremental files</li> - *   <li>Assigning them to downstream tasks for further processing.</li> + *   <li>Monitoring snapshots of the Iceberg table. + *   <li>Creating the {@link FlinkInputSplit splits} corresponding to the incremental files + *   <li>Assigning them to downstream tasks for further processing. * </ol> * - * <p>The splits to be read are forwarded to the downstream {@link StreamingReaderOperator} - * which can have parallelism greater than one. + * <p>
The splits to be read are forwarded to the downstream {@link StreamingReaderOperator} which + * can have parallelism greater than one. */ -public class StreamingMonitorFunction extends RichSourceFunction implements CheckpointedFunction { +public class StreamingMonitorFunction extends RichSourceFunction + implements CheckpointedFunction { private static final Logger LOG = LoggerFactory.getLogger(StreamingMonitorFunction.class); @@ -62,7 +62,8 @@ public class StreamingMonitorFunction extends RichSourceFunction lastSnapshotIdState; public StreamingMonitorFunction(TableLoader tableLoader, ScanContext scanContext) { - Preconditions.checkArgument(scanContext.snapshotId() == null, - "Cannot set snapshot-id option for streaming reader"); - Preconditions.checkArgument(scanContext.asOfTimestamp() == null, + Preconditions.checkArgument( + scanContext.snapshotId() == null, "Cannot set snapshot-id option for streaming reader"); + Preconditions.checkArgument( + scanContext.asOfTimestamp() == null, "Cannot set as-of-timestamp option for streaming reader"); - Preconditions.checkArgument(scanContext.endSnapshotId() == null, + Preconditions.checkArgument( + scanContext.endSnapshotId() == null, "Cannot set end-snapshot-id option for streaming reader"); - Preconditions.checkArgument(scanContext.maxPlanningSnapshotCount() > 0, + Preconditions.checkArgument( + scanContext.maxPlanningSnapshotCount() > 0, "The max-planning-snapshot-count must be greater than zero"); this.tableLoader = tableLoader; this.scanContext = scanContext; @@ -90,21 +94,24 @@ public void initializeState(FunctionInitializationContext context) throws Except table = tableLoader.loadTable(); // Initialize the flink state for last snapshot id. - lastSnapshotIdState = context.getOperatorStateStore().getListState( - new ListStateDescriptor<>( - "snapshot-id-state", - LongSerializer.INSTANCE)); + lastSnapshotIdState = + context + .getOperatorStateStore() + .getListState(new ListStateDescriptor<>("snapshot-id-state", LongSerializer.INSTANCE)); // Restore the last-snapshot-id from flink's state if possible. 
if (context.isRestored()) { LOG.info("Restoring state for the {}.", getClass().getSimpleName()); lastSnapshotId = lastSnapshotIdState.get().iterator().next(); } else if (scanContext.startSnapshotId() != null) { - Preconditions.checkNotNull(table.currentSnapshot(), "Don't have any available snapshot in table."); + Preconditions.checkNotNull( + table.currentSnapshot(), "Don't have any available snapshot in table."); long currentSnapshotId = table.currentSnapshot().snapshotId(); - Preconditions.checkState(SnapshotUtil.isAncestorOf(table, currentSnapshotId, scanContext.startSnapshotId()), - "The option start-snapshot-id %s is not an ancestor of the current snapshot.", scanContext.startSnapshotId()); + Preconditions.checkState( + SnapshotUtil.isAncestorOf(table, currentSnapshotId, scanContext.startSnapshotId()), + "The option start-snapshot-id %s is not an ancestor of the current snapshot.", + scanContext.startSnapshotId()); lastSnapshotId = scanContext.startSnapshotId(); } @@ -125,13 +132,15 @@ public void run(SourceContext ctx) throws Exception { } } - private long toSnapshotIdInclusive(long lastConsumedSnapshotId, long currentSnapshotId, - int maxPlanningSnapshotCount) { - List snapshotIds = SnapshotUtil.snapshotIdsBetween(table, lastConsumedSnapshotId, currentSnapshotId); + private long toSnapshotIdInclusive( + long lastConsumedSnapshotId, long currentSnapshotId, int maxPlanningSnapshotCount) { + List snapshotIds = + SnapshotUtil.snapshotIdsBetween(table, lastConsumedSnapshotId, currentSnapshotId); if (snapshotIds.size() <= maxPlanningSnapshotCount) { return currentSnapshotId; } else { - // It uses reverted index since snapshotIdsBetween returns Ids that are ordered by committed time descending. + // It uses reverted index since snapshotIdsBetween returns Ids that are ordered by committed + // time descending. 
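// Worked example with invented ids: if snapshotIdsBetween returns [S9, S8, S7, S6, S5]
// (newest first) and max-planning-snapshot-count is 3, then size (5) > 3, so the line below
// returns snapshotIds.get(5 - 3) = S7. This planning cycle therefore covers only the three
// oldest pending snapshots (S5, S6, S7); S8 and S9 are picked up by later monitor cycles.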
return snapshotIds.get(snapshotIds.size() - maxPlanningSnapshotCount); } } @@ -154,14 +163,22 @@ void monitorAndForwardSplits() { if (lastSnapshotId == INIT_LAST_SNAPSHOT_ID) { newScanContext = scanContext.copyWithSnapshotId(snapshotId); } else { - snapshotId = toSnapshotIdInclusive(lastSnapshotId, snapshotId, scanContext.maxPlanningSnapshotCount()); + snapshotId = + toSnapshotIdInclusive( + lastSnapshotId, snapshotId, scanContext.maxPlanningSnapshotCount()); newScanContext = scanContext.copyWithAppendsBetween(lastSnapshotId, snapshotId); } - LOG.debug("Start discovering splits from {} (exclusive) to {} (inclusive)", lastSnapshotId, snapshotId); + LOG.debug( + "Start discovering splits from {} (exclusive) to {} (inclusive)", + lastSnapshotId, + snapshotId); long start = System.currentTimeMillis(); FlinkInputSplit[] splits = FlinkSplitGenerator.createInputSplits(table, newScanContext); - LOG.debug("Discovered {} splits, time elapsed {}ms", splits.length, System.currentTimeMillis() - start); + LOG.debug( + "Discovered {} splits, time elapsed {}ms", + splits.length, + System.currentTimeMillis() - start); // only need to hold the checkpoint lock when emitting the splits and updating lastSnapshotId start = System.currentTimeMillis(); @@ -172,7 +189,10 @@ void monitorAndForwardSplits() { lastSnapshotId = snapshotId; } - LOG.debug("Forwarded {} splits, time elapsed {}ms", splits.length, System.currentTimeMillis() - start); + LOG.debug( + "Forwarded {} splits, time elapsed {}ms", + splits.length, + System.currentTimeMillis() - start); } } diff --git a/flink/v1.13/flink/src/main/java/org/apache/iceberg/flink/source/StreamingReaderOperator.java b/flink/v1.13/flink/src/main/java/org/apache/iceberg/flink/source/StreamingReaderOperator.java index 235b17332f5d..6dcf6c3e3f98 100644 --- a/flink/v1.13/flink/src/main/java/org/apache/iceberg/flink/source/StreamingReaderOperator.java +++ b/flink/v1.13/flink/src/main/java/org/apache/iceberg/flink/source/StreamingReaderOperator.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.flink.source; import java.io.IOException; @@ -47,20 +46,23 @@ /** * The operator that reads the {@link FlinkInputSplit splits} received from the preceding {@link - * StreamingMonitorFunction}. Contrary to the {@link StreamingMonitorFunction} which has a parallelism of 1, - * this operator can have multiple parallelism. + * StreamingMonitorFunction}. Contrary to the {@link StreamingMonitorFunction} which has a + * parallelism of 1, this operator can have multiple parallelism. * - *
<p>As soon as a split descriptor is received, it is put in a queue, and use {@link MailboxExecutor} - * read the actual data of the split. This architecture allows the separation of the reading thread from the one split - * processing the checkpoint barriers, thus removing any potential back-pressure. + * <p>
As soon as a split descriptor is received, it is put in a queue, and use {@link + * MailboxExecutor} read the actual data of the split. This architecture allows the separation of + * the reading thread from the one split processing the checkpoint barriers, thus removing any + * potential back-pressure. */ public class StreamingReaderOperator extends AbstractStreamOperator implements OneInputStreamOperator { private static final Logger LOG = LoggerFactory.getLogger(StreamingReaderOperator.class); - // It's the same thread that is running this operator and checkpoint actions. we use this executor to schedule only - // one split for future reading, so that a new checkpoint could be triggered without blocking long time for exhausting + // It's the same thread that is running this operator and checkpoint actions. we use this executor + // to schedule only + // one split for future reading, so that a new checkpoint could be triggered without blocking long + // time for exhausting // all scheduled splits. private final MailboxExecutor executor; private FlinkInputFormat format; @@ -70,17 +72,21 @@ public class StreamingReaderOperator extends AbstractStreamOperator private transient ListState inputSplitsState; private transient Queue splits; - // Splits are read by the same thread that calls processElement. Each read task is submitted to that thread by adding - // them to the executor. This state is used to ensure that only one read task is in that queue at a time, so that read - // tasks do not accumulate ahead of checkpoint tasks. When there is a read task in the queue, this is set to RUNNING. + // Splits are read by the same thread that calls processElement. Each read task is submitted to + // that thread by adding + // them to the executor. This state is used to ensure that only one read task is in that queue at + // a time, so that read + // tasks do not accumulate ahead of checkpoint tasks. When there is a read task in the queue, this + // is set to RUNNING. // When there are no more files to read, this will be set to IDLE. private transient SplitState currentSplitState; - private StreamingReaderOperator(FlinkInputFormat format, ProcessingTimeService timeService, - MailboxExecutor mailboxExecutor) { + private StreamingReaderOperator( + FlinkInputFormat format, ProcessingTimeService timeService, MailboxExecutor mailboxExecutor) { this.format = Preconditions.checkNotNull(format, "The InputFormat should not be null."); this.processingTimeService = timeService; - this.executor = Preconditions.checkNotNull(mailboxExecutor, "The mailboxExecutor should not be null."); + this.executor = + Preconditions.checkNotNull(mailboxExecutor, "The mailboxExecutor should not be null."); } @Override @@ -89,8 +95,10 @@ public void initializeState(StateInitializationContext context) throws Exception // TODO Replace Java serialization with Avro approach to keep state compatibility. // See issue: https://github.com/apache/iceberg/issues/1698 - inputSplitsState = context.getOperatorStateStore().getListState( - new ListStateDescriptor<>("splits", new JavaSerializer<>())); + inputSplitsState = + context + .getOperatorStateStore() + .getListState(new ListStateDescriptor<>("splits", new JavaSerializer<>())); // Initialize the current split state to IDLE. 
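A toy model of the scheduling idea described in the comments above, with a plain single-threaded executor standing in for Flink's MailboxExecutor; all names are invented, and this sketches the pattern rather than the operator itself. At most one read task is ever pending, and the boolean flag plays the role of the SplitState field initialized below.

import java.util.ArrayDeque;
import java.util.Queue;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;

class OneSplitAtATimeSketch {
  private final ExecutorService mailbox = Executors.newSingleThreadExecutor();
  private final Queue<String> splits = new ArrayDeque<>();
  private boolean running = false; // stands in for SplitState.RUNNING vs SplitState.IDLE

  // Called when a new split descriptor arrives (the analogue of processElement).
  synchronized void onSplit(String split) {
    splits.add(split);
    enqueueProcessSplits();
  }

  // Schedules at most one pending "read one split" task, so checkpoint-like work submitted to
  // the same single thread is never queued behind a long backlog of reads.
  private synchronized void enqueueProcessSplits() {
    if (!running && !splits.isEmpty()) {
      running = true;
      mailbox.execute(this::processOneSplit);
    }
  }

  private void processOneSplit() {
    String split;
    synchronized (this) {
      split = splits.poll();
    }
    if (split != null) {
      System.out.println("reading " + split); // stand-in for opening the split and emitting records
    }
    synchronized (this) {
      running = false;
    }
    enqueueProcessSplits(); // schedule the next split, one mailbox task at a time
    // (a real operator would also shut the executor down when it is closed)
  }
}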
currentSplitState = SplitState.IDLE; @@ -106,14 +114,15 @@ public void initializeState(StateInitializationContext context) throws Exception } } - this.sourceContext = StreamSourceContexts.getSourceContext( - getOperatorConfig().getTimeCharacteristic(), - getProcessingTimeService(), - new Object(), // no actual locking needed - getContainingTask().getStreamStatusMaintainer(), - output, - getRuntimeContext().getExecutionConfig().getAutoWatermarkInterval(), - -1); + this.sourceContext = + StreamSourceContexts.getSourceContext( + getOperatorConfig().getTimeCharacteristic(), + getProcessingTimeService(), + new Object(), // no actual locking needed + getContainingTask().getStreamStatusMaintainer(), + output, + getRuntimeContext().getExecutionConfig().getAutoWatermarkInterval(), + -1); // Enqueue to process the recovered input splits. enqueueProcessSplits(); @@ -197,11 +206,13 @@ static OneInputStreamOperatorFactory factory(FlinkInpu } private enum SplitState { - IDLE, RUNNING + IDLE, + RUNNING } private static class OperatorFactory extends AbstractStreamOperatorFactory - implements YieldingOperatorFactory, OneInputStreamOperatorFactory { + implements YieldingOperatorFactory, + OneInputStreamOperatorFactory { private final FlinkInputFormat format; @@ -218,9 +229,12 @@ public void setMailboxExecutor(MailboxExecutor mailboxExecutor) { @SuppressWarnings("unchecked") @Override - public > O createStreamOperator(StreamOperatorParameters parameters) { - StreamingReaderOperator operator = new StreamingReaderOperator(format, processingTimeService, mailboxExecutor); - operator.setup(parameters.getContainingTask(), parameters.getStreamConfig(), parameters.getOutput()); + public > O createStreamOperator( + StreamOperatorParameters parameters) { + StreamingReaderOperator operator = + new StreamingReaderOperator(format, processingTimeService, mailboxExecutor); + operator.setup( + parameters.getContainingTask(), parameters.getStreamConfig(), parameters.getOutput()); return (O) operator; } diff --git a/flink/v1.13/flink/src/main/java/org/apache/iceberg/flink/util/FlinkCompatibilityUtil.java b/flink/v1.13/flink/src/main/java/org/apache/iceberg/flink/util/FlinkCompatibilityUtil.java index 274d2a8d17a0..2c5c587f4ebf 100644 --- a/flink/v1.13/flink/src/main/java/org/apache/iceberg/flink/util/FlinkCompatibilityUtil.java +++ b/flink/v1.13/flink/src/main/java/org/apache/iceberg/flink/util/FlinkCompatibilityUtil.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.flink.util; import org.apache.flink.api.common.typeinfo.TypeInformation; @@ -26,14 +25,12 @@ import org.apache.flink.table.types.logical.RowType; /** - * This is a small util class that try to hide calls to Flink - * Internal or PublicEvolve interfaces as Flink can change - * those APIs during minor version release. + * This is a small util class that try to hide calls to Flink Internal or PublicEvolve interfaces as + * Flink can change those APIs during minor version release. 
*/ public class FlinkCompatibilityUtil { - private FlinkCompatibilityUtil() { - } + private FlinkCompatibilityUtil() {} public static TypeInformation toTypeInfo(RowType rowType) { return InternalTypeInfo.of(rowType); diff --git a/flink/v1.13/flink/src/test/java/org/apache/iceberg/flink/FlinkCatalogTestBase.java b/flink/v1.13/flink/src/test/java/org/apache/iceberg/flink/FlinkCatalogTestBase.java index 3c5f25e9d876..d4da736dcd83 100644 --- a/flink/v1.13/flink/src/test/java/org/apache/iceberg/flink/FlinkCatalogTestBase.java +++ b/flink/v1.13/flink/src/test/java/org/apache/iceberg/flink/FlinkCatalogTestBase.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.flink; import java.io.IOException; @@ -74,8 +73,7 @@ public static Iterable parameters() { return Lists.newArrayList( new Object[] {"testhive", Namespace.empty()}, new Object[] {"testhadoop", Namespace.empty()}, - new Object[] {"testhadoop_basenamespace", Namespace.of("l0", "l1")} - ); + new Object[] {"testhadoop_basenamespace", Namespace.of("l0", "l1")}); } protected final String catalogName; @@ -92,9 +90,10 @@ public FlinkCatalogTestBase(String catalogName, Namespace baseNamespace) { this.catalogName = catalogName; this.baseNamespace = baseNamespace; this.isHadoopCatalog = catalogName.startsWith("testhadoop"); - this.validationCatalog = isHadoopCatalog ? - new HadoopCatalog(hiveConf, "file:" + hadoopWarehouse.getRoot()) : - catalog; + this.validationCatalog = + isHadoopCatalog + ? new HadoopCatalog(hiveConf, "file:" + hadoopWarehouse.getRoot()) + : catalog; this.validationNamespaceCatalog = (SupportsNamespaces) validationCatalog; config.put("type", "iceberg"); @@ -110,7 +109,8 @@ public FlinkCatalogTestBase(String catalogName, Namespace baseNamespace) { config.put(CatalogProperties.WAREHOUSE_LOCATION, String.format("file://%s", warehouseRoot())); this.flinkDatabase = catalogName + "." + DATABASE; - this.icebergNamespace = Namespace.of(ArrayUtils.concat(baseNamespace.levels(), new String[] {DATABASE})); + this.icebergNamespace = + Namespace.of(ArrayUtils.concat(baseNamespace.levels(), new String[] {DATABASE})); } protected String warehouseRoot() { @@ -139,8 +139,14 @@ static String toWithClause(Map props) { if (propCount > 0) { builder.append(","); } - builder.append("'").append(entry.getKey()).append("'").append("=") - .append("'").append(entry.getValue()).append("'"); + builder + .append("'") + .append(entry.getKey()) + .append("'") + .append("=") + .append("'") + .append(entry.getValue()) + .append("'"); propCount++; } builder.append(")"); diff --git a/flink/v1.13/flink/src/test/java/org/apache/iceberg/flink/FlinkTestBase.java b/flink/v1.13/flink/src/test/java/org/apache/iceberg/flink/FlinkTestBase.java index 8f5e50802c87..3b9f6268eb22 100644 --- a/flink/v1.13/flink/src/test/java/org/apache/iceberg/flink/FlinkTestBase.java +++ b/flink/v1.13/flink/src/test/java/org/apache/iceberg/flink/FlinkTestBase.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.flink; import java.util.List; @@ -45,8 +44,7 @@ public abstract class FlinkTestBase extends TestBaseUtils { public static MiniClusterWithClientResource miniClusterResource = MiniClusterResource.createWithClassloaderCheckDisabled(); - @ClassRule - public static final TemporaryFolder TEMPORARY_FOLDER = new TemporaryFolder(); + @ClassRule public static final TemporaryFolder TEMPORARY_FOLDER = new TemporaryFolder(); private static TestHiveMetastore metastore = null; protected static HiveConf hiveConf = null; @@ -59,8 +57,10 @@ public static void startMetastore() { FlinkTestBase.metastore = new TestHiveMetastore(); metastore.start(); FlinkTestBase.hiveConf = metastore.hiveConf(); - FlinkTestBase.catalog = (HiveCatalog) - CatalogUtil.loadCatalog(HiveCatalog.class.getName(), "hive", ImmutableMap.of(), hiveConf); + FlinkTestBase.catalog = + (HiveCatalog) + CatalogUtil.loadCatalog( + HiveCatalog.class.getName(), "hive", ImmutableMap.of(), hiveConf); } @AfterClass @@ -73,14 +73,13 @@ protected TableEnvironment getTableEnv() { if (tEnv == null) { synchronized (this) { if (tEnv == null) { - EnvironmentSettings settings = EnvironmentSettings - .newInstance() - .useBlinkPlanner() - .inBatchMode() - .build(); + EnvironmentSettings settings = + EnvironmentSettings.newInstance().useBlinkPlanner().inBatchMode().build(); TableEnvironment env = TableEnvironment.create(settings); - env.getConfig().getConfiguration().set(FlinkConfigOptions.TABLE_EXEC_ICEBERG_INFER_SOURCE_PARALLELISM, false); + env.getConfig() + .getConfiguration() + .set(FlinkConfigOptions.TABLE_EXEC_ICEBERG_INFER_SOURCE_PARALLELISM, false); tEnv = env; } } @@ -106,9 +105,7 @@ protected List sql(String query, Object... args) { } protected void assertSameElements(Iterable expected, Iterable actual) { - Assertions.assertThat(actual) - .isNotNull() - .containsExactlyInAnyOrderElementsOf(expected); + Assertions.assertThat(actual).isNotNull().containsExactlyInAnyOrderElementsOf(expected); } protected void assertSameElements(String message, Iterable expected, Iterable actual) { diff --git a/flink/v1.13/flink/src/test/java/org/apache/iceberg/flink/MiniClusterResource.java b/flink/v1.13/flink/src/test/java/org/apache/iceberg/flink/MiniClusterResource.java index 9dfa1acf2719..45af9241b743 100644 --- a/flink/v1.13/flink/src/test/java/org/apache/iceberg/flink/MiniClusterResource.java +++ b/flink/v1.13/flink/src/test/java/org/apache/iceberg/flink/MiniClusterResource.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.flink; import org.apache.flink.configuration.Configuration; @@ -29,20 +28,18 @@ public class MiniClusterResource { private static final int DEFAULT_TM_NUM = 1; private static final int DEFAULT_PARALLELISM = 4; - public static final Configuration DISABLE_CLASSLOADER_CHECK_CONFIG = new Configuration() - // disable classloader check as Avro may cache class/object in the serializers. - .set(CoreOptions.CHECK_LEAKED_CLASSLOADER, false); - - private MiniClusterResource() { + public static final Configuration DISABLE_CLASSLOADER_CHECK_CONFIG = + new Configuration() + // disable classloader check as Avro may cache class/object in the serializers. + .set(CoreOptions.CHECK_LEAKED_CLASSLOADER, false); - } + private MiniClusterResource() {} /** - * It will start a mini cluster with classloader.check-leaked-classloader=false, - * so that we won't break the unit tests because of the class loader leak issue. 
- * In our iceberg integration tests, there're some that will assert the results - * after finished the flink jobs, so actually we may access the class loader - * that has been closed by the flink task managers if we enable the switch + * It will start a mini cluster with classloader.check-leaked-classloader=false, so that we won't + * break the unit tests because of the class loader leak issue. In our iceberg integration tests, + * there're some that will assert the results after finished the flink jobs, so actually we may + * access the class loader that has been closed by the flink task managers if we enable the switch * classloader.check-leaked-classloader by default. */ public static MiniClusterWithClientResource createWithClassloaderCheckDisabled() { @@ -53,5 +50,4 @@ public static MiniClusterWithClientResource createWithClassloaderCheckDisabled() .setConfiguration(DISABLE_CLASSLOADER_CHECK_CONFIG) .build()); } - } diff --git a/flink/v1.13/flink/src/test/java/org/apache/iceberg/flink/RowDataConverter.java b/flink/v1.13/flink/src/test/java/org/apache/iceberg/flink/RowDataConverter.java index 59306d638ee2..c73fa1e4bc97 100644 --- a/flink/v1.13/flink/src/test/java/org/apache/iceberg/flink/RowDataConverter.java +++ b/flink/v1.13/flink/src/test/java/org/apache/iceberg/flink/RowDataConverter.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.flink; import java.math.BigDecimal; @@ -50,8 +49,7 @@ public class RowDataConverter { private static final OffsetDateTime EPOCH = Instant.ofEpochSecond(0).atOffset(ZoneOffset.UTC); private static final LocalDate EPOCH_DAY = EPOCH.toLocalDate(); - private RowDataConverter() { - } + private RowDataConverter() {} public static RowData convert(Schema iSchema, Record record) { return convert(iSchema.asStruct(), record); @@ -117,11 +115,14 @@ private static Object convert(Type type, Object object) { return bb.array(); case BINARY: ByteBuffer buffer = (ByteBuffer) object; - return Arrays.copyOfRange(buffer.array(), buffer.arrayOffset() + buffer.position(), + return Arrays.copyOfRange( + buffer.array(), + buffer.arrayOffset() + buffer.position(), buffer.arrayOffset() + buffer.remaining()); case DECIMAL: Types.DecimalType decimalType = (Types.DecimalType) type; - return DecimalData.fromBigDecimal((BigDecimal) object, decimalType.precision(), decimalType.scale()); + return DecimalData.fromBigDecimal( + (BigDecimal) object, decimalType.precision(), decimalType.scale()); case STRUCT: return convert(type.asStructType(), (Record) object); case LIST: @@ -137,8 +138,7 @@ private static Object convert(Type type, Object object) { for (Map.Entry entry : map.entrySet()) { convertedMap.put( convert(type.asMapType().keyType(), entry.getKey()), - convert(type.asMapType().valueType(), entry.getValue()) - ); + convert(type.asMapType().valueType(), entry.getValue())); } return new GenericMapData(convertedMap); default: diff --git a/flink/v1.13/flink/src/test/java/org/apache/iceberg/flink/SimpleDataUtil.java b/flink/v1.13/flink/src/test/java/org/apache/iceberg/flink/SimpleDataUtil.java index 38b1cf36038e..2202e4328c65 100644 --- a/flink/v1.13/flink/src/test/java/org/apache/iceberg/flink/SimpleDataUtil.java +++ b/flink/v1.13/flink/src/test/java/org/apache/iceberg/flink/SimpleDataUtil.java @@ -16,9 +16,10 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.flink; +import static org.apache.iceberg.hadoop.HadoopOutputFile.fromPath; + import java.io.IOException; import java.util.Collections; import java.util.List; @@ -67,28 +68,24 @@ import org.apache.iceberg.util.StructLikeWrapper; import org.junit.Assert; -import static org.apache.iceberg.hadoop.HadoopOutputFile.fromPath; - public class SimpleDataUtil { - private SimpleDataUtil() { - } + private SimpleDataUtil() {} - public static final Schema SCHEMA = new Schema( - Types.NestedField.optional(1, "id", Types.IntegerType.get()), - Types.NestedField.optional(2, "data", Types.StringType.get()) - ); + public static final Schema SCHEMA = + new Schema( + Types.NestedField.optional(1, "id", Types.IntegerType.get()), + Types.NestedField.optional(2, "data", Types.StringType.get())); - public static final TableSchema FLINK_SCHEMA = TableSchema.builder() - .field("id", DataTypes.INT()) - .field("data", DataTypes.STRING()) - .build(); + public static final TableSchema FLINK_SCHEMA = + TableSchema.builder().field("id", DataTypes.INT()).field("data", DataTypes.STRING()).build(); public static final RowType ROW_TYPE = (RowType) FLINK_SCHEMA.toRowDataType().getLogicalType(); public static final Record RECORD = GenericRecord.create(SCHEMA); - public static Table createTable(String path, Map properties, boolean partitioned) { + public static Table createTable( + String path, Map properties, boolean partitioned) { PartitionSpec spec; if (partitioned) { spec = PartitionSpec.builderFor(SCHEMA).identity("data").build(); @@ -125,8 +122,13 @@ public static RowData createUpdateAfter(Integer id, String data) { return GenericRowData.ofKind(RowKind.UPDATE_AFTER, id, StringData.fromString(data)); } - public static DataFile writeFile(Schema schema, PartitionSpec spec, Configuration conf, - String location, String filename, List rows) + public static DataFile writeFile( + Schema schema, + PartitionSpec spec, + Configuration conf, + String location, + String filename, + List rows) throws IOException { Path path = new Path(location, filename); FileFormat fileFormat = FileFormat.fromFileName(filename); @@ -147,27 +149,38 @@ public static DataFile writeFile(Schema schema, PartitionSpec spec, Configuratio .build(); } - public static DeleteFile writeEqDeleteFile(Table table, FileFormat format, String tablePath, String filename, - FileAppenderFactory appenderFactory, - List deletes) throws IOException { + public static DeleteFile writeEqDeleteFile( + Table table, + FileFormat format, + String tablePath, + String filename, + FileAppenderFactory appenderFactory, + List deletes) + throws IOException { EncryptedOutputFile outputFile = table.encryption().encrypt(fromPath(new Path(tablePath, filename), new Configuration())); - EqualityDeleteWriter eqWriter = appenderFactory.newEqDeleteWriter(outputFile, format, null); + EqualityDeleteWriter eqWriter = + appenderFactory.newEqDeleteWriter(outputFile, format, null); try (EqualityDeleteWriter writer = eqWriter) { writer.deleteAll(deletes); } return eqWriter.toDeleteFile(); } - public static DeleteFile writePosDeleteFile(Table table, FileFormat format, String tablePath, - String filename, - FileAppenderFactory appenderFactory, - List> positions) throws IOException { + public static DeleteFile writePosDeleteFile( + Table table, + FileFormat format, + String tablePath, + String filename, + FileAppenderFactory appenderFactory, + List> positions) + throws IOException { EncryptedOutputFile outputFile = table.encryption().encrypt(fromPath(new Path(tablePath, 
filename), new Configuration())); - PositionDeleteWriter posWriter = appenderFactory.newPosDeleteWriter(outputFile, format, null); + PositionDeleteWriter posWriter = + appenderFactory.newPosDeleteWriter(outputFile, format, null); try (PositionDeleteWriter writer = posWriter) { for (Pair p : positions) { writer.delete(p.first(), p.second()); @@ -212,7 +225,8 @@ public static void assertTableRecords(Table table, List expected) throws } } - public static void assertTableRecords(String tablePath, List expected) throws IOException { + public static void assertTableRecords(String tablePath, List expected) + throws IOException { Preconditions.checkArgument(expected != null, "expected records shouldn't be null"); assertTableRecords(new HadoopTables().load(tablePath), expected); } @@ -227,14 +241,15 @@ public static StructLikeSet actualRowSet(Table table, String... columns) throws return actualRowSet(table, null, columns); } - public static StructLikeSet actualRowSet(Table table, Long snapshotId, String... columns) throws IOException { + public static StructLikeSet actualRowSet(Table table, Long snapshotId, String... columns) + throws IOException { table.refresh(); StructLikeSet set = StructLikeSet.create(table.schema().asStruct()); - try (CloseableIterable reader = IcebergGenerics - .read(table) - .useSnapshot(snapshotId == null ? table.currentSnapshot().snapshotId() : snapshotId) - .select(columns) - .build()) { + try (CloseableIterable reader = + IcebergGenerics.read(table) + .useSnapshot(snapshotId == null ? table.currentSnapshot().snapshotId() : snapshotId) + .select(columns) + .build()) { reader.forEach(set::add); } return set; @@ -246,16 +261,14 @@ public static List partitionDataFiles(Table table, Map Types.StructType partitionType = table.spec().partitionType(); Record partitionRecord = GenericRecord.create(partitionType).copy(partitionValues); - StructLikeWrapper expectedWrapper = StructLikeWrapper - .forType(partitionType) - .set(partitionRecord); + StructLikeWrapper expectedWrapper = + StructLikeWrapper.forType(partitionType).set(partitionRecord); List dataFiles = Lists.newArrayList(); try (CloseableIterable fileScanTasks = table.newScan().planFiles()) { for (FileScanTask scanTask : fileScanTasks) { - StructLikeWrapper wrapper = StructLikeWrapper - .forType(partitionType) - .set(scanTask.file().partition()); + StructLikeWrapper wrapper = + StructLikeWrapper.forType(partitionType).set(scanTask.file().partition()); if (expectedWrapper.equals(wrapper)) { dataFiles.add(scanTask.file()); @@ -281,7 +294,9 @@ public static Map> snapshotToDataFiles(Table table) throws tableScan = tableScan.useSnapshot(current.snapshotId()); } try (CloseableIterable scanTasks = tableScan.planFiles()) { - result.put(current.snapshotId(), ImmutableList.copyOf(Iterables.transform(scanTasks, FileScanTask::file))); + result.put( + current.snapshotId(), + ImmutableList.copyOf(Iterables.transform(scanTasks, FileScanTask::file))); } // Continue to traverse the parent snapshot if exists. 
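      // The actual parent-walk is an unchanged context line that this hunk does not show; a hedged
      // sketch of what it is assumed to do (Snapshot#parentId() and Table#snapshot(long) are the
      // assumed calls):
      //
      //   Long parentId = current.parentId();
      //   current = parentId == null ? null : table.snapshot(parentId);
      //
      // i.e. snapshotToDataFiles(table) is expected to return the data files of every ancestor
      // snapshot, keyed by snapshot id.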
@@ -298,13 +313,14 @@ public static List matchingPartitions( List dataFiles, PartitionSpec partitionSpec, Map partitionValues) { Types.StructType partitionType = partitionSpec.partitionType(); Record partitionRecord = GenericRecord.create(partitionType).copy(partitionValues); - StructLikeWrapper expected = StructLikeWrapper - .forType(partitionType) - .set(partitionRecord); - return dataFiles.stream().filter(df -> { - StructLikeWrapper wrapper = StructLikeWrapper.forType(partitionType).set(df.partition()); - return wrapper.equals(expected); - }).collect(Collectors.toList()); + StructLikeWrapper expected = StructLikeWrapper.forType(partitionType).set(partitionRecord); + return dataFiles.stream() + .filter( + df -> { + StructLikeWrapper wrapper = + StructLikeWrapper.forType(partitionType).set(df.partition()); + return wrapper.equals(expected); + }) + .collect(Collectors.toList()); } - } diff --git a/flink/v1.13/flink/src/test/java/org/apache/iceberg/flink/TestCatalogTableLoader.java b/flink/v1.13/flink/src/test/java/org/apache/iceberg/flink/TestCatalogTableLoader.java index f0c4197681d0..3b0fe69c5655 100644 --- a/flink/v1.13/flink/src/test/java/org/apache/iceberg/flink/TestCatalogTableLoader.java +++ b/flink/v1.13/flink/src/test/java/org/apache/iceberg/flink/TestCatalogTableLoader.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.flink; import java.io.ByteArrayInputStream; @@ -41,14 +40,13 @@ import org.junit.BeforeClass; import org.junit.Test; -/** - * Test for {@link CatalogLoader} and {@link TableLoader}. - */ +/** Test for {@link CatalogLoader} and {@link TableLoader}. */ public class TestCatalogTableLoader extends FlinkTestBase { private static File warehouse = null; private static final TableIdentifier IDENTIFIER = TableIdentifier.of("default", "my_table"); - private static final Schema SCHEMA = new Schema(Types.NestedField.required(1, "f1", Types.StringType.get())); + private static final Schema SCHEMA = + new Schema(Types.NestedField.required(1, "f1", Types.StringType.get())); @BeforeClass public static void createWarehouse() throws IOException { @@ -91,12 +89,14 @@ public void testHiveCatalogTableLoader() throws IOException, ClassNotFoundExcept validateTableLoader(TableLoader.fromCatalog(catalogLoader, IDENTIFIER)); } - private static void validateCatalogLoader(CatalogLoader loader) throws IOException, ClassNotFoundException { + private static void validateCatalogLoader(CatalogLoader loader) + throws IOException, ClassNotFoundException { Table table = javaSerAndDeSer(loader).loadCatalog().createTable(IDENTIFIER, SCHEMA); validateHadoopConf(table); } - private static void validateTableLoader(TableLoader loader) throws IOException, ClassNotFoundException { + private static void validateTableLoader(TableLoader loader) + throws IOException, ClassNotFoundException { TableLoader copied = javaSerAndDeSer(loader); copied.open(); try { @@ -108,7 +108,9 @@ private static void validateTableLoader(TableLoader loader) throws IOException, private static void validateHadoopConf(Table table) { FileIO io = table.io(); - Assertions.assertThat(io).as("FileIO should be a HadoopFileIO").isInstanceOf(HadoopFileIO.class); + Assertions.assertThat(io) + .as("FileIO should be a HadoopFileIO") + .isInstanceOf(HadoopFileIO.class); HadoopFileIO hadoopIO = (HadoopFileIO) io; Assert.assertEquals("my_value", hadoopIO.conf().get("my_key")); } @@ -120,7 +122,8 @@ private static T javaSerAndDeSer(T object) throws IOException, 
ClassNotFound out.writeObject(object); } - try (ObjectInputStream in = new ObjectInputStream(new ByteArrayInputStream(bytes.toByteArray()))) { + try (ObjectInputStream in = + new ObjectInputStream(new ByteArrayInputStream(bytes.toByteArray()))) { return (T) in.readObject(); } } diff --git a/flink/v1.13/flink/src/test/java/org/apache/iceberg/flink/TestChangeLogTable.java b/flink/v1.13/flink/src/test/java/org/apache/iceberg/flink/TestChangeLogTable.java index 5c04c855149f..9987a16c7682 100644 --- a/flink/v1.13/flink/src/test/java/org/apache/iceberg/flink/TestChangeLogTable.java +++ b/flink/v1.13/flink/src/test/java/org/apache/iceberg/flink/TestChangeLogTable.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.flink; import java.io.File; @@ -49,8 +48,9 @@ import org.junit.runners.Parameterized; /** - * In this test case, we mainly cover the impact of primary key selection, multiple operations within a single - * transaction, and multiple operations between different txn on the correctness of the data. + * In this test case, we mainly cover the impact of primary key selection, multiple operations + * within a single transaction, and multiple operations between different txn on the correctness of + * the data. */ @RunWith(Parameterized.class) public class TestChangeLogTable extends ChangeLogTableTestBase { @@ -66,10 +66,7 @@ public class TestChangeLogTable extends ChangeLogTableTestBase { @Parameterized.Parameters(name = "PartitionedTable={0}") public static Iterable parameters() { - return ImmutableList.of( - new Object[] {true}, - new Object[] {false} - ); + return ImmutableList.of(new Object[] {true}, new Object[] {false}); } public TestChangeLogTable(boolean partitioned) { @@ -85,7 +82,8 @@ public static void createWarehouse() throws IOException { @Before public void before() { - sql("CREATE CATALOG %s WITH ('type'='iceberg', 'catalog-type'='hadoop', 'warehouse'='%s')", + sql( + "CREATE CATALOG %s WITH ('type'='iceberg', 'catalog-type'='hadoop', 'warehouse'='%s')", CATALOG_NAME, warehouse); sql("USE CATALOG %s", CATALOG_NAME); sql("CREATE DATABASE %s", DATABASE_NAME); @@ -103,137 +101,112 @@ public void clean() { @Test public void testSqlChangeLogOnIdKey() throws Exception { - List> inputRowsPerCheckpoint = ImmutableList.of( - ImmutableList.of( - insertRow(1, "aaa"), - deleteRow(1, "aaa"), - insertRow(1, "bbb"), - insertRow(2, "aaa"), - deleteRow(2, "aaa"), - insertRow(2, "bbb") - ), + List> inputRowsPerCheckpoint = ImmutableList.of( - updateBeforeRow(2, "bbb"), - updateAfterRow(2, "ccc"), - deleteRow(2, "ccc"), - insertRow(2, "ddd") - ), + ImmutableList.of( + insertRow(1, "aaa"), + deleteRow(1, "aaa"), + insertRow(1, "bbb"), + insertRow(2, "aaa"), + deleteRow(2, "aaa"), + insertRow(2, "bbb")), + ImmutableList.of( + updateBeforeRow(2, "bbb"), + updateAfterRow(2, "ccc"), + deleteRow(2, "ccc"), + insertRow(2, "ddd")), + ImmutableList.of( + deleteRow(1, "bbb"), + insertRow(1, "ccc"), + deleteRow(1, "ccc"), + insertRow(1, "ddd"))); + + List> expectedRecordsPerCheckpoint = ImmutableList.of( - deleteRow(1, "bbb"), - insertRow(1, "ccc"), - deleteRow(1, "ccc"), - insertRow(1, "ddd") - ) - ); - - List> expectedRecordsPerCheckpoint = ImmutableList.of( - ImmutableList.of(insertRow(1, "bbb"), insertRow(2, "bbb")), - ImmutableList.of(insertRow(1, "bbb"), insertRow(2, "ddd")), - ImmutableList.of(insertRow(1, "ddd"), insertRow(2, "ddd")) - ); - - testSqlChangeLog(TABLE_NAME, ImmutableList.of("id"), inputRowsPerCheckpoint, - 
expectedRecordsPerCheckpoint); + ImmutableList.of(insertRow(1, "bbb"), insertRow(2, "bbb")), + ImmutableList.of(insertRow(1, "bbb"), insertRow(2, "ddd")), + ImmutableList.of(insertRow(1, "ddd"), insertRow(2, "ddd"))); + + testSqlChangeLog( + TABLE_NAME, ImmutableList.of("id"), inputRowsPerCheckpoint, expectedRecordsPerCheckpoint); } @Test public void testChangeLogOnDataKey() throws Exception { - List> elementsPerCheckpoint = ImmutableList.of( + List> elementsPerCheckpoint = ImmutableList.of( - insertRow(1, "aaa"), - deleteRow(1, "aaa"), - insertRow(2, "bbb"), - insertRow(1, "bbb"), - insertRow(2, "aaa") - ), + ImmutableList.of( + insertRow(1, "aaa"), + deleteRow(1, "aaa"), + insertRow(2, "bbb"), + insertRow(1, "bbb"), + insertRow(2, "aaa")), + ImmutableList.of( + updateBeforeRow(2, "aaa"), updateAfterRow(1, "ccc"), insertRow(1, "aaa")), + ImmutableList.of(deleteRow(1, "bbb"), insertRow(2, "aaa"), insertRow(2, "ccc"))); + + List> expectedRecords = ImmutableList.of( - updateBeforeRow(2, "aaa"), - updateAfterRow(1, "ccc"), - insertRow(1, "aaa") - ), - ImmutableList.of( - deleteRow(1, "bbb"), - insertRow(2, "aaa"), - insertRow(2, "ccc") - ) - ); - - List> expectedRecords = ImmutableList.of( - ImmutableList.of(insertRow(1, "bbb"), insertRow(2, "aaa")), - ImmutableList.of(insertRow(1, "aaa"), insertRow(1, "bbb"), insertRow(1, "ccc")), - ImmutableList.of(insertRow(1, "aaa"), insertRow(1, "ccc"), insertRow(2, "aaa"), insertRow(2, "ccc")) - ); + ImmutableList.of(insertRow(1, "bbb"), insertRow(2, "aaa")), + ImmutableList.of(insertRow(1, "aaa"), insertRow(1, "bbb"), insertRow(1, "ccc")), + ImmutableList.of( + insertRow(1, "aaa"), + insertRow(1, "ccc"), + insertRow(2, "aaa"), + insertRow(2, "ccc"))); testSqlChangeLog(TABLE_NAME, ImmutableList.of("data"), elementsPerCheckpoint, expectedRecords); } @Test public void testChangeLogOnIdDataKey() throws Exception { - List> elementsPerCheckpoint = ImmutableList.of( - ImmutableList.of( - insertRow(1, "aaa"), - deleteRow(1, "aaa"), - insertRow(2, "bbb"), - insertRow(1, "bbb"), - insertRow(2, "aaa") - ), + List> elementsPerCheckpoint = ImmutableList.of( - updateBeforeRow(2, "aaa"), - updateAfterRow(1, "ccc"), - insertRow(1, "aaa") - ), + ImmutableList.of( + insertRow(1, "aaa"), + deleteRow(1, "aaa"), + insertRow(2, "bbb"), + insertRow(1, "bbb"), + insertRow(2, "aaa")), + ImmutableList.of( + updateBeforeRow(2, "aaa"), updateAfterRow(1, "ccc"), insertRow(1, "aaa")), + ImmutableList.of(deleteRow(1, "bbb"), insertRow(2, "aaa"))); + + List> expectedRecords = ImmutableList.of( - deleteRow(1, "bbb"), - insertRow(2, "aaa") - ) - ); - - List> expectedRecords = ImmutableList.of( - ImmutableList.of(insertRow(1, "bbb"), insertRow(2, "aaa"), insertRow(2, "bbb")), - ImmutableList.of(insertRow(1, "aaa"), insertRow(1, "bbb"), insertRow(1, "ccc"), insertRow(2, "bbb")), - ImmutableList.of(insertRow(1, "aaa"), insertRow(1, "ccc"), insertRow(2, "aaa"), insertRow(2, "bbb")) - ); - - testSqlChangeLog(TABLE_NAME, ImmutableList.of("data", "id"), elementsPerCheckpoint, expectedRecords); + ImmutableList.of(insertRow(1, "bbb"), insertRow(2, "aaa"), insertRow(2, "bbb")), + ImmutableList.of( + insertRow(1, "aaa"), insertRow(1, "bbb"), insertRow(1, "ccc"), insertRow(2, "bbb")), + ImmutableList.of( + insertRow(1, "aaa"), + insertRow(1, "ccc"), + insertRow(2, "aaa"), + insertRow(2, "bbb"))); + + testSqlChangeLog( + TABLE_NAME, ImmutableList.of("data", "id"), elementsPerCheckpoint, expectedRecords); } @Test public void testPureInsertOnIdKey() throws Exception { - List> elementsPerCheckpoint = 
ImmutableList.of( + List> elementsPerCheckpoint = ImmutableList.of( - insertRow(1, "aaa"), - insertRow(2, "bbb") - ), - ImmutableList.of( - insertRow(3, "ccc"), - insertRow(4, "ddd") - ), - ImmutableList.of( - insertRow(5, "eee"), - insertRow(6, "fff") - ) - ); + ImmutableList.of(insertRow(1, "aaa"), insertRow(2, "bbb")), + ImmutableList.of(insertRow(3, "ccc"), insertRow(4, "ddd")), + ImmutableList.of(insertRow(5, "eee"), insertRow(6, "fff"))); - List> expectedRecords = ImmutableList.of( - ImmutableList.of( - insertRow(1, "aaa"), - insertRow(2, "bbb") - ), - ImmutableList.of( - insertRow(1, "aaa"), - insertRow(2, "bbb"), - insertRow(3, "ccc"), - insertRow(4, "ddd") - ), + List> expectedRecords = ImmutableList.of( - insertRow(1, "aaa"), - insertRow(2, "bbb"), - insertRow(3, "ccc"), - insertRow(4, "ddd"), - insertRow(5, "eee"), - insertRow(6, "fff") - ) - ); + ImmutableList.of(insertRow(1, "aaa"), insertRow(2, "bbb")), + ImmutableList.of( + insertRow(1, "aaa"), insertRow(2, "bbb"), insertRow(3, "ccc"), insertRow(4, "ddd")), + ImmutableList.of( + insertRow(1, "aaa"), + insertRow(2, "bbb"), + insertRow(3, "ccc"), + insertRow(4, "ddd"), + insertRow(5, "eee"), + insertRow(6, "fff"))); testSqlChangeLog(TABLE_NAME, ImmutableList.of("data"), elementsPerCheckpoint, expectedRecords); } @@ -244,13 +217,14 @@ private static Record record(int id, String data) { private Table createTable(String tableName, List key, boolean isPartitioned) { String partitionByCause = isPartitioned ? "PARTITIONED BY (data)" : ""; - sql("CREATE TABLE %s(id INT, data VARCHAR, PRIMARY KEY(%s) NOT ENFORCED) %s", + sql( + "CREATE TABLE %s(id INT, data VARCHAR, PRIMARY KEY(%s) NOT ENFORCED) %s", tableName, Joiner.on(',').join(key), partitionByCause); // Upgrade the iceberg table to format v2. 
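    // The reformatted lines below load the table through a hadoop CatalogLoader and grab its
    // TableOperations; the commit that performs the upgrade itself is an unchanged line outside
    // this hunk. A hedged sketch of that step, assuming the usual TableMetadata upgrade API:
    //
    //   ops.commit(meta, meta.upgradeToFormatVersion(2));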
- CatalogLoader loader = CatalogLoader.hadoop("my_catalog", CONF, ImmutableMap.of( - CatalogProperties.WAREHOUSE_LOCATION, warehouse - )); + CatalogLoader loader = + CatalogLoader.hadoop( + "my_catalog", CONF, ImmutableMap.of(CatalogProperties.WAREHOUSE_LOCATION, warehouse)); Table table = loader.loadCatalog().loadTable(TableIdentifier.of(DATABASE_NAME, TABLE_NAME)); TableOperations ops = ((BaseTable) table).operations(); TableMetadata meta = ops.current(); @@ -259,15 +233,20 @@ private Table createTable(String tableName, List key, boolean isPartitio return table; } - private void testSqlChangeLog(String tableName, - List key, - List> inputRowsPerCheckpoint, - List> expectedRecordsPerCheckpoint) throws Exception { + private void testSqlChangeLog( + String tableName, + List key, + List> inputRowsPerCheckpoint, + List> expectedRecordsPerCheckpoint) + throws Exception { String dataId = BoundedTableFactory.registerDataSet(inputRowsPerCheckpoint); - sql("CREATE TABLE %s(id INT NOT NULL, data STRING NOT NULL)" + - " WITH ('connector'='BoundedSource', 'data-id'='%s')", SOURCE_TABLE, dataId); + sql( + "CREATE TABLE %s(id INT NOT NULL, data STRING NOT NULL)" + + " WITH ('connector'='BoundedSource', 'data-id'='%s')", + SOURCE_TABLE, dataId); - Assert.assertEquals("Should have the expected rows", + Assert.assertEquals( + "Should have the expected rows", listJoin(inputRowsPerCheckpoint), sql("SELECT * FROM %s", SOURCE_TABLE)); @@ -277,17 +256,21 @@ private void testSqlChangeLog(String tableName, table.refresh(); List snapshots = findValidSnapshots(table); int expectedSnapshotNum = expectedRecordsPerCheckpoint.size(); - Assert.assertEquals("Should have the expected snapshot number", expectedSnapshotNum, snapshots.size()); + Assert.assertEquals( + "Should have the expected snapshot number", expectedSnapshotNum, snapshots.size()); for (int i = 0; i < expectedSnapshotNum; i++) { long snapshotId = snapshots.get(i).snapshotId(); List expectedRows = expectedRecordsPerCheckpoint.get(i); - Assert.assertEquals("Should have the expected records for the checkpoint#" + i, - expectedRowSet(table, expectedRows), actualRowSet(table, snapshotId)); + Assert.assertEquals( + "Should have the expected records for the checkpoint#" + i, + expectedRowSet(table, expectedRows), + actualRowSet(table, snapshotId)); } if (expectedSnapshotNum > 0) { - Assert.assertEquals("Should have the expected rows in the final table", + Assert.assertEquals( + "Should have the expected rows in the final table", Sets.newHashSet(expectedRecordsPerCheckpoint.get(expectedSnapshotNum - 1)), Sets.newHashSet(sql("SELECT * FROM %s", tableName))); } @@ -296,7 +279,8 @@ private void testSqlChangeLog(String tableName, private List findValidSnapshots(Table table) { List validSnapshots = Lists.newArrayList(); for (Snapshot snapshot : table.snapshots()) { - if (snapshot.allManifests(table.io()).stream().anyMatch(m -> snapshot.snapshotId() == m.snapshotId())) { + if (snapshot.allManifests(table.io()).stream() + .anyMatch(m -> snapshot.snapshotId() == m.snapshotId())) { validSnapshots.add(snapshot); } } diff --git a/flink/v1.13/flink/src/test/java/org/apache/iceberg/flink/TestDataFileSerialization.java b/flink/v1.13/flink/src/test/java/org/apache/iceberg/flink/TestDataFileSerialization.java index fe9deb37684f..e9372adda4c1 100644 --- a/flink/v1.13/flink/src/test/java/org/apache/iceberg/flink/TestDataFileSerialization.java +++ b/flink/v1.13/flink/src/test/java/org/apache/iceberg/flink/TestDataFileSerialization.java @@ -16,9 +16,11 @@ * specific language 
governing permissions and limitations * under the License. */ - package org.apache.iceberg.flink; +import static org.apache.iceberg.types.Types.NestedField.optional; +import static org.apache.iceberg.types.Types.NestedField.required; + import java.io.ByteArrayInputStream; import java.io.ByteArrayOutputStream; import java.io.IOException; @@ -45,21 +47,17 @@ import org.assertj.core.api.Assertions; import org.junit.Test; -import static org.apache.iceberg.types.Types.NestedField.optional; -import static org.apache.iceberg.types.Types.NestedField.required; - public class TestDataFileSerialization { - private static final Schema DATE_SCHEMA = new Schema( - required(1, "id", Types.LongType.get()), - optional(2, "data", Types.StringType.get()), - required(3, "date", Types.StringType.get()), - optional(4, "double", Types.DoubleType.get())); + private static final Schema DATE_SCHEMA = + new Schema( + required(1, "id", Types.LongType.get()), + optional(2, "data", Types.StringType.get()), + required(3, "date", Types.StringType.get()), + optional(4, "double", Types.DoubleType.get())); - private static final PartitionSpec PARTITION_SPEC = PartitionSpec - .builderFor(DATE_SCHEMA) - .identity("date") - .build(); + private static final PartitionSpec PARTITION_SPEC = + PartitionSpec.builderFor(DATE_SCHEMA).identity("date").build(); private static final Map COLUMN_SIZES = Maps.newHashMap(); private static final Map VALUE_COUNTS = Maps.newHashMap(); @@ -81,40 +79,43 @@ public class TestDataFileSerialization { UPPER_BOUNDS.put(1, longToBuffer(4L)); } - private static final Metrics METRICS = new Metrics( - 5L, null, VALUE_COUNTS, NULL_VALUE_COUNTS, NAN_VALUE_COUNTS, LOWER_BOUNDS, UPPER_BOUNDS); - - private static final DataFile DATA_FILE = DataFiles - .builder(PARTITION_SPEC) - .withPath("/path/to/data-1.parquet") - .withFileSizeInBytes(1234) - .withPartitionPath("date=2018-06-08") - .withMetrics(METRICS) - .withSplitOffsets(ImmutableList.of(4L)) - .withEncryptionKeyMetadata(ByteBuffer.allocate(4).putInt(34)) - .withSortOrder(SortOrder.unsorted()) - .build(); - - private static final DeleteFile POS_DELETE_FILE = FileMetadata.deleteFileBuilder(PARTITION_SPEC) - .ofPositionDeletes() - .withPath("/path/to/pos-delete.parquet") - .withFileSizeInBytes(10) - .withPartitionPath("date=2018-06-08") - .withMetrics(METRICS) - .withEncryptionKeyMetadata(ByteBuffer.allocate(4).putInt(35)) - .withRecordCount(23) - .build(); - - private static final DeleteFile EQ_DELETE_FILE = FileMetadata.deleteFileBuilder(PARTITION_SPEC) - .ofEqualityDeletes(2, 3) - .withPath("/path/to/equality-delete.parquet") - .withFileSizeInBytes(10) - .withPartitionPath("date=2018-06-08") - .withMetrics(METRICS) - .withEncryptionKeyMetadata(ByteBuffer.allocate(4).putInt(35)) - .withRecordCount(23) - .withSortOrder(SortOrder.unsorted()) - .build(); + private static final Metrics METRICS = + new Metrics( + 5L, null, VALUE_COUNTS, NULL_VALUE_COUNTS, NAN_VALUE_COUNTS, LOWER_BOUNDS, UPPER_BOUNDS); + + private static final DataFile DATA_FILE = + DataFiles.builder(PARTITION_SPEC) + .withPath("/path/to/data-1.parquet") + .withFileSizeInBytes(1234) + .withPartitionPath("date=2018-06-08") + .withMetrics(METRICS) + .withSplitOffsets(ImmutableList.of(4L)) + .withEncryptionKeyMetadata(ByteBuffer.allocate(4).putInt(34)) + .withSortOrder(SortOrder.unsorted()) + .build(); + + private static final DeleteFile POS_DELETE_FILE = + FileMetadata.deleteFileBuilder(PARTITION_SPEC) + .ofPositionDeletes() + .withPath("/path/to/pos-delete.parquet") + .withFileSizeInBytes(10) + 
.withPartitionPath("date=2018-06-08") + .withMetrics(METRICS) + .withEncryptionKeyMetadata(ByteBuffer.allocate(4).putInt(35)) + .withRecordCount(23) + .build(); + + private static final DeleteFile EQ_DELETE_FILE = + FileMetadata.deleteFileBuilder(PARTITION_SPEC) + .ofEqualityDeletes(2, 3) + .withPath("/path/to/equality-delete.parquet") + .withFileSizeInBytes(10) + .withPartitionPath("date=2018-06-08") + .withMetrics(METRICS) + .withEncryptionKeyMetadata(ByteBuffer.allocate(4).putInt(35)) + .withRecordCount(23) + .withSortOrder(SortOrder.unsorted()) + .build(); @Test public void testJavaSerialization() throws Exception { @@ -130,7 +131,8 @@ public void testJavaSerialization() throws Exception { out.writeObject(EQ_DELETE_FILE.copy()); } - try (ObjectInputStream in = new ObjectInputStream(new ByteArrayInputStream(bytes.toByteArray()))) { + try (ObjectInputStream in = + new ObjectInputStream(new ByteArrayInputStream(bytes.toByteArray()))) { for (int i = 0; i < 2; i += 1) { Object obj = in.readObject(); Assertions.assertThat(obj).as("Should be a DataFile").isInstanceOf(DataFile.class); @@ -139,13 +141,17 @@ public void testJavaSerialization() throws Exception { for (int i = 0; i < 2; i += 1) { Object obj = in.readObject(); - Assertions.assertThat(obj).as("Should be a position DeleteFile").isInstanceOf(DeleteFile.class); + Assertions.assertThat(obj) + .as("Should be a position DeleteFile") + .isInstanceOf(DeleteFile.class); TestHelpers.assertEquals(POS_DELETE_FILE, (DeleteFile) obj); } for (int i = 0; i < 2; i += 1) { Object obj = in.readObject(); - Assertions.assertThat(obj).as("Should be a equality DeleteFile").isInstanceOf(DeleteFile.class); + Assertions.assertThat(obj) + .as("Should be a equality DeleteFile") + .isInstanceOf(DeleteFile.class); TestHelpers.assertEquals(EQ_DELETE_FILE, (DeleteFile) obj); } } diff --git a/flink/v1.13/flink/src/test/java/org/apache/iceberg/flink/TestFixtures.java b/flink/v1.13/flink/src/test/java/org/apache/iceberg/flink/TestFixtures.java index 71532c59402c..c4cb78f034ad 100644 --- a/flink/v1.13/flink/src/test/java/org/apache/iceberg/flink/TestFixtures.java +++ b/flink/v1.13/flink/src/test/java/org/apache/iceberg/flink/TestFixtures.java @@ -16,32 +16,28 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.flink; +import static org.apache.iceberg.types.Types.NestedField.required; + import org.apache.flink.table.types.logical.RowType; import org.apache.iceberg.PartitionSpec; import org.apache.iceberg.Schema; import org.apache.iceberg.catalog.TableIdentifier; import org.apache.iceberg.types.Types; -import static org.apache.iceberg.types.Types.NestedField.required; - public class TestFixtures { - private TestFixtures() { - - } + private TestFixtures() {} - public static final Schema SCHEMA = new Schema( - required(1, "data", Types.StringType.get()), - required(2, "id", Types.LongType.get()), - required(3, "dt", Types.StringType.get())); + public static final Schema SCHEMA = + new Schema( + required(1, "data", Types.StringType.get()), + required(2, "id", Types.LongType.get()), + required(3, "dt", Types.StringType.get())); - public static final PartitionSpec SPEC = PartitionSpec.builderFor(SCHEMA) - .identity("dt") - .bucket("id", 1) - .build(); + public static final PartitionSpec SPEC = + PartitionSpec.builderFor(SCHEMA).identity("dt").bucket("id", 1).build(); public static final RowType ROW_TYPE = FlinkSchemaUtil.convert(SCHEMA); diff --git a/flink/v1.13/flink/src/test/java/org/apache/iceberg/flink/TestFlinkCatalogDatabase.java b/flink/v1.13/flink/src/test/java/org/apache/iceberg/flink/TestFlinkCatalogDatabase.java index 180a2bc5f01b..d4de12c62300 100644 --- a/flink/v1.13/flink/src/test/java/org/apache/iceberg/flink/TestFlinkCatalogDatabase.java +++ b/flink/v1.13/flink/src/test/java/org/apache/iceberg/flink/TestFlinkCatalogDatabase.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.flink; import java.io.File; @@ -58,16 +57,21 @@ public void testCreateNamespace() { sql("CREATE DATABASE %s", flinkDatabase); - Assert.assertTrue("Database should exist", validationNamespaceCatalog.namespaceExists(icebergNamespace)); + Assert.assertTrue( + "Database should exist", validationNamespaceCatalog.namespaceExists(icebergNamespace)); sql("CREATE DATABASE IF NOT EXISTS %s", flinkDatabase); - Assert.assertTrue("Database should still exist", validationNamespaceCatalog.namespaceExists(icebergNamespace)); + Assert.assertTrue( + "Database should still exist", + validationNamespaceCatalog.namespaceExists(icebergNamespace)); sql("DROP DATABASE IF EXISTS %s", flinkDatabase); - Assert.assertFalse("Database should be dropped", validationNamespaceCatalog.namespaceExists(icebergNamespace)); + Assert.assertFalse( + "Database should be dropped", validationNamespaceCatalog.namespaceExists(icebergNamespace)); sql("CREATE DATABASE IF NOT EXISTS %s", flinkDatabase); - Assert.assertTrue("Database should be created", validationNamespaceCatalog.namespaceExists(icebergNamespace)); + Assert.assertTrue( + "Database should be created", validationNamespaceCatalog.namespaceExists(icebergNamespace)); } @Test @@ -75,9 +79,12 @@ public void testDefaultDatabase() { sql("USE CATALOG %s", catalogName); sql("SHOW TABLES"); - Assert.assertEquals("Should use the current catalog", getTableEnv().getCurrentCatalog(), catalogName); - Assert.assertEquals("Should use the configured default namespace", - getTableEnv().getCurrentDatabase(), "default"); + Assert.assertEquals( + "Should use the current catalog", getTableEnv().getCurrentCatalog(), catalogName); + Assert.assertEquals( + "Should use the configured default namespace", + getTableEnv().getCurrentDatabase(), + "default"); } @Test @@ -88,7 +95,8 @@ public void 
testDropEmptyDatabase() { sql("CREATE DATABASE %s", flinkDatabase); - Assert.assertTrue("Namespace should exist", validationNamespaceCatalog.namespaceExists(icebergNamespace)); + Assert.assertTrue( + "Namespace should exist", validationNamespaceCatalog.namespaceExists(icebergNamespace)); sql("DROP DATABASE %s", flinkDatabase); @@ -99,7 +107,8 @@ public void testDropEmptyDatabase() { @Test public void testDropNonEmptyNamespace() { - Assume.assumeFalse("Hadoop catalog throws IOException: Directory is not empty.", isHadoopCatalog); + Assume.assumeFalse( + "Hadoop catalog throws IOException: Directory is not empty.", isHadoopCatalog); Assert.assertFalse( "Namespace should not already exist", @@ -111,8 +120,11 @@ public void testDropNonEmptyNamespace() { TableIdentifier.of(icebergNamespace, "tl"), new Schema(Types.NestedField.optional(0, "id", Types.LongType.get()))); - Assert.assertTrue("Namespace should exist", validationNamespaceCatalog.namespaceExists(icebergNamespace)); - Assert.assertTrue("Table should exist", validationCatalog.tableExists(TableIdentifier.of(icebergNamespace, "tl"))); + Assert.assertTrue( + "Namespace should exist", validationNamespaceCatalog.namespaceExists(icebergNamespace)); + Assert.assertTrue( + "Table should exist", + validationCatalog.tableExists(TableIdentifier.of(icebergNamespace, "tl"))); AssertHelpers.assertThrowsCause( "Should fail if trying to delete a non-empty database", @@ -133,7 +145,8 @@ public void testListTables() { sql("USE CATALOG %s", catalogName); sql("USE %s", DATABASE); - Assert.assertTrue("Namespace should exist", validationNamespaceCatalog.namespaceExists(icebergNamespace)); + Assert.assertTrue( + "Namespace should exist", validationNamespaceCatalog.namespaceExists(icebergNamespace)); Assert.assertEquals("Should not list any tables", 0, sql("SHOW TABLES").size()); @@ -155,29 +168,35 @@ public void testListNamespace() { sql("CREATE DATABASE %s", flinkDatabase); sql("USE CATALOG %s", catalogName); - Assert.assertTrue("Namespace should exist", validationNamespaceCatalog.namespaceExists(icebergNamespace)); + Assert.assertTrue( + "Namespace should exist", validationNamespaceCatalog.namespaceExists(icebergNamespace)); List databases = sql("SHOW DATABASES"); if (isHadoopCatalog) { Assert.assertEquals("Should have 2 database", 2, databases.size()); - Assert.assertEquals("Should have db and default database", + Assert.assertEquals( + "Should have db and default database", Sets.newHashSet("default", "db"), Sets.newHashSet(databases.get(0).getField(0), databases.get(1).getField(0))); if (!baseNamespace.isEmpty()) { // test namespace not belongs to this catalog - validationNamespaceCatalog.createNamespace(Namespace.of(baseNamespace.level(0), "UNKNOWN_NAMESPACE")); + validationNamespaceCatalog.createNamespace( + Namespace.of(baseNamespace.level(0), "UNKNOWN_NAMESPACE")); databases = sql("SHOW DATABASES"); Assert.assertEquals("Should have 2 database", 2, databases.size()); - Assert.assertEquals("Should have db and default database", + Assert.assertEquals( + "Should have db and default database", Sets.newHashSet("default", "db"), Sets.newHashSet(databases.get(0).getField(0), databases.get(1).getField(0))); } } else { - // If there are multiple classes extends FlinkTestBase, TestHiveMetastore may loose the creation for default + // If there are multiple classes extends FlinkTestBase, TestHiveMetastore may loose the + // creation for default // database. See HiveMetaStore.HMSHandler.init. 
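      // Because the default database creation may be lost in that case, the assertion below only
      // checks that the "db" database is present instead of asserting an exact database count.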
- Assert.assertTrue("Should have db database", + Assert.assertTrue( + "Should have db database", databases.stream().anyMatch(d -> Objects.equals(d.getField(0), "db"))); } } @@ -192,11 +211,14 @@ public void testCreateNamespaceWithMetadata() { sql("CREATE DATABASE %s WITH ('prop'='value')", flinkDatabase); - Assert.assertTrue("Namespace should exist", validationNamespaceCatalog.namespaceExists(icebergNamespace)); + Assert.assertTrue( + "Namespace should exist", validationNamespaceCatalog.namespaceExists(icebergNamespace)); - Map nsMetadata = validationNamespaceCatalog.loadNamespaceMetadata(icebergNamespace); + Map nsMetadata = + validationNamespaceCatalog.loadNamespaceMetadata(icebergNamespace); - Assert.assertEquals("Namespace should have expected prop value", "value", nsMetadata.get("prop")); + Assert.assertEquals( + "Namespace should have expected prop value", "value", nsMetadata.get("prop")); } @Test @@ -209,11 +231,14 @@ public void testCreateNamespaceWithComment() { sql("CREATE DATABASE %s COMMENT 'namespace doc'", flinkDatabase); - Assert.assertTrue("Namespace should exist", validationNamespaceCatalog.namespaceExists(icebergNamespace)); + Assert.assertTrue( + "Namespace should exist", validationNamespaceCatalog.namespaceExists(icebergNamespace)); - Map nsMetadata = validationNamespaceCatalog.loadNamespaceMetadata(icebergNamespace); + Map nsMetadata = + validationNamespaceCatalog.loadNamespaceMetadata(icebergNamespace); - Assert.assertEquals("Namespace should have expected comment", "namespace doc", nsMetadata.get("comment")); + Assert.assertEquals( + "Namespace should have expected comment", "namespace doc", nsMetadata.get("comment")); } @Test @@ -229,12 +254,16 @@ public void testCreateNamespaceWithLocation() throws Exception { sql("CREATE DATABASE %s WITH ('location'='%s')", flinkDatabase, location); - Assert.assertTrue("Namespace should exist", validationNamespaceCatalog.namespaceExists(icebergNamespace)); + Assert.assertTrue( + "Namespace should exist", validationNamespaceCatalog.namespaceExists(icebergNamespace)); - Map nsMetadata = validationNamespaceCatalog.loadNamespaceMetadata(icebergNamespace); + Map nsMetadata = + validationNamespaceCatalog.loadNamespaceMetadata(icebergNamespace); - Assert.assertEquals("Namespace should have expected location", - "file:" + location.getPath(), nsMetadata.get("location")); + Assert.assertEquals( + "Namespace should have expected location", + "file:" + location.getPath(), + nsMetadata.get("location")); } @Test @@ -247,16 +276,21 @@ public void testSetProperties() { sql("CREATE DATABASE %s", flinkDatabase); - Assert.assertTrue("Namespace should exist", validationNamespaceCatalog.namespaceExists(icebergNamespace)); + Assert.assertTrue( + "Namespace should exist", validationNamespaceCatalog.namespaceExists(icebergNamespace)); - Map defaultMetadata = validationNamespaceCatalog.loadNamespaceMetadata(icebergNamespace); - Assert.assertFalse("Default metadata should not have custom property", defaultMetadata.containsKey("prop")); + Map defaultMetadata = + validationNamespaceCatalog.loadNamespaceMetadata(icebergNamespace); + Assert.assertFalse( + "Default metadata should not have custom property", defaultMetadata.containsKey("prop")); sql("ALTER DATABASE %s SET ('prop'='value')", flinkDatabase); - Map nsMetadata = validationNamespaceCatalog.loadNamespaceMetadata(icebergNamespace); + Map nsMetadata = + validationNamespaceCatalog.loadNamespaceMetadata(icebergNamespace); - Assert.assertEquals("Namespace should have expected prop value", "value", 
nsMetadata.get("prop")); + Assert.assertEquals( + "Namespace should have expected prop value", "value", nsMetadata.get("prop")); } @Test diff --git a/flink/v1.13/flink/src/test/java/org/apache/iceberg/flink/TestFlinkCatalogFactory.java b/flink/v1.13/flink/src/test/java/org/apache/iceberg/flink/TestFlinkCatalogFactory.java index cd893d836dc8..f7edd5653ebd 100644 --- a/flink/v1.13/flink/src/test/java/org/apache/iceberg/flink/TestFlinkCatalogFactory.java +++ b/flink/v1.13/flink/src/test/java/org/apache/iceberg/flink/TestFlinkCatalogFactory.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.flink; import java.util.Map; @@ -46,11 +45,12 @@ public void before() { @Test public void testCreateCreateCatalogHive() { String catalogName = "hiveCatalog"; - props.put(FlinkCatalogFactory.ICEBERG_CATALOG_TYPE, FlinkCatalogFactory.ICEBERG_CATALOG_TYPE_HIVE); + props.put( + FlinkCatalogFactory.ICEBERG_CATALOG_TYPE, FlinkCatalogFactory.ICEBERG_CATALOG_TYPE_HIVE); - Catalog catalog = FlinkCatalogFactory - .createCatalogLoader(catalogName, props, new Configuration()) - .loadCatalog(); + Catalog catalog = + FlinkCatalogFactory.createCatalogLoader(catalogName, props, new Configuration()) + .loadCatalog(); Assertions.assertThat(catalog).isNotNull().isInstanceOf(HiveCatalog.class); } @@ -58,11 +58,12 @@ public void testCreateCreateCatalogHive() { @Test public void testCreateCreateCatalogHadoop() { String catalogName = "hadoopCatalog"; - props.put(FlinkCatalogFactory.ICEBERG_CATALOG_TYPE, FlinkCatalogFactory.ICEBERG_CATALOG_TYPE_HADOOP); + props.put( + FlinkCatalogFactory.ICEBERG_CATALOG_TYPE, FlinkCatalogFactory.ICEBERG_CATALOG_TYPE_HADOOP); - Catalog catalog = FlinkCatalogFactory - .createCatalogLoader(catalogName, props, new Configuration()) - .loadCatalog(); + Catalog catalog = + FlinkCatalogFactory.createCatalogLoader(catalogName, props, new Configuration()) + .loadCatalog(); Assertions.assertThat(catalog).isNotNull().isInstanceOf(HadoopCatalog.class); } @@ -72,9 +73,9 @@ public void testCreateCreateCatalogCustom() { String catalogName = "customCatalog"; props.put(CatalogProperties.CATALOG_IMPL, CustomHadoopCatalog.class.getName()); - Catalog catalog = FlinkCatalogFactory - .createCatalogLoader(catalogName, props, new Configuration()) - .loadCatalog(); + Catalog catalog = + FlinkCatalogFactory.createCatalogLoader(catalogName, props, new Configuration()) + .loadCatalog(); Assertions.assertThat(catalog).isNotNull().isInstanceOf(CustomHadoopCatalog.class); } @@ -83,13 +84,14 @@ public void testCreateCreateCatalogCustom() { public void testCreateCreateCatalogCustomWithHiveCatalogTypeSet() { String catalogName = "customCatalog"; props.put(CatalogProperties.CATALOG_IMPL, CustomHadoopCatalog.class.getName()); - props.put(FlinkCatalogFactory.ICEBERG_CATALOG_TYPE, FlinkCatalogFactory.ICEBERG_CATALOG_TYPE_HIVE); + props.put( + FlinkCatalogFactory.ICEBERG_CATALOG_TYPE, FlinkCatalogFactory.ICEBERG_CATALOG_TYPE_HIVE); AssertHelpers.assertThrows( "Should throw when both catalog-type and catalog-impl are set", IllegalArgumentException.class, - "both catalog-type and catalog-impl are set", () -> - FlinkCatalogFactory.createCatalogLoader(catalogName, props, new Configuration())); + "both catalog-type and catalog-impl are set", + () -> FlinkCatalogFactory.createCatalogLoader(catalogName, props, new Configuration())); } @Test @@ -100,20 +102,18 @@ public void testLoadCatalogUnknown() { AssertHelpers.assertThrows( "Should throw when an unregistered 
/ unknown catalog is set as the catalog factor's`type` setting", UnsupportedOperationException.class, - "Unknown catalog-type", () -> - FlinkCatalogFactory.createCatalogLoader(catalogName, props, new Configuration()) - ); + "Unknown catalog-type", + () -> FlinkCatalogFactory.createCatalogLoader(catalogName, props, new Configuration())); } public static class CustomHadoopCatalog extends HadoopCatalog { - public CustomHadoopCatalog() { - - } + public CustomHadoopCatalog() {} public CustomHadoopCatalog(Configuration conf, String warehouseLocation) { setConf(conf); - initialize("custom", ImmutableMap.of(CatalogProperties.WAREHOUSE_LOCATION, warehouseLocation)); + initialize( + "custom", ImmutableMap.of(CatalogProperties.WAREHOUSE_LOCATION, warehouseLocation)); } } } diff --git a/flink/v1.13/flink/src/test/java/org/apache/iceberg/flink/TestFlinkCatalogTable.java b/flink/v1.13/flink/src/test/java/org/apache/iceberg/flink/TestFlinkCatalogTable.java index 577691eec798..897480d495c3 100644 --- a/flink/v1.13/flink/src/test/java/org/apache/iceberg/flink/TestFlinkCatalogTable.java +++ b/flink/v1.13/flink/src/test/java/org/apache/iceberg/flink/TestFlinkCatalogTable.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.flink; import java.util.Arrays; @@ -89,26 +88,27 @@ public void testGetTable() { sql("CREATE TABLE tl(id BIGINT, strV STRING)"); Table table = validationCatalog.loadTable(TableIdentifier.of(icebergNamespace, "tl")); - Schema iSchema = new Schema( - Types.NestedField.optional(1, "id", Types.LongType.get()), - Types.NestedField.optional(2, "strV", Types.StringType.get()) - ); - Assert.assertEquals("Should load the expected iceberg schema", iSchema.toString(), table.schema().toString()); + Schema iSchema = + new Schema( + Types.NestedField.optional(1, "id", Types.LongType.get()), + Types.NestedField.optional(2, "strV", Types.StringType.get())); + Assert.assertEquals( + "Should load the expected iceberg schema", iSchema.toString(), table.schema().toString()); } @Test public void testRenameTable() { Assume.assumeFalse("HadoopCatalog does not support rename table", isHadoopCatalog); - final Schema tableSchema = new Schema(Types.NestedField.optional(0, "id", Types.LongType.get())); + final Schema tableSchema = + new Schema(Types.NestedField.optional(0, "id", Types.LongType.get())); validationCatalog.createTable(TableIdentifier.of(icebergNamespace, "tl"), tableSchema); sql("ALTER TABLE tl RENAME TO tl2"); AssertHelpers.assertThrows( "Should fail if trying to get a nonexistent table", ValidationException.class, "Table `tl` was not found.", - () -> getTableEnv().from("tl") - ); + () -> getTableEnv().from("tl")); Schema actualSchema = FlinkSchemaUtil.convert(getTableEnv().from("tl2").getSchema()); Assert.assertEquals(tableSchema.asStruct(), actualSchema.asStruct()); } @@ -124,7 +124,8 @@ public void testCreateTable() throws TableNotExistException { Assert.assertEquals(Maps.newHashMap(), table.properties()); CatalogTable catalogTable = catalogTable("tl"); - Assert.assertEquals(TableSchema.builder().field("id", DataTypes.BIGINT()).build(), catalogTable.getSchema()); + Assert.assertEquals( + TableSchema.builder().field("id", DataTypes.BIGINT()).build(), catalogTable.getSchema()); Assert.assertEquals(Maps.newHashMap(), catalogTable.getOptions()); } @@ -133,33 +134,41 @@ public void testCreateTableWithPrimaryKey() throws Exception { sql("CREATE TABLE tl(id BIGINT, data STRING, key STRING PRIMARY KEY NOT ENFORCED)"); Table 
table = table("tl"); - Assert.assertEquals("Should have the expected row key.", + Assert.assertEquals( + "Should have the expected row key.", Sets.newHashSet(table.schema().findField("key").fieldId()), table.schema().identifierFieldIds()); CatalogTable catalogTable = catalogTable("tl"); Optional uniqueConstraintOptional = catalogTable.getSchema().getPrimaryKey(); - Assert.assertTrue("Should have the expected unique constraint", uniqueConstraintOptional.isPresent()); - Assert.assertEquals("Should have the expected columns", - ImmutableList.of("key"), uniqueConstraintOptional.get().getColumns()); + Assert.assertTrue( + "Should have the expected unique constraint", uniqueConstraintOptional.isPresent()); + Assert.assertEquals( + "Should have the expected columns", + ImmutableList.of("key"), + uniqueConstraintOptional.get().getColumns()); } @Test public void testCreateTableWithMultiColumnsInPrimaryKey() throws Exception { - sql("CREATE TABLE tl(id BIGINT, data STRING, CONSTRAINT pk_constraint PRIMARY KEY(data, id) NOT ENFORCED)"); + sql( + "CREATE TABLE tl(id BIGINT, data STRING, CONSTRAINT pk_constraint PRIMARY KEY(data, id) NOT ENFORCED)"); Table table = table("tl"); - Assert.assertEquals("Should have the expected RowKey", + Assert.assertEquals( + "Should have the expected RowKey", Sets.newHashSet( - table.schema().findField("id").fieldId(), - table.schema().findField("data").fieldId()), + table.schema().findField("id").fieldId(), table.schema().findField("data").fieldId()), table.schema().identifierFieldIds()); CatalogTable catalogTable = catalogTable("tl"); Optional uniqueConstraintOptional = catalogTable.getSchema().getPrimaryKey(); - Assert.assertTrue("Should have the expected unique constraint", uniqueConstraintOptional.isPresent()); - Assert.assertEquals("Should have the expected columns", - ImmutableSet.of("data", "id"), ImmutableSet.copyOf(uniqueConstraintOptional.get().getColumns())); + Assert.assertTrue( + "Should have the expected unique constraint", uniqueConstraintOptional.isPresent()); + Assert.assertEquals( + "Should have the expected columns", + ImmutableSet.of("data", "id"), + ImmutableSet.copyOf(uniqueConstraintOptional.get().getColumns())); } @Test @@ -170,7 +179,8 @@ public void testCreateTableIfNotExists() { Assert.assertEquals(Maps.newHashMap(), table("tl").properties()); sql("DROP TABLE tl"); - AssertHelpers.assertThrows("Table 'tl' should be dropped", + AssertHelpers.assertThrows( + "Table 'tl' should be dropped", NoSuchTableException.class, "Table does not exist: " + getFullQualifiedTableName("tl"), () -> table("tl")); @@ -179,14 +189,12 @@ public void testCreateTableIfNotExists() { Assert.assertEquals(Maps.newHashMap(), table("tl").properties()); final Map expectedProperties = ImmutableMap.of("key", "value"); - table("tl").updateProperties() - .set("key", "value") - .commit(); + table("tl").updateProperties().set("key", "value").commit(); Assert.assertEquals(expectedProperties, table("tl").properties()); sql("CREATE TABLE IF NOT EXISTS tl(id BIGINT)"); - Assert.assertEquals("Should still be the old table.", - expectedProperties, table("tl").properties()); + Assert.assertEquals( + "Should still be the old table.", expectedProperties, table("tl").properties()); } @Test @@ -201,13 +209,15 @@ public void testCreateTableLike() throws TableNotExistException { Assert.assertEquals(Maps.newHashMap(), table.properties()); CatalogTable catalogTable = catalogTable("tl2"); - Assert.assertEquals(TableSchema.builder().field("id", DataTypes.BIGINT()).build(), 
catalogTable.getSchema()); + Assert.assertEquals( + TableSchema.builder().field("id", DataTypes.BIGINT()).build(), catalogTable.getSchema()); Assert.assertEquals(Maps.newHashMap(), catalogTable.getOptions()); } @Test public void testCreateTableLocation() { - Assume.assumeFalse("HadoopCatalog does not support creating table with location", isHadoopCatalog); + Assume.assumeFalse( + "HadoopCatalog does not support creating table with location", isHadoopCatalog); sql("CREATE TABLE tl(id BIGINT) WITH ('location'='file:///tmp/location')"); @@ -226,15 +236,20 @@ public void testCreatePartitionTable() throws TableNotExistException { Table table = table("tl"); Assert.assertEquals( new Schema( - Types.NestedField.optional(1, "id", Types.LongType.get()), - Types.NestedField.optional(2, "dt", Types.StringType.get())).asStruct(), + Types.NestedField.optional(1, "id", Types.LongType.get()), + Types.NestedField.optional(2, "dt", Types.StringType.get())) + .asStruct(), table.schema().asStruct()); - Assert.assertEquals(PartitionSpec.builderFor(table.schema()).identity("dt").build(), table.spec()); + Assert.assertEquals( + PartitionSpec.builderFor(table.schema()).identity("dt").build(), table.spec()); Assert.assertEquals(Maps.newHashMap(), table.properties()); CatalogTable catalogTable = catalogTable("tl"); Assert.assertEquals( - TableSchema.builder().field("id", DataTypes.BIGINT()).field("dt", DataTypes.STRING()).build(), + TableSchema.builder() + .field("id", DataTypes.BIGINT()) + .field("dt", DataTypes.STRING()) + .build(), catalogTable.getSchema()); Assert.assertEquals(Maps.newHashMap(), catalogTable.getOptions()); Assert.assertEquals(Collections.singletonList("dt"), catalogTable.getPartitionKeys()); @@ -245,8 +260,10 @@ public void testCreateTableWithFormatV2ThroughTableProperty() throws Exception { sql("CREATE TABLE tl(id BIGINT) WITH ('format-version'='2')"); Table table = table("tl"); - Assert.assertEquals("should create table using format v2", - 2, ((BaseTable) table).operations().current().formatVersion()); + Assert.assertEquals( + "should create table using format v2", + 2, + ((BaseTable) table).operations().current().formatVersion()); } @Test @@ -255,12 +272,10 @@ public void testUpgradeTableWithFormatV2ThroughTableProperty() throws Exception Table table = table("tl"); TableOperations ops = ((BaseTable) table).operations(); - Assert.assertEquals("should create table using format v1", - 1, ops.refresh().formatVersion()); + Assert.assertEquals("should create table using format v1", 1, ops.refresh().formatVersion()); sql("ALTER TABLE tl SET('format-version'='2')"); - Assert.assertEquals("should update table to use format v2", - 2, ops.refresh().formatVersion()); + Assert.assertEquals("should update table to use format v2", 2, ops.refresh().formatVersion()); } @Test @@ -269,10 +284,10 @@ public void testDowngradeTableToFormatV1ThroughTablePropertyFails() throws Excep Table table = table("tl"); TableOperations ops = ((BaseTable) table).operations(); - Assert.assertEquals("should create table using format v2", - 2, ops.refresh().formatVersion()); + Assert.assertEquals("should create table using format v2", 2, ops.refresh().formatVersion()); - AssertHelpers.assertThrowsRootCause("should fail to downgrade to v1", + AssertHelpers.assertThrowsRootCause( + "should fail to downgrade to v1", IllegalArgumentException.class, "Cannot downgrade v2 table to v1", () -> sql("ALTER TABLE tl SET('format-version'='1')")); @@ -282,13 +297,13 @@ public void testDowngradeTableToFormatV1ThroughTablePropertyFails() throws 
Excep public void testLoadTransformPartitionTable() throws TableNotExistException { Schema schema = new Schema(Types.NestedField.optional(0, "id", Types.LongType.get())); validationCatalog.createTable( - TableIdentifier.of(icebergNamespace, "tl"), schema, + TableIdentifier.of(icebergNamespace, "tl"), + schema, PartitionSpec.builderFor(schema).bucket("id", 100).build()); CatalogTable catalogTable = catalogTable("tl"); Assert.assertEquals( - TableSchema.builder().field("id", DataTypes.BIGINT()).build(), - catalogTable.getSchema()); + TableSchema.builder().field("id", DataTypes.BIGINT()).build(), catalogTable.getSchema()); Assert.assertEquals(Maps.newHashMap(), catalogTable.getOptions()); Assert.assertEquals(Collections.emptyList(), catalogTable.getPartitionKeys()); } @@ -312,8 +327,10 @@ public void testAlterTable() throws TableNotExistException { // remove property CatalogTable catalogTable = catalogTable("tl"); properties.remove("oldK"); - getTableEnv().getCatalog(getTableEnv().getCurrentCatalog()).get().alterTable( - new ObjectPath(DATABASE, "tl"), catalogTable.copy(properties), false); + getTableEnv() + .getCatalog(getTableEnv().getCurrentCatalog()) + .get() + .alterTable(new ObjectPath(DATABASE, "tl"), catalogTable.copy(properties), false); Assert.assertEquals(properties, table("tl").properties()); } @@ -332,43 +349,40 @@ public void testSetCurrentAndCherryPickSnapshotId() { Table table = table("tl"); - DataFile fileA = DataFiles.builder(table.spec()) - .withPath("/path/to/data-a.parquet") - .withFileSizeInBytes(10) - .withPartitionPath("c1=0") // easy way to set partition data for now - .withRecordCount(1) - .build(); - DataFile fileB = DataFiles.builder(table.spec()) - .withPath("/path/to/data-b.parquet") - .withFileSizeInBytes(10) - .withPartitionPath("c1=1") // easy way to set partition data for now - .withRecordCount(1) - .build(); - DataFile replacementFile = DataFiles.builder(table.spec()) - .withPath("/path/to/data-a-replacement.parquet") - .withFileSizeInBytes(10) - .withPartitionPath("c1=0") // easy way to set partition data for now - .withRecordCount(1) - .build(); - - table.newAppend() - .appendFile(fileA) - .commit(); + DataFile fileA = + DataFiles.builder(table.spec()) + .withPath("/path/to/data-a.parquet") + .withFileSizeInBytes(10) + .withPartitionPath("c1=0") // easy way to set partition data for now + .withRecordCount(1) + .build(); + DataFile fileB = + DataFiles.builder(table.spec()) + .withPath("/path/to/data-b.parquet") + .withFileSizeInBytes(10) + .withPartitionPath("c1=1") // easy way to set partition data for now + .withRecordCount(1) + .build(); + DataFile replacementFile = + DataFiles.builder(table.spec()) + .withPath("/path/to/data-a-replacement.parquet") + .withFileSizeInBytes(10) + .withPartitionPath("c1=0") // easy way to set partition data for now + .withRecordCount(1) + .build(); + + table.newAppend().appendFile(fileA).commit(); long snapshotId = table.currentSnapshot().snapshotId(); // stage an overwrite that replaces FILE_A - table.newReplacePartitions() - .addFile(replacementFile) - .stageOnly() - .commit(); + table.newReplacePartitions().addFile(replacementFile).stageOnly().commit(); Snapshot staged = Iterables.getLast(table.snapshots()); - Assert.assertEquals("Should find the staged overwrite snapshot", DataOperations.OVERWRITE, staged.operation()); + Assert.assertEquals( + "Should find the staged overwrite snapshot", DataOperations.OVERWRITE, staged.operation()); // add another append so that the original commit can't be fast-forwarded - 
table.newAppend() - .appendFile(fileB) - .commit(); + table.newAppend().appendFile(fileB).commit(); // test cherry pick sql("ALTER TABLE tl SET('cherry-pick-snapshot-id'='%s')", staged.snapshotId()); @@ -381,10 +395,13 @@ public void testSetCurrentAndCherryPickSnapshotId() { private void validateTableFiles(Table tbl, DataFile... expectedFiles) { tbl.refresh(); - Set expectedFilePaths = Arrays.stream(expectedFiles).map(DataFile::path).collect(Collectors.toSet()); - Set actualFilePaths = StreamSupport.stream(tbl.newScan().planFiles().spliterator(), false) - .map(FileScanTask::file).map(ContentFile::path) - .collect(Collectors.toSet()); + Set expectedFilePaths = + Arrays.stream(expectedFiles).map(DataFile::path).collect(Collectors.toSet()); + Set actualFilePaths = + StreamSupport.stream(tbl.newScan().planFiles().spliterator(), false) + .map(FileScanTask::file) + .map(ContentFile::path) + .collect(Collectors.toSet()); Assert.assertEquals("Files should match", expectedFilePaths, actualFilePaths); } @@ -393,7 +410,10 @@ private Table table(String name) { } private CatalogTable catalogTable(String name) throws TableNotExistException { - return (CatalogTable) getTableEnv().getCatalog(getTableEnv().getCurrentCatalog()).get() - .getTable(new ObjectPath(DATABASE, name)); + return (CatalogTable) + getTableEnv() + .getCatalog(getTableEnv().getCurrentCatalog()) + .get() + .getTable(new ObjectPath(DATABASE, name)); } } diff --git a/flink/v1.13/flink/src/test/java/org/apache/iceberg/flink/TestFlinkCatalogTablePartitions.java b/flink/v1.13/flink/src/test/java/org/apache/iceberg/flink/TestFlinkCatalogTablePartitions.java index b6c4812861f7..839700f50127 100644 --- a/flink/v1.13/flink/src/test/java/org/apache/iceberg/flink/TestFlinkCatalogTablePartitions.java +++ b/flink/v1.13/flink/src/test/java/org/apache/iceberg/flink/TestFlinkCatalogTablePartitions.java @@ -16,9 +16,10 @@ * specific language governing permissions and limitations * under the License. 
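// --- Editor's sketch (not part of the diff): the Table API sequence that the
// 'cherry-pick-snapshot-id' hook exercised in the hunk above maps onto. A minimal example
// assuming an existing Table handle and DataFile; it is not the connector's implementation.
import org.apache.iceberg.DataFile;
import org.apache.iceberg.Snapshot;
import org.apache.iceberg.Table;
import org.apache.iceberg.relocated.com.google.common.collect.Iterables;

class CherryPickSketch {
  static void stageAndCherryPick(Table table, DataFile replacementFile) {
    // Stage an overwrite without making it the current snapshot.
    table.newReplacePartitions().addFile(replacementFile).stageOnly().commit();

    // The staged snapshot is recorded in the snapshot log but not referenced by the table state.
    Snapshot staged = Iterables.getLast(table.snapshots());

    // Cherry-pick it onto the current state; manageSnapshots().setCurrentSnapshot(id) would
    // instead roll the table to an existing snapshot.
    table.manageSnapshots().cherrypick(staged.snapshotId()).commit();
  }
}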
*/ - package org.apache.iceberg.flink; +import static org.apache.iceberg.flink.FlinkCatalogFactory.CACHE_ENABLED; + import java.util.List; import org.apache.flink.table.catalog.CatalogPartitionSpec; import org.apache.flink.table.catalog.ObjectPath; @@ -35,18 +36,18 @@ import org.junit.Test; import org.junit.runners.Parameterized; -import static org.apache.iceberg.flink.FlinkCatalogFactory.CACHE_ENABLED; - public class TestFlinkCatalogTablePartitions extends FlinkCatalogTestBase { private String tableName = "test_table"; private final FileFormat format; - @Parameterized.Parameters(name = "catalogName={0}, baseNamespace={1}, format={2}, cacheEnabled={3}") + @Parameterized.Parameters( + name = "catalogName={0}, baseNamespace={1}, format={2}, cacheEnabled={3}") public static Iterable parameters() { List parameters = Lists.newArrayList(); - for (FileFormat format : new FileFormat[] {FileFormat.ORC, FileFormat.AVRO, FileFormat.PARQUET}) { + for (FileFormat format : + new FileFormat[] {FileFormat.ORC, FileFormat.AVRO, FileFormat.PARQUET}) { for (Boolean cacheEnabled : new Boolean[] {true, false}) { for (Object[] catalogParams : FlinkCatalogTestBase.parameters()) { String catalogName = (String) catalogParams[0]; @@ -58,8 +59,8 @@ public static Iterable parameters() { return parameters; } - public TestFlinkCatalogTablePartitions(String catalogName, Namespace baseNamespace, FileFormat format, - boolean cacheEnabled) { + public TestFlinkCatalogTablePartitions( + String catalogName, Namespace baseNamespace, FileFormat format, boolean cacheEnabled) { super(catalogName, baseNamespace); this.format = format; config.put(CACHE_ENABLED, String.valueOf(cacheEnabled)); @@ -83,20 +84,26 @@ public void cleanNamespaces() { @Test public void testListPartitionsWithUnpartitionedTable() { - sql("CREATE TABLE %s (id INT, data VARCHAR) with ('write.format.default'='%s')", + sql( + "CREATE TABLE %s (id INT, data VARCHAR) with ('write.format.default'='%s')", tableName, format.name()); sql("INSERT INTO %s SELECT 1,'a'", tableName); ObjectPath objectPath = new ObjectPath(DATABASE, tableName); FlinkCatalog flinkCatalog = (FlinkCatalog) getTableEnv().getCatalog(catalogName).get(); - AssertHelpers.assertThrows("Should not list partitions for unpartitioned table.", - TableNotPartitionedException.class, () -> flinkCatalog.listPartitions(objectPath)); + AssertHelpers.assertThrows( + "Should not list partitions for unpartitioned table.", + TableNotPartitionedException.class, + () -> flinkCatalog.listPartitions(objectPath)); } @Test - public void testListPartitionsWithPartitionedTable() throws TableNotExistException, TableNotPartitionedException { - sql("CREATE TABLE %s (id INT, data VARCHAR) PARTITIONED BY (data) " + - "with ('write.format.default'='%s')", tableName, format.name()); + public void testListPartitionsWithPartitionedTable() + throws TableNotExistException, TableNotPartitionedException { + sql( + "CREATE TABLE %s (id INT, data VARCHAR) PARTITIONED BY (data) " + + "with ('write.format.default'='%s')", + tableName, format.name()); sql("INSERT INTO %s SELECT 1,'a'", tableName); sql("INSERT INTO %s SELECT 2,'b'", tableName); diff --git a/flink/v1.13/flink/src/test/java/org/apache/iceberg/flink/TestFlinkFilters.java b/flink/v1.13/flink/src/test/java/org/apache/iceberg/flink/TestFlinkFilters.java index 0044acf57da2..c89ea4f53054 100644 --- a/flink/v1.13/flink/src/test/java/org/apache/iceberg/flink/TestFlinkFilters.java +++ b/flink/v1.13/flink/src/test/java/org/apache/iceberg/flink/TestFlinkFilters.java @@ -16,7 +16,6 @@ * 
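// --- Editor's sketch (not part of the diff), tied to the listPartitions tests above: how the
// contract looks from caller code. The Catalog instance, database and table name are
// assumptions; a partitioned Iceberg table returns one spec per distinct partition value,
// while an unpartitioned table makes the call throw TableNotPartitionedException.
import java.util.List;
import org.apache.flink.table.catalog.Catalog;
import org.apache.flink.table.catalog.CatalogPartitionSpec;
import org.apache.flink.table.catalog.ObjectPath;
import org.apache.flink.table.catalog.exceptions.TableNotExistException;
import org.apache.flink.table.catalog.exceptions.TableNotPartitionedException;

class ListPartitionsSketch {
  static List<CatalogPartitionSpec> partitions(Catalog catalog, String database, String table)
      throws TableNotExistException, TableNotPartitionedException {
    return catalog.listPartitions(new ObjectPath(database, table));
  }
}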
specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.flink; import java.math.BigDecimal; @@ -56,36 +55,38 @@ public class TestFlinkFilters { - private static final TableSchema TABLE_SCHEMA = TableSchema.builder() - .field("field1", DataTypes.INT()) - .field("field2", DataTypes.BIGINT()) - .field("field3", DataTypes.FLOAT()) - .field("field4", DataTypes.DOUBLE()) - .field("field5", DataTypes.STRING()) - .field("field6", DataTypes.BOOLEAN()) - .field("field7", DataTypes.BINARY(2)) - .field("field8", DataTypes.DECIMAL(10, 2)) - .field("field9", DataTypes.DATE()) - .field("field10", DataTypes.TIME()) - .field("field11", DataTypes.TIMESTAMP()) - .field("field12", DataTypes.TIMESTAMP_WITH_LOCAL_TIME_ZONE()) - .build(); - - // A map list of fields and values used to verify the conversion of flink expression to iceberg expression - private static final List> FIELD_VALUE_LIST = ImmutableList.of( - Pair.of("field1", 1), - Pair.of("field2", 2L), - Pair.of("field3", 3F), - Pair.of("field4", 4D), - Pair.of("field5", "iceberg"), - Pair.of("field6", true), - Pair.of("field7", new byte[] {'a', 'b'}), - Pair.of("field8", BigDecimal.valueOf(10.12)), - Pair.of("field9", DateTimeUtil.daysFromDate(LocalDate.now())), - Pair.of("field10", DateTimeUtil.microsFromTime(LocalTime.now())), - Pair.of("field11", DateTimeUtil.microsFromTimestamp(LocalDateTime.now())), - Pair.of("field12", DateTimeUtil.microsFromInstant(Instant.now())) - ); + private static final TableSchema TABLE_SCHEMA = + TableSchema.builder() + .field("field1", DataTypes.INT()) + .field("field2", DataTypes.BIGINT()) + .field("field3", DataTypes.FLOAT()) + .field("field4", DataTypes.DOUBLE()) + .field("field5", DataTypes.STRING()) + .field("field6", DataTypes.BOOLEAN()) + .field("field7", DataTypes.BINARY(2)) + .field("field8", DataTypes.DECIMAL(10, 2)) + .field("field9", DataTypes.DATE()) + .field("field10", DataTypes.TIME()) + .field("field11", DataTypes.TIMESTAMP()) + .field("field12", DataTypes.TIMESTAMP_WITH_LOCAL_TIME_ZONE()) + .build(); + + // A map list of fields and values used to verify the conversion of flink expression to iceberg + // expression + private static final List> FIELD_VALUE_LIST = + ImmutableList.of( + Pair.of("field1", 1), + Pair.of("field2", 2L), + Pair.of("field3", 3F), + Pair.of("field4", 4D), + Pair.of("field5", "iceberg"), + Pair.of("field6", true), + Pair.of("field7", new byte[] {'a', 'b'}), + Pair.of("field8", BigDecimal.valueOf(10.12)), + Pair.of("field9", DateTimeUtil.daysFromDate(LocalDate.now())), + Pair.of("field10", DateTimeUtil.microsFromTime(LocalTime.now())), + Pair.of("field11", DateTimeUtil.microsFromTimestamp(LocalDateTime.now())), + Pair.of("field12", DateTimeUtil.microsFromInstant(Instant.now()))); @Test public void testFlinkDataTypeEqual() { @@ -114,15 +115,18 @@ public void testFlinkDataTypeEqual() { @Test public void testEquals() { for (Pair pair : FIELD_VALUE_LIST) { - UnboundPredicate expected = org.apache.iceberg.expressions.Expressions.equal(pair.first(), pair.second()); + UnboundPredicate expected = + org.apache.iceberg.expressions.Expressions.equal(pair.first(), pair.second()); Optional actual = - FlinkFilters.convert(resolve(Expressions.$(pair.first()).isEqual(Expressions.lit(pair.second())))); + FlinkFilters.convert( + resolve(Expressions.$(pair.first()).isEqual(Expressions.lit(pair.second())))); Assert.assertTrue("Conversion should succeed", actual.isPresent()); assertPredicatesMatch(expected, actual.get()); Optional actual1 = - 
FlinkFilters.convert(resolve(Expressions.lit(pair.second()).isEqual(Expressions.$(pair.first())))); + FlinkFilters.convert( + resolve(Expressions.lit(pair.second()).isEqual(Expressions.$(pair.first())))); Assert.assertTrue("Conversion should succeed", actual1.isPresent()); assertPredicatesMatch(expected, actual1.get()); } @@ -146,15 +150,18 @@ public void testEqualsNaN() { @Test public void testNotEquals() { for (Pair pair : FIELD_VALUE_LIST) { - UnboundPredicate expected = org.apache.iceberg.expressions.Expressions.notEqual(pair.first(), pair.second()); + UnboundPredicate expected = + org.apache.iceberg.expressions.Expressions.notEqual(pair.first(), pair.second()); Optional actual = - FlinkFilters.convert(resolve(Expressions.$(pair.first()).isNotEqual(Expressions.lit(pair.second())))); + FlinkFilters.convert( + resolve(Expressions.$(pair.first()).isNotEqual(Expressions.lit(pair.second())))); Assert.assertTrue("Conversion should succeed", actual.isPresent()); assertPredicatesMatch(expected, actual.get()); Optional actual1 = - FlinkFilters.convert(resolve(Expressions.lit(pair.second()).isNotEqual(Expressions.$(pair.first())))); + FlinkFilters.convert( + resolve(Expressions.lit(pair.second()).isNotEqual(Expressions.$(pair.first())))); Assert.assertTrue("Conversion should succeed", actual1.isPresent()); assertPredicatesMatch(expected, actual1.get()); } @@ -165,19 +172,22 @@ public void testNotEqualsNaN() { UnboundPredicate expected = org.apache.iceberg.expressions.Expressions.notNaN("field3"); Optional actual = - FlinkFilters.convert(resolve(Expressions.$("field3").isNotEqual(Expressions.lit(Float.NaN)))); + FlinkFilters.convert( + resolve(Expressions.$("field3").isNotEqual(Expressions.lit(Float.NaN)))); Assert.assertTrue("Conversion should succeed", actual.isPresent()); assertPredicatesMatch(expected, actual.get()); Optional actual1 = - FlinkFilters.convert(resolve(Expressions.lit(Float.NaN).isNotEqual(Expressions.$("field3")))); + FlinkFilters.convert( + resolve(Expressions.lit(Float.NaN).isNotEqual(Expressions.$("field3")))); Assert.assertTrue("Conversion should succeed", actual1.isPresent()); assertPredicatesMatch(expected, actual1.get()); } @Test public void testGreaterThan() { - UnboundPredicate expected = org.apache.iceberg.expressions.Expressions.greaterThan("field1", 1); + UnboundPredicate expected = + org.apache.iceberg.expressions.Expressions.greaterThan("field1", 1); Optional actual = FlinkFilters.convert(resolve(Expressions.$("field1").isGreater(Expressions.lit(1)))); @@ -192,7 +202,8 @@ public void testGreaterThan() { @Test public void testGreaterThanEquals() { - UnboundPredicate expected = org.apache.iceberg.expressions.Expressions.greaterThanOrEqual("field1", 1); + UnboundPredicate expected = + org.apache.iceberg.expressions.Expressions.greaterThanOrEqual("field1", 1); Optional actual = FlinkFilters.convert(resolve(Expressions.$("field1").isGreaterOrEqual(Expressions.lit(1)))); @@ -207,7 +218,8 @@ public void testGreaterThanEquals() { @Test public void testLessThan() { - UnboundPredicate expected = org.apache.iceberg.expressions.Expressions.lessThan("field1", 1); + UnboundPredicate expected = + org.apache.iceberg.expressions.Expressions.lessThan("field1", 1); Optional actual = FlinkFilters.convert(resolve(Expressions.$("field1").isLess(Expressions.lit(1)))); @@ -222,7 +234,8 @@ public void testLessThan() { @Test public void testLessThanEquals() { - UnboundPredicate expected = org.apache.iceberg.expressions.Expressions.lessThanOrEqual("field1", 1); + UnboundPredicate expected = 
+ org.apache.iceberg.expressions.Expressions.lessThanOrEqual("field1", 1); Optional actual = FlinkFilters.convert(resolve(Expressions.$("field1").isLessOrEqual(Expressions.lit(1)))); @@ -249,20 +262,26 @@ public void testIsNotNull() { Expression expr = resolve(Expressions.$("field1").isNotNull()); Optional actual = FlinkFilters.convert(expr); Assert.assertTrue("Conversion should succeed", actual.isPresent()); - UnboundPredicate expected = org.apache.iceberg.expressions.Expressions.notNull("field1"); + UnboundPredicate expected = + org.apache.iceberg.expressions.Expressions.notNull("field1"); assertPredicatesMatch(expected, actual.get()); } @Test public void testAnd() { - Expression expr = resolve( - Expressions.$("field1").isEqual(Expressions.lit(1)).and(Expressions.$("field2").isEqual(Expressions.lit(2L)))); + Expression expr = + resolve( + Expressions.$("field1") + .isEqual(Expressions.lit(1)) + .and(Expressions.$("field2").isEqual(Expressions.lit(2L)))); Optional actual = FlinkFilters.convert(expr); Assert.assertTrue("Conversion should succeed", actual.isPresent()); And and = (And) actual.get(); - And expected = (And) org.apache.iceberg.expressions.Expressions.and( - org.apache.iceberg.expressions.Expressions.equal("field1", 1), - org.apache.iceberg.expressions.Expressions.equal("field2", 2L)); + And expected = + (And) + org.apache.iceberg.expressions.Expressions.and( + org.apache.iceberg.expressions.Expressions.equal("field1", 1), + org.apache.iceberg.expressions.Expressions.equal("field2", 2L)); assertPredicatesMatch(expected.left(), and.left()); assertPredicatesMatch(expected.right(), and.right()); @@ -270,14 +289,19 @@ public void testAnd() { @Test public void testOr() { - Expression expr = resolve( - Expressions.$("field1").isEqual(Expressions.lit(1)).or(Expressions.$("field2").isEqual(Expressions.lit(2L)))); + Expression expr = + resolve( + Expressions.$("field1") + .isEqual(Expressions.lit(1)) + .or(Expressions.$("field2").isEqual(Expressions.lit(2L)))); Optional actual = FlinkFilters.convert(expr); Assert.assertTrue("Conversion should succeed", actual.isPresent()); Or or = (Or) actual.get(); - Or expected = (Or) org.apache.iceberg.expressions.Expressions.or( - org.apache.iceberg.expressions.Expressions.equal("field1", 1), - org.apache.iceberg.expressions.Expressions.equal("field2", 2L)); + Or expected = + (Or) + org.apache.iceberg.expressions.Expressions.or( + org.apache.iceberg.expressions.Expressions.equal("field1", 1), + org.apache.iceberg.expressions.Expressions.equal("field2", 2L)); assertPredicatesMatch(expected.left(), or.left()); assertPredicatesMatch(expected.right(), or.right()); @@ -285,13 +309,18 @@ public void testOr() { @Test public void testNot() { - Expression expr = resolve(ApiExpressionUtils.unresolvedCall( - BuiltInFunctionDefinitions.NOT, Expressions.$("field1").isEqual(Expressions.lit(1)))); + Expression expr = + resolve( + ApiExpressionUtils.unresolvedCall( + BuiltInFunctionDefinitions.NOT, + Expressions.$("field1").isEqual(Expressions.lit(1)))); Optional actual = FlinkFilters.convert(expr); Assert.assertTrue("Conversion should succeed", actual.isPresent()); Not not = (Not) actual.get(); - Not expected = (Not) org.apache.iceberg.expressions.Expressions.not( - org.apache.iceberg.expressions.Expressions.equal("field1", 1)); + Not expected = + (Not) + org.apache.iceberg.expressions.Expressions.not( + org.apache.iceberg.expressions.Expressions.equal("field1", 1)); Assert.assertEquals("Predicate operation should match", expected.op(), not.op()); 
assertPredicatesMatch(expected.child(), not.child()); @@ -299,40 +328,59 @@ public void testNot() { @Test public void testLike() { - UnboundPredicate expected = org.apache.iceberg.expressions.Expressions.startsWith("field5", "abc"); - Expression expr = resolve(ApiExpressionUtils.unresolvedCall( - BuiltInFunctionDefinitions.LIKE, Expressions.$("field5"), Expressions.lit("abc%"))); + UnboundPredicate expected = + org.apache.iceberg.expressions.Expressions.startsWith("field5", "abc"); + Expression expr = + resolve( + ApiExpressionUtils.unresolvedCall( + BuiltInFunctionDefinitions.LIKE, Expressions.$("field5"), Expressions.lit("abc%"))); Optional actual = FlinkFilters.convert(expr); Assert.assertTrue("Conversion should succeed", actual.isPresent()); assertPredicatesMatch(expected, actual.get()); - expr = resolve(ApiExpressionUtils - .unresolvedCall(BuiltInFunctionDefinitions.LIKE, Expressions.$("field5"), Expressions.lit("%abc"))); + expr = + resolve( + ApiExpressionUtils.unresolvedCall( + BuiltInFunctionDefinitions.LIKE, Expressions.$("field5"), Expressions.lit("%abc"))); actual = FlinkFilters.convert(expr); Assert.assertFalse("Conversion should failed", actual.isPresent()); - expr = resolve(ApiExpressionUtils.unresolvedCall( - BuiltInFunctionDefinitions.LIKE, Expressions.$("field5"), Expressions.lit("%abc%"))); + expr = + resolve( + ApiExpressionUtils.unresolvedCall( + BuiltInFunctionDefinitions.LIKE, + Expressions.$("field5"), + Expressions.lit("%abc%"))); actual = FlinkFilters.convert(expr); Assert.assertFalse("Conversion should failed", actual.isPresent()); - expr = resolve(ApiExpressionUtils.unresolvedCall( - BuiltInFunctionDefinitions.LIKE, Expressions.$("field5"), Expressions.lit("abc%d"))); + expr = + resolve( + ApiExpressionUtils.unresolvedCall( + BuiltInFunctionDefinitions.LIKE, + Expressions.$("field5"), + Expressions.lit("abc%d"))); actual = FlinkFilters.convert(expr); Assert.assertFalse("Conversion should failed", actual.isPresent()); - expr = resolve(ApiExpressionUtils.unresolvedCall( - BuiltInFunctionDefinitions.LIKE, Expressions.$("field5"), Expressions.lit("%"))); + expr = + resolve( + ApiExpressionUtils.unresolvedCall( + BuiltInFunctionDefinitions.LIKE, Expressions.$("field5"), Expressions.lit("%"))); actual = FlinkFilters.convert(expr); Assert.assertFalse("Conversion should failed", actual.isPresent()); - expr = resolve(ApiExpressionUtils.unresolvedCall( - BuiltInFunctionDefinitions.LIKE, Expressions.$("field5"), Expressions.lit("a_"))); + expr = + resolve( + ApiExpressionUtils.unresolvedCall( + BuiltInFunctionDefinitions.LIKE, Expressions.$("field5"), Expressions.lit("a_"))); actual = FlinkFilters.convert(expr); Assert.assertFalse("Conversion should failed", actual.isPresent()); - expr = resolve(ApiExpressionUtils.unresolvedCall( - BuiltInFunctionDefinitions.LIKE, Expressions.$("field5"), Expressions.lit("a%b"))); + expr = + resolve( + ApiExpressionUtils.unresolvedCall( + BuiltInFunctionDefinitions.LIKE, Expressions.$("field5"), Expressions.lit("a%b"))); actual = FlinkFilters.convert(expr); Assert.assertFalse("Conversion should failed", actual.isPresent()); } @@ -343,13 +391,15 @@ private void matchLiteral(String fieldName, Object flinkLiteral, T icebergLi Optional actual = FlinkFilters.convert(expr); Assert.assertTrue("Conversion should succeed", actual.isPresent()); org.apache.iceberg.expressions.Expression expression = actual.get(); - Assertions.assertThat(expression).as("The expression should be a UnboundPredicate") + Assertions.assertThat(expression) + .as("The 
expression should be a UnboundPredicate") .isInstanceOf(UnboundPredicate.class); UnboundPredicate unboundPredicate = (UnboundPredicate) expression; org.apache.iceberg.expressions.Expression expression1 = unboundPredicate.bind(FlinkSchemaUtil.convert(TABLE_SCHEMA).asStruct(), false); - Assertions.assertThat(expression1).as("The expression should be a BoundLiteralPredicate") + Assertions.assertThat(expression1) + .as("The expression should be a BoundLiteralPredicate") .isInstanceOf(BoundLiteralPredicate.class); BoundLiteralPredicate predicate = (BoundLiteralPredicate) expression1; @@ -357,49 +407,61 @@ private void matchLiteral(String fieldName, Object flinkLiteral, T icebergLi } private static Expression resolve(Expression originalExpression) { - return originalExpression.accept(new ApiExpressionDefaultVisitor() { - @Override - public Expression visit(UnresolvedReferenceExpression unresolvedReference) { - String name = unresolvedReference.getName(); - Optional field = TABLE_SCHEMA.getTableColumn(name); - if (field.isPresent()) { - int index = TABLE_SCHEMA.getTableColumns().indexOf(field.get()); - return new FieldReferenceExpression(name, field.get().getType(), 0, index); - } else { - return null; - } - } - - @Override - public Expression visit(UnresolvedCallExpression unresolvedCall) { - List children = - unresolvedCall.getChildren().stream().map(e -> (ResolvedExpression) e.accept(this)) - .collect(Collectors.toList()); - return new CallExpression(unresolvedCall.getFunctionDefinition(), children, DataTypes.STRING()); - } - - @Override - public Expression visit(ValueLiteralExpression valueLiteral) { - return valueLiteral; - } - - @Override - protected Expression defaultMethod(Expression expression) { - throw new UnsupportedOperationException(String.format("unsupported expression: %s", expression)); - } - }); + return originalExpression.accept( + new ApiExpressionDefaultVisitor() { + @Override + public Expression visit(UnresolvedReferenceExpression unresolvedReference) { + String name = unresolvedReference.getName(); + Optional field = TABLE_SCHEMA.getTableColumn(name); + if (field.isPresent()) { + int index = TABLE_SCHEMA.getTableColumns().indexOf(field.get()); + return new FieldReferenceExpression(name, field.get().getType(), 0, index); + } else { + return null; + } + } + + @Override + public Expression visit(UnresolvedCallExpression unresolvedCall) { + List children = + unresolvedCall.getChildren().stream() + .map(e -> (ResolvedExpression) e.accept(this)) + .collect(Collectors.toList()); + return new CallExpression( + unresolvedCall.getFunctionDefinition(), children, DataTypes.STRING()); + } + + @Override + public Expression visit(ValueLiteralExpression valueLiteral) { + return valueLiteral; + } + + @Override + protected Expression defaultMethod(Expression expression) { + throw new UnsupportedOperationException( + String.format("unsupported expression: %s", expression)); + } + }); } - private void assertPredicatesMatch(org.apache.iceberg.expressions.Expression expected, - org.apache.iceberg.expressions.Expression actual) { - Assertions.assertThat(expected).as("The expected expression should be a UnboundPredicate") + private void assertPredicatesMatch( + org.apache.iceberg.expressions.Expression expected, + org.apache.iceberg.expressions.Expression actual) { + Assertions.assertThat(expected) + .as("The expected expression should be a UnboundPredicate") .isInstanceOf(UnboundPredicate.class); - Assertions.assertThat(actual).as("The actual expression should be a UnboundPredicate") + 
Assertions.assertThat(actual) + .as("The actual expression should be a UnboundPredicate") .isInstanceOf(UnboundPredicate.class); UnboundPredicate predicateExpected = (UnboundPredicate) expected; UnboundPredicate predicateActual = (UnboundPredicate) actual; - Assert.assertEquals("Predicate operation should match", predicateExpected.op(), predicateActual.op()); - Assert.assertEquals("Predicate literal should match", predicateExpected.literal(), predicateActual.literal()); - Assert.assertEquals("Predicate name should match", predicateExpected.ref().name(), predicateActual.ref().name()); + Assert.assertEquals( + "Predicate operation should match", predicateExpected.op(), predicateActual.op()); + Assert.assertEquals( + "Predicate literal should match", predicateExpected.literal(), predicateActual.literal()); + Assert.assertEquals( + "Predicate name should match", + predicateExpected.ref().name(), + predicateActual.ref().name()); } } diff --git a/flink/v1.13/flink/src/test/java/org/apache/iceberg/flink/TestFlinkHiveCatalog.java b/flink/v1.13/flink/src/test/java/org/apache/iceberg/flink/TestFlinkHiveCatalog.java index 24065015795e..64746356636b 100644 --- a/flink/v1.13/flink/src/test/java/org/apache/iceberg/flink/TestFlinkHiveCatalog.java +++ b/flink/v1.13/flink/src/test/java/org/apache/iceberg/flink/TestFlinkHiveCatalog.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.flink; import java.io.File; @@ -36,8 +35,7 @@ public class TestFlinkHiveCatalog extends FlinkTestBase { - @Rule - public TemporaryFolder tempFolder = new TemporaryFolder(); + @Rule public TemporaryFolder tempFolder = new TemporaryFolder(); @Test public void testCreateCatalogWithWarehouseLocation() throws IOException { @@ -61,7 +59,8 @@ public void testCreateCatalogWithHiveConfDir() throws IOException { try (FileOutputStream fos = new FileOutputStream(hiveSiteXML)) { Configuration newConf = new Configuration(hiveConf); // Set another new directory which is different with the hive metastore's warehouse path. 
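// --- Editor's sketch (not part of the diff), tied to the TestFlinkFilters hunks above: only
// prefix LIKE patterns ('abc%') survive conversion, ending up as an Iceberg startsWith
// predicate; other patterns yield Optional.empty(). The one-column schema is an assumption.
import org.apache.iceberg.Schema;
import org.apache.iceberg.expressions.Expression;
import org.apache.iceberg.expressions.Expressions;
import org.apache.iceberg.expressions.UnboundPredicate;
import org.apache.iceberg.types.Types;

class StartsWithSketch {
  static Expression boundStartsWith() {
    Schema schema = new Schema(Types.NestedField.optional(1, "field5", Types.StringType.get()));
    UnboundPredicate<String> startsWith = Expressions.startsWith("field5", "abc");
    // Binding resolves the column reference against a struct, as matchLiteral() does above.
    return startsWith.bind(schema.asStruct(), true /* case sensitive */);
  }
}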
- newConf.set(HiveConf.ConfVars.METASTOREWAREHOUSE.varname, "file://" + warehouseDir.getAbsolutePath()); + newConf.set( + HiveConf.ConfVars.METASTOREWAREHOUSE.varname, "file://" + warehouseDir.getAbsolutePath()); newConf.writeXml(fos); } Assert.assertTrue("hive-site.xml should be created now.", Files.exists(hiveSiteXML.toPath())); @@ -77,8 +76,11 @@ public void testCreateCatalogWithHiveConfDir() throws IOException { checkSQLQuery(props, warehouseDir); } - private void checkSQLQuery(Map catalogProperties, File warehouseDir) throws IOException { - sql("CREATE CATALOG test_catalog WITH %s", FlinkCatalogTestBase.toWithClause(catalogProperties)); + private void checkSQLQuery(Map catalogProperties, File warehouseDir) + throws IOException { + sql( + "CREATE CATALOG test_catalog WITH %s", + FlinkCatalogTestBase.toWithClause(catalogProperties)); sql("USE CATALOG test_catalog"); sql("CREATE DATABASE test_db"); sql("USE test_db"); @@ -93,7 +95,8 @@ private void checkSQLQuery(Map catalogProperties, File warehouse Path dataPath = tablePath.resolve("data"); Assert.assertTrue("Table data path should exist", Files.exists(dataPath)); - Assert.assertEquals("Should have a .crc file and a .parquet file", 2, Files.list(dataPath).count()); + Assert.assertEquals( + "Should have a .crc file and a .parquet file", 2, Files.list(dataPath).count()); sql("DROP TABLE test_table"); sql("DROP DATABASE test_db"); diff --git a/flink/v1.13/flink/src/test/java/org/apache/iceberg/flink/TestFlinkSchemaUtil.java b/flink/v1.13/flink/src/test/java/org/apache/iceberg/flink/TestFlinkSchemaUtil.java index 01f8524464e0..b5dfb9cb2f6b 100644 --- a/flink/v1.13/flink/src/test/java/org/apache/iceberg/flink/TestFlinkSchemaUtil.java +++ b/flink/v1.13/flink/src/test/java/org/apache/iceberg/flink/TestFlinkSchemaUtil.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
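// --- Editor's sketch (not part of the diff), tied to testCreateCatalogWithHiveConfDir above:
// registering an Iceberg Hive catalog whose Hadoop/Hive configuration is loaded from a
// directory containing hive-site.xml. The metastore URI and the conf directory path are
// assumptions, not values taken from the test.
import org.apache.flink.table.api.EnvironmentSettings;
import org.apache.flink.table.api.TableEnvironment;

class HiveConfDirSketch {
  static void createCatalog(String hiveConfDir) {
    TableEnvironment tEnv =
        TableEnvironment.create(EnvironmentSettings.newInstance().inBatchMode().build());
    tEnv.executeSql(
        "CREATE CATALOG test_catalog WITH ("
            + "'type'='iceberg', 'catalog-type'='hive', "
            + "'uri'='thrift://localhost:9083', " // assumed metastore endpoint
            + "'hive-conf-dir'='" + hiveConfDir + "')");
    tEnv.executeSql("USE CATALOG test_catalog");
  }
}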
*/ - package org.apache.iceberg.flink; import org.apache.flink.table.api.DataTypes; @@ -45,194 +44,270 @@ public class TestFlinkSchemaUtil { @Test public void testConvertFlinkSchemaToIcebergSchema() { - TableSchema flinkSchema = TableSchema.builder() - .field("id", DataTypes.INT().notNull()) - .field("name", DataTypes.STRING()) /* optional by default */ - .field("salary", DataTypes.DOUBLE().notNull()) - .field("locations", DataTypes.MAP(DataTypes.STRING(), - DataTypes.ROW(DataTypes.FIELD("posX", DataTypes.DOUBLE().notNull(), "X field"), - DataTypes.FIELD("posY", DataTypes.DOUBLE().notNull(), "Y field")))) - .field("strArray", DataTypes.ARRAY(DataTypes.STRING()).nullable()) - .field("intArray", DataTypes.ARRAY(DataTypes.INT()).nullable()) - .field("char", DataTypes.CHAR(10).notNull()) - .field("varchar", DataTypes.VARCHAR(10).notNull()) - .field("boolean", DataTypes.BOOLEAN().nullable()) - .field("tinyint", DataTypes.TINYINT()) - .field("smallint", DataTypes.SMALLINT()) - .field("bigint", DataTypes.BIGINT()) - .field("varbinary", DataTypes.VARBINARY(10)) - .field("binary", DataTypes.BINARY(10)) - .field("time", DataTypes.TIME()) - .field("timestampWithoutZone", DataTypes.TIMESTAMP()) - .field("timestampWithZone", DataTypes.TIMESTAMP_WITH_LOCAL_TIME_ZONE()) - .field("date", DataTypes.DATE()) - .field("decimal", DataTypes.DECIMAL(2, 2)) - .field("decimal2", DataTypes.DECIMAL(38, 2)) - .field("decimal3", DataTypes.DECIMAL(10, 1)) - .field("multiset", DataTypes.MULTISET(DataTypes.STRING().notNull())) - .build(); + TableSchema flinkSchema = + TableSchema.builder() + .field("id", DataTypes.INT().notNull()) + .field("name", DataTypes.STRING()) /* optional by default */ + .field("salary", DataTypes.DOUBLE().notNull()) + .field( + "locations", + DataTypes.MAP( + DataTypes.STRING(), + DataTypes.ROW( + DataTypes.FIELD("posX", DataTypes.DOUBLE().notNull(), "X field"), + DataTypes.FIELD("posY", DataTypes.DOUBLE().notNull(), "Y field")))) + .field("strArray", DataTypes.ARRAY(DataTypes.STRING()).nullable()) + .field("intArray", DataTypes.ARRAY(DataTypes.INT()).nullable()) + .field("char", DataTypes.CHAR(10).notNull()) + .field("varchar", DataTypes.VARCHAR(10).notNull()) + .field("boolean", DataTypes.BOOLEAN().nullable()) + .field("tinyint", DataTypes.TINYINT()) + .field("smallint", DataTypes.SMALLINT()) + .field("bigint", DataTypes.BIGINT()) + .field("varbinary", DataTypes.VARBINARY(10)) + .field("binary", DataTypes.BINARY(10)) + .field("time", DataTypes.TIME()) + .field("timestampWithoutZone", DataTypes.TIMESTAMP()) + .field("timestampWithZone", DataTypes.TIMESTAMP_WITH_LOCAL_TIME_ZONE()) + .field("date", DataTypes.DATE()) + .field("decimal", DataTypes.DECIMAL(2, 2)) + .field("decimal2", DataTypes.DECIMAL(38, 2)) + .field("decimal3", DataTypes.DECIMAL(10, 1)) + .field("multiset", DataTypes.MULTISET(DataTypes.STRING().notNull())) + .build(); - Schema icebergSchema = new Schema( - Types.NestedField.required(0, "id", Types.IntegerType.get(), null), - Types.NestedField.optional(1, "name", Types.StringType.get(), null), - Types.NestedField.required(2, "salary", Types.DoubleType.get(), null), - Types.NestedField.optional(3, "locations", Types.MapType.ofOptional(24, 25, - Types.StringType.get(), - Types.StructType.of( - Types.NestedField.required(22, "posX", Types.DoubleType.get(), "X field"), - Types.NestedField.required(23, "posY", Types.DoubleType.get(), "Y field") - ))), - Types.NestedField.optional(4, "strArray", Types.ListType.ofOptional(26, Types.StringType.get())), - Types.NestedField.optional(5, 
"intArray", Types.ListType.ofOptional(27, Types.IntegerType.get())), - Types.NestedField.required(6, "char", Types.StringType.get()), - Types.NestedField.required(7, "varchar", Types.StringType.get()), - Types.NestedField.optional(8, "boolean", Types.BooleanType.get()), - Types.NestedField.optional(9, "tinyint", Types.IntegerType.get()), - Types.NestedField.optional(10, "smallint", Types.IntegerType.get()), - Types.NestedField.optional(11, "bigint", Types.LongType.get()), - Types.NestedField.optional(12, "varbinary", Types.BinaryType.get()), - Types.NestedField.optional(13, "binary", Types.FixedType.ofLength(10)), - Types.NestedField.optional(14, "time", Types.TimeType.get()), - Types.NestedField.optional(15, "timestampWithoutZone", Types.TimestampType.withoutZone()), - Types.NestedField.optional(16, "timestampWithZone", Types.TimestampType.withZone()), - Types.NestedField.optional(17, "date", Types.DateType.get()), - Types.NestedField.optional(18, "decimal", Types.DecimalType.of(2, 2)), - Types.NestedField.optional(19, "decimal2", Types.DecimalType.of(38, 2)), - Types.NestedField.optional(20, "decimal3", Types.DecimalType.of(10, 1)), - Types.NestedField.optional(21, "multiset", Types.MapType.ofRequired(28, 29, - Types.StringType.get(), - Types.IntegerType.get())) - ); + Schema icebergSchema = + new Schema( + Types.NestedField.required(0, "id", Types.IntegerType.get(), null), + Types.NestedField.optional(1, "name", Types.StringType.get(), null), + Types.NestedField.required(2, "salary", Types.DoubleType.get(), null), + Types.NestedField.optional( + 3, + "locations", + Types.MapType.ofOptional( + 24, + 25, + Types.StringType.get(), + Types.StructType.of( + Types.NestedField.required(22, "posX", Types.DoubleType.get(), "X field"), + Types.NestedField.required( + 23, "posY", Types.DoubleType.get(), "Y field")))), + Types.NestedField.optional( + 4, "strArray", Types.ListType.ofOptional(26, Types.StringType.get())), + Types.NestedField.optional( + 5, "intArray", Types.ListType.ofOptional(27, Types.IntegerType.get())), + Types.NestedField.required(6, "char", Types.StringType.get()), + Types.NestedField.required(7, "varchar", Types.StringType.get()), + Types.NestedField.optional(8, "boolean", Types.BooleanType.get()), + Types.NestedField.optional(9, "tinyint", Types.IntegerType.get()), + Types.NestedField.optional(10, "smallint", Types.IntegerType.get()), + Types.NestedField.optional(11, "bigint", Types.LongType.get()), + Types.NestedField.optional(12, "varbinary", Types.BinaryType.get()), + Types.NestedField.optional(13, "binary", Types.FixedType.ofLength(10)), + Types.NestedField.optional(14, "time", Types.TimeType.get()), + Types.NestedField.optional( + 15, "timestampWithoutZone", Types.TimestampType.withoutZone()), + Types.NestedField.optional(16, "timestampWithZone", Types.TimestampType.withZone()), + Types.NestedField.optional(17, "date", Types.DateType.get()), + Types.NestedField.optional(18, "decimal", Types.DecimalType.of(2, 2)), + Types.NestedField.optional(19, "decimal2", Types.DecimalType.of(38, 2)), + Types.NestedField.optional(20, "decimal3", Types.DecimalType.of(10, 1)), + Types.NestedField.optional( + 21, + "multiset", + Types.MapType.ofRequired(28, 29, Types.StringType.get(), Types.IntegerType.get()))); checkSchema(flinkSchema, icebergSchema); } @Test public void testMapField() { - TableSchema flinkSchema = TableSchema.builder() - .field("map_int_long", DataTypes.MAP(DataTypes.INT(), DataTypes.BIGINT()).notNull()) /* Required */ - .field("map_int_array_string", 
DataTypes.MAP(DataTypes.ARRAY(DataTypes.INT()), DataTypes.STRING())) - .field("map_decimal_string", DataTypes.MAP(DataTypes.DECIMAL(10, 2), DataTypes.STRING())) - .field("map_fields_fields", - DataTypes.MAP( - DataTypes.ROW( - DataTypes.FIELD("field_int", DataTypes.INT(), "doc - int"), - DataTypes.FIELD("field_string", DataTypes.STRING(), "doc - string") - ).notNull(), /* Required */ - DataTypes.ROW( - DataTypes.FIELD("field_array", DataTypes.ARRAY(DataTypes.STRING()), "doc - array") - ).notNull() /* Required */ - ).notNull() /* Required */ - ) - .build(); + TableSchema flinkSchema = + TableSchema.builder() + .field( + "map_int_long", + DataTypes.MAP(DataTypes.INT(), DataTypes.BIGINT()).notNull()) /* Required */ + .field( + "map_int_array_string", + DataTypes.MAP(DataTypes.ARRAY(DataTypes.INT()), DataTypes.STRING())) + .field( + "map_decimal_string", DataTypes.MAP(DataTypes.DECIMAL(10, 2), DataTypes.STRING())) + .field( + "map_fields_fields", + DataTypes.MAP( + DataTypes.ROW( + DataTypes.FIELD("field_int", DataTypes.INT(), "doc - int"), + DataTypes.FIELD("field_string", DataTypes.STRING(), "doc - string")) + .notNull(), /* Required */ + DataTypes.ROW( + DataTypes.FIELD( + "field_array", + DataTypes.ARRAY(DataTypes.STRING()), + "doc - array")) + .notNull() /* Required */) + .notNull() /* Required */) + .build(); - Schema icebergSchema = new Schema( - Types.NestedField.required(0, "map_int_long", - Types.MapType.ofOptional(4, 5, Types.IntegerType.get(), Types.LongType.get()), null), - Types.NestedField.optional(1, "map_int_array_string", - Types.MapType.ofOptional(7, 8, - Types.ListType.ofOptional(6, Types.IntegerType.get()), Types.StringType.get()), null), - Types.NestedField.optional(2, "map_decimal_string", Types.MapType.ofOptional(9, 10, - Types.DecimalType.of(10, 2), Types.StringType.get())), - Types.NestedField.required(3, "map_fields_fields", - Types.MapType.ofRequired( - 15, 16, - Types.StructType.of(Types.NestedField.optional(11, "field_int", Types.IntegerType.get(), "doc - int"), - Types.NestedField.optional(12, "field_string", Types.StringType.get(), "doc - string")), - Types.StructType.of(Types.NestedField.optional(14, "field_array", - Types.ListType.ofOptional(13, Types.StringType.get()), "doc - array")) - ) - ) - ); + Schema icebergSchema = + new Schema( + Types.NestedField.required( + 0, + "map_int_long", + Types.MapType.ofOptional(4, 5, Types.IntegerType.get(), Types.LongType.get()), + null), + Types.NestedField.optional( + 1, + "map_int_array_string", + Types.MapType.ofOptional( + 7, + 8, + Types.ListType.ofOptional(6, Types.IntegerType.get()), + Types.StringType.get()), + null), + Types.NestedField.optional( + 2, + "map_decimal_string", + Types.MapType.ofOptional( + 9, 10, Types.DecimalType.of(10, 2), Types.StringType.get())), + Types.NestedField.required( + 3, + "map_fields_fields", + Types.MapType.ofRequired( + 15, + 16, + Types.StructType.of( + Types.NestedField.optional( + 11, "field_int", Types.IntegerType.get(), "doc - int"), + Types.NestedField.optional( + 12, "field_string", Types.StringType.get(), "doc - string")), + Types.StructType.of( + Types.NestedField.optional( + 14, + "field_array", + Types.ListType.ofOptional(13, Types.StringType.get()), + "doc - array"))))); checkSchema(flinkSchema, icebergSchema); } @Test public void testStructField() { - TableSchema flinkSchema = TableSchema.builder() - .field("struct_int_string_decimal", DataTypes.ROW( - DataTypes.FIELD("field_int", DataTypes.INT()), - DataTypes.FIELD("field_string", DataTypes.STRING()), - 
DataTypes.FIELD("field_decimal", DataTypes.DECIMAL(19, 2)), - DataTypes.FIELD("field_struct", DataTypes.ROW( - DataTypes.FIELD("inner_struct_int", DataTypes.INT()), - DataTypes.FIELD("inner_struct_float_array", DataTypes.ARRAY(DataTypes.FLOAT())) - ).notNull()) /* Row is required */ - ).notNull()) /* Required */ - .field("struct_map_int_int", DataTypes.ROW( - DataTypes.FIELD("field_map", DataTypes.MAP(DataTypes.INT(), DataTypes.INT())) - ).nullable()) /* Optional */ - .build(); + TableSchema flinkSchema = + TableSchema.builder() + .field( + "struct_int_string_decimal", + DataTypes.ROW( + DataTypes.FIELD("field_int", DataTypes.INT()), + DataTypes.FIELD("field_string", DataTypes.STRING()), + DataTypes.FIELD("field_decimal", DataTypes.DECIMAL(19, 2)), + DataTypes.FIELD( + "field_struct", + DataTypes.ROW( + DataTypes.FIELD("inner_struct_int", DataTypes.INT()), + DataTypes.FIELD( + "inner_struct_float_array", + DataTypes.ARRAY(DataTypes.FLOAT()))) + .notNull()) /* Row is required */) + .notNull()) /* Required */ + .field( + "struct_map_int_int", + DataTypes.ROW( + DataTypes.FIELD( + "field_map", DataTypes.MAP(DataTypes.INT(), DataTypes.INT()))) + .nullable()) /* Optional */ + .build(); - Schema icebergSchema = new Schema( - Types.NestedField.required(0, "struct_int_string_decimal", - Types.StructType.of( - Types.NestedField.optional(5, "field_int", Types.IntegerType.get()), - Types.NestedField.optional(6, "field_string", Types.StringType.get()), - Types.NestedField.optional(7, "field_decimal", Types.DecimalType.of(19, 2)), - Types.NestedField.required(8, "field_struct", - Types.StructType.of( - Types.NestedField.optional(3, "inner_struct_int", Types.IntegerType.get()), - Types.NestedField.optional(4, "inner_struct_float_array", - Types.ListType.ofOptional(2, Types.FloatType.get())) - )) - )), - Types.NestedField.optional(1, "struct_map_int_int", - Types.StructType.of( - Types.NestedField.optional(11, "field_map", Types.MapType.ofOptional(9, 10, - Types.IntegerType.get(), Types.IntegerType.get())) - ) - ) - ); + Schema icebergSchema = + new Schema( + Types.NestedField.required( + 0, + "struct_int_string_decimal", + Types.StructType.of( + Types.NestedField.optional(5, "field_int", Types.IntegerType.get()), + Types.NestedField.optional(6, "field_string", Types.StringType.get()), + Types.NestedField.optional(7, "field_decimal", Types.DecimalType.of(19, 2)), + Types.NestedField.required( + 8, + "field_struct", + Types.StructType.of( + Types.NestedField.optional( + 3, "inner_struct_int", Types.IntegerType.get()), + Types.NestedField.optional( + 4, + "inner_struct_float_array", + Types.ListType.ofOptional(2, Types.FloatType.get())))))), + Types.NestedField.optional( + 1, + "struct_map_int_int", + Types.StructType.of( + Types.NestedField.optional( + 11, + "field_map", + Types.MapType.ofOptional( + 9, 10, Types.IntegerType.get(), Types.IntegerType.get()))))); checkSchema(flinkSchema, icebergSchema); } @Test public void testListField() { - TableSchema flinkSchema = TableSchema.builder() - .field("list_struct_fields", DataTypes.ARRAY( - DataTypes.ROW( - DataTypes.FIELD("field_int", DataTypes.INT()) - ) - ).notNull()) /* Required */ - .field("list_optional_struct_fields", DataTypes.ARRAY( - DataTypes.ROW( - DataTypes.FIELD( - "field_timestamp_with_local_time_zone", DataTypes.TIMESTAMP_WITH_LOCAL_TIME_ZONE() - ) - ) - ).nullable()) /* Optional */ - .field("list_map_fields", DataTypes.ARRAY( - DataTypes.MAP( - DataTypes.ARRAY(DataTypes.INT().notNull()), /* Key of map must be required */ - DataTypes.ROW( - 
DataTypes.FIELD("field_0", DataTypes.INT(), "doc - int") - ) - ).notNull() - ).notNull()) /* Required */ - .build(); + TableSchema flinkSchema = + TableSchema.builder() + .field( + "list_struct_fields", + DataTypes.ARRAY(DataTypes.ROW(DataTypes.FIELD("field_int", DataTypes.INT()))) + .notNull()) /* Required */ + .field( + "list_optional_struct_fields", + DataTypes.ARRAY( + DataTypes.ROW( + DataTypes.FIELD( + "field_timestamp_with_local_time_zone", + DataTypes.TIMESTAMP_WITH_LOCAL_TIME_ZONE()))) + .nullable()) /* Optional */ + .field( + "list_map_fields", + DataTypes.ARRAY( + DataTypes.MAP( + DataTypes.ARRAY( + DataTypes.INT().notNull()), /* Key of map must be required */ + DataTypes.ROW( + DataTypes.FIELD("field_0", DataTypes.INT(), "doc - int"))) + .notNull()) + .notNull()) /* Required */ + .build(); - Schema icebergSchema = new Schema( - Types.NestedField.required(0, "list_struct_fields", - Types.ListType.ofOptional(4, Types.StructType.of( - Types.NestedField.optional(3, "field_int", Types.IntegerType.get()) - ))), - Types.NestedField.optional(1, "list_optional_struct_fields", - Types.ListType.ofOptional(6, Types.StructType.of( - Types.NestedField.optional(5, "field_timestamp_with_local_time_zone", Types.TimestampType.withZone()) - ))), - Types.NestedField.required(2, "list_map_fields", - Types.ListType.ofRequired(11, - Types.MapType.ofOptional(9, 10, - Types.ListType.ofRequired(7, Types.IntegerType.get()), + Schema icebergSchema = + new Schema( + Types.NestedField.required( + 0, + "list_struct_fields", + Types.ListType.ofOptional( + 4, Types.StructType.of( - Types.NestedField.optional(8, "field_0", Types.IntegerType.get(), "doc - int") - ) - ) - )) - ); + Types.NestedField.optional(3, "field_int", Types.IntegerType.get())))), + Types.NestedField.optional( + 1, + "list_optional_struct_fields", + Types.ListType.ofOptional( + 6, + Types.StructType.of( + Types.NestedField.optional( + 5, + "field_timestamp_with_local_time_zone", + Types.TimestampType.withZone())))), + Types.NestedField.required( + 2, + "list_map_fields", + Types.ListType.ofRequired( + 11, + Types.MapType.ofOptional( + 9, + 10, + Types.ListType.ofRequired(7, Types.IntegerType.get()), + Types.StructType.of( + Types.NestedField.optional( + 8, "field_0", Types.IntegerType.get(), "doc - int")))))); checkSchema(flinkSchema, icebergSchema); } @@ -242,34 +317,43 @@ private void checkSchema(TableSchema flinkSchema, Schema icebergSchema) { // The conversion is not a 1:1 mapping, so we just check iceberg types. 
Assert.assertEquals( icebergSchema.asStruct(), - FlinkSchemaUtil.convert(FlinkSchemaUtil.toSchema(FlinkSchemaUtil.convert(icebergSchema))).asStruct()); + FlinkSchemaUtil.convert(FlinkSchemaUtil.toSchema(FlinkSchemaUtil.convert(icebergSchema))) + .asStruct()); } @Test public void testInconsistentTypes() { checkInconsistentType( - Types.UUIDType.get(), new BinaryType(16), - new BinaryType(16), Types.FixedType.ofLength(16)); + Types.UUIDType.get(), new BinaryType(16), new BinaryType(16), Types.FixedType.ofLength(16)); checkInconsistentType( - Types.StringType.get(), new VarCharType(VarCharType.MAX_LENGTH), - new CharType(100), Types.StringType.get()); + Types.StringType.get(), + new VarCharType(VarCharType.MAX_LENGTH), + new CharType(100), + Types.StringType.get()); checkInconsistentType( - Types.BinaryType.get(), new VarBinaryType(VarBinaryType.MAX_LENGTH), - new VarBinaryType(100), Types.BinaryType.get()); + Types.BinaryType.get(), + new VarBinaryType(VarBinaryType.MAX_LENGTH), + new VarBinaryType(100), + Types.BinaryType.get()); checkInconsistentType( - Types.TimeType.get(), new TimeType(), - new TimeType(3), Types.TimeType.get()); + Types.TimeType.get(), new TimeType(), new TimeType(3), Types.TimeType.get()); checkInconsistentType( - Types.TimestampType.withoutZone(), new TimestampType(6), - new TimestampType(3), Types.TimestampType.withoutZone()); + Types.TimestampType.withoutZone(), + new TimestampType(6), + new TimestampType(3), + Types.TimestampType.withoutZone()); checkInconsistentType( - Types.TimestampType.withZone(), new LocalZonedTimestampType(6), - new LocalZonedTimestampType(3), Types.TimestampType.withZone()); + Types.TimestampType.withZone(), + new LocalZonedTimestampType(6), + new LocalZonedTimestampType(3), + Types.TimestampType.withZone()); } private void checkInconsistentType( - Type icebergType, LogicalType flinkExpectedType, - LogicalType flinkType, Type icebergExpectedType) { + Type icebergType, + LogicalType flinkExpectedType, + LogicalType flinkType, + Type icebergExpectedType) { Assert.assertEquals(flinkExpectedType, FlinkSchemaUtil.convert(icebergType)); Assert.assertEquals( Types.StructType.of(Types.NestedField.optional(0, "f0", icebergExpectedType)), @@ -278,19 +362,19 @@ private void checkInconsistentType( @Test public void testConvertFlinkSchemaBaseOnIcebergSchema() { - Schema baseSchema = new Schema( - Lists.newArrayList( - Types.NestedField.required(101, "int", Types.IntegerType.get()), - Types.NestedField.optional(102, "string", Types.StringType.get()) - ), - Sets.newHashSet(101) - ); + Schema baseSchema = + new Schema( + Lists.newArrayList( + Types.NestedField.required(101, "int", Types.IntegerType.get()), + Types.NestedField.optional(102, "string", Types.StringType.get())), + Sets.newHashSet(101)); - TableSchema flinkSchema = TableSchema.builder() - .field("int", DataTypes.INT().notNull()) - .field("string", DataTypes.STRING().nullable()) - .primaryKey("int") - .build(); + TableSchema flinkSchema = + TableSchema.builder() + .field("int", DataTypes.INT().notNull()) + .field("string", DataTypes.STRING().nullable()) + .primaryKey("int") + .build(); Schema convertedSchema = FlinkSchemaUtil.convert(baseSchema, flinkSchema); Assert.assertEquals(baseSchema.asStruct(), convertedSchema.asStruct()); Assert.assertEquals(ImmutableSet.of(101), convertedSchema.identifierFieldIds()); @@ -298,29 +382,33 @@ public void testConvertFlinkSchemaBaseOnIcebergSchema() { @Test public void testConvertFlinkSchemaWithPrimaryKeys() { - Schema icebergSchema = new Schema( - 
Lists.newArrayList( - Types.NestedField.required(1, "int", Types.IntegerType.get()), - Types.NestedField.required(2, "string", Types.StringType.get()) - ), - Sets.newHashSet(1, 2) - ); + Schema icebergSchema = + new Schema( + Lists.newArrayList( + Types.NestedField.required(1, "int", Types.IntegerType.get()), + Types.NestedField.required(2, "string", Types.StringType.get())), + Sets.newHashSet(1, 2)); TableSchema tableSchema = FlinkSchemaUtil.toSchema(icebergSchema); Assert.assertTrue(tableSchema.getPrimaryKey().isPresent()); - Assert.assertEquals(ImmutableSet.of("int", "string"), + Assert.assertEquals( + ImmutableSet.of("int", "string"), ImmutableSet.copyOf(tableSchema.getPrimaryKey().get().getColumns())); } @Test public void testConvertFlinkSchemaWithNestedColumnInPrimaryKeys() { - Schema icebergSchema = new Schema( - Lists.newArrayList(Types.NestedField.required(1, "struct", - Types.StructType.of(Types.NestedField.required(2, "inner", Types.IntegerType.get()))) - ), - Sets.newHashSet(2) - ); - AssertHelpers.assertThrows("Does not support the nested columns in flink schema's primary keys", + Schema icebergSchema = + new Schema( + Lists.newArrayList( + Types.NestedField.required( + 1, + "struct", + Types.StructType.of( + Types.NestedField.required(2, "inner", Types.IntegerType.get())))), + Sets.newHashSet(2)); + AssertHelpers.assertThrows( + "Does not support the nested columns in flink schema's primary keys", ValidationException.class, "Column 'struct.inner' does not exist", () -> FlinkSchemaUtil.toSchema(icebergSchema)); diff --git a/flink/v1.13/flink/src/test/java/org/apache/iceberg/flink/TestFlinkTableSink.java b/flink/v1.13/flink/src/test/java/org/apache/iceberg/flink/TestFlinkTableSink.java index 0c30b09166fc..23bd7cf47c17 100644 --- a/flink/v1.13/flink/src/test/java/org/apache/iceberg/flink/TestFlinkTableSink.java +++ b/flink/v1.13/flink/src/test/java/org/apache/iceberg/flink/TestFlinkTableSink.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
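// --- Editor's sketch (not part of the diff), tied to the primary-key hunks above: Iceberg
// identifier fields surface as the Flink primary key, and the reverse mapping only works for
// required, top-level columns (nested identifier columns are rejected). Field ids and names
// here are assumptions.
import org.apache.flink.table.api.TableSchema;
import org.apache.iceberg.Schema;
import org.apache.iceberg.flink.FlinkSchemaUtil;
import org.apache.iceberg.relocated.com.google.common.collect.Lists;
import org.apache.iceberg.relocated.com.google.common.collect.Sets;
import org.apache.iceberg.types.Types;

class IdentifierFieldSketch {
  static TableSchema withPrimaryKey() {
    Schema icebergSchema =
        new Schema(
            Lists.newArrayList(
                Types.NestedField.required(1, "id", Types.IntegerType.get()),
                Types.NestedField.optional(2, "data", Types.StringType.get())),
            Sets.newHashSet(1)); // field 1 ("id") is the identifier field
    return FlinkSchemaUtil.toSchema(icebergSchema); // resulting primary key: ["id"]
  }
}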
*/ - package org.apache.iceberg.flink; import java.util.List; @@ -60,8 +59,7 @@ public class TestFlinkTableSink extends FlinkCatalogTestBase { public static final MiniClusterWithClientResource MINI_CLUSTER_RESOURCE = MiniClusterResource.createWithClassloaderCheckDisabled(); - @ClassRule - public static final TemporaryFolder TEMPORARY_FOLDER = new TemporaryFolder(); + @ClassRule public static final TemporaryFolder TEMPORARY_FOLDER = new TemporaryFolder(); private static final String SOURCE_TABLE = "default_catalog.default_database.bounded_source"; private static final String TABLE_NAME = "test_table"; @@ -71,10 +69,12 @@ public class TestFlinkTableSink extends FlinkCatalogTestBase { private final FileFormat format; private final boolean isStreamingJob; - @Parameterized.Parameters(name = "catalogName={0}, baseNamespace={1}, format={2}, isStreaming={3}") + @Parameterized.Parameters( + name = "catalogName={0}, baseNamespace={1}, format={2}, isStreaming={3}") public static Iterable parameters() { List parameters = Lists.newArrayList(); - for (FileFormat format : new FileFormat[] {FileFormat.ORC, FileFormat.AVRO, FileFormat.PARQUET}) { + for (FileFormat format : + new FileFormat[] {FileFormat.ORC, FileFormat.AVRO, FileFormat.PARQUET}) { for (Boolean isStreaming : new Boolean[] {true, false}) { for (Object[] catalogParams : FlinkCatalogTestBase.parameters()) { String catalogName = (String) catalogParams[0]; @@ -86,7 +86,8 @@ public static Iterable parameters() { return parameters; } - public TestFlinkTableSink(String catalogName, Namespace baseNamespace, FileFormat format, Boolean isStreamingJob) { + public TestFlinkTableSink( + String catalogName, Namespace baseNamespace, FileFormat format, Boolean isStreamingJob) { super(catalogName, baseNamespace); this.format = format; this.isStreamingJob = isStreamingJob; @@ -96,13 +97,13 @@ public TestFlinkTableSink(String catalogName, Namespace baseNamespace, FileForma protected TableEnvironment getTableEnv() { if (tEnv == null) { synchronized (this) { - EnvironmentSettings.Builder settingsBuilder = EnvironmentSettings - .newInstance() - .useBlinkPlanner(); + EnvironmentSettings.Builder settingsBuilder = + EnvironmentSettings.newInstance().useBlinkPlanner(); if (isStreamingJob) { settingsBuilder.inStreamingMode(); - StreamExecutionEnvironment env = StreamExecutionEnvironment - .getExecutionEnvironment(MiniClusterResource.DISABLE_CLASSLOADER_CHECK_CONFIG); + StreamExecutionEnvironment env = + StreamExecutionEnvironment.getExecutionEnvironment( + MiniClusterResource.DISABLE_CLASSLOADER_CHECK_CONFIG); env.setStreamTimeCharacteristic(TimeCharacteristic.EventTime); env.enableCheckpointing(400); env.setMaxParallelism(2); @@ -124,7 +125,9 @@ public void before() { sql("CREATE DATABASE %s", flinkDatabase); sql("USE CATALOG %s", catalogName); sql("USE %s", DATABASE); - sql("CREATE TABLE %s (id int, data varchar) with ('write.format.default'='%s')", TABLE_NAME, format.name()); + sql( + "CREATE TABLE %s (id int, data varchar) with ('write.format.default'='%s')", + TABLE_NAME, format.name()); icebergTable = validationCatalog.loadTable(TableIdentifier.of(icebergNamespace, TABLE_NAME)); } @@ -140,78 +143,86 @@ public void clean() { @Test public void testInsertFromSourceTable() throws Exception { // Register the rows into a temporary table. 
- getTableEnv().createTemporaryView("sourceTable", - getTableEnv().fromValues(SimpleDataUtil.FLINK_SCHEMA.toRowDataType(), - Expressions.row(1, "hello"), - Expressions.row(2, "world"), - Expressions.row(3, (String) null), - Expressions.row(null, "bar") - ) - ); + getTableEnv() + .createTemporaryView( + "sourceTable", + getTableEnv() + .fromValues( + SimpleDataUtil.FLINK_SCHEMA.toRowDataType(), + Expressions.row(1, "hello"), + Expressions.row(2, "world"), + Expressions.row(3, (String) null), + Expressions.row(null, "bar"))); // Redirect the records from source table to destination table. sql("INSERT INTO %s SELECT id,data from sourceTable", TABLE_NAME); // Assert the table records as expected. - SimpleDataUtil.assertTableRecords(icebergTable, Lists.newArrayList( - SimpleDataUtil.createRecord(1, "hello"), - SimpleDataUtil.createRecord(2, "world"), - SimpleDataUtil.createRecord(3, null), - SimpleDataUtil.createRecord(null, "bar") - )); + SimpleDataUtil.assertTableRecords( + icebergTable, + Lists.newArrayList( + SimpleDataUtil.createRecord(1, "hello"), + SimpleDataUtil.createRecord(2, "world"), + SimpleDataUtil.createRecord(3, null), + SimpleDataUtil.createRecord(null, "bar"))); } @Test public void testOverwriteTable() throws Exception { - Assume.assumeFalse("Flink unbounded streaming does not support overwrite operation", isStreamingJob); + Assume.assumeFalse( + "Flink unbounded streaming does not support overwrite operation", isStreamingJob); sql("INSERT INTO %s SELECT 1, 'a'", TABLE_NAME); - SimpleDataUtil.assertTableRecords(icebergTable, Lists.newArrayList( - SimpleDataUtil.createRecord(1, "a") - )); + SimpleDataUtil.assertTableRecords( + icebergTable, Lists.newArrayList(SimpleDataUtil.createRecord(1, "a"))); sql("INSERT OVERWRITE %s SELECT 2, 'b'", TABLE_NAME); - SimpleDataUtil.assertTableRecords(icebergTable, Lists.newArrayList( - SimpleDataUtil.createRecord(2, "b") - )); + SimpleDataUtil.assertTableRecords( + icebergTable, Lists.newArrayList(SimpleDataUtil.createRecord(2, "b"))); } @Test public void testReplacePartitions() throws Exception { - Assume.assumeFalse("Flink unbounded streaming does not support overwrite operation", isStreamingJob); + Assume.assumeFalse( + "Flink unbounded streaming does not support overwrite operation", isStreamingJob); String tableName = "test_partition"; - sql("CREATE TABLE %s(id INT, data VARCHAR) PARTITIONED BY (data) WITH ('write.format.default'='%s')", + sql( + "CREATE TABLE %s(id INT, data VARCHAR) PARTITIONED BY (data) WITH ('write.format.default'='%s')", tableName, format.name()); try { - Table partitionedTable = validationCatalog.loadTable(TableIdentifier.of(icebergNamespace, tableName)); + Table partitionedTable = + validationCatalog.loadTable(TableIdentifier.of(icebergNamespace, tableName)); sql("INSERT INTO %s SELECT 1, 'a'", tableName); sql("INSERT INTO %s SELECT 2, 'b'", tableName); sql("INSERT INTO %s SELECT 3, 'c'", tableName); - SimpleDataUtil.assertTableRecords(partitionedTable, Lists.newArrayList( - SimpleDataUtil.createRecord(1, "a"), - SimpleDataUtil.createRecord(2, "b"), - SimpleDataUtil.createRecord(3, "c") - )); + SimpleDataUtil.assertTableRecords( + partitionedTable, + Lists.newArrayList( + SimpleDataUtil.createRecord(1, "a"), + SimpleDataUtil.createRecord(2, "b"), + SimpleDataUtil.createRecord(3, "c"))); sql("INSERT OVERWRITE %s SELECT 4, 'b'", tableName); sql("INSERT OVERWRITE %s SELECT 5, 'a'", tableName); - SimpleDataUtil.assertTableRecords(partitionedTable, Lists.newArrayList( - SimpleDataUtil.createRecord(5, "a"), - 
SimpleDataUtil.createRecord(4, "b"), - SimpleDataUtil.createRecord(3, "c") - )); + SimpleDataUtil.assertTableRecords( + partitionedTable, + Lists.newArrayList( + SimpleDataUtil.createRecord(5, "a"), + SimpleDataUtil.createRecord(4, "b"), + SimpleDataUtil.createRecord(3, "c"))); sql("INSERT OVERWRITE %s PARTITION (data='a') SELECT 6", tableName); - SimpleDataUtil.assertTableRecords(partitionedTable, Lists.newArrayList( - SimpleDataUtil.createRecord(6, "a"), - SimpleDataUtil.createRecord(4, "b"), - SimpleDataUtil.createRecord(3, "c") - )); + SimpleDataUtil.assertTableRecords( + partitionedTable, + Lists.newArrayList( + SimpleDataUtil.createRecord(6, "a"), + SimpleDataUtil.createRecord(4, "b"), + SimpleDataUtil.createRecord(3, "c"))); } finally { sql("DROP TABLE IF EXISTS %s.%s", flinkDatabase, tableName); } @@ -220,34 +231,38 @@ public void testReplacePartitions() throws Exception { @Test public void testInsertIntoPartition() throws Exception { String tableName = "test_insert_into_partition"; - sql("CREATE TABLE %s(id INT, data VARCHAR) PARTITIONED BY (data) WITH ('write.format.default'='%s')", + sql( + "CREATE TABLE %s(id INT, data VARCHAR) PARTITIONED BY (data) WITH ('write.format.default'='%s')", tableName, format.name()); try { - Table partitionedTable = validationCatalog.loadTable(TableIdentifier.of(icebergNamespace, tableName)); + Table partitionedTable = + validationCatalog.loadTable(TableIdentifier.of(icebergNamespace, tableName)); // Full partition. sql("INSERT INTO %s PARTITION (data='a') SELECT 1", tableName); sql("INSERT INTO %s PARTITION (data='a') SELECT 2", tableName); sql("INSERT INTO %s PARTITION (data='b') SELECT 3", tableName); - SimpleDataUtil.assertTableRecords(partitionedTable, Lists.newArrayList( - SimpleDataUtil.createRecord(1, "a"), - SimpleDataUtil.createRecord(2, "a"), - SimpleDataUtil.createRecord(3, "b") - )); + SimpleDataUtil.assertTableRecords( + partitionedTable, + Lists.newArrayList( + SimpleDataUtil.createRecord(1, "a"), + SimpleDataUtil.createRecord(2, "a"), + SimpleDataUtil.createRecord(3, "b"))); // Partial partition. sql("INSERT INTO %s SELECT 4, 'c'", tableName); sql("INSERT INTO %s SELECT 5, 'd'", tableName); - SimpleDataUtil.assertTableRecords(partitionedTable, Lists.newArrayList( - SimpleDataUtil.createRecord(1, "a"), - SimpleDataUtil.createRecord(2, "a"), - SimpleDataUtil.createRecord(3, "b"), - SimpleDataUtil.createRecord(4, "c"), - SimpleDataUtil.createRecord(5, "d") - )); + SimpleDataUtil.assertTableRecords( + partitionedTable, + Lists.newArrayList( + SimpleDataUtil.createRecord(1, "a"), + SimpleDataUtil.createRecord(2, "a"), + SimpleDataUtil.createRecord(3, "b"), + SimpleDataUtil.createRecord(4, "c"), + SimpleDataUtil.createRecord(5, "d"))); } finally { sql("DROP TABLE IF EXISTS %s.%s", flinkDatabase, tableName); } @@ -256,34 +271,45 @@ public void testInsertIntoPartition() throws Exception { @Test public void testHashDistributeMode() throws Exception { String tableName = "test_hash_distribution_mode"; - Map tableProps = ImmutableMap.of( - "write.format.default", format.name(), - TableProperties.WRITE_DISTRIBUTION_MODE, DistributionMode.HASH.modeName() - ); + Map tableProps = + ImmutableMap.of( + "write.format.default", + format.name(), + TableProperties.WRITE_DISTRIBUTION_MODE, + DistributionMode.HASH.modeName()); // Initialize a BoundedSource table to precisely emit those rows in only one checkpoint. 
- List dataSet = IntStream.range(1, 1000) - .mapToObj(i -> ImmutableList.of(Row.of(i, "aaa"), Row.of(i, "bbb"), Row.of(i, "ccc"))) - .flatMap(List::stream) - .collect(Collectors.toList()); + List dataSet = + IntStream.range(1, 1000) + .mapToObj(i -> ImmutableList.of(Row.of(i, "aaa"), Row.of(i, "bbb"), Row.of(i, "ccc"))) + .flatMap(List::stream) + .collect(Collectors.toList()); String dataId = BoundedTableFactory.registerDataSet(ImmutableList.of(dataSet)); - sql("CREATE TABLE %s(id INT NOT NULL, data STRING NOT NULL)" + - " WITH ('connector'='BoundedSource', 'data-id'='%s')", SOURCE_TABLE, dataId); - Assert.assertEquals("Should have the expected rows in source table.", Sets.newHashSet(dataSet), + sql( + "CREATE TABLE %s(id INT NOT NULL, data STRING NOT NULL)" + + " WITH ('connector'='BoundedSource', 'data-id'='%s')", + SOURCE_TABLE, dataId); + Assert.assertEquals( + "Should have the expected rows in source table.", + Sets.newHashSet(dataSet), Sets.newHashSet(sql("SELECT * FROM %s", SOURCE_TABLE))); - sql("CREATE TABLE %s(id INT, data VARCHAR) PARTITIONED BY (data) WITH %s", + sql( + "CREATE TABLE %s(id INT, data VARCHAR) PARTITIONED BY (data) WITH %s", tableName, toWithClause(tableProps)); try { // Insert data set. sql("INSERT INTO %s SELECT * FROM %s", tableName, SOURCE_TABLE); - Assert.assertEquals("Should have the expected rows in sink table.", Sets.newHashSet(dataSet), + Assert.assertEquals( + "Should have the expected rows in sink table.", + Sets.newHashSet(dataSet), Sets.newHashSet(sql("SELECT * FROM %s", tableName))); // Sometimes we will have more than one checkpoint if we pass the auto checkpoint interval, - // thus producing multiple snapshots. Here we assert that each snapshot has only 1 file per partition. + // thus producing multiple snapshots. Here we assert that each snapshot has only 1 file per + // partition. 
Table table = validationCatalog.loadTable(TableIdentifier.of(icebergNamespace, tableName)); Map> snapshotToDataFiles = SimpleDataUtil.snapshotToDataFiles(table); for (List dataFiles : snapshotToDataFiles.values()) { @@ -291,12 +317,24 @@ public void testHashDistributeMode() throws Exception { continue; } - Assert.assertEquals("There should be 1 data file in partition 'aaa'", 1, - SimpleDataUtil.matchingPartitions(dataFiles, table.spec(), ImmutableMap.of("data", "aaa")).size()); - Assert.assertEquals("There should be 1 data file in partition 'bbb'", 1, - SimpleDataUtil.matchingPartitions(dataFiles, table.spec(), ImmutableMap.of("data", "bbb")).size()); - Assert.assertEquals("There should be 1 data file in partition 'ccc'", 1, - SimpleDataUtil.matchingPartitions(dataFiles, table.spec(), ImmutableMap.of("data", "ccc")).size()); + Assert.assertEquals( + "There should be 1 data file in partition 'aaa'", + 1, + SimpleDataUtil.matchingPartitions( + dataFiles, table.spec(), ImmutableMap.of("data", "aaa")) + .size()); + Assert.assertEquals( + "There should be 1 data file in partition 'bbb'", + 1, + SimpleDataUtil.matchingPartitions( + dataFiles, table.spec(), ImmutableMap.of("data", "bbb")) + .size()); + Assert.assertEquals( + "There should be 1 data file in partition 'ccc'", + 1, + SimpleDataUtil.matchingPartitions( + dataFiles, table.spec(), ImmutableMap.of("data", "ccc")) + .size()); } } finally { sql("DROP TABLE IF EXISTS %s.%s", flinkDatabase, tableName); diff --git a/flink/v1.13/flink/src/test/java/org/apache/iceberg/flink/TestFlinkTableSource.java b/flink/v1.13/flink/src/test/java/org/apache/iceberg/flink/TestFlinkTableSource.java index fe9c9d832a36..8f30f13db7e0 100644 --- a/flink/v1.13/flink/src/test/java/org/apache/iceberg/flink/TestFlinkTableSource.java +++ b/flink/v1.13/flink/src/test/java/org/apache/iceberg/flink/TestFlinkTableSource.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.flink; import java.io.File; @@ -52,18 +51,17 @@ public class TestFlinkTableSource extends FlinkTestBase { public TestFlinkTableSource() { // register a scan event listener to validate pushdown - Listeners.register(event -> { - scanEventCount += 1; - lastScanEvent = event; - }, ScanEvent.class); + Listeners.register( + event -> { + scanEventCount += 1; + lastScanEvent = event; + }, + ScanEvent.class); } @Override protected TableEnvironment getTableEnv() { - super.getTableEnv() - .getConfig() - .getConfiguration() - .set(CoreOptions.DEFAULT_PARALLELISM, 1); + super.getTableEnv().getConfig().getConfiguration().set(CoreOptions.DEFAULT_PARALLELISM, 1); return super.getTableEnv(); } @@ -77,14 +75,18 @@ public static void createWarehouse() throws IOException { @Before public void before() { - sql("CREATE CATALOG %s WITH ('type'='iceberg', 'catalog-type'='hadoop', 'warehouse'='%s')", CATALOG_NAME, - warehouse); + sql( + "CREATE CATALOG %s WITH ('type'='iceberg', 'catalog-type'='hadoop', 'warehouse'='%s')", + CATALOG_NAME, warehouse); sql("USE CATALOG %s", CATALOG_NAME); sql("CREATE DATABASE %s", DATABASE_NAME); sql("USE %s", DATABASE_NAME); - sql("CREATE TABLE %s (id INT, data VARCHAR,d DOUBLE) WITH ('write.format.default'='%s')", TABLE_NAME, - format.name()); - sql("INSERT INTO %s VALUES (1,'iceberg',10),(2,'b',20),(3,CAST(NULL AS VARCHAR),30)", TABLE_NAME); + sql( + "CREATE TABLE %s (id INT, data VARCHAR,d DOUBLE) WITH ('write.format.default'='%s')", + TABLE_NAME, format.name()); + sql( + "INSERT INTO %s VALUES (1,'iceberg',10),(2,'b',20),(3,CAST(NULL AS VARCHAR),30)", + TABLE_NAME); this.scanEventCount = 0; this.lastScanEvent = null; @@ -100,19 +102,19 @@ public void clean() { @Test public void testLimitPushDown() { - AssertHelpers.assertThrows("Invalid limit number: -1 ", SqlParserException.class, + AssertHelpers.assertThrows( + "Invalid limit number: -1 ", + SqlParserException.class, () -> sql("SELECT * FROM %s LIMIT -1", TABLE_NAME)); - Assert.assertEquals("Should have 0 record", 0, sql("SELECT * FROM %s LIMIT 0", TABLE_NAME).size()); + Assert.assertEquals( + "Should have 0 record", 0, sql("SELECT * FROM %s LIMIT 0", TABLE_NAME).size()); String sqlLimitExceed = String.format("SELECT * FROM %s LIMIT 4", TABLE_NAME); List resultExceed = sql(sqlLimitExceed); Assert.assertEquals("Should have 3 records", 3, resultExceed.size()); - List expectedList = Lists.newArrayList( - Row.of(1, "iceberg", 10.0), - Row.of(2, "b", 20.0), - Row.of(3, null, 30.0) - ); + List expectedList = + Lists.newArrayList(Row.of(1, "iceberg", 10.0), Row.of(2, "b", 20.0), Row.of(3, null, 30.0)); assertSameElements(expectedList, resultExceed); String querySql = String.format("SELECT * FROM %s LIMIT 1", TABLE_NAME); @@ -121,26 +123,24 @@ public void testLimitPushDown() { Assert.assertTrue("Explain should contain LimitPushDown", explain.contains(expectedExplain)); List result = sql(querySql); Assert.assertEquals("Should have 1 record", 1, result.size()); - Assertions.assertThat(result) - .containsAnyElementsOf(expectedList); + Assertions.assertThat(result).containsAnyElementsOf(expectedList); String sqlMixed = String.format("SELECT * FROM %s WHERE id = 1 LIMIT 2", TABLE_NAME); List mixedResult = sql(sqlMixed); Assert.assertEquals("Should have 1 record", 1, mixedResult.size()); - Assert.assertEquals("Should produce the expected records", Row.of(1, "iceberg", 10.0), mixedResult.get(0)); + Assert.assertEquals( + "Should produce the expected records", Row.of(1, "iceberg", 10.0), mixedResult.get(0)); } 
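The TestFlinkTableSource hunks above and below all lean on one pattern: a ScanEvent listener registered once in the constructor, a filtered query run through the sql(...) helper, and an assertion on lastScanEvent.filter().toString(). The sketch below condenses that pattern for reference; the standalone class and the assertPushedDown helper are illustrative additions (not part of this diff), the hypothetical table name test_source stands in for TABLE_NAME, and FlinkTestBase's sql(...) helper is assumed to behave as it does in these hunks.

import java.util.List;
import org.apache.flink.types.Row;
import org.apache.iceberg.events.Listeners;
import org.apache.iceberg.events.ScanEvent;
import org.junit.Assert;

// Illustrative sketch only: condenses the filter pushdown assertion pattern used in this file.
public class ScanEventAssertionSketch extends FlinkTestBase {
  private int scanEventCount = 0;
  private ScanEvent lastScanEvent = null;

  public ScanEventAssertionSketch() {
    // Capture every planned scan so each assertion can verify that exactly one scan happened
    // and inspect the Iceberg filter expression that reached it.
    Listeners.register(
        event -> {
          scanEventCount += 1;
          lastScanEvent = event;
        },
        ScanEvent.class);
  }

  void assertPushedDown(String whereClause, String expectedIcebergFilter, int expectedRows) {
    scanEventCount = 0; // the real tests reset this in @Before; done per call here for the sketch
    List<Row> result = sql("SELECT * FROM %s WHERE %s", "test_source", whereClause);
    Assert.assertEquals("Should return the expected row count", expectedRows, result.size());
    Assert.assertEquals("Should create only one scan", 1, scanEventCount);
    Assert.assertEquals(
        "Should contain the push down filter",
        expectedIcebergFilter,
        lastScanEvent.filter().toString());
  }
}

For instance, the testFilterPushDownEqual hunk roughly amounts to assertPushedDown("id = 1", "ref(name=\"id\") == 1", 1), using the same filter string format that appears verbatim in the testFilterPushDownAnd and testFilterPushDownNot hunks.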
@Test public void testNoFilterPushDown() { String sql = String.format("SELECT * FROM %s ", TABLE_NAME); List result = sql(sql); - List expectedRecords = Lists.newArrayList( - Row.of(1, "iceberg", 10.0), - Row.of(2, "b", 20.0), - Row.of(3, null, 30.0) - ); + List expectedRecords = + Lists.newArrayList(Row.of(1, "iceberg", 10.0), Row.of(2, "b", 20.0), Row.of(3, null, 30.0)); assertSameElements(expectedRecords, result); - Assert.assertEquals("Should not push down a filter", Expressions.alwaysTrue(), lastScanEvent.filter()); + Assert.assertEquals( + "Should not push down a filter", Expressions.alwaysTrue(), lastScanEvent.filter()); } @Test @@ -150,10 +150,12 @@ public void testFilterPushDownEqual() { List result = sql(sqlLiteralRight); Assert.assertEquals("Should have 1 record", 1, result.size()); - Assert.assertEquals("Should produce the expected record", Row.of(1, "iceberg", 10.0), result.get(0)); + Assert.assertEquals( + "Should produce the expected record", Row.of(1, "iceberg", 10.0), result.get(0)); Assert.assertEquals("Should create only one scan", 1, scanEventCount); - Assert.assertEquals("Should contain the push down filter", expectedFilter, lastScanEvent.filter().toString()); + Assert.assertEquals( + "Should contain the push down filter", expectedFilter, lastScanEvent.filter().toString()); } @Test @@ -172,10 +174,12 @@ public void testFilterPushDownEqualLiteralOnLeft() { List resultLeft = sql(sqlLiteralLeft); Assert.assertEquals("Should have 1 record", 1, resultLeft.size()); - Assert.assertEquals("Should produce the expected record", Row.of(1, "iceberg", 10.0), resultLeft.get(0)); + Assert.assertEquals( + "Should produce the expected record", Row.of(1, "iceberg", 10.0), resultLeft.get(0)); Assert.assertEquals("Should create only one scan", 1, scanEventCount); - Assert.assertEquals("Should contain the push down filter", expectedFilter, lastScanEvent.filter().toString()); + Assert.assertEquals( + "Should contain the push down filter", expectedFilter, lastScanEvent.filter().toString()); } @Test @@ -186,13 +190,11 @@ public void testFilterPushDownNoEqual() { List resultNE = sql(sqlNE); Assert.assertEquals("Should have 2 records", 2, resultNE.size()); - List expectedNE = Lists.newArrayList( - Row.of(2, "b", 20.0), - Row.of(3, null, 30.0) - ); + List expectedNE = Lists.newArrayList(Row.of(2, "b", 20.0), Row.of(3, null, 30.0)); assertSameElements(expectedNE, resultNE); Assert.assertEquals("Should create only one scan", 1, scanEventCount); - Assert.assertEquals("Should contain the push down filter", expectedFilter, lastScanEvent.filter().toString()); + Assert.assertEquals( + "Should contain the push down filter", expectedFilter, lastScanEvent.filter().toString()); } @Test @@ -206,15 +208,18 @@ public void testFilterPushDownNoEqualNull() { @Test public void testFilterPushDownAnd() { - String sqlAnd = String.format("SELECT * FROM %s WHERE id = 1 AND data = 'iceberg' ", TABLE_NAME); + String sqlAnd = + String.format("SELECT * FROM %s WHERE id = 1 AND data = 'iceberg' ", TABLE_NAME); List resultAnd = sql(sqlAnd); Assert.assertEquals("Should have 1 record", 1, resultAnd.size()); - Assert.assertEquals("Should produce the expected record", Row.of(1, "iceberg", 10.0), resultAnd.get(0)); + Assert.assertEquals( + "Should produce the expected record", Row.of(1, "iceberg", 10.0), resultAnd.get(0)); Assert.assertEquals("Should create only one scan", 1, scanEventCount); String expected = "(ref(name=\"id\") == 1 and ref(name=\"data\") == \"iceberg\")"; - Assert.assertEquals("Should contain the push down 
filter", expected, lastScanEvent.filter().toString()); + Assert.assertEquals( + "Should contain the push down filter", expected, lastScanEvent.filter().toString()); } @Test @@ -225,14 +230,12 @@ public void testFilterPushDownOr() { List resultOr = sql(sqlOr); Assert.assertEquals("Should have 2 record", 2, resultOr.size()); - List expectedOR = Lists.newArrayList( - Row.of(1, "iceberg", 10.0), - Row.of(2, "b", 20.0) - ); + List expectedOR = Lists.newArrayList(Row.of(1, "iceberg", 10.0), Row.of(2, "b", 20.0)); assertSameElements(expectedOR, resultOr); Assert.assertEquals("Should create only one scan", 1, scanEventCount); - Assert.assertEquals("Should contain the push down filter", expectedFilter, lastScanEvent.filter().toString()); + Assert.assertEquals( + "Should contain the push down filter", expectedFilter, lastScanEvent.filter().toString()); } @Test @@ -243,14 +246,12 @@ public void testFilterPushDownGreaterThan() { List resultGT = sql(sqlGT); Assert.assertEquals("Should have 2 record", 2, resultGT.size()); - List expectedGT = Lists.newArrayList( - Row.of(2, "b", 20.0), - Row.of(3, null, 30.0) - ); + List expectedGT = Lists.newArrayList(Row.of(2, "b", 20.0), Row.of(3, null, 30.0)); assertSameElements(expectedGT, resultGT); Assert.assertEquals("Should create only one scan", 1, scanEventCount); - Assert.assertEquals("Should contain the push down filter", expectedFilter, lastScanEvent.filter().toString()); + Assert.assertEquals( + "Should contain the push down filter", expectedFilter, lastScanEvent.filter().toString()); } @Test @@ -270,14 +271,12 @@ public void testFilterPushDownGreaterThanLiteralOnLeft() { List resultGT = sql(sqlGT); Assert.assertEquals("Should have 2 records", 2, resultGT.size()); - List expectedGT = Lists.newArrayList( - Row.of(1, "iceberg", 10.0), - Row.of(2, "b", 20.0) - ); + List expectedGT = Lists.newArrayList(Row.of(1, "iceberg", 10.0), Row.of(2, "b", 20.0)); assertSameElements(expectedGT, resultGT); Assert.assertEquals("Should create only one scan", 1, scanEventCount); - Assert.assertEquals("Should contain the push down filter", expectedFilter, lastScanEvent.filter().toString()); + Assert.assertEquals( + "Should contain the push down filter", expectedFilter, lastScanEvent.filter().toString()); } @Test @@ -288,14 +287,12 @@ public void testFilterPushDownGreaterThanEqual() { List resultGTE = sql(sqlGTE); Assert.assertEquals("Should have 2 records", 2, resultGTE.size()); - List expectedGTE = Lists.newArrayList( - Row.of(2, "b", 20.0), - Row.of(3, null, 30.0) - ); + List expectedGTE = Lists.newArrayList(Row.of(2, "b", 20.0), Row.of(3, null, 30.0)); assertSameElements(expectedGTE, resultGTE); Assert.assertEquals("Should create only one scan", 1, scanEventCount); - Assert.assertEquals("Should contain the push down filter", expectedFilter, lastScanEvent.filter().toString()); + Assert.assertEquals( + "Should contain the push down filter", expectedFilter, lastScanEvent.filter().toString()); } @Test @@ -315,14 +312,12 @@ public void testFilterPushDownGreaterThanEqualLiteralOnLeft() { List resultGTE = sql(sqlGTE); Assert.assertEquals("Should have 2 records", 2, resultGTE.size()); - List expectedGTE = Lists.newArrayList( - Row.of(1, "iceberg", 10.0), - Row.of(2, "b", 20.0) - ); + List expectedGTE = Lists.newArrayList(Row.of(1, "iceberg", 10.0), Row.of(2, "b", 20.0)); assertSameElements(expectedGTE, resultGTE); Assert.assertEquals("Should create only one scan", 1, scanEventCount); - Assert.assertEquals("Should contain the push down filter", expectedFilter, 
lastScanEvent.filter().toString()); + Assert.assertEquals( + "Should contain the push down filter", expectedFilter, lastScanEvent.filter().toString()); } @Test @@ -332,10 +327,12 @@ public void testFilterPushDownLessThan() { List resultLT = sql(sqlLT); Assert.assertEquals("Should have 1 record", 1, resultLT.size()); - Assert.assertEquals("Should produce the expected record", Row.of(1, "iceberg", 10.0), resultLT.get(0)); + Assert.assertEquals( + "Should produce the expected record", Row.of(1, "iceberg", 10.0), resultLT.get(0)); Assert.assertEquals("Should create only one scan", 1, scanEventCount); - Assert.assertEquals("Should contain the push down filter", expectedFilter, lastScanEvent.filter().toString()); + Assert.assertEquals( + "Should contain the push down filter", expectedFilter, lastScanEvent.filter().toString()); } @Test @@ -354,10 +351,12 @@ public void testFilterPushDownLessThanLiteralOnLeft() { List resultLT = sql(sqlLT); Assert.assertEquals("Should have 1 record", 1, resultLT.size()); - Assert.assertEquals("Should produce the expected record", Row.of(3, null, 30.0), resultLT.get(0)); + Assert.assertEquals( + "Should produce the expected record", Row.of(3, null, 30.0), resultLT.get(0)); Assert.assertEquals("Should create only one scan", 1, scanEventCount); - Assert.assertEquals("Should contain the push down filter", expectedFilter, lastScanEvent.filter().toString()); + Assert.assertEquals( + "Should contain the push down filter", expectedFilter, lastScanEvent.filter().toString()); } @Test @@ -367,10 +366,12 @@ public void testFilterPushDownLessThanEqual() { List resultLTE = sql(sqlLTE); Assert.assertEquals("Should have 1 record", 1, resultLTE.size()); - Assert.assertEquals("Should produce the expected record", Row.of(1, "iceberg", 10.0), resultLTE.get(0)); + Assert.assertEquals( + "Should produce the expected record", Row.of(1, "iceberg", 10.0), resultLTE.get(0)); Assert.assertEquals("Should create only one scan", 1, scanEventCount); - Assert.assertEquals("Should contain the push down filter", expectedFilter, lastScanEvent.filter().toString()); + Assert.assertEquals( + "Should contain the push down filter", expectedFilter, lastScanEvent.filter().toString()); } @Test @@ -389,10 +390,12 @@ public void testFilterPushDownLessThanEqualLiteralOnLeft() { List resultLTE = sql(sqlLTE); Assert.assertEquals("Should have 1 record", 1, resultLTE.size()); - Assert.assertEquals("Should produce the expected record", Row.of(3, null, 30.0), resultLTE.get(0)); + Assert.assertEquals( + "Should produce the expected record", Row.of(3, null, 30.0), resultLTE.get(0)); Assert.assertEquals("Should create only one scan", 1, scanEventCount); - Assert.assertEquals("Should contain the push down filter", expectedFilter, lastScanEvent.filter().toString()); + Assert.assertEquals( + "Should contain the push down filter", expectedFilter, lastScanEvent.filter().toString()); } @Test @@ -402,23 +405,24 @@ public void testFilterPushDownIn() { List resultIN = sql(sqlIN); Assert.assertEquals("Should have 2 records", 2, resultIN.size()); - List expectedIN = Lists.newArrayList( - Row.of(1, "iceberg", 10.0), - Row.of(2, "b", 20.0) - ); + List expectedIN = Lists.newArrayList(Row.of(1, "iceberg", 10.0), Row.of(2, "b", 20.0)); assertSameElements(expectedIN, resultIN); Assert.assertEquals("Should create only one scan", 1, scanEventCount); - Assert.assertEquals("Should contain the push down filter", expectedFilter, lastScanEvent.filter().toString()); + Assert.assertEquals( + "Should contain the push down filter", 
expectedFilter, lastScanEvent.filter().toString()); } @Test public void testFilterPushDownInNull() { - String sqlInNull = String.format("SELECT * FROM %s WHERE data IN ('iceberg',NULL) ", TABLE_NAME); + String sqlInNull = + String.format("SELECT * FROM %s WHERE data IN ('iceberg',NULL) ", TABLE_NAME); List result = sql(sqlInNull); Assert.assertEquals("Should have 1 record", 1, result.size()); - Assert.assertEquals("Should produce the expected record", Row.of(1, "iceberg", 10.0), result.get(0)); - Assert.assertEquals("Should not push down a filter", Expressions.alwaysTrue(), lastScanEvent.filter()); + Assert.assertEquals( + "Should produce the expected record", Row.of(1, "iceberg", 10.0), result.get(0)); + Assert.assertEquals( + "Should not push down a filter", Expressions.alwaysTrue(), lastScanEvent.filter()); } @Test @@ -427,10 +431,12 @@ public void testFilterPushDownNotIn() { List resultNotIn = sql(sqlNotIn); Assert.assertEquals("Should have 1 record", 1, resultNotIn.size()); - Assert.assertEquals("Should produce the expected record", Row.of(1, "iceberg", 10.0), resultNotIn.get(0)); + Assert.assertEquals( + "Should produce the expected record", Row.of(1, "iceberg", 10.0), resultNotIn.get(0)); Assert.assertEquals("Should create only one scan", 1, scanEventCount); String expectedScan = "(ref(name=\"id\") != 2 and ref(name=\"id\") != 3)"; - Assert.assertEquals("Should contain the push down filter", expectedScan, lastScanEvent.filter().toString()); + Assert.assertEquals( + "Should contain the push down filter", expectedScan, lastScanEvent.filter().toString()); } @Test @@ -438,7 +444,8 @@ public void testFilterPushDownNotInNull() { String sqlNotInNull = String.format("SELECT * FROM %s WHERE id NOT IN (1,2,NULL) ", TABLE_NAME); List resultGT = sql(sqlNotInNull); Assert.assertEquals("Should have 0 record", 0, resultGT.size()); - Assert.assertEquals("Should not push down a filter", Expressions.alwaysTrue(), lastScanEvent.filter()); + Assert.assertEquals( + "Should not push down a filter", Expressions.alwaysTrue(), lastScanEvent.filter()); } @Test @@ -449,14 +456,12 @@ public void testFilterPushDownIsNotNull() { List resultNotNull = sql(sqlNotNull); Assert.assertEquals("Should have 2 record", 2, resultNotNull.size()); - List expected = Lists.newArrayList( - Row.of(1, "iceberg", 10.0), - Row.of(2, "b", 20.0) - ); + List expected = Lists.newArrayList(Row.of(1, "iceberg", 10.0), Row.of(2, "b", 20.0)); assertSameElements(expected, resultNotNull); Assert.assertEquals("Should create only one scan", 1, scanEventCount); - Assert.assertEquals("Should contain the push down filter", expectedFilter, lastScanEvent.filter().toString()); + Assert.assertEquals( + "Should contain the push down filter", expectedFilter, lastScanEvent.filter().toString()); } @Test @@ -466,10 +471,12 @@ public void testFilterPushDownIsNull() { List resultNull = sql(sqlNull); Assert.assertEquals("Should have 1 record", 1, resultNull.size()); - Assert.assertEquals("Should produce the expected record", Row.of(3, null, 30.0), resultNull.get(0)); + Assert.assertEquals( + "Should produce the expected record", Row.of(3, null, 30.0), resultNull.get(0)); Assert.assertEquals("Should create only one scan", 1, scanEventCount); - Assert.assertEquals("Should contain the push down filter", expectedFilter, lastScanEvent.filter().toString()); + Assert.assertEquals( + "Should contain the push down filter", expectedFilter, lastScanEvent.filter().toString()); } @Test @@ -478,11 +485,13 @@ public void testFilterPushDownNot() { List resultNot = 
sql(sqlNot); Assert.assertEquals("Should have 1 record", 1, resultNot.size()); - Assert.assertEquals("Should produce the expected record", Row.of(3, null, 30.0), resultNot.get(0)); + Assert.assertEquals( + "Should produce the expected record", Row.of(3, null, 30.0), resultNot.get(0)); Assert.assertEquals("Should create only one scan", 1, scanEventCount); String expectedFilter = "(ref(name=\"id\") != 1 and ref(name=\"id\") != 2)"; - Assert.assertEquals("Should contain the push down filter", expectedFilter, lastScanEvent.filter().toString()); + Assert.assertEquals( + "Should contain the push down filter", expectedFilter, lastScanEvent.filter().toString()); } @Test @@ -492,28 +501,30 @@ public void testFilterPushDownBetween() { List resultBetween = sql(sqlBetween); Assert.assertEquals("Should have 2 record", 2, resultBetween.size()); - List expectedBetween = Lists.newArrayList( - Row.of(1, "iceberg", 10.0), - Row.of(2, "b", 20.0) - ); + List expectedBetween = + Lists.newArrayList(Row.of(1, "iceberg", 10.0), Row.of(2, "b", 20.0)); assertSameElements(expectedBetween, resultBetween); Assert.assertEquals("Should create only one scan", 1, scanEventCount); String expected = "(ref(name=\"id\") >= 1 and ref(name=\"id\") <= 2)"; - Assert.assertEquals("Should contain the push down filter", expected, lastScanEvent.filter().toString()); + Assert.assertEquals( + "Should contain the push down filter", expected, lastScanEvent.filter().toString()); } @Test public void testFilterPushDownNotBetween() { - String sqlNotBetween = String.format("SELECT * FROM %s WHERE id NOT BETWEEN 2 AND 3 ", TABLE_NAME); + String sqlNotBetween = + String.format("SELECT * FROM %s WHERE id NOT BETWEEN 2 AND 3 ", TABLE_NAME); String expectedFilter = "(ref(name=\"id\") < 2 or ref(name=\"id\") > 3)"; List resultNotBetween = sql(sqlNotBetween); Assert.assertEquals("Should have 1 record", 1, resultNotBetween.size()); - Assert.assertEquals("Should produce the expected record", Row.of(1, "iceberg", 10.0), resultNotBetween.get(0)); + Assert.assertEquals( + "Should produce the expected record", Row.of(1, "iceberg", 10.0), resultNotBetween.get(0)); Assert.assertEquals("Should create only one scan", 1, scanEventCount); - Assert.assertEquals("Should contain the push down filter", expectedFilter, lastScanEvent.filter().toString()); + Assert.assertEquals( + "Should contain the push down filter", expectedFilter, lastScanEvent.filter().toString()); } @Test @@ -523,10 +534,13 @@ public void testFilterPushDownLike() { String sqlLike = "SELECT * FROM " + TABLE_NAME + " WHERE data LIKE 'ice%%' "; List resultLike = sql(sqlLike); Assert.assertEquals("Should have 1 record", 1, resultLike.size()); - Assert.assertEquals("The like result should produce the expected record", - Row.of(1, "iceberg", 10.0), resultLike.get(0)); + Assert.assertEquals( + "The like result should produce the expected record", + Row.of(1, "iceberg", 10.0), + resultLike.get(0)); Assert.assertEquals("Should create only one scan", 1, scanEventCount); - Assert.assertEquals("Should contain the push down filter", expectedFilter, lastScanEvent.filter().toString()); + Assert.assertEquals( + "Should contain the push down filter", expectedFilter, lastScanEvent.filter().toString()); } @Test @@ -535,85 +549,105 @@ public void testFilterNotPushDownLike() { String sqlNoPushDown = "SELECT * FROM " + TABLE_NAME + " WHERE data LIKE '%%i' "; List resultLike = sql(sqlNoPushDown); Assert.assertEquals("Should have 1 record", 0, resultLike.size()); - Assert.assertEquals("Should not push down a filter", 
Expressions.alwaysTrue(), lastScanEvent.filter()); + Assert.assertEquals( + "Should not push down a filter", Expressions.alwaysTrue(), lastScanEvent.filter()); sqlNoPushDown = "SELECT * FROM " + TABLE_NAME + " WHERE data LIKE '%%i%%' "; resultLike = sql(sqlNoPushDown); Assert.assertEquals("Should have 1 record", 1, resultLike.size()); Assert.assertEquals("Should produce the expected record", expectRecord, resultLike.get(0)); - Assert.assertEquals("Should not push down a filter", Expressions.alwaysTrue(), lastScanEvent.filter()); + Assert.assertEquals( + "Should not push down a filter", Expressions.alwaysTrue(), lastScanEvent.filter()); sqlNoPushDown = "SELECT * FROM " + TABLE_NAME + " WHERE data LIKE '%%ice%%g' "; resultLike = sql(sqlNoPushDown); Assert.assertEquals("Should have 1 record", 1, resultLike.size()); Assert.assertEquals("Should produce the expected record", expectRecord, resultLike.get(0)); - Assert.assertEquals("Should not push down a filter", Expressions.alwaysTrue(), lastScanEvent.filter()); + Assert.assertEquals( + "Should not push down a filter", Expressions.alwaysTrue(), lastScanEvent.filter()); sqlNoPushDown = "SELECT * FROM " + TABLE_NAME + " WHERE data LIKE '%%' "; resultLike = sql(sqlNoPushDown); Assert.assertEquals("Should have 3 records", 3, resultLike.size()); - List expectedRecords = Lists.newArrayList( - Row.of(1, "iceberg", 10.0), - Row.of(2, "b", 20.0), - Row.of(3, null, 30.0) - ); + List expectedRecords = + Lists.newArrayList(Row.of(1, "iceberg", 10.0), Row.of(2, "b", 20.0), Row.of(3, null, 30.0)); assertSameElements(expectedRecords, resultLike); - Assert.assertEquals("Should not push down a filter", Expressions.alwaysTrue(), lastScanEvent.filter()); + Assert.assertEquals( + "Should not push down a filter", Expressions.alwaysTrue(), lastScanEvent.filter()); sqlNoPushDown = "SELECT * FROM " + TABLE_NAME + " WHERE data LIKE 'iceber_' "; resultLike = sql(sqlNoPushDown); Assert.assertEquals("Should have 1 record", 1, resultLike.size()); Assert.assertEquals("Should produce the expected record", expectRecord, resultLike.get(0)); - Assert.assertEquals("Should not push down a filter", Expressions.alwaysTrue(), lastScanEvent.filter()); + Assert.assertEquals( + "Should not push down a filter", Expressions.alwaysTrue(), lastScanEvent.filter()); sqlNoPushDown = "SELECT * FROM " + TABLE_NAME + " WHERE data LIKE 'i%%g' "; resultLike = sql(sqlNoPushDown); Assert.assertEquals("Should have 1 record", 1, resultLike.size()); Assert.assertEquals("Should produce the expected record", expectRecord, resultLike.get(0)); - Assert.assertEquals("Should not push down a filter", Expressions.alwaysTrue(), lastScanEvent.filter()); + Assert.assertEquals( + "Should not push down a filter", Expressions.alwaysTrue(), lastScanEvent.filter()); } @Test public void testFilterPushDown2Literal() { String sql2Literal = String.format("SELECT * FROM %s WHERE 1 > 0 ", TABLE_NAME); List result = sql(sql2Literal); - List expectedRecords = Lists.newArrayList( - Row.of(1, "iceberg", 10.0), - Row.of(2, "b", 20.0), - Row.of(3, null, 30.0) - ); + List expectedRecords = + Lists.newArrayList(Row.of(1, "iceberg", 10.0), Row.of(2, "b", 20.0), Row.of(3, null, 30.0)); assertSameElements(expectedRecords, result); - Assert.assertEquals("Should not push down a filter", Expressions.alwaysTrue(), lastScanEvent.filter()); + Assert.assertEquals( + "Should not push down a filter", Expressions.alwaysTrue(), lastScanEvent.filter()); } /** - * NaN is not supported by flink now, so we add the test case to assert the parse error, 
when we upgrade the flink - * that supports NaN, we will delele the method, and add some test case to test NaN. + * NaN is not supported by flink now, so we add the test case to assert the parse error, when we + * upgrade the flink that supports NaN, we will delele the method, and add some test case to test + * NaN. */ @Test public void testSqlParseError() { - String sqlParseErrorEqual = String.format("SELECT * FROM %s WHERE d = CAST('NaN' AS DOUBLE) ", TABLE_NAME); - AssertHelpers.assertThrows("The NaN is not supported by flink now. ", - NumberFormatException.class, () -> sql(sqlParseErrorEqual)); - - String sqlParseErrorNotEqual = String.format("SELECT * FROM %s WHERE d <> CAST('NaN' AS DOUBLE) ", TABLE_NAME); - AssertHelpers.assertThrows("The NaN is not supported by flink now. ", - NumberFormatException.class, () -> sql(sqlParseErrorNotEqual)); - - String sqlParseErrorGT = String.format("SELECT * FROM %s WHERE d > CAST('NaN' AS DOUBLE) ", TABLE_NAME); - AssertHelpers.assertThrows("The NaN is not supported by flink now. ", - NumberFormatException.class, () -> sql(sqlParseErrorGT)); - - String sqlParseErrorLT = String.format("SELECT * FROM %s WHERE d < CAST('NaN' AS DOUBLE) ", TABLE_NAME); - AssertHelpers.assertThrows("The NaN is not supported by flink now. ", - NumberFormatException.class, () -> sql(sqlParseErrorLT)); - - String sqlParseErrorGTE = String.format("SELECT * FROM %s WHERE d >= CAST('NaN' AS DOUBLE) ", TABLE_NAME); - AssertHelpers.assertThrows("The NaN is not supported by flink now. ", - NumberFormatException.class, () -> sql(sqlParseErrorGTE)); - - String sqlParseErrorLTE = String.format("SELECT * FROM %s WHERE d <= CAST('NaN' AS DOUBLE) ", TABLE_NAME); - AssertHelpers.assertThrows("The NaN is not supported by flink now. ", - NumberFormatException.class, () -> sql(sqlParseErrorLTE)); + String sqlParseErrorEqual = + String.format("SELECT * FROM %s WHERE d = CAST('NaN' AS DOUBLE) ", TABLE_NAME); + AssertHelpers.assertThrows( + "The NaN is not supported by flink now. ", + NumberFormatException.class, + () -> sql(sqlParseErrorEqual)); + + String sqlParseErrorNotEqual = + String.format("SELECT * FROM %s WHERE d <> CAST('NaN' AS DOUBLE) ", TABLE_NAME); + AssertHelpers.assertThrows( + "The NaN is not supported by flink now. ", + NumberFormatException.class, + () -> sql(sqlParseErrorNotEqual)); + + String sqlParseErrorGT = + String.format("SELECT * FROM %s WHERE d > CAST('NaN' AS DOUBLE) ", TABLE_NAME); + AssertHelpers.assertThrows( + "The NaN is not supported by flink now. ", + NumberFormatException.class, + () -> sql(sqlParseErrorGT)); + + String sqlParseErrorLT = + String.format("SELECT * FROM %s WHERE d < CAST('NaN' AS DOUBLE) ", TABLE_NAME); + AssertHelpers.assertThrows( + "The NaN is not supported by flink now. ", + NumberFormatException.class, + () -> sql(sqlParseErrorLT)); + + String sqlParseErrorGTE = + String.format("SELECT * FROM %s WHERE d >= CAST('NaN' AS DOUBLE) ", TABLE_NAME); + AssertHelpers.assertThrows( + "The NaN is not supported by flink now. ", + NumberFormatException.class, + () -> sql(sqlParseErrorGTE)); + + String sqlParseErrorLTE = + String.format("SELECT * FROM %s WHERE d <= CAST('NaN' AS DOUBLE) ", TABLE_NAME); + AssertHelpers.assertThrows( + "The NaN is not supported by flink now. 
", + NumberFormatException.class, + () -> sql(sqlParseErrorLTE)); } } diff --git a/flink/v1.13/flink/src/test/java/org/apache/iceberg/flink/TestFlinkUpsert.java b/flink/v1.13/flink/src/test/java/org/apache/iceberg/flink/TestFlinkUpsert.java index f0afac17a9e7..d4b93bc9d4a2 100644 --- a/flink/v1.13/flink/src/test/java/org/apache/iceberg/flink/TestFlinkUpsert.java +++ b/flink/v1.13/flink/src/test/java/org/apache/iceberg/flink/TestFlinkUpsert.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.flink; import java.time.LocalDate; @@ -49,14 +48,14 @@ public class TestFlinkUpsert extends FlinkCatalogTestBase { public static final MiniClusterWithClientResource MINI_CLUSTER_RESOURCE = MiniClusterResource.createWithClassloaderCheckDisabled(); - @ClassRule - public static final TemporaryFolder TEMPORARY_FOLDER = new TemporaryFolder(); + @ClassRule public static final TemporaryFolder TEMPORARY_FOLDER = new TemporaryFolder(); private final boolean isStreamingJob; private final Map tableUpsertProps = Maps.newHashMap(); private TableEnvironment tEnv; - public TestFlinkUpsert(String catalogName, Namespace baseNamespace, FileFormat format, Boolean isStreamingJob) { + public TestFlinkUpsert( + String catalogName, Namespace baseNamespace, FileFormat format, Boolean isStreamingJob) { super(catalogName, baseNamespace); this.isStreamingJob = isStreamingJob; tableUpsertProps.put(TableProperties.FORMAT_VERSION, "2"); @@ -64,13 +63,16 @@ public TestFlinkUpsert(String catalogName, Namespace baseNamespace, FileFormat f tableUpsertProps.put(TableProperties.DEFAULT_FILE_FORMAT, format.name()); } - @Parameterized.Parameters(name = "catalogName={0}, baseNamespace={1}, format={2}, isStreaming={3}") + @Parameterized.Parameters( + name = "catalogName={0}, baseNamespace={1}, format={2}, isStreaming={3}") public static Iterable parameters() { List parameters = Lists.newArrayList(); - for (FileFormat format : new FileFormat[] {FileFormat.PARQUET, FileFormat.AVRO, FileFormat.ORC}) { + for (FileFormat format : + new FileFormat[] {FileFormat.PARQUET, FileFormat.AVRO, FileFormat.ORC}) { for (Boolean isStreaming : new Boolean[] {true, false}) { // Only test with one catalog as this is a file operation concern. - // FlinkCatalogTestBase requires the catalog name start with testhadoop if using hadoop catalog. + // FlinkCatalogTestBase requires the catalog name start with testhadoop if using hadoop + // catalog. 
String catalogName = "testhadoop"; Namespace baseNamespace = Namespace.of("default"); parameters.add(new Object[] {catalogName, baseNamespace, format, isStreaming}); @@ -83,12 +85,12 @@ public static Iterable parameters() { protected TableEnvironment getTableEnv() { if (tEnv == null) { synchronized (this) { - EnvironmentSettings.Builder settingsBuilder = EnvironmentSettings - .newInstance(); + EnvironmentSettings.Builder settingsBuilder = EnvironmentSettings.newInstance(); if (isStreamingJob) { settingsBuilder.inStreamingMode(); - StreamExecutionEnvironment env = StreamExecutionEnvironment - .getExecutionEnvironment(MiniClusterResource.DISABLE_CLASSLOADER_CHECK_CONFIG); + StreamExecutionEnvironment env = + StreamExecutionEnvironment.getExecutionEnvironment( + MiniClusterResource.DISABLE_CLASSLOADER_CHECK_CONFIG); env.enableCheckpointing(400); env.setMaxParallelism(2); env.setParallelism(2); @@ -124,33 +126,36 @@ public void testUpsertAndQuery() { LocalDate dt20220301 = LocalDate.of(2022, 3, 1); LocalDate dt20220302 = LocalDate.of(2022, 3, 2); - sql("CREATE TABLE %s(id INT NOT NULL, province STRING NOT NULL, dt DATE, PRIMARY KEY(id,province) NOT ENFORCED) " + - "PARTITIONED BY (province) WITH %s", + sql( + "CREATE TABLE %s(id INT NOT NULL, province STRING NOT NULL, dt DATE, PRIMARY KEY(id,province) NOT ENFORCED) " + + "PARTITIONED BY (province) WITH %s", tableName, toWithClause(tableUpsertProps)); try { - sql("INSERT INTO %s VALUES " + - "(1, 'a', DATE '2022-03-01')," + - "(2, 'b', DATE '2022-03-01')," + - "(1, 'b', DATE '2022-03-01')", + sql( + "INSERT INTO %s VALUES " + + "(1, 'a', DATE '2022-03-01')," + + "(2, 'b', DATE '2022-03-01')," + + "(1, 'b', DATE '2022-03-01')", tableName); - sql("INSERT INTO %s VALUES " + - "(4, 'a', DATE '2022-03-02')," + - "(5, 'b', DATE '2022-03-02')," + - "(1, 'b', DATE '2022-03-02')", + sql( + "INSERT INTO %s VALUES " + + "(4, 'a', DATE '2022-03-02')," + + "(5, 'b', DATE '2022-03-02')," + + "(1, 'b', DATE '2022-03-02')", tableName); - List rowsOn20220301 = Lists.newArrayList(Row.of(2, "b", dt20220301), Row.of(1, "a", dt20220301)); + List rowsOn20220301 = + Lists.newArrayList(Row.of(2, "b", dt20220301), Row.of(1, "a", dt20220301)); TestHelpers.assertRows( - sql("SELECT * FROM %s WHERE dt < '2022-03-02'", tableName), - rowsOn20220301); + sql("SELECT * FROM %s WHERE dt < '2022-03-02'", tableName), rowsOn20220301); - List rowsOn20220302 = Lists.newArrayList( - Row.of(1, "b", dt20220302), Row.of(4, "a", dt20220302), Row.of(5, "b", dt20220302)); + List rowsOn20220302 = + Lists.newArrayList( + Row.of(1, "b", dt20220302), Row.of(4, "a", dt20220302), Row.of(5, "b", dt20220302)); TestHelpers.assertRows( - sql("SELECT * FROM %s WHERE dt = '2022-03-02'", tableName), - rowsOn20220302); + sql("SELECT * FROM %s WHERE dt = '2022-03-02'", tableName), rowsOn20220302); TestHelpers.assertRows( sql("SELECT * FROM %s", tableName), @@ -165,33 +170,24 @@ public void testPrimaryKeyEqualToPartitionKey() { // This is an SQL based reproduction of TestFlinkIcebergSinkV2#testUpsertOnDataKey String tableName = "upsert_on_data_key"; try { - sql("CREATE TABLE %s(id INT NOT NULL, data STRING NOT NULL, PRIMARY KEY(data) NOT ENFORCED) " + - "PARTITIONED BY (data) WITH %s", + sql( + "CREATE TABLE %s(id INT NOT NULL, data STRING NOT NULL, PRIMARY KEY(data) NOT ENFORCED) " + + "PARTITIONED BY (data) WITH %s", tableName, toWithClause(tableUpsertProps)); - sql("INSERT INTO %s VALUES " + - "(1, 'aaa')," + - "(2, 'aaa')," + - "(3, 'bbb')", - tableName); + sql("INSERT INTO %s VALUES " + "(1, 'aaa')," 
+ "(2, 'aaa')," + "(3, 'bbb')", tableName); TestHelpers.assertRows( sql("SELECT * FROM %s", tableName), Lists.newArrayList(Row.of(2, "aaa"), Row.of(3, "bbb"))); - sql("INSERT INTO %s VALUES " + - "(4, 'aaa')," + - "(5, 'bbb')", - tableName); + sql("INSERT INTO %s VALUES " + "(4, 'aaa')," + "(5, 'bbb')", tableName); TestHelpers.assertRows( sql("SELECT * FROM %s", tableName), Lists.newArrayList(Row.of(4, "aaa"), Row.of(5, "bbb"))); - sql("INSERT INTO %s VALUES " + - "(6, 'aaa')," + - "(7, 'bbb')", - tableName); + sql("INSERT INTO %s VALUES " + "(6, 'aaa')," + "(7, 'bbb')", tableName); TestHelpers.assertRows( sql("SELECT * FROM %s", tableName), @@ -206,32 +202,36 @@ public void testPrimaryKeyFieldsAtBeginningOfSchema() { String tableName = "upsert_on_pk_at_schema_start"; LocalDate dt = LocalDate.of(2022, 3, 1); try { - sql("CREATE TABLE %s(data STRING NOT NULL, dt DATE NOT NULL, id INT, PRIMARY KEY(data,dt) NOT ENFORCED) " + - "PARTITIONED BY (data) WITH %s", + sql( + "CREATE TABLE %s(data STRING NOT NULL, dt DATE NOT NULL, id INT, PRIMARY KEY(data,dt) NOT ENFORCED) " + + "PARTITIONED BY (data) WITH %s", tableName, toWithClause(tableUpsertProps)); - sql("INSERT INTO %s VALUES " + - "('aaa', DATE '2022-03-01', 1)," + - "('aaa', DATE '2022-03-01', 2)," + - "('bbb', DATE '2022-03-01', 3)", + sql( + "INSERT INTO %s VALUES " + + "('aaa', DATE '2022-03-01', 1)," + + "('aaa', DATE '2022-03-01', 2)," + + "('bbb', DATE '2022-03-01', 3)", tableName); TestHelpers.assertRows( sql("SELECT * FROM %s", tableName), Lists.newArrayList(Row.of("aaa", dt, 2), Row.of("bbb", dt, 3))); - sql("INSERT INTO %s VALUES " + - "('aaa', DATE '2022-03-01', 4)," + - "('bbb', DATE '2022-03-01', 5)", + sql( + "INSERT INTO %s VALUES " + + "('aaa', DATE '2022-03-01', 4)," + + "('bbb', DATE '2022-03-01', 5)", tableName); TestHelpers.assertRows( sql("SELECT * FROM %s", tableName), Lists.newArrayList(Row.of("aaa", dt, 4), Row.of("bbb", dt, 5))); - sql("INSERT INTO %s VALUES " + - "('aaa', DATE '2022-03-01', 6)," + - "('bbb', DATE '2022-03-01', 7)", + sql( + "INSERT INTO %s VALUES " + + "('aaa', DATE '2022-03-01', 6)," + + "('bbb', DATE '2022-03-01', 7)", tableName); TestHelpers.assertRows( @@ -244,37 +244,42 @@ public void testPrimaryKeyFieldsAtBeginningOfSchema() { @Test public void testPrimaryKeyFieldsAtEndOfTableSchema() { - // This is the same test case as testPrimaryKeyFieldsAtBeginningOfSchema, but the primary key fields + // This is the same test case as testPrimaryKeyFieldsAtBeginningOfSchema, but the primary key + // fields // are located at the end of the flink schema. 
String tableName = "upsert_on_pk_at_schema_end"; LocalDate dt = LocalDate.of(2022, 3, 1); try { - sql("CREATE TABLE %s(id INT, data STRING NOT NULL, dt DATE NOT NULL, PRIMARY KEY(data,dt) NOT ENFORCED) " + - "PARTITIONED BY (data) WITH %s", + sql( + "CREATE TABLE %s(id INT, data STRING NOT NULL, dt DATE NOT NULL, PRIMARY KEY(data,dt) NOT ENFORCED) " + + "PARTITIONED BY (data) WITH %s", tableName, toWithClause(tableUpsertProps)); - sql("INSERT INTO %s VALUES " + - "(1, 'aaa', DATE '2022-03-01')," + - "(2, 'aaa', DATE '2022-03-01')," + - "(3, 'bbb', DATE '2022-03-01')", + sql( + "INSERT INTO %s VALUES " + + "(1, 'aaa', DATE '2022-03-01')," + + "(2, 'aaa', DATE '2022-03-01')," + + "(3, 'bbb', DATE '2022-03-01')", tableName); TestHelpers.assertRows( sql("SELECT * FROM %s", tableName), Lists.newArrayList(Row.of(2, "aaa", dt), Row.of(3, "bbb", dt))); - sql("INSERT INTO %s VALUES " + - "(4, 'aaa', DATE '2022-03-01')," + - "(5, 'bbb', DATE '2022-03-01')", + sql( + "INSERT INTO %s VALUES " + + "(4, 'aaa', DATE '2022-03-01')," + + "(5, 'bbb', DATE '2022-03-01')", tableName); TestHelpers.assertRows( sql("SELECT * FROM %s", tableName), Lists.newArrayList(Row.of(4, "aaa", dt), Row.of(5, "bbb", dt))); - sql("INSERT INTO %s VALUES " + - "(6, 'aaa', DATE '2022-03-01')," + - "(7, 'bbb', DATE '2022-03-01')", + sql( + "INSERT INTO %s VALUES " + + "(6, 'aaa', DATE '2022-03-01')," + + "(7, 'bbb', DATE '2022-03-01')", tableName); TestHelpers.assertRows( diff --git a/flink/v1.13/flink/src/test/java/org/apache/iceberg/flink/TestHelpers.java b/flink/v1.13/flink/src/test/java/org/apache/iceberg/flink/TestHelpers.java index 51c981ee531b..e840ba842bef 100644 --- a/flink/v1.13/flink/src/test/java/org/apache/iceberg/flink/TestHelpers.java +++ b/flink/v1.13/flink/src/test/java/org/apache/iceberg/flink/TestHelpers.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.flink; import java.io.IOException; @@ -67,8 +66,7 @@ import org.junit.Assert; public class TestHelpers { - private TestHelpers() { - } + private TestHelpers() {} public static T roundTripKryoSerialize(Class clazz, T table) throws IOException { KryoSerializer kryo = new KryoSerializer<>(clazz, new ExecutionConfig()); @@ -81,13 +79,15 @@ public static T roundTripKryoSerialize(Class clazz, T table) throws IOExc } public static RowData copyRowData(RowData from, RowType rowType) { - TypeSerializer[] fieldSerializers = rowType.getChildren().stream() - .map((LogicalType type) -> InternalSerializers.create(type)) - .toArray(TypeSerializer[]::new); + TypeSerializer[] fieldSerializers = + rowType.getChildren().stream() + .map((LogicalType type) -> InternalSerializers.create(type)) + .toArray(TypeSerializer[]::new); return RowDataUtil.clone(from, null, rowType, fieldSerializers); } - public static void readRowData(FlinkInputFormat input, Consumer visitor) throws IOException { + public static void readRowData(FlinkInputFormat input, Consumer visitor) + throws IOException { for (FlinkInputSplit s : input.createInputSplits(0)) { input.open(s); try { @@ -101,19 +101,21 @@ public static void readRowData(FlinkInputFormat input, Consumer visitor } } - public static List readRowData(FlinkInputFormat inputFormat, RowType rowType) throws IOException { + public static List readRowData(FlinkInputFormat inputFormat, RowType rowType) + throws IOException { List results = Lists.newArrayList(); readRowData(inputFormat, row -> results.add(copyRowData(row, rowType))); return results; } - public static List readRows(FlinkInputFormat inputFormat, RowType rowType) throws IOException { + public static List readRows(FlinkInputFormat inputFormat, RowType rowType) + throws IOException { return convertRowDataToRow(readRowData(inputFormat, rowType), rowType); } public static List convertRowDataToRow(List rowDataList, RowType rowType) { - DataStructureConverter converter = DataStructureConverters.getConverter( - TypeConversions.fromLogicalToDataType(rowType)); + DataStructureConverter converter = + DataStructureConverters.getConverter(TypeConversions.fromLogicalToDataType(rowType)); return rowDataList.stream() .map(converter::toExternal) .map(Row.class::cast) @@ -123,9 +125,12 @@ public static List convertRowDataToRow(List rowDataList, RowType r public static void assertRecords(List results, List expectedRecords, Schema schema) { List expected = Lists.newArrayList(); @SuppressWarnings("unchecked") - DataStructureConverter converter = (DataStructureConverter) DataStructureConverters.getConverter( - TypeConversions.fromLogicalToDataType(FlinkSchemaUtil.convert(schema))); - expectedRecords.forEach(r -> expected.add(converter.toExternal(RowDataConverter.convert(schema, r)))); + DataStructureConverter converter = + (DataStructureConverter) + DataStructureConverters.getConverter( + TypeConversions.fromLogicalToDataType(FlinkSchemaUtil.convert(schema))); + expectedRecords.forEach( + r -> expected.add(converter.toExternal(RowDataConverter.convert(schema, r)))); assertRows(results, expected); } @@ -141,13 +146,17 @@ public static void assertRowData(Schema schema, StructLike expected, RowData act assertRowData(schema.asStruct(), FlinkSchemaUtil.convert(schema), expected, actual); } - public static void assertRowData(Types.StructType structType, LogicalType rowType, StructLike expectedRecord, - RowData actualRowData) { + public static void assertRowData( + Types.StructType structType, + LogicalType rowType, 
+ StructLike expectedRecord, + RowData actualRowData) { if (expectedRecord == null && actualRowData == null) { return; } - Assert.assertTrue("expected Record and actual RowData should be both null or not null", + Assert.assertTrue( + "expected Record and actual RowData should be both null or not null", expectedRecord != null && actualRowData != null); List types = Lists.newArrayList(); @@ -158,24 +167,30 @@ public static void assertRowData(Types.StructType structType, LogicalType rowTyp for (int i = 0; i < types.size(); i += 1) { LogicalType logicalType = ((RowType) rowType).getTypeAt(i); Object expected = expectedRecord.get(i, Object.class); - // The RowData.createFieldGetter won't return null for the required field. But in the projection case, if we are - // projecting a nested required field from an optional struct, then we should give a null for the projected field - // if the outer struct value is null. So we need to check the nullable for actualRowData here. For more details + // The RowData.createFieldGetter won't return null for the required field. But in the + // projection case, if we are + // projecting a nested required field from an optional struct, then we should give a null for + // the projected field + // if the outer struct value is null. So we need to check the nullable for actualRowData here. + // For more details // please see issue #2738. - Object actual = actualRowData.isNullAt(i) ? null : - RowData.createFieldGetter(logicalType, i).getFieldOrNull(actualRowData); + Object actual = + actualRowData.isNullAt(i) + ? null + : RowData.createFieldGetter(logicalType, i).getFieldOrNull(actualRowData); assertEquals(types.get(i), logicalType, expected, actual); } } - private static void assertEquals(Type type, LogicalType logicalType, Object expected, Object actual) { + private static void assertEquals( + Type type, LogicalType logicalType, Object expected, Object actual) { if (expected == null && actual == null) { return; } - Assert.assertTrue("expected and actual should be both null or not null", - expected != null && actual != null); + Assert.assertTrue( + "expected and actual should be both null or not null", expected != null && actual != null); switch (type.typeId()) { case BOOLEAN: @@ -194,7 +209,9 @@ private static void assertEquals(Type type, LogicalType logicalType, Object expe Assert.assertEquals("double value should be equal", expected, actual); break; case STRING: - Assertions.assertThat(expected).as("Should expect a CharSequence").isInstanceOf(CharSequence.class); + Assertions.assertThat(expected) + .as("Should expect a CharSequence") + .isInstanceOf(CharSequence.class); Assert.assertEquals("string should be equal", String.valueOf(expected), actual.toString()); break; case DATE: @@ -203,40 +220,56 @@ private static void assertEquals(Type type, LogicalType logicalType, Object expe Assert.assertEquals("date should be equal", expected, date); break; case TIME: - Assertions.assertThat(expected).as("Should expect a LocalTime").isInstanceOf(LocalTime.class); + Assertions.assertThat(expected) + .as("Should expect a LocalTime") + .isInstanceOf(LocalTime.class); int milliseconds = (int) (((LocalTime) expected).toNanoOfDay() / 1000_000); Assert.assertEquals("time millis should be equal", milliseconds, actual); break; case TIMESTAMP: if (((Types.TimestampType) type).shouldAdjustToUTC()) { - Assertions.assertThat(expected).as("Should expect a OffsetDataTime").isInstanceOf(OffsetDateTime.class); + Assertions.assertThat(expected) + .as("Should expect a OffsetDataTime") + 
.isInstanceOf(OffsetDateTime.class); OffsetDateTime ts = (OffsetDateTime) expected; - Assert.assertEquals("OffsetDataTime should be equal", ts.toLocalDateTime(), + Assert.assertEquals( + "OffsetDataTime should be equal", + ts.toLocalDateTime(), ((TimestampData) actual).toLocalDateTime()); } else { - Assertions.assertThat(expected).as("Should expect a LocalDataTime").isInstanceOf(LocalDateTime.class); + Assertions.assertThat(expected) + .as("Should expect a LocalDataTime") + .isInstanceOf(LocalDateTime.class); LocalDateTime ts = (LocalDateTime) expected; - Assert.assertEquals("LocalDataTime should be equal", ts, - ((TimestampData) actual).toLocalDateTime()); + Assert.assertEquals( + "LocalDataTime should be equal", ts, ((TimestampData) actual).toLocalDateTime()); } break; case BINARY: - Assertions.assertThat(expected).as("Should expect a ByteBuffer").isInstanceOf(ByteBuffer.class); + Assertions.assertThat(expected) + .as("Should expect a ByteBuffer") + .isInstanceOf(ByteBuffer.class); Assert.assertEquals("binary should be equal", expected, ByteBuffer.wrap((byte[]) actual)); break; case DECIMAL: - Assertions.assertThat(expected).as("Should expect a BigDecimal").isInstanceOf(BigDecimal.class); + Assertions.assertThat(expected) + .as("Should expect a BigDecimal") + .isInstanceOf(BigDecimal.class); BigDecimal bd = (BigDecimal) expected; - Assert.assertEquals("decimal value should be equal", bd, - ((DecimalData) actual).toBigDecimal()); + Assert.assertEquals( + "decimal value should be equal", bd, ((DecimalData) actual).toBigDecimal()); break; case LIST: - Assertions.assertThat(expected).as("Should expect a Collection").isInstanceOf(Collection.class); + Assertions.assertThat(expected) + .as("Should expect a Collection") + .isInstanceOf(Collection.class); Collection expectedArrayData = (Collection) expected; ArrayData actualArrayData = (ArrayData) actual; LogicalType elementType = ((ArrayType) logicalType).getElementType(); - Assert.assertEquals("array length should be equal", expectedArrayData.size(), actualArrayData.size()); - assertArrayValues(type.asListType().elementType(), elementType, expectedArrayData, actualArrayData); + Assert.assertEquals( + "array length should be equal", expectedArrayData.size(), actualArrayData.size()); + assertArrayValues( + type.asListType().elementType(), elementType, expectedArrayData, actualArrayData); break; case MAP: Assertions.assertThat(expected).as("Should expect a Map").isInstanceOf(Map.class); @@ -248,7 +281,9 @@ private static void assertEquals(Type type, LogicalType logicalType, Object expe break; case UUID: Assertions.assertThat(expected).as("Should expect a UUID").isInstanceOf(UUID.class); - Assert.assertEquals("UUID should be equal", expected.toString(), + Assert.assertEquals( + "UUID should be equal", + expected.toString(), UUID.nameUUIDFromBytes((byte[]) actual).toString()); break; case FIXED: @@ -260,8 +295,8 @@ private static void assertEquals(Type type, LogicalType logicalType, Object expe } } - private static void assertArrayValues(Type type, LogicalType logicalType, Collection expectedArray, - ArrayData actualArray) { + private static void assertArrayValues( + Type type, LogicalType logicalType, Collection expectedArray, ArrayData actualArray) { List expectedElements = Lists.newArrayList(expectedArray); for (int i = 0; i < expectedArray.size(); i += 1) { if (expectedElements.get(i) == null) { @@ -271,12 +306,16 @@ private static void assertArrayValues(Type type, LogicalType logicalType, Collec Object expected = expectedElements.get(i); - 
assertEquals(type, logicalType, expected, + assertEquals( + type, + logicalType, + expected, ArrayData.createElementGetter(logicalType).getElementOrNull(actualArray, i)); } } - private static void assertMapValues(Types.MapType mapType, LogicalType type, Map expected, MapData actual) { + private static void assertMapValues( + Types.MapType mapType, LogicalType type, Map expected, MapData actual) { Assert.assertEquals("map size should be equal", expected.size(), actual.size()); ArrayData actualKeyArrayData = actual.keyArray(); @@ -305,7 +344,10 @@ private static void assertMapValues(Types.MapType mapType, LogicalType type, Map } Assert.assertNotNull("Should have a matching key", matchedActualKey); final int valueIndex = matchedKeyIndex; - assertEquals(valueType, actualValueType, entry.getValue(), + assertEquals( + valueType, + actualValueType, + entry.getValue(), valueGetter.getElementOrNull(actualValueArrayData, valueIndex)); } } @@ -319,31 +361,55 @@ public static void assertEquals(ManifestFile expected, ManifestFile actual) { Assert.assertEquals("Length must match", expected.length(), actual.length()); Assert.assertEquals("Spec id must match", expected.partitionSpecId(), actual.partitionSpecId()); Assert.assertEquals("ManifestContent must match", expected.content(), actual.content()); - Assert.assertEquals("SequenceNumber must match", expected.sequenceNumber(), actual.sequenceNumber()); - Assert.assertEquals("MinSequenceNumber must match", expected.minSequenceNumber(), actual.minSequenceNumber()); + Assert.assertEquals( + "SequenceNumber must match", expected.sequenceNumber(), actual.sequenceNumber()); + Assert.assertEquals( + "MinSequenceNumber must match", expected.minSequenceNumber(), actual.minSequenceNumber()); Assert.assertEquals("Snapshot id must match", expected.snapshotId(), actual.snapshotId()); - Assert.assertEquals("Added files flag must match", expected.hasAddedFiles(), actual.hasAddedFiles()); - Assert.assertEquals("Added files count must match", expected.addedFilesCount(), actual.addedFilesCount()); - Assert.assertEquals("Added rows count must match", expected.addedRowsCount(), actual.addedRowsCount()); - Assert.assertEquals("Existing files flag must match", expected.hasExistingFiles(), actual.hasExistingFiles()); - Assert.assertEquals("Existing files count must match", expected.existingFilesCount(), actual.existingFilesCount()); - Assert.assertEquals("Existing rows count must match", expected.existingRowsCount(), actual.existingRowsCount()); - Assert.assertEquals("Deleted files flag must match", expected.hasDeletedFiles(), actual.hasDeletedFiles()); - Assert.assertEquals("Deleted files count must match", expected.deletedFilesCount(), actual.deletedFilesCount()); - Assert.assertEquals("Deleted rows count must match", expected.deletedRowsCount(), actual.deletedRowsCount()); + Assert.assertEquals( + "Added files flag must match", expected.hasAddedFiles(), actual.hasAddedFiles()); + Assert.assertEquals( + "Added files count must match", expected.addedFilesCount(), actual.addedFilesCount()); + Assert.assertEquals( + "Added rows count must match", expected.addedRowsCount(), actual.addedRowsCount()); + Assert.assertEquals( + "Existing files flag must match", expected.hasExistingFiles(), actual.hasExistingFiles()); + Assert.assertEquals( + "Existing files count must match", + expected.existingFilesCount(), + actual.existingFilesCount()); + Assert.assertEquals( + "Existing rows count must match", expected.existingRowsCount(), actual.existingRowsCount()); + Assert.assertEquals( + 
"Deleted files flag must match", expected.hasDeletedFiles(), actual.hasDeletedFiles()); + Assert.assertEquals( + "Deleted files count must match", expected.deletedFilesCount(), actual.deletedFilesCount()); + Assert.assertEquals( + "Deleted rows count must match", expected.deletedRowsCount(), actual.deletedRowsCount()); List expectedSummaries = expected.partitions(); List actualSummaries = actual.partitions(); - Assert.assertEquals("PartitionFieldSummary size does not match", expectedSummaries.size(), actualSummaries.size()); + Assert.assertEquals( + "PartitionFieldSummary size does not match", + expectedSummaries.size(), + actualSummaries.size()); for (int i = 0; i < expectedSummaries.size(); i++) { - Assert.assertEquals("Null flag in partition must match", - expectedSummaries.get(i).containsNull(), actualSummaries.get(i).containsNull()); - Assert.assertEquals("NaN flag in partition must match", - expectedSummaries.get(i).containsNaN(), actualSummaries.get(i).containsNaN()); - Assert.assertEquals("Lower bounds in partition must match", - expectedSummaries.get(i).lowerBound(), actualSummaries.get(i).lowerBound()); - Assert.assertEquals("Upper bounds in partition must match", - expectedSummaries.get(i).upperBound(), actualSummaries.get(i).upperBound()); + Assert.assertEquals( + "Null flag in partition must match", + expectedSummaries.get(i).containsNull(), + actualSummaries.get(i).containsNull()); + Assert.assertEquals( + "NaN flag in partition must match", + expectedSummaries.get(i).containsNaN(), + actualSummaries.get(i).containsNaN()); + Assert.assertEquals( + "Lower bounds in partition must match", + expectedSummaries.get(i).lowerBound(), + actualSummaries.get(i).lowerBound()); + Assert.assertEquals( + "Upper bounds in partition must match", + expectedSummaries.get(i).upperBound(), + actualSummaries.get(i).upperBound()); } } @@ -358,7 +424,8 @@ public static void assertEquals(ContentFile expected, ContentFile actual) Assert.assertEquals("Format", expected.format(), actual.format()); Assert.assertEquals("Partition size", expected.partition().size(), actual.partition().size()); for (int i = 0; i < expected.partition().size(); i++) { - Assert.assertEquals("Partition data at index " + i, + Assert.assertEquals( + "Partition data at index " + i, expected.partition().get(i, Object.class), actual.partition().get(i, Object.class)); } @@ -371,6 +438,7 @@ public static void assertEquals(ContentFile expected, ContentFile actual) Assert.assertEquals("Upper bounds", expected.upperBounds(), actual.upperBounds()); Assert.assertEquals("Key metadata", expected.keyMetadata(), actual.keyMetadata()); Assert.assertEquals("Split offsets", expected.splitOffsets(), actual.splitOffsets()); - Assert.assertEquals("Equality field id list", actual.equalityFieldIds(), expected.equalityFieldIds()); + Assert.assertEquals( + "Equality field id list", actual.equalityFieldIds(), expected.equalityFieldIds()); } } diff --git a/flink/v1.13/flink/src/test/java/org/apache/iceberg/flink/TestIcebergConnector.java b/flink/v1.13/flink/src/test/java/org/apache/iceberg/flink/TestIcebergConnector.java index 93962a3d7e60..088e1cf4731e 100644 --- a/flink/v1.13/flink/src/test/java/org/apache/iceberg/flink/TestIcebergConnector.java +++ b/flink/v1.13/flink/src/test/java/org/apache/iceberg/flink/TestIcebergConnector.java @@ -16,8 +16,6 @@ * specific language governing permissions and limitations * under the License. 
*/ - - package org.apache.iceberg.flink; import java.io.IOException; @@ -54,8 +52,7 @@ public class TestIcebergConnector extends FlinkTestBase { private static final String TABLE_NAME = "test_table"; - @ClassRule - public static final TemporaryFolder WAREHOUSE = new TemporaryFolder(); + @ClassRule public static final TemporaryFolder WAREHOUSE = new TemporaryFolder(); private final String catalogName; private final Map properties; @@ -67,118 +64,106 @@ public static Iterable parameters() { return Lists.newArrayList( // Create iceberg table in the hadoop catalog and default database. new Object[] { - "testhadoop", - ImmutableMap.of( - "connector", "iceberg", - "catalog-type", "hadoop" - ), - true + "testhadoop", + ImmutableMap.of( + "connector", "iceberg", + "catalog-type", "hadoop"), + true }, new Object[] { - "testhadoop", - ImmutableMap.of( - "connector", "iceberg", - "catalog-type", "hadoop", - "catalog-table", "not_existing_table" - ), - true + "testhadoop", + ImmutableMap.of( + "connector", "iceberg", + "catalog-type", "hadoop", + "catalog-table", "not_existing_table"), + true }, new Object[] { - "testhadoop", - ImmutableMap.of( - "connector", "iceberg", - "catalog-type", "hadoop" - ), - false + "testhadoop", + ImmutableMap.of( + "connector", "iceberg", + "catalog-type", "hadoop"), + false }, // Create iceberg table in the hadoop catalog and not_existing_db. new Object[] { - "testhadoop", - ImmutableMap.of( - "connector", "iceberg", - "catalog-type", "hadoop", - "catalog-database", "not_existing_db" - ), - true + "testhadoop", + ImmutableMap.of( + "connector", "iceberg", + "catalog-type", "hadoop", + "catalog-database", "not_existing_db"), + true }, new Object[] { - "testhadoop", - ImmutableMap.of( - "connector", "iceberg", - "catalog-type", "hadoop", - "catalog-database", "not_existing_db", - "catalog-table", "not_existing_table" - ), - true + "testhadoop", + ImmutableMap.of( + "connector", "iceberg", + "catalog-type", "hadoop", + "catalog-database", "not_existing_db", + "catalog-table", "not_existing_table"), + true }, new Object[] { - "testhadoop", - ImmutableMap.of( - "connector", "iceberg", - "catalog-type", "hadoop", - "catalog-database", "not_existing_db" - ), - false + "testhadoop", + ImmutableMap.of( + "connector", "iceberg", + "catalog-type", "hadoop", + "catalog-database", "not_existing_db"), + false }, // Create iceberg table in the hive catalog and default database. new Object[] { - "testhive", - ImmutableMap.of( - "connector", "iceberg", - "catalog-type", "hive" - ), - true + "testhive", + ImmutableMap.of( + "connector", "iceberg", + "catalog-type", "hive"), + true }, new Object[] { - "testhive", - ImmutableMap.of( - "connector", "iceberg", - "catalog-type", "hive", - "catalog-table", "not_existing_table" - ), - true + "testhive", + ImmutableMap.of( + "connector", "iceberg", + "catalog-type", "hive", + "catalog-table", "not_existing_table"), + true }, new Object[] { - "testhive", - ImmutableMap.of( - "connector", "iceberg", - "catalog-type", "hive" - ), - false + "testhive", + ImmutableMap.of( + "connector", "iceberg", + "catalog-type", "hive"), + false }, // Create iceberg table in the hive catalog and not_existing_db. 
new Object[] { - "testhive", - ImmutableMap.of( - "connector", "iceberg", - "catalog-type", "hive", - "catalog-database", "not_existing_db" - ), - true + "testhive", + ImmutableMap.of( + "connector", "iceberg", + "catalog-type", "hive", + "catalog-database", "not_existing_db"), + true }, new Object[] { - "testhive", - ImmutableMap.of( - "connector", "iceberg", - "catalog-type", "hive", - "catalog-database", "not_existing_db", - "catalog-table", "not_existing_table" - ), - true + "testhive", + ImmutableMap.of( + "connector", "iceberg", + "catalog-type", "hive", + "catalog-database", "not_existing_db", + "catalog-table", "not_existing_table"), + true }, new Object[] { - "testhive", - ImmutableMap.of( - "connector", "iceberg", - "catalog-type", "hive", - "catalog-database", "not_existing_db" - ), - false - } - ); + "testhive", + ImmutableMap.of( + "connector", "iceberg", + "catalog-type", "hive", + "catalog-database", "not_existing_db"), + false + }); } - public TestIcebergConnector(String catalogName, Map properties, boolean isStreaming) { + public TestIcebergConnector( + String catalogName, Map properties, boolean isStreaming) { this.catalogName = catalogName; this.properties = properties; this.isStreaming = isStreaming; @@ -189,13 +174,13 @@ protected TableEnvironment getTableEnv() { if (tEnv == null) { synchronized (this) { if (tEnv == null) { - EnvironmentSettings.Builder settingsBuilder = EnvironmentSettings - .newInstance() - .useBlinkPlanner(); + EnvironmentSettings.Builder settingsBuilder = + EnvironmentSettings.newInstance().useBlinkPlanner(); if (isStreaming) { settingsBuilder.inStreamingMode(); - StreamExecutionEnvironment env = StreamExecutionEnvironment - .getExecutionEnvironment(MiniClusterResource.DISABLE_CLASSLOADER_CHECK_CONFIG); + StreamExecutionEnvironment env = + StreamExecutionEnvironment.getExecutionEnvironment( + MiniClusterResource.DISABLE_CLASSLOADER_CHECK_CONFIG); env.enableCheckpointing(400); env.setMaxParallelism(2); env.setParallelism(2); @@ -205,7 +190,8 @@ protected TableEnvironment getTableEnv() { tEnv = TableEnvironment.create(settingsBuilder.build()); } // Set only one parallelism. - tEnv.getConfig().getConfiguration() + tEnv.getConfig() + .getConfiguration() .set(CoreOptions.DEFAULT_PARALLELISM, 1) .set(FlinkConfigOptions.TABLE_EXEC_ICEBERG_INFER_SOURCE_PARALLELISM, false); } @@ -242,21 +228,24 @@ private void testCreateConnectorTable() { // Create table under the flink's current database. sql("CREATE TABLE %s (id BIGINT, data STRING) WITH %s", TABLE_NAME, toWithClause(tableProps)); sql("INSERT INTO %s VALUES (1, 'AAA'), (2, 'BBB'), (3, 'CCC')", TABLE_NAME); - Assert.assertEquals("Should have expected rows", + Assert.assertEquals( + "Should have expected rows", Sets.newHashSet(Row.of(1L, "AAA"), Row.of(2L, "BBB"), Row.of(3L, "CCC")), Sets.newHashSet(sql("SELECT * FROM %s", TABLE_NAME))); FlinkCatalogFactory factory = new FlinkCatalogFactory(); Catalog flinkCatalog = factory.createCatalog(catalogName, tableProps, new Configuration()); - Assert.assertTrue("Should have created the expected database", - flinkCatalog.databaseExists(databaseName())); - Assert.assertTrue("Should have created the expected table", + Assert.assertTrue( + "Should have created the expected database", flinkCatalog.databaseExists(databaseName())); + Assert.assertTrue( + "Should have created the expected table", flinkCatalog.tableExists(new ObjectPath(databaseName(), tableName()))); // Drop and create it again. 
sql("DROP TABLE %s", TABLE_NAME); sql("CREATE TABLE %s (id BIGINT, data STRING) WITH %s", TABLE_NAME, toWithClause(tableProps)); - Assert.assertEquals("Should have expected rows", + Assert.assertEquals( + "Should have expected rows", Sets.newHashSet(Row.of(1L, "AAA"), Row.of(2L, "BBB"), Row.of(3L, "CCC")), Sets.newHashSet(sql("SELECT * FROM %s", TABLE_NAME))); } @@ -274,7 +263,8 @@ public void testCatalogDatabaseConflictWithFlinkDatabase() { try { testCreateConnectorTable(); // Ensure that the table was created under the specific database. - AssertHelpers.assertThrows("Table should already exists", + AssertHelpers.assertThrows( + "Table should already exists", ValidationException.class, "Could not execute CreateTable in path", () -> sql("CREATE TABLE `default_catalog`.`%s`.`%s`", databaseName(), TABLE_NAME)); @@ -305,15 +295,14 @@ public void testConnectorTableInIcebergCatalog() { // Create a connector table in an iceberg catalog. sql("CREATE CATALOG `test_catalog` WITH %s", toWithClause(catalogProps)); try { - AssertHelpers.assertThrowsCause("Cannot create the iceberg connector table in iceberg catalog", + AssertHelpers.assertThrowsCause( + "Cannot create the iceberg connector table in iceberg catalog", IllegalArgumentException.class, "Cannot create the table with 'connector'='iceberg' table property in an iceberg catalog", - () -> sql("CREATE TABLE `test_catalog`.`%s`.`%s` (id BIGINT, data STRING) WITH %s", - FlinkCatalogFactory.DEFAULT_DATABASE_NAME, - TABLE_NAME, - toWithClause(tableProps) - ) - ); + () -> + sql( + "CREATE TABLE `test_catalog`.`%s`.`%s` (id BIGINT, data STRING) WITH %s", + FlinkCatalogFactory.DEFAULT_DATABASE_NAME, TABLE_NAME, toWithClause(tableProps))); } finally { sql("DROP CATALOG IF EXISTS `test_catalog`"); } diff --git a/flink/v1.13/flink/src/test/java/org/apache/iceberg/flink/TestManifestFileSerialization.java b/flink/v1.13/flink/src/test/java/org/apache/iceberg/flink/TestManifestFileSerialization.java index e90a9a469e47..6bd94e9ca61c 100644 --- a/flink/v1.13/flink/src/test/java/org/apache/iceberg/flink/TestManifestFileSerialization.java +++ b/flink/v1.13/flink/src/test/java/org/apache/iceberg/flink/TestManifestFileSerialization.java @@ -16,9 +16,11 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.flink; +import static org.apache.iceberg.types.Types.NestedField.optional; +import static org.apache.iceberg.types.Types.NestedField.required; + import java.io.ByteArrayInputStream; import java.io.ByteArrayOutputStream; import java.io.File; @@ -52,61 +54,62 @@ import org.junit.Test; import org.junit.rules.TemporaryFolder; -import static org.apache.iceberg.types.Types.NestedField.optional; -import static org.apache.iceberg.types.Types.NestedField.required; - public class TestManifestFileSerialization { - private static final Schema SCHEMA = new Schema( - required(1, "id", Types.LongType.get()), - optional(2, "data", Types.StringType.get()), - required(3, "date", Types.StringType.get()), - required(4, "double", Types.DoubleType.get())); - - private static final PartitionSpec SPEC = PartitionSpec - .builderFor(SCHEMA) - .identity("double") - .build(); - - private static final DataFile FILE_A = DataFiles.builder(SPEC) - .withPath("/path/to/data-1.parquet") - .withFileSizeInBytes(0) - .withPartition(org.apache.iceberg.TestHelpers.Row.of(1D)) - .withPartitionPath("double=1") - .withMetrics(new Metrics(5L, - null, // no column sizes - ImmutableMap.of(1, 5L, 2, 3L), // value count - ImmutableMap.of(1, 0L, 2, 2L), // null count - ImmutableMap.of(), // nan count - ImmutableMap.of(1, longToBuffer(0L)), // lower bounds - ImmutableMap.of(1, longToBuffer(4L)) // upper bounds - )) - .build(); - - private static final DataFile FILE_B = DataFiles.builder(SPEC) - .withPath("/path/to/data-2.parquet") - .withFileSizeInBytes(0) - .withPartition(org.apache.iceberg.TestHelpers.Row.of(Double.NaN)) - .withPartitionPath("double=NaN") - .withMetrics(new Metrics(1L, - null, // no column sizes - ImmutableMap.of(1, 1L, 4, 1L), // value count - ImmutableMap.of(1, 0L, 2, 0L), // null count - ImmutableMap.of(4, 1L), // nan count - ImmutableMap.of(1, longToBuffer(0L)), // lower bounds - ImmutableMap.of(1, longToBuffer(1L)) // upper bounds - )) - .build(); + private static final Schema SCHEMA = + new Schema( + required(1, "id", Types.LongType.get()), + optional(2, "data", Types.StringType.get()), + required(3, "date", Types.StringType.get()), + required(4, "double", Types.DoubleType.get())); + + private static final PartitionSpec SPEC = + PartitionSpec.builderFor(SCHEMA).identity("double").build(); + + private static final DataFile FILE_A = + DataFiles.builder(SPEC) + .withPath("/path/to/data-1.parquet") + .withFileSizeInBytes(0) + .withPartition(org.apache.iceberg.TestHelpers.Row.of(1D)) + .withPartitionPath("double=1") + .withMetrics( + new Metrics( + 5L, + null, // no column sizes + ImmutableMap.of(1, 5L, 2, 3L), // value count + ImmutableMap.of(1, 0L, 2, 2L), // null count + ImmutableMap.of(), // nan count + ImmutableMap.of(1, longToBuffer(0L)), // lower bounds + ImmutableMap.of(1, longToBuffer(4L)) // upper bounds + )) + .build(); + + private static final DataFile FILE_B = + DataFiles.builder(SPEC) + .withPath("/path/to/data-2.parquet") + .withFileSizeInBytes(0) + .withPartition(org.apache.iceberg.TestHelpers.Row.of(Double.NaN)) + .withPartitionPath("double=NaN") + .withMetrics( + new Metrics( + 1L, + null, // no column sizes + ImmutableMap.of(1, 1L, 4, 1L), // value count + ImmutableMap.of(1, 0L, 2, 0L), // null count + ImmutableMap.of(4, 1L), // nan count + ImmutableMap.of(1, longToBuffer(0L)), // lower bounds + ImmutableMap.of(1, longToBuffer(1L)) // upper bounds + )) + .build(); private static final FileIO FILE_IO = new HadoopFileIO(new Configuration()); - @Rule - public 
TemporaryFolder temp = new TemporaryFolder(); - + @Rule public TemporaryFolder temp = new TemporaryFolder(); @Test public void testKryoSerialization() throws IOException { - KryoSerializer kryo = new KryoSerializer<>(ManifestFile.class, new ExecutionConfig()); + KryoSerializer kryo = + new KryoSerializer<>(ManifestFile.class, new ExecutionConfig()); DataOutputSerializer outputView = new DataOutputSerializer(1024); @@ -138,7 +141,8 @@ public void testJavaSerialization() throws Exception { out.writeObject(GenericManifestFile.copyOf(manifest).build()); } - try (ObjectInputStream in = new ObjectInputStream(new ByteArrayInputStream(bytes.toByteArray()))) { + try (ObjectInputStream in = + new ObjectInputStream(new ByteArrayInputStream(bytes.toByteArray()))) { for (int i = 0; i < 3; i += 1) { Object obj = in.readObject(); Assertions.assertThat(obj).as("Should be a ManifestFile").isInstanceOf(ManifestFile.class); diff --git a/flink/v1.13/flink/src/test/java/org/apache/iceberg/flink/TestRowDataWrapper.java b/flink/v1.13/flink/src/test/java/org/apache/iceberg/flink/TestRowDataWrapper.java index 9012fc564bd1..ae4ab2844bc8 100644 --- a/flink/v1.13/flink/src/test/java/org/apache/iceberg/flink/TestRowDataWrapper.java +++ b/flink/v1.13/flink/src/test/java/org/apache/iceberg/flink/TestRowDataWrapper.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.flink; import java.util.Iterator; @@ -34,28 +33,32 @@ public class TestRowDataWrapper extends RecordWrapperTest { /** - * Flink's time type has been truncated to millis seconds, so we need a customized assert method to check the - * values. + * Flink's time type has been truncated to millis seconds, so we need a customized assert method + * to check the values. 
*/ @Override public void testTime() { - generateAndValidate(new Schema(TIME.fields()), (message, expectedWrapper, actualWrapper) -> { - for (int pos = 0; pos < TIME.fields().size(); pos++) { - Object expected = expectedWrapper.get().get(pos, Object.class); - Object actual = actualWrapper.get().get(pos, Object.class); - if (expected == actual) { - return; - } + generateAndValidate( + new Schema(TIME.fields()), + (message, expectedWrapper, actualWrapper) -> { + for (int pos = 0; pos < TIME.fields().size(); pos++) { + Object expected = expectedWrapper.get().get(pos, Object.class); + Object actual = actualWrapper.get().get(pos, Object.class); + if (expected == actual) { + return; + } - if (expected == null || actual == null) { - Assert.fail(String.format("The expected value is %s but actual value is %s", expected, actual)); - } + if (expected == null || actual == null) { + Assert.fail( + String.format( + "The expected value is %s but actual value is %s", expected, actual)); + } - int expectedMilliseconds = (int) ((long) expected / 1000_000); - int actualMilliseconds = (int) ((long) actual / 1000_000); - Assert.assertEquals(message, expectedMilliseconds, actualMilliseconds); - } - }); + int expectedMilliseconds = (int) ((long) expected / 1000_000); + int actualMilliseconds = (int) ((long) actual / 1000_000); + Assert.assertEquals(message, expectedMilliseconds, actualMilliseconds); + } + }); } @Override @@ -65,7 +68,8 @@ protected void generateAndValidate(Schema schema, RecordWrapperTest.AssertMethod Iterable rowDataList = RandomRowData.generate(schema, numRecords, 101L); InternalRecordWrapper recordWrapper = new InternalRecordWrapper(schema.asStruct()); - RowDataWrapper rowDataWrapper = new RowDataWrapper(FlinkSchemaUtil.convert(schema), schema.asStruct()); + RowDataWrapper rowDataWrapper = + new RowDataWrapper(FlinkSchemaUtil.convert(schema), schema.asStruct()); Iterator actual = recordList.iterator(); Iterator expected = rowDataList.iterator(); @@ -79,8 +83,10 @@ protected void generateAndValidate(Schema schema, RecordWrapperTest.AssertMethod StructLike recordStructLike = recordWrapper.wrap(actual.next()); StructLike rowDataStructLike = rowDataWrapper.wrap(expected.next()); - assertMethod.assertEquals("Should have expected StructLike values", - actualWrapper.set(recordStructLike), expectedWrapper.set(rowDataStructLike)); + assertMethod.assertEquals( + "Should have expected StructLike values", + actualWrapper.set(recordStructLike), + expectedWrapper.set(rowDataStructLike)); } Assert.assertFalse("Shouldn't have more record", actual.hasNext()); diff --git a/flink/v1.13/flink/src/test/java/org/apache/iceberg/flink/TestTableLoader.java b/flink/v1.13/flink/src/test/java/org/apache/iceberg/flink/TestTableLoader.java index 5f7ae29ec737..61a821a9ac5a 100644 --- a/flink/v1.13/flink/src/test/java/org/apache/iceberg/flink/TestTableLoader.java +++ b/flink/v1.13/flink/src/test/java/org/apache/iceberg/flink/TestTableLoader.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
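As a standalone illustration of the millisecond truncation described in the javadoc above (the example time value is made up): Iceberg keeps time-of-day at microsecond precision, while Flink's TIME type only carries milliseconds, so the two sides can only be expected to agree at millisecond granularity.

import java.time.LocalTime;

class TimePrecisionSketch {
  public static void main(String[] args) {
    // Iceberg represents a time value as microseconds since midnight.
    long icebergMicros = LocalTime.of(10, 30, 15, 123_456_000).toNanoOfDay() / 1_000;
    // Flink's TIME type is backed by an int holding milliseconds since midnight,
    // so the sub-millisecond part is dropped before comparing.
    int flinkMillis = (int) (icebergMicros / 1_000);
    System.out.println(icebergMicros + " micros -> " + flinkMillis + " millis");
  }
}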
*/ - package org.apache.iceberg.flink; import java.io.File; @@ -35,9 +34,7 @@ public TestTableLoader(String dir) { } @Override - public void open() { - - } + public void open() {} @Override public Table loadTable() { @@ -45,7 +42,5 @@ public Table loadTable() { } @Override - public void close() { - - } + public void close() {} } diff --git a/flink/v1.13/flink/src/test/java/org/apache/iceberg/flink/TestTableSerialization.java b/flink/v1.13/flink/src/test/java/org/apache/iceberg/flink/TestTableSerialization.java index 0d5d5a4ec34a..3ad1d53db8d5 100644 --- a/flink/v1.13/flink/src/test/java/org/apache/iceberg/flink/TestTableSerialization.java +++ b/flink/v1.13/flink/src/test/java/org/apache/iceberg/flink/TestTableSerialization.java @@ -16,9 +16,12 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.flink; +import static org.apache.iceberg.flink.TestHelpers.roundTripKryoSerialize; +import static org.apache.iceberg.types.Types.NestedField.optional; +import static org.apache.iceberg.types.Types.NestedField.required; + import java.io.File; import java.io.IOException; import java.util.Map; @@ -42,30 +45,22 @@ import org.junit.Test; import org.junit.rules.TemporaryFolder; -import static org.apache.iceberg.flink.TestHelpers.roundTripKryoSerialize; -import static org.apache.iceberg.types.Types.NestedField.optional; -import static org.apache.iceberg.types.Types.NestedField.required; - public class TestTableSerialization { private static final HadoopTables TABLES = new HadoopTables(); - private static final Schema SCHEMA = new Schema( - required(1, "id", Types.LongType.get()), - optional(2, "data", Types.StringType.get()), - required(3, "date", Types.StringType.get()), - optional(4, "double", Types.DoubleType.get())); + private static final Schema SCHEMA = + new Schema( + required(1, "id", Types.LongType.get()), + optional(2, "data", Types.StringType.get()), + required(3, "date", Types.StringType.get()), + optional(4, "double", Types.DoubleType.get())); - private static final PartitionSpec SPEC = PartitionSpec - .builderFor(SCHEMA) - .identity("date") - .build(); + private static final PartitionSpec SPEC = + PartitionSpec.builderFor(SCHEMA).identity("date").build(); - private static final SortOrder SORT_ORDER = SortOrder.builderFor(SCHEMA) - .asc("id") - .build(); + private static final SortOrder SORT_ORDER = SortOrder.builderFor(SCHEMA).asc("id").build(); - @Rule - public TemporaryFolder temp = new TemporaryFolder(); + @Rule public TemporaryFolder temp = new TemporaryFolder(); private Table table; @Before @@ -81,16 +76,18 @@ public void initTable() throws IOException { @Test public void testSerializableTableKryoSerialization() throws IOException { SerializableTable serializableTable = (SerializableTable) SerializableTable.copyOf(table); - org.apache.iceberg.TestHelpers.assertSerializedAndLoadedMetadata(table, - roundTripKryoSerialize(SerializableTable.class, serializableTable)); + org.apache.iceberg.TestHelpers.assertSerializedAndLoadedMetadata( + table, roundTripKryoSerialize(SerializableTable.class, serializableTable)); } @Test public void testSerializableMetadataTableKryoSerialization() throws IOException { for (MetadataTableType type : MetadataTableType.values()) { TableOperations ops = ((HasTableOperations) table).operations(); - Table metadataTable = MetadataTableUtils.createMetadataTableInstance(ops, table.name(), "meta", type); - SerializableTable serializableMetadataTable = (SerializableTable) 
SerializableTable.copyOf(metadataTable); + Table metadataTable = + MetadataTableUtils.createMetadataTableInstance(ops, table.name(), "meta", type); + SerializableTable serializableMetadataTable = + (SerializableTable) SerializableTable.copyOf(metadataTable); org.apache.iceberg.TestHelpers.assertSerializedAndLoadedMetadata( metadataTable, @@ -102,14 +99,12 @@ public void testSerializableMetadataTableKryoSerialization() throws IOException public void testSerializableTransactionTableKryoSerialization() throws IOException { Transaction txn = table.newTransaction(); - txn.updateProperties() - .set("k1", "v1") - .commit(); + txn.updateProperties().set("k1", "v1").commit(); Table txnTable = txn.table(); SerializableTable serializableTxnTable = (SerializableTable) SerializableTable.copyOf(txnTable); - TestHelpers.assertSerializedMetadata(txnTable, - roundTripKryoSerialize(SerializableTable.class, serializableTxnTable)); + TestHelpers.assertSerializedMetadata( + txnTable, roundTripKryoSerialize(SerializableTable.class, serializableTxnTable)); } } diff --git a/flink/v1.13/flink/src/test/java/org/apache/iceberg/flink/actions/TestRewriteDataFilesAction.java b/flink/v1.13/flink/src/test/java/org/apache/iceberg/flink/actions/TestRewriteDataFilesAction.java index 03cdcd80dec1..e59d7dacd978 100644 --- a/flink/v1.13/flink/src/test/java/org/apache/iceberg/flink/actions/TestRewriteDataFilesAction.java +++ b/flink/v1.13/flink/src/test/java/org/apache/iceberg/flink/actions/TestRewriteDataFilesAction.java @@ -16,10 +16,10 @@ * specific language governing permissions and limitations * under the License. */ - - package org.apache.iceberg.flink.actions; +import static org.apache.iceberg.flink.SimpleDataUtil.RECORD; + import java.io.File; import java.io.IOException; import java.util.List; @@ -58,8 +58,6 @@ import org.junit.runner.RunWith; import org.junit.runners.Parameterized; -import static org.apache.iceberg.flink.SimpleDataUtil.RECORD; - @RunWith(Parameterized.class) public class TestRewriteDataFilesAction extends FlinkCatalogTestBase { @@ -69,24 +67,23 @@ public class TestRewriteDataFilesAction extends FlinkCatalogTestBase { private Table icebergTableUnPartitioned; private Table icebergTablePartitioned; - public TestRewriteDataFilesAction(String catalogName, Namespace baseNamespace, FileFormat format) { + public TestRewriteDataFilesAction( + String catalogName, Namespace baseNamespace, FileFormat format) { super(catalogName, baseNamespace); this.format = format; } @Override protected TableEnvironment getTableEnv() { - super.getTableEnv() - .getConfig() - .getConfiguration() - .set(CoreOptions.DEFAULT_PARALLELISM, 1); + super.getTableEnv().getConfig().getConfiguration().set(CoreOptions.DEFAULT_PARALLELISM, 1); return super.getTableEnv(); } @Parameterized.Parameters(name = "catalogName={0}, baseNamespace={1}, format={2}") public static Iterable parameters() { List parameters = Lists.newArrayList(); - for (FileFormat format : new FileFormat[] {FileFormat.AVRO, FileFormat.ORC, FileFormat.PARQUET}) { + for (FileFormat format : + new FileFormat[] {FileFormat.AVRO, FileFormat.ORC, FileFormat.PARQUET}) { for (Object[] catalogParams : FlinkCatalogTestBase.parameters()) { String catalogName = (String) catalogParams[0]; Namespace baseNamespace = (Namespace) catalogParams[1]; @@ -96,8 +93,7 @@ public static Iterable parameters() { return parameters; } - @Rule - public TemporaryFolder temp = new TemporaryFolder(); + @Rule public TemporaryFolder temp = new TemporaryFolder(); @Override @Before @@ -106,16 +102,18 @@ public 
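The serialization tests above all reduce to the same Flink Kryo round trip; a minimal sketch of that pattern follows, with the buffer size arbitrary and error handling omitted.

import java.io.IOException;
import org.apache.flink.api.common.ExecutionConfig;
import org.apache.flink.api.java.typeutils.runtime.kryo.KryoSerializer;
import org.apache.flink.core.memory.DataInputDeserializer;
import org.apache.flink.core.memory.DataOutputSerializer;
import org.apache.iceberg.ManifestFile;

class KryoRoundTripSketch {
  static ManifestFile roundTrip(ManifestFile manifest) throws IOException {
    // Serializer bound to the concrete class, as in testKryoSerialization above.
    KryoSerializer<ManifestFile> kryo =
        new KryoSerializer<>(ManifestFile.class, new ExecutionConfig());
    DataOutputSerializer out = new DataOutputSerializer(1024);
    kryo.serialize(manifest, out);
    // Deserialize from a copy of the written bytes and hand back the result.
    DataInputDeserializer in = new DataInputDeserializer(out.getCopyOfBuffer());
    return kryo.deserialize(in);
  }
}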
void before() { sql("CREATE DATABASE %s", flinkDatabase); sql("USE CATALOG %s", catalogName); sql("USE %s", DATABASE); - sql("CREATE TABLE %s (id int, data varchar) with ('write.format.default'='%s')", TABLE_NAME_UNPARTITIONED, - format.name()); - icebergTableUnPartitioned = validationCatalog.loadTable(TableIdentifier.of(icebergNamespace, - TABLE_NAME_UNPARTITIONED)); - - sql("CREATE TABLE %s (id int, data varchar,spec varchar) " + - " PARTITIONED BY (data,spec) with ('write.format.default'='%s')", + sql( + "CREATE TABLE %s (id int, data varchar) with ('write.format.default'='%s')", + TABLE_NAME_UNPARTITIONED, format.name()); + icebergTableUnPartitioned = + validationCatalog.loadTable(TableIdentifier.of(icebergNamespace, TABLE_NAME_UNPARTITIONED)); + + sql( + "CREATE TABLE %s (id int, data varchar,spec varchar) " + + " PARTITIONED BY (data,spec) with ('write.format.default'='%s')", TABLE_NAME_PARTITIONED, format.name()); - icebergTablePartitioned = validationCatalog.loadTable(TableIdentifier.of(icebergNamespace, - TABLE_NAME_PARTITIONED)); + icebergTablePartitioned = + validationCatalog.loadTable(TableIdentifier.of(icebergNamespace, TABLE_NAME_PARTITIONED)); } @Override @@ -130,13 +128,10 @@ public void clean() { @Test public void testRewriteDataFilesEmptyTable() throws Exception { Assert.assertNull("Table must be empty", icebergTableUnPartitioned.currentSnapshot()); - Actions.forTable(icebergTableUnPartitioned) - .rewriteDataFiles() - .execute(); + Actions.forTable(icebergTableUnPartitioned).rewriteDataFiles().execute(); Assert.assertNull("Table must stay empty", icebergTableUnPartitioned.currentSnapshot()); } - @Test public void testRewriteDataFilesUnpartitionedTable() throws Exception { sql("INSERT INTO %s SELECT 1, 'hello'", TABLE_NAME_UNPARTITIONED); @@ -145,13 +140,12 @@ public void testRewriteDataFilesUnpartitionedTable() throws Exception { icebergTableUnPartitioned.refresh(); CloseableIterable tasks = icebergTableUnPartitioned.newScan().planFiles(); - List dataFiles = Lists.newArrayList(CloseableIterable.transform(tasks, FileScanTask::file)); + List dataFiles = + Lists.newArrayList(CloseableIterable.transform(tasks, FileScanTask::file)); Assert.assertEquals("Should have 2 data files before rewrite", 2, dataFiles.size()); RewriteDataFilesActionResult result = - Actions.forTable(icebergTableUnPartitioned) - .rewriteDataFiles() - .execute(); + Actions.forTable(icebergTableUnPartitioned).rewriteDataFiles().execute(); Assert.assertEquals("Action should rewrite 2 data files", 2, result.deletedDataFiles().size()); Assert.assertEquals("Action should add 1 data file", 1, result.addedDataFiles().size()); @@ -159,14 +153,15 @@ public void testRewriteDataFilesUnpartitionedTable() throws Exception { icebergTableUnPartitioned.refresh(); CloseableIterable tasks1 = icebergTableUnPartitioned.newScan().planFiles(); - List dataFiles1 = Lists.newArrayList(CloseableIterable.transform(tasks1, FileScanTask::file)); + List dataFiles1 = + Lists.newArrayList(CloseableIterable.transform(tasks1, FileScanTask::file)); Assert.assertEquals("Should have 1 data files after rewrite", 1, dataFiles1.size()); // Assert the table records as expected. 
- SimpleDataUtil.assertTableRecords(icebergTableUnPartitioned, Lists.newArrayList( - SimpleDataUtil.createRecord(1, "hello"), - SimpleDataUtil.createRecord(2, "world") - )); + SimpleDataUtil.assertTableRecords( + icebergTableUnPartitioned, + Lists.newArrayList( + SimpleDataUtil.createRecord(1, "hello"), SimpleDataUtil.createRecord(2, "world"))); } @Test @@ -179,13 +174,12 @@ public void testRewriteDataFilesPartitionedTable() throws Exception { icebergTablePartitioned.refresh(); CloseableIterable tasks = icebergTablePartitioned.newScan().planFiles(); - List dataFiles = Lists.newArrayList(CloseableIterable.transform(tasks, FileScanTask::file)); + List dataFiles = + Lists.newArrayList(CloseableIterable.transform(tasks, FileScanTask::file)); Assert.assertEquals("Should have 4 data files before rewrite", 4, dataFiles.size()); RewriteDataFilesActionResult result = - Actions.forTable(icebergTablePartitioned) - .rewriteDataFiles() - .execute(); + Actions.forTable(icebergTablePartitioned).rewriteDataFiles().execute(); Assert.assertEquals("Action should rewrite 4 data files", 4, result.deletedDataFiles().size()); Assert.assertEquals("Action should add 2 data file", 2, result.addedDataFiles().size()); @@ -193,26 +187,27 @@ public void testRewriteDataFilesPartitionedTable() throws Exception { icebergTablePartitioned.refresh(); CloseableIterable tasks1 = icebergTablePartitioned.newScan().planFiles(); - List dataFiles1 = Lists.newArrayList(CloseableIterable.transform(tasks1, FileScanTask::file)); + List dataFiles1 = + Lists.newArrayList(CloseableIterable.transform(tasks1, FileScanTask::file)); Assert.assertEquals("Should have 2 data files after rewrite", 2, dataFiles1.size()); // Assert the table records as expected. - Schema schema = new Schema( - Types.NestedField.optional(1, "id", Types.IntegerType.get()), - Types.NestedField.optional(2, "data", Types.StringType.get()), - Types.NestedField.optional(3, "spec", Types.StringType.get()) - ); + Schema schema = + new Schema( + Types.NestedField.optional(1, "id", Types.IntegerType.get()), + Types.NestedField.optional(2, "data", Types.StringType.get()), + Types.NestedField.optional(3, "spec", Types.StringType.get())); Record record = GenericRecord.create(schema); - SimpleDataUtil.assertTableRecords(icebergTablePartitioned, Lists.newArrayList( - record.copy("id", 1, "data", "hello", "spec", "a"), - record.copy("id", 2, "data", "hello", "spec", "a"), - record.copy("id", 3, "data", "world", "spec", "b"), - record.copy("id", 4, "data", "world", "spec", "b") - )); + SimpleDataUtil.assertTableRecords( + icebergTablePartitioned, + Lists.newArrayList( + record.copy("id", 1, "data", "hello", "spec", "a"), + record.copy("id", 2, "data", "hello", "spec", "a"), + record.copy("id", 3, "data", "world", "spec", "b"), + record.copy("id", 4, "data", "world", "spec", "b"))); } - @Test public void testRewriteDataFilesWithFilter() throws Exception { sql("INSERT INTO %s SELECT 1, 'hello' ,'a'", TABLE_NAME_PARTITIONED); @@ -224,7 +219,8 @@ public void testRewriteDataFilesWithFilter() throws Exception { icebergTablePartitioned.refresh(); CloseableIterable tasks = icebergTablePartitioned.newScan().planFiles(); - List dataFiles = Lists.newArrayList(CloseableIterable.transform(tasks, FileScanTask::file)); + List dataFiles = + Lists.newArrayList(CloseableIterable.transform(tasks, FileScanTask::file)); Assert.assertEquals("Should have 5 data files before rewrite", 5, dataFiles.size()); RewriteDataFilesActionResult result = @@ -240,24 +236,26 @@ public void 
testRewriteDataFilesWithFilter() throws Exception { icebergTablePartitioned.refresh(); CloseableIterable tasks1 = icebergTablePartitioned.newScan().planFiles(); - List dataFiles1 = Lists.newArrayList(CloseableIterable.transform(tasks1, FileScanTask::file)); + List dataFiles1 = + Lists.newArrayList(CloseableIterable.transform(tasks1, FileScanTask::file)); Assert.assertEquals("Should have 4 data files after rewrite", 4, dataFiles1.size()); // Assert the table records as expected. - Schema schema = new Schema( - Types.NestedField.optional(1, "id", Types.IntegerType.get()), - Types.NestedField.optional(2, "data", Types.StringType.get()), - Types.NestedField.optional(3, "spec", Types.StringType.get()) - ); + Schema schema = + new Schema( + Types.NestedField.optional(1, "id", Types.IntegerType.get()), + Types.NestedField.optional(2, "data", Types.StringType.get()), + Types.NestedField.optional(3, "spec", Types.StringType.get())); Record record = GenericRecord.create(schema); - SimpleDataUtil.assertTableRecords(icebergTablePartitioned, Lists.newArrayList( - record.copy("id", 1, "data", "hello", "spec", "a"), - record.copy("id", 2, "data", "hello", "spec", "a"), - record.copy("id", 3, "data", "world", "spec", "a"), - record.copy("id", 4, "data", "world", "spec", "b"), - record.copy("id", 5, "data", "world", "spec", "b") - )); + SimpleDataUtil.assertTableRecords( + icebergTablePartitioned, + Lists.newArrayList( + record.copy("id", 1, "data", "hello", "spec", "a"), + record.copy("id", 2, "data", "hello", "spec", "a"), + record.copy("id", 3, "data", "world", "spec", "a"), + record.copy("id", 4, "data", "world", "spec", "b"), + record.copy("id", 5, "data", "world", "spec", "b"))); } @Test @@ -285,22 +283,23 @@ public void testRewriteLargeTableHasResiduals() throws IOException { icebergTableUnPartitioned.refresh(); - CloseableIterable tasks = icebergTableUnPartitioned.newScan() - .ignoreResiduals() - .filter(Expressions.equal("data", "0")) - .planFiles(); + CloseableIterable tasks = + icebergTableUnPartitioned + .newScan() + .ignoreResiduals() + .filter(Expressions.equal("data", "0")) + .planFiles(); for (FileScanTask task : tasks) { Assert.assertEquals("Residuals must be ignored", Expressions.alwaysTrue(), task.residual()); } - List dataFiles = Lists.newArrayList(CloseableIterable.transform(tasks, FileScanTask::file)); + List dataFiles = + Lists.newArrayList(CloseableIterable.transform(tasks, FileScanTask::file)); Assert.assertEquals("Should have 2 data files before rewrite", 2, dataFiles.size()); Actions actions = Actions.forTable(icebergTableUnPartitioned); - RewriteDataFilesActionResult result = actions - .rewriteDataFiles() - .filter(Expressions.equal("data", "0")) - .execute(); + RewriteDataFilesActionResult result = + actions.rewriteDataFiles().filter(Expressions.equal("data", "0")).execute(); Assert.assertEquals("Action should rewrite 2 data files", 2, result.deletedDataFiles().size()); Assert.assertEquals("Action should add 1 data file", 1, result.addedDataFiles().size()); @@ -310,13 +309,14 @@ public void testRewriteLargeTableHasResiduals() throws IOException { /** * a test case to test avoid repeate compress - *

- * If datafile cannot be combined to CombinedScanTask with other DataFiles, the size of the CombinedScanTask list size - * is 1, so we remove these CombinedScanTasks to avoid compressed repeatedly. - * <p>
- * In this test case,we generated 3 data files and set targetSizeInBytes greater than the largest file size so that it - * cannot be combined a CombinedScanTask with other datafiles. The datafile with the largest file size will not be - * compressed. + * + *
<p>If datafile cannot be combined to CombinedScanTask with other DataFiles, the size of the + * CombinedScanTask list size is 1, so we remove these CombinedScanTasks to avoid compressed + * repeatedly. + * + * <p>
In this test case,we generated 3 data files and set targetSizeInBytes greater than the + * largest file size so that it cannot be combined a CombinedScanTask with other datafiles. The + * datafile with the largest file size will not be compressed. * * @throws IOException IOException */ @@ -327,7 +327,8 @@ public void testRewriteAvoidRepeateCompress() throws IOException { GenericAppenderFactory genericAppenderFactory = new GenericAppenderFactory(schema); File file = temp.newFile(); int count = 0; - try (FileAppender fileAppender = genericAppenderFactory.newAppender(Files.localOutput(file), format)) { + try (FileAppender fileAppender = + genericAppenderFactory.newAppender(Files.localOutput(file), format)) { long filesize = 20000; for (; fileAppender.length() < filesize; count++) { Record record = SimpleDataUtil.createRecord(count, UUID.randomUUID().toString()); @@ -336,16 +337,15 @@ public void testRewriteAvoidRepeateCompress() throws IOException { } } - DataFile dataFile = DataFiles.builder(icebergTableUnPartitioned.spec()) - .withPath(file.getAbsolutePath()) - .withFileSizeInBytes(file.length()) - .withFormat(format) - .withRecordCount(count) - .build(); + DataFile dataFile = + DataFiles.builder(icebergTableUnPartitioned.spec()) + .withPath(file.getAbsolutePath()) + .withFileSizeInBytes(file.length()) + .withFormat(format) + .withRecordCount(count) + .build(); - icebergTableUnPartitioned.newAppend() - .appendFile(dataFile) - .commit(); + icebergTableUnPartitioned.newAppend().appendFile(dataFile).commit(); sql("INSERT INTO %s SELECT 1,'a' ", TABLE_NAME_UNPARTITIONED); sql("INSERT INTO %s SELECT 2,'b' ", TABLE_NAME_UNPARTITIONED); @@ -353,28 +353,32 @@ public void testRewriteAvoidRepeateCompress() throws IOException { icebergTableUnPartitioned.refresh(); CloseableIterable tasks = icebergTableUnPartitioned.newScan().planFiles(); - List dataFiles = Lists.newArrayList(CloseableIterable.transform(tasks, FileScanTask::file)); + List dataFiles = + Lists.newArrayList(CloseableIterable.transform(tasks, FileScanTask::file)); Assert.assertEquals("Should have 3 data files before rewrite", 3, dataFiles.size()); Actions actions = Actions.forTable(icebergTableUnPartitioned); long targetSizeInBytes = file.length() + 10; - RewriteDataFilesActionResult result = actions - .rewriteDataFiles() - .targetSizeInBytes(targetSizeInBytes) - .splitOpenFileCost(1) - .execute(); + RewriteDataFilesActionResult result = + actions + .rewriteDataFiles() + .targetSizeInBytes(targetSizeInBytes) + .splitOpenFileCost(1) + .execute(); Assert.assertEquals("Action should rewrite 2 data files", 2, result.deletedDataFiles().size()); Assert.assertEquals("Action should add 1 data file", 1, result.addedDataFiles().size()); icebergTableUnPartitioned.refresh(); CloseableIterable tasks1 = icebergTableUnPartitioned.newScan().planFiles(); - List dataFilesRewrote = Lists.newArrayList(CloseableIterable.transform(tasks1, FileScanTask::file)); + List dataFilesRewrote = + Lists.newArrayList(CloseableIterable.transform(tasks1, FileScanTask::file)); Assert.assertEquals("Should have 2 data files after rewrite", 2, dataFilesRewrote.size()); // the biggest file do not be rewrote - List rewroteDataFileNames = dataFilesRewrote.stream().map(ContentFile::path).collect(Collectors.toList()); + List rewroteDataFileNames = + dataFilesRewrote.stream().map(ContentFile::path).collect(Collectors.toList()); Assert.assertTrue(rewroteDataFileNames.contains(file.getAbsolutePath())); // Assert the table records as expected. 
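Stripped of its assertions, the compaction flow these rewrite tests drive is a single action chain. The sketch below assumes `table` is an already-loaded Iceberg table; the 128 MB target size and the filter predicate are placeholders, and the result class is assumed to live in the core actions package.

import org.apache.iceberg.Table;
import org.apache.iceberg.actions.RewriteDataFilesActionResult;
import org.apache.iceberg.expressions.Expressions;
import org.apache.iceberg.flink.actions.Actions;

class RewriteDataFilesSketch {
  static void compact(Table table) {
    RewriteDataFilesActionResult result =
        Actions.forTable(table)
            .rewriteDataFiles()
            .filter(Expressions.equal("data", "0")) // optional: only rewrite matching files
            .targetSizeInBytes(128L * 1024 * 1024) // upper bound for combined output files
            .splitOpenFileCost(1) // bias planning toward grouping small files together
            .execute();
    System.out.printf(
        "rewrote %d data files into %d%n",
        result.deletedDataFiles().size(), result.addedDataFiles().size());
  }
}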
diff --git a/flink/v1.13/flink/src/test/java/org/apache/iceberg/flink/data/RandomRowData.java b/flink/v1.13/flink/src/test/java/org/apache/iceberg/flink/data/RandomRowData.java index b6fee9259f53..cc58d9817ac6 100644 --- a/flink/v1.13/flink/src/test/java/org/apache/iceberg/flink/data/RandomRowData.java +++ b/flink/v1.13/flink/src/test/java/org/apache/iceberg/flink/data/RandomRowData.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.flink.data; import org.apache.flink.table.data.RowData; @@ -27,8 +26,7 @@ import org.apache.iceberg.relocated.com.google.common.collect.Iterables; public class RandomRowData { - private RandomRowData() { - } + private RandomRowData() {} public static Iterable generate(Schema schema, int numRecords, long seed) { return convert(schema, RandomGenericData.generate(schema, numRecords, seed)); diff --git a/flink/v1.13/flink/src/test/java/org/apache/iceberg/flink/data/TestFlinkAvroReaderWriter.java b/flink/v1.13/flink/src/test/java/org/apache/iceberg/flink/data/TestFlinkAvroReaderWriter.java index 88288f893eaf..64acecfb0415 100644 --- a/flink/v1.13/flink/src/test/java/org/apache/iceberg/flink/data/TestFlinkAvroReaderWriter.java +++ b/flink/v1.13/flink/src/test/java/org/apache/iceberg/flink/data/TestFlinkAvroReaderWriter.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.flink.data; import java.io.File; @@ -53,18 +52,21 @@ protected void writeAndValidate(Schema schema) throws IOException { File recordsFile = temp.newFile(); Assert.assertTrue("Delete should succeed", recordsFile.delete()); - // Write the expected records into AVRO file, then read them into RowData and assert with the expected Record list. - try (FileAppender writer = Avro.write(Files.localOutput(recordsFile)) - .schema(schema) - .createWriterFunc(DataWriter::create) - .build()) { + // Write the expected records into AVRO file, then read them into RowData and assert with the + // expected Record list. + try (FileAppender writer = + Avro.write(Files.localOutput(recordsFile)) + .schema(schema) + .createWriterFunc(DataWriter::create) + .build()) { writer.addAll(expectedRecords); } - try (CloseableIterable reader = Avro.read(Files.localInput(recordsFile)) - .project(schema) - .createReaderFunc(FlinkAvroReader::new) - .build()) { + try (CloseableIterable reader = + Avro.read(Files.localInput(recordsFile)) + .project(schema) + .createReaderFunc(FlinkAvroReader::new) + .build()) { Iterator expected = expectedRecords.iterator(); Iterator rows = reader.iterator(); for (int i = 0; i < NUM_RECORDS; i++) { @@ -77,18 +79,21 @@ protected void writeAndValidate(Schema schema) throws IOException { File rowDataFile = temp.newFile(); Assert.assertTrue("Delete should succeed", rowDataFile.delete()); - // Write the expected RowData into AVRO file, then read them into Record and assert with the expected RowData list. - try (FileAppender writer = Avro.write(Files.localOutput(rowDataFile)) - .schema(schema) - .createWriterFunc(ignore -> new FlinkAvroWriter(flinkSchema)) - .build()) { + // Write the expected RowData into AVRO file, then read them into Record and assert with the + // expected RowData list. 
+ try (FileAppender writer = + Avro.write(Files.localOutput(rowDataFile)) + .schema(schema) + .createWriterFunc(ignore -> new FlinkAvroWriter(flinkSchema)) + .build()) { writer.addAll(expectedRows); } - try (CloseableIterable reader = Avro.read(Files.localInput(rowDataFile)) - .project(schema) - .createReaderFunc(DataReader::create) - .build()) { + try (CloseableIterable reader = + Avro.read(Files.localInput(rowDataFile)) + .project(schema) + .createReaderFunc(DataReader::create) + .build()) { Iterator expected = expectedRows.iterator(); Iterator records = reader.iterator(); for (int i = 0; i < NUM_RECORDS; i += 1) { diff --git a/flink/v1.13/flink/src/test/java/org/apache/iceberg/flink/data/TestFlinkOrcReaderWriter.java b/flink/v1.13/flink/src/test/java/org/apache/iceberg/flink/data/TestFlinkOrcReaderWriter.java index 5f4a7c00d1c8..fdffc0e01c20 100644 --- a/flink/v1.13/flink/src/test/java/org/apache/iceberg/flink/data/TestFlinkOrcReaderWriter.java +++ b/flink/v1.13/flink/src/test/java/org/apache/iceberg/flink/data/TestFlinkOrcReaderWriter.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.flink.data; import java.io.File; @@ -52,18 +51,21 @@ protected void writeAndValidate(Schema schema) throws IOException { File recordsFile = temp.newFile(); Assert.assertTrue("Delete should succeed", recordsFile.delete()); - // Write the expected records into ORC file, then read them into RowData and assert with the expected Record list. - try (FileAppender writer = ORC.write(Files.localOutput(recordsFile)) - .schema(schema) - .createWriterFunc(GenericOrcWriter::buildWriter) - .build()) { + // Write the expected records into ORC file, then read them into RowData and assert with the + // expected Record list. + try (FileAppender writer = + ORC.write(Files.localOutput(recordsFile)) + .schema(schema) + .createWriterFunc(GenericOrcWriter::buildWriter) + .build()) { writer.addAll(expectedRecords); } - try (CloseableIterable reader = ORC.read(Files.localInput(recordsFile)) - .project(schema) - .createReaderFunc(type -> new FlinkOrcReader(schema, type)) - .build()) { + try (CloseableIterable reader = + ORC.read(Files.localInput(recordsFile)) + .project(schema) + .createReaderFunc(type -> new FlinkOrcReader(schema, type)) + .build()) { Iterator expected = expectedRecords.iterator(); Iterator rows = reader.iterator(); for (int i = 0; i < NUM_RECORDS; i++) { @@ -76,19 +78,22 @@ protected void writeAndValidate(Schema schema) throws IOException { File rowDataFile = temp.newFile(); Assert.assertTrue("Delete should succeed", rowDataFile.delete()); - // Write the expected RowData into ORC file, then read them into Record and assert with the expected RowData list. + // Write the expected RowData into ORC file, then read them into Record and assert with the + // expected RowData list. 
RowType rowType = FlinkSchemaUtil.convert(schema); - try (FileAppender writer = ORC.write(Files.localOutput(rowDataFile)) - .schema(schema) - .createWriterFunc((iSchema, typeDesc) -> FlinkOrcWriter.buildWriter(rowType, iSchema)) - .build()) { + try (FileAppender writer = + ORC.write(Files.localOutput(rowDataFile)) + .schema(schema) + .createWriterFunc((iSchema, typeDesc) -> FlinkOrcWriter.buildWriter(rowType, iSchema)) + .build()) { writer.addAll(expectedRows); } - try (CloseableIterable reader = ORC.read(Files.localInput(rowDataFile)) - .project(schema) - .createReaderFunc(type -> GenericOrcReader.buildReader(schema, type)) - .build()) { + try (CloseableIterable reader = + ORC.read(Files.localInput(rowDataFile)) + .project(schema) + .createReaderFunc(type -> GenericOrcReader.buildReader(schema, type)) + .build()) { Iterator expected = expectedRows.iterator(); Iterator records = reader.iterator(); for (int i = 0; i < NUM_RECORDS; i += 1) { diff --git a/flink/v1.13/flink/src/test/java/org/apache/iceberg/flink/data/TestFlinkParquetReader.java b/flink/v1.13/flink/src/test/java/org/apache/iceberg/flink/data/TestFlinkParquetReader.java index 9af37c85d72a..51060e14e1ae 100644 --- a/flink/v1.13/flink/src/test/java/org/apache/iceberg/flink/data/TestFlinkParquetReader.java +++ b/flink/v1.13/flink/src/test/java/org/apache/iceberg/flink/data/TestFlinkParquetReader.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.flink.data; import java.io.File; @@ -44,17 +43,19 @@ private void writeAndValidate(Iterable iterable, Schema schema) throws I File testFile = temp.newFile(); Assert.assertTrue("Delete should succeed", testFile.delete()); - try (FileAppender writer = Parquet.write(Files.localOutput(testFile)) - .schema(schema) - .createWriterFunc(GenericParquetWriter::buildWriter) - .build()) { + try (FileAppender writer = + Parquet.write(Files.localOutput(testFile)) + .schema(schema) + .createWriterFunc(GenericParquetWriter::buildWriter) + .build()) { writer.addAll(iterable); } - try (CloseableIterable reader = Parquet.read(Files.localInput(testFile)) - .project(schema) - .createReaderFunc(type -> FlinkParquetReaders.buildReader(schema, type)) - .build()) { + try (CloseableIterable reader = + Parquet.read(Files.localInput(testFile)) + .project(schema) + .createReaderFunc(type -> FlinkParquetReaders.buildReader(schema, type)) + .build()) { Iterator expected = iterable.iterator(); Iterator rows = reader.iterator(); LogicalType rowType = FlinkSchemaUtil.convert(schema); @@ -69,7 +70,10 @@ private void writeAndValidate(Iterable iterable, Schema schema) throws I @Override protected void writeAndValidate(Schema schema) throws IOException { writeAndValidate(RandomGenericData.generate(schema, NUM_RECORDS, 19981), schema); - writeAndValidate(RandomGenericData.generateDictionaryEncodableRecords(schema, NUM_RECORDS, 21124), schema); - writeAndValidate(RandomGenericData.generateFallbackRecords(schema, NUM_RECORDS, 21124, NUM_RECORDS / 20), schema); + writeAndValidate( + RandomGenericData.generateDictionaryEncodableRecords(schema, NUM_RECORDS, 21124), schema); + writeAndValidate( + RandomGenericData.generateFallbackRecords(schema, NUM_RECORDS, 21124, NUM_RECORDS / 20), + schema); } } diff --git a/flink/v1.13/flink/src/test/java/org/apache/iceberg/flink/data/TestFlinkParquetWriter.java b/flink/v1.13/flink/src/test/java/org/apache/iceberg/flink/data/TestFlinkParquetWriter.java index 1db0f8767518..7b868eafc311 100644 --- 
a/flink/v1.13/flink/src/test/java/org/apache/iceberg/flink/data/TestFlinkParquetWriter.java +++ b/flink/v1.13/flink/src/test/java/org/apache/iceberg/flink/data/TestFlinkParquetWriter.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.flink.data; import java.io.File; @@ -42,8 +41,7 @@ public class TestFlinkParquetWriter extends DataTest { private static final int NUM_RECORDS = 100; - @Rule - public TemporaryFolder temp = new TemporaryFolder(); + @Rule public TemporaryFolder temp = new TemporaryFolder(); private void writeAndValidate(Iterable iterable, Schema schema) throws IOException { File testFile = temp.newFile(); @@ -51,17 +49,19 @@ private void writeAndValidate(Iterable iterable, Schema schema) throws LogicalType logicalType = FlinkSchemaUtil.convert(schema); - try (FileAppender writer = Parquet.write(Files.localOutput(testFile)) - .schema(schema) - .createWriterFunc(msgType -> FlinkParquetWriters.buildWriter(logicalType, msgType)) - .build()) { + try (FileAppender writer = + Parquet.write(Files.localOutput(testFile)) + .schema(schema) + .createWriterFunc(msgType -> FlinkParquetWriters.buildWriter(logicalType, msgType)) + .build()) { writer.addAll(iterable); } - try (CloseableIterable reader = Parquet.read(Files.localInput(testFile)) - .project(schema) - .createReaderFunc(msgType -> GenericParquetReaders.buildReader(schema, msgType)) - .build()) { + try (CloseableIterable reader = + Parquet.read(Files.localInput(testFile)) + .project(schema) + .createReaderFunc(msgType -> GenericParquetReaders.buildReader(schema, msgType)) + .build()) { Iterator expected = iterable.iterator(); Iterator actual = reader.iterator(); LogicalType rowType = FlinkSchemaUtil.convert(schema); @@ -75,15 +75,19 @@ private void writeAndValidate(Iterable iterable, Schema schema) throws @Override protected void writeAndValidate(Schema schema) throws IOException { - writeAndValidate( - RandomRowData.generate(schema, NUM_RECORDS, 19981), schema); + writeAndValidate(RandomRowData.generate(schema, NUM_RECORDS, 19981), schema); - writeAndValidate(RandomRowData.convert(schema, - RandomGenericData.generateDictionaryEncodableRecords(schema, NUM_RECORDS, 21124)), + writeAndValidate( + RandomRowData.convert( + schema, + RandomGenericData.generateDictionaryEncodableRecords(schema, NUM_RECORDS, 21124)), schema); - writeAndValidate(RandomRowData.convert(schema, - RandomGenericData.generateFallbackRecords(schema, NUM_RECORDS, 21124, NUM_RECORDS / 20)), + writeAndValidate( + RandomRowData.convert( + schema, + RandomGenericData.generateFallbackRecords( + schema, NUM_RECORDS, 21124, NUM_RECORDS / 20)), schema); } } diff --git a/flink/v1.13/flink/src/test/java/org/apache/iceberg/flink/data/TestRowDataProjection.java b/flink/v1.13/flink/src/test/java/org/apache/iceberg/flink/data/TestRowDataProjection.java index 37016adfbdf2..4cb77b11fd7b 100644 --- a/flink/v1.13/flink/src/test/java/org/apache/iceberg/flink/data/TestRowDataProjection.java +++ b/flink/v1.13/flink/src/test/java/org/apache/iceberg/flink/data/TestRowDataProjection.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
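The Avro, ORC, and Parquet reader/writer tests above all follow one write-then-read shape; below is a condensed sketch of the Avro variant, assuming the schema, the records, and a scratch file are supplied by the caller.

import java.io.File;
import java.io.IOException;
import org.apache.iceberg.Files;
import org.apache.iceberg.Schema;
import org.apache.iceberg.avro.Avro;
import org.apache.iceberg.data.Record;
import org.apache.iceberg.data.avro.DataReader;
import org.apache.iceberg.data.avro.DataWriter;
import org.apache.iceberg.io.CloseableIterable;
import org.apache.iceberg.io.FileAppender;

class AvroRoundTripSketch {
  static long roundTrip(File file, Schema schema, Iterable<Record> records) throws IOException {
    // Write generic records with the Iceberg Avro writer.
    try (FileAppender<Record> writer =
        Avro.write(Files.localOutput(file))
            .schema(schema)
            .createWriterFunc(DataWriter::create)
            .build()) {
      writer.addAll(records);
    }
    // Read them back through the generic reader and count the rows.
    long count = 0;
    try (CloseableIterable<Record> reader =
        Avro.read(Files.localInput(file))
            .project(schema)
            .createReaderFunc(DataReader::create)
            .build()) {
      for (Record ignored : reader) {
        count++;
      }
    }
    return count;
  }
}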
*/ - package org.apache.iceberg.flink.data; import java.util.Iterator; @@ -36,98 +35,96 @@ public class TestRowDataProjection { @Test public void testFullProjection() { - Schema schema = new Schema( - Types.NestedField.required(0, "id", Types.LongType.get()), - Types.NestedField.optional(1, "data", Types.StringType.get()) - ); + Schema schema = + new Schema( + Types.NestedField.required(0, "id", Types.LongType.get()), + Types.NestedField.optional(1, "data", Types.StringType.get())); generateAndValidate(schema, schema); } @Test public void testReorderedFullProjection() { - Schema schema = new Schema( - Types.NestedField.required(0, "id", Types.LongType.get()), - Types.NestedField.optional(1, "data", Types.StringType.get()) - ); + Schema schema = + new Schema( + Types.NestedField.required(0, "id", Types.LongType.get()), + Types.NestedField.optional(1, "data", Types.StringType.get())); - Schema reordered = new Schema( - Types.NestedField.optional(1, "data", Types.StringType.get()), - Types.NestedField.required(0, "id", Types.LongType.get()) - ); + Schema reordered = + new Schema( + Types.NestedField.optional(1, "data", Types.StringType.get()), + Types.NestedField.required(0, "id", Types.LongType.get())); generateAndValidate(schema, reordered); } @Test public void testBasicProjection() { - Schema schema = new Schema( - Types.NestedField.required(0, "id", Types.LongType.get()), - Types.NestedField.optional(1, "data", Types.StringType.get()) - ); - Schema id = new Schema( - Types.NestedField.required(0, "id", Types.LongType.get()) - ); - Schema data = new Schema( - Types.NestedField.optional(1, "data", Types.StringType.get()) - ); + Schema schema = + new Schema( + Types.NestedField.required(0, "id", Types.LongType.get()), + Types.NestedField.optional(1, "data", Types.StringType.get())); + Schema id = new Schema(Types.NestedField.required(0, "id", Types.LongType.get())); + Schema data = new Schema(Types.NestedField.optional(1, "data", Types.StringType.get())); generateAndValidate(schema, id); generateAndValidate(schema, data); } @Test public void testEmptyProjection() { - Schema schema = new Schema( - Types.NestedField.required(0, "id", Types.LongType.get()), - Types.NestedField.optional(1, "data", Types.StringType.get()) - ); + Schema schema = + new Schema( + Types.NestedField.required(0, "id", Types.LongType.get()), + Types.NestedField.optional(1, "data", Types.StringType.get())); generateAndValidate(schema, schema.select()); } @Test public void testRename() { - Schema schema = new Schema( - Types.NestedField.required(0, "id", Types.LongType.get()), - Types.NestedField.optional(1, "data", Types.StringType.get()) - ); - - Schema renamed = new Schema( - Types.NestedField.required(0, "id", Types.LongType.get()), - Types.NestedField.optional(1, "renamed", Types.StringType.get()) - ); + Schema schema = + new Schema( + Types.NestedField.required(0, "id", Types.LongType.get()), + Types.NestedField.optional(1, "data", Types.StringType.get())); + + Schema renamed = + new Schema( + Types.NestedField.required(0, "id", Types.LongType.get()), + Types.NestedField.optional(1, "renamed", Types.StringType.get())); generateAndValidate(schema, renamed); } @Test public void testNestedProjection() { - Schema schema = new Schema( - Types.NestedField.required(0, "id", Types.LongType.get()), - Types.NestedField.optional(3, "location", Types.StructType.of( - Types.NestedField.required(1, "lat", Types.FloatType.get()), - Types.NestedField.required(2, "long", Types.FloatType.get()) - )) - ); + Schema schema = + new 
Schema( + Types.NestedField.required(0, "id", Types.LongType.get()), + Types.NestedField.optional( + 3, + "location", + Types.StructType.of( + Types.NestedField.required(1, "lat", Types.FloatType.get()), + Types.NestedField.required(2, "long", Types.FloatType.get())))); // Project id only. - Schema idOnly = new Schema( - Types.NestedField.required(0, "id", Types.LongType.get()) - ); + Schema idOnly = new Schema(Types.NestedField.required(0, "id", Types.LongType.get())); generateAndValidate(schema, idOnly); // Project lat only. - Schema latOnly = new Schema( - Types.NestedField.optional(3, "location", Types.StructType.of( - Types.NestedField.required(1, "lat", Types.FloatType.get()) - )) - ); + Schema latOnly = + new Schema( + Types.NestedField.optional( + 3, + "location", + Types.StructType.of(Types.NestedField.required(1, "lat", Types.FloatType.get())))); generateAndValidate(schema, latOnly); // Project long only. - Schema longOnly = new Schema( - Types.NestedField.optional(3, "location", Types.StructType.of( - Types.NestedField.required(2, "long", Types.FloatType.get()) - )) - ); + Schema longOnly = + new Schema( + Types.NestedField.optional( + 3, + "location", + Types.StructType.of(Types.NestedField.required(2, "long", Types.FloatType.get())))); generateAndValidate(schema, longOnly); // Project location. @@ -137,37 +134,40 @@ public void testNestedProjection() { @Test public void testPrimitiveTypeProjection() { - Schema schema = new Schema( - Types.NestedField.required(0, "id", Types.LongType.get()), - Types.NestedField.optional(1, "data", Types.StringType.get()), - Types.NestedField.required(2, "b", Types.BooleanType.get()), - Types.NestedField.optional(3, "i", Types.IntegerType.get()), - Types.NestedField.required(4, "l", Types.LongType.get()), - Types.NestedField.optional(5, "f", Types.FloatType.get()), - Types.NestedField.required(6, "d", Types.DoubleType.get()), - Types.NestedField.optional(7, "date", Types.DateType.get()), - Types.NestedField.optional(8, "time", Types.TimeType.get()), - Types.NestedField.required(9, "ts", Types.TimestampType.withoutZone()), - Types.NestedField.required(10, "ts_tz", Types.TimestampType.withZone()), - Types.NestedField.required(11, "s", Types.StringType.get()), - Types.NestedField.required(12, "fixed", Types.FixedType.ofLength(7)), - Types.NestedField.optional(13, "bytes", Types.BinaryType.get()), - Types.NestedField.required(14, "dec_9_0", Types.DecimalType.of(9, 0)), - Types.NestedField.required(15, "dec_11_2", Types.DecimalType.of(11, 2)), - Types.NestedField.required(16, "dec_38_10", Types.DecimalType.of(38, 10))// maximum precision - ); + Schema schema = + new Schema( + Types.NestedField.required(0, "id", Types.LongType.get()), + Types.NestedField.optional(1, "data", Types.StringType.get()), + Types.NestedField.required(2, "b", Types.BooleanType.get()), + Types.NestedField.optional(3, "i", Types.IntegerType.get()), + Types.NestedField.required(4, "l", Types.LongType.get()), + Types.NestedField.optional(5, "f", Types.FloatType.get()), + Types.NestedField.required(6, "d", Types.DoubleType.get()), + Types.NestedField.optional(7, "date", Types.DateType.get()), + Types.NestedField.optional(8, "time", Types.TimeType.get()), + Types.NestedField.required(9, "ts", Types.TimestampType.withoutZone()), + Types.NestedField.required(10, "ts_tz", Types.TimestampType.withZone()), + Types.NestedField.required(11, "s", Types.StringType.get()), + Types.NestedField.required(12, "fixed", Types.FixedType.ofLength(7)), + Types.NestedField.optional(13, "bytes", 
Types.BinaryType.get()), + Types.NestedField.required(14, "dec_9_0", Types.DecimalType.of(9, 0)), + Types.NestedField.required(15, "dec_11_2", Types.DecimalType.of(11, 2)), + Types.NestedField.required( + 16, "dec_38_10", Types.DecimalType.of(38, 10)) // maximum precision + ); generateAndValidate(schema, schema); } @Test public void testPrimitiveMapTypeProjection() { - Schema schema = new Schema( - Types.NestedField.required(0, "id", Types.LongType.get()), - Types.NestedField.optional(3, "map", Types.MapType.ofOptional( - 1, 2, Types.IntegerType.get(), Types.StringType.get() - )) - ); + Schema schema = + new Schema( + Types.NestedField.required(0, "id", Types.LongType.get()), + Types.NestedField.optional( + 3, + "map", + Types.MapType.ofOptional(1, 2, Types.IntegerType.get(), Types.StringType.get()))); // Project id only. Schema idOnly = schema.select("id"); @@ -183,20 +183,21 @@ public void testPrimitiveMapTypeProjection() { @Test public void testNestedMapTypeProjection() { - Schema schema = new Schema( - Types.NestedField.required(0, "id", Types.LongType.get()), - Types.NestedField.optional(7, "map", Types.MapType.ofOptional( - 5, 6, - Types.StructType.of( - Types.NestedField.required(1, "key", Types.LongType.get()), - Types.NestedField.required(2, "keyData", Types.LongType.get()) - ), - Types.StructType.of( - Types.NestedField.required(3, "value", Types.LongType.get()), - Types.NestedField.required(4, "valueData", Types.LongType.get()) - ) - )) - ); + Schema schema = + new Schema( + Types.NestedField.required(0, "id", Types.LongType.get()), + Types.NestedField.optional( + 7, + "map", + Types.MapType.ofOptional( + 5, + 6, + Types.StructType.of( + Types.NestedField.required(1, "key", Types.LongType.get()), + Types.NestedField.required(2, "keyData", Types.LongType.get())), + Types.StructType.of( + Types.NestedField.required(3, "value", Types.LongType.get()), + Types.NestedField.required(4, "valueData", Types.LongType.get()))))); // Project id only. Schema idOnly = schema.select("id"); @@ -210,50 +211,52 @@ public void testNestedMapTypeProjection() { generateAndValidate(schema, schema); // Project partial map key. - Schema partialMapKey = new Schema( - Types.NestedField.optional(7, "map", Types.MapType.ofOptional( - 5, 6, - Types.StructType.of( - Types.NestedField.required(1, "key", Types.LongType.get()) - ), - Types.StructType.of( - Types.NestedField.required(3, "value", Types.LongType.get()), - Types.NestedField.required(4, "valueData", Types.LongType.get()) - ) - )) - ); - AssertHelpers.assertThrows("Should not allow to project a partial map key with non-primitive type.", - IllegalArgumentException.class, "Cannot project a partial map key or value", - () -> generateAndValidate(schema, partialMapKey) - ); + Schema partialMapKey = + new Schema( + Types.NestedField.optional( + 7, + "map", + Types.MapType.ofOptional( + 5, + 6, + Types.StructType.of(Types.NestedField.required(1, "key", Types.LongType.get())), + Types.StructType.of( + Types.NestedField.required(3, "value", Types.LongType.get()), + Types.NestedField.required(4, "valueData", Types.LongType.get()))))); + AssertHelpers.assertThrows( + "Should not allow to project a partial map key with non-primitive type.", + IllegalArgumentException.class, + "Cannot project a partial map key or value", + () -> generateAndValidate(schema, partialMapKey)); // Project partial map key. 
- Schema partialMapValue = new Schema( - Types.NestedField.optional(7, "map", Types.MapType.ofOptional( - 5, 6, - Types.StructType.of( - Types.NestedField.required(1, "key", Types.LongType.get()), - Types.NestedField.required(2, "keyData", Types.LongType.get()) - ), - Types.StructType.of( - Types.NestedField.required(3, "value", Types.LongType.get()) - ) - )) - ); - AssertHelpers.assertThrows("Should not allow to project a partial map value with non-primitive type.", - IllegalArgumentException.class, "Cannot project a partial map key or value", - () -> generateAndValidate(schema, partialMapValue) - ); + Schema partialMapValue = + new Schema( + Types.NestedField.optional( + 7, + "map", + Types.MapType.ofOptional( + 5, + 6, + Types.StructType.of( + Types.NestedField.required(1, "key", Types.LongType.get()), + Types.NestedField.required(2, "keyData", Types.LongType.get())), + Types.StructType.of( + Types.NestedField.required(3, "value", Types.LongType.get()))))); + AssertHelpers.assertThrows( + "Should not allow to project a partial map value with non-primitive type.", + IllegalArgumentException.class, + "Cannot project a partial map key or value", + () -> generateAndValidate(schema, partialMapValue)); } @Test public void testPrimitiveListTypeProjection() { - Schema schema = new Schema( - Types.NestedField.required(0, "id", Types.LongType.get()), - Types.NestedField.optional(2, "list", Types.ListType.ofOptional( - 1, Types.StringType.get() - )) - ); + Schema schema = + new Schema( + Types.NestedField.required(0, "id", Types.LongType.get()), + Types.NestedField.optional( + 2, "list", Types.ListType.ofOptional(1, Types.StringType.get()))); // Project id only. Schema idOnly = schema.select("id"); @@ -269,16 +272,18 @@ public void testPrimitiveListTypeProjection() { @Test public void testNestedListTypeProjection() { - Schema schema = new Schema( - Types.NestedField.required(0, "id", Types.LongType.get()), - Types.NestedField.optional(5, "list", Types.ListType.ofOptional( - 4, Types.StructType.of( - Types.NestedField.required(1, "nestedListField1", Types.LongType.get()), - Types.NestedField.required(2, "nestedListField2", Types.LongType.get()), - Types.NestedField.required(3, "nestedListField3", Types.LongType.get()) - ) - )) - ); + Schema schema = + new Schema( + Types.NestedField.required(0, "id", Types.LongType.get()), + Types.NestedField.optional( + 5, + "list", + Types.ListType.ofOptional( + 4, + Types.StructType.of( + Types.NestedField.required(1, "nestedListField1", Types.LongType.get()), + Types.NestedField.required(2, "nestedListField2", Types.LongType.get()), + Types.NestedField.required(3, "nestedListField3", Types.LongType.get()))))); // Project id only. Schema idOnly = schema.select("id"); @@ -292,17 +297,20 @@ public void testNestedListTypeProjection() { generateAndValidate(schema, schema); // Project partial list value. 
- Schema partialList = new Schema( - Types.NestedField.optional(5, "list", Types.ListType.ofOptional( - 4, Types.StructType.of( - Types.NestedField.required(2, "nestedListField2", Types.LongType.get()) - ) - )) - ); - AssertHelpers.assertThrows("Should not allow to project a partial list element with non-primitive type.", - IllegalArgumentException.class, "Cannot project a partial list element", - () -> generateAndValidate(schema, partialList) - ); + Schema partialList = + new Schema( + Types.NestedField.optional( + 5, + "list", + Types.ListType.ofOptional( + 4, + Types.StructType.of( + Types.NestedField.required(2, "nestedListField2", Types.LongType.get()))))); + AssertHelpers.assertThrows( + "Should not allow to project a partial list element with non-primitive type.", + IllegalArgumentException.class, + "Cannot project a partial list element", + () -> generateAndValidate(schema, partialList)); } private void generateAndValidate(Schema schema, Schema projectSchema) { diff --git a/flink/v1.13/flink/src/test/java/org/apache/iceberg/flink/data/TestRowProjection.java b/flink/v1.13/flink/src/test/java/org/apache/iceberg/flink/data/TestRowProjection.java index 9ccb1d56c0ed..df2e6ae21c7e 100644 --- a/flink/v1.13/flink/src/test/java/org/apache/iceberg/flink/data/TestRowProjection.java +++ b/flink/v1.13/flink/src/test/java/org/apache/iceberg/flink/data/TestRowProjection.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.flink.data; import java.io.File; @@ -45,34 +44,36 @@ public class TestRowProjection { - @Rule - public TemporaryFolder temp = new TemporaryFolder(); + @Rule public TemporaryFolder temp = new TemporaryFolder(); - private RowData writeAndRead(String desc, Schema writeSchema, Schema readSchema, RowData row) throws IOException { + private RowData writeAndRead(String desc, Schema writeSchema, Schema readSchema, RowData row) + throws IOException { File file = temp.newFile(desc + ".avro"); Assert.assertTrue(file.delete()); - try (FileAppender appender = Avro.write(Files.localOutput(file)) - .schema(writeSchema) - .createWriterFunc(ignore -> new FlinkAvroWriter(FlinkSchemaUtil.convert(writeSchema))) - .build()) { + try (FileAppender appender = + Avro.write(Files.localOutput(file)) + .schema(writeSchema) + .createWriterFunc(ignore -> new FlinkAvroWriter(FlinkSchemaUtil.convert(writeSchema))) + .build()) { appender.add(row); } - Iterable records = Avro.read(Files.localInput(file)) - .project(readSchema) - .createReaderFunc(FlinkAvroReader::new) - .build(); + Iterable records = + Avro.read(Files.localInput(file)) + .project(readSchema) + .createReaderFunc(FlinkAvroReader::new) + .build(); return Iterables.getOnlyElement(records); } @Test public void testFullProjection() throws Exception { - Schema schema = new Schema( - Types.NestedField.required(0, "id", Types.LongType.get()), - Types.NestedField.optional(1, "data", Types.StringType.get()) - ); + Schema schema = + new Schema( + Types.NestedField.required(0, "id", Types.LongType.get()), + Types.NestedField.optional(1, "data", Types.StringType.get())); RowData row = GenericRowData.of(34L, StringData.fromString("test")); @@ -80,93 +81,96 @@ public void testFullProjection() throws Exception { Assert.assertEquals("Should contain the correct id value", 34L, projected.getLong(0)); - int cmp = Comparators.charSequences() - .compare("test", projected.getString(1).toString()); + int cmp = Comparators.charSequences().compare("test", projected.getString(1).toString()); 
Assert.assertEquals("Should contain the correct data value", cmp, 0); } @Test public void testSpecialCharacterProjection() throws Exception { - Schema schema = new Schema( - Types.NestedField.required(0, "user id", Types.LongType.get()), - Types.NestedField.optional(1, "data%0", Types.StringType.get()) - ); + Schema schema = + new Schema( + Types.NestedField.required(0, "user id", Types.LongType.get()), + Types.NestedField.optional(1, "data%0", Types.StringType.get())); RowData row = GenericRowData.of(34L, StringData.fromString("test")); RowData full = writeAndRead("special_chars", schema, schema, row); Assert.assertEquals("Should contain the correct id value", 34L, full.getLong(0)); - Assert.assertEquals("Should contain the correct data value", + Assert.assertEquals( + "Should contain the correct data value", 0, Comparators.charSequences().compare("test", full.getString(1).toString())); RowData projected = writeAndRead("special_characters", schema, schema.select("data%0"), full); Assert.assertEquals("Should not contain id value", 1, projected.getArity()); - Assert.assertEquals("Should contain the correct data value", + Assert.assertEquals( + "Should contain the correct data value", 0, Comparators.charSequences().compare("test", projected.getString(0).toString())); } @Test public void testReorderedFullProjection() throws Exception { - Schema schema = new Schema( - Types.NestedField.required(0, "id", Types.LongType.get()), - Types.NestedField.optional(1, "data", Types.StringType.get()) - ); + Schema schema = + new Schema( + Types.NestedField.required(0, "id", Types.LongType.get()), + Types.NestedField.optional(1, "data", Types.StringType.get())); RowData row = GenericRowData.of(34L, StringData.fromString("test")); - Schema reordered = new Schema( - Types.NestedField.optional(1, "data", Types.StringType.get()), - Types.NestedField.required(0, "id", Types.LongType.get()) - ); + Schema reordered = + new Schema( + Types.NestedField.optional(1, "data", Types.StringType.get()), + Types.NestedField.required(0, "id", Types.LongType.get())); RowData projected = writeAndRead("full_projection", schema, reordered, row); - Assert.assertEquals("Should contain the correct 0 value", "test", projected.getString(0).toString()); + Assert.assertEquals( + "Should contain the correct 0 value", "test", projected.getString(0).toString()); Assert.assertEquals("Should contain the correct 1 value", 34L, projected.getLong(1)); } @Test public void testReorderedProjection() throws Exception { - Schema schema = new Schema( - Types.NestedField.required(0, "id", Types.LongType.get()), - Types.NestedField.optional(1, "data", Types.StringType.get()) - ); + Schema schema = + new Schema( + Types.NestedField.required(0, "id", Types.LongType.get()), + Types.NestedField.optional(1, "data", Types.StringType.get())); RowData row = GenericRowData.of(34L, StringData.fromString("test")); - Schema reordered = new Schema( - Types.NestedField.optional(2, "missing_1", Types.StringType.get()), - Types.NestedField.optional(1, "data", Types.StringType.get()), - Types.NestedField.optional(3, "missing_2", Types.LongType.get()) - ); + Schema reordered = + new Schema( + Types.NestedField.optional(2, "missing_1", Types.StringType.get()), + Types.NestedField.optional(1, "data", Types.StringType.get()), + Types.NestedField.optional(3, "missing_2", Types.LongType.get())); RowData projected = writeAndRead("full_projection", schema, reordered, row); Assert.assertTrue("Should contain the correct 0 value", projected.isNullAt(0)); - 
Assert.assertEquals("Should contain the correct 1 value", "test", projected.getString(1).toString()); + Assert.assertEquals( + "Should contain the correct 1 value", "test", projected.getString(1).toString()); Assert.assertTrue("Should contain the correct 2 value", projected.isNullAt(2)); } @Test public void testRenamedAddedField() throws Exception { - Schema schema = new Schema( - Types.NestedField.required(1, "a", Types.LongType.get()), - Types.NestedField.required(2, "b", Types.LongType.get()), - Types.NestedField.required(3, "d", Types.LongType.get()) - ); + Schema schema = + new Schema( + Types.NestedField.required(1, "a", Types.LongType.get()), + Types.NestedField.required(2, "b", Types.LongType.get()), + Types.NestedField.required(3, "d", Types.LongType.get())); RowData row = GenericRowData.of(100L, 200L, 300L); - Schema renamedAdded = new Schema( - Types.NestedField.optional(1, "a", Types.LongType.get()), - Types.NestedField.optional(2, "b", Types.LongType.get()), - Types.NestedField.optional(3, "c", Types.LongType.get()), - Types.NestedField.optional(4, "d", Types.LongType.get()) - ); + Schema renamedAdded = + new Schema( + Types.NestedField.optional(1, "a", Types.LongType.get()), + Types.NestedField.optional(2, "b", Types.LongType.get()), + Types.NestedField.optional(3, "c", Types.LongType.get()), + Types.NestedField.optional(4, "d", Types.LongType.get())); RowData projected = writeAndRead("rename_and_add_column_projection", schema, renamedAdded, row); Assert.assertEquals("Should contain the correct value in column 1", projected.getLong(0), 100L); @@ -177,10 +181,10 @@ public void testRenamedAddedField() throws Exception { @Test public void testEmptyProjection() throws Exception { - Schema schema = new Schema( - Types.NestedField.required(0, "id", Types.LongType.get()), - Types.NestedField.optional(1, "data", Types.StringType.get()) - ); + Schema schema = + new Schema( + Types.NestedField.required(0, "id", Types.LongType.get()), + Types.NestedField.optional(1, "data", Types.StringType.get())); RowData row = GenericRowData.of(34L, StringData.fromString("test")); @@ -192,24 +196,20 @@ public void testEmptyProjection() throws Exception { @Test public void testBasicProjection() throws Exception { - Schema writeSchema = new Schema( - Types.NestedField.required(0, "id", Types.LongType.get()), - Types.NestedField.optional(1, "data", Types.StringType.get()) - ); + Schema writeSchema = + new Schema( + Types.NestedField.required(0, "id", Types.LongType.get()), + Types.NestedField.optional(1, "data", Types.StringType.get())); RowData row = GenericRowData.of(34L, StringData.fromString("test")); - Schema idOnly = new Schema( - Types.NestedField.required(0, "id", Types.LongType.get()) - ); + Schema idOnly = new Schema(Types.NestedField.required(0, "id", Types.LongType.get())); RowData projected = writeAndRead("basic_projection_id", writeSchema, idOnly, row); Assert.assertEquals("Should not project data", 1, projected.getArity()); Assert.assertEquals("Should contain the correct id value", 34L, projected.getLong(0)); - Schema dataOnly = new Schema( - Types.NestedField.optional(1, "data", Types.StringType.get()) - ); + Schema dataOnly = new Schema(Types.NestedField.optional(1, "data", Types.StringType.get())); projected = writeAndRead("basic_projection_data", writeSchema, dataOnly, row); @@ -220,17 +220,17 @@ public void testBasicProjection() throws Exception { @Test public void testRename() throws Exception { - Schema writeSchema = new Schema( - Types.NestedField.required(0, "id", 
Types.LongType.get()), - Types.NestedField.optional(1, "data", Types.StringType.get()) - ); + Schema writeSchema = + new Schema( + Types.NestedField.required(0, "id", Types.LongType.get()), + Types.NestedField.optional(1, "data", Types.StringType.get())); RowData row = GenericRowData.of(34L, StringData.fromString("test")); - Schema readSchema = new Schema( - Types.NestedField.required(0, "id", Types.LongType.get()), - Types.NestedField.optional(1, "renamed", Types.StringType.get()) - ); + Schema readSchema = + new Schema( + Types.NestedField.required(0, "id", Types.LongType.get()), + Types.NestedField.optional(1, "renamed", Types.StringType.get())); RowData projected = writeAndRead("project_and_rename", writeSchema, readSchema, row); @@ -241,83 +241,87 @@ public void testRename() throws Exception { @Test public void testNestedStructProjection() throws Exception { - Schema writeSchema = new Schema( - Types.NestedField.required(0, "id", Types.LongType.get()), - Types.NestedField.optional(3, "location", Types.StructType.of( - Types.NestedField.required(1, "lat", Types.FloatType.get()), - Types.NestedField.required(2, "long", Types.FloatType.get()) - )) - ); + Schema writeSchema = + new Schema( + Types.NestedField.required(0, "id", Types.LongType.get()), + Types.NestedField.optional( + 3, + "location", + Types.StructType.of( + Types.NestedField.required(1, "lat", Types.FloatType.get()), + Types.NestedField.required(2, "long", Types.FloatType.get())))); RowData location = GenericRowData.of(52.995143f, -1.539054f); RowData record = GenericRowData.of(34L, location); - Schema idOnly = new Schema( - Types.NestedField.required(0, "id", Types.LongType.get()) - ); + Schema idOnly = new Schema(Types.NestedField.required(0, "id", Types.LongType.get())); RowData projected = writeAndRead("id_only", writeSchema, idOnly, record); Assert.assertEquals("Should not project location", 1, projected.getArity()); Assert.assertEquals("Should contain the correct id value", 34L, projected.getLong(0)); - Schema latOnly = new Schema( - Types.NestedField.optional(3, "location", Types.StructType.of( - Types.NestedField.required(1, "lat", Types.FloatType.get()) - )) - ); + Schema latOnly = + new Schema( + Types.NestedField.optional( + 3, + "location", + Types.StructType.of(Types.NestedField.required(1, "lat", Types.FloatType.get())))); projected = writeAndRead("latitude_only", writeSchema, latOnly, record); RowData projectedLocation = projected.getRow(0, 1); Assert.assertEquals("Should not project id", 1, projected.getArity()); Assert.assertFalse("Should project location", projected.isNullAt(0)); Assert.assertEquals("Should not project longitude", 1, projectedLocation.getArity()); - Assert.assertEquals("Should project latitude", - 52.995143f, projectedLocation.getFloat(0), 0.000001f); + Assert.assertEquals( + "Should project latitude", 52.995143f, projectedLocation.getFloat(0), 0.000001f); - Schema longOnly = new Schema( - Types.NestedField.optional(3, "location", Types.StructType.of( - Types.NestedField.required(2, "long", Types.FloatType.get()) - )) - ); + Schema longOnly = + new Schema( + Types.NestedField.optional( + 3, + "location", + Types.StructType.of(Types.NestedField.required(2, "long", Types.FloatType.get())))); projected = writeAndRead("longitude_only", writeSchema, longOnly, record); projectedLocation = projected.getRow(0, 1); Assert.assertEquals("Should not project id", 1, projected.getArity()); Assert.assertFalse("Should project location", projected.isNullAt(0)); Assert.assertEquals("Should not project 
latitutde", 1, projectedLocation.getArity()); - Assert.assertEquals("Should project longitude", - -1.539054f, projectedLocation.getFloat(0), 0.000001f); + Assert.assertEquals( + "Should project longitude", -1.539054f, projectedLocation.getFloat(0), 0.000001f); Schema locationOnly = writeSchema.select("location"); projected = writeAndRead("location_only", writeSchema, locationOnly, record); projectedLocation = projected.getRow(0, 1); Assert.assertEquals("Should not project id", 1, projected.getArity()); Assert.assertFalse("Should project location", projected.isNullAt(0)); - Assert.assertEquals("Should project latitude", - 52.995143f, projectedLocation.getFloat(0), 0.000001f); - Assert.assertEquals("Should project longitude", - -1.539054f, projectedLocation.getFloat(1), 0.000001f); + Assert.assertEquals( + "Should project latitude", 52.995143f, projectedLocation.getFloat(0), 0.000001f); + Assert.assertEquals( + "Should project longitude", -1.539054f, projectedLocation.getFloat(1), 0.000001f); } @Test public void testMapProjection() throws IOException { - Schema writeSchema = new Schema( - Types.NestedField.required(0, "id", Types.LongType.get()), - Types.NestedField.optional(5, "properties", - Types.MapType.ofOptional(6, 7, Types.StringType.get(), Types.StringType.get())) - ); - - GenericMapData properties = new GenericMapData(ImmutableMap.of( - StringData.fromString("a"), - StringData.fromString("A"), - StringData.fromString("b"), - StringData.fromString("B"))); + Schema writeSchema = + new Schema( + Types.NestedField.required(0, "id", Types.LongType.get()), + Types.NestedField.optional( + 5, + "properties", + Types.MapType.ofOptional(6, 7, Types.StringType.get(), Types.StringType.get()))); + + GenericMapData properties = + new GenericMapData( + ImmutableMap.of( + StringData.fromString("a"), + StringData.fromString("A"), + StringData.fromString("b"), + StringData.fromString("B"))); RowData row = GenericRowData.of(34L, properties); - Schema idOnly = new Schema( - Types.NestedField.required(0, "id", Types.LongType.get()) - ); + Schema idOnly = new Schema(Types.NestedField.required(0, "id", Types.LongType.get())); RowData projected = writeAndRead("id_only", writeSchema, idOnly, row); Assert.assertEquals("Should contain the correct id value", 34L, projected.getLong(0)); @@ -353,26 +357,28 @@ public void testMapProjection() throws IOException { @Test public void testMapOfStructsProjection() throws IOException { - Schema writeSchema = new Schema( - Types.NestedField.required(0, "id", Types.LongType.get()), - Types.NestedField.optional(5, "locations", Types.MapType.ofOptional(6, 7, - Types.StringType.get(), - Types.StructType.of( - Types.NestedField.required(1, "lat", Types.FloatType.get()), - Types.NestedField.required(2, "long", Types.FloatType.get()) - ) - )) - ); + Schema writeSchema = + new Schema( + Types.NestedField.required(0, "id", Types.LongType.get()), + Types.NestedField.optional( + 5, + "locations", + Types.MapType.ofOptional( + 6, + 7, + Types.StringType.get(), + Types.StructType.of( + Types.NestedField.required(1, "lat", Types.FloatType.get()), + Types.NestedField.required(2, "long", Types.FloatType.get()))))); RowData l1 = GenericRowData.of(53.992811f, -1.542616f); RowData l2 = GenericRowData.of(52.995143f, -1.539054f); - GenericMapData map = new GenericMapData(ImmutableMap.of( - StringData.fromString("L1"), l1, StringData.fromString("L2"), l2)); + GenericMapData map = + new GenericMapData( + ImmutableMap.of(StringData.fromString("L1"), l1, StringData.fromString("L2"), l2)); 
RowData row = GenericRowData.of(34L, map); - Schema idOnly = new Schema( - Types.NestedField.required(0, "id", Types.LongType.get()) - ); + Schema idOnly = new Schema(Types.NestedField.required(0, "id", Types.LongType.get())); RowData projected = writeAndRead("id_only", writeSchema, idOnly, row); Assert.assertEquals("Should contain the correct id value", 34L, projected.getLong(0)); @@ -386,21 +392,19 @@ public void testMapOfStructsProjection() throws IOException { GenericMapData locations = (GenericMapData) projected.getMap(0); Assert.assertNotNull("Should project locations map", locations); GenericArrayData l1l2Array = - new GenericArrayData(new Object[] {StringData.fromString("L2"), StringData.fromString("L1")}); + new GenericArrayData( + new Object[] {StringData.fromString("L2"), StringData.fromString("L1")}); Assert.assertEquals("Should contain L1 and L2", l1l2Array, locations.keyArray()); RowData projectedL1 = (RowData) locations.get(StringData.fromString("L1")); Assert.assertNotNull("L1 should not be null", projectedL1); - Assert.assertEquals("L1 should contain lat", - 53.992811f, projectedL1.getFloat(0), 0.000001); + Assert.assertEquals("L1 should contain lat", 53.992811f, projectedL1.getFloat(0), 0.000001); Assert.assertEquals("L1 should not contain long", 1, projectedL1.getArity()); RowData projectedL2 = (RowData) locations.get(StringData.fromString("L2")); Assert.assertNotNull("L2 should not be null", projectedL2); - Assert.assertEquals("L2 should contain lat", - 52.995143f, projectedL2.getFloat(0), 0.000001); + Assert.assertEquals("L2 should contain lat", 52.995143f, projectedL2.getFloat(0), 0.000001); Assert.assertEquals("L2 should not contain long", 1, projectedL2.getArity()); - projected = writeAndRead("long_only", - writeSchema, writeSchema.select("locations.long"), row); + projected = writeAndRead("long_only", writeSchema, writeSchema.select("locations.long"), row); Assert.assertEquals("Should not project id", 1, projected.getArity()); locations = (GenericMapData) projected.getMap(0); Assert.assertNotNull("Should project locations map", locations); @@ -408,22 +412,23 @@ public void testMapOfStructsProjection() throws IOException { projectedL1 = (RowData) locations.get(StringData.fromString("L1")); Assert.assertNotNull("L1 should not be null", projectedL1); Assert.assertEquals("L1 should not contain lat", 1, projectedL1.getArity()); - Assert.assertEquals("L1 should contain long", - -1.542616f, projectedL1.getFloat(0), 0.000001); + Assert.assertEquals("L1 should contain long", -1.542616f, projectedL1.getFloat(0), 0.000001); projectedL2 = (RowData) locations.get(StringData.fromString("L2")); Assert.assertNotNull("L2 should not be null", projectedL2); Assert.assertEquals("L2 should not contain lat", 1, projectedL2.getArity()); - Assert.assertEquals("L2 should contain long", - -1.539054f, projectedL2.getFloat(0), 0.000001); - - Schema latitiudeRenamed = new Schema( - Types.NestedField.optional(5, "locations", Types.MapType.ofOptional(6, 7, - Types.StringType.get(), - Types.StructType.of( - Types.NestedField.required(1, "latitude", Types.FloatType.get()) - ) - )) - ); + Assert.assertEquals("L2 should contain long", -1.539054f, projectedL2.getFloat(0), 0.000001); + + Schema latitiudeRenamed = + new Schema( + Types.NestedField.optional( + 5, + "locations", + Types.MapType.ofOptional( + 6, + 7, + Types.StringType.get(), + Types.StructType.of( + Types.NestedField.required(1, "latitude", Types.FloatType.get()))))); projected = writeAndRead("latitude_renamed", writeSchema, 
latitiudeRenamed, row); Assert.assertEquals("Should not project id", 1, projected.getArity()); @@ -432,29 +437,27 @@ public void testMapOfStructsProjection() throws IOException { Assert.assertEquals("Should contain L1 and L2", l1l2Array, locations.keyArray()); projectedL1 = (RowData) locations.get(StringData.fromString("L1")); Assert.assertNotNull("L1 should not be null", projectedL1); - Assert.assertEquals("L1 should contain latitude", - 53.992811f, projectedL1.getFloat(0), 0.000001); + Assert.assertEquals( + "L1 should contain latitude", 53.992811f, projectedL1.getFloat(0), 0.000001); projectedL2 = (RowData) locations.get(StringData.fromString("L2")); Assert.assertNotNull("L2 should not be null", projectedL2); - Assert.assertEquals("L2 should contain latitude", - 52.995143f, projectedL2.getFloat(0), 0.000001); + Assert.assertEquals( + "L2 should contain latitude", 52.995143f, projectedL2.getFloat(0), 0.000001); } @Test public void testListProjection() throws IOException { - Schema writeSchema = new Schema( - Types.NestedField.required(0, "id", Types.LongType.get()), - Types.NestedField.optional(10, "values", - Types.ListType.ofOptional(11, Types.LongType.get())) - ); + Schema writeSchema = + new Schema( + Types.NestedField.required(0, "id", Types.LongType.get()), + Types.NestedField.optional( + 10, "values", Types.ListType.ofOptional(11, Types.LongType.get()))); GenericArrayData values = new GenericArrayData(new Long[] {56L, 57L, 58L}); RowData row = GenericRowData.of(34L, values); - Schema idOnly = new Schema( - Types.NestedField.required(0, "id", Types.LongType.get()) - ); + Schema idOnly = new Schema(Types.NestedField.required(0, "id", Types.LongType.get())); RowData projected = writeAndRead("id_only", writeSchema, idOnly, row); Assert.assertEquals("Should contain the correct id value", 34L, projected.getLong(0)); @@ -474,24 +477,24 @@ public void testListProjection() throws IOException { @Test @SuppressWarnings("unchecked") public void testListOfStructsProjection() throws IOException { - Schema writeSchema = new Schema( - Types.NestedField.required(0, "id", Types.LongType.get()), - Types.NestedField.optional(22, "points", - Types.ListType.ofOptional(21, Types.StructType.of( - Types.NestedField.required(19, "x", Types.IntegerType.get()), - Types.NestedField.optional(18, "y", Types.IntegerType.get()) - )) - ) - ); + Schema writeSchema = + new Schema( + Types.NestedField.required(0, "id", Types.LongType.get()), + Types.NestedField.optional( + 22, + "points", + Types.ListType.ofOptional( + 21, + Types.StructType.of( + Types.NestedField.required(19, "x", Types.IntegerType.get()), + Types.NestedField.optional(18, "y", Types.IntegerType.get()))))); RowData p1 = GenericRowData.of(1, 2); RowData p2 = GenericRowData.of(3, null); GenericArrayData arrayData = new GenericArrayData(new RowData[] {p1, p2}); RowData row = GenericRowData.of(34L, arrayData); - Schema idOnly = new Schema( - Types.NestedField.required(0, "id", Types.LongType.get()) - ); + Schema idOnly = new Schema(Types.NestedField.required(0, "id", Types.LongType.get())); RowData projected = writeAndRead("id_only", writeSchema, idOnly, row); Assert.assertEquals("Should contain the correct id value", 34L, projected.getLong(0)); @@ -525,13 +528,15 @@ public void testListOfStructsProjection() throws IOException { Assert.assertEquals("Should not project x", 1, projectedP2.getArity()); Assert.assertTrue("Should project null y", projectedP2.isNullAt(0)); - Schema yRenamed = new Schema( - Types.NestedField.optional(22, "points", - 
Types.ListType.ofOptional(21, Types.StructType.of( - Types.NestedField.optional(18, "z", Types.IntegerType.get()) - )) - ) - ); + Schema yRenamed = + new Schema( + Types.NestedField.optional( + 22, + "points", + Types.ListType.ofOptional( + 21, + Types.StructType.of( + Types.NestedField.optional(18, "z", Types.IntegerType.get()))))); projected = writeAndRead("y_renamed", writeSchema, yRenamed, row); Assert.assertEquals("Should not project id", 1, projected.getArity()); @@ -548,22 +553,25 @@ public void testListOfStructsProjection() throws IOException { @Test public void testAddedFieldsWithRequiredChildren() throws Exception { - Schema schema = new Schema( - Types.NestedField.required(1, "a", Types.LongType.get()) - ); + Schema schema = new Schema(Types.NestedField.required(1, "a", Types.LongType.get())); RowData row = GenericRowData.of(100L); - Schema addedFields = new Schema( - Types.NestedField.optional(1, "a", Types.LongType.get()), - Types.NestedField.optional(2, "b", Types.StructType.of( - Types.NestedField.required(3, "c", Types.LongType.get()) - )), - Types.NestedField.optional(4, "d", Types.ListType.ofRequired(5, Types.LongType.get())), - Types.NestedField.optional(6, "e", Types.MapType.ofRequired(7, 8, Types.LongType.get(), Types.LongType.get())) - ); - - RowData projected = writeAndRead("add_fields_with_required_children_projection", schema, addedFields, row); + Schema addedFields = + new Schema( + Types.NestedField.optional(1, "a", Types.LongType.get()), + Types.NestedField.optional( + 2, + "b", + Types.StructType.of(Types.NestedField.required(3, "c", Types.LongType.get()))), + Types.NestedField.optional(4, "d", Types.ListType.ofRequired(5, Types.LongType.get())), + Types.NestedField.optional( + 6, + "e", + Types.MapType.ofRequired(7, 8, Types.LongType.get(), Types.LongType.get()))); + + RowData projected = + writeAndRead("add_fields_with_required_children_projection", schema, addedFields, row); Assert.assertEquals("Should contain the correct value in column 1", projected.getLong(0), 100L); Assert.assertTrue("Should contain empty value in new column 2", projected.isNullAt(1)); Assert.assertTrue("Should contain empty value in new column 4", projected.isNullAt(2)); diff --git a/flink/v1.13/flink/src/test/java/org/apache/iceberg/flink/sink/TestDeltaTaskWriter.java b/flink/v1.13/flink/src/test/java/org/apache/iceberg/flink/sink/TestDeltaTaskWriter.java index 34b7f03673e2..a9800303aa4f 100644 --- a/flink/v1.13/flink/src/test/java/org/apache/iceberg/flink/sink/TestDeltaTaskWriter.java +++ b/flink/v1.13/flink/src/test/java/org/apache/iceberg/flink/sink/TestDeltaTaskWriter.java @@ -16,9 +16,14 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.flink.sink; +import static org.apache.iceberg.flink.SimpleDataUtil.createDelete; +import static org.apache.iceberg.flink.SimpleDataUtil.createInsert; +import static org.apache.iceberg.flink.SimpleDataUtil.createRecord; +import static org.apache.iceberg.flink.SimpleDataUtil.createUpdateAfter; +import static org.apache.iceberg.flink.SimpleDataUtil.createUpdateBefore; + import java.io.File; import java.io.IOException; import java.nio.file.Files; @@ -50,12 +55,6 @@ import org.junit.runner.RunWith; import org.junit.runners.Parameterized; -import static org.apache.iceberg.flink.SimpleDataUtil.createDelete; -import static org.apache.iceberg.flink.SimpleDataUtil.createInsert; -import static org.apache.iceberg.flink.SimpleDataUtil.createRecord; -import static org.apache.iceberg.flink.SimpleDataUtil.createUpdateAfter; -import static org.apache.iceberg.flink.SimpleDataUtil.createUpdateBefore; - @RunWith(Parameterized.class) public class TestDeltaTaskWriter extends TableTestBase { private static final int FORMAT_V2 = 2; @@ -64,11 +63,7 @@ public class TestDeltaTaskWriter extends TableTestBase { @Parameterized.Parameters(name = "FileFormat = {0}") public static Object[][] parameters() { - return new Object[][] { - {"avro"}, - {"orc"}, - {"parquet"} - }; + return new Object[][] {{"avro"}, {"orc"}, {"parquet"}}; } public TestDeltaTaskWriter(String fileFormat) { @@ -92,7 +87,8 @@ private void initTable(boolean partitioned) { this.table = create(SCHEMA, PartitionSpec.unpartitioned()); } - table.updateProperties() + table + .updateProperties() .set(TableProperties.PARQUET_ROW_GROUP_SIZE_BYTES, String.valueOf(8 * 1024)) .defaultFormat(format) .commit(); @@ -139,12 +135,14 @@ private void testCdcEvents(boolean partitioned) throws IOException { Assert.assertEquals(partitioned ? 3 : 1, result.deleteFiles().length); commitTransaction(result); - Assert.assertEquals("Should have expected records.", expectedRowSet( - createRecord(1, "eee"), - createRecord(2, "ddd"), - createRecord(4, "fff"), - createRecord(5, "ggg") - ), actualRowSet("*")); + Assert.assertEquals( + "Should have expected records.", + expectedRowSet( + createRecord(1, "eee"), + createRecord(2, "ddd"), + createRecord(4, "fff"), + createRecord(5, "ggg")), + actualRowSet("*")); // Start the 2nd transaction. writer = taskWriterFactory.create(); @@ -165,11 +163,10 @@ private void testCdcEvents(boolean partitioned) throws IOException { Assert.assertEquals(partitioned ? 3 : 1, result.deleteFiles().length); commitTransaction(result); - Assert.assertEquals("Should have expected records", expectedRowSet( - createRecord(1, "eee"), - createRecord(5, "iii"), - createRecord(6, "hhh") - ), actualRowSet("*")); + Assert.assertEquals( + "Should have expected records", + expectedRowSet(createRecord(1, "eee"), createRecord(5, "iii"), createRecord(6, "hhh")), + actualRowSet("*")); } @Test @@ -229,11 +226,15 @@ private void testAbort(boolean partitioned) throws IOException { } // Assert the current data/delete file count. - List files = Files.walk(Paths.get(tableDir.getPath(), "data")) - .filter(p -> p.toFile().isFile()) - .filter(p -> !p.toString().endsWith(".crc")) - .collect(Collectors.toList()); - Assert.assertEquals("Should have expected file count, but files are: " + files, partitioned ? 
4 : 2, files.size()); + List files = + Files.walk(Paths.get(tableDir.getPath(), "data")) + .filter(p -> p.toFile().isFile()) + .filter(p -> !p.toString().endsWith(".crc")) + .collect(Collectors.toList()); + Assert.assertEquals( + "Should have expected file count, but files are: " + files, + partitioned ? 4 : 2, + files.size()); writer.abort(); for (Path file : files) { @@ -270,11 +271,10 @@ public void testPartitionedTableWithDataAsKey() throws IOException { Assert.assertEquals(1, result.deleteFiles().length); commitTransaction(result); - Assert.assertEquals("Should have expected records", expectedRowSet( - createRecord(2, "aaa"), - createRecord(3, "bbb"), - createRecord(4, "ccc") - ), actualRowSet("*")); + Assert.assertEquals( + "Should have expected records", + expectedRowSet(createRecord(2, "aaa"), createRecord(3, "bbb"), createRecord(4, "ccc")), + actualRowSet("*")); // Start the 2nd transaction. writer = taskWriterFactory.create(); @@ -287,12 +287,14 @@ public void testPartitionedTableWithDataAsKey() throws IOException { Assert.assertEquals(1, result.deleteFiles().length); commitTransaction(result); - Assert.assertEquals("Should have expected records", expectedRowSet( - createRecord(2, "aaa"), - createRecord(5, "aaa"), - createRecord(3, "bbb"), - createRecord(6, "bbb") - ), actualRowSet("*")); + Assert.assertEquals( + "Should have expected records", + expectedRowSet( + createRecord(2, "aaa"), + createRecord(5, "aaa"), + createRecord(3, "bbb"), + createRecord(6, "bbb")), + actualRowSet("*")); } @Test @@ -311,20 +313,21 @@ public void testPartitionedTableWithDataAndIdAsKey() throws IOException { WriteResult result = writer.complete(); Assert.assertEquals(1, result.dataFiles().length); Assert.assertEquals(1, result.deleteFiles().length); - Assert.assertEquals(Sets.newHashSet(FileContent.POSITION_DELETES), - Sets.newHashSet(result.deleteFiles()[0].content())); + Assert.assertEquals( + Sets.newHashSet(FileContent.POSITION_DELETES), + Sets.newHashSet(result.deleteFiles()[0].content())); commitTransaction(result); - Assert.assertEquals("Should have expected records", expectedRowSet( - createRecord(1, "aaa") - ), actualRowSet("*")); + Assert.assertEquals( + "Should have expected records", expectedRowSet(createRecord(1, "aaa")), actualRowSet("*")); } private void commitTransaction(WriteResult result) { RowDelta rowDelta = table.newRowDelta(); Arrays.stream(result.dataFiles()).forEach(rowDelta::addRows); Arrays.stream(result.deleteFiles()).forEach(rowDelta::addDeletes); - rowDelta.validateDeletedFiles() + rowDelta + .validateDeletedFiles() .validateDataFilesExist(Lists.newArrayList(result.referencedDataFiles())) .commit(); } @@ -339,7 +342,11 @@ private StructLikeSet actualRowSet(String... 
columns) throws IOException { private TaskWriterFactory createTaskWriterFactory(List equalityFieldIds) { return new RowDataTaskWriterFactory( - SerializableTable.copyOf(table), FlinkSchemaUtil.convert(table.schema()), - 128 * 1024 * 1024, format, equalityFieldIds, false); + SerializableTable.copyOf(table), + FlinkSchemaUtil.convert(table.schema()), + 128 * 1024 * 1024, + format, + equalityFieldIds, + false); } } diff --git a/flink/v1.13/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkAppenderFactory.java b/flink/v1.13/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkAppenderFactory.java index 8d7fa86eac50..4c17cd7607df 100644 --- a/flink/v1.13/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkAppenderFactory.java +++ b/flink/v1.13/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkAppenderFactory.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.flink.sink; import java.util.List; @@ -41,11 +40,16 @@ public TestFlinkAppenderFactory(String fileFormat, boolean partitioned) { } @Override - protected FileAppenderFactory createAppenderFactory(List equalityFieldIds, - Schema eqDeleteSchema, - Schema posDeleteRowSchema) { - return new FlinkAppenderFactory(table.schema(), rowType, table.properties(), table.spec(), - ArrayUtil.toIntArray(equalityFieldIds), eqDeleteSchema, posDeleteRowSchema); + protected FileAppenderFactory createAppenderFactory( + List equalityFieldIds, Schema eqDeleteSchema, Schema posDeleteRowSchema) { + return new FlinkAppenderFactory( + table.schema(), + rowType, + table.properties(), + table.spec(), + ArrayUtil.toIntArray(equalityFieldIds), + eqDeleteSchema, + posDeleteRowSchema); } @Override diff --git a/flink/v1.13/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkFileWriterFactory.java b/flink/v1.13/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkFileWriterFactory.java index 3223b6e28b92..da45241256f5 100644 --- a/flink/v1.13/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkFileWriterFactory.java +++ b/flink/v1.13/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkFileWriterFactory.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.flink.sink; import java.util.List; @@ -39,9 +38,11 @@ public TestFlinkFileWriterFactory(FileFormat fileFormat, boolean partitioned) { } @Override - protected FileWriterFactory newWriterFactory(Schema dataSchema, List equalityFieldIds, - Schema equalityDeleteRowSchema, - Schema positionDeleteRowSchema) { + protected FileWriterFactory newWriterFactory( + Schema dataSchema, + List equalityFieldIds, + Schema equalityDeleteRowSchema, + Schema positionDeleteRowSchema) { return FlinkFileWriterFactory.builderFor(table) .dataSchema(table.schema()) .dataFileFormat(format()) diff --git a/flink/v1.13/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkIcebergSink.java b/flink/v1.13/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkIcebergSink.java index 38de1bd4f942..79daac3750a6 100644 --- a/flink/v1.13/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkIcebergSink.java +++ b/flink/v1.13/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkIcebergSink.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.flink.sink; import java.io.File; @@ -63,13 +62,12 @@ public class TestFlinkIcebergSink { public static final MiniClusterWithClientResource MINI_CLUSTER_RESOURCE = MiniClusterResource.createWithClassloaderCheckDisabled(); - @ClassRule - public static final TemporaryFolder TEMPORARY_FOLDER = new TemporaryFolder(); + @ClassRule public static final TemporaryFolder TEMPORARY_FOLDER = new TemporaryFolder(); - private static final TypeInformation ROW_TYPE_INFO = new RowTypeInfo( - SimpleDataUtil.FLINK_SCHEMA.getFieldTypes()); - private static final DataFormatConverters.RowConverter CONVERTER = new DataFormatConverters.RowConverter( - SimpleDataUtil.FLINK_SCHEMA.getFieldDataTypes()); + private static final TypeInformation ROW_TYPE_INFO = + new RowTypeInfo(SimpleDataUtil.FLINK_SCHEMA.getFieldTypes()); + private static final DataFormatConverters.RowConverter CONVERTER = + new DataFormatConverters.RowConverter(SimpleDataUtil.FLINK_SCHEMA.getFieldDataTypes()); private String tablePath; private Table table; @@ -83,18 +81,18 @@ public class TestFlinkIcebergSink { @Parameterized.Parameters(name = "format={0}, parallelism = {1}, partitioned = {2}") public static Object[][] parameters() { return new Object[][] { - {"avro", 1, true}, - {"avro", 1, false}, - {"avro", 2, true}, - {"avro", 2, false}, - {"orc", 1, true}, - {"orc", 1, false}, - {"orc", 2, true}, - {"orc", 2, false}, - {"parquet", 1, true}, - {"parquet", 1, false}, - {"parquet", 2, true}, - {"parquet", 2, false} + {"avro", 1, true}, + {"avro", 1, false}, + {"avro", 2, true}, + {"avro", 2, false}, + {"orc", 1, true}, + {"orc", 1, false}, + {"orc", 2, true}, + {"orc", 2, false}, + {"parquet", 1, true}, + {"parquet", 1, false}, + {"parquet", 2, true}, + {"parquet", 2, false} }; } @@ -115,10 +113,12 @@ public void before() throws IOException { Map props = ImmutableMap.of(TableProperties.DEFAULT_FILE_FORMAT, format.name()); table = SimpleDataUtil.createTable(tablePath, props, partitioned); - env = StreamExecutionEnvironment.getExecutionEnvironment(MiniClusterResource.DISABLE_CLASSLOADER_CHECK_CONFIG) - .enableCheckpointing(100) - .setParallelism(parallelism) - .setMaxParallelism(parallelism); + env = + StreamExecutionEnvironment.getExecutionEnvironment( + MiniClusterResource.DISABLE_CLASSLOADER_CHECK_CONFIG) + .enableCheckpointing(100) + .setParallelism(parallelism) + .setMaxParallelism(parallelism); tableLoader = TableLoader.fromHadoopTable(tablePath); } @@ -133,13 +133,10 @@ private BoundedTestSource createBoundedSource(List rows) { @Test public void testWriteRowData() throws Exception { - List rows = Lists.newArrayList( - Row.of(1, "hello"), - Row.of(2, "world"), - Row.of(3, "foo") - ); - DataStream dataStream = env.addSource(createBoundedSource(rows), ROW_TYPE_INFO) - .map(CONVERTER::toInternal, FlinkCompatibilityUtil.toTypeInfo(SimpleDataUtil.ROW_TYPE)); + List rows = Lists.newArrayList(Row.of(1, "hello"), Row.of(2, "world"), Row.of(3, "foo")); + DataStream dataStream = + env.addSource(createBoundedSource(rows), ROW_TYPE_INFO) + .map(CONVERTER::toInternal, FlinkCompatibilityUtil.toTypeInfo(SimpleDataUtil.ROW_TYPE)); FlinkSink.forRowData(dataStream) .table(table) @@ -164,11 +161,11 @@ private List createRows(String prefix) { Row.of(2, prefix + "ccc"), Row.of(3, prefix + "aaa"), Row.of(3, prefix + "bbb"), - Row.of(3, prefix + "ccc") - ); + Row.of(3, prefix + "ccc")); } - private void testWriteRow(TableSchema tableSchema, DistributionMode distributionMode) throws Exception { + private void testWriteRow(TableSchema 
tableSchema, DistributionMode distributionMode) + throws Exception { List rows = createRows(""); DataStream dataStream = env.addSource(createBoundedSource(rows), ROW_TYPE_INFO); @@ -202,7 +199,8 @@ public void testWriteRowWithTableSchema() throws Exception { @Test public void testJobNoneDistributeMode() throws Exception { - table.updateProperties() + table + .updateProperties() .set(TableProperties.WRITE_DISTRIBUTION_MODE, DistributionMode.HASH.modeName()) .commit(); @@ -218,12 +216,15 @@ public void testJobNoneDistributeMode() throws Exception { @Test public void testJobHashDistributionMode() { - table.updateProperties() + table + .updateProperties() .set(TableProperties.WRITE_DISTRIBUTION_MODE, DistributionMode.HASH.modeName()) .commit(); - AssertHelpers.assertThrows("Does not support range distribution-mode now.", - IllegalArgumentException.class, "Flink does not support 'range' write distribution mode now.", + AssertHelpers.assertThrows( + "Does not support range distribution-mode now.", + IllegalArgumentException.class, + "Flink does not support 'range' write distribution mode now.", () -> { testWriteRow(null, DistributionMode.RANGE); return null; @@ -232,16 +233,20 @@ public void testJobHashDistributionMode() { @Test public void testJobNullDistributionMode() throws Exception { - table.updateProperties() + table + .updateProperties() .set(TableProperties.WRITE_DISTRIBUTION_MODE, DistributionMode.HASH.modeName()) .commit(); testWriteRow(null, null); if (partitioned) { - Assert.assertEquals("There should be only 1 data file in partition 'aaa'", 1, partitionFiles("aaa")); - Assert.assertEquals("There should be only 1 data file in partition 'bbb'", 1, partitionFiles("bbb")); - Assert.assertEquals("There should be only 1 data file in partition 'ccc'", 1, partitionFiles("ccc")); + Assert.assertEquals( + "There should be only 1 data file in partition 'aaa'", 1, partitionFiles("aaa")); + Assert.assertEquals( + "There should be only 1 data file in partition 'bbb'", 1, partitionFiles("bbb")); + Assert.assertEquals( + "There should be only 1 data file in partition 'ccc'", 1, partitionFiles("ccc")); } } @@ -249,9 +254,12 @@ public void testJobNullDistributionMode() throws Exception { public void testPartitionWriteMode() throws Exception { testWriteRow(null, DistributionMode.HASH); if (partitioned) { - Assert.assertEquals("There should be only 1 data file in partition 'aaa'", 1, partitionFiles("aaa")); - Assert.assertEquals("There should be only 1 data file in partition 'bbb'", 1, partitionFiles("bbb")); - Assert.assertEquals("There should be only 1 data file in partition 'ccc'", 1, partitionFiles("ccc")); + Assert.assertEquals( + "There should be only 1 data file in partition 'aaa'", 1, partitionFiles("aaa")); + Assert.assertEquals( + "There should be only 1 data file in partition 'bbb'", 1, partitionFiles("bbb")); + Assert.assertEquals( + "There should be only 1 data file in partition 'ccc'", 1, partitionFiles("ccc")); } } @@ -259,9 +267,12 @@ public void testPartitionWriteMode() throws Exception { public void testShuffleByPartitionWithSchema() throws Exception { testWriteRow(SimpleDataUtil.FLINK_SCHEMA, DistributionMode.HASH); if (partitioned) { - Assert.assertEquals("There should be only 1 data file in partition 'aaa'", 1, partitionFiles("aaa")); - Assert.assertEquals("There should be only 1 data file in partition 'bbb'", 1, partitionFiles("bbb")); - Assert.assertEquals("There should be only 1 data file in partition 'ccc'", 1, partitionFiles("ccc")); + Assert.assertEquals( + "There should be 
only 1 data file in partition 'aaa'", 1, partitionFiles("aaa")); + Assert.assertEquals( + "There should be only 1 data file in partition 'bbb'", 1, partitionFiles("bbb")); + Assert.assertEquals( + "There should be only 1 data file in partition 'ccc'", 1, partitionFiles("ccc")); } } @@ -279,17 +290,19 @@ public void testTwoSinksInDisjointedDAG() throws Exception { Table rightTable = SimpleDataUtil.createTable(rightTablePath, props, partitioned); TableLoader rightTableLoader = TableLoader.fromHadoopTable(rightTablePath); - env = StreamExecutionEnvironment - .getExecutionEnvironment(MiniClusterResource.DISABLE_CLASSLOADER_CHECK_CONFIG) - .enableCheckpointing(100) - .setParallelism(parallelism) - .setMaxParallelism(parallelism); + env = + StreamExecutionEnvironment.getExecutionEnvironment( + MiniClusterResource.DISABLE_CLASSLOADER_CHECK_CONFIG) + .enableCheckpointing(100) + .setParallelism(parallelism) + .setMaxParallelism(parallelism); env.getConfig().disableAutoGeneratedUIDs(); List leftRows = createRows("left-"); - DataStream leftStream = env.fromCollection(leftRows, ROW_TYPE_INFO) - .name("leftCustomSource") - .uid("leftCustomSource"); + DataStream leftStream = + env.fromCollection(leftRows, ROW_TYPE_INFO) + .name("leftCustomSource") + .uid("leftCustomSource"); FlinkSink.forRow(leftStream, SimpleDataUtil.FLINK_SCHEMA) .table(leftTable) .tableLoader(leftTableLoader) @@ -299,9 +312,10 @@ public void testTwoSinksInDisjointedDAG() throws Exception { .append(); List rightRows = createRows("right-"); - DataStream rightStream = env.fromCollection(rightRows, ROW_TYPE_INFO) - .name("rightCustomSource") - .uid("rightCustomSource"); + DataStream rightStream = + env.fromCollection(rightRows, ROW_TYPE_INFO) + .name("rightCustomSource") + .uid("rightCustomSource"); FlinkSink.forRow(rightStream, SimpleDataUtil.FLINK_SCHEMA) .table(rightTable) .tableLoader(rightTableLoader) @@ -326,14 +340,17 @@ public void testOverrideWriteConfigWithUnknownDistributionMode() { List rows = createRows(""); DataStream dataStream = env.addSource(createBoundedSource(rows), ROW_TYPE_INFO); - FlinkSink.Builder builder = FlinkSink.forRow(dataStream, SimpleDataUtil.FLINK_SCHEMA) - .table(table) - .tableLoader(tableLoader) - .writeParallelism(parallelism) - .setAll(newProps); - - AssertHelpers.assertThrows("Should fail with invalid distribution mode.", - IllegalArgumentException.class, "No enum constant org.apache.iceberg.DistributionMode.UNRECOGNIZED", + FlinkSink.Builder builder = + FlinkSink.forRow(dataStream, SimpleDataUtil.FLINK_SCHEMA) + .table(table) + .tableLoader(tableLoader) + .writeParallelism(parallelism) + .setAll(newProps); + + AssertHelpers.assertThrows( + "Should fail with invalid distribution mode.", + IllegalArgumentException.class, + "No enum constant org.apache.iceberg.DistributionMode.UNRECOGNIZED", () -> { builder.append(); @@ -351,14 +368,17 @@ public void testOverrideWriteConfigWithUnknownFileFormat() { List rows = createRows(""); DataStream dataStream = env.addSource(createBoundedSource(rows), ROW_TYPE_INFO); - FlinkSink.Builder builder = FlinkSink.forRow(dataStream, SimpleDataUtil.FLINK_SCHEMA) - .table(table) - .tableLoader(tableLoader) - .writeParallelism(parallelism) - .setAll(newProps); - - AssertHelpers.assertThrows("Should fail with invalid file format.", - IllegalArgumentException.class, "No enum constant org.apache.iceberg.FileFormat.UNRECOGNIZED", + FlinkSink.Builder builder = + FlinkSink.forRow(dataStream, SimpleDataUtil.FLINK_SCHEMA) + .table(table) + .tableLoader(tableLoader) + 
.writeParallelism(parallelism) + .setAll(newProps); + + AssertHelpers.assertThrows( + "Should fail with invalid file format.", + IllegalArgumentException.class, + "No enum constant org.apache.iceberg.FileFormat.UNRECOGNIZED", () -> { builder.append(); diff --git a/flink/v1.13/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkIcebergSinkV2.java b/flink/v1.13/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkIcebergSinkV2.java index 97506b90ba46..cb840ada5ac5 100644 --- a/flink/v1.13/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkIcebergSinkV2.java +++ b/flink/v1.13/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkIcebergSinkV2.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.flink.sink; import java.io.File; @@ -67,18 +66,18 @@ public class TestFlinkIcebergSinkV2 extends TableTestBase { public static final MiniClusterWithClientResource MINI_CLUSTER_RESOURCE = MiniClusterResource.createWithClassloaderCheckDisabled(); - @ClassRule - public static final TemporaryFolder TEMPORARY_FOLDER = new TemporaryFolder(); + @ClassRule public static final TemporaryFolder TEMPORARY_FOLDER = new TemporaryFolder(); private static final int FORMAT_V2 = 2; private static final TypeInformation ROW_TYPE_INFO = new RowTypeInfo(SimpleDataUtil.FLINK_SCHEMA.getFieldTypes()); - private static final Map ROW_KIND_MAP = ImmutableMap.of( - "+I", RowKind.INSERT, - "-D", RowKind.DELETE, - "-U", RowKind.UPDATE_BEFORE, - "+U", RowKind.UPDATE_AFTER); + private static final Map ROW_KIND_MAP = + ImmutableMap.of( + "+I", RowKind.INSERT, + "-D", RowKind.DELETE, + "-U", RowKind.UPDATE_BEFORE, + "+U", RowKind.UPDATE_AFTER); private static final int ROW_ID_POS = 0; private static final int ROW_DATA_POS = 1; @@ -91,27 +90,27 @@ public class TestFlinkIcebergSinkV2 extends TableTestBase { private StreamExecutionEnvironment env; private TestTableLoader tableLoader; - @Parameterized.Parameters(name = "FileFormat = {0}, Parallelism = {1}, Partitioned={2}, WriteDistributionMode ={3}") + @Parameterized.Parameters( + name = "FileFormat = {0}, Parallelism = {1}, Partitioned={2}, WriteDistributionMode ={3}") public static Object[][] parameters() { return new Object[][] { - new Object[] {"avro", 1, true, TableProperties.WRITE_DISTRIBUTION_MODE_NONE}, - new Object[] {"avro", 1, false, TableProperties.WRITE_DISTRIBUTION_MODE_NONE}, - new Object[] {"avro", 4, true, TableProperties.WRITE_DISTRIBUTION_MODE_NONE}, - new Object[] {"avro", 4, false, TableProperties.WRITE_DISTRIBUTION_MODE_NONE}, - - new Object[] {"orc", 1, true, TableProperties.WRITE_DISTRIBUTION_MODE_HASH}, - new Object[] {"orc", 1, false, TableProperties.WRITE_DISTRIBUTION_MODE_HASH}, - new Object[] {"orc", 4, true, TableProperties.WRITE_DISTRIBUTION_MODE_HASH}, - new Object[] {"orc", 4, false, TableProperties.WRITE_DISTRIBUTION_MODE_HASH}, - - new Object[] {"parquet", 1, true, TableProperties.WRITE_DISTRIBUTION_MODE_RANGE}, - new Object[] {"parquet", 1, false, TableProperties.WRITE_DISTRIBUTION_MODE_RANGE}, - new Object[] {"parquet", 4, true, TableProperties.WRITE_DISTRIBUTION_MODE_RANGE}, - new Object[] {"parquet", 4, false, TableProperties.WRITE_DISTRIBUTION_MODE_RANGE} + new Object[] {"avro", 1, true, TableProperties.WRITE_DISTRIBUTION_MODE_NONE}, + new Object[] {"avro", 1, false, TableProperties.WRITE_DISTRIBUTION_MODE_NONE}, + new Object[] {"avro", 4, true, TableProperties.WRITE_DISTRIBUTION_MODE_NONE}, + new Object[] {"avro", 4, false, 
TableProperties.WRITE_DISTRIBUTION_MODE_NONE}, + new Object[] {"orc", 1, true, TableProperties.WRITE_DISTRIBUTION_MODE_HASH}, + new Object[] {"orc", 1, false, TableProperties.WRITE_DISTRIBUTION_MODE_HASH}, + new Object[] {"orc", 4, true, TableProperties.WRITE_DISTRIBUTION_MODE_HASH}, + new Object[] {"orc", 4, false, TableProperties.WRITE_DISTRIBUTION_MODE_HASH}, + new Object[] {"parquet", 1, true, TableProperties.WRITE_DISTRIBUTION_MODE_RANGE}, + new Object[] {"parquet", 1, false, TableProperties.WRITE_DISTRIBUTION_MODE_RANGE}, + new Object[] {"parquet", 4, true, TableProperties.WRITE_DISTRIBUTION_MODE_RANGE}, + new Object[] {"parquet", 4, false, TableProperties.WRITE_DISTRIBUTION_MODE_RANGE} }; } - public TestFlinkIcebergSinkV2(String format, int parallelism, boolean partitioned, String writeDistributionMode) { + public TestFlinkIcebergSinkV2( + String format, int parallelism, boolean partitioned, String writeDistributionMode) { super(FORMAT_V2); this.format = FileFormat.valueOf(format.toUpperCase(Locale.ENGLISH)); this.parallelism = parallelism; @@ -128,18 +127,24 @@ public void setupTable() throws IOException { if (!partitioned) { table = create(SimpleDataUtil.SCHEMA, PartitionSpec.unpartitioned()); } else { - table = create(SimpleDataUtil.SCHEMA, PartitionSpec.builderFor(SimpleDataUtil.SCHEMA).identity("data").build()); + table = + create( + SimpleDataUtil.SCHEMA, + PartitionSpec.builderFor(SimpleDataUtil.SCHEMA).identity("data").build()); } - table.updateProperties() + table + .updateProperties() .set(TableProperties.DEFAULT_FILE_FORMAT, format.name()) .set(TableProperties.WRITE_DISTRIBUTION_MODE, writeDistributionMode) .commit(); - env = StreamExecutionEnvironment.getExecutionEnvironment(MiniClusterResource.DISABLE_CLASSLOADER_CHECK_CONFIG) - .enableCheckpointing(100L) - .setParallelism(parallelism) - .setMaxParallelism(parallelism); + env = + StreamExecutionEnvironment.getExecutionEnvironment( + MiniClusterResource.DISABLE_CLASSLOADER_CHECK_CONFIG) + .enableCheckpointing(100L) + .setParallelism(parallelism) + .setMaxParallelism(parallelism); tableLoader = new TestTableLoader(tableDir.getAbsolutePath()); } @@ -147,19 +152,23 @@ public void setupTable() throws IOException { private List findValidSnapshots(Table table) { List validSnapshots = Lists.newArrayList(); for (Snapshot snapshot : table.snapshots()) { - if (snapshot.allManifests(table.io()).stream().anyMatch(m -> snapshot.snapshotId() == m.snapshotId())) { + if (snapshot.allManifests(table.io()).stream() + .anyMatch(m -> snapshot.snapshotId() == m.snapshotId())) { validSnapshots.add(snapshot); } } return validSnapshots; } - private void testChangeLogs(List equalityFieldColumns, - KeySelector keySelector, - boolean insertAsUpsert, - List> elementsPerCheckpoint, - List> expectedRecordsPerCheckpoint) throws Exception { - DataStream dataStream = env.addSource(new BoundedTestSource<>(elementsPerCheckpoint), ROW_TYPE_INFO); + private void testChangeLogs( + List equalityFieldColumns, + KeySelector keySelector, + boolean insertAsUpsert, + List> elementsPerCheckpoint, + List> expectedRecordsPerCheckpoint) + throws Exception { + DataStream dataStream = + env.addSource(new BoundedTestSource<>(elementsPerCheckpoint), ROW_TYPE_INFO); FlinkSink.forRow(dataStream, SimpleDataUtil.FLINK_SCHEMA) .tableLoader(tableLoader) @@ -175,13 +184,16 @@ private void testChangeLogs(List equalityFieldColumns, table.refresh(); List snapshots = findValidSnapshots(table); int expectedSnapshotNum = expectedRecordsPerCheckpoint.size(); - 
Assert.assertEquals("Should have the expected snapshot number", expectedSnapshotNum, snapshots.size()); + Assert.assertEquals( + "Should have the expected snapshot number", expectedSnapshotNum, snapshots.size()); for (int i = 0; i < expectedSnapshotNum; i++) { long snapshotId = snapshots.get(i).snapshotId(); List expectedRecords = expectedRecordsPerCheckpoint.get(i); - Assert.assertEquals("Should have the expected records for the checkpoint#" + i, - expectedRowSet(expectedRecords.toArray(new Record[0])), actualRowSet(snapshotId, "*")); + Assert.assertEquals( + "Should have the expected records for the checkpoint#" + i, + expectedRowSet(expectedRecords.toArray(new Record[0])), + actualRowSet(snapshotId, "*")); } } @@ -200,232 +212,227 @@ private Record record(int id, String data) { @Test public void testCheckAndGetEqualityFieldIds() { - table.updateSchema() + table + .updateSchema() .allowIncompatibleChanges() .addRequiredColumn("type", Types.StringType.get()) .setIdentifierFields("type") .commit(); - DataStream dataStream = env.addSource(new BoundedTestSource<>(ImmutableList.of()), ROW_TYPE_INFO); - FlinkSink.Builder builder = FlinkSink.forRow(dataStream, SimpleDataUtil.FLINK_SCHEMA).table(table); + DataStream dataStream = + env.addSource(new BoundedTestSource<>(ImmutableList.of()), ROW_TYPE_INFO); + FlinkSink.Builder builder = + FlinkSink.forRow(dataStream, SimpleDataUtil.FLINK_SCHEMA).table(table); // Use schema identifier field IDs as equality field id list by default Assert.assertEquals( table.schema().identifierFieldIds(), - Sets.newHashSet(builder.checkAndGetEqualityFieldIds()) - ); + Sets.newHashSet(builder.checkAndGetEqualityFieldIds())); // Use user-provided equality field column as equality field id list builder.equalityFieldColumns(Lists.newArrayList("id")); Assert.assertEquals( Sets.newHashSet(table.schema().findField("id").fieldId()), - Sets.newHashSet(builder.checkAndGetEqualityFieldIds()) - ); + Sets.newHashSet(builder.checkAndGetEqualityFieldIds())); builder.equalityFieldColumns(Lists.newArrayList("type")); Assert.assertEquals( Sets.newHashSet(table.schema().findField("type").fieldId()), - Sets.newHashSet(builder.checkAndGetEqualityFieldIds()) - ); + Sets.newHashSet(builder.checkAndGetEqualityFieldIds())); } @Test public void testChangeLogOnIdKey() throws Exception { - List> elementsPerCheckpoint = ImmutableList.of( + List> elementsPerCheckpoint = ImmutableList.of( - row("+I", 1, "aaa"), - row("-D", 1, "aaa"), - row("+I", 1, "bbb"), - row("+I", 2, "aaa"), - row("-D", 2, "aaa"), - row("+I", 2, "bbb") - ), + ImmutableList.of( + row("+I", 1, "aaa"), + row("-D", 1, "aaa"), + row("+I", 1, "bbb"), + row("+I", 2, "aaa"), + row("-D", 2, "aaa"), + row("+I", 2, "bbb")), + ImmutableList.of( + row("-U", 2, "bbb"), row("+U", 2, "ccc"), row("-D", 2, "ccc"), row("+I", 2, "ddd")), + ImmutableList.of( + row("-D", 1, "bbb"), + row("+I", 1, "ccc"), + row("-D", 1, "ccc"), + row("+I", 1, "ddd"))); + + List> expectedRecords = ImmutableList.of( - row("-U", 2, "bbb"), - row("+U", 2, "ccc"), - row("-D", 2, "ccc"), - row("+I", 2, "ddd") - ), - ImmutableList.of( - row("-D", 1, "bbb"), - row("+I", 1, "ccc"), - row("-D", 1, "ccc"), - row("+I", 1, "ddd") - ) - ); - - List> expectedRecords = ImmutableList.of( - ImmutableList.of(record(1, "bbb"), record(2, "bbb")), - ImmutableList.of(record(1, "bbb"), record(2, "ddd")), - ImmutableList.of(record(1, "ddd"), record(2, "ddd")) - ); + ImmutableList.of(record(1, "bbb"), record(2, "bbb")), + ImmutableList.of(record(1, "bbb"), record(2, "ddd")), + 
ImmutableList.of(record(1, "ddd"), record(2, "ddd"))); if (partitioned && writeDistributionMode.equals(TableProperties.WRITE_DISTRIBUTION_MODE_HASH)) { - AssertHelpers.assertThrows("Should be error because equality field columns don't include all partition keys", - IllegalStateException.class, "should be included in equality fields", + AssertHelpers.assertThrows( + "Should be error because equality field columns don't include all partition keys", + IllegalStateException.class, + "should be included in equality fields", () -> { - testChangeLogs(ImmutableList.of("id"), row -> row.getField(ROW_ID_POS), false, - elementsPerCheckpoint, expectedRecords); + testChangeLogs( + ImmutableList.of("id"), + row -> row.getField(ROW_ID_POS), + false, + elementsPerCheckpoint, + expectedRecords); return null; }); } else { - testChangeLogs(ImmutableList.of("id"), row -> row.getField(ROW_ID_POS), false, - elementsPerCheckpoint, expectedRecords); + testChangeLogs( + ImmutableList.of("id"), + row -> row.getField(ROW_ID_POS), + false, + elementsPerCheckpoint, + expectedRecords); } } @Test public void testChangeLogOnDataKey() throws Exception { - List> elementsPerCheckpoint = ImmutableList.of( - ImmutableList.of( - row("+I", 1, "aaa"), - row("-D", 1, "aaa"), - row("+I", 2, "bbb"), - row("+I", 1, "bbb"), - row("+I", 2, "aaa") - ), + List> elementsPerCheckpoint = ImmutableList.of( - row("-U", 2, "aaa"), - row("+U", 1, "ccc"), - row("+I", 1, "aaa") - ), + ImmutableList.of( + row("+I", 1, "aaa"), + row("-D", 1, "aaa"), + row("+I", 2, "bbb"), + row("+I", 1, "bbb"), + row("+I", 2, "aaa")), + ImmutableList.of(row("-U", 2, "aaa"), row("+U", 1, "ccc"), row("+I", 1, "aaa")), + ImmutableList.of(row("-D", 1, "bbb"), row("+I", 2, "aaa"), row("+I", 2, "ccc"))); + + List> expectedRecords = ImmutableList.of( - row("-D", 1, "bbb"), - row("+I", 2, "aaa"), - row("+I", 2, "ccc") - ) - ); - - List> expectedRecords = ImmutableList.of( - ImmutableList.of(record(1, "bbb"), record(2, "aaa")), - ImmutableList.of(record(1, "aaa"), record(1, "bbb"), record(1, "ccc")), - ImmutableList.of(record(1, "aaa"), record(1, "ccc"), record(2, "aaa"), record(2, "ccc")) - ); - - testChangeLogs(ImmutableList.of("data"), row -> row.getField(ROW_DATA_POS), false, - elementsPerCheckpoint, expectedRecords); + ImmutableList.of(record(1, "bbb"), record(2, "aaa")), + ImmutableList.of(record(1, "aaa"), record(1, "bbb"), record(1, "ccc")), + ImmutableList.of( + record(1, "aaa"), record(1, "ccc"), record(2, "aaa"), record(2, "ccc"))); + + testChangeLogs( + ImmutableList.of("data"), + row -> row.getField(ROW_DATA_POS), + false, + elementsPerCheckpoint, + expectedRecords); } @Test public void testChangeLogOnIdDataKey() throws Exception { - List> elementsPerCheckpoint = ImmutableList.of( + List> elementsPerCheckpoint = ImmutableList.of( - row("+I", 1, "aaa"), - row("-D", 1, "aaa"), - row("+I", 2, "bbb"), - row("+I", 1, "bbb"), - row("+I", 2, "aaa") - ), + ImmutableList.of( + row("+I", 1, "aaa"), + row("-D", 1, "aaa"), + row("+I", 2, "bbb"), + row("+I", 1, "bbb"), + row("+I", 2, "aaa")), + ImmutableList.of(row("-U", 2, "aaa"), row("+U", 1, "ccc"), row("+I", 1, "aaa")), + ImmutableList.of(row("-D", 1, "bbb"), row("+I", 2, "aaa"))); + + List> expectedRecords = ImmutableList.of( - row("-U", 2, "aaa"), - row("+U", 1, "ccc"), - row("+I", 1, "aaa") - ), - ImmutableList.of( - row("-D", 1, "bbb"), - row("+I", 2, "aaa") - ) - ); - - List> expectedRecords = ImmutableList.of( - ImmutableList.of(record(1, "bbb"), record(2, "aaa"), record(2, "bbb")), - ImmutableList.of(record(1, 
"aaa"), record(1, "bbb"), record(1, "ccc"), record(2, "bbb")), - ImmutableList.of(record(1, "aaa"), record(1, "ccc"), record(2, "aaa"), record(2, "bbb")) - ); - - testChangeLogs(ImmutableList.of("data", "id"), row -> Row.of(row.getField(ROW_ID_POS), row.getField(ROW_DATA_POS)), - false, elementsPerCheckpoint, expectedRecords); + ImmutableList.of(record(1, "bbb"), record(2, "aaa"), record(2, "bbb")), + ImmutableList.of( + record(1, "aaa"), record(1, "bbb"), record(1, "ccc"), record(2, "bbb")), + ImmutableList.of( + record(1, "aaa"), record(1, "ccc"), record(2, "aaa"), record(2, "bbb"))); + + testChangeLogs( + ImmutableList.of("data", "id"), + row -> Row.of(row.getField(ROW_ID_POS), row.getField(ROW_DATA_POS)), + false, + elementsPerCheckpoint, + expectedRecords); } @Test public void testChangeLogOnSameKey() throws Exception { - List> elementsPerCheckpoint = ImmutableList.of( - // Checkpoint #1 - ImmutableList.of( - row("+I", 1, "aaa"), - row("-D", 1, "aaa"), - row("+I", 1, "aaa") - ), - // Checkpoint #2 - ImmutableList.of( - row("-U", 1, "aaa"), - row("+U", 1, "aaa") - ), - // Checkpoint #3 + List> elementsPerCheckpoint = ImmutableList.of( - row("-D", 1, "aaa"), - row("+I", 1, "aaa") - ), - // Checkpoint #4 + // Checkpoint #1 + ImmutableList.of(row("+I", 1, "aaa"), row("-D", 1, "aaa"), row("+I", 1, "aaa")), + // Checkpoint #2 + ImmutableList.of(row("-U", 1, "aaa"), row("+U", 1, "aaa")), + // Checkpoint #3 + ImmutableList.of(row("-D", 1, "aaa"), row("+I", 1, "aaa")), + // Checkpoint #4 + ImmutableList.of(row("-U", 1, "aaa"), row("+U", 1, "aaa"), row("+I", 1, "aaa"))); + + List> expectedRecords = ImmutableList.of( - row("-U", 1, "aaa"), - row("+U", 1, "aaa"), - row("+I", 1, "aaa") - ) - ); - - List> expectedRecords = ImmutableList.of( - ImmutableList.of(record(1, "aaa")), - ImmutableList.of(record(1, "aaa")), - ImmutableList.of(record(1, "aaa")), - ImmutableList.of(record(1, "aaa"), record(1, "aaa")) - ); - - testChangeLogs(ImmutableList.of("id", "data"), row -> Row.of(row.getField(ROW_ID_POS), row.getField(ROW_DATA_POS)), - false, elementsPerCheckpoint, expectedRecords); + ImmutableList.of(record(1, "aaa")), + ImmutableList.of(record(1, "aaa")), + ImmutableList.of(record(1, "aaa")), + ImmutableList.of(record(1, "aaa"), record(1, "aaa"))); + + testChangeLogs( + ImmutableList.of("id", "data"), + row -> Row.of(row.getField(ROW_ID_POS), row.getField(ROW_DATA_POS)), + false, + elementsPerCheckpoint, + expectedRecords); } @Test public void testUpsertModeCheck() throws Exception { - DataStream dataStream = env.addSource(new BoundedTestSource<>(ImmutableList.of()), ROW_TYPE_INFO); - FlinkSink.Builder builder = FlinkSink.forRow(dataStream, SimpleDataUtil.FLINK_SCHEMA) - .tableLoader(tableLoader) - .tableSchema(SimpleDataUtil.FLINK_SCHEMA) - .writeParallelism(parallelism) - .upsert(true); - - AssertHelpers.assertThrows("Should be error because upsert mode and overwrite mode enable at the same time.", - IllegalStateException.class, "OVERWRITE mode shouldn't be enable", - () -> builder.equalityFieldColumns(ImmutableList.of("id", "data")).overwrite(true).append() - ); - - AssertHelpers.assertThrows("Should be error because equality field columns are empty.", - IllegalStateException.class, "Equality field columns shouldn't be empty", - () -> builder.equalityFieldColumns(ImmutableList.of()).overwrite(false).append() - ); + DataStream dataStream = + env.addSource(new BoundedTestSource<>(ImmutableList.of()), ROW_TYPE_INFO); + FlinkSink.Builder builder = + FlinkSink.forRow(dataStream, 
SimpleDataUtil.FLINK_SCHEMA) + .tableLoader(tableLoader) + .tableSchema(SimpleDataUtil.FLINK_SCHEMA) + .writeParallelism(parallelism) + .upsert(true); + + AssertHelpers.assertThrows( + "Should be error because upsert mode and overwrite mode enable at the same time.", + IllegalStateException.class, + "OVERWRITE mode shouldn't be enable", + () -> + builder.equalityFieldColumns(ImmutableList.of("id", "data")).overwrite(true).append()); + + AssertHelpers.assertThrows( + "Should be error because equality field columns are empty.", + IllegalStateException.class, + "Equality field columns shouldn't be empty", + () -> builder.equalityFieldColumns(ImmutableList.of()).overwrite(false).append()); } @Test public void testUpsertOnIdKey() throws Exception { - List> elementsPerCheckpoint = ImmutableList.of( - ImmutableList.of( - row("+I", 1, "aaa"), - row("+U", 1, "bbb") - ), - ImmutableList.of( - row("+I", 1, "ccc") - ), + List> elementsPerCheckpoint = ImmutableList.of( - row("+U", 1, "ddd"), - row("+I", 1, "eee") - ) - ); + ImmutableList.of(row("+I", 1, "aaa"), row("+U", 1, "bbb")), + ImmutableList.of(row("+I", 1, "ccc")), + ImmutableList.of(row("+U", 1, "ddd"), row("+I", 1, "eee"))); - List> expectedRecords = ImmutableList.of( - ImmutableList.of(record(1, "bbb")), - ImmutableList.of(record(1, "ccc")), - ImmutableList.of(record(1, "eee")) - ); + List> expectedRecords = + ImmutableList.of( + ImmutableList.of(record(1, "bbb")), + ImmutableList.of(record(1, "ccc")), + ImmutableList.of(record(1, "eee"))); if (!partitioned) { - testChangeLogs(ImmutableList.of("id"), row -> row.getField(ROW_ID_POS), true, - elementsPerCheckpoint, expectedRecords); + testChangeLogs( + ImmutableList.of("id"), + row -> row.getField(ROW_ID_POS), + true, + elementsPerCheckpoint, + expectedRecords); } else { - AssertHelpers.assertThrows("Should be error because equality field columns don't include all partition keys", - IllegalStateException.class, "should be included in equality fields", + AssertHelpers.assertThrows( + "Should be error because equality field columns don't include all partition keys", + IllegalStateException.class, + "should be included in equality fields", () -> { - testChangeLogs(ImmutableList.of("id"), row -> row.getField(ROW_ID_POS), true, - elementsPerCheckpoint, expectedRecords); + testChangeLogs( + ImmutableList.of("id"), + row -> row.getField(ROW_ID_POS), + true, + elementsPerCheckpoint, + expectedRecords); return null; }); } @@ -433,61 +440,46 @@ public void testUpsertOnIdKey() throws Exception { @Test public void testUpsertOnDataKey() throws Exception { - List> elementsPerCheckpoint = ImmutableList.of( - ImmutableList.of( - row("+I", 1, "aaa"), - row("+I", 2, "aaa"), - row("+I", 3, "bbb") - ), + List> elementsPerCheckpoint = ImmutableList.of( - row("+U", 4, "aaa"), - row("-U", 3, "bbb"), - row("+U", 5, "bbb") - ), + ImmutableList.of(row("+I", 1, "aaa"), row("+I", 2, "aaa"), row("+I", 3, "bbb")), + ImmutableList.of(row("+U", 4, "aaa"), row("-U", 3, "bbb"), row("+U", 5, "bbb")), + ImmutableList.of(row("+I", 6, "aaa"), row("+U", 7, "bbb"))); + + List> expectedRecords = ImmutableList.of( - row("+I", 6, "aaa"), - row("+U", 7, "bbb") - ) - ); - - List> expectedRecords = ImmutableList.of( - ImmutableList.of(record(2, "aaa"), record(3, "bbb")), - ImmutableList.of(record(4, "aaa"), record(5, "bbb")), - ImmutableList.of(record(6, "aaa"), record(7, "bbb")) - ); - - testChangeLogs(ImmutableList.of("data"), row -> row.getField(ROW_DATA_POS), true, - elementsPerCheckpoint, expectedRecords); + 
ImmutableList.of(record(2, "aaa"), record(3, "bbb")), + ImmutableList.of(record(4, "aaa"), record(5, "bbb")), + ImmutableList.of(record(6, "aaa"), record(7, "bbb"))); + + testChangeLogs( + ImmutableList.of("data"), + row -> row.getField(ROW_DATA_POS), + true, + elementsPerCheckpoint, + expectedRecords); } @Test public void testUpsertOnIdDataKey() throws Exception { - List> elementsPerCheckpoint = ImmutableList.of( + List> elementsPerCheckpoint = ImmutableList.of( - row("+I", 1, "aaa"), - row("+U", 1, "aaa"), - row("+I", 2, "bbb") - ), - ImmutableList.of( - row("+I", 1, "aaa"), - row("-D", 2, "bbb"), - row("+I", 2, "ccc") - ), + ImmutableList.of(row("+I", 1, "aaa"), row("+U", 1, "aaa"), row("+I", 2, "bbb")), + ImmutableList.of(row("+I", 1, "aaa"), row("-D", 2, "bbb"), row("+I", 2, "ccc")), + ImmutableList.of(row("+U", 1, "bbb"), row("-U", 1, "ccc"), row("-D", 1, "aaa"))); + + List> expectedRecords = ImmutableList.of( - row("+U", 1, "bbb"), - row("-U", 1, "ccc"), - row("-D", 1, "aaa") - ) - ); - - List> expectedRecords = ImmutableList.of( - ImmutableList.of(record(1, "aaa"), record(2, "bbb")), - ImmutableList.of(record(1, "aaa"), record(2, "ccc")), - ImmutableList.of(record(1, "bbb"), record(2, "ccc")) - ); - - testChangeLogs(ImmutableList.of("id", "data"), row -> Row.of(row.getField(ROW_ID_POS), row.getField(ROW_DATA_POS)), - true, elementsPerCheckpoint, expectedRecords); + ImmutableList.of(record(1, "aaa"), record(2, "bbb")), + ImmutableList.of(record(1, "aaa"), record(2, "ccc")), + ImmutableList.of(record(1, "bbb"), record(2, "ccc"))); + + testChangeLogs( + ImmutableList.of("id", "data"), + row -> Row.of(row.getField(ROW_ID_POS), row.getField(ROW_DATA_POS)), + true, + elementsPerCheckpoint, + expectedRecords); } private StructLikeSet expectedRowSet(Record... records) { @@ -497,10 +489,8 @@ private StructLikeSet expectedRowSet(Record... records) { private StructLikeSet actualRowSet(long snapshotId, String... columns) throws IOException { table.refresh(); StructLikeSet set = StructLikeSet.create(table.schema().asStruct()); - try (CloseableIterable reader = IcebergGenerics.read(table) - .useSnapshot(snapshotId) - .select(columns) - .build()) { + try (CloseableIterable reader = + IcebergGenerics.read(table).useSnapshot(snapshotId).select(columns).build()) { reader.forEach(set::add); } return set; diff --git a/flink/v1.13/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkManifest.java b/flink/v1.13/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkManifest.java index c1538bcaff9d..3c67662f6c34 100644 --- a/flink/v1.13/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkManifest.java +++ b/flink/v1.13/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkManifest.java @@ -16,9 +16,10 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.flink.sink; +import static org.apache.iceberg.flink.sink.ManifestOutputFileFactory.FLINK_MANIFEST_LOCATION; + import java.io.File; import java.io.IOException; import java.nio.file.Paths; @@ -51,13 +52,10 @@ import org.junit.Test; import org.junit.rules.TemporaryFolder; -import static org.apache.iceberg.flink.sink.ManifestOutputFileFactory.FLINK_MANIFEST_LOCATION; - public class TestFlinkManifest { private static final Configuration CONF = new Configuration(); - @Rule - public TemporaryFolder tempFolder = new TemporaryFolder(); + @Rule public TemporaryFolder tempFolder = new TemporaryFolder(); private String tablePath; private Table table; @@ -75,15 +73,21 @@ public void before() throws IOException { // Construct the iceberg table. table = SimpleDataUtil.createTable(tablePath, ImmutableMap.of(), false); - int[] equalityFieldIds = new int[] { - table.schema().findField("id").fieldId(), - table.schema().findField("data").fieldId() - }; - this.appenderFactory = new FlinkAppenderFactory(table.schema(), FlinkSchemaUtil.convert(table.schema()), - table.properties(), table.spec(), equalityFieldIds, table.schema(), null); + int[] equalityFieldIds = + new int[] { + table.schema().findField("id").fieldId(), table.schema().findField("data").fieldId() + }; + this.appenderFactory = + new FlinkAppenderFactory( + table.schema(), + FlinkSchemaUtil.convert(table.schema()), + table.properties(), + table.spec(), + equalityFieldIds, + table.schema(), + null); } - @Test public void testIO() throws IOException { String flinkJobId = newFlinkJobId(); @@ -95,13 +99,15 @@ public void testIO() throws IOException { List dataFiles = generateDataFiles(10); List eqDeleteFiles = generateEqDeleteFiles(5); List posDeleteFiles = generatePosDeleteFiles(5); - DeltaManifests deltaManifests = FlinkManifestUtil.writeCompletedFiles( - WriteResult.builder() - .addDataFiles(dataFiles) - .addDeleteFiles(eqDeleteFiles) - .addDeleteFiles(posDeleteFiles) - .build(), - () -> factory.create(curCkpId), table.spec()); + DeltaManifests deltaManifests = + FlinkManifestUtil.writeCompletedFiles( + WriteResult.builder() + .addDataFiles(dataFiles) + .addDeleteFiles(eqDeleteFiles) + .addDeleteFiles(posDeleteFiles) + .build(), + () -> factory.create(curCkpId), + table.spec()); WriteResult result = FlinkManifestUtil.readCompletedFiles(deltaManifests, table.io()); Assert.assertEquals("Size of data file list are not equal.", 10, result.deleteFiles().length); @@ -123,30 +129,33 @@ public void testUserProvidedManifestLocation() throws IOException { long checkpointId = 1; String flinkJobId = newFlinkJobId(); File userProvidedFolder = tempFolder.newFolder(); - Map props = ImmutableMap.of(FLINK_MANIFEST_LOCATION, userProvidedFolder.getAbsolutePath() + "///"); - ManifestOutputFileFactory factory = new ManifestOutputFileFactory( - ((HasTableOperations) table).operations(), table.io(), props, - flinkJobId, 1, 1); + Map props = + ImmutableMap.of(FLINK_MANIFEST_LOCATION, userProvidedFolder.getAbsolutePath() + "///"); + ManifestOutputFileFactory factory = + new ManifestOutputFileFactory( + ((HasTableOperations) table).operations(), table.io(), props, flinkJobId, 1, 1); List dataFiles = generateDataFiles(5); - DeltaManifests deltaManifests = FlinkManifestUtil.writeCompletedFiles( - WriteResult.builder() - .addDataFiles(dataFiles) - .build(), - () -> factory.create(checkpointId), - table.spec()); + DeltaManifests deltaManifests = + FlinkManifestUtil.writeCompletedFiles( + WriteResult.builder().addDataFiles(dataFiles).build(), + () 
-> factory.create(checkpointId), + table.spec()); Assert.assertNotNull("Data manifest shouldn't be null", deltaManifests.dataManifest()); Assert.assertNull("Delete manifest should be null", deltaManifests.deleteManifest()); - Assert.assertEquals("The newly created manifest file should be located under the user provided directory", - userProvidedFolder.toPath(), Paths.get(deltaManifests.dataManifest().path()).getParent()); + Assert.assertEquals( + "The newly created manifest file should be located under the user provided directory", + userProvidedFolder.toPath(), + Paths.get(deltaManifests.dataManifest().path()).getParent()); WriteResult result = FlinkManifestUtil.readCompletedFiles(deltaManifests, table.io()); Assert.assertEquals(0, result.deleteFiles().length); Assert.assertEquals(5, result.dataFiles().length); - Assert.assertEquals("Size of data file list are not equal.", dataFiles.size(), result.dataFiles().length); + Assert.assertEquals( + "Size of data file list are not equal.", dataFiles.size(), result.dataFiles().length); for (int i = 0; i < dataFiles.size(); i++) { TestHelpers.assertEquals(dataFiles.get(i), result.dataFiles()[i]); } @@ -156,28 +165,34 @@ public void testUserProvidedManifestLocation() throws IOException { public void testVersionedSerializer() throws IOException { long checkpointId = 1; String flinkJobId = newFlinkJobId(); - ManifestOutputFileFactory factory = FlinkManifestUtil.createOutputFileFactory(table, flinkJobId, 1, 1); + ManifestOutputFileFactory factory = + FlinkManifestUtil.createOutputFileFactory(table, flinkJobId, 1, 1); List dataFiles = generateDataFiles(10); List eqDeleteFiles = generateEqDeleteFiles(10); List posDeleteFiles = generatePosDeleteFiles(10); - DeltaManifests expected = FlinkManifestUtil.writeCompletedFiles( - WriteResult.builder() - .addDataFiles(dataFiles) - .addDeleteFiles(eqDeleteFiles) - .addDeleteFiles(posDeleteFiles) - .build(), - () -> factory.create(checkpointId), table.spec()); + DeltaManifests expected = + FlinkManifestUtil.writeCompletedFiles( + WriteResult.builder() + .addDataFiles(dataFiles) + .addDeleteFiles(eqDeleteFiles) + .addDeleteFiles(posDeleteFiles) + .build(), + () -> factory.create(checkpointId), + table.spec()); byte[] versionedSerializeData = - SimpleVersionedSerialization.writeVersionAndSerialize(DeltaManifestsSerializer.INSTANCE, expected); - DeltaManifests actual = SimpleVersionedSerialization - .readVersionAndDeSerialize(DeltaManifestsSerializer.INSTANCE, versionedSerializeData); + SimpleVersionedSerialization.writeVersionAndSerialize( + DeltaManifestsSerializer.INSTANCE, expected); + DeltaManifests actual = + SimpleVersionedSerialization.readVersionAndDeSerialize( + DeltaManifestsSerializer.INSTANCE, versionedSerializeData); TestHelpers.assertEquals(expected.dataManifest(), actual.dataManifest()); TestHelpers.assertEquals(expected.deleteManifest(), actual.deleteManifest()); byte[] versionedSerializeData2 = - SimpleVersionedSerialization.writeVersionAndSerialize(DeltaManifestsSerializer.INSTANCE, actual); + SimpleVersionedSerialization.writeVersionAndSerialize( + DeltaManifestsSerializer.INSTANCE, actual); Assert.assertArrayEquals(versionedSerializeData, versionedSerializeData2); } @@ -186,16 +201,21 @@ public void testCompatibility() throws IOException { // The v2 deserializer should be able to deserialize the v1 binary. 
long checkpointId = 1; String flinkJobId = newFlinkJobId(); - ManifestOutputFileFactory factory = FlinkManifestUtil.createOutputFileFactory(table, flinkJobId, 1, 1); + ManifestOutputFileFactory factory = + FlinkManifestUtil.createOutputFileFactory(table, flinkJobId, 1, 1); List dataFiles = generateDataFiles(10); - ManifestFile manifest = FlinkManifestUtil.writeDataFiles(factory.create(checkpointId), table.spec(), dataFiles); - byte[] dataV1 = SimpleVersionedSerialization.writeVersionAndSerialize(new V1Serializer(), manifest); + ManifestFile manifest = + FlinkManifestUtil.writeDataFiles(factory.create(checkpointId), table.spec(), dataFiles); + byte[] dataV1 = + SimpleVersionedSerialization.writeVersionAndSerialize(new V1Serializer(), manifest); DeltaManifests delta = - SimpleVersionedSerialization.readVersionAndDeSerialize(DeltaManifestsSerializer.INSTANCE, dataV1); + SimpleVersionedSerialization.readVersionAndDeSerialize( + DeltaManifestsSerializer.INSTANCE, dataV1); Assert.assertNull("Serialization v1 don't include delete files.", delta.deleteManifest()); - Assert.assertNotNull("Serialization v1 should not have null data manifest.", delta.dataManifest()); + Assert.assertNotNull( + "Serialization v1 should not have null data manifest.", delta.dataManifest()); TestHelpers.assertEquals(manifest, delta.dataManifest()); List actualFiles = FlinkManifestUtil.readDataFiles(delta.dataManifest(), table.io()); @@ -224,18 +244,24 @@ public ManifestFile deserialize(int version, byte[] serialized) throws IOExcepti } private DataFile writeDataFile(String filename, List rows) throws IOException { - return SimpleDataUtil.writeFile(table.schema(), table.spec(), CONF, - tablePath, FileFormat.PARQUET.addExtension(filename), rows); + return SimpleDataUtil.writeFile( + table.schema(), + table.spec(), + CONF, + tablePath, + FileFormat.PARQUET.addExtension(filename), + rows); } private DeleteFile writeEqDeleteFile(String filename, List deletes) throws IOException { - return SimpleDataUtil.writeEqDeleteFile(table, FileFormat.PARQUET, tablePath, filename, appenderFactory, deletes); + return SimpleDataUtil.writeEqDeleteFile( + table, FileFormat.PARQUET, tablePath, filename, appenderFactory, deletes); } private DeleteFile writePosDeleteFile(String filename, List> positions) throws IOException { - return SimpleDataUtil - .writePosDeleteFile(table, FileFormat.PARQUET, tablePath, filename, appenderFactory, positions); + return SimpleDataUtil.writePosDeleteFile( + table, FileFormat.PARQUET, tablePath, filename, appenderFactory, positions); } private List generateDataFiles(int fileNum) throws IOException { @@ -253,7 +279,8 @@ private List generateEqDeleteFiles(int fileNum) throws IOException { List deleteFiles = Lists.newArrayList(); for (int i = 0; i < fileNum; i++) { rowDataList.add(SimpleDataUtil.createDelete(i, "a" + i)); - deleteFiles.add(writeEqDeleteFile("eq-delete-file-" + fileCount.incrementAndGet(), rowDataList)); + deleteFiles.add( + writeEqDeleteFile("eq-delete-file-" + fileCount.incrementAndGet(), rowDataList)); } return deleteFiles; } @@ -263,7 +290,8 @@ private List generatePosDeleteFiles(int fileNum) throws IOException List deleteFiles = Lists.newArrayList(); for (int i = 0; i < fileNum; i++) { positions.add(Pair.of("data-file-1", (long) i)); - deleteFiles.add(writePosDeleteFile("pos-delete-file-" + fileCount.incrementAndGet(), positions)); + deleteFiles.add( + writePosDeleteFile("pos-delete-file-" + fileCount.incrementAndGet(), positions)); } return deleteFiles; } diff --git 
a/flink/v1.13/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkPartitioningWriters.java b/flink/v1.13/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkPartitioningWriters.java index 934b5a0d75de..3951c2e70f65 100644 --- a/flink/v1.13/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkPartitioningWriters.java +++ b/flink/v1.13/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkPartitioningWriters.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.flink.sink; import java.util.List; @@ -39,9 +38,11 @@ public TestFlinkPartitioningWriters(FileFormat fileFormat) { } @Override - protected FileWriterFactory newWriterFactory(Schema dataSchema, List equalityFieldIds, - Schema equalityDeleteRowSchema, - Schema positionDeleteRowSchema) { + protected FileWriterFactory newWriterFactory( + Schema dataSchema, + List equalityFieldIds, + Schema equalityDeleteRowSchema, + Schema positionDeleteRowSchema) { return FlinkFileWriterFactory.builderFor(table) .dataSchema(table.schema()) .dataFileFormat(format()) diff --git a/flink/v1.13/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkPositionDeltaWriters.java b/flink/v1.13/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkPositionDeltaWriters.java index 5fd5c5eebee9..9e846efe6fc9 100644 --- a/flink/v1.13/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkPositionDeltaWriters.java +++ b/flink/v1.13/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkPositionDeltaWriters.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.flink.sink; import java.util.List; @@ -39,9 +38,11 @@ public TestFlinkPositionDeltaWriters(FileFormat fileFormat) { } @Override - protected FileWriterFactory newWriterFactory(Schema dataSchema, List equalityFieldIds, - Schema equalityDeleteRowSchema, - Schema positionDeleteRowSchema) { + protected FileWriterFactory newWriterFactory( + Schema dataSchema, + List equalityFieldIds, + Schema equalityDeleteRowSchema, + Schema positionDeleteRowSchema) { return FlinkFileWriterFactory.builderFor(table) .dataSchema(table.schema()) .dataFileFormat(format()) diff --git a/flink/v1.13/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkRollingFileWriters.java b/flink/v1.13/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkRollingFileWriters.java index 9339e5ac2c3e..07716b9c3e60 100644 --- a/flink/v1.13/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkRollingFileWriters.java +++ b/flink/v1.13/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkRollingFileWriters.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.flink.sink; import java.util.List; @@ -35,9 +34,11 @@ public TestFlinkRollingFileWriters(FileFormat fileFormat, boolean partitioned) { } @Override - protected FileWriterFactory newWriterFactory(Schema dataSchema, List equalityFieldIds, - Schema equalityDeleteRowSchema, - Schema positionDeleteRowSchema) { + protected FileWriterFactory newWriterFactory( + Schema dataSchema, + List equalityFieldIds, + Schema equalityDeleteRowSchema, + Schema positionDeleteRowSchema) { return FlinkFileWriterFactory.builderFor(table) .dataSchema(table.schema()) .dataFileFormat(format()) diff --git a/flink/v1.13/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkWriterMetrics.java b/flink/v1.13/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkWriterMetrics.java index aa31c1819d10..e6d64ef2c720 100644 --- a/flink/v1.13/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkWriterMetrics.java +++ b/flink/v1.13/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkWriterMetrics.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.flink.sink; import org.apache.flink.table.data.GenericRowData; diff --git a/flink/v1.13/flink/src/test/java/org/apache/iceberg/flink/sink/TestIcebergFilesCommitter.java b/flink/v1.13/flink/src/test/java/org/apache/iceberg/flink/sink/TestIcebergFilesCommitter.java index 135fa84ee94a..3241bc658fb6 100644 --- a/flink/v1.13/flink/src/test/java/org/apache/iceberg/flink/sink/TestIcebergFilesCommitter.java +++ b/flink/v1.13/flink/src/test/java/org/apache/iceberg/flink/sink/TestIcebergFilesCommitter.java @@ -16,9 +16,12 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.flink.sink; +import static org.apache.iceberg.TableProperties.DEFAULT_FILE_FORMAT; +import static org.apache.iceberg.flink.sink.IcebergFilesCommitter.MAX_CONTINUOUS_EMPTY_COMMITS; +import static org.apache.iceberg.flink.sink.ManifestOutputFileFactory.FLINK_MANIFEST_LOCATION; + import java.io.File; import java.io.IOException; import java.nio.file.Files; @@ -64,10 +67,6 @@ import org.junit.runner.RunWith; import org.junit.runners.Parameterized; -import static org.apache.iceberg.TableProperties.DEFAULT_FILE_FORMAT; -import static org.apache.iceberg.flink.sink.IcebergFilesCommitter.MAX_CONTINUOUS_EMPTY_COMMITS; -import static org.apache.iceberg.flink.sink.ManifestOutputFileFactory.FLINK_MANIFEST_LOCATION; - @RunWith(Parameterized.class) public class TestIcebergFilesCommitter extends TableTestBase { private static final Configuration CONF = new Configuration(); @@ -80,12 +79,12 @@ public class TestIcebergFilesCommitter extends TableTestBase { @Parameterized.Parameters(name = "FileFormat = {0}, FormatVersion={1}") public static Object[][] parameters() { return new Object[][] { - new Object[] {"avro", 1}, - new Object[] {"avro", 2}, - new Object[] {"parquet", 1}, - new Object[] {"parquet", 2}, - new Object[] {"orc", 1}, - new Object[] {"orc", 2} + new Object[] {"avro", 1}, + new Object[] {"avro", 2}, + new Object[] {"parquet", 1}, + new Object[] {"parquet", 2}, + new Object[] {"orc", 1}, + new Object[] {"orc", 2} }; } @@ -108,7 +107,8 @@ public void setupTable() throws IOException { // Construct the iceberg table. 
table = create(SimpleDataUtil.SCHEMA, PartitionSpec.unpartitioned()); - table.updateProperties() + table + .updateProperties() .set(DEFAULT_FILE_FORMAT, format.name()) .set(FLINK_MANIFEST_LOCATION, flinkManifestFolder.getAbsolutePath()) .set(MAX_CONTINUOUS_EMPTY_COMMITS, "1") @@ -128,7 +128,8 @@ public void testCommitTxnWithoutDataFiles() throws Exception { assertSnapshotSize(0); assertMaxCommittedCheckpointId(jobId, -1L); - // It's better to advance the max-committed-checkpoint-id in iceberg snapshot, so that the future flink job + // It's better to advance the max-committed-checkpoint-id in iceberg snapshot, so that the + // future flink job // failover won't fail. for (int i = 1; i <= 3; i++) { harness.snapshot(++checkpointId, ++timestamp); @@ -145,9 +146,7 @@ public void testCommitTxnWithoutDataFiles() throws Exception { @Test public void testMaxContinuousEmptyCommits() throws Exception { - table.updateProperties() - .set(MAX_CONTINUOUS_EMPTY_COMMITS, "3") - .commit(); + table.updateProperties().set(MAX_CONTINUOUS_EMPTY_COMMITS, "3").commit(); JobID jobId = new JobID(); long checkpointId = 0; @@ -370,7 +369,8 @@ public void testRecoveryFromValidSnapshot() throws Exception { @Test public void testRecoveryFromSnapshotWithoutCompletedNotification() throws Exception { - // We've two steps in checkpoint: 1. snapshotState(ckp); 2. notifyCheckpointComplete(ckp). It's possible that we + // We've two steps in checkpoint: 1. snapshotState(ckp); 2. notifyCheckpointComplete(ckp). It's + // possible that we // flink job will restore from a checkpoint with only step#1 finished. long checkpointId = 0; long timestamp = 0; @@ -400,7 +400,8 @@ public void testRecoveryFromSnapshotWithoutCompletedNotification() throws Except harness.initializeState(snapshot); harness.open(); - // All flink manifests should be cleaned because it has committed the unfinished iceberg transaction. + // All flink manifests should be cleaned because it has committed the unfinished iceberg + // transaction. assertFlinkManifests(0); SimpleDataUtil.assertTableRows(table, expectedRows); @@ -428,12 +429,14 @@ public void testRecoveryFromSnapshotWithoutCompletedNotification() throws Except // Redeploying flink job from external checkpoint. JobID newJobId = new JobID(); - try (OneInputStreamOperatorTestHarness harness = createStreamSink(newJobId)) { + try (OneInputStreamOperatorTestHarness harness = + createStreamSink(newJobId)) { harness.setup(); harness.initializeState(snapshot); harness.open(); - // All flink manifests should be cleaned because it has committed the unfinished iceberg transaction. + // All flink manifests should be cleaned because it has committed the unfinished iceberg + // transaction. 
assertFlinkManifests(0); assertMaxCommittedCheckpointId(newJobId, -1); @@ -466,7 +469,8 @@ public void testStartAnotherJobToWriteSameTable() throws Exception { List tableRows = Lists.newArrayList(); JobID oldJobId = new JobID(); - try (OneInputStreamOperatorTestHarness harness = createStreamSink(oldJobId)) { + try (OneInputStreamOperatorTestHarness harness = + createStreamSink(oldJobId)) { harness.setup(); harness.open(); @@ -495,7 +499,8 @@ public void testStartAnotherJobToWriteSameTable() throws Exception { checkpointId = 0; timestamp = 0; JobID newJobId = new JobID(); - try (OneInputStreamOperatorTestHarness harness = createStreamSink(newJobId)) { + try (OneInputStreamOperatorTestHarness harness = + createStreamSink(newJobId)) { harness.setup(); harness.open(); @@ -599,11 +604,14 @@ public void testFlinkManifests() throws Exception { harness.snapshot(checkpoint, ++timestamp); List manifestPaths = assertFlinkManifests(1); Path manifestPath = manifestPaths.get(0); - Assert.assertEquals("File name should have the expected pattern.", - String.format("%s-%05d-%d-%d-%05d.avro", jobId, 0, 0, checkpoint, 1), manifestPath.getFileName().toString()); + Assert.assertEquals( + "File name should have the expected pattern.", + String.format("%s-%05d-%d-%d-%05d.avro", jobId, 0, 0, checkpoint, 1), + manifestPath.getFileName().toString()); // 2. Read the data files from manifests and assert. - List dataFiles = FlinkManifestUtil.readDataFiles(createTestingManifestFile(manifestPath), table.io()); + List dataFiles = + FlinkManifestUtil.readDataFiles(createTestingManifestFile(manifestPath), table.io()); Assert.assertEquals(1, dataFiles.size()); TestHelpers.assertEquals(dataFile1, dataFiles.get(0)); @@ -640,11 +648,14 @@ public void testDeleteFiles() throws Exception { harness.snapshot(checkpoint, ++timestamp); List manifestPaths = assertFlinkManifests(1); Path manifestPath = manifestPaths.get(0); - Assert.assertEquals("File name should have the expected pattern.", - String.format("%s-%05d-%d-%d-%05d.avro", jobId, 0, 0, checkpoint, 1), manifestPath.getFileName().toString()); + Assert.assertEquals( + "File name should have the expected pattern.", + String.format("%s-%05d-%d-%d-%05d.avro", jobId, 0, 0, checkpoint, 1), + manifestPath.getFileName().toString()); // 2. Read the data files from manifests and assert. 
- List dataFiles = FlinkManifestUtil.readDataFiles(createTestingManifestFile(manifestPath), table.io()); + List dataFiles = + FlinkManifestUtil.readDataFiles(createTestingManifestFile(manifestPath), table.io()); Assert.assertEquals(1, dataFiles.size()); TestHelpers.assertEquals(dataFile1, dataFiles.get(0)); @@ -659,11 +670,10 @@ public void testDeleteFiles() throws Exception { DataFile dataFile2 = writeDataFile("data-file-2", ImmutableList.of(row2)); RowData delete1 = SimpleDataUtil.createDelete(1, "aaa"); - DeleteFile deleteFile1 = writeEqDeleteFile(appenderFactory, "delete-file-1", ImmutableList.of(delete1)); - harness.processElement(WriteResult.builder() - .addDataFiles(dataFile2) - .addDeleteFiles(deleteFile1) - .build(), + DeleteFile deleteFile1 = + writeEqDeleteFile(appenderFactory, "delete-file-1", ImmutableList.of(delete1)); + harness.processElement( + WriteResult.builder().addDataFiles(dataFile2).addDeleteFiles(deleteFile1).build(), ++timestamp); assertMaxCommittedCheckpointId(jobId, checkpoint); @@ -699,11 +709,10 @@ public void testCommitTwoCheckpointsInSingleTxn() throws Exception { RowData insert2 = SimpleDataUtil.createInsert(2, "bbb"); RowData delete3 = SimpleDataUtil.createDelete(3, "ccc"); DataFile dataFile1 = writeDataFile("data-file-1", ImmutableList.of(insert1, insert2)); - DeleteFile deleteFile1 = writeEqDeleteFile(appenderFactory, "delete-file-1", ImmutableList.of(delete3)); - harness.processElement(WriteResult.builder() - .addDataFiles(dataFile1) - .addDeleteFiles(deleteFile1) - .build(), + DeleteFile deleteFile1 = + writeEqDeleteFile(appenderFactory, "delete-file-1", ImmutableList.of(delete3)); + harness.processElement( + WriteResult.builder().addDataFiles(dataFile1).addDeleteFiles(deleteFile1).build(), ++timestamp); // The 1th snapshotState. @@ -712,11 +721,10 @@ public void testCommitTwoCheckpointsInSingleTxn() throws Exception { RowData insert4 = SimpleDataUtil.createInsert(4, "ddd"); RowData delete2 = SimpleDataUtil.createDelete(2, "bbb"); DataFile dataFile2 = writeDataFile("data-file-2", ImmutableList.of(insert4)); - DeleteFile deleteFile2 = writeEqDeleteFile(appenderFactory, "delete-file-2", ImmutableList.of(delete2)); - harness.processElement(WriteResult.builder() - .addDataFiles(dataFile2) - .addDeleteFiles(deleteFile2) - .build(), + DeleteFile deleteFile2 = + writeEqDeleteFile(appenderFactory, "delete-file-2", ImmutableList.of(delete2)); + harness.processElement( + WriteResult.builder().addDataFiles(dataFile2).addDeleteFiles(deleteFile2).build(), ++timestamp); // The 2nd snapshotState. 
@@ -727,48 +735,76 @@ public void testCommitTwoCheckpointsInSingleTxn() throws Exception { SimpleDataUtil.assertTableRows(table, ImmutableList.of(insert1, insert4)); assertMaxCommittedCheckpointId(jobId, checkpoint); assertFlinkManifests(0); - Assert.assertEquals("Should have committed 2 txn.", 2, ImmutableList.copyOf(table.snapshots()).size()); + Assert.assertEquals( + "Should have committed 2 txn.", 2, ImmutableList.copyOf(table.snapshots()).size()); } } - private DeleteFile writeEqDeleteFile(FileAppenderFactory appenderFactory, - String filename, List deletes) throws IOException { - return SimpleDataUtil.writeEqDeleteFile(table, FileFormat.PARQUET, tablePath, filename, appenderFactory, deletes); + private DeleteFile writeEqDeleteFile( + FileAppenderFactory appenderFactory, String filename, List deletes) + throws IOException { + return SimpleDataUtil.writeEqDeleteFile( + table, FileFormat.PARQUET, tablePath, filename, appenderFactory, deletes); } - private DeleteFile writePosDeleteFile(FileAppenderFactory appenderFactory, - String filename, - List> positions) throws IOException { - return SimpleDataUtil.writePosDeleteFile(table, FileFormat.PARQUET, tablePath, filename, appenderFactory, - positions); + private DeleteFile writePosDeleteFile( + FileAppenderFactory appenderFactory, + String filename, + List> positions) + throws IOException { + return SimpleDataUtil.writePosDeleteFile( + table, FileFormat.PARQUET, tablePath, filename, appenderFactory, positions); } private FileAppenderFactory createDeletableAppenderFactory() { - int[] equalityFieldIds = new int[] { - table.schema().findField("id").fieldId(), - table.schema().findField("data").fieldId() - }; - return new FlinkAppenderFactory(table.schema(), - FlinkSchemaUtil.convert(table.schema()), table.properties(), table.spec(), equalityFieldIds, - table.schema(), null); + int[] equalityFieldIds = + new int[] { + table.schema().findField("id").fieldId(), table.schema().findField("data").fieldId() + }; + return new FlinkAppenderFactory( + table.schema(), + FlinkSchemaUtil.convert(table.schema()), + table.properties(), + table.spec(), + equalityFieldIds, + table.schema(), + null); } private ManifestFile createTestingManifestFile(Path manifestPath) { - return new GenericManifestFile(manifestPath.toAbsolutePath().toString(), manifestPath.toFile().length(), 0, - ManifestContent.DATA, 0, 0, 0L, 0, 0, 0, 0, 0, 0, null, null); + return new GenericManifestFile( + manifestPath.toAbsolutePath().toString(), + manifestPath.toFile().length(), + 0, + ManifestContent.DATA, + 0, + 0, + 0L, + 0, + 0, + 0, + 0, + 0, + 0, + null, + null); } private List assertFlinkManifests(int expectedCount) throws IOException { - List manifests = Files.list(flinkManifestFolder.toPath()) - .filter(p -> !p.toString().endsWith(".crc")) - .collect(Collectors.toList()); - Assert.assertEquals(String.format("Expected %s flink manifests, but the list is: %s", expectedCount, manifests), - expectedCount, manifests.size()); + List manifests = + Files.list(flinkManifestFolder.toPath()) + .filter(p -> !p.toString().endsWith(".crc")) + .collect(Collectors.toList()); + Assert.assertEquals( + String.format("Expected %s flink manifests, but the list is: %s", expectedCount, manifests), + expectedCount, + manifests.size()); return manifests; } private DataFile writeDataFile(String filename, List rows) throws IOException { - return SimpleDataUtil.writeFile(table.schema(), table.spec(), CONF, tablePath, format.addExtension(filename), rows); + return SimpleDataUtil.writeFile( + table.schema(), 
table.spec(), CONF, tablePath, format.addExtension(filename), rows); } private void assertMaxCommittedCheckpointId(JobID jobID, long expectedId) { @@ -815,8 +851,10 @@ private static TestOperatorFactory of(String tablePath) { @Override @SuppressWarnings("unchecked") - public > T createStreamOperator(StreamOperatorParameters param) { - IcebergFilesCommitter committer = new IcebergFilesCommitter(new TestTableLoader(tablePath), false); + public > T createStreamOperator( + StreamOperatorParameters param) { + IcebergFilesCommitter committer = + new IcebergFilesCommitter(new TestTableLoader(tablePath), false); committer.setup(param.getContainingTask(), param.getStreamConfig(), param.getOutput()); return (T) committer; } diff --git a/flink/v1.13/flink/src/test/java/org/apache/iceberg/flink/sink/TestIcebergStreamWriter.java b/flink/v1.13/flink/src/test/java/org/apache/iceberg/flink/sink/TestIcebergStreamWriter.java index 7400449b368d..6f45f10b34be 100644 --- a/flink/v1.13/flink/src/test/java/org/apache/iceberg/flink/sink/TestIcebergStreamWriter.java +++ b/flink/v1.13/flink/src/test/java/org/apache/iceberg/flink/sink/TestIcebergStreamWriter.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.flink.sink; import java.io.File; @@ -68,8 +67,7 @@ @RunWith(Parameterized.class) public class TestIcebergStreamWriter { - @Rule - public TemporaryFolder tempFolder = new TemporaryFolder(); + @Rule public TemporaryFolder tempFolder = new TemporaryFolder(); private String tablePath; private Table table; @@ -80,12 +78,12 @@ public class TestIcebergStreamWriter { @Parameterized.Parameters(name = "format = {0}, partitioned = {1}") public static Object[][] parameters() { return new Object[][] { - {"avro", true}, - {"avro", false}, - {"orc", true}, - {"orc", false}, - {"parquet", true}, - {"parquet", false} + {"avro", true}, + {"avro", false}, + {"orc", true}, + {"orc", false}, + {"parquet", true}, + {"parquet", false} }; } @@ -107,7 +105,8 @@ public void before() throws IOException { @Test public void testWritingTable() throws Exception { long checkpointId = 1L; - try (OneInputStreamOperatorTestHarness testHarness = createIcebergStreamWriter()) { + try (OneInputStreamOperatorTestHarness testHarness = + createIcebergStreamWriter()) { // The first checkpoint testHarness.processElement(SimpleDataUtil.createRowData(1, "hello"), 1); testHarness.processElement(SimpleDataUtil.createRowData(2, "world"), 1); @@ -137,13 +136,14 @@ public void testWritingTable() throws Exception { appendFiles.commit(); // Assert the table records. 
- SimpleDataUtil.assertTableRecords(tablePath, Lists.newArrayList( - SimpleDataUtil.createRecord(1, "hello"), - SimpleDataUtil.createRecord(2, "world"), - SimpleDataUtil.createRecord(3, "hello"), - SimpleDataUtil.createRecord(4, "foo"), - SimpleDataUtil.createRecord(5, "bar") - )); + SimpleDataUtil.assertTableRecords( + tablePath, + Lists.newArrayList( + SimpleDataUtil.createRecord(1, "hello"), + SimpleDataUtil.createRecord(2, "world"), + SimpleDataUtil.createRecord(3, "hello"), + SimpleDataUtil.createRecord(4, "foo"), + SimpleDataUtil.createRecord(5, "bar"))); } } @@ -151,7 +151,8 @@ public void testWritingTable() throws Exception { public void testSnapshotTwice() throws Exception { long checkpointId = 1; long timestamp = 1; - try (OneInputStreamOperatorTestHarness testHarness = createIcebergStreamWriter()) { + try (OneInputStreamOperatorTestHarness testHarness = + createIcebergStreamWriter()) { testHarness.processElement(SimpleDataUtil.createRowData(1, "hello"), timestamp++); testHarness.processElement(SimpleDataUtil.createRowData(2, "world"), timestamp); @@ -174,13 +175,15 @@ public void testSnapshotTwice() throws Exception { @Test public void testTableWithoutSnapshot() throws Exception { - try (OneInputStreamOperatorTestHarness testHarness = createIcebergStreamWriter()) { + try (OneInputStreamOperatorTestHarness testHarness = + createIcebergStreamWriter()) { Assert.assertEquals(0, testHarness.extractOutputValues().size()); } // Even if we closed the iceberg stream writer, there's no orphan data file. Assert.assertEquals(0, scanDataFiles().size()); - try (OneInputStreamOperatorTestHarness testHarness = createIcebergStreamWriter()) { + try (OneInputStreamOperatorTestHarness testHarness = + createIcebergStreamWriter()) { testHarness.processElement(SimpleDataUtil.createRowData(1, "hello"), 1); // Still not emit the data file yet, because there is no checkpoint. Assert.assertEquals(0, testHarness.extractOutputValues().size()); @@ -212,7 +215,8 @@ private Set scanDataFiles() throws IOException { @Test public void testBoundedStreamCloseWithEmittingDataFiles() throws Exception { - try (OneInputStreamOperatorTestHarness testHarness = createIcebergStreamWriter()) { + try (OneInputStreamOperatorTestHarness testHarness = + createIcebergStreamWriter()) { testHarness.processElement(SimpleDataUtil.createRowData(1, "hello"), 1); testHarness.processElement(SimpleDataUtil.createRowData(2, "world"), 2); @@ -236,7 +240,8 @@ public void testBoundedStreamCloseWithEmittingDataFiles() throws Exception { @Test public void testTableWithTargetFileSize() throws Exception { // Adjust the target-file-size in table properties. 
- table.updateProperties() + table + .updateProperties() .set(TableProperties.WRITE_TARGET_FILE_SIZE_BYTES, "4") // ~4 bytes; low enough to trigger .commit(); @@ -249,7 +254,8 @@ public void testTableWithTargetFileSize() throws Exception { } } - try (OneInputStreamOperatorTestHarness testHarness = createIcebergStreamWriter()) { + try (OneInputStreamOperatorTestHarness testHarness = + createIcebergStreamWriter()) { for (RowData row : rows) { testHarness.processElement(row, 1); } @@ -277,20 +283,26 @@ public void testTableWithTargetFileSize() throws Exception { @Test public void testPromotedFlinkDataType() throws Exception { - Schema iSchema = new Schema( - Types.NestedField.required(1, "tinyint", Types.IntegerType.get()), - Types.NestedField.required(2, "smallint", Types.IntegerType.get()), - Types.NestedField.optional(3, "int", Types.IntegerType.get()) - ); - TableSchema flinkSchema = TableSchema.builder() - .field("tinyint", DataTypes.TINYINT().notNull()) - .field("smallint", DataTypes.SMALLINT().notNull()) - .field("int", DataTypes.INT().nullable()) - .build(); + Schema iSchema = + new Schema( + Types.NestedField.required(1, "tinyint", Types.IntegerType.get()), + Types.NestedField.required(2, "smallint", Types.IntegerType.get()), + Types.NestedField.optional(3, "int", Types.IntegerType.get())); + TableSchema flinkSchema = + TableSchema.builder() + .field("tinyint", DataTypes.TINYINT().notNull()) + .field("smallint", DataTypes.SMALLINT().notNull()) + .field("int", DataTypes.INT().nullable()) + .build(); PartitionSpec spec; if (partitioned) { - spec = PartitionSpec.builderFor(iSchema).identity("smallint").identity("tinyint").identity("int").build(); + spec = + PartitionSpec.builderFor(iSchema) + .identity("smallint") + .identity("tinyint") + .identity("int") + .build(); } else { spec = PartitionSpec.unpartitioned(); } @@ -299,21 +311,21 @@ public void testPromotedFlinkDataType() throws Exception { Map props = ImmutableMap.of(TableProperties.DEFAULT_FILE_FORMAT, format.name()); Table icebergTable = new HadoopTables().create(iSchema, spec, props, location); - List rows = Lists.newArrayList( - GenericRowData.of((byte) 0x01, (short) -32768, 101), - GenericRowData.of((byte) 0x02, (short) 0, 102), - GenericRowData.of((byte) 0x03, (short) 32767, 103) - ); + List rows = + Lists.newArrayList( + GenericRowData.of((byte) 0x01, (short) -32768, 101), + GenericRowData.of((byte) 0x02, (short) 0, 102), + GenericRowData.of((byte) 0x03, (short) 32767, 103)); Record record = GenericRecord.create(iSchema); - List expected = Lists.newArrayList( - record.copy(ImmutableMap.of("tinyint", 1, "smallint", -32768, "int", 101)), - record.copy(ImmutableMap.of("tinyint", 2, "smallint", 0, "int", 102)), - record.copy(ImmutableMap.of("tinyint", 3, "smallint", 32767, "int", 103)) - ); - - try (OneInputStreamOperatorTestHarness testHarness = createIcebergStreamWriter(icebergTable, - flinkSchema)) { + List expected = + Lists.newArrayList( + record.copy(ImmutableMap.of("tinyint", 1, "smallint", -32768, "int", 101)), + record.copy(ImmutableMap.of("tinyint", 2, "smallint", 0, "int", 102)), + record.copy(ImmutableMap.of("tinyint", 3, "smallint", 32767, "int", 103))); + + try (OneInputStreamOperatorTestHarness testHarness = + createIcebergStreamWriter(icebergTable, flinkSchema)) { for (RowData row : rows) { testHarness.processElement(row, 1); } @@ -331,7 +343,8 @@ public void testPromotedFlinkDataType() throws Exception { SimpleDataUtil.assertTableRecords(location, expected); } - private OneInputStreamOperatorTestHarness 
createIcebergStreamWriter() throws Exception { + private OneInputStreamOperatorTestHarness createIcebergStreamWriter() + throws Exception { return createIcebergStreamWriter(table, SimpleDataUtil.FLINK_SCHEMA); } @@ -339,14 +352,13 @@ private OneInputStreamOperatorTestHarness createIcebergStr Table icebergTable, TableSchema flinkSchema) throws Exception { RowType flinkRowType = FlinkSink.toFlinkRowType(icebergTable.schema(), flinkSchema); FlinkWriteConf flinkWriteConfig = - new FlinkWriteConf(icebergTable, Maps.newHashMap(), new org.apache.flink.configuration.Configuration()); - - IcebergStreamWriter streamWriter = FlinkSink.createStreamWriter( - icebergTable, - flinkWriteConfig, - flinkRowType, null); - OneInputStreamOperatorTestHarness harness = new OneInputStreamOperatorTestHarness<>( - streamWriter, 1, 1, 0); + new FlinkWriteConf( + icebergTable, Maps.newHashMap(), new org.apache.flink.configuration.Configuration()); + + IcebergStreamWriter streamWriter = + FlinkSink.createStreamWriter(icebergTable, flinkWriteConfig, flinkRowType, null); + OneInputStreamOperatorTestHarness harness = + new OneInputStreamOperatorTestHarness<>(streamWriter, 1, 1, 0); harness.setup(); harness.open(); diff --git a/flink/v1.13/flink/src/test/java/org/apache/iceberg/flink/sink/TestRowDataPartitionKey.java b/flink/v1.13/flink/src/test/java/org/apache/iceberg/flink/sink/TestRowDataPartitionKey.java index 29a1f78a531e..b6c785cb144b 100644 --- a/flink/v1.13/flink/src/test/java/org/apache/iceberg/flink/sink/TestRowDataPartitionKey.java +++ b/flink/v1.13/flink/src/test/java/org/apache/iceberg/flink/sink/TestRowDataPartitionKey.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.flink.sink; import java.util.List; @@ -40,53 +39,54 @@ import org.junit.Test; public class TestRowDataPartitionKey { - private static final Schema SCHEMA = new Schema( - Types.NestedField.required(0, "boolType", Types.BooleanType.get()), - Types.NestedField.required(1, "id", Types.IntegerType.get()), - Types.NestedField.required(2, "longType", Types.LongType.get()), - Types.NestedField.required(3, "dateType", Types.DateType.get()), - Types.NestedField.required(4, "timeType", Types.TimeType.get()), - Types.NestedField.required(5, "stringType", Types.StringType.get()), - Types.NestedField.required(6, "timestampWithoutZone", Types.TimestampType.withoutZone()), - Types.NestedField.required(7, "timestampWithZone", Types.TimestampType.withZone()), - Types.NestedField.required(8, "fixedType", Types.FixedType.ofLength(5)), - Types.NestedField.required(9, "uuidType", Types.UUIDType.get()), - Types.NestedField.required(10, "binaryType", Types.BinaryType.get()), - Types.NestedField.required(11, "decimalType1", Types.DecimalType.of(18, 3)), - Types.NestedField.required(12, "decimalType2", Types.DecimalType.of(10, 5)), - Types.NestedField.required(13, "decimalType3", Types.DecimalType.of(38, 19)), - Types.NestedField.required(14, "floatType", Types.FloatType.get()), - Types.NestedField.required(15, "doubleType", Types.DoubleType.get()) - ); - - private static final List SUPPORTED_PRIMITIVES = SCHEMA.asStruct().fields().stream() - .map(Types.NestedField::name).collect(Collectors.toList()); - - private static final Schema NESTED_SCHEMA = new Schema( - Types.NestedField.required(1, "structType", Types.StructType.of( - Types.NestedField.optional(2, "innerStringType", Types.StringType.get()), - Types.NestedField.optional(3, "innerIntegerType", Types.IntegerType.get()) - )) - ); + 
private static final Schema SCHEMA = + new Schema( + Types.NestedField.required(0, "boolType", Types.BooleanType.get()), + Types.NestedField.required(1, "id", Types.IntegerType.get()), + Types.NestedField.required(2, "longType", Types.LongType.get()), + Types.NestedField.required(3, "dateType", Types.DateType.get()), + Types.NestedField.required(4, "timeType", Types.TimeType.get()), + Types.NestedField.required(5, "stringType", Types.StringType.get()), + Types.NestedField.required(6, "timestampWithoutZone", Types.TimestampType.withoutZone()), + Types.NestedField.required(7, "timestampWithZone", Types.TimestampType.withZone()), + Types.NestedField.required(8, "fixedType", Types.FixedType.ofLength(5)), + Types.NestedField.required(9, "uuidType", Types.UUIDType.get()), + Types.NestedField.required(10, "binaryType", Types.BinaryType.get()), + Types.NestedField.required(11, "decimalType1", Types.DecimalType.of(18, 3)), + Types.NestedField.required(12, "decimalType2", Types.DecimalType.of(10, 5)), + Types.NestedField.required(13, "decimalType3", Types.DecimalType.of(38, 19)), + Types.NestedField.required(14, "floatType", Types.FloatType.get()), + Types.NestedField.required(15, "doubleType", Types.DoubleType.get())); + + private static final List SUPPORTED_PRIMITIVES = + SCHEMA.asStruct().fields().stream().map(Types.NestedField::name).collect(Collectors.toList()); + + private static final Schema NESTED_SCHEMA = + new Schema( + Types.NestedField.required( + 1, + "structType", + Types.StructType.of( + Types.NestedField.optional(2, "innerStringType", Types.StringType.get()), + Types.NestedField.optional(3, "innerIntegerType", Types.IntegerType.get())))); @Test public void testNullPartitionValue() { - Schema schema = new Schema( - Types.NestedField.optional(1, "id", Types.IntegerType.get()), - Types.NestedField.optional(2, "data", Types.StringType.get()) - ); + Schema schema = + new Schema( + Types.NestedField.optional(1, "id", Types.IntegerType.get()), + Types.NestedField.optional(2, "data", Types.StringType.get())); - PartitionSpec spec = PartitionSpec.builderFor(schema) - .identity("data") - .build(); + PartitionSpec spec = PartitionSpec.builderFor(schema).identity("data").build(); - List rows = Lists.newArrayList( - GenericRowData.of(1, StringData.fromString("a")), - GenericRowData.of(2, StringData.fromString("b")), - GenericRowData.of(3, null) - ); + List rows = + Lists.newArrayList( + GenericRowData.of(1, StringData.fromString("a")), + GenericRowData.of(2, StringData.fromString("b")), + GenericRowData.of(3, null)); - RowDataWrapper rowWrapper = new RowDataWrapper(FlinkSchemaUtil.convert(schema), schema.asStruct()); + RowDataWrapper rowWrapper = + new RowDataWrapper(FlinkSchemaUtil.convert(schema), schema.asStruct()); for (RowData row : rows) { PartitionKey partitionKey = new PartitionKey(spec, schema); @@ -100,16 +100,15 @@ public void testNullPartitionValue() { @Test public void testPartitionWithOneNestedField() { - RowDataWrapper rowWrapper = new RowDataWrapper(FlinkSchemaUtil.convert(NESTED_SCHEMA), NESTED_SCHEMA.asStruct()); + RowDataWrapper rowWrapper = + new RowDataWrapper(FlinkSchemaUtil.convert(NESTED_SCHEMA), NESTED_SCHEMA.asStruct()); List records = RandomGenericData.generate(NESTED_SCHEMA, 10, 1991); List rows = Lists.newArrayList(RandomRowData.convert(NESTED_SCHEMA, records)); - PartitionSpec spec1 = PartitionSpec.builderFor(NESTED_SCHEMA) - .identity("structType.innerStringType") - .build(); - PartitionSpec spec2 = PartitionSpec.builderFor(NESTED_SCHEMA) - 
.identity("structType.innerIntegerType") - .build(); + PartitionSpec spec1 = + PartitionSpec.builderFor(NESTED_SCHEMA).identity("structType.innerStringType").build(); + PartitionSpec spec2 = + PartitionSpec.builderFor(NESTED_SCHEMA).identity("structType.innerIntegerType").build(); for (int i = 0; i < rows.size(); i++) { RowData row = rows.get(i); @@ -131,18 +130,21 @@ public void testPartitionWithOneNestedField() { @Test public void testPartitionMultipleNestedField() { - RowDataWrapper rowWrapper = new RowDataWrapper(FlinkSchemaUtil.convert(NESTED_SCHEMA), NESTED_SCHEMA.asStruct()); + RowDataWrapper rowWrapper = + new RowDataWrapper(FlinkSchemaUtil.convert(NESTED_SCHEMA), NESTED_SCHEMA.asStruct()); List records = RandomGenericData.generate(NESTED_SCHEMA, 10, 1992); List rows = Lists.newArrayList(RandomRowData.convert(NESTED_SCHEMA, records)); - PartitionSpec spec1 = PartitionSpec.builderFor(NESTED_SCHEMA) - .identity("structType.innerIntegerType") - .identity("structType.innerStringType") - .build(); - PartitionSpec spec2 = PartitionSpec.builderFor(NESTED_SCHEMA) - .identity("structType.innerStringType") - .identity("structType.innerIntegerType") - .build(); + PartitionSpec spec1 = + PartitionSpec.builderFor(NESTED_SCHEMA) + .identity("structType.innerIntegerType") + .identity("structType.innerStringType") + .build(); + PartitionSpec spec2 = + PartitionSpec.builderFor(NESTED_SCHEMA) + .identity("structType.innerStringType") + .identity("structType.innerIntegerType") + .build(); PartitionKey pk1 = new PartitionKey(spec1, NESTED_SCHEMA); PartitionKey pk2 = new PartitionKey(spec2, NESTED_SCHEMA); @@ -188,14 +190,19 @@ public void testPartitionValueTypes() { pk.partition(rowWrapper.wrap(row)); expectedPK.partition(recordWrapper.wrap(record)); - Assert.assertEquals("Partition with column " + column + " should have one field.", 1, pk.size()); + Assert.assertEquals( + "Partition with column " + column + " should have one field.", 1, pk.size()); if (column.equals("timeType")) { - Assert.assertEquals("Partition with column " + column + " should have the expected values", - expectedPK.get(0, Long.class) / 1000, pk.get(0, Long.class) / 1000); + Assert.assertEquals( + "Partition with column " + column + " should have the expected values", + expectedPK.get(0, Long.class) / 1000, + pk.get(0, Long.class) / 1000); } else { - Assert.assertEquals("Partition with column " + column + " should have the expected values", - expectedPK.get(0, javaClasses[0]), pk.get(0, javaClasses[0])); + Assert.assertEquals( + "Partition with column " + column + " should have the expected values", + expectedPK.get(0, javaClasses[0]), + pk.get(0, javaClasses[0])); } } } @@ -225,15 +232,19 @@ public void testNestedPartitionValues() { pk.partition(rowWrapper.wrap(rows.get(j))); expectedPK.partition(recordWrapper.wrap(records.get(j))); - Assert.assertEquals("Partition with nested column " + column + " should have one field.", - 1, pk.size()); + Assert.assertEquals( + "Partition with nested column " + column + " should have one field.", 1, pk.size()); if (column.equals("nested.timeType")) { - Assert.assertEquals("Partition with nested column " + column + " should have the expected values.", - expectedPK.get(0, Long.class) / 1000, pk.get(0, Long.class) / 1000); + Assert.assertEquals( + "Partition with nested column " + column + " should have the expected values.", + expectedPK.get(0, Long.class) / 1000, + pk.get(0, Long.class) / 1000); } else { - Assert.assertEquals("Partition with nested column " + column + " should have the expected 
values.", - expectedPK.get(0, javaClasses[0]), pk.get(0, javaClasses[0])); + Assert.assertEquals( + "Partition with nested column " + column + " should have the expected values.", + expectedPK.get(0, javaClasses[0]), + pk.get(0, javaClasses[0])); } } } diff --git a/flink/v1.13/flink/src/test/java/org/apache/iceberg/flink/sink/TestTaskWriters.java b/flink/v1.13/flink/src/test/java/org/apache/iceberg/flink/sink/TestTaskWriters.java index 2595b098dfea..a47a80ae367e 100644 --- a/flink/v1.13/flink/src/test/java/org/apache/iceberg/flink/sink/TestTaskWriters.java +++ b/flink/v1.13/flink/src/test/java/org/apache/iceberg/flink/sink/TestTaskWriters.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.flink.sink; import java.io.File; @@ -54,18 +53,17 @@ public class TestTaskWriters { private static final Configuration CONF = new Configuration(); private static final long TARGET_FILE_SIZE = 128 * 1024 * 1024; - @Rule - public final TemporaryFolder tempFolder = new TemporaryFolder(); + @Rule public final TemporaryFolder tempFolder = new TemporaryFolder(); @Parameterized.Parameters(name = "format = {0}, partitioned = {1}") public static Object[][] parameters() { return new Object[][] { - {"avro", true}, - {"avro", false}, - {"orc", true}, - {"orc", false}, - {"parquet", true}, - {"parquet", false} + {"avro", true}, + {"avro", false}, + {"orc", true}, + {"orc", false}, + {"parquet", true}, + {"parquet", false} }; } @@ -172,12 +170,13 @@ public void testCompleteFiles() throws IOException { appendFiles.commit(); // Assert the data rows. - SimpleDataUtil.assertTableRecords(path, Lists.newArrayList( - SimpleDataUtil.createRecord(1, "a"), - SimpleDataUtil.createRecord(2, "b"), - SimpleDataUtil.createRecord(3, "c"), - SimpleDataUtil.createRecord(4, "d") - )); + SimpleDataUtil.assertTableRecords( + path, + Lists.newArrayList( + SimpleDataUtil.createRecord(1, "a"), + SimpleDataUtil.createRecord(2, "b"), + SimpleDataUtil.createRecord(3, "c"), + SimpleDataUtil.createRecord(4, "d"))); } } @@ -233,9 +232,14 @@ public void testRandomData() throws IOException { } private TaskWriter createTaskWriter(long targetFileSize) { - TaskWriterFactory taskWriterFactory = new RowDataTaskWriterFactory( - SerializableTable.copyOf(table), (RowType) SimpleDataUtil.FLINK_SCHEMA.toRowDataType().getLogicalType(), - targetFileSize, format, null, false); + TaskWriterFactory taskWriterFactory = + new RowDataTaskWriterFactory( + SerializableTable.copyOf(table), + (RowType) SimpleDataUtil.FLINK_SCHEMA.toRowDataType().getLogicalType(), + targetFileSize, + format, + null, + false); taskWriterFactory.initialize(1, 1); return taskWriterFactory.create(); } diff --git a/flink/v1.13/flink/src/test/java/org/apache/iceberg/flink/source/BoundedTableFactory.java b/flink/v1.13/flink/src/test/java/org/apache/iceberg/flink/source/BoundedTableFactory.java index b0041c3bc04d..b0be3daf7b49 100644 --- a/flink/v1.13/flink/src/test/java/org/apache/iceberg/flink/source/BoundedTableFactory.java +++ b/flink/v1.13/flink/src/test/java/org/apache/iceberg/flink/source/BoundedTableFactory.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.flink.source; import java.util.List; @@ -53,7 +52,8 @@ public class BoundedTableFactory implements DynamicTableSourceFactory { private static final AtomicInteger DATA_SET_ID = new AtomicInteger(0); private static final Map>> DATA_SETS = Maps.newHashMap(); - private static final ConfigOption DATA_ID = ConfigOptions.key("data-id").stringType().noDefaultValue(); + private static final ConfigOption DATA_ID = + ConfigOptions.key("data-id").stringType().noDefaultValue(); public static String registerDataSet(List> dataSet) { String dataSetId = String.valueOf(DATA_SET_ID.incrementAndGet()); @@ -67,12 +67,13 @@ public static void clearDataSets() { @Override public DynamicTableSource createDynamicTableSource(Context context) { - TableSchema tableSchema = TableSchemaUtils.getPhysicalSchema(context.getCatalogTable().getSchema()); + TableSchema tableSchema = + TableSchemaUtils.getPhysicalSchema(context.getCatalogTable().getSchema()); Configuration configuration = Configuration.fromMap(context.getCatalogTable().getOptions()); String dataId = configuration.getString(DATA_ID); - Preconditions.checkArgument(DATA_SETS.containsKey(dataId), - "data-id %s does not found in registered data set.", dataId); + Preconditions.checkArgument( + DATA_SETS.containsKey(dataId), "data-id %s does not found in registered data set.", dataId); return new BoundedTableSource(DATA_SETS.get(dataId), tableSchema); } @@ -112,8 +113,7 @@ public ChangelogMode getChangelogMode() { Supplier> supplier = () -> elementsPerCheckpoint.stream().flatMap(List::stream); // Add the INSERT row kind by default. - ChangelogMode.Builder builder = ChangelogMode.newBuilder() - .addContainedKind(RowKind.INSERT); + ChangelogMode.Builder builder = ChangelogMode.newBuilder().addContainedKind(RowKind.INSERT); if (supplier.get().anyMatch(r -> r.getKind() == RowKind.DELETE)) { builder.addContainedKind(RowKind.DELETE); @@ -136,12 +136,13 @@ public ScanRuntimeProvider getScanRuntimeProvider(ScanContext runtimeProviderCon @Override public DataStream produceDataStream(StreamExecutionEnvironment env) { boolean checkpointEnabled = env.getCheckpointConfig().isCheckpointingEnabled(); - SourceFunction source = new BoundedTestSource<>(elementsPerCheckpoint, checkpointEnabled); + SourceFunction source = + new BoundedTestSource<>(elementsPerCheckpoint, checkpointEnabled); RowType rowType = (RowType) tableSchema.toRowDataType().getLogicalType(); // Converter to convert the Row to RowData. - DataFormatConverters.RowConverter rowConverter = new DataFormatConverters - .RowConverter(tableSchema.getFieldDataTypes()); + DataFormatConverters.RowConverter rowConverter = + new DataFormatConverters.RowConverter(tableSchema.getFieldDataTypes()); return env.addSource(source, new RowTypeInfo(tableSchema.getFieldTypes())) .map(rowConverter::toInternal, FlinkCompatibilityUtil.toTypeInfo(rowType)); diff --git a/flink/v1.13/flink/src/test/java/org/apache/iceberg/flink/source/BoundedTestSource.java b/flink/v1.13/flink/src/test/java/org/apache/iceberg/flink/source/BoundedTestSource.java index 54e44ee5b008..7b435d059845 100644 --- a/flink/v1.13/flink/src/test/java/org/apache/iceberg/flink/source/BoundedTestSource.java +++ b/flink/v1.13/flink/src/test/java/org/apache/iceberg/flink/source/BoundedTestSource.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.flink.source; import java.util.Arrays; @@ -28,12 +27,10 @@ import org.apache.iceberg.relocated.com.google.common.base.Preconditions; /** - * A stream source that: - * 1) emits the elements from elementsPerCheckpoint.get(0) without allowing checkpoints. - * 2) then waits for the checkpoint to complete. - * 3) emits the elements from elementsPerCheckpoint.get(1) without allowing checkpoints. - * 4) then waits for the checkpoint to complete. - * 5) ... + * A stream source that: 1) emits the elements from elementsPerCheckpoint.get(0) without allowing + * checkpoints. 2) then waits for the checkpoint to complete. 3) emits the elements from + * elementsPerCheckpoint.get(1) without allowing checkpoints. 4) then waits for the checkpoint to + * complete. 5) ... * *
<p>
Util all the list from elementsPerCheckpoint are exhausted. */ @@ -45,9 +42,7 @@ public final class BoundedTestSource implements SourceFunction, Checkpoint private final AtomicInteger numCheckpointsComplete = new AtomicInteger(0); - /** - * Emits all those elements in several checkpoints. - */ + /** Emits all those elements in several checkpoints. */ public BoundedTestSource(List> elementsPerCheckpoint, boolean checkpointEnabled) { this.elementsPerCheckpoint = elementsPerCheckpoint; this.checkpointEnabled = checkpointEnabled; @@ -57,9 +52,7 @@ public BoundedTestSource(List> elementsPerCheckpoint) { this(elementsPerCheckpoint, true); } - /** - * Emits all those elements in a single checkpoint. - */ + /** Emits all those elements in a single checkpoint. */ public BoundedTestSource(T... elements) { this(Collections.singletonList(Arrays.asList(elements))); } @@ -67,8 +60,9 @@ public BoundedTestSource(T... elements) { @Override public void run(SourceContext ctx) throws Exception { if (!checkpointEnabled) { - Preconditions.checkArgument(elementsPerCheckpoint.size() <= 1, - "There should be at most one list in the elementsPerCheckpoint when checkpoint is disabled."); + Preconditions.checkArgument( + elementsPerCheckpoint.size() <= 1, + "There should be at most one list in the elementsPerCheckpoint when checkpoint is disabled."); elementsPerCheckpoint.stream().flatMap(List::stream).forEach(ctx::collect); return; } @@ -77,11 +71,16 @@ public void run(SourceContext ctx) throws Exception { final int checkpointToAwait; synchronized (ctx.getCheckpointLock()) { - // Let's say checkpointToAwait = numCheckpointsComplete.get() + delta, in fact the value of delta should not - // affect the final table records because we only need to make sure that there will be exactly - // elementsPerCheckpoint.size() checkpoints to emit each records buffer from the original elementsPerCheckpoint. - // Even if the checkpoints that emitted results are not continuous, the correctness of the data should not be - // affected in the end. Setting the delta to be 2 is introducing the variable that produce un-continuous + // Let's say checkpointToAwait = numCheckpointsComplete.get() + delta, in fact the value of + // delta should not + // affect the final table records because we only need to make sure that there will be + // exactly + // elementsPerCheckpoint.size() checkpoints to emit each records buffer from the original + // elementsPerCheckpoint. + // Even if the checkpoints that emitted results are not continuous, the correctness of the + // data should not be + // affected in the end. Setting the delta to be 2 is introducing the variable that produce + // un-continuous // checkpoints that emit the records buffer from elementsPerCheckpoints. checkpointToAwait = numCheckpointsComplete.get() + 2; for (T element : elements) { diff --git a/flink/v1.13/flink/src/test/java/org/apache/iceberg/flink/source/ChangeLogTableTestBase.java b/flink/v1.13/flink/src/test/java/org/apache/iceberg/flink/source/ChangeLogTableTestBase.java index a445e7eb06ce..4f210abff729 100644 --- a/flink/v1.13/flink/src/test/java/org/apache/iceberg/flink/source/ChangeLogTableTestBase.java +++ b/flink/v1.13/flink/src/test/java/org/apache/iceberg/flink/source/ChangeLogTableTestBase.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.flink.source; import java.util.List; @@ -36,8 +35,7 @@ public class ChangeLogTableTestBase extends FlinkTestBase { private volatile TableEnvironment tEnv = null; - @Rule - public TestName name = new TestName(); + @Rule public TestName name = new TestName(); @After public void clean() { @@ -50,17 +48,15 @@ protected TableEnvironment getTableEnv() { if (tEnv == null) { synchronized (this) { if (tEnv == null) { - EnvironmentSettings settings = EnvironmentSettings - .newInstance() - .useBlinkPlanner() - .inStreamingMode() - .build(); + EnvironmentSettings settings = + EnvironmentSettings.newInstance().useBlinkPlanner().inStreamingMode().build(); - StreamExecutionEnvironment env = StreamExecutionEnvironment - .getExecutionEnvironment(MiniClusterResource.DISABLE_CLASSLOADER_CHECK_CONFIG) - .enableCheckpointing(400) - .setMaxParallelism(1) - .setParallelism(1); + StreamExecutionEnvironment env = + StreamExecutionEnvironment.getExecutionEnvironment( + MiniClusterResource.DISABLE_CLASSLOADER_CHECK_CONFIG) + .enableCheckpointing(400) + .setMaxParallelism(1) + .setParallelism(1); tEnv = StreamTableEnvironment.create(env, settings); } @@ -86,8 +82,6 @@ protected static Row updateAfterRow(Object... values) { } protected static List listJoin(List> lists) { - return lists.stream() - .flatMap(List::stream) - .collect(Collectors.toList()); + return lists.stream().flatMap(List::stream).collect(Collectors.toList()); } } diff --git a/flink/v1.13/flink/src/test/java/org/apache/iceberg/flink/source/TestBoundedTableFactory.java b/flink/v1.13/flink/src/test/java/org/apache/iceberg/flink/source/TestBoundedTableFactory.java index d163b84c09c6..7b5f9328694c 100644 --- a/flink/v1.13/flink/src/test/java/org/apache/iceberg/flink/source/TestBoundedTableFactory.java +++ b/flink/v1.13/flink/src/test/java/org/apache/iceberg/flink/source/TestBoundedTableFactory.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.flink.source; import java.util.List; @@ -36,46 +35,53 @@ public void testEmptyDataSet() { List> emptyDataSet = ImmutableList.of(); String dataId = BoundedTableFactory.registerDataSet(emptyDataSet); - sql("CREATE TABLE %s(id INT, data STRING) WITH ('connector'='BoundedSource', 'data-id'='%s')", table, dataId); + sql( + "CREATE TABLE %s(id INT, data STRING) WITH ('connector'='BoundedSource', 'data-id'='%s')", + table, dataId); - Assert.assertEquals("Should have caught empty change log set.", ImmutableList.of(), + Assert.assertEquals( + "Should have caught empty change log set.", + ImmutableList.of(), sql("SELECT * FROM %s", table)); } @Test public void testBoundedTableFactory() { String table = name.getMethodName(); - List> dataSet = ImmutableList.of( - ImmutableList.of( - insertRow(1, "aaa"), - deleteRow(1, "aaa"), - insertRow(1, "bbb"), - insertRow(2, "aaa"), - deleteRow(2, "aaa"), - insertRow(2, "bbb") - ), - ImmutableList.of( - updateBeforeRow(2, "bbb"), - updateAfterRow(2, "ccc"), - deleteRow(2, "ccc"), - insertRow(2, "ddd") - ), + List> dataSet = ImmutableList.of( - deleteRow(1, "bbb"), - insertRow(1, "ccc"), - deleteRow(1, "ccc"), - insertRow(1, "ddd") - ) - ); + ImmutableList.of( + insertRow(1, "aaa"), + deleteRow(1, "aaa"), + insertRow(1, "bbb"), + insertRow(2, "aaa"), + deleteRow(2, "aaa"), + insertRow(2, "bbb")), + ImmutableList.of( + updateBeforeRow(2, "bbb"), + updateAfterRow(2, "ccc"), + deleteRow(2, "ccc"), + insertRow(2, "ddd")), + ImmutableList.of( + deleteRow(1, "bbb"), + insertRow(1, "ccc"), + deleteRow(1, "ccc"), + insertRow(1, "ddd"))); String dataId = BoundedTableFactory.registerDataSet(dataSet); - sql("CREATE TABLE %s(id INT, data STRING) WITH ('connector'='BoundedSource', 'data-id'='%s')", table, dataId); + sql( + "CREATE TABLE %s(id INT, data STRING) WITH ('connector'='BoundedSource', 'data-id'='%s')", + table, dataId); List rowSet = dataSet.stream().flatMap(Streams::stream).collect(Collectors.toList()); - Assert.assertEquals("Should have the expected change log events.", rowSet, sql("SELECT * FROM %s", table)); + Assert.assertEquals( + "Should have the expected change log events.", rowSet, sql("SELECT * FROM %s", table)); - Assert.assertEquals("Should have the expected change log events", - rowSet.stream().filter(r -> Objects.equals(r.getField(1), "aaa")).collect(Collectors.toList()), + Assert.assertEquals( + "Should have the expected change log events", + rowSet.stream() + .filter(r -> Objects.equals(r.getField(1), "aaa")) + .collect(Collectors.toList()), sql("SELECT * FROM %s WHERE data='aaa'", table)); } } diff --git a/flink/v1.13/flink/src/test/java/org/apache/iceberg/flink/source/TestFlinkInputFormat.java b/flink/v1.13/flink/src/test/java/org/apache/iceberg/flink/source/TestFlinkInputFormat.java index eae3233a6546..69b8ac269267 100644 --- a/flink/v1.13/flink/src/test/java/org/apache/iceberg/flink/source/TestFlinkInputFormat.java +++ b/flink/v1.13/flink/src/test/java/org/apache/iceberg/flink/source/TestFlinkInputFormat.java @@ -16,9 +16,10 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.flink.source; +import static org.apache.iceberg.types.Types.NestedField.required; + import java.io.IOException; import java.util.List; import java.util.Map; @@ -38,11 +39,7 @@ import org.apache.iceberg.types.Types; import org.junit.Test; -import static org.apache.iceberg.types.Types.NestedField.required; - -/** - * Test {@link FlinkInputFormat}. 
- */ +/** Test {@link FlinkInputFormat}. */ public class TestFlinkInputFormat extends TestFlinkSource { public TestFlinkInputFormat(String fileFormat) { @@ -56,20 +53,27 @@ public void before() throws IOException { @Override protected List run( - FlinkSource.Builder formatBuilder, Map sqlOptions, String sqlFilter, String... sqlSelectedFields) + FlinkSource.Builder formatBuilder, + Map sqlOptions, + String sqlFilter, + String... sqlSelectedFields) throws Exception { return runFormat(formatBuilder.tableLoader(tableLoader()).buildFormat()); } @Test public void testNestedProjection() throws Exception { - Schema schema = new Schema( - required(1, "data", Types.StringType.get()), - required(2, "nested", Types.StructType.of( - Types.NestedField.required(3, "f1", Types.StringType.get()), - Types.NestedField.required(4, "f2", Types.StringType.get()), - Types.NestedField.required(5, "f3", Types.LongType.get()))), - required(6, "id", Types.LongType.get())); + Schema schema = + new Schema( + required(1, "data", Types.StringType.get()), + required( + 2, + "nested", + Types.StructType.of( + Types.NestedField.required(3, "f1", Types.StringType.get()), + Types.NestedField.required(4, "f2", Types.StringType.get()), + Types.NestedField.required(5, "f3", Types.LongType.get()))), + required(6, "id", Types.LongType.get())); Table table = catalog.createTable(TableIdentifier.of("default", "t"), schema); @@ -81,13 +85,17 @@ public void testNestedProjection() throws Exception { // The Flink SQL output: [f2, data] // The FlinkInputFormat output: [nested[f2], data] - TableSchema projectedSchema = TableSchema.builder() - .field("nested", DataTypes.ROW(DataTypes.FIELD("f2", DataTypes.STRING()))) - .field("data", DataTypes.STRING()).build(); - List result = runFormat(FlinkSource.forRowData() - .tableLoader(tableLoader()) - .project(projectedSchema) - .buildFormat()); + TableSchema projectedSchema = + TableSchema.builder() + .field("nested", DataTypes.ROW(DataTypes.FIELD("f2", DataTypes.STRING()))) + .field("data", DataTypes.STRING()) + .build(); + List result = + runFormat( + FlinkSource.forRowData() + .tableLoader(tableLoader()) + .project(projectedSchema) + .buildFormat()); List expected = Lists.newArrayList(); for (Record record : writeRecords) { @@ -100,23 +108,28 @@ public void testNestedProjection() throws Exception { @Test public void testBasicProjection() throws IOException { - Schema writeSchema = new Schema( - Types.NestedField.required(0, "id", Types.LongType.get()), - Types.NestedField.optional(1, "data", Types.StringType.get()), - Types.NestedField.optional(2, "time", Types.TimestampType.withZone()) - ); + Schema writeSchema = + new Schema( + Types.NestedField.required(0, "id", Types.LongType.get()), + Types.NestedField.optional(1, "data", Types.StringType.get()), + Types.NestedField.optional(2, "time", Types.TimestampType.withZone())); Table table = catalog.createTable(TableIdentifier.of("default", "t"), writeSchema); List writeRecords = RandomGenericData.generate(writeSchema, 2, 0L); new GenericAppenderHelper(table, fileFormat, TEMPORARY_FOLDER).appendToTable(writeRecords); - TableSchema projectedSchema = TableSchema.builder() - .field("id", DataTypes.BIGINT()) - .field("data", DataTypes.STRING()) - .build(); - List result = runFormat(FlinkSource.forRowData() - .tableLoader(tableLoader()).project(projectedSchema).buildFormat()); + TableSchema projectedSchema = + TableSchema.builder() + .field("id", DataTypes.BIGINT()) + .field("data", DataTypes.STRING()) + .build(); + List result = + runFormat( + 
FlinkSource.forRowData() + .tableLoader(tableLoader()) + .project(projectedSchema) + .buildFormat()); List expected = Lists.newArrayList(); for (Record record : writeRecords) { diff --git a/flink/v1.13/flink/src/test/java/org/apache/iceberg/flink/source/TestFlinkInputFormatReaderDeletes.java b/flink/v1.13/flink/src/test/java/org/apache/iceberg/flink/source/TestFlinkInputFormatReaderDeletes.java index 2a593c4702b4..b2f914e51299 100644 --- a/flink/v1.13/flink/src/test/java/org/apache/iceberg/flink/source/TestFlinkInputFormatReaderDeletes.java +++ b/flink/v1.13/flink/src/test/java/org/apache/iceberg/flink/source/TestFlinkInputFormatReaderDeletes.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.flink.source; import java.io.IOException; @@ -43,26 +42,35 @@ public TestFlinkInputFormatReaderDeletes(FileFormat inputFormat) { } @Override - protected StructLikeSet rowSet(String tableName, Table testTable, String... columns) throws IOException { + protected StructLikeSet rowSet(String tableName, Table testTable, String... columns) + throws IOException { Schema projected = testTable.schema().select(columns); RowType rowType = FlinkSchemaUtil.convert(projected); Map properties = Maps.newHashMap(); - properties.put(CatalogProperties.WAREHOUSE_LOCATION, hiveConf.get(HiveConf.ConfVars.METASTOREWAREHOUSE.varname)); + properties.put( + CatalogProperties.WAREHOUSE_LOCATION, + hiveConf.get(HiveConf.ConfVars.METASTOREWAREHOUSE.varname)); properties.put(CatalogProperties.URI, hiveConf.get(HiveConf.ConfVars.METASTOREURIS.varname)); - properties.put(CatalogProperties.CLIENT_POOL_SIZE, + properties.put( + CatalogProperties.CLIENT_POOL_SIZE, Integer.toString(hiveConf.getInt("iceberg.hive.client-pool-size", 5))); CatalogLoader hiveCatalogLoader = CatalogLoader.hive(catalog.name(), hiveConf, properties); - FlinkInputFormat inputFormat = FlinkSource.forRowData() - .tableLoader(TableLoader.fromCatalog(hiveCatalogLoader, TableIdentifier.of("default", tableName))) - .project(FlinkSchemaUtil.toSchema(rowType)).buildFormat(); + FlinkInputFormat inputFormat = + FlinkSource.forRowData() + .tableLoader( + TableLoader.fromCatalog( + hiveCatalogLoader, TableIdentifier.of("default", tableName))) + .project(FlinkSchemaUtil.toSchema(rowType)) + .buildFormat(); StructLikeSet set = StructLikeSet.create(projected.asStruct()); - TestHelpers.readRowData(inputFormat, rowType).forEach(rowData -> { - RowDataWrapper wrapper = new RowDataWrapper(rowType, projected.asStruct()); - set.add(wrapper.wrap(rowData)); - }); + TestHelpers.readRowData(inputFormat, rowType) + .forEach( + rowData -> { + RowDataWrapper wrapper = new RowDataWrapper(rowType, projected.asStruct()); + set.add(wrapper.wrap(rowData)); + }); return set; } - } diff --git a/flink/v1.13/flink/src/test/java/org/apache/iceberg/flink/source/TestFlinkMergingMetrics.java b/flink/v1.13/flink/src/test/java/org/apache/iceberg/flink/source/TestFlinkMergingMetrics.java index 1670ed733421..3a7ec96cb1d6 100644 --- a/flink/v1.13/flink/src/test/java/org/apache/iceberg/flink/source/TestFlinkMergingMetrics.java +++ b/flink/v1.13/flink/src/test/java/org/apache/iceberg/flink/source/TestFlinkMergingMetrics.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.flink.source; import java.io.IOException; @@ -44,7 +43,8 @@ protected FileAppender writeAndGetAppender(List records) throws RowType flinkSchema = FlinkSchemaUtil.convert(SCHEMA); FileAppender appender = - new FlinkAppenderFactory(SCHEMA, flinkSchema, ImmutableMap.of(), PartitionSpec.unpartitioned()) + new FlinkAppenderFactory( + SCHEMA, flinkSchema, ImmutableMap.of(), PartitionSpec.unpartitioned()) .newAppender(org.apache.iceberg.Files.localOutput(temp.newFile()), fileFormat); try (FileAppender fileAppender = appender) { records.stream().map(r -> RowDataConverter.convert(SCHEMA, r)).forEach(fileAppender::add); diff --git a/flink/v1.13/flink/src/test/java/org/apache/iceberg/flink/source/TestFlinkReaderDeletesBase.java b/flink/v1.13/flink/src/test/java/org/apache/iceberg/flink/source/TestFlinkReaderDeletesBase.java index cc3c71716ef7..987d79fed3c3 100644 --- a/flink/v1.13/flink/src/test/java/org/apache/iceberg/flink/source/TestFlinkReaderDeletesBase.java +++ b/flink/v1.13/flink/src/test/java/org/apache/iceberg/flink/source/TestFlinkReaderDeletesBase.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.flink.source; import java.util.Map; @@ -46,8 +45,7 @@ @RunWith(Parameterized.class) public abstract class TestFlinkReaderDeletesBase extends DeleteReadTests { - @ClassRule - public static final TemporaryFolder TEMP_FOLDER = new TemporaryFolder(); + @ClassRule public static final TemporaryFolder TEMP_FOLDER = new TemporaryFolder(); protected static String databaseName = "default"; @@ -60,9 +58,9 @@ public abstract class TestFlinkReaderDeletesBase extends DeleteReadTests { @Parameterized.Parameters(name = "fileFormat={0}") public static Object[][] parameters() { return new Object[][] { - new Object[] { FileFormat.PARQUET }, - new Object[] { FileFormat.AVRO }, - new Object[] { FileFormat.ORC } + new Object[] {FileFormat.PARQUET}, + new Object[] {FileFormat.AVRO}, + new Object[] {FileFormat.ORC} }; } @@ -75,8 +73,10 @@ public static void startMetastore() { metastore = new TestHiveMetastore(); metastore.start(); hiveConf = metastore.hiveConf(); - catalog = (HiveCatalog) - CatalogUtil.loadCatalog(HiveCatalog.class.getName(), "hive", ImmutableMap.of(), hiveConf); + catalog = + (HiveCatalog) + CatalogUtil.loadCatalog( + HiveCatalog.class.getName(), "hive", ImmutableMap.of(), hiveConf); } @AfterClass diff --git a/flink/v1.13/flink/src/test/java/org/apache/iceberg/flink/source/TestFlinkScan.java b/flink/v1.13/flink/src/test/java/org/apache/iceberg/flink/source/TestFlinkScan.java index 357f5ab14f7a..4b05899b9ec0 100644 --- a/flink/v1.13/flink/src/test/java/org/apache/iceberg/flink/source/TestFlinkScan.java +++ b/flink/v1.13/flink/src/test/java/org/apache/iceberg/flink/source/TestFlinkScan.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.flink.source; import java.io.File; @@ -69,8 +68,7 @@ public abstract class TestFlinkScan { public static final MiniClusterWithClientResource MINI_CLUSTER_RESOURCE = MiniClusterResource.createWithClassloaderCheckDisabled(); - @ClassRule - public static final TemporaryFolder TEMPORARY_FOLDER = new TemporaryFolder(); + @ClassRule public static final TemporaryFolder TEMPORARY_FOLDER = new TemporaryFolder(); protected HadoopCatalog catalog; protected String warehouse; @@ -100,16 +98,18 @@ public void before() throws IOException { } @After - public void after() throws IOException { - } + public void after() throws IOException {} protected TableLoader tableLoader() { return TableLoader.fromHadoopTable(location); } protected abstract List runWithProjection(String... projected) throws Exception; + protected abstract List runWithFilter(Expression filter, String sqlFilter) throws Exception; + protected abstract List runWithOptions(Map options) throws Exception; + protected abstract List run() throws Exception; @Test @@ -122,31 +122,33 @@ public void testUnpartitionedTable() throws Exception { @Test public void testPartitionedTable() throws Exception { - Table table = catalog.createTable(TestFixtures.TABLE_IDENTIFIER, TestFixtures.SCHEMA, TestFixtures.SPEC); + Table table = + catalog.createTable(TestFixtures.TABLE_IDENTIFIER, TestFixtures.SCHEMA, TestFixtures.SPEC); List expectedRecords = RandomGenericData.generate(TestFixtures.SCHEMA, 1, 0L); expectedRecords.get(0).set(2, "2020-03-20"); - new GenericAppenderHelper(table, fileFormat, TEMPORARY_FOLDER).appendToTable( - org.apache.iceberg.TestHelpers.Row.of("2020-03-20", 0), expectedRecords); + new GenericAppenderHelper(table, fileFormat, TEMPORARY_FOLDER) + .appendToTable(org.apache.iceberg.TestHelpers.Row.of("2020-03-20", 0), expectedRecords); TestHelpers.assertRecords(run(), expectedRecords, TestFixtures.SCHEMA); } @Test public void testProjection() throws Exception { - Table table = catalog.createTable(TestFixtures.TABLE_IDENTIFIER, TestFixtures.SCHEMA, TestFixtures.SPEC); + Table table = + catalog.createTable(TestFixtures.TABLE_IDENTIFIER, TestFixtures.SCHEMA, TestFixtures.SPEC); List inputRecords = RandomGenericData.generate(TestFixtures.SCHEMA, 1, 0L); - new GenericAppenderHelper(table, fileFormat, TEMPORARY_FOLDER).appendToTable( - org.apache.iceberg.TestHelpers.Row.of("2020-03-20", 0), inputRecords); + new GenericAppenderHelper(table, fileFormat, TEMPORARY_FOLDER) + .appendToTable(org.apache.iceberg.TestHelpers.Row.of("2020-03-20", 0), inputRecords); assertRows(runWithProjection("data"), Row.of(inputRecords.get(0).get(0))); } @Test public void testIdentityPartitionProjections() throws Exception { - Schema logSchema = new Schema( - Types.NestedField.optional(1, "id", Types.IntegerType.get()), - Types.NestedField.optional(2, "dt", Types.StringType.get()), - Types.NestedField.optional(3, "level", Types.StringType.get()), - Types.NestedField.optional(4, "message", Types.StringType.get()) - ); + Schema logSchema = + new Schema( + Types.NestedField.optional(1, "id", Types.IntegerType.get()), + Types.NestedField.optional(2, "dt", Types.StringType.get()), + Types.NestedField.optional(3, "level", Types.StringType.get()), + Types.NestedField.optional(4, "message", Types.StringType.get())); PartitionSpec spec = PartitionSpec.builderFor(logSchema).identity("dt").identity("level").build(); @@ -158,8 +160,11 @@ public void testIdentityPartitionProjections() throws Exception { for (Record record : inputRecords) { record.set(1, 
"2020-03-2" + idx); record.set(2, Integer.toString(idx)); - append.appendFile(new GenericAppenderHelper(table, fileFormat, TEMPORARY_FOLDER).writeFile( - org.apache.iceberg.TestHelpers.Row.of("2020-03-2" + idx, Integer.toString(idx)), ImmutableList.of(record))); + append.appendFile( + new GenericAppenderHelper(table, fileFormat, TEMPORARY_FOLDER) + .writeFile( + org.apache.iceberg.TestHelpers.Row.of("2020-03-2" + idx, Integer.toString(idx)), + ImmutableList.of(record))); idx += 1; } append.commit(); @@ -178,12 +183,18 @@ public void testIdentityPartitionProjections() throws Exception { validateIdentityPartitionProjections(table, Arrays.asList("message", "level"), inputRecords); validateIdentityPartitionProjections(table, Arrays.asList("level", "dt"), inputRecords); // out-of-order triplets - validateIdentityPartitionProjections(table, Arrays.asList("dt", "level", "message"), inputRecords); - validateIdentityPartitionProjections(table, Arrays.asList("level", "dt", "message"), inputRecords); - validateIdentityPartitionProjections(table, Arrays.asList("dt", "message", "level"), inputRecords); - validateIdentityPartitionProjections(table, Arrays.asList("level", "message", "dt"), inputRecords); - validateIdentityPartitionProjections(table, Arrays.asList("message", "dt", "level"), inputRecords); - validateIdentityPartitionProjections(table, Arrays.asList("message", "level", "dt"), inputRecords); + validateIdentityPartitionProjections( + table, Arrays.asList("dt", "level", "message"), inputRecords); + validateIdentityPartitionProjections( + table, Arrays.asList("level", "dt", "message"), inputRecords); + validateIdentityPartitionProjections( + table, Arrays.asList("dt", "message", "level"), inputRecords); + validateIdentityPartitionProjections( + table, Arrays.asList("level", "message", "dt"), inputRecords); + validateIdentityPartitionProjections( + table, Arrays.asList("message", "dt", "level"), inputRecords); + validateIdentityPartitionProjections( + table, Arrays.asList("message", "level", "dt"), inputRecords); } private void validateIdentityPartitionProjections( @@ -197,7 +208,9 @@ private void validateIdentityPartitionProjections( for (int i = 0; i < projectedFields.size(); i++) { String name = projectedFields.get(i); Assert.assertEquals( - "Projected field " + name + " should match", inputRecord.getField(name), actualRecord.getField(i)); + "Projected field " + name + " should match", + inputRecord.getField(name), + actualRecord.getField(i)); } } } @@ -220,10 +233,12 @@ public void testSnapshotReads() throws Exception { TestHelpers.assertRecords( runWithOptions(ImmutableMap.of("snapshot-id", Long.toString(snapshotId))), - expectedRecords, TestFixtures.SCHEMA); + expectedRecords, + TestFixtures.SCHEMA); TestHelpers.assertRecords( runWithOptions(ImmutableMap.of("as-of-timestamp", Long.toString(timestampMillis))), - expectedRecords, TestFixtures.SCHEMA); + expectedRecords, + TestFixtures.SCHEMA); } @Test @@ -250,57 +265,74 @@ public void testIncrementalRead() throws Exception { List expected2 = Lists.newArrayList(); expected2.addAll(records2); expected2.addAll(records3); - TestHelpers.assertRecords(runWithOptions( - ImmutableMap.builder() - .put("start-snapshot-id", Long.toString(snapshotId1)) - .put("end-snapshot-id", Long.toString(snapshotId3)).build()), - expected2, TestFixtures.SCHEMA); + TestHelpers.assertRecords( + runWithOptions( + ImmutableMap.builder() + .put("start-snapshot-id", Long.toString(snapshotId1)) + .put("end-snapshot-id", Long.toString(snapshotId3)) + .build()), + expected2, 
+ TestFixtures.SCHEMA); } @Test public void testFilterExp() throws Exception { - Table table = catalog.createTable(TestFixtures.TABLE_IDENTIFIER, TestFixtures.SCHEMA, TestFixtures.SPEC); + Table table = + catalog.createTable(TestFixtures.TABLE_IDENTIFIER, TestFixtures.SCHEMA, TestFixtures.SPEC); List expectedRecords = RandomGenericData.generate(TestFixtures.SCHEMA, 2, 0L); expectedRecords.get(0).set(2, "2020-03-20"); expectedRecords.get(1).set(2, "2020-03-20"); GenericAppenderHelper helper = new GenericAppenderHelper(table, fileFormat, TEMPORARY_FOLDER); - DataFile dataFile1 = helper.writeFile(org.apache.iceberg.TestHelpers.Row.of("2020-03-20", 0), expectedRecords); - DataFile dataFile2 = helper.writeFile(org.apache.iceberg.TestHelpers.Row.of("2020-03-21", 0), - RandomGenericData.generate(TestFixtures.SCHEMA, 2, 0L)); + DataFile dataFile1 = + helper.writeFile(org.apache.iceberg.TestHelpers.Row.of("2020-03-20", 0), expectedRecords); + DataFile dataFile2 = + helper.writeFile( + org.apache.iceberg.TestHelpers.Row.of("2020-03-21", 0), + RandomGenericData.generate(TestFixtures.SCHEMA, 2, 0L)); helper.appendToTable(dataFile1, dataFile2); - TestHelpers.assertRecords(runWithFilter( - Expressions.equal("dt", "2020-03-20"), "where dt='2020-03-20'"), + TestHelpers.assertRecords( + runWithFilter(Expressions.equal("dt", "2020-03-20"), "where dt='2020-03-20'"), expectedRecords, TestFixtures.SCHEMA); } @Test public void testPartitionTypes() throws Exception { - Schema typesSchema = new Schema( - Types.NestedField.optional(1, "id", Types.IntegerType.get()), - Types.NestedField.optional(2, "decimal", Types.DecimalType.of(38, 18)), - Types.NestedField.optional(3, "str", Types.StringType.get()), - Types.NestedField.optional(4, "binary", Types.BinaryType.get()), - Types.NestedField.optional(5, "date", Types.DateType.get()), - Types.NestedField.optional(6, "time", Types.TimeType.get()), - Types.NestedField.optional(7, "timestamp", Types.TimestampType.withoutZone()) - ); - PartitionSpec spec = PartitionSpec.builderFor(typesSchema).identity("decimal").identity("str").identity("binary") - .identity("date").identity("time").identity("timestamp").build(); + Schema typesSchema = + new Schema( + Types.NestedField.optional(1, "id", Types.IntegerType.get()), + Types.NestedField.optional(2, "decimal", Types.DecimalType.of(38, 18)), + Types.NestedField.optional(3, "str", Types.StringType.get()), + Types.NestedField.optional(4, "binary", Types.BinaryType.get()), + Types.NestedField.optional(5, "date", Types.DateType.get()), + Types.NestedField.optional(6, "time", Types.TimeType.get()), + Types.NestedField.optional(7, "timestamp", Types.TimestampType.withoutZone())); + PartitionSpec spec = + PartitionSpec.builderFor(typesSchema) + .identity("decimal") + .identity("str") + .identity("binary") + .identity("date") + .identity("time") + .identity("timestamp") + .build(); Table table = catalog.createTable(TestFixtures.TABLE_IDENTIFIER, typesSchema, spec); List records = RandomGenericData.generate(typesSchema, 10, 0L); GenericAppenderHelper appender = new GenericAppenderHelper(table, fileFormat, TEMPORARY_FOLDER); for (Record record : records) { - org.apache.iceberg.TestHelpers.Row partition = org.apache.iceberg.TestHelpers.Row.of( - record.get(1), - record.get(2), - record.get(3), - record.get(4) == null ? null : DateTimeUtil.daysFromDate((LocalDate) record.get(4)), - record.get(5) == null ? null : DateTimeUtil.microsFromTime((LocalTime) record.get(5)), - record.get(6) == null ? 
null : DateTimeUtil.microsFromTimestamp((LocalDateTime) record.get(6))); + org.apache.iceberg.TestHelpers.Row partition = + org.apache.iceberg.TestHelpers.Row.of( + record.get(1), + record.get(2), + record.get(3), + record.get(4) == null ? null : DateTimeUtil.daysFromDate((LocalDate) record.get(4)), + record.get(5) == null ? null : DateTimeUtil.microsFromTime((LocalTime) record.get(5)), + record.get(6) == null + ? null + : DateTimeUtil.microsFromTimestamp((LocalDateTime) record.get(6))); appender.appendToTable(partition, Collections.singletonList(record)); } @@ -309,10 +341,14 @@ public void testPartitionTypes() throws Exception { @Test public void testCustomizedFlinkDataTypes() throws Exception { - Schema schema = new Schema( - Types.NestedField.required( - 1, "map", Types.MapType.ofRequired(2, 3, Types.StringType.get(), Types.StringType.get())), - Types.NestedField.required(4, "arr", Types.ListType.ofRequired(5, Types.StringType.get()))); + Schema schema = + new Schema( + Types.NestedField.required( + 1, + "map", + Types.MapType.ofRequired(2, 3, Types.StringType.get(), Types.StringType.get())), + Types.NestedField.required( + 4, "arr", Types.ListType.ofRequired(5, Types.StringType.get()))); Table table = catalog.createTable(TestFixtures.TABLE_IDENTIFIER, schema); List records = RandomGenericData.generate(schema, 10, 0L); GenericAppenderHelper helper = new GenericAppenderHelper(table, fileFormat, TEMPORARY_FOLDER); diff --git a/flink/v1.13/flink/src/test/java/org/apache/iceberg/flink/source/TestFlinkScanSql.java b/flink/v1.13/flink/src/test/java/org/apache/iceberg/flink/source/TestFlinkScanSql.java index 5b94a009112e..c1a813417b46 100644 --- a/flink/v1.13/flink/src/test/java/org/apache/iceberg/flink/source/TestFlinkScanSql.java +++ b/flink/v1.13/flink/src/test/java/org/apache/iceberg/flink/source/TestFlinkScanSql.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.flink.source; import java.io.IOException; @@ -48,9 +47,7 @@ import org.junit.Assert; import org.junit.Test; -/** - * Test Flink SELECT SQLs. - */ +/** Test Flink SELECT SQLs. */ public class TestFlinkScanSql extends TestFlinkSource { private volatile TableEnvironment tEnv; @@ -62,19 +59,23 @@ public TestFlinkScanSql(String fileFormat) { @Override public void before() throws IOException { super.before(); - sql("create catalog iceberg_catalog with ('type'='iceberg', 'catalog-type'='hadoop', 'warehouse'='%s')", warehouse); + sql( + "create catalog iceberg_catalog with ('type'='iceberg', 'catalog-type'='hadoop', 'warehouse'='%s')", + warehouse); sql("use catalog iceberg_catalog"); - getTableEnv().getConfig().getConfiguration().set(TableConfigOptions.TABLE_DYNAMIC_TABLE_OPTIONS_ENABLED, true); + getTableEnv() + .getConfig() + .getConfiguration() + .set(TableConfigOptions.TABLE_DYNAMIC_TABLE_OPTIONS_ENABLED, true); } private TableEnvironment getTableEnv() { if (tEnv == null) { synchronized (this) { if (tEnv == null) { - this.tEnv = TableEnvironment.create(EnvironmentSettings - .newInstance() - .useBlinkPlanner() - .inBatchMode().build()); + this.tEnv = + TableEnvironment.create( + EnvironmentSettings.newInstance().useBlinkPlanner().inBatchMode().build()); } } } @@ -82,8 +83,11 @@ private TableEnvironment getTableEnv() { } @Override - protected List run(FlinkSource.Builder formatBuilder, Map sqlOptions, String sqlFilter, - String... 
sqlSelectedFields) { + protected List run( + FlinkSource.Builder formatBuilder, + Map sqlOptions, + String sqlFilter, + String... sqlSelectedFields) { String select = String.join(",", sqlSelectedFields); StringBuilder builder = new StringBuilder(); @@ -104,7 +108,9 @@ protected List run(FlinkSource.Builder formatBuilder, Map s @Test public void testResiduals() throws Exception { - Table table = catalog.createTable(TableIdentifier.of("default", "t"), TestFixtures.SCHEMA, TestFixtures.SPEC); + Table table = + catalog.createTable( + TableIdentifier.of("default", "t"), TestFixtures.SCHEMA, TestFixtures.SPEC); List writeRecords = RandomGenericData.generate(TestFixtures.SCHEMA, 2, 0L); writeRecords.get(0).set(1, 123L); @@ -118,21 +124,29 @@ public void testResiduals() throws Exception { expectedRecords.add(writeRecords.get(0)); DataFile dataFile1 = helper.writeFile(TestHelpers.Row.of("2020-03-20", 0), writeRecords); - DataFile dataFile2 = helper.writeFile(TestHelpers.Row.of("2020-03-21", 0), - RandomGenericData.generate(TestFixtures.SCHEMA, 2, 0L)); + DataFile dataFile2 = + helper.writeFile( + TestHelpers.Row.of("2020-03-21", 0), + RandomGenericData.generate(TestFixtures.SCHEMA, 2, 0L)); helper.appendToTable(dataFile1, dataFile2); - Expression filter = Expressions.and(Expressions.equal("dt", "2020-03-20"), Expressions.equal("id", 123)); - org.apache.iceberg.flink.TestHelpers.assertRecords(runWithFilter( - filter, "where dt='2020-03-20' and id=123"), expectedRecords, TestFixtures.SCHEMA); + Expression filter = + Expressions.and(Expressions.equal("dt", "2020-03-20"), Expressions.equal("id", 123)); + org.apache.iceberg.flink.TestHelpers.assertRecords( + runWithFilter(filter, "where dt='2020-03-20' and id=123"), + expectedRecords, + TestFixtures.SCHEMA); } @Test public void testInferedParallelism() throws IOException { - Table table = catalog.createTable(TableIdentifier.of("default", "t"), TestFixtures.SCHEMA, TestFixtures.SPEC); + Table table = + catalog.createTable( + TableIdentifier.of("default", "t"), TestFixtures.SCHEMA, TestFixtures.SPEC); TableLoader tableLoader = TableLoader.fromHadoopTable(table.location()); - FlinkInputFormat flinkInputFormat = FlinkSource.forRowData().tableLoader(tableLoader).table(table).buildFormat(); + FlinkInputFormat flinkInputFormat = + FlinkSource.forRowData().tableLoader(tableLoader).table(table).buildFormat(); ScanContext scanContext = ScanContext.builder().build(); // Empty table, infer parallelism should be at least 1 @@ -140,44 +154,57 @@ public void testInferedParallelism() throws IOException { Assert.assertEquals("Should produce the expected parallelism.", 1, parallelism); GenericAppenderHelper helper = new GenericAppenderHelper(table, fileFormat, TEMPORARY_FOLDER); - DataFile dataFile1 = helper.writeFile(TestHelpers.Row.of("2020-03-20", 0), - RandomGenericData.generate(TestFixtures.SCHEMA, 2, 0L)); - DataFile dataFile2 = helper.writeFile(TestHelpers.Row.of("2020-03-21", 0), - RandomGenericData.generate(TestFixtures.SCHEMA, 2, 0L)); + DataFile dataFile1 = + helper.writeFile( + TestHelpers.Row.of("2020-03-20", 0), + RandomGenericData.generate(TestFixtures.SCHEMA, 2, 0L)); + DataFile dataFile2 = + helper.writeFile( + TestHelpers.Row.of("2020-03-21", 0), + RandomGenericData.generate(TestFixtures.SCHEMA, 2, 0L)); helper.appendToTable(dataFile1, dataFile2); // Make sure to generate 2 CombinedScanTasks long maxFileLen = Math.max(dataFile1.fileSizeInBytes(), dataFile2.fileSizeInBytes()); - sql("ALTER TABLE t SET ('read.split.open-file-cost'='1', 
'read.split.target-size'='%s')", maxFileLen); + sql( + "ALTER TABLE t SET ('read.split.open-file-cost'='1', 'read.split.target-size'='%s')", + maxFileLen); - // 2 splits (max infer is the default value 100 , max > splits num), the parallelism is splits num : 2 + // 2 splits (max infer is the default value 100 , max > splits num), the parallelism is splits + // num : 2 parallelism = FlinkSource.forRowData().inferParallelism(flinkInputFormat, scanContext); Assert.assertEquals("Should produce the expected parallelism.", 2, parallelism); // 2 splits and limit is 1 , max infer parallelism is default 100, // which is greater than splits num and limit, the parallelism is the limit value : 1 - parallelism = FlinkSource.forRowData().inferParallelism(flinkInputFormat, ScanContext.builder().limit(1).build()); + parallelism = + FlinkSource.forRowData() + .inferParallelism(flinkInputFormat, ScanContext.builder().limit(1).build()); Assert.assertEquals("Should produce the expected parallelism.", 1, parallelism); // 2 splits and max infer parallelism is 1 (max < splits num), the parallelism is 1 Configuration configuration = new Configuration(); configuration.setInteger(FlinkConfigOptions.TABLE_EXEC_ICEBERG_INFER_SOURCE_PARALLELISM_MAX, 1); - parallelism = FlinkSource.forRowData() - .flinkConf(configuration) - .inferParallelism(flinkInputFormat, ScanContext.builder().build()); + parallelism = + FlinkSource.forRowData() + .flinkConf(configuration) + .inferParallelism(flinkInputFormat, ScanContext.builder().build()); Assert.assertEquals("Should produce the expected parallelism.", 1, parallelism); - // 2 splits, max infer parallelism is 1, limit is 3, the parallelism is max infer parallelism : 1 - parallelism = FlinkSource.forRowData() - .flinkConf(configuration) - .inferParallelism(flinkInputFormat, ScanContext.builder().limit(3).build()); + // 2 splits, max infer parallelism is 1, limit is 3, the parallelism is max infer parallelism : + // 1 + parallelism = + FlinkSource.forRowData() + .flinkConf(configuration) + .inferParallelism(flinkInputFormat, ScanContext.builder().limit(3).build()); Assert.assertEquals("Should produce the expected parallelism.", 1, parallelism); // 2 splits, infer parallelism is disabled, the parallelism is flink default parallelism 1 configuration.setBoolean(FlinkConfigOptions.TABLE_EXEC_ICEBERG_INFER_SOURCE_PARALLELISM, false); - parallelism = FlinkSource.forRowData() - .flinkConf(configuration) - .inferParallelism(flinkInputFormat, ScanContext.builder().limit(3).build()); + parallelism = + FlinkSource.forRowData() + .flinkConf(configuration) + .inferParallelism(flinkInputFormat, ScanContext.builder().limit(3).build()); Assert.assertEquals("Should produce the expected parallelism.", 1, parallelism); } @@ -186,7 +213,8 @@ public void testInferParallelismWithGlobalSetting() throws IOException { Configuration cfg = tEnv.getConfig().getConfiguration(); cfg.set(PipelineOptions.MAX_PARALLELISM, 1); - Table table = catalog.createTable(TableIdentifier.of("default", "t"), TestFixtures.SCHEMA, null); + Table table = + catalog.createTable(TableIdentifier.of("default", "t"), TestFixtures.SCHEMA, null); GenericAppenderHelper helper = new GenericAppenderHelper(table, fileFormat, TEMPORARY_FOLDER); List expectedRecords = Lists.newArrayList(); @@ -200,16 +228,20 @@ public void testInferParallelismWithGlobalSetting() throws IOException { } // Make sure to generate multiple CombinedScanTasks - sql("ALTER TABLE t SET ('read.split.open-file-cost'='1', 'read.split.target-size'='%s')", maxFileLen); + 
sql( + "ALTER TABLE t SET ('read.split.open-file-cost'='1', 'read.split.target-size'='%s')", + maxFileLen); List results = run(null, Maps.newHashMap(), "", "*"); - org.apache.iceberg.flink.TestHelpers.assertRecords(results, expectedRecords, TestFixtures.SCHEMA); + org.apache.iceberg.flink.TestHelpers.assertRecords( + results, expectedRecords, TestFixtures.SCHEMA); } @Test public void testExposeLocality() throws Exception { Table table = - catalog.createTable(TableIdentifier.of("default", "t"), TestFixtures.SCHEMA, TestFixtures.SPEC); + catalog.createTable( + TableIdentifier.of("default", "t"), TestFixtures.SCHEMA, TestFixtures.SPEC); TableLoader tableLoader = TableLoader.fromHadoopTable(table.location()); List expectedRecords = RandomGenericData.generate(TestFixtures.SCHEMA, 10, 0L); @@ -221,22 +253,30 @@ public void testExposeLocality() throws Exception { // test sql api Configuration tableConf = getTableEnv().getConfig().getConfiguration(); - tableConf.setBoolean(FlinkConfigOptions.TABLE_EXEC_ICEBERG_EXPOSE_SPLIT_LOCALITY_INFO.key(), false); + tableConf.setBoolean( + FlinkConfigOptions.TABLE_EXEC_ICEBERG_EXPOSE_SPLIT_LOCALITY_INFO.key(), false); List results = sql("select * from t"); - org.apache.iceberg.flink.TestHelpers.assertRecords(results, expectedRecords, TestFixtures.SCHEMA); + org.apache.iceberg.flink.TestHelpers.assertRecords( + results, expectedRecords, TestFixtures.SCHEMA); // test table api - tableConf.setBoolean(FlinkConfigOptions.TABLE_EXEC_ICEBERG_EXPOSE_SPLIT_LOCALITY_INFO.key(), true); + tableConf.setBoolean( + FlinkConfigOptions.TABLE_EXEC_ICEBERG_EXPOSE_SPLIT_LOCALITY_INFO.key(), true); FlinkSource.Builder builder = FlinkSource.forRowData().tableLoader(tableLoader).table(table); Boolean localityEnabled = - DynMethods.builder("localityEnabled").hiddenImpl(builder.getClass()).build().invoke(builder); - // When running with CI or local, `localityEnabled` will be false even if this configuration is enabled + DynMethods.builder("localityEnabled") + .hiddenImpl(builder.getClass()) + .build() + .invoke(builder); + // When running with CI or local, `localityEnabled` will be false even if this configuration is + // enabled Assert.assertFalse("Expose split locality info should be false.", localityEnabled); results = run(builder, Maps.newHashMap(), "where dt='2020-03-20'", "*"); - org.apache.iceberg.flink.TestHelpers.assertRecords(results, expectedRecords, TestFixtures.SCHEMA); + org.apache.iceberg.flink.TestHelpers.assertRecords( + results, expectedRecords, TestFixtures.SCHEMA); } private List sql(String query, Object... args) { diff --git a/flink/v1.13/flink/src/test/java/org/apache/iceberg/flink/source/TestFlinkSource.java b/flink/v1.13/flink/src/test/java/org/apache/iceberg/flink/source/TestFlinkSource.java index 633a32a4c3d1..3a01952cd9ec 100644 --- a/flink/v1.13/flink/src/test/java/org/apache/iceberg/flink/source/TestFlinkSource.java +++ b/flink/v1.13/flink/src/test/java/org/apache/iceberg/flink/source/TestFlinkSource.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.flink.source; import java.util.Collections; @@ -40,8 +39,10 @@ public abstract class TestFlinkSource extends TestFlinkScan { @Override protected List runWithProjection(String... 
projected) throws Exception { TableSchema.Builder builder = TableSchema.builder(); - TableSchema schema = FlinkSchemaUtil.toSchema(FlinkSchemaUtil.convert( - catalog.loadTable(TableIdentifier.of("default", "t")).schema())); + TableSchema schema = + FlinkSchemaUtil.toSchema( + FlinkSchemaUtil.convert( + catalog.loadTable(TableIdentifier.of("default", "t")).schema())); for (String field : projected) { TableColumn column = schema.getTableColumn(field).get(); builder.field(column.getName(), column.getType()); @@ -51,14 +52,16 @@ protected List runWithProjection(String... projected) throws Exception { @Override protected List runWithFilter(Expression filter, String sqlFilter) throws Exception { - FlinkSource.Builder builder = FlinkSource.forRowData().filters(Collections.singletonList(filter)); + FlinkSource.Builder builder = + FlinkSource.forRowData().filters(Collections.singletonList(filter)); return run(builder, Maps.newHashMap(), sqlFilter, "*"); } @Override protected List runWithOptions(Map options) throws Exception { FlinkSource.Builder builder = FlinkSource.forRowData(); - Optional.ofNullable(options.get("snapshot-id")).ifPresent(value -> builder.snapshotId(Long.parseLong(value))); + Optional.ofNullable(options.get("snapshot-id")) + .ifPresent(value -> builder.snapshotId(Long.parseLong(value))); Optional.ofNullable(options.get("start-snapshot-id")) .ifPresent(value -> builder.startSnapshotId(Long.parseLong(value))); Optional.ofNullable(options.get("end-snapshot-id")) @@ -73,6 +76,10 @@ protected List run() throws Exception { return run(FlinkSource.forRowData(), Maps.newHashMap(), "", "*"); } - protected abstract List run(FlinkSource.Builder formatBuilder, Map sqlOptions, String sqlFilter, - String... sqlSelectedFields) throws Exception; + protected abstract List run( + FlinkSource.Builder formatBuilder, + Map sqlOptions, + String sqlFilter, + String... sqlSelectedFields) + throws Exception; } diff --git a/flink/v1.13/flink/src/test/java/org/apache/iceberg/flink/source/TestProjectMetaColumn.java b/flink/v1.13/flink/src/test/java/org/apache/iceberg/flink/source/TestProjectMetaColumn.java index baccadc31f08..bc63e4a0b282 100644 --- a/flink/v1.13/flink/src/test/java/org/apache/iceberg/flink/source/TestProjectMetaColumn.java +++ b/flink/v1.13/flink/src/test/java/org/apache/iceberg/flink/source/TestProjectMetaColumn.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.flink.source; import java.io.IOException; @@ -51,8 +50,7 @@ @RunWith(Parameterized.class) public class TestProjectMetaColumn { - @Rule - public final TemporaryFolder folder = new TemporaryFolder(); + @Rule public final TemporaryFolder folder = new TemporaryFolder(); private final FileFormat format; @Parameterized.Parameters(name = "fileFormat={0}") @@ -60,8 +58,7 @@ public static Iterable parameters() { return Lists.newArrayList( new Object[] {FileFormat.PARQUET}, new Object[] {FileFormat.ORC}, - new Object[] {FileFormat.AVRO} - ); + new Object[] {FileFormat.AVRO}); } public TestProjectMetaColumn(FileFormat format) { @@ -71,28 +68,30 @@ public TestProjectMetaColumn(FileFormat format) { private void testSkipToRemoveMetaColumn(int formatVersion) throws IOException { // Create the table with given format version. 
String location = folder.getRoot().getAbsolutePath(); - Table table = SimpleDataUtil.createTable(location, - ImmutableMap.of(TableProperties.FORMAT_VERSION, String.valueOf(formatVersion)), - false); - - List rows = Lists.newArrayList( - SimpleDataUtil.createInsert(1, "AAA"), - SimpleDataUtil.createInsert(2, "BBB"), - SimpleDataUtil.createInsert(3, "CCC") - ); + Table table = + SimpleDataUtil.createTable( + location, + ImmutableMap.of(TableProperties.FORMAT_VERSION, String.valueOf(formatVersion)), + false); + + List rows = + Lists.newArrayList( + SimpleDataUtil.createInsert(1, "AAA"), + SimpleDataUtil.createInsert(2, "BBB"), + SimpleDataUtil.createInsert(3, "CCC")); writeAndCommit(table, ImmutableList.of(), false, rows); - FlinkInputFormat input = FlinkSource - .forRowData() - .tableLoader(TableLoader.fromHadoopTable(location)) - .buildFormat(); + FlinkInputFormat input = + FlinkSource.forRowData().tableLoader(TableLoader.fromHadoopTable(location)).buildFormat(); List results = Lists.newArrayList(); - TestHelpers.readRowData(input, rowData -> { - // If project to remove the meta columns, it will get a RowDataProjection. - Assert.assertTrue(rowData instanceof GenericRowData); - results.add(TestHelpers.copyRowData(rowData, SimpleDataUtil.ROW_TYPE)); - }); + TestHelpers.readRowData( + input, + rowData -> { + // If project to remove the meta columns, it will get a RowDataProjection. + Assert.assertTrue(rowData instanceof GenericRowData); + results.add(TestHelpers.copyRowData(rowData, SimpleDataUtil.ROW_TYPE)); + }); // Assert the results. TestHelpers.assertRows(rows, results, SimpleDataUtil.ROW_TYPE); @@ -112,37 +111,41 @@ public void testV2SkipToRemoveMetaColumn() throws IOException { public void testV2RemoveMetaColumn() throws Exception { // Create the v2 table. String location = folder.getRoot().getAbsolutePath(); - Table table = SimpleDataUtil.createTable(location, ImmutableMap.of(TableProperties.FORMAT_VERSION, "2"), false); - - List rows = Lists.newArrayList( - SimpleDataUtil.createInsert(1, "AAA"), - SimpleDataUtil.createDelete(1, "AAA"), - SimpleDataUtil.createInsert(2, "AAA"), - SimpleDataUtil.createInsert(2, "BBB") - ); + Table table = + SimpleDataUtil.createTable( + location, ImmutableMap.of(TableProperties.FORMAT_VERSION, "2"), false); + + List rows = + Lists.newArrayList( + SimpleDataUtil.createInsert(1, "AAA"), + SimpleDataUtil.createDelete(1, "AAA"), + SimpleDataUtil.createInsert(2, "AAA"), + SimpleDataUtil.createInsert(2, "BBB")); int eqFieldId = table.schema().findField("data").fieldId(); writeAndCommit(table, ImmutableList.of(eqFieldId), true, rows); - FlinkInputFormat input = FlinkSource - .forRowData() - .tableLoader(TableLoader.fromHadoopTable(location)) - .buildFormat(); + FlinkInputFormat input = + FlinkSource.forRowData().tableLoader(TableLoader.fromHadoopTable(location)).buildFormat(); List results = Lists.newArrayList(); - TestHelpers.readRowData(input, rowData -> { - // If project to remove the meta columns, it will get a RowDataProjection. - Assert.assertTrue(rowData instanceof RowDataProjection); - results.add(TestHelpers.copyRowData(rowData, SimpleDataUtil.ROW_TYPE)); - }); + TestHelpers.readRowData( + input, + rowData -> { + // If project to remove the meta columns, it will get a RowDataProjection. + Assert.assertTrue(rowData instanceof RowDataProjection); + results.add(TestHelpers.copyRowData(rowData, SimpleDataUtil.ROW_TYPE)); + }); // Assert the results. 
- TestHelpers.assertRows(ImmutableList.of( - SimpleDataUtil.createInsert(2, "AAA"), - SimpleDataUtil.createInsert(2, "BBB") - ), results, SimpleDataUtil.ROW_TYPE); + TestHelpers.assertRows( + ImmutableList.of( + SimpleDataUtil.createInsert(2, "AAA"), SimpleDataUtil.createInsert(2, "BBB")), + results, + SimpleDataUtil.ROW_TYPE); } - private void writeAndCommit(Table table, List eqFieldIds, boolean upsert, List rows) + private void writeAndCommit( + Table table, List eqFieldIds, boolean upsert, List rows) throws IOException { TaskWriter writer = createTaskWriter(table, eqFieldIds, upsert); try (TaskWriter io = writer) { @@ -165,14 +168,16 @@ private void writeAndCommit(Table table, List eqFieldIds, boolean upser delta.commit(); } - private TaskWriter createTaskWriter(Table table, List equalityFieldIds, boolean upsert) { - TaskWriterFactory taskWriterFactory = new RowDataTaskWriterFactory( - SerializableTable.copyOf(table), - SimpleDataUtil.ROW_TYPE, - TableProperties.WRITE_TARGET_FILE_SIZE_BYTES_DEFAULT, - format, - equalityFieldIds, - upsert); + private TaskWriter createTaskWriter( + Table table, List equalityFieldIds, boolean upsert) { + TaskWriterFactory taskWriterFactory = + new RowDataTaskWriterFactory( + SerializableTable.copyOf(table), + SimpleDataUtil.ROW_TYPE, + TableProperties.WRITE_TARGET_FILE_SIZE_BYTES_DEFAULT, + format, + equalityFieldIds, + upsert); taskWriterFactory.initialize(1, 1); return taskWriterFactory.create(); diff --git a/flink/v1.13/flink/src/test/java/org/apache/iceberg/flink/source/TestStreamScanSql.java b/flink/v1.13/flink/src/test/java/org/apache/iceberg/flink/source/TestStreamScanSql.java index e658442d9943..1ab77b9b7039 100644 --- a/flink/v1.13/flink/src/test/java/org/apache/iceberg/flink/source/TestStreamScanSql.java +++ b/flink/v1.13/flink/src/test/java/org/apache/iceberg/flink/source/TestStreamScanSql.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.flink.source; import java.io.IOException; @@ -64,18 +63,19 @@ protected TableEnvironment getTableEnv() { if (tEnv == null) { synchronized (this) { if (tEnv == null) { - EnvironmentSettings.Builder settingsBuilder = EnvironmentSettings - .newInstance() - .useBlinkPlanner() - .inStreamingMode(); + EnvironmentSettings.Builder settingsBuilder = + EnvironmentSettings.newInstance().useBlinkPlanner().inStreamingMode(); - StreamExecutionEnvironment env = StreamExecutionEnvironment - .getExecutionEnvironment(MiniClusterResource.DISABLE_CLASSLOADER_CHECK_CONFIG); + StreamExecutionEnvironment env = + StreamExecutionEnvironment.getExecutionEnvironment( + MiniClusterResource.DISABLE_CLASSLOADER_CHECK_CONFIG); env.setStreamTimeCharacteristic(TimeCharacteristic.EventTime); env.enableCheckpointing(400); - StreamTableEnvironment streamTableEnv = StreamTableEnvironment.create(env, settingsBuilder.build()); - streamTableEnv.getConfig() + StreamTableEnvironment streamTableEnv = + StreamTableEnvironment.create(env, settingsBuilder.build()); + streamTableEnv + .getConfig() .getConfiguration() .set(TableConfigOptions.TABLE_DYNAMIC_TABLE_OPTIONS_ENABLED, true); tEnv = streamTableEnv; @@ -108,11 +108,11 @@ private void insertRows(String partition, Table table, Row... 
rows) throws IOExc GenericRecord gRecord = GenericRecord.create(table.schema()); List records = Lists.newArrayList(); for (Row row : rows) { - records.add(gRecord.copy( - "id", row.getField(0), - "data", row.getField(1), - "dt", row.getField(2) - )); + records.add( + gRecord.copy( + "id", row.getField(0), + "data", row.getField(1), + "dt", row.getField(2))); } if (partition != null) { @@ -132,9 +132,12 @@ private void assertRows(List expectedRows, Iterator iterator) { Row actualRow = iterator.next(); Assert.assertEquals("Should have expected fields", 3, actualRow.getArity()); - Assert.assertEquals("Should have expected id", expectedRow.getField(0), actualRow.getField(0)); - Assert.assertEquals("Should have expected data", expectedRow.getField(1), actualRow.getField(1)); - Assert.assertEquals("Should have expected dt", expectedRow.getField(2), actualRow.getField(2)); + Assert.assertEquals( + "Should have expected id", expectedRow.getField(0), actualRow.getField(0)); + Assert.assertEquals( + "Should have expected data", expectedRow.getField(1), actualRow.getField(1)); + Assert.assertEquals( + "Should have expected dt", expectedRow.getField(2), actualRow.getField(2)); } } @@ -143,7 +146,8 @@ public void testUnPartitionedTable() throws Exception { sql("CREATE TABLE %s (id INT, data VARCHAR, dt VARCHAR)", TABLE); Table table = validationCatalog.loadTable(TableIdentifier.of(icebergNamespace, TABLE)); - TableResult result = exec("SELECT * FROM %s /*+ OPTIONS('streaming'='true', 'monitor-interval'='1s')*/", TABLE); + TableResult result = + exec("SELECT * FROM %s /*+ OPTIONS('streaming'='true', 'monitor-interval'='1s')*/", TABLE); try (CloseableIterator iterator = result.collect()) { Row row1 = Row.of(1, "aaa", "2021-01-01"); @@ -157,13 +161,13 @@ public void testUnPartitionedTable() throws Exception { result.getJobClient().ifPresent(JobClient::cancel); } - @Test public void testPartitionedTable() throws Exception { sql("CREATE TABLE %s (id INT, data VARCHAR, dt VARCHAR) PARTITIONED BY (dt)", TABLE); Table table = validationCatalog.loadTable(TableIdentifier.of(icebergNamespace, TABLE)); - TableResult result = exec("SELECT * FROM %s /*+ OPTIONS('streaming'='true', 'monitor-interval'='1s')*/", TABLE); + TableResult result = + exec("SELECT * FROM %s /*+ OPTIONS('streaming'='true', 'monitor-interval'='1s')*/", TABLE); try (CloseableIterator iterator = result.collect()) { Row row1 = Row.of(1, "aaa", "2021-01-01"); insertRows("2021-01-01", table, row1); @@ -193,7 +197,8 @@ public void testConsumeFromBeginning() throws Exception { Row row2 = Row.of(2, "bbb", "2021-01-01"); insertRows(table, row1, row2); - TableResult result = exec("SELECT * FROM %s /*+ OPTIONS('streaming'='true', 'monitor-interval'='1s')*/", TABLE); + TableResult result = + exec("SELECT * FROM %s /*+ OPTIONS('streaming'='true', 'monitor-interval'='1s')*/", TABLE); try (CloseableIterator iterator = result.collect()) { assertRows(ImmutableList.of(row1, row2), iterator); @@ -225,8 +230,11 @@ public void testConsumeFromStartSnapshotId() throws Exception { Row row4 = Row.of(4, "ddd", "2021-01-01"); insertRows(table, row3, row4); - TableResult result = exec("SELECT * FROM %s /*+ OPTIONS('streaming'='true', 'monitor-interval'='1s', " + - "'start-snapshot-id'='%d')*/", TABLE, startSnapshotId); + TableResult result = + exec( + "SELECT * FROM %s /*+ OPTIONS('streaming'='true', 'monitor-interval'='1s', " + + "'start-snapshot-id'='%d')*/", + TABLE, startSnapshotId); try (CloseableIterator iterator = result.collect()) { // The row2 in start snapshot 
will be excluded. assertRows(ImmutableList.of(row3, row4), iterator); diff --git a/flink/v1.13/flink/src/test/java/org/apache/iceberg/flink/source/TestStreamingMonitorFunction.java b/flink/v1.13/flink/src/test/java/org/apache/iceberg/flink/source/TestStreamingMonitorFunction.java index 6f6ffd964911..91b11cbeb24d 100644 --- a/flink/v1.13/flink/src/test/java/org/apache/iceberg/flink/source/TestStreamingMonitorFunction.java +++ b/flink/v1.13/flink/src/test/java/org/apache/iceberg/flink/source/TestStreamingMonitorFunction.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.flink.source; import java.io.File; @@ -56,19 +55,16 @@ @RunWith(Parameterized.class) public class TestStreamingMonitorFunction extends TableTestBase { - private static final Schema SCHEMA = new Schema( - Types.NestedField.required(1, "id", Types.IntegerType.get()), - Types.NestedField.required(2, "data", Types.StringType.get()) - ); + private static final Schema SCHEMA = + new Schema( + Types.NestedField.required(1, "id", Types.IntegerType.get()), + Types.NestedField.required(2, "data", Types.StringType.get())); private static final FileFormat DEFAULT_FORMAT = FileFormat.PARQUET; private static final long WAIT_TIME_MILLIS = 10 * 1000L; @Parameterized.Parameters(name = "FormatVersion={0}") public static Iterable parameters() { - return ImmutableList.of( - new Object[] {1}, - new Object[] {2} - ); + return ImmutableList.of(new Object[] {1}, new Object[] {2}); } public TestStreamingMonitorFunction(int formatVersion) { @@ -86,23 +82,24 @@ public void setupTable() throws IOException { table = create(SCHEMA, PartitionSpec.unpartitioned()); } - private void runSourceFunctionInTask(TestSourceContext sourceContext, StreamingMonitorFunction function) { - Thread task = new Thread(() -> { - try { - function.run(sourceContext); - } catch (Exception e) { - throw new RuntimeException(e); - } - }); + private void runSourceFunctionInTask( + TestSourceContext sourceContext, StreamingMonitorFunction function) { + Thread task = + new Thread( + () -> { + try { + function.run(sourceContext); + } catch (Exception e) { + throw new RuntimeException(e); + } + }); task.start(); } @Test public void testConsumeWithoutStartSnapshotId() throws Exception { List> recordsList = generateRecordsAndCommitTxn(10); - ScanContext scanContext = ScanContext.builder() - .monitorInterval(Duration.ofMillis(100)) - .build(); + ScanContext scanContext = ScanContext.builder().monitorInterval(Duration.ofMillis(100)).build(); StreamingMonitorFunction function = createFunction(scanContext); try (AbstractStreamOperatorTestHarness harness = createHarness(function)) { @@ -113,14 +110,16 @@ public void testConsumeWithoutStartSnapshotId() throws Exception { TestSourceContext sourceContext = new TestSourceContext(latch); runSourceFunctionInTask(sourceContext, function); - Assert.assertTrue("Should have expected elements.", latch.await(WAIT_TIME_MILLIS, TimeUnit.MILLISECONDS)); + Assert.assertTrue( + "Should have expected elements.", latch.await(WAIT_TIME_MILLIS, TimeUnit.MILLISECONDS)); Thread.sleep(1000L); // Stop the stream task. 
function.close(); Assert.assertEquals("Should produce the expected splits", 1, sourceContext.splits.size()); - TestHelpers.assertRecords(sourceContext.toRows(), Lists.newArrayList(Iterables.concat(recordsList)), SCHEMA); + TestHelpers.assertRecords( + sourceContext.toRows(), Lists.newArrayList(Iterables.concat(recordsList)), SCHEMA); } } @@ -133,10 +132,11 @@ public void testConsumeFromStartSnapshotId() throws Exception { // Commit the next five transactions. List> recordsList = generateRecordsAndCommitTxn(5); - ScanContext scanContext = ScanContext.builder() - .monitorInterval(Duration.ofMillis(100)) - .startSnapshotId(startSnapshotId) - .build(); + ScanContext scanContext = + ScanContext.builder() + .monitorInterval(Duration.ofMillis(100)) + .startSnapshotId(startSnapshotId) + .build(); StreamingMonitorFunction function = createFunction(scanContext); try (AbstractStreamOperatorTestHarness harness = createHarness(function)) { @@ -147,23 +147,23 @@ public void testConsumeFromStartSnapshotId() throws Exception { TestSourceContext sourceContext = new TestSourceContext(latch); runSourceFunctionInTask(sourceContext, function); - Assert.assertTrue("Should have expected elements.", latch.await(WAIT_TIME_MILLIS, TimeUnit.MILLISECONDS)); + Assert.assertTrue( + "Should have expected elements.", latch.await(WAIT_TIME_MILLIS, TimeUnit.MILLISECONDS)); Thread.sleep(1000L); // Stop the stream task. function.close(); Assert.assertEquals("Should produce the expected splits", 1, sourceContext.splits.size()); - TestHelpers.assertRecords(sourceContext.toRows(), Lists.newArrayList(Iterables.concat(recordsList)), SCHEMA); + TestHelpers.assertRecords( + sourceContext.toRows(), Lists.newArrayList(Iterables.concat(recordsList)), SCHEMA); } } @Test public void testCheckpointRestore() throws Exception { List> recordsList = generateRecordsAndCommitTxn(10); - ScanContext scanContext = ScanContext.builder() - .monitorInterval(Duration.ofMillis(100)) - .build(); + ScanContext scanContext = ScanContext.builder().monitorInterval(Duration.ofMillis(100)).build(); StreamingMonitorFunction func = createFunction(scanContext); OperatorSubtaskState state; @@ -175,7 +175,8 @@ public void testCheckpointRestore() throws Exception { TestSourceContext sourceContext = new TestSourceContext(latch); runSourceFunctionInTask(sourceContext, func); - Assert.assertTrue("Should have expected elements.", latch.await(WAIT_TIME_MILLIS, TimeUnit.MILLISECONDS)); + Assert.assertTrue( + "Should have expected elements.", latch.await(WAIT_TIME_MILLIS, TimeUnit.MILLISECONDS)); Thread.sleep(1000L); state = harness.snapshot(1, 1); @@ -184,7 +185,8 @@ public void testCheckpointRestore() throws Exception { func.close(); Assert.assertEquals("Should produce the expected splits", 1, sourceContext.splits.size()); - TestHelpers.assertRecords(sourceContext.toRows(), Lists.newArrayList(Iterables.concat(recordsList)), SCHEMA); + TestHelpers.assertRecords( + sourceContext.toRows(), Lists.newArrayList(Iterables.concat(recordsList)), SCHEMA); } List> newRecordsList = generateRecordsAndCommitTxn(10); @@ -199,44 +201,50 @@ public void testCheckpointRestore() throws Exception { TestSourceContext sourceContext = new TestSourceContext(latch); runSourceFunctionInTask(sourceContext, newFunc); - Assert.assertTrue("Should have expected elements.", latch.await(WAIT_TIME_MILLIS, TimeUnit.MILLISECONDS)); + Assert.assertTrue( + "Should have expected elements.", latch.await(WAIT_TIME_MILLIS, TimeUnit.MILLISECONDS)); Thread.sleep(1000L); // Stop the stream task. 
newFunc.close(); Assert.assertEquals("Should produce the expected splits", 1, sourceContext.splits.size()); - TestHelpers.assertRecords(sourceContext.toRows(), Lists.newArrayList(Iterables.concat(newRecordsList)), SCHEMA); + TestHelpers.assertRecords( + sourceContext.toRows(), Lists.newArrayList(Iterables.concat(newRecordsList)), SCHEMA); } } @Test public void testInvalidMaxPlanningSnapshotCount() { - ScanContext scanContext1 = ScanContext.builder() - .monitorInterval(Duration.ofMillis(100)) - .maxPlanningSnapshotCount(0) - .build(); - - AssertHelpers.assertThrows("Should throw exception because of invalid config", - IllegalArgumentException.class, "must be greater than zero", + ScanContext scanContext1 = + ScanContext.builder() + .monitorInterval(Duration.ofMillis(100)) + .maxPlanningSnapshotCount(0) + .build(); + + AssertHelpers.assertThrows( + "Should throw exception because of invalid config", + IllegalArgumentException.class, + "must be greater than zero", () -> { createFunction(scanContext1); return null; - } - ); - - ScanContext scanContext2 = ScanContext.builder() - .monitorInterval(Duration.ofMillis(100)) - .maxPlanningSnapshotCount(-10) - .build(); - - AssertHelpers.assertThrows("Should throw exception because of invalid config", - IllegalArgumentException.class, "must be greater than zero", + }); + + ScanContext scanContext2 = + ScanContext.builder() + .monitorInterval(Duration.ofMillis(100)) + .maxPlanningSnapshotCount(-10) + .build(); + + AssertHelpers.assertThrows( + "Should throw exception because of invalid config", + IllegalArgumentException.class, + "must be greater than zero", () -> { createFunction(scanContext2); return null; - } - ); + }); } @Test @@ -246,25 +254,28 @@ public void testConsumeWithMaxPlanningSnapshotCount() throws Exception { // Use the oldest snapshot as starting to avoid the initial case. 
long oldestSnapshotId = SnapshotUtil.oldestAncestor(table).snapshotId(); - ScanContext scanContext = ScanContext.builder() - .monitorInterval(Duration.ofMillis(100)) - .splitSize(1000L) - .startSnapshotId(oldestSnapshotId) - .maxPlanningSnapshotCount(Integer.MAX_VALUE) - .build(); + ScanContext scanContext = + ScanContext.builder() + .monitorInterval(Duration.ofMillis(100)) + .splitSize(1000L) + .startSnapshotId(oldestSnapshotId) + .maxPlanningSnapshotCount(Integer.MAX_VALUE) + .build(); FlinkInputSplit[] expectedSplits = FlinkSplitGenerator.createInputSplits(table, scanContext); Assert.assertEquals("should produce 9 splits", 9, expectedSplits.length); - // This covers three cases that maxPlanningSnapshotCount is less than, equal or greater than the total splits number + // This covers three cases that maxPlanningSnapshotCount is less than, equal or greater than the + // total splits number for (int maxPlanningSnapshotCount : ImmutableList.of(1, 9, 15)) { - scanContext = ScanContext.builder() - .monitorInterval(Duration.ofMillis(500)) - .startSnapshotId(oldestSnapshotId) - .splitSize(1000L) - .maxPlanningSnapshotCount(maxPlanningSnapshotCount) - .build(); + scanContext = + ScanContext.builder() + .monitorInterval(Duration.ofMillis(500)) + .startSnapshotId(oldestSnapshotId) + .splitSize(1000L) + .maxPlanningSnapshotCount(maxPlanningSnapshotCount) + .build(); StreamingMonitorFunction function = createFunction(scanContext); try (AbstractStreamOperatorTestHarness harness = createHarness(function)) { @@ -277,8 +288,10 @@ public void testConsumeWithMaxPlanningSnapshotCount() throws Exception { function.monitorAndForwardSplits(); if (maxPlanningSnapshotCount < 10) { - Assert.assertEquals("Should produce same splits as max-planning-snapshot-count", - maxPlanningSnapshotCount, sourceContext.splits.size()); + Assert.assertEquals( + "Should produce same splits as max-planning-snapshot-count", + maxPlanningSnapshotCount, + sourceContext.splits.size()); } } } @@ -302,12 +315,14 @@ private void writeRecords(List records) throws IOException { } private StreamingMonitorFunction createFunction(ScanContext scanContext) { - return new StreamingMonitorFunction(TestTableLoader.of(tableDir.getAbsolutePath()), scanContext); + return new StreamingMonitorFunction( + TestTableLoader.of(tableDir.getAbsolutePath()), scanContext); } - private AbstractStreamOperatorTestHarness createHarness(StreamingMonitorFunction function) - throws Exception { - StreamSource streamSource = new StreamSource<>(function); + private AbstractStreamOperatorTestHarness createHarness( + StreamingMonitorFunction function) throws Exception { + StreamSource streamSource = + new StreamSource<>(function); return new AbstractStreamOperatorTestHarness<>(streamSource, 1, 1, 0); } @@ -332,14 +347,10 @@ public void collectWithTimestamp(FlinkInputSplit element, long timestamp) { } @Override - public void emitWatermark(Watermark mark) { - - } + public void emitWatermark(Watermark mark) {} @Override - public void markAsTemporarilyIdle() { - - } + public void markAsTemporarilyIdle() {} @Override public Object getCheckpointLock() { @@ -347,14 +358,13 @@ public Object getCheckpointLock() { } @Override - public void close() { - - } + public void close() {} private List toRows() throws IOException { - FlinkInputFormat format = FlinkSource.forRowData() - .tableLoader(TestTableLoader.of(tableDir.getAbsolutePath())) - .buildFormat(); + FlinkInputFormat format = + FlinkSource.forRowData() + .tableLoader(TestTableLoader.of(tableDir.getAbsolutePath())) + 
.buildFormat(); List rows = Lists.newArrayList(); for (FlinkInputSplit split : splits) { diff --git a/flink/v1.13/flink/src/test/java/org/apache/iceberg/flink/source/TestStreamingReaderOperator.java b/flink/v1.13/flink/src/test/java/org/apache/iceberg/flink/source/TestStreamingReaderOperator.java index 19c2b6ad7d76..e51afaa22f9b 100644 --- a/flink/v1.13/flink/src/test/java/org/apache/iceberg/flink/source/TestStreamingReaderOperator.java +++ b/flink/v1.13/flink/src/test/java/org/apache/iceberg/flink/source/TestStreamingReaderOperator.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.flink.source; import java.io.File; @@ -55,18 +54,15 @@ @RunWith(Parameterized.class) public class TestStreamingReaderOperator extends TableTestBase { - private static final Schema SCHEMA = new Schema( - Types.NestedField.required(1, "id", Types.IntegerType.get()), - Types.NestedField.required(2, "data", Types.StringType.get()) - ); + private static final Schema SCHEMA = + new Schema( + Types.NestedField.required(1, "id", Types.IntegerType.get()), + Types.NestedField.required(2, "data", Types.StringType.get())); private static final FileFormat DEFAULT_FORMAT = FileFormat.PARQUET; @Parameterized.Parameters(name = "FormatVersion={0}") public static Iterable parameters() { - return ImmutableList.of( - new Object[] {1}, - new Object[] {2} - ); + return ImmutableList.of(new Object[] {1}, new Object[] {2}); } public TestStreamingReaderOperator(int formatVersion) { @@ -114,7 +110,8 @@ public void testProcessAllRecords() throws Exception { @Test public void testTriggerCheckpoint() throws Exception { - // Received emitted splits: split1, split2, split3, checkpoint request is triggered when reading records from + // Received emitted splits: split1, split2, split3, checkpoint request is triggered when reading + // records from // split1. List> expectedRecords = generateRecordsAndCommitTxn(3); @@ -133,11 +130,11 @@ public void testTriggerCheckpoint() throws Exception { harness.processElement(splits.get(2), ++timestamp); // Trigger snapshot state, it will start to work once all records from split0 are read. - processor.getMainMailboxExecutor() - .execute(() -> harness.snapshot(1, 3), "Trigger snapshot"); + processor.getMainMailboxExecutor().execute(() -> harness.snapshot(1, 3), "Trigger snapshot"); Assert.assertTrue("Should have processed the split0", processor.runMailboxStep()); - Assert.assertTrue("Should have processed the snapshot state action", processor.runMailboxStep()); + Assert.assertTrue( + "Should have processed the snapshot state action", processor.runMailboxStep()); TestHelpers.assertRecords(readOutputValues(harness), expectedRecords.get(0), SCHEMA); @@ -147,8 +144,8 @@ public void testTriggerCheckpoint() throws Exception { // Read records from split2. 
Assert.assertTrue("Should have processed the split2", processor.runMailboxStep()); - TestHelpers.assertRecords(readOutputValues(harness), - Lists.newArrayList(Iterables.concat(expectedRecords)), SCHEMA); + TestHelpers.assertRecords( + readOutputValues(harness), Lists.newArrayList(Iterables.concat(expectedRecords)), SCHEMA); } } @@ -210,7 +207,8 @@ public void testCheckpointRestore() throws Exception { } } - private List readOutputValues(OneInputStreamOperatorTestHarness harness) { + private List readOutputValues( + OneInputStreamOperatorTestHarness harness) { List results = Lists.newArrayList(); for (RowData rowData : harness.extractOutputValues()) { results.add(Row.of(rowData.getInt(0), rowData.getString(1).toString())); @@ -243,15 +241,14 @@ private List generateSplits() { ScanContext scanContext; if (i == snapshotIds.size() - 1) { // Generate the splits from the first snapshot. - scanContext = ScanContext.builder() - .useSnapshotId(snapshotIds.get(i)) - .build(); + scanContext = ScanContext.builder().useSnapshotId(snapshotIds.get(i)).build(); } else { // Generate the splits between the previous snapshot and current snapshot. - scanContext = ScanContext.builder() - .startSnapshotId(snapshotIds.get(i + 1)) - .endSnapshotId(snapshotIds.get(i)) - .build(); + scanContext = + ScanContext.builder() + .startSnapshotId(snapshotIds.get(i + 1)) + .endSnapshotId(snapshotIds.get(i)) + .build(); } Collections.addAll(inputSplits, FlinkSplitGenerator.createInputSplits(table, scanContext)); @@ -260,15 +257,18 @@ private List generateSplits() { return inputSplits; } - private OneInputStreamOperatorTestHarness createReader() throws Exception { + private OneInputStreamOperatorTestHarness createReader() + throws Exception { // This input format is used to opening the emitted split. - FlinkInputFormat inputFormat = FlinkSource.forRowData() - .tableLoader(TestTableLoader.of(tableDir.getAbsolutePath())) - .buildFormat(); - - OneInputStreamOperatorFactory factory = StreamingReaderOperator.factory(inputFormat); - OneInputStreamOperatorTestHarness harness = new OneInputStreamOperatorTestHarness<>( - factory, 1, 1, 0); + FlinkInputFormat inputFormat = + FlinkSource.forRowData() + .tableLoader(TestTableLoader.of(tableDir.getAbsolutePath())) + .buildFormat(); + + OneInputStreamOperatorFactory factory = + StreamingReaderOperator.factory(inputFormat); + OneInputStreamOperatorTestHarness harness = + new OneInputStreamOperatorTestHarness<>(factory, 1, 1, 0); harness.getStreamConfig().setTimeCharacteristic(TimeCharacteristic.ProcessingTime); return harness; diff --git a/flink/v1.14/flink/src/main/java/org/apache/iceberg/flink/CatalogLoader.java b/flink/v1.14/flink/src/main/java/org/apache/iceberg/flink/CatalogLoader.java index 1d53586a2db5..7c098cf20d03 100644 --- a/flink/v1.14/flink/src/main/java/org/apache/iceberg/flink/CatalogLoader.java +++ b/flink/v1.14/flink/src/main/java/org/apache/iceberg/flink/CatalogLoader.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.flink; import java.io.Serializable; @@ -32,21 +31,21 @@ import org.apache.iceberg.relocated.com.google.common.base.Preconditions; import org.apache.iceberg.relocated.com.google.common.collect.Maps; -/** - * Serializable loader to load an Iceberg {@link Catalog}. - */ +/** Serializable loader to load an Iceberg {@link Catalog}. */ public interface CatalogLoader extends Serializable { /** - * Create a new catalog with the provided properties. 
NOTICE: for flink, we may initialize the {@link CatalogLoader} - * at flink sql client side or job manager side, and then serialize this catalog loader to task manager, finally - * deserialize it and create a new catalog at task manager side. + * Create a new catalog with the provided properties. NOTICE: for flink, we may initialize the + * {@link CatalogLoader} at flink sql client side or job manager side, and then serialize this + * catalog loader to task manager, finally deserialize it and create a new catalog at task manager + * side. * * @return a newly created {@link Catalog} */ Catalog loadCatalog(); - static CatalogLoader hadoop(String name, Configuration hadoopConf, Map properties) { + static CatalogLoader hadoop( + String name, Configuration hadoopConf, Map properties) { return new HadoopCatalogLoader(name, hadoopConf, properties); } @@ -54,7 +53,8 @@ static CatalogLoader hive(String name, Configuration hadoopConf, Map properties, Configuration hadoopConf, String impl) { + static CatalogLoader custom( + String name, Map properties, Configuration hadoopConf, String impl) { return new CustomCatalogLoader(name, properties, hadoopConf, impl); } @@ -65,9 +65,7 @@ class HadoopCatalogLoader implements CatalogLoader { private final Map properties; private HadoopCatalogLoader( - String catalogName, - Configuration conf, - Map properties) { + String catalogName, Configuration conf, Map properties) { this.catalogName = catalogName; this.hadoopConf = new SerializableConfiguration(conf); this.warehouseLocation = properties.get(CatalogProperties.WAREHOUSE_LOCATION); @@ -76,7 +74,8 @@ private HadoopCatalogLoader( @Override public Catalog loadCatalog() { - return CatalogUtil.loadCatalog(HadoopCatalog.class.getName(), catalogName, properties, hadoopConf.get()); + return CatalogUtil.loadCatalog( + HadoopCatalog.class.getName(), catalogName, properties, hadoopConf.get()); } @Override @@ -96,20 +95,23 @@ class HiveCatalogLoader implements CatalogLoader { private final int clientPoolSize; private final Map properties; - private HiveCatalogLoader(String catalogName, Configuration conf, Map properties) { + private HiveCatalogLoader( + String catalogName, Configuration conf, Map properties) { this.catalogName = catalogName; this.hadoopConf = new SerializableConfiguration(conf); this.uri = properties.get(CatalogProperties.URI); this.warehouse = properties.get(CatalogProperties.WAREHOUSE_LOCATION); - this.clientPoolSize = properties.containsKey(CatalogProperties.CLIENT_POOL_SIZE) ? - Integer.parseInt(properties.get(CatalogProperties.CLIENT_POOL_SIZE)) : - CatalogProperties.CLIENT_POOL_SIZE_DEFAULT; + this.clientPoolSize = + properties.containsKey(CatalogProperties.CLIENT_POOL_SIZE) + ? 
Integer.parseInt(properties.get(CatalogProperties.CLIENT_POOL_SIZE)) + : CatalogProperties.CLIENT_POOL_SIZE_DEFAULT; this.properties = Maps.newHashMap(properties); } @Override public Catalog loadCatalog() { - return CatalogUtil.loadCatalog(HiveCatalog.class.getName(), catalogName, properties, hadoopConf.get()); + return CatalogUtil.loadCatalog( + HiveCatalog.class.getName(), catalogName, properties, hadoopConf.get()); } @Override @@ -131,14 +133,13 @@ class CustomCatalogLoader implements CatalogLoader { private final String impl; private CustomCatalogLoader( - String name, - Map properties, - Configuration conf, - String impl) { + String name, Map properties, Configuration conf, String impl) { this.hadoopConf = new SerializableConfiguration(conf); this.properties = Maps.newHashMap(properties); // wrap into a hashmap for serialization this.name = name; - this.impl = Preconditions.checkNotNull(impl, "Cannot initialize custom Catalog, impl class name is null"); + this.impl = + Preconditions.checkNotNull( + impl, "Cannot initialize custom Catalog, impl class name is null"); } @Override @@ -148,11 +149,7 @@ public Catalog loadCatalog() { @Override public String toString() { - return MoreObjects.toStringHelper(this) - .add("name", name) - .add("impl", impl) - .toString(); + return MoreObjects.toStringHelper(this).add("name", name).add("impl", impl).toString(); } } - } diff --git a/flink/v1.14/flink/src/main/java/org/apache/iceberg/flink/FlinkCatalog.java b/flink/v1.14/flink/src/main/java/org/apache/iceberg/flink/FlinkCatalog.java index 678a5a9c0dd7..8779e4656da5 100644 --- a/flink/v1.14/flink/src/main/java/org/apache/iceberg/flink/FlinkCatalog.java +++ b/flink/v1.14/flink/src/main/java/org/apache/iceberg/flink/FlinkCatalog.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.flink; import java.io.Closeable; @@ -80,13 +79,14 @@ /** * A Flink Catalog implementation that wraps an Iceberg {@link Catalog}. - *

- * The mapping between Flink database and Iceberg namespace:
- * Supplying a base namespace for a given catalog, so if you have a catalog that supports a 2-level namespace, you
- * would supply the first level in the catalog configuration and the second level would be exposed as Flink databases.
- * <p>
- * The Iceberg table manages its partitions by itself. The partition of the Iceberg table is independent of the
- * partition of Flink.
+ *
+ * <p>The mapping between Flink database and Iceberg namespace: Supplying a base namespace for a
+ * given catalog, so if you have a catalog that supports a 2-level namespace, you would supply the
+ * first level in the catalog configuration and the second level would be exposed as Flink
+ * databases.
+ *
+ * <p>
The Iceberg table manages its partitions by itself. The partition of the Iceberg table is + * independent of the partition of Flink. */ public class FlinkCatalog extends AbstractCatalog { @@ -110,7 +110,8 @@ public FlinkCatalog( Catalog originalCatalog = catalogLoader.loadCatalog(); icebergCatalog = cacheEnabled ? CachingCatalog.wrap(originalCatalog) : originalCatalog; - asNamespaceCatalog = originalCatalog instanceof SupportsNamespaces ? (SupportsNamespaces) originalCatalog : null; + asNamespaceCatalog = + originalCatalog instanceof SupportsNamespaces ? (SupportsNamespaces) originalCatalog : null; closeable = originalCatalog instanceof Closeable ? (Closeable) originalCatalog : null; } @@ -162,7 +163,8 @@ public List listDatabases() throws CatalogException { } @Override - public CatalogDatabase getDatabase(String databaseName) throws DatabaseNotExistException, CatalogException { + public CatalogDatabase getDatabase(String databaseName) + throws DatabaseNotExistException, CatalogException { if (asNamespaceCatalog == null) { if (!getDefaultDatabase().equals(databaseName)) { throw new DatabaseNotExistException(getName(), databaseName); @@ -194,10 +196,12 @@ public boolean databaseExists(String databaseName) throws CatalogException { @Override public void createDatabase(String name, CatalogDatabase database, boolean ignoreIfExists) throws DatabaseAlreadyExistException, CatalogException { - createDatabase(name, mergeComment(database.getProperties(), database.getComment()), ignoreIfExists); + createDatabase( + name, mergeComment(database.getProperties(), database.getComment()), ignoreIfExists); } - private void createDatabase(String databaseName, Map metadata, boolean ignoreIfExists) + private void createDatabase( + String databaseName, Map metadata, boolean ignoreIfExists) throws DatabaseAlreadyExistException, CatalogException { if (asNamespaceCatalog != null) { try { @@ -208,7 +212,8 @@ private void createDatabase(String databaseName, Map metadata, b } } } else { - throw new UnsupportedOperationException("Namespaces are not supported by catalog: " + getName()); + throw new UnsupportedOperationException( + "Namespaces are not supported by catalog: " + getName()); } } @@ -257,7 +262,8 @@ public void alterDatabase(String name, CatalogDatabase newDatabase, boolean igno try { Map oldProperties = asNamespaceCatalog.loadNamespaceMetadata(namespace); - Map newProperties = mergeComment(newDatabase.getProperties(), newDatabase.getComment()); + Map newProperties = + mergeComment(newDatabase.getProperties(), newDatabase.getComment()); for (String key : oldProperties.keySet()) { if (!newProperties.containsKey(key)) { @@ -296,7 +302,8 @@ public void alterDatabase(String name, CatalogDatabase newDatabase, boolean igno } @Override - public List listTables(String databaseName) throws DatabaseNotExistException, CatalogException { + public List listTables(String databaseName) + throws DatabaseNotExistException, CatalogException { try { return icebergCatalog.listTables(toNamespace(databaseName)).stream() .map(TableIdentifier::name) @@ -307,7 +314,8 @@ public List listTables(String databaseName) throws DatabaseNotExistExcep } @Override - public CatalogTable getTable(ObjectPath tablePath) throws TableNotExistException, CatalogException { + public CatalogTable getTable(ObjectPath tablePath) + throws TableNotExistException, CatalogException { Table table = loadIcebergTable(tablePath); return toCatalogTable(table); } @@ -361,10 +369,12 @@ public void renameTable(ObjectPath tablePath, String newTableName, boolean ignor 
@Override public void createTable(ObjectPath tablePath, CatalogBaseTable table, boolean ignoreIfExists) throws CatalogException, TableAlreadyExistException { - if (Objects.equals(table.getOptions().get("connector"), FlinkDynamicTableFactory.FACTORY_IDENTIFIER)) { - throw new IllegalArgumentException("Cannot create the table with 'connector'='iceberg' table property in " + - "an iceberg catalog, Please create table with 'connector'='iceberg' property in a non-iceberg catalog or " + - "create table without 'connector'='iceberg' related properties in an iceberg table."); + if (Objects.equals( + table.getOptions().get("connector"), FlinkDynamicTableFactory.FACTORY_IDENTIFIER)) { + throw new IllegalArgumentException( + "Cannot create the table with 'connector'='iceberg' table property in " + + "an iceberg catalog, Please create table with 'connector'='iceberg' property in a non-iceberg catalog or " + + "create table without 'connector'='iceberg' related properties in an iceberg table."); } createIcebergTable(tablePath, table, ignoreIfExists); @@ -389,11 +399,7 @@ void createIcebergTable(ObjectPath tablePath, CatalogBaseTable table, boolean ig try { icebergCatalog.createTable( - toIdentifier(tablePath), - icebergSchema, - spec, - location, - properties.build()); + toIdentifier(tablePath), icebergSchema, spec, location, properties.build()); } catch (AlreadyExistsException e) { if (!ignoreIfExists) { throw new TableAlreadyExistException(getName(), tablePath, e); @@ -408,15 +414,16 @@ private static void validateTableSchemaAndPartition(CatalogTable ct1, CatalogTab if (ts1.getPrimaryKey().isPresent() && ts2.getPrimaryKey().isPresent()) { equalsPrimary = - Objects.equals(ts1.getPrimaryKey().get().getType(), ts2.getPrimaryKey().get().getType()) && - Objects.equals(ts1.getPrimaryKey().get().getColumns(), ts2.getPrimaryKey().get().getColumns()); + Objects.equals(ts1.getPrimaryKey().get().getType(), ts2.getPrimaryKey().get().getType()) + && Objects.equals( + ts1.getPrimaryKey().get().getColumns(), ts2.getPrimaryKey().get().getColumns()); } else if (!ts1.getPrimaryKey().isPresent() && !ts2.getPrimaryKey().isPresent()) { equalsPrimary = true; } - if (!(Objects.equals(ts1.getTableColumns(), ts2.getTableColumns()) && - Objects.equals(ts1.getWatermarkSpecs(), ts2.getWatermarkSpecs()) && - equalsPrimary)) { + if (!(Objects.equals(ts1.getTableColumns(), ts2.getTableColumns()) + && Objects.equals(ts1.getWatermarkSpecs(), ts2.getWatermarkSpecs()) + && equalsPrimary)) { throw new UnsupportedOperationException("Altering schema is not supported yet."); } @@ -445,7 +452,8 @@ public void alterTable(ObjectPath tablePath, CatalogBaseTable newTable, boolean // Currently, Flink SQL only support altering table properties. - // For current Flink Catalog API, support for adding/removing/renaming columns cannot be done by comparing + // For current Flink Catalog API, support for adding/removing/renaming columns cannot be done by + // comparing // CatalogTable instances, unless the Flink schema contains Iceberg column IDs. 
validateTableSchemaAndPartition(table, (CatalogTable) newTable); @@ -475,27 +483,36 @@ public void alterTable(ObjectPath tablePath, CatalogBaseTable newTable, boolean } } - oldProperties.keySet().forEach(k -> { - if (!newTable.getOptions().containsKey(k)) { - setProperties.put(k, null); - } - }); + oldProperties + .keySet() + .forEach( + k -> { + if (!newTable.getOptions().containsKey(k)) { + setProperties.put(k, null); + } + }); commitChanges(icebergTable, setLocation, setSnapshotId, pickSnapshotId, setProperties); } private static void validateFlinkTable(CatalogBaseTable table) { - Preconditions.checkArgument(table instanceof CatalogTable, "The Table should be a CatalogTable."); + Preconditions.checkArgument( + table instanceof CatalogTable, "The Table should be a CatalogTable."); TableSchema schema = table.getSchema(); - schema.getTableColumns().forEach(column -> { - if (!FlinkCompatibilityUtil.isPhysicalColumn(column)) { - throw new UnsupportedOperationException("Creating table with computed columns is not supported yet."); - } - }); + schema + .getTableColumns() + .forEach( + column -> { + if (!FlinkCompatibilityUtil.isPhysicalColumn(column)) { + throw new UnsupportedOperationException( + "Creating table with computed columns is not supported yet."); + } + }); if (!schema.getWatermarkSpecs().isEmpty()) { - throw new UnsupportedOperationException("Creating table with watermark specs is not supported yet."); + throw new UnsupportedOperationException( + "Creating table with watermark specs is not supported yet."); } } @@ -520,11 +537,17 @@ private static List toPartitionKeys(PartitionSpec spec, Schema icebergSc return partitionKeysBuilder.build(); } - private static void commitChanges(Table table, String setLocation, String setSnapshotId, - String pickSnapshotId, Map setProperties) { - // don't allow setting the snapshot and picking a commit at the same time because order is ambiguous and choosing + private static void commitChanges( + Table table, + String setLocation, + String setSnapshotId, + String pickSnapshotId, + Map setProperties) { + // don't allow setting the snapshot and picking a commit at the same time because order is + // ambiguous and choosing // one order leads to different results - Preconditions.checkArgument(setSnapshotId == null || pickSnapshotId == null, + Preconditions.checkArgument( + setSnapshotId == null || pickSnapshotId == null, "Cannot set the current snapshot ID and cherry-pick snapshot changes"); if (setSnapshotId != null) { @@ -541,20 +564,19 @@ private static void commitChanges(Table table, String setLocation, String setSna Transaction transaction = table.newTransaction(); if (setLocation != null) { - transaction.updateLocation() - .setLocation(setLocation) - .commit(); + transaction.updateLocation().setLocation(setLocation).commit(); } if (!setProperties.isEmpty()) { UpdateProperties updateProperties = transaction.updateProperties(); - setProperties.forEach((k, v) -> { - if (v == null) { - updateProperties.remove(k); - } else { - updateProperties.set(k, v); - } - }); + setProperties.forEach( + (k, v) -> { + if (v == null) { + updateProperties.remove(k); + } else { + updateProperties.set(k, v); + } + }); updateProperties.commit(); } @@ -565,7 +587,8 @@ static CatalogTable toCatalogTable(Table table) { TableSchema schema = FlinkSchemaUtil.toSchema(table.schema()); List partitionKeys = toPartitionKeys(table.spec(), table.schema()); - // NOTE: We can not create a IcebergCatalogTable extends CatalogTable, because Flink optimizer may use + // NOTE: We can not 
create a IcebergCatalogTable extends CatalogTable, because Flink optimizer + // may use // CatalogTableImpl to copy a new catalog table. // Let's re-loading table from Iceberg catalog when creating source/sink operators. // Iceberg does not have Table comment, so pass a null (Default comment value in Flink). @@ -581,7 +604,8 @@ CatalogLoader getCatalogLoader() { return catalogLoader; } - // ------------------------------ Unsupported methods --------------------------------------------- + // ------------------------------ Unsupported methods + // --------------------------------------------- @Override public List listViews(String databaseName) throws CatalogException { @@ -595,25 +619,35 @@ public CatalogPartition getPartition(ObjectPath tablePath, CatalogPartitionSpec } @Override - public boolean partitionExists(ObjectPath tablePath, CatalogPartitionSpec partitionSpec) throws CatalogException { + public boolean partitionExists(ObjectPath tablePath, CatalogPartitionSpec partitionSpec) + throws CatalogException { throw new UnsupportedOperationException(); } @Override - public void createPartition(ObjectPath tablePath, CatalogPartitionSpec partitionSpec, CatalogPartition partition, - boolean ignoreIfExists) throws CatalogException { + public void createPartition( + ObjectPath tablePath, + CatalogPartitionSpec partitionSpec, + CatalogPartition partition, + boolean ignoreIfExists) + throws CatalogException { throw new UnsupportedOperationException(); } @Override - public void dropPartition(ObjectPath tablePath, CatalogPartitionSpec partitionSpec, boolean ignoreIfNotExists) + public void dropPartition( + ObjectPath tablePath, CatalogPartitionSpec partitionSpec, boolean ignoreIfNotExists) throws CatalogException { throw new UnsupportedOperationException(); } @Override - public void alterPartition(ObjectPath tablePath, CatalogPartitionSpec partitionSpec, CatalogPartition newPartition, - boolean ignoreIfNotExists) throws CatalogException { + public void alterPartition( + ObjectPath tablePath, + CatalogPartitionSpec partitionSpec, + CatalogPartition newPartition, + boolean ignoreIfNotExists) + throws CatalogException { throw new UnsupportedOperationException(); } @@ -623,7 +657,8 @@ public List listFunctions(String dbName) throws CatalogException { } @Override - public CatalogFunction getFunction(ObjectPath functionPath) throws FunctionNotExistException, CatalogException { + public CatalogFunction getFunction(ObjectPath functionPath) + throws FunctionNotExistException, CatalogException { throw new FunctionNotExistException(getName(), functionPath); } @@ -633,13 +668,15 @@ public boolean functionExists(ObjectPath functionPath) throws CatalogException { } @Override - public void createFunction(ObjectPath functionPath, CatalogFunction function, boolean ignoreIfExists) + public void createFunction( + ObjectPath functionPath, CatalogFunction function, boolean ignoreIfExists) throws CatalogException { throw new UnsupportedOperationException(); } @Override - public void alterFunction(ObjectPath functionPath, CatalogFunction newFunction, boolean ignoreIfNotExists) + public void alterFunction( + ObjectPath functionPath, CatalogFunction newFunction, boolean ignoreIfNotExists) throws CatalogException { throw new UnsupportedOperationException(); } @@ -651,26 +688,36 @@ public void dropFunction(ObjectPath functionPath, boolean ignoreIfNotExists) } @Override - public void alterTableStatistics(ObjectPath tablePath, CatalogTableStatistics tableStatistics, - boolean ignoreIfNotExists) throws CatalogException { + 
public void alterTableStatistics( + ObjectPath tablePath, CatalogTableStatistics tableStatistics, boolean ignoreIfNotExists) + throws CatalogException { throw new UnsupportedOperationException(); } @Override - public void alterTableColumnStatistics(ObjectPath tablePath, CatalogColumnStatistics columnStatistics, - boolean ignoreIfNotExists) throws CatalogException { + public void alterTableColumnStatistics( + ObjectPath tablePath, CatalogColumnStatistics columnStatistics, boolean ignoreIfNotExists) + throws CatalogException { throw new UnsupportedOperationException(); } @Override - public void alterPartitionStatistics(ObjectPath tablePath, CatalogPartitionSpec partitionSpec, - CatalogTableStatistics partitionStatistics, boolean ignoreIfNotExists) throws CatalogException { + public void alterPartitionStatistics( + ObjectPath tablePath, + CatalogPartitionSpec partitionSpec, + CatalogTableStatistics partitionStatistics, + boolean ignoreIfNotExists) + throws CatalogException { throw new UnsupportedOperationException(); } @Override - public void alterPartitionColumnStatistics(ObjectPath tablePath, CatalogPartitionSpec partitionSpec, - CatalogColumnStatistics columnStatistics, boolean ignoreIfNotExists) throws CatalogException { + public void alterPartitionColumnStatistics( + ObjectPath tablePath, + CatalogPartitionSpec partitionSpec, + CatalogColumnStatistics columnStatistics, + boolean ignoreIfNotExists) + throws CatalogException { throw new UnsupportedOperationException(); } @@ -695,31 +742,32 @@ public List listPartitions(ObjectPath tablePath) set.add(new CatalogPartitionSpec(map)); } } catch (IOException e) { - throw new CatalogException(String.format("Failed to list partitions of table %s", tablePath), e); + throw new CatalogException( + String.format("Failed to list partitions of table %s", tablePath), e); } return Lists.newArrayList(set); } @Override - public List listPartitions(ObjectPath tablePath, CatalogPartitionSpec partitionSpec) - throws CatalogException { + public List listPartitions( + ObjectPath tablePath, CatalogPartitionSpec partitionSpec) throws CatalogException { throw new UnsupportedOperationException(); } @Override - public List listPartitionsByFilter(ObjectPath tablePath, List filters) - throws CatalogException { + public List listPartitionsByFilter( + ObjectPath tablePath, List filters) throws CatalogException { throw new UnsupportedOperationException(); } - // After partition pruning and filter push down, the statistics have become very inaccurate, so the statistics from + // After partition pruning and filter push down, the statistics have become very inaccurate, so + // the statistics from // here are of little significance. // Flink will support something like SupportsReportStatistics in future. 
@Override - public CatalogTableStatistics getTableStatistics(ObjectPath tablePath) - throws CatalogException { + public CatalogTableStatistics getTableStatistics(ObjectPath tablePath) throws CatalogException { return CatalogTableStatistics.UNKNOWN; } @@ -730,14 +778,14 @@ public CatalogColumnStatistics getTableColumnStatistics(ObjectPath tablePath) } @Override - public CatalogTableStatistics getPartitionStatistics(ObjectPath tablePath, CatalogPartitionSpec partitionSpec) - throws CatalogException { + public CatalogTableStatistics getPartitionStatistics( + ObjectPath tablePath, CatalogPartitionSpec partitionSpec) throws CatalogException { return CatalogTableStatistics.UNKNOWN; } @Override - public CatalogColumnStatistics getPartitionColumnStatistics(ObjectPath tablePath, CatalogPartitionSpec partitionSpec) - throws CatalogException { + public CatalogColumnStatistics getPartitionColumnStatistics( + ObjectPath tablePath, CatalogPartitionSpec partitionSpec) throws CatalogException { return CatalogColumnStatistics.UNKNOWN; } } diff --git a/flink/v1.14/flink/src/main/java/org/apache/iceberg/flink/FlinkCatalogFactory.java b/flink/v1.14/flink/src/main/java/org/apache/iceberg/flink/FlinkCatalogFactory.java index ffa54c0eb95d..1047a5067d4c 100644 --- a/flink/v1.14/flink/src/main/java/org/apache/iceberg/flink/FlinkCatalogFactory.java +++ b/flink/v1.14/flink/src/main/java/org/apache/iceberg/flink/FlinkCatalogFactory.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.flink; import java.net.URL; @@ -40,20 +39,22 @@ /** * A Flink Catalog factory implementation that creates {@link FlinkCatalog}. - *
- * This supports the following catalog configuration options:
+ *
+ * <p>This supports the following catalog configuration options:
+ *
  * <ul>
- *   <li>type - Flink catalog factory key, should be "iceberg"</li>
- *   <li>catalog-type - iceberg catalog type, "hive" or "hadoop"</li>
- *   <li>uri - the Hive Metastore URI (Hive catalog only)</li>
- *   <li>clients - the Hive Client Pool Size (Hive catalog only)</li>
- *   <li>warehouse - the warehouse path (Hadoop catalog only)</li>
- *   <li>default-database - a database name to use as the default</li>
- *   <li>base-namespace - a base namespace as the prefix for all databases (Hadoop catalog only)</li>
- *   <li>cache-enabled - whether to enable catalog cache</li>
+ *   <li>type - Flink catalog factory key, should be "iceberg"
+ *   <li>catalog-type - iceberg catalog type, "hive" or "hadoop"
+ *   <li>uri - the Hive Metastore URI (Hive catalog only)
+ *   <li>clients - the Hive Client Pool Size (Hive catalog only)
+ *   <li>warehouse - the warehouse path (Hadoop catalog only)
+ *   <li>default-database - a database name to use as the default
+ *   <li>base-namespace - a base namespace as the prefix for all databases (Hadoop
+ *       catalog only)
+ *   <li>cache-enabled - whether to enable catalog cache
  * </ul>
- * <p>
- * To use a custom catalog that is not a Hive or Hadoop catalog, extend this class and override
+ *
+ * <p>
To use a custom catalog that is not a Hive or Hadoop catalog, extend this class and override * {@link #createCatalogLoader(String, Map, Configuration)}. */ public class FlinkCatalogFactory implements CatalogFactory { @@ -73,27 +74,33 @@ public class FlinkCatalogFactory implements CatalogFactory { public static final String PROPERTY_VERSION = "property-version"; /** - * Create an Iceberg {@link org.apache.iceberg.catalog.Catalog} loader to be used by this Flink catalog adapter. + * Create an Iceberg {@link org.apache.iceberg.catalog.Catalog} loader to be used by this Flink + * catalog adapter. * - * @param name Flink's catalog name + * @param name Flink's catalog name * @param properties Flink's catalog properties * @param hadoopConf Hadoop configuration for catalog * @return an Iceberg catalog loader */ - static CatalogLoader createCatalogLoader(String name, Map properties, Configuration hadoopConf) { + static CatalogLoader createCatalogLoader( + String name, Map properties, Configuration hadoopConf) { String catalogImpl = properties.get(CatalogProperties.CATALOG_IMPL); if (catalogImpl != null) { String catalogType = properties.get(ICEBERG_CATALOG_TYPE); - Preconditions.checkArgument(catalogType == null, + Preconditions.checkArgument( + catalogType == null, "Cannot create catalog %s, both catalog-type and catalog-impl are set: catalog-type=%s, catalog-impl=%s", - name, catalogType, catalogImpl); + name, + catalogType, + catalogImpl); return CatalogLoader.custom(name, properties, hadoopConf, catalogImpl); } String catalogType = properties.getOrDefault(ICEBERG_CATALOG_TYPE, ICEBERG_CATALOG_TYPE_HIVE); switch (catalogType.toLowerCase(Locale.ENGLISH)) { case ICEBERG_CATALOG_TYPE_HIVE: - // The values of properties 'uri', 'warehouse', 'hive-conf-dir' are allowed to be null, in that case it will + // The values of properties 'uri', 'warehouse', 'hive-conf-dir' are allowed to be null, in + // that case it will // fallback to parse those values from hadoop configuration which is loaded from classpath. 
String hiveConfDir = properties.get(HIVE_CONF_DIR); Configuration newHadoopConf = mergeHiveConf(hadoopConf, hiveConfDir); @@ -103,8 +110,8 @@ static CatalogLoader createCatalogLoader(String name, Map proper return CatalogLoader.hadoop(name, hadoopConf, properties); default: - throw new UnsupportedOperationException("Unknown catalog-type: " + catalogType + - " (Must be 'hive' or 'hadoop')"); + throw new UnsupportedOperationException( + "Unknown catalog-type: " + catalogType + " (Must be 'hive' or 'hadoop')"); } } @@ -126,7 +133,8 @@ public Catalog createCatalog(String name, Map properties) { return createCatalog(name, properties, clusterHadoopConf()); } - protected Catalog createCatalog(String name, Map properties, Configuration hadoopConf) { + protected Catalog createCatalog( + String name, Map properties, Configuration hadoopConf) { CatalogLoader catalogLoader = createCatalogLoader(name, properties, hadoopConf); String defaultDatabase = properties.getOrDefault(DEFAULT_DATABASE, DEFAULT_DATABASE_NAME); @@ -142,11 +150,14 @@ protected Catalog createCatalog(String name, Map properties, Con private static Configuration mergeHiveConf(Configuration hadoopConf, String hiveConfDir) { Configuration newConf = new Configuration(hadoopConf); if (!Strings.isNullOrEmpty(hiveConfDir)) { - Preconditions.checkState(Files.exists(Paths.get(hiveConfDir, "hive-site.xml")), - "There should be a hive-site.xml file under the directory %s", hiveConfDir); + Preconditions.checkState( + Files.exists(Paths.get(hiveConfDir, "hive-site.xml")), + "There should be a hive-site.xml file under the directory %s", + hiveConfDir); newConf.addResource(new Path(hiveConfDir, "hive-site.xml")); } else { - // If don't provide the hive-site.xml path explicitly, it will try to load resource from classpath. If still + // If don't provide the hive-site.xml path explicitly, it will try to load resource from + // classpath. If still // couldn't load the configuration file, then it will throw exception in HiveCatalog. URL configFile = CatalogLoader.class.getClassLoader().getResource("hive-site.xml"); if (configFile != null) { diff --git a/flink/v1.14/flink/src/main/java/org/apache/iceberg/flink/FlinkConfParser.java b/flink/v1.14/flink/src/main/java/org/apache/iceberg/flink/FlinkConfParser.java index e984f6875920..83fa09de544c 100644 --- a/flink/v1.14/flink/src/main/java/org/apache/iceberg/flink/FlinkConfParser.java +++ b/flink/v1.14/flink/src/main/java/org/apache/iceberg/flink/FlinkConfParser.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.flink; import java.util.List; diff --git a/flink/v1.14/flink/src/main/java/org/apache/iceberg/flink/FlinkConfigOptions.java b/flink/v1.14/flink/src/main/java/org/apache/iceberg/flink/FlinkConfigOptions.java index 796a709d51e0..b57bf03d3379 100644 --- a/flink/v1.14/flink/src/main/java/org/apache/iceberg/flink/FlinkConfigOptions.java +++ b/flink/v1.14/flink/src/main/java/org/apache/iceberg/flink/FlinkConfigOptions.java @@ -16,10 +16,8 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.flink; - import org.apache.flink.configuration.ConfigOption; import org.apache.flink.configuration.ConfigOptions; import org.apache.flink.configuration.Configuration; @@ -27,8 +25,9 @@ import org.apache.iceberg.util.ThreadPools; /** - * When constructing Flink Iceberg source via Java API, - * configs can be set in {@link Configuration} passed to source builder. E.g. 
+ * When constructing Flink Iceberg source via Java API, configs can be set in {@link Configuration}
+ * passed to source builder. E.g.
+ *
+ * <pre>
  *   configuration.setBoolean(FlinkConfigOptions.TABLE_EXEC_ICEBERG_INFER_SOURCE_PARALLELISM, true);
  *   FlinkSource.forRowData()
@@ -36,9 +35,9 @@
  *       ...
  * </pre>
  *
- *
+ * <p>When using Flink SQL/table API, connector options can be set in Flink's {@link
+ * TableEnvironment}.
  *
- * When using Flink SQL/table API, connector options can be set in Flink's {@link TableEnvironment}.
  * <pre>
  *   TableEnvironment tEnv = createTableEnv();
  *   tEnv.getConfig()
@@ -48,15 +47,15 @@
  */
 public class FlinkConfigOptions {
 
-  private FlinkConfigOptions() {
-  }
+  private FlinkConfigOptions() {}
 
   public static final ConfigOption<Boolean> TABLE_EXEC_ICEBERG_INFER_SOURCE_PARALLELISM =
       ConfigOptions.key("table.exec.iceberg.infer-source-parallelism")
           .booleanType()
           .defaultValue(true)
-          .withDescription("If is false, parallelism of source are set by config.\n" +
-              "If is true, source parallelism is inferred according to splits number.\n");
+          .withDescription(
+              "If is false, parallelism of source are set by config.\n"
+                  + "If is true, source parallelism is inferred according to splits number.\n");
 
   public static final ConfigOption TABLE_EXEC_ICEBERG_INFER_SOURCE_PARALLELISM_MAX =
       ConfigOptions.key("table.exec.iceberg.infer-source-parallelism.max")
@@ -68,13 +67,14 @@ private FlinkConfigOptions() {
       ConfigOptions.key("table.exec.iceberg.expose-split-locality-info")
           .booleanType()
           .noDefaultValue()
-          .withDescription("Expose split host information to use Flink's locality aware split assigner.");
+          .withDescription(
+              "Expose split host information to use Flink's locality aware split assigner.");
 
-  public static final ConfigOption<Integer> SOURCE_READER_FETCH_BATCH_RECORD_COUNT = ConfigOptions
-      .key("table.exec.iceberg.fetch-batch-record-count")
-      .intType()
-      .defaultValue(2048)
-      .withDescription("The target number of records for Iceberg reader fetch batch.");
+  public static final ConfigOption<Integer> SOURCE_READER_FETCH_BATCH_RECORD_COUNT =
+      ConfigOptions.key("table.exec.iceberg.fetch-batch-record-count")
+          .intType()
+          .defaultValue(2048)
+          .withDescription("The target number of records for Iceberg reader fetch batch.");
 
   public static final ConfigOption TABLE_EXEC_ICEBERG_WORKER_POOL_SIZE =
       ConfigOptions.key("table.exec.iceberg.worker-pool-size")
diff --git a/flink/v1.14/flink/src/main/java/org/apache/iceberg/flink/FlinkDynamicTableFactory.java b/flink/v1.14/flink/src/main/java/org/apache/iceberg/flink/FlinkDynamicTableFactory.java
index f8250dc48efa..0c3cd3f69afc 100644
--- a/flink/v1.14/flink/src/main/java/org/apache/iceberg/flink/FlinkDynamicTableFactory.java
+++ b/flink/v1.14/flink/src/main/java/org/apache/iceberg/flink/FlinkDynamicTableFactory.java
@@ -16,7 +16,6 @@
  * specific language governing permissions and limitations
  * under the License.
  */
-
 package org.apache.iceberg.flink;
 
 import java.util.Map;
@@ -43,7 +42,8 @@
 import org.apache.iceberg.relocated.com.google.common.collect.Maps;
 import org.apache.iceberg.relocated.com.google.common.collect.Sets;
 
-public class FlinkDynamicTableFactory implements DynamicTableSinkFactory, DynamicTableSourceFactory {
+public class FlinkDynamicTableFactory
+    implements DynamicTableSinkFactory, DynamicTableSourceFactory {
   static final String FACTORY_IDENTIFIER = "iceberg";
 
   private static final ConfigOption<String> CATALOG_NAME =
@@ -91,8 +91,12 @@ public DynamicTableSource createDynamicTableSource(Context context) {
     if (catalog != null) {
       tableLoader = createTableLoader(catalog, objectIdentifier.toObjectPath());
     } else {
-      tableLoader = createTableLoader(catalogTable, tableProps, objectIdentifier.getDatabaseName(),
-          objectIdentifier.getObjectName());
+      tableLoader =
+          createTableLoader(
+              catalogTable,
+              tableProps,
+              objectIdentifier.getDatabaseName(),
+              objectIdentifier.getObjectName());
     }
 
     return new IcebergTableSource(tableLoader, tableSchema, tableProps, context.getConfiguration());
@@ -109,8 +113,9 @@ public DynamicTableSink createDynamicTableSink(Context context) {
     if (catalog != null) {
       tableLoader = createTableLoader(catalog, objectPath);
     } else {
-      tableLoader = createTableLoader(catalogTable, tableProps, objectPath.getDatabaseName(),
-          objectPath.getObjectName());
+      tableLoader =
+          createTableLoader(
+              catalogTable, tableProps, objectPath.getDatabaseName(), objectPath.getObjectName());
     }
 
     return new IcebergTableSink(tableLoader, tableSchema, context.getConfiguration());
@@ -137,15 +142,17 @@ public String factoryIdentifier() {
     return FACTORY_IDENTIFIER;
   }
 
-  private static TableLoader createTableLoader(CatalogBaseTable catalogBaseTable,
-                                               Map tableProps,
-                                               String databaseName,
-                                               String tableName) {
+  private static TableLoader createTableLoader(
+      CatalogBaseTable catalogBaseTable,
+      Map tableProps,
+      String databaseName,
+      String tableName) {
     Configuration flinkConf = new Configuration();
     tableProps.forEach(flinkConf::setString);
 
     String catalogName = flinkConf.getString(CATALOG_NAME);
-    Preconditions.checkNotNull(catalogName, "Table property '%s' cannot be null", CATALOG_NAME.key());
+    Preconditions.checkNotNull(
+        catalogName, "Table property '%s' cannot be null", CATALOG_NAME.key());
 
     String catalogDatabase = flinkConf.getString(CATALOG_DATABASE, databaseName);
     Preconditions.checkNotNull(catalogDatabase, "The iceberg database name cannot be null");
@@ -155,15 +162,20 @@ private static TableLoader createTableLoader(CatalogBaseTable catalogBaseTable,
 
     org.apache.hadoop.conf.Configuration hadoopConf = FlinkCatalogFactory.clusterHadoopConf();
     FlinkCatalogFactory factory = new FlinkCatalogFactory();
-    FlinkCatalog flinkCatalog = (FlinkCatalog) factory.createCatalog(catalogName, tableProps, hadoopConf);
+    FlinkCatalog flinkCatalog =
+        (FlinkCatalog) factory.createCatalog(catalogName, tableProps, hadoopConf);
     ObjectPath objectPath = new ObjectPath(catalogDatabase, catalogTable);
 
     // Create database if not exists in the external catalog.
     if (!flinkCatalog.databaseExists(catalogDatabase)) {
       try {
-        flinkCatalog.createDatabase(catalogDatabase, new CatalogDatabaseImpl(Maps.newHashMap(), null), true);
+        flinkCatalog.createDatabase(
+            catalogDatabase, new CatalogDatabaseImpl(Maps.newHashMap(), null), true);
       } catch (DatabaseAlreadyExistException e) {
-        throw new AlreadyExistsException(e, "Database %s already exists in the iceberg catalog %s.", catalogName,
+        throw new AlreadyExistsException(
+            e,
+            "Database %s already exists in the iceberg catalog %s.",
+            catalogName,
             catalogDatabase);
       }
     }
@@ -173,12 +185,17 @@ private static TableLoader createTableLoader(CatalogBaseTable catalogBaseTable,
       try {
         flinkCatalog.createIcebergTable(objectPath, catalogBaseTable, true);
       } catch (TableAlreadyExistException e) {
-        throw new AlreadyExistsException(e, "Table %s already exists in the database %s and catalog %s",
-            catalogTable, catalogDatabase, catalogName);
+        throw new AlreadyExistsException(
+            e,
+            "Table %s already exists in the database %s and catalog %s",
+            catalogTable,
+            catalogDatabase,
+            catalogName);
       }
     }
 
-    return TableLoader.fromCatalog(flinkCatalog.getCatalogLoader(), TableIdentifier.of(catalogDatabase, catalogTable));
+    return TableLoader.fromCatalog(
+        flinkCatalog.getCatalogLoader(), TableIdentifier.of(catalogDatabase, catalogTable));
   }
 
   private static TableLoader createTableLoader(FlinkCatalog catalog, ObjectPath objectPath) {
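For reference, a minimal sketch of the loader plumbing this factory relies on: a CatalogLoader plus a TableIdentifier are turned into a serializable TableLoader via TableLoader.fromCatalog, the same call used in createTableLoader above. The catalog name, warehouse path, and table identifier below are placeholder assumptions, and the table is assumed to already exist in that catalog:

    import java.util.Map;
    import org.apache.hadoop.conf.Configuration;
    import org.apache.iceberg.Table;
    import org.apache.iceberg.catalog.TableIdentifier;
    import org.apache.iceberg.flink.CatalogLoader;
    import org.apache.iceberg.flink.TableLoader;
    import org.apache.iceberg.relocated.com.google.common.collect.Maps;

    public class TableLoaderSketch {
      public static void main(String[] args) throws Exception {
        // Placeholder Hadoop catalog rooted at a local warehouse directory.
        Map<String, String> properties = Maps.newHashMap();
        properties.put("warehouse", "file:///tmp/iceberg/warehouse");
        CatalogLoader catalogLoader =
            CatalogLoader.hadoop("demo_catalog", new Configuration(), properties);

        // Same call the factory uses to hand a serializable loader to source/sink operators.
        try (TableLoader tableLoader =
            TableLoader.fromCatalog(catalogLoader, TableIdentifier.of("db", "tbl"))) {
          tableLoader.open();
          Table table = tableLoader.loadTable();
          System.out.println(table.location());
        }
      }
    }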
diff --git a/flink/v1.14/flink/src/main/java/org/apache/iceberg/flink/FlinkFilters.java b/flink/v1.14/flink/src/main/java/org/apache/iceberg/flink/FlinkFilters.java
index 5e5c9c1fe0fb..717de9ef5acc 100644
--- a/flink/v1.14/flink/src/main/java/org/apache/iceberg/flink/FlinkFilters.java
+++ b/flink/v1.14/flink/src/main/java/org/apache/iceberg/flink/FlinkFilters.java
@@ -16,7 +16,6 @@
  * specific language governing permissions and limitations
  * under the License.
  */
-
 package org.apache.iceberg.flink;
 
 import java.time.Instant;
@@ -43,38 +42,38 @@
 import org.apache.iceberg.util.NaNUtil;
 
 public class FlinkFilters {
-  private FlinkFilters() {
-  }
+  private FlinkFilters() {}
 
   private static final Pattern STARTS_WITH_PATTERN = Pattern.compile("([^%]+)%");
 
-  private static final Map FILTERS = ImmutableMap
-      .builder()
-      .put(BuiltInFunctionDefinitions.EQUALS, Operation.EQ)
-      .put(BuiltInFunctionDefinitions.NOT_EQUALS, Operation.NOT_EQ)
-      .put(BuiltInFunctionDefinitions.GREATER_THAN, Operation.GT)
-      .put(BuiltInFunctionDefinitions.GREATER_THAN_OR_EQUAL, Operation.GT_EQ)
-      .put(BuiltInFunctionDefinitions.LESS_THAN, Operation.LT)
-      .put(BuiltInFunctionDefinitions.LESS_THAN_OR_EQUAL, Operation.LT_EQ)
-      .put(BuiltInFunctionDefinitions.IS_NULL, Operation.IS_NULL)
-      .put(BuiltInFunctionDefinitions.IS_NOT_NULL, Operation.NOT_NULL)
-      .put(BuiltInFunctionDefinitions.AND, Operation.AND)
-      .put(BuiltInFunctionDefinitions.OR, Operation.OR)
-      .put(BuiltInFunctionDefinitions.NOT, Operation.NOT)
-      .put(BuiltInFunctionDefinitions.LIKE, Operation.STARTS_WITH)
-      .build();
+  private static final Map FILTERS =
+      ImmutableMap.builder()
+          .put(BuiltInFunctionDefinitions.EQUALS, Operation.EQ)
+          .put(BuiltInFunctionDefinitions.NOT_EQUALS, Operation.NOT_EQ)
+          .put(BuiltInFunctionDefinitions.GREATER_THAN, Operation.GT)
+          .put(BuiltInFunctionDefinitions.GREATER_THAN_OR_EQUAL, Operation.GT_EQ)
+          .put(BuiltInFunctionDefinitions.LESS_THAN, Operation.LT)
+          .put(BuiltInFunctionDefinitions.LESS_THAN_OR_EQUAL, Operation.LT_EQ)
+          .put(BuiltInFunctionDefinitions.IS_NULL, Operation.IS_NULL)
+          .put(BuiltInFunctionDefinitions.IS_NOT_NULL, Operation.NOT_NULL)
+          .put(BuiltInFunctionDefinitions.AND, Operation.AND)
+          .put(BuiltInFunctionDefinitions.OR, Operation.OR)
+          .put(BuiltInFunctionDefinitions.NOT, Operation.NOT)
+          .put(BuiltInFunctionDefinitions.LIKE, Operation.STARTS_WITH)
+          .build();
 
   /**
    * Convert flink expression to iceberg expression.
-   * <p>
-   * the BETWEEN, NOT_BETWEEN, IN expression will be converted by flink automatically. the BETWEEN will be converted to
-   * (GT_EQ AND LT_EQ), the NOT_BETWEEN will be converted to (LT_EQ OR GT_EQ), the IN will be converted to OR, so we do
-   * not add the conversion here
+   *
+   * <p>
the BETWEEN, NOT_BETWEEN, IN expression will be converted by flink automatically. the + * BETWEEN will be converted to (GT_EQ AND LT_EQ), the NOT_BETWEEN will be converted to (LT_EQ OR + * GT_EQ), the IN will be converted to OR, so we do not add the conversion here * * @param flinkExpression the flink expression * @return the iceberg expression */ - public static Optional convert(org.apache.flink.table.expressions.Expression flinkExpression) { + public static Optional convert( + org.apache.flink.table.expressions.Expression flinkExpression) { if (!(flinkExpression instanceof CallExpression)) { return Optional.empty(); } @@ -97,34 +96,42 @@ public static Optional convert(org.apache.flink.table.expressions.Ex return convertFieldAndLiteral(Expressions::lessThan, Expressions::greaterThan, call); case LT_EQ: - return convertFieldAndLiteral(Expressions::lessThanOrEqual, Expressions::greaterThanOrEqual, call); + return convertFieldAndLiteral( + Expressions::lessThanOrEqual, Expressions::greaterThanOrEqual, call); case GT: return convertFieldAndLiteral(Expressions::greaterThan, Expressions::lessThan, call); case GT_EQ: - return convertFieldAndLiteral(Expressions::greaterThanOrEqual, Expressions::lessThanOrEqual, call); + return convertFieldAndLiteral( + Expressions::greaterThanOrEqual, Expressions::lessThanOrEqual, call); case EQ: - return convertFieldAndLiteral((ref, lit) -> { - if (NaNUtil.isNaN(lit)) { - return Expressions.isNaN(ref); - } else { - return Expressions.equal(ref, lit); - } - }, call); + return convertFieldAndLiteral( + (ref, lit) -> { + if (NaNUtil.isNaN(lit)) { + return Expressions.isNaN(ref); + } else { + return Expressions.equal(ref, lit); + } + }, + call); case NOT_EQ: - return convertFieldAndLiteral((ref, lit) -> { - if (NaNUtil.isNaN(lit)) { - return Expressions.notNaN(ref); - } else { - return Expressions.notEqual(ref, lit); - } - }, call); + return convertFieldAndLiteral( + (ref, lit) -> { + if (NaNUtil.isNaN(lit)) { + return Expressions.notNaN(ref); + } else { + return Expressions.notEqual(ref, lit); + } + }, + call); case NOT: - return onlyChildAs(call, CallExpression.class).flatMap(FlinkFilters::convert).map(Expressions::not); + return onlyChildAs(call, CallExpression.class) + .flatMap(FlinkFilters::convert) + .map(Expressions::not); case AND: return convertLogicExpression(Expressions::and, call); @@ -140,8 +147,8 @@ public static Optional convert(org.apache.flink.table.expressions.Ex return Optional.empty(); } - private static Optional onlyChildAs(CallExpression call, - Class expectedChildClass) { + private static Optional onlyChildAs( + CallExpression call, Class expectedChildClass) { List children = call.getResolvedChildren(); if (children.size() != 1) { return Optional.empty(); @@ -166,26 +173,28 @@ private static Optional convertLike(CallExpression call) { if (left instanceof FieldReferenceExpression && right instanceof ValueLiteralExpression) { String name = ((FieldReferenceExpression) left).getName(); - return convertLiteral((ValueLiteralExpression) right).flatMap(lit -> { - if (lit instanceof String) { - String pattern = (String) lit; - Matcher matcher = STARTS_WITH_PATTERN.matcher(pattern); - // exclude special char of LIKE - // '_' is the wildcard of the SQL LIKE - if (!pattern.contains("_") && matcher.matches()) { - return Optional.of(Expressions.startsWith(name, matcher.group(1))); - } - } - - return Optional.empty(); - }); + return convertLiteral((ValueLiteralExpression) right) + .flatMap( + lit -> { + if (lit instanceof String) { + String pattern = 
(String) lit; + Matcher matcher = STARTS_WITH_PATTERN.matcher(pattern); + // exclude special char of LIKE + // '_' is the wildcard of the SQL LIKE + if (!pattern.contains("_") && matcher.matches()) { + return Optional.of(Expressions.startsWith(name, matcher.group(1))); + } + } + + return Optional.empty(); + }); } return Optional.empty(); } - private static Optional convertLogicExpression(BiFunction function, - CallExpression call) { + private static Optional convertLogicExpression( + BiFunction function, CallExpression call) { List args = call.getResolvedChildren(); if (args == null || args.size() != 2) { return Optional.empty(); @@ -201,29 +210,33 @@ private static Optional convertLogicExpression(BiFunction convertLiteral(ValueLiteralExpression expression) { - Optional value = expression.getValueAs(expression.getOutputDataType().getLogicalType().getDefaultConversion()); - return value.map(o -> { - if (o instanceof LocalDateTime) { - return DateTimeUtil.microsFromTimestamp((LocalDateTime) o); - } else if (o instanceof Instant) { - return DateTimeUtil.microsFromInstant((Instant) o); - } else if (o instanceof LocalTime) { - return DateTimeUtil.microsFromTime((LocalTime) o); - } else if (o instanceof LocalDate) { - return DateTimeUtil.daysFromDate((LocalDate) o); - } + Optional value = + expression.getValueAs( + expression.getOutputDataType().getLogicalType().getDefaultConversion()); + return value.map( + o -> { + if (o instanceof LocalDateTime) { + return DateTimeUtil.microsFromTimestamp((LocalDateTime) o); + } else if (o instanceof Instant) { + return DateTimeUtil.microsFromInstant((Instant) o); + } else if (o instanceof LocalTime) { + return DateTimeUtil.microsFromTime((LocalTime) o); + } else if (o instanceof LocalDate) { + return DateTimeUtil.daysFromDate((LocalDate) o); + } - return o; - }); + return o; + }); } - private static Optional convertFieldAndLiteral(BiFunction expr, - CallExpression call) { + private static Optional convertFieldAndLiteral( + BiFunction expr, CallExpression call) { return convertFieldAndLiteral(expr, expr, call); } private static Optional convertFieldAndLiteral( - BiFunction convertLR, BiFunction convertRL, + BiFunction convertLR, + BiFunction convertRL, CallExpression call) { List args = call.getResolvedChildren(); if (args.size() != 2) { @@ -239,7 +252,8 @@ private static Optional convertFieldAndLiteral( if (lit.isPresent()) { return Optional.of(convertLR.apply(name, lit.get())); } - } else if (left instanceof ValueLiteralExpression && right instanceof FieldReferenceExpression) { + } else if (left instanceof ValueLiteralExpression + && right instanceof FieldReferenceExpression) { Optional lit = convertLiteral((ValueLiteralExpression) left); String name = ((FieldReferenceExpression) right).getName(); if (lit.isPresent()) { diff --git a/flink/v1.14/flink/src/main/java/org/apache/iceberg/flink/FlinkFixupTypes.java b/flink/v1.14/flink/src/main/java/org/apache/iceberg/flink/FlinkFixupTypes.java index 6501c0226e44..767d4497ac91 100644 --- a/flink/v1.14/flink/src/main/java/org/apache/iceberg/flink/FlinkFixupTypes.java +++ b/flink/v1.14/flink/src/main/java/org/apache/iceberg/flink/FlinkFixupTypes.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.flink; import org.apache.iceberg.Schema; @@ -36,8 +35,8 @@ private FlinkFixupTypes(Schema referenceSchema) { } static Schema fixup(Schema schema, Schema referenceSchema) { - return new Schema(TypeUtil.visit(schema, - new FlinkFixupTypes(referenceSchema)).asStructType().fields()); + return new Schema( + TypeUtil.visit(schema, new FlinkFixupTypes(referenceSchema)).asStructType().fields()); } @Override diff --git a/flink/v1.14/flink/src/main/java/org/apache/iceberg/flink/FlinkSchemaUtil.java b/flink/v1.14/flink/src/main/java/org/apache/iceberg/flink/FlinkSchemaUtil.java index 0827b21786c1..97439b7bb0d6 100644 --- a/flink/v1.14/flink/src/main/java/org/apache/iceberg/flink/FlinkSchemaUtil.java +++ b/flink/v1.14/flink/src/main/java/org/apache/iceberg/flink/FlinkSchemaUtil.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.flink; import java.util.List; @@ -34,33 +33,33 @@ import org.apache.iceberg.types.Types; /** - * Converter between Flink types and Iceberg type. - * The conversion is not a 1:1 mapping that not allows back-and-forth conversion. So some information might get lost - * during the back-and-forth conversion. - *
- * This inconsistent types:
+ * Converter between Flink types and Iceberg type. The conversion is not a 1:1 mapping that not
+ * allows back-and-forth conversion. So some information might get lost during the back-and-forth
+ * conversion.
+ *
+ * <p>This inconsistent types:
+ *
  * <ul>
- *   <li>map Iceberg UUID type to Flink BinaryType(16)</li>
- *   <li>map Flink VarCharType(_) and CharType(_) to Iceberg String type</li>
- *   <li>map Flink VarBinaryType(_) to Iceberg Binary type</li>
- *   <li>map Flink TimeType(_) to Iceberg Time type (microseconds)</li>
- *   <li>map Flink TimestampType(_) to Iceberg Timestamp without zone type (microseconds)</li>
- *   <li>map Flink LocalZonedTimestampType(_) to Iceberg Timestamp with zone type (microseconds)</li>
- *   <li>map Flink MultiSetType to Iceberg Map type(element, int)</li>
+ *   <li>map Iceberg UUID type to Flink BinaryType(16)
+ *   <li>map Flink VarCharType(_) and CharType(_) to Iceberg String type
+ *   <li>map Flink VarBinaryType(_) to Iceberg Binary type
+ *   <li>map Flink TimeType(_) to Iceberg Time type (microseconds)
+ *   <li>map Flink TimestampType(_) to Iceberg Timestamp without zone type (microseconds)
+ *   <li>map Flink LocalZonedTimestampType(_) to Iceberg Timestamp with zone type (microseconds)
+ *   <li>map Flink MultiSetType to Iceberg Map type(element, int)
  * </ul>
+ *
  * <p>
*/ public class FlinkSchemaUtil { - private FlinkSchemaUtil() { - } + private FlinkSchemaUtil() {} - /** - * Convert the flink table schema to apache iceberg schema. - */ + /** Convert the flink table schema to apache iceberg schema. */ public static Schema convert(TableSchema schema) { LogicalType schemaType = schema.toRowDataType().getLogicalType(); - Preconditions.checkArgument(schemaType instanceof RowType, "Schema logical type should be RowType."); + Preconditions.checkArgument( + schemaType instanceof RowType, "Schema logical type should be RowType."); RowType root = (RowType) schemaType; Type converted = root.accept(new FlinkTypeToType(root)); @@ -75,8 +74,11 @@ private static Schema freshIdentifierFieldIds(Schema iSchema, TableSchema schema if (schema.getPrimaryKey().isPresent()) { for (String column : schema.getPrimaryKey().get().getColumns()) { Types.NestedField field = iSchema.findField(column); - Preconditions.checkNotNull(field, - "Cannot find field ID for the primary key column %s in schema %s", column, iSchema); + Preconditions.checkNotNull( + field, + "Cannot find field ID for the primary key column %s in schema %s", + column, + iSchema); identifierFieldIds.add(field.fieldId()); } } @@ -86,11 +88,11 @@ private static Schema freshIdentifierFieldIds(Schema iSchema, TableSchema schema /** * Convert a Flink {@link TableSchema} to a {@link Schema} based on the given schema. - *
- * This conversion does not assign new ids; it uses ids from the base schema.
- * <p>
- * Data types, field order, and nullability will match the Flink type. This conversion may return
- * a schema that is not compatible with base schema.
+ *
+ * <p>This conversion does not assign new ids; it uses ids from the base schema.
+ *
+ * <p>
Data types, field order, and nullability will match the Flink type. This conversion may + * return a schema that is not compatible with base schema. * * @param baseSchema a Schema on which conversion is based * @param flinkSchema a Flink TableSchema @@ -163,7 +165,8 @@ public static TableSchema toSchema(Schema schema) { List columns = Lists.newArrayListWithExpectedSize(identifierFieldIds.size()); for (Integer identifierFieldId : identifierFieldIds) { String columnName = schema.findColumnName(identifierFieldId); - Preconditions.checkNotNull(columnName, "Cannot find field with id %s in schema %s", identifierFieldId, schema); + Preconditions.checkNotNull( + columnName, "Cannot find field with id %s in schema %s", identifierFieldId, schema); columns.add(columnName); } diff --git a/flink/v1.14/flink/src/main/java/org/apache/iceberg/flink/FlinkTypeToType.java b/flink/v1.14/flink/src/main/java/org/apache/iceberg/flink/FlinkTypeToType.java index 88276d86d3df..6f8bfef2ef44 100644 --- a/flink/v1.14/flink/src/main/java/org/apache/iceberg/flink/FlinkTypeToType.java +++ b/flink/v1.14/flink/src/main/java/org/apache/iceberg/flink/FlinkTypeToType.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.flink; import java.util.List; @@ -176,9 +175,10 @@ public Type visit(RowType rowType) { List newFields = Lists.newArrayListWithExpectedSize(rowType.getFieldCount()); boolean isRoot = root == rowType; - List types = rowType.getFields().stream() - .map(f -> f.getType().accept(this)) - .collect(Collectors.toList()); + List types = + rowType.getFields().stream() + .map(f -> f.getType().accept(this)) + .collect(Collectors.toList()); for (int i = 0; i < rowType.getFieldCount(); i++) { int id = isRoot ? i : getNextId(); diff --git a/flink/v1.14/flink/src/main/java/org/apache/iceberg/flink/FlinkTypeVisitor.java b/flink/v1.14/flink/src/main/java/org/apache/iceberg/flink/FlinkTypeVisitor.java index 9d1a3c492cd7..f3de2416088c 100644 --- a/flink/v1.14/flink/src/main/java/org/apache/iceberg/flink/FlinkTypeVisitor.java +++ b/flink/v1.14/flink/src/main/java/org/apache/iceberg/flink/FlinkTypeVisitor.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.flink; import org.apache.flink.table.types.logical.DayTimeIntervalType; diff --git a/flink/v1.14/flink/src/main/java/org/apache/iceberg/flink/FlinkWriteConf.java b/flink/v1.14/flink/src/main/java/org/apache/iceberg/flink/FlinkWriteConf.java index ddb5f18c52fe..c1af5e49e5f4 100644 --- a/flink/v1.14/flink/src/main/java/org/apache/iceberg/flink/FlinkWriteConf.java +++ b/flink/v1.14/flink/src/main/java/org/apache/iceberg/flink/FlinkWriteConf.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.flink; import java.util.Locale; @@ -29,29 +28,34 @@ /** * A class for common Iceberg configs for Flink writes. - *
- * If a config is set at multiple levels, the following order of precedence is used (top to bottom):
+ *
+ * <p>If a config is set at multiple levels, the following order of precedence is used (top to
+ * bottom):
+ *
  * <ol>
- *   <li>Write options</li>
- *   <li>flink ReadableConfig</li>
- *   <li>Table metadata</li>
+ *   <li>Write options
+ *   <li>flink ReadableConfig
+ *   <li>Table metadata
  * </ol>
- * The most specific value is set in write options and takes precedence over all other configs.
- * If no write option is provided, this class checks the flink configuration for any overrides.
- * If no applicable value is found in the write options, this class uses the table metadata.
- * <p>
- * Note this class is NOT meant to be serialized.
+ *
+ * The most specific value is set in write options and takes precedence over all other configs. If
+ * no write option is provided, this class checks the flink configuration for any overrides. If no
+ * applicable value is found in the write options, this class uses the table metadata.
+ *
+ * <p>
Note this class is NOT meant to be serialized. */ public class FlinkWriteConf { private final FlinkConfParser confParser; - public FlinkWriteConf(Table table, Map writeOptions, ReadableConfig readableConfig) { + public FlinkWriteConf( + Table table, Map writeOptions, ReadableConfig readableConfig) { this.confParser = new FlinkConfParser(table, writeOptions, readableConfig); } public boolean overwriteMode() { - return confParser.booleanConf() + return confParser + .booleanConf() .option(FlinkWriteOptions.OVERWRITE_MODE.key()) .flinkConfig(FlinkWriteOptions.OVERWRITE_MODE) .defaultValue(FlinkWriteOptions.OVERWRITE_MODE.defaultValue()) @@ -59,7 +63,8 @@ public boolean overwriteMode() { } public boolean upsertMode() { - return confParser.booleanConf() + return confParser + .booleanConf() .option(FlinkWriteOptions.WRITE_UPSERT_ENABLED.key()) .flinkConfig(FlinkWriteOptions.WRITE_UPSERT_ENABLED) .tableProperty(TableProperties.UPSERT_ENABLED) @@ -68,17 +73,20 @@ public boolean upsertMode() { } public FileFormat dataFileFormat() { - String valueAsString = confParser.stringConf() - .option(FlinkWriteOptions.WRITE_FORMAT.key()) - .flinkConfig(FlinkWriteOptions.WRITE_FORMAT) - .tableProperty(TableProperties.DEFAULT_FILE_FORMAT) - .defaultValue(TableProperties.DEFAULT_FILE_FORMAT_DEFAULT) - .parse(); + String valueAsString = + confParser + .stringConf() + .option(FlinkWriteOptions.WRITE_FORMAT.key()) + .flinkConfig(FlinkWriteOptions.WRITE_FORMAT) + .tableProperty(TableProperties.DEFAULT_FILE_FORMAT) + .defaultValue(TableProperties.DEFAULT_FILE_FORMAT_DEFAULT) + .parse(); return FileFormat.valueOf(valueAsString.toUpperCase(Locale.ENGLISH)); } public long targetDataFileSize() { - return confParser.longConf() + return confParser + .longConf() .option(FlinkWriteOptions.TARGET_FILE_SIZE_BYTES.key()) .flinkConfig(FlinkWriteOptions.TARGET_FILE_SIZE_BYTES) .tableProperty(TableProperties.WRITE_TARGET_FILE_SIZE_BYTES) @@ -87,17 +95,20 @@ public long targetDataFileSize() { } public DistributionMode distributionMode() { - String modeName = confParser.stringConf() - .option(FlinkWriteOptions.DISTRIBUTION_MODE.key()) - .flinkConfig(FlinkWriteOptions.DISTRIBUTION_MODE) - .tableProperty(TableProperties.WRITE_DISTRIBUTION_MODE) - .defaultValue(TableProperties.WRITE_DISTRIBUTION_MODE_NONE) - .parse(); + String modeName = + confParser + .stringConf() + .option(FlinkWriteOptions.DISTRIBUTION_MODE.key()) + .flinkConfig(FlinkWriteOptions.DISTRIBUTION_MODE) + .tableProperty(TableProperties.WRITE_DISTRIBUTION_MODE) + .defaultValue(TableProperties.WRITE_DISTRIBUTION_MODE_NONE) + .parse(); return DistributionMode.fromName(modeName); } public int workerPoolSize() { - return confParser.intConf() + return confParser + .intConf() .flinkConfig(FlinkConfigOptions.TABLE_EXEC_ICEBERG_WORKER_POOL_SIZE) .defaultValue(FlinkConfigOptions.TABLE_EXEC_ICEBERG_WORKER_POOL_SIZE.defaultValue()) .parse(); diff --git a/flink/v1.14/flink/src/main/java/org/apache/iceberg/flink/FlinkWriteOptions.java b/flink/v1.14/flink/src/main/java/org/apache/iceberg/flink/FlinkWriteOptions.java index d0dc9c7fdeb1..a3091d5779c7 100644 --- a/flink/v1.14/flink/src/main/java/org/apache/iceberg/flink/FlinkWriteOptions.java +++ b/flink/v1.14/flink/src/main/java/org/apache/iceberg/flink/FlinkWriteOptions.java @@ -16,42 +16,32 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.flink; import org.apache.flink.configuration.ConfigOption; import org.apache.flink.configuration.ConfigOptions; -/** - * Flink sink write options - */ +/** Flink sink write options */ public class FlinkWriteOptions { - private FlinkWriteOptions() { - } + private FlinkWriteOptions() {} // File format for write operations(default: Table write.format.default ) public static final ConfigOption WRITE_FORMAT = - ConfigOptions.key("write-format") - .stringType().noDefaultValue(); + ConfigOptions.key("write-format").stringType().noDefaultValue(); // Overrides this table's write.target-file-size-bytes public static final ConfigOption TARGET_FILE_SIZE_BYTES = - ConfigOptions.key("target-file-size-bytes") - .longType().noDefaultValue(); + ConfigOptions.key("target-file-size-bytes").longType().noDefaultValue(); // Overrides this table's write.upsert.enabled public static final ConfigOption WRITE_UPSERT_ENABLED = - ConfigOptions.key("upsert-enabled") - .booleanType().noDefaultValue(); + ConfigOptions.key("upsert-enabled").booleanType().noDefaultValue(); public static final ConfigOption OVERWRITE_MODE = - ConfigOptions.key("overwrite-enabled") - .booleanType().defaultValue(false); + ConfigOptions.key("overwrite-enabled").booleanType().defaultValue(false); // Overrides the table's write.distribution-mode public static final ConfigOption DISTRIBUTION_MODE = - ConfigOptions.key("distribution-mode") - .stringType().noDefaultValue(); - + ConfigOptions.key("distribution-mode").stringType().noDefaultValue(); } diff --git a/flink/v1.14/flink/src/main/java/org/apache/iceberg/flink/IcebergTableSink.java b/flink/v1.14/flink/src/main/java/org/apache/iceberg/flink/IcebergTableSink.java index f2eecb9b6646..8f7e09064c53 100644 --- a/flink/v1.14/flink/src/main/java/org/apache/iceberg/flink/IcebergTableSink.java +++ b/flink/v1.14/flink/src/main/java/org/apache/iceberg/flink/IcebergTableSink.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.flink; import java.util.List; @@ -48,7 +47,8 @@ private IcebergTableSink(IcebergTableSink toCopy) { this.readableConfig = toCopy.readableConfig; } - public IcebergTableSink(TableLoader tableLoader, TableSchema tableSchema, ReadableConfig readableConfig) { + public IcebergTableSink( + TableLoader tableLoader, TableSchema tableSchema, ReadableConfig readableConfig) { this.tableLoader = tableLoader; this.tableSchema = tableSchema; this.readableConfig = readableConfig; @@ -56,25 +56,28 @@ public IcebergTableSink(TableLoader tableLoader, TableSchema tableSchema, Readab @Override public SinkRuntimeProvider getSinkRuntimeProvider(Context context) { - Preconditions.checkState(!overwrite || context.isBounded(), + Preconditions.checkState( + !overwrite || context.isBounded(), "Unbounded data stream doesn't support overwrite operation."); - List equalityColumns = tableSchema.getPrimaryKey() - .map(UniqueConstraint::getColumns) - .orElseGet(ImmutableList::of); + List equalityColumns = + tableSchema.getPrimaryKey().map(UniqueConstraint::getColumns).orElseGet(ImmutableList::of); - return (DataStreamSinkProvider) dataStream -> FlinkSink.forRowData(dataStream) - .tableLoader(tableLoader) - .tableSchema(tableSchema) - .equalityFieldColumns(equalityColumns) - .overwrite(overwrite) - .flinkConf(readableConfig) - .append(); + return (DataStreamSinkProvider) + dataStream -> + FlinkSink.forRowData(dataStream) + .tableLoader(tableLoader) + .tableSchema(tableSchema) + .equalityFieldColumns(equalityColumns) + .overwrite(overwrite) + .flinkConf(readableConfig) + .append(); } @Override public void applyStaticPartition(Map partition) { - // The flink's PartitionFanoutWriter will handle the static partition write policy automatically. + // The flink's PartitionFanoutWriter will handle the static partition write policy + // automatically. } @Override diff --git a/flink/v1.14/flink/src/main/java/org/apache/iceberg/flink/IcebergTableSource.java b/flink/v1.14/flink/src/main/java/org/apache/iceberg/flink/IcebergTableSource.java index dd8f6454ebc4..3bd7335f74c5 100644 --- a/flink/v1.14/flink/src/main/java/org/apache/iceberg/flink/IcebergTableSource.java +++ b/flink/v1.14/flink/src/main/java/org/apache/iceberg/flink/IcebergTableSource.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.flink; import java.util.Arrays; @@ -43,11 +42,12 @@ import org.apache.iceberg.relocated.com.google.common.collect.ImmutableList; import org.apache.iceberg.relocated.com.google.common.collect.Lists; -/** - * Flink Iceberg table source. - */ +/** Flink Iceberg table source. 
*/ public class IcebergTableSource - implements ScanTableSource, SupportsProjectionPushDown, SupportsFilterPushDown, SupportsLimitPushDown { + implements ScanTableSource, + SupportsProjectionPushDown, + SupportsFilterPushDown, + SupportsLimitPushDown { private int[] projectedFields; private long limit; @@ -70,14 +70,23 @@ private IcebergTableSource(IcebergTableSource toCopy) { this.readableConfig = toCopy.readableConfig; } - public IcebergTableSource(TableLoader loader, TableSchema schema, Map properties, - ReadableConfig readableConfig) { + public IcebergTableSource( + TableLoader loader, + TableSchema schema, + Map properties, + ReadableConfig readableConfig) { this(loader, schema, properties, null, false, -1, ImmutableList.of(), readableConfig); } - private IcebergTableSource(TableLoader loader, TableSchema schema, Map properties, - int[] projectedFields, boolean isLimitPushDown, - long limit, List filters, ReadableConfig readableConfig) { + private IcebergTableSource( + TableLoader loader, + TableSchema schema, + Map properties, + int[] projectedFields, + boolean isLimitPushDown, + long limit, + List filters, + ReadableConfig readableConfig) { this.loader = loader; this.schema = schema; this.properties = properties; @@ -92,8 +101,8 @@ private IcebergTableSource(TableLoader loader, TableSchema schema, Map fullNames[i]).toArray(String[]::new), - Arrays.stream(projectedFields).mapToObj(i -> fullTypes[i]).toArray(DataType[]::new)).build(); + return TableSchema.builder() + .fields( + Arrays.stream(projectedFields).mapToObj(i -> fullNames[i]).toArray(String[]::new), + Arrays.stream(projectedFields).mapToObj(i -> fullTypes[i]).toArray(DataType[]::new)) + .build(); } } diff --git a/flink/v1.14/flink/src/main/java/org/apache/iceberg/flink/RowDataWrapper.java b/flink/v1.14/flink/src/main/java/org/apache/iceberg/flink/RowDataWrapper.java index 401e9db65992..d4cec7a3e80b 100644 --- a/flink/v1.14/flink/src/main/java/org/apache/iceberg/flink/RowDataWrapper.java +++ b/flink/v1.14/flink/src/main/java/org/apache/iceberg/flink/RowDataWrapper.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.flink; import java.lang.reflect.Array; @@ -77,7 +76,8 @@ public T get(int pos, Class javaClass) { @Override public void set(int pos, T value) { - throw new UnsupportedOperationException("Could not set a field in the RowDataWrapper because rowData is read-only"); + throw new UnsupportedOperationException( + "Could not set a field in the RowDataWrapper because rowData is read-only"); } private interface PositionalGetter { @@ -104,16 +104,19 @@ private static PositionalGetter buildGetter(LogicalType logicalType, Type typ case DECIMAL: DecimalType decimalType = (DecimalType) logicalType; - return (row, pos) -> row.getDecimal(pos, decimalType.getPrecision(), decimalType.getScale()).toBigDecimal(); + return (row, pos) -> + row.getDecimal(pos, decimalType.getPrecision(), decimalType.getScale()).toBigDecimal(); case TIME_WITHOUT_TIME_ZONE: - // Time in RowData is in milliseconds (Integer), while iceberg's time is microseconds (Long). + // Time in RowData is in milliseconds (Integer), while iceberg's time is microseconds + // (Long). 
return (row, pos) -> ((long) row.getInt(pos)) * 1_000; case TIMESTAMP_WITHOUT_TIME_ZONE: TimestampType timestampType = (TimestampType) logicalType; return (row, pos) -> { - LocalDateTime localDateTime = row.getTimestamp(pos, timestampType.getPrecision()).toLocalDateTime(); + LocalDateTime localDateTime = + row.getTimestamp(pos, timestampType.getPrecision()).toLocalDateTime(); return DateTimeUtil.microsFromTimestamp(localDateTime); }; @@ -121,7 +124,8 @@ private static PositionalGetter buildGetter(LogicalType logicalType, Type typ LocalZonedTimestampType lzTs = (LocalZonedTimestampType) logicalType; return (row, pos) -> { TimestampData timestampData = row.getTimestamp(pos, lzTs.getPrecision()); - return timestampData.getMillisecond() * 1000 + timestampData.getNanoOfMillisecond() / 1000; + return timestampData.getMillisecond() * 1000 + + timestampData.getNanoOfMillisecond() / 1000; }; case ROW: diff --git a/flink/v1.14/flink/src/main/java/org/apache/iceberg/flink/TableLoader.java b/flink/v1.14/flink/src/main/java/org/apache/iceberg/flink/TableLoader.java index ebcb1fb0b7b4..e128badb8461 100644 --- a/flink/v1.14/flink/src/main/java/org/apache/iceberg/flink/TableLoader.java +++ b/flink/v1.14/flink/src/main/java/org/apache/iceberg/flink/TableLoader.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.flink; import java.io.Closeable; @@ -31,9 +30,9 @@ import org.apache.iceberg.relocated.com.google.common.base.MoreObjects; /** - * Serializable loader to load an Iceberg {@link Table}. - * Flink needs to get {@link Table} objects in the cluster (for example, to get splits), not just on the client side. - * So we need an Iceberg table loader to get the {@link Table} object. + * Serializable loader to load an Iceberg {@link Table}. Flink needs to get {@link Table} objects in + * the cluster (for example, to get splits), not just on the client side. So we need an Iceberg + * table loader to get the {@link Table} object. */ public interface TableLoader extends Closeable, Serializable { @@ -78,14 +77,11 @@ public Table loadTable() { } @Override - public void close() { - } + public void close() {} @Override public String toString() { - return MoreObjects.toStringHelper(this) - .add("location", location) - .toString(); + return MoreObjects.toStringHelper(this).add("location", location).toString(); } } diff --git a/flink/v1.14/flink/src/main/java/org/apache/iceberg/flink/TypeToFlinkType.java b/flink/v1.14/flink/src/main/java/org/apache/iceberg/flink/TypeToFlinkType.java index cf594b364f5f..f8f1b74b1ceb 100644 --- a/flink/v1.14/flink/src/main/java/org/apache/iceberg/flink/TypeToFlinkType.java +++ b/flink/v1.14/flink/src/main/java/org/apache/iceberg/flink/TypeToFlinkType.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.flink; import java.util.List; @@ -44,8 +43,7 @@ import org.apache.iceberg.types.Types; class TypeToFlinkType extends TypeUtil.SchemaVisitor { - TypeToFlinkType() { - } + TypeToFlinkType() {} @Override public LogicalType schema(Schema schema, LogicalType structType) { @@ -60,8 +58,8 @@ public LogicalType struct(Types.StructType struct, List fieldResult for (int i = 0; i < fields.size(); i += 1) { Types.NestedField field = fields.get(i); LogicalType type = fieldResults.get(i); - RowType.RowField flinkField = new RowType.RowField( - field.name(), type.copy(field.isOptional()), field.doc()); + RowType.RowField flinkField = + new RowType.RowField(field.name(), type.copy(field.isOptional()), field.doc()); flinkFields.add(flinkField); } @@ -100,9 +98,11 @@ public LogicalType primitive(Type.PrimitiveType primitive) { case DATE: return new DateType(); case TIME: - // For the type: Flink only support TimeType with default precision (second) now. The precision of time is + // For the type: Flink only support TimeType with default precision (second) now. The + // precision of time is // not supported in Flink, so we can think of it as a simple time type directly. - // For the data: Flink uses int that support mills to represent time data, so it supports mills precision. + // For the data: Flink uses int that support mills to represent time data, so it supports + // mills precision. return new TimeType(); case TIMESTAMP: Types.TimestampType timestamp = (Types.TimestampType) primitive; diff --git a/flink/v1.14/flink/src/main/java/org/apache/iceberg/flink/actions/Actions.java b/flink/v1.14/flink/src/main/java/org/apache/iceberg/flink/actions/Actions.java index 98702ceb57f1..06ac54617ae6 100644 --- a/flink/v1.14/flink/src/main/java/org/apache/iceberg/flink/actions/Actions.java +++ b/flink/v1.14/flink/src/main/java/org/apache/iceberg/flink/actions/Actions.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.flink.actions; import org.apache.flink.configuration.Configuration; @@ -26,9 +25,10 @@ public class Actions { - public static final Configuration CONFIG = new Configuration() - // disable classloader check as Avro may cache class/object in the serializers. - .set(CoreOptions.CHECK_LEAKED_CLASSLOADER, false); + public static final Configuration CONFIG = + new Configuration() + // disable classloader check as Avro may cache class/object in the serializers. + .set(CoreOptions.CHECK_LEAKED_CLASSLOADER, false); private StreamExecutionEnvironment env; private Table table; @@ -49,5 +49,4 @@ public static Actions forTable(Table table) { public RewriteDataFilesAction rewriteDataFiles() { return new RewriteDataFilesAction(env, table); } - } diff --git a/flink/v1.14/flink/src/main/java/org/apache/iceberg/flink/actions/RewriteDataFilesAction.java b/flink/v1.14/flink/src/main/java/org/apache/iceberg/flink/actions/RewriteDataFilesAction.java index cbd4aed73c8a..9876bb3861c4 100644 --- a/flink/v1.14/flink/src/main/java/org/apache/iceberg/flink/actions/RewriteDataFilesAction.java +++ b/flink/v1.14/flink/src/main/java/org/apache/iceberg/flink/actions/RewriteDataFilesAction.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.flink.actions; import java.util.List; @@ -51,7 +50,8 @@ protected List rewriteDataForTasks(List combinedScan int size = combinedScanTasks.size(); int parallelism = Math.min(size, maxParallelism); DataStream dataStream = env.fromCollection(combinedScanTasks); - RowDataRewriter rowDataRewriter = new RowDataRewriter(table(), caseSensitive(), fileIO(), encryptionManager()); + RowDataRewriter rowDataRewriter = + new RowDataRewriter(table(), caseSensitive(), fileIO(), encryptionManager()); try { return rowDataRewriter.rewriteDataForTasks(dataStream, parallelism); } catch (Exception e) { diff --git a/flink/v1.14/flink/src/main/java/org/apache/iceberg/flink/data/AvroWithFlinkSchemaVisitor.java b/flink/v1.14/flink/src/main/java/org/apache/iceberg/flink/data/AvroWithFlinkSchemaVisitor.java index 1ccc3b787e33..8103224a0b6c 100644 --- a/flink/v1.14/flink/src/main/java/org/apache/iceberg/flink/data/AvroWithFlinkSchemaVisitor.java +++ b/flink/v1.14/flink/src/main/java/org/apache/iceberg/flink/data/AvroWithFlinkSchemaVisitor.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.flink.data; import org.apache.flink.table.types.logical.ArrayType; @@ -29,7 +28,8 @@ import org.apache.iceberg.relocated.com.google.common.base.Preconditions; import org.apache.iceberg.util.Pair; -public abstract class AvroWithFlinkSchemaVisitor extends AvroWithPartnerByStructureVisitor { +public abstract class AvroWithFlinkSchemaVisitor + extends AvroWithPartnerByStructureVisitor { @Override protected boolean isStringType(LogicalType logicalType) { @@ -43,7 +43,8 @@ protected boolean isMapType(LogicalType logicalType) { @Override protected LogicalType arrayElementType(LogicalType arrayType) { - Preconditions.checkArgument(arrayType instanceof ArrayType, "Invalid array: %s is not an array", arrayType); + Preconditions.checkArgument( + arrayType instanceof ArrayType, "Invalid array: %s is not an array", arrayType); return ((ArrayType) arrayType).getElementType(); } @@ -61,7 +62,8 @@ protected LogicalType mapValueType(LogicalType mapType) { @Override protected Pair fieldNameAndType(LogicalType structType, int pos) { - Preconditions.checkArgument(structType instanceof RowType, "Invalid struct: %s is not a struct", structType); + Preconditions.checkArgument( + structType instanceof RowType, "Invalid struct: %s is not a struct", structType); RowType.RowField field = ((RowType) structType).getFields().get(pos); return Pair.of(field.getName(), field.getType()); } diff --git a/flink/v1.14/flink/src/main/java/org/apache/iceberg/flink/data/FlinkAvroReader.java b/flink/v1.14/flink/src/main/java/org/apache/iceberg/flink/data/FlinkAvroReader.java index 991ef6336297..86404959735a 100644 --- a/flink/v1.14/flink/src/main/java/org/apache/iceberg/flink/data/FlinkAvroReader.java +++ b/flink/v1.14/flink/src/main/java/org/apache/iceberg/flink/data/FlinkAvroReader.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.flink.data; import java.io.IOException; @@ -49,10 +48,12 @@ public FlinkAvroReader(org.apache.iceberg.Schema expectedSchema, Schema readSche } @SuppressWarnings("unchecked") - public FlinkAvroReader(org.apache.iceberg.Schema expectedSchema, Schema readSchema, Map constants) { + public FlinkAvroReader( + org.apache.iceberg.Schema expectedSchema, Schema readSchema, Map constants) { this.readSchema = readSchema; - this.reader = (ValueReader) AvroSchemaWithTypeVisitor - .visit(expectedSchema, readSchema, new ReadBuilder(constants)); + this.reader = + (ValueReader) + AvroSchemaWithTypeVisitor.visit(expectedSchema, readSchema, new ReadBuilder(constants)); } @Override @@ -80,8 +81,8 @@ private ReadBuilder(Map idToConstant) { } @Override - public ValueReader record(Types.StructType expected, Schema record, List names, - List> fields) { + public ValueReader record( + Types.StructType expected, Schema record, List names, List> fields) { return FlinkValueReaders.struct(fields, expected.asStructType(), idToConstant); } @@ -91,13 +92,14 @@ public ValueReader union(Type expected, Schema union, List> op } @Override - public ValueReader array(Types.ListType expected, Schema array, ValueReader elementReader) { + public ValueReader array( + Types.ListType expected, Schema array, ValueReader elementReader) { return FlinkValueReaders.array(elementReader); } @Override - public ValueReader map(Types.MapType expected, Schema map, - ValueReader keyReader, ValueReader valueReader) { + public ValueReader map( + Types.MapType expected, Schema map, ValueReader keyReader, ValueReader valueReader) { return FlinkValueReaders.arrayMap(keyReader, valueReader); } diff --git a/flink/v1.14/flink/src/main/java/org/apache/iceberg/flink/data/FlinkAvroWriter.java b/flink/v1.14/flink/src/main/java/org/apache/iceberg/flink/data/FlinkAvroWriter.java index b069a35d3bbb..873e65783119 100644 --- a/flink/v1.14/flink/src/main/java/org/apache/iceberg/flink/data/FlinkAvroWriter.java +++ b/flink/v1.14/flink/src/main/java/org/apache/iceberg/flink/data/FlinkAvroWriter.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.flink.data; import java.io.IOException; @@ -47,8 +46,9 @@ public FlinkAvroWriter(RowType rowType) { @Override @SuppressWarnings("unchecked") public void setSchema(Schema schema) { - this.writer = (ValueWriter) AvroWithFlinkSchemaVisitor - .visit(rowType, schema, new WriteBuilder()); + this.writer = + (ValueWriter) + AvroWithFlinkSchemaVisitor.visit(rowType, schema, new WriteBuilder()); } @Override @@ -63,17 +63,23 @@ public Stream metrics() { private static class WriteBuilder extends AvroWithFlinkSchemaVisitor> { @Override - public ValueWriter record(LogicalType struct, Schema record, List names, List> fields) { - return FlinkValueWriters.row(fields, IntStream.range(0, names.size()) - .mapToObj(i -> fieldNameAndType(struct, i).second()).collect(Collectors.toList())); + public ValueWriter record( + LogicalType struct, Schema record, List names, List> fields) { + return FlinkValueWriters.row( + fields, + IntStream.range(0, names.size()) + .mapToObj(i -> fieldNameAndType(struct, i).second()) + .collect(Collectors.toList())); } @Override public ValueWriter union(LogicalType type, Schema union, List> options) { - Preconditions.checkArgument(options.contains(ValueWriters.nulls()), - "Cannot create writer for non-option union: %s", union); - Preconditions.checkArgument(options.size() == 2, - "Cannot create writer for non-option union: %s", union); + Preconditions.checkArgument( + options.contains(ValueWriters.nulls()), + "Cannot create writer for non-option union: %s", + union); + Preconditions.checkArgument( + options.size() == 2, "Cannot create writer for non-option union: %s", union); if (union.getTypes().get(0).getType() == Schema.Type.NULL) { return ValueWriters.option(0, options.get(1)); } else { @@ -88,12 +94,15 @@ public ValueWriter array(LogicalType sArray, Schema array, ValueWriter ele @Override public ValueWriter map(LogicalType sMap, Schema map, ValueWriter valueReader) { - return FlinkValueWriters.map(FlinkValueWriters.strings(), mapKeyType(sMap), valueReader, mapValueType(sMap)); + return FlinkValueWriters.map( + FlinkValueWriters.strings(), mapKeyType(sMap), valueReader, mapValueType(sMap)); } @Override - public ValueWriter map(LogicalType sMap, Schema map, ValueWriter keyWriter, ValueWriter valueWriter) { - return FlinkValueWriters.arrayMap(keyWriter, mapKeyType(sMap), valueWriter, mapValueType(sMap)); + public ValueWriter map( + LogicalType sMap, Schema map, ValueWriter keyWriter, ValueWriter valueWriter) { + return FlinkValueWriters.arrayMap( + keyWriter, mapKeyType(sMap), valueWriter, mapValueType(sMap)); } @Override diff --git a/flink/v1.14/flink/src/main/java/org/apache/iceberg/flink/data/FlinkOrcReader.java b/flink/v1.14/flink/src/main/java/org/apache/iceberg/flink/data/FlinkOrcReader.java index 4c4e2050263b..65b9d44ad4b8 100644 --- a/flink/v1.14/flink/src/main/java/org/apache/iceberg/flink/data/FlinkOrcReader.java +++ b/flink/v1.14/flink/src/main/java/org/apache/iceberg/flink/data/FlinkOrcReader.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.flink.data; import java.util.List; @@ -44,7 +43,8 @@ public FlinkOrcReader(Schema iSchema, TypeDescription readSchema) { } public FlinkOrcReader(Schema iSchema, TypeDescription readSchema, Map idToConstant) { - this.reader = OrcSchemaWithTypeVisitor.visit(iSchema, readSchema, new ReadBuilder(idToConstant)); + this.reader = + OrcSchemaWithTypeVisitor.visit(iSchema, readSchema, new ReadBuilder(idToConstant)); } @Override @@ -65,21 +65,26 @@ private ReadBuilder(Map idToConstant) { } @Override - public OrcValueReader record(Types.StructType iStruct, TypeDescription record, List names, - List> fields) { + public OrcValueReader record( + Types.StructType iStruct, + TypeDescription record, + List names, + List> fields) { return FlinkOrcReaders.struct(fields, iStruct, idToConstant); } @Override - public OrcValueReader list(Types.ListType iList, TypeDescription array, - OrcValueReader elementReader) { + public OrcValueReader list( + Types.ListType iList, TypeDescription array, OrcValueReader elementReader) { return FlinkOrcReaders.array(elementReader); } @Override - public OrcValueReader map(Types.MapType iMap, TypeDescription map, - OrcValueReader keyReader, - OrcValueReader valueReader) { + public OrcValueReader map( + Types.MapType iMap, + TypeDescription map, + OrcValueReader keyReader, + OrcValueReader valueReader) { return FlinkOrcReaders.map(keyReader, valueReader); } @@ -117,8 +122,9 @@ public OrcValueReader primitive(Type.PrimitiveType iPrimitive, TypeDescriptio Types.DecimalType decimalType = (Types.DecimalType) iPrimitive; return FlinkOrcReaders.decimals(decimalType.precision(), decimalType.scale()); default: - throw new IllegalArgumentException(String.format("Invalid iceberg type %s corresponding to ORC type %s", - iPrimitive, primitive)); + throw new IllegalArgumentException( + String.format( + "Invalid iceberg type %s corresponding to ORC type %s", iPrimitive, primitive)); } } } diff --git a/flink/v1.14/flink/src/main/java/org/apache/iceberg/flink/data/FlinkOrcReaders.java b/flink/v1.14/flink/src/main/java/org/apache/iceberg/flink/data/FlinkOrcReaders.java index 744a05eb2d21..7a4a15c7e600 100644 --- a/flink/v1.14/flink/src/main/java/org/apache/iceberg/flink/data/FlinkOrcReaders.java +++ b/flink/v1.14/flink/src/main/java/org/apache/iceberg/flink/data/FlinkOrcReaders.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.flink.data; import java.math.BigDecimal; @@ -50,8 +49,7 @@ import org.apache.orc.storage.serde2.io.HiveDecimalWritable; class FlinkOrcReaders { - private FlinkOrcReaders() { - } + private FlinkOrcReaders() {} static OrcValueReader strings() { return StringReader.INSTANCE; @@ -87,13 +85,13 @@ static OrcValueReader array(OrcValueReader elementReader) { return new ArrayReader<>(elementReader); } - public static OrcValueReader map(OrcValueReader keyReader, OrcValueReader valueReader) { + public static OrcValueReader map( + OrcValueReader keyReader, OrcValueReader valueReader) { return new MapReader<>(keyReader, valueReader); } - public static OrcValueReader struct(List> readers, - Types.StructType struct, - Map idToConstant) { + public static OrcValueReader struct( + List> readers, Types.StructType struct, Map idToConstant) { return new StructReader(readers, struct, idToConstant); } @@ -103,7 +101,8 @@ private static class StringReader implements OrcValueReader { @Override public StringData nonNullRead(ColumnVector vector, int row) { BytesColumnVector bytesVector = (BytesColumnVector) vector; - return StringData.fromBytes(bytesVector.vector[row], bytesVector.start[row], bytesVector.length[row]); + return StringData.fromBytes( + bytesVector.vector[row], bytesVector.start[row], bytesVector.length[row]); } } @@ -130,8 +129,12 @@ public DecimalData nonNullRead(ColumnVector vector, int row) { HiveDecimalWritable value = ((DecimalColumnVector) vector).vector[row]; // The hive ORC writer may will adjust the scale of decimal data. - Preconditions.checkArgument(value.precision() <= precision, - "Cannot read value as decimal(%s,%s), too large: %s", precision, scale, value); + Preconditions.checkArgument( + value.precision() <= precision, + "Cannot read value as decimal(%s,%s), too large: %s", + precision, + scale, + value); return DecimalData.fromUnscaledLong(value.serialize64(scale), precision, scale); } @@ -148,10 +151,15 @@ private static class Decimal38Reader implements OrcValueReader { @Override public DecimalData nonNullRead(ColumnVector vector, int row) { - BigDecimal value = ((DecimalColumnVector) vector).vector[row].getHiveDecimal().bigDecimalValue(); + BigDecimal value = + ((DecimalColumnVector) vector).vector[row].getHiveDecimal().bigDecimalValue(); - Preconditions.checkArgument(value.precision() <= precision, - "Cannot read value as decimal(%s,%s), too large: %s", precision, scale, value); + Preconditions.checkArgument( + value.precision() <= precision, + "Cannot read value as decimal(%s,%s), too large: %s", + precision, + scale, + value); return DecimalData.fromBigDecimal(value, precision, scale); } @@ -174,9 +182,10 @@ private static class TimestampReader implements OrcValueReader { @Override public TimestampData nonNullRead(ColumnVector vector, int row) { TimestampColumnVector tcv = (TimestampColumnVector) vector; - LocalDateTime localDate = Instant.ofEpochSecond(Math.floorDiv(tcv.time[row], 1_000), tcv.nanos[row]) - .atOffset(ZoneOffset.UTC) - .toLocalDateTime(); + LocalDateTime localDate = + Instant.ofEpochSecond(Math.floorDiv(tcv.time[row], 1_000), tcv.nanos[row]) + .atOffset(ZoneOffset.UTC) + .toLocalDateTime(); return TimestampData.fromLocalDateTime(localDate); } } @@ -187,9 +196,10 @@ private static class TimestampTzReader implements OrcValueReader @Override public TimestampData nonNullRead(ColumnVector vector, int row) { TimestampColumnVector tcv = (TimestampColumnVector) vector; - Instant instant = Instant.ofEpochSecond(Math.floorDiv(tcv.time[row], 
1_000), tcv.nanos[row]) - .atOffset(ZoneOffset.UTC) - .toInstant(); + Instant instant = + Instant.ofEpochSecond(Math.floorDiv(tcv.time[row], 1_000), tcv.nanos[row]) + .atOffset(ZoneOffset.UTC) + .toInstant(); return TimestampData.fromInstant(instant); } } @@ -254,7 +264,8 @@ public void setBatchContext(long batchOffsetInFile) { private static class StructReader extends OrcValueReaders.StructReader { private final int numFields; - StructReader(List> readers, Types.StructType struct, Map idToConstant) { + StructReader( + List> readers, Types.StructType struct, Map idToConstant) { super(readers, struct, idToConstant); this.numFields = struct.fields().size(); } diff --git a/flink/v1.14/flink/src/main/java/org/apache/iceberg/flink/data/FlinkOrcWriter.java b/flink/v1.14/flink/src/main/java/org/apache/iceberg/flink/data/FlinkOrcWriter.java index 2eeb268095f5..6a31accffd22 100644 --- a/flink/v1.14/flink/src/main/java/org/apache/iceberg/flink/data/FlinkOrcWriter.java +++ b/flink/v1.14/flink/src/main/java/org/apache/iceberg/flink/data/FlinkOrcWriter.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.flink.data; import java.util.Deque; @@ -40,7 +39,9 @@ public class FlinkOrcWriter implements OrcRowWriter { private final FlinkOrcWriters.RowDataWriter writer; private FlinkOrcWriter(RowType rowType, Schema iSchema) { - this.writer = (FlinkOrcWriters.RowDataWriter) FlinkSchemaVisitor.visit(rowType, iSchema, new WriteBuilder()); + this.writer = + (FlinkOrcWriters.RowDataWriter) + FlinkSchemaVisitor.visit(rowType, iSchema, new WriteBuilder()); } public static OrcRowWriter buildWriter(RowType rowType, Schema iSchema) { @@ -66,8 +67,7 @@ public Stream> metrics() { private static class WriteBuilder extends FlinkSchemaVisitor> { private final Deque fieldIds = Lists.newLinkedList(); - private WriteBuilder() { - } + private WriteBuilder() {} @Override public void beforeField(Types.NestedField field) { @@ -80,20 +80,24 @@ public void afterField(Types.NestedField field) { } @Override - public OrcValueWriter record(Types.StructType iStruct, - List> results, - List fieldType) { + public OrcValueWriter record( + Types.StructType iStruct, List> results, List fieldType) { return FlinkOrcWriters.struct(results, fieldType); } @Override - public OrcValueWriter map(Types.MapType iMap, OrcValueWriter key, OrcValueWriter value, - LogicalType keyType, LogicalType valueType) { + public OrcValueWriter map( + Types.MapType iMap, + OrcValueWriter key, + OrcValueWriter value, + LogicalType keyType, + LogicalType valueType) { return FlinkOrcWriters.map(key, value, keyType, valueType); } @Override - public OrcValueWriter list(Types.ListType iList, OrcValueWriter element, LogicalType elementType) { + public OrcValueWriter list( + Types.ListType iList, OrcValueWriter element, LogicalType elementType) { return FlinkOrcWriters.list(element, elementType); } @@ -113,14 +117,20 @@ public OrcValueWriter primitive(Type.PrimitiveType iPrimitive, LogicalType fl case LONG: return GenericOrcWriters.longs(); case FLOAT: - Preconditions.checkArgument(fieldIds.peek() != null, - String.format("[BUG] Cannot find field id for primitive field with type %s. This is likely because id " + - "information is not properly pushed during schema visiting.", iPrimitive)); + Preconditions.checkArgument( + fieldIds.peek() != null, + String.format( + "[BUG] Cannot find field id for primitive field with type %s. 
This is likely because id " + + "information is not properly pushed during schema visiting.", + iPrimitive)); return GenericOrcWriters.floats(fieldIds.peek()); case DOUBLE: - Preconditions.checkArgument(fieldIds.peek() != null, - String.format("[BUG] Cannot find field id for primitive field with type %s. This is likely because id " + - "information is not properly pushed during schema visiting.", iPrimitive)); + Preconditions.checkArgument( + fieldIds.peek() != null, + String.format( + "[BUG] Cannot find field id for primitive field with type %s. This is likely because id " + + "information is not properly pushed during schema visiting.", + iPrimitive)); return GenericOrcWriters.doubles(fieldIds.peek()); case DATE: return FlinkOrcWriters.dates(); @@ -143,8 +153,10 @@ public OrcValueWriter primitive(Type.PrimitiveType iPrimitive, LogicalType fl Types.DecimalType decimalType = (Types.DecimalType) iPrimitive; return FlinkOrcWriters.decimals(decimalType.precision(), decimalType.scale()); default: - throw new IllegalArgumentException(String.format( - "Invalid iceberg type %s corresponding to Flink logical type %s", iPrimitive, flinkPrimitive)); + throw new IllegalArgumentException( + String.format( + "Invalid iceberg type %s corresponding to Flink logical type %s", + iPrimitive, flinkPrimitive)); } } } diff --git a/flink/v1.14/flink/src/main/java/org/apache/iceberg/flink/data/FlinkOrcWriters.java b/flink/v1.14/flink/src/main/java/org/apache/iceberg/flink/data/FlinkOrcWriters.java index 2de5586a33fe..da2f95cf822f 100644 --- a/flink/v1.14/flink/src/main/java/org/apache/iceberg/flink/data/FlinkOrcWriters.java +++ b/flink/v1.14/flink/src/main/java/org/apache/iceberg/flink/data/FlinkOrcWriters.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.flink.data; import java.time.Instant; @@ -47,8 +46,7 @@ class FlinkOrcWriters { - private FlinkOrcWriters() { - } + private FlinkOrcWriters() {} static OrcValueWriter strings() { return StringWriter.INSTANCE; @@ -80,12 +78,16 @@ static OrcValueWriter decimals(int precision, int scale) { } } - static OrcValueWriter list(OrcValueWriter elementWriter, LogicalType elementType) { + static OrcValueWriter list( + OrcValueWriter elementWriter, LogicalType elementType) { return new ListWriter<>(elementWriter, elementType); } - static OrcValueWriter map(OrcValueWriter keyWriter, OrcValueWriter valueWriter, - LogicalType keyType, LogicalType valueType) { + static OrcValueWriter map( + OrcValueWriter keyWriter, + OrcValueWriter valueWriter, + LogicalType keyType, + LogicalType valueType) { return new MapWriter<>(keyWriter, valueWriter, keyType, valueType); } @@ -132,7 +134,8 @@ public void nonNullWrite(int rowId, TimestampData data, ColumnVector output) { cv.setIsUTC(true); // millis OffsetDateTime offsetDateTime = data.toInstant().atOffset(ZoneOffset.UTC); - cv.time[rowId] = offsetDateTime.toEpochSecond() * 1_000 + offsetDateTime.getNano() / 1_000_000; + cv.time[rowId] = + offsetDateTime.toEpochSecond() * 1_000 + offsetDateTime.getNano() / 1_000_000; // truncate nanos to only keep microsecond precision. 
cv.nanos[rowId] = (offsetDateTime.getNano() / 1_000) * 1_000; } @@ -164,12 +167,21 @@ private static class Decimal18Writer implements OrcValueWriter { @Override public void nonNullWrite(int rowId, DecimalData data, ColumnVector output) { - Preconditions.checkArgument(scale == data.scale(), - "Cannot write value as decimal(%s,%s), wrong scale: %s", precision, scale, data); - Preconditions.checkArgument(data.precision() <= precision, - "Cannot write value as decimal(%s,%s), too large: %s", precision, scale, data); - - ((DecimalColumnVector) output).vector[rowId].setFromLongAndScale(data.toUnscaledLong(), data.scale()); + Preconditions.checkArgument( + scale == data.scale(), + "Cannot write value as decimal(%s,%s), wrong scale: %s", + precision, + scale, + data); + Preconditions.checkArgument( + data.precision() <= precision, + "Cannot write value as decimal(%s,%s), too large: %s", + precision, + scale, + data); + + ((DecimalColumnVector) output) + .vector[rowId].setFromLongAndScale(data.toUnscaledLong(), data.scale()); } } @@ -184,12 +196,21 @@ private static class Decimal38Writer implements OrcValueWriter { @Override public void nonNullWrite(int rowId, DecimalData data, ColumnVector output) { - Preconditions.checkArgument(scale == data.scale(), - "Cannot write value as decimal(%s,%s), wrong scale: %s", precision, scale, data); - Preconditions.checkArgument(data.precision() <= precision, - "Cannot write value as decimal(%s,%s), too large: %s", precision, scale, data); - - ((DecimalColumnVector) output).vector[rowId].set(HiveDecimal.create(data.toBigDecimal(), false)); + Preconditions.checkArgument( + scale == data.scale(), + "Cannot write value as decimal(%s,%s), wrong scale: %s", + precision, + scale, + data); + Preconditions.checkArgument( + data.precision() <= precision, + "Cannot write value as decimal(%s,%s), too large: %s", + precision, + scale, + data); + + ((DecimalColumnVector) output) + .vector[rowId].set(HiveDecimal.create(data.toBigDecimal(), false)); } } @@ -222,7 +243,6 @@ public void nonNullWrite(int rowId, ArrayData data, ColumnVector output) { public Stream> metrics() { return elementWriter.metrics(); } - } static class MapWriter implements OrcValueWriter { @@ -231,8 +251,11 @@ static class MapWriter implements OrcValueWriter { private final ArrayData.ElementGetter keyGetter; private final ArrayData.ElementGetter valueGetter; - MapWriter(OrcValueWriter keyWriter, OrcValueWriter valueWriter, - LogicalType keyType, LogicalType valueType) { + MapWriter( + OrcValueWriter keyWriter, + OrcValueWriter valueWriter, + LogicalType keyType, + LogicalType valueType) { this.keyWriter = keyWriter; this.valueWriter = valueWriter; this.keyGetter = ArrayData.createElementGetter(keyType); @@ -283,7 +306,6 @@ static class RowDataWriter extends GenericOrcWriters.StructWriter { protected Object get(RowData struct, int index) { return fieldGetters.get(index).getFieldOrNull(struct); } - } private static void growColumnVector(ColumnVector cv, int requestedSize) { diff --git a/flink/v1.14/flink/src/main/java/org/apache/iceberg/flink/data/FlinkParquetReaders.java b/flink/v1.14/flink/src/main/java/org/apache/iceberg/flink/data/FlinkParquetReaders.java index 30184d899453..4189d0ae429b 100644 --- a/flink/v1.14/flink/src/main/java/org/apache/iceberg/flink/data/FlinkParquetReaders.java +++ b/flink/v1.14/flink/src/main/java/org/apache/iceberg/flink/data/FlinkParquetReaders.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.flink.data; import java.math.BigDecimal; @@ -57,20 +56,19 @@ import org.apache.parquet.schema.Type; public class FlinkParquetReaders { - private FlinkParquetReaders() { - } + private FlinkParquetReaders() {} - public static ParquetValueReader buildReader(Schema expectedSchema, MessageType fileSchema) { + public static ParquetValueReader buildReader( + Schema expectedSchema, MessageType fileSchema) { return buildReader(expectedSchema, fileSchema, ImmutableMap.of()); } @SuppressWarnings("unchecked") - public static ParquetValueReader buildReader(Schema expectedSchema, - MessageType fileSchema, - Map idToConstant) { - return (ParquetValueReader) TypeWithSchemaVisitor.visit(expectedSchema.asStruct(), fileSchema, - new ReadBuilder(fileSchema, idToConstant) - ); + public static ParquetValueReader buildReader( + Schema expectedSchema, MessageType fileSchema, Map idToConstant) { + return (ParquetValueReader) + TypeWithSchemaVisitor.visit( + expectedSchema.asStruct(), fileSchema, new ReadBuilder(fileSchema, idToConstant)); } private static class ReadBuilder extends TypeWithSchemaVisitor> { @@ -83,14 +81,14 @@ private static class ReadBuilder extends TypeWithSchemaVisitor message(Types.StructType expected, MessageType message, - List> fieldReaders) { + public ParquetValueReader message( + Types.StructType expected, MessageType message, List> fieldReaders) { return struct(expected, message.asGroupType(), fieldReaders); } @Override - public ParquetValueReader struct(Types.StructType expected, GroupType struct, - List> fieldReaders) { + public ParquetValueReader struct( + Types.StructType expected, GroupType struct, List> fieldReaders) { // match the expected struct's order Map> readersById = Maps.newHashMap(); Map typesById = Maps.newHashMap(); @@ -107,10 +105,10 @@ public ParquetValueReader struct(Types.StructType expected, GroupType s } } - List expectedFields = expected != null ? - expected.fields() : ImmutableList.of(); - List> reorderedFields = Lists.newArrayListWithExpectedSize( - expectedFields.size()); + List expectedFields = + expected != null ? 
expected.fields() : ImmutableList.of(); + List> reorderedFields = + Lists.newArrayListWithExpectedSize(expectedFields.size()); List types = Lists.newArrayListWithExpectedSize(expectedFields.size()); for (Types.NestedField field : expectedFields) { int id = field.fieldId(); @@ -140,8 +138,8 @@ public ParquetValueReader struct(Types.StructType expected, GroupType s } @Override - public ParquetValueReader list(Types.ListType expectedList, GroupType array, - ParquetValueReader elementReader) { + public ParquetValueReader list( + Types.ListType expectedList, GroupType array, ParquetValueReader elementReader) { if (expectedList == null) { return null; } @@ -154,13 +152,16 @@ public ParquetValueReader list(Types.ListType expectedList, GroupType array, Type elementType = ParquetSchemaUtil.determineListElementType(array); int elementD = type.getMaxDefinitionLevel(path(elementType.getName())) - 1; - return new ArrayReader<>(repeatedD, repeatedR, ParquetValueReaders.option(elementType, elementD, elementReader)); + return new ArrayReader<>( + repeatedD, repeatedR, ParquetValueReaders.option(elementType, elementD, elementReader)); } @Override - public ParquetValueReader map(Types.MapType expectedMap, GroupType map, - ParquetValueReader keyReader, - ParquetValueReader valueReader) { + public ParquetValueReader map( + Types.MapType expectedMap, + GroupType map, + ParquetValueReader keyReader, + ParquetValueReader valueReader) { if (expectedMap == null) { return null; } @@ -176,15 +177,17 @@ public ParquetValueReader map(Types.MapType expectedMap, GroupType map, Type valueType = repeatedKeyValue.getType(1); int valueD = type.getMaxDefinitionLevel(path(valueType.getName())) - 1; - return new MapReader<>(repeatedD, repeatedR, + return new MapReader<>( + repeatedD, + repeatedR, ParquetValueReaders.option(keyType, keyD, keyReader), ParquetValueReaders.option(valueType, valueD, valueReader)); } @Override @SuppressWarnings("CyclomaticComplexity") - public ParquetValueReader primitive(org.apache.iceberg.types.Type.PrimitiveType expected, - PrimitiveType primitive) { + public ParquetValueReader primitive( + org.apache.iceberg.types.Type.PrimitiveType expected, PrimitiveType primitive) { if (expected == null) { return null; } @@ -225,7 +228,8 @@ public ParquetValueReader primitive(org.apache.iceberg.types.Type.PrimitiveTy return new MillisToTimestampReader(desc); } case DECIMAL: - DecimalLogicalTypeAnnotation decimal = (DecimalLogicalTypeAnnotation) primitive.getLogicalTypeAnnotation(); + DecimalLogicalTypeAnnotation decimal = + (DecimalLogicalTypeAnnotation) primitive.getLogicalTypeAnnotation(); switch (primitive.getPrimitiveTypeName()) { case BINARY: case FIXED_LEN_BYTE_ARRAY: @@ -272,7 +276,8 @@ public ParquetValueReader primitive(org.apache.iceberg.types.Type.PrimitiveTy } } - private static class BinaryDecimalReader extends ParquetValueReaders.PrimitiveReader { + private static class BinaryDecimalReader + extends ParquetValueReaders.PrimitiveReader { private final int precision; private final int scale; @@ -291,7 +296,8 @@ public DecimalData read(DecimalData ignored) { } } - private static class IntegerDecimalReader extends ParquetValueReaders.PrimitiveReader { + private static class IntegerDecimalReader + extends ParquetValueReaders.PrimitiveReader { private final int precision; private final int scale; @@ -323,7 +329,8 @@ public DecimalData read(DecimalData ignored) { } } - private static class MicrosToTimestampTzReader extends ParquetValueReaders.UnboxedReader { + private static class 
MicrosToTimestampTzReader + extends ParquetValueReaders.UnboxedReader { MicrosToTimestampTzReader(ColumnDescriptor desc) { super(desc); } @@ -331,10 +338,11 @@ private static class MicrosToTimestampTzReader extends ParquetValueReaders.Unbox @Override public TimestampData read(TimestampData ignored) { long value = readLong(); - return TimestampData.fromLocalDateTime(Instant.ofEpochSecond(Math.floorDiv(value, 1000_000), - Math.floorMod(value, 1000_000) * 1000) - .atOffset(ZoneOffset.UTC) - .toLocalDateTime()); + return TimestampData.fromLocalDateTime( + Instant.ofEpochSecond( + Math.floorDiv(value, 1000_000), Math.floorMod(value, 1000_000) * 1000) + .atOffset(ZoneOffset.UTC) + .toLocalDateTime()); } @Override @@ -343,7 +351,8 @@ public long readLong() { } } - private static class MicrosToTimestampReader extends ParquetValueReaders.UnboxedReader { + private static class MicrosToTimestampReader + extends ParquetValueReaders.UnboxedReader { MicrosToTimestampReader(ColumnDescriptor desc) { super(desc); } @@ -351,8 +360,9 @@ private static class MicrosToTimestampReader extends ParquetValueReaders.Unboxed @Override public TimestampData read(TimestampData ignored) { long value = readLong(); - return TimestampData.fromInstant(Instant.ofEpochSecond(Math.floorDiv(value, 1000_000), - Math.floorMod(value, 1000_000) * 1000)); + return TimestampData.fromInstant( + Instant.ofEpochSecond( + Math.floorDiv(value, 1000_000), Math.floorMod(value, 1000_000) * 1000)); } @Override @@ -361,7 +371,8 @@ public long readLong() { } } - private static class MillisToTimestampReader extends ParquetValueReaders.UnboxedReader { + private static class MillisToTimestampReader + extends ParquetValueReaders.UnboxedReader { MillisToTimestampReader(ColumnDescriptor desc) { super(desc); } @@ -378,7 +389,8 @@ public long readLong() { } } - private static class MillisToTimestampTzReader extends ParquetValueReaders.UnboxedReader { + private static class MillisToTimestampTzReader + extends ParquetValueReaders.UnboxedReader { MillisToTimestampTzReader(ColumnDescriptor desc) { super(desc); } @@ -386,9 +398,8 @@ private static class MillisToTimestampTzReader extends ParquetValueReaders.Unbox @Override public TimestampData read(TimestampData ignored) { long millis = readLong(); - return TimestampData.fromLocalDateTime(Instant.ofEpochMilli(millis) - .atOffset(ZoneOffset.UTC) - .toLocalDateTime()); + return TimestampData.fromLocalDateTime( + Instant.ofEpochMilli(millis).atOffset(ZoneOffset.UTC).toLocalDateTime()); } @Override @@ -415,7 +426,8 @@ public StringData read(StringData ignored) { } } - private static class LossyMicrosToMillisTimeReader extends ParquetValueReaders.PrimitiveReader { + private static class LossyMicrosToMillisTimeReader + extends ParquetValueReaders.PrimitiveReader { LossyMicrosToMillisTimeReader(ColumnDescriptor desc) { super(desc); } @@ -438,7 +450,8 @@ public Integer read(Integer reuse) { } } - private static class ArrayReader extends ParquetValueReaders.RepeatedReader { + private static class ArrayReader + extends ParquetValueReaders.RepeatedReader { private int readPos = 0; private int writePos = 0; @@ -484,23 +497,29 @@ protected void addElement(ReusableArrayData reused, E element) { @Override protected ArrayData buildList(ReusableArrayData list) { - // Since ReusableArrayData is not accepted by Flink, use GenericArrayData temporarily to walk around it. + // Since ReusableArrayData is not accepted by Flink, use GenericArrayData temporarily to walk + // around it. 
// Revert this to use ReusableArrayData once it is fixed in Flink. // For your reference, https://issues.apache.org/jira/browse/FLINK-25238. return new GenericArrayData(Arrays.copyOf(list.values, writePos)); } } - private static class MapReader extends - ParquetValueReaders.RepeatedKeyValueReader { + private static class MapReader + extends ParquetValueReaders.RepeatedKeyValueReader { private int readPos = 0; private int writePos = 0; - private final ParquetValueReaders.ReusableEntry entry = new ParquetValueReaders.ReusableEntry<>(); - private final ParquetValueReaders.ReusableEntry nullEntry = new ParquetValueReaders.ReusableEntry<>(); + private final ParquetValueReaders.ReusableEntry entry = + new ParquetValueReaders.ReusableEntry<>(); + private final ParquetValueReaders.ReusableEntry nullEntry = + new ParquetValueReaders.ReusableEntry<>(); - MapReader(int definitionLevel, int repetitionLevel, - ParquetValueReader keyReader, ParquetValueReader valueReader) { + MapReader( + int definitionLevel, + int repetitionLevel, + ParquetValueReader keyReader, + ParquetValueReader valueReader) { super(definitionLevel, repetitionLevel, keyReader, valueReader); } @@ -549,7 +568,8 @@ protected MapData buildMap(ReusableMapData map) { } } - private static class RowDataReader extends ParquetValueReaders.StructReader { + private static class RowDataReader + extends ParquetValueReaders.StructReader { private final int numFields; RowDataReader(List types, List> readers) { diff --git a/flink/v1.14/flink/src/main/java/org/apache/iceberg/flink/data/FlinkParquetWriters.java b/flink/v1.14/flink/src/main/java/org/apache/iceberg/flink/data/FlinkParquetWriters.java index 6154ef1cfa2b..db4f1730a134 100644 --- a/flink/v1.14/flink/src/main/java/org/apache/iceberg/flink/data/FlinkParquetWriters.java +++ b/flink/v1.14/flink/src/main/java/org/apache/iceberg/flink/data/FlinkParquetWriters.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.flink.data; import java.util.Iterator; @@ -52,12 +51,12 @@ import org.apache.parquet.schema.Type; public class FlinkParquetWriters { - private FlinkParquetWriters() { - } + private FlinkParquetWriters() {} @SuppressWarnings("unchecked") public static ParquetValueWriter buildWriter(LogicalType schema, MessageType type) { - return (ParquetValueWriter) ParquetWithFlinkSchemaVisitor.visit(schema, type, new WriteBuilder(type)); + return (ParquetValueWriter) + ParquetWithFlinkSchemaVisitor.visit(schema, type, new WriteBuilder(type)); } private static class WriteBuilder extends ParquetWithFlinkSchemaVisitor> { @@ -68,13 +67,14 @@ private static class WriteBuilder extends ParquetWithFlinkSchemaVisitor message(RowType sStruct, MessageType message, List> fields) { + public ParquetValueWriter message( + RowType sStruct, MessageType message, List> fields) { return struct(sStruct, message.asGroupType(), fields); } @Override - public ParquetValueWriter struct(RowType sStruct, GroupType struct, - List> fieldWriters) { + public ParquetValueWriter struct( + RowType sStruct, GroupType struct, List> fieldWriters) { List fields = struct.getFields(); List flinkFields = sStruct.getFields(); List> writers = Lists.newArrayListWithExpectedSize(fieldWriters.size()); @@ -88,34 +88,42 @@ public ParquetValueWriter struct(RowType sStruct, GroupType struct, } @Override - public ParquetValueWriter list(ArrayType sArray, GroupType array, ParquetValueWriter elementWriter) { + public ParquetValueWriter list( + ArrayType sArray, GroupType array, ParquetValueWriter elementWriter) { GroupType repeated = array.getFields().get(0).asGroupType(); String[] repeatedPath = currentPath(); int repeatedD = type.getMaxDefinitionLevel(repeatedPath); int repeatedR = type.getMaxRepetitionLevel(repeatedPath); - return new ArrayDataWriter<>(repeatedD, repeatedR, + return new ArrayDataWriter<>( + repeatedD, + repeatedR, newOption(repeated.getType(0), elementWriter), sArray.getElementType()); } @Override - public ParquetValueWriter map(MapType sMap, GroupType map, - ParquetValueWriter keyWriter, ParquetValueWriter valueWriter) { + public ParquetValueWriter map( + MapType sMap, + GroupType map, + ParquetValueWriter keyWriter, + ParquetValueWriter valueWriter) { GroupType repeatedKeyValue = map.getFields().get(0).asGroupType(); String[] repeatedPath = currentPath(); int repeatedD = type.getMaxDefinitionLevel(repeatedPath); int repeatedR = type.getMaxRepetitionLevel(repeatedPath); - return new MapDataWriter<>(repeatedD, repeatedR, + return new MapDataWriter<>( + repeatedD, + repeatedR, newOption(repeatedKeyValue.getType(0), keyWriter), newOption(repeatedKeyValue.getType(1), valueWriter), - sMap.getKeyType(), sMap.getValueType()); + sMap.getKeyType(), + sMap.getValueType()); } - private ParquetValueWriter newOption(Type fieldType, ParquetValueWriter writer) { int maxD = type.getMaxDefinitionLevel(path(fieldType.getName())); return ParquetValueWriters.option(fieldType, maxD, writer); @@ -143,7 +151,8 @@ public ParquetValueWriter primitive(LogicalType fType, PrimitiveType primitiv case TIMESTAMP_MICROS: return timestamps(desc); case DECIMAL: - DecimalLogicalTypeAnnotation decimal = (DecimalLogicalTypeAnnotation) primitive.getLogicalTypeAnnotation(); + DecimalLogicalTypeAnnotation decimal = + (DecimalLogicalTypeAnnotation) primitive.getLogicalTypeAnnotation(); switch (primitive.getPrimitiveTypeName()) { case INT32: return decimalAsInteger(desc, decimal.getPrecision(), decimal.getScale()); @@ -184,7 +193,8 @@ public 
ParquetValueWriter primitive(LogicalType fType, PrimitiveType primitiv } } - private static ParquetValueWriters.PrimitiveWriter ints(LogicalType type, ColumnDescriptor desc) { + private static ParquetValueWriters.PrimitiveWriter ints( + LogicalType type, ColumnDescriptor desc) { if (type instanceof TinyIntType) { return ParquetValueWriters.tinyints(desc); } else if (type instanceof SmallIntType) { @@ -201,26 +211,33 @@ private static ParquetValueWriters.PrimitiveWriter timeMicros(ColumnDes return new TimeMicrosWriter(desc); } - private static ParquetValueWriters.PrimitiveWriter decimalAsInteger(ColumnDescriptor desc, - int precision, int scale) { - Preconditions.checkArgument(precision <= 9, "Cannot write decimal value as integer with precision larger than 9," + - " wrong precision %s", precision); + private static ParquetValueWriters.PrimitiveWriter decimalAsInteger( + ColumnDescriptor desc, int precision, int scale) { + Preconditions.checkArgument( + precision <= 9, + "Cannot write decimal value as integer with precision larger than 9," + + " wrong precision %s", + precision); return new IntegerDecimalWriter(desc, precision, scale); } - private static ParquetValueWriters.PrimitiveWriter decimalAsLong(ColumnDescriptor desc, - int precision, int scale) { - Preconditions.checkArgument(precision <= 18, "Cannot write decimal value as long with precision larger than 18, " + - " wrong precision %s", precision); + private static ParquetValueWriters.PrimitiveWriter decimalAsLong( + ColumnDescriptor desc, int precision, int scale) { + Preconditions.checkArgument( + precision <= 18, + "Cannot write decimal value as long with precision larger than 18, " + + " wrong precision %s", + precision); return new LongDecimalWriter(desc, precision, scale); } - private static ParquetValueWriters.PrimitiveWriter decimalAsFixed(ColumnDescriptor desc, - int precision, int scale) { + private static ParquetValueWriters.PrimitiveWriter decimalAsFixed( + ColumnDescriptor desc, int precision, int scale) { return new FixedDecimalWriter(desc, precision, scale); } - private static ParquetValueWriters.PrimitiveWriter timestamps(ColumnDescriptor desc) { + private static ParquetValueWriters.PrimitiveWriter timestamps( + ColumnDescriptor desc) { return new TimestampDataWriter(desc); } @@ -251,7 +268,8 @@ public void write(int repetitionLevel, Integer value) { } } - private static class IntegerDecimalWriter extends ParquetValueWriters.PrimitiveWriter { + private static class IntegerDecimalWriter + extends ParquetValueWriters.PrimitiveWriter { private final int precision; private final int scale; @@ -263,10 +281,18 @@ private IntegerDecimalWriter(ColumnDescriptor desc, int precision, int scale) { @Override public void write(int repetitionLevel, DecimalData decimal) { - Preconditions.checkArgument(decimal.scale() == scale, - "Cannot write value as decimal(%s,%s), wrong scale: %s", precision, scale, decimal); - Preconditions.checkArgument(decimal.precision() <= precision, - "Cannot write value as decimal(%s,%s), too large: %s", precision, scale, decimal); + Preconditions.checkArgument( + decimal.scale() == scale, + "Cannot write value as decimal(%s,%s), wrong scale: %s", + precision, + scale, + decimal); + Preconditions.checkArgument( + decimal.precision() <= precision, + "Cannot write value as decimal(%s,%s), too large: %s", + precision, + scale, + decimal); column.writeInteger(repetitionLevel, (int) decimal.toUnscaledLong()); } @@ -284,10 +310,18 @@ private LongDecimalWriter(ColumnDescriptor desc, int precision, int scale) { 
@Override public void write(int repetitionLevel, DecimalData decimal) { - Preconditions.checkArgument(decimal.scale() == scale, - "Cannot write value as decimal(%s,%s), wrong scale: %s", precision, scale, decimal); - Preconditions.checkArgument(decimal.precision() <= precision, - "Cannot write value as decimal(%s,%s), too large: %s", precision, scale, decimal); + Preconditions.checkArgument( + decimal.scale() == scale, + "Cannot write value as decimal(%s,%s), wrong scale: %s", + precision, + scale, + decimal); + Preconditions.checkArgument( + decimal.precision() <= precision, + "Cannot write value as decimal(%s,%s), too large: %s", + precision, + scale, + decimal); column.writeLong(repetitionLevel, decimal.toUnscaledLong()); } @@ -302,24 +336,28 @@ private FixedDecimalWriter(ColumnDescriptor desc, int precision, int scale) { super(desc); this.precision = precision; this.scale = scale; - this.bytes = ThreadLocal.withInitial(() -> new byte[TypeUtil.decimalRequiredBytes(precision)]); + this.bytes = + ThreadLocal.withInitial(() -> new byte[TypeUtil.decimalRequiredBytes(precision)]); } @Override public void write(int repetitionLevel, DecimalData decimal) { - byte[] binary = DecimalUtil.toReusedFixLengthBytes(precision, scale, decimal.toBigDecimal(), bytes.get()); + byte[] binary = + DecimalUtil.toReusedFixLengthBytes(precision, scale, decimal.toBigDecimal(), bytes.get()); column.writeBinary(repetitionLevel, Binary.fromReusedByteArray(binary)); } } - private static class TimestampDataWriter extends ParquetValueWriters.PrimitiveWriter { + private static class TimestampDataWriter + extends ParquetValueWriters.PrimitiveWriter { private TimestampDataWriter(ColumnDescriptor desc) { super(desc); } @Override public void write(int repetitionLevel, TimestampData value) { - column.writeLong(repetitionLevel, value.getMillisecond() * 1000 + value.getNanoOfMillisecond() / 1000); + column.writeLong( + repetitionLevel, value.getMillisecond() * 1000 + value.getNanoOfMillisecond() / 1000); } } @@ -337,8 +375,11 @@ public void write(int repetitionLevel, byte[] bytes) { private static class ArrayDataWriter extends ParquetValueWriters.RepeatedWriter { private final LogicalType elementType; - private ArrayDataWriter(int definitionLevel, int repetitionLevel, - ParquetValueWriter writer, LogicalType elementType) { + private ArrayDataWriter( + int definitionLevel, + int repetitionLevel, + ParquetValueWriter writer, + LogicalType elementType) { super(definitionLevel, repetitionLevel, writer); this.elementType = elementType; } @@ -381,13 +422,18 @@ public E next() { } } - private static class MapDataWriter extends ParquetValueWriters.RepeatedKeyValueWriter { + private static class MapDataWriter + extends ParquetValueWriters.RepeatedKeyValueWriter { private final LogicalType keyType; private final LogicalType valueType; - private MapDataWriter(int definitionLevel, int repetitionLevel, - ParquetValueWriter keyWriter, ParquetValueWriter valueWriter, - LogicalType keyType, LogicalType valueType) { + private MapDataWriter( + int definitionLevel, + int repetitionLevel, + ParquetValueWriter keyWriter, + ParquetValueWriter valueWriter, + LogicalType keyType, + LogicalType valueType) { super(definitionLevel, repetitionLevel, keyWriter, valueWriter); this.keyType = keyType; this.valueType = valueType; @@ -429,7 +475,9 @@ public Map.Entry next() { throw new NoSuchElementException(); } - entry.set((K) keyGetter.getElementOrNull(keys, index), (V) valueGetter.getElementOrNull(values, index)); + entry.set( + (K) 
keyGetter.getElementOrNull(keys, index), + (V) valueGetter.getElementOrNull(values, index)); index += 1; return entry; diff --git a/flink/v1.14/flink/src/main/java/org/apache/iceberg/flink/data/FlinkSchemaVisitor.java b/flink/v1.14/flink/src/main/java/org/apache/iceberg/flink/data/FlinkSchemaVisitor.java index 0909e1b53a85..ba4e1a7a7aec 100644 --- a/flink/v1.14/flink/src/main/java/org/apache/iceberg/flink/data/FlinkSchemaVisitor.java +++ b/flink/v1.14/flink/src/main/java/org/apache/iceberg/flink/data/FlinkSchemaVisitor.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.flink.data; import java.util.List; @@ -85,8 +84,8 @@ private static T visit(LogicalType flinkType, Type iType, FlinkSchemaVisitor } } - private static T visitRecord(LogicalType flinkType, Types.StructType struct, - FlinkSchemaVisitor visitor) { + private static T visitRecord( + LogicalType flinkType, Types.StructType struct, FlinkSchemaVisitor visitor) { Preconditions.checkArgument(flinkType instanceof RowType, "%s is not a RowType.", flinkType); RowType rowType = (RowType) flinkType; @@ -98,8 +97,8 @@ private static T visitRecord(LogicalType flinkType, Types.StructType struct, for (int i = 0; i < fieldSize; i++) { Types.NestedField iField = nestedFields.get(i); int fieldIndex = rowType.getFieldIndex(iField.name()); - Preconditions.checkArgument(fieldIndex >= 0, - "NestedField: %s is not found in flink RowType: %s", iField, rowType); + Preconditions.checkArgument( + fieldIndex >= 0, "NestedField: %s is not found in flink RowType: %s", iField, rowType); LogicalType fieldFlinkType = rowType.getTypeAt(fieldIndex); @@ -132,11 +131,9 @@ public T primitive(Type.PrimitiveType iPrimitive, LogicalType flinkPrimitive) { return null; } - public void beforeField(Types.NestedField field) { - } + public void beforeField(Types.NestedField field) {} - public void afterField(Types.NestedField field) { - } + public void afterField(Types.NestedField field) {} public void beforeListElement(Types.NestedField elementField) { beforeField(elementField); diff --git a/flink/v1.14/flink/src/main/java/org/apache/iceberg/flink/data/FlinkValueReaders.java b/flink/v1.14/flink/src/main/java/org/apache/iceberg/flink/data/FlinkValueReaders.java index 1b7a98f7dc8f..32f6c3a2ccfd 100644 --- a/flink/v1.14/flink/src/main/java/org/apache/iceberg/flink/data/FlinkValueReaders.java +++ b/flink/v1.14/flink/src/main/java/org/apache/iceberg/flink/data/FlinkValueReaders.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.flink.data; import java.io.IOException; @@ -44,8 +43,7 @@ public class FlinkValueReaders { - private FlinkValueReaders() { - } + private FlinkValueReaders() {} static ValueReader strings() { return StringReader.INSTANCE; @@ -71,7 +69,8 @@ static ValueReader timestampMicros() { return TimestampMicrosReader.INSTANCE; } - static ValueReader decimal(ValueReader unscaledReader, int precision, int scale) { + static ValueReader decimal( + ValueReader unscaledReader, int precision, int scale) { return new DecimalReader(unscaledReader, precision, scale); } @@ -79,8 +78,7 @@ static ValueReader array(ValueReader elementReader) { return new ArrayReader(elementReader); } - static ValueReader arrayMap(ValueReader keyReader, - ValueReader valueReader) { + static ValueReader arrayMap(ValueReader keyReader, ValueReader valueReader) { return new ArrayMapReader(keyReader, valueReader); } @@ -88,16 +86,15 @@ static ValueReader map(ValueReader keyReader, ValueReader valueRe return new MapReader(keyReader, valueReader); } - static ValueReader struct(List> readers, Types.StructType struct, - Map idToConstant) { + static ValueReader struct( + List> readers, Types.StructType struct, Map idToConstant) { return new StructReader(readers, struct, idToConstant); } private static class StringReader implements ValueReader { private static final StringReader INSTANCE = new StringReader(); - private StringReader() { - } + private StringReader() {} @Override public StringData read(Decoder decoder, Object reuse) throws IOException { @@ -143,7 +140,8 @@ private DecimalReader(ValueReader bytesReader, int precision, int scale) @Override public DecimalData read(Decoder decoder, Object reuse) throws IOException { byte[] bytes = bytesReader.read(decoder, null); - return DecimalData.fromBigDecimal(new BigDecimal(new BigInteger(bytes), scale), precision, scale); + return DecimalData.fromBigDecimal( + new BigDecimal(new BigInteger(bytes), scale), precision, scale); } } @@ -287,7 +285,8 @@ public MapData read(Decoder decoder, Object reuse) throws IOException { private static class StructReader extends ValueReaders.StructReader { private final int numFields; - private StructReader(List> readers, Types.StructType struct, Map idToConstant) { + private StructReader( + List> readers, Types.StructType struct, Map idToConstant) { super(readers, struct, idToConstant); this.numFields = readers.size(); } diff --git a/flink/v1.14/flink/src/main/java/org/apache/iceberg/flink/data/FlinkValueWriters.java b/flink/v1.14/flink/src/main/java/org/apache/iceberg/flink/data/FlinkValueWriters.java index 517d7d8e1527..4e86ecce28b5 100644 --- a/flink/v1.14/flink/src/main/java/org/apache/iceberg/flink/data/FlinkValueWriters.java +++ b/flink/v1.14/flink/src/main/java/org/apache/iceberg/flink/data/FlinkValueWriters.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.flink.data; import java.io.IOException; @@ -37,8 +36,7 @@ public class FlinkValueWriters { - private FlinkValueWriters() { - } + private FlinkValueWriters() {} static ValueWriter strings() { return StringWriter.INSTANCE; @@ -60,13 +58,19 @@ static ValueWriter array(ValueWriter elementWriter, LogicalTyp return new ArrayWriter<>(elementWriter, elementType); } - static ValueWriter arrayMap(ValueWriter keyWriter, LogicalType keyType, - ValueWriter valueWriter, LogicalType valueType) { + static ValueWriter arrayMap( + ValueWriter keyWriter, + LogicalType keyType, + ValueWriter valueWriter, + LogicalType valueType) { return new ArrayMapWriter<>(keyWriter, keyType, valueWriter, valueType); } - static ValueWriter map(ValueWriter keyWriter, LogicalType keyType, - ValueWriter valueWriter, LogicalType valueType) { + static ValueWriter map( + ValueWriter keyWriter, + LogicalType keyType, + ValueWriter valueWriter, + LogicalType valueType) { return new MapWriter<>(keyWriter, keyType, valueWriter, valueType); } @@ -77,8 +81,7 @@ static ValueWriter row(List> writers, List private static class StringWriter implements ValueWriter { private static final StringWriter INSTANCE = new StringWriter(); - private StringWriter() { - } + private StringWriter() {} @Override public void write(StringData s, Encoder encoder) throws IOException { @@ -95,12 +98,14 @@ private static class DecimalWriter implements ValueWriter { private DecimalWriter(int precision, int scale) { this.precision = precision; this.scale = scale; - this.bytes = ThreadLocal.withInitial(() -> new byte[TypeUtil.decimalRequiredBytes(precision)]); + this.bytes = + ThreadLocal.withInitial(() -> new byte[TypeUtil.decimalRequiredBytes(precision)]); } @Override public void write(DecimalData d, Encoder encoder) throws IOException { - encoder.writeFixed(DecimalUtil.toReusedFixLengthBytes(precision, scale, d.toBigDecimal(), bytes.get())); + encoder.writeFixed( + DecimalUtil.toReusedFixLengthBytes(precision, scale, d.toBigDecimal(), bytes.get())); } } @@ -118,7 +123,8 @@ private static class TimestampMicrosWriter implements ValueWriter @Override public void write(TimestampData timestampData, Encoder encoder) throws IOException { - long micros = timestampData.getMillisecond() * 1000 + timestampData.getNanoOfMillisecond() / 1000; + long micros = + timestampData.getMillisecond() * 1000 + timestampData.getNanoOfMillisecond() / 1000; encoder.writeLong(micros); } } @@ -152,8 +158,11 @@ private static class ArrayMapWriter implements ValueWriter { private final ArrayData.ElementGetter keyGetter; private final ArrayData.ElementGetter valueGetter; - private ArrayMapWriter(ValueWriter keyWriter, LogicalType keyType, - ValueWriter valueWriter, LogicalType valueType) { + private ArrayMapWriter( + ValueWriter keyWriter, + LogicalType keyType, + ValueWriter valueWriter, + LogicalType valueType) { this.keyWriter = keyWriter; this.keyGetter = ArrayData.createElementGetter(keyType); this.valueWriter = valueWriter; @@ -183,8 +192,11 @@ private static class MapWriter implements ValueWriter { private final ArrayData.ElementGetter keyGetter; private final ArrayData.ElementGetter valueGetter; - private MapWriter(ValueWriter keyWriter, LogicalType keyType, - ValueWriter valueWriter, LogicalType valueType) { + private MapWriter( + ValueWriter keyWriter, + LogicalType keyType, + ValueWriter valueWriter, + LogicalType valueType) { this.keyWriter = keyWriter; this.keyGetter = ArrayData.createElementGetter(keyType); this.valueWriter = valueWriter; diff 
--git a/flink/v1.14/flink/src/main/java/org/apache/iceberg/flink/data/ParquetWithFlinkSchemaVisitor.java b/flink/v1.14/flink/src/main/java/org/apache/iceberg/flink/data/ParquetWithFlinkSchemaVisitor.java index 87501891a68c..7a8f61fb13e3 100644 --- a/flink/v1.14/flink/src/main/java/org/apache/iceberg/flink/data/ParquetWithFlinkSchemaVisitor.java +++ b/flink/v1.14/flink/src/main/java/org/apache/iceberg/flink/data/ParquetWithFlinkSchemaVisitor.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.flink.data; import java.util.Deque; @@ -38,12 +37,15 @@ public class ParquetWithFlinkSchemaVisitor { private final Deque fieldNames = Lists.newLinkedList(); - public static T visit(LogicalType sType, Type type, ParquetWithFlinkSchemaVisitor visitor) { + public static T visit( + LogicalType sType, Type type, ParquetWithFlinkSchemaVisitor visitor) { Preconditions.checkArgument(sType != null, "Invalid DataType: null"); if (type instanceof MessageType) { - Preconditions.checkArgument(sType instanceof RowType, "Invalid struct: %s is not a struct", sType); + Preconditions.checkArgument( + sType instanceof RowType, "Invalid struct: %s is not a struct", sType); RowType struct = (RowType) sType; - return visitor.message(struct, (MessageType) type, visitFields(struct, type.asGroupType(), visitor)); + return visitor.message( + struct, (MessageType) type, visitFields(struct, type.asGroupType(), visitor)); } else if (type.isPrimitive()) { return visitor.primitive(sType, type.asPrimitiveType()); } else { @@ -53,19 +55,26 @@ public static T visit(LogicalType sType, Type type, ParquetWithFlinkSchemaVi if (annotation != null) { switch (annotation) { case LIST: - Preconditions.checkArgument(group.getFieldCount() == 1, - "Invalid list: does not contain single repeated field: %s", group); + Preconditions.checkArgument( + group.getFieldCount() == 1, + "Invalid list: does not contain single repeated field: %s", + group); GroupType repeatedElement = group.getFields().get(0).asGroupType(); - Preconditions.checkArgument(repeatedElement.isRepetition(Type.Repetition.REPEATED), + Preconditions.checkArgument( + repeatedElement.isRepetition(Type.Repetition.REPEATED), "Invalid list: inner group is not repeated"); - Preconditions.checkArgument(repeatedElement.getFieldCount() <= 1, - "Invalid list: repeated group is not a single field: %s", group); + Preconditions.checkArgument( + repeatedElement.getFieldCount() <= 1, + "Invalid list: repeated group is not a single field: %s", + group); - Preconditions.checkArgument(sType instanceof ArrayType, "Invalid list: %s is not an array", sType); + Preconditions.checkArgument( + sType instanceof ArrayType, "Invalid list: %s is not an array", sType); ArrayType array = (ArrayType) sType; - RowType.RowField element = new RowField( - "element", array.getElementType(), "element of " + array.asSummaryString()); + RowType.RowField element = + new RowField( + "element", array.getElementType(), "element of " + array.asSummaryString()); visitor.fieldNames.push(repeatedElement.getName()); try { @@ -81,22 +90,30 @@ public static T visit(LogicalType sType, Type type, ParquetWithFlinkSchemaVi } case MAP: - Preconditions.checkArgument(!group.isRepetition(Type.Repetition.REPEATED), - "Invalid map: top-level group is repeated: %s", group); - Preconditions.checkArgument(group.getFieldCount() == 1, - "Invalid map: does not contain single repeated field: %s", group); + Preconditions.checkArgument( + 
!group.isRepetition(Type.Repetition.REPEATED), + "Invalid map: top-level group is repeated: %s", + group); + Preconditions.checkArgument( + group.getFieldCount() == 1, + "Invalid map: does not contain single repeated field: %s", + group); GroupType repeatedKeyValue = group.getType(0).asGroupType(); - Preconditions.checkArgument(repeatedKeyValue.isRepetition(Type.Repetition.REPEATED), + Preconditions.checkArgument( + repeatedKeyValue.isRepetition(Type.Repetition.REPEATED), "Invalid map: inner group is not repeated"); - Preconditions.checkArgument(repeatedKeyValue.getFieldCount() <= 2, + Preconditions.checkArgument( + repeatedKeyValue.getFieldCount() <= 2, "Invalid map: repeated group does not have 2 fields"); - Preconditions.checkArgument(sType instanceof MapType, "Invalid map: %s is not a map", sType); + Preconditions.checkArgument( + sType instanceof MapType, "Invalid map: %s is not a map", sType); MapType map = (MapType) sType; - RowField keyField = new RowField("key", map.getKeyType(), "key of " + map.asSummaryString()); - RowField valueField = new RowField( - "value", map.getValueType(), "value of " + map.asSummaryString()); + RowField keyField = + new RowField("key", map.getKeyType(), "key of " + map.asSummaryString()); + RowField valueField = + new RowField("value", map.getValueType(), "value of " + map.asSummaryString()); visitor.fieldNames.push(repeatedKeyValue.getName()); try { @@ -132,13 +149,15 @@ public static T visit(LogicalType sType, Type type, ParquetWithFlinkSchemaVi default: } } - Preconditions.checkArgument(sType instanceof RowType, "Invalid struct: %s is not a struct", sType); + Preconditions.checkArgument( + sType instanceof RowType, "Invalid struct: %s is not a struct", sType); RowType struct = (RowType) sType; return visitor.struct(struct, group, visitFields(struct, group, visitor)); } } - private static T visitField(RowType.RowField sField, Type field, ParquetWithFlinkSchemaVisitor visitor) { + private static T visitField( + RowType.RowField sField, Type field, ParquetWithFlinkSchemaVisitor visitor) { visitor.fieldNames.push(field.getName()); try { return visit(sField.getType(), field, visitor); @@ -147,17 +166,20 @@ private static T visitField(RowType.RowField sField, Type field, ParquetWith } } - private static List visitFields(RowType struct, GroupType group, - ParquetWithFlinkSchemaVisitor visitor) { + private static List visitFields( + RowType struct, GroupType group, ParquetWithFlinkSchemaVisitor visitor) { List sFields = struct.getFields(); - Preconditions.checkArgument(sFields.size() == group.getFieldCount(), - "Structs do not match: %s and %s", struct, group); + Preconditions.checkArgument( + sFields.size() == group.getFieldCount(), "Structs do not match: %s and %s", struct, group); List results = Lists.newArrayListWithExpectedSize(group.getFieldCount()); for (int i = 0; i < sFields.size(); i += 1) { Type field = group.getFields().get(i); RowType.RowField sField = sFields.get(i); - Preconditions.checkArgument(field.getName().equals(AvroSchemaUtil.makeCompatibleName(sField.getName())), - "Structs do not match: field %s != %s", field.getName(), sField.getName()); + Preconditions.checkArgument( + field.getName().equals(AvroSchemaUtil.makeCompatibleName(sField.getName())), + "Structs do not match: field %s != %s", + field.getName(), + sField.getName()); results.add(visitField(sField, field, visitor)); } @@ -193,5 +215,4 @@ protected String[] path(String name) { list.add(name); return list.toArray(new String[0]); } - } diff --git 
a/flink/v1.14/flink/src/main/java/org/apache/iceberg/flink/data/RowDataProjection.java b/flink/v1.14/flink/src/main/java/org/apache/iceberg/flink/data/RowDataProjection.java index 6334a00fd0d7..e41bae686d1e 100644 --- a/flink/v1.14/flink/src/main/java/org/apache/iceberg/flink/data/RowDataProjection.java +++ b/flink/v1.14/flink/src/main/java/org/apache/iceberg/flink/data/RowDataProjection.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.flink.data; import java.util.Map; @@ -38,35 +37,40 @@ public class RowDataProjection implements RowData { /** * Creates a projecting wrapper for {@link RowData} rows. - *
<p>
- * This projection will not project the nested children types of repeated types like lists and maps. + * + *
<p>
This projection will not project the nested children types of repeated types like lists and + * maps. * * @param schema schema of rows wrapped by this projection * @param projectedSchema result schema of the projected rows * @return a wrapper to project rows */ public static RowDataProjection create(Schema schema, Schema projectedSchema) { - return RowDataProjection.create(FlinkSchemaUtil.convert(schema), schema.asStruct(), projectedSchema.asStruct()); + return RowDataProjection.create( + FlinkSchemaUtil.convert(schema), schema.asStruct(), projectedSchema.asStruct()); } /** * Creates a projecting wrapper for {@link RowData} rows. - *
<p>
- * This projection will not project the nested children types of repeated types like lists and maps. + * + *
<p>
This projection will not project the nested children types of repeated types like lists and + * maps. * * @param rowType flink row type of rows wrapped by this projection * @param schema schema of rows wrapped by this projection * @param projectedSchema result schema of the projected rows * @return a wrapper to project rows */ - public static RowDataProjection create(RowType rowType, Types.StructType schema, Types.StructType projectedSchema) { + public static RowDataProjection create( + RowType rowType, Types.StructType schema, Types.StructType projectedSchema) { return new RowDataProjection(rowType, schema, projectedSchema); } private final RowData.FieldGetter[] getters; private RowData rowData; - private RowDataProjection(RowType rowType, Types.StructType rowStruct, Types.StructType projectType) { + private RowDataProjection( + RowType rowType, Types.StructType rowStruct, Types.StructType projectType) { Map fieldIdToPosition = Maps.newHashMap(); for (int i = 0; i < rowStruct.fields().size(); i++) { fieldIdToPosition.put(rowStruct.fields().get(i).fieldId(), i); @@ -77,27 +81,34 @@ private RowDataProjection(RowType rowType, Types.StructType rowStruct, Types.Str Types.NestedField projectField = projectType.fields().get(i); Types.NestedField rowField = rowStruct.field(projectField.fieldId()); - Preconditions.checkNotNull(rowField, - "Cannot locate the project field <%s> in the iceberg struct <%s>", projectField, rowStruct); + Preconditions.checkNotNull( + rowField, + "Cannot locate the project field <%s> in the iceberg struct <%s>", + projectField, + rowStruct); - getters[i] = createFieldGetter(rowType, fieldIdToPosition.get(projectField.fieldId()), rowField, projectField); + getters[i] = + createFieldGetter( + rowType, fieldIdToPosition.get(projectField.fieldId()), rowField, projectField); } } - private static RowData.FieldGetter createFieldGetter(RowType rowType, - int position, - Types.NestedField rowField, - Types.NestedField projectField) { - Preconditions.checkArgument(rowField.type().typeId() == projectField.type().typeId(), - "Different iceberg type between row field <%s> and project field <%s>", rowField, projectField); + private static RowData.FieldGetter createFieldGetter( + RowType rowType, int position, Types.NestedField rowField, Types.NestedField projectField) { + Preconditions.checkArgument( + rowField.type().typeId() == projectField.type().typeId(), + "Different iceberg type between row field <%s> and project field <%s>", + rowField, + projectField); switch (projectField.type().typeId()) { case STRUCT: RowType nestedRowType = (RowType) rowType.getTypeAt(position); return row -> { - RowData nestedRow = row.isNullAt(position) ? null : row.getRow(position, nestedRowType.getFieldCount()); - return RowDataProjection - .create(nestedRowType, rowField.type().asStructType(), projectField.type().asStructType()) + RowData nestedRow = + row.isNullAt(position) ? 
null : row.getRow(position, nestedRowType.getFieldCount()); + return RowDataProjection.create( + nestedRowType, rowField.type().asStructType(), projectField.type().asStructType()) .wrap(nestedRow); }; @@ -105,13 +116,17 @@ private static RowData.FieldGetter createFieldGetter(RowType rowType, Types.MapType projectedMap = projectField.type().asMapType(); Types.MapType originalMap = rowField.type().asMapType(); - boolean keyProjectable = !projectedMap.keyType().isNestedType() || - projectedMap.keyType().equals(originalMap.keyType()); - boolean valueProjectable = !projectedMap.valueType().isNestedType() || - projectedMap.valueType().equals(originalMap.valueType()); - Preconditions.checkArgument(keyProjectable && valueProjectable, + boolean keyProjectable = + !projectedMap.keyType().isNestedType() + || projectedMap.keyType().equals(originalMap.keyType()); + boolean valueProjectable = + !projectedMap.valueType().isNestedType() + || projectedMap.valueType().equals(originalMap.valueType()); + Preconditions.checkArgument( + keyProjectable && valueProjectable, "Cannot project a partial map key or value with non-primitive type. Trying to project <%s> out of <%s>", - projectField, rowField); + projectField, + rowField); return RowData.createFieldGetter(rowType.getTypeAt(position), position); @@ -119,11 +134,14 @@ private static RowData.FieldGetter createFieldGetter(RowType rowType, Types.ListType projectedList = projectField.type().asListType(); Types.ListType originalList = rowField.type().asListType(); - boolean elementProjectable = !projectedList.elementType().isNestedType() || - projectedList.elementType().equals(originalList.elementType()); - Preconditions.checkArgument(elementProjectable, + boolean elementProjectable = + !projectedList.elementType().isNestedType() + || projectedList.elementType().equals(originalList.elementType()); + Preconditions.checkArgument( + elementProjectable, "Cannot project a partial list element with non-primitive type. Trying to project <%s> out of <%s>", - projectField, rowField); + projectField, + rowField); return RowData.createFieldGetter(rowType.getTypeAt(position), position); diff --git a/flink/v1.14/flink/src/main/java/org/apache/iceberg/flink/data/RowDataUtil.java b/flink/v1.14/flink/src/main/java/org/apache/iceberg/flink/data/RowDataUtil.java index 931880fc360c..c5cb51b7eae4 100644 --- a/flink/v1.14/flink/src/main/java/org/apache/iceberg/flink/data/RowDataUtil.java +++ b/flink/v1.14/flink/src/main/java/org/apache/iceberg/flink/data/RowDataUtil.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.flink.data; import java.math.BigDecimal; @@ -38,9 +37,7 @@ public class RowDataUtil { - private RowDataUtil() { - - } + private RowDataUtil() {} public static Object convertConstant(Type type, Object value) { if (value == null) { @@ -76,12 +73,13 @@ public static Object convertConstant(Type type, Object value) { } /** - * Similar to the private {@link RowDataSerializer#copyRowData(RowData, RowData)} method. - * This skips the check the arity of rowType and from, - * because the from RowData may contains additional column for position deletes. - * Using {@link RowDataSerializer#copy(RowData, RowData)} will fail the arity check. + * Similar to the private {@link RowDataSerializer#copyRowData(RowData, RowData)} method. This + * skips the check the arity of rowType and from, because the from RowData may contains additional + * column for position deletes. 
Using {@link RowDataSerializer#copy(RowData, RowData)} will fail + * the arity check. */ - public static RowData clone(RowData from, RowData reuse, RowType rowType, TypeSerializer[] fieldSerializers) { + public static RowData clone( + RowData from, RowData reuse, RowType rowType, TypeSerializer[] fieldSerializers) { GenericRowData ret; if (reuse instanceof GenericRowData) { ret = (GenericRowData) reuse; @@ -99,5 +97,4 @@ public static RowData clone(RowData from, RowData reuse, RowType rowType, TypeSe } return ret; } - } diff --git a/flink/v1.14/flink/src/main/java/org/apache/iceberg/flink/sink/BaseDeltaTaskWriter.java b/flink/v1.14/flink/src/main/java/org/apache/iceberg/flink/sink/BaseDeltaTaskWriter.java index 3bff219d6e6e..b8786f259a9c 100644 --- a/flink/v1.14/flink/src/main/java/org/apache/iceberg/flink/sink/BaseDeltaTaskWriter.java +++ b/flink/v1.14/flink/src/main/java/org/apache/iceberg/flink/sink/BaseDeltaTaskWriter.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.flink.sink; import java.io.IOException; @@ -47,21 +46,23 @@ abstract class BaseDeltaTaskWriter extends BaseTaskWriter { private final RowDataProjection keyProjection; private final boolean upsert; - BaseDeltaTaskWriter(PartitionSpec spec, - FileFormat format, - FileAppenderFactory appenderFactory, - OutputFileFactory fileFactory, - FileIO io, - long targetFileSize, - Schema schema, - RowType flinkSchema, - List equalityFieldIds, - boolean upsert) { + BaseDeltaTaskWriter( + PartitionSpec spec, + FileFormat format, + FileAppenderFactory appenderFactory, + OutputFileFactory fileFactory, + FileIO io, + long targetFileSize, + Schema schema, + RowType flinkSchema, + List equalityFieldIds, + boolean upsert) { super(spec, format, appenderFactory, fileFactory, io, targetFileSize); this.schema = schema; this.deleteSchema = TypeUtil.select(schema, Sets.newHashSet(equalityFieldIds)); this.wrapper = new RowDataWrapper(flinkSchema, schema.asStruct()); - this.keyWrapper = new RowDataWrapper(FlinkSchemaUtil.convert(deleteSchema), deleteSchema.asStruct()); + this.keyWrapper = + new RowDataWrapper(FlinkSchemaUtil.convert(deleteSchema), deleteSchema.asStruct()); this.keyProjection = RowDataProjection.create(schema, deleteSchema); this.upsert = upsert; } @@ -87,7 +88,8 @@ public void write(RowData row) throws IOException { case UPDATE_BEFORE: if (upsert) { - break; // UPDATE_BEFORE is not necessary for UPSERT, we do nothing to prevent delete one row twice + break; // UPDATE_BEFORE is not necessary for UPSERT, we do nothing to prevent delete one + // row twice } writer.delete(row); break; diff --git a/flink/v1.14/flink/src/main/java/org/apache/iceberg/flink/sink/DeltaManifests.java b/flink/v1.14/flink/src/main/java/org/apache/iceberg/flink/sink/DeltaManifests.java index 866b785d7e1e..036970c06d5b 100644 --- a/flink/v1.14/flink/src/main/java/org/apache/iceberg/flink/sink/DeltaManifests.java +++ b/flink/v1.14/flink/src/main/java/org/apache/iceberg/flink/sink/DeltaManifests.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
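Illustrative sketch, not part of the patch: the hunks above show RowDataProjection.create(schema, deleteSchema) being used by BaseDeltaTaskWriter to derive an equality-delete key from a full row via keyProjection. A minimal, self-contained example of that call follows; the two-column schema, the choice of "id" as the equality field, and the row values are assumptions made only for illustration.

import org.apache.flink.table.data.GenericRowData;
import org.apache.flink.table.data.RowData;
import org.apache.flink.table.data.StringData;
import org.apache.iceberg.Schema;
import org.apache.iceberg.flink.data.RowDataProjection;
import org.apache.iceberg.types.Types;

public class RowDataProjectionSketch {
  public static void main(String[] args) {
    // Hypothetical table schema: id (required long) and data (optional string).
    Schema schema =
        new Schema(
            Types.NestedField.required(1, "id", Types.LongType.get()),
            Types.NestedField.optional(2, "data", Types.StringType.get()));

    // Keep only the equality field, as BaseDeltaTaskWriter does for its key projection.
    Schema keySchema = schema.select("id");

    RowData row = GenericRowData.of(42L, StringData.fromString("a"));
    RowData key = RowDataProjection.create(schema, keySchema).wrap(row);

    // The wrapper exposes only the projected column.
    System.out.println(key.getLong(0)); // 42
  }
}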
*/ - package org.apache.iceberg.flink.sink; import java.util.List; @@ -36,7 +35,8 @@ class DeltaManifests { this(dataManifest, deleteManifest, EMPTY_REF_DATA_FILES); } - DeltaManifests(ManifestFile dataManifest, ManifestFile deleteManifest, CharSequence[] referencedDataFiles) { + DeltaManifests( + ManifestFile dataManifest, ManifestFile deleteManifest, CharSequence[] referencedDataFiles) { Preconditions.checkNotNull(referencedDataFiles, "Referenced data files shouldn't be null."); this.dataManifest = dataManifest; diff --git a/flink/v1.14/flink/src/main/java/org/apache/iceberg/flink/sink/DeltaManifestsSerializer.java b/flink/v1.14/flink/src/main/java/org/apache/iceberg/flink/sink/DeltaManifestsSerializer.java index 859f97940116..c4d6e713bb73 100644 --- a/flink/v1.14/flink/src/main/java/org/apache/iceberg/flink/sink/DeltaManifestsSerializer.java +++ b/flink/v1.14/flink/src/main/java/org/apache/iceberg/flink/sink/DeltaManifestsSerializer.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.flink.sink; import java.io.ByteArrayInputStream; @@ -43,7 +42,8 @@ public int getVersion() { @Override public byte[] serialize(DeltaManifests deltaManifests) throws IOException { - Preconditions.checkNotNull(deltaManifests, "DeltaManifests to be serialized should not be null"); + Preconditions.checkNotNull( + deltaManifests, "DeltaManifests to be serialized should not be null"); ByteArrayOutputStream binaryOut = new ByteArrayOutputStream(); DataOutputStream out = new DataOutputStream(binaryOut); diff --git a/flink/v1.14/flink/src/main/java/org/apache/iceberg/flink/sink/EqualityFieldKeySelector.java b/flink/v1.14/flink/src/main/java/org/apache/iceberg/flink/sink/EqualityFieldKeySelector.java index 56689567a1d2..18b269d6c3e9 100644 --- a/flink/v1.14/flink/src/main/java/org/apache/iceberg/flink/sink/EqualityFieldKeySelector.java +++ b/flink/v1.14/flink/src/main/java/org/apache/iceberg/flink/sink/EqualityFieldKeySelector.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.flink.sink; import java.util.List; @@ -31,8 +30,8 @@ import org.apache.iceberg.util.StructProjection; /** - * Create a {@link KeySelector} to shuffle by equality fields, to ensure same equality fields record will be emitted to - * same writer in order. + * Create a {@link KeySelector} to shuffle by equality fields, to ensure same equality fields record + * will be emitted to same writer in order. */ class EqualityFieldKeySelector implements KeySelector { @@ -51,8 +50,8 @@ class EqualityFieldKeySelector implements KeySelector { } /** - * Construct the {@link RowDataWrapper} lazily here because few members in it are not serializable. In this way, we - * don't have to serialize them with forcing. + * Construct the {@link RowDataWrapper} lazily here because few members in it are not + * serializable. In this way, we don't have to serialize them with forcing. */ protected RowDataWrapper lazyRowDataWrapper() { if (rowDataWrapper == null) { @@ -61,9 +60,7 @@ protected RowDataWrapper lazyRowDataWrapper() { return rowDataWrapper; } - /** - * Construct the {@link StructProjection} lazily because it is not serializable. - */ + /** Construct the {@link StructProjection} lazily because it is not serializable. 
*/ protected StructProjection lazyStructProjection() { if (structProjection == null) { structProjection = StructProjection.create(schema, deleteSchema); @@ -71,9 +68,7 @@ protected StructProjection lazyStructProjection() { return structProjection; } - /** - * Construct the {@link StructLikeWrapper} lazily because it is not serializable. - */ + /** Construct the {@link StructLikeWrapper} lazily because it is not serializable. */ protected StructLikeWrapper lazyStructLikeWrapper() { if (structLikeWrapper == null) { structLikeWrapper = StructLikeWrapper.forType(deleteSchema.asStruct()); diff --git a/flink/v1.14/flink/src/main/java/org/apache/iceberg/flink/sink/FlinkAppenderFactory.java b/flink/v1.14/flink/src/main/java/org/apache/iceberg/flink/sink/FlinkAppenderFactory.java index ade5c28837ec..b5d08b46be58 100644 --- a/flink/v1.14/flink/src/main/java/org/apache/iceberg/flink/sink/FlinkAppenderFactory.java +++ b/flink/v1.14/flink/src/main/java/org/apache/iceberg/flink/sink/FlinkAppenderFactory.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.flink.sink; import java.io.IOException; @@ -60,13 +59,19 @@ public class FlinkAppenderFactory implements FileAppenderFactory, Seria private RowType eqDeleteFlinkSchema = null; private RowType posDeleteFlinkSchema = null; - public FlinkAppenderFactory(Schema schema, RowType flinkSchema, Map props, PartitionSpec spec) { + public FlinkAppenderFactory( + Schema schema, RowType flinkSchema, Map props, PartitionSpec spec) { this(schema, flinkSchema, props, spec, null, null, null); } - public FlinkAppenderFactory(Schema schema, RowType flinkSchema, Map props, - PartitionSpec spec, int[] equalityFieldIds, - Schema eqDeleteRowSchema, Schema posDeleteRowSchema) { + public FlinkAppenderFactory( + Schema schema, + RowType flinkSchema, + Map props, + PartitionSpec spec, + int[] equalityFieldIds, + Schema eqDeleteRowSchema, + Schema posDeleteRowSchema) { this.schema = schema; this.flinkSchema = flinkSchema; this.props = props; @@ -108,7 +113,8 @@ public FileAppender newAppender(OutputFile outputFile, FileFormat forma case ORC: return ORC.write(outputFile) - .createWriterFunc((iSchema, typDesc) -> FlinkOrcWriter.buildWriter(flinkSchema, iSchema)) + .createWriterFunc( + (iSchema, typDesc) -> FlinkOrcWriter.buildWriter(flinkSchema, iSchema)) .setAll(props) .metricsConfig(metricsConfig) .schema(schema) @@ -133,18 +139,25 @@ public FileAppender newAppender(OutputFile outputFile, FileFormat forma } @Override - public DataWriter newDataWriter(EncryptedOutputFile file, FileFormat format, StructLike partition) { + public DataWriter newDataWriter( + EncryptedOutputFile file, FileFormat format, StructLike partition) { return new DataWriter<>( - newAppender(file.encryptingOutputFile(), format), format, - file.encryptingOutputFile().location(), spec, partition, file.keyMetadata()); + newAppender(file.encryptingOutputFile(), format), + format, + file.encryptingOutputFile().location(), + spec, + partition, + file.keyMetadata()); } @Override - public EqualityDeleteWriter newEqDeleteWriter(EncryptedOutputFile outputFile, FileFormat format, - StructLike partition) { - Preconditions.checkState(equalityFieldIds != null && equalityFieldIds.length > 0, + public EqualityDeleteWriter newEqDeleteWriter( + EncryptedOutputFile outputFile, FileFormat format, StructLike partition) { + Preconditions.checkState( + equalityFieldIds != null && equalityFieldIds.length > 0, "Equality field ids shouldn't be null or empty 
when creating equality-delete writer"); - Preconditions.checkNotNull(eqDeleteRowSchema, + Preconditions.checkNotNull( + eqDeleteRowSchema, "Equality delete row schema shouldn't be null when creating equality-delete writer"); MetricsConfig metricsConfig = MetricsConfig.fromProperties(props); @@ -164,7 +177,8 @@ public EqualityDeleteWriter newEqDeleteWriter(EncryptedOutputFile outpu case ORC: return ORC.writeDeletes(outputFile.encryptingOutputFile()) - .createWriterFunc((iSchema, typDesc) -> FlinkOrcWriter.buildWriter(flinkSchema, iSchema)) + .createWriterFunc( + (iSchema, typDesc) -> FlinkOrcWriter.buildWriter(flinkSchema, iSchema)) .withPartition(partition) .overwrite() .setAll(props) @@ -176,7 +190,8 @@ public EqualityDeleteWriter newEqDeleteWriter(EncryptedOutputFile outpu case PARQUET: return Parquet.writeDeletes(outputFile.encryptingOutputFile()) - .createWriterFunc(msgType -> FlinkParquetWriters.buildWriter(lazyEqDeleteFlinkSchema(), msgType)) + .createWriterFunc( + msgType -> FlinkParquetWriters.buildWriter(lazyEqDeleteFlinkSchema(), msgType)) .withPartition(partition) .overwrite() .setAll(props) @@ -197,8 +212,8 @@ public EqualityDeleteWriter newEqDeleteWriter(EncryptedOutputFile outpu } @Override - public PositionDeleteWriter newPosDeleteWriter(EncryptedOutputFile outputFile, FileFormat format, - StructLike partition) { + public PositionDeleteWriter newPosDeleteWriter( + EncryptedOutputFile outputFile, FileFormat format, StructLike partition) { MetricsConfig metricsConfig = MetricsConfig.fromProperties(props); try { switch (format) { @@ -214,9 +229,11 @@ public PositionDeleteWriter newPosDeleteWriter(EncryptedOutputFile outp .buildPositionWriter(); case ORC: - RowType orcPosDeleteSchema = FlinkSchemaUtil.convert(DeleteSchemaUtil.posDeleteSchema(posDeleteRowSchema)); + RowType orcPosDeleteSchema = + FlinkSchemaUtil.convert(DeleteSchemaUtil.posDeleteSchema(posDeleteRowSchema)); return ORC.writeDeletes(outputFile.encryptingOutputFile()) - .createWriterFunc((iSchema, typDesc) -> FlinkOrcWriter.buildWriter(orcPosDeleteSchema, iSchema)) + .createWriterFunc( + (iSchema, typDesc) -> FlinkOrcWriter.buildWriter(orcPosDeleteSchema, iSchema)) .withPartition(partition) .overwrite() .setAll(props) @@ -228,9 +245,11 @@ public PositionDeleteWriter newPosDeleteWriter(EncryptedOutputFile outp .buildPositionWriter(); case PARQUET: - RowType flinkPosDeleteSchema = FlinkSchemaUtil.convert(DeleteSchemaUtil.posDeleteSchema(posDeleteRowSchema)); + RowType flinkPosDeleteSchema = + FlinkSchemaUtil.convert(DeleteSchemaUtil.posDeleteSchema(posDeleteRowSchema)); return Parquet.writeDeletes(outputFile.encryptingOutputFile()) - .createWriterFunc(msgType -> FlinkParquetWriters.buildWriter(flinkPosDeleteSchema, msgType)) + .createWriterFunc( + msgType -> FlinkParquetWriters.buildWriter(flinkPosDeleteSchema, msgType)) .withPartition(partition) .overwrite() .setAll(props) @@ -242,7 +261,8 @@ public PositionDeleteWriter newPosDeleteWriter(EncryptedOutputFile outp .buildPositionWriter(); default: - throw new UnsupportedOperationException("Cannot write pos-deletes for unsupported file format: " + format); + throw new UnsupportedOperationException( + "Cannot write pos-deletes for unsupported file format: " + format); } } catch (IOException e) { throw new UncheckedIOException(e); diff --git a/flink/v1.14/flink/src/main/java/org/apache/iceberg/flink/sink/FlinkFileWriterFactory.java b/flink/v1.14/flink/src/main/java/org/apache/iceberg/flink/sink/FlinkFileWriterFactory.java index 55a9539c78d1..5872fed36d65 100644 --- 
a/flink/v1.14/flink/src/main/java/org/apache/iceberg/flink/sink/FlinkFileWriterFactory.java +++ b/flink/v1.14/flink/src/main/java/org/apache/iceberg/flink/sink/FlinkFileWriterFactory.java @@ -16,9 +16,13 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.flink.sink; +import static org.apache.iceberg.MetadataColumns.DELETE_FILE_ROW_FIELD_NAME; +import static org.apache.iceberg.TableProperties.DEFAULT_FILE_FORMAT; +import static org.apache.iceberg.TableProperties.DEFAULT_FILE_FORMAT_DEFAULT; +import static org.apache.iceberg.TableProperties.DELETE_DEFAULT_FILE_FORMAT; + import java.io.Serializable; import java.util.Locale; import java.util.Map; @@ -40,24 +44,35 @@ import org.apache.iceberg.parquet.Parquet; import org.apache.iceberg.relocated.com.google.common.base.Preconditions; -import static org.apache.iceberg.MetadataColumns.DELETE_FILE_ROW_FIELD_NAME; -import static org.apache.iceberg.TableProperties.DEFAULT_FILE_FORMAT; -import static org.apache.iceberg.TableProperties.DEFAULT_FILE_FORMAT_DEFAULT; -import static org.apache.iceberg.TableProperties.DELETE_DEFAULT_FILE_FORMAT; - class FlinkFileWriterFactory extends BaseFileWriterFactory implements Serializable { private RowType dataFlinkType; private RowType equalityDeleteFlinkType; private RowType positionDeleteFlinkType; - FlinkFileWriterFactory(Table table, FileFormat dataFileFormat, Schema dataSchema, RowType dataFlinkType, - SortOrder dataSortOrder, FileFormat deleteFileFormat, - int[] equalityFieldIds, Schema equalityDeleteRowSchema, RowType equalityDeleteFlinkType, - SortOrder equalityDeleteSortOrder, Schema positionDeleteRowSchema, - RowType positionDeleteFlinkType) { - - super(table, dataFileFormat, dataSchema, dataSortOrder, deleteFileFormat, equalityFieldIds, - equalityDeleteRowSchema, equalityDeleteSortOrder, positionDeleteRowSchema); + FlinkFileWriterFactory( + Table table, + FileFormat dataFileFormat, + Schema dataSchema, + RowType dataFlinkType, + SortOrder dataSortOrder, + FileFormat deleteFileFormat, + int[] equalityFieldIds, + Schema equalityDeleteRowSchema, + RowType equalityDeleteFlinkType, + SortOrder equalityDeleteSortOrder, + Schema positionDeleteRowSchema, + RowType positionDeleteFlinkType) { + + super( + table, + dataFileFormat, + dataSchema, + dataSortOrder, + deleteFileFormat, + equalityFieldIds, + equalityDeleteRowSchema, + equalityDeleteSortOrder, + positionDeleteRowSchema); this.dataFlinkType = dataFlinkType; this.equalityDeleteFlinkType = equalityDeleteFlinkType; @@ -83,7 +98,8 @@ protected void configurePositionDelete(Avro.DeleteWriteBuilder builder) { int rowFieldIndex = positionDeleteFlinkType().getFieldIndex(DELETE_FILE_ROW_FIELD_NAME); if (rowFieldIndex >= 0) { // FlinkAvroWriter accepts just the Flink type of the row ignoring the path and pos - RowType positionDeleteRowFlinkType = (RowType) positionDeleteFlinkType().getTypeAt(rowFieldIndex); + RowType positionDeleteRowFlinkType = + (RowType) positionDeleteFlinkType().getTypeAt(rowFieldIndex); builder.createWriterFunc(ignored -> new FlinkAvroWriter(positionDeleteRowFlinkType)); } } @@ -95,28 +111,33 @@ protected void configureDataWrite(Parquet.DataWriteBuilder builder) { @Override protected void configureEqualityDelete(Parquet.DeleteWriteBuilder builder) { - builder.createWriterFunc(msgType -> FlinkParquetWriters.buildWriter(equalityDeleteFlinkType(), msgType)); + builder.createWriterFunc( + msgType -> FlinkParquetWriters.buildWriter(equalityDeleteFlinkType(), msgType)); } @Override protected 
void configurePositionDelete(Parquet.DeleteWriteBuilder builder) { - builder.createWriterFunc(msgType -> FlinkParquetWriters.buildWriter(positionDeleteFlinkType(), msgType)); + builder.createWriterFunc( + msgType -> FlinkParquetWriters.buildWriter(positionDeleteFlinkType(), msgType)); builder.transformPaths(path -> StringData.fromString(path.toString())); } @Override protected void configureDataWrite(ORC.DataWriteBuilder builder) { - builder.createWriterFunc((iSchema, typDesc) -> FlinkOrcWriter.buildWriter(dataFlinkType(), iSchema)); + builder.createWriterFunc( + (iSchema, typDesc) -> FlinkOrcWriter.buildWriter(dataFlinkType(), iSchema)); } @Override protected void configureEqualityDelete(ORC.DeleteWriteBuilder builder) { - builder.createWriterFunc((iSchema, typDesc) -> FlinkOrcWriter.buildWriter(equalityDeleteFlinkType(), iSchema)); + builder.createWriterFunc( + (iSchema, typDesc) -> FlinkOrcWriter.buildWriter(equalityDeleteFlinkType(), iSchema)); } @Override protected void configurePositionDelete(ORC.DeleteWriteBuilder builder) { - builder.createWriterFunc((iSchema, typDesc) -> FlinkOrcWriter.buildWriter(positionDeleteFlinkType(), iSchema)); + builder.createWriterFunc( + (iSchema, typDesc) -> FlinkOrcWriter.buildWriter(positionDeleteFlinkType(), iSchema)); builder.transformPaths(path -> StringData.fromString(path.toString())); } @@ -131,7 +152,8 @@ private RowType dataFlinkType() { private RowType equalityDeleteFlinkType() { if (equalityDeleteFlinkType == null) { - Preconditions.checkNotNull(equalityDeleteRowSchema(), "Equality delete schema must not be null"); + Preconditions.checkNotNull( + equalityDeleteRowSchema(), "Equality delete schema must not be null"); this.equalityDeleteFlinkType = FlinkSchemaUtil.convert(equalityDeleteRowSchema()); } @@ -140,7 +162,8 @@ private RowType equalityDeleteFlinkType() { private RowType positionDeleteFlinkType() { if (positionDeleteFlinkType == null) { - // wrap the optional row schema into the position delete schema that contains path and position + // wrap the optional row schema into the position delete schema that contains path and + // position Schema positionDeleteSchema = DeleteSchemaUtil.posDeleteSchema(positionDeleteRowSchema()); this.positionDeleteFlinkType = FlinkSchemaUtil.convert(positionDeleteSchema); } @@ -167,10 +190,12 @@ static class Builder { Map properties = table.properties(); - String dataFileFormatName = properties.getOrDefault(DEFAULT_FILE_FORMAT, DEFAULT_FILE_FORMAT_DEFAULT); + String dataFileFormatName = + properties.getOrDefault(DEFAULT_FILE_FORMAT, DEFAULT_FILE_FORMAT_DEFAULT); this.dataFileFormat = FileFormat.valueOf(dataFileFormatName.toUpperCase(Locale.ENGLISH)); - String deleteFileFormatName = properties.getOrDefault(DELETE_DEFAULT_FILE_FORMAT, dataFileFormatName); + String deleteFileFormatName = + properties.getOrDefault(DELETE_DEFAULT_FILE_FORMAT, dataFileFormatName); this.deleteFileFormat = FileFormat.valueOf(deleteFileFormatName.toUpperCase(Locale.ENGLISH)); } @@ -186,8 +211,8 @@ Builder dataSchema(Schema newDataSchema) { /** * Sets a Flink type for data. - *
<p>
- * If not set, the value is derived from the provided Iceberg schema. + * + *
<p>
If not set, the value is derived from the provided Iceberg schema. */ Builder dataFlinkType(RowType newDataFlinkType) { this.dataFlinkType = newDataFlinkType; @@ -216,8 +241,8 @@ Builder equalityDeleteRowSchema(Schema newEqualityDeleteRowSchema) { /** * Sets a Flink type for equality deletes. - *
<p>
- * If not set, the value is derived from the provided Iceberg schema. + * + *
<p>
If not set, the value is derived from the provided Iceberg schema. */ Builder equalityDeleteFlinkType(RowType newEqualityDeleteFlinkType) { this.equalityDeleteFlinkType = newEqualityDeleteFlinkType; @@ -236,8 +261,8 @@ Builder positionDeleteRowSchema(Schema newPositionDeleteRowSchema) { /** * Sets a Flink type for position deletes. - *
<p>
- * If not set, the value is derived from the provided Iceberg schema. + * + *
<p>
If not set, the value is derived from the provided Iceberg schema. */ Builder positionDeleteFlinkType(RowType newPositionDeleteFlinkType) { this.positionDeleteFlinkType = newPositionDeleteFlinkType; @@ -247,13 +272,23 @@ Builder positionDeleteFlinkType(RowType newPositionDeleteFlinkType) { FlinkFileWriterFactory build() { boolean noEqualityDeleteConf = equalityFieldIds == null && equalityDeleteRowSchema == null; boolean fullEqualityDeleteConf = equalityFieldIds != null && equalityDeleteRowSchema != null; - Preconditions.checkArgument(noEqualityDeleteConf || fullEqualityDeleteConf, + Preconditions.checkArgument( + noEqualityDeleteConf || fullEqualityDeleteConf, "Equality field IDs and equality delete row schema must be set together"); return new FlinkFileWriterFactory( - table, dataFileFormat, dataSchema, dataFlinkType, dataSortOrder, deleteFileFormat, - equalityFieldIds, equalityDeleteRowSchema, equalityDeleteFlinkType, equalityDeleteSortOrder, - positionDeleteRowSchema, positionDeleteFlinkType); + table, + dataFileFormat, + dataSchema, + dataFlinkType, + dataSortOrder, + deleteFileFormat, + equalityFieldIds, + equalityDeleteRowSchema, + equalityDeleteFlinkType, + equalityDeleteSortOrder, + positionDeleteRowSchema, + positionDeleteFlinkType); } } } diff --git a/flink/v1.14/flink/src/main/java/org/apache/iceberg/flink/sink/FlinkManifestUtil.java b/flink/v1.14/flink/src/main/java/org/apache/iceberg/flink/sink/FlinkManifestUtil.java index d20859377ffc..25badc372abf 100644 --- a/flink/v1.14/flink/src/main/java/org/apache/iceberg/flink/sink/FlinkManifestUtil.java +++ b/flink/v1.14/flink/src/main/java/org/apache/iceberg/flink/sink/FlinkManifestUtil.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.flink.sink; import java.io.IOException; @@ -41,12 +40,12 @@ class FlinkManifestUtil { private static final int FORMAT_V2 = 2; private static final Long DUMMY_SNAPSHOT_ID = 0L; - private FlinkManifestUtil() { - } + private FlinkManifestUtil() {} - static ManifestFile writeDataFiles(OutputFile outputFile, PartitionSpec spec, List dataFiles) - throws IOException { - ManifestWriter writer = ManifestFiles.write(FORMAT_V2, spec, outputFile, DUMMY_SNAPSHOT_ID); + static ManifestFile writeDataFiles( + OutputFile outputFile, PartitionSpec spec, List dataFiles) throws IOException { + ManifestWriter writer = + ManifestFiles.write(FORMAT_V2, spec, outputFile, DUMMY_SNAPSHOT_ID); try (ManifestWriter closeableWriter = writer) { closeableWriter.addAll(dataFiles); @@ -61,31 +60,38 @@ static List readDataFiles(ManifestFile manifestFile, FileIO io) throws } } - static ManifestOutputFileFactory createOutputFileFactory(Table table, String flinkJobId, String operatorUniqueId, - int subTaskId, long attemptNumber) { + static ManifestOutputFileFactory createOutputFileFactory( + Table table, String flinkJobId, String operatorUniqueId, int subTaskId, long attemptNumber) { TableOperations ops = ((HasTableOperations) table).operations(); - return new ManifestOutputFileFactory(ops, table.io(), table.properties(), flinkJobId, operatorUniqueId, - subTaskId, attemptNumber); + return new ManifestOutputFileFactory( + ops, + table.io(), + table.properties(), + flinkJobId, + operatorUniqueId, + subTaskId, + attemptNumber); } - static DeltaManifests writeCompletedFiles(WriteResult result, - Supplier outputFileSupplier, - PartitionSpec spec) throws IOException { + static DeltaManifests writeCompletedFiles( + WriteResult result, Supplier outputFileSupplier, 
PartitionSpec spec) + throws IOException { ManifestFile dataManifest = null; ManifestFile deleteManifest = null; // Write the completed data files into a newly created data manifest file. if (result.dataFiles() != null && result.dataFiles().length > 0) { - dataManifest = writeDataFiles(outputFileSupplier.get(), spec, Lists.newArrayList(result.dataFiles())); + dataManifest = + writeDataFiles(outputFileSupplier.get(), spec, Lists.newArrayList(result.dataFiles())); } // Write the completed delete files into a newly created delete manifest file. if (result.deleteFiles() != null && result.deleteFiles().length > 0) { OutputFile deleteManifestFile = outputFileSupplier.get(); - ManifestWriter deleteManifestWriter = ManifestFiles.writeDeleteManifest(FORMAT_V2, spec, - deleteManifestFile, DUMMY_SNAPSHOT_ID); + ManifestWriter deleteManifestWriter = + ManifestFiles.writeDeleteManifest(FORMAT_V2, spec, deleteManifestFile, DUMMY_SNAPSHOT_ID); try (ManifestWriter writer = deleteManifestWriter) { for (DeleteFile deleteFile : result.deleteFiles()) { writer.add(deleteFile); @@ -98,7 +104,8 @@ static DeltaManifests writeCompletedFiles(WriteResult result, return new DeltaManifests(dataManifest, deleteManifest, result.referencedDataFiles()); } - static WriteResult readCompletedFiles(DeltaManifests deltaManifests, FileIO io) throws IOException { + static WriteResult readCompletedFiles(DeltaManifests deltaManifests, FileIO io) + throws IOException { WriteResult.Builder builder = WriteResult.builder(); // Read the completed data files from persisted data manifest file. @@ -108,13 +115,12 @@ static WriteResult readCompletedFiles(DeltaManifests deltaManifests, FileIO io) // Read the completed delete files from persisted delete manifests file. if (deltaManifests.deleteManifest() != null) { - try (CloseableIterable deleteFiles = ManifestFiles - .readDeleteManifest(deltaManifests.deleteManifest(), io, null)) { + try (CloseableIterable deleteFiles = + ManifestFiles.readDeleteManifest(deltaManifests.deleteManifest(), io, null)) { builder.addDeleteFiles(deleteFiles); } } - return builder.addReferencedDataFiles(deltaManifests.referencedDataFiles()) - .build(); + return builder.addReferencedDataFiles(deltaManifests.referencedDataFiles()).build(); } } diff --git a/flink/v1.14/flink/src/main/java/org/apache/iceberg/flink/sink/FlinkSink.java b/flink/v1.14/flink/src/main/java/org/apache/iceberg/flink/sink/FlinkSink.java index a86d3ed884bb..b10dc278fb65 100644 --- a/flink/v1.14/flink/src/main/java/org/apache/iceberg/flink/sink/FlinkSink.java +++ b/flink/v1.14/flink/src/main/java/org/apache/iceberg/flink/sink/FlinkSink.java @@ -16,9 +16,10 @@ * specific language governing permissions and limitations * under the License. 
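Illustrative sketch, not part of the patch: the FlinkAppenderFactory constructor and newAppender(...) reformatted earlier in this section can be exercised against a local file roughly as shown below. The schema, the /tmp output path, and the row values are made up, and Files.localOutput is used only to keep the example self-contained.

import java.io.File;
import java.util.Collections;
import org.apache.flink.table.data.GenericRowData;
import org.apache.flink.table.data.RowData;
import org.apache.flink.table.data.StringData;
import org.apache.flink.table.types.logical.RowType;
import org.apache.iceberg.FileFormat;
import org.apache.iceberg.Files;
import org.apache.iceberg.PartitionSpec;
import org.apache.iceberg.Schema;
import org.apache.iceberg.flink.FlinkSchemaUtil;
import org.apache.iceberg.flink.sink.FlinkAppenderFactory;
import org.apache.iceberg.io.FileAppender;
import org.apache.iceberg.types.Types;

public class FlinkAppenderFactorySketch {
  public static void main(String[] args) throws Exception {
    // Hypothetical schema: id (required long) and data (optional string).
    Schema schema =
        new Schema(
            Types.NestedField.required(1, "id", Types.LongType.get()),
            Types.NestedField.optional(2, "data", Types.StringType.get()));
    RowType flinkType = FlinkSchemaUtil.convert(schema);

    FlinkAppenderFactory factory =
        new FlinkAppenderFactory(
            schema, flinkType, Collections.emptyMap(), PartitionSpec.unpartitioned());

    // Write one row to a local Parquet data file; closing the appender finalizes the file.
    try (FileAppender<RowData> appender =
        factory.newAppender(Files.localOutput(new File("/tmp/demo.parquet")), FileFormat.PARQUET)) {
      appender.add(GenericRowData.of(1L, StringData.fromString("a")));
    }
  }
}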
*/ - package org.apache.iceberg.flink.sink; +import static org.apache.iceberg.TableProperties.WRITE_DISTRIBUTION_MODE; + import java.io.IOException; import java.io.UncheckedIOException; import java.util.List; @@ -62,40 +63,39 @@ import org.slf4j.Logger; import org.slf4j.LoggerFactory; -import static org.apache.iceberg.TableProperties.WRITE_DISTRIBUTION_MODE; - public class FlinkSink { private static final Logger LOG = LoggerFactory.getLogger(FlinkSink.class); - private static final String ICEBERG_STREAM_WRITER_NAME = IcebergStreamWriter.class.getSimpleName(); - private static final String ICEBERG_FILES_COMMITTER_NAME = IcebergFilesCommitter.class.getSimpleName(); + private static final String ICEBERG_STREAM_WRITER_NAME = + IcebergStreamWriter.class.getSimpleName(); + private static final String ICEBERG_FILES_COMMITTER_NAME = + IcebergFilesCommitter.class.getSimpleName(); - private FlinkSink() { - } + private FlinkSink() {} /** - * Initialize a {@link Builder} to export the data from generic input data stream into iceberg table. We use - * {@link RowData} inside the sink connector, so users need to provide a mapper function and a - * {@link TypeInformation} to convert those generic records to a RowData DataStream. + * Initialize a {@link Builder} to export the data from generic input data stream into iceberg + * table. We use {@link RowData} inside the sink connector, so users need to provide a mapper + * function and a {@link TypeInformation} to convert those generic records to a RowData + * DataStream. * - * @param input the generic source input data stream. - * @param mapper function to convert the generic data to {@link RowData} + * @param input the generic source input data stream. + * @param mapper function to convert the generic data to {@link RowData} * @param outputType to define the {@link TypeInformation} for the input data. - * @param the data type of records. + * @param the data type of records. * @return {@link Builder} to connect the iceberg table. */ - public static Builder builderFor(DataStream input, - MapFunction mapper, - TypeInformation outputType) { + public static Builder builderFor( + DataStream input, MapFunction mapper, TypeInformation outputType) { return new Builder().forMapperOutputType(input, mapper, outputType); } /** - * Initialize a {@link Builder} to export the data from input data stream with {@link Row}s into iceberg table. We use - * {@link RowData} inside the sink connector, so users need to provide a {@link TableSchema} for builder to convert - * those {@link Row}s to a {@link RowData} DataStream. + * Initialize a {@link Builder} to export the data from input data stream with {@link Row}s into + * iceberg table. We use {@link RowData} inside the sink connector, so users need to provide a + * {@link TableSchema} for builder to convert those {@link Row}s to a {@link RowData} DataStream. * - * @param input the source input data stream with {@link Row}s. + * @param input the source input data stream with {@link Row}s. * @param tableSchema defines the {@link TypeInformation} for input data. * @return {@link Builder} to connect the iceberg table. 
*/ @@ -103,13 +103,15 @@ public static Builder forRow(DataStream input, TableSchema tableSchema) { RowType rowType = (RowType) tableSchema.toRowDataType().getLogicalType(); DataType[] fieldDataTypes = tableSchema.getFieldDataTypes(); - DataFormatConverters.RowConverter rowConverter = new DataFormatConverters.RowConverter(fieldDataTypes); + DataFormatConverters.RowConverter rowConverter = + new DataFormatConverters.RowConverter(fieldDataTypes); return builderFor(input, rowConverter::toInternal, FlinkCompatibilityUtil.toTypeInfo(rowType)) .tableSchema(tableSchema); } /** - * Initialize a {@link Builder} to export the data from input data stream with {@link RowData}s into iceberg table. + * Initialize a {@link Builder} to export the data from input data stream with {@link RowData}s + * into iceberg table. * * @param input the source input data stream with {@link RowData}s. * @return {@link Builder} to connect the iceberg table. @@ -134,34 +136,35 @@ public static class Builder { private final Map writeOptions = Maps.newHashMap(); private FlinkWriteConf flinkWriteConf = null; - private Builder() { - } + private Builder() {} private Builder forRowData(DataStream newRowDataInput) { this.inputCreator = ignored -> newRowDataInput; return this; } - private Builder forMapperOutputType(DataStream input, - MapFunction mapper, - TypeInformation outputType) { - this.inputCreator = newUidPrefix -> { - // Input stream order is crucial for some situation(e.g. in cdc case). Therefore, we need to set the parallelism - // of map operator same as its input to keep map operator chaining its input, and avoid rebalanced by default. - SingleOutputStreamOperator inputStream = input.map(mapper, outputType) - .setParallelism(input.getParallelism()); - if (newUidPrefix != null) { - inputStream.name(operatorName(newUidPrefix)).uid(newUidPrefix + "-mapper"); - } - return inputStream; - }; + private Builder forMapperOutputType( + DataStream input, MapFunction mapper, TypeInformation outputType) { + this.inputCreator = + newUidPrefix -> { + // Input stream order is crucial for some situation(e.g. in cdc case). Therefore, we + // need to set the parallelism + // of map operator same as its input to keep map operator chaining its input, and avoid + // rebalanced by default. + SingleOutputStreamOperator inputStream = + input.map(mapper, outputType).setParallelism(input.getParallelism()); + if (newUidPrefix != null) { + inputStream.name(operatorName(newUidPrefix)).uid(newUidPrefix + "-mapper"); + } + return inputStream; + }; return this; } /** - * This iceberg {@link Table} instance is used for initializing {@link IcebergStreamWriter} which will write all - * the records into {@link DataFile}s and emit them to downstream operator. Providing a table would avoid so many - * table loading from each separate task. + * This iceberg {@link Table} instance is used for initializing {@link IcebergStreamWriter} + * which will write all the records into {@link DataFile}s and emit them to downstream operator. + * Providing a table would avoid so many table loading from each separate task. * * @param newTable the loaded iceberg table instance. * @return {@link Builder} to connect the iceberg table. @@ -172,9 +175,9 @@ public Builder table(Table newTable) { } /** - * The table loader is used for loading tables in {@link IcebergFilesCommitter} lazily, we need this loader because - * {@link Table} is not serializable and could not just use the loaded table from Builder#table in the remote task - * manager. 
+ * The table loader is used for loading tables in {@link IcebergFilesCommitter} lazily, we need + * this loader because {@link Table} is not serializable and could not just use the loaded table + * from Builder#table in the remote task manager. * * @param newTableLoader to load iceberg table inside tasks. * @return {@link Builder} to connect the iceberg table. @@ -185,8 +188,8 @@ public Builder tableLoader(TableLoader newTableLoader) { } /** - * Set the write properties for Flink sink. - * View the supported properties in {@link FlinkWriteOptions} + * Set the write properties for Flink sink. View the supported properties in {@link + * FlinkWriteOptions} */ public Builder set(String property, String value) { writeOptions.put(property, value); @@ -194,8 +197,8 @@ public Builder set(String property, String value) { } /** - * Set the write properties for Flink sink. - * View the supported properties in {@link FlinkWriteOptions} + * Set the write properties for Flink sink. View the supported properties in {@link + * FlinkWriteOptions} */ public Builder setAll(Map properties) { writeOptions.putAll(properties); @@ -218,14 +221,15 @@ public Builder flinkConf(ReadableConfig config) { } /** - * Configure the write {@link DistributionMode} that the flink sink will use. Currently, flink support - * {@link DistributionMode#NONE} and {@link DistributionMode#HASH}. + * Configure the write {@link DistributionMode} that the flink sink will use. Currently, flink + * support {@link DistributionMode#NONE} and {@link DistributionMode#HASH}. * * @param mode to specify the write distribution mode. * @return {@link Builder} to connect the iceberg table. */ public Builder distributionMode(DistributionMode mode) { - Preconditions.checkArgument(!DistributionMode.RANGE.equals(mode), + Preconditions.checkArgument( + !DistributionMode.RANGE.equals(mode), "Flink does not support 'range' write distribution mode now."); if (mode != null) { writeOptions.put(FlinkWriteOptions.DISTRIBUTION_MODE.key(), mode.modeName()); @@ -245,10 +249,10 @@ public Builder writeParallelism(int newWriteParallelism) { } /** - * All INSERT/UPDATE_AFTER events from input stream will be transformed to UPSERT events, which means it will - * DELETE the old records and then INSERT the new records. In partitioned table, the partition fields should be - * a subset of equality fields, otherwise the old row that located in partition-A could not be deleted by the - * new row that located in partition-B. + * All INSERT/UPDATE_AFTER events from input stream will be transformed to UPSERT events, which + * means it will DELETE the old records and then INSERT the new records. In partitioned table, + * the partition fields should be a subset of equality fields, otherwise the old row that + * located in partition-A could not be deleted by the new row that located in partition-B. * * @param enabled indicate whether it should transform all INSERT/UPDATE_AFTER events to UPSERT. * @return {@link Builder} to connect the iceberg table. @@ -270,22 +274,25 @@ public Builder equalityFieldColumns(List columns) { } /** - * Set the uid prefix for FlinkSink operators. Note that FlinkSink internally consists of multiple operators (like - * writer, committer, dummy sink etc.) Actually operator uid will be appended with a suffix like "uidPrefix-writer". - *
<p>
- * If provided, this prefix is also applied to operator names. - *
<p>
- * Flink auto generates operator uid if not set explicitly. It is a recommended - * - * best-practice to set uid for all operators before deploying to production. Flink has an option to {@code - * pipeline.auto-generate-uid=false} to disable auto-generation and force explicit setting of all operator uid. - *
<p>
- * Be careful with setting this for an existing job, because now we are changing the operator uid from an - * auto-generated one to this new value. When deploying the change with a checkpoint, Flink won't be able to restore - * the previous Flink sink operator state (more specifically the committer operator state). You need to use {@code - * --allowNonRestoredState} to ignore the previous sink state. During restore Flink sink state is used to check if - * last commit was actually successful or not. {@code --allowNonRestoredState} can lead to data loss if the - * Iceberg commit failed in the last completed checkpoint. + * Set the uid prefix for FlinkSink operators. Note that FlinkSink internally consists of + * multiple operators (like writer, committer, dummy sink etc.) Actually operator uid will be + * appended with a suffix like "uidPrefix-writer".
+ * <br>
+ * If provided, this prefix is also applied to operator names. <br>
+ * <br>
+ * Flink auto generates operator uid if not set explicitly. It is a recommended + * best-practice to set uid for all operators before deploying to production. Flink has an + * option to {@code pipeline.auto-generate-uid=false} to disable auto-generation and force + * explicit setting of all operator uid. <br>
+ * <br>
+ * Be careful with setting this for an existing job, because now we are changing the operator + * uid from an auto-generated one to this new value. When deploying the change with a + * checkpoint, Flink won't be able to restore the previous Flink sink operator state (more + * specifically the committer operator state). You need to use {@code --allowNonRestoredState} + * to ignore the previous sink state. During restore Flink sink state is used to check if last + * commit was actually successful or not. {@code --allowNonRestoredState} can lead to data loss + * if the Iceberg commit failed in the last completed checkpoint. * * @param newPrefix prefix for Flink sink operator uid and name * @return {@link Builder} to connect the iceberg table. @@ -306,7 +313,8 @@ public Builder setSnapshotProperty(String property, String value) { } private DataStreamSink chainIcebergOperators() { - Preconditions.checkArgument(inputCreator != null, + Preconditions.checkArgument( + inputCreator != null, "Please use forRowData() or forMapperOutputType() to initialize the input DataStream."); Preconditions.checkNotNull(tableLoader, "Table loader shouldn't be null"); @@ -317,7 +325,8 @@ private DataStreamSink chainIcebergOperators() { try (TableLoader loader = tableLoader) { this.table = loader.loadTable(); } catch (IOException e) { - throw new UncheckedIOException("Failed to load iceberg table from table loader: " + tableLoader, e); + throw new UncheckedIOException( + "Failed to load iceberg table from table loader: " + tableLoader, e); } } @@ -329,13 +338,15 @@ private DataStreamSink chainIcebergOperators() { // Convert the requested flink table schema to flink row type. RowType flinkRowType = toFlinkRowType(table.schema(), tableSchema); - // Distribute the records from input data stream based on the write.distribution-mode and equality fields. - DataStream distributeStream = distributeDataStream( - rowDataInput, equalityFieldIds, table.spec(), table.schema(), flinkRowType); + // Distribute the records from input data stream based on the write.distribution-mode and + // equality fields. 
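Illustrative sketch, not part of the patch: the builder Javadoc above (tableLoader, equalityFieldColumns, upsert, distributionMode, writeParallelism, uidPrefix) is easier to follow next to a usage example. Everything below is an assumption made for illustration: the Hadoop table location, the equality column "id", and the single demo row; upsert(true) additionally requires the target table to already exist as a format-v2 table.

import java.util.Collections;
import org.apache.flink.api.common.typeinfo.TypeInformation;
import org.apache.flink.streaming.api.datastream.DataStream;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.table.data.GenericRowData;
import org.apache.flink.table.data.RowData;
import org.apache.flink.table.data.StringData;
import org.apache.iceberg.DistributionMode;
import org.apache.iceberg.flink.TableLoader;
import org.apache.iceberg.flink.sink.FlinkSink;

public class FlinkSinkSketch {
  public static void main(String[] args) throws Exception {
    StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();

    // One-element demo stream; a real job would consume Kafka, CDC, etc.
    DataStream<RowData> rows =
        env.fromElements((RowData) GenericRowData.of(1L, StringData.fromString("a")))
            .returns(TypeInformation.of(RowData.class));

    // Hypothetical table location; the Iceberg table must already exist.
    TableLoader tableLoader = TableLoader.fromHadoopTable("hdfs://nn:8020/warehouse/db/t");

    FlinkSink.forRowData(rows)
        .tableLoader(tableLoader)
        .equalityFieldColumns(Collections.singletonList("id"))
        .upsert(true)
        .distributionMode(DistributionMode.HASH)
        .writeParallelism(2)
        .uidPrefix("demo-iceberg-sink")
        .append();

    env.execute("flink-sink-sketch");
  }
}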
+ DataStream distributeStream = + distributeDataStream( + rowDataInput, equalityFieldIds, table.spec(), table.schema(), flinkRowType); // Add parallel writers that append rows to files - SingleOutputStreamOperator writerStream = appendWriter(distributeStream, flinkRowType, - equalityFieldIds); + SingleOutputStreamOperator writerStream = + appendWriter(distributeStream, flinkRowType, equalityFieldIds); // Add single-parallelism committer that commits files // after successful checkpoint or end of input @@ -362,18 +373,24 @@ private String operatorName(String suffix) { List checkAndGetEqualityFieldIds() { List equalityFieldIds = Lists.newArrayList(table.schema().identifierFieldIds()); if (equalityFieldColumns != null && equalityFieldColumns.size() > 0) { - Set equalityFieldSet = Sets.newHashSetWithExpectedSize(equalityFieldColumns.size()); + Set equalityFieldSet = + Sets.newHashSetWithExpectedSize(equalityFieldColumns.size()); for (String column : equalityFieldColumns) { org.apache.iceberg.types.Types.NestedField field = table.schema().findField(column); - Preconditions.checkNotNull(field, "Missing required equality field column '%s' in table schema %s", - column, table.schema()); + Preconditions.checkNotNull( + field, + "Missing required equality field column '%s' in table schema %s", + column, + table.schema()); equalityFieldSet.add(field.fieldId()); } if (!equalityFieldSet.equals(table.schema().identifierFieldIds())) { - LOG.warn("The configured equality field column IDs {} are not matched with the schema identifier field IDs" + - " {}, use job specified equality field columns as the equality fields by default.", - equalityFieldSet, table.schema().identifierFieldIds()); + LOG.warn( + "The configured equality field column IDs {} are not matched with the schema identifier field IDs" + + " {}, use job specified equality field columns as the equality fields by default.", + equalityFieldSet, + table.schema().identifierFieldIds()); } equalityFieldIds = Lists.newArrayList(equalityFieldSet); } @@ -381,66 +398,82 @@ List checkAndGetEqualityFieldIds() { } @SuppressWarnings("unchecked") - private DataStreamSink appendDummySink(SingleOutputStreamOperator committerStream) { - DataStreamSink resultStream = committerStream - .addSink(new DiscardingSink()) - .name(operatorName(String.format("IcebergSink %s", this.table.name()))) - .setParallelism(1); + private DataStreamSink appendDummySink( + SingleOutputStreamOperator committerStream) { + DataStreamSink resultStream = + committerStream + .addSink(new DiscardingSink()) + .name(operatorName(String.format("IcebergSink %s", this.table.name()))) + .setParallelism(1); if (uidPrefix != null) { resultStream = resultStream.uid(uidPrefix + "-dummysink"); } return resultStream; } - private SingleOutputStreamOperator appendCommitter(SingleOutputStreamOperator writerStream) { - IcebergFilesCommitter filesCommitter = new IcebergFilesCommitter( - tableLoader, flinkWriteConf.overwriteMode(), snapshotProperties, - flinkWriteConf.workerPoolSize()); - SingleOutputStreamOperator committerStream = writerStream - .transform(operatorName(ICEBERG_FILES_COMMITTER_NAME), Types.VOID, filesCommitter) - .setParallelism(1) - .setMaxParallelism(1); + private SingleOutputStreamOperator appendCommitter( + SingleOutputStreamOperator writerStream) { + IcebergFilesCommitter filesCommitter = + new IcebergFilesCommitter( + tableLoader, + flinkWriteConf.overwriteMode(), + snapshotProperties, + flinkWriteConf.workerPoolSize()); + SingleOutputStreamOperator committerStream = + writerStream + 
.transform(operatorName(ICEBERG_FILES_COMMITTER_NAME), Types.VOID, filesCommitter) + .setParallelism(1) + .setMaxParallelism(1); if (uidPrefix != null) { committerStream = committerStream.uid(uidPrefix + "-committer"); } return committerStream; } - private SingleOutputStreamOperator appendWriter(DataStream input, RowType flinkRowType, - List equalityFieldIds) { + private SingleOutputStreamOperator appendWriter( + DataStream input, RowType flinkRowType, List equalityFieldIds) { // Validate the equality fields and partition fields if we enable the upsert mode. if (flinkWriteConf.upsertMode()) { - Preconditions.checkState(!flinkWriteConf.overwriteMode(), + Preconditions.checkState( + !flinkWriteConf.overwriteMode(), "OVERWRITE mode shouldn't be enable when configuring to use UPSERT data stream."); - Preconditions.checkState(!equalityFieldIds.isEmpty(), + Preconditions.checkState( + !equalityFieldIds.isEmpty(), "Equality field columns shouldn't be empty when configuring to use UPSERT data stream."); if (!table.spec().isUnpartitioned()) { for (PartitionField partitionField : table.spec().fields()) { - Preconditions.checkState(equalityFieldIds.contains(partitionField.sourceId()), + Preconditions.checkState( + equalityFieldIds.contains(partitionField.sourceId()), "In UPSERT mode, partition field '%s' should be included in equality fields: '%s'", - partitionField, equalityFieldColumns); + partitionField, + equalityFieldColumns); } } } - IcebergStreamWriter streamWriter = createStreamWriter(table, flinkWriteConf, - flinkRowType, equalityFieldIds); + IcebergStreamWriter streamWriter = + createStreamWriter(table, flinkWriteConf, flinkRowType, equalityFieldIds); int parallelism = writeParallelism == null ? input.getParallelism() : writeParallelism; - SingleOutputStreamOperator writerStream = input - .transform(operatorName(ICEBERG_STREAM_WRITER_NAME), TypeInformation.of(WriteResult.class), streamWriter) - .setParallelism(parallelism); + SingleOutputStreamOperator writerStream = + input + .transform( + operatorName(ICEBERG_STREAM_WRITER_NAME), + TypeInformation.of(WriteResult.class), + streamWriter) + .setParallelism(parallelism); if (uidPrefix != null) { writerStream = writerStream.uid(uidPrefix + "-writer"); } return writerStream; } - private DataStream distributeDataStream(DataStream input, - List equalityFieldIds, - PartitionSpec partitionSpec, - Schema iSchema, - RowType flinkRowType) { + private DataStream distributeDataStream( + DataStream input, + List equalityFieldIds, + PartitionSpec partitionSpec, + Schema iSchema, + RowType flinkRowType) { DistributionMode writeMode = flinkWriteConf.distributionMode(); LOG.info("Write distribution mode is '{}'", writeMode.modeName()); @@ -450,28 +483,35 @@ private DataStream distributeDataStream(DataStream input, return input; } else { LOG.info("Distribute rows by equality fields, because there are equality fields set"); - return input.keyBy(new EqualityFieldKeySelector(iSchema, flinkRowType, equalityFieldIds)); + return input.keyBy( + new EqualityFieldKeySelector(iSchema, flinkRowType, equalityFieldIds)); } case HASH: if (equalityFieldIds.isEmpty()) { if (partitionSpec.isUnpartitioned()) { - LOG.warn("Fallback to use 'none' distribution mode, because there are no equality fields set " + - "and table is unpartitioned"); + LOG.warn( + "Fallback to use 'none' distribution mode, because there are no equality fields set " + + "and table is unpartitioned"); return input; } else { return input.keyBy(new PartitionKeySelector(partitionSpec, iSchema, flinkRowType)); 
} } else { if (partitionSpec.isUnpartitioned()) { - LOG.info("Distribute rows by equality fields, because there are equality fields set " + - "and table is unpartitioned"); - return input.keyBy(new EqualityFieldKeySelector(iSchema, flinkRowType, equalityFieldIds)); + LOG.info( + "Distribute rows by equality fields, because there are equality fields set " + + "and table is unpartitioned"); + return input.keyBy( + new EqualityFieldKeySelector(iSchema, flinkRowType, equalityFieldIds)); } else { for (PartitionField partitionField : partitionSpec.fields()) { - Preconditions.checkState(equalityFieldIds.contains(partitionField.sourceId()), - "In 'hash' distribution mode with equality fields set, partition field '%s' " + - "should be included in equality fields: '%s'", partitionField, equalityFieldColumns); + Preconditions.checkState( + equalityFieldIds.contains(partitionField.sourceId()), + "In 'hash' distribution mode with equality fields set, partition field '%s' " + + "should be included in equality fields: '%s'", + partitionField, + equalityFieldColumns); } return input.keyBy(new PartitionKeySelector(partitionSpec, iSchema, flinkRowType)); } @@ -479,13 +519,18 @@ private DataStream distributeDataStream(DataStream input, case RANGE: if (equalityFieldIds.isEmpty()) { - LOG.warn("Fallback to use 'none' distribution mode, because there are no equality fields set " + - "and {}=range is not supported yet in flink", WRITE_DISTRIBUTION_MODE); + LOG.warn( + "Fallback to use 'none' distribution mode, because there are no equality fields set " + + "and {}=range is not supported yet in flink", + WRITE_DISTRIBUTION_MODE); return input; } else { - LOG.info("Distribute rows by equality fields, because there are equality fields set " + - "and{}=range is not supported yet in flink", WRITE_DISTRIBUTION_MODE); - return input.keyBy(new EqualityFieldKeySelector(iSchema, flinkRowType, equalityFieldIds)); + LOG.info( + "Distribute rows by equality fields, because there are equality fields set " + + "and{}=range is not supported yet in flink", + WRITE_DISTRIBUTION_MODE); + return input.keyBy( + new EqualityFieldKeySelector(iSchema, flinkRowType, equalityFieldIds)); } default: @@ -496,13 +541,17 @@ private DataStream distributeDataStream(DataStream input, static RowType toFlinkRowType(Schema schema, TableSchema requestedSchema) { if (requestedSchema != null) { - // Convert the flink schema to iceberg schema firstly, then reassign ids to match the existing iceberg schema. + // Convert the flink schema to iceberg schema firstly, then reassign ids to match the existing + // iceberg schema. Schema writeSchema = TypeUtil.reassignIds(FlinkSchemaUtil.convert(requestedSchema), schema); TypeUtil.validateWriteSchema(schema, writeSchema, true, true); - // We use this flink schema to read values from RowData. The flink's TINYINT and SMALLINT will be promoted to - // iceberg INTEGER, that means if we use iceberg's table schema to read TINYINT (backend by 1 'byte'), we will - // read 4 bytes rather than 1 byte, it will mess up the byte array in BinaryRowData. So here we must use flink + // We use this flink schema to read values from RowData. The flink's TINYINT and SMALLINT will + // be promoted to + // iceberg INTEGER, that means if we use iceberg's table schema to read TINYINT (backend by 1 + // 'byte'), we will + // read 4 bytes rather than 1 byte, it will mess up the byte array in BinaryRowData. So here + // we must use flink // schema. 
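Illustrative sketch, not part of the patch: a minimal demonstration of the TINYINT/SMALLINT promotion that the comment above describes, using a made-up single-column schema named "b". It shows why toFlinkRowType() keeps the requested Flink row type instead of converting the Iceberg schema back.

import org.apache.flink.table.api.DataTypes;
import org.apache.flink.table.api.TableSchema;
import org.apache.flink.table.types.logical.RowType;
import org.apache.iceberg.Schema;
import org.apache.iceberg.flink.FlinkSchemaUtil;

public class TinyIntPromotionSketch {
  public static void main(String[] args) {
    TableSchema requested = TableSchema.builder().field("b", DataTypes.TINYINT()).build();

    // Converting to Iceberg promotes TINYINT to Iceberg's 4-byte integer type.
    Schema iceberg = FlinkSchemaUtil.convert(requested);
    System.out.println(iceberg.findField("b").type()); // int

    // Converting back therefore yields INT, not the original TINYINT ...
    RowType roundTripped = FlinkSchemaUtil.convert(iceberg);
    System.out.println(roundTripped.getTypeAt(0)); // INT

    // ... so the writer must keep the requested Flink type to read 1-byte fields correctly.
    RowType original = (RowType) requested.toRowDataType().getLogicalType();
    System.out.println(original.getTypeAt(0)); // TINYINT
  }
}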
return (RowType) requestedSchema.toRowDataType().getLogicalType(); } else { @@ -510,16 +559,22 @@ static RowType toFlinkRowType(Schema schema, TableSchema requestedSchema) { } } - static IcebergStreamWriter createStreamWriter(Table table, - FlinkWriteConf flinkWriteConf, - RowType flinkRowType, - List equalityFieldIds) { + static IcebergStreamWriter createStreamWriter( + Table table, + FlinkWriteConf flinkWriteConf, + RowType flinkRowType, + List equalityFieldIds) { Preconditions.checkArgument(table != null, "Iceberg table shouldn't be null"); Table serializableTable = SerializableTable.copyOf(table); - TaskWriterFactory taskWriterFactory = new RowDataTaskWriterFactory( - serializableTable, flinkRowType, flinkWriteConf.targetDataFileSize(), - flinkWriteConf.dataFileFormat(), equalityFieldIds, flinkWriteConf.upsertMode()); + TaskWriterFactory taskWriterFactory = + new RowDataTaskWriterFactory( + serializableTable, + flinkRowType, + flinkWriteConf.targetDataFileSize(), + flinkWriteConf.dataFileFormat(), + equalityFieldIds, + flinkWriteConf.upsertMode()); return new IcebergStreamWriter<>(table.name(), taskWriterFactory); } } diff --git a/flink/v1.14/flink/src/main/java/org/apache/iceberg/flink/sink/IcebergFilesCommitter.java b/flink/v1.14/flink/src/main/java/org/apache/iceberg/flink/sink/IcebergFilesCommitter.java index bd332d96d466..c5b7643b27a7 100644 --- a/flink/v1.14/flink/src/main/java/org/apache/iceberg/flink/sink/IcebergFilesCommitter.java +++ b/flink/v1.14/flink/src/main/java/org/apache/iceberg/flink/sink/IcebergFilesCommitter.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.flink.sink; import java.io.IOException; @@ -70,9 +69,12 @@ class IcebergFilesCommitter extends AbstractStreamOperator private static final Logger LOG = LoggerFactory.getLogger(IcebergFilesCommitter.class); private static final String FLINK_JOB_ID = "flink.job-id"; - // The max checkpoint id we've committed to iceberg table. As the flink's checkpoint is always increasing, so we could - // correctly commit all the data files whose checkpoint id is greater than the max committed one to iceberg table, for - // avoiding committing the same data files twice. This id will be attached to iceberg's meta when committing the + // The max checkpoint id we've committed to iceberg table. As the flink's checkpoint is always + // increasing, so we could + // correctly commit all the data files whose checkpoint id is greater than the max committed one + // to iceberg table, for + // avoiding committing the same data files twice. This id will be attached to iceberg's meta when + // committing the // iceberg transaction. private static final String MAX_COMMITTED_CHECKPOINT_ID = "flink.max-committed-checkpoint-id"; static final String MAX_CONTINUOUS_EMPTY_COMMITS = "flink.max-continuous-empty-commits"; @@ -82,15 +84,21 @@ class IcebergFilesCommitter extends AbstractStreamOperator private final boolean replacePartitions; private final Map snapshotProperties; - // A sorted map to maintain the completed data files for each pending checkpointId (which have not been committed - // to iceberg table). We need a sorted map here because there's possible that few checkpoints snapshot failed, for - // example: the 1st checkpoint have 2 data files <1, >, the 2st checkpoint have 1 data files - // <2, >. Snapshot for checkpoint#1 interrupted because of network/disk failure etc, while we don't expect - // any data loss in iceberg table. 
So we keep the finished files <1, > in memory and retry to commit + // A sorted map to maintain the completed data files for each pending checkpointId (which have not + // been committed + // to iceberg table). We need a sorted map here because there's possible that few checkpoints + // snapshot failed, for + // example: the 1st checkpoint have 2 data files <1, >, the 2st checkpoint have 1 + // data files + // <2, >. Snapshot for checkpoint#1 interrupted because of network/disk failure etc, while + // we don't expect + // any data loss in iceberg table. So we keep the finished files <1, > in memory and + // retry to commit // iceberg table when the next checkpoint happen. private final NavigableMap dataFilesPerCheckpoint = Maps.newTreeMap(); - // The completed files cache for current checkpoint. Once the snapshot barrier received, it will be flushed to the + // The completed files cache for current checkpoint. Once the snapshot barrier received, it will + // be flushed to the // 'dataFilesPerCheckpoint'. private final List writeResultsOfCurrentCkpt = Lists.newArrayList(); @@ -101,22 +109,29 @@ class IcebergFilesCommitter extends AbstractStreamOperator private transient long maxCommittedCheckpointId; private transient int continuousEmptyCheckpoints; private transient int maxContinuousEmptyCommits; - // There're two cases that we restore from flink checkpoints: the first case is restoring from snapshot created by the - // same flink job; another case is restoring from snapshot created by another different job. For the second case, we - // need to maintain the old flink job's id in flink state backend to find the max-committed-checkpoint-id when + // There're two cases that we restore from flink checkpoints: the first case is restoring from + // snapshot created by the + // same flink job; another case is restoring from snapshot created by another different job. For + // the second case, we + // need to maintain the old flink job's id in flink state backend to find the + // max-committed-checkpoint-id when // traversing iceberg table's snapshots. - private static final ListStateDescriptor JOB_ID_DESCRIPTOR = new ListStateDescriptor<>( - "iceberg-flink-job-id", BasicTypeInfo.STRING_TYPE_INFO); + private static final ListStateDescriptor JOB_ID_DESCRIPTOR = + new ListStateDescriptor<>("iceberg-flink-job-id", BasicTypeInfo.STRING_TYPE_INFO); private transient ListState jobIdState; // All pending checkpoints states for this function. 
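A sketch of the snapshot walk that the comment above describes, using the summary property names defined in this class; the actual helper in IcebergFilesCommitter may differ in detail:

import java.util.Map;
import org.apache.iceberg.Snapshot;
import org.apache.iceberg.Table;

class MaxCommittedCheckpointSketch {
  // Walk snapshots from newest to oldest and return the committed checkpoint id recorded
  // for the given Flink job, or -1 if none is found (the committer's "not committed yet" value).
  static long maxCommittedCheckpointId(Table table, String flinkJobId) {
    Snapshot snapshot = table.currentSnapshot();
    while (snapshot != null) {
      Map<String, String> summary = snapshot.summary();
      if (flinkJobId.equals(summary.get("flink.job-id"))) {
        String value = summary.get("flink.max-committed-checkpoint-id");
        if (value != null) {
          return Long.parseLong(value);
        }
      }
      Long parentId = snapshot.parentId();
      snapshot = parentId == null ? null : table.snapshot(parentId);
    }
    return -1L;
  }
}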
- private static final ListStateDescriptor> STATE_DESCRIPTOR = buildStateDescriptor(); + private static final ListStateDescriptor> STATE_DESCRIPTOR = + buildStateDescriptor(); private transient ListState> checkpointsState; private final Integer workerPoolSize; private transient ExecutorService workerPool; - IcebergFilesCommitter(TableLoader tableLoader, boolean replacePartitions, Map snapshotProperties, - Integer workerPoolSize) { + IcebergFilesCommitter( + TableLoader tableLoader, + boolean replacePartitions, + Map snapshotProperties, + Integer workerPoolSize) { this.tableLoader = tableLoader; this.replacePartitions = replacePartitions; this.snapshotProperties = snapshotProperties; @@ -132,32 +147,37 @@ public void initializeState(StateInitializationContext context) throws Exception this.tableLoader.open(); this.table = tableLoader.loadTable(); - maxContinuousEmptyCommits = PropertyUtil.propertyAsInt(table.properties(), MAX_CONTINUOUS_EMPTY_COMMITS, 10); - Preconditions.checkArgument(maxContinuousEmptyCommits > 0, - MAX_CONTINUOUS_EMPTY_COMMITS + " must be positive"); + maxContinuousEmptyCommits = + PropertyUtil.propertyAsInt(table.properties(), MAX_CONTINUOUS_EMPTY_COMMITS, 10); + Preconditions.checkArgument( + maxContinuousEmptyCommits > 0, MAX_CONTINUOUS_EMPTY_COMMITS + " must be positive"); int subTaskId = getRuntimeContext().getIndexOfThisSubtask(); int attemptId = getRuntimeContext().getAttemptNumber(); String operatorUniqueId = getRuntimeContext().getOperatorUniqueID(); - this.manifestOutputFileFactory = FlinkManifestUtil.createOutputFileFactory(table, flinkJobId, operatorUniqueId, - subTaskId, attemptId); + this.manifestOutputFileFactory = + FlinkManifestUtil.createOutputFileFactory( + table, flinkJobId, operatorUniqueId, subTaskId, attemptId); this.maxCommittedCheckpointId = INITIAL_CHECKPOINT_ID; this.checkpointsState = context.getOperatorStateStore().getListState(STATE_DESCRIPTOR); this.jobIdState = context.getOperatorStateStore().getListState(JOB_ID_DESCRIPTOR); if (context.isRestored()) { String restoredFlinkJobId = jobIdState.get().iterator().next(); - Preconditions.checkState(!Strings.isNullOrEmpty(restoredFlinkJobId), + Preconditions.checkState( + !Strings.isNullOrEmpty(restoredFlinkJobId), "Flink job id parsed from checkpoint snapshot shouldn't be null or empty"); - // Since flink's checkpoint id will start from the max-committed-checkpoint-id + 1 in the new flink job even if - // it's restored from a snapshot created by another different flink job, so it's safe to assign the max committed + // Since flink's checkpoint id will start from the max-committed-checkpoint-id + 1 in the new + // flink job even if + // it's restored from a snapshot created by another different flink job, so it's safe to + // assign the max committed // checkpoint id from restored flink job to the current flink job. this.maxCommittedCheckpointId = getMaxCommittedCheckpointId(table, restoredFlinkJobId); - NavigableMap uncommittedDataFiles = Maps - .newTreeMap(checkpointsState.get().iterator().next()) - .tailMap(maxCommittedCheckpointId, false); + NavigableMap uncommittedDataFiles = + Maps.newTreeMap(checkpointsState.get().iterator().next()) + .tailMap(maxCommittedCheckpointId, false); if (!uncommittedDataFiles.isEmpty()) { // Committed all uncommitted data files from the old flink job to iceberg table. 
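Since flink.max-continuous-empty-commits is read from table properties with a default of 10, it can be tuned per table. A hedged sketch; the property name comes from the constant above, and the value 50 is arbitrary:

import org.apache.iceberg.Table;

class EmptyCommitTuningSketch {
  // Raising the threshold means fewer empty Iceberg snapshots when many consecutive
  // checkpoints carry no new data or delete files.
  static void allowMoreEmptyCheckpoints(Table table) {
    table.updateProperties()
        .set("flink.max-continuous-empty-commits", "50")
        .commit();
  }
}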
long maxUncommittedCheckpointId = uncommittedDataFiles.lastKey(); @@ -170,7 +190,10 @@ public void initializeState(StateInitializationContext context) throws Exception public void snapshotState(StateSnapshotContext context) throws Exception { super.snapshotState(context); long checkpointId = context.getCheckpointId(); - LOG.info("Start to flush snapshot state to state backend, table: {}, checkpointId: {}", table, checkpointId); + LOG.info( + "Start to flush snapshot state to state backend, table: {}, checkpointId: {}", + table, + checkpointId); // Update the checkpoint state. dataFilesPerCheckpoint.put(checkpointId, writeToManifest(checkpointId)); @@ -193,7 +216,8 @@ public void notifyCheckpointComplete(long checkpointId) throws Exception { // 2. snapshotState(ckpId+1); // 3. notifyCheckpointComplete(ckpId+1); // 4. notifyCheckpointComplete(ckpId); - // For step#4, we don't need to commit iceberg table again because in step#3 we've committed all the files, + // For step#4, we don't need to commit iceberg table again because in step#3 we've committed all + // the files, // Besides, we need to maintain the max-committed-checkpoint-id to be increasing. if (checkpointId > maxCommittedCheckpointId) { commitUpToCheckpoint(dataFilesPerCheckpoint, flinkJobId, checkpointId); @@ -201,9 +225,9 @@ public void notifyCheckpointComplete(long checkpointId) throws Exception { } } - private void commitUpToCheckpoint(NavigableMap deltaManifestsMap, - String newFlinkJobId, - long checkpointId) throws IOException { + private void commitUpToCheckpoint( + NavigableMap deltaManifestsMap, String newFlinkJobId, long checkpointId) + throws IOException { NavigableMap pendingMap = deltaManifestsMap.headMap(checkpointId, true); List manifests = Lists.newArrayList(); NavigableMap pendingResults = Maps.newTreeMap(); @@ -213,14 +237,18 @@ private void commitUpToCheckpoint(NavigableMap deltaManifestsMap, continue; } - DeltaManifests deltaManifests = SimpleVersionedSerialization - .readVersionAndDeSerialize(DeltaManifestsSerializer.INSTANCE, e.getValue()); - pendingResults.put(e.getKey(), FlinkManifestUtil.readCompletedFiles(deltaManifests, table.io())); + DeltaManifests deltaManifests = + SimpleVersionedSerialization.readVersionAndDeSerialize( + DeltaManifestsSerializer.INSTANCE, e.getValue()); + pendingResults.put( + e.getKey(), FlinkManifestUtil.readCompletedFiles(deltaManifests, table.io())); manifests.addAll(deltaManifests.manifests()); } - int totalFiles = pendingResults.values().stream() - .mapToInt(r -> r.dataFiles().length + r.deleteFiles().length).sum(); + int totalFiles = + pendingResults.values().stream() + .mapToInt(r -> r.dataFiles().length + r.deleteFiles().length) + .sum(); continuousEmptyCheckpoints = totalFiles == 0 ? continuousEmptyCheckpoints + 1 : 0; if (totalFiles != 0 || continuousEmptyCheckpoints % maxContinuousEmptyCommits == 0) { if (replacePartitions) { @@ -238,21 +266,25 @@ private void commitUpToCheckpoint(NavigableMap deltaManifestsMap, table.io().deleteFile(manifest.path()); } catch (Exception e) { // The flink manifests cleaning failure shouldn't abort the completed checkpoint. 
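The commit bookkeeping above leans on NavigableMap views; a self-contained illustration of the headMap pattern, with stand-in byte arrays rather than real serialized manifests:

import java.util.NavigableMap;
import java.util.TreeMap;

class CheckpointBookkeepingSketch {
  public static void main(String[] args) {
    // checkpointId -> serialized DeltaManifests (stand-in byte arrays here)
    NavigableMap<Long, byte[]> manifestsPerCheckpoint = new TreeMap<>();
    manifestsPerCheckpoint.put(3L, new byte[0]);
    manifestsPerCheckpoint.put(4L, new byte[0]);
    manifestsPerCheckpoint.put(5L, new byte[0]);

    long completedCheckpointId = 4L;

    // Everything up to and including the completed checkpoint is committed in one pass...
    NavigableMap<Long, byte[]> pending =
        manifestsPerCheckpoint.headMap(completedCheckpointId, true);
    System.out.println("committing checkpoints " + pending.keySet()); // [3, 4]

    // ...and then dropped; clearing the view also removes the entries from the backing map,
    // leaving only checkpoints newer than the committed one.
    pending.clear();
    System.out.println("still pending " + manifestsPerCheckpoint.keySet()); // [5]
  }
}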
- String details = MoreObjects.toStringHelper(this) - .add("flinkJobId", newFlinkJobId) - .add("checkpointId", checkpointId) - .add("manifestPath", manifest.path()) - .toString(); - LOG.warn("The iceberg transaction has been committed, but we failed to clean the temporary flink manifests: {}", - details, e); + String details = + MoreObjects.toStringHelper(this) + .add("flinkJobId", newFlinkJobId) + .add("checkpointId", checkpointId) + .add("manifestPath", manifest.path()) + .toString(); + LOG.warn( + "The iceberg transaction has been committed, but we failed to clean the temporary flink manifests: {}", + details, + e); } } } - private void replacePartitions(NavigableMap pendingResults, String newFlinkJobId, - long checkpointId) { + private void replacePartitions( + NavigableMap pendingResults, String newFlinkJobId, long checkpointId) { // Partition overwrite does not support delete files. - int deleteFilesNum = pendingResults.values().stream().mapToInt(r -> r.deleteFiles().length).sum(); + int deleteFilesNum = + pendingResults.values().stream().mapToInt(r -> r.deleteFiles().length).sum(); Preconditions.checkState(deleteFilesNum == 0, "Cannot overwrite partitions with delete files."); // Commit the overwrite transaction. @@ -260,17 +292,21 @@ private void replacePartitions(NavigableMap pendingResults, S int numFiles = 0; for (WriteResult result : pendingResults.values()) { - Preconditions.checkState(result.referencedDataFiles().length == 0, "Should have no referenced data files."); + Preconditions.checkState( + result.referencedDataFiles().length == 0, "Should have no referenced data files."); numFiles += result.dataFiles().length; Arrays.stream(result.dataFiles()).forEach(dynamicOverwrite::addFile); } - commitOperation(dynamicOverwrite, numFiles, 0, "dynamic partition overwrite", newFlinkJobId, checkpointId); + commitOperation( + dynamicOverwrite, numFiles, 0, "dynamic partition overwrite", newFlinkJobId, checkpointId); } - private void commitDeltaTxn(NavigableMap pendingResults, String newFlinkJobId, long checkpointId) { - int deleteFilesNum = pendingResults.values().stream().mapToInt(r -> r.deleteFiles().length).sum(); + private void commitDeltaTxn( + NavigableMap pendingResults, String newFlinkJobId, long checkpointId) { + int deleteFilesNum = + pendingResults.values().stream().mapToInt(r -> r.deleteFiles().length).sum(); if (deleteFilesNum == 0) { // To be compatible with iceberg format V1. @@ -278,7 +314,8 @@ private void commitDeltaTxn(NavigableMap pendingResults, Stri int numFiles = 0; for (WriteResult result : pendingResults.values()) { - Preconditions.checkState(result.referencedDataFiles().length == 0, "Should have no referenced data files."); + Preconditions.checkState( + result.referencedDataFiles().length == 0, "Should have no referenced data files."); numFiles += result.dataFiles().length; Arrays.stream(result.dataFiles()).forEach(appendFiles::appendFile); @@ -288,16 +325,23 @@ private void commitDeltaTxn(NavigableMap pendingResults, Stri } else { // To be compatible with iceberg format V2. for (Map.Entry e : pendingResults.entrySet()) { - // We don't commit the merged result into a single transaction because for the sequential transaction txn1 and - // txn2, the equality-delete files of txn2 are required to be applied to data files from txn1. Committing the + // We don't commit the merged result into a single transaction because for the sequential + // transaction txn1 and + // txn2, the equality-delete files of txn2 are required to be applied to data files from + // txn1. 
Committing the // merged one will lead to the incorrect delete semantic. WriteResult result = e.getValue(); - // Row delta validations are not needed for streaming changes that write equality deletes. Equality deletes - // are applied to data in all previous sequence numbers, so retries may push deletes further in the future, - // but do not affect correctness. Position deletes committed to the table in this path are used only to delete - // rows from data files that are being added in this commit. There is no way for data files added along with - // the delete files to be concurrently removed, so there is no need to validate the files referenced by the + // Row delta validations are not needed for streaming changes that write equality deletes. + // Equality deletes + // are applied to data in all previous sequence numbers, so retries may push deletes further + // in the future, + // but do not affect correctness. Position deletes committed to the table in this path are + // used only to delete + // rows from data files that are being added in this commit. There is no way for data files + // added along with + // the delete files to be concurrently removed, so there is no need to validate the files + // referenced by the // position delete files that are being committed. RowDelta rowDelta = table.newRowDelta().scanManifestsWith(workerPool); @@ -307,17 +351,28 @@ private void commitDeltaTxn(NavigableMap pendingResults, Stri int numDeleteFiles = result.deleteFiles().length; Arrays.stream(result.deleteFiles()).forEach(rowDelta::addDeletes); - commitOperation(rowDelta, numDataFiles, numDeleteFiles, "rowDelta", newFlinkJobId, e.getKey()); + commitOperation( + rowDelta, numDataFiles, numDeleteFiles, "rowDelta", newFlinkJobId, e.getKey()); } } } - private void commitOperation(SnapshotUpdate operation, int numDataFiles, int numDeleteFiles, String description, - String newFlinkJobId, long checkpointId) { - LOG.info("Committing {} with {} data files and {} delete files to table {}", description, numDataFiles, - numDeleteFiles, table); + private void commitOperation( + SnapshotUpdate operation, + int numDataFiles, + int numDeleteFiles, + String description, + String newFlinkJobId, + long checkpointId) { + LOG.info( + "Committing {} with {} data files and {} delete files to table {}", + description, + numDataFiles, + numDeleteFiles, + table); snapshotProperties.forEach(operation::set); - // custom snapshot metadata properties will be overridden if they conflict with internal ones used by the sink. + // custom snapshot metadata properties will be overridden if they conflict with internal ones + // used by the sink. operation.set(MAX_COMMITTED_CHECKPOINT_ID, Long.toString(checkpointId)); operation.set(FLINK_JOB_ID, newFlinkJobId); @@ -343,7 +398,8 @@ public void endInput() throws IOException { } /** - * Write all the complete data files to a newly created manifest file and return the manifest's avro serialized bytes. + * Write all the complete data files to a newly created manifest file and return the manifest's + * avro serialized bytes. 
*/ private byte[] writeToManifest(long checkpointId) throws IOException { if (writeResultsOfCurrentCkpt.isEmpty()) { @@ -351,10 +407,12 @@ private byte[] writeToManifest(long checkpointId) throws IOException { } WriteResult result = WriteResult.builder().addAll(writeResultsOfCurrentCkpt).build(); - DeltaManifests deltaManifests = FlinkManifestUtil.writeCompletedFiles(result, - () -> manifestOutputFileFactory.create(checkpointId), table.spec()); + DeltaManifests deltaManifests = + FlinkManifestUtil.writeCompletedFiles( + result, () -> manifestOutputFileFactory.create(checkpointId), table.spec()); - return SimpleVersionedSerialization.writeVersionAndSerialize(DeltaManifestsSerializer.INSTANCE, deltaManifests); + return SimpleVersionedSerialization.writeVersionAndSerialize( + DeltaManifestsSerializer.INSTANCE, deltaManifests); } @Override @@ -362,7 +420,8 @@ public void open() throws Exception { super.open(); final String operatorID = getRuntimeContext().getOperatorUniqueID(); - this.workerPool = ThreadPools.newWorkerPool("iceberg-worker-pool-" + operatorID, workerPoolSize); + this.workerPool = + ThreadPools.newWorkerPool("iceberg-worker-pool-" + operatorID, workerPoolSize); } @Override @@ -379,9 +438,11 @@ public void close() throws Exception { private static ListStateDescriptor> buildStateDescriptor() { Comparator longComparator = Comparators.forType(Types.LongType.get()); // Construct a SortedMapTypeInfo. - SortedMapTypeInfo sortedMapTypeInfo = new SortedMapTypeInfo<>( - BasicTypeInfo.LONG_TYPE_INFO, PrimitiveArrayTypeInfo.BYTE_PRIMITIVE_ARRAY_TYPE_INFO, longComparator - ); + SortedMapTypeInfo sortedMapTypeInfo = + new SortedMapTypeInfo<>( + BasicTypeInfo.LONG_TYPE_INFO, + PrimitiveArrayTypeInfo.BYTE_PRIMITIVE_ARRAY_TYPE_INFO, + longComparator); return new ListStateDescriptor<>("iceberg-files-committer-state", sortedMapTypeInfo); } diff --git a/flink/v1.14/flink/src/main/java/org/apache/iceberg/flink/sink/IcebergStreamWriter.java b/flink/v1.14/flink/src/main/java/org/apache/iceberg/flink/sink/IcebergStreamWriter.java index cc8e6ce8284f..693ecb06da67 100644 --- a/flink/v1.14/flink/src/main/java/org/apache/iceberg/flink/sink/IcebergStreamWriter.java +++ b/flink/v1.14/flink/src/main/java/org/apache/iceberg/flink/sink/IcebergStreamWriter.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.flink.sink; import java.io.IOException; @@ -83,7 +82,8 @@ public void close() throws Exception { @Override public void endInput() throws IOException { - // For bounded stream, it may don't enable the checkpoint mechanism so we'd better to emit the remaining + // For bounded stream, it may don't enable the checkpoint mechanism so we'd better to emit the + // remaining // completed files to downstream before closing the writer so that we won't miss any of them. emit(writer.complete()); } diff --git a/flink/v1.14/flink/src/main/java/org/apache/iceberg/flink/sink/ManifestOutputFileFactory.java b/flink/v1.14/flink/src/main/java/org/apache/iceberg/flink/sink/ManifestOutputFileFactory.java index b7d575bb446b..045e45a4ceae 100644 --- a/flink/v1.14/flink/src/main/java/org/apache/iceberg/flink/sink/ManifestOutputFileFactory.java +++ b/flink/v1.14/flink/src/main/java/org/apache/iceberg/flink/sink/ManifestOutputFileFactory.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.flink.sink; import java.util.Map; @@ -28,7 +27,8 @@ import org.apache.iceberg.relocated.com.google.common.base.Strings; class ManifestOutputFileFactory { - // Users could define their own flink manifests directory by setting this value in table properties. + // Users could define their own flink manifests directory by setting this value in table + // properties. static final String FLINK_MANIFEST_LOCATION = "flink.manifests.location"; private final TableOperations ops; @@ -40,8 +40,14 @@ class ManifestOutputFileFactory { private final long attemptNumber; private final AtomicInteger fileCount = new AtomicInteger(0); - ManifestOutputFileFactory(TableOperations ops, FileIO io, Map props, - String flinkJobId, String operatorUniqueId, int subTaskId, long attemptNumber) { + ManifestOutputFileFactory( + TableOperations ops, + FileIO io, + Map props, + String flinkJobId, + String operatorUniqueId, + int subTaskId, + long attemptNumber) { this.ops = ops; this.io = io; this.props = props; @@ -52,8 +58,15 @@ class ManifestOutputFileFactory { } private String generatePath(long checkpointId) { - return FileFormat.AVRO.addExtension(String.format("%s-%s-%05d-%d-%d-%05d", flinkJobId, operatorUniqueId, - subTaskId, attemptNumber, checkpointId, fileCount.incrementAndGet())); + return FileFormat.AVRO.addExtension( + String.format( + "%s-%s-%05d-%d-%d-%05d", + flinkJobId, + operatorUniqueId, + subTaskId, + attemptNumber, + checkpointId, + fileCount.incrementAndGet())); } OutputFile create(long checkpointId) { @@ -64,7 +77,8 @@ OutputFile create(long checkpointId) { // User don't specify any flink manifest directory, so just use the default metadata path. newManifestFullPath = ops.metadataFileLocation(generatePath(checkpointId)); } else { - newManifestFullPath = String.format("%s/%s", stripTrailingSlash(flinkManifestDir), generatePath(checkpointId)); + newManifestFullPath = + String.format("%s/%s", stripTrailingSlash(flinkManifestDir), generatePath(checkpointId)); } return io.newOutputFile(newManifestFullPath); diff --git a/flink/v1.14/flink/src/main/java/org/apache/iceberg/flink/sink/PartitionKeySelector.java b/flink/v1.14/flink/src/main/java/org/apache/iceberg/flink/sink/PartitionKeySelector.java index 598df09eee83..df951684b446 100644 --- a/flink/v1.14/flink/src/main/java/org/apache/iceberg/flink/sink/PartitionKeySelector.java +++ b/flink/v1.14/flink/src/main/java/org/apache/iceberg/flink/sink/PartitionKeySelector.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.flink.sink; import org.apache.flink.api.java.functions.KeySelector; @@ -28,8 +27,9 @@ import org.apache.iceberg.flink.RowDataWrapper; /** - * Create a {@link KeySelector} to shuffle by partition key, then each partition/bucket will be wrote by only one - * task. That will reduce lots of small files in partitioned fanout write policy for {@link FlinkSink}. + * Create a {@link KeySelector} to shuffle by partition key, then each partition/bucket will be + * wrote by only one task. That will reduce lots of small files in partitioned fanout write policy + * for {@link FlinkSink}. */ class PartitionKeySelector implements KeySelector { @@ -46,8 +46,8 @@ class PartitionKeySelector implements KeySelector { } /** - * Construct the {@link RowDataWrapper} lazily here because few members in it are not serializable. In this way, we - * don't have to serialize them with forcing. 
+ * Construct the {@link RowDataWrapper} lazily here because few members in it are not + * serializable. In this way, we don't have to serialize them with forcing. */ private RowDataWrapper lazyRowDataWrapper() { if (rowDataWrapper == null) { diff --git a/flink/v1.14/flink/src/main/java/org/apache/iceberg/flink/sink/PartitionedDeltaWriter.java b/flink/v1.14/flink/src/main/java/org/apache/iceberg/flink/sink/PartitionedDeltaWriter.java index 1eee6298e933..38062dd1a2c4 100644 --- a/flink/v1.14/flink/src/main/java/org/apache/iceberg/flink/sink/PartitionedDeltaWriter.java +++ b/flink/v1.14/flink/src/main/java/org/apache/iceberg/flink/sink/PartitionedDeltaWriter.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.flink.sink; import java.io.IOException; @@ -41,17 +40,27 @@ class PartitionedDeltaWriter extends BaseDeltaTaskWriter { private final Map writers = Maps.newHashMap(); - PartitionedDeltaWriter(PartitionSpec spec, - FileFormat format, - FileAppenderFactory appenderFactory, - OutputFileFactory fileFactory, - FileIO io, - long targetFileSize, - Schema schema, - RowType flinkSchema, - List equalityFieldIds, - boolean upsert) { - super(spec, format, appenderFactory, fileFactory, io, targetFileSize, schema, flinkSchema, equalityFieldIds, + PartitionedDeltaWriter( + PartitionSpec spec, + FileFormat format, + FileAppenderFactory appenderFactory, + OutputFileFactory fileFactory, + FileIO io, + long targetFileSize, + Schema schema, + RowType flinkSchema, + List equalityFieldIds, + boolean upsert) { + super( + spec, + format, + appenderFactory, + fileFactory, + io, + targetFileSize, + schema, + flinkSchema, + equalityFieldIds, upsert); this.partitionKey = new PartitionKey(spec, schema); } @@ -62,7 +71,8 @@ RowDataDeltaWriter route(RowData row) { RowDataDeltaWriter writer = writers.get(partitionKey); if (writer == null) { - // NOTICE: we need to copy a new partition key here, in case of messing up the keys in writers. + // NOTICE: we need to copy a new partition key here, in case of messing up the keys in + // writers. PartitionKey copiedKey = partitionKey.copy(); writer = new RowDataDeltaWriter(copiedKey); writers.put(copiedKey, writer); diff --git a/flink/v1.14/flink/src/main/java/org/apache/iceberg/flink/sink/RowDataTaskWriterFactory.java b/flink/v1.14/flink/src/main/java/org/apache/iceberg/flink/sink/RowDataTaskWriterFactory.java index f6ee976e637f..1c330434d019 100644 --- a/flink/v1.14/flink/src/main/java/org/apache/iceberg/flink/sink/RowDataTaskWriterFactory.java +++ b/flink/v1.14/flink/src/main/java/org/apache/iceberg/flink/sink/RowDataTaskWriterFactory.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.flink.sink; import java.util.List; @@ -53,12 +52,13 @@ public class RowDataTaskWriterFactory implements TaskWriterFactory { private transient OutputFileFactory outputFileFactory; - public RowDataTaskWriterFactory(Table table, - RowType flinkSchema, - long targetFileSizeBytes, - FileFormat format, - List equalityFieldIds, - boolean upsert) { + public RowDataTaskWriterFactory( + Table table, + RowType flinkSchema, + long targetFileSizeBytes, + FileFormat format, + List equalityFieldIds, + boolean upsert) { this.table = table; this.schema = table.schema(); this.flinkSchema = flinkSchema; @@ -70,47 +70,90 @@ public RowDataTaskWriterFactory(Table table, this.upsert = upsert; if (equalityFieldIds == null || equalityFieldIds.isEmpty()) { - this.appenderFactory = new FlinkAppenderFactory(schema, flinkSchema, table.properties(), spec); + this.appenderFactory = + new FlinkAppenderFactory(schema, flinkSchema, table.properties(), spec); } else if (upsert) { - // In upsert mode, only the new row is emitted using INSERT row kind. Therefore, any column of the inserted row - // may differ from the deleted row other than the primary key fields, and the delete file must contain values + // In upsert mode, only the new row is emitted using INSERT row kind. Therefore, any column of + // the inserted row + // may differ from the deleted row other than the primary key fields, and the delete file must + // contain values // that are correct for the deleted row. Therefore, only write the equality delete fields. - this.appenderFactory = new FlinkAppenderFactory(schema, flinkSchema, table.properties(), spec, - ArrayUtil.toIntArray(equalityFieldIds), TypeUtil.select(schema, Sets.newHashSet(equalityFieldIds)), null); + this.appenderFactory = + new FlinkAppenderFactory( + schema, + flinkSchema, + table.properties(), + spec, + ArrayUtil.toIntArray(equalityFieldIds), + TypeUtil.select(schema, Sets.newHashSet(equalityFieldIds)), + null); } else { - this.appenderFactory = new FlinkAppenderFactory(schema, flinkSchema, table.properties(), spec, - ArrayUtil.toIntArray(equalityFieldIds), schema, null); + this.appenderFactory = + new FlinkAppenderFactory( + schema, + flinkSchema, + table.properties(), + spec, + ArrayUtil.toIntArray(equalityFieldIds), + schema, + null); } } @Override public void initialize(int taskId, int attemptId) { - this.outputFileFactory = OutputFileFactory.builderFor(table, taskId, attemptId) - .format(format) - .build(); + this.outputFileFactory = + OutputFileFactory.builderFor(table, taskId, attemptId).format(format).build(); } @Override public TaskWriter create() { - Preconditions.checkNotNull(outputFileFactory, + Preconditions.checkNotNull( + outputFileFactory, "The outputFileFactory shouldn't be null if we have invoked the initialize()."); if (equalityFieldIds == null || equalityFieldIds.isEmpty()) { // Initialize a task writer to write INSERT only. 
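To see what the upsert branch above writes into equality-delete files, TypeUtil.select can be used to project the delete schema; a small sketch with a made-up table schema:

import java.util.Set;
import org.apache.iceberg.Schema;
import org.apache.iceberg.relocated.com.google.common.collect.Sets;
import org.apache.iceberg.types.TypeUtil;
import org.apache.iceberg.types.Types;

public class EqualityDeleteSchemaSketch {
  public static void main(String[] args) {
    Schema tableSchema =
        new Schema(
            Types.NestedField.required(1, "id", Types.LongType.get()),
            Types.NestedField.optional(2, "data", Types.StringType.get()),
            Types.NestedField.optional(3, "ts", Types.TimestampType.withZone()));

    // In upsert mode only the equality fields are written into equality-delete files,
    // because the other columns of the deleted row are not known at delete time.
    Set<Integer> equalityFieldIds = Sets.newHashSet(1);
    Schema deleteSchema = TypeUtil.select(tableSchema, equalityFieldIds);
    System.out.println(deleteSchema); // contains only the "id" column
  }
}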
if (spec.isUnpartitioned()) { - return new UnpartitionedWriter<>(spec, format, appenderFactory, outputFileFactory, io, targetFileSizeBytes); + return new UnpartitionedWriter<>( + spec, format, appenderFactory, outputFileFactory, io, targetFileSizeBytes); } else { - return new RowDataPartitionedFanoutWriter(spec, format, appenderFactory, outputFileFactory, - io, targetFileSizeBytes, schema, flinkSchema); + return new RowDataPartitionedFanoutWriter( + spec, + format, + appenderFactory, + outputFileFactory, + io, + targetFileSizeBytes, + schema, + flinkSchema); } } else { // Initialize a task writer to write both INSERT and equality DELETE. if (spec.isUnpartitioned()) { - return new UnpartitionedDeltaWriter(spec, format, appenderFactory, outputFileFactory, io, - targetFileSizeBytes, schema, flinkSchema, equalityFieldIds, upsert); + return new UnpartitionedDeltaWriter( + spec, + format, + appenderFactory, + outputFileFactory, + io, + targetFileSizeBytes, + schema, + flinkSchema, + equalityFieldIds, + upsert); } else { - return new PartitionedDeltaWriter(spec, format, appenderFactory, outputFileFactory, io, - targetFileSizeBytes, schema, flinkSchema, equalityFieldIds, upsert); + return new PartitionedDeltaWriter( + spec, + format, + appenderFactory, + outputFileFactory, + io, + targetFileSizeBytes, + schema, + flinkSchema, + equalityFieldIds, + upsert); } } } @@ -120,9 +163,15 @@ private static class RowDataPartitionedFanoutWriter extends PartitionedFanoutWri private final PartitionKey partitionKey; private final RowDataWrapper rowDataWrapper; - RowDataPartitionedFanoutWriter(PartitionSpec spec, FileFormat format, FileAppenderFactory appenderFactory, - OutputFileFactory fileFactory, FileIO io, long targetFileSize, Schema schema, - RowType flinkSchema) { + RowDataPartitionedFanoutWriter( + PartitionSpec spec, + FileFormat format, + FileAppenderFactory appenderFactory, + OutputFileFactory fileFactory, + FileIO io, + long targetFileSize, + Schema schema, + RowType flinkSchema) { super(spec, format, appenderFactory, fileFactory, io, targetFileSize); this.partitionKey = new PartitionKey(spec, schema); this.rowDataWrapper = new RowDataWrapper(flinkSchema, schema.asStruct()); diff --git a/flink/v1.14/flink/src/main/java/org/apache/iceberg/flink/sink/TaskWriterFactory.java b/flink/v1.14/flink/src/main/java/org/apache/iceberg/flink/sink/TaskWriterFactory.java index 9d56ec6a812a..e3a1245e8cbd 100644 --- a/flink/v1.14/flink/src/main/java/org/apache/iceberg/flink/sink/TaskWriterFactory.java +++ b/flink/v1.14/flink/src/main/java/org/apache/iceberg/flink/sink/TaskWriterFactory.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.flink.sink; import java.io.Serializable; @@ -32,7 +31,7 @@ public interface TaskWriterFactory extends Serializable { /** * Initialize the factory with a given taskId and attemptId. * - * @param taskId the identifier of task. + * @param taskId the identifier of task. * @param attemptId the attempt id of this task. 
*/ void initialize(int taskId, int attemptId); diff --git a/flink/v1.14/flink/src/main/java/org/apache/iceberg/flink/sink/UnpartitionedDeltaWriter.java b/flink/v1.14/flink/src/main/java/org/apache/iceberg/flink/sink/UnpartitionedDeltaWriter.java index 331ed7c78192..7680fb933b20 100644 --- a/flink/v1.14/flink/src/main/java/org/apache/iceberg/flink/sink/UnpartitionedDeltaWriter.java +++ b/flink/v1.14/flink/src/main/java/org/apache/iceberg/flink/sink/UnpartitionedDeltaWriter.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.flink.sink; import java.io.IOException; @@ -33,17 +32,27 @@ class UnpartitionedDeltaWriter extends BaseDeltaTaskWriter { private final RowDataDeltaWriter writer; - UnpartitionedDeltaWriter(PartitionSpec spec, - FileFormat format, - FileAppenderFactory appenderFactory, - OutputFileFactory fileFactory, - FileIO io, - long targetFileSize, - Schema schema, - RowType flinkSchema, - List equalityFieldIds, - boolean upsert) { - super(spec, format, appenderFactory, fileFactory, io, targetFileSize, schema, flinkSchema, equalityFieldIds, + UnpartitionedDeltaWriter( + PartitionSpec spec, + FileFormat format, + FileAppenderFactory appenderFactory, + OutputFileFactory fileFactory, + FileIO io, + long targetFileSize, + Schema schema, + RowType flinkSchema, + List equalityFieldIds, + boolean upsert) { + super( + spec, + format, + appenderFactory, + fileFactory, + io, + targetFileSize, + schema, + flinkSchema, + equalityFieldIds, upsert); this.writer = new RowDataDeltaWriter(null); } diff --git a/flink/v1.14/flink/src/main/java/org/apache/iceberg/flink/source/DataIterator.java b/flink/v1.14/flink/src/main/java/org/apache/iceberg/flink/source/DataIterator.java index 805cea50131b..91d975349b19 100644 --- a/flink/v1.14/flink/src/main/java/org/apache/iceberg/flink/source/DataIterator.java +++ b/flink/v1.14/flink/src/main/java/org/apache/iceberg/flink/source/DataIterator.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.flink.source; import java.io.IOException; @@ -49,8 +48,11 @@ public class DataIterator implements CloseableIterator { private int fileOffset; private long recordOffset; - public DataIterator(FileScanTaskReader fileScanTaskReader, CombinedScanTask task, - FileIO io, EncryptionManager encryption) { + public DataIterator( + FileScanTaskReader fileScanTaskReader, + CombinedScanTask task, + FileIO io, + EncryptionManager encryption) { this.fileScanTaskReader = fileScanTaskReader; this.inputFilesDecryptor = new InputFilesDecryptor(task, io, encryption); @@ -67,17 +69,20 @@ public DataIterator(FileScanTaskReader fileScanTaskReader, CombinedScanTask t } /** - * (startingFileOffset, startingRecordOffset) points to the next row that reader should resume from. - * E.g., if the seek position is (file=0, record=1), seek moves the iterator position to the 2nd row - * in file 0. When next() is called after seek, 2nd row from file 0 should be returned. + * (startingFileOffset, startingRecordOffset) points to the next row that reader should resume + * from. E.g., if the seek position is (file=0, record=1), seek moves the iterator position to the + * 2nd row in file 0. When next() is called after seek, 2nd row from file 0 should be returned. 
*/ public void seek(int startingFileOffset, long startingRecordOffset) { - Preconditions.checkState(fileOffset == -1, - "Seek should be called before any other iterator actions"); + Preconditions.checkState( + fileOffset == -1, "Seek should be called before any other iterator actions"); // skip files - Preconditions.checkState(startingFileOffset < combinedTask.files().size(), + Preconditions.checkState( + startingFileOffset < combinedTask.files().size(), "Invalid starting file offset %s for combined scan task with %s files: %s", - startingFileOffset, combinedTask.files().size(), combinedTask); + startingFileOffset, + combinedTask.files().size(), + combinedTask); for (long i = 0L; i < startingFileOffset; ++i) { tasks.next(); } @@ -88,9 +93,10 @@ public void seek(int startingFileOffset, long startingRecordOffset) { if (currentFileHasNext() && hasNext()) { next(); } else { - throw new IllegalStateException(String.format( - "Invalid starting record offset %d for file %d from CombinedScanTask: %s", - startingRecordOffset, startingFileOffset, combinedTask)); + throw new IllegalStateException( + String.format( + "Invalid starting record offset %d for file %d from CombinedScanTask: %s", + startingRecordOffset, startingFileOffset, combinedTask)); } } @@ -115,10 +121,7 @@ public boolean currentFileHasNext() { return currentIterator.hasNext(); } - /** - * Updates the current iterator field to ensure that the current Iterator - * is not exhausted. - */ + /** Updates the current iterator field to ensure that the current Iterator is not exhausted. */ private void updateCurrentIterator() { try { while (!currentIterator.hasNext() && tasks.hasNext()) { diff --git a/flink/v1.14/flink/src/main/java/org/apache/iceberg/flink/source/FileScanTaskReader.java b/flink/v1.14/flink/src/main/java/org/apache/iceberg/flink/source/FileScanTaskReader.java index 04273016ee2d..927a804a4792 100644 --- a/flink/v1.14/flink/src/main/java/org/apache/iceberg/flink/source/FileScanTaskReader.java +++ b/flink/v1.14/flink/src/main/java/org/apache/iceberg/flink/source/FileScanTaskReader.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.flink.source; import java.io.Serializable; diff --git a/flink/v1.14/flink/src/main/java/org/apache/iceberg/flink/source/FlinkInputFormat.java b/flink/v1.14/flink/src/main/java/org/apache/iceberg/flink/source/FlinkInputFormat.java index ee71ab7fe594..44b35522becb 100644 --- a/flink/v1.14/flink/src/main/java/org/apache/iceberg/flink/source/FlinkInputFormat.java +++ b/flink/v1.14/flink/src/main/java/org/apache/iceberg/flink/source/FlinkInputFormat.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.flink.source; import java.io.IOException; @@ -37,9 +36,7 @@ import org.apache.iceberg.relocated.com.google.common.annotations.VisibleForTesting; import org.apache.iceberg.util.ThreadPools; -/** - * Flink {@link InputFormat} for Iceberg. - */ +/** Flink {@link InputFormat} for Iceberg. 
*/ public class FlinkInputFormat extends RichInputFormat { private static final long serialVersionUID = 1L; @@ -53,14 +50,19 @@ public class FlinkInputFormat extends RichInputFormat private transient DataIterator iterator; private transient long currentReadCount = 0L; - FlinkInputFormat(TableLoader tableLoader, Schema tableSchema, FileIO io, EncryptionManager encryption, - ScanContext context) { + FlinkInputFormat( + TableLoader tableLoader, + Schema tableSchema, + FileIO io, + EncryptionManager encryption, + ScanContext context) { this.tableLoader = tableLoader; this.io = io; this.encryption = encryption; this.context = context; - this.rowDataReader = new RowDataFileScanTaskReader(tableSchema, - context.project(), context.nameMapping(), context.caseSensitive()); + this.rowDataReader = + new RowDataFileScanTaskReader( + tableSchema, context.project(), context.nameMapping(), context.caseSensitive()); } @VisibleForTesting @@ -78,7 +80,8 @@ public BaseStatistics getStatistics(BaseStatistics cachedStatistics) { public FlinkInputSplit[] createInputSplits(int minNumSplits) throws IOException { // Called in Job manager, so it is OK to load table from catalog. tableLoader.open(); - final ExecutorService workerPool = ThreadPools.newWorkerPool("iceberg-plan-worker-pool", context.planParallelism()); + final ExecutorService workerPool = + ThreadPools.newWorkerPool("iceberg-plan-worker-pool", context.planParallelism()); try (TableLoader loader = tableLoader) { Table table = loader.loadTable(); return FlinkSplitPlanner.planInputSplits(table, context, workerPool); @@ -89,14 +92,13 @@ public FlinkInputSplit[] createInputSplits(int minNumSplits) throws IOException @Override public InputSplitAssigner getInputSplitAssigner(FlinkInputSplit[] inputSplits) { - return context.exposeLocality() ? - new LocatableInputSplitAssigner(inputSplits) : - new DefaultInputSplitAssigner(inputSplits); + return context.exposeLocality() + ? new LocatableInputSplitAssigner(inputSplits) + : new DefaultInputSplitAssigner(inputSplits); } @Override - public void configure(Configuration parameters) { - } + public void configure(Configuration parameters) {} @Override public void open(FlinkInputSplit split) { diff --git a/flink/v1.14/flink/src/main/java/org/apache/iceberg/flink/source/FlinkInputSplit.java b/flink/v1.14/flink/src/main/java/org/apache/iceberg/flink/source/FlinkInputSplit.java index 5bb85fe7162a..16fd4f39596c 100644 --- a/flink/v1.14/flink/src/main/java/org/apache/iceberg/flink/source/FlinkInputSplit.java +++ b/flink/v1.14/flink/src/main/java/org/apache/iceberg/flink/source/FlinkInputSplit.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.flink.source; import java.util.Arrays; diff --git a/flink/v1.14/flink/src/main/java/org/apache/iceberg/flink/source/FlinkSource.java b/flink/v1.14/flink/src/main/java/org/apache/iceberg/flink/source/FlinkSource.java index f6cf902b1729..7ac6c5162483 100644 --- a/flink/v1.14/flink/src/main/java/org/apache/iceberg/flink/source/FlinkSource.java +++ b/flink/v1.14/flink/src/main/java/org/apache/iceberg/flink/source/FlinkSource.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.flink.source; import java.io.IOException; @@ -53,20 +52,21 @@ public class FlinkSource { private static final Logger LOG = LoggerFactory.getLogger(FlinkSource.class); - private FlinkSource() { - } + private FlinkSource() {} /** - * Initialize a {@link Builder} to read the data from iceberg table. Equivalent to {@link TableScan}. See more options - * in {@link ScanContext}. - *

- * The Source can be read static data in bounded mode. It can also continuously check the arrival of new data and read
- * records incrementally.
+ * Initialize a {@link Builder} to read the data from iceberg table. Equivalent to {@link
+ * TableScan}. See more options in {@link ScanContext}.
+ *
+ * <p>The Source can be read static data in bounded mode. It can also continuously check the
+ * arrival of new data and read records incrementally.
+ *
* <ul>
- *   <li>Without startSnapshotId: Bounded</li>
- *   <li>With startSnapshotId and with endSnapshotId: Bounded</li>
- *   <li>With startSnapshotId (-1 means unbounded preceding) and Without endSnapshotId: Unbounded</li>
+ *   <li>Without startSnapshotId: Bounded
+ *   <li>With startSnapshotId and with endSnapshotId: Bounded
+ *   <li>With startSnapshotId (-1 means unbounded preceding) and Without endSnapshotId: Unbounded
* </ul>
+ *
* <p>

* * @return {@link Builder} to connect the iceberg table. @@ -75,9 +75,7 @@ public static Builder forRowData() { return new Builder(); } - /** - * Source builder to build {@link DataStream}. - */ + /** Source builder to build {@link DataStream}. */ public static class Builder { private static final Set FILE_SYSTEM_SUPPORT_LOCALITY = ImmutableSet.of("hdfs"); @@ -223,9 +221,11 @@ public FlinkInputFormat buildFormat() { contextBuilder.project(FlinkSchemaUtil.convert(icebergSchema, projectedSchema)); } contextBuilder.exposeLocality(localityEnabled()); - contextBuilder.planParallelism(readableConfig.get(FlinkConfigOptions.TABLE_EXEC_ICEBERG_WORKER_POOL_SIZE)); + contextBuilder.planParallelism( + readableConfig.get(FlinkConfigOptions.TABLE_EXEC_ICEBERG_WORKER_POOL_SIZE)); - return new FlinkInputFormat(tableLoader, icebergSchema, io, encryption, contextBuilder.build()); + return new FlinkInputFormat( + tableLoader, icebergSchema, io, encryption, contextBuilder.build()); } public DataStream build() { @@ -233,7 +233,8 @@ public DataStream build() { FlinkInputFormat format = buildFormat(); ScanContext context = contextBuilder.build(); - TypeInformation typeInfo = FlinkCompatibilityUtil.toTypeInfo(FlinkSchemaUtil.convert(context.project())); + TypeInformation typeInfo = + FlinkCompatibilityUtil.toTypeInfo(FlinkSchemaUtil.convert(context.project())); if (!context.isStreaming()) { int parallelism = inferParallelism(format, context); @@ -253,26 +254,30 @@ public DataStream build() { } int inferParallelism(FlinkInputFormat format, ScanContext context) { - int parallelism = readableConfig.get(ExecutionConfigOptions.TABLE_EXEC_RESOURCE_DEFAULT_PARALLELISM); + int parallelism = + readableConfig.get(ExecutionConfigOptions.TABLE_EXEC_RESOURCE_DEFAULT_PARALLELISM); if (readableConfig.get(FlinkConfigOptions.TABLE_EXEC_ICEBERG_INFER_SOURCE_PARALLELISM)) { - int maxInferParallelism = readableConfig.get(FlinkConfigOptions - .TABLE_EXEC_ICEBERG_INFER_SOURCE_PARALLELISM_MAX); + int maxInferParallelism = + readableConfig.get(FlinkConfigOptions.TABLE_EXEC_ICEBERG_INFER_SOURCE_PARALLELISM_MAX); Preconditions.checkState( maxInferParallelism >= 1, - FlinkConfigOptions.TABLE_EXEC_ICEBERG_INFER_SOURCE_PARALLELISM_MAX.key() + " cannot be less than 1"); + FlinkConfigOptions.TABLE_EXEC_ICEBERG_INFER_SOURCE_PARALLELISM_MAX.key() + + " cannot be less than 1"); int splitNum; try { FlinkInputSplit[] splits = format.createInputSplits(0); splitNum = splits.length; } catch (IOException e) { - throw new UncheckedIOException("Failed to create iceberg input splits for table: " + table, e); + throw new UncheckedIOException( + "Failed to create iceberg input splits for table: " + table, e); } parallelism = Math.min(splitNum, maxInferParallelism); } if (context.limit() > 0) { - int limit = context.limit() >= Integer.MAX_VALUE ? Integer.MAX_VALUE : (int) context.limit(); + int limit = + context.limit() >= Integer.MAX_VALUE ? Integer.MAX_VALUE : (int) context.limit(); parallelism = Math.min(parallelism, limit); } @@ -283,8 +288,10 @@ int inferParallelism(FlinkInputFormat format, ScanContext context) { private boolean localityEnabled() { Boolean localityEnabled = - this.exposeLocality != null ? this.exposeLocality : - readableConfig.get(FlinkConfigOptions.TABLE_EXEC_ICEBERG_EXPOSE_SPLIT_LOCALITY_INFO); + this.exposeLocality != null + ? 
this.exposeLocality + : readableConfig.get( + FlinkConfigOptions.TABLE_EXEC_ICEBERG_EXPOSE_SPLIT_LOCALITY_INFO); if (localityEnabled != null && !localityEnabled) { return false; @@ -294,10 +301,14 @@ private boolean localityEnabled() { if (fileIO instanceof HadoopFileIO) { HadoopFileIO hadoopFileIO = (HadoopFileIO) fileIO; try { - String scheme = new Path(table.location()).getFileSystem(hadoopFileIO.getConf()).getScheme(); + String scheme = + new Path(table.location()).getFileSystem(hadoopFileIO.getConf()).getScheme(); return FILE_SYSTEM_SUPPORT_LOCALITY.contains(scheme); } catch (IOException e) { - LOG.warn("Failed to determine whether the locality information can be exposed for table: {}", table, e); + LOG.warn( + "Failed to determine whether the locality information can be exposed for table: {}", + table, + e); } } diff --git a/flink/v1.14/flink/src/main/java/org/apache/iceberg/flink/source/FlinkSplitPlanner.java b/flink/v1.14/flink/src/main/java/org/apache/iceberg/flink/source/FlinkSplitPlanner.java index d1628fbca794..4746625310b1 100644 --- a/flink/v1.14/flink/src/main/java/org/apache/iceberg/flink/source/FlinkSplitPlanner.java +++ b/flink/v1.14/flink/src/main/java/org/apache/iceberg/flink/source/FlinkSplitPlanner.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.flink.source; import java.io.IOException; @@ -40,11 +39,12 @@ @Internal public class FlinkSplitPlanner { - private FlinkSplitPlanner() { - } + private FlinkSplitPlanner() {} - static FlinkInputSplit[] planInputSplits(Table table, ScanContext context, ExecutorService workerPool) { - try (CloseableIterable tasksIterable = planTasks(table, context, workerPool)) { + static FlinkInputSplit[] planInputSplits( + Table table, ScanContext context, ExecutorService workerPool) { + try (CloseableIterable tasksIterable = + planTasks(table, context, workerPool)) { List tasks = Lists.newArrayList(tasksIterable); FlinkInputSplit[] splits = new FlinkInputSplit[tasks.size()]; boolean exposeLocality = context.exposeLocality(); @@ -52,33 +52,35 @@ static FlinkInputSplit[] planInputSplits(Table table, ScanContext context, Execu Tasks.range(tasks.size()) .stopOnFailure() .executeWith(exposeLocality ? 
workerPool : null) - .run(index -> { - CombinedScanTask task = tasks.get(index); - String[] hostnames = null; - if (exposeLocality) { - hostnames = Util.blockLocations(table.io(), task); - } - splits[index] = new FlinkInputSplit(index, task, hostnames); - }); + .run( + index -> { + CombinedScanTask task = tasks.get(index); + String[] hostnames = null; + if (exposeLocality) { + hostnames = Util.blockLocations(table.io(), task); + } + splits[index] = new FlinkInputSplit(index, task, hostnames); + }); return splits; } catch (IOException e) { throw new UncheckedIOException("Failed to process tasks iterable", e); } } - /** - * This returns splits for the FLIP-27 source - */ + /** This returns splits for the FLIP-27 source */ public static List planIcebergSourceSplits( Table table, ScanContext context, ExecutorService workerPool) { - try (CloseableIterable tasksIterable = planTasks(table, context, workerPool)) { - return Lists.newArrayList(CloseableIterable.transform(tasksIterable, IcebergSourceSplit::fromCombinedScanTask)); + try (CloseableIterable tasksIterable = + planTasks(table, context, workerPool)) { + return Lists.newArrayList( + CloseableIterable.transform(tasksIterable, IcebergSourceSplit::fromCombinedScanTask)); } catch (IOException e) { throw new UncheckedIOException("Failed to process task iterable: ", e); } } - static CloseableIterable planTasks(Table table, ScanContext context, ExecutorService workerPool) { + static CloseableIterable planTasks( + Table table, ScanContext context, ExecutorService workerPool) { ScanMode scanMode = checkScanMode(context); if (scanMode == ScanMode.INCREMENTAL_APPEND_SCAN) { IncrementalAppendScan scan = table.newIncrementalAppendScan(); @@ -115,22 +117,20 @@ private enum ScanMode { } private static ScanMode checkScanMode(ScanContext context) { - if (context.isStreaming() || context.startSnapshotId() != null || context.endSnapshotId() != null) { + if (context.isStreaming() + || context.startSnapshotId() != null + || context.endSnapshotId() != null) { return ScanMode.INCREMENTAL_APPEND_SCAN; } else { return ScanMode.BATCH; } } - /** - * refine scan with common configs - */ + /** refine scan with common configs */ private static > T refineScanWithBaseConfigs( T scan, ScanContext context, ExecutorService workerPool) { - T refinedScan = scan - .caseSensitive(context.caseSensitive()) - .project(context.project()) - .planWith(workerPool); + T refinedScan = + scan.caseSensitive(context.caseSensitive()).project(context.project()).planWith(workerPool); if (context.includeColumnStats()) { refinedScan = refinedScan.includeColumnStats(); @@ -141,11 +141,14 @@ private static > T refineScanW } if (context.splitLookback() != null) { - refinedScan = refinedScan.option(TableProperties.SPLIT_LOOKBACK, context.splitLookback().toString()); + refinedScan = + refinedScan.option(TableProperties.SPLIT_LOOKBACK, context.splitLookback().toString()); } if (context.splitOpenFileCost() != null) { - refinedScan = refinedScan.option(TableProperties.SPLIT_OPEN_FILE_COST, context.splitOpenFileCost().toString()); + refinedScan = + refinedScan.option( + TableProperties.SPLIT_OPEN_FILE_COST, context.splitOpenFileCost().toString()); } if (context.filters() != null) { diff --git a/flink/v1.14/flink/src/main/java/org/apache/iceberg/flink/source/IcebergSource.java b/flink/v1.14/flink/src/main/java/org/apache/iceberg/flink/source/IcebergSource.java index 9a3e98d2fee5..85a9cd2b78fd 100644 --- a/flink/v1.14/flink/src/main/java/org/apache/iceberg/flink/source/IcebergSource.java +++ 
b/flink/v1.14/flink/src/main/java/org/apache/iceberg/flink/source/IcebergSource.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.flink.source; import java.io.IOException; @@ -95,8 +94,7 @@ public Boundedness getBoundedness() { @Override public SourceReader createReader(SourceReaderContext readerContext) { - ReaderMetricsContext readerMetrics = - new ReaderMetricsContext(readerContext.metricGroup()); + ReaderMetricsContext readerMetrics = new ReaderMetricsContext(readerContext.metricGroup()); return new IcebergSourceReader<>(readerFunction, readerContext, readerMetrics); } @@ -130,20 +128,28 @@ private SplitEnumerator createEnumer if (enumState == null) { assigner = assignerFactory.createAssigner(); } else { - LOG.info("Iceberg source restored {} splits from state for table {}", - enumState.pendingSplits().size(), table.name()); + LOG.info( + "Iceberg source restored {} splits from state for table {}", + enumState.pendingSplits().size(), + table.name()); assigner = assignerFactory.createAssigner(enumState.pendingSplits()); } if (scanContext.isStreaming()) { - // Ideally, operatorId should be used as the threadPoolName as Flink guarantees its uniqueness within a job. - // SplitEnumeratorContext doesn't expose the OperatorCoordinator.Context, which would contain the OperatorID. - // Need to discuss with Flink community whether it is ok to expose a public API like the protected method - // "OperatorCoordinator.Context getCoordinatorContext()" from SourceCoordinatorContext implementation. + // Ideally, operatorId should be used as the threadPoolName as Flink guarantees its uniqueness + // within a job. + // SplitEnumeratorContext doesn't expose the OperatorCoordinator.Context, which would contain + // the OperatorID. + // Need to discuss with Flink community whether it is ok to expose a public API like the + // protected method + // "OperatorCoordinator.Context getCoordinatorContext()" from SourceCoordinatorContext + // implementation. // For now, - is used as the unique thread pool name. 
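For comparison with this FLIP-27 IcebergSource, the older FlinkSource builder from the earlier hunks can be used for an unbounded incremental read roughly as below. The table path and snapshot id are illustrative, and the builder method names are assumptions based on this module's API:

import org.apache.flink.streaming.api.datastream.DataStream;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.table.data.RowData;
import org.apache.iceberg.flink.TableLoader;
import org.apache.iceberg.flink.source.FlinkSource;

public class IncrementalReadSketch {
  public static void main(String[] args) throws Exception {
    StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();

    // Streaming mode with a startSnapshotId and no endSnapshotId: an unbounded read that
    // keeps polling the table for newly committed snapshots.
    DataStream<RowData> stream =
        FlinkSource.forRowData()
            .env(env)
            .tableLoader(TableLoader.fromHadoopTable("hdfs://nn:8020/warehouse/db/events")) // hypothetical
            .streaming(true)
            .startSnapshotId(1234567890L) // illustrative snapshot id
            .build();

    stream.print();
    env.execute("iceberg incremental read sketch");
  }
}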
- ContinuousSplitPlanner splitPlanner = new ContinuousSplitPlannerImpl( - table, scanContext, table.name() + "-" + UUID.randomUUID()); - return new ContinuousIcebergEnumerator(enumContext, assigner, scanContext, splitPlanner, enumState); + ContinuousSplitPlanner splitPlanner = + new ContinuousSplitPlannerImpl( + table, scanContext, table.name() + "-" + UUID.randomUUID()); + return new ContinuousIcebergEnumerator( + enumContext, assigner, scanContext, splitPlanner, enumState); } else { return new StaticIcebergEnumerator(enumContext, assigner, table, scanContext, enumState); } @@ -168,8 +174,7 @@ public static class Builder { // optional private final ScanContext.Builder contextBuilder = ScanContext.builder(); - Builder() { - } + Builder() {} public Builder tableLoader(TableLoader loader) { this.tableLoader = loader; @@ -292,8 +297,15 @@ public IcebergSource build() { try (TableLoader loader = tableLoader) { loader.open(); Table table = tableLoader.loadTable(); - RowDataReaderFunction rowDataReaderFunction = new RowDataReaderFunction(flinkConfig, table.schema(), - context.project(), context.nameMapping(), context.caseSensitive(), table.io(), table.encryption()); + RowDataReaderFunction rowDataReaderFunction = + new RowDataReaderFunction( + flinkConfig, + table.schema(), + context.project(), + context.nameMapping(), + context.caseSensitive(), + table.io(), + table.encryption()); this.readerFunction = (ReaderFunction) rowDataReaderFunction; } catch (IOException e) { throw new UncheckedIOException(e); diff --git a/flink/v1.14/flink/src/main/java/org/apache/iceberg/flink/source/RowDataFileScanTaskReader.java b/flink/v1.14/flink/src/main/java/org/apache/iceberg/flink/source/RowDataFileScanTaskReader.java index b71f2b0fafe5..5fada27d5471 100644 --- a/flink/v1.14/flink/src/main/java/org/apache/iceberg/flink/source/RowDataFileScanTaskReader.java +++ b/flink/v1.14/flink/src/main/java/org/apache/iceberg/flink/source/RowDataFileScanTaskReader.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.flink.source; import java.util.Map; @@ -56,8 +55,8 @@ public class RowDataFileScanTaskReader implements FileScanTaskReader { private final String nameMapping; private final boolean caseSensitive; - public RowDataFileScanTaskReader(Schema tableSchema, Schema projectedSchema, - String nameMapping, boolean caseSensitive) { + public RowDataFileScanTaskReader( + Schema tableSchema, Schema projectedSchema, String nameMapping, boolean caseSensitive) { this.tableSchema = tableSchema; this.projectedSchema = projectedSchema; this.nameMapping = nameMapping; @@ -65,21 +64,28 @@ public RowDataFileScanTaskReader(Schema tableSchema, Schema projectedSchema, } @Override - public CloseableIterator open(FileScanTask task, InputFilesDecryptor inputFilesDecryptor) { + public CloseableIterator open( + FileScanTask task, InputFilesDecryptor inputFilesDecryptor) { Schema partitionSchema = TypeUtil.select(projectedSchema, task.spec().identitySourceIds()); - Map idToConstant = partitionSchema.columns().isEmpty() ? ImmutableMap.of() : - PartitionUtil.constantsMap(task, RowDataUtil::convertConstant); + Map idToConstant = + partitionSchema.columns().isEmpty() + ? 
ImmutableMap.of() + : PartitionUtil.constantsMap(task, RowDataUtil::convertConstant); - FlinkDeleteFilter deletes = new FlinkDeleteFilter(task, tableSchema, projectedSchema, inputFilesDecryptor); - CloseableIterable iterable = deletes.filter( - newIterable(task, deletes.requiredSchema(), idToConstant, inputFilesDecryptor) - ); + FlinkDeleteFilter deletes = + new FlinkDeleteFilter(task, tableSchema, projectedSchema, inputFilesDecryptor); + CloseableIterable iterable = + deletes.filter( + newIterable(task, deletes.requiredSchema(), idToConstant, inputFilesDecryptor)); // Project the RowData to remove the extra meta columns. if (!projectedSchema.sameSchema(deletes.requiredSchema())) { - RowDataProjection rowDataProjection = RowDataProjection.create( - deletes.requiredRowType(), deletes.requiredSchema().asStruct(), projectedSchema.asStruct()); + RowDataProjection rowDataProjection = + RowDataProjection.create( + deletes.requiredRowType(), + deletes.requiredSchema().asStruct(), + projectedSchema.asStruct()); iterable = CloseableIterable.transform(iterable, rowDataProjection::wrap); } @@ -87,7 +93,10 @@ public CloseableIterator open(FileScanTask task, InputFilesDecryptor in } private CloseableIterable newIterable( - FileScanTask task, Schema schema, Map idToConstant, InputFilesDecryptor inputFilesDecryptor) { + FileScanTask task, + Schema schema, + Map idToConstant, + InputFilesDecryptor inputFilesDecryptor) { CloseableIterable iter; if (task.isDataTask()) { throw new UnsupportedOperationException("Cannot read data task."); @@ -115,12 +124,16 @@ private CloseableIterable newIterable( } private CloseableIterable newAvroIterable( - FileScanTask task, Schema schema, Map idToConstant, InputFilesDecryptor inputFilesDecryptor) { - Avro.ReadBuilder builder = Avro.read(inputFilesDecryptor.getInputFile(task)) - .reuseContainers() - .project(schema) - .split(task.start(), task.length()) - .createReaderFunc(readSchema -> new FlinkAvroReader(schema, readSchema, idToConstant)); + FileScanTask task, + Schema schema, + Map idToConstant, + InputFilesDecryptor inputFilesDecryptor) { + Avro.ReadBuilder builder = + Avro.read(inputFilesDecryptor.getInputFile(task)) + .reuseContainers() + .project(schema) + .split(task.start(), task.length()) + .createReaderFunc(readSchema -> new FlinkAvroReader(schema, readSchema, idToConstant)); if (nameMapping != null) { builder.withNameMapping(NameMappingParser.fromJson(nameMapping)); @@ -130,14 +143,19 @@ private CloseableIterable newAvroIterable( } private CloseableIterable newParquetIterable( - FileScanTask task, Schema schema, Map idToConstant, InputFilesDecryptor inputFilesDecryptor) { - Parquet.ReadBuilder builder = Parquet.read(inputFilesDecryptor.getInputFile(task)) - .split(task.start(), task.length()) - .project(schema) - .createReaderFunc(fileSchema -> FlinkParquetReaders.buildReader(schema, fileSchema, idToConstant)) - .filter(task.residual()) - .caseSensitive(caseSensitive) - .reuseContainers(); + FileScanTask task, + Schema schema, + Map idToConstant, + InputFilesDecryptor inputFilesDecryptor) { + Parquet.ReadBuilder builder = + Parquet.read(inputFilesDecryptor.getInputFile(task)) + .split(task.start(), task.length()) + .project(schema) + .createReaderFunc( + fileSchema -> FlinkParquetReaders.buildReader(schema, fileSchema, idToConstant)) + .filter(task.residual()) + .caseSensitive(caseSensitive) + .reuseContainers(); if (nameMapping != null) { builder.withNameMapping(NameMappingParser.fromJson(nameMapping)); @@ -147,16 +165,22 @@ private CloseableIterable 
newParquetIterable( } private CloseableIterable newOrcIterable( - FileScanTask task, Schema schema, Map idToConstant, InputFilesDecryptor inputFilesDecryptor) { - Schema readSchemaWithoutConstantAndMetadataFields = TypeUtil.selectNot(schema, - Sets.union(idToConstant.keySet(), MetadataColumns.metadataFieldIds())); - - ORC.ReadBuilder builder = ORC.read(inputFilesDecryptor.getInputFile(task)) - .project(readSchemaWithoutConstantAndMetadataFields) - .split(task.start(), task.length()) - .createReaderFunc(readOrcSchema -> new FlinkOrcReader(schema, readOrcSchema, idToConstant)) - .filter(task.residual()) - .caseSensitive(caseSensitive); + FileScanTask task, + Schema schema, + Map idToConstant, + InputFilesDecryptor inputFilesDecryptor) { + Schema readSchemaWithoutConstantAndMetadataFields = + TypeUtil.selectNot( + schema, Sets.union(idToConstant.keySet(), MetadataColumns.metadataFieldIds())); + + ORC.ReadBuilder builder = + ORC.read(inputFilesDecryptor.getInputFile(task)) + .project(readSchemaWithoutConstantAndMetadataFields) + .split(task.start(), task.length()) + .createReaderFunc( + readOrcSchema -> new FlinkOrcReader(schema, readOrcSchema, idToConstant)) + .filter(task.residual()) + .caseSensitive(caseSensitive); if (nameMapping != null) { builder.withNameMapping(NameMappingParser.fromJson(nameMapping)); @@ -170,8 +194,11 @@ private static class FlinkDeleteFilter extends DeleteFilter { private final RowDataWrapper asStructLike; private final InputFilesDecryptor inputFilesDecryptor; - FlinkDeleteFilter(FileScanTask task, Schema tableSchema, Schema requestedSchema, - InputFilesDecryptor inputFilesDecryptor) { + FlinkDeleteFilter( + FileScanTask task, + Schema tableSchema, + Schema requestedSchema, + InputFilesDecryptor inputFilesDecryptor) { super(task.file().path().toString(), task.deletes(), tableSchema, requestedSchema); this.requiredRowType = FlinkSchemaUtil.convert(requiredSchema()); this.asStructLike = new RowDataWrapper(requiredRowType, requiredSchema().asStruct()); diff --git a/flink/v1.14/flink/src/main/java/org/apache/iceberg/flink/source/RowDataRewriter.java b/flink/v1.14/flink/src/main/java/org/apache/iceberg/flink/source/RowDataRewriter.java index 5e8837c5d47b..a721755c276f 100644 --- a/flink/v1.14/flink/src/main/java/org/apache/iceberg/flink/source/RowDataRewriter.java +++ b/flink/v1.14/flink/src/main/java/org/apache/iceberg/flink/source/RowDataRewriter.java @@ -16,9 +16,10 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.flink.source; +import static org.apache.iceberg.TableProperties.DEFAULT_NAME_MAPPING; + import java.util.Collection; import java.util.List; import java.util.Locale; @@ -46,8 +47,6 @@ import org.slf4j.Logger; import org.slf4j.LoggerFactory; -import static org.apache.iceberg.TableProperties.DEFAULT_NAME_MAPPING; - public class RowDataRewriter { private static final Logger LOG = LoggerFactory.getLogger(RowDataRewriter.class); @@ -60,31 +59,36 @@ public class RowDataRewriter { private final TaskWriterFactory taskWriterFactory; private final String tableName; - public RowDataRewriter(Table table, boolean caseSensitive, FileIO io, EncryptionManager encryptionManager) { + public RowDataRewriter( + Table table, boolean caseSensitive, FileIO io, EncryptionManager encryptionManager) { this.schema = table.schema(); this.caseSensitive = caseSensitive; this.io = io; this.encryptionManager = encryptionManager; - this.nameMapping = PropertyUtil.propertyAsString(table.properties(), DEFAULT_NAME_MAPPING, null); + this.nameMapping = + PropertyUtil.propertyAsString(table.properties(), DEFAULT_NAME_MAPPING, null); this.tableName = table.name(); - String formatString = PropertyUtil.propertyAsString(table.properties(), TableProperties.DEFAULT_FILE_FORMAT, - TableProperties.DEFAULT_FILE_FORMAT_DEFAULT); + String formatString = + PropertyUtil.propertyAsString( + table.properties(), + TableProperties.DEFAULT_FILE_FORMAT, + TableProperties.DEFAULT_FILE_FORMAT_DEFAULT); FileFormat format = FileFormat.valueOf(formatString.toUpperCase(Locale.ENGLISH)); RowType flinkSchema = FlinkSchemaUtil.convert(table.schema()); - this.taskWriterFactory = new RowDataTaskWriterFactory( - SerializableTable.copyOf(table), - flinkSchema, - Long.MAX_VALUE, - format, - null, - false); + this.taskWriterFactory = + new RowDataTaskWriterFactory( + SerializableTable.copyOf(table), flinkSchema, Long.MAX_VALUE, format, null, false); } - public List rewriteDataForTasks(DataStream dataStream, int parallelism) throws Exception { - RewriteMap map = new RewriteMap(schema, nameMapping, io, caseSensitive, encryptionManager, taskWriterFactory); + public List rewriteDataForTasks( + DataStream dataStream, int parallelism) throws Exception { + RewriteMap map = + new RewriteMap( + schema, nameMapping, io, caseSensitive, encryptionManager, taskWriterFactory); DataStream> ds = dataStream.map(map).setParallelism(parallelism); - return Lists.newArrayList(ds.executeAndCollect("Rewrite table :" + tableName)).stream().flatMap(Collection::stream) + return Lists.newArrayList(ds.executeAndCollect("Rewrite table :" + tableName)).stream() + .flatMap(Collection::stream) .collect(Collectors.toList()); } @@ -102,15 +106,21 @@ public static class RewriteMap extends RichMapFunction taskWriterFactory; private final RowDataFileScanTaskReader rowDataReader; - public RewriteMap(Schema schema, String nameMapping, FileIO io, boolean caseSensitive, - EncryptionManager encryptionManager, TaskWriterFactory taskWriterFactory) { + public RewriteMap( + Schema schema, + String nameMapping, + FileIO io, + boolean caseSensitive, + EncryptionManager encryptionManager, + TaskWriterFactory taskWriterFactory) { this.schema = schema; this.nameMapping = nameMapping; this.io = io; this.caseSensitive = caseSensitive; this.encryptionManager = encryptionManager; this.taskWriterFactory = taskWriterFactory; - this.rowDataReader = new RowDataFileScanTaskReader(schema, schema, nameMapping, caseSensitive); + this.rowDataReader = + new RowDataFileScanTaskReader(schema, 
schema, nameMapping, caseSensitive); } @Override @@ -126,7 +136,7 @@ public List map(CombinedScanTask task) throws Exception { // Initialize the task writer. this.writer = taskWriterFactory.create(); try (DataIterator iterator = - new DataIterator<>(rowDataReader, task, io, encryptionManager)) { + new DataIterator<>(rowDataReader, task, io, encryptionManager)) { while (iterator.hasNext()) { RowData rowData = iterator.next(); writer.write(rowData); diff --git a/flink/v1.14/flink/src/main/java/org/apache/iceberg/flink/source/ScanContext.java b/flink/v1.14/flink/src/main/java/org/apache/iceberg/flink/source/ScanContext.java index 8746809d9a1c..f57f1f5fdf0b 100644 --- a/flink/v1.14/flink/src/main/java/org/apache/iceberg/flink/source/ScanContext.java +++ b/flink/v1.14/flink/src/main/java/org/apache/iceberg/flink/source/ScanContext.java @@ -16,9 +16,10 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.flink.source; +import static org.apache.iceberg.TableProperties.DEFAULT_NAME_MAPPING; + import java.io.Serializable; import java.time.Duration; import java.util.List; @@ -32,11 +33,7 @@ import org.apache.iceberg.expressions.Expression; import org.apache.iceberg.flink.FlinkConfigOptions; -import static org.apache.iceberg.TableProperties.DEFAULT_NAME_MAPPING; - -/** - * Context object with optional arguments for a Flink Scan. - */ +/** Context object with optional arguments for a Flink Scan. */ @Internal public class ScanContext implements Serializable { @@ -52,7 +49,8 @@ public class ScanContext implements Serializable { ConfigOptions.key("as-of-timestamp").longType().defaultValue(null); private static final ConfigOption STARTING_STRATEGY = - ConfigOptions.key("starting-strategy").enumType(StreamingStartingStrategy.class) + ConfigOptions.key("starting-strategy") + .enumType(StreamingStartingStrategy.class) .defaultValue(StreamingStartingStrategy.INCREMENTAL_FROM_LATEST_SNAPSHOT); private static final ConfigOption START_SNAPSHOT_TIMESTAMP = @@ -107,12 +105,27 @@ public class ScanContext implements Serializable { private final boolean includeColumnStats; private final Integer planParallelism; - private ScanContext(boolean caseSensitive, Long snapshotId, StreamingStartingStrategy startingStrategy, - Long startSnapshotTimestamp, Long startSnapshotId, Long endSnapshotId, Long asOfTimestamp, - Long splitSize, Integer splitLookback, Long splitOpenFileCost, boolean isStreaming, - Duration monitorInterval, String nameMapping, Schema schema, List filters, - long limit, boolean includeColumnStats, boolean exposeLocality, Integer planParallelism, - int maxPlanningSnapshotCount) { + private ScanContext( + boolean caseSensitive, + Long snapshotId, + StreamingStartingStrategy startingStrategy, + Long startSnapshotTimestamp, + Long startSnapshotId, + Long endSnapshotId, + Long asOfTimestamp, + Long splitSize, + Integer splitLookback, + Long splitOpenFileCost, + boolean isStreaming, + Duration monitorInterval, + String nameMapping, + Schema schema, + List filters, + long limit, + boolean includeColumnStats, + boolean exposeLocality, + Integer planParallelism, + int maxPlanningSnapshotCount) { this.caseSensitive = caseSensitive; this.snapshotId = snapshotId; this.startingStrategy = startingStrategy; @@ -141,15 +154,19 @@ private ScanContext(boolean caseSensitive, Long snapshotId, StreamingStartingStr private void validate() { if (isStreaming) { if (startingStrategy == StreamingStartingStrategy.INCREMENTAL_FROM_SNAPSHOT_ID) { - 
Preconditions.checkArgument(startSnapshotId != null, + Preconditions.checkArgument( + startSnapshotId != null, "Invalid starting snapshot id for SPECIFIC_START_SNAPSHOT_ID strategy: null"); - Preconditions.checkArgument(startSnapshotTimestamp == null, + Preconditions.checkArgument( + startSnapshotTimestamp == null, "Invalid starting snapshot timestamp for SPECIFIC_START_SNAPSHOT_ID strategy: not null"); } if (startingStrategy == StreamingStartingStrategy.INCREMENTAL_FROM_SNAPSHOT_TIMESTAMP) { - Preconditions.checkArgument(startSnapshotTimestamp != null, + Preconditions.checkArgument( + startSnapshotTimestamp != null, "Invalid starting snapshot timestamp for SPECIFIC_START_SNAPSHOT_TIMESTAMP strategy: null"); - Preconditions.checkArgument(startSnapshotId == null, + Preconditions.checkArgument( + startSnapshotId == null, "Invalid starting snapshot id for SPECIFIC_START_SNAPSHOT_ID strategy: not null"); } } @@ -304,11 +321,11 @@ public static class Builder { private long limit = -1L; private boolean includeColumnStats = INCLUDE_COLUMN_STATS.defaultValue(); private boolean exposeLocality; - private Integer planParallelism = FlinkConfigOptions.TABLE_EXEC_ICEBERG_WORKER_POOL_SIZE.defaultValue(); + private Integer planParallelism = + FlinkConfigOptions.TABLE_EXEC_ICEBERG_WORKER_POOL_SIZE.defaultValue(); private int maxPlanningSnapshotCount = MAX_PLANNING_SNAPSHOT_COUNT.defaultValue(); - private Builder() { - } + private Builder() {} public Builder caseSensitive(boolean newCaseSensitive) { this.caseSensitive = newCaseSensitive; @@ -432,10 +449,27 @@ public Builder fromProperties(Map properties) { } public ScanContext build() { - return new ScanContext(caseSensitive, snapshotId, startingStrategy, startSnapshotTimestamp, - startSnapshotId, endSnapshotId, asOfTimestamp, splitSize, splitLookback, - splitOpenFileCost, isStreaming, monitorInterval, nameMapping, projectedSchema, - filters, limit, includeColumnStats, exposeLocality, planParallelism, maxPlanningSnapshotCount); + return new ScanContext( + caseSensitive, + snapshotId, + startingStrategy, + startSnapshotTimestamp, + startSnapshotId, + endSnapshotId, + asOfTimestamp, + splitSize, + splitLookback, + splitOpenFileCost, + isStreaming, + monitorInterval, + nameMapping, + projectedSchema, + filters, + limit, + includeColumnStats, + exposeLocality, + planParallelism, + maxPlanningSnapshotCount); } } } diff --git a/flink/v1.14/flink/src/main/java/org/apache/iceberg/flink/source/StreamingMonitorFunction.java b/flink/v1.14/flink/src/main/java/org/apache/iceberg/flink/source/StreamingMonitorFunction.java index 06def6508938..75791c95bd4a 100644 --- a/flink/v1.14/flink/src/main/java/org/apache/iceberg/flink/source/StreamingMonitorFunction.java +++ b/flink/v1.14/flink/src/main/java/org/apache/iceberg/flink/source/StreamingMonitorFunction.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.flink.source; import java.io.IOException; @@ -45,19 +44,20 @@ import org.slf4j.LoggerFactory; /** - * This is the single (non-parallel) monitoring task which takes a {@link FlinkInputFormat}, - * it is responsible for: + * This is the single (non-parallel) monitoring task which takes a {@link FlinkInputFormat}, it is + * responsible for: * *
- *     <li>Monitoring snapshots of the Iceberg table.</li>
- *     <li>Creating the {@link FlinkInputSplit splits} corresponding to the incremental files</li>
- *     <li>Assigning them to downstream tasks for further processing.</li>
+ *   <li>Monitoring snapshots of the Iceberg table.
+ *   <li>Creating the {@link FlinkInputSplit splits} corresponding to the incremental files
+ *   <li>Assigning them to downstream tasks for further processing.
 * </ol>
 *
- * <p>The splits to be read are forwarded to the downstream {@link StreamingReaderOperator}
- * which can have parallelism greater than one.
+ * <p>
The splits to be read are forwarded to the downstream {@link StreamingReaderOperator} which + * can have parallelism greater than one. */ -public class StreamingMonitorFunction extends RichSourceFunction implements CheckpointedFunction { +public class StreamingMonitorFunction extends RichSourceFunction + implements CheckpointedFunction { private static final Logger LOG = LoggerFactory.getLogger(StreamingMonitorFunction.class); @@ -68,7 +68,8 @@ public class StreamingMonitorFunction extends RichSourceFunction 0, + Preconditions.checkArgument( + scanContext.maxPlanningSnapshotCount() > 0, "The max-planning-snapshot-count must be greater than zero"); this.tableLoader = tableLoader; this.scanContext = scanContext; @@ -96,9 +100,12 @@ public void open(Configuration parameters) throws Exception { final RuntimeContext runtimeContext = getRuntimeContext(); ValidationException.check( - runtimeContext instanceof StreamingRuntimeContext, "context should be instance of StreamingRuntimeContext"); + runtimeContext instanceof StreamingRuntimeContext, + "context should be instance of StreamingRuntimeContext"); final String operatorID = ((StreamingRuntimeContext) runtimeContext).getOperatorUniqueID(); - this.workerPool = ThreadPools.newWorkerPool("iceberg-worker-pool-" + operatorID, scanContext.planParallelism()); + this.workerPool = + ThreadPools.newWorkerPool( + "iceberg-worker-pool-" + operatorID, scanContext.planParallelism()); } @Override @@ -108,21 +115,24 @@ public void initializeState(FunctionInitializationContext context) throws Except table = tableLoader.loadTable(); // Initialize the flink state for last snapshot id. - lastSnapshotIdState = context.getOperatorStateStore().getListState( - new ListStateDescriptor<>( - "snapshot-id-state", - LongSerializer.INSTANCE)); + lastSnapshotIdState = + context + .getOperatorStateStore() + .getListState(new ListStateDescriptor<>("snapshot-id-state", LongSerializer.INSTANCE)); // Restore the last-snapshot-id from flink's state if possible. 
if (context.isRestored()) { LOG.info("Restoring state for the {}.", getClass().getSimpleName()); lastSnapshotId = lastSnapshotIdState.get().iterator().next(); } else if (scanContext.startSnapshotId() != null) { - Preconditions.checkNotNull(table.currentSnapshot(), "Don't have any available snapshot in table."); + Preconditions.checkNotNull( + table.currentSnapshot(), "Don't have any available snapshot in table."); long currentSnapshotId = table.currentSnapshot().snapshotId(); - Preconditions.checkState(SnapshotUtil.isAncestorOf(table, currentSnapshotId, scanContext.startSnapshotId()), - "The option start-snapshot-id %s is not an ancestor of the current snapshot.", scanContext.startSnapshotId()); + Preconditions.checkState( + SnapshotUtil.isAncestorOf(table, currentSnapshotId, scanContext.startSnapshotId()), + "The option start-snapshot-id %s is not an ancestor of the current snapshot.", + scanContext.startSnapshotId()); lastSnapshotId = scanContext.startSnapshotId(); } @@ -143,13 +153,15 @@ public void run(SourceContext ctx) throws Exception { } } - private long toSnapshotIdInclusive(long lastConsumedSnapshotId, long currentSnapshotId, - int maxPlanningSnapshotCount) { - List snapshotIds = SnapshotUtil.snapshotIdsBetween(table, lastConsumedSnapshotId, currentSnapshotId); + private long toSnapshotIdInclusive( + long lastConsumedSnapshotId, long currentSnapshotId, int maxPlanningSnapshotCount) { + List snapshotIds = + SnapshotUtil.snapshotIdsBetween(table, lastConsumedSnapshotId, currentSnapshotId); if (snapshotIds.size() <= maxPlanningSnapshotCount) { return currentSnapshotId; } else { - // It uses reverted index since snapshotIdsBetween returns Ids that are ordered by committed time descending. + // It uses reverted index since snapshotIdsBetween returns Ids that are ordered by committed + // time descending. 
return snapshotIds.get(snapshotIds.size() - maxPlanningSnapshotCount); } } @@ -172,14 +184,23 @@ void monitorAndForwardSplits() { if (lastSnapshotId == INIT_LAST_SNAPSHOT_ID) { newScanContext = scanContext.copyWithSnapshotId(snapshotId); } else { - snapshotId = toSnapshotIdInclusive(lastSnapshotId, snapshotId, scanContext.maxPlanningSnapshotCount()); + snapshotId = + toSnapshotIdInclusive( + lastSnapshotId, snapshotId, scanContext.maxPlanningSnapshotCount()); newScanContext = scanContext.copyWithAppendsBetween(lastSnapshotId, snapshotId); } - LOG.debug("Start discovering splits from {} (exclusive) to {} (inclusive)", lastSnapshotId, snapshotId); + LOG.debug( + "Start discovering splits from {} (exclusive) to {} (inclusive)", + lastSnapshotId, + snapshotId); long start = System.currentTimeMillis(); - FlinkInputSplit[] splits = FlinkSplitPlanner.planInputSplits(table, newScanContext, workerPool); - LOG.debug("Discovered {} splits, time elapsed {}ms", splits.length, System.currentTimeMillis() - start); + FlinkInputSplit[] splits = + FlinkSplitPlanner.planInputSplits(table, newScanContext, workerPool); + LOG.debug( + "Discovered {} splits, time elapsed {}ms", + splits.length, + System.currentTimeMillis() - start); // only need to hold the checkpoint lock when emitting the splits and updating lastSnapshotId start = System.currentTimeMillis(); @@ -190,7 +211,10 @@ void monitorAndForwardSplits() { lastSnapshotId = snapshotId; } - LOG.debug("Forwarded {} splits, time elapsed {}ms", splits.length, System.currentTimeMillis() - start); + LOG.debug( + "Forwarded {} splits, time elapsed {}ms", + splits.length, + System.currentTimeMillis() - start); } } diff --git a/flink/v1.14/flink/src/main/java/org/apache/iceberg/flink/source/StreamingReaderOperator.java b/flink/v1.14/flink/src/main/java/org/apache/iceberg/flink/source/StreamingReaderOperator.java index c8efc2b5992f..ee6f7b63988d 100644 --- a/flink/v1.14/flink/src/main/java/org/apache/iceberg/flink/source/StreamingReaderOperator.java +++ b/flink/v1.14/flink/src/main/java/org/apache/iceberg/flink/source/StreamingReaderOperator.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.flink.source; import java.io.IOException; @@ -47,20 +46,23 @@ /** * The operator that reads the {@link FlinkInputSplit splits} received from the preceding {@link - * StreamingMonitorFunction}. Contrary to the {@link StreamingMonitorFunction} which has a parallelism of 1, - * this operator can have multiple parallelism. + * StreamingMonitorFunction}. Contrary to the {@link StreamingMonitorFunction} which has a + * parallelism of 1, this operator can have multiple parallelism. * - *

<p>As soon as a split descriptor is received, it is put in a queue, and use {@link MailboxExecutor}
- * read the actual data of the split. This architecture allows the separation of the reading thread from the one split
- * processing the checkpoint barriers, thus removing any potential back-pressure.
+ * <p>
As soon as a split descriptor is received, it is put in a queue, and use {@link + * MailboxExecutor} read the actual data of the split. This architecture allows the separation of + * the reading thread from the one split processing the checkpoint barriers, thus removing any + * potential back-pressure. */ public class StreamingReaderOperator extends AbstractStreamOperator implements OneInputStreamOperator { private static final Logger LOG = LoggerFactory.getLogger(StreamingReaderOperator.class); - // It's the same thread that is running this operator and checkpoint actions. we use this executor to schedule only - // one split for future reading, so that a new checkpoint could be triggered without blocking long time for exhausting + // It's the same thread that is running this operator and checkpoint actions. we use this executor + // to schedule only + // one split for future reading, so that a new checkpoint could be triggered without blocking long + // time for exhausting // all scheduled splits. private final MailboxExecutor executor; private FlinkInputFormat format; @@ -70,17 +72,21 @@ public class StreamingReaderOperator extends AbstractStreamOperator private transient ListState inputSplitsState; private transient Queue splits; - // Splits are read by the same thread that calls processElement. Each read task is submitted to that thread by adding - // them to the executor. This state is used to ensure that only one read task is in that queue at a time, so that read - // tasks do not accumulate ahead of checkpoint tasks. When there is a read task in the queue, this is set to RUNNING. + // Splits are read by the same thread that calls processElement. Each read task is submitted to + // that thread by adding + // them to the executor. This state is used to ensure that only one read task is in that queue at + // a time, so that read + // tasks do not accumulate ahead of checkpoint tasks. When there is a read task in the queue, this + // is set to RUNNING. // When there are no more files to read, this will be set to IDLE. private transient SplitState currentSplitState; - private StreamingReaderOperator(FlinkInputFormat format, ProcessingTimeService timeService, - MailboxExecutor mailboxExecutor) { + private StreamingReaderOperator( + FlinkInputFormat format, ProcessingTimeService timeService, MailboxExecutor mailboxExecutor) { this.format = Preconditions.checkNotNull(format, "The InputFormat should not be null."); this.processingTimeService = timeService; - this.executor = Preconditions.checkNotNull(mailboxExecutor, "The mailboxExecutor should not be null."); + this.executor = + Preconditions.checkNotNull(mailboxExecutor, "The mailboxExecutor should not be null."); } @Override @@ -89,8 +95,10 @@ public void initializeState(StateInitializationContext context) throws Exception // TODO Replace Java serialization with Avro approach to keep state compatibility. // See issue: https://github.com/apache/iceberg/issues/1698 - inputSplitsState = context.getOperatorStateStore().getListState( - new ListStateDescriptor<>("splits", new JavaSerializer<>())); + inputSplitsState = + context + .getOperatorStateStore() + .getListState(new ListStateDescriptor<>("splits", new JavaSerializer<>())); // Initialize the current split state to IDLE. 
currentSplitState = SplitState.IDLE; @@ -106,14 +114,15 @@ public void initializeState(StateInitializationContext context) throws Exception } } - this.sourceContext = StreamSourceContexts.getSourceContext( - getOperatorConfig().getTimeCharacteristic(), - getProcessingTimeService(), - new Object(), // no actual locking needed - output, - getRuntimeContext().getExecutionConfig().getAutoWatermarkInterval(), - -1, - true); + this.sourceContext = + StreamSourceContexts.getSourceContext( + getOperatorConfig().getTimeCharacteristic(), + getProcessingTimeService(), + new Object(), // no actual locking needed + output, + getRuntimeContext().getExecutionConfig().getAutoWatermarkInterval(), + -1, + true); // Enqueue to process the recovered input splits. enqueueProcessSplits(); @@ -197,11 +206,13 @@ static OneInputStreamOperatorFactory factory(FlinkInpu } private enum SplitState { - IDLE, RUNNING + IDLE, + RUNNING } private static class OperatorFactory extends AbstractStreamOperatorFactory - implements YieldingOperatorFactory, OneInputStreamOperatorFactory { + implements YieldingOperatorFactory, + OneInputStreamOperatorFactory { private final FlinkInputFormat format; @@ -218,9 +229,12 @@ public void setMailboxExecutor(MailboxExecutor mailboxExecutor) { @SuppressWarnings("unchecked") @Override - public > O createStreamOperator(StreamOperatorParameters parameters) { - StreamingReaderOperator operator = new StreamingReaderOperator(format, processingTimeService, mailboxExecutor); - operator.setup(parameters.getContainingTask(), parameters.getStreamConfig(), parameters.getOutput()); + public > O createStreamOperator( + StreamOperatorParameters parameters) { + StreamingReaderOperator operator = + new StreamingReaderOperator(format, processingTimeService, mailboxExecutor); + operator.setup( + parameters.getContainingTask(), parameters.getStreamConfig(), parameters.getOutput()); return (O) operator; } diff --git a/flink/v1.14/flink/src/main/java/org/apache/iceberg/flink/source/StreamingStartingStrategy.java b/flink/v1.14/flink/src/main/java/org/apache/iceberg/flink/source/StreamingStartingStrategy.java index 3e83fbe7f5af..11707bf82a0f 100644 --- a/flink/v1.14/flink/src/main/java/org/apache/iceberg/flink/source/StreamingStartingStrategy.java +++ b/flink/v1.14/flink/src/main/java/org/apache/iceberg/flink/source/StreamingStartingStrategy.java @@ -16,43 +16,39 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.flink.source; -/** - * Starting strategy for streaming execution. - */ +/** Starting strategy for streaming execution. */ public enum StreamingStartingStrategy { /** * Do a regular table scan then switch to the incremental mode. - *

<p>
- * The incremental mode starts from the current snapshot exclusive.
+ *
+ * <p>
The incremental mode starts from the current snapshot exclusive. */ TABLE_SCAN_THEN_INCREMENTAL, /** * Start incremental mode from the latest snapshot inclusive. - *

<p>
- * If it is an empty map, all future append snapshots should be discovered.
+ *
+ * <p>
If it is an empty map, all future append snapshots should be discovered. */ INCREMENTAL_FROM_LATEST_SNAPSHOT, /** * Start incremental mode from the earliest snapshot inclusive. - *

<p>
- * If it is an empty map, all future append snapshots should be discovered.
+ *
+ * <p>
If it is an empty map, all future append snapshots should be discovered. */ INCREMENTAL_FROM_EARLIEST_SNAPSHOT, - /** - * Start incremental mode from a snapshot with a specific id inclusive. - */ + /** Start incremental mode from a snapshot with a specific id inclusive. */ INCREMENTAL_FROM_SNAPSHOT_ID, /** * Start incremental mode from a snapshot with a specific timestamp inclusive. - *

<p>
- * If the timestamp is between two snapshots, it should start from the snapshot after the timestamp.
+ *
+ * <p>
If the timestamp is between two snapshots, it should start from the snapshot after the + * timestamp. */ INCREMENTAL_FROM_SNAPSHOT_TIMESTAMP } diff --git a/flink/v1.14/flink/src/main/java/org/apache/iceberg/flink/source/assigner/GetSplitResult.java b/flink/v1.14/flink/src/main/java/org/apache/iceberg/flink/source/assigner/GetSplitResult.java index 21feab660ba7..72deaeb890f3 100644 --- a/flink/v1.14/flink/src/main/java/org/apache/iceberg/flink/source/assigner/GetSplitResult.java +++ b/flink/v1.14/flink/src/main/java/org/apache/iceberg/flink/source/assigner/GetSplitResult.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.flink.source.assigner; import org.apache.flink.annotation.Internal; @@ -27,18 +26,15 @@ public class GetSplitResult { public enum Status { - AVAILABLE, /** - * There are pending splits. But they can't be assigned - * due to constraints (like event time alignment) + * There are pending splits. But they can't be assigned due to constraints (like event time + * alignment) */ CONSTRAINED, - /** - * Assigner doesn't have pending splits. - */ + /** Assigner doesn't have pending splits. */ UNAVAILABLE } diff --git a/flink/v1.14/flink/src/main/java/org/apache/iceberg/flink/source/assigner/SimpleSplitAssigner.java b/flink/v1.14/flink/src/main/java/org/apache/iceberg/flink/source/assigner/SimpleSplitAssigner.java index ed70ad3774aa..c1da261a5555 100644 --- a/flink/v1.14/flink/src/main/java/org/apache/iceberg/flink/source/assigner/SimpleSplitAssigner.java +++ b/flink/v1.14/flink/src/main/java/org/apache/iceberg/flink/source/assigner/SimpleSplitAssigner.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.flink.source.assigner; import java.util.ArrayDeque; @@ -31,8 +30,8 @@ import org.apache.iceberg.flink.source.split.IcebergSourceSplitStatus; /** - * Since all methods are called in the source coordinator thread by enumerator, - * there is no need for locking. + * Since all methods are called in the source coordinator thread by enumerator, there is no need for + * locking. */ @Internal public class SimpleSplitAssigner implements SplitAssigner { @@ -79,9 +78,7 @@ private void addSplits(Collection splits) { } } - /** - * Simple assigner only tracks unassigned splits - */ + /** Simple assigner only tracks unassigned splits */ @Override public Collection state() { return pendingSplits.stream() diff --git a/flink/v1.14/flink/src/main/java/org/apache/iceberg/flink/source/assigner/SimpleSplitAssignerFactory.java b/flink/v1.14/flink/src/main/java/org/apache/iceberg/flink/source/assigner/SimpleSplitAssignerFactory.java index 76f3d66e9086..1c14f4fcf9b9 100644 --- a/flink/v1.14/flink/src/main/java/org/apache/iceberg/flink/source/assigner/SimpleSplitAssignerFactory.java +++ b/flink/v1.14/flink/src/main/java/org/apache/iceberg/flink/source/assigner/SimpleSplitAssignerFactory.java @@ -16,15 +16,12 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.flink.source.assigner; import java.util.Collection; import org.apache.iceberg.flink.source.split.IcebergSourceSplitState; -/** - * Create simple assigner that hands out splits without any guarantee in order or locality. - */ +/** Create simple assigner that hands out splits without any guarantee in order or locality. 
*/ public class SimpleSplitAssignerFactory implements SplitAssignerFactory { @Override diff --git a/flink/v1.14/flink/src/main/java/org/apache/iceberg/flink/source/assigner/SplitAssigner.java b/flink/v1.14/flink/src/main/java/org/apache/iceberg/flink/source/assigner/SplitAssigner.java index 78ef02cad43a..b17a554f5e65 100644 --- a/flink/v1.14/flink/src/main/java/org/apache/iceberg/flink/source/assigner/SplitAssigner.java +++ b/flink/v1.14/flink/src/main/java/org/apache/iceberg/flink/source/assigner/SplitAssigner.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.flink.source.assigner; import java.io.Closeable; @@ -28,19 +27,19 @@ import org.apache.iceberg.flink.source.split.IcebergSourceSplitState; /** - * SplitAssigner interface is extracted out as a separate component so that - * we can plug in different split assignment strategy for different requirements. E.g. + * SplitAssigner interface is extracted out as a separate component so that we can plug in different + * split assignment strategy for different requirements. E.g. + * *

<ul>
- *   <li>Simple assigner with no ordering guarantee or locality aware optimization.</li>
- *   <li>Locality aware assigner that prefer splits that are local.</li>
- *   <li>Snapshot aware assigner that assign splits based on the order they are committed.</li>
- *   <li>Event time alignment assigner that assign splits satisfying certain time ordering
- *   within a single source or across sources.</li>
+ *   <li>Simple assigner with no ordering guarantee or locality aware optimization.
+ *   <li>Locality aware assigner that prefer splits that are local.
+ *   <li>Snapshot aware assigner that assign splits based on the order they are committed.
+ *   <li>Event time alignment assigner that assign splits satisfying certain time ordering within a
+ *       single source or across sources.
 * </ul>
 *
- * <p>
- * Enumerator should call the assigner APIs from the coordinator thread.
- * This is to simplify the thread safety for assigner implementation.
+ * <p>
Enumerator should call the assigner APIs from the coordinator thread. This is to simplify the + * thread safety for assigner implementation. */ public interface SplitAssigner extends Closeable { @@ -48,64 +47,54 @@ public interface SplitAssigner extends Closeable { * Some assigners may need to start background threads or perform other activity such as * registering as listeners to updates from other event sources e.g., watermark tracker. */ - default void start() { - } + default void start() {} /** - * Some assigners may need to perform certain actions - * when their corresponding enumerators are closed + * Some assigners may need to perform certain actions when their corresponding enumerators are + * closed */ @Override - default void close() { - } + default void close() {} /** - * Request a new split from the assigner - * when enumerator trying to assign splits to awaiting readers. - *

<p>
- * If enumerator wasn't able to assign the split (e.g., reader disconnected),
- * enumerator should call {@link SplitAssigner#onUnassignedSplits} to return the split.
+ * Request a new split from the assigner when enumerator trying to assign splits to awaiting
+ * readers.
+ * <p>
If enumerator wasn't able to assign the split (e.g., reader disconnected), enumerator should + * call {@link SplitAssigner#onUnassignedSplits} to return the split. */ GetSplitResult getNext(@Nullable String hostname); - /** - * Add new splits discovered by enumerator - */ + /** Add new splits discovered by enumerator */ void onDiscoveredSplits(Collection splits); - /** - * Forward addSplitsBack event (for failed reader) to assigner - */ + /** Forward addSplitsBack event (for failed reader) to assigner */ void onUnassignedSplits(Collection splits); /** - * Some assigner (like event time alignment) may rack in-progress splits - * to advance watermark upon completed splits + * Some assigner (like event time alignment) may rack in-progress splits to advance watermark upon + * completed splits */ - default void onCompletedSplits(Collection completedSplitIds) { - } + default void onCompletedSplits(Collection completedSplitIds) {} /** - * Get assigner state for checkpointing. - * This is a super-set API that works for all currently imagined assigners. + * Get assigner state for checkpointing. This is a super-set API that works for all currently + * imagined assigners. */ Collection state(); /** - * Enumerator can get a notification via CompletableFuture - * when the assigner has more splits available later. - * Enumerator should schedule assignment in the thenAccept action of the future. - *

<p>
- * Assigner will return the same future if this method is called again
- * before the previous future is completed.
- * <p>
- * The future can be completed from other thread,
- * e.g. the coordinator thread from another thread
- * for event time alignment.
- * <p>
- * If enumerator need to trigger action upon the future completion,
- * it may want to run it in the coordinator thread
- * using {@link SplitEnumeratorContext#runInCoordinatorThread(Runnable)}.
+ * Enumerator can get a notification via CompletableFuture when the assigner has more splits
+ * available later. Enumerator should schedule assignment in the thenAccept action of the future.
+ * <p>Assigner will return the same future if this method is called again before the previous
+ * future is completed.
+ * <p>The future can be completed from other thread, e.g. the coordinator thread from another
+ * thread for event time alignment.
+ * <p>
If enumerator need to trigger action upon the future completion, it may want to run it in + * the coordinator thread using {@link SplitEnumeratorContext#runInCoordinatorThread(Runnable)}. */ CompletableFuture isAvailable(); } diff --git a/flink/v1.14/flink/src/main/java/org/apache/iceberg/flink/source/assigner/SplitAssignerFactory.java b/flink/v1.14/flink/src/main/java/org/apache/iceberg/flink/source/assigner/SplitAssignerFactory.java index bc4ff0479f72..6e02a556ffcd 100644 --- a/flink/v1.14/flink/src/main/java/org/apache/iceberg/flink/source/assigner/SplitAssignerFactory.java +++ b/flink/v1.14/flink/src/main/java/org/apache/iceberg/flink/source/assigner/SplitAssignerFactory.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.flink.source.assigner; import java.io.Serializable; diff --git a/flink/v1.14/flink/src/main/java/org/apache/iceberg/flink/source/enumerator/AbstractIcebergEnumerator.java b/flink/v1.14/flink/src/main/java/org/apache/iceberg/flink/source/enumerator/AbstractIcebergEnumerator.java index 8c9be862bf33..3aca390755ed 100644 --- a/flink/v1.14/flink/src/main/java/org/apache/iceberg/flink/source/enumerator/AbstractIcebergEnumerator.java +++ b/flink/v1.14/flink/src/main/java/org/apache/iceberg/flink/source/enumerator/AbstractIcebergEnumerator.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.flink.source.enumerator; import java.io.IOException; @@ -38,9 +37,11 @@ import org.slf4j.LoggerFactory; /** - * TODO: publish enumerator monitor metrics like number of pending metrics after FLINK-21000 is resolved + * TODO: publish enumerator monitor metrics like number of pending metrics after FLINK-21000 is + * resolved */ -abstract class AbstractIcebergEnumerator implements SplitEnumerator { +abstract class AbstractIcebergEnumerator + implements SplitEnumerator { private static final Logger LOG = LoggerFactory.getLogger(AbstractIcebergEnumerator.class); private final SplitEnumeratorContext enumeratorContext; @@ -49,8 +50,7 @@ abstract class AbstractIcebergEnumerator implements SplitEnumerator> availableFuture; AbstractIcebergEnumerator( - SplitEnumeratorContext enumeratorContext, - SplitAssigner assigner) { + SplitEnumeratorContext enumeratorContext, SplitAssigner assigner) { this.enumeratorContext = enumeratorContext; this.assigner = assigner; this.readersAwaitingSplit = new LinkedHashMap<>(); @@ -70,29 +70,32 @@ public void close() throws IOException { @Override public void handleSplitRequest(int subtaskId, @Nullable String requesterHostname) { // Iceberg source uses custom split request event to piggyback finished split ids. 
- throw new UnsupportedOperationException(String.format("Received invalid default split request event " + - "from subtask %d as Iceberg source uses custom split request event", subtaskId)); + throw new UnsupportedOperationException( + String.format( + "Received invalid default split request event " + + "from subtask %d as Iceberg source uses custom split request event", + subtaskId)); } @Override public void handleSourceEvent(int subtaskId, SourceEvent sourceEvent) { if (sourceEvent instanceof SplitRequestEvent) { - SplitRequestEvent splitRequestEvent = - (SplitRequestEvent) sourceEvent; + SplitRequestEvent splitRequestEvent = (SplitRequestEvent) sourceEvent; LOG.info("Received request split event from subtask {}", subtaskId); assigner.onCompletedSplits(splitRequestEvent.finishedSplitIds()); readersAwaitingSplit.put(subtaskId, splitRequestEvent.requesterHostname()); assignSplits(); } else { - throw new IllegalArgumentException(String.format("Received unknown event from subtask %d: %s", - subtaskId, sourceEvent.getClass().getCanonicalName())); + throw new IllegalArgumentException( + String.format( + "Received unknown event from subtask %d: %s", + subtaskId, sourceEvent.getClass().getCanonicalName())); } } @Override public void addSplitsBack(List splits, int subtaskId) { - LOG.info("Add {} splits back to the pool for failed subtask {}", - splits.size(), subtaskId); + LOG.info("Add {} splits back to the pool for failed subtask {}", splits.size(), subtaskId); assigner.onUnassignedSplits(splits); assignSplits(); } @@ -140,10 +143,7 @@ private void assignSplits() { } } - /** - * return true if enumerator should wait for splits - * like in the continuous enumerator case - */ + /** return true if enumerator should wait for splits like in the continuous enumerator case */ protected abstract boolean shouldWaitForMoreSplits(); private synchronized void getAvailableFutureIfNeeded() { @@ -151,18 +151,22 @@ private synchronized void getAvailableFutureIfNeeded() { return; } - CompletableFuture future = assigner.isAvailable() - .thenAccept(ignore -> - // Must run assignSplits in coordinator thread - // because the future may be completed from other threads. - // E.g., in event time alignment assigner, - // watermark advancement from another source may - // cause the available future to be completed - enumeratorContext.runInCoordinatorThread(() -> { - LOG.debug("Executing callback of assignSplits"); - availableFuture.set(null); - assignSplits(); - })); + CompletableFuture future = + assigner + .isAvailable() + .thenAccept( + ignore -> + // Must run assignSplits in coordinator thread + // because the future may be completed from other threads. 
+ // E.g., in event time alignment assigner, + // watermark advancement from another source may + // cause the available future to be completed + enumeratorContext.runInCoordinatorThread( + () -> { + LOG.debug("Executing callback of assignSplits"); + availableFuture.set(null); + assignSplits(); + })); availableFuture.set(future); LOG.debug("Registered callback for future available splits"); } diff --git a/flink/v1.14/flink/src/main/java/org/apache/iceberg/flink/source/enumerator/ContinuousEnumerationResult.java b/flink/v1.14/flink/src/main/java/org/apache/iceberg/flink/source/enumerator/ContinuousEnumerationResult.java index 8c20f2cf22bc..41863ffee60b 100644 --- a/flink/v1.14/flink/src/main/java/org/apache/iceberg/flink/source/enumerator/ContinuousEnumerationResult.java +++ b/flink/v1.14/flink/src/main/java/org/apache/iceberg/flink/source/enumerator/ContinuousEnumerationResult.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.flink.source.enumerator; import java.util.Collection; diff --git a/flink/v1.14/flink/src/main/java/org/apache/iceberg/flink/source/enumerator/ContinuousIcebergEnumerator.java b/flink/v1.14/flink/src/main/java/org/apache/iceberg/flink/source/enumerator/ContinuousIcebergEnumerator.java index e2b94b8c3e2b..d104f46fdaaf 100644 --- a/flink/v1.14/flink/src/main/java/org/apache/iceberg/flink/source/enumerator/ContinuousIcebergEnumerator.java +++ b/flink/v1.14/flink/src/main/java/org/apache/iceberg/flink/source/enumerator/ContinuousIcebergEnumerator.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.flink.source.enumerator; import java.io.IOException; @@ -42,8 +41,8 @@ public class ContinuousIcebergEnumerator extends AbstractIcebergEnumerator { private final ContinuousSplitPlanner splitPlanner; /** - * snapshotId for the last enumerated snapshot. next incremental enumeration - * should be based off this as the starting position. + * snapshotId for the last enumerated snapshot. next incremental enumeration should be based off + * this as the starting position. */ private final AtomicReference enumeratorPosition; @@ -91,30 +90,36 @@ public IcebergEnumeratorState snapshotState(long checkpointId) { return new IcebergEnumeratorState(enumeratorPosition.get(), assigner.state()); } - /** - * This method is executed in an IO thread pool. - */ + /** This method is executed in an IO thread pool. */ private ContinuousEnumerationResult discoverSplits() { return splitPlanner.planSplits(enumeratorPosition.get()); } - /** - * This method is executed in a single coordinator thread. - */ + /** This method is executed in a single coordinator thread. */ private void processDiscoveredSplits(ContinuousEnumerationResult result, Throwable error) { if (error == null) { if (!Objects.equals(result.fromPosition(), enumeratorPosition.get())) { - // Multiple discoverSplits() may be triggered with the same starting snapshot to the I/O thread pool. - // E.g., the splitDiscoveryInterval is very short (like 10 ms in some unit tests) or the thread - // pool is busy and multiple discovery actions are executed concurrently. Discovery result should - // only be accepted if the starting position matches the enumerator position (like compare-and-swap). 
- LOG.info("Skip {} discovered splits because the scan starting position doesn't match " + - "the current enumerator position: enumerator position = {}, scan starting position = {}", - result.splits().size(), enumeratorPosition.get(), result.fromPosition()); + // Multiple discoverSplits() may be triggered with the same starting snapshot to the I/O + // thread pool. + // E.g., the splitDiscoveryInterval is very short (like 10 ms in some unit tests) or the + // thread + // pool is busy and multiple discovery actions are executed concurrently. Discovery result + // should + // only be accepted if the starting position matches the enumerator position (like + // compare-and-swap). + LOG.info( + "Skip {} discovered splits because the scan starting position doesn't match " + + "the current enumerator position: enumerator position = {}, scan starting position = {}", + result.splits().size(), + enumeratorPosition.get(), + result.fromPosition()); } else { assigner.onDiscoveredSplits(result.splits()); - LOG.info("Added {} splits discovered between ({}, {}] to the assigner", - result.splits().size(), result.fromPosition(), result.toPosition()); + LOG.info( + "Added {} splits discovered between ({}, {}] to the assigner", + result.splits().size(), + result.fromPosition(), + result.toPosition()); // update the enumerator position even if there is no split discovered // or the toPosition is empty (e.g. for empty table). enumeratorPosition.set(result.toPosition()); diff --git a/flink/v1.14/flink/src/main/java/org/apache/iceberg/flink/source/enumerator/ContinuousSplitPlanner.java b/flink/v1.14/flink/src/main/java/org/apache/iceberg/flink/source/enumerator/ContinuousSplitPlanner.java index 1737ae6a5023..2a1325178873 100644 --- a/flink/v1.14/flink/src/main/java/org/apache/iceberg/flink/source/enumerator/ContinuousSplitPlanner.java +++ b/flink/v1.14/flink/src/main/java/org/apache/iceberg/flink/source/enumerator/ContinuousSplitPlanner.java @@ -16,20 +16,15 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.flink.source.enumerator; import java.io.Closeable; import org.apache.flink.annotation.Internal; -/** - * This interface is introduced so that we can plug in different split planner for unit test - */ +/** This interface is introduced so that we can plug in different split planner for unit test */ @Internal public interface ContinuousSplitPlanner extends Closeable { - /** - * Discover the files appended between {@code lastPosition} and current table snapshot - */ + /** Discover the files appended between {@code lastPosition} and current table snapshot */ ContinuousEnumerationResult planSplits(IcebergEnumeratorPosition lastPosition); } diff --git a/flink/v1.14/flink/src/main/java/org/apache/iceberg/flink/source/enumerator/ContinuousSplitPlannerImpl.java b/flink/v1.14/flink/src/main/java/org/apache/iceberg/flink/source/enumerator/ContinuousSplitPlannerImpl.java index 2bbaaf940b63..a3ac8549909d 100644 --- a/flink/v1.14/flink/src/main/java/org/apache/iceberg/flink/source/enumerator/ContinuousSplitPlannerImpl.java +++ b/flink/v1.14/flink/src/main/java/org/apache/iceberg/flink/source/enumerator/ContinuousSplitPlannerImpl.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.flink.source.enumerator; import java.io.IOException; @@ -48,15 +47,18 @@ public class ContinuousSplitPlannerImpl implements ContinuousSplitPlanner { private final ExecutorService workerPool; /** - * @param threadName thread name prefix for worker pool to run the split planning. - * If null, a shared worker pool will be used. + * @param threadName thread name prefix for worker pool to run the split planning. If null, a + * shared worker pool will be used. */ public ContinuousSplitPlannerImpl(Table table, ScanContext scanContext, String threadName) { this.table = table; this.scanContext = scanContext; this.isSharedPool = threadName == null; - this.workerPool = isSharedPool ? ThreadPools.getWorkerPool() - : ThreadPools.newWorkerPool("iceberg-plan-worker-pool-" + threadName, scanContext.planParallelism()); + this.workerPool = + isSharedPool + ? ThreadPools.getWorkerPool() + : ThreadPools.newWorkerPool( + "iceberg-plan-worker-pool-" + threadName, scanContext.planParallelism()); } @Override @@ -76,63 +78,75 @@ public ContinuousEnumerationResult planSplits(IcebergEnumeratorPosition lastPosi } } - /** - * Discover incremental changes between @{code lastPosition} and current table snapshot - */ - private ContinuousEnumerationResult discoverIncrementalSplits(IcebergEnumeratorPosition lastPosition) { + /** Discover incremental changes between @{code lastPosition} and current table snapshot */ + private ContinuousEnumerationResult discoverIncrementalSplits( + IcebergEnumeratorPosition lastPosition) { Snapshot currentSnapshot = table.currentSnapshot(); if (currentSnapshot == null) { // empty table - Preconditions.checkArgument(lastPosition.snapshotId() == null, + Preconditions.checkArgument( + lastPosition.snapshotId() == null, "Invalid last enumerated position for an empty table: not null"); LOG.info("Skip incremental scan because table is empty"); return new ContinuousEnumerationResult(Collections.emptyList(), lastPosition, lastPosition); - } else if (lastPosition.snapshotId() != null && currentSnapshot.snapshotId() == lastPosition.snapshotId()) { + } else if (lastPosition.snapshotId() != null + && currentSnapshot.snapshotId() == lastPosition.snapshotId()) { LOG.info("Current table snapshot is already enumerated: {}", currentSnapshot.snapshotId()); return new ContinuousEnumerationResult(Collections.emptyList(), lastPosition, lastPosition); } else { - IcebergEnumeratorPosition newPosition = IcebergEnumeratorPosition.of( - currentSnapshot.snapshotId(), currentSnapshot.timestampMillis()); - ScanContext incrementalScan = scanContext - .copyWithAppendsBetween(lastPosition.snapshotId(), currentSnapshot.snapshotId()); - List splits = FlinkSplitPlanner.planIcebergSourceSplits(table, incrementalScan, workerPool); - LOG.info("Discovered {} splits from incremental scan: " + - "from snapshot (exclusive) is {}, to snapshot (inclusive) is {}", - splits.size(), lastPosition, newPosition); + IcebergEnumeratorPosition newPosition = + IcebergEnumeratorPosition.of( + currentSnapshot.snapshotId(), currentSnapshot.timestampMillis()); + ScanContext incrementalScan = + scanContext.copyWithAppendsBetween( + lastPosition.snapshotId(), currentSnapshot.snapshotId()); + List splits = + FlinkSplitPlanner.planIcebergSourceSplits(table, incrementalScan, workerPool); + LOG.info( + "Discovered {} splits from incremental scan: " + + "from snapshot (exclusive) is {}, to snapshot (inclusive) is {}", + splits.size(), + lastPosition, + newPosition); return new ContinuousEnumerationResult(splits, 
lastPosition, newPosition); } } /** * Discovery initial set of splits based on {@link StreamingStartingStrategy}. - * - *
  • {@link ContinuousEnumerationResult#splits()} should contain initial splits - * discovered from table scan for {@link StreamingStartingStrategy#TABLE_SCAN_THEN_INCREMENTAL}. - * For all other strategies, splits collection should be empty. - *
  • {@link ContinuousEnumerationResult#toPosition()} points to the starting position - * for the next incremental split discovery with exclusive behavior. Meaning files committed - * by the snapshot from the position in {@code ContinuousEnumerationResult} won't be included - * in the next incremental scan. + *
  • {@link ContinuousEnumerationResult#splits()} should contain initial splits discovered from + * table scan for {@link StreamingStartingStrategy#TABLE_SCAN_THEN_INCREMENTAL}. For all other + * strategies, splits collection should be empty. + *
  • {@link ContinuousEnumerationResult#toPosition()} points to the starting position for the + * next incremental split discovery with exclusive behavior. Meaning files committed by the + * snapshot from the position in {@code ContinuousEnumerationResult} won't be included in the + * next incremental scan. */ private ContinuousEnumerationResult discoverInitialSplits() { Optional startSnapshotOptional = startSnapshot(table, scanContext); if (!startSnapshotOptional.isPresent()) { - return new ContinuousEnumerationResult(Collections.emptyList(), null, - IcebergEnumeratorPosition.empty()); + return new ContinuousEnumerationResult( + Collections.emptyList(), null, IcebergEnumeratorPosition.empty()); } Snapshot startSnapshot = startSnapshotOptional.get(); - LOG.info("Get starting snapshot id {} based on strategy {}", - startSnapshot.snapshotId(), scanContext.streamingStartingStrategy()); + LOG.info( + "Get starting snapshot id {} based on strategy {}", + startSnapshot.snapshotId(), + scanContext.streamingStartingStrategy()); List splits; IcebergEnumeratorPosition toPosition; - if (scanContext.streamingStartingStrategy() == StreamingStartingStrategy.TABLE_SCAN_THEN_INCREMENTAL) { + if (scanContext.streamingStartingStrategy() + == StreamingStartingStrategy.TABLE_SCAN_THEN_INCREMENTAL) { // do a batch table scan first splits = FlinkSplitPlanner.planIcebergSourceSplits(table, scanContext, workerPool); - LOG.info("Discovered {} splits from initial batch table scan with snapshot Id {}", - splits.size(), startSnapshot.snapshotId()); + LOG.info( + "Discovered {} splits from initial batch table scan with snapshot Id {}", + splits.size(), + startSnapshot.snapshotId()); // For TABLE_SCAN_THEN_INCREMENTAL, incremental mode starts exclusive from the startSnapshot - toPosition = IcebergEnumeratorPosition.of(startSnapshot.snapshotId(), startSnapshot.timestampMillis()); + toPosition = + IcebergEnumeratorPosition.of(startSnapshot.snapshotId(), startSnapshot.timestampMillis()); } else { // For all other modes, starting snapshot should be consumed inclusively. // Use parentId to achieve the inclusive behavior. It is fine if parentId is null. @@ -140,24 +154,29 @@ private ContinuousEnumerationResult discoverInitialSplits() { Long parentSnapshotId = startSnapshot.parentId(); if (parentSnapshotId != null) { Snapshot parentSnapshot = table.snapshot(parentSnapshotId); - Long parentSnapshotTimestampMs = parentSnapshot != null ? parentSnapshot.timestampMillis() : null; + Long parentSnapshotTimestampMs = + parentSnapshot != null ? parentSnapshot.timestampMillis() : null; toPosition = IcebergEnumeratorPosition.of(parentSnapshotId, parentSnapshotTimestampMs); } else { toPosition = IcebergEnumeratorPosition.empty(); } - LOG.info("Start incremental scan with start snapshot (inclusive): id = {}, timestamp = {}", - startSnapshot.snapshotId(), startSnapshot.timestampMillis()); + LOG.info( + "Start incremental scan with start snapshot (inclusive): id = {}, timestamp = {}", + startSnapshot.snapshotId(), + startSnapshot.timestampMillis()); } return new ContinuousEnumerationResult(splits, null, toPosition); } /** - * Calculate the starting snapshot based on the {@link StreamingStartingStrategy} defined in {@code ScanContext}. - *
    - * If the {@link StreamingStartingStrategy} is not {@link StreamingStartingStrategy#TABLE_SCAN_THEN_INCREMENTAL}, - * the start snapshot should be consumed inclusively. + * Calculate the starting snapshot based on the {@link StreamingStartingStrategy} defined in + * {@code ScanContext}. + * + *
    If the {@link StreamingStartingStrategy} is not {@link + * StreamingStartingStrategy#TABLE_SCAN_THEN_INCREMENTAL}, the start snapshot should be consumed + * inclusively. */ @VisibleForTesting static Optional startSnapshot(Table table, ScanContext scanContext) { @@ -169,21 +188,25 @@ static Optional startSnapshot(Table table, ScanContext scanContext) { return Optional.ofNullable(SnapshotUtil.oldestAncestor(table)); case INCREMENTAL_FROM_SNAPSHOT_ID: Snapshot matchedSnapshotById = table.snapshot(scanContext.startSnapshotId()); - Preconditions.checkArgument(matchedSnapshotById != null, + Preconditions.checkArgument( + matchedSnapshotById != null, "Start snapshot id not found in history: " + scanContext.startSnapshotId()); return Optional.of(matchedSnapshotById); case INCREMENTAL_FROM_SNAPSHOT_TIMESTAMP: - long snapshotIdAsOfTime = SnapshotUtil.snapshotIdAsOfTime(table, scanContext.startSnapshotTimestamp()); + long snapshotIdAsOfTime = + SnapshotUtil.snapshotIdAsOfTime(table, scanContext.startSnapshotTimestamp()); Snapshot matchedSnapshotByTimestamp = table.snapshot(snapshotIdAsOfTime); if (matchedSnapshotByTimestamp.timestampMillis() == scanContext.startSnapshotTimestamp()) { return Optional.of(matchedSnapshotByTimestamp); } else { - // if the snapshotIdAsOfTime has the timestamp value smaller than the scanContext.startSnapshotTimestamp(), + // if the snapshotIdAsOfTime has the timestamp value smaller than the + // scanContext.startSnapshotTimestamp(), // return the child snapshot whose timestamp value is larger return Optional.of(SnapshotUtil.snapshotAfter(table, snapshotIdAsOfTime)); } default: - throw new IllegalArgumentException("Unknown starting strategy: " + scanContext.streamingStartingStrategy()); + throw new IllegalArgumentException( + "Unknown starting strategy: " + scanContext.streamingStartingStrategy()); } } } diff --git a/flink/v1.14/flink/src/main/java/org/apache/iceberg/flink/source/enumerator/IcebergEnumeratorPosition.java b/flink/v1.14/flink/src/main/java/org/apache/iceberg/flink/source/enumerator/IcebergEnumeratorPosition.java index e024473da3c9..96aba296f8cf 100644 --- a/flink/v1.14/flink/src/main/java/org/apache/iceberg/flink/source/enumerator/IcebergEnumeratorPosition.java +++ b/flink/v1.14/flink/src/main/java/org/apache/iceberg/flink/source/enumerator/IcebergEnumeratorPosition.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License.
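The inclusive versus exclusive handling of the start snapshot above is the subtle part of discoverInitialSplits(): incremental scans are exclusive of the recorded position, so consuming the start snapshot inclusively is done by recording its parent instead. Here is a toy sketch of that rule, with a hypothetical Snapshot holder rather than the Iceberg API.

// Toy model of the inclusive-start rule: incremental scans are exclusive of the recorded
// position, so to consume the start snapshot itself the enumerator records its parent.
// Snapshot here is a hypothetical holder, not the Iceberg API.
final class StartPositionSketch {
  static final class Snapshot {
    final long id;
    final Long parentId; // null when this is the first snapshot of the table

    Snapshot(long id, Long parentId) {
      this.id = id;
      this.parentId = parentId;
    }
  }

  /** Returns the snapshot id to record as the initial position; null means "empty position". */
  static Long initialPosition(Snapshot start, boolean tableScanThenIncremental) {
    if (tableScanThenIncremental) {
      // The batch scan already covered the start snapshot, so exclude it from incremental scans.
      return start.id;
    }
    // Inclusive consumption: positioning at the parent keeps the start snapshot in the next scan.
    return start.parentId;
  }

  public static void main(String[] args) {
    Snapshot first = new Snapshot(1L, null);
    Snapshot second = new Snapshot(2L, 1L);
    System.out.println(initialPosition(second, false)); // 1: snapshot 2 will still be discovered
    System.out.println(initialPosition(second, true)); // 2: snapshot 2 was consumed by the scan
    System.out.println(initialPosition(first, false)); // null: empty position, start from scratch
  }
}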
*/ - package org.apache.iceberg.flink.source.enumerator; import org.apache.iceberg.relocated.com.google.common.base.MoreObjects; @@ -62,9 +61,7 @@ public String toString() { @Override public int hashCode() { - return Objects.hashCode( - snapshotId, - snapshotTimestampMs); + return Objects.hashCode(snapshotId, snapshotTimestampMs); } @Override @@ -76,7 +73,7 @@ public boolean equals(Object o) { return false; } IcebergEnumeratorPosition other = (IcebergEnumeratorPosition) o; - return Objects.equal(snapshotId, other.snapshotId()) && - Objects.equal(snapshotTimestampMs, other.snapshotTimestampMs()); + return Objects.equal(snapshotId, other.snapshotId()) + && Objects.equal(snapshotTimestampMs, other.snapshotTimestampMs()); } } diff --git a/flink/v1.14/flink/src/main/java/org/apache/iceberg/flink/source/enumerator/IcebergEnumeratorPositionSerializer.java b/flink/v1.14/flink/src/main/java/org/apache/iceberg/flink/source/enumerator/IcebergEnumeratorPositionSerializer.java index 83b230e80e08..1c63807361c5 100644 --- a/flink/v1.14/flink/src/main/java/org/apache/iceberg/flink/source/enumerator/IcebergEnumeratorPositionSerializer.java +++ b/flink/v1.14/flink/src/main/java/org/apache/iceberg/flink/source/enumerator/IcebergEnumeratorPositionSerializer.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.flink.source.enumerator; import java.io.IOException; @@ -24,9 +23,11 @@ import org.apache.flink.core.memory.DataInputDeserializer; import org.apache.flink.core.memory.DataOutputSerializer; -class IcebergEnumeratorPositionSerializer implements SimpleVersionedSerializer { +class IcebergEnumeratorPositionSerializer + implements SimpleVersionedSerializer { - public static final IcebergEnumeratorPositionSerializer INSTANCE = new IcebergEnumeratorPositionSerializer(); + public static final IcebergEnumeratorPositionSerializer INSTANCE = + new IcebergEnumeratorPositionSerializer(); private static final int VERSION = 1; diff --git a/flink/v1.14/flink/src/main/java/org/apache/iceberg/flink/source/enumerator/IcebergEnumeratorState.java b/flink/v1.14/flink/src/main/java/org/apache/iceberg/flink/source/enumerator/IcebergEnumeratorState.java index bd2f44c0059b..7913f7b4350e 100644 --- a/flink/v1.14/flink/src/main/java/org/apache/iceberg/flink/source/enumerator/IcebergEnumeratorState.java +++ b/flink/v1.14/flink/src/main/java/org/apache/iceberg/flink/source/enumerator/IcebergEnumeratorState.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.flink.source.enumerator; import java.io.Serializable; @@ -24,12 +23,9 @@ import javax.annotation.Nullable; import org.apache.iceberg.flink.source.split.IcebergSourceSplitState; -/** - * Enumerator state for checkpointing - */ +/** Enumerator state for checkpointing */ public class IcebergEnumeratorState implements Serializable { - @Nullable - private final IcebergEnumeratorPosition lastEnumeratedPosition; + @Nullable private final IcebergEnumeratorPosition lastEnumeratedPosition; private final Collection pendingSplits; public IcebergEnumeratorState(Collection pendingSplits) { diff --git a/flink/v1.14/flink/src/main/java/org/apache/iceberg/flink/source/enumerator/IcebergEnumeratorStateSerializer.java b/flink/v1.14/flink/src/main/java/org/apache/iceberg/flink/source/enumerator/IcebergEnumeratorStateSerializer.java index 8f020bbe539e..f72804363894 100644 --- a/flink/v1.14/flink/src/main/java/org/apache/iceberg/flink/source/enumerator/IcebergEnumeratorStateSerializer.java +++ b/flink/v1.14/flink/src/main/java/org/apache/iceberg/flink/source/enumerator/IcebergEnumeratorStateSerializer.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.flink.source.enumerator; import java.io.IOException; @@ -32,17 +31,21 @@ import org.apache.iceberg.relocated.com.google.common.collect.Lists; @Internal -public class IcebergEnumeratorStateSerializer implements SimpleVersionedSerializer { +public class IcebergEnumeratorStateSerializer + implements SimpleVersionedSerializer { - public static final IcebergEnumeratorStateSerializer INSTANCE = new IcebergEnumeratorStateSerializer(); + public static final IcebergEnumeratorStateSerializer INSTANCE = + new IcebergEnumeratorStateSerializer(); private static final int VERSION = 1; private static final ThreadLocal SERIALIZER_CACHE = ThreadLocal.withInitial(() -> new DataOutputSerializer(1024)); - private final IcebergEnumeratorPositionSerializer positionSerializer = IcebergEnumeratorPositionSerializer.INSTANCE; - private final IcebergSourceSplitSerializer splitSerializer = IcebergSourceSplitSerializer.INSTANCE; + private final IcebergEnumeratorPositionSerializer positionSerializer = + IcebergEnumeratorPositionSerializer.INSTANCE; + private final IcebergSourceSplitSerializer splitSerializer = + IcebergSourceSplitSerializer.INSTANCE; @Override public int getVersion() { @@ -108,7 +111,8 @@ private IcebergEnumeratorState deserializeV1(byte[] serialized) throws IOExcepti in.read(splitBytes); IcebergSourceSplit split = splitSerializer.deserialize(splitSerializerVersion, splitBytes); String statusName = in.readUTF(); - pendingSplits.add(new IcebergSourceSplitState(split, IcebergSourceSplitStatus.valueOf(statusName))); + pendingSplits.add( + new IcebergSourceSplitState(split, IcebergSourceSplitStatus.valueOf(statusName))); } return new IcebergEnumeratorState(enumeratorPosition, pendingSplits); } diff --git a/flink/v1.14/flink/src/main/java/org/apache/iceberg/flink/source/enumerator/StaticIcebergEnumerator.java b/flink/v1.14/flink/src/main/java/org/apache/iceberg/flink/source/enumerator/StaticIcebergEnumerator.java index 8d33b6f0734d..1897f2368a41 100644 --- a/flink/v1.14/flink/src/main/java/org/apache/iceberg/flink/source/enumerator/StaticIcebergEnumerator.java +++ b/flink/v1.14/flink/src/main/java/org/apache/iceberg/flink/source/enumerator/StaticIcebergEnumerator.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
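The position and state serializers above follow one pattern: a version number, a null flag per field, then the field value, with each pending split stored as serialized bytes plus a status string. Below is a compact standalone sketch of the nullable-field part using plain java.io streams; the real classes build on Flink's DataOutputSerializer and report the version through SimpleVersionedSerializer rather than writing it inline, so the layout here is illustrative only.

import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.DataInputStream;
import java.io.DataOutputStream;
import java.io.IOException;

// Sketch of version-checked serialization for a nullable (snapshotId, timestampMs) pair.
final class PositionSerializerSketch {
  static final int VERSION = 1;

  static byte[] serialize(Long snapshotId, Long timestampMs) throws IOException {
    ByteArrayOutputStream bytes = new ByteArrayOutputStream();
    try (DataOutputStream out = new DataOutputStream(bytes)) {
      out.writeInt(VERSION);
      out.writeBoolean(snapshotId != null);
      if (snapshotId != null) {
        out.writeLong(snapshotId);
      }
      out.writeBoolean(timestampMs != null);
      if (timestampMs != null) {
        out.writeLong(timestampMs);
      }
    }
    return bytes.toByteArray();
  }

  static Long[] deserialize(byte[] serialized) throws IOException {
    try (DataInputStream in = new DataInputStream(new ByteArrayInputStream(serialized))) {
      int version = in.readInt();
      if (version != VERSION) {
        throw new IOException("Unsupported version: " + version);
      }
      Long snapshotId = in.readBoolean() ? in.readLong() : null;
      Long timestampMs = in.readBoolean() ? in.readLong() : null;
      return new Long[] {snapshotId, timestampMs};
    }
  }
}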
*/ - package org.apache.iceberg.flink.source.enumerator; import java.util.List; @@ -34,9 +33,7 @@ import org.slf4j.Logger; import org.slf4j.LoggerFactory; -/** - * One-time split enumeration at the start-up for batch execution - */ +/** One-time split enumeration at the start-up for batch execution */ @Internal public class StaticIcebergEnumerator extends AbstractIcebergEnumerator { private static final Logger LOG = LoggerFactory.getLogger(StaticIcebergEnumerator.class); @@ -64,18 +61,26 @@ public StaticIcebergEnumerator( public void start() { super.start(); if (shouldEnumerate) { - // Ideally, operatorId should be used as the threadPoolName as Flink guarantees its uniqueness within a job. - // SplitEnumeratorContext doesn't expose the OperatorCoordinator.Context, which would contain the OperatorID. - // Need to discuss with Flink community whether it is ok to expose a public API like the protected method - // "OperatorCoordinator.Context getCoordinatorContext()" from SourceCoordinatorContext implementation. + // Ideally, operatorId should be used as the threadPoolName as Flink guarantees its uniqueness + // within a job. + // SplitEnumeratorContext doesn't expose the OperatorCoordinator.Context, which would contain + // the OperatorID. + // Need to discuss with Flink community whether it is ok to expose a public API like the + // protected method + // "OperatorCoordinator.Context getCoordinatorContext()" from SourceCoordinatorContext + // implementation. // For now,
  • - is used as the unique thread pool name. String threadName = "iceberg-plan-worker-pool-" + table.name() + "-" + UUID.randomUUID(); - ExecutorService workerPool = ThreadPools.newWorkerPool(threadName, scanContext.planParallelism()); + ExecutorService workerPool = + ThreadPools.newWorkerPool(threadName, scanContext.planParallelism()); try { - List splits = FlinkSplitPlanner.planIcebergSourceSplits(table, scanContext, workerPool); + List splits = + FlinkSplitPlanner.planIcebergSourceSplits(table, scanContext, workerPool); assigner.onDiscoveredSplits(splits); - LOG.info("Discovered {} splits from table {} during job initialization", - splits.size(), table.name()); + LOG.info( + "Discovered {} splits from table {} during job initialization", + splits.size(), + table.name()); } finally { workerPool.shutdown(); } diff --git a/flink/v1.14/flink/src/main/java/org/apache/iceberg/flink/source/reader/ArrayBatchRecords.java b/flink/v1.14/flink/src/main/java/org/apache/iceberg/flink/source/reader/ArrayBatchRecords.java index f4e23a09a9f2..7b94c364c976 100644 --- a/flink/v1.14/flink/src/main/java/org/apache/iceberg/flink/source/reader/ArrayBatchRecords.java +++ b/flink/v1.14/flink/src/main/java/org/apache/iceberg/flink/source/reader/ArrayBatchRecords.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.flink.source.reader; import java.util.Collections; @@ -33,22 +32,20 @@ * {@link RecordsWithSplitIds} is used to pass a batch of records from fetcher to source reader. * Batching is to improve the efficiency for records handover. * - * {@link RecordsWithSplitIds} interface can encapsulate batches from multiple splits. - * This is the case for Kafka source where fetchers can retrieve records from multiple - * Kafka partitions at the same time. + *
    {@link RecordsWithSplitIds} interface can encapsulate batches from multiple splits. This is + * the case for Kafka source where fetchers can retrieve records from multiple Kafka partitions at + * the same time. * - * For file-based sources like Iceberg, readers always read one split/file at a time. - * Hence, we will only have a batch of records for one split here. + *
    For file-based sources like Iceberg, readers always read one split/file at a time. Hence, we + * will only have a batch of records for one split here. * - * This class uses array to store a batch of records from the same file (with the same fileOffset). + *
    This class uses array to store a batch of records from the same file (with the same + * fileOffset). */ class ArrayBatchRecords implements RecordsWithSplitIds> { - @Nullable - private String splitId; - @Nullable - private final Pool.Recycler recycler; - @Nullable - private final T[] records; + @Nullable private String splitId; + @Nullable private final Pool.Recycler recycler; + @Nullable private final T[] records; private final int numberOfRecords; private final Set finishedSplits; private final RecordAndPosition recordAndPosition; @@ -57,8 +54,13 @@ class ArrayBatchRecords implements RecordsWithSplitIds> private int position; private ArrayBatchRecords( - @Nullable String splitId, @Nullable Pool.Recycler recycler, @Nullable T[] records, - int numberOfRecords, int fileOffset, long startingRecordOffset, Set finishedSplits) { + @Nullable String splitId, + @Nullable Pool.Recycler recycler, + @Nullable T[] records, + int numberOfRecords, + int fileOffset, + long startingRecordOffset, + Set finishedSplits) { Preconditions.checkArgument(numberOfRecords >= 0, "numberOfRecords can't be negative"); Preconditions.checkArgument(fileOffset >= 0, "fileOffset can't be negative"); Preconditions.checkArgument(startingRecordOffset >= 0, "numberOfRecords can't be negative"); @@ -67,7 +69,8 @@ private ArrayBatchRecords( this.recycler = recycler; this.records = records; this.numberOfRecords = numberOfRecords; - this.finishedSplits = Preconditions.checkNotNull(finishedSplits, "finishedSplits can be empty but not null"); + this.finishedSplits = + Preconditions.checkNotNull(finishedSplits, "finishedSplits can be empty but not null"); this.recordAndPosition = new RecordAndPosition<>(); recordAndPosition.set(null, fileOffset, startingRecordOffset); @@ -97,8 +100,8 @@ public RecordAndPosition nextRecordFromSplit() { } /** - * This method is called when all records from this batch has been emitted. - * If recycler is set, it should be called to return the records array back to pool. + * This method is called when all records from this batch has been emitted. If recycler is set, it + * should be called to return the records array back to pool. */ @Override public void recycle() { @@ -125,15 +128,15 @@ int numberOfRecords() { /** * Create a ArrayBatchRecords backed up an array with records from the same file * - * @param splitId Iceberg source only read from one split a time. - * We never have multiple records from multiple splits. - * @param recycler Because {@link DataIterator} with {@link RowData} returns an iterator of reused RowData object, - * we need to clone RowData eagerly when constructing a batch of records. - * We can use object pool to reuse the RowData array object which can be expensive to create. - * This recycler can be provided to recycle the array object back to pool after read is exhausted. - * If the {@link DataIterator} returns an iterator of non-reused objects, - * we don't need to clone objects. It is cheap to just create the batch array. - * Hence, we don't need object pool and recycler can be set to null. + * @param splitId Iceberg source only read from one split a time. We never have multiple records + * from multiple splits. + * @param recycler Because {@link DataIterator} with {@link RowData} returns an iterator of reused + * RowData object, we need to clone RowData eagerly when constructing a batch of records. We + * can use object pool to reuse the RowData array object which can be expensive to create. 
+ * This recycler can be provided to recycle the array object back to pool after read is + * exhausted. If the {@link DataIterator} returns an iterator of non-reused objects, we don't + * need to clone objects. It is cheap to just create the batch array. Hence, we don't need + * object pool and recycler can be set to null. * @param records an array (maybe reused) holding a batch of records * @param numberOfRecords actual number of records in the array * @param fileOffset fileOffset for all records in this batch @@ -141,10 +144,20 @@ int numberOfRecords() { * @param record type */ public static ArrayBatchRecords forRecords( - String splitId, Pool.Recycler recycler, T[] records, int numberOfRecords, - int fileOffset, long startingRecordOffset) { - return new ArrayBatchRecords<>(splitId, recycler, records, numberOfRecords, - fileOffset, startingRecordOffset, Collections.emptySet()); + String splitId, + Pool.Recycler recycler, + T[] records, + int numberOfRecords, + int fileOffset, + long startingRecordOffset) { + return new ArrayBatchRecords<>( + splitId, + recycler, + records, + numberOfRecords, + fileOffset, + startingRecordOffset, + Collections.emptySet()); } /** @@ -153,7 +166,6 @@ public static ArrayBatchRecords forRecords( * @param splitId for the split that is just exhausted */ public static ArrayBatchRecords finishedSplit(String splitId) { - return new ArrayBatchRecords<>(null, null, null, - 0, 0, 0, Collections.singleton(splitId)); + return new ArrayBatchRecords<>(null, null, null, 0, 0, 0, Collections.singleton(splitId)); } } diff --git a/flink/v1.14/flink/src/main/java/org/apache/iceberg/flink/source/reader/ArrayPoolDataIteratorBatcher.java b/flink/v1.14/flink/src/main/java/org/apache/iceberg/flink/source/reader/ArrayPoolDataIteratorBatcher.java index 22da79ebf1e0..306afd1811be 100644 --- a/flink/v1.14/flink/src/main/java/org/apache/iceberg/flink/source/reader/ArrayPoolDataIteratorBatcher.java +++ b/flink/v1.14/flink/src/main/java/org/apache/iceberg/flink/source/reader/ArrayPoolDataIteratorBatcher.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
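ArrayBatchRecords and ArrayPoolDataIteratorBatcher rely on a small pooling contract: a batch borrows a reusable array, and recycle() returns it once every record in the batch has been emitted. A stripped-down sketch of that contract follows; it uses a plain ArrayBlockingQueue in place of the Pool and Recycler types used above, so the names are illustrative only.

import java.util.concurrent.ArrayBlockingQueue;

// Minimal sketch of the pooling idea: batches borrow a reusable array from a bounded pool
// and hand it back once every record has been emitted downstream.
final class PooledBatchSketch {
  private final ArrayBlockingQueue<Object[]> pool;

  PooledBatchSketch(int poolSize, int batchSize) {
    this.pool = new ArrayBlockingQueue<>(poolSize);
    for (int i = 0; i < poolSize; i++) {
      pool.add(new Object[batchSize]);
    }
  }

  Object[] borrowBatch() throws InterruptedException {
    // Blocks when all arrays are in flight.
    return pool.take();
  }

  void recycle(Object[] batch) {
    // Called after the last record of the batch has been consumed.
    pool.offer(batch);
  }
}

In this sketch, take() blocking while every array is in flight is what bounds how far the producing side can run ahead of the consumer.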
*/ - package org.apache.iceberg.flink.source.reader; import java.io.IOException; @@ -30,9 +29,7 @@ import org.apache.iceberg.io.CloseableIterator; import org.apache.iceberg.relocated.com.google.common.base.Preconditions; -/** - * This implementation stores record batch in array from recyclable pool - */ +/** This implementation stores record batch in array from recyclable pool */ class ArrayPoolDataIteratorBatcher implements DataIteratorBatcher { private final int batchSize; private final int handoverQueueSize; @@ -67,7 +64,8 @@ private Pool createPoolOfBatches(int numBatches) { return poolOfBatches; } - private class ArrayPoolBatchIterator implements CloseableIterator>> { + private class ArrayPoolBatchIterator + implements CloseableIterator>> { private final String splitId; private final DataIterator inputIterator; @@ -106,8 +104,13 @@ public RecordsWithSplitIds> next() { } } - return ArrayBatchRecords.forRecords(splitId, pool.recycler(), batch, recordCount, - inputIterator.fileOffset(), inputIterator.recordOffset() - recordCount); + return ArrayBatchRecords.forRecords( + splitId, + pool.recycler(), + batch, + recordCount, + inputIterator.fileOffset(), + inputIterator.recordOffset() - recordCount); } @Override diff --git a/flink/v1.14/flink/src/main/java/org/apache/iceberg/flink/source/reader/DataIteratorBatcher.java b/flink/v1.14/flink/src/main/java/org/apache/iceberg/flink/source/reader/DataIteratorBatcher.java index f95a7f95e669..c376e359c600 100644 --- a/flink/v1.14/flink/src/main/java/org/apache/iceberg/flink/source/reader/DataIteratorBatcher.java +++ b/flink/v1.14/flink/src/main/java/org/apache/iceberg/flink/source/reader/DataIteratorBatcher.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.flink.source.reader; import java.io.Serializable; @@ -26,10 +25,12 @@ import org.apache.iceberg.io.CloseableIterator; /** - * Batcher converts iterator of T into iterator of batched {@code RecordsWithSplitIds>}, - * as FLIP-27's {@link SplitReader#fetch()} returns batched records. + * Batcher converts iterator of T into iterator of batched {@code + * RecordsWithSplitIds>}, as FLIP-27's {@link SplitReader#fetch()} returns + * batched records. */ @FunctionalInterface public interface DataIteratorBatcher extends Serializable { - CloseableIterator>> batch(String splitId, DataIterator inputIterator); + CloseableIterator>> batch( + String splitId, DataIterator inputIterator); } diff --git a/flink/v1.14/flink/src/main/java/org/apache/iceberg/flink/source/reader/DataIteratorReaderFunction.java b/flink/v1.14/flink/src/main/java/org/apache/iceberg/flink/source/reader/DataIteratorReaderFunction.java index 5c2c2b4b3ae8..bbf797ef4aa8 100644 --- a/flink/v1.14/flink/src/main/java/org/apache/iceberg/flink/source/reader/DataIteratorReaderFunction.java +++ b/flink/v1.14/flink/src/main/java/org/apache/iceberg/flink/source/reader/DataIteratorReaderFunction.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.flink.source.reader; import org.apache.flink.connector.base.source.reader.RecordsWithSplitIds; @@ -24,9 +23,7 @@ import org.apache.iceberg.flink.source.split.IcebergSourceSplit; import org.apache.iceberg.io.CloseableIterator; -/** - * A {@link ReaderFunction} implementation that uses {@link DataIterator}. - */ +/** A {@link ReaderFunction} implementation that uses {@link DataIterator}. 
*/ public abstract class DataIteratorReaderFunction implements ReaderFunction { private final DataIteratorBatcher batcher; @@ -37,10 +34,10 @@ public DataIteratorReaderFunction(DataIteratorBatcher batcher) { protected abstract DataIterator createDataIterator(IcebergSourceSplit split); @Override - public CloseableIterator>> apply(IcebergSourceSplit split) { + public CloseableIterator>> apply( + IcebergSourceSplit split) { DataIterator inputIterator = createDataIterator(split); inputIterator.seek(split.fileOffset(), split.recordOffset()); return batcher.batch(split.splitId(), inputIterator); } - } diff --git a/flink/v1.14/flink/src/main/java/org/apache/iceberg/flink/source/reader/IcebergSourceReader.java b/flink/v1.14/flink/src/main/java/org/apache/iceberg/flink/source/reader/IcebergSourceReader.java index 44d3ba572dca..db8d2fcd56e3 100644 --- a/flink/v1.14/flink/src/main/java/org/apache/iceberg/flink/source/reader/IcebergSourceReader.java +++ b/flink/v1.14/flink/src/main/java/org/apache/iceberg/flink/source/reader/IcebergSourceReader.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.flink.source.reader; import java.util.Collection; @@ -28,13 +27,12 @@ import org.apache.iceberg.flink.source.split.SplitRequestEvent; import org.apache.iceberg.relocated.com.google.common.collect.Lists; -public class IcebergSourceReader extends - SingleThreadMultiplexSourceReaderBase, T, IcebergSourceSplit, IcebergSourceSplit> { +public class IcebergSourceReader + extends SingleThreadMultiplexSourceReaderBase< + RecordAndPosition, T, IcebergSourceSplit, IcebergSourceSplit> { public IcebergSourceReader( - ReaderFunction readerFunction, - SourceReaderContext context, - ReaderMetricsContext metrics) { + ReaderFunction readerFunction, SourceReaderContext context, ReaderMetricsContext metrics) { super( () -> new IcebergSourceSplitReader<>(readerFunction, context, metrics), new IcebergSourceRecordEmitter<>(), diff --git a/flink/v1.14/flink/src/main/java/org/apache/iceberg/flink/source/reader/IcebergSourceRecordEmitter.java b/flink/v1.14/flink/src/main/java/org/apache/iceberg/flink/source/reader/IcebergSourceRecordEmitter.java index 4e467db92e93..337d9d3c4223 100644 --- a/flink/v1.14/flink/src/main/java/org/apache/iceberg/flink/source/reader/IcebergSourceRecordEmitter.java +++ b/flink/v1.14/flink/src/main/java/org/apache/iceberg/flink/source/reader/IcebergSourceRecordEmitter.java @@ -16,23 +16,20 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.flink.source.reader; import org.apache.flink.api.connector.source.SourceOutput; import org.apache.flink.connector.base.source.reader.RecordEmitter; import org.apache.iceberg.flink.source.split.IcebergSourceSplit; -final class IcebergSourceRecordEmitter implements RecordEmitter, T, IcebergSourceSplit> { +final class IcebergSourceRecordEmitter + implements RecordEmitter, T, IcebergSourceSplit> { - IcebergSourceRecordEmitter() { - } + IcebergSourceRecordEmitter() {} @Override public void emitRecord( - RecordAndPosition element, - SourceOutput output, - IcebergSourceSplit split) { + RecordAndPosition element, SourceOutput output, IcebergSourceSplit split) { output.collect(element.record()); split.updatePosition(element.fileOffset(), element.recordOffset()); } diff --git a/flink/v1.14/flink/src/main/java/org/apache/iceberg/flink/source/reader/IcebergSourceSplitReader.java b/flink/v1.14/flink/src/main/java/org/apache/iceberg/flink/source/reader/IcebergSourceSplitReader.java index 8c6bb90a64c1..8e9e2b296e39 100644 --- a/flink/v1.14/flink/src/main/java/org/apache/iceberg/flink/source/reader/IcebergSourceSplitReader.java +++ b/flink/v1.14/flink/src/main/java/org/apache/iceberg/flink/source/reader/IcebergSourceSplitReader.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.flink.source.reader; import java.io.IOException; @@ -55,18 +54,24 @@ class IcebergSourceSplitReader implements SplitReader, I private IcebergSourceSplit currentSplit; private String currentSplitId; - IcebergSourceSplitReader(ReaderFunction openSplitFunction, - SourceReaderContext context, - ReaderMetricsContext metrics) { + IcebergSourceSplitReader( + ReaderFunction openSplitFunction, + SourceReaderContext context, + ReaderMetricsContext metrics) { this.openSplitFunction = openSplitFunction; this.indexOfSubtask = context.getIndexOfSubtask(); this.splits = new ArrayDeque<>(); - this.assignedSplits = metrics.counter(ReaderMetricsContext.ASSIGNED_SPLITS, Long.class, Unit.COUNT); - this.assignedBytes = metrics.counter(ReaderMetricsContext.ASSIGNED_BYTES, Long.class, Unit.COUNT); - this.finishedSplits = metrics.counter(ReaderMetricsContext.FINISHED_SPLITS, Long.class, Unit.COUNT); - this.finishedBytes = metrics.counter(ReaderMetricsContext.FINISHED_BYTES, Long.class, Unit.COUNT); - this.splitReaderFetchCalls = metrics.counter(ReaderMetricsContext.SPLIT_READER_FETCH_CALLS, Long.class, Unit.COUNT); + this.assignedSplits = + metrics.counter(ReaderMetricsContext.ASSIGNED_SPLITS, Long.class, Unit.COUNT); + this.assignedBytes = + metrics.counter(ReaderMetricsContext.ASSIGNED_BYTES, Long.class, Unit.COUNT); + this.finishedSplits = + metrics.counter(ReaderMetricsContext.FINISHED_SPLITS, Long.class, Unit.COUNT); + this.finishedBytes = + metrics.counter(ReaderMetricsContext.FINISHED_BYTES, Long.class, Unit.COUNT); + this.splitReaderFetchCalls = + metrics.counter(ReaderMetricsContext.SPLIT_READER_FETCH_CALLS, Long.class, Unit.COUNT); } @Override @@ -101,8 +106,8 @@ public RecordsWithSplitIds> fetch() throws IOException { @Override public void handleSplitsChanges(SplitsChange splitsChange) { if (!(splitsChange instanceof SplitsAddition)) { - throw new UnsupportedOperationException(String.format( - "Unsupported split change: %s", splitsChange.getClass())); + throw new UnsupportedOperationException( + String.format("Unsupported split change: %s", splitsChange.getClass())); } LOG.info("Add {} splits to reader", 
splitsChange.splits().size()); @@ -112,8 +117,7 @@ public void handleSplitsChanges(SplitsChange splitsChange) { } @Override - public void wakeUp() { - } + public void wakeUp() {} @Override public void close() throws Exception { @@ -124,15 +128,11 @@ public void close() throws Exception { } private long calculateBytes(IcebergSourceSplit split) { - return split.task().files().stream() - .map(FileScanTask::length) - .reduce(0L, Long::sum); + return split.task().files().stream().map(FileScanTask::length).reduce(0L, Long::sum); } private long calculateBytes(SplitsChange splitsChanges) { - return splitsChanges.splits().stream() - .map(this::calculateBytes) - .reduce(0L, Long::sum); + return splitsChanges.splits().stream().map(this::calculateBytes).reduce(0L, Long::sum); } private ArrayBatchRecords finishSplit() throws IOException { diff --git a/flink/v1.14/flink/src/main/java/org/apache/iceberg/flink/source/reader/ReaderFunction.java b/flink/v1.14/flink/src/main/java/org/apache/iceberg/flink/source/reader/ReaderFunction.java index eff0156229de..1ea91f10b4e7 100644 --- a/flink/v1.14/flink/src/main/java/org/apache/iceberg/flink/source/reader/ReaderFunction.java +++ b/flink/v1.14/flink/src/main/java/org/apache/iceberg/flink/source/reader/ReaderFunction.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.flink.source.reader; import java.io.Serializable; @@ -26,6 +25,7 @@ import org.apache.iceberg.io.CloseableIterator; @FunctionalInterface -public interface ReaderFunction extends Serializable, - Function>>> { -} +public interface ReaderFunction + extends Serializable, + Function< + IcebergSourceSplit, CloseableIterator>>> {} diff --git a/flink/v1.14/flink/src/main/java/org/apache/iceberg/flink/source/reader/ReaderMetricsContext.java b/flink/v1.14/flink/src/main/java/org/apache/iceberg/flink/source/reader/ReaderMetricsContext.java index 1141e6e6771b..15ff3cc8bd63 100644 --- a/flink/v1.14/flink/src/main/java/org/apache/iceberg/flink/source/reader/ReaderMetricsContext.java +++ b/flink/v1.14/flink/src/main/java/org/apache/iceberg/flink/source/reader/ReaderMetricsContext.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
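The record emitter above forwards each record and then advances the split's fileOffset and recordOffset, so a checkpoint taken right afterwards lets a restored reader seek back to the first unread record of the split. A simplified stand-alone sketch of that contract; the types here are stand-ins, not the Flink interfaces.

// Simplified sketch of the position contract between the emitter and the split.
final class SplitPositionSketch {
  static final class SplitPosition {
    int fileOffset;
    long recordOffset;

    void update(int newFileOffset, long newRecordOffset) {
      this.fileOffset = newFileOffset;
      this.recordOffset = newRecordOffset;
    }
  }

  /** Forwards one record, then advances the split position, mirroring the emitter above. */
  static <T> void emit(
      T record,
      int fileOffset,
      long recordOffset,
      java.util.function.Consumer<T> output,
      SplitPosition split) {
    output.accept(record);
    split.update(fileOffset, recordOffset);
  }

  public static void main(String[] args) {
    SplitPosition position = new SplitPosition();
    emit("row-1", 0, 1L, System.out::println, position);
    // A checkpoint taken now would record fileOffset=0, recordOffset=1 for this split.
  }
}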
*/ - package org.apache.iceberg.flink.source.reader; import java.util.concurrent.atomic.AtomicLong; @@ -71,8 +70,10 @@ public Counter counter(String name, Class type, Unit un ValidationException.check(type == Long.class, "'%s' requires Integer type", FINISHED_BYTES); return (Counter) longCounter(finishedBytes::addAndGet, finishedBytes::get); case SPLIT_READER_FETCH_CALLS: - ValidationException.check(type == Long.class, "'%s' requires Integer type", SPLIT_READER_FETCH_CALLS); - return (Counter) longCounter(splitReaderFetchCalls::addAndGet, splitReaderFetchCalls::get); + ValidationException.check( + type == Long.class, "'%s' requires Integer type", SPLIT_READER_FETCH_CALLS); + return (Counter) + longCounter(splitReaderFetchCalls::addAndGet, splitReaderFetchCalls::get); default: throw new IllegalArgumentException(String.format("Unsupported counter: '%s'", name)); } diff --git a/flink/v1.14/flink/src/main/java/org/apache/iceberg/flink/source/reader/RecordAndPosition.java b/flink/v1.14/flink/src/main/java/org/apache/iceberg/flink/source/reader/RecordAndPosition.java index 46538516cbda..6ac92592b6aa 100644 --- a/flink/v1.14/flink/src/main/java/org/apache/iceberg/flink/source/reader/RecordAndPosition.java +++ b/flink/v1.14/flink/src/main/java/org/apache/iceberg/flink/source/reader/RecordAndPosition.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.flink.source.reader; import org.apache.flink.annotation.Internal; @@ -43,8 +42,7 @@ public RecordAndPosition(T record, int fileOffset, long recordOffset) { this.recordOffset = recordOffset; } - public RecordAndPosition() { - } + public RecordAndPosition() {} // ------------------------------------------------------------------------ @@ -77,5 +75,4 @@ public void record(T nextRecord) { public String toString() { return String.format("%s @ %d + %d", record, fileOffset, recordOffset); } - } diff --git a/flink/v1.14/flink/src/main/java/org/apache/iceberg/flink/source/reader/RecordFactory.java b/flink/v1.14/flink/src/main/java/org/apache/iceberg/flink/source/reader/RecordFactory.java index 4134b071622f..ef92e2e6b81f 100644 --- a/flink/v1.14/flink/src/main/java/org/apache/iceberg/flink/source/reader/RecordFactory.java +++ b/flink/v1.14/flink/src/main/java/org/apache/iceberg/flink/source/reader/RecordFactory.java @@ -16,25 +16,19 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.flink.source.reader; import java.io.Serializable; /** - * In FLIP-27 source, SplitReader#fetch() returns a batch of records. - * Since DataIterator for RowData returns an iterator of reused RowData objects, - * RecordFactory is needed to (1) create object array that is recyclable via pool. - * (2) clone RowData element from DataIterator to the batch array. + * In FLIP-27 source, SplitReader#fetch() returns a batch of records. Since DataIterator for RowData + * returns an iterator of reused RowData objects, RecordFactory is needed to (1) create object array + * that is recyclable via pool. (2) clone RowData element from DataIterator to the batch array. 
*/ interface RecordFactory extends Serializable { - /** - * Create a batch of records - */ + /** Create a batch of records */ T[] createBatch(int batchSize); - /** - * Clone record into the specified position of the batch array - */ + /** Clone record into the specified position of the batch array */ void clone(T from, T[] batch, int position); } diff --git a/flink/v1.14/flink/src/main/java/org/apache/iceberg/flink/source/reader/RowDataReaderFunction.java b/flink/v1.14/flink/src/main/java/org/apache/iceberg/flink/source/reader/RowDataReaderFunction.java index cdb460f2e7ab..c747375d2a28 100644 --- a/flink/v1.14/flink/src/main/java/org/apache/iceberg/flink/source/reader/RowDataReaderFunction.java +++ b/flink/v1.14/flink/src/main/java/org/apache/iceberg/flink/source/reader/RowDataReaderFunction.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.flink.source.reader; import org.apache.flink.configuration.ReadableConfig; @@ -39,10 +38,18 @@ public class RowDataReaderFunction extends DataIteratorReaderFunction { private final EncryptionManager encryption; public RowDataReaderFunction( - ReadableConfig config, Schema tableSchema, Schema projectedSchema, - String nameMapping, boolean caseSensitive, FileIO io, EncryptionManager encryption) { - super(new ArrayPoolDataIteratorBatcher<>(config, new RowDataRecordFactory( - FlinkSchemaUtil.convert(readSchema(tableSchema, projectedSchema))))); + ReadableConfig config, + Schema tableSchema, + Schema projectedSchema, + String nameMapping, + boolean caseSensitive, + FileIO io, + EncryptionManager encryption) { + super( + new ArrayPoolDataIteratorBatcher<>( + config, + new RowDataRecordFactory( + FlinkSchemaUtil.convert(readSchema(tableSchema, projectedSchema))))); this.tableSchema = tableSchema; this.readSchema = readSchema(tableSchema, projectedSchema); this.nameMapping = nameMapping; @@ -54,17 +61,14 @@ public RowDataReaderFunction( @Override public DataIterator createDataIterator(IcebergSourceSplit split) { return new DataIterator<>( - new RowDataFileScanTaskReader( - tableSchema, - readSchema, - nameMapping, - caseSensitive), - split.task(), io, encryption); + new RowDataFileScanTaskReader(tableSchema, readSchema, nameMapping, caseSensitive), + split.task(), + io, + encryption); } private static Schema readSchema(Schema tableSchema, Schema projectedSchema) { Preconditions.checkNotNull(tableSchema, "Table schema can't be null"); return projectedSchema == null ? tableSchema : projectedSchema; } - } diff --git a/flink/v1.14/flink/src/main/java/org/apache/iceberg/flink/source/reader/RowDataRecordFactory.java b/flink/v1.14/flink/src/main/java/org/apache/iceberg/flink/source/reader/RowDataRecordFactory.java index a91c7b45ed61..1e265b2663ce 100644 --- a/flink/v1.14/flink/src/main/java/org/apache/iceberg/flink/source/reader/RowDataRecordFactory.java +++ b/flink/v1.14/flink/src/main/java/org/apache/iceberg/flink/source/reader/RowDataRecordFactory.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
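The RecordFactory comments above compress the key point: the row iterator hands back one reused mutable object, so every element must be copied into its slot of the batch array before the iterator advances, otherwise every slot would alias the same instance. Below is a self-contained sketch with a hypothetical MutableRecord type; the real RowDataRecordFactory appears to perform the copy through a Flink TypeSerializer for RowData.

// Sketch of a RecordFactory for a reused mutable record type. Names are illustrative.
final class RecordFactorySketch {
  static final class MutableRecord {
    int id;
    String data;
  }

  interface RecordFactory<T> {
    T[] createBatch(int batchSize);

    void clone(T from, T[] batch, int position);
  }

  static final class MutableRecordFactory implements RecordFactory<MutableRecord> {
    @Override
    public MutableRecord[] createBatch(int batchSize) {
      return new MutableRecord[batchSize];
    }

    @Override
    public void clone(MutableRecord from, MutableRecord[] batch, int position) {
      MutableRecord copy = new MutableRecord();
      copy.id = from.id; // copy eagerly: the source object will be mutated
      copy.data = from.data; // by the iterator before the batch is consumed
      batch[position] = copy;
    }
  }
}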
*/ - package org.apache.iceberg.flink.source.reader; import org.apache.flink.api.common.typeutils.TypeSerializer; diff --git a/flink/v1.14/flink/src/main/java/org/apache/iceberg/flink/source/split/IcebergSourceSplit.java b/flink/v1.14/flink/src/main/java/org/apache/iceberg/flink/source/split/IcebergSourceSplit.java index b46096af0e67..35f8ade9843d 100644 --- a/flink/v1.14/flink/src/main/java/org/apache/iceberg/flink/source/split/IcebergSourceSplit.java +++ b/flink/v1.14/flink/src/main/java/org/apache/iceberg/flink/source/split/IcebergSourceSplit.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.flink.source.split; import java.io.IOException; @@ -43,8 +42,7 @@ public class IcebergSourceSplit implements SourceSplit, Serializable { // The splits are frequently serialized into checkpoints. // Caching the byte representation makes repeated serialization cheap. - @Nullable - private transient byte[] serializedBytesCache; + @Nullable private transient byte[] serializedBytesCache; private IcebergSourceSplit(CombinedScanTask task, int fileOffset, long recordOffset) { this.task = task; @@ -75,9 +73,7 @@ public long recordOffset() { @Override public String splitId() { - return MoreObjects.toStringHelper(this) - .add("files", toString(task.files())) - .toString(); + return MoreObjects.toStringHelper(this).add("files", toString(task.files())).toString(); } public void updatePosition(int newFileOffset, long newRecordOffset) { @@ -97,12 +93,16 @@ public String toString() { } private String toString(Collection files) { - return Iterables.toString(files.stream().map(fileScanTask -> - MoreObjects.toStringHelper(fileScanTask) - .add("file", fileScanTask.file().path().toString()) - .add("start", fileScanTask.start()) - .add("length", fileScanTask.length()) - .toString()).collect(Collectors.toList())); + return Iterables.toString( + files.stream() + .map( + fileScanTask -> + MoreObjects.toStringHelper(fileScanTask) + .add("file", fileScanTask.file().path().toString()) + .add("start", fileScanTask.start()) + .add("length", fileScanTask.length()) + .toString()) + .collect(Collectors.toList())); } byte[] serializeV1() throws IOException { @@ -114,7 +114,8 @@ byte[] serializeV1() throws IOException { static IcebergSourceSplit deserializeV1(byte[] serialized) throws IOException { try { - return InstantiationUtil.deserializeObject(serialized, IcebergSourceSplit.class.getClassLoader()); + return InstantiationUtil.deserializeObject( + serialized, IcebergSourceSplit.class.getClassLoader()); } catch (ClassNotFoundException e) { throw new RuntimeException("Failed to deserialize the split.", e); } diff --git a/flink/v1.14/flink/src/main/java/org/apache/iceberg/flink/source/split/IcebergSourceSplitSerializer.java b/flink/v1.14/flink/src/main/java/org/apache/iceberg/flink/source/split/IcebergSourceSplitSerializer.java index 9e32af5429b9..ee0f364e17d6 100644 --- a/flink/v1.14/flink/src/main/java/org/apache/iceberg/flink/source/split/IcebergSourceSplitSerializer.java +++ b/flink/v1.14/flink/src/main/java/org/apache/iceberg/flink/source/split/IcebergSourceSplitSerializer.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.flink.source.split; import java.io.IOException; @@ -24,9 +23,8 @@ import org.apache.flink.core.io.SimpleVersionedSerializer; /** - * TODO: use Java serialization for now. - * Will switch to more stable serializer from - * issue-1698. 
+ * TODO: use Java serialization for now. Will switch to more stable serializer from issue-1698. */ @Internal public class IcebergSourceSplitSerializer implements SimpleVersionedSerializer { @@ -49,8 +47,11 @@ public IcebergSourceSplit deserialize(int version, byte[] serialized) throws IOE case 1: return IcebergSourceSplit.deserializeV1(serialized); default: - throw new IOException(String.format("Failed to deserialize IcebergSourceSplit. " + - "Encountered unsupported version: %d. Supported version are [1]", version)); + throw new IOException( + String.format( + "Failed to deserialize IcebergSourceSplit. " + + "Encountered unsupported version: %d. Supported version are [1]", + version)); } } } diff --git a/flink/v1.14/flink/src/main/java/org/apache/iceberg/flink/source/split/IcebergSourceSplitState.java b/flink/v1.14/flink/src/main/java/org/apache/iceberg/flink/source/split/IcebergSourceSplitState.java index b08218b93ce9..d9061e049e00 100644 --- a/flink/v1.14/flink/src/main/java/org/apache/iceberg/flink/source/split/IcebergSourceSplitState.java +++ b/flink/v1.14/flink/src/main/java/org/apache/iceberg/flink/source/split/IcebergSourceSplitState.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.flink.source.split; public class IcebergSourceSplitState { diff --git a/flink/v1.14/flink/src/main/java/org/apache/iceberg/flink/source/split/IcebergSourceSplitStatus.java b/flink/v1.14/flink/src/main/java/org/apache/iceberg/flink/source/split/IcebergSourceSplitStatus.java index 78ce70b22f7a..d4a84a165e1a 100644 --- a/flink/v1.14/flink/src/main/java/org/apache/iceberg/flink/source/split/IcebergSourceSplitStatus.java +++ b/flink/v1.14/flink/src/main/java/org/apache/iceberg/flink/source/split/IcebergSourceSplitStatus.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.flink.source.split; public enum IcebergSourceSplitStatus { diff --git a/flink/v1.14/flink/src/main/java/org/apache/iceberg/flink/source/split/SplitRequestEvent.java b/flink/v1.14/flink/src/main/java/org/apache/iceberg/flink/source/split/SplitRequestEvent.java index 7a7610cc1978..eabd757aa638 100644 --- a/flink/v1.14/flink/src/main/java/org/apache/iceberg/flink/source/split/SplitRequestEvent.java +++ b/flink/v1.14/flink/src/main/java/org/apache/iceberg/flink/source/split/SplitRequestEvent.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.flink.source.split; import java.util.Collection; @@ -24,9 +23,7 @@ import org.apache.flink.annotation.Internal; import org.apache.flink.api.connector.source.SourceEvent; -/** - * We can remove this class once FLINK-21364 is resolved. - */ +/** We can remove this class once FLINK-21364 is resolved. */ @Internal public class SplitRequestEvent implements SourceEvent { private static final long serialVersionUID = 1L; diff --git a/flink/v1.14/flink/src/main/java/org/apache/iceberg/flink/util/FlinkCompatibilityUtil.java b/flink/v1.14/flink/src/main/java/org/apache/iceberg/flink/util/FlinkCompatibilityUtil.java index 274d2a8d17a0..2c5c587f4ebf 100644 --- a/flink/v1.14/flink/src/main/java/org/apache/iceberg/flink/util/FlinkCompatibilityUtil.java +++ b/flink/v1.14/flink/src/main/java/org/apache/iceberg/flink/util/FlinkCompatibilityUtil.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.flink.util; import org.apache.flink.api.common.typeinfo.TypeInformation; @@ -26,14 +25,12 @@ import org.apache.flink.table.types.logical.RowType; /** - * This is a small util class that try to hide calls to Flink - * Internal or PublicEvolve interfaces as Flink can change - * those APIs during minor version release. + * This is a small util class that try to hide calls to Flink Internal or PublicEvolve interfaces as + * Flink can change those APIs during minor version release. */ public class FlinkCompatibilityUtil { - private FlinkCompatibilityUtil() { - } + private FlinkCompatibilityUtil() {} public static TypeInformation toTypeInfo(RowType rowType) { return InternalTypeInfo.of(rowType); diff --git a/flink/v1.14/flink/src/test/java/org/apache/iceberg/flink/FlinkCatalogTestBase.java b/flink/v1.14/flink/src/test/java/org/apache/iceberg/flink/FlinkCatalogTestBase.java index 3c5f25e9d876..d4da736dcd83 100644 --- a/flink/v1.14/flink/src/test/java/org/apache/iceberg/flink/FlinkCatalogTestBase.java +++ b/flink/v1.14/flink/src/test/java/org/apache/iceberg/flink/FlinkCatalogTestBase.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.flink; import java.io.IOException; @@ -74,8 +73,7 @@ public static Iterable parameters() { return Lists.newArrayList( new Object[] {"testhive", Namespace.empty()}, new Object[] {"testhadoop", Namespace.empty()}, - new Object[] {"testhadoop_basenamespace", Namespace.of("l0", "l1")} - ); + new Object[] {"testhadoop_basenamespace", Namespace.of("l0", "l1")}); } protected final String catalogName; @@ -92,9 +90,10 @@ public FlinkCatalogTestBase(String catalogName, Namespace baseNamespace) { this.catalogName = catalogName; this.baseNamespace = baseNamespace; this.isHadoopCatalog = catalogName.startsWith("testhadoop"); - this.validationCatalog = isHadoopCatalog ? - new HadoopCatalog(hiveConf, "file:" + hadoopWarehouse.getRoot()) : - catalog; + this.validationCatalog = + isHadoopCatalog + ? new HadoopCatalog(hiveConf, "file:" + hadoopWarehouse.getRoot()) + : catalog; this.validationNamespaceCatalog = (SupportsNamespaces) validationCatalog; config.put("type", "iceberg"); @@ -110,7 +109,8 @@ public FlinkCatalogTestBase(String catalogName, Namespace baseNamespace) { config.put(CatalogProperties.WAREHOUSE_LOCATION, String.format("file://%s", warehouseRoot())); this.flinkDatabase = catalogName + "." 
+ DATABASE; - this.icebergNamespace = Namespace.of(ArrayUtils.concat(baseNamespace.levels(), new String[] {DATABASE})); + this.icebergNamespace = + Namespace.of(ArrayUtils.concat(baseNamespace.levels(), new String[] {DATABASE})); } protected String warehouseRoot() { @@ -139,8 +139,14 @@ static String toWithClause(Map props) { if (propCount > 0) { builder.append(","); } - builder.append("'").append(entry.getKey()).append("'").append("=") - .append("'").append(entry.getValue()).append("'"); + builder + .append("'") + .append(entry.getKey()) + .append("'") + .append("=") + .append("'") + .append(entry.getValue()) + .append("'"); propCount++; } builder.append(")"); diff --git a/flink/v1.14/flink/src/test/java/org/apache/iceberg/flink/FlinkTestBase.java b/flink/v1.14/flink/src/test/java/org/apache/iceberg/flink/FlinkTestBase.java index d19ff3467eca..95471ac88257 100644 --- a/flink/v1.14/flink/src/test/java/org/apache/iceberg/flink/FlinkTestBase.java +++ b/flink/v1.14/flink/src/test/java/org/apache/iceberg/flink/FlinkTestBase.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.flink; import java.util.List; @@ -45,8 +44,7 @@ public abstract class FlinkTestBase extends TestBaseUtils { public static MiniClusterWithClientResource miniClusterResource = MiniClusterResource.createWithClassloaderCheckDisabled(); - @ClassRule - public static final TemporaryFolder TEMPORARY_FOLDER = new TemporaryFolder(); + @ClassRule public static final TemporaryFolder TEMPORARY_FOLDER = new TemporaryFolder(); private static TestHiveMetastore metastore = null; protected static HiveConf hiveConf = null; @@ -59,8 +57,10 @@ public static void startMetastore() { FlinkTestBase.metastore = new TestHiveMetastore(); metastore.start(); FlinkTestBase.hiveConf = metastore.hiveConf(); - FlinkTestBase.catalog = (HiveCatalog) - CatalogUtil.loadCatalog(HiveCatalog.class.getName(), "hive", ImmutableMap.of(), hiveConf); + FlinkTestBase.catalog = + (HiveCatalog) + CatalogUtil.loadCatalog( + HiveCatalog.class.getName(), "hive", ImmutableMap.of(), hiveConf); } @AfterClass @@ -73,13 +73,12 @@ protected TableEnvironment getTableEnv() { if (tEnv == null) { synchronized (this) { if (tEnv == null) { - EnvironmentSettings settings = EnvironmentSettings - .newInstance() - .inBatchMode() - .build(); + EnvironmentSettings settings = EnvironmentSettings.newInstance().inBatchMode().build(); TableEnvironment env = TableEnvironment.create(settings); - env.getConfig().getConfiguration().set(FlinkConfigOptions.TABLE_EXEC_ICEBERG_INFER_SOURCE_PARALLELISM, false); + env.getConfig() + .getConfiguration() + .set(FlinkConfigOptions.TABLE_EXEC_ICEBERG_INFER_SOURCE_PARALLELISM, false); tEnv = env; } } @@ -105,9 +104,7 @@ protected List sql(String query, Object... 
args) { } protected void assertSameElements(Iterable expected, Iterable actual) { - Assertions.assertThat(actual) - .isNotNull() - .containsExactlyInAnyOrderElementsOf(expected); + Assertions.assertThat(actual).isNotNull().containsExactlyInAnyOrderElementsOf(expected); } protected void assertSameElements(String message, Iterable expected, Iterable actual) { diff --git a/flink/v1.14/flink/src/test/java/org/apache/iceberg/flink/HadoopTableResource.java b/flink/v1.14/flink/src/test/java/org/apache/iceberg/flink/HadoopTableResource.java index bc4e209a46ba..a4338a881bc8 100644 --- a/flink/v1.14/flink/src/test/java/org/apache/iceberg/flink/HadoopTableResource.java +++ b/flink/v1.14/flink/src/test/java/org/apache/iceberg/flink/HadoopTableResource.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.flink; import java.io.File; @@ -41,12 +40,17 @@ public class HadoopTableResource extends ExternalResource { private TableLoader tableLoader; private Table table; - public HadoopTableResource(TemporaryFolder temporaryFolder, String database, String tableName, Schema schema) { + public HadoopTableResource( + TemporaryFolder temporaryFolder, String database, String tableName, Schema schema) { this(temporaryFolder, database, tableName, schema, null); } - public HadoopTableResource(TemporaryFolder temporaryFolder, String database, String tableName, - Schema schema, PartitionSpec partitionSpec) { + public HadoopTableResource( + TemporaryFolder temporaryFolder, + String database, + String tableName, + Schema schema, + PartitionSpec partitionSpec) { this.temporaryFolder = temporaryFolder; this.database = database; this.tableName = tableName; @@ -67,7 +71,8 @@ protected void before() throws Throwable { if (partitionSpec == null) { this.table = catalog.createTable(TableIdentifier.of(database, tableName), schema); } else { - this.table = catalog.createTable(TableIdentifier.of(database, tableName), schema, partitionSpec); + this.table = + catalog.createTable(TableIdentifier.of(database, tableName), schema, partitionSpec); } tableLoader.open(); } diff --git a/flink/v1.14/flink/src/test/java/org/apache/iceberg/flink/MiniClusterResource.java b/flink/v1.14/flink/src/test/java/org/apache/iceberg/flink/MiniClusterResource.java index 9dfa1acf2719..45af9241b743 100644 --- a/flink/v1.14/flink/src/test/java/org/apache/iceberg/flink/MiniClusterResource.java +++ b/flink/v1.14/flink/src/test/java/org/apache/iceberg/flink/MiniClusterResource.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.flink; import org.apache.flink.configuration.Configuration; @@ -29,20 +28,18 @@ public class MiniClusterResource { private static final int DEFAULT_TM_NUM = 1; private static final int DEFAULT_PARALLELISM = 4; - public static final Configuration DISABLE_CLASSLOADER_CHECK_CONFIG = new Configuration() - // disable classloader check as Avro may cache class/object in the serializers. - .set(CoreOptions.CHECK_LEAKED_CLASSLOADER, false); - - private MiniClusterResource() { + public static final Configuration DISABLE_CLASSLOADER_CHECK_CONFIG = + new Configuration() + // disable classloader check as Avro may cache class/object in the serializers. 
+ .set(CoreOptions.CHECK_LEAKED_CLASSLOADER, false); - } + private MiniClusterResource() {} /** - * It will start a mini cluster with classloader.check-leaked-classloader=false, - * so that we won't break the unit tests because of the class loader leak issue. - * In our iceberg integration tests, there're some that will assert the results - * after finished the flink jobs, so actually we may access the class loader - * that has been closed by the flink task managers if we enable the switch + * It will start a mini cluster with classloader.check-leaked-classloader=false, so that we won't + * break the unit tests because of the class loader leak issue. In our iceberg integration tests, + * there're some that will assert the results after finished the flink jobs, so actually we may + * access the class loader that has been closed by the flink task managers if we enable the switch * classloader.check-leaked-classloader by default. */ public static MiniClusterWithClientResource createWithClassloaderCheckDisabled() { @@ -53,5 +50,4 @@ public static MiniClusterWithClientResource createWithClassloaderCheckDisabled() .setConfiguration(DISABLE_CLASSLOADER_CHECK_CONFIG) .build()); } - } diff --git a/flink/v1.14/flink/src/test/java/org/apache/iceberg/flink/RowDataConverter.java b/flink/v1.14/flink/src/test/java/org/apache/iceberg/flink/RowDataConverter.java index 59306d638ee2..c73fa1e4bc97 100644 --- a/flink/v1.14/flink/src/test/java/org/apache/iceberg/flink/RowDataConverter.java +++ b/flink/v1.14/flink/src/test/java/org/apache/iceberg/flink/RowDataConverter.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.flink; import java.math.BigDecimal; @@ -50,8 +49,7 @@ public class RowDataConverter { private static final OffsetDateTime EPOCH = Instant.ofEpochSecond(0).atOffset(ZoneOffset.UTC); private static final LocalDate EPOCH_DAY = EPOCH.toLocalDate(); - private RowDataConverter() { - } + private RowDataConverter() {} public static RowData convert(Schema iSchema, Record record) { return convert(iSchema.asStruct(), record); @@ -117,11 +115,14 @@ private static Object convert(Type type, Object object) { return bb.array(); case BINARY: ByteBuffer buffer = (ByteBuffer) object; - return Arrays.copyOfRange(buffer.array(), buffer.arrayOffset() + buffer.position(), + return Arrays.copyOfRange( + buffer.array(), + buffer.arrayOffset() + buffer.position(), buffer.arrayOffset() + buffer.remaining()); case DECIMAL: Types.DecimalType decimalType = (Types.DecimalType) type; - return DecimalData.fromBigDecimal((BigDecimal) object, decimalType.precision(), decimalType.scale()); + return DecimalData.fromBigDecimal( + (BigDecimal) object, decimalType.precision(), decimalType.scale()); case STRUCT: return convert(type.asStructType(), (Record) object); case LIST: @@ -137,8 +138,7 @@ private static Object convert(Type type, Object object) { for (Map.Entry entry : map.entrySet()) { convertedMap.put( convert(type.asMapType().keyType(), entry.getKey()), - convert(type.asMapType().valueType(), entry.getValue()) - ); + convert(type.asMapType().valueType(), entry.getValue())); } return new GenericMapData(convertedMap); default: diff --git a/flink/v1.14/flink/src/test/java/org/apache/iceberg/flink/SimpleDataUtil.java b/flink/v1.14/flink/src/test/java/org/apache/iceberg/flink/SimpleDataUtil.java index f83703eb7cfe..74f3638c8d5a 100644 --- a/flink/v1.14/flink/src/test/java/org/apache/iceberg/flink/SimpleDataUtil.java +++ 
b/flink/v1.14/flink/src/test/java/org/apache/iceberg/flink/SimpleDataUtil.java @@ -16,9 +16,10 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.flink; +import static org.apache.iceberg.hadoop.HadoopOutputFile.fromPath; + import java.io.IOException; import java.time.Duration; import java.util.Collections; @@ -68,28 +69,24 @@ import org.apache.iceberg.util.StructLikeWrapper; import org.junit.Assert; -import static org.apache.iceberg.hadoop.HadoopOutputFile.fromPath; - public class SimpleDataUtil { - private SimpleDataUtil() { - } + private SimpleDataUtil() {} - public static final Schema SCHEMA = new Schema( - Types.NestedField.optional(1, "id", Types.IntegerType.get()), - Types.NestedField.optional(2, "data", Types.StringType.get()) - ); + public static final Schema SCHEMA = + new Schema( + Types.NestedField.optional(1, "id", Types.IntegerType.get()), + Types.NestedField.optional(2, "data", Types.StringType.get())); - public static final TableSchema FLINK_SCHEMA = TableSchema.builder() - .field("id", DataTypes.INT()) - .field("data", DataTypes.STRING()) - .build(); + public static final TableSchema FLINK_SCHEMA = + TableSchema.builder().field("id", DataTypes.INT()).field("data", DataTypes.STRING()).build(); public static final RowType ROW_TYPE = (RowType) FLINK_SCHEMA.toRowDataType().getLogicalType(); public static final Record RECORD = GenericRecord.create(SCHEMA); - public static Table createTable(String path, Map properties, boolean partitioned) { + public static Table createTable( + String path, Map properties, boolean partitioned) { PartitionSpec spec; if (partitioned) { spec = PartitionSpec.builderFor(SCHEMA).identity("data").build(); @@ -126,8 +123,13 @@ public static RowData createUpdateAfter(Integer id, String data) { return GenericRowData.ofKind(RowKind.UPDATE_AFTER, id, StringData.fromString(data)); } - public static DataFile writeFile(Schema schema, PartitionSpec spec, Configuration conf, - String location, String filename, List rows) + public static DataFile writeFile( + Schema schema, + PartitionSpec spec, + Configuration conf, + String location, + String filename, + List rows) throws IOException { Path path = new Path(location, filename); FileFormat fileFormat = FileFormat.fromFileName(filename); @@ -148,27 +150,38 @@ public static DataFile writeFile(Schema schema, PartitionSpec spec, Configuratio .build(); } - public static DeleteFile writeEqDeleteFile(Table table, FileFormat format, String tablePath, String filename, - FileAppenderFactory appenderFactory, - List deletes) throws IOException { + public static DeleteFile writeEqDeleteFile( + Table table, + FileFormat format, + String tablePath, + String filename, + FileAppenderFactory appenderFactory, + List deletes) + throws IOException { EncryptedOutputFile outputFile = table.encryption().encrypt(fromPath(new Path(tablePath, filename), new Configuration())); - EqualityDeleteWriter eqWriter = appenderFactory.newEqDeleteWriter(outputFile, format, null); + EqualityDeleteWriter eqWriter = + appenderFactory.newEqDeleteWriter(outputFile, format, null); try (EqualityDeleteWriter writer = eqWriter) { writer.deleteAll(deletes); } return eqWriter.toDeleteFile(); } - public static DeleteFile writePosDeleteFile(Table table, FileFormat format, String tablePath, - String filename, - FileAppenderFactory appenderFactory, - List> positions) throws IOException { + public static DeleteFile writePosDeleteFile( + Table table, + FileFormat format, + String tablePath, + String 
filename, + FileAppenderFactory appenderFactory, + List> positions) + throws IOException { EncryptedOutputFile outputFile = table.encryption().encrypt(fromPath(new Path(tablePath, filename), new Configuration())); - PositionDeleteWriter posWriter = appenderFactory.newPosDeleteWriter(outputFile, format, null); + PositionDeleteWriter posWriter = + appenderFactory.newPosDeleteWriter(outputFile, format, null); try (PositionDeleteWriter writer = posWriter) { for (Pair p : positions) { writer.delete(p.first(), p.second()); @@ -195,9 +208,7 @@ public static void assertTableRows(Table table, List expected) throws I assertTableRecords(table, convertToRecords(expected)); } - /** - * Get all rows for a table - */ + /** Get all rows for a table */ public static List tableRecords(Table table) throws IOException { table.refresh(); List records = Lists.newArrayList(); @@ -221,7 +232,8 @@ private static boolean equalsRecords(List expected, List actual, return expectedSet.equals(actualSet); } - private static void assertRecordsEqual(List expected, List actual, Schema schema) { + private static void assertRecordsEqual( + List expected, List actual, Schema schema) { Assert.assertEquals(expected.size(), actual.size()); Types.StructType type = schema.asStruct(); StructLikeSet expectedSet = StructLikeSet.create(type); @@ -232,12 +244,12 @@ private static void assertRecordsEqual(List expected, List actua } /** - * Assert table contains the expected list of records after - * waiting up to {@code maxCheckCount} with {@code checkInterval} + * Assert table contains the expected list of records after waiting up to {@code maxCheckCount} + * with {@code checkInterval} */ public static void assertTableRecords( - Table table, List expected, Duration checkInterval, int maxCheckCount) - throws IOException, InterruptedException { + Table table, List expected, Duration checkInterval, int maxCheckCount) + throws IOException, InterruptedException { for (int i = 0; i < maxCheckCount; ++i) { if (equalsRecords(expected, tableRecords(table), table.schema())) { break; @@ -267,7 +279,8 @@ public static void assertTableRecords(Table table, List expected) throws } } - public static void assertTableRecords(String tablePath, List expected) throws IOException { + public static void assertTableRecords(String tablePath, List expected) + throws IOException { Preconditions.checkArgument(expected != null, "expected records shouldn't be null"); assertTableRecords(new HadoopTables().load(tablePath), expected); } @@ -282,14 +295,15 @@ public static StructLikeSet actualRowSet(Table table, String... columns) throws return actualRowSet(table, null, columns); } - public static StructLikeSet actualRowSet(Table table, Long snapshotId, String... columns) throws IOException { + public static StructLikeSet actualRowSet(Table table, Long snapshotId, String... columns) + throws IOException { table.refresh(); StructLikeSet set = StructLikeSet.create(table.schema().asStruct()); - try (CloseableIterable reader = IcebergGenerics - .read(table) - .useSnapshot(snapshotId == null ? table.currentSnapshot().snapshotId() : snapshotId) - .select(columns) - .build()) { + try (CloseableIterable reader = + IcebergGenerics.read(table) + .useSnapshot(snapshotId == null ? 
table.currentSnapshot().snapshotId() : snapshotId) + .select(columns) + .build()) { reader.forEach(set::add); } return set; @@ -301,16 +315,14 @@ public static List partitionDataFiles(Table table, Map Types.StructType partitionType = table.spec().partitionType(); Record partitionRecord = GenericRecord.create(partitionType).copy(partitionValues); - StructLikeWrapper expectedWrapper = StructLikeWrapper - .forType(partitionType) - .set(partitionRecord); + StructLikeWrapper expectedWrapper = + StructLikeWrapper.forType(partitionType).set(partitionRecord); List dataFiles = Lists.newArrayList(); try (CloseableIterable fileScanTasks = table.newScan().planFiles()) { for (FileScanTask scanTask : fileScanTasks) { - StructLikeWrapper wrapper = StructLikeWrapper - .forType(partitionType) - .set(scanTask.file().partition()); + StructLikeWrapper wrapper = + StructLikeWrapper.forType(partitionType).set(scanTask.file().partition()); if (expectedWrapper.equals(wrapper)) { dataFiles.add(scanTask.file()); @@ -336,7 +348,9 @@ public static Map> snapshotToDataFiles(Table table) throws tableScan = tableScan.useSnapshot(current.snapshotId()); } try (CloseableIterable scanTasks = tableScan.planFiles()) { - result.put(current.snapshotId(), ImmutableList.copyOf(Iterables.transform(scanTasks, FileScanTask::file))); + result.put( + current.snapshotId(), + ImmutableList.copyOf(Iterables.transform(scanTasks, FileScanTask::file))); } // Continue to traverse the parent snapshot if exists. @@ -353,13 +367,14 @@ public static List matchingPartitions( List dataFiles, PartitionSpec partitionSpec, Map partitionValues) { Types.StructType partitionType = partitionSpec.partitionType(); Record partitionRecord = GenericRecord.create(partitionType).copy(partitionValues); - StructLikeWrapper expected = StructLikeWrapper - .forType(partitionType) - .set(partitionRecord); - return dataFiles.stream().filter(df -> { - StructLikeWrapper wrapper = StructLikeWrapper.forType(partitionType).set(df.partition()); - return wrapper.equals(expected); - }).collect(Collectors.toList()); + StructLikeWrapper expected = StructLikeWrapper.forType(partitionType).set(partitionRecord); + return dataFiles.stream() + .filter( + df -> { + StructLikeWrapper wrapper = + StructLikeWrapper.forType(partitionType).set(df.partition()); + return wrapper.equals(expected); + }) + .collect(Collectors.toList()); } - } diff --git a/flink/v1.14/flink/src/test/java/org/apache/iceberg/flink/TestCatalogTableLoader.java b/flink/v1.14/flink/src/test/java/org/apache/iceberg/flink/TestCatalogTableLoader.java index 6ecb169b44d7..e77c62c3849d 100644 --- a/flink/v1.14/flink/src/test/java/org/apache/iceberg/flink/TestCatalogTableLoader.java +++ b/flink/v1.14/flink/src/test/java/org/apache/iceberg/flink/TestCatalogTableLoader.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.flink; import java.io.ByteArrayInputStream; @@ -43,14 +42,13 @@ import org.junit.BeforeClass; import org.junit.Test; -/** - * Test for {@link CatalogLoader} and {@link TableLoader}. - */ +/** Test for {@link CatalogLoader} and {@link TableLoader}. 
*/ public class TestCatalogTableLoader extends FlinkTestBase { private static File warehouse = null; private static final TableIdentifier IDENTIFIER = TableIdentifier.of("default", "my_table"); - private static final Schema SCHEMA = new Schema(Types.NestedField.required(1, "f1", Types.StringType.get())); + private static final Schema SCHEMA = + new Schema(Types.NestedField.required(1, "f1", Types.StringType.get())); @BeforeClass public static void createWarehouse() throws IOException { @@ -95,12 +93,14 @@ public void testHiveCatalogTableLoader() throws IOException, ClassNotFoundExcept validateTableLoader(TableLoader.fromCatalog(catalogLoader, IDENTIFIER)); } - private static void validateCatalogLoader(CatalogLoader loader) throws IOException, ClassNotFoundException { + private static void validateCatalogLoader(CatalogLoader loader) + throws IOException, ClassNotFoundException { Table table = javaSerAndDeSer(loader).loadCatalog().createTable(IDENTIFIER, SCHEMA); validateHadoopConf(table); } - private static void validateTableLoader(TableLoader loader) throws IOException, ClassNotFoundException { + private static void validateTableLoader(TableLoader loader) + throws IOException, ClassNotFoundException { TableLoader copied = javaSerAndDeSer(loader); copied.open(); try { @@ -112,7 +112,9 @@ private static void validateTableLoader(TableLoader loader) throws IOException, private static void validateHadoopConf(Table table) { FileIO io = table.io(); - Assertions.assertThat(io).as("FileIO should be a HadoopFileIO").isInstanceOf(HadoopFileIO.class); + Assertions.assertThat(io) + .as("FileIO should be a HadoopFileIO") + .isInstanceOf(HadoopFileIO.class); HadoopFileIO hadoopIO = (HadoopFileIO) io; Assert.assertEquals("my_value", hadoopIO.conf().get("my_key")); } @@ -124,7 +126,8 @@ private static T javaSerAndDeSer(T object) throws IOException, ClassNotFound out.writeObject(object); } - try (ObjectInputStream in = new ObjectInputStream(new ByteArrayInputStream(bytes.toByteArray()))) { + try (ObjectInputStream in = + new ObjectInputStream(new ByteArrayInputStream(bytes.toByteArray()))) { return (T) in.readObject(); } } diff --git a/flink/v1.14/flink/src/test/java/org/apache/iceberg/flink/TestChangeLogTable.java b/flink/v1.14/flink/src/test/java/org/apache/iceberg/flink/TestChangeLogTable.java index 5c04c855149f..9987a16c7682 100644 --- a/flink/v1.14/flink/src/test/java/org/apache/iceberg/flink/TestChangeLogTable.java +++ b/flink/v1.14/flink/src/test/java/org/apache/iceberg/flink/TestChangeLogTable.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.flink; import java.io.File; @@ -49,8 +48,9 @@ import org.junit.runners.Parameterized; /** - * In this test case, we mainly cover the impact of primary key selection, multiple operations within a single - * transaction, and multiple operations between different txn on the correctness of the data. + * In this test case, we mainly cover the impact of primary key selection, multiple operations + * within a single transaction, and multiple operations between different txn on the correctness of + * the data. 
*/ @RunWith(Parameterized.class) public class TestChangeLogTable extends ChangeLogTableTestBase { @@ -66,10 +66,7 @@ public class TestChangeLogTable extends ChangeLogTableTestBase { @Parameterized.Parameters(name = "PartitionedTable={0}") public static Iterable parameters() { - return ImmutableList.of( - new Object[] {true}, - new Object[] {false} - ); + return ImmutableList.of(new Object[] {true}, new Object[] {false}); } public TestChangeLogTable(boolean partitioned) { @@ -85,7 +82,8 @@ public static void createWarehouse() throws IOException { @Before public void before() { - sql("CREATE CATALOG %s WITH ('type'='iceberg', 'catalog-type'='hadoop', 'warehouse'='%s')", + sql( + "CREATE CATALOG %s WITH ('type'='iceberg', 'catalog-type'='hadoop', 'warehouse'='%s')", CATALOG_NAME, warehouse); sql("USE CATALOG %s", CATALOG_NAME); sql("CREATE DATABASE %s", DATABASE_NAME); @@ -103,137 +101,112 @@ public void clean() { @Test public void testSqlChangeLogOnIdKey() throws Exception { - List> inputRowsPerCheckpoint = ImmutableList.of( - ImmutableList.of( - insertRow(1, "aaa"), - deleteRow(1, "aaa"), - insertRow(1, "bbb"), - insertRow(2, "aaa"), - deleteRow(2, "aaa"), - insertRow(2, "bbb") - ), + List> inputRowsPerCheckpoint = ImmutableList.of( - updateBeforeRow(2, "bbb"), - updateAfterRow(2, "ccc"), - deleteRow(2, "ccc"), - insertRow(2, "ddd") - ), + ImmutableList.of( + insertRow(1, "aaa"), + deleteRow(1, "aaa"), + insertRow(1, "bbb"), + insertRow(2, "aaa"), + deleteRow(2, "aaa"), + insertRow(2, "bbb")), + ImmutableList.of( + updateBeforeRow(2, "bbb"), + updateAfterRow(2, "ccc"), + deleteRow(2, "ccc"), + insertRow(2, "ddd")), + ImmutableList.of( + deleteRow(1, "bbb"), + insertRow(1, "ccc"), + deleteRow(1, "ccc"), + insertRow(1, "ddd"))); + + List> expectedRecordsPerCheckpoint = ImmutableList.of( - deleteRow(1, "bbb"), - insertRow(1, "ccc"), - deleteRow(1, "ccc"), - insertRow(1, "ddd") - ) - ); - - List> expectedRecordsPerCheckpoint = ImmutableList.of( - ImmutableList.of(insertRow(1, "bbb"), insertRow(2, "bbb")), - ImmutableList.of(insertRow(1, "bbb"), insertRow(2, "ddd")), - ImmutableList.of(insertRow(1, "ddd"), insertRow(2, "ddd")) - ); - - testSqlChangeLog(TABLE_NAME, ImmutableList.of("id"), inputRowsPerCheckpoint, - expectedRecordsPerCheckpoint); + ImmutableList.of(insertRow(1, "bbb"), insertRow(2, "bbb")), + ImmutableList.of(insertRow(1, "bbb"), insertRow(2, "ddd")), + ImmutableList.of(insertRow(1, "ddd"), insertRow(2, "ddd"))); + + testSqlChangeLog( + TABLE_NAME, ImmutableList.of("id"), inputRowsPerCheckpoint, expectedRecordsPerCheckpoint); } @Test public void testChangeLogOnDataKey() throws Exception { - List> elementsPerCheckpoint = ImmutableList.of( + List> elementsPerCheckpoint = ImmutableList.of( - insertRow(1, "aaa"), - deleteRow(1, "aaa"), - insertRow(2, "bbb"), - insertRow(1, "bbb"), - insertRow(2, "aaa") - ), + ImmutableList.of( + insertRow(1, "aaa"), + deleteRow(1, "aaa"), + insertRow(2, "bbb"), + insertRow(1, "bbb"), + insertRow(2, "aaa")), + ImmutableList.of( + updateBeforeRow(2, "aaa"), updateAfterRow(1, "ccc"), insertRow(1, "aaa")), + ImmutableList.of(deleteRow(1, "bbb"), insertRow(2, "aaa"), insertRow(2, "ccc"))); + + List> expectedRecords = ImmutableList.of( - updateBeforeRow(2, "aaa"), - updateAfterRow(1, "ccc"), - insertRow(1, "aaa") - ), - ImmutableList.of( - deleteRow(1, "bbb"), - insertRow(2, "aaa"), - insertRow(2, "ccc") - ) - ); - - List> expectedRecords = ImmutableList.of( - ImmutableList.of(insertRow(1, "bbb"), insertRow(2, "aaa")), - ImmutableList.of(insertRow(1, "aaa"), 
insertRow(1, "bbb"), insertRow(1, "ccc")), - ImmutableList.of(insertRow(1, "aaa"), insertRow(1, "ccc"), insertRow(2, "aaa"), insertRow(2, "ccc")) - ); + ImmutableList.of(insertRow(1, "bbb"), insertRow(2, "aaa")), + ImmutableList.of(insertRow(1, "aaa"), insertRow(1, "bbb"), insertRow(1, "ccc")), + ImmutableList.of( + insertRow(1, "aaa"), + insertRow(1, "ccc"), + insertRow(2, "aaa"), + insertRow(2, "ccc"))); testSqlChangeLog(TABLE_NAME, ImmutableList.of("data"), elementsPerCheckpoint, expectedRecords); } @Test public void testChangeLogOnIdDataKey() throws Exception { - List> elementsPerCheckpoint = ImmutableList.of( - ImmutableList.of( - insertRow(1, "aaa"), - deleteRow(1, "aaa"), - insertRow(2, "bbb"), - insertRow(1, "bbb"), - insertRow(2, "aaa") - ), + List> elementsPerCheckpoint = ImmutableList.of( - updateBeforeRow(2, "aaa"), - updateAfterRow(1, "ccc"), - insertRow(1, "aaa") - ), + ImmutableList.of( + insertRow(1, "aaa"), + deleteRow(1, "aaa"), + insertRow(2, "bbb"), + insertRow(1, "bbb"), + insertRow(2, "aaa")), + ImmutableList.of( + updateBeforeRow(2, "aaa"), updateAfterRow(1, "ccc"), insertRow(1, "aaa")), + ImmutableList.of(deleteRow(1, "bbb"), insertRow(2, "aaa"))); + + List> expectedRecords = ImmutableList.of( - deleteRow(1, "bbb"), - insertRow(2, "aaa") - ) - ); - - List> expectedRecords = ImmutableList.of( - ImmutableList.of(insertRow(1, "bbb"), insertRow(2, "aaa"), insertRow(2, "bbb")), - ImmutableList.of(insertRow(1, "aaa"), insertRow(1, "bbb"), insertRow(1, "ccc"), insertRow(2, "bbb")), - ImmutableList.of(insertRow(1, "aaa"), insertRow(1, "ccc"), insertRow(2, "aaa"), insertRow(2, "bbb")) - ); - - testSqlChangeLog(TABLE_NAME, ImmutableList.of("data", "id"), elementsPerCheckpoint, expectedRecords); + ImmutableList.of(insertRow(1, "bbb"), insertRow(2, "aaa"), insertRow(2, "bbb")), + ImmutableList.of( + insertRow(1, "aaa"), insertRow(1, "bbb"), insertRow(1, "ccc"), insertRow(2, "bbb")), + ImmutableList.of( + insertRow(1, "aaa"), + insertRow(1, "ccc"), + insertRow(2, "aaa"), + insertRow(2, "bbb"))); + + testSqlChangeLog( + TABLE_NAME, ImmutableList.of("data", "id"), elementsPerCheckpoint, expectedRecords); } @Test public void testPureInsertOnIdKey() throws Exception { - List> elementsPerCheckpoint = ImmutableList.of( + List> elementsPerCheckpoint = ImmutableList.of( - insertRow(1, "aaa"), - insertRow(2, "bbb") - ), - ImmutableList.of( - insertRow(3, "ccc"), - insertRow(4, "ddd") - ), - ImmutableList.of( - insertRow(5, "eee"), - insertRow(6, "fff") - ) - ); + ImmutableList.of(insertRow(1, "aaa"), insertRow(2, "bbb")), + ImmutableList.of(insertRow(3, "ccc"), insertRow(4, "ddd")), + ImmutableList.of(insertRow(5, "eee"), insertRow(6, "fff"))); - List> expectedRecords = ImmutableList.of( - ImmutableList.of( - insertRow(1, "aaa"), - insertRow(2, "bbb") - ), - ImmutableList.of( - insertRow(1, "aaa"), - insertRow(2, "bbb"), - insertRow(3, "ccc"), - insertRow(4, "ddd") - ), + List> expectedRecords = ImmutableList.of( - insertRow(1, "aaa"), - insertRow(2, "bbb"), - insertRow(3, "ccc"), - insertRow(4, "ddd"), - insertRow(5, "eee"), - insertRow(6, "fff") - ) - ); + ImmutableList.of(insertRow(1, "aaa"), insertRow(2, "bbb")), + ImmutableList.of( + insertRow(1, "aaa"), insertRow(2, "bbb"), insertRow(3, "ccc"), insertRow(4, "ddd")), + ImmutableList.of( + insertRow(1, "aaa"), + insertRow(2, "bbb"), + insertRow(3, "ccc"), + insertRow(4, "ddd"), + insertRow(5, "eee"), + insertRow(6, "fff"))); testSqlChangeLog(TABLE_NAME, ImmutableList.of("data"), elementsPerCheckpoint, expectedRecords); } @@ -244,13 
+217,14 @@ private static Record record(int id, String data) { private Table createTable(String tableName, List key, boolean isPartitioned) { String partitionByCause = isPartitioned ? "PARTITIONED BY (data)" : ""; - sql("CREATE TABLE %s(id INT, data VARCHAR, PRIMARY KEY(%s) NOT ENFORCED) %s", + sql( + "CREATE TABLE %s(id INT, data VARCHAR, PRIMARY KEY(%s) NOT ENFORCED) %s", tableName, Joiner.on(',').join(key), partitionByCause); // Upgrade the iceberg table to format v2. - CatalogLoader loader = CatalogLoader.hadoop("my_catalog", CONF, ImmutableMap.of( - CatalogProperties.WAREHOUSE_LOCATION, warehouse - )); + CatalogLoader loader = + CatalogLoader.hadoop( + "my_catalog", CONF, ImmutableMap.of(CatalogProperties.WAREHOUSE_LOCATION, warehouse)); Table table = loader.loadCatalog().loadTable(TableIdentifier.of(DATABASE_NAME, TABLE_NAME)); TableOperations ops = ((BaseTable) table).operations(); TableMetadata meta = ops.current(); @@ -259,15 +233,20 @@ private Table createTable(String tableName, List key, boolean isPartitio return table; } - private void testSqlChangeLog(String tableName, - List key, - List> inputRowsPerCheckpoint, - List> expectedRecordsPerCheckpoint) throws Exception { + private void testSqlChangeLog( + String tableName, + List key, + List> inputRowsPerCheckpoint, + List> expectedRecordsPerCheckpoint) + throws Exception { String dataId = BoundedTableFactory.registerDataSet(inputRowsPerCheckpoint); - sql("CREATE TABLE %s(id INT NOT NULL, data STRING NOT NULL)" + - " WITH ('connector'='BoundedSource', 'data-id'='%s')", SOURCE_TABLE, dataId); + sql( + "CREATE TABLE %s(id INT NOT NULL, data STRING NOT NULL)" + + " WITH ('connector'='BoundedSource', 'data-id'='%s')", + SOURCE_TABLE, dataId); - Assert.assertEquals("Should have the expected rows", + Assert.assertEquals( + "Should have the expected rows", listJoin(inputRowsPerCheckpoint), sql("SELECT * FROM %s", SOURCE_TABLE)); @@ -277,17 +256,21 @@ private void testSqlChangeLog(String tableName, table.refresh(); List snapshots = findValidSnapshots(table); int expectedSnapshotNum = expectedRecordsPerCheckpoint.size(); - Assert.assertEquals("Should have the expected snapshot number", expectedSnapshotNum, snapshots.size()); + Assert.assertEquals( + "Should have the expected snapshot number", expectedSnapshotNum, snapshots.size()); for (int i = 0; i < expectedSnapshotNum; i++) { long snapshotId = snapshots.get(i).snapshotId(); List expectedRows = expectedRecordsPerCheckpoint.get(i); - Assert.assertEquals("Should have the expected records for the checkpoint#" + i, - expectedRowSet(table, expectedRows), actualRowSet(table, snapshotId)); + Assert.assertEquals( + "Should have the expected records for the checkpoint#" + i, + expectedRowSet(table, expectedRows), + actualRowSet(table, snapshotId)); } if (expectedSnapshotNum > 0) { - Assert.assertEquals("Should have the expected rows in the final table", + Assert.assertEquals( + "Should have the expected rows in the final table", Sets.newHashSet(expectedRecordsPerCheckpoint.get(expectedSnapshotNum - 1)), Sets.newHashSet(sql("SELECT * FROM %s", tableName))); } @@ -296,7 +279,8 @@ private void testSqlChangeLog(String tableName, private List findValidSnapshots(Table table) { List validSnapshots = Lists.newArrayList(); for (Snapshot snapshot : table.snapshots()) { - if (snapshot.allManifests(table.io()).stream().anyMatch(m -> snapshot.snapshotId() == m.snapshotId())) { + if (snapshot.allManifests(table.io()).stream() + .anyMatch(m -> snapshot.snapshotId() == m.snapshotId())) { 
validSnapshots.add(snapshot); } } diff --git a/flink/v1.14/flink/src/test/java/org/apache/iceberg/flink/TestDataFileSerialization.java b/flink/v1.14/flink/src/test/java/org/apache/iceberg/flink/TestDataFileSerialization.java index fe9deb37684f..e9372adda4c1 100644 --- a/flink/v1.14/flink/src/test/java/org/apache/iceberg/flink/TestDataFileSerialization.java +++ b/flink/v1.14/flink/src/test/java/org/apache/iceberg/flink/TestDataFileSerialization.java @@ -16,9 +16,11 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.flink; +import static org.apache.iceberg.types.Types.NestedField.optional; +import static org.apache.iceberg.types.Types.NestedField.required; + import java.io.ByteArrayInputStream; import java.io.ByteArrayOutputStream; import java.io.IOException; @@ -45,21 +47,17 @@ import org.assertj.core.api.Assertions; import org.junit.Test; -import static org.apache.iceberg.types.Types.NestedField.optional; -import static org.apache.iceberg.types.Types.NestedField.required; - public class TestDataFileSerialization { - private static final Schema DATE_SCHEMA = new Schema( - required(1, "id", Types.LongType.get()), - optional(2, "data", Types.StringType.get()), - required(3, "date", Types.StringType.get()), - optional(4, "double", Types.DoubleType.get())); + private static final Schema DATE_SCHEMA = + new Schema( + required(1, "id", Types.LongType.get()), + optional(2, "data", Types.StringType.get()), + required(3, "date", Types.StringType.get()), + optional(4, "double", Types.DoubleType.get())); - private static final PartitionSpec PARTITION_SPEC = PartitionSpec - .builderFor(DATE_SCHEMA) - .identity("date") - .build(); + private static final PartitionSpec PARTITION_SPEC = + PartitionSpec.builderFor(DATE_SCHEMA).identity("date").build(); private static final Map COLUMN_SIZES = Maps.newHashMap(); private static final Map VALUE_COUNTS = Maps.newHashMap(); @@ -81,40 +79,43 @@ public class TestDataFileSerialization { UPPER_BOUNDS.put(1, longToBuffer(4L)); } - private static final Metrics METRICS = new Metrics( - 5L, null, VALUE_COUNTS, NULL_VALUE_COUNTS, NAN_VALUE_COUNTS, LOWER_BOUNDS, UPPER_BOUNDS); - - private static final DataFile DATA_FILE = DataFiles - .builder(PARTITION_SPEC) - .withPath("/path/to/data-1.parquet") - .withFileSizeInBytes(1234) - .withPartitionPath("date=2018-06-08") - .withMetrics(METRICS) - .withSplitOffsets(ImmutableList.of(4L)) - .withEncryptionKeyMetadata(ByteBuffer.allocate(4).putInt(34)) - .withSortOrder(SortOrder.unsorted()) - .build(); - - private static final DeleteFile POS_DELETE_FILE = FileMetadata.deleteFileBuilder(PARTITION_SPEC) - .ofPositionDeletes() - .withPath("/path/to/pos-delete.parquet") - .withFileSizeInBytes(10) - .withPartitionPath("date=2018-06-08") - .withMetrics(METRICS) - .withEncryptionKeyMetadata(ByteBuffer.allocate(4).putInt(35)) - .withRecordCount(23) - .build(); - - private static final DeleteFile EQ_DELETE_FILE = FileMetadata.deleteFileBuilder(PARTITION_SPEC) - .ofEqualityDeletes(2, 3) - .withPath("/path/to/equality-delete.parquet") - .withFileSizeInBytes(10) - .withPartitionPath("date=2018-06-08") - .withMetrics(METRICS) - .withEncryptionKeyMetadata(ByteBuffer.allocate(4).putInt(35)) - .withRecordCount(23) - .withSortOrder(SortOrder.unsorted()) - .build(); + private static final Metrics METRICS = + new Metrics( + 5L, null, VALUE_COUNTS, NULL_VALUE_COUNTS, NAN_VALUE_COUNTS, LOWER_BOUNDS, UPPER_BOUNDS); + + private static final DataFile DATA_FILE = + 
DataFiles.builder(PARTITION_SPEC) + .withPath("/path/to/data-1.parquet") + .withFileSizeInBytes(1234) + .withPartitionPath("date=2018-06-08") + .withMetrics(METRICS) + .withSplitOffsets(ImmutableList.of(4L)) + .withEncryptionKeyMetadata(ByteBuffer.allocate(4).putInt(34)) + .withSortOrder(SortOrder.unsorted()) + .build(); + + private static final DeleteFile POS_DELETE_FILE = + FileMetadata.deleteFileBuilder(PARTITION_SPEC) + .ofPositionDeletes() + .withPath("/path/to/pos-delete.parquet") + .withFileSizeInBytes(10) + .withPartitionPath("date=2018-06-08") + .withMetrics(METRICS) + .withEncryptionKeyMetadata(ByteBuffer.allocate(4).putInt(35)) + .withRecordCount(23) + .build(); + + private static final DeleteFile EQ_DELETE_FILE = + FileMetadata.deleteFileBuilder(PARTITION_SPEC) + .ofEqualityDeletes(2, 3) + .withPath("/path/to/equality-delete.parquet") + .withFileSizeInBytes(10) + .withPartitionPath("date=2018-06-08") + .withMetrics(METRICS) + .withEncryptionKeyMetadata(ByteBuffer.allocate(4).putInt(35)) + .withRecordCount(23) + .withSortOrder(SortOrder.unsorted()) + .build(); @Test public void testJavaSerialization() throws Exception { @@ -130,7 +131,8 @@ public void testJavaSerialization() throws Exception { out.writeObject(EQ_DELETE_FILE.copy()); } - try (ObjectInputStream in = new ObjectInputStream(new ByteArrayInputStream(bytes.toByteArray()))) { + try (ObjectInputStream in = + new ObjectInputStream(new ByteArrayInputStream(bytes.toByteArray()))) { for (int i = 0; i < 2; i += 1) { Object obj = in.readObject(); Assertions.assertThat(obj).as("Should be a DataFile").isInstanceOf(DataFile.class); @@ -139,13 +141,17 @@ public void testJavaSerialization() throws Exception { for (int i = 0; i < 2; i += 1) { Object obj = in.readObject(); - Assertions.assertThat(obj).as("Should be a position DeleteFile").isInstanceOf(DeleteFile.class); + Assertions.assertThat(obj) + .as("Should be a position DeleteFile") + .isInstanceOf(DeleteFile.class); TestHelpers.assertEquals(POS_DELETE_FILE, (DeleteFile) obj); } for (int i = 0; i < 2; i += 1) { Object obj = in.readObject(); - Assertions.assertThat(obj).as("Should be a equality DeleteFile").isInstanceOf(DeleteFile.class); + Assertions.assertThat(obj) + .as("Should be a equality DeleteFile") + .isInstanceOf(DeleteFile.class); TestHelpers.assertEquals(EQ_DELETE_FILE, (DeleteFile) obj); } } diff --git a/flink/v1.14/flink/src/test/java/org/apache/iceberg/flink/TestFixtures.java b/flink/v1.14/flink/src/test/java/org/apache/iceberg/flink/TestFixtures.java index 1596f9fddc66..884ea2d1d3b1 100644 --- a/flink/v1.14/flink/src/test/java/org/apache/iceberg/flink/TestFixtures.java +++ b/flink/v1.14/flink/src/test/java/org/apache/iceberg/flink/TestFixtures.java @@ -16,32 +16,28 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.flink; +import static org.apache.iceberg.types.Types.NestedField.required; + import org.apache.flink.table.types.logical.RowType; import org.apache.iceberg.PartitionSpec; import org.apache.iceberg.Schema; import org.apache.iceberg.catalog.TableIdentifier; import org.apache.iceberg.types.Types; -import static org.apache.iceberg.types.Types.NestedField.required; - public class TestFixtures { - private TestFixtures() { - - } + private TestFixtures() {} - public static final Schema SCHEMA = new Schema( - required(1, "data", Types.StringType.get()), - required(2, "id", Types.LongType.get()), - required(3, "dt", Types.StringType.get())); + public static final Schema SCHEMA = + new Schema( + required(1, "data", Types.StringType.get()), + required(2, "id", Types.LongType.get()), + required(3, "dt", Types.StringType.get())); - public static final PartitionSpec SPEC = PartitionSpec.builderFor(SCHEMA) - .identity("dt") - .bucket("id", 1) - .build(); + public static final PartitionSpec SPEC = + PartitionSpec.builderFor(SCHEMA).identity("dt").bucket("id", 1).build(); public static final RowType ROW_TYPE = FlinkSchemaUtil.convert(SCHEMA); @@ -51,13 +47,13 @@ private TestFixtures() { public static final TableIdentifier TABLE_IDENTIFIER = TableIdentifier.of(DATABASE, TABLE); - public static final Schema TS_SCHEMA = new Schema( + public static final Schema TS_SCHEMA = + new Schema( required(1, "ts", Types.TimestampType.withoutZone()), required(2, "str", Types.StringType.get())); - public static final PartitionSpec TS_SPEC = PartitionSpec.builderFor(TS_SCHEMA) - .hour("ts") - .build(); + public static final PartitionSpec TS_SPEC = + PartitionSpec.builderFor(TS_SCHEMA).hour("ts").build(); public static final RowType TS_ROW_TYPE = FlinkSchemaUtil.convert(TS_SCHEMA); } diff --git a/flink/v1.14/flink/src/test/java/org/apache/iceberg/flink/TestFlinkCatalogDatabase.java b/flink/v1.14/flink/src/test/java/org/apache/iceberg/flink/TestFlinkCatalogDatabase.java index 180a2bc5f01b..d4de12c62300 100644 --- a/flink/v1.14/flink/src/test/java/org/apache/iceberg/flink/TestFlinkCatalogDatabase.java +++ b/flink/v1.14/flink/src/test/java/org/apache/iceberg/flink/TestFlinkCatalogDatabase.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.flink; import java.io.File; @@ -58,16 +57,21 @@ public void testCreateNamespace() { sql("CREATE DATABASE %s", flinkDatabase); - Assert.assertTrue("Database should exist", validationNamespaceCatalog.namespaceExists(icebergNamespace)); + Assert.assertTrue( + "Database should exist", validationNamespaceCatalog.namespaceExists(icebergNamespace)); sql("CREATE DATABASE IF NOT EXISTS %s", flinkDatabase); - Assert.assertTrue("Database should still exist", validationNamespaceCatalog.namespaceExists(icebergNamespace)); + Assert.assertTrue( + "Database should still exist", + validationNamespaceCatalog.namespaceExists(icebergNamespace)); sql("DROP DATABASE IF EXISTS %s", flinkDatabase); - Assert.assertFalse("Database should be dropped", validationNamespaceCatalog.namespaceExists(icebergNamespace)); + Assert.assertFalse( + "Database should be dropped", validationNamespaceCatalog.namespaceExists(icebergNamespace)); sql("CREATE DATABASE IF NOT EXISTS %s", flinkDatabase); - Assert.assertTrue("Database should be created", validationNamespaceCatalog.namespaceExists(icebergNamespace)); + Assert.assertTrue( + "Database should be created", validationNamespaceCatalog.namespaceExists(icebergNamespace)); } @Test @@ -75,9 +79,12 @@ public void testDefaultDatabase() { sql("USE CATALOG %s", catalogName); sql("SHOW TABLES"); - Assert.assertEquals("Should use the current catalog", getTableEnv().getCurrentCatalog(), catalogName); - Assert.assertEquals("Should use the configured default namespace", - getTableEnv().getCurrentDatabase(), "default"); + Assert.assertEquals( + "Should use the current catalog", getTableEnv().getCurrentCatalog(), catalogName); + Assert.assertEquals( + "Should use the configured default namespace", + getTableEnv().getCurrentDatabase(), + "default"); } @Test @@ -88,7 +95,8 @@ public void testDropEmptyDatabase() { sql("CREATE DATABASE %s", flinkDatabase); - Assert.assertTrue("Namespace should exist", validationNamespaceCatalog.namespaceExists(icebergNamespace)); + Assert.assertTrue( + "Namespace should exist", validationNamespaceCatalog.namespaceExists(icebergNamespace)); sql("DROP DATABASE %s", flinkDatabase); @@ -99,7 +107,8 @@ public void testDropEmptyDatabase() { @Test public void testDropNonEmptyNamespace() { - Assume.assumeFalse("Hadoop catalog throws IOException: Directory is not empty.", isHadoopCatalog); + Assume.assumeFalse( + "Hadoop catalog throws IOException: Directory is not empty.", isHadoopCatalog); Assert.assertFalse( "Namespace should not already exist", @@ -111,8 +120,11 @@ public void testDropNonEmptyNamespace() { TableIdentifier.of(icebergNamespace, "tl"), new Schema(Types.NestedField.optional(0, "id", Types.LongType.get()))); - Assert.assertTrue("Namespace should exist", validationNamespaceCatalog.namespaceExists(icebergNamespace)); - Assert.assertTrue("Table should exist", validationCatalog.tableExists(TableIdentifier.of(icebergNamespace, "tl"))); + Assert.assertTrue( + "Namespace should exist", validationNamespaceCatalog.namespaceExists(icebergNamespace)); + Assert.assertTrue( + "Table should exist", + validationCatalog.tableExists(TableIdentifier.of(icebergNamespace, "tl"))); AssertHelpers.assertThrowsCause( "Should fail if trying to delete a non-empty database", @@ -133,7 +145,8 @@ public void testListTables() { sql("USE CATALOG %s", catalogName); sql("USE %s", DATABASE); - Assert.assertTrue("Namespace should exist", validationNamespaceCatalog.namespaceExists(icebergNamespace)); + Assert.assertTrue( + "Namespace should exist", 
validationNamespaceCatalog.namespaceExists(icebergNamespace)); Assert.assertEquals("Should not list any tables", 0, sql("SHOW TABLES").size()); @@ -155,29 +168,35 @@ public void testListNamespace() { sql("CREATE DATABASE %s", flinkDatabase); sql("USE CATALOG %s", catalogName); - Assert.assertTrue("Namespace should exist", validationNamespaceCatalog.namespaceExists(icebergNamespace)); + Assert.assertTrue( + "Namespace should exist", validationNamespaceCatalog.namespaceExists(icebergNamespace)); List databases = sql("SHOW DATABASES"); if (isHadoopCatalog) { Assert.assertEquals("Should have 2 database", 2, databases.size()); - Assert.assertEquals("Should have db and default database", + Assert.assertEquals( + "Should have db and default database", Sets.newHashSet("default", "db"), Sets.newHashSet(databases.get(0).getField(0), databases.get(1).getField(0))); if (!baseNamespace.isEmpty()) { // test namespace not belongs to this catalog - validationNamespaceCatalog.createNamespace(Namespace.of(baseNamespace.level(0), "UNKNOWN_NAMESPACE")); + validationNamespaceCatalog.createNamespace( + Namespace.of(baseNamespace.level(0), "UNKNOWN_NAMESPACE")); databases = sql("SHOW DATABASES"); Assert.assertEquals("Should have 2 database", 2, databases.size()); - Assert.assertEquals("Should have db and default database", + Assert.assertEquals( + "Should have db and default database", Sets.newHashSet("default", "db"), Sets.newHashSet(databases.get(0).getField(0), databases.get(1).getField(0))); } } else { - // If there are multiple classes extends FlinkTestBase, TestHiveMetastore may loose the creation for default + // If there are multiple classes extends FlinkTestBase, TestHiveMetastore may loose the + // creation for default // database. See HiveMetaStore.HMSHandler.init. 
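For context, the catalog and database DDL these TestFlinkCatalogDatabase cases exercise can be driven outside the test harness roughly as sketched below. This is an illustrative sketch only, not part of this patch: the catalog name, database name, warehouse path, and the wrapper class are placeholders.

import org.apache.flink.table.api.EnvironmentSettings;
import org.apache.flink.table.api.TableEnvironment;

public class IcebergCatalogDdlSketch {
  public static void main(String[] args) {
    TableEnvironment tEnv =
        TableEnvironment.create(EnvironmentSettings.newInstance().inStreamingMode().build());

    // Register a Hadoop-backed Iceberg catalog; the warehouse URI is a placeholder.
    tEnv.executeSql(
        "CREATE CATALOG my_catalog WITH ("
            + "'type'='iceberg', 'catalog-type'='hadoop', 'warehouse'='file:///tmp/warehouse')");
    tEnv.executeSql("USE CATALOG my_catalog");

    // The namespace operations asserted in the surrounding tests: create, list, drop.
    tEnv.executeSql("CREATE DATABASE IF NOT EXISTS db");
    tEnv.executeSql("SHOW DATABASES").print();
    tEnv.executeSql("DROP DATABASE IF EXISTS db");
  }
}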
- Assert.assertTrue("Should have db database", + Assert.assertTrue( + "Should have db database", databases.stream().anyMatch(d -> Objects.equals(d.getField(0), "db"))); } } @@ -192,11 +211,14 @@ public void testCreateNamespaceWithMetadata() { sql("CREATE DATABASE %s WITH ('prop'='value')", flinkDatabase); - Assert.assertTrue("Namespace should exist", validationNamespaceCatalog.namespaceExists(icebergNamespace)); + Assert.assertTrue( + "Namespace should exist", validationNamespaceCatalog.namespaceExists(icebergNamespace)); - Map nsMetadata = validationNamespaceCatalog.loadNamespaceMetadata(icebergNamespace); + Map nsMetadata = + validationNamespaceCatalog.loadNamespaceMetadata(icebergNamespace); - Assert.assertEquals("Namespace should have expected prop value", "value", nsMetadata.get("prop")); + Assert.assertEquals( + "Namespace should have expected prop value", "value", nsMetadata.get("prop")); } @Test @@ -209,11 +231,14 @@ public void testCreateNamespaceWithComment() { sql("CREATE DATABASE %s COMMENT 'namespace doc'", flinkDatabase); - Assert.assertTrue("Namespace should exist", validationNamespaceCatalog.namespaceExists(icebergNamespace)); + Assert.assertTrue( + "Namespace should exist", validationNamespaceCatalog.namespaceExists(icebergNamespace)); - Map nsMetadata = validationNamespaceCatalog.loadNamespaceMetadata(icebergNamespace); + Map nsMetadata = + validationNamespaceCatalog.loadNamespaceMetadata(icebergNamespace); - Assert.assertEquals("Namespace should have expected comment", "namespace doc", nsMetadata.get("comment")); + Assert.assertEquals( + "Namespace should have expected comment", "namespace doc", nsMetadata.get("comment")); } @Test @@ -229,12 +254,16 @@ public void testCreateNamespaceWithLocation() throws Exception { sql("CREATE DATABASE %s WITH ('location'='%s')", flinkDatabase, location); - Assert.assertTrue("Namespace should exist", validationNamespaceCatalog.namespaceExists(icebergNamespace)); + Assert.assertTrue( + "Namespace should exist", validationNamespaceCatalog.namespaceExists(icebergNamespace)); - Map nsMetadata = validationNamespaceCatalog.loadNamespaceMetadata(icebergNamespace); + Map nsMetadata = + validationNamespaceCatalog.loadNamespaceMetadata(icebergNamespace); - Assert.assertEquals("Namespace should have expected location", - "file:" + location.getPath(), nsMetadata.get("location")); + Assert.assertEquals( + "Namespace should have expected location", + "file:" + location.getPath(), + nsMetadata.get("location")); } @Test @@ -247,16 +276,21 @@ public void testSetProperties() { sql("CREATE DATABASE %s", flinkDatabase); - Assert.assertTrue("Namespace should exist", validationNamespaceCatalog.namespaceExists(icebergNamespace)); + Assert.assertTrue( + "Namespace should exist", validationNamespaceCatalog.namespaceExists(icebergNamespace)); - Map defaultMetadata = validationNamespaceCatalog.loadNamespaceMetadata(icebergNamespace); - Assert.assertFalse("Default metadata should not have custom property", defaultMetadata.containsKey("prop")); + Map defaultMetadata = + validationNamespaceCatalog.loadNamespaceMetadata(icebergNamespace); + Assert.assertFalse( + "Default metadata should not have custom property", defaultMetadata.containsKey("prop")); sql("ALTER DATABASE %s SET ('prop'='value')", flinkDatabase); - Map nsMetadata = validationNamespaceCatalog.loadNamespaceMetadata(icebergNamespace); + Map nsMetadata = + validationNamespaceCatalog.loadNamespaceMetadata(icebergNamespace); - Assert.assertEquals("Namespace should have expected prop value", "value", 
nsMetadata.get("prop")); + Assert.assertEquals( + "Namespace should have expected prop value", "value", nsMetadata.get("prop")); } @Test diff --git a/flink/v1.14/flink/src/test/java/org/apache/iceberg/flink/TestFlinkCatalogFactory.java b/flink/v1.14/flink/src/test/java/org/apache/iceberg/flink/TestFlinkCatalogFactory.java index cd893d836dc8..f7edd5653ebd 100644 --- a/flink/v1.14/flink/src/test/java/org/apache/iceberg/flink/TestFlinkCatalogFactory.java +++ b/flink/v1.14/flink/src/test/java/org/apache/iceberg/flink/TestFlinkCatalogFactory.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.flink; import java.util.Map; @@ -46,11 +45,12 @@ public void before() { @Test public void testCreateCreateCatalogHive() { String catalogName = "hiveCatalog"; - props.put(FlinkCatalogFactory.ICEBERG_CATALOG_TYPE, FlinkCatalogFactory.ICEBERG_CATALOG_TYPE_HIVE); + props.put( + FlinkCatalogFactory.ICEBERG_CATALOG_TYPE, FlinkCatalogFactory.ICEBERG_CATALOG_TYPE_HIVE); - Catalog catalog = FlinkCatalogFactory - .createCatalogLoader(catalogName, props, new Configuration()) - .loadCatalog(); + Catalog catalog = + FlinkCatalogFactory.createCatalogLoader(catalogName, props, new Configuration()) + .loadCatalog(); Assertions.assertThat(catalog).isNotNull().isInstanceOf(HiveCatalog.class); } @@ -58,11 +58,12 @@ public void testCreateCreateCatalogHive() { @Test public void testCreateCreateCatalogHadoop() { String catalogName = "hadoopCatalog"; - props.put(FlinkCatalogFactory.ICEBERG_CATALOG_TYPE, FlinkCatalogFactory.ICEBERG_CATALOG_TYPE_HADOOP); + props.put( + FlinkCatalogFactory.ICEBERG_CATALOG_TYPE, FlinkCatalogFactory.ICEBERG_CATALOG_TYPE_HADOOP); - Catalog catalog = FlinkCatalogFactory - .createCatalogLoader(catalogName, props, new Configuration()) - .loadCatalog(); + Catalog catalog = + FlinkCatalogFactory.createCatalogLoader(catalogName, props, new Configuration()) + .loadCatalog(); Assertions.assertThat(catalog).isNotNull().isInstanceOf(HadoopCatalog.class); } @@ -72,9 +73,9 @@ public void testCreateCreateCatalogCustom() { String catalogName = "customCatalog"; props.put(CatalogProperties.CATALOG_IMPL, CustomHadoopCatalog.class.getName()); - Catalog catalog = FlinkCatalogFactory - .createCatalogLoader(catalogName, props, new Configuration()) - .loadCatalog(); + Catalog catalog = + FlinkCatalogFactory.createCatalogLoader(catalogName, props, new Configuration()) + .loadCatalog(); Assertions.assertThat(catalog).isNotNull().isInstanceOf(CustomHadoopCatalog.class); } @@ -83,13 +84,14 @@ public void testCreateCreateCatalogCustom() { public void testCreateCreateCatalogCustomWithHiveCatalogTypeSet() { String catalogName = "customCatalog"; props.put(CatalogProperties.CATALOG_IMPL, CustomHadoopCatalog.class.getName()); - props.put(FlinkCatalogFactory.ICEBERG_CATALOG_TYPE, FlinkCatalogFactory.ICEBERG_CATALOG_TYPE_HIVE); + props.put( + FlinkCatalogFactory.ICEBERG_CATALOG_TYPE, FlinkCatalogFactory.ICEBERG_CATALOG_TYPE_HIVE); AssertHelpers.assertThrows( "Should throw when both catalog-type and catalog-impl are set", IllegalArgumentException.class, - "both catalog-type and catalog-impl are set", () -> - FlinkCatalogFactory.createCatalogLoader(catalogName, props, new Configuration())); + "both catalog-type and catalog-impl are set", + () -> FlinkCatalogFactory.createCatalogLoader(catalogName, props, new Configuration())); } @Test @@ -100,20 +102,18 @@ public void testLoadCatalogUnknown() { AssertHelpers.assertThrows( "Should throw when an unregistered 
/ unknown catalog is set as the catalog factor's`type` setting", UnsupportedOperationException.class, - "Unknown catalog-type", () -> - FlinkCatalogFactory.createCatalogLoader(catalogName, props, new Configuration()) - ); + "Unknown catalog-type", + () -> FlinkCatalogFactory.createCatalogLoader(catalogName, props, new Configuration())); } public static class CustomHadoopCatalog extends HadoopCatalog { - public CustomHadoopCatalog() { - - } + public CustomHadoopCatalog() {} public CustomHadoopCatalog(Configuration conf, String warehouseLocation) { setConf(conf); - initialize("custom", ImmutableMap.of(CatalogProperties.WAREHOUSE_LOCATION, warehouseLocation)); + initialize( + "custom", ImmutableMap.of(CatalogProperties.WAREHOUSE_LOCATION, warehouseLocation)); } } } diff --git a/flink/v1.14/flink/src/test/java/org/apache/iceberg/flink/TestFlinkCatalogTable.java b/flink/v1.14/flink/src/test/java/org/apache/iceberg/flink/TestFlinkCatalogTable.java index 3bb2861d8778..45b3da5fe661 100644 --- a/flink/v1.14/flink/src/test/java/org/apache/iceberg/flink/TestFlinkCatalogTable.java +++ b/flink/v1.14/flink/src/test/java/org/apache/iceberg/flink/TestFlinkCatalogTable.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.flink; import java.util.Arrays; @@ -89,26 +88,27 @@ public void testGetTable() { sql("CREATE TABLE tl(id BIGINT, strV STRING)"); Table table = validationCatalog.loadTable(TableIdentifier.of(icebergNamespace, "tl")); - Schema iSchema = new Schema( - Types.NestedField.optional(1, "id", Types.LongType.get()), - Types.NestedField.optional(2, "strV", Types.StringType.get()) - ); - Assert.assertEquals("Should load the expected iceberg schema", iSchema.toString(), table.schema().toString()); + Schema iSchema = + new Schema( + Types.NestedField.optional(1, "id", Types.LongType.get()), + Types.NestedField.optional(2, "strV", Types.StringType.get())); + Assert.assertEquals( + "Should load the expected iceberg schema", iSchema.toString(), table.schema().toString()); } @Test public void testRenameTable() { Assume.assumeFalse("HadoopCatalog does not support rename table", isHadoopCatalog); - final Schema tableSchema = new Schema(Types.NestedField.optional(0, "id", Types.LongType.get())); + final Schema tableSchema = + new Schema(Types.NestedField.optional(0, "id", Types.LongType.get())); validationCatalog.createTable(TableIdentifier.of(icebergNamespace, "tl"), tableSchema); sql("ALTER TABLE tl RENAME TO tl2"); AssertHelpers.assertThrows( "Should fail if trying to get a nonexistent table", ValidationException.class, "Table `tl` was not found.", - () -> getTableEnv().from("tl") - ); + () -> getTableEnv().from("tl")); Schema actualSchema = FlinkSchemaUtil.convert(getTableEnv().from("tl2").getSchema()); Assert.assertEquals(tableSchema.asStruct(), actualSchema.asStruct()); } @@ -124,7 +124,8 @@ public void testCreateTable() throws TableNotExistException { Assert.assertEquals(Maps.newHashMap(), table.properties()); CatalogTable catalogTable = catalogTable("tl"); - Assert.assertEquals(TableSchema.builder().field("id", DataTypes.BIGINT()).build(), catalogTable.getSchema()); + Assert.assertEquals( + TableSchema.builder().field("id", DataTypes.BIGINT()).build(), catalogTable.getSchema()); Assert.assertEquals(Maps.newHashMap(), catalogTable.getOptions()); } @@ -133,33 +134,41 @@ public void testCreateTableWithPrimaryKey() throws Exception { sql("CREATE TABLE tl(id BIGINT, data STRING, key STRING PRIMARY KEY NOT ENFORCED)"); Table 
table = table("tl"); - Assert.assertEquals("Should have the expected row key.", + Assert.assertEquals( + "Should have the expected row key.", Sets.newHashSet(table.schema().findField("key").fieldId()), table.schema().identifierFieldIds()); CatalogTable catalogTable = catalogTable("tl"); Optional uniqueConstraintOptional = catalogTable.getSchema().getPrimaryKey(); - Assert.assertTrue("Should have the expected unique constraint", uniqueConstraintOptional.isPresent()); - Assert.assertEquals("Should have the expected columns", - ImmutableList.of("key"), uniqueConstraintOptional.get().getColumns()); + Assert.assertTrue( + "Should have the expected unique constraint", uniqueConstraintOptional.isPresent()); + Assert.assertEquals( + "Should have the expected columns", + ImmutableList.of("key"), + uniqueConstraintOptional.get().getColumns()); } @Test public void testCreateTableWithMultiColumnsInPrimaryKey() throws Exception { - sql("CREATE TABLE tl(id BIGINT, data STRING, CONSTRAINT pk_constraint PRIMARY KEY(data, id) NOT ENFORCED)"); + sql( + "CREATE TABLE tl(id BIGINT, data STRING, CONSTRAINT pk_constraint PRIMARY KEY(data, id) NOT ENFORCED)"); Table table = table("tl"); - Assert.assertEquals("Should have the expected RowKey", + Assert.assertEquals( + "Should have the expected RowKey", Sets.newHashSet( - table.schema().findField("id").fieldId(), - table.schema().findField("data").fieldId()), + table.schema().findField("id").fieldId(), table.schema().findField("data").fieldId()), table.schema().identifierFieldIds()); CatalogTable catalogTable = catalogTable("tl"); Optional uniqueConstraintOptional = catalogTable.getSchema().getPrimaryKey(); - Assert.assertTrue("Should have the expected unique constraint", uniqueConstraintOptional.isPresent()); - Assert.assertEquals("Should have the expected columns", - ImmutableSet.of("data", "id"), ImmutableSet.copyOf(uniqueConstraintOptional.get().getColumns())); + Assert.assertTrue( + "Should have the expected unique constraint", uniqueConstraintOptional.isPresent()); + Assert.assertEquals( + "Should have the expected columns", + ImmutableSet.of("data", "id"), + ImmutableSet.copyOf(uniqueConstraintOptional.get().getColumns())); } @Test @@ -170,7 +179,8 @@ public void testCreateTableIfNotExists() { Assert.assertEquals(Maps.newHashMap(), table("tl").properties()); sql("DROP TABLE tl"); - AssertHelpers.assertThrows("Table 'tl' should be dropped", + AssertHelpers.assertThrows( + "Table 'tl' should be dropped", NoSuchTableException.class, "Table does not exist: " + getFullQualifiedTableName("tl"), () -> table("tl")); @@ -179,14 +189,12 @@ public void testCreateTableIfNotExists() { Assert.assertEquals(Maps.newHashMap(), table("tl").properties()); final Map expectedProperties = ImmutableMap.of("key", "value"); - table("tl").updateProperties() - .set("key", "value") - .commit(); + table("tl").updateProperties().set("key", "value").commit(); Assert.assertEquals(expectedProperties, table("tl").properties()); sql("CREATE TABLE IF NOT EXISTS tl(id BIGINT)"); - Assert.assertEquals("Should still be the old table.", - expectedProperties, table("tl").properties()); + Assert.assertEquals( + "Should still be the old table.", expectedProperties, table("tl").properties()); } @Test @@ -201,13 +209,15 @@ public void testCreateTableLike() throws TableNotExistException { Assert.assertEquals(Maps.newHashMap(), table.properties()); CatalogTable catalogTable = catalogTable("tl2"); - Assert.assertEquals(TableSchema.builder().field("id", DataTypes.BIGINT()).build(), 
catalogTable.getSchema()); + Assert.assertEquals( + TableSchema.builder().field("id", DataTypes.BIGINT()).build(), catalogTable.getSchema()); Assert.assertEquals(Maps.newHashMap(), catalogTable.getOptions()); } @Test public void testCreateTableLocation() { - Assume.assumeFalse("HadoopCatalog does not support creating table with location", isHadoopCatalog); + Assume.assumeFalse( + "HadoopCatalog does not support creating table with location", isHadoopCatalog); sql("CREATE TABLE tl(id BIGINT) WITH ('location'='file:///tmp/location')"); @@ -226,15 +236,20 @@ public void testCreatePartitionTable() throws TableNotExistException { Table table = table("tl"); Assert.assertEquals( new Schema( - Types.NestedField.optional(1, "id", Types.LongType.get()), - Types.NestedField.optional(2, "dt", Types.StringType.get())).asStruct(), + Types.NestedField.optional(1, "id", Types.LongType.get()), + Types.NestedField.optional(2, "dt", Types.StringType.get())) + .asStruct(), table.schema().asStruct()); - Assert.assertEquals(PartitionSpec.builderFor(table.schema()).identity("dt").build(), table.spec()); + Assert.assertEquals( + PartitionSpec.builderFor(table.schema()).identity("dt").build(), table.spec()); Assert.assertEquals(Maps.newHashMap(), table.properties()); CatalogTable catalogTable = catalogTable("tl"); Assert.assertEquals( - TableSchema.builder().field("id", DataTypes.BIGINT()).field("dt", DataTypes.STRING()).build(), + TableSchema.builder() + .field("id", DataTypes.BIGINT()) + .field("dt", DataTypes.STRING()) + .build(), catalogTable.getSchema()); Assert.assertEquals(Maps.newHashMap(), catalogTable.getOptions()); Assert.assertEquals(Collections.singletonList("dt"), catalogTable.getPartitionKeys()); @@ -245,8 +260,10 @@ public void testCreateTableWithFormatV2ThroughTableProperty() throws Exception { sql("CREATE TABLE tl(id BIGINT) WITH ('format-version'='2')"); Table table = table("tl"); - Assert.assertEquals("should create table using format v2", - 2, ((BaseTable) table).operations().current().formatVersion()); + Assert.assertEquals( + "should create table using format v2", + 2, + ((BaseTable) table).operations().current().formatVersion()); } @Test @@ -255,12 +272,10 @@ public void testUpgradeTableWithFormatV2ThroughTableProperty() throws Exception Table table = table("tl"); TableOperations ops = ((BaseTable) table).operations(); - Assert.assertEquals("should create table using format v1", - 1, ops.refresh().formatVersion()); + Assert.assertEquals("should create table using format v1", 1, ops.refresh().formatVersion()); sql("ALTER TABLE tl SET('format-version'='2')"); - Assert.assertEquals("should update table to use format v2", - 2, ops.refresh().formatVersion()); + Assert.assertEquals("should update table to use format v2", 2, ops.refresh().formatVersion()); } @Test @@ -269,10 +284,10 @@ public void testDowngradeTableToFormatV1ThroughTablePropertyFails() throws Excep Table table = table("tl"); TableOperations ops = ((BaseTable) table).operations(); - Assert.assertEquals("should create table using format v2", - 2, ops.refresh().formatVersion()); + Assert.assertEquals("should create table using format v2", 2, ops.refresh().formatVersion()); - AssertHelpers.assertThrowsRootCause("should fail to downgrade to v1", + AssertHelpers.assertThrowsRootCause( + "should fail to downgrade to v1", IllegalArgumentException.class, "Cannot downgrade v2 table to v1", () -> sql("ALTER TABLE tl SET('format-version'='1')")); @@ -282,13 +297,13 @@ public void testDowngradeTableToFormatV1ThroughTablePropertyFails() throws 
Excep public void testLoadTransformPartitionTable() throws TableNotExistException { Schema schema = new Schema(Types.NestedField.optional(0, "id", Types.LongType.get())); validationCatalog.createTable( - TableIdentifier.of(icebergNamespace, "tl"), schema, + TableIdentifier.of(icebergNamespace, "tl"), + schema, PartitionSpec.builderFor(schema).bucket("id", 100).build()); CatalogTable catalogTable = catalogTable("tl"); Assert.assertEquals( - TableSchema.builder().field("id", DataTypes.BIGINT()).build(), - catalogTable.getSchema()); + TableSchema.builder().field("id", DataTypes.BIGINT()).build(), catalogTable.getSchema()); Assert.assertEquals(Maps.newHashMap(), catalogTable.getOptions()); Assert.assertEquals(Collections.emptyList(), catalogTable.getPartitionKeys()); } @@ -312,8 +327,10 @@ public void testAlterTable() throws TableNotExistException { // remove property CatalogTable catalogTable = catalogTable("tl"); properties.remove("oldK"); - getTableEnv().getCatalog(getTableEnv().getCurrentCatalog()).get().alterTable( - new ObjectPath(DATABASE, "tl"), catalogTable.copy(properties), false); + getTableEnv() + .getCatalog(getTableEnv().getCurrentCatalog()) + .get() + .alterTable(new ObjectPath(DATABASE, "tl"), catalogTable.copy(properties), false); Assert.assertEquals(properties, table("tl").properties()); } @@ -336,8 +353,10 @@ public void testAlterTableWithPrimaryKey() throws TableNotExistException { // remove property CatalogTable catalogTable = catalogTable("tl"); properties.remove("oldK"); - getTableEnv().getCatalog(getTableEnv().getCurrentCatalog()).get().alterTable( - new ObjectPath(DATABASE, "tl"), catalogTable.copy(properties), false); + getTableEnv() + .getCatalog(getTableEnv().getCurrentCatalog()) + .get() + .alterTable(new ObjectPath(DATABASE, "tl"), catalogTable.copy(properties), false); Assert.assertEquals(properties, table("tl").properties()); } @@ -356,43 +375,40 @@ public void testSetCurrentAndCherryPickSnapshotId() { Table table = table("tl"); - DataFile fileA = DataFiles.builder(table.spec()) - .withPath("/path/to/data-a.parquet") - .withFileSizeInBytes(10) - .withPartitionPath("c1=0") // easy way to set partition data for now - .withRecordCount(1) - .build(); - DataFile fileB = DataFiles.builder(table.spec()) - .withPath("/path/to/data-b.parquet") - .withFileSizeInBytes(10) - .withPartitionPath("c1=1") // easy way to set partition data for now - .withRecordCount(1) - .build(); - DataFile replacementFile = DataFiles.builder(table.spec()) - .withPath("/path/to/data-a-replacement.parquet") - .withFileSizeInBytes(10) - .withPartitionPath("c1=0") // easy way to set partition data for now - .withRecordCount(1) - .build(); - - table.newAppend() - .appendFile(fileA) - .commit(); + DataFile fileA = + DataFiles.builder(table.spec()) + .withPath("/path/to/data-a.parquet") + .withFileSizeInBytes(10) + .withPartitionPath("c1=0") // easy way to set partition data for now + .withRecordCount(1) + .build(); + DataFile fileB = + DataFiles.builder(table.spec()) + .withPath("/path/to/data-b.parquet") + .withFileSizeInBytes(10) + .withPartitionPath("c1=1") // easy way to set partition data for now + .withRecordCount(1) + .build(); + DataFile replacementFile = + DataFiles.builder(table.spec()) + .withPath("/path/to/data-a-replacement.parquet") + .withFileSizeInBytes(10) + .withPartitionPath("c1=0") // easy way to set partition data for now + .withRecordCount(1) + .build(); + + table.newAppend().appendFile(fileA).commit(); long snapshotId = table.currentSnapshot().snapshotId(); // stage an 
overwrite that replaces FILE_A - table.newReplacePartitions() - .addFile(replacementFile) - .stageOnly() - .commit(); + table.newReplacePartitions().addFile(replacementFile).stageOnly().commit(); Snapshot staged = Iterables.getLast(table.snapshots()); - Assert.assertEquals("Should find the staged overwrite snapshot", DataOperations.OVERWRITE, staged.operation()); + Assert.assertEquals( + "Should find the staged overwrite snapshot", DataOperations.OVERWRITE, staged.operation()); // add another append so that the original commit can't be fast-forwarded - table.newAppend() - .appendFile(fileB) - .commit(); + table.newAppend().appendFile(fileB).commit(); // test cherry pick sql("ALTER TABLE tl SET('cherry-pick-snapshot-id'='%s')", staged.snapshotId()); @@ -405,10 +421,13 @@ public void testSetCurrentAndCherryPickSnapshotId() { private void validateTableFiles(Table tbl, DataFile... expectedFiles) { tbl.refresh(); - Set expectedFilePaths = Arrays.stream(expectedFiles).map(DataFile::path).collect(Collectors.toSet()); - Set actualFilePaths = StreamSupport.stream(tbl.newScan().planFiles().spliterator(), false) - .map(FileScanTask::file).map(ContentFile::path) - .collect(Collectors.toSet()); + Set expectedFilePaths = + Arrays.stream(expectedFiles).map(DataFile::path).collect(Collectors.toSet()); + Set actualFilePaths = + StreamSupport.stream(tbl.newScan().planFiles().spliterator(), false) + .map(FileScanTask::file) + .map(ContentFile::path) + .collect(Collectors.toSet()); Assert.assertEquals("Files should match", expectedFilePaths, actualFilePaths); } @@ -417,7 +436,10 @@ private Table table(String name) { } private CatalogTable catalogTable(String name) throws TableNotExistException { - return (CatalogTable) getTableEnv().getCatalog(getTableEnv().getCurrentCatalog()).get() - .getTable(new ObjectPath(DATABASE, name)); + return (CatalogTable) + getTableEnv() + .getCatalog(getTableEnv().getCurrentCatalog()) + .get() + .getTable(new ObjectPath(DATABASE, name)); } } diff --git a/flink/v1.14/flink/src/test/java/org/apache/iceberg/flink/TestFlinkCatalogTablePartitions.java b/flink/v1.14/flink/src/test/java/org/apache/iceberg/flink/TestFlinkCatalogTablePartitions.java index b6c4812861f7..839700f50127 100644 --- a/flink/v1.14/flink/src/test/java/org/apache/iceberg/flink/TestFlinkCatalogTablePartitions.java +++ b/flink/v1.14/flink/src/test/java/org/apache/iceberg/flink/TestFlinkCatalogTablePartitions.java @@ -16,9 +16,10 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.flink; +import static org.apache.iceberg.flink.FlinkCatalogFactory.CACHE_ENABLED; + import java.util.List; import org.apache.flink.table.catalog.CatalogPartitionSpec; import org.apache.flink.table.catalog.ObjectPath; @@ -35,18 +36,18 @@ import org.junit.Test; import org.junit.runners.Parameterized; -import static org.apache.iceberg.flink.FlinkCatalogFactory.CACHE_ENABLED; - public class TestFlinkCatalogTablePartitions extends FlinkCatalogTestBase { private String tableName = "test_table"; private final FileFormat format; - @Parameterized.Parameters(name = "catalogName={0}, baseNamespace={1}, format={2}, cacheEnabled={3}") + @Parameterized.Parameters( + name = "catalogName={0}, baseNamespace={1}, format={2}, cacheEnabled={3}") public static Iterable parameters() { List parameters = Lists.newArrayList(); - for (FileFormat format : new FileFormat[] {FileFormat.ORC, FileFormat.AVRO, FileFormat.PARQUET}) { + for (FileFormat format : + new FileFormat[] {FileFormat.ORC, FileFormat.AVRO, FileFormat.PARQUET}) { for (Boolean cacheEnabled : new Boolean[] {true, false}) { for (Object[] catalogParams : FlinkCatalogTestBase.parameters()) { String catalogName = (String) catalogParams[0]; @@ -58,8 +59,8 @@ public static Iterable parameters() { return parameters; } - public TestFlinkCatalogTablePartitions(String catalogName, Namespace baseNamespace, FileFormat format, - boolean cacheEnabled) { + public TestFlinkCatalogTablePartitions( + String catalogName, Namespace baseNamespace, FileFormat format, boolean cacheEnabled) { super(catalogName, baseNamespace); this.format = format; config.put(CACHE_ENABLED, String.valueOf(cacheEnabled)); @@ -83,20 +84,26 @@ public void cleanNamespaces() { @Test public void testListPartitionsWithUnpartitionedTable() { - sql("CREATE TABLE %s (id INT, data VARCHAR) with ('write.format.default'='%s')", + sql( + "CREATE TABLE %s (id INT, data VARCHAR) with ('write.format.default'='%s')", tableName, format.name()); sql("INSERT INTO %s SELECT 1,'a'", tableName); ObjectPath objectPath = new ObjectPath(DATABASE, tableName); FlinkCatalog flinkCatalog = (FlinkCatalog) getTableEnv().getCatalog(catalogName).get(); - AssertHelpers.assertThrows("Should not list partitions for unpartitioned table.", - TableNotPartitionedException.class, () -> flinkCatalog.listPartitions(objectPath)); + AssertHelpers.assertThrows( + "Should not list partitions for unpartitioned table.", + TableNotPartitionedException.class, + () -> flinkCatalog.listPartitions(objectPath)); } @Test - public void testListPartitionsWithPartitionedTable() throws TableNotExistException, TableNotPartitionedException { - sql("CREATE TABLE %s (id INT, data VARCHAR) PARTITIONED BY (data) " + - "with ('write.format.default'='%s')", tableName, format.name()); + public void testListPartitionsWithPartitionedTable() + throws TableNotExistException, TableNotPartitionedException { + sql( + "CREATE TABLE %s (id INT, data VARCHAR) PARTITIONED BY (data) " + + "with ('write.format.default'='%s')", + tableName, format.name()); sql("INSERT INTO %s SELECT 1,'a'", tableName); sql("INSERT INTO %s SELECT 2,'b'", tableName); diff --git a/flink/v1.14/flink/src/test/java/org/apache/iceberg/flink/TestFlinkFilters.java b/flink/v1.14/flink/src/test/java/org/apache/iceberg/flink/TestFlinkFilters.java index 0044acf57da2..c89ea4f53054 100644 --- a/flink/v1.14/flink/src/test/java/org/apache/iceberg/flink/TestFlinkFilters.java +++ b/flink/v1.14/flink/src/test/java/org/apache/iceberg/flink/TestFlinkFilters.java @@ -16,7 +16,6 @@ * 
specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.flink; import java.math.BigDecimal; @@ -56,36 +55,38 @@ public class TestFlinkFilters { - private static final TableSchema TABLE_SCHEMA = TableSchema.builder() - .field("field1", DataTypes.INT()) - .field("field2", DataTypes.BIGINT()) - .field("field3", DataTypes.FLOAT()) - .field("field4", DataTypes.DOUBLE()) - .field("field5", DataTypes.STRING()) - .field("field6", DataTypes.BOOLEAN()) - .field("field7", DataTypes.BINARY(2)) - .field("field8", DataTypes.DECIMAL(10, 2)) - .field("field9", DataTypes.DATE()) - .field("field10", DataTypes.TIME()) - .field("field11", DataTypes.TIMESTAMP()) - .field("field12", DataTypes.TIMESTAMP_WITH_LOCAL_TIME_ZONE()) - .build(); - - // A map list of fields and values used to verify the conversion of flink expression to iceberg expression - private static final List> FIELD_VALUE_LIST = ImmutableList.of( - Pair.of("field1", 1), - Pair.of("field2", 2L), - Pair.of("field3", 3F), - Pair.of("field4", 4D), - Pair.of("field5", "iceberg"), - Pair.of("field6", true), - Pair.of("field7", new byte[] {'a', 'b'}), - Pair.of("field8", BigDecimal.valueOf(10.12)), - Pair.of("field9", DateTimeUtil.daysFromDate(LocalDate.now())), - Pair.of("field10", DateTimeUtil.microsFromTime(LocalTime.now())), - Pair.of("field11", DateTimeUtil.microsFromTimestamp(LocalDateTime.now())), - Pair.of("field12", DateTimeUtil.microsFromInstant(Instant.now())) - ); + private static final TableSchema TABLE_SCHEMA = + TableSchema.builder() + .field("field1", DataTypes.INT()) + .field("field2", DataTypes.BIGINT()) + .field("field3", DataTypes.FLOAT()) + .field("field4", DataTypes.DOUBLE()) + .field("field5", DataTypes.STRING()) + .field("field6", DataTypes.BOOLEAN()) + .field("field7", DataTypes.BINARY(2)) + .field("field8", DataTypes.DECIMAL(10, 2)) + .field("field9", DataTypes.DATE()) + .field("field10", DataTypes.TIME()) + .field("field11", DataTypes.TIMESTAMP()) + .field("field12", DataTypes.TIMESTAMP_WITH_LOCAL_TIME_ZONE()) + .build(); + + // A map list of fields and values used to verify the conversion of flink expression to iceberg + // expression + private static final List> FIELD_VALUE_LIST = + ImmutableList.of( + Pair.of("field1", 1), + Pair.of("field2", 2L), + Pair.of("field3", 3F), + Pair.of("field4", 4D), + Pair.of("field5", "iceberg"), + Pair.of("field6", true), + Pair.of("field7", new byte[] {'a', 'b'}), + Pair.of("field8", BigDecimal.valueOf(10.12)), + Pair.of("field9", DateTimeUtil.daysFromDate(LocalDate.now())), + Pair.of("field10", DateTimeUtil.microsFromTime(LocalTime.now())), + Pair.of("field11", DateTimeUtil.microsFromTimestamp(LocalDateTime.now())), + Pair.of("field12", DateTimeUtil.microsFromInstant(Instant.now()))); @Test public void testFlinkDataTypeEqual() { @@ -114,15 +115,18 @@ public void testFlinkDataTypeEqual() { @Test public void testEquals() { for (Pair pair : FIELD_VALUE_LIST) { - UnboundPredicate expected = org.apache.iceberg.expressions.Expressions.equal(pair.first(), pair.second()); + UnboundPredicate expected = + org.apache.iceberg.expressions.Expressions.equal(pair.first(), pair.second()); Optional actual = - FlinkFilters.convert(resolve(Expressions.$(pair.first()).isEqual(Expressions.lit(pair.second())))); + FlinkFilters.convert( + resolve(Expressions.$(pair.first()).isEqual(Expressions.lit(pair.second())))); Assert.assertTrue("Conversion should succeed", actual.isPresent()); assertPredicatesMatch(expected, actual.get()); Optional actual1 = - 
FlinkFilters.convert(resolve(Expressions.lit(pair.second()).isEqual(Expressions.$(pair.first())))); + FlinkFilters.convert( + resolve(Expressions.lit(pair.second()).isEqual(Expressions.$(pair.first())))); Assert.assertTrue("Conversion should succeed", actual1.isPresent()); assertPredicatesMatch(expected, actual1.get()); } @@ -146,15 +150,18 @@ public void testEqualsNaN() { @Test public void testNotEquals() { for (Pair pair : FIELD_VALUE_LIST) { - UnboundPredicate expected = org.apache.iceberg.expressions.Expressions.notEqual(pair.first(), pair.second()); + UnboundPredicate expected = + org.apache.iceberg.expressions.Expressions.notEqual(pair.first(), pair.second()); Optional actual = - FlinkFilters.convert(resolve(Expressions.$(pair.first()).isNotEqual(Expressions.lit(pair.second())))); + FlinkFilters.convert( + resolve(Expressions.$(pair.first()).isNotEqual(Expressions.lit(pair.second())))); Assert.assertTrue("Conversion should succeed", actual.isPresent()); assertPredicatesMatch(expected, actual.get()); Optional actual1 = - FlinkFilters.convert(resolve(Expressions.lit(pair.second()).isNotEqual(Expressions.$(pair.first())))); + FlinkFilters.convert( + resolve(Expressions.lit(pair.second()).isNotEqual(Expressions.$(pair.first())))); Assert.assertTrue("Conversion should succeed", actual1.isPresent()); assertPredicatesMatch(expected, actual1.get()); } @@ -165,19 +172,22 @@ public void testNotEqualsNaN() { UnboundPredicate expected = org.apache.iceberg.expressions.Expressions.notNaN("field3"); Optional actual = - FlinkFilters.convert(resolve(Expressions.$("field3").isNotEqual(Expressions.lit(Float.NaN)))); + FlinkFilters.convert( + resolve(Expressions.$("field3").isNotEqual(Expressions.lit(Float.NaN)))); Assert.assertTrue("Conversion should succeed", actual.isPresent()); assertPredicatesMatch(expected, actual.get()); Optional actual1 = - FlinkFilters.convert(resolve(Expressions.lit(Float.NaN).isNotEqual(Expressions.$("field3")))); + FlinkFilters.convert( + resolve(Expressions.lit(Float.NaN).isNotEqual(Expressions.$("field3")))); Assert.assertTrue("Conversion should succeed", actual1.isPresent()); assertPredicatesMatch(expected, actual1.get()); } @Test public void testGreaterThan() { - UnboundPredicate expected = org.apache.iceberg.expressions.Expressions.greaterThan("field1", 1); + UnboundPredicate expected = + org.apache.iceberg.expressions.Expressions.greaterThan("field1", 1); Optional actual = FlinkFilters.convert(resolve(Expressions.$("field1").isGreater(Expressions.lit(1)))); @@ -192,7 +202,8 @@ public void testGreaterThan() { @Test public void testGreaterThanEquals() { - UnboundPredicate expected = org.apache.iceberg.expressions.Expressions.greaterThanOrEqual("field1", 1); + UnboundPredicate expected = + org.apache.iceberg.expressions.Expressions.greaterThanOrEqual("field1", 1); Optional actual = FlinkFilters.convert(resolve(Expressions.$("field1").isGreaterOrEqual(Expressions.lit(1)))); @@ -207,7 +218,8 @@ public void testGreaterThanEquals() { @Test public void testLessThan() { - UnboundPredicate expected = org.apache.iceberg.expressions.Expressions.lessThan("field1", 1); + UnboundPredicate expected = + org.apache.iceberg.expressions.Expressions.lessThan("field1", 1); Optional actual = FlinkFilters.convert(resolve(Expressions.$("field1").isLess(Expressions.lit(1)))); @@ -222,7 +234,8 @@ public void testLessThan() { @Test public void testLessThanEquals() { - UnboundPredicate expected = org.apache.iceberg.expressions.Expressions.lessThanOrEqual("field1", 1); + UnboundPredicate expected = 
+ org.apache.iceberg.expressions.Expressions.lessThanOrEqual("field1", 1); Optional actual = FlinkFilters.convert(resolve(Expressions.$("field1").isLessOrEqual(Expressions.lit(1)))); @@ -249,20 +262,26 @@ public void testIsNotNull() { Expression expr = resolve(Expressions.$("field1").isNotNull()); Optional actual = FlinkFilters.convert(expr); Assert.assertTrue("Conversion should succeed", actual.isPresent()); - UnboundPredicate expected = org.apache.iceberg.expressions.Expressions.notNull("field1"); + UnboundPredicate expected = + org.apache.iceberg.expressions.Expressions.notNull("field1"); assertPredicatesMatch(expected, actual.get()); } @Test public void testAnd() { - Expression expr = resolve( - Expressions.$("field1").isEqual(Expressions.lit(1)).and(Expressions.$("field2").isEqual(Expressions.lit(2L)))); + Expression expr = + resolve( + Expressions.$("field1") + .isEqual(Expressions.lit(1)) + .and(Expressions.$("field2").isEqual(Expressions.lit(2L)))); Optional actual = FlinkFilters.convert(expr); Assert.assertTrue("Conversion should succeed", actual.isPresent()); And and = (And) actual.get(); - And expected = (And) org.apache.iceberg.expressions.Expressions.and( - org.apache.iceberg.expressions.Expressions.equal("field1", 1), - org.apache.iceberg.expressions.Expressions.equal("field2", 2L)); + And expected = + (And) + org.apache.iceberg.expressions.Expressions.and( + org.apache.iceberg.expressions.Expressions.equal("field1", 1), + org.apache.iceberg.expressions.Expressions.equal("field2", 2L)); assertPredicatesMatch(expected.left(), and.left()); assertPredicatesMatch(expected.right(), and.right()); @@ -270,14 +289,19 @@ public void testAnd() { @Test public void testOr() { - Expression expr = resolve( - Expressions.$("field1").isEqual(Expressions.lit(1)).or(Expressions.$("field2").isEqual(Expressions.lit(2L)))); + Expression expr = + resolve( + Expressions.$("field1") + .isEqual(Expressions.lit(1)) + .or(Expressions.$("field2").isEqual(Expressions.lit(2L)))); Optional actual = FlinkFilters.convert(expr); Assert.assertTrue("Conversion should succeed", actual.isPresent()); Or or = (Or) actual.get(); - Or expected = (Or) org.apache.iceberg.expressions.Expressions.or( - org.apache.iceberg.expressions.Expressions.equal("field1", 1), - org.apache.iceberg.expressions.Expressions.equal("field2", 2L)); + Or expected = + (Or) + org.apache.iceberg.expressions.Expressions.or( + org.apache.iceberg.expressions.Expressions.equal("field1", 1), + org.apache.iceberg.expressions.Expressions.equal("field2", 2L)); assertPredicatesMatch(expected.left(), or.left()); assertPredicatesMatch(expected.right(), or.right()); @@ -285,13 +309,18 @@ public void testOr() { @Test public void testNot() { - Expression expr = resolve(ApiExpressionUtils.unresolvedCall( - BuiltInFunctionDefinitions.NOT, Expressions.$("field1").isEqual(Expressions.lit(1)))); + Expression expr = + resolve( + ApiExpressionUtils.unresolvedCall( + BuiltInFunctionDefinitions.NOT, + Expressions.$("field1").isEqual(Expressions.lit(1)))); Optional actual = FlinkFilters.convert(expr); Assert.assertTrue("Conversion should succeed", actual.isPresent()); Not not = (Not) actual.get(); - Not expected = (Not) org.apache.iceberg.expressions.Expressions.not( - org.apache.iceberg.expressions.Expressions.equal("field1", 1)); + Not expected = + (Not) + org.apache.iceberg.expressions.Expressions.not( + org.apache.iceberg.expressions.Expressions.equal("field1", 1)); Assert.assertEquals("Predicate operation should match", expected.op(), not.op()); 
assertPredicatesMatch(expected.child(), not.child()); @@ -299,40 +328,59 @@ public void testNot() { @Test public void testLike() { - UnboundPredicate expected = org.apache.iceberg.expressions.Expressions.startsWith("field5", "abc"); - Expression expr = resolve(ApiExpressionUtils.unresolvedCall( - BuiltInFunctionDefinitions.LIKE, Expressions.$("field5"), Expressions.lit("abc%"))); + UnboundPredicate expected = + org.apache.iceberg.expressions.Expressions.startsWith("field5", "abc"); + Expression expr = + resolve( + ApiExpressionUtils.unresolvedCall( + BuiltInFunctionDefinitions.LIKE, Expressions.$("field5"), Expressions.lit("abc%"))); Optional actual = FlinkFilters.convert(expr); Assert.assertTrue("Conversion should succeed", actual.isPresent()); assertPredicatesMatch(expected, actual.get()); - expr = resolve(ApiExpressionUtils - .unresolvedCall(BuiltInFunctionDefinitions.LIKE, Expressions.$("field5"), Expressions.lit("%abc"))); + expr = + resolve( + ApiExpressionUtils.unresolvedCall( + BuiltInFunctionDefinitions.LIKE, Expressions.$("field5"), Expressions.lit("%abc"))); actual = FlinkFilters.convert(expr); Assert.assertFalse("Conversion should failed", actual.isPresent()); - expr = resolve(ApiExpressionUtils.unresolvedCall( - BuiltInFunctionDefinitions.LIKE, Expressions.$("field5"), Expressions.lit("%abc%"))); + expr = + resolve( + ApiExpressionUtils.unresolvedCall( + BuiltInFunctionDefinitions.LIKE, + Expressions.$("field5"), + Expressions.lit("%abc%"))); actual = FlinkFilters.convert(expr); Assert.assertFalse("Conversion should failed", actual.isPresent()); - expr = resolve(ApiExpressionUtils.unresolvedCall( - BuiltInFunctionDefinitions.LIKE, Expressions.$("field5"), Expressions.lit("abc%d"))); + expr = + resolve( + ApiExpressionUtils.unresolvedCall( + BuiltInFunctionDefinitions.LIKE, + Expressions.$("field5"), + Expressions.lit("abc%d"))); actual = FlinkFilters.convert(expr); Assert.assertFalse("Conversion should failed", actual.isPresent()); - expr = resolve(ApiExpressionUtils.unresolvedCall( - BuiltInFunctionDefinitions.LIKE, Expressions.$("field5"), Expressions.lit("%"))); + expr = + resolve( + ApiExpressionUtils.unresolvedCall( + BuiltInFunctionDefinitions.LIKE, Expressions.$("field5"), Expressions.lit("%"))); actual = FlinkFilters.convert(expr); Assert.assertFalse("Conversion should failed", actual.isPresent()); - expr = resolve(ApiExpressionUtils.unresolvedCall( - BuiltInFunctionDefinitions.LIKE, Expressions.$("field5"), Expressions.lit("a_"))); + expr = + resolve( + ApiExpressionUtils.unresolvedCall( + BuiltInFunctionDefinitions.LIKE, Expressions.$("field5"), Expressions.lit("a_"))); actual = FlinkFilters.convert(expr); Assert.assertFalse("Conversion should failed", actual.isPresent()); - expr = resolve(ApiExpressionUtils.unresolvedCall( - BuiltInFunctionDefinitions.LIKE, Expressions.$("field5"), Expressions.lit("a%b"))); + expr = + resolve( + ApiExpressionUtils.unresolvedCall( + BuiltInFunctionDefinitions.LIKE, Expressions.$("field5"), Expressions.lit("a%b"))); actual = FlinkFilters.convert(expr); Assert.assertFalse("Conversion should failed", actual.isPresent()); } @@ -343,13 +391,15 @@ private void matchLiteral(String fieldName, Object flinkLiteral, T icebergLi Optional actual = FlinkFilters.convert(expr); Assert.assertTrue("Conversion should succeed", actual.isPresent()); org.apache.iceberg.expressions.Expression expression = actual.get(); - Assertions.assertThat(expression).as("The expression should be a UnboundPredicate") + Assertions.assertThat(expression) + .as("The 
expression should be a UnboundPredicate") .isInstanceOf(UnboundPredicate.class); UnboundPredicate unboundPredicate = (UnboundPredicate) expression; org.apache.iceberg.expressions.Expression expression1 = unboundPredicate.bind(FlinkSchemaUtil.convert(TABLE_SCHEMA).asStruct(), false); - Assertions.assertThat(expression1).as("The expression should be a BoundLiteralPredicate") + Assertions.assertThat(expression1) + .as("The expression should be a BoundLiteralPredicate") .isInstanceOf(BoundLiteralPredicate.class); BoundLiteralPredicate predicate = (BoundLiteralPredicate) expression1; @@ -357,49 +407,61 @@ private void matchLiteral(String fieldName, Object flinkLiteral, T icebergLi } private static Expression resolve(Expression originalExpression) { - return originalExpression.accept(new ApiExpressionDefaultVisitor() { - @Override - public Expression visit(UnresolvedReferenceExpression unresolvedReference) { - String name = unresolvedReference.getName(); - Optional field = TABLE_SCHEMA.getTableColumn(name); - if (field.isPresent()) { - int index = TABLE_SCHEMA.getTableColumns().indexOf(field.get()); - return new FieldReferenceExpression(name, field.get().getType(), 0, index); - } else { - return null; - } - } - - @Override - public Expression visit(UnresolvedCallExpression unresolvedCall) { - List children = - unresolvedCall.getChildren().stream().map(e -> (ResolvedExpression) e.accept(this)) - .collect(Collectors.toList()); - return new CallExpression(unresolvedCall.getFunctionDefinition(), children, DataTypes.STRING()); - } - - @Override - public Expression visit(ValueLiteralExpression valueLiteral) { - return valueLiteral; - } - - @Override - protected Expression defaultMethod(Expression expression) { - throw new UnsupportedOperationException(String.format("unsupported expression: %s", expression)); - } - }); + return originalExpression.accept( + new ApiExpressionDefaultVisitor() { + @Override + public Expression visit(UnresolvedReferenceExpression unresolvedReference) { + String name = unresolvedReference.getName(); + Optional field = TABLE_SCHEMA.getTableColumn(name); + if (field.isPresent()) { + int index = TABLE_SCHEMA.getTableColumns().indexOf(field.get()); + return new FieldReferenceExpression(name, field.get().getType(), 0, index); + } else { + return null; + } + } + + @Override + public Expression visit(UnresolvedCallExpression unresolvedCall) { + List children = + unresolvedCall.getChildren().stream() + .map(e -> (ResolvedExpression) e.accept(this)) + .collect(Collectors.toList()); + return new CallExpression( + unresolvedCall.getFunctionDefinition(), children, DataTypes.STRING()); + } + + @Override + public Expression visit(ValueLiteralExpression valueLiteral) { + return valueLiteral; + } + + @Override + protected Expression defaultMethod(Expression expression) { + throw new UnsupportedOperationException( + String.format("unsupported expression: %s", expression)); + } + }); } - private void assertPredicatesMatch(org.apache.iceberg.expressions.Expression expected, - org.apache.iceberg.expressions.Expression actual) { - Assertions.assertThat(expected).as("The expected expression should be a UnboundPredicate") + private void assertPredicatesMatch( + org.apache.iceberg.expressions.Expression expected, + org.apache.iceberg.expressions.Expression actual) { + Assertions.assertThat(expected) + .as("The expected expression should be a UnboundPredicate") .isInstanceOf(UnboundPredicate.class); - Assertions.assertThat(actual).as("The actual expression should be a UnboundPredicate") + 
Assertions.assertThat(actual) + .as("The actual expression should be a UnboundPredicate") .isInstanceOf(UnboundPredicate.class); UnboundPredicate predicateExpected = (UnboundPredicate) expected; UnboundPredicate predicateActual = (UnboundPredicate) actual; - Assert.assertEquals("Predicate operation should match", predicateExpected.op(), predicateActual.op()); - Assert.assertEquals("Predicate literal should match", predicateExpected.literal(), predicateActual.literal()); - Assert.assertEquals("Predicate name should match", predicateExpected.ref().name(), predicateActual.ref().name()); + Assert.assertEquals( + "Predicate operation should match", predicateExpected.op(), predicateActual.op()); + Assert.assertEquals( + "Predicate literal should match", predicateExpected.literal(), predicateActual.literal()); + Assert.assertEquals( + "Predicate name should match", + predicateExpected.ref().name(), + predicateActual.ref().name()); } } diff --git a/flink/v1.14/flink/src/test/java/org/apache/iceberg/flink/TestFlinkHiveCatalog.java b/flink/v1.14/flink/src/test/java/org/apache/iceberg/flink/TestFlinkHiveCatalog.java index 24065015795e..64746356636b 100644 --- a/flink/v1.14/flink/src/test/java/org/apache/iceberg/flink/TestFlinkHiveCatalog.java +++ b/flink/v1.14/flink/src/test/java/org/apache/iceberg/flink/TestFlinkHiveCatalog.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.flink; import java.io.File; @@ -36,8 +35,7 @@ public class TestFlinkHiveCatalog extends FlinkTestBase { - @Rule - public TemporaryFolder tempFolder = new TemporaryFolder(); + @Rule public TemporaryFolder tempFolder = new TemporaryFolder(); @Test public void testCreateCatalogWithWarehouseLocation() throws IOException { @@ -61,7 +59,8 @@ public void testCreateCatalogWithHiveConfDir() throws IOException { try (FileOutputStream fos = new FileOutputStream(hiveSiteXML)) { Configuration newConf = new Configuration(hiveConf); // Set another new directory which is different with the hive metastore's warehouse path. 
- newConf.set(HiveConf.ConfVars.METASTOREWAREHOUSE.varname, "file://" + warehouseDir.getAbsolutePath()); + newConf.set( + HiveConf.ConfVars.METASTOREWAREHOUSE.varname, "file://" + warehouseDir.getAbsolutePath()); newConf.writeXml(fos); } Assert.assertTrue("hive-site.xml should be created now.", Files.exists(hiveSiteXML.toPath())); @@ -77,8 +76,11 @@ public void testCreateCatalogWithHiveConfDir() throws IOException { checkSQLQuery(props, warehouseDir); } - private void checkSQLQuery(Map catalogProperties, File warehouseDir) throws IOException { - sql("CREATE CATALOG test_catalog WITH %s", FlinkCatalogTestBase.toWithClause(catalogProperties)); + private void checkSQLQuery(Map catalogProperties, File warehouseDir) + throws IOException { + sql( + "CREATE CATALOG test_catalog WITH %s", + FlinkCatalogTestBase.toWithClause(catalogProperties)); sql("USE CATALOG test_catalog"); sql("CREATE DATABASE test_db"); sql("USE test_db"); @@ -93,7 +95,8 @@ private void checkSQLQuery(Map catalogProperties, File warehouse Path dataPath = tablePath.resolve("data"); Assert.assertTrue("Table data path should exist", Files.exists(dataPath)); - Assert.assertEquals("Should have a .crc file and a .parquet file", 2, Files.list(dataPath).count()); + Assert.assertEquals( + "Should have a .crc file and a .parquet file", 2, Files.list(dataPath).count()); sql("DROP TABLE test_table"); sql("DROP DATABASE test_db"); diff --git a/flink/v1.14/flink/src/test/java/org/apache/iceberg/flink/TestFlinkSchemaUtil.java b/flink/v1.14/flink/src/test/java/org/apache/iceberg/flink/TestFlinkSchemaUtil.java index 01f8524464e0..b5dfb9cb2f6b 100644 --- a/flink/v1.14/flink/src/test/java/org/apache/iceberg/flink/TestFlinkSchemaUtil.java +++ b/flink/v1.14/flink/src/test/java/org/apache/iceberg/flink/TestFlinkSchemaUtil.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.flink; import org.apache.flink.table.api.DataTypes; @@ -45,194 +44,270 @@ public class TestFlinkSchemaUtil { @Test public void testConvertFlinkSchemaToIcebergSchema() { - TableSchema flinkSchema = TableSchema.builder() - .field("id", DataTypes.INT().notNull()) - .field("name", DataTypes.STRING()) /* optional by default */ - .field("salary", DataTypes.DOUBLE().notNull()) - .field("locations", DataTypes.MAP(DataTypes.STRING(), - DataTypes.ROW(DataTypes.FIELD("posX", DataTypes.DOUBLE().notNull(), "X field"), - DataTypes.FIELD("posY", DataTypes.DOUBLE().notNull(), "Y field")))) - .field("strArray", DataTypes.ARRAY(DataTypes.STRING()).nullable()) - .field("intArray", DataTypes.ARRAY(DataTypes.INT()).nullable()) - .field("char", DataTypes.CHAR(10).notNull()) - .field("varchar", DataTypes.VARCHAR(10).notNull()) - .field("boolean", DataTypes.BOOLEAN().nullable()) - .field("tinyint", DataTypes.TINYINT()) - .field("smallint", DataTypes.SMALLINT()) - .field("bigint", DataTypes.BIGINT()) - .field("varbinary", DataTypes.VARBINARY(10)) - .field("binary", DataTypes.BINARY(10)) - .field("time", DataTypes.TIME()) - .field("timestampWithoutZone", DataTypes.TIMESTAMP()) - .field("timestampWithZone", DataTypes.TIMESTAMP_WITH_LOCAL_TIME_ZONE()) - .field("date", DataTypes.DATE()) - .field("decimal", DataTypes.DECIMAL(2, 2)) - .field("decimal2", DataTypes.DECIMAL(38, 2)) - .field("decimal3", DataTypes.DECIMAL(10, 1)) - .field("multiset", DataTypes.MULTISET(DataTypes.STRING().notNull())) - .build(); + TableSchema flinkSchema = + TableSchema.builder() + .field("id", DataTypes.INT().notNull()) + .field("name", DataTypes.STRING()) /* optional by default */ + .field("salary", DataTypes.DOUBLE().notNull()) + .field( + "locations", + DataTypes.MAP( + DataTypes.STRING(), + DataTypes.ROW( + DataTypes.FIELD("posX", DataTypes.DOUBLE().notNull(), "X field"), + DataTypes.FIELD("posY", DataTypes.DOUBLE().notNull(), "Y field")))) + .field("strArray", DataTypes.ARRAY(DataTypes.STRING()).nullable()) + .field("intArray", DataTypes.ARRAY(DataTypes.INT()).nullable()) + .field("char", DataTypes.CHAR(10).notNull()) + .field("varchar", DataTypes.VARCHAR(10).notNull()) + .field("boolean", DataTypes.BOOLEAN().nullable()) + .field("tinyint", DataTypes.TINYINT()) + .field("smallint", DataTypes.SMALLINT()) + .field("bigint", DataTypes.BIGINT()) + .field("varbinary", DataTypes.VARBINARY(10)) + .field("binary", DataTypes.BINARY(10)) + .field("time", DataTypes.TIME()) + .field("timestampWithoutZone", DataTypes.TIMESTAMP()) + .field("timestampWithZone", DataTypes.TIMESTAMP_WITH_LOCAL_TIME_ZONE()) + .field("date", DataTypes.DATE()) + .field("decimal", DataTypes.DECIMAL(2, 2)) + .field("decimal2", DataTypes.DECIMAL(38, 2)) + .field("decimal3", DataTypes.DECIMAL(10, 1)) + .field("multiset", DataTypes.MULTISET(DataTypes.STRING().notNull())) + .build(); - Schema icebergSchema = new Schema( - Types.NestedField.required(0, "id", Types.IntegerType.get(), null), - Types.NestedField.optional(1, "name", Types.StringType.get(), null), - Types.NestedField.required(2, "salary", Types.DoubleType.get(), null), - Types.NestedField.optional(3, "locations", Types.MapType.ofOptional(24, 25, - Types.StringType.get(), - Types.StructType.of( - Types.NestedField.required(22, "posX", Types.DoubleType.get(), "X field"), - Types.NestedField.required(23, "posY", Types.DoubleType.get(), "Y field") - ))), - Types.NestedField.optional(4, "strArray", Types.ListType.ofOptional(26, Types.StringType.get())), - Types.NestedField.optional(5, 
"intArray", Types.ListType.ofOptional(27, Types.IntegerType.get())), - Types.NestedField.required(6, "char", Types.StringType.get()), - Types.NestedField.required(7, "varchar", Types.StringType.get()), - Types.NestedField.optional(8, "boolean", Types.BooleanType.get()), - Types.NestedField.optional(9, "tinyint", Types.IntegerType.get()), - Types.NestedField.optional(10, "smallint", Types.IntegerType.get()), - Types.NestedField.optional(11, "bigint", Types.LongType.get()), - Types.NestedField.optional(12, "varbinary", Types.BinaryType.get()), - Types.NestedField.optional(13, "binary", Types.FixedType.ofLength(10)), - Types.NestedField.optional(14, "time", Types.TimeType.get()), - Types.NestedField.optional(15, "timestampWithoutZone", Types.TimestampType.withoutZone()), - Types.NestedField.optional(16, "timestampWithZone", Types.TimestampType.withZone()), - Types.NestedField.optional(17, "date", Types.DateType.get()), - Types.NestedField.optional(18, "decimal", Types.DecimalType.of(2, 2)), - Types.NestedField.optional(19, "decimal2", Types.DecimalType.of(38, 2)), - Types.NestedField.optional(20, "decimal3", Types.DecimalType.of(10, 1)), - Types.NestedField.optional(21, "multiset", Types.MapType.ofRequired(28, 29, - Types.StringType.get(), - Types.IntegerType.get())) - ); + Schema icebergSchema = + new Schema( + Types.NestedField.required(0, "id", Types.IntegerType.get(), null), + Types.NestedField.optional(1, "name", Types.StringType.get(), null), + Types.NestedField.required(2, "salary", Types.DoubleType.get(), null), + Types.NestedField.optional( + 3, + "locations", + Types.MapType.ofOptional( + 24, + 25, + Types.StringType.get(), + Types.StructType.of( + Types.NestedField.required(22, "posX", Types.DoubleType.get(), "X field"), + Types.NestedField.required( + 23, "posY", Types.DoubleType.get(), "Y field")))), + Types.NestedField.optional( + 4, "strArray", Types.ListType.ofOptional(26, Types.StringType.get())), + Types.NestedField.optional( + 5, "intArray", Types.ListType.ofOptional(27, Types.IntegerType.get())), + Types.NestedField.required(6, "char", Types.StringType.get()), + Types.NestedField.required(7, "varchar", Types.StringType.get()), + Types.NestedField.optional(8, "boolean", Types.BooleanType.get()), + Types.NestedField.optional(9, "tinyint", Types.IntegerType.get()), + Types.NestedField.optional(10, "smallint", Types.IntegerType.get()), + Types.NestedField.optional(11, "bigint", Types.LongType.get()), + Types.NestedField.optional(12, "varbinary", Types.BinaryType.get()), + Types.NestedField.optional(13, "binary", Types.FixedType.ofLength(10)), + Types.NestedField.optional(14, "time", Types.TimeType.get()), + Types.NestedField.optional( + 15, "timestampWithoutZone", Types.TimestampType.withoutZone()), + Types.NestedField.optional(16, "timestampWithZone", Types.TimestampType.withZone()), + Types.NestedField.optional(17, "date", Types.DateType.get()), + Types.NestedField.optional(18, "decimal", Types.DecimalType.of(2, 2)), + Types.NestedField.optional(19, "decimal2", Types.DecimalType.of(38, 2)), + Types.NestedField.optional(20, "decimal3", Types.DecimalType.of(10, 1)), + Types.NestedField.optional( + 21, + "multiset", + Types.MapType.ofRequired(28, 29, Types.StringType.get(), Types.IntegerType.get()))); checkSchema(flinkSchema, icebergSchema); } @Test public void testMapField() { - TableSchema flinkSchema = TableSchema.builder() - .field("map_int_long", DataTypes.MAP(DataTypes.INT(), DataTypes.BIGINT()).notNull()) /* Required */ - .field("map_int_array_string", 
DataTypes.MAP(DataTypes.ARRAY(DataTypes.INT()), DataTypes.STRING())) - .field("map_decimal_string", DataTypes.MAP(DataTypes.DECIMAL(10, 2), DataTypes.STRING())) - .field("map_fields_fields", - DataTypes.MAP( - DataTypes.ROW( - DataTypes.FIELD("field_int", DataTypes.INT(), "doc - int"), - DataTypes.FIELD("field_string", DataTypes.STRING(), "doc - string") - ).notNull(), /* Required */ - DataTypes.ROW( - DataTypes.FIELD("field_array", DataTypes.ARRAY(DataTypes.STRING()), "doc - array") - ).notNull() /* Required */ - ).notNull() /* Required */ - ) - .build(); + TableSchema flinkSchema = + TableSchema.builder() + .field( + "map_int_long", + DataTypes.MAP(DataTypes.INT(), DataTypes.BIGINT()).notNull()) /* Required */ + .field( + "map_int_array_string", + DataTypes.MAP(DataTypes.ARRAY(DataTypes.INT()), DataTypes.STRING())) + .field( + "map_decimal_string", DataTypes.MAP(DataTypes.DECIMAL(10, 2), DataTypes.STRING())) + .field( + "map_fields_fields", + DataTypes.MAP( + DataTypes.ROW( + DataTypes.FIELD("field_int", DataTypes.INT(), "doc - int"), + DataTypes.FIELD("field_string", DataTypes.STRING(), "doc - string")) + .notNull(), /* Required */ + DataTypes.ROW( + DataTypes.FIELD( + "field_array", + DataTypes.ARRAY(DataTypes.STRING()), + "doc - array")) + .notNull() /* Required */) + .notNull() /* Required */) + .build(); - Schema icebergSchema = new Schema( - Types.NestedField.required(0, "map_int_long", - Types.MapType.ofOptional(4, 5, Types.IntegerType.get(), Types.LongType.get()), null), - Types.NestedField.optional(1, "map_int_array_string", - Types.MapType.ofOptional(7, 8, - Types.ListType.ofOptional(6, Types.IntegerType.get()), Types.StringType.get()), null), - Types.NestedField.optional(2, "map_decimal_string", Types.MapType.ofOptional(9, 10, - Types.DecimalType.of(10, 2), Types.StringType.get())), - Types.NestedField.required(3, "map_fields_fields", - Types.MapType.ofRequired( - 15, 16, - Types.StructType.of(Types.NestedField.optional(11, "field_int", Types.IntegerType.get(), "doc - int"), - Types.NestedField.optional(12, "field_string", Types.StringType.get(), "doc - string")), - Types.StructType.of(Types.NestedField.optional(14, "field_array", - Types.ListType.ofOptional(13, Types.StringType.get()), "doc - array")) - ) - ) - ); + Schema icebergSchema = + new Schema( + Types.NestedField.required( + 0, + "map_int_long", + Types.MapType.ofOptional(4, 5, Types.IntegerType.get(), Types.LongType.get()), + null), + Types.NestedField.optional( + 1, + "map_int_array_string", + Types.MapType.ofOptional( + 7, + 8, + Types.ListType.ofOptional(6, Types.IntegerType.get()), + Types.StringType.get()), + null), + Types.NestedField.optional( + 2, + "map_decimal_string", + Types.MapType.ofOptional( + 9, 10, Types.DecimalType.of(10, 2), Types.StringType.get())), + Types.NestedField.required( + 3, + "map_fields_fields", + Types.MapType.ofRequired( + 15, + 16, + Types.StructType.of( + Types.NestedField.optional( + 11, "field_int", Types.IntegerType.get(), "doc - int"), + Types.NestedField.optional( + 12, "field_string", Types.StringType.get(), "doc - string")), + Types.StructType.of( + Types.NestedField.optional( + 14, + "field_array", + Types.ListType.ofOptional(13, Types.StringType.get()), + "doc - array"))))); checkSchema(flinkSchema, icebergSchema); } @Test public void testStructField() { - TableSchema flinkSchema = TableSchema.builder() - .field("struct_int_string_decimal", DataTypes.ROW( - DataTypes.FIELD("field_int", DataTypes.INT()), - DataTypes.FIELD("field_string", DataTypes.STRING()), - 
DataTypes.FIELD("field_decimal", DataTypes.DECIMAL(19, 2)), - DataTypes.FIELD("field_struct", DataTypes.ROW( - DataTypes.FIELD("inner_struct_int", DataTypes.INT()), - DataTypes.FIELD("inner_struct_float_array", DataTypes.ARRAY(DataTypes.FLOAT())) - ).notNull()) /* Row is required */ - ).notNull()) /* Required */ - .field("struct_map_int_int", DataTypes.ROW( - DataTypes.FIELD("field_map", DataTypes.MAP(DataTypes.INT(), DataTypes.INT())) - ).nullable()) /* Optional */ - .build(); + TableSchema flinkSchema = + TableSchema.builder() + .field( + "struct_int_string_decimal", + DataTypes.ROW( + DataTypes.FIELD("field_int", DataTypes.INT()), + DataTypes.FIELD("field_string", DataTypes.STRING()), + DataTypes.FIELD("field_decimal", DataTypes.DECIMAL(19, 2)), + DataTypes.FIELD( + "field_struct", + DataTypes.ROW( + DataTypes.FIELD("inner_struct_int", DataTypes.INT()), + DataTypes.FIELD( + "inner_struct_float_array", + DataTypes.ARRAY(DataTypes.FLOAT()))) + .notNull()) /* Row is required */) + .notNull()) /* Required */ + .field( + "struct_map_int_int", + DataTypes.ROW( + DataTypes.FIELD( + "field_map", DataTypes.MAP(DataTypes.INT(), DataTypes.INT()))) + .nullable()) /* Optional */ + .build(); - Schema icebergSchema = new Schema( - Types.NestedField.required(0, "struct_int_string_decimal", - Types.StructType.of( - Types.NestedField.optional(5, "field_int", Types.IntegerType.get()), - Types.NestedField.optional(6, "field_string", Types.StringType.get()), - Types.NestedField.optional(7, "field_decimal", Types.DecimalType.of(19, 2)), - Types.NestedField.required(8, "field_struct", - Types.StructType.of( - Types.NestedField.optional(3, "inner_struct_int", Types.IntegerType.get()), - Types.NestedField.optional(4, "inner_struct_float_array", - Types.ListType.ofOptional(2, Types.FloatType.get())) - )) - )), - Types.NestedField.optional(1, "struct_map_int_int", - Types.StructType.of( - Types.NestedField.optional(11, "field_map", Types.MapType.ofOptional(9, 10, - Types.IntegerType.get(), Types.IntegerType.get())) - ) - ) - ); + Schema icebergSchema = + new Schema( + Types.NestedField.required( + 0, + "struct_int_string_decimal", + Types.StructType.of( + Types.NestedField.optional(5, "field_int", Types.IntegerType.get()), + Types.NestedField.optional(6, "field_string", Types.StringType.get()), + Types.NestedField.optional(7, "field_decimal", Types.DecimalType.of(19, 2)), + Types.NestedField.required( + 8, + "field_struct", + Types.StructType.of( + Types.NestedField.optional( + 3, "inner_struct_int", Types.IntegerType.get()), + Types.NestedField.optional( + 4, + "inner_struct_float_array", + Types.ListType.ofOptional(2, Types.FloatType.get())))))), + Types.NestedField.optional( + 1, + "struct_map_int_int", + Types.StructType.of( + Types.NestedField.optional( + 11, + "field_map", + Types.MapType.ofOptional( + 9, 10, Types.IntegerType.get(), Types.IntegerType.get()))))); checkSchema(flinkSchema, icebergSchema); } @Test public void testListField() { - TableSchema flinkSchema = TableSchema.builder() - .field("list_struct_fields", DataTypes.ARRAY( - DataTypes.ROW( - DataTypes.FIELD("field_int", DataTypes.INT()) - ) - ).notNull()) /* Required */ - .field("list_optional_struct_fields", DataTypes.ARRAY( - DataTypes.ROW( - DataTypes.FIELD( - "field_timestamp_with_local_time_zone", DataTypes.TIMESTAMP_WITH_LOCAL_TIME_ZONE() - ) - ) - ).nullable()) /* Optional */ - .field("list_map_fields", DataTypes.ARRAY( - DataTypes.MAP( - DataTypes.ARRAY(DataTypes.INT().notNull()), /* Key of map must be required */ - DataTypes.ROW( - 
DataTypes.FIELD("field_0", DataTypes.INT(), "doc - int") - ) - ).notNull() - ).notNull()) /* Required */ - .build(); + TableSchema flinkSchema = + TableSchema.builder() + .field( + "list_struct_fields", + DataTypes.ARRAY(DataTypes.ROW(DataTypes.FIELD("field_int", DataTypes.INT()))) + .notNull()) /* Required */ + .field( + "list_optional_struct_fields", + DataTypes.ARRAY( + DataTypes.ROW( + DataTypes.FIELD( + "field_timestamp_with_local_time_zone", + DataTypes.TIMESTAMP_WITH_LOCAL_TIME_ZONE()))) + .nullable()) /* Optional */ + .field( + "list_map_fields", + DataTypes.ARRAY( + DataTypes.MAP( + DataTypes.ARRAY( + DataTypes.INT().notNull()), /* Key of map must be required */ + DataTypes.ROW( + DataTypes.FIELD("field_0", DataTypes.INT(), "doc - int"))) + .notNull()) + .notNull()) /* Required */ + .build(); - Schema icebergSchema = new Schema( - Types.NestedField.required(0, "list_struct_fields", - Types.ListType.ofOptional(4, Types.StructType.of( - Types.NestedField.optional(3, "field_int", Types.IntegerType.get()) - ))), - Types.NestedField.optional(1, "list_optional_struct_fields", - Types.ListType.ofOptional(6, Types.StructType.of( - Types.NestedField.optional(5, "field_timestamp_with_local_time_zone", Types.TimestampType.withZone()) - ))), - Types.NestedField.required(2, "list_map_fields", - Types.ListType.ofRequired(11, - Types.MapType.ofOptional(9, 10, - Types.ListType.ofRequired(7, Types.IntegerType.get()), + Schema icebergSchema = + new Schema( + Types.NestedField.required( + 0, + "list_struct_fields", + Types.ListType.ofOptional( + 4, Types.StructType.of( - Types.NestedField.optional(8, "field_0", Types.IntegerType.get(), "doc - int") - ) - ) - )) - ); + Types.NestedField.optional(3, "field_int", Types.IntegerType.get())))), + Types.NestedField.optional( + 1, + "list_optional_struct_fields", + Types.ListType.ofOptional( + 6, + Types.StructType.of( + Types.NestedField.optional( + 5, + "field_timestamp_with_local_time_zone", + Types.TimestampType.withZone())))), + Types.NestedField.required( + 2, + "list_map_fields", + Types.ListType.ofRequired( + 11, + Types.MapType.ofOptional( + 9, + 10, + Types.ListType.ofRequired(7, Types.IntegerType.get()), + Types.StructType.of( + Types.NestedField.optional( + 8, "field_0", Types.IntegerType.get(), "doc - int")))))); checkSchema(flinkSchema, icebergSchema); } @@ -242,34 +317,43 @@ private void checkSchema(TableSchema flinkSchema, Schema icebergSchema) { // The conversion is not a 1:1 mapping, so we just check iceberg types. 
Assert.assertEquals( icebergSchema.asStruct(), - FlinkSchemaUtil.convert(FlinkSchemaUtil.toSchema(FlinkSchemaUtil.convert(icebergSchema))).asStruct()); + FlinkSchemaUtil.convert(FlinkSchemaUtil.toSchema(FlinkSchemaUtil.convert(icebergSchema))) + .asStruct()); } @Test public void testInconsistentTypes() { checkInconsistentType( - Types.UUIDType.get(), new BinaryType(16), - new BinaryType(16), Types.FixedType.ofLength(16)); + Types.UUIDType.get(), new BinaryType(16), new BinaryType(16), Types.FixedType.ofLength(16)); checkInconsistentType( - Types.StringType.get(), new VarCharType(VarCharType.MAX_LENGTH), - new CharType(100), Types.StringType.get()); + Types.StringType.get(), + new VarCharType(VarCharType.MAX_LENGTH), + new CharType(100), + Types.StringType.get()); checkInconsistentType( - Types.BinaryType.get(), new VarBinaryType(VarBinaryType.MAX_LENGTH), - new VarBinaryType(100), Types.BinaryType.get()); + Types.BinaryType.get(), + new VarBinaryType(VarBinaryType.MAX_LENGTH), + new VarBinaryType(100), + Types.BinaryType.get()); checkInconsistentType( - Types.TimeType.get(), new TimeType(), - new TimeType(3), Types.TimeType.get()); + Types.TimeType.get(), new TimeType(), new TimeType(3), Types.TimeType.get()); checkInconsistentType( - Types.TimestampType.withoutZone(), new TimestampType(6), - new TimestampType(3), Types.TimestampType.withoutZone()); + Types.TimestampType.withoutZone(), + new TimestampType(6), + new TimestampType(3), + Types.TimestampType.withoutZone()); checkInconsistentType( - Types.TimestampType.withZone(), new LocalZonedTimestampType(6), - new LocalZonedTimestampType(3), Types.TimestampType.withZone()); + Types.TimestampType.withZone(), + new LocalZonedTimestampType(6), + new LocalZonedTimestampType(3), + Types.TimestampType.withZone()); } private void checkInconsistentType( - Type icebergType, LogicalType flinkExpectedType, - LogicalType flinkType, Type icebergExpectedType) { + Type icebergType, + LogicalType flinkExpectedType, + LogicalType flinkType, + Type icebergExpectedType) { Assert.assertEquals(flinkExpectedType, FlinkSchemaUtil.convert(icebergType)); Assert.assertEquals( Types.StructType.of(Types.NestedField.optional(0, "f0", icebergExpectedType)), @@ -278,19 +362,19 @@ private void checkInconsistentType( @Test public void testConvertFlinkSchemaBaseOnIcebergSchema() { - Schema baseSchema = new Schema( - Lists.newArrayList( - Types.NestedField.required(101, "int", Types.IntegerType.get()), - Types.NestedField.optional(102, "string", Types.StringType.get()) - ), - Sets.newHashSet(101) - ); + Schema baseSchema = + new Schema( + Lists.newArrayList( + Types.NestedField.required(101, "int", Types.IntegerType.get()), + Types.NestedField.optional(102, "string", Types.StringType.get())), + Sets.newHashSet(101)); - TableSchema flinkSchema = TableSchema.builder() - .field("int", DataTypes.INT().notNull()) - .field("string", DataTypes.STRING().nullable()) - .primaryKey("int") - .build(); + TableSchema flinkSchema = + TableSchema.builder() + .field("int", DataTypes.INT().notNull()) + .field("string", DataTypes.STRING().nullable()) + .primaryKey("int") + .build(); Schema convertedSchema = FlinkSchemaUtil.convert(baseSchema, flinkSchema); Assert.assertEquals(baseSchema.asStruct(), convertedSchema.asStruct()); Assert.assertEquals(ImmutableSet.of(101), convertedSchema.identifierFieldIds()); @@ -298,29 +382,33 @@ public void testConvertFlinkSchemaBaseOnIcebergSchema() { @Test public void testConvertFlinkSchemaWithPrimaryKeys() { - Schema icebergSchema = new Schema( - 
Lists.newArrayList( - Types.NestedField.required(1, "int", Types.IntegerType.get()), - Types.NestedField.required(2, "string", Types.StringType.get()) - ), - Sets.newHashSet(1, 2) - ); + Schema icebergSchema = + new Schema( + Lists.newArrayList( + Types.NestedField.required(1, "int", Types.IntegerType.get()), + Types.NestedField.required(2, "string", Types.StringType.get())), + Sets.newHashSet(1, 2)); TableSchema tableSchema = FlinkSchemaUtil.toSchema(icebergSchema); Assert.assertTrue(tableSchema.getPrimaryKey().isPresent()); - Assert.assertEquals(ImmutableSet.of("int", "string"), + Assert.assertEquals( + ImmutableSet.of("int", "string"), ImmutableSet.copyOf(tableSchema.getPrimaryKey().get().getColumns())); } @Test public void testConvertFlinkSchemaWithNestedColumnInPrimaryKeys() { - Schema icebergSchema = new Schema( - Lists.newArrayList(Types.NestedField.required(1, "struct", - Types.StructType.of(Types.NestedField.required(2, "inner", Types.IntegerType.get()))) - ), - Sets.newHashSet(2) - ); - AssertHelpers.assertThrows("Does not support the nested columns in flink schema's primary keys", + Schema icebergSchema = + new Schema( + Lists.newArrayList( + Types.NestedField.required( + 1, + "struct", + Types.StructType.of( + Types.NestedField.required(2, "inner", Types.IntegerType.get())))), + Sets.newHashSet(2)); + AssertHelpers.assertThrows( + "Does not support the nested columns in flink schema's primary keys", ValidationException.class, "Column 'struct.inner' does not exist", () -> FlinkSchemaUtil.toSchema(icebergSchema)); diff --git a/flink/v1.14/flink/src/test/java/org/apache/iceberg/flink/TestFlinkTableSink.java b/flink/v1.14/flink/src/test/java/org/apache/iceberg/flink/TestFlinkTableSink.java index 76618329be7f..c4c75edd9edd 100644 --- a/flink/v1.14/flink/src/test/java/org/apache/iceberg/flink/TestFlinkTableSink.java +++ b/flink/v1.14/flink/src/test/java/org/apache/iceberg/flink/TestFlinkTableSink.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.flink; import java.util.List; @@ -59,8 +58,7 @@ public class TestFlinkTableSink extends FlinkCatalogTestBase { public static final MiniClusterWithClientResource MINI_CLUSTER_RESOURCE = MiniClusterResource.createWithClassloaderCheckDisabled(); - @ClassRule - public static final TemporaryFolder TEMPORARY_FOLDER = new TemporaryFolder(); + @ClassRule public static final TemporaryFolder TEMPORARY_FOLDER = new TemporaryFolder(); private static final String SOURCE_TABLE = "default_catalog.default_database.bounded_source"; private static final String TABLE_NAME = "test_table"; @@ -70,10 +68,12 @@ public class TestFlinkTableSink extends FlinkCatalogTestBase { private final FileFormat format; private final boolean isStreamingJob; - @Parameterized.Parameters(name = "catalogName={0}, baseNamespace={1}, format={2}, isStreaming={3}") + @Parameterized.Parameters( + name = "catalogName={0}, baseNamespace={1}, format={2}, isStreaming={3}") public static Iterable parameters() { List parameters = Lists.newArrayList(); - for (FileFormat format : new FileFormat[] {FileFormat.ORC, FileFormat.AVRO, FileFormat.PARQUET}) { + for (FileFormat format : + new FileFormat[] {FileFormat.ORC, FileFormat.AVRO, FileFormat.PARQUET}) { for (Boolean isStreaming : new Boolean[] {true, false}) { for (Object[] catalogParams : FlinkCatalogTestBase.parameters()) { String catalogName = (String) catalogParams[0]; @@ -85,7 +85,8 @@ public static Iterable parameters() { return parameters; } - public TestFlinkTableSink(String catalogName, Namespace baseNamespace, FileFormat format, Boolean isStreamingJob) { + public TestFlinkTableSink( + String catalogName, Namespace baseNamespace, FileFormat format, Boolean isStreamingJob) { super(catalogName, baseNamespace); this.format = format; this.isStreamingJob = isStreamingJob; @@ -98,8 +99,9 @@ protected TableEnvironment getTableEnv() { EnvironmentSettings.Builder settingsBuilder = EnvironmentSettings.newInstance(); if (isStreamingJob) { settingsBuilder.inStreamingMode(); - StreamExecutionEnvironment env = StreamExecutionEnvironment - .getExecutionEnvironment(MiniClusterResource.DISABLE_CLASSLOADER_CHECK_CONFIG); + StreamExecutionEnvironment env = + StreamExecutionEnvironment.getExecutionEnvironment( + MiniClusterResource.DISABLE_CLASSLOADER_CHECK_CONFIG); env.enableCheckpointing(400); env.setMaxParallelism(2); env.setParallelism(2); @@ -120,7 +122,9 @@ public void before() { sql("CREATE DATABASE %s", flinkDatabase); sql("USE CATALOG %s", catalogName); sql("USE %s", DATABASE); - sql("CREATE TABLE %s (id int, data varchar) with ('write.format.default'='%s')", TABLE_NAME, format.name()); + sql( + "CREATE TABLE %s (id int, data varchar) with ('write.format.default'='%s')", + TABLE_NAME, format.name()); icebergTable = validationCatalog.loadTable(TableIdentifier.of(icebergNamespace, TABLE_NAME)); } @@ -136,78 +140,86 @@ public void clean() { @Test public void testInsertFromSourceTable() throws Exception { // Register the rows into a temporary table. 
- getTableEnv().createTemporaryView("sourceTable", - getTableEnv().fromValues(SimpleDataUtil.FLINK_SCHEMA.toRowDataType(), - Expressions.row(1, "hello"), - Expressions.row(2, "world"), - Expressions.row(3, (String) null), - Expressions.row(null, "bar") - ) - ); + getTableEnv() + .createTemporaryView( + "sourceTable", + getTableEnv() + .fromValues( + SimpleDataUtil.FLINK_SCHEMA.toRowDataType(), + Expressions.row(1, "hello"), + Expressions.row(2, "world"), + Expressions.row(3, (String) null), + Expressions.row(null, "bar"))); // Redirect the records from source table to destination table. sql("INSERT INTO %s SELECT id,data from sourceTable", TABLE_NAME); // Assert the table records as expected. - SimpleDataUtil.assertTableRecords(icebergTable, Lists.newArrayList( - SimpleDataUtil.createRecord(1, "hello"), - SimpleDataUtil.createRecord(2, "world"), - SimpleDataUtil.createRecord(3, null), - SimpleDataUtil.createRecord(null, "bar") - )); + SimpleDataUtil.assertTableRecords( + icebergTable, + Lists.newArrayList( + SimpleDataUtil.createRecord(1, "hello"), + SimpleDataUtil.createRecord(2, "world"), + SimpleDataUtil.createRecord(3, null), + SimpleDataUtil.createRecord(null, "bar"))); } @Test public void testOverwriteTable() throws Exception { - Assume.assumeFalse("Flink unbounded streaming does not support overwrite operation", isStreamingJob); + Assume.assumeFalse( + "Flink unbounded streaming does not support overwrite operation", isStreamingJob); sql("INSERT INTO %s SELECT 1, 'a'", TABLE_NAME); - SimpleDataUtil.assertTableRecords(icebergTable, Lists.newArrayList( - SimpleDataUtil.createRecord(1, "a") - )); + SimpleDataUtil.assertTableRecords( + icebergTable, Lists.newArrayList(SimpleDataUtil.createRecord(1, "a"))); sql("INSERT OVERWRITE %s SELECT 2, 'b'", TABLE_NAME); - SimpleDataUtil.assertTableRecords(icebergTable, Lists.newArrayList( - SimpleDataUtil.createRecord(2, "b") - )); + SimpleDataUtil.assertTableRecords( + icebergTable, Lists.newArrayList(SimpleDataUtil.createRecord(2, "b"))); } @Test public void testReplacePartitions() throws Exception { - Assume.assumeFalse("Flink unbounded streaming does not support overwrite operation", isStreamingJob); + Assume.assumeFalse( + "Flink unbounded streaming does not support overwrite operation", isStreamingJob); String tableName = "test_partition"; - sql("CREATE TABLE %s(id INT, data VARCHAR) PARTITIONED BY (data) WITH ('write.format.default'='%s')", + sql( + "CREATE TABLE %s(id INT, data VARCHAR) PARTITIONED BY (data) WITH ('write.format.default'='%s')", tableName, format.name()); try { - Table partitionedTable = validationCatalog.loadTable(TableIdentifier.of(icebergNamespace, tableName)); + Table partitionedTable = + validationCatalog.loadTable(TableIdentifier.of(icebergNamespace, tableName)); sql("INSERT INTO %s SELECT 1, 'a'", tableName); sql("INSERT INTO %s SELECT 2, 'b'", tableName); sql("INSERT INTO %s SELECT 3, 'c'", tableName); - SimpleDataUtil.assertTableRecords(partitionedTable, Lists.newArrayList( - SimpleDataUtil.createRecord(1, "a"), - SimpleDataUtil.createRecord(2, "b"), - SimpleDataUtil.createRecord(3, "c") - )); + SimpleDataUtil.assertTableRecords( + partitionedTable, + Lists.newArrayList( + SimpleDataUtil.createRecord(1, "a"), + SimpleDataUtil.createRecord(2, "b"), + SimpleDataUtil.createRecord(3, "c"))); sql("INSERT OVERWRITE %s SELECT 4, 'b'", tableName); sql("INSERT OVERWRITE %s SELECT 5, 'a'", tableName); - SimpleDataUtil.assertTableRecords(partitionedTable, Lists.newArrayList( - SimpleDataUtil.createRecord(5, "a"), - 
SimpleDataUtil.createRecord(4, "b"), - SimpleDataUtil.createRecord(3, "c") - )); + SimpleDataUtil.assertTableRecords( + partitionedTable, + Lists.newArrayList( + SimpleDataUtil.createRecord(5, "a"), + SimpleDataUtil.createRecord(4, "b"), + SimpleDataUtil.createRecord(3, "c"))); sql("INSERT OVERWRITE %s PARTITION (data='a') SELECT 6", tableName); - SimpleDataUtil.assertTableRecords(partitionedTable, Lists.newArrayList( - SimpleDataUtil.createRecord(6, "a"), - SimpleDataUtil.createRecord(4, "b"), - SimpleDataUtil.createRecord(3, "c") - )); + SimpleDataUtil.assertTableRecords( + partitionedTable, + Lists.newArrayList( + SimpleDataUtil.createRecord(6, "a"), + SimpleDataUtil.createRecord(4, "b"), + SimpleDataUtil.createRecord(3, "c"))); } finally { sql("DROP TABLE IF EXISTS %s.%s", flinkDatabase, tableName); } @@ -216,34 +228,38 @@ public void testReplacePartitions() throws Exception { @Test public void testInsertIntoPartition() throws Exception { String tableName = "test_insert_into_partition"; - sql("CREATE TABLE %s(id INT, data VARCHAR) PARTITIONED BY (data) WITH ('write.format.default'='%s')", + sql( + "CREATE TABLE %s(id INT, data VARCHAR) PARTITIONED BY (data) WITH ('write.format.default'='%s')", tableName, format.name()); try { - Table partitionedTable = validationCatalog.loadTable(TableIdentifier.of(icebergNamespace, tableName)); + Table partitionedTable = + validationCatalog.loadTable(TableIdentifier.of(icebergNamespace, tableName)); // Full partition. sql("INSERT INTO %s PARTITION (data='a') SELECT 1", tableName); sql("INSERT INTO %s PARTITION (data='a') SELECT 2", tableName); sql("INSERT INTO %s PARTITION (data='b') SELECT 3", tableName); - SimpleDataUtil.assertTableRecords(partitionedTable, Lists.newArrayList( - SimpleDataUtil.createRecord(1, "a"), - SimpleDataUtil.createRecord(2, "a"), - SimpleDataUtil.createRecord(3, "b") - )); + SimpleDataUtil.assertTableRecords( + partitionedTable, + Lists.newArrayList( + SimpleDataUtil.createRecord(1, "a"), + SimpleDataUtil.createRecord(2, "a"), + SimpleDataUtil.createRecord(3, "b"))); // Partial partition. sql("INSERT INTO %s SELECT 4, 'c'", tableName); sql("INSERT INTO %s SELECT 5, 'd'", tableName); - SimpleDataUtil.assertTableRecords(partitionedTable, Lists.newArrayList( - SimpleDataUtil.createRecord(1, "a"), - SimpleDataUtil.createRecord(2, "a"), - SimpleDataUtil.createRecord(3, "b"), - SimpleDataUtil.createRecord(4, "c"), - SimpleDataUtil.createRecord(5, "d") - )); + SimpleDataUtil.assertTableRecords( + partitionedTable, + Lists.newArrayList( + SimpleDataUtil.createRecord(1, "a"), + SimpleDataUtil.createRecord(2, "a"), + SimpleDataUtil.createRecord(3, "b"), + SimpleDataUtil.createRecord(4, "c"), + SimpleDataUtil.createRecord(5, "d"))); } finally { sql("DROP TABLE IF EXISTS %s.%s", flinkDatabase, tableName); } @@ -252,34 +268,45 @@ public void testInsertIntoPartition() throws Exception { @Test public void testHashDistributeMode() throws Exception { String tableName = "test_hash_distribution_mode"; - Map tableProps = ImmutableMap.of( - "write.format.default", format.name(), - TableProperties.WRITE_DISTRIBUTION_MODE, DistributionMode.HASH.modeName() - ); + Map tableProps = + ImmutableMap.of( + "write.format.default", + format.name(), + TableProperties.WRITE_DISTRIBUTION_MODE, + DistributionMode.HASH.modeName()); // Initialize a BoundedSource table to precisely emit those rows in only one checkpoint. 
- List dataSet = IntStream.range(1, 1000) - .mapToObj(i -> ImmutableList.of(Row.of(i, "aaa"), Row.of(i, "bbb"), Row.of(i, "ccc"))) - .flatMap(List::stream) - .collect(Collectors.toList()); + List dataSet = + IntStream.range(1, 1000) + .mapToObj(i -> ImmutableList.of(Row.of(i, "aaa"), Row.of(i, "bbb"), Row.of(i, "ccc"))) + .flatMap(List::stream) + .collect(Collectors.toList()); String dataId = BoundedTableFactory.registerDataSet(ImmutableList.of(dataSet)); - sql("CREATE TABLE %s(id INT NOT NULL, data STRING NOT NULL)" + - " WITH ('connector'='BoundedSource', 'data-id'='%s')", SOURCE_TABLE, dataId); - Assert.assertEquals("Should have the expected rows in source table.", Sets.newHashSet(dataSet), + sql( + "CREATE TABLE %s(id INT NOT NULL, data STRING NOT NULL)" + + " WITH ('connector'='BoundedSource', 'data-id'='%s')", + SOURCE_TABLE, dataId); + Assert.assertEquals( + "Should have the expected rows in source table.", + Sets.newHashSet(dataSet), Sets.newHashSet(sql("SELECT * FROM %s", SOURCE_TABLE))); - sql("CREATE TABLE %s(id INT, data VARCHAR) PARTITIONED BY (data) WITH %s", + sql( + "CREATE TABLE %s(id INT, data VARCHAR) PARTITIONED BY (data) WITH %s", tableName, toWithClause(tableProps)); try { // Insert data set. sql("INSERT INTO %s SELECT * FROM %s", tableName, SOURCE_TABLE); - Assert.assertEquals("Should have the expected rows in sink table.", Sets.newHashSet(dataSet), + Assert.assertEquals( + "Should have the expected rows in sink table.", + Sets.newHashSet(dataSet), Sets.newHashSet(sql("SELECT * FROM %s", tableName))); // Sometimes we will have more than one checkpoint if we pass the auto checkpoint interval, - // thus producing multiple snapshots. Here we assert that each snapshot has only 1 file per partition. + // thus producing multiple snapshots. Here we assert that each snapshot has only 1 file per + // partition. 
Table table = validationCatalog.loadTable(TableIdentifier.of(icebergNamespace, tableName)); Map> snapshotToDataFiles = SimpleDataUtil.snapshotToDataFiles(table); for (List dataFiles : snapshotToDataFiles.values()) { @@ -287,12 +314,24 @@ public void testHashDistributeMode() throws Exception { continue; } - Assert.assertEquals("There should be 1 data file in partition 'aaa'", 1, - SimpleDataUtil.matchingPartitions(dataFiles, table.spec(), ImmutableMap.of("data", "aaa")).size()); - Assert.assertEquals("There should be 1 data file in partition 'bbb'", 1, - SimpleDataUtil.matchingPartitions(dataFiles, table.spec(), ImmutableMap.of("data", "bbb")).size()); - Assert.assertEquals("There should be 1 data file in partition 'ccc'", 1, - SimpleDataUtil.matchingPartitions(dataFiles, table.spec(), ImmutableMap.of("data", "ccc")).size()); + Assert.assertEquals( + "There should be 1 data file in partition 'aaa'", + 1, + SimpleDataUtil.matchingPartitions( + dataFiles, table.spec(), ImmutableMap.of("data", "aaa")) + .size()); + Assert.assertEquals( + "There should be 1 data file in partition 'bbb'", + 1, + SimpleDataUtil.matchingPartitions( + dataFiles, table.spec(), ImmutableMap.of("data", "bbb")) + .size()); + Assert.assertEquals( + "There should be 1 data file in partition 'ccc'", + 1, + SimpleDataUtil.matchingPartitions( + dataFiles, table.spec(), ImmutableMap.of("data", "ccc")) + .size()); } } finally { sql("DROP TABLE IF EXISTS %s.%s", flinkDatabase, tableName); diff --git a/flink/v1.14/flink/src/test/java/org/apache/iceberg/flink/TestFlinkTableSource.java b/flink/v1.14/flink/src/test/java/org/apache/iceberg/flink/TestFlinkTableSource.java index fe9c9d832a36..8f30f13db7e0 100644 --- a/flink/v1.14/flink/src/test/java/org/apache/iceberg/flink/TestFlinkTableSource.java +++ b/flink/v1.14/flink/src/test/java/org/apache/iceberg/flink/TestFlinkTableSource.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.flink; import java.io.File; @@ -52,18 +51,17 @@ public class TestFlinkTableSource extends FlinkTestBase { public TestFlinkTableSource() { // register a scan event listener to validate pushdown - Listeners.register(event -> { - scanEventCount += 1; - lastScanEvent = event; - }, ScanEvent.class); + Listeners.register( + event -> { + scanEventCount += 1; + lastScanEvent = event; + }, + ScanEvent.class); } @Override protected TableEnvironment getTableEnv() { - super.getTableEnv() - .getConfig() - .getConfiguration() - .set(CoreOptions.DEFAULT_PARALLELISM, 1); + super.getTableEnv().getConfig().getConfiguration().set(CoreOptions.DEFAULT_PARALLELISM, 1); return super.getTableEnv(); } @@ -77,14 +75,18 @@ public static void createWarehouse() throws IOException { @Before public void before() { - sql("CREATE CATALOG %s WITH ('type'='iceberg', 'catalog-type'='hadoop', 'warehouse'='%s')", CATALOG_NAME, - warehouse); + sql( + "CREATE CATALOG %s WITH ('type'='iceberg', 'catalog-type'='hadoop', 'warehouse'='%s')", + CATALOG_NAME, warehouse); sql("USE CATALOG %s", CATALOG_NAME); sql("CREATE DATABASE %s", DATABASE_NAME); sql("USE %s", DATABASE_NAME); - sql("CREATE TABLE %s (id INT, data VARCHAR,d DOUBLE) WITH ('write.format.default'='%s')", TABLE_NAME, - format.name()); - sql("INSERT INTO %s VALUES (1,'iceberg',10),(2,'b',20),(3,CAST(NULL AS VARCHAR),30)", TABLE_NAME); + sql( + "CREATE TABLE %s (id INT, data VARCHAR,d DOUBLE) WITH ('write.format.default'='%s')", + TABLE_NAME, format.name()); + sql( + "INSERT INTO %s VALUES (1,'iceberg',10),(2,'b',20),(3,CAST(NULL AS VARCHAR),30)", + TABLE_NAME); this.scanEventCount = 0; this.lastScanEvent = null; @@ -100,19 +102,19 @@ public void clean() { @Test public void testLimitPushDown() { - AssertHelpers.assertThrows("Invalid limit number: -1 ", SqlParserException.class, + AssertHelpers.assertThrows( + "Invalid limit number: -1 ", + SqlParserException.class, () -> sql("SELECT * FROM %s LIMIT -1", TABLE_NAME)); - Assert.assertEquals("Should have 0 record", 0, sql("SELECT * FROM %s LIMIT 0", TABLE_NAME).size()); + Assert.assertEquals( + "Should have 0 record", 0, sql("SELECT * FROM %s LIMIT 0", TABLE_NAME).size()); String sqlLimitExceed = String.format("SELECT * FROM %s LIMIT 4", TABLE_NAME); List resultExceed = sql(sqlLimitExceed); Assert.assertEquals("Should have 3 records", 3, resultExceed.size()); - List expectedList = Lists.newArrayList( - Row.of(1, "iceberg", 10.0), - Row.of(2, "b", 20.0), - Row.of(3, null, 30.0) - ); + List expectedList = + Lists.newArrayList(Row.of(1, "iceberg", 10.0), Row.of(2, "b", 20.0), Row.of(3, null, 30.0)); assertSameElements(expectedList, resultExceed); String querySql = String.format("SELECT * FROM %s LIMIT 1", TABLE_NAME); @@ -121,26 +123,24 @@ public void testLimitPushDown() { Assert.assertTrue("Explain should contain LimitPushDown", explain.contains(expectedExplain)); List result = sql(querySql); Assert.assertEquals("Should have 1 record", 1, result.size()); - Assertions.assertThat(result) - .containsAnyElementsOf(expectedList); + Assertions.assertThat(result).containsAnyElementsOf(expectedList); String sqlMixed = String.format("SELECT * FROM %s WHERE id = 1 LIMIT 2", TABLE_NAME); List mixedResult = sql(sqlMixed); Assert.assertEquals("Should have 1 record", 1, mixedResult.size()); - Assert.assertEquals("Should produce the expected records", Row.of(1, "iceberg", 10.0), mixedResult.get(0)); + Assert.assertEquals( + "Should produce the expected records", Row.of(1, "iceberg", 10.0), mixedResult.get(0)); } 
@Test public void testNoFilterPushDown() { String sql = String.format("SELECT * FROM %s ", TABLE_NAME); List result = sql(sql); - List expectedRecords = Lists.newArrayList( - Row.of(1, "iceberg", 10.0), - Row.of(2, "b", 20.0), - Row.of(3, null, 30.0) - ); + List expectedRecords = + Lists.newArrayList(Row.of(1, "iceberg", 10.0), Row.of(2, "b", 20.0), Row.of(3, null, 30.0)); assertSameElements(expectedRecords, result); - Assert.assertEquals("Should not push down a filter", Expressions.alwaysTrue(), lastScanEvent.filter()); + Assert.assertEquals( + "Should not push down a filter", Expressions.alwaysTrue(), lastScanEvent.filter()); } @Test @@ -150,10 +150,12 @@ public void testFilterPushDownEqual() { List result = sql(sqlLiteralRight); Assert.assertEquals("Should have 1 record", 1, result.size()); - Assert.assertEquals("Should produce the expected record", Row.of(1, "iceberg", 10.0), result.get(0)); + Assert.assertEquals( + "Should produce the expected record", Row.of(1, "iceberg", 10.0), result.get(0)); Assert.assertEquals("Should create only one scan", 1, scanEventCount); - Assert.assertEquals("Should contain the push down filter", expectedFilter, lastScanEvent.filter().toString()); + Assert.assertEquals( + "Should contain the push down filter", expectedFilter, lastScanEvent.filter().toString()); } @Test @@ -172,10 +174,12 @@ public void testFilterPushDownEqualLiteralOnLeft() { List resultLeft = sql(sqlLiteralLeft); Assert.assertEquals("Should have 1 record", 1, resultLeft.size()); - Assert.assertEquals("Should produce the expected record", Row.of(1, "iceberg", 10.0), resultLeft.get(0)); + Assert.assertEquals( + "Should produce the expected record", Row.of(1, "iceberg", 10.0), resultLeft.get(0)); Assert.assertEquals("Should create only one scan", 1, scanEventCount); - Assert.assertEquals("Should contain the push down filter", expectedFilter, lastScanEvent.filter().toString()); + Assert.assertEquals( + "Should contain the push down filter", expectedFilter, lastScanEvent.filter().toString()); } @Test @@ -186,13 +190,11 @@ public void testFilterPushDownNoEqual() { List resultNE = sql(sqlNE); Assert.assertEquals("Should have 2 records", 2, resultNE.size()); - List expectedNE = Lists.newArrayList( - Row.of(2, "b", 20.0), - Row.of(3, null, 30.0) - ); + List expectedNE = Lists.newArrayList(Row.of(2, "b", 20.0), Row.of(3, null, 30.0)); assertSameElements(expectedNE, resultNE); Assert.assertEquals("Should create only one scan", 1, scanEventCount); - Assert.assertEquals("Should contain the push down filter", expectedFilter, lastScanEvent.filter().toString()); + Assert.assertEquals( + "Should contain the push down filter", expectedFilter, lastScanEvent.filter().toString()); } @Test @@ -206,15 +208,18 @@ public void testFilterPushDownNoEqualNull() { @Test public void testFilterPushDownAnd() { - String sqlAnd = String.format("SELECT * FROM %s WHERE id = 1 AND data = 'iceberg' ", TABLE_NAME); + String sqlAnd = + String.format("SELECT * FROM %s WHERE id = 1 AND data = 'iceberg' ", TABLE_NAME); List resultAnd = sql(sqlAnd); Assert.assertEquals("Should have 1 record", 1, resultAnd.size()); - Assert.assertEquals("Should produce the expected record", Row.of(1, "iceberg", 10.0), resultAnd.get(0)); + Assert.assertEquals( + "Should produce the expected record", Row.of(1, "iceberg", 10.0), resultAnd.get(0)); Assert.assertEquals("Should create only one scan", 1, scanEventCount); String expected = "(ref(name=\"id\") == 1 and ref(name=\"data\") == \"iceberg\")"; - Assert.assertEquals("Should contain the push down 
filter", expected, lastScanEvent.filter().toString()); + Assert.assertEquals( + "Should contain the push down filter", expected, lastScanEvent.filter().toString()); } @Test @@ -225,14 +230,12 @@ public void testFilterPushDownOr() { List resultOr = sql(sqlOr); Assert.assertEquals("Should have 2 record", 2, resultOr.size()); - List expectedOR = Lists.newArrayList( - Row.of(1, "iceberg", 10.0), - Row.of(2, "b", 20.0) - ); + List expectedOR = Lists.newArrayList(Row.of(1, "iceberg", 10.0), Row.of(2, "b", 20.0)); assertSameElements(expectedOR, resultOr); Assert.assertEquals("Should create only one scan", 1, scanEventCount); - Assert.assertEquals("Should contain the push down filter", expectedFilter, lastScanEvent.filter().toString()); + Assert.assertEquals( + "Should contain the push down filter", expectedFilter, lastScanEvent.filter().toString()); } @Test @@ -243,14 +246,12 @@ public void testFilterPushDownGreaterThan() { List resultGT = sql(sqlGT); Assert.assertEquals("Should have 2 record", 2, resultGT.size()); - List expectedGT = Lists.newArrayList( - Row.of(2, "b", 20.0), - Row.of(3, null, 30.0) - ); + List expectedGT = Lists.newArrayList(Row.of(2, "b", 20.0), Row.of(3, null, 30.0)); assertSameElements(expectedGT, resultGT); Assert.assertEquals("Should create only one scan", 1, scanEventCount); - Assert.assertEquals("Should contain the push down filter", expectedFilter, lastScanEvent.filter().toString()); + Assert.assertEquals( + "Should contain the push down filter", expectedFilter, lastScanEvent.filter().toString()); } @Test @@ -270,14 +271,12 @@ public void testFilterPushDownGreaterThanLiteralOnLeft() { List resultGT = sql(sqlGT); Assert.assertEquals("Should have 2 records", 2, resultGT.size()); - List expectedGT = Lists.newArrayList( - Row.of(1, "iceberg", 10.0), - Row.of(2, "b", 20.0) - ); + List expectedGT = Lists.newArrayList(Row.of(1, "iceberg", 10.0), Row.of(2, "b", 20.0)); assertSameElements(expectedGT, resultGT); Assert.assertEquals("Should create only one scan", 1, scanEventCount); - Assert.assertEquals("Should contain the push down filter", expectedFilter, lastScanEvent.filter().toString()); + Assert.assertEquals( + "Should contain the push down filter", expectedFilter, lastScanEvent.filter().toString()); } @Test @@ -288,14 +287,12 @@ public void testFilterPushDownGreaterThanEqual() { List resultGTE = sql(sqlGTE); Assert.assertEquals("Should have 2 records", 2, resultGTE.size()); - List expectedGTE = Lists.newArrayList( - Row.of(2, "b", 20.0), - Row.of(3, null, 30.0) - ); + List expectedGTE = Lists.newArrayList(Row.of(2, "b", 20.0), Row.of(3, null, 30.0)); assertSameElements(expectedGTE, resultGTE); Assert.assertEquals("Should create only one scan", 1, scanEventCount); - Assert.assertEquals("Should contain the push down filter", expectedFilter, lastScanEvent.filter().toString()); + Assert.assertEquals( + "Should contain the push down filter", expectedFilter, lastScanEvent.filter().toString()); } @Test @@ -315,14 +312,12 @@ public void testFilterPushDownGreaterThanEqualLiteralOnLeft() { List resultGTE = sql(sqlGTE); Assert.assertEquals("Should have 2 records", 2, resultGTE.size()); - List expectedGTE = Lists.newArrayList( - Row.of(1, "iceberg", 10.0), - Row.of(2, "b", 20.0) - ); + List expectedGTE = Lists.newArrayList(Row.of(1, "iceberg", 10.0), Row.of(2, "b", 20.0)); assertSameElements(expectedGTE, resultGTE); Assert.assertEquals("Should create only one scan", 1, scanEventCount); - Assert.assertEquals("Should contain the push down filter", expectedFilter, 
lastScanEvent.filter().toString()); + Assert.assertEquals( + "Should contain the push down filter", expectedFilter, lastScanEvent.filter().toString()); } @Test @@ -332,10 +327,12 @@ public void testFilterPushDownLessThan() { List resultLT = sql(sqlLT); Assert.assertEquals("Should have 1 record", 1, resultLT.size()); - Assert.assertEquals("Should produce the expected record", Row.of(1, "iceberg", 10.0), resultLT.get(0)); + Assert.assertEquals( + "Should produce the expected record", Row.of(1, "iceberg", 10.0), resultLT.get(0)); Assert.assertEquals("Should create only one scan", 1, scanEventCount); - Assert.assertEquals("Should contain the push down filter", expectedFilter, lastScanEvent.filter().toString()); + Assert.assertEquals( + "Should contain the push down filter", expectedFilter, lastScanEvent.filter().toString()); } @Test @@ -354,10 +351,12 @@ public void testFilterPushDownLessThanLiteralOnLeft() { List resultLT = sql(sqlLT); Assert.assertEquals("Should have 1 record", 1, resultLT.size()); - Assert.assertEquals("Should produce the expected record", Row.of(3, null, 30.0), resultLT.get(0)); + Assert.assertEquals( + "Should produce the expected record", Row.of(3, null, 30.0), resultLT.get(0)); Assert.assertEquals("Should create only one scan", 1, scanEventCount); - Assert.assertEquals("Should contain the push down filter", expectedFilter, lastScanEvent.filter().toString()); + Assert.assertEquals( + "Should contain the push down filter", expectedFilter, lastScanEvent.filter().toString()); } @Test @@ -367,10 +366,12 @@ public void testFilterPushDownLessThanEqual() { List resultLTE = sql(sqlLTE); Assert.assertEquals("Should have 1 record", 1, resultLTE.size()); - Assert.assertEquals("Should produce the expected record", Row.of(1, "iceberg", 10.0), resultLTE.get(0)); + Assert.assertEquals( + "Should produce the expected record", Row.of(1, "iceberg", 10.0), resultLTE.get(0)); Assert.assertEquals("Should create only one scan", 1, scanEventCount); - Assert.assertEquals("Should contain the push down filter", expectedFilter, lastScanEvent.filter().toString()); + Assert.assertEquals( + "Should contain the push down filter", expectedFilter, lastScanEvent.filter().toString()); } @Test @@ -389,10 +390,12 @@ public void testFilterPushDownLessThanEqualLiteralOnLeft() { List resultLTE = sql(sqlLTE); Assert.assertEquals("Should have 1 record", 1, resultLTE.size()); - Assert.assertEquals("Should produce the expected record", Row.of(3, null, 30.0), resultLTE.get(0)); + Assert.assertEquals( + "Should produce the expected record", Row.of(3, null, 30.0), resultLTE.get(0)); Assert.assertEquals("Should create only one scan", 1, scanEventCount); - Assert.assertEquals("Should contain the push down filter", expectedFilter, lastScanEvent.filter().toString()); + Assert.assertEquals( + "Should contain the push down filter", expectedFilter, lastScanEvent.filter().toString()); } @Test @@ -402,23 +405,24 @@ public void testFilterPushDownIn() { List resultIN = sql(sqlIN); Assert.assertEquals("Should have 2 records", 2, resultIN.size()); - List expectedIN = Lists.newArrayList( - Row.of(1, "iceberg", 10.0), - Row.of(2, "b", 20.0) - ); + List expectedIN = Lists.newArrayList(Row.of(1, "iceberg", 10.0), Row.of(2, "b", 20.0)); assertSameElements(expectedIN, resultIN); Assert.assertEquals("Should create only one scan", 1, scanEventCount); - Assert.assertEquals("Should contain the push down filter", expectedFilter, lastScanEvent.filter().toString()); + Assert.assertEquals( + "Should contain the push down filter", 
expectedFilter, lastScanEvent.filter().toString()); } @Test public void testFilterPushDownInNull() { - String sqlInNull = String.format("SELECT * FROM %s WHERE data IN ('iceberg',NULL) ", TABLE_NAME); + String sqlInNull = + String.format("SELECT * FROM %s WHERE data IN ('iceberg',NULL) ", TABLE_NAME); List result = sql(sqlInNull); Assert.assertEquals("Should have 1 record", 1, result.size()); - Assert.assertEquals("Should produce the expected record", Row.of(1, "iceberg", 10.0), result.get(0)); - Assert.assertEquals("Should not push down a filter", Expressions.alwaysTrue(), lastScanEvent.filter()); + Assert.assertEquals( + "Should produce the expected record", Row.of(1, "iceberg", 10.0), result.get(0)); + Assert.assertEquals( + "Should not push down a filter", Expressions.alwaysTrue(), lastScanEvent.filter()); } @Test @@ -427,10 +431,12 @@ public void testFilterPushDownNotIn() { List resultNotIn = sql(sqlNotIn); Assert.assertEquals("Should have 1 record", 1, resultNotIn.size()); - Assert.assertEquals("Should produce the expected record", Row.of(1, "iceberg", 10.0), resultNotIn.get(0)); + Assert.assertEquals( + "Should produce the expected record", Row.of(1, "iceberg", 10.0), resultNotIn.get(0)); Assert.assertEquals("Should create only one scan", 1, scanEventCount); String expectedScan = "(ref(name=\"id\") != 2 and ref(name=\"id\") != 3)"; - Assert.assertEquals("Should contain the push down filter", expectedScan, lastScanEvent.filter().toString()); + Assert.assertEquals( + "Should contain the push down filter", expectedScan, lastScanEvent.filter().toString()); } @Test @@ -438,7 +444,8 @@ public void testFilterPushDownNotInNull() { String sqlNotInNull = String.format("SELECT * FROM %s WHERE id NOT IN (1,2,NULL) ", TABLE_NAME); List resultGT = sql(sqlNotInNull); Assert.assertEquals("Should have 0 record", 0, resultGT.size()); - Assert.assertEquals("Should not push down a filter", Expressions.alwaysTrue(), lastScanEvent.filter()); + Assert.assertEquals( + "Should not push down a filter", Expressions.alwaysTrue(), lastScanEvent.filter()); } @Test @@ -449,14 +456,12 @@ public void testFilterPushDownIsNotNull() { List resultNotNull = sql(sqlNotNull); Assert.assertEquals("Should have 2 record", 2, resultNotNull.size()); - List expected = Lists.newArrayList( - Row.of(1, "iceberg", 10.0), - Row.of(2, "b", 20.0) - ); + List expected = Lists.newArrayList(Row.of(1, "iceberg", 10.0), Row.of(2, "b", 20.0)); assertSameElements(expected, resultNotNull); Assert.assertEquals("Should create only one scan", 1, scanEventCount); - Assert.assertEquals("Should contain the push down filter", expectedFilter, lastScanEvent.filter().toString()); + Assert.assertEquals( + "Should contain the push down filter", expectedFilter, lastScanEvent.filter().toString()); } @Test @@ -466,10 +471,12 @@ public void testFilterPushDownIsNull() { List resultNull = sql(sqlNull); Assert.assertEquals("Should have 1 record", 1, resultNull.size()); - Assert.assertEquals("Should produce the expected record", Row.of(3, null, 30.0), resultNull.get(0)); + Assert.assertEquals( + "Should produce the expected record", Row.of(3, null, 30.0), resultNull.get(0)); Assert.assertEquals("Should create only one scan", 1, scanEventCount); - Assert.assertEquals("Should contain the push down filter", expectedFilter, lastScanEvent.filter().toString()); + Assert.assertEquals( + "Should contain the push down filter", expectedFilter, lastScanEvent.filter().toString()); } @Test @@ -478,11 +485,13 @@ public void testFilterPushDownNot() { List resultNot = 
sql(sqlNot); Assert.assertEquals("Should have 1 record", 1, resultNot.size()); - Assert.assertEquals("Should produce the expected record", Row.of(3, null, 30.0), resultNot.get(0)); + Assert.assertEquals( + "Should produce the expected record", Row.of(3, null, 30.0), resultNot.get(0)); Assert.assertEquals("Should create only one scan", 1, scanEventCount); String expectedFilter = "(ref(name=\"id\") != 1 and ref(name=\"id\") != 2)"; - Assert.assertEquals("Should contain the push down filter", expectedFilter, lastScanEvent.filter().toString()); + Assert.assertEquals( + "Should contain the push down filter", expectedFilter, lastScanEvent.filter().toString()); } @Test @@ -492,28 +501,30 @@ public void testFilterPushDownBetween() { List resultBetween = sql(sqlBetween); Assert.assertEquals("Should have 2 record", 2, resultBetween.size()); - List expectedBetween = Lists.newArrayList( - Row.of(1, "iceberg", 10.0), - Row.of(2, "b", 20.0) - ); + List expectedBetween = + Lists.newArrayList(Row.of(1, "iceberg", 10.0), Row.of(2, "b", 20.0)); assertSameElements(expectedBetween, resultBetween); Assert.assertEquals("Should create only one scan", 1, scanEventCount); String expected = "(ref(name=\"id\") >= 1 and ref(name=\"id\") <= 2)"; - Assert.assertEquals("Should contain the push down filter", expected, lastScanEvent.filter().toString()); + Assert.assertEquals( + "Should contain the push down filter", expected, lastScanEvent.filter().toString()); } @Test public void testFilterPushDownNotBetween() { - String sqlNotBetween = String.format("SELECT * FROM %s WHERE id NOT BETWEEN 2 AND 3 ", TABLE_NAME); + String sqlNotBetween = + String.format("SELECT * FROM %s WHERE id NOT BETWEEN 2 AND 3 ", TABLE_NAME); String expectedFilter = "(ref(name=\"id\") < 2 or ref(name=\"id\") > 3)"; List resultNotBetween = sql(sqlNotBetween); Assert.assertEquals("Should have 1 record", 1, resultNotBetween.size()); - Assert.assertEquals("Should produce the expected record", Row.of(1, "iceberg", 10.0), resultNotBetween.get(0)); + Assert.assertEquals( + "Should produce the expected record", Row.of(1, "iceberg", 10.0), resultNotBetween.get(0)); Assert.assertEquals("Should create only one scan", 1, scanEventCount); - Assert.assertEquals("Should contain the push down filter", expectedFilter, lastScanEvent.filter().toString()); + Assert.assertEquals( + "Should contain the push down filter", expectedFilter, lastScanEvent.filter().toString()); } @Test @@ -523,10 +534,13 @@ public void testFilterPushDownLike() { String sqlLike = "SELECT * FROM " + TABLE_NAME + " WHERE data LIKE 'ice%%' "; List resultLike = sql(sqlLike); Assert.assertEquals("Should have 1 record", 1, resultLike.size()); - Assert.assertEquals("The like result should produce the expected record", - Row.of(1, "iceberg", 10.0), resultLike.get(0)); + Assert.assertEquals( + "The like result should produce the expected record", + Row.of(1, "iceberg", 10.0), + resultLike.get(0)); Assert.assertEquals("Should create only one scan", 1, scanEventCount); - Assert.assertEquals("Should contain the push down filter", expectedFilter, lastScanEvent.filter().toString()); + Assert.assertEquals( + "Should contain the push down filter", expectedFilter, lastScanEvent.filter().toString()); } @Test @@ -535,85 +549,105 @@ public void testFilterNotPushDownLike() { String sqlNoPushDown = "SELECT * FROM " + TABLE_NAME + " WHERE data LIKE '%%i' "; List resultLike = sql(sqlNoPushDown); Assert.assertEquals("Should have 1 record", 0, resultLike.size()); - Assert.assertEquals("Should not push down a filter", 
Expressions.alwaysTrue(), lastScanEvent.filter()); + Assert.assertEquals( + "Should not push down a filter", Expressions.alwaysTrue(), lastScanEvent.filter()); sqlNoPushDown = "SELECT * FROM " + TABLE_NAME + " WHERE data LIKE '%%i%%' "; resultLike = sql(sqlNoPushDown); Assert.assertEquals("Should have 1 record", 1, resultLike.size()); Assert.assertEquals("Should produce the expected record", expectRecord, resultLike.get(0)); - Assert.assertEquals("Should not push down a filter", Expressions.alwaysTrue(), lastScanEvent.filter()); + Assert.assertEquals( + "Should not push down a filter", Expressions.alwaysTrue(), lastScanEvent.filter()); sqlNoPushDown = "SELECT * FROM " + TABLE_NAME + " WHERE data LIKE '%%ice%%g' "; resultLike = sql(sqlNoPushDown); Assert.assertEquals("Should have 1 record", 1, resultLike.size()); Assert.assertEquals("Should produce the expected record", expectRecord, resultLike.get(0)); - Assert.assertEquals("Should not push down a filter", Expressions.alwaysTrue(), lastScanEvent.filter()); + Assert.assertEquals( + "Should not push down a filter", Expressions.alwaysTrue(), lastScanEvent.filter()); sqlNoPushDown = "SELECT * FROM " + TABLE_NAME + " WHERE data LIKE '%%' "; resultLike = sql(sqlNoPushDown); Assert.assertEquals("Should have 3 records", 3, resultLike.size()); - List expectedRecords = Lists.newArrayList( - Row.of(1, "iceberg", 10.0), - Row.of(2, "b", 20.0), - Row.of(3, null, 30.0) - ); + List expectedRecords = + Lists.newArrayList(Row.of(1, "iceberg", 10.0), Row.of(2, "b", 20.0), Row.of(3, null, 30.0)); assertSameElements(expectedRecords, resultLike); - Assert.assertEquals("Should not push down a filter", Expressions.alwaysTrue(), lastScanEvent.filter()); + Assert.assertEquals( + "Should not push down a filter", Expressions.alwaysTrue(), lastScanEvent.filter()); sqlNoPushDown = "SELECT * FROM " + TABLE_NAME + " WHERE data LIKE 'iceber_' "; resultLike = sql(sqlNoPushDown); Assert.assertEquals("Should have 1 record", 1, resultLike.size()); Assert.assertEquals("Should produce the expected record", expectRecord, resultLike.get(0)); - Assert.assertEquals("Should not push down a filter", Expressions.alwaysTrue(), lastScanEvent.filter()); + Assert.assertEquals( + "Should not push down a filter", Expressions.alwaysTrue(), lastScanEvent.filter()); sqlNoPushDown = "SELECT * FROM " + TABLE_NAME + " WHERE data LIKE 'i%%g' "; resultLike = sql(sqlNoPushDown); Assert.assertEquals("Should have 1 record", 1, resultLike.size()); Assert.assertEquals("Should produce the expected record", expectRecord, resultLike.get(0)); - Assert.assertEquals("Should not push down a filter", Expressions.alwaysTrue(), lastScanEvent.filter()); + Assert.assertEquals( + "Should not push down a filter", Expressions.alwaysTrue(), lastScanEvent.filter()); } @Test public void testFilterPushDown2Literal() { String sql2Literal = String.format("SELECT * FROM %s WHERE 1 > 0 ", TABLE_NAME); List result = sql(sql2Literal); - List expectedRecords = Lists.newArrayList( - Row.of(1, "iceberg", 10.0), - Row.of(2, "b", 20.0), - Row.of(3, null, 30.0) - ); + List expectedRecords = + Lists.newArrayList(Row.of(1, "iceberg", 10.0), Row.of(2, "b", 20.0), Row.of(3, null, 30.0)); assertSameElements(expectedRecords, result); - Assert.assertEquals("Should not push down a filter", Expressions.alwaysTrue(), lastScanEvent.filter()); + Assert.assertEquals( + "Should not push down a filter", Expressions.alwaysTrue(), lastScanEvent.filter()); } /** - * NaN is not supported by flink now, so we add the test case to assert the parse error, 
when we upgrade the flink - * that supports NaN, we will delele the method, and add some test case to test NaN. + * NaN is not supported by flink now, so we add the test case to assert the parse error, when we + * upgrade the flink that supports NaN, we will delele the method, and add some test case to test + * NaN. */ @Test public void testSqlParseError() { - String sqlParseErrorEqual = String.format("SELECT * FROM %s WHERE d = CAST('NaN' AS DOUBLE) ", TABLE_NAME); - AssertHelpers.assertThrows("The NaN is not supported by flink now. ", - NumberFormatException.class, () -> sql(sqlParseErrorEqual)); - - String sqlParseErrorNotEqual = String.format("SELECT * FROM %s WHERE d <> CAST('NaN' AS DOUBLE) ", TABLE_NAME); - AssertHelpers.assertThrows("The NaN is not supported by flink now. ", - NumberFormatException.class, () -> sql(sqlParseErrorNotEqual)); - - String sqlParseErrorGT = String.format("SELECT * FROM %s WHERE d > CAST('NaN' AS DOUBLE) ", TABLE_NAME); - AssertHelpers.assertThrows("The NaN is not supported by flink now. ", - NumberFormatException.class, () -> sql(sqlParseErrorGT)); - - String sqlParseErrorLT = String.format("SELECT * FROM %s WHERE d < CAST('NaN' AS DOUBLE) ", TABLE_NAME); - AssertHelpers.assertThrows("The NaN is not supported by flink now. ", - NumberFormatException.class, () -> sql(sqlParseErrorLT)); - - String sqlParseErrorGTE = String.format("SELECT * FROM %s WHERE d >= CAST('NaN' AS DOUBLE) ", TABLE_NAME); - AssertHelpers.assertThrows("The NaN is not supported by flink now. ", - NumberFormatException.class, () -> sql(sqlParseErrorGTE)); - - String sqlParseErrorLTE = String.format("SELECT * FROM %s WHERE d <= CAST('NaN' AS DOUBLE) ", TABLE_NAME); - AssertHelpers.assertThrows("The NaN is not supported by flink now. ", - NumberFormatException.class, () -> sql(sqlParseErrorLTE)); + String sqlParseErrorEqual = + String.format("SELECT * FROM %s WHERE d = CAST('NaN' AS DOUBLE) ", TABLE_NAME); + AssertHelpers.assertThrows( + "The NaN is not supported by flink now. ", + NumberFormatException.class, + () -> sql(sqlParseErrorEqual)); + + String sqlParseErrorNotEqual = + String.format("SELECT * FROM %s WHERE d <> CAST('NaN' AS DOUBLE) ", TABLE_NAME); + AssertHelpers.assertThrows( + "The NaN is not supported by flink now. ", + NumberFormatException.class, + () -> sql(sqlParseErrorNotEqual)); + + String sqlParseErrorGT = + String.format("SELECT * FROM %s WHERE d > CAST('NaN' AS DOUBLE) ", TABLE_NAME); + AssertHelpers.assertThrows( + "The NaN is not supported by flink now. ", + NumberFormatException.class, + () -> sql(sqlParseErrorGT)); + + String sqlParseErrorLT = + String.format("SELECT * FROM %s WHERE d < CAST('NaN' AS DOUBLE) ", TABLE_NAME); + AssertHelpers.assertThrows( + "The NaN is not supported by flink now. ", + NumberFormatException.class, + () -> sql(sqlParseErrorLT)); + + String sqlParseErrorGTE = + String.format("SELECT * FROM %s WHERE d >= CAST('NaN' AS DOUBLE) ", TABLE_NAME); + AssertHelpers.assertThrows( + "The NaN is not supported by flink now. ", + NumberFormatException.class, + () -> sql(sqlParseErrorGTE)); + + String sqlParseErrorLTE = + String.format("SELECT * FROM %s WHERE d <= CAST('NaN' AS DOUBLE) ", TABLE_NAME); + AssertHelpers.assertThrows( + "The NaN is not supported by flink now. 
", + NumberFormatException.class, + () -> sql(sqlParseErrorLTE)); } } diff --git a/flink/v1.14/flink/src/test/java/org/apache/iceberg/flink/TestFlinkUpsert.java b/flink/v1.14/flink/src/test/java/org/apache/iceberg/flink/TestFlinkUpsert.java index f0afac17a9e7..d4b93bc9d4a2 100644 --- a/flink/v1.14/flink/src/test/java/org/apache/iceberg/flink/TestFlinkUpsert.java +++ b/flink/v1.14/flink/src/test/java/org/apache/iceberg/flink/TestFlinkUpsert.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.flink; import java.time.LocalDate; @@ -49,14 +48,14 @@ public class TestFlinkUpsert extends FlinkCatalogTestBase { public static final MiniClusterWithClientResource MINI_CLUSTER_RESOURCE = MiniClusterResource.createWithClassloaderCheckDisabled(); - @ClassRule - public static final TemporaryFolder TEMPORARY_FOLDER = new TemporaryFolder(); + @ClassRule public static final TemporaryFolder TEMPORARY_FOLDER = new TemporaryFolder(); private final boolean isStreamingJob; private final Map tableUpsertProps = Maps.newHashMap(); private TableEnvironment tEnv; - public TestFlinkUpsert(String catalogName, Namespace baseNamespace, FileFormat format, Boolean isStreamingJob) { + public TestFlinkUpsert( + String catalogName, Namespace baseNamespace, FileFormat format, Boolean isStreamingJob) { super(catalogName, baseNamespace); this.isStreamingJob = isStreamingJob; tableUpsertProps.put(TableProperties.FORMAT_VERSION, "2"); @@ -64,13 +63,16 @@ public TestFlinkUpsert(String catalogName, Namespace baseNamespace, FileFormat f tableUpsertProps.put(TableProperties.DEFAULT_FILE_FORMAT, format.name()); } - @Parameterized.Parameters(name = "catalogName={0}, baseNamespace={1}, format={2}, isStreaming={3}") + @Parameterized.Parameters( + name = "catalogName={0}, baseNamespace={1}, format={2}, isStreaming={3}") public static Iterable parameters() { List parameters = Lists.newArrayList(); - for (FileFormat format : new FileFormat[] {FileFormat.PARQUET, FileFormat.AVRO, FileFormat.ORC}) { + for (FileFormat format : + new FileFormat[] {FileFormat.PARQUET, FileFormat.AVRO, FileFormat.ORC}) { for (Boolean isStreaming : new Boolean[] {true, false}) { // Only test with one catalog as this is a file operation concern. - // FlinkCatalogTestBase requires the catalog name start with testhadoop if using hadoop catalog. + // FlinkCatalogTestBase requires the catalog name start with testhadoop if using hadoop + // catalog. 
String catalogName = "testhadoop"; Namespace baseNamespace = Namespace.of("default"); parameters.add(new Object[] {catalogName, baseNamespace, format, isStreaming}); @@ -83,12 +85,12 @@ public static Iterable parameters() { protected TableEnvironment getTableEnv() { if (tEnv == null) { synchronized (this) { - EnvironmentSettings.Builder settingsBuilder = EnvironmentSettings - .newInstance(); + EnvironmentSettings.Builder settingsBuilder = EnvironmentSettings.newInstance(); if (isStreamingJob) { settingsBuilder.inStreamingMode(); - StreamExecutionEnvironment env = StreamExecutionEnvironment - .getExecutionEnvironment(MiniClusterResource.DISABLE_CLASSLOADER_CHECK_CONFIG); + StreamExecutionEnvironment env = + StreamExecutionEnvironment.getExecutionEnvironment( + MiniClusterResource.DISABLE_CLASSLOADER_CHECK_CONFIG); env.enableCheckpointing(400); env.setMaxParallelism(2); env.setParallelism(2); @@ -124,33 +126,36 @@ public void testUpsertAndQuery() { LocalDate dt20220301 = LocalDate.of(2022, 3, 1); LocalDate dt20220302 = LocalDate.of(2022, 3, 2); - sql("CREATE TABLE %s(id INT NOT NULL, province STRING NOT NULL, dt DATE, PRIMARY KEY(id,province) NOT ENFORCED) " + - "PARTITIONED BY (province) WITH %s", + sql( + "CREATE TABLE %s(id INT NOT NULL, province STRING NOT NULL, dt DATE, PRIMARY KEY(id,province) NOT ENFORCED) " + + "PARTITIONED BY (province) WITH %s", tableName, toWithClause(tableUpsertProps)); try { - sql("INSERT INTO %s VALUES " + - "(1, 'a', DATE '2022-03-01')," + - "(2, 'b', DATE '2022-03-01')," + - "(1, 'b', DATE '2022-03-01')", + sql( + "INSERT INTO %s VALUES " + + "(1, 'a', DATE '2022-03-01')," + + "(2, 'b', DATE '2022-03-01')," + + "(1, 'b', DATE '2022-03-01')", tableName); - sql("INSERT INTO %s VALUES " + - "(4, 'a', DATE '2022-03-02')," + - "(5, 'b', DATE '2022-03-02')," + - "(1, 'b', DATE '2022-03-02')", + sql( + "INSERT INTO %s VALUES " + + "(4, 'a', DATE '2022-03-02')," + + "(5, 'b', DATE '2022-03-02')," + + "(1, 'b', DATE '2022-03-02')", tableName); - List rowsOn20220301 = Lists.newArrayList(Row.of(2, "b", dt20220301), Row.of(1, "a", dt20220301)); + List rowsOn20220301 = + Lists.newArrayList(Row.of(2, "b", dt20220301), Row.of(1, "a", dt20220301)); TestHelpers.assertRows( - sql("SELECT * FROM %s WHERE dt < '2022-03-02'", tableName), - rowsOn20220301); + sql("SELECT * FROM %s WHERE dt < '2022-03-02'", tableName), rowsOn20220301); - List rowsOn20220302 = Lists.newArrayList( - Row.of(1, "b", dt20220302), Row.of(4, "a", dt20220302), Row.of(5, "b", dt20220302)); + List rowsOn20220302 = + Lists.newArrayList( + Row.of(1, "b", dt20220302), Row.of(4, "a", dt20220302), Row.of(5, "b", dt20220302)); TestHelpers.assertRows( - sql("SELECT * FROM %s WHERE dt = '2022-03-02'", tableName), - rowsOn20220302); + sql("SELECT * FROM %s WHERE dt = '2022-03-02'", tableName), rowsOn20220302); TestHelpers.assertRows( sql("SELECT * FROM %s", tableName), @@ -165,33 +170,24 @@ public void testPrimaryKeyEqualToPartitionKey() { // This is an SQL based reproduction of TestFlinkIcebergSinkV2#testUpsertOnDataKey String tableName = "upsert_on_data_key"; try { - sql("CREATE TABLE %s(id INT NOT NULL, data STRING NOT NULL, PRIMARY KEY(data) NOT ENFORCED) " + - "PARTITIONED BY (data) WITH %s", + sql( + "CREATE TABLE %s(id INT NOT NULL, data STRING NOT NULL, PRIMARY KEY(data) NOT ENFORCED) " + + "PARTITIONED BY (data) WITH %s", tableName, toWithClause(tableUpsertProps)); - sql("INSERT INTO %s VALUES " + - "(1, 'aaa')," + - "(2, 'aaa')," + - "(3, 'bbb')", - tableName); + sql("INSERT INTO %s VALUES " + "(1, 'aaa')," 
+ "(2, 'aaa')," + "(3, 'bbb')", tableName); TestHelpers.assertRows( sql("SELECT * FROM %s", tableName), Lists.newArrayList(Row.of(2, "aaa"), Row.of(3, "bbb"))); - sql("INSERT INTO %s VALUES " + - "(4, 'aaa')," + - "(5, 'bbb')", - tableName); + sql("INSERT INTO %s VALUES " + "(4, 'aaa')," + "(5, 'bbb')", tableName); TestHelpers.assertRows( sql("SELECT * FROM %s", tableName), Lists.newArrayList(Row.of(4, "aaa"), Row.of(5, "bbb"))); - sql("INSERT INTO %s VALUES " + - "(6, 'aaa')," + - "(7, 'bbb')", - tableName); + sql("INSERT INTO %s VALUES " + "(6, 'aaa')," + "(7, 'bbb')", tableName); TestHelpers.assertRows( sql("SELECT * FROM %s", tableName), @@ -206,32 +202,36 @@ public void testPrimaryKeyFieldsAtBeginningOfSchema() { String tableName = "upsert_on_pk_at_schema_start"; LocalDate dt = LocalDate.of(2022, 3, 1); try { - sql("CREATE TABLE %s(data STRING NOT NULL, dt DATE NOT NULL, id INT, PRIMARY KEY(data,dt) NOT ENFORCED) " + - "PARTITIONED BY (data) WITH %s", + sql( + "CREATE TABLE %s(data STRING NOT NULL, dt DATE NOT NULL, id INT, PRIMARY KEY(data,dt) NOT ENFORCED) " + + "PARTITIONED BY (data) WITH %s", tableName, toWithClause(tableUpsertProps)); - sql("INSERT INTO %s VALUES " + - "('aaa', DATE '2022-03-01', 1)," + - "('aaa', DATE '2022-03-01', 2)," + - "('bbb', DATE '2022-03-01', 3)", + sql( + "INSERT INTO %s VALUES " + + "('aaa', DATE '2022-03-01', 1)," + + "('aaa', DATE '2022-03-01', 2)," + + "('bbb', DATE '2022-03-01', 3)", tableName); TestHelpers.assertRows( sql("SELECT * FROM %s", tableName), Lists.newArrayList(Row.of("aaa", dt, 2), Row.of("bbb", dt, 3))); - sql("INSERT INTO %s VALUES " + - "('aaa', DATE '2022-03-01', 4)," + - "('bbb', DATE '2022-03-01', 5)", + sql( + "INSERT INTO %s VALUES " + + "('aaa', DATE '2022-03-01', 4)," + + "('bbb', DATE '2022-03-01', 5)", tableName); TestHelpers.assertRows( sql("SELECT * FROM %s", tableName), Lists.newArrayList(Row.of("aaa", dt, 4), Row.of("bbb", dt, 5))); - sql("INSERT INTO %s VALUES " + - "('aaa', DATE '2022-03-01', 6)," + - "('bbb', DATE '2022-03-01', 7)", + sql( + "INSERT INTO %s VALUES " + + "('aaa', DATE '2022-03-01', 6)," + + "('bbb', DATE '2022-03-01', 7)", tableName); TestHelpers.assertRows( @@ -244,37 +244,42 @@ public void testPrimaryKeyFieldsAtBeginningOfSchema() { @Test public void testPrimaryKeyFieldsAtEndOfTableSchema() { - // This is the same test case as testPrimaryKeyFieldsAtBeginningOfSchema, but the primary key fields + // This is the same test case as testPrimaryKeyFieldsAtBeginningOfSchema, but the primary key + // fields // are located at the end of the flink schema. 
String tableName = "upsert_on_pk_at_schema_end"; LocalDate dt = LocalDate.of(2022, 3, 1); try { - sql("CREATE TABLE %s(id INT, data STRING NOT NULL, dt DATE NOT NULL, PRIMARY KEY(data,dt) NOT ENFORCED) " + - "PARTITIONED BY (data) WITH %s", + sql( + "CREATE TABLE %s(id INT, data STRING NOT NULL, dt DATE NOT NULL, PRIMARY KEY(data,dt) NOT ENFORCED) " + + "PARTITIONED BY (data) WITH %s", tableName, toWithClause(tableUpsertProps)); - sql("INSERT INTO %s VALUES " + - "(1, 'aaa', DATE '2022-03-01')," + - "(2, 'aaa', DATE '2022-03-01')," + - "(3, 'bbb', DATE '2022-03-01')", + sql( + "INSERT INTO %s VALUES " + + "(1, 'aaa', DATE '2022-03-01')," + + "(2, 'aaa', DATE '2022-03-01')," + + "(3, 'bbb', DATE '2022-03-01')", tableName); TestHelpers.assertRows( sql("SELECT * FROM %s", tableName), Lists.newArrayList(Row.of(2, "aaa", dt), Row.of(3, "bbb", dt))); - sql("INSERT INTO %s VALUES " + - "(4, 'aaa', DATE '2022-03-01')," + - "(5, 'bbb', DATE '2022-03-01')", + sql( + "INSERT INTO %s VALUES " + + "(4, 'aaa', DATE '2022-03-01')," + + "(5, 'bbb', DATE '2022-03-01')", tableName); TestHelpers.assertRows( sql("SELECT * FROM %s", tableName), Lists.newArrayList(Row.of(4, "aaa", dt), Row.of(5, "bbb", dt))); - sql("INSERT INTO %s VALUES " + - "(6, 'aaa', DATE '2022-03-01')," + - "(7, 'bbb', DATE '2022-03-01')", + sql( + "INSERT INTO %s VALUES " + + "(6, 'aaa', DATE '2022-03-01')," + + "(7, 'bbb', DATE '2022-03-01')", tableName); TestHelpers.assertRows( diff --git a/flink/v1.14/flink/src/test/java/org/apache/iceberg/flink/TestHelpers.java b/flink/v1.14/flink/src/test/java/org/apache/iceberg/flink/TestHelpers.java index 51c981ee531b..e840ba842bef 100644 --- a/flink/v1.14/flink/src/test/java/org/apache/iceberg/flink/TestHelpers.java +++ b/flink/v1.14/flink/src/test/java/org/apache/iceberg/flink/TestHelpers.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.flink; import java.io.IOException; @@ -67,8 +66,7 @@ import org.junit.Assert; public class TestHelpers { - private TestHelpers() { - } + private TestHelpers() {} public static T roundTripKryoSerialize(Class clazz, T table) throws IOException { KryoSerializer kryo = new KryoSerializer<>(clazz, new ExecutionConfig()); @@ -81,13 +79,15 @@ public static T roundTripKryoSerialize(Class clazz, T table) throws IOExc } public static RowData copyRowData(RowData from, RowType rowType) { - TypeSerializer[] fieldSerializers = rowType.getChildren().stream() - .map((LogicalType type) -> InternalSerializers.create(type)) - .toArray(TypeSerializer[]::new); + TypeSerializer[] fieldSerializers = + rowType.getChildren().stream() + .map((LogicalType type) -> InternalSerializers.create(type)) + .toArray(TypeSerializer[]::new); return RowDataUtil.clone(from, null, rowType, fieldSerializers); } - public static void readRowData(FlinkInputFormat input, Consumer visitor) throws IOException { + public static void readRowData(FlinkInputFormat input, Consumer visitor) + throws IOException { for (FlinkInputSplit s : input.createInputSplits(0)) { input.open(s); try { @@ -101,19 +101,21 @@ public static void readRowData(FlinkInputFormat input, Consumer visitor } } - public static List readRowData(FlinkInputFormat inputFormat, RowType rowType) throws IOException { + public static List readRowData(FlinkInputFormat inputFormat, RowType rowType) + throws IOException { List results = Lists.newArrayList(); readRowData(inputFormat, row -> results.add(copyRowData(row, rowType))); return results; } - public static List readRows(FlinkInputFormat inputFormat, RowType rowType) throws IOException { + public static List readRows(FlinkInputFormat inputFormat, RowType rowType) + throws IOException { return convertRowDataToRow(readRowData(inputFormat, rowType), rowType); } public static List convertRowDataToRow(List rowDataList, RowType rowType) { - DataStructureConverter converter = DataStructureConverters.getConverter( - TypeConversions.fromLogicalToDataType(rowType)); + DataStructureConverter converter = + DataStructureConverters.getConverter(TypeConversions.fromLogicalToDataType(rowType)); return rowDataList.stream() .map(converter::toExternal) .map(Row.class::cast) @@ -123,9 +125,12 @@ public static List convertRowDataToRow(List rowDataList, RowType r public static void assertRecords(List results, List expectedRecords, Schema schema) { List expected = Lists.newArrayList(); @SuppressWarnings("unchecked") - DataStructureConverter converter = (DataStructureConverter) DataStructureConverters.getConverter( - TypeConversions.fromLogicalToDataType(FlinkSchemaUtil.convert(schema))); - expectedRecords.forEach(r -> expected.add(converter.toExternal(RowDataConverter.convert(schema, r)))); + DataStructureConverter converter = + (DataStructureConverter) + DataStructureConverters.getConverter( + TypeConversions.fromLogicalToDataType(FlinkSchemaUtil.convert(schema))); + expectedRecords.forEach( + r -> expected.add(converter.toExternal(RowDataConverter.convert(schema, r)))); assertRows(results, expected); } @@ -141,13 +146,17 @@ public static void assertRowData(Schema schema, StructLike expected, RowData act assertRowData(schema.asStruct(), FlinkSchemaUtil.convert(schema), expected, actual); } - public static void assertRowData(Types.StructType structType, LogicalType rowType, StructLike expectedRecord, - RowData actualRowData) { + public static void assertRowData( + Types.StructType structType, + LogicalType rowType, 
+ StructLike expectedRecord, + RowData actualRowData) { if (expectedRecord == null && actualRowData == null) { return; } - Assert.assertTrue("expected Record and actual RowData should be both null or not null", + Assert.assertTrue( + "expected Record and actual RowData should be both null or not null", expectedRecord != null && actualRowData != null); List types = Lists.newArrayList(); @@ -158,24 +167,30 @@ public static void assertRowData(Types.StructType structType, LogicalType rowTyp for (int i = 0; i < types.size(); i += 1) { LogicalType logicalType = ((RowType) rowType).getTypeAt(i); Object expected = expectedRecord.get(i, Object.class); - // The RowData.createFieldGetter won't return null for the required field. But in the projection case, if we are - // projecting a nested required field from an optional struct, then we should give a null for the projected field - // if the outer struct value is null. So we need to check the nullable for actualRowData here. For more details + // The RowData.createFieldGetter won't return null for the required field. But in the + // projection case, if we are + // projecting a nested required field from an optional struct, then we should give a null for + // the projected field + // if the outer struct value is null. So we need to check the nullable for actualRowData here. + // For more details // please see issue #2738. - Object actual = actualRowData.isNullAt(i) ? null : - RowData.createFieldGetter(logicalType, i).getFieldOrNull(actualRowData); + Object actual = + actualRowData.isNullAt(i) + ? null + : RowData.createFieldGetter(logicalType, i).getFieldOrNull(actualRowData); assertEquals(types.get(i), logicalType, expected, actual); } } - private static void assertEquals(Type type, LogicalType logicalType, Object expected, Object actual) { + private static void assertEquals( + Type type, LogicalType logicalType, Object expected, Object actual) { if (expected == null && actual == null) { return; } - Assert.assertTrue("expected and actual should be both null or not null", - expected != null && actual != null); + Assert.assertTrue( + "expected and actual should be both null or not null", expected != null && actual != null); switch (type.typeId()) { case BOOLEAN: @@ -194,7 +209,9 @@ private static void assertEquals(Type type, LogicalType logicalType, Object expe Assert.assertEquals("double value should be equal", expected, actual); break; case STRING: - Assertions.assertThat(expected).as("Should expect a CharSequence").isInstanceOf(CharSequence.class); + Assertions.assertThat(expected) + .as("Should expect a CharSequence") + .isInstanceOf(CharSequence.class); Assert.assertEquals("string should be equal", String.valueOf(expected), actual.toString()); break; case DATE: @@ -203,40 +220,56 @@ private static void assertEquals(Type type, LogicalType logicalType, Object expe Assert.assertEquals("date should be equal", expected, date); break; case TIME: - Assertions.assertThat(expected).as("Should expect a LocalTime").isInstanceOf(LocalTime.class); + Assertions.assertThat(expected) + .as("Should expect a LocalTime") + .isInstanceOf(LocalTime.class); int milliseconds = (int) (((LocalTime) expected).toNanoOfDay() / 1000_000); Assert.assertEquals("time millis should be equal", milliseconds, actual); break; case TIMESTAMP: if (((Types.TimestampType) type).shouldAdjustToUTC()) { - Assertions.assertThat(expected).as("Should expect a OffsetDataTime").isInstanceOf(OffsetDateTime.class); + Assertions.assertThat(expected) + .as("Should expect a OffsetDataTime") + 
.isInstanceOf(OffsetDateTime.class); OffsetDateTime ts = (OffsetDateTime) expected; - Assert.assertEquals("OffsetDataTime should be equal", ts.toLocalDateTime(), + Assert.assertEquals( + "OffsetDataTime should be equal", + ts.toLocalDateTime(), ((TimestampData) actual).toLocalDateTime()); } else { - Assertions.assertThat(expected).as("Should expect a LocalDataTime").isInstanceOf(LocalDateTime.class); + Assertions.assertThat(expected) + .as("Should expect a LocalDataTime") + .isInstanceOf(LocalDateTime.class); LocalDateTime ts = (LocalDateTime) expected; - Assert.assertEquals("LocalDataTime should be equal", ts, - ((TimestampData) actual).toLocalDateTime()); + Assert.assertEquals( + "LocalDataTime should be equal", ts, ((TimestampData) actual).toLocalDateTime()); } break; case BINARY: - Assertions.assertThat(expected).as("Should expect a ByteBuffer").isInstanceOf(ByteBuffer.class); + Assertions.assertThat(expected) + .as("Should expect a ByteBuffer") + .isInstanceOf(ByteBuffer.class); Assert.assertEquals("binary should be equal", expected, ByteBuffer.wrap((byte[]) actual)); break; case DECIMAL: - Assertions.assertThat(expected).as("Should expect a BigDecimal").isInstanceOf(BigDecimal.class); + Assertions.assertThat(expected) + .as("Should expect a BigDecimal") + .isInstanceOf(BigDecimal.class); BigDecimal bd = (BigDecimal) expected; - Assert.assertEquals("decimal value should be equal", bd, - ((DecimalData) actual).toBigDecimal()); + Assert.assertEquals( + "decimal value should be equal", bd, ((DecimalData) actual).toBigDecimal()); break; case LIST: - Assertions.assertThat(expected).as("Should expect a Collection").isInstanceOf(Collection.class); + Assertions.assertThat(expected) + .as("Should expect a Collection") + .isInstanceOf(Collection.class); Collection expectedArrayData = (Collection) expected; ArrayData actualArrayData = (ArrayData) actual; LogicalType elementType = ((ArrayType) logicalType).getElementType(); - Assert.assertEquals("array length should be equal", expectedArrayData.size(), actualArrayData.size()); - assertArrayValues(type.asListType().elementType(), elementType, expectedArrayData, actualArrayData); + Assert.assertEquals( + "array length should be equal", expectedArrayData.size(), actualArrayData.size()); + assertArrayValues( + type.asListType().elementType(), elementType, expectedArrayData, actualArrayData); break; case MAP: Assertions.assertThat(expected).as("Should expect a Map").isInstanceOf(Map.class); @@ -248,7 +281,9 @@ private static void assertEquals(Type type, LogicalType logicalType, Object expe break; case UUID: Assertions.assertThat(expected).as("Should expect a UUID").isInstanceOf(UUID.class); - Assert.assertEquals("UUID should be equal", expected.toString(), + Assert.assertEquals( + "UUID should be equal", + expected.toString(), UUID.nameUUIDFromBytes((byte[]) actual).toString()); break; case FIXED: @@ -260,8 +295,8 @@ private static void assertEquals(Type type, LogicalType logicalType, Object expe } } - private static void assertArrayValues(Type type, LogicalType logicalType, Collection expectedArray, - ArrayData actualArray) { + private static void assertArrayValues( + Type type, LogicalType logicalType, Collection expectedArray, ArrayData actualArray) { List expectedElements = Lists.newArrayList(expectedArray); for (int i = 0; i < expectedArray.size(); i += 1) { if (expectedElements.get(i) == null) { @@ -271,12 +306,16 @@ private static void assertArrayValues(Type type, LogicalType logicalType, Collec Object expected = expectedElements.get(i); - 
assertEquals(type, logicalType, expected, + assertEquals( + type, + logicalType, + expected, ArrayData.createElementGetter(logicalType).getElementOrNull(actualArray, i)); } } - private static void assertMapValues(Types.MapType mapType, LogicalType type, Map expected, MapData actual) { + private static void assertMapValues( + Types.MapType mapType, LogicalType type, Map expected, MapData actual) { Assert.assertEquals("map size should be equal", expected.size(), actual.size()); ArrayData actualKeyArrayData = actual.keyArray(); @@ -305,7 +344,10 @@ private static void assertMapValues(Types.MapType mapType, LogicalType type, Map } Assert.assertNotNull("Should have a matching key", matchedActualKey); final int valueIndex = matchedKeyIndex; - assertEquals(valueType, actualValueType, entry.getValue(), + assertEquals( + valueType, + actualValueType, + entry.getValue(), valueGetter.getElementOrNull(actualValueArrayData, valueIndex)); } } @@ -319,31 +361,55 @@ public static void assertEquals(ManifestFile expected, ManifestFile actual) { Assert.assertEquals("Length must match", expected.length(), actual.length()); Assert.assertEquals("Spec id must match", expected.partitionSpecId(), actual.partitionSpecId()); Assert.assertEquals("ManifestContent must match", expected.content(), actual.content()); - Assert.assertEquals("SequenceNumber must match", expected.sequenceNumber(), actual.sequenceNumber()); - Assert.assertEquals("MinSequenceNumber must match", expected.minSequenceNumber(), actual.minSequenceNumber()); + Assert.assertEquals( + "SequenceNumber must match", expected.sequenceNumber(), actual.sequenceNumber()); + Assert.assertEquals( + "MinSequenceNumber must match", expected.minSequenceNumber(), actual.minSequenceNumber()); Assert.assertEquals("Snapshot id must match", expected.snapshotId(), actual.snapshotId()); - Assert.assertEquals("Added files flag must match", expected.hasAddedFiles(), actual.hasAddedFiles()); - Assert.assertEquals("Added files count must match", expected.addedFilesCount(), actual.addedFilesCount()); - Assert.assertEquals("Added rows count must match", expected.addedRowsCount(), actual.addedRowsCount()); - Assert.assertEquals("Existing files flag must match", expected.hasExistingFiles(), actual.hasExistingFiles()); - Assert.assertEquals("Existing files count must match", expected.existingFilesCount(), actual.existingFilesCount()); - Assert.assertEquals("Existing rows count must match", expected.existingRowsCount(), actual.existingRowsCount()); - Assert.assertEquals("Deleted files flag must match", expected.hasDeletedFiles(), actual.hasDeletedFiles()); - Assert.assertEquals("Deleted files count must match", expected.deletedFilesCount(), actual.deletedFilesCount()); - Assert.assertEquals("Deleted rows count must match", expected.deletedRowsCount(), actual.deletedRowsCount()); + Assert.assertEquals( + "Added files flag must match", expected.hasAddedFiles(), actual.hasAddedFiles()); + Assert.assertEquals( + "Added files count must match", expected.addedFilesCount(), actual.addedFilesCount()); + Assert.assertEquals( + "Added rows count must match", expected.addedRowsCount(), actual.addedRowsCount()); + Assert.assertEquals( + "Existing files flag must match", expected.hasExistingFiles(), actual.hasExistingFiles()); + Assert.assertEquals( + "Existing files count must match", + expected.existingFilesCount(), + actual.existingFilesCount()); + Assert.assertEquals( + "Existing rows count must match", expected.existingRowsCount(), actual.existingRowsCount()); + Assert.assertEquals( + 
"Deleted files flag must match", expected.hasDeletedFiles(), actual.hasDeletedFiles()); + Assert.assertEquals( + "Deleted files count must match", expected.deletedFilesCount(), actual.deletedFilesCount()); + Assert.assertEquals( + "Deleted rows count must match", expected.deletedRowsCount(), actual.deletedRowsCount()); List expectedSummaries = expected.partitions(); List actualSummaries = actual.partitions(); - Assert.assertEquals("PartitionFieldSummary size does not match", expectedSummaries.size(), actualSummaries.size()); + Assert.assertEquals( + "PartitionFieldSummary size does not match", + expectedSummaries.size(), + actualSummaries.size()); for (int i = 0; i < expectedSummaries.size(); i++) { - Assert.assertEquals("Null flag in partition must match", - expectedSummaries.get(i).containsNull(), actualSummaries.get(i).containsNull()); - Assert.assertEquals("NaN flag in partition must match", - expectedSummaries.get(i).containsNaN(), actualSummaries.get(i).containsNaN()); - Assert.assertEquals("Lower bounds in partition must match", - expectedSummaries.get(i).lowerBound(), actualSummaries.get(i).lowerBound()); - Assert.assertEquals("Upper bounds in partition must match", - expectedSummaries.get(i).upperBound(), actualSummaries.get(i).upperBound()); + Assert.assertEquals( + "Null flag in partition must match", + expectedSummaries.get(i).containsNull(), + actualSummaries.get(i).containsNull()); + Assert.assertEquals( + "NaN flag in partition must match", + expectedSummaries.get(i).containsNaN(), + actualSummaries.get(i).containsNaN()); + Assert.assertEquals( + "Lower bounds in partition must match", + expectedSummaries.get(i).lowerBound(), + actualSummaries.get(i).lowerBound()); + Assert.assertEquals( + "Upper bounds in partition must match", + expectedSummaries.get(i).upperBound(), + actualSummaries.get(i).upperBound()); } } @@ -358,7 +424,8 @@ public static void assertEquals(ContentFile expected, ContentFile actual) Assert.assertEquals("Format", expected.format(), actual.format()); Assert.assertEquals("Partition size", expected.partition().size(), actual.partition().size()); for (int i = 0; i < expected.partition().size(); i++) { - Assert.assertEquals("Partition data at index " + i, + Assert.assertEquals( + "Partition data at index " + i, expected.partition().get(i, Object.class), actual.partition().get(i, Object.class)); } @@ -371,6 +438,7 @@ public static void assertEquals(ContentFile expected, ContentFile actual) Assert.assertEquals("Upper bounds", expected.upperBounds(), actual.upperBounds()); Assert.assertEquals("Key metadata", expected.keyMetadata(), actual.keyMetadata()); Assert.assertEquals("Split offsets", expected.splitOffsets(), actual.splitOffsets()); - Assert.assertEquals("Equality field id list", actual.equalityFieldIds(), expected.equalityFieldIds()); + Assert.assertEquals( + "Equality field id list", actual.equalityFieldIds(), expected.equalityFieldIds()); } } diff --git a/flink/v1.14/flink/src/test/java/org/apache/iceberg/flink/TestIcebergConnector.java b/flink/v1.14/flink/src/test/java/org/apache/iceberg/flink/TestIcebergConnector.java index 272f7a716530..c3b106997289 100644 --- a/flink/v1.14/flink/src/test/java/org/apache/iceberg/flink/TestIcebergConnector.java +++ b/flink/v1.14/flink/src/test/java/org/apache/iceberg/flink/TestIcebergConnector.java @@ -16,8 +16,6 @@ * specific language governing permissions and limitations * under the License. 
*/ - - package org.apache.iceberg.flink; import java.io.IOException; @@ -54,8 +52,7 @@ public class TestIcebergConnector extends FlinkTestBase { private static final String TABLE_NAME = "test_table"; - @ClassRule - public static final TemporaryFolder WAREHOUSE = new TemporaryFolder(); + @ClassRule public static final TemporaryFolder WAREHOUSE = new TemporaryFolder(); private final String catalogName; private final Map properties; @@ -67,118 +64,106 @@ public static Iterable parameters() { return Lists.newArrayList( // Create iceberg table in the hadoop catalog and default database. new Object[] { - "testhadoop", - ImmutableMap.of( - "connector", "iceberg", - "catalog-type", "hadoop" - ), - true + "testhadoop", + ImmutableMap.of( + "connector", "iceberg", + "catalog-type", "hadoop"), + true }, new Object[] { - "testhadoop", - ImmutableMap.of( - "connector", "iceberg", - "catalog-type", "hadoop", - "catalog-table", "not_existing_table" - ), - true + "testhadoop", + ImmutableMap.of( + "connector", "iceberg", + "catalog-type", "hadoop", + "catalog-table", "not_existing_table"), + true }, new Object[] { - "testhadoop", - ImmutableMap.of( - "connector", "iceberg", - "catalog-type", "hadoop" - ), - false + "testhadoop", + ImmutableMap.of( + "connector", "iceberg", + "catalog-type", "hadoop"), + false }, // Create iceberg table in the hadoop catalog and not_existing_db. new Object[] { - "testhadoop", - ImmutableMap.of( - "connector", "iceberg", - "catalog-type", "hadoop", - "catalog-database", "not_existing_db" - ), - true + "testhadoop", + ImmutableMap.of( + "connector", "iceberg", + "catalog-type", "hadoop", + "catalog-database", "not_existing_db"), + true }, new Object[] { - "testhadoop", - ImmutableMap.of( - "connector", "iceberg", - "catalog-type", "hadoop", - "catalog-database", "not_existing_db", - "catalog-table", "not_existing_table" - ), - true + "testhadoop", + ImmutableMap.of( + "connector", "iceberg", + "catalog-type", "hadoop", + "catalog-database", "not_existing_db", + "catalog-table", "not_existing_table"), + true }, new Object[] { - "testhadoop", - ImmutableMap.of( - "connector", "iceberg", - "catalog-type", "hadoop", - "catalog-database", "not_existing_db" - ), - false + "testhadoop", + ImmutableMap.of( + "connector", "iceberg", + "catalog-type", "hadoop", + "catalog-database", "not_existing_db"), + false }, // Create iceberg table in the hive catalog and default database. new Object[] { - "testhive", - ImmutableMap.of( - "connector", "iceberg", - "catalog-type", "hive" - ), - true + "testhive", + ImmutableMap.of( + "connector", "iceberg", + "catalog-type", "hive"), + true }, new Object[] { - "testhive", - ImmutableMap.of( - "connector", "iceberg", - "catalog-type", "hive", - "catalog-table", "not_existing_table" - ), - true + "testhive", + ImmutableMap.of( + "connector", "iceberg", + "catalog-type", "hive", + "catalog-table", "not_existing_table"), + true }, new Object[] { - "testhive", - ImmutableMap.of( - "connector", "iceberg", - "catalog-type", "hive" - ), - false + "testhive", + ImmutableMap.of( + "connector", "iceberg", + "catalog-type", "hive"), + false }, // Create iceberg table in the hive catalog and not_existing_db. 
new Object[] { - "testhive", - ImmutableMap.of( - "connector", "iceberg", - "catalog-type", "hive", - "catalog-database", "not_existing_db" - ), - true + "testhive", + ImmutableMap.of( + "connector", "iceberg", + "catalog-type", "hive", + "catalog-database", "not_existing_db"), + true }, new Object[] { - "testhive", - ImmutableMap.of( - "connector", "iceberg", - "catalog-type", "hive", - "catalog-database", "not_existing_db", - "catalog-table", "not_existing_table" - ), - true + "testhive", + ImmutableMap.of( + "connector", "iceberg", + "catalog-type", "hive", + "catalog-database", "not_existing_db", + "catalog-table", "not_existing_table"), + true }, new Object[] { - "testhive", - ImmutableMap.of( - "connector", "iceberg", - "catalog-type", "hive", - "catalog-database", "not_existing_db" - ), - false - } - ); + "testhive", + ImmutableMap.of( + "connector", "iceberg", + "catalog-type", "hive", + "catalog-database", "not_existing_db"), + false + }); } - public TestIcebergConnector(String catalogName, Map properties, boolean isStreaming) { + public TestIcebergConnector( + String catalogName, Map properties, boolean isStreaming) { this.catalogName = catalogName; this.properties = properties; this.isStreaming = isStreaming; @@ -192,8 +177,9 @@ protected TableEnvironment getTableEnv() { EnvironmentSettings.Builder settingsBuilder = EnvironmentSettings.newInstance(); if (isStreaming) { settingsBuilder.inStreamingMode(); - StreamExecutionEnvironment env = StreamExecutionEnvironment - .getExecutionEnvironment(MiniClusterResource.DISABLE_CLASSLOADER_CHECK_CONFIG); + StreamExecutionEnvironment env = + StreamExecutionEnvironment.getExecutionEnvironment( + MiniClusterResource.DISABLE_CLASSLOADER_CHECK_CONFIG); env.enableCheckpointing(400); env.setMaxParallelism(2); env.setParallelism(2); @@ -203,7 +189,8 @@ protected TableEnvironment getTableEnv() { tEnv = TableEnvironment.create(settingsBuilder.build()); } // Set only one parallelism. - tEnv.getConfig().getConfiguration() + tEnv.getConfig() + .getConfiguration() .set(CoreOptions.DEFAULT_PARALLELISM, 1) .set(FlinkConfigOptions.TABLE_EXEC_ICEBERG_INFER_SOURCE_PARALLELISM, false); } @@ -240,21 +227,24 @@ private void testCreateConnectorTable() { // Create table under the flink's current database. sql("CREATE TABLE %s (id BIGINT, data STRING) WITH %s", TABLE_NAME, toWithClause(tableProps)); sql("INSERT INTO %s VALUES (1, 'AAA'), (2, 'BBB'), (3, 'CCC')", TABLE_NAME); - Assert.assertEquals("Should have expected rows", + Assert.assertEquals( + "Should have expected rows", Sets.newHashSet(Row.of(1L, "AAA"), Row.of(2L, "BBB"), Row.of(3L, "CCC")), Sets.newHashSet(sql("SELECT * FROM %s", TABLE_NAME))); FlinkCatalogFactory factory = new FlinkCatalogFactory(); Catalog flinkCatalog = factory.createCatalog(catalogName, tableProps, new Configuration()); - Assert.assertTrue("Should have created the expected database", - flinkCatalog.databaseExists(databaseName())); - Assert.assertTrue("Should have created the expected table", + Assert.assertTrue( + "Should have created the expected database", flinkCatalog.databaseExists(databaseName())); + Assert.assertTrue( + "Should have created the expected table", flinkCatalog.tableExists(new ObjectPath(databaseName(), tableName()))); // Drop and create it again. 
sql("DROP TABLE %s", TABLE_NAME); sql("CREATE TABLE %s (id BIGINT, data STRING) WITH %s", TABLE_NAME, toWithClause(tableProps)); - Assert.assertEquals("Should have expected rows", + Assert.assertEquals( + "Should have expected rows", Sets.newHashSet(Row.of(1L, "AAA"), Row.of(2L, "BBB"), Row.of(3L, "CCC")), Sets.newHashSet(sql("SELECT * FROM %s", TABLE_NAME))); } @@ -272,7 +262,8 @@ public void testCatalogDatabaseConflictWithFlinkDatabase() { try { testCreateConnectorTable(); // Ensure that the table was created under the specific database. - AssertHelpers.assertThrows("Table should already exists", + AssertHelpers.assertThrows( + "Table should already exists", ValidationException.class, "Could not execute CreateTable in path", () -> sql("CREATE TABLE `default_catalog`.`%s`.`%s`", databaseName(), TABLE_NAME)); @@ -303,15 +294,14 @@ public void testConnectorTableInIcebergCatalog() { // Create a connector table in an iceberg catalog. sql("CREATE CATALOG `test_catalog` WITH %s", toWithClause(catalogProps)); try { - AssertHelpers.assertThrowsCause("Cannot create the iceberg connector table in iceberg catalog", + AssertHelpers.assertThrowsCause( + "Cannot create the iceberg connector table in iceberg catalog", IllegalArgumentException.class, "Cannot create the table with 'connector'='iceberg' table property in an iceberg catalog", - () -> sql("CREATE TABLE `test_catalog`.`%s`.`%s` (id BIGINT, data STRING) WITH %s", - FlinkCatalogFactory.DEFAULT_DATABASE_NAME, - TABLE_NAME, - toWithClause(tableProps) - ) - ); + () -> + sql( + "CREATE TABLE `test_catalog`.`%s`.`%s` (id BIGINT, data STRING) WITH %s", + FlinkCatalogFactory.DEFAULT_DATABASE_NAME, TABLE_NAME, toWithClause(tableProps))); } finally { sql("DROP CATALOG IF EXISTS `test_catalog`"); } diff --git a/flink/v1.14/flink/src/test/java/org/apache/iceberg/flink/TestManifestFileSerialization.java b/flink/v1.14/flink/src/test/java/org/apache/iceberg/flink/TestManifestFileSerialization.java index e90a9a469e47..6bd94e9ca61c 100644 --- a/flink/v1.14/flink/src/test/java/org/apache/iceberg/flink/TestManifestFileSerialization.java +++ b/flink/v1.14/flink/src/test/java/org/apache/iceberg/flink/TestManifestFileSerialization.java @@ -16,9 +16,11 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.flink; +import static org.apache.iceberg.types.Types.NestedField.optional; +import static org.apache.iceberg.types.Types.NestedField.required; + import java.io.ByteArrayInputStream; import java.io.ByteArrayOutputStream; import java.io.File; @@ -52,61 +54,62 @@ import org.junit.Test; import org.junit.rules.TemporaryFolder; -import static org.apache.iceberg.types.Types.NestedField.optional; -import static org.apache.iceberg.types.Types.NestedField.required; - public class TestManifestFileSerialization { - private static final Schema SCHEMA = new Schema( - required(1, "id", Types.LongType.get()), - optional(2, "data", Types.StringType.get()), - required(3, "date", Types.StringType.get()), - required(4, "double", Types.DoubleType.get())); - - private static final PartitionSpec SPEC = PartitionSpec - .builderFor(SCHEMA) - .identity("double") - .build(); - - private static final DataFile FILE_A = DataFiles.builder(SPEC) - .withPath("/path/to/data-1.parquet") - .withFileSizeInBytes(0) - .withPartition(org.apache.iceberg.TestHelpers.Row.of(1D)) - .withPartitionPath("double=1") - .withMetrics(new Metrics(5L, - null, // no column sizes - ImmutableMap.of(1, 5L, 2, 3L), // value count - ImmutableMap.of(1, 0L, 2, 2L), // null count - ImmutableMap.of(), // nan count - ImmutableMap.of(1, longToBuffer(0L)), // lower bounds - ImmutableMap.of(1, longToBuffer(4L)) // upper bounds - )) - .build(); - - private static final DataFile FILE_B = DataFiles.builder(SPEC) - .withPath("/path/to/data-2.parquet") - .withFileSizeInBytes(0) - .withPartition(org.apache.iceberg.TestHelpers.Row.of(Double.NaN)) - .withPartitionPath("double=NaN") - .withMetrics(new Metrics(1L, - null, // no column sizes - ImmutableMap.of(1, 1L, 4, 1L), // value count - ImmutableMap.of(1, 0L, 2, 0L), // null count - ImmutableMap.of(4, 1L), // nan count - ImmutableMap.of(1, longToBuffer(0L)), // lower bounds - ImmutableMap.of(1, longToBuffer(1L)) // upper bounds - )) - .build(); + private static final Schema SCHEMA = + new Schema( + required(1, "id", Types.LongType.get()), + optional(2, "data", Types.StringType.get()), + required(3, "date", Types.StringType.get()), + required(4, "double", Types.DoubleType.get())); + + private static final PartitionSpec SPEC = + PartitionSpec.builderFor(SCHEMA).identity("double").build(); + + private static final DataFile FILE_A = + DataFiles.builder(SPEC) + .withPath("/path/to/data-1.parquet") + .withFileSizeInBytes(0) + .withPartition(org.apache.iceberg.TestHelpers.Row.of(1D)) + .withPartitionPath("double=1") + .withMetrics( + new Metrics( + 5L, + null, // no column sizes + ImmutableMap.of(1, 5L, 2, 3L), // value count + ImmutableMap.of(1, 0L, 2, 2L), // null count + ImmutableMap.of(), // nan count + ImmutableMap.of(1, longToBuffer(0L)), // lower bounds + ImmutableMap.of(1, longToBuffer(4L)) // upper bounds + )) + .build(); + + private static final DataFile FILE_B = + DataFiles.builder(SPEC) + .withPath("/path/to/data-2.parquet") + .withFileSizeInBytes(0) + .withPartition(org.apache.iceberg.TestHelpers.Row.of(Double.NaN)) + .withPartitionPath("double=NaN") + .withMetrics( + new Metrics( + 1L, + null, // no column sizes + ImmutableMap.of(1, 1L, 4, 1L), // value count + ImmutableMap.of(1, 0L, 2, 0L), // null count + ImmutableMap.of(4, 1L), // nan count + ImmutableMap.of(1, longToBuffer(0L)), // lower bounds + ImmutableMap.of(1, longToBuffer(1L)) // upper bounds + )) + .build(); private static final FileIO FILE_IO = new HadoopFileIO(new Configuration()); - @Rule - public 
TemporaryFolder temp = new TemporaryFolder(); - + @Rule public TemporaryFolder temp = new TemporaryFolder(); @Test public void testKryoSerialization() throws IOException { - KryoSerializer kryo = new KryoSerializer<>(ManifestFile.class, new ExecutionConfig()); + KryoSerializer kryo = + new KryoSerializer<>(ManifestFile.class, new ExecutionConfig()); DataOutputSerializer outputView = new DataOutputSerializer(1024); @@ -138,7 +141,8 @@ public void testJavaSerialization() throws Exception { out.writeObject(GenericManifestFile.copyOf(manifest).build()); } - try (ObjectInputStream in = new ObjectInputStream(new ByteArrayInputStream(bytes.toByteArray()))) { + try (ObjectInputStream in = + new ObjectInputStream(new ByteArrayInputStream(bytes.toByteArray()))) { for (int i = 0; i < 3; i += 1) { Object obj = in.readObject(); Assertions.assertThat(obj).as("Should be a ManifestFile").isInstanceOf(ManifestFile.class); diff --git a/flink/v1.14/flink/src/test/java/org/apache/iceberg/flink/TestRowDataWrapper.java b/flink/v1.14/flink/src/test/java/org/apache/iceberg/flink/TestRowDataWrapper.java index 9012fc564bd1..ae4ab2844bc8 100644 --- a/flink/v1.14/flink/src/test/java/org/apache/iceberg/flink/TestRowDataWrapper.java +++ b/flink/v1.14/flink/src/test/java/org/apache/iceberg/flink/TestRowDataWrapper.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.flink; import java.util.Iterator; @@ -34,28 +33,32 @@ public class TestRowDataWrapper extends RecordWrapperTest { /** - * Flink's time type has been truncated to millis seconds, so we need a customized assert method to check the - * values. + * Flink's time type has been truncated to millis seconds, so we need a customized assert method + * to check the values. 
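// Illustrative sketch, not part of this diff: the Kryo round trip exercised by
// testKryoSerialization above, written out on its own. `manifest` stands in for any
// ManifestFile instance built by the test.
import java.io.IOException;
import org.apache.flink.api.common.ExecutionConfig;
import org.apache.flink.api.java.typeutils.runtime.kryo.KryoSerializer;
import org.apache.flink.core.memory.DataInputDeserializer;
import org.apache.flink.core.memory.DataOutputSerializer;
import org.apache.iceberg.ManifestFile;

class ManifestKryoRoundTripSketch {
  static ManifestFile roundTrip(ManifestFile manifest) throws IOException {
    KryoSerializer<ManifestFile> kryo =
        new KryoSerializer<>(ManifestFile.class, new ExecutionConfig());

    // Serialize into Flink's in-memory output view ...
    DataOutputSerializer out = new DataOutputSerializer(1024);
    kryo.serialize(manifest, out);

    // ... and read it back from the produced bytes.
    DataInputDeserializer in = new DataInputDeserializer(out.getCopyOfBuffer());
    return kryo.deserialize(in);
  }
}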
*/ @Override public void testTime() { - generateAndValidate(new Schema(TIME.fields()), (message, expectedWrapper, actualWrapper) -> { - for (int pos = 0; pos < TIME.fields().size(); pos++) { - Object expected = expectedWrapper.get().get(pos, Object.class); - Object actual = actualWrapper.get().get(pos, Object.class); - if (expected == actual) { - return; - } + generateAndValidate( + new Schema(TIME.fields()), + (message, expectedWrapper, actualWrapper) -> { + for (int pos = 0; pos < TIME.fields().size(); pos++) { + Object expected = expectedWrapper.get().get(pos, Object.class); + Object actual = actualWrapper.get().get(pos, Object.class); + if (expected == actual) { + return; + } - if (expected == null || actual == null) { - Assert.fail(String.format("The expected value is %s but actual value is %s", expected, actual)); - } + if (expected == null || actual == null) { + Assert.fail( + String.format( + "The expected value is %s but actual value is %s", expected, actual)); + } - int expectedMilliseconds = (int) ((long) expected / 1000_000); - int actualMilliseconds = (int) ((long) actual / 1000_000); - Assert.assertEquals(message, expectedMilliseconds, actualMilliseconds); - } - }); + int expectedMilliseconds = (int) ((long) expected / 1000_000); + int actualMilliseconds = (int) ((long) actual / 1000_000); + Assert.assertEquals(message, expectedMilliseconds, actualMilliseconds); + } + }); } @Override @@ -65,7 +68,8 @@ protected void generateAndValidate(Schema schema, RecordWrapperTest.AssertMethod Iterable rowDataList = RandomRowData.generate(schema, numRecords, 101L); InternalRecordWrapper recordWrapper = new InternalRecordWrapper(schema.asStruct()); - RowDataWrapper rowDataWrapper = new RowDataWrapper(FlinkSchemaUtil.convert(schema), schema.asStruct()); + RowDataWrapper rowDataWrapper = + new RowDataWrapper(FlinkSchemaUtil.convert(schema), schema.asStruct()); Iterator actual = recordList.iterator(); Iterator expected = rowDataList.iterator(); @@ -79,8 +83,10 @@ protected void generateAndValidate(Schema schema, RecordWrapperTest.AssertMethod StructLike recordStructLike = recordWrapper.wrap(actual.next()); StructLike rowDataStructLike = rowDataWrapper.wrap(expected.next()); - assertMethod.assertEquals("Should have expected StructLike values", - actualWrapper.set(recordStructLike), expectedWrapper.set(rowDataStructLike)); + assertMethod.assertEquals( + "Should have expected StructLike values", + actualWrapper.set(recordStructLike), + expectedWrapper.set(rowDataStructLike)); } Assert.assertFalse("Shouldn't have more record", actual.hasNext()); diff --git a/flink/v1.14/flink/src/test/java/org/apache/iceberg/flink/TestTableLoader.java b/flink/v1.14/flink/src/test/java/org/apache/iceberg/flink/TestTableLoader.java index 5f7ae29ec737..61a821a9ac5a 100644 --- a/flink/v1.14/flink/src/test/java/org/apache/iceberg/flink/TestTableLoader.java +++ b/flink/v1.14/flink/src/test/java/org/apache/iceberg/flink/TestTableLoader.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
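// Illustrative sketch, not part of this diff: the wrapping idea behind generateAndValidate
// above. Both wrappers expose Iceberg's StructLike view, so a generic Record and the Flink
// RowData generated from it can be compared position by position (the test itself routes the
// comparison through RecordWrapperTest's assert helpers). `schema`, `record`, and `rowData`
// are assumed inputs; the equals-based check here is a simplification.
import java.util.Objects;
import org.apache.flink.table.data.RowData;
import org.apache.iceberg.Schema;
import org.apache.iceberg.StructLike;
import org.apache.iceberg.data.InternalRecordWrapper;
import org.apache.iceberg.data.Record;
import org.apache.iceberg.flink.FlinkSchemaUtil;
import org.apache.iceberg.flink.RowDataWrapper;

class RowDataWrapperSketch {
  static boolean samePosition(Schema schema, Record record, RowData rowData, int pos) {
    StructLike recordStruct = new InternalRecordWrapper(schema.asStruct()).wrap(record);
    StructLike rowDataStruct =
        new RowDataWrapper(FlinkSchemaUtil.convert(schema), schema.asStruct()).wrap(rowData);
    return Objects.equals(
        recordStruct.get(pos, Object.class), rowDataStruct.get(pos, Object.class));
  }
}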
*/ - package org.apache.iceberg.flink; import java.io.File; @@ -35,9 +34,7 @@ public TestTableLoader(String dir) { } @Override - public void open() { - - } + public void open() {} @Override public Table loadTable() { @@ -45,7 +42,5 @@ public Table loadTable() { } @Override - public void close() { - - } + public void close() {} } diff --git a/flink/v1.14/flink/src/test/java/org/apache/iceberg/flink/TestTableSerialization.java b/flink/v1.14/flink/src/test/java/org/apache/iceberg/flink/TestTableSerialization.java index 10c0d269bc50..27124d93fef4 100644 --- a/flink/v1.14/flink/src/test/java/org/apache/iceberg/flink/TestTableSerialization.java +++ b/flink/v1.14/flink/src/test/java/org/apache/iceberg/flink/TestTableSerialization.java @@ -16,9 +16,12 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.flink; +import static org.apache.iceberg.flink.TestHelpers.roundTripKryoSerialize; +import static org.apache.iceberg.types.Types.NestedField.optional; +import static org.apache.iceberg.types.Types.NestedField.required; + import java.io.File; import java.io.IOException; import java.util.Map; @@ -42,30 +45,22 @@ import org.junit.Test; import org.junit.rules.TemporaryFolder; -import static org.apache.iceberg.flink.TestHelpers.roundTripKryoSerialize; -import static org.apache.iceberg.types.Types.NestedField.optional; -import static org.apache.iceberg.types.Types.NestedField.required; - public class TestTableSerialization { private static final HadoopTables TABLES = new HadoopTables(); - private static final Schema SCHEMA = new Schema( - required(1, "id", Types.LongType.get()), - optional(2, "data", Types.StringType.get()), - required(3, "date", Types.StringType.get()), - optional(4, "double", Types.DoubleType.get())); + private static final Schema SCHEMA = + new Schema( + required(1, "id", Types.LongType.get()), + optional(2, "data", Types.StringType.get()), + required(3, "date", Types.StringType.get()), + optional(4, "double", Types.DoubleType.get())); - private static final PartitionSpec SPEC = PartitionSpec - .builderFor(SCHEMA) - .identity("date") - .build(); + private static final PartitionSpec SPEC = + PartitionSpec.builderFor(SCHEMA).identity("date").build(); - private static final SortOrder SORT_ORDER = SortOrder.builderFor(SCHEMA) - .asc("id") - .build(); + private static final SortOrder SORT_ORDER = SortOrder.builderFor(SCHEMA).asc("id").build(); - @Rule - public TemporaryFolder temp = new TemporaryFolder(); + @Rule public TemporaryFolder temp = new TemporaryFolder(); private Table table; @Before @@ -82,16 +77,17 @@ public void initTable() throws IOException { public void testSerializableTableKryoSerialization() throws IOException { SerializableTable serializableTable = (SerializableTable) SerializableTable.copyOf(table); TestHelpers.assertSerializedAndLoadedMetadata( - table, - roundTripKryoSerialize(SerializableTable.class, serializableTable)); + table, roundTripKryoSerialize(SerializableTable.class, serializableTable)); } @Test public void testSerializableMetadataTableKryoSerialization() throws IOException { for (MetadataTableType type : MetadataTableType.values()) { TableOperations ops = ((HasTableOperations) table).operations(); - Table metadataTable = MetadataTableUtils.createMetadataTableInstance(ops, table.name(), "meta", type); - SerializableTable serializableMetadataTable = (SerializableTable) SerializableTable.copyOf(metadataTable); + Table metadataTable = + MetadataTableUtils.createMetadataTableInstance(ops, 
table.name(), "meta", type); + SerializableTable serializableMetadataTable = + (SerializableTable) SerializableTable.copyOf(metadataTable); TestHelpers.assertSerializedAndLoadedMetadata( metadataTable, @@ -103,15 +99,12 @@ public void testSerializableMetadataTableKryoSerialization() throws IOException public void testSerializableTransactionTableKryoSerialization() throws IOException { Transaction txn = table.newTransaction(); - txn.updateProperties() - .set("k1", "v1") - .commit(); + txn.updateProperties().set("k1", "v1").commit(); Table txnTable = txn.table(); SerializableTable serializableTxnTable = (SerializableTable) SerializableTable.copyOf(txnTable); TestHelpers.assertSerializedMetadata( - txnTable, - roundTripKryoSerialize(SerializableTable.class, serializableTxnTable)); + txnTable, roundTripKryoSerialize(SerializableTable.class, serializableTxnTable)); } } diff --git a/flink/v1.14/flink/src/test/java/org/apache/iceberg/flink/actions/TestRewriteDataFilesAction.java b/flink/v1.14/flink/src/test/java/org/apache/iceberg/flink/actions/TestRewriteDataFilesAction.java index 03cdcd80dec1..e59d7dacd978 100644 --- a/flink/v1.14/flink/src/test/java/org/apache/iceberg/flink/actions/TestRewriteDataFilesAction.java +++ b/flink/v1.14/flink/src/test/java/org/apache/iceberg/flink/actions/TestRewriteDataFilesAction.java @@ -16,10 +16,10 @@ * specific language governing permissions and limitations * under the License. */ - - package org.apache.iceberg.flink.actions; +import static org.apache.iceberg.flink.SimpleDataUtil.RECORD; + import java.io.File; import java.io.IOException; import java.util.List; @@ -58,8 +58,6 @@ import org.junit.runner.RunWith; import org.junit.runners.Parameterized; -import static org.apache.iceberg.flink.SimpleDataUtil.RECORD; - @RunWith(Parameterized.class) public class TestRewriteDataFilesAction extends FlinkCatalogTestBase { @@ -69,24 +67,23 @@ public class TestRewriteDataFilesAction extends FlinkCatalogTestBase { private Table icebergTableUnPartitioned; private Table icebergTablePartitioned; - public TestRewriteDataFilesAction(String catalogName, Namespace baseNamespace, FileFormat format) { + public TestRewriteDataFilesAction( + String catalogName, Namespace baseNamespace, FileFormat format) { super(catalogName, baseNamespace); this.format = format; } @Override protected TableEnvironment getTableEnv() { - super.getTableEnv() - .getConfig() - .getConfiguration() - .set(CoreOptions.DEFAULT_PARALLELISM, 1); + super.getTableEnv().getConfig().getConfiguration().set(CoreOptions.DEFAULT_PARALLELISM, 1); return super.getTableEnv(); } @Parameterized.Parameters(name = "catalogName={0}, baseNamespace={1}, format={2}") public static Iterable parameters() { List parameters = Lists.newArrayList(); - for (FileFormat format : new FileFormat[] {FileFormat.AVRO, FileFormat.ORC, FileFormat.PARQUET}) { + for (FileFormat format : + new FileFormat[] {FileFormat.AVRO, FileFormat.ORC, FileFormat.PARQUET}) { for (Object[] catalogParams : FlinkCatalogTestBase.parameters()) { String catalogName = (String) catalogParams[0]; Namespace baseNamespace = (Namespace) catalogParams[1]; @@ -96,8 +93,7 @@ public static Iterable parameters() { return parameters; } - @Rule - public TemporaryFolder temp = new TemporaryFolder(); + @Rule public TemporaryFolder temp = new TemporaryFolder(); @Override @Before @@ -106,16 +102,18 @@ public void before() { sql("CREATE DATABASE %s", flinkDatabase); sql("USE CATALOG %s", catalogName); sql("USE %s", DATABASE); - sql("CREATE TABLE %s (id int, data varchar) with 
('write.format.default'='%s')", TABLE_NAME_UNPARTITIONED, - format.name()); - icebergTableUnPartitioned = validationCatalog.loadTable(TableIdentifier.of(icebergNamespace, - TABLE_NAME_UNPARTITIONED)); - - sql("CREATE TABLE %s (id int, data varchar,spec varchar) " + - " PARTITIONED BY (data,spec) with ('write.format.default'='%s')", + sql( + "CREATE TABLE %s (id int, data varchar) with ('write.format.default'='%s')", + TABLE_NAME_UNPARTITIONED, format.name()); + icebergTableUnPartitioned = + validationCatalog.loadTable(TableIdentifier.of(icebergNamespace, TABLE_NAME_UNPARTITIONED)); + + sql( + "CREATE TABLE %s (id int, data varchar,spec varchar) " + + " PARTITIONED BY (data,spec) with ('write.format.default'='%s')", TABLE_NAME_PARTITIONED, format.name()); - icebergTablePartitioned = validationCatalog.loadTable(TableIdentifier.of(icebergNamespace, - TABLE_NAME_PARTITIONED)); + icebergTablePartitioned = + validationCatalog.loadTable(TableIdentifier.of(icebergNamespace, TABLE_NAME_PARTITIONED)); } @Override @@ -130,13 +128,10 @@ public void clean() { @Test public void testRewriteDataFilesEmptyTable() throws Exception { Assert.assertNull("Table must be empty", icebergTableUnPartitioned.currentSnapshot()); - Actions.forTable(icebergTableUnPartitioned) - .rewriteDataFiles() - .execute(); + Actions.forTable(icebergTableUnPartitioned).rewriteDataFiles().execute(); Assert.assertNull("Table must stay empty", icebergTableUnPartitioned.currentSnapshot()); } - @Test public void testRewriteDataFilesUnpartitionedTable() throws Exception { sql("INSERT INTO %s SELECT 1, 'hello'", TABLE_NAME_UNPARTITIONED); @@ -145,13 +140,12 @@ public void testRewriteDataFilesUnpartitionedTable() throws Exception { icebergTableUnPartitioned.refresh(); CloseableIterable tasks = icebergTableUnPartitioned.newScan().planFiles(); - List dataFiles = Lists.newArrayList(CloseableIterable.transform(tasks, FileScanTask::file)); + List dataFiles = + Lists.newArrayList(CloseableIterable.transform(tasks, FileScanTask::file)); Assert.assertEquals("Should have 2 data files before rewrite", 2, dataFiles.size()); RewriteDataFilesActionResult result = - Actions.forTable(icebergTableUnPartitioned) - .rewriteDataFiles() - .execute(); + Actions.forTable(icebergTableUnPartitioned).rewriteDataFiles().execute(); Assert.assertEquals("Action should rewrite 2 data files", 2, result.deletedDataFiles().size()); Assert.assertEquals("Action should add 1 data file", 1, result.addedDataFiles().size()); @@ -159,14 +153,15 @@ public void testRewriteDataFilesUnpartitionedTable() throws Exception { icebergTableUnPartitioned.refresh(); CloseableIterable tasks1 = icebergTableUnPartitioned.newScan().planFiles(); - List dataFiles1 = Lists.newArrayList(CloseableIterable.transform(tasks1, FileScanTask::file)); + List dataFiles1 = + Lists.newArrayList(CloseableIterable.transform(tasks1, FileScanTask::file)); Assert.assertEquals("Should have 1 data files after rewrite", 1, dataFiles1.size()); // Assert the table records as expected. 
- SimpleDataUtil.assertTableRecords(icebergTableUnPartitioned, Lists.newArrayList( - SimpleDataUtil.createRecord(1, "hello"), - SimpleDataUtil.createRecord(2, "world") - )); + SimpleDataUtil.assertTableRecords( + icebergTableUnPartitioned, + Lists.newArrayList( + SimpleDataUtil.createRecord(1, "hello"), SimpleDataUtil.createRecord(2, "world"))); } @Test @@ -179,13 +174,12 @@ public void testRewriteDataFilesPartitionedTable() throws Exception { icebergTablePartitioned.refresh(); CloseableIterable tasks = icebergTablePartitioned.newScan().planFiles(); - List dataFiles = Lists.newArrayList(CloseableIterable.transform(tasks, FileScanTask::file)); + List dataFiles = + Lists.newArrayList(CloseableIterable.transform(tasks, FileScanTask::file)); Assert.assertEquals("Should have 4 data files before rewrite", 4, dataFiles.size()); RewriteDataFilesActionResult result = - Actions.forTable(icebergTablePartitioned) - .rewriteDataFiles() - .execute(); + Actions.forTable(icebergTablePartitioned).rewriteDataFiles().execute(); Assert.assertEquals("Action should rewrite 4 data files", 4, result.deletedDataFiles().size()); Assert.assertEquals("Action should add 2 data file", 2, result.addedDataFiles().size()); @@ -193,26 +187,27 @@ public void testRewriteDataFilesPartitionedTable() throws Exception { icebergTablePartitioned.refresh(); CloseableIterable tasks1 = icebergTablePartitioned.newScan().planFiles(); - List dataFiles1 = Lists.newArrayList(CloseableIterable.transform(tasks1, FileScanTask::file)); + List dataFiles1 = + Lists.newArrayList(CloseableIterable.transform(tasks1, FileScanTask::file)); Assert.assertEquals("Should have 2 data files after rewrite", 2, dataFiles1.size()); // Assert the table records as expected. - Schema schema = new Schema( - Types.NestedField.optional(1, "id", Types.IntegerType.get()), - Types.NestedField.optional(2, "data", Types.StringType.get()), - Types.NestedField.optional(3, "spec", Types.StringType.get()) - ); + Schema schema = + new Schema( + Types.NestedField.optional(1, "id", Types.IntegerType.get()), + Types.NestedField.optional(2, "data", Types.StringType.get()), + Types.NestedField.optional(3, "spec", Types.StringType.get())); Record record = GenericRecord.create(schema); - SimpleDataUtil.assertTableRecords(icebergTablePartitioned, Lists.newArrayList( - record.copy("id", 1, "data", "hello", "spec", "a"), - record.copy("id", 2, "data", "hello", "spec", "a"), - record.copy("id", 3, "data", "world", "spec", "b"), - record.copy("id", 4, "data", "world", "spec", "b") - )); + SimpleDataUtil.assertTableRecords( + icebergTablePartitioned, + Lists.newArrayList( + record.copy("id", 1, "data", "hello", "spec", "a"), + record.copy("id", 2, "data", "hello", "spec", "a"), + record.copy("id", 3, "data", "world", "spec", "b"), + record.copy("id", 4, "data", "world", "spec", "b"))); } - @Test public void testRewriteDataFilesWithFilter() throws Exception { sql("INSERT INTO %s SELECT 1, 'hello' ,'a'", TABLE_NAME_PARTITIONED); @@ -224,7 +219,8 @@ public void testRewriteDataFilesWithFilter() throws Exception { icebergTablePartitioned.refresh(); CloseableIterable tasks = icebergTablePartitioned.newScan().planFiles(); - List dataFiles = Lists.newArrayList(CloseableIterable.transform(tasks, FileScanTask::file)); + List dataFiles = + Lists.newArrayList(CloseableIterable.transform(tasks, FileScanTask::file)); Assert.assertEquals("Should have 5 data files before rewrite", 5, dataFiles.size()); RewriteDataFilesActionResult result = @@ -240,24 +236,26 @@ public void 
testRewriteDataFilesWithFilter() throws Exception { icebergTablePartitioned.refresh(); CloseableIterable tasks1 = icebergTablePartitioned.newScan().planFiles(); - List dataFiles1 = Lists.newArrayList(CloseableIterable.transform(tasks1, FileScanTask::file)); + List dataFiles1 = + Lists.newArrayList(CloseableIterable.transform(tasks1, FileScanTask::file)); Assert.assertEquals("Should have 4 data files after rewrite", 4, dataFiles1.size()); // Assert the table records as expected. - Schema schema = new Schema( - Types.NestedField.optional(1, "id", Types.IntegerType.get()), - Types.NestedField.optional(2, "data", Types.StringType.get()), - Types.NestedField.optional(3, "spec", Types.StringType.get()) - ); + Schema schema = + new Schema( + Types.NestedField.optional(1, "id", Types.IntegerType.get()), + Types.NestedField.optional(2, "data", Types.StringType.get()), + Types.NestedField.optional(3, "spec", Types.StringType.get())); Record record = GenericRecord.create(schema); - SimpleDataUtil.assertTableRecords(icebergTablePartitioned, Lists.newArrayList( - record.copy("id", 1, "data", "hello", "spec", "a"), - record.copy("id", 2, "data", "hello", "spec", "a"), - record.copy("id", 3, "data", "world", "spec", "a"), - record.copy("id", 4, "data", "world", "spec", "b"), - record.copy("id", 5, "data", "world", "spec", "b") - )); + SimpleDataUtil.assertTableRecords( + icebergTablePartitioned, + Lists.newArrayList( + record.copy("id", 1, "data", "hello", "spec", "a"), + record.copy("id", 2, "data", "hello", "spec", "a"), + record.copy("id", 3, "data", "world", "spec", "a"), + record.copy("id", 4, "data", "world", "spec", "b"), + record.copy("id", 5, "data", "world", "spec", "b"))); } @Test @@ -285,22 +283,23 @@ public void testRewriteLargeTableHasResiduals() throws IOException { icebergTableUnPartitioned.refresh(); - CloseableIterable tasks = icebergTableUnPartitioned.newScan() - .ignoreResiduals() - .filter(Expressions.equal("data", "0")) - .planFiles(); + CloseableIterable tasks = + icebergTableUnPartitioned + .newScan() + .ignoreResiduals() + .filter(Expressions.equal("data", "0")) + .planFiles(); for (FileScanTask task : tasks) { Assert.assertEquals("Residuals must be ignored", Expressions.alwaysTrue(), task.residual()); } - List dataFiles = Lists.newArrayList(CloseableIterable.transform(tasks, FileScanTask::file)); + List dataFiles = + Lists.newArrayList(CloseableIterable.transform(tasks, FileScanTask::file)); Assert.assertEquals("Should have 2 data files before rewrite", 2, dataFiles.size()); Actions actions = Actions.forTable(icebergTableUnPartitioned); - RewriteDataFilesActionResult result = actions - .rewriteDataFiles() - .filter(Expressions.equal("data", "0")) - .execute(); + RewriteDataFilesActionResult result = + actions.rewriteDataFiles().filter(Expressions.equal("data", "0")).execute(); Assert.assertEquals("Action should rewrite 2 data files", 2, result.deletedDataFiles().size()); Assert.assertEquals("Action should add 1 data file", 1, result.addedDataFiles().size()); @@ -310,13 +309,14 @@ public void testRewriteLargeTableHasResiduals() throws IOException { /** * a test case to test avoid repeate compress - *
<p>
    - * If datafile cannot be combined to CombinedScanTask with other DataFiles, the size of the CombinedScanTask list size - * is 1, so we remove these CombinedScanTasks to avoid compressed repeatedly. - *
<p>
    - * In this test case,we generated 3 data files and set targetSizeInBytes greater than the largest file size so that it - * cannot be combined a CombinedScanTask with other datafiles. The datafile with the largest file size will not be - * compressed. + * + *
<p>
    If datafile cannot be combined to CombinedScanTask with other DataFiles, the size of the + * CombinedScanTask list size is 1, so we remove these CombinedScanTasks to avoid compressed + * repeatedly. + * + *
<p>
    In this test case,we generated 3 data files and set targetSizeInBytes greater than the + * largest file size so that it cannot be combined a CombinedScanTask with other datafiles. The + * datafile with the largest file size will not be compressed. * * @throws IOException IOException */ @@ -327,7 +327,8 @@ public void testRewriteAvoidRepeateCompress() throws IOException { GenericAppenderFactory genericAppenderFactory = new GenericAppenderFactory(schema); File file = temp.newFile(); int count = 0; - try (FileAppender fileAppender = genericAppenderFactory.newAppender(Files.localOutput(file), format)) { + try (FileAppender fileAppender = + genericAppenderFactory.newAppender(Files.localOutput(file), format)) { long filesize = 20000; for (; fileAppender.length() < filesize; count++) { Record record = SimpleDataUtil.createRecord(count, UUID.randomUUID().toString()); @@ -336,16 +337,15 @@ public void testRewriteAvoidRepeateCompress() throws IOException { } } - DataFile dataFile = DataFiles.builder(icebergTableUnPartitioned.spec()) - .withPath(file.getAbsolutePath()) - .withFileSizeInBytes(file.length()) - .withFormat(format) - .withRecordCount(count) - .build(); + DataFile dataFile = + DataFiles.builder(icebergTableUnPartitioned.spec()) + .withPath(file.getAbsolutePath()) + .withFileSizeInBytes(file.length()) + .withFormat(format) + .withRecordCount(count) + .build(); - icebergTableUnPartitioned.newAppend() - .appendFile(dataFile) - .commit(); + icebergTableUnPartitioned.newAppend().appendFile(dataFile).commit(); sql("INSERT INTO %s SELECT 1,'a' ", TABLE_NAME_UNPARTITIONED); sql("INSERT INTO %s SELECT 2,'b' ", TABLE_NAME_UNPARTITIONED); @@ -353,28 +353,32 @@ public void testRewriteAvoidRepeateCompress() throws IOException { icebergTableUnPartitioned.refresh(); CloseableIterable tasks = icebergTableUnPartitioned.newScan().planFiles(); - List dataFiles = Lists.newArrayList(CloseableIterable.transform(tasks, FileScanTask::file)); + List dataFiles = + Lists.newArrayList(CloseableIterable.transform(tasks, FileScanTask::file)); Assert.assertEquals("Should have 3 data files before rewrite", 3, dataFiles.size()); Actions actions = Actions.forTable(icebergTableUnPartitioned); long targetSizeInBytes = file.length() + 10; - RewriteDataFilesActionResult result = actions - .rewriteDataFiles() - .targetSizeInBytes(targetSizeInBytes) - .splitOpenFileCost(1) - .execute(); + RewriteDataFilesActionResult result = + actions + .rewriteDataFiles() + .targetSizeInBytes(targetSizeInBytes) + .splitOpenFileCost(1) + .execute(); Assert.assertEquals("Action should rewrite 2 data files", 2, result.deletedDataFiles().size()); Assert.assertEquals("Action should add 1 data file", 1, result.addedDataFiles().size()); icebergTableUnPartitioned.refresh(); CloseableIterable tasks1 = icebergTableUnPartitioned.newScan().planFiles(); - List dataFilesRewrote = Lists.newArrayList(CloseableIterable.transform(tasks1, FileScanTask::file)); + List dataFilesRewrote = + Lists.newArrayList(CloseableIterable.transform(tasks1, FileScanTask::file)); Assert.assertEquals("Should have 2 data files after rewrite", 2, dataFilesRewrote.size()); // the biggest file do not be rewrote - List rewroteDataFileNames = dataFilesRewrote.stream().map(ContentFile::path).collect(Collectors.toList()); + List rewroteDataFileNames = + dataFilesRewrote.stream().map(ContentFile::path).collect(Collectors.toList()); Assert.assertTrue(rewroteDataFileNames.contains(file.getAbsolutePath())); // Assert the table records as expected. 
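// Illustrative sketch, not part of this diff: the Flink rewrite-action call chain exercised by
// TestRewriteDataFilesAction above, collected in one place. `table` is an already-loaded
// Iceberg Table; the target size and open-file cost are arbitrary example values.
import org.apache.iceberg.Table;
import org.apache.iceberg.flink.actions.Actions;

class RewriteDataFilesSketch {
  static void compactSmallFiles(Table table) {
    // Combined tasks that would contain only a single data file are skipped by the action
    // (see testRewriteAvoidRepeateCompress), so a file that is already larger than the target
    // size is not rewritten again.
    Actions.forTable(table)
        .rewriteDataFiles()
        .targetSizeInBytes(128L * 1024 * 1024)
        .splitOpenFileCost(1L)
        .execute();
  }
}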
diff --git a/flink/v1.14/flink/src/test/java/org/apache/iceberg/flink/data/RandomRowData.java b/flink/v1.14/flink/src/test/java/org/apache/iceberg/flink/data/RandomRowData.java index b6fee9259f53..cc58d9817ac6 100644 --- a/flink/v1.14/flink/src/test/java/org/apache/iceberg/flink/data/RandomRowData.java +++ b/flink/v1.14/flink/src/test/java/org/apache/iceberg/flink/data/RandomRowData.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.flink.data; import org.apache.flink.table.data.RowData; @@ -27,8 +26,7 @@ import org.apache.iceberg.relocated.com.google.common.collect.Iterables; public class RandomRowData { - private RandomRowData() { - } + private RandomRowData() {} public static Iterable generate(Schema schema, int numRecords, long seed) { return convert(schema, RandomGenericData.generate(schema, numRecords, seed)); diff --git a/flink/v1.14/flink/src/test/java/org/apache/iceberg/flink/data/RowDataToRowMapper.java b/flink/v1.14/flink/src/test/java/org/apache/iceberg/flink/data/RowDataToRowMapper.java index 549a6ed3a586..74b1da6007e6 100644 --- a/flink/v1.14/flink/src/test/java/org/apache/iceberg/flink/data/RowDataToRowMapper.java +++ b/flink/v1.14/flink/src/test/java/org/apache/iceberg/flink/data/RowDataToRowMapper.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.flink.data; import org.apache.flink.api.common.functions.RichMapFunction; @@ -40,8 +39,8 @@ public RowDataToRowMapper(RowType rowType) { @Override public void open(Configuration parameters) throws Exception { - this.converter = DataStructureConverters.getConverter( - TypeConversions.fromLogicalToDataType(rowType)); + this.converter = + DataStructureConverters.getConverter(TypeConversions.fromLogicalToDataType(rowType)); } @Override diff --git a/flink/v1.14/flink/src/test/java/org/apache/iceberg/flink/data/TestFlinkAvroReaderWriter.java b/flink/v1.14/flink/src/test/java/org/apache/iceberg/flink/data/TestFlinkAvroReaderWriter.java index 8ea9e06a9f8d..e8aab824ea2d 100644 --- a/flink/v1.14/flink/src/test/java/org/apache/iceberg/flink/data/TestFlinkAvroReaderWriter.java +++ b/flink/v1.14/flink/src/test/java/org/apache/iceberg/flink/data/TestFlinkAvroReaderWriter.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.flink.data; import java.io.File; @@ -52,17 +51,17 @@ public class TestFlinkAvroReaderWriter extends DataTest { private static final int NUM_RECORDS = 100; - private static final Schema SCHEMA_NUM_TYPE = new Schema( - Types.NestedField.optional(1, "id", Types.IntegerType.get()), - Types.NestedField.optional(2, "int", Types.IntegerType.get()), - Types.NestedField.optional(3, "float", Types.FloatType.get()), - Types.NestedField.optional(4, "double", Types.DoubleType.get()), - Types.NestedField.optional(5, "date", Types.DateType.get()), - Types.NestedField.optional(6, "time", Types.TimeType.get()), - Types.NestedField.optional(7, "timestamp", Types.TimestampType.withoutZone()), - Types.NestedField.optional(8, "bigint", Types.LongType.get()), - Types.NestedField.optional(9, "decimal", Types.DecimalType.of(4, 2)) - ); + private static final Schema SCHEMA_NUM_TYPE = + new Schema( + Types.NestedField.optional(1, "id", Types.IntegerType.get()), + Types.NestedField.optional(2, "int", Types.IntegerType.get()), + Types.NestedField.optional(3, "float", Types.FloatType.get()), + Types.NestedField.optional(4, "double", Types.DoubleType.get()), + Types.NestedField.optional(5, "date", Types.DateType.get()), + Types.NestedField.optional(6, "time", Types.TimeType.get()), + Types.NestedField.optional(7, "timestamp", Types.TimestampType.withoutZone()), + Types.NestedField.optional(8, "bigint", Types.LongType.get()), + Types.NestedField.optional(9, "decimal", Types.DecimalType.of(4, 2))); @Override protected void writeAndValidate(Schema schema) throws IOException { @@ -70,25 +69,29 @@ protected void writeAndValidate(Schema schema) throws IOException { writeAndValidate(schema, expectedRecords, NUM_RECORDS); } - private void writeAndValidate(Schema schema, List expectedRecords, int numRecord) throws IOException { + private void writeAndValidate(Schema schema, List expectedRecords, int numRecord) + throws IOException { RowType flinkSchema = FlinkSchemaUtil.convert(schema); List expectedRows = Lists.newArrayList(RandomRowData.convert(schema, expectedRecords)); File recordsFile = temp.newFile(); Assert.assertTrue("Delete should succeed", recordsFile.delete()); - // Write the expected records into AVRO file, then read them into RowData and assert with the expected Record list. - try (FileAppender writer = Avro.write(Files.localOutput(recordsFile)) - .schema(schema) - .createWriterFunc(DataWriter::create) - .build()) { + // Write the expected records into AVRO file, then read them into RowData and assert with the + // expected Record list. + try (FileAppender writer = + Avro.write(Files.localOutput(recordsFile)) + .schema(schema) + .createWriterFunc(DataWriter::create) + .build()) { writer.addAll(expectedRecords); } - try (CloseableIterable reader = Avro.read(Files.localInput(recordsFile)) - .project(schema) - .createReaderFunc(FlinkAvroReader::new) - .build()) { + try (CloseableIterable reader = + Avro.read(Files.localInput(recordsFile)) + .project(schema) + .createReaderFunc(FlinkAvroReader::new) + .build()) { Iterator expected = expectedRecords.iterator(); Iterator rows = reader.iterator(); for (int i = 0; i < numRecord; i++) { @@ -101,18 +104,21 @@ private void writeAndValidate(Schema schema, List expectedRecords, int n File rowDataFile = temp.newFile(); Assert.assertTrue("Delete should succeed", rowDataFile.delete()); - // Write the expected RowData into AVRO file, then read them into Record and assert with the expected RowData list. 
- try (FileAppender writer = Avro.write(Files.localOutput(rowDataFile)) - .schema(schema) - .createWriterFunc(ignore -> new FlinkAvroWriter(flinkSchema)) - .build()) { + // Write the expected RowData into AVRO file, then read them into Record and assert with the + // expected RowData list. + try (FileAppender writer = + Avro.write(Files.localOutput(rowDataFile)) + .schema(schema) + .createWriterFunc(ignore -> new FlinkAvroWriter(flinkSchema)) + .build()) { writer.addAll(expectedRows); } - try (CloseableIterable reader = Avro.read(Files.localInput(rowDataFile)) - .project(schema) - .createReaderFunc(DataReader::create) - .build()) { + try (CloseableIterable reader = + Avro.read(Files.localInput(rowDataFile)) + .project(schema) + .createReaderFunc(DataReader::create) + .build()) { Iterator expected = expectedRows.iterator(); Iterator records = reader.iterator(); for (int i = 0; i < numRecord; i += 1) { @@ -124,14 +130,22 @@ private void writeAndValidate(Schema schema, List expectedRecords, int n } private Record recordNumType( - int id, int intV, float floatV, double doubleV, long date, long time, long timestamp, - long bigint, double decimal) { + int id, + int intV, + float floatV, + double doubleV, + long date, + long time, + long timestamp, + long bigint, + double decimal) { Record record = GenericRecord.create(SCHEMA_NUM_TYPE); record.setField("id", id); record.setField("int", intV); record.setField("float", floatV); record.setField("double", doubleV); - record.setField("date", DateTimeUtil.dateFromDays((int) new Date(date).toLocalDate().toEpochDay())); + record.setField( + "date", DateTimeUtil.dateFromDays((int) new Date(date).toLocalDate().toEpochDay())); record.setField("time", new Time(time).toLocalTime()); record.setField("timestamp", DateTimeUtil.timestampFromMicros(timestamp * 1000)); record.setField("bigint", bigint); @@ -142,11 +156,28 @@ private Record recordNumType( @Test public void testNumericTypes() throws IOException { - List expected = ImmutableList.of( - recordNumType(2, Integer.MAX_VALUE, Float.MAX_VALUE, Double.MAX_VALUE, Long.MAX_VALUE, - 1643811742000L, 1643811742000L, 1643811742000L, 10.24d), - recordNumType(2, Integer.MIN_VALUE, Float.MIN_VALUE, Double.MIN_VALUE, Long.MIN_VALUE, - 1643811742000L, 1643811742000L, 1643811742000L, 10.24d)); + List expected = + ImmutableList.of( + recordNumType( + 2, + Integer.MAX_VALUE, + Float.MAX_VALUE, + Double.MAX_VALUE, + Long.MAX_VALUE, + 1643811742000L, + 1643811742000L, + 1643811742000L, + 10.24d), + recordNumType( + 2, + Integer.MIN_VALUE, + Float.MIN_VALUE, + Double.MIN_VALUE, + Long.MIN_VALUE, + 1643811742000L, + 1643811742000L, + 1643811742000L, + 10.24d)); writeAndValidate(SCHEMA_NUM_TYPE, expected, 2); } diff --git a/flink/v1.14/flink/src/test/java/org/apache/iceberg/flink/data/TestFlinkOrcReaderWriter.java b/flink/v1.14/flink/src/test/java/org/apache/iceberg/flink/data/TestFlinkOrcReaderWriter.java index 5f4a7c00d1c8..fdffc0e01c20 100644 --- a/flink/v1.14/flink/src/test/java/org/apache/iceberg/flink/data/TestFlinkOrcReaderWriter.java +++ b/flink/v1.14/flink/src/test/java/org/apache/iceberg/flink/data/TestFlinkOrcReaderWriter.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
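// Illustrative sketch, not part of this diff: the Avro round trip described by the comments in
// TestFlinkAvroReaderWriter above -- write generic Records, then read the same file back as
// Flink RowData. `schema`, `records`, and `avroFile` are assumed inputs; the caller closes the
// returned iterable.
import java.io.File;
import java.io.IOException;
import java.util.List;
import org.apache.flink.table.data.RowData;
import org.apache.iceberg.Files;
import org.apache.iceberg.Schema;
import org.apache.iceberg.avro.Avro;
import org.apache.iceberg.data.Record;
import org.apache.iceberg.data.avro.DataWriter;
import org.apache.iceberg.flink.data.FlinkAvroReader;
import org.apache.iceberg.io.CloseableIterable;
import org.apache.iceberg.io.FileAppender;

class AvroRoundTripSketch {
  static CloseableIterable<RowData> writeRecordsReadRowData(
      Schema schema, List<Record> records, File avroFile) throws IOException {
    // Write generic Records with the generic Avro DataWriter ...
    try (FileAppender<Record> writer =
        Avro.write(Files.localOutput(avroFile))
            .schema(schema)
            .createWriterFunc(DataWriter::create)
            .build()) {
      writer.addAll(records);
    }

    // ... and read them back through the Flink-specific reader, which yields RowData.
    return Avro.read(Files.localInput(avroFile))
        .project(schema)
        .createReaderFunc(FlinkAvroReader::new)
        .build();
  }
}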
*/ - package org.apache.iceberg.flink.data; import java.io.File; @@ -52,18 +51,21 @@ protected void writeAndValidate(Schema schema) throws IOException { File recordsFile = temp.newFile(); Assert.assertTrue("Delete should succeed", recordsFile.delete()); - // Write the expected records into ORC file, then read them into RowData and assert with the expected Record list. - try (FileAppender writer = ORC.write(Files.localOutput(recordsFile)) - .schema(schema) - .createWriterFunc(GenericOrcWriter::buildWriter) - .build()) { + // Write the expected records into ORC file, then read them into RowData and assert with the + // expected Record list. + try (FileAppender writer = + ORC.write(Files.localOutput(recordsFile)) + .schema(schema) + .createWriterFunc(GenericOrcWriter::buildWriter) + .build()) { writer.addAll(expectedRecords); } - try (CloseableIterable reader = ORC.read(Files.localInput(recordsFile)) - .project(schema) - .createReaderFunc(type -> new FlinkOrcReader(schema, type)) - .build()) { + try (CloseableIterable reader = + ORC.read(Files.localInput(recordsFile)) + .project(schema) + .createReaderFunc(type -> new FlinkOrcReader(schema, type)) + .build()) { Iterator expected = expectedRecords.iterator(); Iterator rows = reader.iterator(); for (int i = 0; i < NUM_RECORDS; i++) { @@ -76,19 +78,22 @@ protected void writeAndValidate(Schema schema) throws IOException { File rowDataFile = temp.newFile(); Assert.assertTrue("Delete should succeed", rowDataFile.delete()); - // Write the expected RowData into ORC file, then read them into Record and assert with the expected RowData list. + // Write the expected RowData into ORC file, then read them into Record and assert with the + // expected RowData list. RowType rowType = FlinkSchemaUtil.convert(schema); - try (FileAppender writer = ORC.write(Files.localOutput(rowDataFile)) - .schema(schema) - .createWriterFunc((iSchema, typeDesc) -> FlinkOrcWriter.buildWriter(rowType, iSchema)) - .build()) { + try (FileAppender writer = + ORC.write(Files.localOutput(rowDataFile)) + .schema(schema) + .createWriterFunc((iSchema, typeDesc) -> FlinkOrcWriter.buildWriter(rowType, iSchema)) + .build()) { writer.addAll(expectedRows); } - try (CloseableIterable reader = ORC.read(Files.localInput(rowDataFile)) - .project(schema) - .createReaderFunc(type -> GenericOrcReader.buildReader(schema, type)) - .build()) { + try (CloseableIterable reader = + ORC.read(Files.localInput(rowDataFile)) + .project(schema) + .createReaderFunc(type -> GenericOrcReader.buildReader(schema, type)) + .build()) { Iterator expected = expectedRows.iterator(); Iterator records = reader.iterator(); for (int i = 0; i < NUM_RECORDS; i += 1) { diff --git a/flink/v1.14/flink/src/test/java/org/apache/iceberg/flink/data/TestFlinkParquetReader.java b/flink/v1.14/flink/src/test/java/org/apache/iceberg/flink/data/TestFlinkParquetReader.java index 6c3a4e34efaa..30a2a7bb51ce 100644 --- a/flink/v1.14/flink/src/test/java/org/apache/iceberg/flink/data/TestFlinkParquetReader.java +++ b/flink/v1.14/flink/src/test/java/org/apache/iceberg/flink/data/TestFlinkParquetReader.java @@ -16,9 +16,10 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.flink.data; +import static org.apache.iceberg.types.Types.NestedField.optional; + import java.io.File; import java.io.IOException; import java.nio.ByteBuffer; @@ -49,28 +50,27 @@ import org.junit.Assert; import org.junit.Test; -import static org.apache.iceberg.types.Types.NestedField.optional; - public class TestFlinkParquetReader extends DataTest { private static final int NUM_RECORDS = 100; @Test public void testTwoLevelList() throws IOException { - Schema schema = new Schema( - optional(1, "arraybytes", Types.ListType.ofRequired(3, Types.BinaryType.get())), - optional(2, "topbytes", Types.BinaryType.get()) - ); + Schema schema = + new Schema( + optional(1, "arraybytes", Types.ListType.ofRequired(3, Types.BinaryType.get())), + optional(2, "topbytes", Types.BinaryType.get())); org.apache.avro.Schema avroSchema = AvroSchemaUtil.convert(schema.asStruct()); File testFile = temp.newFile(); Assert.assertTrue(testFile.delete()); - ParquetWriter writer = AvroParquetWriter.builder(new Path(testFile.toURI())) - .withDataModel(GenericData.get()) - .withSchema(avroSchema) - .config("parquet.avro.add-list-element-records", "true") - .config("parquet.avro.write-old-list-structure", "true") - .build(); + ParquetWriter writer = + AvroParquetWriter.builder(new Path(testFile.toURI())) + .withDataModel(GenericData.get()) + .withSchema(avroSchema) + .config("parquet.avro.add-list-element-records", "true") + .config("parquet.avro.write-old-list-structure", "true") + .build(); GenericRecordBuilder recordBuilder = new GenericRecordBuilder(avroSchema); List expectedByteList = Lists.newArrayList(); @@ -84,10 +84,11 @@ public void testTwoLevelList() throws IOException { writer.write(expectedRecord); writer.close(); - try (CloseableIterable reader = Parquet.read(Files.localInput(testFile)) - .project(schema) - .createReaderFunc(type -> FlinkParquetReaders.buildReader(schema, type)) - .build()) { + try (CloseableIterable reader = + Parquet.read(Files.localInput(testFile)) + .project(schema) + .createReaderFunc(type -> FlinkParquetReaders.buildReader(schema, type)) + .build()) { Iterator rows = reader.iterator(); Assert.assertTrue("Should have at least one row", rows.hasNext()); RowData rowData = rows.next(); @@ -101,17 +102,19 @@ private void writeAndValidate(Iterable iterable, Schema schema) throws I File testFile = temp.newFile(); Assert.assertTrue("Delete should succeed", testFile.delete()); - try (FileAppender writer = Parquet.write(Files.localOutput(testFile)) - .schema(schema) - .createWriterFunc(GenericParquetWriter::buildWriter) - .build()) { + try (FileAppender writer = + Parquet.write(Files.localOutput(testFile)) + .schema(schema) + .createWriterFunc(GenericParquetWriter::buildWriter) + .build()) { writer.addAll(iterable); } - try (CloseableIterable reader = Parquet.read(Files.localInput(testFile)) - .project(schema) - .createReaderFunc(type -> FlinkParquetReaders.buildReader(schema, type)) - .build()) { + try (CloseableIterable reader = + Parquet.read(Files.localInput(testFile)) + .project(schema) + .createReaderFunc(type -> FlinkParquetReaders.buildReader(schema, type)) + .build()) { Iterator expected = iterable.iterator(); Iterator rows = reader.iterator(); LogicalType rowType = FlinkSchemaUtil.convert(schema); @@ -126,7 +129,10 @@ private void writeAndValidate(Iterable iterable, Schema schema) throws I @Override protected void writeAndValidate(Schema schema) throws IOException { writeAndValidate(RandomGenericData.generate(schema, NUM_RECORDS, 19981), schema); - 
writeAndValidate(RandomGenericData.generateDictionaryEncodableRecords(schema, NUM_RECORDS, 21124), schema); - writeAndValidate(RandomGenericData.generateFallbackRecords(schema, NUM_RECORDS, 21124, NUM_RECORDS / 20), schema); + writeAndValidate( + RandomGenericData.generateDictionaryEncodableRecords(schema, NUM_RECORDS, 21124), schema); + writeAndValidate( + RandomGenericData.generateFallbackRecords(schema, NUM_RECORDS, 21124, NUM_RECORDS / 20), + schema); } } diff --git a/flink/v1.14/flink/src/test/java/org/apache/iceberg/flink/data/TestFlinkParquetWriter.java b/flink/v1.14/flink/src/test/java/org/apache/iceberg/flink/data/TestFlinkParquetWriter.java index 1db0f8767518..7b868eafc311 100644 --- a/flink/v1.14/flink/src/test/java/org/apache/iceberg/flink/data/TestFlinkParquetWriter.java +++ b/flink/v1.14/flink/src/test/java/org/apache/iceberg/flink/data/TestFlinkParquetWriter.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.flink.data; import java.io.File; @@ -42,8 +41,7 @@ public class TestFlinkParquetWriter extends DataTest { private static final int NUM_RECORDS = 100; - @Rule - public TemporaryFolder temp = new TemporaryFolder(); + @Rule public TemporaryFolder temp = new TemporaryFolder(); private void writeAndValidate(Iterable iterable, Schema schema) throws IOException { File testFile = temp.newFile(); @@ -51,17 +49,19 @@ private void writeAndValidate(Iterable iterable, Schema schema) throws LogicalType logicalType = FlinkSchemaUtil.convert(schema); - try (FileAppender writer = Parquet.write(Files.localOutput(testFile)) - .schema(schema) - .createWriterFunc(msgType -> FlinkParquetWriters.buildWriter(logicalType, msgType)) - .build()) { + try (FileAppender writer = + Parquet.write(Files.localOutput(testFile)) + .schema(schema) + .createWriterFunc(msgType -> FlinkParquetWriters.buildWriter(logicalType, msgType)) + .build()) { writer.addAll(iterable); } - try (CloseableIterable reader = Parquet.read(Files.localInput(testFile)) - .project(schema) - .createReaderFunc(msgType -> GenericParquetReaders.buildReader(schema, msgType)) - .build()) { + try (CloseableIterable reader = + Parquet.read(Files.localInput(testFile)) + .project(schema) + .createReaderFunc(msgType -> GenericParquetReaders.buildReader(schema, msgType)) + .build()) { Iterator expected = iterable.iterator(); Iterator actual = reader.iterator(); LogicalType rowType = FlinkSchemaUtil.convert(schema); @@ -75,15 +75,19 @@ private void writeAndValidate(Iterable iterable, Schema schema) throws @Override protected void writeAndValidate(Schema schema) throws IOException { - writeAndValidate( - RandomRowData.generate(schema, NUM_RECORDS, 19981), schema); + writeAndValidate(RandomRowData.generate(schema, NUM_RECORDS, 19981), schema); - writeAndValidate(RandomRowData.convert(schema, - RandomGenericData.generateDictionaryEncodableRecords(schema, NUM_RECORDS, 21124)), + writeAndValidate( + RandomRowData.convert( + schema, + RandomGenericData.generateDictionaryEncodableRecords(schema, NUM_RECORDS, 21124)), schema); - writeAndValidate(RandomRowData.convert(schema, - RandomGenericData.generateFallbackRecords(schema, NUM_RECORDS, 21124, NUM_RECORDS / 20)), + writeAndValidate( + RandomRowData.convert( + schema, + RandomGenericData.generateFallbackRecords( + schema, NUM_RECORDS, 21124, NUM_RECORDS / 20)), schema); } } diff --git a/flink/v1.14/flink/src/test/java/org/apache/iceberg/flink/data/TestRowDataProjection.java 
b/flink/v1.14/flink/src/test/java/org/apache/iceberg/flink/data/TestRowDataProjection.java index 37016adfbdf2..4cb77b11fd7b 100644 --- a/flink/v1.14/flink/src/test/java/org/apache/iceberg/flink/data/TestRowDataProjection.java +++ b/flink/v1.14/flink/src/test/java/org/apache/iceberg/flink/data/TestRowDataProjection.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.flink.data; import java.util.Iterator; @@ -36,98 +35,96 @@ public class TestRowDataProjection { @Test public void testFullProjection() { - Schema schema = new Schema( - Types.NestedField.required(0, "id", Types.LongType.get()), - Types.NestedField.optional(1, "data", Types.StringType.get()) - ); + Schema schema = + new Schema( + Types.NestedField.required(0, "id", Types.LongType.get()), + Types.NestedField.optional(1, "data", Types.StringType.get())); generateAndValidate(schema, schema); } @Test public void testReorderedFullProjection() { - Schema schema = new Schema( - Types.NestedField.required(0, "id", Types.LongType.get()), - Types.NestedField.optional(1, "data", Types.StringType.get()) - ); + Schema schema = + new Schema( + Types.NestedField.required(0, "id", Types.LongType.get()), + Types.NestedField.optional(1, "data", Types.StringType.get())); - Schema reordered = new Schema( - Types.NestedField.optional(1, "data", Types.StringType.get()), - Types.NestedField.required(0, "id", Types.LongType.get()) - ); + Schema reordered = + new Schema( + Types.NestedField.optional(1, "data", Types.StringType.get()), + Types.NestedField.required(0, "id", Types.LongType.get())); generateAndValidate(schema, reordered); } @Test public void testBasicProjection() { - Schema schema = new Schema( - Types.NestedField.required(0, "id", Types.LongType.get()), - Types.NestedField.optional(1, "data", Types.StringType.get()) - ); - Schema id = new Schema( - Types.NestedField.required(0, "id", Types.LongType.get()) - ); - Schema data = new Schema( - Types.NestedField.optional(1, "data", Types.StringType.get()) - ); + Schema schema = + new Schema( + Types.NestedField.required(0, "id", Types.LongType.get()), + Types.NestedField.optional(1, "data", Types.StringType.get())); + Schema id = new Schema(Types.NestedField.required(0, "id", Types.LongType.get())); + Schema data = new Schema(Types.NestedField.optional(1, "data", Types.StringType.get())); generateAndValidate(schema, id); generateAndValidate(schema, data); } @Test public void testEmptyProjection() { - Schema schema = new Schema( - Types.NestedField.required(0, "id", Types.LongType.get()), - Types.NestedField.optional(1, "data", Types.StringType.get()) - ); + Schema schema = + new Schema( + Types.NestedField.required(0, "id", Types.LongType.get()), + Types.NestedField.optional(1, "data", Types.StringType.get())); generateAndValidate(schema, schema.select()); } @Test public void testRename() { - Schema schema = new Schema( - Types.NestedField.required(0, "id", Types.LongType.get()), - Types.NestedField.optional(1, "data", Types.StringType.get()) - ); - - Schema renamed = new Schema( - Types.NestedField.required(0, "id", Types.LongType.get()), - Types.NestedField.optional(1, "renamed", Types.StringType.get()) - ); + Schema schema = + new Schema( + Types.NestedField.required(0, "id", Types.LongType.get()), + Types.NestedField.optional(1, "data", Types.StringType.get())); + + Schema renamed = + new Schema( + Types.NestedField.required(0, "id", Types.LongType.get()), + Types.NestedField.optional(1, "renamed", 
Types.StringType.get())); generateAndValidate(schema, renamed); } @Test public void testNestedProjection() { - Schema schema = new Schema( - Types.NestedField.required(0, "id", Types.LongType.get()), - Types.NestedField.optional(3, "location", Types.StructType.of( - Types.NestedField.required(1, "lat", Types.FloatType.get()), - Types.NestedField.required(2, "long", Types.FloatType.get()) - )) - ); + Schema schema = + new Schema( + Types.NestedField.required(0, "id", Types.LongType.get()), + Types.NestedField.optional( + 3, + "location", + Types.StructType.of( + Types.NestedField.required(1, "lat", Types.FloatType.get()), + Types.NestedField.required(2, "long", Types.FloatType.get())))); // Project id only. - Schema idOnly = new Schema( - Types.NestedField.required(0, "id", Types.LongType.get()) - ); + Schema idOnly = new Schema(Types.NestedField.required(0, "id", Types.LongType.get())); generateAndValidate(schema, idOnly); // Project lat only. - Schema latOnly = new Schema( - Types.NestedField.optional(3, "location", Types.StructType.of( - Types.NestedField.required(1, "lat", Types.FloatType.get()) - )) - ); + Schema latOnly = + new Schema( + Types.NestedField.optional( + 3, + "location", + Types.StructType.of(Types.NestedField.required(1, "lat", Types.FloatType.get())))); generateAndValidate(schema, latOnly); // Project long only. - Schema longOnly = new Schema( - Types.NestedField.optional(3, "location", Types.StructType.of( - Types.NestedField.required(2, "long", Types.FloatType.get()) - )) - ); + Schema longOnly = + new Schema( + Types.NestedField.optional( + 3, + "location", + Types.StructType.of(Types.NestedField.required(2, "long", Types.FloatType.get())))); generateAndValidate(schema, longOnly); // Project location. @@ -137,37 +134,40 @@ public void testNestedProjection() { @Test public void testPrimitiveTypeProjection() { - Schema schema = new Schema( - Types.NestedField.required(0, "id", Types.LongType.get()), - Types.NestedField.optional(1, "data", Types.StringType.get()), - Types.NestedField.required(2, "b", Types.BooleanType.get()), - Types.NestedField.optional(3, "i", Types.IntegerType.get()), - Types.NestedField.required(4, "l", Types.LongType.get()), - Types.NestedField.optional(5, "f", Types.FloatType.get()), - Types.NestedField.required(6, "d", Types.DoubleType.get()), - Types.NestedField.optional(7, "date", Types.DateType.get()), - Types.NestedField.optional(8, "time", Types.TimeType.get()), - Types.NestedField.required(9, "ts", Types.TimestampType.withoutZone()), - Types.NestedField.required(10, "ts_tz", Types.TimestampType.withZone()), - Types.NestedField.required(11, "s", Types.StringType.get()), - Types.NestedField.required(12, "fixed", Types.FixedType.ofLength(7)), - Types.NestedField.optional(13, "bytes", Types.BinaryType.get()), - Types.NestedField.required(14, "dec_9_0", Types.DecimalType.of(9, 0)), - Types.NestedField.required(15, "dec_11_2", Types.DecimalType.of(11, 2)), - Types.NestedField.required(16, "dec_38_10", Types.DecimalType.of(38, 10))// maximum precision - ); + Schema schema = + new Schema( + Types.NestedField.required(0, "id", Types.LongType.get()), + Types.NestedField.optional(1, "data", Types.StringType.get()), + Types.NestedField.required(2, "b", Types.BooleanType.get()), + Types.NestedField.optional(3, "i", Types.IntegerType.get()), + Types.NestedField.required(4, "l", Types.LongType.get()), + Types.NestedField.optional(5, "f", Types.FloatType.get()), + Types.NestedField.required(6, "d", Types.DoubleType.get()), + Types.NestedField.optional(7, 
"date", Types.DateType.get()), + Types.NestedField.optional(8, "time", Types.TimeType.get()), + Types.NestedField.required(9, "ts", Types.TimestampType.withoutZone()), + Types.NestedField.required(10, "ts_tz", Types.TimestampType.withZone()), + Types.NestedField.required(11, "s", Types.StringType.get()), + Types.NestedField.required(12, "fixed", Types.FixedType.ofLength(7)), + Types.NestedField.optional(13, "bytes", Types.BinaryType.get()), + Types.NestedField.required(14, "dec_9_0", Types.DecimalType.of(9, 0)), + Types.NestedField.required(15, "dec_11_2", Types.DecimalType.of(11, 2)), + Types.NestedField.required( + 16, "dec_38_10", Types.DecimalType.of(38, 10)) // maximum precision + ); generateAndValidate(schema, schema); } @Test public void testPrimitiveMapTypeProjection() { - Schema schema = new Schema( - Types.NestedField.required(0, "id", Types.LongType.get()), - Types.NestedField.optional(3, "map", Types.MapType.ofOptional( - 1, 2, Types.IntegerType.get(), Types.StringType.get() - )) - ); + Schema schema = + new Schema( + Types.NestedField.required(0, "id", Types.LongType.get()), + Types.NestedField.optional( + 3, + "map", + Types.MapType.ofOptional(1, 2, Types.IntegerType.get(), Types.StringType.get()))); // Project id only. Schema idOnly = schema.select("id"); @@ -183,20 +183,21 @@ public void testPrimitiveMapTypeProjection() { @Test public void testNestedMapTypeProjection() { - Schema schema = new Schema( - Types.NestedField.required(0, "id", Types.LongType.get()), - Types.NestedField.optional(7, "map", Types.MapType.ofOptional( - 5, 6, - Types.StructType.of( - Types.NestedField.required(1, "key", Types.LongType.get()), - Types.NestedField.required(2, "keyData", Types.LongType.get()) - ), - Types.StructType.of( - Types.NestedField.required(3, "value", Types.LongType.get()), - Types.NestedField.required(4, "valueData", Types.LongType.get()) - ) - )) - ); + Schema schema = + new Schema( + Types.NestedField.required(0, "id", Types.LongType.get()), + Types.NestedField.optional( + 7, + "map", + Types.MapType.ofOptional( + 5, + 6, + Types.StructType.of( + Types.NestedField.required(1, "key", Types.LongType.get()), + Types.NestedField.required(2, "keyData", Types.LongType.get())), + Types.StructType.of( + Types.NestedField.required(3, "value", Types.LongType.get()), + Types.NestedField.required(4, "valueData", Types.LongType.get()))))); // Project id only. Schema idOnly = schema.select("id"); @@ -210,50 +211,52 @@ public void testNestedMapTypeProjection() { generateAndValidate(schema, schema); // Project partial map key. 
- Schema partialMapKey = new Schema( - Types.NestedField.optional(7, "map", Types.MapType.ofOptional( - 5, 6, - Types.StructType.of( - Types.NestedField.required(1, "key", Types.LongType.get()) - ), - Types.StructType.of( - Types.NestedField.required(3, "value", Types.LongType.get()), - Types.NestedField.required(4, "valueData", Types.LongType.get()) - ) - )) - ); - AssertHelpers.assertThrows("Should not allow to project a partial map key with non-primitive type.", - IllegalArgumentException.class, "Cannot project a partial map key or value", - () -> generateAndValidate(schema, partialMapKey) - ); + Schema partialMapKey = + new Schema( + Types.NestedField.optional( + 7, + "map", + Types.MapType.ofOptional( + 5, + 6, + Types.StructType.of(Types.NestedField.required(1, "key", Types.LongType.get())), + Types.StructType.of( + Types.NestedField.required(3, "value", Types.LongType.get()), + Types.NestedField.required(4, "valueData", Types.LongType.get()))))); + AssertHelpers.assertThrows( + "Should not allow to project a partial map key with non-primitive type.", + IllegalArgumentException.class, + "Cannot project a partial map key or value", + () -> generateAndValidate(schema, partialMapKey)); // Project partial map key. - Schema partialMapValue = new Schema( - Types.NestedField.optional(7, "map", Types.MapType.ofOptional( - 5, 6, - Types.StructType.of( - Types.NestedField.required(1, "key", Types.LongType.get()), - Types.NestedField.required(2, "keyData", Types.LongType.get()) - ), - Types.StructType.of( - Types.NestedField.required(3, "value", Types.LongType.get()) - ) - )) - ); - AssertHelpers.assertThrows("Should not allow to project a partial map value with non-primitive type.", - IllegalArgumentException.class, "Cannot project a partial map key or value", - () -> generateAndValidate(schema, partialMapValue) - ); + Schema partialMapValue = + new Schema( + Types.NestedField.optional( + 7, + "map", + Types.MapType.ofOptional( + 5, + 6, + Types.StructType.of( + Types.NestedField.required(1, "key", Types.LongType.get()), + Types.NestedField.required(2, "keyData", Types.LongType.get())), + Types.StructType.of( + Types.NestedField.required(3, "value", Types.LongType.get()))))); + AssertHelpers.assertThrows( + "Should not allow to project a partial map value with non-primitive type.", + IllegalArgumentException.class, + "Cannot project a partial map key or value", + () -> generateAndValidate(schema, partialMapValue)); } @Test public void testPrimitiveListTypeProjection() { - Schema schema = new Schema( - Types.NestedField.required(0, "id", Types.LongType.get()), - Types.NestedField.optional(2, "list", Types.ListType.ofOptional( - 1, Types.StringType.get() - )) - ); + Schema schema = + new Schema( + Types.NestedField.required(0, "id", Types.LongType.get()), + Types.NestedField.optional( + 2, "list", Types.ListType.ofOptional(1, Types.StringType.get()))); // Project id only. 
Schema idOnly = schema.select("id"); @@ -269,16 +272,18 @@ public void testPrimitiveListTypeProjection() { @Test public void testNestedListTypeProjection() { - Schema schema = new Schema( - Types.NestedField.required(0, "id", Types.LongType.get()), - Types.NestedField.optional(5, "list", Types.ListType.ofOptional( - 4, Types.StructType.of( - Types.NestedField.required(1, "nestedListField1", Types.LongType.get()), - Types.NestedField.required(2, "nestedListField2", Types.LongType.get()), - Types.NestedField.required(3, "nestedListField3", Types.LongType.get()) - ) - )) - ); + Schema schema = + new Schema( + Types.NestedField.required(0, "id", Types.LongType.get()), + Types.NestedField.optional( + 5, + "list", + Types.ListType.ofOptional( + 4, + Types.StructType.of( + Types.NestedField.required(1, "nestedListField1", Types.LongType.get()), + Types.NestedField.required(2, "nestedListField2", Types.LongType.get()), + Types.NestedField.required(3, "nestedListField3", Types.LongType.get()))))); // Project id only. Schema idOnly = schema.select("id"); @@ -292,17 +297,20 @@ public void testNestedListTypeProjection() { generateAndValidate(schema, schema); // Project partial list value. - Schema partialList = new Schema( - Types.NestedField.optional(5, "list", Types.ListType.ofOptional( - 4, Types.StructType.of( - Types.NestedField.required(2, "nestedListField2", Types.LongType.get()) - ) - )) - ); - AssertHelpers.assertThrows("Should not allow to project a partial list element with non-primitive type.", - IllegalArgumentException.class, "Cannot project a partial list element", - () -> generateAndValidate(schema, partialList) - ); + Schema partialList = + new Schema( + Types.NestedField.optional( + 5, + "list", + Types.ListType.ofOptional( + 4, + Types.StructType.of( + Types.NestedField.required(2, "nestedListField2", Types.LongType.get()))))); + AssertHelpers.assertThrows( + "Should not allow to project a partial list element with non-primitive type.", + IllegalArgumentException.class, + "Cannot project a partial list element", + () -> generateAndValidate(schema, partialList)); } private void generateAndValidate(Schema schema, Schema projectSchema) { diff --git a/flink/v1.14/flink/src/test/java/org/apache/iceberg/flink/data/TestRowProjection.java b/flink/v1.14/flink/src/test/java/org/apache/iceberg/flink/data/TestRowProjection.java index 9ccb1d56c0ed..df2e6ae21c7e 100644 --- a/flink/v1.14/flink/src/test/java/org/apache/iceberg/flink/data/TestRowProjection.java +++ b/flink/v1.14/flink/src/test/java/org/apache/iceberg/flink/data/TestRowProjection.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.flink.data; import java.io.File; @@ -45,34 +44,36 @@ public class TestRowProjection { - @Rule - public TemporaryFolder temp = new TemporaryFolder(); + @Rule public TemporaryFolder temp = new TemporaryFolder(); - private RowData writeAndRead(String desc, Schema writeSchema, Schema readSchema, RowData row) throws IOException { + private RowData writeAndRead(String desc, Schema writeSchema, Schema readSchema, RowData row) + throws IOException { File file = temp.newFile(desc + ".avro"); Assert.assertTrue(file.delete()); - try (FileAppender appender = Avro.write(Files.localOutput(file)) - .schema(writeSchema) - .createWriterFunc(ignore -> new FlinkAvroWriter(FlinkSchemaUtil.convert(writeSchema))) - .build()) { + try (FileAppender appender = + Avro.write(Files.localOutput(file)) + .schema(writeSchema) + .createWriterFunc(ignore -> new FlinkAvroWriter(FlinkSchemaUtil.convert(writeSchema))) + .build()) { appender.add(row); } - Iterable records = Avro.read(Files.localInput(file)) - .project(readSchema) - .createReaderFunc(FlinkAvroReader::new) - .build(); + Iterable records = + Avro.read(Files.localInput(file)) + .project(readSchema) + .createReaderFunc(FlinkAvroReader::new) + .build(); return Iterables.getOnlyElement(records); } @Test public void testFullProjection() throws Exception { - Schema schema = new Schema( - Types.NestedField.required(0, "id", Types.LongType.get()), - Types.NestedField.optional(1, "data", Types.StringType.get()) - ); + Schema schema = + new Schema( + Types.NestedField.required(0, "id", Types.LongType.get()), + Types.NestedField.optional(1, "data", Types.StringType.get())); RowData row = GenericRowData.of(34L, StringData.fromString("test")); @@ -80,93 +81,96 @@ public void testFullProjection() throws Exception { Assert.assertEquals("Should contain the correct id value", 34L, projected.getLong(0)); - int cmp = Comparators.charSequences() - .compare("test", projected.getString(1).toString()); + int cmp = Comparators.charSequences().compare("test", projected.getString(1).toString()); Assert.assertEquals("Should contain the correct data value", cmp, 0); } @Test public void testSpecialCharacterProjection() throws Exception { - Schema schema = new Schema( - Types.NestedField.required(0, "user id", Types.LongType.get()), - Types.NestedField.optional(1, "data%0", Types.StringType.get()) - ); + Schema schema = + new Schema( + Types.NestedField.required(0, "user id", Types.LongType.get()), + Types.NestedField.optional(1, "data%0", Types.StringType.get())); RowData row = GenericRowData.of(34L, StringData.fromString("test")); RowData full = writeAndRead("special_chars", schema, schema, row); Assert.assertEquals("Should contain the correct id value", 34L, full.getLong(0)); - Assert.assertEquals("Should contain the correct data value", + Assert.assertEquals( + "Should contain the correct data value", 0, Comparators.charSequences().compare("test", full.getString(1).toString())); RowData projected = writeAndRead("special_characters", schema, schema.select("data%0"), full); Assert.assertEquals("Should not contain id value", 1, projected.getArity()); - Assert.assertEquals("Should contain the correct data value", + Assert.assertEquals( + "Should contain the correct data value", 0, Comparators.charSequences().compare("test", projected.getString(0).toString())); } @Test public void testReorderedFullProjection() throws Exception { - Schema schema = new Schema( - Types.NestedField.required(0, "id", Types.LongType.get()), - Types.NestedField.optional(1, "data", 
Types.StringType.get()) - ); + Schema schema = + new Schema( + Types.NestedField.required(0, "id", Types.LongType.get()), + Types.NestedField.optional(1, "data", Types.StringType.get())); RowData row = GenericRowData.of(34L, StringData.fromString("test")); - Schema reordered = new Schema( - Types.NestedField.optional(1, "data", Types.StringType.get()), - Types.NestedField.required(0, "id", Types.LongType.get()) - ); + Schema reordered = + new Schema( + Types.NestedField.optional(1, "data", Types.StringType.get()), + Types.NestedField.required(0, "id", Types.LongType.get())); RowData projected = writeAndRead("full_projection", schema, reordered, row); - Assert.assertEquals("Should contain the correct 0 value", "test", projected.getString(0).toString()); + Assert.assertEquals( + "Should contain the correct 0 value", "test", projected.getString(0).toString()); Assert.assertEquals("Should contain the correct 1 value", 34L, projected.getLong(1)); } @Test public void testReorderedProjection() throws Exception { - Schema schema = new Schema( - Types.NestedField.required(0, "id", Types.LongType.get()), - Types.NestedField.optional(1, "data", Types.StringType.get()) - ); + Schema schema = + new Schema( + Types.NestedField.required(0, "id", Types.LongType.get()), + Types.NestedField.optional(1, "data", Types.StringType.get())); RowData row = GenericRowData.of(34L, StringData.fromString("test")); - Schema reordered = new Schema( - Types.NestedField.optional(2, "missing_1", Types.StringType.get()), - Types.NestedField.optional(1, "data", Types.StringType.get()), - Types.NestedField.optional(3, "missing_2", Types.LongType.get()) - ); + Schema reordered = + new Schema( + Types.NestedField.optional(2, "missing_1", Types.StringType.get()), + Types.NestedField.optional(1, "data", Types.StringType.get()), + Types.NestedField.optional(3, "missing_2", Types.LongType.get())); RowData projected = writeAndRead("full_projection", schema, reordered, row); Assert.assertTrue("Should contain the correct 0 value", projected.isNullAt(0)); - Assert.assertEquals("Should contain the correct 1 value", "test", projected.getString(1).toString()); + Assert.assertEquals( + "Should contain the correct 1 value", "test", projected.getString(1).toString()); Assert.assertTrue("Should contain the correct 2 value", projected.isNullAt(2)); } @Test public void testRenamedAddedField() throws Exception { - Schema schema = new Schema( - Types.NestedField.required(1, "a", Types.LongType.get()), - Types.NestedField.required(2, "b", Types.LongType.get()), - Types.NestedField.required(3, "d", Types.LongType.get()) - ); + Schema schema = + new Schema( + Types.NestedField.required(1, "a", Types.LongType.get()), + Types.NestedField.required(2, "b", Types.LongType.get()), + Types.NestedField.required(3, "d", Types.LongType.get())); RowData row = GenericRowData.of(100L, 200L, 300L); - Schema renamedAdded = new Schema( - Types.NestedField.optional(1, "a", Types.LongType.get()), - Types.NestedField.optional(2, "b", Types.LongType.get()), - Types.NestedField.optional(3, "c", Types.LongType.get()), - Types.NestedField.optional(4, "d", Types.LongType.get()) - ); + Schema renamedAdded = + new Schema( + Types.NestedField.optional(1, "a", Types.LongType.get()), + Types.NestedField.optional(2, "b", Types.LongType.get()), + Types.NestedField.optional(3, "c", Types.LongType.get()), + Types.NestedField.optional(4, "d", Types.LongType.get())); RowData projected = writeAndRead("rename_and_add_column_projection", schema, renamedAdded, row); 
Assert.assertEquals("Should contain the correct value in column 1", projected.getLong(0), 100L); @@ -177,10 +181,10 @@ public void testRenamedAddedField() throws Exception { @Test public void testEmptyProjection() throws Exception { - Schema schema = new Schema( - Types.NestedField.required(0, "id", Types.LongType.get()), - Types.NestedField.optional(1, "data", Types.StringType.get()) - ); + Schema schema = + new Schema( + Types.NestedField.required(0, "id", Types.LongType.get()), + Types.NestedField.optional(1, "data", Types.StringType.get())); RowData row = GenericRowData.of(34L, StringData.fromString("test")); @@ -192,24 +196,20 @@ public void testEmptyProjection() throws Exception { @Test public void testBasicProjection() throws Exception { - Schema writeSchema = new Schema( - Types.NestedField.required(0, "id", Types.LongType.get()), - Types.NestedField.optional(1, "data", Types.StringType.get()) - ); + Schema writeSchema = + new Schema( + Types.NestedField.required(0, "id", Types.LongType.get()), + Types.NestedField.optional(1, "data", Types.StringType.get())); RowData row = GenericRowData.of(34L, StringData.fromString("test")); - Schema idOnly = new Schema( - Types.NestedField.required(0, "id", Types.LongType.get()) - ); + Schema idOnly = new Schema(Types.NestedField.required(0, "id", Types.LongType.get())); RowData projected = writeAndRead("basic_projection_id", writeSchema, idOnly, row); Assert.assertEquals("Should not project data", 1, projected.getArity()); Assert.assertEquals("Should contain the correct id value", 34L, projected.getLong(0)); - Schema dataOnly = new Schema( - Types.NestedField.optional(1, "data", Types.StringType.get()) - ); + Schema dataOnly = new Schema(Types.NestedField.optional(1, "data", Types.StringType.get())); projected = writeAndRead("basic_projection_data", writeSchema, dataOnly, row); @@ -220,17 +220,17 @@ public void testBasicProjection() throws Exception { @Test public void testRename() throws Exception { - Schema writeSchema = new Schema( - Types.NestedField.required(0, "id", Types.LongType.get()), - Types.NestedField.optional(1, "data", Types.StringType.get()) - ); + Schema writeSchema = + new Schema( + Types.NestedField.required(0, "id", Types.LongType.get()), + Types.NestedField.optional(1, "data", Types.StringType.get())); RowData row = GenericRowData.of(34L, StringData.fromString("test")); - Schema readSchema = new Schema( - Types.NestedField.required(0, "id", Types.LongType.get()), - Types.NestedField.optional(1, "renamed", Types.StringType.get()) - ); + Schema readSchema = + new Schema( + Types.NestedField.required(0, "id", Types.LongType.get()), + Types.NestedField.optional(1, "renamed", Types.StringType.get())); RowData projected = writeAndRead("project_and_rename", writeSchema, readSchema, row); @@ -241,83 +241,87 @@ public void testRename() throws Exception { @Test public void testNestedStructProjection() throws Exception { - Schema writeSchema = new Schema( - Types.NestedField.required(0, "id", Types.LongType.get()), - Types.NestedField.optional(3, "location", Types.StructType.of( - Types.NestedField.required(1, "lat", Types.FloatType.get()), - Types.NestedField.required(2, "long", Types.FloatType.get()) - )) - ); + Schema writeSchema = + new Schema( + Types.NestedField.required(0, "id", Types.LongType.get()), + Types.NestedField.optional( + 3, + "location", + Types.StructType.of( + Types.NestedField.required(1, "lat", Types.FloatType.get()), + Types.NestedField.required(2, "long", Types.FloatType.get())))); RowData location = 
GenericRowData.of(52.995143f, -1.539054f); RowData record = GenericRowData.of(34L, location); - Schema idOnly = new Schema( - Types.NestedField.required(0, "id", Types.LongType.get()) - ); + Schema idOnly = new Schema(Types.NestedField.required(0, "id", Types.LongType.get())); RowData projected = writeAndRead("id_only", writeSchema, idOnly, record); Assert.assertEquals("Should not project location", 1, projected.getArity()); Assert.assertEquals("Should contain the correct id value", 34L, projected.getLong(0)); - Schema latOnly = new Schema( - Types.NestedField.optional(3, "location", Types.StructType.of( - Types.NestedField.required(1, "lat", Types.FloatType.get()) - )) - ); + Schema latOnly = + new Schema( + Types.NestedField.optional( + 3, + "location", + Types.StructType.of(Types.NestedField.required(1, "lat", Types.FloatType.get())))); projected = writeAndRead("latitude_only", writeSchema, latOnly, record); RowData projectedLocation = projected.getRow(0, 1); Assert.assertEquals("Should not project id", 1, projected.getArity()); Assert.assertFalse("Should project location", projected.isNullAt(0)); Assert.assertEquals("Should not project longitude", 1, projectedLocation.getArity()); - Assert.assertEquals("Should project latitude", - 52.995143f, projectedLocation.getFloat(0), 0.000001f); + Assert.assertEquals( + "Should project latitude", 52.995143f, projectedLocation.getFloat(0), 0.000001f); - Schema longOnly = new Schema( - Types.NestedField.optional(3, "location", Types.StructType.of( - Types.NestedField.required(2, "long", Types.FloatType.get()) - )) - ); + Schema longOnly = + new Schema( + Types.NestedField.optional( + 3, + "location", + Types.StructType.of(Types.NestedField.required(2, "long", Types.FloatType.get())))); projected = writeAndRead("longitude_only", writeSchema, longOnly, record); projectedLocation = projected.getRow(0, 1); Assert.assertEquals("Should not project id", 1, projected.getArity()); Assert.assertFalse("Should project location", projected.isNullAt(0)); Assert.assertEquals("Should not project latitutde", 1, projectedLocation.getArity()); - Assert.assertEquals("Should project longitude", - -1.539054f, projectedLocation.getFloat(0), 0.000001f); + Assert.assertEquals( + "Should project longitude", -1.539054f, projectedLocation.getFloat(0), 0.000001f); Schema locationOnly = writeSchema.select("location"); projected = writeAndRead("location_only", writeSchema, locationOnly, record); projectedLocation = projected.getRow(0, 1); Assert.assertEquals("Should not project id", 1, projected.getArity()); Assert.assertFalse("Should project location", projected.isNullAt(0)); - Assert.assertEquals("Should project latitude", - 52.995143f, projectedLocation.getFloat(0), 0.000001f); - Assert.assertEquals("Should project longitude", - -1.539054f, projectedLocation.getFloat(1), 0.000001f); + Assert.assertEquals( + "Should project latitude", 52.995143f, projectedLocation.getFloat(0), 0.000001f); + Assert.assertEquals( + "Should project longitude", -1.539054f, projectedLocation.getFloat(1), 0.000001f); } @Test public void testMapProjection() throws IOException { - Schema writeSchema = new Schema( - Types.NestedField.required(0, "id", Types.LongType.get()), - Types.NestedField.optional(5, "properties", - Types.MapType.ofOptional(6, 7, Types.StringType.get(), Types.StringType.get())) - ); - - GenericMapData properties = new GenericMapData(ImmutableMap.of( - StringData.fromString("a"), - StringData.fromString("A"), - StringData.fromString("b"), - StringData.fromString("B"))); + Schema 
writeSchema = + new Schema( + Types.NestedField.required(0, "id", Types.LongType.get()), + Types.NestedField.optional( + 5, + "properties", + Types.MapType.ofOptional(6, 7, Types.StringType.get(), Types.StringType.get()))); + + GenericMapData properties = + new GenericMapData( + ImmutableMap.of( + StringData.fromString("a"), + StringData.fromString("A"), + StringData.fromString("b"), + StringData.fromString("B"))); RowData row = GenericRowData.of(34L, properties); - Schema idOnly = new Schema( - Types.NestedField.required(0, "id", Types.LongType.get()) - ); + Schema idOnly = new Schema(Types.NestedField.required(0, "id", Types.LongType.get())); RowData projected = writeAndRead("id_only", writeSchema, idOnly, row); Assert.assertEquals("Should contain the correct id value", 34L, projected.getLong(0)); @@ -353,26 +357,28 @@ public void testMapProjection() throws IOException { @Test public void testMapOfStructsProjection() throws IOException { - Schema writeSchema = new Schema( - Types.NestedField.required(0, "id", Types.LongType.get()), - Types.NestedField.optional(5, "locations", Types.MapType.ofOptional(6, 7, - Types.StringType.get(), - Types.StructType.of( - Types.NestedField.required(1, "lat", Types.FloatType.get()), - Types.NestedField.required(2, "long", Types.FloatType.get()) - ) - )) - ); + Schema writeSchema = + new Schema( + Types.NestedField.required(0, "id", Types.LongType.get()), + Types.NestedField.optional( + 5, + "locations", + Types.MapType.ofOptional( + 6, + 7, + Types.StringType.get(), + Types.StructType.of( + Types.NestedField.required(1, "lat", Types.FloatType.get()), + Types.NestedField.required(2, "long", Types.FloatType.get()))))); RowData l1 = GenericRowData.of(53.992811f, -1.542616f); RowData l2 = GenericRowData.of(52.995143f, -1.539054f); - GenericMapData map = new GenericMapData(ImmutableMap.of( - StringData.fromString("L1"), l1, StringData.fromString("L2"), l2)); + GenericMapData map = + new GenericMapData( + ImmutableMap.of(StringData.fromString("L1"), l1, StringData.fromString("L2"), l2)); RowData row = GenericRowData.of(34L, map); - Schema idOnly = new Schema( - Types.NestedField.required(0, "id", Types.LongType.get()) - ); + Schema idOnly = new Schema(Types.NestedField.required(0, "id", Types.LongType.get())); RowData projected = writeAndRead("id_only", writeSchema, idOnly, row); Assert.assertEquals("Should contain the correct id value", 34L, projected.getLong(0)); @@ -386,21 +392,19 @@ public void testMapOfStructsProjection() throws IOException { GenericMapData locations = (GenericMapData) projected.getMap(0); Assert.assertNotNull("Should project locations map", locations); GenericArrayData l1l2Array = - new GenericArrayData(new Object[] {StringData.fromString("L2"), StringData.fromString("L1")}); + new GenericArrayData( + new Object[] {StringData.fromString("L2"), StringData.fromString("L1")}); Assert.assertEquals("Should contain L1 and L2", l1l2Array, locations.keyArray()); RowData projectedL1 = (RowData) locations.get(StringData.fromString("L1")); Assert.assertNotNull("L1 should not be null", projectedL1); - Assert.assertEquals("L1 should contain lat", - 53.992811f, projectedL1.getFloat(0), 0.000001); + Assert.assertEquals("L1 should contain lat", 53.992811f, projectedL1.getFloat(0), 0.000001); Assert.assertEquals("L1 should not contain long", 1, projectedL1.getArity()); RowData projectedL2 = (RowData) locations.get(StringData.fromString("L2")); Assert.assertNotNull("L2 should not be null", projectedL2); - Assert.assertEquals("L2 should contain lat", - 
52.995143f, projectedL2.getFloat(0), 0.000001); + Assert.assertEquals("L2 should contain lat", 52.995143f, projectedL2.getFloat(0), 0.000001); Assert.assertEquals("L2 should not contain long", 1, projectedL2.getArity()); - projected = writeAndRead("long_only", - writeSchema, writeSchema.select("locations.long"), row); + projected = writeAndRead("long_only", writeSchema, writeSchema.select("locations.long"), row); Assert.assertEquals("Should not project id", 1, projected.getArity()); locations = (GenericMapData) projected.getMap(0); Assert.assertNotNull("Should project locations map", locations); @@ -408,22 +412,23 @@ public void testMapOfStructsProjection() throws IOException { projectedL1 = (RowData) locations.get(StringData.fromString("L1")); Assert.assertNotNull("L1 should not be null", projectedL1); Assert.assertEquals("L1 should not contain lat", 1, projectedL1.getArity()); - Assert.assertEquals("L1 should contain long", - -1.542616f, projectedL1.getFloat(0), 0.000001); + Assert.assertEquals("L1 should contain long", -1.542616f, projectedL1.getFloat(0), 0.000001); projectedL2 = (RowData) locations.get(StringData.fromString("L2")); Assert.assertNotNull("L2 should not be null", projectedL2); Assert.assertEquals("L2 should not contain lat", 1, projectedL2.getArity()); - Assert.assertEquals("L2 should contain long", - -1.539054f, projectedL2.getFloat(0), 0.000001); - - Schema latitiudeRenamed = new Schema( - Types.NestedField.optional(5, "locations", Types.MapType.ofOptional(6, 7, - Types.StringType.get(), - Types.StructType.of( - Types.NestedField.required(1, "latitude", Types.FloatType.get()) - ) - )) - ); + Assert.assertEquals("L2 should contain long", -1.539054f, projectedL2.getFloat(0), 0.000001); + + Schema latitiudeRenamed = + new Schema( + Types.NestedField.optional( + 5, + "locations", + Types.MapType.ofOptional( + 6, + 7, + Types.StringType.get(), + Types.StructType.of( + Types.NestedField.required(1, "latitude", Types.FloatType.get()))))); projected = writeAndRead("latitude_renamed", writeSchema, latitiudeRenamed, row); Assert.assertEquals("Should not project id", 1, projected.getArity()); @@ -432,29 +437,27 @@ public void testMapOfStructsProjection() throws IOException { Assert.assertEquals("Should contain L1 and L2", l1l2Array, locations.keyArray()); projectedL1 = (RowData) locations.get(StringData.fromString("L1")); Assert.assertNotNull("L1 should not be null", projectedL1); - Assert.assertEquals("L1 should contain latitude", - 53.992811f, projectedL1.getFloat(0), 0.000001); + Assert.assertEquals( + "L1 should contain latitude", 53.992811f, projectedL1.getFloat(0), 0.000001); projectedL2 = (RowData) locations.get(StringData.fromString("L2")); Assert.assertNotNull("L2 should not be null", projectedL2); - Assert.assertEquals("L2 should contain latitude", - 52.995143f, projectedL2.getFloat(0), 0.000001); + Assert.assertEquals( + "L2 should contain latitude", 52.995143f, projectedL2.getFloat(0), 0.000001); } @Test public void testListProjection() throws IOException { - Schema writeSchema = new Schema( - Types.NestedField.required(0, "id", Types.LongType.get()), - Types.NestedField.optional(10, "values", - Types.ListType.ofOptional(11, Types.LongType.get())) - ); + Schema writeSchema = + new Schema( + Types.NestedField.required(0, "id", Types.LongType.get()), + Types.NestedField.optional( + 10, "values", Types.ListType.ofOptional(11, Types.LongType.get()))); GenericArrayData values = new GenericArrayData(new Long[] {56L, 57L, 58L}); RowData row = GenericRowData.of(34L, values); - 
Schema idOnly = new Schema( - Types.NestedField.required(0, "id", Types.LongType.get()) - ); + Schema idOnly = new Schema(Types.NestedField.required(0, "id", Types.LongType.get())); RowData projected = writeAndRead("id_only", writeSchema, idOnly, row); Assert.assertEquals("Should contain the correct id value", 34L, projected.getLong(0)); @@ -474,24 +477,24 @@ public void testListProjection() throws IOException { @Test @SuppressWarnings("unchecked") public void testListOfStructsProjection() throws IOException { - Schema writeSchema = new Schema( - Types.NestedField.required(0, "id", Types.LongType.get()), - Types.NestedField.optional(22, "points", - Types.ListType.ofOptional(21, Types.StructType.of( - Types.NestedField.required(19, "x", Types.IntegerType.get()), - Types.NestedField.optional(18, "y", Types.IntegerType.get()) - )) - ) - ); + Schema writeSchema = + new Schema( + Types.NestedField.required(0, "id", Types.LongType.get()), + Types.NestedField.optional( + 22, + "points", + Types.ListType.ofOptional( + 21, + Types.StructType.of( + Types.NestedField.required(19, "x", Types.IntegerType.get()), + Types.NestedField.optional(18, "y", Types.IntegerType.get()))))); RowData p1 = GenericRowData.of(1, 2); RowData p2 = GenericRowData.of(3, null); GenericArrayData arrayData = new GenericArrayData(new RowData[] {p1, p2}); RowData row = GenericRowData.of(34L, arrayData); - Schema idOnly = new Schema( - Types.NestedField.required(0, "id", Types.LongType.get()) - ); + Schema idOnly = new Schema(Types.NestedField.required(0, "id", Types.LongType.get())); RowData projected = writeAndRead("id_only", writeSchema, idOnly, row); Assert.assertEquals("Should contain the correct id value", 34L, projected.getLong(0)); @@ -525,13 +528,15 @@ public void testListOfStructsProjection() throws IOException { Assert.assertEquals("Should not project x", 1, projectedP2.getArity()); Assert.assertTrue("Should project null y", projectedP2.isNullAt(0)); - Schema yRenamed = new Schema( - Types.NestedField.optional(22, "points", - Types.ListType.ofOptional(21, Types.StructType.of( - Types.NestedField.optional(18, "z", Types.IntegerType.get()) - )) - ) - ); + Schema yRenamed = + new Schema( + Types.NestedField.optional( + 22, + "points", + Types.ListType.ofOptional( + 21, + Types.StructType.of( + Types.NestedField.optional(18, "z", Types.IntegerType.get()))))); projected = writeAndRead("y_renamed", writeSchema, yRenamed, row); Assert.assertEquals("Should not project id", 1, projected.getArity()); @@ -548,22 +553,25 @@ public void testListOfStructsProjection() throws IOException { @Test public void testAddedFieldsWithRequiredChildren() throws Exception { - Schema schema = new Schema( - Types.NestedField.required(1, "a", Types.LongType.get()) - ); + Schema schema = new Schema(Types.NestedField.required(1, "a", Types.LongType.get())); RowData row = GenericRowData.of(100L); - Schema addedFields = new Schema( - Types.NestedField.optional(1, "a", Types.LongType.get()), - Types.NestedField.optional(2, "b", Types.StructType.of( - Types.NestedField.required(3, "c", Types.LongType.get()) - )), - Types.NestedField.optional(4, "d", Types.ListType.ofRequired(5, Types.LongType.get())), - Types.NestedField.optional(6, "e", Types.MapType.ofRequired(7, 8, Types.LongType.get(), Types.LongType.get())) - ); - - RowData projected = writeAndRead("add_fields_with_required_children_projection", schema, addedFields, row); + Schema addedFields = + new Schema( + Types.NestedField.optional(1, "a", Types.LongType.get()), + Types.NestedField.optional( + 
2, + "b", + Types.StructType.of(Types.NestedField.required(3, "c", Types.LongType.get()))), + Types.NestedField.optional(4, "d", Types.ListType.ofRequired(5, Types.LongType.get())), + Types.NestedField.optional( + 6, + "e", + Types.MapType.ofRequired(7, 8, Types.LongType.get(), Types.LongType.get()))); + + RowData projected = + writeAndRead("add_fields_with_required_children_projection", schema, addedFields, row); Assert.assertEquals("Should contain the correct value in column 1", projected.getLong(0), 100L); Assert.assertTrue("Should contain empty value in new column 2", projected.isNullAt(1)); Assert.assertTrue("Should contain empty value in new column 4", projected.isNullAt(2)); diff --git a/flink/v1.14/flink/src/test/java/org/apache/iceberg/flink/sink/TestDeltaTaskWriter.java b/flink/v1.14/flink/src/test/java/org/apache/iceberg/flink/sink/TestDeltaTaskWriter.java index e30412ad83cc..a9800303aa4f 100644 --- a/flink/v1.14/flink/src/test/java/org/apache/iceberg/flink/sink/TestDeltaTaskWriter.java +++ b/flink/v1.14/flink/src/test/java/org/apache/iceberg/flink/sink/TestDeltaTaskWriter.java @@ -16,9 +16,14 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.flink.sink; +import static org.apache.iceberg.flink.SimpleDataUtil.createDelete; +import static org.apache.iceberg.flink.SimpleDataUtil.createInsert; +import static org.apache.iceberg.flink.SimpleDataUtil.createRecord; +import static org.apache.iceberg.flink.SimpleDataUtil.createUpdateAfter; +import static org.apache.iceberg.flink.SimpleDataUtil.createUpdateBefore; + import java.io.File; import java.io.IOException; import java.nio.file.Files; @@ -50,12 +55,6 @@ import org.junit.runner.RunWith; import org.junit.runners.Parameterized; -import static org.apache.iceberg.flink.SimpleDataUtil.createDelete; -import static org.apache.iceberg.flink.SimpleDataUtil.createInsert; -import static org.apache.iceberg.flink.SimpleDataUtil.createRecord; -import static org.apache.iceberg.flink.SimpleDataUtil.createUpdateAfter; -import static org.apache.iceberg.flink.SimpleDataUtil.createUpdateBefore; - @RunWith(Parameterized.class) public class TestDeltaTaskWriter extends TableTestBase { private static final int FORMAT_V2 = 2; @@ -64,11 +63,7 @@ public class TestDeltaTaskWriter extends TableTestBase { @Parameterized.Parameters(name = "FileFormat = {0}") public static Object[][] parameters() { - return new Object[][] { - {"avro"}, - {"orc"}, - {"parquet"} - }; + return new Object[][] {{"avro"}, {"orc"}, {"parquet"}}; } public TestDeltaTaskWriter(String fileFormat) { @@ -92,7 +87,8 @@ private void initTable(boolean partitioned) { this.table = create(SCHEMA, PartitionSpec.unpartitioned()); } - table.updateProperties() + table + .updateProperties() .set(TableProperties.PARQUET_ROW_GROUP_SIZE_BYTES, String.valueOf(8 * 1024)) .defaultFormat(format) .commit(); @@ -139,12 +135,14 @@ private void testCdcEvents(boolean partitioned) throws IOException { Assert.assertEquals(partitioned ? 3 : 1, result.deleteFiles().length); commitTransaction(result); - Assert.assertEquals("Should have expected records.", expectedRowSet( - createRecord(1, "eee"), - createRecord(2, "ddd"), - createRecord(4, "fff"), - createRecord(5, "ggg") - ), actualRowSet("*")); + Assert.assertEquals( + "Should have expected records.", + expectedRowSet( + createRecord(1, "eee"), + createRecord(2, "ddd"), + createRecord(4, "fff"), + createRecord(5, "ggg")), + actualRowSet("*")); // Start the 2nd transaction. 
writer = taskWriterFactory.create(); @@ -165,11 +163,10 @@ private void testCdcEvents(boolean partitioned) throws IOException { Assert.assertEquals(partitioned ? 3 : 1, result.deleteFiles().length); commitTransaction(result); - Assert.assertEquals("Should have expected records", expectedRowSet( - createRecord(1, "eee"), - createRecord(5, "iii"), - createRecord(6, "hhh") - ), actualRowSet("*")); + Assert.assertEquals( + "Should have expected records", + expectedRowSet(createRecord(1, "eee"), createRecord(5, "iii"), createRecord(6, "hhh")), + actualRowSet("*")); } @Test @@ -229,11 +226,15 @@ private void testAbort(boolean partitioned) throws IOException { } // Assert the current data/delete file count. - List files = Files.walk(Paths.get(tableDir.getPath(), "data")) - .filter(p -> p.toFile().isFile()) - .filter(p -> !p.toString().endsWith(".crc")) - .collect(Collectors.toList()); - Assert.assertEquals("Should have expected file count, but files are: " + files, partitioned ? 4 : 2, files.size()); + List files = + Files.walk(Paths.get(tableDir.getPath(), "data")) + .filter(p -> p.toFile().isFile()) + .filter(p -> !p.toString().endsWith(".crc")) + .collect(Collectors.toList()); + Assert.assertEquals( + "Should have expected file count, but files are: " + files, + partitioned ? 4 : 2, + files.size()); writer.abort(); for (Path file : files) { @@ -270,11 +271,10 @@ public void testPartitionedTableWithDataAsKey() throws IOException { Assert.assertEquals(1, result.deleteFiles().length); commitTransaction(result); - Assert.assertEquals("Should have expected records", expectedRowSet( - createRecord(2, "aaa"), - createRecord(3, "bbb"), - createRecord(4, "ccc") - ), actualRowSet("*")); + Assert.assertEquals( + "Should have expected records", + expectedRowSet(createRecord(2, "aaa"), createRecord(3, "bbb"), createRecord(4, "ccc")), + actualRowSet("*")); // Start the 2nd transaction. 
writer = taskWriterFactory.create(); @@ -287,12 +287,14 @@ public void testPartitionedTableWithDataAsKey() throws IOException { Assert.assertEquals(1, result.deleteFiles().length); commitTransaction(result); - Assert.assertEquals("Should have expected records", expectedRowSet( - createRecord(2, "aaa"), - createRecord(5, "aaa"), - createRecord(3, "bbb"), - createRecord(6, "bbb") - ), actualRowSet("*")); + Assert.assertEquals( + "Should have expected records", + expectedRowSet( + createRecord(2, "aaa"), + createRecord(5, "aaa"), + createRecord(3, "bbb"), + createRecord(6, "bbb")), + actualRowSet("*")); } @Test @@ -311,20 +313,21 @@ public void testPartitionedTableWithDataAndIdAsKey() throws IOException { WriteResult result = writer.complete(); Assert.assertEquals(1, result.dataFiles().length); Assert.assertEquals(1, result.deleteFiles().length); - Assert.assertEquals(Sets.newHashSet(FileContent.POSITION_DELETES), + Assert.assertEquals( + Sets.newHashSet(FileContent.POSITION_DELETES), Sets.newHashSet(result.deleteFiles()[0].content())); commitTransaction(result); - Assert.assertEquals("Should have expected records", expectedRowSet( - createRecord(1, "aaa") - ), actualRowSet("*")); + Assert.assertEquals( + "Should have expected records", expectedRowSet(createRecord(1, "aaa")), actualRowSet("*")); } private void commitTransaction(WriteResult result) { RowDelta rowDelta = table.newRowDelta(); Arrays.stream(result.dataFiles()).forEach(rowDelta::addRows); Arrays.stream(result.deleteFiles()).forEach(rowDelta::addDeletes); - rowDelta.validateDeletedFiles() + rowDelta + .validateDeletedFiles() .validateDataFilesExist(Lists.newArrayList(result.referencedDataFiles())) .commit(); } @@ -339,7 +342,11 @@ private StructLikeSet actualRowSet(String... columns) throws IOException { private TaskWriterFactory createTaskWriterFactory(List equalityFieldIds) { return new RowDataTaskWriterFactory( - SerializableTable.copyOf(table), FlinkSchemaUtil.convert(table.schema()), - 128 * 1024 * 1024, format, equalityFieldIds, false); + SerializableTable.copyOf(table), + FlinkSchemaUtil.convert(table.schema()), + 128 * 1024 * 1024, + format, + equalityFieldIds, + false); } } diff --git a/flink/v1.14/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkAppenderFactory.java b/flink/v1.14/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkAppenderFactory.java index 8d7fa86eac50..4c17cd7607df 100644 --- a/flink/v1.14/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkAppenderFactory.java +++ b/flink/v1.14/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkAppenderFactory.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.flink.sink; import java.util.List; @@ -41,11 +40,16 @@ public TestFlinkAppenderFactory(String fileFormat, boolean partitioned) { } @Override - protected FileAppenderFactory createAppenderFactory(List equalityFieldIds, - Schema eqDeleteSchema, - Schema posDeleteRowSchema) { - return new FlinkAppenderFactory(table.schema(), rowType, table.properties(), table.spec(), - ArrayUtil.toIntArray(equalityFieldIds), eqDeleteSchema, posDeleteRowSchema); + protected FileAppenderFactory createAppenderFactory( + List equalityFieldIds, Schema eqDeleteSchema, Schema posDeleteRowSchema) { + return new FlinkAppenderFactory( + table.schema(), + rowType, + table.properties(), + table.spec(), + ArrayUtil.toIntArray(equalityFieldIds), + eqDeleteSchema, + posDeleteRowSchema); } @Override diff --git a/flink/v1.14/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkFileWriterFactory.java b/flink/v1.14/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkFileWriterFactory.java index 3223b6e28b92..da45241256f5 100644 --- a/flink/v1.14/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkFileWriterFactory.java +++ b/flink/v1.14/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkFileWriterFactory.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.flink.sink; import java.util.List; @@ -39,9 +38,11 @@ public TestFlinkFileWriterFactory(FileFormat fileFormat, boolean partitioned) { } @Override - protected FileWriterFactory newWriterFactory(Schema dataSchema, List equalityFieldIds, - Schema equalityDeleteRowSchema, - Schema positionDeleteRowSchema) { + protected FileWriterFactory newWriterFactory( + Schema dataSchema, + List equalityFieldIds, + Schema equalityDeleteRowSchema, + Schema positionDeleteRowSchema) { return FlinkFileWriterFactory.builderFor(table) .dataSchema(table.schema()) .dataFileFormat(format()) diff --git a/flink/v1.14/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkIcebergSink.java b/flink/v1.14/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkIcebergSink.java index 2a80d496dacd..b7b9539721c4 100644 --- a/flink/v1.14/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkIcebergSink.java +++ b/flink/v1.14/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkIcebergSink.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.flink.sink; import java.io.File; @@ -64,13 +63,12 @@ public class TestFlinkIcebergSink { public static final MiniClusterWithClientResource MINI_CLUSTER_RESOURCE = MiniClusterResource.createWithClassloaderCheckDisabled(); - @ClassRule - public static final TemporaryFolder TEMPORARY_FOLDER = new TemporaryFolder(); + @ClassRule public static final TemporaryFolder TEMPORARY_FOLDER = new TemporaryFolder(); - private static final TypeInformation ROW_TYPE_INFO = new RowTypeInfo( - SimpleDataUtil.FLINK_SCHEMA.getFieldTypes()); - private static final DataFormatConverters.RowConverter CONVERTER = new DataFormatConverters.RowConverter( - SimpleDataUtil.FLINK_SCHEMA.getFieldDataTypes()); + private static final TypeInformation ROW_TYPE_INFO = + new RowTypeInfo(SimpleDataUtil.FLINK_SCHEMA.getFieldTypes()); + private static final DataFormatConverters.RowConverter CONVERTER = + new DataFormatConverters.RowConverter(SimpleDataUtil.FLINK_SCHEMA.getFieldDataTypes()); private String tablePath; private Table table; @@ -84,18 +82,18 @@ public class TestFlinkIcebergSink { @Parameterized.Parameters(name = "format={0}, parallelism = {1}, partitioned = {2}") public static Object[][] parameters() { return new Object[][] { - {"avro", 1, true}, - {"avro", 1, false}, - {"avro", 2, true}, - {"avro", 2, false}, - {"orc", 1, true}, - {"orc", 1, false}, - {"orc", 2, true}, - {"orc", 2, false}, - {"parquet", 1, true}, - {"parquet", 1, false}, - {"parquet", 2, true}, - {"parquet", 2, false} + {"avro", 1, true}, + {"avro", 1, false}, + {"avro", 2, true}, + {"avro", 2, false}, + {"orc", 1, true}, + {"orc", 1, false}, + {"orc", 2, true}, + {"orc", 2, false}, + {"parquet", 1, true}, + {"parquet", 1, false}, + {"parquet", 2, true}, + {"parquet", 2, false} }; } @@ -116,10 +114,12 @@ public void before() throws IOException { Map props = ImmutableMap.of(TableProperties.DEFAULT_FILE_FORMAT, format.name()); table = SimpleDataUtil.createTable(tablePath, props, partitioned); - env = StreamExecutionEnvironment.getExecutionEnvironment(MiniClusterResource.DISABLE_CLASSLOADER_CHECK_CONFIG) - .enableCheckpointing(100) - .setParallelism(parallelism) - .setMaxParallelism(parallelism); + env = + StreamExecutionEnvironment.getExecutionEnvironment( + MiniClusterResource.DISABLE_CLASSLOADER_CHECK_CONFIG) + .enableCheckpointing(100) + .setParallelism(parallelism) + .setMaxParallelism(parallelism); tableLoader = TableLoader.fromHadoopTable(tablePath); } @@ -134,13 +134,10 @@ private BoundedTestSource createBoundedSource(List rows) { @Test public void testWriteRowData() throws Exception { - List rows = Lists.newArrayList( - Row.of(1, "hello"), - Row.of(2, "world"), - Row.of(3, "foo") - ); - DataStream dataStream = env.addSource(createBoundedSource(rows), ROW_TYPE_INFO) - .map(CONVERTER::toInternal, FlinkCompatibilityUtil.toTypeInfo(SimpleDataUtil.ROW_TYPE)); + List rows = Lists.newArrayList(Row.of(1, "hello"), Row.of(2, "world"), Row.of(3, "foo")); + DataStream dataStream = + env.addSource(createBoundedSource(rows), ROW_TYPE_INFO) + .map(CONVERTER::toInternal, FlinkCompatibilityUtil.toTypeInfo(SimpleDataUtil.ROW_TYPE)); FlinkSink.forRowData(dataStream) .table(table) @@ -165,11 +162,11 @@ private List createRows(String prefix) { Row.of(2, prefix + "ccc"), Row.of(3, prefix + "aaa"), Row.of(3, prefix + "bbb"), - Row.of(3, prefix + "ccc") - ); + Row.of(3, prefix + "ccc")); } - private void testWriteRow(TableSchema tableSchema, DistributionMode distributionMode) throws Exception { + private void testWriteRow(TableSchema 
tableSchema, DistributionMode distributionMode) + throws Exception { List rows = createRows(""); DataStream dataStream = env.addSource(createBoundedSource(rows), ROW_TYPE_INFO); @@ -203,7 +200,8 @@ public void testWriteRowWithTableSchema() throws Exception { @Test public void testJobNoneDistributeMode() throws Exception { - table.updateProperties() + table + .updateProperties() .set(TableProperties.WRITE_DISTRIBUTION_MODE, DistributionMode.HASH.modeName()) .commit(); @@ -219,12 +217,15 @@ public void testJobNoneDistributeMode() throws Exception { @Test public void testJobHashDistributionMode() { - table.updateProperties() + table + .updateProperties() .set(TableProperties.WRITE_DISTRIBUTION_MODE, DistributionMode.HASH.modeName()) .commit(); - AssertHelpers.assertThrows("Does not support range distribution-mode now.", - IllegalArgumentException.class, "Flink does not support 'range' write distribution mode now.", + AssertHelpers.assertThrows( + "Does not support range distribution-mode now.", + IllegalArgumentException.class, + "Flink does not support 'range' write distribution mode now.", () -> { testWriteRow(null, DistributionMode.RANGE); return null; @@ -233,16 +234,20 @@ public void testJobHashDistributionMode() { @Test public void testJobNullDistributionMode() throws Exception { - table.updateProperties() + table + .updateProperties() .set(TableProperties.WRITE_DISTRIBUTION_MODE, DistributionMode.HASH.modeName()) .commit(); testWriteRow(null, null); if (partitioned) { - Assert.assertEquals("There should be only 1 data file in partition 'aaa'", 1, partitionFiles("aaa")); - Assert.assertEquals("There should be only 1 data file in partition 'bbb'", 1, partitionFiles("bbb")); - Assert.assertEquals("There should be only 1 data file in partition 'ccc'", 1, partitionFiles("ccc")); + Assert.assertEquals( + "There should be only 1 data file in partition 'aaa'", 1, partitionFiles("aaa")); + Assert.assertEquals( + "There should be only 1 data file in partition 'bbb'", 1, partitionFiles("bbb")); + Assert.assertEquals( + "There should be only 1 data file in partition 'ccc'", 1, partitionFiles("ccc")); } } @@ -250,9 +255,12 @@ public void testJobNullDistributionMode() throws Exception { public void testPartitionWriteMode() throws Exception { testWriteRow(null, DistributionMode.HASH); if (partitioned) { - Assert.assertEquals("There should be only 1 data file in partition 'aaa'", 1, partitionFiles("aaa")); - Assert.assertEquals("There should be only 1 data file in partition 'bbb'", 1, partitionFiles("bbb")); - Assert.assertEquals("There should be only 1 data file in partition 'ccc'", 1, partitionFiles("ccc")); + Assert.assertEquals( + "There should be only 1 data file in partition 'aaa'", 1, partitionFiles("aaa")); + Assert.assertEquals( + "There should be only 1 data file in partition 'bbb'", 1, partitionFiles("bbb")); + Assert.assertEquals( + "There should be only 1 data file in partition 'ccc'", 1, partitionFiles("ccc")); } } @@ -260,9 +268,12 @@ public void testPartitionWriteMode() throws Exception { public void testShuffleByPartitionWithSchema() throws Exception { testWriteRow(SimpleDataUtil.FLINK_SCHEMA, DistributionMode.HASH); if (partitioned) { - Assert.assertEquals("There should be only 1 data file in partition 'aaa'", 1, partitionFiles("aaa")); - Assert.assertEquals("There should be only 1 data file in partition 'bbb'", 1, partitionFiles("bbb")); - Assert.assertEquals("There should be only 1 data file in partition 'ccc'", 1, partitionFiles("ccc")); + Assert.assertEquals( + "There should be 
only 1 data file in partition 'aaa'", 1, partitionFiles("aaa")); + Assert.assertEquals( + "There should be only 1 data file in partition 'bbb'", 1, partitionFiles("bbb")); + Assert.assertEquals( + "There should be only 1 data file in partition 'ccc'", 1, partitionFiles("ccc")); } } @@ -280,17 +291,19 @@ public void testTwoSinksInDisjointedDAG() throws Exception { Table rightTable = SimpleDataUtil.createTable(rightTablePath, props, partitioned); TableLoader rightTableLoader = TableLoader.fromHadoopTable(rightTablePath); - env = StreamExecutionEnvironment - .getExecutionEnvironment(MiniClusterResource.DISABLE_CLASSLOADER_CHECK_CONFIG) - .enableCheckpointing(100) - .setParallelism(parallelism) - .setMaxParallelism(parallelism); + env = + StreamExecutionEnvironment.getExecutionEnvironment( + MiniClusterResource.DISABLE_CLASSLOADER_CHECK_CONFIG) + .enableCheckpointing(100) + .setParallelism(parallelism) + .setMaxParallelism(parallelism); env.getConfig().disableAutoGeneratedUIDs(); List leftRows = createRows("left-"); - DataStream leftStream = env.fromCollection(leftRows, ROW_TYPE_INFO) - .name("leftCustomSource") - .uid("leftCustomSource"); + DataStream leftStream = + env.fromCollection(leftRows, ROW_TYPE_INFO) + .name("leftCustomSource") + .uid("leftCustomSource"); FlinkSink.forRow(leftStream, SimpleDataUtil.FLINK_SCHEMA) .table(leftTable) .tableLoader(leftTableLoader) @@ -300,9 +313,10 @@ public void testTwoSinksInDisjointedDAG() throws Exception { .append(); List rightRows = createRows("right-"); - DataStream rightStream = env.fromCollection(rightRows, ROW_TYPE_INFO) - .name("rightCustomSource") - .uid("rightCustomSource"); + DataStream rightStream = + env.fromCollection(rightRows, ROW_TYPE_INFO) + .name("rightCustomSource") + .uid("rightCustomSource"); FlinkSink.forRow(rightStream, SimpleDataUtil.FLINK_SCHEMA) .table(rightTable) .tableLoader(rightTableLoader) @@ -324,7 +338,9 @@ public void testTwoSinksInDisjointedDAG() throws Exception { Assert.assertNull(leftTable.currentSnapshot().summary().get("flink.test")); Assert.assertNull(leftTable.currentSnapshot().summary().get("direction")); rightTable.refresh(); - Assert.assertEquals(TestFlinkIcebergSink.class.getName(), rightTable.currentSnapshot().summary().get("flink.test")); + Assert.assertEquals( + TestFlinkIcebergSink.class.getName(), + rightTable.currentSnapshot().summary().get("flink.test")); Assert.assertEquals("rightTable", rightTable.currentSnapshot().summary().get("direction")); } @@ -336,14 +352,17 @@ public void testOverrideWriteConfigWithUnknownDistributionMode() { List rows = createRows(""); DataStream dataStream = env.addSource(createBoundedSource(rows), ROW_TYPE_INFO); - FlinkSink.Builder builder = FlinkSink.forRow(dataStream, SimpleDataUtil.FLINK_SCHEMA) - .table(table) - .tableLoader(tableLoader) - .writeParallelism(parallelism) - .setAll(newProps); - - AssertHelpers.assertThrows("Should fail with invalid distribution mode.", - IllegalArgumentException.class, "No enum constant org.apache.iceberg.DistributionMode.UNRECOGNIZED", + FlinkSink.Builder builder = + FlinkSink.forRow(dataStream, SimpleDataUtil.FLINK_SCHEMA) + .table(table) + .tableLoader(tableLoader) + .writeParallelism(parallelism) + .setAll(newProps); + + AssertHelpers.assertThrows( + "Should fail with invalid distribution mode.", + IllegalArgumentException.class, + "No enum constant org.apache.iceberg.DistributionMode.UNRECOGNIZED", () -> { builder.append(); @@ -361,14 +380,17 @@ public void testOverrideWriteConfigWithUnknownFileFormat() { List rows = 
createRows(""); DataStream dataStream = env.addSource(createBoundedSource(rows), ROW_TYPE_INFO); - FlinkSink.Builder builder = FlinkSink.forRow(dataStream, SimpleDataUtil.FLINK_SCHEMA) - .table(table) - .tableLoader(tableLoader) - .writeParallelism(parallelism) - .setAll(newProps); - - AssertHelpers.assertThrows("Should fail with invalid file format.", - IllegalArgumentException.class, "No enum constant org.apache.iceberg.FileFormat.UNRECOGNIZED", + FlinkSink.Builder builder = + FlinkSink.forRow(dataStream, SimpleDataUtil.FLINK_SCHEMA) + .table(table) + .tableLoader(tableLoader) + .writeParallelism(parallelism) + .setAll(newProps); + + AssertHelpers.assertThrows( + "Should fail with invalid file format.", + IllegalArgumentException.class, + "No enum constant org.apache.iceberg.FileFormat.UNRECOGNIZED", () -> { builder.append(); diff --git a/flink/v1.14/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkIcebergSinkV2.java b/flink/v1.14/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkIcebergSinkV2.java index 97506b90ba46..cb840ada5ac5 100644 --- a/flink/v1.14/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkIcebergSinkV2.java +++ b/flink/v1.14/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkIcebergSinkV2.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.flink.sink; import java.io.File; @@ -67,18 +66,18 @@ public class TestFlinkIcebergSinkV2 extends TableTestBase { public static final MiniClusterWithClientResource MINI_CLUSTER_RESOURCE = MiniClusterResource.createWithClassloaderCheckDisabled(); - @ClassRule - public static final TemporaryFolder TEMPORARY_FOLDER = new TemporaryFolder(); + @ClassRule public static final TemporaryFolder TEMPORARY_FOLDER = new TemporaryFolder(); private static final int FORMAT_V2 = 2; private static final TypeInformation ROW_TYPE_INFO = new RowTypeInfo(SimpleDataUtil.FLINK_SCHEMA.getFieldTypes()); - private static final Map ROW_KIND_MAP = ImmutableMap.of( - "+I", RowKind.INSERT, - "-D", RowKind.DELETE, - "-U", RowKind.UPDATE_BEFORE, - "+U", RowKind.UPDATE_AFTER); + private static final Map ROW_KIND_MAP = + ImmutableMap.of( + "+I", RowKind.INSERT, + "-D", RowKind.DELETE, + "-U", RowKind.UPDATE_BEFORE, + "+U", RowKind.UPDATE_AFTER); private static final int ROW_ID_POS = 0; private static final int ROW_DATA_POS = 1; @@ -91,27 +90,27 @@ public class TestFlinkIcebergSinkV2 extends TableTestBase { private StreamExecutionEnvironment env; private TestTableLoader tableLoader; - @Parameterized.Parameters(name = "FileFormat = {0}, Parallelism = {1}, Partitioned={2}, WriteDistributionMode ={3}") + @Parameterized.Parameters( + name = "FileFormat = {0}, Parallelism = {1}, Partitioned={2}, WriteDistributionMode ={3}") public static Object[][] parameters() { return new Object[][] { - new Object[] {"avro", 1, true, TableProperties.WRITE_DISTRIBUTION_MODE_NONE}, - new Object[] {"avro", 1, false, TableProperties.WRITE_DISTRIBUTION_MODE_NONE}, - new Object[] {"avro", 4, true, TableProperties.WRITE_DISTRIBUTION_MODE_NONE}, - new Object[] {"avro", 4, false, TableProperties.WRITE_DISTRIBUTION_MODE_NONE}, - - new Object[] {"orc", 1, true, TableProperties.WRITE_DISTRIBUTION_MODE_HASH}, - new Object[] {"orc", 1, false, TableProperties.WRITE_DISTRIBUTION_MODE_HASH}, - new Object[] {"orc", 4, true, TableProperties.WRITE_DISTRIBUTION_MODE_HASH}, - new Object[] {"orc", 4, false, TableProperties.WRITE_DISTRIBUTION_MODE_HASH}, - - new Object[] {"parquet", 1, true, 
TableProperties.WRITE_DISTRIBUTION_MODE_RANGE}, - new Object[] {"parquet", 1, false, TableProperties.WRITE_DISTRIBUTION_MODE_RANGE}, - new Object[] {"parquet", 4, true, TableProperties.WRITE_DISTRIBUTION_MODE_RANGE}, - new Object[] {"parquet", 4, false, TableProperties.WRITE_DISTRIBUTION_MODE_RANGE} + new Object[] {"avro", 1, true, TableProperties.WRITE_DISTRIBUTION_MODE_NONE}, + new Object[] {"avro", 1, false, TableProperties.WRITE_DISTRIBUTION_MODE_NONE}, + new Object[] {"avro", 4, true, TableProperties.WRITE_DISTRIBUTION_MODE_NONE}, + new Object[] {"avro", 4, false, TableProperties.WRITE_DISTRIBUTION_MODE_NONE}, + new Object[] {"orc", 1, true, TableProperties.WRITE_DISTRIBUTION_MODE_HASH}, + new Object[] {"orc", 1, false, TableProperties.WRITE_DISTRIBUTION_MODE_HASH}, + new Object[] {"orc", 4, true, TableProperties.WRITE_DISTRIBUTION_MODE_HASH}, + new Object[] {"orc", 4, false, TableProperties.WRITE_DISTRIBUTION_MODE_HASH}, + new Object[] {"parquet", 1, true, TableProperties.WRITE_DISTRIBUTION_MODE_RANGE}, + new Object[] {"parquet", 1, false, TableProperties.WRITE_DISTRIBUTION_MODE_RANGE}, + new Object[] {"parquet", 4, true, TableProperties.WRITE_DISTRIBUTION_MODE_RANGE}, + new Object[] {"parquet", 4, false, TableProperties.WRITE_DISTRIBUTION_MODE_RANGE} }; } - public TestFlinkIcebergSinkV2(String format, int parallelism, boolean partitioned, String writeDistributionMode) { + public TestFlinkIcebergSinkV2( + String format, int parallelism, boolean partitioned, String writeDistributionMode) { super(FORMAT_V2); this.format = FileFormat.valueOf(format.toUpperCase(Locale.ENGLISH)); this.parallelism = parallelism; @@ -128,18 +127,24 @@ public void setupTable() throws IOException { if (!partitioned) { table = create(SimpleDataUtil.SCHEMA, PartitionSpec.unpartitioned()); } else { - table = create(SimpleDataUtil.SCHEMA, PartitionSpec.builderFor(SimpleDataUtil.SCHEMA).identity("data").build()); + table = + create( + SimpleDataUtil.SCHEMA, + PartitionSpec.builderFor(SimpleDataUtil.SCHEMA).identity("data").build()); } - table.updateProperties() + table + .updateProperties() .set(TableProperties.DEFAULT_FILE_FORMAT, format.name()) .set(TableProperties.WRITE_DISTRIBUTION_MODE, writeDistributionMode) .commit(); - env = StreamExecutionEnvironment.getExecutionEnvironment(MiniClusterResource.DISABLE_CLASSLOADER_CHECK_CONFIG) - .enableCheckpointing(100L) - .setParallelism(parallelism) - .setMaxParallelism(parallelism); + env = + StreamExecutionEnvironment.getExecutionEnvironment( + MiniClusterResource.DISABLE_CLASSLOADER_CHECK_CONFIG) + .enableCheckpointing(100L) + .setParallelism(parallelism) + .setMaxParallelism(parallelism); tableLoader = new TestTableLoader(tableDir.getAbsolutePath()); } @@ -147,19 +152,23 @@ public void setupTable() throws IOException { private List findValidSnapshots(Table table) { List validSnapshots = Lists.newArrayList(); for (Snapshot snapshot : table.snapshots()) { - if (snapshot.allManifests(table.io()).stream().anyMatch(m -> snapshot.snapshotId() == m.snapshotId())) { + if (snapshot.allManifests(table.io()).stream() + .anyMatch(m -> snapshot.snapshotId() == m.snapshotId())) { validSnapshots.add(snapshot); } } return validSnapshots; } - private void testChangeLogs(List equalityFieldColumns, - KeySelector keySelector, - boolean insertAsUpsert, - List> elementsPerCheckpoint, - List> expectedRecordsPerCheckpoint) throws Exception { - DataStream dataStream = env.addSource(new BoundedTestSource<>(elementsPerCheckpoint), ROW_TYPE_INFO); + private void testChangeLogs( + List 
equalityFieldColumns, + KeySelector keySelector, + boolean insertAsUpsert, + List> elementsPerCheckpoint, + List> expectedRecordsPerCheckpoint) + throws Exception { + DataStream dataStream = + env.addSource(new BoundedTestSource<>(elementsPerCheckpoint), ROW_TYPE_INFO); FlinkSink.forRow(dataStream, SimpleDataUtil.FLINK_SCHEMA) .tableLoader(tableLoader) @@ -175,13 +184,16 @@ private void testChangeLogs(List equalityFieldColumns, table.refresh(); List snapshots = findValidSnapshots(table); int expectedSnapshotNum = expectedRecordsPerCheckpoint.size(); - Assert.assertEquals("Should have the expected snapshot number", expectedSnapshotNum, snapshots.size()); + Assert.assertEquals( + "Should have the expected snapshot number", expectedSnapshotNum, snapshots.size()); for (int i = 0; i < expectedSnapshotNum; i++) { long snapshotId = snapshots.get(i).snapshotId(); List expectedRecords = expectedRecordsPerCheckpoint.get(i); - Assert.assertEquals("Should have the expected records for the checkpoint#" + i, - expectedRowSet(expectedRecords.toArray(new Record[0])), actualRowSet(snapshotId, "*")); + Assert.assertEquals( + "Should have the expected records for the checkpoint#" + i, + expectedRowSet(expectedRecords.toArray(new Record[0])), + actualRowSet(snapshotId, "*")); } } @@ -200,232 +212,227 @@ private Record record(int id, String data) { @Test public void testCheckAndGetEqualityFieldIds() { - table.updateSchema() + table + .updateSchema() .allowIncompatibleChanges() .addRequiredColumn("type", Types.StringType.get()) .setIdentifierFields("type") .commit(); - DataStream dataStream = env.addSource(new BoundedTestSource<>(ImmutableList.of()), ROW_TYPE_INFO); - FlinkSink.Builder builder = FlinkSink.forRow(dataStream, SimpleDataUtil.FLINK_SCHEMA).table(table); + DataStream dataStream = + env.addSource(new BoundedTestSource<>(ImmutableList.of()), ROW_TYPE_INFO); + FlinkSink.Builder builder = + FlinkSink.forRow(dataStream, SimpleDataUtil.FLINK_SCHEMA).table(table); // Use schema identifier field IDs as equality field id list by default Assert.assertEquals( table.schema().identifierFieldIds(), - Sets.newHashSet(builder.checkAndGetEqualityFieldIds()) - ); + Sets.newHashSet(builder.checkAndGetEqualityFieldIds())); // Use user-provided equality field column as equality field id list builder.equalityFieldColumns(Lists.newArrayList("id")); Assert.assertEquals( Sets.newHashSet(table.schema().findField("id").fieldId()), - Sets.newHashSet(builder.checkAndGetEqualityFieldIds()) - ); + Sets.newHashSet(builder.checkAndGetEqualityFieldIds())); builder.equalityFieldColumns(Lists.newArrayList("type")); Assert.assertEquals( Sets.newHashSet(table.schema().findField("type").fieldId()), - Sets.newHashSet(builder.checkAndGetEqualityFieldIds()) - ); + Sets.newHashSet(builder.checkAndGetEqualityFieldIds())); } @Test public void testChangeLogOnIdKey() throws Exception { - List> elementsPerCheckpoint = ImmutableList.of( + List> elementsPerCheckpoint = ImmutableList.of( - row("+I", 1, "aaa"), - row("-D", 1, "aaa"), - row("+I", 1, "bbb"), - row("+I", 2, "aaa"), - row("-D", 2, "aaa"), - row("+I", 2, "bbb") - ), + ImmutableList.of( + row("+I", 1, "aaa"), + row("-D", 1, "aaa"), + row("+I", 1, "bbb"), + row("+I", 2, "aaa"), + row("-D", 2, "aaa"), + row("+I", 2, "bbb")), + ImmutableList.of( + row("-U", 2, "bbb"), row("+U", 2, "ccc"), row("-D", 2, "ccc"), row("+I", 2, "ddd")), + ImmutableList.of( + row("-D", 1, "bbb"), + row("+I", 1, "ccc"), + row("-D", 1, "ccc"), + row("+I", 1, "ddd"))); + + List> expectedRecords = ImmutableList.of( - 
row("-U", 2, "bbb"), - row("+U", 2, "ccc"), - row("-D", 2, "ccc"), - row("+I", 2, "ddd") - ), - ImmutableList.of( - row("-D", 1, "bbb"), - row("+I", 1, "ccc"), - row("-D", 1, "ccc"), - row("+I", 1, "ddd") - ) - ); - - List> expectedRecords = ImmutableList.of( - ImmutableList.of(record(1, "bbb"), record(2, "bbb")), - ImmutableList.of(record(1, "bbb"), record(2, "ddd")), - ImmutableList.of(record(1, "ddd"), record(2, "ddd")) - ); + ImmutableList.of(record(1, "bbb"), record(2, "bbb")), + ImmutableList.of(record(1, "bbb"), record(2, "ddd")), + ImmutableList.of(record(1, "ddd"), record(2, "ddd"))); if (partitioned && writeDistributionMode.equals(TableProperties.WRITE_DISTRIBUTION_MODE_HASH)) { - AssertHelpers.assertThrows("Should be error because equality field columns don't include all partition keys", - IllegalStateException.class, "should be included in equality fields", + AssertHelpers.assertThrows( + "Should be error because equality field columns don't include all partition keys", + IllegalStateException.class, + "should be included in equality fields", () -> { - testChangeLogs(ImmutableList.of("id"), row -> row.getField(ROW_ID_POS), false, - elementsPerCheckpoint, expectedRecords); + testChangeLogs( + ImmutableList.of("id"), + row -> row.getField(ROW_ID_POS), + false, + elementsPerCheckpoint, + expectedRecords); return null; }); } else { - testChangeLogs(ImmutableList.of("id"), row -> row.getField(ROW_ID_POS), false, - elementsPerCheckpoint, expectedRecords); + testChangeLogs( + ImmutableList.of("id"), + row -> row.getField(ROW_ID_POS), + false, + elementsPerCheckpoint, + expectedRecords); } } @Test public void testChangeLogOnDataKey() throws Exception { - List> elementsPerCheckpoint = ImmutableList.of( - ImmutableList.of( - row("+I", 1, "aaa"), - row("-D", 1, "aaa"), - row("+I", 2, "bbb"), - row("+I", 1, "bbb"), - row("+I", 2, "aaa") - ), + List> elementsPerCheckpoint = ImmutableList.of( - row("-U", 2, "aaa"), - row("+U", 1, "ccc"), - row("+I", 1, "aaa") - ), + ImmutableList.of( + row("+I", 1, "aaa"), + row("-D", 1, "aaa"), + row("+I", 2, "bbb"), + row("+I", 1, "bbb"), + row("+I", 2, "aaa")), + ImmutableList.of(row("-U", 2, "aaa"), row("+U", 1, "ccc"), row("+I", 1, "aaa")), + ImmutableList.of(row("-D", 1, "bbb"), row("+I", 2, "aaa"), row("+I", 2, "ccc"))); + + List> expectedRecords = ImmutableList.of( - row("-D", 1, "bbb"), - row("+I", 2, "aaa"), - row("+I", 2, "ccc") - ) - ); - - List> expectedRecords = ImmutableList.of( - ImmutableList.of(record(1, "bbb"), record(2, "aaa")), - ImmutableList.of(record(1, "aaa"), record(1, "bbb"), record(1, "ccc")), - ImmutableList.of(record(1, "aaa"), record(1, "ccc"), record(2, "aaa"), record(2, "ccc")) - ); - - testChangeLogs(ImmutableList.of("data"), row -> row.getField(ROW_DATA_POS), false, - elementsPerCheckpoint, expectedRecords); + ImmutableList.of(record(1, "bbb"), record(2, "aaa")), + ImmutableList.of(record(1, "aaa"), record(1, "bbb"), record(1, "ccc")), + ImmutableList.of( + record(1, "aaa"), record(1, "ccc"), record(2, "aaa"), record(2, "ccc"))); + + testChangeLogs( + ImmutableList.of("data"), + row -> row.getField(ROW_DATA_POS), + false, + elementsPerCheckpoint, + expectedRecords); } @Test public void testChangeLogOnIdDataKey() throws Exception { - List> elementsPerCheckpoint = ImmutableList.of( + List> elementsPerCheckpoint = ImmutableList.of( - row("+I", 1, "aaa"), - row("-D", 1, "aaa"), - row("+I", 2, "bbb"), - row("+I", 1, "bbb"), - row("+I", 2, "aaa") - ), + ImmutableList.of( + row("+I", 1, "aaa"), + row("-D", 1, "aaa"), + row("+I", 
2, "bbb"), + row("+I", 1, "bbb"), + row("+I", 2, "aaa")), + ImmutableList.of(row("-U", 2, "aaa"), row("+U", 1, "ccc"), row("+I", 1, "aaa")), + ImmutableList.of(row("-D", 1, "bbb"), row("+I", 2, "aaa"))); + + List> expectedRecords = ImmutableList.of( - row("-U", 2, "aaa"), - row("+U", 1, "ccc"), - row("+I", 1, "aaa") - ), - ImmutableList.of( - row("-D", 1, "bbb"), - row("+I", 2, "aaa") - ) - ); - - List> expectedRecords = ImmutableList.of( - ImmutableList.of(record(1, "bbb"), record(2, "aaa"), record(2, "bbb")), - ImmutableList.of(record(1, "aaa"), record(1, "bbb"), record(1, "ccc"), record(2, "bbb")), - ImmutableList.of(record(1, "aaa"), record(1, "ccc"), record(2, "aaa"), record(2, "bbb")) - ); - - testChangeLogs(ImmutableList.of("data", "id"), row -> Row.of(row.getField(ROW_ID_POS), row.getField(ROW_DATA_POS)), - false, elementsPerCheckpoint, expectedRecords); + ImmutableList.of(record(1, "bbb"), record(2, "aaa"), record(2, "bbb")), + ImmutableList.of( + record(1, "aaa"), record(1, "bbb"), record(1, "ccc"), record(2, "bbb")), + ImmutableList.of( + record(1, "aaa"), record(1, "ccc"), record(2, "aaa"), record(2, "bbb"))); + + testChangeLogs( + ImmutableList.of("data", "id"), + row -> Row.of(row.getField(ROW_ID_POS), row.getField(ROW_DATA_POS)), + false, + elementsPerCheckpoint, + expectedRecords); } @Test public void testChangeLogOnSameKey() throws Exception { - List> elementsPerCheckpoint = ImmutableList.of( - // Checkpoint #1 - ImmutableList.of( - row("+I", 1, "aaa"), - row("-D", 1, "aaa"), - row("+I", 1, "aaa") - ), - // Checkpoint #2 - ImmutableList.of( - row("-U", 1, "aaa"), - row("+U", 1, "aaa") - ), - // Checkpoint #3 + List> elementsPerCheckpoint = ImmutableList.of( - row("-D", 1, "aaa"), - row("+I", 1, "aaa") - ), - // Checkpoint #4 + // Checkpoint #1 + ImmutableList.of(row("+I", 1, "aaa"), row("-D", 1, "aaa"), row("+I", 1, "aaa")), + // Checkpoint #2 + ImmutableList.of(row("-U", 1, "aaa"), row("+U", 1, "aaa")), + // Checkpoint #3 + ImmutableList.of(row("-D", 1, "aaa"), row("+I", 1, "aaa")), + // Checkpoint #4 + ImmutableList.of(row("-U", 1, "aaa"), row("+U", 1, "aaa"), row("+I", 1, "aaa"))); + + List> expectedRecords = ImmutableList.of( - row("-U", 1, "aaa"), - row("+U", 1, "aaa"), - row("+I", 1, "aaa") - ) - ); - - List> expectedRecords = ImmutableList.of( - ImmutableList.of(record(1, "aaa")), - ImmutableList.of(record(1, "aaa")), - ImmutableList.of(record(1, "aaa")), - ImmutableList.of(record(1, "aaa"), record(1, "aaa")) - ); - - testChangeLogs(ImmutableList.of("id", "data"), row -> Row.of(row.getField(ROW_ID_POS), row.getField(ROW_DATA_POS)), - false, elementsPerCheckpoint, expectedRecords); + ImmutableList.of(record(1, "aaa")), + ImmutableList.of(record(1, "aaa")), + ImmutableList.of(record(1, "aaa")), + ImmutableList.of(record(1, "aaa"), record(1, "aaa"))); + + testChangeLogs( + ImmutableList.of("id", "data"), + row -> Row.of(row.getField(ROW_ID_POS), row.getField(ROW_DATA_POS)), + false, + elementsPerCheckpoint, + expectedRecords); } @Test public void testUpsertModeCheck() throws Exception { - DataStream dataStream = env.addSource(new BoundedTestSource<>(ImmutableList.of()), ROW_TYPE_INFO); - FlinkSink.Builder builder = FlinkSink.forRow(dataStream, SimpleDataUtil.FLINK_SCHEMA) - .tableLoader(tableLoader) - .tableSchema(SimpleDataUtil.FLINK_SCHEMA) - .writeParallelism(parallelism) - .upsert(true); - - AssertHelpers.assertThrows("Should be error because upsert mode and overwrite mode enable at the same time.", - IllegalStateException.class, "OVERWRITE mode shouldn't be 
enable", - () -> builder.equalityFieldColumns(ImmutableList.of("id", "data")).overwrite(true).append() - ); - - AssertHelpers.assertThrows("Should be error because equality field columns are empty.", - IllegalStateException.class, "Equality field columns shouldn't be empty", - () -> builder.equalityFieldColumns(ImmutableList.of()).overwrite(false).append() - ); + DataStream dataStream = + env.addSource(new BoundedTestSource<>(ImmutableList.of()), ROW_TYPE_INFO); + FlinkSink.Builder builder = + FlinkSink.forRow(dataStream, SimpleDataUtil.FLINK_SCHEMA) + .tableLoader(tableLoader) + .tableSchema(SimpleDataUtil.FLINK_SCHEMA) + .writeParallelism(parallelism) + .upsert(true); + + AssertHelpers.assertThrows( + "Should be error because upsert mode and overwrite mode enable at the same time.", + IllegalStateException.class, + "OVERWRITE mode shouldn't be enable", + () -> + builder.equalityFieldColumns(ImmutableList.of("id", "data")).overwrite(true).append()); + + AssertHelpers.assertThrows( + "Should be error because equality field columns are empty.", + IllegalStateException.class, + "Equality field columns shouldn't be empty", + () -> builder.equalityFieldColumns(ImmutableList.of()).overwrite(false).append()); } @Test public void testUpsertOnIdKey() throws Exception { - List> elementsPerCheckpoint = ImmutableList.of( - ImmutableList.of( - row("+I", 1, "aaa"), - row("+U", 1, "bbb") - ), - ImmutableList.of( - row("+I", 1, "ccc") - ), + List> elementsPerCheckpoint = ImmutableList.of( - row("+U", 1, "ddd"), - row("+I", 1, "eee") - ) - ); + ImmutableList.of(row("+I", 1, "aaa"), row("+U", 1, "bbb")), + ImmutableList.of(row("+I", 1, "ccc")), + ImmutableList.of(row("+U", 1, "ddd"), row("+I", 1, "eee"))); - List> expectedRecords = ImmutableList.of( - ImmutableList.of(record(1, "bbb")), - ImmutableList.of(record(1, "ccc")), - ImmutableList.of(record(1, "eee")) - ); + List> expectedRecords = + ImmutableList.of( + ImmutableList.of(record(1, "bbb")), + ImmutableList.of(record(1, "ccc")), + ImmutableList.of(record(1, "eee"))); if (!partitioned) { - testChangeLogs(ImmutableList.of("id"), row -> row.getField(ROW_ID_POS), true, - elementsPerCheckpoint, expectedRecords); + testChangeLogs( + ImmutableList.of("id"), + row -> row.getField(ROW_ID_POS), + true, + elementsPerCheckpoint, + expectedRecords); } else { - AssertHelpers.assertThrows("Should be error because equality field columns don't include all partition keys", - IllegalStateException.class, "should be included in equality fields", + AssertHelpers.assertThrows( + "Should be error because equality field columns don't include all partition keys", + IllegalStateException.class, + "should be included in equality fields", () -> { - testChangeLogs(ImmutableList.of("id"), row -> row.getField(ROW_ID_POS), true, - elementsPerCheckpoint, expectedRecords); + testChangeLogs( + ImmutableList.of("id"), + row -> row.getField(ROW_ID_POS), + true, + elementsPerCheckpoint, + expectedRecords); return null; }); } @@ -433,61 +440,46 @@ public void testUpsertOnIdKey() throws Exception { @Test public void testUpsertOnDataKey() throws Exception { - List> elementsPerCheckpoint = ImmutableList.of( - ImmutableList.of( - row("+I", 1, "aaa"), - row("+I", 2, "aaa"), - row("+I", 3, "bbb") - ), + List> elementsPerCheckpoint = ImmutableList.of( - row("+U", 4, "aaa"), - row("-U", 3, "bbb"), - row("+U", 5, "bbb") - ), + ImmutableList.of(row("+I", 1, "aaa"), row("+I", 2, "aaa"), row("+I", 3, "bbb")), + ImmutableList.of(row("+U", 4, "aaa"), row("-U", 3, "bbb"), row("+U", 5, "bbb")), + 
ImmutableList.of(row("+I", 6, "aaa"), row("+U", 7, "bbb"))); + + List> expectedRecords = ImmutableList.of( - row("+I", 6, "aaa"), - row("+U", 7, "bbb") - ) - ); - - List> expectedRecords = ImmutableList.of( - ImmutableList.of(record(2, "aaa"), record(3, "bbb")), - ImmutableList.of(record(4, "aaa"), record(5, "bbb")), - ImmutableList.of(record(6, "aaa"), record(7, "bbb")) - ); - - testChangeLogs(ImmutableList.of("data"), row -> row.getField(ROW_DATA_POS), true, - elementsPerCheckpoint, expectedRecords); + ImmutableList.of(record(2, "aaa"), record(3, "bbb")), + ImmutableList.of(record(4, "aaa"), record(5, "bbb")), + ImmutableList.of(record(6, "aaa"), record(7, "bbb"))); + + testChangeLogs( + ImmutableList.of("data"), + row -> row.getField(ROW_DATA_POS), + true, + elementsPerCheckpoint, + expectedRecords); } @Test public void testUpsertOnIdDataKey() throws Exception { - List> elementsPerCheckpoint = ImmutableList.of( + List> elementsPerCheckpoint = ImmutableList.of( - row("+I", 1, "aaa"), - row("+U", 1, "aaa"), - row("+I", 2, "bbb") - ), - ImmutableList.of( - row("+I", 1, "aaa"), - row("-D", 2, "bbb"), - row("+I", 2, "ccc") - ), + ImmutableList.of(row("+I", 1, "aaa"), row("+U", 1, "aaa"), row("+I", 2, "bbb")), + ImmutableList.of(row("+I", 1, "aaa"), row("-D", 2, "bbb"), row("+I", 2, "ccc")), + ImmutableList.of(row("+U", 1, "bbb"), row("-U", 1, "ccc"), row("-D", 1, "aaa"))); + + List> expectedRecords = ImmutableList.of( - row("+U", 1, "bbb"), - row("-U", 1, "ccc"), - row("-D", 1, "aaa") - ) - ); - - List> expectedRecords = ImmutableList.of( - ImmutableList.of(record(1, "aaa"), record(2, "bbb")), - ImmutableList.of(record(1, "aaa"), record(2, "ccc")), - ImmutableList.of(record(1, "bbb"), record(2, "ccc")) - ); - - testChangeLogs(ImmutableList.of("id", "data"), row -> Row.of(row.getField(ROW_ID_POS), row.getField(ROW_DATA_POS)), - true, elementsPerCheckpoint, expectedRecords); + ImmutableList.of(record(1, "aaa"), record(2, "bbb")), + ImmutableList.of(record(1, "aaa"), record(2, "ccc")), + ImmutableList.of(record(1, "bbb"), record(2, "ccc"))); + + testChangeLogs( + ImmutableList.of("id", "data"), + row -> Row.of(row.getField(ROW_ID_POS), row.getField(ROW_DATA_POS)), + true, + elementsPerCheckpoint, + expectedRecords); } private StructLikeSet expectedRowSet(Record... records) { @@ -497,10 +489,8 @@ private StructLikeSet expectedRowSet(Record... records) { private StructLikeSet actualRowSet(long snapshotId, String... columns) throws IOException { table.refresh(); StructLikeSet set = StructLikeSet.create(table.schema().asStruct()); - try (CloseableIterable reader = IcebergGenerics.read(table) - .useSnapshot(snapshotId) - .select(columns) - .build()) { + try (CloseableIterable reader = + IcebergGenerics.read(table).useSnapshot(snapshotId).select(columns).build()) { reader.forEach(set::add); } return set; diff --git a/flink/v1.14/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkManifest.java b/flink/v1.14/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkManifest.java index 4a47656e847d..7fe4e159fc61 100644 --- a/flink/v1.14/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkManifest.java +++ b/flink/v1.14/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkManifest.java @@ -16,9 +16,10 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.flink.sink; +import static org.apache.iceberg.flink.sink.ManifestOutputFileFactory.FLINK_MANIFEST_LOCATION; + import java.io.File; import java.io.IOException; import java.nio.file.Paths; @@ -51,13 +52,10 @@ import org.junit.Test; import org.junit.rules.TemporaryFolder; -import static org.apache.iceberg.flink.sink.ManifestOutputFileFactory.FLINK_MANIFEST_LOCATION; - public class TestFlinkManifest { private static final Configuration CONF = new Configuration(); - @Rule - public TemporaryFolder tempFolder = new TemporaryFolder(); + @Rule public TemporaryFolder tempFolder = new TemporaryFolder(); private String tablePath; private Table table; @@ -75,15 +73,21 @@ public void before() throws IOException { // Construct the iceberg table. table = SimpleDataUtil.createTable(tablePath, ImmutableMap.of(), false); - int[] equalityFieldIds = new int[] { - table.schema().findField("id").fieldId(), - table.schema().findField("data").fieldId() - }; - this.appenderFactory = new FlinkAppenderFactory(table.schema(), FlinkSchemaUtil.convert(table.schema()), - table.properties(), table.spec(), equalityFieldIds, table.schema(), null); + int[] equalityFieldIds = + new int[] { + table.schema().findField("id").fieldId(), table.schema().findField("data").fieldId() + }; + this.appenderFactory = + new FlinkAppenderFactory( + table.schema(), + FlinkSchemaUtil.convert(table.schema()), + table.properties(), + table.spec(), + equalityFieldIds, + table.schema(), + null); } - @Test public void testIO() throws IOException { String flinkJobId = newFlinkJobId(); @@ -96,13 +100,15 @@ public void testIO() throws IOException { List dataFiles = generateDataFiles(10); List eqDeleteFiles = generateEqDeleteFiles(5); List posDeleteFiles = generatePosDeleteFiles(5); - DeltaManifests deltaManifests = FlinkManifestUtil.writeCompletedFiles( - WriteResult.builder() - .addDataFiles(dataFiles) - .addDeleteFiles(eqDeleteFiles) - .addDeleteFiles(posDeleteFiles) - .build(), - () -> factory.create(curCkpId), table.spec()); + DeltaManifests deltaManifests = + FlinkManifestUtil.writeCompletedFiles( + WriteResult.builder() + .addDataFiles(dataFiles) + .addDeleteFiles(eqDeleteFiles) + .addDeleteFiles(posDeleteFiles) + .build(), + () -> factory.create(curCkpId), + table.spec()); WriteResult result = FlinkManifestUtil.readCompletedFiles(deltaManifests, table.io()); Assert.assertEquals("Size of data file list are not equal.", 10, result.deleteFiles().length); @@ -125,30 +131,39 @@ public void testUserProvidedManifestLocation() throws IOException { String flinkJobId = newFlinkJobId(); String operatorId = newOperatorUniqueId(); File userProvidedFolder = tempFolder.newFolder(); - Map props = ImmutableMap.of(FLINK_MANIFEST_LOCATION, userProvidedFolder.getAbsolutePath() + "///"); - ManifestOutputFileFactory factory = new ManifestOutputFileFactory( - ((HasTableOperations) table).operations(), table.io(), props, - flinkJobId, operatorId, 1, 1); + Map props = + ImmutableMap.of(FLINK_MANIFEST_LOCATION, userProvidedFolder.getAbsolutePath() + "///"); + ManifestOutputFileFactory factory = + new ManifestOutputFileFactory( + ((HasTableOperations) table).operations(), + table.io(), + props, + flinkJobId, + operatorId, + 1, + 1); List dataFiles = generateDataFiles(5); - DeltaManifests deltaManifests = FlinkManifestUtil.writeCompletedFiles( - WriteResult.builder() - .addDataFiles(dataFiles) - .build(), - () -> factory.create(checkpointId), - table.spec()); + DeltaManifests deltaManifests = + FlinkManifestUtil.writeCompletedFiles( + 
WriteResult.builder().addDataFiles(dataFiles).build(), + () -> factory.create(checkpointId), + table.spec()); Assert.assertNotNull("Data manifest shouldn't be null", deltaManifests.dataManifest()); Assert.assertNull("Delete manifest should be null", deltaManifests.deleteManifest()); - Assert.assertEquals("The newly created manifest file should be located under the user provided directory", - userProvidedFolder.toPath(), Paths.get(deltaManifests.dataManifest().path()).getParent()); + Assert.assertEquals( + "The newly created manifest file should be located under the user provided directory", + userProvidedFolder.toPath(), + Paths.get(deltaManifests.dataManifest().path()).getParent()); WriteResult result = FlinkManifestUtil.readCompletedFiles(deltaManifests, table.io()); Assert.assertEquals(0, result.deleteFiles().length); Assert.assertEquals(5, result.dataFiles().length); - Assert.assertEquals("Size of data file list are not equal.", dataFiles.size(), result.dataFiles().length); + Assert.assertEquals( + "Size of data file list are not equal.", dataFiles.size(), result.dataFiles().length); for (int i = 0; i < dataFiles.size(); i++) { TestHelpers.assertEquals(dataFiles.get(i), result.dataFiles()[i]); } @@ -159,29 +174,34 @@ public void testVersionedSerializer() throws IOException { long checkpointId = 1; String flinkJobId = newFlinkJobId(); String operatorId = newOperatorUniqueId(); - ManifestOutputFileFactory factory = FlinkManifestUtil.createOutputFileFactory(table, flinkJobId, operatorId, - 1, 1); + ManifestOutputFileFactory factory = + FlinkManifestUtil.createOutputFileFactory(table, flinkJobId, operatorId, 1, 1); List dataFiles = generateDataFiles(10); List eqDeleteFiles = generateEqDeleteFiles(10); List posDeleteFiles = generatePosDeleteFiles(10); - DeltaManifests expected = FlinkManifestUtil.writeCompletedFiles( - WriteResult.builder() - .addDataFiles(dataFiles) - .addDeleteFiles(eqDeleteFiles) - .addDeleteFiles(posDeleteFiles) - .build(), - () -> factory.create(checkpointId), table.spec()); + DeltaManifests expected = + FlinkManifestUtil.writeCompletedFiles( + WriteResult.builder() + .addDataFiles(dataFiles) + .addDeleteFiles(eqDeleteFiles) + .addDeleteFiles(posDeleteFiles) + .build(), + () -> factory.create(checkpointId), + table.spec()); byte[] versionedSerializeData = - SimpleVersionedSerialization.writeVersionAndSerialize(DeltaManifestsSerializer.INSTANCE, expected); - DeltaManifests actual = SimpleVersionedSerialization - .readVersionAndDeSerialize(DeltaManifestsSerializer.INSTANCE, versionedSerializeData); + SimpleVersionedSerialization.writeVersionAndSerialize( + DeltaManifestsSerializer.INSTANCE, expected); + DeltaManifests actual = + SimpleVersionedSerialization.readVersionAndDeSerialize( + DeltaManifestsSerializer.INSTANCE, versionedSerializeData); TestHelpers.assertEquals(expected.dataManifest(), actual.dataManifest()); TestHelpers.assertEquals(expected.deleteManifest(), actual.deleteManifest()); byte[] versionedSerializeData2 = - SimpleVersionedSerialization.writeVersionAndSerialize(DeltaManifestsSerializer.INSTANCE, actual); + SimpleVersionedSerialization.writeVersionAndSerialize( + DeltaManifestsSerializer.INSTANCE, actual); Assert.assertArrayEquals(versionedSerializeData, versionedSerializeData2); } @@ -191,17 +211,21 @@ public void testCompatibility() throws IOException { long checkpointId = 1; String flinkJobId = newFlinkJobId(); String operatorId = newOperatorUniqueId(); - ManifestOutputFileFactory factory = FlinkManifestUtil.createOutputFileFactory(table, 
flinkJobId, operatorId, - 1, 1); + ManifestOutputFileFactory factory = + FlinkManifestUtil.createOutputFileFactory(table, flinkJobId, operatorId, 1, 1); List dataFiles = generateDataFiles(10); - ManifestFile manifest = FlinkManifestUtil.writeDataFiles(factory.create(checkpointId), table.spec(), dataFiles); - byte[] dataV1 = SimpleVersionedSerialization.writeVersionAndSerialize(new V1Serializer(), manifest); + ManifestFile manifest = + FlinkManifestUtil.writeDataFiles(factory.create(checkpointId), table.spec(), dataFiles); + byte[] dataV1 = + SimpleVersionedSerialization.writeVersionAndSerialize(new V1Serializer(), manifest); DeltaManifests delta = - SimpleVersionedSerialization.readVersionAndDeSerialize(DeltaManifestsSerializer.INSTANCE, dataV1); + SimpleVersionedSerialization.readVersionAndDeSerialize( + DeltaManifestsSerializer.INSTANCE, dataV1); Assert.assertNull("Serialization v1 don't include delete files.", delta.deleteManifest()); - Assert.assertNotNull("Serialization v1 should not have null data manifest.", delta.dataManifest()); + Assert.assertNotNull( + "Serialization v1 should not have null data manifest.", delta.dataManifest()); TestHelpers.assertEquals(manifest, delta.dataManifest()); List actualFiles = FlinkManifestUtil.readDataFiles(delta.dataManifest(), table.io()); @@ -230,18 +254,24 @@ public ManifestFile deserialize(int version, byte[] serialized) throws IOExcepti } private DataFile writeDataFile(String filename, List rows) throws IOException { - return SimpleDataUtil.writeFile(table.schema(), table.spec(), CONF, - tablePath, FileFormat.PARQUET.addExtension(filename), rows); + return SimpleDataUtil.writeFile( + table.schema(), + table.spec(), + CONF, + tablePath, + FileFormat.PARQUET.addExtension(filename), + rows); } private DeleteFile writeEqDeleteFile(String filename, List deletes) throws IOException { - return SimpleDataUtil.writeEqDeleteFile(table, FileFormat.PARQUET, tablePath, filename, appenderFactory, deletes); + return SimpleDataUtil.writeEqDeleteFile( + table, FileFormat.PARQUET, tablePath, filename, appenderFactory, deletes); } private DeleteFile writePosDeleteFile(String filename, List> positions) throws IOException { - return SimpleDataUtil - .writePosDeleteFile(table, FileFormat.PARQUET, tablePath, filename, appenderFactory, positions); + return SimpleDataUtil.writePosDeleteFile( + table, FileFormat.PARQUET, tablePath, filename, appenderFactory, positions); } private List generateDataFiles(int fileNum) throws IOException { @@ -259,7 +289,8 @@ private List generateEqDeleteFiles(int fileNum) throws IOException { List deleteFiles = Lists.newArrayList(); for (int i = 0; i < fileNum; i++) { rowDataList.add(SimpleDataUtil.createDelete(i, "a" + i)); - deleteFiles.add(writeEqDeleteFile("eq-delete-file-" + fileCount.incrementAndGet(), rowDataList)); + deleteFiles.add( + writeEqDeleteFile("eq-delete-file-" + fileCount.incrementAndGet(), rowDataList)); } return deleteFiles; } @@ -269,7 +300,8 @@ private List generatePosDeleteFiles(int fileNum) throws IOException List deleteFiles = Lists.newArrayList(); for (int i = 0; i < fileNum; i++) { positions.add(Pair.of("data-file-1", (long) i)); - deleteFiles.add(writePosDeleteFile("pos-delete-file-" + fileCount.incrementAndGet(), positions)); + deleteFiles.add( + writePosDeleteFile("pos-delete-file-" + fileCount.incrementAndGet(), positions)); } return deleteFiles; } diff --git a/flink/v1.14/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkPartitioningWriters.java 
b/flink/v1.14/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkPartitioningWriters.java index 934b5a0d75de..3951c2e70f65 100644 --- a/flink/v1.14/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkPartitioningWriters.java +++ b/flink/v1.14/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkPartitioningWriters.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.flink.sink; import java.util.List; @@ -39,9 +38,11 @@ public TestFlinkPartitioningWriters(FileFormat fileFormat) { } @Override - protected FileWriterFactory newWriterFactory(Schema dataSchema, List equalityFieldIds, - Schema equalityDeleteRowSchema, - Schema positionDeleteRowSchema) { + protected FileWriterFactory newWriterFactory( + Schema dataSchema, + List equalityFieldIds, + Schema equalityDeleteRowSchema, + Schema positionDeleteRowSchema) { return FlinkFileWriterFactory.builderFor(table) .dataSchema(table.schema()) .dataFileFormat(format()) diff --git a/flink/v1.14/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkPositionDeltaWriters.java b/flink/v1.14/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkPositionDeltaWriters.java index 5fd5c5eebee9..9e846efe6fc9 100644 --- a/flink/v1.14/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkPositionDeltaWriters.java +++ b/flink/v1.14/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkPositionDeltaWriters.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.flink.sink; import java.util.List; @@ -39,9 +38,11 @@ public TestFlinkPositionDeltaWriters(FileFormat fileFormat) { } @Override - protected FileWriterFactory newWriterFactory(Schema dataSchema, List equalityFieldIds, - Schema equalityDeleteRowSchema, - Schema positionDeleteRowSchema) { + protected FileWriterFactory newWriterFactory( + Schema dataSchema, + List equalityFieldIds, + Schema equalityDeleteRowSchema, + Schema positionDeleteRowSchema) { return FlinkFileWriterFactory.builderFor(table) .dataSchema(table.schema()) .dataFileFormat(format()) diff --git a/flink/v1.14/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkRollingFileWriters.java b/flink/v1.14/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkRollingFileWriters.java index 9339e5ac2c3e..07716b9c3e60 100644 --- a/flink/v1.14/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkRollingFileWriters.java +++ b/flink/v1.14/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkRollingFileWriters.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.flink.sink; import java.util.List; @@ -35,9 +34,11 @@ public TestFlinkRollingFileWriters(FileFormat fileFormat, boolean partitioned) { } @Override - protected FileWriterFactory newWriterFactory(Schema dataSchema, List equalityFieldIds, - Schema equalityDeleteRowSchema, - Schema positionDeleteRowSchema) { + protected FileWriterFactory newWriterFactory( + Schema dataSchema, + List equalityFieldIds, + Schema equalityDeleteRowSchema, + Schema positionDeleteRowSchema) { return FlinkFileWriterFactory.builderFor(table) .dataSchema(table.schema()) .dataFileFormat(format()) diff --git a/flink/v1.14/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkWriterMetrics.java b/flink/v1.14/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkWriterMetrics.java index aa31c1819d10..e6d64ef2c720 100644 --- a/flink/v1.14/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkWriterMetrics.java +++ b/flink/v1.14/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkWriterMetrics.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.flink.sink; import org.apache.flink.table.data.GenericRowData; diff --git a/flink/v1.14/flink/src/test/java/org/apache/iceberg/flink/sink/TestIcebergFilesCommitter.java b/flink/v1.14/flink/src/test/java/org/apache/iceberg/flink/sink/TestIcebergFilesCommitter.java index 78ffe72c2dcd..3647bddb9148 100644 --- a/flink/v1.14/flink/src/test/java/org/apache/iceberg/flink/sink/TestIcebergFilesCommitter.java +++ b/flink/v1.14/flink/src/test/java/org/apache/iceberg/flink/sink/TestIcebergFilesCommitter.java @@ -16,9 +16,12 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.flink.sink; +import static org.apache.iceberg.TableProperties.DEFAULT_FILE_FORMAT; +import static org.apache.iceberg.flink.sink.IcebergFilesCommitter.MAX_CONTINUOUS_EMPTY_COMMITS; +import static org.apache.iceberg.flink.sink.ManifestOutputFileFactory.FLINK_MANIFEST_LOCATION; + import java.io.File; import java.io.IOException; import java.nio.file.Files; @@ -66,10 +69,6 @@ import org.junit.runner.RunWith; import org.junit.runners.Parameterized; -import static org.apache.iceberg.TableProperties.DEFAULT_FILE_FORMAT; -import static org.apache.iceberg.flink.sink.IcebergFilesCommitter.MAX_CONTINUOUS_EMPTY_COMMITS; -import static org.apache.iceberg.flink.sink.ManifestOutputFileFactory.FLINK_MANIFEST_LOCATION; - @RunWith(Parameterized.class) public class TestIcebergFilesCommitter extends TableTestBase { private static final Configuration CONF = new Configuration(); @@ -82,12 +81,12 @@ public class TestIcebergFilesCommitter extends TableTestBase { @Parameterized.Parameters(name = "FileFormat = {0}, FormatVersion={1}") public static Object[][] parameters() { return new Object[][] { - new Object[] {"avro", 1}, - new Object[] {"avro", 2}, - new Object[] {"parquet", 1}, - new Object[] {"parquet", 2}, - new Object[] {"orc", 1}, - new Object[] {"orc", 2} + new Object[] {"avro", 1}, + new Object[] {"avro", 2}, + new Object[] {"parquet", 1}, + new Object[] {"parquet", 2}, + new Object[] {"orc", 1}, + new Object[] {"orc", 2} }; } @@ -110,7 +109,8 @@ public void setupTable() throws IOException { // Construct the iceberg table. 
table = create(SimpleDataUtil.SCHEMA, PartitionSpec.unpartitioned()); - table.updateProperties() + table + .updateProperties() .set(DEFAULT_FILE_FORMAT, format.name()) .set(FLINK_MANIFEST_LOCATION, flinkManifestFolder.getAbsolutePath()) .set(MAX_CONTINUOUS_EMPTY_COMMITS, "1") @@ -130,7 +130,8 @@ public void testCommitTxnWithoutDataFiles() throws Exception { assertSnapshotSize(0); assertMaxCommittedCheckpointId(jobId, -1L); - // It's better to advance the max-committed-checkpoint-id in iceberg snapshot, so that the future flink job + // It's better to advance the max-committed-checkpoint-id in iceberg snapshot, so that the + // future flink job // failover won't fail. for (int i = 1; i <= 3; i++) { harness.snapshot(++checkpointId, ++timestamp); @@ -147,9 +148,7 @@ public void testCommitTxnWithoutDataFiles() throws Exception { @Test public void testMaxContinuousEmptyCommits() throws Exception { - table.updateProperties() - .set(MAX_CONTINUOUS_EMPTY_COMMITS, "3") - .commit(); + table.updateProperties().set(MAX_CONTINUOUS_EMPTY_COMMITS, "3").commit(); JobID jobId = new JobID(); long checkpointId = 0; @@ -206,7 +205,8 @@ public void testCommitTxn() throws Exception { SimpleDataUtil.assertTableRows(table, ImmutableList.copyOf(rows)); assertSnapshotSize(i); assertMaxCommittedCheckpointId(jobID, i); - Assert.assertEquals(TestIcebergFilesCommitter.class.getName(), + Assert.assertEquals( + TestIcebergFilesCommitter.class.getName(), table.currentSnapshot().summary().get("flink.test")); } } @@ -374,7 +374,8 @@ public void testRecoveryFromValidSnapshot() throws Exception { @Test public void testRecoveryFromSnapshotWithoutCompletedNotification() throws Exception { - // We've two steps in checkpoint: 1. snapshotState(ckp); 2. notifyCheckpointComplete(ckp). It's possible that we + // We've two steps in checkpoint: 1. snapshotState(ckp); 2. notifyCheckpointComplete(ckp). It's + // possible that we // flink job will restore from a checkpoint with only step#1 finished. long checkpointId = 0; long timestamp = 0; @@ -404,7 +405,8 @@ public void testRecoveryFromSnapshotWithoutCompletedNotification() throws Except harness.initializeState(snapshot); harness.open(); - // All flink manifests should be cleaned because it has committed the unfinished iceberg transaction. + // All flink manifests should be cleaned because it has committed the unfinished iceberg + // transaction. assertFlinkManifests(0); SimpleDataUtil.assertTableRows(table, expectedRows); @@ -432,12 +434,14 @@ public void testRecoveryFromSnapshotWithoutCompletedNotification() throws Except // Redeploying flink job from external checkpoint. JobID newJobId = new JobID(); - try (OneInputStreamOperatorTestHarness harness = createStreamSink(newJobId)) { + try (OneInputStreamOperatorTestHarness harness = + createStreamSink(newJobId)) { harness.setup(); harness.initializeState(snapshot); harness.open(); - // All flink manifests should be cleaned because it has committed the unfinished iceberg transaction. + // All flink manifests should be cleaned because it has committed the unfinished iceberg + // transaction. 
assertFlinkManifests(0); assertMaxCommittedCheckpointId(newJobId, -1); @@ -470,7 +474,8 @@ public void testStartAnotherJobToWriteSameTable() throws Exception { List tableRows = Lists.newArrayList(); JobID oldJobId = new JobID(); - try (OneInputStreamOperatorTestHarness harness = createStreamSink(oldJobId)) { + try (OneInputStreamOperatorTestHarness harness = + createStreamSink(oldJobId)) { harness.setup(); harness.open(); @@ -499,7 +504,8 @@ public void testStartAnotherJobToWriteSameTable() throws Exception { checkpointId = 0; timestamp = 0; JobID newJobId = new JobID(); - try (OneInputStreamOperatorTestHarness harness = createStreamSink(newJobId)) { + try (OneInputStreamOperatorTestHarness harness = + createStreamSink(newJobId)) { harness.setup(); harness.open(); @@ -578,7 +584,8 @@ public void testBoundedStream() throws Exception { SimpleDataUtil.assertTableRows(table, tableRows); assertSnapshotSize(1); assertMaxCommittedCheckpointId(jobId, Long.MAX_VALUE); - Assert.assertEquals(TestIcebergFilesCommitter.class.getName(), + Assert.assertEquals( + TestIcebergFilesCommitter.class.getName(), table.currentSnapshot().summary().get("flink.test")); } } @@ -606,12 +613,14 @@ public void testFlinkManifests() throws Exception { List manifestPaths = assertFlinkManifests(1); Path manifestPath = manifestPaths.get(0); String operatorId = harness.getOneInputOperator().getOperatorID().toString(); - Assert.assertEquals("File name should have the expected pattern.", + Assert.assertEquals( + "File name should have the expected pattern.", String.format("%s-%s-%05d-%d-%d-%05d.avro", jobId, operatorId, 0, 0, checkpoint, 1), manifestPath.getFileName().toString()); // 2. Read the data files from manifests and assert. - List dataFiles = FlinkManifestUtil.readDataFiles(createTestingManifestFile(manifestPath), table.io()); + List dataFiles = + FlinkManifestUtil.readDataFiles(createTestingManifestFile(manifestPath), table.io()); Assert.assertEquals(1, dataFiles.size()); TestHelpers.assertEquals(dataFile1, dataFiles.get(0)); @@ -649,12 +658,14 @@ public void testDeleteFiles() throws Exception { List manifestPaths = assertFlinkManifests(1); Path manifestPath = manifestPaths.get(0); String operatorId = harness.getOneInputOperator().getOperatorID().toString(); - Assert.assertEquals("File name should have the expected pattern.", + Assert.assertEquals( + "File name should have the expected pattern.", String.format("%s-%s-%05d-%d-%d-%05d.avro", jobId, operatorId, 0, 0, checkpoint, 1), manifestPath.getFileName().toString()); // 2. Read the data files from manifests and assert. 
- List dataFiles = FlinkManifestUtil.readDataFiles(createTestingManifestFile(manifestPath), table.io()); + List dataFiles = + FlinkManifestUtil.readDataFiles(createTestingManifestFile(manifestPath), table.io()); Assert.assertEquals(1, dataFiles.size()); TestHelpers.assertEquals(dataFile1, dataFiles.get(0)); @@ -669,11 +680,10 @@ public void testDeleteFiles() throws Exception { DataFile dataFile2 = writeDataFile("data-file-2", ImmutableList.of(row2)); RowData delete1 = SimpleDataUtil.createDelete(1, "aaa"); - DeleteFile deleteFile1 = writeEqDeleteFile(appenderFactory, "delete-file-1", ImmutableList.of(delete1)); - harness.processElement(WriteResult.builder() - .addDataFiles(dataFile2) - .addDeleteFiles(deleteFile1) - .build(), + DeleteFile deleteFile1 = + writeEqDeleteFile(appenderFactory, "delete-file-1", ImmutableList.of(delete1)); + harness.processElement( + WriteResult.builder().addDataFiles(dataFile2).addDeleteFiles(deleteFile1).build(), ++timestamp); assertMaxCommittedCheckpointId(jobId, checkpoint); @@ -709,11 +719,10 @@ public void testCommitTwoCheckpointsInSingleTxn() throws Exception { RowData insert2 = SimpleDataUtil.createInsert(2, "bbb"); RowData delete3 = SimpleDataUtil.createDelete(3, "ccc"); DataFile dataFile1 = writeDataFile("data-file-1", ImmutableList.of(insert1, insert2)); - DeleteFile deleteFile1 = writeEqDeleteFile(appenderFactory, "delete-file-1", ImmutableList.of(delete3)); - harness.processElement(WriteResult.builder() - .addDataFiles(dataFile1) - .addDeleteFiles(deleteFile1) - .build(), + DeleteFile deleteFile1 = + writeEqDeleteFile(appenderFactory, "delete-file-1", ImmutableList.of(delete3)); + harness.processElement( + WriteResult.builder().addDataFiles(dataFile1).addDeleteFiles(deleteFile1).build(), ++timestamp); // The 1th snapshotState. @@ -722,11 +731,10 @@ public void testCommitTwoCheckpointsInSingleTxn() throws Exception { RowData insert4 = SimpleDataUtil.createInsert(4, "ddd"); RowData delete2 = SimpleDataUtil.createDelete(2, "bbb"); DataFile dataFile2 = writeDataFile("data-file-2", ImmutableList.of(insert4)); - DeleteFile deleteFile2 = writeEqDeleteFile(appenderFactory, "delete-file-2", ImmutableList.of(delete2)); - harness.processElement(WriteResult.builder() - .addDataFiles(dataFile2) - .addDeleteFiles(deleteFile2) - .build(), + DeleteFile deleteFile2 = + writeEqDeleteFile(appenderFactory, "delete-file-2", ImmutableList.of(delete2)); + harness.processElement( + WriteResult.builder().addDataFiles(dataFile2).addDeleteFiles(deleteFile2).build(), ++timestamp); // The 2nd snapshotState. 
@@ -737,48 +745,76 @@ public void testCommitTwoCheckpointsInSingleTxn() throws Exception { SimpleDataUtil.assertTableRows(table, ImmutableList.of(insert1, insert4)); assertMaxCommittedCheckpointId(jobId, checkpoint); assertFlinkManifests(0); - Assert.assertEquals("Should have committed 2 txn.", 2, ImmutableList.copyOf(table.snapshots()).size()); + Assert.assertEquals( + "Should have committed 2 txn.", 2, ImmutableList.copyOf(table.snapshots()).size()); } } - private DeleteFile writeEqDeleteFile(FileAppenderFactory appenderFactory, - String filename, List deletes) throws IOException { - return SimpleDataUtil.writeEqDeleteFile(table, FileFormat.PARQUET, tablePath, filename, appenderFactory, deletes); + private DeleteFile writeEqDeleteFile( + FileAppenderFactory appenderFactory, String filename, List deletes) + throws IOException { + return SimpleDataUtil.writeEqDeleteFile( + table, FileFormat.PARQUET, tablePath, filename, appenderFactory, deletes); } - private DeleteFile writePosDeleteFile(FileAppenderFactory appenderFactory, - String filename, - List> positions) throws IOException { - return SimpleDataUtil.writePosDeleteFile(table, FileFormat.PARQUET, tablePath, filename, appenderFactory, - positions); + private DeleteFile writePosDeleteFile( + FileAppenderFactory appenderFactory, + String filename, + List> positions) + throws IOException { + return SimpleDataUtil.writePosDeleteFile( + table, FileFormat.PARQUET, tablePath, filename, appenderFactory, positions); } private FileAppenderFactory createDeletableAppenderFactory() { - int[] equalityFieldIds = new int[] { - table.schema().findField("id").fieldId(), - table.schema().findField("data").fieldId() - }; - return new FlinkAppenderFactory(table.schema(), - FlinkSchemaUtil.convert(table.schema()), table.properties(), table.spec(), equalityFieldIds, - table.schema(), null); + int[] equalityFieldIds = + new int[] { + table.schema().findField("id").fieldId(), table.schema().findField("data").fieldId() + }; + return new FlinkAppenderFactory( + table.schema(), + FlinkSchemaUtil.convert(table.schema()), + table.properties(), + table.spec(), + equalityFieldIds, + table.schema(), + null); } private ManifestFile createTestingManifestFile(Path manifestPath) { - return new GenericManifestFile(manifestPath.toAbsolutePath().toString(), manifestPath.toFile().length(), 0, - ManifestContent.DATA, 0, 0, 0L, 0, 0, 0, 0, 0, 0, null, null); + return new GenericManifestFile( + manifestPath.toAbsolutePath().toString(), + manifestPath.toFile().length(), + 0, + ManifestContent.DATA, + 0, + 0, + 0L, + 0, + 0, + 0, + 0, + 0, + 0, + null, + null); } private List assertFlinkManifests(int expectedCount) throws IOException { - List manifests = Files.list(flinkManifestFolder.toPath()) - .filter(p -> !p.toString().endsWith(".crc")) - .collect(Collectors.toList()); - Assert.assertEquals(String.format("Expected %s flink manifests, but the list is: %s", expectedCount, manifests), - expectedCount, manifests.size()); + List manifests = + Files.list(flinkManifestFolder.toPath()) + .filter(p -> !p.toString().endsWith(".crc")) + .collect(Collectors.toList()); + Assert.assertEquals( + String.format("Expected %s flink manifests, but the list is: %s", expectedCount, manifests), + expectedCount, + manifests.size()); return manifests; } private DataFile writeDataFile(String filename, List rows) throws IOException { - return SimpleDataUtil.writeFile(table.schema(), table.spec(), CONF, tablePath, format.addExtension(filename), rows); + return SimpleDataUtil.writeFile( + table.schema(), 
table.spec(), CONF, tablePath, format.addExtension(filename), rows); } private void assertMaxCommittedCheckpointId(JobID jobID, long expectedId) { @@ -825,10 +861,14 @@ private static TestOperatorFactory of(String tablePath) { @Override @SuppressWarnings("unchecked") - public > T createStreamOperator(StreamOperatorParameters param) { - IcebergFilesCommitter committer = new IcebergFilesCommitter(new TestTableLoader(tablePath), false, - Collections.singletonMap("flink.test", TestIcebergFilesCommitter.class.getName()), - ThreadPools.WORKER_THREAD_POOL_SIZE); + public > T createStreamOperator( + StreamOperatorParameters param) { + IcebergFilesCommitter committer = + new IcebergFilesCommitter( + new TestTableLoader(tablePath), + false, + Collections.singletonMap("flink.test", TestIcebergFilesCommitter.class.getName()), + ThreadPools.WORKER_THREAD_POOL_SIZE); committer.setup(param.getContainingTask(), param.getStreamConfig(), param.getOutput()); return (T) committer; } diff --git a/flink/v1.14/flink/src/test/java/org/apache/iceberg/flink/sink/TestIcebergStreamWriter.java b/flink/v1.14/flink/src/test/java/org/apache/iceberg/flink/sink/TestIcebergStreamWriter.java index 7400449b368d..6f45f10b34be 100644 --- a/flink/v1.14/flink/src/test/java/org/apache/iceberg/flink/sink/TestIcebergStreamWriter.java +++ b/flink/v1.14/flink/src/test/java/org/apache/iceberg/flink/sink/TestIcebergStreamWriter.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.flink.sink; import java.io.File; @@ -68,8 +67,7 @@ @RunWith(Parameterized.class) public class TestIcebergStreamWriter { - @Rule - public TemporaryFolder tempFolder = new TemporaryFolder(); + @Rule public TemporaryFolder tempFolder = new TemporaryFolder(); private String tablePath; private Table table; @@ -80,12 +78,12 @@ public class TestIcebergStreamWriter { @Parameterized.Parameters(name = "format = {0}, partitioned = {1}") public static Object[][] parameters() { return new Object[][] { - {"avro", true}, - {"avro", false}, - {"orc", true}, - {"orc", false}, - {"parquet", true}, - {"parquet", false} + {"avro", true}, + {"avro", false}, + {"orc", true}, + {"orc", false}, + {"parquet", true}, + {"parquet", false} }; } @@ -107,7 +105,8 @@ public void before() throws IOException { @Test public void testWritingTable() throws Exception { long checkpointId = 1L; - try (OneInputStreamOperatorTestHarness testHarness = createIcebergStreamWriter()) { + try (OneInputStreamOperatorTestHarness testHarness = + createIcebergStreamWriter()) { // The first checkpoint testHarness.processElement(SimpleDataUtil.createRowData(1, "hello"), 1); testHarness.processElement(SimpleDataUtil.createRowData(2, "world"), 1); @@ -137,13 +136,14 @@ public void testWritingTable() throws Exception { appendFiles.commit(); // Assert the table records. 
- SimpleDataUtil.assertTableRecords(tablePath, Lists.newArrayList( - SimpleDataUtil.createRecord(1, "hello"), - SimpleDataUtil.createRecord(2, "world"), - SimpleDataUtil.createRecord(3, "hello"), - SimpleDataUtil.createRecord(4, "foo"), - SimpleDataUtil.createRecord(5, "bar") - )); + SimpleDataUtil.assertTableRecords( + tablePath, + Lists.newArrayList( + SimpleDataUtil.createRecord(1, "hello"), + SimpleDataUtil.createRecord(2, "world"), + SimpleDataUtil.createRecord(3, "hello"), + SimpleDataUtil.createRecord(4, "foo"), + SimpleDataUtil.createRecord(5, "bar"))); } } @@ -151,7 +151,8 @@ public void testWritingTable() throws Exception { public void testSnapshotTwice() throws Exception { long checkpointId = 1; long timestamp = 1; - try (OneInputStreamOperatorTestHarness testHarness = createIcebergStreamWriter()) { + try (OneInputStreamOperatorTestHarness testHarness = + createIcebergStreamWriter()) { testHarness.processElement(SimpleDataUtil.createRowData(1, "hello"), timestamp++); testHarness.processElement(SimpleDataUtil.createRowData(2, "world"), timestamp); @@ -174,13 +175,15 @@ public void testSnapshotTwice() throws Exception { @Test public void testTableWithoutSnapshot() throws Exception { - try (OneInputStreamOperatorTestHarness testHarness = createIcebergStreamWriter()) { + try (OneInputStreamOperatorTestHarness testHarness = + createIcebergStreamWriter()) { Assert.assertEquals(0, testHarness.extractOutputValues().size()); } // Even if we closed the iceberg stream writer, there's no orphan data file. Assert.assertEquals(0, scanDataFiles().size()); - try (OneInputStreamOperatorTestHarness testHarness = createIcebergStreamWriter()) { + try (OneInputStreamOperatorTestHarness testHarness = + createIcebergStreamWriter()) { testHarness.processElement(SimpleDataUtil.createRowData(1, "hello"), 1); // Still not emit the data file yet, because there is no checkpoint. Assert.assertEquals(0, testHarness.extractOutputValues().size()); @@ -212,7 +215,8 @@ private Set scanDataFiles() throws IOException { @Test public void testBoundedStreamCloseWithEmittingDataFiles() throws Exception { - try (OneInputStreamOperatorTestHarness testHarness = createIcebergStreamWriter()) { + try (OneInputStreamOperatorTestHarness testHarness = + createIcebergStreamWriter()) { testHarness.processElement(SimpleDataUtil.createRowData(1, "hello"), 1); testHarness.processElement(SimpleDataUtil.createRowData(2, "world"), 2); @@ -236,7 +240,8 @@ public void testBoundedStreamCloseWithEmittingDataFiles() throws Exception { @Test public void testTableWithTargetFileSize() throws Exception { // Adjust the target-file-size in table properties. 
- table.updateProperties() + table + .updateProperties() .set(TableProperties.WRITE_TARGET_FILE_SIZE_BYTES, "4") // ~4 bytes; low enough to trigger .commit(); @@ -249,7 +254,8 @@ public void testTableWithTargetFileSize() throws Exception { } } - try (OneInputStreamOperatorTestHarness testHarness = createIcebergStreamWriter()) { + try (OneInputStreamOperatorTestHarness testHarness = + createIcebergStreamWriter()) { for (RowData row : rows) { testHarness.processElement(row, 1); } @@ -277,20 +283,26 @@ public void testTableWithTargetFileSize() throws Exception { @Test public void testPromotedFlinkDataType() throws Exception { - Schema iSchema = new Schema( - Types.NestedField.required(1, "tinyint", Types.IntegerType.get()), - Types.NestedField.required(2, "smallint", Types.IntegerType.get()), - Types.NestedField.optional(3, "int", Types.IntegerType.get()) - ); - TableSchema flinkSchema = TableSchema.builder() - .field("tinyint", DataTypes.TINYINT().notNull()) - .field("smallint", DataTypes.SMALLINT().notNull()) - .field("int", DataTypes.INT().nullable()) - .build(); + Schema iSchema = + new Schema( + Types.NestedField.required(1, "tinyint", Types.IntegerType.get()), + Types.NestedField.required(2, "smallint", Types.IntegerType.get()), + Types.NestedField.optional(3, "int", Types.IntegerType.get())); + TableSchema flinkSchema = + TableSchema.builder() + .field("tinyint", DataTypes.TINYINT().notNull()) + .field("smallint", DataTypes.SMALLINT().notNull()) + .field("int", DataTypes.INT().nullable()) + .build(); PartitionSpec spec; if (partitioned) { - spec = PartitionSpec.builderFor(iSchema).identity("smallint").identity("tinyint").identity("int").build(); + spec = + PartitionSpec.builderFor(iSchema) + .identity("smallint") + .identity("tinyint") + .identity("int") + .build(); } else { spec = PartitionSpec.unpartitioned(); } @@ -299,21 +311,21 @@ public void testPromotedFlinkDataType() throws Exception { Map props = ImmutableMap.of(TableProperties.DEFAULT_FILE_FORMAT, format.name()); Table icebergTable = new HadoopTables().create(iSchema, spec, props, location); - List rows = Lists.newArrayList( - GenericRowData.of((byte) 0x01, (short) -32768, 101), - GenericRowData.of((byte) 0x02, (short) 0, 102), - GenericRowData.of((byte) 0x03, (short) 32767, 103) - ); + List rows = + Lists.newArrayList( + GenericRowData.of((byte) 0x01, (short) -32768, 101), + GenericRowData.of((byte) 0x02, (short) 0, 102), + GenericRowData.of((byte) 0x03, (short) 32767, 103)); Record record = GenericRecord.create(iSchema); - List expected = Lists.newArrayList( - record.copy(ImmutableMap.of("tinyint", 1, "smallint", -32768, "int", 101)), - record.copy(ImmutableMap.of("tinyint", 2, "smallint", 0, "int", 102)), - record.copy(ImmutableMap.of("tinyint", 3, "smallint", 32767, "int", 103)) - ); - - try (OneInputStreamOperatorTestHarness testHarness = createIcebergStreamWriter(icebergTable, - flinkSchema)) { + List expected = + Lists.newArrayList( + record.copy(ImmutableMap.of("tinyint", 1, "smallint", -32768, "int", 101)), + record.copy(ImmutableMap.of("tinyint", 2, "smallint", 0, "int", 102)), + record.copy(ImmutableMap.of("tinyint", 3, "smallint", 32767, "int", 103))); + + try (OneInputStreamOperatorTestHarness testHarness = + createIcebergStreamWriter(icebergTable, flinkSchema)) { for (RowData row : rows) { testHarness.processElement(row, 1); } @@ -331,7 +343,8 @@ public void testPromotedFlinkDataType() throws Exception { SimpleDataUtil.assertTableRecords(location, expected); } - private OneInputStreamOperatorTestHarness 
createIcebergStreamWriter() throws Exception { + private OneInputStreamOperatorTestHarness createIcebergStreamWriter() + throws Exception { return createIcebergStreamWriter(table, SimpleDataUtil.FLINK_SCHEMA); } @@ -339,14 +352,13 @@ private OneInputStreamOperatorTestHarness createIcebergStr Table icebergTable, TableSchema flinkSchema) throws Exception { RowType flinkRowType = FlinkSink.toFlinkRowType(icebergTable.schema(), flinkSchema); FlinkWriteConf flinkWriteConfig = - new FlinkWriteConf(icebergTable, Maps.newHashMap(), new org.apache.flink.configuration.Configuration()); - - IcebergStreamWriter streamWriter = FlinkSink.createStreamWriter( - icebergTable, - flinkWriteConfig, - flinkRowType, null); - OneInputStreamOperatorTestHarness harness = new OneInputStreamOperatorTestHarness<>( - streamWriter, 1, 1, 0); + new FlinkWriteConf( + icebergTable, Maps.newHashMap(), new org.apache.flink.configuration.Configuration()); + + IcebergStreamWriter streamWriter = + FlinkSink.createStreamWriter(icebergTable, flinkWriteConfig, flinkRowType, null); + OneInputStreamOperatorTestHarness harness = + new OneInputStreamOperatorTestHarness<>(streamWriter, 1, 1, 0); harness.setup(); harness.open(); diff --git a/flink/v1.14/flink/src/test/java/org/apache/iceberg/flink/sink/TestRowDataPartitionKey.java b/flink/v1.14/flink/src/test/java/org/apache/iceberg/flink/sink/TestRowDataPartitionKey.java index 29a1f78a531e..b6c785cb144b 100644 --- a/flink/v1.14/flink/src/test/java/org/apache/iceberg/flink/sink/TestRowDataPartitionKey.java +++ b/flink/v1.14/flink/src/test/java/org/apache/iceberg/flink/sink/TestRowDataPartitionKey.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.flink.sink; import java.util.List; @@ -40,53 +39,54 @@ import org.junit.Test; public class TestRowDataPartitionKey { - private static final Schema SCHEMA = new Schema( - Types.NestedField.required(0, "boolType", Types.BooleanType.get()), - Types.NestedField.required(1, "id", Types.IntegerType.get()), - Types.NestedField.required(2, "longType", Types.LongType.get()), - Types.NestedField.required(3, "dateType", Types.DateType.get()), - Types.NestedField.required(4, "timeType", Types.TimeType.get()), - Types.NestedField.required(5, "stringType", Types.StringType.get()), - Types.NestedField.required(6, "timestampWithoutZone", Types.TimestampType.withoutZone()), - Types.NestedField.required(7, "timestampWithZone", Types.TimestampType.withZone()), - Types.NestedField.required(8, "fixedType", Types.FixedType.ofLength(5)), - Types.NestedField.required(9, "uuidType", Types.UUIDType.get()), - Types.NestedField.required(10, "binaryType", Types.BinaryType.get()), - Types.NestedField.required(11, "decimalType1", Types.DecimalType.of(18, 3)), - Types.NestedField.required(12, "decimalType2", Types.DecimalType.of(10, 5)), - Types.NestedField.required(13, "decimalType3", Types.DecimalType.of(38, 19)), - Types.NestedField.required(14, "floatType", Types.FloatType.get()), - Types.NestedField.required(15, "doubleType", Types.DoubleType.get()) - ); - - private static final List SUPPORTED_PRIMITIVES = SCHEMA.asStruct().fields().stream() - .map(Types.NestedField::name).collect(Collectors.toList()); - - private static final Schema NESTED_SCHEMA = new Schema( - Types.NestedField.required(1, "structType", Types.StructType.of( - Types.NestedField.optional(2, "innerStringType", Types.StringType.get()), - Types.NestedField.optional(3, "innerIntegerType", Types.IntegerType.get()) - )) - ); + 
private static final Schema SCHEMA = + new Schema( + Types.NestedField.required(0, "boolType", Types.BooleanType.get()), + Types.NestedField.required(1, "id", Types.IntegerType.get()), + Types.NestedField.required(2, "longType", Types.LongType.get()), + Types.NestedField.required(3, "dateType", Types.DateType.get()), + Types.NestedField.required(4, "timeType", Types.TimeType.get()), + Types.NestedField.required(5, "stringType", Types.StringType.get()), + Types.NestedField.required(6, "timestampWithoutZone", Types.TimestampType.withoutZone()), + Types.NestedField.required(7, "timestampWithZone", Types.TimestampType.withZone()), + Types.NestedField.required(8, "fixedType", Types.FixedType.ofLength(5)), + Types.NestedField.required(9, "uuidType", Types.UUIDType.get()), + Types.NestedField.required(10, "binaryType", Types.BinaryType.get()), + Types.NestedField.required(11, "decimalType1", Types.DecimalType.of(18, 3)), + Types.NestedField.required(12, "decimalType2", Types.DecimalType.of(10, 5)), + Types.NestedField.required(13, "decimalType3", Types.DecimalType.of(38, 19)), + Types.NestedField.required(14, "floatType", Types.FloatType.get()), + Types.NestedField.required(15, "doubleType", Types.DoubleType.get())); + + private static final List SUPPORTED_PRIMITIVES = + SCHEMA.asStruct().fields().stream().map(Types.NestedField::name).collect(Collectors.toList()); + + private static final Schema NESTED_SCHEMA = + new Schema( + Types.NestedField.required( + 1, + "structType", + Types.StructType.of( + Types.NestedField.optional(2, "innerStringType", Types.StringType.get()), + Types.NestedField.optional(3, "innerIntegerType", Types.IntegerType.get())))); @Test public void testNullPartitionValue() { - Schema schema = new Schema( - Types.NestedField.optional(1, "id", Types.IntegerType.get()), - Types.NestedField.optional(2, "data", Types.StringType.get()) - ); + Schema schema = + new Schema( + Types.NestedField.optional(1, "id", Types.IntegerType.get()), + Types.NestedField.optional(2, "data", Types.StringType.get())); - PartitionSpec spec = PartitionSpec.builderFor(schema) - .identity("data") - .build(); + PartitionSpec spec = PartitionSpec.builderFor(schema).identity("data").build(); - List rows = Lists.newArrayList( - GenericRowData.of(1, StringData.fromString("a")), - GenericRowData.of(2, StringData.fromString("b")), - GenericRowData.of(3, null) - ); + List rows = + Lists.newArrayList( + GenericRowData.of(1, StringData.fromString("a")), + GenericRowData.of(2, StringData.fromString("b")), + GenericRowData.of(3, null)); - RowDataWrapper rowWrapper = new RowDataWrapper(FlinkSchemaUtil.convert(schema), schema.asStruct()); + RowDataWrapper rowWrapper = + new RowDataWrapper(FlinkSchemaUtil.convert(schema), schema.asStruct()); for (RowData row : rows) { PartitionKey partitionKey = new PartitionKey(spec, schema); @@ -100,16 +100,15 @@ public void testNullPartitionValue() { @Test public void testPartitionWithOneNestedField() { - RowDataWrapper rowWrapper = new RowDataWrapper(FlinkSchemaUtil.convert(NESTED_SCHEMA), NESTED_SCHEMA.asStruct()); + RowDataWrapper rowWrapper = + new RowDataWrapper(FlinkSchemaUtil.convert(NESTED_SCHEMA), NESTED_SCHEMA.asStruct()); List records = RandomGenericData.generate(NESTED_SCHEMA, 10, 1991); List rows = Lists.newArrayList(RandomRowData.convert(NESTED_SCHEMA, records)); - PartitionSpec spec1 = PartitionSpec.builderFor(NESTED_SCHEMA) - .identity("structType.innerStringType") - .build(); - PartitionSpec spec2 = PartitionSpec.builderFor(NESTED_SCHEMA) - 
.identity("structType.innerIntegerType") - .build(); + PartitionSpec spec1 = + PartitionSpec.builderFor(NESTED_SCHEMA).identity("structType.innerStringType").build(); + PartitionSpec spec2 = + PartitionSpec.builderFor(NESTED_SCHEMA).identity("structType.innerIntegerType").build(); for (int i = 0; i < rows.size(); i++) { RowData row = rows.get(i); @@ -131,18 +130,21 @@ public void testPartitionWithOneNestedField() { @Test public void testPartitionMultipleNestedField() { - RowDataWrapper rowWrapper = new RowDataWrapper(FlinkSchemaUtil.convert(NESTED_SCHEMA), NESTED_SCHEMA.asStruct()); + RowDataWrapper rowWrapper = + new RowDataWrapper(FlinkSchemaUtil.convert(NESTED_SCHEMA), NESTED_SCHEMA.asStruct()); List records = RandomGenericData.generate(NESTED_SCHEMA, 10, 1992); List rows = Lists.newArrayList(RandomRowData.convert(NESTED_SCHEMA, records)); - PartitionSpec spec1 = PartitionSpec.builderFor(NESTED_SCHEMA) - .identity("structType.innerIntegerType") - .identity("structType.innerStringType") - .build(); - PartitionSpec spec2 = PartitionSpec.builderFor(NESTED_SCHEMA) - .identity("structType.innerStringType") - .identity("structType.innerIntegerType") - .build(); + PartitionSpec spec1 = + PartitionSpec.builderFor(NESTED_SCHEMA) + .identity("structType.innerIntegerType") + .identity("structType.innerStringType") + .build(); + PartitionSpec spec2 = + PartitionSpec.builderFor(NESTED_SCHEMA) + .identity("structType.innerStringType") + .identity("structType.innerIntegerType") + .build(); PartitionKey pk1 = new PartitionKey(spec1, NESTED_SCHEMA); PartitionKey pk2 = new PartitionKey(spec2, NESTED_SCHEMA); @@ -188,14 +190,19 @@ public void testPartitionValueTypes() { pk.partition(rowWrapper.wrap(row)); expectedPK.partition(recordWrapper.wrap(record)); - Assert.assertEquals("Partition with column " + column + " should have one field.", 1, pk.size()); + Assert.assertEquals( + "Partition with column " + column + " should have one field.", 1, pk.size()); if (column.equals("timeType")) { - Assert.assertEquals("Partition with column " + column + " should have the expected values", - expectedPK.get(0, Long.class) / 1000, pk.get(0, Long.class) / 1000); + Assert.assertEquals( + "Partition with column " + column + " should have the expected values", + expectedPK.get(0, Long.class) / 1000, + pk.get(0, Long.class) / 1000); } else { - Assert.assertEquals("Partition with column " + column + " should have the expected values", - expectedPK.get(0, javaClasses[0]), pk.get(0, javaClasses[0])); + Assert.assertEquals( + "Partition with column " + column + " should have the expected values", + expectedPK.get(0, javaClasses[0]), + pk.get(0, javaClasses[0])); } } } @@ -225,15 +232,19 @@ public void testNestedPartitionValues() { pk.partition(rowWrapper.wrap(rows.get(j))); expectedPK.partition(recordWrapper.wrap(records.get(j))); - Assert.assertEquals("Partition with nested column " + column + " should have one field.", - 1, pk.size()); + Assert.assertEquals( + "Partition with nested column " + column + " should have one field.", 1, pk.size()); if (column.equals("nested.timeType")) { - Assert.assertEquals("Partition with nested column " + column + " should have the expected values.", - expectedPK.get(0, Long.class) / 1000, pk.get(0, Long.class) / 1000); + Assert.assertEquals( + "Partition with nested column " + column + " should have the expected values.", + expectedPK.get(0, Long.class) / 1000, + pk.get(0, Long.class) / 1000); } else { - Assert.assertEquals("Partition with nested column " + column + " should have the expected 
values.", - expectedPK.get(0, javaClasses[0]), pk.get(0, javaClasses[0])); + Assert.assertEquals( + "Partition with nested column " + column + " should have the expected values.", + expectedPK.get(0, javaClasses[0]), + pk.get(0, javaClasses[0])); } } } diff --git a/flink/v1.14/flink/src/test/java/org/apache/iceberg/flink/sink/TestTaskWriters.java b/flink/v1.14/flink/src/test/java/org/apache/iceberg/flink/sink/TestTaskWriters.java index 2595b098dfea..a47a80ae367e 100644 --- a/flink/v1.14/flink/src/test/java/org/apache/iceberg/flink/sink/TestTaskWriters.java +++ b/flink/v1.14/flink/src/test/java/org/apache/iceberg/flink/sink/TestTaskWriters.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.flink.sink; import java.io.File; @@ -54,18 +53,17 @@ public class TestTaskWriters { private static final Configuration CONF = new Configuration(); private static final long TARGET_FILE_SIZE = 128 * 1024 * 1024; - @Rule - public final TemporaryFolder tempFolder = new TemporaryFolder(); + @Rule public final TemporaryFolder tempFolder = new TemporaryFolder(); @Parameterized.Parameters(name = "format = {0}, partitioned = {1}") public static Object[][] parameters() { return new Object[][] { - {"avro", true}, - {"avro", false}, - {"orc", true}, - {"orc", false}, - {"parquet", true}, - {"parquet", false} + {"avro", true}, + {"avro", false}, + {"orc", true}, + {"orc", false}, + {"parquet", true}, + {"parquet", false} }; } @@ -172,12 +170,13 @@ public void testCompleteFiles() throws IOException { appendFiles.commit(); // Assert the data rows. - SimpleDataUtil.assertTableRecords(path, Lists.newArrayList( - SimpleDataUtil.createRecord(1, "a"), - SimpleDataUtil.createRecord(2, "b"), - SimpleDataUtil.createRecord(3, "c"), - SimpleDataUtil.createRecord(4, "d") - )); + SimpleDataUtil.assertTableRecords( + path, + Lists.newArrayList( + SimpleDataUtil.createRecord(1, "a"), + SimpleDataUtil.createRecord(2, "b"), + SimpleDataUtil.createRecord(3, "c"), + SimpleDataUtil.createRecord(4, "d"))); } } @@ -233,9 +232,14 @@ public void testRandomData() throws IOException { } private TaskWriter createTaskWriter(long targetFileSize) { - TaskWriterFactory taskWriterFactory = new RowDataTaskWriterFactory( - SerializableTable.copyOf(table), (RowType) SimpleDataUtil.FLINK_SCHEMA.toRowDataType().getLogicalType(), - targetFileSize, format, null, false); + TaskWriterFactory taskWriterFactory = + new RowDataTaskWriterFactory( + SerializableTable.copyOf(table), + (RowType) SimpleDataUtil.FLINK_SCHEMA.toRowDataType().getLogicalType(), + targetFileSize, + format, + null, + false); taskWriterFactory.initialize(1, 1); return taskWriterFactory.create(); } diff --git a/flink/v1.14/flink/src/test/java/org/apache/iceberg/flink/source/BoundedTableFactory.java b/flink/v1.14/flink/src/test/java/org/apache/iceberg/flink/source/BoundedTableFactory.java index b0041c3bc04d..b0be3daf7b49 100644 --- a/flink/v1.14/flink/src/test/java/org/apache/iceberg/flink/source/BoundedTableFactory.java +++ b/flink/v1.14/flink/src/test/java/org/apache/iceberg/flink/source/BoundedTableFactory.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.flink.source; import java.util.List; @@ -53,7 +52,8 @@ public class BoundedTableFactory implements DynamicTableSourceFactory { private static final AtomicInteger DATA_SET_ID = new AtomicInteger(0); private static final Map>> DATA_SETS = Maps.newHashMap(); - private static final ConfigOption DATA_ID = ConfigOptions.key("data-id").stringType().noDefaultValue(); + private static final ConfigOption DATA_ID = + ConfigOptions.key("data-id").stringType().noDefaultValue(); public static String registerDataSet(List> dataSet) { String dataSetId = String.valueOf(DATA_SET_ID.incrementAndGet()); @@ -67,12 +67,13 @@ public static void clearDataSets() { @Override public DynamicTableSource createDynamicTableSource(Context context) { - TableSchema tableSchema = TableSchemaUtils.getPhysicalSchema(context.getCatalogTable().getSchema()); + TableSchema tableSchema = + TableSchemaUtils.getPhysicalSchema(context.getCatalogTable().getSchema()); Configuration configuration = Configuration.fromMap(context.getCatalogTable().getOptions()); String dataId = configuration.getString(DATA_ID); - Preconditions.checkArgument(DATA_SETS.containsKey(dataId), - "data-id %s does not found in registered data set.", dataId); + Preconditions.checkArgument( + DATA_SETS.containsKey(dataId), "data-id %s does not found in registered data set.", dataId); return new BoundedTableSource(DATA_SETS.get(dataId), tableSchema); } @@ -112,8 +113,7 @@ public ChangelogMode getChangelogMode() { Supplier> supplier = () -> elementsPerCheckpoint.stream().flatMap(List::stream); // Add the INSERT row kind by default. - ChangelogMode.Builder builder = ChangelogMode.newBuilder() - .addContainedKind(RowKind.INSERT); + ChangelogMode.Builder builder = ChangelogMode.newBuilder().addContainedKind(RowKind.INSERT); if (supplier.get().anyMatch(r -> r.getKind() == RowKind.DELETE)) { builder.addContainedKind(RowKind.DELETE); @@ -136,12 +136,13 @@ public ScanRuntimeProvider getScanRuntimeProvider(ScanContext runtimeProviderCon @Override public DataStream produceDataStream(StreamExecutionEnvironment env) { boolean checkpointEnabled = env.getCheckpointConfig().isCheckpointingEnabled(); - SourceFunction source = new BoundedTestSource<>(elementsPerCheckpoint, checkpointEnabled); + SourceFunction source = + new BoundedTestSource<>(elementsPerCheckpoint, checkpointEnabled); RowType rowType = (RowType) tableSchema.toRowDataType().getLogicalType(); // Converter to convert the Row to RowData. - DataFormatConverters.RowConverter rowConverter = new DataFormatConverters - .RowConverter(tableSchema.getFieldDataTypes()); + DataFormatConverters.RowConverter rowConverter = + new DataFormatConverters.RowConverter(tableSchema.getFieldDataTypes()); return env.addSource(source, new RowTypeInfo(tableSchema.getFieldTypes())) .map(rowConverter::toInternal, FlinkCompatibilityUtil.toTypeInfo(rowType)); diff --git a/flink/v1.14/flink/src/test/java/org/apache/iceberg/flink/source/BoundedTestSource.java b/flink/v1.14/flink/src/test/java/org/apache/iceberg/flink/source/BoundedTestSource.java index 54e44ee5b008..7b435d059845 100644 --- a/flink/v1.14/flink/src/test/java/org/apache/iceberg/flink/source/BoundedTestSource.java +++ b/flink/v1.14/flink/src/test/java/org/apache/iceberg/flink/source/BoundedTestSource.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.flink.source; import java.util.Arrays; @@ -28,12 +27,10 @@ import org.apache.iceberg.relocated.com.google.common.base.Preconditions; /** - * A stream source that: - * 1) emits the elements from elementsPerCheckpoint.get(0) without allowing checkpoints. - * 2) then waits for the checkpoint to complete. - * 3) emits the elements from elementsPerCheckpoint.get(1) without allowing checkpoints. - * 4) then waits for the checkpoint to complete. - * 5) ... + * A stream source that: 1) emits the elements from elementsPerCheckpoint.get(0) without allowing + * checkpoints. 2) then waits for the checkpoint to complete. 3) emits the elements from + * elementsPerCheckpoint.get(1) without allowing checkpoints. 4) then waits for the checkpoint to + * complete. 5) ... * *

    Util all the list from elementsPerCheckpoint are exhausted. */ @@ -45,9 +42,7 @@ public final class BoundedTestSource implements SourceFunction, Checkpoint private final AtomicInteger numCheckpointsComplete = new AtomicInteger(0); - /** - * Emits all those elements in several checkpoints. - */ + /** Emits all those elements in several checkpoints. */ public BoundedTestSource(List> elementsPerCheckpoint, boolean checkpointEnabled) { this.elementsPerCheckpoint = elementsPerCheckpoint; this.checkpointEnabled = checkpointEnabled; @@ -57,9 +52,7 @@ public BoundedTestSource(List> elementsPerCheckpoint) { this(elementsPerCheckpoint, true); } - /** - * Emits all those elements in a single checkpoint. - */ + /** Emits all those elements in a single checkpoint. */ public BoundedTestSource(T... elements) { this(Collections.singletonList(Arrays.asList(elements))); } @@ -67,8 +60,9 @@ public BoundedTestSource(T... elements) { @Override public void run(SourceContext ctx) throws Exception { if (!checkpointEnabled) { - Preconditions.checkArgument(elementsPerCheckpoint.size() <= 1, - "There should be at most one list in the elementsPerCheckpoint when checkpoint is disabled."); + Preconditions.checkArgument( + elementsPerCheckpoint.size() <= 1, + "There should be at most one list in the elementsPerCheckpoint when checkpoint is disabled."); elementsPerCheckpoint.stream().flatMap(List::stream).forEach(ctx::collect); return; } @@ -77,11 +71,16 @@ public void run(SourceContext ctx) throws Exception { final int checkpointToAwait; synchronized (ctx.getCheckpointLock()) { - // Let's say checkpointToAwait = numCheckpointsComplete.get() + delta, in fact the value of delta should not - // affect the final table records because we only need to make sure that there will be exactly - // elementsPerCheckpoint.size() checkpoints to emit each records buffer from the original elementsPerCheckpoint. - // Even if the checkpoints that emitted results are not continuous, the correctness of the data should not be - // affected in the end. Setting the delta to be 2 is introducing the variable that produce un-continuous + // Let's say checkpointToAwait = numCheckpointsComplete.get() + delta, in fact the value of + // delta should not + // affect the final table records because we only need to make sure that there will be + // exactly + // elementsPerCheckpoint.size() checkpoints to emit each records buffer from the original + // elementsPerCheckpoint. + // Even if the checkpoints that emitted results are not continuous, the correctness of the + // data should not be + // affected in the end. Setting the delta to be 2 is introducing the variable that produce + // un-continuous // checkpoints that emit the records buffer from elementsPerCheckpoints. checkpointToAwait = numCheckpointsComplete.get() + 2; for (T element : elements) { diff --git a/flink/v1.14/flink/src/test/java/org/apache/iceberg/flink/source/ChangeLogTableTestBase.java b/flink/v1.14/flink/src/test/java/org/apache/iceberg/flink/source/ChangeLogTableTestBase.java index b4d8d7bb3efa..7aa2b8034bc5 100644 --- a/flink/v1.14/flink/src/test/java/org/apache/iceberg/flink/source/ChangeLogTableTestBase.java +++ b/flink/v1.14/flink/src/test/java/org/apache/iceberg/flink/source/ChangeLogTableTestBase.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.flink.source; import java.util.List; @@ -36,8 +35,7 @@ public class ChangeLogTableTestBase extends FlinkTestBase { private volatile TableEnvironment tEnv = null; - @Rule - public TestName name = new TestName(); + @Rule public TestName name = new TestName(); @After public void clean() { @@ -50,16 +48,15 @@ protected TableEnvironment getTableEnv() { if (tEnv == null) { synchronized (this) { if (tEnv == null) { - EnvironmentSettings settings = EnvironmentSettings - .newInstance() - .inStreamingMode() - .build(); + EnvironmentSettings settings = + EnvironmentSettings.newInstance().inStreamingMode().build(); - StreamExecutionEnvironment env = StreamExecutionEnvironment - .getExecutionEnvironment(MiniClusterResource.DISABLE_CLASSLOADER_CHECK_CONFIG) - .enableCheckpointing(400) - .setMaxParallelism(1) - .setParallelism(1); + StreamExecutionEnvironment env = + StreamExecutionEnvironment.getExecutionEnvironment( + MiniClusterResource.DISABLE_CLASSLOADER_CHECK_CONFIG) + .enableCheckpointing(400) + .setMaxParallelism(1) + .setParallelism(1); tEnv = StreamTableEnvironment.create(env, settings); } @@ -85,8 +82,6 @@ protected static Row updateAfterRow(Object... values) { } protected static List listJoin(List> lists) { - return lists.stream() - .flatMap(List::stream) - .collect(Collectors.toList()); + return lists.stream().flatMap(List::stream).collect(Collectors.toList()); } } diff --git a/flink/v1.14/flink/src/test/java/org/apache/iceberg/flink/source/SplitHelpers.java b/flink/v1.14/flink/src/test/java/org/apache/iceberg/flink/source/SplitHelpers.java index bc302fa5d441..8dc68aad10aa 100644 --- a/flink/v1.14/flink/src/test/java/org/apache/iceberg/flink/source/SplitHelpers.java +++ b/flink/v1.14/flink/src/test/java/org/apache/iceberg/flink/source/SplitHelpers.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.flink.source; import java.io.File; @@ -43,18 +42,18 @@ public class SplitHelpers { private static final AtomicLong splitLengthIncrement = new AtomicLong(); - private SplitHelpers() { - } + private SplitHelpers() {} /** * This create a list of IcebergSourceSplit from real files *

  • Create a new Hadoop table under the {@code temporaryFolder} *
  • write {@code fileCount} number of files to the new Iceberg table - *
  • Discover the splits from the table and partition the splits by the {@code filePerSplit} limit + *
  • Discover the splits from the table and partition the splits by the {@code filePerSplit} + * limit *
  • Delete the Hadoop table * - * Since the table and data files are deleted before this method return, - * caller shouldn't attempt to read the data files. + *

    Since the table and data files are deleted before this method return, caller shouldn't + * attempt to read the data files. */ public static List createSplitsFromTransientHadoopTable( TemporaryFolder temporaryFolder, int fileCount, int filesPerSplit) throws Exception { @@ -65,24 +64,28 @@ public static List createSplitsFromTransientHadoopTable( final HadoopCatalog catalog = new HadoopCatalog(hadoopConf, warehouse); try { final Table table = catalog.createTable(TestFixtures.TABLE_IDENTIFIER, TestFixtures.SCHEMA); - final GenericAppenderHelper dataAppender = new GenericAppenderHelper( - table, FileFormat.PARQUET, temporaryFolder); + final GenericAppenderHelper dataAppender = + new GenericAppenderHelper(table, FileFormat.PARQUET, temporaryFolder); for (int i = 0; i < fileCount; ++i) { List records = RandomGenericData.generate(TestFixtures.SCHEMA, 2, i); dataAppender.appendToTable(records); } final ScanContext scanContext = ScanContext.builder().build(); - final List splits = FlinkSplitPlanner.planIcebergSourceSplits( - table, scanContext, ThreadPools.getWorkerPool()); + final List splits = + FlinkSplitPlanner.planIcebergSourceSplits( + table, scanContext, ThreadPools.getWorkerPool()); return splits.stream() - .flatMap(split -> { - List> filesList = Lists.partition( - Lists.newArrayList(split.task().files()), filesPerSplit); - return filesList.stream() - .map(files -> new BaseCombinedScanTask(files)) - .map(combinedScanTask -> IcebergSourceSplit.fromCombinedScanTask(combinedScanTask)); - }) + .flatMap( + split -> { + List> filesList = + Lists.partition(Lists.newArrayList(split.task().files()), filesPerSplit); + return filesList.stream() + .map(files -> new BaseCombinedScanTask(files)) + .map( + combinedScanTask -> + IcebergSourceSplit.fromCombinedScanTask(combinedScanTask)); + }) .collect(Collectors.toList()); } finally { catalog.dropTable(TestFixtures.TABLE_IDENTIFIER); diff --git a/flink/v1.14/flink/src/test/java/org/apache/iceberg/flink/source/TestBoundedTableFactory.java b/flink/v1.14/flink/src/test/java/org/apache/iceberg/flink/source/TestBoundedTableFactory.java index d163b84c09c6..7b5f9328694c 100644 --- a/flink/v1.14/flink/src/test/java/org/apache/iceberg/flink/source/TestBoundedTableFactory.java +++ b/flink/v1.14/flink/src/test/java/org/apache/iceberg/flink/source/TestBoundedTableFactory.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.flink.source; import java.util.List; @@ -36,46 +35,53 @@ public void testEmptyDataSet() { List> emptyDataSet = ImmutableList.of(); String dataId = BoundedTableFactory.registerDataSet(emptyDataSet); - sql("CREATE TABLE %s(id INT, data STRING) WITH ('connector'='BoundedSource', 'data-id'='%s')", table, dataId); + sql( + "CREATE TABLE %s(id INT, data STRING) WITH ('connector'='BoundedSource', 'data-id'='%s')", + table, dataId); - Assert.assertEquals("Should have caught empty change log set.", ImmutableList.of(), + Assert.assertEquals( + "Should have caught empty change log set.", + ImmutableList.of(), sql("SELECT * FROM %s", table)); } @Test public void testBoundedTableFactory() { String table = name.getMethodName(); - List> dataSet = ImmutableList.of( - ImmutableList.of( - insertRow(1, "aaa"), - deleteRow(1, "aaa"), - insertRow(1, "bbb"), - insertRow(2, "aaa"), - deleteRow(2, "aaa"), - insertRow(2, "bbb") - ), - ImmutableList.of( - updateBeforeRow(2, "bbb"), - updateAfterRow(2, "ccc"), - deleteRow(2, "ccc"), - insertRow(2, "ddd") - ), + List> dataSet = ImmutableList.of( - deleteRow(1, "bbb"), - insertRow(1, "ccc"), - deleteRow(1, "ccc"), - insertRow(1, "ddd") - ) - ); + ImmutableList.of( + insertRow(1, "aaa"), + deleteRow(1, "aaa"), + insertRow(1, "bbb"), + insertRow(2, "aaa"), + deleteRow(2, "aaa"), + insertRow(2, "bbb")), + ImmutableList.of( + updateBeforeRow(2, "bbb"), + updateAfterRow(2, "ccc"), + deleteRow(2, "ccc"), + insertRow(2, "ddd")), + ImmutableList.of( + deleteRow(1, "bbb"), + insertRow(1, "ccc"), + deleteRow(1, "ccc"), + insertRow(1, "ddd"))); String dataId = BoundedTableFactory.registerDataSet(dataSet); - sql("CREATE TABLE %s(id INT, data STRING) WITH ('connector'='BoundedSource', 'data-id'='%s')", table, dataId); + sql( + "CREATE TABLE %s(id INT, data STRING) WITH ('connector'='BoundedSource', 'data-id'='%s')", + table, dataId); List rowSet = dataSet.stream().flatMap(Streams::stream).collect(Collectors.toList()); - Assert.assertEquals("Should have the expected change log events.", rowSet, sql("SELECT * FROM %s", table)); + Assert.assertEquals( + "Should have the expected change log events.", rowSet, sql("SELECT * FROM %s", table)); - Assert.assertEquals("Should have the expected change log events", - rowSet.stream().filter(r -> Objects.equals(r.getField(1), "aaa")).collect(Collectors.toList()), + Assert.assertEquals( + "Should have the expected change log events", + rowSet.stream() + .filter(r -> Objects.equals(r.getField(1), "aaa")) + .collect(Collectors.toList()), sql("SELECT * FROM %s WHERE data='aaa'", table)); } } diff --git a/flink/v1.14/flink/src/test/java/org/apache/iceberg/flink/source/TestFlinkInputFormat.java b/flink/v1.14/flink/src/test/java/org/apache/iceberg/flink/source/TestFlinkInputFormat.java index eae3233a6546..69b8ac269267 100644 --- a/flink/v1.14/flink/src/test/java/org/apache/iceberg/flink/source/TestFlinkInputFormat.java +++ b/flink/v1.14/flink/src/test/java/org/apache/iceberg/flink/source/TestFlinkInputFormat.java @@ -16,9 +16,10 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.flink.source; +import static org.apache.iceberg.types.Types.NestedField.required; + import java.io.IOException; import java.util.List; import java.util.Map; @@ -38,11 +39,7 @@ import org.apache.iceberg.types.Types; import org.junit.Test; -import static org.apache.iceberg.types.Types.NestedField.required; - -/** - * Test {@link FlinkInputFormat}. 
- */ +/** Test {@link FlinkInputFormat}. */ public class TestFlinkInputFormat extends TestFlinkSource { public TestFlinkInputFormat(String fileFormat) { @@ -56,20 +53,27 @@ public void before() throws IOException { @Override protected List run( - FlinkSource.Builder formatBuilder, Map sqlOptions, String sqlFilter, String... sqlSelectedFields) + FlinkSource.Builder formatBuilder, + Map sqlOptions, + String sqlFilter, + String... sqlSelectedFields) throws Exception { return runFormat(formatBuilder.tableLoader(tableLoader()).buildFormat()); } @Test public void testNestedProjection() throws Exception { - Schema schema = new Schema( - required(1, "data", Types.StringType.get()), - required(2, "nested", Types.StructType.of( - Types.NestedField.required(3, "f1", Types.StringType.get()), - Types.NestedField.required(4, "f2", Types.StringType.get()), - Types.NestedField.required(5, "f3", Types.LongType.get()))), - required(6, "id", Types.LongType.get())); + Schema schema = + new Schema( + required(1, "data", Types.StringType.get()), + required( + 2, + "nested", + Types.StructType.of( + Types.NestedField.required(3, "f1", Types.StringType.get()), + Types.NestedField.required(4, "f2", Types.StringType.get()), + Types.NestedField.required(5, "f3", Types.LongType.get()))), + required(6, "id", Types.LongType.get())); Table table = catalog.createTable(TableIdentifier.of("default", "t"), schema); @@ -81,13 +85,17 @@ public void testNestedProjection() throws Exception { // The Flink SQL output: [f2, data] // The FlinkInputFormat output: [nested[f2], data] - TableSchema projectedSchema = TableSchema.builder() - .field("nested", DataTypes.ROW(DataTypes.FIELD("f2", DataTypes.STRING()))) - .field("data", DataTypes.STRING()).build(); - List result = runFormat(FlinkSource.forRowData() - .tableLoader(tableLoader()) - .project(projectedSchema) - .buildFormat()); + TableSchema projectedSchema = + TableSchema.builder() + .field("nested", DataTypes.ROW(DataTypes.FIELD("f2", DataTypes.STRING()))) + .field("data", DataTypes.STRING()) + .build(); + List result = + runFormat( + FlinkSource.forRowData() + .tableLoader(tableLoader()) + .project(projectedSchema) + .buildFormat()); List expected = Lists.newArrayList(); for (Record record : writeRecords) { @@ -100,23 +108,28 @@ public void testNestedProjection() throws Exception { @Test public void testBasicProjection() throws IOException { - Schema writeSchema = new Schema( - Types.NestedField.required(0, "id", Types.LongType.get()), - Types.NestedField.optional(1, "data", Types.StringType.get()), - Types.NestedField.optional(2, "time", Types.TimestampType.withZone()) - ); + Schema writeSchema = + new Schema( + Types.NestedField.required(0, "id", Types.LongType.get()), + Types.NestedField.optional(1, "data", Types.StringType.get()), + Types.NestedField.optional(2, "time", Types.TimestampType.withZone())); Table table = catalog.createTable(TableIdentifier.of("default", "t"), writeSchema); List writeRecords = RandomGenericData.generate(writeSchema, 2, 0L); new GenericAppenderHelper(table, fileFormat, TEMPORARY_FOLDER).appendToTable(writeRecords); - TableSchema projectedSchema = TableSchema.builder() - .field("id", DataTypes.BIGINT()) - .field("data", DataTypes.STRING()) - .build(); - List result = runFormat(FlinkSource.forRowData() - .tableLoader(tableLoader()).project(projectedSchema).buildFormat()); + TableSchema projectedSchema = + TableSchema.builder() + .field("id", DataTypes.BIGINT()) + .field("data", DataTypes.STRING()) + .build(); + List result = + runFormat( + 
FlinkSource.forRowData() + .tableLoader(tableLoader()) + .project(projectedSchema) + .buildFormat()); List expected = Lists.newArrayList(); for (Record record : writeRecords) { diff --git a/flink/v1.14/flink/src/test/java/org/apache/iceberg/flink/source/TestFlinkInputFormatReaderDeletes.java b/flink/v1.14/flink/src/test/java/org/apache/iceberg/flink/source/TestFlinkInputFormatReaderDeletes.java index 2a593c4702b4..b2f914e51299 100644 --- a/flink/v1.14/flink/src/test/java/org/apache/iceberg/flink/source/TestFlinkInputFormatReaderDeletes.java +++ b/flink/v1.14/flink/src/test/java/org/apache/iceberg/flink/source/TestFlinkInputFormatReaderDeletes.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.flink.source; import java.io.IOException; @@ -43,26 +42,35 @@ public TestFlinkInputFormatReaderDeletes(FileFormat inputFormat) { } @Override - protected StructLikeSet rowSet(String tableName, Table testTable, String... columns) throws IOException { + protected StructLikeSet rowSet(String tableName, Table testTable, String... columns) + throws IOException { Schema projected = testTable.schema().select(columns); RowType rowType = FlinkSchemaUtil.convert(projected); Map properties = Maps.newHashMap(); - properties.put(CatalogProperties.WAREHOUSE_LOCATION, hiveConf.get(HiveConf.ConfVars.METASTOREWAREHOUSE.varname)); + properties.put( + CatalogProperties.WAREHOUSE_LOCATION, + hiveConf.get(HiveConf.ConfVars.METASTOREWAREHOUSE.varname)); properties.put(CatalogProperties.URI, hiveConf.get(HiveConf.ConfVars.METASTOREURIS.varname)); - properties.put(CatalogProperties.CLIENT_POOL_SIZE, + properties.put( + CatalogProperties.CLIENT_POOL_SIZE, Integer.toString(hiveConf.getInt("iceberg.hive.client-pool-size", 5))); CatalogLoader hiveCatalogLoader = CatalogLoader.hive(catalog.name(), hiveConf, properties); - FlinkInputFormat inputFormat = FlinkSource.forRowData() - .tableLoader(TableLoader.fromCatalog(hiveCatalogLoader, TableIdentifier.of("default", tableName))) - .project(FlinkSchemaUtil.toSchema(rowType)).buildFormat(); + FlinkInputFormat inputFormat = + FlinkSource.forRowData() + .tableLoader( + TableLoader.fromCatalog( + hiveCatalogLoader, TableIdentifier.of("default", tableName))) + .project(FlinkSchemaUtil.toSchema(rowType)) + .buildFormat(); StructLikeSet set = StructLikeSet.create(projected.asStruct()); - TestHelpers.readRowData(inputFormat, rowType).forEach(rowData -> { - RowDataWrapper wrapper = new RowDataWrapper(rowType, projected.asStruct()); - set.add(wrapper.wrap(rowData)); - }); + TestHelpers.readRowData(inputFormat, rowType) + .forEach( + rowData -> { + RowDataWrapper wrapper = new RowDataWrapper(rowType, projected.asStruct()); + set.add(wrapper.wrap(rowData)); + }); return set; } - } diff --git a/flink/v1.14/flink/src/test/java/org/apache/iceberg/flink/source/TestFlinkMergingMetrics.java b/flink/v1.14/flink/src/test/java/org/apache/iceberg/flink/source/TestFlinkMergingMetrics.java index 1670ed733421..3a7ec96cb1d6 100644 --- a/flink/v1.14/flink/src/test/java/org/apache/iceberg/flink/source/TestFlinkMergingMetrics.java +++ b/flink/v1.14/flink/src/test/java/org/apache/iceberg/flink/source/TestFlinkMergingMetrics.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.flink.source; import java.io.IOException; @@ -44,7 +43,8 @@ protected FileAppender writeAndGetAppender(List records) throws RowType flinkSchema = FlinkSchemaUtil.convert(SCHEMA); FileAppender appender = - new FlinkAppenderFactory(SCHEMA, flinkSchema, ImmutableMap.of(), PartitionSpec.unpartitioned()) + new FlinkAppenderFactory( + SCHEMA, flinkSchema, ImmutableMap.of(), PartitionSpec.unpartitioned()) .newAppender(org.apache.iceberg.Files.localOutput(temp.newFile()), fileFormat); try (FileAppender fileAppender = appender) { records.stream().map(r -> RowDataConverter.convert(SCHEMA, r)).forEach(fileAppender::add); diff --git a/flink/v1.14/flink/src/test/java/org/apache/iceberg/flink/source/TestFlinkReaderDeletesBase.java b/flink/v1.14/flink/src/test/java/org/apache/iceberg/flink/source/TestFlinkReaderDeletesBase.java index cc3c71716ef7..987d79fed3c3 100644 --- a/flink/v1.14/flink/src/test/java/org/apache/iceberg/flink/source/TestFlinkReaderDeletesBase.java +++ b/flink/v1.14/flink/src/test/java/org/apache/iceberg/flink/source/TestFlinkReaderDeletesBase.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.flink.source; import java.util.Map; @@ -46,8 +45,7 @@ @RunWith(Parameterized.class) public abstract class TestFlinkReaderDeletesBase extends DeleteReadTests { - @ClassRule - public static final TemporaryFolder TEMP_FOLDER = new TemporaryFolder(); + @ClassRule public static final TemporaryFolder TEMP_FOLDER = new TemporaryFolder(); protected static String databaseName = "default"; @@ -60,9 +58,9 @@ public abstract class TestFlinkReaderDeletesBase extends DeleteReadTests { @Parameterized.Parameters(name = "fileFormat={0}") public static Object[][] parameters() { return new Object[][] { - new Object[] { FileFormat.PARQUET }, - new Object[] { FileFormat.AVRO }, - new Object[] { FileFormat.ORC } + new Object[] {FileFormat.PARQUET}, + new Object[] {FileFormat.AVRO}, + new Object[] {FileFormat.ORC} }; } @@ -75,8 +73,10 @@ public static void startMetastore() { metastore = new TestHiveMetastore(); metastore.start(); hiveConf = metastore.hiveConf(); - catalog = (HiveCatalog) - CatalogUtil.loadCatalog(HiveCatalog.class.getName(), "hive", ImmutableMap.of(), hiveConf); + catalog = + (HiveCatalog) + CatalogUtil.loadCatalog( + HiveCatalog.class.getName(), "hive", ImmutableMap.of(), hiveConf); } @AfterClass diff --git a/flink/v1.14/flink/src/test/java/org/apache/iceberg/flink/source/TestFlinkScan.java b/flink/v1.14/flink/src/test/java/org/apache/iceberg/flink/source/TestFlinkScan.java index 9284b8fa9ef1..92363dd5a010 100644 --- a/flink/v1.14/flink/src/test/java/org/apache/iceberg/flink/source/TestFlinkScan.java +++ b/flink/v1.14/flink/src/test/java/org/apache/iceberg/flink/source/TestFlinkScan.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.flink.source; import java.io.File; @@ -69,8 +68,7 @@ public abstract class TestFlinkScan { public static final MiniClusterWithClientResource MINI_CLUSTER_RESOURCE = MiniClusterResource.createWithClassloaderCheckDisabled(); - @ClassRule - public static final TemporaryFolder TEMPORARY_FOLDER = new TemporaryFolder(); + @ClassRule public static final TemporaryFolder TEMPORARY_FOLDER = new TemporaryFolder(); protected HadoopCatalog catalog; protected String warehouse; @@ -100,16 +98,18 @@ public void before() throws IOException { } @After - public void after() throws IOException { - } + public void after() throws IOException {} protected TableLoader tableLoader() { return TableLoader.fromHadoopTable(location); } protected abstract List runWithProjection(String... projected) throws Exception; + protected abstract List runWithFilter(Expression filter, String sqlFilter) throws Exception; + protected abstract List runWithOptions(Map options) throws Exception; + protected abstract List run() throws Exception; @Test @@ -122,31 +122,33 @@ public void testUnpartitionedTable() throws Exception { @Test public void testPartitionedTable() throws Exception { - Table table = catalog.createTable(TestFixtures.TABLE_IDENTIFIER, TestFixtures.SCHEMA, TestFixtures.SPEC); + Table table = + catalog.createTable(TestFixtures.TABLE_IDENTIFIER, TestFixtures.SCHEMA, TestFixtures.SPEC); List expectedRecords = RandomGenericData.generate(TestFixtures.SCHEMA, 1, 0L); expectedRecords.get(0).set(2, "2020-03-20"); - new GenericAppenderHelper(table, fileFormat, TEMPORARY_FOLDER).appendToTable( - org.apache.iceberg.TestHelpers.Row.of("2020-03-20", 0), expectedRecords); + new GenericAppenderHelper(table, fileFormat, TEMPORARY_FOLDER) + .appendToTable(org.apache.iceberg.TestHelpers.Row.of("2020-03-20", 0), expectedRecords); TestHelpers.assertRecords(run(), expectedRecords, TestFixtures.SCHEMA); } @Test public void testProjection() throws Exception { - Table table = catalog.createTable(TestFixtures.TABLE_IDENTIFIER, TestFixtures.SCHEMA, TestFixtures.SPEC); + Table table = + catalog.createTable(TestFixtures.TABLE_IDENTIFIER, TestFixtures.SCHEMA, TestFixtures.SPEC); List inputRecords = RandomGenericData.generate(TestFixtures.SCHEMA, 1, 0L); - new GenericAppenderHelper(table, fileFormat, TEMPORARY_FOLDER).appendToTable( - org.apache.iceberg.TestHelpers.Row.of("2020-03-20", 0), inputRecords); + new GenericAppenderHelper(table, fileFormat, TEMPORARY_FOLDER) + .appendToTable(org.apache.iceberg.TestHelpers.Row.of("2020-03-20", 0), inputRecords); assertRows(runWithProjection("data"), Row.of(inputRecords.get(0).get(0))); } @Test public void testIdentityPartitionProjections() throws Exception { - Schema logSchema = new Schema( - Types.NestedField.optional(1, "id", Types.IntegerType.get()), - Types.NestedField.optional(2, "dt", Types.StringType.get()), - Types.NestedField.optional(3, "level", Types.StringType.get()), - Types.NestedField.optional(4, "message", Types.StringType.get()) - ); + Schema logSchema = + new Schema( + Types.NestedField.optional(1, "id", Types.IntegerType.get()), + Types.NestedField.optional(2, "dt", Types.StringType.get()), + Types.NestedField.optional(3, "level", Types.StringType.get()), + Types.NestedField.optional(4, "message", Types.StringType.get())); PartitionSpec spec = PartitionSpec.builderFor(logSchema).identity("dt").identity("level").build(); @@ -158,8 +160,11 @@ public void testIdentityPartitionProjections() throws Exception { for (Record record : inputRecords) { record.set(1, 
"2020-03-2" + idx); record.set(2, Integer.toString(idx)); - append.appendFile(new GenericAppenderHelper(table, fileFormat, TEMPORARY_FOLDER).writeFile( - org.apache.iceberg.TestHelpers.Row.of("2020-03-2" + idx, Integer.toString(idx)), ImmutableList.of(record))); + append.appendFile( + new GenericAppenderHelper(table, fileFormat, TEMPORARY_FOLDER) + .writeFile( + org.apache.iceberg.TestHelpers.Row.of("2020-03-2" + idx, Integer.toString(idx)), + ImmutableList.of(record))); idx += 1; } append.commit(); @@ -178,12 +183,18 @@ public void testIdentityPartitionProjections() throws Exception { validateIdentityPartitionProjections(table, Arrays.asList("message", "level"), inputRecords); validateIdentityPartitionProjections(table, Arrays.asList("level", "dt"), inputRecords); // out-of-order triplets - validateIdentityPartitionProjections(table, Arrays.asList("dt", "level", "message"), inputRecords); - validateIdentityPartitionProjections(table, Arrays.asList("level", "dt", "message"), inputRecords); - validateIdentityPartitionProjections(table, Arrays.asList("dt", "message", "level"), inputRecords); - validateIdentityPartitionProjections(table, Arrays.asList("level", "message", "dt"), inputRecords); - validateIdentityPartitionProjections(table, Arrays.asList("message", "dt", "level"), inputRecords); - validateIdentityPartitionProjections(table, Arrays.asList("message", "level", "dt"), inputRecords); + validateIdentityPartitionProjections( + table, Arrays.asList("dt", "level", "message"), inputRecords); + validateIdentityPartitionProjections( + table, Arrays.asList("level", "dt", "message"), inputRecords); + validateIdentityPartitionProjections( + table, Arrays.asList("dt", "message", "level"), inputRecords); + validateIdentityPartitionProjections( + table, Arrays.asList("level", "message", "dt"), inputRecords); + validateIdentityPartitionProjections( + table, Arrays.asList("message", "dt", "level"), inputRecords); + validateIdentityPartitionProjections( + table, Arrays.asList("message", "level", "dt"), inputRecords); } private void validateIdentityPartitionProjections( @@ -197,7 +208,9 @@ private void validateIdentityPartitionProjections( for (int i = 0; i < projectedFields.size(); i++) { String name = projectedFields.get(i); Assert.assertEquals( - "Projected field " + name + " should match", inputRecord.getField(name), actualRecord.getField(i)); + "Projected field " + name + " should match", + inputRecord.getField(name), + actualRecord.getField(i)); } } } @@ -220,10 +233,12 @@ public void testSnapshotReads() throws Exception { TestHelpers.assertRecords( runWithOptions(ImmutableMap.of("snapshot-id", Long.toString(snapshotId))), - expectedRecords, TestFixtures.SCHEMA); + expectedRecords, + TestFixtures.SCHEMA); TestHelpers.assertRecords( runWithOptions(ImmutableMap.of("as-of-timestamp", Long.toString(timestampMillis))), - expectedRecords, TestFixtures.SCHEMA); + expectedRecords, + TestFixtures.SCHEMA); } @Test @@ -250,57 +265,74 @@ public void testIncrementalRead() throws Exception { List expected2 = Lists.newArrayList(); expected2.addAll(records2); expected2.addAll(records3); - TestHelpers.assertRecords(runWithOptions( - ImmutableMap.builder() - .put("start-snapshot-id", Long.toString(snapshotId1)) - .put("end-snapshot-id", Long.toString(snapshotId3)).build()), - expected2, TestFixtures.SCHEMA); + TestHelpers.assertRecords( + runWithOptions( + ImmutableMap.builder() + .put("start-snapshot-id", Long.toString(snapshotId1)) + .put("end-snapshot-id", Long.toString(snapshotId3)) + .build()), + expected2, 
+ TestFixtures.SCHEMA); } @Test public void testFilterExp() throws Exception { - Table table = catalog.createTable(TestFixtures.TABLE_IDENTIFIER, TestFixtures.SCHEMA, TestFixtures.SPEC); + Table table = + catalog.createTable(TestFixtures.TABLE_IDENTIFIER, TestFixtures.SCHEMA, TestFixtures.SPEC); List expectedRecords = RandomGenericData.generate(TestFixtures.SCHEMA, 2, 0L); expectedRecords.get(0).set(2, "2020-03-20"); expectedRecords.get(1).set(2, "2020-03-20"); GenericAppenderHelper helper = new GenericAppenderHelper(table, fileFormat, TEMPORARY_FOLDER); - DataFile dataFile1 = helper.writeFile(org.apache.iceberg.TestHelpers.Row.of("2020-03-20", 0), expectedRecords); - DataFile dataFile2 = helper.writeFile(org.apache.iceberg.TestHelpers.Row.of("2020-03-21", 0), - RandomGenericData.generate(TestFixtures.SCHEMA, 2, 0L)); + DataFile dataFile1 = + helper.writeFile(org.apache.iceberg.TestHelpers.Row.of("2020-03-20", 0), expectedRecords); + DataFile dataFile2 = + helper.writeFile( + org.apache.iceberg.TestHelpers.Row.of("2020-03-21", 0), + RandomGenericData.generate(TestFixtures.SCHEMA, 2, 0L)); helper.appendToTable(dataFile1, dataFile2); - TestHelpers.assertRecords(runWithFilter( - Expressions.equal("dt", "2020-03-20"), "where dt='2020-03-20'"), + TestHelpers.assertRecords( + runWithFilter(Expressions.equal("dt", "2020-03-20"), "where dt='2020-03-20'"), expectedRecords, TestFixtures.SCHEMA); } @Test public void testPartitionTypes() throws Exception { - Schema typesSchema = new Schema( - Types.NestedField.optional(1, "id", Types.IntegerType.get()), - Types.NestedField.optional(2, "decimal", Types.DecimalType.of(38, 18)), - Types.NestedField.optional(3, "str", Types.StringType.get()), - Types.NestedField.optional(4, "binary", Types.BinaryType.get()), - Types.NestedField.optional(5, "date", Types.DateType.get()), - Types.NestedField.optional(6, "time", Types.TimeType.get()), - Types.NestedField.optional(7, "timestamp", Types.TimestampType.withoutZone()) - ); - PartitionSpec spec = PartitionSpec.builderFor(typesSchema).identity("decimal").identity("str").identity("binary") - .identity("date").identity("time").identity("timestamp").build(); + Schema typesSchema = + new Schema( + Types.NestedField.optional(1, "id", Types.IntegerType.get()), + Types.NestedField.optional(2, "decimal", Types.DecimalType.of(38, 18)), + Types.NestedField.optional(3, "str", Types.StringType.get()), + Types.NestedField.optional(4, "binary", Types.BinaryType.get()), + Types.NestedField.optional(5, "date", Types.DateType.get()), + Types.NestedField.optional(6, "time", Types.TimeType.get()), + Types.NestedField.optional(7, "timestamp", Types.TimestampType.withoutZone())); + PartitionSpec spec = + PartitionSpec.builderFor(typesSchema) + .identity("decimal") + .identity("str") + .identity("binary") + .identity("date") + .identity("time") + .identity("timestamp") + .build(); Table table = catalog.createTable(TestFixtures.TABLE_IDENTIFIER, typesSchema, spec); List records = RandomGenericData.generate(typesSchema, 10, 0L); GenericAppenderHelper appender = new GenericAppenderHelper(table, fileFormat, TEMPORARY_FOLDER); for (Record record : records) { - org.apache.iceberg.TestHelpers.Row partition = org.apache.iceberg.TestHelpers.Row.of( - record.get(1), - record.get(2), - record.get(3), - record.get(4) == null ? null : DateTimeUtil.daysFromDate((LocalDate) record.get(4)), - record.get(5) == null ? null : DateTimeUtil.microsFromTime((LocalTime) record.get(5)), - record.get(6) == null ? 
null : DateTimeUtil.microsFromTimestamp((LocalDateTime) record.get(6))); + org.apache.iceberg.TestHelpers.Row partition = + org.apache.iceberg.TestHelpers.Row.of( + record.get(1), + record.get(2), + record.get(3), + record.get(4) == null ? null : DateTimeUtil.daysFromDate((LocalDate) record.get(4)), + record.get(5) == null ? null : DateTimeUtil.microsFromTime((LocalTime) record.get(5)), + record.get(6) == null + ? null + : DateTimeUtil.microsFromTimestamp((LocalDateTime) record.get(6))); appender.appendToTable(partition, Collections.singletonList(record)); } @@ -309,10 +341,14 @@ public void testPartitionTypes() throws Exception { @Test public void testCustomizedFlinkDataTypes() throws Exception { - Schema schema = new Schema( - Types.NestedField.required( - 1, "map", Types.MapType.ofRequired(2, 3, Types.StringType.get(), Types.StringType.get())), - Types.NestedField.required(4, "arr", Types.ListType.ofRequired(5, Types.StringType.get()))); + Schema schema = + new Schema( + Types.NestedField.required( + 1, + "map", + Types.MapType.ofRequired(2, 3, Types.StringType.get(), Types.StringType.get())), + Types.NestedField.required( + 4, "arr", Types.ListType.ofRequired(5, Types.StringType.get()))); Table table = catalog.createTable(TestFixtures.TABLE_IDENTIFIER, schema); List records = RandomGenericData.generate(schema, 10, 0L); GenericAppenderHelper helper = new GenericAppenderHelper(table, fileFormat, TEMPORARY_FOLDER); diff --git a/flink/v1.14/flink/src/test/java/org/apache/iceberg/flink/source/TestFlinkScanSql.java b/flink/v1.14/flink/src/test/java/org/apache/iceberg/flink/source/TestFlinkScanSql.java index fd570ff12445..5d90f5996d1c 100644 --- a/flink/v1.14/flink/src/test/java/org/apache/iceberg/flink/source/TestFlinkScanSql.java +++ b/flink/v1.14/flink/src/test/java/org/apache/iceberg/flink/source/TestFlinkScanSql.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.flink.source; import java.io.IOException; @@ -48,9 +47,7 @@ import org.junit.Assert; import org.junit.Test; -/** - * Test Flink SELECT SQLs. - */ +/** Test Flink SELECT SQLs. */ public class TestFlinkScanSql extends TestFlinkSource { private volatile TableEnvironment tEnv; @@ -62,18 +59,22 @@ public TestFlinkScanSql(String fileFormat) { @Override public void before() throws IOException { super.before(); - sql("create catalog iceberg_catalog with ('type'='iceberg', 'catalog-type'='hadoop', 'warehouse'='%s')", warehouse); + sql( + "create catalog iceberg_catalog with ('type'='iceberg', 'catalog-type'='hadoop', 'warehouse'='%s')", + warehouse); sql("use catalog iceberg_catalog"); - getTableEnv().getConfig().getConfiguration().set(TableConfigOptions.TABLE_DYNAMIC_TABLE_OPTIONS_ENABLED, true); + getTableEnv() + .getConfig() + .getConfiguration() + .set(TableConfigOptions.TABLE_DYNAMIC_TABLE_OPTIONS_ENABLED, true); } private TableEnvironment getTableEnv() { if (tEnv == null) { synchronized (this) { if (tEnv == null) { - this.tEnv = TableEnvironment.create(EnvironmentSettings - .newInstance() - .inBatchMode().build()); + this.tEnv = + TableEnvironment.create(EnvironmentSettings.newInstance().inBatchMode().build()); } } } @@ -81,8 +82,11 @@ private TableEnvironment getTableEnv() { } @Override - protected List run(FlinkSource.Builder formatBuilder, Map sqlOptions, String sqlFilter, - String... sqlSelectedFields) { + protected List run( + FlinkSource.Builder formatBuilder, + Map sqlOptions, + String sqlFilter, + String... 
sqlSelectedFields) { String select = String.join(",", sqlSelectedFields); StringBuilder builder = new StringBuilder(); @@ -103,7 +107,9 @@ protected List run(FlinkSource.Builder formatBuilder, Map s @Test public void testResiduals() throws Exception { - Table table = catalog.createTable(TableIdentifier.of("default", "t"), TestFixtures.SCHEMA, TestFixtures.SPEC); + Table table = + catalog.createTable( + TableIdentifier.of("default", "t"), TestFixtures.SCHEMA, TestFixtures.SPEC); List writeRecords = RandomGenericData.generate(TestFixtures.SCHEMA, 2, 0L); writeRecords.get(0).set(1, 123L); @@ -117,21 +123,29 @@ public void testResiduals() throws Exception { expectedRecords.add(writeRecords.get(0)); DataFile dataFile1 = helper.writeFile(TestHelpers.Row.of("2020-03-20", 0), writeRecords); - DataFile dataFile2 = helper.writeFile(TestHelpers.Row.of("2020-03-21", 0), - RandomGenericData.generate(TestFixtures.SCHEMA, 2, 0L)); + DataFile dataFile2 = + helper.writeFile( + TestHelpers.Row.of("2020-03-21", 0), + RandomGenericData.generate(TestFixtures.SCHEMA, 2, 0L)); helper.appendToTable(dataFile1, dataFile2); - Expression filter = Expressions.and(Expressions.equal("dt", "2020-03-20"), Expressions.equal("id", 123)); - org.apache.iceberg.flink.TestHelpers.assertRecords(runWithFilter( - filter, "where dt='2020-03-20' and id=123"), expectedRecords, TestFixtures.SCHEMA); + Expression filter = + Expressions.and(Expressions.equal("dt", "2020-03-20"), Expressions.equal("id", 123)); + org.apache.iceberg.flink.TestHelpers.assertRecords( + runWithFilter(filter, "where dt='2020-03-20' and id=123"), + expectedRecords, + TestFixtures.SCHEMA); } @Test public void testInferedParallelism() throws IOException { - Table table = catalog.createTable(TableIdentifier.of("default", "t"), TestFixtures.SCHEMA, TestFixtures.SPEC); + Table table = + catalog.createTable( + TableIdentifier.of("default", "t"), TestFixtures.SCHEMA, TestFixtures.SPEC); TableLoader tableLoader = TableLoader.fromHadoopTable(table.location()); - FlinkInputFormat flinkInputFormat = FlinkSource.forRowData().tableLoader(tableLoader).table(table).buildFormat(); + FlinkInputFormat flinkInputFormat = + FlinkSource.forRowData().tableLoader(tableLoader).table(table).buildFormat(); ScanContext scanContext = ScanContext.builder().build(); // Empty table, infer parallelism should be at least 1 @@ -139,44 +153,57 @@ public void testInferedParallelism() throws IOException { Assert.assertEquals("Should produce the expected parallelism.", 1, parallelism); GenericAppenderHelper helper = new GenericAppenderHelper(table, fileFormat, TEMPORARY_FOLDER); - DataFile dataFile1 = helper.writeFile(TestHelpers.Row.of("2020-03-20", 0), - RandomGenericData.generate(TestFixtures.SCHEMA, 2, 0L)); - DataFile dataFile2 = helper.writeFile(TestHelpers.Row.of("2020-03-21", 0), - RandomGenericData.generate(TestFixtures.SCHEMA, 2, 0L)); + DataFile dataFile1 = + helper.writeFile( + TestHelpers.Row.of("2020-03-20", 0), + RandomGenericData.generate(TestFixtures.SCHEMA, 2, 0L)); + DataFile dataFile2 = + helper.writeFile( + TestHelpers.Row.of("2020-03-21", 0), + RandomGenericData.generate(TestFixtures.SCHEMA, 2, 0L)); helper.appendToTable(dataFile1, dataFile2); // Make sure to generate 2 CombinedScanTasks long maxFileLen = Math.max(dataFile1.fileSizeInBytes(), dataFile2.fileSizeInBytes()); - sql("ALTER TABLE t SET ('read.split.open-file-cost'='1', 'read.split.target-size'='%s')", maxFileLen); + sql( + "ALTER TABLE t SET ('read.split.open-file-cost'='1', 'read.split.target-size'='%s')", + 
maxFileLen); - // 2 splits (max infer is the default value 100 , max > splits num), the parallelism is splits num : 2 + // 2 splits (max infer is the default value 100 , max > splits num), the parallelism is splits + // num : 2 parallelism = FlinkSource.forRowData().inferParallelism(flinkInputFormat, scanContext); Assert.assertEquals("Should produce the expected parallelism.", 2, parallelism); // 2 splits and limit is 1 , max infer parallelism is default 100, // which is greater than splits num and limit, the parallelism is the limit value : 1 - parallelism = FlinkSource.forRowData().inferParallelism(flinkInputFormat, ScanContext.builder().limit(1).build()); + parallelism = + FlinkSource.forRowData() + .inferParallelism(flinkInputFormat, ScanContext.builder().limit(1).build()); Assert.assertEquals("Should produce the expected parallelism.", 1, parallelism); // 2 splits and max infer parallelism is 1 (max < splits num), the parallelism is 1 Configuration configuration = new Configuration(); configuration.setInteger(FlinkConfigOptions.TABLE_EXEC_ICEBERG_INFER_SOURCE_PARALLELISM_MAX, 1); - parallelism = FlinkSource.forRowData() - .flinkConf(configuration) - .inferParallelism(flinkInputFormat, ScanContext.builder().build()); + parallelism = + FlinkSource.forRowData() + .flinkConf(configuration) + .inferParallelism(flinkInputFormat, ScanContext.builder().build()); Assert.assertEquals("Should produce the expected parallelism.", 1, parallelism); - // 2 splits, max infer parallelism is 1, limit is 3, the parallelism is max infer parallelism : 1 - parallelism = FlinkSource.forRowData() - .flinkConf(configuration) - .inferParallelism(flinkInputFormat, ScanContext.builder().limit(3).build()); + // 2 splits, max infer parallelism is 1, limit is 3, the parallelism is max infer parallelism : + // 1 + parallelism = + FlinkSource.forRowData() + .flinkConf(configuration) + .inferParallelism(flinkInputFormat, ScanContext.builder().limit(3).build()); Assert.assertEquals("Should produce the expected parallelism.", 1, parallelism); // 2 splits, infer parallelism is disabled, the parallelism is flink default parallelism 1 configuration.setBoolean(FlinkConfigOptions.TABLE_EXEC_ICEBERG_INFER_SOURCE_PARALLELISM, false); - parallelism = FlinkSource.forRowData() - .flinkConf(configuration) - .inferParallelism(flinkInputFormat, ScanContext.builder().limit(3).build()); + parallelism = + FlinkSource.forRowData() + .flinkConf(configuration) + .inferParallelism(flinkInputFormat, ScanContext.builder().limit(3).build()); Assert.assertEquals("Should produce the expected parallelism.", 1, parallelism); } @@ -185,7 +212,8 @@ public void testInferParallelismWithGlobalSetting() throws IOException { Configuration cfg = tEnv.getConfig().getConfiguration(); cfg.set(PipelineOptions.MAX_PARALLELISM, 1); - Table table = catalog.createTable(TableIdentifier.of("default", "t"), TestFixtures.SCHEMA, null); + Table table = + catalog.createTable(TableIdentifier.of("default", "t"), TestFixtures.SCHEMA, null); GenericAppenderHelper helper = new GenericAppenderHelper(table, fileFormat, TEMPORARY_FOLDER); List expectedRecords = Lists.newArrayList(); @@ -199,16 +227,20 @@ public void testInferParallelismWithGlobalSetting() throws IOException { } // Make sure to generate multiple CombinedScanTasks - sql("ALTER TABLE t SET ('read.split.open-file-cost'='1', 'read.split.target-size'='%s')", maxFileLen); + sql( + "ALTER TABLE t SET ('read.split.open-file-cost'='1', 'read.split.target-size'='%s')", + maxFileLen); List results = run(null, 
Maps.newHashMap(), "", "*"); - org.apache.iceberg.flink.TestHelpers.assertRecords(results, expectedRecords, TestFixtures.SCHEMA); + org.apache.iceberg.flink.TestHelpers.assertRecords( + results, expectedRecords, TestFixtures.SCHEMA); } @Test public void testExposeLocality() throws Exception { Table table = - catalog.createTable(TableIdentifier.of("default", "t"), TestFixtures.SCHEMA, TestFixtures.SPEC); + catalog.createTable( + TableIdentifier.of("default", "t"), TestFixtures.SCHEMA, TestFixtures.SPEC); TableLoader tableLoader = TableLoader.fromHadoopTable(table.location()); List expectedRecords = RandomGenericData.generate(TestFixtures.SCHEMA, 10, 0L); @@ -220,22 +252,30 @@ public void testExposeLocality() throws Exception { // test sql api Configuration tableConf = getTableEnv().getConfig().getConfiguration(); - tableConf.setBoolean(FlinkConfigOptions.TABLE_EXEC_ICEBERG_EXPOSE_SPLIT_LOCALITY_INFO.key(), false); + tableConf.setBoolean( + FlinkConfigOptions.TABLE_EXEC_ICEBERG_EXPOSE_SPLIT_LOCALITY_INFO.key(), false); List results = sql("select * from t"); - org.apache.iceberg.flink.TestHelpers.assertRecords(results, expectedRecords, TestFixtures.SCHEMA); + org.apache.iceberg.flink.TestHelpers.assertRecords( + results, expectedRecords, TestFixtures.SCHEMA); // test table api - tableConf.setBoolean(FlinkConfigOptions.TABLE_EXEC_ICEBERG_EXPOSE_SPLIT_LOCALITY_INFO.key(), true); + tableConf.setBoolean( + FlinkConfigOptions.TABLE_EXEC_ICEBERG_EXPOSE_SPLIT_LOCALITY_INFO.key(), true); FlinkSource.Builder builder = FlinkSource.forRowData().tableLoader(tableLoader).table(table); Boolean localityEnabled = - DynMethods.builder("localityEnabled").hiddenImpl(builder.getClass()).build().invoke(builder); - // When running with CI or local, `localityEnabled` will be false even if this configuration is enabled + DynMethods.builder("localityEnabled") + .hiddenImpl(builder.getClass()) + .build() + .invoke(builder); + // When running with CI or local, `localityEnabled` will be false even if this configuration is + // enabled Assert.assertFalse("Expose split locality info should be false.", localityEnabled); results = run(builder, Maps.newHashMap(), "where dt='2020-03-20'", "*"); - org.apache.iceberg.flink.TestHelpers.assertRecords(results, expectedRecords, TestFixtures.SCHEMA); + org.apache.iceberg.flink.TestHelpers.assertRecords( + results, expectedRecords, TestFixtures.SCHEMA); } private List sql(String query, Object... args) { diff --git a/flink/v1.14/flink/src/test/java/org/apache/iceberg/flink/source/TestFlinkSource.java b/flink/v1.14/flink/src/test/java/org/apache/iceberg/flink/source/TestFlinkSource.java index 633a32a4c3d1..3a01952cd9ec 100644 --- a/flink/v1.14/flink/src/test/java/org/apache/iceberg/flink/source/TestFlinkSource.java +++ b/flink/v1.14/flink/src/test/java/org/apache/iceberg/flink/source/TestFlinkSource.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.flink.source; import java.util.Collections; @@ -40,8 +39,10 @@ public abstract class TestFlinkSource extends TestFlinkScan { @Override protected List runWithProjection(String... 
projected) throws Exception { TableSchema.Builder builder = TableSchema.builder(); - TableSchema schema = FlinkSchemaUtil.toSchema(FlinkSchemaUtil.convert( - catalog.loadTable(TableIdentifier.of("default", "t")).schema())); + TableSchema schema = + FlinkSchemaUtil.toSchema( + FlinkSchemaUtil.convert( + catalog.loadTable(TableIdentifier.of("default", "t")).schema())); for (String field : projected) { TableColumn column = schema.getTableColumn(field).get(); builder.field(column.getName(), column.getType()); @@ -51,14 +52,16 @@ protected List runWithProjection(String... projected) throws Exception { @Override protected List runWithFilter(Expression filter, String sqlFilter) throws Exception { - FlinkSource.Builder builder = FlinkSource.forRowData().filters(Collections.singletonList(filter)); + FlinkSource.Builder builder = + FlinkSource.forRowData().filters(Collections.singletonList(filter)); return run(builder, Maps.newHashMap(), sqlFilter, "*"); } @Override protected List runWithOptions(Map options) throws Exception { FlinkSource.Builder builder = FlinkSource.forRowData(); - Optional.ofNullable(options.get("snapshot-id")).ifPresent(value -> builder.snapshotId(Long.parseLong(value))); + Optional.ofNullable(options.get("snapshot-id")) + .ifPresent(value -> builder.snapshotId(Long.parseLong(value))); Optional.ofNullable(options.get("start-snapshot-id")) .ifPresent(value -> builder.startSnapshotId(Long.parseLong(value))); Optional.ofNullable(options.get("end-snapshot-id")) @@ -73,6 +76,10 @@ protected List run() throws Exception { return run(FlinkSource.forRowData(), Maps.newHashMap(), "", "*"); } - protected abstract List run(FlinkSource.Builder formatBuilder, Map sqlOptions, String sqlFilter, - String... sqlSelectedFields) throws Exception; + protected abstract List run( + FlinkSource.Builder formatBuilder, + Map sqlOptions, + String sqlFilter, + String... sqlSelectedFields) + throws Exception; } diff --git a/flink/v1.14/flink/src/test/java/org/apache/iceberg/flink/source/TestIcebergSourceBounded.java b/flink/v1.14/flink/src/test/java/org/apache/iceberg/flink/source/TestIcebergSourceBounded.java index b2e887afff01..0cfa1073886d 100644 --- a/flink/v1.14/flink/src/test/java/org/apache/iceberg/flink/source/TestIcebergSourceBounded.java +++ b/flink/v1.14/flink/src/test/java/org/apache/iceberg/flink/source/TestIcebergSourceBounded.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.flink.source; import java.util.Arrays; @@ -81,8 +80,9 @@ protected List run() throws Exception { return run(null, null, null); } - private List run(Schema projectedSchema, List filters, - Map options) throws Exception { + private List run( + Schema projectedSchema, List filters, Map options) + throws Exception { StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment(); env.setParallelism(1); @@ -94,10 +94,11 @@ private List run(Schema projectedSchema, List filters, table = tableLoader.loadTable(); } - IcebergSource.Builder sourceBuilder = IcebergSource.forRowData() - .tableLoader(tableLoader()) - .assignerFactory(new SimpleSplitAssignerFactory()) - .flinkConfig(config); + IcebergSource.Builder sourceBuilder = + IcebergSource.forRowData() + .tableLoader(tableLoader()) + .assignerFactory(new SimpleSplitAssignerFactory()) + .flinkConfig(config); if (projectedSchema != null) { sourceBuilder.project(projectedSchema); } @@ -108,17 +109,19 @@ private List run(Schema projectedSchema, List filters, sourceBuilder.properties(options); } - DataStream stream = env.fromSource( - sourceBuilder.build(), - WatermarkStrategy.noWatermarks(), - "testBasicRead", - TypeInformation.of(RowData.class)) - .map(new RowDataToRowMapper(FlinkSchemaUtil.convert( - projectedSchema == null ? table.schema() : projectedSchema))); + DataStream stream = + env.fromSource( + sourceBuilder.build(), + WatermarkStrategy.noWatermarks(), + "testBasicRead", + TypeInformation.of(RowData.class)) + .map( + new RowDataToRowMapper( + FlinkSchemaUtil.convert( + projectedSchema == null ? table.schema() : projectedSchema))); try (CloseableIterator iter = stream.executeAndCollect()) { return Lists.newArrayList(iter); } } - } diff --git a/flink/v1.14/flink/src/test/java/org/apache/iceberg/flink/source/TestIcebergSourceContinuous.java b/flink/v1.14/flink/src/test/java/org/apache/iceberg/flink/source/TestIcebergSourceContinuous.java index 41857050e469..582a12523300 100644 --- a/flink/v1.14/flink/src/test/java/org/apache/iceberg/flink/source/TestIcebergSourceContinuous.java +++ b/flink/v1.14/flink/src/test/java/org/apache/iceberg/flink/source/TestIcebergSourceContinuous.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.flink.source; import java.time.Duration; @@ -59,38 +58,41 @@ public class TestIcebergSourceContinuous { public static final MiniClusterWithClientResource MINI_CLUSTER_RESOURCE = MiniClusterResource.createWithClassloaderCheckDisabled(); - @ClassRule - public static final TemporaryFolder TEMPORARY_FOLDER = new TemporaryFolder(); + @ClassRule public static final TemporaryFolder TEMPORARY_FOLDER = new TemporaryFolder(); @Rule - public final HadoopTableResource tableResource = new HadoopTableResource(TEMPORARY_FOLDER, - TestFixtures.DATABASE, TestFixtures.TABLE, TestFixtures.SCHEMA); + public final HadoopTableResource tableResource = + new HadoopTableResource( + TEMPORARY_FOLDER, TestFixtures.DATABASE, TestFixtures.TABLE, TestFixtures.SCHEMA); private final AtomicLong randomSeed = new AtomicLong(0L); @Test public void testTableScanThenIncremental() throws Exception { - GenericAppenderHelper dataAppender = new GenericAppenderHelper( - tableResource.table(), FileFormat.PARQUET, TEMPORARY_FOLDER); + GenericAppenderHelper dataAppender = + new GenericAppenderHelper(tableResource.table(), FileFormat.PARQUET, TEMPORARY_FOLDER); // snapshot1 - List batch1 = RandomGenericData.generate( - tableResource.table().schema(), 2, randomSeed.incrementAndGet()); + List batch1 = + RandomGenericData.generate(tableResource.table().schema(), 2, randomSeed.incrementAndGet()); dataAppender.appendToTable(batch1); - ScanContext scanContext = ScanContext.builder() - .streaming(true) - .monitorInterval(Duration.ofMillis(10L)) - .startingStrategy(StreamingStartingStrategy.TABLE_SCAN_THEN_INCREMENTAL) - .build(); + ScanContext scanContext = + ScanContext.builder() + .streaming(true) + .monitorInterval(Duration.ofMillis(10L)) + .startingStrategy(StreamingStartingStrategy.TABLE_SCAN_THEN_INCREMENTAL) + .build(); - try (CloseableIterator iter = createStream(scanContext).executeAndCollect(getClass().getSimpleName())) { + try (CloseableIterator iter = + createStream(scanContext).executeAndCollect(getClass().getSimpleName())) { List result1 = waitForResult(iter, 2); TestHelpers.assertRecords(result1, batch1, tableResource.table().schema()); // snapshot2 - List batch2 = RandomGenericData.generate( - tableResource.table().schema(), 2, randomSeed.incrementAndGet()); + List batch2 = + RandomGenericData.generate( + tableResource.table().schema(), 2, randomSeed.incrementAndGet()); dataAppender.appendToTable(batch2); tableResource.table().currentSnapshot().snapshotId(); @@ -98,8 +100,9 @@ public void testTableScanThenIncremental() throws Exception { TestHelpers.assertRecords(result2, batch2, tableResource.table().schema()); // snapshot3 - List batch3 = RandomGenericData.generate( - tableResource.table().schema(), 2, randomSeed.incrementAndGet()); + List batch3 = + RandomGenericData.generate( + tableResource.table().schema(), 2, randomSeed.incrementAndGet()); dataAppender.appendToTable(batch3); tableResource.table().currentSnapshot().snapshotId(); @@ -110,42 +113,46 @@ public void testTableScanThenIncremental() throws Exception { @Test public void testEarliestSnapshot() throws Exception { - GenericAppenderHelper dataAppender = new GenericAppenderHelper( - tableResource.table(), FileFormat.PARQUET, TEMPORARY_FOLDER); + GenericAppenderHelper dataAppender = + new GenericAppenderHelper(tableResource.table(), FileFormat.PARQUET, TEMPORARY_FOLDER); // snapshot0 - List batch0 = RandomGenericData.generate( - tableResource.table().schema(), 2, randomSeed.incrementAndGet()); + List batch0 = + 
RandomGenericData.generate(tableResource.table().schema(), 2, randomSeed.incrementAndGet()); dataAppender.appendToTable(batch0); // snapshot1 - List batch1 = RandomGenericData.generate( - tableResource.table().schema(), 2, randomSeed.incrementAndGet()); + List batch1 = + RandomGenericData.generate(tableResource.table().schema(), 2, randomSeed.incrementAndGet()); dataAppender.appendToTable(batch1); - ScanContext scanContext = ScanContext.builder() - .streaming(true) - .monitorInterval(Duration.ofMillis(10L)) - .startingStrategy(StreamingStartingStrategy.INCREMENTAL_FROM_EARLIEST_SNAPSHOT) - .build(); + ScanContext scanContext = + ScanContext.builder() + .streaming(true) + .monitorInterval(Duration.ofMillis(10L)) + .startingStrategy(StreamingStartingStrategy.INCREMENTAL_FROM_EARLIEST_SNAPSHOT) + .build(); - try (CloseableIterator iter = createStream(scanContext).executeAndCollect(getClass().getSimpleName())) { + try (CloseableIterator iter = + createStream(scanContext).executeAndCollect(getClass().getSimpleName())) { List result1 = waitForResult(iter, 4); List combinedBatch0AndBatch1 = Lists.newArrayList(batch0); combinedBatch0AndBatch1.addAll(batch1); TestHelpers.assertRecords(result1, combinedBatch0AndBatch1, tableResource.table().schema()); // snapshot2 - List batch2 = RandomGenericData.generate( - tableResource.table().schema(), 2, randomSeed.incrementAndGet()); + List batch2 = + RandomGenericData.generate( + tableResource.table().schema(), 2, randomSeed.incrementAndGet()); dataAppender.appendToTable(batch2); List result2 = waitForResult(iter, 2); TestHelpers.assertRecords(result2, batch2, tableResource.table().schema()); // snapshot3 - List batch3 = RandomGenericData.generate( - tableResource.table().schema(), 2, randomSeed.incrementAndGet()); + List batch3 = + RandomGenericData.generate( + tableResource.table().schema(), 2, randomSeed.incrementAndGet()); dataAppender.appendToTable(batch3); List result3 = waitForResult(iter, 2); @@ -155,26 +162,28 @@ public void testEarliestSnapshot() throws Exception { @Test public void testLatestSnapshot() throws Exception { - GenericAppenderHelper dataAppender = new GenericAppenderHelper( - tableResource.table(), FileFormat.PARQUET, TEMPORARY_FOLDER); + GenericAppenderHelper dataAppender = + new GenericAppenderHelper(tableResource.table(), FileFormat.PARQUET, TEMPORARY_FOLDER); // snapshot0 - List batch0 = RandomGenericData.generate( - tableResource.table().schema(), 2, randomSeed.incrementAndGet()); + List batch0 = + RandomGenericData.generate(tableResource.table().schema(), 2, randomSeed.incrementAndGet()); dataAppender.appendToTable(batch0); // snapshot1 - List batch1 = RandomGenericData.generate( - tableResource.table().schema(), 2, randomSeed.incrementAndGet()); + List batch1 = + RandomGenericData.generate(tableResource.table().schema(), 2, randomSeed.incrementAndGet()); dataAppender.appendToTable(batch1); - ScanContext scanContext = ScanContext.builder() - .streaming(true) - .monitorInterval(Duration.ofMillis(10L)) - .startingStrategy(StreamingStartingStrategy.INCREMENTAL_FROM_LATEST_SNAPSHOT) - .build(); + ScanContext scanContext = + ScanContext.builder() + .streaming(true) + .monitorInterval(Duration.ofMillis(10L)) + .startingStrategy(StreamingStartingStrategy.INCREMENTAL_FROM_LATEST_SNAPSHOT) + .build(); - try (CloseableIterator iter = createStream(scanContext).executeAndCollect(getClass().getSimpleName())) { + try (CloseableIterator iter = + createStream(scanContext).executeAndCollect(getClass().getSimpleName())) { // we want to make sure 
job is running first so that enumerator can // start from the latest snapshot before inserting the next batch2 below. waitUntilJobIsRunning(MINI_CLUSTER_RESOURCE.getClusterClient()); @@ -184,16 +193,18 @@ public void testLatestSnapshot() throws Exception { TestHelpers.assertRecords(result1, batch1, tableResource.table().schema()); // snapshot2 - List batch2 = RandomGenericData.generate( - tableResource.table().schema(), 2, randomSeed.incrementAndGet()); + List batch2 = + RandomGenericData.generate( + tableResource.table().schema(), 2, randomSeed.incrementAndGet()); dataAppender.appendToTable(batch2); List result2 = waitForResult(iter, 2); TestHelpers.assertRecords(result2, batch2, tableResource.table().schema()); // snapshot3 - List batch3 = RandomGenericData.generate( - tableResource.table().schema(), 2, randomSeed.incrementAndGet()); + List batch3 = + RandomGenericData.generate( + tableResource.table().schema(), 2, randomSeed.incrementAndGet()); dataAppender.appendToTable(batch3); List result3 = waitForResult(iter, 2); @@ -203,43 +214,47 @@ public void testLatestSnapshot() throws Exception { @Test public void testSpecificSnapshotId() throws Exception { - GenericAppenderHelper dataAppender = new GenericAppenderHelper( - tableResource.table(), FileFormat.PARQUET, TEMPORARY_FOLDER); + GenericAppenderHelper dataAppender = + new GenericAppenderHelper(tableResource.table(), FileFormat.PARQUET, TEMPORARY_FOLDER); // snapshot0 - List batch0 = RandomGenericData.generate( - tableResource.table().schema(), 2, randomSeed.incrementAndGet()); + List batch0 = + RandomGenericData.generate(tableResource.table().schema(), 2, randomSeed.incrementAndGet()); dataAppender.appendToTable(batch0); long snapshot0 = tableResource.table().currentSnapshot().snapshotId(); // snapshot1 - List batch1 = RandomGenericData.generate( - tableResource.table().schema(), 2, randomSeed.incrementAndGet()); + List batch1 = + RandomGenericData.generate(tableResource.table().schema(), 2, randomSeed.incrementAndGet()); dataAppender.appendToTable(batch1); long snapshot1 = tableResource.table().currentSnapshot().snapshotId(); - ScanContext scanContext = ScanContext.builder() - .streaming(true) - .monitorInterval(Duration.ofMillis(10L)) - .startingStrategy(StreamingStartingStrategy.INCREMENTAL_FROM_SNAPSHOT_ID) - .startSnapshotId(snapshot1) - .build(); + ScanContext scanContext = + ScanContext.builder() + .streaming(true) + .monitorInterval(Duration.ofMillis(10L)) + .startingStrategy(StreamingStartingStrategy.INCREMENTAL_FROM_SNAPSHOT_ID) + .startSnapshotId(snapshot1) + .build(); - try (CloseableIterator iter = createStream(scanContext).executeAndCollect(getClass().getSimpleName())) { + try (CloseableIterator iter = + createStream(scanContext).executeAndCollect(getClass().getSimpleName())) { List result1 = waitForResult(iter, 2); TestHelpers.assertRecords(result1, batch1, tableResource.table().schema()); // snapshot2 - List batch2 = RandomGenericData.generate( - tableResource.table().schema(), 2, randomSeed.incrementAndGet()); + List batch2 = + RandomGenericData.generate( + tableResource.table().schema(), 2, randomSeed.incrementAndGet()); dataAppender.appendToTable(batch2); List result2 = waitForResult(iter, 2); TestHelpers.assertRecords(result2, batch2, tableResource.table().schema()); // snapshot3 - List batch3 = RandomGenericData.generate( - tableResource.table().schema(), 2, randomSeed.incrementAndGet()); + List batch3 = + RandomGenericData.generate( + tableResource.table().schema(), 2, randomSeed.incrementAndGet()); 
dataAppender.appendToTable(batch3); List result3 = waitForResult(iter, 2); @@ -249,12 +264,12 @@ public void testSpecificSnapshotId() throws Exception { @Test public void testSpecificSnapshotTimestamp() throws Exception { - GenericAppenderHelper dataAppender = new GenericAppenderHelper( - tableResource.table(), FileFormat.PARQUET, TEMPORARY_FOLDER); + GenericAppenderHelper dataAppender = + new GenericAppenderHelper(tableResource.table(), FileFormat.PARQUET, TEMPORARY_FOLDER); // snapshot0 - List batch0 = RandomGenericData.generate( - tableResource.table().schema(), 2, randomSeed.incrementAndGet()); + List batch0 = + RandomGenericData.generate(tableResource.table().schema(), 2, randomSeed.incrementAndGet()); dataAppender.appendToTable(batch0); long snapshot0Timestamp = tableResource.table().currentSnapshot().timestampMillis(); @@ -262,34 +277,38 @@ public void testSpecificSnapshotTimestamp() throws Exception { Thread.sleep(2); // snapshot1 - List batch1 = RandomGenericData.generate( - tableResource.table().schema(), 2, randomSeed.incrementAndGet()); + List batch1 = + RandomGenericData.generate(tableResource.table().schema(), 2, randomSeed.incrementAndGet()); dataAppender.appendToTable(batch1); long snapshot1Timestamp = tableResource.table().currentSnapshot().timestampMillis(); - ScanContext scanContext = ScanContext.builder() - .streaming(true) - .monitorInterval(Duration.ofMillis(10L)) - .startingStrategy(StreamingStartingStrategy.INCREMENTAL_FROM_SNAPSHOT_TIMESTAMP) - .startSnapshotTimestamp(snapshot1Timestamp) - .build(); + ScanContext scanContext = + ScanContext.builder() + .streaming(true) + .monitorInterval(Duration.ofMillis(10L)) + .startingStrategy(StreamingStartingStrategy.INCREMENTAL_FROM_SNAPSHOT_TIMESTAMP) + .startSnapshotTimestamp(snapshot1Timestamp) + .build(); - try (CloseableIterator iter = createStream(scanContext).executeAndCollect(getClass().getSimpleName())) { + try (CloseableIterator iter = + createStream(scanContext).executeAndCollect(getClass().getSimpleName())) { // consume data from snapshot1 List result1 = waitForResult(iter, 2); TestHelpers.assertRecords(result1, batch1, tableResource.table().schema()); // snapshot2 - List batch2 = RandomGenericData.generate( - tableResource.table().schema(), 2, randomSeed.incrementAndGet()); + List batch2 = + RandomGenericData.generate( + tableResource.table().schema(), 2, randomSeed.incrementAndGet()); dataAppender.appendToTable(batch2); List result2 = waitForResult(iter, 2); TestHelpers.assertRecords(result2, batch2, tableResource.table().schema()); // snapshot3 - List batch3 = RandomGenericData.generate( - tableResource.table().schema(), 2, randomSeed.incrementAndGet()); + List batch3 = + RandomGenericData.generate( + tableResource.table().schema(), 2, randomSeed.incrementAndGet()); dataAppender.appendToTable(batch3); List result3 = waitForResult(iter, 2); @@ -301,20 +320,21 @@ private DataStream createStream(ScanContext scanContext) throws Exception { // start the source and collect output StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment(); env.setParallelism(1); - DataStream stream = env.fromSource( - IcebergSource.forRowData() - .tableLoader(tableResource.tableLoader()) - .assignerFactory(new SimpleSplitAssignerFactory()) - .streaming(scanContext.isStreaming()) - .streamingStartingStrategy(scanContext.streamingStartingStrategy()) - .startSnapshotTimestamp(scanContext.startSnapshotTimestamp()) - .startSnapshotId(scanContext.startSnapshotId()) - .monitorInterval(Duration.ofMillis(10L)) - 
.build(), - WatermarkStrategy.noWatermarks(), - "icebergSource", - TypeInformation.of(RowData.class)) - .map(new RowDataToRowMapper(FlinkSchemaUtil.convert(tableResource.table().schema()))); + DataStream stream = + env.fromSource( + IcebergSource.forRowData() + .tableLoader(tableResource.tableLoader()) + .assignerFactory(new SimpleSplitAssignerFactory()) + .streaming(scanContext.isStreaming()) + .streamingStartingStrategy(scanContext.streamingStartingStrategy()) + .startSnapshotTimestamp(scanContext.startSnapshotTimestamp()) + .startSnapshotId(scanContext.startSnapshotId()) + .monitorInterval(Duration.ofMillis(10L)) + .build(), + WatermarkStrategy.noWatermarks(), + "icebergSource", + TypeInformation.of(RowData.class)) + .map(new RowDataToRowMapper(FlinkSchemaUtil.convert(tableResource.table().schema()))); return stream; } diff --git a/flink/v1.14/flink/src/test/java/org/apache/iceberg/flink/source/TestIcebergSourceFailover.java b/flink/v1.14/flink/src/test/java/org/apache/iceberg/flink/source/TestIcebergSourceFailover.java index b16a277c3886..cad1fa67ae19 100644 --- a/flink/v1.14/flink/src/test/java/org/apache/iceberg/flink/source/TestIcebergSourceFailover.java +++ b/flink/v1.14/flink/src/test/java/org/apache/iceberg/flink/source/TestIcebergSourceFailover.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.flink.source; import java.time.Duration; @@ -60,9 +59,7 @@ public class TestIcebergSourceFailover { private static final int PARALLELISM = 4; - @ClassRule - public static final TemporaryFolder TEMPORARY_FOLDER = new TemporaryFolder(); - + @ClassRule public static final TemporaryFolder TEMPORARY_FOLDER = new TemporaryFolder(); @Rule public final MiniClusterWithClientResource miniClusterResource = @@ -75,12 +72,14 @@ public class TestIcebergSourceFailover { .build()); @Rule - public final HadoopTableResource sourceTableResource = new HadoopTableResource(TEMPORARY_FOLDER, - TestFixtures.DATABASE, TestFixtures.TABLE, schema()); + public final HadoopTableResource sourceTableResource = + new HadoopTableResource( + TEMPORARY_FOLDER, TestFixtures.DATABASE, TestFixtures.TABLE, schema()); @Rule - public final HadoopTableResource sinkTableResource = new HadoopTableResource(TEMPORARY_FOLDER, - TestFixtures.DATABASE, TestFixtures.SINK_TABLE, schema()); + public final HadoopTableResource sinkTableResource = + new HadoopTableResource( + TEMPORARY_FOLDER, TestFixtures.DATABASE, TestFixtures.SINK_TABLE, schema()); protected IcebergSource.Builder sourceBuilder() { Configuration config = new Configuration(); @@ -99,10 +98,9 @@ protected List generateRecords(int numRecords, long seed) { return RandomGenericData.generate(schema(), numRecords, seed); } - protected void assertRecords(Table table, List expectedRecords, Duration interval, int maxCount) - throws Exception { - SimpleDataUtil.assertTableRecords(table, - expectedRecords, interval, maxCount); + protected void assertRecords( + Table table, List expectedRecords, Duration interval, int maxCount) throws Exception { + SimpleDataUtil.assertTableRecords(table, expectedRecords, interval, maxCount); } @Test @@ -117,8 +115,9 @@ public void testBoundedWithJobManagerFailover() throws Exception { private void testBoundedIcebergSource(FailoverType failoverType) throws Exception { List expectedRecords = Lists.newArrayList(); - GenericAppenderHelper dataAppender = new GenericAppenderHelper( - sourceTableResource.table(), FileFormat.PARQUET, TEMPORARY_FOLDER); + GenericAppenderHelper 
dataAppender = + new GenericAppenderHelper( + sourceTableResource.table(), FileFormat.PARQUET, TEMPORARY_FOLDER); for (int i = 0; i < 4; ++i) { List records = generateRecords(2, i); expectedRecords.addAll(records); @@ -129,11 +128,12 @@ private void testBoundedIcebergSource(FailoverType failoverType) throws Exceptio env.setParallelism(PARALLELISM); env.setRestartStrategy(RestartStrategies.fixedDelayRestart(1, 0)); - DataStream stream = env.fromSource( - sourceBuilder().build(), - WatermarkStrategy.noWatermarks(), - "IcebergSource", - TypeInformation.of(RowData.class)); + DataStream stream = + env.fromSource( + sourceBuilder().build(), + WatermarkStrategy.noWatermarks(), + "IcebergSource", + TypeInformation.of(RowData.class)); DataStream streamFailingInTheMiddleOfReading = RecordCounterToFail.wrapWithFailureAfter(stream, expectedRecords.size() / 2); @@ -170,8 +170,9 @@ public void testContinuousWithJobManagerFailover() throws Exception { } private void testContinuousIcebergSource(FailoverType failoverType) throws Exception { - GenericAppenderHelper dataAppender = new GenericAppenderHelper( - sourceTableResource.table(), FileFormat.PARQUET, TEMPORARY_FOLDER); + GenericAppenderHelper dataAppender = + new GenericAppenderHelper( + sourceTableResource.table(), FileFormat.PARQUET, TEMPORARY_FOLDER); List expectedRecords = Lists.newArrayList(); List batch = generateRecords(2, 0); @@ -184,15 +185,16 @@ private void testContinuousIcebergSource(FailoverType failoverType) throws Excep Configuration config = new Configuration(); config.setInteger(FlinkConfigOptions.SOURCE_READER_FETCH_BATCH_RECORD_COUNT, 128); - DataStream stream = env.fromSource( - sourceBuilder() - .streaming(true) - .monitorInterval(Duration.ofMillis(10)) - .streamingStartingStrategy(StreamingStartingStrategy.TABLE_SCAN_THEN_INCREMENTAL) - .build(), - WatermarkStrategy.noWatermarks(), - "IcebergSource", - TypeInformation.of(RowData.class)); + DataStream stream = + env.fromSource( + sourceBuilder() + .streaming(true) + .monitorInterval(Duration.ofMillis(10)) + .streamingStartingStrategy(StreamingStartingStrategy.TABLE_SCAN_THEN_INCREMENTAL) + .build(), + WatermarkStrategy.noWatermarks(), + "IcebergSource", + TypeInformation.of(RowData.class)); // CollectStreamSink from DataStream#executeAndCollect() doesn't guarantee // exactly-once behavior. When Iceberg sink, we can verify end-to-end @@ -211,8 +213,7 @@ private void testContinuousIcebergSource(FailoverType failoverType) throws Excep expectedRecords.addAll(records); dataAppender.appendToTable(records); if (i == 2) { - triggerFailover(failoverType, jobId, () -> { - }, miniClusterResource.getMiniCluster()); + triggerFailover(failoverType, jobId, () -> {}, miniClusterResource.getMiniCluster()); } } diff --git a/flink/v1.14/flink/src/test/java/org/apache/iceberg/flink/source/TestIcebergSourceReaderDeletes.java b/flink/v1.14/flink/src/test/java/org/apache/iceberg/flink/source/TestIcebergSourceReaderDeletes.java index 07b08590ba52..2974f4bc94a2 100644 --- a/flink/v1.14/flink/src/test/java/org/apache/iceberg/flink/source/TestIcebergSourceReaderDeletes.java +++ b/flink/v1.14/flink/src/test/java/org/apache/iceberg/flink/source/TestIcebergSourceReaderDeletes.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.flink.source; import java.io.IOException; @@ -52,58 +51,64 @@ public class TestIcebergSourceReaderDeletes extends TestFlinkReaderDeletesBase { private static final int PARALLELISM = 4; - @ClassRule - public static final TemporaryFolder TMP_FOLDER = new TemporaryFolder(); + @ClassRule public static final TemporaryFolder TMP_FOLDER = new TemporaryFolder(); @ClassRule - public static final MiniClusterWithClientResource MINI_CLUSTER = new MiniClusterWithClientResource( - new MiniClusterResourceConfiguration.Builder() - .setNumberTaskManagers(1) - .setNumberSlotsPerTaskManager(PARALLELISM) - .build()); + public static final MiniClusterWithClientResource MINI_CLUSTER = + new MiniClusterWithClientResource( + new MiniClusterResourceConfiguration.Builder() + .setNumberTaskManagers(1) + .setNumberSlotsPerTaskManager(PARALLELISM) + .build()); public TestIcebergSourceReaderDeletes(FileFormat inputFormat) { super(inputFormat); } @Override - protected StructLikeSet rowSet(String tableName, Table testTable, String... columns) throws IOException { + protected StructLikeSet rowSet(String tableName, Table testTable, String... columns) + throws IOException { Schema projected = testTable.schema().select(columns); RowType rowType = FlinkSchemaUtil.convert(projected); Map properties = Maps.newHashMap(); - properties.put(CatalogProperties.WAREHOUSE_LOCATION, hiveConf.get(HiveConf.ConfVars.METASTOREWAREHOUSE.varname)); + properties.put( + CatalogProperties.WAREHOUSE_LOCATION, + hiveConf.get(HiveConf.ConfVars.METASTOREWAREHOUSE.varname)); properties.put(CatalogProperties.URI, hiveConf.get(HiveConf.ConfVars.METASTOREURIS.varname)); - properties.put(CatalogProperties.CLIENT_POOL_SIZE, + properties.put( + CatalogProperties.CLIENT_POOL_SIZE, Integer.toString(hiveConf.getInt("iceberg.hive.client-pool-size", 5))); CatalogLoader hiveCatalogLoader = CatalogLoader.hive(catalog.name(), hiveConf, properties); - TableLoader hiveTableLoader = TableLoader.fromCatalog(hiveCatalogLoader, TableIdentifier.of("default", tableName)); + TableLoader hiveTableLoader = + TableLoader.fromCatalog(hiveCatalogLoader, TableIdentifier.of("default", tableName)); hiveTableLoader.open(); try (TableLoader tableLoader = hiveTableLoader) { StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment(); env.setParallelism(1); - DataStream stream = env.fromSource( - IcebergSource.builder() - .tableLoader(tableLoader) - .assignerFactory(new SimpleSplitAssignerFactory()) - .project(projected) - .build(), - WatermarkStrategy.noWatermarks(), - "testBasicRead", - TypeInformation.of(RowData.class)); + DataStream stream = + env.fromSource( + IcebergSource.builder() + .tableLoader(tableLoader) + .assignerFactory(new SimpleSplitAssignerFactory()) + .project(projected) + .build(), + WatermarkStrategy.noWatermarks(), + "testBasicRead", + TypeInformation.of(RowData.class)); try (CloseableIterator iter = stream.executeAndCollect()) { List rowDataList = Lists.newArrayList(iter); StructLikeSet set = StructLikeSet.create(projected.asStruct()); - rowDataList.forEach(rowData -> { - RowDataWrapper wrapper = new RowDataWrapper(rowType, projected.asStruct()); - set.add(wrapper.wrap(rowData)); - }); + rowDataList.forEach( + rowData -> { + RowDataWrapper wrapper = new RowDataWrapper(rowType, projected.asStruct()); + set.add(wrapper.wrap(rowData)); + }); return set; } catch (Exception e) { throw new IOException("Failed to collect result", e); } } } - } diff --git 
a/flink/v1.14/flink/src/test/java/org/apache/iceberg/flink/source/TestProjectMetaColumn.java b/flink/v1.14/flink/src/test/java/org/apache/iceberg/flink/source/TestProjectMetaColumn.java index baccadc31f08..bc63e4a0b282 100644 --- a/flink/v1.14/flink/src/test/java/org/apache/iceberg/flink/source/TestProjectMetaColumn.java +++ b/flink/v1.14/flink/src/test/java/org/apache/iceberg/flink/source/TestProjectMetaColumn.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.flink.source; import java.io.IOException; @@ -51,8 +50,7 @@ @RunWith(Parameterized.class) public class TestProjectMetaColumn { - @Rule - public final TemporaryFolder folder = new TemporaryFolder(); + @Rule public final TemporaryFolder folder = new TemporaryFolder(); private final FileFormat format; @Parameterized.Parameters(name = "fileFormat={0}") @@ -60,8 +58,7 @@ public static Iterable parameters() { return Lists.newArrayList( new Object[] {FileFormat.PARQUET}, new Object[] {FileFormat.ORC}, - new Object[] {FileFormat.AVRO} - ); + new Object[] {FileFormat.AVRO}); } public TestProjectMetaColumn(FileFormat format) { @@ -71,28 +68,30 @@ public TestProjectMetaColumn(FileFormat format) { private void testSkipToRemoveMetaColumn(int formatVersion) throws IOException { // Create the table with given format version. String location = folder.getRoot().getAbsolutePath(); - Table table = SimpleDataUtil.createTable(location, - ImmutableMap.of(TableProperties.FORMAT_VERSION, String.valueOf(formatVersion)), - false); - - List rows = Lists.newArrayList( - SimpleDataUtil.createInsert(1, "AAA"), - SimpleDataUtil.createInsert(2, "BBB"), - SimpleDataUtil.createInsert(3, "CCC") - ); + Table table = + SimpleDataUtil.createTable( + location, + ImmutableMap.of(TableProperties.FORMAT_VERSION, String.valueOf(formatVersion)), + false); + + List rows = + Lists.newArrayList( + SimpleDataUtil.createInsert(1, "AAA"), + SimpleDataUtil.createInsert(2, "BBB"), + SimpleDataUtil.createInsert(3, "CCC")); writeAndCommit(table, ImmutableList.of(), false, rows); - FlinkInputFormat input = FlinkSource - .forRowData() - .tableLoader(TableLoader.fromHadoopTable(location)) - .buildFormat(); + FlinkInputFormat input = + FlinkSource.forRowData().tableLoader(TableLoader.fromHadoopTable(location)).buildFormat(); List results = Lists.newArrayList(); - TestHelpers.readRowData(input, rowData -> { - // If project to remove the meta columns, it will get a RowDataProjection. - Assert.assertTrue(rowData instanceof GenericRowData); - results.add(TestHelpers.copyRowData(rowData, SimpleDataUtil.ROW_TYPE)); - }); + TestHelpers.readRowData( + input, + rowData -> { + // If project to remove the meta columns, it will get a RowDataProjection. + Assert.assertTrue(rowData instanceof GenericRowData); + results.add(TestHelpers.copyRowData(rowData, SimpleDataUtil.ROW_TYPE)); + }); // Assert the results. TestHelpers.assertRows(rows, results, SimpleDataUtil.ROW_TYPE); @@ -112,37 +111,41 @@ public void testV2SkipToRemoveMetaColumn() throws IOException { public void testV2RemoveMetaColumn() throws Exception { // Create the v2 table. 
String location = folder.getRoot().getAbsolutePath(); - Table table = SimpleDataUtil.createTable(location, ImmutableMap.of(TableProperties.FORMAT_VERSION, "2"), false); - - List rows = Lists.newArrayList( - SimpleDataUtil.createInsert(1, "AAA"), - SimpleDataUtil.createDelete(1, "AAA"), - SimpleDataUtil.createInsert(2, "AAA"), - SimpleDataUtil.createInsert(2, "BBB") - ); + Table table = + SimpleDataUtil.createTable( + location, ImmutableMap.of(TableProperties.FORMAT_VERSION, "2"), false); + + List rows = + Lists.newArrayList( + SimpleDataUtil.createInsert(1, "AAA"), + SimpleDataUtil.createDelete(1, "AAA"), + SimpleDataUtil.createInsert(2, "AAA"), + SimpleDataUtil.createInsert(2, "BBB")); int eqFieldId = table.schema().findField("data").fieldId(); writeAndCommit(table, ImmutableList.of(eqFieldId), true, rows); - FlinkInputFormat input = FlinkSource - .forRowData() - .tableLoader(TableLoader.fromHadoopTable(location)) - .buildFormat(); + FlinkInputFormat input = + FlinkSource.forRowData().tableLoader(TableLoader.fromHadoopTable(location)).buildFormat(); List results = Lists.newArrayList(); - TestHelpers.readRowData(input, rowData -> { - // If project to remove the meta columns, it will get a RowDataProjection. - Assert.assertTrue(rowData instanceof RowDataProjection); - results.add(TestHelpers.copyRowData(rowData, SimpleDataUtil.ROW_TYPE)); - }); + TestHelpers.readRowData( + input, + rowData -> { + // If project to remove the meta columns, it will get a RowDataProjection. + Assert.assertTrue(rowData instanceof RowDataProjection); + results.add(TestHelpers.copyRowData(rowData, SimpleDataUtil.ROW_TYPE)); + }); // Assert the results. - TestHelpers.assertRows(ImmutableList.of( - SimpleDataUtil.createInsert(2, "AAA"), - SimpleDataUtil.createInsert(2, "BBB") - ), results, SimpleDataUtil.ROW_TYPE); + TestHelpers.assertRows( + ImmutableList.of( + SimpleDataUtil.createInsert(2, "AAA"), SimpleDataUtil.createInsert(2, "BBB")), + results, + SimpleDataUtil.ROW_TYPE); } - private void writeAndCommit(Table table, List eqFieldIds, boolean upsert, List rows) + private void writeAndCommit( + Table table, List eqFieldIds, boolean upsert, List rows) throws IOException { TaskWriter writer = createTaskWriter(table, eqFieldIds, upsert); try (TaskWriter io = writer) { @@ -165,14 +168,16 @@ private void writeAndCommit(Table table, List eqFieldIds, boolean upser delta.commit(); } - private TaskWriter createTaskWriter(Table table, List equalityFieldIds, boolean upsert) { - TaskWriterFactory taskWriterFactory = new RowDataTaskWriterFactory( - SerializableTable.copyOf(table), - SimpleDataUtil.ROW_TYPE, - TableProperties.WRITE_TARGET_FILE_SIZE_BYTES_DEFAULT, - format, - equalityFieldIds, - upsert); + private TaskWriter createTaskWriter( + Table table, List equalityFieldIds, boolean upsert) { + TaskWriterFactory taskWriterFactory = + new RowDataTaskWriterFactory( + SerializableTable.copyOf(table), + SimpleDataUtil.ROW_TYPE, + TableProperties.WRITE_TARGET_FILE_SIZE_BYTES_DEFAULT, + format, + equalityFieldIds, + upsert); taskWriterFactory.initialize(1, 1); return taskWriterFactory.create(); diff --git a/flink/v1.14/flink/src/test/java/org/apache/iceberg/flink/source/TestStreamScanSql.java b/flink/v1.14/flink/src/test/java/org/apache/iceberg/flink/source/TestStreamScanSql.java index 174d40258371..10fa4ecf1329 100644 --- a/flink/v1.14/flink/src/test/java/org/apache/iceberg/flink/source/TestStreamScanSql.java +++ b/flink/v1.14/flink/src/test/java/org/apache/iceberg/flink/source/TestStreamScanSql.java @@ -16,7 +16,6 @@ * 
specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.flink.source; import java.io.IOException; @@ -63,16 +62,18 @@ protected TableEnvironment getTableEnv() { if (tEnv == null) { synchronized (this) { if (tEnv == null) { - EnvironmentSettings.Builder settingsBuilder = EnvironmentSettings - .newInstance() - .inStreamingMode(); + EnvironmentSettings.Builder settingsBuilder = + EnvironmentSettings.newInstance().inStreamingMode(); - StreamExecutionEnvironment env = StreamExecutionEnvironment - .getExecutionEnvironment(MiniClusterResource.DISABLE_CLASSLOADER_CHECK_CONFIG); + StreamExecutionEnvironment env = + StreamExecutionEnvironment.getExecutionEnvironment( + MiniClusterResource.DISABLE_CLASSLOADER_CHECK_CONFIG); env.enableCheckpointing(400); - StreamTableEnvironment streamTableEnv = StreamTableEnvironment.create(env, settingsBuilder.build()); - streamTableEnv.getConfig() + StreamTableEnvironment streamTableEnv = + StreamTableEnvironment.create(env, settingsBuilder.build()); + streamTableEnv + .getConfig() .getConfiguration() .set(TableConfigOptions.TABLE_DYNAMIC_TABLE_OPTIONS_ENABLED, true); tEnv = streamTableEnv; @@ -105,11 +106,11 @@ private void insertRows(String partition, Table table, Row... rows) throws IOExc GenericRecord gRecord = GenericRecord.create(table.schema()); List records = Lists.newArrayList(); for (Row row : rows) { - records.add(gRecord.copy( - "id", row.getField(0), - "data", row.getField(1), - "dt", row.getField(2) - )); + records.add( + gRecord.copy( + "id", row.getField(0), + "data", row.getField(1), + "dt", row.getField(2))); } if (partition != null) { @@ -129,9 +130,12 @@ private void assertRows(List expectedRows, Iterator iterator) { Row actualRow = iterator.next(); Assert.assertEquals("Should have expected fields", 3, actualRow.getArity()); - Assert.assertEquals("Should have expected id", expectedRow.getField(0), actualRow.getField(0)); - Assert.assertEquals("Should have expected data", expectedRow.getField(1), actualRow.getField(1)); - Assert.assertEquals("Should have expected dt", expectedRow.getField(2), actualRow.getField(2)); + Assert.assertEquals( + "Should have expected id", expectedRow.getField(0), actualRow.getField(0)); + Assert.assertEquals( + "Should have expected data", expectedRow.getField(1), actualRow.getField(1)); + Assert.assertEquals( + "Should have expected dt", expectedRow.getField(2), actualRow.getField(2)); } } @@ -140,7 +144,8 @@ public void testUnPartitionedTable() throws Exception { sql("CREATE TABLE %s (id INT, data VARCHAR, dt VARCHAR)", TABLE); Table table = validationCatalog.loadTable(TableIdentifier.of(icebergNamespace, TABLE)); - TableResult result = exec("SELECT * FROM %s /*+ OPTIONS('streaming'='true', 'monitor-interval'='1s')*/", TABLE); + TableResult result = + exec("SELECT * FROM %s /*+ OPTIONS('streaming'='true', 'monitor-interval'='1s')*/", TABLE); try (CloseableIterator iterator = result.collect()) { Row row1 = Row.of(1, "aaa", "2021-01-01"); @@ -154,13 +159,13 @@ public void testUnPartitionedTable() throws Exception { result.getJobClient().ifPresent(JobClient::cancel); } - @Test public void testPartitionedTable() throws Exception { sql("CREATE TABLE %s (id INT, data VARCHAR, dt VARCHAR) PARTITIONED BY (dt)", TABLE); Table table = validationCatalog.loadTable(TableIdentifier.of(icebergNamespace, TABLE)); - TableResult result = exec("SELECT * FROM %s /*+ OPTIONS('streaming'='true', 'monitor-interval'='1s')*/", TABLE); + TableResult result = + exec("SELECT * FROM %s /*+ 
OPTIONS('streaming'='true', 'monitor-interval'='1s')*/", TABLE); try (CloseableIterator iterator = result.collect()) { Row row1 = Row.of(1, "aaa", "2021-01-01"); insertRows("2021-01-01", table, row1); @@ -190,7 +195,8 @@ public void testConsumeFromBeginning() throws Exception { Row row2 = Row.of(2, "bbb", "2021-01-01"); insertRows(table, row1, row2); - TableResult result = exec("SELECT * FROM %s /*+ OPTIONS('streaming'='true', 'monitor-interval'='1s')*/", TABLE); + TableResult result = + exec("SELECT * FROM %s /*+ OPTIONS('streaming'='true', 'monitor-interval'='1s')*/", TABLE); try (CloseableIterator iterator = result.collect()) { assertRows(ImmutableList.of(row1, row2), iterator); @@ -222,8 +228,11 @@ public void testConsumeFromStartSnapshotId() throws Exception { Row row4 = Row.of(4, "ddd", "2021-01-01"); insertRows(table, row3, row4); - TableResult result = exec("SELECT * FROM %s /*+ OPTIONS('streaming'='true', 'monitor-interval'='1s', " + - "'start-snapshot-id'='%d')*/", TABLE, startSnapshotId); + TableResult result = + exec( + "SELECT * FROM %s /*+ OPTIONS('streaming'='true', 'monitor-interval'='1s', " + + "'start-snapshot-id'='%d')*/", + TABLE, startSnapshotId); try (CloseableIterator iterator = result.collect()) { // The row2 in start snapshot will be excluded. assertRows(ImmutableList.of(row3, row4), iterator); diff --git a/flink/v1.14/flink/src/test/java/org/apache/iceberg/flink/source/TestStreamingMonitorFunction.java b/flink/v1.14/flink/src/test/java/org/apache/iceberg/flink/source/TestStreamingMonitorFunction.java index 1b1cf70a32e0..6f8789c92bc5 100644 --- a/flink/v1.14/flink/src/test/java/org/apache/iceberg/flink/source/TestStreamingMonitorFunction.java +++ b/flink/v1.14/flink/src/test/java/org/apache/iceberg/flink/source/TestStreamingMonitorFunction.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.flink.source; import java.io.File; @@ -57,19 +56,16 @@ @RunWith(Parameterized.class) public class TestStreamingMonitorFunction extends TableTestBase { - private static final Schema SCHEMA = new Schema( - Types.NestedField.required(1, "id", Types.IntegerType.get()), - Types.NestedField.required(2, "data", Types.StringType.get()) - ); + private static final Schema SCHEMA = + new Schema( + Types.NestedField.required(1, "id", Types.IntegerType.get()), + Types.NestedField.required(2, "data", Types.StringType.get())); private static final FileFormat DEFAULT_FORMAT = FileFormat.PARQUET; private static final long WAIT_TIME_MILLIS = 10 * 1000L; @Parameterized.Parameters(name = "FormatVersion={0}") public static Iterable parameters() { - return ImmutableList.of( - new Object[] {1}, - new Object[] {2} - ); + return ImmutableList.of(new Object[] {1}, new Object[] {2}); } public TestStreamingMonitorFunction(int formatVersion) { @@ -87,23 +83,24 @@ public void setupTable() throws IOException { table = create(SCHEMA, PartitionSpec.unpartitioned()); } - private void runSourceFunctionInTask(TestSourceContext sourceContext, StreamingMonitorFunction function) { - Thread task = new Thread(() -> { - try { - function.run(sourceContext); - } catch (Exception e) { - throw new RuntimeException(e); - } - }); + private void runSourceFunctionInTask( + TestSourceContext sourceContext, StreamingMonitorFunction function) { + Thread task = + new Thread( + () -> { + try { + function.run(sourceContext); + } catch (Exception e) { + throw new RuntimeException(e); + } + }); task.start(); } @Test public void testConsumeWithoutStartSnapshotId() throws Exception { List> recordsList = generateRecordsAndCommitTxn(10); - ScanContext scanContext = ScanContext.builder() - .monitorInterval(Duration.ofMillis(100)) - .build(); + ScanContext scanContext = ScanContext.builder().monitorInterval(Duration.ofMillis(100)).build(); StreamingMonitorFunction function = createFunction(scanContext); try (AbstractStreamOperatorTestHarness harness = createHarness(function)) { @@ -114,14 +111,16 @@ public void testConsumeWithoutStartSnapshotId() throws Exception { TestSourceContext sourceContext = new TestSourceContext(latch); runSourceFunctionInTask(sourceContext, function); - Assert.assertTrue("Should have expected elements.", latch.await(WAIT_TIME_MILLIS, TimeUnit.MILLISECONDS)); + Assert.assertTrue( + "Should have expected elements.", latch.await(WAIT_TIME_MILLIS, TimeUnit.MILLISECONDS)); Thread.sleep(1000L); // Stop the stream task. function.close(); Assert.assertEquals("Should produce the expected splits", 1, sourceContext.splits.size()); - TestHelpers.assertRecords(sourceContext.toRows(), Lists.newArrayList(Iterables.concat(recordsList)), SCHEMA); + TestHelpers.assertRecords( + sourceContext.toRows(), Lists.newArrayList(Iterables.concat(recordsList)), SCHEMA); } } @@ -134,10 +133,11 @@ public void testConsumeFromStartSnapshotId() throws Exception { // Commit the next five transactions. 
List> recordsList = generateRecordsAndCommitTxn(5); - ScanContext scanContext = ScanContext.builder() - .monitorInterval(Duration.ofMillis(100)) - .startSnapshotId(startSnapshotId) - .build(); + ScanContext scanContext = + ScanContext.builder() + .monitorInterval(Duration.ofMillis(100)) + .startSnapshotId(startSnapshotId) + .build(); StreamingMonitorFunction function = createFunction(scanContext); try (AbstractStreamOperatorTestHarness harness = createHarness(function)) { @@ -148,23 +148,23 @@ public void testConsumeFromStartSnapshotId() throws Exception { TestSourceContext sourceContext = new TestSourceContext(latch); runSourceFunctionInTask(sourceContext, function); - Assert.assertTrue("Should have expected elements.", latch.await(WAIT_TIME_MILLIS, TimeUnit.MILLISECONDS)); + Assert.assertTrue( + "Should have expected elements.", latch.await(WAIT_TIME_MILLIS, TimeUnit.MILLISECONDS)); Thread.sleep(1000L); // Stop the stream task. function.close(); Assert.assertEquals("Should produce the expected splits", 1, sourceContext.splits.size()); - TestHelpers.assertRecords(sourceContext.toRows(), Lists.newArrayList(Iterables.concat(recordsList)), SCHEMA); + TestHelpers.assertRecords( + sourceContext.toRows(), Lists.newArrayList(Iterables.concat(recordsList)), SCHEMA); } } @Test public void testCheckpointRestore() throws Exception { List> recordsList = generateRecordsAndCommitTxn(10); - ScanContext scanContext = ScanContext.builder() - .monitorInterval(Duration.ofMillis(100)) - .build(); + ScanContext scanContext = ScanContext.builder().monitorInterval(Duration.ofMillis(100)).build(); StreamingMonitorFunction func = createFunction(scanContext); OperatorSubtaskState state; @@ -176,7 +176,8 @@ public void testCheckpointRestore() throws Exception { TestSourceContext sourceContext = new TestSourceContext(latch); runSourceFunctionInTask(sourceContext, func); - Assert.assertTrue("Should have expected elements.", latch.await(WAIT_TIME_MILLIS, TimeUnit.MILLISECONDS)); + Assert.assertTrue( + "Should have expected elements.", latch.await(WAIT_TIME_MILLIS, TimeUnit.MILLISECONDS)); Thread.sleep(1000L); state = harness.snapshot(1, 1); @@ -185,7 +186,8 @@ public void testCheckpointRestore() throws Exception { func.close(); Assert.assertEquals("Should produce the expected splits", 1, sourceContext.splits.size()); - TestHelpers.assertRecords(sourceContext.toRows(), Lists.newArrayList(Iterables.concat(recordsList)), SCHEMA); + TestHelpers.assertRecords( + sourceContext.toRows(), Lists.newArrayList(Iterables.concat(recordsList)), SCHEMA); } List> newRecordsList = generateRecordsAndCommitTxn(10); @@ -200,44 +202,50 @@ public void testCheckpointRestore() throws Exception { TestSourceContext sourceContext = new TestSourceContext(latch); runSourceFunctionInTask(sourceContext, newFunc); - Assert.assertTrue("Should have expected elements.", latch.await(WAIT_TIME_MILLIS, TimeUnit.MILLISECONDS)); + Assert.assertTrue( + "Should have expected elements.", latch.await(WAIT_TIME_MILLIS, TimeUnit.MILLISECONDS)); Thread.sleep(1000L); // Stop the stream task. 
newFunc.close(); Assert.assertEquals("Should produce the expected splits", 1, sourceContext.splits.size()); - TestHelpers.assertRecords(sourceContext.toRows(), Lists.newArrayList(Iterables.concat(newRecordsList)), SCHEMA); + TestHelpers.assertRecords( + sourceContext.toRows(), Lists.newArrayList(Iterables.concat(newRecordsList)), SCHEMA); } } @Test public void testInvalidMaxPlanningSnapshotCount() { - ScanContext scanContext1 = ScanContext.builder() - .monitorInterval(Duration.ofMillis(100)) - .maxPlanningSnapshotCount(0) - .build(); - - AssertHelpers.assertThrows("Should throw exception because of invalid config", - IllegalArgumentException.class, "must be greater than zero", + ScanContext scanContext1 = + ScanContext.builder() + .monitorInterval(Duration.ofMillis(100)) + .maxPlanningSnapshotCount(0) + .build(); + + AssertHelpers.assertThrows( + "Should throw exception because of invalid config", + IllegalArgumentException.class, + "must be greater than zero", () -> { createFunction(scanContext1); return null; - } - ); - - ScanContext scanContext2 = ScanContext.builder() - .monitorInterval(Duration.ofMillis(100)) - .maxPlanningSnapshotCount(-10) - .build(); - - AssertHelpers.assertThrows("Should throw exception because of invalid config", - IllegalArgumentException.class, "must be greater than zero", + }); + + ScanContext scanContext2 = + ScanContext.builder() + .monitorInterval(Duration.ofMillis(100)) + .maxPlanningSnapshotCount(-10) + .build(); + + AssertHelpers.assertThrows( + "Should throw exception because of invalid config", + IllegalArgumentException.class, + "must be greater than zero", () -> { createFunction(scanContext2); return null; - } - ); + }); } @Test @@ -247,26 +255,29 @@ public void testConsumeWithMaxPlanningSnapshotCount() throws Exception { // Use the oldest snapshot as starting to avoid the initial case. 
long oldestSnapshotId = SnapshotUtil.oldestAncestor(table).snapshotId(); - ScanContext scanContext = ScanContext.builder() - .monitorInterval(Duration.ofMillis(100)) - .splitSize(1000L) - .startSnapshotId(oldestSnapshotId) - .maxPlanningSnapshotCount(Integer.MAX_VALUE) - .build(); + ScanContext scanContext = + ScanContext.builder() + .monitorInterval(Duration.ofMillis(100)) + .splitSize(1000L) + .startSnapshotId(oldestSnapshotId) + .maxPlanningSnapshotCount(Integer.MAX_VALUE) + .build(); - FlinkInputSplit[] expectedSplits = FlinkSplitPlanner - .planInputSplits(table, scanContext, ThreadPools.getWorkerPool()); + FlinkInputSplit[] expectedSplits = + FlinkSplitPlanner.planInputSplits(table, scanContext, ThreadPools.getWorkerPool()); Assert.assertEquals("should produce 9 splits", 9, expectedSplits.length); - // This covers three cases that maxPlanningSnapshotCount is less than, equal or greater than the total splits number + // This covers three cases that maxPlanningSnapshotCount is less than, equal or greater than the + // total splits number for (int maxPlanningSnapshotCount : ImmutableList.of(1, 9, 15)) { - scanContext = ScanContext.builder() - .monitorInterval(Duration.ofMillis(500)) - .startSnapshotId(oldestSnapshotId) - .splitSize(1000L) - .maxPlanningSnapshotCount(maxPlanningSnapshotCount) - .build(); + scanContext = + ScanContext.builder() + .monitorInterval(Duration.ofMillis(500)) + .startSnapshotId(oldestSnapshotId) + .splitSize(1000L) + .maxPlanningSnapshotCount(maxPlanningSnapshotCount) + .build(); StreamingMonitorFunction function = createFunction(scanContext); try (AbstractStreamOperatorTestHarness harness = createHarness(function)) { @@ -279,8 +290,10 @@ public void testConsumeWithMaxPlanningSnapshotCount() throws Exception { function.monitorAndForwardSplits(); if (maxPlanningSnapshotCount < 10) { - Assert.assertEquals("Should produce same splits as max-planning-snapshot-count", - maxPlanningSnapshotCount, sourceContext.splits.size()); + Assert.assertEquals( + "Should produce same splits as max-planning-snapshot-count", + maxPlanningSnapshotCount, + sourceContext.splits.size()); } } } @@ -304,12 +317,14 @@ private void writeRecords(List records) throws IOException { } private StreamingMonitorFunction createFunction(ScanContext scanContext) { - return new StreamingMonitorFunction(TestTableLoader.of(tableDir.getAbsolutePath()), scanContext); + return new StreamingMonitorFunction( + TestTableLoader.of(tableDir.getAbsolutePath()), scanContext); } - private AbstractStreamOperatorTestHarness createHarness(StreamingMonitorFunction function) - throws Exception { - StreamSource streamSource = new StreamSource<>(function); + private AbstractStreamOperatorTestHarness createHarness( + StreamingMonitorFunction function) throws Exception { + StreamSource streamSource = + new StreamSource<>(function); return new AbstractStreamOperatorTestHarness<>(streamSource, 1, 1, 0); } @@ -334,14 +349,10 @@ public void collectWithTimestamp(FlinkInputSplit element, long timestamp) { } @Override - public void emitWatermark(Watermark mark) { - - } + public void emitWatermark(Watermark mark) {} @Override - public void markAsTemporarilyIdle() { - - } + public void markAsTemporarilyIdle() {} @Override public Object getCheckpointLock() { @@ -349,14 +360,13 @@ public Object getCheckpointLock() { } @Override - public void close() { - - } + public void close() {} private List toRows() throws IOException { - FlinkInputFormat format = FlinkSource.forRowData() - 
.tableLoader(TestTableLoader.of(tableDir.getAbsolutePath())) - .buildFormat(); + FlinkInputFormat format = + FlinkSource.forRowData() + .tableLoader(TestTableLoader.of(tableDir.getAbsolutePath())) + .buildFormat(); List rows = Lists.newArrayList(); for (FlinkInputSplit split : splits) { diff --git a/flink/v1.14/flink/src/test/java/org/apache/iceberg/flink/source/TestStreamingReaderOperator.java b/flink/v1.14/flink/src/test/java/org/apache/iceberg/flink/source/TestStreamingReaderOperator.java index 0c3191136db9..e258a197edf3 100644 --- a/flink/v1.14/flink/src/test/java/org/apache/iceberg/flink/source/TestStreamingReaderOperator.java +++ b/flink/v1.14/flink/src/test/java/org/apache/iceberg/flink/source/TestStreamingReaderOperator.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.flink.source; import java.io.File; @@ -56,18 +55,15 @@ @RunWith(Parameterized.class) public class TestStreamingReaderOperator extends TableTestBase { - private static final Schema SCHEMA = new Schema( - Types.NestedField.required(1, "id", Types.IntegerType.get()), - Types.NestedField.required(2, "data", Types.StringType.get()) - ); + private static final Schema SCHEMA = + new Schema( + Types.NestedField.required(1, "id", Types.IntegerType.get()), + Types.NestedField.required(2, "data", Types.StringType.get())); private static final FileFormat DEFAULT_FORMAT = FileFormat.PARQUET; @Parameterized.Parameters(name = "FormatVersion={0}") public static Iterable parameters() { - return ImmutableList.of( - new Object[] {1}, - new Object[] {2} - ); + return ImmutableList.of(new Object[] {1}, new Object[] {2}); } public TestStreamingReaderOperator(int formatVersion) { @@ -115,7 +111,8 @@ public void testProcessAllRecords() throws Exception { @Test public void testTriggerCheckpoint() throws Exception { - // Received emitted splits: split1, split2, split3, checkpoint request is triggered when reading records from + // Received emitted splits: split1, split2, split3, checkpoint request is triggered when reading + // records from // split1. List> expectedRecords = generateRecordsAndCommitTxn(3); @@ -134,11 +131,11 @@ public void testTriggerCheckpoint() throws Exception { harness.processElement(splits.get(2), ++timestamp); // Trigger snapshot state, it will start to work once all records from split0 are read. - processor.getMainMailboxExecutor() - .execute(() -> harness.snapshot(1, 3), "Trigger snapshot"); + processor.getMainMailboxExecutor().execute(() -> harness.snapshot(1, 3), "Trigger snapshot"); Assert.assertTrue("Should have processed the split0", processor.runMailboxStep()); - Assert.assertTrue("Should have processed the snapshot state action", processor.runMailboxStep()); + Assert.assertTrue( + "Should have processed the snapshot state action", processor.runMailboxStep()); TestHelpers.assertRecords(readOutputValues(harness), expectedRecords.get(0), SCHEMA); @@ -148,8 +145,8 @@ public void testTriggerCheckpoint() throws Exception { // Read records from split2. 
Assert.assertTrue("Should have processed the split2", processor.runMailboxStep()); - TestHelpers.assertRecords(readOutputValues(harness), - Lists.newArrayList(Iterables.concat(expectedRecords)), SCHEMA); + TestHelpers.assertRecords( + readOutputValues(harness), Lists.newArrayList(Iterables.concat(expectedRecords)), SCHEMA); } } @@ -211,7 +208,8 @@ public void testCheckpointRestore() throws Exception { } } - private List readOutputValues(OneInputStreamOperatorTestHarness harness) { + private List readOutputValues( + OneInputStreamOperatorTestHarness harness) { List results = Lists.newArrayList(); for (RowData rowData : harness.extractOutputValues()) { results.add(Row.of(rowData.getInt(0), rowData.getString(1).toString())); @@ -244,33 +242,36 @@ private List generateSplits() { ScanContext scanContext; if (i == snapshotIds.size() - 1) { // Generate the splits from the first snapshot. - scanContext = ScanContext.builder() - .useSnapshotId(snapshotIds.get(i)) - .build(); + scanContext = ScanContext.builder().useSnapshotId(snapshotIds.get(i)).build(); } else { // Generate the splits between the previous snapshot and current snapshot. - scanContext = ScanContext.builder() - .startSnapshotId(snapshotIds.get(i + 1)) - .endSnapshotId(snapshotIds.get(i)) - .build(); + scanContext = + ScanContext.builder() + .startSnapshotId(snapshotIds.get(i + 1)) + .endSnapshotId(snapshotIds.get(i)) + .build(); } - Collections.addAll(inputSplits, FlinkSplitPlanner.planInputSplits( - table, scanContext, ThreadPools.getWorkerPool())); + Collections.addAll( + inputSplits, + FlinkSplitPlanner.planInputSplits(table, scanContext, ThreadPools.getWorkerPool())); } return inputSplits; } - private OneInputStreamOperatorTestHarness createReader() throws Exception { + private OneInputStreamOperatorTestHarness createReader() + throws Exception { // This input format is used to opening the emitted split. - FlinkInputFormat inputFormat = FlinkSource.forRowData() - .tableLoader(TestTableLoader.of(tableDir.getAbsolutePath())) - .buildFormat(); - - OneInputStreamOperatorFactory factory = StreamingReaderOperator.factory(inputFormat); - OneInputStreamOperatorTestHarness harness = new OneInputStreamOperatorTestHarness<>( - factory, 1, 1, 0); + FlinkInputFormat inputFormat = + FlinkSource.forRowData() + .tableLoader(TestTableLoader.of(tableDir.getAbsolutePath())) + .buildFormat(); + + OneInputStreamOperatorFactory factory = + StreamingReaderOperator.factory(inputFormat); + OneInputStreamOperatorTestHarness harness = + new OneInputStreamOperatorTestHarness<>(factory, 1, 1, 0); harness.getStreamConfig().setTimeCharacteristic(TimeCharacteristic.ProcessingTime); return harness; diff --git a/flink/v1.14/flink/src/test/java/org/apache/iceberg/flink/source/assigner/TestSimpleSplitAssigner.java b/flink/v1.14/flink/src/test/java/org/apache/iceberg/flink/source/assigner/TestSimpleSplitAssigner.java index bfe1c6bab2c4..ee6c5cc3a6c8 100644 --- a/flink/v1.14/flink/src/test/java/org/apache/iceberg/flink/source/assigner/TestSimpleSplitAssigner.java +++ b/flink/v1.14/flink/src/test/java/org/apache/iceberg/flink/source/assigner/TestSimpleSplitAssigner.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.flink.source.assigner; import java.util.Collection; @@ -32,8 +31,7 @@ import org.junit.rules.TemporaryFolder; public class TestSimpleSplitAssigner { - @ClassRule - public static final TemporaryFolder TEMPORARY_FOLDER = new TemporaryFolder(); + @ClassRule public static final TemporaryFolder TEMPORARY_FOLDER = new TemporaryFolder(); @Test public void testEmptyInitialization() { @@ -41,19 +39,17 @@ public void testEmptyInitialization() { assertGetNext(assigner, GetSplitResult.Status.UNAVAILABLE); } - /** - * Test a sequence of interactions for StaticEnumerator - */ + /** Test a sequence of interactions for StaticEnumerator */ @Test public void testStaticEnumeratorSequence() throws Exception { SimpleSplitAssigner assigner = new SimpleSplitAssigner(); - assigner.onDiscoveredSplits(SplitHelpers.createSplitsFromTransientHadoopTable( - TEMPORARY_FOLDER, 4, 2)); + assigner.onDiscoveredSplits( + SplitHelpers.createSplitsFromTransientHadoopTable(TEMPORARY_FOLDER, 4, 2)); assertGetNext(assigner, GetSplitResult.Status.AVAILABLE); assertSnapshot(assigner, 1); - assigner.onUnassignedSplits(SplitHelpers.createSplitsFromTransientHadoopTable( - TEMPORARY_FOLDER, 1, 1)); + assigner.onUnassignedSplits( + SplitHelpers.createSplitsFromTransientHadoopTable(TEMPORARY_FOLDER, 1, 1)); assertSnapshot(assigner, 2); assertGetNext(assigner, GetSplitResult.Status.AVAILABLE); @@ -62,20 +58,21 @@ public void testStaticEnumeratorSequence() throws Exception { assertSnapshot(assigner, 0); } - /** - * Test a sequence of interactions for ContinuousEnumerator - */ + /** Test a sequence of interactions for ContinuousEnumerator */ @Test public void testContinuousEnumeratorSequence() throws Exception { SimpleSplitAssigner assigner = new SimpleSplitAssigner(); assertGetNext(assigner, GetSplitResult.Status.UNAVAILABLE); - List splits1 = SplitHelpers.createSplitsFromTransientHadoopTable(TEMPORARY_FOLDER, 1, 1); + List splits1 = + SplitHelpers.createSplitsFromTransientHadoopTable(TEMPORARY_FOLDER, 1, 1); assertAvailableFuture(assigner, 1, () -> assigner.onDiscoveredSplits(splits1)); - List splits2 = SplitHelpers.createSplitsFromTransientHadoopTable(TEMPORARY_FOLDER, 1, 1); + List splits2 = + SplitHelpers.createSplitsFromTransientHadoopTable(TEMPORARY_FOLDER, 1, 1); assertAvailableFuture(assigner, 1, () -> assigner.onUnassignedSplits(splits2)); - assigner.onDiscoveredSplits(SplitHelpers.createSplitsFromTransientHadoopTable(TEMPORARY_FOLDER, 2, 1)); + assigner.onDiscoveredSplits( + SplitHelpers.createSplitsFromTransientHadoopTable(TEMPORARY_FOLDER, 2, 1)); assertSnapshot(assigner, 2); assertGetNext(assigner, GetSplitResult.Status.AVAILABLE); assertGetNext(assigner, GetSplitResult.Status.AVAILABLE); @@ -83,7 +80,8 @@ public void testContinuousEnumeratorSequence() throws Exception { assertSnapshot(assigner, 0); } - private void assertAvailableFuture(SimpleSplitAssigner assigner, int splitCount, Runnable addSplitsRunnable) { + private void assertAvailableFuture( + SimpleSplitAssigner assigner, int splitCount, Runnable addSplitsRunnable) { // register callback AtomicBoolean futureCompleted = new AtomicBoolean(); CompletableFuture future = assigner.isAvailable(); diff --git a/flink/v1.14/flink/src/test/java/org/apache/iceberg/flink/source/enumerator/ManualContinuousSplitPlanner.java b/flink/v1.14/flink/src/test/java/org/apache/iceberg/flink/source/enumerator/ManualContinuousSplitPlanner.java index f1db8ef5d6ad..e6610acbc19f 100644 --- 
a/flink/v1.14/flink/src/test/java/org/apache/iceberg/flink/source/enumerator/ManualContinuousSplitPlanner.java +++ b/flink/v1.14/flink/src/test/java/org/apache/iceberg/flink/source/enumerator/ManualContinuousSplitPlanner.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.flink.source.enumerator; import java.io.IOException; @@ -31,27 +30,22 @@ class ManualContinuousSplitPlanner implements ContinuousSplitPlanner { @Override public ContinuousEnumerationResult planSplits(IcebergEnumeratorPosition lastPosition) { - ContinuousEnumerationResult result = new ContinuousEnumerationResult( - Lists.newArrayList(splits), lastPosition, latestPosition); + ContinuousEnumerationResult result = + new ContinuousEnumerationResult(Lists.newArrayList(splits), lastPosition, latestPosition); return result; } - /** - * Add new splits to the collection - */ + /** Add new splits to the collection */ public void addSplits(List newSplits, IcebergEnumeratorPosition newPosition) { splits.addAll(newSplits); this.latestPosition = newPosition; } - /** - * Clear the splits collection - */ + /** Clear the splits collection */ public void clearSplits() { splits.clear(); } @Override - public void close() throws IOException { - } + public void close() throws IOException {} } diff --git a/flink/v1.14/flink/src/test/java/org/apache/iceberg/flink/source/enumerator/TestContinuousIcebergEnumerator.java b/flink/v1.14/flink/src/test/java/org/apache/iceberg/flink/source/enumerator/TestContinuousIcebergEnumerator.java index 5e7f926e3aee..aad2769af010 100644 --- a/flink/v1.14/flink/src/test/java/org/apache/iceberg/flink/source/enumerator/TestContinuousIcebergEnumerator.java +++ b/flink/v1.14/flink/src/test/java/org/apache/iceberg/flink/source/enumerator/TestContinuousIcebergEnumerator.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.flink.source.enumerator; import java.util.Collection; @@ -41,26 +40,28 @@ import org.junit.rules.TemporaryFolder; public class TestContinuousIcebergEnumerator { - @ClassRule - public static final TemporaryFolder TEMPORARY_FOLDER = new TemporaryFolder(); + @ClassRule public static final TemporaryFolder TEMPORARY_FOLDER = new TemporaryFolder(); @Test public void testDiscoverSplitWhenNoReaderRegistered() throws Exception { ManualContinuousSplitPlanner splitPlanner = new ManualContinuousSplitPlanner(); TestingSplitEnumeratorContext enumeratorContext = new TestingSplitEnumeratorContext<>(4); - ScanContext scanContext = ScanContext.builder() - .streaming(true) - .startingStrategy(StreamingStartingStrategy.TABLE_SCAN_THEN_INCREMENTAL) - .build(); - ContinuousIcebergEnumerator enumerator = createEnumerator(enumeratorContext, scanContext, splitPlanner); + ScanContext scanContext = + ScanContext.builder() + .streaming(true) + .startingStrategy(StreamingStartingStrategy.TABLE_SCAN_THEN_INCREMENTAL) + .build(); + ContinuousIcebergEnumerator enumerator = + createEnumerator(enumeratorContext, scanContext, splitPlanner); - Collection pendingSplitsEmpty = enumerator.snapshotState(1).pendingSplits(); + Collection pendingSplitsEmpty = + enumerator.snapshotState(1).pendingSplits(); Assert.assertEquals(0, pendingSplitsEmpty.size()); // make one split available and trigger the periodic discovery - List splits = SplitHelpers - .createSplitsFromTransientHadoopTable(TEMPORARY_FOLDER, 1, 1); + List splits = + SplitHelpers.createSplitsFromTransientHadoopTable(TEMPORARY_FOLDER, 1, 1); splitPlanner.addSplits(splits, IcebergEnumeratorPosition.of(1L, 1L)); enumeratorContext.triggerAllActions(); @@ -76,26 +77,28 @@ public void testDiscoverWhenReaderRegistered() throws Exception { ManualContinuousSplitPlanner splitPlanner = new ManualContinuousSplitPlanner(); TestingSplitEnumeratorContext enumeratorContext = new TestingSplitEnumeratorContext<>(4); - ScanContext scanContext = ScanContext.builder() - .streaming(true) - .startingStrategy(StreamingStartingStrategy.TABLE_SCAN_THEN_INCREMENTAL) - .build(); - ContinuousIcebergEnumerator enumerator = createEnumerator(enumeratorContext, scanContext, splitPlanner); + ScanContext scanContext = + ScanContext.builder() + .streaming(true) + .startingStrategy(StreamingStartingStrategy.TABLE_SCAN_THEN_INCREMENTAL) + .build(); + ContinuousIcebergEnumerator enumerator = + createEnumerator(enumeratorContext, scanContext, splitPlanner); // register one reader, and let it request a split enumeratorContext.registerReader(2, "localhost"); enumerator.addReader(2); - enumerator.handleSourceEvent(2, - new SplitRequestEvent()); + enumerator.handleSourceEvent(2, new SplitRequestEvent()); // make one split available and trigger the periodic discovery - List splits = SplitHelpers - .createSplitsFromTransientHadoopTable(TEMPORARY_FOLDER, 1, 1); + List splits = + SplitHelpers.createSplitsFromTransientHadoopTable(TEMPORARY_FOLDER, 1, 1); splitPlanner.addSplits(splits, IcebergEnumeratorPosition.of(1L, 1L)); enumeratorContext.triggerAllActions(); Assert.assertTrue(enumerator.snapshotState(1).pendingSplits().isEmpty()); - MatcherAssert.assertThat(enumeratorContext.getSplitAssignments().get(2).getAssignedSplits(), + MatcherAssert.assertThat( + enumeratorContext.getSplitAssignments().get(2).getAssignedSplits(), CoreMatchers.hasItem(splits.get(0))); } @@ -104,44 +107,46 @@ public void testRequestingReaderUnavailableWhenSplitDiscovered() throws Exceptio 
ManualContinuousSplitPlanner splitPlanner = new ManualContinuousSplitPlanner(); TestingSplitEnumeratorContext enumeratorContext = new TestingSplitEnumeratorContext<>(4); - ScanContext config = ScanContext.builder() - .streaming(true) - .startingStrategy(StreamingStartingStrategy.TABLE_SCAN_THEN_INCREMENTAL) - .build(); - ContinuousIcebergEnumerator enumerator = createEnumerator(enumeratorContext, config, splitPlanner); + ScanContext config = + ScanContext.builder() + .streaming(true) + .startingStrategy(StreamingStartingStrategy.TABLE_SCAN_THEN_INCREMENTAL) + .build(); + ContinuousIcebergEnumerator enumerator = + createEnumerator(enumeratorContext, config, splitPlanner); // register one reader, and let it request a split enumeratorContext.registerReader(2, "localhost"); enumerator.addReader(2); - enumerator.handleSourceEvent(2, - new SplitRequestEvent()); + enumerator.handleSourceEvent(2, new SplitRequestEvent()); // remove the reader (like in a failure) enumeratorContext.registeredReaders().remove(2); // make one split available and trigger the periodic discovery - List splits = SplitHelpers - .createSplitsFromTransientHadoopTable(TEMPORARY_FOLDER, 1, 1); + List splits = + SplitHelpers.createSplitsFromTransientHadoopTable(TEMPORARY_FOLDER, 1, 1); Assert.assertEquals(1, splits.size()); splitPlanner.addSplits(splits, IcebergEnumeratorPosition.of(1L, 1L)); enumeratorContext.triggerAllActions(); Assert.assertFalse(enumeratorContext.getSplitAssignments().containsKey(2)); - List pendingSplitIds = enumerator.snapshotState(1).pendingSplits().stream() - .map(IcebergSourceSplitState::split) - .map(IcebergSourceSplit::splitId) - .collect(Collectors.toList()); + List pendingSplitIds = + enumerator.snapshotState(1).pendingSplits().stream() + .map(IcebergSourceSplitState::split) + .map(IcebergSourceSplit::splitId) + .collect(Collectors.toList()); Assert.assertEquals(splits.size(), pendingSplitIds.size()); Assert.assertEquals(splits.get(0).splitId(), pendingSplitIds.get(0)); // register the reader again, and let it request a split enumeratorContext.registerReader(2, "localhost"); enumerator.addReader(2); - enumerator.handleSourceEvent(2, - new SplitRequestEvent()); + enumerator.handleSourceEvent(2, new SplitRequestEvent()); Assert.assertTrue(enumerator.snapshotState(2).pendingSplits().isEmpty()); - MatcherAssert.assertThat(enumeratorContext.getSplitAssignments().get(2).getAssignedSplits(), + MatcherAssert.assertThat( + enumeratorContext.getSplitAssignments().get(2).getAssignedSplits(), CoreMatchers.hasItem(splits.get(0))); } @@ -160,5 +165,4 @@ private static ContinuousIcebergEnumerator createEnumerator( enumerator.start(); return enumerator; } - } diff --git a/flink/v1.14/flink/src/test/java/org/apache/iceberg/flink/source/enumerator/TestContinuousSplitPlannerImpl.java b/flink/v1.14/flink/src/test/java/org/apache/iceberg/flink/source/enumerator/TestContinuousSplitPlannerImpl.java index bf794ab74c89..63fc53341f1e 100644 --- a/flink/v1.14/flink/src/test/java/org/apache/iceberg/flink/source/enumerator/TestContinuousSplitPlannerImpl.java +++ b/flink/v1.14/flink/src/test/java/org/apache/iceberg/flink/source/enumerator/TestContinuousSplitPlannerImpl.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.flink.source.enumerator; import java.io.IOException; @@ -48,18 +47,17 @@ import org.junit.rules.TestName; public class TestContinuousSplitPlannerImpl { - @ClassRule - public static final TemporaryFolder TEMPORARY_FOLDER = new TemporaryFolder(); + @ClassRule public static final TemporaryFolder TEMPORARY_FOLDER = new TemporaryFolder(); private static final FileFormat fileFormat = FileFormat.PARQUET; private static final AtomicLong randomSeed = new AtomicLong(); @Rule - public final HadoopTableResource tableResource = new HadoopTableResource(TEMPORARY_FOLDER, - TestFixtures.DATABASE, TestFixtures.TABLE, TestFixtures.SCHEMA); + public final HadoopTableResource tableResource = + new HadoopTableResource( + TEMPORARY_FOLDER, TestFixtures.DATABASE, TestFixtures.TABLE, TestFixtures.SCHEMA); - @Rule - public TestName testName = new TestName(); + @Rule public TestName testName = new TestName(); private GenericAppenderHelper dataAppender; private DataFile dataFile1; @@ -86,36 +84,40 @@ private void appendTwoSnapshots() throws IOException { snapshot2 = tableResource.table().currentSnapshot(); } - /** - * @return the last enumerated snapshot id - */ + /** @return the last enumerated snapshot id */ private IcebergEnumeratorPosition verifyOneCycle( - ContinuousSplitPlannerImpl splitPlanner, IcebergEnumeratorPosition lastPosition) throws Exception { - List batch = RandomGenericData.generate(TestFixtures.SCHEMA, 2, randomSeed.incrementAndGet()); + ContinuousSplitPlannerImpl splitPlanner, IcebergEnumeratorPosition lastPosition) + throws Exception { + List batch = + RandomGenericData.generate(TestFixtures.SCHEMA, 2, randomSeed.incrementAndGet()); DataFile dataFile = dataAppender.writeFile(null, batch); dataAppender.appendToTable(dataFile); Snapshot snapshot = tableResource.table().currentSnapshot(); ContinuousEnumerationResult result = splitPlanner.planSplits(lastPosition); Assert.assertEquals(lastPosition.snapshotId(), result.fromPosition().snapshotId()); - Assert.assertEquals(lastPosition.snapshotTimestampMs(), result.fromPosition().snapshotTimestampMs()); + Assert.assertEquals( + lastPosition.snapshotTimestampMs(), result.fromPosition().snapshotTimestampMs()); Assert.assertEquals(snapshot.snapshotId(), result.toPosition().snapshotId().longValue()); - Assert.assertEquals(snapshot.timestampMillis(), result.toPosition().snapshotTimestampMs().longValue()); + Assert.assertEquals( + snapshot.timestampMillis(), result.toPosition().snapshotTimestampMs().longValue()); Assert.assertEquals(1, result.splits().size()); IcebergSourceSplit split = Iterables.getOnlyElement(result.splits()); Assert.assertEquals(1, split.task().files().size()); - Assert.assertEquals(dataFile.path().toString(), + Assert.assertEquals( + dataFile.path().toString(), Iterables.getOnlyElement(split.task().files()).file().path().toString()); return result.toPosition(); } @Test public void testTableScanThenIncrementalWithEmptyTable() throws Exception { - ScanContext scanContext = ScanContext.builder() - .startingStrategy(StreamingStartingStrategy.TABLE_SCAN_THEN_INCREMENTAL) - .build(); - ContinuousSplitPlannerImpl splitPlanner = new ContinuousSplitPlannerImpl( - tableResource.table(), scanContext, null); + ScanContext scanContext = + ScanContext.builder() + .startingStrategy(StreamingStartingStrategy.TABLE_SCAN_THEN_INCREMENTAL) + .build(); + ContinuousSplitPlannerImpl splitPlanner = + new ContinuousSplitPlannerImpl(tableResource.table(), scanContext, null); ContinuousEnumerationResult 
emptyTableInitialDiscoveryResult = splitPlanner.planSplits(null); Assert.assertTrue(emptyTableInitialDiscoveryResult.splits().isEmpty()); @@ -123,8 +125,8 @@ public void testTableScanThenIncrementalWithEmptyTable() throws Exception { Assert.assertTrue(emptyTableInitialDiscoveryResult.toPosition().isEmpty()); Assert.assertNull(emptyTableInitialDiscoveryResult.toPosition().snapshotTimestampMs()); - ContinuousEnumerationResult emptyTableSecondDiscoveryResult = splitPlanner - .planSplits(emptyTableInitialDiscoveryResult.toPosition()); + ContinuousEnumerationResult emptyTableSecondDiscoveryResult = + splitPlanner.planSplits(emptyTableInitialDiscoveryResult.toPosition()); Assert.assertTrue(emptyTableSecondDiscoveryResult.splits().isEmpty()); Assert.assertTrue(emptyTableSecondDiscoveryResult.fromPosition().isEmpty()); Assert.assertNull(emptyTableSecondDiscoveryResult.fromPosition().snapshotTimestampMs()); @@ -142,23 +144,28 @@ public void testTableScanThenIncrementalWithEmptyTable() throws Exception { public void testTableScanThenIncrementalWithNonEmptyTable() throws Exception { appendTwoSnapshots(); - ScanContext scanContext = ScanContext.builder() - .startingStrategy(StreamingStartingStrategy.TABLE_SCAN_THEN_INCREMENTAL) - .build(); - ContinuousSplitPlannerImpl splitPlanner = new ContinuousSplitPlannerImpl( - tableResource.table(), scanContext, null); + ScanContext scanContext = + ScanContext.builder() + .startingStrategy(StreamingStartingStrategy.TABLE_SCAN_THEN_INCREMENTAL) + .build(); + ContinuousSplitPlannerImpl splitPlanner = + new ContinuousSplitPlannerImpl(tableResource.table(), scanContext, null); ContinuousEnumerationResult initialResult = splitPlanner.planSplits(null); Assert.assertNull(initialResult.fromPosition()); - Assert.assertEquals(snapshot2.snapshotId(), initialResult.toPosition().snapshotId().longValue()); - Assert.assertEquals(snapshot2.timestampMillis(), initialResult.toPosition().snapshotTimestampMs().longValue()); + Assert.assertEquals( + snapshot2.snapshotId(), initialResult.toPosition().snapshotId().longValue()); + Assert.assertEquals( + snapshot2.timestampMillis(), initialResult.toPosition().snapshotTimestampMs().longValue()); Assert.assertEquals(1, initialResult.splits().size()); IcebergSourceSplit split = Iterables.getOnlyElement(initialResult.splits()); Assert.assertEquals(2, split.task().files().size()); - Set discoveredFiles = split.task().files().stream() - .map(fileScanTask -> fileScanTask.file().path().toString()) - .collect(Collectors.toSet()); - Set expectedFiles = ImmutableSet.of(dataFile1.path().toString(), dataFile2.path().toString()); + Set discoveredFiles = + split.task().files().stream() + .map(fileScanTask -> fileScanTask.file().path().toString()) + .collect(Collectors.toSet()); + Set expectedFiles = + ImmutableSet.of(dataFile1.path().toString(), dataFile2.path().toString()); Assert.assertEquals(expectedFiles, discoveredFiles); IcebergEnumeratorPosition lastPosition = initialResult.toPosition(); @@ -169,12 +176,13 @@ public void testTableScanThenIncrementalWithNonEmptyTable() throws Exception { @Test public void testIncrementalFromLatestSnapshotWithEmptyTable() throws Exception { - ScanContext scanContext = ScanContext.builder() - .startingStrategy(StreamingStartingStrategy.INCREMENTAL_FROM_LATEST_SNAPSHOT) - .splitSize(1L) - .build(); - ContinuousSplitPlannerImpl splitPlanner = new ContinuousSplitPlannerImpl( - tableResource.table(), scanContext, null); + ScanContext scanContext = + ScanContext.builder() + 
.startingStrategy(StreamingStartingStrategy.INCREMENTAL_FROM_LATEST_SNAPSHOT) + .splitSize(1L) + .build(); + ContinuousSplitPlannerImpl splitPlanner = + new ContinuousSplitPlannerImpl(tableResource.table(), scanContext, null); ContinuousEnumerationResult emptyTableInitialDiscoveryResult = splitPlanner.planSplits(null); Assert.assertTrue(emptyTableInitialDiscoveryResult.splits().isEmpty()); @@ -182,8 +190,8 @@ public void testIncrementalFromLatestSnapshotWithEmptyTable() throws Exception { Assert.assertTrue(emptyTableInitialDiscoveryResult.toPosition().isEmpty()); Assert.assertNull(emptyTableInitialDiscoveryResult.toPosition().snapshotTimestampMs()); - ContinuousEnumerationResult emptyTableSecondDiscoveryResult = splitPlanner - .planSplits(emptyTableInitialDiscoveryResult.toPosition()); + ContinuousEnumerationResult emptyTableSecondDiscoveryResult = + splitPlanner.planSplits(emptyTableInitialDiscoveryResult.toPosition()); Assert.assertTrue(emptyTableSecondDiscoveryResult.splits().isEmpty()); Assert.assertTrue(emptyTableSecondDiscoveryResult.fromPosition().isEmpty()); Assert.assertNull(emptyTableSecondDiscoveryResult.fromPosition().snapshotTimestampMs()); @@ -192,8 +200,8 @@ public void testIncrementalFromLatestSnapshotWithEmptyTable() throws Exception { // latest mode should discover both snapshots, as latest position is marked by when job starts appendTwoSnapshots(); - ContinuousEnumerationResult afterTwoSnapshotsAppended = splitPlanner - .planSplits(emptyTableSecondDiscoveryResult.toPosition()); + ContinuousEnumerationResult afterTwoSnapshotsAppended = + splitPlanner.planSplits(emptyTableSecondDiscoveryResult.toPosition()); Assert.assertEquals(2, afterTwoSnapshotsAppended.splits().size()); // next 3 snapshots @@ -207,30 +215,37 @@ public void testIncrementalFromLatestSnapshotWithEmptyTable() throws Exception { public void testIncrementalFromLatestSnapshotWithNonEmptyTable() throws Exception { appendTwoSnapshots(); - ScanContext scanContext = ScanContext.builder() - .startingStrategy(StreamingStartingStrategy.INCREMENTAL_FROM_LATEST_SNAPSHOT) - .build(); - ContinuousSplitPlannerImpl splitPlanner = new ContinuousSplitPlannerImpl( - tableResource.table(), scanContext, null); + ScanContext scanContext = + ScanContext.builder() + .startingStrategy(StreamingStartingStrategy.INCREMENTAL_FROM_LATEST_SNAPSHOT) + .build(); + ContinuousSplitPlannerImpl splitPlanner = + new ContinuousSplitPlannerImpl(tableResource.table(), scanContext, null); ContinuousEnumerationResult initialResult = splitPlanner.planSplits(null); Assert.assertNull(initialResult.fromPosition()); // For inclusive behavior, the initial result should point to snapshot1 // Then the next incremental scan shall discover files from latest snapshot2 (inclusive) - Assert.assertEquals(snapshot1.snapshotId(), initialResult.toPosition().snapshotId().longValue()); - Assert.assertEquals(snapshot1.timestampMillis(), initialResult.toPosition().snapshotTimestampMs().longValue()); + Assert.assertEquals( + snapshot1.snapshotId(), initialResult.toPosition().snapshotId().longValue()); + Assert.assertEquals( + snapshot1.timestampMillis(), initialResult.toPosition().snapshotTimestampMs().longValue()); Assert.assertEquals(0, initialResult.splits().size()); ContinuousEnumerationResult secondResult = splitPlanner.planSplits(initialResult.toPosition()); - Assert.assertEquals(snapshot1.snapshotId(), secondResult.fromPosition().snapshotId().longValue()); - Assert.assertEquals(snapshot1.timestampMillis(), 
secondResult.fromPosition().snapshotTimestampMs().longValue()); + Assert.assertEquals( + snapshot1.snapshotId(), secondResult.fromPosition().snapshotId().longValue()); + Assert.assertEquals( + snapshot1.timestampMillis(), secondResult.fromPosition().snapshotTimestampMs().longValue()); Assert.assertEquals(snapshot2.snapshotId(), secondResult.toPosition().snapshotId().longValue()); - Assert.assertEquals(snapshot2.timestampMillis(), secondResult.toPosition().snapshotTimestampMs().longValue()); + Assert.assertEquals( + snapshot2.timestampMillis(), secondResult.toPosition().snapshotTimestampMs().longValue()); IcebergSourceSplit split = Iterables.getOnlyElement(secondResult.splits()); Assert.assertEquals(1, split.task().files().size()); - Set discoveredFiles = split.task().files().stream() - .map(fileScanTask -> fileScanTask.file().path().toString()) - .collect(Collectors.toSet()); + Set discoveredFiles = + split.task().files().stream() + .map(fileScanTask -> fileScanTask.file().path().toString()) + .collect(Collectors.toSet()); // should discover dataFile2 appended in snapshot2 Set expectedFiles = ImmutableSet.of(dataFile2.path().toString()); Assert.assertEquals(expectedFiles, discoveredFiles); @@ -243,11 +258,12 @@ public void testIncrementalFromLatestSnapshotWithNonEmptyTable() throws Exceptio @Test public void testIncrementalFromEarliestSnapshotWithEmptyTable() throws Exception { - ScanContext scanContext = ScanContext.builder() - .startingStrategy(StreamingStartingStrategy.INCREMENTAL_FROM_EARLIEST_SNAPSHOT) - .build(); - ContinuousSplitPlannerImpl splitPlanner = new ContinuousSplitPlannerImpl( - tableResource.table(), scanContext, null); + ScanContext scanContext = + ScanContext.builder() + .startingStrategy(StreamingStartingStrategy.INCREMENTAL_FROM_EARLIEST_SNAPSHOT) + .build(); + ContinuousSplitPlannerImpl splitPlanner = + new ContinuousSplitPlannerImpl(tableResource.table(), scanContext, null); ContinuousEnumerationResult emptyTableInitialDiscoveryResult = splitPlanner.planSplits(null); Assert.assertTrue(emptyTableInitialDiscoveryResult.splits().isEmpty()); @@ -255,8 +271,8 @@ public void testIncrementalFromEarliestSnapshotWithEmptyTable() throws Exception Assert.assertNull(emptyTableInitialDiscoveryResult.toPosition().snapshotId()); Assert.assertNull(emptyTableInitialDiscoveryResult.toPosition().snapshotTimestampMs()); - ContinuousEnumerationResult emptyTableSecondDiscoveryResult = splitPlanner - .planSplits(emptyTableInitialDiscoveryResult.toPosition()); + ContinuousEnumerationResult emptyTableSecondDiscoveryResult = + splitPlanner.planSplits(emptyTableInitialDiscoveryResult.toPosition()); Assert.assertTrue(emptyTableSecondDiscoveryResult.splits().isEmpty()); Assert.assertNull(emptyTableSecondDiscoveryResult.fromPosition().snapshotId()); Assert.assertNull(emptyTableSecondDiscoveryResult.fromPosition().snapshotTimestampMs()); @@ -274,11 +290,12 @@ public void testIncrementalFromEarliestSnapshotWithEmptyTable() throws Exception public void testIncrementalFromEarliestSnapshotWithNonEmptyTable() throws Exception { appendTwoSnapshots(); - ScanContext scanContext = ScanContext.builder() - .startingStrategy(StreamingStartingStrategy.INCREMENTAL_FROM_EARLIEST_SNAPSHOT) - .build(); - ContinuousSplitPlannerImpl splitPlanner = new ContinuousSplitPlannerImpl( - tableResource.table(), scanContext, null); + ScanContext scanContext = + ScanContext.builder() + .startingStrategy(StreamingStartingStrategy.INCREMENTAL_FROM_EARLIEST_SNAPSHOT) + .build(); + ContinuousSplitPlannerImpl splitPlanner = + new 
ContinuousSplitPlannerImpl(tableResource.table(), scanContext, null); ContinuousEnumerationResult initialResult = splitPlanner.planSplits(null); Assert.assertNull(initialResult.fromPosition()); @@ -292,14 +309,17 @@ public void testIncrementalFromEarliestSnapshotWithNonEmptyTable() throws Except Assert.assertNull(secondResult.fromPosition().snapshotId()); Assert.assertNull(secondResult.fromPosition().snapshotTimestampMs()); Assert.assertEquals(snapshot2.snapshotId(), secondResult.toPosition().snapshotId().longValue()); - Assert.assertEquals(snapshot2.timestampMillis(), secondResult.toPosition().snapshotTimestampMs().longValue()); + Assert.assertEquals( + snapshot2.timestampMillis(), secondResult.toPosition().snapshotTimestampMs().longValue()); IcebergSourceSplit split = Iterables.getOnlyElement(secondResult.splits()); Assert.assertEquals(2, split.task().files().size()); - Set discoveredFiles = split.task().files().stream() - .map(fileScanTask -> fileScanTask.file().path().toString()) - .collect(Collectors.toSet()); + Set discoveredFiles = + split.task().files().stream() + .map(fileScanTask -> fileScanTask.file().path().toString()) + .collect(Collectors.toSet()); // should discover files appended in both snapshot1 and snapshot2 - Set expectedFiles = ImmutableSet.of(dataFile1.path().toString(), dataFile2.path().toString()); + Set expectedFiles = + ImmutableSet.of(dataFile1.path().toString(), dataFile2.path().toString()); Assert.assertEquals(expectedFiles, discoveredFiles); IcebergEnumeratorPosition lastPosition = secondResult.toPosition(); @@ -310,14 +330,17 @@ public void testIncrementalFromEarliestSnapshotWithNonEmptyTable() throws Except @Test public void testIncrementalFromSnapshotIdWithEmptyTable() throws Exception { - ScanContext scanContextWithInvalidSnapshotId = ScanContext.builder() - .startingStrategy(StreamingStartingStrategy.INCREMENTAL_FROM_SNAPSHOT_ID) - .startSnapshotId(1L) - .build(); - ContinuousSplitPlannerImpl splitPlanner = new ContinuousSplitPlannerImpl( - tableResource.table(), scanContextWithInvalidSnapshotId, null); - - AssertHelpers.assertThrows("Should detect invalid starting snapshot id", + ScanContext scanContextWithInvalidSnapshotId = + ScanContext.builder() + .startingStrategy(StreamingStartingStrategy.INCREMENTAL_FROM_SNAPSHOT_ID) + .startSnapshotId(1L) + .build(); + ContinuousSplitPlannerImpl splitPlanner = + new ContinuousSplitPlannerImpl( + tableResource.table(), scanContextWithInvalidSnapshotId, null); + + AssertHelpers.assertThrows( + "Should detect invalid starting snapshot id", IllegalArgumentException.class, "Start snapshot id not found in history: 1", () -> splitPlanner.planSplits(null)); @@ -329,19 +352,23 @@ public void testIncrementalFromSnapshotIdWithInvalidIds() throws Exception { // find an invalid snapshotId long invalidSnapshotId = 0L; - while (invalidSnapshotId == snapshot1.snapshotId() || invalidSnapshotId == snapshot2.snapshotId()) { + while (invalidSnapshotId == snapshot1.snapshotId() + || invalidSnapshotId == snapshot2.snapshotId()) { invalidSnapshotId++; } - ScanContext scanContextWithInvalidSnapshotId = ScanContext.builder() - .startingStrategy(StreamingStartingStrategy.INCREMENTAL_FROM_SNAPSHOT_ID) - .startSnapshotId(invalidSnapshotId) - .build(); + ScanContext scanContextWithInvalidSnapshotId = + ScanContext.builder() + .startingStrategy(StreamingStartingStrategy.INCREMENTAL_FROM_SNAPSHOT_ID) + .startSnapshotId(invalidSnapshotId) + .build(); - ContinuousSplitPlannerImpl splitPlanner = new ContinuousSplitPlannerImpl( - 
tableResource.table(), scanContextWithInvalidSnapshotId, null); + ContinuousSplitPlannerImpl splitPlanner = + new ContinuousSplitPlannerImpl( + tableResource.table(), scanContextWithInvalidSnapshotId, null); - AssertHelpers.assertThrows("Should detect invalid starting snapshot id", + AssertHelpers.assertThrows( + "Should detect invalid starting snapshot id", IllegalArgumentException.class, "Start snapshot id not found in history: " + invalidSnapshotId, () -> splitPlanner.planSplits(null)); @@ -351,30 +378,38 @@ public void testIncrementalFromSnapshotIdWithInvalidIds() throws Exception { public void testIncrementalFromSnapshotId() throws Exception { appendTwoSnapshots(); - ScanContext scanContext = ScanContext.builder() - .startingStrategy(StreamingStartingStrategy.INCREMENTAL_FROM_SNAPSHOT_ID) - .startSnapshotId(snapshot2.snapshotId()) - .build(); - ContinuousSplitPlannerImpl splitPlanner = new ContinuousSplitPlannerImpl( - tableResource.table(), scanContext, null); + ScanContext scanContext = + ScanContext.builder() + .startingStrategy(StreamingStartingStrategy.INCREMENTAL_FROM_SNAPSHOT_ID) + .startSnapshotId(snapshot2.snapshotId()) + .build(); + ContinuousSplitPlannerImpl splitPlanner = + new ContinuousSplitPlannerImpl(tableResource.table(), scanContext, null); ContinuousEnumerationResult initialResult = splitPlanner.planSplits(null); Assert.assertNull(initialResult.fromPosition()); - // For inclusive behavior of snapshot2, the initial result should point to snapshot1 (as snapshot2's parent) - Assert.assertEquals(snapshot1.snapshotId(), initialResult.toPosition().snapshotId().longValue()); - Assert.assertEquals(snapshot1.timestampMillis(), initialResult.toPosition().snapshotTimestampMs().longValue()); + // For inclusive behavior of snapshot2, the initial result should point to snapshot1 (as + // snapshot2's parent) + Assert.assertEquals( + snapshot1.snapshotId(), initialResult.toPosition().snapshotId().longValue()); + Assert.assertEquals( + snapshot1.timestampMillis(), initialResult.toPosition().snapshotTimestampMs().longValue()); Assert.assertEquals(0, initialResult.splits().size()); ContinuousEnumerationResult secondResult = splitPlanner.planSplits(initialResult.toPosition()); - Assert.assertEquals(snapshot1.snapshotId(), secondResult.fromPosition().snapshotId().longValue()); - Assert.assertEquals(snapshot1.timestampMillis(), secondResult.fromPosition().snapshotTimestampMs().longValue()); + Assert.assertEquals( + snapshot1.snapshotId(), secondResult.fromPosition().snapshotId().longValue()); + Assert.assertEquals( + snapshot1.timestampMillis(), secondResult.fromPosition().snapshotTimestampMs().longValue()); Assert.assertEquals(snapshot2.snapshotId(), secondResult.toPosition().snapshotId().longValue()); - Assert.assertEquals(snapshot2.timestampMillis(), secondResult.toPosition().snapshotTimestampMs().longValue()); + Assert.assertEquals( + snapshot2.timestampMillis(), secondResult.toPosition().snapshotTimestampMs().longValue()); IcebergSourceSplit split = Iterables.getOnlyElement(secondResult.splits()); Assert.assertEquals(1, split.task().files().size()); - Set discoveredFiles = split.task().files().stream() - .map(fileScanTask -> fileScanTask.file().path().toString()) - .collect(Collectors.toSet()); + Set discoveredFiles = + split.task().files().stream() + .map(fileScanTask -> fileScanTask.file().path().toString()) + .collect(Collectors.toSet()); // should discover dataFile2 appended in snapshot2 Set expectedFiles = ImmutableSet.of(dataFile2.path().toString()); 
Assert.assertEquals(expectedFiles, discoveredFiles); @@ -387,14 +422,17 @@ public void testIncrementalFromSnapshotId() throws Exception { @Test public void testIncrementalFromSnapshotTimestampWithEmptyTable() throws Exception { - ScanContext scanContextWithInvalidSnapshotId = ScanContext.builder() - .startingStrategy(StreamingStartingStrategy.INCREMENTAL_FROM_SNAPSHOT_TIMESTAMP) - .startSnapshotTimestamp(1L) - .build(); - ContinuousSplitPlannerImpl splitPlanner = new ContinuousSplitPlannerImpl( - tableResource.table(), scanContextWithInvalidSnapshotId, null); - - AssertHelpers.assertThrows("Should detect invalid starting snapshot timestamp", + ScanContext scanContextWithInvalidSnapshotId = + ScanContext.builder() + .startingStrategy(StreamingStartingStrategy.INCREMENTAL_FROM_SNAPSHOT_TIMESTAMP) + .startSnapshotTimestamp(1L) + .build(); + ContinuousSplitPlannerImpl splitPlanner = + new ContinuousSplitPlannerImpl( + tableResource.table(), scanContextWithInvalidSnapshotId, null); + + AssertHelpers.assertThrows( + "Should detect invalid starting snapshot timestamp", IllegalArgumentException.class, "Cannot find a snapshot older than 1970-01-01T00:00:00.001+00:00", () -> splitPlanner.planSplits(null)); @@ -405,17 +443,21 @@ public void testIncrementalFromSnapshotTimestampWithInvalidIds() throws Exceptio appendTwoSnapshots(); long invalidSnapshotTimestampMs = snapshot1.timestampMillis() - 1000L; - String invalidSnapshotTimestampMsStr = DateTimeUtil.formatTimestampMillis(invalidSnapshotTimestampMs); + String invalidSnapshotTimestampMsStr = + DateTimeUtil.formatTimestampMillis(invalidSnapshotTimestampMs); - ScanContext scanContextWithInvalidSnapshotId = ScanContext.builder() - .startingStrategy(StreamingStartingStrategy.INCREMENTAL_FROM_SNAPSHOT_TIMESTAMP) - .startSnapshotTimestamp(invalidSnapshotTimestampMs) - .build(); + ScanContext scanContextWithInvalidSnapshotId = + ScanContext.builder() + .startingStrategy(StreamingStartingStrategy.INCREMENTAL_FROM_SNAPSHOT_TIMESTAMP) + .startSnapshotTimestamp(invalidSnapshotTimestampMs) + .build(); - ContinuousSplitPlannerImpl splitPlanner = new ContinuousSplitPlannerImpl( - tableResource.table(), scanContextWithInvalidSnapshotId, null); + ContinuousSplitPlannerImpl splitPlanner = + new ContinuousSplitPlannerImpl( + tableResource.table(), scanContextWithInvalidSnapshotId, null); - AssertHelpers.assertThrows("Should detect invalid starting snapshot timestamp", + AssertHelpers.assertThrows( + "Should detect invalid starting snapshot timestamp", IllegalArgumentException.class, "Cannot find a snapshot older than " + invalidSnapshotTimestampMsStr, () -> splitPlanner.planSplits(null)); @@ -425,30 +467,37 @@ public void testIncrementalFromSnapshotTimestampWithInvalidIds() throws Exceptio public void testIncrementalFromSnapshotTimestamp() throws Exception { appendTwoSnapshots(); - ScanContext scanContext = ScanContext.builder() - .startingStrategy(StreamingStartingStrategy.INCREMENTAL_FROM_SNAPSHOT_TIMESTAMP) - .startSnapshotTimestamp(snapshot2.timestampMillis()) - .build(); - ContinuousSplitPlannerImpl splitPlanner = new ContinuousSplitPlannerImpl( - tableResource.table(), scanContext, null); + ScanContext scanContext = + ScanContext.builder() + .startingStrategy(StreamingStartingStrategy.INCREMENTAL_FROM_SNAPSHOT_TIMESTAMP) + .startSnapshotTimestamp(snapshot2.timestampMillis()) + .build(); + ContinuousSplitPlannerImpl splitPlanner = + new ContinuousSplitPlannerImpl(tableResource.table(), scanContext, null); ContinuousEnumerationResult initialResult = 
splitPlanner.planSplits(null); Assert.assertNull(initialResult.fromPosition()); // For inclusive behavior, the initial result should point to snapshot1 (as snapshot2's parent). - Assert.assertEquals(snapshot1.snapshotId(), initialResult.toPosition().snapshotId().longValue()); - Assert.assertEquals(snapshot1.timestampMillis(), initialResult.toPosition().snapshotTimestampMs().longValue()); + Assert.assertEquals( + snapshot1.snapshotId(), initialResult.toPosition().snapshotId().longValue()); + Assert.assertEquals( + snapshot1.timestampMillis(), initialResult.toPosition().snapshotTimestampMs().longValue()); Assert.assertEquals(0, initialResult.splits().size()); ContinuousEnumerationResult secondResult = splitPlanner.planSplits(initialResult.toPosition()); - Assert.assertEquals(snapshot1.snapshotId(), secondResult.fromPosition().snapshotId().longValue()); - Assert.assertEquals(snapshot1.timestampMillis(), secondResult.fromPosition().snapshotTimestampMs().longValue()); + Assert.assertEquals( + snapshot1.snapshotId(), secondResult.fromPosition().snapshotId().longValue()); + Assert.assertEquals( + snapshot1.timestampMillis(), secondResult.fromPosition().snapshotTimestampMs().longValue()); Assert.assertEquals(snapshot2.snapshotId(), secondResult.toPosition().snapshotId().longValue()); - Assert.assertEquals(snapshot2.timestampMillis(), secondResult.toPosition().snapshotTimestampMs().longValue()); + Assert.assertEquals( + snapshot2.timestampMillis(), secondResult.toPosition().snapshotTimestampMs().longValue()); IcebergSourceSplit split = Iterables.getOnlyElement(secondResult.splits()); Assert.assertEquals(1, split.task().files().size()); - Set discoveredFiles = split.task().files().stream() - .map(fileScanTask -> fileScanTask.file().path().toString()) - .collect(Collectors.toSet()); + Set discoveredFiles = + split.task().files().stream() + .map(fileScanTask -> fileScanTask.file().path().toString()) + .collect(Collectors.toSet()); // should discover dataFile2 appended in snapshot2 Set expectedFiles = ImmutableSet.of(dataFile2.path().toString()); Assert.assertEquals(expectedFiles, discoveredFiles); diff --git a/flink/v1.14/flink/src/test/java/org/apache/iceberg/flink/source/enumerator/TestContinuousSplitPlannerImplStartStrategy.java b/flink/v1.14/flink/src/test/java/org/apache/iceberg/flink/source/enumerator/TestContinuousSplitPlannerImplStartStrategy.java index bc9cd4934a96..f3e9413d8819 100644 --- a/flink/v1.14/flink/src/test/java/org/apache/iceberg/flink/source/enumerator/TestContinuousSplitPlannerImplStartStrategy.java +++ b/flink/v1.14/flink/src/test/java/org/apache/iceberg/flink/source/enumerator/TestContinuousSplitPlannerImplStartStrategy.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.flink.source.enumerator; import java.io.IOException; @@ -43,12 +42,10 @@ public class TestContinuousSplitPlannerImplStartStrategy { private static final FileFormat FILE_FORMAT = FileFormat.PARQUET; public final TemporaryFolder temporaryFolder = new TemporaryFolder(); - public final HadoopTableResource tableResource = new HadoopTableResource(temporaryFolder, - TestFixtures.DATABASE, TestFixtures.TABLE, TestFixtures.SCHEMA); - @Rule - public final TestRule chain = RuleChain - .outerRule(temporaryFolder) - .around(tableResource); + public final HadoopTableResource tableResource = + new HadoopTableResource( + temporaryFolder, TestFixtures.DATABASE, TestFixtures.TABLE, TestFixtures.SCHEMA); + @Rule public final TestRule chain = RuleChain.outerRule(temporaryFolder).around(tableResource); private GenericAppenderHelper dataAppender; private Snapshot snapshot1; @@ -75,99 +72,120 @@ private void appendThreeSnapshots() throws IOException { } @Test - public void testTableScanThenIncrementalStrategy() throws IOException { - ScanContext scanContext = ScanContext.builder() - .streaming(true) - .startingStrategy(StreamingStartingStrategy.TABLE_SCAN_THEN_INCREMENTAL) - .build(); + public void testTableScanThenIncrementalStrategy() throws IOException { + ScanContext scanContext = + ScanContext.builder() + .streaming(true) + .startingStrategy(StreamingStartingStrategy.TABLE_SCAN_THEN_INCREMENTAL) + .build(); // emtpy table - Assert.assertFalse(ContinuousSplitPlannerImpl.startSnapshot(tableResource.table(), scanContext).isPresent()); + Assert.assertFalse( + ContinuousSplitPlannerImpl.startSnapshot(tableResource.table(), scanContext).isPresent()); appendThreeSnapshots(); - Snapshot startSnapshot = ContinuousSplitPlannerImpl.startSnapshot(tableResource.table(), scanContext).get(); + Snapshot startSnapshot = + ContinuousSplitPlannerImpl.startSnapshot(tableResource.table(), scanContext).get(); Assert.assertEquals(snapshot3.snapshotId(), startSnapshot.snapshotId()); } @Test public void testForLatestSnapshotStrategy() throws IOException { - ScanContext scanContext = ScanContext.builder() - .streaming(true) - .startingStrategy(StreamingStartingStrategy.INCREMENTAL_FROM_LATEST_SNAPSHOT) - .build(); + ScanContext scanContext = + ScanContext.builder() + .streaming(true) + .startingStrategy(StreamingStartingStrategy.INCREMENTAL_FROM_LATEST_SNAPSHOT) + .build(); // emtpy table - Assert.assertFalse(ContinuousSplitPlannerImpl.startSnapshot(tableResource.table(), scanContext).isPresent()); + Assert.assertFalse( + ContinuousSplitPlannerImpl.startSnapshot(tableResource.table(), scanContext).isPresent()); appendThreeSnapshots(); - Snapshot startSnapshot = ContinuousSplitPlannerImpl.startSnapshot(tableResource.table(), scanContext).get(); + Snapshot startSnapshot = + ContinuousSplitPlannerImpl.startSnapshot(tableResource.table(), scanContext).get(); Assert.assertEquals(snapshot3.snapshotId(), startSnapshot.snapshotId()); } @Test public void testForEarliestSnapshotStrategy() throws IOException { - ScanContext scanContext = ScanContext.builder() - .streaming(true) - .startingStrategy(StreamingStartingStrategy.INCREMENTAL_FROM_EARLIEST_SNAPSHOT) - .build(); + ScanContext scanContext = + ScanContext.builder() + .streaming(true) + .startingStrategy(StreamingStartingStrategy.INCREMENTAL_FROM_EARLIEST_SNAPSHOT) + .build(); // emtpy table - Assert.assertFalse(ContinuousSplitPlannerImpl.startSnapshot(tableResource.table(), scanContext).isPresent()); + Assert.assertFalse( + 
ContinuousSplitPlannerImpl.startSnapshot(tableResource.table(), scanContext).isPresent()); appendThreeSnapshots(); - Snapshot startSnapshot = ContinuousSplitPlannerImpl.startSnapshot(tableResource.table(), scanContext).get(); + Snapshot startSnapshot = + ContinuousSplitPlannerImpl.startSnapshot(tableResource.table(), scanContext).get(); Assert.assertEquals(snapshot1.snapshotId(), startSnapshot.snapshotId()); } @Test public void testForSpecificSnapshotIdStrategy() throws IOException { - ScanContext scanContextInvalidSnapshotId = ScanContext.builder() - .streaming(true) - .startingStrategy(StreamingStartingStrategy.INCREMENTAL_FROM_SNAPSHOT_ID) - .startSnapshotId(1L) - .build(); + ScanContext scanContextInvalidSnapshotId = + ScanContext.builder() + .streaming(true) + .startingStrategy(StreamingStartingStrategy.INCREMENTAL_FROM_SNAPSHOT_ID) + .startSnapshotId(1L) + .build(); // emtpy table - AssertHelpers.assertThrows("Should detect invalid starting snapshot id", + AssertHelpers.assertThrows( + "Should detect invalid starting snapshot id", IllegalArgumentException.class, "Start snapshot id not found in history: 1", - () -> ContinuousSplitPlannerImpl.startSnapshot(tableResource.table(), scanContextInvalidSnapshotId)); + () -> + ContinuousSplitPlannerImpl.startSnapshot( + tableResource.table(), scanContextInvalidSnapshotId)); appendThreeSnapshots(); - ScanContext scanContext = ScanContext.builder() - .streaming(true) - .startingStrategy(StreamingStartingStrategy.INCREMENTAL_FROM_SNAPSHOT_ID) - .startSnapshotId(snapshot2.snapshotId()) - .build(); + ScanContext scanContext = + ScanContext.builder() + .streaming(true) + .startingStrategy(StreamingStartingStrategy.INCREMENTAL_FROM_SNAPSHOT_ID) + .startSnapshotId(snapshot2.snapshotId()) + .build(); - Snapshot startSnapshot = ContinuousSplitPlannerImpl.startSnapshot(tableResource.table(), scanContext).get(); + Snapshot startSnapshot = + ContinuousSplitPlannerImpl.startSnapshot(tableResource.table(), scanContext).get(); Assert.assertEquals(snapshot2.snapshotId(), startSnapshot.snapshotId()); } @Test public void testForSpecificSnapshotTimestampStrategySnapshot2() throws IOException { - ScanContext scanContextInvalidSnapshotTimestamp = ScanContext.builder() - .streaming(true) - .startingStrategy(StreamingStartingStrategy.INCREMENTAL_FROM_SNAPSHOT_TIMESTAMP) - .startSnapshotTimestamp(1L) - .build(); + ScanContext scanContextInvalidSnapshotTimestamp = + ScanContext.builder() + .streaming(true) + .startingStrategy(StreamingStartingStrategy.INCREMENTAL_FROM_SNAPSHOT_TIMESTAMP) + .startSnapshotTimestamp(1L) + .build(); // emtpy table - AssertHelpers.assertThrows("Should detect invalid starting snapshot timestamp", + AssertHelpers.assertThrows( + "Should detect invalid starting snapshot timestamp", IllegalArgumentException.class, "Cannot find a snapshot older than 1970-01-01T00:00:00.001+00:00", - () -> ContinuousSplitPlannerImpl.startSnapshot(tableResource.table(), scanContextInvalidSnapshotTimestamp)); + () -> + ContinuousSplitPlannerImpl.startSnapshot( + tableResource.table(), scanContextInvalidSnapshotTimestamp)); appendThreeSnapshots(); - ScanContext scanContext = ScanContext.builder() - .streaming(true) - .startingStrategy(StreamingStartingStrategy.INCREMENTAL_FROM_SNAPSHOT_TIMESTAMP) - .startSnapshotTimestamp(snapshot2.timestampMillis()) - .build(); + ScanContext scanContext = + ScanContext.builder() + .streaming(true) + .startingStrategy(StreamingStartingStrategy.INCREMENTAL_FROM_SNAPSHOT_TIMESTAMP) + 
.startSnapshotTimestamp(snapshot2.timestampMillis()) + .build(); - Snapshot startSnapshot = ContinuousSplitPlannerImpl.startSnapshot(tableResource.table(), scanContext).get(); + Snapshot startSnapshot = + ContinuousSplitPlannerImpl.startSnapshot(tableResource.table(), scanContext).get(); Assert.assertEquals(snapshot2.snapshotId(), startSnapshot.snapshotId()); } @@ -175,13 +193,15 @@ public void testForSpecificSnapshotTimestampStrategySnapshot2() throws IOExcepti public void testForSpecificSnapshotTimestampStrategySnapshot2Minus1() throws IOException { appendThreeSnapshots(); - ScanContext config = ScanContext.builder() - .streaming(true) - .startingStrategy(StreamingStartingStrategy.INCREMENTAL_FROM_SNAPSHOT_TIMESTAMP) - .startSnapshotTimestamp(snapshot2.timestampMillis() - 1L) - .build(); + ScanContext config = + ScanContext.builder() + .streaming(true) + .startingStrategy(StreamingStartingStrategy.INCREMENTAL_FROM_SNAPSHOT_TIMESTAMP) + .startSnapshotTimestamp(snapshot2.timestampMillis() - 1L) + .build(); - Snapshot startSnapshot = ContinuousSplitPlannerImpl.startSnapshot(tableResource.table(), config).get(); + Snapshot startSnapshot = + ContinuousSplitPlannerImpl.startSnapshot(tableResource.table(), config).get(); Assert.assertEquals(snapshot2.snapshotId(), startSnapshot.snapshotId()); } } diff --git a/flink/v1.14/flink/src/test/java/org/apache/iceberg/flink/source/enumerator/TestIcebergEnumeratorStateSerializer.java b/flink/v1.14/flink/src/test/java/org/apache/iceberg/flink/source/enumerator/TestIcebergEnumeratorStateSerializer.java index 05abe7bc1792..33ff58c52f4a 100644 --- a/flink/v1.14/flink/src/test/java/org/apache/iceberg/flink/source/enumerator/TestIcebergEnumeratorStateSerializer.java +++ b/flink/v1.14/flink/src/test/java/org/apache/iceberg/flink/source/enumerator/TestIcebergEnumeratorStateSerializer.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.flink.source.enumerator; import java.util.Collection; @@ -34,10 +33,10 @@ import org.junit.rules.TemporaryFolder; public class TestIcebergEnumeratorStateSerializer { - @ClassRule - public static final TemporaryFolder TEMPORARY_FOLDER = new TemporaryFolder(); + @ClassRule public static final TemporaryFolder TEMPORARY_FOLDER = new TemporaryFolder(); - private final IcebergEnumeratorStateSerializer serializer = IcebergEnumeratorStateSerializer.INSTANCE; + private final IcebergEnumeratorStateSerializer serializer = + IcebergEnumeratorStateSerializer.INSTANCE; @Test public void testEmptySnapshotIdAndPendingSplits() throws Exception { @@ -49,8 +48,10 @@ public void testEmptySnapshotIdAndPendingSplits() throws Exception { @Test public void testSomeSnapshotIdAndEmptyPendingSplits() throws Exception { - IcebergEnumeratorPosition position = IcebergEnumeratorPosition.of(1L, System.currentTimeMillis()); - IcebergEnumeratorState enumeratorState = new IcebergEnumeratorState(position, Collections.emptyList()); + IcebergEnumeratorPosition position = + IcebergEnumeratorPosition.of(1L, System.currentTimeMillis()); + IcebergEnumeratorState enumeratorState = + new IcebergEnumeratorState(position, Collections.emptyList()); byte[] result = serializer.serialize(enumeratorState); IcebergEnumeratorState deserialized = serializer.deserialize(serializer.getVersion(), result); assertEnumeratorStateEquals(enumeratorState, deserialized); @@ -58,13 +59,17 @@ public void testSomeSnapshotIdAndEmptyPendingSplits() throws Exception { @Test public void testSomeSnapshotIdAndPendingSplits() throws Exception { - IcebergEnumeratorPosition position = IcebergEnumeratorPosition.of(2L, System.currentTimeMillis()); - List splits = SplitHelpers - .createSplitsFromTransientHadoopTable(TEMPORARY_FOLDER, 3, 1); + IcebergEnumeratorPosition position = + IcebergEnumeratorPosition.of(2L, System.currentTimeMillis()); + List splits = + SplitHelpers.createSplitsFromTransientHadoopTable(TEMPORARY_FOLDER, 3, 1); Collection pendingSplits = Lists.newArrayList(); - pendingSplits.add(new IcebergSourceSplitState(splits.get(0), IcebergSourceSplitStatus.UNASSIGNED)); - pendingSplits.add(new IcebergSourceSplitState(splits.get(1), IcebergSourceSplitStatus.ASSIGNED)); - pendingSplits.add(new IcebergSourceSplitState(splits.get(2), IcebergSourceSplitStatus.COMPLETED)); + pendingSplits.add( + new IcebergSourceSplitState(splits.get(0), IcebergSourceSplitStatus.UNASSIGNED)); + pendingSplits.add( + new IcebergSourceSplitState(splits.get(1), IcebergSourceSplitStatus.ASSIGNED)); + pendingSplits.add( + new IcebergSourceSplitState(splits.get(2), IcebergSourceSplitStatus.COMPLETED)); IcebergEnumeratorState enumeratorState = new IcebergEnumeratorState(position, pendingSplits); byte[] result = serializer.serialize(enumeratorState); @@ -72,7 +77,8 @@ public void testSomeSnapshotIdAndPendingSplits() throws Exception { assertEnumeratorStateEquals(enumeratorState, deserialized); } - private void assertEnumeratorStateEquals(IcebergEnumeratorState expected, IcebergEnumeratorState actual) { + private void assertEnumeratorStateEquals( + IcebergEnumeratorState expected, IcebergEnumeratorState actual) { Assert.assertEquals(expected.lastEnumeratedPosition(), actual.lastEnumeratedPosition()); Assert.assertEquals(expected.pendingSplits().size(), actual.pendingSplits().size()); Iterator expectedIterator = expected.pendingSplits().iterator(); @@ -81,8 +87,10 @@ private void assertEnumeratorStateEquals(IcebergEnumeratorState expected, Iceber 
IcebergSourceSplitState expectedSplitState = expectedIterator.next(); IcebergSourceSplitState actualSplitState = actualIterator.next(); Assert.assertEquals(expectedSplitState.split().splitId(), actualSplitState.split().splitId()); - Assert.assertEquals(expectedSplitState.split().fileOffset(), actualSplitState.split().fileOffset()); - Assert.assertEquals(expectedSplitState.split().recordOffset(), actualSplitState.split().recordOffset()); + Assert.assertEquals( + expectedSplitState.split().fileOffset(), actualSplitState.split().fileOffset()); + Assert.assertEquals( + expectedSplitState.split().recordOffset(), actualSplitState.split().recordOffset()); Assert.assertEquals(expectedSplitState.status(), actualSplitState.status()); } } diff --git a/flink/v1.14/flink/src/test/java/org/apache/iceberg/flink/source/reader/ReaderFunctionTestBase.java b/flink/v1.14/flink/src/test/java/org/apache/iceberg/flink/source/reader/ReaderFunctionTestBase.java index e186a2b2a14a..720b0c25d12a 100644 --- a/flink/v1.14/flink/src/test/java/org/apache/iceberg/flink/source/reader/ReaderFunctionTestBase.java +++ b/flink/v1.14/flink/src/test/java/org/apache/iceberg/flink/source/reader/ReaderFunctionTestBase.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.flink.source.reader; import java.io.IOException; @@ -46,15 +45,14 @@ public abstract class ReaderFunctionTestBase { @Parameterized.Parameters(name = "fileFormat={0}") public static Object[][] parameters() { - return new Object[][]{ - new Object[]{FileFormat.AVRO}, - new Object[]{FileFormat.ORC}, - new Object[]{FileFormat.PARQUET} + return new Object[][] { + new Object[] {FileFormat.AVRO}, + new Object[] {FileFormat.ORC}, + new Object[] {FileFormat.PARQUET} }; } - @ClassRule - public static final TemporaryFolder TEMPORARY_FOLDER = new TemporaryFolder(); + @ClassRule public static final TemporaryFolder TEMPORARY_FOLDER = new TemporaryFolder(); protected abstract ReaderFunction readerFunction(); @@ -78,11 +76,13 @@ private List> createRecordBatchList(int batchCount) { return recordBatchList; } - private CombinedScanTask createCombinedScanTask(List> recordBatchList) throws IOException { + private CombinedScanTask createCombinedScanTask(List> recordBatchList) + throws IOException { List fileTasks = Lists.newArrayListWithCapacity(recordBatchList.size()); for (int i = 0; i < recordBatchList.size(); ++i) { - FileScanTask fileTask = ReaderUtil.createFileTask( - recordBatchList.get(i), TEMPORARY_FOLDER.newFile(), fileFormat, appenderFactory); + FileScanTask fileTask = + ReaderUtil.createFileTask( + recordBatchList.get(i), TEMPORARY_FOLDER.newFile(), fileFormat, appenderFactory); fileTasks.add(fileTask); } @@ -90,7 +90,9 @@ private CombinedScanTask createCombinedScanTask(List> recordBatchLi } private void assertRecordsAndPosition( - List expectedRecords, int expectedFileOffset, long startRecordOffset, + List expectedRecords, + int expectedFileOffset, + long startRecordOffset, RecordsWithSplitIds> batch) { batch.nextSplit(); List actualRecords = Lists.newArrayList(); @@ -98,8 +100,10 @@ private void assertRecordsAndPosition( RecordAndPosition recordAndPosition; while ((recordAndPosition = batch.nextRecordFromSplit()) != null) { actualRecords.add(recordAndPosition.record()); - Assert.assertEquals("expected file offset", expectedFileOffset, recordAndPosition.fileOffset()); - Assert.assertEquals("expected record offset", recordOffset, recordAndPosition.recordOffset() - 1); + Assert.assertEquals( + 
"expected file offset", expectedFileOffset, recordAndPosition.fileOffset()); + Assert.assertEquals( + "expected record offset", recordOffset, recordAndPosition.recordOffset() - 1); recordOffset++; } @@ -112,7 +116,8 @@ public void testNoCheckpointedPosition() throws IOException { List> recordBatchList = createRecordBatchList(3); CombinedScanTask combinedScanTask = createCombinedScanTask(recordBatchList); IcebergSourceSplit split = IcebergSourceSplit.fromCombinedScanTask(combinedScanTask); - CloseableIterator>> reader = readerFunction().apply(split); + CloseableIterator>> reader = + readerFunction().apply(split); RecordsWithSplitIds> batch0 = reader.next(); assertRecordsAndPosition(recordBatchList.get(0), 0, 0L, batch0); @@ -132,7 +137,8 @@ public void testCheckpointedPositionBeforeFirstFile() throws IOException { List> recordBatchList = createRecordBatchList(3); CombinedScanTask combinedScanTask = createCombinedScanTask(recordBatchList); IcebergSourceSplit split = IcebergSourceSplit.fromCombinedScanTask(combinedScanTask, 0, 0L); - CloseableIterator>> reader = readerFunction().apply(split); + CloseableIterator>> reader = + readerFunction().apply(split); RecordsWithSplitIds> batch0 = reader.next(); assertRecordsAndPosition(recordBatchList.get(0), 0, 0L, batch0); @@ -152,7 +158,8 @@ public void testCheckpointedPositionMiddleFirstFile() throws IOException { List> recordBatchList = createRecordBatchList(3); CombinedScanTask combinedScanTask = createCombinedScanTask(recordBatchList); IcebergSourceSplit split = IcebergSourceSplit.fromCombinedScanTask(combinedScanTask, 0, 1L); - CloseableIterator>> reader = readerFunction().apply(split); + CloseableIterator>> reader = + readerFunction().apply(split); RecordsWithSplitIds> batch0 = reader.next(); assertRecordsAndPosition(recordBatchList.get(0).subList(1, 2), 0, 1L, batch0); @@ -172,7 +179,8 @@ public void testCheckpointedPositionAfterFirstFile() throws IOException { List> recordBatchList = createRecordBatchList(3); CombinedScanTask combinedScanTask = createCombinedScanTask(recordBatchList); IcebergSourceSplit split = IcebergSourceSplit.fromCombinedScanTask(combinedScanTask, 0, 2L); - CloseableIterator>> reader = readerFunction().apply(split); + CloseableIterator>> reader = + readerFunction().apply(split); RecordsWithSplitIds> batch1 = reader.next(); assertRecordsAndPosition(recordBatchList.get(1), 1, 0L, batch1); @@ -188,7 +196,8 @@ public void testCheckpointedPositionBeforeSecondFile() throws IOException { List> recordBatchList = createRecordBatchList(3); CombinedScanTask combinedScanTask = createCombinedScanTask(recordBatchList); IcebergSourceSplit split = IcebergSourceSplit.fromCombinedScanTask(combinedScanTask, 1, 0L); - CloseableIterator>> reader = readerFunction().apply(split); + CloseableIterator>> reader = + readerFunction().apply(split); RecordsWithSplitIds> batch1 = reader.next(); assertRecordsAndPosition(recordBatchList.get(1), 1, 0L, batch1); @@ -204,7 +213,8 @@ public void testCheckpointedPositionMidSecondFile() throws IOException { List> recordBatchList = createRecordBatchList(3); CombinedScanTask combinedScanTask = createCombinedScanTask(recordBatchList); IcebergSourceSplit split = IcebergSourceSplit.fromCombinedScanTask(combinedScanTask, 1, 1L); - CloseableIterator>> reader = readerFunction().apply(split); + CloseableIterator>> reader = + readerFunction().apply(split); RecordsWithSplitIds> batch1 = reader.next(); assertRecordsAndPosition(recordBatchList.get(1).subList(1, 2), 1, 1L, batch1); @@ -214,5 +224,4 @@ public void 
testCheckpointedPositionMidSecondFile() throws IOException { assertRecordsAndPosition(recordBatchList.get(2), 2, 0L, batch2); batch2.recycle(); } - } diff --git a/flink/v1.14/flink/src/test/java/org/apache/iceberg/flink/source/reader/ReaderUtil.java b/flink/v1.14/flink/src/test/java/org/apache/iceberg/flink/source/reader/ReaderUtil.java index 2431ee0fce61..7d1d41173331 100644 --- a/flink/v1.14/flink/src/test/java/org/apache/iceberg/flink/source/reader/ReaderUtil.java +++ b/flink/v1.14/flink/src/test/java/org/apache/iceberg/flink/source/reader/ReaderUtil.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.flink.source.reader; import java.io.File; @@ -46,30 +45,41 @@ public class ReaderUtil { - private ReaderUtil() { - } + private ReaderUtil() {} - public static FileScanTask createFileTask(List records, File file, FileFormat fileFormat, - FileAppenderFactory appenderFactory) throws IOException { - try (FileAppender appender = appenderFactory.newAppender(Files.localOutput(file), fileFormat)) { + public static FileScanTask createFileTask( + List records, + File file, + FileFormat fileFormat, + FileAppenderFactory appenderFactory) + throws IOException { + try (FileAppender appender = + appenderFactory.newAppender(Files.localOutput(file), fileFormat)) { appender.addAll(records); } - DataFile dataFile = DataFiles.builder(PartitionSpec.unpartitioned()) - .withRecordCount(records.size()) - .withFileSizeInBytes(file.length()) - .withPath(file.toString()) - .withFormat(fileFormat) - .build(); + DataFile dataFile = + DataFiles.builder(PartitionSpec.unpartitioned()) + .withRecordCount(records.size()) + .withFileSizeInBytes(file.length()) + .withPath(file.toString()) + .withFormat(fileFormat) + .build(); ResidualEvaluator residuals = ResidualEvaluator.unpartitioned(Expressions.alwaysTrue()); - return new BaseFileScanTask(dataFile, null, SchemaParser.toJson(TestFixtures.SCHEMA), - PartitionSpecParser.toJson(PartitionSpec.unpartitioned()), residuals); + return new BaseFileScanTask( + dataFile, + null, + SchemaParser.toJson(TestFixtures.SCHEMA), + PartitionSpecParser.toJson(PartitionSpec.unpartitioned()), + residuals); } public static DataIterator createDataIterator(CombinedScanTask combinedTask) { return new DataIterator<>( new RowDataFileScanTaskReader(TestFixtures.SCHEMA, TestFixtures.SCHEMA, null, true), - combinedTask, new HadoopFileIO(new org.apache.hadoop.conf.Configuration()), new PlaintextEncryptionManager()); + combinedTask, + new HadoopFileIO(new org.apache.hadoop.conf.Configuration()), + new PlaintextEncryptionManager()); } } diff --git a/flink/v1.14/flink/src/test/java/org/apache/iceberg/flink/source/reader/TestArrayBatchRecords.java b/flink/v1.14/flink/src/test/java/org/apache/iceberg/flink/source/reader/TestArrayBatchRecords.java index 4e46f339a192..644ac2bad6b8 100644 --- a/flink/v1.14/flink/src/test/java/org/apache/iceberg/flink/source/reader/TestArrayBatchRecords.java +++ b/flink/v1.14/flink/src/test/java/org/apache/iceberg/flink/source/reader/TestArrayBatchRecords.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.flink.source.reader; import java.util.concurrent.atomic.AtomicBoolean; @@ -27,22 +26,29 @@ public class TestArrayBatchRecords { @Test public void testFullRange() { - String[] elements = new String[]{"0", "1", "2", "3"}; + String[] elements = new String[] {"0", "1", "2", "3"}; testArray(elements, elements.length, 2, 119); } @Test public void testSubRange() { - String[] elements = new String[]{"0", "1", "2", "3"}; + String[] elements = new String[] {"0", "1", "2", "3"}; testArray(elements, 2, 0, 0); } - private void testArray(String[] elements, int numberOfRecords, int fileOffset, long startingRecordOffset) { + private void testArray( + String[] elements, int numberOfRecords, int fileOffset, long startingRecordOffset) { String splitId = "iceberg_split_1"; AtomicBoolean recycled = new AtomicBoolean(); - ArrayBatchRecords recordsWithSplitIds = ArrayBatchRecords.forRecords(splitId, - ignored -> recycled.set(true), elements, numberOfRecords, fileOffset, startingRecordOffset); + ArrayBatchRecords recordsWithSplitIds = + ArrayBatchRecords.forRecords( + splitId, + ignored -> recycled.set(true), + elements, + numberOfRecords, + fileOffset, + startingRecordOffset); Assert.assertEquals(splitId, recordsWithSplitIds.nextSplit()); @@ -59,5 +65,4 @@ private void testArray(String[] elements, int numberOfRecords, int fileOffset, l recordsWithSplitIds.recycle(); Assert.assertTrue(recycled.get()); } - } diff --git a/flink/v1.14/flink/src/test/java/org/apache/iceberg/flink/source/reader/TestArrayPoolDataIteratorBatcherRowData.java b/flink/v1.14/flink/src/test/java/org/apache/iceberg/flink/source/reader/TestArrayPoolDataIteratorBatcherRowData.java index 0c23d511af5c..f964a7707689 100644 --- a/flink/v1.14/flink/src/test/java/org/apache/iceberg/flink/source/reader/TestArrayPoolDataIteratorBatcherRowData.java +++ b/flink/v1.14/flink/src/test/java/org/apache/iceberg/flink/source/reader/TestArrayPoolDataIteratorBatcherRowData.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.flink.source.reader; import java.util.Arrays; @@ -44,8 +43,7 @@ public class TestArrayPoolDataIteratorBatcherRowData { - @ClassRule - public static final TemporaryFolder TEMPORARY_FOLDER = new TemporaryFolder(); + @ClassRule public static final TemporaryFolder TEMPORARY_FOLDER = new TemporaryFolder(); private static final FileFormat fileFormat = FileFormat.PARQUET; private final GenericAppenderFactory appenderFactory; @@ -57,18 +55,17 @@ public TestArrayPoolDataIteratorBatcherRowData() { config.set(SourceReaderOptions.ELEMENT_QUEUE_CAPACITY, 1); // set batch array size to 2 config.set(FlinkConfigOptions.SOURCE_READER_FETCH_BATCH_RECORD_COUNT, 2); - this.batcher = new ArrayPoolDataIteratorBatcher<>(config, new RowDataRecordFactory(TestFixtures.ROW_TYPE)); + this.batcher = + new ArrayPoolDataIteratorBatcher<>(config, new RowDataRecordFactory(TestFixtures.ROW_TYPE)); this.appenderFactory = new GenericAppenderFactory(TestFixtures.SCHEMA); } - /** - * Read a CombinedScanTask that contains a single file with less than a full batch of records - */ + /** Read a CombinedScanTask that contains a single file with less than a full batch of records */ @Test public void testSingleFileLessThanOneFullBatch() throws Exception { List records = RandomGenericData.generate(TestFixtures.SCHEMA, 1, 1); - FileScanTask fileTask = ReaderUtil.createFileTask( - records, TEMPORARY_FOLDER.newFile(), fileFormat, appenderFactory); + FileScanTask fileTask = + ReaderUtil.createFileTask(records, TEMPORARY_FOLDER.newFile(), fileFormat, appenderFactory); CombinedScanTask combinedTask = new BaseCombinedScanTask(fileTask); DataIterator dataIterator = ReaderUtil.createDataIterator(combinedTask); String splitId = "someSplitId"; @@ -104,13 +101,13 @@ public void testSingleFileLessThanOneFullBatch() throws Exception { /** * Read a CombinedScanTask that contains a single file with multiple batches. * - * Insert 5 records in a single file that should result in 3 batches + *

    Insert 5 records in a single file that should result in 3 batches */ @Test public void testSingleFileWithMultipleBatches() throws Exception { List records = RandomGenericData.generate(TestFixtures.SCHEMA, 5, 1); - FileScanTask fileTask = ReaderUtil.createFileTask( - records, TEMPORARY_FOLDER.newFile(), fileFormat, appenderFactory); + FileScanTask fileTask = + ReaderUtil.createFileTask(records, TEMPORARY_FOLDER.newFile(), fileFormat, appenderFactory); CombinedScanTask combinedTask = new BaseCombinedScanTask(fileTask); DataIterator dataIterator = ReaderUtil.createDataIterator(combinedTask); String splitId = "someSplitId"; @@ -210,20 +207,24 @@ public void testSingleFileWithMultipleBatches() throws Exception { /** * Read a CombinedScanTask that contains with multiple files. * - * In this test, we also seek the iterator to starting position (1, 1). + *

    In this test, we also seek the iterator to starting position (1, 1). */ @Test public void testMultipleFilesWithSeekPosition() throws Exception { List records0 = RandomGenericData.generate(TestFixtures.SCHEMA, 1, 1); - FileScanTask fileTask0 = ReaderUtil.createFileTask( - records0, TEMPORARY_FOLDER.newFile(), fileFormat, appenderFactory); + FileScanTask fileTask0 = + ReaderUtil.createFileTask( + records0, TEMPORARY_FOLDER.newFile(), fileFormat, appenderFactory); List records1 = RandomGenericData.generate(TestFixtures.SCHEMA, 4, 2); - FileScanTask fileTask1 = ReaderUtil.createFileTask( - records1, TEMPORARY_FOLDER.newFile(), fileFormat, appenderFactory); + FileScanTask fileTask1 = + ReaderUtil.createFileTask( + records1, TEMPORARY_FOLDER.newFile(), fileFormat, appenderFactory); List records2 = RandomGenericData.generate(TestFixtures.SCHEMA, 3, 3); - FileScanTask fileTask2 = ReaderUtil.createFileTask( - records2, TEMPORARY_FOLDER.newFile(), fileFormat, appenderFactory); - CombinedScanTask combinedTask = new BaseCombinedScanTask(Arrays.asList(fileTask0, fileTask1, fileTask2)); + FileScanTask fileTask2 = + ReaderUtil.createFileTask( + records2, TEMPORARY_FOLDER.newFile(), fileFormat, appenderFactory); + CombinedScanTask combinedTask = + new BaseCombinedScanTask(Arrays.asList(fileTask0, fileTask1, fileTask2)); DataIterator dataIterator = ReaderUtil.createDataIterator(combinedTask); // seek to file1 and after record 1 diff --git a/flink/v1.14/flink/src/test/java/org/apache/iceberg/flink/source/reader/TestRowDataReaderFunction.java b/flink/v1.14/flink/src/test/java/org/apache/iceberg/flink/source/reader/TestRowDataReaderFunction.java index e6ec852969b2..aee271a3a7b8 100644 --- a/flink/v1.14/flink/src/test/java/org/apache/iceberg/flink/source/reader/TestRowDataReaderFunction.java +++ b/flink/v1.14/flink/src/test/java/org/apache/iceberg/flink/source/reader/TestRowDataReaderFunction.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.flink.source.reader; import java.util.List; @@ -39,10 +38,9 @@ public class TestRowDataReaderFunction extends ReaderFunctionTestBase { - protected static final RowType rowType = FlinkSchemaUtil - .convert(TestFixtures.SCHEMA); - private static final DataStructureConverter rowDataConverter = DataStructureConverters.getConverter( - TypeConversions.fromLogicalToDataType(rowType)); + protected static final RowType rowType = FlinkSchemaUtil.convert(TestFixtures.SCHEMA); + private static final DataStructureConverter rowDataConverter = + DataStructureConverters.getConverter(TypeConversions.fromLogicalToDataType(rowType)); public TestRowDataReaderFunction(FileFormat fileFormat) { super(fileFormat); @@ -50,8 +48,14 @@ public TestRowDataReaderFunction(FileFormat fileFormat) { @Override protected ReaderFunction readerFunction() { - return new RowDataReaderFunction(new Configuration(), TestFixtures.SCHEMA, TestFixtures.SCHEMA, null, true, - new HadoopFileIO(new org.apache.hadoop.conf.Configuration()), new PlaintextEncryptionManager()); + return new RowDataReaderFunction( + new Configuration(), + TestFixtures.SCHEMA, + TestFixtures.SCHEMA, + null, + true, + new HadoopFileIO(new org.apache.hadoop.conf.Configuration()), + new PlaintextEncryptionManager()); } @Override diff --git a/flink/v1.14/flink/src/test/java/org/apache/iceberg/flink/source/split/TestIcebergSourceSplitSerializer.java b/flink/v1.14/flink/src/test/java/org/apache/iceberg/flink/source/split/TestIcebergSourceSplitSerializer.java index 36eea1e8a409..046b0c31ce2e 100644 --- a/flink/v1.14/flink/src/test/java/org/apache/iceberg/flink/source/split/TestIcebergSourceSplitSerializer.java +++ b/flink/v1.14/flink/src/test/java/org/apache/iceberg/flink/source/split/TestIcebergSourceSplitSerializer.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.flink.source.split; import java.util.List; @@ -30,8 +29,7 @@ public class TestIcebergSourceSplitSerializer { - @ClassRule - public static final TemporaryFolder TEMPORARY_FOLDER = new TemporaryFolder(); + @ClassRule public static final TemporaryFolder TEMPORARY_FOLDER = new TemporaryFolder(); private final IcebergSourceSplitSerializer serializer = IcebergSourceSplitSerializer.INSTANCE; @@ -42,8 +40,9 @@ public void testLatestVersion() throws Exception { } private void serializeAndDeserialize(int splitCount, int filesPerSplit) throws Exception { - final List splits = SplitHelpers - .createSplitsFromTransientHadoopTable(TEMPORARY_FOLDER, splitCount, filesPerSplit); + final List splits = + SplitHelpers.createSplitsFromTransientHadoopTable( + TEMPORARY_FOLDER, splitCount, filesPerSplit); for (IcebergSourceSplit split : splits) { byte[] result = serializer.serialize(split); IcebergSourceSplit deserialized = serializer.deserialize(serializer.getVersion(), result); @@ -51,14 +50,16 @@ private void serializeAndDeserialize(int splitCount, int filesPerSplit) throws E byte[] cachedResult = serializer.serialize(split); Assert.assertSame(result, cachedResult); - IcebergSourceSplit deserialized2 = serializer.deserialize(serializer.getVersion(), cachedResult); + IcebergSourceSplit deserialized2 = + serializer.deserialize(serializer.getVersion(), cachedResult); assertSplitEquals(split, deserialized2); split.updatePosition(0, 100); byte[] resultAfterUpdatePosition = serializer.serialize(split); // after position change, serialized bytes should have changed Assert.assertNotSame(cachedResult, resultAfterUpdatePosition); - IcebergSourceSplit deserialized3 = serializer.deserialize(serializer.getVersion(), resultAfterUpdatePosition); + IcebergSourceSplit deserialized3 = + serializer.deserialize(serializer.getVersion(), resultAfterUpdatePosition); assertSplitEquals(split, deserialized3); } } @@ -70,8 +71,9 @@ public void testV1() throws Exception { } private void serializeAndDeserializeV1(int splitCount, int filesPerSplit) throws Exception { - final List splits = SplitHelpers - .createSplitsFromTransientHadoopTable(TEMPORARY_FOLDER, splitCount, filesPerSplit); + final List splits = + SplitHelpers.createSplitsFromTransientHadoopTable( + TEMPORARY_FOLDER, splitCount, filesPerSplit); for (IcebergSourceSplit split : splits) { byte[] result = split.serializeV1(); IcebergSourceSplit deserialized = IcebergSourceSplit.deserializeV1(result); @@ -82,19 +84,22 @@ private void serializeAndDeserializeV1(int splitCount, int filesPerSplit) throws @Test public void testCheckpointedPosition() throws Exception { final AtomicInteger index = new AtomicInteger(); - final List splits = SplitHelpers - .createSplitsFromTransientHadoopTable(TEMPORARY_FOLDER, 10, 2).stream() - .map(split -> { - IcebergSourceSplit result; - if (index.get() % 2 == 0) { - result = IcebergSourceSplit.fromCombinedScanTask(split.task(), index.get(), index.get()); - } else { - result = split; - } - index.incrementAndGet(); - return result; - }) - .collect(Collectors.toList()); + final List splits = + SplitHelpers.createSplitsFromTransientHadoopTable(TEMPORARY_FOLDER, 10, 2).stream() + .map( + split -> { + IcebergSourceSplit result; + if (index.get() % 2 == 0) { + result = + IcebergSourceSplit.fromCombinedScanTask( + split.task(), index.get(), index.get()); + } else { + result = split; + } + index.incrementAndGet(); + return result; + }) + .collect(Collectors.toList()); for (IcebergSourceSplit split : splits) { byte[] result = 
serializer.serialize(split); @@ -103,7 +108,8 @@ public void testCheckpointedPosition() throws Exception { byte[] cachedResult = serializer.serialize(split); Assert.assertSame(result, cachedResult); - IcebergSourceSplit deserialized2 = serializer.deserialize(serializer.getVersion(), cachedResult); + IcebergSourceSplit deserialized2 = + serializer.deserialize(serializer.getVersion(), cachedResult); assertSplitEquals(split, deserialized2); } } diff --git a/flink/v1.15/flink/src/main/java/org/apache/iceberg/flink/CatalogLoader.java b/flink/v1.15/flink/src/main/java/org/apache/iceberg/flink/CatalogLoader.java index 1d53586a2db5..7c098cf20d03 100644 --- a/flink/v1.15/flink/src/main/java/org/apache/iceberg/flink/CatalogLoader.java +++ b/flink/v1.15/flink/src/main/java/org/apache/iceberg/flink/CatalogLoader.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.flink; import java.io.Serializable; @@ -32,21 +31,21 @@ import org.apache.iceberg.relocated.com.google.common.base.Preconditions; import org.apache.iceberg.relocated.com.google.common.collect.Maps; -/** - * Serializable loader to load an Iceberg {@link Catalog}. - */ +/** Serializable loader to load an Iceberg {@link Catalog}. */ public interface CatalogLoader extends Serializable { /** - * Create a new catalog with the provided properties. NOTICE: for flink, we may initialize the {@link CatalogLoader} - * at flink sql client side or job manager side, and then serialize this catalog loader to task manager, finally - * deserialize it and create a new catalog at task manager side. + * Create a new catalog with the provided properties. NOTICE: for flink, we may initialize the + * {@link CatalogLoader} at flink sql client side or job manager side, and then serialize this + * catalog loader to task manager, finally deserialize it and create a new catalog at task manager + * side. 
* * @return a newly created {@link Catalog} */ Catalog loadCatalog(); - static CatalogLoader hadoop(String name, Configuration hadoopConf, Map properties) { + static CatalogLoader hadoop( + String name, Configuration hadoopConf, Map properties) { return new HadoopCatalogLoader(name, hadoopConf, properties); } @@ -54,7 +53,8 @@ static CatalogLoader hive(String name, Configuration hadoopConf, Map properties, Configuration hadoopConf, String impl) { + static CatalogLoader custom( + String name, Map properties, Configuration hadoopConf, String impl) { return new CustomCatalogLoader(name, properties, hadoopConf, impl); } @@ -65,9 +65,7 @@ class HadoopCatalogLoader implements CatalogLoader { private final Map properties; private HadoopCatalogLoader( - String catalogName, - Configuration conf, - Map properties) { + String catalogName, Configuration conf, Map properties) { this.catalogName = catalogName; this.hadoopConf = new SerializableConfiguration(conf); this.warehouseLocation = properties.get(CatalogProperties.WAREHOUSE_LOCATION); @@ -76,7 +74,8 @@ private HadoopCatalogLoader( @Override public Catalog loadCatalog() { - return CatalogUtil.loadCatalog(HadoopCatalog.class.getName(), catalogName, properties, hadoopConf.get()); + return CatalogUtil.loadCatalog( + HadoopCatalog.class.getName(), catalogName, properties, hadoopConf.get()); } @Override @@ -96,20 +95,23 @@ class HiveCatalogLoader implements CatalogLoader { private final int clientPoolSize; private final Map properties; - private HiveCatalogLoader(String catalogName, Configuration conf, Map properties) { + private HiveCatalogLoader( + String catalogName, Configuration conf, Map properties) { this.catalogName = catalogName; this.hadoopConf = new SerializableConfiguration(conf); this.uri = properties.get(CatalogProperties.URI); this.warehouse = properties.get(CatalogProperties.WAREHOUSE_LOCATION); - this.clientPoolSize = properties.containsKey(CatalogProperties.CLIENT_POOL_SIZE) ? - Integer.parseInt(properties.get(CatalogProperties.CLIENT_POOL_SIZE)) : - CatalogProperties.CLIENT_POOL_SIZE_DEFAULT; + this.clientPoolSize = + properties.containsKey(CatalogProperties.CLIENT_POOL_SIZE) + ? 
Integer.parseInt(properties.get(CatalogProperties.CLIENT_POOL_SIZE)) + : CatalogProperties.CLIENT_POOL_SIZE_DEFAULT; this.properties = Maps.newHashMap(properties); } @Override public Catalog loadCatalog() { - return CatalogUtil.loadCatalog(HiveCatalog.class.getName(), catalogName, properties, hadoopConf.get()); + return CatalogUtil.loadCatalog( + HiveCatalog.class.getName(), catalogName, properties, hadoopConf.get()); } @Override @@ -131,14 +133,13 @@ class CustomCatalogLoader implements CatalogLoader { private final String impl; private CustomCatalogLoader( - String name, - Map properties, - Configuration conf, - String impl) { + String name, Map properties, Configuration conf, String impl) { this.hadoopConf = new SerializableConfiguration(conf); this.properties = Maps.newHashMap(properties); // wrap into a hashmap for serialization this.name = name; - this.impl = Preconditions.checkNotNull(impl, "Cannot initialize custom Catalog, impl class name is null"); + this.impl = + Preconditions.checkNotNull( + impl, "Cannot initialize custom Catalog, impl class name is null"); } @Override @@ -148,11 +149,7 @@ public Catalog loadCatalog() { @Override public String toString() { - return MoreObjects.toStringHelper(this) - .add("name", name) - .add("impl", impl) - .toString(); + return MoreObjects.toStringHelper(this).add("name", name).add("impl", impl).toString(); } } - } diff --git a/flink/v1.15/flink/src/main/java/org/apache/iceberg/flink/FlinkCatalog.java b/flink/v1.15/flink/src/main/java/org/apache/iceberg/flink/FlinkCatalog.java index 678a5a9c0dd7..8779e4656da5 100644 --- a/flink/v1.15/flink/src/main/java/org/apache/iceberg/flink/FlinkCatalog.java +++ b/flink/v1.15/flink/src/main/java/org/apache/iceberg/flink/FlinkCatalog.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.flink; import java.io.Closeable; @@ -80,13 +79,14 @@ /** * A Flink Catalog implementation that wraps an Iceberg {@link Catalog}. - *

- * The mapping between Flink database and Iceberg namespace:
- * Supplying a base namespace for a given catalog, so if you have a catalog that supports a 2-level namespace, you
- * would supply the first level in the catalog configuration and the second level would be exposed as Flink databases.
- * <p>
- * The Iceberg table manages its partitions by itself. The partition of the Iceberg table is independent of the
- * partition of Flink.
+ *
+ * <p>The mapping between Flink database and Iceberg namespace: Supplying a base namespace for a
+ * given catalog, so if you have a catalog that supports a 2-level namespace, you would supply the
+ * first level in the catalog configuration and the second level would be exposed as Flink
+ * databases.
+ *
+ * <p>
    The Iceberg table manages its partitions by itself. The partition of the Iceberg table is + * independent of the partition of Flink. */ public class FlinkCatalog extends AbstractCatalog { @@ -110,7 +110,8 @@ public FlinkCatalog( Catalog originalCatalog = catalogLoader.loadCatalog(); icebergCatalog = cacheEnabled ? CachingCatalog.wrap(originalCatalog) : originalCatalog; - asNamespaceCatalog = originalCatalog instanceof SupportsNamespaces ? (SupportsNamespaces) originalCatalog : null; + asNamespaceCatalog = + originalCatalog instanceof SupportsNamespaces ? (SupportsNamespaces) originalCatalog : null; closeable = originalCatalog instanceof Closeable ? (Closeable) originalCatalog : null; } @@ -162,7 +163,8 @@ public List listDatabases() throws CatalogException { } @Override - public CatalogDatabase getDatabase(String databaseName) throws DatabaseNotExistException, CatalogException { + public CatalogDatabase getDatabase(String databaseName) + throws DatabaseNotExistException, CatalogException { if (asNamespaceCatalog == null) { if (!getDefaultDatabase().equals(databaseName)) { throw new DatabaseNotExistException(getName(), databaseName); @@ -194,10 +196,12 @@ public boolean databaseExists(String databaseName) throws CatalogException { @Override public void createDatabase(String name, CatalogDatabase database, boolean ignoreIfExists) throws DatabaseAlreadyExistException, CatalogException { - createDatabase(name, mergeComment(database.getProperties(), database.getComment()), ignoreIfExists); + createDatabase( + name, mergeComment(database.getProperties(), database.getComment()), ignoreIfExists); } - private void createDatabase(String databaseName, Map metadata, boolean ignoreIfExists) + private void createDatabase( + String databaseName, Map metadata, boolean ignoreIfExists) throws DatabaseAlreadyExistException, CatalogException { if (asNamespaceCatalog != null) { try { @@ -208,7 +212,8 @@ private void createDatabase(String databaseName, Map metadata, b } } } else { - throw new UnsupportedOperationException("Namespaces are not supported by catalog: " + getName()); + throw new UnsupportedOperationException( + "Namespaces are not supported by catalog: " + getName()); } } @@ -257,7 +262,8 @@ public void alterDatabase(String name, CatalogDatabase newDatabase, boolean igno try { Map oldProperties = asNamespaceCatalog.loadNamespaceMetadata(namespace); - Map newProperties = mergeComment(newDatabase.getProperties(), newDatabase.getComment()); + Map newProperties = + mergeComment(newDatabase.getProperties(), newDatabase.getComment()); for (String key : oldProperties.keySet()) { if (!newProperties.containsKey(key)) { @@ -296,7 +302,8 @@ public void alterDatabase(String name, CatalogDatabase newDatabase, boolean igno } @Override - public List listTables(String databaseName) throws DatabaseNotExistException, CatalogException { + public List listTables(String databaseName) + throws DatabaseNotExistException, CatalogException { try { return icebergCatalog.listTables(toNamespace(databaseName)).stream() .map(TableIdentifier::name) @@ -307,7 +314,8 @@ public List listTables(String databaseName) throws DatabaseNotExistExcep } @Override - public CatalogTable getTable(ObjectPath tablePath) throws TableNotExistException, CatalogException { + public CatalogTable getTable(ObjectPath tablePath) + throws TableNotExistException, CatalogException { Table table = loadIcebergTable(tablePath); return toCatalogTable(table); } @@ -361,10 +369,12 @@ public void renameTable(ObjectPath tablePath, String newTableName, boolean 
ignor @Override public void createTable(ObjectPath tablePath, CatalogBaseTable table, boolean ignoreIfExists) throws CatalogException, TableAlreadyExistException { - if (Objects.equals(table.getOptions().get("connector"), FlinkDynamicTableFactory.FACTORY_IDENTIFIER)) { - throw new IllegalArgumentException("Cannot create the table with 'connector'='iceberg' table property in " + - "an iceberg catalog, Please create table with 'connector'='iceberg' property in a non-iceberg catalog or " + - "create table without 'connector'='iceberg' related properties in an iceberg table."); + if (Objects.equals( + table.getOptions().get("connector"), FlinkDynamicTableFactory.FACTORY_IDENTIFIER)) { + throw new IllegalArgumentException( + "Cannot create the table with 'connector'='iceberg' table property in " + + "an iceberg catalog, Please create table with 'connector'='iceberg' property in a non-iceberg catalog or " + + "create table without 'connector'='iceberg' related properties in an iceberg table."); } createIcebergTable(tablePath, table, ignoreIfExists); @@ -389,11 +399,7 @@ void createIcebergTable(ObjectPath tablePath, CatalogBaseTable table, boolean ig try { icebergCatalog.createTable( - toIdentifier(tablePath), - icebergSchema, - spec, - location, - properties.build()); + toIdentifier(tablePath), icebergSchema, spec, location, properties.build()); } catch (AlreadyExistsException e) { if (!ignoreIfExists) { throw new TableAlreadyExistException(getName(), tablePath, e); @@ -408,15 +414,16 @@ private static void validateTableSchemaAndPartition(CatalogTable ct1, CatalogTab if (ts1.getPrimaryKey().isPresent() && ts2.getPrimaryKey().isPresent()) { equalsPrimary = - Objects.equals(ts1.getPrimaryKey().get().getType(), ts2.getPrimaryKey().get().getType()) && - Objects.equals(ts1.getPrimaryKey().get().getColumns(), ts2.getPrimaryKey().get().getColumns()); + Objects.equals(ts1.getPrimaryKey().get().getType(), ts2.getPrimaryKey().get().getType()) + && Objects.equals( + ts1.getPrimaryKey().get().getColumns(), ts2.getPrimaryKey().get().getColumns()); } else if (!ts1.getPrimaryKey().isPresent() && !ts2.getPrimaryKey().isPresent()) { equalsPrimary = true; } - if (!(Objects.equals(ts1.getTableColumns(), ts2.getTableColumns()) && - Objects.equals(ts1.getWatermarkSpecs(), ts2.getWatermarkSpecs()) && - equalsPrimary)) { + if (!(Objects.equals(ts1.getTableColumns(), ts2.getTableColumns()) + && Objects.equals(ts1.getWatermarkSpecs(), ts2.getWatermarkSpecs()) + && equalsPrimary)) { throw new UnsupportedOperationException("Altering schema is not supported yet."); } @@ -445,7 +452,8 @@ public void alterTable(ObjectPath tablePath, CatalogBaseTable newTable, boolean // Currently, Flink SQL only support altering table properties. - // For current Flink Catalog API, support for adding/removing/renaming columns cannot be done by comparing + // For current Flink Catalog API, support for adding/removing/renaming columns cannot be done by + // comparing // CatalogTable instances, unless the Flink schema contains Iceberg column IDs. 
validateTableSchemaAndPartition(table, (CatalogTable) newTable); @@ -475,27 +483,36 @@ public void alterTable(ObjectPath tablePath, CatalogBaseTable newTable, boolean } } - oldProperties.keySet().forEach(k -> { - if (!newTable.getOptions().containsKey(k)) { - setProperties.put(k, null); - } - }); + oldProperties + .keySet() + .forEach( + k -> { + if (!newTable.getOptions().containsKey(k)) { + setProperties.put(k, null); + } + }); commitChanges(icebergTable, setLocation, setSnapshotId, pickSnapshotId, setProperties); } private static void validateFlinkTable(CatalogBaseTable table) { - Preconditions.checkArgument(table instanceof CatalogTable, "The Table should be a CatalogTable."); + Preconditions.checkArgument( + table instanceof CatalogTable, "The Table should be a CatalogTable."); TableSchema schema = table.getSchema(); - schema.getTableColumns().forEach(column -> { - if (!FlinkCompatibilityUtil.isPhysicalColumn(column)) { - throw new UnsupportedOperationException("Creating table with computed columns is not supported yet."); - } - }); + schema + .getTableColumns() + .forEach( + column -> { + if (!FlinkCompatibilityUtil.isPhysicalColumn(column)) { + throw new UnsupportedOperationException( + "Creating table with computed columns is not supported yet."); + } + }); if (!schema.getWatermarkSpecs().isEmpty()) { - throw new UnsupportedOperationException("Creating table with watermark specs is not supported yet."); + throw new UnsupportedOperationException( + "Creating table with watermark specs is not supported yet."); } } @@ -520,11 +537,17 @@ private static List toPartitionKeys(PartitionSpec spec, Schema icebergSc return partitionKeysBuilder.build(); } - private static void commitChanges(Table table, String setLocation, String setSnapshotId, - String pickSnapshotId, Map setProperties) { - // don't allow setting the snapshot and picking a commit at the same time because order is ambiguous and choosing + private static void commitChanges( + Table table, + String setLocation, + String setSnapshotId, + String pickSnapshotId, + Map setProperties) { + // don't allow setting the snapshot and picking a commit at the same time because order is + // ambiguous and choosing // one order leads to different results - Preconditions.checkArgument(setSnapshotId == null || pickSnapshotId == null, + Preconditions.checkArgument( + setSnapshotId == null || pickSnapshotId == null, "Cannot set the current snapshot ID and cherry-pick snapshot changes"); if (setSnapshotId != null) { @@ -541,20 +564,19 @@ private static void commitChanges(Table table, String setLocation, String setSna Transaction transaction = table.newTransaction(); if (setLocation != null) { - transaction.updateLocation() - .setLocation(setLocation) - .commit(); + transaction.updateLocation().setLocation(setLocation).commit(); } if (!setProperties.isEmpty()) { UpdateProperties updateProperties = transaction.updateProperties(); - setProperties.forEach((k, v) -> { - if (v == null) { - updateProperties.remove(k); - } else { - updateProperties.set(k, v); - } - }); + setProperties.forEach( + (k, v) -> { + if (v == null) { + updateProperties.remove(k); + } else { + updateProperties.set(k, v); + } + }); updateProperties.commit(); } @@ -565,7 +587,8 @@ static CatalogTable toCatalogTable(Table table) { TableSchema schema = FlinkSchemaUtil.toSchema(table.schema()); List partitionKeys = toPartitionKeys(table.spec(), table.schema()); - // NOTE: We can not create a IcebergCatalogTable extends CatalogTable, because Flink optimizer may use + // NOTE: We can not 
create a IcebergCatalogTable extends CatalogTable, because Flink optimizer + // may use // CatalogTableImpl to copy a new catalog table. // Let's re-loading table from Iceberg catalog when creating source/sink operators. // Iceberg does not have Table comment, so pass a null (Default comment value in Flink). @@ -581,7 +604,8 @@ CatalogLoader getCatalogLoader() { return catalogLoader; } - // ------------------------------ Unsupported methods --------------------------------------------- + // ------------------------------ Unsupported methods + // --------------------------------------------- @Override public List listViews(String databaseName) throws CatalogException { @@ -595,25 +619,35 @@ public CatalogPartition getPartition(ObjectPath tablePath, CatalogPartitionSpec } @Override - public boolean partitionExists(ObjectPath tablePath, CatalogPartitionSpec partitionSpec) throws CatalogException { + public boolean partitionExists(ObjectPath tablePath, CatalogPartitionSpec partitionSpec) + throws CatalogException { throw new UnsupportedOperationException(); } @Override - public void createPartition(ObjectPath tablePath, CatalogPartitionSpec partitionSpec, CatalogPartition partition, - boolean ignoreIfExists) throws CatalogException { + public void createPartition( + ObjectPath tablePath, + CatalogPartitionSpec partitionSpec, + CatalogPartition partition, + boolean ignoreIfExists) + throws CatalogException { throw new UnsupportedOperationException(); } @Override - public void dropPartition(ObjectPath tablePath, CatalogPartitionSpec partitionSpec, boolean ignoreIfNotExists) + public void dropPartition( + ObjectPath tablePath, CatalogPartitionSpec partitionSpec, boolean ignoreIfNotExists) throws CatalogException { throw new UnsupportedOperationException(); } @Override - public void alterPartition(ObjectPath tablePath, CatalogPartitionSpec partitionSpec, CatalogPartition newPartition, - boolean ignoreIfNotExists) throws CatalogException { + public void alterPartition( + ObjectPath tablePath, + CatalogPartitionSpec partitionSpec, + CatalogPartition newPartition, + boolean ignoreIfNotExists) + throws CatalogException { throw new UnsupportedOperationException(); } @@ -623,7 +657,8 @@ public List listFunctions(String dbName) throws CatalogException { } @Override - public CatalogFunction getFunction(ObjectPath functionPath) throws FunctionNotExistException, CatalogException { + public CatalogFunction getFunction(ObjectPath functionPath) + throws FunctionNotExistException, CatalogException { throw new FunctionNotExistException(getName(), functionPath); } @@ -633,13 +668,15 @@ public boolean functionExists(ObjectPath functionPath) throws CatalogException { } @Override - public void createFunction(ObjectPath functionPath, CatalogFunction function, boolean ignoreIfExists) + public void createFunction( + ObjectPath functionPath, CatalogFunction function, boolean ignoreIfExists) throws CatalogException { throw new UnsupportedOperationException(); } @Override - public void alterFunction(ObjectPath functionPath, CatalogFunction newFunction, boolean ignoreIfNotExists) + public void alterFunction( + ObjectPath functionPath, CatalogFunction newFunction, boolean ignoreIfNotExists) throws CatalogException { throw new UnsupportedOperationException(); } @@ -651,26 +688,36 @@ public void dropFunction(ObjectPath functionPath, boolean ignoreIfNotExists) } @Override - public void alterTableStatistics(ObjectPath tablePath, CatalogTableStatistics tableStatistics, - boolean ignoreIfNotExists) throws CatalogException { + 
public void alterTableStatistics( + ObjectPath tablePath, CatalogTableStatistics tableStatistics, boolean ignoreIfNotExists) + throws CatalogException { throw new UnsupportedOperationException(); } @Override - public void alterTableColumnStatistics(ObjectPath tablePath, CatalogColumnStatistics columnStatistics, - boolean ignoreIfNotExists) throws CatalogException { + public void alterTableColumnStatistics( + ObjectPath tablePath, CatalogColumnStatistics columnStatistics, boolean ignoreIfNotExists) + throws CatalogException { throw new UnsupportedOperationException(); } @Override - public void alterPartitionStatistics(ObjectPath tablePath, CatalogPartitionSpec partitionSpec, - CatalogTableStatistics partitionStatistics, boolean ignoreIfNotExists) throws CatalogException { + public void alterPartitionStatistics( + ObjectPath tablePath, + CatalogPartitionSpec partitionSpec, + CatalogTableStatistics partitionStatistics, + boolean ignoreIfNotExists) + throws CatalogException { throw new UnsupportedOperationException(); } @Override - public void alterPartitionColumnStatistics(ObjectPath tablePath, CatalogPartitionSpec partitionSpec, - CatalogColumnStatistics columnStatistics, boolean ignoreIfNotExists) throws CatalogException { + public void alterPartitionColumnStatistics( + ObjectPath tablePath, + CatalogPartitionSpec partitionSpec, + CatalogColumnStatistics columnStatistics, + boolean ignoreIfNotExists) + throws CatalogException { throw new UnsupportedOperationException(); } @@ -695,31 +742,32 @@ public List listPartitions(ObjectPath tablePath) set.add(new CatalogPartitionSpec(map)); } } catch (IOException e) { - throw new CatalogException(String.format("Failed to list partitions of table %s", tablePath), e); + throw new CatalogException( + String.format("Failed to list partitions of table %s", tablePath), e); } return Lists.newArrayList(set); } @Override - public List listPartitions(ObjectPath tablePath, CatalogPartitionSpec partitionSpec) - throws CatalogException { + public List listPartitions( + ObjectPath tablePath, CatalogPartitionSpec partitionSpec) throws CatalogException { throw new UnsupportedOperationException(); } @Override - public List listPartitionsByFilter(ObjectPath tablePath, List filters) - throws CatalogException { + public List listPartitionsByFilter( + ObjectPath tablePath, List filters) throws CatalogException { throw new UnsupportedOperationException(); } - // After partition pruning and filter push down, the statistics have become very inaccurate, so the statistics from + // After partition pruning and filter push down, the statistics have become very inaccurate, so + // the statistics from // here are of little significance. // Flink will support something like SupportsReportStatistics in future. 
@Override - public CatalogTableStatistics getTableStatistics(ObjectPath tablePath) - throws CatalogException { + public CatalogTableStatistics getTableStatistics(ObjectPath tablePath) throws CatalogException { return CatalogTableStatistics.UNKNOWN; } @@ -730,14 +778,14 @@ public CatalogColumnStatistics getTableColumnStatistics(ObjectPath tablePath) } @Override - public CatalogTableStatistics getPartitionStatistics(ObjectPath tablePath, CatalogPartitionSpec partitionSpec) - throws CatalogException { + public CatalogTableStatistics getPartitionStatistics( + ObjectPath tablePath, CatalogPartitionSpec partitionSpec) throws CatalogException { return CatalogTableStatistics.UNKNOWN; } @Override - public CatalogColumnStatistics getPartitionColumnStatistics(ObjectPath tablePath, CatalogPartitionSpec partitionSpec) - throws CatalogException { + public CatalogColumnStatistics getPartitionColumnStatistics( + ObjectPath tablePath, CatalogPartitionSpec partitionSpec) throws CatalogException { return CatalogColumnStatistics.UNKNOWN; } } diff --git a/flink/v1.15/flink/src/main/java/org/apache/iceberg/flink/FlinkCatalogFactory.java b/flink/v1.15/flink/src/main/java/org/apache/iceberg/flink/FlinkCatalogFactory.java index ffa54c0eb95d..1047a5067d4c 100644 --- a/flink/v1.15/flink/src/main/java/org/apache/iceberg/flink/FlinkCatalogFactory.java +++ b/flink/v1.15/flink/src/main/java/org/apache/iceberg/flink/FlinkCatalogFactory.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.flink; import java.net.URL; @@ -40,20 +39,22 @@ /** * A Flink Catalog factory implementation that creates {@link FlinkCatalog}. - *

- * This supports the following catalog configuration options:
+ *
+ * <p>This supports the following catalog configuration options:
+ *
  * <ul>
- * <li>type - Flink catalog factory key, should be "iceberg"</li>
- * <li>catalog-type - iceberg catalog type, "hive" or "hadoop"</li>
- * <li>uri - the Hive Metastore URI (Hive catalog only)</li>
- * <li>clients - the Hive Client Pool Size (Hive catalog only)</li>
- * <li>warehouse - the warehouse path (Hadoop catalog only)</li>
- * <li>default-database - a database name to use as the default</li>
- * <li>base-namespace - a base namespace as the prefix for all databases (Hadoop catalog only)</li>
- * <li>cache-enabled - whether to enable catalog cache</li>
+ *   <li>type - Flink catalog factory key, should be "iceberg"
+ *   <li>catalog-type - iceberg catalog type, "hive" or "hadoop"
+ *   <li>uri - the Hive Metastore URI (Hive catalog only)
+ *   <li>clients - the Hive Client Pool Size (Hive catalog only)
+ *   <li>warehouse - the warehouse path (Hadoop catalog only)
+ *   <li>default-database - a database name to use as the default
+ *   <li>base-namespace - a base namespace as the prefix for all databases (Hadoop
+ *       catalog only)
+ *   <li>cache-enabled - whether to enable catalog cache
  * </ul>
- * <p>
- * To use a custom catalog that is not a Hive or Hadoop catalog, extend this class and override
+ *
+ * <p>
    To use a custom catalog that is not a Hive or Hadoop catalog, extend this class and override * {@link #createCatalogLoader(String, Map, Configuration)}. */ public class FlinkCatalogFactory implements CatalogFactory { @@ -73,27 +74,33 @@ public class FlinkCatalogFactory implements CatalogFactory { public static final String PROPERTY_VERSION = "property-version"; /** - * Create an Iceberg {@link org.apache.iceberg.catalog.Catalog} loader to be used by this Flink catalog adapter. + * Create an Iceberg {@link org.apache.iceberg.catalog.Catalog} loader to be used by this Flink + * catalog adapter. * - * @param name Flink's catalog name + * @param name Flink's catalog name * @param properties Flink's catalog properties * @param hadoopConf Hadoop configuration for catalog * @return an Iceberg catalog loader */ - static CatalogLoader createCatalogLoader(String name, Map properties, Configuration hadoopConf) { + static CatalogLoader createCatalogLoader( + String name, Map properties, Configuration hadoopConf) { String catalogImpl = properties.get(CatalogProperties.CATALOG_IMPL); if (catalogImpl != null) { String catalogType = properties.get(ICEBERG_CATALOG_TYPE); - Preconditions.checkArgument(catalogType == null, + Preconditions.checkArgument( + catalogType == null, "Cannot create catalog %s, both catalog-type and catalog-impl are set: catalog-type=%s, catalog-impl=%s", - name, catalogType, catalogImpl); + name, + catalogType, + catalogImpl); return CatalogLoader.custom(name, properties, hadoopConf, catalogImpl); } String catalogType = properties.getOrDefault(ICEBERG_CATALOG_TYPE, ICEBERG_CATALOG_TYPE_HIVE); switch (catalogType.toLowerCase(Locale.ENGLISH)) { case ICEBERG_CATALOG_TYPE_HIVE: - // The values of properties 'uri', 'warehouse', 'hive-conf-dir' are allowed to be null, in that case it will + // The values of properties 'uri', 'warehouse', 'hive-conf-dir' are allowed to be null, in + // that case it will // fallback to parse those values from hadoop configuration which is loaded from classpath. 
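// Illustrative sketch, not part of this change: how the options listed in the FlinkCatalogFactory
// javadoc above are typically supplied through Flink's CREATE CATALOG DDL. Assumes Flink 1.15 with
// the Iceberg flink-runtime jar on the classpath; the catalog name and warehouse path are
// placeholder values.
import org.apache.flink.table.api.EnvironmentSettings;
import org.apache.flink.table.api.TableEnvironment;

public class IcebergCatalogSqlSketch {
  public static void main(String[] args) {
    TableEnvironment tEnv = TableEnvironment.create(EnvironmentSettings.inStreamingMode());
    // 'type'='iceberg' selects this factory; 'catalog-type'='hadoop' resolves to
    // CatalogLoader.hadoop(...) with the given 'warehouse'; 'cache-enabled' controls the
    // CachingCatalog.wrap(...) call seen in the FlinkCatalog constructor above.
    tEnv.executeSql(
        "CREATE CATALOG iceberg_hadoop WITH ("
            + "'type'='iceberg',"
            + "'catalog-type'='hadoop',"
            + "'warehouse'='file:///tmp/iceberg/warehouse',"
            + "'cache-enabled'='true')");
    tEnv.executeSql("USE CATALOG iceberg_hadoop");
  }
}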
String hiveConfDir = properties.get(HIVE_CONF_DIR); Configuration newHadoopConf = mergeHiveConf(hadoopConf, hiveConfDir); @@ -103,8 +110,8 @@ static CatalogLoader createCatalogLoader(String name, Map proper return CatalogLoader.hadoop(name, hadoopConf, properties); default: - throw new UnsupportedOperationException("Unknown catalog-type: " + catalogType + - " (Must be 'hive' or 'hadoop')"); + throw new UnsupportedOperationException( + "Unknown catalog-type: " + catalogType + " (Must be 'hive' or 'hadoop')"); } } @@ -126,7 +133,8 @@ public Catalog createCatalog(String name, Map properties) { return createCatalog(name, properties, clusterHadoopConf()); } - protected Catalog createCatalog(String name, Map properties, Configuration hadoopConf) { + protected Catalog createCatalog( + String name, Map properties, Configuration hadoopConf) { CatalogLoader catalogLoader = createCatalogLoader(name, properties, hadoopConf); String defaultDatabase = properties.getOrDefault(DEFAULT_DATABASE, DEFAULT_DATABASE_NAME); @@ -142,11 +150,14 @@ protected Catalog createCatalog(String name, Map properties, Con private static Configuration mergeHiveConf(Configuration hadoopConf, String hiveConfDir) { Configuration newConf = new Configuration(hadoopConf); if (!Strings.isNullOrEmpty(hiveConfDir)) { - Preconditions.checkState(Files.exists(Paths.get(hiveConfDir, "hive-site.xml")), - "There should be a hive-site.xml file under the directory %s", hiveConfDir); + Preconditions.checkState( + Files.exists(Paths.get(hiveConfDir, "hive-site.xml")), + "There should be a hive-site.xml file under the directory %s", + hiveConfDir); newConf.addResource(new Path(hiveConfDir, "hive-site.xml")); } else { - // If don't provide the hive-site.xml path explicitly, it will try to load resource from classpath. If still + // If don't provide the hive-site.xml path explicitly, it will try to load resource from + // classpath. If still // couldn't load the configuration file, then it will throw exception in HiveCatalog. URL configFile = CatalogLoader.class.getClassLoader().getResource("hive-site.xml"); if (configFile != null) { diff --git a/flink/v1.15/flink/src/main/java/org/apache/iceberg/flink/FlinkConfParser.java b/flink/v1.15/flink/src/main/java/org/apache/iceberg/flink/FlinkConfParser.java index e984f6875920..83fa09de544c 100644 --- a/flink/v1.15/flink/src/main/java/org/apache/iceberg/flink/FlinkConfParser.java +++ b/flink/v1.15/flink/src/main/java/org/apache/iceberg/flink/FlinkConfParser.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.flink; import java.util.List; diff --git a/flink/v1.15/flink/src/main/java/org/apache/iceberg/flink/FlinkConfigOptions.java b/flink/v1.15/flink/src/main/java/org/apache/iceberg/flink/FlinkConfigOptions.java index 796a709d51e0..b57bf03d3379 100644 --- a/flink/v1.15/flink/src/main/java/org/apache/iceberg/flink/FlinkConfigOptions.java +++ b/flink/v1.15/flink/src/main/java/org/apache/iceberg/flink/FlinkConfigOptions.java @@ -16,10 +16,8 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.flink; - import org.apache.flink.configuration.ConfigOption; import org.apache.flink.configuration.ConfigOptions; import org.apache.flink.configuration.Configuration; @@ -27,8 +25,9 @@ import org.apache.iceberg.util.ThreadPools; /** - * When constructing Flink Iceberg source via Java API, - * configs can be set in {@link Configuration} passed to source builder. E.g. 
+ * When constructing Flink Iceberg source via Java API, configs can be set in {@link Configuration}
+ * passed to source builder. E.g.
+ *
+ * <pre>
  *   configuration.setBoolean(FlinkConfigOptions.TABLE_EXEC_ICEBERG_INFER_SOURCE_PARALLELISM, true);
  *   FlinkSource.forRowData()
@@ -36,9 +35,9 @@
  *       ...
  * </pre>
  *
- * <p>
- * When using Flink SQL/table API, connector options can be set in Flink's {@link TableEnvironment}.
- * <pre>
+ * <p>When using Flink SQL/table API, connector options can be set in Flink's {@link
+ * TableEnvironment}.
+ *
+ * <pre>
  *   TableEnvironment tEnv = createTableEnv();
  *   tEnv.getConfig()
@@ -48,15 +47,15 @@
  */
     public class FlinkConfigOptions {
     
    -  private FlinkConfigOptions() {
    -  }
    +  private FlinkConfigOptions() {}
     
   public static final ConfigOption<Boolean> TABLE_EXEC_ICEBERG_INFER_SOURCE_PARALLELISM =
           ConfigOptions.key("table.exec.iceberg.infer-source-parallelism")
               .booleanType()
               .defaultValue(true)
    -          .withDescription("If is false, parallelism of source are set by config.\n" +
    -              "If is true, source parallelism is inferred according to splits number.\n");
    +          .withDescription(
    +              "If is false, parallelism of source are set by config.\n"
    +                  + "If is true, source parallelism is inferred according to splits number.\n");
     
   public static final ConfigOption<Integer> TABLE_EXEC_ICEBERG_INFER_SOURCE_PARALLELISM_MAX =
           ConfigOptions.key("table.exec.iceberg.infer-source-parallelism.max")
    @@ -68,13 +67,14 @@ private FlinkConfigOptions() {
           ConfigOptions.key("table.exec.iceberg.expose-split-locality-info")
               .booleanType()
               .noDefaultValue()
    -          .withDescription("Expose split host information to use Flink's locality aware split assigner.");
    +          .withDescription(
    +              "Expose split host information to use Flink's locality aware split assigner.");
     
-  public static final ConfigOption<Integer> SOURCE_READER_FETCH_BATCH_RECORD_COUNT = ConfigOptions
    -      .key("table.exec.iceberg.fetch-batch-record-count")
    -      .intType()
    -      .defaultValue(2048)
    -      .withDescription("The target number of records for Iceberg reader fetch batch.");
+  public static final ConfigOption<Integer> SOURCE_READER_FETCH_BATCH_RECORD_COUNT =
    +      ConfigOptions.key("table.exec.iceberg.fetch-batch-record-count")
    +          .intType()
    +          .defaultValue(2048)
    +          .withDescription("The target number of records for Iceberg reader fetch batch.");
     
   public static final ConfigOption<Integer> TABLE_EXEC_ICEBERG_WORKER_POOL_SIZE =
           ConfigOptions.key("table.exec.iceberg.worker-pool-size")
    diff --git a/flink/v1.15/flink/src/main/java/org/apache/iceberg/flink/FlinkDynamicTableFactory.java b/flink/v1.15/flink/src/main/java/org/apache/iceberg/flink/FlinkDynamicTableFactory.java
    index f8250dc48efa..0c3cd3f69afc 100644
    --- a/flink/v1.15/flink/src/main/java/org/apache/iceberg/flink/FlinkDynamicTableFactory.java
    +++ b/flink/v1.15/flink/src/main/java/org/apache/iceberg/flink/FlinkDynamicTableFactory.java
    @@ -16,7 +16,6 @@
      * specific language governing permissions and limitations
      * under the License.
      */
    -
     package org.apache.iceberg.flink;
     
     import java.util.Map;
    @@ -43,7 +42,8 @@
     import org.apache.iceberg.relocated.com.google.common.collect.Maps;
     import org.apache.iceberg.relocated.com.google.common.collect.Sets;
     
    -public class FlinkDynamicTableFactory implements DynamicTableSinkFactory, DynamicTableSourceFactory {
    +public class FlinkDynamicTableFactory
    +    implements DynamicTableSinkFactory, DynamicTableSourceFactory {
       static final String FACTORY_IDENTIFIER = "iceberg";
     
   private static final ConfigOption<String> CATALOG_NAME =
    @@ -91,8 +91,12 @@ public DynamicTableSource createDynamicTableSource(Context context) {
         if (catalog != null) {
           tableLoader = createTableLoader(catalog, objectIdentifier.toObjectPath());
         } else {
    -      tableLoader = createTableLoader(catalogTable, tableProps, objectIdentifier.getDatabaseName(),
    -          objectIdentifier.getObjectName());
    +      tableLoader =
    +          createTableLoader(
    +              catalogTable,
    +              tableProps,
    +              objectIdentifier.getDatabaseName(),
    +              objectIdentifier.getObjectName());
         }
     
         return new IcebergTableSource(tableLoader, tableSchema, tableProps, context.getConfiguration());
    @@ -109,8 +113,9 @@ public DynamicTableSink createDynamicTableSink(Context context) {
         if (catalog != null) {
           tableLoader = createTableLoader(catalog, objectPath);
         } else {
    -      tableLoader = createTableLoader(catalogTable, tableProps, objectPath.getDatabaseName(),
    -          objectPath.getObjectName());
    +      tableLoader =
    +          createTableLoader(
    +              catalogTable, tableProps, objectPath.getDatabaseName(), objectPath.getObjectName());
         }
     
         return new IcebergTableSink(tableLoader, tableSchema, context.getConfiguration());
    @@ -137,15 +142,17 @@ public String factoryIdentifier() {
         return FACTORY_IDENTIFIER;
       }
     
-  private static TableLoader createTableLoader(CatalogBaseTable catalogBaseTable,
-                                               Map<String, String> tableProps,
-                                               String databaseName,
-                                               String tableName) {
+  private static TableLoader createTableLoader(
+      CatalogBaseTable catalogBaseTable,
+      Map<String, String> tableProps,
+      String databaseName,
+      String tableName) {
         Configuration flinkConf = new Configuration();
         tableProps.forEach(flinkConf::setString);
     
         String catalogName = flinkConf.getString(CATALOG_NAME);
    -    Preconditions.checkNotNull(catalogName, "Table property '%s' cannot be null", CATALOG_NAME.key());
    +    Preconditions.checkNotNull(
    +        catalogName, "Table property '%s' cannot be null", CATALOG_NAME.key());
     
         String catalogDatabase = flinkConf.getString(CATALOG_DATABASE, databaseName);
         Preconditions.checkNotNull(catalogDatabase, "The iceberg database name cannot be null");
    @@ -155,15 +162,20 @@ private static TableLoader createTableLoader(CatalogBaseTable catalogBaseTable,
     
         org.apache.hadoop.conf.Configuration hadoopConf = FlinkCatalogFactory.clusterHadoopConf();
         FlinkCatalogFactory factory = new FlinkCatalogFactory();
    -    FlinkCatalog flinkCatalog = (FlinkCatalog) factory.createCatalog(catalogName, tableProps, hadoopConf);
    +    FlinkCatalog flinkCatalog =
    +        (FlinkCatalog) factory.createCatalog(catalogName, tableProps, hadoopConf);
         ObjectPath objectPath = new ObjectPath(catalogDatabase, catalogTable);
     
         // Create database if not exists in the external catalog.
         if (!flinkCatalog.databaseExists(catalogDatabase)) {
           try {
    -        flinkCatalog.createDatabase(catalogDatabase, new CatalogDatabaseImpl(Maps.newHashMap(), null), true);
    +        flinkCatalog.createDatabase(
    +            catalogDatabase, new CatalogDatabaseImpl(Maps.newHashMap(), null), true);
           } catch (DatabaseAlreadyExistException e) {
    -        throw new AlreadyExistsException(e, "Database %s already exists in the iceberg catalog %s.", catalogName,
    +        throw new AlreadyExistsException(
    +            e,
    +            "Database %s already exists in the iceberg catalog %s.",
    +            catalogName,
                 catalogDatabase);
           }
         }
    @@ -173,12 +185,17 @@ private static TableLoader createTableLoader(CatalogBaseTable catalogBaseTable,
           try {
             flinkCatalog.createIcebergTable(objectPath, catalogBaseTable, true);
           } catch (TableAlreadyExistException e) {
    -        throw new AlreadyExistsException(e, "Table %s already exists in the database %s and catalog %s",
    -            catalogTable, catalogDatabase, catalogName);
    +        throw new AlreadyExistsException(
    +            e,
    +            "Table %s already exists in the database %s and catalog %s",
    +            catalogTable,
    +            catalogDatabase,
    +            catalogName);
           }
         }
     
    -    return TableLoader.fromCatalog(flinkCatalog.getCatalogLoader(), TableIdentifier.of(catalogDatabase, catalogTable));
    +    return TableLoader.fromCatalog(
    +        flinkCatalog.getCatalogLoader(), TableIdentifier.of(catalogDatabase, catalogTable));
       }
     
       private static TableLoader createTableLoader(FlinkCatalog catalog, ObjectPath objectPath) {
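Usage sketch (not part of the patch above): FlinkDynamicTableFactory registers itself under the "iceberg" identifier and requires at least the 'catalog-name' option; catalog-database and catalog-table fall back to the Flink object path, and the factory creates the backing Iceberg database and table if they do not exist. A hypothetical Flink SQL session exercising it could look like the following; the Hadoop-catalog options and the warehouse path are illustrative assumptions, not values taken from this patch.

    import org.apache.flink.table.api.EnvironmentSettings;
    import org.apache.flink.table.api.TableEnvironment;

    public class IcebergConnectorSqlExample {
      public static void main(String[] args) throws Exception {
        TableEnvironment tEnv = TableEnvironment.create(EnvironmentSettings.inStreamingMode());

        // 'connector' = 'iceberg' routes the table to FlinkDynamicTableFactory (FACTORY_IDENTIFIER).
        // 'catalog-name' is mandatory; the remaining catalog options are passed to FlinkCatalogFactory.
        tEnv.executeSql(
            "CREATE TABLE flink_managed_tbl (id BIGINT, data STRING) WITH ("
                + " 'connector' = 'iceberg',"
                + " 'catalog-name' = 'hadoop_catalog',"
                + " 'catalog-type' = 'hadoop',"
                + " 'warehouse' = 'file:///tmp/iceberg/warehouse'"
                + ")");

        // Writes go through IcebergTableSink; reads go through IcebergTableSource.
        tEnv.executeSql("INSERT INTO flink_managed_tbl VALUES (1, 'a')").await();
      }
    }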
    diff --git a/flink/v1.15/flink/src/main/java/org/apache/iceberg/flink/FlinkFilters.java b/flink/v1.15/flink/src/main/java/org/apache/iceberg/flink/FlinkFilters.java
    index 5e5c9c1fe0fb..717de9ef5acc 100644
    --- a/flink/v1.15/flink/src/main/java/org/apache/iceberg/flink/FlinkFilters.java
    +++ b/flink/v1.15/flink/src/main/java/org/apache/iceberg/flink/FlinkFilters.java
    @@ -16,7 +16,6 @@
      * specific language governing permissions and limitations
      * under the License.
      */
    -
     package org.apache.iceberg.flink;
     
     import java.time.Instant;
    @@ -43,38 +42,38 @@
     import org.apache.iceberg.util.NaNUtil;
     
     public class FlinkFilters {
    -  private FlinkFilters() {
    -  }
    +  private FlinkFilters() {}
     
       private static final Pattern STARTS_WITH_PATTERN = Pattern.compile("([^%]+)%");
     
-  private static final Map<FunctionDefinition, Operation> FILTERS = ImmutableMap
-      .<FunctionDefinition, Operation>builder()
    -      .put(BuiltInFunctionDefinitions.EQUALS, Operation.EQ)
    -      .put(BuiltInFunctionDefinitions.NOT_EQUALS, Operation.NOT_EQ)
    -      .put(BuiltInFunctionDefinitions.GREATER_THAN, Operation.GT)
    -      .put(BuiltInFunctionDefinitions.GREATER_THAN_OR_EQUAL, Operation.GT_EQ)
    -      .put(BuiltInFunctionDefinitions.LESS_THAN, Operation.LT)
    -      .put(BuiltInFunctionDefinitions.LESS_THAN_OR_EQUAL, Operation.LT_EQ)
    -      .put(BuiltInFunctionDefinitions.IS_NULL, Operation.IS_NULL)
    -      .put(BuiltInFunctionDefinitions.IS_NOT_NULL, Operation.NOT_NULL)
    -      .put(BuiltInFunctionDefinitions.AND, Operation.AND)
    -      .put(BuiltInFunctionDefinitions.OR, Operation.OR)
    -      .put(BuiltInFunctionDefinitions.NOT, Operation.NOT)
    -      .put(BuiltInFunctionDefinitions.LIKE, Operation.STARTS_WITH)
    -      .build();
+  private static final Map<FunctionDefinition, Operation> FILTERS =
+      ImmutableMap.<FunctionDefinition, Operation>builder()
    +          .put(BuiltInFunctionDefinitions.EQUALS, Operation.EQ)
    +          .put(BuiltInFunctionDefinitions.NOT_EQUALS, Operation.NOT_EQ)
    +          .put(BuiltInFunctionDefinitions.GREATER_THAN, Operation.GT)
    +          .put(BuiltInFunctionDefinitions.GREATER_THAN_OR_EQUAL, Operation.GT_EQ)
    +          .put(BuiltInFunctionDefinitions.LESS_THAN, Operation.LT)
    +          .put(BuiltInFunctionDefinitions.LESS_THAN_OR_EQUAL, Operation.LT_EQ)
    +          .put(BuiltInFunctionDefinitions.IS_NULL, Operation.IS_NULL)
    +          .put(BuiltInFunctionDefinitions.IS_NOT_NULL, Operation.NOT_NULL)
    +          .put(BuiltInFunctionDefinitions.AND, Operation.AND)
    +          .put(BuiltInFunctionDefinitions.OR, Operation.OR)
    +          .put(BuiltInFunctionDefinitions.NOT, Operation.NOT)
    +          .put(BuiltInFunctionDefinitions.LIKE, Operation.STARTS_WITH)
    +          .build();
     
       /**
        * Convert flink expression to iceberg expression.
-   * <p>
-   * the BETWEEN, NOT_BETWEEN, IN expression will be converted by flink automatically. the BETWEEN will be converted to
-   * (GT_EQ AND LT_EQ), the NOT_BETWEEN will be converted to (LT_EQ OR GT_EQ), the IN will be converted to OR, so we do
-   * not add the conversion here
+   *
+   * <p>
    the BETWEEN, NOT_BETWEEN, IN expression will be converted by flink automatically. the + * BETWEEN will be converted to (GT_EQ AND LT_EQ), the NOT_BETWEEN will be converted to (LT_EQ OR + * GT_EQ), the IN will be converted to OR, so we do not add the conversion here * * @param flinkExpression the flink expression * @return the iceberg expression */ - public static Optional convert(org.apache.flink.table.expressions.Expression flinkExpression) { + public static Optional convert( + org.apache.flink.table.expressions.Expression flinkExpression) { if (!(flinkExpression instanceof CallExpression)) { return Optional.empty(); } @@ -97,34 +96,42 @@ public static Optional convert(org.apache.flink.table.expressions.Ex return convertFieldAndLiteral(Expressions::lessThan, Expressions::greaterThan, call); case LT_EQ: - return convertFieldAndLiteral(Expressions::lessThanOrEqual, Expressions::greaterThanOrEqual, call); + return convertFieldAndLiteral( + Expressions::lessThanOrEqual, Expressions::greaterThanOrEqual, call); case GT: return convertFieldAndLiteral(Expressions::greaterThan, Expressions::lessThan, call); case GT_EQ: - return convertFieldAndLiteral(Expressions::greaterThanOrEqual, Expressions::lessThanOrEqual, call); + return convertFieldAndLiteral( + Expressions::greaterThanOrEqual, Expressions::lessThanOrEqual, call); case EQ: - return convertFieldAndLiteral((ref, lit) -> { - if (NaNUtil.isNaN(lit)) { - return Expressions.isNaN(ref); - } else { - return Expressions.equal(ref, lit); - } - }, call); + return convertFieldAndLiteral( + (ref, lit) -> { + if (NaNUtil.isNaN(lit)) { + return Expressions.isNaN(ref); + } else { + return Expressions.equal(ref, lit); + } + }, + call); case NOT_EQ: - return convertFieldAndLiteral((ref, lit) -> { - if (NaNUtil.isNaN(lit)) { - return Expressions.notNaN(ref); - } else { - return Expressions.notEqual(ref, lit); - } - }, call); + return convertFieldAndLiteral( + (ref, lit) -> { + if (NaNUtil.isNaN(lit)) { + return Expressions.notNaN(ref); + } else { + return Expressions.notEqual(ref, lit); + } + }, + call); case NOT: - return onlyChildAs(call, CallExpression.class).flatMap(FlinkFilters::convert).map(Expressions::not); + return onlyChildAs(call, CallExpression.class) + .flatMap(FlinkFilters::convert) + .map(Expressions::not); case AND: return convertLogicExpression(Expressions::and, call); @@ -140,8 +147,8 @@ public static Optional convert(org.apache.flink.table.expressions.Ex return Optional.empty(); } - private static Optional onlyChildAs(CallExpression call, - Class expectedChildClass) { + private static Optional onlyChildAs( + CallExpression call, Class expectedChildClass) { List children = call.getResolvedChildren(); if (children.size() != 1) { return Optional.empty(); @@ -166,26 +173,28 @@ private static Optional convertLike(CallExpression call) { if (left instanceof FieldReferenceExpression && right instanceof ValueLiteralExpression) { String name = ((FieldReferenceExpression) left).getName(); - return convertLiteral((ValueLiteralExpression) right).flatMap(lit -> { - if (lit instanceof String) { - String pattern = (String) lit; - Matcher matcher = STARTS_WITH_PATTERN.matcher(pattern); - // exclude special char of LIKE - // '_' is the wildcard of the SQL LIKE - if (!pattern.contains("_") && matcher.matches()) { - return Optional.of(Expressions.startsWith(name, matcher.group(1))); - } - } - - return Optional.empty(); - }); + return convertLiteral((ValueLiteralExpression) right) + .flatMap( + lit -> { + if (lit instanceof String) { + String pattern = 
(String) lit; + Matcher matcher = STARTS_WITH_PATTERN.matcher(pattern); + // exclude special char of LIKE + // '_' is the wildcard of the SQL LIKE + if (!pattern.contains("_") && matcher.matches()) { + return Optional.of(Expressions.startsWith(name, matcher.group(1))); + } + } + + return Optional.empty(); + }); } return Optional.empty(); } - private static Optional convertLogicExpression(BiFunction function, - CallExpression call) { + private static Optional convertLogicExpression( + BiFunction function, CallExpression call) { List args = call.getResolvedChildren(); if (args == null || args.size() != 2) { return Optional.empty(); @@ -201,29 +210,33 @@ private static Optional convertLogicExpression(BiFunction convertLiteral(ValueLiteralExpression expression) { - Optional value = expression.getValueAs(expression.getOutputDataType().getLogicalType().getDefaultConversion()); - return value.map(o -> { - if (o instanceof LocalDateTime) { - return DateTimeUtil.microsFromTimestamp((LocalDateTime) o); - } else if (o instanceof Instant) { - return DateTimeUtil.microsFromInstant((Instant) o); - } else if (o instanceof LocalTime) { - return DateTimeUtil.microsFromTime((LocalTime) o); - } else if (o instanceof LocalDate) { - return DateTimeUtil.daysFromDate((LocalDate) o); - } + Optional value = + expression.getValueAs( + expression.getOutputDataType().getLogicalType().getDefaultConversion()); + return value.map( + o -> { + if (o instanceof LocalDateTime) { + return DateTimeUtil.microsFromTimestamp((LocalDateTime) o); + } else if (o instanceof Instant) { + return DateTimeUtil.microsFromInstant((Instant) o); + } else if (o instanceof LocalTime) { + return DateTimeUtil.microsFromTime((LocalTime) o); + } else if (o instanceof LocalDate) { + return DateTimeUtil.daysFromDate((LocalDate) o); + } - return o; - }); + return o; + }); } - private static Optional convertFieldAndLiteral(BiFunction expr, - CallExpression call) { + private static Optional convertFieldAndLiteral( + BiFunction expr, CallExpression call) { return convertFieldAndLiteral(expr, expr, call); } private static Optional convertFieldAndLiteral( - BiFunction convertLR, BiFunction convertRL, + BiFunction convertLR, + BiFunction convertRL, CallExpression call) { List args = call.getResolvedChildren(); if (args.size() != 2) { @@ -239,7 +252,8 @@ private static Optional convertFieldAndLiteral( if (lit.isPresent()) { return Optional.of(convertLR.apply(name, lit.get())); } - } else if (left instanceof ValueLiteralExpression && right instanceof FieldReferenceExpression) { + } else if (left instanceof ValueLiteralExpression + && right instanceof FieldReferenceExpression) { Optional lit = convertLiteral((ValueLiteralExpression) left); String name = ((FieldReferenceExpression) right).getName(); if (lit.isPresent()) { diff --git a/flink/v1.15/flink/src/main/java/org/apache/iceberg/flink/FlinkFixupTypes.java b/flink/v1.15/flink/src/main/java/org/apache/iceberg/flink/FlinkFixupTypes.java index 6501c0226e44..767d4497ac91 100644 --- a/flink/v1.15/flink/src/main/java/org/apache/iceberg/flink/FlinkFixupTypes.java +++ b/flink/v1.15/flink/src/main/java/org/apache/iceberg/flink/FlinkFixupTypes.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.flink; import org.apache.iceberg.Schema; @@ -36,8 +35,8 @@ private FlinkFixupTypes(Schema referenceSchema) { } static Schema fixup(Schema schema, Schema referenceSchema) { - return new Schema(TypeUtil.visit(schema, - new FlinkFixupTypes(referenceSchema)).asStructType().fields()); + return new Schema( + TypeUtil.visit(schema, new FlinkFixupTypes(referenceSchema)).asStructType().fields()); } @Override diff --git a/flink/v1.15/flink/src/main/java/org/apache/iceberg/flink/FlinkSchemaUtil.java b/flink/v1.15/flink/src/main/java/org/apache/iceberg/flink/FlinkSchemaUtil.java index 0827b21786c1..97439b7bb0d6 100644 --- a/flink/v1.15/flink/src/main/java/org/apache/iceberg/flink/FlinkSchemaUtil.java +++ b/flink/v1.15/flink/src/main/java/org/apache/iceberg/flink/FlinkSchemaUtil.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.flink; import java.util.List; @@ -34,33 +33,33 @@ import org.apache.iceberg.types.Types; /** - * Converter between Flink types and Iceberg type. - * The conversion is not a 1:1 mapping that not allows back-and-forth conversion. So some information might get lost - * during the back-and-forth conversion. - *

- * This inconsistent types:
+ * Converter between Flink types and Iceberg type. The conversion is not a 1:1 mapping that not
+ * allows back-and-forth conversion. So some information might get lost during the back-and-forth
+ * conversion.
+ *
+ * <p>This inconsistent types:
+ *
  * <ul>
- *   <li>map Iceberg UUID type to Flink BinaryType(16)</li>
- *   <li>map Flink VarCharType(_) and CharType(_) to Iceberg String type</li>
- *   <li>map Flink VarBinaryType(_) to Iceberg Binary type</li>
- *   <li>map Flink TimeType(_) to Iceberg Time type (microseconds)</li>
- *   <li>map Flink TimestampType(_) to Iceberg Timestamp without zone type (microseconds)</li>
- *   <li>map Flink LocalZonedTimestampType(_) to Iceberg Timestamp with zone type (microseconds)</li>
- *   <li>map Flink MultiSetType to Iceberg Map type(element, int)</li>
+ *   <li>map Iceberg UUID type to Flink BinaryType(16)
+ *   <li>map Flink VarCharType(_) and CharType(_) to Iceberg String type
+ *   <li>map Flink VarBinaryType(_) to Iceberg Binary type
+ *   <li>map Flink TimeType(_) to Iceberg Time type (microseconds)
+ *   <li>map Flink TimestampType(_) to Iceberg Timestamp without zone type (microseconds)
+ *   <li>map Flink LocalZonedTimestampType(_) to Iceberg Timestamp with zone type (microseconds)
+ *   <li>map Flink MultiSetType to Iceberg Map type(element, int)
  * </ul>
+ *
  * <p>
    */ public class FlinkSchemaUtil { - private FlinkSchemaUtil() { - } + private FlinkSchemaUtil() {} - /** - * Convert the flink table schema to apache iceberg schema. - */ + /** Convert the flink table schema to apache iceberg schema. */ public static Schema convert(TableSchema schema) { LogicalType schemaType = schema.toRowDataType().getLogicalType(); - Preconditions.checkArgument(schemaType instanceof RowType, "Schema logical type should be RowType."); + Preconditions.checkArgument( + schemaType instanceof RowType, "Schema logical type should be RowType."); RowType root = (RowType) schemaType; Type converted = root.accept(new FlinkTypeToType(root)); @@ -75,8 +74,11 @@ private static Schema freshIdentifierFieldIds(Schema iSchema, TableSchema schema if (schema.getPrimaryKey().isPresent()) { for (String column : schema.getPrimaryKey().get().getColumns()) { Types.NestedField field = iSchema.findField(column); - Preconditions.checkNotNull(field, - "Cannot find field ID for the primary key column %s in schema %s", column, iSchema); + Preconditions.checkNotNull( + field, + "Cannot find field ID for the primary key column %s in schema %s", + column, + iSchema); identifierFieldIds.add(field.fieldId()); } } @@ -86,11 +88,11 @@ private static Schema freshIdentifierFieldIds(Schema iSchema, TableSchema schema /** * Convert a Flink {@link TableSchema} to a {@link Schema} based on the given schema. - *

- * This conversion does not assign new ids; it uses ids from the base schema.
- * <p>
- * Data types, field order, and nullability will match the Flink type. This conversion may return
- * a schema that is not compatible with base schema.
+ *
+ * <p>This conversion does not assign new ids; it uses ids from the base schema.
+ *
+ * <p>
    Data types, field order, and nullability will match the Flink type. This conversion may + * return a schema that is not compatible with base schema. * * @param baseSchema a Schema on which conversion is based * @param flinkSchema a Flink TableSchema @@ -163,7 +165,8 @@ public static TableSchema toSchema(Schema schema) { List columns = Lists.newArrayListWithExpectedSize(identifierFieldIds.size()); for (Integer identifierFieldId : identifierFieldIds) { String columnName = schema.findColumnName(identifierFieldId); - Preconditions.checkNotNull(columnName, "Cannot find field with id %s in schema %s", identifierFieldId, schema); + Preconditions.checkNotNull( + columnName, "Cannot find field with id %s in schema %s", identifierFieldId, schema); columns.add(columnName); } diff --git a/flink/v1.15/flink/src/main/java/org/apache/iceberg/flink/FlinkTypeToType.java b/flink/v1.15/flink/src/main/java/org/apache/iceberg/flink/FlinkTypeToType.java index 88276d86d3df..6f8bfef2ef44 100644 --- a/flink/v1.15/flink/src/main/java/org/apache/iceberg/flink/FlinkTypeToType.java +++ b/flink/v1.15/flink/src/main/java/org/apache/iceberg/flink/FlinkTypeToType.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.flink; import java.util.List; @@ -176,9 +175,10 @@ public Type visit(RowType rowType) { List newFields = Lists.newArrayListWithExpectedSize(rowType.getFieldCount()); boolean isRoot = root == rowType; - List types = rowType.getFields().stream() - .map(f -> f.getType().accept(this)) - .collect(Collectors.toList()); + List types = + rowType.getFields().stream() + .map(f -> f.getType().accept(this)) + .collect(Collectors.toList()); for (int i = 0; i < rowType.getFieldCount(); i++) { int id = isRoot ? i : getNextId(); diff --git a/flink/v1.15/flink/src/main/java/org/apache/iceberg/flink/FlinkTypeVisitor.java b/flink/v1.15/flink/src/main/java/org/apache/iceberg/flink/FlinkTypeVisitor.java index 9d1a3c492cd7..f3de2416088c 100644 --- a/flink/v1.15/flink/src/main/java/org/apache/iceberg/flink/FlinkTypeVisitor.java +++ b/flink/v1.15/flink/src/main/java/org/apache/iceberg/flink/FlinkTypeVisitor.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.flink; import org.apache.flink.table.types.logical.DayTimeIntervalType; diff --git a/flink/v1.15/flink/src/main/java/org/apache/iceberg/flink/FlinkWriteConf.java b/flink/v1.15/flink/src/main/java/org/apache/iceberg/flink/FlinkWriteConf.java index ddb5f18c52fe..c1af5e49e5f4 100644 --- a/flink/v1.15/flink/src/main/java/org/apache/iceberg/flink/FlinkWriteConf.java +++ b/flink/v1.15/flink/src/main/java/org/apache/iceberg/flink/FlinkWriteConf.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.flink; import java.util.Locale; @@ -29,29 +28,34 @@ /** * A class for common Iceberg configs for Flink writes. - *

    - * If a config is set at multiple levels, the following order of precedence is used (top to bottom): + * + *

    If a config is set at multiple levels, the following order of precedence is used (top to + * bottom): + * *

      - *
    1. Write options
    2. - *
    3. flink ReadableConfig
    4. - *
    5. Table metadata
    6. + *
    7. Write options + *
    8. flink ReadableConfig + *
    9. Table metadata *
    - * The most specific value is set in write options and takes precedence over all other configs. - * If no write option is provided, this class checks the flink configuration for any overrides. - * If no applicable value is found in the write options, this class uses the table metadata. - *

    - * Note this class is NOT meant to be serialized. + * + * The most specific value is set in write options and takes precedence over all other configs. If + * no write option is provided, this class checks the flink configuration for any overrides. If no + * applicable value is found in the write options, this class uses the table metadata. + * + *

    Note this class is NOT meant to be serialized. */ public class FlinkWriteConf { private final FlinkConfParser confParser; - public FlinkWriteConf(Table table, Map writeOptions, ReadableConfig readableConfig) { + public FlinkWriteConf( + Table table, Map writeOptions, ReadableConfig readableConfig) { this.confParser = new FlinkConfParser(table, writeOptions, readableConfig); } public boolean overwriteMode() { - return confParser.booleanConf() + return confParser + .booleanConf() .option(FlinkWriteOptions.OVERWRITE_MODE.key()) .flinkConfig(FlinkWriteOptions.OVERWRITE_MODE) .defaultValue(FlinkWriteOptions.OVERWRITE_MODE.defaultValue()) @@ -59,7 +63,8 @@ public boolean overwriteMode() { } public boolean upsertMode() { - return confParser.booleanConf() + return confParser + .booleanConf() .option(FlinkWriteOptions.WRITE_UPSERT_ENABLED.key()) .flinkConfig(FlinkWriteOptions.WRITE_UPSERT_ENABLED) .tableProperty(TableProperties.UPSERT_ENABLED) @@ -68,17 +73,20 @@ public boolean upsertMode() { } public FileFormat dataFileFormat() { - String valueAsString = confParser.stringConf() - .option(FlinkWriteOptions.WRITE_FORMAT.key()) - .flinkConfig(FlinkWriteOptions.WRITE_FORMAT) - .tableProperty(TableProperties.DEFAULT_FILE_FORMAT) - .defaultValue(TableProperties.DEFAULT_FILE_FORMAT_DEFAULT) - .parse(); + String valueAsString = + confParser + .stringConf() + .option(FlinkWriteOptions.WRITE_FORMAT.key()) + .flinkConfig(FlinkWriteOptions.WRITE_FORMAT) + .tableProperty(TableProperties.DEFAULT_FILE_FORMAT) + .defaultValue(TableProperties.DEFAULT_FILE_FORMAT_DEFAULT) + .parse(); return FileFormat.valueOf(valueAsString.toUpperCase(Locale.ENGLISH)); } public long targetDataFileSize() { - return confParser.longConf() + return confParser + .longConf() .option(FlinkWriteOptions.TARGET_FILE_SIZE_BYTES.key()) .flinkConfig(FlinkWriteOptions.TARGET_FILE_SIZE_BYTES) .tableProperty(TableProperties.WRITE_TARGET_FILE_SIZE_BYTES) @@ -87,17 +95,20 @@ public long targetDataFileSize() { } public DistributionMode distributionMode() { - String modeName = confParser.stringConf() - .option(FlinkWriteOptions.DISTRIBUTION_MODE.key()) - .flinkConfig(FlinkWriteOptions.DISTRIBUTION_MODE) - .tableProperty(TableProperties.WRITE_DISTRIBUTION_MODE) - .defaultValue(TableProperties.WRITE_DISTRIBUTION_MODE_NONE) - .parse(); + String modeName = + confParser + .stringConf() + .option(FlinkWriteOptions.DISTRIBUTION_MODE.key()) + .flinkConfig(FlinkWriteOptions.DISTRIBUTION_MODE) + .tableProperty(TableProperties.WRITE_DISTRIBUTION_MODE) + .defaultValue(TableProperties.WRITE_DISTRIBUTION_MODE_NONE) + .parse(); return DistributionMode.fromName(modeName); } public int workerPoolSize() { - return confParser.intConf() + return confParser + .intConf() .flinkConfig(FlinkConfigOptions.TABLE_EXEC_ICEBERG_WORKER_POOL_SIZE) .defaultValue(FlinkConfigOptions.TABLE_EXEC_ICEBERG_WORKER_POOL_SIZE.defaultValue()) .parse(); diff --git a/flink/v1.15/flink/src/main/java/org/apache/iceberg/flink/FlinkWriteOptions.java b/flink/v1.15/flink/src/main/java/org/apache/iceberg/flink/FlinkWriteOptions.java index d0dc9c7fdeb1..a3091d5779c7 100644 --- a/flink/v1.15/flink/src/main/java/org/apache/iceberg/flink/FlinkWriteOptions.java +++ b/flink/v1.15/flink/src/main/java/org/apache/iceberg/flink/FlinkWriteOptions.java @@ -16,42 +16,32 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.flink; import org.apache.flink.configuration.ConfigOption; import org.apache.flink.configuration.ConfigOptions; -/** - * Flink sink write options - */ +/** Flink sink write options */ public class FlinkWriteOptions { - private FlinkWriteOptions() { - } + private FlinkWriteOptions() {} // File format for write operations(default: Table write.format.default ) public static final ConfigOption WRITE_FORMAT = - ConfigOptions.key("write-format") - .stringType().noDefaultValue(); + ConfigOptions.key("write-format").stringType().noDefaultValue(); // Overrides this table's write.target-file-size-bytes public static final ConfigOption TARGET_FILE_SIZE_BYTES = - ConfigOptions.key("target-file-size-bytes") - .longType().noDefaultValue(); + ConfigOptions.key("target-file-size-bytes").longType().noDefaultValue(); // Overrides this table's write.upsert.enabled public static final ConfigOption WRITE_UPSERT_ENABLED = - ConfigOptions.key("upsert-enabled") - .booleanType().noDefaultValue(); + ConfigOptions.key("upsert-enabled").booleanType().noDefaultValue(); public static final ConfigOption OVERWRITE_MODE = - ConfigOptions.key("overwrite-enabled") - .booleanType().defaultValue(false); + ConfigOptions.key("overwrite-enabled").booleanType().defaultValue(false); // Overrides the table's write.distribution-mode public static final ConfigOption DISTRIBUTION_MODE = - ConfigOptions.key("distribution-mode") - .stringType().noDefaultValue(); - + ConfigOptions.key("distribution-mode").stringType().noDefaultValue(); } diff --git a/flink/v1.15/flink/src/main/java/org/apache/iceberg/flink/IcebergTableSink.java b/flink/v1.15/flink/src/main/java/org/apache/iceberg/flink/IcebergTableSink.java index 855e9e73cba1..b9c4119612dd 100644 --- a/flink/v1.15/flink/src/main/java/org/apache/iceberg/flink/IcebergTableSink.java +++ b/flink/v1.15/flink/src/main/java/org/apache/iceberg/flink/IcebergTableSink.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.flink; import java.util.List; @@ -48,7 +47,8 @@ private IcebergTableSink(IcebergTableSink toCopy) { this.readableConfig = toCopy.readableConfig; } - public IcebergTableSink(TableLoader tableLoader, TableSchema tableSchema, ReadableConfig readableConfig) { + public IcebergTableSink( + TableLoader tableLoader, TableSchema tableSchema, ReadableConfig readableConfig) { this.tableLoader = tableLoader; this.tableSchema = tableSchema; this.readableConfig = readableConfig; @@ -56,25 +56,28 @@ public IcebergTableSink(TableLoader tableLoader, TableSchema tableSchema, Readab @Override public SinkRuntimeProvider getSinkRuntimeProvider(Context context) { - Preconditions.checkState(!overwrite || context.isBounded(), + Preconditions.checkState( + !overwrite || context.isBounded(), "Unbounded data stream doesn't support overwrite operation."); - List equalityColumns = tableSchema.getPrimaryKey() - .map(UniqueConstraint::getColumns) - .orElseGet(ImmutableList::of); + List equalityColumns = + tableSchema.getPrimaryKey().map(UniqueConstraint::getColumns).orElseGet(ImmutableList::of); - return (DataStreamSinkProvider) (providerContext, dataStream) -> FlinkSink.forRowData(dataStream) - .tableLoader(tableLoader) - .tableSchema(tableSchema) - .equalityFieldColumns(equalityColumns) - .overwrite(overwrite) - .flinkConf(readableConfig) - .append(); + return (DataStreamSinkProvider) + (providerContext, dataStream) -> + FlinkSink.forRowData(dataStream) + .tableLoader(tableLoader) + .tableSchema(tableSchema) + .equalityFieldColumns(equalityColumns) + .overwrite(overwrite) + .flinkConf(readableConfig) + .append(); } @Override public void applyStaticPartition(Map partition) { - // The flink's PartitionFanoutWriter will handle the static partition write policy automatically. + // The flink's PartitionFanoutWriter will handle the static partition write policy + // automatically. } @Override diff --git a/flink/v1.15/flink/src/main/java/org/apache/iceberg/flink/IcebergTableSource.java b/flink/v1.15/flink/src/main/java/org/apache/iceberg/flink/IcebergTableSource.java index a0bc04c03d79..615620491038 100644 --- a/flink/v1.15/flink/src/main/java/org/apache/iceberg/flink/IcebergTableSource.java +++ b/flink/v1.15/flink/src/main/java/org/apache/iceberg/flink/IcebergTableSource.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.flink; import java.util.Arrays; @@ -44,11 +43,12 @@ import org.apache.iceberg.relocated.com.google.common.collect.ImmutableList; import org.apache.iceberg.relocated.com.google.common.collect.Lists; -/** - * Flink Iceberg table source. - */ +/** Flink Iceberg table source. 
*/ public class IcebergTableSource - implements ScanTableSource, SupportsProjectionPushDown, SupportsFilterPushDown, SupportsLimitPushDown { + implements ScanTableSource, + SupportsProjectionPushDown, + SupportsFilterPushDown, + SupportsLimitPushDown { private int[] projectedFields; private long limit; @@ -71,14 +71,23 @@ private IcebergTableSource(IcebergTableSource toCopy) { this.readableConfig = toCopy.readableConfig; } - public IcebergTableSource(TableLoader loader, TableSchema schema, Map properties, - ReadableConfig readableConfig) { + public IcebergTableSource( + TableLoader loader, + TableSchema schema, + Map properties, + ReadableConfig readableConfig) { this(loader, schema, properties, null, false, -1, ImmutableList.of(), readableConfig); } - private IcebergTableSource(TableLoader loader, TableSchema schema, Map properties, - int[] projectedFields, boolean isLimitPushDown, - long limit, List filters, ReadableConfig readableConfig) { + private IcebergTableSource( + TableLoader loader, + TableSchema schema, + Map properties, + int[] projectedFields, + boolean isLimitPushDown, + long limit, + List filters, + ReadableConfig readableConfig) { this.loader = loader; this.schema = schema; this.properties = properties; @@ -93,8 +102,8 @@ private IcebergTableSource(TableLoader loader, TableSchema schema, Map fullNames[i]).toArray(String[]::new), - Arrays.stream(projectedFields).mapToObj(i -> fullTypes[i]).toArray(DataType[]::new)).build(); + return TableSchema.builder() + .fields( + Arrays.stream(projectedFields).mapToObj(i -> fullNames[i]).toArray(String[]::new), + Arrays.stream(projectedFields).mapToObj(i -> fullTypes[i]).toArray(DataType[]::new)) + .build(); } } diff --git a/flink/v1.15/flink/src/main/java/org/apache/iceberg/flink/RowDataWrapper.java b/flink/v1.15/flink/src/main/java/org/apache/iceberg/flink/RowDataWrapper.java index 401e9db65992..d4cec7a3e80b 100644 --- a/flink/v1.15/flink/src/main/java/org/apache/iceberg/flink/RowDataWrapper.java +++ b/flink/v1.15/flink/src/main/java/org/apache/iceberg/flink/RowDataWrapper.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.flink; import java.lang.reflect.Array; @@ -77,7 +76,8 @@ public T get(int pos, Class javaClass) { @Override public void set(int pos, T value) { - throw new UnsupportedOperationException("Could not set a field in the RowDataWrapper because rowData is read-only"); + throw new UnsupportedOperationException( + "Could not set a field in the RowDataWrapper because rowData is read-only"); } private interface PositionalGetter { @@ -104,16 +104,19 @@ private static PositionalGetter buildGetter(LogicalType logicalType, Type typ case DECIMAL: DecimalType decimalType = (DecimalType) logicalType; - return (row, pos) -> row.getDecimal(pos, decimalType.getPrecision(), decimalType.getScale()).toBigDecimal(); + return (row, pos) -> + row.getDecimal(pos, decimalType.getPrecision(), decimalType.getScale()).toBigDecimal(); case TIME_WITHOUT_TIME_ZONE: - // Time in RowData is in milliseconds (Integer), while iceberg's time is microseconds (Long). + // Time in RowData is in milliseconds (Integer), while iceberg's time is microseconds + // (Long). 
return (row, pos) -> ((long) row.getInt(pos)) * 1_000; case TIMESTAMP_WITHOUT_TIME_ZONE: TimestampType timestampType = (TimestampType) logicalType; return (row, pos) -> { - LocalDateTime localDateTime = row.getTimestamp(pos, timestampType.getPrecision()).toLocalDateTime(); + LocalDateTime localDateTime = + row.getTimestamp(pos, timestampType.getPrecision()).toLocalDateTime(); return DateTimeUtil.microsFromTimestamp(localDateTime); }; @@ -121,7 +124,8 @@ private static PositionalGetter buildGetter(LogicalType logicalType, Type typ LocalZonedTimestampType lzTs = (LocalZonedTimestampType) logicalType; return (row, pos) -> { TimestampData timestampData = row.getTimestamp(pos, lzTs.getPrecision()); - return timestampData.getMillisecond() * 1000 + timestampData.getNanoOfMillisecond() / 1000; + return timestampData.getMillisecond() * 1000 + + timestampData.getNanoOfMillisecond() / 1000; }; case ROW: diff --git a/flink/v1.15/flink/src/main/java/org/apache/iceberg/flink/TableLoader.java b/flink/v1.15/flink/src/main/java/org/apache/iceberg/flink/TableLoader.java index ebcb1fb0b7b4..e128badb8461 100644 --- a/flink/v1.15/flink/src/main/java/org/apache/iceberg/flink/TableLoader.java +++ b/flink/v1.15/flink/src/main/java/org/apache/iceberg/flink/TableLoader.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.flink; import java.io.Closeable; @@ -31,9 +30,9 @@ import org.apache.iceberg.relocated.com.google.common.base.MoreObjects; /** - * Serializable loader to load an Iceberg {@link Table}. - * Flink needs to get {@link Table} objects in the cluster (for example, to get splits), not just on the client side. - * So we need an Iceberg table loader to get the {@link Table} object. + * Serializable loader to load an Iceberg {@link Table}. Flink needs to get {@link Table} objects in + * the cluster (for example, to get splits), not just on the client side. So we need an Iceberg + * table loader to get the {@link Table} object. */ public interface TableLoader extends Closeable, Serializable { @@ -78,14 +77,11 @@ public Table loadTable() { } @Override - public void close() { - } + public void close() {} @Override public String toString() { - return MoreObjects.toStringHelper(this) - .add("location", location) - .toString(); + return MoreObjects.toStringHelper(this).add("location", location).toString(); } } diff --git a/flink/v1.15/flink/src/main/java/org/apache/iceberg/flink/TypeToFlinkType.java b/flink/v1.15/flink/src/main/java/org/apache/iceberg/flink/TypeToFlinkType.java index cf594b364f5f..f8f1b74b1ceb 100644 --- a/flink/v1.15/flink/src/main/java/org/apache/iceberg/flink/TypeToFlinkType.java +++ b/flink/v1.15/flink/src/main/java/org/apache/iceberg/flink/TypeToFlinkType.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.flink; import java.util.List; @@ -44,8 +43,7 @@ import org.apache.iceberg.types.Types; class TypeToFlinkType extends TypeUtil.SchemaVisitor { - TypeToFlinkType() { - } + TypeToFlinkType() {} @Override public LogicalType schema(Schema schema, LogicalType structType) { @@ -60,8 +58,8 @@ public LogicalType struct(Types.StructType struct, List fieldResult for (int i = 0; i < fields.size(); i += 1) { Types.NestedField field = fields.get(i); LogicalType type = fieldResults.get(i); - RowType.RowField flinkField = new RowType.RowField( - field.name(), type.copy(field.isOptional()), field.doc()); + RowType.RowField flinkField = + new RowType.RowField(field.name(), type.copy(field.isOptional()), field.doc()); flinkFields.add(flinkField); } @@ -100,9 +98,11 @@ public LogicalType primitive(Type.PrimitiveType primitive) { case DATE: return new DateType(); case TIME: - // For the type: Flink only support TimeType with default precision (second) now. The precision of time is + // For the type: Flink only support TimeType with default precision (second) now. The + // precision of time is // not supported in Flink, so we can think of it as a simple time type directly. - // For the data: Flink uses int that support mills to represent time data, so it supports mills precision. + // For the data: Flink uses int that support mills to represent time data, so it supports + // mills precision. return new TimeType(); case TIMESTAMP: Types.TimestampType timestamp = (Types.TimestampType) primitive; diff --git a/flink/v1.15/flink/src/main/java/org/apache/iceberg/flink/actions/Actions.java b/flink/v1.15/flink/src/main/java/org/apache/iceberg/flink/actions/Actions.java index 98702ceb57f1..06ac54617ae6 100644 --- a/flink/v1.15/flink/src/main/java/org/apache/iceberg/flink/actions/Actions.java +++ b/flink/v1.15/flink/src/main/java/org/apache/iceberg/flink/actions/Actions.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.flink.actions; import org.apache.flink.configuration.Configuration; @@ -26,9 +25,10 @@ public class Actions { - public static final Configuration CONFIG = new Configuration() - // disable classloader check as Avro may cache class/object in the serializers. - .set(CoreOptions.CHECK_LEAKED_CLASSLOADER, false); + public static final Configuration CONFIG = + new Configuration() + // disable classloader check as Avro may cache class/object in the serializers. + .set(CoreOptions.CHECK_LEAKED_CLASSLOADER, false); private StreamExecutionEnvironment env; private Table table; @@ -49,5 +49,4 @@ public static Actions forTable(Table table) { public RewriteDataFilesAction rewriteDataFiles() { return new RewriteDataFilesAction(env, table); } - } diff --git a/flink/v1.15/flink/src/main/java/org/apache/iceberg/flink/actions/RewriteDataFilesAction.java b/flink/v1.15/flink/src/main/java/org/apache/iceberg/flink/actions/RewriteDataFilesAction.java index cbd4aed73c8a..9876bb3861c4 100644 --- a/flink/v1.15/flink/src/main/java/org/apache/iceberg/flink/actions/RewriteDataFilesAction.java +++ b/flink/v1.15/flink/src/main/java/org/apache/iceberg/flink/actions/RewriteDataFilesAction.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.flink.actions; import java.util.List; @@ -51,7 +50,8 @@ protected List rewriteDataForTasks(List combinedScan int size = combinedScanTasks.size(); int parallelism = Math.min(size, maxParallelism); DataStream dataStream = env.fromCollection(combinedScanTasks); - RowDataRewriter rowDataRewriter = new RowDataRewriter(table(), caseSensitive(), fileIO(), encryptionManager()); + RowDataRewriter rowDataRewriter = + new RowDataRewriter(table(), caseSensitive(), fileIO(), encryptionManager()); try { return rowDataRewriter.rewriteDataForTasks(dataStream, parallelism); } catch (Exception e) { diff --git a/flink/v1.15/flink/src/main/java/org/apache/iceberg/flink/data/AvroWithFlinkSchemaVisitor.java b/flink/v1.15/flink/src/main/java/org/apache/iceberg/flink/data/AvroWithFlinkSchemaVisitor.java index 1ccc3b787e33..8103224a0b6c 100644 --- a/flink/v1.15/flink/src/main/java/org/apache/iceberg/flink/data/AvroWithFlinkSchemaVisitor.java +++ b/flink/v1.15/flink/src/main/java/org/apache/iceberg/flink/data/AvroWithFlinkSchemaVisitor.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.flink.data; import org.apache.flink.table.types.logical.ArrayType; @@ -29,7 +28,8 @@ import org.apache.iceberg.relocated.com.google.common.base.Preconditions; import org.apache.iceberg.util.Pair; -public abstract class AvroWithFlinkSchemaVisitor extends AvroWithPartnerByStructureVisitor { +public abstract class AvroWithFlinkSchemaVisitor + extends AvroWithPartnerByStructureVisitor { @Override protected boolean isStringType(LogicalType logicalType) { @@ -43,7 +43,8 @@ protected boolean isMapType(LogicalType logicalType) { @Override protected LogicalType arrayElementType(LogicalType arrayType) { - Preconditions.checkArgument(arrayType instanceof ArrayType, "Invalid array: %s is not an array", arrayType); + Preconditions.checkArgument( + arrayType instanceof ArrayType, "Invalid array: %s is not an array", arrayType); return ((ArrayType) arrayType).getElementType(); } @@ -61,7 +62,8 @@ protected LogicalType mapValueType(LogicalType mapType) { @Override protected Pair fieldNameAndType(LogicalType structType, int pos) { - Preconditions.checkArgument(structType instanceof RowType, "Invalid struct: %s is not a struct", structType); + Preconditions.checkArgument( + structType instanceof RowType, "Invalid struct: %s is not a struct", structType); RowType.RowField field = ((RowType) structType).getFields().get(pos); return Pair.of(field.getName(), field.getType()); } diff --git a/flink/v1.15/flink/src/main/java/org/apache/iceberg/flink/data/FlinkAvroReader.java b/flink/v1.15/flink/src/main/java/org/apache/iceberg/flink/data/FlinkAvroReader.java index 991ef6336297..86404959735a 100644 --- a/flink/v1.15/flink/src/main/java/org/apache/iceberg/flink/data/FlinkAvroReader.java +++ b/flink/v1.15/flink/src/main/java/org/apache/iceberg/flink/data/FlinkAvroReader.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.flink.data; import java.io.IOException; @@ -49,10 +48,12 @@ public FlinkAvroReader(org.apache.iceberg.Schema expectedSchema, Schema readSche } @SuppressWarnings("unchecked") - public FlinkAvroReader(org.apache.iceberg.Schema expectedSchema, Schema readSchema, Map constants) { + public FlinkAvroReader( + org.apache.iceberg.Schema expectedSchema, Schema readSchema, Map constants) { this.readSchema = readSchema; - this.reader = (ValueReader) AvroSchemaWithTypeVisitor - .visit(expectedSchema, readSchema, new ReadBuilder(constants)); + this.reader = + (ValueReader) + AvroSchemaWithTypeVisitor.visit(expectedSchema, readSchema, new ReadBuilder(constants)); } @Override @@ -80,8 +81,8 @@ private ReadBuilder(Map idToConstant) { } @Override - public ValueReader record(Types.StructType expected, Schema record, List names, - List> fields) { + public ValueReader record( + Types.StructType expected, Schema record, List names, List> fields) { return FlinkValueReaders.struct(fields, expected.asStructType(), idToConstant); } @@ -91,13 +92,14 @@ public ValueReader union(Type expected, Schema union, List> op } @Override - public ValueReader array(Types.ListType expected, Schema array, ValueReader elementReader) { + public ValueReader array( + Types.ListType expected, Schema array, ValueReader elementReader) { return FlinkValueReaders.array(elementReader); } @Override - public ValueReader map(Types.MapType expected, Schema map, - ValueReader keyReader, ValueReader valueReader) { + public ValueReader map( + Types.MapType expected, Schema map, ValueReader keyReader, ValueReader valueReader) { return FlinkValueReaders.arrayMap(keyReader, valueReader); } diff --git a/flink/v1.15/flink/src/main/java/org/apache/iceberg/flink/data/FlinkAvroWriter.java b/flink/v1.15/flink/src/main/java/org/apache/iceberg/flink/data/FlinkAvroWriter.java index b069a35d3bbb..873e65783119 100644 --- a/flink/v1.15/flink/src/main/java/org/apache/iceberg/flink/data/FlinkAvroWriter.java +++ b/flink/v1.15/flink/src/main/java/org/apache/iceberg/flink/data/FlinkAvroWriter.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.flink.data; import java.io.IOException; @@ -47,8 +46,9 @@ public FlinkAvroWriter(RowType rowType) { @Override @SuppressWarnings("unchecked") public void setSchema(Schema schema) { - this.writer = (ValueWriter) AvroWithFlinkSchemaVisitor - .visit(rowType, schema, new WriteBuilder()); + this.writer = + (ValueWriter) + AvroWithFlinkSchemaVisitor.visit(rowType, schema, new WriteBuilder()); } @Override @@ -63,17 +63,23 @@ public Stream metrics() { private static class WriteBuilder extends AvroWithFlinkSchemaVisitor> { @Override - public ValueWriter record(LogicalType struct, Schema record, List names, List> fields) { - return FlinkValueWriters.row(fields, IntStream.range(0, names.size()) - .mapToObj(i -> fieldNameAndType(struct, i).second()).collect(Collectors.toList())); + public ValueWriter record( + LogicalType struct, Schema record, List names, List> fields) { + return FlinkValueWriters.row( + fields, + IntStream.range(0, names.size()) + .mapToObj(i -> fieldNameAndType(struct, i).second()) + .collect(Collectors.toList())); } @Override public ValueWriter union(LogicalType type, Schema union, List> options) { - Preconditions.checkArgument(options.contains(ValueWriters.nulls()), - "Cannot create writer for non-option union: %s", union); - Preconditions.checkArgument(options.size() == 2, - "Cannot create writer for non-option union: %s", union); + Preconditions.checkArgument( + options.contains(ValueWriters.nulls()), + "Cannot create writer for non-option union: %s", + union); + Preconditions.checkArgument( + options.size() == 2, "Cannot create writer for non-option union: %s", union); if (union.getTypes().get(0).getType() == Schema.Type.NULL) { return ValueWriters.option(0, options.get(1)); } else { @@ -88,12 +94,15 @@ public ValueWriter array(LogicalType sArray, Schema array, ValueWriter ele @Override public ValueWriter map(LogicalType sMap, Schema map, ValueWriter valueReader) { - return FlinkValueWriters.map(FlinkValueWriters.strings(), mapKeyType(sMap), valueReader, mapValueType(sMap)); + return FlinkValueWriters.map( + FlinkValueWriters.strings(), mapKeyType(sMap), valueReader, mapValueType(sMap)); } @Override - public ValueWriter map(LogicalType sMap, Schema map, ValueWriter keyWriter, ValueWriter valueWriter) { - return FlinkValueWriters.arrayMap(keyWriter, mapKeyType(sMap), valueWriter, mapValueType(sMap)); + public ValueWriter map( + LogicalType sMap, Schema map, ValueWriter keyWriter, ValueWriter valueWriter) { + return FlinkValueWriters.arrayMap( + keyWriter, mapKeyType(sMap), valueWriter, mapValueType(sMap)); } @Override diff --git a/flink/v1.15/flink/src/main/java/org/apache/iceberg/flink/data/FlinkOrcReader.java b/flink/v1.15/flink/src/main/java/org/apache/iceberg/flink/data/FlinkOrcReader.java index 4c4e2050263b..65b9d44ad4b8 100644 --- a/flink/v1.15/flink/src/main/java/org/apache/iceberg/flink/data/FlinkOrcReader.java +++ b/flink/v1.15/flink/src/main/java/org/apache/iceberg/flink/data/FlinkOrcReader.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.flink.data; import java.util.List; @@ -44,7 +43,8 @@ public FlinkOrcReader(Schema iSchema, TypeDescription readSchema) { } public FlinkOrcReader(Schema iSchema, TypeDescription readSchema, Map idToConstant) { - this.reader = OrcSchemaWithTypeVisitor.visit(iSchema, readSchema, new ReadBuilder(idToConstant)); + this.reader = + OrcSchemaWithTypeVisitor.visit(iSchema, readSchema, new ReadBuilder(idToConstant)); } @Override @@ -65,21 +65,26 @@ private ReadBuilder(Map idToConstant) { } @Override - public OrcValueReader record(Types.StructType iStruct, TypeDescription record, List names, - List> fields) { + public OrcValueReader record( + Types.StructType iStruct, + TypeDescription record, + List names, + List> fields) { return FlinkOrcReaders.struct(fields, iStruct, idToConstant); } @Override - public OrcValueReader list(Types.ListType iList, TypeDescription array, - OrcValueReader elementReader) { + public OrcValueReader list( + Types.ListType iList, TypeDescription array, OrcValueReader elementReader) { return FlinkOrcReaders.array(elementReader); } @Override - public OrcValueReader map(Types.MapType iMap, TypeDescription map, - OrcValueReader keyReader, - OrcValueReader valueReader) { + public OrcValueReader map( + Types.MapType iMap, + TypeDescription map, + OrcValueReader keyReader, + OrcValueReader valueReader) { return FlinkOrcReaders.map(keyReader, valueReader); } @@ -117,8 +122,9 @@ public OrcValueReader primitive(Type.PrimitiveType iPrimitive, TypeDescriptio Types.DecimalType decimalType = (Types.DecimalType) iPrimitive; return FlinkOrcReaders.decimals(decimalType.precision(), decimalType.scale()); default: - throw new IllegalArgumentException(String.format("Invalid iceberg type %s corresponding to ORC type %s", - iPrimitive, primitive)); + throw new IllegalArgumentException( + String.format( + "Invalid iceberg type %s corresponding to ORC type %s", iPrimitive, primitive)); } } } diff --git a/flink/v1.15/flink/src/main/java/org/apache/iceberg/flink/data/FlinkOrcReaders.java b/flink/v1.15/flink/src/main/java/org/apache/iceberg/flink/data/FlinkOrcReaders.java index 744a05eb2d21..7a4a15c7e600 100644 --- a/flink/v1.15/flink/src/main/java/org/apache/iceberg/flink/data/FlinkOrcReaders.java +++ b/flink/v1.15/flink/src/main/java/org/apache/iceberg/flink/data/FlinkOrcReaders.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.flink.data; import java.math.BigDecimal; @@ -50,8 +49,7 @@ import org.apache.orc.storage.serde2.io.HiveDecimalWritable; class FlinkOrcReaders { - private FlinkOrcReaders() { - } + private FlinkOrcReaders() {} static OrcValueReader strings() { return StringReader.INSTANCE; @@ -87,13 +85,13 @@ static OrcValueReader array(OrcValueReader elementReader) { return new ArrayReader<>(elementReader); } - public static OrcValueReader map(OrcValueReader keyReader, OrcValueReader valueReader) { + public static OrcValueReader map( + OrcValueReader keyReader, OrcValueReader valueReader) { return new MapReader<>(keyReader, valueReader); } - public static OrcValueReader struct(List> readers, - Types.StructType struct, - Map idToConstant) { + public static OrcValueReader struct( + List> readers, Types.StructType struct, Map idToConstant) { return new StructReader(readers, struct, idToConstant); } @@ -103,7 +101,8 @@ private static class StringReader implements OrcValueReader { @Override public StringData nonNullRead(ColumnVector vector, int row) { BytesColumnVector bytesVector = (BytesColumnVector) vector; - return StringData.fromBytes(bytesVector.vector[row], bytesVector.start[row], bytesVector.length[row]); + return StringData.fromBytes( + bytesVector.vector[row], bytesVector.start[row], bytesVector.length[row]); } } @@ -130,8 +129,12 @@ public DecimalData nonNullRead(ColumnVector vector, int row) { HiveDecimalWritable value = ((DecimalColumnVector) vector).vector[row]; // The hive ORC writer may will adjust the scale of decimal data. - Preconditions.checkArgument(value.precision() <= precision, - "Cannot read value as decimal(%s,%s), too large: %s", precision, scale, value); + Preconditions.checkArgument( + value.precision() <= precision, + "Cannot read value as decimal(%s,%s), too large: %s", + precision, + scale, + value); return DecimalData.fromUnscaledLong(value.serialize64(scale), precision, scale); } @@ -148,10 +151,15 @@ private static class Decimal38Reader implements OrcValueReader { @Override public DecimalData nonNullRead(ColumnVector vector, int row) { - BigDecimal value = ((DecimalColumnVector) vector).vector[row].getHiveDecimal().bigDecimalValue(); + BigDecimal value = + ((DecimalColumnVector) vector).vector[row].getHiveDecimal().bigDecimalValue(); - Preconditions.checkArgument(value.precision() <= precision, - "Cannot read value as decimal(%s,%s), too large: %s", precision, scale, value); + Preconditions.checkArgument( + value.precision() <= precision, + "Cannot read value as decimal(%s,%s), too large: %s", + precision, + scale, + value); return DecimalData.fromBigDecimal(value, precision, scale); } @@ -174,9 +182,10 @@ private static class TimestampReader implements OrcValueReader { @Override public TimestampData nonNullRead(ColumnVector vector, int row) { TimestampColumnVector tcv = (TimestampColumnVector) vector; - LocalDateTime localDate = Instant.ofEpochSecond(Math.floorDiv(tcv.time[row], 1_000), tcv.nanos[row]) - .atOffset(ZoneOffset.UTC) - .toLocalDateTime(); + LocalDateTime localDate = + Instant.ofEpochSecond(Math.floorDiv(tcv.time[row], 1_000), tcv.nanos[row]) + .atOffset(ZoneOffset.UTC) + .toLocalDateTime(); return TimestampData.fromLocalDateTime(localDate); } } @@ -187,9 +196,10 @@ private static class TimestampTzReader implements OrcValueReader @Override public TimestampData nonNullRead(ColumnVector vector, int row) { TimestampColumnVector tcv = (TimestampColumnVector) vector; - Instant instant = Instant.ofEpochSecond(Math.floorDiv(tcv.time[row], 
1_000), tcv.nanos[row]) - .atOffset(ZoneOffset.UTC) - .toInstant(); + Instant instant = + Instant.ofEpochSecond(Math.floorDiv(tcv.time[row], 1_000), tcv.nanos[row]) + .atOffset(ZoneOffset.UTC) + .toInstant(); return TimestampData.fromInstant(instant); } } @@ -254,7 +264,8 @@ public void setBatchContext(long batchOffsetInFile) { private static class StructReader extends OrcValueReaders.StructReader { private final int numFields; - StructReader(List> readers, Types.StructType struct, Map idToConstant) { + StructReader( + List> readers, Types.StructType struct, Map idToConstant) { super(readers, struct, idToConstant); this.numFields = struct.fields().size(); } diff --git a/flink/v1.15/flink/src/main/java/org/apache/iceberg/flink/data/FlinkOrcWriter.java b/flink/v1.15/flink/src/main/java/org/apache/iceberg/flink/data/FlinkOrcWriter.java index 2eeb268095f5..6a31accffd22 100644 --- a/flink/v1.15/flink/src/main/java/org/apache/iceberg/flink/data/FlinkOrcWriter.java +++ b/flink/v1.15/flink/src/main/java/org/apache/iceberg/flink/data/FlinkOrcWriter.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.flink.data; import java.util.Deque; @@ -40,7 +39,9 @@ public class FlinkOrcWriter implements OrcRowWriter { private final FlinkOrcWriters.RowDataWriter writer; private FlinkOrcWriter(RowType rowType, Schema iSchema) { - this.writer = (FlinkOrcWriters.RowDataWriter) FlinkSchemaVisitor.visit(rowType, iSchema, new WriteBuilder()); + this.writer = + (FlinkOrcWriters.RowDataWriter) + FlinkSchemaVisitor.visit(rowType, iSchema, new WriteBuilder()); } public static OrcRowWriter buildWriter(RowType rowType, Schema iSchema) { @@ -66,8 +67,7 @@ public Stream> metrics() { private static class WriteBuilder extends FlinkSchemaVisitor> { private final Deque fieldIds = Lists.newLinkedList(); - private WriteBuilder() { - } + private WriteBuilder() {} @Override public void beforeField(Types.NestedField field) { @@ -80,20 +80,24 @@ public void afterField(Types.NestedField field) { } @Override - public OrcValueWriter record(Types.StructType iStruct, - List> results, - List fieldType) { + public OrcValueWriter record( + Types.StructType iStruct, List> results, List fieldType) { return FlinkOrcWriters.struct(results, fieldType); } @Override - public OrcValueWriter map(Types.MapType iMap, OrcValueWriter key, OrcValueWriter value, - LogicalType keyType, LogicalType valueType) { + public OrcValueWriter map( + Types.MapType iMap, + OrcValueWriter key, + OrcValueWriter value, + LogicalType keyType, + LogicalType valueType) { return FlinkOrcWriters.map(key, value, keyType, valueType); } @Override - public OrcValueWriter list(Types.ListType iList, OrcValueWriter element, LogicalType elementType) { + public OrcValueWriter list( + Types.ListType iList, OrcValueWriter element, LogicalType elementType) { return FlinkOrcWriters.list(element, elementType); } @@ -113,14 +117,20 @@ public OrcValueWriter primitive(Type.PrimitiveType iPrimitive, LogicalType fl case LONG: return GenericOrcWriters.longs(); case FLOAT: - Preconditions.checkArgument(fieldIds.peek() != null, - String.format("[BUG] Cannot find field id for primitive field with type %s. This is likely because id " + - "information is not properly pushed during schema visiting.", iPrimitive)); + Preconditions.checkArgument( + fieldIds.peek() != null, + String.format( + "[BUG] Cannot find field id for primitive field with type %s. 
This is likely because id " + + "information is not properly pushed during schema visiting.", + iPrimitive)); return GenericOrcWriters.floats(fieldIds.peek()); case DOUBLE: - Preconditions.checkArgument(fieldIds.peek() != null, - String.format("[BUG] Cannot find field id for primitive field with type %s. This is likely because id " + - "information is not properly pushed during schema visiting.", iPrimitive)); + Preconditions.checkArgument( + fieldIds.peek() != null, + String.format( + "[BUG] Cannot find field id for primitive field with type %s. This is likely because id " + + "information is not properly pushed during schema visiting.", + iPrimitive)); return GenericOrcWriters.doubles(fieldIds.peek()); case DATE: return FlinkOrcWriters.dates(); @@ -143,8 +153,10 @@ public OrcValueWriter primitive(Type.PrimitiveType iPrimitive, LogicalType fl Types.DecimalType decimalType = (Types.DecimalType) iPrimitive; return FlinkOrcWriters.decimals(decimalType.precision(), decimalType.scale()); default: - throw new IllegalArgumentException(String.format( - "Invalid iceberg type %s corresponding to Flink logical type %s", iPrimitive, flinkPrimitive)); + throw new IllegalArgumentException( + String.format( + "Invalid iceberg type %s corresponding to Flink logical type %s", + iPrimitive, flinkPrimitive)); } } } diff --git a/flink/v1.15/flink/src/main/java/org/apache/iceberg/flink/data/FlinkOrcWriters.java b/flink/v1.15/flink/src/main/java/org/apache/iceberg/flink/data/FlinkOrcWriters.java index 2de5586a33fe..da2f95cf822f 100644 --- a/flink/v1.15/flink/src/main/java/org/apache/iceberg/flink/data/FlinkOrcWriters.java +++ b/flink/v1.15/flink/src/main/java/org/apache/iceberg/flink/data/FlinkOrcWriters.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.flink.data; import java.time.Instant; @@ -47,8 +46,7 @@ class FlinkOrcWriters { - private FlinkOrcWriters() { - } + private FlinkOrcWriters() {} static OrcValueWriter strings() { return StringWriter.INSTANCE; @@ -80,12 +78,16 @@ static OrcValueWriter decimals(int precision, int scale) { } } - static OrcValueWriter list(OrcValueWriter elementWriter, LogicalType elementType) { + static OrcValueWriter list( + OrcValueWriter elementWriter, LogicalType elementType) { return new ListWriter<>(elementWriter, elementType); } - static OrcValueWriter map(OrcValueWriter keyWriter, OrcValueWriter valueWriter, - LogicalType keyType, LogicalType valueType) { + static OrcValueWriter map( + OrcValueWriter keyWriter, + OrcValueWriter valueWriter, + LogicalType keyType, + LogicalType valueType) { return new MapWriter<>(keyWriter, valueWriter, keyType, valueType); } @@ -132,7 +134,8 @@ public void nonNullWrite(int rowId, TimestampData data, ColumnVector output) { cv.setIsUTC(true); // millis OffsetDateTime offsetDateTime = data.toInstant().atOffset(ZoneOffset.UTC); - cv.time[rowId] = offsetDateTime.toEpochSecond() * 1_000 + offsetDateTime.getNano() / 1_000_000; + cv.time[rowId] = + offsetDateTime.toEpochSecond() * 1_000 + offsetDateTime.getNano() / 1_000_000; // truncate nanos to only keep microsecond precision. 
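The TimestampTzWriter above splits a Flink TimestampData into ORC's millisecond time[] slot plus a nanos[] slot truncated to microsecond precision, and the ORC readers earlier in this diff rebuild the value with Math.floorDiv so that pre-epoch timestamps keep the correct second. A minimal standalone sketch of that round trip, assuming nothing beyond java.time (class and variable names are illustrative, not from the patch):

import java.time.Instant;
import java.time.OffsetDateTime;
import java.time.ZoneOffset;

public class OrcTimestampSplitSketch {
  public static void main(String[] args) {
    // A pre-epoch timestamp with sub-microsecond detail: 123_456_789 ns past the second.
    OffsetDateTime ts = Instant.parse("1969-12-31T23:59:59.123456789Z").atOffset(ZoneOffset.UTC);

    // Writer side: epoch millis go into time[], the full sub-second part goes into nanos[],
    // truncated to microsecond precision as the comment above describes.
    long timeMillis = ts.toEpochSecond() * 1_000 + ts.getNano() / 1_000_000; // -877
    int nanos = (ts.getNano() / 1_000) * 1_000;                              // 123_456_000

    // Reader side: Math.floorDiv rounds -877 ms down to -1 s; plain integer division
    // would give 0 s and shift the value by a whole second.
    Instant roundTripped = Instant.ofEpochSecond(Math.floorDiv(timeMillis, 1_000), nanos);
    System.out.println(roundTripped); // 1969-12-31T23:59:59.123456Z
  }
}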
cv.nanos[rowId] = (offsetDateTime.getNano() / 1_000) * 1_000; } @@ -164,12 +167,21 @@ private static class Decimal18Writer implements OrcValueWriter { @Override public void nonNullWrite(int rowId, DecimalData data, ColumnVector output) { - Preconditions.checkArgument(scale == data.scale(), - "Cannot write value as decimal(%s,%s), wrong scale: %s", precision, scale, data); - Preconditions.checkArgument(data.precision() <= precision, - "Cannot write value as decimal(%s,%s), too large: %s", precision, scale, data); - - ((DecimalColumnVector) output).vector[rowId].setFromLongAndScale(data.toUnscaledLong(), data.scale()); + Preconditions.checkArgument( + scale == data.scale(), + "Cannot write value as decimal(%s,%s), wrong scale: %s", + precision, + scale, + data); + Preconditions.checkArgument( + data.precision() <= precision, + "Cannot write value as decimal(%s,%s), too large: %s", + precision, + scale, + data); + + ((DecimalColumnVector) output) + .vector[rowId].setFromLongAndScale(data.toUnscaledLong(), data.scale()); } } @@ -184,12 +196,21 @@ private static class Decimal38Writer implements OrcValueWriter { @Override public void nonNullWrite(int rowId, DecimalData data, ColumnVector output) { - Preconditions.checkArgument(scale == data.scale(), - "Cannot write value as decimal(%s,%s), wrong scale: %s", precision, scale, data); - Preconditions.checkArgument(data.precision() <= precision, - "Cannot write value as decimal(%s,%s), too large: %s", precision, scale, data); - - ((DecimalColumnVector) output).vector[rowId].set(HiveDecimal.create(data.toBigDecimal(), false)); + Preconditions.checkArgument( + scale == data.scale(), + "Cannot write value as decimal(%s,%s), wrong scale: %s", + precision, + scale, + data); + Preconditions.checkArgument( + data.precision() <= precision, + "Cannot write value as decimal(%s,%s), too large: %s", + precision, + scale, + data); + + ((DecimalColumnVector) output) + .vector[rowId].set(HiveDecimal.create(data.toBigDecimal(), false)); } } @@ -222,7 +243,6 @@ public void nonNullWrite(int rowId, ArrayData data, ColumnVector output) { public Stream> metrics() { return elementWriter.metrics(); } - } static class MapWriter implements OrcValueWriter { @@ -231,8 +251,11 @@ static class MapWriter implements OrcValueWriter { private final ArrayData.ElementGetter keyGetter; private final ArrayData.ElementGetter valueGetter; - MapWriter(OrcValueWriter keyWriter, OrcValueWriter valueWriter, - LogicalType keyType, LogicalType valueType) { + MapWriter( + OrcValueWriter keyWriter, + OrcValueWriter valueWriter, + LogicalType keyType, + LogicalType valueType) { this.keyWriter = keyWriter; this.valueWriter = valueWriter; this.keyGetter = ArrayData.createElementGetter(keyType); @@ -283,7 +306,6 @@ static class RowDataWriter extends GenericOrcWriters.StructWriter { protected Object get(RowData struct, int index) { return fieldGetters.get(index).getFieldOrNull(struct); } - } private static void growColumnVector(ColumnVector cv, int requestedSize) { diff --git a/flink/v1.15/flink/src/main/java/org/apache/iceberg/flink/data/FlinkParquetReaders.java b/flink/v1.15/flink/src/main/java/org/apache/iceberg/flink/data/FlinkParquetReaders.java index 30184d899453..4189d0ae429b 100644 --- a/flink/v1.15/flink/src/main/java/org/apache/iceberg/flink/data/FlinkParquetReaders.java +++ b/flink/v1.15/flink/src/main/java/org/apache/iceberg/flink/data/FlinkParquetReaders.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.flink.data; import java.math.BigDecimal; @@ -57,20 +56,19 @@ import org.apache.parquet.schema.Type; public class FlinkParquetReaders { - private FlinkParquetReaders() { - } + private FlinkParquetReaders() {} - public static ParquetValueReader buildReader(Schema expectedSchema, MessageType fileSchema) { + public static ParquetValueReader buildReader( + Schema expectedSchema, MessageType fileSchema) { return buildReader(expectedSchema, fileSchema, ImmutableMap.of()); } @SuppressWarnings("unchecked") - public static ParquetValueReader buildReader(Schema expectedSchema, - MessageType fileSchema, - Map idToConstant) { - return (ParquetValueReader) TypeWithSchemaVisitor.visit(expectedSchema.asStruct(), fileSchema, - new ReadBuilder(fileSchema, idToConstant) - ); + public static ParquetValueReader buildReader( + Schema expectedSchema, MessageType fileSchema, Map idToConstant) { + return (ParquetValueReader) + TypeWithSchemaVisitor.visit( + expectedSchema.asStruct(), fileSchema, new ReadBuilder(fileSchema, idToConstant)); } private static class ReadBuilder extends TypeWithSchemaVisitor> { @@ -83,14 +81,14 @@ private static class ReadBuilder extends TypeWithSchemaVisitor message(Types.StructType expected, MessageType message, - List> fieldReaders) { + public ParquetValueReader message( + Types.StructType expected, MessageType message, List> fieldReaders) { return struct(expected, message.asGroupType(), fieldReaders); } @Override - public ParquetValueReader struct(Types.StructType expected, GroupType struct, - List> fieldReaders) { + public ParquetValueReader struct( + Types.StructType expected, GroupType struct, List> fieldReaders) { // match the expected struct's order Map> readersById = Maps.newHashMap(); Map typesById = Maps.newHashMap(); @@ -107,10 +105,10 @@ public ParquetValueReader struct(Types.StructType expected, GroupType s } } - List expectedFields = expected != null ? - expected.fields() : ImmutableList.of(); - List> reorderedFields = Lists.newArrayListWithExpectedSize( - expectedFields.size()); + List expectedFields = + expected != null ? 
expected.fields() : ImmutableList.of(); + List> reorderedFields = + Lists.newArrayListWithExpectedSize(expectedFields.size()); List types = Lists.newArrayListWithExpectedSize(expectedFields.size()); for (Types.NestedField field : expectedFields) { int id = field.fieldId(); @@ -140,8 +138,8 @@ public ParquetValueReader struct(Types.StructType expected, GroupType s } @Override - public ParquetValueReader list(Types.ListType expectedList, GroupType array, - ParquetValueReader elementReader) { + public ParquetValueReader list( + Types.ListType expectedList, GroupType array, ParquetValueReader elementReader) { if (expectedList == null) { return null; } @@ -154,13 +152,16 @@ public ParquetValueReader list(Types.ListType expectedList, GroupType array, Type elementType = ParquetSchemaUtil.determineListElementType(array); int elementD = type.getMaxDefinitionLevel(path(elementType.getName())) - 1; - return new ArrayReader<>(repeatedD, repeatedR, ParquetValueReaders.option(elementType, elementD, elementReader)); + return new ArrayReader<>( + repeatedD, repeatedR, ParquetValueReaders.option(elementType, elementD, elementReader)); } @Override - public ParquetValueReader map(Types.MapType expectedMap, GroupType map, - ParquetValueReader keyReader, - ParquetValueReader valueReader) { + public ParquetValueReader map( + Types.MapType expectedMap, + GroupType map, + ParquetValueReader keyReader, + ParquetValueReader valueReader) { if (expectedMap == null) { return null; } @@ -176,15 +177,17 @@ public ParquetValueReader map(Types.MapType expectedMap, GroupType map, Type valueType = repeatedKeyValue.getType(1); int valueD = type.getMaxDefinitionLevel(path(valueType.getName())) - 1; - return new MapReader<>(repeatedD, repeatedR, + return new MapReader<>( + repeatedD, + repeatedR, ParquetValueReaders.option(keyType, keyD, keyReader), ParquetValueReaders.option(valueType, valueD, valueReader)); } @Override @SuppressWarnings("CyclomaticComplexity") - public ParquetValueReader primitive(org.apache.iceberg.types.Type.PrimitiveType expected, - PrimitiveType primitive) { + public ParquetValueReader primitive( + org.apache.iceberg.types.Type.PrimitiveType expected, PrimitiveType primitive) { if (expected == null) { return null; } @@ -225,7 +228,8 @@ public ParquetValueReader primitive(org.apache.iceberg.types.Type.PrimitiveTy return new MillisToTimestampReader(desc); } case DECIMAL: - DecimalLogicalTypeAnnotation decimal = (DecimalLogicalTypeAnnotation) primitive.getLogicalTypeAnnotation(); + DecimalLogicalTypeAnnotation decimal = + (DecimalLogicalTypeAnnotation) primitive.getLogicalTypeAnnotation(); switch (primitive.getPrimitiveTypeName()) { case BINARY: case FIXED_LEN_BYTE_ARRAY: @@ -272,7 +276,8 @@ public ParquetValueReader primitive(org.apache.iceberg.types.Type.PrimitiveTy } } - private static class BinaryDecimalReader extends ParquetValueReaders.PrimitiveReader { + private static class BinaryDecimalReader + extends ParquetValueReaders.PrimitiveReader { private final int precision; private final int scale; @@ -291,7 +296,8 @@ public DecimalData read(DecimalData ignored) { } } - private static class IntegerDecimalReader extends ParquetValueReaders.PrimitiveReader { + private static class IntegerDecimalReader + extends ParquetValueReaders.PrimitiveReader { private final int precision; private final int scale; @@ -323,7 +329,8 @@ public DecimalData read(DecimalData ignored) { } } - private static class MicrosToTimestampTzReader extends ParquetValueReaders.UnboxedReader { + private static class 
MicrosToTimestampTzReader + extends ParquetValueReaders.UnboxedReader { MicrosToTimestampTzReader(ColumnDescriptor desc) { super(desc); } @@ -331,10 +338,11 @@ private static class MicrosToTimestampTzReader extends ParquetValueReaders.Unbox @Override public TimestampData read(TimestampData ignored) { long value = readLong(); - return TimestampData.fromLocalDateTime(Instant.ofEpochSecond(Math.floorDiv(value, 1000_000), - Math.floorMod(value, 1000_000) * 1000) - .atOffset(ZoneOffset.UTC) - .toLocalDateTime()); + return TimestampData.fromLocalDateTime( + Instant.ofEpochSecond( + Math.floorDiv(value, 1000_000), Math.floorMod(value, 1000_000) * 1000) + .atOffset(ZoneOffset.UTC) + .toLocalDateTime()); } @Override @@ -343,7 +351,8 @@ public long readLong() { } } - private static class MicrosToTimestampReader extends ParquetValueReaders.UnboxedReader { + private static class MicrosToTimestampReader + extends ParquetValueReaders.UnboxedReader { MicrosToTimestampReader(ColumnDescriptor desc) { super(desc); } @@ -351,8 +360,9 @@ private static class MicrosToTimestampReader extends ParquetValueReaders.Unboxed @Override public TimestampData read(TimestampData ignored) { long value = readLong(); - return TimestampData.fromInstant(Instant.ofEpochSecond(Math.floorDiv(value, 1000_000), - Math.floorMod(value, 1000_000) * 1000)); + return TimestampData.fromInstant( + Instant.ofEpochSecond( + Math.floorDiv(value, 1000_000), Math.floorMod(value, 1000_000) * 1000)); } @Override @@ -361,7 +371,8 @@ public long readLong() { } } - private static class MillisToTimestampReader extends ParquetValueReaders.UnboxedReader { + private static class MillisToTimestampReader + extends ParquetValueReaders.UnboxedReader { MillisToTimestampReader(ColumnDescriptor desc) { super(desc); } @@ -378,7 +389,8 @@ public long readLong() { } } - private static class MillisToTimestampTzReader extends ParquetValueReaders.UnboxedReader { + private static class MillisToTimestampTzReader + extends ParquetValueReaders.UnboxedReader { MillisToTimestampTzReader(ColumnDescriptor desc) { super(desc); } @@ -386,9 +398,8 @@ private static class MillisToTimestampTzReader extends ParquetValueReaders.Unbox @Override public TimestampData read(TimestampData ignored) { long millis = readLong(); - return TimestampData.fromLocalDateTime(Instant.ofEpochMilli(millis) - .atOffset(ZoneOffset.UTC) - .toLocalDateTime()); + return TimestampData.fromLocalDateTime( + Instant.ofEpochMilli(millis).atOffset(ZoneOffset.UTC).toLocalDateTime()); } @Override @@ -415,7 +426,8 @@ public StringData read(StringData ignored) { } } - private static class LossyMicrosToMillisTimeReader extends ParquetValueReaders.PrimitiveReader { + private static class LossyMicrosToMillisTimeReader + extends ParquetValueReaders.PrimitiveReader { LossyMicrosToMillisTimeReader(ColumnDescriptor desc) { super(desc); } @@ -438,7 +450,8 @@ public Integer read(Integer reuse) { } } - private static class ArrayReader extends ParquetValueReaders.RepeatedReader { + private static class ArrayReader + extends ParquetValueReaders.RepeatedReader { private int readPos = 0; private int writePos = 0; @@ -484,23 +497,29 @@ protected void addElement(ReusableArrayData reused, E element) { @Override protected ArrayData buildList(ReusableArrayData list) { - // Since ReusableArrayData is not accepted by Flink, use GenericArrayData temporarily to walk around it. + // Since ReusableArrayData is not accepted by Flink, use GenericArrayData temporarily to walk + // around it. 
// Revert this to use ReusableArrayData once it is fixed in Flink. // For your reference, https://issues.apache.org/jira/browse/FLINK-25238. return new GenericArrayData(Arrays.copyOf(list.values, writePos)); } } - private static class MapReader extends - ParquetValueReaders.RepeatedKeyValueReader { + private static class MapReader + extends ParquetValueReaders.RepeatedKeyValueReader { private int readPos = 0; private int writePos = 0; - private final ParquetValueReaders.ReusableEntry entry = new ParquetValueReaders.ReusableEntry<>(); - private final ParquetValueReaders.ReusableEntry nullEntry = new ParquetValueReaders.ReusableEntry<>(); + private final ParquetValueReaders.ReusableEntry entry = + new ParquetValueReaders.ReusableEntry<>(); + private final ParquetValueReaders.ReusableEntry nullEntry = + new ParquetValueReaders.ReusableEntry<>(); - MapReader(int definitionLevel, int repetitionLevel, - ParquetValueReader keyReader, ParquetValueReader valueReader) { + MapReader( + int definitionLevel, + int repetitionLevel, + ParquetValueReader keyReader, + ParquetValueReader valueReader) { super(definitionLevel, repetitionLevel, keyReader, valueReader); } @@ -549,7 +568,8 @@ protected MapData buildMap(ReusableMapData map) { } } - private static class RowDataReader extends ParquetValueReaders.StructReader { + private static class RowDataReader + extends ParquetValueReaders.StructReader { private final int numFields; RowDataReader(List types, List> readers) { diff --git a/flink/v1.15/flink/src/main/java/org/apache/iceberg/flink/data/FlinkParquetWriters.java b/flink/v1.15/flink/src/main/java/org/apache/iceberg/flink/data/FlinkParquetWriters.java index 6154ef1cfa2b..db4f1730a134 100644 --- a/flink/v1.15/flink/src/main/java/org/apache/iceberg/flink/data/FlinkParquetWriters.java +++ b/flink/v1.15/flink/src/main/java/org/apache/iceberg/flink/data/FlinkParquetWriters.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.flink.data; import java.util.Iterator; @@ -52,12 +51,12 @@ import org.apache.parquet.schema.Type; public class FlinkParquetWriters { - private FlinkParquetWriters() { - } + private FlinkParquetWriters() {} @SuppressWarnings("unchecked") public static ParquetValueWriter buildWriter(LogicalType schema, MessageType type) { - return (ParquetValueWriter) ParquetWithFlinkSchemaVisitor.visit(schema, type, new WriteBuilder(type)); + return (ParquetValueWriter) + ParquetWithFlinkSchemaVisitor.visit(schema, type, new WriteBuilder(type)); } private static class WriteBuilder extends ParquetWithFlinkSchemaVisitor> { @@ -68,13 +67,14 @@ private static class WriteBuilder extends ParquetWithFlinkSchemaVisitor message(RowType sStruct, MessageType message, List> fields) { + public ParquetValueWriter message( + RowType sStruct, MessageType message, List> fields) { return struct(sStruct, message.asGroupType(), fields); } @Override - public ParquetValueWriter struct(RowType sStruct, GroupType struct, - List> fieldWriters) { + public ParquetValueWriter struct( + RowType sStruct, GroupType struct, List> fieldWriters) { List fields = struct.getFields(); List flinkFields = sStruct.getFields(); List> writers = Lists.newArrayListWithExpectedSize(fieldWriters.size()); @@ -88,34 +88,42 @@ public ParquetValueWriter struct(RowType sStruct, GroupType struct, } @Override - public ParquetValueWriter list(ArrayType sArray, GroupType array, ParquetValueWriter elementWriter) { + public ParquetValueWriter list( + ArrayType sArray, GroupType array, ParquetValueWriter elementWriter) { GroupType repeated = array.getFields().get(0).asGroupType(); String[] repeatedPath = currentPath(); int repeatedD = type.getMaxDefinitionLevel(repeatedPath); int repeatedR = type.getMaxRepetitionLevel(repeatedPath); - return new ArrayDataWriter<>(repeatedD, repeatedR, + return new ArrayDataWriter<>( + repeatedD, + repeatedR, newOption(repeated.getType(0), elementWriter), sArray.getElementType()); } @Override - public ParquetValueWriter map(MapType sMap, GroupType map, - ParquetValueWriter keyWriter, ParquetValueWriter valueWriter) { + public ParquetValueWriter map( + MapType sMap, + GroupType map, + ParquetValueWriter keyWriter, + ParquetValueWriter valueWriter) { GroupType repeatedKeyValue = map.getFields().get(0).asGroupType(); String[] repeatedPath = currentPath(); int repeatedD = type.getMaxDefinitionLevel(repeatedPath); int repeatedR = type.getMaxRepetitionLevel(repeatedPath); - return new MapDataWriter<>(repeatedD, repeatedR, + return new MapDataWriter<>( + repeatedD, + repeatedR, newOption(repeatedKeyValue.getType(0), keyWriter), newOption(repeatedKeyValue.getType(1), valueWriter), - sMap.getKeyType(), sMap.getValueType()); + sMap.getKeyType(), + sMap.getValueType()); } - private ParquetValueWriter newOption(Type fieldType, ParquetValueWriter writer) { int maxD = type.getMaxDefinitionLevel(path(fieldType.getName())); return ParquetValueWriters.option(fieldType, maxD, writer); @@ -143,7 +151,8 @@ public ParquetValueWriter primitive(LogicalType fType, PrimitiveType primitiv case TIMESTAMP_MICROS: return timestamps(desc); case DECIMAL: - DecimalLogicalTypeAnnotation decimal = (DecimalLogicalTypeAnnotation) primitive.getLogicalTypeAnnotation(); + DecimalLogicalTypeAnnotation decimal = + (DecimalLogicalTypeAnnotation) primitive.getLogicalTypeAnnotation(); switch (primitive.getPrimitiveTypeName()) { case INT32: return decimalAsInteger(desc, decimal.getPrecision(), decimal.getScale()); @@ -184,7 +193,8 @@ public 
ParquetValueWriter primitive(LogicalType fType, PrimitiveType primitiv } } - private static ParquetValueWriters.PrimitiveWriter ints(LogicalType type, ColumnDescriptor desc) { + private static ParquetValueWriters.PrimitiveWriter ints( + LogicalType type, ColumnDescriptor desc) { if (type instanceof TinyIntType) { return ParquetValueWriters.tinyints(desc); } else if (type instanceof SmallIntType) { @@ -201,26 +211,33 @@ private static ParquetValueWriters.PrimitiveWriter timeMicros(ColumnDes return new TimeMicrosWriter(desc); } - private static ParquetValueWriters.PrimitiveWriter decimalAsInteger(ColumnDescriptor desc, - int precision, int scale) { - Preconditions.checkArgument(precision <= 9, "Cannot write decimal value as integer with precision larger than 9," + - " wrong precision %s", precision); + private static ParquetValueWriters.PrimitiveWriter decimalAsInteger( + ColumnDescriptor desc, int precision, int scale) { + Preconditions.checkArgument( + precision <= 9, + "Cannot write decimal value as integer with precision larger than 9," + + " wrong precision %s", + precision); return new IntegerDecimalWriter(desc, precision, scale); } - private static ParquetValueWriters.PrimitiveWriter decimalAsLong(ColumnDescriptor desc, - int precision, int scale) { - Preconditions.checkArgument(precision <= 18, "Cannot write decimal value as long with precision larger than 18, " + - " wrong precision %s", precision); + private static ParquetValueWriters.PrimitiveWriter decimalAsLong( + ColumnDescriptor desc, int precision, int scale) { + Preconditions.checkArgument( + precision <= 18, + "Cannot write decimal value as long with precision larger than 18, " + + " wrong precision %s", + precision); return new LongDecimalWriter(desc, precision, scale); } - private static ParquetValueWriters.PrimitiveWriter decimalAsFixed(ColumnDescriptor desc, - int precision, int scale) { + private static ParquetValueWriters.PrimitiveWriter decimalAsFixed( + ColumnDescriptor desc, int precision, int scale) { return new FixedDecimalWriter(desc, precision, scale); } - private static ParquetValueWriters.PrimitiveWriter timestamps(ColumnDescriptor desc) { + private static ParquetValueWriters.PrimitiveWriter timestamps( + ColumnDescriptor desc) { return new TimestampDataWriter(desc); } @@ -251,7 +268,8 @@ public void write(int repetitionLevel, Integer value) { } } - private static class IntegerDecimalWriter extends ParquetValueWriters.PrimitiveWriter { + private static class IntegerDecimalWriter + extends ParquetValueWriters.PrimitiveWriter { private final int precision; private final int scale; @@ -263,10 +281,18 @@ private IntegerDecimalWriter(ColumnDescriptor desc, int precision, int scale) { @Override public void write(int repetitionLevel, DecimalData decimal) { - Preconditions.checkArgument(decimal.scale() == scale, - "Cannot write value as decimal(%s,%s), wrong scale: %s", precision, scale, decimal); - Preconditions.checkArgument(decimal.precision() <= precision, - "Cannot write value as decimal(%s,%s), too large: %s", precision, scale, decimal); + Preconditions.checkArgument( + decimal.scale() == scale, + "Cannot write value as decimal(%s,%s), wrong scale: %s", + precision, + scale, + decimal); + Preconditions.checkArgument( + decimal.precision() <= precision, + "Cannot write value as decimal(%s,%s), too large: %s", + precision, + scale, + decimal); column.writeInteger(repetitionLevel, (int) decimal.toUnscaledLong()); } @@ -284,10 +310,18 @@ private LongDecimalWriter(ColumnDescriptor desc, int precision, int scale) { 
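The decimalAsInteger/decimalAsLong/decimalAsFixed helpers above cover the three Parquet physical representations Iceberg uses for decimals: an unscaled INT32 for precision up to 9, an INT64 for precision up to 18, and a fixed-length byte array sized from the precision beyond that, with Preconditions guarding the precision and scale on every write. A small sketch of that dispatch using Flink's DecimalData (the physicalType helper is made up for illustration, not part of the patch):

import java.math.BigDecimal;
import org.apache.flink.table.data.DecimalData;

public class DecimalPhysicalTypeSketch {
  // Mirrors the precision thresholds enforced by the Preconditions checks above.
  static String physicalType(int precision) {
    if (precision <= 9) {
      return "INT32";                // unscaled value fits in 4 bytes
    } else if (precision <= 18) {
      return "INT64";                // unscaled value fits in 8 bytes
    } else {
      return "FIXED_LEN_BYTE_ARRAY"; // width derived from the precision
    }
  }

  public static void main(String[] args) {
    DecimalData d = DecimalData.fromBigDecimal(new BigDecimal("1234.56"), 9, 2);
    System.out.println(physicalType(d.precision())); // INT32
    System.out.println(d.toUnscaledLong());          // 123456, the unscaled value the writer emits
  }
}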
@Override public void write(int repetitionLevel, DecimalData decimal) { - Preconditions.checkArgument(decimal.scale() == scale, - "Cannot write value as decimal(%s,%s), wrong scale: %s", precision, scale, decimal); - Preconditions.checkArgument(decimal.precision() <= precision, - "Cannot write value as decimal(%s,%s), too large: %s", precision, scale, decimal); + Preconditions.checkArgument( + decimal.scale() == scale, + "Cannot write value as decimal(%s,%s), wrong scale: %s", + precision, + scale, + decimal); + Preconditions.checkArgument( + decimal.precision() <= precision, + "Cannot write value as decimal(%s,%s), too large: %s", + precision, + scale, + decimal); column.writeLong(repetitionLevel, decimal.toUnscaledLong()); } @@ -302,24 +336,28 @@ private FixedDecimalWriter(ColumnDescriptor desc, int precision, int scale) { super(desc); this.precision = precision; this.scale = scale; - this.bytes = ThreadLocal.withInitial(() -> new byte[TypeUtil.decimalRequiredBytes(precision)]); + this.bytes = + ThreadLocal.withInitial(() -> new byte[TypeUtil.decimalRequiredBytes(precision)]); } @Override public void write(int repetitionLevel, DecimalData decimal) { - byte[] binary = DecimalUtil.toReusedFixLengthBytes(precision, scale, decimal.toBigDecimal(), bytes.get()); + byte[] binary = + DecimalUtil.toReusedFixLengthBytes(precision, scale, decimal.toBigDecimal(), bytes.get()); column.writeBinary(repetitionLevel, Binary.fromReusedByteArray(binary)); } } - private static class TimestampDataWriter extends ParquetValueWriters.PrimitiveWriter { + private static class TimestampDataWriter + extends ParquetValueWriters.PrimitiveWriter { private TimestampDataWriter(ColumnDescriptor desc) { super(desc); } @Override public void write(int repetitionLevel, TimestampData value) { - column.writeLong(repetitionLevel, value.getMillisecond() * 1000 + value.getNanoOfMillisecond() / 1000); + column.writeLong( + repetitionLevel, value.getMillisecond() * 1000 + value.getNanoOfMillisecond() / 1000); } } @@ -337,8 +375,11 @@ public void write(int repetitionLevel, byte[] bytes) { private static class ArrayDataWriter extends ParquetValueWriters.RepeatedWriter { private final LogicalType elementType; - private ArrayDataWriter(int definitionLevel, int repetitionLevel, - ParquetValueWriter writer, LogicalType elementType) { + private ArrayDataWriter( + int definitionLevel, + int repetitionLevel, + ParquetValueWriter writer, + LogicalType elementType) { super(definitionLevel, repetitionLevel, writer); this.elementType = elementType; } @@ -381,13 +422,18 @@ public E next() { } } - private static class MapDataWriter extends ParquetValueWriters.RepeatedKeyValueWriter { + private static class MapDataWriter + extends ParquetValueWriters.RepeatedKeyValueWriter { private final LogicalType keyType; private final LogicalType valueType; - private MapDataWriter(int definitionLevel, int repetitionLevel, - ParquetValueWriter keyWriter, ParquetValueWriter valueWriter, - LogicalType keyType, LogicalType valueType) { + private MapDataWriter( + int definitionLevel, + int repetitionLevel, + ParquetValueWriter keyWriter, + ParquetValueWriter valueWriter, + LogicalType keyType, + LogicalType valueType) { super(definitionLevel, repetitionLevel, keyWriter, valueWriter); this.keyType = keyType; this.valueType = valueType; @@ -429,7 +475,9 @@ public Map.Entry next() { throw new NoSuchElementException(); } - entry.set((K) keyGetter.getElementOrNull(keys, index), (V) valueGetter.getElementOrNull(values, index)); + entry.set( + (K) 
keyGetter.getElementOrNull(keys, index), + (V) valueGetter.getElementOrNull(values, index)); index += 1; return entry; diff --git a/flink/v1.15/flink/src/main/java/org/apache/iceberg/flink/data/FlinkSchemaVisitor.java b/flink/v1.15/flink/src/main/java/org/apache/iceberg/flink/data/FlinkSchemaVisitor.java index 0909e1b53a85..ba4e1a7a7aec 100644 --- a/flink/v1.15/flink/src/main/java/org/apache/iceberg/flink/data/FlinkSchemaVisitor.java +++ b/flink/v1.15/flink/src/main/java/org/apache/iceberg/flink/data/FlinkSchemaVisitor.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.flink.data; import java.util.List; @@ -85,8 +84,8 @@ private static T visit(LogicalType flinkType, Type iType, FlinkSchemaVisitor } } - private static T visitRecord(LogicalType flinkType, Types.StructType struct, - FlinkSchemaVisitor visitor) { + private static T visitRecord( + LogicalType flinkType, Types.StructType struct, FlinkSchemaVisitor visitor) { Preconditions.checkArgument(flinkType instanceof RowType, "%s is not a RowType.", flinkType); RowType rowType = (RowType) flinkType; @@ -98,8 +97,8 @@ private static T visitRecord(LogicalType flinkType, Types.StructType struct, for (int i = 0; i < fieldSize; i++) { Types.NestedField iField = nestedFields.get(i); int fieldIndex = rowType.getFieldIndex(iField.name()); - Preconditions.checkArgument(fieldIndex >= 0, - "NestedField: %s is not found in flink RowType: %s", iField, rowType); + Preconditions.checkArgument( + fieldIndex >= 0, "NestedField: %s is not found in flink RowType: %s", iField, rowType); LogicalType fieldFlinkType = rowType.getTypeAt(fieldIndex); @@ -132,11 +131,9 @@ public T primitive(Type.PrimitiveType iPrimitive, LogicalType flinkPrimitive) { return null; } - public void beforeField(Types.NestedField field) { - } + public void beforeField(Types.NestedField field) {} - public void afterField(Types.NestedField field) { - } + public void afterField(Types.NestedField field) {} public void beforeListElement(Types.NestedField elementField) { beforeField(elementField); diff --git a/flink/v1.15/flink/src/main/java/org/apache/iceberg/flink/data/FlinkValueReaders.java b/flink/v1.15/flink/src/main/java/org/apache/iceberg/flink/data/FlinkValueReaders.java index 1b7a98f7dc8f..32f6c3a2ccfd 100644 --- a/flink/v1.15/flink/src/main/java/org/apache/iceberg/flink/data/FlinkValueReaders.java +++ b/flink/v1.15/flink/src/main/java/org/apache/iceberg/flink/data/FlinkValueReaders.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.flink.data; import java.io.IOException; @@ -44,8 +43,7 @@ public class FlinkValueReaders { - private FlinkValueReaders() { - } + private FlinkValueReaders() {} static ValueReader strings() { return StringReader.INSTANCE; @@ -71,7 +69,8 @@ static ValueReader timestampMicros() { return TimestampMicrosReader.INSTANCE; } - static ValueReader decimal(ValueReader unscaledReader, int precision, int scale) { + static ValueReader decimal( + ValueReader unscaledReader, int precision, int scale) { return new DecimalReader(unscaledReader, precision, scale); } @@ -79,8 +78,7 @@ static ValueReader array(ValueReader elementReader) { return new ArrayReader(elementReader); } - static ValueReader arrayMap(ValueReader keyReader, - ValueReader valueReader) { + static ValueReader arrayMap(ValueReader keyReader, ValueReader valueReader) { return new ArrayMapReader(keyReader, valueReader); } @@ -88,16 +86,15 @@ static ValueReader map(ValueReader keyReader, ValueReader valueRe return new MapReader(keyReader, valueReader); } - static ValueReader struct(List> readers, Types.StructType struct, - Map idToConstant) { + static ValueReader struct( + List> readers, Types.StructType struct, Map idToConstant) { return new StructReader(readers, struct, idToConstant); } private static class StringReader implements ValueReader { private static final StringReader INSTANCE = new StringReader(); - private StringReader() { - } + private StringReader() {} @Override public StringData read(Decoder decoder, Object reuse) throws IOException { @@ -143,7 +140,8 @@ private DecimalReader(ValueReader bytesReader, int precision, int scale) @Override public DecimalData read(Decoder decoder, Object reuse) throws IOException { byte[] bytes = bytesReader.read(decoder, null); - return DecimalData.fromBigDecimal(new BigDecimal(new BigInteger(bytes), scale), precision, scale); + return DecimalData.fromBigDecimal( + new BigDecimal(new BigInteger(bytes), scale), precision, scale); } } @@ -287,7 +285,8 @@ public MapData read(Decoder decoder, Object reuse) throws IOException { private static class StructReader extends ValueReaders.StructReader { private final int numFields; - private StructReader(List> readers, Types.StructType struct, Map idToConstant) { + private StructReader( + List> readers, Types.StructType struct, Map idToConstant) { super(readers, struct, idToConstant); this.numFields = readers.size(); } diff --git a/flink/v1.15/flink/src/main/java/org/apache/iceberg/flink/data/FlinkValueWriters.java b/flink/v1.15/flink/src/main/java/org/apache/iceberg/flink/data/FlinkValueWriters.java index 517d7d8e1527..4e86ecce28b5 100644 --- a/flink/v1.15/flink/src/main/java/org/apache/iceberg/flink/data/FlinkValueWriters.java +++ b/flink/v1.15/flink/src/main/java/org/apache/iceberg/flink/data/FlinkValueWriters.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.flink.data; import java.io.IOException; @@ -37,8 +36,7 @@ public class FlinkValueWriters { - private FlinkValueWriters() { - } + private FlinkValueWriters() {} static ValueWriter strings() { return StringWriter.INSTANCE; @@ -60,13 +58,19 @@ static ValueWriter array(ValueWriter elementWriter, LogicalTyp return new ArrayWriter<>(elementWriter, elementType); } - static ValueWriter arrayMap(ValueWriter keyWriter, LogicalType keyType, - ValueWriter valueWriter, LogicalType valueType) { + static ValueWriter arrayMap( + ValueWriter keyWriter, + LogicalType keyType, + ValueWriter valueWriter, + LogicalType valueType) { return new ArrayMapWriter<>(keyWriter, keyType, valueWriter, valueType); } - static ValueWriter map(ValueWriter keyWriter, LogicalType keyType, - ValueWriter valueWriter, LogicalType valueType) { + static ValueWriter map( + ValueWriter keyWriter, + LogicalType keyType, + ValueWriter valueWriter, + LogicalType valueType) { return new MapWriter<>(keyWriter, keyType, valueWriter, valueType); } @@ -77,8 +81,7 @@ static ValueWriter row(List> writers, List private static class StringWriter implements ValueWriter { private static final StringWriter INSTANCE = new StringWriter(); - private StringWriter() { - } + private StringWriter() {} @Override public void write(StringData s, Encoder encoder) throws IOException { @@ -95,12 +98,14 @@ private static class DecimalWriter implements ValueWriter { private DecimalWriter(int precision, int scale) { this.precision = precision; this.scale = scale; - this.bytes = ThreadLocal.withInitial(() -> new byte[TypeUtil.decimalRequiredBytes(precision)]); + this.bytes = + ThreadLocal.withInitial(() -> new byte[TypeUtil.decimalRequiredBytes(precision)]); } @Override public void write(DecimalData d, Encoder encoder) throws IOException { - encoder.writeFixed(DecimalUtil.toReusedFixLengthBytes(precision, scale, d.toBigDecimal(), bytes.get())); + encoder.writeFixed( + DecimalUtil.toReusedFixLengthBytes(precision, scale, d.toBigDecimal(), bytes.get())); } } @@ -118,7 +123,8 @@ private static class TimestampMicrosWriter implements ValueWriter @Override public void write(TimestampData timestampData, Encoder encoder) throws IOException { - long micros = timestampData.getMillisecond() * 1000 + timestampData.getNanoOfMillisecond() / 1000; + long micros = + timestampData.getMillisecond() * 1000 + timestampData.getNanoOfMillisecond() / 1000; encoder.writeLong(micros); } } @@ -152,8 +158,11 @@ private static class ArrayMapWriter implements ValueWriter { private final ArrayData.ElementGetter keyGetter; private final ArrayData.ElementGetter valueGetter; - private ArrayMapWriter(ValueWriter keyWriter, LogicalType keyType, - ValueWriter valueWriter, LogicalType valueType) { + private ArrayMapWriter( + ValueWriter keyWriter, + LogicalType keyType, + ValueWriter valueWriter, + LogicalType valueType) { this.keyWriter = keyWriter; this.keyGetter = ArrayData.createElementGetter(keyType); this.valueWriter = valueWriter; @@ -183,8 +192,11 @@ private static class MapWriter implements ValueWriter { private final ArrayData.ElementGetter keyGetter; private final ArrayData.ElementGetter valueGetter; - private MapWriter(ValueWriter keyWriter, LogicalType keyType, - ValueWriter valueWriter, LogicalType valueType) { + private MapWriter( + ValueWriter keyWriter, + LogicalType keyType, + ValueWriter valueWriter, + LogicalType valueType) { this.keyWriter = keyWriter; this.keyGetter = ArrayData.createElementGetter(keyType); this.valueWriter = valueWriter; diff 
--git a/flink/v1.15/flink/src/main/java/org/apache/iceberg/flink/data/ParquetWithFlinkSchemaVisitor.java b/flink/v1.15/flink/src/main/java/org/apache/iceberg/flink/data/ParquetWithFlinkSchemaVisitor.java index 541986f93889..33feb2e32118 100644 --- a/flink/v1.15/flink/src/main/java/org/apache/iceberg/flink/data/ParquetWithFlinkSchemaVisitor.java +++ b/flink/v1.15/flink/src/main/java/org/apache/iceberg/flink/data/ParquetWithFlinkSchemaVisitor.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.flink.data; import java.util.Deque; @@ -38,12 +37,15 @@ public class ParquetWithFlinkSchemaVisitor { private final Deque fieldNames = Lists.newLinkedList(); - public static T visit(LogicalType sType, Type type, ParquetWithFlinkSchemaVisitor visitor) { + public static T visit( + LogicalType sType, Type type, ParquetWithFlinkSchemaVisitor visitor) { Preconditions.checkArgument(sType != null, "Invalid DataType: null"); if (type instanceof MessageType) { - Preconditions.checkArgument(sType instanceof RowType, "Invalid struct: %s is not a struct", sType); + Preconditions.checkArgument( + sType instanceof RowType, "Invalid struct: %s is not a struct", sType); RowType struct = (RowType) sType; - return visitor.message(struct, (MessageType) type, visitFields(struct, type.asGroupType(), visitor)); + return visitor.message( + struct, (MessageType) type, visitFields(struct, type.asGroupType(), visitor)); } else if (type.isPrimitive()) { return visitor.primitive(sType, type.asPrimitiveType()); } else { @@ -53,21 +55,30 @@ public static T visit(LogicalType sType, Type type, ParquetWithFlinkSchemaVi if (annotation != null) { switch (annotation) { case LIST: - Preconditions.checkArgument(!group.isRepetition(Type.Repetition.REPEATED), - "Invalid list: top-level group is repeated: %s", group); - Preconditions.checkArgument(group.getFieldCount() == 1, - "Invalid list: does not contain single repeated field: %s", group); + Preconditions.checkArgument( + !group.isRepetition(Type.Repetition.REPEATED), + "Invalid list: top-level group is repeated: %s", + group); + Preconditions.checkArgument( + group.getFieldCount() == 1, + "Invalid list: does not contain single repeated field: %s", + group); GroupType repeatedElement = group.getFields().get(0).asGroupType(); - Preconditions.checkArgument(repeatedElement.isRepetition(Type.Repetition.REPEATED), + Preconditions.checkArgument( + repeatedElement.isRepetition(Type.Repetition.REPEATED), "Invalid list: inner group is not repeated"); - Preconditions.checkArgument(repeatedElement.getFieldCount() <= 1, - "Invalid list: repeated group is not a single field: %s", group); + Preconditions.checkArgument( + repeatedElement.getFieldCount() <= 1, + "Invalid list: repeated group is not a single field: %s", + group); - Preconditions.checkArgument(sType instanceof ArrayType, "Invalid list: %s is not an array", sType); + Preconditions.checkArgument( + sType instanceof ArrayType, "Invalid list: %s is not an array", sType); ArrayType array = (ArrayType) sType; - RowType.RowField element = new RowField( - "element", array.getElementType(), "element of " + array.asSummaryString()); + RowType.RowField element = + new RowField( + "element", array.getElementType(), "element of " + array.asSummaryString()); visitor.fieldNames.push(repeatedElement.getName()); try { @@ -83,22 +94,30 @@ public static T visit(LogicalType sType, Type type, ParquetWithFlinkSchemaVi } case MAP: - 
Preconditions.checkArgument(!group.isRepetition(Type.Repetition.REPEATED), - "Invalid map: top-level group is repeated: %s", group); - Preconditions.checkArgument(group.getFieldCount() == 1, - "Invalid map: does not contain single repeated field: %s", group); + Preconditions.checkArgument( + !group.isRepetition(Type.Repetition.REPEATED), + "Invalid map: top-level group is repeated: %s", + group); + Preconditions.checkArgument( + group.getFieldCount() == 1, + "Invalid map: does not contain single repeated field: %s", + group); GroupType repeatedKeyValue = group.getType(0).asGroupType(); - Preconditions.checkArgument(repeatedKeyValue.isRepetition(Type.Repetition.REPEATED), + Preconditions.checkArgument( + repeatedKeyValue.isRepetition(Type.Repetition.REPEATED), "Invalid map: inner group is not repeated"); - Preconditions.checkArgument(repeatedKeyValue.getFieldCount() <= 2, + Preconditions.checkArgument( + repeatedKeyValue.getFieldCount() <= 2, "Invalid map: repeated group does not have 2 fields"); - Preconditions.checkArgument(sType instanceof MapType, "Invalid map: %s is not a map", sType); + Preconditions.checkArgument( + sType instanceof MapType, "Invalid map: %s is not a map", sType); MapType map = (MapType) sType; - RowField keyField = new RowField("key", map.getKeyType(), "key of " + map.asSummaryString()); - RowField valueField = new RowField( - "value", map.getValueType(), "value of " + map.asSummaryString()); + RowField keyField = + new RowField("key", map.getKeyType(), "key of " + map.asSummaryString()); + RowField valueField = + new RowField("value", map.getValueType(), "value of " + map.asSummaryString()); visitor.fieldNames.push(repeatedKeyValue.getName()); try { @@ -134,13 +153,15 @@ public static T visit(LogicalType sType, Type type, ParquetWithFlinkSchemaVi default: } } - Preconditions.checkArgument(sType instanceof RowType, "Invalid struct: %s is not a struct", sType); + Preconditions.checkArgument( + sType instanceof RowType, "Invalid struct: %s is not a struct", sType); RowType struct = (RowType) sType; return visitor.struct(struct, group, visitFields(struct, group, visitor)); } } - private static T visitField(RowType.RowField sField, Type field, ParquetWithFlinkSchemaVisitor visitor) { + private static T visitField( + RowType.RowField sField, Type field, ParquetWithFlinkSchemaVisitor visitor) { visitor.fieldNames.push(field.getName()); try { return visit(sField.getType(), field, visitor); @@ -149,17 +170,20 @@ private static T visitField(RowType.RowField sField, Type field, ParquetWith } } - private static List visitFields(RowType struct, GroupType group, - ParquetWithFlinkSchemaVisitor visitor) { + private static List visitFields( + RowType struct, GroupType group, ParquetWithFlinkSchemaVisitor visitor) { List sFields = struct.getFields(); - Preconditions.checkArgument(sFields.size() == group.getFieldCount(), - "Structs do not match: %s and %s", struct, group); + Preconditions.checkArgument( + sFields.size() == group.getFieldCount(), "Structs do not match: %s and %s", struct, group); List results = Lists.newArrayListWithExpectedSize(group.getFieldCount()); for (int i = 0; i < sFields.size(); i += 1) { Type field = group.getFields().get(i); RowType.RowField sField = sFields.get(i); - Preconditions.checkArgument(field.getName().equals(AvroSchemaUtil.makeCompatibleName(sField.getName())), - "Structs do not match: field %s != %s", field.getName(), sField.getName()); + Preconditions.checkArgument( + field.getName().equals(AvroSchemaUtil.makeCompatibleName(sField.getName())), 
+ "Structs do not match: field %s != %s", + field.getName(), + sField.getName()); results.add(visitField(sField, field, visitor)); } @@ -195,5 +219,4 @@ protected String[] path(String name) { list.add(name); return list.toArray(new String[0]); } - } diff --git a/flink/v1.15/flink/src/main/java/org/apache/iceberg/flink/data/RowDataProjection.java b/flink/v1.15/flink/src/main/java/org/apache/iceberg/flink/data/RowDataProjection.java index 6334a00fd0d7..e41bae686d1e 100644 --- a/flink/v1.15/flink/src/main/java/org/apache/iceberg/flink/data/RowDataProjection.java +++ b/flink/v1.15/flink/src/main/java/org/apache/iceberg/flink/data/RowDataProjection.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.flink.data; import java.util.Map; @@ -38,35 +37,40 @@ public class RowDataProjection implements RowData { /** * Creates a projecting wrapper for {@link RowData} rows. - *

    - * This projection will not project the nested children types of repeated types like lists and maps. + * + *

    This projection will not project the nested children types of repeated types like lists and + * maps. * * @param schema schema of rows wrapped by this projection * @param projectedSchema result schema of the projected rows * @return a wrapper to project rows */ public static RowDataProjection create(Schema schema, Schema projectedSchema) { - return RowDataProjection.create(FlinkSchemaUtil.convert(schema), schema.asStruct(), projectedSchema.asStruct()); + return RowDataProjection.create( + FlinkSchemaUtil.convert(schema), schema.asStruct(), projectedSchema.asStruct()); } /** * Creates a projecting wrapper for {@link RowData} rows. - *

    - * This projection will not project the nested children types of repeated types like lists and maps. + * + *

    This projection will not project the nested children types of repeated types like lists and + * maps. * * @param rowType flink row type of rows wrapped by this projection * @param schema schema of rows wrapped by this projection * @param projectedSchema result schema of the projected rows * @return a wrapper to project rows */ - public static RowDataProjection create(RowType rowType, Types.StructType schema, Types.StructType projectedSchema) { + public static RowDataProjection create( + RowType rowType, Types.StructType schema, Types.StructType projectedSchema) { return new RowDataProjection(rowType, schema, projectedSchema); } private final RowData.FieldGetter[] getters; private RowData rowData; - private RowDataProjection(RowType rowType, Types.StructType rowStruct, Types.StructType projectType) { + private RowDataProjection( + RowType rowType, Types.StructType rowStruct, Types.StructType projectType) { Map fieldIdToPosition = Maps.newHashMap(); for (int i = 0; i < rowStruct.fields().size(); i++) { fieldIdToPosition.put(rowStruct.fields().get(i).fieldId(), i); @@ -77,27 +81,34 @@ private RowDataProjection(RowType rowType, Types.StructType rowStruct, Types.Str Types.NestedField projectField = projectType.fields().get(i); Types.NestedField rowField = rowStruct.field(projectField.fieldId()); - Preconditions.checkNotNull(rowField, - "Cannot locate the project field <%s> in the iceberg struct <%s>", projectField, rowStruct); + Preconditions.checkNotNull( + rowField, + "Cannot locate the project field <%s> in the iceberg struct <%s>", + projectField, + rowStruct); - getters[i] = createFieldGetter(rowType, fieldIdToPosition.get(projectField.fieldId()), rowField, projectField); + getters[i] = + createFieldGetter( + rowType, fieldIdToPosition.get(projectField.fieldId()), rowField, projectField); } } - private static RowData.FieldGetter createFieldGetter(RowType rowType, - int position, - Types.NestedField rowField, - Types.NestedField projectField) { - Preconditions.checkArgument(rowField.type().typeId() == projectField.type().typeId(), - "Different iceberg type between row field <%s> and project field <%s>", rowField, projectField); + private static RowData.FieldGetter createFieldGetter( + RowType rowType, int position, Types.NestedField rowField, Types.NestedField projectField) { + Preconditions.checkArgument( + rowField.type().typeId() == projectField.type().typeId(), + "Different iceberg type between row field <%s> and project field <%s>", + rowField, + projectField); switch (projectField.type().typeId()) { case STRUCT: RowType nestedRowType = (RowType) rowType.getTypeAt(position); return row -> { - RowData nestedRow = row.isNullAt(position) ? null : row.getRow(position, nestedRowType.getFieldCount()); - return RowDataProjection - .create(nestedRowType, rowField.type().asStructType(), projectField.type().asStructType()) + RowData nestedRow = + row.isNullAt(position) ? 
null : row.getRow(position, nestedRowType.getFieldCount()); + return RowDataProjection.create( + nestedRowType, rowField.type().asStructType(), projectField.type().asStructType()) .wrap(nestedRow); }; @@ -105,13 +116,17 @@ private static RowData.FieldGetter createFieldGetter(RowType rowType, Types.MapType projectedMap = projectField.type().asMapType(); Types.MapType originalMap = rowField.type().asMapType(); - boolean keyProjectable = !projectedMap.keyType().isNestedType() || - projectedMap.keyType().equals(originalMap.keyType()); - boolean valueProjectable = !projectedMap.valueType().isNestedType() || - projectedMap.valueType().equals(originalMap.valueType()); - Preconditions.checkArgument(keyProjectable && valueProjectable, + boolean keyProjectable = + !projectedMap.keyType().isNestedType() + || projectedMap.keyType().equals(originalMap.keyType()); + boolean valueProjectable = + !projectedMap.valueType().isNestedType() + || projectedMap.valueType().equals(originalMap.valueType()); + Preconditions.checkArgument( + keyProjectable && valueProjectable, "Cannot project a partial map key or value with non-primitive type. Trying to project <%s> out of <%s>", - projectField, rowField); + projectField, + rowField); return RowData.createFieldGetter(rowType.getTypeAt(position), position); @@ -119,11 +134,14 @@ private static RowData.FieldGetter createFieldGetter(RowType rowType, Types.ListType projectedList = projectField.type().asListType(); Types.ListType originalList = rowField.type().asListType(); - boolean elementProjectable = !projectedList.elementType().isNestedType() || - projectedList.elementType().equals(originalList.elementType()); - Preconditions.checkArgument(elementProjectable, + boolean elementProjectable = + !projectedList.elementType().isNestedType() + || projectedList.elementType().equals(originalList.elementType()); + Preconditions.checkArgument( + elementProjectable, "Cannot project a partial list element with non-primitive type. Trying to project <%s> out of <%s>", - projectField, rowField); + projectField, + rowField); return RowData.createFieldGetter(rowType.getTypeAt(position), position); diff --git a/flink/v1.15/flink/src/main/java/org/apache/iceberg/flink/data/RowDataUtil.java b/flink/v1.15/flink/src/main/java/org/apache/iceberg/flink/data/RowDataUtil.java index 931880fc360c..c5cb51b7eae4 100644 --- a/flink/v1.15/flink/src/main/java/org/apache/iceberg/flink/data/RowDataUtil.java +++ b/flink/v1.15/flink/src/main/java/org/apache/iceberg/flink/data/RowDataUtil.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.flink.data; import java.math.BigDecimal; @@ -38,9 +37,7 @@ public class RowDataUtil { - private RowDataUtil() { - - } + private RowDataUtil() {} public static Object convertConstant(Type type, Object value) { if (value == null) { @@ -76,12 +73,13 @@ public static Object convertConstant(Type type, Object value) { } /** - * Similar to the private {@link RowDataSerializer#copyRowData(RowData, RowData)} method. - * This skips the check the arity of rowType and from, - * because the from RowData may contains additional column for position deletes. - * Using {@link RowDataSerializer#copy(RowData, RowData)} will fail the arity check. + * Similar to the private {@link RowDataSerializer#copyRowData(RowData, RowData)} method. This + * skips the check the arity of rowType and from, because the from RowData may contains additional + * column for position deletes. 
Using {@link RowDataSerializer#copy(RowData, RowData)} will fail + * the arity check. */ - public static RowData clone(RowData from, RowData reuse, RowType rowType, TypeSerializer[] fieldSerializers) { + public static RowData clone( + RowData from, RowData reuse, RowType rowType, TypeSerializer[] fieldSerializers) { GenericRowData ret; if (reuse instanceof GenericRowData) { ret = (GenericRowData) reuse; @@ -99,5 +97,4 @@ public static RowData clone(RowData from, RowData reuse, RowType rowType, TypeSe } return ret; } - } diff --git a/flink/v1.15/flink/src/main/java/org/apache/iceberg/flink/sink/BaseDeltaTaskWriter.java b/flink/v1.15/flink/src/main/java/org/apache/iceberg/flink/sink/BaseDeltaTaskWriter.java index 3bff219d6e6e..b8786f259a9c 100644 --- a/flink/v1.15/flink/src/main/java/org/apache/iceberg/flink/sink/BaseDeltaTaskWriter.java +++ b/flink/v1.15/flink/src/main/java/org/apache/iceberg/flink/sink/BaseDeltaTaskWriter.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.flink.sink; import java.io.IOException; @@ -47,21 +46,23 @@ abstract class BaseDeltaTaskWriter extends BaseTaskWriter { private final RowDataProjection keyProjection; private final boolean upsert; - BaseDeltaTaskWriter(PartitionSpec spec, - FileFormat format, - FileAppenderFactory appenderFactory, - OutputFileFactory fileFactory, - FileIO io, - long targetFileSize, - Schema schema, - RowType flinkSchema, - List equalityFieldIds, - boolean upsert) { + BaseDeltaTaskWriter( + PartitionSpec spec, + FileFormat format, + FileAppenderFactory appenderFactory, + OutputFileFactory fileFactory, + FileIO io, + long targetFileSize, + Schema schema, + RowType flinkSchema, + List equalityFieldIds, + boolean upsert) { super(spec, format, appenderFactory, fileFactory, io, targetFileSize); this.schema = schema; this.deleteSchema = TypeUtil.select(schema, Sets.newHashSet(equalityFieldIds)); this.wrapper = new RowDataWrapper(flinkSchema, schema.asStruct()); - this.keyWrapper = new RowDataWrapper(FlinkSchemaUtil.convert(deleteSchema), deleteSchema.asStruct()); + this.keyWrapper = + new RowDataWrapper(FlinkSchemaUtil.convert(deleteSchema), deleteSchema.asStruct()); this.keyProjection = RowDataProjection.create(schema, deleteSchema); this.upsert = upsert; } @@ -87,7 +88,8 @@ public void write(RowData row) throws IOException { case UPDATE_BEFORE: if (upsert) { - break; // UPDATE_BEFORE is not necessary for UPSERT, we do nothing to prevent delete one row twice + break; // UPDATE_BEFORE is not necessary for UPSERT, we do nothing to prevent delete one + // row twice } writer.delete(row); break; diff --git a/flink/v1.15/flink/src/main/java/org/apache/iceberg/flink/sink/DeltaManifests.java b/flink/v1.15/flink/src/main/java/org/apache/iceberg/flink/sink/DeltaManifests.java index 866b785d7e1e..036970c06d5b 100644 --- a/flink/v1.15/flink/src/main/java/org/apache/iceberg/flink/sink/DeltaManifests.java +++ b/flink/v1.15/flink/src/main/java/org/apache/iceberg/flink/sink/DeltaManifests.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.flink.sink; import java.util.List; @@ -36,7 +35,8 @@ class DeltaManifests { this(dataManifest, deleteManifest, EMPTY_REF_DATA_FILES); } - DeltaManifests(ManifestFile dataManifest, ManifestFile deleteManifest, CharSequence[] referencedDataFiles) { + DeltaManifests( + ManifestFile dataManifest, ManifestFile deleteManifest, CharSequence[] referencedDataFiles) { Preconditions.checkNotNull(referencedDataFiles, "Referenced data files shouldn't be null."); this.dataManifest = dataManifest; diff --git a/flink/v1.15/flink/src/main/java/org/apache/iceberg/flink/sink/DeltaManifestsSerializer.java b/flink/v1.15/flink/src/main/java/org/apache/iceberg/flink/sink/DeltaManifestsSerializer.java index 859f97940116..c4d6e713bb73 100644 --- a/flink/v1.15/flink/src/main/java/org/apache/iceberg/flink/sink/DeltaManifestsSerializer.java +++ b/flink/v1.15/flink/src/main/java/org/apache/iceberg/flink/sink/DeltaManifestsSerializer.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.flink.sink; import java.io.ByteArrayInputStream; @@ -43,7 +42,8 @@ public int getVersion() { @Override public byte[] serialize(DeltaManifests deltaManifests) throws IOException { - Preconditions.checkNotNull(deltaManifests, "DeltaManifests to be serialized should not be null"); + Preconditions.checkNotNull( + deltaManifests, "DeltaManifests to be serialized should not be null"); ByteArrayOutputStream binaryOut = new ByteArrayOutputStream(); DataOutputStream out = new DataOutputStream(binaryOut); diff --git a/flink/v1.15/flink/src/main/java/org/apache/iceberg/flink/sink/EqualityFieldKeySelector.java b/flink/v1.15/flink/src/main/java/org/apache/iceberg/flink/sink/EqualityFieldKeySelector.java index 56689567a1d2..18b269d6c3e9 100644 --- a/flink/v1.15/flink/src/main/java/org/apache/iceberg/flink/sink/EqualityFieldKeySelector.java +++ b/flink/v1.15/flink/src/main/java/org/apache/iceberg/flink/sink/EqualityFieldKeySelector.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.flink.sink; import java.util.List; @@ -31,8 +30,8 @@ import org.apache.iceberg.util.StructProjection; /** - * Create a {@link KeySelector} to shuffle by equality fields, to ensure same equality fields record will be emitted to - * same writer in order. + * Create a {@link KeySelector} to shuffle by equality fields, to ensure same equality fields record + * will be emitted to same writer in order. */ class EqualityFieldKeySelector implements KeySelector { @@ -51,8 +50,8 @@ class EqualityFieldKeySelector implements KeySelector { } /** - * Construct the {@link RowDataWrapper} lazily here because few members in it are not serializable. In this way, we - * don't have to serialize them with forcing. + * Construct the {@link RowDataWrapper} lazily here because few members in it are not + * serializable. In this way, we don't have to serialize them with forcing. */ protected RowDataWrapper lazyRowDataWrapper() { if (rowDataWrapper == null) { @@ -61,9 +60,7 @@ protected RowDataWrapper lazyRowDataWrapper() { return rowDataWrapper; } - /** - * Construct the {@link StructProjection} lazily because it is not serializable. - */ + /** Construct the {@link StructProjection} lazily because it is not serializable. 
*/ protected StructProjection lazyStructProjection() { if (structProjection == null) { structProjection = StructProjection.create(schema, deleteSchema); @@ -71,9 +68,7 @@ protected StructProjection lazyStructProjection() { return structProjection; } - /** - * Construct the {@link StructLikeWrapper} lazily because it is not serializable. - */ + /** Construct the {@link StructLikeWrapper} lazily because it is not serializable. */ protected StructLikeWrapper lazyStructLikeWrapper() { if (structLikeWrapper == null) { structLikeWrapper = StructLikeWrapper.forType(deleteSchema.asStruct()); diff --git a/flink/v1.15/flink/src/main/java/org/apache/iceberg/flink/sink/FlinkAppenderFactory.java b/flink/v1.15/flink/src/main/java/org/apache/iceberg/flink/sink/FlinkAppenderFactory.java index ade5c28837ec..b5d08b46be58 100644 --- a/flink/v1.15/flink/src/main/java/org/apache/iceberg/flink/sink/FlinkAppenderFactory.java +++ b/flink/v1.15/flink/src/main/java/org/apache/iceberg/flink/sink/FlinkAppenderFactory.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.flink.sink; import java.io.IOException; @@ -60,13 +59,19 @@ public class FlinkAppenderFactory implements FileAppenderFactory, Seria private RowType eqDeleteFlinkSchema = null; private RowType posDeleteFlinkSchema = null; - public FlinkAppenderFactory(Schema schema, RowType flinkSchema, Map props, PartitionSpec spec) { + public FlinkAppenderFactory( + Schema schema, RowType flinkSchema, Map props, PartitionSpec spec) { this(schema, flinkSchema, props, spec, null, null, null); } - public FlinkAppenderFactory(Schema schema, RowType flinkSchema, Map props, - PartitionSpec spec, int[] equalityFieldIds, - Schema eqDeleteRowSchema, Schema posDeleteRowSchema) { + public FlinkAppenderFactory( + Schema schema, + RowType flinkSchema, + Map props, + PartitionSpec spec, + int[] equalityFieldIds, + Schema eqDeleteRowSchema, + Schema posDeleteRowSchema) { this.schema = schema; this.flinkSchema = flinkSchema; this.props = props; @@ -108,7 +113,8 @@ public FileAppender newAppender(OutputFile outputFile, FileFormat forma case ORC: return ORC.write(outputFile) - .createWriterFunc((iSchema, typDesc) -> FlinkOrcWriter.buildWriter(flinkSchema, iSchema)) + .createWriterFunc( + (iSchema, typDesc) -> FlinkOrcWriter.buildWriter(flinkSchema, iSchema)) .setAll(props) .metricsConfig(metricsConfig) .schema(schema) @@ -133,18 +139,25 @@ public FileAppender newAppender(OutputFile outputFile, FileFormat forma } @Override - public DataWriter newDataWriter(EncryptedOutputFile file, FileFormat format, StructLike partition) { + public DataWriter newDataWriter( + EncryptedOutputFile file, FileFormat format, StructLike partition) { return new DataWriter<>( - newAppender(file.encryptingOutputFile(), format), format, - file.encryptingOutputFile().location(), spec, partition, file.keyMetadata()); + newAppender(file.encryptingOutputFile(), format), + format, + file.encryptingOutputFile().location(), + spec, + partition, + file.keyMetadata()); } @Override - public EqualityDeleteWriter newEqDeleteWriter(EncryptedOutputFile outputFile, FileFormat format, - StructLike partition) { - Preconditions.checkState(equalityFieldIds != null && equalityFieldIds.length > 0, + public EqualityDeleteWriter newEqDeleteWriter( + EncryptedOutputFile outputFile, FileFormat format, StructLike partition) { + Preconditions.checkState( + equalityFieldIds != null && equalityFieldIds.length > 0, "Equality field ids shouldn't be null or empty 
when creating equality-delete writer"); - Preconditions.checkNotNull(eqDeleteRowSchema, + Preconditions.checkNotNull( + eqDeleteRowSchema, "Equality delete row schema shouldn't be null when creating equality-delete writer"); MetricsConfig metricsConfig = MetricsConfig.fromProperties(props); @@ -164,7 +177,8 @@ public EqualityDeleteWriter newEqDeleteWriter(EncryptedOutputFile outpu case ORC: return ORC.writeDeletes(outputFile.encryptingOutputFile()) - .createWriterFunc((iSchema, typDesc) -> FlinkOrcWriter.buildWriter(flinkSchema, iSchema)) + .createWriterFunc( + (iSchema, typDesc) -> FlinkOrcWriter.buildWriter(flinkSchema, iSchema)) .withPartition(partition) .overwrite() .setAll(props) @@ -176,7 +190,8 @@ public EqualityDeleteWriter newEqDeleteWriter(EncryptedOutputFile outpu case PARQUET: return Parquet.writeDeletes(outputFile.encryptingOutputFile()) - .createWriterFunc(msgType -> FlinkParquetWriters.buildWriter(lazyEqDeleteFlinkSchema(), msgType)) + .createWriterFunc( + msgType -> FlinkParquetWriters.buildWriter(lazyEqDeleteFlinkSchema(), msgType)) .withPartition(partition) .overwrite() .setAll(props) @@ -197,8 +212,8 @@ public EqualityDeleteWriter newEqDeleteWriter(EncryptedOutputFile outpu } @Override - public PositionDeleteWriter newPosDeleteWriter(EncryptedOutputFile outputFile, FileFormat format, - StructLike partition) { + public PositionDeleteWriter newPosDeleteWriter( + EncryptedOutputFile outputFile, FileFormat format, StructLike partition) { MetricsConfig metricsConfig = MetricsConfig.fromProperties(props); try { switch (format) { @@ -214,9 +229,11 @@ public PositionDeleteWriter newPosDeleteWriter(EncryptedOutputFile outp .buildPositionWriter(); case ORC: - RowType orcPosDeleteSchema = FlinkSchemaUtil.convert(DeleteSchemaUtil.posDeleteSchema(posDeleteRowSchema)); + RowType orcPosDeleteSchema = + FlinkSchemaUtil.convert(DeleteSchemaUtil.posDeleteSchema(posDeleteRowSchema)); return ORC.writeDeletes(outputFile.encryptingOutputFile()) - .createWriterFunc((iSchema, typDesc) -> FlinkOrcWriter.buildWriter(orcPosDeleteSchema, iSchema)) + .createWriterFunc( + (iSchema, typDesc) -> FlinkOrcWriter.buildWriter(orcPosDeleteSchema, iSchema)) .withPartition(partition) .overwrite() .setAll(props) @@ -228,9 +245,11 @@ public PositionDeleteWriter newPosDeleteWriter(EncryptedOutputFile outp .buildPositionWriter(); case PARQUET: - RowType flinkPosDeleteSchema = FlinkSchemaUtil.convert(DeleteSchemaUtil.posDeleteSchema(posDeleteRowSchema)); + RowType flinkPosDeleteSchema = + FlinkSchemaUtil.convert(DeleteSchemaUtil.posDeleteSchema(posDeleteRowSchema)); return Parquet.writeDeletes(outputFile.encryptingOutputFile()) - .createWriterFunc(msgType -> FlinkParquetWriters.buildWriter(flinkPosDeleteSchema, msgType)) + .createWriterFunc( + msgType -> FlinkParquetWriters.buildWriter(flinkPosDeleteSchema, msgType)) .withPartition(partition) .overwrite() .setAll(props) @@ -242,7 +261,8 @@ public PositionDeleteWriter newPosDeleteWriter(EncryptedOutputFile outp .buildPositionWriter(); default: - throw new UnsupportedOperationException("Cannot write pos-deletes for unsupported file format: " + format); + throw new UnsupportedOperationException( + "Cannot write pos-deletes for unsupported file format: " + format); } } catch (IOException e) { throw new UncheckedIOException(e); diff --git a/flink/v1.15/flink/src/main/java/org/apache/iceberg/flink/sink/FlinkFileWriterFactory.java b/flink/v1.15/flink/src/main/java/org/apache/iceberg/flink/sink/FlinkFileWriterFactory.java index 55a9539c78d1..5872fed36d65 100644 --- 
a/flink/v1.15/flink/src/main/java/org/apache/iceberg/flink/sink/FlinkFileWriterFactory.java +++ b/flink/v1.15/flink/src/main/java/org/apache/iceberg/flink/sink/FlinkFileWriterFactory.java @@ -16,9 +16,13 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.flink.sink; +import static org.apache.iceberg.MetadataColumns.DELETE_FILE_ROW_FIELD_NAME; +import static org.apache.iceberg.TableProperties.DEFAULT_FILE_FORMAT; +import static org.apache.iceberg.TableProperties.DEFAULT_FILE_FORMAT_DEFAULT; +import static org.apache.iceberg.TableProperties.DELETE_DEFAULT_FILE_FORMAT; + import java.io.Serializable; import java.util.Locale; import java.util.Map; @@ -40,24 +44,35 @@ import org.apache.iceberg.parquet.Parquet; import org.apache.iceberg.relocated.com.google.common.base.Preconditions; -import static org.apache.iceberg.MetadataColumns.DELETE_FILE_ROW_FIELD_NAME; -import static org.apache.iceberg.TableProperties.DEFAULT_FILE_FORMAT; -import static org.apache.iceberg.TableProperties.DEFAULT_FILE_FORMAT_DEFAULT; -import static org.apache.iceberg.TableProperties.DELETE_DEFAULT_FILE_FORMAT; - class FlinkFileWriterFactory extends BaseFileWriterFactory implements Serializable { private RowType dataFlinkType; private RowType equalityDeleteFlinkType; private RowType positionDeleteFlinkType; - FlinkFileWriterFactory(Table table, FileFormat dataFileFormat, Schema dataSchema, RowType dataFlinkType, - SortOrder dataSortOrder, FileFormat deleteFileFormat, - int[] equalityFieldIds, Schema equalityDeleteRowSchema, RowType equalityDeleteFlinkType, - SortOrder equalityDeleteSortOrder, Schema positionDeleteRowSchema, - RowType positionDeleteFlinkType) { - - super(table, dataFileFormat, dataSchema, dataSortOrder, deleteFileFormat, equalityFieldIds, - equalityDeleteRowSchema, equalityDeleteSortOrder, positionDeleteRowSchema); + FlinkFileWriterFactory( + Table table, + FileFormat dataFileFormat, + Schema dataSchema, + RowType dataFlinkType, + SortOrder dataSortOrder, + FileFormat deleteFileFormat, + int[] equalityFieldIds, + Schema equalityDeleteRowSchema, + RowType equalityDeleteFlinkType, + SortOrder equalityDeleteSortOrder, + Schema positionDeleteRowSchema, + RowType positionDeleteFlinkType) { + + super( + table, + dataFileFormat, + dataSchema, + dataSortOrder, + deleteFileFormat, + equalityFieldIds, + equalityDeleteRowSchema, + equalityDeleteSortOrder, + positionDeleteRowSchema); this.dataFlinkType = dataFlinkType; this.equalityDeleteFlinkType = equalityDeleteFlinkType; @@ -83,7 +98,8 @@ protected void configurePositionDelete(Avro.DeleteWriteBuilder builder) { int rowFieldIndex = positionDeleteFlinkType().getFieldIndex(DELETE_FILE_ROW_FIELD_NAME); if (rowFieldIndex >= 0) { // FlinkAvroWriter accepts just the Flink type of the row ignoring the path and pos - RowType positionDeleteRowFlinkType = (RowType) positionDeleteFlinkType().getTypeAt(rowFieldIndex); + RowType positionDeleteRowFlinkType = + (RowType) positionDeleteFlinkType().getTypeAt(rowFieldIndex); builder.createWriterFunc(ignored -> new FlinkAvroWriter(positionDeleteRowFlinkType)); } } @@ -95,28 +111,33 @@ protected void configureDataWrite(Parquet.DataWriteBuilder builder) { @Override protected void configureEqualityDelete(Parquet.DeleteWriteBuilder builder) { - builder.createWriterFunc(msgType -> FlinkParquetWriters.buildWriter(equalityDeleteFlinkType(), msgType)); + builder.createWriterFunc( + msgType -> FlinkParquetWriters.buildWriter(equalityDeleteFlinkType(), msgType)); } @Override protected 
void configurePositionDelete(Parquet.DeleteWriteBuilder builder) { - builder.createWriterFunc(msgType -> FlinkParquetWriters.buildWriter(positionDeleteFlinkType(), msgType)); + builder.createWriterFunc( + msgType -> FlinkParquetWriters.buildWriter(positionDeleteFlinkType(), msgType)); builder.transformPaths(path -> StringData.fromString(path.toString())); } @Override protected void configureDataWrite(ORC.DataWriteBuilder builder) { - builder.createWriterFunc((iSchema, typDesc) -> FlinkOrcWriter.buildWriter(dataFlinkType(), iSchema)); + builder.createWriterFunc( + (iSchema, typDesc) -> FlinkOrcWriter.buildWriter(dataFlinkType(), iSchema)); } @Override protected void configureEqualityDelete(ORC.DeleteWriteBuilder builder) { - builder.createWriterFunc((iSchema, typDesc) -> FlinkOrcWriter.buildWriter(equalityDeleteFlinkType(), iSchema)); + builder.createWriterFunc( + (iSchema, typDesc) -> FlinkOrcWriter.buildWriter(equalityDeleteFlinkType(), iSchema)); } @Override protected void configurePositionDelete(ORC.DeleteWriteBuilder builder) { - builder.createWriterFunc((iSchema, typDesc) -> FlinkOrcWriter.buildWriter(positionDeleteFlinkType(), iSchema)); + builder.createWriterFunc( + (iSchema, typDesc) -> FlinkOrcWriter.buildWriter(positionDeleteFlinkType(), iSchema)); builder.transformPaths(path -> StringData.fromString(path.toString())); } @@ -131,7 +152,8 @@ private RowType dataFlinkType() { private RowType equalityDeleteFlinkType() { if (equalityDeleteFlinkType == null) { - Preconditions.checkNotNull(equalityDeleteRowSchema(), "Equality delete schema must not be null"); + Preconditions.checkNotNull( + equalityDeleteRowSchema(), "Equality delete schema must not be null"); this.equalityDeleteFlinkType = FlinkSchemaUtil.convert(equalityDeleteRowSchema()); } @@ -140,7 +162,8 @@ private RowType equalityDeleteFlinkType() { private RowType positionDeleteFlinkType() { if (positionDeleteFlinkType == null) { - // wrap the optional row schema into the position delete schema that contains path and position + // wrap the optional row schema into the position delete schema that contains path and + // position Schema positionDeleteSchema = DeleteSchemaUtil.posDeleteSchema(positionDeleteRowSchema()); this.positionDeleteFlinkType = FlinkSchemaUtil.convert(positionDeleteSchema); } @@ -167,10 +190,12 @@ static class Builder { Map properties = table.properties(); - String dataFileFormatName = properties.getOrDefault(DEFAULT_FILE_FORMAT, DEFAULT_FILE_FORMAT_DEFAULT); + String dataFileFormatName = + properties.getOrDefault(DEFAULT_FILE_FORMAT, DEFAULT_FILE_FORMAT_DEFAULT); this.dataFileFormat = FileFormat.valueOf(dataFileFormatName.toUpperCase(Locale.ENGLISH)); - String deleteFileFormatName = properties.getOrDefault(DELETE_DEFAULT_FILE_FORMAT, dataFileFormatName); + String deleteFileFormatName = + properties.getOrDefault(DELETE_DEFAULT_FILE_FORMAT, dataFileFormatName); this.deleteFileFormat = FileFormat.valueOf(deleteFileFormatName.toUpperCase(Locale.ENGLISH)); } @@ -186,8 +211,8 @@ Builder dataSchema(Schema newDataSchema) { /** * Sets a Flink type for data. - *
    <p>
    - * If not set, the value is derived from the provided Iceberg schema. + * + *
    <p>
    If not set, the value is derived from the provided Iceberg schema. */ Builder dataFlinkType(RowType newDataFlinkType) { this.dataFlinkType = newDataFlinkType; @@ -216,8 +241,8 @@ Builder equalityDeleteRowSchema(Schema newEqualityDeleteRowSchema) { /** * Sets a Flink type for equality deletes. - *
    <p>
    - * If not set, the value is derived from the provided Iceberg schema. + * + *
    <p>
    If not set, the value is derived from the provided Iceberg schema. */ Builder equalityDeleteFlinkType(RowType newEqualityDeleteFlinkType) { this.equalityDeleteFlinkType = newEqualityDeleteFlinkType; @@ -236,8 +261,8 @@ Builder positionDeleteRowSchema(Schema newPositionDeleteRowSchema) { /** * Sets a Flink type for position deletes. - *
    <p>
    - * If not set, the value is derived from the provided Iceberg schema. + * + *
    <p>
    If not set, the value is derived from the provided Iceberg schema. */ Builder positionDeleteFlinkType(RowType newPositionDeleteFlinkType) { this.positionDeleteFlinkType = newPositionDeleteFlinkType; @@ -247,13 +272,23 @@ Builder positionDeleteFlinkType(RowType newPositionDeleteFlinkType) { FlinkFileWriterFactory build() { boolean noEqualityDeleteConf = equalityFieldIds == null && equalityDeleteRowSchema == null; boolean fullEqualityDeleteConf = equalityFieldIds != null && equalityDeleteRowSchema != null; - Preconditions.checkArgument(noEqualityDeleteConf || fullEqualityDeleteConf, + Preconditions.checkArgument( + noEqualityDeleteConf || fullEqualityDeleteConf, "Equality field IDs and equality delete row schema must be set together"); return new FlinkFileWriterFactory( - table, dataFileFormat, dataSchema, dataFlinkType, dataSortOrder, deleteFileFormat, - equalityFieldIds, equalityDeleteRowSchema, equalityDeleteFlinkType, equalityDeleteSortOrder, - positionDeleteRowSchema, positionDeleteFlinkType); + table, + dataFileFormat, + dataSchema, + dataFlinkType, + dataSortOrder, + deleteFileFormat, + equalityFieldIds, + equalityDeleteRowSchema, + equalityDeleteFlinkType, + equalityDeleteSortOrder, + positionDeleteRowSchema, + positionDeleteFlinkType); } } } diff --git a/flink/v1.15/flink/src/main/java/org/apache/iceberg/flink/sink/FlinkManifestUtil.java b/flink/v1.15/flink/src/main/java/org/apache/iceberg/flink/sink/FlinkManifestUtil.java index d20859377ffc..25badc372abf 100644 --- a/flink/v1.15/flink/src/main/java/org/apache/iceberg/flink/sink/FlinkManifestUtil.java +++ b/flink/v1.15/flink/src/main/java/org/apache/iceberg/flink/sink/FlinkManifestUtil.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.flink.sink; import java.io.IOException; @@ -41,12 +40,12 @@ class FlinkManifestUtil { private static final int FORMAT_V2 = 2; private static final Long DUMMY_SNAPSHOT_ID = 0L; - private FlinkManifestUtil() { - } + private FlinkManifestUtil() {} - static ManifestFile writeDataFiles(OutputFile outputFile, PartitionSpec spec, List dataFiles) - throws IOException { - ManifestWriter writer = ManifestFiles.write(FORMAT_V2, spec, outputFile, DUMMY_SNAPSHOT_ID); + static ManifestFile writeDataFiles( + OutputFile outputFile, PartitionSpec spec, List dataFiles) throws IOException { + ManifestWriter writer = + ManifestFiles.write(FORMAT_V2, spec, outputFile, DUMMY_SNAPSHOT_ID); try (ManifestWriter closeableWriter = writer) { closeableWriter.addAll(dataFiles); @@ -61,31 +60,38 @@ static List readDataFiles(ManifestFile manifestFile, FileIO io) throws } } - static ManifestOutputFileFactory createOutputFileFactory(Table table, String flinkJobId, String operatorUniqueId, - int subTaskId, long attemptNumber) { + static ManifestOutputFileFactory createOutputFileFactory( + Table table, String flinkJobId, String operatorUniqueId, int subTaskId, long attemptNumber) { TableOperations ops = ((HasTableOperations) table).operations(); - return new ManifestOutputFileFactory(ops, table.io(), table.properties(), flinkJobId, operatorUniqueId, - subTaskId, attemptNumber); + return new ManifestOutputFileFactory( + ops, + table.io(), + table.properties(), + flinkJobId, + operatorUniqueId, + subTaskId, + attemptNumber); } - static DeltaManifests writeCompletedFiles(WriteResult result, - Supplier outputFileSupplier, - PartitionSpec spec) throws IOException { + static DeltaManifests writeCompletedFiles( + WriteResult result, Supplier 
outputFileSupplier, PartitionSpec spec) + throws IOException { ManifestFile dataManifest = null; ManifestFile deleteManifest = null; // Write the completed data files into a newly created data manifest file. if (result.dataFiles() != null && result.dataFiles().length > 0) { - dataManifest = writeDataFiles(outputFileSupplier.get(), spec, Lists.newArrayList(result.dataFiles())); + dataManifest = + writeDataFiles(outputFileSupplier.get(), spec, Lists.newArrayList(result.dataFiles())); } // Write the completed delete files into a newly created delete manifest file. if (result.deleteFiles() != null && result.deleteFiles().length > 0) { OutputFile deleteManifestFile = outputFileSupplier.get(); - ManifestWriter deleteManifestWriter = ManifestFiles.writeDeleteManifest(FORMAT_V2, spec, - deleteManifestFile, DUMMY_SNAPSHOT_ID); + ManifestWriter deleteManifestWriter = + ManifestFiles.writeDeleteManifest(FORMAT_V2, spec, deleteManifestFile, DUMMY_SNAPSHOT_ID); try (ManifestWriter writer = deleteManifestWriter) { for (DeleteFile deleteFile : result.deleteFiles()) { writer.add(deleteFile); @@ -98,7 +104,8 @@ static DeltaManifests writeCompletedFiles(WriteResult result, return new DeltaManifests(dataManifest, deleteManifest, result.referencedDataFiles()); } - static WriteResult readCompletedFiles(DeltaManifests deltaManifests, FileIO io) throws IOException { + static WriteResult readCompletedFiles(DeltaManifests deltaManifests, FileIO io) + throws IOException { WriteResult.Builder builder = WriteResult.builder(); // Read the completed data files from persisted data manifest file. @@ -108,13 +115,12 @@ static WriteResult readCompletedFiles(DeltaManifests deltaManifests, FileIO io) // Read the completed delete files from persisted delete manifests file. if (deltaManifests.deleteManifest() != null) { - try (CloseableIterable deleteFiles = ManifestFiles - .readDeleteManifest(deltaManifests.deleteManifest(), io, null)) { + try (CloseableIterable deleteFiles = + ManifestFiles.readDeleteManifest(deltaManifests.deleteManifest(), io, null)) { builder.addDeleteFiles(deleteFiles); } } - return builder.addReferencedDataFiles(deltaManifests.referencedDataFiles()) - .build(); + return builder.addReferencedDataFiles(deltaManifests.referencedDataFiles()).build(); } } diff --git a/flink/v1.15/flink/src/main/java/org/apache/iceberg/flink/sink/FlinkSink.java b/flink/v1.15/flink/src/main/java/org/apache/iceberg/flink/sink/FlinkSink.java index 7b597e427bad..172484fc113a 100644 --- a/flink/v1.15/flink/src/main/java/org/apache/iceberg/flink/sink/FlinkSink.java +++ b/flink/v1.15/flink/src/main/java/org/apache/iceberg/flink/sink/FlinkSink.java @@ -16,9 +16,10 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.flink.sink; +import static org.apache.iceberg.TableProperties.WRITE_DISTRIBUTION_MODE; + import java.io.IOException; import java.io.UncheckedIOException; import java.util.List; @@ -62,40 +63,39 @@ import org.slf4j.Logger; import org.slf4j.LoggerFactory; -import static org.apache.iceberg.TableProperties.WRITE_DISTRIBUTION_MODE; - public class FlinkSink { private static final Logger LOG = LoggerFactory.getLogger(FlinkSink.class); - private static final String ICEBERG_STREAM_WRITER_NAME = IcebergStreamWriter.class.getSimpleName(); - private static final String ICEBERG_FILES_COMMITTER_NAME = IcebergFilesCommitter.class.getSimpleName(); + private static final String ICEBERG_STREAM_WRITER_NAME = + IcebergStreamWriter.class.getSimpleName(); + private static final String ICEBERG_FILES_COMMITTER_NAME = + IcebergFilesCommitter.class.getSimpleName(); - private FlinkSink() { - } + private FlinkSink() {} /** - * Initialize a {@link Builder} to export the data from generic input data stream into iceberg table. We use - * {@link RowData} inside the sink connector, so users need to provide a mapper function and a - * {@link TypeInformation} to convert those generic records to a RowData DataStream. + * Initialize a {@link Builder} to export the data from generic input data stream into iceberg + * table. We use {@link RowData} inside the sink connector, so users need to provide a mapper + * function and a {@link TypeInformation} to convert those generic records to a RowData + * DataStream. * - * @param input the generic source input data stream. - * @param mapper function to convert the generic data to {@link RowData} + * @param input the generic source input data stream. + * @param mapper function to convert the generic data to {@link RowData} * @param outputType to define the {@link TypeInformation} for the input data. - * @param the data type of records. + * @param the data type of records. * @return {@link Builder} to connect the iceberg table. */ - public static Builder builderFor(DataStream input, - MapFunction mapper, - TypeInformation outputType) { + public static Builder builderFor( + DataStream input, MapFunction mapper, TypeInformation outputType) { return new Builder().forMapperOutputType(input, mapper, outputType); } /** - * Initialize a {@link Builder} to export the data from input data stream with {@link Row}s into iceberg table. We use - * {@link RowData} inside the sink connector, so users need to provide a {@link TableSchema} for builder to convert - * those {@link Row}s to a {@link RowData} DataStream. + * Initialize a {@link Builder} to export the data from input data stream with {@link Row}s into + * iceberg table. We use {@link RowData} inside the sink connector, so users need to provide a + * {@link TableSchema} for builder to convert those {@link Row}s to a {@link RowData} DataStream. * - * @param input the source input data stream with {@link Row}s. + * @param input the source input data stream with {@link Row}s. * @param tableSchema defines the {@link TypeInformation} for input data. * @return {@link Builder} to connect the iceberg table. 
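To put the builder documented in this file in context, a minimal usage sketch follows; the input stream, table location, and column name are placeholder assumptions, and the terminal append() call is taken from the public FlinkSink builder API rather than shown in this hunk.

import java.util.Arrays;
import org.apache.flink.streaming.api.datastream.DataStream;
import org.apache.flink.table.data.RowData;
import org.apache.iceberg.flink.TableLoader;
import org.apache.iceberg.flink.sink.FlinkSink;

class FlinkSinkUsageSketch {
  static void attach(DataStream<RowData> rowDataStream) {
    // Placeholder location; in practice this points at a real Hadoop or Hive catalog table.
    TableLoader tableLoader = TableLoader.fromHadoopTable("hdfs://namenode:8020/warehouse/db/tbl");
    FlinkSink.forRowData(rowDataStream)
        .tableLoader(tableLoader) // the committer loads the table lazily from this loader
        .equalityFieldColumns(Arrays.asList("id")) // key delta writes on the 'id' column
        .upsert(true) // rewrite INSERT/UPDATE_AFTER events as UPSERTs
        .uidPrefix("iceberg-sink") // stable operator uids for checkpointed upgrades
        .append(); // assumed terminal call that wires writer, committer and dummy sink
  }
}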
*/ @@ -103,13 +103,15 @@ public static Builder forRow(DataStream input, TableSchema tableSchema) { RowType rowType = (RowType) tableSchema.toRowDataType().getLogicalType(); DataType[] fieldDataTypes = tableSchema.getFieldDataTypes(); - DataFormatConverters.RowConverter rowConverter = new DataFormatConverters.RowConverter(fieldDataTypes); + DataFormatConverters.RowConverter rowConverter = + new DataFormatConverters.RowConverter(fieldDataTypes); return builderFor(input, rowConverter::toInternal, FlinkCompatibilityUtil.toTypeInfo(rowType)) .tableSchema(tableSchema); } /** - * Initialize a {@link Builder} to export the data from input data stream with {@link RowData}s into iceberg table. + * Initialize a {@link Builder} to export the data from input data stream with {@link RowData}s + * into iceberg table. * * @param input the source input data stream with {@link RowData}s. * @return {@link Builder} to connect the iceberg table. @@ -131,34 +133,35 @@ public static class Builder { private final Map writeOptions = Maps.newHashMap(); private FlinkWriteConf flinkWriteConf = null; - private Builder() { - } + private Builder() {} private Builder forRowData(DataStream newRowDataInput) { this.inputCreator = ignored -> newRowDataInput; return this; } - private Builder forMapperOutputType(DataStream input, - MapFunction mapper, - TypeInformation outputType) { - this.inputCreator = newUidPrefix -> { - // Input stream order is crucial for some situation(e.g. in cdc case). Therefore, we need to set the parallelism - // of map operator same as its input to keep map operator chaining its input, and avoid rebalanced by default. - SingleOutputStreamOperator inputStream = input.map(mapper, outputType) - .setParallelism(input.getParallelism()); - if (newUidPrefix != null) { - inputStream.name(operatorName(newUidPrefix)).uid(newUidPrefix + "-mapper"); - } - return inputStream; - }; + private Builder forMapperOutputType( + DataStream input, MapFunction mapper, TypeInformation outputType) { + this.inputCreator = + newUidPrefix -> { + // Input stream order is crucial for some situation(e.g. in cdc case). Therefore, we + // need to set the parallelism + // of map operator same as its input to keep map operator chaining its input, and avoid + // rebalanced by default. + SingleOutputStreamOperator inputStream = + input.map(mapper, outputType).setParallelism(input.getParallelism()); + if (newUidPrefix != null) { + inputStream.name(operatorName(newUidPrefix)).uid(newUidPrefix + "-mapper"); + } + return inputStream; + }; return this; } /** - * This iceberg {@link Table} instance is used for initializing {@link IcebergStreamWriter} which will write all - * the records into {@link DataFile}s and emit them to downstream operator. Providing a table would avoid so many - * table loading from each separate task. + * This iceberg {@link Table} instance is used for initializing {@link IcebergStreamWriter} + * which will write all the records into {@link DataFile}s and emit them to downstream operator. + * Providing a table would avoid so many table loading from each separate task. * * @param newTable the loaded iceberg table instance. * @return {@link Builder} to connect the iceberg table. @@ -169,9 +172,9 @@ public Builder table(Table newTable) { } /** - * The table loader is used for loading tables in {@link IcebergFilesCommitter} lazily, we need this loader because - * {@link Table} is not serializable and could not just use the loaded table from Builder#table in the remote task - * manager. 
+ * The table loader is used for loading tables in {@link IcebergFilesCommitter} lazily, we need + * this loader because {@link Table} is not serializable and could not just use the loaded table + * from Builder#table in the remote task manager. * * @param newTableLoader to load iceberg table inside tasks. * @return {@link Builder} to connect the iceberg table. @@ -182,8 +185,8 @@ public Builder tableLoader(TableLoader newTableLoader) { } /** - * Set the write properties for Flink sink. - * View the supported properties in {@link FlinkWriteOptions} + * Set the write properties for Flink sink. View the supported properties in {@link + * FlinkWriteOptions} */ public Builder set(String property, String value) { writeOptions.put(property, value); @@ -191,8 +194,8 @@ public Builder set(String property, String value) { } /** - * Set the write properties for Flink sink. - * View the supported properties in {@link FlinkWriteOptions} + * Set the write properties for Flink sink. View the supported properties in {@link + * FlinkWriteOptions} */ public Builder setAll(Map properties) { writeOptions.putAll(properties); @@ -215,14 +218,15 @@ public Builder flinkConf(ReadableConfig config) { } /** - * Configure the write {@link DistributionMode} that the flink sink will use. Currently, flink support - * {@link DistributionMode#NONE} and {@link DistributionMode#HASH}. + * Configure the write {@link DistributionMode} that the flink sink will use. Currently, flink + * support {@link DistributionMode#NONE} and {@link DistributionMode#HASH}. * * @param mode to specify the write distribution mode. * @return {@link Builder} to connect the iceberg table. */ public Builder distributionMode(DistributionMode mode) { - Preconditions.checkArgument(!DistributionMode.RANGE.equals(mode), + Preconditions.checkArgument( + !DistributionMode.RANGE.equals(mode), "Flink does not support 'range' write distribution mode now."); if (mode != null) { writeOptions.put(FlinkWriteOptions.DISTRIBUTION_MODE.key(), mode.modeName()); @@ -242,10 +246,10 @@ public Builder writeParallelism(int newWriteParallelism) { } /** - * All INSERT/UPDATE_AFTER events from input stream will be transformed to UPSERT events, which means it will - * DELETE the old records and then INSERT the new records. In partitioned table, the partition fields should be - * a subset of equality fields, otherwise the old row that located in partition-A could not be deleted by the - * new row that located in partition-B. + * All INSERT/UPDATE_AFTER events from input stream will be transformed to UPSERT events, which + * means it will DELETE the old records and then INSERT the new records. In partitioned table, + * the partition fields should be a subset of equality fields, otherwise the old row that + * located in partition-A could not be deleted by the new row that located in partition-B. * * @param enabled indicate whether it should transform all INSERT/UPDATE_AFTER events to UPSERT. * @return {@link Builder} to connect the iceberg table. @@ -267,22 +271,25 @@ public Builder equalityFieldColumns(List columns) { } /** - * Set the uid prefix for FlinkSink operators. Note that FlinkSink internally consists of multiple operators (like - * writer, committer, dummy sink etc.) Actually operator uid will be appended with a suffix like "uidPrefix-writer". - *
    <p>
    - * If provided, this prefix is also applied to operator names. - *
    <p>
    - * Flink auto generates operator uid if not set explicitly. It is a recommended - * - * best-practice to set uid for all operators before deploying to production. Flink has an option to {@code - * pipeline.auto-generate-uid=false} to disable auto-generation and force explicit setting of all operator uid. - *
    <p>
    - * Be careful with setting this for an existing job, because now we are changing the operator uid from an - * auto-generated one to this new value. When deploying the change with a checkpoint, Flink won't be able to restore - * the previous Flink sink operator state (more specifically the committer operator state). You need to use {@code - * --allowNonRestoredState} to ignore the previous sink state. During restore Flink sink state is used to check if - * last commit was actually successful or not. {@code --allowNonRestoredState} can lead to data loss if the - * Iceberg commit failed in the last completed checkpoint. + * Set the uid prefix for FlinkSink operators. Note that FlinkSink internally consists of + * multiple operators (like writer, committer, dummy sink etc.) Actually operator uid will be + * appended with a suffix like "uidPrefix-writer".
    + *
    + * <p>If provided, this prefix is also applied to operator names.
    + *
    + * <p>Flink auto generates operator uid if not set explicitly. It is a recommended + * best-practice to set uid for all operators before deploying to production. Flink has an + * option to {@code pipeline.auto-generate-uid=false} to disable auto-generation and force + * explicit setting of all operator uid.
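As a side note to the paragraph above, a small sketch of flipping that switch programmatically, assuming Flink's standard PipelineOptions constant for pipeline.auto-generate-uid; the class name is a placeholder for the example.

import org.apache.flink.configuration.Configuration;
import org.apache.flink.configuration.PipelineOptions;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;

class DisableAutoUidSketch {
  static StreamExecutionEnvironment env() {
    Configuration conf = new Configuration();
    // Same switch as pipeline.auto-generate-uid=false in flink-conf.yaml
    conf.set(PipelineOptions.AUTO_GENERATE_UIDS, false);
    return StreamExecutionEnvironment.getExecutionEnvironment(conf);
  }
}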
    + *
    + * Be careful with setting this for an existing job, because now we are changing the operator + * uid from an auto-generated one to this new value. When deploying the change with a + * checkpoint, Flink won't be able to restore the previous Flink sink operator state (more + * specifically the committer operator state). You need to use {@code --allowNonRestoredState} + * to ignore the previous sink state. During restore Flink sink state is used to check if last + * commit was actually successful or not. {@code --allowNonRestoredState} can lead to data loss + * if the Iceberg commit failed in the last completed checkpoint. * * @param newPrefix prefix for Flink sink operator uid and name * @return {@link Builder} to connect the iceberg table. @@ -303,7 +310,8 @@ public Builder setSnapshotProperty(String property, String value) { } private DataStreamSink chainIcebergOperators() { - Preconditions.checkArgument(inputCreator != null, + Preconditions.checkArgument( + inputCreator != null, "Please use forRowData() or forMapperOutputType() to initialize the input DataStream."); Preconditions.checkNotNull(tableLoader, "Table loader shouldn't be null"); @@ -314,7 +322,8 @@ private DataStreamSink chainIcebergOperators() { try (TableLoader loader = tableLoader) { this.table = loader.loadTable(); } catch (IOException e) { - throw new UncheckedIOException("Failed to load iceberg table from table loader: " + tableLoader, e); + throw new UncheckedIOException( + "Failed to load iceberg table from table loader: " + tableLoader, e); } } @@ -326,13 +335,15 @@ private DataStreamSink chainIcebergOperators() { // Convert the requested flink table schema to flink row type. RowType flinkRowType = toFlinkRowType(table.schema(), tableSchema); - // Distribute the records from input data stream based on the write.distribution-mode and equality fields. - DataStream distributeStream = distributeDataStream( - rowDataInput, equalityFieldIds, table.spec(), table.schema(), flinkRowType); + // Distribute the records from input data stream based on the write.distribution-mode and + // equality fields. 
+ DataStream distributeStream = + distributeDataStream( + rowDataInput, equalityFieldIds, table.spec(), table.schema(), flinkRowType); // Add parallel writers that append rows to files - SingleOutputStreamOperator writerStream = appendWriter(distributeStream, flinkRowType, - equalityFieldIds); + SingleOutputStreamOperator writerStream = + appendWriter(distributeStream, flinkRowType, equalityFieldIds); // Add single-parallelism committer that commits files // after successful checkpoint or end of input @@ -359,18 +370,24 @@ private String operatorName(String suffix) { List checkAndGetEqualityFieldIds() { List equalityFieldIds = Lists.newArrayList(table.schema().identifierFieldIds()); if (equalityFieldColumns != null && equalityFieldColumns.size() > 0) { - Set equalityFieldSet = Sets.newHashSetWithExpectedSize(equalityFieldColumns.size()); + Set equalityFieldSet = + Sets.newHashSetWithExpectedSize(equalityFieldColumns.size()); for (String column : equalityFieldColumns) { org.apache.iceberg.types.Types.NestedField field = table.schema().findField(column); - Preconditions.checkNotNull(field, "Missing required equality field column '%s' in table schema %s", - column, table.schema()); + Preconditions.checkNotNull( + field, + "Missing required equality field column '%s' in table schema %s", + column, + table.schema()); equalityFieldSet.add(field.fieldId()); } if (!equalityFieldSet.equals(table.schema().identifierFieldIds())) { - LOG.warn("The configured equality field column IDs {} are not matched with the schema identifier field IDs" + - " {}, use job specified equality field columns as the equality fields by default.", - equalityFieldSet, table.schema().identifierFieldIds()); + LOG.warn( + "The configured equality field column IDs {} are not matched with the schema identifier field IDs" + + " {}, use job specified equality field columns as the equality fields by default.", + equalityFieldSet, + table.schema().identifierFieldIds()); } equalityFieldIds = Lists.newArrayList(equalityFieldSet); } @@ -378,66 +395,82 @@ List checkAndGetEqualityFieldIds() { } @SuppressWarnings("unchecked") - private DataStreamSink appendDummySink(SingleOutputStreamOperator committerStream) { - DataStreamSink resultStream = committerStream - .addSink(new DiscardingSink()) - .name(operatorName(String.format("IcebergSink %s", this.table.name()))) - .setParallelism(1); + private DataStreamSink appendDummySink( + SingleOutputStreamOperator committerStream) { + DataStreamSink resultStream = + committerStream + .addSink(new DiscardingSink()) + .name(operatorName(String.format("IcebergSink %s", this.table.name()))) + .setParallelism(1); if (uidPrefix != null) { resultStream = resultStream.uid(uidPrefix + "-dummysink"); } return resultStream; } - private SingleOutputStreamOperator appendCommitter(SingleOutputStreamOperator writerStream) { - IcebergFilesCommitter filesCommitter = new IcebergFilesCommitter( - tableLoader, flinkWriteConf.overwriteMode(), snapshotProperties, - flinkWriteConf.workerPoolSize()); - SingleOutputStreamOperator committerStream = writerStream - .transform(operatorName(ICEBERG_FILES_COMMITTER_NAME), Types.VOID, filesCommitter) - .setParallelism(1) - .setMaxParallelism(1); + private SingleOutputStreamOperator appendCommitter( + SingleOutputStreamOperator writerStream) { + IcebergFilesCommitter filesCommitter = + new IcebergFilesCommitter( + tableLoader, + flinkWriteConf.overwriteMode(), + snapshotProperties, + flinkWriteConf.workerPoolSize()); + SingleOutputStreamOperator committerStream = + writerStream + 
.transform(operatorName(ICEBERG_FILES_COMMITTER_NAME), Types.VOID, filesCommitter) + .setParallelism(1) + .setMaxParallelism(1); if (uidPrefix != null) { committerStream = committerStream.uid(uidPrefix + "-committer"); } return committerStream; } - private SingleOutputStreamOperator appendWriter(DataStream input, RowType flinkRowType, - List equalityFieldIds) { + private SingleOutputStreamOperator appendWriter( + DataStream input, RowType flinkRowType, List equalityFieldIds) { // Validate the equality fields and partition fields if we enable the upsert mode. if (flinkWriteConf.upsertMode()) { - Preconditions.checkState(!flinkWriteConf.overwriteMode(), + Preconditions.checkState( + !flinkWriteConf.overwriteMode(), "OVERWRITE mode shouldn't be enable when configuring to use UPSERT data stream."); - Preconditions.checkState(!equalityFieldIds.isEmpty(), + Preconditions.checkState( + !equalityFieldIds.isEmpty(), "Equality field columns shouldn't be empty when configuring to use UPSERT data stream."); if (!table.spec().isUnpartitioned()) { for (PartitionField partitionField : table.spec().fields()) { - Preconditions.checkState(equalityFieldIds.contains(partitionField.sourceId()), + Preconditions.checkState( + equalityFieldIds.contains(partitionField.sourceId()), "In UPSERT mode, partition field '%s' should be included in equality fields: '%s'", - partitionField, equalityFieldColumns); + partitionField, + equalityFieldColumns); } } } - IcebergStreamWriter streamWriter = createStreamWriter(table, flinkWriteConf, - flinkRowType, equalityFieldIds); + IcebergStreamWriter streamWriter = + createStreamWriter(table, flinkWriteConf, flinkRowType, equalityFieldIds); int parallelism = writeParallelism == null ? input.getParallelism() : writeParallelism; - SingleOutputStreamOperator writerStream = input - .transform(operatorName(ICEBERG_STREAM_WRITER_NAME), TypeInformation.of(WriteResult.class), streamWriter) - .setParallelism(parallelism); + SingleOutputStreamOperator writerStream = + input + .transform( + operatorName(ICEBERG_STREAM_WRITER_NAME), + TypeInformation.of(WriteResult.class), + streamWriter) + .setParallelism(parallelism); if (uidPrefix != null) { writerStream = writerStream.uid(uidPrefix + "-writer"); } return writerStream; } - private DataStream distributeDataStream(DataStream input, - List equalityFieldIds, - PartitionSpec partitionSpec, - Schema iSchema, - RowType flinkRowType) { + private DataStream distributeDataStream( + DataStream input, + List equalityFieldIds, + PartitionSpec partitionSpec, + Schema iSchema, + RowType flinkRowType) { DistributionMode writeMode = flinkWriteConf.distributionMode(); LOG.info("Write distribution mode is '{}'", writeMode.modeName()); @@ -447,28 +480,35 @@ private DataStream distributeDataStream(DataStream input, return input; } else { LOG.info("Distribute rows by equality fields, because there are equality fields set"); - return input.keyBy(new EqualityFieldKeySelector(iSchema, flinkRowType, equalityFieldIds)); + return input.keyBy( + new EqualityFieldKeySelector(iSchema, flinkRowType, equalityFieldIds)); } case HASH: if (equalityFieldIds.isEmpty()) { if (partitionSpec.isUnpartitioned()) { - LOG.warn("Fallback to use 'none' distribution mode, because there are no equality fields set " + - "and table is unpartitioned"); + LOG.warn( + "Fallback to use 'none' distribution mode, because there are no equality fields set " + + "and table is unpartitioned"); return input; } else { return input.keyBy(new PartitionKeySelector(partitionSpec, iSchema, flinkRowType)); 
} } else { if (partitionSpec.isUnpartitioned()) { - LOG.info("Distribute rows by equality fields, because there are equality fields set " + - "and table is unpartitioned"); - return input.keyBy(new EqualityFieldKeySelector(iSchema, flinkRowType, equalityFieldIds)); + LOG.info( + "Distribute rows by equality fields, because there are equality fields set " + + "and table is unpartitioned"); + return input.keyBy( + new EqualityFieldKeySelector(iSchema, flinkRowType, equalityFieldIds)); } else { for (PartitionField partitionField : partitionSpec.fields()) { - Preconditions.checkState(equalityFieldIds.contains(partitionField.sourceId()), - "In 'hash' distribution mode with equality fields set, partition field '%s' " + - "should be included in equality fields: '%s'", partitionField, equalityFieldColumns); + Preconditions.checkState( + equalityFieldIds.contains(partitionField.sourceId()), + "In 'hash' distribution mode with equality fields set, partition field '%s' " + + "should be included in equality fields: '%s'", + partitionField, + equalityFieldColumns); } return input.keyBy(new PartitionKeySelector(partitionSpec, iSchema, flinkRowType)); } @@ -476,13 +516,18 @@ private DataStream distributeDataStream(DataStream input, case RANGE: if (equalityFieldIds.isEmpty()) { - LOG.warn("Fallback to use 'none' distribution mode, because there are no equality fields set " + - "and {}=range is not supported yet in flink", WRITE_DISTRIBUTION_MODE); + LOG.warn( + "Fallback to use 'none' distribution mode, because there are no equality fields set " + + "and {}=range is not supported yet in flink", + WRITE_DISTRIBUTION_MODE); return input; } else { - LOG.info("Distribute rows by equality fields, because there are equality fields set " + - "and{}=range is not supported yet in flink", WRITE_DISTRIBUTION_MODE); - return input.keyBy(new EqualityFieldKeySelector(iSchema, flinkRowType, equalityFieldIds)); + LOG.info( + "Distribute rows by equality fields, because there are equality fields set " + + "and{}=range is not supported yet in flink", + WRITE_DISTRIBUTION_MODE); + return input.keyBy( + new EqualityFieldKeySelector(iSchema, flinkRowType, equalityFieldIds)); } default: @@ -493,13 +538,17 @@ private DataStream distributeDataStream(DataStream input, static RowType toFlinkRowType(Schema schema, TableSchema requestedSchema) { if (requestedSchema != null) { - // Convert the flink schema to iceberg schema firstly, then reassign ids to match the existing iceberg schema. + // Convert the flink schema to iceberg schema firstly, then reassign ids to match the existing + // iceberg schema. Schema writeSchema = TypeUtil.reassignIds(FlinkSchemaUtil.convert(requestedSchema), schema); TypeUtil.validateWriteSchema(schema, writeSchema, true, true); - // We use this flink schema to read values from RowData. The flink's TINYINT and SMALLINT will be promoted to - // iceberg INTEGER, that means if we use iceberg's table schema to read TINYINT (backend by 1 'byte'), we will - // read 4 bytes rather than 1 byte, it will mess up the byte array in BinaryRowData. So here we must use flink + // We use this flink schema to read values from RowData. The flink's TINYINT and SMALLINT will + // be promoted to + // iceberg INTEGER, that means if we use iceberg's table schema to read TINYINT (backend by 1 + // 'byte'), we will + // read 4 bytes rather than 1 byte, it will mess up the byte array in BinaryRowData. So here + // we must use flink // schema. 
return (RowType) requestedSchema.toRowDataType().getLogicalType(); } else { @@ -507,16 +556,22 @@ static RowType toFlinkRowType(Schema schema, TableSchema requestedSchema) { } } - static IcebergStreamWriter createStreamWriter(Table table, - FlinkWriteConf flinkWriteConf, - RowType flinkRowType, - List equalityFieldIds) { + static IcebergStreamWriter createStreamWriter( + Table table, + FlinkWriteConf flinkWriteConf, + RowType flinkRowType, + List equalityFieldIds) { Preconditions.checkArgument(table != null, "Iceberg table shouldn't be null"); Table serializableTable = SerializableTable.copyOf(table); - TaskWriterFactory taskWriterFactory = new RowDataTaskWriterFactory( - serializableTable, flinkRowType, flinkWriteConf.targetDataFileSize(), - flinkWriteConf.dataFileFormat(), equalityFieldIds, flinkWriteConf.upsertMode()); + TaskWriterFactory taskWriterFactory = + new RowDataTaskWriterFactory( + serializableTable, + flinkRowType, + flinkWriteConf.targetDataFileSize(), + flinkWriteConf.dataFileFormat(), + equalityFieldIds, + flinkWriteConf.upsertMode()); return new IcebergStreamWriter<>(table.name(), taskWriterFactory); } } diff --git a/flink/v1.15/flink/src/main/java/org/apache/iceberg/flink/sink/IcebergFilesCommitter.java b/flink/v1.15/flink/src/main/java/org/apache/iceberg/flink/sink/IcebergFilesCommitter.java index bd332d96d466..c5b7643b27a7 100644 --- a/flink/v1.15/flink/src/main/java/org/apache/iceberg/flink/sink/IcebergFilesCommitter.java +++ b/flink/v1.15/flink/src/main/java/org/apache/iceberg/flink/sink/IcebergFilesCommitter.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.flink.sink; import java.io.IOException; @@ -70,9 +69,12 @@ class IcebergFilesCommitter extends AbstractStreamOperator private static final Logger LOG = LoggerFactory.getLogger(IcebergFilesCommitter.class); private static final String FLINK_JOB_ID = "flink.job-id"; - // The max checkpoint id we've committed to iceberg table. As the flink's checkpoint is always increasing, so we could - // correctly commit all the data files whose checkpoint id is greater than the max committed one to iceberg table, for - // avoiding committing the same data files twice. This id will be attached to iceberg's meta when committing the + // The max checkpoint id we've committed to iceberg table. As the flink's checkpoint is always + // increasing, so we could + // correctly commit all the data files whose checkpoint id is greater than the max committed one + // to iceberg table, for + // avoiding committing the same data files twice. This id will be attached to iceberg's meta when + // committing the // iceberg transaction. private static final String MAX_COMMITTED_CHECKPOINT_ID = "flink.max-committed-checkpoint-id"; static final String MAX_CONTINUOUS_EMPTY_COMMITS = "flink.max-continuous-empty-commits"; @@ -82,15 +84,21 @@ class IcebergFilesCommitter extends AbstractStreamOperator private final boolean replacePartitions; private final Map snapshotProperties; - // A sorted map to maintain the completed data files for each pending checkpointId (which have not been committed - // to iceberg table). We need a sorted map here because there's possible that few checkpoints snapshot failed, for - // example: the 1st checkpoint have 2 data files <1, >, the 2st checkpoint have 1 data files - // <2, >. Snapshot for checkpoint#1 interrupted because of network/disk failure etc, while we don't expect - // any data loss in iceberg table. 
So we keep the finished files <1, > in memory and retry to commit + // A sorted map to maintain the completed data files for each pending checkpointId (which have not + // been committed + // to iceberg table). We need a sorted map here because there's possible that few checkpoints + // snapshot failed, for + // example: the 1st checkpoint have 2 data files <1, >, the 2st checkpoint have 1 + // data files + // <2, >. Snapshot for checkpoint#1 interrupted because of network/disk failure etc, while + // we don't expect + // any data loss in iceberg table. So we keep the finished files <1, > in memory and + // retry to commit // iceberg table when the next checkpoint happen. private final NavigableMap dataFilesPerCheckpoint = Maps.newTreeMap(); - // The completed files cache for current checkpoint. Once the snapshot barrier received, it will be flushed to the + // The completed files cache for current checkpoint. Once the snapshot barrier received, it will + // be flushed to the // 'dataFilesPerCheckpoint'. private final List writeResultsOfCurrentCkpt = Lists.newArrayList(); @@ -101,22 +109,29 @@ class IcebergFilesCommitter extends AbstractStreamOperator private transient long maxCommittedCheckpointId; private transient int continuousEmptyCheckpoints; private transient int maxContinuousEmptyCommits; - // There're two cases that we restore from flink checkpoints: the first case is restoring from snapshot created by the - // same flink job; another case is restoring from snapshot created by another different job. For the second case, we - // need to maintain the old flink job's id in flink state backend to find the max-committed-checkpoint-id when + // There're two cases that we restore from flink checkpoints: the first case is restoring from + // snapshot created by the + // same flink job; another case is restoring from snapshot created by another different job. For + // the second case, we + // need to maintain the old flink job's id in flink state backend to find the + // max-committed-checkpoint-id when // traversing iceberg table's snapshots. - private static final ListStateDescriptor JOB_ID_DESCRIPTOR = new ListStateDescriptor<>( - "iceberg-flink-job-id", BasicTypeInfo.STRING_TYPE_INFO); + private static final ListStateDescriptor JOB_ID_DESCRIPTOR = + new ListStateDescriptor<>("iceberg-flink-job-id", BasicTypeInfo.STRING_TYPE_INFO); private transient ListState jobIdState; // All pending checkpoints states for this function. 
- private static final ListStateDescriptor> STATE_DESCRIPTOR = buildStateDescriptor(); + private static final ListStateDescriptor> STATE_DESCRIPTOR = + buildStateDescriptor(); private transient ListState> checkpointsState; private final Integer workerPoolSize; private transient ExecutorService workerPool; - IcebergFilesCommitter(TableLoader tableLoader, boolean replacePartitions, Map snapshotProperties, - Integer workerPoolSize) { + IcebergFilesCommitter( + TableLoader tableLoader, + boolean replacePartitions, + Map snapshotProperties, + Integer workerPoolSize) { this.tableLoader = tableLoader; this.replacePartitions = replacePartitions; this.snapshotProperties = snapshotProperties; @@ -132,32 +147,37 @@ public void initializeState(StateInitializationContext context) throws Exception this.tableLoader.open(); this.table = tableLoader.loadTable(); - maxContinuousEmptyCommits = PropertyUtil.propertyAsInt(table.properties(), MAX_CONTINUOUS_EMPTY_COMMITS, 10); - Preconditions.checkArgument(maxContinuousEmptyCommits > 0, - MAX_CONTINUOUS_EMPTY_COMMITS + " must be positive"); + maxContinuousEmptyCommits = + PropertyUtil.propertyAsInt(table.properties(), MAX_CONTINUOUS_EMPTY_COMMITS, 10); + Preconditions.checkArgument( + maxContinuousEmptyCommits > 0, MAX_CONTINUOUS_EMPTY_COMMITS + " must be positive"); int subTaskId = getRuntimeContext().getIndexOfThisSubtask(); int attemptId = getRuntimeContext().getAttemptNumber(); String operatorUniqueId = getRuntimeContext().getOperatorUniqueID(); - this.manifestOutputFileFactory = FlinkManifestUtil.createOutputFileFactory(table, flinkJobId, operatorUniqueId, - subTaskId, attemptId); + this.manifestOutputFileFactory = + FlinkManifestUtil.createOutputFileFactory( + table, flinkJobId, operatorUniqueId, subTaskId, attemptId); this.maxCommittedCheckpointId = INITIAL_CHECKPOINT_ID; this.checkpointsState = context.getOperatorStateStore().getListState(STATE_DESCRIPTOR); this.jobIdState = context.getOperatorStateStore().getListState(JOB_ID_DESCRIPTOR); if (context.isRestored()) { String restoredFlinkJobId = jobIdState.get().iterator().next(); - Preconditions.checkState(!Strings.isNullOrEmpty(restoredFlinkJobId), + Preconditions.checkState( + !Strings.isNullOrEmpty(restoredFlinkJobId), "Flink job id parsed from checkpoint snapshot shouldn't be null or empty"); - // Since flink's checkpoint id will start from the max-committed-checkpoint-id + 1 in the new flink job even if - // it's restored from a snapshot created by another different flink job, so it's safe to assign the max committed + // Since flink's checkpoint id will start from the max-committed-checkpoint-id + 1 in the new + // flink job even if + // it's restored from a snapshot created by another different flink job, so it's safe to + // assign the max committed // checkpoint id from restored flink job to the current flink job. this.maxCommittedCheckpointId = getMaxCommittedCheckpointId(table, restoredFlinkJobId); - NavigableMap uncommittedDataFiles = Maps - .newTreeMap(checkpointsState.get().iterator().next()) - .tailMap(maxCommittedCheckpointId, false); + NavigableMap uncommittedDataFiles = + Maps.newTreeMap(checkpointsState.get().iterator().next()) + .tailMap(maxCommittedCheckpointId, false); if (!uncommittedDataFiles.isEmpty()) { // Committed all uncommitted data files from the old flink job to iceberg table. 
long maxUncommittedCheckpointId = uncommittedDataFiles.lastKey(); @@ -170,7 +190,10 @@ public void initializeState(StateInitializationContext context) throws Exception public void snapshotState(StateSnapshotContext context) throws Exception { super.snapshotState(context); long checkpointId = context.getCheckpointId(); - LOG.info("Start to flush snapshot state to state backend, table: {}, checkpointId: {}", table, checkpointId); + LOG.info( + "Start to flush snapshot state to state backend, table: {}, checkpointId: {}", + table, + checkpointId); // Update the checkpoint state. dataFilesPerCheckpoint.put(checkpointId, writeToManifest(checkpointId)); @@ -193,7 +216,8 @@ public void notifyCheckpointComplete(long checkpointId) throws Exception { // 2. snapshotState(ckpId+1); // 3. notifyCheckpointComplete(ckpId+1); // 4. notifyCheckpointComplete(ckpId); - // For step#4, we don't need to commit iceberg table again because in step#3 we've committed all the files, + // For step#4, we don't need to commit iceberg table again because in step#3 we've committed all + // the files, // Besides, we need to maintain the max-committed-checkpoint-id to be increasing. if (checkpointId > maxCommittedCheckpointId) { commitUpToCheckpoint(dataFilesPerCheckpoint, flinkJobId, checkpointId); @@ -201,9 +225,9 @@ public void notifyCheckpointComplete(long checkpointId) throws Exception { } } - private void commitUpToCheckpoint(NavigableMap deltaManifestsMap, - String newFlinkJobId, - long checkpointId) throws IOException { + private void commitUpToCheckpoint( + NavigableMap deltaManifestsMap, String newFlinkJobId, long checkpointId) + throws IOException { NavigableMap pendingMap = deltaManifestsMap.headMap(checkpointId, true); List manifests = Lists.newArrayList(); NavigableMap pendingResults = Maps.newTreeMap(); @@ -213,14 +237,18 @@ private void commitUpToCheckpoint(NavigableMap deltaManifestsMap, continue; } - DeltaManifests deltaManifests = SimpleVersionedSerialization - .readVersionAndDeSerialize(DeltaManifestsSerializer.INSTANCE, e.getValue()); - pendingResults.put(e.getKey(), FlinkManifestUtil.readCompletedFiles(deltaManifests, table.io())); + DeltaManifests deltaManifests = + SimpleVersionedSerialization.readVersionAndDeSerialize( + DeltaManifestsSerializer.INSTANCE, e.getValue()); + pendingResults.put( + e.getKey(), FlinkManifestUtil.readCompletedFiles(deltaManifests, table.io())); manifests.addAll(deltaManifests.manifests()); } - int totalFiles = pendingResults.values().stream() - .mapToInt(r -> r.dataFiles().length + r.deleteFiles().length).sum(); + int totalFiles = + pendingResults.values().stream() + .mapToInt(r -> r.dataFiles().length + r.deleteFiles().length) + .sum(); continuousEmptyCheckpoints = totalFiles == 0 ? continuousEmptyCheckpoints + 1 : 0; if (totalFiles != 0 || continuousEmptyCheckpoints % maxContinuousEmptyCommits == 0) { if (replacePartitions) { @@ -238,21 +266,25 @@ private void commitUpToCheckpoint(NavigableMap deltaManifestsMap, table.io().deleteFile(manifest.path()); } catch (Exception e) { // The flink manifests cleaning failure shouldn't abort the completed checkpoint. 
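Two guards in the hunk above are easy to miss: notifyCheckpointComplete can arrive out of order, so only checkpoints newer than maxCommittedCheckpointId trigger a commit, and checkpoints with zero files are committed only every maxContinuousEmptyCommits rounds, which appears to be there so the recorded checkpoint id still advances occasionally without producing one snapshot per empty checkpoint. A condensed sketch of just those two decisions follows; field names mirror the committer's, but this is not the full commit path.

class CommitGuards {
  private long maxCommittedCheckpointId = -1L;
  private int continuousEmptyCheckpoints = 0;
  private final int maxContinuousEmptyCommits;

  CommitGuards(int maxContinuousEmptyCommits) {
    this.maxContinuousEmptyCommits = maxContinuousEmptyCommits;
  }

  // Guard 1: a late notifyCheckpointComplete for an older checkpoint is a no-op.
  boolean shouldCommit(long completedCheckpointId) {
    return completedCheckpointId > maxCommittedCheckpointId;
  }

  // Guard 2: skip most empty commits, but still commit every Nth consecutive empty
  // checkpoint so the table's snapshot summary keeps moving forward.
  boolean shouldWriteSnapshot(int totalFiles) {
    continuousEmptyCheckpoints = totalFiles == 0 ? continuousEmptyCheckpoints + 1 : 0;
    return totalFiles != 0 || continuousEmptyCheckpoints % maxContinuousEmptyCommits == 0;
  }

  void markCommitted(long checkpointId) {
    maxCommittedCheckpointId = checkpointId;
    continuousEmptyCheckpoints = 0;
  }
}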
- String details = MoreObjects.toStringHelper(this) - .add("flinkJobId", newFlinkJobId) - .add("checkpointId", checkpointId) - .add("manifestPath", manifest.path()) - .toString(); - LOG.warn("The iceberg transaction has been committed, but we failed to clean the temporary flink manifests: {}", - details, e); + String details = + MoreObjects.toStringHelper(this) + .add("flinkJobId", newFlinkJobId) + .add("checkpointId", checkpointId) + .add("manifestPath", manifest.path()) + .toString(); + LOG.warn( + "The iceberg transaction has been committed, but we failed to clean the temporary flink manifests: {}", + details, + e); } } } - private void replacePartitions(NavigableMap pendingResults, String newFlinkJobId, - long checkpointId) { + private void replacePartitions( + NavigableMap pendingResults, String newFlinkJobId, long checkpointId) { // Partition overwrite does not support delete files. - int deleteFilesNum = pendingResults.values().stream().mapToInt(r -> r.deleteFiles().length).sum(); + int deleteFilesNum = + pendingResults.values().stream().mapToInt(r -> r.deleteFiles().length).sum(); Preconditions.checkState(deleteFilesNum == 0, "Cannot overwrite partitions with delete files."); // Commit the overwrite transaction. @@ -260,17 +292,21 @@ private void replacePartitions(NavigableMap pendingResults, S int numFiles = 0; for (WriteResult result : pendingResults.values()) { - Preconditions.checkState(result.referencedDataFiles().length == 0, "Should have no referenced data files."); + Preconditions.checkState( + result.referencedDataFiles().length == 0, "Should have no referenced data files."); numFiles += result.dataFiles().length; Arrays.stream(result.dataFiles()).forEach(dynamicOverwrite::addFile); } - commitOperation(dynamicOverwrite, numFiles, 0, "dynamic partition overwrite", newFlinkJobId, checkpointId); + commitOperation( + dynamicOverwrite, numFiles, 0, "dynamic partition overwrite", newFlinkJobId, checkpointId); } - private void commitDeltaTxn(NavigableMap pendingResults, String newFlinkJobId, long checkpointId) { - int deleteFilesNum = pendingResults.values().stream().mapToInt(r -> r.deleteFiles().length).sum(); + private void commitDeltaTxn( + NavigableMap pendingResults, String newFlinkJobId, long checkpointId) { + int deleteFilesNum = + pendingResults.values().stream().mapToInt(r -> r.deleteFiles().length).sum(); if (deleteFilesNum == 0) { // To be compatible with iceberg format V1. @@ -278,7 +314,8 @@ private void commitDeltaTxn(NavigableMap pendingResults, Stri int numFiles = 0; for (WriteResult result : pendingResults.values()) { - Preconditions.checkState(result.referencedDataFiles().length == 0, "Should have no referenced data files."); + Preconditions.checkState( + result.referencedDataFiles().length == 0, "Should have no referenced data files."); numFiles += result.dataFiles().length; Arrays.stream(result.dataFiles()).forEach(appendFiles::appendFile); @@ -288,16 +325,23 @@ private void commitDeltaTxn(NavigableMap pendingResults, Stri } else { // To be compatible with iceberg format V2. for (Map.Entry e : pendingResults.entrySet()) { - // We don't commit the merged result into a single transaction because for the sequential transaction txn1 and - // txn2, the equality-delete files of txn2 are required to be applied to data files from txn1. Committing the + // We don't commit the merged result into a single transaction because for the sequential + // transaction txn1 and + // txn2, the equality-delete files of txn2 are required to be applied to data files from + // txn1. 
Committing the // merged one will lead to the incorrect delete semantic. WriteResult result = e.getValue(); - // Row delta validations are not needed for streaming changes that write equality deletes. Equality deletes - // are applied to data in all previous sequence numbers, so retries may push deletes further in the future, - // but do not affect correctness. Position deletes committed to the table in this path are used only to delete - // rows from data files that are being added in this commit. There is no way for data files added along with - // the delete files to be concurrently removed, so there is no need to validate the files referenced by the + // Row delta validations are not needed for streaming changes that write equality deletes. + // Equality deletes + // are applied to data in all previous sequence numbers, so retries may push deletes further + // in the future, + // but do not affect correctness. Position deletes committed to the table in this path are + // used only to delete + // rows from data files that are being added in this commit. There is no way for data files + // added along with + // the delete files to be concurrently removed, so there is no need to validate the files + // referenced by the // position delete files that are being committed. RowDelta rowDelta = table.newRowDelta().scanManifestsWith(workerPool); @@ -307,17 +351,28 @@ private void commitDeltaTxn(NavigableMap pendingResults, Stri int numDeleteFiles = result.deleteFiles().length; Arrays.stream(result.deleteFiles()).forEach(rowDelta::addDeletes); - commitOperation(rowDelta, numDataFiles, numDeleteFiles, "rowDelta", newFlinkJobId, e.getKey()); + commitOperation( + rowDelta, numDataFiles, numDeleteFiles, "rowDelta", newFlinkJobId, e.getKey()); } } } - private void commitOperation(SnapshotUpdate operation, int numDataFiles, int numDeleteFiles, String description, - String newFlinkJobId, long checkpointId) { - LOG.info("Committing {} with {} data files and {} delete files to table {}", description, numDataFiles, - numDeleteFiles, table); + private void commitOperation( + SnapshotUpdate operation, + int numDataFiles, + int numDeleteFiles, + String description, + String newFlinkJobId, + long checkpointId) { + LOG.info( + "Committing {} with {} data files and {} delete files to table {}", + description, + numDataFiles, + numDeleteFiles, + table); snapshotProperties.forEach(operation::set); - // custom snapshot metadata properties will be overridden if they conflict with internal ones used by the sink. + // custom snapshot metadata properties will be overridden if they conflict with internal ones + // used by the sink. operation.set(MAX_COMMITTED_CHECKPOINT_ID, Long.toString(checkpointId)); operation.set(FLINK_JOB_ID, newFlinkJobId); @@ -343,7 +398,8 @@ public void endInput() throws IOException { } /** - * Write all the complete data files to a newly created manifest file and return the manifest's avro serialized bytes. + * Write all the complete data files to a newly created manifest file and return the manifest's + * avro serialized bytes. 
*/ private byte[] writeToManifest(long checkpointId) throws IOException { if (writeResultsOfCurrentCkpt.isEmpty()) { @@ -351,10 +407,12 @@ private byte[] writeToManifest(long checkpointId) throws IOException { } WriteResult result = WriteResult.builder().addAll(writeResultsOfCurrentCkpt).build(); - DeltaManifests deltaManifests = FlinkManifestUtil.writeCompletedFiles(result, - () -> manifestOutputFileFactory.create(checkpointId), table.spec()); + DeltaManifests deltaManifests = + FlinkManifestUtil.writeCompletedFiles( + result, () -> manifestOutputFileFactory.create(checkpointId), table.spec()); - return SimpleVersionedSerialization.writeVersionAndSerialize(DeltaManifestsSerializer.INSTANCE, deltaManifests); + return SimpleVersionedSerialization.writeVersionAndSerialize( + DeltaManifestsSerializer.INSTANCE, deltaManifests); } @Override @@ -362,7 +420,8 @@ public void open() throws Exception { super.open(); final String operatorID = getRuntimeContext().getOperatorUniqueID(); - this.workerPool = ThreadPools.newWorkerPool("iceberg-worker-pool-" + operatorID, workerPoolSize); + this.workerPool = + ThreadPools.newWorkerPool("iceberg-worker-pool-" + operatorID, workerPoolSize); } @Override @@ -379,9 +438,11 @@ public void close() throws Exception { private static ListStateDescriptor> buildStateDescriptor() { Comparator longComparator = Comparators.forType(Types.LongType.get()); // Construct a SortedMapTypeInfo. - SortedMapTypeInfo sortedMapTypeInfo = new SortedMapTypeInfo<>( - BasicTypeInfo.LONG_TYPE_INFO, PrimitiveArrayTypeInfo.BYTE_PRIMITIVE_ARRAY_TYPE_INFO, longComparator - ); + SortedMapTypeInfo sortedMapTypeInfo = + new SortedMapTypeInfo<>( + BasicTypeInfo.LONG_TYPE_INFO, + PrimitiveArrayTypeInfo.BYTE_PRIMITIVE_ARRAY_TYPE_INFO, + longComparator); return new ListStateDescriptor<>("iceberg-files-committer-state", sortedMapTypeInfo); } diff --git a/flink/v1.15/flink/src/main/java/org/apache/iceberg/flink/sink/IcebergStreamWriter.java b/flink/v1.15/flink/src/main/java/org/apache/iceberg/flink/sink/IcebergStreamWriter.java index cc8e6ce8284f..693ecb06da67 100644 --- a/flink/v1.15/flink/src/main/java/org/apache/iceberg/flink/sink/IcebergStreamWriter.java +++ b/flink/v1.15/flink/src/main/java/org/apache/iceberg/flink/sink/IcebergStreamWriter.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.flink.sink; import java.io.IOException; @@ -83,7 +82,8 @@ public void close() throws Exception { @Override public void endInput() throws IOException { - // For bounded stream, it may don't enable the checkpoint mechanism so we'd better to emit the remaining + // For bounded stream, it may don't enable the checkpoint mechanism so we'd better to emit the + // remaining // completed files to downstream before closing the writer so that we won't miss any of them. emit(writer.complete()); } diff --git a/flink/v1.15/flink/src/main/java/org/apache/iceberg/flink/sink/ManifestOutputFileFactory.java b/flink/v1.15/flink/src/main/java/org/apache/iceberg/flink/sink/ManifestOutputFileFactory.java index b7d575bb446b..045e45a4ceae 100644 --- a/flink/v1.15/flink/src/main/java/org/apache/iceberg/flink/sink/ManifestOutputFileFactory.java +++ b/flink/v1.15/flink/src/main/java/org/apache/iceberg/flink/sink/ManifestOutputFileFactory.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.flink.sink; import java.util.Map; @@ -28,7 +27,8 @@ import org.apache.iceberg.relocated.com.google.common.base.Strings; class ManifestOutputFileFactory { - // Users could define their own flink manifests directory by setting this value in table properties. + // Users could define their own flink manifests directory by setting this value in table + // properties. static final String FLINK_MANIFEST_LOCATION = "flink.manifests.location"; private final TableOperations ops; @@ -40,8 +40,14 @@ class ManifestOutputFileFactory { private final long attemptNumber; private final AtomicInteger fileCount = new AtomicInteger(0); - ManifestOutputFileFactory(TableOperations ops, FileIO io, Map props, - String flinkJobId, String operatorUniqueId, int subTaskId, long attemptNumber) { + ManifestOutputFileFactory( + TableOperations ops, + FileIO io, + Map props, + String flinkJobId, + String operatorUniqueId, + int subTaskId, + long attemptNumber) { this.ops = ops; this.io = io; this.props = props; @@ -52,8 +58,15 @@ class ManifestOutputFileFactory { } private String generatePath(long checkpointId) { - return FileFormat.AVRO.addExtension(String.format("%s-%s-%05d-%d-%d-%05d", flinkJobId, operatorUniqueId, - subTaskId, attemptNumber, checkpointId, fileCount.incrementAndGet())); + return FileFormat.AVRO.addExtension( + String.format( + "%s-%s-%05d-%d-%d-%05d", + flinkJobId, + operatorUniqueId, + subTaskId, + attemptNumber, + checkpointId, + fileCount.incrementAndGet())); } OutputFile create(long checkpointId) { @@ -64,7 +77,8 @@ OutputFile create(long checkpointId) { // User don't specify any flink manifest directory, so just use the default metadata path. newManifestFullPath = ops.metadataFileLocation(generatePath(checkpointId)); } else { - newManifestFullPath = String.format("%s/%s", stripTrailingSlash(flinkManifestDir), generatePath(checkpointId)); + newManifestFullPath = + String.format("%s/%s", stripTrailingSlash(flinkManifestDir), generatePath(checkpointId)); } return io.newOutputFile(newManifestFullPath); diff --git a/flink/v1.15/flink/src/main/java/org/apache/iceberg/flink/sink/PartitionKeySelector.java b/flink/v1.15/flink/src/main/java/org/apache/iceberg/flink/sink/PartitionKeySelector.java index 598df09eee83..df951684b446 100644 --- a/flink/v1.15/flink/src/main/java/org/apache/iceberg/flink/sink/PartitionKeySelector.java +++ b/flink/v1.15/flink/src/main/java/org/apache/iceberg/flink/sink/PartitionKeySelector.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.flink.sink; import org.apache.flink.api.java.functions.KeySelector; @@ -28,8 +27,9 @@ import org.apache.iceberg.flink.RowDataWrapper; /** - * Create a {@link KeySelector} to shuffle by partition key, then each partition/bucket will be wrote by only one - * task. That will reduce lots of small files in partitioned fanout write policy for {@link FlinkSink}. + * Create a {@link KeySelector} to shuffle by partition key, then each partition/bucket will be + * wrote by only one task. That will reduce lots of small files in partitioned fanout write policy + * for {@link FlinkSink}. */ class PartitionKeySelector implements KeySelector { @@ -46,8 +46,8 @@ class PartitionKeySelector implements KeySelector { } /** - * Construct the {@link RowDataWrapper} lazily here because few members in it are not serializable. In this way, we - * don't have to serialize them with forcing. 
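The PartitionKeySelector javadoc above explains why the partitioned fanout path avoids a flood of small files: keying the stream by the partition path routes every row of a partition to one writer subtask. A stripped-down selector along the same lines is sketched below, assuming the usual PartitionKey and RowDataWrapper APIs; the class name is hypothetical, and like the real class it builds the wrapper lazily because it is not serializable.

import org.apache.flink.api.java.functions.KeySelector;
import org.apache.flink.table.data.RowData;
import org.apache.flink.table.types.logical.RowType;
import org.apache.iceberg.PartitionKey;
import org.apache.iceberg.PartitionSpec;
import org.apache.iceberg.Schema;
import org.apache.iceberg.flink.RowDataWrapper;

class SimplePartitionKeySelector implements KeySelector<RowData, String> {
  private final PartitionSpec spec;
  private final Schema schema;
  private final RowType flinkSchema;

  private transient PartitionKey partitionKey;
  private transient RowDataWrapper wrapper;

  SimplePartitionKeySelector(PartitionSpec spec, Schema schema, RowType flinkSchema) {
    this.spec = spec;
    this.schema = schema;
    this.flinkSchema = flinkSchema;
  }

  @Override
  public String getKey(RowData row) {
    // Built lazily on the task side because the wrapper is not serializable.
    if (wrapper == null) {
      wrapper = new RowDataWrapper(flinkSchema, schema.asStruct());
      partitionKey = new PartitionKey(spec, schema);
    }
    partitionKey.partition(wrapper.wrap(row));
    // The partition path (e.g. "dt=2022-06-01/bucket=3") is a stable string key.
    return partitionKey.toPath();
  }
}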
+ * Construct the {@link RowDataWrapper} lazily here because few members in it are not + * serializable. In this way, we don't have to serialize them with forcing. */ private RowDataWrapper lazyRowDataWrapper() { if (rowDataWrapper == null) { diff --git a/flink/v1.15/flink/src/main/java/org/apache/iceberg/flink/sink/PartitionedDeltaWriter.java b/flink/v1.15/flink/src/main/java/org/apache/iceberg/flink/sink/PartitionedDeltaWriter.java index 1eee6298e933..38062dd1a2c4 100644 --- a/flink/v1.15/flink/src/main/java/org/apache/iceberg/flink/sink/PartitionedDeltaWriter.java +++ b/flink/v1.15/flink/src/main/java/org/apache/iceberg/flink/sink/PartitionedDeltaWriter.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.flink.sink; import java.io.IOException; @@ -41,17 +40,27 @@ class PartitionedDeltaWriter extends BaseDeltaTaskWriter { private final Map writers = Maps.newHashMap(); - PartitionedDeltaWriter(PartitionSpec spec, - FileFormat format, - FileAppenderFactory appenderFactory, - OutputFileFactory fileFactory, - FileIO io, - long targetFileSize, - Schema schema, - RowType flinkSchema, - List equalityFieldIds, - boolean upsert) { - super(spec, format, appenderFactory, fileFactory, io, targetFileSize, schema, flinkSchema, equalityFieldIds, + PartitionedDeltaWriter( + PartitionSpec spec, + FileFormat format, + FileAppenderFactory appenderFactory, + OutputFileFactory fileFactory, + FileIO io, + long targetFileSize, + Schema schema, + RowType flinkSchema, + List equalityFieldIds, + boolean upsert) { + super( + spec, + format, + appenderFactory, + fileFactory, + io, + targetFileSize, + schema, + flinkSchema, + equalityFieldIds, upsert); this.partitionKey = new PartitionKey(spec, schema); } @@ -62,7 +71,8 @@ RowDataDeltaWriter route(RowData row) { RowDataDeltaWriter writer = writers.get(partitionKey); if (writer == null) { - // NOTICE: we need to copy a new partition key here, in case of messing up the keys in writers. + // NOTICE: we need to copy a new partition key here, in case of messing up the keys in + // writers. PartitionKey copiedKey = partitionKey.copy(); writer = new RowDataDeltaWriter(copiedKey); writers.put(copiedKey, writer); diff --git a/flink/v1.15/flink/src/main/java/org/apache/iceberg/flink/sink/RowDataTaskWriterFactory.java b/flink/v1.15/flink/src/main/java/org/apache/iceberg/flink/sink/RowDataTaskWriterFactory.java index f6ee976e637f..1c330434d019 100644 --- a/flink/v1.15/flink/src/main/java/org/apache/iceberg/flink/sink/RowDataTaskWriterFactory.java +++ b/flink/v1.15/flink/src/main/java/org/apache/iceberg/flink/sink/RowDataTaskWriterFactory.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.flink.sink; import java.util.List; @@ -53,12 +52,13 @@ public class RowDataTaskWriterFactory implements TaskWriterFactory { private transient OutputFileFactory outputFileFactory; - public RowDataTaskWriterFactory(Table table, - RowType flinkSchema, - long targetFileSizeBytes, - FileFormat format, - List equalityFieldIds, - boolean upsert) { + public RowDataTaskWriterFactory( + Table table, + RowType flinkSchema, + long targetFileSizeBytes, + FileFormat format, + List equalityFieldIds, + boolean upsert) { this.table = table; this.schema = table.schema(); this.flinkSchema = flinkSchema; @@ -70,47 +70,90 @@ public RowDataTaskWriterFactory(Table table, this.upsert = upsert; if (equalityFieldIds == null || equalityFieldIds.isEmpty()) { - this.appenderFactory = new FlinkAppenderFactory(schema, flinkSchema, table.properties(), spec); + this.appenderFactory = + new FlinkAppenderFactory(schema, flinkSchema, table.properties(), spec); } else if (upsert) { - // In upsert mode, only the new row is emitted using INSERT row kind. Therefore, any column of the inserted row - // may differ from the deleted row other than the primary key fields, and the delete file must contain values + // In upsert mode, only the new row is emitted using INSERT row kind. Therefore, any column of + // the inserted row + // may differ from the deleted row other than the primary key fields, and the delete file must + // contain values // that are correct for the deleted row. Therefore, only write the equality delete fields. - this.appenderFactory = new FlinkAppenderFactory(schema, flinkSchema, table.properties(), spec, - ArrayUtil.toIntArray(equalityFieldIds), TypeUtil.select(schema, Sets.newHashSet(equalityFieldIds)), null); + this.appenderFactory = + new FlinkAppenderFactory( + schema, + flinkSchema, + table.properties(), + spec, + ArrayUtil.toIntArray(equalityFieldIds), + TypeUtil.select(schema, Sets.newHashSet(equalityFieldIds)), + null); } else { - this.appenderFactory = new FlinkAppenderFactory(schema, flinkSchema, table.properties(), spec, - ArrayUtil.toIntArray(equalityFieldIds), schema, null); + this.appenderFactory = + new FlinkAppenderFactory( + schema, + flinkSchema, + table.properties(), + spec, + ArrayUtil.toIntArray(equalityFieldIds), + schema, + null); } } @Override public void initialize(int taskId, int attemptId) { - this.outputFileFactory = OutputFileFactory.builderFor(table, taskId, attemptId) - .format(format) - .build(); + this.outputFileFactory = + OutputFileFactory.builderFor(table, taskId, attemptId).format(format).build(); } @Override public TaskWriter create() { - Preconditions.checkNotNull(outputFileFactory, + Preconditions.checkNotNull( + outputFileFactory, "The outputFileFactory shouldn't be null if we have invoked the initialize()."); if (equalityFieldIds == null || equalityFieldIds.isEmpty()) { // Initialize a task writer to write INSERT only. 
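The upsert branch above narrows the delete schema to the equality columns because any other column of the inserted row may differ from the row being deleted. Conceptually that is a TypeUtil.select over the equality field ids; the example below shows the shape of that call with a hypothetical three-column schema and key column.

import java.util.Set;
import org.apache.iceberg.Schema;
import org.apache.iceberg.relocated.com.google.common.collect.Sets;
import org.apache.iceberg.types.TypeUtil;
import org.apache.iceberg.types.Types;

class EqualityDeleteSchemaExample {
  public static void main(String[] args) {
    Schema schema =
        new Schema(
            Types.NestedField.required(1, "id", Types.LongType.get()),
            Types.NestedField.optional(2, "name", Types.StringType.get()),
            Types.NestedField.optional(3, "ts", Types.TimestampType.withZone()));

    // In upsert mode only the key columns are written into equality delete files,
    // since non-key columns of the new row may differ from the row being deleted.
    Set<Integer> equalityFieldIds = Sets.newHashSet(1);
    Schema deleteSchema = TypeUtil.select(schema, equalityFieldIds);

    System.out.println(deleteSchema); // only the "id" column remains
  }
}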
if (spec.isUnpartitioned()) { - return new UnpartitionedWriter<>(spec, format, appenderFactory, outputFileFactory, io, targetFileSizeBytes); + return new UnpartitionedWriter<>( + spec, format, appenderFactory, outputFileFactory, io, targetFileSizeBytes); } else { - return new RowDataPartitionedFanoutWriter(spec, format, appenderFactory, outputFileFactory, - io, targetFileSizeBytes, schema, flinkSchema); + return new RowDataPartitionedFanoutWriter( + spec, + format, + appenderFactory, + outputFileFactory, + io, + targetFileSizeBytes, + schema, + flinkSchema); } } else { // Initialize a task writer to write both INSERT and equality DELETE. if (spec.isUnpartitioned()) { - return new UnpartitionedDeltaWriter(spec, format, appenderFactory, outputFileFactory, io, - targetFileSizeBytes, schema, flinkSchema, equalityFieldIds, upsert); + return new UnpartitionedDeltaWriter( + spec, + format, + appenderFactory, + outputFileFactory, + io, + targetFileSizeBytes, + schema, + flinkSchema, + equalityFieldIds, + upsert); } else { - return new PartitionedDeltaWriter(spec, format, appenderFactory, outputFileFactory, io, - targetFileSizeBytes, schema, flinkSchema, equalityFieldIds, upsert); + return new PartitionedDeltaWriter( + spec, + format, + appenderFactory, + outputFileFactory, + io, + targetFileSizeBytes, + schema, + flinkSchema, + equalityFieldIds, + upsert); } } } @@ -120,9 +163,15 @@ private static class RowDataPartitionedFanoutWriter extends PartitionedFanoutWri private final PartitionKey partitionKey; private final RowDataWrapper rowDataWrapper; - RowDataPartitionedFanoutWriter(PartitionSpec spec, FileFormat format, FileAppenderFactory appenderFactory, - OutputFileFactory fileFactory, FileIO io, long targetFileSize, Schema schema, - RowType flinkSchema) { + RowDataPartitionedFanoutWriter( + PartitionSpec spec, + FileFormat format, + FileAppenderFactory appenderFactory, + OutputFileFactory fileFactory, + FileIO io, + long targetFileSize, + Schema schema, + RowType flinkSchema) { super(spec, format, appenderFactory, fileFactory, io, targetFileSize); this.partitionKey = new PartitionKey(spec, schema); this.rowDataWrapper = new RowDataWrapper(flinkSchema, schema.asStruct()); diff --git a/flink/v1.15/flink/src/main/java/org/apache/iceberg/flink/sink/TaskWriterFactory.java b/flink/v1.15/flink/src/main/java/org/apache/iceberg/flink/sink/TaskWriterFactory.java index 9d56ec6a812a..e3a1245e8cbd 100644 --- a/flink/v1.15/flink/src/main/java/org/apache/iceberg/flink/sink/TaskWriterFactory.java +++ b/flink/v1.15/flink/src/main/java/org/apache/iceberg/flink/sink/TaskWriterFactory.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.flink.sink; import java.io.Serializable; @@ -32,7 +31,7 @@ public interface TaskWriterFactory extends Serializable { /** * Initialize the factory with a given taskId and attemptId. * - * @param taskId the identifier of task. + * @param taskId the identifier of task. * @param attemptId the attempt id of this task. 
*/ void initialize(int taskId, int attemptId); diff --git a/flink/v1.15/flink/src/main/java/org/apache/iceberg/flink/sink/UnpartitionedDeltaWriter.java b/flink/v1.15/flink/src/main/java/org/apache/iceberg/flink/sink/UnpartitionedDeltaWriter.java index 331ed7c78192..7680fb933b20 100644 --- a/flink/v1.15/flink/src/main/java/org/apache/iceberg/flink/sink/UnpartitionedDeltaWriter.java +++ b/flink/v1.15/flink/src/main/java/org/apache/iceberg/flink/sink/UnpartitionedDeltaWriter.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.flink.sink; import java.io.IOException; @@ -33,17 +32,27 @@ class UnpartitionedDeltaWriter extends BaseDeltaTaskWriter { private final RowDataDeltaWriter writer; - UnpartitionedDeltaWriter(PartitionSpec spec, - FileFormat format, - FileAppenderFactory appenderFactory, - OutputFileFactory fileFactory, - FileIO io, - long targetFileSize, - Schema schema, - RowType flinkSchema, - List equalityFieldIds, - boolean upsert) { - super(spec, format, appenderFactory, fileFactory, io, targetFileSize, schema, flinkSchema, equalityFieldIds, + UnpartitionedDeltaWriter( + PartitionSpec spec, + FileFormat format, + FileAppenderFactory appenderFactory, + OutputFileFactory fileFactory, + FileIO io, + long targetFileSize, + Schema schema, + RowType flinkSchema, + List equalityFieldIds, + boolean upsert) { + super( + spec, + format, + appenderFactory, + fileFactory, + io, + targetFileSize, + schema, + flinkSchema, + equalityFieldIds, upsert); this.writer = new RowDataDeltaWriter(null); } diff --git a/flink/v1.15/flink/src/main/java/org/apache/iceberg/flink/source/DataIterator.java b/flink/v1.15/flink/src/main/java/org/apache/iceberg/flink/source/DataIterator.java index 805cea50131b..91d975349b19 100644 --- a/flink/v1.15/flink/src/main/java/org/apache/iceberg/flink/source/DataIterator.java +++ b/flink/v1.15/flink/src/main/java/org/apache/iceberg/flink/source/DataIterator.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.flink.source; import java.io.IOException; @@ -49,8 +48,11 @@ public class DataIterator implements CloseableIterator { private int fileOffset; private long recordOffset; - public DataIterator(FileScanTaskReader fileScanTaskReader, CombinedScanTask task, - FileIO io, EncryptionManager encryption) { + public DataIterator( + FileScanTaskReader fileScanTaskReader, + CombinedScanTask task, + FileIO io, + EncryptionManager encryption) { this.fileScanTaskReader = fileScanTaskReader; this.inputFilesDecryptor = new InputFilesDecryptor(task, io, encryption); @@ -67,17 +69,20 @@ public DataIterator(FileScanTaskReader fileScanTaskReader, CombinedScanTask t } /** - * (startingFileOffset, startingRecordOffset) points to the next row that reader should resume from. - * E.g., if the seek position is (file=0, record=1), seek moves the iterator position to the 2nd row - * in file 0. When next() is called after seek, 2nd row from file 0 should be returned. + * (startingFileOffset, startingRecordOffset) points to the next row that reader should resume + * from. E.g., if the seek position is (file=0, record=1), seek moves the iterator position to the + * 2nd row in file 0. When next() is called after seek, 2nd row from file 0 should be returned. 
*/ public void seek(int startingFileOffset, long startingRecordOffset) { - Preconditions.checkState(fileOffset == -1, - "Seek should be called before any other iterator actions"); + Preconditions.checkState( + fileOffset == -1, "Seek should be called before any other iterator actions"); // skip files - Preconditions.checkState(startingFileOffset < combinedTask.files().size(), + Preconditions.checkState( + startingFileOffset < combinedTask.files().size(), "Invalid starting file offset %s for combined scan task with %s files: %s", - startingFileOffset, combinedTask.files().size(), combinedTask); + startingFileOffset, + combinedTask.files().size(), + combinedTask); for (long i = 0L; i < startingFileOffset; ++i) { tasks.next(); } @@ -88,9 +93,10 @@ public void seek(int startingFileOffset, long startingRecordOffset) { if (currentFileHasNext() && hasNext()) { next(); } else { - throw new IllegalStateException(String.format( - "Invalid starting record offset %d for file %d from CombinedScanTask: %s", - startingRecordOffset, startingFileOffset, combinedTask)); + throw new IllegalStateException( + String.format( + "Invalid starting record offset %d for file %d from CombinedScanTask: %s", + startingRecordOffset, startingFileOffset, combinedTask)); } } @@ -115,10 +121,7 @@ public boolean currentFileHasNext() { return currentIterator.hasNext(); } - /** - * Updates the current iterator field to ensure that the current Iterator - * is not exhausted. - */ + /** Updates the current iterator field to ensure that the current Iterator is not exhausted. */ private void updateCurrentIterator() { try { while (!currentIterator.hasNext() && tasks.hasNext()) { diff --git a/flink/v1.15/flink/src/main/java/org/apache/iceberg/flink/source/FileScanTaskReader.java b/flink/v1.15/flink/src/main/java/org/apache/iceberg/flink/source/FileScanTaskReader.java index 04273016ee2d..927a804a4792 100644 --- a/flink/v1.15/flink/src/main/java/org/apache/iceberg/flink/source/FileScanTaskReader.java +++ b/flink/v1.15/flink/src/main/java/org/apache/iceberg/flink/source/FileScanTaskReader.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.flink.source; import java.io.Serializable; diff --git a/flink/v1.15/flink/src/main/java/org/apache/iceberg/flink/source/FlinkInputFormat.java b/flink/v1.15/flink/src/main/java/org/apache/iceberg/flink/source/FlinkInputFormat.java index ee71ab7fe594..44b35522becb 100644 --- a/flink/v1.15/flink/src/main/java/org/apache/iceberg/flink/source/FlinkInputFormat.java +++ b/flink/v1.15/flink/src/main/java/org/apache/iceberg/flink/source/FlinkInputFormat.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.flink.source; import java.io.IOException; @@ -37,9 +36,7 @@ import org.apache.iceberg.relocated.com.google.common.annotations.VisibleForTesting; import org.apache.iceberg.util.ThreadPools; -/** - * Flink {@link InputFormat} for Iceberg. - */ +/** Flink {@link InputFormat} for Iceberg. 
*/ public class FlinkInputFormat extends RichInputFormat { private static final long serialVersionUID = 1L; @@ -53,14 +50,19 @@ public class FlinkInputFormat extends RichInputFormat private transient DataIterator iterator; private transient long currentReadCount = 0L; - FlinkInputFormat(TableLoader tableLoader, Schema tableSchema, FileIO io, EncryptionManager encryption, - ScanContext context) { + FlinkInputFormat( + TableLoader tableLoader, + Schema tableSchema, + FileIO io, + EncryptionManager encryption, + ScanContext context) { this.tableLoader = tableLoader; this.io = io; this.encryption = encryption; this.context = context; - this.rowDataReader = new RowDataFileScanTaskReader(tableSchema, - context.project(), context.nameMapping(), context.caseSensitive()); + this.rowDataReader = + new RowDataFileScanTaskReader( + tableSchema, context.project(), context.nameMapping(), context.caseSensitive()); } @VisibleForTesting @@ -78,7 +80,8 @@ public BaseStatistics getStatistics(BaseStatistics cachedStatistics) { public FlinkInputSplit[] createInputSplits(int minNumSplits) throws IOException { // Called in Job manager, so it is OK to load table from catalog. tableLoader.open(); - final ExecutorService workerPool = ThreadPools.newWorkerPool("iceberg-plan-worker-pool", context.planParallelism()); + final ExecutorService workerPool = + ThreadPools.newWorkerPool("iceberg-plan-worker-pool", context.planParallelism()); try (TableLoader loader = tableLoader) { Table table = loader.loadTable(); return FlinkSplitPlanner.planInputSplits(table, context, workerPool); @@ -89,14 +92,13 @@ public FlinkInputSplit[] createInputSplits(int minNumSplits) throws IOException @Override public InputSplitAssigner getInputSplitAssigner(FlinkInputSplit[] inputSplits) { - return context.exposeLocality() ? - new LocatableInputSplitAssigner(inputSplits) : - new DefaultInputSplitAssigner(inputSplits); + return context.exposeLocality() + ? new LocatableInputSplitAssigner(inputSplits) + : new DefaultInputSplitAssigner(inputSplits); } @Override - public void configure(Configuration parameters) { - } + public void configure(Configuration parameters) {} @Override public void open(FlinkInputSplit split) { diff --git a/flink/v1.15/flink/src/main/java/org/apache/iceberg/flink/source/FlinkInputSplit.java b/flink/v1.15/flink/src/main/java/org/apache/iceberg/flink/source/FlinkInputSplit.java index 5bb85fe7162a..16fd4f39596c 100644 --- a/flink/v1.15/flink/src/main/java/org/apache/iceberg/flink/source/FlinkInputSplit.java +++ b/flink/v1.15/flink/src/main/java/org/apache/iceberg/flink/source/FlinkInputSplit.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.flink.source; import java.util.Arrays; diff --git a/flink/v1.15/flink/src/main/java/org/apache/iceberg/flink/source/FlinkSource.java b/flink/v1.15/flink/src/main/java/org/apache/iceberg/flink/source/FlinkSource.java index f6cf902b1729..7ac6c5162483 100644 --- a/flink/v1.15/flink/src/main/java/org/apache/iceberg/flink/source/FlinkSource.java +++ b/flink/v1.15/flink/src/main/java/org/apache/iceberg/flink/source/FlinkSource.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.flink.source; import java.io.IOException; @@ -53,20 +52,21 @@ public class FlinkSource { private static final Logger LOG = LoggerFactory.getLogger(FlinkSource.class); - private FlinkSource() { - } + private FlinkSource() {} /** - * Initialize a {@link Builder} to read the data from iceberg table. Equivalent to {@link TableScan}. See more options - * in {@link ScanContext}. - *

- * The Source can be read static data in bounded mode. It can also continuously check the arrival of new data and read
- * records incrementally.
+ * Initialize a {@link Builder} to read the data from iceberg table. Equivalent to {@link
+ * TableScan}. See more options in {@link ScanContext}.
+ *
+ * <p>The Source can be read static data in bounded mode. It can also continuously check the
+ * arrival of new data and read records incrementally.
+ *
  * <ul>
- *   <li>Without startSnapshotId: Bounded</li>
- *   <li>With startSnapshotId and with endSnapshotId: Bounded</li>
- *   <li>With startSnapshotId (-1 means unbounded preceding) and Without endSnapshotId: Unbounded</li>
+ *   <li>Without startSnapshotId: Bounded
+ *   <li>With startSnapshotId and with endSnapshotId: Bounded
+ *   <li>With startSnapshotId (-1 means unbounded preceding) and Without endSnapshotId: Unbounded
  * </ul>
+ *
    * * @return {@link Builder} to connect the iceberg table. @@ -75,9 +75,7 @@ public static Builder forRowData() { return new Builder(); } - /** - * Source builder to build {@link DataStream}. - */ + /** Source builder to build {@link DataStream}. */ public static class Builder { private static final Set FILE_SYSTEM_SUPPORT_LOCALITY = ImmutableSet.of("hdfs"); @@ -223,9 +221,11 @@ public FlinkInputFormat buildFormat() { contextBuilder.project(FlinkSchemaUtil.convert(icebergSchema, projectedSchema)); } contextBuilder.exposeLocality(localityEnabled()); - contextBuilder.planParallelism(readableConfig.get(FlinkConfigOptions.TABLE_EXEC_ICEBERG_WORKER_POOL_SIZE)); + contextBuilder.planParallelism( + readableConfig.get(FlinkConfigOptions.TABLE_EXEC_ICEBERG_WORKER_POOL_SIZE)); - return new FlinkInputFormat(tableLoader, icebergSchema, io, encryption, contextBuilder.build()); + return new FlinkInputFormat( + tableLoader, icebergSchema, io, encryption, contextBuilder.build()); } public DataStream build() { @@ -233,7 +233,8 @@ public DataStream build() { FlinkInputFormat format = buildFormat(); ScanContext context = contextBuilder.build(); - TypeInformation typeInfo = FlinkCompatibilityUtil.toTypeInfo(FlinkSchemaUtil.convert(context.project())); + TypeInformation typeInfo = + FlinkCompatibilityUtil.toTypeInfo(FlinkSchemaUtil.convert(context.project())); if (!context.isStreaming()) { int parallelism = inferParallelism(format, context); @@ -253,26 +254,30 @@ public DataStream build() { } int inferParallelism(FlinkInputFormat format, ScanContext context) { - int parallelism = readableConfig.get(ExecutionConfigOptions.TABLE_EXEC_RESOURCE_DEFAULT_PARALLELISM); + int parallelism = + readableConfig.get(ExecutionConfigOptions.TABLE_EXEC_RESOURCE_DEFAULT_PARALLELISM); if (readableConfig.get(FlinkConfigOptions.TABLE_EXEC_ICEBERG_INFER_SOURCE_PARALLELISM)) { - int maxInferParallelism = readableConfig.get(FlinkConfigOptions - .TABLE_EXEC_ICEBERG_INFER_SOURCE_PARALLELISM_MAX); + int maxInferParallelism = + readableConfig.get(FlinkConfigOptions.TABLE_EXEC_ICEBERG_INFER_SOURCE_PARALLELISM_MAX); Preconditions.checkState( maxInferParallelism >= 1, - FlinkConfigOptions.TABLE_EXEC_ICEBERG_INFER_SOURCE_PARALLELISM_MAX.key() + " cannot be less than 1"); + FlinkConfigOptions.TABLE_EXEC_ICEBERG_INFER_SOURCE_PARALLELISM_MAX.key() + + " cannot be less than 1"); int splitNum; try { FlinkInputSplit[] splits = format.createInputSplits(0); splitNum = splits.length; } catch (IOException e) { - throw new UncheckedIOException("Failed to create iceberg input splits for table: " + table, e); + throw new UncheckedIOException( + "Failed to create iceberg input splits for table: " + table, e); } parallelism = Math.min(splitNum, maxInferParallelism); } if (context.limit() > 0) { - int limit = context.limit() >= Integer.MAX_VALUE ? Integer.MAX_VALUE : (int) context.limit(); + int limit = + context.limit() >= Integer.MAX_VALUE ? Integer.MAX_VALUE : (int) context.limit(); parallelism = Math.min(parallelism, limit); } @@ -283,8 +288,10 @@ int inferParallelism(FlinkInputFormat format, ScanContext context) { private boolean localityEnabled() { Boolean localityEnabled = - this.exposeLocality != null ? this.exposeLocality : - readableConfig.get(FlinkConfigOptions.TABLE_EXEC_ICEBERG_EXPOSE_SPLIT_LOCALITY_INFO); + this.exposeLocality != null + ? 
this.exposeLocality + : readableConfig.get( + FlinkConfigOptions.TABLE_EXEC_ICEBERG_EXPOSE_SPLIT_LOCALITY_INFO); if (localityEnabled != null && !localityEnabled) { return false; @@ -294,10 +301,14 @@ private boolean localityEnabled() { if (fileIO instanceof HadoopFileIO) { HadoopFileIO hadoopFileIO = (HadoopFileIO) fileIO; try { - String scheme = new Path(table.location()).getFileSystem(hadoopFileIO.getConf()).getScheme(); + String scheme = + new Path(table.location()).getFileSystem(hadoopFileIO.getConf()).getScheme(); return FILE_SYSTEM_SUPPORT_LOCALITY.contains(scheme); } catch (IOException e) { - LOG.warn("Failed to determine whether the locality information can be exposed for table: {}", table, e); + LOG.warn( + "Failed to determine whether the locality information can be exposed for table: {}", + table, + e); } } diff --git a/flink/v1.15/flink/src/main/java/org/apache/iceberg/flink/source/FlinkSplitPlanner.java b/flink/v1.15/flink/src/main/java/org/apache/iceberg/flink/source/FlinkSplitPlanner.java index d1628fbca794..4746625310b1 100644 --- a/flink/v1.15/flink/src/main/java/org/apache/iceberg/flink/source/FlinkSplitPlanner.java +++ b/flink/v1.15/flink/src/main/java/org/apache/iceberg/flink/source/FlinkSplitPlanner.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.flink.source; import java.io.IOException; @@ -40,11 +39,12 @@ @Internal public class FlinkSplitPlanner { - private FlinkSplitPlanner() { - } + private FlinkSplitPlanner() {} - static FlinkInputSplit[] planInputSplits(Table table, ScanContext context, ExecutorService workerPool) { - try (CloseableIterable tasksIterable = planTasks(table, context, workerPool)) { + static FlinkInputSplit[] planInputSplits( + Table table, ScanContext context, ExecutorService workerPool) { + try (CloseableIterable tasksIterable = + planTasks(table, context, workerPool)) { List tasks = Lists.newArrayList(tasksIterable); FlinkInputSplit[] splits = new FlinkInputSplit[tasks.size()]; boolean exposeLocality = context.exposeLocality(); @@ -52,33 +52,35 @@ static FlinkInputSplit[] planInputSplits(Table table, ScanContext context, Execu Tasks.range(tasks.size()) .stopOnFailure() .executeWith(exposeLocality ? 
workerPool : null) - .run(index -> { - CombinedScanTask task = tasks.get(index); - String[] hostnames = null; - if (exposeLocality) { - hostnames = Util.blockLocations(table.io(), task); - } - splits[index] = new FlinkInputSplit(index, task, hostnames); - }); + .run( + index -> { + CombinedScanTask task = tasks.get(index); + String[] hostnames = null; + if (exposeLocality) { + hostnames = Util.blockLocations(table.io(), task); + } + splits[index] = new FlinkInputSplit(index, task, hostnames); + }); return splits; } catch (IOException e) { throw new UncheckedIOException("Failed to process tasks iterable", e); } } - /** - * This returns splits for the FLIP-27 source - */ + /** This returns splits for the FLIP-27 source */ public static List planIcebergSourceSplits( Table table, ScanContext context, ExecutorService workerPool) { - try (CloseableIterable tasksIterable = planTasks(table, context, workerPool)) { - return Lists.newArrayList(CloseableIterable.transform(tasksIterable, IcebergSourceSplit::fromCombinedScanTask)); + try (CloseableIterable tasksIterable = + planTasks(table, context, workerPool)) { + return Lists.newArrayList( + CloseableIterable.transform(tasksIterable, IcebergSourceSplit::fromCombinedScanTask)); } catch (IOException e) { throw new UncheckedIOException("Failed to process task iterable: ", e); } } - static CloseableIterable planTasks(Table table, ScanContext context, ExecutorService workerPool) { + static CloseableIterable planTasks( + Table table, ScanContext context, ExecutorService workerPool) { ScanMode scanMode = checkScanMode(context); if (scanMode == ScanMode.INCREMENTAL_APPEND_SCAN) { IncrementalAppendScan scan = table.newIncrementalAppendScan(); @@ -115,22 +117,20 @@ private enum ScanMode { } private static ScanMode checkScanMode(ScanContext context) { - if (context.isStreaming() || context.startSnapshotId() != null || context.endSnapshotId() != null) { + if (context.isStreaming() + || context.startSnapshotId() != null + || context.endSnapshotId() != null) { return ScanMode.INCREMENTAL_APPEND_SCAN; } else { return ScanMode.BATCH; } } - /** - * refine scan with common configs - */ + /** refine scan with common configs */ private static > T refineScanWithBaseConfigs( T scan, ScanContext context, ExecutorService workerPool) { - T refinedScan = scan - .caseSensitive(context.caseSensitive()) - .project(context.project()) - .planWith(workerPool); + T refinedScan = + scan.caseSensitive(context.caseSensitive()).project(context.project()).planWith(workerPool); if (context.includeColumnStats()) { refinedScan = refinedScan.includeColumnStats(); @@ -141,11 +141,14 @@ private static > T refineScanW } if (context.splitLookback() != null) { - refinedScan = refinedScan.option(TableProperties.SPLIT_LOOKBACK, context.splitLookback().toString()); + refinedScan = + refinedScan.option(TableProperties.SPLIT_LOOKBACK, context.splitLookback().toString()); } if (context.splitOpenFileCost() != null) { - refinedScan = refinedScan.option(TableProperties.SPLIT_OPEN_FILE_COST, context.splitOpenFileCost().toString()); + refinedScan = + refinedScan.option( + TableProperties.SPLIT_OPEN_FILE_COST, context.splitOpenFileCost().toString()); } if (context.filters() != null) { diff --git a/flink/v1.15/flink/src/main/java/org/apache/iceberg/flink/source/IcebergSource.java b/flink/v1.15/flink/src/main/java/org/apache/iceberg/flink/source/IcebergSource.java index 9a3e98d2fee5..85a9cd2b78fd 100644 --- a/flink/v1.15/flink/src/main/java/org/apache/iceberg/flink/source/IcebergSource.java +++ 
b/flink/v1.15/flink/src/main/java/org/apache/iceberg/flink/source/IcebergSource.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.flink.source; import java.io.IOException; @@ -95,8 +94,7 @@ public Boundedness getBoundedness() { @Override public SourceReader createReader(SourceReaderContext readerContext) { - ReaderMetricsContext readerMetrics = - new ReaderMetricsContext(readerContext.metricGroup()); + ReaderMetricsContext readerMetrics = new ReaderMetricsContext(readerContext.metricGroup()); return new IcebergSourceReader<>(readerFunction, readerContext, readerMetrics); } @@ -130,20 +128,28 @@ private SplitEnumerator createEnumer if (enumState == null) { assigner = assignerFactory.createAssigner(); } else { - LOG.info("Iceberg source restored {} splits from state for table {}", - enumState.pendingSplits().size(), table.name()); + LOG.info( + "Iceberg source restored {} splits from state for table {}", + enumState.pendingSplits().size(), + table.name()); assigner = assignerFactory.createAssigner(enumState.pendingSplits()); } if (scanContext.isStreaming()) { - // Ideally, operatorId should be used as the threadPoolName as Flink guarantees its uniqueness within a job. - // SplitEnumeratorContext doesn't expose the OperatorCoordinator.Context, which would contain the OperatorID. - // Need to discuss with Flink community whether it is ok to expose a public API like the protected method - // "OperatorCoordinator.Context getCoordinatorContext()" from SourceCoordinatorContext implementation. + // Ideally, operatorId should be used as the threadPoolName as Flink guarantees its uniqueness + // within a job. + // SplitEnumeratorContext doesn't expose the OperatorCoordinator.Context, which would contain + // the OperatorID. + // Need to discuss with Flink community whether it is ok to expose a public API like the + // protected method + // "OperatorCoordinator.Context getCoordinatorContext()" from SourceCoordinatorContext + // implementation. // For now,

  • - is used as the unique thread pool name. - ContinuousSplitPlanner splitPlanner = new ContinuousSplitPlannerImpl( - table, scanContext, table.name() + "-" + UUID.randomUUID()); - return new ContinuousIcebergEnumerator(enumContext, assigner, scanContext, splitPlanner, enumState); + ContinuousSplitPlanner splitPlanner = + new ContinuousSplitPlannerImpl( + table, scanContext, table.name() + "-" + UUID.randomUUID()); + return new ContinuousIcebergEnumerator( + enumContext, assigner, scanContext, splitPlanner, enumState); } else { return new StaticIcebergEnumerator(enumContext, assigner, table, scanContext, enumState); } @@ -168,8 +174,7 @@ public static class Builder { // optional private final ScanContext.Builder contextBuilder = ScanContext.builder(); - Builder() { - } + Builder() {} public Builder tableLoader(TableLoader loader) { this.tableLoader = loader; @@ -292,8 +297,15 @@ public IcebergSource build() { try (TableLoader loader = tableLoader) { loader.open(); Table table = tableLoader.loadTable(); - RowDataReaderFunction rowDataReaderFunction = new RowDataReaderFunction(flinkConfig, table.schema(), - context.project(), context.nameMapping(), context.caseSensitive(), table.io(), table.encryption()); + RowDataReaderFunction rowDataReaderFunction = + new RowDataReaderFunction( + flinkConfig, + table.schema(), + context.project(), + context.nameMapping(), + context.caseSensitive(), + table.io(), + table.encryption()); this.readerFunction = (ReaderFunction) rowDataReaderFunction; } catch (IOException e) { throw new UncheckedIOException(e); diff --git a/flink/v1.15/flink/src/main/java/org/apache/iceberg/flink/source/RowDataFileScanTaskReader.java b/flink/v1.15/flink/src/main/java/org/apache/iceberg/flink/source/RowDataFileScanTaskReader.java index b71f2b0fafe5..5fada27d5471 100644 --- a/flink/v1.15/flink/src/main/java/org/apache/iceberg/flink/source/RowDataFileScanTaskReader.java +++ b/flink/v1.15/flink/src/main/java/org/apache/iceberg/flink/source/RowDataFileScanTaskReader.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.flink.source; import java.util.Map; @@ -56,8 +55,8 @@ public class RowDataFileScanTaskReader implements FileScanTaskReader { private final String nameMapping; private final boolean caseSensitive; - public RowDataFileScanTaskReader(Schema tableSchema, Schema projectedSchema, - String nameMapping, boolean caseSensitive) { + public RowDataFileScanTaskReader( + Schema tableSchema, Schema projectedSchema, String nameMapping, boolean caseSensitive) { this.tableSchema = tableSchema; this.projectedSchema = projectedSchema; this.nameMapping = nameMapping; @@ -65,21 +64,28 @@ public RowDataFileScanTaskReader(Schema tableSchema, Schema projectedSchema, } @Override - public CloseableIterator open(FileScanTask task, InputFilesDecryptor inputFilesDecryptor) { + public CloseableIterator open( + FileScanTask task, InputFilesDecryptor inputFilesDecryptor) { Schema partitionSchema = TypeUtil.select(projectedSchema, task.spec().identitySourceIds()); - Map idToConstant = partitionSchema.columns().isEmpty() ? ImmutableMap.of() : - PartitionUtil.constantsMap(task, RowDataUtil::convertConstant); + Map idToConstant = + partitionSchema.columns().isEmpty() + ? 
ImmutableMap.of() + : PartitionUtil.constantsMap(task, RowDataUtil::convertConstant); - FlinkDeleteFilter deletes = new FlinkDeleteFilter(task, tableSchema, projectedSchema, inputFilesDecryptor); - CloseableIterable iterable = deletes.filter( - newIterable(task, deletes.requiredSchema(), idToConstant, inputFilesDecryptor) - ); + FlinkDeleteFilter deletes = + new FlinkDeleteFilter(task, tableSchema, projectedSchema, inputFilesDecryptor); + CloseableIterable iterable = + deletes.filter( + newIterable(task, deletes.requiredSchema(), idToConstant, inputFilesDecryptor)); // Project the RowData to remove the extra meta columns. if (!projectedSchema.sameSchema(deletes.requiredSchema())) { - RowDataProjection rowDataProjection = RowDataProjection.create( - deletes.requiredRowType(), deletes.requiredSchema().asStruct(), projectedSchema.asStruct()); + RowDataProjection rowDataProjection = + RowDataProjection.create( + deletes.requiredRowType(), + deletes.requiredSchema().asStruct(), + projectedSchema.asStruct()); iterable = CloseableIterable.transform(iterable, rowDataProjection::wrap); } @@ -87,7 +93,10 @@ public CloseableIterator open(FileScanTask task, InputFilesDecryptor in } private CloseableIterable newIterable( - FileScanTask task, Schema schema, Map idToConstant, InputFilesDecryptor inputFilesDecryptor) { + FileScanTask task, + Schema schema, + Map idToConstant, + InputFilesDecryptor inputFilesDecryptor) { CloseableIterable iter; if (task.isDataTask()) { throw new UnsupportedOperationException("Cannot read data task."); @@ -115,12 +124,16 @@ private CloseableIterable newIterable( } private CloseableIterable newAvroIterable( - FileScanTask task, Schema schema, Map idToConstant, InputFilesDecryptor inputFilesDecryptor) { - Avro.ReadBuilder builder = Avro.read(inputFilesDecryptor.getInputFile(task)) - .reuseContainers() - .project(schema) - .split(task.start(), task.length()) - .createReaderFunc(readSchema -> new FlinkAvroReader(schema, readSchema, idToConstant)); + FileScanTask task, + Schema schema, + Map idToConstant, + InputFilesDecryptor inputFilesDecryptor) { + Avro.ReadBuilder builder = + Avro.read(inputFilesDecryptor.getInputFile(task)) + .reuseContainers() + .project(schema) + .split(task.start(), task.length()) + .createReaderFunc(readSchema -> new FlinkAvroReader(schema, readSchema, idToConstant)); if (nameMapping != null) { builder.withNameMapping(NameMappingParser.fromJson(nameMapping)); @@ -130,14 +143,19 @@ private CloseableIterable newAvroIterable( } private CloseableIterable newParquetIterable( - FileScanTask task, Schema schema, Map idToConstant, InputFilesDecryptor inputFilesDecryptor) { - Parquet.ReadBuilder builder = Parquet.read(inputFilesDecryptor.getInputFile(task)) - .split(task.start(), task.length()) - .project(schema) - .createReaderFunc(fileSchema -> FlinkParquetReaders.buildReader(schema, fileSchema, idToConstant)) - .filter(task.residual()) - .caseSensitive(caseSensitive) - .reuseContainers(); + FileScanTask task, + Schema schema, + Map idToConstant, + InputFilesDecryptor inputFilesDecryptor) { + Parquet.ReadBuilder builder = + Parquet.read(inputFilesDecryptor.getInputFile(task)) + .split(task.start(), task.length()) + .project(schema) + .createReaderFunc( + fileSchema -> FlinkParquetReaders.buildReader(schema, fileSchema, idToConstant)) + .filter(task.residual()) + .caseSensitive(caseSensitive) + .reuseContainers(); if (nameMapping != null) { builder.withNameMapping(NameMappingParser.fromJson(nameMapping)); @@ -147,16 +165,22 @@ private CloseableIterable 
newParquetIterable( } private CloseableIterable newOrcIterable( - FileScanTask task, Schema schema, Map idToConstant, InputFilesDecryptor inputFilesDecryptor) { - Schema readSchemaWithoutConstantAndMetadataFields = TypeUtil.selectNot(schema, - Sets.union(idToConstant.keySet(), MetadataColumns.metadataFieldIds())); - - ORC.ReadBuilder builder = ORC.read(inputFilesDecryptor.getInputFile(task)) - .project(readSchemaWithoutConstantAndMetadataFields) - .split(task.start(), task.length()) - .createReaderFunc(readOrcSchema -> new FlinkOrcReader(schema, readOrcSchema, idToConstant)) - .filter(task.residual()) - .caseSensitive(caseSensitive); + FileScanTask task, + Schema schema, + Map idToConstant, + InputFilesDecryptor inputFilesDecryptor) { + Schema readSchemaWithoutConstantAndMetadataFields = + TypeUtil.selectNot( + schema, Sets.union(idToConstant.keySet(), MetadataColumns.metadataFieldIds())); + + ORC.ReadBuilder builder = + ORC.read(inputFilesDecryptor.getInputFile(task)) + .project(readSchemaWithoutConstantAndMetadataFields) + .split(task.start(), task.length()) + .createReaderFunc( + readOrcSchema -> new FlinkOrcReader(schema, readOrcSchema, idToConstant)) + .filter(task.residual()) + .caseSensitive(caseSensitive); if (nameMapping != null) { builder.withNameMapping(NameMappingParser.fromJson(nameMapping)); @@ -170,8 +194,11 @@ private static class FlinkDeleteFilter extends DeleteFilter { private final RowDataWrapper asStructLike; private final InputFilesDecryptor inputFilesDecryptor; - FlinkDeleteFilter(FileScanTask task, Schema tableSchema, Schema requestedSchema, - InputFilesDecryptor inputFilesDecryptor) { + FlinkDeleteFilter( + FileScanTask task, + Schema tableSchema, + Schema requestedSchema, + InputFilesDecryptor inputFilesDecryptor) { super(task.file().path().toString(), task.deletes(), tableSchema, requestedSchema); this.requiredRowType = FlinkSchemaUtil.convert(requiredSchema()); this.asStructLike = new RowDataWrapper(requiredRowType, requiredSchema().asStruct()); diff --git a/flink/v1.15/flink/src/main/java/org/apache/iceberg/flink/source/RowDataRewriter.java b/flink/v1.15/flink/src/main/java/org/apache/iceberg/flink/source/RowDataRewriter.java index 5e8837c5d47b..a721755c276f 100644 --- a/flink/v1.15/flink/src/main/java/org/apache/iceberg/flink/source/RowDataRewriter.java +++ b/flink/v1.15/flink/src/main/java/org/apache/iceberg/flink/source/RowDataRewriter.java @@ -16,9 +16,10 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.flink.source; +import static org.apache.iceberg.TableProperties.DEFAULT_NAME_MAPPING; + import java.util.Collection; import java.util.List; import java.util.Locale; @@ -46,8 +47,6 @@ import org.slf4j.Logger; import org.slf4j.LoggerFactory; -import static org.apache.iceberg.TableProperties.DEFAULT_NAME_MAPPING; - public class RowDataRewriter { private static final Logger LOG = LoggerFactory.getLogger(RowDataRewriter.class); @@ -60,31 +59,36 @@ public class RowDataRewriter { private final TaskWriterFactory taskWriterFactory; private final String tableName; - public RowDataRewriter(Table table, boolean caseSensitive, FileIO io, EncryptionManager encryptionManager) { + public RowDataRewriter( + Table table, boolean caseSensitive, FileIO io, EncryptionManager encryptionManager) { this.schema = table.schema(); this.caseSensitive = caseSensitive; this.io = io; this.encryptionManager = encryptionManager; - this.nameMapping = PropertyUtil.propertyAsString(table.properties(), DEFAULT_NAME_MAPPING, null); + this.nameMapping = + PropertyUtil.propertyAsString(table.properties(), DEFAULT_NAME_MAPPING, null); this.tableName = table.name(); - String formatString = PropertyUtil.propertyAsString(table.properties(), TableProperties.DEFAULT_FILE_FORMAT, - TableProperties.DEFAULT_FILE_FORMAT_DEFAULT); + String formatString = + PropertyUtil.propertyAsString( + table.properties(), + TableProperties.DEFAULT_FILE_FORMAT, + TableProperties.DEFAULT_FILE_FORMAT_DEFAULT); FileFormat format = FileFormat.valueOf(formatString.toUpperCase(Locale.ENGLISH)); RowType flinkSchema = FlinkSchemaUtil.convert(table.schema()); - this.taskWriterFactory = new RowDataTaskWriterFactory( - SerializableTable.copyOf(table), - flinkSchema, - Long.MAX_VALUE, - format, - null, - false); + this.taskWriterFactory = + new RowDataTaskWriterFactory( + SerializableTable.copyOf(table), flinkSchema, Long.MAX_VALUE, format, null, false); } - public List rewriteDataForTasks(DataStream dataStream, int parallelism) throws Exception { - RewriteMap map = new RewriteMap(schema, nameMapping, io, caseSensitive, encryptionManager, taskWriterFactory); + public List rewriteDataForTasks( + DataStream dataStream, int parallelism) throws Exception { + RewriteMap map = + new RewriteMap( + schema, nameMapping, io, caseSensitive, encryptionManager, taskWriterFactory); DataStream> ds = dataStream.map(map).setParallelism(parallelism); - return Lists.newArrayList(ds.executeAndCollect("Rewrite table :" + tableName)).stream().flatMap(Collection::stream) + return Lists.newArrayList(ds.executeAndCollect("Rewrite table :" + tableName)).stream() + .flatMap(Collection::stream) .collect(Collectors.toList()); } @@ -102,15 +106,21 @@ public static class RewriteMap extends RichMapFunction taskWriterFactory; private final RowDataFileScanTaskReader rowDataReader; - public RewriteMap(Schema schema, String nameMapping, FileIO io, boolean caseSensitive, - EncryptionManager encryptionManager, TaskWriterFactory taskWriterFactory) { + public RewriteMap( + Schema schema, + String nameMapping, + FileIO io, + boolean caseSensitive, + EncryptionManager encryptionManager, + TaskWriterFactory taskWriterFactory) { this.schema = schema; this.nameMapping = nameMapping; this.io = io; this.caseSensitive = caseSensitive; this.encryptionManager = encryptionManager; this.taskWriterFactory = taskWriterFactory; - this.rowDataReader = new RowDataFileScanTaskReader(schema, schema, nameMapping, caseSensitive); + this.rowDataReader = + new RowDataFileScanTaskReader(schema, 
schema, nameMapping, caseSensitive); } @Override @@ -126,7 +136,7 @@ public List map(CombinedScanTask task) throws Exception { // Initialize the task writer. this.writer = taskWriterFactory.create(); try (DataIterator iterator = - new DataIterator<>(rowDataReader, task, io, encryptionManager)) { + new DataIterator<>(rowDataReader, task, io, encryptionManager)) { while (iterator.hasNext()) { RowData rowData = iterator.next(); writer.write(rowData); diff --git a/flink/v1.15/flink/src/main/java/org/apache/iceberg/flink/source/ScanContext.java b/flink/v1.15/flink/src/main/java/org/apache/iceberg/flink/source/ScanContext.java index 2fe47cc7f490..1d99e441b4a0 100644 --- a/flink/v1.15/flink/src/main/java/org/apache/iceberg/flink/source/ScanContext.java +++ b/flink/v1.15/flink/src/main/java/org/apache/iceberg/flink/source/ScanContext.java @@ -16,9 +16,10 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.flink.source; +import static org.apache.iceberg.TableProperties.DEFAULT_NAME_MAPPING; + import java.io.Serializable; import java.time.Duration; import java.util.List; @@ -32,11 +33,7 @@ import org.apache.iceberg.expressions.Expression; import org.apache.iceberg.flink.FlinkConfigOptions; -import static org.apache.iceberg.TableProperties.DEFAULT_NAME_MAPPING; - -/** - * Context object with optional arguments for a Flink Scan. - */ +/** Context object with optional arguments for a Flink Scan. */ @Internal public class ScanContext implements Serializable { @@ -52,7 +49,8 @@ public class ScanContext implements Serializable { ConfigOptions.key("as-of-timestamp").longType().defaultValue(null); private static final ConfigOption STARTING_STRATEGY = - ConfigOptions.key("starting-strategy").enumType(StreamingStartingStrategy.class) + ConfigOptions.key("starting-strategy") + .enumType(StreamingStartingStrategy.class) .defaultValue(StreamingStartingStrategy.INCREMENTAL_FROM_LATEST_SNAPSHOT); private static final ConfigOption START_SNAPSHOT_TIMESTAMP = @@ -107,12 +105,27 @@ public class ScanContext implements Serializable { private final Integer planParallelism; private final int maxPlanningSnapshotCount; - private ScanContext(boolean caseSensitive, Long snapshotId, StreamingStartingStrategy startingStrategy, - Long startSnapshotTimestamp, Long startSnapshotId, Long endSnapshotId, Long asOfTimestamp, - Long splitSize, Integer splitLookback, Long splitOpenFileCost, boolean isStreaming, - Duration monitorInterval, String nameMapping, Schema schema, List filters, - long limit, boolean includeColumnStats, boolean exposeLocality, Integer planParallelism, - int maxPlanningSnapshotCount) { + private ScanContext( + boolean caseSensitive, + Long snapshotId, + StreamingStartingStrategy startingStrategy, + Long startSnapshotTimestamp, + Long startSnapshotId, + Long endSnapshotId, + Long asOfTimestamp, + Long splitSize, + Integer splitLookback, + Long splitOpenFileCost, + boolean isStreaming, + Duration monitorInterval, + String nameMapping, + Schema schema, + List filters, + long limit, + boolean includeColumnStats, + boolean exposeLocality, + Integer planParallelism, + int maxPlanningSnapshotCount) { this.caseSensitive = caseSensitive; this.snapshotId = snapshotId; this.startingStrategy = startingStrategy; @@ -141,15 +154,19 @@ private ScanContext(boolean caseSensitive, Long snapshotId, StreamingStartingStr private void validate() { if (isStreaming) { if (startingStrategy == StreamingStartingStrategy.INCREMENTAL_FROM_SNAPSHOT_ID) { - 
Preconditions.checkArgument(startSnapshotId != null, + Preconditions.checkArgument( + startSnapshotId != null, "Invalid starting snapshot id for SPECIFIC_START_SNAPSHOT_ID strategy: null"); - Preconditions.checkArgument(startSnapshotTimestamp == null, + Preconditions.checkArgument( + startSnapshotTimestamp == null, "Invalid starting snapshot timestamp for SPECIFIC_START_SNAPSHOT_ID strategy: not null"); } if (startingStrategy == StreamingStartingStrategy.INCREMENTAL_FROM_SNAPSHOT_TIMESTAMP) { - Preconditions.checkArgument(startSnapshotTimestamp != null, + Preconditions.checkArgument( + startSnapshotTimestamp != null, "Invalid starting snapshot timestamp for SPECIFIC_START_SNAPSHOT_TIMESTAMP strategy: null"); - Preconditions.checkArgument(startSnapshotId == null, + Preconditions.checkArgument( + startSnapshotId == null, "Invalid starting snapshot id for SPECIFIC_START_SNAPSHOT_ID strategy: not null"); } } @@ -304,11 +321,11 @@ public static class Builder { private long limit = -1L; private boolean includeColumnStats = INCLUDE_COLUMN_STATS.defaultValue(); private boolean exposeLocality; - private Integer planParallelism = FlinkConfigOptions.TABLE_EXEC_ICEBERG_WORKER_POOL_SIZE.defaultValue(); + private Integer planParallelism = + FlinkConfigOptions.TABLE_EXEC_ICEBERG_WORKER_POOL_SIZE.defaultValue(); private int maxPlanningSnapshotCount = MAX_PLANNING_SNAPSHOT_COUNT.defaultValue(); - private Builder() { - } + private Builder() {} public Builder caseSensitive(boolean newCaseSensitive) { this.caseSensitive = newCaseSensitive; @@ -432,10 +449,27 @@ public Builder fromProperties(Map properties) { } public ScanContext build() { - return new ScanContext(caseSensitive, snapshotId, startingStrategy, startSnapshotTimestamp, - startSnapshotId, endSnapshotId, asOfTimestamp, splitSize, splitLookback, - splitOpenFileCost, isStreaming, monitorInterval, nameMapping, projectedSchema, - filters, limit, includeColumnStats, exposeLocality, planParallelism, maxPlanningSnapshotCount); + return new ScanContext( + caseSensitive, + snapshotId, + startingStrategy, + startSnapshotTimestamp, + startSnapshotId, + endSnapshotId, + asOfTimestamp, + splitSize, + splitLookback, + splitOpenFileCost, + isStreaming, + monitorInterval, + nameMapping, + projectedSchema, + filters, + limit, + includeColumnStats, + exposeLocality, + planParallelism, + maxPlanningSnapshotCount); } } } diff --git a/flink/v1.15/flink/src/main/java/org/apache/iceberg/flink/source/StreamingMonitorFunction.java b/flink/v1.15/flink/src/main/java/org/apache/iceberg/flink/source/StreamingMonitorFunction.java index 06def6508938..75791c95bd4a 100644 --- a/flink/v1.15/flink/src/main/java/org/apache/iceberg/flink/source/StreamingMonitorFunction.java +++ b/flink/v1.15/flink/src/main/java/org/apache/iceberg/flink/source/StreamingMonitorFunction.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.flink.source; import java.io.IOException; @@ -45,19 +44,20 @@ import org.slf4j.LoggerFactory; /** - * This is the single (non-parallel) monitoring task which takes a {@link FlinkInputFormat}, - * it is responsible for: + * This is the single (non-parallel) monitoring task which takes a {@link FlinkInputFormat}, it is + * responsible for: * *
      - *
    <li>Monitoring snapshots of the Iceberg table.</li>
- *     <li>Creating the {@link FlinkInputSplit splits} corresponding to the incremental files</li>
- *     <li>Assigning them to downstream tasks for further processing.</li>
- * </ol>
+ * <ol>
+ *   <li>Monitoring snapshots of the Iceberg table.
+ *   <li>Creating the {@link FlinkInputSplit splits} corresponding to the incremental files
+ *   <li>Assigning them to downstream tasks for further processing.
+ * </ol>
 *
- * <p>The splits to be read are forwarded to the downstream {@link StreamingReaderOperator}
- * which can have parallelism greater than one.
+ * <p>
    The splits to be read are forwarded to the downstream {@link StreamingReaderOperator} which + * can have parallelism greater than one. */ -public class StreamingMonitorFunction extends RichSourceFunction implements CheckpointedFunction { +public class StreamingMonitorFunction extends RichSourceFunction + implements CheckpointedFunction { private static final Logger LOG = LoggerFactory.getLogger(StreamingMonitorFunction.class); @@ -68,7 +68,8 @@ public class StreamingMonitorFunction extends RichSourceFunction 0, + Preconditions.checkArgument( + scanContext.maxPlanningSnapshotCount() > 0, "The max-planning-snapshot-count must be greater than zero"); this.tableLoader = tableLoader; this.scanContext = scanContext; @@ -96,9 +100,12 @@ public void open(Configuration parameters) throws Exception { final RuntimeContext runtimeContext = getRuntimeContext(); ValidationException.check( - runtimeContext instanceof StreamingRuntimeContext, "context should be instance of StreamingRuntimeContext"); + runtimeContext instanceof StreamingRuntimeContext, + "context should be instance of StreamingRuntimeContext"); final String operatorID = ((StreamingRuntimeContext) runtimeContext).getOperatorUniqueID(); - this.workerPool = ThreadPools.newWorkerPool("iceberg-worker-pool-" + operatorID, scanContext.planParallelism()); + this.workerPool = + ThreadPools.newWorkerPool( + "iceberg-worker-pool-" + operatorID, scanContext.planParallelism()); } @Override @@ -108,21 +115,24 @@ public void initializeState(FunctionInitializationContext context) throws Except table = tableLoader.loadTable(); // Initialize the flink state for last snapshot id. - lastSnapshotIdState = context.getOperatorStateStore().getListState( - new ListStateDescriptor<>( - "snapshot-id-state", - LongSerializer.INSTANCE)); + lastSnapshotIdState = + context + .getOperatorStateStore() + .getListState(new ListStateDescriptor<>("snapshot-id-state", LongSerializer.INSTANCE)); // Restore the last-snapshot-id from flink's state if possible. 
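    // The branches below apply in order of precedence: a restored checkpoint id wins; otherwise
    // an explicitly configured start-snapshot-id is validated to be an ancestor of the current
    // snapshot and used as the starting point; if neither is set, lastSnapshotId keeps
    // INIT_LAST_SNAPSHOT_ID and the first call to monitorAndForwardSplits() plans via
    // copyWithSnapshotId(...) instead of an incremental range.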
if (context.isRestored()) { LOG.info("Restoring state for the {}.", getClass().getSimpleName()); lastSnapshotId = lastSnapshotIdState.get().iterator().next(); } else if (scanContext.startSnapshotId() != null) { - Preconditions.checkNotNull(table.currentSnapshot(), "Don't have any available snapshot in table."); + Preconditions.checkNotNull( + table.currentSnapshot(), "Don't have any available snapshot in table."); long currentSnapshotId = table.currentSnapshot().snapshotId(); - Preconditions.checkState(SnapshotUtil.isAncestorOf(table, currentSnapshotId, scanContext.startSnapshotId()), - "The option start-snapshot-id %s is not an ancestor of the current snapshot.", scanContext.startSnapshotId()); + Preconditions.checkState( + SnapshotUtil.isAncestorOf(table, currentSnapshotId, scanContext.startSnapshotId()), + "The option start-snapshot-id %s is not an ancestor of the current snapshot.", + scanContext.startSnapshotId()); lastSnapshotId = scanContext.startSnapshotId(); } @@ -143,13 +153,15 @@ public void run(SourceContext ctx) throws Exception { } } - private long toSnapshotIdInclusive(long lastConsumedSnapshotId, long currentSnapshotId, - int maxPlanningSnapshotCount) { - List snapshotIds = SnapshotUtil.snapshotIdsBetween(table, lastConsumedSnapshotId, currentSnapshotId); + private long toSnapshotIdInclusive( + long lastConsumedSnapshotId, long currentSnapshotId, int maxPlanningSnapshotCount) { + List snapshotIds = + SnapshotUtil.snapshotIdsBetween(table, lastConsumedSnapshotId, currentSnapshotId); if (snapshotIds.size() <= maxPlanningSnapshotCount) { return currentSnapshotId; } else { - // It uses reverted index since snapshotIdsBetween returns Ids that are ordered by committed time descending. + // It uses reverted index since snapshotIdsBetween returns Ids that are ordered by committed + // time descending. 
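      // For illustration, with hypothetical snapshot ids: if the last consumed snapshot is S0,
      // the current snapshot is S5 and snapshotIdsBetween returns [S5, S4, S3, S2, S1] (newest
      // first), then with maxPlanningSnapshotCount = 2 the lookup below evaluates to
      // snapshotIds.get(5 - 2), i.e. S2, so this round only plans (S0, S2] and the remaining
      // snapshots are picked up by later monitor cycles.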
return snapshotIds.get(snapshotIds.size() - maxPlanningSnapshotCount); } } @@ -172,14 +184,23 @@ void monitorAndForwardSplits() { if (lastSnapshotId == INIT_LAST_SNAPSHOT_ID) { newScanContext = scanContext.copyWithSnapshotId(snapshotId); } else { - snapshotId = toSnapshotIdInclusive(lastSnapshotId, snapshotId, scanContext.maxPlanningSnapshotCount()); + snapshotId = + toSnapshotIdInclusive( + lastSnapshotId, snapshotId, scanContext.maxPlanningSnapshotCount()); newScanContext = scanContext.copyWithAppendsBetween(lastSnapshotId, snapshotId); } - LOG.debug("Start discovering splits from {} (exclusive) to {} (inclusive)", lastSnapshotId, snapshotId); + LOG.debug( + "Start discovering splits from {} (exclusive) to {} (inclusive)", + lastSnapshotId, + snapshotId); long start = System.currentTimeMillis(); - FlinkInputSplit[] splits = FlinkSplitPlanner.planInputSplits(table, newScanContext, workerPool); - LOG.debug("Discovered {} splits, time elapsed {}ms", splits.length, System.currentTimeMillis() - start); + FlinkInputSplit[] splits = + FlinkSplitPlanner.planInputSplits(table, newScanContext, workerPool); + LOG.debug( + "Discovered {} splits, time elapsed {}ms", + splits.length, + System.currentTimeMillis() - start); // only need to hold the checkpoint lock when emitting the splits and updating lastSnapshotId start = System.currentTimeMillis(); @@ -190,7 +211,10 @@ void monitorAndForwardSplits() { lastSnapshotId = snapshotId; } - LOG.debug("Forwarded {} splits, time elapsed {}ms", splits.length, System.currentTimeMillis() - start); + LOG.debug( + "Forwarded {} splits, time elapsed {}ms", + splits.length, + System.currentTimeMillis() - start); } } diff --git a/flink/v1.15/flink/src/main/java/org/apache/iceberg/flink/source/StreamingReaderOperator.java b/flink/v1.15/flink/src/main/java/org/apache/iceberg/flink/source/StreamingReaderOperator.java index c8efc2b5992f..ee6f7b63988d 100644 --- a/flink/v1.15/flink/src/main/java/org/apache/iceberg/flink/source/StreamingReaderOperator.java +++ b/flink/v1.15/flink/src/main/java/org/apache/iceberg/flink/source/StreamingReaderOperator.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.flink.source; import java.io.IOException; @@ -47,20 +46,23 @@ /** * The operator that reads the {@link FlinkInputSplit splits} received from the preceding {@link - * StreamingMonitorFunction}. Contrary to the {@link StreamingMonitorFunction} which has a parallelism of 1, - * this operator can have multiple parallelism. + * StreamingMonitorFunction}. Contrary to the {@link StreamingMonitorFunction} which has a + * parallelism of 1, this operator can have multiple parallelism. * - *

As soon as a split descriptor is received, it is put in a queue, and use {@link MailboxExecutor}
- * read the actual data of the split. This architecture allows the separation of the reading thread from the one split
- * processing the checkpoint barriers, thus removing any potential back-pressure.
+ * <p>
    As soon as a split descriptor is received, it is put in a queue, and use {@link + * MailboxExecutor} read the actual data of the split. This architecture allows the separation of + * the reading thread from the one split processing the checkpoint barriers, thus removing any + * potential back-pressure. */ public class StreamingReaderOperator extends AbstractStreamOperator implements OneInputStreamOperator { private static final Logger LOG = LoggerFactory.getLogger(StreamingReaderOperator.class); - // It's the same thread that is running this operator and checkpoint actions. we use this executor to schedule only - // one split for future reading, so that a new checkpoint could be triggered without blocking long time for exhausting + // It's the same thread that is running this operator and checkpoint actions. we use this executor + // to schedule only + // one split for future reading, so that a new checkpoint could be triggered without blocking long + // time for exhausting // all scheduled splits. private final MailboxExecutor executor; private FlinkInputFormat format; @@ -70,17 +72,21 @@ public class StreamingReaderOperator extends AbstractStreamOperator private transient ListState inputSplitsState; private transient Queue splits; - // Splits are read by the same thread that calls processElement. Each read task is submitted to that thread by adding - // them to the executor. This state is used to ensure that only one read task is in that queue at a time, so that read - // tasks do not accumulate ahead of checkpoint tasks. When there is a read task in the queue, this is set to RUNNING. + // Splits are read by the same thread that calls processElement. Each read task is submitted to + // that thread by adding + // them to the executor. This state is used to ensure that only one read task is in that queue at + // a time, so that read + // tasks do not accumulate ahead of checkpoint tasks. When there is a read task in the queue, this + // is set to RUNNING. // When there are no more files to read, this will be set to IDLE. private transient SplitState currentSplitState; - private StreamingReaderOperator(FlinkInputFormat format, ProcessingTimeService timeService, - MailboxExecutor mailboxExecutor) { + private StreamingReaderOperator( + FlinkInputFormat format, ProcessingTimeService timeService, MailboxExecutor mailboxExecutor) { this.format = Preconditions.checkNotNull(format, "The InputFormat should not be null."); this.processingTimeService = timeService; - this.executor = Preconditions.checkNotNull(mailboxExecutor, "The mailboxExecutor should not be null."); + this.executor = + Preconditions.checkNotNull(mailboxExecutor, "The mailboxExecutor should not be null."); } @Override @@ -89,8 +95,10 @@ public void initializeState(StateInitializationContext context) throws Exception // TODO Replace Java serialization with Avro approach to keep state compatibility. // See issue: https://github.com/apache/iceberg/issues/1698 - inputSplitsState = context.getOperatorStateStore().getListState( - new ListStateDescriptor<>("splits", new JavaSerializer<>())); + inputSplitsState = + context + .getOperatorStateStore() + .getListState(new ListStateDescriptor<>("splits", new JavaSerializer<>())); // Initialize the current split state to IDLE. 
currentSplitState = SplitState.IDLE; @@ -106,14 +114,15 @@ public void initializeState(StateInitializationContext context) throws Exception } } - this.sourceContext = StreamSourceContexts.getSourceContext( - getOperatorConfig().getTimeCharacteristic(), - getProcessingTimeService(), - new Object(), // no actual locking needed - output, - getRuntimeContext().getExecutionConfig().getAutoWatermarkInterval(), - -1, - true); + this.sourceContext = + StreamSourceContexts.getSourceContext( + getOperatorConfig().getTimeCharacteristic(), + getProcessingTimeService(), + new Object(), // no actual locking needed + output, + getRuntimeContext().getExecutionConfig().getAutoWatermarkInterval(), + -1, + true); // Enqueue to process the recovered input splits. enqueueProcessSplits(); @@ -197,11 +206,13 @@ static OneInputStreamOperatorFactory factory(FlinkInpu } private enum SplitState { - IDLE, RUNNING + IDLE, + RUNNING } private static class OperatorFactory extends AbstractStreamOperatorFactory - implements YieldingOperatorFactory, OneInputStreamOperatorFactory { + implements YieldingOperatorFactory, + OneInputStreamOperatorFactory { private final FlinkInputFormat format; @@ -218,9 +229,12 @@ public void setMailboxExecutor(MailboxExecutor mailboxExecutor) { @SuppressWarnings("unchecked") @Override - public > O createStreamOperator(StreamOperatorParameters parameters) { - StreamingReaderOperator operator = new StreamingReaderOperator(format, processingTimeService, mailboxExecutor); - operator.setup(parameters.getContainingTask(), parameters.getStreamConfig(), parameters.getOutput()); + public > O createStreamOperator( + StreamOperatorParameters parameters) { + StreamingReaderOperator operator = + new StreamingReaderOperator(format, processingTimeService, mailboxExecutor); + operator.setup( + parameters.getContainingTask(), parameters.getStreamConfig(), parameters.getOutput()); return (O) operator; } diff --git a/flink/v1.15/flink/src/main/java/org/apache/iceberg/flink/source/StreamingStartingStrategy.java b/flink/v1.15/flink/src/main/java/org/apache/iceberg/flink/source/StreamingStartingStrategy.java index 3e83fbe7f5af..11707bf82a0f 100644 --- a/flink/v1.15/flink/src/main/java/org/apache/iceberg/flink/source/StreamingStartingStrategy.java +++ b/flink/v1.15/flink/src/main/java/org/apache/iceberg/flink/source/StreamingStartingStrategy.java @@ -16,43 +16,39 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.flink.source; -/** - * Starting strategy for streaming execution. - */ +/** Starting strategy for streaming execution. */ public enum StreamingStartingStrategy { /** * Do a regular table scan then switch to the incremental mode. - *

- * The incremental mode starts from the current snapshot exclusive.
+ *
+ * <p>
    The incremental mode starts from the current snapshot exclusive. */ TABLE_SCAN_THEN_INCREMENTAL, /** * Start incremental mode from the latest snapshot inclusive. - *

- * If it is an empty map, all future append snapshots should be discovered.
+ *
+ * <p>
    If it is an empty map, all future append snapshots should be discovered. */ INCREMENTAL_FROM_LATEST_SNAPSHOT, /** * Start incremental mode from the earliest snapshot inclusive. - *

- * If it is an empty map, all future append snapshots should be discovered.
+ *
+ * <p>
    If it is an empty map, all future append snapshots should be discovered. */ INCREMENTAL_FROM_EARLIEST_SNAPSHOT, - /** - * Start incremental mode from a snapshot with a specific id inclusive. - */ + /** Start incremental mode from a snapshot with a specific id inclusive. */ INCREMENTAL_FROM_SNAPSHOT_ID, /** * Start incremental mode from a snapshot with a specific timestamp inclusive. - *

- * If the timestamp is between two snapshots, it should start from the snapshot after the timestamp.
+ *
+ * <p>
    If the timestamp is between two snapshots, it should start from the snapshot after the + * timestamp. */ INCREMENTAL_FROM_SNAPSHOT_TIMESTAMP } diff --git a/flink/v1.15/flink/src/main/java/org/apache/iceberg/flink/source/assigner/GetSplitResult.java b/flink/v1.15/flink/src/main/java/org/apache/iceberg/flink/source/assigner/GetSplitResult.java index 21feab660ba7..72deaeb890f3 100644 --- a/flink/v1.15/flink/src/main/java/org/apache/iceberg/flink/source/assigner/GetSplitResult.java +++ b/flink/v1.15/flink/src/main/java/org/apache/iceberg/flink/source/assigner/GetSplitResult.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.flink.source.assigner; import org.apache.flink.annotation.Internal; @@ -27,18 +26,15 @@ public class GetSplitResult { public enum Status { - AVAILABLE, /** - * There are pending splits. But they can't be assigned - * due to constraints (like event time alignment) + * There are pending splits. But they can't be assigned due to constraints (like event time + * alignment) */ CONSTRAINED, - /** - * Assigner doesn't have pending splits. - */ + /** Assigner doesn't have pending splits. */ UNAVAILABLE } diff --git a/flink/v1.15/flink/src/main/java/org/apache/iceberg/flink/source/assigner/SimpleSplitAssigner.java b/flink/v1.15/flink/src/main/java/org/apache/iceberg/flink/source/assigner/SimpleSplitAssigner.java index ed70ad3774aa..c1da261a5555 100644 --- a/flink/v1.15/flink/src/main/java/org/apache/iceberg/flink/source/assigner/SimpleSplitAssigner.java +++ b/flink/v1.15/flink/src/main/java/org/apache/iceberg/flink/source/assigner/SimpleSplitAssigner.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.flink.source.assigner; import java.util.ArrayDeque; @@ -31,8 +30,8 @@ import org.apache.iceberg.flink.source.split.IcebergSourceSplitStatus; /** - * Since all methods are called in the source coordinator thread by enumerator, - * there is no need for locking. + * Since all methods are called in the source coordinator thread by enumerator, there is no need for + * locking. */ @Internal public class SimpleSplitAssigner implements SplitAssigner { @@ -79,9 +78,7 @@ private void addSplits(Collection splits) { } } - /** - * Simple assigner only tracks unassigned splits - */ + /** Simple assigner only tracks unassigned splits */ @Override public Collection state() { return pendingSplits.stream() diff --git a/flink/v1.15/flink/src/main/java/org/apache/iceberg/flink/source/assigner/SimpleSplitAssignerFactory.java b/flink/v1.15/flink/src/main/java/org/apache/iceberg/flink/source/assigner/SimpleSplitAssignerFactory.java index 76f3d66e9086..1c14f4fcf9b9 100644 --- a/flink/v1.15/flink/src/main/java/org/apache/iceberg/flink/source/assigner/SimpleSplitAssignerFactory.java +++ b/flink/v1.15/flink/src/main/java/org/apache/iceberg/flink/source/assigner/SimpleSplitAssignerFactory.java @@ -16,15 +16,12 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.flink.source.assigner; import java.util.Collection; import org.apache.iceberg.flink.source.split.IcebergSourceSplitState; -/** - * Create simple assigner that hands out splits without any guarantee in order or locality. - */ +/** Create simple assigner that hands out splits without any guarantee in order or locality. 
*/ public class SimpleSplitAssignerFactory implements SplitAssignerFactory { @Override diff --git a/flink/v1.15/flink/src/main/java/org/apache/iceberg/flink/source/assigner/SplitAssigner.java b/flink/v1.15/flink/src/main/java/org/apache/iceberg/flink/source/assigner/SplitAssigner.java index 78ef02cad43a..b17a554f5e65 100644 --- a/flink/v1.15/flink/src/main/java/org/apache/iceberg/flink/source/assigner/SplitAssigner.java +++ b/flink/v1.15/flink/src/main/java/org/apache/iceberg/flink/source/assigner/SplitAssigner.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.flink.source.assigner; import java.io.Closeable; @@ -28,19 +27,19 @@ import org.apache.iceberg.flink.source.split.IcebergSourceSplitState; /** - * SplitAssigner interface is extracted out as a separate component so that - * we can plug in different split assignment strategy for different requirements. E.g. + * SplitAssigner interface is extracted out as a separate component so that we can plug in different + * split assignment strategy for different requirements. E.g. + * *

      - *
  <li>Simple assigner with no ordering guarantee or locality aware optimization.</li>
- *   <li>Locality aware assigner that prefer splits that are local.</li>
- *   <li>Snapshot aware assigner that assign splits based on the order they are committed.</li>
- *   <li>Event time alignment assigner that assign splits satisfying certain time ordering
- *   within a single source or across sources.</li>
- * </ul>
+ * <ul>
+ *   <li>Simple assigner with no ordering guarantee or locality aware optimization.
+ *   <li>Locality aware assigner that prefer splits that are local.
+ *   <li>Snapshot aware assigner that assign splits based on the order they are committed.
+ *   <li>Event time alignment assigner that assign splits satisfying certain time ordering within a
+ *       single source or across sources.
+ * </ul>
 *
- * <p>
- * Enumerator should call the assigner APIs from the coordinator thread.
- * This is to simplify the thread safety for assigner implementation.
+ * <p>
    Enumerator should call the assigner APIs from the coordinator thread. This is to simplify the + * thread safety for assigner implementation. */ public interface SplitAssigner extends Closeable { @@ -48,64 +47,54 @@ public interface SplitAssigner extends Closeable { * Some assigners may need to start background threads or perform other activity such as * registering as listeners to updates from other event sources e.g., watermark tracker. */ - default void start() { - } + default void start() {} /** - * Some assigners may need to perform certain actions - * when their corresponding enumerators are closed + * Some assigners may need to perform certain actions when their corresponding enumerators are + * closed */ @Override - default void close() { - } + default void close() {} /** - * Request a new split from the assigner - * when enumerator trying to assign splits to awaiting readers. - *

- * If enumerator wasn't able to assign the split (e.g., reader disconnected),
- * enumerator should call {@link SplitAssigner#onUnassignedSplits} to return the split.
+ * Request a new split from the assigner when enumerator trying to assign splits to awaiting
+ * readers.
+ *
+ * <p>
    If enumerator wasn't able to assign the split (e.g., reader disconnected), enumerator should + * call {@link SplitAssigner#onUnassignedSplits} to return the split. */ GetSplitResult getNext(@Nullable String hostname); - /** - * Add new splits discovered by enumerator - */ + /** Add new splits discovered by enumerator */ void onDiscoveredSplits(Collection splits); - /** - * Forward addSplitsBack event (for failed reader) to assigner - */ + /** Forward addSplitsBack event (for failed reader) to assigner */ void onUnassignedSplits(Collection splits); /** - * Some assigner (like event time alignment) may rack in-progress splits - * to advance watermark upon completed splits + * Some assigner (like event time alignment) may rack in-progress splits to advance watermark upon + * completed splits */ - default void onCompletedSplits(Collection completedSplitIds) { - } + default void onCompletedSplits(Collection completedSplitIds) {} /** - * Get assigner state for checkpointing. - * This is a super-set API that works for all currently imagined assigners. + * Get assigner state for checkpointing. This is a super-set API that works for all currently + * imagined assigners. */ Collection state(); /** - * Enumerator can get a notification via CompletableFuture - * when the assigner has more splits available later. - * Enumerator should schedule assignment in the thenAccept action of the future. - *

- * Assigner will return the same future if this method is called again
- * before the previous future is completed.
- * <p>
- * The future can be completed from other thread,
- * e.g. the coordinator thread from another thread
- * for event time alignment.
- * <p>
- * If enumerator need to trigger action upon the future completion,
- * it may want to run it in the coordinator thread
- * using {@link SplitEnumeratorContext#runInCoordinatorThread(Runnable)}.
+ * Enumerator can get a notification via CompletableFuture when the assigner has more splits
+ * available later. Enumerator should schedule assignment in the thenAccept action of the future.
+ *
+ * <p>Assigner will return the same future if this method is called again before the previous
+ * future is completed.
+ *
+ * <p>The future can be completed from other thread, e.g. the coordinator thread from another
+ * thread for event time alignment.
+ *
+ * <p>
    If enumerator need to trigger action upon the future completion, it may want to run it in + * the coordinator thread using {@link SplitEnumeratorContext#runInCoordinatorThread(Runnable)}. */ CompletableFuture isAvailable(); } diff --git a/flink/v1.15/flink/src/main/java/org/apache/iceberg/flink/source/assigner/SplitAssignerFactory.java b/flink/v1.15/flink/src/main/java/org/apache/iceberg/flink/source/assigner/SplitAssignerFactory.java index bc4ff0479f72..6e02a556ffcd 100644 --- a/flink/v1.15/flink/src/main/java/org/apache/iceberg/flink/source/assigner/SplitAssignerFactory.java +++ b/flink/v1.15/flink/src/main/java/org/apache/iceberg/flink/source/assigner/SplitAssignerFactory.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.flink.source.assigner; import java.io.Serializable; diff --git a/flink/v1.15/flink/src/main/java/org/apache/iceberg/flink/source/enumerator/AbstractIcebergEnumerator.java b/flink/v1.15/flink/src/main/java/org/apache/iceberg/flink/source/enumerator/AbstractIcebergEnumerator.java index 8c9be862bf33..3aca390755ed 100644 --- a/flink/v1.15/flink/src/main/java/org/apache/iceberg/flink/source/enumerator/AbstractIcebergEnumerator.java +++ b/flink/v1.15/flink/src/main/java/org/apache/iceberg/flink/source/enumerator/AbstractIcebergEnumerator.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.flink.source.enumerator; import java.io.IOException; @@ -38,9 +37,11 @@ import org.slf4j.LoggerFactory; /** - * TODO: publish enumerator monitor metrics like number of pending metrics after FLINK-21000 is resolved + * TODO: publish enumerator monitor metrics like number of pending metrics after FLINK-21000 is + * resolved */ -abstract class AbstractIcebergEnumerator implements SplitEnumerator { +abstract class AbstractIcebergEnumerator + implements SplitEnumerator { private static final Logger LOG = LoggerFactory.getLogger(AbstractIcebergEnumerator.class); private final SplitEnumeratorContext enumeratorContext; @@ -49,8 +50,7 @@ abstract class AbstractIcebergEnumerator implements SplitEnumerator> availableFuture; AbstractIcebergEnumerator( - SplitEnumeratorContext enumeratorContext, - SplitAssigner assigner) { + SplitEnumeratorContext enumeratorContext, SplitAssigner assigner) { this.enumeratorContext = enumeratorContext; this.assigner = assigner; this.readersAwaitingSplit = new LinkedHashMap<>(); @@ -70,29 +70,32 @@ public void close() throws IOException { @Override public void handleSplitRequest(int subtaskId, @Nullable String requesterHostname) { // Iceberg source uses custom split request event to piggyback finished split ids. 
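    // Readers are instead expected to send a SplitRequestEvent, which also piggybacks the ids of
    // splits they have finished; that event is handled in handleSourceEvent(...) below, so the
    // hostname-only default request cannot be served here.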
- throw new UnsupportedOperationException(String.format("Received invalid default split request event " + - "from subtask %d as Iceberg source uses custom split request event", subtaskId)); + throw new UnsupportedOperationException( + String.format( + "Received invalid default split request event " + + "from subtask %d as Iceberg source uses custom split request event", + subtaskId)); } @Override public void handleSourceEvent(int subtaskId, SourceEvent sourceEvent) { if (sourceEvent instanceof SplitRequestEvent) { - SplitRequestEvent splitRequestEvent = - (SplitRequestEvent) sourceEvent; + SplitRequestEvent splitRequestEvent = (SplitRequestEvent) sourceEvent; LOG.info("Received request split event from subtask {}", subtaskId); assigner.onCompletedSplits(splitRequestEvent.finishedSplitIds()); readersAwaitingSplit.put(subtaskId, splitRequestEvent.requesterHostname()); assignSplits(); } else { - throw new IllegalArgumentException(String.format("Received unknown event from subtask %d: %s", - subtaskId, sourceEvent.getClass().getCanonicalName())); + throw new IllegalArgumentException( + String.format( + "Received unknown event from subtask %d: %s", + subtaskId, sourceEvent.getClass().getCanonicalName())); } } @Override public void addSplitsBack(List splits, int subtaskId) { - LOG.info("Add {} splits back to the pool for failed subtask {}", - splits.size(), subtaskId); + LOG.info("Add {} splits back to the pool for failed subtask {}", splits.size(), subtaskId); assigner.onUnassignedSplits(splits); assignSplits(); } @@ -140,10 +143,7 @@ private void assignSplits() { } } - /** - * return true if enumerator should wait for splits - * like in the continuous enumerator case - */ + /** return true if enumerator should wait for splits like in the continuous enumerator case */ protected abstract boolean shouldWaitForMoreSplits(); private synchronized void getAvailableFutureIfNeeded() { @@ -151,18 +151,22 @@ private synchronized void getAvailableFutureIfNeeded() { return; } - CompletableFuture future = assigner.isAvailable() - .thenAccept(ignore -> - // Must run assignSplits in coordinator thread - // because the future may be completed from other threads. - // E.g., in event time alignment assigner, - // watermark advancement from another source may - // cause the available future to be completed - enumeratorContext.runInCoordinatorThread(() -> { - LOG.debug("Executing callback of assignSplits"); - availableFuture.set(null); - assignSplits(); - })); + CompletableFuture future = + assigner + .isAvailable() + .thenAccept( + ignore -> + // Must run assignSplits in coordinator thread + // because the future may be completed from other threads. 
+ // E.g., in event time alignment assigner, + // watermark advancement from another source may + // cause the available future to be completed + enumeratorContext.runInCoordinatorThread( + () -> { + LOG.debug("Executing callback of assignSplits"); + availableFuture.set(null); + assignSplits(); + })); availableFuture.set(future); LOG.debug("Registered callback for future available splits"); } diff --git a/flink/v1.15/flink/src/main/java/org/apache/iceberg/flink/source/enumerator/ContinuousEnumerationResult.java b/flink/v1.15/flink/src/main/java/org/apache/iceberg/flink/source/enumerator/ContinuousEnumerationResult.java index 8c20f2cf22bc..41863ffee60b 100644 --- a/flink/v1.15/flink/src/main/java/org/apache/iceberg/flink/source/enumerator/ContinuousEnumerationResult.java +++ b/flink/v1.15/flink/src/main/java/org/apache/iceberg/flink/source/enumerator/ContinuousEnumerationResult.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.flink.source.enumerator; import java.util.Collection; diff --git a/flink/v1.15/flink/src/main/java/org/apache/iceberg/flink/source/enumerator/ContinuousIcebergEnumerator.java b/flink/v1.15/flink/src/main/java/org/apache/iceberg/flink/source/enumerator/ContinuousIcebergEnumerator.java index e2b94b8c3e2b..d104f46fdaaf 100644 --- a/flink/v1.15/flink/src/main/java/org/apache/iceberg/flink/source/enumerator/ContinuousIcebergEnumerator.java +++ b/flink/v1.15/flink/src/main/java/org/apache/iceberg/flink/source/enumerator/ContinuousIcebergEnumerator.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.flink.source.enumerator; import java.io.IOException; @@ -42,8 +41,8 @@ public class ContinuousIcebergEnumerator extends AbstractIcebergEnumerator { private final ContinuousSplitPlanner splitPlanner; /** - * snapshotId for the last enumerated snapshot. next incremental enumeration - * should be based off this as the starting position. + * snapshotId for the last enumerated snapshot. next incremental enumeration should be based off + * this as the starting position. */ private final AtomicReference enumeratorPosition; @@ -91,30 +90,36 @@ public IcebergEnumeratorState snapshotState(long checkpointId) { return new IcebergEnumeratorState(enumeratorPosition.get(), assigner.state()); } - /** - * This method is executed in an IO thread pool. - */ + /** This method is executed in an IO thread pool. */ private ContinuousEnumerationResult discoverSplits() { return splitPlanner.planSplits(enumeratorPosition.get()); } - /** - * This method is executed in a single coordinator thread. - */ + /** This method is executed in a single coordinator thread. */ private void processDiscoveredSplits(ContinuousEnumerationResult result, Throwable error) { if (error == null) { if (!Objects.equals(result.fromPosition(), enumeratorPosition.get())) { - // Multiple discoverSplits() may be triggered with the same starting snapshot to the I/O thread pool. - // E.g., the splitDiscoveryInterval is very short (like 10 ms in some unit tests) or the thread - // pool is busy and multiple discovery actions are executed concurrently. Discovery result should - // only be accepted if the starting position matches the enumerator position (like compare-and-swap). 
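      // Concretely (hypothetical positions): two concurrent discoveries may both start from
      // position P1; the first result to arrive advances enumeratorPosition to P2, so a second
      // result still reporting fromPosition = P1 no longer matches and is skipped below rather
      // than being applied twice.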
- LOG.info("Skip {} discovered splits because the scan starting position doesn't match " + - "the current enumerator position: enumerator position = {}, scan starting position = {}", - result.splits().size(), enumeratorPosition.get(), result.fromPosition()); + // Multiple discoverSplits() may be triggered with the same starting snapshot to the I/O + // thread pool. + // E.g., the splitDiscoveryInterval is very short (like 10 ms in some unit tests) or the + // thread + // pool is busy and multiple discovery actions are executed concurrently. Discovery result + // should + // only be accepted if the starting position matches the enumerator position (like + // compare-and-swap). + LOG.info( + "Skip {} discovered splits because the scan starting position doesn't match " + + "the current enumerator position: enumerator position = {}, scan starting position = {}", + result.splits().size(), + enumeratorPosition.get(), + result.fromPosition()); } else { assigner.onDiscoveredSplits(result.splits()); - LOG.info("Added {} splits discovered between ({}, {}] to the assigner", - result.splits().size(), result.fromPosition(), result.toPosition()); + LOG.info( + "Added {} splits discovered between ({}, {}] to the assigner", + result.splits().size(), + result.fromPosition(), + result.toPosition()); // update the enumerator position even if there is no split discovered // or the toPosition is empty (e.g. for empty table). enumeratorPosition.set(result.toPosition()); diff --git a/flink/v1.15/flink/src/main/java/org/apache/iceberg/flink/source/enumerator/ContinuousSplitPlanner.java b/flink/v1.15/flink/src/main/java/org/apache/iceberg/flink/source/enumerator/ContinuousSplitPlanner.java index 1737ae6a5023..2a1325178873 100644 --- a/flink/v1.15/flink/src/main/java/org/apache/iceberg/flink/source/enumerator/ContinuousSplitPlanner.java +++ b/flink/v1.15/flink/src/main/java/org/apache/iceberg/flink/source/enumerator/ContinuousSplitPlanner.java @@ -16,20 +16,15 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.flink.source.enumerator; import java.io.Closeable; import org.apache.flink.annotation.Internal; -/** - * This interface is introduced so that we can plug in different split planner for unit test - */ +/** This interface is introduced so that we can plug in different split planner for unit test */ @Internal public interface ContinuousSplitPlanner extends Closeable { - /** - * Discover the files appended between {@code lastPosition} and current table snapshot - */ + /** Discover the files appended between {@code lastPosition} and current table snapshot */ ContinuousEnumerationResult planSplits(IcebergEnumeratorPosition lastPosition); } diff --git a/flink/v1.15/flink/src/main/java/org/apache/iceberg/flink/source/enumerator/ContinuousSplitPlannerImpl.java b/flink/v1.15/flink/src/main/java/org/apache/iceberg/flink/source/enumerator/ContinuousSplitPlannerImpl.java index 2bbaaf940b63..a3ac8549909d 100644 --- a/flink/v1.15/flink/src/main/java/org/apache/iceberg/flink/source/enumerator/ContinuousSplitPlannerImpl.java +++ b/flink/v1.15/flink/src/main/java/org/apache/iceberg/flink/source/enumerator/ContinuousSplitPlannerImpl.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.flink.source.enumerator; import java.io.IOException; @@ -48,15 +47,18 @@ public class ContinuousSplitPlannerImpl implements ContinuousSplitPlanner { private final ExecutorService workerPool; /** - * @param threadName thread name prefix for worker pool to run the split planning. - * If null, a shared worker pool will be used. + * @param threadName thread name prefix for worker pool to run the split planning. If null, a + * shared worker pool will be used. */ public ContinuousSplitPlannerImpl(Table table, ScanContext scanContext, String threadName) { this.table = table; this.scanContext = scanContext; this.isSharedPool = threadName == null; - this.workerPool = isSharedPool ? ThreadPools.getWorkerPool() - : ThreadPools.newWorkerPool("iceberg-plan-worker-pool-" + threadName, scanContext.planParallelism()); + this.workerPool = + isSharedPool + ? ThreadPools.getWorkerPool() + : ThreadPools.newWorkerPool( + "iceberg-plan-worker-pool-" + threadName, scanContext.planParallelism()); } @Override @@ -76,63 +78,75 @@ public ContinuousEnumerationResult planSplits(IcebergEnumeratorPosition lastPosi } } - /** - * Discover incremental changes between @{code lastPosition} and current table snapshot - */ - private ContinuousEnumerationResult discoverIncrementalSplits(IcebergEnumeratorPosition lastPosition) { + /** Discover incremental changes between @{code lastPosition} and current table snapshot */ + private ContinuousEnumerationResult discoverIncrementalSplits( + IcebergEnumeratorPosition lastPosition) { Snapshot currentSnapshot = table.currentSnapshot(); if (currentSnapshot == null) { // empty table - Preconditions.checkArgument(lastPosition.snapshotId() == null, + Preconditions.checkArgument( + lastPosition.snapshotId() == null, "Invalid last enumerated position for an empty table: not null"); LOG.info("Skip incremental scan because table is empty"); return new ContinuousEnumerationResult(Collections.emptyList(), lastPosition, lastPosition); - } else if (lastPosition.snapshotId() != null && currentSnapshot.snapshotId() == lastPosition.snapshotId()) { + } else if (lastPosition.snapshotId() != null + && currentSnapshot.snapshotId() == lastPosition.snapshotId()) { LOG.info("Current table snapshot is already enumerated: {}", currentSnapshot.snapshotId()); return new ContinuousEnumerationResult(Collections.emptyList(), lastPosition, lastPosition); } else { - IcebergEnumeratorPosition newPosition = IcebergEnumeratorPosition.of( - currentSnapshot.snapshotId(), currentSnapshot.timestampMillis()); - ScanContext incrementalScan = scanContext - .copyWithAppendsBetween(lastPosition.snapshotId(), currentSnapshot.snapshotId()); - List splits = FlinkSplitPlanner.planIcebergSourceSplits(table, incrementalScan, workerPool); - LOG.info("Discovered {} splits from incremental scan: " + - "from snapshot (exclusive) is {}, to snapshot (inclusive) is {}", - splits.size(), lastPosition, newPosition); + IcebergEnumeratorPosition newPosition = + IcebergEnumeratorPosition.of( + currentSnapshot.snapshotId(), currentSnapshot.timestampMillis()); + ScanContext incrementalScan = + scanContext.copyWithAppendsBetween( + lastPosition.snapshotId(), currentSnapshot.snapshotId()); + List splits = + FlinkSplitPlanner.planIcebergSourceSplits(table, incrementalScan, workerPool); + LOG.info( + "Discovered {} splits from incremental scan: " + + "from snapshot (exclusive) is {}, to snapshot (inclusive) is {}", + splits.size(), + lastPosition, + newPosition); return new ContinuousEnumerationResult(splits, 
lastPosition, newPosition); } } /** * Discovery initial set of splits based on {@link StreamingStartingStrategy}. - * - *

<li>{@link ContinuousEnumerationResult#splits()} should contain initial splits
- * discovered from table scan for {@link StreamingStartingStrategy#TABLE_SCAN_THEN_INCREMENTAL}.
- * For all other strategies, splits collection should be empty.
- * <li>{@link ContinuousEnumerationResult#toPosition()} points to the starting position
- * for the next incremental split discovery with exclusive behavior. Meaning files committed
- * by the snapshot from the position in {@code ContinuousEnumerationResult} won't be included
- * in the next incremental scan.
+ * <li>{@link ContinuousEnumerationResult#splits()} should contain initial splits discovered from
+ * table scan for {@link StreamingStartingStrategy#TABLE_SCAN_THEN_INCREMENTAL}. For all other
+ * strategies, splits collection should be empty.
+ * <li>
  • {@link ContinuousEnumerationResult#toPosition()} points to the starting position for the + * next incremental split discovery with exclusive behavior. Meaning files committed by the + * snapshot from the position in {@code ContinuousEnumerationResult} won't be included in the + * next incremental scan. */ private ContinuousEnumerationResult discoverInitialSplits() { Optional startSnapshotOptional = startSnapshot(table, scanContext); if (!startSnapshotOptional.isPresent()) { - return new ContinuousEnumerationResult(Collections.emptyList(), null, - IcebergEnumeratorPosition.empty()); + return new ContinuousEnumerationResult( + Collections.emptyList(), null, IcebergEnumeratorPosition.empty()); } Snapshot startSnapshot = startSnapshotOptional.get(); - LOG.info("Get starting snapshot id {} based on strategy {}", - startSnapshot.snapshotId(), scanContext.streamingStartingStrategy()); + LOG.info( + "Get starting snapshot id {} based on strategy {}", + startSnapshot.snapshotId(), + scanContext.streamingStartingStrategy()); List splits; IcebergEnumeratorPosition toPosition; - if (scanContext.streamingStartingStrategy() == StreamingStartingStrategy.TABLE_SCAN_THEN_INCREMENTAL) { + if (scanContext.streamingStartingStrategy() + == StreamingStartingStrategy.TABLE_SCAN_THEN_INCREMENTAL) { // do a batch table scan first splits = FlinkSplitPlanner.planIcebergSourceSplits(table, scanContext, workerPool); - LOG.info("Discovered {} splits from initial batch table scan with snapshot Id {}", - splits.size(), startSnapshot.snapshotId()); + LOG.info( + "Discovered {} splits from initial batch table scan with snapshot Id {}", + splits.size(), + startSnapshot.snapshotId()); // For TABLE_SCAN_THEN_INCREMENTAL, incremental mode starts exclusive from the startSnapshot - toPosition = IcebergEnumeratorPosition.of(startSnapshot.snapshotId(), startSnapshot.timestampMillis()); + toPosition = + IcebergEnumeratorPosition.of(startSnapshot.snapshotId(), startSnapshot.timestampMillis()); } else { // For all other modes, starting snapshot should be consumed inclusively. // Use parentId to achieve the inclusive behavior. It is fine if parentId is null. @@ -140,24 +154,29 @@ private ContinuousEnumerationResult discoverInitialSplits() { Long parentSnapshotId = startSnapshot.parentId(); if (parentSnapshotId != null) { Snapshot parentSnapshot = table.snapshot(parentSnapshotId); - Long parentSnapshotTimestampMs = parentSnapshot != null ? parentSnapshot.timestampMillis() : null; + Long parentSnapshotTimestampMs = + parentSnapshot != null ? parentSnapshot.timestampMillis() : null; toPosition = IcebergEnumeratorPosition.of(parentSnapshotId, parentSnapshotTimestampMs); } else { toPosition = IcebergEnumeratorPosition.empty(); } - LOG.info("Start incremental scan with start snapshot (inclusive): id = {}, timestamp = {}", - startSnapshot.snapshotId(), startSnapshot.timestampMillis()); + LOG.info( + "Start incremental scan with start snapshot (inclusive): id = {}, timestamp = {}", + startSnapshot.snapshotId(), + startSnapshot.timestampMillis()); } return new ContinuousEnumerationResult(splits, null, toPosition); } /** - * Calculate the starting snapshot based on the {@link StreamingStartingStrategy} defined in {@code ScanContext}. - *

- * If the {@link StreamingStartingStrategy} is not {@link StreamingStartingStrategy#TABLE_SCAN_THEN_INCREMENTAL},
- * the start snapshot should be consumed inclusively.
+ * Calculate the starting snapshot based on the {@link StreamingStartingStrategy} defined in
+ * {@code ScanContext}.
+ *
+ * <p>
    If the {@link StreamingStartingStrategy} is not {@link + * StreamingStartingStrategy#TABLE_SCAN_THEN_INCREMENTAL}, the start snapshot should be consumed + * inclusively. */ @VisibleForTesting static Optional startSnapshot(Table table, ScanContext scanContext) { @@ -169,21 +188,25 @@ static Optional startSnapshot(Table table, ScanContext scanContext) { return Optional.ofNullable(SnapshotUtil.oldestAncestor(table)); case INCREMENTAL_FROM_SNAPSHOT_ID: Snapshot matchedSnapshotById = table.snapshot(scanContext.startSnapshotId()); - Preconditions.checkArgument(matchedSnapshotById != null, + Preconditions.checkArgument( + matchedSnapshotById != null, "Start snapshot id not found in history: " + scanContext.startSnapshotId()); return Optional.of(matchedSnapshotById); case INCREMENTAL_FROM_SNAPSHOT_TIMESTAMP: - long snapshotIdAsOfTime = SnapshotUtil.snapshotIdAsOfTime(table, scanContext.startSnapshotTimestamp()); + long snapshotIdAsOfTime = + SnapshotUtil.snapshotIdAsOfTime(table, scanContext.startSnapshotTimestamp()); Snapshot matchedSnapshotByTimestamp = table.snapshot(snapshotIdAsOfTime); if (matchedSnapshotByTimestamp.timestampMillis() == scanContext.startSnapshotTimestamp()) { return Optional.of(matchedSnapshotByTimestamp); } else { - // if the snapshotIdAsOfTime has the timestamp value smaller than the scanContext.startSnapshotTimestamp(), + // if the snapshotIdAsOfTime has the timestamp value smaller than the + // scanContext.startSnapshotTimestamp(), // return the child snapshot whose timestamp value is larger return Optional.of(SnapshotUtil.snapshotAfter(table, snapshotIdAsOfTime)); } default: - throw new IllegalArgumentException("Unknown starting strategy: " + scanContext.streamingStartingStrategy()); + throw new IllegalArgumentException( + "Unknown starting strategy: " + scanContext.streamingStartingStrategy()); } } } diff --git a/flink/v1.15/flink/src/main/java/org/apache/iceberg/flink/source/enumerator/IcebergEnumeratorPosition.java b/flink/v1.15/flink/src/main/java/org/apache/iceberg/flink/source/enumerator/IcebergEnumeratorPosition.java index e024473da3c9..96aba296f8cf 100644 --- a/flink/v1.15/flink/src/main/java/org/apache/iceberg/flink/source/enumerator/IcebergEnumeratorPosition.java +++ b/flink/v1.15/flink/src/main/java/org/apache/iceberg/flink/source/enumerator/IcebergEnumeratorPosition.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.flink.source.enumerator; import org.apache.iceberg.relocated.com.google.common.base.MoreObjects; @@ -62,9 +61,7 @@ public String toString() { @Override public int hashCode() { - return Objects.hashCode( - snapshotId, - snapshotTimestampMs); + return Objects.hashCode(snapshotId, snapshotTimestampMs); } @Override @@ -76,7 +73,7 @@ public boolean equals(Object o) { return false; } IcebergEnumeratorPosition other = (IcebergEnumeratorPosition) o; - return Objects.equal(snapshotId, other.snapshotId()) && - Objects.equal(snapshotTimestampMs, other.snapshotTimestampMs()); + return Objects.equal(snapshotId, other.snapshotId()) + && Objects.equal(snapshotTimestampMs, other.snapshotTimestampMs()); } } diff --git a/flink/v1.15/flink/src/main/java/org/apache/iceberg/flink/source/enumerator/IcebergEnumeratorPositionSerializer.java b/flink/v1.15/flink/src/main/java/org/apache/iceberg/flink/source/enumerator/IcebergEnumeratorPositionSerializer.java index 83b230e80e08..1c63807361c5 100644 --- a/flink/v1.15/flink/src/main/java/org/apache/iceberg/flink/source/enumerator/IcebergEnumeratorPositionSerializer.java +++ b/flink/v1.15/flink/src/main/java/org/apache/iceberg/flink/source/enumerator/IcebergEnumeratorPositionSerializer.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.flink.source.enumerator; import java.io.IOException; @@ -24,9 +23,11 @@ import org.apache.flink.core.memory.DataInputDeserializer; import org.apache.flink.core.memory.DataOutputSerializer; -class IcebergEnumeratorPositionSerializer implements SimpleVersionedSerializer { +class IcebergEnumeratorPositionSerializer + implements SimpleVersionedSerializer { - public static final IcebergEnumeratorPositionSerializer INSTANCE = new IcebergEnumeratorPositionSerializer(); + public static final IcebergEnumeratorPositionSerializer INSTANCE = + new IcebergEnumeratorPositionSerializer(); private static final int VERSION = 1; diff --git a/flink/v1.15/flink/src/main/java/org/apache/iceberg/flink/source/enumerator/IcebergEnumeratorState.java b/flink/v1.15/flink/src/main/java/org/apache/iceberg/flink/source/enumerator/IcebergEnumeratorState.java index bd2f44c0059b..7913f7b4350e 100644 --- a/flink/v1.15/flink/src/main/java/org/apache/iceberg/flink/source/enumerator/IcebergEnumeratorState.java +++ b/flink/v1.15/flink/src/main/java/org/apache/iceberg/flink/source/enumerator/IcebergEnumeratorState.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.flink.source.enumerator; import java.io.Serializable; @@ -24,12 +23,9 @@ import javax.annotation.Nullable; import org.apache.iceberg.flink.source.split.IcebergSourceSplitState; -/** - * Enumerator state for checkpointing - */ +/** Enumerator state for checkpointing */ public class IcebergEnumeratorState implements Serializable { - @Nullable - private final IcebergEnumeratorPosition lastEnumeratedPosition; + @Nullable private final IcebergEnumeratorPosition lastEnumeratedPosition; private final Collection pendingSplits; public IcebergEnumeratorState(Collection pendingSplits) { diff --git a/flink/v1.15/flink/src/main/java/org/apache/iceberg/flink/source/enumerator/IcebergEnumeratorStateSerializer.java b/flink/v1.15/flink/src/main/java/org/apache/iceberg/flink/source/enumerator/IcebergEnumeratorStateSerializer.java index 8f020bbe539e..f72804363894 100644 --- a/flink/v1.15/flink/src/main/java/org/apache/iceberg/flink/source/enumerator/IcebergEnumeratorStateSerializer.java +++ b/flink/v1.15/flink/src/main/java/org/apache/iceberg/flink/source/enumerator/IcebergEnumeratorStateSerializer.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.flink.source.enumerator; import java.io.IOException; @@ -32,17 +31,21 @@ import org.apache.iceberg.relocated.com.google.common.collect.Lists; @Internal -public class IcebergEnumeratorStateSerializer implements SimpleVersionedSerializer { +public class IcebergEnumeratorStateSerializer + implements SimpleVersionedSerializer { - public static final IcebergEnumeratorStateSerializer INSTANCE = new IcebergEnumeratorStateSerializer(); + public static final IcebergEnumeratorStateSerializer INSTANCE = + new IcebergEnumeratorStateSerializer(); private static final int VERSION = 1; private static final ThreadLocal SERIALIZER_CACHE = ThreadLocal.withInitial(() -> new DataOutputSerializer(1024)); - private final IcebergEnumeratorPositionSerializer positionSerializer = IcebergEnumeratorPositionSerializer.INSTANCE; - private final IcebergSourceSplitSerializer splitSerializer = IcebergSourceSplitSerializer.INSTANCE; + private final IcebergEnumeratorPositionSerializer positionSerializer = + IcebergEnumeratorPositionSerializer.INSTANCE; + private final IcebergSourceSplitSerializer splitSerializer = + IcebergSourceSplitSerializer.INSTANCE; @Override public int getVersion() { @@ -108,7 +111,8 @@ private IcebergEnumeratorState deserializeV1(byte[] serialized) throws IOExcepti in.read(splitBytes); IcebergSourceSplit split = splitSerializer.deserialize(splitSerializerVersion, splitBytes); String statusName = in.readUTF(); - pendingSplits.add(new IcebergSourceSplitState(split, IcebergSourceSplitStatus.valueOf(statusName))); + pendingSplits.add( + new IcebergSourceSplitState(split, IcebergSourceSplitStatus.valueOf(statusName))); } return new IcebergEnumeratorState(enumeratorPosition, pendingSplits); } diff --git a/flink/v1.15/flink/src/main/java/org/apache/iceberg/flink/source/enumerator/StaticIcebergEnumerator.java b/flink/v1.15/flink/src/main/java/org/apache/iceberg/flink/source/enumerator/StaticIcebergEnumerator.java index 8d33b6f0734d..1897f2368a41 100644 --- a/flink/v1.15/flink/src/main/java/org/apache/iceberg/flink/source/enumerator/StaticIcebergEnumerator.java +++ b/flink/v1.15/flink/src/main/java/org/apache/iceberg/flink/source/enumerator/StaticIcebergEnumerator.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.flink.source.enumerator; import java.util.List; @@ -34,9 +33,7 @@ import org.slf4j.Logger; import org.slf4j.LoggerFactory; -/** - * One-time split enumeration at the start-up for batch execution - */ +/** One-time split enumeration at the start-up for batch execution */ @Internal public class StaticIcebergEnumerator extends AbstractIcebergEnumerator { private static final Logger LOG = LoggerFactory.getLogger(StaticIcebergEnumerator.class); @@ -64,18 +61,26 @@ public StaticIcebergEnumerator( public void start() { super.start(); if (shouldEnumerate) { - // Ideally, operatorId should be used as the threadPoolName as Flink guarantees its uniqueness within a job. - // SplitEnumeratorContext doesn't expose the OperatorCoordinator.Context, which would contain the OperatorID. - // Need to discuss with Flink community whether it is ok to expose a public API like the protected method - // "OperatorCoordinator.Context getCoordinatorContext()" from SourceCoordinatorContext implementation. + // Ideally, operatorId should be used as the threadPoolName as Flink guarantees its uniqueness + // within a job. + // SplitEnumeratorContext doesn't expose the OperatorCoordinator.Context, which would contain + // the OperatorID. + // Need to discuss with Flink community whether it is ok to expose a public API like the + // protected method + // "OperatorCoordinator.Context getCoordinatorContext()" from SourceCoordinatorContext + // implementation. // For now,

<table name>-<random UUID> is used as the unique thread pool name. String threadName = "iceberg-plan-worker-pool-" + table.name() + "-" + UUID.randomUUID(); - ExecutorService workerPool = ThreadPools.newWorkerPool(threadName, scanContext.planParallelism()); + ExecutorService workerPool = + ThreadPools.newWorkerPool(threadName, scanContext.planParallelism()); try { - List splits = FlinkSplitPlanner.planIcebergSourceSplits(table, scanContext, workerPool); + List splits = + FlinkSplitPlanner.planIcebergSourceSplits(table, scanContext, workerPool); assigner.onDiscoveredSplits(splits); - LOG.info("Discovered {} splits from table {} during job initialization", - splits.size(), table.name()); + LOG.info( + "Discovered {} splits from table {} during job initialization", + splits.size(), + table.name()); } finally { workerPool.shutdown(); } diff --git a/flink/v1.15/flink/src/main/java/org/apache/iceberg/flink/source/reader/ArrayBatchRecords.java b/flink/v1.15/flink/src/main/java/org/apache/iceberg/flink/source/reader/ArrayBatchRecords.java index f4e23a09a9f2..7b94c364c976 100644 --- a/flink/v1.15/flink/src/main/java/org/apache/iceberg/flink/source/reader/ArrayBatchRecords.java +++ b/flink/v1.15/flink/src/main/java/org/apache/iceberg/flink/source/reader/ArrayBatchRecords.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.flink.source.reader; import java.util.Collections; @@ -33,22 +32,20 @@ * {@link RecordsWithSplitIds} is used to pass a batch of records from fetcher to source reader. * Batching is to improve the efficiency for records handover. * - * {@link RecordsWithSplitIds} interface can encapsulate batches from multiple splits. - * This is the case for Kafka source where fetchers can retrieve records from multiple - * Kafka partitions at the same time. + * <p>{@link RecordsWithSplitIds} interface can encapsulate batches from multiple splits. This is + * the case for Kafka source where fetchers can retrieve records from multiple Kafka partitions at + * the same time. * - * For file-based sources like Iceberg, readers always read one split/file at a time. - * Hence, we will only have a batch of records for one split here. + * <p>For file-based sources like Iceberg, readers always read one split/file at a time. Hence, we + * will only have a batch of records for one split here. * - * This class uses array to store a batch of records from the same file (with the same fileOffset). + * <p>This class uses array to store a batch of records from the same file (with the same + * fileOffset). */ class ArrayBatchRecords implements RecordsWithSplitIds> { - @Nullable - private String splitId; - @Nullable - private final Pool.Recycler recycler; - @Nullable - private final T[] records; + @Nullable private String splitId; + @Nullable private final Pool.Recycler recycler; + @Nullable private final T[] records; private final int numberOfRecords; private final Set finishedSplits; private final RecordAndPosition recordAndPosition; @@ -57,8 +54,13 @@ class ArrayBatchRecords implements RecordsWithSplitIds> private int position; private ArrayBatchRecords( - @Nullable String splitId, @Nullable Pool.Recycler recycler, @Nullable T[] records, - int numberOfRecords, int fileOffset, long startingRecordOffset, Set finishedSplits) { + @Nullable String splitId, + @Nullable Pool.Recycler recycler, + @Nullable T[] records, + int numberOfRecords, + int fileOffset, + long startingRecordOffset, + Set finishedSplits) { Preconditions.checkArgument(numberOfRecords >= 0, "numberOfRecords can't be negative"); Preconditions.checkArgument(fileOffset >= 0, "fileOffset can't be negative"); Preconditions.checkArgument(startingRecordOffset >= 0, "numberOfRecords can't be negative"); @@ -67,7 +69,8 @@ private ArrayBatchRecords( this.recycler = recycler; this.records = records; this.numberOfRecords = numberOfRecords; - this.finishedSplits = Preconditions.checkNotNull(finishedSplits, "finishedSplits can be empty but not null"); + this.finishedSplits = + Preconditions.checkNotNull(finishedSplits, "finishedSplits can be empty but not null"); this.recordAndPosition = new RecordAndPosition<>(); recordAndPosition.set(null, fileOffset, startingRecordOffset); @@ -97,8 +100,8 @@ public RecordAndPosition nextRecordFromSplit() { } /** - * This method is called when all records from this batch has been emitted. - * If recycler is set, it should be called to return the records array back to pool. + * This method is called when all records from this batch has been emitted. If recycler is set, it + * should be called to return the records array back to pool. */ @Override public void recycle() { @@ -125,15 +128,15 @@ int numberOfRecords() { /** * Create a ArrayBatchRecords backed up an array with records from the same file * - * @param splitId Iceberg source only read from one split a time. - * We never have multiple records from multiple splits. - * @param recycler Because {@link DataIterator} with {@link RowData} returns an iterator of reused RowData object, - * we need to clone RowData eagerly when constructing a batch of records. - * We can use object pool to reuse the RowData array object which can be expensive to create. - * This recycler can be provided to recycle the array object back to pool after read is exhausted. - * If the {@link DataIterator} returns an iterator of non-reused objects, - * we don't need to clone objects. It is cheap to just create the batch array. - * Hence, we don't need object pool and recycler can be set to null. + * @param splitId Iceberg source only read from one split a time. We never have multiple records + * from multiple splits. + * @param recycler Because {@link DataIterator} with {@link RowData} returns an iterator of reused + * RowData object, we need to clone RowData eagerly when constructing a batch of records. We + * can use object pool to reuse the RowData array object which can be expensive to create.
+ * This recycler can be provided to recycle the array object back to pool after read is + * exhausted. If the {@link DataIterator} returns an iterator of non-reused objects, we don't + * need to clone objects. It is cheap to just create the batch array. Hence, we don't need + * object pool and recycler can be set to null. * @param records an array (maybe reused) holding a batch of records * @param numberOfRecords actual number of records in the array * @param fileOffset fileOffset for all records in this batch @@ -141,10 +144,20 @@ int numberOfRecords() { * @param record type */ public static ArrayBatchRecords forRecords( - String splitId, Pool.Recycler recycler, T[] records, int numberOfRecords, - int fileOffset, long startingRecordOffset) { - return new ArrayBatchRecords<>(splitId, recycler, records, numberOfRecords, - fileOffset, startingRecordOffset, Collections.emptySet()); + String splitId, + Pool.Recycler recycler, + T[] records, + int numberOfRecords, + int fileOffset, + long startingRecordOffset) { + return new ArrayBatchRecords<>( + splitId, + recycler, + records, + numberOfRecords, + fileOffset, + startingRecordOffset, + Collections.emptySet()); } /** @@ -153,7 +166,6 @@ public static ArrayBatchRecords forRecords( * @param splitId for the split that is just exhausted */ public static ArrayBatchRecords finishedSplit(String splitId) { - return new ArrayBatchRecords<>(null, null, null, - 0, 0, 0, Collections.singleton(splitId)); + return new ArrayBatchRecords<>(null, null, null, 0, 0, 0, Collections.singleton(splitId)); } } diff --git a/flink/v1.15/flink/src/main/java/org/apache/iceberg/flink/source/reader/ArrayPoolDataIteratorBatcher.java b/flink/v1.15/flink/src/main/java/org/apache/iceberg/flink/source/reader/ArrayPoolDataIteratorBatcher.java index 22da79ebf1e0..306afd1811be 100644 --- a/flink/v1.15/flink/src/main/java/org/apache/iceberg/flink/source/reader/ArrayPoolDataIteratorBatcher.java +++ b/flink/v1.15/flink/src/main/java/org/apache/iceberg/flink/source/reader/ArrayPoolDataIteratorBatcher.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.flink.source.reader; import java.io.IOException; @@ -30,9 +29,7 @@ import org.apache.iceberg.io.CloseableIterator; import org.apache.iceberg.relocated.com.google.common.base.Preconditions; -/** - * This implementation stores record batch in array from recyclable pool - */ +/** This implementation stores record batch in array from recyclable pool */ class ArrayPoolDataIteratorBatcher implements DataIteratorBatcher { private final int batchSize; private final int handoverQueueSize; @@ -67,7 +64,8 @@ private Pool createPoolOfBatches(int numBatches) { return poolOfBatches; } - private class ArrayPoolBatchIterator implements CloseableIterator>> { + private class ArrayPoolBatchIterator + implements CloseableIterator>> { private final String splitId; private final DataIterator inputIterator; @@ -106,8 +104,13 @@ public RecordsWithSplitIds> next() { } } - return ArrayBatchRecords.forRecords(splitId, pool.recycler(), batch, recordCount, - inputIterator.fileOffset(), inputIterator.recordOffset() - recordCount); + return ArrayBatchRecords.forRecords( + splitId, + pool.recycler(), + batch, + recordCount, + inputIterator.fileOffset(), + inputIterator.recordOffset() - recordCount); } @Override diff --git a/flink/v1.15/flink/src/main/java/org/apache/iceberg/flink/source/reader/DataIteratorBatcher.java b/flink/v1.15/flink/src/main/java/org/apache/iceberg/flink/source/reader/DataIteratorBatcher.java index f95a7f95e669..c376e359c600 100644 --- a/flink/v1.15/flink/src/main/java/org/apache/iceberg/flink/source/reader/DataIteratorBatcher.java +++ b/flink/v1.15/flink/src/main/java/org/apache/iceberg/flink/source/reader/DataIteratorBatcher.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.flink.source.reader; import java.io.Serializable; @@ -26,10 +25,12 @@ import org.apache.iceberg.io.CloseableIterator; /** - * Batcher converts iterator of T into iterator of batched {@code RecordsWithSplitIds>}, - * as FLIP-27's {@link SplitReader#fetch()} returns batched records. + * Batcher converts iterator of T into iterator of batched {@code + * RecordsWithSplitIds>}, as FLIP-27's {@link SplitReader#fetch()} returns + * batched records. */ @FunctionalInterface public interface DataIteratorBatcher extends Serializable { - CloseableIterator>> batch(String splitId, DataIterator inputIterator); + CloseableIterator>> batch( + String splitId, DataIterator inputIterator); } diff --git a/flink/v1.15/flink/src/main/java/org/apache/iceberg/flink/source/reader/DataIteratorReaderFunction.java b/flink/v1.15/flink/src/main/java/org/apache/iceberg/flink/source/reader/DataIteratorReaderFunction.java index 5c2c2b4b3ae8..bbf797ef4aa8 100644 --- a/flink/v1.15/flink/src/main/java/org/apache/iceberg/flink/source/reader/DataIteratorReaderFunction.java +++ b/flink/v1.15/flink/src/main/java/org/apache/iceberg/flink/source/reader/DataIteratorReaderFunction.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.flink.source.reader; import org.apache.flink.connector.base.source.reader.RecordsWithSplitIds; @@ -24,9 +23,7 @@ import org.apache.iceberg.flink.source.split.IcebergSourceSplit; import org.apache.iceberg.io.CloseableIterator; -/** - * A {@link ReaderFunction} implementation that uses {@link DataIterator}. - */ +/** A {@link ReaderFunction} implementation that uses {@link DataIterator}. 
*/ public abstract class DataIteratorReaderFunction implements ReaderFunction { private final DataIteratorBatcher batcher; @@ -37,10 +34,10 @@ public DataIteratorReaderFunction(DataIteratorBatcher batcher) { protected abstract DataIterator createDataIterator(IcebergSourceSplit split); @Override - public CloseableIterator>> apply(IcebergSourceSplit split) { + public CloseableIterator>> apply( + IcebergSourceSplit split) { DataIterator inputIterator = createDataIterator(split); inputIterator.seek(split.fileOffset(), split.recordOffset()); return batcher.batch(split.splitId(), inputIterator); } - } diff --git a/flink/v1.15/flink/src/main/java/org/apache/iceberg/flink/source/reader/IcebergSourceReader.java b/flink/v1.15/flink/src/main/java/org/apache/iceberg/flink/source/reader/IcebergSourceReader.java index 44d3ba572dca..db8d2fcd56e3 100644 --- a/flink/v1.15/flink/src/main/java/org/apache/iceberg/flink/source/reader/IcebergSourceReader.java +++ b/flink/v1.15/flink/src/main/java/org/apache/iceberg/flink/source/reader/IcebergSourceReader.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.flink.source.reader; import java.util.Collection; @@ -28,13 +27,12 @@ import org.apache.iceberg.flink.source.split.SplitRequestEvent; import org.apache.iceberg.relocated.com.google.common.collect.Lists; -public class IcebergSourceReader extends - SingleThreadMultiplexSourceReaderBase, T, IcebergSourceSplit, IcebergSourceSplit> { +public class IcebergSourceReader + extends SingleThreadMultiplexSourceReaderBase< + RecordAndPosition, T, IcebergSourceSplit, IcebergSourceSplit> { public IcebergSourceReader( - ReaderFunction readerFunction, - SourceReaderContext context, - ReaderMetricsContext metrics) { + ReaderFunction readerFunction, SourceReaderContext context, ReaderMetricsContext metrics) { super( () -> new IcebergSourceSplitReader<>(readerFunction, context, metrics), new IcebergSourceRecordEmitter<>(), diff --git a/flink/v1.15/flink/src/main/java/org/apache/iceberg/flink/source/reader/IcebergSourceRecordEmitter.java b/flink/v1.15/flink/src/main/java/org/apache/iceberg/flink/source/reader/IcebergSourceRecordEmitter.java index 4e467db92e93..337d9d3c4223 100644 --- a/flink/v1.15/flink/src/main/java/org/apache/iceberg/flink/source/reader/IcebergSourceRecordEmitter.java +++ b/flink/v1.15/flink/src/main/java/org/apache/iceberg/flink/source/reader/IcebergSourceRecordEmitter.java @@ -16,23 +16,20 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.flink.source.reader; import org.apache.flink.api.connector.source.SourceOutput; import org.apache.flink.connector.base.source.reader.RecordEmitter; import org.apache.iceberg.flink.source.split.IcebergSourceSplit; -final class IcebergSourceRecordEmitter implements RecordEmitter, T, IcebergSourceSplit> { +final class IcebergSourceRecordEmitter + implements RecordEmitter, T, IcebergSourceSplit> { - IcebergSourceRecordEmitter() { - } + IcebergSourceRecordEmitter() {} @Override public void emitRecord( - RecordAndPosition element, - SourceOutput output, - IcebergSourceSplit split) { + RecordAndPosition element, SourceOutput output, IcebergSourceSplit split) { output.collect(element.record()); split.updatePosition(element.fileOffset(), element.recordOffset()); } diff --git a/flink/v1.15/flink/src/main/java/org/apache/iceberg/flink/source/reader/IcebergSourceSplitReader.java b/flink/v1.15/flink/src/main/java/org/apache/iceberg/flink/source/reader/IcebergSourceSplitReader.java index 8c6bb90a64c1..8e9e2b296e39 100644 --- a/flink/v1.15/flink/src/main/java/org/apache/iceberg/flink/source/reader/IcebergSourceSplitReader.java +++ b/flink/v1.15/flink/src/main/java/org/apache/iceberg/flink/source/reader/IcebergSourceSplitReader.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.flink.source.reader; import java.io.IOException; @@ -55,18 +54,24 @@ class IcebergSourceSplitReader implements SplitReader, I private IcebergSourceSplit currentSplit; private String currentSplitId; - IcebergSourceSplitReader(ReaderFunction openSplitFunction, - SourceReaderContext context, - ReaderMetricsContext metrics) { + IcebergSourceSplitReader( + ReaderFunction openSplitFunction, + SourceReaderContext context, + ReaderMetricsContext metrics) { this.openSplitFunction = openSplitFunction; this.indexOfSubtask = context.getIndexOfSubtask(); this.splits = new ArrayDeque<>(); - this.assignedSplits = metrics.counter(ReaderMetricsContext.ASSIGNED_SPLITS, Long.class, Unit.COUNT); - this.assignedBytes = metrics.counter(ReaderMetricsContext.ASSIGNED_BYTES, Long.class, Unit.COUNT); - this.finishedSplits = metrics.counter(ReaderMetricsContext.FINISHED_SPLITS, Long.class, Unit.COUNT); - this.finishedBytes = metrics.counter(ReaderMetricsContext.FINISHED_BYTES, Long.class, Unit.COUNT); - this.splitReaderFetchCalls = metrics.counter(ReaderMetricsContext.SPLIT_READER_FETCH_CALLS, Long.class, Unit.COUNT); + this.assignedSplits = + metrics.counter(ReaderMetricsContext.ASSIGNED_SPLITS, Long.class, Unit.COUNT); + this.assignedBytes = + metrics.counter(ReaderMetricsContext.ASSIGNED_BYTES, Long.class, Unit.COUNT); + this.finishedSplits = + metrics.counter(ReaderMetricsContext.FINISHED_SPLITS, Long.class, Unit.COUNT); + this.finishedBytes = + metrics.counter(ReaderMetricsContext.FINISHED_BYTES, Long.class, Unit.COUNT); + this.splitReaderFetchCalls = + metrics.counter(ReaderMetricsContext.SPLIT_READER_FETCH_CALLS, Long.class, Unit.COUNT); } @Override @@ -101,8 +106,8 @@ public RecordsWithSplitIds> fetch() throws IOException { @Override public void handleSplitsChanges(SplitsChange splitsChange) { if (!(splitsChange instanceof SplitsAddition)) { - throw new UnsupportedOperationException(String.format( - "Unsupported split change: %s", splitsChange.getClass())); + throw new UnsupportedOperationException( + String.format("Unsupported split change: %s", splitsChange.getClass())); } LOG.info("Add {} splits to reader", 
splitsChange.splits().size()); @@ -112,8 +117,7 @@ public void handleSplitsChanges(SplitsChange splitsChange) { } @Override - public void wakeUp() { - } + public void wakeUp() {} @Override public void close() throws Exception { @@ -124,15 +128,11 @@ public void close() throws Exception { } private long calculateBytes(IcebergSourceSplit split) { - return split.task().files().stream() - .map(FileScanTask::length) - .reduce(0L, Long::sum); + return split.task().files().stream().map(FileScanTask::length).reduce(0L, Long::sum); } private long calculateBytes(SplitsChange splitsChanges) { - return splitsChanges.splits().stream() - .map(this::calculateBytes) - .reduce(0L, Long::sum); + return splitsChanges.splits().stream().map(this::calculateBytes).reduce(0L, Long::sum); } private ArrayBatchRecords finishSplit() throws IOException { diff --git a/flink/v1.15/flink/src/main/java/org/apache/iceberg/flink/source/reader/ReaderFunction.java b/flink/v1.15/flink/src/main/java/org/apache/iceberg/flink/source/reader/ReaderFunction.java index eff0156229de..1ea91f10b4e7 100644 --- a/flink/v1.15/flink/src/main/java/org/apache/iceberg/flink/source/reader/ReaderFunction.java +++ b/flink/v1.15/flink/src/main/java/org/apache/iceberg/flink/source/reader/ReaderFunction.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.flink.source.reader; import java.io.Serializable; @@ -26,6 +25,7 @@ import org.apache.iceberg.io.CloseableIterator; @FunctionalInterface -public interface ReaderFunction extends Serializable, - Function>>> { -} +public interface ReaderFunction + extends Serializable, + Function< + IcebergSourceSplit, CloseableIterator>>> {} diff --git a/flink/v1.15/flink/src/main/java/org/apache/iceberg/flink/source/reader/ReaderMetricsContext.java b/flink/v1.15/flink/src/main/java/org/apache/iceberg/flink/source/reader/ReaderMetricsContext.java index 6fcf2a45bef5..15ff3cc8bd63 100644 --- a/flink/v1.15/flink/src/main/java/org/apache/iceberg/flink/source/reader/ReaderMetricsContext.java +++ b/flink/v1.15/flink/src/main/java/org/apache/iceberg/flink/source/reader/ReaderMetricsContext.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.flink.source.reader; import java.util.concurrent.atomic.AtomicLong; @@ -71,15 +70,17 @@ public Counter counter(String name, Class type, Unit un ValidationException.check(type == Long.class, "'%s' requires Integer type", FINISHED_BYTES); return (Counter) longCounter(finishedBytes::addAndGet, finishedBytes::get); case SPLIT_READER_FETCH_CALLS: - ValidationException.check(type == Long.class, "'%s' requires Integer type", SPLIT_READER_FETCH_CALLS); - return (Counter) longCounter(splitReaderFetchCalls::addAndGet, splitReaderFetchCalls::get); + ValidationException.check( + type == Long.class, "'%s' requires Integer type", SPLIT_READER_FETCH_CALLS); + return (Counter) + longCounter(splitReaderFetchCalls::addAndGet, splitReaderFetchCalls::get); default: throw new IllegalArgumentException(String.format("Unsupported counter: '%s'", name)); } } private Counter longCounter(Consumer consumer, Supplier supplier) { - return new Counter() { + return new Counter() { @Override public void increment() { increment(1L); diff --git a/flink/v1.15/flink/src/main/java/org/apache/iceberg/flink/source/reader/RecordAndPosition.java b/flink/v1.15/flink/src/main/java/org/apache/iceberg/flink/source/reader/RecordAndPosition.java index 46538516cbda..6ac92592b6aa 100644 --- a/flink/v1.15/flink/src/main/java/org/apache/iceberg/flink/source/reader/RecordAndPosition.java +++ b/flink/v1.15/flink/src/main/java/org/apache/iceberg/flink/source/reader/RecordAndPosition.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.flink.source.reader; import org.apache.flink.annotation.Internal; @@ -43,8 +42,7 @@ public RecordAndPosition(T record, int fileOffset, long recordOffset) { this.recordOffset = recordOffset; } - public RecordAndPosition() { - } + public RecordAndPosition() {} // ------------------------------------------------------------------------ @@ -77,5 +75,4 @@ public void record(T nextRecord) { public String toString() { return String.format("%s @ %d + %d", record, fileOffset, recordOffset); } - } diff --git a/flink/v1.15/flink/src/main/java/org/apache/iceberg/flink/source/reader/RecordFactory.java b/flink/v1.15/flink/src/main/java/org/apache/iceberg/flink/source/reader/RecordFactory.java index 4134b071622f..ef92e2e6b81f 100644 --- a/flink/v1.15/flink/src/main/java/org/apache/iceberg/flink/source/reader/RecordFactory.java +++ b/flink/v1.15/flink/src/main/java/org/apache/iceberg/flink/source/reader/RecordFactory.java @@ -16,25 +16,19 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.flink.source.reader; import java.io.Serializable; /** - * In FLIP-27 source, SplitReader#fetch() returns a batch of records. - * Since DataIterator for RowData returns an iterator of reused RowData objects, - * RecordFactory is needed to (1) create object array that is recyclable via pool. - * (2) clone RowData element from DataIterator to the batch array. + * In FLIP-27 source, SplitReader#fetch() returns a batch of records. Since DataIterator for RowData + * returns an iterator of reused RowData objects, RecordFactory is needed to (1) create object array + * that is recyclable via pool. (2) clone RowData element from DataIterator to the batch array. 
*/ interface RecordFactory extends Serializable { - /** - * Create a batch of records - */ + /** Create a batch of records */ T[] createBatch(int batchSize); - /** - * Clone record into the specified position of the batch array - */ + /** Clone record into the specified position of the batch array */ void clone(T from, T[] batch, int position); } diff --git a/flink/v1.15/flink/src/main/java/org/apache/iceberg/flink/source/reader/RowDataReaderFunction.java b/flink/v1.15/flink/src/main/java/org/apache/iceberg/flink/source/reader/RowDataReaderFunction.java index cdb460f2e7ab..c747375d2a28 100644 --- a/flink/v1.15/flink/src/main/java/org/apache/iceberg/flink/source/reader/RowDataReaderFunction.java +++ b/flink/v1.15/flink/src/main/java/org/apache/iceberg/flink/source/reader/RowDataReaderFunction.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.flink.source.reader; import org.apache.flink.configuration.ReadableConfig; @@ -39,10 +38,18 @@ public class RowDataReaderFunction extends DataIteratorReaderFunction { private final EncryptionManager encryption; public RowDataReaderFunction( - ReadableConfig config, Schema tableSchema, Schema projectedSchema, - String nameMapping, boolean caseSensitive, FileIO io, EncryptionManager encryption) { - super(new ArrayPoolDataIteratorBatcher<>(config, new RowDataRecordFactory( - FlinkSchemaUtil.convert(readSchema(tableSchema, projectedSchema))))); + ReadableConfig config, + Schema tableSchema, + Schema projectedSchema, + String nameMapping, + boolean caseSensitive, + FileIO io, + EncryptionManager encryption) { + super( + new ArrayPoolDataIteratorBatcher<>( + config, + new RowDataRecordFactory( + FlinkSchemaUtil.convert(readSchema(tableSchema, projectedSchema))))); this.tableSchema = tableSchema; this.readSchema = readSchema(tableSchema, projectedSchema); this.nameMapping = nameMapping; @@ -54,17 +61,14 @@ public RowDataReaderFunction( @Override public DataIterator createDataIterator(IcebergSourceSplit split) { return new DataIterator<>( - new RowDataFileScanTaskReader( - tableSchema, - readSchema, - nameMapping, - caseSensitive), - split.task(), io, encryption); + new RowDataFileScanTaskReader(tableSchema, readSchema, nameMapping, caseSensitive), + split.task(), + io, + encryption); } private static Schema readSchema(Schema tableSchema, Schema projectedSchema) { Preconditions.checkNotNull(tableSchema, "Table schema can't be null"); return projectedSchema == null ? tableSchema : projectedSchema; } - } diff --git a/flink/v1.15/flink/src/main/java/org/apache/iceberg/flink/source/reader/RowDataRecordFactory.java b/flink/v1.15/flink/src/main/java/org/apache/iceberg/flink/source/reader/RowDataRecordFactory.java index a91c7b45ed61..1e265b2663ce 100644 --- a/flink/v1.15/flink/src/main/java/org/apache/iceberg/flink/source/reader/RowDataRecordFactory.java +++ b/flink/v1.15/flink/src/main/java/org/apache/iceberg/flink/source/reader/RowDataRecordFactory.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.flink.source.reader; import org.apache.flink.api.common.typeutils.TypeSerializer; diff --git a/flink/v1.15/flink/src/main/java/org/apache/iceberg/flink/source/split/IcebergSourceSplit.java b/flink/v1.15/flink/src/main/java/org/apache/iceberg/flink/source/split/IcebergSourceSplit.java index b46096af0e67..35f8ade9843d 100644 --- a/flink/v1.15/flink/src/main/java/org/apache/iceberg/flink/source/split/IcebergSourceSplit.java +++ b/flink/v1.15/flink/src/main/java/org/apache/iceberg/flink/source/split/IcebergSourceSplit.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.flink.source.split; import java.io.IOException; @@ -43,8 +42,7 @@ public class IcebergSourceSplit implements SourceSplit, Serializable { // The splits are frequently serialized into checkpoints. // Caching the byte representation makes repeated serialization cheap. - @Nullable - private transient byte[] serializedBytesCache; + @Nullable private transient byte[] serializedBytesCache; private IcebergSourceSplit(CombinedScanTask task, int fileOffset, long recordOffset) { this.task = task; @@ -75,9 +73,7 @@ public long recordOffset() { @Override public String splitId() { - return MoreObjects.toStringHelper(this) - .add("files", toString(task.files())) - .toString(); + return MoreObjects.toStringHelper(this).add("files", toString(task.files())).toString(); } public void updatePosition(int newFileOffset, long newRecordOffset) { @@ -97,12 +93,16 @@ public String toString() { } private String toString(Collection files) { - return Iterables.toString(files.stream().map(fileScanTask -> - MoreObjects.toStringHelper(fileScanTask) - .add("file", fileScanTask.file().path().toString()) - .add("start", fileScanTask.start()) - .add("length", fileScanTask.length()) - .toString()).collect(Collectors.toList())); + return Iterables.toString( + files.stream() + .map( + fileScanTask -> + MoreObjects.toStringHelper(fileScanTask) + .add("file", fileScanTask.file().path().toString()) + .add("start", fileScanTask.start()) + .add("length", fileScanTask.length()) + .toString()) + .collect(Collectors.toList())); } byte[] serializeV1() throws IOException { @@ -114,7 +114,8 @@ byte[] serializeV1() throws IOException { static IcebergSourceSplit deserializeV1(byte[] serialized) throws IOException { try { - return InstantiationUtil.deserializeObject(serialized, IcebergSourceSplit.class.getClassLoader()); + return InstantiationUtil.deserializeObject( + serialized, IcebergSourceSplit.class.getClassLoader()); } catch (ClassNotFoundException e) { throw new RuntimeException("Failed to deserialize the split.", e); } diff --git a/flink/v1.15/flink/src/main/java/org/apache/iceberg/flink/source/split/IcebergSourceSplitSerializer.java b/flink/v1.15/flink/src/main/java/org/apache/iceberg/flink/source/split/IcebergSourceSplitSerializer.java index 9e32af5429b9..ee0f364e17d6 100644 --- a/flink/v1.15/flink/src/main/java/org/apache/iceberg/flink/source/split/IcebergSourceSplitSerializer.java +++ b/flink/v1.15/flink/src/main/java/org/apache/iceberg/flink/source/split/IcebergSourceSplitSerializer.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.flink.source.split; import java.io.IOException; @@ -24,9 +23,8 @@ import org.apache.flink.core.io.SimpleVersionedSerializer; /** - * TODO: use Java serialization for now. - * Will switch to more stable serializer from - * issue-1698. 
+ * TODO: use Java serialization for now. Will switch to more stable serializer from issue-1698. */ @Internal public class IcebergSourceSplitSerializer implements SimpleVersionedSerializer { @@ -49,8 +47,11 @@ public IcebergSourceSplit deserialize(int version, byte[] serialized) throws IOE case 1: return IcebergSourceSplit.deserializeV1(serialized); default: - throw new IOException(String.format("Failed to deserialize IcebergSourceSplit. " + - "Encountered unsupported version: %d. Supported version are [1]", version)); + throw new IOException( + String.format( + "Failed to deserialize IcebergSourceSplit. " + + "Encountered unsupported version: %d. Supported version are [1]", + version)); } } } diff --git a/flink/v1.15/flink/src/main/java/org/apache/iceberg/flink/source/split/IcebergSourceSplitState.java b/flink/v1.15/flink/src/main/java/org/apache/iceberg/flink/source/split/IcebergSourceSplitState.java index b08218b93ce9..d9061e049e00 100644 --- a/flink/v1.15/flink/src/main/java/org/apache/iceberg/flink/source/split/IcebergSourceSplitState.java +++ b/flink/v1.15/flink/src/main/java/org/apache/iceberg/flink/source/split/IcebergSourceSplitState.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.flink.source.split; public class IcebergSourceSplitState { diff --git a/flink/v1.15/flink/src/main/java/org/apache/iceberg/flink/source/split/IcebergSourceSplitStatus.java b/flink/v1.15/flink/src/main/java/org/apache/iceberg/flink/source/split/IcebergSourceSplitStatus.java index 78ce70b22f7a..d4a84a165e1a 100644 --- a/flink/v1.15/flink/src/main/java/org/apache/iceberg/flink/source/split/IcebergSourceSplitStatus.java +++ b/flink/v1.15/flink/src/main/java/org/apache/iceberg/flink/source/split/IcebergSourceSplitStatus.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.flink.source.split; public enum IcebergSourceSplitStatus { diff --git a/flink/v1.15/flink/src/main/java/org/apache/iceberg/flink/source/split/SplitRequestEvent.java b/flink/v1.15/flink/src/main/java/org/apache/iceberg/flink/source/split/SplitRequestEvent.java index 7a7610cc1978..eabd757aa638 100644 --- a/flink/v1.15/flink/src/main/java/org/apache/iceberg/flink/source/split/SplitRequestEvent.java +++ b/flink/v1.15/flink/src/main/java/org/apache/iceberg/flink/source/split/SplitRequestEvent.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.flink.source.split; import java.util.Collection; @@ -24,9 +23,7 @@ import org.apache.flink.annotation.Internal; import org.apache.flink.api.connector.source.SourceEvent; -/** - * We can remove this class once FLINK-21364 is resolved. - */ +/** We can remove this class once FLINK-21364 is resolved. */ @Internal public class SplitRequestEvent implements SourceEvent { private static final long serialVersionUID = 1L; diff --git a/flink/v1.15/flink/src/main/java/org/apache/iceberg/flink/util/FlinkCompatibilityUtil.java b/flink/v1.15/flink/src/main/java/org/apache/iceberg/flink/util/FlinkCompatibilityUtil.java index 274d2a8d17a0..2c5c587f4ebf 100644 --- a/flink/v1.15/flink/src/main/java/org/apache/iceberg/flink/util/FlinkCompatibilityUtil.java +++ b/flink/v1.15/flink/src/main/java/org/apache/iceberg/flink/util/FlinkCompatibilityUtil.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.flink.util; import org.apache.flink.api.common.typeinfo.TypeInformation; @@ -26,14 +25,12 @@ import org.apache.flink.table.types.logical.RowType; /** - * This is a small util class that try to hide calls to Flink - * Internal or PublicEvolve interfaces as Flink can change - * those APIs during minor version release. + * This is a small util class that try to hide calls to Flink Internal or PublicEvolve interfaces as + * Flink can change those APIs during minor version release. */ public class FlinkCompatibilityUtil { - private FlinkCompatibilityUtil() { - } + private FlinkCompatibilityUtil() {} public static TypeInformation toTypeInfo(RowType rowType) { return InternalTypeInfo.of(rowType); diff --git a/flink/v1.15/flink/src/test/java/org/apache/iceberg/flink/FlinkCatalogTestBase.java b/flink/v1.15/flink/src/test/java/org/apache/iceberg/flink/FlinkCatalogTestBase.java index 3c5f25e9d876..d4da736dcd83 100644 --- a/flink/v1.15/flink/src/test/java/org/apache/iceberg/flink/FlinkCatalogTestBase.java +++ b/flink/v1.15/flink/src/test/java/org/apache/iceberg/flink/FlinkCatalogTestBase.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.flink; import java.io.IOException; @@ -74,8 +73,7 @@ public static Iterable parameters() { return Lists.newArrayList( new Object[] {"testhive", Namespace.empty()}, new Object[] {"testhadoop", Namespace.empty()}, - new Object[] {"testhadoop_basenamespace", Namespace.of("l0", "l1")} - ); + new Object[] {"testhadoop_basenamespace", Namespace.of("l0", "l1")}); } protected final String catalogName; @@ -92,9 +90,10 @@ public FlinkCatalogTestBase(String catalogName, Namespace baseNamespace) { this.catalogName = catalogName; this.baseNamespace = baseNamespace; this.isHadoopCatalog = catalogName.startsWith("testhadoop"); - this.validationCatalog = isHadoopCatalog ? - new HadoopCatalog(hiveConf, "file:" + hadoopWarehouse.getRoot()) : - catalog; + this.validationCatalog = + isHadoopCatalog + ? new HadoopCatalog(hiveConf, "file:" + hadoopWarehouse.getRoot()) + : catalog; this.validationNamespaceCatalog = (SupportsNamespaces) validationCatalog; config.put("type", "iceberg"); @@ -110,7 +109,8 @@ public FlinkCatalogTestBase(String catalogName, Namespace baseNamespace) { config.put(CatalogProperties.WAREHOUSE_LOCATION, String.format("file://%s", warehouseRoot())); this.flinkDatabase = catalogName + "." 
+ DATABASE; - this.icebergNamespace = Namespace.of(ArrayUtils.concat(baseNamespace.levels(), new String[] {DATABASE})); + this.icebergNamespace = + Namespace.of(ArrayUtils.concat(baseNamespace.levels(), new String[] {DATABASE})); } protected String warehouseRoot() { @@ -139,8 +139,14 @@ static String toWithClause(Map props) { if (propCount > 0) { builder.append(","); } - builder.append("'").append(entry.getKey()).append("'").append("=") - .append("'").append(entry.getValue()).append("'"); + builder + .append("'") + .append(entry.getKey()) + .append("'") + .append("=") + .append("'") + .append(entry.getValue()) + .append("'"); propCount++; } builder.append(")"); diff --git a/flink/v1.15/flink/src/test/java/org/apache/iceberg/flink/FlinkTestBase.java b/flink/v1.15/flink/src/test/java/org/apache/iceberg/flink/FlinkTestBase.java index d19ff3467eca..95471ac88257 100644 --- a/flink/v1.15/flink/src/test/java/org/apache/iceberg/flink/FlinkTestBase.java +++ b/flink/v1.15/flink/src/test/java/org/apache/iceberg/flink/FlinkTestBase.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.flink; import java.util.List; @@ -45,8 +44,7 @@ public abstract class FlinkTestBase extends TestBaseUtils { public static MiniClusterWithClientResource miniClusterResource = MiniClusterResource.createWithClassloaderCheckDisabled(); - @ClassRule - public static final TemporaryFolder TEMPORARY_FOLDER = new TemporaryFolder(); + @ClassRule public static final TemporaryFolder TEMPORARY_FOLDER = new TemporaryFolder(); private static TestHiveMetastore metastore = null; protected static HiveConf hiveConf = null; @@ -59,8 +57,10 @@ public static void startMetastore() { FlinkTestBase.metastore = new TestHiveMetastore(); metastore.start(); FlinkTestBase.hiveConf = metastore.hiveConf(); - FlinkTestBase.catalog = (HiveCatalog) - CatalogUtil.loadCatalog(HiveCatalog.class.getName(), "hive", ImmutableMap.of(), hiveConf); + FlinkTestBase.catalog = + (HiveCatalog) + CatalogUtil.loadCatalog( + HiveCatalog.class.getName(), "hive", ImmutableMap.of(), hiveConf); } @AfterClass @@ -73,13 +73,12 @@ protected TableEnvironment getTableEnv() { if (tEnv == null) { synchronized (this) { if (tEnv == null) { - EnvironmentSettings settings = EnvironmentSettings - .newInstance() - .inBatchMode() - .build(); + EnvironmentSettings settings = EnvironmentSettings.newInstance().inBatchMode().build(); TableEnvironment env = TableEnvironment.create(settings); - env.getConfig().getConfiguration().set(FlinkConfigOptions.TABLE_EXEC_ICEBERG_INFER_SOURCE_PARALLELISM, false); + env.getConfig() + .getConfiguration() + .set(FlinkConfigOptions.TABLE_EXEC_ICEBERG_INFER_SOURCE_PARALLELISM, false); tEnv = env; } } @@ -105,9 +104,7 @@ protected List sql(String query, Object... 
args) { } protected void assertSameElements(Iterable expected, Iterable actual) { - Assertions.assertThat(actual) - .isNotNull() - .containsExactlyInAnyOrderElementsOf(expected); + Assertions.assertThat(actual).isNotNull().containsExactlyInAnyOrderElementsOf(expected); } protected void assertSameElements(String message, Iterable expected, Iterable actual) { diff --git a/flink/v1.15/flink/src/test/java/org/apache/iceberg/flink/HadoopTableResource.java b/flink/v1.15/flink/src/test/java/org/apache/iceberg/flink/HadoopTableResource.java index bc4e209a46ba..a4338a881bc8 100644 --- a/flink/v1.15/flink/src/test/java/org/apache/iceberg/flink/HadoopTableResource.java +++ b/flink/v1.15/flink/src/test/java/org/apache/iceberg/flink/HadoopTableResource.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.flink; import java.io.File; @@ -41,12 +40,17 @@ public class HadoopTableResource extends ExternalResource { private TableLoader tableLoader; private Table table; - public HadoopTableResource(TemporaryFolder temporaryFolder, String database, String tableName, Schema schema) { + public HadoopTableResource( + TemporaryFolder temporaryFolder, String database, String tableName, Schema schema) { this(temporaryFolder, database, tableName, schema, null); } - public HadoopTableResource(TemporaryFolder temporaryFolder, String database, String tableName, - Schema schema, PartitionSpec partitionSpec) { + public HadoopTableResource( + TemporaryFolder temporaryFolder, + String database, + String tableName, + Schema schema, + PartitionSpec partitionSpec) { this.temporaryFolder = temporaryFolder; this.database = database; this.tableName = tableName; @@ -67,7 +71,8 @@ protected void before() throws Throwable { if (partitionSpec == null) { this.table = catalog.createTable(TableIdentifier.of(database, tableName), schema); } else { - this.table = catalog.createTable(TableIdentifier.of(database, tableName), schema, partitionSpec); + this.table = + catalog.createTable(TableIdentifier.of(database, tableName), schema, partitionSpec); } tableLoader.open(); } diff --git a/flink/v1.15/flink/src/test/java/org/apache/iceberg/flink/MiniClusterResource.java b/flink/v1.15/flink/src/test/java/org/apache/iceberg/flink/MiniClusterResource.java index 9dfa1acf2719..45af9241b743 100644 --- a/flink/v1.15/flink/src/test/java/org/apache/iceberg/flink/MiniClusterResource.java +++ b/flink/v1.15/flink/src/test/java/org/apache/iceberg/flink/MiniClusterResource.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.flink; import org.apache.flink.configuration.Configuration; @@ -29,20 +28,18 @@ public class MiniClusterResource { private static final int DEFAULT_TM_NUM = 1; private static final int DEFAULT_PARALLELISM = 4; - public static final Configuration DISABLE_CLASSLOADER_CHECK_CONFIG = new Configuration() - // disable classloader check as Avro may cache class/object in the serializers. - .set(CoreOptions.CHECK_LEAKED_CLASSLOADER, false); - - private MiniClusterResource() { + public static final Configuration DISABLE_CLASSLOADER_CHECK_CONFIG = + new Configuration() + // disable classloader check as Avro may cache class/object in the serializers. 
+ .set(CoreOptions.CHECK_LEAKED_CLASSLOADER, false); - } + private MiniClusterResource() {} /** - * It will start a mini cluster with classloader.check-leaked-classloader=false, - * so that we won't break the unit tests because of the class loader leak issue. - * In our iceberg integration tests, there're some that will assert the results - * after finished the flink jobs, so actually we may access the class loader - * that has been closed by the flink task managers if we enable the switch + * It will start a mini cluster with classloader.check-leaked-classloader=false, so that we won't + * break the unit tests because of the class loader leak issue. In our iceberg integration tests, + * there're some that will assert the results after finished the flink jobs, so actually we may + * access the class loader that has been closed by the flink task managers if we enable the switch * classloader.check-leaked-classloader by default. */ public static MiniClusterWithClientResource createWithClassloaderCheckDisabled() { @@ -53,5 +50,4 @@ public static MiniClusterWithClientResource createWithClassloaderCheckDisabled() .setConfiguration(DISABLE_CLASSLOADER_CHECK_CONFIG) .build()); } - } diff --git a/flink/v1.15/flink/src/test/java/org/apache/iceberg/flink/RowDataConverter.java b/flink/v1.15/flink/src/test/java/org/apache/iceberg/flink/RowDataConverter.java index 24ba196f1195..e532fb62615c 100644 --- a/flink/v1.15/flink/src/test/java/org/apache/iceberg/flink/RowDataConverter.java +++ b/flink/v1.15/flink/src/test/java/org/apache/iceberg/flink/RowDataConverter.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.flink; import java.math.BigDecimal; @@ -50,8 +49,7 @@ public class RowDataConverter { private static final OffsetDateTime EPOCH = Instant.ofEpochSecond(0).atOffset(ZoneOffset.UTC); private static final LocalDate EPOCH_DAY = EPOCH.toLocalDate(); - private RowDataConverter() { - } + private RowDataConverter() {} public static RowData convert(Schema iSchema, Record record) { return convert(iSchema.asStruct(), record); @@ -104,11 +102,14 @@ private static Object convert(Type type, Object object) { return bb.array(); case BINARY: ByteBuffer buffer = (ByteBuffer) object; - return Arrays.copyOfRange(buffer.array(), buffer.arrayOffset() + buffer.position(), + return Arrays.copyOfRange( + buffer.array(), + buffer.arrayOffset() + buffer.position(), buffer.arrayOffset() + buffer.remaining()); case DECIMAL: Types.DecimalType decimalType = (Types.DecimalType) type; - return DecimalData.fromBigDecimal((BigDecimal) object, decimalType.precision(), decimalType.scale()); + return DecimalData.fromBigDecimal( + (BigDecimal) object, decimalType.precision(), decimalType.scale()); case STRUCT: return convert(type.asStructType(), (Record) object); case LIST: @@ -124,8 +125,7 @@ private static Object convert(Type type, Object object) { for (Map.Entry entry : map.entrySet()) { convertedMap.put( convert(type.asMapType().keyType(), entry.getKey()), - convert(type.asMapType().valueType(), entry.getValue()) - ); + convert(type.asMapType().valueType(), entry.getValue())); } return new GenericMapData(convertedMap); default: diff --git a/flink/v1.15/flink/src/test/java/org/apache/iceberg/flink/SimpleDataUtil.java b/flink/v1.15/flink/src/test/java/org/apache/iceberg/flink/SimpleDataUtil.java index a6f70453bc50..87c0bc364663 100644 --- a/flink/v1.15/flink/src/test/java/org/apache/iceberg/flink/SimpleDataUtil.java +++ 
b/flink/v1.15/flink/src/test/java/org/apache/iceberg/flink/SimpleDataUtil.java @@ -16,9 +16,10 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.flink; +import static org.apache.iceberg.hadoop.HadoopOutputFile.fromPath; + import java.io.IOException; import java.time.Duration; import java.util.Collections; @@ -68,28 +69,24 @@ import org.apache.iceberg.util.StructLikeWrapper; import org.junit.Assert; -import static org.apache.iceberg.hadoop.HadoopOutputFile.fromPath; - public class SimpleDataUtil { - private SimpleDataUtil() { - } + private SimpleDataUtil() {} - public static final Schema SCHEMA = new Schema( - Types.NestedField.optional(1, "id", Types.IntegerType.get()), - Types.NestedField.optional(2, "data", Types.StringType.get()) - ); + public static final Schema SCHEMA = + new Schema( + Types.NestedField.optional(1, "id", Types.IntegerType.get()), + Types.NestedField.optional(2, "data", Types.StringType.get())); - public static final TableSchema FLINK_SCHEMA = TableSchema.builder() - .field("id", DataTypes.INT()) - .field("data", DataTypes.STRING()) - .build(); + public static final TableSchema FLINK_SCHEMA = + TableSchema.builder().field("id", DataTypes.INT()).field("data", DataTypes.STRING()).build(); public static final RowType ROW_TYPE = (RowType) FLINK_SCHEMA.toRowDataType().getLogicalType(); public static final Record RECORD = GenericRecord.create(SCHEMA); - public static Table createTable(String path, Map properties, boolean partitioned) { + public static Table createTable( + String path, Map properties, boolean partitioned) { PartitionSpec spec; if (partitioned) { spec = PartitionSpec.builderFor(SCHEMA).identity("data").build(); @@ -126,8 +123,13 @@ public static RowData createUpdateAfter(Integer id, String data) { return GenericRowData.ofKind(RowKind.UPDATE_AFTER, id, StringData.fromString(data)); } - public static DataFile writeFile(Schema schema, PartitionSpec spec, Configuration conf, - String location, String filename, List rows) + public static DataFile writeFile( + Schema schema, + PartitionSpec spec, + Configuration conf, + String location, + String filename, + List rows) throws IOException { Path path = new Path(location, filename); FileFormat fileFormat = FileFormat.fromFileName(filename); @@ -148,26 +150,40 @@ public static DataFile writeFile(Schema schema, PartitionSpec spec, Configuratio .build(); } - public static DeleteFile writeEqDeleteFile(Table table, FileFormat format, String filename, - FileAppenderFactory appenderFactory, - List deletes) throws IOException { + public static DeleteFile writeEqDeleteFile( + Table table, + FileFormat format, + String filename, + FileAppenderFactory appenderFactory, + List deletes) + throws IOException { EncryptedOutputFile outputFile = - table.encryption().encrypt(fromPath(new Path(table.location(), filename), new Configuration())); + table + .encryption() + .encrypt(fromPath(new Path(table.location(), filename), new Configuration())); - EqualityDeleteWriter eqWriter = appenderFactory.newEqDeleteWriter(outputFile, format, null); + EqualityDeleteWriter eqWriter = + appenderFactory.newEqDeleteWriter(outputFile, format, null); try (EqualityDeleteWriter writer = eqWriter) { writer.deleteAll(deletes); } return eqWriter.toDeleteFile(); } - public static DeleteFile writePosDeleteFile(Table table, FileFormat format, String filename, - FileAppenderFactory appenderFactory, - List> positions) throws IOException { + public static DeleteFile writePosDeleteFile( + Table table, 
+ FileFormat format, + String filename, + FileAppenderFactory appenderFactory, + List> positions) + throws IOException { EncryptedOutputFile outputFile = - table.encryption().encrypt(fromPath(new Path(table.location(), filename), new Configuration())); + table + .encryption() + .encrypt(fromPath(new Path(table.location(), filename), new Configuration())); - PositionDeleteWriter posWriter = appenderFactory.newPosDeleteWriter(outputFile, format, null); + PositionDeleteWriter posWriter = + appenderFactory.newPosDeleteWriter(outputFile, format, null); try (PositionDeleteWriter writer = posWriter) { for (Pair p : positions) { writer.delete(p.first(), p.second()); @@ -194,9 +210,7 @@ public static void assertTableRows(Table table, List expected) throws I assertTableRecords(table, convertToRecords(expected)); } - /** - * Get all rows for a table - */ + /** Get all rows for a table */ public static List tableRecords(Table table) throws IOException { table.refresh(); List records = Lists.newArrayList(); @@ -220,7 +234,8 @@ private static boolean equalsRecords(List expected, List actual, return expectedSet.equals(actualSet); } - private static void assertRecordsEqual(List expected, List actual, Schema schema) { + private static void assertRecordsEqual( + List expected, List actual, Schema schema) { Assert.assertEquals(expected.size(), actual.size()); Types.StructType type = schema.asStruct(); StructLikeSet expectedSet = StructLikeSet.create(type); @@ -231,8 +246,8 @@ private static void assertRecordsEqual(List expected, List actua } /** - * Assert table contains the expected list of records after - * waiting up to {@code maxCheckCount} with {@code checkInterval} + * Assert table contains the expected list of records after waiting up to {@code maxCheckCount} + * with {@code checkInterval} */ public static void assertTableRecords( Table table, List expected, Duration checkInterval, int maxCheckCount) @@ -266,7 +281,8 @@ public static void assertTableRecords(Table table, List expected) throws } } - public static void assertTableRecords(String tablePath, List expected) throws IOException { + public static void assertTableRecords(String tablePath, List expected) + throws IOException { Preconditions.checkArgument(expected != null, "expected records shouldn't be null"); assertTableRecords(new HadoopTables().load(tablePath), expected); } @@ -281,14 +297,15 @@ public static StructLikeSet actualRowSet(Table table, String... columns) throws return actualRowSet(table, null, columns); } - public static StructLikeSet actualRowSet(Table table, Long snapshotId, String... columns) throws IOException { + public static StructLikeSet actualRowSet(Table table, Long snapshotId, String... columns) + throws IOException { table.refresh(); StructLikeSet set = StructLikeSet.create(table.schema().asStruct()); - try (CloseableIterable reader = IcebergGenerics - .read(table) - .useSnapshot(snapshotId == null ? table.currentSnapshot().snapshotId() : snapshotId) - .select(columns) - .build()) { + try (CloseableIterable reader = + IcebergGenerics.read(table) + .useSnapshot(snapshotId == null ? 
table.currentSnapshot().snapshotId() : snapshotId) + .select(columns) + .build()) { reader.forEach(set::add); } return set; @@ -300,16 +317,14 @@ public static List partitionDataFiles(Table table, Map Types.StructType partitionType = table.spec().partitionType(); Record partitionRecord = GenericRecord.create(partitionType).copy(partitionValues); - StructLikeWrapper expectedWrapper = StructLikeWrapper - .forType(partitionType) - .set(partitionRecord); + StructLikeWrapper expectedWrapper = + StructLikeWrapper.forType(partitionType).set(partitionRecord); List dataFiles = Lists.newArrayList(); try (CloseableIterable fileScanTasks = table.newScan().planFiles()) { for (FileScanTask scanTask : fileScanTasks) { - StructLikeWrapper wrapper = StructLikeWrapper - .forType(partitionType) - .set(scanTask.file().partition()); + StructLikeWrapper wrapper = + StructLikeWrapper.forType(partitionType).set(scanTask.file().partition()); if (expectedWrapper.equals(wrapper)) { dataFiles.add(scanTask.file()); @@ -335,7 +350,9 @@ public static Map> snapshotToDataFiles(Table table) throws tableScan = tableScan.useSnapshot(current.snapshotId()); } try (CloseableIterable scanTasks = tableScan.planFiles()) { - result.put(current.snapshotId(), ImmutableList.copyOf(Iterables.transform(scanTasks, FileScanTask::file))); + result.put( + current.snapshotId(), + ImmutableList.copyOf(Iterables.transform(scanTasks, FileScanTask::file))); } // Continue to traverse the parent snapshot if exists. @@ -352,13 +369,14 @@ public static List matchingPartitions( List dataFiles, PartitionSpec partitionSpec, Map partitionValues) { Types.StructType partitionType = partitionSpec.partitionType(); Record partitionRecord = GenericRecord.create(partitionType).copy(partitionValues); - StructLikeWrapper expected = StructLikeWrapper - .forType(partitionType) - .set(partitionRecord); - return dataFiles.stream().filter(df -> { - StructLikeWrapper wrapper = StructLikeWrapper.forType(partitionType).set(df.partition()); - return wrapper.equals(expected); - }).collect(Collectors.toList()); + StructLikeWrapper expected = StructLikeWrapper.forType(partitionType).set(partitionRecord); + return dataFiles.stream() + .filter( + df -> { + StructLikeWrapper wrapper = + StructLikeWrapper.forType(partitionType).set(df.partition()); + return wrapper.equals(expected); + }) + .collect(Collectors.toList()); } - } diff --git a/flink/v1.15/flink/src/test/java/org/apache/iceberg/flink/TestCatalogTableLoader.java b/flink/v1.15/flink/src/test/java/org/apache/iceberg/flink/TestCatalogTableLoader.java index 6ecb169b44d7..e77c62c3849d 100644 --- a/flink/v1.15/flink/src/test/java/org/apache/iceberg/flink/TestCatalogTableLoader.java +++ b/flink/v1.15/flink/src/test/java/org/apache/iceberg/flink/TestCatalogTableLoader.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.flink; import java.io.ByteArrayInputStream; @@ -43,14 +42,13 @@ import org.junit.BeforeClass; import org.junit.Test; -/** - * Test for {@link CatalogLoader} and {@link TableLoader}. - */ +/** Test for {@link CatalogLoader} and {@link TableLoader}. 
*/ public class TestCatalogTableLoader extends FlinkTestBase { private static File warehouse = null; private static final TableIdentifier IDENTIFIER = TableIdentifier.of("default", "my_table"); - private static final Schema SCHEMA = new Schema(Types.NestedField.required(1, "f1", Types.StringType.get())); + private static final Schema SCHEMA = + new Schema(Types.NestedField.required(1, "f1", Types.StringType.get())); @BeforeClass public static void createWarehouse() throws IOException { @@ -95,12 +93,14 @@ public void testHiveCatalogTableLoader() throws IOException, ClassNotFoundExcept validateTableLoader(TableLoader.fromCatalog(catalogLoader, IDENTIFIER)); } - private static void validateCatalogLoader(CatalogLoader loader) throws IOException, ClassNotFoundException { + private static void validateCatalogLoader(CatalogLoader loader) + throws IOException, ClassNotFoundException { Table table = javaSerAndDeSer(loader).loadCatalog().createTable(IDENTIFIER, SCHEMA); validateHadoopConf(table); } - private static void validateTableLoader(TableLoader loader) throws IOException, ClassNotFoundException { + private static void validateTableLoader(TableLoader loader) + throws IOException, ClassNotFoundException { TableLoader copied = javaSerAndDeSer(loader); copied.open(); try { @@ -112,7 +112,9 @@ private static void validateTableLoader(TableLoader loader) throws IOException, private static void validateHadoopConf(Table table) { FileIO io = table.io(); - Assertions.assertThat(io).as("FileIO should be a HadoopFileIO").isInstanceOf(HadoopFileIO.class); + Assertions.assertThat(io) + .as("FileIO should be a HadoopFileIO") + .isInstanceOf(HadoopFileIO.class); HadoopFileIO hadoopIO = (HadoopFileIO) io; Assert.assertEquals("my_value", hadoopIO.conf().get("my_key")); } @@ -124,7 +126,8 @@ private static T javaSerAndDeSer(T object) throws IOException, ClassNotFound out.writeObject(object); } - try (ObjectInputStream in = new ObjectInputStream(new ByteArrayInputStream(bytes.toByteArray()))) { + try (ObjectInputStream in = + new ObjectInputStream(new ByteArrayInputStream(bytes.toByteArray()))) { return (T) in.readObject(); } } diff --git a/flink/v1.15/flink/src/test/java/org/apache/iceberg/flink/TestChangeLogTable.java b/flink/v1.15/flink/src/test/java/org/apache/iceberg/flink/TestChangeLogTable.java index bef5bbb0b04a..975d77cb3565 100644 --- a/flink/v1.15/flink/src/test/java/org/apache/iceberg/flink/TestChangeLogTable.java +++ b/flink/v1.15/flink/src/test/java/org/apache/iceberg/flink/TestChangeLogTable.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.flink; import java.io.File; @@ -49,8 +48,9 @@ import org.junit.runners.Parameterized; /** - * In this test case, we mainly cover the impact of primary key selection, multiple operations within a single - * transaction, and multiple operations between different txn on the correctness of the data. + * In this test case, we mainly cover the impact of primary key selection, multiple operations + * within a single transaction, and multiple operations between different txn on the correctness of + * the data. 
*/ @RunWith(Parameterized.class) public class TestChangeLogTable extends ChangeLogTableTestBase { @@ -66,10 +66,7 @@ public class TestChangeLogTable extends ChangeLogTableTestBase { @Parameterized.Parameters(name = "PartitionedTable={0}") public static Iterable parameters() { - return ImmutableList.of( - new Object[] {true}, - new Object[] {false} - ); + return ImmutableList.of(new Object[] {true}, new Object[] {false}); } public TestChangeLogTable(boolean partitioned) { @@ -85,12 +82,14 @@ public static void createWarehouse() throws IOException { @Before public void before() { - sql("CREATE CATALOG %s WITH ('type'='iceberg', 'catalog-type'='hadoop', 'warehouse'='%s')", + sql( + "CREATE CATALOG %s WITH ('type'='iceberg', 'catalog-type'='hadoop', 'warehouse'='%s')", CATALOG_NAME, warehouse); sql("USE CATALOG %s", CATALOG_NAME); sql("CREATE DATABASE %s", DATABASE_NAME); sql("USE %s", DATABASE_NAME); - // Set the table.exec.sink.upsert-materialize=NONE, so that downstream operators will receive the + // Set the table.exec.sink.upsert-materialize=NONE, so that downstream operators will receive + // the // records with the same order as the source operator, bypassing Flink's inferred shuffle. getTableEnv().getConfig().set("table.exec.sink.upsert-materialize", "NONE"); } @@ -106,137 +105,112 @@ public void clean() { @Test public void testSqlChangeLogOnIdKey() throws Exception { - List> inputRowsPerCheckpoint = ImmutableList.of( - ImmutableList.of( - insertRow(1, "aaa"), - deleteRow(1, "aaa"), - insertRow(1, "bbb"), - insertRow(2, "aaa"), - deleteRow(2, "aaa"), - insertRow(2, "bbb") - ), + List> inputRowsPerCheckpoint = ImmutableList.of( - updateBeforeRow(2, "bbb"), - updateAfterRow(2, "ccc"), - deleteRow(2, "ccc"), - insertRow(2, "ddd") - ), + ImmutableList.of( + insertRow(1, "aaa"), + deleteRow(1, "aaa"), + insertRow(1, "bbb"), + insertRow(2, "aaa"), + deleteRow(2, "aaa"), + insertRow(2, "bbb")), + ImmutableList.of( + updateBeforeRow(2, "bbb"), + updateAfterRow(2, "ccc"), + deleteRow(2, "ccc"), + insertRow(2, "ddd")), + ImmutableList.of( + deleteRow(1, "bbb"), + insertRow(1, "ccc"), + deleteRow(1, "ccc"), + insertRow(1, "ddd"))); + + List> expectedRecordsPerCheckpoint = ImmutableList.of( - deleteRow(1, "bbb"), - insertRow(1, "ccc"), - deleteRow(1, "ccc"), - insertRow(1, "ddd") - ) - ); - - List> expectedRecordsPerCheckpoint = ImmutableList.of( - ImmutableList.of(insertRow(1, "bbb"), insertRow(2, "bbb")), - ImmutableList.of(insertRow(1, "bbb"), insertRow(2, "ddd")), - ImmutableList.of(insertRow(1, "ddd"), insertRow(2, "ddd")) - ); - - testSqlChangeLog(TABLE_NAME, ImmutableList.of("id"), inputRowsPerCheckpoint, - expectedRecordsPerCheckpoint); + ImmutableList.of(insertRow(1, "bbb"), insertRow(2, "bbb")), + ImmutableList.of(insertRow(1, "bbb"), insertRow(2, "ddd")), + ImmutableList.of(insertRow(1, "ddd"), insertRow(2, "ddd"))); + + testSqlChangeLog( + TABLE_NAME, ImmutableList.of("id"), inputRowsPerCheckpoint, expectedRecordsPerCheckpoint); } @Test public void testChangeLogOnDataKey() throws Exception { - List> elementsPerCheckpoint = ImmutableList.of( + List> elementsPerCheckpoint = ImmutableList.of( - insertRow(1, "aaa"), - deleteRow(1, "aaa"), - insertRow(2, "bbb"), - insertRow(1, "bbb"), - insertRow(2, "aaa") - ), + ImmutableList.of( + insertRow(1, "aaa"), + deleteRow(1, "aaa"), + insertRow(2, "bbb"), + insertRow(1, "bbb"), + insertRow(2, "aaa")), + ImmutableList.of( + updateBeforeRow(2, "aaa"), updateAfterRow(1, "ccc"), insertRow(1, "aaa")), + ImmutableList.of(deleteRow(1, "bbb"), 
insertRow(2, "aaa"), insertRow(2, "ccc"))); + + List> expectedRecords = ImmutableList.of( - updateBeforeRow(2, "aaa"), - updateAfterRow(1, "ccc"), - insertRow(1, "aaa") - ), - ImmutableList.of( - deleteRow(1, "bbb"), - insertRow(2, "aaa"), - insertRow(2, "ccc") - ) - ); - - List> expectedRecords = ImmutableList.of( - ImmutableList.of(insertRow(1, "bbb"), insertRow(2, "aaa")), - ImmutableList.of(insertRow(1, "aaa"), insertRow(1, "bbb"), insertRow(1, "ccc")), - ImmutableList.of(insertRow(1, "aaa"), insertRow(1, "ccc"), insertRow(2, "aaa"), insertRow(2, "ccc")) - ); + ImmutableList.of(insertRow(1, "bbb"), insertRow(2, "aaa")), + ImmutableList.of(insertRow(1, "aaa"), insertRow(1, "bbb"), insertRow(1, "ccc")), + ImmutableList.of( + insertRow(1, "aaa"), + insertRow(1, "ccc"), + insertRow(2, "aaa"), + insertRow(2, "ccc"))); testSqlChangeLog(TABLE_NAME, ImmutableList.of("data"), elementsPerCheckpoint, expectedRecords); } @Test public void testChangeLogOnIdDataKey() throws Exception { - List> elementsPerCheckpoint = ImmutableList.of( - ImmutableList.of( - insertRow(1, "aaa"), - deleteRow(1, "aaa"), - insertRow(2, "bbb"), - insertRow(1, "bbb"), - insertRow(2, "aaa") - ), + List> elementsPerCheckpoint = ImmutableList.of( - updateBeforeRow(2, "aaa"), - updateAfterRow(1, "ccc"), - insertRow(1, "aaa") - ), + ImmutableList.of( + insertRow(1, "aaa"), + deleteRow(1, "aaa"), + insertRow(2, "bbb"), + insertRow(1, "bbb"), + insertRow(2, "aaa")), + ImmutableList.of( + updateBeforeRow(2, "aaa"), updateAfterRow(1, "ccc"), insertRow(1, "aaa")), + ImmutableList.of(deleteRow(1, "bbb"), insertRow(2, "aaa"))); + + List> expectedRecords = ImmutableList.of( - deleteRow(1, "bbb"), - insertRow(2, "aaa") - ) - ); - - List> expectedRecords = ImmutableList.of( - ImmutableList.of(insertRow(1, "bbb"), insertRow(2, "aaa"), insertRow(2, "bbb")), - ImmutableList.of(insertRow(1, "aaa"), insertRow(1, "bbb"), insertRow(1, "ccc"), insertRow(2, "bbb")), - ImmutableList.of(insertRow(1, "aaa"), insertRow(1, "ccc"), insertRow(2, "aaa"), insertRow(2, "bbb")) - ); - - testSqlChangeLog(TABLE_NAME, ImmutableList.of("data", "id"), elementsPerCheckpoint, expectedRecords); + ImmutableList.of(insertRow(1, "bbb"), insertRow(2, "aaa"), insertRow(2, "bbb")), + ImmutableList.of( + insertRow(1, "aaa"), insertRow(1, "bbb"), insertRow(1, "ccc"), insertRow(2, "bbb")), + ImmutableList.of( + insertRow(1, "aaa"), + insertRow(1, "ccc"), + insertRow(2, "aaa"), + insertRow(2, "bbb"))); + + testSqlChangeLog( + TABLE_NAME, ImmutableList.of("data", "id"), elementsPerCheckpoint, expectedRecords); } @Test public void testPureInsertOnIdKey() throws Exception { - List> elementsPerCheckpoint = ImmutableList.of( + List> elementsPerCheckpoint = ImmutableList.of( - insertRow(1, "aaa"), - insertRow(2, "bbb") - ), - ImmutableList.of( - insertRow(3, "ccc"), - insertRow(4, "ddd") - ), - ImmutableList.of( - insertRow(5, "eee"), - insertRow(6, "fff") - ) - ); + ImmutableList.of(insertRow(1, "aaa"), insertRow(2, "bbb")), + ImmutableList.of(insertRow(3, "ccc"), insertRow(4, "ddd")), + ImmutableList.of(insertRow(5, "eee"), insertRow(6, "fff"))); - List> expectedRecords = ImmutableList.of( - ImmutableList.of( - insertRow(1, "aaa"), - insertRow(2, "bbb") - ), - ImmutableList.of( - insertRow(1, "aaa"), - insertRow(2, "bbb"), - insertRow(3, "ccc"), - insertRow(4, "ddd") - ), + List> expectedRecords = ImmutableList.of( - insertRow(1, "aaa"), - insertRow(2, "bbb"), - insertRow(3, "ccc"), - insertRow(4, "ddd"), - insertRow(5, "eee"), - insertRow(6, "fff") - ) - ); + 
ImmutableList.of(insertRow(1, "aaa"), insertRow(2, "bbb")), + ImmutableList.of( + insertRow(1, "aaa"), insertRow(2, "bbb"), insertRow(3, "ccc"), insertRow(4, "ddd")), + ImmutableList.of( + insertRow(1, "aaa"), + insertRow(2, "bbb"), + insertRow(3, "ccc"), + insertRow(4, "ddd"), + insertRow(5, "eee"), + insertRow(6, "fff"))); testSqlChangeLog(TABLE_NAME, ImmutableList.of("data"), elementsPerCheckpoint, expectedRecords); } @@ -247,13 +221,14 @@ private static Record record(int id, String data) { private Table createTable(String tableName, List key, boolean isPartitioned) { String partitionByCause = isPartitioned ? "PARTITIONED BY (data)" : ""; - sql("CREATE TABLE %s(id INT, data VARCHAR, PRIMARY KEY(%s) NOT ENFORCED) %s", + sql( + "CREATE TABLE %s(id INT, data VARCHAR, PRIMARY KEY(%s) NOT ENFORCED) %s", tableName, Joiner.on(',').join(key), partitionByCause); // Upgrade the iceberg table to format v2. - CatalogLoader loader = CatalogLoader.hadoop("my_catalog", CONF, ImmutableMap.of( - CatalogProperties.WAREHOUSE_LOCATION, warehouse - )); + CatalogLoader loader = + CatalogLoader.hadoop( + "my_catalog", CONF, ImmutableMap.of(CatalogProperties.WAREHOUSE_LOCATION, warehouse)); Table table = loader.loadCatalog().loadTable(TableIdentifier.of(DATABASE_NAME, TABLE_NAME)); TableOperations ops = ((BaseTable) table).operations(); TableMetadata meta = ops.current(); @@ -262,15 +237,20 @@ private Table createTable(String tableName, List key, boolean isPartitio return table; } - private void testSqlChangeLog(String tableName, - List key, - List> inputRowsPerCheckpoint, - List> expectedRecordsPerCheckpoint) throws Exception { + private void testSqlChangeLog( + String tableName, + List key, + List> inputRowsPerCheckpoint, + List> expectedRecordsPerCheckpoint) + throws Exception { String dataId = BoundedTableFactory.registerDataSet(inputRowsPerCheckpoint); - sql("CREATE TABLE %s(id INT NOT NULL, data STRING NOT NULL)" + - " WITH ('connector'='BoundedSource', 'data-id'='%s')", SOURCE_TABLE, dataId); + sql( + "CREATE TABLE %s(id INT NOT NULL, data STRING NOT NULL)" + + " WITH ('connector'='BoundedSource', 'data-id'='%s')", + SOURCE_TABLE, dataId); - Assert.assertEquals("Should have the expected rows", + Assert.assertEquals( + "Should have the expected rows", listJoin(inputRowsPerCheckpoint), sql("SELECT * FROM %s", SOURCE_TABLE)); @@ -280,17 +260,21 @@ private void testSqlChangeLog(String tableName, table.refresh(); List snapshots = findValidSnapshots(table); int expectedSnapshotNum = expectedRecordsPerCheckpoint.size(); - Assert.assertEquals("Should have the expected snapshot number", expectedSnapshotNum, snapshots.size()); + Assert.assertEquals( + "Should have the expected snapshot number", expectedSnapshotNum, snapshots.size()); for (int i = 0; i < expectedSnapshotNum; i++) { long snapshotId = snapshots.get(i).snapshotId(); List expectedRows = expectedRecordsPerCheckpoint.get(i); - Assert.assertEquals("Should have the expected records for the checkpoint#" + i, - expectedRowSet(table, expectedRows), actualRowSet(table, snapshotId)); + Assert.assertEquals( + "Should have the expected records for the checkpoint#" + i, + expectedRowSet(table, expectedRows), + actualRowSet(table, snapshotId)); } if (expectedSnapshotNum > 0) { - Assert.assertEquals("Should have the expected rows in the final table", + Assert.assertEquals( + "Should have the expected rows in the final table", Sets.newHashSet(expectedRecordsPerCheckpoint.get(expectedSnapshotNum - 1)), Sets.newHashSet(sql("SELECT * FROM %s", tableName))); } @@ -299,7 
+283,8 @@ private void testSqlChangeLog(String tableName, private List findValidSnapshots(Table table) { List validSnapshots = Lists.newArrayList(); for (Snapshot snapshot : table.snapshots()) { - if (snapshot.allManifests(table.io()).stream().anyMatch(m -> snapshot.snapshotId() == m.snapshotId())) { + if (snapshot.allManifests(table.io()).stream() + .anyMatch(m -> snapshot.snapshotId() == m.snapshotId())) { validSnapshots.add(snapshot); } } diff --git a/flink/v1.15/flink/src/test/java/org/apache/iceberg/flink/TestDataFileSerialization.java b/flink/v1.15/flink/src/test/java/org/apache/iceberg/flink/TestDataFileSerialization.java index fe9deb37684f..e9372adda4c1 100644 --- a/flink/v1.15/flink/src/test/java/org/apache/iceberg/flink/TestDataFileSerialization.java +++ b/flink/v1.15/flink/src/test/java/org/apache/iceberg/flink/TestDataFileSerialization.java @@ -16,9 +16,11 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.flink; +import static org.apache.iceberg.types.Types.NestedField.optional; +import static org.apache.iceberg.types.Types.NestedField.required; + import java.io.ByteArrayInputStream; import java.io.ByteArrayOutputStream; import java.io.IOException; @@ -45,21 +47,17 @@ import org.assertj.core.api.Assertions; import org.junit.Test; -import static org.apache.iceberg.types.Types.NestedField.optional; -import static org.apache.iceberg.types.Types.NestedField.required; - public class TestDataFileSerialization { - private static final Schema DATE_SCHEMA = new Schema( - required(1, "id", Types.LongType.get()), - optional(2, "data", Types.StringType.get()), - required(3, "date", Types.StringType.get()), - optional(4, "double", Types.DoubleType.get())); + private static final Schema DATE_SCHEMA = + new Schema( + required(1, "id", Types.LongType.get()), + optional(2, "data", Types.StringType.get()), + required(3, "date", Types.StringType.get()), + optional(4, "double", Types.DoubleType.get())); - private static final PartitionSpec PARTITION_SPEC = PartitionSpec - .builderFor(DATE_SCHEMA) - .identity("date") - .build(); + private static final PartitionSpec PARTITION_SPEC = + PartitionSpec.builderFor(DATE_SCHEMA).identity("date").build(); private static final Map COLUMN_SIZES = Maps.newHashMap(); private static final Map VALUE_COUNTS = Maps.newHashMap(); @@ -81,40 +79,43 @@ public class TestDataFileSerialization { UPPER_BOUNDS.put(1, longToBuffer(4L)); } - private static final Metrics METRICS = new Metrics( - 5L, null, VALUE_COUNTS, NULL_VALUE_COUNTS, NAN_VALUE_COUNTS, LOWER_BOUNDS, UPPER_BOUNDS); - - private static final DataFile DATA_FILE = DataFiles - .builder(PARTITION_SPEC) - .withPath("/path/to/data-1.parquet") - .withFileSizeInBytes(1234) - .withPartitionPath("date=2018-06-08") - .withMetrics(METRICS) - .withSplitOffsets(ImmutableList.of(4L)) - .withEncryptionKeyMetadata(ByteBuffer.allocate(4).putInt(34)) - .withSortOrder(SortOrder.unsorted()) - .build(); - - private static final DeleteFile POS_DELETE_FILE = FileMetadata.deleteFileBuilder(PARTITION_SPEC) - .ofPositionDeletes() - .withPath("/path/to/pos-delete.parquet") - .withFileSizeInBytes(10) - .withPartitionPath("date=2018-06-08") - .withMetrics(METRICS) - .withEncryptionKeyMetadata(ByteBuffer.allocate(4).putInt(35)) - .withRecordCount(23) - .build(); - - private static final DeleteFile EQ_DELETE_FILE = FileMetadata.deleteFileBuilder(PARTITION_SPEC) - .ofEqualityDeletes(2, 3) - .withPath("/path/to/equality-delete.parquet") - .withFileSizeInBytes(10) - 
.withPartitionPath("date=2018-06-08") - .withMetrics(METRICS) - .withEncryptionKeyMetadata(ByteBuffer.allocate(4).putInt(35)) - .withRecordCount(23) - .withSortOrder(SortOrder.unsorted()) - .build(); + private static final Metrics METRICS = + new Metrics( + 5L, null, VALUE_COUNTS, NULL_VALUE_COUNTS, NAN_VALUE_COUNTS, LOWER_BOUNDS, UPPER_BOUNDS); + + private static final DataFile DATA_FILE = + DataFiles.builder(PARTITION_SPEC) + .withPath("/path/to/data-1.parquet") + .withFileSizeInBytes(1234) + .withPartitionPath("date=2018-06-08") + .withMetrics(METRICS) + .withSplitOffsets(ImmutableList.of(4L)) + .withEncryptionKeyMetadata(ByteBuffer.allocate(4).putInt(34)) + .withSortOrder(SortOrder.unsorted()) + .build(); + + private static final DeleteFile POS_DELETE_FILE = + FileMetadata.deleteFileBuilder(PARTITION_SPEC) + .ofPositionDeletes() + .withPath("/path/to/pos-delete.parquet") + .withFileSizeInBytes(10) + .withPartitionPath("date=2018-06-08") + .withMetrics(METRICS) + .withEncryptionKeyMetadata(ByteBuffer.allocate(4).putInt(35)) + .withRecordCount(23) + .build(); + + private static final DeleteFile EQ_DELETE_FILE = + FileMetadata.deleteFileBuilder(PARTITION_SPEC) + .ofEqualityDeletes(2, 3) + .withPath("/path/to/equality-delete.parquet") + .withFileSizeInBytes(10) + .withPartitionPath("date=2018-06-08") + .withMetrics(METRICS) + .withEncryptionKeyMetadata(ByteBuffer.allocate(4).putInt(35)) + .withRecordCount(23) + .withSortOrder(SortOrder.unsorted()) + .build(); @Test public void testJavaSerialization() throws Exception { @@ -130,7 +131,8 @@ public void testJavaSerialization() throws Exception { out.writeObject(EQ_DELETE_FILE.copy()); } - try (ObjectInputStream in = new ObjectInputStream(new ByteArrayInputStream(bytes.toByteArray()))) { + try (ObjectInputStream in = + new ObjectInputStream(new ByteArrayInputStream(bytes.toByteArray()))) { for (int i = 0; i < 2; i += 1) { Object obj = in.readObject(); Assertions.assertThat(obj).as("Should be a DataFile").isInstanceOf(DataFile.class); @@ -139,13 +141,17 @@ public void testJavaSerialization() throws Exception { for (int i = 0; i < 2; i += 1) { Object obj = in.readObject(); - Assertions.assertThat(obj).as("Should be a position DeleteFile").isInstanceOf(DeleteFile.class); + Assertions.assertThat(obj) + .as("Should be a position DeleteFile") + .isInstanceOf(DeleteFile.class); TestHelpers.assertEquals(POS_DELETE_FILE, (DeleteFile) obj); } for (int i = 0; i < 2; i += 1) { Object obj = in.readObject(); - Assertions.assertThat(obj).as("Should be a equality DeleteFile").isInstanceOf(DeleteFile.class); + Assertions.assertThat(obj) + .as("Should be a equality DeleteFile") + .isInstanceOf(DeleteFile.class); TestHelpers.assertEquals(EQ_DELETE_FILE, (DeleteFile) obj); } } diff --git a/flink/v1.15/flink/src/test/java/org/apache/iceberg/flink/TestFixtures.java b/flink/v1.15/flink/src/test/java/org/apache/iceberg/flink/TestFixtures.java index c3c280cec3ea..884ea2d1d3b1 100644 --- a/flink/v1.15/flink/src/test/java/org/apache/iceberg/flink/TestFixtures.java +++ b/flink/v1.15/flink/src/test/java/org/apache/iceberg/flink/TestFixtures.java @@ -16,32 +16,28 @@ * specific language governing permissions and limitations * under the License. 
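The Java-serialization assertions above (here and in TestCatalogTableLoader's javaSerAndDeSer) all reduce to a plain JDK object-stream round trip. As a minimal, self-contained sketch of that pattern, assuming only the JDK (the class and method names below are illustrative and not part of this change):

    import java.io.ByteArrayInputStream;
    import java.io.ByteArrayOutputStream;
    import java.io.IOException;
    import java.io.ObjectInputStream;
    import java.io.ObjectOutputStream;

    // Hypothetical helper: serialize an object to bytes and read it back,
    // mirroring the round trip the tests use to verify Serializable types.
    final class SerializationRoundTrip {
      private SerializationRoundTrip() {}

      @SuppressWarnings("unchecked")
      static <T> T roundTrip(T object) throws IOException, ClassNotFoundException {
        ByteArrayOutputStream bytes = new ByteArrayOutputStream();
        try (ObjectOutputStream out = new ObjectOutputStream(bytes)) {
          out.writeObject(object);
        }
        try (ObjectInputStream in =
            new ObjectInputStream(new ByteArrayInputStream(bytes.toByteArray()))) {
          return (T) in.readObject();
        }
      }
    }

The deserialized copy is then compared against the original, which is what TestHelpers.assertEquals does for the DataFile and DeleteFile objects above.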
*/ - package org.apache.iceberg.flink; +import static org.apache.iceberg.types.Types.NestedField.required; + import org.apache.flink.table.types.logical.RowType; import org.apache.iceberg.PartitionSpec; import org.apache.iceberg.Schema; import org.apache.iceberg.catalog.TableIdentifier; import org.apache.iceberg.types.Types; -import static org.apache.iceberg.types.Types.NestedField.required; - public class TestFixtures { - private TestFixtures() { - - } + private TestFixtures() {} - public static final Schema SCHEMA = new Schema( - required(1, "data", Types.StringType.get()), - required(2, "id", Types.LongType.get()), - required(3, "dt", Types.StringType.get())); + public static final Schema SCHEMA = + new Schema( + required(1, "data", Types.StringType.get()), + required(2, "id", Types.LongType.get()), + required(3, "dt", Types.StringType.get())); - public static final PartitionSpec SPEC = PartitionSpec.builderFor(SCHEMA) - .identity("dt") - .bucket("id", 1) - .build(); + public static final PartitionSpec SPEC = + PartitionSpec.builderFor(SCHEMA).identity("dt").bucket("id", 1).build(); public static final RowType ROW_TYPE = FlinkSchemaUtil.convert(SCHEMA); @@ -51,13 +47,13 @@ private TestFixtures() { public static final TableIdentifier TABLE_IDENTIFIER = TableIdentifier.of(DATABASE, TABLE); - public static final Schema TS_SCHEMA = new Schema( - required(1, "ts", Types.TimestampType.withoutZone()), - required(2, "str", Types.StringType.get())); + public static final Schema TS_SCHEMA = + new Schema( + required(1, "ts", Types.TimestampType.withoutZone()), + required(2, "str", Types.StringType.get())); - public static final PartitionSpec TS_SPEC = PartitionSpec.builderFor(TS_SCHEMA) - .hour("ts") - .build(); + public static final PartitionSpec TS_SPEC = + PartitionSpec.builderFor(TS_SCHEMA).hour("ts").build(); public static final RowType TS_ROW_TYPE = FlinkSchemaUtil.convert(TS_SCHEMA); } diff --git a/flink/v1.15/flink/src/test/java/org/apache/iceberg/flink/TestFlinkCatalogDatabase.java b/flink/v1.15/flink/src/test/java/org/apache/iceberg/flink/TestFlinkCatalogDatabase.java index 180a2bc5f01b..d4de12c62300 100644 --- a/flink/v1.15/flink/src/test/java/org/apache/iceberg/flink/TestFlinkCatalogDatabase.java +++ b/flink/v1.15/flink/src/test/java/org/apache/iceberg/flink/TestFlinkCatalogDatabase.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.flink; import java.io.File; @@ -58,16 +57,21 @@ public void testCreateNamespace() { sql("CREATE DATABASE %s", flinkDatabase); - Assert.assertTrue("Database should exist", validationNamespaceCatalog.namespaceExists(icebergNamespace)); + Assert.assertTrue( + "Database should exist", validationNamespaceCatalog.namespaceExists(icebergNamespace)); sql("CREATE DATABASE IF NOT EXISTS %s", flinkDatabase); - Assert.assertTrue("Database should still exist", validationNamespaceCatalog.namespaceExists(icebergNamespace)); + Assert.assertTrue( + "Database should still exist", + validationNamespaceCatalog.namespaceExists(icebergNamespace)); sql("DROP DATABASE IF EXISTS %s", flinkDatabase); - Assert.assertFalse("Database should be dropped", validationNamespaceCatalog.namespaceExists(icebergNamespace)); + Assert.assertFalse( + "Database should be dropped", validationNamespaceCatalog.namespaceExists(icebergNamespace)); sql("CREATE DATABASE IF NOT EXISTS %s", flinkDatabase); - Assert.assertTrue("Database should be created", validationNamespaceCatalog.namespaceExists(icebergNamespace)); + Assert.assertTrue( + "Database should be created", validationNamespaceCatalog.namespaceExists(icebergNamespace)); } @Test @@ -75,9 +79,12 @@ public void testDefaultDatabase() { sql("USE CATALOG %s", catalogName); sql("SHOW TABLES"); - Assert.assertEquals("Should use the current catalog", getTableEnv().getCurrentCatalog(), catalogName); - Assert.assertEquals("Should use the configured default namespace", - getTableEnv().getCurrentDatabase(), "default"); + Assert.assertEquals( + "Should use the current catalog", getTableEnv().getCurrentCatalog(), catalogName); + Assert.assertEquals( + "Should use the configured default namespace", + getTableEnv().getCurrentDatabase(), + "default"); } @Test @@ -88,7 +95,8 @@ public void testDropEmptyDatabase() { sql("CREATE DATABASE %s", flinkDatabase); - Assert.assertTrue("Namespace should exist", validationNamespaceCatalog.namespaceExists(icebergNamespace)); + Assert.assertTrue( + "Namespace should exist", validationNamespaceCatalog.namespaceExists(icebergNamespace)); sql("DROP DATABASE %s", flinkDatabase); @@ -99,7 +107,8 @@ public void testDropEmptyDatabase() { @Test public void testDropNonEmptyNamespace() { - Assume.assumeFalse("Hadoop catalog throws IOException: Directory is not empty.", isHadoopCatalog); + Assume.assumeFalse( + "Hadoop catalog throws IOException: Directory is not empty.", isHadoopCatalog); Assert.assertFalse( "Namespace should not already exist", @@ -111,8 +120,11 @@ public void testDropNonEmptyNamespace() { TableIdentifier.of(icebergNamespace, "tl"), new Schema(Types.NestedField.optional(0, "id", Types.LongType.get()))); - Assert.assertTrue("Namespace should exist", validationNamespaceCatalog.namespaceExists(icebergNamespace)); - Assert.assertTrue("Table should exist", validationCatalog.tableExists(TableIdentifier.of(icebergNamespace, "tl"))); + Assert.assertTrue( + "Namespace should exist", validationNamespaceCatalog.namespaceExists(icebergNamespace)); + Assert.assertTrue( + "Table should exist", + validationCatalog.tableExists(TableIdentifier.of(icebergNamespace, "tl"))); AssertHelpers.assertThrowsCause( "Should fail if trying to delete a non-empty database", @@ -133,7 +145,8 @@ public void testListTables() { sql("USE CATALOG %s", catalogName); sql("USE %s", DATABASE); - Assert.assertTrue("Namespace should exist", validationNamespaceCatalog.namespaceExists(icebergNamespace)); + Assert.assertTrue( + "Namespace should exist", 
validationNamespaceCatalog.namespaceExists(icebergNamespace)); Assert.assertEquals("Should not list any tables", 0, sql("SHOW TABLES").size()); @@ -155,29 +168,35 @@ public void testListNamespace() { sql("CREATE DATABASE %s", flinkDatabase); sql("USE CATALOG %s", catalogName); - Assert.assertTrue("Namespace should exist", validationNamespaceCatalog.namespaceExists(icebergNamespace)); + Assert.assertTrue( + "Namespace should exist", validationNamespaceCatalog.namespaceExists(icebergNamespace)); List databases = sql("SHOW DATABASES"); if (isHadoopCatalog) { Assert.assertEquals("Should have 2 database", 2, databases.size()); - Assert.assertEquals("Should have db and default database", + Assert.assertEquals( + "Should have db and default database", Sets.newHashSet("default", "db"), Sets.newHashSet(databases.get(0).getField(0), databases.get(1).getField(0))); if (!baseNamespace.isEmpty()) { // test namespace not belongs to this catalog - validationNamespaceCatalog.createNamespace(Namespace.of(baseNamespace.level(0), "UNKNOWN_NAMESPACE")); + validationNamespaceCatalog.createNamespace( + Namespace.of(baseNamespace.level(0), "UNKNOWN_NAMESPACE")); databases = sql("SHOW DATABASES"); Assert.assertEquals("Should have 2 database", 2, databases.size()); - Assert.assertEquals("Should have db and default database", + Assert.assertEquals( + "Should have db and default database", Sets.newHashSet("default", "db"), Sets.newHashSet(databases.get(0).getField(0), databases.get(1).getField(0))); } } else { - // If there are multiple classes extends FlinkTestBase, TestHiveMetastore may loose the creation for default + // If there are multiple classes extends FlinkTestBase, TestHiveMetastore may loose the + // creation for default // database. See HiveMetaStore.HMSHandler.init. 
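The database tests above drive everything through Flink SQL DDL against an Iceberg catalog. A minimal standalone sketch of that setup, assuming a Flink 1.15 TableEnvironment with the Iceberg Flink runtime on the classpath (the catalog name and warehouse path below are placeholders, not values from this change):

    import org.apache.flink.table.api.EnvironmentSettings;
    import org.apache.flink.table.api.TableEnvironment;

    public class IcebergCatalogDdlSketch {
      public static void main(String[] args) {
        TableEnvironment env = TableEnvironment.create(EnvironmentSettings.inBatchMode());
        // Register a Hadoop-backed Iceberg catalog; the warehouse path is a placeholder.
        env.executeSql(
            "CREATE CATALOG my_catalog WITH ("
                + "'type'='iceberg', 'catalog-type'='hadoop', 'warehouse'='file:///tmp/warehouse')");
        env.executeSql("USE CATALOG my_catalog");
        env.executeSql("CREATE DATABASE IF NOT EXISTS db");
        env.executeSql("USE db");
        // Namespaces created this way are what validationNamespaceCatalog checks for above.
        env.executeSql("SHOW DATABASES").print();
      }
    }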
- Assert.assertTrue("Should have db database", + Assert.assertTrue( + "Should have db database", databases.stream().anyMatch(d -> Objects.equals(d.getField(0), "db"))); } } @@ -192,11 +211,14 @@ public void testCreateNamespaceWithMetadata() { sql("CREATE DATABASE %s WITH ('prop'='value')", flinkDatabase); - Assert.assertTrue("Namespace should exist", validationNamespaceCatalog.namespaceExists(icebergNamespace)); + Assert.assertTrue( + "Namespace should exist", validationNamespaceCatalog.namespaceExists(icebergNamespace)); - Map nsMetadata = validationNamespaceCatalog.loadNamespaceMetadata(icebergNamespace); + Map nsMetadata = + validationNamespaceCatalog.loadNamespaceMetadata(icebergNamespace); - Assert.assertEquals("Namespace should have expected prop value", "value", nsMetadata.get("prop")); + Assert.assertEquals( + "Namespace should have expected prop value", "value", nsMetadata.get("prop")); } @Test @@ -209,11 +231,14 @@ public void testCreateNamespaceWithComment() { sql("CREATE DATABASE %s COMMENT 'namespace doc'", flinkDatabase); - Assert.assertTrue("Namespace should exist", validationNamespaceCatalog.namespaceExists(icebergNamespace)); + Assert.assertTrue( + "Namespace should exist", validationNamespaceCatalog.namespaceExists(icebergNamespace)); - Map nsMetadata = validationNamespaceCatalog.loadNamespaceMetadata(icebergNamespace); + Map nsMetadata = + validationNamespaceCatalog.loadNamespaceMetadata(icebergNamespace); - Assert.assertEquals("Namespace should have expected comment", "namespace doc", nsMetadata.get("comment")); + Assert.assertEquals( + "Namespace should have expected comment", "namespace doc", nsMetadata.get("comment")); } @Test @@ -229,12 +254,16 @@ public void testCreateNamespaceWithLocation() throws Exception { sql("CREATE DATABASE %s WITH ('location'='%s')", flinkDatabase, location); - Assert.assertTrue("Namespace should exist", validationNamespaceCatalog.namespaceExists(icebergNamespace)); + Assert.assertTrue( + "Namespace should exist", validationNamespaceCatalog.namespaceExists(icebergNamespace)); - Map nsMetadata = validationNamespaceCatalog.loadNamespaceMetadata(icebergNamespace); + Map nsMetadata = + validationNamespaceCatalog.loadNamespaceMetadata(icebergNamespace); - Assert.assertEquals("Namespace should have expected location", - "file:" + location.getPath(), nsMetadata.get("location")); + Assert.assertEquals( + "Namespace should have expected location", + "file:" + location.getPath(), + nsMetadata.get("location")); } @Test @@ -247,16 +276,21 @@ public void testSetProperties() { sql("CREATE DATABASE %s", flinkDatabase); - Assert.assertTrue("Namespace should exist", validationNamespaceCatalog.namespaceExists(icebergNamespace)); + Assert.assertTrue( + "Namespace should exist", validationNamespaceCatalog.namespaceExists(icebergNamespace)); - Map defaultMetadata = validationNamespaceCatalog.loadNamespaceMetadata(icebergNamespace); - Assert.assertFalse("Default metadata should not have custom property", defaultMetadata.containsKey("prop")); + Map defaultMetadata = + validationNamespaceCatalog.loadNamespaceMetadata(icebergNamespace); + Assert.assertFalse( + "Default metadata should not have custom property", defaultMetadata.containsKey("prop")); sql("ALTER DATABASE %s SET ('prop'='value')", flinkDatabase); - Map nsMetadata = validationNamespaceCatalog.loadNamespaceMetadata(icebergNamespace); + Map nsMetadata = + validationNamespaceCatalog.loadNamespaceMetadata(icebergNamespace); - Assert.assertEquals("Namespace should have expected prop value", "value", 
nsMetadata.get("prop")); + Assert.assertEquals( + "Namespace should have expected prop value", "value", nsMetadata.get("prop")); } @Test diff --git a/flink/v1.15/flink/src/test/java/org/apache/iceberg/flink/TestFlinkCatalogFactory.java b/flink/v1.15/flink/src/test/java/org/apache/iceberg/flink/TestFlinkCatalogFactory.java index cd893d836dc8..f7edd5653ebd 100644 --- a/flink/v1.15/flink/src/test/java/org/apache/iceberg/flink/TestFlinkCatalogFactory.java +++ b/flink/v1.15/flink/src/test/java/org/apache/iceberg/flink/TestFlinkCatalogFactory.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.flink; import java.util.Map; @@ -46,11 +45,12 @@ public void before() { @Test public void testCreateCreateCatalogHive() { String catalogName = "hiveCatalog"; - props.put(FlinkCatalogFactory.ICEBERG_CATALOG_TYPE, FlinkCatalogFactory.ICEBERG_CATALOG_TYPE_HIVE); + props.put( + FlinkCatalogFactory.ICEBERG_CATALOG_TYPE, FlinkCatalogFactory.ICEBERG_CATALOG_TYPE_HIVE); - Catalog catalog = FlinkCatalogFactory - .createCatalogLoader(catalogName, props, new Configuration()) - .loadCatalog(); + Catalog catalog = + FlinkCatalogFactory.createCatalogLoader(catalogName, props, new Configuration()) + .loadCatalog(); Assertions.assertThat(catalog).isNotNull().isInstanceOf(HiveCatalog.class); } @@ -58,11 +58,12 @@ public void testCreateCreateCatalogHive() { @Test public void testCreateCreateCatalogHadoop() { String catalogName = "hadoopCatalog"; - props.put(FlinkCatalogFactory.ICEBERG_CATALOG_TYPE, FlinkCatalogFactory.ICEBERG_CATALOG_TYPE_HADOOP); + props.put( + FlinkCatalogFactory.ICEBERG_CATALOG_TYPE, FlinkCatalogFactory.ICEBERG_CATALOG_TYPE_HADOOP); - Catalog catalog = FlinkCatalogFactory - .createCatalogLoader(catalogName, props, new Configuration()) - .loadCatalog(); + Catalog catalog = + FlinkCatalogFactory.createCatalogLoader(catalogName, props, new Configuration()) + .loadCatalog(); Assertions.assertThat(catalog).isNotNull().isInstanceOf(HadoopCatalog.class); } @@ -72,9 +73,9 @@ public void testCreateCreateCatalogCustom() { String catalogName = "customCatalog"; props.put(CatalogProperties.CATALOG_IMPL, CustomHadoopCatalog.class.getName()); - Catalog catalog = FlinkCatalogFactory - .createCatalogLoader(catalogName, props, new Configuration()) - .loadCatalog(); + Catalog catalog = + FlinkCatalogFactory.createCatalogLoader(catalogName, props, new Configuration()) + .loadCatalog(); Assertions.assertThat(catalog).isNotNull().isInstanceOf(CustomHadoopCatalog.class); } @@ -83,13 +84,14 @@ public void testCreateCreateCatalogCustom() { public void testCreateCreateCatalogCustomWithHiveCatalogTypeSet() { String catalogName = "customCatalog"; props.put(CatalogProperties.CATALOG_IMPL, CustomHadoopCatalog.class.getName()); - props.put(FlinkCatalogFactory.ICEBERG_CATALOG_TYPE, FlinkCatalogFactory.ICEBERG_CATALOG_TYPE_HIVE); + props.put( + FlinkCatalogFactory.ICEBERG_CATALOG_TYPE, FlinkCatalogFactory.ICEBERG_CATALOG_TYPE_HIVE); AssertHelpers.assertThrows( "Should throw when both catalog-type and catalog-impl are set", IllegalArgumentException.class, - "both catalog-type and catalog-impl are set", () -> - FlinkCatalogFactory.createCatalogLoader(catalogName, props, new Configuration())); + "both catalog-type and catalog-impl are set", + () -> FlinkCatalogFactory.createCatalogLoader(catalogName, props, new Configuration())); } @Test @@ -100,20 +102,18 @@ public void testLoadCatalogUnknown() { AssertHelpers.assertThrows( "Should throw when an unregistered 
/ unknown catalog is set as the catalog factor's`type` setting", UnsupportedOperationException.class, - "Unknown catalog-type", () -> - FlinkCatalogFactory.createCatalogLoader(catalogName, props, new Configuration()) - ); + "Unknown catalog-type", + () -> FlinkCatalogFactory.createCatalogLoader(catalogName, props, new Configuration())); } public static class CustomHadoopCatalog extends HadoopCatalog { - public CustomHadoopCatalog() { - - } + public CustomHadoopCatalog() {} public CustomHadoopCatalog(Configuration conf, String warehouseLocation) { setConf(conf); - initialize("custom", ImmutableMap.of(CatalogProperties.WAREHOUSE_LOCATION, warehouseLocation)); + initialize( + "custom", ImmutableMap.of(CatalogProperties.WAREHOUSE_LOCATION, warehouseLocation)); } } } diff --git a/flink/v1.15/flink/src/test/java/org/apache/iceberg/flink/TestFlinkCatalogTable.java b/flink/v1.15/flink/src/test/java/org/apache/iceberg/flink/TestFlinkCatalogTable.java index 3bb2861d8778..45b3da5fe661 100644 --- a/flink/v1.15/flink/src/test/java/org/apache/iceberg/flink/TestFlinkCatalogTable.java +++ b/flink/v1.15/flink/src/test/java/org/apache/iceberg/flink/TestFlinkCatalogTable.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.flink; import java.util.Arrays; @@ -89,26 +88,27 @@ public void testGetTable() { sql("CREATE TABLE tl(id BIGINT, strV STRING)"); Table table = validationCatalog.loadTable(TableIdentifier.of(icebergNamespace, "tl")); - Schema iSchema = new Schema( - Types.NestedField.optional(1, "id", Types.LongType.get()), - Types.NestedField.optional(2, "strV", Types.StringType.get()) - ); - Assert.assertEquals("Should load the expected iceberg schema", iSchema.toString(), table.schema().toString()); + Schema iSchema = + new Schema( + Types.NestedField.optional(1, "id", Types.LongType.get()), + Types.NestedField.optional(2, "strV", Types.StringType.get())); + Assert.assertEquals( + "Should load the expected iceberg schema", iSchema.toString(), table.schema().toString()); } @Test public void testRenameTable() { Assume.assumeFalse("HadoopCatalog does not support rename table", isHadoopCatalog); - final Schema tableSchema = new Schema(Types.NestedField.optional(0, "id", Types.LongType.get())); + final Schema tableSchema = + new Schema(Types.NestedField.optional(0, "id", Types.LongType.get())); validationCatalog.createTable(TableIdentifier.of(icebergNamespace, "tl"), tableSchema); sql("ALTER TABLE tl RENAME TO tl2"); AssertHelpers.assertThrows( "Should fail if trying to get a nonexistent table", ValidationException.class, "Table `tl` was not found.", - () -> getTableEnv().from("tl") - ); + () -> getTableEnv().from("tl")); Schema actualSchema = FlinkSchemaUtil.convert(getTableEnv().from("tl2").getSchema()); Assert.assertEquals(tableSchema.asStruct(), actualSchema.asStruct()); } @@ -124,7 +124,8 @@ public void testCreateTable() throws TableNotExistException { Assert.assertEquals(Maps.newHashMap(), table.properties()); CatalogTable catalogTable = catalogTable("tl"); - Assert.assertEquals(TableSchema.builder().field("id", DataTypes.BIGINT()).build(), catalogTable.getSchema()); + Assert.assertEquals( + TableSchema.builder().field("id", DataTypes.BIGINT()).build(), catalogTable.getSchema()); Assert.assertEquals(Maps.newHashMap(), catalogTable.getOptions()); } @@ -133,33 +134,41 @@ public void testCreateTableWithPrimaryKey() throws Exception { sql("CREATE TABLE tl(id BIGINT, data STRING, key STRING PRIMARY KEY NOT ENFORCED)"); Table 
table = table("tl"); - Assert.assertEquals("Should have the expected row key.", + Assert.assertEquals( + "Should have the expected row key.", Sets.newHashSet(table.schema().findField("key").fieldId()), table.schema().identifierFieldIds()); CatalogTable catalogTable = catalogTable("tl"); Optional uniqueConstraintOptional = catalogTable.getSchema().getPrimaryKey(); - Assert.assertTrue("Should have the expected unique constraint", uniqueConstraintOptional.isPresent()); - Assert.assertEquals("Should have the expected columns", - ImmutableList.of("key"), uniqueConstraintOptional.get().getColumns()); + Assert.assertTrue( + "Should have the expected unique constraint", uniqueConstraintOptional.isPresent()); + Assert.assertEquals( + "Should have the expected columns", + ImmutableList.of("key"), + uniqueConstraintOptional.get().getColumns()); } @Test public void testCreateTableWithMultiColumnsInPrimaryKey() throws Exception { - sql("CREATE TABLE tl(id BIGINT, data STRING, CONSTRAINT pk_constraint PRIMARY KEY(data, id) NOT ENFORCED)"); + sql( + "CREATE TABLE tl(id BIGINT, data STRING, CONSTRAINT pk_constraint PRIMARY KEY(data, id) NOT ENFORCED)"); Table table = table("tl"); - Assert.assertEquals("Should have the expected RowKey", + Assert.assertEquals( + "Should have the expected RowKey", Sets.newHashSet( - table.schema().findField("id").fieldId(), - table.schema().findField("data").fieldId()), + table.schema().findField("id").fieldId(), table.schema().findField("data").fieldId()), table.schema().identifierFieldIds()); CatalogTable catalogTable = catalogTable("tl"); Optional uniqueConstraintOptional = catalogTable.getSchema().getPrimaryKey(); - Assert.assertTrue("Should have the expected unique constraint", uniqueConstraintOptional.isPresent()); - Assert.assertEquals("Should have the expected columns", - ImmutableSet.of("data", "id"), ImmutableSet.copyOf(uniqueConstraintOptional.get().getColumns())); + Assert.assertTrue( + "Should have the expected unique constraint", uniqueConstraintOptional.isPresent()); + Assert.assertEquals( + "Should have the expected columns", + ImmutableSet.of("data", "id"), + ImmutableSet.copyOf(uniqueConstraintOptional.get().getColumns())); } @Test @@ -170,7 +179,8 @@ public void testCreateTableIfNotExists() { Assert.assertEquals(Maps.newHashMap(), table("tl").properties()); sql("DROP TABLE tl"); - AssertHelpers.assertThrows("Table 'tl' should be dropped", + AssertHelpers.assertThrows( + "Table 'tl' should be dropped", NoSuchTableException.class, "Table does not exist: " + getFullQualifiedTableName("tl"), () -> table("tl")); @@ -179,14 +189,12 @@ public void testCreateTableIfNotExists() { Assert.assertEquals(Maps.newHashMap(), table("tl").properties()); final Map expectedProperties = ImmutableMap.of("key", "value"); - table("tl").updateProperties() - .set("key", "value") - .commit(); + table("tl").updateProperties().set("key", "value").commit(); Assert.assertEquals(expectedProperties, table("tl").properties()); sql("CREATE TABLE IF NOT EXISTS tl(id BIGINT)"); - Assert.assertEquals("Should still be the old table.", - expectedProperties, table("tl").properties()); + Assert.assertEquals( + "Should still be the old table.", expectedProperties, table("tl").properties()); } @Test @@ -201,13 +209,15 @@ public void testCreateTableLike() throws TableNotExistException { Assert.assertEquals(Maps.newHashMap(), table.properties()); CatalogTable catalogTable = catalogTable("tl2"); - Assert.assertEquals(TableSchema.builder().field("id", DataTypes.BIGINT()).build(), 
catalogTable.getSchema()); + Assert.assertEquals( + TableSchema.builder().field("id", DataTypes.BIGINT()).build(), catalogTable.getSchema()); Assert.assertEquals(Maps.newHashMap(), catalogTable.getOptions()); } @Test public void testCreateTableLocation() { - Assume.assumeFalse("HadoopCatalog does not support creating table with location", isHadoopCatalog); + Assume.assumeFalse( + "HadoopCatalog does not support creating table with location", isHadoopCatalog); sql("CREATE TABLE tl(id BIGINT) WITH ('location'='file:///tmp/location')"); @@ -226,15 +236,20 @@ public void testCreatePartitionTable() throws TableNotExistException { Table table = table("tl"); Assert.assertEquals( new Schema( - Types.NestedField.optional(1, "id", Types.LongType.get()), - Types.NestedField.optional(2, "dt", Types.StringType.get())).asStruct(), + Types.NestedField.optional(1, "id", Types.LongType.get()), + Types.NestedField.optional(2, "dt", Types.StringType.get())) + .asStruct(), table.schema().asStruct()); - Assert.assertEquals(PartitionSpec.builderFor(table.schema()).identity("dt").build(), table.spec()); + Assert.assertEquals( + PartitionSpec.builderFor(table.schema()).identity("dt").build(), table.spec()); Assert.assertEquals(Maps.newHashMap(), table.properties()); CatalogTable catalogTable = catalogTable("tl"); Assert.assertEquals( - TableSchema.builder().field("id", DataTypes.BIGINT()).field("dt", DataTypes.STRING()).build(), + TableSchema.builder() + .field("id", DataTypes.BIGINT()) + .field("dt", DataTypes.STRING()) + .build(), catalogTable.getSchema()); Assert.assertEquals(Maps.newHashMap(), catalogTable.getOptions()); Assert.assertEquals(Collections.singletonList("dt"), catalogTable.getPartitionKeys()); @@ -245,8 +260,10 @@ public void testCreateTableWithFormatV2ThroughTableProperty() throws Exception { sql("CREATE TABLE tl(id BIGINT) WITH ('format-version'='2')"); Table table = table("tl"); - Assert.assertEquals("should create table using format v2", - 2, ((BaseTable) table).operations().current().formatVersion()); + Assert.assertEquals( + "should create table using format v2", + 2, + ((BaseTable) table).operations().current().formatVersion()); } @Test @@ -255,12 +272,10 @@ public void testUpgradeTableWithFormatV2ThroughTableProperty() throws Exception Table table = table("tl"); TableOperations ops = ((BaseTable) table).operations(); - Assert.assertEquals("should create table using format v1", - 1, ops.refresh().formatVersion()); + Assert.assertEquals("should create table using format v1", 1, ops.refresh().formatVersion()); sql("ALTER TABLE tl SET('format-version'='2')"); - Assert.assertEquals("should update table to use format v2", - 2, ops.refresh().formatVersion()); + Assert.assertEquals("should update table to use format v2", 2, ops.refresh().formatVersion()); } @Test @@ -269,10 +284,10 @@ public void testDowngradeTableToFormatV1ThroughTablePropertyFails() throws Excep Table table = table("tl"); TableOperations ops = ((BaseTable) table).operations(); - Assert.assertEquals("should create table using format v2", - 2, ops.refresh().formatVersion()); + Assert.assertEquals("should create table using format v2", 2, ops.refresh().formatVersion()); - AssertHelpers.assertThrowsRootCause("should fail to downgrade to v1", + AssertHelpers.assertThrowsRootCause( + "should fail to downgrade to v1", IllegalArgumentException.class, "Cannot downgrade v2 table to v1", () -> sql("ALTER TABLE tl SET('format-version'='1')")); @@ -282,13 +297,13 @@ public void testDowngradeTableToFormatV1ThroughTablePropertyFails() throws 
Excep public void testLoadTransformPartitionTable() throws TableNotExistException { Schema schema = new Schema(Types.NestedField.optional(0, "id", Types.LongType.get())); validationCatalog.createTable( - TableIdentifier.of(icebergNamespace, "tl"), schema, + TableIdentifier.of(icebergNamespace, "tl"), + schema, PartitionSpec.builderFor(schema).bucket("id", 100).build()); CatalogTable catalogTable = catalogTable("tl"); Assert.assertEquals( - TableSchema.builder().field("id", DataTypes.BIGINT()).build(), - catalogTable.getSchema()); + TableSchema.builder().field("id", DataTypes.BIGINT()).build(), catalogTable.getSchema()); Assert.assertEquals(Maps.newHashMap(), catalogTable.getOptions()); Assert.assertEquals(Collections.emptyList(), catalogTable.getPartitionKeys()); } @@ -312,8 +327,10 @@ public void testAlterTable() throws TableNotExistException { // remove property CatalogTable catalogTable = catalogTable("tl"); properties.remove("oldK"); - getTableEnv().getCatalog(getTableEnv().getCurrentCatalog()).get().alterTable( - new ObjectPath(DATABASE, "tl"), catalogTable.copy(properties), false); + getTableEnv() + .getCatalog(getTableEnv().getCurrentCatalog()) + .get() + .alterTable(new ObjectPath(DATABASE, "tl"), catalogTable.copy(properties), false); Assert.assertEquals(properties, table("tl").properties()); } @@ -336,8 +353,10 @@ public void testAlterTableWithPrimaryKey() throws TableNotExistException { // remove property CatalogTable catalogTable = catalogTable("tl"); properties.remove("oldK"); - getTableEnv().getCatalog(getTableEnv().getCurrentCatalog()).get().alterTable( - new ObjectPath(DATABASE, "tl"), catalogTable.copy(properties), false); + getTableEnv() + .getCatalog(getTableEnv().getCurrentCatalog()) + .get() + .alterTable(new ObjectPath(DATABASE, "tl"), catalogTable.copy(properties), false); Assert.assertEquals(properties, table("tl").properties()); } @@ -356,43 +375,40 @@ public void testSetCurrentAndCherryPickSnapshotId() { Table table = table("tl"); - DataFile fileA = DataFiles.builder(table.spec()) - .withPath("/path/to/data-a.parquet") - .withFileSizeInBytes(10) - .withPartitionPath("c1=0") // easy way to set partition data for now - .withRecordCount(1) - .build(); - DataFile fileB = DataFiles.builder(table.spec()) - .withPath("/path/to/data-b.parquet") - .withFileSizeInBytes(10) - .withPartitionPath("c1=1") // easy way to set partition data for now - .withRecordCount(1) - .build(); - DataFile replacementFile = DataFiles.builder(table.spec()) - .withPath("/path/to/data-a-replacement.parquet") - .withFileSizeInBytes(10) - .withPartitionPath("c1=0") // easy way to set partition data for now - .withRecordCount(1) - .build(); - - table.newAppend() - .appendFile(fileA) - .commit(); + DataFile fileA = + DataFiles.builder(table.spec()) + .withPath("/path/to/data-a.parquet") + .withFileSizeInBytes(10) + .withPartitionPath("c1=0") // easy way to set partition data for now + .withRecordCount(1) + .build(); + DataFile fileB = + DataFiles.builder(table.spec()) + .withPath("/path/to/data-b.parquet") + .withFileSizeInBytes(10) + .withPartitionPath("c1=1") // easy way to set partition data for now + .withRecordCount(1) + .build(); + DataFile replacementFile = + DataFiles.builder(table.spec()) + .withPath("/path/to/data-a-replacement.parquet") + .withFileSizeInBytes(10) + .withPartitionPath("c1=0") // easy way to set partition data for now + .withRecordCount(1) + .build(); + + table.newAppend().appendFile(fileA).commit(); long snapshotId = table.currentSnapshot().snapshotId(); // stage an 
overwrite that replaces FILE_A - table.newReplacePartitions() - .addFile(replacementFile) - .stageOnly() - .commit(); + table.newReplacePartitions().addFile(replacementFile).stageOnly().commit(); Snapshot staged = Iterables.getLast(table.snapshots()); - Assert.assertEquals("Should find the staged overwrite snapshot", DataOperations.OVERWRITE, staged.operation()); + Assert.assertEquals( + "Should find the staged overwrite snapshot", DataOperations.OVERWRITE, staged.operation()); // add another append so that the original commit can't be fast-forwarded - table.newAppend() - .appendFile(fileB) - .commit(); + table.newAppend().appendFile(fileB).commit(); // test cherry pick sql("ALTER TABLE tl SET('cherry-pick-snapshot-id'='%s')", staged.snapshotId()); @@ -405,10 +421,13 @@ public void testSetCurrentAndCherryPickSnapshotId() { private void validateTableFiles(Table tbl, DataFile... expectedFiles) { tbl.refresh(); - Set expectedFilePaths = Arrays.stream(expectedFiles).map(DataFile::path).collect(Collectors.toSet()); - Set actualFilePaths = StreamSupport.stream(tbl.newScan().planFiles().spliterator(), false) - .map(FileScanTask::file).map(ContentFile::path) - .collect(Collectors.toSet()); + Set expectedFilePaths = + Arrays.stream(expectedFiles).map(DataFile::path).collect(Collectors.toSet()); + Set actualFilePaths = + StreamSupport.stream(tbl.newScan().planFiles().spliterator(), false) + .map(FileScanTask::file) + .map(ContentFile::path) + .collect(Collectors.toSet()); Assert.assertEquals("Files should match", expectedFilePaths, actualFilePaths); } @@ -417,7 +436,10 @@ private Table table(String name) { } private CatalogTable catalogTable(String name) throws TableNotExistException { - return (CatalogTable) getTableEnv().getCatalog(getTableEnv().getCurrentCatalog()).get() - .getTable(new ObjectPath(DATABASE, name)); + return (CatalogTable) + getTableEnv() + .getCatalog(getTableEnv().getCurrentCatalog()) + .get() + .getTable(new ObjectPath(DATABASE, name)); } } diff --git a/flink/v1.15/flink/src/test/java/org/apache/iceberg/flink/TestFlinkCatalogTablePartitions.java b/flink/v1.15/flink/src/test/java/org/apache/iceberg/flink/TestFlinkCatalogTablePartitions.java index b6c4812861f7..839700f50127 100644 --- a/flink/v1.15/flink/src/test/java/org/apache/iceberg/flink/TestFlinkCatalogTablePartitions.java +++ b/flink/v1.15/flink/src/test/java/org/apache/iceberg/flink/TestFlinkCatalogTablePartitions.java @@ -16,9 +16,10 @@ * specific language governing permissions and limitations * under the License. 
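testSetCurrentAndCherryPickSnapshotId above stages commits by appending hand-built DataFile handles rather than writing real data files. A hedged, self-contained sketch of that metadata-only pattern, assuming the Iceberg core API is on the classpath (the schema, spec, and file path below are made up for illustration):

    import org.apache.iceberg.DataFile;
    import org.apache.iceberg.DataFiles;
    import org.apache.iceberg.PartitionSpec;
    import org.apache.iceberg.Schema;
    import org.apache.iceberg.types.Types;

    public class DataFileSketch {
      public static void main(String[] args) {
        Schema schema =
            new Schema(
                Types.NestedField.required(1, "c1", Types.IntegerType.get()),
                Types.NestedField.optional(2, "c2", Types.StringType.get()));
        PartitionSpec spec = PartitionSpec.builderFor(schema).identity("c1").build();

        // Describe an existing file purely through metadata; nothing is read or written here.
        DataFile fileA =
            DataFiles.builder(spec)
                .withPath("/path/to/data-a.parquet")
                .withFileSizeInBytes(10)
                .withPartitionPath("c1=0") // easy way to set partition data, as in the test above
                .withRecordCount(1)
                .build();
        System.out.println(fileA.path());
      }
    }

Such a handle can then be committed with table.newAppend().appendFile(fileA).commit(), or staged with stageOnly() for later cherry-picking, which is exactly what the test exercises.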
*/ - package org.apache.iceberg.flink; +import static org.apache.iceberg.flink.FlinkCatalogFactory.CACHE_ENABLED; + import java.util.List; import org.apache.flink.table.catalog.CatalogPartitionSpec; import org.apache.flink.table.catalog.ObjectPath; @@ -35,18 +36,18 @@ import org.junit.Test; import org.junit.runners.Parameterized; -import static org.apache.iceberg.flink.FlinkCatalogFactory.CACHE_ENABLED; - public class TestFlinkCatalogTablePartitions extends FlinkCatalogTestBase { private String tableName = "test_table"; private final FileFormat format; - @Parameterized.Parameters(name = "catalogName={0}, baseNamespace={1}, format={2}, cacheEnabled={3}") + @Parameterized.Parameters( + name = "catalogName={0}, baseNamespace={1}, format={2}, cacheEnabled={3}") public static Iterable parameters() { List parameters = Lists.newArrayList(); - for (FileFormat format : new FileFormat[] {FileFormat.ORC, FileFormat.AVRO, FileFormat.PARQUET}) { + for (FileFormat format : + new FileFormat[] {FileFormat.ORC, FileFormat.AVRO, FileFormat.PARQUET}) { for (Boolean cacheEnabled : new Boolean[] {true, false}) { for (Object[] catalogParams : FlinkCatalogTestBase.parameters()) { String catalogName = (String) catalogParams[0]; @@ -58,8 +59,8 @@ public static Iterable parameters() { return parameters; } - public TestFlinkCatalogTablePartitions(String catalogName, Namespace baseNamespace, FileFormat format, - boolean cacheEnabled) { + public TestFlinkCatalogTablePartitions( + String catalogName, Namespace baseNamespace, FileFormat format, boolean cacheEnabled) { super(catalogName, baseNamespace); this.format = format; config.put(CACHE_ENABLED, String.valueOf(cacheEnabled)); @@ -83,20 +84,26 @@ public void cleanNamespaces() { @Test public void testListPartitionsWithUnpartitionedTable() { - sql("CREATE TABLE %s (id INT, data VARCHAR) with ('write.format.default'='%s')", + sql( + "CREATE TABLE %s (id INT, data VARCHAR) with ('write.format.default'='%s')", tableName, format.name()); sql("INSERT INTO %s SELECT 1,'a'", tableName); ObjectPath objectPath = new ObjectPath(DATABASE, tableName); FlinkCatalog flinkCatalog = (FlinkCatalog) getTableEnv().getCatalog(catalogName).get(); - AssertHelpers.assertThrows("Should not list partitions for unpartitioned table.", - TableNotPartitionedException.class, () -> flinkCatalog.listPartitions(objectPath)); + AssertHelpers.assertThrows( + "Should not list partitions for unpartitioned table.", + TableNotPartitionedException.class, + () -> flinkCatalog.listPartitions(objectPath)); } @Test - public void testListPartitionsWithPartitionedTable() throws TableNotExistException, TableNotPartitionedException { - sql("CREATE TABLE %s (id INT, data VARCHAR) PARTITIONED BY (data) " + - "with ('write.format.default'='%s')", tableName, format.name()); + public void testListPartitionsWithPartitionedTable() + throws TableNotExistException, TableNotPartitionedException { + sql( + "CREATE TABLE %s (id INT, data VARCHAR) PARTITIONED BY (data) " + + "with ('write.format.default'='%s')", + tableName, format.name()); sql("INSERT INTO %s SELECT 1,'a'", tableName); sql("INSERT INTO %s SELECT 2,'b'", tableName); diff --git a/flink/v1.15/flink/src/test/java/org/apache/iceberg/flink/TestFlinkFilters.java b/flink/v1.15/flink/src/test/java/org/apache/iceberg/flink/TestFlinkFilters.java index 0044acf57da2..c89ea4f53054 100644 --- a/flink/v1.15/flink/src/test/java/org/apache/iceberg/flink/TestFlinkFilters.java +++ b/flink/v1.15/flink/src/test/java/org/apache/iceberg/flink/TestFlinkFilters.java @@ -16,7 +16,6 @@ * 
specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.flink; import java.math.BigDecimal; @@ -56,36 +55,38 @@ public class TestFlinkFilters { - private static final TableSchema TABLE_SCHEMA = TableSchema.builder() - .field("field1", DataTypes.INT()) - .field("field2", DataTypes.BIGINT()) - .field("field3", DataTypes.FLOAT()) - .field("field4", DataTypes.DOUBLE()) - .field("field5", DataTypes.STRING()) - .field("field6", DataTypes.BOOLEAN()) - .field("field7", DataTypes.BINARY(2)) - .field("field8", DataTypes.DECIMAL(10, 2)) - .field("field9", DataTypes.DATE()) - .field("field10", DataTypes.TIME()) - .field("field11", DataTypes.TIMESTAMP()) - .field("field12", DataTypes.TIMESTAMP_WITH_LOCAL_TIME_ZONE()) - .build(); - - // A map list of fields and values used to verify the conversion of flink expression to iceberg expression - private static final List> FIELD_VALUE_LIST = ImmutableList.of( - Pair.of("field1", 1), - Pair.of("field2", 2L), - Pair.of("field3", 3F), - Pair.of("field4", 4D), - Pair.of("field5", "iceberg"), - Pair.of("field6", true), - Pair.of("field7", new byte[] {'a', 'b'}), - Pair.of("field8", BigDecimal.valueOf(10.12)), - Pair.of("field9", DateTimeUtil.daysFromDate(LocalDate.now())), - Pair.of("field10", DateTimeUtil.microsFromTime(LocalTime.now())), - Pair.of("field11", DateTimeUtil.microsFromTimestamp(LocalDateTime.now())), - Pair.of("field12", DateTimeUtil.microsFromInstant(Instant.now())) - ); + private static final TableSchema TABLE_SCHEMA = + TableSchema.builder() + .field("field1", DataTypes.INT()) + .field("field2", DataTypes.BIGINT()) + .field("field3", DataTypes.FLOAT()) + .field("field4", DataTypes.DOUBLE()) + .field("field5", DataTypes.STRING()) + .field("field6", DataTypes.BOOLEAN()) + .field("field7", DataTypes.BINARY(2)) + .field("field8", DataTypes.DECIMAL(10, 2)) + .field("field9", DataTypes.DATE()) + .field("field10", DataTypes.TIME()) + .field("field11", DataTypes.TIMESTAMP()) + .field("field12", DataTypes.TIMESTAMP_WITH_LOCAL_TIME_ZONE()) + .build(); + + // A map list of fields and values used to verify the conversion of flink expression to iceberg + // expression + private static final List> FIELD_VALUE_LIST = + ImmutableList.of( + Pair.of("field1", 1), + Pair.of("field2", 2L), + Pair.of("field3", 3F), + Pair.of("field4", 4D), + Pair.of("field5", "iceberg"), + Pair.of("field6", true), + Pair.of("field7", new byte[] {'a', 'b'}), + Pair.of("field8", BigDecimal.valueOf(10.12)), + Pair.of("field9", DateTimeUtil.daysFromDate(LocalDate.now())), + Pair.of("field10", DateTimeUtil.microsFromTime(LocalTime.now())), + Pair.of("field11", DateTimeUtil.microsFromTimestamp(LocalDateTime.now())), + Pair.of("field12", DateTimeUtil.microsFromInstant(Instant.now()))); @Test public void testFlinkDataTypeEqual() { @@ -114,15 +115,18 @@ public void testFlinkDataTypeEqual() { @Test public void testEquals() { for (Pair pair : FIELD_VALUE_LIST) { - UnboundPredicate expected = org.apache.iceberg.expressions.Expressions.equal(pair.first(), pair.second()); + UnboundPredicate expected = + org.apache.iceberg.expressions.Expressions.equal(pair.first(), pair.second()); Optional actual = - FlinkFilters.convert(resolve(Expressions.$(pair.first()).isEqual(Expressions.lit(pair.second())))); + FlinkFilters.convert( + resolve(Expressions.$(pair.first()).isEqual(Expressions.lit(pair.second())))); Assert.assertTrue("Conversion should succeed", actual.isPresent()); assertPredicatesMatch(expected, actual.get()); Optional actual1 = - 
FlinkFilters.convert(resolve(Expressions.lit(pair.second()).isEqual(Expressions.$(pair.first())))); + FlinkFilters.convert( + resolve(Expressions.lit(pair.second()).isEqual(Expressions.$(pair.first())))); Assert.assertTrue("Conversion should succeed", actual1.isPresent()); assertPredicatesMatch(expected, actual1.get()); } @@ -146,15 +150,18 @@ public void testEqualsNaN() { @Test public void testNotEquals() { for (Pair pair : FIELD_VALUE_LIST) { - UnboundPredicate expected = org.apache.iceberg.expressions.Expressions.notEqual(pair.first(), pair.second()); + UnboundPredicate expected = + org.apache.iceberg.expressions.Expressions.notEqual(pair.first(), pair.second()); Optional actual = - FlinkFilters.convert(resolve(Expressions.$(pair.first()).isNotEqual(Expressions.lit(pair.second())))); + FlinkFilters.convert( + resolve(Expressions.$(pair.first()).isNotEqual(Expressions.lit(pair.second())))); Assert.assertTrue("Conversion should succeed", actual.isPresent()); assertPredicatesMatch(expected, actual.get()); Optional actual1 = - FlinkFilters.convert(resolve(Expressions.lit(pair.second()).isNotEqual(Expressions.$(pair.first())))); + FlinkFilters.convert( + resolve(Expressions.lit(pair.second()).isNotEqual(Expressions.$(pair.first())))); Assert.assertTrue("Conversion should succeed", actual1.isPresent()); assertPredicatesMatch(expected, actual1.get()); } @@ -165,19 +172,22 @@ public void testNotEqualsNaN() { UnboundPredicate expected = org.apache.iceberg.expressions.Expressions.notNaN("field3"); Optional actual = - FlinkFilters.convert(resolve(Expressions.$("field3").isNotEqual(Expressions.lit(Float.NaN)))); + FlinkFilters.convert( + resolve(Expressions.$("field3").isNotEqual(Expressions.lit(Float.NaN)))); Assert.assertTrue("Conversion should succeed", actual.isPresent()); assertPredicatesMatch(expected, actual.get()); Optional actual1 = - FlinkFilters.convert(resolve(Expressions.lit(Float.NaN).isNotEqual(Expressions.$("field3")))); + FlinkFilters.convert( + resolve(Expressions.lit(Float.NaN).isNotEqual(Expressions.$("field3")))); Assert.assertTrue("Conversion should succeed", actual1.isPresent()); assertPredicatesMatch(expected, actual1.get()); } @Test public void testGreaterThan() { - UnboundPredicate expected = org.apache.iceberg.expressions.Expressions.greaterThan("field1", 1); + UnboundPredicate expected = + org.apache.iceberg.expressions.Expressions.greaterThan("field1", 1); Optional actual = FlinkFilters.convert(resolve(Expressions.$("field1").isGreater(Expressions.lit(1)))); @@ -192,7 +202,8 @@ public void testGreaterThan() { @Test public void testGreaterThanEquals() { - UnboundPredicate expected = org.apache.iceberg.expressions.Expressions.greaterThanOrEqual("field1", 1); + UnboundPredicate expected = + org.apache.iceberg.expressions.Expressions.greaterThanOrEqual("field1", 1); Optional actual = FlinkFilters.convert(resolve(Expressions.$("field1").isGreaterOrEqual(Expressions.lit(1)))); @@ -207,7 +218,8 @@ public void testGreaterThanEquals() { @Test public void testLessThan() { - UnboundPredicate expected = org.apache.iceberg.expressions.Expressions.lessThan("field1", 1); + UnboundPredicate expected = + org.apache.iceberg.expressions.Expressions.lessThan("field1", 1); Optional actual = FlinkFilters.convert(resolve(Expressions.$("field1").isLess(Expressions.lit(1)))); @@ -222,7 +234,8 @@ public void testLessThan() { @Test public void testLessThanEquals() { - UnboundPredicate expected = org.apache.iceberg.expressions.Expressions.lessThanOrEqual("field1", 1); + UnboundPredicate expected = 
+ org.apache.iceberg.expressions.Expressions.lessThanOrEqual("field1", 1); Optional actual = FlinkFilters.convert(resolve(Expressions.$("field1").isLessOrEqual(Expressions.lit(1)))); @@ -249,20 +262,26 @@ public void testIsNotNull() { Expression expr = resolve(Expressions.$("field1").isNotNull()); Optional actual = FlinkFilters.convert(expr); Assert.assertTrue("Conversion should succeed", actual.isPresent()); - UnboundPredicate expected = org.apache.iceberg.expressions.Expressions.notNull("field1"); + UnboundPredicate expected = + org.apache.iceberg.expressions.Expressions.notNull("field1"); assertPredicatesMatch(expected, actual.get()); } @Test public void testAnd() { - Expression expr = resolve( - Expressions.$("field1").isEqual(Expressions.lit(1)).and(Expressions.$("field2").isEqual(Expressions.lit(2L)))); + Expression expr = + resolve( + Expressions.$("field1") + .isEqual(Expressions.lit(1)) + .and(Expressions.$("field2").isEqual(Expressions.lit(2L)))); Optional actual = FlinkFilters.convert(expr); Assert.assertTrue("Conversion should succeed", actual.isPresent()); And and = (And) actual.get(); - And expected = (And) org.apache.iceberg.expressions.Expressions.and( - org.apache.iceberg.expressions.Expressions.equal("field1", 1), - org.apache.iceberg.expressions.Expressions.equal("field2", 2L)); + And expected = + (And) + org.apache.iceberg.expressions.Expressions.and( + org.apache.iceberg.expressions.Expressions.equal("field1", 1), + org.apache.iceberg.expressions.Expressions.equal("field2", 2L)); assertPredicatesMatch(expected.left(), and.left()); assertPredicatesMatch(expected.right(), and.right()); @@ -270,14 +289,19 @@ public void testAnd() { @Test public void testOr() { - Expression expr = resolve( - Expressions.$("field1").isEqual(Expressions.lit(1)).or(Expressions.$("field2").isEqual(Expressions.lit(2L)))); + Expression expr = + resolve( + Expressions.$("field1") + .isEqual(Expressions.lit(1)) + .or(Expressions.$("field2").isEqual(Expressions.lit(2L)))); Optional actual = FlinkFilters.convert(expr); Assert.assertTrue("Conversion should succeed", actual.isPresent()); Or or = (Or) actual.get(); - Or expected = (Or) org.apache.iceberg.expressions.Expressions.or( - org.apache.iceberg.expressions.Expressions.equal("field1", 1), - org.apache.iceberg.expressions.Expressions.equal("field2", 2L)); + Or expected = + (Or) + org.apache.iceberg.expressions.Expressions.or( + org.apache.iceberg.expressions.Expressions.equal("field1", 1), + org.apache.iceberg.expressions.Expressions.equal("field2", 2L)); assertPredicatesMatch(expected.left(), or.left()); assertPredicatesMatch(expected.right(), or.right()); @@ -285,13 +309,18 @@ public void testOr() { @Test public void testNot() { - Expression expr = resolve(ApiExpressionUtils.unresolvedCall( - BuiltInFunctionDefinitions.NOT, Expressions.$("field1").isEqual(Expressions.lit(1)))); + Expression expr = + resolve( + ApiExpressionUtils.unresolvedCall( + BuiltInFunctionDefinitions.NOT, + Expressions.$("field1").isEqual(Expressions.lit(1)))); Optional actual = FlinkFilters.convert(expr); Assert.assertTrue("Conversion should succeed", actual.isPresent()); Not not = (Not) actual.get(); - Not expected = (Not) org.apache.iceberg.expressions.Expressions.not( - org.apache.iceberg.expressions.Expressions.equal("field1", 1)); + Not expected = + (Not) + org.apache.iceberg.expressions.Expressions.not( + org.apache.iceberg.expressions.Expressions.equal("field1", 1)); Assert.assertEquals("Predicate operation should match", expected.op(), not.op()); 
assertPredicatesMatch(expected.child(), not.child()); @@ -299,40 +328,59 @@ public void testNot() { @Test public void testLike() { - UnboundPredicate expected = org.apache.iceberg.expressions.Expressions.startsWith("field5", "abc"); - Expression expr = resolve(ApiExpressionUtils.unresolvedCall( - BuiltInFunctionDefinitions.LIKE, Expressions.$("field5"), Expressions.lit("abc%"))); + UnboundPredicate expected = + org.apache.iceberg.expressions.Expressions.startsWith("field5", "abc"); + Expression expr = + resolve( + ApiExpressionUtils.unresolvedCall( + BuiltInFunctionDefinitions.LIKE, Expressions.$("field5"), Expressions.lit("abc%"))); Optional actual = FlinkFilters.convert(expr); Assert.assertTrue("Conversion should succeed", actual.isPresent()); assertPredicatesMatch(expected, actual.get()); - expr = resolve(ApiExpressionUtils - .unresolvedCall(BuiltInFunctionDefinitions.LIKE, Expressions.$("field5"), Expressions.lit("%abc"))); + expr = + resolve( + ApiExpressionUtils.unresolvedCall( + BuiltInFunctionDefinitions.LIKE, Expressions.$("field5"), Expressions.lit("%abc"))); actual = FlinkFilters.convert(expr); Assert.assertFalse("Conversion should failed", actual.isPresent()); - expr = resolve(ApiExpressionUtils.unresolvedCall( - BuiltInFunctionDefinitions.LIKE, Expressions.$("field5"), Expressions.lit("%abc%"))); + expr = + resolve( + ApiExpressionUtils.unresolvedCall( + BuiltInFunctionDefinitions.LIKE, + Expressions.$("field5"), + Expressions.lit("%abc%"))); actual = FlinkFilters.convert(expr); Assert.assertFalse("Conversion should failed", actual.isPresent()); - expr = resolve(ApiExpressionUtils.unresolvedCall( - BuiltInFunctionDefinitions.LIKE, Expressions.$("field5"), Expressions.lit("abc%d"))); + expr = + resolve( + ApiExpressionUtils.unresolvedCall( + BuiltInFunctionDefinitions.LIKE, + Expressions.$("field5"), + Expressions.lit("abc%d"))); actual = FlinkFilters.convert(expr); Assert.assertFalse("Conversion should failed", actual.isPresent()); - expr = resolve(ApiExpressionUtils.unresolvedCall( - BuiltInFunctionDefinitions.LIKE, Expressions.$("field5"), Expressions.lit("%"))); + expr = + resolve( + ApiExpressionUtils.unresolvedCall( + BuiltInFunctionDefinitions.LIKE, Expressions.$("field5"), Expressions.lit("%"))); actual = FlinkFilters.convert(expr); Assert.assertFalse("Conversion should failed", actual.isPresent()); - expr = resolve(ApiExpressionUtils.unresolvedCall( - BuiltInFunctionDefinitions.LIKE, Expressions.$("field5"), Expressions.lit("a_"))); + expr = + resolve( + ApiExpressionUtils.unresolvedCall( + BuiltInFunctionDefinitions.LIKE, Expressions.$("field5"), Expressions.lit("a_"))); actual = FlinkFilters.convert(expr); Assert.assertFalse("Conversion should failed", actual.isPresent()); - expr = resolve(ApiExpressionUtils.unresolvedCall( - BuiltInFunctionDefinitions.LIKE, Expressions.$("field5"), Expressions.lit("a%b"))); + expr = + resolve( + ApiExpressionUtils.unresolvedCall( + BuiltInFunctionDefinitions.LIKE, Expressions.$("field5"), Expressions.lit("a%b"))); actual = FlinkFilters.convert(expr); Assert.assertFalse("Conversion should failed", actual.isPresent()); } @@ -343,13 +391,15 @@ private void matchLiteral(String fieldName, Object flinkLiteral, T icebergLi Optional actual = FlinkFilters.convert(expr); Assert.assertTrue("Conversion should succeed", actual.isPresent()); org.apache.iceberg.expressions.Expression expression = actual.get(); - Assertions.assertThat(expression).as("The expression should be a UnboundPredicate") + Assertions.assertThat(expression) + .as("The 
expression should be a UnboundPredicate") .isInstanceOf(UnboundPredicate.class); UnboundPredicate unboundPredicate = (UnboundPredicate) expression; org.apache.iceberg.expressions.Expression expression1 = unboundPredicate.bind(FlinkSchemaUtil.convert(TABLE_SCHEMA).asStruct(), false); - Assertions.assertThat(expression1).as("The expression should be a BoundLiteralPredicate") + Assertions.assertThat(expression1) + .as("The expression should be a BoundLiteralPredicate") .isInstanceOf(BoundLiteralPredicate.class); BoundLiteralPredicate predicate = (BoundLiteralPredicate) expression1; @@ -357,49 +407,61 @@ private void matchLiteral(String fieldName, Object flinkLiteral, T icebergLi } private static Expression resolve(Expression originalExpression) { - return originalExpression.accept(new ApiExpressionDefaultVisitor() { - @Override - public Expression visit(UnresolvedReferenceExpression unresolvedReference) { - String name = unresolvedReference.getName(); - Optional field = TABLE_SCHEMA.getTableColumn(name); - if (field.isPresent()) { - int index = TABLE_SCHEMA.getTableColumns().indexOf(field.get()); - return new FieldReferenceExpression(name, field.get().getType(), 0, index); - } else { - return null; - } - } - - @Override - public Expression visit(UnresolvedCallExpression unresolvedCall) { - List children = - unresolvedCall.getChildren().stream().map(e -> (ResolvedExpression) e.accept(this)) - .collect(Collectors.toList()); - return new CallExpression(unresolvedCall.getFunctionDefinition(), children, DataTypes.STRING()); - } - - @Override - public Expression visit(ValueLiteralExpression valueLiteral) { - return valueLiteral; - } - - @Override - protected Expression defaultMethod(Expression expression) { - throw new UnsupportedOperationException(String.format("unsupported expression: %s", expression)); - } - }); + return originalExpression.accept( + new ApiExpressionDefaultVisitor() { + @Override + public Expression visit(UnresolvedReferenceExpression unresolvedReference) { + String name = unresolvedReference.getName(); + Optional field = TABLE_SCHEMA.getTableColumn(name); + if (field.isPresent()) { + int index = TABLE_SCHEMA.getTableColumns().indexOf(field.get()); + return new FieldReferenceExpression(name, field.get().getType(), 0, index); + } else { + return null; + } + } + + @Override + public Expression visit(UnresolvedCallExpression unresolvedCall) { + List children = + unresolvedCall.getChildren().stream() + .map(e -> (ResolvedExpression) e.accept(this)) + .collect(Collectors.toList()); + return new CallExpression( + unresolvedCall.getFunctionDefinition(), children, DataTypes.STRING()); + } + + @Override + public Expression visit(ValueLiteralExpression valueLiteral) { + return valueLiteral; + } + + @Override + protected Expression defaultMethod(Expression expression) { + throw new UnsupportedOperationException( + String.format("unsupported expression: %s", expression)); + } + }); } - private void assertPredicatesMatch(org.apache.iceberg.expressions.Expression expected, - org.apache.iceberg.expressions.Expression actual) { - Assertions.assertThat(expected).as("The expected expression should be a UnboundPredicate") + private void assertPredicatesMatch( + org.apache.iceberg.expressions.Expression expected, + org.apache.iceberg.expressions.Expression actual) { + Assertions.assertThat(expected) + .as("The expected expression should be a UnboundPredicate") .isInstanceOf(UnboundPredicate.class); - Assertions.assertThat(actual).as("The actual expression should be a UnboundPredicate") + 
Assertions.assertThat(actual) + .as("The actual expression should be a UnboundPredicate") .isInstanceOf(UnboundPredicate.class); UnboundPredicate predicateExpected = (UnboundPredicate) expected; UnboundPredicate predicateActual = (UnboundPredicate) actual; - Assert.assertEquals("Predicate operation should match", predicateExpected.op(), predicateActual.op()); - Assert.assertEquals("Predicate literal should match", predicateExpected.literal(), predicateActual.literal()); - Assert.assertEquals("Predicate name should match", predicateExpected.ref().name(), predicateActual.ref().name()); + Assert.assertEquals( + "Predicate operation should match", predicateExpected.op(), predicateActual.op()); + Assert.assertEquals( + "Predicate literal should match", predicateExpected.literal(), predicateActual.literal()); + Assert.assertEquals( + "Predicate name should match", + predicateExpected.ref().name(), + predicateActual.ref().name()); } } diff --git a/flink/v1.15/flink/src/test/java/org/apache/iceberg/flink/TestFlinkHiveCatalog.java b/flink/v1.15/flink/src/test/java/org/apache/iceberg/flink/TestFlinkHiveCatalog.java index 24065015795e..64746356636b 100644 --- a/flink/v1.15/flink/src/test/java/org/apache/iceberg/flink/TestFlinkHiveCatalog.java +++ b/flink/v1.15/flink/src/test/java/org/apache/iceberg/flink/TestFlinkHiveCatalog.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.flink; import java.io.File; @@ -36,8 +35,7 @@ public class TestFlinkHiveCatalog extends FlinkTestBase { - @Rule - public TemporaryFolder tempFolder = new TemporaryFolder(); + @Rule public TemporaryFolder tempFolder = new TemporaryFolder(); @Test public void testCreateCatalogWithWarehouseLocation() throws IOException { @@ -61,7 +59,8 @@ public void testCreateCatalogWithHiveConfDir() throws IOException { try (FileOutputStream fos = new FileOutputStream(hiveSiteXML)) { Configuration newConf = new Configuration(hiveConf); // Set another new directory which is different with the hive metastore's warehouse path. 
- newConf.set(HiveConf.ConfVars.METASTOREWAREHOUSE.varname, "file://" + warehouseDir.getAbsolutePath()); + newConf.set( + HiveConf.ConfVars.METASTOREWAREHOUSE.varname, "file://" + warehouseDir.getAbsolutePath()); newConf.writeXml(fos); } Assert.assertTrue("hive-site.xml should be created now.", Files.exists(hiveSiteXML.toPath())); @@ -77,8 +76,11 @@ public void testCreateCatalogWithHiveConfDir() throws IOException { checkSQLQuery(props, warehouseDir); } - private void checkSQLQuery(Map catalogProperties, File warehouseDir) throws IOException { - sql("CREATE CATALOG test_catalog WITH %s", FlinkCatalogTestBase.toWithClause(catalogProperties)); + private void checkSQLQuery(Map catalogProperties, File warehouseDir) + throws IOException { + sql( + "CREATE CATALOG test_catalog WITH %s", + FlinkCatalogTestBase.toWithClause(catalogProperties)); sql("USE CATALOG test_catalog"); sql("CREATE DATABASE test_db"); sql("USE test_db"); @@ -93,7 +95,8 @@ private void checkSQLQuery(Map catalogProperties, File warehouse Path dataPath = tablePath.resolve("data"); Assert.assertTrue("Table data path should exist", Files.exists(dataPath)); - Assert.assertEquals("Should have a .crc file and a .parquet file", 2, Files.list(dataPath).count()); + Assert.assertEquals( + "Should have a .crc file and a .parquet file", 2, Files.list(dataPath).count()); sql("DROP TABLE test_table"); sql("DROP DATABASE test_db"); diff --git a/flink/v1.15/flink/src/test/java/org/apache/iceberg/flink/TestFlinkSchemaUtil.java b/flink/v1.15/flink/src/test/java/org/apache/iceberg/flink/TestFlinkSchemaUtil.java index 01f8524464e0..b5dfb9cb2f6b 100644 --- a/flink/v1.15/flink/src/test/java/org/apache/iceberg/flink/TestFlinkSchemaUtil.java +++ b/flink/v1.15/flink/src/test/java/org/apache/iceberg/flink/TestFlinkSchemaUtil.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.flink; import org.apache.flink.table.api.DataTypes; @@ -45,194 +44,270 @@ public class TestFlinkSchemaUtil { @Test public void testConvertFlinkSchemaToIcebergSchema() { - TableSchema flinkSchema = TableSchema.builder() - .field("id", DataTypes.INT().notNull()) - .field("name", DataTypes.STRING()) /* optional by default */ - .field("salary", DataTypes.DOUBLE().notNull()) - .field("locations", DataTypes.MAP(DataTypes.STRING(), - DataTypes.ROW(DataTypes.FIELD("posX", DataTypes.DOUBLE().notNull(), "X field"), - DataTypes.FIELD("posY", DataTypes.DOUBLE().notNull(), "Y field")))) - .field("strArray", DataTypes.ARRAY(DataTypes.STRING()).nullable()) - .field("intArray", DataTypes.ARRAY(DataTypes.INT()).nullable()) - .field("char", DataTypes.CHAR(10).notNull()) - .field("varchar", DataTypes.VARCHAR(10).notNull()) - .field("boolean", DataTypes.BOOLEAN().nullable()) - .field("tinyint", DataTypes.TINYINT()) - .field("smallint", DataTypes.SMALLINT()) - .field("bigint", DataTypes.BIGINT()) - .field("varbinary", DataTypes.VARBINARY(10)) - .field("binary", DataTypes.BINARY(10)) - .field("time", DataTypes.TIME()) - .field("timestampWithoutZone", DataTypes.TIMESTAMP()) - .field("timestampWithZone", DataTypes.TIMESTAMP_WITH_LOCAL_TIME_ZONE()) - .field("date", DataTypes.DATE()) - .field("decimal", DataTypes.DECIMAL(2, 2)) - .field("decimal2", DataTypes.DECIMAL(38, 2)) - .field("decimal3", DataTypes.DECIMAL(10, 1)) - .field("multiset", DataTypes.MULTISET(DataTypes.STRING().notNull())) - .build(); + TableSchema flinkSchema = + TableSchema.builder() + .field("id", DataTypes.INT().notNull()) + .field("name", DataTypes.STRING()) /* optional by default */ + .field("salary", DataTypes.DOUBLE().notNull()) + .field( + "locations", + DataTypes.MAP( + DataTypes.STRING(), + DataTypes.ROW( + DataTypes.FIELD("posX", DataTypes.DOUBLE().notNull(), "X field"), + DataTypes.FIELD("posY", DataTypes.DOUBLE().notNull(), "Y field")))) + .field("strArray", DataTypes.ARRAY(DataTypes.STRING()).nullable()) + .field("intArray", DataTypes.ARRAY(DataTypes.INT()).nullable()) + .field("char", DataTypes.CHAR(10).notNull()) + .field("varchar", DataTypes.VARCHAR(10).notNull()) + .field("boolean", DataTypes.BOOLEAN().nullable()) + .field("tinyint", DataTypes.TINYINT()) + .field("smallint", DataTypes.SMALLINT()) + .field("bigint", DataTypes.BIGINT()) + .field("varbinary", DataTypes.VARBINARY(10)) + .field("binary", DataTypes.BINARY(10)) + .field("time", DataTypes.TIME()) + .field("timestampWithoutZone", DataTypes.TIMESTAMP()) + .field("timestampWithZone", DataTypes.TIMESTAMP_WITH_LOCAL_TIME_ZONE()) + .field("date", DataTypes.DATE()) + .field("decimal", DataTypes.DECIMAL(2, 2)) + .field("decimal2", DataTypes.DECIMAL(38, 2)) + .field("decimal3", DataTypes.DECIMAL(10, 1)) + .field("multiset", DataTypes.MULTISET(DataTypes.STRING().notNull())) + .build(); - Schema icebergSchema = new Schema( - Types.NestedField.required(0, "id", Types.IntegerType.get(), null), - Types.NestedField.optional(1, "name", Types.StringType.get(), null), - Types.NestedField.required(2, "salary", Types.DoubleType.get(), null), - Types.NestedField.optional(3, "locations", Types.MapType.ofOptional(24, 25, - Types.StringType.get(), - Types.StructType.of( - Types.NestedField.required(22, "posX", Types.DoubleType.get(), "X field"), - Types.NestedField.required(23, "posY", Types.DoubleType.get(), "Y field") - ))), - Types.NestedField.optional(4, "strArray", Types.ListType.ofOptional(26, Types.StringType.get())), - Types.NestedField.optional(5, 
"intArray", Types.ListType.ofOptional(27, Types.IntegerType.get())), - Types.NestedField.required(6, "char", Types.StringType.get()), - Types.NestedField.required(7, "varchar", Types.StringType.get()), - Types.NestedField.optional(8, "boolean", Types.BooleanType.get()), - Types.NestedField.optional(9, "tinyint", Types.IntegerType.get()), - Types.NestedField.optional(10, "smallint", Types.IntegerType.get()), - Types.NestedField.optional(11, "bigint", Types.LongType.get()), - Types.NestedField.optional(12, "varbinary", Types.BinaryType.get()), - Types.NestedField.optional(13, "binary", Types.FixedType.ofLength(10)), - Types.NestedField.optional(14, "time", Types.TimeType.get()), - Types.NestedField.optional(15, "timestampWithoutZone", Types.TimestampType.withoutZone()), - Types.NestedField.optional(16, "timestampWithZone", Types.TimestampType.withZone()), - Types.NestedField.optional(17, "date", Types.DateType.get()), - Types.NestedField.optional(18, "decimal", Types.DecimalType.of(2, 2)), - Types.NestedField.optional(19, "decimal2", Types.DecimalType.of(38, 2)), - Types.NestedField.optional(20, "decimal3", Types.DecimalType.of(10, 1)), - Types.NestedField.optional(21, "multiset", Types.MapType.ofRequired(28, 29, - Types.StringType.get(), - Types.IntegerType.get())) - ); + Schema icebergSchema = + new Schema( + Types.NestedField.required(0, "id", Types.IntegerType.get(), null), + Types.NestedField.optional(1, "name", Types.StringType.get(), null), + Types.NestedField.required(2, "salary", Types.DoubleType.get(), null), + Types.NestedField.optional( + 3, + "locations", + Types.MapType.ofOptional( + 24, + 25, + Types.StringType.get(), + Types.StructType.of( + Types.NestedField.required(22, "posX", Types.DoubleType.get(), "X field"), + Types.NestedField.required( + 23, "posY", Types.DoubleType.get(), "Y field")))), + Types.NestedField.optional( + 4, "strArray", Types.ListType.ofOptional(26, Types.StringType.get())), + Types.NestedField.optional( + 5, "intArray", Types.ListType.ofOptional(27, Types.IntegerType.get())), + Types.NestedField.required(6, "char", Types.StringType.get()), + Types.NestedField.required(7, "varchar", Types.StringType.get()), + Types.NestedField.optional(8, "boolean", Types.BooleanType.get()), + Types.NestedField.optional(9, "tinyint", Types.IntegerType.get()), + Types.NestedField.optional(10, "smallint", Types.IntegerType.get()), + Types.NestedField.optional(11, "bigint", Types.LongType.get()), + Types.NestedField.optional(12, "varbinary", Types.BinaryType.get()), + Types.NestedField.optional(13, "binary", Types.FixedType.ofLength(10)), + Types.NestedField.optional(14, "time", Types.TimeType.get()), + Types.NestedField.optional( + 15, "timestampWithoutZone", Types.TimestampType.withoutZone()), + Types.NestedField.optional(16, "timestampWithZone", Types.TimestampType.withZone()), + Types.NestedField.optional(17, "date", Types.DateType.get()), + Types.NestedField.optional(18, "decimal", Types.DecimalType.of(2, 2)), + Types.NestedField.optional(19, "decimal2", Types.DecimalType.of(38, 2)), + Types.NestedField.optional(20, "decimal3", Types.DecimalType.of(10, 1)), + Types.NestedField.optional( + 21, + "multiset", + Types.MapType.ofRequired(28, 29, Types.StringType.get(), Types.IntegerType.get()))); checkSchema(flinkSchema, icebergSchema); } @Test public void testMapField() { - TableSchema flinkSchema = TableSchema.builder() - .field("map_int_long", DataTypes.MAP(DataTypes.INT(), DataTypes.BIGINT()).notNull()) /* Required */ - .field("map_int_array_string", 
DataTypes.MAP(DataTypes.ARRAY(DataTypes.INT()), DataTypes.STRING())) - .field("map_decimal_string", DataTypes.MAP(DataTypes.DECIMAL(10, 2), DataTypes.STRING())) - .field("map_fields_fields", - DataTypes.MAP( - DataTypes.ROW( - DataTypes.FIELD("field_int", DataTypes.INT(), "doc - int"), - DataTypes.FIELD("field_string", DataTypes.STRING(), "doc - string") - ).notNull(), /* Required */ - DataTypes.ROW( - DataTypes.FIELD("field_array", DataTypes.ARRAY(DataTypes.STRING()), "doc - array") - ).notNull() /* Required */ - ).notNull() /* Required */ - ) - .build(); + TableSchema flinkSchema = + TableSchema.builder() + .field( + "map_int_long", + DataTypes.MAP(DataTypes.INT(), DataTypes.BIGINT()).notNull()) /* Required */ + .field( + "map_int_array_string", + DataTypes.MAP(DataTypes.ARRAY(DataTypes.INT()), DataTypes.STRING())) + .field( + "map_decimal_string", DataTypes.MAP(DataTypes.DECIMAL(10, 2), DataTypes.STRING())) + .field( + "map_fields_fields", + DataTypes.MAP( + DataTypes.ROW( + DataTypes.FIELD("field_int", DataTypes.INT(), "doc - int"), + DataTypes.FIELD("field_string", DataTypes.STRING(), "doc - string")) + .notNull(), /* Required */ + DataTypes.ROW( + DataTypes.FIELD( + "field_array", + DataTypes.ARRAY(DataTypes.STRING()), + "doc - array")) + .notNull() /* Required */) + .notNull() /* Required */) + .build(); - Schema icebergSchema = new Schema( - Types.NestedField.required(0, "map_int_long", - Types.MapType.ofOptional(4, 5, Types.IntegerType.get(), Types.LongType.get()), null), - Types.NestedField.optional(1, "map_int_array_string", - Types.MapType.ofOptional(7, 8, - Types.ListType.ofOptional(6, Types.IntegerType.get()), Types.StringType.get()), null), - Types.NestedField.optional(2, "map_decimal_string", Types.MapType.ofOptional(9, 10, - Types.DecimalType.of(10, 2), Types.StringType.get())), - Types.NestedField.required(3, "map_fields_fields", - Types.MapType.ofRequired( - 15, 16, - Types.StructType.of(Types.NestedField.optional(11, "field_int", Types.IntegerType.get(), "doc - int"), - Types.NestedField.optional(12, "field_string", Types.StringType.get(), "doc - string")), - Types.StructType.of(Types.NestedField.optional(14, "field_array", - Types.ListType.ofOptional(13, Types.StringType.get()), "doc - array")) - ) - ) - ); + Schema icebergSchema = + new Schema( + Types.NestedField.required( + 0, + "map_int_long", + Types.MapType.ofOptional(4, 5, Types.IntegerType.get(), Types.LongType.get()), + null), + Types.NestedField.optional( + 1, + "map_int_array_string", + Types.MapType.ofOptional( + 7, + 8, + Types.ListType.ofOptional(6, Types.IntegerType.get()), + Types.StringType.get()), + null), + Types.NestedField.optional( + 2, + "map_decimal_string", + Types.MapType.ofOptional( + 9, 10, Types.DecimalType.of(10, 2), Types.StringType.get())), + Types.NestedField.required( + 3, + "map_fields_fields", + Types.MapType.ofRequired( + 15, + 16, + Types.StructType.of( + Types.NestedField.optional( + 11, "field_int", Types.IntegerType.get(), "doc - int"), + Types.NestedField.optional( + 12, "field_string", Types.StringType.get(), "doc - string")), + Types.StructType.of( + Types.NestedField.optional( + 14, + "field_array", + Types.ListType.ofOptional(13, Types.StringType.get()), + "doc - array"))))); checkSchema(flinkSchema, icebergSchema); } @Test public void testStructField() { - TableSchema flinkSchema = TableSchema.builder() - .field("struct_int_string_decimal", DataTypes.ROW( - DataTypes.FIELD("field_int", DataTypes.INT()), - DataTypes.FIELD("field_string", DataTypes.STRING()), - 
DataTypes.FIELD("field_decimal", DataTypes.DECIMAL(19, 2)), - DataTypes.FIELD("field_struct", DataTypes.ROW( - DataTypes.FIELD("inner_struct_int", DataTypes.INT()), - DataTypes.FIELD("inner_struct_float_array", DataTypes.ARRAY(DataTypes.FLOAT())) - ).notNull()) /* Row is required */ - ).notNull()) /* Required */ - .field("struct_map_int_int", DataTypes.ROW( - DataTypes.FIELD("field_map", DataTypes.MAP(DataTypes.INT(), DataTypes.INT())) - ).nullable()) /* Optional */ - .build(); + TableSchema flinkSchema = + TableSchema.builder() + .field( + "struct_int_string_decimal", + DataTypes.ROW( + DataTypes.FIELD("field_int", DataTypes.INT()), + DataTypes.FIELD("field_string", DataTypes.STRING()), + DataTypes.FIELD("field_decimal", DataTypes.DECIMAL(19, 2)), + DataTypes.FIELD( + "field_struct", + DataTypes.ROW( + DataTypes.FIELD("inner_struct_int", DataTypes.INT()), + DataTypes.FIELD( + "inner_struct_float_array", + DataTypes.ARRAY(DataTypes.FLOAT()))) + .notNull()) /* Row is required */) + .notNull()) /* Required */ + .field( + "struct_map_int_int", + DataTypes.ROW( + DataTypes.FIELD( + "field_map", DataTypes.MAP(DataTypes.INT(), DataTypes.INT()))) + .nullable()) /* Optional */ + .build(); - Schema icebergSchema = new Schema( - Types.NestedField.required(0, "struct_int_string_decimal", - Types.StructType.of( - Types.NestedField.optional(5, "field_int", Types.IntegerType.get()), - Types.NestedField.optional(6, "field_string", Types.StringType.get()), - Types.NestedField.optional(7, "field_decimal", Types.DecimalType.of(19, 2)), - Types.NestedField.required(8, "field_struct", - Types.StructType.of( - Types.NestedField.optional(3, "inner_struct_int", Types.IntegerType.get()), - Types.NestedField.optional(4, "inner_struct_float_array", - Types.ListType.ofOptional(2, Types.FloatType.get())) - )) - )), - Types.NestedField.optional(1, "struct_map_int_int", - Types.StructType.of( - Types.NestedField.optional(11, "field_map", Types.MapType.ofOptional(9, 10, - Types.IntegerType.get(), Types.IntegerType.get())) - ) - ) - ); + Schema icebergSchema = + new Schema( + Types.NestedField.required( + 0, + "struct_int_string_decimal", + Types.StructType.of( + Types.NestedField.optional(5, "field_int", Types.IntegerType.get()), + Types.NestedField.optional(6, "field_string", Types.StringType.get()), + Types.NestedField.optional(7, "field_decimal", Types.DecimalType.of(19, 2)), + Types.NestedField.required( + 8, + "field_struct", + Types.StructType.of( + Types.NestedField.optional( + 3, "inner_struct_int", Types.IntegerType.get()), + Types.NestedField.optional( + 4, + "inner_struct_float_array", + Types.ListType.ofOptional(2, Types.FloatType.get())))))), + Types.NestedField.optional( + 1, + "struct_map_int_int", + Types.StructType.of( + Types.NestedField.optional( + 11, + "field_map", + Types.MapType.ofOptional( + 9, 10, Types.IntegerType.get(), Types.IntegerType.get()))))); checkSchema(flinkSchema, icebergSchema); } @Test public void testListField() { - TableSchema flinkSchema = TableSchema.builder() - .field("list_struct_fields", DataTypes.ARRAY( - DataTypes.ROW( - DataTypes.FIELD("field_int", DataTypes.INT()) - ) - ).notNull()) /* Required */ - .field("list_optional_struct_fields", DataTypes.ARRAY( - DataTypes.ROW( - DataTypes.FIELD( - "field_timestamp_with_local_time_zone", DataTypes.TIMESTAMP_WITH_LOCAL_TIME_ZONE() - ) - ) - ).nullable()) /* Optional */ - .field("list_map_fields", DataTypes.ARRAY( - DataTypes.MAP( - DataTypes.ARRAY(DataTypes.INT().notNull()), /* Key of map must be required */ - DataTypes.ROW( - 
DataTypes.FIELD("field_0", DataTypes.INT(), "doc - int") - ) - ).notNull() - ).notNull()) /* Required */ - .build(); + TableSchema flinkSchema = + TableSchema.builder() + .field( + "list_struct_fields", + DataTypes.ARRAY(DataTypes.ROW(DataTypes.FIELD("field_int", DataTypes.INT()))) + .notNull()) /* Required */ + .field( + "list_optional_struct_fields", + DataTypes.ARRAY( + DataTypes.ROW( + DataTypes.FIELD( + "field_timestamp_with_local_time_zone", + DataTypes.TIMESTAMP_WITH_LOCAL_TIME_ZONE()))) + .nullable()) /* Optional */ + .field( + "list_map_fields", + DataTypes.ARRAY( + DataTypes.MAP( + DataTypes.ARRAY( + DataTypes.INT().notNull()), /* Key of map must be required */ + DataTypes.ROW( + DataTypes.FIELD("field_0", DataTypes.INT(), "doc - int"))) + .notNull()) + .notNull()) /* Required */ + .build(); - Schema icebergSchema = new Schema( - Types.NestedField.required(0, "list_struct_fields", - Types.ListType.ofOptional(4, Types.StructType.of( - Types.NestedField.optional(3, "field_int", Types.IntegerType.get()) - ))), - Types.NestedField.optional(1, "list_optional_struct_fields", - Types.ListType.ofOptional(6, Types.StructType.of( - Types.NestedField.optional(5, "field_timestamp_with_local_time_zone", Types.TimestampType.withZone()) - ))), - Types.NestedField.required(2, "list_map_fields", - Types.ListType.ofRequired(11, - Types.MapType.ofOptional(9, 10, - Types.ListType.ofRequired(7, Types.IntegerType.get()), + Schema icebergSchema = + new Schema( + Types.NestedField.required( + 0, + "list_struct_fields", + Types.ListType.ofOptional( + 4, Types.StructType.of( - Types.NestedField.optional(8, "field_0", Types.IntegerType.get(), "doc - int") - ) - ) - )) - ); + Types.NestedField.optional(3, "field_int", Types.IntegerType.get())))), + Types.NestedField.optional( + 1, + "list_optional_struct_fields", + Types.ListType.ofOptional( + 6, + Types.StructType.of( + Types.NestedField.optional( + 5, + "field_timestamp_with_local_time_zone", + Types.TimestampType.withZone())))), + Types.NestedField.required( + 2, + "list_map_fields", + Types.ListType.ofRequired( + 11, + Types.MapType.ofOptional( + 9, + 10, + Types.ListType.ofRequired(7, Types.IntegerType.get()), + Types.StructType.of( + Types.NestedField.optional( + 8, "field_0", Types.IntegerType.get(), "doc - int")))))); checkSchema(flinkSchema, icebergSchema); } @@ -242,34 +317,43 @@ private void checkSchema(TableSchema flinkSchema, Schema icebergSchema) { // The conversion is not a 1:1 mapping, so we just check iceberg types. 
Assert.assertEquals( icebergSchema.asStruct(), - FlinkSchemaUtil.convert(FlinkSchemaUtil.toSchema(FlinkSchemaUtil.convert(icebergSchema))).asStruct()); + FlinkSchemaUtil.convert(FlinkSchemaUtil.toSchema(FlinkSchemaUtil.convert(icebergSchema))) + .asStruct()); } @Test public void testInconsistentTypes() { checkInconsistentType( - Types.UUIDType.get(), new BinaryType(16), - new BinaryType(16), Types.FixedType.ofLength(16)); + Types.UUIDType.get(), new BinaryType(16), new BinaryType(16), Types.FixedType.ofLength(16)); checkInconsistentType( - Types.StringType.get(), new VarCharType(VarCharType.MAX_LENGTH), - new CharType(100), Types.StringType.get()); + Types.StringType.get(), + new VarCharType(VarCharType.MAX_LENGTH), + new CharType(100), + Types.StringType.get()); checkInconsistentType( - Types.BinaryType.get(), new VarBinaryType(VarBinaryType.MAX_LENGTH), - new VarBinaryType(100), Types.BinaryType.get()); + Types.BinaryType.get(), + new VarBinaryType(VarBinaryType.MAX_LENGTH), + new VarBinaryType(100), + Types.BinaryType.get()); checkInconsistentType( - Types.TimeType.get(), new TimeType(), - new TimeType(3), Types.TimeType.get()); + Types.TimeType.get(), new TimeType(), new TimeType(3), Types.TimeType.get()); checkInconsistentType( - Types.TimestampType.withoutZone(), new TimestampType(6), - new TimestampType(3), Types.TimestampType.withoutZone()); + Types.TimestampType.withoutZone(), + new TimestampType(6), + new TimestampType(3), + Types.TimestampType.withoutZone()); checkInconsistentType( - Types.TimestampType.withZone(), new LocalZonedTimestampType(6), - new LocalZonedTimestampType(3), Types.TimestampType.withZone()); + Types.TimestampType.withZone(), + new LocalZonedTimestampType(6), + new LocalZonedTimestampType(3), + Types.TimestampType.withZone()); } private void checkInconsistentType( - Type icebergType, LogicalType flinkExpectedType, - LogicalType flinkType, Type icebergExpectedType) { + Type icebergType, + LogicalType flinkExpectedType, + LogicalType flinkType, + Type icebergExpectedType) { Assert.assertEquals(flinkExpectedType, FlinkSchemaUtil.convert(icebergType)); Assert.assertEquals( Types.StructType.of(Types.NestedField.optional(0, "f0", icebergExpectedType)), @@ -278,19 +362,19 @@ private void checkInconsistentType( @Test public void testConvertFlinkSchemaBaseOnIcebergSchema() { - Schema baseSchema = new Schema( - Lists.newArrayList( - Types.NestedField.required(101, "int", Types.IntegerType.get()), - Types.NestedField.optional(102, "string", Types.StringType.get()) - ), - Sets.newHashSet(101) - ); + Schema baseSchema = + new Schema( + Lists.newArrayList( + Types.NestedField.required(101, "int", Types.IntegerType.get()), + Types.NestedField.optional(102, "string", Types.StringType.get())), + Sets.newHashSet(101)); - TableSchema flinkSchema = TableSchema.builder() - .field("int", DataTypes.INT().notNull()) - .field("string", DataTypes.STRING().nullable()) - .primaryKey("int") - .build(); + TableSchema flinkSchema = + TableSchema.builder() + .field("int", DataTypes.INT().notNull()) + .field("string", DataTypes.STRING().nullable()) + .primaryKey("int") + .build(); Schema convertedSchema = FlinkSchemaUtil.convert(baseSchema, flinkSchema); Assert.assertEquals(baseSchema.asStruct(), convertedSchema.asStruct()); Assert.assertEquals(ImmutableSet.of(101), convertedSchema.identifierFieldIds()); @@ -298,29 +382,33 @@ public void testConvertFlinkSchemaBaseOnIcebergSchema() { @Test public void testConvertFlinkSchemaWithPrimaryKeys() { - Schema icebergSchema = new Schema( - 
Lists.newArrayList( - Types.NestedField.required(1, "int", Types.IntegerType.get()), - Types.NestedField.required(2, "string", Types.StringType.get()) - ), - Sets.newHashSet(1, 2) - ); + Schema icebergSchema = + new Schema( + Lists.newArrayList( + Types.NestedField.required(1, "int", Types.IntegerType.get()), + Types.NestedField.required(2, "string", Types.StringType.get())), + Sets.newHashSet(1, 2)); TableSchema tableSchema = FlinkSchemaUtil.toSchema(icebergSchema); Assert.assertTrue(tableSchema.getPrimaryKey().isPresent()); - Assert.assertEquals(ImmutableSet.of("int", "string"), + Assert.assertEquals( + ImmutableSet.of("int", "string"), ImmutableSet.copyOf(tableSchema.getPrimaryKey().get().getColumns())); } @Test public void testConvertFlinkSchemaWithNestedColumnInPrimaryKeys() { - Schema icebergSchema = new Schema( - Lists.newArrayList(Types.NestedField.required(1, "struct", - Types.StructType.of(Types.NestedField.required(2, "inner", Types.IntegerType.get()))) - ), - Sets.newHashSet(2) - ); - AssertHelpers.assertThrows("Does not support the nested columns in flink schema's primary keys", + Schema icebergSchema = + new Schema( + Lists.newArrayList( + Types.NestedField.required( + 1, + "struct", + Types.StructType.of( + Types.NestedField.required(2, "inner", Types.IntegerType.get())))), + Sets.newHashSet(2)); + AssertHelpers.assertThrows( + "Does not support the nested columns in flink schema's primary keys", ValidationException.class, "Column 'struct.inner' does not exist", () -> FlinkSchemaUtil.toSchema(icebergSchema)); diff --git a/flink/v1.15/flink/src/test/java/org/apache/iceberg/flink/TestFlinkTableSink.java b/flink/v1.15/flink/src/test/java/org/apache/iceberg/flink/TestFlinkTableSink.java index 76618329be7f..c4c75edd9edd 100644 --- a/flink/v1.15/flink/src/test/java/org/apache/iceberg/flink/TestFlinkTableSink.java +++ b/flink/v1.15/flink/src/test/java/org/apache/iceberg/flink/TestFlinkTableSink.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.flink; import java.util.List; @@ -59,8 +58,7 @@ public class TestFlinkTableSink extends FlinkCatalogTestBase { public static final MiniClusterWithClientResource MINI_CLUSTER_RESOURCE = MiniClusterResource.createWithClassloaderCheckDisabled(); - @ClassRule - public static final TemporaryFolder TEMPORARY_FOLDER = new TemporaryFolder(); + @ClassRule public static final TemporaryFolder TEMPORARY_FOLDER = new TemporaryFolder(); private static final String SOURCE_TABLE = "default_catalog.default_database.bounded_source"; private static final String TABLE_NAME = "test_table"; @@ -70,10 +68,12 @@ public class TestFlinkTableSink extends FlinkCatalogTestBase { private final FileFormat format; private final boolean isStreamingJob; - @Parameterized.Parameters(name = "catalogName={0}, baseNamespace={1}, format={2}, isStreaming={3}") + @Parameterized.Parameters( + name = "catalogName={0}, baseNamespace={1}, format={2}, isStreaming={3}") public static Iterable parameters() { List parameters = Lists.newArrayList(); - for (FileFormat format : new FileFormat[] {FileFormat.ORC, FileFormat.AVRO, FileFormat.PARQUET}) { + for (FileFormat format : + new FileFormat[] {FileFormat.ORC, FileFormat.AVRO, FileFormat.PARQUET}) { for (Boolean isStreaming : new Boolean[] {true, false}) { for (Object[] catalogParams : FlinkCatalogTestBase.parameters()) { String catalogName = (String) catalogParams[0]; @@ -85,7 +85,8 @@ public static Iterable parameters() { return parameters; } - public TestFlinkTableSink(String catalogName, Namespace baseNamespace, FileFormat format, Boolean isStreamingJob) { + public TestFlinkTableSink( + String catalogName, Namespace baseNamespace, FileFormat format, Boolean isStreamingJob) { super(catalogName, baseNamespace); this.format = format; this.isStreamingJob = isStreamingJob; @@ -98,8 +99,9 @@ protected TableEnvironment getTableEnv() { EnvironmentSettings.Builder settingsBuilder = EnvironmentSettings.newInstance(); if (isStreamingJob) { settingsBuilder.inStreamingMode(); - StreamExecutionEnvironment env = StreamExecutionEnvironment - .getExecutionEnvironment(MiniClusterResource.DISABLE_CLASSLOADER_CHECK_CONFIG); + StreamExecutionEnvironment env = + StreamExecutionEnvironment.getExecutionEnvironment( + MiniClusterResource.DISABLE_CLASSLOADER_CHECK_CONFIG); env.enableCheckpointing(400); env.setMaxParallelism(2); env.setParallelism(2); @@ -120,7 +122,9 @@ public void before() { sql("CREATE DATABASE %s", flinkDatabase); sql("USE CATALOG %s", catalogName); sql("USE %s", DATABASE); - sql("CREATE TABLE %s (id int, data varchar) with ('write.format.default'='%s')", TABLE_NAME, format.name()); + sql( + "CREATE TABLE %s (id int, data varchar) with ('write.format.default'='%s')", + TABLE_NAME, format.name()); icebergTable = validationCatalog.loadTable(TableIdentifier.of(icebergNamespace, TABLE_NAME)); } @@ -136,78 +140,86 @@ public void clean() { @Test public void testInsertFromSourceTable() throws Exception { // Register the rows into a temporary table. 
- getTableEnv().createTemporaryView("sourceTable", - getTableEnv().fromValues(SimpleDataUtil.FLINK_SCHEMA.toRowDataType(), - Expressions.row(1, "hello"), - Expressions.row(2, "world"), - Expressions.row(3, (String) null), - Expressions.row(null, "bar") - ) - ); + getTableEnv() + .createTemporaryView( + "sourceTable", + getTableEnv() + .fromValues( + SimpleDataUtil.FLINK_SCHEMA.toRowDataType(), + Expressions.row(1, "hello"), + Expressions.row(2, "world"), + Expressions.row(3, (String) null), + Expressions.row(null, "bar"))); // Redirect the records from source table to destination table. sql("INSERT INTO %s SELECT id,data from sourceTable", TABLE_NAME); // Assert the table records as expected. - SimpleDataUtil.assertTableRecords(icebergTable, Lists.newArrayList( - SimpleDataUtil.createRecord(1, "hello"), - SimpleDataUtil.createRecord(2, "world"), - SimpleDataUtil.createRecord(3, null), - SimpleDataUtil.createRecord(null, "bar") - )); + SimpleDataUtil.assertTableRecords( + icebergTable, + Lists.newArrayList( + SimpleDataUtil.createRecord(1, "hello"), + SimpleDataUtil.createRecord(2, "world"), + SimpleDataUtil.createRecord(3, null), + SimpleDataUtil.createRecord(null, "bar"))); } @Test public void testOverwriteTable() throws Exception { - Assume.assumeFalse("Flink unbounded streaming does not support overwrite operation", isStreamingJob); + Assume.assumeFalse( + "Flink unbounded streaming does not support overwrite operation", isStreamingJob); sql("INSERT INTO %s SELECT 1, 'a'", TABLE_NAME); - SimpleDataUtil.assertTableRecords(icebergTable, Lists.newArrayList( - SimpleDataUtil.createRecord(1, "a") - )); + SimpleDataUtil.assertTableRecords( + icebergTable, Lists.newArrayList(SimpleDataUtil.createRecord(1, "a"))); sql("INSERT OVERWRITE %s SELECT 2, 'b'", TABLE_NAME); - SimpleDataUtil.assertTableRecords(icebergTable, Lists.newArrayList( - SimpleDataUtil.createRecord(2, "b") - )); + SimpleDataUtil.assertTableRecords( + icebergTable, Lists.newArrayList(SimpleDataUtil.createRecord(2, "b"))); } @Test public void testReplacePartitions() throws Exception { - Assume.assumeFalse("Flink unbounded streaming does not support overwrite operation", isStreamingJob); + Assume.assumeFalse( + "Flink unbounded streaming does not support overwrite operation", isStreamingJob); String tableName = "test_partition"; - sql("CREATE TABLE %s(id INT, data VARCHAR) PARTITIONED BY (data) WITH ('write.format.default'='%s')", + sql( + "CREATE TABLE %s(id INT, data VARCHAR) PARTITIONED BY (data) WITH ('write.format.default'='%s')", tableName, format.name()); try { - Table partitionedTable = validationCatalog.loadTable(TableIdentifier.of(icebergNamespace, tableName)); + Table partitionedTable = + validationCatalog.loadTable(TableIdentifier.of(icebergNamespace, tableName)); sql("INSERT INTO %s SELECT 1, 'a'", tableName); sql("INSERT INTO %s SELECT 2, 'b'", tableName); sql("INSERT INTO %s SELECT 3, 'c'", tableName); - SimpleDataUtil.assertTableRecords(partitionedTable, Lists.newArrayList( - SimpleDataUtil.createRecord(1, "a"), - SimpleDataUtil.createRecord(2, "b"), - SimpleDataUtil.createRecord(3, "c") - )); + SimpleDataUtil.assertTableRecords( + partitionedTable, + Lists.newArrayList( + SimpleDataUtil.createRecord(1, "a"), + SimpleDataUtil.createRecord(2, "b"), + SimpleDataUtil.createRecord(3, "c"))); sql("INSERT OVERWRITE %s SELECT 4, 'b'", tableName); sql("INSERT OVERWRITE %s SELECT 5, 'a'", tableName); - SimpleDataUtil.assertTableRecords(partitionedTable, Lists.newArrayList( - SimpleDataUtil.createRecord(5, "a"), - 
SimpleDataUtil.createRecord(4, "b"), - SimpleDataUtil.createRecord(3, "c") - )); + SimpleDataUtil.assertTableRecords( + partitionedTable, + Lists.newArrayList( + SimpleDataUtil.createRecord(5, "a"), + SimpleDataUtil.createRecord(4, "b"), + SimpleDataUtil.createRecord(3, "c"))); sql("INSERT OVERWRITE %s PARTITION (data='a') SELECT 6", tableName); - SimpleDataUtil.assertTableRecords(partitionedTable, Lists.newArrayList( - SimpleDataUtil.createRecord(6, "a"), - SimpleDataUtil.createRecord(4, "b"), - SimpleDataUtil.createRecord(3, "c") - )); + SimpleDataUtil.assertTableRecords( + partitionedTable, + Lists.newArrayList( + SimpleDataUtil.createRecord(6, "a"), + SimpleDataUtil.createRecord(4, "b"), + SimpleDataUtil.createRecord(3, "c"))); } finally { sql("DROP TABLE IF EXISTS %s.%s", flinkDatabase, tableName); } @@ -216,34 +228,38 @@ public void testReplacePartitions() throws Exception { @Test public void testInsertIntoPartition() throws Exception { String tableName = "test_insert_into_partition"; - sql("CREATE TABLE %s(id INT, data VARCHAR) PARTITIONED BY (data) WITH ('write.format.default'='%s')", + sql( + "CREATE TABLE %s(id INT, data VARCHAR) PARTITIONED BY (data) WITH ('write.format.default'='%s')", tableName, format.name()); try { - Table partitionedTable = validationCatalog.loadTable(TableIdentifier.of(icebergNamespace, tableName)); + Table partitionedTable = + validationCatalog.loadTable(TableIdentifier.of(icebergNamespace, tableName)); // Full partition. sql("INSERT INTO %s PARTITION (data='a') SELECT 1", tableName); sql("INSERT INTO %s PARTITION (data='a') SELECT 2", tableName); sql("INSERT INTO %s PARTITION (data='b') SELECT 3", tableName); - SimpleDataUtil.assertTableRecords(partitionedTable, Lists.newArrayList( - SimpleDataUtil.createRecord(1, "a"), - SimpleDataUtil.createRecord(2, "a"), - SimpleDataUtil.createRecord(3, "b") - )); + SimpleDataUtil.assertTableRecords( + partitionedTable, + Lists.newArrayList( + SimpleDataUtil.createRecord(1, "a"), + SimpleDataUtil.createRecord(2, "a"), + SimpleDataUtil.createRecord(3, "b"))); // Partial partition. sql("INSERT INTO %s SELECT 4, 'c'", tableName); sql("INSERT INTO %s SELECT 5, 'd'", tableName); - SimpleDataUtil.assertTableRecords(partitionedTable, Lists.newArrayList( - SimpleDataUtil.createRecord(1, "a"), - SimpleDataUtil.createRecord(2, "a"), - SimpleDataUtil.createRecord(3, "b"), - SimpleDataUtil.createRecord(4, "c"), - SimpleDataUtil.createRecord(5, "d") - )); + SimpleDataUtil.assertTableRecords( + partitionedTable, + Lists.newArrayList( + SimpleDataUtil.createRecord(1, "a"), + SimpleDataUtil.createRecord(2, "a"), + SimpleDataUtil.createRecord(3, "b"), + SimpleDataUtil.createRecord(4, "c"), + SimpleDataUtil.createRecord(5, "d"))); } finally { sql("DROP TABLE IF EXISTS %s.%s", flinkDatabase, tableName); } @@ -252,34 +268,45 @@ public void testInsertIntoPartition() throws Exception { @Test public void testHashDistributeMode() throws Exception { String tableName = "test_hash_distribution_mode"; - Map tableProps = ImmutableMap.of( - "write.format.default", format.name(), - TableProperties.WRITE_DISTRIBUTION_MODE, DistributionMode.HASH.modeName() - ); + Map tableProps = + ImmutableMap.of( + "write.format.default", + format.name(), + TableProperties.WRITE_DISTRIBUTION_MODE, + DistributionMode.HASH.modeName()); // Initialize a BoundedSource table to precisely emit those rows in only one checkpoint. 
- List dataSet = IntStream.range(1, 1000) - .mapToObj(i -> ImmutableList.of(Row.of(i, "aaa"), Row.of(i, "bbb"), Row.of(i, "ccc"))) - .flatMap(List::stream) - .collect(Collectors.toList()); + List dataSet = + IntStream.range(1, 1000) + .mapToObj(i -> ImmutableList.of(Row.of(i, "aaa"), Row.of(i, "bbb"), Row.of(i, "ccc"))) + .flatMap(List::stream) + .collect(Collectors.toList()); String dataId = BoundedTableFactory.registerDataSet(ImmutableList.of(dataSet)); - sql("CREATE TABLE %s(id INT NOT NULL, data STRING NOT NULL)" + - " WITH ('connector'='BoundedSource', 'data-id'='%s')", SOURCE_TABLE, dataId); - Assert.assertEquals("Should have the expected rows in source table.", Sets.newHashSet(dataSet), + sql( + "CREATE TABLE %s(id INT NOT NULL, data STRING NOT NULL)" + + " WITH ('connector'='BoundedSource', 'data-id'='%s')", + SOURCE_TABLE, dataId); + Assert.assertEquals( + "Should have the expected rows in source table.", + Sets.newHashSet(dataSet), Sets.newHashSet(sql("SELECT * FROM %s", SOURCE_TABLE))); - sql("CREATE TABLE %s(id INT, data VARCHAR) PARTITIONED BY (data) WITH %s", + sql( + "CREATE TABLE %s(id INT, data VARCHAR) PARTITIONED BY (data) WITH %s", tableName, toWithClause(tableProps)); try { // Insert data set. sql("INSERT INTO %s SELECT * FROM %s", tableName, SOURCE_TABLE); - Assert.assertEquals("Should have the expected rows in sink table.", Sets.newHashSet(dataSet), + Assert.assertEquals( + "Should have the expected rows in sink table.", + Sets.newHashSet(dataSet), Sets.newHashSet(sql("SELECT * FROM %s", tableName))); // Sometimes we will have more than one checkpoint if we pass the auto checkpoint interval, - // thus producing multiple snapshots. Here we assert that each snapshot has only 1 file per partition. + // thus producing multiple snapshots. Here we assert that each snapshot has only 1 file per + // partition. 
Table table = validationCatalog.loadTable(TableIdentifier.of(icebergNamespace, tableName)); Map> snapshotToDataFiles = SimpleDataUtil.snapshotToDataFiles(table); for (List dataFiles : snapshotToDataFiles.values()) { @@ -287,12 +314,24 @@ public void testHashDistributeMode() throws Exception { continue; } - Assert.assertEquals("There should be 1 data file in partition 'aaa'", 1, - SimpleDataUtil.matchingPartitions(dataFiles, table.spec(), ImmutableMap.of("data", "aaa")).size()); - Assert.assertEquals("There should be 1 data file in partition 'bbb'", 1, - SimpleDataUtil.matchingPartitions(dataFiles, table.spec(), ImmutableMap.of("data", "bbb")).size()); - Assert.assertEquals("There should be 1 data file in partition 'ccc'", 1, - SimpleDataUtil.matchingPartitions(dataFiles, table.spec(), ImmutableMap.of("data", "ccc")).size()); + Assert.assertEquals( + "There should be 1 data file in partition 'aaa'", + 1, + SimpleDataUtil.matchingPartitions( + dataFiles, table.spec(), ImmutableMap.of("data", "aaa")) + .size()); + Assert.assertEquals( + "There should be 1 data file in partition 'bbb'", + 1, + SimpleDataUtil.matchingPartitions( + dataFiles, table.spec(), ImmutableMap.of("data", "bbb")) + .size()); + Assert.assertEquals( + "There should be 1 data file in partition 'ccc'", + 1, + SimpleDataUtil.matchingPartitions( + dataFiles, table.spec(), ImmutableMap.of("data", "ccc")) + .size()); } } finally { sql("DROP TABLE IF EXISTS %s.%s", flinkDatabase, tableName); diff --git a/flink/v1.15/flink/src/test/java/org/apache/iceberg/flink/TestFlinkTableSource.java b/flink/v1.15/flink/src/test/java/org/apache/iceberg/flink/TestFlinkTableSource.java index fe9c9d832a36..8f30f13db7e0 100644 --- a/flink/v1.15/flink/src/test/java/org/apache/iceberg/flink/TestFlinkTableSource.java +++ b/flink/v1.15/flink/src/test/java/org/apache/iceberg/flink/TestFlinkTableSource.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.flink; import java.io.File; @@ -52,18 +51,17 @@ public class TestFlinkTableSource extends FlinkTestBase { public TestFlinkTableSource() { // register a scan event listener to validate pushdown - Listeners.register(event -> { - scanEventCount += 1; - lastScanEvent = event; - }, ScanEvent.class); + Listeners.register( + event -> { + scanEventCount += 1; + lastScanEvent = event; + }, + ScanEvent.class); } @Override protected TableEnvironment getTableEnv() { - super.getTableEnv() - .getConfig() - .getConfiguration() - .set(CoreOptions.DEFAULT_PARALLELISM, 1); + super.getTableEnv().getConfig().getConfiguration().set(CoreOptions.DEFAULT_PARALLELISM, 1); return super.getTableEnv(); } @@ -77,14 +75,18 @@ public static void createWarehouse() throws IOException { @Before public void before() { - sql("CREATE CATALOG %s WITH ('type'='iceberg', 'catalog-type'='hadoop', 'warehouse'='%s')", CATALOG_NAME, - warehouse); + sql( + "CREATE CATALOG %s WITH ('type'='iceberg', 'catalog-type'='hadoop', 'warehouse'='%s')", + CATALOG_NAME, warehouse); sql("USE CATALOG %s", CATALOG_NAME); sql("CREATE DATABASE %s", DATABASE_NAME); sql("USE %s", DATABASE_NAME); - sql("CREATE TABLE %s (id INT, data VARCHAR,d DOUBLE) WITH ('write.format.default'='%s')", TABLE_NAME, - format.name()); - sql("INSERT INTO %s VALUES (1,'iceberg',10),(2,'b',20),(3,CAST(NULL AS VARCHAR),30)", TABLE_NAME); + sql( + "CREATE TABLE %s (id INT, data VARCHAR,d DOUBLE) WITH ('write.format.default'='%s')", + TABLE_NAME, format.name()); + sql( + "INSERT INTO %s VALUES (1,'iceberg',10),(2,'b',20),(3,CAST(NULL AS VARCHAR),30)", + TABLE_NAME); this.scanEventCount = 0; this.lastScanEvent = null; @@ -100,19 +102,19 @@ public void clean() { @Test public void testLimitPushDown() { - AssertHelpers.assertThrows("Invalid limit number: -1 ", SqlParserException.class, + AssertHelpers.assertThrows( + "Invalid limit number: -1 ", + SqlParserException.class, () -> sql("SELECT * FROM %s LIMIT -1", TABLE_NAME)); - Assert.assertEquals("Should have 0 record", 0, sql("SELECT * FROM %s LIMIT 0", TABLE_NAME).size()); + Assert.assertEquals( + "Should have 0 record", 0, sql("SELECT * FROM %s LIMIT 0", TABLE_NAME).size()); String sqlLimitExceed = String.format("SELECT * FROM %s LIMIT 4", TABLE_NAME); List resultExceed = sql(sqlLimitExceed); Assert.assertEquals("Should have 3 records", 3, resultExceed.size()); - List expectedList = Lists.newArrayList( - Row.of(1, "iceberg", 10.0), - Row.of(2, "b", 20.0), - Row.of(3, null, 30.0) - ); + List expectedList = + Lists.newArrayList(Row.of(1, "iceberg", 10.0), Row.of(2, "b", 20.0), Row.of(3, null, 30.0)); assertSameElements(expectedList, resultExceed); String querySql = String.format("SELECT * FROM %s LIMIT 1", TABLE_NAME); @@ -121,26 +123,24 @@ public void testLimitPushDown() { Assert.assertTrue("Explain should contain LimitPushDown", explain.contains(expectedExplain)); List result = sql(querySql); Assert.assertEquals("Should have 1 record", 1, result.size()); - Assertions.assertThat(result) - .containsAnyElementsOf(expectedList); + Assertions.assertThat(result).containsAnyElementsOf(expectedList); String sqlMixed = String.format("SELECT * FROM %s WHERE id = 1 LIMIT 2", TABLE_NAME); List mixedResult = sql(sqlMixed); Assert.assertEquals("Should have 1 record", 1, mixedResult.size()); - Assert.assertEquals("Should produce the expected records", Row.of(1, "iceberg", 10.0), mixedResult.get(0)); + Assert.assertEquals( + "Should produce the expected records", Row.of(1, "iceberg", 10.0), mixedResult.get(0)); } 
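For readers following the pushdown assertions in these tests, two signals are combined: the plan string returned by explainSql, and the ScanEvent that the Iceberg source publishes when a table scan is planned (the listener registered in the test constructor captures it as lastScanEvent). The sketch below is a minimal standalone illustration of that same pattern, not part of the diff itself; it assumes a Flink 1.15 batch TableEnvironment with the Iceberg Flink runtime on the classpath, and the catalog name, warehouse path, database, and table name are placeholders.

import org.apache.flink.table.api.EnvironmentSettings;
import org.apache.flink.table.api.TableEnvironment;
import org.apache.iceberg.events.Listeners;
import org.apache.iceberg.events.ScanEvent;

public class PushDownInspection {
  public static void main(String[] args) {
    // Print the filter expression the Iceberg source actually plans with.
    // ScanEvent is published by Iceberg each time a table scan is planned.
    Listeners.register(
        (ScanEvent event) -> System.out.println("Pushed filter: " + event.filter()),
        ScanEvent.class);

    TableEnvironment tEnv =
        TableEnvironment.create(EnvironmentSettings.newInstance().inBatchMode().build());

    // Placeholder catalog, warehouse, and table names; point these at an existing Iceberg table.
    tEnv.executeSql(
        "CREATE CATALOG hadoop_catalog WITH ("
            + "'type'='iceberg', 'catalog-type'='hadoop', 'warehouse'='file:///tmp/warehouse')");
    tEnv.executeSql("USE CATALOG hadoop_catalog");
    tEnv.executeSql("USE `default`");

    // A pushed-down limit and predicate should also be visible in the physical plan.
    System.out.println(tEnv.explainSql("SELECT * FROM test_table WHERE id = 1 LIMIT 1"));

    tEnv.executeSql("SELECT * FROM test_table WHERE id = 1 LIMIT 1")
        .collect()
        .forEachRemaining(System.out::println);
  }
}

In the tests that follow, the listener is registered once in the constructor and each case compares lastScanEvent.filter().toString() against the expected Iceberg expression string, which is the same check this sketch prints interactively.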
@Test public void testNoFilterPushDown() { String sql = String.format("SELECT * FROM %s ", TABLE_NAME); List result = sql(sql); - List expectedRecords = Lists.newArrayList( - Row.of(1, "iceberg", 10.0), - Row.of(2, "b", 20.0), - Row.of(3, null, 30.0) - ); + List expectedRecords = + Lists.newArrayList(Row.of(1, "iceberg", 10.0), Row.of(2, "b", 20.0), Row.of(3, null, 30.0)); assertSameElements(expectedRecords, result); - Assert.assertEquals("Should not push down a filter", Expressions.alwaysTrue(), lastScanEvent.filter()); + Assert.assertEquals( + "Should not push down a filter", Expressions.alwaysTrue(), lastScanEvent.filter()); } @Test @@ -150,10 +150,12 @@ public void testFilterPushDownEqual() { List result = sql(sqlLiteralRight); Assert.assertEquals("Should have 1 record", 1, result.size()); - Assert.assertEquals("Should produce the expected record", Row.of(1, "iceberg", 10.0), result.get(0)); + Assert.assertEquals( + "Should produce the expected record", Row.of(1, "iceberg", 10.0), result.get(0)); Assert.assertEquals("Should create only one scan", 1, scanEventCount); - Assert.assertEquals("Should contain the push down filter", expectedFilter, lastScanEvent.filter().toString()); + Assert.assertEquals( + "Should contain the push down filter", expectedFilter, lastScanEvent.filter().toString()); } @Test @@ -172,10 +174,12 @@ public void testFilterPushDownEqualLiteralOnLeft() { List resultLeft = sql(sqlLiteralLeft); Assert.assertEquals("Should have 1 record", 1, resultLeft.size()); - Assert.assertEquals("Should produce the expected record", Row.of(1, "iceberg", 10.0), resultLeft.get(0)); + Assert.assertEquals( + "Should produce the expected record", Row.of(1, "iceberg", 10.0), resultLeft.get(0)); Assert.assertEquals("Should create only one scan", 1, scanEventCount); - Assert.assertEquals("Should contain the push down filter", expectedFilter, lastScanEvent.filter().toString()); + Assert.assertEquals( + "Should contain the push down filter", expectedFilter, lastScanEvent.filter().toString()); } @Test @@ -186,13 +190,11 @@ public void testFilterPushDownNoEqual() { List resultNE = sql(sqlNE); Assert.assertEquals("Should have 2 records", 2, resultNE.size()); - List expectedNE = Lists.newArrayList( - Row.of(2, "b", 20.0), - Row.of(3, null, 30.0) - ); + List expectedNE = Lists.newArrayList(Row.of(2, "b", 20.0), Row.of(3, null, 30.0)); assertSameElements(expectedNE, resultNE); Assert.assertEquals("Should create only one scan", 1, scanEventCount); - Assert.assertEquals("Should contain the push down filter", expectedFilter, lastScanEvent.filter().toString()); + Assert.assertEquals( + "Should contain the push down filter", expectedFilter, lastScanEvent.filter().toString()); } @Test @@ -206,15 +208,18 @@ public void testFilterPushDownNoEqualNull() { @Test public void testFilterPushDownAnd() { - String sqlAnd = String.format("SELECT * FROM %s WHERE id = 1 AND data = 'iceberg' ", TABLE_NAME); + String sqlAnd = + String.format("SELECT * FROM %s WHERE id = 1 AND data = 'iceberg' ", TABLE_NAME); List resultAnd = sql(sqlAnd); Assert.assertEquals("Should have 1 record", 1, resultAnd.size()); - Assert.assertEquals("Should produce the expected record", Row.of(1, "iceberg", 10.0), resultAnd.get(0)); + Assert.assertEquals( + "Should produce the expected record", Row.of(1, "iceberg", 10.0), resultAnd.get(0)); Assert.assertEquals("Should create only one scan", 1, scanEventCount); String expected = "(ref(name=\"id\") == 1 and ref(name=\"data\") == \"iceberg\")"; - Assert.assertEquals("Should contain the push down 
filter", expected, lastScanEvent.filter().toString()); + Assert.assertEquals( + "Should contain the push down filter", expected, lastScanEvent.filter().toString()); } @Test @@ -225,14 +230,12 @@ public void testFilterPushDownOr() { List resultOr = sql(sqlOr); Assert.assertEquals("Should have 2 record", 2, resultOr.size()); - List expectedOR = Lists.newArrayList( - Row.of(1, "iceberg", 10.0), - Row.of(2, "b", 20.0) - ); + List expectedOR = Lists.newArrayList(Row.of(1, "iceberg", 10.0), Row.of(2, "b", 20.0)); assertSameElements(expectedOR, resultOr); Assert.assertEquals("Should create only one scan", 1, scanEventCount); - Assert.assertEquals("Should contain the push down filter", expectedFilter, lastScanEvent.filter().toString()); + Assert.assertEquals( + "Should contain the push down filter", expectedFilter, lastScanEvent.filter().toString()); } @Test @@ -243,14 +246,12 @@ public void testFilterPushDownGreaterThan() { List resultGT = sql(sqlGT); Assert.assertEquals("Should have 2 record", 2, resultGT.size()); - List expectedGT = Lists.newArrayList( - Row.of(2, "b", 20.0), - Row.of(3, null, 30.0) - ); + List expectedGT = Lists.newArrayList(Row.of(2, "b", 20.0), Row.of(3, null, 30.0)); assertSameElements(expectedGT, resultGT); Assert.assertEquals("Should create only one scan", 1, scanEventCount); - Assert.assertEquals("Should contain the push down filter", expectedFilter, lastScanEvent.filter().toString()); + Assert.assertEquals( + "Should contain the push down filter", expectedFilter, lastScanEvent.filter().toString()); } @Test @@ -270,14 +271,12 @@ public void testFilterPushDownGreaterThanLiteralOnLeft() { List resultGT = sql(sqlGT); Assert.assertEquals("Should have 2 records", 2, resultGT.size()); - List expectedGT = Lists.newArrayList( - Row.of(1, "iceberg", 10.0), - Row.of(2, "b", 20.0) - ); + List expectedGT = Lists.newArrayList(Row.of(1, "iceberg", 10.0), Row.of(2, "b", 20.0)); assertSameElements(expectedGT, resultGT); Assert.assertEquals("Should create only one scan", 1, scanEventCount); - Assert.assertEquals("Should contain the push down filter", expectedFilter, lastScanEvent.filter().toString()); + Assert.assertEquals( + "Should contain the push down filter", expectedFilter, lastScanEvent.filter().toString()); } @Test @@ -288,14 +287,12 @@ public void testFilterPushDownGreaterThanEqual() { List resultGTE = sql(sqlGTE); Assert.assertEquals("Should have 2 records", 2, resultGTE.size()); - List expectedGTE = Lists.newArrayList( - Row.of(2, "b", 20.0), - Row.of(3, null, 30.0) - ); + List expectedGTE = Lists.newArrayList(Row.of(2, "b", 20.0), Row.of(3, null, 30.0)); assertSameElements(expectedGTE, resultGTE); Assert.assertEquals("Should create only one scan", 1, scanEventCount); - Assert.assertEquals("Should contain the push down filter", expectedFilter, lastScanEvent.filter().toString()); + Assert.assertEquals( + "Should contain the push down filter", expectedFilter, lastScanEvent.filter().toString()); } @Test @@ -315,14 +312,12 @@ public void testFilterPushDownGreaterThanEqualLiteralOnLeft() { List resultGTE = sql(sqlGTE); Assert.assertEquals("Should have 2 records", 2, resultGTE.size()); - List expectedGTE = Lists.newArrayList( - Row.of(1, "iceberg", 10.0), - Row.of(2, "b", 20.0) - ); + List expectedGTE = Lists.newArrayList(Row.of(1, "iceberg", 10.0), Row.of(2, "b", 20.0)); assertSameElements(expectedGTE, resultGTE); Assert.assertEquals("Should create only one scan", 1, scanEventCount); - Assert.assertEquals("Should contain the push down filter", expectedFilter, 
lastScanEvent.filter().toString()); + Assert.assertEquals( + "Should contain the push down filter", expectedFilter, lastScanEvent.filter().toString()); } @Test @@ -332,10 +327,12 @@ public void testFilterPushDownLessThan() { List resultLT = sql(sqlLT); Assert.assertEquals("Should have 1 record", 1, resultLT.size()); - Assert.assertEquals("Should produce the expected record", Row.of(1, "iceberg", 10.0), resultLT.get(0)); + Assert.assertEquals( + "Should produce the expected record", Row.of(1, "iceberg", 10.0), resultLT.get(0)); Assert.assertEquals("Should create only one scan", 1, scanEventCount); - Assert.assertEquals("Should contain the push down filter", expectedFilter, lastScanEvent.filter().toString()); + Assert.assertEquals( + "Should contain the push down filter", expectedFilter, lastScanEvent.filter().toString()); } @Test @@ -354,10 +351,12 @@ public void testFilterPushDownLessThanLiteralOnLeft() { List resultLT = sql(sqlLT); Assert.assertEquals("Should have 1 record", 1, resultLT.size()); - Assert.assertEquals("Should produce the expected record", Row.of(3, null, 30.0), resultLT.get(0)); + Assert.assertEquals( + "Should produce the expected record", Row.of(3, null, 30.0), resultLT.get(0)); Assert.assertEquals("Should create only one scan", 1, scanEventCount); - Assert.assertEquals("Should contain the push down filter", expectedFilter, lastScanEvent.filter().toString()); + Assert.assertEquals( + "Should contain the push down filter", expectedFilter, lastScanEvent.filter().toString()); } @Test @@ -367,10 +366,12 @@ public void testFilterPushDownLessThanEqual() { List resultLTE = sql(sqlLTE); Assert.assertEquals("Should have 1 record", 1, resultLTE.size()); - Assert.assertEquals("Should produce the expected record", Row.of(1, "iceberg", 10.0), resultLTE.get(0)); + Assert.assertEquals( + "Should produce the expected record", Row.of(1, "iceberg", 10.0), resultLTE.get(0)); Assert.assertEquals("Should create only one scan", 1, scanEventCount); - Assert.assertEquals("Should contain the push down filter", expectedFilter, lastScanEvent.filter().toString()); + Assert.assertEquals( + "Should contain the push down filter", expectedFilter, lastScanEvent.filter().toString()); } @Test @@ -389,10 +390,12 @@ public void testFilterPushDownLessThanEqualLiteralOnLeft() { List resultLTE = sql(sqlLTE); Assert.assertEquals("Should have 1 record", 1, resultLTE.size()); - Assert.assertEquals("Should produce the expected record", Row.of(3, null, 30.0), resultLTE.get(0)); + Assert.assertEquals( + "Should produce the expected record", Row.of(3, null, 30.0), resultLTE.get(0)); Assert.assertEquals("Should create only one scan", 1, scanEventCount); - Assert.assertEquals("Should contain the push down filter", expectedFilter, lastScanEvent.filter().toString()); + Assert.assertEquals( + "Should contain the push down filter", expectedFilter, lastScanEvent.filter().toString()); } @Test @@ -402,23 +405,24 @@ public void testFilterPushDownIn() { List resultIN = sql(sqlIN); Assert.assertEquals("Should have 2 records", 2, resultIN.size()); - List expectedIN = Lists.newArrayList( - Row.of(1, "iceberg", 10.0), - Row.of(2, "b", 20.0) - ); + List expectedIN = Lists.newArrayList(Row.of(1, "iceberg", 10.0), Row.of(2, "b", 20.0)); assertSameElements(expectedIN, resultIN); Assert.assertEquals("Should create only one scan", 1, scanEventCount); - Assert.assertEquals("Should contain the push down filter", expectedFilter, lastScanEvent.filter().toString()); + Assert.assertEquals( + "Should contain the push down filter", 
expectedFilter, lastScanEvent.filter().toString()); } @Test public void testFilterPushDownInNull() { - String sqlInNull = String.format("SELECT * FROM %s WHERE data IN ('iceberg',NULL) ", TABLE_NAME); + String sqlInNull = + String.format("SELECT * FROM %s WHERE data IN ('iceberg',NULL) ", TABLE_NAME); List result = sql(sqlInNull); Assert.assertEquals("Should have 1 record", 1, result.size()); - Assert.assertEquals("Should produce the expected record", Row.of(1, "iceberg", 10.0), result.get(0)); - Assert.assertEquals("Should not push down a filter", Expressions.alwaysTrue(), lastScanEvent.filter()); + Assert.assertEquals( + "Should produce the expected record", Row.of(1, "iceberg", 10.0), result.get(0)); + Assert.assertEquals( + "Should not push down a filter", Expressions.alwaysTrue(), lastScanEvent.filter()); } @Test @@ -427,10 +431,12 @@ public void testFilterPushDownNotIn() { List resultNotIn = sql(sqlNotIn); Assert.assertEquals("Should have 1 record", 1, resultNotIn.size()); - Assert.assertEquals("Should produce the expected record", Row.of(1, "iceberg", 10.0), resultNotIn.get(0)); + Assert.assertEquals( + "Should produce the expected record", Row.of(1, "iceberg", 10.0), resultNotIn.get(0)); Assert.assertEquals("Should create only one scan", 1, scanEventCount); String expectedScan = "(ref(name=\"id\") != 2 and ref(name=\"id\") != 3)"; - Assert.assertEquals("Should contain the push down filter", expectedScan, lastScanEvent.filter().toString()); + Assert.assertEquals( + "Should contain the push down filter", expectedScan, lastScanEvent.filter().toString()); } @Test @@ -438,7 +444,8 @@ public void testFilterPushDownNotInNull() { String sqlNotInNull = String.format("SELECT * FROM %s WHERE id NOT IN (1,2,NULL) ", TABLE_NAME); List resultGT = sql(sqlNotInNull); Assert.assertEquals("Should have 0 record", 0, resultGT.size()); - Assert.assertEquals("Should not push down a filter", Expressions.alwaysTrue(), lastScanEvent.filter()); + Assert.assertEquals( + "Should not push down a filter", Expressions.alwaysTrue(), lastScanEvent.filter()); } @Test @@ -449,14 +456,12 @@ public void testFilterPushDownIsNotNull() { List resultNotNull = sql(sqlNotNull); Assert.assertEquals("Should have 2 record", 2, resultNotNull.size()); - List expected = Lists.newArrayList( - Row.of(1, "iceberg", 10.0), - Row.of(2, "b", 20.0) - ); + List expected = Lists.newArrayList(Row.of(1, "iceberg", 10.0), Row.of(2, "b", 20.0)); assertSameElements(expected, resultNotNull); Assert.assertEquals("Should create only one scan", 1, scanEventCount); - Assert.assertEquals("Should contain the push down filter", expectedFilter, lastScanEvent.filter().toString()); + Assert.assertEquals( + "Should contain the push down filter", expectedFilter, lastScanEvent.filter().toString()); } @Test @@ -466,10 +471,12 @@ public void testFilterPushDownIsNull() { List resultNull = sql(sqlNull); Assert.assertEquals("Should have 1 record", 1, resultNull.size()); - Assert.assertEquals("Should produce the expected record", Row.of(3, null, 30.0), resultNull.get(0)); + Assert.assertEquals( + "Should produce the expected record", Row.of(3, null, 30.0), resultNull.get(0)); Assert.assertEquals("Should create only one scan", 1, scanEventCount); - Assert.assertEquals("Should contain the push down filter", expectedFilter, lastScanEvent.filter().toString()); + Assert.assertEquals( + "Should contain the push down filter", expectedFilter, lastScanEvent.filter().toString()); } @Test @@ -478,11 +485,13 @@ public void testFilterPushDownNot() { List resultNot = 
sql(sqlNot); Assert.assertEquals("Should have 1 record", 1, resultNot.size()); - Assert.assertEquals("Should produce the expected record", Row.of(3, null, 30.0), resultNot.get(0)); + Assert.assertEquals( + "Should produce the expected record", Row.of(3, null, 30.0), resultNot.get(0)); Assert.assertEquals("Should create only one scan", 1, scanEventCount); String expectedFilter = "(ref(name=\"id\") != 1 and ref(name=\"id\") != 2)"; - Assert.assertEquals("Should contain the push down filter", expectedFilter, lastScanEvent.filter().toString()); + Assert.assertEquals( + "Should contain the push down filter", expectedFilter, lastScanEvent.filter().toString()); } @Test @@ -492,28 +501,30 @@ public void testFilterPushDownBetween() { List resultBetween = sql(sqlBetween); Assert.assertEquals("Should have 2 record", 2, resultBetween.size()); - List expectedBetween = Lists.newArrayList( - Row.of(1, "iceberg", 10.0), - Row.of(2, "b", 20.0) - ); + List expectedBetween = + Lists.newArrayList(Row.of(1, "iceberg", 10.0), Row.of(2, "b", 20.0)); assertSameElements(expectedBetween, resultBetween); Assert.assertEquals("Should create only one scan", 1, scanEventCount); String expected = "(ref(name=\"id\") >= 1 and ref(name=\"id\") <= 2)"; - Assert.assertEquals("Should contain the push down filter", expected, lastScanEvent.filter().toString()); + Assert.assertEquals( + "Should contain the push down filter", expected, lastScanEvent.filter().toString()); } @Test public void testFilterPushDownNotBetween() { - String sqlNotBetween = String.format("SELECT * FROM %s WHERE id NOT BETWEEN 2 AND 3 ", TABLE_NAME); + String sqlNotBetween = + String.format("SELECT * FROM %s WHERE id NOT BETWEEN 2 AND 3 ", TABLE_NAME); String expectedFilter = "(ref(name=\"id\") < 2 or ref(name=\"id\") > 3)"; List resultNotBetween = sql(sqlNotBetween); Assert.assertEquals("Should have 1 record", 1, resultNotBetween.size()); - Assert.assertEquals("Should produce the expected record", Row.of(1, "iceberg", 10.0), resultNotBetween.get(0)); + Assert.assertEquals( + "Should produce the expected record", Row.of(1, "iceberg", 10.0), resultNotBetween.get(0)); Assert.assertEquals("Should create only one scan", 1, scanEventCount); - Assert.assertEquals("Should contain the push down filter", expectedFilter, lastScanEvent.filter().toString()); + Assert.assertEquals( + "Should contain the push down filter", expectedFilter, lastScanEvent.filter().toString()); } @Test @@ -523,10 +534,13 @@ public void testFilterPushDownLike() { String sqlLike = "SELECT * FROM " + TABLE_NAME + " WHERE data LIKE 'ice%%' "; List resultLike = sql(sqlLike); Assert.assertEquals("Should have 1 record", 1, resultLike.size()); - Assert.assertEquals("The like result should produce the expected record", - Row.of(1, "iceberg", 10.0), resultLike.get(0)); + Assert.assertEquals( + "The like result should produce the expected record", + Row.of(1, "iceberg", 10.0), + resultLike.get(0)); Assert.assertEquals("Should create only one scan", 1, scanEventCount); - Assert.assertEquals("Should contain the push down filter", expectedFilter, lastScanEvent.filter().toString()); + Assert.assertEquals( + "Should contain the push down filter", expectedFilter, lastScanEvent.filter().toString()); } @Test @@ -535,85 +549,105 @@ public void testFilterNotPushDownLike() { String sqlNoPushDown = "SELECT * FROM " + TABLE_NAME + " WHERE data LIKE '%%i' "; List resultLike = sql(sqlNoPushDown); Assert.assertEquals("Should have 1 record", 0, resultLike.size()); - Assert.assertEquals("Should not push down a filter", 
Expressions.alwaysTrue(), lastScanEvent.filter()); + Assert.assertEquals( + "Should not push down a filter", Expressions.alwaysTrue(), lastScanEvent.filter()); sqlNoPushDown = "SELECT * FROM " + TABLE_NAME + " WHERE data LIKE '%%i%%' "; resultLike = sql(sqlNoPushDown); Assert.assertEquals("Should have 1 record", 1, resultLike.size()); Assert.assertEquals("Should produce the expected record", expectRecord, resultLike.get(0)); - Assert.assertEquals("Should not push down a filter", Expressions.alwaysTrue(), lastScanEvent.filter()); + Assert.assertEquals( + "Should not push down a filter", Expressions.alwaysTrue(), lastScanEvent.filter()); sqlNoPushDown = "SELECT * FROM " + TABLE_NAME + " WHERE data LIKE '%%ice%%g' "; resultLike = sql(sqlNoPushDown); Assert.assertEquals("Should have 1 record", 1, resultLike.size()); Assert.assertEquals("Should produce the expected record", expectRecord, resultLike.get(0)); - Assert.assertEquals("Should not push down a filter", Expressions.alwaysTrue(), lastScanEvent.filter()); + Assert.assertEquals( + "Should not push down a filter", Expressions.alwaysTrue(), lastScanEvent.filter()); sqlNoPushDown = "SELECT * FROM " + TABLE_NAME + " WHERE data LIKE '%%' "; resultLike = sql(sqlNoPushDown); Assert.assertEquals("Should have 3 records", 3, resultLike.size()); - List expectedRecords = Lists.newArrayList( - Row.of(1, "iceberg", 10.0), - Row.of(2, "b", 20.0), - Row.of(3, null, 30.0) - ); + List expectedRecords = + Lists.newArrayList(Row.of(1, "iceberg", 10.0), Row.of(2, "b", 20.0), Row.of(3, null, 30.0)); assertSameElements(expectedRecords, resultLike); - Assert.assertEquals("Should not push down a filter", Expressions.alwaysTrue(), lastScanEvent.filter()); + Assert.assertEquals( + "Should not push down a filter", Expressions.alwaysTrue(), lastScanEvent.filter()); sqlNoPushDown = "SELECT * FROM " + TABLE_NAME + " WHERE data LIKE 'iceber_' "; resultLike = sql(sqlNoPushDown); Assert.assertEquals("Should have 1 record", 1, resultLike.size()); Assert.assertEquals("Should produce the expected record", expectRecord, resultLike.get(0)); - Assert.assertEquals("Should not push down a filter", Expressions.alwaysTrue(), lastScanEvent.filter()); + Assert.assertEquals( + "Should not push down a filter", Expressions.alwaysTrue(), lastScanEvent.filter()); sqlNoPushDown = "SELECT * FROM " + TABLE_NAME + " WHERE data LIKE 'i%%g' "; resultLike = sql(sqlNoPushDown); Assert.assertEquals("Should have 1 record", 1, resultLike.size()); Assert.assertEquals("Should produce the expected record", expectRecord, resultLike.get(0)); - Assert.assertEquals("Should not push down a filter", Expressions.alwaysTrue(), lastScanEvent.filter()); + Assert.assertEquals( + "Should not push down a filter", Expressions.alwaysTrue(), lastScanEvent.filter()); } @Test public void testFilterPushDown2Literal() { String sql2Literal = String.format("SELECT * FROM %s WHERE 1 > 0 ", TABLE_NAME); List result = sql(sql2Literal); - List expectedRecords = Lists.newArrayList( - Row.of(1, "iceberg", 10.0), - Row.of(2, "b", 20.0), - Row.of(3, null, 30.0) - ); + List expectedRecords = + Lists.newArrayList(Row.of(1, "iceberg", 10.0), Row.of(2, "b", 20.0), Row.of(3, null, 30.0)); assertSameElements(expectedRecords, result); - Assert.assertEquals("Should not push down a filter", Expressions.alwaysTrue(), lastScanEvent.filter()); + Assert.assertEquals( + "Should not push down a filter", Expressions.alwaysTrue(), lastScanEvent.filter()); } /** - * NaN is not supported by flink now, so we add the test case to assert the parse error, 
when we upgrade the flink - * that supports NaN, we will delele the method, and add some test case to test NaN. + * NaN is not supported by flink now, so we add the test case to assert the parse error, when we + * upgrade the flink that supports NaN, we will delele the method, and add some test case to test + * NaN. */ @Test public void testSqlParseError() { - String sqlParseErrorEqual = String.format("SELECT * FROM %s WHERE d = CAST('NaN' AS DOUBLE) ", TABLE_NAME); - AssertHelpers.assertThrows("The NaN is not supported by flink now. ", - NumberFormatException.class, () -> sql(sqlParseErrorEqual)); - - String sqlParseErrorNotEqual = String.format("SELECT * FROM %s WHERE d <> CAST('NaN' AS DOUBLE) ", TABLE_NAME); - AssertHelpers.assertThrows("The NaN is not supported by flink now. ", - NumberFormatException.class, () -> sql(sqlParseErrorNotEqual)); - - String sqlParseErrorGT = String.format("SELECT * FROM %s WHERE d > CAST('NaN' AS DOUBLE) ", TABLE_NAME); - AssertHelpers.assertThrows("The NaN is not supported by flink now. ", - NumberFormatException.class, () -> sql(sqlParseErrorGT)); - - String sqlParseErrorLT = String.format("SELECT * FROM %s WHERE d < CAST('NaN' AS DOUBLE) ", TABLE_NAME); - AssertHelpers.assertThrows("The NaN is not supported by flink now. ", - NumberFormatException.class, () -> sql(sqlParseErrorLT)); - - String sqlParseErrorGTE = String.format("SELECT * FROM %s WHERE d >= CAST('NaN' AS DOUBLE) ", TABLE_NAME); - AssertHelpers.assertThrows("The NaN is not supported by flink now. ", - NumberFormatException.class, () -> sql(sqlParseErrorGTE)); - - String sqlParseErrorLTE = String.format("SELECT * FROM %s WHERE d <= CAST('NaN' AS DOUBLE) ", TABLE_NAME); - AssertHelpers.assertThrows("The NaN is not supported by flink now. ", - NumberFormatException.class, () -> sql(sqlParseErrorLTE)); + String sqlParseErrorEqual = + String.format("SELECT * FROM %s WHERE d = CAST('NaN' AS DOUBLE) ", TABLE_NAME); + AssertHelpers.assertThrows( + "The NaN is not supported by flink now. ", + NumberFormatException.class, + () -> sql(sqlParseErrorEqual)); + + String sqlParseErrorNotEqual = + String.format("SELECT * FROM %s WHERE d <> CAST('NaN' AS DOUBLE) ", TABLE_NAME); + AssertHelpers.assertThrows( + "The NaN is not supported by flink now. ", + NumberFormatException.class, + () -> sql(sqlParseErrorNotEqual)); + + String sqlParseErrorGT = + String.format("SELECT * FROM %s WHERE d > CAST('NaN' AS DOUBLE) ", TABLE_NAME); + AssertHelpers.assertThrows( + "The NaN is not supported by flink now. ", + NumberFormatException.class, + () -> sql(sqlParseErrorGT)); + + String sqlParseErrorLT = + String.format("SELECT * FROM %s WHERE d < CAST('NaN' AS DOUBLE) ", TABLE_NAME); + AssertHelpers.assertThrows( + "The NaN is not supported by flink now. ", + NumberFormatException.class, + () -> sql(sqlParseErrorLT)); + + String sqlParseErrorGTE = + String.format("SELECT * FROM %s WHERE d >= CAST('NaN' AS DOUBLE) ", TABLE_NAME); + AssertHelpers.assertThrows( + "The NaN is not supported by flink now. ", + NumberFormatException.class, + () -> sql(sqlParseErrorGTE)); + + String sqlParseErrorLTE = + String.format("SELECT * FROM %s WHERE d <= CAST('NaN' AS DOUBLE) ", TABLE_NAME); + AssertHelpers.assertThrows( + "The NaN is not supported by flink now. 
", + NumberFormatException.class, + () -> sql(sqlParseErrorLTE)); } } diff --git a/flink/v1.15/flink/src/test/java/org/apache/iceberg/flink/TestFlinkUpsert.java b/flink/v1.15/flink/src/test/java/org/apache/iceberg/flink/TestFlinkUpsert.java index f0afac17a9e7..d4b93bc9d4a2 100644 --- a/flink/v1.15/flink/src/test/java/org/apache/iceberg/flink/TestFlinkUpsert.java +++ b/flink/v1.15/flink/src/test/java/org/apache/iceberg/flink/TestFlinkUpsert.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.flink; import java.time.LocalDate; @@ -49,14 +48,14 @@ public class TestFlinkUpsert extends FlinkCatalogTestBase { public static final MiniClusterWithClientResource MINI_CLUSTER_RESOURCE = MiniClusterResource.createWithClassloaderCheckDisabled(); - @ClassRule - public static final TemporaryFolder TEMPORARY_FOLDER = new TemporaryFolder(); + @ClassRule public static final TemporaryFolder TEMPORARY_FOLDER = new TemporaryFolder(); private final boolean isStreamingJob; private final Map tableUpsertProps = Maps.newHashMap(); private TableEnvironment tEnv; - public TestFlinkUpsert(String catalogName, Namespace baseNamespace, FileFormat format, Boolean isStreamingJob) { + public TestFlinkUpsert( + String catalogName, Namespace baseNamespace, FileFormat format, Boolean isStreamingJob) { super(catalogName, baseNamespace); this.isStreamingJob = isStreamingJob; tableUpsertProps.put(TableProperties.FORMAT_VERSION, "2"); @@ -64,13 +63,16 @@ public TestFlinkUpsert(String catalogName, Namespace baseNamespace, FileFormat f tableUpsertProps.put(TableProperties.DEFAULT_FILE_FORMAT, format.name()); } - @Parameterized.Parameters(name = "catalogName={0}, baseNamespace={1}, format={2}, isStreaming={3}") + @Parameterized.Parameters( + name = "catalogName={0}, baseNamespace={1}, format={2}, isStreaming={3}") public static Iterable parameters() { List parameters = Lists.newArrayList(); - for (FileFormat format : new FileFormat[] {FileFormat.PARQUET, FileFormat.AVRO, FileFormat.ORC}) { + for (FileFormat format : + new FileFormat[] {FileFormat.PARQUET, FileFormat.AVRO, FileFormat.ORC}) { for (Boolean isStreaming : new Boolean[] {true, false}) { // Only test with one catalog as this is a file operation concern. - // FlinkCatalogTestBase requires the catalog name start with testhadoop if using hadoop catalog. + // FlinkCatalogTestBase requires the catalog name start with testhadoop if using hadoop + // catalog. 
String catalogName = "testhadoop"; Namespace baseNamespace = Namespace.of("default"); parameters.add(new Object[] {catalogName, baseNamespace, format, isStreaming}); @@ -83,12 +85,12 @@ public static Iterable parameters() { protected TableEnvironment getTableEnv() { if (tEnv == null) { synchronized (this) { - EnvironmentSettings.Builder settingsBuilder = EnvironmentSettings - .newInstance(); + EnvironmentSettings.Builder settingsBuilder = EnvironmentSettings.newInstance(); if (isStreamingJob) { settingsBuilder.inStreamingMode(); - StreamExecutionEnvironment env = StreamExecutionEnvironment - .getExecutionEnvironment(MiniClusterResource.DISABLE_CLASSLOADER_CHECK_CONFIG); + StreamExecutionEnvironment env = + StreamExecutionEnvironment.getExecutionEnvironment( + MiniClusterResource.DISABLE_CLASSLOADER_CHECK_CONFIG); env.enableCheckpointing(400); env.setMaxParallelism(2); env.setParallelism(2); @@ -124,33 +126,36 @@ public void testUpsertAndQuery() { LocalDate dt20220301 = LocalDate.of(2022, 3, 1); LocalDate dt20220302 = LocalDate.of(2022, 3, 2); - sql("CREATE TABLE %s(id INT NOT NULL, province STRING NOT NULL, dt DATE, PRIMARY KEY(id,province) NOT ENFORCED) " + - "PARTITIONED BY (province) WITH %s", + sql( + "CREATE TABLE %s(id INT NOT NULL, province STRING NOT NULL, dt DATE, PRIMARY KEY(id,province) NOT ENFORCED) " + + "PARTITIONED BY (province) WITH %s", tableName, toWithClause(tableUpsertProps)); try { - sql("INSERT INTO %s VALUES " + - "(1, 'a', DATE '2022-03-01')," + - "(2, 'b', DATE '2022-03-01')," + - "(1, 'b', DATE '2022-03-01')", + sql( + "INSERT INTO %s VALUES " + + "(1, 'a', DATE '2022-03-01')," + + "(2, 'b', DATE '2022-03-01')," + + "(1, 'b', DATE '2022-03-01')", tableName); - sql("INSERT INTO %s VALUES " + - "(4, 'a', DATE '2022-03-02')," + - "(5, 'b', DATE '2022-03-02')," + - "(1, 'b', DATE '2022-03-02')", + sql( + "INSERT INTO %s VALUES " + + "(4, 'a', DATE '2022-03-02')," + + "(5, 'b', DATE '2022-03-02')," + + "(1, 'b', DATE '2022-03-02')", tableName); - List rowsOn20220301 = Lists.newArrayList(Row.of(2, "b", dt20220301), Row.of(1, "a", dt20220301)); + List rowsOn20220301 = + Lists.newArrayList(Row.of(2, "b", dt20220301), Row.of(1, "a", dt20220301)); TestHelpers.assertRows( - sql("SELECT * FROM %s WHERE dt < '2022-03-02'", tableName), - rowsOn20220301); + sql("SELECT * FROM %s WHERE dt < '2022-03-02'", tableName), rowsOn20220301); - List rowsOn20220302 = Lists.newArrayList( - Row.of(1, "b", dt20220302), Row.of(4, "a", dt20220302), Row.of(5, "b", dt20220302)); + List rowsOn20220302 = + Lists.newArrayList( + Row.of(1, "b", dt20220302), Row.of(4, "a", dt20220302), Row.of(5, "b", dt20220302)); TestHelpers.assertRows( - sql("SELECT * FROM %s WHERE dt = '2022-03-02'", tableName), - rowsOn20220302); + sql("SELECT * FROM %s WHERE dt = '2022-03-02'", tableName), rowsOn20220302); TestHelpers.assertRows( sql("SELECT * FROM %s", tableName), @@ -165,33 +170,24 @@ public void testPrimaryKeyEqualToPartitionKey() { // This is an SQL based reproduction of TestFlinkIcebergSinkV2#testUpsertOnDataKey String tableName = "upsert_on_data_key"; try { - sql("CREATE TABLE %s(id INT NOT NULL, data STRING NOT NULL, PRIMARY KEY(data) NOT ENFORCED) " + - "PARTITIONED BY (data) WITH %s", + sql( + "CREATE TABLE %s(id INT NOT NULL, data STRING NOT NULL, PRIMARY KEY(data) NOT ENFORCED) " + + "PARTITIONED BY (data) WITH %s", tableName, toWithClause(tableUpsertProps)); - sql("INSERT INTO %s VALUES " + - "(1, 'aaa')," + - "(2, 'aaa')," + - "(3, 'bbb')", - tableName); + sql("INSERT INTO %s VALUES " + "(1, 'aaa')," 
+ "(2, 'aaa')," + "(3, 'bbb')", tableName); TestHelpers.assertRows( sql("SELECT * FROM %s", tableName), Lists.newArrayList(Row.of(2, "aaa"), Row.of(3, "bbb"))); - sql("INSERT INTO %s VALUES " + - "(4, 'aaa')," + - "(5, 'bbb')", - tableName); + sql("INSERT INTO %s VALUES " + "(4, 'aaa')," + "(5, 'bbb')", tableName); TestHelpers.assertRows( sql("SELECT * FROM %s", tableName), Lists.newArrayList(Row.of(4, "aaa"), Row.of(5, "bbb"))); - sql("INSERT INTO %s VALUES " + - "(6, 'aaa')," + - "(7, 'bbb')", - tableName); + sql("INSERT INTO %s VALUES " + "(6, 'aaa')," + "(7, 'bbb')", tableName); TestHelpers.assertRows( sql("SELECT * FROM %s", tableName), @@ -206,32 +202,36 @@ public void testPrimaryKeyFieldsAtBeginningOfSchema() { String tableName = "upsert_on_pk_at_schema_start"; LocalDate dt = LocalDate.of(2022, 3, 1); try { - sql("CREATE TABLE %s(data STRING NOT NULL, dt DATE NOT NULL, id INT, PRIMARY KEY(data,dt) NOT ENFORCED) " + - "PARTITIONED BY (data) WITH %s", + sql( + "CREATE TABLE %s(data STRING NOT NULL, dt DATE NOT NULL, id INT, PRIMARY KEY(data,dt) NOT ENFORCED) " + + "PARTITIONED BY (data) WITH %s", tableName, toWithClause(tableUpsertProps)); - sql("INSERT INTO %s VALUES " + - "('aaa', DATE '2022-03-01', 1)," + - "('aaa', DATE '2022-03-01', 2)," + - "('bbb', DATE '2022-03-01', 3)", + sql( + "INSERT INTO %s VALUES " + + "('aaa', DATE '2022-03-01', 1)," + + "('aaa', DATE '2022-03-01', 2)," + + "('bbb', DATE '2022-03-01', 3)", tableName); TestHelpers.assertRows( sql("SELECT * FROM %s", tableName), Lists.newArrayList(Row.of("aaa", dt, 2), Row.of("bbb", dt, 3))); - sql("INSERT INTO %s VALUES " + - "('aaa', DATE '2022-03-01', 4)," + - "('bbb', DATE '2022-03-01', 5)", + sql( + "INSERT INTO %s VALUES " + + "('aaa', DATE '2022-03-01', 4)," + + "('bbb', DATE '2022-03-01', 5)", tableName); TestHelpers.assertRows( sql("SELECT * FROM %s", tableName), Lists.newArrayList(Row.of("aaa", dt, 4), Row.of("bbb", dt, 5))); - sql("INSERT INTO %s VALUES " + - "('aaa', DATE '2022-03-01', 6)," + - "('bbb', DATE '2022-03-01', 7)", + sql( + "INSERT INTO %s VALUES " + + "('aaa', DATE '2022-03-01', 6)," + + "('bbb', DATE '2022-03-01', 7)", tableName); TestHelpers.assertRows( @@ -244,37 +244,42 @@ public void testPrimaryKeyFieldsAtBeginningOfSchema() { @Test public void testPrimaryKeyFieldsAtEndOfTableSchema() { - // This is the same test case as testPrimaryKeyFieldsAtBeginningOfSchema, but the primary key fields + // This is the same test case as testPrimaryKeyFieldsAtBeginningOfSchema, but the primary key + // fields // are located at the end of the flink schema. 
String tableName = "upsert_on_pk_at_schema_end"; LocalDate dt = LocalDate.of(2022, 3, 1); try { - sql("CREATE TABLE %s(id INT, data STRING NOT NULL, dt DATE NOT NULL, PRIMARY KEY(data,dt) NOT ENFORCED) " + - "PARTITIONED BY (data) WITH %s", + sql( + "CREATE TABLE %s(id INT, data STRING NOT NULL, dt DATE NOT NULL, PRIMARY KEY(data,dt) NOT ENFORCED) " + + "PARTITIONED BY (data) WITH %s", tableName, toWithClause(tableUpsertProps)); - sql("INSERT INTO %s VALUES " + - "(1, 'aaa', DATE '2022-03-01')," + - "(2, 'aaa', DATE '2022-03-01')," + - "(3, 'bbb', DATE '2022-03-01')", + sql( + "INSERT INTO %s VALUES " + + "(1, 'aaa', DATE '2022-03-01')," + + "(2, 'aaa', DATE '2022-03-01')," + + "(3, 'bbb', DATE '2022-03-01')", tableName); TestHelpers.assertRows( sql("SELECT * FROM %s", tableName), Lists.newArrayList(Row.of(2, "aaa", dt), Row.of(3, "bbb", dt))); - sql("INSERT INTO %s VALUES " + - "(4, 'aaa', DATE '2022-03-01')," + - "(5, 'bbb', DATE '2022-03-01')", + sql( + "INSERT INTO %s VALUES " + + "(4, 'aaa', DATE '2022-03-01')," + + "(5, 'bbb', DATE '2022-03-01')", tableName); TestHelpers.assertRows( sql("SELECT * FROM %s", tableName), Lists.newArrayList(Row.of(4, "aaa", dt), Row.of(5, "bbb", dt))); - sql("INSERT INTO %s VALUES " + - "(6, 'aaa', DATE '2022-03-01')," + - "(7, 'bbb', DATE '2022-03-01')", + sql( + "INSERT INTO %s VALUES " + + "(6, 'aaa', DATE '2022-03-01')," + + "(7, 'bbb', DATE '2022-03-01')", tableName); TestHelpers.assertRows( diff --git a/flink/v1.15/flink/src/test/java/org/apache/iceberg/flink/TestHelpers.java b/flink/v1.15/flink/src/test/java/org/apache/iceberg/flink/TestHelpers.java index 51c981ee531b..e840ba842bef 100644 --- a/flink/v1.15/flink/src/test/java/org/apache/iceberg/flink/TestHelpers.java +++ b/flink/v1.15/flink/src/test/java/org/apache/iceberg/flink/TestHelpers.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.flink; import java.io.IOException; @@ -67,8 +66,7 @@ import org.junit.Assert; public class TestHelpers { - private TestHelpers() { - } + private TestHelpers() {} public static T roundTripKryoSerialize(Class clazz, T table) throws IOException { KryoSerializer kryo = new KryoSerializer<>(clazz, new ExecutionConfig()); @@ -81,13 +79,15 @@ public static T roundTripKryoSerialize(Class clazz, T table) throws IOExc } public static RowData copyRowData(RowData from, RowType rowType) { - TypeSerializer[] fieldSerializers = rowType.getChildren().stream() - .map((LogicalType type) -> InternalSerializers.create(type)) - .toArray(TypeSerializer[]::new); + TypeSerializer[] fieldSerializers = + rowType.getChildren().stream() + .map((LogicalType type) -> InternalSerializers.create(type)) + .toArray(TypeSerializer[]::new); return RowDataUtil.clone(from, null, rowType, fieldSerializers); } - public static void readRowData(FlinkInputFormat input, Consumer visitor) throws IOException { + public static void readRowData(FlinkInputFormat input, Consumer visitor) + throws IOException { for (FlinkInputSplit s : input.createInputSplits(0)) { input.open(s); try { @@ -101,19 +101,21 @@ public static void readRowData(FlinkInputFormat input, Consumer visitor } } - public static List readRowData(FlinkInputFormat inputFormat, RowType rowType) throws IOException { + public static List readRowData(FlinkInputFormat inputFormat, RowType rowType) + throws IOException { List results = Lists.newArrayList(); readRowData(inputFormat, row -> results.add(copyRowData(row, rowType))); return results; } - public static List readRows(FlinkInputFormat inputFormat, RowType rowType) throws IOException { + public static List readRows(FlinkInputFormat inputFormat, RowType rowType) + throws IOException { return convertRowDataToRow(readRowData(inputFormat, rowType), rowType); } public static List convertRowDataToRow(List rowDataList, RowType rowType) { - DataStructureConverter converter = DataStructureConverters.getConverter( - TypeConversions.fromLogicalToDataType(rowType)); + DataStructureConverter converter = + DataStructureConverters.getConverter(TypeConversions.fromLogicalToDataType(rowType)); return rowDataList.stream() .map(converter::toExternal) .map(Row.class::cast) @@ -123,9 +125,12 @@ public static List convertRowDataToRow(List rowDataList, RowType r public static void assertRecords(List results, List expectedRecords, Schema schema) { List expected = Lists.newArrayList(); @SuppressWarnings("unchecked") - DataStructureConverter converter = (DataStructureConverter) DataStructureConverters.getConverter( - TypeConversions.fromLogicalToDataType(FlinkSchemaUtil.convert(schema))); - expectedRecords.forEach(r -> expected.add(converter.toExternal(RowDataConverter.convert(schema, r)))); + DataStructureConverter converter = + (DataStructureConverter) + DataStructureConverters.getConverter( + TypeConversions.fromLogicalToDataType(FlinkSchemaUtil.convert(schema))); + expectedRecords.forEach( + r -> expected.add(converter.toExternal(RowDataConverter.convert(schema, r)))); assertRows(results, expected); } @@ -141,13 +146,17 @@ public static void assertRowData(Schema schema, StructLike expected, RowData act assertRowData(schema.asStruct(), FlinkSchemaUtil.convert(schema), expected, actual); } - public static void assertRowData(Types.StructType structType, LogicalType rowType, StructLike expectedRecord, - RowData actualRowData) { + public static void assertRowData( + Types.StructType structType, + LogicalType rowType, 
+ StructLike expectedRecord, + RowData actualRowData) { if (expectedRecord == null && actualRowData == null) { return; } - Assert.assertTrue("expected Record and actual RowData should be both null or not null", + Assert.assertTrue( + "expected Record and actual RowData should be both null or not null", expectedRecord != null && actualRowData != null); List types = Lists.newArrayList(); @@ -158,24 +167,30 @@ public static void assertRowData(Types.StructType structType, LogicalType rowTyp for (int i = 0; i < types.size(); i += 1) { LogicalType logicalType = ((RowType) rowType).getTypeAt(i); Object expected = expectedRecord.get(i, Object.class); - // The RowData.createFieldGetter won't return null for the required field. But in the projection case, if we are - // projecting a nested required field from an optional struct, then we should give a null for the projected field - // if the outer struct value is null. So we need to check the nullable for actualRowData here. For more details + // The RowData.createFieldGetter won't return null for the required field. But in the + // projection case, if we are + // projecting a nested required field from an optional struct, then we should give a null for + // the projected field + // if the outer struct value is null. So we need to check the nullable for actualRowData here. + // For more details // please see issue #2738. - Object actual = actualRowData.isNullAt(i) ? null : - RowData.createFieldGetter(logicalType, i).getFieldOrNull(actualRowData); + Object actual = + actualRowData.isNullAt(i) + ? null + : RowData.createFieldGetter(logicalType, i).getFieldOrNull(actualRowData); assertEquals(types.get(i), logicalType, expected, actual); } } - private static void assertEquals(Type type, LogicalType logicalType, Object expected, Object actual) { + private static void assertEquals( + Type type, LogicalType logicalType, Object expected, Object actual) { if (expected == null && actual == null) { return; } - Assert.assertTrue("expected and actual should be both null or not null", - expected != null && actual != null); + Assert.assertTrue( + "expected and actual should be both null or not null", expected != null && actual != null); switch (type.typeId()) { case BOOLEAN: @@ -194,7 +209,9 @@ private static void assertEquals(Type type, LogicalType logicalType, Object expe Assert.assertEquals("double value should be equal", expected, actual); break; case STRING: - Assertions.assertThat(expected).as("Should expect a CharSequence").isInstanceOf(CharSequence.class); + Assertions.assertThat(expected) + .as("Should expect a CharSequence") + .isInstanceOf(CharSequence.class); Assert.assertEquals("string should be equal", String.valueOf(expected), actual.toString()); break; case DATE: @@ -203,40 +220,56 @@ private static void assertEquals(Type type, LogicalType logicalType, Object expe Assert.assertEquals("date should be equal", expected, date); break; case TIME: - Assertions.assertThat(expected).as("Should expect a LocalTime").isInstanceOf(LocalTime.class); + Assertions.assertThat(expected) + .as("Should expect a LocalTime") + .isInstanceOf(LocalTime.class); int milliseconds = (int) (((LocalTime) expected).toNanoOfDay() / 1000_000); Assert.assertEquals("time millis should be equal", milliseconds, actual); break; case TIMESTAMP: if (((Types.TimestampType) type).shouldAdjustToUTC()) { - Assertions.assertThat(expected).as("Should expect a OffsetDataTime").isInstanceOf(OffsetDateTime.class); + Assertions.assertThat(expected) + .as("Should expect a OffsetDataTime") + 
.isInstanceOf(OffsetDateTime.class); OffsetDateTime ts = (OffsetDateTime) expected; - Assert.assertEquals("OffsetDataTime should be equal", ts.toLocalDateTime(), + Assert.assertEquals( + "OffsetDataTime should be equal", + ts.toLocalDateTime(), ((TimestampData) actual).toLocalDateTime()); } else { - Assertions.assertThat(expected).as("Should expect a LocalDataTime").isInstanceOf(LocalDateTime.class); + Assertions.assertThat(expected) + .as("Should expect a LocalDataTime") + .isInstanceOf(LocalDateTime.class); LocalDateTime ts = (LocalDateTime) expected; - Assert.assertEquals("LocalDataTime should be equal", ts, - ((TimestampData) actual).toLocalDateTime()); + Assert.assertEquals( + "LocalDataTime should be equal", ts, ((TimestampData) actual).toLocalDateTime()); } break; case BINARY: - Assertions.assertThat(expected).as("Should expect a ByteBuffer").isInstanceOf(ByteBuffer.class); + Assertions.assertThat(expected) + .as("Should expect a ByteBuffer") + .isInstanceOf(ByteBuffer.class); Assert.assertEquals("binary should be equal", expected, ByteBuffer.wrap((byte[]) actual)); break; case DECIMAL: - Assertions.assertThat(expected).as("Should expect a BigDecimal").isInstanceOf(BigDecimal.class); + Assertions.assertThat(expected) + .as("Should expect a BigDecimal") + .isInstanceOf(BigDecimal.class); BigDecimal bd = (BigDecimal) expected; - Assert.assertEquals("decimal value should be equal", bd, - ((DecimalData) actual).toBigDecimal()); + Assert.assertEquals( + "decimal value should be equal", bd, ((DecimalData) actual).toBigDecimal()); break; case LIST: - Assertions.assertThat(expected).as("Should expect a Collection").isInstanceOf(Collection.class); + Assertions.assertThat(expected) + .as("Should expect a Collection") + .isInstanceOf(Collection.class); Collection expectedArrayData = (Collection) expected; ArrayData actualArrayData = (ArrayData) actual; LogicalType elementType = ((ArrayType) logicalType).getElementType(); - Assert.assertEquals("array length should be equal", expectedArrayData.size(), actualArrayData.size()); - assertArrayValues(type.asListType().elementType(), elementType, expectedArrayData, actualArrayData); + Assert.assertEquals( + "array length should be equal", expectedArrayData.size(), actualArrayData.size()); + assertArrayValues( + type.asListType().elementType(), elementType, expectedArrayData, actualArrayData); break; case MAP: Assertions.assertThat(expected).as("Should expect a Map").isInstanceOf(Map.class); @@ -248,7 +281,9 @@ private static void assertEquals(Type type, LogicalType logicalType, Object expe break; case UUID: Assertions.assertThat(expected).as("Should expect a UUID").isInstanceOf(UUID.class); - Assert.assertEquals("UUID should be equal", expected.toString(), + Assert.assertEquals( + "UUID should be equal", + expected.toString(), UUID.nameUUIDFromBytes((byte[]) actual).toString()); break; case FIXED: @@ -260,8 +295,8 @@ private static void assertEquals(Type type, LogicalType logicalType, Object expe } } - private static void assertArrayValues(Type type, LogicalType logicalType, Collection expectedArray, - ArrayData actualArray) { + private static void assertArrayValues( + Type type, LogicalType logicalType, Collection expectedArray, ArrayData actualArray) { List expectedElements = Lists.newArrayList(expectedArray); for (int i = 0; i < expectedArray.size(); i += 1) { if (expectedElements.get(i) == null) { @@ -271,12 +306,16 @@ private static void assertArrayValues(Type type, LogicalType logicalType, Collec Object expected = expectedElements.get(i); - 
assertEquals(type, logicalType, expected, + assertEquals( + type, + logicalType, + expected, ArrayData.createElementGetter(logicalType).getElementOrNull(actualArray, i)); } } - private static void assertMapValues(Types.MapType mapType, LogicalType type, Map expected, MapData actual) { + private static void assertMapValues( + Types.MapType mapType, LogicalType type, Map expected, MapData actual) { Assert.assertEquals("map size should be equal", expected.size(), actual.size()); ArrayData actualKeyArrayData = actual.keyArray(); @@ -305,7 +344,10 @@ private static void assertMapValues(Types.MapType mapType, LogicalType type, Map } Assert.assertNotNull("Should have a matching key", matchedActualKey); final int valueIndex = matchedKeyIndex; - assertEquals(valueType, actualValueType, entry.getValue(), + assertEquals( + valueType, + actualValueType, + entry.getValue(), valueGetter.getElementOrNull(actualValueArrayData, valueIndex)); } } @@ -319,31 +361,55 @@ public static void assertEquals(ManifestFile expected, ManifestFile actual) { Assert.assertEquals("Length must match", expected.length(), actual.length()); Assert.assertEquals("Spec id must match", expected.partitionSpecId(), actual.partitionSpecId()); Assert.assertEquals("ManifestContent must match", expected.content(), actual.content()); - Assert.assertEquals("SequenceNumber must match", expected.sequenceNumber(), actual.sequenceNumber()); - Assert.assertEquals("MinSequenceNumber must match", expected.minSequenceNumber(), actual.minSequenceNumber()); + Assert.assertEquals( + "SequenceNumber must match", expected.sequenceNumber(), actual.sequenceNumber()); + Assert.assertEquals( + "MinSequenceNumber must match", expected.minSequenceNumber(), actual.minSequenceNumber()); Assert.assertEquals("Snapshot id must match", expected.snapshotId(), actual.snapshotId()); - Assert.assertEquals("Added files flag must match", expected.hasAddedFiles(), actual.hasAddedFiles()); - Assert.assertEquals("Added files count must match", expected.addedFilesCount(), actual.addedFilesCount()); - Assert.assertEquals("Added rows count must match", expected.addedRowsCount(), actual.addedRowsCount()); - Assert.assertEquals("Existing files flag must match", expected.hasExistingFiles(), actual.hasExistingFiles()); - Assert.assertEquals("Existing files count must match", expected.existingFilesCount(), actual.existingFilesCount()); - Assert.assertEquals("Existing rows count must match", expected.existingRowsCount(), actual.existingRowsCount()); - Assert.assertEquals("Deleted files flag must match", expected.hasDeletedFiles(), actual.hasDeletedFiles()); - Assert.assertEquals("Deleted files count must match", expected.deletedFilesCount(), actual.deletedFilesCount()); - Assert.assertEquals("Deleted rows count must match", expected.deletedRowsCount(), actual.deletedRowsCount()); + Assert.assertEquals( + "Added files flag must match", expected.hasAddedFiles(), actual.hasAddedFiles()); + Assert.assertEquals( + "Added files count must match", expected.addedFilesCount(), actual.addedFilesCount()); + Assert.assertEquals( + "Added rows count must match", expected.addedRowsCount(), actual.addedRowsCount()); + Assert.assertEquals( + "Existing files flag must match", expected.hasExistingFiles(), actual.hasExistingFiles()); + Assert.assertEquals( + "Existing files count must match", + expected.existingFilesCount(), + actual.existingFilesCount()); + Assert.assertEquals( + "Existing rows count must match", expected.existingRowsCount(), actual.existingRowsCount()); + Assert.assertEquals( + 
"Deleted files flag must match", expected.hasDeletedFiles(), actual.hasDeletedFiles()); + Assert.assertEquals( + "Deleted files count must match", expected.deletedFilesCount(), actual.deletedFilesCount()); + Assert.assertEquals( + "Deleted rows count must match", expected.deletedRowsCount(), actual.deletedRowsCount()); List expectedSummaries = expected.partitions(); List actualSummaries = actual.partitions(); - Assert.assertEquals("PartitionFieldSummary size does not match", expectedSummaries.size(), actualSummaries.size()); + Assert.assertEquals( + "PartitionFieldSummary size does not match", + expectedSummaries.size(), + actualSummaries.size()); for (int i = 0; i < expectedSummaries.size(); i++) { - Assert.assertEquals("Null flag in partition must match", - expectedSummaries.get(i).containsNull(), actualSummaries.get(i).containsNull()); - Assert.assertEquals("NaN flag in partition must match", - expectedSummaries.get(i).containsNaN(), actualSummaries.get(i).containsNaN()); - Assert.assertEquals("Lower bounds in partition must match", - expectedSummaries.get(i).lowerBound(), actualSummaries.get(i).lowerBound()); - Assert.assertEquals("Upper bounds in partition must match", - expectedSummaries.get(i).upperBound(), actualSummaries.get(i).upperBound()); + Assert.assertEquals( + "Null flag in partition must match", + expectedSummaries.get(i).containsNull(), + actualSummaries.get(i).containsNull()); + Assert.assertEquals( + "NaN flag in partition must match", + expectedSummaries.get(i).containsNaN(), + actualSummaries.get(i).containsNaN()); + Assert.assertEquals( + "Lower bounds in partition must match", + expectedSummaries.get(i).lowerBound(), + actualSummaries.get(i).lowerBound()); + Assert.assertEquals( + "Upper bounds in partition must match", + expectedSummaries.get(i).upperBound(), + actualSummaries.get(i).upperBound()); } } @@ -358,7 +424,8 @@ public static void assertEquals(ContentFile expected, ContentFile actual) Assert.assertEquals("Format", expected.format(), actual.format()); Assert.assertEquals("Partition size", expected.partition().size(), actual.partition().size()); for (int i = 0; i < expected.partition().size(); i++) { - Assert.assertEquals("Partition data at index " + i, + Assert.assertEquals( + "Partition data at index " + i, expected.partition().get(i, Object.class), actual.partition().get(i, Object.class)); } @@ -371,6 +438,7 @@ public static void assertEquals(ContentFile expected, ContentFile actual) Assert.assertEquals("Upper bounds", expected.upperBounds(), actual.upperBounds()); Assert.assertEquals("Key metadata", expected.keyMetadata(), actual.keyMetadata()); Assert.assertEquals("Split offsets", expected.splitOffsets(), actual.splitOffsets()); - Assert.assertEquals("Equality field id list", actual.equalityFieldIds(), expected.equalityFieldIds()); + Assert.assertEquals( + "Equality field id list", actual.equalityFieldIds(), expected.equalityFieldIds()); } } diff --git a/flink/v1.15/flink/src/test/java/org/apache/iceberg/flink/TestIcebergConnector.java b/flink/v1.15/flink/src/test/java/org/apache/iceberg/flink/TestIcebergConnector.java index 26bff05a632a..a12fe2507fd5 100644 --- a/flink/v1.15/flink/src/test/java/org/apache/iceberg/flink/TestIcebergConnector.java +++ b/flink/v1.15/flink/src/test/java/org/apache/iceberg/flink/TestIcebergConnector.java @@ -16,8 +16,6 @@ * specific language governing permissions and limitations * under the License. 
*/ - - package org.apache.iceberg.flink; import java.io.IOException; @@ -53,8 +51,7 @@ public class TestIcebergConnector extends FlinkTestBase { private static final String TABLE_NAME = "test_table"; - @ClassRule - public static final TemporaryFolder WAREHOUSE = new TemporaryFolder(); + @ClassRule public static final TemporaryFolder WAREHOUSE = new TemporaryFolder(); private final String catalogName; private final Map properties; @@ -66,118 +63,106 @@ public static Iterable parameters() { return Lists.newArrayList( // Create iceberg table in the hadoop catalog and default database. new Object[] { - "testhadoop", - ImmutableMap.of( - "connector", "iceberg", - "catalog-type", "hadoop" - ), - true + "testhadoop", + ImmutableMap.of( + "connector", "iceberg", + "catalog-type", "hadoop"), + true }, new Object[] { - "testhadoop", - ImmutableMap.of( - "connector", "iceberg", - "catalog-type", "hadoop", - "catalog-table", "not_existing_table" - ), - true + "testhadoop", + ImmutableMap.of( + "connector", "iceberg", + "catalog-type", "hadoop", + "catalog-table", "not_existing_table"), + true }, new Object[] { - "testhadoop", - ImmutableMap.of( - "connector", "iceberg", - "catalog-type", "hadoop" - ), - false + "testhadoop", + ImmutableMap.of( + "connector", "iceberg", + "catalog-type", "hadoop"), + false }, // Create iceberg table in the hadoop catalog and not_existing_db. new Object[] { - "testhadoop", - ImmutableMap.of( - "connector", "iceberg", - "catalog-type", "hadoop", - "catalog-database", "not_existing_db" - ), - true + "testhadoop", + ImmutableMap.of( + "connector", "iceberg", + "catalog-type", "hadoop", + "catalog-database", "not_existing_db"), + true }, new Object[] { - "testhadoop", - ImmutableMap.of( - "connector", "iceberg", - "catalog-type", "hadoop", - "catalog-database", "not_existing_db", - "catalog-table", "not_existing_table" - ), - true + "testhadoop", + ImmutableMap.of( + "connector", "iceberg", + "catalog-type", "hadoop", + "catalog-database", "not_existing_db", + "catalog-table", "not_existing_table"), + true }, new Object[] { - "testhadoop", - ImmutableMap.of( - "connector", "iceberg", - "catalog-type", "hadoop", - "catalog-database", "not_existing_db" - ), - false + "testhadoop", + ImmutableMap.of( + "connector", "iceberg", + "catalog-type", "hadoop", + "catalog-database", "not_existing_db"), + false }, // Create iceberg table in the hive catalog and default database. new Object[] { - "testhive", - ImmutableMap.of( - "connector", "iceberg", - "catalog-type", "hive" - ), - true + "testhive", + ImmutableMap.of( + "connector", "iceberg", + "catalog-type", "hive"), + true }, new Object[] { - "testhive", - ImmutableMap.of( - "connector", "iceberg", - "catalog-type", "hive", - "catalog-table", "not_existing_table" - ), - true + "testhive", + ImmutableMap.of( + "connector", "iceberg", + "catalog-type", "hive", + "catalog-table", "not_existing_table"), + true }, new Object[] { - "testhive", - ImmutableMap.of( - "connector", "iceberg", - "catalog-type", "hive" - ), - false + "testhive", + ImmutableMap.of( + "connector", "iceberg", + "catalog-type", "hive"), + false }, // Create iceberg table in the hive catalog and not_existing_db. 
new Object[] { - "testhive", - ImmutableMap.of( - "connector", "iceberg", - "catalog-type", "hive", - "catalog-database", "not_existing_db" - ), - true + "testhive", + ImmutableMap.of( + "connector", "iceberg", + "catalog-type", "hive", + "catalog-database", "not_existing_db"), + true }, new Object[] { - "testhive", - ImmutableMap.of( - "connector", "iceberg", - "catalog-type", "hive", - "catalog-database", "not_existing_db", - "catalog-table", "not_existing_table" - ), - true + "testhive", + ImmutableMap.of( + "connector", "iceberg", + "catalog-type", "hive", + "catalog-database", "not_existing_db", + "catalog-table", "not_existing_table"), + true }, new Object[] { - "testhive", - ImmutableMap.of( - "connector", "iceberg", - "catalog-type", "hive", - "catalog-database", "not_existing_db" - ), - false - } - ); + "testhive", + ImmutableMap.of( + "connector", "iceberg", + "catalog-type", "hive", + "catalog-database", "not_existing_db"), + false + }); } - public TestIcebergConnector(String catalogName, Map properties, boolean isStreaming) { + public TestIcebergConnector( + String catalogName, Map properties, boolean isStreaming) { this.catalogName = catalogName; this.properties = properties; this.isStreaming = isStreaming; @@ -191,8 +176,9 @@ protected TableEnvironment getTableEnv() { EnvironmentSettings.Builder settingsBuilder = EnvironmentSettings.newInstance(); if (isStreaming) { settingsBuilder.inStreamingMode(); - StreamExecutionEnvironment env = StreamExecutionEnvironment - .getExecutionEnvironment(MiniClusterResource.DISABLE_CLASSLOADER_CHECK_CONFIG); + StreamExecutionEnvironment env = + StreamExecutionEnvironment.getExecutionEnvironment( + MiniClusterResource.DISABLE_CLASSLOADER_CHECK_CONFIG); env.enableCheckpointing(400); env.setMaxParallelism(2); env.setParallelism(2); @@ -202,7 +188,8 @@ protected TableEnvironment getTableEnv() { tEnv = TableEnvironment.create(settingsBuilder.build()); } // Set only one parallelism. - tEnv.getConfig().getConfiguration() + tEnv.getConfig() + .getConfiguration() .set(CoreOptions.DEFAULT_PARALLELISM, 1) .set(FlinkConfigOptions.TABLE_EXEC_ICEBERG_INFER_SOURCE_PARALLELISM, false); } @@ -239,21 +226,24 @@ private void testCreateConnectorTable() { // Create table under the flink's current database. sql("CREATE TABLE %s (id BIGINT, data STRING) WITH %s", TABLE_NAME, toWithClause(tableProps)); sql("INSERT INTO %s VALUES (1, 'AAA'), (2, 'BBB'), (3, 'CCC')", TABLE_NAME); - Assert.assertEquals("Should have expected rows", + Assert.assertEquals( + "Should have expected rows", Sets.newHashSet(Row.of(1L, "AAA"), Row.of(2L, "BBB"), Row.of(3L, "CCC")), Sets.newHashSet(sql("SELECT * FROM %s", TABLE_NAME))); FlinkCatalogFactory factory = new FlinkCatalogFactory(); Catalog flinkCatalog = factory.createCatalog(catalogName, tableProps, new Configuration()); - Assert.assertTrue("Should have created the expected database", - flinkCatalog.databaseExists(databaseName())); - Assert.assertTrue("Should have created the expected table", + Assert.assertTrue( + "Should have created the expected database", flinkCatalog.databaseExists(databaseName())); + Assert.assertTrue( + "Should have created the expected table", flinkCatalog.tableExists(new ObjectPath(databaseName(), tableName()))); // Drop and create it again. 
sql("DROP TABLE %s", TABLE_NAME); sql("CREATE TABLE %s (id BIGINT, data STRING) WITH %s", TABLE_NAME, toWithClause(tableProps)); - Assert.assertEquals("Should have expected rows", + Assert.assertEquals( + "Should have expected rows", Sets.newHashSet(Row.of(1L, "AAA"), Row.of(2L, "BBB"), Row.of(3L, "CCC")), Sets.newHashSet(sql("SELECT * FROM %s", TABLE_NAME))); } @@ -271,7 +261,8 @@ public void testCatalogDatabaseConflictWithFlinkDatabase() { try { testCreateConnectorTable(); // Ensure that the table was created under the specific database. - AssertHelpers.assertThrows("Table should already exists", + AssertHelpers.assertThrows( + "Table should already exists", org.apache.flink.table.api.TableException.class, "Could not execute CreateTable in path", () -> sql("CREATE TABLE `default_catalog`.`%s`.`%s`", databaseName(), TABLE_NAME)); @@ -302,15 +293,14 @@ public void testConnectorTableInIcebergCatalog() { // Create a connector table in an iceberg catalog. sql("CREATE CATALOG `test_catalog` WITH %s", toWithClause(catalogProps)); try { - AssertHelpers.assertThrowsCause("Cannot create the iceberg connector table in iceberg catalog", + AssertHelpers.assertThrowsCause( + "Cannot create the iceberg connector table in iceberg catalog", IllegalArgumentException.class, "Cannot create the table with 'connector'='iceberg' table property in an iceberg catalog", - () -> sql("CREATE TABLE `test_catalog`.`%s`.`%s` (id BIGINT, data STRING) WITH %s", - FlinkCatalogFactory.DEFAULT_DATABASE_NAME, - TABLE_NAME, - toWithClause(tableProps) - ) - ); + () -> + sql( + "CREATE TABLE `test_catalog`.`%s`.`%s` (id BIGINT, data STRING) WITH %s", + FlinkCatalogFactory.DEFAULT_DATABASE_NAME, TABLE_NAME, toWithClause(tableProps))); } finally { sql("DROP CATALOG IF EXISTS `test_catalog`"); } diff --git a/flink/v1.15/flink/src/test/java/org/apache/iceberg/flink/TestManifestFileSerialization.java b/flink/v1.15/flink/src/test/java/org/apache/iceberg/flink/TestManifestFileSerialization.java index e90a9a469e47..6bd94e9ca61c 100644 --- a/flink/v1.15/flink/src/test/java/org/apache/iceberg/flink/TestManifestFileSerialization.java +++ b/flink/v1.15/flink/src/test/java/org/apache/iceberg/flink/TestManifestFileSerialization.java @@ -16,9 +16,11 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.flink; +import static org.apache.iceberg.types.Types.NestedField.optional; +import static org.apache.iceberg.types.Types.NestedField.required; + import java.io.ByteArrayInputStream; import java.io.ByteArrayOutputStream; import java.io.File; @@ -52,61 +54,62 @@ import org.junit.Test; import org.junit.rules.TemporaryFolder; -import static org.apache.iceberg.types.Types.NestedField.optional; -import static org.apache.iceberg.types.Types.NestedField.required; - public class TestManifestFileSerialization { - private static final Schema SCHEMA = new Schema( - required(1, "id", Types.LongType.get()), - optional(2, "data", Types.StringType.get()), - required(3, "date", Types.StringType.get()), - required(4, "double", Types.DoubleType.get())); - - private static final PartitionSpec SPEC = PartitionSpec - .builderFor(SCHEMA) - .identity("double") - .build(); - - private static final DataFile FILE_A = DataFiles.builder(SPEC) - .withPath("/path/to/data-1.parquet") - .withFileSizeInBytes(0) - .withPartition(org.apache.iceberg.TestHelpers.Row.of(1D)) - .withPartitionPath("double=1") - .withMetrics(new Metrics(5L, - null, // no column sizes - ImmutableMap.of(1, 5L, 2, 3L), // value count - ImmutableMap.of(1, 0L, 2, 2L), // null count - ImmutableMap.of(), // nan count - ImmutableMap.of(1, longToBuffer(0L)), // lower bounds - ImmutableMap.of(1, longToBuffer(4L)) // upper bounds - )) - .build(); - - private static final DataFile FILE_B = DataFiles.builder(SPEC) - .withPath("/path/to/data-2.parquet") - .withFileSizeInBytes(0) - .withPartition(org.apache.iceberg.TestHelpers.Row.of(Double.NaN)) - .withPartitionPath("double=NaN") - .withMetrics(new Metrics(1L, - null, // no column sizes - ImmutableMap.of(1, 1L, 4, 1L), // value count - ImmutableMap.of(1, 0L, 2, 0L), // null count - ImmutableMap.of(4, 1L), // nan count - ImmutableMap.of(1, longToBuffer(0L)), // lower bounds - ImmutableMap.of(1, longToBuffer(1L)) // upper bounds - )) - .build(); + private static final Schema SCHEMA = + new Schema( + required(1, "id", Types.LongType.get()), + optional(2, "data", Types.StringType.get()), + required(3, "date", Types.StringType.get()), + required(4, "double", Types.DoubleType.get())); + + private static final PartitionSpec SPEC = + PartitionSpec.builderFor(SCHEMA).identity("double").build(); + + private static final DataFile FILE_A = + DataFiles.builder(SPEC) + .withPath("/path/to/data-1.parquet") + .withFileSizeInBytes(0) + .withPartition(org.apache.iceberg.TestHelpers.Row.of(1D)) + .withPartitionPath("double=1") + .withMetrics( + new Metrics( + 5L, + null, // no column sizes + ImmutableMap.of(1, 5L, 2, 3L), // value count + ImmutableMap.of(1, 0L, 2, 2L), // null count + ImmutableMap.of(), // nan count + ImmutableMap.of(1, longToBuffer(0L)), // lower bounds + ImmutableMap.of(1, longToBuffer(4L)) // upper bounds + )) + .build(); + + private static final DataFile FILE_B = + DataFiles.builder(SPEC) + .withPath("/path/to/data-2.parquet") + .withFileSizeInBytes(0) + .withPartition(org.apache.iceberg.TestHelpers.Row.of(Double.NaN)) + .withPartitionPath("double=NaN") + .withMetrics( + new Metrics( + 1L, + null, // no column sizes + ImmutableMap.of(1, 1L, 4, 1L), // value count + ImmutableMap.of(1, 0L, 2, 0L), // null count + ImmutableMap.of(4, 1L), // nan count + ImmutableMap.of(1, longToBuffer(0L)), // lower bounds + ImmutableMap.of(1, longToBuffer(1L)) // upper bounds + )) + .build(); private static final FileIO FILE_IO = new HadoopFileIO(new Configuration()); - @Rule - public 
TemporaryFolder temp = new TemporaryFolder(); - + @Rule public TemporaryFolder temp = new TemporaryFolder(); @Test public void testKryoSerialization() throws IOException { - KryoSerializer kryo = new KryoSerializer<>(ManifestFile.class, new ExecutionConfig()); + KryoSerializer kryo = + new KryoSerializer<>(ManifestFile.class, new ExecutionConfig()); DataOutputSerializer outputView = new DataOutputSerializer(1024); @@ -138,7 +141,8 @@ public void testJavaSerialization() throws Exception { out.writeObject(GenericManifestFile.copyOf(manifest).build()); } - try (ObjectInputStream in = new ObjectInputStream(new ByteArrayInputStream(bytes.toByteArray()))) { + try (ObjectInputStream in = + new ObjectInputStream(new ByteArrayInputStream(bytes.toByteArray()))) { for (int i = 0; i < 3; i += 1) { Object obj = in.readObject(); Assertions.assertThat(obj).as("Should be a ManifestFile").isInstanceOf(ManifestFile.class); diff --git a/flink/v1.15/flink/src/test/java/org/apache/iceberg/flink/TestRowDataWrapper.java b/flink/v1.15/flink/src/test/java/org/apache/iceberg/flink/TestRowDataWrapper.java index 9012fc564bd1..ae4ab2844bc8 100644 --- a/flink/v1.15/flink/src/test/java/org/apache/iceberg/flink/TestRowDataWrapper.java +++ b/flink/v1.15/flink/src/test/java/org/apache/iceberg/flink/TestRowDataWrapper.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.flink; import java.util.Iterator; @@ -34,28 +33,32 @@ public class TestRowDataWrapper extends RecordWrapperTest { /** - * Flink's time type has been truncated to millis seconds, so we need a customized assert method to check the - * values. + * Flink's time type has been truncated to millis seconds, so we need a customized assert method + * to check the values. 
*/ @Override public void testTime() { - generateAndValidate(new Schema(TIME.fields()), (message, expectedWrapper, actualWrapper) -> { - for (int pos = 0; pos < TIME.fields().size(); pos++) { - Object expected = expectedWrapper.get().get(pos, Object.class); - Object actual = actualWrapper.get().get(pos, Object.class); - if (expected == actual) { - return; - } + generateAndValidate( + new Schema(TIME.fields()), + (message, expectedWrapper, actualWrapper) -> { + for (int pos = 0; pos < TIME.fields().size(); pos++) { + Object expected = expectedWrapper.get().get(pos, Object.class); + Object actual = actualWrapper.get().get(pos, Object.class); + if (expected == actual) { + return; + } - if (expected == null || actual == null) { - Assert.fail(String.format("The expected value is %s but actual value is %s", expected, actual)); - } + if (expected == null || actual == null) { + Assert.fail( + String.format( + "The expected value is %s but actual value is %s", expected, actual)); + } - int expectedMilliseconds = (int) ((long) expected / 1000_000); - int actualMilliseconds = (int) ((long) actual / 1000_000); - Assert.assertEquals(message, expectedMilliseconds, actualMilliseconds); - } - }); + int expectedMilliseconds = (int) ((long) expected / 1000_000); + int actualMilliseconds = (int) ((long) actual / 1000_000); + Assert.assertEquals(message, expectedMilliseconds, actualMilliseconds); + } + }); } @Override @@ -65,7 +68,8 @@ protected void generateAndValidate(Schema schema, RecordWrapperTest.AssertMethod Iterable rowDataList = RandomRowData.generate(schema, numRecords, 101L); InternalRecordWrapper recordWrapper = new InternalRecordWrapper(schema.asStruct()); - RowDataWrapper rowDataWrapper = new RowDataWrapper(FlinkSchemaUtil.convert(schema), schema.asStruct()); + RowDataWrapper rowDataWrapper = + new RowDataWrapper(FlinkSchemaUtil.convert(schema), schema.asStruct()); Iterator actual = recordList.iterator(); Iterator expected = rowDataList.iterator(); @@ -79,8 +83,10 @@ protected void generateAndValidate(Schema schema, RecordWrapperTest.AssertMethod StructLike recordStructLike = recordWrapper.wrap(actual.next()); StructLike rowDataStructLike = rowDataWrapper.wrap(expected.next()); - assertMethod.assertEquals("Should have expected StructLike values", - actualWrapper.set(recordStructLike), expectedWrapper.set(rowDataStructLike)); + assertMethod.assertEquals( + "Should have expected StructLike values", + actualWrapper.set(recordStructLike), + expectedWrapper.set(rowDataStructLike)); } Assert.assertFalse("Shouldn't have more record", actual.hasNext()); diff --git a/flink/v1.15/flink/src/test/java/org/apache/iceberg/flink/TestTableLoader.java b/flink/v1.15/flink/src/test/java/org/apache/iceberg/flink/TestTableLoader.java index 5f7ae29ec737..61a821a9ac5a 100644 --- a/flink/v1.15/flink/src/test/java/org/apache/iceberg/flink/TestTableLoader.java +++ b/flink/v1.15/flink/src/test/java/org/apache/iceberg/flink/TestTableLoader.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.flink; import java.io.File; @@ -35,9 +34,7 @@ public TestTableLoader(String dir) { } @Override - public void open() { - - } + public void open() {} @Override public Table loadTable() { @@ -45,7 +42,5 @@ public Table loadTable() { } @Override - public void close() { - - } + public void close() {} } diff --git a/flink/v1.15/flink/src/test/java/org/apache/iceberg/flink/TestTableSerialization.java b/flink/v1.15/flink/src/test/java/org/apache/iceberg/flink/TestTableSerialization.java index 10c0d269bc50..27124d93fef4 100644 --- a/flink/v1.15/flink/src/test/java/org/apache/iceberg/flink/TestTableSerialization.java +++ b/flink/v1.15/flink/src/test/java/org/apache/iceberg/flink/TestTableSerialization.java @@ -16,9 +16,12 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.flink; +import static org.apache.iceberg.flink.TestHelpers.roundTripKryoSerialize; +import static org.apache.iceberg.types.Types.NestedField.optional; +import static org.apache.iceberg.types.Types.NestedField.required; + import java.io.File; import java.io.IOException; import java.util.Map; @@ -42,30 +45,22 @@ import org.junit.Test; import org.junit.rules.TemporaryFolder; -import static org.apache.iceberg.flink.TestHelpers.roundTripKryoSerialize; -import static org.apache.iceberg.types.Types.NestedField.optional; -import static org.apache.iceberg.types.Types.NestedField.required; - public class TestTableSerialization { private static final HadoopTables TABLES = new HadoopTables(); - private static final Schema SCHEMA = new Schema( - required(1, "id", Types.LongType.get()), - optional(2, "data", Types.StringType.get()), - required(3, "date", Types.StringType.get()), - optional(4, "double", Types.DoubleType.get())); + private static final Schema SCHEMA = + new Schema( + required(1, "id", Types.LongType.get()), + optional(2, "data", Types.StringType.get()), + required(3, "date", Types.StringType.get()), + optional(4, "double", Types.DoubleType.get())); - private static final PartitionSpec SPEC = PartitionSpec - .builderFor(SCHEMA) - .identity("date") - .build(); + private static final PartitionSpec SPEC = + PartitionSpec.builderFor(SCHEMA).identity("date").build(); - private static final SortOrder SORT_ORDER = SortOrder.builderFor(SCHEMA) - .asc("id") - .build(); + private static final SortOrder SORT_ORDER = SortOrder.builderFor(SCHEMA).asc("id").build(); - @Rule - public TemporaryFolder temp = new TemporaryFolder(); + @Rule public TemporaryFolder temp = new TemporaryFolder(); private Table table; @Before @@ -82,16 +77,17 @@ public void initTable() throws IOException { public void testSerializableTableKryoSerialization() throws IOException { SerializableTable serializableTable = (SerializableTable) SerializableTable.copyOf(table); TestHelpers.assertSerializedAndLoadedMetadata( - table, - roundTripKryoSerialize(SerializableTable.class, serializableTable)); + table, roundTripKryoSerialize(SerializableTable.class, serializableTable)); } @Test public void testSerializableMetadataTableKryoSerialization() throws IOException { for (MetadataTableType type : MetadataTableType.values()) { TableOperations ops = ((HasTableOperations) table).operations(); - Table metadataTable = MetadataTableUtils.createMetadataTableInstance(ops, table.name(), "meta", type); - SerializableTable serializableMetadataTable = (SerializableTable) SerializableTable.copyOf(metadataTable); + Table metadataTable = + MetadataTableUtils.createMetadataTableInstance(ops, 
table.name(), "meta", type); + SerializableTable serializableMetadataTable = + (SerializableTable) SerializableTable.copyOf(metadataTable); TestHelpers.assertSerializedAndLoadedMetadata( metadataTable, @@ -103,15 +99,12 @@ public void testSerializableMetadataTableKryoSerialization() throws IOException public void testSerializableTransactionTableKryoSerialization() throws IOException { Transaction txn = table.newTransaction(); - txn.updateProperties() - .set("k1", "v1") - .commit(); + txn.updateProperties().set("k1", "v1").commit(); Table txnTable = txn.table(); SerializableTable serializableTxnTable = (SerializableTable) SerializableTable.copyOf(txnTable); TestHelpers.assertSerializedMetadata( - txnTable, - roundTripKryoSerialize(SerializableTable.class, serializableTxnTable)); + txnTable, roundTripKryoSerialize(SerializableTable.class, serializableTxnTable)); } } diff --git a/flink/v1.15/flink/src/test/java/org/apache/iceberg/flink/actions/TestRewriteDataFilesAction.java b/flink/v1.15/flink/src/test/java/org/apache/iceberg/flink/actions/TestRewriteDataFilesAction.java index 03cdcd80dec1..e59d7dacd978 100644 --- a/flink/v1.15/flink/src/test/java/org/apache/iceberg/flink/actions/TestRewriteDataFilesAction.java +++ b/flink/v1.15/flink/src/test/java/org/apache/iceberg/flink/actions/TestRewriteDataFilesAction.java @@ -16,10 +16,10 @@ * specific language governing permissions and limitations * under the License. */ - - package org.apache.iceberg.flink.actions; +import static org.apache.iceberg.flink.SimpleDataUtil.RECORD; + import java.io.File; import java.io.IOException; import java.util.List; @@ -58,8 +58,6 @@ import org.junit.runner.RunWith; import org.junit.runners.Parameterized; -import static org.apache.iceberg.flink.SimpleDataUtil.RECORD; - @RunWith(Parameterized.class) public class TestRewriteDataFilesAction extends FlinkCatalogTestBase { @@ -69,24 +67,23 @@ public class TestRewriteDataFilesAction extends FlinkCatalogTestBase { private Table icebergTableUnPartitioned; private Table icebergTablePartitioned; - public TestRewriteDataFilesAction(String catalogName, Namespace baseNamespace, FileFormat format) { + public TestRewriteDataFilesAction( + String catalogName, Namespace baseNamespace, FileFormat format) { super(catalogName, baseNamespace); this.format = format; } @Override protected TableEnvironment getTableEnv() { - super.getTableEnv() - .getConfig() - .getConfiguration() - .set(CoreOptions.DEFAULT_PARALLELISM, 1); + super.getTableEnv().getConfig().getConfiguration().set(CoreOptions.DEFAULT_PARALLELISM, 1); return super.getTableEnv(); } @Parameterized.Parameters(name = "catalogName={0}, baseNamespace={1}, format={2}") public static Iterable parameters() { List parameters = Lists.newArrayList(); - for (FileFormat format : new FileFormat[] {FileFormat.AVRO, FileFormat.ORC, FileFormat.PARQUET}) { + for (FileFormat format : + new FileFormat[] {FileFormat.AVRO, FileFormat.ORC, FileFormat.PARQUET}) { for (Object[] catalogParams : FlinkCatalogTestBase.parameters()) { String catalogName = (String) catalogParams[0]; Namespace baseNamespace = (Namespace) catalogParams[1]; @@ -96,8 +93,7 @@ public static Iterable parameters() { return parameters; } - @Rule - public TemporaryFolder temp = new TemporaryFolder(); + @Rule public TemporaryFolder temp = new TemporaryFolder(); @Override @Before @@ -106,16 +102,18 @@ public void before() { sql("CREATE DATABASE %s", flinkDatabase); sql("USE CATALOG %s", catalogName); sql("USE %s", DATABASE); - sql("CREATE TABLE %s (id int, data varchar) with 
('write.format.default'='%s')", TABLE_NAME_UNPARTITIONED, - format.name()); - icebergTableUnPartitioned = validationCatalog.loadTable(TableIdentifier.of(icebergNamespace, - TABLE_NAME_UNPARTITIONED)); - - sql("CREATE TABLE %s (id int, data varchar,spec varchar) " + - " PARTITIONED BY (data,spec) with ('write.format.default'='%s')", + sql( + "CREATE TABLE %s (id int, data varchar) with ('write.format.default'='%s')", + TABLE_NAME_UNPARTITIONED, format.name()); + icebergTableUnPartitioned = + validationCatalog.loadTable(TableIdentifier.of(icebergNamespace, TABLE_NAME_UNPARTITIONED)); + + sql( + "CREATE TABLE %s (id int, data varchar,spec varchar) " + + " PARTITIONED BY (data,spec) with ('write.format.default'='%s')", TABLE_NAME_PARTITIONED, format.name()); - icebergTablePartitioned = validationCatalog.loadTable(TableIdentifier.of(icebergNamespace, - TABLE_NAME_PARTITIONED)); + icebergTablePartitioned = + validationCatalog.loadTable(TableIdentifier.of(icebergNamespace, TABLE_NAME_PARTITIONED)); } @Override @@ -130,13 +128,10 @@ public void clean() { @Test public void testRewriteDataFilesEmptyTable() throws Exception { Assert.assertNull("Table must be empty", icebergTableUnPartitioned.currentSnapshot()); - Actions.forTable(icebergTableUnPartitioned) - .rewriteDataFiles() - .execute(); + Actions.forTable(icebergTableUnPartitioned).rewriteDataFiles().execute(); Assert.assertNull("Table must stay empty", icebergTableUnPartitioned.currentSnapshot()); } - @Test public void testRewriteDataFilesUnpartitionedTable() throws Exception { sql("INSERT INTO %s SELECT 1, 'hello'", TABLE_NAME_UNPARTITIONED); @@ -145,13 +140,12 @@ public void testRewriteDataFilesUnpartitionedTable() throws Exception { icebergTableUnPartitioned.refresh(); CloseableIterable tasks = icebergTableUnPartitioned.newScan().planFiles(); - List dataFiles = Lists.newArrayList(CloseableIterable.transform(tasks, FileScanTask::file)); + List dataFiles = + Lists.newArrayList(CloseableIterable.transform(tasks, FileScanTask::file)); Assert.assertEquals("Should have 2 data files before rewrite", 2, dataFiles.size()); RewriteDataFilesActionResult result = - Actions.forTable(icebergTableUnPartitioned) - .rewriteDataFiles() - .execute(); + Actions.forTable(icebergTableUnPartitioned).rewriteDataFiles().execute(); Assert.assertEquals("Action should rewrite 2 data files", 2, result.deletedDataFiles().size()); Assert.assertEquals("Action should add 1 data file", 1, result.addedDataFiles().size()); @@ -159,14 +153,15 @@ public void testRewriteDataFilesUnpartitionedTable() throws Exception { icebergTableUnPartitioned.refresh(); CloseableIterable tasks1 = icebergTableUnPartitioned.newScan().planFiles(); - List dataFiles1 = Lists.newArrayList(CloseableIterable.transform(tasks1, FileScanTask::file)); + List dataFiles1 = + Lists.newArrayList(CloseableIterable.transform(tasks1, FileScanTask::file)); Assert.assertEquals("Should have 1 data files after rewrite", 1, dataFiles1.size()); // Assert the table records as expected. 
- SimpleDataUtil.assertTableRecords(icebergTableUnPartitioned, Lists.newArrayList( - SimpleDataUtil.createRecord(1, "hello"), - SimpleDataUtil.createRecord(2, "world") - )); + SimpleDataUtil.assertTableRecords( + icebergTableUnPartitioned, + Lists.newArrayList( + SimpleDataUtil.createRecord(1, "hello"), SimpleDataUtil.createRecord(2, "world"))); } @Test @@ -179,13 +174,12 @@ public void testRewriteDataFilesPartitionedTable() throws Exception { icebergTablePartitioned.refresh(); CloseableIterable tasks = icebergTablePartitioned.newScan().planFiles(); - List dataFiles = Lists.newArrayList(CloseableIterable.transform(tasks, FileScanTask::file)); + List dataFiles = + Lists.newArrayList(CloseableIterable.transform(tasks, FileScanTask::file)); Assert.assertEquals("Should have 4 data files before rewrite", 4, dataFiles.size()); RewriteDataFilesActionResult result = - Actions.forTable(icebergTablePartitioned) - .rewriteDataFiles() - .execute(); + Actions.forTable(icebergTablePartitioned).rewriteDataFiles().execute(); Assert.assertEquals("Action should rewrite 4 data files", 4, result.deletedDataFiles().size()); Assert.assertEquals("Action should add 2 data file", 2, result.addedDataFiles().size()); @@ -193,26 +187,27 @@ public void testRewriteDataFilesPartitionedTable() throws Exception { icebergTablePartitioned.refresh(); CloseableIterable tasks1 = icebergTablePartitioned.newScan().planFiles(); - List dataFiles1 = Lists.newArrayList(CloseableIterable.transform(tasks1, FileScanTask::file)); + List dataFiles1 = + Lists.newArrayList(CloseableIterable.transform(tasks1, FileScanTask::file)); Assert.assertEquals("Should have 2 data files after rewrite", 2, dataFiles1.size()); // Assert the table records as expected. - Schema schema = new Schema( - Types.NestedField.optional(1, "id", Types.IntegerType.get()), - Types.NestedField.optional(2, "data", Types.StringType.get()), - Types.NestedField.optional(3, "spec", Types.StringType.get()) - ); + Schema schema = + new Schema( + Types.NestedField.optional(1, "id", Types.IntegerType.get()), + Types.NestedField.optional(2, "data", Types.StringType.get()), + Types.NestedField.optional(3, "spec", Types.StringType.get())); Record record = GenericRecord.create(schema); - SimpleDataUtil.assertTableRecords(icebergTablePartitioned, Lists.newArrayList( - record.copy("id", 1, "data", "hello", "spec", "a"), - record.copy("id", 2, "data", "hello", "spec", "a"), - record.copy("id", 3, "data", "world", "spec", "b"), - record.copy("id", 4, "data", "world", "spec", "b") - )); + SimpleDataUtil.assertTableRecords( + icebergTablePartitioned, + Lists.newArrayList( + record.copy("id", 1, "data", "hello", "spec", "a"), + record.copy("id", 2, "data", "hello", "spec", "a"), + record.copy("id", 3, "data", "world", "spec", "b"), + record.copy("id", 4, "data", "world", "spec", "b"))); } - @Test public void testRewriteDataFilesWithFilter() throws Exception { sql("INSERT INTO %s SELECT 1, 'hello' ,'a'", TABLE_NAME_PARTITIONED); @@ -224,7 +219,8 @@ public void testRewriteDataFilesWithFilter() throws Exception { icebergTablePartitioned.refresh(); CloseableIterable tasks = icebergTablePartitioned.newScan().planFiles(); - List dataFiles = Lists.newArrayList(CloseableIterable.transform(tasks, FileScanTask::file)); + List dataFiles = + Lists.newArrayList(CloseableIterable.transform(tasks, FileScanTask::file)); Assert.assertEquals("Should have 5 data files before rewrite", 5, dataFiles.size()); RewriteDataFilesActionResult result = @@ -240,24 +236,26 @@ public void 
testRewriteDataFilesWithFilter() throws Exception { icebergTablePartitioned.refresh(); CloseableIterable tasks1 = icebergTablePartitioned.newScan().planFiles(); - List dataFiles1 = Lists.newArrayList(CloseableIterable.transform(tasks1, FileScanTask::file)); + List dataFiles1 = + Lists.newArrayList(CloseableIterable.transform(tasks1, FileScanTask::file)); Assert.assertEquals("Should have 4 data files after rewrite", 4, dataFiles1.size()); // Assert the table records as expected. - Schema schema = new Schema( - Types.NestedField.optional(1, "id", Types.IntegerType.get()), - Types.NestedField.optional(2, "data", Types.StringType.get()), - Types.NestedField.optional(3, "spec", Types.StringType.get()) - ); + Schema schema = + new Schema( + Types.NestedField.optional(1, "id", Types.IntegerType.get()), + Types.NestedField.optional(2, "data", Types.StringType.get()), + Types.NestedField.optional(3, "spec", Types.StringType.get())); Record record = GenericRecord.create(schema); - SimpleDataUtil.assertTableRecords(icebergTablePartitioned, Lists.newArrayList( - record.copy("id", 1, "data", "hello", "spec", "a"), - record.copy("id", 2, "data", "hello", "spec", "a"), - record.copy("id", 3, "data", "world", "spec", "a"), - record.copy("id", 4, "data", "world", "spec", "b"), - record.copy("id", 5, "data", "world", "spec", "b") - )); + SimpleDataUtil.assertTableRecords( + icebergTablePartitioned, + Lists.newArrayList( + record.copy("id", 1, "data", "hello", "spec", "a"), + record.copy("id", 2, "data", "hello", "spec", "a"), + record.copy("id", 3, "data", "world", "spec", "a"), + record.copy("id", 4, "data", "world", "spec", "b"), + record.copy("id", 5, "data", "world", "spec", "b"))); } @Test @@ -285,22 +283,23 @@ public void testRewriteLargeTableHasResiduals() throws IOException { icebergTableUnPartitioned.refresh(); - CloseableIterable tasks = icebergTableUnPartitioned.newScan() - .ignoreResiduals() - .filter(Expressions.equal("data", "0")) - .planFiles(); + CloseableIterable tasks = + icebergTableUnPartitioned + .newScan() + .ignoreResiduals() + .filter(Expressions.equal("data", "0")) + .planFiles(); for (FileScanTask task : tasks) { Assert.assertEquals("Residuals must be ignored", Expressions.alwaysTrue(), task.residual()); } - List dataFiles = Lists.newArrayList(CloseableIterable.transform(tasks, FileScanTask::file)); + List dataFiles = + Lists.newArrayList(CloseableIterable.transform(tasks, FileScanTask::file)); Assert.assertEquals("Should have 2 data files before rewrite", 2, dataFiles.size()); Actions actions = Actions.forTable(icebergTableUnPartitioned); - RewriteDataFilesActionResult result = actions - .rewriteDataFiles() - .filter(Expressions.equal("data", "0")) - .execute(); + RewriteDataFilesActionResult result = + actions.rewriteDataFiles().filter(Expressions.equal("data", "0")).execute(); Assert.assertEquals("Action should rewrite 2 data files", 2, result.deletedDataFiles().size()); Assert.assertEquals("Action should add 1 data file", 1, result.addedDataFiles().size()); @@ -310,13 +309,14 @@ public void testRewriteLargeTableHasResiduals() throws IOException { /** * a test case to test avoid repeate compress - *
    - * If datafile cannot be combined to CombinedScanTask with other DataFiles, the size of the CombinedScanTask list size - * is 1, so we remove these CombinedScanTasks to avoid compressed repeatedly. - *
    - * In this test case,we generated 3 data files and set targetSizeInBytes greater than the largest file size so that it - * cannot be combined a CombinedScanTask with other datafiles. The datafile with the largest file size will not be - * compressed. + * + *
    If datafile cannot be combined to CombinedScanTask with other DataFiles, the size of the + * CombinedScanTask list size is 1, so we remove these CombinedScanTasks to avoid compressed + * repeatedly. + * + *
    In this test case,we generated 3 data files and set targetSizeInBytes greater than the + * largest file size so that it cannot be combined a CombinedScanTask with other datafiles. The + * datafile with the largest file size will not be compressed. * * @throws IOException IOException */ @@ -327,7 +327,8 @@ public void testRewriteAvoidRepeateCompress() throws IOException { GenericAppenderFactory genericAppenderFactory = new GenericAppenderFactory(schema); File file = temp.newFile(); int count = 0; - try (FileAppender fileAppender = genericAppenderFactory.newAppender(Files.localOutput(file), format)) { + try (FileAppender fileAppender = + genericAppenderFactory.newAppender(Files.localOutput(file), format)) { long filesize = 20000; for (; fileAppender.length() < filesize; count++) { Record record = SimpleDataUtil.createRecord(count, UUID.randomUUID().toString()); @@ -336,16 +337,15 @@ public void testRewriteAvoidRepeateCompress() throws IOException { } } - DataFile dataFile = DataFiles.builder(icebergTableUnPartitioned.spec()) - .withPath(file.getAbsolutePath()) - .withFileSizeInBytes(file.length()) - .withFormat(format) - .withRecordCount(count) - .build(); + DataFile dataFile = + DataFiles.builder(icebergTableUnPartitioned.spec()) + .withPath(file.getAbsolutePath()) + .withFileSizeInBytes(file.length()) + .withFormat(format) + .withRecordCount(count) + .build(); - icebergTableUnPartitioned.newAppend() - .appendFile(dataFile) - .commit(); + icebergTableUnPartitioned.newAppend().appendFile(dataFile).commit(); sql("INSERT INTO %s SELECT 1,'a' ", TABLE_NAME_UNPARTITIONED); sql("INSERT INTO %s SELECT 2,'b' ", TABLE_NAME_UNPARTITIONED); @@ -353,28 +353,32 @@ public void testRewriteAvoidRepeateCompress() throws IOException { icebergTableUnPartitioned.refresh(); CloseableIterable tasks = icebergTableUnPartitioned.newScan().planFiles(); - List dataFiles = Lists.newArrayList(CloseableIterable.transform(tasks, FileScanTask::file)); + List dataFiles = + Lists.newArrayList(CloseableIterable.transform(tasks, FileScanTask::file)); Assert.assertEquals("Should have 3 data files before rewrite", 3, dataFiles.size()); Actions actions = Actions.forTable(icebergTableUnPartitioned); long targetSizeInBytes = file.length() + 10; - RewriteDataFilesActionResult result = actions - .rewriteDataFiles() - .targetSizeInBytes(targetSizeInBytes) - .splitOpenFileCost(1) - .execute(); + RewriteDataFilesActionResult result = + actions + .rewriteDataFiles() + .targetSizeInBytes(targetSizeInBytes) + .splitOpenFileCost(1) + .execute(); Assert.assertEquals("Action should rewrite 2 data files", 2, result.deletedDataFiles().size()); Assert.assertEquals("Action should add 1 data file", 1, result.addedDataFiles().size()); icebergTableUnPartitioned.refresh(); CloseableIterable tasks1 = icebergTableUnPartitioned.newScan().planFiles(); - List dataFilesRewrote = Lists.newArrayList(CloseableIterable.transform(tasks1, FileScanTask::file)); + List dataFilesRewrote = + Lists.newArrayList(CloseableIterable.transform(tasks1, FileScanTask::file)); Assert.assertEquals("Should have 2 data files after rewrite", 2, dataFilesRewrote.size()); // the biggest file do not be rewrote - List rewroteDataFileNames = dataFilesRewrote.stream().map(ContentFile::path).collect(Collectors.toList()); + List rewroteDataFileNames = + dataFilesRewrote.stream().map(ContentFile::path).collect(Collectors.toList()); Assert.assertTrue(rewroteDataFileNames.contains(file.getAbsolutePath())); // Assert the table records as expected. 
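Aside: the reformatted Javadoc in TestRewriteDataFilesAction above explains why the rewrite action skips data files that cannot be grouped with others — a CombinedScanTask holding a single file would only recompress that same file. The following is a minimal, illustrative sketch of that usage pattern, assembled only from the Actions calls visible in the diff above; the table handle and the largest-file size are assumed inputs, and the import locations are assumptions based on the packages shown in this patch, not part of the change itself.

import org.apache.iceberg.Table;
import org.apache.iceberg.actions.RewriteDataFilesActionResult;
import org.apache.iceberg.flink.actions.Actions;

public class RewriteSmallFilesSketch {
  // Sketch only: `table` and `largestFileSizeInBytes` are assumed to be supplied by the caller.
  static RewriteDataFilesActionResult rewriteSmallFiles(Table table, long largestFileSizeInBytes) {
    // A target size just above the largest file leaves that file in a single-file scan group,
    // which the action drops, so only the smaller files are compacted (as the test asserts).
    return Actions.forTable(table)
        .rewriteDataFiles()
        .targetSizeInBytes(largestFileSizeInBytes + 10)
        .splitOpenFileCost(1)
        .execute();
  }
}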
diff --git a/flink/v1.15/flink/src/test/java/org/apache/iceberg/flink/data/RandomRowData.java b/flink/v1.15/flink/src/test/java/org/apache/iceberg/flink/data/RandomRowData.java index b6fee9259f53..cc58d9817ac6 100644 --- a/flink/v1.15/flink/src/test/java/org/apache/iceberg/flink/data/RandomRowData.java +++ b/flink/v1.15/flink/src/test/java/org/apache/iceberg/flink/data/RandomRowData.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.flink.data; import org.apache.flink.table.data.RowData; @@ -27,8 +26,7 @@ import org.apache.iceberg.relocated.com.google.common.collect.Iterables; public class RandomRowData { - private RandomRowData() { - } + private RandomRowData() {} public static Iterable generate(Schema schema, int numRecords, long seed) { return convert(schema, RandomGenericData.generate(schema, numRecords, seed)); diff --git a/flink/v1.15/flink/src/test/java/org/apache/iceberg/flink/data/RowDataToRowMapper.java b/flink/v1.15/flink/src/test/java/org/apache/iceberg/flink/data/RowDataToRowMapper.java index 549a6ed3a586..74b1da6007e6 100644 --- a/flink/v1.15/flink/src/test/java/org/apache/iceberg/flink/data/RowDataToRowMapper.java +++ b/flink/v1.15/flink/src/test/java/org/apache/iceberg/flink/data/RowDataToRowMapper.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.flink.data; import org.apache.flink.api.common.functions.RichMapFunction; @@ -40,8 +39,8 @@ public RowDataToRowMapper(RowType rowType) { @Override public void open(Configuration parameters) throws Exception { - this.converter = DataStructureConverters.getConverter( - TypeConversions.fromLogicalToDataType(rowType)); + this.converter = + DataStructureConverters.getConverter(TypeConversions.fromLogicalToDataType(rowType)); } @Override diff --git a/flink/v1.15/flink/src/test/java/org/apache/iceberg/flink/data/TestFlinkAvroReaderWriter.java b/flink/v1.15/flink/src/test/java/org/apache/iceberg/flink/data/TestFlinkAvroReaderWriter.java index 8ea9e06a9f8d..e8aab824ea2d 100644 --- a/flink/v1.15/flink/src/test/java/org/apache/iceberg/flink/data/TestFlinkAvroReaderWriter.java +++ b/flink/v1.15/flink/src/test/java/org/apache/iceberg/flink/data/TestFlinkAvroReaderWriter.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.flink.data; import java.io.File; @@ -52,17 +51,17 @@ public class TestFlinkAvroReaderWriter extends DataTest { private static final int NUM_RECORDS = 100; - private static final Schema SCHEMA_NUM_TYPE = new Schema( - Types.NestedField.optional(1, "id", Types.IntegerType.get()), - Types.NestedField.optional(2, "int", Types.IntegerType.get()), - Types.NestedField.optional(3, "float", Types.FloatType.get()), - Types.NestedField.optional(4, "double", Types.DoubleType.get()), - Types.NestedField.optional(5, "date", Types.DateType.get()), - Types.NestedField.optional(6, "time", Types.TimeType.get()), - Types.NestedField.optional(7, "timestamp", Types.TimestampType.withoutZone()), - Types.NestedField.optional(8, "bigint", Types.LongType.get()), - Types.NestedField.optional(9, "decimal", Types.DecimalType.of(4, 2)) - ); + private static final Schema SCHEMA_NUM_TYPE = + new Schema( + Types.NestedField.optional(1, "id", Types.IntegerType.get()), + Types.NestedField.optional(2, "int", Types.IntegerType.get()), + Types.NestedField.optional(3, "float", Types.FloatType.get()), + Types.NestedField.optional(4, "double", Types.DoubleType.get()), + Types.NestedField.optional(5, "date", Types.DateType.get()), + Types.NestedField.optional(6, "time", Types.TimeType.get()), + Types.NestedField.optional(7, "timestamp", Types.TimestampType.withoutZone()), + Types.NestedField.optional(8, "bigint", Types.LongType.get()), + Types.NestedField.optional(9, "decimal", Types.DecimalType.of(4, 2))); @Override protected void writeAndValidate(Schema schema) throws IOException { @@ -70,25 +69,29 @@ protected void writeAndValidate(Schema schema) throws IOException { writeAndValidate(schema, expectedRecords, NUM_RECORDS); } - private void writeAndValidate(Schema schema, List expectedRecords, int numRecord) throws IOException { + private void writeAndValidate(Schema schema, List expectedRecords, int numRecord) + throws IOException { RowType flinkSchema = FlinkSchemaUtil.convert(schema); List expectedRows = Lists.newArrayList(RandomRowData.convert(schema, expectedRecords)); File recordsFile = temp.newFile(); Assert.assertTrue("Delete should succeed", recordsFile.delete()); - // Write the expected records into AVRO file, then read them into RowData and assert with the expected Record list. - try (FileAppender writer = Avro.write(Files.localOutput(recordsFile)) - .schema(schema) - .createWriterFunc(DataWriter::create) - .build()) { + // Write the expected records into AVRO file, then read them into RowData and assert with the + // expected Record list. + try (FileAppender writer = + Avro.write(Files.localOutput(recordsFile)) + .schema(schema) + .createWriterFunc(DataWriter::create) + .build()) { writer.addAll(expectedRecords); } - try (CloseableIterable reader = Avro.read(Files.localInput(recordsFile)) - .project(schema) - .createReaderFunc(FlinkAvroReader::new) - .build()) { + try (CloseableIterable reader = + Avro.read(Files.localInput(recordsFile)) + .project(schema) + .createReaderFunc(FlinkAvroReader::new) + .build()) { Iterator expected = expectedRecords.iterator(); Iterator rows = reader.iterator(); for (int i = 0; i < numRecord; i++) { @@ -101,18 +104,21 @@ private void writeAndValidate(Schema schema, List expectedRecords, int n File rowDataFile = temp.newFile(); Assert.assertTrue("Delete should succeed", rowDataFile.delete()); - // Write the expected RowData into AVRO file, then read them into Record and assert with the expected RowData list. 
- try (FileAppender writer = Avro.write(Files.localOutput(rowDataFile)) - .schema(schema) - .createWriterFunc(ignore -> new FlinkAvroWriter(flinkSchema)) - .build()) { + // Write the expected RowData into AVRO file, then read them into Record and assert with the + // expected RowData list. + try (FileAppender writer = + Avro.write(Files.localOutput(rowDataFile)) + .schema(schema) + .createWriterFunc(ignore -> new FlinkAvroWriter(flinkSchema)) + .build()) { writer.addAll(expectedRows); } - try (CloseableIterable reader = Avro.read(Files.localInput(rowDataFile)) - .project(schema) - .createReaderFunc(DataReader::create) - .build()) { + try (CloseableIterable reader = + Avro.read(Files.localInput(rowDataFile)) + .project(schema) + .createReaderFunc(DataReader::create) + .build()) { Iterator expected = expectedRows.iterator(); Iterator records = reader.iterator(); for (int i = 0; i < numRecord; i += 1) { @@ -124,14 +130,22 @@ private void writeAndValidate(Schema schema, List expectedRecords, int n } private Record recordNumType( - int id, int intV, float floatV, double doubleV, long date, long time, long timestamp, - long bigint, double decimal) { + int id, + int intV, + float floatV, + double doubleV, + long date, + long time, + long timestamp, + long bigint, + double decimal) { Record record = GenericRecord.create(SCHEMA_NUM_TYPE); record.setField("id", id); record.setField("int", intV); record.setField("float", floatV); record.setField("double", doubleV); - record.setField("date", DateTimeUtil.dateFromDays((int) new Date(date).toLocalDate().toEpochDay())); + record.setField( + "date", DateTimeUtil.dateFromDays((int) new Date(date).toLocalDate().toEpochDay())); record.setField("time", new Time(time).toLocalTime()); record.setField("timestamp", DateTimeUtil.timestampFromMicros(timestamp * 1000)); record.setField("bigint", bigint); @@ -142,11 +156,28 @@ private Record recordNumType( @Test public void testNumericTypes() throws IOException { - List expected = ImmutableList.of( - recordNumType(2, Integer.MAX_VALUE, Float.MAX_VALUE, Double.MAX_VALUE, Long.MAX_VALUE, - 1643811742000L, 1643811742000L, 1643811742000L, 10.24d), - recordNumType(2, Integer.MIN_VALUE, Float.MIN_VALUE, Double.MIN_VALUE, Long.MIN_VALUE, - 1643811742000L, 1643811742000L, 1643811742000L, 10.24d)); + List expected = + ImmutableList.of( + recordNumType( + 2, + Integer.MAX_VALUE, + Float.MAX_VALUE, + Double.MAX_VALUE, + Long.MAX_VALUE, + 1643811742000L, + 1643811742000L, + 1643811742000L, + 10.24d), + recordNumType( + 2, + Integer.MIN_VALUE, + Float.MIN_VALUE, + Double.MIN_VALUE, + Long.MIN_VALUE, + 1643811742000L, + 1643811742000L, + 1643811742000L, + 10.24d)); writeAndValidate(SCHEMA_NUM_TYPE, expected, 2); } diff --git a/flink/v1.15/flink/src/test/java/org/apache/iceberg/flink/data/TestFlinkOrcReaderWriter.java b/flink/v1.15/flink/src/test/java/org/apache/iceberg/flink/data/TestFlinkOrcReaderWriter.java index 5f4a7c00d1c8..fdffc0e01c20 100644 --- a/flink/v1.15/flink/src/test/java/org/apache/iceberg/flink/data/TestFlinkOrcReaderWriter.java +++ b/flink/v1.15/flink/src/test/java/org/apache/iceberg/flink/data/TestFlinkOrcReaderWriter.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.flink.data; import java.io.File; @@ -52,18 +51,21 @@ protected void writeAndValidate(Schema schema) throws IOException { File recordsFile = temp.newFile(); Assert.assertTrue("Delete should succeed", recordsFile.delete()); - // Write the expected records into ORC file, then read them into RowData and assert with the expected Record list. - try (FileAppender writer = ORC.write(Files.localOutput(recordsFile)) - .schema(schema) - .createWriterFunc(GenericOrcWriter::buildWriter) - .build()) { + // Write the expected records into ORC file, then read them into RowData and assert with the + // expected Record list. + try (FileAppender writer = + ORC.write(Files.localOutput(recordsFile)) + .schema(schema) + .createWriterFunc(GenericOrcWriter::buildWriter) + .build()) { writer.addAll(expectedRecords); } - try (CloseableIterable reader = ORC.read(Files.localInput(recordsFile)) - .project(schema) - .createReaderFunc(type -> new FlinkOrcReader(schema, type)) - .build()) { + try (CloseableIterable reader = + ORC.read(Files.localInput(recordsFile)) + .project(schema) + .createReaderFunc(type -> new FlinkOrcReader(schema, type)) + .build()) { Iterator expected = expectedRecords.iterator(); Iterator rows = reader.iterator(); for (int i = 0; i < NUM_RECORDS; i++) { @@ -76,19 +78,22 @@ protected void writeAndValidate(Schema schema) throws IOException { File rowDataFile = temp.newFile(); Assert.assertTrue("Delete should succeed", rowDataFile.delete()); - // Write the expected RowData into ORC file, then read them into Record and assert with the expected RowData list. + // Write the expected RowData into ORC file, then read them into Record and assert with the + // expected RowData list. RowType rowType = FlinkSchemaUtil.convert(schema); - try (FileAppender writer = ORC.write(Files.localOutput(rowDataFile)) - .schema(schema) - .createWriterFunc((iSchema, typeDesc) -> FlinkOrcWriter.buildWriter(rowType, iSchema)) - .build()) { + try (FileAppender writer = + ORC.write(Files.localOutput(rowDataFile)) + .schema(schema) + .createWriterFunc((iSchema, typeDesc) -> FlinkOrcWriter.buildWriter(rowType, iSchema)) + .build()) { writer.addAll(expectedRows); } - try (CloseableIterable reader = ORC.read(Files.localInput(rowDataFile)) - .project(schema) - .createReaderFunc(type -> GenericOrcReader.buildReader(schema, type)) - .build()) { + try (CloseableIterable reader = + ORC.read(Files.localInput(rowDataFile)) + .project(schema) + .createReaderFunc(type -> GenericOrcReader.buildReader(schema, type)) + .build()) { Iterator expected = expectedRows.iterator(); Iterator records = reader.iterator(); for (int i = 0; i < NUM_RECORDS; i += 1) { diff --git a/flink/v1.15/flink/src/test/java/org/apache/iceberg/flink/data/TestFlinkParquetReader.java b/flink/v1.15/flink/src/test/java/org/apache/iceberg/flink/data/TestFlinkParquetReader.java index 6c3a4e34efaa..30a2a7bb51ce 100644 --- a/flink/v1.15/flink/src/test/java/org/apache/iceberg/flink/data/TestFlinkParquetReader.java +++ b/flink/v1.15/flink/src/test/java/org/apache/iceberg/flink/data/TestFlinkParquetReader.java @@ -16,9 +16,10 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.flink.data; +import static org.apache.iceberg.types.Types.NestedField.optional; + import java.io.File; import java.io.IOException; import java.nio.ByteBuffer; @@ -49,28 +50,27 @@ import org.junit.Assert; import org.junit.Test; -import static org.apache.iceberg.types.Types.NestedField.optional; - public class TestFlinkParquetReader extends DataTest { private static final int NUM_RECORDS = 100; @Test public void testTwoLevelList() throws IOException { - Schema schema = new Schema( - optional(1, "arraybytes", Types.ListType.ofRequired(3, Types.BinaryType.get())), - optional(2, "topbytes", Types.BinaryType.get()) - ); + Schema schema = + new Schema( + optional(1, "arraybytes", Types.ListType.ofRequired(3, Types.BinaryType.get())), + optional(2, "topbytes", Types.BinaryType.get())); org.apache.avro.Schema avroSchema = AvroSchemaUtil.convert(schema.asStruct()); File testFile = temp.newFile(); Assert.assertTrue(testFile.delete()); - ParquetWriter writer = AvroParquetWriter.builder(new Path(testFile.toURI())) - .withDataModel(GenericData.get()) - .withSchema(avroSchema) - .config("parquet.avro.add-list-element-records", "true") - .config("parquet.avro.write-old-list-structure", "true") - .build(); + ParquetWriter writer = + AvroParquetWriter.builder(new Path(testFile.toURI())) + .withDataModel(GenericData.get()) + .withSchema(avroSchema) + .config("parquet.avro.add-list-element-records", "true") + .config("parquet.avro.write-old-list-structure", "true") + .build(); GenericRecordBuilder recordBuilder = new GenericRecordBuilder(avroSchema); List expectedByteList = Lists.newArrayList(); @@ -84,10 +84,11 @@ public void testTwoLevelList() throws IOException { writer.write(expectedRecord); writer.close(); - try (CloseableIterable reader = Parquet.read(Files.localInput(testFile)) - .project(schema) - .createReaderFunc(type -> FlinkParquetReaders.buildReader(schema, type)) - .build()) { + try (CloseableIterable reader = + Parquet.read(Files.localInput(testFile)) + .project(schema) + .createReaderFunc(type -> FlinkParquetReaders.buildReader(schema, type)) + .build()) { Iterator rows = reader.iterator(); Assert.assertTrue("Should have at least one row", rows.hasNext()); RowData rowData = rows.next(); @@ -101,17 +102,19 @@ private void writeAndValidate(Iterable iterable, Schema schema) throws I File testFile = temp.newFile(); Assert.assertTrue("Delete should succeed", testFile.delete()); - try (FileAppender writer = Parquet.write(Files.localOutput(testFile)) - .schema(schema) - .createWriterFunc(GenericParquetWriter::buildWriter) - .build()) { + try (FileAppender writer = + Parquet.write(Files.localOutput(testFile)) + .schema(schema) + .createWriterFunc(GenericParquetWriter::buildWriter) + .build()) { writer.addAll(iterable); } - try (CloseableIterable reader = Parquet.read(Files.localInput(testFile)) - .project(schema) - .createReaderFunc(type -> FlinkParquetReaders.buildReader(schema, type)) - .build()) { + try (CloseableIterable reader = + Parquet.read(Files.localInput(testFile)) + .project(schema) + .createReaderFunc(type -> FlinkParquetReaders.buildReader(schema, type)) + .build()) { Iterator expected = iterable.iterator(); Iterator rows = reader.iterator(); LogicalType rowType = FlinkSchemaUtil.convert(schema); @@ -126,7 +129,10 @@ private void writeAndValidate(Iterable iterable, Schema schema) throws I @Override protected void writeAndValidate(Schema schema) throws IOException { writeAndValidate(RandomGenericData.generate(schema, NUM_RECORDS, 19981), schema); - 
writeAndValidate(RandomGenericData.generateDictionaryEncodableRecords(schema, NUM_RECORDS, 21124), schema); - writeAndValidate(RandomGenericData.generateFallbackRecords(schema, NUM_RECORDS, 21124, NUM_RECORDS / 20), schema); + writeAndValidate( + RandomGenericData.generateDictionaryEncodableRecords(schema, NUM_RECORDS, 21124), schema); + writeAndValidate( + RandomGenericData.generateFallbackRecords(schema, NUM_RECORDS, 21124, NUM_RECORDS / 20), + schema); } } diff --git a/flink/v1.15/flink/src/test/java/org/apache/iceberg/flink/data/TestFlinkParquetWriter.java b/flink/v1.15/flink/src/test/java/org/apache/iceberg/flink/data/TestFlinkParquetWriter.java index 1db0f8767518..7b868eafc311 100644 --- a/flink/v1.15/flink/src/test/java/org/apache/iceberg/flink/data/TestFlinkParquetWriter.java +++ b/flink/v1.15/flink/src/test/java/org/apache/iceberg/flink/data/TestFlinkParquetWriter.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.flink.data; import java.io.File; @@ -42,8 +41,7 @@ public class TestFlinkParquetWriter extends DataTest { private static final int NUM_RECORDS = 100; - @Rule - public TemporaryFolder temp = new TemporaryFolder(); + @Rule public TemporaryFolder temp = new TemporaryFolder(); private void writeAndValidate(Iterable iterable, Schema schema) throws IOException { File testFile = temp.newFile(); @@ -51,17 +49,19 @@ private void writeAndValidate(Iterable iterable, Schema schema) throws LogicalType logicalType = FlinkSchemaUtil.convert(schema); - try (FileAppender writer = Parquet.write(Files.localOutput(testFile)) - .schema(schema) - .createWriterFunc(msgType -> FlinkParquetWriters.buildWriter(logicalType, msgType)) - .build()) { + try (FileAppender writer = + Parquet.write(Files.localOutput(testFile)) + .schema(schema) + .createWriterFunc(msgType -> FlinkParquetWriters.buildWriter(logicalType, msgType)) + .build()) { writer.addAll(iterable); } - try (CloseableIterable reader = Parquet.read(Files.localInput(testFile)) - .project(schema) - .createReaderFunc(msgType -> GenericParquetReaders.buildReader(schema, msgType)) - .build()) { + try (CloseableIterable reader = + Parquet.read(Files.localInput(testFile)) + .project(schema) + .createReaderFunc(msgType -> GenericParquetReaders.buildReader(schema, msgType)) + .build()) { Iterator expected = iterable.iterator(); Iterator actual = reader.iterator(); LogicalType rowType = FlinkSchemaUtil.convert(schema); @@ -75,15 +75,19 @@ private void writeAndValidate(Iterable iterable, Schema schema) throws @Override protected void writeAndValidate(Schema schema) throws IOException { - writeAndValidate( - RandomRowData.generate(schema, NUM_RECORDS, 19981), schema); + writeAndValidate(RandomRowData.generate(schema, NUM_RECORDS, 19981), schema); - writeAndValidate(RandomRowData.convert(schema, - RandomGenericData.generateDictionaryEncodableRecords(schema, NUM_RECORDS, 21124)), + writeAndValidate( + RandomRowData.convert( + schema, + RandomGenericData.generateDictionaryEncodableRecords(schema, NUM_RECORDS, 21124)), schema); - writeAndValidate(RandomRowData.convert(schema, - RandomGenericData.generateFallbackRecords(schema, NUM_RECORDS, 21124, NUM_RECORDS / 20)), + writeAndValidate( + RandomRowData.convert( + schema, + RandomGenericData.generateFallbackRecords( + schema, NUM_RECORDS, 21124, NUM_RECORDS / 20)), schema); } } diff --git a/flink/v1.15/flink/src/test/java/org/apache/iceberg/flink/data/TestRowDataProjection.java 
b/flink/v1.15/flink/src/test/java/org/apache/iceberg/flink/data/TestRowDataProjection.java index 37016adfbdf2..4cb77b11fd7b 100644 --- a/flink/v1.15/flink/src/test/java/org/apache/iceberg/flink/data/TestRowDataProjection.java +++ b/flink/v1.15/flink/src/test/java/org/apache/iceberg/flink/data/TestRowDataProjection.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.flink.data; import java.util.Iterator; @@ -36,98 +35,96 @@ public class TestRowDataProjection { @Test public void testFullProjection() { - Schema schema = new Schema( - Types.NestedField.required(0, "id", Types.LongType.get()), - Types.NestedField.optional(1, "data", Types.StringType.get()) - ); + Schema schema = + new Schema( + Types.NestedField.required(0, "id", Types.LongType.get()), + Types.NestedField.optional(1, "data", Types.StringType.get())); generateAndValidate(schema, schema); } @Test public void testReorderedFullProjection() { - Schema schema = new Schema( - Types.NestedField.required(0, "id", Types.LongType.get()), - Types.NestedField.optional(1, "data", Types.StringType.get()) - ); + Schema schema = + new Schema( + Types.NestedField.required(0, "id", Types.LongType.get()), + Types.NestedField.optional(1, "data", Types.StringType.get())); - Schema reordered = new Schema( - Types.NestedField.optional(1, "data", Types.StringType.get()), - Types.NestedField.required(0, "id", Types.LongType.get()) - ); + Schema reordered = + new Schema( + Types.NestedField.optional(1, "data", Types.StringType.get()), + Types.NestedField.required(0, "id", Types.LongType.get())); generateAndValidate(schema, reordered); } @Test public void testBasicProjection() { - Schema schema = new Schema( - Types.NestedField.required(0, "id", Types.LongType.get()), - Types.NestedField.optional(1, "data", Types.StringType.get()) - ); - Schema id = new Schema( - Types.NestedField.required(0, "id", Types.LongType.get()) - ); - Schema data = new Schema( - Types.NestedField.optional(1, "data", Types.StringType.get()) - ); + Schema schema = + new Schema( + Types.NestedField.required(0, "id", Types.LongType.get()), + Types.NestedField.optional(1, "data", Types.StringType.get())); + Schema id = new Schema(Types.NestedField.required(0, "id", Types.LongType.get())); + Schema data = new Schema(Types.NestedField.optional(1, "data", Types.StringType.get())); generateAndValidate(schema, id); generateAndValidate(schema, data); } @Test public void testEmptyProjection() { - Schema schema = new Schema( - Types.NestedField.required(0, "id", Types.LongType.get()), - Types.NestedField.optional(1, "data", Types.StringType.get()) - ); + Schema schema = + new Schema( + Types.NestedField.required(0, "id", Types.LongType.get()), + Types.NestedField.optional(1, "data", Types.StringType.get())); generateAndValidate(schema, schema.select()); } @Test public void testRename() { - Schema schema = new Schema( - Types.NestedField.required(0, "id", Types.LongType.get()), - Types.NestedField.optional(1, "data", Types.StringType.get()) - ); - - Schema renamed = new Schema( - Types.NestedField.required(0, "id", Types.LongType.get()), - Types.NestedField.optional(1, "renamed", Types.StringType.get()) - ); + Schema schema = + new Schema( + Types.NestedField.required(0, "id", Types.LongType.get()), + Types.NestedField.optional(1, "data", Types.StringType.get())); + + Schema renamed = + new Schema( + Types.NestedField.required(0, "id", Types.LongType.get()), + Types.NestedField.optional(1, "renamed", 
Types.StringType.get())); generateAndValidate(schema, renamed); } @Test public void testNestedProjection() { - Schema schema = new Schema( - Types.NestedField.required(0, "id", Types.LongType.get()), - Types.NestedField.optional(3, "location", Types.StructType.of( - Types.NestedField.required(1, "lat", Types.FloatType.get()), - Types.NestedField.required(2, "long", Types.FloatType.get()) - )) - ); + Schema schema = + new Schema( + Types.NestedField.required(0, "id", Types.LongType.get()), + Types.NestedField.optional( + 3, + "location", + Types.StructType.of( + Types.NestedField.required(1, "lat", Types.FloatType.get()), + Types.NestedField.required(2, "long", Types.FloatType.get())))); // Project id only. - Schema idOnly = new Schema( - Types.NestedField.required(0, "id", Types.LongType.get()) - ); + Schema idOnly = new Schema(Types.NestedField.required(0, "id", Types.LongType.get())); generateAndValidate(schema, idOnly); // Project lat only. - Schema latOnly = new Schema( - Types.NestedField.optional(3, "location", Types.StructType.of( - Types.NestedField.required(1, "lat", Types.FloatType.get()) - )) - ); + Schema latOnly = + new Schema( + Types.NestedField.optional( + 3, + "location", + Types.StructType.of(Types.NestedField.required(1, "lat", Types.FloatType.get())))); generateAndValidate(schema, latOnly); // Project long only. - Schema longOnly = new Schema( - Types.NestedField.optional(3, "location", Types.StructType.of( - Types.NestedField.required(2, "long", Types.FloatType.get()) - )) - ); + Schema longOnly = + new Schema( + Types.NestedField.optional( + 3, + "location", + Types.StructType.of(Types.NestedField.required(2, "long", Types.FloatType.get())))); generateAndValidate(schema, longOnly); // Project location. @@ -137,37 +134,40 @@ public void testNestedProjection() { @Test public void testPrimitiveTypeProjection() { - Schema schema = new Schema( - Types.NestedField.required(0, "id", Types.LongType.get()), - Types.NestedField.optional(1, "data", Types.StringType.get()), - Types.NestedField.required(2, "b", Types.BooleanType.get()), - Types.NestedField.optional(3, "i", Types.IntegerType.get()), - Types.NestedField.required(4, "l", Types.LongType.get()), - Types.NestedField.optional(5, "f", Types.FloatType.get()), - Types.NestedField.required(6, "d", Types.DoubleType.get()), - Types.NestedField.optional(7, "date", Types.DateType.get()), - Types.NestedField.optional(8, "time", Types.TimeType.get()), - Types.NestedField.required(9, "ts", Types.TimestampType.withoutZone()), - Types.NestedField.required(10, "ts_tz", Types.TimestampType.withZone()), - Types.NestedField.required(11, "s", Types.StringType.get()), - Types.NestedField.required(12, "fixed", Types.FixedType.ofLength(7)), - Types.NestedField.optional(13, "bytes", Types.BinaryType.get()), - Types.NestedField.required(14, "dec_9_0", Types.DecimalType.of(9, 0)), - Types.NestedField.required(15, "dec_11_2", Types.DecimalType.of(11, 2)), - Types.NestedField.required(16, "dec_38_10", Types.DecimalType.of(38, 10))// maximum precision - ); + Schema schema = + new Schema( + Types.NestedField.required(0, "id", Types.LongType.get()), + Types.NestedField.optional(1, "data", Types.StringType.get()), + Types.NestedField.required(2, "b", Types.BooleanType.get()), + Types.NestedField.optional(3, "i", Types.IntegerType.get()), + Types.NestedField.required(4, "l", Types.LongType.get()), + Types.NestedField.optional(5, "f", Types.FloatType.get()), + Types.NestedField.required(6, "d", Types.DoubleType.get()), + Types.NestedField.optional(7, 
"date", Types.DateType.get()), + Types.NestedField.optional(8, "time", Types.TimeType.get()), + Types.NestedField.required(9, "ts", Types.TimestampType.withoutZone()), + Types.NestedField.required(10, "ts_tz", Types.TimestampType.withZone()), + Types.NestedField.required(11, "s", Types.StringType.get()), + Types.NestedField.required(12, "fixed", Types.FixedType.ofLength(7)), + Types.NestedField.optional(13, "bytes", Types.BinaryType.get()), + Types.NestedField.required(14, "dec_9_0", Types.DecimalType.of(9, 0)), + Types.NestedField.required(15, "dec_11_2", Types.DecimalType.of(11, 2)), + Types.NestedField.required( + 16, "dec_38_10", Types.DecimalType.of(38, 10)) // maximum precision + ); generateAndValidate(schema, schema); } @Test public void testPrimitiveMapTypeProjection() { - Schema schema = new Schema( - Types.NestedField.required(0, "id", Types.LongType.get()), - Types.NestedField.optional(3, "map", Types.MapType.ofOptional( - 1, 2, Types.IntegerType.get(), Types.StringType.get() - )) - ); + Schema schema = + new Schema( + Types.NestedField.required(0, "id", Types.LongType.get()), + Types.NestedField.optional( + 3, + "map", + Types.MapType.ofOptional(1, 2, Types.IntegerType.get(), Types.StringType.get()))); // Project id only. Schema idOnly = schema.select("id"); @@ -183,20 +183,21 @@ public void testPrimitiveMapTypeProjection() { @Test public void testNestedMapTypeProjection() { - Schema schema = new Schema( - Types.NestedField.required(0, "id", Types.LongType.get()), - Types.NestedField.optional(7, "map", Types.MapType.ofOptional( - 5, 6, - Types.StructType.of( - Types.NestedField.required(1, "key", Types.LongType.get()), - Types.NestedField.required(2, "keyData", Types.LongType.get()) - ), - Types.StructType.of( - Types.NestedField.required(3, "value", Types.LongType.get()), - Types.NestedField.required(4, "valueData", Types.LongType.get()) - ) - )) - ); + Schema schema = + new Schema( + Types.NestedField.required(0, "id", Types.LongType.get()), + Types.NestedField.optional( + 7, + "map", + Types.MapType.ofOptional( + 5, + 6, + Types.StructType.of( + Types.NestedField.required(1, "key", Types.LongType.get()), + Types.NestedField.required(2, "keyData", Types.LongType.get())), + Types.StructType.of( + Types.NestedField.required(3, "value", Types.LongType.get()), + Types.NestedField.required(4, "valueData", Types.LongType.get()))))); // Project id only. Schema idOnly = schema.select("id"); @@ -210,50 +211,52 @@ public void testNestedMapTypeProjection() { generateAndValidate(schema, schema); // Project partial map key. 
- Schema partialMapKey = new Schema( - Types.NestedField.optional(7, "map", Types.MapType.ofOptional( - 5, 6, - Types.StructType.of( - Types.NestedField.required(1, "key", Types.LongType.get()) - ), - Types.StructType.of( - Types.NestedField.required(3, "value", Types.LongType.get()), - Types.NestedField.required(4, "valueData", Types.LongType.get()) - ) - )) - ); - AssertHelpers.assertThrows("Should not allow to project a partial map key with non-primitive type.", - IllegalArgumentException.class, "Cannot project a partial map key or value", - () -> generateAndValidate(schema, partialMapKey) - ); + Schema partialMapKey = + new Schema( + Types.NestedField.optional( + 7, + "map", + Types.MapType.ofOptional( + 5, + 6, + Types.StructType.of(Types.NestedField.required(1, "key", Types.LongType.get())), + Types.StructType.of( + Types.NestedField.required(3, "value", Types.LongType.get()), + Types.NestedField.required(4, "valueData", Types.LongType.get()))))); + AssertHelpers.assertThrows( + "Should not allow to project a partial map key with non-primitive type.", + IllegalArgumentException.class, + "Cannot project a partial map key or value", + () -> generateAndValidate(schema, partialMapKey)); // Project partial map key. - Schema partialMapValue = new Schema( - Types.NestedField.optional(7, "map", Types.MapType.ofOptional( - 5, 6, - Types.StructType.of( - Types.NestedField.required(1, "key", Types.LongType.get()), - Types.NestedField.required(2, "keyData", Types.LongType.get()) - ), - Types.StructType.of( - Types.NestedField.required(3, "value", Types.LongType.get()) - ) - )) - ); - AssertHelpers.assertThrows("Should not allow to project a partial map value with non-primitive type.", - IllegalArgumentException.class, "Cannot project a partial map key or value", - () -> generateAndValidate(schema, partialMapValue) - ); + Schema partialMapValue = + new Schema( + Types.NestedField.optional( + 7, + "map", + Types.MapType.ofOptional( + 5, + 6, + Types.StructType.of( + Types.NestedField.required(1, "key", Types.LongType.get()), + Types.NestedField.required(2, "keyData", Types.LongType.get())), + Types.StructType.of( + Types.NestedField.required(3, "value", Types.LongType.get()))))); + AssertHelpers.assertThrows( + "Should not allow to project a partial map value with non-primitive type.", + IllegalArgumentException.class, + "Cannot project a partial map key or value", + () -> generateAndValidate(schema, partialMapValue)); } @Test public void testPrimitiveListTypeProjection() { - Schema schema = new Schema( - Types.NestedField.required(0, "id", Types.LongType.get()), - Types.NestedField.optional(2, "list", Types.ListType.ofOptional( - 1, Types.StringType.get() - )) - ); + Schema schema = + new Schema( + Types.NestedField.required(0, "id", Types.LongType.get()), + Types.NestedField.optional( + 2, "list", Types.ListType.ofOptional(1, Types.StringType.get()))); // Project id only. 
Schema idOnly = schema.select("id"); @@ -269,16 +272,18 @@ public void testPrimitiveListTypeProjection() { @Test public void testNestedListTypeProjection() { - Schema schema = new Schema( - Types.NestedField.required(0, "id", Types.LongType.get()), - Types.NestedField.optional(5, "list", Types.ListType.ofOptional( - 4, Types.StructType.of( - Types.NestedField.required(1, "nestedListField1", Types.LongType.get()), - Types.NestedField.required(2, "nestedListField2", Types.LongType.get()), - Types.NestedField.required(3, "nestedListField3", Types.LongType.get()) - ) - )) - ); + Schema schema = + new Schema( + Types.NestedField.required(0, "id", Types.LongType.get()), + Types.NestedField.optional( + 5, + "list", + Types.ListType.ofOptional( + 4, + Types.StructType.of( + Types.NestedField.required(1, "nestedListField1", Types.LongType.get()), + Types.NestedField.required(2, "nestedListField2", Types.LongType.get()), + Types.NestedField.required(3, "nestedListField3", Types.LongType.get()))))); // Project id only. Schema idOnly = schema.select("id"); @@ -292,17 +297,20 @@ public void testNestedListTypeProjection() { generateAndValidate(schema, schema); // Project partial list value. - Schema partialList = new Schema( - Types.NestedField.optional(5, "list", Types.ListType.ofOptional( - 4, Types.StructType.of( - Types.NestedField.required(2, "nestedListField2", Types.LongType.get()) - ) - )) - ); - AssertHelpers.assertThrows("Should not allow to project a partial list element with non-primitive type.", - IllegalArgumentException.class, "Cannot project a partial list element", - () -> generateAndValidate(schema, partialList) - ); + Schema partialList = + new Schema( + Types.NestedField.optional( + 5, + "list", + Types.ListType.ofOptional( + 4, + Types.StructType.of( + Types.NestedField.required(2, "nestedListField2", Types.LongType.get()))))); + AssertHelpers.assertThrows( + "Should not allow to project a partial list element with non-primitive type.", + IllegalArgumentException.class, + "Cannot project a partial list element", + () -> generateAndValidate(schema, partialList)); } private void generateAndValidate(Schema schema, Schema projectSchema) { diff --git a/flink/v1.15/flink/src/test/java/org/apache/iceberg/flink/data/TestRowProjection.java b/flink/v1.15/flink/src/test/java/org/apache/iceberg/flink/data/TestRowProjection.java index 9ccb1d56c0ed..df2e6ae21c7e 100644 --- a/flink/v1.15/flink/src/test/java/org/apache/iceberg/flink/data/TestRowProjection.java +++ b/flink/v1.15/flink/src/test/java/org/apache/iceberg/flink/data/TestRowProjection.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.flink.data; import java.io.File; @@ -45,34 +44,36 @@ public class TestRowProjection { - @Rule - public TemporaryFolder temp = new TemporaryFolder(); + @Rule public TemporaryFolder temp = new TemporaryFolder(); - private RowData writeAndRead(String desc, Schema writeSchema, Schema readSchema, RowData row) throws IOException { + private RowData writeAndRead(String desc, Schema writeSchema, Schema readSchema, RowData row) + throws IOException { File file = temp.newFile(desc + ".avro"); Assert.assertTrue(file.delete()); - try (FileAppender appender = Avro.write(Files.localOutput(file)) - .schema(writeSchema) - .createWriterFunc(ignore -> new FlinkAvroWriter(FlinkSchemaUtil.convert(writeSchema))) - .build()) { + try (FileAppender appender = + Avro.write(Files.localOutput(file)) + .schema(writeSchema) + .createWriterFunc(ignore -> new FlinkAvroWriter(FlinkSchemaUtil.convert(writeSchema))) + .build()) { appender.add(row); } - Iterable records = Avro.read(Files.localInput(file)) - .project(readSchema) - .createReaderFunc(FlinkAvroReader::new) - .build(); + Iterable records = + Avro.read(Files.localInput(file)) + .project(readSchema) + .createReaderFunc(FlinkAvroReader::new) + .build(); return Iterables.getOnlyElement(records); } @Test public void testFullProjection() throws Exception { - Schema schema = new Schema( - Types.NestedField.required(0, "id", Types.LongType.get()), - Types.NestedField.optional(1, "data", Types.StringType.get()) - ); + Schema schema = + new Schema( + Types.NestedField.required(0, "id", Types.LongType.get()), + Types.NestedField.optional(1, "data", Types.StringType.get())); RowData row = GenericRowData.of(34L, StringData.fromString("test")); @@ -80,93 +81,96 @@ public void testFullProjection() throws Exception { Assert.assertEquals("Should contain the correct id value", 34L, projected.getLong(0)); - int cmp = Comparators.charSequences() - .compare("test", projected.getString(1).toString()); + int cmp = Comparators.charSequences().compare("test", projected.getString(1).toString()); Assert.assertEquals("Should contain the correct data value", cmp, 0); } @Test public void testSpecialCharacterProjection() throws Exception { - Schema schema = new Schema( - Types.NestedField.required(0, "user id", Types.LongType.get()), - Types.NestedField.optional(1, "data%0", Types.StringType.get()) - ); + Schema schema = + new Schema( + Types.NestedField.required(0, "user id", Types.LongType.get()), + Types.NestedField.optional(1, "data%0", Types.StringType.get())); RowData row = GenericRowData.of(34L, StringData.fromString("test")); RowData full = writeAndRead("special_chars", schema, schema, row); Assert.assertEquals("Should contain the correct id value", 34L, full.getLong(0)); - Assert.assertEquals("Should contain the correct data value", + Assert.assertEquals( + "Should contain the correct data value", 0, Comparators.charSequences().compare("test", full.getString(1).toString())); RowData projected = writeAndRead("special_characters", schema, schema.select("data%0"), full); Assert.assertEquals("Should not contain id value", 1, projected.getArity()); - Assert.assertEquals("Should contain the correct data value", + Assert.assertEquals( + "Should contain the correct data value", 0, Comparators.charSequences().compare("test", projected.getString(0).toString())); } @Test public void testReorderedFullProjection() throws Exception { - Schema schema = new Schema( - Types.NestedField.required(0, "id", Types.LongType.get()), - Types.NestedField.optional(1, "data", 
Types.StringType.get()) - ); + Schema schema = + new Schema( + Types.NestedField.required(0, "id", Types.LongType.get()), + Types.NestedField.optional(1, "data", Types.StringType.get())); RowData row = GenericRowData.of(34L, StringData.fromString("test")); - Schema reordered = new Schema( - Types.NestedField.optional(1, "data", Types.StringType.get()), - Types.NestedField.required(0, "id", Types.LongType.get()) - ); + Schema reordered = + new Schema( + Types.NestedField.optional(1, "data", Types.StringType.get()), + Types.NestedField.required(0, "id", Types.LongType.get())); RowData projected = writeAndRead("full_projection", schema, reordered, row); - Assert.assertEquals("Should contain the correct 0 value", "test", projected.getString(0).toString()); + Assert.assertEquals( + "Should contain the correct 0 value", "test", projected.getString(0).toString()); Assert.assertEquals("Should contain the correct 1 value", 34L, projected.getLong(1)); } @Test public void testReorderedProjection() throws Exception { - Schema schema = new Schema( - Types.NestedField.required(0, "id", Types.LongType.get()), - Types.NestedField.optional(1, "data", Types.StringType.get()) - ); + Schema schema = + new Schema( + Types.NestedField.required(0, "id", Types.LongType.get()), + Types.NestedField.optional(1, "data", Types.StringType.get())); RowData row = GenericRowData.of(34L, StringData.fromString("test")); - Schema reordered = new Schema( - Types.NestedField.optional(2, "missing_1", Types.StringType.get()), - Types.NestedField.optional(1, "data", Types.StringType.get()), - Types.NestedField.optional(3, "missing_2", Types.LongType.get()) - ); + Schema reordered = + new Schema( + Types.NestedField.optional(2, "missing_1", Types.StringType.get()), + Types.NestedField.optional(1, "data", Types.StringType.get()), + Types.NestedField.optional(3, "missing_2", Types.LongType.get())); RowData projected = writeAndRead("full_projection", schema, reordered, row); Assert.assertTrue("Should contain the correct 0 value", projected.isNullAt(0)); - Assert.assertEquals("Should contain the correct 1 value", "test", projected.getString(1).toString()); + Assert.assertEquals( + "Should contain the correct 1 value", "test", projected.getString(1).toString()); Assert.assertTrue("Should contain the correct 2 value", projected.isNullAt(2)); } @Test public void testRenamedAddedField() throws Exception { - Schema schema = new Schema( - Types.NestedField.required(1, "a", Types.LongType.get()), - Types.NestedField.required(2, "b", Types.LongType.get()), - Types.NestedField.required(3, "d", Types.LongType.get()) - ); + Schema schema = + new Schema( + Types.NestedField.required(1, "a", Types.LongType.get()), + Types.NestedField.required(2, "b", Types.LongType.get()), + Types.NestedField.required(3, "d", Types.LongType.get())); RowData row = GenericRowData.of(100L, 200L, 300L); - Schema renamedAdded = new Schema( - Types.NestedField.optional(1, "a", Types.LongType.get()), - Types.NestedField.optional(2, "b", Types.LongType.get()), - Types.NestedField.optional(3, "c", Types.LongType.get()), - Types.NestedField.optional(4, "d", Types.LongType.get()) - ); + Schema renamedAdded = + new Schema( + Types.NestedField.optional(1, "a", Types.LongType.get()), + Types.NestedField.optional(2, "b", Types.LongType.get()), + Types.NestedField.optional(3, "c", Types.LongType.get()), + Types.NestedField.optional(4, "d", Types.LongType.get())); RowData projected = writeAndRead("rename_and_add_column_projection", schema, renamedAdded, row); 
Assert.assertEquals("Should contain the correct value in column 1", projected.getLong(0), 100L); @@ -177,10 +181,10 @@ public void testRenamedAddedField() throws Exception { @Test public void testEmptyProjection() throws Exception { - Schema schema = new Schema( - Types.NestedField.required(0, "id", Types.LongType.get()), - Types.NestedField.optional(1, "data", Types.StringType.get()) - ); + Schema schema = + new Schema( + Types.NestedField.required(0, "id", Types.LongType.get()), + Types.NestedField.optional(1, "data", Types.StringType.get())); RowData row = GenericRowData.of(34L, StringData.fromString("test")); @@ -192,24 +196,20 @@ public void testEmptyProjection() throws Exception { @Test public void testBasicProjection() throws Exception { - Schema writeSchema = new Schema( - Types.NestedField.required(0, "id", Types.LongType.get()), - Types.NestedField.optional(1, "data", Types.StringType.get()) - ); + Schema writeSchema = + new Schema( + Types.NestedField.required(0, "id", Types.LongType.get()), + Types.NestedField.optional(1, "data", Types.StringType.get())); RowData row = GenericRowData.of(34L, StringData.fromString("test")); - Schema idOnly = new Schema( - Types.NestedField.required(0, "id", Types.LongType.get()) - ); + Schema idOnly = new Schema(Types.NestedField.required(0, "id", Types.LongType.get())); RowData projected = writeAndRead("basic_projection_id", writeSchema, idOnly, row); Assert.assertEquals("Should not project data", 1, projected.getArity()); Assert.assertEquals("Should contain the correct id value", 34L, projected.getLong(0)); - Schema dataOnly = new Schema( - Types.NestedField.optional(1, "data", Types.StringType.get()) - ); + Schema dataOnly = new Schema(Types.NestedField.optional(1, "data", Types.StringType.get())); projected = writeAndRead("basic_projection_data", writeSchema, dataOnly, row); @@ -220,17 +220,17 @@ public void testBasicProjection() throws Exception { @Test public void testRename() throws Exception { - Schema writeSchema = new Schema( - Types.NestedField.required(0, "id", Types.LongType.get()), - Types.NestedField.optional(1, "data", Types.StringType.get()) - ); + Schema writeSchema = + new Schema( + Types.NestedField.required(0, "id", Types.LongType.get()), + Types.NestedField.optional(1, "data", Types.StringType.get())); RowData row = GenericRowData.of(34L, StringData.fromString("test")); - Schema readSchema = new Schema( - Types.NestedField.required(0, "id", Types.LongType.get()), - Types.NestedField.optional(1, "renamed", Types.StringType.get()) - ); + Schema readSchema = + new Schema( + Types.NestedField.required(0, "id", Types.LongType.get()), + Types.NestedField.optional(1, "renamed", Types.StringType.get())); RowData projected = writeAndRead("project_and_rename", writeSchema, readSchema, row); @@ -241,83 +241,87 @@ public void testRename() throws Exception { @Test public void testNestedStructProjection() throws Exception { - Schema writeSchema = new Schema( - Types.NestedField.required(0, "id", Types.LongType.get()), - Types.NestedField.optional(3, "location", Types.StructType.of( - Types.NestedField.required(1, "lat", Types.FloatType.get()), - Types.NestedField.required(2, "long", Types.FloatType.get()) - )) - ); + Schema writeSchema = + new Schema( + Types.NestedField.required(0, "id", Types.LongType.get()), + Types.NestedField.optional( + 3, + "location", + Types.StructType.of( + Types.NestedField.required(1, "lat", Types.FloatType.get()), + Types.NestedField.required(2, "long", Types.FloatType.get())))); RowData location = 
GenericRowData.of(52.995143f, -1.539054f); RowData record = GenericRowData.of(34L, location); - Schema idOnly = new Schema( - Types.NestedField.required(0, "id", Types.LongType.get()) - ); + Schema idOnly = new Schema(Types.NestedField.required(0, "id", Types.LongType.get())); RowData projected = writeAndRead("id_only", writeSchema, idOnly, record); Assert.assertEquals("Should not project location", 1, projected.getArity()); Assert.assertEquals("Should contain the correct id value", 34L, projected.getLong(0)); - Schema latOnly = new Schema( - Types.NestedField.optional(3, "location", Types.StructType.of( - Types.NestedField.required(1, "lat", Types.FloatType.get()) - )) - ); + Schema latOnly = + new Schema( + Types.NestedField.optional( + 3, + "location", + Types.StructType.of(Types.NestedField.required(1, "lat", Types.FloatType.get())))); projected = writeAndRead("latitude_only", writeSchema, latOnly, record); RowData projectedLocation = projected.getRow(0, 1); Assert.assertEquals("Should not project id", 1, projected.getArity()); Assert.assertFalse("Should project location", projected.isNullAt(0)); Assert.assertEquals("Should not project longitude", 1, projectedLocation.getArity()); - Assert.assertEquals("Should project latitude", - 52.995143f, projectedLocation.getFloat(0), 0.000001f); + Assert.assertEquals( + "Should project latitude", 52.995143f, projectedLocation.getFloat(0), 0.000001f); - Schema longOnly = new Schema( - Types.NestedField.optional(3, "location", Types.StructType.of( - Types.NestedField.required(2, "long", Types.FloatType.get()) - )) - ); + Schema longOnly = + new Schema( + Types.NestedField.optional( + 3, + "location", + Types.StructType.of(Types.NestedField.required(2, "long", Types.FloatType.get())))); projected = writeAndRead("longitude_only", writeSchema, longOnly, record); projectedLocation = projected.getRow(0, 1); Assert.assertEquals("Should not project id", 1, projected.getArity()); Assert.assertFalse("Should project location", projected.isNullAt(0)); Assert.assertEquals("Should not project latitutde", 1, projectedLocation.getArity()); - Assert.assertEquals("Should project longitude", - -1.539054f, projectedLocation.getFloat(0), 0.000001f); + Assert.assertEquals( + "Should project longitude", -1.539054f, projectedLocation.getFloat(0), 0.000001f); Schema locationOnly = writeSchema.select("location"); projected = writeAndRead("location_only", writeSchema, locationOnly, record); projectedLocation = projected.getRow(0, 1); Assert.assertEquals("Should not project id", 1, projected.getArity()); Assert.assertFalse("Should project location", projected.isNullAt(0)); - Assert.assertEquals("Should project latitude", - 52.995143f, projectedLocation.getFloat(0), 0.000001f); - Assert.assertEquals("Should project longitude", - -1.539054f, projectedLocation.getFloat(1), 0.000001f); + Assert.assertEquals( + "Should project latitude", 52.995143f, projectedLocation.getFloat(0), 0.000001f); + Assert.assertEquals( + "Should project longitude", -1.539054f, projectedLocation.getFloat(1), 0.000001f); } @Test public void testMapProjection() throws IOException { - Schema writeSchema = new Schema( - Types.NestedField.required(0, "id", Types.LongType.get()), - Types.NestedField.optional(5, "properties", - Types.MapType.ofOptional(6, 7, Types.StringType.get(), Types.StringType.get())) - ); - - GenericMapData properties = new GenericMapData(ImmutableMap.of( - StringData.fromString("a"), - StringData.fromString("A"), - StringData.fromString("b"), - StringData.fromString("B"))); + Schema 
writeSchema = + new Schema( + Types.NestedField.required(0, "id", Types.LongType.get()), + Types.NestedField.optional( + 5, + "properties", + Types.MapType.ofOptional(6, 7, Types.StringType.get(), Types.StringType.get()))); + + GenericMapData properties = + new GenericMapData( + ImmutableMap.of( + StringData.fromString("a"), + StringData.fromString("A"), + StringData.fromString("b"), + StringData.fromString("B"))); RowData row = GenericRowData.of(34L, properties); - Schema idOnly = new Schema( - Types.NestedField.required(0, "id", Types.LongType.get()) - ); + Schema idOnly = new Schema(Types.NestedField.required(0, "id", Types.LongType.get())); RowData projected = writeAndRead("id_only", writeSchema, idOnly, row); Assert.assertEquals("Should contain the correct id value", 34L, projected.getLong(0)); @@ -353,26 +357,28 @@ public void testMapProjection() throws IOException { @Test public void testMapOfStructsProjection() throws IOException { - Schema writeSchema = new Schema( - Types.NestedField.required(0, "id", Types.LongType.get()), - Types.NestedField.optional(5, "locations", Types.MapType.ofOptional(6, 7, - Types.StringType.get(), - Types.StructType.of( - Types.NestedField.required(1, "lat", Types.FloatType.get()), - Types.NestedField.required(2, "long", Types.FloatType.get()) - ) - )) - ); + Schema writeSchema = + new Schema( + Types.NestedField.required(0, "id", Types.LongType.get()), + Types.NestedField.optional( + 5, + "locations", + Types.MapType.ofOptional( + 6, + 7, + Types.StringType.get(), + Types.StructType.of( + Types.NestedField.required(1, "lat", Types.FloatType.get()), + Types.NestedField.required(2, "long", Types.FloatType.get()))))); RowData l1 = GenericRowData.of(53.992811f, -1.542616f); RowData l2 = GenericRowData.of(52.995143f, -1.539054f); - GenericMapData map = new GenericMapData(ImmutableMap.of( - StringData.fromString("L1"), l1, StringData.fromString("L2"), l2)); + GenericMapData map = + new GenericMapData( + ImmutableMap.of(StringData.fromString("L1"), l1, StringData.fromString("L2"), l2)); RowData row = GenericRowData.of(34L, map); - Schema idOnly = new Schema( - Types.NestedField.required(0, "id", Types.LongType.get()) - ); + Schema idOnly = new Schema(Types.NestedField.required(0, "id", Types.LongType.get())); RowData projected = writeAndRead("id_only", writeSchema, idOnly, row); Assert.assertEquals("Should contain the correct id value", 34L, projected.getLong(0)); @@ -386,21 +392,19 @@ public void testMapOfStructsProjection() throws IOException { GenericMapData locations = (GenericMapData) projected.getMap(0); Assert.assertNotNull("Should project locations map", locations); GenericArrayData l1l2Array = - new GenericArrayData(new Object[] {StringData.fromString("L2"), StringData.fromString("L1")}); + new GenericArrayData( + new Object[] {StringData.fromString("L2"), StringData.fromString("L1")}); Assert.assertEquals("Should contain L1 and L2", l1l2Array, locations.keyArray()); RowData projectedL1 = (RowData) locations.get(StringData.fromString("L1")); Assert.assertNotNull("L1 should not be null", projectedL1); - Assert.assertEquals("L1 should contain lat", - 53.992811f, projectedL1.getFloat(0), 0.000001); + Assert.assertEquals("L1 should contain lat", 53.992811f, projectedL1.getFloat(0), 0.000001); Assert.assertEquals("L1 should not contain long", 1, projectedL1.getArity()); RowData projectedL2 = (RowData) locations.get(StringData.fromString("L2")); Assert.assertNotNull("L2 should not be null", projectedL2); - Assert.assertEquals("L2 should contain lat", - 
52.995143f, projectedL2.getFloat(0), 0.000001); + Assert.assertEquals("L2 should contain lat", 52.995143f, projectedL2.getFloat(0), 0.000001); Assert.assertEquals("L2 should not contain long", 1, projectedL2.getArity()); - projected = writeAndRead("long_only", - writeSchema, writeSchema.select("locations.long"), row); + projected = writeAndRead("long_only", writeSchema, writeSchema.select("locations.long"), row); Assert.assertEquals("Should not project id", 1, projected.getArity()); locations = (GenericMapData) projected.getMap(0); Assert.assertNotNull("Should project locations map", locations); @@ -408,22 +412,23 @@ public void testMapOfStructsProjection() throws IOException { projectedL1 = (RowData) locations.get(StringData.fromString("L1")); Assert.assertNotNull("L1 should not be null", projectedL1); Assert.assertEquals("L1 should not contain lat", 1, projectedL1.getArity()); - Assert.assertEquals("L1 should contain long", - -1.542616f, projectedL1.getFloat(0), 0.000001); + Assert.assertEquals("L1 should contain long", -1.542616f, projectedL1.getFloat(0), 0.000001); projectedL2 = (RowData) locations.get(StringData.fromString("L2")); Assert.assertNotNull("L2 should not be null", projectedL2); Assert.assertEquals("L2 should not contain lat", 1, projectedL2.getArity()); - Assert.assertEquals("L2 should contain long", - -1.539054f, projectedL2.getFloat(0), 0.000001); - - Schema latitiudeRenamed = new Schema( - Types.NestedField.optional(5, "locations", Types.MapType.ofOptional(6, 7, - Types.StringType.get(), - Types.StructType.of( - Types.NestedField.required(1, "latitude", Types.FloatType.get()) - ) - )) - ); + Assert.assertEquals("L2 should contain long", -1.539054f, projectedL2.getFloat(0), 0.000001); + + Schema latitiudeRenamed = + new Schema( + Types.NestedField.optional( + 5, + "locations", + Types.MapType.ofOptional( + 6, + 7, + Types.StringType.get(), + Types.StructType.of( + Types.NestedField.required(1, "latitude", Types.FloatType.get()))))); projected = writeAndRead("latitude_renamed", writeSchema, latitiudeRenamed, row); Assert.assertEquals("Should not project id", 1, projected.getArity()); @@ -432,29 +437,27 @@ public void testMapOfStructsProjection() throws IOException { Assert.assertEquals("Should contain L1 and L2", l1l2Array, locations.keyArray()); projectedL1 = (RowData) locations.get(StringData.fromString("L1")); Assert.assertNotNull("L1 should not be null", projectedL1); - Assert.assertEquals("L1 should contain latitude", - 53.992811f, projectedL1.getFloat(0), 0.000001); + Assert.assertEquals( + "L1 should contain latitude", 53.992811f, projectedL1.getFloat(0), 0.000001); projectedL2 = (RowData) locations.get(StringData.fromString("L2")); Assert.assertNotNull("L2 should not be null", projectedL2); - Assert.assertEquals("L2 should contain latitude", - 52.995143f, projectedL2.getFloat(0), 0.000001); + Assert.assertEquals( + "L2 should contain latitude", 52.995143f, projectedL2.getFloat(0), 0.000001); } @Test public void testListProjection() throws IOException { - Schema writeSchema = new Schema( - Types.NestedField.required(0, "id", Types.LongType.get()), - Types.NestedField.optional(10, "values", - Types.ListType.ofOptional(11, Types.LongType.get())) - ); + Schema writeSchema = + new Schema( + Types.NestedField.required(0, "id", Types.LongType.get()), + Types.NestedField.optional( + 10, "values", Types.ListType.ofOptional(11, Types.LongType.get()))); GenericArrayData values = new GenericArrayData(new Long[] {56L, 57L, 58L}); RowData row = GenericRowData.of(34L, values); - 
Schema idOnly = new Schema( - Types.NestedField.required(0, "id", Types.LongType.get()) - ); + Schema idOnly = new Schema(Types.NestedField.required(0, "id", Types.LongType.get())); RowData projected = writeAndRead("id_only", writeSchema, idOnly, row); Assert.assertEquals("Should contain the correct id value", 34L, projected.getLong(0)); @@ -474,24 +477,24 @@ public void testListProjection() throws IOException { @Test @SuppressWarnings("unchecked") public void testListOfStructsProjection() throws IOException { - Schema writeSchema = new Schema( - Types.NestedField.required(0, "id", Types.LongType.get()), - Types.NestedField.optional(22, "points", - Types.ListType.ofOptional(21, Types.StructType.of( - Types.NestedField.required(19, "x", Types.IntegerType.get()), - Types.NestedField.optional(18, "y", Types.IntegerType.get()) - )) - ) - ); + Schema writeSchema = + new Schema( + Types.NestedField.required(0, "id", Types.LongType.get()), + Types.NestedField.optional( + 22, + "points", + Types.ListType.ofOptional( + 21, + Types.StructType.of( + Types.NestedField.required(19, "x", Types.IntegerType.get()), + Types.NestedField.optional(18, "y", Types.IntegerType.get()))))); RowData p1 = GenericRowData.of(1, 2); RowData p2 = GenericRowData.of(3, null); GenericArrayData arrayData = new GenericArrayData(new RowData[] {p1, p2}); RowData row = GenericRowData.of(34L, arrayData); - Schema idOnly = new Schema( - Types.NestedField.required(0, "id", Types.LongType.get()) - ); + Schema idOnly = new Schema(Types.NestedField.required(0, "id", Types.LongType.get())); RowData projected = writeAndRead("id_only", writeSchema, idOnly, row); Assert.assertEquals("Should contain the correct id value", 34L, projected.getLong(0)); @@ -525,13 +528,15 @@ public void testListOfStructsProjection() throws IOException { Assert.assertEquals("Should not project x", 1, projectedP2.getArity()); Assert.assertTrue("Should project null y", projectedP2.isNullAt(0)); - Schema yRenamed = new Schema( - Types.NestedField.optional(22, "points", - Types.ListType.ofOptional(21, Types.StructType.of( - Types.NestedField.optional(18, "z", Types.IntegerType.get()) - )) - ) - ); + Schema yRenamed = + new Schema( + Types.NestedField.optional( + 22, + "points", + Types.ListType.ofOptional( + 21, + Types.StructType.of( + Types.NestedField.optional(18, "z", Types.IntegerType.get()))))); projected = writeAndRead("y_renamed", writeSchema, yRenamed, row); Assert.assertEquals("Should not project id", 1, projected.getArity()); @@ -548,22 +553,25 @@ public void testListOfStructsProjection() throws IOException { @Test public void testAddedFieldsWithRequiredChildren() throws Exception { - Schema schema = new Schema( - Types.NestedField.required(1, "a", Types.LongType.get()) - ); + Schema schema = new Schema(Types.NestedField.required(1, "a", Types.LongType.get())); RowData row = GenericRowData.of(100L); - Schema addedFields = new Schema( - Types.NestedField.optional(1, "a", Types.LongType.get()), - Types.NestedField.optional(2, "b", Types.StructType.of( - Types.NestedField.required(3, "c", Types.LongType.get()) - )), - Types.NestedField.optional(4, "d", Types.ListType.ofRequired(5, Types.LongType.get())), - Types.NestedField.optional(6, "e", Types.MapType.ofRequired(7, 8, Types.LongType.get(), Types.LongType.get())) - ); - - RowData projected = writeAndRead("add_fields_with_required_children_projection", schema, addedFields, row); + Schema addedFields = + new Schema( + Types.NestedField.optional(1, "a", Types.LongType.get()), + Types.NestedField.optional( + 
2, + "b", + Types.StructType.of(Types.NestedField.required(3, "c", Types.LongType.get()))), + Types.NestedField.optional(4, "d", Types.ListType.ofRequired(5, Types.LongType.get())), + Types.NestedField.optional( + 6, + "e", + Types.MapType.ofRequired(7, 8, Types.LongType.get(), Types.LongType.get()))); + + RowData projected = + writeAndRead("add_fields_with_required_children_projection", schema, addedFields, row); Assert.assertEquals("Should contain the correct value in column 1", projected.getLong(0), 100L); Assert.assertTrue("Should contain empty value in new column 2", projected.isNullAt(1)); Assert.assertTrue("Should contain empty value in new column 4", projected.isNullAt(2)); diff --git a/flink/v1.15/flink/src/test/java/org/apache/iceberg/flink/sink/TestDeltaTaskWriter.java b/flink/v1.15/flink/src/test/java/org/apache/iceberg/flink/sink/TestDeltaTaskWriter.java index e30412ad83cc..a9800303aa4f 100644 --- a/flink/v1.15/flink/src/test/java/org/apache/iceberg/flink/sink/TestDeltaTaskWriter.java +++ b/flink/v1.15/flink/src/test/java/org/apache/iceberg/flink/sink/TestDeltaTaskWriter.java @@ -16,9 +16,14 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.flink.sink; +import static org.apache.iceberg.flink.SimpleDataUtil.createDelete; +import static org.apache.iceberg.flink.SimpleDataUtil.createInsert; +import static org.apache.iceberg.flink.SimpleDataUtil.createRecord; +import static org.apache.iceberg.flink.SimpleDataUtil.createUpdateAfter; +import static org.apache.iceberg.flink.SimpleDataUtil.createUpdateBefore; + import java.io.File; import java.io.IOException; import java.nio.file.Files; @@ -50,12 +55,6 @@ import org.junit.runner.RunWith; import org.junit.runners.Parameterized; -import static org.apache.iceberg.flink.SimpleDataUtil.createDelete; -import static org.apache.iceberg.flink.SimpleDataUtil.createInsert; -import static org.apache.iceberg.flink.SimpleDataUtil.createRecord; -import static org.apache.iceberg.flink.SimpleDataUtil.createUpdateAfter; -import static org.apache.iceberg.flink.SimpleDataUtil.createUpdateBefore; - @RunWith(Parameterized.class) public class TestDeltaTaskWriter extends TableTestBase { private static final int FORMAT_V2 = 2; @@ -64,11 +63,7 @@ public class TestDeltaTaskWriter extends TableTestBase { @Parameterized.Parameters(name = "FileFormat = {0}") public static Object[][] parameters() { - return new Object[][] { - {"avro"}, - {"orc"}, - {"parquet"} - }; + return new Object[][] {{"avro"}, {"orc"}, {"parquet"}}; } public TestDeltaTaskWriter(String fileFormat) { @@ -92,7 +87,8 @@ private void initTable(boolean partitioned) { this.table = create(SCHEMA, PartitionSpec.unpartitioned()); } - table.updateProperties() + table + .updateProperties() .set(TableProperties.PARQUET_ROW_GROUP_SIZE_BYTES, String.valueOf(8 * 1024)) .defaultFormat(format) .commit(); @@ -139,12 +135,14 @@ private void testCdcEvents(boolean partitioned) throws IOException { Assert.assertEquals(partitioned ? 3 : 1, result.deleteFiles().length); commitTransaction(result); - Assert.assertEquals("Should have expected records.", expectedRowSet( - createRecord(1, "eee"), - createRecord(2, "ddd"), - createRecord(4, "fff"), - createRecord(5, "ggg") - ), actualRowSet("*")); + Assert.assertEquals( + "Should have expected records.", + expectedRowSet( + createRecord(1, "eee"), + createRecord(2, "ddd"), + createRecord(4, "fff"), + createRecord(5, "ggg")), + actualRowSet("*")); // Start the 2nd transaction. 
writer = taskWriterFactory.create(); @@ -165,11 +163,10 @@ private void testCdcEvents(boolean partitioned) throws IOException { Assert.assertEquals(partitioned ? 3 : 1, result.deleteFiles().length); commitTransaction(result); - Assert.assertEquals("Should have expected records", expectedRowSet( - createRecord(1, "eee"), - createRecord(5, "iii"), - createRecord(6, "hhh") - ), actualRowSet("*")); + Assert.assertEquals( + "Should have expected records", + expectedRowSet(createRecord(1, "eee"), createRecord(5, "iii"), createRecord(6, "hhh")), + actualRowSet("*")); } @Test @@ -229,11 +226,15 @@ private void testAbort(boolean partitioned) throws IOException { } // Assert the current data/delete file count. - List files = Files.walk(Paths.get(tableDir.getPath(), "data")) - .filter(p -> p.toFile().isFile()) - .filter(p -> !p.toString().endsWith(".crc")) - .collect(Collectors.toList()); - Assert.assertEquals("Should have expected file count, but files are: " + files, partitioned ? 4 : 2, files.size()); + List files = + Files.walk(Paths.get(tableDir.getPath(), "data")) + .filter(p -> p.toFile().isFile()) + .filter(p -> !p.toString().endsWith(".crc")) + .collect(Collectors.toList()); + Assert.assertEquals( + "Should have expected file count, but files are: " + files, + partitioned ? 4 : 2, + files.size()); writer.abort(); for (Path file : files) { @@ -270,11 +271,10 @@ public void testPartitionedTableWithDataAsKey() throws IOException { Assert.assertEquals(1, result.deleteFiles().length); commitTransaction(result); - Assert.assertEquals("Should have expected records", expectedRowSet( - createRecord(2, "aaa"), - createRecord(3, "bbb"), - createRecord(4, "ccc") - ), actualRowSet("*")); + Assert.assertEquals( + "Should have expected records", + expectedRowSet(createRecord(2, "aaa"), createRecord(3, "bbb"), createRecord(4, "ccc")), + actualRowSet("*")); // Start the 2nd transaction. 
writer = taskWriterFactory.create(); @@ -287,12 +287,14 @@ public void testPartitionedTableWithDataAsKey() throws IOException { Assert.assertEquals(1, result.deleteFiles().length); commitTransaction(result); - Assert.assertEquals("Should have expected records", expectedRowSet( - createRecord(2, "aaa"), - createRecord(5, "aaa"), - createRecord(3, "bbb"), - createRecord(6, "bbb") - ), actualRowSet("*")); + Assert.assertEquals( + "Should have expected records", + expectedRowSet( + createRecord(2, "aaa"), + createRecord(5, "aaa"), + createRecord(3, "bbb"), + createRecord(6, "bbb")), + actualRowSet("*")); } @Test @@ -311,20 +313,21 @@ public void testPartitionedTableWithDataAndIdAsKey() throws IOException { WriteResult result = writer.complete(); Assert.assertEquals(1, result.dataFiles().length); Assert.assertEquals(1, result.deleteFiles().length); - Assert.assertEquals(Sets.newHashSet(FileContent.POSITION_DELETES), + Assert.assertEquals( + Sets.newHashSet(FileContent.POSITION_DELETES), Sets.newHashSet(result.deleteFiles()[0].content())); commitTransaction(result); - Assert.assertEquals("Should have expected records", expectedRowSet( - createRecord(1, "aaa") - ), actualRowSet("*")); + Assert.assertEquals( + "Should have expected records", expectedRowSet(createRecord(1, "aaa")), actualRowSet("*")); } private void commitTransaction(WriteResult result) { RowDelta rowDelta = table.newRowDelta(); Arrays.stream(result.dataFiles()).forEach(rowDelta::addRows); Arrays.stream(result.deleteFiles()).forEach(rowDelta::addDeletes); - rowDelta.validateDeletedFiles() + rowDelta + .validateDeletedFiles() .validateDataFilesExist(Lists.newArrayList(result.referencedDataFiles())) .commit(); } @@ -339,7 +342,11 @@ private StructLikeSet actualRowSet(String... columns) throws IOException { private TaskWriterFactory createTaskWriterFactory(List equalityFieldIds) { return new RowDataTaskWriterFactory( - SerializableTable.copyOf(table), FlinkSchemaUtil.convert(table.schema()), - 128 * 1024 * 1024, format, equalityFieldIds, false); + SerializableTable.copyOf(table), + FlinkSchemaUtil.convert(table.schema()), + 128 * 1024 * 1024, + format, + equalityFieldIds, + false); } } diff --git a/flink/v1.15/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkAppenderFactory.java b/flink/v1.15/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkAppenderFactory.java index 8d7fa86eac50..4c17cd7607df 100644 --- a/flink/v1.15/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkAppenderFactory.java +++ b/flink/v1.15/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkAppenderFactory.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.flink.sink; import java.util.List; @@ -41,11 +40,16 @@ public TestFlinkAppenderFactory(String fileFormat, boolean partitioned) { } @Override - protected FileAppenderFactory createAppenderFactory(List equalityFieldIds, - Schema eqDeleteSchema, - Schema posDeleteRowSchema) { - return new FlinkAppenderFactory(table.schema(), rowType, table.properties(), table.spec(), - ArrayUtil.toIntArray(equalityFieldIds), eqDeleteSchema, posDeleteRowSchema); + protected FileAppenderFactory createAppenderFactory( + List equalityFieldIds, Schema eqDeleteSchema, Schema posDeleteRowSchema) { + return new FlinkAppenderFactory( + table.schema(), + rowType, + table.properties(), + table.spec(), + ArrayUtil.toIntArray(equalityFieldIds), + eqDeleteSchema, + posDeleteRowSchema); } @Override diff --git a/flink/v1.15/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkFileWriterFactory.java b/flink/v1.15/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkFileWriterFactory.java index 3223b6e28b92..da45241256f5 100644 --- a/flink/v1.15/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkFileWriterFactory.java +++ b/flink/v1.15/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkFileWriterFactory.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.flink.sink; import java.util.List; @@ -39,9 +38,11 @@ public TestFlinkFileWriterFactory(FileFormat fileFormat, boolean partitioned) { } @Override - protected FileWriterFactory newWriterFactory(Schema dataSchema, List equalityFieldIds, - Schema equalityDeleteRowSchema, - Schema positionDeleteRowSchema) { + protected FileWriterFactory newWriterFactory( + Schema dataSchema, + List equalityFieldIds, + Schema equalityDeleteRowSchema, + Schema positionDeleteRowSchema) { return FlinkFileWriterFactory.builderFor(table) .dataSchema(table.schema()) .dataFileFormat(format()) diff --git a/flink/v1.15/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkIcebergSink.java b/flink/v1.15/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkIcebergSink.java index 263c2fc03503..6d2947599158 100644 --- a/flink/v1.15/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkIcebergSink.java +++ b/flink/v1.15/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkIcebergSink.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.flink.sink; import java.io.File; @@ -64,13 +63,12 @@ public class TestFlinkIcebergSink { public static final MiniClusterWithClientResource MINI_CLUSTER_RESOURCE = MiniClusterResource.createWithClassloaderCheckDisabled(); - @ClassRule - public static final TemporaryFolder TEMPORARY_FOLDER = new TemporaryFolder(); + @ClassRule public static final TemporaryFolder TEMPORARY_FOLDER = new TemporaryFolder(); - private static final TypeInformation ROW_TYPE_INFO = new RowTypeInfo( - SimpleDataUtil.FLINK_SCHEMA.getFieldTypes()); - private static final DataFormatConverters.RowConverter CONVERTER = new DataFormatConverters.RowConverter( - SimpleDataUtil.FLINK_SCHEMA.getFieldDataTypes()); + private static final TypeInformation ROW_TYPE_INFO = + new RowTypeInfo(SimpleDataUtil.FLINK_SCHEMA.getFieldTypes()); + private static final DataFormatConverters.RowConverter CONVERTER = + new DataFormatConverters.RowConverter(SimpleDataUtil.FLINK_SCHEMA.getFieldDataTypes()); private Table table; private StreamExecutionEnvironment env; @@ -83,18 +81,18 @@ public class TestFlinkIcebergSink { @Parameterized.Parameters(name = "format={0}, parallelism = {1}, partitioned = {2}") public static Object[][] parameters() { return new Object[][] { - {"avro", 1, true}, - {"avro", 1, false}, - {"avro", 2, true}, - {"avro", 2, false}, - {"orc", 1, true}, - {"orc", 1, false}, - {"orc", 2, true}, - {"orc", 2, false}, - {"parquet", 1, true}, - {"parquet", 1, false}, - {"parquet", 2, true}, - {"parquet", 2, false} + {"avro", 1, true}, + {"avro", 1, false}, + {"avro", 2, true}, + {"avro", 2, false}, + {"orc", 1, true}, + {"orc", 1, false}, + {"orc", 2, true}, + {"orc", 2, false}, + {"parquet", 1, true}, + {"parquet", 1, false}, + {"parquet", 2, true}, + {"parquet", 2, false} }; } @@ -115,10 +113,12 @@ public void before() throws IOException { Map props = ImmutableMap.of(TableProperties.DEFAULT_FILE_FORMAT, format.name()); table = SimpleDataUtil.createTable(tablePath, props, partitioned); - env = StreamExecutionEnvironment.getExecutionEnvironment(MiniClusterResource.DISABLE_CLASSLOADER_CHECK_CONFIG) - .enableCheckpointing(100) - .setParallelism(parallelism) - .setMaxParallelism(parallelism); + env = + StreamExecutionEnvironment.getExecutionEnvironment( + MiniClusterResource.DISABLE_CLASSLOADER_CHECK_CONFIG) + .enableCheckpointing(100) + .setParallelism(parallelism) + .setMaxParallelism(parallelism); tableLoader = TableLoader.fromHadoopTable(tablePath); } @@ -133,13 +133,10 @@ private BoundedTestSource createBoundedSource(List rows) { @Test public void testWriteRowData() throws Exception { - List rows = Lists.newArrayList( - Row.of(1, "hello"), - Row.of(2, "world"), - Row.of(3, "foo") - ); - DataStream dataStream = env.addSource(createBoundedSource(rows), ROW_TYPE_INFO) - .map(CONVERTER::toInternal, FlinkCompatibilityUtil.toTypeInfo(SimpleDataUtil.ROW_TYPE)); + List rows = Lists.newArrayList(Row.of(1, "hello"), Row.of(2, "world"), Row.of(3, "foo")); + DataStream dataStream = + env.addSource(createBoundedSource(rows), ROW_TYPE_INFO) + .map(CONVERTER::toInternal, FlinkCompatibilityUtil.toTypeInfo(SimpleDataUtil.ROW_TYPE)); FlinkSink.forRowData(dataStream) .table(table) @@ -164,11 +161,11 @@ private List createRows(String prefix) { Row.of(2, prefix + "ccc"), Row.of(3, prefix + "aaa"), Row.of(3, prefix + "bbb"), - Row.of(3, prefix + "ccc") - ); + Row.of(3, prefix + "ccc")); } - private void testWriteRow(TableSchema tableSchema, DistributionMode distributionMode) throws Exception { + private void 
testWriteRow(TableSchema tableSchema, DistributionMode distributionMode) + throws Exception { List rows = createRows(""); DataStream dataStream = env.addSource(createBoundedSource(rows), ROW_TYPE_INFO); @@ -202,7 +199,8 @@ public void testWriteRowWithTableSchema() throws Exception { @Test public void testJobNoneDistributeMode() throws Exception { - table.updateProperties() + table + .updateProperties() .set(TableProperties.WRITE_DISTRIBUTION_MODE, DistributionMode.HASH.modeName()) .commit(); @@ -218,12 +216,15 @@ public void testJobNoneDistributeMode() throws Exception { @Test public void testJobHashDistributionMode() { - table.updateProperties() + table + .updateProperties() .set(TableProperties.WRITE_DISTRIBUTION_MODE, DistributionMode.HASH.modeName()) .commit(); - AssertHelpers.assertThrows("Does not support range distribution-mode now.", - IllegalArgumentException.class, "Flink does not support 'range' write distribution mode now.", + AssertHelpers.assertThrows( + "Does not support range distribution-mode now.", + IllegalArgumentException.class, + "Flink does not support 'range' write distribution mode now.", () -> { testWriteRow(null, DistributionMode.RANGE); return null; @@ -232,16 +233,20 @@ public void testJobHashDistributionMode() { @Test public void testJobNullDistributionMode() throws Exception { - table.updateProperties() + table + .updateProperties() .set(TableProperties.WRITE_DISTRIBUTION_MODE, DistributionMode.HASH.modeName()) .commit(); testWriteRow(null, null); if (partitioned) { - Assert.assertEquals("There should be only 1 data file in partition 'aaa'", 1, partitionFiles("aaa")); - Assert.assertEquals("There should be only 1 data file in partition 'bbb'", 1, partitionFiles("bbb")); - Assert.assertEquals("There should be only 1 data file in partition 'ccc'", 1, partitionFiles("ccc")); + Assert.assertEquals( + "There should be only 1 data file in partition 'aaa'", 1, partitionFiles("aaa")); + Assert.assertEquals( + "There should be only 1 data file in partition 'bbb'", 1, partitionFiles("bbb")); + Assert.assertEquals( + "There should be only 1 data file in partition 'ccc'", 1, partitionFiles("ccc")); } } @@ -249,9 +254,12 @@ public void testJobNullDistributionMode() throws Exception { public void testPartitionWriteMode() throws Exception { testWriteRow(null, DistributionMode.HASH); if (partitioned) { - Assert.assertEquals("There should be only 1 data file in partition 'aaa'", 1, partitionFiles("aaa")); - Assert.assertEquals("There should be only 1 data file in partition 'bbb'", 1, partitionFiles("bbb")); - Assert.assertEquals("There should be only 1 data file in partition 'ccc'", 1, partitionFiles("ccc")); + Assert.assertEquals( + "There should be only 1 data file in partition 'aaa'", 1, partitionFiles("aaa")); + Assert.assertEquals( + "There should be only 1 data file in partition 'bbb'", 1, partitionFiles("bbb")); + Assert.assertEquals( + "There should be only 1 data file in partition 'ccc'", 1, partitionFiles("ccc")); } } @@ -259,9 +267,12 @@ public void testPartitionWriteMode() throws Exception { public void testShuffleByPartitionWithSchema() throws Exception { testWriteRow(SimpleDataUtil.FLINK_SCHEMA, DistributionMode.HASH); if (partitioned) { - Assert.assertEquals("There should be only 1 data file in partition 'aaa'", 1, partitionFiles("aaa")); - Assert.assertEquals("There should be only 1 data file in partition 'bbb'", 1, partitionFiles("bbb")); - Assert.assertEquals("There should be only 1 data file in partition 'ccc'", 1, partitionFiles("ccc")); + 
Assert.assertEquals( + "There should be only 1 data file in partition 'aaa'", 1, partitionFiles("aaa")); + Assert.assertEquals( + "There should be only 1 data file in partition 'bbb'", 1, partitionFiles("bbb")); + Assert.assertEquals( + "There should be only 1 data file in partition 'ccc'", 1, partitionFiles("ccc")); } } @@ -279,17 +290,19 @@ public void testTwoSinksInDisjointedDAG() throws Exception { Table rightTable = SimpleDataUtil.createTable(rightTablePath, props, partitioned); TableLoader rightTableLoader = TableLoader.fromHadoopTable(rightTablePath); - env = StreamExecutionEnvironment - .getExecutionEnvironment(MiniClusterResource.DISABLE_CLASSLOADER_CHECK_CONFIG) - .enableCheckpointing(100) - .setParallelism(parallelism) - .setMaxParallelism(parallelism); + env = + StreamExecutionEnvironment.getExecutionEnvironment( + MiniClusterResource.DISABLE_CLASSLOADER_CHECK_CONFIG) + .enableCheckpointing(100) + .setParallelism(parallelism) + .setMaxParallelism(parallelism); env.getConfig().disableAutoGeneratedUIDs(); List leftRows = createRows("left-"); - DataStream leftStream = env.fromCollection(leftRows, ROW_TYPE_INFO) - .name("leftCustomSource") - .uid("leftCustomSource"); + DataStream leftStream = + env.fromCollection(leftRows, ROW_TYPE_INFO) + .name("leftCustomSource") + .uid("leftCustomSource"); FlinkSink.forRow(leftStream, SimpleDataUtil.FLINK_SCHEMA) .table(leftTable) .tableLoader(leftTableLoader) @@ -299,9 +312,10 @@ public void testTwoSinksInDisjointedDAG() throws Exception { .append(); List rightRows = createRows("right-"); - DataStream rightStream = env.fromCollection(rightRows, ROW_TYPE_INFO) - .name("rightCustomSource") - .uid("rightCustomSource"); + DataStream rightStream = + env.fromCollection(rightRows, ROW_TYPE_INFO) + .name("rightCustomSource") + .uid("rightCustomSource"); FlinkSink.forRow(rightStream, SimpleDataUtil.FLINK_SCHEMA) .table(rightTable) .tableLoader(rightTableLoader) @@ -323,7 +337,9 @@ public void testTwoSinksInDisjointedDAG() throws Exception { Assert.assertNull(leftTable.currentSnapshot().summary().get("flink.test")); Assert.assertNull(leftTable.currentSnapshot().summary().get("direction")); rightTable.refresh(); - Assert.assertEquals(TestFlinkIcebergSink.class.getName(), rightTable.currentSnapshot().summary().get("flink.test")); + Assert.assertEquals( + TestFlinkIcebergSink.class.getName(), + rightTable.currentSnapshot().summary().get("flink.test")); Assert.assertEquals("rightTable", rightTable.currentSnapshot().summary().get("direction")); } @@ -335,14 +351,17 @@ public void testOverrideWriteConfigWithUnknownDistributionMode() { List rows = createRows(""); DataStream dataStream = env.addSource(createBoundedSource(rows), ROW_TYPE_INFO); - FlinkSink.Builder builder = FlinkSink.forRow(dataStream, SimpleDataUtil.FLINK_SCHEMA) - .table(table) - .tableLoader(tableLoader) - .writeParallelism(parallelism) - .setAll(newProps); - - AssertHelpers.assertThrows("Should fail with invalid distribution mode.", - IllegalArgumentException.class, "No enum constant org.apache.iceberg.DistributionMode.UNRECOGNIZED", + FlinkSink.Builder builder = + FlinkSink.forRow(dataStream, SimpleDataUtil.FLINK_SCHEMA) + .table(table) + .tableLoader(tableLoader) + .writeParallelism(parallelism) + .setAll(newProps); + + AssertHelpers.assertThrows( + "Should fail with invalid distribution mode.", + IllegalArgumentException.class, + "No enum constant org.apache.iceberg.DistributionMode.UNRECOGNIZED", () -> { builder.append(); @@ -360,14 +379,17 @@ public void 
testOverrideWriteConfigWithUnknownFileFormat() { List rows = createRows(""); DataStream dataStream = env.addSource(createBoundedSource(rows), ROW_TYPE_INFO); - FlinkSink.Builder builder = FlinkSink.forRow(dataStream, SimpleDataUtil.FLINK_SCHEMA) - .table(table) - .tableLoader(tableLoader) - .writeParallelism(parallelism) - .setAll(newProps); - - AssertHelpers.assertThrows("Should fail with invalid file format.", - IllegalArgumentException.class, "No enum constant org.apache.iceberg.FileFormat.UNRECOGNIZED", + FlinkSink.Builder builder = + FlinkSink.forRow(dataStream, SimpleDataUtil.FLINK_SCHEMA) + .table(table) + .tableLoader(tableLoader) + .writeParallelism(parallelism) + .setAll(newProps); + + AssertHelpers.assertThrows( + "Should fail with invalid file format.", + IllegalArgumentException.class, + "No enum constant org.apache.iceberg.FileFormat.UNRECOGNIZED", () -> { builder.append(); diff --git a/flink/v1.15/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkIcebergSinkV2.java b/flink/v1.15/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkIcebergSinkV2.java index 97506b90ba46..cb840ada5ac5 100644 --- a/flink/v1.15/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkIcebergSinkV2.java +++ b/flink/v1.15/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkIcebergSinkV2.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.flink.sink; import java.io.File; @@ -67,18 +66,18 @@ public class TestFlinkIcebergSinkV2 extends TableTestBase { public static final MiniClusterWithClientResource MINI_CLUSTER_RESOURCE = MiniClusterResource.createWithClassloaderCheckDisabled(); - @ClassRule - public static final TemporaryFolder TEMPORARY_FOLDER = new TemporaryFolder(); + @ClassRule public static final TemporaryFolder TEMPORARY_FOLDER = new TemporaryFolder(); private static final int FORMAT_V2 = 2; private static final TypeInformation ROW_TYPE_INFO = new RowTypeInfo(SimpleDataUtil.FLINK_SCHEMA.getFieldTypes()); - private static final Map ROW_KIND_MAP = ImmutableMap.of( - "+I", RowKind.INSERT, - "-D", RowKind.DELETE, - "-U", RowKind.UPDATE_BEFORE, - "+U", RowKind.UPDATE_AFTER); + private static final Map ROW_KIND_MAP = + ImmutableMap.of( + "+I", RowKind.INSERT, + "-D", RowKind.DELETE, + "-U", RowKind.UPDATE_BEFORE, + "+U", RowKind.UPDATE_AFTER); private static final int ROW_ID_POS = 0; private static final int ROW_DATA_POS = 1; @@ -91,27 +90,27 @@ public class TestFlinkIcebergSinkV2 extends TableTestBase { private StreamExecutionEnvironment env; private TestTableLoader tableLoader; - @Parameterized.Parameters(name = "FileFormat = {0}, Parallelism = {1}, Partitioned={2}, WriteDistributionMode ={3}") + @Parameterized.Parameters( + name = "FileFormat = {0}, Parallelism = {1}, Partitioned={2}, WriteDistributionMode ={3}") public static Object[][] parameters() { return new Object[][] { - new Object[] {"avro", 1, true, TableProperties.WRITE_DISTRIBUTION_MODE_NONE}, - new Object[] {"avro", 1, false, TableProperties.WRITE_DISTRIBUTION_MODE_NONE}, - new Object[] {"avro", 4, true, TableProperties.WRITE_DISTRIBUTION_MODE_NONE}, - new Object[] {"avro", 4, false, TableProperties.WRITE_DISTRIBUTION_MODE_NONE}, - - new Object[] {"orc", 1, true, TableProperties.WRITE_DISTRIBUTION_MODE_HASH}, - new Object[] {"orc", 1, false, TableProperties.WRITE_DISTRIBUTION_MODE_HASH}, - new Object[] {"orc", 4, true, TableProperties.WRITE_DISTRIBUTION_MODE_HASH}, - new Object[] {"orc", 4, false, 
TableProperties.WRITE_DISTRIBUTION_MODE_HASH}, - - new Object[] {"parquet", 1, true, TableProperties.WRITE_DISTRIBUTION_MODE_RANGE}, - new Object[] {"parquet", 1, false, TableProperties.WRITE_DISTRIBUTION_MODE_RANGE}, - new Object[] {"parquet", 4, true, TableProperties.WRITE_DISTRIBUTION_MODE_RANGE}, - new Object[] {"parquet", 4, false, TableProperties.WRITE_DISTRIBUTION_MODE_RANGE} + new Object[] {"avro", 1, true, TableProperties.WRITE_DISTRIBUTION_MODE_NONE}, + new Object[] {"avro", 1, false, TableProperties.WRITE_DISTRIBUTION_MODE_NONE}, + new Object[] {"avro", 4, true, TableProperties.WRITE_DISTRIBUTION_MODE_NONE}, + new Object[] {"avro", 4, false, TableProperties.WRITE_DISTRIBUTION_MODE_NONE}, + new Object[] {"orc", 1, true, TableProperties.WRITE_DISTRIBUTION_MODE_HASH}, + new Object[] {"orc", 1, false, TableProperties.WRITE_DISTRIBUTION_MODE_HASH}, + new Object[] {"orc", 4, true, TableProperties.WRITE_DISTRIBUTION_MODE_HASH}, + new Object[] {"orc", 4, false, TableProperties.WRITE_DISTRIBUTION_MODE_HASH}, + new Object[] {"parquet", 1, true, TableProperties.WRITE_DISTRIBUTION_MODE_RANGE}, + new Object[] {"parquet", 1, false, TableProperties.WRITE_DISTRIBUTION_MODE_RANGE}, + new Object[] {"parquet", 4, true, TableProperties.WRITE_DISTRIBUTION_MODE_RANGE}, + new Object[] {"parquet", 4, false, TableProperties.WRITE_DISTRIBUTION_MODE_RANGE} }; } - public TestFlinkIcebergSinkV2(String format, int parallelism, boolean partitioned, String writeDistributionMode) { + public TestFlinkIcebergSinkV2( + String format, int parallelism, boolean partitioned, String writeDistributionMode) { super(FORMAT_V2); this.format = FileFormat.valueOf(format.toUpperCase(Locale.ENGLISH)); this.parallelism = parallelism; @@ -128,18 +127,24 @@ public void setupTable() throws IOException { if (!partitioned) { table = create(SimpleDataUtil.SCHEMA, PartitionSpec.unpartitioned()); } else { - table = create(SimpleDataUtil.SCHEMA, PartitionSpec.builderFor(SimpleDataUtil.SCHEMA).identity("data").build()); + table = + create( + SimpleDataUtil.SCHEMA, + PartitionSpec.builderFor(SimpleDataUtil.SCHEMA).identity("data").build()); } - table.updateProperties() + table + .updateProperties() .set(TableProperties.DEFAULT_FILE_FORMAT, format.name()) .set(TableProperties.WRITE_DISTRIBUTION_MODE, writeDistributionMode) .commit(); - env = StreamExecutionEnvironment.getExecutionEnvironment(MiniClusterResource.DISABLE_CLASSLOADER_CHECK_CONFIG) - .enableCheckpointing(100L) - .setParallelism(parallelism) - .setMaxParallelism(parallelism); + env = + StreamExecutionEnvironment.getExecutionEnvironment( + MiniClusterResource.DISABLE_CLASSLOADER_CHECK_CONFIG) + .enableCheckpointing(100L) + .setParallelism(parallelism) + .setMaxParallelism(parallelism); tableLoader = new TestTableLoader(tableDir.getAbsolutePath()); } @@ -147,19 +152,23 @@ public void setupTable() throws IOException { private List findValidSnapshots(Table table) { List validSnapshots = Lists.newArrayList(); for (Snapshot snapshot : table.snapshots()) { - if (snapshot.allManifests(table.io()).stream().anyMatch(m -> snapshot.snapshotId() == m.snapshotId())) { + if (snapshot.allManifests(table.io()).stream() + .anyMatch(m -> snapshot.snapshotId() == m.snapshotId())) { validSnapshots.add(snapshot); } } return validSnapshots; } - private void testChangeLogs(List equalityFieldColumns, - KeySelector keySelector, - boolean insertAsUpsert, - List> elementsPerCheckpoint, - List> expectedRecordsPerCheckpoint) throws Exception { - DataStream dataStream = env.addSource(new 
BoundedTestSource<>(elementsPerCheckpoint), ROW_TYPE_INFO); + private void testChangeLogs( + List equalityFieldColumns, + KeySelector keySelector, + boolean insertAsUpsert, + List> elementsPerCheckpoint, + List> expectedRecordsPerCheckpoint) + throws Exception { + DataStream dataStream = + env.addSource(new BoundedTestSource<>(elementsPerCheckpoint), ROW_TYPE_INFO); FlinkSink.forRow(dataStream, SimpleDataUtil.FLINK_SCHEMA) .tableLoader(tableLoader) @@ -175,13 +184,16 @@ private void testChangeLogs(List equalityFieldColumns, table.refresh(); List snapshots = findValidSnapshots(table); int expectedSnapshotNum = expectedRecordsPerCheckpoint.size(); - Assert.assertEquals("Should have the expected snapshot number", expectedSnapshotNum, snapshots.size()); + Assert.assertEquals( + "Should have the expected snapshot number", expectedSnapshotNum, snapshots.size()); for (int i = 0; i < expectedSnapshotNum; i++) { long snapshotId = snapshots.get(i).snapshotId(); List expectedRecords = expectedRecordsPerCheckpoint.get(i); - Assert.assertEquals("Should have the expected records for the checkpoint#" + i, - expectedRowSet(expectedRecords.toArray(new Record[0])), actualRowSet(snapshotId, "*")); + Assert.assertEquals( + "Should have the expected records for the checkpoint#" + i, + expectedRowSet(expectedRecords.toArray(new Record[0])), + actualRowSet(snapshotId, "*")); } } @@ -200,232 +212,227 @@ private Record record(int id, String data) { @Test public void testCheckAndGetEqualityFieldIds() { - table.updateSchema() + table + .updateSchema() .allowIncompatibleChanges() .addRequiredColumn("type", Types.StringType.get()) .setIdentifierFields("type") .commit(); - DataStream dataStream = env.addSource(new BoundedTestSource<>(ImmutableList.of()), ROW_TYPE_INFO); - FlinkSink.Builder builder = FlinkSink.forRow(dataStream, SimpleDataUtil.FLINK_SCHEMA).table(table); + DataStream dataStream = + env.addSource(new BoundedTestSource<>(ImmutableList.of()), ROW_TYPE_INFO); + FlinkSink.Builder builder = + FlinkSink.forRow(dataStream, SimpleDataUtil.FLINK_SCHEMA).table(table); // Use schema identifier field IDs as equality field id list by default Assert.assertEquals( table.schema().identifierFieldIds(), - Sets.newHashSet(builder.checkAndGetEqualityFieldIds()) - ); + Sets.newHashSet(builder.checkAndGetEqualityFieldIds())); // Use user-provided equality field column as equality field id list builder.equalityFieldColumns(Lists.newArrayList("id")); Assert.assertEquals( Sets.newHashSet(table.schema().findField("id").fieldId()), - Sets.newHashSet(builder.checkAndGetEqualityFieldIds()) - ); + Sets.newHashSet(builder.checkAndGetEqualityFieldIds())); builder.equalityFieldColumns(Lists.newArrayList("type")); Assert.assertEquals( Sets.newHashSet(table.schema().findField("type").fieldId()), - Sets.newHashSet(builder.checkAndGetEqualityFieldIds()) - ); + Sets.newHashSet(builder.checkAndGetEqualityFieldIds())); } @Test public void testChangeLogOnIdKey() throws Exception { - List> elementsPerCheckpoint = ImmutableList.of( + List> elementsPerCheckpoint = ImmutableList.of( - row("+I", 1, "aaa"), - row("-D", 1, "aaa"), - row("+I", 1, "bbb"), - row("+I", 2, "aaa"), - row("-D", 2, "aaa"), - row("+I", 2, "bbb") - ), + ImmutableList.of( + row("+I", 1, "aaa"), + row("-D", 1, "aaa"), + row("+I", 1, "bbb"), + row("+I", 2, "aaa"), + row("-D", 2, "aaa"), + row("+I", 2, "bbb")), + ImmutableList.of( + row("-U", 2, "bbb"), row("+U", 2, "ccc"), row("-D", 2, "ccc"), row("+I", 2, "ddd")), + ImmutableList.of( + row("-D", 1, "bbb"), + row("+I", 1, "ccc"), 
+ row("-D", 1, "ccc"), + row("+I", 1, "ddd"))); + + List> expectedRecords = ImmutableList.of( - row("-U", 2, "bbb"), - row("+U", 2, "ccc"), - row("-D", 2, "ccc"), - row("+I", 2, "ddd") - ), - ImmutableList.of( - row("-D", 1, "bbb"), - row("+I", 1, "ccc"), - row("-D", 1, "ccc"), - row("+I", 1, "ddd") - ) - ); - - List> expectedRecords = ImmutableList.of( - ImmutableList.of(record(1, "bbb"), record(2, "bbb")), - ImmutableList.of(record(1, "bbb"), record(2, "ddd")), - ImmutableList.of(record(1, "ddd"), record(2, "ddd")) - ); + ImmutableList.of(record(1, "bbb"), record(2, "bbb")), + ImmutableList.of(record(1, "bbb"), record(2, "ddd")), + ImmutableList.of(record(1, "ddd"), record(2, "ddd"))); if (partitioned && writeDistributionMode.equals(TableProperties.WRITE_DISTRIBUTION_MODE_HASH)) { - AssertHelpers.assertThrows("Should be error because equality field columns don't include all partition keys", - IllegalStateException.class, "should be included in equality fields", + AssertHelpers.assertThrows( + "Should be error because equality field columns don't include all partition keys", + IllegalStateException.class, + "should be included in equality fields", () -> { - testChangeLogs(ImmutableList.of("id"), row -> row.getField(ROW_ID_POS), false, - elementsPerCheckpoint, expectedRecords); + testChangeLogs( + ImmutableList.of("id"), + row -> row.getField(ROW_ID_POS), + false, + elementsPerCheckpoint, + expectedRecords); return null; }); } else { - testChangeLogs(ImmutableList.of("id"), row -> row.getField(ROW_ID_POS), false, - elementsPerCheckpoint, expectedRecords); + testChangeLogs( + ImmutableList.of("id"), + row -> row.getField(ROW_ID_POS), + false, + elementsPerCheckpoint, + expectedRecords); } } @Test public void testChangeLogOnDataKey() throws Exception { - List> elementsPerCheckpoint = ImmutableList.of( - ImmutableList.of( - row("+I", 1, "aaa"), - row("-D", 1, "aaa"), - row("+I", 2, "bbb"), - row("+I", 1, "bbb"), - row("+I", 2, "aaa") - ), + List> elementsPerCheckpoint = ImmutableList.of( - row("-U", 2, "aaa"), - row("+U", 1, "ccc"), - row("+I", 1, "aaa") - ), + ImmutableList.of( + row("+I", 1, "aaa"), + row("-D", 1, "aaa"), + row("+I", 2, "bbb"), + row("+I", 1, "bbb"), + row("+I", 2, "aaa")), + ImmutableList.of(row("-U", 2, "aaa"), row("+U", 1, "ccc"), row("+I", 1, "aaa")), + ImmutableList.of(row("-D", 1, "bbb"), row("+I", 2, "aaa"), row("+I", 2, "ccc"))); + + List> expectedRecords = ImmutableList.of( - row("-D", 1, "bbb"), - row("+I", 2, "aaa"), - row("+I", 2, "ccc") - ) - ); - - List> expectedRecords = ImmutableList.of( - ImmutableList.of(record(1, "bbb"), record(2, "aaa")), - ImmutableList.of(record(1, "aaa"), record(1, "bbb"), record(1, "ccc")), - ImmutableList.of(record(1, "aaa"), record(1, "ccc"), record(2, "aaa"), record(2, "ccc")) - ); - - testChangeLogs(ImmutableList.of("data"), row -> row.getField(ROW_DATA_POS), false, - elementsPerCheckpoint, expectedRecords); + ImmutableList.of(record(1, "bbb"), record(2, "aaa")), + ImmutableList.of(record(1, "aaa"), record(1, "bbb"), record(1, "ccc")), + ImmutableList.of( + record(1, "aaa"), record(1, "ccc"), record(2, "aaa"), record(2, "ccc"))); + + testChangeLogs( + ImmutableList.of("data"), + row -> row.getField(ROW_DATA_POS), + false, + elementsPerCheckpoint, + expectedRecords); } @Test public void testChangeLogOnIdDataKey() throws Exception { - List> elementsPerCheckpoint = ImmutableList.of( + List> elementsPerCheckpoint = ImmutableList.of( - row("+I", 1, "aaa"), - row("-D", 1, "aaa"), - row("+I", 2, "bbb"), - row("+I", 1, "bbb"), - 
row("+I", 2, "aaa") - ), + ImmutableList.of( + row("+I", 1, "aaa"), + row("-D", 1, "aaa"), + row("+I", 2, "bbb"), + row("+I", 1, "bbb"), + row("+I", 2, "aaa")), + ImmutableList.of(row("-U", 2, "aaa"), row("+U", 1, "ccc"), row("+I", 1, "aaa")), + ImmutableList.of(row("-D", 1, "bbb"), row("+I", 2, "aaa"))); + + List> expectedRecords = ImmutableList.of( - row("-U", 2, "aaa"), - row("+U", 1, "ccc"), - row("+I", 1, "aaa") - ), - ImmutableList.of( - row("-D", 1, "bbb"), - row("+I", 2, "aaa") - ) - ); - - List> expectedRecords = ImmutableList.of( - ImmutableList.of(record(1, "bbb"), record(2, "aaa"), record(2, "bbb")), - ImmutableList.of(record(1, "aaa"), record(1, "bbb"), record(1, "ccc"), record(2, "bbb")), - ImmutableList.of(record(1, "aaa"), record(1, "ccc"), record(2, "aaa"), record(2, "bbb")) - ); - - testChangeLogs(ImmutableList.of("data", "id"), row -> Row.of(row.getField(ROW_ID_POS), row.getField(ROW_DATA_POS)), - false, elementsPerCheckpoint, expectedRecords); + ImmutableList.of(record(1, "bbb"), record(2, "aaa"), record(2, "bbb")), + ImmutableList.of( + record(1, "aaa"), record(1, "bbb"), record(1, "ccc"), record(2, "bbb")), + ImmutableList.of( + record(1, "aaa"), record(1, "ccc"), record(2, "aaa"), record(2, "bbb"))); + + testChangeLogs( + ImmutableList.of("data", "id"), + row -> Row.of(row.getField(ROW_ID_POS), row.getField(ROW_DATA_POS)), + false, + elementsPerCheckpoint, + expectedRecords); } @Test public void testChangeLogOnSameKey() throws Exception { - List> elementsPerCheckpoint = ImmutableList.of( - // Checkpoint #1 - ImmutableList.of( - row("+I", 1, "aaa"), - row("-D", 1, "aaa"), - row("+I", 1, "aaa") - ), - // Checkpoint #2 - ImmutableList.of( - row("-U", 1, "aaa"), - row("+U", 1, "aaa") - ), - // Checkpoint #3 + List> elementsPerCheckpoint = ImmutableList.of( - row("-D", 1, "aaa"), - row("+I", 1, "aaa") - ), - // Checkpoint #4 + // Checkpoint #1 + ImmutableList.of(row("+I", 1, "aaa"), row("-D", 1, "aaa"), row("+I", 1, "aaa")), + // Checkpoint #2 + ImmutableList.of(row("-U", 1, "aaa"), row("+U", 1, "aaa")), + // Checkpoint #3 + ImmutableList.of(row("-D", 1, "aaa"), row("+I", 1, "aaa")), + // Checkpoint #4 + ImmutableList.of(row("-U", 1, "aaa"), row("+U", 1, "aaa"), row("+I", 1, "aaa"))); + + List> expectedRecords = ImmutableList.of( - row("-U", 1, "aaa"), - row("+U", 1, "aaa"), - row("+I", 1, "aaa") - ) - ); - - List> expectedRecords = ImmutableList.of( - ImmutableList.of(record(1, "aaa")), - ImmutableList.of(record(1, "aaa")), - ImmutableList.of(record(1, "aaa")), - ImmutableList.of(record(1, "aaa"), record(1, "aaa")) - ); - - testChangeLogs(ImmutableList.of("id", "data"), row -> Row.of(row.getField(ROW_ID_POS), row.getField(ROW_DATA_POS)), - false, elementsPerCheckpoint, expectedRecords); + ImmutableList.of(record(1, "aaa")), + ImmutableList.of(record(1, "aaa")), + ImmutableList.of(record(1, "aaa")), + ImmutableList.of(record(1, "aaa"), record(1, "aaa"))); + + testChangeLogs( + ImmutableList.of("id", "data"), + row -> Row.of(row.getField(ROW_ID_POS), row.getField(ROW_DATA_POS)), + false, + elementsPerCheckpoint, + expectedRecords); } @Test public void testUpsertModeCheck() throws Exception { - DataStream dataStream = env.addSource(new BoundedTestSource<>(ImmutableList.of()), ROW_TYPE_INFO); - FlinkSink.Builder builder = FlinkSink.forRow(dataStream, SimpleDataUtil.FLINK_SCHEMA) - .tableLoader(tableLoader) - .tableSchema(SimpleDataUtil.FLINK_SCHEMA) - .writeParallelism(parallelism) - .upsert(true); - - AssertHelpers.assertThrows("Should be error because upsert mode and 
overwrite mode enable at the same time.", - IllegalStateException.class, "OVERWRITE mode shouldn't be enable", - () -> builder.equalityFieldColumns(ImmutableList.of("id", "data")).overwrite(true).append() - ); - - AssertHelpers.assertThrows("Should be error because equality field columns are empty.", - IllegalStateException.class, "Equality field columns shouldn't be empty", - () -> builder.equalityFieldColumns(ImmutableList.of()).overwrite(false).append() - ); + DataStream dataStream = + env.addSource(new BoundedTestSource<>(ImmutableList.of()), ROW_TYPE_INFO); + FlinkSink.Builder builder = + FlinkSink.forRow(dataStream, SimpleDataUtil.FLINK_SCHEMA) + .tableLoader(tableLoader) + .tableSchema(SimpleDataUtil.FLINK_SCHEMA) + .writeParallelism(parallelism) + .upsert(true); + + AssertHelpers.assertThrows( + "Should be error because upsert mode and overwrite mode enable at the same time.", + IllegalStateException.class, + "OVERWRITE mode shouldn't be enable", + () -> + builder.equalityFieldColumns(ImmutableList.of("id", "data")).overwrite(true).append()); + + AssertHelpers.assertThrows( + "Should be error because equality field columns are empty.", + IllegalStateException.class, + "Equality field columns shouldn't be empty", + () -> builder.equalityFieldColumns(ImmutableList.of()).overwrite(false).append()); } @Test public void testUpsertOnIdKey() throws Exception { - List> elementsPerCheckpoint = ImmutableList.of( - ImmutableList.of( - row("+I", 1, "aaa"), - row("+U", 1, "bbb") - ), - ImmutableList.of( - row("+I", 1, "ccc") - ), + List> elementsPerCheckpoint = ImmutableList.of( - row("+U", 1, "ddd"), - row("+I", 1, "eee") - ) - ); + ImmutableList.of(row("+I", 1, "aaa"), row("+U", 1, "bbb")), + ImmutableList.of(row("+I", 1, "ccc")), + ImmutableList.of(row("+U", 1, "ddd"), row("+I", 1, "eee"))); - List> expectedRecords = ImmutableList.of( - ImmutableList.of(record(1, "bbb")), - ImmutableList.of(record(1, "ccc")), - ImmutableList.of(record(1, "eee")) - ); + List> expectedRecords = + ImmutableList.of( + ImmutableList.of(record(1, "bbb")), + ImmutableList.of(record(1, "ccc")), + ImmutableList.of(record(1, "eee"))); if (!partitioned) { - testChangeLogs(ImmutableList.of("id"), row -> row.getField(ROW_ID_POS), true, - elementsPerCheckpoint, expectedRecords); + testChangeLogs( + ImmutableList.of("id"), + row -> row.getField(ROW_ID_POS), + true, + elementsPerCheckpoint, + expectedRecords); } else { - AssertHelpers.assertThrows("Should be error because equality field columns don't include all partition keys", - IllegalStateException.class, "should be included in equality fields", + AssertHelpers.assertThrows( + "Should be error because equality field columns don't include all partition keys", + IllegalStateException.class, + "should be included in equality fields", () -> { - testChangeLogs(ImmutableList.of("id"), row -> row.getField(ROW_ID_POS), true, - elementsPerCheckpoint, expectedRecords); + testChangeLogs( + ImmutableList.of("id"), + row -> row.getField(ROW_ID_POS), + true, + elementsPerCheckpoint, + expectedRecords); return null; }); } @@ -433,61 +440,46 @@ public void testUpsertOnIdKey() throws Exception { @Test public void testUpsertOnDataKey() throws Exception { - List> elementsPerCheckpoint = ImmutableList.of( - ImmutableList.of( - row("+I", 1, "aaa"), - row("+I", 2, "aaa"), - row("+I", 3, "bbb") - ), + List> elementsPerCheckpoint = ImmutableList.of( - row("+U", 4, "aaa"), - row("-U", 3, "bbb"), - row("+U", 5, "bbb") - ), + ImmutableList.of(row("+I", 1, "aaa"), row("+I", 2, "aaa"), row("+I", 3, 
"bbb")), + ImmutableList.of(row("+U", 4, "aaa"), row("-U", 3, "bbb"), row("+U", 5, "bbb")), + ImmutableList.of(row("+I", 6, "aaa"), row("+U", 7, "bbb"))); + + List> expectedRecords = ImmutableList.of( - row("+I", 6, "aaa"), - row("+U", 7, "bbb") - ) - ); - - List> expectedRecords = ImmutableList.of( - ImmutableList.of(record(2, "aaa"), record(3, "bbb")), - ImmutableList.of(record(4, "aaa"), record(5, "bbb")), - ImmutableList.of(record(6, "aaa"), record(7, "bbb")) - ); - - testChangeLogs(ImmutableList.of("data"), row -> row.getField(ROW_DATA_POS), true, - elementsPerCheckpoint, expectedRecords); + ImmutableList.of(record(2, "aaa"), record(3, "bbb")), + ImmutableList.of(record(4, "aaa"), record(5, "bbb")), + ImmutableList.of(record(6, "aaa"), record(7, "bbb"))); + + testChangeLogs( + ImmutableList.of("data"), + row -> row.getField(ROW_DATA_POS), + true, + elementsPerCheckpoint, + expectedRecords); } @Test public void testUpsertOnIdDataKey() throws Exception { - List> elementsPerCheckpoint = ImmutableList.of( + List> elementsPerCheckpoint = ImmutableList.of( - row("+I", 1, "aaa"), - row("+U", 1, "aaa"), - row("+I", 2, "bbb") - ), - ImmutableList.of( - row("+I", 1, "aaa"), - row("-D", 2, "bbb"), - row("+I", 2, "ccc") - ), + ImmutableList.of(row("+I", 1, "aaa"), row("+U", 1, "aaa"), row("+I", 2, "bbb")), + ImmutableList.of(row("+I", 1, "aaa"), row("-D", 2, "bbb"), row("+I", 2, "ccc")), + ImmutableList.of(row("+U", 1, "bbb"), row("-U", 1, "ccc"), row("-D", 1, "aaa"))); + + List> expectedRecords = ImmutableList.of( - row("+U", 1, "bbb"), - row("-U", 1, "ccc"), - row("-D", 1, "aaa") - ) - ); - - List> expectedRecords = ImmutableList.of( - ImmutableList.of(record(1, "aaa"), record(2, "bbb")), - ImmutableList.of(record(1, "aaa"), record(2, "ccc")), - ImmutableList.of(record(1, "bbb"), record(2, "ccc")) - ); - - testChangeLogs(ImmutableList.of("id", "data"), row -> Row.of(row.getField(ROW_ID_POS), row.getField(ROW_DATA_POS)), - true, elementsPerCheckpoint, expectedRecords); + ImmutableList.of(record(1, "aaa"), record(2, "bbb")), + ImmutableList.of(record(1, "aaa"), record(2, "ccc")), + ImmutableList.of(record(1, "bbb"), record(2, "ccc"))); + + testChangeLogs( + ImmutableList.of("id", "data"), + row -> Row.of(row.getField(ROW_ID_POS), row.getField(ROW_DATA_POS)), + true, + elementsPerCheckpoint, + expectedRecords); } private StructLikeSet expectedRowSet(Record... records) { @@ -497,10 +489,8 @@ private StructLikeSet expectedRowSet(Record... records) { private StructLikeSet actualRowSet(long snapshotId, String... columns) throws IOException { table.refresh(); StructLikeSet set = StructLikeSet.create(table.schema().asStruct()); - try (CloseableIterable reader = IcebergGenerics.read(table) - .useSnapshot(snapshotId) - .select(columns) - .build()) { + try (CloseableIterable reader = + IcebergGenerics.read(table).useSnapshot(snapshotId).select(columns).build()) { reader.forEach(set::add); } return set; diff --git a/flink/v1.15/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkManifest.java b/flink/v1.15/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkManifest.java index eccf3da1af53..27dc665055cc 100644 --- a/flink/v1.15/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkManifest.java +++ b/flink/v1.15/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkManifest.java @@ -16,9 +16,10 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.flink.sink; +import static org.apache.iceberg.flink.sink.ManifestOutputFileFactory.FLINK_MANIFEST_LOCATION; + import java.io.File; import java.io.IOException; import java.nio.file.Paths; @@ -51,13 +52,10 @@ import org.junit.Test; import org.junit.rules.TemporaryFolder; -import static org.apache.iceberg.flink.sink.ManifestOutputFileFactory.FLINK_MANIFEST_LOCATION; - public class TestFlinkManifest { private static final Configuration CONF = new Configuration(); - @Rule - public TemporaryFolder tempFolder = new TemporaryFolder(); + @Rule public TemporaryFolder tempFolder = new TemporaryFolder(); private Table table; private FileAppenderFactory appenderFactory; @@ -74,15 +72,21 @@ public void before() throws IOException { // Construct the iceberg table. table = SimpleDataUtil.createTable(tablePath, ImmutableMap.of(), false); - int[] equalityFieldIds = new int[] { - table.schema().findField("id").fieldId(), - table.schema().findField("data").fieldId() - }; - this.appenderFactory = new FlinkAppenderFactory(table.schema(), FlinkSchemaUtil.convert(table.schema()), - table.properties(), table.spec(), equalityFieldIds, table.schema(), null); + int[] equalityFieldIds = + new int[] { + table.schema().findField("id").fieldId(), table.schema().findField("data").fieldId() + }; + this.appenderFactory = + new FlinkAppenderFactory( + table.schema(), + FlinkSchemaUtil.convert(table.schema()), + table.properties(), + table.spec(), + equalityFieldIds, + table.schema(), + null); } - @Test public void testIO() throws IOException { String flinkJobId = newFlinkJobId(); @@ -95,13 +99,15 @@ public void testIO() throws IOException { List dataFiles = generateDataFiles(10); List eqDeleteFiles = generateEqDeleteFiles(5); List posDeleteFiles = generatePosDeleteFiles(5); - DeltaManifests deltaManifests = FlinkManifestUtil.writeCompletedFiles( - WriteResult.builder() - .addDataFiles(dataFiles) - .addDeleteFiles(eqDeleteFiles) - .addDeleteFiles(posDeleteFiles) - .build(), - () -> factory.create(curCkpId), table.spec()); + DeltaManifests deltaManifests = + FlinkManifestUtil.writeCompletedFiles( + WriteResult.builder() + .addDataFiles(dataFiles) + .addDeleteFiles(eqDeleteFiles) + .addDeleteFiles(posDeleteFiles) + .build(), + () -> factory.create(curCkpId), + table.spec()); WriteResult result = FlinkManifestUtil.readCompletedFiles(deltaManifests, table.io()); Assert.assertEquals("Size of data file list are not equal.", 10, result.deleteFiles().length); @@ -124,30 +130,39 @@ public void testUserProvidedManifestLocation() throws IOException { String flinkJobId = newFlinkJobId(); String operatorId = newOperatorUniqueId(); File userProvidedFolder = tempFolder.newFolder(); - Map props = ImmutableMap.of(FLINK_MANIFEST_LOCATION, userProvidedFolder.getAbsolutePath() + "///"); - ManifestOutputFileFactory factory = new ManifestOutputFileFactory( - ((HasTableOperations) table).operations(), table.io(), props, - flinkJobId, operatorId, 1, 1); + Map props = + ImmutableMap.of(FLINK_MANIFEST_LOCATION, userProvidedFolder.getAbsolutePath() + "///"); + ManifestOutputFileFactory factory = + new ManifestOutputFileFactory( + ((HasTableOperations) table).operations(), + table.io(), + props, + flinkJobId, + operatorId, + 1, + 1); List dataFiles = generateDataFiles(5); - DeltaManifests deltaManifests = FlinkManifestUtil.writeCompletedFiles( - WriteResult.builder() - .addDataFiles(dataFiles) - .build(), - () -> factory.create(checkpointId), - table.spec()); + DeltaManifests deltaManifests = + 
FlinkManifestUtil.writeCompletedFiles( + WriteResult.builder().addDataFiles(dataFiles).build(), + () -> factory.create(checkpointId), + table.spec()); Assert.assertNotNull("Data manifest shouldn't be null", deltaManifests.dataManifest()); Assert.assertNull("Delete manifest should be null", deltaManifests.deleteManifest()); - Assert.assertEquals("The newly created manifest file should be located under the user provided directory", - userProvidedFolder.toPath(), Paths.get(deltaManifests.dataManifest().path()).getParent()); + Assert.assertEquals( + "The newly created manifest file should be located under the user provided directory", + userProvidedFolder.toPath(), + Paths.get(deltaManifests.dataManifest().path()).getParent()); WriteResult result = FlinkManifestUtil.readCompletedFiles(deltaManifests, table.io()); Assert.assertEquals(0, result.deleteFiles().length); Assert.assertEquals(5, result.dataFiles().length); - Assert.assertEquals("Size of data file list are not equal.", dataFiles.size(), result.dataFiles().length); + Assert.assertEquals( + "Size of data file list are not equal.", dataFiles.size(), result.dataFiles().length); for (int i = 0; i < dataFiles.size(); i++) { TestHelpers.assertEquals(dataFiles.get(i), result.dataFiles()[i]); } @@ -158,29 +173,34 @@ public void testVersionedSerializer() throws IOException { long checkpointId = 1; String flinkJobId = newFlinkJobId(); String operatorId = newOperatorUniqueId(); - ManifestOutputFileFactory factory = FlinkManifestUtil.createOutputFileFactory(table, flinkJobId, operatorId, - 1, 1); + ManifestOutputFileFactory factory = + FlinkManifestUtil.createOutputFileFactory(table, flinkJobId, operatorId, 1, 1); List dataFiles = generateDataFiles(10); List eqDeleteFiles = generateEqDeleteFiles(10); List posDeleteFiles = generatePosDeleteFiles(10); - DeltaManifests expected = FlinkManifestUtil.writeCompletedFiles( - WriteResult.builder() - .addDataFiles(dataFiles) - .addDeleteFiles(eqDeleteFiles) - .addDeleteFiles(posDeleteFiles) - .build(), - () -> factory.create(checkpointId), table.spec()); + DeltaManifests expected = + FlinkManifestUtil.writeCompletedFiles( + WriteResult.builder() + .addDataFiles(dataFiles) + .addDeleteFiles(eqDeleteFiles) + .addDeleteFiles(posDeleteFiles) + .build(), + () -> factory.create(checkpointId), + table.spec()); byte[] versionedSerializeData = - SimpleVersionedSerialization.writeVersionAndSerialize(DeltaManifestsSerializer.INSTANCE, expected); - DeltaManifests actual = SimpleVersionedSerialization - .readVersionAndDeSerialize(DeltaManifestsSerializer.INSTANCE, versionedSerializeData); + SimpleVersionedSerialization.writeVersionAndSerialize( + DeltaManifestsSerializer.INSTANCE, expected); + DeltaManifests actual = + SimpleVersionedSerialization.readVersionAndDeSerialize( + DeltaManifestsSerializer.INSTANCE, versionedSerializeData); TestHelpers.assertEquals(expected.dataManifest(), actual.dataManifest()); TestHelpers.assertEquals(expected.deleteManifest(), actual.deleteManifest()); byte[] versionedSerializeData2 = - SimpleVersionedSerialization.writeVersionAndSerialize(DeltaManifestsSerializer.INSTANCE, actual); + SimpleVersionedSerialization.writeVersionAndSerialize( + DeltaManifestsSerializer.INSTANCE, actual); Assert.assertArrayEquals(versionedSerializeData, versionedSerializeData2); } @@ -190,17 +210,21 @@ public void testCompatibility() throws IOException { long checkpointId = 1; String flinkJobId = newFlinkJobId(); String operatorId = newOperatorUniqueId(); - ManifestOutputFileFactory factory = 
FlinkManifestUtil.createOutputFileFactory(table, flinkJobId, operatorId, - 1, 1); + ManifestOutputFileFactory factory = + FlinkManifestUtil.createOutputFileFactory(table, flinkJobId, operatorId, 1, 1); List dataFiles = generateDataFiles(10); - ManifestFile manifest = FlinkManifestUtil.writeDataFiles(factory.create(checkpointId), table.spec(), dataFiles); - byte[] dataV1 = SimpleVersionedSerialization.writeVersionAndSerialize(new V1Serializer(), manifest); + ManifestFile manifest = + FlinkManifestUtil.writeDataFiles(factory.create(checkpointId), table.spec(), dataFiles); + byte[] dataV1 = + SimpleVersionedSerialization.writeVersionAndSerialize(new V1Serializer(), manifest); DeltaManifests delta = - SimpleVersionedSerialization.readVersionAndDeSerialize(DeltaManifestsSerializer.INSTANCE, dataV1); + SimpleVersionedSerialization.readVersionAndDeSerialize( + DeltaManifestsSerializer.INSTANCE, dataV1); Assert.assertNull("Serialization v1 don't include delete files.", delta.deleteManifest()); - Assert.assertNotNull("Serialization v1 should not have null data manifest.", delta.dataManifest()); + Assert.assertNotNull( + "Serialization v1 should not have null data manifest.", delta.dataManifest()); TestHelpers.assertEquals(manifest, delta.dataManifest()); List actualFiles = FlinkManifestUtil.readDataFiles(delta.dataManifest(), table.io()); @@ -229,18 +253,24 @@ public ManifestFile deserialize(int version, byte[] serialized) throws IOExcepti } private DataFile writeDataFile(String filename, List rows) throws IOException { - return SimpleDataUtil.writeFile(table.schema(), table.spec(), CONF, - table.location(), FileFormat.PARQUET.addExtension(filename), rows); + return SimpleDataUtil.writeFile( + table.schema(), + table.spec(), + CONF, + table.location(), + FileFormat.PARQUET.addExtension(filename), + rows); } private DeleteFile writeEqDeleteFile(String filename, List deletes) throws IOException { - return SimpleDataUtil.writeEqDeleteFile(table, FileFormat.PARQUET, filename, appenderFactory, deletes); + return SimpleDataUtil.writeEqDeleteFile( + table, FileFormat.PARQUET, filename, appenderFactory, deletes); } private DeleteFile writePosDeleteFile(String filename, List> positions) throws IOException { - return SimpleDataUtil - .writePosDeleteFile(table, FileFormat.PARQUET, filename, appenderFactory, positions); + return SimpleDataUtil.writePosDeleteFile( + table, FileFormat.PARQUET, filename, appenderFactory, positions); } private List generateDataFiles(int fileNum) throws IOException { @@ -258,7 +288,8 @@ private List generateEqDeleteFiles(int fileNum) throws IOException { List deleteFiles = Lists.newArrayList(); for (int i = 0; i < fileNum; i++) { rowDataList.add(SimpleDataUtil.createDelete(i, "a" + i)); - deleteFiles.add(writeEqDeleteFile("eq-delete-file-" + fileCount.incrementAndGet(), rowDataList)); + deleteFiles.add( + writeEqDeleteFile("eq-delete-file-" + fileCount.incrementAndGet(), rowDataList)); } return deleteFiles; } @@ -268,7 +299,8 @@ private List generatePosDeleteFiles(int fileNum) throws IOException List deleteFiles = Lists.newArrayList(); for (int i = 0; i < fileNum; i++) { positions.add(Pair.of("data-file-1", (long) i)); - deleteFiles.add(writePosDeleteFile("pos-delete-file-" + fileCount.incrementAndGet(), positions)); + deleteFiles.add( + writePosDeleteFile("pos-delete-file-" + fileCount.incrementAndGet(), positions)); } return deleteFiles; } diff --git a/flink/v1.15/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkPartitioningWriters.java 
b/flink/v1.15/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkPartitioningWriters.java index 934b5a0d75de..3951c2e70f65 100644 --- a/flink/v1.15/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkPartitioningWriters.java +++ b/flink/v1.15/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkPartitioningWriters.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.flink.sink; import java.util.List; @@ -39,9 +38,11 @@ public TestFlinkPartitioningWriters(FileFormat fileFormat) { } @Override - protected FileWriterFactory newWriterFactory(Schema dataSchema, List equalityFieldIds, - Schema equalityDeleteRowSchema, - Schema positionDeleteRowSchema) { + protected FileWriterFactory newWriterFactory( + Schema dataSchema, + List equalityFieldIds, + Schema equalityDeleteRowSchema, + Schema positionDeleteRowSchema) { return FlinkFileWriterFactory.builderFor(table) .dataSchema(table.schema()) .dataFileFormat(format()) diff --git a/flink/v1.15/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkPositionDeltaWriters.java b/flink/v1.15/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkPositionDeltaWriters.java index 5fd5c5eebee9..9e846efe6fc9 100644 --- a/flink/v1.15/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkPositionDeltaWriters.java +++ b/flink/v1.15/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkPositionDeltaWriters.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.flink.sink; import java.util.List; @@ -39,9 +38,11 @@ public TestFlinkPositionDeltaWriters(FileFormat fileFormat) { } @Override - protected FileWriterFactory newWriterFactory(Schema dataSchema, List equalityFieldIds, - Schema equalityDeleteRowSchema, - Schema positionDeleteRowSchema) { + protected FileWriterFactory newWriterFactory( + Schema dataSchema, + List equalityFieldIds, + Schema equalityDeleteRowSchema, + Schema positionDeleteRowSchema) { return FlinkFileWriterFactory.builderFor(table) .dataSchema(table.schema()) .dataFileFormat(format()) diff --git a/flink/v1.15/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkRollingFileWriters.java b/flink/v1.15/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkRollingFileWriters.java index 9339e5ac2c3e..07716b9c3e60 100644 --- a/flink/v1.15/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkRollingFileWriters.java +++ b/flink/v1.15/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkRollingFileWriters.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.flink.sink; import java.util.List; @@ -35,9 +34,11 @@ public TestFlinkRollingFileWriters(FileFormat fileFormat, boolean partitioned) { } @Override - protected FileWriterFactory newWriterFactory(Schema dataSchema, List equalityFieldIds, - Schema equalityDeleteRowSchema, - Schema positionDeleteRowSchema) { + protected FileWriterFactory newWriterFactory( + Schema dataSchema, + List equalityFieldIds, + Schema equalityDeleteRowSchema, + Schema positionDeleteRowSchema) { return FlinkFileWriterFactory.builderFor(table) .dataSchema(table.schema()) .dataFileFormat(format()) diff --git a/flink/v1.15/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkWriterMetrics.java b/flink/v1.15/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkWriterMetrics.java index aa31c1819d10..e6d64ef2c720 100644 --- a/flink/v1.15/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkWriterMetrics.java +++ b/flink/v1.15/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkWriterMetrics.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.flink.sink; import org.apache.flink.table.data.GenericRowData; diff --git a/flink/v1.15/flink/src/test/java/org/apache/iceberg/flink/sink/TestIcebergFilesCommitter.java b/flink/v1.15/flink/src/test/java/org/apache/iceberg/flink/sink/TestIcebergFilesCommitter.java index d1fb23720bb3..6dbc10a1730c 100644 --- a/flink/v1.15/flink/src/test/java/org/apache/iceberg/flink/sink/TestIcebergFilesCommitter.java +++ b/flink/v1.15/flink/src/test/java/org/apache/iceberg/flink/sink/TestIcebergFilesCommitter.java @@ -16,9 +16,12 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.flink.sink; +import static org.apache.iceberg.TableProperties.DEFAULT_FILE_FORMAT; +import static org.apache.iceberg.flink.sink.IcebergFilesCommitter.MAX_CONTINUOUS_EMPTY_COMMITS; +import static org.apache.iceberg.flink.sink.ManifestOutputFileFactory.FLINK_MANIFEST_LOCATION; + import java.io.File; import java.io.IOException; import java.nio.file.Files; @@ -66,10 +69,6 @@ import org.junit.runner.RunWith; import org.junit.runners.Parameterized; -import static org.apache.iceberg.TableProperties.DEFAULT_FILE_FORMAT; -import static org.apache.iceberg.flink.sink.IcebergFilesCommitter.MAX_CONTINUOUS_EMPTY_COMMITS; -import static org.apache.iceberg.flink.sink.ManifestOutputFileFactory.FLINK_MANIFEST_LOCATION; - @RunWith(Parameterized.class) public class TestIcebergFilesCommitter extends TableTestBase { private static final Configuration CONF = new Configuration(); @@ -81,12 +80,12 @@ public class TestIcebergFilesCommitter extends TableTestBase { @Parameterized.Parameters(name = "FileFormat = {0}, FormatVersion={1}") public static Object[][] parameters() { return new Object[][] { - new Object[] {"avro", 1}, - new Object[] {"avro", 2}, - new Object[] {"parquet", 1}, - new Object[] {"parquet", 2}, - new Object[] {"orc", 1}, - new Object[] {"orc", 2} + new Object[] {"avro", 1}, + new Object[] {"avro", 2}, + new Object[] {"parquet", 1}, + new Object[] {"parquet", 2}, + new Object[] {"orc", 1}, + new Object[] {"orc", 2} }; } @@ -107,7 +106,8 @@ public void setupTable() throws IOException { // Construct the iceberg table. 
table = create(SimpleDataUtil.SCHEMA, PartitionSpec.unpartitioned()); - table.updateProperties() + table + .updateProperties() .set(DEFAULT_FILE_FORMAT, format.name()) .set(FLINK_MANIFEST_LOCATION, flinkManifestFolder.getAbsolutePath()) .set(MAX_CONTINUOUS_EMPTY_COMMITS, "1") @@ -127,7 +127,8 @@ public void testCommitTxnWithoutDataFiles() throws Exception { assertSnapshotSize(0); assertMaxCommittedCheckpointId(jobId, -1L); - // It's better to advance the max-committed-checkpoint-id in iceberg snapshot, so that the future flink job + // It's better to advance the max-committed-checkpoint-id in iceberg snapshot, so that the + // future flink job // failover won't fail. for (int i = 1; i <= 3; i++) { harness.snapshot(++checkpointId, ++timestamp); @@ -144,9 +145,7 @@ public void testCommitTxnWithoutDataFiles() throws Exception { @Test public void testMaxContinuousEmptyCommits() throws Exception { - table.updateProperties() - .set(MAX_CONTINUOUS_EMPTY_COMMITS, "3") - .commit(); + table.updateProperties().set(MAX_CONTINUOUS_EMPTY_COMMITS, "3").commit(); JobID jobId = new JobID(); long checkpointId = 0; @@ -203,7 +202,8 @@ public void testCommitTxn() throws Exception { SimpleDataUtil.assertTableRows(table, ImmutableList.copyOf(rows)); assertSnapshotSize(i); assertMaxCommittedCheckpointId(jobID, i); - Assert.assertEquals(TestIcebergFilesCommitter.class.getName(), + Assert.assertEquals( + TestIcebergFilesCommitter.class.getName(), table.currentSnapshot().summary().get("flink.test")); } } @@ -371,7 +371,8 @@ public void testRecoveryFromValidSnapshot() throws Exception { @Test public void testRecoveryFromSnapshotWithoutCompletedNotification() throws Exception { - // We've two steps in checkpoint: 1. snapshotState(ckp); 2. notifyCheckpointComplete(ckp). It's possible that we + // We've two steps in checkpoint: 1. snapshotState(ckp); 2. notifyCheckpointComplete(ckp). It's + // possible that we // flink job will restore from a checkpoint with only step#1 finished. long checkpointId = 0; long timestamp = 0; @@ -401,7 +402,8 @@ public void testRecoveryFromSnapshotWithoutCompletedNotification() throws Except harness.initializeState(snapshot); harness.open(); - // All flink manifests should be cleaned because it has committed the unfinished iceberg transaction. + // All flink manifests should be cleaned because it has committed the unfinished iceberg + // transaction. assertFlinkManifests(0); SimpleDataUtil.assertTableRows(table, expectedRows); @@ -429,12 +431,14 @@ public void testRecoveryFromSnapshotWithoutCompletedNotification() throws Except // Redeploying flink job from external checkpoint. JobID newJobId = new JobID(); - try (OneInputStreamOperatorTestHarness harness = createStreamSink(newJobId)) { + try (OneInputStreamOperatorTestHarness harness = + createStreamSink(newJobId)) { harness.setup(); harness.initializeState(snapshot); harness.open(); - // All flink manifests should be cleaned because it has committed the unfinished iceberg transaction. + // All flink manifests should be cleaned because it has committed the unfinished iceberg + // transaction. 
assertFlinkManifests(0); assertMaxCommittedCheckpointId(newJobId, -1); @@ -467,7 +471,8 @@ public void testStartAnotherJobToWriteSameTable() throws Exception { List tableRows = Lists.newArrayList(); JobID oldJobId = new JobID(); - try (OneInputStreamOperatorTestHarness harness = createStreamSink(oldJobId)) { + try (OneInputStreamOperatorTestHarness harness = + createStreamSink(oldJobId)) { harness.setup(); harness.open(); @@ -496,7 +501,8 @@ public void testStartAnotherJobToWriteSameTable() throws Exception { checkpointId = 0; timestamp = 0; JobID newJobId = new JobID(); - try (OneInputStreamOperatorTestHarness harness = createStreamSink(newJobId)) { + try (OneInputStreamOperatorTestHarness harness = + createStreamSink(newJobId)) { harness.setup(); harness.open(); @@ -575,7 +581,8 @@ public void testBoundedStream() throws Exception { SimpleDataUtil.assertTableRows(table, tableRows); assertSnapshotSize(1); assertMaxCommittedCheckpointId(jobId, Long.MAX_VALUE); - Assert.assertEquals(TestIcebergFilesCommitter.class.getName(), + Assert.assertEquals( + TestIcebergFilesCommitter.class.getName(), table.currentSnapshot().summary().get("flink.test")); } } @@ -603,12 +610,14 @@ public void testFlinkManifests() throws Exception { List manifestPaths = assertFlinkManifests(1); Path manifestPath = manifestPaths.get(0); String operatorId = harness.getOneInputOperator().getOperatorID().toString(); - Assert.assertEquals("File name should have the expected pattern.", + Assert.assertEquals( + "File name should have the expected pattern.", String.format("%s-%s-%05d-%d-%d-%05d.avro", jobId, operatorId, 0, 0, checkpoint, 1), manifestPath.getFileName().toString()); // 2. Read the data files from manifests and assert. - List dataFiles = FlinkManifestUtil.readDataFiles(createTestingManifestFile(manifestPath), table.io()); + List dataFiles = + FlinkManifestUtil.readDataFiles(createTestingManifestFile(manifestPath), table.io()); Assert.assertEquals(1, dataFiles.size()); TestHelpers.assertEquals(dataFile1, dataFiles.get(0)); @@ -646,12 +655,14 @@ public void testDeleteFiles() throws Exception { List manifestPaths = assertFlinkManifests(1); Path manifestPath = manifestPaths.get(0); String operatorId = harness.getOneInputOperator().getOperatorID().toString(); - Assert.assertEquals("File name should have the expected pattern.", + Assert.assertEquals( + "File name should have the expected pattern.", String.format("%s-%s-%05d-%d-%d-%05d.avro", jobId, operatorId, 0, 0, checkpoint, 1), manifestPath.getFileName().toString()); // 2. Read the data files from manifests and assert. 
- List dataFiles = FlinkManifestUtil.readDataFiles(createTestingManifestFile(manifestPath), table.io()); + List dataFiles = + FlinkManifestUtil.readDataFiles(createTestingManifestFile(manifestPath), table.io()); Assert.assertEquals(1, dataFiles.size()); TestHelpers.assertEquals(dataFile1, dataFiles.get(0)); @@ -666,11 +677,10 @@ public void testDeleteFiles() throws Exception { DataFile dataFile2 = writeDataFile("data-file-2", ImmutableList.of(row2)); RowData delete1 = SimpleDataUtil.createDelete(1, "aaa"); - DeleteFile deleteFile1 = writeEqDeleteFile(appenderFactory, "delete-file-1", ImmutableList.of(delete1)); - harness.processElement(WriteResult.builder() - .addDataFiles(dataFile2) - .addDeleteFiles(deleteFile1) - .build(), + DeleteFile deleteFile1 = + writeEqDeleteFile(appenderFactory, "delete-file-1", ImmutableList.of(delete1)); + harness.processElement( + WriteResult.builder().addDataFiles(dataFile2).addDeleteFiles(deleteFile1).build(), ++timestamp); assertMaxCommittedCheckpointId(jobId, checkpoint); @@ -706,11 +716,10 @@ public void testCommitTwoCheckpointsInSingleTxn() throws Exception { RowData insert2 = SimpleDataUtil.createInsert(2, "bbb"); RowData delete3 = SimpleDataUtil.createDelete(3, "ccc"); DataFile dataFile1 = writeDataFile("data-file-1", ImmutableList.of(insert1, insert2)); - DeleteFile deleteFile1 = writeEqDeleteFile(appenderFactory, "delete-file-1", ImmutableList.of(delete3)); - harness.processElement(WriteResult.builder() - .addDataFiles(dataFile1) - .addDeleteFiles(deleteFile1) - .build(), + DeleteFile deleteFile1 = + writeEqDeleteFile(appenderFactory, "delete-file-1", ImmutableList.of(delete3)); + harness.processElement( + WriteResult.builder().addDataFiles(dataFile1).addDeleteFiles(deleteFile1).build(), ++timestamp); // The 1th snapshotState. @@ -719,11 +728,10 @@ public void testCommitTwoCheckpointsInSingleTxn() throws Exception { RowData insert4 = SimpleDataUtil.createInsert(4, "ddd"); RowData delete2 = SimpleDataUtil.createDelete(2, "bbb"); DataFile dataFile2 = writeDataFile("data-file-2", ImmutableList.of(insert4)); - DeleteFile deleteFile2 = writeEqDeleteFile(appenderFactory, "delete-file-2", ImmutableList.of(delete2)); - harness.processElement(WriteResult.builder() - .addDataFiles(dataFile2) - .addDeleteFiles(deleteFile2) - .build(), + DeleteFile deleteFile2 = + writeEqDeleteFile(appenderFactory, "delete-file-2", ImmutableList.of(delete2)); + harness.processElement( + WriteResult.builder().addDataFiles(dataFile2).addDeleteFiles(deleteFile2).build(), ++timestamp); // The 2nd snapshotState. 
@@ -734,48 +742,74 @@ public void testCommitTwoCheckpointsInSingleTxn() throws Exception { SimpleDataUtil.assertTableRows(table, ImmutableList.of(insert1, insert4)); assertMaxCommittedCheckpointId(jobId, checkpoint); assertFlinkManifests(0); - Assert.assertEquals("Should have committed 2 txn.", 2, ImmutableList.copyOf(table.snapshots()).size()); + Assert.assertEquals( + "Should have committed 2 txn.", 2, ImmutableList.copyOf(table.snapshots()).size()); } } - private DeleteFile writeEqDeleteFile(FileAppenderFactory appenderFactory, - String filename, List deletes) throws IOException { + private DeleteFile writeEqDeleteFile( + FileAppenderFactory appenderFactory, String filename, List deletes) + throws IOException { return SimpleDataUtil.writeEqDeleteFile(table, format, filename, appenderFactory, deletes); } - private DeleteFile writePosDeleteFile(FileAppenderFactory appenderFactory, - String filename, - List> positions) throws IOException { + private DeleteFile writePosDeleteFile( + FileAppenderFactory appenderFactory, + String filename, + List> positions) + throws IOException { return SimpleDataUtil.writePosDeleteFile(table, format, filename, appenderFactory, positions); } private FileAppenderFactory createDeletableAppenderFactory() { - int[] equalityFieldIds = new int[] { - table.schema().findField("id").fieldId(), - table.schema().findField("data").fieldId() - }; - return new FlinkAppenderFactory(table.schema(), - FlinkSchemaUtil.convert(table.schema()), table.properties(), table.spec(), equalityFieldIds, - table.schema(), null); + int[] equalityFieldIds = + new int[] { + table.schema().findField("id").fieldId(), table.schema().findField("data").fieldId() + }; + return new FlinkAppenderFactory( + table.schema(), + FlinkSchemaUtil.convert(table.schema()), + table.properties(), + table.spec(), + equalityFieldIds, + table.schema(), + null); } private ManifestFile createTestingManifestFile(Path manifestPath) { - return new GenericManifestFile(manifestPath.toAbsolutePath().toString(), manifestPath.toFile().length(), 0, - ManifestContent.DATA, 0, 0, 0L, 0, 0, 0, 0, 0, 0, null, null); + return new GenericManifestFile( + manifestPath.toAbsolutePath().toString(), + manifestPath.toFile().length(), + 0, + ManifestContent.DATA, + 0, + 0, + 0L, + 0, + 0, + 0, + 0, + 0, + 0, + null, + null); } private List assertFlinkManifests(int expectedCount) throws IOException { - List manifests = Files.list(flinkManifestFolder.toPath()) - .filter(p -> !p.toString().endsWith(".crc")) - .collect(Collectors.toList()); - Assert.assertEquals(String.format("Expected %s flink manifests, but the list is: %s", expectedCount, manifests), - expectedCount, manifests.size()); + List manifests = + Files.list(flinkManifestFolder.toPath()) + .filter(p -> !p.toString().endsWith(".crc")) + .collect(Collectors.toList()); + Assert.assertEquals( + String.format("Expected %s flink manifests, but the list is: %s", expectedCount, manifests), + expectedCount, + manifests.size()); return manifests; } private DataFile writeDataFile(String filename, List rows) throws IOException { - return SimpleDataUtil.writeFile(table.schema(), table.spec(), CONF, table.location(), - format.addExtension(filename), rows); + return SimpleDataUtil.writeFile( + table.schema(), table.spec(), CONF, table.location(), format.addExtension(filename), rows); } private void assertMaxCommittedCheckpointId(JobID jobID, long expectedId) { @@ -822,10 +856,14 @@ private static TestOperatorFactory of(String tablePath) { @Override @SuppressWarnings("unchecked") - public > T 
createStreamOperator(StreamOperatorParameters param) { - IcebergFilesCommitter committer = new IcebergFilesCommitter(new TestTableLoader(tablePath), false, - Collections.singletonMap("flink.test", TestIcebergFilesCommitter.class.getName()), - ThreadPools.WORKER_THREAD_POOL_SIZE); + public > T createStreamOperator( + StreamOperatorParameters param) { + IcebergFilesCommitter committer = + new IcebergFilesCommitter( + new TestTableLoader(tablePath), + false, + Collections.singletonMap("flink.test", TestIcebergFilesCommitter.class.getName()), + ThreadPools.WORKER_THREAD_POOL_SIZE); committer.setup(param.getContainingTask(), param.getStreamConfig(), param.getOutput()); return (T) committer; } diff --git a/flink/v1.15/flink/src/test/java/org/apache/iceberg/flink/sink/TestIcebergStreamWriter.java b/flink/v1.15/flink/src/test/java/org/apache/iceberg/flink/sink/TestIcebergStreamWriter.java index ae2171238e59..2e134c61fdd8 100644 --- a/flink/v1.15/flink/src/test/java/org/apache/iceberg/flink/sink/TestIcebergStreamWriter.java +++ b/flink/v1.15/flink/src/test/java/org/apache/iceberg/flink/sink/TestIcebergStreamWriter.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.flink.sink; import java.io.File; @@ -68,8 +67,7 @@ @RunWith(Parameterized.class) public class TestIcebergStreamWriter { - @Rule - public TemporaryFolder tempFolder = new TemporaryFolder(); + @Rule public TemporaryFolder tempFolder = new TemporaryFolder(); private Table table; @@ -79,12 +77,12 @@ public class TestIcebergStreamWriter { @Parameterized.Parameters(name = "format = {0}, partitioned = {1}") public static Object[][] parameters() { return new Object[][] { - {"avro", true}, - {"avro", false}, - {"orc", true}, - {"orc", false}, - {"parquet", true}, - {"parquet", false} + {"avro", true}, + {"avro", false}, + {"orc", true}, + {"orc", false}, + {"parquet", true}, + {"parquet", false} }; } @@ -104,7 +102,8 @@ public void before() throws IOException { @Test public void testWritingTable() throws Exception { long checkpointId = 1L; - try (OneInputStreamOperatorTestHarness testHarness = createIcebergStreamWriter()) { + try (OneInputStreamOperatorTestHarness testHarness = + createIcebergStreamWriter()) { // The first checkpoint testHarness.processElement(SimpleDataUtil.createRowData(1, "hello"), 1); testHarness.processElement(SimpleDataUtil.createRowData(2, "world"), 1); @@ -134,13 +133,14 @@ public void testWritingTable() throws Exception { appendFiles.commit(); // Assert the table records. 
- SimpleDataUtil.assertTableRecords(table, Lists.newArrayList( - SimpleDataUtil.createRecord(1, "hello"), - SimpleDataUtil.createRecord(2, "world"), - SimpleDataUtil.createRecord(3, "hello"), - SimpleDataUtil.createRecord(4, "foo"), - SimpleDataUtil.createRecord(5, "bar") - )); + SimpleDataUtil.assertTableRecords( + table, + Lists.newArrayList( + SimpleDataUtil.createRecord(1, "hello"), + SimpleDataUtil.createRecord(2, "world"), + SimpleDataUtil.createRecord(3, "hello"), + SimpleDataUtil.createRecord(4, "foo"), + SimpleDataUtil.createRecord(5, "bar"))); } } @@ -148,7 +148,8 @@ public void testWritingTable() throws Exception { public void testSnapshotTwice() throws Exception { long checkpointId = 1; long timestamp = 1; - try (OneInputStreamOperatorTestHarness testHarness = createIcebergStreamWriter()) { + try (OneInputStreamOperatorTestHarness testHarness = + createIcebergStreamWriter()) { testHarness.processElement(SimpleDataUtil.createRowData(1, "hello"), timestamp++); testHarness.processElement(SimpleDataUtil.createRowData(2, "world"), timestamp); @@ -171,13 +172,15 @@ public void testSnapshotTwice() throws Exception { @Test public void testTableWithoutSnapshot() throws Exception { - try (OneInputStreamOperatorTestHarness testHarness = createIcebergStreamWriter()) { + try (OneInputStreamOperatorTestHarness testHarness = + createIcebergStreamWriter()) { Assert.assertEquals(0, testHarness.extractOutputValues().size()); } // Even if we closed the iceberg stream writer, there's no orphan data file. Assert.assertEquals(0, scanDataFiles().size()); - try (OneInputStreamOperatorTestHarness testHarness = createIcebergStreamWriter()) { + try (OneInputStreamOperatorTestHarness testHarness = + createIcebergStreamWriter()) { testHarness.processElement(SimpleDataUtil.createRowData(1, "hello"), 1); // Still not emit the data file yet, because there is no checkpoint. Assert.assertEquals(0, testHarness.extractOutputValues().size()); @@ -209,7 +212,8 @@ private Set scanDataFiles() throws IOException { @Test public void testBoundedStreamCloseWithEmittingDataFiles() throws Exception { - try (OneInputStreamOperatorTestHarness testHarness = createIcebergStreamWriter()) { + try (OneInputStreamOperatorTestHarness testHarness = + createIcebergStreamWriter()) { testHarness.processElement(SimpleDataUtil.createRowData(1, "hello"), 1); testHarness.processElement(SimpleDataUtil.createRowData(2, "world"), 2); @@ -233,7 +237,8 @@ public void testBoundedStreamCloseWithEmittingDataFiles() throws Exception { @Test public void testTableWithTargetFileSize() throws Exception { // Adjust the target-file-size in table properties. 
- table.updateProperties() + table + .updateProperties() .set(TableProperties.WRITE_TARGET_FILE_SIZE_BYTES, "4") // ~4 bytes; low enough to trigger .commit(); @@ -246,7 +251,8 @@ public void testTableWithTargetFileSize() throws Exception { } } - try (OneInputStreamOperatorTestHarness testHarness = createIcebergStreamWriter()) { + try (OneInputStreamOperatorTestHarness testHarness = + createIcebergStreamWriter()) { for (RowData row : rows) { testHarness.processElement(row, 1); } @@ -274,20 +280,26 @@ public void testTableWithTargetFileSize() throws Exception { @Test public void testPromotedFlinkDataType() throws Exception { - Schema iSchema = new Schema( - Types.NestedField.required(1, "tinyint", Types.IntegerType.get()), - Types.NestedField.required(2, "smallint", Types.IntegerType.get()), - Types.NestedField.optional(3, "int", Types.IntegerType.get()) - ); - TableSchema flinkSchema = TableSchema.builder() - .field("tinyint", DataTypes.TINYINT().notNull()) - .field("smallint", DataTypes.SMALLINT().notNull()) - .field("int", DataTypes.INT().nullable()) - .build(); + Schema iSchema = + new Schema( + Types.NestedField.required(1, "tinyint", Types.IntegerType.get()), + Types.NestedField.required(2, "smallint", Types.IntegerType.get()), + Types.NestedField.optional(3, "int", Types.IntegerType.get())); + TableSchema flinkSchema = + TableSchema.builder() + .field("tinyint", DataTypes.TINYINT().notNull()) + .field("smallint", DataTypes.SMALLINT().notNull()) + .field("int", DataTypes.INT().nullable()) + .build(); PartitionSpec spec; if (partitioned) { - spec = PartitionSpec.builderFor(iSchema).identity("smallint").identity("tinyint").identity("int").build(); + spec = + PartitionSpec.builderFor(iSchema) + .identity("smallint") + .identity("tinyint") + .identity("int") + .build(); } else { spec = PartitionSpec.unpartitioned(); } @@ -296,21 +308,21 @@ public void testPromotedFlinkDataType() throws Exception { Map props = ImmutableMap.of(TableProperties.DEFAULT_FILE_FORMAT, format.name()); Table icebergTable = new HadoopTables().create(iSchema, spec, props, location); - List rows = Lists.newArrayList( - GenericRowData.of((byte) 0x01, (short) -32768, 101), - GenericRowData.of((byte) 0x02, (short) 0, 102), - GenericRowData.of((byte) 0x03, (short) 32767, 103) - ); + List rows = + Lists.newArrayList( + GenericRowData.of((byte) 0x01, (short) -32768, 101), + GenericRowData.of((byte) 0x02, (short) 0, 102), + GenericRowData.of((byte) 0x03, (short) 32767, 103)); Record record = GenericRecord.create(iSchema); - List expected = Lists.newArrayList( - record.copy(ImmutableMap.of("tinyint", 1, "smallint", -32768, "int", 101)), - record.copy(ImmutableMap.of("tinyint", 2, "smallint", 0, "int", 102)), - record.copy(ImmutableMap.of("tinyint", 3, "smallint", 32767, "int", 103)) - ); - - try (OneInputStreamOperatorTestHarness testHarness = createIcebergStreamWriter(icebergTable, - flinkSchema)) { + List expected = + Lists.newArrayList( + record.copy(ImmutableMap.of("tinyint", 1, "smallint", -32768, "int", 101)), + record.copy(ImmutableMap.of("tinyint", 2, "smallint", 0, "int", 102)), + record.copy(ImmutableMap.of("tinyint", 3, "smallint", 32767, "int", 103))); + + try (OneInputStreamOperatorTestHarness testHarness = + createIcebergStreamWriter(icebergTable, flinkSchema)) { for (RowData row : rows) { testHarness.processElement(row, 1); } @@ -328,7 +340,8 @@ public void testPromotedFlinkDataType() throws Exception { SimpleDataUtil.assertTableRecords(location, expected); } - private OneInputStreamOperatorTestHarness 
createIcebergStreamWriter() throws Exception { + private OneInputStreamOperatorTestHarness createIcebergStreamWriter() + throws Exception { return createIcebergStreamWriter(table, SimpleDataUtil.FLINK_SCHEMA); } @@ -336,14 +349,13 @@ private OneInputStreamOperatorTestHarness createIcebergStr Table icebergTable, TableSchema flinkSchema) throws Exception { RowType flinkRowType = FlinkSink.toFlinkRowType(icebergTable.schema(), flinkSchema); FlinkWriteConf flinkWriteConfig = - new FlinkWriteConf(icebergTable, Maps.newHashMap(), new org.apache.flink.configuration.Configuration()); - - IcebergStreamWriter streamWriter = FlinkSink.createStreamWriter( - icebergTable, - flinkWriteConfig, - flinkRowType, null); - OneInputStreamOperatorTestHarness harness = new OneInputStreamOperatorTestHarness<>( - streamWriter, 1, 1, 0); + new FlinkWriteConf( + icebergTable, Maps.newHashMap(), new org.apache.flink.configuration.Configuration()); + + IcebergStreamWriter streamWriter = + FlinkSink.createStreamWriter(icebergTable, flinkWriteConfig, flinkRowType, null); + OneInputStreamOperatorTestHarness harness = + new OneInputStreamOperatorTestHarness<>(streamWriter, 1, 1, 0); harness.setup(); harness.open(); diff --git a/flink/v1.15/flink/src/test/java/org/apache/iceberg/flink/sink/TestRowDataPartitionKey.java b/flink/v1.15/flink/src/test/java/org/apache/iceberg/flink/sink/TestRowDataPartitionKey.java index 29a1f78a531e..b6c785cb144b 100644 --- a/flink/v1.15/flink/src/test/java/org/apache/iceberg/flink/sink/TestRowDataPartitionKey.java +++ b/flink/v1.15/flink/src/test/java/org/apache/iceberg/flink/sink/TestRowDataPartitionKey.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.flink.sink; import java.util.List; @@ -40,53 +39,54 @@ import org.junit.Test; public class TestRowDataPartitionKey { - private static final Schema SCHEMA = new Schema( - Types.NestedField.required(0, "boolType", Types.BooleanType.get()), - Types.NestedField.required(1, "id", Types.IntegerType.get()), - Types.NestedField.required(2, "longType", Types.LongType.get()), - Types.NestedField.required(3, "dateType", Types.DateType.get()), - Types.NestedField.required(4, "timeType", Types.TimeType.get()), - Types.NestedField.required(5, "stringType", Types.StringType.get()), - Types.NestedField.required(6, "timestampWithoutZone", Types.TimestampType.withoutZone()), - Types.NestedField.required(7, "timestampWithZone", Types.TimestampType.withZone()), - Types.NestedField.required(8, "fixedType", Types.FixedType.ofLength(5)), - Types.NestedField.required(9, "uuidType", Types.UUIDType.get()), - Types.NestedField.required(10, "binaryType", Types.BinaryType.get()), - Types.NestedField.required(11, "decimalType1", Types.DecimalType.of(18, 3)), - Types.NestedField.required(12, "decimalType2", Types.DecimalType.of(10, 5)), - Types.NestedField.required(13, "decimalType3", Types.DecimalType.of(38, 19)), - Types.NestedField.required(14, "floatType", Types.FloatType.get()), - Types.NestedField.required(15, "doubleType", Types.DoubleType.get()) - ); - - private static final List SUPPORTED_PRIMITIVES = SCHEMA.asStruct().fields().stream() - .map(Types.NestedField::name).collect(Collectors.toList()); - - private static final Schema NESTED_SCHEMA = new Schema( - Types.NestedField.required(1, "structType", Types.StructType.of( - Types.NestedField.optional(2, "innerStringType", Types.StringType.get()), - Types.NestedField.optional(3, "innerIntegerType", Types.IntegerType.get()) - )) - ); + 
private static final Schema SCHEMA = + new Schema( + Types.NestedField.required(0, "boolType", Types.BooleanType.get()), + Types.NestedField.required(1, "id", Types.IntegerType.get()), + Types.NestedField.required(2, "longType", Types.LongType.get()), + Types.NestedField.required(3, "dateType", Types.DateType.get()), + Types.NestedField.required(4, "timeType", Types.TimeType.get()), + Types.NestedField.required(5, "stringType", Types.StringType.get()), + Types.NestedField.required(6, "timestampWithoutZone", Types.TimestampType.withoutZone()), + Types.NestedField.required(7, "timestampWithZone", Types.TimestampType.withZone()), + Types.NestedField.required(8, "fixedType", Types.FixedType.ofLength(5)), + Types.NestedField.required(9, "uuidType", Types.UUIDType.get()), + Types.NestedField.required(10, "binaryType", Types.BinaryType.get()), + Types.NestedField.required(11, "decimalType1", Types.DecimalType.of(18, 3)), + Types.NestedField.required(12, "decimalType2", Types.DecimalType.of(10, 5)), + Types.NestedField.required(13, "decimalType3", Types.DecimalType.of(38, 19)), + Types.NestedField.required(14, "floatType", Types.FloatType.get()), + Types.NestedField.required(15, "doubleType", Types.DoubleType.get())); + + private static final List SUPPORTED_PRIMITIVES = + SCHEMA.asStruct().fields().stream().map(Types.NestedField::name).collect(Collectors.toList()); + + private static final Schema NESTED_SCHEMA = + new Schema( + Types.NestedField.required( + 1, + "structType", + Types.StructType.of( + Types.NestedField.optional(2, "innerStringType", Types.StringType.get()), + Types.NestedField.optional(3, "innerIntegerType", Types.IntegerType.get())))); @Test public void testNullPartitionValue() { - Schema schema = new Schema( - Types.NestedField.optional(1, "id", Types.IntegerType.get()), - Types.NestedField.optional(2, "data", Types.StringType.get()) - ); + Schema schema = + new Schema( + Types.NestedField.optional(1, "id", Types.IntegerType.get()), + Types.NestedField.optional(2, "data", Types.StringType.get())); - PartitionSpec spec = PartitionSpec.builderFor(schema) - .identity("data") - .build(); + PartitionSpec spec = PartitionSpec.builderFor(schema).identity("data").build(); - List rows = Lists.newArrayList( - GenericRowData.of(1, StringData.fromString("a")), - GenericRowData.of(2, StringData.fromString("b")), - GenericRowData.of(3, null) - ); + List rows = + Lists.newArrayList( + GenericRowData.of(1, StringData.fromString("a")), + GenericRowData.of(2, StringData.fromString("b")), + GenericRowData.of(3, null)); - RowDataWrapper rowWrapper = new RowDataWrapper(FlinkSchemaUtil.convert(schema), schema.asStruct()); + RowDataWrapper rowWrapper = + new RowDataWrapper(FlinkSchemaUtil.convert(schema), schema.asStruct()); for (RowData row : rows) { PartitionKey partitionKey = new PartitionKey(spec, schema); @@ -100,16 +100,15 @@ public void testNullPartitionValue() { @Test public void testPartitionWithOneNestedField() { - RowDataWrapper rowWrapper = new RowDataWrapper(FlinkSchemaUtil.convert(NESTED_SCHEMA), NESTED_SCHEMA.asStruct()); + RowDataWrapper rowWrapper = + new RowDataWrapper(FlinkSchemaUtil.convert(NESTED_SCHEMA), NESTED_SCHEMA.asStruct()); List records = RandomGenericData.generate(NESTED_SCHEMA, 10, 1991); List rows = Lists.newArrayList(RandomRowData.convert(NESTED_SCHEMA, records)); - PartitionSpec spec1 = PartitionSpec.builderFor(NESTED_SCHEMA) - .identity("structType.innerStringType") - .build(); - PartitionSpec spec2 = PartitionSpec.builderFor(NESTED_SCHEMA) - 
.identity("structType.innerIntegerType") - .build(); + PartitionSpec spec1 = + PartitionSpec.builderFor(NESTED_SCHEMA).identity("structType.innerStringType").build(); + PartitionSpec spec2 = + PartitionSpec.builderFor(NESTED_SCHEMA).identity("structType.innerIntegerType").build(); for (int i = 0; i < rows.size(); i++) { RowData row = rows.get(i); @@ -131,18 +130,21 @@ public void testPartitionWithOneNestedField() { @Test public void testPartitionMultipleNestedField() { - RowDataWrapper rowWrapper = new RowDataWrapper(FlinkSchemaUtil.convert(NESTED_SCHEMA), NESTED_SCHEMA.asStruct()); + RowDataWrapper rowWrapper = + new RowDataWrapper(FlinkSchemaUtil.convert(NESTED_SCHEMA), NESTED_SCHEMA.asStruct()); List records = RandomGenericData.generate(NESTED_SCHEMA, 10, 1992); List rows = Lists.newArrayList(RandomRowData.convert(NESTED_SCHEMA, records)); - PartitionSpec spec1 = PartitionSpec.builderFor(NESTED_SCHEMA) - .identity("structType.innerIntegerType") - .identity("structType.innerStringType") - .build(); - PartitionSpec spec2 = PartitionSpec.builderFor(NESTED_SCHEMA) - .identity("structType.innerStringType") - .identity("structType.innerIntegerType") - .build(); + PartitionSpec spec1 = + PartitionSpec.builderFor(NESTED_SCHEMA) + .identity("structType.innerIntegerType") + .identity("structType.innerStringType") + .build(); + PartitionSpec spec2 = + PartitionSpec.builderFor(NESTED_SCHEMA) + .identity("structType.innerStringType") + .identity("structType.innerIntegerType") + .build(); PartitionKey pk1 = new PartitionKey(spec1, NESTED_SCHEMA); PartitionKey pk2 = new PartitionKey(spec2, NESTED_SCHEMA); @@ -188,14 +190,19 @@ public void testPartitionValueTypes() { pk.partition(rowWrapper.wrap(row)); expectedPK.partition(recordWrapper.wrap(record)); - Assert.assertEquals("Partition with column " + column + " should have one field.", 1, pk.size()); + Assert.assertEquals( + "Partition with column " + column + " should have one field.", 1, pk.size()); if (column.equals("timeType")) { - Assert.assertEquals("Partition with column " + column + " should have the expected values", - expectedPK.get(0, Long.class) / 1000, pk.get(0, Long.class) / 1000); + Assert.assertEquals( + "Partition with column " + column + " should have the expected values", + expectedPK.get(0, Long.class) / 1000, + pk.get(0, Long.class) / 1000); } else { - Assert.assertEquals("Partition with column " + column + " should have the expected values", - expectedPK.get(0, javaClasses[0]), pk.get(0, javaClasses[0])); + Assert.assertEquals( + "Partition with column " + column + " should have the expected values", + expectedPK.get(0, javaClasses[0]), + pk.get(0, javaClasses[0])); } } } @@ -225,15 +232,19 @@ public void testNestedPartitionValues() { pk.partition(rowWrapper.wrap(rows.get(j))); expectedPK.partition(recordWrapper.wrap(records.get(j))); - Assert.assertEquals("Partition with nested column " + column + " should have one field.", - 1, pk.size()); + Assert.assertEquals( + "Partition with nested column " + column + " should have one field.", 1, pk.size()); if (column.equals("nested.timeType")) { - Assert.assertEquals("Partition with nested column " + column + " should have the expected values.", - expectedPK.get(0, Long.class) / 1000, pk.get(0, Long.class) / 1000); + Assert.assertEquals( + "Partition with nested column " + column + " should have the expected values.", + expectedPK.get(0, Long.class) / 1000, + pk.get(0, Long.class) / 1000); } else { - Assert.assertEquals("Partition with nested column " + column + " should have the expected 
values.", - expectedPK.get(0, javaClasses[0]), pk.get(0, javaClasses[0])); + Assert.assertEquals( + "Partition with nested column " + column + " should have the expected values.", + expectedPK.get(0, javaClasses[0]), + pk.get(0, javaClasses[0])); } } } diff --git a/flink/v1.15/flink/src/test/java/org/apache/iceberg/flink/sink/TestTaskWriters.java b/flink/v1.15/flink/src/test/java/org/apache/iceberg/flink/sink/TestTaskWriters.java index ddeedbb094d2..b177701270bc 100644 --- a/flink/v1.15/flink/src/test/java/org/apache/iceberg/flink/sink/TestTaskWriters.java +++ b/flink/v1.15/flink/src/test/java/org/apache/iceberg/flink/sink/TestTaskWriters.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.flink.sink; import java.io.File; @@ -54,18 +53,17 @@ public class TestTaskWriters { private static final Configuration CONF = new Configuration(); private static final long TARGET_FILE_SIZE = 128 * 1024 * 1024; - @Rule - public final TemporaryFolder tempFolder = new TemporaryFolder(); + @Rule public final TemporaryFolder tempFolder = new TemporaryFolder(); @Parameterized.Parameters(name = "format = {0}, partitioned = {1}") public static Object[][] parameters() { return new Object[][] { - {"avro", true}, - {"avro", false}, - {"orc", true}, - {"orc", false}, - {"parquet", true}, - {"parquet", false} + {"avro", true}, + {"avro", false}, + {"orc", true}, + {"orc", false}, + {"parquet", true}, + {"parquet", false} }; } @@ -169,12 +167,13 @@ public void testCompleteFiles() throws IOException { appendFiles.commit(); // Assert the data rows. - SimpleDataUtil.assertTableRecords(table, Lists.newArrayList( - SimpleDataUtil.createRecord(1, "a"), - SimpleDataUtil.createRecord(2, "b"), - SimpleDataUtil.createRecord(3, "c"), - SimpleDataUtil.createRecord(4, "d") - )); + SimpleDataUtil.assertTableRecords( + table, + Lists.newArrayList( + SimpleDataUtil.createRecord(1, "a"), + SimpleDataUtil.createRecord(2, "b"), + SimpleDataUtil.createRecord(3, "c"), + SimpleDataUtil.createRecord(4, "d"))); } } @@ -230,9 +229,14 @@ public void testRandomData() throws IOException { } private TaskWriter createTaskWriter(long targetFileSize) { - TaskWriterFactory taskWriterFactory = new RowDataTaskWriterFactory( - SerializableTable.copyOf(table), (RowType) SimpleDataUtil.FLINK_SCHEMA.toRowDataType().getLogicalType(), - targetFileSize, format, null, false); + TaskWriterFactory taskWriterFactory = + new RowDataTaskWriterFactory( + SerializableTable.copyOf(table), + (RowType) SimpleDataUtil.FLINK_SCHEMA.toRowDataType().getLogicalType(), + targetFileSize, + format, + null, + false); taskWriterFactory.initialize(1, 1); return taskWriterFactory.create(); } diff --git a/flink/v1.15/flink/src/test/java/org/apache/iceberg/flink/source/BoundedTableFactory.java b/flink/v1.15/flink/src/test/java/org/apache/iceberg/flink/source/BoundedTableFactory.java index bf38550c8723..a08578a4c106 100644 --- a/flink/v1.15/flink/src/test/java/org/apache/iceberg/flink/source/BoundedTableFactory.java +++ b/flink/v1.15/flink/src/test/java/org/apache/iceberg/flink/source/BoundedTableFactory.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.flink.source; import java.util.List; @@ -54,7 +53,8 @@ public class BoundedTableFactory implements DynamicTableSourceFactory { private static final AtomicInteger DATA_SET_ID = new AtomicInteger(0); private static final Map>> DATA_SETS = Maps.newHashMap(); - private static final ConfigOption DATA_ID = ConfigOptions.key("data-id").stringType().noDefaultValue(); + private static final ConfigOption DATA_ID = + ConfigOptions.key("data-id").stringType().noDefaultValue(); public static String registerDataSet(List> dataSet) { String dataSetId = String.valueOf(DATA_SET_ID.incrementAndGet()); @@ -68,12 +68,13 @@ public static void clearDataSets() { @Override public DynamicTableSource createDynamicTableSource(Context context) { - TableSchema tableSchema = TableSchemaUtils.getPhysicalSchema(context.getCatalogTable().getSchema()); + TableSchema tableSchema = + TableSchemaUtils.getPhysicalSchema(context.getCatalogTable().getSchema()); Configuration configuration = Configuration.fromMap(context.getCatalogTable().getOptions()); String dataId = configuration.getString(DATA_ID); - Preconditions.checkArgument(DATA_SETS.containsKey(dataId), - "data-id %s does not found in registered data set.", dataId); + Preconditions.checkArgument( + DATA_SETS.containsKey(dataId), "data-id %s does not found in registered data set.", dataId); return new BoundedTableSource(DATA_SETS.get(dataId), tableSchema); } @@ -113,8 +114,7 @@ public ChangelogMode getChangelogMode() { Supplier> supplier = () -> elementsPerCheckpoint.stream().flatMap(List::stream); // Add the INSERT row kind by default. - ChangelogMode.Builder builder = ChangelogMode.newBuilder() - .addContainedKind(RowKind.INSERT); + ChangelogMode.Builder builder = ChangelogMode.newBuilder().addContainedKind(RowKind.INSERT); if (supplier.get().anyMatch(r -> r.getKind() == RowKind.DELETE)) { builder.addContainedKind(RowKind.DELETE); @@ -138,12 +138,13 @@ public ScanRuntimeProvider getScanRuntimeProvider(ScanContext runtimeProviderCon public DataStream produceDataStream( ProviderContext providerContext, StreamExecutionEnvironment env) { boolean checkpointEnabled = env.getCheckpointConfig().isCheckpointingEnabled(); - SourceFunction source = new BoundedTestSource<>(elementsPerCheckpoint, checkpointEnabled); + SourceFunction source = + new BoundedTestSource<>(elementsPerCheckpoint, checkpointEnabled); RowType rowType = (RowType) tableSchema.toRowDataType().getLogicalType(); // Converter to convert the Row to RowData. - DataFormatConverters.RowConverter rowConverter = new DataFormatConverters - .RowConverter(tableSchema.getFieldDataTypes()); + DataFormatConverters.RowConverter rowConverter = + new DataFormatConverters.RowConverter(tableSchema.getFieldDataTypes()); return env.addSource(source, new RowTypeInfo(tableSchema.getFieldTypes())) .map(rowConverter::toInternal, FlinkCompatibilityUtil.toTypeInfo(rowType)); diff --git a/flink/v1.15/flink/src/test/java/org/apache/iceberg/flink/source/BoundedTestSource.java b/flink/v1.15/flink/src/test/java/org/apache/iceberg/flink/source/BoundedTestSource.java index 54e44ee5b008..7b435d059845 100644 --- a/flink/v1.15/flink/src/test/java/org/apache/iceberg/flink/source/BoundedTestSource.java +++ b/flink/v1.15/flink/src/test/java/org/apache/iceberg/flink/source/BoundedTestSource.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.flink.source; import java.util.Arrays; @@ -28,12 +27,10 @@ import org.apache.iceberg.relocated.com.google.common.base.Preconditions; /** - * A stream source that: - * 1) emits the elements from elementsPerCheckpoint.get(0) without allowing checkpoints. - * 2) then waits for the checkpoint to complete. - * 3) emits the elements from elementsPerCheckpoint.get(1) without allowing checkpoints. - * 4) then waits for the checkpoint to complete. - * 5) ... + * A stream source that: 1) emits the elements from elementsPerCheckpoint.get(0) without allowing + * checkpoints. 2) then waits for the checkpoint to complete. 3) emits the elements from + * elementsPerCheckpoint.get(1) without allowing checkpoints. 4) then waits for the checkpoint to + * complete. 5) ... * *
    Util all the list from elementsPerCheckpoint are exhausted. */ @@ -45,9 +42,7 @@ public final class BoundedTestSource implements SourceFunction, Checkpoint private final AtomicInteger numCheckpointsComplete = new AtomicInteger(0); - /** - * Emits all those elements in several checkpoints. - */ + /** Emits all those elements in several checkpoints. */ public BoundedTestSource(List> elementsPerCheckpoint, boolean checkpointEnabled) { this.elementsPerCheckpoint = elementsPerCheckpoint; this.checkpointEnabled = checkpointEnabled; @@ -57,9 +52,7 @@ public BoundedTestSource(List> elementsPerCheckpoint) { this(elementsPerCheckpoint, true); } - /** - * Emits all those elements in a single checkpoint. - */ + /** Emits all those elements in a single checkpoint. */ public BoundedTestSource(T... elements) { this(Collections.singletonList(Arrays.asList(elements))); } @@ -67,8 +60,9 @@ public BoundedTestSource(T... elements) { @Override public void run(SourceContext ctx) throws Exception { if (!checkpointEnabled) { - Preconditions.checkArgument(elementsPerCheckpoint.size() <= 1, - "There should be at most one list in the elementsPerCheckpoint when checkpoint is disabled."); + Preconditions.checkArgument( + elementsPerCheckpoint.size() <= 1, + "There should be at most one list in the elementsPerCheckpoint when checkpoint is disabled."); elementsPerCheckpoint.stream().flatMap(List::stream).forEach(ctx::collect); return; } @@ -77,11 +71,16 @@ public void run(SourceContext ctx) throws Exception { final int checkpointToAwait; synchronized (ctx.getCheckpointLock()) { - // Let's say checkpointToAwait = numCheckpointsComplete.get() + delta, in fact the value of delta should not - // affect the final table records because we only need to make sure that there will be exactly - // elementsPerCheckpoint.size() checkpoints to emit each records buffer from the original elementsPerCheckpoint. - // Even if the checkpoints that emitted results are not continuous, the correctness of the data should not be - // affected in the end. Setting the delta to be 2 is introducing the variable that produce un-continuous + // Let's say checkpointToAwait = numCheckpointsComplete.get() + delta, in fact the value of + // delta should not + // affect the final table records because we only need to make sure that there will be + // exactly + // elementsPerCheckpoint.size() checkpoints to emit each records buffer from the original + // elementsPerCheckpoint. + // Even if the checkpoints that emitted results are not continuous, the correctness of the + // data should not be + // affected in the end. Setting the delta to be 2 is introducing the variable that produce + // un-continuous // checkpoints that emit the records buffer from elementsPerCheckpoints. checkpointToAwait = numCheckpointsComplete.get() + 2; for (T element : elements) { diff --git a/flink/v1.15/flink/src/test/java/org/apache/iceberg/flink/source/ChangeLogTableTestBase.java b/flink/v1.15/flink/src/test/java/org/apache/iceberg/flink/source/ChangeLogTableTestBase.java index b4d8d7bb3efa..7aa2b8034bc5 100644 --- a/flink/v1.15/flink/src/test/java/org/apache/iceberg/flink/source/ChangeLogTableTestBase.java +++ b/flink/v1.15/flink/src/test/java/org/apache/iceberg/flink/source/ChangeLogTableTestBase.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.flink.source; import java.util.List; @@ -36,8 +35,7 @@ public class ChangeLogTableTestBase extends FlinkTestBase { private volatile TableEnvironment tEnv = null; - @Rule - public TestName name = new TestName(); + @Rule public TestName name = new TestName(); @After public void clean() { @@ -50,16 +48,15 @@ protected TableEnvironment getTableEnv() { if (tEnv == null) { synchronized (this) { if (tEnv == null) { - EnvironmentSettings settings = EnvironmentSettings - .newInstance() - .inStreamingMode() - .build(); + EnvironmentSettings settings = + EnvironmentSettings.newInstance().inStreamingMode().build(); - StreamExecutionEnvironment env = StreamExecutionEnvironment - .getExecutionEnvironment(MiniClusterResource.DISABLE_CLASSLOADER_CHECK_CONFIG) - .enableCheckpointing(400) - .setMaxParallelism(1) - .setParallelism(1); + StreamExecutionEnvironment env = + StreamExecutionEnvironment.getExecutionEnvironment( + MiniClusterResource.DISABLE_CLASSLOADER_CHECK_CONFIG) + .enableCheckpointing(400) + .setMaxParallelism(1) + .setParallelism(1); tEnv = StreamTableEnvironment.create(env, settings); } @@ -85,8 +82,6 @@ protected static Row updateAfterRow(Object... values) { } protected static List listJoin(List> lists) { - return lists.stream() - .flatMap(List::stream) - .collect(Collectors.toList()); + return lists.stream().flatMap(List::stream).collect(Collectors.toList()); } } diff --git a/flink/v1.15/flink/src/test/java/org/apache/iceberg/flink/source/SplitHelpers.java b/flink/v1.15/flink/src/test/java/org/apache/iceberg/flink/source/SplitHelpers.java index bc302fa5d441..8dc68aad10aa 100644 --- a/flink/v1.15/flink/src/test/java/org/apache/iceberg/flink/source/SplitHelpers.java +++ b/flink/v1.15/flink/src/test/java/org/apache/iceberg/flink/source/SplitHelpers.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.flink.source; import java.io.File; @@ -43,18 +42,18 @@ public class SplitHelpers { private static final AtomicLong splitLengthIncrement = new AtomicLong(); - private SplitHelpers() { - } + private SplitHelpers() {} /** * This create a list of IcebergSourceSplit from real files *

  • Create a new Hadoop table under the {@code temporaryFolder} *
  • write {@code fileCount} number of files to the new Iceberg table - *
  • Discover the splits from the table and partition the splits by the {@code filePerSplit} limit + *
  • Discover the splits from the table and partition the splits by the {@code filePerSplit} + * limit *
  • Delete the Hadoop table * - * Since the table and data files are deleted before this method return, - * caller shouldn't attempt to read the data files. + *
    Since the table and data files are deleted before this method return, caller shouldn't + * attempt to read the data files. */ public static List createSplitsFromTransientHadoopTable( TemporaryFolder temporaryFolder, int fileCount, int filesPerSplit) throws Exception { @@ -65,24 +64,28 @@ public static List createSplitsFromTransientHadoopTable( final HadoopCatalog catalog = new HadoopCatalog(hadoopConf, warehouse); try { final Table table = catalog.createTable(TestFixtures.TABLE_IDENTIFIER, TestFixtures.SCHEMA); - final GenericAppenderHelper dataAppender = new GenericAppenderHelper( - table, FileFormat.PARQUET, temporaryFolder); + final GenericAppenderHelper dataAppender = + new GenericAppenderHelper(table, FileFormat.PARQUET, temporaryFolder); for (int i = 0; i < fileCount; ++i) { List records = RandomGenericData.generate(TestFixtures.SCHEMA, 2, i); dataAppender.appendToTable(records); } final ScanContext scanContext = ScanContext.builder().build(); - final List splits = FlinkSplitPlanner.planIcebergSourceSplits( - table, scanContext, ThreadPools.getWorkerPool()); + final List splits = + FlinkSplitPlanner.planIcebergSourceSplits( + table, scanContext, ThreadPools.getWorkerPool()); return splits.stream() - .flatMap(split -> { - List> filesList = Lists.partition( - Lists.newArrayList(split.task().files()), filesPerSplit); - return filesList.stream() - .map(files -> new BaseCombinedScanTask(files)) - .map(combinedScanTask -> IcebergSourceSplit.fromCombinedScanTask(combinedScanTask)); - }) + .flatMap( + split -> { + List> filesList = + Lists.partition(Lists.newArrayList(split.task().files()), filesPerSplit); + return filesList.stream() + .map(files -> new BaseCombinedScanTask(files)) + .map( + combinedScanTask -> + IcebergSourceSplit.fromCombinedScanTask(combinedScanTask)); + }) .collect(Collectors.toList()); } finally { catalog.dropTable(TestFixtures.TABLE_IDENTIFIER); diff --git a/flink/v1.15/flink/src/test/java/org/apache/iceberg/flink/source/TestBoundedTableFactory.java b/flink/v1.15/flink/src/test/java/org/apache/iceberg/flink/source/TestBoundedTableFactory.java index d163b84c09c6..7b5f9328694c 100644 --- a/flink/v1.15/flink/src/test/java/org/apache/iceberg/flink/source/TestBoundedTableFactory.java +++ b/flink/v1.15/flink/src/test/java/org/apache/iceberg/flink/source/TestBoundedTableFactory.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.flink.source; import java.util.List; @@ -36,46 +35,53 @@ public void testEmptyDataSet() { List> emptyDataSet = ImmutableList.of(); String dataId = BoundedTableFactory.registerDataSet(emptyDataSet); - sql("CREATE TABLE %s(id INT, data STRING) WITH ('connector'='BoundedSource', 'data-id'='%s')", table, dataId); + sql( + "CREATE TABLE %s(id INT, data STRING) WITH ('connector'='BoundedSource', 'data-id'='%s')", + table, dataId); - Assert.assertEquals("Should have caught empty change log set.", ImmutableList.of(), + Assert.assertEquals( + "Should have caught empty change log set.", + ImmutableList.of(), sql("SELECT * FROM %s", table)); } @Test public void testBoundedTableFactory() { String table = name.getMethodName(); - List> dataSet = ImmutableList.of( - ImmutableList.of( - insertRow(1, "aaa"), - deleteRow(1, "aaa"), - insertRow(1, "bbb"), - insertRow(2, "aaa"), - deleteRow(2, "aaa"), - insertRow(2, "bbb") - ), - ImmutableList.of( - updateBeforeRow(2, "bbb"), - updateAfterRow(2, "ccc"), - deleteRow(2, "ccc"), - insertRow(2, "ddd") - ), + List> dataSet = ImmutableList.of( - deleteRow(1, "bbb"), - insertRow(1, "ccc"), - deleteRow(1, "ccc"), - insertRow(1, "ddd") - ) - ); + ImmutableList.of( + insertRow(1, "aaa"), + deleteRow(1, "aaa"), + insertRow(1, "bbb"), + insertRow(2, "aaa"), + deleteRow(2, "aaa"), + insertRow(2, "bbb")), + ImmutableList.of( + updateBeforeRow(2, "bbb"), + updateAfterRow(2, "ccc"), + deleteRow(2, "ccc"), + insertRow(2, "ddd")), + ImmutableList.of( + deleteRow(1, "bbb"), + insertRow(1, "ccc"), + deleteRow(1, "ccc"), + insertRow(1, "ddd"))); String dataId = BoundedTableFactory.registerDataSet(dataSet); - sql("CREATE TABLE %s(id INT, data STRING) WITH ('connector'='BoundedSource', 'data-id'='%s')", table, dataId); + sql( + "CREATE TABLE %s(id INT, data STRING) WITH ('connector'='BoundedSource', 'data-id'='%s')", + table, dataId); List rowSet = dataSet.stream().flatMap(Streams::stream).collect(Collectors.toList()); - Assert.assertEquals("Should have the expected change log events.", rowSet, sql("SELECT * FROM %s", table)); + Assert.assertEquals( + "Should have the expected change log events.", rowSet, sql("SELECT * FROM %s", table)); - Assert.assertEquals("Should have the expected change log events", - rowSet.stream().filter(r -> Objects.equals(r.getField(1), "aaa")).collect(Collectors.toList()), + Assert.assertEquals( + "Should have the expected change log events", + rowSet.stream() + .filter(r -> Objects.equals(r.getField(1), "aaa")) + .collect(Collectors.toList()), sql("SELECT * FROM %s WHERE data='aaa'", table)); } } diff --git a/flink/v1.15/flink/src/test/java/org/apache/iceberg/flink/source/TestFlinkInputFormat.java b/flink/v1.15/flink/src/test/java/org/apache/iceberg/flink/source/TestFlinkInputFormat.java index eae3233a6546..69b8ac269267 100644 --- a/flink/v1.15/flink/src/test/java/org/apache/iceberg/flink/source/TestFlinkInputFormat.java +++ b/flink/v1.15/flink/src/test/java/org/apache/iceberg/flink/source/TestFlinkInputFormat.java @@ -16,9 +16,10 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.flink.source; +import static org.apache.iceberg.types.Types.NestedField.required; + import java.io.IOException; import java.util.List; import java.util.Map; @@ -38,11 +39,7 @@ import org.apache.iceberg.types.Types; import org.junit.Test; -import static org.apache.iceberg.types.Types.NestedField.required; - -/** - * Test {@link FlinkInputFormat}. 
- */ +/** Test {@link FlinkInputFormat}. */ public class TestFlinkInputFormat extends TestFlinkSource { public TestFlinkInputFormat(String fileFormat) { @@ -56,20 +53,27 @@ public void before() throws IOException { @Override protected List run( - FlinkSource.Builder formatBuilder, Map sqlOptions, String sqlFilter, String... sqlSelectedFields) + FlinkSource.Builder formatBuilder, + Map sqlOptions, + String sqlFilter, + String... sqlSelectedFields) throws Exception { return runFormat(formatBuilder.tableLoader(tableLoader()).buildFormat()); } @Test public void testNestedProjection() throws Exception { - Schema schema = new Schema( - required(1, "data", Types.StringType.get()), - required(2, "nested", Types.StructType.of( - Types.NestedField.required(3, "f1", Types.StringType.get()), - Types.NestedField.required(4, "f2", Types.StringType.get()), - Types.NestedField.required(5, "f3", Types.LongType.get()))), - required(6, "id", Types.LongType.get())); + Schema schema = + new Schema( + required(1, "data", Types.StringType.get()), + required( + 2, + "nested", + Types.StructType.of( + Types.NestedField.required(3, "f1", Types.StringType.get()), + Types.NestedField.required(4, "f2", Types.StringType.get()), + Types.NestedField.required(5, "f3", Types.LongType.get()))), + required(6, "id", Types.LongType.get())); Table table = catalog.createTable(TableIdentifier.of("default", "t"), schema); @@ -81,13 +85,17 @@ public void testNestedProjection() throws Exception { // The Flink SQL output: [f2, data] // The FlinkInputFormat output: [nested[f2], data] - TableSchema projectedSchema = TableSchema.builder() - .field("nested", DataTypes.ROW(DataTypes.FIELD("f2", DataTypes.STRING()))) - .field("data", DataTypes.STRING()).build(); - List result = runFormat(FlinkSource.forRowData() - .tableLoader(tableLoader()) - .project(projectedSchema) - .buildFormat()); + TableSchema projectedSchema = + TableSchema.builder() + .field("nested", DataTypes.ROW(DataTypes.FIELD("f2", DataTypes.STRING()))) + .field("data", DataTypes.STRING()) + .build(); + List result = + runFormat( + FlinkSource.forRowData() + .tableLoader(tableLoader()) + .project(projectedSchema) + .buildFormat()); List expected = Lists.newArrayList(); for (Record record : writeRecords) { @@ -100,23 +108,28 @@ public void testNestedProjection() throws Exception { @Test public void testBasicProjection() throws IOException { - Schema writeSchema = new Schema( - Types.NestedField.required(0, "id", Types.LongType.get()), - Types.NestedField.optional(1, "data", Types.StringType.get()), - Types.NestedField.optional(2, "time", Types.TimestampType.withZone()) - ); + Schema writeSchema = + new Schema( + Types.NestedField.required(0, "id", Types.LongType.get()), + Types.NestedField.optional(1, "data", Types.StringType.get()), + Types.NestedField.optional(2, "time", Types.TimestampType.withZone())); Table table = catalog.createTable(TableIdentifier.of("default", "t"), writeSchema); List writeRecords = RandomGenericData.generate(writeSchema, 2, 0L); new GenericAppenderHelper(table, fileFormat, TEMPORARY_FOLDER).appendToTable(writeRecords); - TableSchema projectedSchema = TableSchema.builder() - .field("id", DataTypes.BIGINT()) - .field("data", DataTypes.STRING()) - .build(); - List result = runFormat(FlinkSource.forRowData() - .tableLoader(tableLoader()).project(projectedSchema).buildFormat()); + TableSchema projectedSchema = + TableSchema.builder() + .field("id", DataTypes.BIGINT()) + .field("data", DataTypes.STRING()) + .build(); + List result = + runFormat( + 
FlinkSource.forRowData() + .tableLoader(tableLoader()) + .project(projectedSchema) + .buildFormat()); List expected = Lists.newArrayList(); for (Record record : writeRecords) { diff --git a/flink/v1.15/flink/src/test/java/org/apache/iceberg/flink/source/TestFlinkInputFormatReaderDeletes.java b/flink/v1.15/flink/src/test/java/org/apache/iceberg/flink/source/TestFlinkInputFormatReaderDeletes.java index 2a593c4702b4..b2f914e51299 100644 --- a/flink/v1.15/flink/src/test/java/org/apache/iceberg/flink/source/TestFlinkInputFormatReaderDeletes.java +++ b/flink/v1.15/flink/src/test/java/org/apache/iceberg/flink/source/TestFlinkInputFormatReaderDeletes.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.flink.source; import java.io.IOException; @@ -43,26 +42,35 @@ public TestFlinkInputFormatReaderDeletes(FileFormat inputFormat) { } @Override - protected StructLikeSet rowSet(String tableName, Table testTable, String... columns) throws IOException { + protected StructLikeSet rowSet(String tableName, Table testTable, String... columns) + throws IOException { Schema projected = testTable.schema().select(columns); RowType rowType = FlinkSchemaUtil.convert(projected); Map properties = Maps.newHashMap(); - properties.put(CatalogProperties.WAREHOUSE_LOCATION, hiveConf.get(HiveConf.ConfVars.METASTOREWAREHOUSE.varname)); + properties.put( + CatalogProperties.WAREHOUSE_LOCATION, + hiveConf.get(HiveConf.ConfVars.METASTOREWAREHOUSE.varname)); properties.put(CatalogProperties.URI, hiveConf.get(HiveConf.ConfVars.METASTOREURIS.varname)); - properties.put(CatalogProperties.CLIENT_POOL_SIZE, + properties.put( + CatalogProperties.CLIENT_POOL_SIZE, Integer.toString(hiveConf.getInt("iceberg.hive.client-pool-size", 5))); CatalogLoader hiveCatalogLoader = CatalogLoader.hive(catalog.name(), hiveConf, properties); - FlinkInputFormat inputFormat = FlinkSource.forRowData() - .tableLoader(TableLoader.fromCatalog(hiveCatalogLoader, TableIdentifier.of("default", tableName))) - .project(FlinkSchemaUtil.toSchema(rowType)).buildFormat(); + FlinkInputFormat inputFormat = + FlinkSource.forRowData() + .tableLoader( + TableLoader.fromCatalog( + hiveCatalogLoader, TableIdentifier.of("default", tableName))) + .project(FlinkSchemaUtil.toSchema(rowType)) + .buildFormat(); StructLikeSet set = StructLikeSet.create(projected.asStruct()); - TestHelpers.readRowData(inputFormat, rowType).forEach(rowData -> { - RowDataWrapper wrapper = new RowDataWrapper(rowType, projected.asStruct()); - set.add(wrapper.wrap(rowData)); - }); + TestHelpers.readRowData(inputFormat, rowType) + .forEach( + rowData -> { + RowDataWrapper wrapper = new RowDataWrapper(rowType, projected.asStruct()); + set.add(wrapper.wrap(rowData)); + }); return set; } - } diff --git a/flink/v1.15/flink/src/test/java/org/apache/iceberg/flink/source/TestFlinkMergingMetrics.java b/flink/v1.15/flink/src/test/java/org/apache/iceberg/flink/source/TestFlinkMergingMetrics.java index 1670ed733421..3a7ec96cb1d6 100644 --- a/flink/v1.15/flink/src/test/java/org/apache/iceberg/flink/source/TestFlinkMergingMetrics.java +++ b/flink/v1.15/flink/src/test/java/org/apache/iceberg/flink/source/TestFlinkMergingMetrics.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.flink.source; import java.io.IOException; @@ -44,7 +43,8 @@ protected FileAppender writeAndGetAppender(List records) throws RowType flinkSchema = FlinkSchemaUtil.convert(SCHEMA); FileAppender appender = - new FlinkAppenderFactory(SCHEMA, flinkSchema, ImmutableMap.of(), PartitionSpec.unpartitioned()) + new FlinkAppenderFactory( + SCHEMA, flinkSchema, ImmutableMap.of(), PartitionSpec.unpartitioned()) .newAppender(org.apache.iceberg.Files.localOutput(temp.newFile()), fileFormat); try (FileAppender fileAppender = appender) { records.stream().map(r -> RowDataConverter.convert(SCHEMA, r)).forEach(fileAppender::add); diff --git a/flink/v1.15/flink/src/test/java/org/apache/iceberg/flink/source/TestFlinkReaderDeletesBase.java b/flink/v1.15/flink/src/test/java/org/apache/iceberg/flink/source/TestFlinkReaderDeletesBase.java index cc3c71716ef7..987d79fed3c3 100644 --- a/flink/v1.15/flink/src/test/java/org/apache/iceberg/flink/source/TestFlinkReaderDeletesBase.java +++ b/flink/v1.15/flink/src/test/java/org/apache/iceberg/flink/source/TestFlinkReaderDeletesBase.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.flink.source; import java.util.Map; @@ -46,8 +45,7 @@ @RunWith(Parameterized.class) public abstract class TestFlinkReaderDeletesBase extends DeleteReadTests { - @ClassRule - public static final TemporaryFolder TEMP_FOLDER = new TemporaryFolder(); + @ClassRule public static final TemporaryFolder TEMP_FOLDER = new TemporaryFolder(); protected static String databaseName = "default"; @@ -60,9 +58,9 @@ public abstract class TestFlinkReaderDeletesBase extends DeleteReadTests { @Parameterized.Parameters(name = "fileFormat={0}") public static Object[][] parameters() { return new Object[][] { - new Object[] { FileFormat.PARQUET }, - new Object[] { FileFormat.AVRO }, - new Object[] { FileFormat.ORC } + new Object[] {FileFormat.PARQUET}, + new Object[] {FileFormat.AVRO}, + new Object[] {FileFormat.ORC} }; } @@ -75,8 +73,10 @@ public static void startMetastore() { metastore = new TestHiveMetastore(); metastore.start(); hiveConf = metastore.hiveConf(); - catalog = (HiveCatalog) - CatalogUtil.loadCatalog(HiveCatalog.class.getName(), "hive", ImmutableMap.of(), hiveConf); + catalog = + (HiveCatalog) + CatalogUtil.loadCatalog( + HiveCatalog.class.getName(), "hive", ImmutableMap.of(), hiveConf); } @AfterClass diff --git a/flink/v1.15/flink/src/test/java/org/apache/iceberg/flink/source/TestFlinkScan.java b/flink/v1.15/flink/src/test/java/org/apache/iceberg/flink/source/TestFlinkScan.java index 9284b8fa9ef1..92363dd5a010 100644 --- a/flink/v1.15/flink/src/test/java/org/apache/iceberg/flink/source/TestFlinkScan.java +++ b/flink/v1.15/flink/src/test/java/org/apache/iceberg/flink/source/TestFlinkScan.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.flink.source; import java.io.File; @@ -69,8 +68,7 @@ public abstract class TestFlinkScan { public static final MiniClusterWithClientResource MINI_CLUSTER_RESOURCE = MiniClusterResource.createWithClassloaderCheckDisabled(); - @ClassRule - public static final TemporaryFolder TEMPORARY_FOLDER = new TemporaryFolder(); + @ClassRule public static final TemporaryFolder TEMPORARY_FOLDER = new TemporaryFolder(); protected HadoopCatalog catalog; protected String warehouse; @@ -100,16 +98,18 @@ public void before() throws IOException { } @After - public void after() throws IOException { - } + public void after() throws IOException {} protected TableLoader tableLoader() { return TableLoader.fromHadoopTable(location); } protected abstract List runWithProjection(String... projected) throws Exception; + protected abstract List runWithFilter(Expression filter, String sqlFilter) throws Exception; + protected abstract List runWithOptions(Map options) throws Exception; + protected abstract List run() throws Exception; @Test @@ -122,31 +122,33 @@ public void testUnpartitionedTable() throws Exception { @Test public void testPartitionedTable() throws Exception { - Table table = catalog.createTable(TestFixtures.TABLE_IDENTIFIER, TestFixtures.SCHEMA, TestFixtures.SPEC); + Table table = + catalog.createTable(TestFixtures.TABLE_IDENTIFIER, TestFixtures.SCHEMA, TestFixtures.SPEC); List expectedRecords = RandomGenericData.generate(TestFixtures.SCHEMA, 1, 0L); expectedRecords.get(0).set(2, "2020-03-20"); - new GenericAppenderHelper(table, fileFormat, TEMPORARY_FOLDER).appendToTable( - org.apache.iceberg.TestHelpers.Row.of("2020-03-20", 0), expectedRecords); + new GenericAppenderHelper(table, fileFormat, TEMPORARY_FOLDER) + .appendToTable(org.apache.iceberg.TestHelpers.Row.of("2020-03-20", 0), expectedRecords); TestHelpers.assertRecords(run(), expectedRecords, TestFixtures.SCHEMA); } @Test public void testProjection() throws Exception { - Table table = catalog.createTable(TestFixtures.TABLE_IDENTIFIER, TestFixtures.SCHEMA, TestFixtures.SPEC); + Table table = + catalog.createTable(TestFixtures.TABLE_IDENTIFIER, TestFixtures.SCHEMA, TestFixtures.SPEC); List inputRecords = RandomGenericData.generate(TestFixtures.SCHEMA, 1, 0L); - new GenericAppenderHelper(table, fileFormat, TEMPORARY_FOLDER).appendToTable( - org.apache.iceberg.TestHelpers.Row.of("2020-03-20", 0), inputRecords); + new GenericAppenderHelper(table, fileFormat, TEMPORARY_FOLDER) + .appendToTable(org.apache.iceberg.TestHelpers.Row.of("2020-03-20", 0), inputRecords); assertRows(runWithProjection("data"), Row.of(inputRecords.get(0).get(0))); } @Test public void testIdentityPartitionProjections() throws Exception { - Schema logSchema = new Schema( - Types.NestedField.optional(1, "id", Types.IntegerType.get()), - Types.NestedField.optional(2, "dt", Types.StringType.get()), - Types.NestedField.optional(3, "level", Types.StringType.get()), - Types.NestedField.optional(4, "message", Types.StringType.get()) - ); + Schema logSchema = + new Schema( + Types.NestedField.optional(1, "id", Types.IntegerType.get()), + Types.NestedField.optional(2, "dt", Types.StringType.get()), + Types.NestedField.optional(3, "level", Types.StringType.get()), + Types.NestedField.optional(4, "message", Types.StringType.get())); PartitionSpec spec = PartitionSpec.builderFor(logSchema).identity("dt").identity("level").build(); @@ -158,8 +160,11 @@ public void testIdentityPartitionProjections() throws Exception { for (Record record : inputRecords) { record.set(1, 
"2020-03-2" + idx); record.set(2, Integer.toString(idx)); - append.appendFile(new GenericAppenderHelper(table, fileFormat, TEMPORARY_FOLDER).writeFile( - org.apache.iceberg.TestHelpers.Row.of("2020-03-2" + idx, Integer.toString(idx)), ImmutableList.of(record))); + append.appendFile( + new GenericAppenderHelper(table, fileFormat, TEMPORARY_FOLDER) + .writeFile( + org.apache.iceberg.TestHelpers.Row.of("2020-03-2" + idx, Integer.toString(idx)), + ImmutableList.of(record))); idx += 1; } append.commit(); @@ -178,12 +183,18 @@ public void testIdentityPartitionProjections() throws Exception { validateIdentityPartitionProjections(table, Arrays.asList("message", "level"), inputRecords); validateIdentityPartitionProjections(table, Arrays.asList("level", "dt"), inputRecords); // out-of-order triplets - validateIdentityPartitionProjections(table, Arrays.asList("dt", "level", "message"), inputRecords); - validateIdentityPartitionProjections(table, Arrays.asList("level", "dt", "message"), inputRecords); - validateIdentityPartitionProjections(table, Arrays.asList("dt", "message", "level"), inputRecords); - validateIdentityPartitionProjections(table, Arrays.asList("level", "message", "dt"), inputRecords); - validateIdentityPartitionProjections(table, Arrays.asList("message", "dt", "level"), inputRecords); - validateIdentityPartitionProjections(table, Arrays.asList("message", "level", "dt"), inputRecords); + validateIdentityPartitionProjections( + table, Arrays.asList("dt", "level", "message"), inputRecords); + validateIdentityPartitionProjections( + table, Arrays.asList("level", "dt", "message"), inputRecords); + validateIdentityPartitionProjections( + table, Arrays.asList("dt", "message", "level"), inputRecords); + validateIdentityPartitionProjections( + table, Arrays.asList("level", "message", "dt"), inputRecords); + validateIdentityPartitionProjections( + table, Arrays.asList("message", "dt", "level"), inputRecords); + validateIdentityPartitionProjections( + table, Arrays.asList("message", "level", "dt"), inputRecords); } private void validateIdentityPartitionProjections( @@ -197,7 +208,9 @@ private void validateIdentityPartitionProjections( for (int i = 0; i < projectedFields.size(); i++) { String name = projectedFields.get(i); Assert.assertEquals( - "Projected field " + name + " should match", inputRecord.getField(name), actualRecord.getField(i)); + "Projected field " + name + " should match", + inputRecord.getField(name), + actualRecord.getField(i)); } } } @@ -220,10 +233,12 @@ public void testSnapshotReads() throws Exception { TestHelpers.assertRecords( runWithOptions(ImmutableMap.of("snapshot-id", Long.toString(snapshotId))), - expectedRecords, TestFixtures.SCHEMA); + expectedRecords, + TestFixtures.SCHEMA); TestHelpers.assertRecords( runWithOptions(ImmutableMap.of("as-of-timestamp", Long.toString(timestampMillis))), - expectedRecords, TestFixtures.SCHEMA); + expectedRecords, + TestFixtures.SCHEMA); } @Test @@ -250,57 +265,74 @@ public void testIncrementalRead() throws Exception { List expected2 = Lists.newArrayList(); expected2.addAll(records2); expected2.addAll(records3); - TestHelpers.assertRecords(runWithOptions( - ImmutableMap.builder() - .put("start-snapshot-id", Long.toString(snapshotId1)) - .put("end-snapshot-id", Long.toString(snapshotId3)).build()), - expected2, TestFixtures.SCHEMA); + TestHelpers.assertRecords( + runWithOptions( + ImmutableMap.builder() + .put("start-snapshot-id", Long.toString(snapshotId1)) + .put("end-snapshot-id", Long.toString(snapshotId3)) + .build()), + expected2, 
+ TestFixtures.SCHEMA); } @Test public void testFilterExp() throws Exception { - Table table = catalog.createTable(TestFixtures.TABLE_IDENTIFIER, TestFixtures.SCHEMA, TestFixtures.SPEC); + Table table = + catalog.createTable(TestFixtures.TABLE_IDENTIFIER, TestFixtures.SCHEMA, TestFixtures.SPEC); List expectedRecords = RandomGenericData.generate(TestFixtures.SCHEMA, 2, 0L); expectedRecords.get(0).set(2, "2020-03-20"); expectedRecords.get(1).set(2, "2020-03-20"); GenericAppenderHelper helper = new GenericAppenderHelper(table, fileFormat, TEMPORARY_FOLDER); - DataFile dataFile1 = helper.writeFile(org.apache.iceberg.TestHelpers.Row.of("2020-03-20", 0), expectedRecords); - DataFile dataFile2 = helper.writeFile(org.apache.iceberg.TestHelpers.Row.of("2020-03-21", 0), - RandomGenericData.generate(TestFixtures.SCHEMA, 2, 0L)); + DataFile dataFile1 = + helper.writeFile(org.apache.iceberg.TestHelpers.Row.of("2020-03-20", 0), expectedRecords); + DataFile dataFile2 = + helper.writeFile( + org.apache.iceberg.TestHelpers.Row.of("2020-03-21", 0), + RandomGenericData.generate(TestFixtures.SCHEMA, 2, 0L)); helper.appendToTable(dataFile1, dataFile2); - TestHelpers.assertRecords(runWithFilter( - Expressions.equal("dt", "2020-03-20"), "where dt='2020-03-20'"), + TestHelpers.assertRecords( + runWithFilter(Expressions.equal("dt", "2020-03-20"), "where dt='2020-03-20'"), expectedRecords, TestFixtures.SCHEMA); } @Test public void testPartitionTypes() throws Exception { - Schema typesSchema = new Schema( - Types.NestedField.optional(1, "id", Types.IntegerType.get()), - Types.NestedField.optional(2, "decimal", Types.DecimalType.of(38, 18)), - Types.NestedField.optional(3, "str", Types.StringType.get()), - Types.NestedField.optional(4, "binary", Types.BinaryType.get()), - Types.NestedField.optional(5, "date", Types.DateType.get()), - Types.NestedField.optional(6, "time", Types.TimeType.get()), - Types.NestedField.optional(7, "timestamp", Types.TimestampType.withoutZone()) - ); - PartitionSpec spec = PartitionSpec.builderFor(typesSchema).identity("decimal").identity("str").identity("binary") - .identity("date").identity("time").identity("timestamp").build(); + Schema typesSchema = + new Schema( + Types.NestedField.optional(1, "id", Types.IntegerType.get()), + Types.NestedField.optional(2, "decimal", Types.DecimalType.of(38, 18)), + Types.NestedField.optional(3, "str", Types.StringType.get()), + Types.NestedField.optional(4, "binary", Types.BinaryType.get()), + Types.NestedField.optional(5, "date", Types.DateType.get()), + Types.NestedField.optional(6, "time", Types.TimeType.get()), + Types.NestedField.optional(7, "timestamp", Types.TimestampType.withoutZone())); + PartitionSpec spec = + PartitionSpec.builderFor(typesSchema) + .identity("decimal") + .identity("str") + .identity("binary") + .identity("date") + .identity("time") + .identity("timestamp") + .build(); Table table = catalog.createTable(TestFixtures.TABLE_IDENTIFIER, typesSchema, spec); List records = RandomGenericData.generate(typesSchema, 10, 0L); GenericAppenderHelper appender = new GenericAppenderHelper(table, fileFormat, TEMPORARY_FOLDER); for (Record record : records) { - org.apache.iceberg.TestHelpers.Row partition = org.apache.iceberg.TestHelpers.Row.of( - record.get(1), - record.get(2), - record.get(3), - record.get(4) == null ? null : DateTimeUtil.daysFromDate((LocalDate) record.get(4)), - record.get(5) == null ? null : DateTimeUtil.microsFromTime((LocalTime) record.get(5)), - record.get(6) == null ? 
null : DateTimeUtil.microsFromTimestamp((LocalDateTime) record.get(6))); + org.apache.iceberg.TestHelpers.Row partition = + org.apache.iceberg.TestHelpers.Row.of( + record.get(1), + record.get(2), + record.get(3), + record.get(4) == null ? null : DateTimeUtil.daysFromDate((LocalDate) record.get(4)), + record.get(5) == null ? null : DateTimeUtil.microsFromTime((LocalTime) record.get(5)), + record.get(6) == null + ? null + : DateTimeUtil.microsFromTimestamp((LocalDateTime) record.get(6))); appender.appendToTable(partition, Collections.singletonList(record)); } @@ -309,10 +341,14 @@ public void testPartitionTypes() throws Exception { @Test public void testCustomizedFlinkDataTypes() throws Exception { - Schema schema = new Schema( - Types.NestedField.required( - 1, "map", Types.MapType.ofRequired(2, 3, Types.StringType.get(), Types.StringType.get())), - Types.NestedField.required(4, "arr", Types.ListType.ofRequired(5, Types.StringType.get()))); + Schema schema = + new Schema( + Types.NestedField.required( + 1, + "map", + Types.MapType.ofRequired(2, 3, Types.StringType.get(), Types.StringType.get())), + Types.NestedField.required( + 4, "arr", Types.ListType.ofRequired(5, Types.StringType.get()))); Table table = catalog.createTable(TestFixtures.TABLE_IDENTIFIER, schema); List records = RandomGenericData.generate(schema, 10, 0L); GenericAppenderHelper helper = new GenericAppenderHelper(table, fileFormat, TEMPORARY_FOLDER); diff --git a/flink/v1.15/flink/src/test/java/org/apache/iceberg/flink/source/TestFlinkScanSql.java b/flink/v1.15/flink/src/test/java/org/apache/iceberg/flink/source/TestFlinkScanSql.java index fd570ff12445..5d90f5996d1c 100644 --- a/flink/v1.15/flink/src/test/java/org/apache/iceberg/flink/source/TestFlinkScanSql.java +++ b/flink/v1.15/flink/src/test/java/org/apache/iceberg/flink/source/TestFlinkScanSql.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.flink.source; import java.io.IOException; @@ -48,9 +47,7 @@ import org.junit.Assert; import org.junit.Test; -/** - * Test Flink SELECT SQLs. - */ +/** Test Flink SELECT SQLs. */ public class TestFlinkScanSql extends TestFlinkSource { private volatile TableEnvironment tEnv; @@ -62,18 +59,22 @@ public TestFlinkScanSql(String fileFormat) { @Override public void before() throws IOException { super.before(); - sql("create catalog iceberg_catalog with ('type'='iceberg', 'catalog-type'='hadoop', 'warehouse'='%s')", warehouse); + sql( + "create catalog iceberg_catalog with ('type'='iceberg', 'catalog-type'='hadoop', 'warehouse'='%s')", + warehouse); sql("use catalog iceberg_catalog"); - getTableEnv().getConfig().getConfiguration().set(TableConfigOptions.TABLE_DYNAMIC_TABLE_OPTIONS_ENABLED, true); + getTableEnv() + .getConfig() + .getConfiguration() + .set(TableConfigOptions.TABLE_DYNAMIC_TABLE_OPTIONS_ENABLED, true); } private TableEnvironment getTableEnv() { if (tEnv == null) { synchronized (this) { if (tEnv == null) { - this.tEnv = TableEnvironment.create(EnvironmentSettings - .newInstance() - .inBatchMode().build()); + this.tEnv = + TableEnvironment.create(EnvironmentSettings.newInstance().inBatchMode().build()); } } } @@ -81,8 +82,11 @@ private TableEnvironment getTableEnv() { } @Override - protected List run(FlinkSource.Builder formatBuilder, Map sqlOptions, String sqlFilter, - String... sqlSelectedFields) { + protected List run( + FlinkSource.Builder formatBuilder, + Map sqlOptions, + String sqlFilter, + String... 
sqlSelectedFields) { String select = String.join(",", sqlSelectedFields); StringBuilder builder = new StringBuilder(); @@ -103,7 +107,9 @@ protected List run(FlinkSource.Builder formatBuilder, Map s @Test public void testResiduals() throws Exception { - Table table = catalog.createTable(TableIdentifier.of("default", "t"), TestFixtures.SCHEMA, TestFixtures.SPEC); + Table table = + catalog.createTable( + TableIdentifier.of("default", "t"), TestFixtures.SCHEMA, TestFixtures.SPEC); List writeRecords = RandomGenericData.generate(TestFixtures.SCHEMA, 2, 0L); writeRecords.get(0).set(1, 123L); @@ -117,21 +123,29 @@ public void testResiduals() throws Exception { expectedRecords.add(writeRecords.get(0)); DataFile dataFile1 = helper.writeFile(TestHelpers.Row.of("2020-03-20", 0), writeRecords); - DataFile dataFile2 = helper.writeFile(TestHelpers.Row.of("2020-03-21", 0), - RandomGenericData.generate(TestFixtures.SCHEMA, 2, 0L)); + DataFile dataFile2 = + helper.writeFile( + TestHelpers.Row.of("2020-03-21", 0), + RandomGenericData.generate(TestFixtures.SCHEMA, 2, 0L)); helper.appendToTable(dataFile1, dataFile2); - Expression filter = Expressions.and(Expressions.equal("dt", "2020-03-20"), Expressions.equal("id", 123)); - org.apache.iceberg.flink.TestHelpers.assertRecords(runWithFilter( - filter, "where dt='2020-03-20' and id=123"), expectedRecords, TestFixtures.SCHEMA); + Expression filter = + Expressions.and(Expressions.equal("dt", "2020-03-20"), Expressions.equal("id", 123)); + org.apache.iceberg.flink.TestHelpers.assertRecords( + runWithFilter(filter, "where dt='2020-03-20' and id=123"), + expectedRecords, + TestFixtures.SCHEMA); } @Test public void testInferedParallelism() throws IOException { - Table table = catalog.createTable(TableIdentifier.of("default", "t"), TestFixtures.SCHEMA, TestFixtures.SPEC); + Table table = + catalog.createTable( + TableIdentifier.of("default", "t"), TestFixtures.SCHEMA, TestFixtures.SPEC); TableLoader tableLoader = TableLoader.fromHadoopTable(table.location()); - FlinkInputFormat flinkInputFormat = FlinkSource.forRowData().tableLoader(tableLoader).table(table).buildFormat(); + FlinkInputFormat flinkInputFormat = + FlinkSource.forRowData().tableLoader(tableLoader).table(table).buildFormat(); ScanContext scanContext = ScanContext.builder().build(); // Empty table, infer parallelism should be at least 1 @@ -139,44 +153,57 @@ public void testInferedParallelism() throws IOException { Assert.assertEquals("Should produce the expected parallelism.", 1, parallelism); GenericAppenderHelper helper = new GenericAppenderHelper(table, fileFormat, TEMPORARY_FOLDER); - DataFile dataFile1 = helper.writeFile(TestHelpers.Row.of("2020-03-20", 0), - RandomGenericData.generate(TestFixtures.SCHEMA, 2, 0L)); - DataFile dataFile2 = helper.writeFile(TestHelpers.Row.of("2020-03-21", 0), - RandomGenericData.generate(TestFixtures.SCHEMA, 2, 0L)); + DataFile dataFile1 = + helper.writeFile( + TestHelpers.Row.of("2020-03-20", 0), + RandomGenericData.generate(TestFixtures.SCHEMA, 2, 0L)); + DataFile dataFile2 = + helper.writeFile( + TestHelpers.Row.of("2020-03-21", 0), + RandomGenericData.generate(TestFixtures.SCHEMA, 2, 0L)); helper.appendToTable(dataFile1, dataFile2); // Make sure to generate 2 CombinedScanTasks long maxFileLen = Math.max(dataFile1.fileSizeInBytes(), dataFile2.fileSizeInBytes()); - sql("ALTER TABLE t SET ('read.split.open-file-cost'='1', 'read.split.target-size'='%s')", maxFileLen); + sql( + "ALTER TABLE t SET ('read.split.open-file-cost'='1', 'read.split.target-size'='%s')", + 
maxFileLen); - // 2 splits (max infer is the default value 100 , max > splits num), the parallelism is splits num : 2 + // 2 splits (max infer is the default value 100 , max > splits num), the parallelism is splits + // num : 2 parallelism = FlinkSource.forRowData().inferParallelism(flinkInputFormat, scanContext); Assert.assertEquals("Should produce the expected parallelism.", 2, parallelism); // 2 splits and limit is 1 , max infer parallelism is default 100, // which is greater than splits num and limit, the parallelism is the limit value : 1 - parallelism = FlinkSource.forRowData().inferParallelism(flinkInputFormat, ScanContext.builder().limit(1).build()); + parallelism = + FlinkSource.forRowData() + .inferParallelism(flinkInputFormat, ScanContext.builder().limit(1).build()); Assert.assertEquals("Should produce the expected parallelism.", 1, parallelism); // 2 splits and max infer parallelism is 1 (max < splits num), the parallelism is 1 Configuration configuration = new Configuration(); configuration.setInteger(FlinkConfigOptions.TABLE_EXEC_ICEBERG_INFER_SOURCE_PARALLELISM_MAX, 1); - parallelism = FlinkSource.forRowData() - .flinkConf(configuration) - .inferParallelism(flinkInputFormat, ScanContext.builder().build()); + parallelism = + FlinkSource.forRowData() + .flinkConf(configuration) + .inferParallelism(flinkInputFormat, ScanContext.builder().build()); Assert.assertEquals("Should produce the expected parallelism.", 1, parallelism); - // 2 splits, max infer parallelism is 1, limit is 3, the parallelism is max infer parallelism : 1 - parallelism = FlinkSource.forRowData() - .flinkConf(configuration) - .inferParallelism(flinkInputFormat, ScanContext.builder().limit(3).build()); + // 2 splits, max infer parallelism is 1, limit is 3, the parallelism is max infer parallelism : + // 1 + parallelism = + FlinkSource.forRowData() + .flinkConf(configuration) + .inferParallelism(flinkInputFormat, ScanContext.builder().limit(3).build()); Assert.assertEquals("Should produce the expected parallelism.", 1, parallelism); // 2 splits, infer parallelism is disabled, the parallelism is flink default parallelism 1 configuration.setBoolean(FlinkConfigOptions.TABLE_EXEC_ICEBERG_INFER_SOURCE_PARALLELISM, false); - parallelism = FlinkSource.forRowData() - .flinkConf(configuration) - .inferParallelism(flinkInputFormat, ScanContext.builder().limit(3).build()); + parallelism = + FlinkSource.forRowData() + .flinkConf(configuration) + .inferParallelism(flinkInputFormat, ScanContext.builder().limit(3).build()); Assert.assertEquals("Should produce the expected parallelism.", 1, parallelism); } @@ -185,7 +212,8 @@ public void testInferParallelismWithGlobalSetting() throws IOException { Configuration cfg = tEnv.getConfig().getConfiguration(); cfg.set(PipelineOptions.MAX_PARALLELISM, 1); - Table table = catalog.createTable(TableIdentifier.of("default", "t"), TestFixtures.SCHEMA, null); + Table table = + catalog.createTable(TableIdentifier.of("default", "t"), TestFixtures.SCHEMA, null); GenericAppenderHelper helper = new GenericAppenderHelper(table, fileFormat, TEMPORARY_FOLDER); List expectedRecords = Lists.newArrayList(); @@ -199,16 +227,20 @@ public void testInferParallelismWithGlobalSetting() throws IOException { } // Make sure to generate multiple CombinedScanTasks - sql("ALTER TABLE t SET ('read.split.open-file-cost'='1', 'read.split.target-size'='%s')", maxFileLen); + sql( + "ALTER TABLE t SET ('read.split.open-file-cost'='1', 'read.split.target-size'='%s')", + maxFileLen); List results = run(null, 
Maps.newHashMap(), "", "*"); - org.apache.iceberg.flink.TestHelpers.assertRecords(results, expectedRecords, TestFixtures.SCHEMA); + org.apache.iceberg.flink.TestHelpers.assertRecords( + results, expectedRecords, TestFixtures.SCHEMA); } @Test public void testExposeLocality() throws Exception { Table table = - catalog.createTable(TableIdentifier.of("default", "t"), TestFixtures.SCHEMA, TestFixtures.SPEC); + catalog.createTable( + TableIdentifier.of("default", "t"), TestFixtures.SCHEMA, TestFixtures.SPEC); TableLoader tableLoader = TableLoader.fromHadoopTable(table.location()); List expectedRecords = RandomGenericData.generate(TestFixtures.SCHEMA, 10, 0L); @@ -220,22 +252,30 @@ public void testExposeLocality() throws Exception { // test sql api Configuration tableConf = getTableEnv().getConfig().getConfiguration(); - tableConf.setBoolean(FlinkConfigOptions.TABLE_EXEC_ICEBERG_EXPOSE_SPLIT_LOCALITY_INFO.key(), false); + tableConf.setBoolean( + FlinkConfigOptions.TABLE_EXEC_ICEBERG_EXPOSE_SPLIT_LOCALITY_INFO.key(), false); List results = sql("select * from t"); - org.apache.iceberg.flink.TestHelpers.assertRecords(results, expectedRecords, TestFixtures.SCHEMA); + org.apache.iceberg.flink.TestHelpers.assertRecords( + results, expectedRecords, TestFixtures.SCHEMA); // test table api - tableConf.setBoolean(FlinkConfigOptions.TABLE_EXEC_ICEBERG_EXPOSE_SPLIT_LOCALITY_INFO.key(), true); + tableConf.setBoolean( + FlinkConfigOptions.TABLE_EXEC_ICEBERG_EXPOSE_SPLIT_LOCALITY_INFO.key(), true); FlinkSource.Builder builder = FlinkSource.forRowData().tableLoader(tableLoader).table(table); Boolean localityEnabled = - DynMethods.builder("localityEnabled").hiddenImpl(builder.getClass()).build().invoke(builder); - // When running with CI or local, `localityEnabled` will be false even if this configuration is enabled + DynMethods.builder("localityEnabled") + .hiddenImpl(builder.getClass()) + .build() + .invoke(builder); + // When running with CI or local, `localityEnabled` will be false even if this configuration is + // enabled Assert.assertFalse("Expose split locality info should be false.", localityEnabled); results = run(builder, Maps.newHashMap(), "where dt='2020-03-20'", "*"); - org.apache.iceberg.flink.TestHelpers.assertRecords(results, expectedRecords, TestFixtures.SCHEMA); + org.apache.iceberg.flink.TestHelpers.assertRecords( + results, expectedRecords, TestFixtures.SCHEMA); } private List sql(String query, Object... args) { diff --git a/flink/v1.15/flink/src/test/java/org/apache/iceberg/flink/source/TestFlinkSource.java b/flink/v1.15/flink/src/test/java/org/apache/iceberg/flink/source/TestFlinkSource.java index 633a32a4c3d1..3a01952cd9ec 100644 --- a/flink/v1.15/flink/src/test/java/org/apache/iceberg/flink/source/TestFlinkSource.java +++ b/flink/v1.15/flink/src/test/java/org/apache/iceberg/flink/source/TestFlinkSource.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.flink.source; import java.util.Collections; @@ -40,8 +39,10 @@ public abstract class TestFlinkSource extends TestFlinkScan { @Override protected List runWithProjection(String... 
projected) throws Exception { TableSchema.Builder builder = TableSchema.builder(); - TableSchema schema = FlinkSchemaUtil.toSchema(FlinkSchemaUtil.convert( - catalog.loadTable(TableIdentifier.of("default", "t")).schema())); + TableSchema schema = + FlinkSchemaUtil.toSchema( + FlinkSchemaUtil.convert( + catalog.loadTable(TableIdentifier.of("default", "t")).schema())); for (String field : projected) { TableColumn column = schema.getTableColumn(field).get(); builder.field(column.getName(), column.getType()); @@ -51,14 +52,16 @@ protected List runWithProjection(String... projected) throws Exception { @Override protected List runWithFilter(Expression filter, String sqlFilter) throws Exception { - FlinkSource.Builder builder = FlinkSource.forRowData().filters(Collections.singletonList(filter)); + FlinkSource.Builder builder = + FlinkSource.forRowData().filters(Collections.singletonList(filter)); return run(builder, Maps.newHashMap(), sqlFilter, "*"); } @Override protected List runWithOptions(Map options) throws Exception { FlinkSource.Builder builder = FlinkSource.forRowData(); - Optional.ofNullable(options.get("snapshot-id")).ifPresent(value -> builder.snapshotId(Long.parseLong(value))); + Optional.ofNullable(options.get("snapshot-id")) + .ifPresent(value -> builder.snapshotId(Long.parseLong(value))); Optional.ofNullable(options.get("start-snapshot-id")) .ifPresent(value -> builder.startSnapshotId(Long.parseLong(value))); Optional.ofNullable(options.get("end-snapshot-id")) @@ -73,6 +76,10 @@ protected List run() throws Exception { return run(FlinkSource.forRowData(), Maps.newHashMap(), "", "*"); } - protected abstract List run(FlinkSource.Builder formatBuilder, Map sqlOptions, String sqlFilter, - String... sqlSelectedFields) throws Exception; + protected abstract List run( + FlinkSource.Builder formatBuilder, + Map sqlOptions, + String sqlFilter, + String... sqlSelectedFields) + throws Exception; } diff --git a/flink/v1.15/flink/src/test/java/org/apache/iceberg/flink/source/TestIcebergSourceBounded.java b/flink/v1.15/flink/src/test/java/org/apache/iceberg/flink/source/TestIcebergSourceBounded.java index b2e887afff01..0cfa1073886d 100644 --- a/flink/v1.15/flink/src/test/java/org/apache/iceberg/flink/source/TestIcebergSourceBounded.java +++ b/flink/v1.15/flink/src/test/java/org/apache/iceberg/flink/source/TestIcebergSourceBounded.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.flink.source; import java.util.Arrays; @@ -81,8 +80,9 @@ protected List run() throws Exception { return run(null, null, null); } - private List run(Schema projectedSchema, List filters, - Map options) throws Exception { + private List run( + Schema projectedSchema, List filters, Map options) + throws Exception { StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment(); env.setParallelism(1); @@ -94,10 +94,11 @@ private List run(Schema projectedSchema, List filters, table = tableLoader.loadTable(); } - IcebergSource.Builder sourceBuilder = IcebergSource.forRowData() - .tableLoader(tableLoader()) - .assignerFactory(new SimpleSplitAssignerFactory()) - .flinkConfig(config); + IcebergSource.Builder sourceBuilder = + IcebergSource.forRowData() + .tableLoader(tableLoader()) + .assignerFactory(new SimpleSplitAssignerFactory()) + .flinkConfig(config); if (projectedSchema != null) { sourceBuilder.project(projectedSchema); } @@ -108,17 +109,19 @@ private List run(Schema projectedSchema, List filters, sourceBuilder.properties(options); } - DataStream stream = env.fromSource( - sourceBuilder.build(), - WatermarkStrategy.noWatermarks(), - "testBasicRead", - TypeInformation.of(RowData.class)) - .map(new RowDataToRowMapper(FlinkSchemaUtil.convert( - projectedSchema == null ? table.schema() : projectedSchema))); + DataStream stream = + env.fromSource( + sourceBuilder.build(), + WatermarkStrategy.noWatermarks(), + "testBasicRead", + TypeInformation.of(RowData.class)) + .map( + new RowDataToRowMapper( + FlinkSchemaUtil.convert( + projectedSchema == null ? table.schema() : projectedSchema))); try (CloseableIterator iter = stream.executeAndCollect()) { return Lists.newArrayList(iter); } } - } diff --git a/flink/v1.15/flink/src/test/java/org/apache/iceberg/flink/source/TestIcebergSourceContinuous.java b/flink/v1.15/flink/src/test/java/org/apache/iceberg/flink/source/TestIcebergSourceContinuous.java index 41857050e469..582a12523300 100644 --- a/flink/v1.15/flink/src/test/java/org/apache/iceberg/flink/source/TestIcebergSourceContinuous.java +++ b/flink/v1.15/flink/src/test/java/org/apache/iceberg/flink/source/TestIcebergSourceContinuous.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.flink.source; import java.time.Duration; @@ -59,38 +58,41 @@ public class TestIcebergSourceContinuous { public static final MiniClusterWithClientResource MINI_CLUSTER_RESOURCE = MiniClusterResource.createWithClassloaderCheckDisabled(); - @ClassRule - public static final TemporaryFolder TEMPORARY_FOLDER = new TemporaryFolder(); + @ClassRule public static final TemporaryFolder TEMPORARY_FOLDER = new TemporaryFolder(); @Rule - public final HadoopTableResource tableResource = new HadoopTableResource(TEMPORARY_FOLDER, - TestFixtures.DATABASE, TestFixtures.TABLE, TestFixtures.SCHEMA); + public final HadoopTableResource tableResource = + new HadoopTableResource( + TEMPORARY_FOLDER, TestFixtures.DATABASE, TestFixtures.TABLE, TestFixtures.SCHEMA); private final AtomicLong randomSeed = new AtomicLong(0L); @Test public void testTableScanThenIncremental() throws Exception { - GenericAppenderHelper dataAppender = new GenericAppenderHelper( - tableResource.table(), FileFormat.PARQUET, TEMPORARY_FOLDER); + GenericAppenderHelper dataAppender = + new GenericAppenderHelper(tableResource.table(), FileFormat.PARQUET, TEMPORARY_FOLDER); // snapshot1 - List batch1 = RandomGenericData.generate( - tableResource.table().schema(), 2, randomSeed.incrementAndGet()); + List batch1 = + RandomGenericData.generate(tableResource.table().schema(), 2, randomSeed.incrementAndGet()); dataAppender.appendToTable(batch1); - ScanContext scanContext = ScanContext.builder() - .streaming(true) - .monitorInterval(Duration.ofMillis(10L)) - .startingStrategy(StreamingStartingStrategy.TABLE_SCAN_THEN_INCREMENTAL) - .build(); + ScanContext scanContext = + ScanContext.builder() + .streaming(true) + .monitorInterval(Duration.ofMillis(10L)) + .startingStrategy(StreamingStartingStrategy.TABLE_SCAN_THEN_INCREMENTAL) + .build(); - try (CloseableIterator iter = createStream(scanContext).executeAndCollect(getClass().getSimpleName())) { + try (CloseableIterator iter = + createStream(scanContext).executeAndCollect(getClass().getSimpleName())) { List result1 = waitForResult(iter, 2); TestHelpers.assertRecords(result1, batch1, tableResource.table().schema()); // snapshot2 - List batch2 = RandomGenericData.generate( - tableResource.table().schema(), 2, randomSeed.incrementAndGet()); + List batch2 = + RandomGenericData.generate( + tableResource.table().schema(), 2, randomSeed.incrementAndGet()); dataAppender.appendToTable(batch2); tableResource.table().currentSnapshot().snapshotId(); @@ -98,8 +100,9 @@ public void testTableScanThenIncremental() throws Exception { TestHelpers.assertRecords(result2, batch2, tableResource.table().schema()); // snapshot3 - List batch3 = RandomGenericData.generate( - tableResource.table().schema(), 2, randomSeed.incrementAndGet()); + List batch3 = + RandomGenericData.generate( + tableResource.table().schema(), 2, randomSeed.incrementAndGet()); dataAppender.appendToTable(batch3); tableResource.table().currentSnapshot().snapshotId(); @@ -110,42 +113,46 @@ public void testTableScanThenIncremental() throws Exception { @Test public void testEarliestSnapshot() throws Exception { - GenericAppenderHelper dataAppender = new GenericAppenderHelper( - tableResource.table(), FileFormat.PARQUET, TEMPORARY_FOLDER); + GenericAppenderHelper dataAppender = + new GenericAppenderHelper(tableResource.table(), FileFormat.PARQUET, TEMPORARY_FOLDER); // snapshot0 - List batch0 = RandomGenericData.generate( - tableResource.table().schema(), 2, randomSeed.incrementAndGet()); + List batch0 = + 
RandomGenericData.generate(tableResource.table().schema(), 2, randomSeed.incrementAndGet()); dataAppender.appendToTable(batch0); // snapshot1 - List batch1 = RandomGenericData.generate( - tableResource.table().schema(), 2, randomSeed.incrementAndGet()); + List batch1 = + RandomGenericData.generate(tableResource.table().schema(), 2, randomSeed.incrementAndGet()); dataAppender.appendToTable(batch1); - ScanContext scanContext = ScanContext.builder() - .streaming(true) - .monitorInterval(Duration.ofMillis(10L)) - .startingStrategy(StreamingStartingStrategy.INCREMENTAL_FROM_EARLIEST_SNAPSHOT) - .build(); + ScanContext scanContext = + ScanContext.builder() + .streaming(true) + .monitorInterval(Duration.ofMillis(10L)) + .startingStrategy(StreamingStartingStrategy.INCREMENTAL_FROM_EARLIEST_SNAPSHOT) + .build(); - try (CloseableIterator iter = createStream(scanContext).executeAndCollect(getClass().getSimpleName())) { + try (CloseableIterator iter = + createStream(scanContext).executeAndCollect(getClass().getSimpleName())) { List result1 = waitForResult(iter, 4); List combinedBatch0AndBatch1 = Lists.newArrayList(batch0); combinedBatch0AndBatch1.addAll(batch1); TestHelpers.assertRecords(result1, combinedBatch0AndBatch1, tableResource.table().schema()); // snapshot2 - List batch2 = RandomGenericData.generate( - tableResource.table().schema(), 2, randomSeed.incrementAndGet()); + List batch2 = + RandomGenericData.generate( + tableResource.table().schema(), 2, randomSeed.incrementAndGet()); dataAppender.appendToTable(batch2); List result2 = waitForResult(iter, 2); TestHelpers.assertRecords(result2, batch2, tableResource.table().schema()); // snapshot3 - List batch3 = RandomGenericData.generate( - tableResource.table().schema(), 2, randomSeed.incrementAndGet()); + List batch3 = + RandomGenericData.generate( + tableResource.table().schema(), 2, randomSeed.incrementAndGet()); dataAppender.appendToTable(batch3); List result3 = waitForResult(iter, 2); @@ -155,26 +162,28 @@ public void testEarliestSnapshot() throws Exception { @Test public void testLatestSnapshot() throws Exception { - GenericAppenderHelper dataAppender = new GenericAppenderHelper( - tableResource.table(), FileFormat.PARQUET, TEMPORARY_FOLDER); + GenericAppenderHelper dataAppender = + new GenericAppenderHelper(tableResource.table(), FileFormat.PARQUET, TEMPORARY_FOLDER); // snapshot0 - List batch0 = RandomGenericData.generate( - tableResource.table().schema(), 2, randomSeed.incrementAndGet()); + List batch0 = + RandomGenericData.generate(tableResource.table().schema(), 2, randomSeed.incrementAndGet()); dataAppender.appendToTable(batch0); // snapshot1 - List batch1 = RandomGenericData.generate( - tableResource.table().schema(), 2, randomSeed.incrementAndGet()); + List batch1 = + RandomGenericData.generate(tableResource.table().schema(), 2, randomSeed.incrementAndGet()); dataAppender.appendToTable(batch1); - ScanContext scanContext = ScanContext.builder() - .streaming(true) - .monitorInterval(Duration.ofMillis(10L)) - .startingStrategy(StreamingStartingStrategy.INCREMENTAL_FROM_LATEST_SNAPSHOT) - .build(); + ScanContext scanContext = + ScanContext.builder() + .streaming(true) + .monitorInterval(Duration.ofMillis(10L)) + .startingStrategy(StreamingStartingStrategy.INCREMENTAL_FROM_LATEST_SNAPSHOT) + .build(); - try (CloseableIterator iter = createStream(scanContext).executeAndCollect(getClass().getSimpleName())) { + try (CloseableIterator iter = + createStream(scanContext).executeAndCollect(getClass().getSimpleName())) { // we want to make sure 
job is running first so that enumerator can // start from the latest snapshot before inserting the next batch2 below. waitUntilJobIsRunning(MINI_CLUSTER_RESOURCE.getClusterClient()); @@ -184,16 +193,18 @@ public void testLatestSnapshot() throws Exception { TestHelpers.assertRecords(result1, batch1, tableResource.table().schema()); // snapshot2 - List batch2 = RandomGenericData.generate( - tableResource.table().schema(), 2, randomSeed.incrementAndGet()); + List batch2 = + RandomGenericData.generate( + tableResource.table().schema(), 2, randomSeed.incrementAndGet()); dataAppender.appendToTable(batch2); List result2 = waitForResult(iter, 2); TestHelpers.assertRecords(result2, batch2, tableResource.table().schema()); // snapshot3 - List batch3 = RandomGenericData.generate( - tableResource.table().schema(), 2, randomSeed.incrementAndGet()); + List batch3 = + RandomGenericData.generate( + tableResource.table().schema(), 2, randomSeed.incrementAndGet()); dataAppender.appendToTable(batch3); List result3 = waitForResult(iter, 2); @@ -203,43 +214,47 @@ public void testLatestSnapshot() throws Exception { @Test public void testSpecificSnapshotId() throws Exception { - GenericAppenderHelper dataAppender = new GenericAppenderHelper( - tableResource.table(), FileFormat.PARQUET, TEMPORARY_FOLDER); + GenericAppenderHelper dataAppender = + new GenericAppenderHelper(tableResource.table(), FileFormat.PARQUET, TEMPORARY_FOLDER); // snapshot0 - List batch0 = RandomGenericData.generate( - tableResource.table().schema(), 2, randomSeed.incrementAndGet()); + List batch0 = + RandomGenericData.generate(tableResource.table().schema(), 2, randomSeed.incrementAndGet()); dataAppender.appendToTable(batch0); long snapshot0 = tableResource.table().currentSnapshot().snapshotId(); // snapshot1 - List batch1 = RandomGenericData.generate( - tableResource.table().schema(), 2, randomSeed.incrementAndGet()); + List batch1 = + RandomGenericData.generate(tableResource.table().schema(), 2, randomSeed.incrementAndGet()); dataAppender.appendToTable(batch1); long snapshot1 = tableResource.table().currentSnapshot().snapshotId(); - ScanContext scanContext = ScanContext.builder() - .streaming(true) - .monitorInterval(Duration.ofMillis(10L)) - .startingStrategy(StreamingStartingStrategy.INCREMENTAL_FROM_SNAPSHOT_ID) - .startSnapshotId(snapshot1) - .build(); + ScanContext scanContext = + ScanContext.builder() + .streaming(true) + .monitorInterval(Duration.ofMillis(10L)) + .startingStrategy(StreamingStartingStrategy.INCREMENTAL_FROM_SNAPSHOT_ID) + .startSnapshotId(snapshot1) + .build(); - try (CloseableIterator iter = createStream(scanContext).executeAndCollect(getClass().getSimpleName())) { + try (CloseableIterator iter = + createStream(scanContext).executeAndCollect(getClass().getSimpleName())) { List result1 = waitForResult(iter, 2); TestHelpers.assertRecords(result1, batch1, tableResource.table().schema()); // snapshot2 - List batch2 = RandomGenericData.generate( - tableResource.table().schema(), 2, randomSeed.incrementAndGet()); + List batch2 = + RandomGenericData.generate( + tableResource.table().schema(), 2, randomSeed.incrementAndGet()); dataAppender.appendToTable(batch2); List result2 = waitForResult(iter, 2); TestHelpers.assertRecords(result2, batch2, tableResource.table().schema()); // snapshot3 - List batch3 = RandomGenericData.generate( - tableResource.table().schema(), 2, randomSeed.incrementAndGet()); + List batch3 = + RandomGenericData.generate( + tableResource.table().schema(), 2, randomSeed.incrementAndGet()); 
dataAppender.appendToTable(batch3); List result3 = waitForResult(iter, 2); @@ -249,12 +264,12 @@ public void testSpecificSnapshotId() throws Exception { @Test public void testSpecificSnapshotTimestamp() throws Exception { - GenericAppenderHelper dataAppender = new GenericAppenderHelper( - tableResource.table(), FileFormat.PARQUET, TEMPORARY_FOLDER); + GenericAppenderHelper dataAppender = + new GenericAppenderHelper(tableResource.table(), FileFormat.PARQUET, TEMPORARY_FOLDER); // snapshot0 - List batch0 = RandomGenericData.generate( - tableResource.table().schema(), 2, randomSeed.incrementAndGet()); + List batch0 = + RandomGenericData.generate(tableResource.table().schema(), 2, randomSeed.incrementAndGet()); dataAppender.appendToTable(batch0); long snapshot0Timestamp = tableResource.table().currentSnapshot().timestampMillis(); @@ -262,34 +277,38 @@ public void testSpecificSnapshotTimestamp() throws Exception { Thread.sleep(2); // snapshot1 - List batch1 = RandomGenericData.generate( - tableResource.table().schema(), 2, randomSeed.incrementAndGet()); + List batch1 = + RandomGenericData.generate(tableResource.table().schema(), 2, randomSeed.incrementAndGet()); dataAppender.appendToTable(batch1); long snapshot1Timestamp = tableResource.table().currentSnapshot().timestampMillis(); - ScanContext scanContext = ScanContext.builder() - .streaming(true) - .monitorInterval(Duration.ofMillis(10L)) - .startingStrategy(StreamingStartingStrategy.INCREMENTAL_FROM_SNAPSHOT_TIMESTAMP) - .startSnapshotTimestamp(snapshot1Timestamp) - .build(); + ScanContext scanContext = + ScanContext.builder() + .streaming(true) + .monitorInterval(Duration.ofMillis(10L)) + .startingStrategy(StreamingStartingStrategy.INCREMENTAL_FROM_SNAPSHOT_TIMESTAMP) + .startSnapshotTimestamp(snapshot1Timestamp) + .build(); - try (CloseableIterator iter = createStream(scanContext).executeAndCollect(getClass().getSimpleName())) { + try (CloseableIterator iter = + createStream(scanContext).executeAndCollect(getClass().getSimpleName())) { // consume data from snapshot1 List result1 = waitForResult(iter, 2); TestHelpers.assertRecords(result1, batch1, tableResource.table().schema()); // snapshot2 - List batch2 = RandomGenericData.generate( - tableResource.table().schema(), 2, randomSeed.incrementAndGet()); + List batch2 = + RandomGenericData.generate( + tableResource.table().schema(), 2, randomSeed.incrementAndGet()); dataAppender.appendToTable(batch2); List result2 = waitForResult(iter, 2); TestHelpers.assertRecords(result2, batch2, tableResource.table().schema()); // snapshot3 - List batch3 = RandomGenericData.generate( - tableResource.table().schema(), 2, randomSeed.incrementAndGet()); + List batch3 = + RandomGenericData.generate( + tableResource.table().schema(), 2, randomSeed.incrementAndGet()); dataAppender.appendToTable(batch3); List result3 = waitForResult(iter, 2); @@ -301,20 +320,21 @@ private DataStream createStream(ScanContext scanContext) throws Exception { // start the source and collect output StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment(); env.setParallelism(1); - DataStream stream = env.fromSource( - IcebergSource.forRowData() - .tableLoader(tableResource.tableLoader()) - .assignerFactory(new SimpleSplitAssignerFactory()) - .streaming(scanContext.isStreaming()) - .streamingStartingStrategy(scanContext.streamingStartingStrategy()) - .startSnapshotTimestamp(scanContext.startSnapshotTimestamp()) - .startSnapshotId(scanContext.startSnapshotId()) - .monitorInterval(Duration.ofMillis(10L)) - 
.build(), - WatermarkStrategy.noWatermarks(), - "icebergSource", - TypeInformation.of(RowData.class)) - .map(new RowDataToRowMapper(FlinkSchemaUtil.convert(tableResource.table().schema()))); + DataStream stream = + env.fromSource( + IcebergSource.forRowData() + .tableLoader(tableResource.tableLoader()) + .assignerFactory(new SimpleSplitAssignerFactory()) + .streaming(scanContext.isStreaming()) + .streamingStartingStrategy(scanContext.streamingStartingStrategy()) + .startSnapshotTimestamp(scanContext.startSnapshotTimestamp()) + .startSnapshotId(scanContext.startSnapshotId()) + .monitorInterval(Duration.ofMillis(10L)) + .build(), + WatermarkStrategy.noWatermarks(), + "icebergSource", + TypeInformation.of(RowData.class)) + .map(new RowDataToRowMapper(FlinkSchemaUtil.convert(tableResource.table().schema()))); return stream; } diff --git a/flink/v1.15/flink/src/test/java/org/apache/iceberg/flink/source/TestIcebergSourceFailover.java b/flink/v1.15/flink/src/test/java/org/apache/iceberg/flink/source/TestIcebergSourceFailover.java index b16a277c3886..cad1fa67ae19 100644 --- a/flink/v1.15/flink/src/test/java/org/apache/iceberg/flink/source/TestIcebergSourceFailover.java +++ b/flink/v1.15/flink/src/test/java/org/apache/iceberg/flink/source/TestIcebergSourceFailover.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.flink.source; import java.time.Duration; @@ -60,9 +59,7 @@ public class TestIcebergSourceFailover { private static final int PARALLELISM = 4; - @ClassRule - public static final TemporaryFolder TEMPORARY_FOLDER = new TemporaryFolder(); - + @ClassRule public static final TemporaryFolder TEMPORARY_FOLDER = new TemporaryFolder(); @Rule public final MiniClusterWithClientResource miniClusterResource = @@ -75,12 +72,14 @@ public class TestIcebergSourceFailover { .build()); @Rule - public final HadoopTableResource sourceTableResource = new HadoopTableResource(TEMPORARY_FOLDER, - TestFixtures.DATABASE, TestFixtures.TABLE, schema()); + public final HadoopTableResource sourceTableResource = + new HadoopTableResource( + TEMPORARY_FOLDER, TestFixtures.DATABASE, TestFixtures.TABLE, schema()); @Rule - public final HadoopTableResource sinkTableResource = new HadoopTableResource(TEMPORARY_FOLDER, - TestFixtures.DATABASE, TestFixtures.SINK_TABLE, schema()); + public final HadoopTableResource sinkTableResource = + new HadoopTableResource( + TEMPORARY_FOLDER, TestFixtures.DATABASE, TestFixtures.SINK_TABLE, schema()); protected IcebergSource.Builder sourceBuilder() { Configuration config = new Configuration(); @@ -99,10 +98,9 @@ protected List generateRecords(int numRecords, long seed) { return RandomGenericData.generate(schema(), numRecords, seed); } - protected void assertRecords(Table table, List expectedRecords, Duration interval, int maxCount) - throws Exception { - SimpleDataUtil.assertTableRecords(table, - expectedRecords, interval, maxCount); + protected void assertRecords( + Table table, List expectedRecords, Duration interval, int maxCount) throws Exception { + SimpleDataUtil.assertTableRecords(table, expectedRecords, interval, maxCount); } @Test @@ -117,8 +115,9 @@ public void testBoundedWithJobManagerFailover() throws Exception { private void testBoundedIcebergSource(FailoverType failoverType) throws Exception { List expectedRecords = Lists.newArrayList(); - GenericAppenderHelper dataAppender = new GenericAppenderHelper( - sourceTableResource.table(), FileFormat.PARQUET, TEMPORARY_FOLDER); + GenericAppenderHelper 
dataAppender = + new GenericAppenderHelper( + sourceTableResource.table(), FileFormat.PARQUET, TEMPORARY_FOLDER); for (int i = 0; i < 4; ++i) { List records = generateRecords(2, i); expectedRecords.addAll(records); @@ -129,11 +128,12 @@ private void testBoundedIcebergSource(FailoverType failoverType) throws Exceptio env.setParallelism(PARALLELISM); env.setRestartStrategy(RestartStrategies.fixedDelayRestart(1, 0)); - DataStream stream = env.fromSource( - sourceBuilder().build(), - WatermarkStrategy.noWatermarks(), - "IcebergSource", - TypeInformation.of(RowData.class)); + DataStream stream = + env.fromSource( + sourceBuilder().build(), + WatermarkStrategy.noWatermarks(), + "IcebergSource", + TypeInformation.of(RowData.class)); DataStream streamFailingInTheMiddleOfReading = RecordCounterToFail.wrapWithFailureAfter(stream, expectedRecords.size() / 2); @@ -170,8 +170,9 @@ public void testContinuousWithJobManagerFailover() throws Exception { } private void testContinuousIcebergSource(FailoverType failoverType) throws Exception { - GenericAppenderHelper dataAppender = new GenericAppenderHelper( - sourceTableResource.table(), FileFormat.PARQUET, TEMPORARY_FOLDER); + GenericAppenderHelper dataAppender = + new GenericAppenderHelper( + sourceTableResource.table(), FileFormat.PARQUET, TEMPORARY_FOLDER); List expectedRecords = Lists.newArrayList(); List batch = generateRecords(2, 0); @@ -184,15 +185,16 @@ private void testContinuousIcebergSource(FailoverType failoverType) throws Excep Configuration config = new Configuration(); config.setInteger(FlinkConfigOptions.SOURCE_READER_FETCH_BATCH_RECORD_COUNT, 128); - DataStream stream = env.fromSource( - sourceBuilder() - .streaming(true) - .monitorInterval(Duration.ofMillis(10)) - .streamingStartingStrategy(StreamingStartingStrategy.TABLE_SCAN_THEN_INCREMENTAL) - .build(), - WatermarkStrategy.noWatermarks(), - "IcebergSource", - TypeInformation.of(RowData.class)); + DataStream stream = + env.fromSource( + sourceBuilder() + .streaming(true) + .monitorInterval(Duration.ofMillis(10)) + .streamingStartingStrategy(StreamingStartingStrategy.TABLE_SCAN_THEN_INCREMENTAL) + .build(), + WatermarkStrategy.noWatermarks(), + "IcebergSource", + TypeInformation.of(RowData.class)); // CollectStreamSink from DataStream#executeAndCollect() doesn't guarantee // exactly-once behavior. When Iceberg sink, we can verify end-to-end @@ -211,8 +213,7 @@ private void testContinuousIcebergSource(FailoverType failoverType) throws Excep expectedRecords.addAll(records); dataAppender.appendToTable(records); if (i == 2) { - triggerFailover(failoverType, jobId, () -> { - }, miniClusterResource.getMiniCluster()); + triggerFailover(failoverType, jobId, () -> {}, miniClusterResource.getMiniCluster()); } } diff --git a/flink/v1.15/flink/src/test/java/org/apache/iceberg/flink/source/TestIcebergSourceReaderDeletes.java b/flink/v1.15/flink/src/test/java/org/apache/iceberg/flink/source/TestIcebergSourceReaderDeletes.java index 07b08590ba52..2974f4bc94a2 100644 --- a/flink/v1.15/flink/src/test/java/org/apache/iceberg/flink/source/TestIcebergSourceReaderDeletes.java +++ b/flink/v1.15/flink/src/test/java/org/apache/iceberg/flink/source/TestIcebergSourceReaderDeletes.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.flink.source; import java.io.IOException; @@ -52,58 +51,64 @@ public class TestIcebergSourceReaderDeletes extends TestFlinkReaderDeletesBase { private static final int PARALLELISM = 4; - @ClassRule - public static final TemporaryFolder TMP_FOLDER = new TemporaryFolder(); + @ClassRule public static final TemporaryFolder TMP_FOLDER = new TemporaryFolder(); @ClassRule - public static final MiniClusterWithClientResource MINI_CLUSTER = new MiniClusterWithClientResource( - new MiniClusterResourceConfiguration.Builder() - .setNumberTaskManagers(1) - .setNumberSlotsPerTaskManager(PARALLELISM) - .build()); + public static final MiniClusterWithClientResource MINI_CLUSTER = + new MiniClusterWithClientResource( + new MiniClusterResourceConfiguration.Builder() + .setNumberTaskManagers(1) + .setNumberSlotsPerTaskManager(PARALLELISM) + .build()); public TestIcebergSourceReaderDeletes(FileFormat inputFormat) { super(inputFormat); } @Override - protected StructLikeSet rowSet(String tableName, Table testTable, String... columns) throws IOException { + protected StructLikeSet rowSet(String tableName, Table testTable, String... columns) + throws IOException { Schema projected = testTable.schema().select(columns); RowType rowType = FlinkSchemaUtil.convert(projected); Map properties = Maps.newHashMap(); - properties.put(CatalogProperties.WAREHOUSE_LOCATION, hiveConf.get(HiveConf.ConfVars.METASTOREWAREHOUSE.varname)); + properties.put( + CatalogProperties.WAREHOUSE_LOCATION, + hiveConf.get(HiveConf.ConfVars.METASTOREWAREHOUSE.varname)); properties.put(CatalogProperties.URI, hiveConf.get(HiveConf.ConfVars.METASTOREURIS.varname)); - properties.put(CatalogProperties.CLIENT_POOL_SIZE, + properties.put( + CatalogProperties.CLIENT_POOL_SIZE, Integer.toString(hiveConf.getInt("iceberg.hive.client-pool-size", 5))); CatalogLoader hiveCatalogLoader = CatalogLoader.hive(catalog.name(), hiveConf, properties); - TableLoader hiveTableLoader = TableLoader.fromCatalog(hiveCatalogLoader, TableIdentifier.of("default", tableName)); + TableLoader hiveTableLoader = + TableLoader.fromCatalog(hiveCatalogLoader, TableIdentifier.of("default", tableName)); hiveTableLoader.open(); try (TableLoader tableLoader = hiveTableLoader) { StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment(); env.setParallelism(1); - DataStream stream = env.fromSource( - IcebergSource.builder() - .tableLoader(tableLoader) - .assignerFactory(new SimpleSplitAssignerFactory()) - .project(projected) - .build(), - WatermarkStrategy.noWatermarks(), - "testBasicRead", - TypeInformation.of(RowData.class)); + DataStream stream = + env.fromSource( + IcebergSource.builder() + .tableLoader(tableLoader) + .assignerFactory(new SimpleSplitAssignerFactory()) + .project(projected) + .build(), + WatermarkStrategy.noWatermarks(), + "testBasicRead", + TypeInformation.of(RowData.class)); try (CloseableIterator iter = stream.executeAndCollect()) { List rowDataList = Lists.newArrayList(iter); StructLikeSet set = StructLikeSet.create(projected.asStruct()); - rowDataList.forEach(rowData -> { - RowDataWrapper wrapper = new RowDataWrapper(rowType, projected.asStruct()); - set.add(wrapper.wrap(rowData)); - }); + rowDataList.forEach( + rowData -> { + RowDataWrapper wrapper = new RowDataWrapper(rowType, projected.asStruct()); + set.add(wrapper.wrap(rowData)); + }); return set; } catch (Exception e) { throw new IOException("Failed to collect result", e); } } } - } diff --git 
a/flink/v1.15/flink/src/test/java/org/apache/iceberg/flink/source/TestProjectMetaColumn.java b/flink/v1.15/flink/src/test/java/org/apache/iceberg/flink/source/TestProjectMetaColumn.java index baccadc31f08..bc63e4a0b282 100644 --- a/flink/v1.15/flink/src/test/java/org/apache/iceberg/flink/source/TestProjectMetaColumn.java +++ b/flink/v1.15/flink/src/test/java/org/apache/iceberg/flink/source/TestProjectMetaColumn.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.flink.source; import java.io.IOException; @@ -51,8 +50,7 @@ @RunWith(Parameterized.class) public class TestProjectMetaColumn { - @Rule - public final TemporaryFolder folder = new TemporaryFolder(); + @Rule public final TemporaryFolder folder = new TemporaryFolder(); private final FileFormat format; @Parameterized.Parameters(name = "fileFormat={0}") @@ -60,8 +58,7 @@ public static Iterable parameters() { return Lists.newArrayList( new Object[] {FileFormat.PARQUET}, new Object[] {FileFormat.ORC}, - new Object[] {FileFormat.AVRO} - ); + new Object[] {FileFormat.AVRO}); } public TestProjectMetaColumn(FileFormat format) { @@ -71,28 +68,30 @@ public TestProjectMetaColumn(FileFormat format) { private void testSkipToRemoveMetaColumn(int formatVersion) throws IOException { // Create the table with given format version. String location = folder.getRoot().getAbsolutePath(); - Table table = SimpleDataUtil.createTable(location, - ImmutableMap.of(TableProperties.FORMAT_VERSION, String.valueOf(formatVersion)), - false); - - List rows = Lists.newArrayList( - SimpleDataUtil.createInsert(1, "AAA"), - SimpleDataUtil.createInsert(2, "BBB"), - SimpleDataUtil.createInsert(3, "CCC") - ); + Table table = + SimpleDataUtil.createTable( + location, + ImmutableMap.of(TableProperties.FORMAT_VERSION, String.valueOf(formatVersion)), + false); + + List rows = + Lists.newArrayList( + SimpleDataUtil.createInsert(1, "AAA"), + SimpleDataUtil.createInsert(2, "BBB"), + SimpleDataUtil.createInsert(3, "CCC")); writeAndCommit(table, ImmutableList.of(), false, rows); - FlinkInputFormat input = FlinkSource - .forRowData() - .tableLoader(TableLoader.fromHadoopTable(location)) - .buildFormat(); + FlinkInputFormat input = + FlinkSource.forRowData().tableLoader(TableLoader.fromHadoopTable(location)).buildFormat(); List results = Lists.newArrayList(); - TestHelpers.readRowData(input, rowData -> { - // If project to remove the meta columns, it will get a RowDataProjection. - Assert.assertTrue(rowData instanceof GenericRowData); - results.add(TestHelpers.copyRowData(rowData, SimpleDataUtil.ROW_TYPE)); - }); + TestHelpers.readRowData( + input, + rowData -> { + // If project to remove the meta columns, it will get a RowDataProjection. + Assert.assertTrue(rowData instanceof GenericRowData); + results.add(TestHelpers.copyRowData(rowData, SimpleDataUtil.ROW_TYPE)); + }); // Assert the results. TestHelpers.assertRows(rows, results, SimpleDataUtil.ROW_TYPE); @@ -112,37 +111,41 @@ public void testV2SkipToRemoveMetaColumn() throws IOException { public void testV2RemoveMetaColumn() throws Exception { // Create the v2 table. 
String location = folder.getRoot().getAbsolutePath(); - Table table = SimpleDataUtil.createTable(location, ImmutableMap.of(TableProperties.FORMAT_VERSION, "2"), false); - - List rows = Lists.newArrayList( - SimpleDataUtil.createInsert(1, "AAA"), - SimpleDataUtil.createDelete(1, "AAA"), - SimpleDataUtil.createInsert(2, "AAA"), - SimpleDataUtil.createInsert(2, "BBB") - ); + Table table = + SimpleDataUtil.createTable( + location, ImmutableMap.of(TableProperties.FORMAT_VERSION, "2"), false); + + List rows = + Lists.newArrayList( + SimpleDataUtil.createInsert(1, "AAA"), + SimpleDataUtil.createDelete(1, "AAA"), + SimpleDataUtil.createInsert(2, "AAA"), + SimpleDataUtil.createInsert(2, "BBB")); int eqFieldId = table.schema().findField("data").fieldId(); writeAndCommit(table, ImmutableList.of(eqFieldId), true, rows); - FlinkInputFormat input = FlinkSource - .forRowData() - .tableLoader(TableLoader.fromHadoopTable(location)) - .buildFormat(); + FlinkInputFormat input = + FlinkSource.forRowData().tableLoader(TableLoader.fromHadoopTable(location)).buildFormat(); List results = Lists.newArrayList(); - TestHelpers.readRowData(input, rowData -> { - // If project to remove the meta columns, it will get a RowDataProjection. - Assert.assertTrue(rowData instanceof RowDataProjection); - results.add(TestHelpers.copyRowData(rowData, SimpleDataUtil.ROW_TYPE)); - }); + TestHelpers.readRowData( + input, + rowData -> { + // If project to remove the meta columns, it will get a RowDataProjection. + Assert.assertTrue(rowData instanceof RowDataProjection); + results.add(TestHelpers.copyRowData(rowData, SimpleDataUtil.ROW_TYPE)); + }); // Assert the results. - TestHelpers.assertRows(ImmutableList.of( - SimpleDataUtil.createInsert(2, "AAA"), - SimpleDataUtil.createInsert(2, "BBB") - ), results, SimpleDataUtil.ROW_TYPE); + TestHelpers.assertRows( + ImmutableList.of( + SimpleDataUtil.createInsert(2, "AAA"), SimpleDataUtil.createInsert(2, "BBB")), + results, + SimpleDataUtil.ROW_TYPE); } - private void writeAndCommit(Table table, List eqFieldIds, boolean upsert, List rows) + private void writeAndCommit( + Table table, List eqFieldIds, boolean upsert, List rows) throws IOException { TaskWriter writer = createTaskWriter(table, eqFieldIds, upsert); try (TaskWriter io = writer) { @@ -165,14 +168,16 @@ private void writeAndCommit(Table table, List eqFieldIds, boolean upser delta.commit(); } - private TaskWriter createTaskWriter(Table table, List equalityFieldIds, boolean upsert) { - TaskWriterFactory taskWriterFactory = new RowDataTaskWriterFactory( - SerializableTable.copyOf(table), - SimpleDataUtil.ROW_TYPE, - TableProperties.WRITE_TARGET_FILE_SIZE_BYTES_DEFAULT, - format, - equalityFieldIds, - upsert); + private TaskWriter createTaskWriter( + Table table, List equalityFieldIds, boolean upsert) { + TaskWriterFactory taskWriterFactory = + new RowDataTaskWriterFactory( + SerializableTable.copyOf(table), + SimpleDataUtil.ROW_TYPE, + TableProperties.WRITE_TARGET_FILE_SIZE_BYTES_DEFAULT, + format, + equalityFieldIds, + upsert); taskWriterFactory.initialize(1, 1); return taskWriterFactory.create(); diff --git a/flink/v1.15/flink/src/test/java/org/apache/iceberg/flink/source/TestStreamScanSql.java b/flink/v1.15/flink/src/test/java/org/apache/iceberg/flink/source/TestStreamScanSql.java index 174d40258371..10fa4ecf1329 100644 --- a/flink/v1.15/flink/src/test/java/org/apache/iceberg/flink/source/TestStreamScanSql.java +++ b/flink/v1.15/flink/src/test/java/org/apache/iceberg/flink/source/TestStreamScanSql.java @@ -16,7 +16,6 @@ * 
specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.flink.source; import java.io.IOException; @@ -63,16 +62,18 @@ protected TableEnvironment getTableEnv() { if (tEnv == null) { synchronized (this) { if (tEnv == null) { - EnvironmentSettings.Builder settingsBuilder = EnvironmentSettings - .newInstance() - .inStreamingMode(); + EnvironmentSettings.Builder settingsBuilder = + EnvironmentSettings.newInstance().inStreamingMode(); - StreamExecutionEnvironment env = StreamExecutionEnvironment - .getExecutionEnvironment(MiniClusterResource.DISABLE_CLASSLOADER_CHECK_CONFIG); + StreamExecutionEnvironment env = + StreamExecutionEnvironment.getExecutionEnvironment( + MiniClusterResource.DISABLE_CLASSLOADER_CHECK_CONFIG); env.enableCheckpointing(400); - StreamTableEnvironment streamTableEnv = StreamTableEnvironment.create(env, settingsBuilder.build()); - streamTableEnv.getConfig() + StreamTableEnvironment streamTableEnv = + StreamTableEnvironment.create(env, settingsBuilder.build()); + streamTableEnv + .getConfig() .getConfiguration() .set(TableConfigOptions.TABLE_DYNAMIC_TABLE_OPTIONS_ENABLED, true); tEnv = streamTableEnv; @@ -105,11 +106,11 @@ private void insertRows(String partition, Table table, Row... rows) throws IOExc GenericRecord gRecord = GenericRecord.create(table.schema()); List records = Lists.newArrayList(); for (Row row : rows) { - records.add(gRecord.copy( - "id", row.getField(0), - "data", row.getField(1), - "dt", row.getField(2) - )); + records.add( + gRecord.copy( + "id", row.getField(0), + "data", row.getField(1), + "dt", row.getField(2))); } if (partition != null) { @@ -129,9 +130,12 @@ private void assertRows(List expectedRows, Iterator iterator) { Row actualRow = iterator.next(); Assert.assertEquals("Should have expected fields", 3, actualRow.getArity()); - Assert.assertEquals("Should have expected id", expectedRow.getField(0), actualRow.getField(0)); - Assert.assertEquals("Should have expected data", expectedRow.getField(1), actualRow.getField(1)); - Assert.assertEquals("Should have expected dt", expectedRow.getField(2), actualRow.getField(2)); + Assert.assertEquals( + "Should have expected id", expectedRow.getField(0), actualRow.getField(0)); + Assert.assertEquals( + "Should have expected data", expectedRow.getField(1), actualRow.getField(1)); + Assert.assertEquals( + "Should have expected dt", expectedRow.getField(2), actualRow.getField(2)); } } @@ -140,7 +144,8 @@ public void testUnPartitionedTable() throws Exception { sql("CREATE TABLE %s (id INT, data VARCHAR, dt VARCHAR)", TABLE); Table table = validationCatalog.loadTable(TableIdentifier.of(icebergNamespace, TABLE)); - TableResult result = exec("SELECT * FROM %s /*+ OPTIONS('streaming'='true', 'monitor-interval'='1s')*/", TABLE); + TableResult result = + exec("SELECT * FROM %s /*+ OPTIONS('streaming'='true', 'monitor-interval'='1s')*/", TABLE); try (CloseableIterator iterator = result.collect()) { Row row1 = Row.of(1, "aaa", "2021-01-01"); @@ -154,13 +159,13 @@ public void testUnPartitionedTable() throws Exception { result.getJobClient().ifPresent(JobClient::cancel); } - @Test public void testPartitionedTable() throws Exception { sql("CREATE TABLE %s (id INT, data VARCHAR, dt VARCHAR) PARTITIONED BY (dt)", TABLE); Table table = validationCatalog.loadTable(TableIdentifier.of(icebergNamespace, TABLE)); - TableResult result = exec("SELECT * FROM %s /*+ OPTIONS('streaming'='true', 'monitor-interval'='1s')*/", TABLE); + TableResult result = + exec("SELECT * FROM %s /*+ 
OPTIONS('streaming'='true', 'monitor-interval'='1s')*/", TABLE); try (CloseableIterator iterator = result.collect()) { Row row1 = Row.of(1, "aaa", "2021-01-01"); insertRows("2021-01-01", table, row1); @@ -190,7 +195,8 @@ public void testConsumeFromBeginning() throws Exception { Row row2 = Row.of(2, "bbb", "2021-01-01"); insertRows(table, row1, row2); - TableResult result = exec("SELECT * FROM %s /*+ OPTIONS('streaming'='true', 'monitor-interval'='1s')*/", TABLE); + TableResult result = + exec("SELECT * FROM %s /*+ OPTIONS('streaming'='true', 'monitor-interval'='1s')*/", TABLE); try (CloseableIterator iterator = result.collect()) { assertRows(ImmutableList.of(row1, row2), iterator); @@ -222,8 +228,11 @@ public void testConsumeFromStartSnapshotId() throws Exception { Row row4 = Row.of(4, "ddd", "2021-01-01"); insertRows(table, row3, row4); - TableResult result = exec("SELECT * FROM %s /*+ OPTIONS('streaming'='true', 'monitor-interval'='1s', " + - "'start-snapshot-id'='%d')*/", TABLE, startSnapshotId); + TableResult result = + exec( + "SELECT * FROM %s /*+ OPTIONS('streaming'='true', 'monitor-interval'='1s', " + + "'start-snapshot-id'='%d')*/", + TABLE, startSnapshotId); try (CloseableIterator iterator = result.collect()) { // The row2 in start snapshot will be excluded. assertRows(ImmutableList.of(row3, row4), iterator); diff --git a/flink/v1.15/flink/src/test/java/org/apache/iceberg/flink/source/TestStreamingMonitorFunction.java b/flink/v1.15/flink/src/test/java/org/apache/iceberg/flink/source/TestStreamingMonitorFunction.java index 1b1cf70a32e0..6f8789c92bc5 100644 --- a/flink/v1.15/flink/src/test/java/org/apache/iceberg/flink/source/TestStreamingMonitorFunction.java +++ b/flink/v1.15/flink/src/test/java/org/apache/iceberg/flink/source/TestStreamingMonitorFunction.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.flink.source; import java.io.File; @@ -57,19 +56,16 @@ @RunWith(Parameterized.class) public class TestStreamingMonitorFunction extends TableTestBase { - private static final Schema SCHEMA = new Schema( - Types.NestedField.required(1, "id", Types.IntegerType.get()), - Types.NestedField.required(2, "data", Types.StringType.get()) - ); + private static final Schema SCHEMA = + new Schema( + Types.NestedField.required(1, "id", Types.IntegerType.get()), + Types.NestedField.required(2, "data", Types.StringType.get())); private static final FileFormat DEFAULT_FORMAT = FileFormat.PARQUET; private static final long WAIT_TIME_MILLIS = 10 * 1000L; @Parameterized.Parameters(name = "FormatVersion={0}") public static Iterable parameters() { - return ImmutableList.of( - new Object[] {1}, - new Object[] {2} - ); + return ImmutableList.of(new Object[] {1}, new Object[] {2}); } public TestStreamingMonitorFunction(int formatVersion) { @@ -87,23 +83,24 @@ public void setupTable() throws IOException { table = create(SCHEMA, PartitionSpec.unpartitioned()); } - private void runSourceFunctionInTask(TestSourceContext sourceContext, StreamingMonitorFunction function) { - Thread task = new Thread(() -> { - try { - function.run(sourceContext); - } catch (Exception e) { - throw new RuntimeException(e); - } - }); + private void runSourceFunctionInTask( + TestSourceContext sourceContext, StreamingMonitorFunction function) { + Thread task = + new Thread( + () -> { + try { + function.run(sourceContext); + } catch (Exception e) { + throw new RuntimeException(e); + } + }); task.start(); } @Test public void testConsumeWithoutStartSnapshotId() throws Exception { List> recordsList = generateRecordsAndCommitTxn(10); - ScanContext scanContext = ScanContext.builder() - .monitorInterval(Duration.ofMillis(100)) - .build(); + ScanContext scanContext = ScanContext.builder().monitorInterval(Duration.ofMillis(100)).build(); StreamingMonitorFunction function = createFunction(scanContext); try (AbstractStreamOperatorTestHarness harness = createHarness(function)) { @@ -114,14 +111,16 @@ public void testConsumeWithoutStartSnapshotId() throws Exception { TestSourceContext sourceContext = new TestSourceContext(latch); runSourceFunctionInTask(sourceContext, function); - Assert.assertTrue("Should have expected elements.", latch.await(WAIT_TIME_MILLIS, TimeUnit.MILLISECONDS)); + Assert.assertTrue( + "Should have expected elements.", latch.await(WAIT_TIME_MILLIS, TimeUnit.MILLISECONDS)); Thread.sleep(1000L); // Stop the stream task. function.close(); Assert.assertEquals("Should produce the expected splits", 1, sourceContext.splits.size()); - TestHelpers.assertRecords(sourceContext.toRows(), Lists.newArrayList(Iterables.concat(recordsList)), SCHEMA); + TestHelpers.assertRecords( + sourceContext.toRows(), Lists.newArrayList(Iterables.concat(recordsList)), SCHEMA); } } @@ -134,10 +133,11 @@ public void testConsumeFromStartSnapshotId() throws Exception { // Commit the next five transactions. 
List> recordsList = generateRecordsAndCommitTxn(5); - ScanContext scanContext = ScanContext.builder() - .monitorInterval(Duration.ofMillis(100)) - .startSnapshotId(startSnapshotId) - .build(); + ScanContext scanContext = + ScanContext.builder() + .monitorInterval(Duration.ofMillis(100)) + .startSnapshotId(startSnapshotId) + .build(); StreamingMonitorFunction function = createFunction(scanContext); try (AbstractStreamOperatorTestHarness harness = createHarness(function)) { @@ -148,23 +148,23 @@ public void testConsumeFromStartSnapshotId() throws Exception { TestSourceContext sourceContext = new TestSourceContext(latch); runSourceFunctionInTask(sourceContext, function); - Assert.assertTrue("Should have expected elements.", latch.await(WAIT_TIME_MILLIS, TimeUnit.MILLISECONDS)); + Assert.assertTrue( + "Should have expected elements.", latch.await(WAIT_TIME_MILLIS, TimeUnit.MILLISECONDS)); Thread.sleep(1000L); // Stop the stream task. function.close(); Assert.assertEquals("Should produce the expected splits", 1, sourceContext.splits.size()); - TestHelpers.assertRecords(sourceContext.toRows(), Lists.newArrayList(Iterables.concat(recordsList)), SCHEMA); + TestHelpers.assertRecords( + sourceContext.toRows(), Lists.newArrayList(Iterables.concat(recordsList)), SCHEMA); } } @Test public void testCheckpointRestore() throws Exception { List> recordsList = generateRecordsAndCommitTxn(10); - ScanContext scanContext = ScanContext.builder() - .monitorInterval(Duration.ofMillis(100)) - .build(); + ScanContext scanContext = ScanContext.builder().monitorInterval(Duration.ofMillis(100)).build(); StreamingMonitorFunction func = createFunction(scanContext); OperatorSubtaskState state; @@ -176,7 +176,8 @@ public void testCheckpointRestore() throws Exception { TestSourceContext sourceContext = new TestSourceContext(latch); runSourceFunctionInTask(sourceContext, func); - Assert.assertTrue("Should have expected elements.", latch.await(WAIT_TIME_MILLIS, TimeUnit.MILLISECONDS)); + Assert.assertTrue( + "Should have expected elements.", latch.await(WAIT_TIME_MILLIS, TimeUnit.MILLISECONDS)); Thread.sleep(1000L); state = harness.snapshot(1, 1); @@ -185,7 +186,8 @@ public void testCheckpointRestore() throws Exception { func.close(); Assert.assertEquals("Should produce the expected splits", 1, sourceContext.splits.size()); - TestHelpers.assertRecords(sourceContext.toRows(), Lists.newArrayList(Iterables.concat(recordsList)), SCHEMA); + TestHelpers.assertRecords( + sourceContext.toRows(), Lists.newArrayList(Iterables.concat(recordsList)), SCHEMA); } List> newRecordsList = generateRecordsAndCommitTxn(10); @@ -200,44 +202,50 @@ public void testCheckpointRestore() throws Exception { TestSourceContext sourceContext = new TestSourceContext(latch); runSourceFunctionInTask(sourceContext, newFunc); - Assert.assertTrue("Should have expected elements.", latch.await(WAIT_TIME_MILLIS, TimeUnit.MILLISECONDS)); + Assert.assertTrue( + "Should have expected elements.", latch.await(WAIT_TIME_MILLIS, TimeUnit.MILLISECONDS)); Thread.sleep(1000L); // Stop the stream task. 
newFunc.close(); Assert.assertEquals("Should produce the expected splits", 1, sourceContext.splits.size()); - TestHelpers.assertRecords(sourceContext.toRows(), Lists.newArrayList(Iterables.concat(newRecordsList)), SCHEMA); + TestHelpers.assertRecords( + sourceContext.toRows(), Lists.newArrayList(Iterables.concat(newRecordsList)), SCHEMA); } } @Test public void testInvalidMaxPlanningSnapshotCount() { - ScanContext scanContext1 = ScanContext.builder() - .monitorInterval(Duration.ofMillis(100)) - .maxPlanningSnapshotCount(0) - .build(); - - AssertHelpers.assertThrows("Should throw exception because of invalid config", - IllegalArgumentException.class, "must be greater than zero", + ScanContext scanContext1 = + ScanContext.builder() + .monitorInterval(Duration.ofMillis(100)) + .maxPlanningSnapshotCount(0) + .build(); + + AssertHelpers.assertThrows( + "Should throw exception because of invalid config", + IllegalArgumentException.class, + "must be greater than zero", () -> { createFunction(scanContext1); return null; - } - ); - - ScanContext scanContext2 = ScanContext.builder() - .monitorInterval(Duration.ofMillis(100)) - .maxPlanningSnapshotCount(-10) - .build(); - - AssertHelpers.assertThrows("Should throw exception because of invalid config", - IllegalArgumentException.class, "must be greater than zero", + }); + + ScanContext scanContext2 = + ScanContext.builder() + .monitorInterval(Duration.ofMillis(100)) + .maxPlanningSnapshotCount(-10) + .build(); + + AssertHelpers.assertThrows( + "Should throw exception because of invalid config", + IllegalArgumentException.class, + "must be greater than zero", () -> { createFunction(scanContext2); return null; - } - ); + }); } @Test @@ -247,26 +255,29 @@ public void testConsumeWithMaxPlanningSnapshotCount() throws Exception { // Use the oldest snapshot as starting to avoid the initial case. 
long oldestSnapshotId = SnapshotUtil.oldestAncestor(table).snapshotId(); - ScanContext scanContext = ScanContext.builder() - .monitorInterval(Duration.ofMillis(100)) - .splitSize(1000L) - .startSnapshotId(oldestSnapshotId) - .maxPlanningSnapshotCount(Integer.MAX_VALUE) - .build(); + ScanContext scanContext = + ScanContext.builder() + .monitorInterval(Duration.ofMillis(100)) + .splitSize(1000L) + .startSnapshotId(oldestSnapshotId) + .maxPlanningSnapshotCount(Integer.MAX_VALUE) + .build(); - FlinkInputSplit[] expectedSplits = FlinkSplitPlanner - .planInputSplits(table, scanContext, ThreadPools.getWorkerPool()); + FlinkInputSplit[] expectedSplits = + FlinkSplitPlanner.planInputSplits(table, scanContext, ThreadPools.getWorkerPool()); Assert.assertEquals("should produce 9 splits", 9, expectedSplits.length); - // This covers three cases that maxPlanningSnapshotCount is less than, equal or greater than the total splits number + // This covers three cases that maxPlanningSnapshotCount is less than, equal or greater than the + // total splits number for (int maxPlanningSnapshotCount : ImmutableList.of(1, 9, 15)) { - scanContext = ScanContext.builder() - .monitorInterval(Duration.ofMillis(500)) - .startSnapshotId(oldestSnapshotId) - .splitSize(1000L) - .maxPlanningSnapshotCount(maxPlanningSnapshotCount) - .build(); + scanContext = + ScanContext.builder() + .monitorInterval(Duration.ofMillis(500)) + .startSnapshotId(oldestSnapshotId) + .splitSize(1000L) + .maxPlanningSnapshotCount(maxPlanningSnapshotCount) + .build(); StreamingMonitorFunction function = createFunction(scanContext); try (AbstractStreamOperatorTestHarness harness = createHarness(function)) { @@ -279,8 +290,10 @@ public void testConsumeWithMaxPlanningSnapshotCount() throws Exception { function.monitorAndForwardSplits(); if (maxPlanningSnapshotCount < 10) { - Assert.assertEquals("Should produce same splits as max-planning-snapshot-count", - maxPlanningSnapshotCount, sourceContext.splits.size()); + Assert.assertEquals( + "Should produce same splits as max-planning-snapshot-count", + maxPlanningSnapshotCount, + sourceContext.splits.size()); } } } @@ -304,12 +317,14 @@ private void writeRecords(List records) throws IOException { } private StreamingMonitorFunction createFunction(ScanContext scanContext) { - return new StreamingMonitorFunction(TestTableLoader.of(tableDir.getAbsolutePath()), scanContext); + return new StreamingMonitorFunction( + TestTableLoader.of(tableDir.getAbsolutePath()), scanContext); } - private AbstractStreamOperatorTestHarness createHarness(StreamingMonitorFunction function) - throws Exception { - StreamSource streamSource = new StreamSource<>(function); + private AbstractStreamOperatorTestHarness createHarness( + StreamingMonitorFunction function) throws Exception { + StreamSource streamSource = + new StreamSource<>(function); return new AbstractStreamOperatorTestHarness<>(streamSource, 1, 1, 0); } @@ -334,14 +349,10 @@ public void collectWithTimestamp(FlinkInputSplit element, long timestamp) { } @Override - public void emitWatermark(Watermark mark) { - - } + public void emitWatermark(Watermark mark) {} @Override - public void markAsTemporarilyIdle() { - - } + public void markAsTemporarilyIdle() {} @Override public Object getCheckpointLock() { @@ -349,14 +360,13 @@ public Object getCheckpointLock() { } @Override - public void close() { - - } + public void close() {} private List toRows() throws IOException { - FlinkInputFormat format = FlinkSource.forRowData() - 
.tableLoader(TestTableLoader.of(tableDir.getAbsolutePath())) - .buildFormat(); + FlinkInputFormat format = + FlinkSource.forRowData() + .tableLoader(TestTableLoader.of(tableDir.getAbsolutePath())) + .buildFormat(); List rows = Lists.newArrayList(); for (FlinkInputSplit split : splits) { diff --git a/flink/v1.15/flink/src/test/java/org/apache/iceberg/flink/source/TestStreamingReaderOperator.java b/flink/v1.15/flink/src/test/java/org/apache/iceberg/flink/source/TestStreamingReaderOperator.java index 0c3191136db9..e258a197edf3 100644 --- a/flink/v1.15/flink/src/test/java/org/apache/iceberg/flink/source/TestStreamingReaderOperator.java +++ b/flink/v1.15/flink/src/test/java/org/apache/iceberg/flink/source/TestStreamingReaderOperator.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.flink.source; import java.io.File; @@ -56,18 +55,15 @@ @RunWith(Parameterized.class) public class TestStreamingReaderOperator extends TableTestBase { - private static final Schema SCHEMA = new Schema( - Types.NestedField.required(1, "id", Types.IntegerType.get()), - Types.NestedField.required(2, "data", Types.StringType.get()) - ); + private static final Schema SCHEMA = + new Schema( + Types.NestedField.required(1, "id", Types.IntegerType.get()), + Types.NestedField.required(2, "data", Types.StringType.get())); private static final FileFormat DEFAULT_FORMAT = FileFormat.PARQUET; @Parameterized.Parameters(name = "FormatVersion={0}") public static Iterable parameters() { - return ImmutableList.of( - new Object[] {1}, - new Object[] {2} - ); + return ImmutableList.of(new Object[] {1}, new Object[] {2}); } public TestStreamingReaderOperator(int formatVersion) { @@ -115,7 +111,8 @@ public void testProcessAllRecords() throws Exception { @Test public void testTriggerCheckpoint() throws Exception { - // Received emitted splits: split1, split2, split3, checkpoint request is triggered when reading records from + // Received emitted splits: split1, split2, split3, checkpoint request is triggered when reading + // records from // split1. List> expectedRecords = generateRecordsAndCommitTxn(3); @@ -134,11 +131,11 @@ public void testTriggerCheckpoint() throws Exception { harness.processElement(splits.get(2), ++timestamp); // Trigger snapshot state, it will start to work once all records from split0 are read. - processor.getMainMailboxExecutor() - .execute(() -> harness.snapshot(1, 3), "Trigger snapshot"); + processor.getMainMailboxExecutor().execute(() -> harness.snapshot(1, 3), "Trigger snapshot"); Assert.assertTrue("Should have processed the split0", processor.runMailboxStep()); - Assert.assertTrue("Should have processed the snapshot state action", processor.runMailboxStep()); + Assert.assertTrue( + "Should have processed the snapshot state action", processor.runMailboxStep()); TestHelpers.assertRecords(readOutputValues(harness), expectedRecords.get(0), SCHEMA); @@ -148,8 +145,8 @@ public void testTriggerCheckpoint() throws Exception { // Read records from split2. 
Assert.assertTrue("Should have processed the split2", processor.runMailboxStep()); - TestHelpers.assertRecords(readOutputValues(harness), - Lists.newArrayList(Iterables.concat(expectedRecords)), SCHEMA); + TestHelpers.assertRecords( + readOutputValues(harness), Lists.newArrayList(Iterables.concat(expectedRecords)), SCHEMA); } } @@ -211,7 +208,8 @@ public void testCheckpointRestore() throws Exception { } } - private List readOutputValues(OneInputStreamOperatorTestHarness harness) { + private List readOutputValues( + OneInputStreamOperatorTestHarness harness) { List results = Lists.newArrayList(); for (RowData rowData : harness.extractOutputValues()) { results.add(Row.of(rowData.getInt(0), rowData.getString(1).toString())); @@ -244,33 +242,36 @@ private List generateSplits() { ScanContext scanContext; if (i == snapshotIds.size() - 1) { // Generate the splits from the first snapshot. - scanContext = ScanContext.builder() - .useSnapshotId(snapshotIds.get(i)) - .build(); + scanContext = ScanContext.builder().useSnapshotId(snapshotIds.get(i)).build(); } else { // Generate the splits between the previous snapshot and current snapshot. - scanContext = ScanContext.builder() - .startSnapshotId(snapshotIds.get(i + 1)) - .endSnapshotId(snapshotIds.get(i)) - .build(); + scanContext = + ScanContext.builder() + .startSnapshotId(snapshotIds.get(i + 1)) + .endSnapshotId(snapshotIds.get(i)) + .build(); } - Collections.addAll(inputSplits, FlinkSplitPlanner.planInputSplits( - table, scanContext, ThreadPools.getWorkerPool())); + Collections.addAll( + inputSplits, + FlinkSplitPlanner.planInputSplits(table, scanContext, ThreadPools.getWorkerPool())); } return inputSplits; } - private OneInputStreamOperatorTestHarness createReader() throws Exception { + private OneInputStreamOperatorTestHarness createReader() + throws Exception { // This input format is used to opening the emitted split. - FlinkInputFormat inputFormat = FlinkSource.forRowData() - .tableLoader(TestTableLoader.of(tableDir.getAbsolutePath())) - .buildFormat(); - - OneInputStreamOperatorFactory factory = StreamingReaderOperator.factory(inputFormat); - OneInputStreamOperatorTestHarness harness = new OneInputStreamOperatorTestHarness<>( - factory, 1, 1, 0); + FlinkInputFormat inputFormat = + FlinkSource.forRowData() + .tableLoader(TestTableLoader.of(tableDir.getAbsolutePath())) + .buildFormat(); + + OneInputStreamOperatorFactory factory = + StreamingReaderOperator.factory(inputFormat); + OneInputStreamOperatorTestHarness harness = + new OneInputStreamOperatorTestHarness<>(factory, 1, 1, 0); harness.getStreamConfig().setTimeCharacteristic(TimeCharacteristic.ProcessingTime); return harness; diff --git a/flink/v1.15/flink/src/test/java/org/apache/iceberg/flink/source/assigner/TestSimpleSplitAssigner.java b/flink/v1.15/flink/src/test/java/org/apache/iceberg/flink/source/assigner/TestSimpleSplitAssigner.java index bfe1c6bab2c4..ee6c5cc3a6c8 100644 --- a/flink/v1.15/flink/src/test/java/org/apache/iceberg/flink/source/assigner/TestSimpleSplitAssigner.java +++ b/flink/v1.15/flink/src/test/java/org/apache/iceberg/flink/source/assigner/TestSimpleSplitAssigner.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.flink.source.assigner; import java.util.Collection; @@ -32,8 +31,7 @@ import org.junit.rules.TemporaryFolder; public class TestSimpleSplitAssigner { - @ClassRule - public static final TemporaryFolder TEMPORARY_FOLDER = new TemporaryFolder(); + @ClassRule public static final TemporaryFolder TEMPORARY_FOLDER = new TemporaryFolder(); @Test public void testEmptyInitialization() { @@ -41,19 +39,17 @@ public void testEmptyInitialization() { assertGetNext(assigner, GetSplitResult.Status.UNAVAILABLE); } - /** - * Test a sequence of interactions for StaticEnumerator - */ + /** Test a sequence of interactions for StaticEnumerator */ @Test public void testStaticEnumeratorSequence() throws Exception { SimpleSplitAssigner assigner = new SimpleSplitAssigner(); - assigner.onDiscoveredSplits(SplitHelpers.createSplitsFromTransientHadoopTable( - TEMPORARY_FOLDER, 4, 2)); + assigner.onDiscoveredSplits( + SplitHelpers.createSplitsFromTransientHadoopTable(TEMPORARY_FOLDER, 4, 2)); assertGetNext(assigner, GetSplitResult.Status.AVAILABLE); assertSnapshot(assigner, 1); - assigner.onUnassignedSplits(SplitHelpers.createSplitsFromTransientHadoopTable( - TEMPORARY_FOLDER, 1, 1)); + assigner.onUnassignedSplits( + SplitHelpers.createSplitsFromTransientHadoopTable(TEMPORARY_FOLDER, 1, 1)); assertSnapshot(assigner, 2); assertGetNext(assigner, GetSplitResult.Status.AVAILABLE); @@ -62,20 +58,21 @@ public void testStaticEnumeratorSequence() throws Exception { assertSnapshot(assigner, 0); } - /** - * Test a sequence of interactions for ContinuousEnumerator - */ + /** Test a sequence of interactions for ContinuousEnumerator */ @Test public void testContinuousEnumeratorSequence() throws Exception { SimpleSplitAssigner assigner = new SimpleSplitAssigner(); assertGetNext(assigner, GetSplitResult.Status.UNAVAILABLE); - List splits1 = SplitHelpers.createSplitsFromTransientHadoopTable(TEMPORARY_FOLDER, 1, 1); + List splits1 = + SplitHelpers.createSplitsFromTransientHadoopTable(TEMPORARY_FOLDER, 1, 1); assertAvailableFuture(assigner, 1, () -> assigner.onDiscoveredSplits(splits1)); - List splits2 = SplitHelpers.createSplitsFromTransientHadoopTable(TEMPORARY_FOLDER, 1, 1); + List splits2 = + SplitHelpers.createSplitsFromTransientHadoopTable(TEMPORARY_FOLDER, 1, 1); assertAvailableFuture(assigner, 1, () -> assigner.onUnassignedSplits(splits2)); - assigner.onDiscoveredSplits(SplitHelpers.createSplitsFromTransientHadoopTable(TEMPORARY_FOLDER, 2, 1)); + assigner.onDiscoveredSplits( + SplitHelpers.createSplitsFromTransientHadoopTable(TEMPORARY_FOLDER, 2, 1)); assertSnapshot(assigner, 2); assertGetNext(assigner, GetSplitResult.Status.AVAILABLE); assertGetNext(assigner, GetSplitResult.Status.AVAILABLE); @@ -83,7 +80,8 @@ public void testContinuousEnumeratorSequence() throws Exception { assertSnapshot(assigner, 0); } - private void assertAvailableFuture(SimpleSplitAssigner assigner, int splitCount, Runnable addSplitsRunnable) { + private void assertAvailableFuture( + SimpleSplitAssigner assigner, int splitCount, Runnable addSplitsRunnable) { // register callback AtomicBoolean futureCompleted = new AtomicBoolean(); CompletableFuture future = assigner.isAvailable(); diff --git a/flink/v1.15/flink/src/test/java/org/apache/iceberg/flink/source/enumerator/ManualContinuousSplitPlanner.java b/flink/v1.15/flink/src/test/java/org/apache/iceberg/flink/source/enumerator/ManualContinuousSplitPlanner.java index f1db8ef5d6ad..e6610acbc19f 100644 --- 
a/flink/v1.15/flink/src/test/java/org/apache/iceberg/flink/source/enumerator/ManualContinuousSplitPlanner.java +++ b/flink/v1.15/flink/src/test/java/org/apache/iceberg/flink/source/enumerator/ManualContinuousSplitPlanner.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.flink.source.enumerator; import java.io.IOException; @@ -31,27 +30,22 @@ class ManualContinuousSplitPlanner implements ContinuousSplitPlanner { @Override public ContinuousEnumerationResult planSplits(IcebergEnumeratorPosition lastPosition) { - ContinuousEnumerationResult result = new ContinuousEnumerationResult( - Lists.newArrayList(splits), lastPosition, latestPosition); + ContinuousEnumerationResult result = + new ContinuousEnumerationResult(Lists.newArrayList(splits), lastPosition, latestPosition); return result; } - /** - * Add new splits to the collection - */ + /** Add new splits to the collection */ public void addSplits(List newSplits, IcebergEnumeratorPosition newPosition) { splits.addAll(newSplits); this.latestPosition = newPosition; } - /** - * Clear the splits collection - */ + /** Clear the splits collection */ public void clearSplits() { splits.clear(); } @Override - public void close() throws IOException { - } + public void close() throws IOException {} } diff --git a/flink/v1.15/flink/src/test/java/org/apache/iceberg/flink/source/enumerator/TestContinuousIcebergEnumerator.java b/flink/v1.15/flink/src/test/java/org/apache/iceberg/flink/source/enumerator/TestContinuousIcebergEnumerator.java index 5e7f926e3aee..aad2769af010 100644 --- a/flink/v1.15/flink/src/test/java/org/apache/iceberg/flink/source/enumerator/TestContinuousIcebergEnumerator.java +++ b/flink/v1.15/flink/src/test/java/org/apache/iceberg/flink/source/enumerator/TestContinuousIcebergEnumerator.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.flink.source.enumerator; import java.util.Collection; @@ -41,26 +40,28 @@ import org.junit.rules.TemporaryFolder; public class TestContinuousIcebergEnumerator { - @ClassRule - public static final TemporaryFolder TEMPORARY_FOLDER = new TemporaryFolder(); + @ClassRule public static final TemporaryFolder TEMPORARY_FOLDER = new TemporaryFolder(); @Test public void testDiscoverSplitWhenNoReaderRegistered() throws Exception { ManualContinuousSplitPlanner splitPlanner = new ManualContinuousSplitPlanner(); TestingSplitEnumeratorContext enumeratorContext = new TestingSplitEnumeratorContext<>(4); - ScanContext scanContext = ScanContext.builder() - .streaming(true) - .startingStrategy(StreamingStartingStrategy.TABLE_SCAN_THEN_INCREMENTAL) - .build(); - ContinuousIcebergEnumerator enumerator = createEnumerator(enumeratorContext, scanContext, splitPlanner); + ScanContext scanContext = + ScanContext.builder() + .streaming(true) + .startingStrategy(StreamingStartingStrategy.TABLE_SCAN_THEN_INCREMENTAL) + .build(); + ContinuousIcebergEnumerator enumerator = + createEnumerator(enumeratorContext, scanContext, splitPlanner); - Collection pendingSplitsEmpty = enumerator.snapshotState(1).pendingSplits(); + Collection pendingSplitsEmpty = + enumerator.snapshotState(1).pendingSplits(); Assert.assertEquals(0, pendingSplitsEmpty.size()); // make one split available and trigger the periodic discovery - List splits = SplitHelpers - .createSplitsFromTransientHadoopTable(TEMPORARY_FOLDER, 1, 1); + List splits = + SplitHelpers.createSplitsFromTransientHadoopTable(TEMPORARY_FOLDER, 1, 1); splitPlanner.addSplits(splits, IcebergEnumeratorPosition.of(1L, 1L)); enumeratorContext.triggerAllActions(); @@ -76,26 +77,28 @@ public void testDiscoverWhenReaderRegistered() throws Exception { ManualContinuousSplitPlanner splitPlanner = new ManualContinuousSplitPlanner(); TestingSplitEnumeratorContext enumeratorContext = new TestingSplitEnumeratorContext<>(4); - ScanContext scanContext = ScanContext.builder() - .streaming(true) - .startingStrategy(StreamingStartingStrategy.TABLE_SCAN_THEN_INCREMENTAL) - .build(); - ContinuousIcebergEnumerator enumerator = createEnumerator(enumeratorContext, scanContext, splitPlanner); + ScanContext scanContext = + ScanContext.builder() + .streaming(true) + .startingStrategy(StreamingStartingStrategy.TABLE_SCAN_THEN_INCREMENTAL) + .build(); + ContinuousIcebergEnumerator enumerator = + createEnumerator(enumeratorContext, scanContext, splitPlanner); // register one reader, and let it request a split enumeratorContext.registerReader(2, "localhost"); enumerator.addReader(2); - enumerator.handleSourceEvent(2, - new SplitRequestEvent()); + enumerator.handleSourceEvent(2, new SplitRequestEvent()); // make one split available and trigger the periodic discovery - List splits = SplitHelpers - .createSplitsFromTransientHadoopTable(TEMPORARY_FOLDER, 1, 1); + List splits = + SplitHelpers.createSplitsFromTransientHadoopTable(TEMPORARY_FOLDER, 1, 1); splitPlanner.addSplits(splits, IcebergEnumeratorPosition.of(1L, 1L)); enumeratorContext.triggerAllActions(); Assert.assertTrue(enumerator.snapshotState(1).pendingSplits().isEmpty()); - MatcherAssert.assertThat(enumeratorContext.getSplitAssignments().get(2).getAssignedSplits(), + MatcherAssert.assertThat( + enumeratorContext.getSplitAssignments().get(2).getAssignedSplits(), CoreMatchers.hasItem(splits.get(0))); } @@ -104,44 +107,46 @@ public void testRequestingReaderUnavailableWhenSplitDiscovered() throws Exceptio 
ManualContinuousSplitPlanner splitPlanner = new ManualContinuousSplitPlanner(); TestingSplitEnumeratorContext enumeratorContext = new TestingSplitEnumeratorContext<>(4); - ScanContext config = ScanContext.builder() - .streaming(true) - .startingStrategy(StreamingStartingStrategy.TABLE_SCAN_THEN_INCREMENTAL) - .build(); - ContinuousIcebergEnumerator enumerator = createEnumerator(enumeratorContext, config, splitPlanner); + ScanContext config = + ScanContext.builder() + .streaming(true) + .startingStrategy(StreamingStartingStrategy.TABLE_SCAN_THEN_INCREMENTAL) + .build(); + ContinuousIcebergEnumerator enumerator = + createEnumerator(enumeratorContext, config, splitPlanner); // register one reader, and let it request a split enumeratorContext.registerReader(2, "localhost"); enumerator.addReader(2); - enumerator.handleSourceEvent(2, - new SplitRequestEvent()); + enumerator.handleSourceEvent(2, new SplitRequestEvent()); // remove the reader (like in a failure) enumeratorContext.registeredReaders().remove(2); // make one split available and trigger the periodic discovery - List splits = SplitHelpers - .createSplitsFromTransientHadoopTable(TEMPORARY_FOLDER, 1, 1); + List splits = + SplitHelpers.createSplitsFromTransientHadoopTable(TEMPORARY_FOLDER, 1, 1); Assert.assertEquals(1, splits.size()); splitPlanner.addSplits(splits, IcebergEnumeratorPosition.of(1L, 1L)); enumeratorContext.triggerAllActions(); Assert.assertFalse(enumeratorContext.getSplitAssignments().containsKey(2)); - List pendingSplitIds = enumerator.snapshotState(1).pendingSplits().stream() - .map(IcebergSourceSplitState::split) - .map(IcebergSourceSplit::splitId) - .collect(Collectors.toList()); + List pendingSplitIds = + enumerator.snapshotState(1).pendingSplits().stream() + .map(IcebergSourceSplitState::split) + .map(IcebergSourceSplit::splitId) + .collect(Collectors.toList()); Assert.assertEquals(splits.size(), pendingSplitIds.size()); Assert.assertEquals(splits.get(0).splitId(), pendingSplitIds.get(0)); // register the reader again, and let it request a split enumeratorContext.registerReader(2, "localhost"); enumerator.addReader(2); - enumerator.handleSourceEvent(2, - new SplitRequestEvent()); + enumerator.handleSourceEvent(2, new SplitRequestEvent()); Assert.assertTrue(enumerator.snapshotState(2).pendingSplits().isEmpty()); - MatcherAssert.assertThat(enumeratorContext.getSplitAssignments().get(2).getAssignedSplits(), + MatcherAssert.assertThat( + enumeratorContext.getSplitAssignments().get(2).getAssignedSplits(), CoreMatchers.hasItem(splits.get(0))); } @@ -160,5 +165,4 @@ private static ContinuousIcebergEnumerator createEnumerator( enumerator.start(); return enumerator; } - } diff --git a/flink/v1.15/flink/src/test/java/org/apache/iceberg/flink/source/enumerator/TestContinuousSplitPlannerImpl.java b/flink/v1.15/flink/src/test/java/org/apache/iceberg/flink/source/enumerator/TestContinuousSplitPlannerImpl.java index bf794ab74c89..63fc53341f1e 100644 --- a/flink/v1.15/flink/src/test/java/org/apache/iceberg/flink/source/enumerator/TestContinuousSplitPlannerImpl.java +++ b/flink/v1.15/flink/src/test/java/org/apache/iceberg/flink/source/enumerator/TestContinuousSplitPlannerImpl.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.flink.source.enumerator; import java.io.IOException; @@ -48,18 +47,17 @@ import org.junit.rules.TestName; public class TestContinuousSplitPlannerImpl { - @ClassRule - public static final TemporaryFolder TEMPORARY_FOLDER = new TemporaryFolder(); + @ClassRule public static final TemporaryFolder TEMPORARY_FOLDER = new TemporaryFolder(); private static final FileFormat fileFormat = FileFormat.PARQUET; private static final AtomicLong randomSeed = new AtomicLong(); @Rule - public final HadoopTableResource tableResource = new HadoopTableResource(TEMPORARY_FOLDER, - TestFixtures.DATABASE, TestFixtures.TABLE, TestFixtures.SCHEMA); + public final HadoopTableResource tableResource = + new HadoopTableResource( + TEMPORARY_FOLDER, TestFixtures.DATABASE, TestFixtures.TABLE, TestFixtures.SCHEMA); - @Rule - public TestName testName = new TestName(); + @Rule public TestName testName = new TestName(); private GenericAppenderHelper dataAppender; private DataFile dataFile1; @@ -86,36 +84,40 @@ private void appendTwoSnapshots() throws IOException { snapshot2 = tableResource.table().currentSnapshot(); } - /** - * @return the last enumerated snapshot id - */ + /** @return the last enumerated snapshot id */ private IcebergEnumeratorPosition verifyOneCycle( - ContinuousSplitPlannerImpl splitPlanner, IcebergEnumeratorPosition lastPosition) throws Exception { - List batch = RandomGenericData.generate(TestFixtures.SCHEMA, 2, randomSeed.incrementAndGet()); + ContinuousSplitPlannerImpl splitPlanner, IcebergEnumeratorPosition lastPosition) + throws Exception { + List batch = + RandomGenericData.generate(TestFixtures.SCHEMA, 2, randomSeed.incrementAndGet()); DataFile dataFile = dataAppender.writeFile(null, batch); dataAppender.appendToTable(dataFile); Snapshot snapshot = tableResource.table().currentSnapshot(); ContinuousEnumerationResult result = splitPlanner.planSplits(lastPosition); Assert.assertEquals(lastPosition.snapshotId(), result.fromPosition().snapshotId()); - Assert.assertEquals(lastPosition.snapshotTimestampMs(), result.fromPosition().snapshotTimestampMs()); + Assert.assertEquals( + lastPosition.snapshotTimestampMs(), result.fromPosition().snapshotTimestampMs()); Assert.assertEquals(snapshot.snapshotId(), result.toPosition().snapshotId().longValue()); - Assert.assertEquals(snapshot.timestampMillis(), result.toPosition().snapshotTimestampMs().longValue()); + Assert.assertEquals( + snapshot.timestampMillis(), result.toPosition().snapshotTimestampMs().longValue()); Assert.assertEquals(1, result.splits().size()); IcebergSourceSplit split = Iterables.getOnlyElement(result.splits()); Assert.assertEquals(1, split.task().files().size()); - Assert.assertEquals(dataFile.path().toString(), + Assert.assertEquals( + dataFile.path().toString(), Iterables.getOnlyElement(split.task().files()).file().path().toString()); return result.toPosition(); } @Test public void testTableScanThenIncrementalWithEmptyTable() throws Exception { - ScanContext scanContext = ScanContext.builder() - .startingStrategy(StreamingStartingStrategy.TABLE_SCAN_THEN_INCREMENTAL) - .build(); - ContinuousSplitPlannerImpl splitPlanner = new ContinuousSplitPlannerImpl( - tableResource.table(), scanContext, null); + ScanContext scanContext = + ScanContext.builder() + .startingStrategy(StreamingStartingStrategy.TABLE_SCAN_THEN_INCREMENTAL) + .build(); + ContinuousSplitPlannerImpl splitPlanner = + new ContinuousSplitPlannerImpl(tableResource.table(), scanContext, null); ContinuousEnumerationResult 
emptyTableInitialDiscoveryResult = splitPlanner.planSplits(null); Assert.assertTrue(emptyTableInitialDiscoveryResult.splits().isEmpty()); @@ -123,8 +125,8 @@ public void testTableScanThenIncrementalWithEmptyTable() throws Exception { Assert.assertTrue(emptyTableInitialDiscoveryResult.toPosition().isEmpty()); Assert.assertNull(emptyTableInitialDiscoveryResult.toPosition().snapshotTimestampMs()); - ContinuousEnumerationResult emptyTableSecondDiscoveryResult = splitPlanner - .planSplits(emptyTableInitialDiscoveryResult.toPosition()); + ContinuousEnumerationResult emptyTableSecondDiscoveryResult = + splitPlanner.planSplits(emptyTableInitialDiscoveryResult.toPosition()); Assert.assertTrue(emptyTableSecondDiscoveryResult.splits().isEmpty()); Assert.assertTrue(emptyTableSecondDiscoveryResult.fromPosition().isEmpty()); Assert.assertNull(emptyTableSecondDiscoveryResult.fromPosition().snapshotTimestampMs()); @@ -142,23 +144,28 @@ public void testTableScanThenIncrementalWithEmptyTable() throws Exception { public void testTableScanThenIncrementalWithNonEmptyTable() throws Exception { appendTwoSnapshots(); - ScanContext scanContext = ScanContext.builder() - .startingStrategy(StreamingStartingStrategy.TABLE_SCAN_THEN_INCREMENTAL) - .build(); - ContinuousSplitPlannerImpl splitPlanner = new ContinuousSplitPlannerImpl( - tableResource.table(), scanContext, null); + ScanContext scanContext = + ScanContext.builder() + .startingStrategy(StreamingStartingStrategy.TABLE_SCAN_THEN_INCREMENTAL) + .build(); + ContinuousSplitPlannerImpl splitPlanner = + new ContinuousSplitPlannerImpl(tableResource.table(), scanContext, null); ContinuousEnumerationResult initialResult = splitPlanner.planSplits(null); Assert.assertNull(initialResult.fromPosition()); - Assert.assertEquals(snapshot2.snapshotId(), initialResult.toPosition().snapshotId().longValue()); - Assert.assertEquals(snapshot2.timestampMillis(), initialResult.toPosition().snapshotTimestampMs().longValue()); + Assert.assertEquals( + snapshot2.snapshotId(), initialResult.toPosition().snapshotId().longValue()); + Assert.assertEquals( + snapshot2.timestampMillis(), initialResult.toPosition().snapshotTimestampMs().longValue()); Assert.assertEquals(1, initialResult.splits().size()); IcebergSourceSplit split = Iterables.getOnlyElement(initialResult.splits()); Assert.assertEquals(2, split.task().files().size()); - Set discoveredFiles = split.task().files().stream() - .map(fileScanTask -> fileScanTask.file().path().toString()) - .collect(Collectors.toSet()); - Set expectedFiles = ImmutableSet.of(dataFile1.path().toString(), dataFile2.path().toString()); + Set discoveredFiles = + split.task().files().stream() + .map(fileScanTask -> fileScanTask.file().path().toString()) + .collect(Collectors.toSet()); + Set expectedFiles = + ImmutableSet.of(dataFile1.path().toString(), dataFile2.path().toString()); Assert.assertEquals(expectedFiles, discoveredFiles); IcebergEnumeratorPosition lastPosition = initialResult.toPosition(); @@ -169,12 +176,13 @@ public void testTableScanThenIncrementalWithNonEmptyTable() throws Exception { @Test public void testIncrementalFromLatestSnapshotWithEmptyTable() throws Exception { - ScanContext scanContext = ScanContext.builder() - .startingStrategy(StreamingStartingStrategy.INCREMENTAL_FROM_LATEST_SNAPSHOT) - .splitSize(1L) - .build(); - ContinuousSplitPlannerImpl splitPlanner = new ContinuousSplitPlannerImpl( - tableResource.table(), scanContext, null); + ScanContext scanContext = + ScanContext.builder() + 
.startingStrategy(StreamingStartingStrategy.INCREMENTAL_FROM_LATEST_SNAPSHOT) + .splitSize(1L) + .build(); + ContinuousSplitPlannerImpl splitPlanner = + new ContinuousSplitPlannerImpl(tableResource.table(), scanContext, null); ContinuousEnumerationResult emptyTableInitialDiscoveryResult = splitPlanner.planSplits(null); Assert.assertTrue(emptyTableInitialDiscoveryResult.splits().isEmpty()); @@ -182,8 +190,8 @@ public void testIncrementalFromLatestSnapshotWithEmptyTable() throws Exception { Assert.assertTrue(emptyTableInitialDiscoveryResult.toPosition().isEmpty()); Assert.assertNull(emptyTableInitialDiscoveryResult.toPosition().snapshotTimestampMs()); - ContinuousEnumerationResult emptyTableSecondDiscoveryResult = splitPlanner - .planSplits(emptyTableInitialDiscoveryResult.toPosition()); + ContinuousEnumerationResult emptyTableSecondDiscoveryResult = + splitPlanner.planSplits(emptyTableInitialDiscoveryResult.toPosition()); Assert.assertTrue(emptyTableSecondDiscoveryResult.splits().isEmpty()); Assert.assertTrue(emptyTableSecondDiscoveryResult.fromPosition().isEmpty()); Assert.assertNull(emptyTableSecondDiscoveryResult.fromPosition().snapshotTimestampMs()); @@ -192,8 +200,8 @@ public void testIncrementalFromLatestSnapshotWithEmptyTable() throws Exception { // latest mode should discover both snapshots, as latest position is marked by when job starts appendTwoSnapshots(); - ContinuousEnumerationResult afterTwoSnapshotsAppended = splitPlanner - .planSplits(emptyTableSecondDiscoveryResult.toPosition()); + ContinuousEnumerationResult afterTwoSnapshotsAppended = + splitPlanner.planSplits(emptyTableSecondDiscoveryResult.toPosition()); Assert.assertEquals(2, afterTwoSnapshotsAppended.splits().size()); // next 3 snapshots @@ -207,30 +215,37 @@ public void testIncrementalFromLatestSnapshotWithEmptyTable() throws Exception { public void testIncrementalFromLatestSnapshotWithNonEmptyTable() throws Exception { appendTwoSnapshots(); - ScanContext scanContext = ScanContext.builder() - .startingStrategy(StreamingStartingStrategy.INCREMENTAL_FROM_LATEST_SNAPSHOT) - .build(); - ContinuousSplitPlannerImpl splitPlanner = new ContinuousSplitPlannerImpl( - tableResource.table(), scanContext, null); + ScanContext scanContext = + ScanContext.builder() + .startingStrategy(StreamingStartingStrategy.INCREMENTAL_FROM_LATEST_SNAPSHOT) + .build(); + ContinuousSplitPlannerImpl splitPlanner = + new ContinuousSplitPlannerImpl(tableResource.table(), scanContext, null); ContinuousEnumerationResult initialResult = splitPlanner.planSplits(null); Assert.assertNull(initialResult.fromPosition()); // For inclusive behavior, the initial result should point to snapshot1 // Then the next incremental scan shall discover files from latest snapshot2 (inclusive) - Assert.assertEquals(snapshot1.snapshotId(), initialResult.toPosition().snapshotId().longValue()); - Assert.assertEquals(snapshot1.timestampMillis(), initialResult.toPosition().snapshotTimestampMs().longValue()); + Assert.assertEquals( + snapshot1.snapshotId(), initialResult.toPosition().snapshotId().longValue()); + Assert.assertEquals( + snapshot1.timestampMillis(), initialResult.toPosition().snapshotTimestampMs().longValue()); Assert.assertEquals(0, initialResult.splits().size()); ContinuousEnumerationResult secondResult = splitPlanner.planSplits(initialResult.toPosition()); - Assert.assertEquals(snapshot1.snapshotId(), secondResult.fromPosition().snapshotId().longValue()); - Assert.assertEquals(snapshot1.timestampMillis(), 
secondResult.fromPosition().snapshotTimestampMs().longValue()); + Assert.assertEquals( + snapshot1.snapshotId(), secondResult.fromPosition().snapshotId().longValue()); + Assert.assertEquals( + snapshot1.timestampMillis(), secondResult.fromPosition().snapshotTimestampMs().longValue()); Assert.assertEquals(snapshot2.snapshotId(), secondResult.toPosition().snapshotId().longValue()); - Assert.assertEquals(snapshot2.timestampMillis(), secondResult.toPosition().snapshotTimestampMs().longValue()); + Assert.assertEquals( + snapshot2.timestampMillis(), secondResult.toPosition().snapshotTimestampMs().longValue()); IcebergSourceSplit split = Iterables.getOnlyElement(secondResult.splits()); Assert.assertEquals(1, split.task().files().size()); - Set discoveredFiles = split.task().files().stream() - .map(fileScanTask -> fileScanTask.file().path().toString()) - .collect(Collectors.toSet()); + Set discoveredFiles = + split.task().files().stream() + .map(fileScanTask -> fileScanTask.file().path().toString()) + .collect(Collectors.toSet()); // should discover dataFile2 appended in snapshot2 Set expectedFiles = ImmutableSet.of(dataFile2.path().toString()); Assert.assertEquals(expectedFiles, discoveredFiles); @@ -243,11 +258,12 @@ public void testIncrementalFromLatestSnapshotWithNonEmptyTable() throws Exceptio @Test public void testIncrementalFromEarliestSnapshotWithEmptyTable() throws Exception { - ScanContext scanContext = ScanContext.builder() - .startingStrategy(StreamingStartingStrategy.INCREMENTAL_FROM_EARLIEST_SNAPSHOT) - .build(); - ContinuousSplitPlannerImpl splitPlanner = new ContinuousSplitPlannerImpl( - tableResource.table(), scanContext, null); + ScanContext scanContext = + ScanContext.builder() + .startingStrategy(StreamingStartingStrategy.INCREMENTAL_FROM_EARLIEST_SNAPSHOT) + .build(); + ContinuousSplitPlannerImpl splitPlanner = + new ContinuousSplitPlannerImpl(tableResource.table(), scanContext, null); ContinuousEnumerationResult emptyTableInitialDiscoveryResult = splitPlanner.planSplits(null); Assert.assertTrue(emptyTableInitialDiscoveryResult.splits().isEmpty()); @@ -255,8 +271,8 @@ public void testIncrementalFromEarliestSnapshotWithEmptyTable() throws Exception Assert.assertNull(emptyTableInitialDiscoveryResult.toPosition().snapshotId()); Assert.assertNull(emptyTableInitialDiscoveryResult.toPosition().snapshotTimestampMs()); - ContinuousEnumerationResult emptyTableSecondDiscoveryResult = splitPlanner - .planSplits(emptyTableInitialDiscoveryResult.toPosition()); + ContinuousEnumerationResult emptyTableSecondDiscoveryResult = + splitPlanner.planSplits(emptyTableInitialDiscoveryResult.toPosition()); Assert.assertTrue(emptyTableSecondDiscoveryResult.splits().isEmpty()); Assert.assertNull(emptyTableSecondDiscoveryResult.fromPosition().snapshotId()); Assert.assertNull(emptyTableSecondDiscoveryResult.fromPosition().snapshotTimestampMs()); @@ -274,11 +290,12 @@ public void testIncrementalFromEarliestSnapshotWithEmptyTable() throws Exception public void testIncrementalFromEarliestSnapshotWithNonEmptyTable() throws Exception { appendTwoSnapshots(); - ScanContext scanContext = ScanContext.builder() - .startingStrategy(StreamingStartingStrategy.INCREMENTAL_FROM_EARLIEST_SNAPSHOT) - .build(); - ContinuousSplitPlannerImpl splitPlanner = new ContinuousSplitPlannerImpl( - tableResource.table(), scanContext, null); + ScanContext scanContext = + ScanContext.builder() + .startingStrategy(StreamingStartingStrategy.INCREMENTAL_FROM_EARLIEST_SNAPSHOT) + .build(); + ContinuousSplitPlannerImpl splitPlanner = + new 
ContinuousSplitPlannerImpl(tableResource.table(), scanContext, null); ContinuousEnumerationResult initialResult = splitPlanner.planSplits(null); Assert.assertNull(initialResult.fromPosition()); @@ -292,14 +309,17 @@ public void testIncrementalFromEarliestSnapshotWithNonEmptyTable() throws Except Assert.assertNull(secondResult.fromPosition().snapshotId()); Assert.assertNull(secondResult.fromPosition().snapshotTimestampMs()); Assert.assertEquals(snapshot2.snapshotId(), secondResult.toPosition().snapshotId().longValue()); - Assert.assertEquals(snapshot2.timestampMillis(), secondResult.toPosition().snapshotTimestampMs().longValue()); + Assert.assertEquals( + snapshot2.timestampMillis(), secondResult.toPosition().snapshotTimestampMs().longValue()); IcebergSourceSplit split = Iterables.getOnlyElement(secondResult.splits()); Assert.assertEquals(2, split.task().files().size()); - Set discoveredFiles = split.task().files().stream() - .map(fileScanTask -> fileScanTask.file().path().toString()) - .collect(Collectors.toSet()); + Set discoveredFiles = + split.task().files().stream() + .map(fileScanTask -> fileScanTask.file().path().toString()) + .collect(Collectors.toSet()); // should discover files appended in both snapshot1 and snapshot2 - Set expectedFiles = ImmutableSet.of(dataFile1.path().toString(), dataFile2.path().toString()); + Set expectedFiles = + ImmutableSet.of(dataFile1.path().toString(), dataFile2.path().toString()); Assert.assertEquals(expectedFiles, discoveredFiles); IcebergEnumeratorPosition lastPosition = secondResult.toPosition(); @@ -310,14 +330,17 @@ public void testIncrementalFromEarliestSnapshotWithNonEmptyTable() throws Except @Test public void testIncrementalFromSnapshotIdWithEmptyTable() throws Exception { - ScanContext scanContextWithInvalidSnapshotId = ScanContext.builder() - .startingStrategy(StreamingStartingStrategy.INCREMENTAL_FROM_SNAPSHOT_ID) - .startSnapshotId(1L) - .build(); - ContinuousSplitPlannerImpl splitPlanner = new ContinuousSplitPlannerImpl( - tableResource.table(), scanContextWithInvalidSnapshotId, null); - - AssertHelpers.assertThrows("Should detect invalid starting snapshot id", + ScanContext scanContextWithInvalidSnapshotId = + ScanContext.builder() + .startingStrategy(StreamingStartingStrategy.INCREMENTAL_FROM_SNAPSHOT_ID) + .startSnapshotId(1L) + .build(); + ContinuousSplitPlannerImpl splitPlanner = + new ContinuousSplitPlannerImpl( + tableResource.table(), scanContextWithInvalidSnapshotId, null); + + AssertHelpers.assertThrows( + "Should detect invalid starting snapshot id", IllegalArgumentException.class, "Start snapshot id not found in history: 1", () -> splitPlanner.planSplits(null)); @@ -329,19 +352,23 @@ public void testIncrementalFromSnapshotIdWithInvalidIds() throws Exception { // find an invalid snapshotId long invalidSnapshotId = 0L; - while (invalidSnapshotId == snapshot1.snapshotId() || invalidSnapshotId == snapshot2.snapshotId()) { + while (invalidSnapshotId == snapshot1.snapshotId() + || invalidSnapshotId == snapshot2.snapshotId()) { invalidSnapshotId++; } - ScanContext scanContextWithInvalidSnapshotId = ScanContext.builder() - .startingStrategy(StreamingStartingStrategy.INCREMENTAL_FROM_SNAPSHOT_ID) - .startSnapshotId(invalidSnapshotId) - .build(); + ScanContext scanContextWithInvalidSnapshotId = + ScanContext.builder() + .startingStrategy(StreamingStartingStrategy.INCREMENTAL_FROM_SNAPSHOT_ID) + .startSnapshotId(invalidSnapshotId) + .build(); - ContinuousSplitPlannerImpl splitPlanner = new ContinuousSplitPlannerImpl( - 
tableResource.table(), scanContextWithInvalidSnapshotId, null); + ContinuousSplitPlannerImpl splitPlanner = + new ContinuousSplitPlannerImpl( + tableResource.table(), scanContextWithInvalidSnapshotId, null); - AssertHelpers.assertThrows("Should detect invalid starting snapshot id", + AssertHelpers.assertThrows( + "Should detect invalid starting snapshot id", IllegalArgumentException.class, "Start snapshot id not found in history: " + invalidSnapshotId, () -> splitPlanner.planSplits(null)); @@ -351,30 +378,38 @@ public void testIncrementalFromSnapshotIdWithInvalidIds() throws Exception { public void testIncrementalFromSnapshotId() throws Exception { appendTwoSnapshots(); - ScanContext scanContext = ScanContext.builder() - .startingStrategy(StreamingStartingStrategy.INCREMENTAL_FROM_SNAPSHOT_ID) - .startSnapshotId(snapshot2.snapshotId()) - .build(); - ContinuousSplitPlannerImpl splitPlanner = new ContinuousSplitPlannerImpl( - tableResource.table(), scanContext, null); + ScanContext scanContext = + ScanContext.builder() + .startingStrategy(StreamingStartingStrategy.INCREMENTAL_FROM_SNAPSHOT_ID) + .startSnapshotId(snapshot2.snapshotId()) + .build(); + ContinuousSplitPlannerImpl splitPlanner = + new ContinuousSplitPlannerImpl(tableResource.table(), scanContext, null); ContinuousEnumerationResult initialResult = splitPlanner.planSplits(null); Assert.assertNull(initialResult.fromPosition()); - // For inclusive behavior of snapshot2, the initial result should point to snapshot1 (as snapshot2's parent) - Assert.assertEquals(snapshot1.snapshotId(), initialResult.toPosition().snapshotId().longValue()); - Assert.assertEquals(snapshot1.timestampMillis(), initialResult.toPosition().snapshotTimestampMs().longValue()); + // For inclusive behavior of snapshot2, the initial result should point to snapshot1 (as + // snapshot2's parent) + Assert.assertEquals( + snapshot1.snapshotId(), initialResult.toPosition().snapshotId().longValue()); + Assert.assertEquals( + snapshot1.timestampMillis(), initialResult.toPosition().snapshotTimestampMs().longValue()); Assert.assertEquals(0, initialResult.splits().size()); ContinuousEnumerationResult secondResult = splitPlanner.planSplits(initialResult.toPosition()); - Assert.assertEquals(snapshot1.snapshotId(), secondResult.fromPosition().snapshotId().longValue()); - Assert.assertEquals(snapshot1.timestampMillis(), secondResult.fromPosition().snapshotTimestampMs().longValue()); + Assert.assertEquals( + snapshot1.snapshotId(), secondResult.fromPosition().snapshotId().longValue()); + Assert.assertEquals( + snapshot1.timestampMillis(), secondResult.fromPosition().snapshotTimestampMs().longValue()); Assert.assertEquals(snapshot2.snapshotId(), secondResult.toPosition().snapshotId().longValue()); - Assert.assertEquals(snapshot2.timestampMillis(), secondResult.toPosition().snapshotTimestampMs().longValue()); + Assert.assertEquals( + snapshot2.timestampMillis(), secondResult.toPosition().snapshotTimestampMs().longValue()); IcebergSourceSplit split = Iterables.getOnlyElement(secondResult.splits()); Assert.assertEquals(1, split.task().files().size()); - Set discoveredFiles = split.task().files().stream() - .map(fileScanTask -> fileScanTask.file().path().toString()) - .collect(Collectors.toSet()); + Set discoveredFiles = + split.task().files().stream() + .map(fileScanTask -> fileScanTask.file().path().toString()) + .collect(Collectors.toSet()); // should discover dataFile2 appended in snapshot2 Set expectedFiles = ImmutableSet.of(dataFile2.path().toString()); 
Assert.assertEquals(expectedFiles, discoveredFiles); @@ -387,14 +422,17 @@ public void testIncrementalFromSnapshotId() throws Exception { @Test public void testIncrementalFromSnapshotTimestampWithEmptyTable() throws Exception { - ScanContext scanContextWithInvalidSnapshotId = ScanContext.builder() - .startingStrategy(StreamingStartingStrategy.INCREMENTAL_FROM_SNAPSHOT_TIMESTAMP) - .startSnapshotTimestamp(1L) - .build(); - ContinuousSplitPlannerImpl splitPlanner = new ContinuousSplitPlannerImpl( - tableResource.table(), scanContextWithInvalidSnapshotId, null); - - AssertHelpers.assertThrows("Should detect invalid starting snapshot timestamp", + ScanContext scanContextWithInvalidSnapshotId = + ScanContext.builder() + .startingStrategy(StreamingStartingStrategy.INCREMENTAL_FROM_SNAPSHOT_TIMESTAMP) + .startSnapshotTimestamp(1L) + .build(); + ContinuousSplitPlannerImpl splitPlanner = + new ContinuousSplitPlannerImpl( + tableResource.table(), scanContextWithInvalidSnapshotId, null); + + AssertHelpers.assertThrows( + "Should detect invalid starting snapshot timestamp", IllegalArgumentException.class, "Cannot find a snapshot older than 1970-01-01T00:00:00.001+00:00", () -> splitPlanner.planSplits(null)); @@ -405,17 +443,21 @@ public void testIncrementalFromSnapshotTimestampWithInvalidIds() throws Exceptio appendTwoSnapshots(); long invalidSnapshotTimestampMs = snapshot1.timestampMillis() - 1000L; - String invalidSnapshotTimestampMsStr = DateTimeUtil.formatTimestampMillis(invalidSnapshotTimestampMs); + String invalidSnapshotTimestampMsStr = + DateTimeUtil.formatTimestampMillis(invalidSnapshotTimestampMs); - ScanContext scanContextWithInvalidSnapshotId = ScanContext.builder() - .startingStrategy(StreamingStartingStrategy.INCREMENTAL_FROM_SNAPSHOT_TIMESTAMP) - .startSnapshotTimestamp(invalidSnapshotTimestampMs) - .build(); + ScanContext scanContextWithInvalidSnapshotId = + ScanContext.builder() + .startingStrategy(StreamingStartingStrategy.INCREMENTAL_FROM_SNAPSHOT_TIMESTAMP) + .startSnapshotTimestamp(invalidSnapshotTimestampMs) + .build(); - ContinuousSplitPlannerImpl splitPlanner = new ContinuousSplitPlannerImpl( - tableResource.table(), scanContextWithInvalidSnapshotId, null); + ContinuousSplitPlannerImpl splitPlanner = + new ContinuousSplitPlannerImpl( + tableResource.table(), scanContextWithInvalidSnapshotId, null); - AssertHelpers.assertThrows("Should detect invalid starting snapshot timestamp", + AssertHelpers.assertThrows( + "Should detect invalid starting snapshot timestamp", IllegalArgumentException.class, "Cannot find a snapshot older than " + invalidSnapshotTimestampMsStr, () -> splitPlanner.planSplits(null)); @@ -425,30 +467,37 @@ public void testIncrementalFromSnapshotTimestampWithInvalidIds() throws Exceptio public void testIncrementalFromSnapshotTimestamp() throws Exception { appendTwoSnapshots(); - ScanContext scanContext = ScanContext.builder() - .startingStrategy(StreamingStartingStrategy.INCREMENTAL_FROM_SNAPSHOT_TIMESTAMP) - .startSnapshotTimestamp(snapshot2.timestampMillis()) - .build(); - ContinuousSplitPlannerImpl splitPlanner = new ContinuousSplitPlannerImpl( - tableResource.table(), scanContext, null); + ScanContext scanContext = + ScanContext.builder() + .startingStrategy(StreamingStartingStrategy.INCREMENTAL_FROM_SNAPSHOT_TIMESTAMP) + .startSnapshotTimestamp(snapshot2.timestampMillis()) + .build(); + ContinuousSplitPlannerImpl splitPlanner = + new ContinuousSplitPlannerImpl(tableResource.table(), scanContext, null); ContinuousEnumerationResult initialResult = 
splitPlanner.planSplits(null); Assert.assertNull(initialResult.fromPosition()); // For inclusive behavior, the initial result should point to snapshot1 (as snapshot2's parent). - Assert.assertEquals(snapshot1.snapshotId(), initialResult.toPosition().snapshotId().longValue()); - Assert.assertEquals(snapshot1.timestampMillis(), initialResult.toPosition().snapshotTimestampMs().longValue()); + Assert.assertEquals( + snapshot1.snapshotId(), initialResult.toPosition().snapshotId().longValue()); + Assert.assertEquals( + snapshot1.timestampMillis(), initialResult.toPosition().snapshotTimestampMs().longValue()); Assert.assertEquals(0, initialResult.splits().size()); ContinuousEnumerationResult secondResult = splitPlanner.planSplits(initialResult.toPosition()); - Assert.assertEquals(snapshot1.snapshotId(), secondResult.fromPosition().snapshotId().longValue()); - Assert.assertEquals(snapshot1.timestampMillis(), secondResult.fromPosition().snapshotTimestampMs().longValue()); + Assert.assertEquals( + snapshot1.snapshotId(), secondResult.fromPosition().snapshotId().longValue()); + Assert.assertEquals( + snapshot1.timestampMillis(), secondResult.fromPosition().snapshotTimestampMs().longValue()); Assert.assertEquals(snapshot2.snapshotId(), secondResult.toPosition().snapshotId().longValue()); - Assert.assertEquals(snapshot2.timestampMillis(), secondResult.toPosition().snapshotTimestampMs().longValue()); + Assert.assertEquals( + snapshot2.timestampMillis(), secondResult.toPosition().snapshotTimestampMs().longValue()); IcebergSourceSplit split = Iterables.getOnlyElement(secondResult.splits()); Assert.assertEquals(1, split.task().files().size()); - Set discoveredFiles = split.task().files().stream() - .map(fileScanTask -> fileScanTask.file().path().toString()) - .collect(Collectors.toSet()); + Set discoveredFiles = + split.task().files().stream() + .map(fileScanTask -> fileScanTask.file().path().toString()) + .collect(Collectors.toSet()); // should discover dataFile2 appended in snapshot2 Set expectedFiles = ImmutableSet.of(dataFile2.path().toString()); Assert.assertEquals(expectedFiles, discoveredFiles); diff --git a/flink/v1.15/flink/src/test/java/org/apache/iceberg/flink/source/enumerator/TestContinuousSplitPlannerImplStartStrategy.java b/flink/v1.15/flink/src/test/java/org/apache/iceberg/flink/source/enumerator/TestContinuousSplitPlannerImplStartStrategy.java index bc9cd4934a96..f3e9413d8819 100644 --- a/flink/v1.15/flink/src/test/java/org/apache/iceberg/flink/source/enumerator/TestContinuousSplitPlannerImplStartStrategy.java +++ b/flink/v1.15/flink/src/test/java/org/apache/iceberg/flink/source/enumerator/TestContinuousSplitPlannerImplStartStrategy.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.flink.source.enumerator; import java.io.IOException; @@ -43,12 +42,10 @@ public class TestContinuousSplitPlannerImplStartStrategy { private static final FileFormat FILE_FORMAT = FileFormat.PARQUET; public final TemporaryFolder temporaryFolder = new TemporaryFolder(); - public final HadoopTableResource tableResource = new HadoopTableResource(temporaryFolder, - TestFixtures.DATABASE, TestFixtures.TABLE, TestFixtures.SCHEMA); - @Rule - public final TestRule chain = RuleChain - .outerRule(temporaryFolder) - .around(tableResource); + public final HadoopTableResource tableResource = + new HadoopTableResource( + temporaryFolder, TestFixtures.DATABASE, TestFixtures.TABLE, TestFixtures.SCHEMA); + @Rule public final TestRule chain = RuleChain.outerRule(temporaryFolder).around(tableResource); private GenericAppenderHelper dataAppender; private Snapshot snapshot1; @@ -75,99 +72,120 @@ private void appendThreeSnapshots() throws IOException { } @Test - public void testTableScanThenIncrementalStrategy() throws IOException { - ScanContext scanContext = ScanContext.builder() - .streaming(true) - .startingStrategy(StreamingStartingStrategy.TABLE_SCAN_THEN_INCREMENTAL) - .build(); + public void testTableScanThenIncrementalStrategy() throws IOException { + ScanContext scanContext = + ScanContext.builder() + .streaming(true) + .startingStrategy(StreamingStartingStrategy.TABLE_SCAN_THEN_INCREMENTAL) + .build(); // emtpy table - Assert.assertFalse(ContinuousSplitPlannerImpl.startSnapshot(tableResource.table(), scanContext).isPresent()); + Assert.assertFalse( + ContinuousSplitPlannerImpl.startSnapshot(tableResource.table(), scanContext).isPresent()); appendThreeSnapshots(); - Snapshot startSnapshot = ContinuousSplitPlannerImpl.startSnapshot(tableResource.table(), scanContext).get(); + Snapshot startSnapshot = + ContinuousSplitPlannerImpl.startSnapshot(tableResource.table(), scanContext).get(); Assert.assertEquals(snapshot3.snapshotId(), startSnapshot.snapshotId()); } @Test public void testForLatestSnapshotStrategy() throws IOException { - ScanContext scanContext = ScanContext.builder() - .streaming(true) - .startingStrategy(StreamingStartingStrategy.INCREMENTAL_FROM_LATEST_SNAPSHOT) - .build(); + ScanContext scanContext = + ScanContext.builder() + .streaming(true) + .startingStrategy(StreamingStartingStrategy.INCREMENTAL_FROM_LATEST_SNAPSHOT) + .build(); // emtpy table - Assert.assertFalse(ContinuousSplitPlannerImpl.startSnapshot(tableResource.table(), scanContext).isPresent()); + Assert.assertFalse( + ContinuousSplitPlannerImpl.startSnapshot(tableResource.table(), scanContext).isPresent()); appendThreeSnapshots(); - Snapshot startSnapshot = ContinuousSplitPlannerImpl.startSnapshot(tableResource.table(), scanContext).get(); + Snapshot startSnapshot = + ContinuousSplitPlannerImpl.startSnapshot(tableResource.table(), scanContext).get(); Assert.assertEquals(snapshot3.snapshotId(), startSnapshot.snapshotId()); } @Test public void testForEarliestSnapshotStrategy() throws IOException { - ScanContext scanContext = ScanContext.builder() - .streaming(true) - .startingStrategy(StreamingStartingStrategy.INCREMENTAL_FROM_EARLIEST_SNAPSHOT) - .build(); + ScanContext scanContext = + ScanContext.builder() + .streaming(true) + .startingStrategy(StreamingStartingStrategy.INCREMENTAL_FROM_EARLIEST_SNAPSHOT) + .build(); // emtpy table - Assert.assertFalse(ContinuousSplitPlannerImpl.startSnapshot(tableResource.table(), scanContext).isPresent()); + Assert.assertFalse( + 
ContinuousSplitPlannerImpl.startSnapshot(tableResource.table(), scanContext).isPresent()); appendThreeSnapshots(); - Snapshot startSnapshot = ContinuousSplitPlannerImpl.startSnapshot(tableResource.table(), scanContext).get(); + Snapshot startSnapshot = + ContinuousSplitPlannerImpl.startSnapshot(tableResource.table(), scanContext).get(); Assert.assertEquals(snapshot1.snapshotId(), startSnapshot.snapshotId()); } @Test public void testForSpecificSnapshotIdStrategy() throws IOException { - ScanContext scanContextInvalidSnapshotId = ScanContext.builder() - .streaming(true) - .startingStrategy(StreamingStartingStrategy.INCREMENTAL_FROM_SNAPSHOT_ID) - .startSnapshotId(1L) - .build(); + ScanContext scanContextInvalidSnapshotId = + ScanContext.builder() + .streaming(true) + .startingStrategy(StreamingStartingStrategy.INCREMENTAL_FROM_SNAPSHOT_ID) + .startSnapshotId(1L) + .build(); // emtpy table - AssertHelpers.assertThrows("Should detect invalid starting snapshot id", + AssertHelpers.assertThrows( + "Should detect invalid starting snapshot id", IllegalArgumentException.class, "Start snapshot id not found in history: 1", - () -> ContinuousSplitPlannerImpl.startSnapshot(tableResource.table(), scanContextInvalidSnapshotId)); + () -> + ContinuousSplitPlannerImpl.startSnapshot( + tableResource.table(), scanContextInvalidSnapshotId)); appendThreeSnapshots(); - ScanContext scanContext = ScanContext.builder() - .streaming(true) - .startingStrategy(StreamingStartingStrategy.INCREMENTAL_FROM_SNAPSHOT_ID) - .startSnapshotId(snapshot2.snapshotId()) - .build(); + ScanContext scanContext = + ScanContext.builder() + .streaming(true) + .startingStrategy(StreamingStartingStrategy.INCREMENTAL_FROM_SNAPSHOT_ID) + .startSnapshotId(snapshot2.snapshotId()) + .build(); - Snapshot startSnapshot = ContinuousSplitPlannerImpl.startSnapshot(tableResource.table(), scanContext).get(); + Snapshot startSnapshot = + ContinuousSplitPlannerImpl.startSnapshot(tableResource.table(), scanContext).get(); Assert.assertEquals(snapshot2.snapshotId(), startSnapshot.snapshotId()); } @Test public void testForSpecificSnapshotTimestampStrategySnapshot2() throws IOException { - ScanContext scanContextInvalidSnapshotTimestamp = ScanContext.builder() - .streaming(true) - .startingStrategy(StreamingStartingStrategy.INCREMENTAL_FROM_SNAPSHOT_TIMESTAMP) - .startSnapshotTimestamp(1L) - .build(); + ScanContext scanContextInvalidSnapshotTimestamp = + ScanContext.builder() + .streaming(true) + .startingStrategy(StreamingStartingStrategy.INCREMENTAL_FROM_SNAPSHOT_TIMESTAMP) + .startSnapshotTimestamp(1L) + .build(); // emtpy table - AssertHelpers.assertThrows("Should detect invalid starting snapshot timestamp", + AssertHelpers.assertThrows( + "Should detect invalid starting snapshot timestamp", IllegalArgumentException.class, "Cannot find a snapshot older than 1970-01-01T00:00:00.001+00:00", - () -> ContinuousSplitPlannerImpl.startSnapshot(tableResource.table(), scanContextInvalidSnapshotTimestamp)); + () -> + ContinuousSplitPlannerImpl.startSnapshot( + tableResource.table(), scanContextInvalidSnapshotTimestamp)); appendThreeSnapshots(); - ScanContext scanContext = ScanContext.builder() - .streaming(true) - .startingStrategy(StreamingStartingStrategy.INCREMENTAL_FROM_SNAPSHOT_TIMESTAMP) - .startSnapshotTimestamp(snapshot2.timestampMillis()) - .build(); + ScanContext scanContext = + ScanContext.builder() + .streaming(true) + .startingStrategy(StreamingStartingStrategy.INCREMENTAL_FROM_SNAPSHOT_TIMESTAMP) + 
.startSnapshotTimestamp(snapshot2.timestampMillis()) + .build(); - Snapshot startSnapshot = ContinuousSplitPlannerImpl.startSnapshot(tableResource.table(), scanContext).get(); + Snapshot startSnapshot = + ContinuousSplitPlannerImpl.startSnapshot(tableResource.table(), scanContext).get(); Assert.assertEquals(snapshot2.snapshotId(), startSnapshot.snapshotId()); } @@ -175,13 +193,15 @@ public void testForSpecificSnapshotTimestampStrategySnapshot2() throws IOExcepti public void testForSpecificSnapshotTimestampStrategySnapshot2Minus1() throws IOException { appendThreeSnapshots(); - ScanContext config = ScanContext.builder() - .streaming(true) - .startingStrategy(StreamingStartingStrategy.INCREMENTAL_FROM_SNAPSHOT_TIMESTAMP) - .startSnapshotTimestamp(snapshot2.timestampMillis() - 1L) - .build(); + ScanContext config = + ScanContext.builder() + .streaming(true) + .startingStrategy(StreamingStartingStrategy.INCREMENTAL_FROM_SNAPSHOT_TIMESTAMP) + .startSnapshotTimestamp(snapshot2.timestampMillis() - 1L) + .build(); - Snapshot startSnapshot = ContinuousSplitPlannerImpl.startSnapshot(tableResource.table(), config).get(); + Snapshot startSnapshot = + ContinuousSplitPlannerImpl.startSnapshot(tableResource.table(), config).get(); Assert.assertEquals(snapshot2.snapshotId(), startSnapshot.snapshotId()); } } diff --git a/flink/v1.15/flink/src/test/java/org/apache/iceberg/flink/source/enumerator/TestIcebergEnumeratorStateSerializer.java b/flink/v1.15/flink/src/test/java/org/apache/iceberg/flink/source/enumerator/TestIcebergEnumeratorStateSerializer.java index 05abe7bc1792..33ff58c52f4a 100644 --- a/flink/v1.15/flink/src/test/java/org/apache/iceberg/flink/source/enumerator/TestIcebergEnumeratorStateSerializer.java +++ b/flink/v1.15/flink/src/test/java/org/apache/iceberg/flink/source/enumerator/TestIcebergEnumeratorStateSerializer.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.flink.source.enumerator; import java.util.Collection; @@ -34,10 +33,10 @@ import org.junit.rules.TemporaryFolder; public class TestIcebergEnumeratorStateSerializer { - @ClassRule - public static final TemporaryFolder TEMPORARY_FOLDER = new TemporaryFolder(); + @ClassRule public static final TemporaryFolder TEMPORARY_FOLDER = new TemporaryFolder(); - private final IcebergEnumeratorStateSerializer serializer = IcebergEnumeratorStateSerializer.INSTANCE; + private final IcebergEnumeratorStateSerializer serializer = + IcebergEnumeratorStateSerializer.INSTANCE; @Test public void testEmptySnapshotIdAndPendingSplits() throws Exception { @@ -49,8 +48,10 @@ public void testEmptySnapshotIdAndPendingSplits() throws Exception { @Test public void testSomeSnapshotIdAndEmptyPendingSplits() throws Exception { - IcebergEnumeratorPosition position = IcebergEnumeratorPosition.of(1L, System.currentTimeMillis()); - IcebergEnumeratorState enumeratorState = new IcebergEnumeratorState(position, Collections.emptyList()); + IcebergEnumeratorPosition position = + IcebergEnumeratorPosition.of(1L, System.currentTimeMillis()); + IcebergEnumeratorState enumeratorState = + new IcebergEnumeratorState(position, Collections.emptyList()); byte[] result = serializer.serialize(enumeratorState); IcebergEnumeratorState deserialized = serializer.deserialize(serializer.getVersion(), result); assertEnumeratorStateEquals(enumeratorState, deserialized); @@ -58,13 +59,17 @@ public void testSomeSnapshotIdAndEmptyPendingSplits() throws Exception { @Test public void testSomeSnapshotIdAndPendingSplits() throws Exception { - IcebergEnumeratorPosition position = IcebergEnumeratorPosition.of(2L, System.currentTimeMillis()); - List splits = SplitHelpers - .createSplitsFromTransientHadoopTable(TEMPORARY_FOLDER, 3, 1); + IcebergEnumeratorPosition position = + IcebergEnumeratorPosition.of(2L, System.currentTimeMillis()); + List splits = + SplitHelpers.createSplitsFromTransientHadoopTable(TEMPORARY_FOLDER, 3, 1); Collection pendingSplits = Lists.newArrayList(); - pendingSplits.add(new IcebergSourceSplitState(splits.get(0), IcebergSourceSplitStatus.UNASSIGNED)); - pendingSplits.add(new IcebergSourceSplitState(splits.get(1), IcebergSourceSplitStatus.ASSIGNED)); - pendingSplits.add(new IcebergSourceSplitState(splits.get(2), IcebergSourceSplitStatus.COMPLETED)); + pendingSplits.add( + new IcebergSourceSplitState(splits.get(0), IcebergSourceSplitStatus.UNASSIGNED)); + pendingSplits.add( + new IcebergSourceSplitState(splits.get(1), IcebergSourceSplitStatus.ASSIGNED)); + pendingSplits.add( + new IcebergSourceSplitState(splits.get(2), IcebergSourceSplitStatus.COMPLETED)); IcebergEnumeratorState enumeratorState = new IcebergEnumeratorState(position, pendingSplits); byte[] result = serializer.serialize(enumeratorState); @@ -72,7 +77,8 @@ public void testSomeSnapshotIdAndPendingSplits() throws Exception { assertEnumeratorStateEquals(enumeratorState, deserialized); } - private void assertEnumeratorStateEquals(IcebergEnumeratorState expected, IcebergEnumeratorState actual) { + private void assertEnumeratorStateEquals( + IcebergEnumeratorState expected, IcebergEnumeratorState actual) { Assert.assertEquals(expected.lastEnumeratedPosition(), actual.lastEnumeratedPosition()); Assert.assertEquals(expected.pendingSplits().size(), actual.pendingSplits().size()); Iterator expectedIterator = expected.pendingSplits().iterator(); @@ -81,8 +87,10 @@ private void assertEnumeratorStateEquals(IcebergEnumeratorState expected, Iceber 
IcebergSourceSplitState expectedSplitState = expectedIterator.next(); IcebergSourceSplitState actualSplitState = actualIterator.next(); Assert.assertEquals(expectedSplitState.split().splitId(), actualSplitState.split().splitId()); - Assert.assertEquals(expectedSplitState.split().fileOffset(), actualSplitState.split().fileOffset()); - Assert.assertEquals(expectedSplitState.split().recordOffset(), actualSplitState.split().recordOffset()); + Assert.assertEquals( + expectedSplitState.split().fileOffset(), actualSplitState.split().fileOffset()); + Assert.assertEquals( + expectedSplitState.split().recordOffset(), actualSplitState.split().recordOffset()); Assert.assertEquals(expectedSplitState.status(), actualSplitState.status()); } } diff --git a/flink/v1.15/flink/src/test/java/org/apache/iceberg/flink/source/reader/ReaderFunctionTestBase.java b/flink/v1.15/flink/src/test/java/org/apache/iceberg/flink/source/reader/ReaderFunctionTestBase.java index e186a2b2a14a..720b0c25d12a 100644 --- a/flink/v1.15/flink/src/test/java/org/apache/iceberg/flink/source/reader/ReaderFunctionTestBase.java +++ b/flink/v1.15/flink/src/test/java/org/apache/iceberg/flink/source/reader/ReaderFunctionTestBase.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.flink.source.reader; import java.io.IOException; @@ -46,15 +45,14 @@ public abstract class ReaderFunctionTestBase { @Parameterized.Parameters(name = "fileFormat={0}") public static Object[][] parameters() { - return new Object[][]{ - new Object[]{FileFormat.AVRO}, - new Object[]{FileFormat.ORC}, - new Object[]{FileFormat.PARQUET} + return new Object[][] { + new Object[] {FileFormat.AVRO}, + new Object[] {FileFormat.ORC}, + new Object[] {FileFormat.PARQUET} }; } - @ClassRule - public static final TemporaryFolder TEMPORARY_FOLDER = new TemporaryFolder(); + @ClassRule public static final TemporaryFolder TEMPORARY_FOLDER = new TemporaryFolder(); protected abstract ReaderFunction readerFunction(); @@ -78,11 +76,13 @@ private List> createRecordBatchList(int batchCount) { return recordBatchList; } - private CombinedScanTask createCombinedScanTask(List> recordBatchList) throws IOException { + private CombinedScanTask createCombinedScanTask(List> recordBatchList) + throws IOException { List fileTasks = Lists.newArrayListWithCapacity(recordBatchList.size()); for (int i = 0; i < recordBatchList.size(); ++i) { - FileScanTask fileTask = ReaderUtil.createFileTask( - recordBatchList.get(i), TEMPORARY_FOLDER.newFile(), fileFormat, appenderFactory); + FileScanTask fileTask = + ReaderUtil.createFileTask( + recordBatchList.get(i), TEMPORARY_FOLDER.newFile(), fileFormat, appenderFactory); fileTasks.add(fileTask); } @@ -90,7 +90,9 @@ private CombinedScanTask createCombinedScanTask(List> recordBatchLi } private void assertRecordsAndPosition( - List expectedRecords, int expectedFileOffset, long startRecordOffset, + List expectedRecords, + int expectedFileOffset, + long startRecordOffset, RecordsWithSplitIds> batch) { batch.nextSplit(); List actualRecords = Lists.newArrayList(); @@ -98,8 +100,10 @@ private void assertRecordsAndPosition( RecordAndPosition recordAndPosition; while ((recordAndPosition = batch.nextRecordFromSplit()) != null) { actualRecords.add(recordAndPosition.record()); - Assert.assertEquals("expected file offset", expectedFileOffset, recordAndPosition.fileOffset()); - Assert.assertEquals("expected record offset", recordOffset, recordAndPosition.recordOffset() - 1); + Assert.assertEquals( + 
"expected file offset", expectedFileOffset, recordAndPosition.fileOffset()); + Assert.assertEquals( + "expected record offset", recordOffset, recordAndPosition.recordOffset() - 1); recordOffset++; } @@ -112,7 +116,8 @@ public void testNoCheckpointedPosition() throws IOException { List> recordBatchList = createRecordBatchList(3); CombinedScanTask combinedScanTask = createCombinedScanTask(recordBatchList); IcebergSourceSplit split = IcebergSourceSplit.fromCombinedScanTask(combinedScanTask); - CloseableIterator>> reader = readerFunction().apply(split); + CloseableIterator>> reader = + readerFunction().apply(split); RecordsWithSplitIds> batch0 = reader.next(); assertRecordsAndPosition(recordBatchList.get(0), 0, 0L, batch0); @@ -132,7 +137,8 @@ public void testCheckpointedPositionBeforeFirstFile() throws IOException { List> recordBatchList = createRecordBatchList(3); CombinedScanTask combinedScanTask = createCombinedScanTask(recordBatchList); IcebergSourceSplit split = IcebergSourceSplit.fromCombinedScanTask(combinedScanTask, 0, 0L); - CloseableIterator>> reader = readerFunction().apply(split); + CloseableIterator>> reader = + readerFunction().apply(split); RecordsWithSplitIds> batch0 = reader.next(); assertRecordsAndPosition(recordBatchList.get(0), 0, 0L, batch0); @@ -152,7 +158,8 @@ public void testCheckpointedPositionMiddleFirstFile() throws IOException { List> recordBatchList = createRecordBatchList(3); CombinedScanTask combinedScanTask = createCombinedScanTask(recordBatchList); IcebergSourceSplit split = IcebergSourceSplit.fromCombinedScanTask(combinedScanTask, 0, 1L); - CloseableIterator>> reader = readerFunction().apply(split); + CloseableIterator>> reader = + readerFunction().apply(split); RecordsWithSplitIds> batch0 = reader.next(); assertRecordsAndPosition(recordBatchList.get(0).subList(1, 2), 0, 1L, batch0); @@ -172,7 +179,8 @@ public void testCheckpointedPositionAfterFirstFile() throws IOException { List> recordBatchList = createRecordBatchList(3); CombinedScanTask combinedScanTask = createCombinedScanTask(recordBatchList); IcebergSourceSplit split = IcebergSourceSplit.fromCombinedScanTask(combinedScanTask, 0, 2L); - CloseableIterator>> reader = readerFunction().apply(split); + CloseableIterator>> reader = + readerFunction().apply(split); RecordsWithSplitIds> batch1 = reader.next(); assertRecordsAndPosition(recordBatchList.get(1), 1, 0L, batch1); @@ -188,7 +196,8 @@ public void testCheckpointedPositionBeforeSecondFile() throws IOException { List> recordBatchList = createRecordBatchList(3); CombinedScanTask combinedScanTask = createCombinedScanTask(recordBatchList); IcebergSourceSplit split = IcebergSourceSplit.fromCombinedScanTask(combinedScanTask, 1, 0L); - CloseableIterator>> reader = readerFunction().apply(split); + CloseableIterator>> reader = + readerFunction().apply(split); RecordsWithSplitIds> batch1 = reader.next(); assertRecordsAndPosition(recordBatchList.get(1), 1, 0L, batch1); @@ -204,7 +213,8 @@ public void testCheckpointedPositionMidSecondFile() throws IOException { List> recordBatchList = createRecordBatchList(3); CombinedScanTask combinedScanTask = createCombinedScanTask(recordBatchList); IcebergSourceSplit split = IcebergSourceSplit.fromCombinedScanTask(combinedScanTask, 1, 1L); - CloseableIterator>> reader = readerFunction().apply(split); + CloseableIterator>> reader = + readerFunction().apply(split); RecordsWithSplitIds> batch1 = reader.next(); assertRecordsAndPosition(recordBatchList.get(1).subList(1, 2), 1, 1L, batch1); @@ -214,5 +224,4 @@ public void 
testCheckpointedPositionMidSecondFile() throws IOException { assertRecordsAndPosition(recordBatchList.get(2), 2, 0L, batch2); batch2.recycle(); } - } diff --git a/flink/v1.15/flink/src/test/java/org/apache/iceberg/flink/source/reader/ReaderUtil.java b/flink/v1.15/flink/src/test/java/org/apache/iceberg/flink/source/reader/ReaderUtil.java index 2431ee0fce61..7d1d41173331 100644 --- a/flink/v1.15/flink/src/test/java/org/apache/iceberg/flink/source/reader/ReaderUtil.java +++ b/flink/v1.15/flink/src/test/java/org/apache/iceberg/flink/source/reader/ReaderUtil.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.flink.source.reader; import java.io.File; @@ -46,30 +45,41 @@ public class ReaderUtil { - private ReaderUtil() { - } + private ReaderUtil() {} - public static FileScanTask createFileTask(List records, File file, FileFormat fileFormat, - FileAppenderFactory appenderFactory) throws IOException { - try (FileAppender appender = appenderFactory.newAppender(Files.localOutput(file), fileFormat)) { + public static FileScanTask createFileTask( + List records, + File file, + FileFormat fileFormat, + FileAppenderFactory appenderFactory) + throws IOException { + try (FileAppender appender = + appenderFactory.newAppender(Files.localOutput(file), fileFormat)) { appender.addAll(records); } - DataFile dataFile = DataFiles.builder(PartitionSpec.unpartitioned()) - .withRecordCount(records.size()) - .withFileSizeInBytes(file.length()) - .withPath(file.toString()) - .withFormat(fileFormat) - .build(); + DataFile dataFile = + DataFiles.builder(PartitionSpec.unpartitioned()) + .withRecordCount(records.size()) + .withFileSizeInBytes(file.length()) + .withPath(file.toString()) + .withFormat(fileFormat) + .build(); ResidualEvaluator residuals = ResidualEvaluator.unpartitioned(Expressions.alwaysTrue()); - return new BaseFileScanTask(dataFile, null, SchemaParser.toJson(TestFixtures.SCHEMA), - PartitionSpecParser.toJson(PartitionSpec.unpartitioned()), residuals); + return new BaseFileScanTask( + dataFile, + null, + SchemaParser.toJson(TestFixtures.SCHEMA), + PartitionSpecParser.toJson(PartitionSpec.unpartitioned()), + residuals); } public static DataIterator createDataIterator(CombinedScanTask combinedTask) { return new DataIterator<>( new RowDataFileScanTaskReader(TestFixtures.SCHEMA, TestFixtures.SCHEMA, null, true), - combinedTask, new HadoopFileIO(new org.apache.hadoop.conf.Configuration()), new PlaintextEncryptionManager()); + combinedTask, + new HadoopFileIO(new org.apache.hadoop.conf.Configuration()), + new PlaintextEncryptionManager()); } } diff --git a/flink/v1.15/flink/src/test/java/org/apache/iceberg/flink/source/reader/TestArrayBatchRecords.java b/flink/v1.15/flink/src/test/java/org/apache/iceberg/flink/source/reader/TestArrayBatchRecords.java index 4e46f339a192..644ac2bad6b8 100644 --- a/flink/v1.15/flink/src/test/java/org/apache/iceberg/flink/source/reader/TestArrayBatchRecords.java +++ b/flink/v1.15/flink/src/test/java/org/apache/iceberg/flink/source/reader/TestArrayBatchRecords.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.flink.source.reader; import java.util.concurrent.atomic.AtomicBoolean; @@ -27,22 +26,29 @@ public class TestArrayBatchRecords { @Test public void testFullRange() { - String[] elements = new String[]{"0", "1", "2", "3"}; + String[] elements = new String[] {"0", "1", "2", "3"}; testArray(elements, elements.length, 2, 119); } @Test public void testSubRange() { - String[] elements = new String[]{"0", "1", "2", "3"}; + String[] elements = new String[] {"0", "1", "2", "3"}; testArray(elements, 2, 0, 0); } - private void testArray(String[] elements, int numberOfRecords, int fileOffset, long startingRecordOffset) { + private void testArray( + String[] elements, int numberOfRecords, int fileOffset, long startingRecordOffset) { String splitId = "iceberg_split_1"; AtomicBoolean recycled = new AtomicBoolean(); - ArrayBatchRecords recordsWithSplitIds = ArrayBatchRecords.forRecords(splitId, - ignored -> recycled.set(true), elements, numberOfRecords, fileOffset, startingRecordOffset); + ArrayBatchRecords recordsWithSplitIds = + ArrayBatchRecords.forRecords( + splitId, + ignored -> recycled.set(true), + elements, + numberOfRecords, + fileOffset, + startingRecordOffset); Assert.assertEquals(splitId, recordsWithSplitIds.nextSplit()); @@ -59,5 +65,4 @@ private void testArray(String[] elements, int numberOfRecords, int fileOffset, l recordsWithSplitIds.recycle(); Assert.assertTrue(recycled.get()); } - } diff --git a/flink/v1.15/flink/src/test/java/org/apache/iceberg/flink/source/reader/TestArrayPoolDataIteratorBatcherRowData.java b/flink/v1.15/flink/src/test/java/org/apache/iceberg/flink/source/reader/TestArrayPoolDataIteratorBatcherRowData.java index 0c23d511af5c..f964a7707689 100644 --- a/flink/v1.15/flink/src/test/java/org/apache/iceberg/flink/source/reader/TestArrayPoolDataIteratorBatcherRowData.java +++ b/flink/v1.15/flink/src/test/java/org/apache/iceberg/flink/source/reader/TestArrayPoolDataIteratorBatcherRowData.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.flink.source.reader; import java.util.Arrays; @@ -44,8 +43,7 @@ public class TestArrayPoolDataIteratorBatcherRowData { - @ClassRule - public static final TemporaryFolder TEMPORARY_FOLDER = new TemporaryFolder(); + @ClassRule public static final TemporaryFolder TEMPORARY_FOLDER = new TemporaryFolder(); private static final FileFormat fileFormat = FileFormat.PARQUET; private final GenericAppenderFactory appenderFactory; @@ -57,18 +55,17 @@ public TestArrayPoolDataIteratorBatcherRowData() { config.set(SourceReaderOptions.ELEMENT_QUEUE_CAPACITY, 1); // set batch array size to 2 config.set(FlinkConfigOptions.SOURCE_READER_FETCH_BATCH_RECORD_COUNT, 2); - this.batcher = new ArrayPoolDataIteratorBatcher<>(config, new RowDataRecordFactory(TestFixtures.ROW_TYPE)); + this.batcher = + new ArrayPoolDataIteratorBatcher<>(config, new RowDataRecordFactory(TestFixtures.ROW_TYPE)); this.appenderFactory = new GenericAppenderFactory(TestFixtures.SCHEMA); } - /** - * Read a CombinedScanTask that contains a single file with less than a full batch of records - */ + /** Read a CombinedScanTask that contains a single file with less than a full batch of records */ @Test public void testSingleFileLessThanOneFullBatch() throws Exception { List records = RandomGenericData.generate(TestFixtures.SCHEMA, 1, 1); - FileScanTask fileTask = ReaderUtil.createFileTask( - records, TEMPORARY_FOLDER.newFile(), fileFormat, appenderFactory); + FileScanTask fileTask = + ReaderUtil.createFileTask(records, TEMPORARY_FOLDER.newFile(), fileFormat, appenderFactory); CombinedScanTask combinedTask = new BaseCombinedScanTask(fileTask); DataIterator dataIterator = ReaderUtil.createDataIterator(combinedTask); String splitId = "someSplitId"; @@ -104,13 +101,13 @@ public void testSingleFileLessThanOneFullBatch() throws Exception { /** * Read a CombinedScanTask that contains a single file with multiple batches. * - * Insert 5 records in a single file that should result in 3 batches + *
    Insert 5 records in a single file that should result in 3 batches */ @Test public void testSingleFileWithMultipleBatches() throws Exception { List records = RandomGenericData.generate(TestFixtures.SCHEMA, 5, 1); - FileScanTask fileTask = ReaderUtil.createFileTask( - records, TEMPORARY_FOLDER.newFile(), fileFormat, appenderFactory); + FileScanTask fileTask = + ReaderUtil.createFileTask(records, TEMPORARY_FOLDER.newFile(), fileFormat, appenderFactory); CombinedScanTask combinedTask = new BaseCombinedScanTask(fileTask); DataIterator dataIterator = ReaderUtil.createDataIterator(combinedTask); String splitId = "someSplitId"; @@ -210,20 +207,24 @@ public void testSingleFileWithMultipleBatches() throws Exception { /** * Read a CombinedScanTask that contains with multiple files. * - * In this test, we also seek the iterator to starting position (1, 1). + *
    In this test, we also seek the iterator to starting position (1, 1). */ @Test public void testMultipleFilesWithSeekPosition() throws Exception { List records0 = RandomGenericData.generate(TestFixtures.SCHEMA, 1, 1); - FileScanTask fileTask0 = ReaderUtil.createFileTask( - records0, TEMPORARY_FOLDER.newFile(), fileFormat, appenderFactory); + FileScanTask fileTask0 = + ReaderUtil.createFileTask( + records0, TEMPORARY_FOLDER.newFile(), fileFormat, appenderFactory); List records1 = RandomGenericData.generate(TestFixtures.SCHEMA, 4, 2); - FileScanTask fileTask1 = ReaderUtil.createFileTask( - records1, TEMPORARY_FOLDER.newFile(), fileFormat, appenderFactory); + FileScanTask fileTask1 = + ReaderUtil.createFileTask( + records1, TEMPORARY_FOLDER.newFile(), fileFormat, appenderFactory); List records2 = RandomGenericData.generate(TestFixtures.SCHEMA, 3, 3); - FileScanTask fileTask2 = ReaderUtil.createFileTask( - records2, TEMPORARY_FOLDER.newFile(), fileFormat, appenderFactory); - CombinedScanTask combinedTask = new BaseCombinedScanTask(Arrays.asList(fileTask0, fileTask1, fileTask2)); + FileScanTask fileTask2 = + ReaderUtil.createFileTask( + records2, TEMPORARY_FOLDER.newFile(), fileFormat, appenderFactory); + CombinedScanTask combinedTask = + new BaseCombinedScanTask(Arrays.asList(fileTask0, fileTask1, fileTask2)); DataIterator dataIterator = ReaderUtil.createDataIterator(combinedTask); // seek to file1 and after record 1 diff --git a/flink/v1.15/flink/src/test/java/org/apache/iceberg/flink/source/reader/TestRowDataReaderFunction.java b/flink/v1.15/flink/src/test/java/org/apache/iceberg/flink/source/reader/TestRowDataReaderFunction.java index e6ec852969b2..aee271a3a7b8 100644 --- a/flink/v1.15/flink/src/test/java/org/apache/iceberg/flink/source/reader/TestRowDataReaderFunction.java +++ b/flink/v1.15/flink/src/test/java/org/apache/iceberg/flink/source/reader/TestRowDataReaderFunction.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.flink.source.reader; import java.util.List; @@ -39,10 +38,9 @@ public class TestRowDataReaderFunction extends ReaderFunctionTestBase { - protected static final RowType rowType = FlinkSchemaUtil - .convert(TestFixtures.SCHEMA); - private static final DataStructureConverter rowDataConverter = DataStructureConverters.getConverter( - TypeConversions.fromLogicalToDataType(rowType)); + protected static final RowType rowType = FlinkSchemaUtil.convert(TestFixtures.SCHEMA); + private static final DataStructureConverter rowDataConverter = + DataStructureConverters.getConverter(TypeConversions.fromLogicalToDataType(rowType)); public TestRowDataReaderFunction(FileFormat fileFormat) { super(fileFormat); @@ -50,8 +48,14 @@ public TestRowDataReaderFunction(FileFormat fileFormat) { @Override protected ReaderFunction readerFunction() { - return new RowDataReaderFunction(new Configuration(), TestFixtures.SCHEMA, TestFixtures.SCHEMA, null, true, - new HadoopFileIO(new org.apache.hadoop.conf.Configuration()), new PlaintextEncryptionManager()); + return new RowDataReaderFunction( + new Configuration(), + TestFixtures.SCHEMA, + TestFixtures.SCHEMA, + null, + true, + new HadoopFileIO(new org.apache.hadoop.conf.Configuration()), + new PlaintextEncryptionManager()); } @Override diff --git a/flink/v1.15/flink/src/test/java/org/apache/iceberg/flink/source/split/TestIcebergSourceSplitSerializer.java b/flink/v1.15/flink/src/test/java/org/apache/iceberg/flink/source/split/TestIcebergSourceSplitSerializer.java index 36eea1e8a409..046b0c31ce2e 100644 --- a/flink/v1.15/flink/src/test/java/org/apache/iceberg/flink/source/split/TestIcebergSourceSplitSerializer.java +++ b/flink/v1.15/flink/src/test/java/org/apache/iceberg/flink/source/split/TestIcebergSourceSplitSerializer.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.flink.source.split; import java.util.List; @@ -30,8 +29,7 @@ public class TestIcebergSourceSplitSerializer { - @ClassRule - public static final TemporaryFolder TEMPORARY_FOLDER = new TemporaryFolder(); + @ClassRule public static final TemporaryFolder TEMPORARY_FOLDER = new TemporaryFolder(); private final IcebergSourceSplitSerializer serializer = IcebergSourceSplitSerializer.INSTANCE; @@ -42,8 +40,9 @@ public void testLatestVersion() throws Exception { } private void serializeAndDeserialize(int splitCount, int filesPerSplit) throws Exception { - final List splits = SplitHelpers - .createSplitsFromTransientHadoopTable(TEMPORARY_FOLDER, splitCount, filesPerSplit); + final List splits = + SplitHelpers.createSplitsFromTransientHadoopTable( + TEMPORARY_FOLDER, splitCount, filesPerSplit); for (IcebergSourceSplit split : splits) { byte[] result = serializer.serialize(split); IcebergSourceSplit deserialized = serializer.deserialize(serializer.getVersion(), result); @@ -51,14 +50,16 @@ private void serializeAndDeserialize(int splitCount, int filesPerSplit) throws E byte[] cachedResult = serializer.serialize(split); Assert.assertSame(result, cachedResult); - IcebergSourceSplit deserialized2 = serializer.deserialize(serializer.getVersion(), cachedResult); + IcebergSourceSplit deserialized2 = + serializer.deserialize(serializer.getVersion(), cachedResult); assertSplitEquals(split, deserialized2); split.updatePosition(0, 100); byte[] resultAfterUpdatePosition = serializer.serialize(split); // after position change, serialized bytes should have changed Assert.assertNotSame(cachedResult, resultAfterUpdatePosition); - IcebergSourceSplit deserialized3 = serializer.deserialize(serializer.getVersion(), resultAfterUpdatePosition); + IcebergSourceSplit deserialized3 = + serializer.deserialize(serializer.getVersion(), resultAfterUpdatePosition); assertSplitEquals(split, deserialized3); } } @@ -70,8 +71,9 @@ public void testV1() throws Exception { } private void serializeAndDeserializeV1(int splitCount, int filesPerSplit) throws Exception { - final List splits = SplitHelpers - .createSplitsFromTransientHadoopTable(TEMPORARY_FOLDER, splitCount, filesPerSplit); + final List splits = + SplitHelpers.createSplitsFromTransientHadoopTable( + TEMPORARY_FOLDER, splitCount, filesPerSplit); for (IcebergSourceSplit split : splits) { byte[] result = split.serializeV1(); IcebergSourceSplit deserialized = IcebergSourceSplit.deserializeV1(result); @@ -82,19 +84,22 @@ private void serializeAndDeserializeV1(int splitCount, int filesPerSplit) throws @Test public void testCheckpointedPosition() throws Exception { final AtomicInteger index = new AtomicInteger(); - final List splits = SplitHelpers - .createSplitsFromTransientHadoopTable(TEMPORARY_FOLDER, 10, 2).stream() - .map(split -> { - IcebergSourceSplit result; - if (index.get() % 2 == 0) { - result = IcebergSourceSplit.fromCombinedScanTask(split.task(), index.get(), index.get()); - } else { - result = split; - } - index.incrementAndGet(); - return result; - }) - .collect(Collectors.toList()); + final List splits = + SplitHelpers.createSplitsFromTransientHadoopTable(TEMPORARY_FOLDER, 10, 2).stream() + .map( + split -> { + IcebergSourceSplit result; + if (index.get() % 2 == 0) { + result = + IcebergSourceSplit.fromCombinedScanTask( + split.task(), index.get(), index.get()); + } else { + result = split; + } + index.incrementAndGet(); + return result; + }) + .collect(Collectors.toList()); for (IcebergSourceSplit split : splits) { byte[] result = 
serializer.serialize(split); @@ -103,7 +108,8 @@ public void testCheckpointedPosition() throws Exception { byte[] cachedResult = serializer.serialize(split); Assert.assertSame(result, cachedResult); - IcebergSourceSplit deserialized2 = serializer.deserialize(serializer.getVersion(), cachedResult); + IcebergSourceSplit deserialized2 = + serializer.deserialize(serializer.getVersion(), cachedResult); assertSplitEquals(split, deserialized2); } } diff --git a/gcp/src/main/java/org/apache/iceberg/gcp/GCPProperties.java b/gcp/src/main/java/org/apache/iceberg/gcp/GCPProperties.java index 0f092be0fdad..457a76313d15 100644 --- a/gcp/src/main/java/org/apache/iceberg/gcp/GCPProperties.java +++ b/gcp/src/main/java/org/apache/iceberg/gcp/GCPProperties.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.gcp; import java.io.Serializable; @@ -48,8 +47,7 @@ public class GCPProperties implements Serializable { private Integer gcsChannelReadChunkSize; private Integer gcsChannelWriteChunkSize; - public GCPProperties() { - } + public GCPProperties() {} public GCPProperties(Map properties) { projectId = properties.get(GCS_PROJECT_ID); diff --git a/gcp/src/main/java/org/apache/iceberg/gcp/gcs/BaseGCSFile.java b/gcp/src/main/java/org/apache/iceberg/gcp/gcs/BaseGCSFile.java index b44f9a6359ed..d34ea63eee82 100644 --- a/gcp/src/main/java/org/apache/iceberg/gcp/gcs/BaseGCSFile.java +++ b/gcp/src/main/java/org/apache/iceberg/gcp/gcs/BaseGCSFile.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.gcp.gcs; import com.google.cloud.storage.Blob; diff --git a/gcp/src/main/java/org/apache/iceberg/gcp/gcs/GCSFileIO.java b/gcp/src/main/java/org/apache/iceberg/gcp/gcs/GCSFileIO.java index ecb520f1d20b..f77146c14be2 100644 --- a/gcp/src/main/java/org/apache/iceberg/gcp/gcs/GCSFileIO.java +++ b/gcp/src/main/java/org/apache/iceberg/gcp/gcs/GCSFileIO.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.gcp.gcs; import com.google.cloud.storage.BlobId; @@ -36,16 +35,20 @@ /** * FileIO Implementation backed by Google Cloud Storage (GCS) - *
    - * Locations follow the conventions used by - * {@link com.google.cloud.storage.BlobId#fromGsUtilUri(String) BlobId.fromGsUtilUri} - * that follow the convention
    {@code gs://<bucket>/<blobpath>}
    - *
    - * See Cloud Storage Overview + * + *
    Locations follow the conventions used by {@link + * com.google.cloud.storage.BlobId#fromGsUtilUri(String) BlobId.fromGsUtilUri} that follow the + * convention + * + *
    {@code gs://<bucket>/<blobpath>}
    + * + *
    See Cloud Storage + * Overview */ public class GCSFileIO implements FileIO { private static final Logger LOG = LoggerFactory.getLogger(GCSFileIO.class); - private static final String DEFAULT_METRICS_IMPL = "org.apache.iceberg.hadoop.HadoopMetricsContext"; + private static final String DEFAULT_METRICS_IMPL = + "org.apache.iceberg.hadoop.HadoopMetricsContext"; private SerializableSupplier storageSupplier; private GCPProperties gcpProperties; @@ -56,16 +59,16 @@ public class GCSFileIO implements FileIO { /** * No-arg constructor to load the FileIO dynamically. - *
    - * All fields are initialized by calling {@link GCSFileIO#initialize(Map)} later. + * + *
    All fields are initialized by calling {@link GCSFileIO#initialize(Map)} later. */ - public GCSFileIO() { - } + public GCSFileIO() {} /** * Constructor with custom storage supplier and GCP properties. - *
    - * Calling {@link GCSFileIO#initialize(Map)} will overwrite information set in this constructor. + * + *
    Calling {@link GCSFileIO#initialize(Map)} will overwrite information set in this + * constructor. * * @param storageSupplier storage supplier * @param gcpProperties gcp properties @@ -121,26 +124,32 @@ public void initialize(Map props) { this.properties = props; this.gcpProperties = new GCPProperties(props); - this.storageSupplier = () -> { - StorageOptions.Builder builder = StorageOptions.newBuilder(); - - gcpProperties.projectId().ifPresent(builder::setProjectId); - gcpProperties.clientLibToken().ifPresent(builder::setClientLibToken); - gcpProperties.serviceHost().ifPresent(builder::setHost); - - // Report Hadoop metrics if Hadoop is available - try { - DynConstructors.Ctor ctor = - DynConstructors.builder(MetricsContext.class).hiddenImpl(DEFAULT_METRICS_IMPL, String.class).buildChecked(); - MetricsContext context = ctor.newInstance("gcs"); - context.initialize(props); - this.metrics = context; - } catch (NoClassDefFoundError | NoSuchMethodException | ClassCastException e) { - LOG.warn("Unable to load metrics class: '{}', falling back to null metrics", DEFAULT_METRICS_IMPL, e); - } - - return builder.build().getService(); - }; + this.storageSupplier = + () -> { + StorageOptions.Builder builder = StorageOptions.newBuilder(); + + gcpProperties.projectId().ifPresent(builder::setProjectId); + gcpProperties.clientLibToken().ifPresent(builder::setClientLibToken); + gcpProperties.serviceHost().ifPresent(builder::setHost); + + // Report Hadoop metrics if Hadoop is available + try { + DynConstructors.Ctor ctor = + DynConstructors.builder(MetricsContext.class) + .hiddenImpl(DEFAULT_METRICS_IMPL, String.class) + .buildChecked(); + MetricsContext context = ctor.newInstance("gcs"); + context.initialize(props); + this.metrics = context; + } catch (NoClassDefFoundError | NoSuchMethodException | ClassCastException e) { + LOG.warn( + "Unable to load metrics class: '{}', falling back to null metrics", + DEFAULT_METRICS_IMPL, + e); + } + + return builder.build().getService(); + }; } @Override diff --git a/gcp/src/main/java/org/apache/iceberg/gcp/gcs/GCSInputFile.java b/gcp/src/main/java/org/apache/iceberg/gcp/gcs/GCSInputFile.java index c220615e40ec..a911296a59dd 100644 --- a/gcp/src/main/java/org/apache/iceberg/gcp/gcs/GCSInputFile.java +++ b/gcp/src/main/java/org/apache/iceberg/gcp/gcs/GCSInputFile.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.gcp.gcs; import com.google.cloud.storage.BlobId; @@ -29,18 +28,31 @@ class GCSInputFile extends BaseGCSFile implements InputFile { private Long length; - static GCSInputFile fromLocation(String location, Storage storage, - GCPProperties gcpProperties, MetricsContext metrics) { + static GCSInputFile fromLocation( + String location, Storage storage, GCPProperties gcpProperties, MetricsContext metrics) { return new GCSInputFile(storage, BlobId.fromGsUtilUri(location), null, gcpProperties, metrics); } - static GCSInputFile fromLocation(String location, long length, Storage storage, - GCPProperties gcpProperties, MetricsContext metrics) { + static GCSInputFile fromLocation( + String location, + long length, + Storage storage, + GCPProperties gcpProperties, + MetricsContext metrics) { return new GCSInputFile( - storage, BlobId.fromGsUtilUri(location), length > 0 ? length : null, gcpProperties, metrics); + storage, + BlobId.fromGsUtilUri(location), + length > 0 ? 
length : null, + gcpProperties, + metrics); } - GCSInputFile(Storage storage, BlobId blobId, Long length, GCPProperties gcpProperties, MetricsContext metrics) { + GCSInputFile( + Storage storage, + BlobId blobId, + Long length, + GCPProperties gcpProperties, + MetricsContext metrics) { super(storage, blobId, gcpProperties, metrics); this.length = length; } diff --git a/gcp/src/main/java/org/apache/iceberg/gcp/gcs/GCSInputStream.java b/gcp/src/main/java/org/apache/iceberg/gcp/gcs/GCSInputStream.java index 873cff872bec..e8515d42207e 100644 --- a/gcp/src/main/java/org/apache/iceberg/gcp/gcs/GCSInputStream.java +++ b/gcp/src/main/java/org/apache/iceberg/gcp/gcs/GCSInputStream.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.gcp.gcs; import com.google.api.client.util.Lists; @@ -41,8 +40,8 @@ import org.slf4j.LoggerFactory; /** - * The GCSInputStream leverages native streaming channels from the GCS API - * for streaming uploads. See Streaming Transfers + * The GCSInputStream leverages native streaming channels from the GCS API for streaming uploads. + * See Streaming Transfers */ class GCSInputStream extends SeekableInputStream { private static final Logger LOG = LoggerFactory.getLogger(GCSInputStream.class); @@ -61,13 +60,15 @@ class GCSInputStream extends SeekableInputStream { private final Counter readBytes; private final Counter readOperations; - GCSInputStream(Storage storage, BlobId blobId, GCPProperties gcpProperties, MetricsContext metrics) { + GCSInputStream( + Storage storage, BlobId blobId, GCPProperties gcpProperties, MetricsContext metrics) { this.storage = storage; this.blobId = blobId; this.gcpProperties = gcpProperties; this.readBytes = metrics.counter(FileIOMetricsContext.READ_BYTES, Long.class, Unit.BYTES); - this.readOperations = metrics.counter(FileIOMetricsContext.READ_OPERATIONS, Integer.class, Unit.COUNT); + this.readOperations = + metrics.counter(FileIOMetricsContext.READ_OPERATIONS, Integer.class, Unit.COUNT); createStack = Thread.currentThread().getStackTrace(); @@ -77,10 +78,12 @@ class GCSInputStream extends SeekableInputStream { private void openStream() { List sourceOptions = Lists.newArrayList(); - gcpProperties.decryptionKey().ifPresent( - key -> sourceOptions.add(BlobSourceOption.decryptionKey(key))); - gcpProperties.userProject().ifPresent( - userProject -> sourceOptions.add(BlobSourceOption.userProject(userProject))); + gcpProperties + .decryptionKey() + .ifPresent(key -> sourceOptions.add(BlobSourceOption.decryptionKey(key))); + gcpProperties + .userProject() + .ifPresent(userProject -> sourceOptions.add(BlobSourceOption.userProject(userProject))); channel = storage.reader(blobId, sourceOptions.toArray(new BlobSourceOption[0])); @@ -149,8 +152,7 @@ protected void finalize() throws Throwable { super.finalize(); if (!closed) { close(); // releasing resources is more important than printing the warning - String trace = Joiner.on("\n\t").join( - Arrays.copyOfRange(createStack, 1, createStack.length)); + String trace = Joiner.on("\n\t").join(Arrays.copyOfRange(createStack, 1, createStack.length)); LOG.warn("Unclosed input stream created by:\n\t{}", trace); } } diff --git a/gcp/src/main/java/org/apache/iceberg/gcp/gcs/GCSOutputFile.java b/gcp/src/main/java/org/apache/iceberg/gcp/gcs/GCSOutputFile.java index a8f5d608082b..ae6f51775bdb 100644 --- a/gcp/src/main/java/org/apache/iceberg/gcp/gcs/GCSOutputFile.java +++ b/gcp/src/main/java/org/apache/iceberg/gcp/gcs/GCSOutputFile.java @@ 
-16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.gcp.gcs; import com.google.cloud.storage.BlobId; @@ -32,18 +31,19 @@ class GCSOutputFile extends BaseGCSFile implements OutputFile { - static GCSOutputFile fromLocation(String location, Storage storage, - GCPProperties gcpProperties, MetricsContext metrics) { + static GCSOutputFile fromLocation( + String location, Storage storage, GCPProperties gcpProperties, MetricsContext metrics) { return new GCSOutputFile(storage, BlobId.fromGsUtilUri(location), gcpProperties, metrics); } - GCSOutputFile(Storage storage, BlobId blobId, GCPProperties gcpProperties, MetricsContext metrics) { + GCSOutputFile( + Storage storage, BlobId blobId, GCPProperties gcpProperties, MetricsContext metrics) { super(storage, blobId, gcpProperties, metrics); } /** - * Create an output stream for the specified location if the target object - * does not exist in GCS at the time of invocation. + * Create an output stream for the specified location if the target object does not exist in GCS + * at the time of invocation. * * @return output stream */ diff --git a/gcp/src/main/java/org/apache/iceberg/gcp/gcs/GCSOutputStream.java b/gcp/src/main/java/org/apache/iceberg/gcp/gcs/GCSOutputStream.java index 493c66fbffa5..03ab5335f561 100644 --- a/gcp/src/main/java/org/apache/iceberg/gcp/gcs/GCSOutputStream.java +++ b/gcp/src/main/java/org/apache/iceberg/gcp/gcs/GCSOutputStream.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.gcp.gcs; import com.google.api.client.util.Lists; @@ -41,8 +40,8 @@ import org.slf4j.LoggerFactory; /** - * The GCSOutputStream leverages native streaming channels from the GCS API - * for streaming uploads. See Streaming Transfers + * The GCSOutputStream leverages native streaming channels from the GCS API for streaming uploads. 
+ * See Streaming Transfers */ class GCSOutputStream extends PositionOutputStream { private static final Logger LOG = LoggerFactory.getLogger(GCSOutputStream.class); @@ -60,8 +59,9 @@ class GCSOutputStream extends PositionOutputStream { private long pos = 0; private boolean closed = false; - GCSOutputStream(Storage storage, BlobId blobId, GCPProperties gcpProperties, - MetricsContext metrics) throws IOException { + GCSOutputStream( + Storage storage, BlobId blobId, GCPProperties gcpProperties, MetricsContext metrics) + throws IOException { this.storage = storage; this.blobId = blobId; this.gcpProperties = gcpProperties; @@ -69,7 +69,8 @@ class GCSOutputStream extends PositionOutputStream { createStack = Thread.currentThread().getStackTrace(); this.writeBytes = metrics.counter(FileIOMetricsContext.WRITE_BYTES, Long.class, Unit.BYTES); - this.writeOperations = metrics.counter(FileIOMetricsContext.WRITE_OPERATIONS, Integer.class, Unit.COUNT); + this.writeOperations = + metrics.counter(FileIOMetricsContext.WRITE_OPERATIONS, Integer.class, Unit.COUNT); openStream(); } @@ -103,13 +104,16 @@ public void write(byte[] b, int off, int len) throws IOException { private void openStream() { List writeOptions = Lists.newArrayList(); - gcpProperties.encryptionKey().ifPresent( - key -> writeOptions.add(BlobWriteOption.encryptionKey(key))); - gcpProperties.userProject().ifPresent( - userProject -> writeOptions.add(BlobWriteOption.userProject(userProject))); + gcpProperties + .encryptionKey() + .ifPresent(key -> writeOptions.add(BlobWriteOption.encryptionKey(key))); + gcpProperties + .userProject() + .ifPresent(userProject -> writeOptions.add(BlobWriteOption.userProject(userProject))); - WriteChannel channel = storage.writer(BlobInfo.newBuilder(blobId).build(), - writeOptions.toArray(new BlobWriteOption[0])); + WriteChannel channel = + storage.writer( + BlobInfo.newBuilder(blobId).build(), writeOptions.toArray(new BlobWriteOption[0])); gcpProperties.channelWriteChunkSize().ifPresent(channel::setChunkSize); @@ -127,15 +131,13 @@ public void close() throws IOException { stream.close(); } - @SuppressWarnings("checkstyle:NoFinalizer") @Override protected void finalize() throws Throwable { super.finalize(); if (!closed) { close(); // releasing resources is more important than printing the warning - String trace = Joiner.on("\n\t").join( - Arrays.copyOfRange(createStack, 1, createStack.length)); + String trace = Joiner.on("\n\t").join(Arrays.copyOfRange(createStack, 1, createStack.length)); LOG.warn("Unclosed output stream created by:\n\t{}", trace); } } diff --git a/gcp/src/test/java/org/apache/iceberg/gcp/gcs/GCSFileIOTest.java b/gcp/src/test/java/org/apache/iceberg/gcp/gcs/GCSFileIOTest.java index 99bb08d4bf55..7ebaf0682976 100644 --- a/gcp/src/test/java/org/apache/iceberg/gcp/gcs/GCSFileIOTest.java +++ b/gcp/src/test/java/org/apache/iceberg/gcp/gcs/GCSFileIOTest.java @@ -16,9 +16,12 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.gcp.gcs; +import static java.lang.String.format; +import static org.assertj.core.api.Assertions.assertThat; +import static org.junit.Assert.assertFalse; + import com.google.cloud.storage.BlobInfo; import com.google.cloud.storage.Storage; import com.google.cloud.storage.contrib.nio.testing.LocalStorageHelper; @@ -34,10 +37,6 @@ import org.junit.Before; import org.junit.Test; -import static java.lang.String.format; -import static org.assertj.core.api.Assertions.assertThat; -import static org.junit.Assert.assertFalse; - public class GCSFileIOTest { private static final String TEST_BUCKET = "TEST_BUCKET"; private final Random random = new Random(1); @@ -53,7 +52,7 @@ public void before() { @Test public void newInputFile() throws IOException { String location = format("gs://%s/path/to/file.txt", TEST_BUCKET); - byte [] expected = new byte[1024 * 1024]; + byte[] expected = new byte[1024 * 1024]; random.nextBytes(expected); InputFile in = io.newInputFile(location); @@ -65,7 +64,7 @@ public void newInputFile() throws IOException { } assertThat(in.exists()).isTrue(); - byte [] actual = new byte[1024 * 1024]; + byte[] actual = new byte[1024 * 1024]; try (InputStream is = in.newStream()) { IOUtils.readFully(is, actual); @@ -84,13 +83,17 @@ public void testDelete() { storage.create(BlobInfo.newBuilder(TEST_BUCKET, path).build()); // There should be one blob in the bucket - assertThat(StreamSupport.stream(storage.list(TEST_BUCKET).iterateAll().spliterator(), false).count()) + assertThat( + StreamSupport.stream(storage.list(TEST_BUCKET).iterateAll().spliterator(), false) + .count()) .isEqualTo(1); io.deleteFile(format("gs://%s/%s", TEST_BUCKET, path)); // The bucket should now be empty - assertThat(StreamSupport.stream(storage.list(TEST_BUCKET).iterateAll().spliterator(), false).count()) + assertThat( + StreamSupport.stream(storage.list(TEST_BUCKET).iterateAll().spliterator(), false) + .count()) .isZero(); } } diff --git a/gcp/src/test/java/org/apache/iceberg/gcp/gcs/GCSInputStreamTest.java b/gcp/src/test/java/org/apache/iceberg/gcp/gcs/GCSInputStreamTest.java index f1a2815e7965..481c40430c41 100644 --- a/gcp/src/test/java/org/apache/iceberg/gcp/gcs/GCSInputStreamTest.java +++ b/gcp/src/test/java/org/apache/iceberg/gcp/gcs/GCSInputStreamTest.java @@ -16,9 +16,12 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.gcp.gcs; +import static org.junit.Assert.assertArrayEquals; +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertThrows; + import com.google.cloud.storage.BlobId; import com.google.cloud.storage.BlobInfo; import com.google.cloud.storage.Storage; @@ -33,10 +36,6 @@ import org.apache.iceberg.metrics.MetricsContext; import org.junit.Test; -import static org.junit.Assert.assertArrayEquals; -import static org.junit.Assert.assertEquals; -import static org.junit.Assert.assertThrows; - public class GCSInputStreamTest { private final Random random = new Random(1); @@ -52,10 +51,10 @@ public void testRead() throws Exception { writeGCSData(uri, data); - try (SeekableInputStream in = new GCSInputStream(storage, uri, gcpProperties, - MetricsContext.nullMetrics())) { + try (SeekableInputStream in = + new GCSInputStream(storage, uri, gcpProperties, MetricsContext.nullMetrics())) { int readSize = 1024; - byte [] actual = new byte[readSize]; + byte[] actual = new byte[readSize]; readAndCheck(in, in.getPos(), readSize, data, false); readAndCheck(in, in.getPos(), readSize, data, true); @@ -80,13 +79,14 @@ public void testRead() throws Exception { } } - private void readAndCheck(SeekableInputStream in, long rangeStart, int size, byte [] original, boolean buffered) + private void readAndCheck( + SeekableInputStream in, long rangeStart, int size, byte[] original, boolean buffered) throws IOException { in.seek(rangeStart); assertEquals(rangeStart, in.getPos()); long rangeEnd = rangeStart + size; - byte [] actual = new byte[size]; + byte[] actual = new byte[size]; if (buffered) { IOUtils.readFully(in, actual); @@ -104,8 +104,8 @@ private void readAndCheck(SeekableInputStream in, long rangeStart, int size, byt @Test public void testClose() throws Exception { BlobId blobId = BlobId.fromGsUtilUri("gs://bucket/path/to/closed.dat"); - SeekableInputStream closed = new GCSInputStream(storage, blobId, gcpProperties, - MetricsContext.nullMetrics()); + SeekableInputStream closed = + new GCSInputStream(storage, blobId, gcpProperties, MetricsContext.nullMetrics()); closed.close(); assertThrows(IllegalStateException.class, () -> closed.seek(0)); } @@ -117,14 +117,14 @@ public void testSeek() throws Exception { writeGCSData(blobId, data); - try (SeekableInputStream in = new GCSInputStream(storage, blobId, gcpProperties, - MetricsContext.nullMetrics())) { + try (SeekableInputStream in = + new GCSInputStream(storage, blobId, gcpProperties, MetricsContext.nullMetrics())) { in.seek(data.length / 2); - byte[] actual = new byte[data.length / 2 ]; + byte[] actual = new byte[data.length / 2]; IOUtils.readFully(in, actual, 0, data.length / 2); - byte [] expected = Arrays.copyOfRange(data, data.length / 2, data.length); + byte[] expected = Arrays.copyOfRange(data, data.length / 2, data.length); assertArrayEquals(expected, actual); } } diff --git a/gcp/src/test/java/org/apache/iceberg/gcp/gcs/GCSOutputStreamTest.java b/gcp/src/test/java/org/apache/iceberg/gcp/gcs/GCSOutputStreamTest.java index 85d0e12ad17d..aa464b7506e4 100644 --- a/gcp/src/test/java/org/apache/iceberg/gcp/gcs/GCSOutputStreamTest.java +++ b/gcp/src/test/java/org/apache/iceberg/gcp/gcs/GCSOutputStreamTest.java @@ -16,9 +16,11 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.gcp.gcs; +import static org.junit.Assert.assertArrayEquals; +import static org.junit.Assert.assertEquals; + import com.google.cloud.storage.BlobId; import com.google.cloud.storage.Storage; import com.google.cloud.storage.contrib.nio.testing.LocalStorageHelper; @@ -33,9 +35,6 @@ import org.slf4j.Logger; import org.slf4j.LoggerFactory; -import static org.junit.Assert.assertArrayEquals; -import static org.junit.Assert.assertEquals; - public class GCSOutputStreamTest { private static final Logger LOG = LoggerFactory.getLogger(GCSOutputStreamTest.class); private static final String BUCKET = "test-bucket"; @@ -47,13 +46,15 @@ public class GCSOutputStreamTest { @Test public void testWrite() { // Run tests for both byte and array write paths - Stream.of(true, false).forEach(arrayWrite -> { - // Test small file write - writeAndVerify(storage, randomBlobId(), randomData(1024), arrayWrite); + Stream.of(true, false) + .forEach( + arrayWrite -> { + // Test small file write + writeAndVerify(storage, randomBlobId(), randomData(1024), arrayWrite); - // Test large file - writeAndVerify(storage, randomBlobId(), randomData(10 * 1024 * 1024), arrayWrite); - }); + // Test large file + writeAndVerify(storage, randomBlobId(), randomData(10 * 1024 * 1024), arrayWrite); + }); } @Test @@ -64,10 +65,9 @@ public void testMultipleClose() throws IOException { stream.close(); } - - private void writeAndVerify(Storage client, BlobId uri, byte [] data, boolean arrayWrite) { - try (GCSOutputStream stream = new GCSOutputStream(client, uri, properties, - MetricsContext.nullMetrics())) { + private void writeAndVerify(Storage client, BlobId uri, byte[] data, boolean arrayWrite) { + try (GCSOutputStream stream = + new GCSOutputStream(client, uri, properties, MetricsContext.nullMetrics())) { if (arrayWrite) { stream.write(data); assertEquals(data.length, stream.getPos()); @@ -90,7 +90,7 @@ private byte[] readGCSData(BlobId blobId) { } private byte[] randomData(int size) { - byte [] result = new byte[size]; + byte[] result = new byte[size]; random.nextBytes(result); return result; } diff --git a/hive-metastore/src/main/java/org/apache/iceberg/hive/CachedClientPool.java b/hive-metastore/src/main/java/org/apache/iceberg/hive/CachedClientPool.java index 117ddf3fb8ce..431f721632a5 100644 --- a/hive-metastore/src/main/java/org/apache/iceberg/hive/CachedClientPool.java +++ b/hive-metastore/src/main/java/org/apache/iceberg/hive/CachedClientPool.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.hive; import com.github.benmanes.caffeine.cache.Cache; @@ -44,10 +43,14 @@ public class CachedClientPool implements ClientPool properties) { this.conf = conf; this.metastoreUri = conf.get(HiveConf.ConfVars.METASTOREURIS.varname, ""); - this.clientPoolSize = PropertyUtil.propertyAsInt(properties, + this.clientPoolSize = + PropertyUtil.propertyAsInt( + properties, CatalogProperties.CLIENT_POOL_SIZE, CatalogProperties.CLIENT_POOL_SIZE_DEFAULT); - this.evictionInterval = PropertyUtil.propertyAsLong(properties, + this.evictionInterval = + PropertyUtil.propertyAsLong( + properties, CatalogProperties.CLIENT_POOL_CACHE_EVICTION_INTERVAL_MS, CatalogProperties.CLIENT_POOL_CACHE_EVICTION_INTERVAL_MS_DEFAULT); init(); @@ -60,7 +63,9 @@ HiveClientPool clientPool() { private synchronized void init() { if (clientPoolCache == null) { - clientPoolCache = Caffeine.newBuilder().expireAfterAccess(evictionInterval, TimeUnit.MILLISECONDS) + clientPoolCache = + Caffeine.newBuilder() + .expireAfterAccess(evictionInterval, TimeUnit.MILLISECONDS) .removalListener((key, value, cause) -> ((HiveClientPool) value).close()) .build(); } @@ -72,7 +77,8 @@ static Cache clientPoolCache() { } @Override - public R run(Action action) throws TException, InterruptedException { + public R run(Action action) + throws TException, InterruptedException { return clientPool().run(action); } diff --git a/hive-metastore/src/main/java/org/apache/iceberg/hive/HiveCatalog.java b/hive-metastore/src/main/java/org/apache/iceberg/hive/HiveCatalog.java index 4fae9ca878c8..a86ce9dcdc49 100644 --- a/hive-metastore/src/main/java/org/apache/iceberg/hive/HiveCatalog.java +++ b/hive-metastore/src/main/java/org/apache/iceberg/hive/HiveCatalog.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.hive; import java.util.List; @@ -73,8 +72,7 @@ public class HiveCatalog extends BaseMetastoreCatalog implements SupportsNamespa private boolean listAllTables = false; private Map catalogProperties; - public HiveCatalog() { - } + public HiveCatalog() {} @Override public void initialize(String inputName, Map properties) { @@ -90,22 +88,27 @@ public void initialize(String inputName, Map properties) { } if (properties.containsKey(CatalogProperties.WAREHOUSE_LOCATION)) { - this.conf.set(HiveConf.ConfVars.METASTOREWAREHOUSE.varname, + this.conf.set( + HiveConf.ConfVars.METASTOREWAREHOUSE.varname, LocationUtil.stripTrailingSlash(properties.get(CatalogProperties.WAREHOUSE_LOCATION))); } - this.listAllTables = Boolean.parseBoolean(properties.getOrDefault(LIST_ALL_TABLES, LIST_ALL_TABLES_DEFAULT)); + this.listAllTables = + Boolean.parseBoolean(properties.getOrDefault(LIST_ALL_TABLES, LIST_ALL_TABLES_DEFAULT)); String fileIOImpl = properties.get(CatalogProperties.FILE_IO_IMPL); - this.fileIO = fileIOImpl == null ? new HadoopFileIO(conf) : CatalogUtil.loadFileIO(fileIOImpl, properties, conf); + this.fileIO = + fileIOImpl == null + ? 
new HadoopFileIO(conf) + : CatalogUtil.loadFileIO(fileIOImpl, properties, conf); this.clients = new CachedClientPool(conf, properties); } @Override public List listTables(Namespace namespace) { - Preconditions.checkArgument(isValidateNamespace(namespace), - "Missing database in namespace: %s", namespace); + Preconditions.checkArgument( + isValidateNamespace(namespace), "Missing database in namespace: %s", namespace); String database = namespace.level(0); try { @@ -113,19 +116,31 @@ public List listTables(Namespace namespace) { List tableIdentifiers; if (listAllTables) { - tableIdentifiers = tableNames.stream() - .map(t -> TableIdentifier.of(namespace, t)) - .collect(Collectors.toList()); + tableIdentifiers = + tableNames.stream() + .map(t -> TableIdentifier.of(namespace, t)) + .collect(Collectors.toList()); } else { - List
  • tableObjects = clients.run(client -> client.getTableObjectsByName(database, tableNames)); - tableIdentifiers = tableObjects.stream() - .filter(table -> table.getParameters() != null && BaseMetastoreTableOperations.ICEBERG_TABLE_TYPE_VALUE - .equalsIgnoreCase(table.getParameters().get(BaseMetastoreTableOperations.TABLE_TYPE_PROP))) - .map(table -> TableIdentifier.of(namespace, table.getTableName())) - .collect(Collectors.toList()); + List
    tableObjects = + clients.run(client -> client.getTableObjectsByName(database, tableNames)); + tableIdentifiers = + tableObjects.stream() + .filter( + table -> + table.getParameters() != null + && BaseMetastoreTableOperations.ICEBERG_TABLE_TYPE_VALUE + .equalsIgnoreCase( + table + .getParameters() + .get(BaseMetastoreTableOperations.TABLE_TYPE_PROP))) + .map(table -> TableIdentifier.of(namespace, table.getTableName())) + .collect(Collectors.toList()); } - LOG.debug("Listing of namespace: {} resulted in the following tables: {}", namespace, tableIdentifiers); + LOG.debug( + "Listing of namespace: {} resulted in the following tables: {}", + namespace, + tableIdentifiers); return tableIdentifiers; } catch (UnknownDBException e) { @@ -162,12 +177,15 @@ public boolean dropTable(TableIdentifier identifier, boolean purge) { } try { - clients.run(client -> { - client.dropTable(database, identifier.name(), - false /* do not delete data */, - false /* throw NoSuchObjectException if the table doesn't exist */); - return null; - }); + clients.run( + client -> { + client.dropTable( + database, + identifier.name(), + false /* do not delete data */, + false /* throw NoSuchObjectException if the table doesn't exist */); + return null; + }); if (purge && lastMetadata != null) { CatalogUtil.dropTableData(ops.io(), lastMetadata); @@ -209,10 +227,11 @@ public void renameTable(TableIdentifier from, TableIdentifier originalTo) { table.setDbName(toDatabase); table.setTableName(to.name()); - clients.run(client -> { - MetastoreUtil.alterTable(client, fromDatabase, fromName, table); - return null; - }); + clients.run( + client -> { + MetastoreUtil.alterTable(client, fromDatabase, fromName, table); + return null; + }); LOG.info("Renamed table from {}, to {}", from, to); @@ -220,7 +239,8 @@ public void renameTable(TableIdentifier from, TableIdentifier originalTo) { throw new NoSuchTableException("Table does not exist: %s", from); } catch (AlreadyExistsException e) { - throw new org.apache.iceberg.exceptions.AlreadyExistsException("Table already exists: %s", to); + throw new org.apache.iceberg.exceptions.AlreadyExistsException( + "Table already exists: %s", to); } catch (TException e) { throw new RuntimeException("Failed to rename " + from + " to " + to, e); @@ -234,25 +254,28 @@ public void renameTable(TableIdentifier from, TableIdentifier originalTo) { @Override public void createNamespace(Namespace namespace, Map meta) { Preconditions.checkArgument( - !namespace.isEmpty(), - "Cannot create namespace with invalid name: %s", namespace); - Preconditions.checkArgument(isValidateNamespace(namespace), - "Cannot support multi part namespace in Hive Metastore: %s", namespace); + !namespace.isEmpty(), "Cannot create namespace with invalid name: %s", namespace); + Preconditions.checkArgument( + isValidateNamespace(namespace), + "Cannot support multi part namespace in Hive Metastore: %s", + namespace); try { - clients.run(client -> { - client.createDatabase(convertToDatabase(namespace, meta)); - return null; - }); + clients.run( + client -> { + client.createDatabase(convertToDatabase(namespace, meta)); + return null; + }); LOG.info("Created namespace: {}", namespace); } catch (AlreadyExistsException e) { - throw new org.apache.iceberg.exceptions.AlreadyExistsException(e, "Namespace '%s' already exists!", - namespace); + throw new org.apache.iceberg.exceptions.AlreadyExistsException( + e, "Namespace '%s' already exists!", namespace); } catch (TException e) { - throw new RuntimeException("Failed to create namespace " + 
namespace + " in Hive Metastore", e); + throw new RuntimeException( + "Failed to create namespace " + namespace + " in Hive Metastore", e); } catch (InterruptedException e) { Thread.currentThread().interrupt(); @@ -270,16 +293,17 @@ public List listNamespaces(Namespace namespace) { return ImmutableList.of(); } try { - List namespaces = clients.run(IMetaStoreClient::getAllDatabases) - .stream() - .map(Namespace::of) - .collect(Collectors.toList()); + List namespaces = + clients.run(IMetaStoreClient::getAllDatabases).stream() + .map(Namespace::of) + .collect(Collectors.toList()); LOG.debug("Listing namespace {} returned tables: {}", namespace, namespaces); return namespaces; } catch (TException e) { - throw new RuntimeException("Failed to list all namespace: " + namespace + " in Hive Metastore", e); + throw new RuntimeException( + "Failed to list all namespace: " + namespace + " in Hive Metastore", e); } catch (InterruptedException e) { Thread.currentThread().interrupt(); @@ -295,19 +319,22 @@ public boolean dropNamespace(Namespace namespace) { } try { - clients.run(client -> { - client.dropDatabase(namespace.level(0), - false /* deleteData */, - false /* ignoreUnknownDb */, - false /* cascade */); - return null; - }); + clients.run( + client -> { + client.dropDatabase( + namespace.level(0), + false /* deleteData */, + false /* ignoreUnknownDb */, + false /* cascade */); + return null; + }); LOG.info("Dropped namespace: {}", namespace); return true; } catch (InvalidOperationException e) { - throw new NamespaceNotEmptyException(e, "Namespace %s is not empty. One or more tables exist.", namespace); + throw new NamespaceNotEmptyException( + e, "Namespace %s is not empty. One or more tables exist.", namespace); } catch (NoSuchObjectException e) { return false; @@ -323,7 +350,7 @@ public boolean dropNamespace(Namespace namespace) { } @Override - public boolean setProperties(Namespace namespace, Map properties) { + public boolean setProperties(Namespace namespace, Map properties) { Map parameter = Maps.newHashMap(); parameter.putAll(loadNamespaceMetadata(namespace)); @@ -338,7 +365,7 @@ public boolean setProperties(Namespace namespace, Map propertie } @Override - public boolean removeProperties(Namespace namespace, Set properties) { + public boolean removeProperties(Namespace namespace, Set properties) { Map parameter = Maps.newHashMap(); parameter.putAll(loadNamespaceMetadata(namespace)); @@ -352,12 +379,13 @@ public boolean removeProperties(Namespace namespace, Set properties) { return true; } - private void alterHiveDataBase(Namespace namespace, Database database) { + private void alterHiveDataBase(Namespace namespace, Database database) { try { - clients.run(client -> { - client.alterDatabase(namespace.level(0), database); - return null; - }); + clients.run( + client -> { + client.alterDatabase(namespace.level(0), database); + return null; + }); } catch (NoSuchObjectException | UnknownDBException e) { throw new NoSuchNamespaceException(e, "Namespace does not exist: %s", namespace); @@ -368,7 +396,8 @@ private void alterHiveDataBase(Namespace namespace, Database database) { } catch (InterruptedException e) { Thread.currentThread().interrupt(); - throw new RuntimeException("Interrupted in call to getDatabase(name) " + namespace + " in Hive Metastore", e); + throw new RuntimeException( + "Interrupted in call to getDatabase(name) " + namespace + " in Hive Metastore", e); } } @@ -388,7 +417,8 @@ public Map loadNamespaceMetadata(Namespace namespace) { throw new NoSuchNamespaceException(e, "Namespace 
does not exist: %s", namespace); } catch (TException e) { - throw new RuntimeException("Failed to list namespace under namespace: " + namespace + " in Hive Metastore", e); + throw new RuntimeException( + "Failed to list namespace under namespace: " + namespace + " in Hive Metastore", e); } catch (InterruptedException e) { Thread.currentThread().interrupt(); @@ -436,14 +466,16 @@ protected String defaultWarehouseLocation(TableIdentifier tableIdentifier) { // Create a new location based on the namespace / database if it is set on database level try { - Database databaseData = clients.run(client -> client.getDatabase(tableIdentifier.namespace().levels()[0])); + Database databaseData = + clients.run(client -> client.getDatabase(tableIdentifier.namespace().levels()[0])); if (databaseData.getLocationUri() != null) { // If the database location is set use it as a base. return String.format("%s/%s", databaseData.getLocationUri(), tableIdentifier.name()); } } catch (TException e) { - throw new RuntimeException(String.format("Metastore operation failed for %s", tableIdentifier), e); + throw new RuntimeException( + String.format("Metastore operation failed for %s", tableIdentifier), e); } catch (InterruptedException e) { Thread.currentThread().interrupt(); @@ -454,14 +486,13 @@ protected String defaultWarehouseLocation(TableIdentifier tableIdentifier) { String warehouseLocation = getWarehouseLocation(); return String.format( "%s/%s.db/%s", - warehouseLocation, - tableIdentifier.namespace().levels()[0], - tableIdentifier.name()); + warehouseLocation, tableIdentifier.namespace().levels()[0], tableIdentifier.name()); } private String getWarehouseLocation() { String warehouseLocation = conf.get(HiveConf.ConfVars.METASTOREWAREHOUSE.varname); - Preconditions.checkNotNull(warehouseLocation, "Warehouse location is not set: hive.metastore.warehouse.dir=null"); + Preconditions.checkNotNull( + warehouseLocation, "Warehouse location is not set: hive.metastore.warehouse.dir=null"); return warehouseLocation; } @@ -487,23 +518,26 @@ Database convertToDatabase(Namespace namespace, Map meta) { Map parameter = Maps.newHashMap(); database.setName(namespace.level(0)); - database.setLocationUri(new Path(getWarehouseLocation(), namespace.level(0)).toString() + ".db"); - - meta.forEach((key, value) -> { - if (key.equals("comment")) { - database.setDescription(value); - } else if (key.equals("location")) { - database.setLocationUri(value); - } else { - if (value != null) { - parameter.put(key, value); - } - } - }); + database.setLocationUri( + new Path(getWarehouseLocation(), namespace.level(0)).toString() + ".db"); + + meta.forEach( + (key, value) -> { + if (key.equals("comment")) { + database.setDescription(value); + } else if (key.equals("location")) { + database.setLocationUri(value); + } else { + if (value != null) { + parameter.put(key, value); + } + } + }); database.setParameters(parameter); return database; } + @Override public String toString() { return MoreObjects.toStringHelper(this) diff --git a/hive-metastore/src/main/java/org/apache/iceberg/hive/HiveClientPool.java b/hive-metastore/src/main/java/org/apache/iceberg/hive/HiveClientPool.java index 031f36f7494f..9bc232043a5d 100644 --- a/hive-metastore/src/main/java/org/apache/iceberg/hive/HiveClientPool.java +++ b/hive-metastore/src/main/java/org/apache/iceberg/hive/HiveClientPool.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.hive; import org.apache.hadoop.conf.Configuration; @@ -34,10 +33,19 @@ public class HiveClientPool extends ClientPoolImpl { - private static final DynMethods.StaticMethod GET_CLIENT = DynMethods.builder("getProxy") - .impl(RetryingMetaStoreClient.class, HiveConf.class, HiveMetaHookLoader.class, String.class) // Hive 1 and 2 - .impl(RetryingMetaStoreClient.class, Configuration.class, HiveMetaHookLoader.class, String.class) // Hive 3 - .buildStatic(); + private static final DynMethods.StaticMethod GET_CLIENT = + DynMethods.builder("getProxy") + .impl( + RetryingMetaStoreClient.class, + HiveConf.class, + HiveMetaHookLoader.class, + String.class) // Hive 1 and 2 + .impl( + RetryingMetaStoreClient.class, + Configuration.class, + HiveMetaHookLoader.class, + String.class) // Hive 3 + .buildStatic(); private final HiveConf hiveConf; @@ -49,12 +57,14 @@ public HiveClientPool(int poolSize, Configuration conf) { } @Override - protected IMetaStoreClient newClient() { + protected IMetaStoreClient newClient() { try { try { - return GET_CLIENT.invoke(hiveConf, (HiveMetaHookLoader) tbl -> null, HiveMetaStoreClient.class.getName()); + return GET_CLIENT.invoke( + hiveConf, (HiveMetaHookLoader) tbl -> null, HiveMetaStoreClient.class.getName()); } catch (RuntimeException e) { - // any MetaException would be wrapped into RuntimeException during reflection, so let's double-check type here + // any MetaException would be wrapped into RuntimeException during reflection, so let's + // double-check type here if (e.getCause() instanceof MetaException) { throw (MetaException) e.getCause(); } @@ -64,9 +74,11 @@ protected IMetaStoreClient newClient() { throw new RuntimeMetaException(e, "Failed to connect to Hive Metastore"); } catch (Throwable t) { if (t.getMessage().contains("Another instance of Derby may have already booted")) { - throw new RuntimeMetaException(t, "Failed to start an embedded metastore because embedded " + - "Derby supports only one client at a time. To fix this, use a metastore that supports " + - "multiple clients."); + throw new RuntimeMetaException( + t, + "Failed to start an embedded metastore because embedded " + + "Derby supports only one client at a time. To fix this, use a metastore that supports " + + "multiple clients."); } throw new RuntimeMetaException(t, "Failed to connect to Hive Metastore"); @@ -86,8 +98,11 @@ protected IMetaStoreClient reconnect(IMetaStoreClient client) { @Override protected boolean isConnectionException(Exception e) { - return super.isConnectionException(e) || (e != null && e instanceof MetaException && - e.getMessage().contains("Got exception: org.apache.thrift.transport.TTransportException")); + return super.isConnectionException(e) + || (e != null + && e instanceof MetaException + && e.getMessage() + .contains("Got exception: org.apache.thrift.transport.TTransportException")); } @Override diff --git a/hive-metastore/src/main/java/org/apache/iceberg/hive/HiveSchemaConverter.java b/hive-metastore/src/main/java/org/apache/iceberg/hive/HiveSchemaConverter.java index 50c64850525b..dfad78d344d9 100644 --- a/hive-metastore/src/main/java/org/apache/iceberg/hive/HiveSchemaConverter.java +++ b/hive-metastore/src/main/java/org/apache/iceberg/hive/HiveSchemaConverter.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
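For readability, a hedged sketch of the connection-failure test that the reformatted isConnectionException() above performs; the super-class check is omitted and this is not the method body itself (MetaException is org.apache.hadoop.hive.metastore.api.MetaException).

// A MetaException is treated as a retryable connection failure when its message
// carries a wrapped thrift transport error.
static boolean isLikelyConnectionFailure(Exception e) {
  return e instanceof MetaException
      && e.getMessage() != null
      && e.getMessage()
          .contains("Got exception: org.apache.thrift.transport.TTransportException");
}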
*/ - package org.apache.iceberg.hive; import java.util.Collections; @@ -36,8 +35,8 @@ import org.slf4j.LoggerFactory; /** - * Package private class for converting Hive schema to Iceberg schema. Should be used only by the HiveSchemaUtil. - * Use {@link HiveSchemaUtil} for conversion purposes. + * Package private class for converting Hive schema to Iceberg schema. Should be used only by the + * HiveSchemaUtil. Use {@link HiveSchemaUtil} for conversion purposes. */ class HiveSchemaConverter { private static final Logger LOG = LoggerFactory.getLogger(HiveSchemaConverter.class); @@ -50,7 +49,8 @@ private HiveSchemaConverter(boolean autoConvert) { this.id = 0; } - static Schema convert(List names, List typeInfos, List comments, boolean autoConvert) { + static Schema convert( + List names, List typeInfos, List comments, boolean autoConvert) { HiveSchemaConverter converter = new HiveSchemaConverter(autoConvert); return new Schema(converter.convertInternal(names, typeInfos, comments)); } @@ -60,11 +60,16 @@ static Type convert(TypeInfo typeInfo, boolean autoConvert) { return converter.convertType(typeInfo); } - List convertInternal(List names, List typeInfos, List comments) { + List convertInternal( + List names, List typeInfos, List comments) { List result = Lists.newArrayListWithExpectedSize(names.size()); for (int i = 0; i < names.size(); ++i) { - result.add(Types.NestedField.optional(id++, names.get(i), convertType(typeInfos.get(i)), - (comments.isEmpty() || i >= comments.size()) ? null : comments.get(i))); + result.add( + Types.NestedField.optional( + id++, + names.get(i), + convertType(typeInfos.get(i)), + (comments.isEmpty() || i >= comments.size()) ? null : comments.get(i))); } return result; @@ -82,7 +87,9 @@ Type convertType(TypeInfo typeInfo) { return Types.BooleanType.get(); case BYTE: case SHORT: - Preconditions.checkArgument(autoConvert, "Unsupported Hive type: %s, use integer instead", + Preconditions.checkArgument( + autoConvert, + "Unsupported Hive type: %s, use integer instead", ((PrimitiveTypeInfo) typeInfo).getPrimitiveCategory()); LOG.debug("Using auto conversion from SHORT/BYTE to INTEGER"); @@ -95,7 +102,9 @@ Type convertType(TypeInfo typeInfo) { return Types.BinaryType.get(); case CHAR: case VARCHAR: - Preconditions.checkArgument(autoConvert, "Unsupported Hive type: %s, use string instead", + Preconditions.checkArgument( + autoConvert, + "Unsupported Hive type: %s, use string instead", ((PrimitiveTypeInfo) typeInfo).getPrimitiveCategory()); LOG.debug("Using auto conversion from CHAR/VARCHAR to STRING"); @@ -113,18 +122,22 @@ Type convertType(TypeInfo typeInfo) { case INTERVAL_DAY_TIME: default: // special case for Timestamp with Local TZ which is only available in Hive3 - if ("TIMESTAMPLOCALTZ".equalsIgnoreCase(((PrimitiveTypeInfo) typeInfo).getPrimitiveCategory().name())) { + if ("TIMESTAMPLOCALTZ" + .equalsIgnoreCase(((PrimitiveTypeInfo) typeInfo).getPrimitiveCategory().name())) { return Types.TimestampType.withZone(); } - throw new IllegalArgumentException("Unsupported Hive type (" + - ((PrimitiveTypeInfo) typeInfo).getPrimitiveCategory() + - ") for Iceberg tables."); + throw new IllegalArgumentException( + "Unsupported Hive type (" + + ((PrimitiveTypeInfo) typeInfo).getPrimitiveCategory() + + ") for Iceberg tables."); } case STRUCT: StructTypeInfo structTypeInfo = (StructTypeInfo) typeInfo; List fields = - convertInternal(structTypeInfo.getAllStructFieldNames(), structTypeInfo.getAllStructFieldTypeInfos(), - Collections.emptyList()); + convertInternal( + 
structTypeInfo.getAllStructFieldNames(), + structTypeInfo.getAllStructFieldTypeInfos(), + Collections.emptyList()); return Types.StructType.of(fields); case MAP: MapTypeInfo mapTypeInfo = (MapTypeInfo) typeInfo; diff --git a/hive-metastore/src/main/java/org/apache/iceberg/hive/HiveSchemaUtil.java b/hive-metastore/src/main/java/org/apache/iceberg/hive/HiveSchemaUtil.java index 8cba910fde34..25d9a74a5249 100644 --- a/hive-metastore/src/main/java/org/apache/iceberg/hive/HiveSchemaUtil.java +++ b/hive-metastore/src/main/java/org/apache/iceberg/hive/HiveSchemaUtil.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.hive; import java.util.List; @@ -30,14 +29,13 @@ import org.apache.iceberg.types.Type; import org.apache.iceberg.types.Types; - public final class HiveSchemaUtil { - private HiveSchemaUtil() { - } + private HiveSchemaUtil() {} /** * Converts the Iceberg schema to a Hive schema (list of FieldSchema objects). + * * @param schema The original Iceberg schema to convert * @return The Hive column list generated from the Iceberg schema */ @@ -48,8 +46,9 @@ public static List convert(Schema schema) { } /** - * Converts a Hive schema (list of FieldSchema objects) to an Iceberg schema. If some of the types are not convertible - * then exception is thrown. + * Converts a Hive schema (list of FieldSchema objects) to an Iceberg schema. If some of the types + * are not convertible then exception is thrown. + * * @param fieldSchemas The list of the columns * @return An equivalent Iceberg Schema */ @@ -59,10 +58,11 @@ public static Schema convert(List fieldSchemas) { /** * Converts a Hive schema (list of FieldSchema objects) to an Iceberg schema. + * * @param fieldSchemas The list of the columns - * @param autoConvert If true then TINYINT and SMALLINT is converted to INTEGER and VARCHAR and CHAR is - * converted to STRING. Otherwise if these types are used in the Hive schema then exception is - * thrown. + * @param autoConvert If true then TINYINT and SMALLINT is converted to INTEGER and + * VARCHAR and CHAR is converted to STRING. Otherwise if these types are used in the Hive + * schema then exception is thrown. * @return An equivalent Iceberg Schema */ public static Schema convert(List fieldSchemas, boolean autoConvert) { @@ -80,6 +80,7 @@ public static Schema convert(List fieldSchemas, boolean autoConvert /** * Converts the Hive partition columns to Iceberg identity partition specification. + * * @param schema The Iceberg schema * @param fieldSchemas The partition column specification * @return The Iceberg partition specification @@ -91,8 +92,9 @@ public static PartitionSpec spec(Schema schema, List fieldSchemas) } /** - * Converts the Hive list of column names and column types to an Iceberg schema. If some of the types are not - * convertible then exception is thrown. + * Converts the Hive list of column names and column types to an Iceberg schema. If some of the + * types are not convertible then exception is thrown. + * * @param names The list of the Hive column names * @param types The list of the Hive column types * @param comments The list of the Hive column comments @@ -104,20 +106,23 @@ public static Schema convert(List names, List types, Listtrue then TINYINT and SMALLINT is converted to INTEGER and VARCHAR and CHAR is - * converted to STRING. Otherwise if these types are used in the Hive schema then exception is - * thrown. 
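A small usage sketch of the convert(fieldSchemas, autoConvert) overload described by the autoConvert javadoc above (column names are illustrative; assumes the usual FieldSchema, HiveSchemaUtil, and java.util imports).

List<FieldSchema> columns =
    Arrays.asList(
        new FieldSchema("id", "bigint", null),
        new FieldSchema("flag", "tinyint", null),      // becomes INTEGER when autoConvert=true
        new FieldSchema("code", "varchar(16)", null)); // becomes STRING when autoConvert=true

Schema icebergSchema = HiveSchemaUtil.convert(columns, true);
// HiveSchemaUtil.convert(columns, false) would instead throw IllegalArgumentException
// for the TINYINT and VARCHAR columns.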
+ * @param autoConvert If true then TINYINT and SMALLINT is converted to INTEGER and + * VARCHAR and CHAR is converted to STRING. Otherwise if these types are used in the Hive + * schema then exception is thrown. * @return The Iceberg schema */ - public static Schema convert(List names, List types, List comments, boolean autoConvert) { + public static Schema convert( + List names, List types, List comments, boolean autoConvert) { return HiveSchemaConverter.convert(names, types, comments, autoConvert); } /** * Converts an Iceberg type to a Hive TypeInfo object. + * * @param type The Iceberg type * @return The Hive type */ @@ -127,6 +132,7 @@ public static TypeInfo convert(Type type) { /** * Converts a Hive typeInfo object to an Iceberg type. + * * @param typeInfo The Hive type * @return The Iceberg type */ @@ -166,16 +172,18 @@ private static String convertToTypeString(Type type) { return String.format("decimal(%s,%s)", decimalType.precision(), decimalType.scale()); case STRUCT: final Types.StructType structType = type.asStructType(); - final String nameToType = structType.fields().stream() - .map(f -> String.format("%s:%s", f.name(), convert(f.type()))) - .collect(Collectors.joining(",")); + final String nameToType = + structType.fields().stream() + .map(f -> String.format("%s:%s", f.name(), convert(f.type()))) + .collect(Collectors.joining(",")); return String.format("struct<%s>", nameToType); case LIST: final Types.ListType listType = type.asListType(); return String.format("array<%s>", convert(listType.elementType())); case MAP: final Types.MapType mapType = type.asMapType(); - return String.format("map<%s,%s>", convert(mapType.keyType()), convert(mapType.valueType())); + return String.format( + "map<%s,%s>", convert(mapType.keyType()), convert(mapType.valueType())); default: throw new UnsupportedOperationException(type + " is not supported"); } diff --git a/hive-metastore/src/main/java/org/apache/iceberg/hive/HiveTableOperations.java b/hive-metastore/src/main/java/org/apache/iceberg/hive/HiveTableOperations.java index 42e2e11c878e..294605a55105 100644 --- a/hive-metastore/src/main/java/org/apache/iceberg/hive/HiveTableOperations.java +++ b/hive-metastore/src/main/java/org/apache/iceberg/hive/HiveTableOperations.java @@ -16,9 +16,10 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.hive; +import static org.apache.iceberg.TableProperties.GC_ENABLED; + import com.fasterxml.jackson.core.JsonProcessingException; import com.github.benmanes.caffeine.cache.Cache; import com.github.benmanes.caffeine.cache.Caffeine; @@ -79,8 +80,6 @@ import org.slf4j.Logger; import org.slf4j.LoggerFactory; -import static org.apache.iceberg.TableProperties.GC_ENABLED; - /** * TODO we should be able to extract some more commonalities to BaseMetastoreTableOperations to * avoid code duplication between this class and Metacat Tables. 
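To make the convertToTypeString() cases above concrete, a few illustrative conversions through the public HiveSchemaUtil.convert(Type) entry point; the expected Hive type names are shown in comments, and this is an example rather than test code from the diff (assumes TypeInfo and org.apache.iceberg.types.Types imports).

TypeInfo decimalInfo = HiveSchemaUtil.convert(Types.DecimalType.of(10, 2));
// -> "decimal(10,2)"
TypeInfo listInfo = HiveSchemaUtil.convert(Types.ListType.ofOptional(1, Types.StringType.get()));
// -> "array<string>"
TypeInfo mapInfo =
    HiveSchemaUtil.convert(
        Types.MapType.ofOptional(2, 3, Types.StringType.get(), Types.IntegerType.get()));
// -> "map<string,int>"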
@@ -91,10 +90,13 @@ public class HiveTableOperations extends BaseMetastoreTableOperations { private static final String HIVE_ACQUIRE_LOCK_TIMEOUT_MS = "iceberg.hive.lock-timeout-ms"; private static final String HIVE_LOCK_CHECK_MIN_WAIT_MS = "iceberg.hive.lock-check-min-wait-ms"; private static final String HIVE_LOCK_CHECK_MAX_WAIT_MS = "iceberg.hive.lock-check-max-wait-ms"; - private static final String HIVE_ICEBERG_METADATA_REFRESH_MAX_RETRIES = "iceberg.hive.metadata-refresh-max-retries"; - private static final String HIVE_TABLE_LEVEL_LOCK_EVICT_MS = "iceberg.hive.table-level-lock-evict-ms"; + private static final String HIVE_ICEBERG_METADATA_REFRESH_MAX_RETRIES = + "iceberg.hive.metadata-refresh-max-retries"; + private static final String HIVE_TABLE_LEVEL_LOCK_EVICT_MS = + "iceberg.hive.table-level-lock-evict-ms"; - // the max size is based on HMS backend database. For Hive versions below 2.3, the max table parameter size is 4000 + // the max size is based on HMS backend database. For Hive versions below 2.3, the max table + // parameter size is 4000 // characters, see https://issues.apache.org/jira/browse/HIVE-12274 // set to 0 to not expose Iceberg metadata in HMS Table properties. private static final String HIVE_TABLE_PROPERTY_MAX_SIZE = "iceberg.hive.table-property-max-size"; @@ -104,33 +106,35 @@ public class HiveTableOperations extends BaseMetastoreTableOperations { private static final long HIVE_LOCK_CHECK_MAX_WAIT_MS_DEFAULT = 5 * 1000; // 5 seconds private static final int HIVE_ICEBERG_METADATA_REFRESH_MAX_RETRIES_DEFAULT = 2; private static final long HIVE_TABLE_LEVEL_LOCK_EVICT_MS_DEFAULT = TimeUnit.MINUTES.toMillis(10); - private static final BiMap ICEBERG_TO_HMS_TRANSLATION = ImmutableBiMap.of( - // gc.enabled in Iceberg and external.table.purge in Hive are meant to do the same things but with different names - GC_ENABLED, "external.table.purge" - ); + private static final BiMap ICEBERG_TO_HMS_TRANSLATION = + ImmutableBiMap.of( + // gc.enabled in Iceberg and external.table.purge in Hive are meant to do the same things + // but with different names + GC_ENABLED, "external.table.purge"); private static Cache commitLockCache; private static synchronized void initTableLevelLockCache(long evictionTimeout) { if (commitLockCache == null) { - commitLockCache = Caffeine.newBuilder() - .expireAfterAccess(evictionTimeout, TimeUnit.MILLISECONDS) - .build(); + commitLockCache = + Caffeine.newBuilder().expireAfterAccess(evictionTimeout, TimeUnit.MILLISECONDS).build(); } } /** - * Provides key translation where necessary between Iceberg and HMS props. This translation is needed because some - * properties control the same behaviour but are named differently in Iceberg and Hive. Therefore changes to these - * property pairs should be synchronized. + * Provides key translation where necessary between Iceberg and HMS props. This translation is + * needed because some properties control the same behaviour but are named differently in Iceberg + * and Hive. Therefore changes to these property pairs should be synchronized. * - * Example: Deleting data files upon DROP TABLE is enabled using gc.enabled=true in Iceberg and - * external.table.purge=true in Hive. Hive and Iceberg users are unaware of each other's control flags, therefore - * inconsistent behaviour can occur from e.g. a Hive user's point of view if external.table.purge=true is set on the - * HMS table but gc.enabled=false is set on the Iceberg table, resulting in no data file deletion. + *
<p>
    Example: Deleting data files upon DROP TABLE is enabled using gc.enabled=true in Iceberg and + * external.table.purge=true in Hive. Hive and Iceberg users are unaware of each other's control + * flags, therefore inconsistent behaviour can occur from e.g. a Hive user's point of view if + * external.table.purge=true is set on the HMS table but gc.enabled=false is set on the Iceberg + * table, resulting in no data file deletion. * * @param hmsProp The HMS property that should be translated to Iceberg property - * @return Iceberg property equivalent to the hmsProp. If no such translation exists, the original hmsProp is returned + * @return Iceberg property equivalent to the hmsProp. If no such translation exists, the original + * hmsProp is returned */ public static String translateToIcebergProp(String hmsProp) { return ICEBERG_TO_HMS_TRANSLATION.inverse().getOrDefault(hmsProp, hmsProp); @@ -154,8 +158,13 @@ private static class WaitingForLockException extends RuntimeException { private final FileIO fileIO; private final ClientPool metaClients; - protected HiveTableOperations(Configuration conf, ClientPool metaClients, FileIO fileIO, - String catalogName, String database, String table) { + protected HiveTableOperations( + Configuration conf, + ClientPool metaClients, + FileIO fileIO, + String catalogName, + String database, + String table) { this.conf = conf; this.metaClients = metaClients; this.fileIO = fileIO; @@ -169,8 +178,11 @@ protected HiveTableOperations(Configuration conf, ClientPool metaClients, FileIO this.lockCheckMaxWaitTime = conf.getLong(HIVE_LOCK_CHECK_MAX_WAIT_MS, HIVE_LOCK_CHECK_MAX_WAIT_MS_DEFAULT); this.metadataRefreshMaxRetries = - conf.getInt(HIVE_ICEBERG_METADATA_REFRESH_MAX_RETRIES, HIVE_ICEBERG_METADATA_REFRESH_MAX_RETRIES_DEFAULT); - this.maxHiveTablePropertySize = conf.getLong(HIVE_TABLE_PROPERTY_MAX_SIZE, HIVE_TABLE_PROPERTY_MAX_SIZE_DEFAULT); + conf.getInt( + HIVE_ICEBERG_METADATA_REFRESH_MAX_RETRIES, + HIVE_ICEBERG_METADATA_REFRESH_MAX_RETRIES_DEFAULT); + this.maxHiveTablePropertySize = + conf.getLong(HIVE_TABLE_PROPERTY_MAX_SIZE, HIVE_TABLE_PROPERTY_MAX_SIZE_DEFAULT); long tableLevelLockCacheEvictionTimeout = conf.getLong(HIVE_TABLE_LEVEL_LOCK_EVICT_MS, HIVE_TABLE_LEVEL_LOCK_EVICT_MS_DEFAULT); initTableLevelLockCache(tableLevelLockCacheEvictionTimeout); @@ -201,7 +213,8 @@ protected void doRefresh() { } } catch (TException e) { - String errMsg = String.format("Failed to get table info from metastore %s.%s", database, tableName); + String errMsg = + String.format("Failed to get table info from metastore %s.%s", database, tableName); throw new RuntimeException(errMsg, e); } catch (InterruptedException e) { @@ -215,15 +228,18 @@ protected void doRefresh() { @SuppressWarnings("checkstyle:CyclomaticComplexity") @Override protected void doCommit(TableMetadata base, TableMetadata metadata) { - String newMetadataLocation = base == null && metadata.metadataFileLocation() != null ? - metadata.metadataFileLocation() : writeNewMetadata(metadata, currentVersion() + 1); + String newMetadataLocation = + base == null && metadata.metadataFileLocation() != null + ? 
metadata.metadataFileLocation() + : writeNewMetadata(metadata, currentVersion() + 1); boolean hiveEngineEnabled = hiveEngineEnabled(metadata, conf); boolean keepHiveStats = conf.getBoolean(ConfigProperties.KEEP_HIVE_STATS, false); CommitStatus commitStatus = CommitStatus.FAILURE; boolean updateHiveTable = false; Optional lockId = Optional.empty(); - // getting a process-level lock per table to avoid concurrent commit attempts to the same table from the same + // getting a process-level lock per table to avoid concurrent commit attempts to the same table + // from the same // JVM process, which would result in unnecessary and costly HMS lock acquisition requests ReentrantLock tableLevelMutex = commitLockCache.get(fullName, t -> new ReentrantLock(true)); tableLevelMutex.lock(); @@ -234,8 +250,11 @@ protected void doCommit(TableMetadata base, TableMetadata metadata) { Table tbl = loadHmsTable(); if (tbl != null) { - // If we try to create the table but the metadata location is already set, then we had a concurrent commit - if (base == null && tbl.getParameters().get(BaseMetastoreTableOperations.METADATA_LOCATION_PROP) != null) { + // If we try to create the table but the metadata location is already set, then we had a + // concurrent commit + if (base == null + && tbl.getParameters().get(BaseMetastoreTableOperations.METADATA_LOCATION_PROP) + != null) { throw new AlreadyExistsException("Table already exists: %s.%s", database, tableName); } @@ -259,15 +278,18 @@ protected void doCommit(TableMetadata base, TableMetadata metadata) { // get Iceberg props that have been removed Set removedProps = Collections.emptySet(); if (base != null) { - removedProps = base.properties().keySet().stream() - .filter(key -> !metadata.properties().containsKey(key)) - .collect(Collectors.toSet()); + removedProps = + base.properties().keySet().stream() + .filter(key -> !metadata.properties().containsKey(key)) + .collect(Collectors.toSet()); } - Map summary = Optional.ofNullable(metadata.currentSnapshot()) - .map(Snapshot::summary) - .orElseGet(ImmutableMap::of); - setHmsTableParameters(newMetadataLocation, tbl, metadata, removedProps, hiveEngineEnabled, summary); + Map summary = + Optional.ofNullable(metadata.currentSnapshot()) + .map(Snapshot::summary) + .orElseGet(ImmutableMap::of); + setHmsTableParameters( + newMetadataLocation, tbl, metadata, removedProps, hiveEngineEnabled, summary); if (!keepHiveStats) { tbl.getParameters().remove(StatsSetupConst.COLUMN_STATS_ACCURATE); @@ -283,14 +305,20 @@ protected void doCommit(TableMetadata base, TableMetadata metadata) { throw new ValidationException(e, "Invalid Hive object for %s.%s", database, tableName); } catch (Throwable e) { - if (e.getMessage() != null && e.getMessage().contains("Table/View 'HIVE_LOCKS' does not exist")) { - throw new RuntimeException("Failed to acquire locks from metastore because the underlying metastore " + - "table 'HIVE_LOCKS' does not exist. This can occur when using an embedded metastore which does not " + - "support transactions. To fix this use an alternative metastore.", e); + if (e.getMessage() != null + && e.getMessage().contains("Table/View 'HIVE_LOCKS' does not exist")) { + throw new RuntimeException( + "Failed to acquire locks from metastore because the underlying metastore " + + "table 'HIVE_LOCKS' does not exist. This can occur when using an embedded metastore which does not " + + "support transactions. 
To fix this use an alternative metastore.", + e); } - LOG.error("Cannot tell if commit to {}.{} succeeded, attempting to reconnect and check.", - database, tableName, e); + LOG.error( + "Cannot tell if commit to {}.{} succeeded, attempting to reconnect and check.", + database, + tableName, + e); commitStatus = checkCommitStatus(newMetadataLocation, metadata); switch (commitStatus) { case SUCCESS: @@ -302,7 +330,8 @@ protected void doCommit(TableMetadata base, TableMetadata metadata) { } } } catch (TException | UnknownHostException e) { - throw new RuntimeException(String.format("Metastore operation failed for %s.%s", database, tableName), e); + throw new RuntimeException( + String.format("Metastore operation failed for %s.%s", database, tableName), e); } catch (InterruptedException e) { Thread.currentThread().interrupt(); @@ -312,21 +341,25 @@ protected void doCommit(TableMetadata base, TableMetadata metadata) { cleanupMetadataAndUnlock(commitStatus, newMetadataLocation, lockId, tableLevelMutex); } - LOG.info("Committed to table {} with the new metadata location {}", fullName, newMetadataLocation); + LOG.info( + "Committed to table {} with the new metadata location {}", fullName, newMetadataLocation); } @VisibleForTesting - void persistTable(Table hmsTable, boolean updateHiveTable) throws TException, InterruptedException { + void persistTable(Table hmsTable, boolean updateHiveTable) + throws TException, InterruptedException { if (updateHiveTable) { - metaClients.run(client -> { - MetastoreUtil.alterTable(client, database, tableName, hmsTable); - return null; - }); + metaClients.run( + client -> { + MetastoreUtil.alterTable(client, database, tableName, hmsTable); + return null; + }); } else { - metaClients.run(client -> { - client.createTable(hmsTable); - return null; - }); + metaClients.run( + client -> { + client.createTable(hmsTable); + return null; + }); } } @@ -342,35 +375,46 @@ private Table loadHmsTable() throws TException, InterruptedException { private Table newHmsTable() { final long currentTimeMillis = System.currentTimeMillis(); - Table newTable = new Table(tableName, - database, - System.getProperty("user.name"), - (int) currentTimeMillis / 1000, - (int) currentTimeMillis / 1000, - Integer.MAX_VALUE, - null, - Collections.emptyList(), - Maps.newHashMap(), - null, - null, - TableType.EXTERNAL_TABLE.toString()); - - newTable.getParameters().put("EXTERNAL", "TRUE"); // using the external table type also requires this + Table newTable = + new Table( + tableName, + database, + System.getProperty("user.name"), + (int) currentTimeMillis / 1000, + (int) currentTimeMillis / 1000, + Integer.MAX_VALUE, + null, + Collections.emptyList(), + Maps.newHashMap(), + null, + null, + TableType.EXTERNAL_TABLE.toString()); + + newTable + .getParameters() + .put("EXTERNAL", "TRUE"); // using the external table type also requires this return newTable; } - private void setHmsTableParameters(String newMetadataLocation, Table tbl, TableMetadata metadata, - Set obsoleteProps, boolean hiveEngineEnabled, - Map summary) { - Map parameters = Optional.ofNullable(tbl.getParameters()) - .orElseGet(Maps::newHashMap); + private void setHmsTableParameters( + String newMetadataLocation, + Table tbl, + TableMetadata metadata, + Set obsoleteProps, + boolean hiveEngineEnabled, + Map summary) { + Map parameters = + Optional.ofNullable(tbl.getParameters()).orElseGet(Maps::newHashMap); // push all Iceberg table properties into HMS - metadata.properties().forEach((key, value) -> { - // translate key names between Iceberg 
and HMS where needed - String hmsKey = ICEBERG_TO_HMS_TRANSLATION.getOrDefault(key, key); - parameters.put(hmsKey, value); - }); + metadata + .properties() + .forEach( + (key, value) -> { + // translate key names between Iceberg and HMS where needed + String hmsKey = ICEBERG_TO_HMS_TRANSLATION.getOrDefault(key, key); + parameters.put(hmsKey, value); + }); if (metadata.uuid() != null) { parameters.put(TableProperties.UUID, metadata.uuid()); } @@ -387,7 +431,8 @@ private void setHmsTableParameters(String newMetadataLocation, Table tbl, TableM // If needed set the 'storage_handler' property to enable query from Hive if (hiveEngineEnabled) { - parameters.put(hive_metastoreConstants.META_TABLE_STORAGE, + parameters.put( + hive_metastoreConstants.META_TABLE_STORAGE, "org.apache.iceberg.mr.hive.HiveIcebergStorageHandler"); } else { parameters.remove(hive_metastoreConstants.META_TABLE_STORAGE); @@ -420,8 +465,11 @@ void setSnapshotStats(TableMetadata metadata, Map parameters) { Snapshot currentSnapshot = metadata.currentSnapshot(); if (exposeInHmsProperties() && currentSnapshot != null) { - parameters.put(TableProperties.CURRENT_SNAPSHOT_ID, String.valueOf(currentSnapshot.snapshotId())); - parameters.put(TableProperties.CURRENT_SNAPSHOT_TIMESTAMP, String.valueOf(currentSnapshot.timestampMillis())); + parameters.put( + TableProperties.CURRENT_SNAPSHOT_ID, String.valueOf(currentSnapshot.snapshotId())); + parameters.put( + TableProperties.CURRENT_SNAPSHOT_TIMESTAMP, + String.valueOf(currentSnapshot.timestampMillis())); setSnapshotSummary(parameters, currentSnapshot); } @@ -435,11 +483,16 @@ void setSnapshotSummary(Map parameters, Snapshot currentSnapshot if (summary.length() <= maxHiveTablePropertySize) { parameters.put(TableProperties.CURRENT_SNAPSHOT_SUMMARY, summary); } else { - LOG.warn("Not exposing the current snapshot({}) summary in HMS since it exceeds {} characters", - currentSnapshot.snapshotId(), maxHiveTablePropertySize); + LOG.warn( + "Not exposing the current snapshot({}) summary in HMS since it exceeds {} characters", + currentSnapshot.snapshotId(), + maxHiveTablePropertySize); } } catch (JsonProcessingException e) { - LOG.warn("Failed to convert current snapshot({}) summary to a json string", currentSnapshot.snapshotId(), e); + LOG.warn( + "Failed to convert current snapshot({}) summary to a json string", + currentSnapshot.snapshotId(), + e); } } @@ -464,7 +517,9 @@ void setPartitionSpec(TableMetadata metadata, Map parameters) { @VisibleForTesting void setSortOrder(TableMetadata metadata, Map parameters) { parameters.remove(TableProperties.DEFAULT_SORT_ORDER); - if (exposeInHmsProperties() && metadata.sortOrder() != null && metadata.sortOrder().isSorted()) { + if (exposeInHmsProperties() + && metadata.sortOrder() != null + && metadata.sortOrder().isSorted()) { String sortOrder = SortOrderParser.toJson(metadata.sortOrder()); setField(parameters, TableProperties.DEFAULT_SORT_ORDER, sortOrder); } @@ -474,7 +529,8 @@ private void setField(Map parameters, String key, String value) if (value.length() <= maxHiveTablePropertySize) { parameters.put(key, value); } else { - LOG.warn("Not exposing {} in HMS since it exceeds {} characters", key, maxHiveTablePropertySize); + LOG.warn( + "Not exposing {} in HMS since it exceeds {} characters", key, maxHiveTablePropertySize); } } @@ -505,11 +561,14 @@ private StorageDescriptor storageDescriptor(TableMetadata metadata, boolean hive @SuppressWarnings("ReverseDnsLookup") @VisibleForTesting long acquireLock() throws UnknownHostException, TException, 
InterruptedException { - final LockComponent lockComponent = new LockComponent(LockType.EXCLUSIVE, LockLevel.TABLE, database); + final LockComponent lockComponent = + new LockComponent(LockType.EXCLUSIVE, LockLevel.TABLE, database); lockComponent.setTablename(tableName); - final LockRequest lockRequest = new LockRequest(Lists.newArrayList(lockComponent), - System.getProperty("user.name"), - InetAddress.getLocalHost().getHostName()); + final LockRequest lockRequest = + new LockRequest( + Lists.newArrayList(lockComponent), + System.getProperty("user.name"), + InetAddress.getLocalHost().getHostName()); LockResponse lockResponse = metaClients.run(client -> client.lock(lockRequest)); AtomicReference state = new AtomicReference<>(lockResponse.getState()); long lockId = lockResponse.getLockid(); @@ -520,34 +579,41 @@ long acquireLock() throws UnknownHostException, TException, InterruptedException try { if (state.get().equals(LockState.WAITING)) { - // Retry count is the typical "upper bound of retries" for Tasks.run() function. In fact, the maximum number of - // attempts the Tasks.run() would try is `retries + 1`. Here, for checking locks, we use timeout as the - // upper bound of retries. So it is just reasonable to set a large retry count. However, if we set - // Integer.MAX_VALUE, the above logic of `retries + 1` would overflow into Integer.MIN_VALUE. Hence, - // the retry is set conservatively as `Integer.MAX_VALUE - 100` so it doesn't hit any boundary issues. + // Retry count is the typical "upper bound of retries" for Tasks.run() function. In fact, + // the maximum number of + // attempts the Tasks.run() would try is `retries + 1`. Here, for checking locks, we use + // timeout as the + // upper bound of retries. So it is just reasonable to set a large retry count. However, if + // we set + // Integer.MAX_VALUE, the above logic of `retries + 1` would overflow into + // Integer.MIN_VALUE. Hence, + // the retry is set conservatively as `Integer.MAX_VALUE - 100` so it doesn't hit any + // boundary issues. 
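// The arithmetic behind the retry bound described in the comment above:
// Tasks.run() makes `retries + 1` attempts, so Integer.MAX_VALUE itself would overflow.
int overflowed = Integer.MAX_VALUE + 1;        // wraps to Integer.MIN_VALUE (-2147483648)
int bounded = (Integer.MAX_VALUE - 100) + 1;   // 2147483548, still positive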
Tasks.foreach(lockId) .retry(Integer.MAX_VALUE - 100) - .exponentialBackoff( - lockCheckMinWaitTime, - lockCheckMaxWaitTime, - lockAcquireTimeout, - 1.5) + .exponentialBackoff(lockCheckMinWaitTime, lockCheckMaxWaitTime, lockAcquireTimeout, 1.5) .throwFailureWhenFinished() .onlyRetryOn(WaitingForLockException.class) - .run(id -> { - try { - LockResponse response = metaClients.run(client -> client.checkLock(id)); - LockState newState = response.getState(); - state.set(newState); - if (newState.equals(LockState.WAITING)) { - throw new WaitingForLockException( + .run( + id -> { + try { + LockResponse response = metaClients.run(client -> client.checkLock(id)); + LockState newState = response.getState(); + state.set(newState); + if (newState.equals(LockState.WAITING)) { + throw new WaitingForLockException( String.format("Waiting for lock on table %s.%s", database, tableName)); - } - } catch (InterruptedException e) { - Thread.interrupted(); // Clear the interrupt status flag - LOG.warn("Interrupted while waiting for lock on table {}.{}", database, tableName, e); - } - }, TException.class); + } + } catch (InterruptedException e) { + Thread.interrupted(); // Clear the interrupt status flag + LOG.warn( + "Interrupted while waiting for lock on table {}.{}", + database, + tableName, + e); + } + }, + TException.class); } } catch (WaitingForLockException waitingForLockException) { timeout = true; @@ -560,18 +626,22 @@ long acquireLock() throws UnknownHostException, TException, InterruptedException // timeout and do not have lock acquired if (timeout && !state.get().equals(LockState.ACQUIRED)) { - throw new CommitFailedException("Timed out after %s ms waiting for lock on %s.%s", - duration, database, tableName); + throw new CommitFailedException( + "Timed out after %s ms waiting for lock on %s.%s", duration, database, tableName); } if (!state.get().equals(LockState.ACQUIRED)) { - throw new CommitFailedException("Could not acquire the lock on %s.%s, " + - "lock request ended in state %s", database, tableName, state); + throw new CommitFailedException( + "Could not acquire the lock on %s.%s, " + "lock request ended in state %s", + database, tableName, state); } return lockId; } - private void cleanupMetadataAndUnlock(CommitStatus commitStatus, String metadataLocation, Optional lockId, + private void cleanupMetadataAndUnlock( + CommitStatus commitStatus, + String metadataLocation, + Optional lockId, ReentrantLock tableLevelMutex) { try { if (commitStatus == CommitStatus.FAILURE) { @@ -598,28 +668,35 @@ private void unlock(Optional lockId) { @VisibleForTesting void doUnlock(long lockId) throws TException, InterruptedException { - metaClients.run(client -> { - client.unlock(lockId); - return null; - }); + metaClients.run( + client -> { + client.unlock(lockId); + return null; + }); } static void validateTableIsIceberg(Table table, String fullName) { String tableType = table.getParameters().get(TABLE_TYPE_PROP); - NoSuchIcebergTableException.check(tableType != null && tableType.equalsIgnoreCase(ICEBERG_TABLE_TYPE_VALUE), - "Not an iceberg table: %s (type=%s)", fullName, tableType); + NoSuchIcebergTableException.check( + tableType != null && tableType.equalsIgnoreCase(ICEBERG_TABLE_TYPE_VALUE), + "Not an iceberg table: %s (type=%s)", + fullName, + tableType); } /** * Returns if the hive engine related values should be enabled on the table, or not. - *
<p>
- * The decision is made like this:
+ *
+ * <p>The decision is made like this:
+ *
 * <ol>
- * <li>Table property value {@link TableProperties#ENGINE_HIVE_ENABLED}
- * <li>If the table property is not set then check the hive-site.xml property value
- * {@link ConfigProperties#ENGINE_HIVE_ENABLED}
- * <li>If none of the above is enabled then use the default value {@link TableProperties#ENGINE_HIVE_ENABLED_DEFAULT}
+ *   <li>Table property value {@link TableProperties#ENGINE_HIVE_ENABLED}
+ *   <li>If the table property is not set then check the hive-site.xml property value {@link
+ *       ConfigProperties#ENGINE_HIVE_ENABLED}
+ *   <li>If none of the above is enabled then use the default value {@link
+ *       TableProperties#ENGINE_HIVE_ENABLED_DEFAULT}
 * </ol>
    + * * @param metadata Table metadata to use * @param conf The hive configuration to use * @return if the hive engine related values should be enabled or not @@ -630,6 +707,7 @@ private static boolean hiveEngineEnabled(TableMetadata metadata, Configuration c return metadata.propertyAsBoolean(TableProperties.ENGINE_HIVE_ENABLED, false); } - return conf.getBoolean(ConfigProperties.ENGINE_HIVE_ENABLED, TableProperties.ENGINE_HIVE_ENABLED_DEFAULT); + return conf.getBoolean( + ConfigProperties.ENGINE_HIVE_ENABLED, TableProperties.ENGINE_HIVE_ENABLED_DEFAULT); } } diff --git a/hive-metastore/src/main/java/org/apache/iceberg/hive/MetastoreUtil.java b/hive-metastore/src/main/java/org/apache/iceberg/hive/MetastoreUtil.java index 76363f138c56..5c3c485d507f 100644 --- a/hive-metastore/src/main/java/org/apache/iceberg/hive/MetastoreUtil.java +++ b/hive-metastore/src/main/java/org/apache/iceberg/hive/MetastoreUtil.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.hive; import org.apache.hadoop.hive.common.StatsSetupConst; @@ -30,37 +29,46 @@ public class MetastoreUtil { // this class is unique to Hive3 and cannot be found in Hive2, therefore a good proxy to see if // we are working against Hive3 dependencies - private static final String HIVE3_UNIQUE_CLASS = "org.apache.hadoop.hive.serde2.io.DateWritableV2"; + private static final String HIVE3_UNIQUE_CLASS = + "org.apache.hadoop.hive.serde2.io.DateWritableV2"; - private static final DynMethods.UnboundMethod ALTER_TABLE = DynMethods.builder("alter_table") - .impl(IMetaStoreClient.class, "alter_table_with_environmentContext", - String.class, String.class, Table.class, EnvironmentContext.class) - .impl(IMetaStoreClient.class, "alter_table", - String.class, String.class, Table.class, EnvironmentContext.class) - .impl(IMetaStoreClient.class, "alter_table", - String.class, String.class, Table.class) - .build(); + private static final DynMethods.UnboundMethod ALTER_TABLE = + DynMethods.builder("alter_table") + .impl( + IMetaStoreClient.class, + "alter_table_with_environmentContext", + String.class, + String.class, + Table.class, + EnvironmentContext.class) + .impl( + IMetaStoreClient.class, + "alter_table", + String.class, + String.class, + Table.class, + EnvironmentContext.class) + .impl(IMetaStoreClient.class, "alter_table", String.class, String.class, Table.class) + .build(); private static final boolean HIVE3_PRESENT_ON_CLASSPATH = detectHive3(); - private MetastoreUtil() { - } + private MetastoreUtil() {} - /** - * Returns true if Hive3 dependencies are found on classpath, false otherwise. - */ + /** Returns true if Hive3 dependencies are found on classpath, false otherwise. */ public static boolean hive3PresentOnClasspath() { return HIVE3_PRESENT_ON_CLASSPATH; } /** - * Calls alter_table method using the metastore client. If possible, an environmental context will be used that - * turns off stats updates to avoid recursive listing. + * Calls alter_table method using the metastore client. If possible, an environmental context will + * be used that turns off stats updates to avoid recursive listing. 
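The detectHive3() initializer referenced above is elided by the diff; as an illustration only, a classpath probe of the following shape is one way HIVE3_UNIQUE_CLASS can serve as the Hive 3 marker (a sketch, not the actual method body).

static boolean hive3OnClasspath() {
  try {
    // the class below only exists in Hive 3 dependencies
    Class.forName("org.apache.hadoop.hive.serde2.io.DateWritableV2");
    return true;
  } catch (ClassNotFoundException e) {
    return false;
  }
}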
*/ - public static void alterTable(IMetaStoreClient client, String databaseName, String tblName, Table table) { - EnvironmentContext envContext = new EnvironmentContext( - ImmutableMap.of(StatsSetupConst.DO_NOT_UPDATE_STATS, StatsSetupConst.TRUE) - ); + public static void alterTable( + IMetaStoreClient client, String databaseName, String tblName, Table table) { + EnvironmentContext envContext = + new EnvironmentContext( + ImmutableMap.of(StatsSetupConst.DO_NOT_UPDATE_STATS, StatsSetupConst.TRUE)); ALTER_TABLE.invoke(client, databaseName, tblName, table, envContext); } diff --git a/hive-metastore/src/main/java/org/apache/iceberg/hive/RuntimeMetaException.java b/hive-metastore/src/main/java/org/apache/iceberg/hive/RuntimeMetaException.java index 99b46c1565c3..0b324857b277 100644 --- a/hive-metastore/src/main/java/org/apache/iceberg/hive/RuntimeMetaException.java +++ b/hive-metastore/src/main/java/org/apache/iceberg/hive/RuntimeMetaException.java @@ -16,15 +16,12 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.hive; import com.google.errorprone.annotations.FormatMethod; import org.apache.hadoop.hive.metastore.api.MetaException; -/** - * Exception used to wrap {@link MetaException} as a {@link RuntimeException} and add context. - */ +/** Exception used to wrap {@link MetaException} as a {@link RuntimeException} and add context. */ public class RuntimeMetaException extends RuntimeException { public RuntimeMetaException(MetaException cause) { super(cause); diff --git a/hive-metastore/src/test/java/org/apache/iceberg/hive/HiveCreateReplaceTableTest.java b/hive-metastore/src/test/java/org/apache/iceberg/hive/HiveCreateReplaceTableTest.java index 9aef3d4b128f..74176ed44e11 100644 --- a/hive-metastore/src/test/java/org/apache/iceberg/hive/HiveCreateReplaceTableTest.java +++ b/hive-metastore/src/test/java/org/apache/iceberg/hive/HiveCreateReplaceTableTest.java @@ -16,9 +16,11 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.hive; +import static org.apache.iceberg.PartitionSpec.builderFor; +import static org.apache.iceberg.types.Types.NestedField.required; + import java.io.IOException; import org.apache.iceberg.AppendFiles; import org.apache.iceberg.AssertHelpers; @@ -42,23 +44,16 @@ import org.junit.Test; import org.junit.rules.TemporaryFolder; -import static org.apache.iceberg.PartitionSpec.builderFor; -import static org.apache.iceberg.types.Types.NestedField.required; - public class HiveCreateReplaceTableTest extends HiveMetastoreTest { private static final String TABLE_NAME = "tbl"; private static final TableIdentifier TABLE_IDENTIFIER = TableIdentifier.of(DB_NAME, TABLE_NAME); - private static final Schema SCHEMA = new Schema( - required(3, "id", Types.IntegerType.get()), - required(4, "data", Types.StringType.get()) - ); - private static final PartitionSpec SPEC = builderFor(SCHEMA) - .identity("id") - .build(); + private static final Schema SCHEMA = + new Schema( + required(3, "id", Types.IntegerType.get()), required(4, "data", Types.StringType.get())); + private static final PartitionSpec SPEC = builderFor(SCHEMA).identity("id").build(); - @Rule - public TemporaryFolder temp = new TemporaryFolder(); + @Rule public TemporaryFolder temp = new TemporaryFolder(); private String tableLocation; @@ -76,11 +71,10 @@ public void cleanup() { public void testCreateTableTxn() { Assert.assertFalse("Table should not exist", catalog.tableExists(TABLE_IDENTIFIER)); - Transaction txn = catalog.newCreateTableTransaction( - TABLE_IDENTIFIER, SCHEMA, SPEC, tableLocation, Maps.newHashMap()); - txn.updateProperties() - .set("prop", "value") - .commit(); + Transaction txn = + catalog.newCreateTableTransaction( + TABLE_IDENTIFIER, SCHEMA, SPEC, tableLocation, Maps.newHashMap()); + txn.updateProperties().set("prop", "value").commit(); // verify the table is still not visible before the transaction is committed Assert.assertFalse(catalog.tableExists(TABLE_IDENTIFIER)); @@ -95,8 +89,9 @@ public void testCreateTableTxn() { public void testCreateTableTxnTableCreatedConcurrently() { Assert.assertFalse("Table should not exist", catalog.tableExists(TABLE_IDENTIFIER)); - Transaction txn = catalog.newCreateTableTransaction( - TABLE_IDENTIFIER, SCHEMA, SPEC, tableLocation, Maps.newHashMap()); + Transaction txn = + catalog.newCreateTableTransaction( + TABLE_IDENTIFIER, SCHEMA, SPEC, tableLocation, Maps.newHashMap()); // create the table concurrently catalog.createTable(TABLE_IDENTIFIER, SCHEMA, SPEC); @@ -113,22 +108,25 @@ public void testCreateTableTxnTableCreatedConcurrently() { public void testCreateTableTxnAndAppend() { Assert.assertFalse("Table should not exist", catalog.tableExists(TABLE_IDENTIFIER)); - Transaction txn = catalog.newCreateTableTransaction( - TABLE_IDENTIFIER, SCHEMA, SPEC, tableLocation, Maps.newHashMap()); + Transaction txn = + catalog.newCreateTableTransaction( + TABLE_IDENTIFIER, SCHEMA, SPEC, tableLocation, Maps.newHashMap()); AppendFiles append = txn.newAppend(); - DataFile dataFile = DataFiles.builder(SPEC) - .withPath("/path/to/data-a.parquet") - .withFileSizeInBytes(0) - .withRecordCount(1) - .build(); + DataFile dataFile = + DataFiles.builder(SPEC) + .withPath("/path/to/data-a.parquet") + .withFileSizeInBytes(0) + .withRecordCount(1) + .build(); append.appendFile(dataFile); append.commit(); txn.commitTransaction(); Table table = catalog.loadTable(TABLE_IDENTIFIER); Snapshot snapshot = table.currentSnapshot(); - Assert.assertTrue("Table should have one manifest file", 
snapshot.allManifests(table.io()).size() == 1); + Assert.assertTrue( + "Table should have one manifest file", snapshot.allManifests(table.io()).size() == 1); } @Test @@ -143,7 +141,9 @@ public void testCreateTableTxnTableAlreadyExists() { "Should not be possible to start a new create table txn", AlreadyExistsException.class, "Table already exists: hivedb.tbl", - () -> catalog.newCreateTableTransaction(TABLE_IDENTIFIER, SCHEMA, SPEC, tableLocation, Maps.newHashMap())); + () -> + catalog.newCreateTableTransaction( + TABLE_IDENTIFIER, SCHEMA, SPEC, tableLocation, Maps.newHashMap())); } @Test @@ -155,12 +155,9 @@ public void testReplaceTableTxn() { txn.commitTransaction(); Table table = catalog.loadTable(TABLE_IDENTIFIER); - PartitionSpec v1Expected = PartitionSpec.builderFor(table.schema()) - .alwaysNull("id", "id") - .withSpecId(1) - .build(); - Assert.assertEquals("Table should have a spec with one void field", - v1Expected, table.spec()); + PartitionSpec v1Expected = + PartitionSpec.builderFor(table.schema()).alwaysNull("id", "id").withSpecId(1).build(); + Assert.assertEquals("Table should have a spec with one void field", v1Expected, table.spec()); } @Test @@ -181,9 +178,7 @@ public void testReplaceTableTxnTableDeletedConcurrently() { catalog.dropTable(TABLE_IDENTIFIER); - txn.updateProperties() - .set("prop", "value") - .commit(); + txn.updateProperties().set("prop", "value").commit(); AssertHelpers.assertThrows( "Replace table txn should fail", @@ -194,19 +189,16 @@ public void testReplaceTableTxnTableDeletedConcurrently() { @Test public void testReplaceTableTxnTableModifiedConcurrently() { - Table table = catalog.createTable(TABLE_IDENTIFIER, SCHEMA, SPEC, tableLocation, Maps.newHashMap()); + Table table = + catalog.createTable(TABLE_IDENTIFIER, SCHEMA, SPEC, tableLocation, Maps.newHashMap()); Assert.assertTrue("Table should exist", catalog.tableExists(TABLE_IDENTIFIER)); Transaction txn = catalog.newReplaceTableTransaction(TABLE_IDENTIFIER, SCHEMA, SPEC, false); // update the table concurrently - table.updateProperties() - .set("another-prop", "another-value") - .commit(); + table.updateProperties().set("another-prop", "another-value").commit(); - txn.updateProperties() - .set("prop", "value") - .commit(); + txn.updateProperties().set("prop", "value").commit(); txn.commitTransaction(); // the replace should still succeed @@ -220,9 +212,7 @@ public void testCreateOrReplaceTableTxnTableNotExists() { Assert.assertFalse("Table should not exist", catalog.tableExists(TABLE_IDENTIFIER)); Transaction txn = catalog.newReplaceTableTransaction(TABLE_IDENTIFIER, SCHEMA, SPEC, true); - txn.updateProperties() - .set("prop", "value") - .commit(); + txn.updateProperties().set("prop", "value").commit(); txn.commitTransaction(); Table table = catalog.loadTable(TABLE_IDENTIFIER); @@ -238,12 +228,9 @@ public void testCreateOrReplaceTableTxnTableExists() { txn.commitTransaction(); Table table = catalog.loadTable(TABLE_IDENTIFIER); - PartitionSpec v1Expected = PartitionSpec.builderFor(table.schema()) - .alwaysNull("id", "id") - .withSpecId(1) - .build(); - Assert.assertEquals("Table should have a spec with one void field", - v1Expected, table.spec()); + PartitionSpec v1Expected = + PartitionSpec.builderFor(table.schema()).alwaysNull("id", "id").withSpecId(1).build(); + Assert.assertEquals("Table should have a spec with one void field", v1Expected, table.spec()); } @Test @@ -253,11 +240,15 @@ public void testCreateOrReplaceTableTxnTableDeletedConcurrently() { catalog.createTable(TABLE_IDENTIFIER, SCHEMA, 
SPEC); Assert.assertTrue("Table should be created", catalog.tableExists(TABLE_IDENTIFIER)); - Transaction txn = catalog.newReplaceTableTransaction( - TABLE_IDENTIFIER, SCHEMA, PartitionSpec.unpartitioned(), tableLocation, Maps.newHashMap(), true); - txn.updateProperties() - .set("prop", "value") - .commit(); + Transaction txn = + catalog.newReplaceTableTransaction( + TABLE_IDENTIFIER, + SCHEMA, + PartitionSpec.unpartitioned(), + tableLocation, + Maps.newHashMap(), + true); + txn.updateProperties().set("prop", "value").commit(); // drop the table concurrently catalog.dropTable(TABLE_IDENTIFIER); @@ -273,11 +264,15 @@ public void testCreateOrReplaceTableTxnTableDeletedConcurrently() { public void testCreateOrReplaceTableTxnTableCreatedConcurrently() { Assert.assertFalse("Table should not exist", catalog.tableExists(TABLE_IDENTIFIER)); - Transaction txn = catalog.newReplaceTableTransaction( - TABLE_IDENTIFIER, SCHEMA, PartitionSpec.unpartitioned(), tableLocation, Maps.newHashMap(), true); - txn.updateProperties() - .set("prop", "value") - .commit(); + Transaction txn = + catalog.newReplaceTableTransaction( + TABLE_IDENTIFIER, + SCHEMA, + PartitionSpec.unpartitioned(), + tableLocation, + Maps.newHashMap(), + true); + txn.updateProperties().set("prop", "value").commit(); // create the table concurrently catalog.createTable(TABLE_IDENTIFIER, SCHEMA, SPEC); @@ -295,21 +290,21 @@ public void testCreateOrReplaceTableTxnTableCreatedConcurrently() { public void testCreateTableTxnWithGlobalTableLocation() { Assert.assertFalse("Table should not exist", catalog.tableExists(TABLE_IDENTIFIER)); - Transaction txn = catalog.newCreateTableTransaction( - TABLE_IDENTIFIER, SCHEMA, SPEC, "file:///" + tableLocation, Maps.newHashMap()); + Transaction txn = + catalog.newCreateTableTransaction( + TABLE_IDENTIFIER, SCHEMA, SPEC, "file:///" + tableLocation, Maps.newHashMap()); txn.commitTransaction(); Table table = catalog.loadTable(TABLE_IDENTIFIER); - DataFile dataFile = DataFiles.builder(SPEC) - .withPath("/path/to/data-a.parquet") - .withFileSizeInBytes(0) - .withRecordCount(1) - .build(); + DataFile dataFile = + DataFiles.builder(SPEC) + .withPath("/path/to/data-a.parquet") + .withFileSizeInBytes(0) + .withRecordCount(1) + .build(); - table.newAppend() - .appendFile(dataFile) - .commit(); + table.newAppend().appendFile(dataFile).commit(); Assert.assertEquals("Write should succeed", 1, Iterables.size(table.snapshots())); } diff --git a/hive-metastore/src/test/java/org/apache/iceberg/hive/HiveMetastoreTest.java b/hive-metastore/src/test/java/org/apache/iceberg/hive/HiveMetastoreTest.java index b1fb891f3054..b3c31351c321 100644 --- a/hive-metastore/src/test/java/org/apache/iceberg/hive/HiveMetastoreTest.java +++ b/hive-metastore/src/test/java/org/apache/iceberg/hive/HiveMetastoreTest.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
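The transactional-create pattern exercised by the tests above, condensed for readability; the constants are the test's own and this mirrors testCreateTableTxn rather than adding new behaviour.

Transaction txn =
    catalog.newCreateTableTransaction(
        TABLE_IDENTIFIER, SCHEMA, SPEC, tableLocation, Maps.newHashMap());
txn.updateProperties().set("prop", "value").commit();
txn.commitTransaction();   // only now does the table become visible in the catalog
Table table = catalog.loadTable(TABLE_IDENTIFIER);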
*/ - package org.apache.iceberg.hive; import java.util.concurrent.TimeUnit; @@ -49,9 +48,15 @@ public static void startMetastore() throws Exception { String dbPath = metastore.getDatabasePath(DB_NAME); Database db = new Database(DB_NAME, "description", dbPath, Maps.newHashMap()); metastoreClient.createDatabase(db); - HiveMetastoreTest.catalog = (HiveCatalog) - CatalogUtil.loadCatalog(HiveCatalog.class.getName(), CatalogUtil.ICEBERG_CATALOG_TYPE_HIVE, ImmutableMap.of( - CatalogProperties.CLIENT_POOL_CACHE_EVICTION_INTERVAL_MS, String.valueOf(EVICTION_INTERVAL)), hiveConf); + HiveMetastoreTest.catalog = + (HiveCatalog) + CatalogUtil.loadCatalog( + HiveCatalog.class.getName(), + CatalogUtil.ICEBERG_CATALOG_TYPE_HIVE, + ImmutableMap.of( + CatalogProperties.CLIENT_POOL_CACHE_EVICTION_INTERVAL_MS, + String.valueOf(EVICTION_INTERVAL)), + hiveConf); } @AfterClass diff --git a/hive-metastore/src/test/java/org/apache/iceberg/hive/HiveTableBaseTest.java b/hive-metastore/src/test/java/org/apache/iceberg/hive/HiveTableBaseTest.java index b584b9e73c14..b49c61192aaa 100644 --- a/hive-metastore/src/test/java/org/apache/iceberg/hive/HiveTableBaseTest.java +++ b/hive-metastore/src/test/java/org/apache/iceberg/hive/HiveTableBaseTest.java @@ -16,9 +16,13 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.hive; +import static org.apache.iceberg.PartitionSpec.builderFor; +import static org.apache.iceberg.TableMetadataParser.getFileExtension; +import static org.apache.iceberg.types.Types.NestedField.optional; +import static org.apache.iceberg.types.Types.NestedField.required; + import java.io.File; import java.nio.file.Paths; import java.util.Arrays; @@ -33,23 +37,20 @@ import org.junit.After; import org.junit.Before; -import static org.apache.iceberg.PartitionSpec.builderFor; -import static org.apache.iceberg.TableMetadataParser.getFileExtension; -import static org.apache.iceberg.types.Types.NestedField.optional; -import static org.apache.iceberg.types.Types.NestedField.required; - - public class HiveTableBaseTest extends HiveMetastoreTest { - static final String TABLE_NAME = "tbl"; + static final String TABLE_NAME = "tbl"; static final TableIdentifier TABLE_IDENTIFIER = TableIdentifier.of(DB_NAME, TABLE_NAME); - static final Schema schema = new Schema(Types.StructType.of( - required(1, "id", Types.LongType.get())).fields()); + static final Schema schema = + new Schema(Types.StructType.of(required(1, "id", Types.LongType.get())).fields()); - static final Schema altered = new Schema(Types.StructType.of( - required(1, "id", Types.LongType.get()), - optional(2, "data", Types.LongType.get())).fields()); + static final Schema altered = + new Schema( + Types.StructType.of( + required(1, "id", Types.LongType.get()), + optional(2, "data", Types.LongType.get())) + .fields()); private static final PartitionSpec partitionSpec = builderFor(schema).identity("id").build(); @@ -57,7 +58,8 @@ public class HiveTableBaseTest extends HiveMetastoreTest { @Before public void createTestTable() { - this.tableLocation = new Path(catalog.createTable(TABLE_IDENTIFIER, schema, partitionSpec).location()); + this.tableLocation = + new Path(catalog.createTable(TABLE_IDENTIFIER, schema, partitionSpec).location()); } @After @@ -66,6 +68,7 @@ public void dropTestTable() throws Exception { tableLocation.getFileSystem(hiveConf).delete(tableLocation, true); catalog.dropTable(TABLE_IDENTIFIER, false /* metadata only, location was already deleted */); } + private static String 
getTableBasePath(String tableName) { String databasePath = metastore.getDatabasePath(DB_NAME); return Paths.get(databasePath, tableName).toAbsolutePath().toString(); @@ -98,10 +101,8 @@ protected static List manifestFiles(String tableName) { } private static List filterByExtension(String tableName, String extension) { - return metadataFiles(tableName) - .stream() + return metadataFiles(tableName).stream() .filter(f -> f.endsWith(extension)) .collect(Collectors.toList()); } - } diff --git a/hive-metastore/src/test/java/org/apache/iceberg/hive/HiveTableTest.java b/hive-metastore/src/test/java/org/apache/iceberg/hive/HiveTableTest.java index 9bfc969ac4c4..51ed46ef8591 100644 --- a/hive-metastore/src/test/java/org/apache/iceberg/hive/HiveTableTest.java +++ b/hive-metastore/src/test/java/org/apache/iceberg/hive/HiveTableTest.java @@ -16,9 +16,19 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.hive; +import static java.nio.file.Files.createTempDirectory; +import static java.nio.file.attribute.PosixFilePermissions.asFileAttribute; +import static java.nio.file.attribute.PosixFilePermissions.fromString; +import static org.apache.iceberg.BaseMetastoreTableOperations.ICEBERG_TABLE_TYPE_VALUE; +import static org.apache.iceberg.BaseMetastoreTableOperations.METADATA_LOCATION_PROP; +import static org.apache.iceberg.BaseMetastoreTableOperations.PREVIOUS_METADATA_LOCATION_PROP; +import static org.apache.iceberg.BaseMetastoreTableOperations.TABLE_TYPE_PROP; +import static org.apache.iceberg.TableMetadataParser.getFileExtension; +import static org.apache.iceberg.types.Types.NestedField.optional; +import static org.apache.iceberg.types.Types.NestedField.required; + import java.io.File; import java.io.IOException; import java.util.Arrays; @@ -71,22 +81,10 @@ import org.junit.Test; import org.junit.rules.TemporaryFolder; -import static java.nio.file.Files.createTempDirectory; -import static java.nio.file.attribute.PosixFilePermissions.asFileAttribute; -import static java.nio.file.attribute.PosixFilePermissions.fromString; -import static org.apache.iceberg.BaseMetastoreTableOperations.ICEBERG_TABLE_TYPE_VALUE; -import static org.apache.iceberg.BaseMetastoreTableOperations.METADATA_LOCATION_PROP; -import static org.apache.iceberg.BaseMetastoreTableOperations.PREVIOUS_METADATA_LOCATION_PROP; -import static org.apache.iceberg.BaseMetastoreTableOperations.TABLE_TYPE_PROP; -import static org.apache.iceberg.TableMetadataParser.getFileExtension; -import static org.apache.iceberg.types.Types.NestedField.optional; -import static org.apache.iceberg.types.Types.NestedField.required; - public class HiveTableTest extends HiveTableBaseTest { - static final String NON_DEFAULT_DATABASE = "nondefault"; + static final String NON_DEFAULT_DATABASE = "nondefault"; - @Rule - public TemporaryFolder tempFolder = new TemporaryFolder(); + @Rule public TemporaryFolder tempFolder = new TemporaryFolder(); @Test public void testCreate() throws TException { @@ -120,7 +118,8 @@ public void testCreate() throws TException { @Test public void testRename() { String renamedTableName = "rename_table_name"; - TableIdentifier renameTableIdentifier = TableIdentifier.of(TABLE_IDENTIFIER.namespace(), renamedTableName); + TableIdentifier renameTableIdentifier = + TableIdentifier.of(TABLE_IDENTIFIER.namespace(), renamedTableName); Table original = catalog.loadTable(TABLE_IDENTIFIER); catalog.renameTable(TABLE_IDENTIFIER, renameTableIdentifier); @@ -140,7 +139,8 @@ public void testRename() { 
@Test public void testDrop() { Assert.assertTrue("Table should exist", catalog.tableExists(TABLE_IDENTIFIER)); - Assert.assertTrue("Drop should return true and drop the table", catalog.dropTable(TABLE_IDENTIFIER)); + Assert.assertTrue( + "Drop should return true and drop the table", catalog.dropTable(TABLE_IDENTIFIER)); Assert.assertFalse("Table should not exist", catalog.tableExists(TABLE_IDENTIFIER)); } @@ -150,60 +150,59 @@ public void testDropWithoutPurgeLeavesTableData() throws IOException { String fileLocation = appendData(table, "file"); - String manifestListLocation = table.currentSnapshot().manifestListLocation().replace("file:", ""); + String manifestListLocation = + table.currentSnapshot().manifestListLocation().replace("file:", ""); - Assert.assertTrue("Drop should return true and drop the table", + Assert.assertTrue( + "Drop should return true and drop the table", catalog.dropTable(TABLE_IDENTIFIER, false /* do not delete underlying files */)); Assert.assertFalse("Table should not exist", catalog.tableExists(TABLE_IDENTIFIER)); - Assert.assertTrue("Table data files should exist", - new File(fileLocation).exists()); - Assert.assertTrue("Table metadata files should exist", - new File(manifestListLocation).exists()); + Assert.assertTrue("Table data files should exist", new File(fileLocation).exists()); + Assert.assertTrue("Table metadata files should exist", new File(manifestListLocation).exists()); } @Test public void testDropTable() throws IOException { Table table = catalog.loadTable(TABLE_IDENTIFIER); - GenericRecordBuilder recordBuilder = new GenericRecordBuilder(AvroSchemaUtil.convert(schema, "test")); - List records = Lists.newArrayList( - recordBuilder.set("id", 1L).build(), - recordBuilder.set("id", 2L).build(), - recordBuilder.set("id", 3L).build() - ); + GenericRecordBuilder recordBuilder = + new GenericRecordBuilder(AvroSchemaUtil.convert(schema, "test")); + List records = + Lists.newArrayList( + recordBuilder.set("id", 1L).build(), + recordBuilder.set("id", 2L).build(), + recordBuilder.set("id", 3L).build()); String location1 = table.location().replace("file:", "") + "/data/file1.avro"; - try (FileAppender writer = Avro.write(Files.localOutput(location1)) - .schema(schema) - .named("test") - .build()) { + try (FileAppender writer = + Avro.write(Files.localOutput(location1)).schema(schema).named("test").build()) { for (GenericData.Record rec : records) { writer.add(rec); } } String location2 = table.location().replace("file:", "") + "/data/file2.avro"; - try (FileAppender writer = Avro.write(Files.localOutput(location2)) - .schema(schema) - .named("test") - .build()) { + try (FileAppender writer = + Avro.write(Files.localOutput(location2)).schema(schema).named("test").build()) { for (GenericData.Record rec : records) { writer.add(rec); } } - DataFile file1 = DataFiles.builder(table.spec()) - .withRecordCount(3) - .withPath(location1) - .withFileSizeInBytes(Files.localInput(location2).getLength()) - .build(); + DataFile file1 = + DataFiles.builder(table.spec()) + .withRecordCount(3) + .withPath(location1) + .withFileSizeInBytes(Files.localInput(location2).getLength()) + .build(); - DataFile file2 = DataFiles.builder(table.spec()) - .withRecordCount(3) - .withPath(location2) - .withFileSizeInBytes(Files.localInput(location1).getLength()) - .build(); + DataFile file2 = + DataFiles.builder(table.spec()) + .withRecordCount(3) + .withPath(location2) + .withFileSizeInBytes(Files.localInput(location1).getLength()) + .build(); // add both data files 
table.newAppend().appendFile(file1).appendFile(file2).commit(); @@ -211,27 +210,34 @@ public void testDropTable() throws IOException { // delete file2 table.newDelete().deleteFile(file2.path()).commit(); - String manifestListLocation = table.currentSnapshot().manifestListLocation().replace("file:", ""); + String manifestListLocation = + table.currentSnapshot().manifestListLocation().replace("file:", ""); List manifests = table.currentSnapshot().allManifests(table.io()); - Assert.assertTrue("Drop (table and data) should return true and drop the table", + Assert.assertTrue( + "Drop (table and data) should return true and drop the table", catalog.dropTable(TABLE_IDENTIFIER)); Assert.assertFalse("Table should not exist", catalog.tableExists(TABLE_IDENTIFIER)); - Assert.assertFalse("Table data files should not exist", - new File(location1).exists()); - Assert.assertFalse("Table data files should not exist", - new File(location2).exists()); - Assert.assertFalse("Table manifest list files should not exist", - new File(manifestListLocation).exists()); + Assert.assertFalse("Table data files should not exist", new File(location1).exists()); + Assert.assertFalse("Table data files should not exist", new File(location2).exists()); + Assert.assertFalse( + "Table manifest list files should not exist", new File(manifestListLocation).exists()); for (ManifestFile manifest : manifests) { - Assert.assertFalse("Table manifest files should not exist", + Assert.assertFalse( + "Table manifest files should not exist", new File(manifest.path().replace("file:", "")).exists()); } - Assert.assertFalse("Table metadata file should not exist", - new File(((HasTableOperations) table).operations().current() - .metadataFileLocation().replace("file:", "")).exists()); + Assert.assertFalse( + "Table metadata file should not exist", + new File( + ((HasTableOperations) table) + .operations() + .current() + .metadataFileLocation() + .replace("file:", "")) + .exists()); } @Test @@ -247,13 +253,12 @@ public void testExistingTableUpdate() throws TException { Assert.assertEquals(0, manifestFiles(TABLE_NAME).size()); Assert.assertEquals(altered.asStruct(), icebergTable.schema().asStruct()); - final org.apache.hadoop.hive.metastore.api.Table table = metastoreClient.getTable(DB_NAME, TABLE_NAME); - final List hiveColumns = table.getSd().getCols().stream() - .map(FieldSchema::getName) - .collect(Collectors.toList()); - final List icebergColumns = altered.columns().stream() - .map(Types.NestedField::name) - .collect(Collectors.toList()); + final org.apache.hadoop.hive.metastore.api.Table table = + metastoreClient.getTable(DB_NAME, TABLE_NAME); + final List hiveColumns = + table.getSd().getCols().stream().map(FieldSchema::getName).collect(Collectors.toList()); + final List icebergColumns = + altered.columns().stream().map(Types.NestedField::name).collect(Collectors.toList()); Assert.assertEquals(icebergColumns, hiveColumns); } @@ -261,51 +266,67 @@ public void testExistingTableUpdate() throws TException { public void testColumnTypeChangeInMetastore() throws TException { Table icebergTable = catalog.loadTable(TABLE_IDENTIFIER); - Schema expectedSchema = new Schema(Types.StructType.of( - required(1, "id", Types.LongType.get()), - optional(2, "data", Types.LongType.get()), - optional(3, "string", Types.StringType.get()), - optional(4, "int", Types.IntegerType.get())).fields()); + Schema expectedSchema = + new Schema( + Types.StructType.of( + required(1, "id", Types.LongType.get()), + optional(2, "data", Types.LongType.get()), + optional(3, 
"string", Types.StringType.get()), + optional(4, "int", Types.IntegerType.get())) + .fields()); // Add columns with different types, then verify we could delete one column in hive metastore - // as hive conf METASTORE_DISALLOW_INCOMPATIBLE_COL_TYPE_CHANGES was set to false. If this was set to true, - // an InvalidOperationException would thrown in method MetaStoreUtils#throwExceptionIfIncompatibleColTypeChange() - icebergTable.updateSchema() - .addColumn("data", Types.LongType.get()) - .addColumn("string", Types.StringType.get()) - .addColumn("int", Types.IntegerType.get()) - .commit(); - - Assert.assertEquals("Schema should match expected", expectedSchema.asStruct(), icebergTable.schema().asStruct()); - - expectedSchema = new Schema(Types.StructType.of( - required(1, "id", Types.LongType.get()), - optional(2, "data", Types.LongType.get()), - optional(4, "int", Types.IntegerType.get())).fields()); + // as hive conf METASTORE_DISALLOW_INCOMPATIBLE_COL_TYPE_CHANGES was set to false. If this was + // set to true, + // an InvalidOperationException would thrown in method + // MetaStoreUtils#throwExceptionIfIncompatibleColTypeChange() + icebergTable + .updateSchema() + .addColumn("data", Types.LongType.get()) + .addColumn("string", Types.StringType.get()) + .addColumn("int", Types.IntegerType.get()) + .commit(); + + Assert.assertEquals( + "Schema should match expected", + expectedSchema.asStruct(), + icebergTable.schema().asStruct()); + + expectedSchema = + new Schema( + Types.StructType.of( + required(1, "id", Types.LongType.get()), + optional(2, "data", Types.LongType.get()), + optional(4, "int", Types.IntegerType.get())) + .fields()); icebergTable.updateSchema().deleteColumn("string").commit(); - Assert.assertEquals("Schema should match expected", expectedSchema.asStruct(), icebergTable.schema().asStruct()); + Assert.assertEquals( + "Schema should match expected", + expectedSchema.asStruct(), + icebergTable.schema().asStruct()); } @Test public void testFailure() throws TException { Table icebergTable = catalog.loadTable(TABLE_IDENTIFIER); - org.apache.hadoop.hive.metastore.api.Table table = metastoreClient.getTable(DB_NAME, TABLE_NAME); + org.apache.hadoop.hive.metastore.api.Table table = + metastoreClient.getTable(DB_NAME, TABLE_NAME); String dummyLocation = "dummylocation"; table.getParameters().put(METADATA_LOCATION_PROP, dummyLocation); metastoreClient.alter_table(DB_NAME, TABLE_NAME, table); - Assertions.assertThatThrownBy(() -> icebergTable.updateSchema() - .addColumn("data", Types.LongType.get()) - .commit()) - .isInstanceOf(CommitFailedException.class) - .hasMessageContaining("is not same as the current table metadata location 'dummylocation'"); + Assertions.assertThatThrownBy( + () -> icebergTable.updateSchema().addColumn("data", Types.LongType.get()).commit()) + .isInstanceOf(CommitFailedException.class) + .hasMessageContaining("is not same as the current table metadata location 'dummylocation'"); } @Test public void testListTables() throws TException, IOException { List tableIdents = catalog.listTables(TABLE_IDENTIFIER.namespace()); - List expectedIdents = tableIdents.stream() - .filter(t -> t.namespace().level(0).equals(DB_NAME) && t.name().equals(TABLE_NAME)) - .collect(Collectors.toList()); + List expectedIdents = + tableIdents.stream() + .filter(t -> t.namespace().level(0).equals(DB_NAME) && t.name().equals(TABLE_NAME)) + .collect(Collectors.toList()); Assert.assertEquals(1, expectedIdents.size()); Assert.assertTrue(catalog.tableExists(TABLE_IDENTIFIER)); @@ -327,22 +348,45 @@ 
public void testListTables() throws TException, IOException { metastoreClient.dropTable(DB_NAME, hiveTableName); } - private org.apache.hadoop.hive.metastore.api.Table createHiveTable(String hiveTableName) throws IOException { + private org.apache.hadoop.hive.metastore.api.Table createHiveTable(String hiveTableName) + throws IOException { Map parameters = Maps.newHashMap(); - parameters.put(serdeConstants.SERIALIZATION_CLASS, "org.apache.hadoop.hive.serde2.thrift.test.IntString"); - parameters.put(serdeConstants.SERIALIZATION_FORMAT, "org.apache.thrift.protocol.TBinaryProtocol"); + parameters.put( + serdeConstants.SERIALIZATION_CLASS, "org.apache.hadoop.hive.serde2.thrift.test.IntString"); + parameters.put( + serdeConstants.SERIALIZATION_FORMAT, "org.apache.thrift.protocol.TBinaryProtocol"); - SerDeInfo serDeInfo = new SerDeInfo(null, "org.apache.hadoop.hive.serde2.thrift.ThriftDeserializer", parameters); + SerDeInfo serDeInfo = + new SerDeInfo(null, "org.apache.hadoop.hive.serde2.thrift.ThriftDeserializer", parameters); // StorageDescriptor has an empty list of fields - SerDe will report them. - StorageDescriptor sd = new StorageDescriptor(Lists.newArrayList(), tempFolder.newFolder().getAbsolutePath(), - "org.apache.hadoop.mapred.TextInputFormat", "org.apache.hadoop.mapred.TextOutputFormat", - false, -1, serDeInfo, Lists.newArrayList(), Lists.newArrayList(), Maps.newHashMap()); + StorageDescriptor sd = + new StorageDescriptor( + Lists.newArrayList(), + tempFolder.newFolder().getAbsolutePath(), + "org.apache.hadoop.mapred.TextInputFormat", + "org.apache.hadoop.mapred.TextOutputFormat", + false, + -1, + serDeInfo, + Lists.newArrayList(), + Lists.newArrayList(), + Maps.newHashMap()); org.apache.hadoop.hive.metastore.api.Table hiveTable = - new org.apache.hadoop.hive.metastore.api.Table(hiveTableName, DB_NAME, "test_owner", - 0, 0, 0, sd, Lists.newArrayList(), Maps.newHashMap(), - "viewOriginalText", "viewExpandedText", TableType.EXTERNAL_TABLE.name()); + new org.apache.hadoop.hive.metastore.api.Table( + hiveTableName, + DB_NAME, + "test_owner", + 0, + 0, + 0, + sd, + Lists.newArrayList(), + Maps.newHashMap(), + "viewOriginalText", + "viewExpandedText", + TableType.EXTERNAL_TABLE.name()); return hiveTable; } @@ -350,9 +394,11 @@ private org.apache.hadoop.hive.metastore.api.Table createHiveTable(String hiveTa public void testNonDefaultDatabaseLocation() throws IOException, TException { Namespace namespace = Namespace.of(NON_DEFAULT_DATABASE); // Create a new location and a non-default database / namespace for it - File nonDefaultLocation = createTempDirectory(NON_DEFAULT_DATABASE, - asFileAttribute(fromString("rwxrwxrwx"))).toFile(); - catalog.createNamespace(namespace, Collections.singletonMap("location", nonDefaultLocation.getPath())); + File nonDefaultLocation = + createTempDirectory(NON_DEFAULT_DATABASE, asFileAttribute(fromString("rwxrwxrwx"))) + .toFile(); + catalog.createNamespace( + namespace, Collections.singletonMap("location", nonDefaultLocation.getPath())); Map namespaceMeta = catalog.loadNamespaceMetadata(namespace); // Make sure that we are testing a namespace with a non default location :) Assert.assertEquals(namespaceMeta.get("location"), "file:" + nonDefaultLocation.getPath()); @@ -370,11 +416,13 @@ public void testNonDefaultDatabaseLocation() throws IOException, TException { @Test public void testRegisterTable() throws TException { - org.apache.hadoop.hive.metastore.api.Table originalTable = metastoreClient.getTable(DB_NAME, TABLE_NAME); + 
org.apache.hadoop.hive.metastore.api.Table originalTable = + metastoreClient.getTable(DB_NAME, TABLE_NAME); Map originalParams = originalTable.getParameters(); Assert.assertNotNull(originalParams); - Assert.assertTrue(ICEBERG_TABLE_TYPE_VALUE.equalsIgnoreCase(originalParams.get(TABLE_TYPE_PROP))); + Assert.assertTrue( + ICEBERG_TABLE_TYPE_VALUE.equalsIgnoreCase(originalParams.get(TABLE_TYPE_PROP))); Assert.assertTrue("EXTERNAL_TABLE".equalsIgnoreCase(originalTable.getTableType())); catalog.dropTable(TABLE_IDENTIFIER, false); @@ -385,12 +433,15 @@ public void testRegisterTable() throws TException { catalog.registerTable(TABLE_IDENTIFIER, "file:" + metadataVersionFiles.get(0)); - org.apache.hadoop.hive.metastore.api.Table newTable = metastoreClient.getTable(DB_NAME, TABLE_NAME); + org.apache.hadoop.hive.metastore.api.Table newTable = + metastoreClient.getTable(DB_NAME, TABLE_NAME); Map newTableParameters = newTable.getParameters(); Assert.assertNull(newTableParameters.get(PREVIOUS_METADATA_LOCATION_PROP)); - Assert.assertEquals(originalParams.get(TABLE_TYPE_PROP), newTableParameters.get(TABLE_TYPE_PROP)); - Assert.assertEquals(originalParams.get(METADATA_LOCATION_PROP), newTableParameters.get(METADATA_LOCATION_PROP)); + Assert.assertEquals( + originalParams.get(TABLE_TYPE_PROP), newTableParameters.get(TABLE_TYPE_PROP)); + Assert.assertEquals( + originalParams.get(METADATA_LOCATION_PROP), newTableParameters.get(METADATA_LOCATION_PROP)); Assert.assertEquals(originalTable.getSd(), newTable.getSd()); } @@ -401,7 +452,9 @@ public void testRegisterHadoopTableToHiveCatalog() throws IOException, TExceptio HadoopCatalog hadoopCatalog = new HadoopCatalog(new Configuration(), tableLocation); // create table using hadoop catalog TableIdentifier identifier = TableIdentifier.of(DB_NAME, "table1"); - Table table = hadoopCatalog.createTable(identifier, schema, PartitionSpec.unpartitioned(), Maps.newHashMap()); + Table table = + hadoopCatalog.createTable( + identifier, schema, PartitionSpec.unpartitioned(), Maps.newHashMap()); // insert some data String file1Location = appendData(table, "file1"); List tasks = Lists.newArrayList(table.newScan().planFiles()); @@ -417,13 +470,15 @@ public void testRegisterHadoopTableToHiveCatalog() throws IOException, TExceptio Assert.assertEquals(2, metadataFiles.size()); AssertHelpers.assertThrows( - "Hive metastore should not have this table", NoSuchObjectException.class, - "table not found", - () -> metastoreClient.getTable(DB_NAME, "table1")); + "Hive metastore should not have this table", + NoSuchObjectException.class, + "table not found", + () -> metastoreClient.getTable(DB_NAME, "table1")); AssertHelpers.assertThrows( - "Hive catalog should fail to load the table", NoSuchTableException.class, - "Table does not exist:", - () -> catalog.loadTable(identifier)); + "Hive catalog should fail to load the table", + NoSuchTableException.class, + "Table does not exist:", + () -> catalog.loadTable(identifier)); // register the table to hive catalog using the latest metadata file String latestMetadataFile = ((BaseTable) table).operations().current().metadataFileLocation(); @@ -438,29 +493,30 @@ public void testRegisterHadoopTableToHiveCatalog() throws IOException, TExceptio String file2Location = appendData(table, "file2"); tasks = Lists.newArrayList(table.newScan().planFiles()); Assert.assertEquals("Should scan 2 files", 2, tasks.size()); - Set files = tasks.stream().map(task -> task.file().path().toString()).collect(Collectors.toSet()); + Set files = + tasks.stream().map(task -> 
task.file().path().toString()).collect(Collectors.toSet()); Assert.assertTrue(files.contains(file1Location) && files.contains(file2Location)); } private String appendData(Table table, String fileName) throws IOException { - GenericRecordBuilder recordBuilder = new GenericRecordBuilder(AvroSchemaUtil.convert(schema, "test")); - List records = Lists.newArrayList( + GenericRecordBuilder recordBuilder = + new GenericRecordBuilder(AvroSchemaUtil.convert(schema, "test")); + List records = + Lists.newArrayList( recordBuilder.set("id", 1L).build(), recordBuilder.set("id", 2L).build(), - recordBuilder.set("id", 3L).build() - ); + recordBuilder.set("id", 3L).build()); String fileLocation = table.location().replace("file:", "") + "/data/" + fileName + ".avro"; - try (FileAppender writer = Avro.write(Files.localOutput(fileLocation)) - .schema(schema) - .named("test") - .build()) { + try (FileAppender writer = + Avro.write(Files.localOutput(fileLocation)).schema(schema).named("test").build()) { for (GenericData.Record rec : records) { writer.add(rec); } } - DataFile file = DataFiles.builder(table.spec()) + DataFile file = + DataFiles.builder(table.spec()) .withRecordCount(3) .withPath(fileLocation) .withFileSizeInBytes(Files.localInput(fileLocation).getLength()) @@ -473,11 +529,13 @@ private String appendData(Table table, String fileName) throws IOException { @Test public void testRegisterExistingTable() throws TException { - org.apache.hadoop.hive.metastore.api.Table originalTable = metastoreClient.getTable(DB_NAME, TABLE_NAME); + org.apache.hadoop.hive.metastore.api.Table originalTable = + metastoreClient.getTable(DB_NAME, TABLE_NAME); Map originalParams = originalTable.getParameters(); Assert.assertNotNull(originalParams); - Assert.assertTrue(ICEBERG_TABLE_TYPE_VALUE.equalsIgnoreCase(originalParams.get(TABLE_TYPE_PROP))); + Assert.assertTrue( + ICEBERG_TABLE_TYPE_VALUE.equalsIgnoreCase(originalParams.get(TABLE_TYPE_PROP))); Assert.assertTrue("EXTERNAL_TABLE".equalsIgnoreCase(originalTable.getTableType())); List metadataVersionFiles = metadataVersionFiles(TABLE_NAME); @@ -485,7 +543,8 @@ public void testRegisterExistingTable() throws TException { // Try to register an existing table AssertHelpers.assertThrows( - "Should complain that the table already exists", AlreadyExistsException.class, + "Should complain that the table already exists", + AlreadyExistsException.class, "Table already exists", () -> catalog.registerTable(TABLE_IDENTIFIER, "file:" + metadataVersionFiles.get(0))); } @@ -499,7 +558,8 @@ public void testEngineHiveEnabledDefault() throws TException { catalog.getConf().unset(ConfigProperties.ENGINE_HIVE_ENABLED); catalog.createTable(TABLE_IDENTIFIER, schema, PartitionSpec.unpartitioned()); - org.apache.hadoop.hive.metastore.api.Table hmsTable = metastoreClient.getTable(DB_NAME, TABLE_NAME); + org.apache.hadoop.hive.metastore.api.Table hmsTable = + metastoreClient.getTable(DB_NAME, TABLE_NAME); assertHiveEnabled(hmsTable, false); } @@ -513,7 +573,8 @@ public void testEngineHiveEnabledConfig() throws TException { catalog.getConf().set(ConfigProperties.ENGINE_HIVE_ENABLED, "true"); catalog.createTable(TABLE_IDENTIFIER, schema, PartitionSpec.unpartitioned()); - org.apache.hadoop.hive.metastore.api.Table hmsTable = metastoreClient.getTable(DB_NAME, TABLE_NAME); + org.apache.hadoop.hive.metastore.api.Table hmsTable = + metastoreClient.getTable(DB_NAME, TABLE_NAME); assertHiveEnabled(hmsTable, true); @@ -539,7 +600,8 @@ public void testEngineHiveEnabledTableProperty() throws TException { 
catalog.getConf().set(ConfigProperties.ENGINE_HIVE_ENABLED, "false"); catalog.createTable(TABLE_IDENTIFIER, schema, PartitionSpec.unpartitioned(), tableProperties); - org.apache.hadoop.hive.metastore.api.Table hmsTable = metastoreClient.getTable(DB_NAME, TABLE_NAME); + org.apache.hadoop.hive.metastore.api.Table hmsTable = + metastoreClient.getTable(DB_NAME, TABLE_NAME); assertHiveEnabled(hmsTable, true); @@ -564,28 +626,34 @@ public void testMissingMetadataWontCauseHang() { Assert.assertTrue(realLocation.renameTo(fakeLocation)); AssertHelpers.assertThrows( - "HiveTableOperations shouldn't hang indefinitely when a missing metadata file is encountered", - NotFoundException.class, - () -> catalog.loadTable(TABLE_IDENTIFIER)); + "HiveTableOperations shouldn't hang indefinitely when a missing metadata file is encountered", + NotFoundException.class, + () -> catalog.loadTable(TABLE_IDENTIFIER)); Assert.assertTrue(fakeLocation.renameTo(realLocation)); } - private void assertHiveEnabled(org.apache.hadoop.hive.metastore.api.Table hmsTable, boolean expected) { + private void assertHiveEnabled( + org.apache.hadoop.hive.metastore.api.Table hmsTable, boolean expected) { if (expected) { - Assert.assertEquals("org.apache.iceberg.mr.hive.HiveIcebergStorageHandler", + Assert.assertEquals( + "org.apache.iceberg.mr.hive.HiveIcebergStorageHandler", hmsTable.getParameters().get(hive_metastoreConstants.META_TABLE_STORAGE)); - Assert.assertEquals("org.apache.iceberg.mr.hive.HiveIcebergSerDe", + Assert.assertEquals( + "org.apache.iceberg.mr.hive.HiveIcebergSerDe", hmsTable.getSd().getSerdeInfo().getSerializationLib()); - Assert.assertEquals("org.apache.iceberg.mr.hive.HiveIcebergInputFormat", - hmsTable.getSd().getInputFormat()); - Assert.assertEquals("org.apache.iceberg.mr.hive.HiveIcebergOutputFormat", - hmsTable.getSd().getOutputFormat()); + Assert.assertEquals( + "org.apache.iceberg.mr.hive.HiveIcebergInputFormat", hmsTable.getSd().getInputFormat()); + Assert.assertEquals( + "org.apache.iceberg.mr.hive.HiveIcebergOutputFormat", hmsTable.getSd().getOutputFormat()); } else { Assert.assertNull(hmsTable.getParameters().get(hive_metastoreConstants.META_TABLE_STORAGE)); - Assert.assertEquals("org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe", + Assert.assertEquals( + "org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe", hmsTable.getSd().getSerdeInfo().getSerializationLib()); - Assert.assertEquals("org.apache.hadoop.mapred.FileInputFormat", hmsTable.getSd().getInputFormat()); - Assert.assertEquals("org.apache.hadoop.mapred.FileOutputFormat", hmsTable.getSd().getOutputFormat()); + Assert.assertEquals( + "org.apache.hadoop.mapred.FileInputFormat", hmsTable.getSd().getInputFormat()); + Assert.assertEquals( + "org.apache.hadoop.mapred.FileOutputFormat", hmsTable.getSd().getOutputFormat()); } } } diff --git a/hive-metastore/src/test/java/org/apache/iceberg/hive/ScriptRunner.java b/hive-metastore/src/test/java/org/apache/iceberg/hive/ScriptRunner.java index fce4b307a201..9ac0fab4bf37 100644 --- a/hive-metastore/src/test/java/org/apache/iceberg/hive/ScriptRunner.java +++ b/hive-metastore/src/test/java/org/apache/iceberg/hive/ScriptRunner.java @@ -1,24 +1,21 @@ /* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at * - * Copyright 2004 Clinton Begin + * http://www.apache.org/licenses/LICENSE-2.0 * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - * - * Slightly modified version of the com.ibatis.common.jdbc.ScriptRunner class - * from the iBATIS Apache project. Only removed dependency on Resource class - * and a constructor. + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. */ - package org.apache.iceberg.hive; import java.io.IOException; @@ -31,9 +28,7 @@ import java.sql.SQLException; import java.sql.Statement; -/** - * Tool to run database scripts - */ +/** Tool to run database scripts */ public class ScriptRunner { private static final String DEFAULT_DELIMITER = ";"; @@ -49,11 +44,8 @@ public class ScriptRunner { private String delimiter = DEFAULT_DELIMITER; private boolean fullLineDelimiter = false; - /** - * Default constructor - */ - public ScriptRunner(Connection connection, boolean autoCommit, - boolean stopOnError) { + /** Default constructor */ + public ScriptRunner(Connection connection, boolean autoCommit, boolean stopOnError) { this.connection = connection; this.autoCommit = autoCommit; this.stopOnError = stopOnError; @@ -67,8 +59,7 @@ public void setDelimiter(String newDelimiter, boolean newFullLineDelimiter) { /** * Setter for logWriter property * - * @param logWriter - * - the new value of the logWriter property + * @param logWriter - the new value of the logWriter property */ public void setLogWriter(PrintWriter logWriter) { this.logWriter = logWriter; @@ -77,8 +68,7 @@ public void setLogWriter(PrintWriter logWriter) { /** * Setter for errorLogWriter property * - * @param errorLogWriter - * - the new value of the errorLogWriter property + * @param errorLogWriter - the new value of the errorLogWriter property */ public void setErrorLogWriter(PrintWriter errorLogWriter) { this.errorLogWriter = errorLogWriter; @@ -87,8 +77,7 @@ public void setErrorLogWriter(PrintWriter errorLogWriter) { /** * Runs an SQL script (read in using the Reader parameter) * - * @param reader - * - the source of the script + * @param reader - the source of the script */ public void runScript(Reader reader) throws IOException, SQLException { try { @@ -111,17 +100,12 @@ public void runScript(Reader reader) throws IOException, SQLException { } /** - * Runs an SQL script (read in using the Reader parameter) using the - * connection passed in + * Runs an SQL script (read in using the Reader parameter) using the connection passed in * - * @param conn - * - the connection to use for the script - * @param reader - * - the source of the 
script - * @throws SQLException - * if any SQL errors occur - * @throws IOException - * if there is an error reading from the Reader + * @param conn - the connection to use for the script + * @param reader - the source of the script + * @throws SQLException if any SQL errors occur + * @throws IOException if there is an error reading from the Reader */ @SuppressWarnings("checkstyle:CyclomaticComplexity") private void runScript(Connection conn, Reader reader) throws IOException, SQLException { @@ -140,10 +124,9 @@ private void runScript(Connection conn, Reader reader) throws IOException, SQLEx // Do nothing } else if (trimmedLine.length() < 1 || trimmedLine.startsWith("--")) { // Do nothing - } else if (!fullLineDelimiter && trimmedLine.endsWith(getDelimiter()) || - fullLineDelimiter && trimmedLine.equals(getDelimiter())) { - command.append(line.substring(0, line - .lastIndexOf(getDelimiter()))); + } else if (!fullLineDelimiter && trimmedLine.endsWith(getDelimiter()) + || fullLineDelimiter && trimmedLine.equals(getDelimiter())) { + command.append(line.substring(0, line.lastIndexOf(getDelimiter()))); command.append(" "); Statement statement = conn.createStatement(); diff --git a/hive-metastore/src/test/java/org/apache/iceberg/hive/TestCachedClientPool.java b/hive-metastore/src/test/java/org/apache/iceberg/hive/TestCachedClientPool.java index 7c1f3c4028fb..3c6b9a99e64e 100644 --- a/hive-metastore/src/test/java/org/apache/iceberg/hive/TestCachedClientPool.java +++ b/hive-metastore/src/test/java/org/apache/iceberg/hive/TestCachedClientPool.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.hive; import java.util.Collections; @@ -39,5 +38,4 @@ public void testClientPoolCleaner() throws InterruptedException { TimeUnit.MILLISECONDS.sleep(EVICTION_INTERVAL + TimeUnit.SECONDS.toMillis(5)); Assert.assertNull(CachedClientPool.clientPoolCache().getIfPresent(metastoreUri)); } - } diff --git a/hive-metastore/src/test/java/org/apache/iceberg/hive/TestHiveCatalog.java b/hive-metastore/src/test/java/org/apache/iceberg/hive/TestHiveCatalog.java index 46ccb571a553..b5040ec070e7 100644 --- a/hive-metastore/src/test/java/org/apache/iceberg/hive/TestHiveCatalog.java +++ b/hive-metastore/src/test/java/org/apache/iceberg/hive/TestHiveCatalog.java @@ -16,9 +16,21 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.hive; +import static org.apache.iceberg.NullOrder.NULLS_FIRST; +import static org.apache.iceberg.SortDirection.ASC; +import static org.apache.iceberg.TableProperties.CURRENT_SCHEMA; +import static org.apache.iceberg.TableProperties.CURRENT_SNAPSHOT_ID; +import static org.apache.iceberg.TableProperties.CURRENT_SNAPSHOT_SUMMARY; +import static org.apache.iceberg.TableProperties.CURRENT_SNAPSHOT_TIMESTAMP; +import static org.apache.iceberg.TableProperties.DEFAULT_PARTITION_SPEC; +import static org.apache.iceberg.TableProperties.DEFAULT_SORT_ORDER; +import static org.apache.iceberg.expressions.Expressions.bucket; +import static org.apache.iceberg.types.Types.NestedField.required; +import static org.mockito.Mockito.mock; +import static org.mockito.Mockito.when; + import java.util.List; import java.util.Map; import java.util.UUID; @@ -64,47 +76,34 @@ import org.junit.jupiter.api.Assertions; import org.junit.rules.TemporaryFolder; -import static org.apache.iceberg.NullOrder.NULLS_FIRST; -import static org.apache.iceberg.SortDirection.ASC; -import static org.apache.iceberg.TableProperties.CURRENT_SCHEMA; -import static org.apache.iceberg.TableProperties.CURRENT_SNAPSHOT_ID; -import static org.apache.iceberg.TableProperties.CURRENT_SNAPSHOT_SUMMARY; -import static org.apache.iceberg.TableProperties.CURRENT_SNAPSHOT_TIMESTAMP; -import static org.apache.iceberg.TableProperties.DEFAULT_PARTITION_SPEC; -import static org.apache.iceberg.TableProperties.DEFAULT_SORT_ORDER; -import static org.apache.iceberg.expressions.Expressions.bucket; -import static org.apache.iceberg.types.Types.NestedField.required; -import static org.mockito.Mockito.mock; -import static org.mockito.Mockito.when; - public class TestHiveCatalog extends HiveMetastoreTest { - private static ImmutableMap meta = ImmutableMap.of( - "owner", "apache", - "group", "iceberg", - "comment", "iceberg hiveCatalog test"); + private static ImmutableMap meta = + ImmutableMap.of( + "owner", "apache", + "group", "iceberg", + "comment", "iceberg hiveCatalog test"); - @Rule - public TemporaryFolder temp = new TemporaryFolder(); + @Rule public TemporaryFolder temp = new TemporaryFolder(); @Test public void testCreateTableBuilder() throws Exception { - Schema schema = new Schema( - required(1, "id", Types.IntegerType.get(), "unique ID"), - required(2, "data", Types.StringType.get()) - ); - PartitionSpec spec = PartitionSpec.builderFor(schema) - .bucket("data", 16) - .build(); + Schema schema = + new Schema( + required(1, "id", Types.IntegerType.get(), "unique ID"), + required(2, "data", Types.StringType.get())); + PartitionSpec spec = PartitionSpec.builderFor(schema).bucket("data", 16).build(); TableIdentifier tableIdent = TableIdentifier.of(DB_NAME, "tbl"); String location = temp.newFolder("tbl").toString(); try { - Table table = catalog.buildTable(tableIdent, schema) - .withPartitionSpec(spec) - .withLocation(location) - .withProperty("key1", "value1") - .withProperty("key2", "value2") - .create(); + Table table = + catalog + .buildTable(tableIdent, schema) + .withPartitionSpec(spec) + .withLocation(location) + .withProperty("key1", "value1") + .withProperty("key2", "value2") + .create(); Assert.assertEquals(location, table.location()); Assert.assertEquals(2, table.schema().columns().size()); @@ -118,13 +117,11 @@ public void testCreateTableBuilder() throws Exception { @Test public void testCreateTableWithCaching() throws Exception { - Schema schema = new Schema( - required(1, "id", Types.IntegerType.get(), "unique ID"), 
- required(2, "data", Types.StringType.get()) - ); - PartitionSpec spec = PartitionSpec.builderFor(schema) - .bucket("data", 16) - .build(); + Schema schema = + new Schema( + required(1, "id", Types.IntegerType.get(), "unique ID"), + required(2, "data", Types.StringType.get())); + PartitionSpec spec = PartitionSpec.builderFor(schema).bucket("data", 16).build(); TableIdentifier tableIdent = TableIdentifier.of(DB_NAME, "tbl"); String location = temp.newFolder("tbl").toString(); ImmutableMap properties = ImmutableMap.of("key1", "value1", "key2", "value2"); @@ -145,18 +142,20 @@ public void testCreateTableWithCaching() throws Exception { @Test public void testInitialize() { - Assertions.assertDoesNotThrow(() -> { - HiveCatalog catalog = new HiveCatalog(); - catalog.initialize("hive", Maps.newHashMap()); - }); + Assertions.assertDoesNotThrow( + () -> { + HiveCatalog catalog = new HiveCatalog(); + catalog.initialize("hive", Maps.newHashMap()); + }); } @Test public void testToStringWithoutSetConf() { - Assertions.assertDoesNotThrow(() -> { - HiveCatalog catalog = new HiveCatalog(); - catalog.toString(); - }); + Assertions.assertDoesNotThrow( + () -> { + HiveCatalog catalog = new HiveCatalog(); + catalog.toString(); + }); } @Test @@ -168,22 +167,22 @@ public void testInitializeCatalogWithProperties() { catalog.initialize("hive", properties); Assert.assertEquals(catalog.getConf().get("hive.metastore.uris"), "thrift://examplehost:9083"); - Assert.assertEquals(catalog.getConf().get("hive.metastore.warehouse.dir"), "/user/hive/testwarehouse"); + Assert.assertEquals( + catalog.getConf().get("hive.metastore.warehouse.dir"), "/user/hive/testwarehouse"); } @Test public void testCreateTableTxnBuilder() throws Exception { - Schema schema = new Schema( - required(1, "id", Types.IntegerType.get(), "unique ID"), - required(2, "data", Types.StringType.get()) - ); + Schema schema = + new Schema( + required(1, "id", Types.IntegerType.get(), "unique ID"), + required(2, "data", Types.StringType.get())); TableIdentifier tableIdent = TableIdentifier.of(DB_NAME, "tbl"); String location = temp.newFolder("tbl").toString(); try { - Transaction txn = catalog.buildTable(tableIdent, schema) - .withLocation(location) - .createTransaction(); + Transaction txn = + catalog.buildTable(tableIdent, schema).withLocation(location).createTransaction(); txn.commitTransaction(); Table table = catalog.loadTable(tableIdent); @@ -197,22 +196,22 @@ public void testCreateTableTxnBuilder() throws Exception { @Test public void testReplaceTxnBuilder() throws Exception { - Schema schema = new Schema( - required(1, "id", Types.IntegerType.get(), "unique ID"), - required(2, "data", Types.StringType.get()) - ); - PartitionSpec spec = PartitionSpec.builderFor(schema) - .bucket("data", 16) - .build(); + Schema schema = + new Schema( + required(1, "id", Types.IntegerType.get(), "unique ID"), + required(2, "data", Types.StringType.get())); + PartitionSpec spec = PartitionSpec.builderFor(schema).bucket("data", 16).build(); TableIdentifier tableIdent = TableIdentifier.of(DB_NAME, "tbl"); String location = temp.newFolder("tbl").toString(); try { - Transaction createTxn = catalog.buildTable(tableIdent, schema) - .withPartitionSpec(spec) - .withLocation(location) - .withProperty("key1", "value1") - .createOrReplaceTransaction(); + Transaction createTxn = + catalog + .buildTable(tableIdent, schema) + .withPartitionSpec(spec) + .withLocation(location) + .withProperty("key1", "value1") + .createOrReplaceTransaction(); createTxn.commitTransaction(); Table 
table = catalog.loadTable(tableIdent); @@ -220,21 +219,23 @@ public void testReplaceTxnBuilder() throws Exception { String newLocation = temp.newFolder("tbl-2").toString(); - Transaction replaceTxn = catalog.buildTable(tableIdent, schema) - .withProperty("key2", "value2") - .withLocation(newLocation) - .replaceTransaction(); + Transaction replaceTxn = + catalog + .buildTable(tableIdent, schema) + .withProperty("key2", "value2") + .withLocation(newLocation) + .replaceTransaction(); replaceTxn.commitTransaction(); table = catalog.loadTable(tableIdent); Assert.assertEquals(newLocation, table.location()); Assert.assertNull(table.currentSnapshot()); - PartitionSpec v1Expected = PartitionSpec.builderFor(table.schema()) - .alwaysNull("data", "data_bucket") - .withSpecId(1) - .build(); - Assert.assertEquals("Table should have a spec with one void field", - v1Expected, table.spec()); + PartitionSpec v1Expected = + PartitionSpec.builderFor(table.schema()) + .alwaysNull("data", "data_bucket") + .withSpecId(1) + .build(); + Assert.assertEquals("Table should have a spec with one void field", v1Expected, table.spec()); Assert.assertEquals("value1", table.properties().get("key1")); Assert.assertEquals("value2", table.properties().get("key2")); @@ -245,13 +246,11 @@ public void testReplaceTxnBuilder() throws Exception { @Test public void testCreateTableDefaultSortOrder() throws Exception { - Schema schema = new Schema( - required(1, "id", Types.IntegerType.get(), "unique ID"), - required(2, "data", Types.StringType.get()) - ); - PartitionSpec spec = PartitionSpec.builderFor(schema) - .bucket("data", 16) - .build(); + Schema schema = + new Schema( + required(1, "id", Types.IntegerType.get(), "unique ID"), + required(2, "data", Types.StringType.get())); + PartitionSpec spec = PartitionSpec.builderFor(schema).bucket("data", 16).build(); TableIdentifier tableIdent = TableIdentifier.of(DB_NAME, "tbl"); try { @@ -259,7 +258,8 @@ public void testCreateTableDefaultSortOrder() throws Exception { Assert.assertEquals("Order ID must match", 0, table.sortOrder().orderId()); Assert.assertTrue("Order must unsorted", table.sortOrder().isUnsorted()); - Assert.assertFalse("Must not have default sort order in catalog", + Assert.assertFalse( + "Must not have default sort order in catalog", hmsTableParameters().containsKey(DEFAULT_SORT_ORDER)); } finally { catalog.dropTable(tableIdent); @@ -268,32 +268,32 @@ public void testCreateTableDefaultSortOrder() throws Exception { @Test public void testCreateTableCustomSortOrder() throws Exception { - Schema schema = new Schema( - required(1, "id", Types.IntegerType.get(), "unique ID"), - required(2, "data", Types.StringType.get()) - ); - PartitionSpec spec = PartitionSpec.builderFor(schema) - .bucket("data", 16) - .build(); - SortOrder order = SortOrder.builderFor(schema) - .asc("id", NULLS_FIRST) - .build(); + Schema schema = + new Schema( + required(1, "id", Types.IntegerType.get(), "unique ID"), + required(2, "data", Types.StringType.get())); + PartitionSpec spec = PartitionSpec.builderFor(schema).bucket("data", 16).build(); + SortOrder order = SortOrder.builderFor(schema).asc("id", NULLS_FIRST).build(); TableIdentifier tableIdent = TableIdentifier.of(DB_NAME, "tbl"); try { - Table table = catalog.buildTable(tableIdent, schema) - .withPartitionSpec(spec) - .withSortOrder(order) - .create(); + Table table = + catalog + .buildTable(tableIdent, schema) + .withPartitionSpec(spec) + .withSortOrder(order) + .create(); SortOrder sortOrder = table.sortOrder(); Assert.assertEquals("Order 
ID must match", 1, sortOrder.orderId()); Assert.assertEquals("Order must have 1 field", 1, sortOrder.fields().size()); Assert.assertEquals("Direction must match ", ASC, sortOrder.fields().get(0).direction()); - Assert.assertEquals("Null order must match ", NULLS_FIRST, sortOrder.fields().get(0).nullOrder()); + Assert.assertEquals( + "Null order must match ", NULLS_FIRST, sortOrder.fields().get(0).nullOrder()); Transform transform = Transforms.identity(Types.IntegerType.get()); Assert.assertEquals("Transform must match", transform, sortOrder.fields().get(0).transform()); - Assert.assertEquals(SortOrderParser.toJson(table.sortOrder()), hmsTableParameters().get(DEFAULT_SORT_ORDER)); + Assert.assertEquals( + SortOrderParser.toJson(table.sortOrder()), hmsTableParameters().get(DEFAULT_SORT_ORDER)); } finally { catalog.dropTable(tableIdent); } @@ -308,26 +308,29 @@ public void testCreateNamespace() throws Exception { Assert.assertTrue(database1.getParameters().get("owner").equals("apache")); Assert.assertTrue(database1.getParameters().get("group").equals("iceberg")); - Assert.assertEquals("There no same location for db and namespace", - database1.getLocationUri(), defaultUri(namespace1)); + Assert.assertEquals( + "There no same location for db and namespace", + database1.getLocationUri(), + defaultUri(namespace1)); - AssertHelpers.assertThrows("Should fail to create when namespace already exist " + namespace1, - AlreadyExistsException.class, "Namespace '" + namespace1 + "' already exists!", () -> { + AssertHelpers.assertThrows( + "Should fail to create when namespace already exist " + namespace1, + AlreadyExistsException.class, + "Namespace '" + namespace1 + "' already exists!", + () -> { catalog.createNamespace(namespace1); }); String hiveLocalDir = temp.newFolder().toURI().toString(); // remove the trailing slash of the URI hiveLocalDir = hiveLocalDir.substring(0, hiveLocalDir.length() - 1); - ImmutableMap newMeta = ImmutableMap.builder() - .putAll(meta) - .put("location", hiveLocalDir) - .build(); + ImmutableMap newMeta = + ImmutableMap.builder().putAll(meta).put("location", hiveLocalDir).build(); Namespace namespace2 = Namespace.of("haveLocation"); catalog.createNamespace(namespace2, newMeta); Database database2 = metastoreClient.getDatabase(namespace2.toString()); - Assert.assertEquals("There no same location for db and namespace", - database2.getLocationUri(), hiveLocalDir); + Assert.assertEquals( + "There no same location for db and namespace", database2.getLocationUri(), hiveLocalDir); } @Test @@ -354,8 +357,10 @@ public void testLoadNamespaceMeta() throws TException { Map nameMata = catalog.loadNamespaceMetadata(namespace); Assert.assertTrue(nameMata.get("owner").equals("apache")); Assert.assertTrue(nameMata.get("group").equals("iceberg")); - Assert.assertEquals("There no same location for db and namespace", - nameMata.get("location"), catalog.convertToDatabase(namespace, meta).getLocationUri()); + Assert.assertEquals( + "There no same location for db and namespace", + nameMata.get("location"), + catalog.convertToDatabase(namespace, meta).getLocationUri()); } @Test @@ -364,9 +369,9 @@ public void testNamespaceExists() throws TException { catalog.createNamespace(namespace, meta); - Assert.assertTrue("Should true to namespace exist", - catalog.namespaceExists(namespace)); - Assert.assertTrue("Should false to namespace doesn't exist", + Assert.assertTrue("Should true to namespace exist", catalog.namespaceExists(namespace)); + Assert.assertTrue( + "Should false to namespace doesn't exist", 
!catalog.namespaceExists(Namespace.of("db2", "db2", "ns2"))); } @@ -375,20 +380,23 @@ public void testSetNamespaceProperties() throws TException { Namespace namespace = Namespace.of("dbname_set"); catalog.createNamespace(namespace, meta); - catalog.setProperties(namespace, + catalog.setProperties( + namespace, ImmutableMap.of( "owner", "alter_apache", "test", "test", "location", "file:/data/tmp", - "comment", "iceberg test") - ); + "comment", "iceberg test")); Database database = metastoreClient.getDatabase(namespace.level(0)); Assert.assertEquals(database.getParameters().get("owner"), "alter_apache"); Assert.assertEquals(database.getParameters().get("test"), "test"); Assert.assertEquals(database.getParameters().get("group"), "iceberg"); - AssertHelpers.assertThrows("Should fail to namespace not exist" + namespace, - NoSuchNamespaceException.class, "Namespace does not exist: ", () -> { + AssertHelpers.assertThrows( + "Should fail when namespace does not exist: " + namespace, + NoSuchNamespaceException.class, + "Namespace does not exist: ", + () -> { catalog.setProperties(Namespace.of("db2", "db2", "ns2"), meta); }); } @@ -405,9 +413,13 @@ public void testRemoveNamespaceProperties() throws TException { Assert.assertEquals(database.getParameters().get("owner"), null); Assert.assertEquals(database.getParameters().get("group"), "iceberg"); - AssertHelpers.assertThrows("Should fail to namespace not exist" + namespace, - NoSuchNamespaceException.class, "Namespace does not exist: ", () -> { - catalog.removeProperties(Namespace.of("db2", "db2", "ns2"), ImmutableSet.of("comment", "owner")); + AssertHelpers.assertThrows( + "Should fail when namespace does not exist: " + namespace, + NoSuchNamespaceException.class, + "Namespace does not exist: ", + () -> { + catalog.removeProperties( + Namespace.of("db2", "db2", "ns2"), ImmutableSet.of("comment", "owner")); }); } @@ -415,8 +427,8 @@ public void testDropNamespace() throws TException { Namespace namespace = Namespace.of("dbname_drop"); TableIdentifier identifier = TableIdentifier.of(namespace, "table"); - Schema schema = new Schema(Types.StructType.of( - required(1, "id", Types.LongType.get())).fields()); + Schema schema = + new Schema(Types.StructType.of(required(1, "id", Types.LongType.get())).fields()); catalog.createNamespace(namespace, meta); catalog.createTable(identifier, schema); @@ -424,37 +436,39 @@ public void testDropNamespace() throws TException { Assert.assertTrue(nameMata.get("owner").equals("apache")); Assert.assertTrue(nameMata.get("group").equals("iceberg")); - AssertHelpers.assertThrows("Should fail to drop namespace is not empty" + namespace, + AssertHelpers.assertThrows( + "Should fail to drop a namespace that is not empty: " + namespace, NamespaceNotEmptyException.class, - "Namespace dbname_drop is not empty. One or more tables exist.", () -> { + "Namespace dbname_drop is not empty.
One or more tables exist.", + () -> { catalog.dropNamespace(namespace); }); Assert.assertTrue(catalog.dropTable(identifier, true)); - Assert.assertTrue("Should fail to drop namespace if it is not empty", - catalog.dropNamespace(namespace)); - Assert.assertFalse("Should fail to drop when namespace doesn't exist", + Assert.assertTrue( + "Should fail to drop namespace if it is not empty", catalog.dropNamespace(namespace)); + Assert.assertFalse( + "Should fail to drop when namespace doesn't exist", catalog.dropNamespace(Namespace.of("db.ns1"))); - AssertHelpers.assertThrows("Should fail to drop namespace exist" + namespace, - NoSuchNamespaceException.class, "Namespace does not exist: ", () -> { + AssertHelpers.assertThrows( + "Should fail to drop namespace exist" + namespace, + NoSuchNamespaceException.class, + "Namespace does not exist: ", + () -> { catalog.loadNamespaceMetadata(namespace); }); } @Test public void testTableName() { - Schema schema = new Schema( - required(1, "id", Types.IntegerType.get(), "unique ID"), - required(2, "data", Types.StringType.get()) - ); - PartitionSpec spec = PartitionSpec.builderFor(schema) - .bucket("data", 16) - .build(); + Schema schema = + new Schema( + required(1, "id", Types.IntegerType.get(), "unique ID"), + required(2, "data", Types.StringType.get())); + PartitionSpec spec = PartitionSpec.builderFor(schema).bucket("data", 16).build(); TableIdentifier tableIdent = TableIdentifier.of(DB_NAME, "tbl"); try { - catalog.buildTable(tableIdent, schema) - .withPartitionSpec(spec) - .create(); + catalog.buildTable(tableIdent, schema).withPartitionSpec(spec).create(); Table table = catalog.loadTable(tableIdent); Assert.assertEquals("Name must match", "hive.hivedb.tbl", table.name()); @@ -468,23 +482,23 @@ public void testTableName() { } private String defaultUri(Namespace namespace) throws TException { - return metastoreClient.getConfigValue( - "hive.metastore.warehouse.dir", "") + "/" + namespace.level(0) + ".db"; + return metastoreClient.getConfigValue("hive.metastore.warehouse.dir", "") + + "/" + + namespace.level(0) + + ".db"; } @Test public void testUUIDinTableProperties() throws Exception { - Schema schema = new Schema( - required(1, "id", Types.IntegerType.get(), "unique ID"), - required(2, "data", Types.StringType.get()) - ); + Schema schema = + new Schema( + required(1, "id", Types.IntegerType.get(), "unique ID"), + required(2, "data", Types.StringType.get())); TableIdentifier tableIdentifier = TableIdentifier.of(DB_NAME, "tbl"); String location = temp.newFolder("tbl").toString(); try { - catalog.buildTable(tableIdentifier, schema) - .withLocation(location) - .create(); + catalog.buildTable(tableIdentifier, schema).withLocation(location).create(); Assert.assertNotNull(hmsTableParameters().get(TableProperties.UUID)); } finally { @@ -494,17 +508,15 @@ public void testUUIDinTableProperties() throws Exception { @Test public void testSnapshotStatsTableProperties() throws Exception { - Schema schema = new Schema( - required(1, "id", Types.IntegerType.get(), "unique ID"), - required(2, "data", Types.StringType.get()) - ); + Schema schema = + new Schema( + required(1, "id", Types.IntegerType.get(), "unique ID"), + required(2, "data", Types.StringType.get())); TableIdentifier tableIdentifier = TableIdentifier.of(DB_NAME, "tbl"); String location = temp.newFolder("tbl").toString(); try { - catalog.buildTable(tableIdentifier, schema) - .withLocation(location) - .create(); + catalog.buildTable(tableIdentifier, schema).withLocation(location).create(); // check 
whether parameters are in expected state Map parameters = hmsTableParameters(); @@ -516,21 +528,24 @@ public void testSnapshotStatsTableProperties() throws Exception { // create a snapshot Table icebergTable = catalog.loadTable(tableIdentifier); String fileName = UUID.randomUUID().toString(); - DataFile file = DataFiles.builder(icebergTable.spec()) - .withPath(FileFormat.PARQUET.addExtension(fileName)) - .withRecordCount(2) - .withFileSizeInBytes(0) - .build(); + DataFile file = + DataFiles.builder(icebergTable.spec()) + .withPath(FileFormat.PARQUET.addExtension(fileName)) + .withRecordCount(2) + .withFileSizeInBytes(0) + .build(); icebergTable.newFastAppend().appendFile(file).commit(); // check whether parameters are in expected state parameters = hmsTableParameters(); Assert.assertEquals("1", parameters.get(TableProperties.SNAPSHOT_COUNT)); - String summary = JsonUtil.mapper().writeValueAsString(icebergTable.currentSnapshot().summary()); + String summary = + JsonUtil.mapper().writeValueAsString(icebergTable.currentSnapshot().summary()); Assert.assertEquals(summary, parameters.get(CURRENT_SNAPSHOT_SUMMARY)); long snapshotId = icebergTable.currentSnapshot().snapshotId(); Assert.assertEquals(String.valueOf(snapshotId), parameters.get(CURRENT_SNAPSHOT_ID)); - Assert.assertEquals(String.valueOf(icebergTable.currentSnapshot().timestampMillis()), + Assert.assertEquals( + String.valueOf(icebergTable.currentSnapshot().timestampMillis()), parameters.get(CURRENT_SNAPSHOT_TIMESTAMP)); } finally { @@ -542,7 +557,8 @@ public void testSnapshotStatsTableProperties() throws Exception { public void testSetSnapshotSummary() throws Exception { Configuration conf = new Configuration(); conf.set("iceberg.hive.table-property-max-size", "4000"); - HiveTableOperations ops = new HiveTableOperations(conf, null, null, catalog.name(), DB_NAME, "tbl"); + HiveTableOperations ops = + new HiveTableOperations(conf, null, null, catalog.name(), DB_NAME, "tbl"); Snapshot snapshot = mock(Snapshot.class); Map summary = Maps.newHashMap(); when(snapshot.summary()).thenReturn(summary); @@ -565,14 +581,18 @@ public void testSetSnapshotSummary() throws Exception { Assert.assertTrue(summarySize > 4000 && summarySize < 32672); parameters.remove(CURRENT_SNAPSHOT_SUMMARY); ops.setSnapshotSummary(parameters, snapshot); - Assert.assertEquals("The snapshot summary must not be in parameters due to the size limit", 0, parameters.size()); + Assert.assertEquals( + "The snapshot summary must not be in parameters due to the size limit", + 0, + parameters.size()); } @Test public void testNotExposeTableProperties() { Configuration conf = new Configuration(); conf.set("iceberg.hive.table-property-max-size", "0"); - HiveTableOperations ops = new HiveTableOperations(conf, null, null, catalog.name(), DB_NAME, "tbl"); + HiveTableOperations ops = + new HiveTableOperations(conf, null, null, catalog.name(), DB_NAME, "tbl"); TableMetadata metadata = mock(TableMetadata.class); Map parameters = Maps.newHashMap(); parameters.put(CURRENT_SNAPSHOT_SUMMARY, "summary"); @@ -599,19 +619,21 @@ public void testNotExposeTableProperties() { @Test public void testSetDefaultPartitionSpec() throws Exception { - Schema schema = new Schema( - required(1, "id", Types.IntegerType.get(), "unique ID"), - required(2, "data", Types.StringType.get()) - ); + Schema schema = + new Schema( + required(1, "id", Types.IntegerType.get(), "unique ID"), + required(2, "data", Types.StringType.get())); TableIdentifier tableIdent = TableIdentifier.of(DB_NAME, "tbl"); try { Table table = 
catalog.buildTable(tableIdent, schema).create(); - Assert.assertFalse("Must not have default partition spec", + Assert.assertFalse( + "Must not have default partition spec", hmsTableParameters().containsKey(TableProperties.DEFAULT_PARTITION_SPEC)); table.updateSpec().addField(bucket("data", 16)).commit(); - Assert.assertEquals(PartitionSpecParser.toJson(table.spec()), + Assert.assertEquals( + PartitionSpecParser.toJson(table.spec()), hmsTableParameters().get(TableProperties.DEFAULT_PARTITION_SPEC)); } finally { catalog.dropTable(tableIdent); @@ -620,16 +642,17 @@ public void testSetDefaultPartitionSpec() throws Exception { @Test public void testSetCurrentSchema() throws Exception { - Schema schema = new Schema( - required(1, "id", Types.IntegerType.get(), "unique ID"), - required(2, "data", Types.StringType.get()) - ); + Schema schema = + new Schema( + required(1, "id", Types.IntegerType.get(), "unique ID"), + required(2, "data", Types.StringType.get())); TableIdentifier tableIdent = TableIdentifier.of(DB_NAME, "tbl"); try { Table table = catalog.buildTable(tableIdent, schema).create(); - Assert.assertEquals(SchemaParser.toJson(table.schema()), hmsTableParameters().get(CURRENT_SCHEMA)); + Assert.assertEquals( + SchemaParser.toJson(table.schema()), hmsTableParameters().get(CURRENT_SCHEMA)); // add many new fields to make the schema json string exceed the limit UpdateSchema updateSchema = table.updateSchema(); @@ -655,34 +678,41 @@ public void testConstructorWarehousePathWithEndSlash() { HiveCatalog catalogWithSlash = new HiveCatalog(); String wareHousePath = "s3://bucket/db/tbl"; - catalogWithSlash.initialize("hive_catalog", ImmutableMap.of(CatalogProperties.WAREHOUSE_LOCATION, - wareHousePath + "/")); - Assert.assertEquals("Should have trailing slash stripped", wareHousePath, catalogWithSlash.getConf().get( - HiveConf.ConfVars.METASTOREWAREHOUSE.varname)); + catalogWithSlash.initialize( + "hive_catalog", ImmutableMap.of(CatalogProperties.WAREHOUSE_LOCATION, wareHousePath + "/")); + Assert.assertEquals( + "Should have trailing slash stripped", + wareHousePath, + catalogWithSlash.getConf().get(HiveConf.ConfVars.METASTOREWAREHOUSE.varname)); } @Test public void testTablePropsDefinedAtCatalogLevel() { - Schema schema = new Schema( - required(1, "id", Types.IntegerType.get(), "unique ID") - ); + Schema schema = new Schema(required(1, "id", Types.IntegerType.get(), "unique ID")); TableIdentifier tableIdent = TableIdentifier.of(DB_NAME, "tbl"); - ImmutableMap catalogProps = ImmutableMap.of( - "table-default.key1", "catalog-default-key1", - "table-default.key2", "catalog-default-key2", - "table-default.key3", "catalog-default-key3", - "table-override.key3", "catalog-override-key3", - "table-override.key4", "catalog-override-key4"); - Catalog hiveCatalog = CatalogUtil.loadCatalog(HiveCatalog.class.getName(), - CatalogUtil.ICEBERG_CATALOG_TYPE_HIVE, catalogProps, hiveConf); + ImmutableMap catalogProps = + ImmutableMap.of( + "table-default.key1", "catalog-default-key1", + "table-default.key2", "catalog-default-key2", + "table-default.key3", "catalog-default-key3", + "table-override.key3", "catalog-override-key3", + "table-override.key4", "catalog-override-key4"); + Catalog hiveCatalog = + CatalogUtil.loadCatalog( + HiveCatalog.class.getName(), + CatalogUtil.ICEBERG_CATALOG_TYPE_HIVE, + catalogProps, + hiveConf); try { - Table table = hiveCatalog.buildTable(tableIdent, schema) - .withProperty("key2", "table-key2") - .withProperty("key3", "table-key3") - .withProperty("key5", "table-key5") - .create(); 
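Aside on the test being reformatted in this hunk: the assertions further down check a fixed precedence for table properties, where catalog-level "table-override." entries beat properties passed to the table builder, which in turn beat catalog-level "table-default." entries. A standalone, illustration-only sketch of that merge order (the helper name and shape are invented here; this is not how HiveCatalog implements it internally):

import java.util.HashMap;
import java.util.Map;

class TablePropertyPrecedenceSketch {
  // Effective table properties implied by the assertions in this test:
  // "table-default.*" < properties set on the table builder < "table-override.*".
  static Map<String, String> effectiveProps(
      Map<String, String> catalogProps, Map<String, String> createTimeProps) {
    Map<String, String> result = new HashMap<>();
    catalogProps.forEach(
        (key, value) -> {
          if (key.startsWith("table-default.")) {
            result.put(key.substring("table-default.".length()), value); // lowest precedence
          }
        });
    result.putAll(createTimeProps); // explicit table properties replace catalog defaults
    catalogProps.forEach(
        (key, value) -> {
          if (key.startsWith("table-override.")) {
            result.put(key.substring("table-override.".length()), value); // overrides always win
          }
        });
    return result;
  }
}

With the catalogProps defined above, key3 therefore resolves to "catalog-override-key3" even though the builder sets "table-key3", which is exactly what the later assertion expects.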
+ Table table = + hiveCatalog + .buildTable(tableIdent, schema) + .withProperty("key2", "table-key2") + .withProperty("key3", "table-key3") + .withProperty("key5", "table-key5") + .create(); Assert.assertEquals( "Table defaults set for the catalog must be added to the table properties.", @@ -693,8 +723,8 @@ public void testTablePropsDefinedAtCatalogLevel() { "table-key2", table.properties().get("key2")); Assert.assertEquals( - "Table property override set at catalog level must override table default" + - " properties set at catalog level and table property specified.", + "Table property override set at catalog level must override table default" + + " properties set at catalog level and table property specified.", "catalog-override-key3", table.properties().get("key3")); Assert.assertEquals( @@ -702,8 +732,8 @@ public void testTablePropsDefinedAtCatalogLevel() { "catalog-override-key4", table.properties().get("key4")); Assert.assertEquals( - "Table properties without any catalog level default or override should be added to table" + - " properties.", + "Table properties without any catalog level default or override should be added to table" + + " properties.", "table-key5", table.properties().get("key5")); } finally { diff --git a/hive-metastore/src/test/java/org/apache/iceberg/hive/TestHiveClientPool.java b/hive-metastore/src/test/java/org/apache/iceberg/hive/TestHiveClientPool.java index 36996e33e3c6..9434f1cab303 100644 --- a/hive-metastore/src/test/java/org/apache/iceberg/hive/TestHiveClientPool.java +++ b/hive-metastore/src/test/java/org/apache/iceberg/hive/TestHiveClientPool.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.hive; import java.io.ByteArrayInputStream; @@ -43,14 +42,15 @@ public class TestHiveClientPool { - private static final String HIVE_SITE_CONTENT = "\n" + - "\n" + - "\n" + - " \n" + - " hive.metastore.sasl.enabled\n" + - " true\n" + - " \n" + - "\n"; + private static final String HIVE_SITE_CONTENT = + "\n" + + "\n" + + "\n" + + " \n" + + " hive.metastore.sasl.enabled\n" + + " true\n" + + " \n" + + "\n"; HiveClientPool clients; @@ -74,19 +74,22 @@ public void testConf() { HiveClientPool clientPool = new HiveClientPool(10, conf); HiveConf clientConf = clientPool.hiveConf(); - Assert.assertEquals(conf.get(HiveConf.ConfVars.METASTOREWAREHOUSE.varname), - clientConf.get(HiveConf.ConfVars.METASTOREWAREHOUSE.varname)); + Assert.assertEquals( + conf.get(HiveConf.ConfVars.METASTOREWAREHOUSE.varname), + clientConf.get(HiveConf.ConfVars.METASTOREWAREHOUSE.varname)); Assert.assertEquals(10, clientPool.poolSize()); // 'hive.metastore.sasl.enabled' should be 'true' as defined in xml - Assert.assertEquals(conf.get(HiveConf.ConfVars.METASTORE_USE_THRIFT_SASL.varname), - clientConf.get(HiveConf.ConfVars.METASTORE_USE_THRIFT_SASL.varname)); + Assert.assertEquals( + conf.get(HiveConf.ConfVars.METASTORE_USE_THRIFT_SASL.varname), + clientConf.get(HiveConf.ConfVars.METASTORE_USE_THRIFT_SASL.varname)); Assert.assertTrue(clientConf.getBoolVar(HiveConf.ConfVars.METASTORE_USE_THRIFT_SASL)); } private HiveConf createHiveConf() { HiveConf hiveConf = new HiveConf(); - try (InputStream inputStream = new ByteArrayInputStream(HIVE_SITE_CONTENT.getBytes(StandardCharsets.UTF_8))) { + try (InputStream inputStream = + new ByteArrayInputStream(HIVE_SITE_CONTENT.getBytes(StandardCharsets.UTF_8))) { hiveConf.addResource(inputStream, "for_test"); } catch (IOException e) { throw new RuntimeException(e); @@ -97,8 +100,11 @@ 
private HiveConf createHiveConf() { @Test public void testNewClientFailure() { Mockito.doThrow(new RuntimeException("Connection exception")).when(clients).newClient(); - AssertHelpers.assertThrows("Should throw exception", RuntimeException.class, - "Connection exception", () -> clients.run(Object::toString)); + AssertHelpers.assertThrows( + "Should throw exception", + RuntimeException.class, + "Connection exception", + () -> clients.run(Object::toString)); } @Test @@ -106,9 +112,13 @@ public void testGetTablesFailsForNonReconnectableException() throws Exception { HiveMetaStoreClient hmsClient = Mockito.mock(HiveMetaStoreClient.class); Mockito.doReturn(hmsClient).when(clients).newClient(); Mockito.doThrow(new MetaException("Another meta exception")) - .when(hmsClient).getTables(Mockito.anyString(), Mockito.anyString()); - AssertHelpers.assertThrows("Should throw exception", MetaException.class, - "Another meta exception", () -> clients.run(client -> client.getTables("default", "t"))); + .when(hmsClient) + .getTables(Mockito.anyString(), Mockito.anyString()); + AssertHelpers.assertThrows( + "Should throw exception", + MetaException.class, + "Another meta exception", + () -> clients.run(client -> client.getTables("default", "t"))); } @Test @@ -143,7 +153,15 @@ public void testConnectionFailureRestoreForTTransportException() throws Exceptio GetAllFunctionsResponse response = new GetAllFunctionsResponse(); response.addToFunctions( - new Function("concat", "db1", "classname", "root", PrincipalType.USER, 100, FunctionType.JAVA, null)); + new Function( + "concat", + "db1", + "classname", + "root", + PrincipalType.USER, + 100, + FunctionType.JAVA, + null)); Mockito.doReturn(response).when(newClient).getAllFunctions(); Assert.assertEquals(response, clients.run(client -> client.getAllFunctions(), true)); diff --git a/hive-metastore/src/test/java/org/apache/iceberg/hive/TestHiveCommitLocks.java b/hive-metastore/src/test/java/org/apache/iceberg/hive/TestHiveCommitLocks.java index 756f3ccaf2d3..37fa9b519cd7 100644 --- a/hive-metastore/src/test/java/org/apache/iceberg/hive/TestHiveCommitLocks.java +++ b/hive-metastore/src/test/java/org/apache/iceberg/hive/TestHiveCommitLocks.java @@ -16,9 +16,19 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.hive; +import static org.mockito.Mockito.any; +import static org.mockito.Mockito.doNothing; +import static org.mockito.Mockito.doReturn; +import static org.mockito.Mockito.eq; +import static org.mockito.Mockito.never; +import static org.mockito.Mockito.reset; +import static org.mockito.Mockito.spy; +import static org.mockito.Mockito.times; +import static org.mockito.Mockito.verify; +import static org.mockito.Mockito.when; + import java.util.Collections; import java.util.concurrent.ExecutorService; import java.util.concurrent.Executors; @@ -44,17 +54,6 @@ import org.junit.BeforeClass; import org.junit.Test; -import static org.mockito.Mockito.any; -import static org.mockito.Mockito.doNothing; -import static org.mockito.Mockito.doReturn; -import static org.mockito.Mockito.eq; -import static org.mockito.Mockito.never; -import static org.mockito.Mockito.reset; -import static org.mockito.Mockito.spy; -import static org.mockito.Mockito.times; -import static org.mockito.Mockito.verify; -import static org.mockito.Mockito.when; - public class TestHiveCommitLocks extends HiveTableBaseTest { private static HiveTableOperations spyOps = null; private static HiveClientPool spyClientPool = null; @@ -80,12 +79,14 @@ public static void initializeSpies() throws Exception { // Set up the spy clients as static variables instead of before every test. // The spy clients are reused between methods and closed at the end of all tests in this class. spyClientPool = spy(new HiveClientPool(1, overriddenHiveConf)); - when(spyClientPool.newClient()).thenAnswer(invocation -> { - // cannot spy on RetryingHiveMetastoreClient as it is a proxy - IMetaStoreClient client = spy(new HiveMetaStoreClient(hiveConf)); - spyClientRef.set(client); - return spyClientRef.get(); - }); + when(spyClientPool.newClient()) + .thenAnswer( + invocation -> { + // cannot spy on RetryingHiveMetastoreClient as it is a proxy + IMetaStoreClient client = spy(new HiveMetaStoreClient(hiveConf)); + spyClientRef.set(client); + return spyClientRef.get(); + }); spyClientPool.run(IMetaStoreClient::isLocalMetaStore); // To ensure new client is created. 
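The initializeSpies() hunk above stubs HiveClientPool.newClient() so every client handed out is itself a Mockito spy, captured in an AtomicReference for later verification (the RetryingHiveMetastoreClient proxy cannot be spied directly). A minimal, self-contained sketch of that spy-capture pattern, with FakeClient and ClientFactory invented purely for illustration:

import static org.mockito.Mockito.mock;
import static org.mockito.Mockito.spy;
import static org.mockito.Mockito.verify;
import static org.mockito.Mockito.when;

import java.util.concurrent.atomic.AtomicReference;

class SpyCapturePatternSketch {

  // Stand-in for the pooled metastore client; not part of the real test.
  public static class FakeClient {
    public boolean ping() {
      return true;
    }
  }

  interface ClientFactory {
    FakeClient newClient();
  }

  static void demo() {
    ClientFactory factory = mock(ClientFactory.class);
    AtomicReference<FakeClient> lastClient = new AtomicReference<>();

    // Each factory call returns a spy and records it, so interactions on the client
    // created inside the code under test can still be verified afterwards.
    when(factory.newClient())
        .thenAnswer(
            invocation -> {
              FakeClient client = spy(new FakeClient());
              lastClient.set(client);
              return client;
            });

    factory.newClient().ping();
    verify(lastClient.get()).ping();
  }
}

Note that the test wraps the pool itself in spy(...) rather than mock(...), so unstubbed methods such as run(...) keep their real behavior.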
@@ -106,9 +107,7 @@ public void before() throws Exception { metadataV1 = ops.current(); - table.updateSchema() - .addColumn("n", Types.IntegerType.get()) - .commit(); + table.updateSchema().addColumn("n", Types.IntegerType.get()).commit(); ops.refresh(); @@ -116,8 +115,15 @@ public void before() throws Exception { Assert.assertEquals(2, ops.current().schema().columns().size()); - spyOps = spy(new HiveTableOperations(overriddenHiveConf, spyCachedClientPool, ops.io(), catalog.name(), - dbName, tableName)); + spyOps = + spy( + new HiveTableOperations( + overriddenHiveConf, + spyCachedClientPool, + ops.io(), + catalog.name(), + dbName, + tableName)); } @AfterClass @@ -161,7 +167,8 @@ public void testLockAcquisitionAfterRetries() throws TException, InterruptedExce public void testLockFailureAtFirstTime() throws TException { doReturn(notAcquiredLockResponse).when(spyClient).lock(any()); - AssertHelpers.assertThrows("Expected an exception", + AssertHelpers.assertThrows( + "Expected an exception", CommitFailedException.class, "Could not acquire the lock on", () -> spyOps.doCommit(metadataV2, metadataV1)); @@ -178,7 +185,8 @@ public void testLockFailureAfterRetries() throws TException { .when(spyClient) .checkLock(eq(dummyLockId)); - AssertHelpers.assertThrows("Expected an exception", + AssertHelpers.assertThrows( + "Expected an exception", CommitFailedException.class, "Could not acquire the lock on", () -> spyOps.doCommit(metadataV2, metadataV1)); @@ -189,7 +197,8 @@ public void testLockTimeoutAfterRetries() throws TException { doReturn(waitLockResponse).when(spyClient).lock(any()); doReturn(waitLockResponse).when(spyClient).checkLock(eq(dummyLockId)); - AssertHelpers.assertThrows("Expected an exception", + AssertHelpers.assertThrows( + "Expected an exception", CommitFailedException.class, "Timed out after", () -> spyOps.doCommit(metadataV2, metadataV1)); @@ -198,10 +207,13 @@ public void testLockTimeoutAfterRetries() throws TException { @Test public void testPassThroughThriftExceptions() throws TException { doReturn(waitLockResponse).when(spyClient).lock(any()); - doReturn(waitLockResponse).doThrow(new TException("Test Thrift Exception")) - .when(spyClient).checkLock(eq(dummyLockId)); + doReturn(waitLockResponse) + .doThrow(new TException("Test Thrift Exception")) + .when(spyClient) + .checkLock(eq(dummyLockId)); - AssertHelpers.assertThrows("Expected an exception", + AssertHelpers.assertThrows( + "Expected an exception", RuntimeException.class, "Metastore operation failed for", () -> spyOps.doCommit(metadataV2, metadataV1)); @@ -210,13 +222,18 @@ public void testPassThroughThriftExceptions() throws TException { @Test public void testPassThroughInterruptions() throws TException { doReturn(waitLockResponse).when(spyClient).lock(any()); - doReturn(waitLockResponse).doAnswer(invocation -> { - Thread.currentThread().interrupt(); - Thread.sleep(10); - return waitLockResponse; - }).when(spyClient).checkLock(eq(dummyLockId)); + doReturn(waitLockResponse) + .doAnswer( + invocation -> { + Thread.currentThread().interrupt(); + Thread.sleep(10); + return waitLockResponse; + }) + .when(spyClient) + .checkLock(eq(dummyLockId)); - AssertHelpers.assertThrows("Expected an exception", + AssertHelpers.assertThrows( + "Expected an exception", CommitFailedException.class, "Could not acquire the lock on", () -> spyOps.doCommit(metadataV2, metadataV1)); @@ -230,15 +247,19 @@ public void testTableLevelProcessLockBlocksConcurrentHMSRequestsForSameTable() t // simulate several concurrent commit operations on the same 
table ExecutorService executor = Executors.newFixedThreadPool(numConcurrentCommits); - IntStream.range(0, numConcurrentCommits).forEach(i -> - executor.submit(() -> { - try { - spyOps.doCommit(metadataV2, metadataV1); - } catch (CommitFailedException e) { - // failures are expected here when checking the base version - // it's no problem, we're not testing the actual commit success here, only the HMS lock acquisition attempts - } - })); + IntStream.range(0, numConcurrentCommits) + .forEach( + i -> + executor.submit( + () -> { + try { + spyOps.doCommit(metadataV2, metadataV1); + } catch (CommitFailedException e) { + // failures are expected here when checking the base version + // it's no problem, we're not testing the actual commit success here, only + // the HMS lock acquisition attempts + } + })); executor.shutdown(); executor.awaitTermination(30, TimeUnit.SECONDS); diff --git a/hive-metastore/src/test/java/org/apache/iceberg/hive/TestHiveCommits.java b/hive-metastore/src/test/java/org/apache/iceberg/hive/TestHiveCommits.java index 07c7fa8b4028..dedec50f0a9c 100644 --- a/hive-metastore/src/test/java/org/apache/iceberg/hive/TestHiveCommits.java +++ b/hive-metastore/src/test/java/org/apache/iceberg/hive/TestHiveCommits.java @@ -16,9 +16,15 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.hive; +import static org.mockito.Mockito.any; +import static org.mockito.Mockito.anyBoolean; +import static org.mockito.Mockito.doAnswer; +import static org.mockito.Mockito.doThrow; +import static org.mockito.Mockito.spy; +import static org.mockito.Mockito.when; + import java.io.File; import java.net.UnknownHostException; import java.util.concurrent.atomic.AtomicLong; @@ -37,13 +43,6 @@ import org.junit.Test; import org.mockito.ArgumentCaptor; -import static org.mockito.Mockito.any; -import static org.mockito.Mockito.anyBoolean; -import static org.mockito.Mockito.doAnswer; -import static org.mockito.Mockito.doThrow; -import static org.mockito.Mockito.spy; -import static org.mockito.Mockito.when; - public class TestHiveCommits extends HiveTableBaseTest { @Test @@ -53,9 +52,7 @@ public void testSuppressUnlockExceptions() throws TException, InterruptedExcepti TableMetadata metadataV1 = ops.current(); - table.updateSchema() - .addColumn("n", Types.IntegerType.get()) - .commit(); + table.updateSchema().addColumn("n", Types.IntegerType.get()).commit(); ops.refresh(); @@ -81,19 +78,19 @@ public void testSuppressUnlockExceptions() throws TException, InterruptedExcepti } /** - * Pretends we throw an error while persisting, and not found with check state, commit state should be treated as - * unknown, because in reality the persisting may still succeed, just not yet by the time of checking. + * Pretends we throw an error while persisting, and not found with check state, commit state + * should be treated as unknown, because in reality the persisting may still succeed, just not yet + * by the time of checking. 
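The javadoc above summarizes the commit-state rule these tests exercise. A rough sketch of that decision, inferred from what the tests assert rather than copied from HiveTableOperations, and assuming the usual constructors of the Iceberg exception types imported by this file:

import org.apache.iceberg.exceptions.CommitFailedException;
import org.apache.iceberg.exceptions.CommitStateUnknownException;

class CommitOutcomeSketch {
  // Illustration-only result of the post-failure check; the real code looks for the new
  // metadata location in the table's history instead.
  enum CheckResult {
    COMMITTED,
    NOT_COMMITTED,
    UNKNOWN
  }

  static void handlePersistFailure(Exception persistError, CheckResult check) {
    switch (check) {
      case COMMITTED:
        return; // the write actually landed, so the commit is treated as a success
      case NOT_COMMITTED:
        // proven failure: safe to clean up the new metadata file and let callers retry
        throw new CommitFailedException(persistError, "Commit failed and can be retried");
      case UNKNOWN:
      default:
        // the new location was not found yet, but the persist may still complete later,
        // so keep the new metadata file and surface the uncertainty
        throw new CommitStateUnknownException(persistError);
    }
  }
}

This is why the "unknown" cases below assert that the extra metadata file still exists after the failed commit: cleaning it up could destroy a commit that actually succeeded.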
*/ @Test - public void testThriftExceptionUnknownStateIfNotInHistoryFailureOnCommit() throws TException, InterruptedException { + public void testThriftExceptionUnknownStateIfNotInHistoryFailureOnCommit() + throws TException, InterruptedException { Table table = catalog.loadTable(TABLE_IDENTIFIER); HiveTableOperations ops = (HiveTableOperations) ((HasTableOperations) table).operations(); TableMetadata metadataV1 = ops.current(); - table.updateSchema() - .addColumn("n", Types.IntegerType.get()) - .commit(); + table.updateSchema().addColumn("n", Types.IntegerType.get()).commit(); ops.refresh(); @@ -105,20 +102,24 @@ public void testThriftExceptionUnknownStateIfNotInHistoryFailureOnCommit() throw failCommitAndThrowException(spyOps); - AssertHelpers.assertThrows("We should assume commit state is unknown if the " + - "new location is not found in history in commit state check", CommitStateUnknownException.class, - "Datacenter on fire", () -> spyOps.commit(metadataV2, metadataV1)); + AssertHelpers.assertThrows( + "We should assume commit state is unknown if the " + + "new location is not found in history in commit state check", + CommitStateUnknownException.class, + "Datacenter on fire", + () -> spyOps.commit(metadataV2, metadataV1)); ops.refresh(); Assert.assertEquals("Current metadata should not have changed", metadataV2, ops.current()); Assert.assertTrue("Current metadata should still exist", metadataFileExists(metadataV2)); - Assert.assertEquals("New metadata files should still exist, new location not in history but" + - " the commit may still succeed", 3, metadataFileCount(ops.current())); + Assert.assertEquals( + "New metadata files should still exist, new location not in history but" + + " the commit may still succeed", + 3, + metadataFileCount(ops.current())); } - /** - * Pretends we throw an error while persisting that actually does commit serverside - */ + /** Pretends we throw an error while persisting that actually does commit serverside */ @Test public void testThriftExceptionSuccessOnCommit() throws TException, InterruptedException { Table table = catalog.loadTable(TABLE_IDENTIFIER); @@ -126,9 +127,7 @@ public void testThriftExceptionSuccessOnCommit() throws TException, InterruptedE TableMetadata metadataV1 = ops.current(); - table.updateSchema() - .addColumn("n", Types.IntegerType.get()) - .commit(); + table.updateSchema().addColumn("n", Types.IntegerType.get()).commit(); ops.refresh(); @@ -141,19 +140,23 @@ public void testThriftExceptionSuccessOnCommit() throws TException, InterruptedE // Simulate a communication error after a successful commit commitAndThrowException(ops, spyOps); - // Shouldn't throw because the commit actually succeeds even though persistTable throws an exception + // Shouldn't throw because the commit actually succeeds even though persistTable throws an + // exception spyOps.commit(metadataV2, metadataV1); ops.refresh(); Assert.assertNotEquals("Current metadata should have changed", metadataV2, ops.current()); - Assert.assertTrue("Current metadata file should still exist", metadataFileExists(ops.current())); - Assert.assertEquals("Commit should have been successful and new metadata file should be made", - 3, metadataFileCount(ops.current())); + Assert.assertTrue( + "Current metadata file should still exist", metadataFileExists(ops.current())); + Assert.assertEquals( + "Commit should have been successful and new metadata file should be made", + 3, + metadataFileCount(ops.current())); } /** - * Pretends we throw an exception while persisting and don't know 
what happened, can't check to find out, - * but in reality the commit failed + * Pretends we throw an exception while persisting and don't know what happened, can't check to + * find out, but in reality the commit failed */ @Test public void testThriftExceptionUnknownFailedCommit() throws TException, InterruptedException { @@ -162,9 +165,7 @@ public void testThriftExceptionUnknownFailedCommit() throws TException, Interrup TableMetadata metadataV1 = ops.current(); - table.updateSchema() - .addColumn("n", Types.IntegerType.get()) - .commit(); + table.updateSchema().addColumn("n", Types.IntegerType.get()).commit(); ops.refresh(); @@ -177,21 +178,26 @@ public void testThriftExceptionUnknownFailedCommit() throws TException, Interrup failCommitAndThrowException(spyOps); breakFallbackCatalogCommitCheck(spyOps); - AssertHelpers.assertThrows("Should throw CommitStateUnknownException since the catalog check was blocked", - CommitStateUnknownException.class, "Datacenter on fire", + AssertHelpers.assertThrows( + "Should throw CommitStateUnknownException since the catalog check was blocked", + CommitStateUnknownException.class, + "Datacenter on fire", () -> spyOps.commit(metadataV2, metadataV1)); ops.refresh(); Assert.assertEquals("Current metadata should not have changed", metadataV2, ops.current()); - Assert.assertTrue("Current metadata file should still exist", metadataFileExists(ops.current())); - Assert.assertEquals("Client could not determine outcome so new metadata file should also exist", - 3, metadataFileCount(ops.current())); + Assert.assertTrue( + "Current metadata file should still exist", metadataFileExists(ops.current())); + Assert.assertEquals( + "Client could not determine outcome so new metadata file should also exist", + 3, + metadataFileCount(ops.current())); } /** - * Pretends we throw an exception while persisting and don't know what happened, can't check to find out, - * but in reality the commit succeeded + * Pretends we throw an exception while persisting and don't know what happened, can't check to + * find out, but in reality the commit succeeded */ @Test public void testThriftExceptionsUnknownSuccessCommit() throws TException, InterruptedException { @@ -200,9 +206,7 @@ public void testThriftExceptionsUnknownSuccessCommit() throws TException, Interr TableMetadata metadataV1 = ops.current(); - table.updateSchema() - .addColumn("n", Types.IntegerType.get()) - .commit(); + table.updateSchema().addColumn("n", Types.IntegerType.get()).commit(); ops.refresh(); @@ -215,40 +219,41 @@ public void testThriftExceptionsUnknownSuccessCommit() throws TException, Interr commitAndThrowException(ops, spyOps); breakFallbackCatalogCommitCheck(spyOps); - AssertHelpers.assertThrows("Should throw CommitStateUnknownException since the catalog check was blocked", - CommitStateUnknownException.class, "Datacenter on fire", + AssertHelpers.assertThrows( + "Should throw CommitStateUnknownException since the catalog check was blocked", + CommitStateUnknownException.class, + "Datacenter on fire", () -> spyOps.commit(metadataV2, metadataV1)); ops.refresh(); Assert.assertFalse("Current metadata should have changed", ops.current().equals(metadataV2)); - Assert.assertTrue("Current metadata file should still exist", metadataFileExists(ops.current())); + Assert.assertTrue( + "Current metadata file should still exist", metadataFileExists(ops.current())); } /** - * Pretends we threw an exception while persisting, the commit succeeded, the lock expired, - * and a second committer placed a commit on top of ours 
before the first committer was able to check - * if their commit succeeded or not + * Pretends we threw an exception while persisting, the commit succeeded, the lock expired, and a + * second committer placed a commit on top of ours before the first committer was able to check if + * their commit succeeded or not * - * Timeline: - * Client 1 commits which throws an exception but suceeded - * Client 1's lock expires while waiting to do the recheck for commit success - * Client 2 acquires a lock, commits successfully on top of client 1's commit and release lock - * Client 1 check's to see if their commit was successful + *
    Timeline: Client 1 commits which throws an exception but suceeded Client 1's lock expires + * while waiting to do the recheck for commit success Client 2 acquires a lock, commits + * successfully on top of client 1's commit and release lock Client 1 check's to see if their + * commit was successful * - * This tests to make sure a disconnected client 1 doesn't think their commit failed just because it isn't the - * current one during the recheck phase. + *
    This tests to make sure a disconnected client 1 doesn't think their commit failed just + * because it isn't the current one during the recheck phase. */ @Test - public void testThriftExceptionConcurrentCommit() throws TException, InterruptedException, UnknownHostException { + public void testThriftExceptionConcurrentCommit() + throws TException, InterruptedException, UnknownHostException { Table table = catalog.loadTable(TABLE_IDENTIFIER); HiveTableOperations ops = (HiveTableOperations) ((HasTableOperations) table).operations(); TableMetadata metadataV1 = ops.current(); - table.updateSchema() - .addColumn("n", Types.IntegerType.get()) - .commit(); + table.updateSchema().addColumn("n", Types.IntegerType.get()).commit(); ops.refresh(); @@ -259,10 +264,13 @@ public void testThriftExceptionConcurrentCommit() throws TException, Interrupted HiveTableOperations spyOps = spy(ops); AtomicLong lockId = new AtomicLong(); - doAnswer(i -> { - lockId.set(ops.acquireLock()); - return lockId.get(); - }).when(spyOps).acquireLock(); + doAnswer( + i -> { + lockId.set(ops.acquireLock()); + return lockId.get(); + }) + .when(spyOps) + .acquireLock(); concurrentCommitAndThrowException(ops, spyOps, table, lockId); @@ -274,54 +282,70 @@ public void testThriftExceptionConcurrentCommit() throws TException, Interrupted ops.refresh(); Assert.assertNotEquals("Current metadata should have changed", metadataV2, ops.current()); - Assert.assertTrue("Current metadata file should still exist", metadataFileExists(ops.current())); - Assert.assertEquals("The column addition from the concurrent commit should have been successful", - 2, ops.current().schema().columns().size()); + Assert.assertTrue( + "Current metadata file should still exist", metadataFileExists(ops.current())); + Assert.assertEquals( + "The column addition from the concurrent commit should have been successful", + 2, + ops.current().schema().columns().size()); } @Test public void testInvalidObjectException() { TableIdentifier badTi = TableIdentifier.of(DB_NAME, "`tbl`"); - Assert.assertThrows(String.format("Invalid table name for %s.%s", DB_NAME, "`tbl`"), + Assert.assertThrows( + String.format("Invalid table name for %s.%s", DB_NAME, "`tbl`"), ValidationException.class, () -> catalog.createTable(badTi, schema, PartitionSpec.unpartitioned())); } @Test public void testAlreadyExistsException() { - Assert.assertThrows(String.format("Table already exists: %s.%s", DB_NAME, TABLE_NAME), + Assert.assertThrows( + String.format("Table already exists: %s.%s", DB_NAME, TABLE_NAME), AlreadyExistsException.class, () -> catalog.createTable(TABLE_IDENTIFIER, schema, PartitionSpec.unpartitioned())); } - private void commitAndThrowException(HiveTableOperations realOperations, HiveTableOperations spyOperations) + private void commitAndThrowException( + HiveTableOperations realOperations, HiveTableOperations spyOperations) throws TException, InterruptedException { // Simulate a communication error after a successful commit - doAnswer(i -> { - org.apache.hadoop.hive.metastore.api.Table tbl = - i.getArgument(0, org.apache.hadoop.hive.metastore.api.Table.class); - realOperations.persistTable(tbl, true); - throw new TException("Datacenter on fire"); - }).when(spyOperations).persistTable(any(), anyBoolean()); + doAnswer( + i -> { + org.apache.hadoop.hive.metastore.api.Table tbl = + i.getArgument(0, org.apache.hadoop.hive.metastore.api.Table.class); + realOperations.persistTable(tbl, true); + throw new TException("Datacenter on fire"); + }) + .when(spyOperations) + 
.persistTable(any(), anyBoolean()); } - private void concurrentCommitAndThrowException(HiveTableOperations realOperations, HiveTableOperations spyOperations, - Table table, AtomicLong lockId) + private void concurrentCommitAndThrowException( + HiveTableOperations realOperations, + HiveTableOperations spyOperations, + Table table, + AtomicLong lockId) throws TException, InterruptedException { // Simulate a communication error after a successful commit - doAnswer(i -> { - org.apache.hadoop.hive.metastore.api.Table tbl = - i.getArgument(0, org.apache.hadoop.hive.metastore.api.Table.class); - realOperations.persistTable(tbl, true); - // Simulate lock expiration or removal - realOperations.doUnlock(lockId.get()); - table.refresh(); - table.updateSchema().addColumn("newCol", Types.IntegerType.get()).commit(); - throw new TException("Datacenter on fire"); - }).when(spyOperations).persistTable(any(), anyBoolean()); + doAnswer( + i -> { + org.apache.hadoop.hive.metastore.api.Table tbl = + i.getArgument(0, org.apache.hadoop.hive.metastore.api.Table.class); + realOperations.persistTable(tbl, true); + // Simulate lock expiration or removal + realOperations.doUnlock(lockId.get()); + table.refresh(); + table.updateSchema().addColumn("newCol", Types.IntegerType.get()).commit(); + throw new TException("Datacenter on fire"); + }) + .when(spyOperations) + .persistTable(any(), anyBoolean()); } - private void failCommitAndThrowException(HiveTableOperations spyOperations) throws TException, InterruptedException { + private void failCommitAndThrowException(HiveTableOperations spyOperations) + throws TException, InterruptedException { doThrow(new TException("Datacenter on fire")) .when(spyOperations) .persistTable(any(), anyBoolean()); @@ -337,7 +361,9 @@ private boolean metadataFileExists(TableMetadata metadata) { } private int metadataFileCount(TableMetadata metadata) { - return new File(metadata.metadataFileLocation().replace("file:", "")).getParentFile() - .listFiles(file -> file.getName().endsWith("metadata.json")).length; + return new File(metadata.metadataFileLocation().replace("file:", "")) + .getParentFile() + .listFiles(file -> file.getName().endsWith("metadata.json")) + .length; } } diff --git a/hive-metastore/src/test/java/org/apache/iceberg/hive/TestHiveMetastore.java b/hive-metastore/src/test/java/org/apache/iceberg/hive/TestHiveMetastore.java index 3c7a2d04b223..d4e5ac46eb06 100644 --- a/hive-metastore/src/test/java/org/apache/iceberg/hive/TestHiveMetastore.java +++ b/hive-metastore/src/test/java/org/apache/iceberg/hive/TestHiveMetastore.java @@ -16,9 +16,12 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.hive; +import static java.nio.file.Files.createTempDirectory; +import static java.nio.file.attribute.PosixFilePermissions.asFileAttribute; +import static java.nio.file.attribute.PosixFilePermissions.fromString; + import java.io.File; import java.io.IOException; import java.io.InputStream; @@ -51,10 +54,6 @@ import org.apache.thrift.transport.TTransportFactory; import org.junit.Assert; -import static java.nio.file.Files.createTempDirectory; -import static java.nio.file.attribute.PosixFilePermissions.asFileAttribute; -import static java.nio.file.attribute.PosixFilePermissions.fromString; - public class TestHiveMetastore { private static final String DEFAULT_DATABASE_NAME = "default"; @@ -62,23 +61,32 @@ public class TestHiveMetastore { // create the metastore handlers based on whether we're working with Hive2 or Hive3 dependencies // we need to do this because there is a breaking API change between Hive2 and Hive3 - private static final DynConstructors.Ctor HMS_HANDLER_CTOR = DynConstructors.builder() + private static final DynConstructors.Ctor HMS_HANDLER_CTOR = + DynConstructors.builder() .impl(HiveMetaStore.HMSHandler.class, String.class, Configuration.class) .impl(HiveMetaStore.HMSHandler.class, String.class, HiveConf.class) .build(); - private static final DynMethods.StaticMethod GET_BASE_HMS_HANDLER = DynMethods.builder("getProxy") + private static final DynMethods.StaticMethod GET_BASE_HMS_HANDLER = + DynMethods.builder("getProxy") .impl(RetryingHMSHandler.class, Configuration.class, IHMSHandler.class, boolean.class) .impl(RetryingHMSHandler.class, HiveConf.class, IHMSHandler.class, boolean.class) .buildStatic(); - // Hive3 introduces background metastore tasks (MetastoreTaskThread) for performing various cleanup duties. These - // threads are scheduled and executed in a static thread pool (org.apache.hadoop.hive.metastore.ThreadPool). - // This thread pool is shut down normally as part of the JVM shutdown hook, but since we're creating and tearing down - // multiple metastore instances within the same JVM, we have to call this cleanup method manually, otherwise - // threads from our previous test suite will be stuck in the pool with stale config, and keep on being scheduled. - // This can lead to issues, e.g. accidental Persistence Manager closure by ScheduledQueryExecutionsMaintTask. - private static final DynMethods.StaticMethod METASTORE_THREADS_SHUTDOWN = DynMethods.builder("shutdown") + // Hive3 introduces background metastore tasks (MetastoreTaskThread) for performing various + // cleanup duties. These + // threads are scheduled and executed in a static thread pool + // (org.apache.hadoop.hive.metastore.ThreadPool). + // This thread pool is shut down normally as part of the JVM shutdown hook, but since we're + // creating and tearing down + // multiple metastore instances within the same JVM, we have to call this cleanup method manually, + // otherwise + // threads from our previous test suite will be stuck in the pool with stale config, and keep on + // being scheduled. + // This can lead to issues, e.g. accidental Persistence Manager closure by + // ScheduledQueryExecutionsMaintTask. 
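The comment block above motivates the METASTORE_THREADS_SHUTDOWN hook declared immediately below: the shutdown call only exists with Hive3 dependencies, so it is resolved reflectively and degrades to a no-op on Hive2. A condensed sketch of that optional-static-call idiom and how such a handle is invoked (class and method names here are illustrative):

import org.apache.iceberg.common.DynMethods;

class OptionalStaticCallSketch {
  // Resolves to the real ThreadPool.shutdown() when Hive3 is on the classpath,
  // and to a silent no-op when it is not.
  private static final DynMethods.StaticMethod OPTIONAL_SHUTDOWN =
      DynMethods.builder("shutdown")
          .impl("org.apache.hadoop.hive.metastore.ThreadPool")
          .orNoop()
          .buildStatic();

  static void shutdownMetastoreThreadsIfPresent() {
    OPTIONAL_SHUTDOWN.invoke(); // safe to call unconditionally in test teardown
  }
}

The same fallback style appears a few lines earlier for the Hive2/Hive3 HMSHandler constructor, where DynConstructors is given two candidate impl(...) signatures.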
+ private static final DynMethods.StaticMethod METASTORE_THREADS_SHUTDOWN = + DynMethods.builder("shutdown") .impl("org.apache.hadoop.hive.metastore.ThreadPool") .orNoop() .buildStatic(); @@ -90,21 +98,25 @@ public class TestHiveMetastore { static { try { - HIVE_LOCAL_DIR = createTempDirectory("hive", asFileAttribute(fromString("rwxrwxrwx"))).toFile(); + HIVE_LOCAL_DIR = + createTempDirectory("hive", asFileAttribute(fromString("rwxrwxrwx"))).toFile(); DERBY_PATH = new File(HIVE_LOCAL_DIR, "metastore_db").getPath(); File derbyLogFile = new File(HIVE_LOCAL_DIR, "derby.log"); System.setProperty("derby.stream.error.file", derbyLogFile.getAbsolutePath()); setupMetastoreDB("jdbc:derby:" + DERBY_PATH + ";create=true"); - Runtime.getRuntime().addShutdownHook(new Thread(() -> { - Path localDirPath = new Path(HIVE_LOCAL_DIR.getAbsolutePath()); - FileSystem fs = Util.getFs(localDirPath, new Configuration()); - String errMsg = "Failed to delete " + localDirPath; - try { - Assert.assertTrue(errMsg, fs.delete(localDirPath, true)); - } catch (IOException e) { - throw new RuntimeException(errMsg, e); - } - })); + Runtime.getRuntime() + .addShutdownHook( + new Thread( + () -> { + Path localDirPath = new Path(HIVE_LOCAL_DIR.getAbsolutePath()); + FileSystem fs = Util.getFs(localDirPath, new Configuration()); + String errMsg = "Failed to delete " + localDirPath; + try { + Assert.assertTrue(errMsg, fs.delete(localDirPath, true)); + } catch (IOException e) { + throw new RuntimeException(errMsg, e); + } + })); } catch (Exception e) { throw new RuntimeException("Failed to setup local dir for hive metastore", e); } @@ -124,7 +136,9 @@ public void start() { } /** - * Starts a TestHiveMetastore with the default connection pool size (5) with the provided HiveConf. + * Starts a TestHiveMetastore with the default connection pool size (5) with the provided + * HiveConf. + * * @param conf The hive configuration to use */ public void start(HiveConf conf) { @@ -133,6 +147,7 @@ public void start(HiveConf conf) { /** * Starts a TestHiveMetastore with a provided connection pool size and HiveConf. 
+ * * @param conf The hive configuration to use * @param poolSize The number of threads in the executor pool */ @@ -147,8 +162,11 @@ public void start(HiveConf conf, int poolSize) { this.executorService = Executors.newSingleThreadExecutor(); this.executorService.submit(() -> server.serve()); - // in Hive3, setting this as a system prop ensures that it will be picked up whenever a new HiveConf is created - System.setProperty(HiveConf.ConfVars.METASTOREURIS.varname, hiveConf.getVar(HiveConf.ConfVars.METASTOREURIS)); + // in Hive3, setting this as a system prop ensures that it will be picked up whenever a new + // HiveConf is created + System.setProperty( + HiveConf.ConfVars.METASTOREURIS.varname, + hiveConf.getVar(HiveConf.ConfVars.METASTOREURIS)); this.clientPool = new HiveClientPool(1, hiveConf); } catch (Exception e) { @@ -186,18 +204,20 @@ public void reset() throws Exception { if (clientPool != null) { for (String dbName : clientPool.run(client -> client.getAllDatabases())) { for (String tblName : clientPool.run(client -> client.getAllTables(dbName))) { - clientPool.run(client -> { - client.dropTable(dbName, tblName, true, true, true); - return null; - }); + clientPool.run( + client -> { + client.dropTable(dbName, tblName, true, true, true); + return null; + }); } if (!DEFAULT_DATABASE_NAME.equals(dbName)) { // Drop cascade, functions dropped by cascade - clientPool.run(client -> { - client.dropDatabase(dbName, true, true, true); - return null; - }); + clientPool.run( + client -> { + client.dropDatabase(dbName, true, true, true); + return null; + }); } } } @@ -205,8 +225,8 @@ public void reset() throws Exception { Path warehouseRoot = new Path(HIVE_LOCAL_DIR.getAbsolutePath()); FileSystem fs = Util.getFs(warehouseRoot, hiveConf); for (FileStatus fileStatus : fs.listStatus(warehouseRoot)) { - if (!fileStatus.getPath().getName().equals("derby.log") && - !fileStatus.getPath().getName().equals("metastore_db")) { + if (!fileStatus.getPath().getName().equals("derby.log") + && !fileStatus.getPath().getName().equals("metastore_db")) { fs.delete(fileStatus.getPath(), true); } } @@ -220,25 +240,30 @@ public Table getTable(TableIdentifier identifier) throws TException, Interrupted return getTable(identifier.namespace().toString(), identifier.name()); } - private TServer newThriftServer(TServerSocket socket, int poolSize, HiveConf conf) throws Exception { + private TServer newThriftServer(TServerSocket socket, int poolSize, HiveConf conf) + throws Exception { HiveConf serverConf = new HiveConf(conf); - serverConf.set(HiveConf.ConfVars.METASTORECONNECTURLKEY.varname, "jdbc:derby:" + DERBY_PATH + ";create=true"); + serverConf.set( + HiveConf.ConfVars.METASTORECONNECTURLKEY.varname, + "jdbc:derby:" + DERBY_PATH + ";create=true"); baseHandler = HMS_HANDLER_CTOR.newInstance("new db based metaserver", serverConf); IHMSHandler handler = GET_BASE_HMS_HANDLER.invoke(serverConf, baseHandler, false); - TThreadPoolServer.Args args = new TThreadPoolServer.Args(socket) - .processor(new TSetIpAddressProcessor<>(handler)) - .transportFactory(new TTransportFactory()) - .protocolFactory(new TBinaryProtocol.Factory()) - .minWorkerThreads(poolSize) - .maxWorkerThreads(poolSize); + TThreadPoolServer.Args args = + new TThreadPoolServer.Args(socket) + .processor(new TSetIpAddressProcessor<>(handler)) + .transportFactory(new TTransportFactory()) + .protocolFactory(new TBinaryProtocol.Factory()) + .minWorkerThreads(poolSize) + .maxWorkerThreads(poolSize); return new TThreadPoolServer(args); } private void initConf(HiveConf 
conf, int port) { conf.set(HiveConf.ConfVars.METASTOREURIS.varname, "thrift://localhost:" + port); - conf.set(HiveConf.ConfVars.METASTOREWAREHOUSE.varname, "file:" + HIVE_LOCAL_DIR.getAbsolutePath()); + conf.set( + HiveConf.ConfVars.METASTOREWAREHOUSE.varname, "file:" + HIVE_LOCAL_DIR.getAbsolutePath()); conf.set(HiveConf.ConfVars.METASTORE_TRY_DIRECT_SQL.varname, "false"); conf.set(HiveConf.ConfVars.METASTORE_DISALLOW_INCOMPATIBLE_COL_TYPE_CHANGES.varname, "false"); conf.set("iceberg.hive.client-pool-size", "2"); diff --git a/hive-metastore/src/test/java/org/apache/iceberg/hive/TestHiveSchemaUtil.java b/hive-metastore/src/test/java/org/apache/iceberg/hive/TestHiveSchemaUtil.java index b20a518bc11b..e73081f1bdb0 100644 --- a/hive-metastore/src/test/java/org/apache/iceberg/hive/TestHiveSchemaUtil.java +++ b/hive-metastore/src/test/java/org/apache/iceberg/hive/TestHiveSchemaUtil.java @@ -16,9 +16,10 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.hive; +import static org.apache.iceberg.types.Types.NestedField.optional; + import java.util.Arrays; import java.util.List; import java.util.stream.Collectors; @@ -35,78 +36,106 @@ import org.junit.Assert; import org.junit.Test; -import static org.apache.iceberg.types.Types.NestedField.optional; - public class TestHiveSchemaUtil { - private static final Schema SIMPLE_ICEBERG_SCHEMA = new Schema( - optional(0, "customer_id", Types.LongType.get(), "customer comment"), - optional(1, "first_name", Types.StringType.get(), "first name comment") - ); - - private static final Schema COMPLEX_ICEBERG_SCHEMA = new Schema( - optional(0, "id", Types.LongType.get(), ""), - optional(1, "name", Types.StringType.get(), ""), - optional(2, "employee_info", Types.StructType.of( - optional(3, "employer", Types.StringType.get()), - optional(4, "id", Types.LongType.get()), - optional(5, "address", Types.StringType.get()) - ), ""), - optional(6, "places_lived", Types.ListType.ofOptional(10, Types.StructType.of( - optional(7, "street", Types.StringType.get()), - optional(8, "city", Types.StringType.get()), - optional(9, "country", Types.StringType.get()) - )), ""), - optional(11, "memorable_moments", Types.MapType.ofOptional(15, 16, - Types.StringType.get(), - Types.StructType.of( - optional(12, "year", Types.IntegerType.get()), - optional(13, "place", Types.StringType.get()), - optional(14, "details", Types.StringType.get()) - )), ""), - optional(17, "current_address", Types.StructType.of( - optional(18, "street_address", Types.StructType.of( - optional(19, "street_number", Types.IntegerType.get()), - optional(20, "street_name", Types.StringType.get()), - optional(21, "street_type", Types.StringType.get()) - )), - optional(22, "country", Types.StringType.get()), - optional(23, "postal_code", Types.StringType.get()) - ), "") - ); - - private static final List SIMPLE_HIVE_SCHEMA = ImmutableList.of( - new FieldSchema("customer_id", serdeConstants.BIGINT_TYPE_NAME, "customer comment"), - new FieldSchema("first_name", serdeConstants.STRING_TYPE_NAME, "first name comment") - ); - - private static final List COMPLEX_HIVE_SCHEMA = ImmutableList.of( - new FieldSchema("id", "bigint", ""), - new FieldSchema("name", "string", ""), - new FieldSchema("employee_info", "struct", ""), - new FieldSchema("places_lived", "array>", ""), - new FieldSchema("memorable_moments", "map>", ""), - new FieldSchema("current_address", "struct,country:string,postal_code:string>", "") - ); + private static final Schema SIMPLE_ICEBERG_SCHEMA 
= + new Schema( + optional(0, "customer_id", Types.LongType.get(), "customer comment"), + optional(1, "first_name", Types.StringType.get(), "first name comment")); + + private static final Schema COMPLEX_ICEBERG_SCHEMA = + new Schema( + optional(0, "id", Types.LongType.get(), ""), + optional(1, "name", Types.StringType.get(), ""), + optional( + 2, + "employee_info", + Types.StructType.of( + optional(3, "employer", Types.StringType.get()), + optional(4, "id", Types.LongType.get()), + optional(5, "address", Types.StringType.get())), + ""), + optional( + 6, + "places_lived", + Types.ListType.ofOptional( + 10, + Types.StructType.of( + optional(7, "street", Types.StringType.get()), + optional(8, "city", Types.StringType.get()), + optional(9, "country", Types.StringType.get()))), + ""), + optional( + 11, + "memorable_moments", + Types.MapType.ofOptional( + 15, + 16, + Types.StringType.get(), + Types.StructType.of( + optional(12, "year", Types.IntegerType.get()), + optional(13, "place", Types.StringType.get()), + optional(14, "details", Types.StringType.get()))), + ""), + optional( + 17, + "current_address", + Types.StructType.of( + optional( + 18, + "street_address", + Types.StructType.of( + optional(19, "street_number", Types.IntegerType.get()), + optional(20, "street_name", Types.StringType.get()), + optional(21, "street_type", Types.StringType.get()))), + optional(22, "country", Types.StringType.get()), + optional(23, "postal_code", Types.StringType.get())), + "")); + + private static final List SIMPLE_HIVE_SCHEMA = + ImmutableList.of( + new FieldSchema("customer_id", serdeConstants.BIGINT_TYPE_NAME, "customer comment"), + new FieldSchema("first_name", serdeConstants.STRING_TYPE_NAME, "first name comment")); + + private static final List COMPLEX_HIVE_SCHEMA = + ImmutableList.of( + new FieldSchema("id", "bigint", ""), + new FieldSchema("name", "string", ""), + new FieldSchema("employee_info", "struct", ""), + new FieldSchema( + "places_lived", "array>", ""), + new FieldSchema( + "memorable_moments", "map>", ""), + new FieldSchema( + "current_address", + "struct,country:string,postal_code:string>", + "")); @Test public void testSimpleSchemaConvertToIcebergSchema() { - Assert.assertEquals(SIMPLE_ICEBERG_SCHEMA.asStruct(), HiveSchemaUtil.convert(SIMPLE_HIVE_SCHEMA).asStruct()); + Assert.assertEquals( + SIMPLE_ICEBERG_SCHEMA.asStruct(), HiveSchemaUtil.convert(SIMPLE_HIVE_SCHEMA).asStruct()); } @Test public void testSimpleSchemaConvertToIcebergSchemaFromNameAndTypeLists() { - List names = SIMPLE_HIVE_SCHEMA.stream().map(field -> field.getName()).collect(Collectors.toList()); - List types = SIMPLE_HIVE_SCHEMA.stream() - .map(field -> TypeInfoUtils.getTypeInfoFromTypeString(field.getType())) - .collect(Collectors.toList()); - List comments = SIMPLE_HIVE_SCHEMA.stream().map(FieldSchema::getComment).collect(Collectors.toList()); - Assert.assertEquals(SIMPLE_ICEBERG_SCHEMA.asStruct(), HiveSchemaUtil.convert(names, types, comments).asStruct()); + List names = + SIMPLE_HIVE_SCHEMA.stream().map(field -> field.getName()).collect(Collectors.toList()); + List types = + SIMPLE_HIVE_SCHEMA.stream() + .map(field -> TypeInfoUtils.getTypeInfoFromTypeString(field.getType())) + .collect(Collectors.toList()); + List comments = + SIMPLE_HIVE_SCHEMA.stream().map(FieldSchema::getComment).collect(Collectors.toList()); + Assert.assertEquals( + SIMPLE_ICEBERG_SCHEMA.asStruct(), + HiveSchemaUtil.convert(names, types, comments).asStruct()); } @Test public void testComplexSchemaConvertToIcebergSchema() { - 
Assert.assertEquals(COMPLEX_ICEBERG_SCHEMA.asStruct(), HiveSchemaUtil.convert(COMPLEX_HIVE_SCHEMA).asStruct()); + Assert.assertEquals( + COMPLEX_ICEBERG_SCHEMA.asStruct(), HiveSchemaUtil.convert(COMPLEX_HIVE_SCHEMA).asStruct()); } @Test @@ -118,11 +147,13 @@ public void testSchemaConvertToIcebergSchemaForEveryPrimitiveType() { @Test public void testNotSupportedTypes() { for (FieldSchema notSupportedField : getNotSupportedFieldSchemas()) { - AssertHelpers.assertThrows("should throw exception", IllegalArgumentException.class, - "Unsupported Hive type", () -> { + AssertHelpers.assertThrows( + "should throw exception", + IllegalArgumentException.class, + "Unsupported Hive type", + () -> { HiveSchemaUtil.convert(Lists.newArrayList(Arrays.asList(notSupportedField))); - } - ); + }); } } @@ -142,30 +173,35 @@ public void testSimpleTypeAndTypeInfoConvert() { List fieldSchemas = getSupportedFieldSchemas(); List nestedFields = getSchemaWithSupportedTypes().columns(); for (int i = 0; i < fieldSchemas.size(); ++i) { - checkConvert(TypeInfoUtils.getTypeInfoFromTypeString(fieldSchemas.get(i).getType()), nestedFields.get(i).type()); + checkConvert( + TypeInfoUtils.getTypeInfoFromTypeString(fieldSchemas.get(i).getType()), + nestedFields.get(i).type()); } } @Test public void testComplexTypeAndTypeInfoConvert() { for (int i = 0; i < COMPLEX_HIVE_SCHEMA.size(); ++i) { - checkConvert(TypeInfoUtils.getTypeInfoFromTypeString(COMPLEX_HIVE_SCHEMA.get(i).getType()), + checkConvert( + TypeInfoUtils.getTypeInfoFromTypeString(COMPLEX_HIVE_SCHEMA.get(i).getType()), COMPLEX_ICEBERG_SCHEMA.columns().get(i).type()); } } @Test public void testConversionWithoutLastComment() { - Schema expected = new Schema( - optional(0, "customer_id", Types.LongType.get(), "customer comment"), - optional(1, "first_name", Types.StringType.get(), null) - ); + Schema expected = + new Schema( + optional(0, "customer_id", Types.LongType.get(), "customer comment"), + optional(1, "first_name", Types.StringType.get(), null)); - Schema schema = HiveSchemaUtil.convert( - Arrays.asList("customer_id", "first_name"), - Arrays.asList(TypeInfoUtils.getTypeInfoFromTypeString(serdeConstants.BIGINT_TYPE_NAME), - TypeInfoUtils.getTypeInfoFromTypeString(serdeConstants.STRING_TYPE_NAME)), - Arrays.asList("customer comment")); + Schema schema = + HiveSchemaUtil.convert( + Arrays.asList("customer_id", "first_name"), + Arrays.asList( + TypeInfoUtils.getTypeInfoFromTypeString(serdeConstants.BIGINT_TYPE_NAME), + TypeInfoUtils.getTypeInfoFromTypeString(serdeConstants.STRING_TYPE_NAME)), + Arrays.asList("customer comment")); Assert.assertEquals(expected.asStruct(), schema.asStruct()); } @@ -191,7 +227,8 @@ protected List getNotSupportedFieldSchemas() { fields.add(new FieldSchema("c_short", serdeConstants.SMALLINT_TYPE_NAME, "")); fields.add(new FieldSchema("c_char", serdeConstants.CHAR_TYPE_NAME + "(5)", "")); fields.add(new FieldSchema("c_varchar", serdeConstants.VARCHAR_TYPE_NAME + "(5)", "")); - fields.add(new FieldSchema("c_interval_date", serdeConstants.INTERVAL_YEAR_MONTH_TYPE_NAME, "")); + fields.add( + new FieldSchema("c_interval_date", serdeConstants.INTERVAL_YEAR_MONTH_TYPE_NAME, "")); fields.add(new FieldSchema("c_interval_time", serdeConstants.INTERVAL_DAY_TIME_TYPE_NAME, "")); return fields; } @@ -212,6 +249,7 @@ protected Schema getSchemaWithSupportedTypes() { /** * Check conversion for 1-on-1 mappings + * * @param typeInfo Hive type * @param type Iceberg type */ @@ -224,6 +262,7 @@ private void checkConvert(TypeInfo typeInfo, Type type) { /** * 
Compares the nested types without checking the ids. + * * @param expected The expected types to compare * @param actual The actual types to compare */ diff --git a/hive-metastore/src/test/java/org/apache/iceberg/hive/TestHiveTableConcurrency.java b/hive-metastore/src/test/java/org/apache/iceberg/hive/TestHiveTableConcurrency.java index 3077dc51f6b4..b8630364e837 100644 --- a/hive-metastore/src/test/java/org/apache/iceberg/hive/TestHiveTableConcurrency.java +++ b/hive-metastore/src/test/java/org/apache/iceberg/hive/TestHiveTableConcurrency.java @@ -16,9 +16,12 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.hive; +import static org.apache.iceberg.TableProperties.COMMIT_MAX_RETRY_WAIT_MS; +import static org.apache.iceberg.TableProperties.COMMIT_MIN_RETRY_WAIT_MS; +import static org.apache.iceberg.TableProperties.COMMIT_NUM_RETRIES; + import java.util.UUID; import java.util.concurrent.ExecutorService; import java.util.concurrent.Executors; @@ -35,10 +38,6 @@ import org.junit.Assert; import org.junit.Test; -import static org.apache.iceberg.TableProperties.COMMIT_MAX_RETRY_WAIT_MS; -import static org.apache.iceberg.TableProperties.COMMIT_MIN_RETRY_WAIT_MS; -import static org.apache.iceberg.TableProperties.COMMIT_NUM_RETRIES; - public class TestHiveTableConcurrency extends HiveTableBaseTest { @Test @@ -46,33 +45,37 @@ public synchronized void testConcurrentFastAppends() { Table icebergTable = catalog.loadTable(TABLE_IDENTIFIER); String fileName = UUID.randomUUID().toString(); - DataFile file = DataFiles.builder(icebergTable.spec()) - .withPath(FileFormat.PARQUET.addExtension(fileName)) - .withRecordCount(2) - .withFileSizeInBytes(0) - .build(); + DataFile file = + DataFiles.builder(icebergTable.spec()) + .withPath(FileFormat.PARQUET.addExtension(fileName)) + .withRecordCount(2) + .withFileSizeInBytes(0) + .build(); - ExecutorService executorService = MoreExecutors.getExitingExecutorService( - (ThreadPoolExecutor) Executors.newFixedThreadPool(2)); + ExecutorService executorService = + MoreExecutors.getExitingExecutorService( + (ThreadPoolExecutor) Executors.newFixedThreadPool(2)); AtomicInteger barrier = new AtomicInteger(0); Tasks.range(2) - .stopOnFailure().throwFailureWhenFinished() + .stopOnFailure() + .throwFailureWhenFinished() .executeWith(executorService) - .run(index -> { - for (int numCommittedFiles = 0; numCommittedFiles < 10; numCommittedFiles++) { - while (barrier.get() < numCommittedFiles * 2) { - try { - Thread.sleep(10); - } catch (InterruptedException e) { - throw new RuntimeException(e); + .run( + index -> { + for (int numCommittedFiles = 0; numCommittedFiles < 10; numCommittedFiles++) { + while (barrier.get() < numCommittedFiles * 2) { + try { + Thread.sleep(10); + } catch (InterruptedException e) { + throw new RuntimeException(e); + } + } + + icebergTable.newFastAppend().appendFile(file).commit(); + barrier.incrementAndGet(); } - } - - icebergTable.newFastAppend().appendFile(file).commit(); - barrier.incrementAndGet(); - } - }); + }); icebergTable.refresh(); Assert.assertEquals(20, icebergTable.currentSnapshot().allManifests(icebergTable.io()).size()); @@ -82,21 +85,24 @@ public synchronized void testConcurrentFastAppends() { public synchronized void testConcurrentConnections() throws InterruptedException { Table icebergTable = catalog.loadTable(TABLE_IDENTIFIER); - icebergTable.updateProperties() + icebergTable + .updateProperties() .set(COMMIT_NUM_RETRIES, "20") .set(COMMIT_MIN_RETRY_WAIT_MS, "25") 
.set(COMMIT_MAX_RETRY_WAIT_MS, "25") .commit(); String fileName = UUID.randomUUID().toString(); - DataFile file = DataFiles.builder(icebergTable.spec()) - .withPath(FileFormat.PARQUET.addExtension(fileName)) - .withRecordCount(2) - .withFileSizeInBytes(0) - .build(); - - ExecutorService executorService = MoreExecutors.getExitingExecutorService( - (ThreadPoolExecutor) Executors.newFixedThreadPool(7)); + DataFile file = + DataFiles.builder(icebergTable.spec()) + .withPath(FileFormat.PARQUET.addExtension(fileName)) + .withRecordCount(2) + .withFileSizeInBytes(0) + .build(); + + ExecutorService executorService = + MoreExecutors.getExitingExecutorService( + (ThreadPoolExecutor) Executors.newFixedThreadPool(7)); for (int i = 0; i < 7; i++) { executorService.submit(() -> icebergTable.newAppend().appendFile(file).commit()); diff --git a/hive3/src/main/java/org/apache/hadoop/hive/ql/io/orc/OrcSplit.java b/hive3/src/main/java/org/apache/hadoop/hive/ql/io/orc/OrcSplit.java index d53030f7bc77..cfee55f3546a 100644 --- a/hive3/src/main/java/org/apache/hadoop/hive/ql/io/orc/OrcSplit.java +++ b/hive3/src/main/java/org/apache/hadoop/hive/ql/io/orc/OrcSplit.java @@ -7,15 +7,15 @@ * "License"); you may not use this file except in compliance * with the License. You may obtain a copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. */ - package org.apache.hadoop.hive.ql.io.orc; import java.io.ByteArrayOutputStream; @@ -43,18 +43,17 @@ import org.slf4j.LoggerFactory; /** - * In order to fix some compatibility issues with ORC support with Hive 3.x and the shaded ORC libraries, - * this class has been copied from Hive 3.x source code. However, this class should be removed once - * Hive 4 is out. + * In order to fix some compatibility issues with ORC support with Hive 3.x and the shaded ORC + * libraries, this class has been copied from Hive 3.x source code. However, this class should be + * removed once Hive 4 is out. 
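The concurrency test above first loosens the table's commit retry settings before seven executor threads append the same data file. A minimal sketch of that tuning step, assuming an already loaded Table; the class and method names below are illustrative only, not part of this change:

import org.apache.iceberg.Table;
import org.apache.iceberg.TableProperties;

class CommitRetryTuningSketch {
  // Raise the retry count and shrink the backoff so competing commits resolve quickly,
  // mirroring the values set by testConcurrentConnections.
  static void tuneForConcurrentCommits(Table table) {
    table
        .updateProperties()
        .set(TableProperties.COMMIT_NUM_RETRIES, "20")
        .set(TableProperties.COMMIT_MIN_RETRY_WAIT_MS, "25")
        .set(TableProperties.COMMIT_MAX_RETRY_WAIT_MS, "25")
        .commit();
  }
}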
*/ public class OrcSplit extends FileSplit implements ColumnarSplit, LlapAwareSplit { private static final Logger LOG = LoggerFactory.getLogger(OrcSplit.class); private OrcTail orcTail; private boolean hasFooter; - /** - * This means {@link AcidUtils.AcidBaseFileType#ORIGINAL_BASE} - */ + /** This means {@link AcidUtils.AcidBaseFileType#ORIGINAL_BASE} */ private boolean isOriginal; + private boolean hasBase; // partition root private Path rootDir; @@ -76,9 +75,19 @@ protected OrcSplit() { super(null, 0, 0, (String[]) null); } - public OrcSplit(Path path, Object fileId, long offset, long length, String[] hosts, - OrcTail orcTail, boolean isOriginal, boolean hasBase, - List deltas, long projectedDataSize, long fileLen, Path rootDir) { + public OrcSplit( + Path path, + Object fileId, + long offset, + long length, + String[] hosts, + OrcTail orcTail, + boolean isOriginal, + boolean hasBase, + List deltas, + long projectedDataSize, + long fileLen, + Path rootDir) { super(path, offset, length, hosts); // For HDFS, we could avoid serializing file ID and just replace the path with inode-based // path. However, that breaks bunch of stuff because Hive later looks up things by split path. @@ -108,19 +117,22 @@ public void write(DataOutput out) throws IOException { out.write(bos.toByteArray()); if (LOG.isTraceEnabled()) { - LOG.trace("Writing additional {} bytes to OrcSplit as payload. Required {} bytes.", - additional, required); + LOG.trace( + "Writing additional {} bytes to OrcSplit as payload. Required {} bytes.", + additional, + required); } } private void writeAdditionalPayload(final DataOutputStream out) throws IOException { boolean isFileIdLong = fileKey instanceof Long; boolean isFileIdWritable = fileKey instanceof Writable; - int flags = (hasBase ? BASE_FLAG : 0) | - (isOriginal ? ORIGINAL_FLAG : 0) | - (hasFooter ? FOOTER_FLAG : 0) | - (isFileIdLong ? HAS_LONG_FILEID_FLAG : 0) | - (isFileIdWritable ? HAS_SYNTHETIC_FILEID_FLAG : 0); + int flags = + (hasBase ? BASE_FLAG : 0) + | (isOriginal ? ORIGINAL_FLAG : 0) + | (hasFooter ? FOOTER_FLAG : 0) + | (isFileIdLong ? HAS_LONG_FILEID_FLAG : 0) + | (isFileIdWritable ? HAS_SYNTHETIC_FILEID_FLAG : 0); out.writeByte(flags); out.writeInt(deltas.size()); for (AcidInputFormat.DeltaMetaData delta : deltas) { @@ -191,12 +203,11 @@ public boolean hasFooter() { } /** - * @return {@code true} if file schema doesn't have Acid metadata columns - * Such file may be in a delta_x_y/ or base_x due to being added via - * "load data" command. It could be at partition|table root due to table having - * been converted from non-acid to acid table. It could even be something like - * "warehouse/t/HIVE_UNION_SUBDIR_15/000000_0" if it was written by an - * "insert into t select ... from A union all select ... from B" + * @return {@code true} if file schema doesn't have Acid metadata columns Such file may be in a + * delta_x_y/ or base_x due to being added via "load data" command. It could be at + * partition|table root due to table having been converted from non-acid to acid table. It + * could even be something like "warehouse/t/HIVE_UNION_SUBDIR_15/000000_0" if it was written + * by an "insert into t select ... from A union all select ... from B" */ public boolean isOriginal() { return isOriginal; @@ -219,8 +230,8 @@ public long getFileLength() { } /** - * If this method returns true, then for sure it is ACID. - * However, if it returns false.. it could be ACID or non-ACID. + * If this method returns true, then for sure it is ACID. However, if it returns false.. 
it could + * be ACID or non-ACID. * * @return true if is ACID */ @@ -248,8 +259,8 @@ public boolean canUseLlapIo(Configuration conf) { final boolean isVectorized = HiveConf.getBoolVar(conf, ConfVars.HIVE_VECTORIZATION_ENABLED); Boolean isSplitUpdate = null; if (isAcidRead) { - final AcidUtils.AcidOperationalProperties acidOperationalProperties - = AcidUtils.getAcidOperationalProperties(conf); + final AcidUtils.AcidOperationalProperties acidOperationalProperties = + AcidUtils.getAcidOperationalProperties(conf); isSplitUpdate = acidOperationalProperties.isSplitUpdate(); } @@ -276,8 +287,22 @@ public boolean canUseLlapIo(Configuration conf) { @Override public String toString() { - return "OrcSplit [" + getPath() + ", start=" + getStart() + ", length=" + getLength() + - ", isOriginal=" + isOriginal + ", fileLength=" + fileLen + ", hasFooter=" + hasFooter + - ", hasBase=" + hasBase + ", deltas=" + (deltas == null ? 0 : deltas.size()) + "]"; + return "OrcSplit [" + + getPath() + + ", start=" + + getStart() + + ", length=" + + getLength() + + ", isOriginal=" + + isOriginal + + ", fileLength=" + + fileLen + + ", hasFooter=" + + hasFooter + + ", hasBase=" + + hasBase + + ", deltas=" + + (deltas == null ? 0 : deltas.size()) + + "]"; } } diff --git a/hive3/src/main/java/org/apache/iceberg/mr/hive/serde/objectinspector/IcebergDateObjectInspectorHive3.java b/hive3/src/main/java/org/apache/iceberg/mr/hive/serde/objectinspector/IcebergDateObjectInspectorHive3.java index ab64b3695698..08bb03282bac 100644 --- a/hive3/src/main/java/org/apache/iceberg/mr/hive/serde/objectinspector/IcebergDateObjectInspectorHive3.java +++ b/hive3/src/main/java/org/apache/iceberg/mr/hive/serde/objectinspector/IcebergDateObjectInspectorHive3.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.mr.hive.serde.objectinspector; import java.time.LocalDate; @@ -30,7 +29,8 @@ public final class IcebergDateObjectInspectorHive3 extends AbstractPrimitiveJavaObjectInspector implements DateObjectInspector, WriteObjectInspector { - private static final IcebergDateObjectInspectorHive3 INSTANCE = new IcebergDateObjectInspectorHive3(); + private static final IcebergDateObjectInspectorHive3 INSTANCE = + new IcebergDateObjectInspectorHive3(); public static IcebergDateObjectInspectorHive3 get() { return INSTANCE; @@ -63,7 +63,8 @@ public Object copyObject(Object o) { if (o instanceof Date) { return new Date((Date) o); } else if (o instanceof LocalDate) { - return LocalDate.of(((LocalDate) o).getYear(), ((LocalDate) o).getMonth(), ((LocalDate) o).getDayOfMonth()); + return LocalDate.of( + ((LocalDate) o).getYear(), ((LocalDate) o).getMonth(), ((LocalDate) o).getDayOfMonth()); } else { return o; } diff --git a/hive3/src/main/java/org/apache/iceberg/mr/hive/serde/objectinspector/IcebergTimestampObjectInspectorHive3.java b/hive3/src/main/java/org/apache/iceberg/mr/hive/serde/objectinspector/IcebergTimestampObjectInspectorHive3.java index be33870dad00..9ba99c8dec6c 100644 --- a/hive3/src/main/java/org/apache/iceberg/mr/hive/serde/objectinspector/IcebergTimestampObjectInspectorHive3.java +++ b/hive3/src/main/java/org/apache/iceberg/mr/hive/serde/objectinspector/IcebergTimestampObjectInspectorHive3.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
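OrcSplit above packs several boolean attributes of the split into a single flags byte before serializing it. A self-contained sketch of the same bit-packing idea; the flag values here are placeholders, the real constants live in OrcSplit:

class SplitFlagsSketch {
  // Placeholder values; OrcSplit defines its own flag constants.
  static final int BASE_FLAG = 1;
  static final int ORIGINAL_FLAG = 2;
  static final int FOOTER_FLAG = 4;

  // Write side: fold the attributes into one int, as writeAdditionalPayload() does.
  static int pack(boolean hasBase, boolean isOriginal, boolean hasFooter) {
    return (hasBase ? BASE_FLAG : 0)
        | (isOriginal ? ORIGINAL_FLAG : 0)
        | (hasFooter ? FOOTER_FLAG : 0);
  }

  // Read side: recover a single attribute from the packed value.
  static boolean hasFooter(int flags) {
    return (flags & FOOTER_FLAG) != 0;
  }
}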
*/ - package org.apache.iceberg.mr.hive.serde.objectinspector; import java.time.LocalDateTime; @@ -30,7 +29,8 @@ public class IcebergTimestampObjectInspectorHive3 extends AbstractPrimitiveJavaObjectInspector implements TimestampObjectInspector, WriteObjectInspector { - private static final IcebergTimestampObjectInspectorHive3 INSTANCE = new IcebergTimestampObjectInspectorHive3(); + private static final IcebergTimestampObjectInspectorHive3 INSTANCE = + new IcebergTimestampObjectInspectorHive3(); public static IcebergTimestampObjectInspectorHive3 get() { return INSTANCE; @@ -46,7 +46,8 @@ public LocalDateTime convert(Object o) { return null; } Timestamp timestamp = (Timestamp) o; - return LocalDateTime.ofEpochSecond(timestamp.toEpochSecond(), timestamp.getNanos(), ZoneOffset.UTC); + return LocalDateTime.ofEpochSecond( + timestamp.toEpochSecond(), timestamp.getNanos(), ZoneOffset.UTC); } @Override @@ -83,5 +84,4 @@ public Object copyObject(Object o) { return o; } } - } diff --git a/hive3/src/main/java/org/apache/iceberg/mr/hive/serde/objectinspector/IcebergTimestampWithZoneObjectInspectorHive3.java b/hive3/src/main/java/org/apache/iceberg/mr/hive/serde/objectinspector/IcebergTimestampWithZoneObjectInspectorHive3.java index 199ea361d29f..c3c81f8d7a74 100644 --- a/hive3/src/main/java/org/apache/iceberg/mr/hive/serde/objectinspector/IcebergTimestampWithZoneObjectInspectorHive3.java +++ b/hive3/src/main/java/org/apache/iceberg/mr/hive/serde/objectinspector/IcebergTimestampWithZoneObjectInspectorHive3.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.mr.hive.serde.objectinspector; import java.time.OffsetDateTime; @@ -28,7 +27,8 @@ import org.apache.hadoop.hive.serde2.objectinspector.primitive.TimestampLocalTZObjectInspector; import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoFactory; -public class IcebergTimestampWithZoneObjectInspectorHive3 extends AbstractPrimitiveJavaObjectInspector +public class IcebergTimestampWithZoneObjectInspectorHive3 + extends AbstractPrimitiveJavaObjectInspector implements TimestampLocalTZObjectInspector, WriteObjectInspector { private static final IcebergTimestampWithZoneObjectInspectorHive3 INSTANCE = diff --git a/hive3/src/main/java/org/apache/iceberg/mr/hive/vector/CompatibilityHiveVectorUtils.java b/hive3/src/main/java/org/apache/iceberg/mr/hive/vector/CompatibilityHiveVectorUtils.java index a897a943b073..48207a1b3a99 100644 --- a/hive3/src/main/java/org/apache/iceberg/mr/hive/vector/CompatibilityHiveVectorUtils.java +++ b/hive3/src/main/java/org/apache/iceberg/mr/hive/vector/CompatibilityHiveVectorUtils.java @@ -7,15 +7,15 @@ * "License"); you may not use this file except in compliance * with the License. You may obtain a copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. 
See the License for the + * specific language governing permissions and limitations + * under the License. */ - package org.apache.iceberg.mr.hive.vector; import java.nio.charset.StandardCharsets; @@ -45,19 +45,18 @@ import org.slf4j.LoggerFactory; /** - * Contains ported code snippets from later Hive sources. We should get rid of this class as soon as Hive 4 is released - * and Iceberg makes a dependency to that version. + * Contains ported code snippets from later Hive sources. We should get rid of this class as soon as + * Hive 4 is released and Iceberg makes a dependency to that version. */ public class CompatibilityHiveVectorUtils { private static final Logger LOG = LoggerFactory.getLogger(CompatibilityHiveVectorUtils.class); - private CompatibilityHiveVectorUtils() { - - } + private CompatibilityHiveVectorUtils() {} /** - * Returns serialized mapwork instance from a job conf - ported from Hive source code LlapHiveUtils#findMapWork + * Returns serialized mapwork instance from a job conf - ported from Hive source code + * LlapHiveUtils#findMapWork * * @param job JobConf instance * @return a serialized {@link MapWork} based on the given job conf @@ -91,7 +90,6 @@ public static MapWork findMapWork(JobConf job) { return (MapWork) work; } - /** * Ported from Hive source code VectorizedRowBatchCtx#addPartitionColsToBatch * @@ -100,9 +98,10 @@ public static MapWork findMapWork(JobConf job) { * @param partitionColumnName partition key * @param rowColumnTypeInfo column type description */ -// @SuppressWarnings({"AvoidNestedBlocks", "FallThrough", "MethodLength", "CyclomaticComplexity", "Indentation"}) - public static void addPartitionColsToBatch(ColumnVector col, Object value, String partitionColumnName, - TypeInfo rowColumnTypeInfo) { + // @SuppressWarnings({"AvoidNestedBlocks", "FallThrough", "MethodLength", "CyclomaticComplexity", + // "Indentation"}) + public static void addPartitionColsToBatch( + ColumnVector col, Object value, String partitionColumnName, TypeInfo rowColumnTypeInfo) { PrimitiveTypeInfo primitiveTypeInfo = (PrimitiveTypeInfo) rowColumnTypeInfo; if (value == null) { @@ -210,9 +209,11 @@ public static void addPartitionColsToBatch(ColumnVector col, Object value, Strin break; default: - throw new RuntimeException("Unable to recognize the partition type " + - primitiveTypeInfo.getPrimitiveCategory() + " for column " + partitionColumnName); + throw new RuntimeException( + "Unable to recognize the partition type " + + primitiveTypeInfo.getPrimitiveCategory() + + " for column " + + partitionColumnName); } - } } diff --git a/hive3/src/main/java/org/apache/iceberg/mr/hive/vector/HiveIcebergVectorizedRecordReader.java b/hive3/src/main/java/org/apache/iceberg/mr/hive/vector/HiveIcebergVectorizedRecordReader.java index 98507ddece58..70c43cb21f91 100644 --- a/hive3/src/main/java/org/apache/iceberg/mr/hive/vector/HiveIcebergVectorizedRecordReader.java +++ b/hive3/src/main/java/org/apache/iceberg/mr/hive/vector/HiveIcebergVectorizedRecordReader.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.mr.hive.vector; import java.io.IOException; @@ -30,13 +29,17 @@ * Basically an MR1 API implementing wrapper for transferring VectorizedRowBatch's produced by * IcebergInputformat$IcebergRecordReader which relies on the MR2 API format. 
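The Hive 3 timestamp inspector above converts Hive's Timestamp to java.time by interpreting its epoch seconds and nanos at UTC. A minimal sketch of that conversion; the Timestamp import is assumed to be Hive 3's org.apache.hadoop.hive.common.type.Timestamp:

import java.time.LocalDateTime;
import java.time.ZoneOffset;
import org.apache.hadoop.hive.common.type.Timestamp;

class TimestampConversionSketch {
  // Same shape as IcebergTimestampObjectInspectorHive3.convert(): epoch seconds plus nanos,
  // read at UTC, yield the timestamp-without-zone value Iceberg expects.
  static LocalDateTime toLocalDateTime(Timestamp timestamp) {
    return LocalDateTime.ofEpochSecond(
        timestamp.toEpochSecond(), timestamp.getNanos(), ZoneOffset.UTC);
  }
}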
*/ -public final class HiveIcebergVectorizedRecordReader extends AbstractMapredIcebergRecordReader { +public final class HiveIcebergVectorizedRecordReader + extends AbstractMapredIcebergRecordReader { private final JobConf job; public HiveIcebergVectorizedRecordReader( - org.apache.iceberg.mr.mapreduce.IcebergInputFormat mapreduceInputFormat, IcebergSplit split, - JobConf job, Reporter reporter) throws IOException { + org.apache.iceberg.mr.mapreduce.IcebergInputFormat mapreduceInputFormat, + IcebergSplit split, + JobConf job, + Reporter reporter) + throws IOException { super(mapreduceInputFormat, split, job, reporter); this.job = job; } @@ -62,12 +65,13 @@ public boolean next(Void key, VectorizedRowBatch value) throws IOException { @Override public VectorizedRowBatch createValue() { - return CompatibilityHiveVectorUtils.findMapWork(job).getVectorizedRowBatchCtx().createVectorizedRowBatch(); + return CompatibilityHiveVectorUtils.findMapWork(job) + .getVectorizedRowBatchCtx() + .createVectorizedRowBatch(); } @Override public long getPos() { return -1; } - } diff --git a/hive3/src/main/java/org/apache/iceberg/mr/hive/vector/HiveVectorizedReader.java b/hive3/src/main/java/org/apache/iceberg/mr/hive/vector/HiveVectorizedReader.java index 73b4f43902b8..e6f9b8f8cc98 100644 --- a/hive3/src/main/java/org/apache/iceberg/mr/hive/vector/HiveVectorizedReader.java +++ b/hive3/src/main/java/org/apache/iceberg/mr/hive/vector/HiveVectorizedReader.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.mr.hive.vector; import java.io.IOException; @@ -55,25 +54,28 @@ import org.apache.parquet.schema.MessageType; /** - * Utility class to create vectorized readers for Hive. - * As per the file format of the task, it will create a matching vectorized record reader that is already implemented - * in Hive. It will also do some tweaks on the produced vectors for Iceberg's use e.g. partition column handling. + * Utility class to create vectorized readers for Hive. As per the file format of the task, it will + * create a matching vectorized record reader that is already implemented in Hive. It will also do + * some tweaks on the produced vectors for Iceberg's use e.g. partition column handling. */ public class HiveVectorizedReader { + private HiveVectorizedReader() {} - private HiveVectorizedReader() { - - } - - public static CloseableIterable reader(InputFile inputFile, FileScanTask task, Map idToConstant, + public static CloseableIterable reader( + InputFile inputFile, + FileScanTask task, + Map idToConstant, TaskAttemptContext context) { JobConf job = (JobConf) context.getConfiguration(); Path path = new Path(inputFile.location()); FileFormat format = task.file().format(); - Reporter reporter = ((MapredIcebergInputFormat.CompatibilityTaskAttemptContextImpl) context).getLegacyReporter(); + Reporter reporter = + ((MapredIcebergInputFormat.CompatibilityTaskAttemptContextImpl) context) + .getLegacyReporter(); - // Hive by default requires partition columns to be read too. This is not required for identity partition + // Hive by default requires partition columns to be read too. This is not required for identity + // partition // columns, as we will add this as constants later. 
int[] partitionColIndices = null; @@ -121,30 +123,53 @@ public static CloseableIterable reader(InputFile inputFile, FileScanTask break; default: - throw new UnsupportedOperationException("Vectorized Hive reading unimplemented for format: " + format); + throw new UnsupportedOperationException( + "Vectorized Hive reading unimplemented for format: " + format); } - return createVectorizedRowBatchIterable(recordReader, job, partitionColIndices, partitionValues); + return createVectorizedRowBatchIterable( + recordReader, job, partitionColIndices, partitionValues); } catch (IOException ioe) { throw new RuntimeException("Error creating vectorized record reader for " + inputFile, ioe); } } - - private static RecordReader orcRecordReader(JobConf job, Reporter reporter, - FileScanTask task, InputFile inputFile, Path path, long start, long length) throws IOException { - // Metadata information has to be passed along in the OrcSplit. Without specifying this, the vectorized - // reader will assume that the ORC file ends at the task's start + length, and might fail reading the tail.. + private static RecordReader orcRecordReader( + JobConf job, + Reporter reporter, + FileScanTask task, + InputFile inputFile, + Path path, + long start, + long length) + throws IOException { + // Metadata information has to be passed along in the OrcSplit. Without specifying this, the + // vectorized + // reader will assume that the ORC file ends at the task's start + length, and might fail + // reading the tail.. OrcTail orcTail = VectorizedReadUtils.getOrcTail(inputFile, job); - InputSplit split = new OrcSplit(path, null, start, length, (String[]) null, orcTail, - false, false, Lists.newArrayList(), 0, task.length(), path.getParent()); + InputSplit split = + new OrcSplit( + path, + null, + start, + length, + (String[]) null, + orcTail, + false, + false, + Lists.newArrayList(), + 0, + task.length(), + path.getParent()); return new VectorizedOrcInputFormat().getRecordReader(split, job, reporter); } - private static RecordReader parquetRecordReader(JobConf job, Reporter reporter, - FileScanTask task, Path path, long start, long length) throws IOException { + private static RecordReader parquetRecordReader( + JobConf job, Reporter reporter, FileScanTask task, Path path, long start, long length) + throws IOException { InputSplit split = new FileSplit(path, start, length, job); VectorizedParquetInputFormat inputFormat = new VectorizedParquetInputFormat(); @@ -155,8 +180,9 @@ private static RecordReader parquetRecordReade if (ParquetSchemaUtil.hasIds(fileSchema)) { typeWithIds = ParquetSchemaUtil.pruneColumns(fileSchema, expectedSchema); } else { - typeWithIds = ParquetSchemaUtil.pruneColumnsFallback(ParquetSchemaUtil.addFallbackIds(fileSchema), - expectedSchema); + typeWithIds = + ParquetSchemaUtil.pruneColumnsFallback( + ParquetSchemaUtil.addFallbackIds(fileSchema), expectedSchema); } ParquetSchemaFieldNameVisitor psv = new ParquetSchemaFieldNameVisitor(fileSchema); @@ -167,7 +193,9 @@ private static RecordReader parquetRecordReade } private static CloseableIterable createVectorizedRowBatchIterable( - RecordReader hiveRecordReader, JobConf job, int[] partitionColIndices, + RecordReader hiveRecordReader, + JobConf job, + int[] partitionColIndices, Object[] partitionValues) { VectorizedRowBatchIterator iterator = @@ -186,5 +214,4 @@ public void close() throws IOException { } }; } - } diff --git a/hive3/src/main/java/org/apache/iceberg/mr/hive/vector/ParquetSchemaFieldNameVisitor.java 
b/hive3/src/main/java/org/apache/iceberg/mr/hive/vector/ParquetSchemaFieldNameVisitor.java index 4cc5ceeaa935..e6e4ff1e04de 100644 --- a/hive3/src/main/java/org/apache/iceberg/mr/hive/vector/ParquetSchemaFieldNameVisitor.java +++ b/hive3/src/main/java/org/apache/iceberg/mr/hive/vector/ParquetSchemaFieldNameVisitor.java @@ -7,15 +7,15 @@ * "License"); you may not use this file except in compliance * with the License. You may obtain a copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. */ - package org.apache.iceberg.mr.hive.vector; import java.util.List; @@ -31,8 +31,9 @@ import org.apache.parquet.schema.Type; /** - * Collects the top level field names from Parquet schema. During schema visit it translates the expected schema's - * field names to what fields the visitor can match in the file schema to support column renames. + * Collects the top level field names from Parquet schema. During schema visit it translates the + * expected schema's field names to what fields the visitor can match in the file schema to support + * column renames. */ class ParquetSchemaFieldNameVisitor extends TypeWithSchemaVisitor { private final MessageType originalFileSchema; @@ -53,7 +54,8 @@ public Type message(Types.StructType expected, MessageType prunedFileSchema, Lis public Type struct(Types.StructType expected, GroupType struct, List fields) { boolean isMessageType = struct instanceof MessageType; - List expectedFields = expected != null ? expected.fields() : ImmutableList.of(); + List expectedFields = + expected != null ? expected.fields() : ImmutableList.of(); List types = Lists.newArrayListWithExpectedSize(expectedFields.size()); for (Types.NestedField field : expectedFields) { @@ -65,10 +67,12 @@ public Type struct(Types.StructType expected, GroupType struct, List field Type fieldInPrunedFileSchema = typesById.get(id); if (fieldInPrunedFileSchema == null) { if (!originalFileSchema.containsField(field.name())) { - // Must be a new field - it isn't in this parquet file yet, so add the new field name instead of null + // Must be a new field - it isn't in this parquet file yet, so add the new field name + // instead of null appendToColNamesList(isMessageType, field.name()); } else { - // This field is found in the parquet file with a different ID, so it must have been recreated since. + // This field is found in the parquet file with a different ID, so it must have been + // recreated since. // Inserting a dummy col name to force Hive Parquet reader returning null for this column. 
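The Parquet branch above prunes the file schema down to the expected Iceberg schema, assigning fallback IDs to files written without field IDs before pruning. A sketch of just that decision, assuming ParquetSchemaUtil comes from the iceberg-parquet module:

import org.apache.iceberg.Schema;
import org.apache.iceberg.parquet.ParquetSchemaUtil;
import org.apache.parquet.schema.MessageType;

class ParquetPruningSketch {
  // Files that already carry field IDs are pruned directly; older files get position-based
  // fallback IDs first, mirroring HiveVectorizedReader.parquetRecordReader().
  static MessageType pruneToExpected(MessageType fileSchema, Schema expectedSchema) {
    if (ParquetSchemaUtil.hasIds(fileSchema)) {
      return ParquetSchemaUtil.pruneColumns(fileSchema, expectedSchema);
    }
    return ParquetSchemaUtil.pruneColumnsFallback(
        ParquetSchemaUtil.addFallbackIds(fileSchema), expectedSchema);
  }
}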
appendToColNamesList(isMessageType, DUMMY_COL_NAME); } @@ -95,7 +99,8 @@ private void appendToColNamesList(boolean isMessageType, String colName) { } @Override - public Type primitive(org.apache.iceberg.types.Type.PrimitiveType expected, + public Type primitive( + org.apache.iceberg.types.Type.PrimitiveType expected, org.apache.parquet.schema.PrimitiveType primitive) { typesById.put(primitive.getId().intValue(), primitive); return primitive; diff --git a/hive3/src/main/java/org/apache/iceberg/mr/hive/vector/VectorizedRowBatchIterator.java b/hive3/src/main/java/org/apache/iceberg/mr/hive/vector/VectorizedRowBatchIterator.java index e29b1353cf4c..558a181cb0ad 100644 --- a/hive3/src/main/java/org/apache/iceberg/mr/hive/vector/VectorizedRowBatchIterator.java +++ b/hive3/src/main/java/org/apache/iceberg/mr/hive/vector/VectorizedRowBatchIterator.java @@ -7,15 +7,15 @@ * "License"); you may not use this file except in compliance * with the License. You may obtain a copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. */ - package org.apache.iceberg.mr.hive.vector; import java.io.IOException; @@ -39,8 +39,11 @@ public final class VectorizedRowBatchIterator implements CloseableIterator recordReader, JobConf job, - int[] partitionColIndices, Object[] partitionValues) { + VectorizedRowBatchIterator( + RecordReader recordReader, + JobConf job, + int[] partitionColIndices, + Object[] partitionValues) { this.recordReader = recordReader; this.key = recordReader.createKey(); this.batch = recordReader.createValue(); @@ -65,8 +68,11 @@ private void advance() { if (partitionColIndices != null) { for (int i = 0; i < partitionColIndices.length; ++i) { int colIdx = partitionColIndices[i]; - CompatibilityHiveVectorUtils.addPartitionColsToBatch(batch.cols[colIdx], partitionValues[i], - vrbCtx.getRowColumnNames()[colIdx], vrbCtx.getRowColumnTypeInfos()[colIdx]); + CompatibilityHiveVectorUtils.addPartitionColsToBatch( + batch.cols[colIdx], + partitionValues[i], + vrbCtx.getRowColumnNames()[colIdx], + vrbCtx.getRowColumnTypeInfos()[colIdx]); } } } catch (IOException ioe) { diff --git a/hive3/src/main/java/org/apache/iceberg/orc/VectorizedReadUtils.java b/hive3/src/main/java/org/apache/iceberg/orc/VectorizedReadUtils.java index 2a7fd8b457d5..70e6fc9d5a48 100644 --- a/hive3/src/main/java/org/apache/iceberg/orc/VectorizedReadUtils.java +++ b/hive3/src/main/java/org/apache/iceberg/orc/VectorizedReadUtils.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.orc; import java.io.IOException; @@ -26,17 +25,18 @@ import org.apache.orc.impl.ReaderImpl; /** - * Utilities that rely on Iceberg code from org.apache.iceberg.orc package and are required for ORC vectorization. 
+ * Utilities that rely on Iceberg code from org.apache.iceberg.orc package and are required for ORC + * vectorization. */ public class VectorizedReadUtils { - private VectorizedReadUtils() { - - } + private VectorizedReadUtils() {} /** * Opens the ORC inputFile and reads the metadata information to construct the OrcTail content. - * Unfortunately the API doesn't allow simple access to OrcTail, so we need the serialization trick. + * Unfortunately the API doesn't allow simple access to OrcTail, so we need the serialization + * trick. + * * @param inputFile - the ORC file * @param job - JobConf instance for the current task * @throws IOException - errors relating to accessing the ORC file @@ -46,7 +46,5 @@ public static OrcTail getOrcTail(InputFile inputFile, JobConf job) throws IOExce try (ReaderImpl orcFileReader = (ReaderImpl) ORC.newFileReader(inputFile, job)) { return ReaderImpl.extractFileTail(orcFileReader.getSerializedFileFooter()); } - } - } diff --git a/hive3/src/test/java/org/apache/iceberg/mr/hive/TestHiveSchemaUtilHive3.java b/hive3/src/test/java/org/apache/iceberg/mr/hive/TestHiveSchemaUtilHive3.java index 109723029efb..69a3dd39f8d1 100644 --- a/hive3/src/test/java/org/apache/iceberg/mr/hive/TestHiveSchemaUtilHive3.java +++ b/hive3/src/test/java/org/apache/iceberg/mr/hive/TestHiveSchemaUtilHive3.java @@ -16,9 +16,10 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.mr.hive; +import static org.apache.iceberg.types.Types.NestedField.optional; + import java.util.List; import org.apache.hadoop.hive.metastore.api.FieldSchema; import org.apache.hadoop.hive.serde.serdeConstants; @@ -27,8 +28,6 @@ import org.apache.iceberg.relocated.com.google.common.collect.Lists; import org.apache.iceberg.types.Types; -import static org.apache.iceberg.types.Types.NestedField.optional; - public class TestHiveSchemaUtilHive3 extends TestHiveSchemaUtil { @Override diff --git a/hive3/src/test/java/org/apache/iceberg/mr/hive/serde/objectinspector/TestIcebergDateObjectInspectorHive3.java b/hive3/src/test/java/org/apache/iceberg/mr/hive/serde/objectinspector/TestIcebergDateObjectInspectorHive3.java index ca48639eeadb..102fbd0e1e5c 100644 --- a/hive3/src/test/java/org/apache/iceberg/mr/hive/serde/objectinspector/TestIcebergDateObjectInspectorHive3.java +++ b/hive3/src/test/java/org/apache/iceberg/mr/hive/serde/objectinspector/TestIcebergDateObjectInspectorHive3.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.mr.hive.serde.objectinspector; import java.time.LocalDate; @@ -62,5 +61,4 @@ public void testIcebergDateObjectInspector() { Assert.assertFalse(oi.preferWritable()); } - } diff --git a/hive3/src/test/java/org/apache/iceberg/mr/hive/serde/objectinspector/TestIcebergTimestampObjectInspectorHive3.java b/hive3/src/test/java/org/apache/iceberg/mr/hive/serde/objectinspector/TestIcebergTimestampObjectInspectorHive3.java index 89ee0cd31167..661e1b148ee4 100644 --- a/hive3/src/test/java/org/apache/iceberg/mr/hive/serde/objectinspector/TestIcebergTimestampObjectInspectorHive3.java +++ b/hive3/src/test/java/org/apache/iceberg/mr/hive/serde/objectinspector/TestIcebergTimestampObjectInspectorHive3.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.mr.hive.serde.objectinspector; import java.time.Instant; @@ -37,7 +36,8 @@ public void testIcebergTimestampObjectInspector() { IcebergTimestampObjectInspectorHive3 oi = IcebergTimestampObjectInspectorHive3.get(); Assert.assertEquals(ObjectInspector.Category.PRIMITIVE, oi.getCategory()); - Assert.assertEquals(PrimitiveObjectInspector.PrimitiveCategory.TIMESTAMP, oi.getPrimitiveCategory()); + Assert.assertEquals( + PrimitiveObjectInspector.PrimitiveCategory.TIMESTAMP, oi.getPrimitiveCategory()); Assert.assertEquals(TypeInfoFactory.timestampTypeInfo, oi.getTypeInfo()); Assert.assertEquals(TypeInfoFactory.timestampTypeInfo.getTypeName(), oi.getTypeName()); @@ -51,7 +51,9 @@ public void testIcebergTimestampObjectInspector() { Assert.assertNull(oi.convert(null)); long epochMilli = 1601471970000L; - LocalDateTime local = LocalDateTime.ofInstant(Instant.ofEpochMilli(epochMilli), ZoneId.of("UTC")).plusNanos(34000); + LocalDateTime local = + LocalDateTime.ofInstant(Instant.ofEpochMilli(epochMilli), ZoneId.of("UTC")) + .plusNanos(34000); Timestamp ts = Timestamp.ofEpochMilli(epochMilli); ts.setNanos(34000); @@ -67,5 +69,4 @@ public void testIcebergTimestampObjectInspector() { Assert.assertEquals(local, oi.convert(ts)); } - } diff --git a/hive3/src/test/java/org/apache/iceberg/mr/hive/serde/objectinspector/TestIcebergTimestampWithZoneObjectInspectorHive3.java b/hive3/src/test/java/org/apache/iceberg/mr/hive/serde/objectinspector/TestIcebergTimestampWithZoneObjectInspectorHive3.java index bc4924577e1b..17e6047cfaae 100644 --- a/hive3/src/test/java/org/apache/iceberg/mr/hive/serde/objectinspector/TestIcebergTimestampWithZoneObjectInspectorHive3.java +++ b/hive3/src/test/java/org/apache/iceberg/mr/hive/serde/objectinspector/TestIcebergTimestampWithZoneObjectInspectorHive3.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.mr.hive.serde.objectinspector; import java.time.LocalDateTime; @@ -35,10 +34,12 @@ public class TestIcebergTimestampWithZoneObjectInspectorHive3 { @Test public void testIcebergTimestampLocalTZObjectInspector() { - IcebergTimestampWithZoneObjectInspectorHive3 oi = IcebergTimestampWithZoneObjectInspectorHive3.get(); + IcebergTimestampWithZoneObjectInspectorHive3 oi = + IcebergTimestampWithZoneObjectInspectorHive3.get(); Assert.assertEquals(ObjectInspector.Category.PRIMITIVE, oi.getCategory()); - Assert.assertEquals(PrimitiveObjectInspector.PrimitiveCategory.TIMESTAMPLOCALTZ, oi.getPrimitiveCategory()); + Assert.assertEquals( + PrimitiveObjectInspector.PrimitiveCategory.TIMESTAMPLOCALTZ, oi.getPrimitiveCategory()); Assert.assertEquals(TypeInfoFactory.timestampLocalTZTypeInfo, oi.getTypeInfo()); Assert.assertEquals(TypeInfoFactory.timestampLocalTZTypeInfo.getTypeName(), oi.getTypeName()); @@ -52,16 +53,19 @@ public void testIcebergTimestampLocalTZObjectInspector() { Assert.assertNull(oi.convert(null)); LocalDateTime dateTimeAtUTC = LocalDateTime.of(2020, 12, 10, 15, 55, 20, 30000); - OffsetDateTime offsetDateTime = OffsetDateTime.of(dateTimeAtUTC.plusHours(4), ZoneOffset.ofHours(4)); + OffsetDateTime offsetDateTime = + OffsetDateTime.of(dateTimeAtUTC.plusHours(4), ZoneOffset.ofHours(4)); TimestampTZ ts = new TimestampTZ(dateTimeAtUTC.atZone(ZoneId.of("UTC"))); Assert.assertEquals(ts, oi.getPrimitiveJavaObject(offsetDateTime)); - Assert.assertEquals(new TimestampLocalTZWritable(ts), oi.getPrimitiveWritableObject(offsetDateTime)); + Assert.assertEquals( + new TimestampLocalTZWritable(ts), oi.getPrimitiveWritableObject(offsetDateTime)); // try with another offset as well offsetDateTime = OffsetDateTime.of(dateTimeAtUTC.plusHours(11), ZoneOffset.ofHours(11)); Assert.assertEquals(ts, oi.getPrimitiveJavaObject(offsetDateTime)); - Assert.assertEquals(new TimestampLocalTZWritable(ts), oi.getPrimitiveWritableObject(offsetDateTime)); + Assert.assertEquals( + new TimestampLocalTZWritable(ts), oi.getPrimitiveWritableObject(offsetDateTime)); TimestampTZ copy = (TimestampTZ) oi.copyObject(ts); @@ -72,5 +76,4 @@ public void testIcebergTimestampLocalTZObjectInspector() { Assert.assertEquals(OffsetDateTime.of(dateTimeAtUTC, ZoneOffset.UTC), oi.convert(ts)); } - } diff --git a/mr/src/main/java/org/apache/hadoop/hive/ql/exec/vector/VectorizedSupport.java b/mr/src/main/java/org/apache/hadoop/hive/ql/exec/vector/VectorizedSupport.java index 44711fbebd77..aa3ae709f69c 100644 --- a/mr/src/main/java/org/apache/hadoop/hive/ql/exec/vector/VectorizedSupport.java +++ b/mr/src/main/java/org/apache/hadoop/hive/ql/exec/vector/VectorizedSupport.java @@ -16,26 +16,25 @@ * specific language governing permissions and limitations * under the License. 
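The timestamptz inspector test above expects two OffsetDateTime values with different offsets but the same instant to map to the same Hive TimestampTZ. The java.time behaviour it relies on can be checked in isolation with a JDK-only sketch using the test's values:

import java.time.LocalDateTime;
import java.time.OffsetDateTime;
import java.time.ZoneOffset;

class OffsetNormalizationSketch {
  public static void main(String[] args) {
    LocalDateTime atUtc = LocalDateTime.of(2020, 12, 10, 15, 55, 20, 30000);
    // The same instant expressed at +04:00 and at +11:00.
    OffsetDateTime plus4 = OffsetDateTime.of(atUtc.plusHours(4), ZoneOffset.ofHours(4));
    OffsetDateTime plus11 = OffsetDateTime.of(atUtc.plusHours(11), ZoneOffset.ofHours(11));
    // Both normalize to the same UTC instant, which is why the inspector treats them as equal.
    System.out.println(plus4.toInstant().equals(plus11.toInstant())); // prints true
  }
}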
*/ - package org.apache.hadoop.hive.ql.exec.vector; import java.util.Map; import org.apache.iceberg.relocated.com.google.common.collect.Maps; -/** - * Copied here from Hive for compatibility - */ +/** Copied here from Hive for compatibility */ @SuppressWarnings("VisibilityModifier") public class VectorizedSupport { public enum Support { DECIMAL_64; final String lowerCaseName; + Support() { this.lowerCaseName = name().toLowerCase(); } public static final Map nameToSupportMap = Maps.newHashMap(); + static { for (Support support : values()) { nameToSupportMap.put(support.lowerCaseName, support); diff --git a/mr/src/main/java/org/apache/iceberg/mr/Catalogs.java b/mr/src/main/java/org/apache/iceberg/mr/Catalogs.java index e15421bbc5cf..526f340165a5 100644 --- a/mr/src/main/java/org/apache/iceberg/mr/Catalogs.java +++ b/mr/src/main/java/org/apache/iceberg/mr/Catalogs.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.mr; import java.util.Map; @@ -43,27 +42,30 @@ /** * Class for catalog resolution and accessing the common functions for {@link Catalog} API. - *

- * If the catalog name is provided, get the catalog type from iceberg.catalog.catalogName.type config.
- * In case the catalog name is {@link #ICEBERG_HADOOP_TABLE_NAME location_based_table}, - * type is ignored and tables will be loaded using {@link HadoopTables}.
- * In case the value of catalog type is null, iceberg.catalog.catalogName.catalog-impl config - * is used to determine the catalog implementation class.
- * If catalog name is null, get the catalog type from {@link InputFormatConfig#CATALOG iceberg.mr.catalog} config:
+ * If the catalog name is provided, get the catalog type from iceberg.catalog.catalogName + * .type config.
+ * In case the catalog name is {@link #ICEBERG_HADOOP_TABLE_NAME location_based_table}, type is + * ignored and tables will be loaded using {@link HadoopTables}.
+ * In case the value of catalog type is null, iceberg.catalog.catalogName + * .catalog-impl config is used to determine the catalog implementation class.
+ * If catalog name is null, get the catalog type from {@link InputFormatConfig#CATALOG + * iceberg.mr.catalog} config:
- * • hive: HiveCatalog • location: HadoopTables • hadoop: HadoopCatalog
+ * • hive: HiveCatalog • location: HadoopTables • hadoop: HadoopCatalog
- * In case the value of catalog type is null, - * {@link InputFormatConfig#CATALOG_LOADER_CLASS iceberg.mr.catalog.loader.class} is used to determine - * the catalog implementation class.
- * Note: null catalog name mode is only supported for backwards compatibility. Using this mode is NOT RECOMMENDED.
+ * In case the value of catalog type is null, {@link InputFormatConfig#CATALOG_LOADER_CLASS + * iceberg.mr.catalog.loader.class} is used to determine the catalog implementation class.
+ *
    Note: null catalog name mode is only supported for backwards compatibility. Using this mode is + * NOT RECOMMENDED. */ public final class Catalogs { @@ -74,40 +76,53 @@ public final class Catalogs { private static final String NO_CATALOG_TYPE = "no catalog"; private static final Set PROPERTIES_TO_REMOVE = - ImmutableSet.of(InputFormatConfig.TABLE_SCHEMA, InputFormatConfig.PARTITION_SPEC, LOCATION, NAME, - InputFormatConfig.CATALOG_NAME); + ImmutableSet.of( + InputFormatConfig.TABLE_SCHEMA, + InputFormatConfig.PARTITION_SPEC, + LOCATION, + NAME, + InputFormatConfig.CATALOG_NAME); - private Catalogs() { - } + private Catalogs() {} /** - * Load an Iceberg table using the catalog and table identifier (or table path) specified by the configuration. + * Load an Iceberg table using the catalog and table identifier (or table path) specified by the + * configuration. + * * @param conf a Hadoop conf * @return an Iceberg table */ public static Table loadTable(Configuration conf) { - return loadTable(conf, conf.get(InputFormatConfig.TABLE_IDENTIFIER), conf.get(InputFormatConfig.TABLE_LOCATION), - conf.get(InputFormatConfig.CATALOG_NAME)); + return loadTable( + conf, + conf.get(InputFormatConfig.TABLE_IDENTIFIER), + conf.get(InputFormatConfig.TABLE_LOCATION), + conf.get(InputFormatConfig.CATALOG_NAME)); } /** * Load an Iceberg table using the catalog specified by the configuration. - *

- * The table identifier ({@link Catalogs#NAME}) and the catalog name ({@link InputFormatConfig#CATALOG_NAME}), - * or table path ({@link Catalogs#LOCATION}) should be specified by the controlling properties.
- * Used by HiveIcebergSerDe and HiveIcebergStorageHandler
+ * The table identifier ({@link Catalogs#NAME}) and the catalog name ({@link + * InputFormatConfig#CATALOG_NAME}), or table path ({@link Catalogs#LOCATION}) should be specified + * by the controlling properties.
+ *
    Used by HiveIcebergSerDe and HiveIcebergStorageHandler + * * @param conf a Hadoop * @param props the controlling properties * @return an Iceberg table */ public static Table loadTable(Configuration conf, Properties props) { - return loadTable(conf, props.getProperty(NAME), props.getProperty(LOCATION), - props.getProperty(InputFormatConfig.CATALOG_NAME)); + return loadTable( + conf, + props.getProperty(NAME), + props.getProperty(LOCATION), + props.getProperty(InputFormatConfig.CATALOG_NAME)); } - private static Table loadTable(Configuration conf, String tableIdentifier, String tableLocation, - String catalogName) { + private static Table loadTable( + Configuration conf, String tableIdentifier, String tableLocation, String catalogName) { Optional catalog = loadCatalog(conf, catalogName); if (catalog.isPresent()) { @@ -121,16 +136,20 @@ private static Table loadTable(Configuration conf, String tableIdentifier, Strin /** * Creates an Iceberg table using the catalog specified by the configuration. - *

- * The properties should contain the following values:
+ * The properties should contain the following values:
- * • Table identifier ({@link Catalogs#NAME}) or table path ({@link Catalogs#LOCATION}) is required
- * • Table schema ({@link InputFormatConfig#TABLE_SCHEMA}) is required
- * • Partition specification ({@link InputFormatConfig#PARTITION_SPEC}) is optional. Table will be unpartitioned if - * not provided
- * Other properties will be handled over to the Table creation. The controlling properties above will not be - * propagated.
+ * • Table identifier ({@link Catalogs#NAME}) or table path ({@link Catalogs#LOCATION}) is + * required
+ * • Table schema ({@link InputFormatConfig#TABLE_SCHEMA}) is required
+ * • Partition specification ({@link InputFormatConfig#PARTITION_SPEC}) is optional. Table + * will be unpartitioned if not provided
+ *
    Other properties will be handled over to the Table creation. The controlling properties + * above will not be propagated. + * * @param conf a Hadoop conf * @param props the controlling properties * @return the created Iceberg table @@ -171,9 +190,10 @@ public static Table createTable(Configuration conf, Properties props) { /** * Drops an Iceberg table using the catalog specified by the configuration. - *

- * The table identifier ({@link Catalogs#NAME}) or table path ({@link Catalogs#LOCATION}) should be specified by - * the controlling properties.
+ *
    The table identifier ({@link Catalogs#NAME}) or table path ({@link Catalogs#LOCATION}) + * should be specified by the controlling properties. + * * @param conf a Hadoop conf * @param props the controlling properties * @return the created Iceberg table @@ -196,6 +216,7 @@ public static boolean dropTable(Configuration conf, Properties props) { /** * Returns true if HiveCatalog is used + * * @param conf a Hadoop conf * @param props the controlling properties * @return true if the Catalog is HiveCatalog @@ -210,7 +231,8 @@ public static boolean hiveCatalog(Configuration conf, Properties props) { if (catalogType != null) { return CatalogUtil.ICEBERG_CATALOG_TYPE_HIVE.equalsIgnoreCase(catalogType); } - return getCatalogProperties(conf, catalogName, catalogType).get(CatalogProperties.CATALOG_IMPL) == null; + return getCatalogProperties(conf, catalogName, catalogType).get(CatalogProperties.CATALOG_IMPL) + == null; } @VisibleForTesting @@ -220,37 +242,44 @@ static Optional loadCatalog(Configuration conf, String catalogName) { return Optional.empty(); } else { String name = catalogName == null ? ICEBERG_DEFAULT_CATALOG_NAME : catalogName; - return Optional.of(CatalogUtil.buildIcebergCatalog(name, - getCatalogProperties(conf, name, catalogType), conf)); + return Optional.of( + CatalogUtil.buildIcebergCatalog( + name, getCatalogProperties(conf, name, catalogType), conf)); } } /** * Collect all the catalog specific configuration from the global hive configuration. + * * @param conf a Hadoop configuration * @param catalogName name of the catalog * @param catalogType type of the catalog * @return complete map of catalog properties */ - private static Map getCatalogProperties(Configuration conf, String catalogName, String catalogType) { + private static Map getCatalogProperties( + Configuration conf, String catalogName, String catalogType) { String keyPrefix = InputFormatConfig.CATALOG_CONFIG_PREFIX + catalogName; - Map catalogProperties = Streams.stream(conf.iterator()) + Map catalogProperties = + Streams.stream(conf.iterator()) .filter(e -> e.getKey().startsWith(keyPrefix)) - .collect(Collectors.toMap(e -> e.getKey().substring(keyPrefix.length() + 1), Map.Entry::getValue)); + .collect( + Collectors.toMap( + e -> e.getKey().substring(keyPrefix.length() + 1), Map.Entry::getValue)); return addCatalogPropertiesIfMissing(conf, catalogType, catalogProperties); } /** - * This method is used for backward-compatible catalog configuration. - * Collect all the catalog specific configuration from the global hive configuration. - * Note: this should be removed when the old catalog configuration is deprecated. + * This method is used for backward-compatible catalog configuration. Collect all the catalog + * specific configuration from the global hive configuration. Note: this should be removed when + * the old catalog configuration is deprecated. + * * @param conf global hive configuration * @param catalogType type of the catalog * @param catalogProperties pre-populated catalog properties * @return complete map of catalog properties */ - private static Map addCatalogPropertiesIfMissing(Configuration conf, String catalogType, - Map catalogProperties) { + private static Map addCatalogPropertiesIfMissing( + Configuration conf, String catalogType, Map catalogProperties) { if (catalogType != null) { catalogProperties.putIfAbsent(CatalogUtil.ICEBERG_CATALOG_TYPE, catalogType); } @@ -269,8 +298,8 @@ private static Map addCatalogPropertiesIfMissing(Configuration c /** * Return the catalog type based on the catalog name. 
- * See {@link Catalogs} documentation for catalog type resolution strategy.
+ *
    See {@link Catalogs} documentation for catalog type resolution strategy. * * @param conf global hive configuration * @param catalogName name of the catalog @@ -278,8 +307,10 @@ private static Map addCatalogPropertiesIfMissing(Configuration c */ private static String getCatalogType(Configuration conf, String catalogName) { if (catalogName != null) { - String catalogType = conf.get(InputFormatConfig.catalogPropertyConfigKey( - catalogName, CatalogUtil.ICEBERG_CATALOG_TYPE)); + String catalogType = + conf.get( + InputFormatConfig.catalogPropertyConfigKey( + catalogName, CatalogUtil.ICEBERG_CATALOG_TYPE)); if (catalogName.equals(ICEBERG_HADOOP_TABLE_NAME)) { return NO_CATALOG_TYPE; } else { diff --git a/mr/src/main/java/org/apache/iceberg/mr/InputFormatConfig.java b/mr/src/main/java/org/apache/iceberg/mr/InputFormatConfig.java index 02d68a3ddd94..a53fabe08318 100644 --- a/mr/src/main/java/org/apache/iceberg/mr/InputFormatConfig.java +++ b/mr/src/main/java/org/apache/iceberg/mr/InputFormatConfig.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.mr; import java.util.List; @@ -29,8 +28,7 @@ public class InputFormatConfig { - private InputFormatConfig() { - } + private InputFormatConfig() {} // configuration values for Iceberg input formats public static final String REUSE_CONTAINERS = "iceberg.mr.reuse.containers"; @@ -51,37 +49,39 @@ private InputFormatConfig() { public static final String LOCALITY = "iceberg.mr.locality"; /** - * @deprecated please use {@link #catalogPropertyConfigKey(String, String)} - * with config key {@link org.apache.iceberg.CatalogUtil#ICEBERG_CATALOG_TYPE} to specify the type of a catalog. + * @deprecated please use {@link #catalogPropertyConfigKey(String, String)} with config key {@link + * org.apache.iceberg.CatalogUtil#ICEBERG_CATALOG_TYPE} to specify the type of a catalog. */ - @Deprecated - public static final String CATALOG = "iceberg.mr.catalog"; + @Deprecated public static final String CATALOG = "iceberg.mr.catalog"; /** - * @deprecated please use {@link #catalogPropertyConfigKey(String, String)} - * with config key {@link org.apache.iceberg.CatalogProperties#WAREHOUSE_LOCATION} - * to specify the warehouse location of a catalog. + * @deprecated please use {@link #catalogPropertyConfigKey(String, String)} with config key {@link + * org.apache.iceberg.CatalogProperties#WAREHOUSE_LOCATION} to specify the warehouse location + * of a catalog. */ @Deprecated - public static final String HADOOP_CATALOG_WAREHOUSE_LOCATION = "iceberg.mr.catalog.hadoop.warehouse.location"; + public static final String HADOOP_CATALOG_WAREHOUSE_LOCATION = + "iceberg.mr.catalog.hadoop.warehouse.location"; /** - * @deprecated please use {@link #catalogPropertyConfigKey(String, String)} - * with config key {@link org.apache.iceberg.CatalogProperties#CATALOG_IMPL} - * to specify the implementation of a catalog. + * @deprecated please use {@link #catalogPropertyConfigKey(String, String)} with config key {@link + * org.apache.iceberg.CatalogProperties#CATALOG_IMPL} to specify the implementation of a + * catalog. 
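The deprecation notes above all redirect to catalogPropertyConfigKey(), which builds per-catalog iceberg.catalog.<name>.* keys instead of the old global iceberg.mr.* ones. A hedged sketch of that configuration style combined with Catalogs.loadTable(); the catalog name, table identifier and warehouse path are made up for illustration:

import java.util.Properties;
import org.apache.hadoop.conf.Configuration;
import org.apache.iceberg.CatalogProperties;
import org.apache.iceberg.CatalogUtil;
import org.apache.iceberg.Table;
import org.apache.iceberg.mr.Catalogs;
import org.apache.iceberg.mr.InputFormatConfig;

class CatalogConfigSketch {
  static Table loadFromHadoopCatalog(Configuration conf) {
    String catalogName = "examples"; // hypothetical catalog name
    // Register the catalog type and warehouse location under the per-catalog key namespace.
    conf.set(
        InputFormatConfig.catalogPropertyConfigKey(catalogName, CatalogUtil.ICEBERG_CATALOG_TYPE),
        "hadoop");
    conf.set(
        InputFormatConfig.catalogPropertyConfigKey(
            catalogName, CatalogProperties.WAREHOUSE_LOCATION),
        "file:///tmp/iceberg-warehouse"); // hypothetical warehouse path
    // Point the load at a table identifier and the catalog configured above.
    Properties props = new Properties();
    props.setProperty(Catalogs.NAME, "default.customers"); // hypothetical table identifier
    props.setProperty(InputFormatConfig.CATALOG_NAME, catalogName);
    return Catalogs.loadTable(conf, props);
  }
}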
*/ - @Deprecated - public static final String CATALOG_LOADER_CLASS = "iceberg.mr.catalog.loader.class"; + @Deprecated public static final String CATALOG_LOADER_CLASS = "iceberg.mr.catalog.loader.class"; public static final String SELECTED_COLUMNS = "iceberg.mr.selected.columns"; public static final String EXTERNAL_TABLE_PURGE = "external.table.purge"; - public static final String CONFIG_SERIALIZATION_DISABLED = "iceberg.mr.config.serialization.disabled"; + public static final String CONFIG_SERIALIZATION_DISABLED = + "iceberg.mr.config.serialization.disabled"; public static final boolean CONFIG_SERIALIZATION_DISABLED_DEFAULT = false; public static final String OUTPUT_TABLES = "iceberg.mr.output.tables"; - public static final String COMMIT_TABLE_THREAD_POOL_SIZE = "iceberg.mr.commit.table.thread.pool.size"; + public static final String COMMIT_TABLE_THREAD_POOL_SIZE = + "iceberg.mr.commit.table.thread.pool.size"; public static final int COMMIT_TABLE_THREAD_POOL_SIZE_DEFAULT = 10; - public static final String COMMIT_FILE_THREAD_POOL_SIZE = "iceberg.mr.commit.file.thread.pool.size"; + public static final String COMMIT_FILE_THREAD_POOL_SIZE = + "iceberg.mr.commit.file.thread.pool.size"; public static final int COMMIT_FILE_THREAD_POOL_SIZE_DEFAULT = 10; public static final String WRITE_TARGET_FILE_SIZE = "iceberg.mr.write.target.file.size"; @@ -180,9 +180,7 @@ public ConfigBuilder splitSize(long splitSize) { return this; } - /** - * If this API is called. The input splits constructed will have host location information - */ + /** If this API is called. The input splits constructed will have host location information */ public ConfigBuilder preferLocality() { conf.setBoolean(LOCALITY, true); return this; @@ -199,10 +197,11 @@ public ConfigBuilder usePigTuples() { } /** - * Compute platforms pass down filters to data sources. If the data source cannot apply some filters, or only - * partially applies the filter, it will return the residual filter back. If the platform can correctly apply the - * residual filters, then it should call this api. Otherwise the current api will throw an exception if the passed - * in filter is not completely satisfied. + * Compute platforms pass down filters to data sources. If the data source cannot apply some + * filters, or only partially applies the filter, it will return the residual filter back. If + * the platform can correctly apply the residual filters, then it should call this api. + * Otherwise the current api will throw an exception if the passed in filter is not completely + * satisfied. */ public ConfigBuilder skipResidualFiltering() { conf.setBoolean(InputFormatConfig.SKIP_RESIDUAL_FILTERING, true); @@ -225,10 +224,10 @@ public static String[] selectedColumns(Configuration conf) { /** * Get Hadoop config key of a catalog property based on catalog name + * * @param catalogName catalog name - * @param catalogProperty catalog property, can be any custom property, - * a commonly used list of properties can be found - * at {@link org.apache.iceberg.CatalogProperties} + * @param catalogProperty catalog property, can be any custom property, a commonly used list of + * properties can be found at {@link org.apache.iceberg.CatalogProperties} * @return Hadoop config key of a catalog property for the catalog name */ public static String catalogPropertyConfigKey(String catalogName, String catalogProperty) { @@ -239,5 +238,4 @@ private static Schema schema(Configuration conf, String key) { String json = conf.get(key); return json == null ? 
null : SchemaParser.fromJson(json); } - } diff --git a/mr/src/main/java/org/apache/iceberg/mr/hive/Deserializer.java b/mr/src/main/java/org/apache/iceberg/mr/hive/Deserializer.java index f7221f09e078..62ce4930cc8a 100644 --- a/mr/src/main/java/org/apache/iceberg/mr/hive/Deserializer.java +++ b/mr/src/main/java/org/apache/iceberg/mr/hive/Deserializer.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.mr.hive; import java.util.List; @@ -41,13 +40,12 @@ import org.apache.iceberg.types.Types.NestedField; import org.apache.iceberg.types.Types.StructType; - class Deserializer { private FieldDeserializer fieldDeserializer; /** - * Builder to create a Deserializer instance. - * Requires an Iceberg Schema and the Hive ObjectInspector for converting the data. + * Builder to create a Deserializer instance. Requires an Iceberg Schema and the Hive + * ObjectInspector for converting the data. */ static class Builder { private Schema schema; @@ -76,6 +74,7 @@ Deserializer build() { /** * Deserializes the Hive result object to an Iceberg record using the provided ObjectInspectors. + * * @param data The Hive data to deserialize * @return The resulting Iceberg Record */ @@ -87,20 +86,26 @@ private Deserializer(Schema schema, ObjectInspectorPair pair) { this.fieldDeserializer = DeserializerVisitor.visit(schema, pair); } - private static class DeserializerVisitor extends SchemaWithPartnerVisitor { + private static class DeserializerVisitor + extends SchemaWithPartnerVisitor { public static FieldDeserializer visit(Schema schema, ObjectInspectorPair pair) { - return visit(schema, new FixNameMappingObjectInspectorPair(schema, pair), new DeserializerVisitor(), + return visit( + schema, + new FixNameMappingObjectInspectorPair(schema, pair), + new DeserializerVisitor(), new PartnerObjectInspectorByNameAccessors()); } @Override - public FieldDeserializer schema(Schema schema, ObjectInspectorPair pair, FieldDeserializer deserializer) { + public FieldDeserializer schema( + Schema schema, ObjectInspectorPair pair, FieldDeserializer deserializer) { return deserializer; } @Override - public FieldDeserializer field(NestedField field, ObjectInspectorPair pair, FieldDeserializer deserializer) { + public FieldDeserializer field( + NestedField field, ObjectInspectorPair pair, FieldDeserializer deserializer) { return deserializer; } @@ -125,7 +130,8 @@ public FieldDeserializer primitive(PrimitiveType type, ObjectInspectorPair pair) } @Override - public FieldDeserializer struct(StructType type, ObjectInspectorPair pair, List deserializers) { + public FieldDeserializer struct( + StructType type, ObjectInspectorPair pair, List deserializers) { Preconditions.checkNotNull(type, "Can not create reader for null type"); GenericRecord template = GenericRecord.create(type); return o -> { @@ -133,8 +139,10 @@ public FieldDeserializer struct(StructType type, ObjectInspectorPair pair, List< return null; } - List data = ((StructObjectInspector) pair.sourceInspector()).getStructFieldsDataAsList(o); - // GenericRecord.copy() is more performant then GenericRecord.create(StructType) since NAME_MAP_CACHE access + List data = + ((StructObjectInspector) pair.sourceInspector()).getStructFieldsDataAsList(o); + // GenericRecord.copy() is more performant then GenericRecord.create(StructType) since + // NAME_MAP_CACHE access // is eliminated. Using copy here to gain performance. 
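The comment above explains why the struct deserializer copies a pre-built template record rather than calling GenericRecord.create() for every row. A small sketch of that template pattern, assuming the generic data classes from org.apache.iceberg.data:

import org.apache.iceberg.data.GenericRecord;
import org.apache.iceberg.data.Record;
import org.apache.iceberg.types.Types;

class RecordTemplateSketch {
  private final GenericRecord template;

  RecordTemplateSketch(Types.StructType type) {
    // Built once per struct type; this is where the name-map setup cost is paid.
    this.template = GenericRecord.create(type);
  }

  Record newRow() {
    // Cheap per-row copy that avoids repeating the per-type setup.
    return template.copy();
  }
}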
Record result = template.copy(); @@ -152,7 +160,8 @@ public FieldDeserializer struct(StructType type, ObjectInspectorPair pair, List< } @Override - public FieldDeserializer list(ListType listTypeInfo, ObjectInspectorPair pair, FieldDeserializer deserializer) { + public FieldDeserializer list( + ListType listTypeInfo, ObjectInspectorPair pair, FieldDeserializer deserializer) { return o -> { if (o == null) { return null; @@ -170,8 +179,11 @@ public FieldDeserializer list(ListType listTypeInfo, ObjectInspectorPair pair, F } @Override - public FieldDeserializer map(MapType mapType, ObjectInspectorPair pair, FieldDeserializer keyDeserializer, - FieldDeserializer valueDeserializer) { + public FieldDeserializer map( + MapType mapType, + ObjectInspectorPair pair, + FieldDeserializer keyDeserializer, + FieldDeserializer valueDeserializer) { return o -> { if (o == null) { return null; @@ -181,7 +193,8 @@ public FieldDeserializer map(MapType mapType, ObjectInspectorPair pair, FieldDes MapObjectInspector mapObjectInspector = (MapObjectInspector) pair.sourceInspector(); for (Map.Entry entry : mapObjectInspector.getMap(o).entrySet()) { - result.put(keyDeserializer.value(entry.getKey()), valueDeserializer.value(entry.getValue())); + result.put( + keyDeserializer.value(entry.getKey()), valueDeserializer.value(entry.getValue())); } return result; }; @@ -195,8 +208,12 @@ private static class PartnerObjectInspectorByNameAccessors public ObjectInspectorPair fieldPartner(ObjectInspectorPair pair, int fieldId, String name) { String sourceName = pair.sourceName(name); return new ObjectInspectorPair( - ((StructObjectInspector) pair.writerInspector()).getStructFieldRef(name).getFieldObjectInspector(), - ((StructObjectInspector) pair.sourceInspector()).getStructFieldRef(sourceName).getFieldObjectInspector()); + ((StructObjectInspector) pair.writerInspector()) + .getStructFieldRef(name) + .getFieldObjectInspector(), + ((StructObjectInspector) pair.sourceInspector()) + .getStructFieldRef(sourceName) + .getFieldObjectInspector()); } @Override @@ -226,10 +243,10 @@ private interface FieldDeserializer { } /** - * Hive query results schema column names do not match the target Iceberg column names. - * Instead we have to rely on the column order. To keep the other parts of the code generic we fix this with a - * wrapper around the ObjectInspectorPair. This wrapper maps the Iceberg schema column names instead of the Hive - * column names. + * Hive query results schema column names do not match the target Iceberg column names. Instead we + * have to rely on the column order. To keep the other parts of the code generic we fix this with + * a wrapper around the ObjectInspectorPair. This wrapper maps the Iceberg schema column names + * instead of the Hive column names. */ private static class FixNameMappingObjectInspectorPair extends ObjectInspectorPair { private final Map sourceNameMap; @@ -239,7 +256,8 @@ private static class FixNameMappingObjectInspectorPair extends ObjectInspectorPa this.sourceNameMap = Maps.newHashMapWithExpectedSize(schema.columns().size()); - List fields = ((StructObjectInspector) sourceInspector()).getAllStructFieldRefs(); + List fields = + ((StructObjectInspector) sourceInspector()).getAllStructFieldRefs(); for (int i = 0; i < schema.columns().size(); ++i) { sourceNameMap.put(schema.columns().get(i).name(), fields.get(i).getFieldName()); } @@ -253,12 +271,12 @@ String sourceName(String originalName) { /** * To get the data for Iceberg {@link Record}s we have to use both ObjectInspectors. - *

<p>
- * We use the Hive ObjectInspectors (sourceInspector) to get the Hive primitive types. - * <p>
    - * We use the Iceberg ObjectInspectors (writerInspector) only if conversion is needed for - * generating the correct type for Iceberg Records. See: {@link WriteObjectInspector} interface on the provided - * writerInspector. + * + *

<p>We use the Hive ObjectInspectors (sourceInspector) to get the Hive primitive types. + * + * <p>
    We use the Iceberg ObjectInspectors (writerInspector) only if conversion is needed for + * generating the correct type for Iceberg Records. See: {@link WriteObjectInspector} interface on + * the provided writerInspector. */ private static class ObjectInspectorPair { private ObjectInspector writerInspector; diff --git a/mr/src/main/java/org/apache/iceberg/mr/hive/HiveIcebergFilterFactory.java b/mr/src/main/java/org/apache/iceberg/mr/hive/HiveIcebergFilterFactory.java index 1004afc8c6e6..87d64066b287 100644 --- a/mr/src/main/java/org/apache/iceberg/mr/hive/HiveIcebergFilterFactory.java +++ b/mr/src/main/java/org/apache/iceberg/mr/hive/HiveIcebergFilterFactory.java @@ -16,9 +16,19 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.mr.hive; +import static org.apache.iceberg.expressions.Expressions.and; +import static org.apache.iceberg.expressions.Expressions.equal; +import static org.apache.iceberg.expressions.Expressions.greaterThanOrEqual; +import static org.apache.iceberg.expressions.Expressions.in; +import static org.apache.iceberg.expressions.Expressions.isNaN; +import static org.apache.iceberg.expressions.Expressions.isNull; +import static org.apache.iceberg.expressions.Expressions.lessThan; +import static org.apache.iceberg.expressions.Expressions.lessThanOrEqual; +import static org.apache.iceberg.expressions.Expressions.not; +import static org.apache.iceberg.expressions.Expressions.or; + import java.math.BigDecimal; import java.sql.Date; import java.sql.Timestamp; @@ -35,28 +45,18 @@ import org.apache.iceberg.util.DateTimeUtil; import org.apache.iceberg.util.NaNUtil; -import static org.apache.iceberg.expressions.Expressions.and; -import static org.apache.iceberg.expressions.Expressions.equal; -import static org.apache.iceberg.expressions.Expressions.greaterThanOrEqual; -import static org.apache.iceberg.expressions.Expressions.in; -import static org.apache.iceberg.expressions.Expressions.isNaN; -import static org.apache.iceberg.expressions.Expressions.isNull; -import static org.apache.iceberg.expressions.Expressions.lessThan; -import static org.apache.iceberg.expressions.Expressions.lessThanOrEqual; -import static org.apache.iceberg.expressions.Expressions.not; -import static org.apache.iceberg.expressions.Expressions.or; - public class HiveIcebergFilterFactory { - private HiveIcebergFilterFactory() { - } + private HiveIcebergFilterFactory() {} public static Expression generateFilterExpression(SearchArgument sarg) { return translate(sarg.getExpression(), sarg.getLeaves()); } /** - * Recursive method to traverse down the ExpressionTree to evaluate each expression and its leaf nodes. + * Recursive method to traverse down the ExpressionTree to evaluate each expression and its leaf + * nodes. + * * @param tree Current ExpressionTree where the 'top' node is being evaluated. * @param leaves List of all leaf nodes within the tree. * @return Expression that is translated from the Hive SearchArgument. @@ -89,6 +89,7 @@ private static Expression translate(ExpressionTree tree, List lea /** * Translate leaf nodes from Hive operator to Iceberg operator. 
+ * * @param leaf Leaf node * @return Expression fully translated from Hive PredicateLeaf */ @@ -106,8 +107,9 @@ private static Expression translateLeaf(PredicateLeaf leaf) { return in(column, leafToLiteralList(leaf)); case BETWEEN: List icebergLiterals = leafToLiteralList(leaf); - return and(greaterThanOrEqual(column, icebergLiterals.get(0)), - lessThanOrEqual(column, icebergLiterals.get(1))); + return and( + greaterThanOrEqual(column, icebergLiterals.get(0)), + lessThanOrEqual(column, icebergLiterals.get(1))); case IS_NULL: return isNull(column); default: @@ -115,11 +117,12 @@ private static Expression translateLeaf(PredicateLeaf leaf) { } } - // PredicateLeafImpl has a work-around for Kryo serialization with java.util.Date objects where it converts values to - // Timestamp using Date#getTime. This conversion discards microseconds, so this is a necessary to avoid it. - private static final DynFields.UnboundField LITERAL_FIELD = DynFields.builder() - .hiddenImpl(SearchArgumentImpl.PredicateLeafImpl.class, "literal") - .build(); + // PredicateLeafImpl has a work-around for Kryo serialization with java.util.Date objects where it + // converts values to + // Timestamp using Date#getTime. This conversion discards microseconds, so this is a necessary to + // avoid it. + private static final DynFields.UnboundField LITERAL_FIELD = + DynFields.builder().hiddenImpl(SearchArgumentImpl.PredicateLeafImpl.class, "literal").build(); private static Object leafToLiteral(PredicateLeaf leaf) { switch (leaf.getType()) { @@ -151,36 +154,44 @@ private static List leafToLiteralList(PredicateLeaf leaf) { case STRING: return leaf.getLiteralList(); case DATE: - return leaf.getLiteralList().stream().map(value -> daysFromDate((Date) value)) - .collect(Collectors.toList()); + return leaf.getLiteralList().stream() + .map(value -> daysFromDate((Date) value)) + .collect(Collectors.toList()); case DECIMAL: return leaf.getLiteralList().stream() - .map(value -> hiveDecimalToBigDecimal((HiveDecimalWritable) value)) - .collect(Collectors.toList()); + .map(value -> hiveDecimalToBigDecimal((HiveDecimalWritable) value)) + .collect(Collectors.toList()); case TIMESTAMP: return leaf.getLiteralList().stream() - .map(value -> microsFromTimestamp((Timestamp) value)) - .collect(Collectors.toList()); + .map(value -> microsFromTimestamp((Timestamp) value)) + .collect(Collectors.toList()); default: throw new UnsupportedOperationException("Unknown type: " + leaf.getType()); } } private static BigDecimal hiveDecimalToBigDecimal(HiveDecimalWritable hiveDecimalWritable) { - return hiveDecimalWritable.getHiveDecimal().bigDecimalValue().setScale(hiveDecimalWritable.scale()); + return hiveDecimalWritable + .getHiveDecimal() + .bigDecimalValue() + .setScale(hiveDecimalWritable.scale()); } // Hive uses `java.sql.Date.valueOf(lit.toString());` to convert a literal to Date - // Which uses `java.util.Date()` internally to create the object and that uses the TimeZone.getDefaultRef() - // To get back the expected date we have to use the LocalDate which gets rid of the TimeZone misery as it uses + // Which uses `java.util.Date()` internally to create the object and that uses the + // TimeZone.getDefaultRef() + // To get back the expected date we have to use the LocalDate which gets rid of the TimeZone + // misery as it uses // the year/month/day to generate the object private static int daysFromDate(Date date) { return DateTimeUtil.daysFromDate(date.toLocalDate()); } // Hive uses `java.sql.Timestamp.valueOf(lit.toString());` to convert a literal to 
Timestamp - // Which again uses `java.util.Date()` internally to create the object which uses the TimeZone.getDefaultRef() - // To get back the expected timestamp we have to use the LocalDateTime which gets rid of the TimeZone misery + // Which again uses `java.util.Date()` internally to create the object which uses the + // TimeZone.getDefaultRef() + // To get back the expected timestamp we have to use the LocalDateTime which gets rid of the + // TimeZone misery // as it uses the year/month/day/hour/min/sec/nanos to generate the object private static int daysFromTimestamp(Timestamp timestamp) { return DateTimeUtil.daysFromDate(timestamp.toLocalDateTime().toLocalDate()); diff --git a/mr/src/main/java/org/apache/iceberg/mr/hive/HiveIcebergInputFormat.java b/mr/src/main/java/org/apache/iceberg/mr/hive/HiveIcebergInputFormat.java index 046c1ee6f7c0..add9e07c54c4 100644 --- a/mr/src/main/java/org/apache/iceberg/mr/hive/HiveIcebergInputFormat.java +++ b/mr/src/main/java/org/apache/iceberg/mr/hive/HiveIcebergInputFormat.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.mr.hive; import java.io.IOException; @@ -55,22 +54,25 @@ import org.slf4j.LoggerFactory; public class HiveIcebergInputFormat extends MapredIcebergInputFormat - implements CombineHiveInputFormat.AvoidSplitCombination, VectorizedInputFormatInterface { + implements CombineHiveInputFormat.AvoidSplitCombination, VectorizedInputFormatInterface { private static final Logger LOG = LoggerFactory.getLogger(HiveIcebergInputFormat.class); private static final String HIVE_VECTORIZED_RECORDREADER_CLASS = - "org.apache.iceberg.mr.hive.vector.HiveIcebergVectorizedRecordReader"; - private static final DynConstructors.Ctor HIVE_VECTORIZED_RECORDREADER_CTOR; + "org.apache.iceberg.mr.hive.vector.HiveIcebergVectorizedRecordReader"; + private static final DynConstructors.Ctor + HIVE_VECTORIZED_RECORDREADER_CTOR; static { if (MetastoreUtil.hive3PresentOnClasspath()) { - HIVE_VECTORIZED_RECORDREADER_CTOR = DynConstructors.builder(AbstractMapredIcebergRecordReader.class) - .impl(HIVE_VECTORIZED_RECORDREADER_CLASS, - IcebergInputFormat.class, - IcebergSplit.class, - JobConf.class, - Reporter.class) - .build(); + HIVE_VECTORIZED_RECORDREADER_CTOR = + DynConstructors.builder(AbstractMapredIcebergRecordReader.class) + .impl( + HIVE_VECTORIZED_RECORDREADER_CLASS, + IcebergInputFormat.class, + IcebergSplit.class, + JobConf.class, + Reporter.class) + .build(); } else { HIVE_VECTORIZED_RECORDREADER_CTOR = null; } @@ -81,14 +83,16 @@ public InputSplit[] getSplits(JobConf job, int numSplits) throws IOException { // Convert Hive filter to Iceberg filter String hiveFilter = job.get(TableScanDesc.FILTER_EXPR_CONF_STR); if (hiveFilter != null) { - ExprNodeGenericFuncDesc exprNodeDesc = SerializationUtilities - .deserializeObject(hiveFilter, ExprNodeGenericFuncDesc.class); + ExprNodeGenericFuncDesc exprNodeDesc = + SerializationUtilities.deserializeObject(hiveFilter, ExprNodeGenericFuncDesc.class); SearchArgument sarg = ConvertAstToSearchArg.create(job, exprNodeDesc); try { Expression filter = HiveIcebergFilterFactory.generateFilterExpression(sarg); job.set(InputFormatConfig.FILTER_EXPRESSION, SerializationUtil.serializeToBase64(filter)); } catch (UnsupportedOperationException e) { - LOG.warn("Unable to create Iceberg filter, continuing without filter (will be applied by Hive later): ", e); + LOG.warn( + "Unable to create Iceberg filter, continuing without filter (will be applied by Hive 
later): ", + e); } } @@ -97,30 +101,29 @@ public InputSplit[] getSplits(JobConf job, int numSplits) throws IOException { String location = job.get(InputFormatConfig.TABLE_LOCATION); return Arrays.stream(super.getSplits(job, numSplits)) - .map(split -> new HiveIcebergSplit((IcebergSplit) split, location)) - .toArray(InputSplit[]::new); + .map(split -> new HiveIcebergSplit((IcebergSplit) split, location)) + .toArray(InputSplit[]::new); } @Override - public RecordReader> getRecordReader(InputSplit split, JobConf job, - Reporter reporter) throws IOException { + public RecordReader> getRecordReader( + InputSplit split, JobConf job, Reporter reporter) throws IOException { String[] selectedColumns = ColumnProjectionUtils.getReadColumnNames(job); job.setStrings(InputFormatConfig.SELECTED_COLUMNS, selectedColumns); - if (HiveConf.getBoolVar(job, HiveConf.ConfVars.HIVE_VECTORIZATION_ENABLED) && - Utilities.getVectorizedRowBatchCtx(job) != null) { - Preconditions.checkArgument(MetastoreUtil.hive3PresentOnClasspath(), "Vectorization only supported for Hive 3+"); + if (HiveConf.getBoolVar(job, HiveConf.ConfVars.HIVE_VECTORIZATION_ENABLED) + && Utilities.getVectorizedRowBatchCtx(job) != null) { + Preconditions.checkArgument( + MetastoreUtil.hive3PresentOnClasspath(), "Vectorization only supported for Hive 3+"); job.setEnum(InputFormatConfig.IN_MEMORY_DATA_MODEL, InputFormatConfig.InMemoryDataModel.HIVE); job.setBoolean(InputFormatConfig.SKIP_RESIDUAL_FILTERING, true); IcebergSplit icebergSplit = ((IcebergSplitContainer) split).icebergSplit(); // bogus cast for favouring code reuse over syntax - return (RecordReader) HIVE_VECTORIZED_RECORDREADER_CTOR.newInstance( - new IcebergInputFormat<>(), - icebergSplit, - job, - reporter); + return (RecordReader) + HIVE_VECTORIZED_RECORDREADER_CTOR.newInstance( + new IcebergInputFormat<>(), icebergSplit, job, reporter); } else { return super.getRecordReader(split, job, reporter); } @@ -131,10 +134,10 @@ public boolean shouldSkipCombine(Path path, Configuration conf) { return true; } - // Override annotation commented out, since this interface method has been introduced only in Hive 3 + // Override annotation commented out, since this interface method has been introduced only in Hive + // 3 // @Override public VectorizedSupport.Support[] getSupportedFeatures() { return new VectorizedSupport.Support[0]; } - } diff --git a/mr/src/main/java/org/apache/iceberg/mr/hive/HiveIcebergMetaHook.java b/mr/src/main/java/org/apache/iceberg/mr/hive/HiveIcebergMetaHook.java index 4bfd458e7b06..97c93955bc7d 100644 --- a/mr/src/main/java/org/apache/iceberg/mr/hive/HiveIcebergMetaHook.java +++ b/mr/src/main/java/org/apache/iceberg/mr/hive/HiveIcebergMetaHook.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.mr.hive; import java.util.List; @@ -50,16 +49,20 @@ public class HiveIcebergMetaHook implements HiveMetaHook { private static final Logger LOG = LoggerFactory.getLogger(HiveIcebergMetaHook.class); - private static final Set PARAMETERS_TO_REMOVE = ImmutableSet - .of(InputFormatConfig.TABLE_SCHEMA, Catalogs.LOCATION, Catalogs.NAME); - private static final Set PROPERTIES_TO_REMOVE = ImmutableSet - // We don't want to push down the metadata location props to Iceberg from HMS, - // since the snapshot pointer in HMS would always be one step ahead - .of(BaseMetastoreTableOperations.METADATA_LOCATION_PROP, - BaseMetastoreTableOperations.PREVIOUS_METADATA_LOCATION_PROP, - // Initially we'd like to cache the partition spec in HMS, but not push it down later to Iceberg during alter - // table commands since by then the HMS info can be stale + Iceberg does not store its partition spec in the props - InputFormatConfig.PARTITION_SPEC); + private static final Set PARAMETERS_TO_REMOVE = + ImmutableSet.of(InputFormatConfig.TABLE_SCHEMA, Catalogs.LOCATION, Catalogs.NAME); + private static final Set PROPERTIES_TO_REMOVE = + ImmutableSet + // We don't want to push down the metadata location props to Iceberg from HMS, + // since the snapshot pointer in HMS would always be one step ahead + .of( + BaseMetastoreTableOperations.METADATA_LOCATION_PROP, + BaseMetastoreTableOperations.PREVIOUS_METADATA_LOCATION_PROP, + // Initially we'd like to cache the partition spec in HMS, but not push it down later to + // Iceberg during alter + // table commands since by then the HMS info can be stale + Iceberg does not store its + // partition spec in the props + InputFormatConfig.PARTITION_SPEC); private final Configuration conf; private Table icebergTable = null; @@ -77,8 +80,11 @@ public void preCreateTable(org.apache.hadoop.hive.metastore.api.Table hmsTable) this.catalogProperties = getCatalogProperties(hmsTable); // Set the table type even for non HiveCatalog based tables - hmsTable.getParameters().put(BaseMetastoreTableOperations.TABLE_TYPE_PROP, - BaseMetastoreTableOperations.ICEBERG_TABLE_TYPE_VALUE.toUpperCase()); + hmsTable + .getParameters() + .put( + BaseMetastoreTableOperations.TABLE_TYPE_PROP, + BaseMetastoreTableOperations.ICEBERG_TABLE_TYPE_VALUE.toUpperCase()); if (!Catalogs.hiveCatalog(conf, catalogProperties)) { // For non-HiveCatalog tables too, we should set the input and output format @@ -90,9 +96,11 @@ public void preCreateTable(org.apache.hadoop.hive.metastore.api.Table hmsTable) try { this.icebergTable = Catalogs.loadTable(conf, catalogProperties); - Preconditions.checkArgument(catalogProperties.getProperty(InputFormatConfig.TABLE_SCHEMA) == null, + Preconditions.checkArgument( + catalogProperties.getProperty(InputFormatConfig.TABLE_SCHEMA) == null, "Iceberg table already created - can not use provided schema"); - Preconditions.checkArgument(catalogProperties.getProperty(InputFormatConfig.PARTITION_SPEC) == null, + Preconditions.checkArgument( + catalogProperties.getProperty(InputFormatConfig.PARTITION_SPEC) == null, "Iceberg table already created - can not use provided partition specification"); LOG.info("Iceberg table already exists {}", icebergTable); @@ -104,13 +112,15 @@ public void preCreateTable(org.apache.hadoop.hive.metastore.api.Table hmsTable) } // If the table does not exist collect data for table creation - // - InputFormatConfig.TABLE_SCHEMA, InputFormatConfig.PARTITION_SPEC takes precedence so the user can override the + // - 
InputFormatConfig.TABLE_SCHEMA, InputFormatConfig.PARTITION_SPEC takes precedence so the + // user can override the // Iceberg schema and specification generated by the code Schema schema = schema(catalogProperties, hmsTable); PartitionSpec spec = spec(schema, catalogProperties, hmsTable); - // If there are partition keys specified remove them from the HMS table and add them to the column list + // If there are partition keys specified remove them from the HMS table and add them to the + // column list if (hmsTable.isSetPartitionKeys()) { hmsTable.getSd().getCols().addAll(hmsTable.getPartitionKeys()); hmsTable.setPartitionKeysIsSet(false); @@ -124,7 +134,8 @@ public void preCreateTable(org.apache.hadoop.hive.metastore.api.Table hmsTable) // If the table is not managed by Hive catalog then the location should be set if (!Catalogs.hiveCatalog(conf, catalogProperties)) { - Preconditions.checkArgument(hmsTable.getSd() != null && hmsTable.getSd().getLocation() != null, + Preconditions.checkArgument( + hmsTable.getSd() != null && hmsTable.getSd().getLocation() != null, "Table location not set"); } @@ -151,21 +162,29 @@ public void commitCreateTable(org.apache.hadoop.hive.metastore.api.Table hmsTabl @Override public void preDropTable(org.apache.hadoop.hive.metastore.api.Table hmsTable) { this.catalogProperties = getCatalogProperties(hmsTable); - this.deleteIcebergTable = hmsTable.getParameters() != null && - "TRUE".equalsIgnoreCase(hmsTable.getParameters().get(InputFormatConfig.EXTERNAL_TABLE_PURGE)); + this.deleteIcebergTable = + hmsTable.getParameters() != null + && "TRUE" + .equalsIgnoreCase( + hmsTable.getParameters().get(InputFormatConfig.EXTERNAL_TABLE_PURGE)); if (deleteIcebergTable && Catalogs.hiveCatalog(conf, catalogProperties)) { // Store the metadata and the io for deleting the actual table data try { - String metadataLocation = hmsTable.getParameters().get(BaseMetastoreTableOperations.METADATA_LOCATION_PROP); + String metadataLocation = + hmsTable.getParameters().get(BaseMetastoreTableOperations.METADATA_LOCATION_PROP); this.deleteIo = Catalogs.loadTable(conf, catalogProperties).io(); this.deleteMetadata = TableMetadataParser.read(deleteIo, metadataLocation); } catch (Exception e) { - LOG.error("preDropTable: Error during loading Iceberg table or parsing its metadata for HMS table: {}.{}. " + - "In some cases, this might lead to undeleted metadata files under the table directory: {}. " + - "Please double check and, if needed, manually delete any dangling files/folders, if any. " + - "In spite of this error, the HMS table drop operation should proceed as normal.", - hmsTable.getDbName(), hmsTable.getTableName(), hmsTable.getSd().getLocation(), e); + LOG.error( + "preDropTable: Error during loading Iceberg table or parsing its metadata for HMS table: {}.{}. " + + "In some cases, this might lead to undeleted metadata files under the table directory: {}. " + + "Please double check and, if needed, manually delete any dangling files/folders, if any. 
" + + "In spite of this error, the HMS table drop operation should proceed as normal.", + hmsTable.getDbName(), + hmsTable.getTableName(), + hmsTable.getSd().getLocation(), + e); } } } @@ -176,54 +195,75 @@ public void rollbackDropTable(org.apache.hadoop.hive.metastore.api.Table hmsTabl } @Override - public void commitDropTable(org.apache.hadoop.hive.metastore.api.Table hmsTable, boolean deleteData) { + public void commitDropTable( + org.apache.hadoop.hive.metastore.api.Table hmsTable, boolean deleteData) { if (deleteData && deleteIcebergTable) { try { if (!Catalogs.hiveCatalog(conf, catalogProperties)) { - LOG.info("Dropping with purge all the data for table {}.{}", hmsTable.getDbName(), hmsTable.getTableName()); + LOG.info( + "Dropping with purge all the data for table {}.{}", + hmsTable.getDbName(), + hmsTable.getTableName()); Catalogs.dropTable(conf, catalogProperties); } else { - // do nothing if metadata folder has been deleted already (Hive 4 behaviour for purge=TRUE) + // do nothing if metadata folder has been deleted already (Hive 4 behaviour for + // purge=TRUE) if (deleteMetadata != null && deleteIo.newInputFile(deleteMetadata.location()).exists()) { CatalogUtil.dropTableData(deleteIo, deleteMetadata); } } } catch (Exception e) { - // we want to successfully complete the Hive DROP TABLE command despite catalog-related exceptions here - // e.g. we wish to successfully delete a Hive table even if the underlying Hadoop table has already been deleted - LOG.warn("Exception during commitDropTable operation for table {}.{}.", - hmsTable.getDbName(), hmsTable.getTableName(), e); + // we want to successfully complete the Hive DROP TABLE command despite catalog-related + // exceptions here + // e.g. we wish to successfully delete a Hive table even if the underlying Hadoop table has + // already been deleted + LOG.warn( + "Exception during commitDropTable operation for table {}.{}.", + hmsTable.getDbName(), + hmsTable.getTableName(), + e); } } } /** * Calculates the properties we would like to send to the catalog. + * *
<ul>
- *
<li>The base of the properties is the properties stored at the Hive Metastore for the given table - *
<li>We add the {@link Catalogs#LOCATION} as the table location - *
<li>We add the {@link Catalogs#NAME} as TableIdentifier defined by the database name and table name - *
<li>We remove some parameters that we don't want to push down to the Iceberg table props + *
<li>The base of the properties is the properties stored at the Hive Metastore for the given + * table + *
<li>We add the {@link Catalogs#LOCATION} as the table location + *
<li>We add the {@link Catalogs#NAME} as TableIdentifier defined by the database name and + * table name + *
<li>We remove some parameters that we don't want to push down to the Iceberg table props + *
    + * * @param hmsTable Table for which we are calculating the properties * @return The properties we can provide for Iceberg functions, like {@link Catalogs} */ - private static Properties getCatalogProperties(org.apache.hadoop.hive.metastore.api.Table hmsTable) { + private static Properties getCatalogProperties( + org.apache.hadoop.hive.metastore.api.Table hmsTable) { Properties properties = new Properties(); - hmsTable.getParameters().forEach((key, value) -> { - // translate key names between HMS and Iceberg where needed - String icebergKey = HiveTableOperations.translateToIcebergProp(key); - properties.put(icebergKey, value); - }); - - if (properties.get(Catalogs.LOCATION) == null && - hmsTable.getSd() != null && hmsTable.getSd().getLocation() != null) { + hmsTable + .getParameters() + .forEach( + (key, value) -> { + // translate key names between HMS and Iceberg where needed + String icebergKey = HiveTableOperations.translateToIcebergProp(key); + properties.put(icebergKey, value); + }); + + if (properties.get(Catalogs.LOCATION) == null + && hmsTable.getSd() != null + && hmsTable.getSd().getLocation() != null) { properties.put(Catalogs.LOCATION, hmsTable.getSd().getLocation()); } if (properties.get(Catalogs.NAME) == null) { - properties.put(Catalogs.NAME, TableIdentifier.of(hmsTable.getDbName(), hmsTable.getTableName()).toString()); + properties.put( + Catalogs.NAME, + TableIdentifier.of(hmsTable.getDbName(), hmsTable.getTableName()).toString()); } // Remove HMS table parameters we don't want to propagate to Iceberg @@ -232,7 +272,8 @@ private static Properties getCatalogProperties(org.apache.hadoop.hive.metastore. return properties; } - private Schema schema(Properties properties, org.apache.hadoop.hive.metastore.api.Table hmsTable) { + private Schema schema( + Properties properties, org.apache.hadoop.hive.metastore.api.Table hmsTable) { boolean autoConversion = conf.getBoolean(InputFormatConfig.SCHEMA_AUTO_CONVERSION, false); if (properties.getProperty(InputFormatConfig.TABLE_SCHEMA) != null) { @@ -247,16 +288,20 @@ private Schema schema(Properties properties, org.apache.hadoop.hive.metastore.ap } } - private static PartitionSpec spec(Schema schema, Properties properties, - org.apache.hadoop.hive.metastore.api.Table hmsTable) { + private static PartitionSpec spec( + Schema schema, Properties properties, org.apache.hadoop.hive.metastore.api.Table hmsTable) { if (hmsTable.getParameters().get(InputFormatConfig.PARTITION_SPEC) != null) { - Preconditions.checkArgument(!hmsTable.isSetPartitionKeys() || hmsTable.getPartitionKeys().isEmpty(), - "Provide only one of the following: Hive partition specification, or the " + - InputFormatConfig.PARTITION_SPEC + " property"); - return PartitionSpecParser.fromJson(schema, hmsTable.getParameters().get(InputFormatConfig.PARTITION_SPEC)); + Preconditions.checkArgument( + !hmsTable.isSetPartitionKeys() || hmsTable.getPartitionKeys().isEmpty(), + "Provide only one of the following: Hive partition specification, or the " + + InputFormatConfig.PARTITION_SPEC + + " property"); + return PartitionSpecParser.fromJson( + schema, hmsTable.getParameters().get(InputFormatConfig.PARTITION_SPEC)); } else if (hmsTable.isSetPartitionKeys() && !hmsTable.getPartitionKeys().isEmpty()) { - // If the table is partitioned then generate the identity partition definitions for the Iceberg table + // If the table is partitioned then generate the identity partition definitions for the + // Iceberg table return HiveSchemaUtil.spec(schema, hmsTable.getPartitionKeys()); } else { 
return PartitionSpec.unpartitioned(); diff --git a/mr/src/main/java/org/apache/iceberg/mr/hive/HiveIcebergOutputCommitter.java b/mr/src/main/java/org/apache/iceberg/mr/hive/HiveIcebergOutputCommitter.java index 4b3157b2c8aa..a4b4eecdfe55 100644 --- a/mr/src/main/java/org/apache/iceberg/mr/hive/HiveIcebergOutputCommitter.java +++ b/mr/src/main/java/org/apache/iceberg/mr/hive/HiveIcebergOutputCommitter.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.mr.hive; import java.io.IOException; @@ -58,8 +57,8 @@ import org.slf4j.LoggerFactory; /** - * An Iceberg table committer for adding data files to the Iceberg tables. - * Currently independent of the Hive ACID transactions. + * An Iceberg table committer for adding data files to the Iceberg tables. Currently independent of + * the Hive ACID transactions. */ public class HiveIcebergOutputCommitter extends OutputCommitter { private static final String FOR_COMMIT_EXTENSION = ".forCommit"; @@ -79,12 +78,13 @@ public void setupTask(TaskAttemptContext taskAttemptContext) { @Override public boolean needsTaskCommit(TaskAttemptContext context) { // We need to commit if this is the last phase of a MapReduce process - return TaskType.REDUCE.equals(context.getTaskAttemptID().getTaskID().getTaskType()) || - context.getJobConf().getNumReduceTasks() == 0; + return TaskType.REDUCE.equals(context.getTaskAttemptID().getTaskID().getTaskType()) + || context.getJobConf().getNumReduceTasks() == 0; } /** * Collects the generated data files and creates a commit file storing the data file list. + * * @param originalContext The task attempt context * @throws IOException Thrown if there is an error writing the commit file */ @@ -95,11 +95,16 @@ public void commitTask(TaskAttemptContext originalContext) throws IOException { TaskAttemptID attemptID = context.getTaskAttemptID(); JobConf jobConf = context.getJobConf(); Collection outputs = HiveIcebergStorageHandler.outputTables(context.getJobConf()); - Map writers = Optional.ofNullable(HiveIcebergRecordWriter.getWriters(attemptID)) - .orElseGet(() -> { - LOG.info("CommitTask found no writers for output tables: {}, attemptID: {}", outputs, attemptID); - return ImmutableMap.of(); - }); + Map writers = + Optional.ofNullable(HiveIcebergRecordWriter.getWriters(attemptID)) + .orElseGet( + () -> { + LOG.info( + "CommitTask found no writers for output tables: {}, attemptID: {}", + outputs, + attemptID); + return ImmutableMap.of(); + }); ExecutorService tableExecutor = tableExecutor(jobConf, outputs.size()); try { @@ -109,27 +114,38 @@ public void commitTask(TaskAttemptContext originalContext) throws IOException { .stopOnFailure() .throwFailureWhenFinished() .executeWith(tableExecutor) - .run(output -> { - Table table = HiveIcebergStorageHandler.table(context.getJobConf(), output); - if (table != null) { - HiveIcebergRecordWriter writer = writers.get(output); - DataFile[] closedFiles; - if (writer != null) { - closedFiles = writer.dataFiles(); - } else { - LOG.info("CommitTask found no writer for specific table: {}, attemptID: {}", output, attemptID); - closedFiles = new DataFile[0]; - } - // Creating the file containing the data files generated by this task for this table - String fileForCommitLocation = generateFileForCommitLocation(table.location(), jobConf, - attemptID.getJobID(), attemptID.getTaskID().getId()); - createFileForCommit(closedFiles, fileForCommitLocation, table.io()); - } else { - // When using Tez multi-table inserts, we could have 
more output tables in config than - // the actual tables this task has written to and has serialized in its config - LOG.info("CommitTask found no serialized table in config for table: {}.", output); - } - }, IOException.class); + .run( + output -> { + Table table = HiveIcebergStorageHandler.table(context.getJobConf(), output); + if (table != null) { + HiveIcebergRecordWriter writer = writers.get(output); + DataFile[] closedFiles; + if (writer != null) { + closedFiles = writer.dataFiles(); + } else { + LOG.info( + "CommitTask found no writer for specific table: {}, attemptID: {}", + output, + attemptID); + closedFiles = new DataFile[0]; + } + // Creating the file containing the data files generated by this task for this + // table + String fileForCommitLocation = + generateFileForCommitLocation( + table.location(), + jobConf, + attemptID.getJobID(), + attemptID.getTaskID().getId()); + createFileForCommit(closedFiles, fileForCommitLocation, table.io()); + } else { + // When using Tez multi-table inserts, we could have more output tables in config + // than + // the actual tables this task has written to and has serialized in its config + LOG.info("CommitTask found no serialized table in config for table: {}.", output); + } + }, + IOException.class); } finally { if (tableExecutor != null) { tableExecutor.shutdown(); @@ -142,6 +158,7 @@ public void commitTask(TaskAttemptContext originalContext) throws IOException { /** * Removes files generated by this task. + * * @param originalContext The task attempt context * @throws IOException Thrown if there is an error closing the writer */ @@ -150,7 +167,8 @@ public void abortTask(TaskAttemptContext originalContext) throws IOException { TaskAttemptContext context = TezUtil.enrichContextWithAttemptWrapper(originalContext); // Clean up writer data from the local store - Map writers = HiveIcebergRecordWriter.removeWriters(context.getTaskAttemptID()); + Map writers = + HiveIcebergRecordWriter.removeWriters(context.getTaskAttemptID()); // Remove files if it was not done already if (writers != null) { @@ -161,8 +179,9 @@ public void abortTask(TaskAttemptContext originalContext) throws IOException { } /** - * Reads the commit files stored in the temp directories and collects the generated committed data files. - * Appends the data files to the tables. At the end removes the temporary directories. + * Reads the commit files stored in the temp directories and collects the generated committed data + * files. Appends the data files to the tables. At the end removes the temporary directories. + * * @param originalContext The job context * @throws IOException if there is a failure accessing the files */ @@ -185,16 +204,21 @@ public void commitJob(JobContext originalContext) throws IOException { .throwFailureWhenFinished() .stopOnFailure() .executeWith(tableExecutor) - .run(output -> { - Table table = HiveIcebergStorageHandler.table(jobConf, output); - if (table != null) { - String catalogName = HiveIcebergStorageHandler.catalogName(jobConf, output); - jobLocations.add(generateJobLocation(table.location(), jobConf, jobContext.getJobID())); - commitTable(table.io(), fileExecutor, jobContext, output, table.location(), catalogName); - } else { - LOG.info("CommitJob found no serialized table in config for table: {}. 
Skipping job commit.", output); - } - }); + .run( + output -> { + Table table = HiveIcebergStorageHandler.table(jobConf, output); + if (table != null) { + String catalogName = HiveIcebergStorageHandler.catalogName(jobConf, output); + jobLocations.add( + generateJobLocation(table.location(), jobConf, jobContext.getJobID())); + commitTable( + table.io(), fileExecutor, jobContext, output, table.location(), catalogName); + } else { + LOG.info( + "CommitJob found no serialized table in config for table: {}. Skipping job commit.", + output); + } + }); } finally { fileExecutor.shutdown(); if (tableExecutor != null) { @@ -202,14 +226,18 @@ public void commitJob(JobContext originalContext) throws IOException { } } - LOG.info("Commit took {} ms for job {}", System.currentTimeMillis() - startTime, jobContext.getJobID()); + LOG.info( + "Commit took {} ms for job {}", + System.currentTimeMillis() - startTime, + jobContext.getJobID()); cleanup(jobContext, jobLocations); } /** - * Removes the generated data files if there is a commit file already generated for them. - * The cleanup at the end removes the temporary directories as well. + * Removes the generated data files if there is a commit file already generated for them. The + * cleanup at the end removes the temporary directories as well. + * * @param originalContext The job context * @param status The status of the job * @throws IOException if there is a failure deleting the files @@ -231,22 +259,28 @@ public void abortJob(JobContext originalContext, int status) throws IOException .suppressFailureWhenFinished() .executeWith(tableExecutor) .onFailure((output, exc) -> LOG.warn("Failed cleanup table {} on abort job", output, exc)) - .run(output -> { - LOG.info("Cleaning table {} with job id {}", output, jobContext.getJobID()); - Table table = HiveIcebergStorageHandler.table(jobConf, output); - jobLocations.add(generateJobLocation(table.location(), jobConf, jobContext.getJobID())); - Collection dataFiles = dataFiles(fileExecutor, table.location(), jobContext, table.io(), false); - - // Check if we have files already committed and remove data files if there are any - if (dataFiles.size() > 0) { - Tasks.foreach(dataFiles) - .retry(3) - .suppressFailureWhenFinished() - .executeWith(fileExecutor) - .onFailure((file, exc) -> LOG.warn("Failed to remove data file {} on abort job", file.path(), exc)) - .run(file -> table.io().deleteFile(file.path().toString())); - } - }); + .run( + output -> { + LOG.info("Cleaning table {} with job id {}", output, jobContext.getJobID()); + Table table = HiveIcebergStorageHandler.table(jobConf, output); + jobLocations.add( + generateJobLocation(table.location(), jobConf, jobContext.getJobID())); + Collection dataFiles = + dataFiles(fileExecutor, table.location(), jobContext, table.io(), false); + + // Check if we have files already committed and remove data files if there are any + if (dataFiles.size() > 0) { + Tasks.foreach(dataFiles) + .retry(3) + .suppressFailureWhenFinished() + .executeWith(fileExecutor) + .onFailure( + (file, exc) -> + LOG.warn( + "Failed to remove data file {} on abort job", file.path(), exc)) + .run(file -> table.io().deleteFile(file.path().toString())); + } + }); } finally { fileExecutor.shutdown(); if (tableExecutor != null) { @@ -261,6 +295,7 @@ public void abortJob(JobContext originalContext, int status) throws IOException /** * Collects the additions to a single table and adds/commits the new files to the Iceberg table. 
+ * * @param io The io to read the forCommit files * @param executor The executor used to read the forCommit files * @param jobContext The job context @@ -268,8 +303,13 @@ public void abortJob(JobContext originalContext, int status) throws IOException * @param location The location of the table used for loading from the catalog * @param catalogName The name of the catalog that contains the table */ - private void commitTable(FileIO io, ExecutorService executor, JobContext jobContext, String name, String location, - String catalogName) { + private void commitTable( + FileIO io, + ExecutorService executor, + JobContext jobContext, + String name, + String location, + String catalogName) { JobConf conf = jobContext.getJobConf(); Properties catalogProperties = new Properties(); catalogProperties.put(Catalogs.NAME, name); @@ -280,8 +320,10 @@ private void commitTable(FileIO io, ExecutorService executor, JobContext jobCont Table table = Catalogs.loadTable(conf, catalogProperties); long startTime = System.currentTimeMillis(); - LOG.info("Committing job has started for table: {}, using location: {}", - table, generateJobLocation(location, conf, jobContext.getJobID())); + LOG.info( + "Committing job has started for table: {}, using location: {}", + table, + generateJobLocation(location, conf, jobContext.getJobID())); Collection dataFiles = dataFiles(executor, location, jobContext, io, true); @@ -290,16 +332,23 @@ private void commitTable(FileIO io, ExecutorService executor, JobContext jobCont AppendFiles append = table.newAppend(); dataFiles.forEach(append::appendFile); append.commit(); - LOG.info("Commit took {} ms for table: {} with {} file(s)", System.currentTimeMillis() - startTime, table, + LOG.info( + "Commit took {} ms for table: {} with {} file(s)", + System.currentTimeMillis() - startTime, + table, dataFiles.size()); LOG.debug("Added files {}", dataFiles); } else { - LOG.info("Commit took {} ms for table: {} with no new files", System.currentTimeMillis() - startTime, table); + LOG.info( + "Commit took {} ms for table: {} with no new files", + System.currentTimeMillis() - startTime, + table); } } /** * Cleans up the jobs temporary locations. For every target table there is a temp dir to clean up. + * * @param jobContext The job context * @param jobLocations The locations to clean up * @throws IOException if there is a failure deleting the files @@ -313,25 +362,33 @@ private void cleanup(JobContext jobContext, Collection jobLocations) thr Tasks.foreach(jobLocations) .retry(3) .suppressFailureWhenFinished() - .onFailure((jobLocation, exc) -> LOG.debug("Failed to remove directory {} on job cleanup", jobLocation, exc)) - .run(jobLocation -> { - LOG.info("Cleaning location: {}", jobLocation); - Path toDelete = new Path(jobLocation); - FileSystem fs = Util.getFs(toDelete, jobConf); - fs.delete(toDelete, true); - }, IOException.class); + .onFailure( + (jobLocation, exc) -> + LOG.debug("Failed to remove directory {} on job cleanup", jobLocation, exc)) + .run( + jobLocation -> { + LOG.info("Cleaning location: {}", jobLocation); + Path toDelete = new Path(jobLocation); + FileSystem fs = Util.getFs(toDelete, jobConf); + fs.delete(toDelete, true); + }, + IOException.class); LOG.info("Cleaning for job {} finished", jobContext.getJobID()); } /** - * Executor service for parallel handling of file reads. Should be shared when committing multiple tables. + * Executor service for parallel handling of file reads. Should be shared when committing multiple + * tables. 
+ * * @param conf The configuration containing the pool size * @return The generated executor service */ private static ExecutorService fileExecutor(Configuration conf) { - int size = conf.getInt(InputFormatConfig.COMMIT_FILE_THREAD_POOL_SIZE, - InputFormatConfig.COMMIT_FILE_THREAD_POOL_SIZE_DEFAULT); + int size = + conf.getInt( + InputFormatConfig.COMMIT_FILE_THREAD_POOL_SIZE, + InputFormatConfig.COMMIT_FILE_THREAD_POOL_SIZE_DEFAULT); return Executors.newFixedThreadPool( size, new ThreadFactoryBuilder() @@ -342,14 +399,19 @@ private static ExecutorService fileExecutor(Configuration conf) { } /** - * Executor service for parallel handling of table manipulation. Could return null, if no parallelism is possible. + * Executor service for parallel handling of table manipulation. Could return null, if no + * parallelism is possible. + * * @param conf The configuration containing the pool size - * @param maxThreadNum The number of requests we want to handle (might be decreased further by configuration) + * @param maxThreadNum The number of requests we want to handle (might be decreased further by + * configuration) * @return The generated executor service, or null if executor is not needed. */ private static ExecutorService tableExecutor(Configuration conf, int maxThreadNum) { - int size = conf.getInt(InputFormatConfig.COMMIT_TABLE_THREAD_POOL_SIZE, - InputFormatConfig.COMMIT_TABLE_THREAD_POOL_SIZE_DEFAULT); + int size = + conf.getInt( + InputFormatConfig.COMMIT_TABLE_THREAD_POOL_SIZE, + InputFormatConfig.COMMIT_TABLE_THREAD_POOL_SIZE_DEFAULT); size = Math.min(maxThreadNum, size); if (size > 1) { return Executors.newFixedThreadPool( @@ -366,6 +428,7 @@ private static ExecutorService tableExecutor(Configuration conf, int maxThreadNu /** * Get the committed data files for this table and job. + * * @param executor The executor used for reading the forCommit files parallel * @param location The location of the table * @param jobContext The job context @@ -373,32 +436,41 @@ private static ExecutorService tableExecutor(Configuration conf, int maxThreadNu * @param throwOnFailure If true then it throws an exception on failure * @return The list of the committed data files */ - private static Collection dataFiles(ExecutorService executor, String location, JobContext jobContext, - FileIO io, boolean throwOnFailure) { + private static Collection dataFiles( + ExecutorService executor, + String location, + JobContext jobContext, + FileIO io, + boolean throwOnFailure) { JobConf conf = jobContext.getJobConf(); // If there are reducers, then every reducer will generate a result file. // If this is a map only task, then every mapper will generate a result file. - int expectedFiles = conf.getNumReduceTasks() > 0 ? conf.getNumReduceTasks() : conf.getNumMapTasks(); + int expectedFiles = + conf.getNumReduceTasks() > 0 ? conf.getNumReduceTasks() : conf.getNumMapTasks(); Collection dataFiles = new ConcurrentLinkedQueue<>(); - // Reading the committed files. The assumption here is that the taskIds are generated in sequential order + // Reading the committed files. The assumption here is that the taskIds are generated in + // sequential order // starting from 0. 
Tasks.range(expectedFiles) .throwFailureWhenFinished(throwOnFailure) .executeWith(executor) .retry(3) - .run(taskId -> { - String taskFileName = generateFileForCommitLocation(location, conf, jobContext.getJobID(), taskId); - dataFiles.addAll(Arrays.asList(readFileForCommit(taskFileName, io))); - }); + .run( + taskId -> { + String taskFileName = + generateFileForCommitLocation(location, conf, jobContext.getJobID(), taskId); + dataFiles.addAll(Arrays.asList(readFileForCommit(taskFileName, io))); + }); return dataFiles; } /** - * Generates the job temp location based on the job configuration. - * Currently it uses TABLE_LOCATION/temp/QUERY_ID-jobId. + * Generates the job temp location based on the job configuration. Currently it uses + * TABLE_LOCATION/temp/QUERY_ID-jobId. + * * @param location The location of the table * @param conf The job's configuration * @param jobId The JobID for the task @@ -411,16 +483,18 @@ static String generateJobLocation(String location, Configuration conf, JobID job } /** - * Generates file location based on the task configuration and a specific task id. - * This file will be used to store the data required to generate the Iceberg commit. - * Currently it uses TABLE_LOCATION/temp/QUERY_ID-jobId/task-[0..numTasks).forCommit. + * Generates file location based on the task configuration and a specific task id. This file will + * be used to store the data required to generate the Iceberg commit. Currently it uses + * TABLE_LOCATION/temp/QUERY_ID-jobId/task-[0..numTasks).forCommit. + * * @param location The location of the table * @param conf The job's configuration * @param jobId The jobId for the task * @param taskId The taskId for the commit file * @return The file to store the results */ - private static String generateFileForCommitLocation(String location, Configuration conf, JobID jobId, int taskId) { + private static String generateFileForCommitLocation( + String location, Configuration conf, JobID jobId, int taskId) { return generateJobLocation(location, conf, jobId) + "/task-" + taskId + FOR_COMMIT_EXTENSION; } @@ -435,10 +509,12 @@ private static void createFileForCommit(DataFile[] closedFiles, String location, } private static DataFile[] readFileForCommit(String fileForCommitLocation, FileIO io) { - try (ObjectInputStream ois = new ObjectInputStream(io.newInputFile(fileForCommitLocation).newStream())) { + try (ObjectInputStream ois = + new ObjectInputStream(io.newInputFile(fileForCommitLocation).newStream())) { return (DataFile[]) ois.readObject(); } catch (ClassNotFoundException | IOException e) { - throw new NotFoundException("Can not read or parse committed file: %s", fileForCommitLocation); + throw new NotFoundException( + "Can not read or parse committed file: %s", fileForCommitLocation); } } } diff --git a/mr/src/main/java/org/apache/iceberg/mr/hive/HiveIcebergOutputFormat.java b/mr/src/main/java/org/apache/iceberg/mr/hive/HiveIcebergOutputFormat.java index 348580d2ffa0..95eb7f0370b8 100644 --- a/mr/src/main/java/org/apache/iceberg/mr/hive/HiveIcebergOutputFormat.java +++ b/mr/src/main/java/org/apache/iceberg/mr/hive/HiveIcebergOutputFormat.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.mr.hive; import java.util.Locale; @@ -45,18 +44,24 @@ import org.apache.iceberg.mr.mapred.Container; import org.apache.iceberg.util.PropertyUtil; -public class HiveIcebergOutputFormat implements OutputFormat>, - HiveOutputFormat> { +public class HiveIcebergOutputFormat + implements OutputFormat>, + HiveOutputFormat> { @Override - public FileSinkOperator.RecordWriter getHiveRecordWriter(JobConf jc, Path finalOutPath, Class valueClass, - boolean isCompressed, Properties tableAndSerDeProperties, Progressable progress) { + public FileSinkOperator.RecordWriter getHiveRecordWriter( + JobConf jc, + Path finalOutPath, + Class valueClass, + boolean isCompressed, + Properties tableAndSerDeProperties, + Progressable progress) { return writer(jc); } @Override - public org.apache.hadoop.mapred.RecordWriter> getRecordWriter(FileSystem ignored, - JobConf job, String name, Progressable progress) { + public org.apache.hadoop.mapred.RecordWriter> getRecordWriter( + FileSystem ignored, JobConf job, String name, Progressable progress) { return writer(job); } @@ -68,24 +73,43 @@ public void checkOutputSpecs(FileSystem ignored, JobConf job) { private static HiveIcebergRecordWriter writer(JobConf jc) { TaskAttemptID taskAttemptID = TezUtil.taskAttemptWrapper(jc); // It gets the config from the FileSinkOperator which has its own config for every target table - Table table = HiveIcebergStorageHandler.table(jc, jc.get(hive_metastoreConstants.META_TABLE_NAME)); + Table table = + HiveIcebergStorageHandler.table(jc, jc.get(hive_metastoreConstants.META_TABLE_NAME)); Schema schema = HiveIcebergStorageHandler.schema(jc); PartitionSpec spec = table.spec(); - FileFormat fileFormat = FileFormat.valueOf(PropertyUtil.propertyAsString(table.properties(), - TableProperties.DEFAULT_FILE_FORMAT, TableProperties.DEFAULT_FILE_FORMAT_DEFAULT).toUpperCase(Locale.ENGLISH)); - long targetFileSize = PropertyUtil.propertyAsLong(table.properties(), TableProperties.WRITE_TARGET_FILE_SIZE_BYTES, + FileFormat fileFormat = + FileFormat.valueOf( + PropertyUtil.propertyAsString( + table.properties(), + TableProperties.DEFAULT_FILE_FORMAT, + TableProperties.DEFAULT_FILE_FORMAT_DEFAULT) + .toUpperCase(Locale.ENGLISH)); + long targetFileSize = + PropertyUtil.propertyAsLong( + table.properties(), + TableProperties.WRITE_TARGET_FILE_SIZE_BYTES, TableProperties.WRITE_TARGET_FILE_SIZE_BYTES_DEFAULT); FileIO io = table.io(); int partitionId = taskAttemptID.getTaskID().getId(); int taskId = taskAttemptID.getId(); - String operationId = jc.get(HiveConf.ConfVars.HIVEQUERYID.varname) + "-" + taskAttemptID.getJobID(); - OutputFileFactory outputFileFactory = OutputFileFactory.builderFor(table, partitionId, taskId) - .format(fileFormat) - .operationId(operationId) - .build(); + String operationId = + jc.get(HiveConf.ConfVars.HIVEQUERYID.varname) + "-" + taskAttemptID.getJobID(); + OutputFileFactory outputFileFactory = + OutputFileFactory.builderFor(table, partitionId, taskId) + .format(fileFormat) + .operationId(operationId) + .build(); String tableName = jc.get(Catalogs.NAME); - return new HiveIcebergRecordWriter(schema, spec, fileFormat, - new GenericAppenderFactory(schema, spec), outputFileFactory, io, targetFileSize, taskAttemptID, tableName); + return new HiveIcebergRecordWriter( + schema, + spec, + fileFormat, + new GenericAppenderFactory(schema, spec), + outputFileFactory, + io, + targetFileSize, + taskAttemptID, + tableName); } } diff --git a/mr/src/main/java/org/apache/iceberg/mr/hive/HiveIcebergRecordWriter.java 
b/mr/src/main/java/org/apache/iceberg/mr/hive/HiveIcebergRecordWriter.java index 18cdc24c8c2c..793b9c5e6448 100644 --- a/mr/src/main/java/org/apache/iceberg/mr/hive/HiveIcebergRecordWriter.java +++ b/mr/src/main/java/org/apache/iceberg/mr/hive/HiveIcebergRecordWriter.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.mr.hive; import java.io.IOException; @@ -44,7 +43,8 @@ import org.slf4j.LoggerFactory; class HiveIcebergRecordWriter extends PartitionedFanoutWriter - implements FileSinkOperator.RecordWriter, org.apache.hadoop.mapred.RecordWriter> { + implements FileSinkOperator.RecordWriter, + org.apache.hadoop.mapred.RecordWriter> { private static final Logger LOG = LoggerFactory.getLogger(HiveIcebergRecordWriter.class); // The current key is reused at every write to avoid unnecessary object creation @@ -53,7 +53,8 @@ class HiveIcebergRecordWriter extends PartitionedFanoutWriter // > map to store the active writers // Stored in concurrent map, since some executor engines can share containers - private static final Map> writers = Maps.newConcurrentMap(); + private static final Map> writers = + Maps.newConcurrentMap(); static Map removeWriters(TaskAttemptID taskAttemptID) { return writers.remove(taskAttemptID); @@ -63,9 +64,16 @@ static Map getWriters(TaskAttemptID taskAttempt return writers.get(taskAttemptID); } - HiveIcebergRecordWriter(Schema schema, PartitionSpec spec, FileFormat format, - FileAppenderFactory appenderFactory, OutputFileFactory fileFactory, FileIO io, long targetFileSize, - TaskAttemptID taskAttemptID, String tableName) { + HiveIcebergRecordWriter( + Schema schema, + PartitionSpec spec, + FileFormat format, + FileAppenderFactory appenderFactory, + OutputFileFactory fileFactory, + FileIO io, + long targetFileSize, + TaskAttemptID taskAttemptID, + String tableName) { super(spec, format, appenderFactory, fileFactory, io, targetFileSize); this.io = io; this.currentKey = new PartitionKey(spec, schema); @@ -99,11 +107,14 @@ public void close(boolean abort) throws IOException { .executeWith(ThreadPools.getWorkerPool()) .retry(3) .suppressFailureWhenFinished() - .onFailure((file, exception) -> LOG.debug("Failed on to remove file {} on abort", file, exception)) + .onFailure( + (file, exception) -> + LOG.debug("Failed on to remove file {} on abort", file, exception)) .run(dataFile -> io.deleteFile(dataFile.path().toString())); } - LOG.info("IcebergRecordWriter is closed with abort={}. Created {} files", abort, dataFiles.length); + LOG.info( + "IcebergRecordWriter is closed with abort={}. Created {} files", abort, dataFiles.length); } @Override diff --git a/mr/src/main/java/org/apache/iceberg/mr/hive/HiveIcebergSerDe.java b/mr/src/main/java/org/apache/iceberg/mr/hive/HiveIcebergSerDe.java index b1896d21ca85..f206bba47605 100644 --- a/mr/src/main/java/org/apache/iceberg/mr/hive/HiveIcebergSerDe.java +++ b/mr/src/main/java/org/apache/iceberg/mr/hive/HiveIcebergSerDe.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.mr.hive; import java.util.Arrays; @@ -59,26 +58,34 @@ public class HiveIcebergSerDe extends AbstractSerDe { private Container row = new Container<>(); @Override - public void initialize(@Nullable Configuration configuration, Properties serDeProperties) throws SerDeException { + public void initialize(@Nullable Configuration configuration, Properties serDeProperties) + throws SerDeException { // HiveIcebergSerDe.initialize is called multiple places in Hive code: - // - When we are trying to create a table - HiveDDL data is stored at the serDeProperties, but no Iceberg table + // - When we are trying to create a table - HiveDDL data is stored at the serDeProperties, but + // no Iceberg table // is created yet. - // - When we are compiling the Hive query on HiveServer2 side - We only have table information (location/name), - // and we have to read the schema using the table data. This is called multiple times so there is room for + // - When we are compiling the Hive query on HiveServer2 side - We only have table information + // (location/name), + // and we have to read the schema using the table data. This is called multiple times so there + // is room for // optimizing here. - // - When we are executing the Hive query in the execution engine - We do not want to load the table data on every - // executor, but serDeProperties are populated by HiveIcebergStorageHandler.configureInputJobProperties() and + // - When we are executing the Hive query in the execution engine - We do not want to load the + // table data on every + // executor, but serDeProperties are populated by + // HiveIcebergStorageHandler.configureInputJobProperties() and // the resulting properties are serialized and distributed to the executors if (serDeProperties.get(InputFormatConfig.TABLE_SCHEMA) != null) { - this.tableSchema = SchemaParser.fromJson((String) serDeProperties.get(InputFormatConfig.TABLE_SCHEMA)); + this.tableSchema = + SchemaParser.fromJson((String) serDeProperties.get(InputFormatConfig.TABLE_SCHEMA)); } else { try { // always prefer the original table schema if there is one this.tableSchema = Catalogs.loadTable(configuration, serDeProperties).schema(); LOG.info("Using schema from existing table {}", SchemaParser.toJson(tableSchema)); } catch (Exception e) { - boolean autoConversion = configuration.getBoolean(InputFormatConfig.SCHEMA_AUTO_CONVERSION, false); + boolean autoConversion = + configuration.getBoolean(InputFormatConfig.SCHEMA_AUTO_CONVERSION, false); // If we can not load the table try the provided hive schema this.tableSchema = hiveSchemaOrThrow(serDeProperties, e, autoConversion); } @@ -91,11 +98,16 @@ public void initialize(@Nullable Configuration configuration, Properties serDePr } else { configuration.setBoolean(InputFormatConfig.CASE_SENSITIVE, false); String[] selectedColumns = ColumnProjectionUtils.getReadColumnNames(configuration); - // When same table is joined multiple times, it is possible some selected columns are duplicated, - // in this case wrong recordStructField position leads wrong value or ArrayIndexOutOfBoundException - String[] distinctSelectedColumns = Arrays.stream(selectedColumns).distinct().toArray(String[]::new); - projectedSchema = distinctSelectedColumns.length > 0 ? 
- tableSchema.caseInsensitiveSelect(distinctSelectedColumns) : tableSchema; + // When same table is joined multiple times, it is possible some selected columns are + // duplicated, + // in this case wrong recordStructField position leads wrong value or + // ArrayIndexOutOfBoundException + String[] distinctSelectedColumns = + Arrays.stream(selectedColumns).distinct().toArray(String[]::new); + projectedSchema = + distinctSelectedColumns.length > 0 + ? tableSchema.caseInsensitiveSelect(distinctSelectedColumns) + : tableSchema; // the input split mapper handles does not belong to this table // it is necessary to ensure projectedSchema equals to tableSchema, // or we cannot find selectOperator's column from inspector @@ -120,11 +132,12 @@ public Class getSerializedClass() { public Writable serialize(Object o, ObjectInspector objectInspector) { Deserializer deserializer = deserializers.get(objectInspector); if (deserializer == null) { - deserializer = new Deserializer.Builder() - .schema(tableSchema) - .sourceInspector((StructObjectInspector) objectInspector) - .writerInspector((StructObjectInspector) inspector) - .build(); + deserializer = + new Deserializer.Builder() + .schema(tableSchema) + .sourceInspector((StructObjectInspector) objectInspector) + .writerInspector((StructObjectInspector) inspector) + .build(); deserializers.put(objectInspector, deserializer); } @@ -148,27 +161,34 @@ public ObjectInspector getObjectInspector() { } /** - * Gets the hive schema from the serDeProperties, and throws an exception if it is not provided. In the later case - * it adds the previousException as a root cause. + * Gets the hive schema from the serDeProperties, and throws an exception if it is not provided. + * In the later case it adds the previousException as a root cause. + * * @param serDeProperties The source of the hive schema * @param previousException If we had an exception previously - * @param autoConversion When true, convert unsupported types to more permissive ones, like tinyint to - * int + * @param autoConversion When true, convert unsupported types to more permissive + * ones, like tinyint to int * @return The hive schema parsed from the serDeProperties * @throws SerDeException If there is no schema information in the serDeProperties */ - private static Schema hiveSchemaOrThrow(Properties serDeProperties, Exception previousException, - boolean autoConversion) + private static Schema hiveSchemaOrThrow( + Properties serDeProperties, Exception previousException, boolean autoConversion) throws SerDeException { // Read the configuration parameters String columnNames = serDeProperties.getProperty(serdeConstants.LIST_COLUMNS); String columnTypes = serDeProperties.getProperty(serdeConstants.LIST_COLUMN_TYPES); // No constant for column comments and column comments delimiter. String columnComments = serDeProperties.getProperty(LIST_COLUMN_COMMENT); - String columnNameDelimiter = serDeProperties.containsKey(serdeConstants.COLUMN_NAME_DELIMITER) ? - serDeProperties.getProperty(serdeConstants.COLUMN_NAME_DELIMITER) : String.valueOf(SerDeUtils.COMMA); - if (columnNames != null && columnTypes != null && columnNameDelimiter != null && - !columnNames.isEmpty() && !columnTypes.isEmpty() && !columnNameDelimiter.isEmpty()) { + String columnNameDelimiter = + serDeProperties.containsKey(serdeConstants.COLUMN_NAME_DELIMITER) + ? 
serDeProperties.getProperty(serdeConstants.COLUMN_NAME_DELIMITER) + : String.valueOf(SerDeUtils.COMMA); + if (columnNames != null + && columnTypes != null + && columnNameDelimiter != null + && !columnNames.isEmpty() + && !columnTypes.isEmpty() + && !columnNameDelimiter.isEmpty()) { // Parse the configuration parameters List names = Lists.newArrayList(); Collections.addAll(names, columnNames.split(columnNameDelimiter)); @@ -176,12 +196,17 @@ private static Schema hiveSchemaOrThrow(Properties serDeProperties, Exception pr if (columnComments != null) { Collections.addAll(comments, columnComments.split(Character.toString(Character.MIN_VALUE))); } - Schema hiveSchema = HiveSchemaUtil.convert(names, TypeInfoUtils.getTypeInfosFromTypeString(columnTypes), - comments, autoConversion); + Schema hiveSchema = + HiveSchemaUtil.convert( + names, + TypeInfoUtils.getTypeInfosFromTypeString(columnTypes), + comments, + autoConversion); LOG.info("Using hive schema {}", SchemaParser.toJson(hiveSchema)); return hiveSchema; } else { - throw new SerDeException("Please provide an existing table or a valid schema", previousException); + throw new SerDeException( + "Please provide an existing table or a valid schema", previousException); } } } diff --git a/mr/src/main/java/org/apache/iceberg/mr/hive/HiveIcebergSplit.java b/mr/src/main/java/org/apache/iceberg/mr/hive/HiveIcebergSplit.java index a1e7b1aa4cde..25bc3c523009 100644 --- a/mr/src/main/java/org/apache/iceberg/mr/hive/HiveIcebergSplit.java +++ b/mr/src/main/java/org/apache/iceberg/mr/hive/HiveIcebergSplit.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.mr.hive; import java.io.DataInput; @@ -33,15 +32,17 @@ public class HiveIcebergSplit extends FileSplit implements IcebergSplitContainer private IcebergSplit innerSplit; - // Hive uses the path name of a split to map it back to a partition (`PartitionDesc`) or table description object - // (`TableDesc`) which specifies the relevant input format for reading the files belonging to that partition or table. - // That way, `HiveInputFormat` and `CombineHiveInputFormat` can read files with different file formats in the same + // Hive uses the path name of a split to map it back to a partition (`PartitionDesc`) or table + // description object + // (`TableDesc`) which specifies the relevant input format for reading the files belonging to that + // partition or table. + // That way, `HiveInputFormat` and `CombineHiveInputFormat` can read files with different file + // formats in the same // MapReduce job and merge compatible splits together. private String tableLocation; // public no-argument constructor for deserialization - public HiveIcebergSplit() { - } + public HiveIcebergSplit() {} HiveIcebergSplit(IcebergSplit split, String tableLocation) { this.innerSplit = split; diff --git a/mr/src/main/java/org/apache/iceberg/mr/hive/HiveIcebergStorageHandler.java b/mr/src/main/java/org/apache/iceberg/mr/hive/HiveIcebergStorageHandler.java index 48a76775b8bf..da40f4c73ef3 100644 --- a/mr/src/main/java/org/apache/iceberg/mr/hive/HiveIcebergStorageHandler.java +++ b/mr/src/main/java/org/apache/iceberg/mr/hive/HiveIcebergStorageHandler.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
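hiveSchemaOrThrow above falls back to the column metadata that Hive places in the SerDe properties: column names joined by a configurable delimiter (comma by default) and column comments joined by the '\0' character. A rough sketch of just that parsing step follows; the property keys are assumptions standing in for Hive's serdeConstants.

```java
import java.util.Arrays;
import java.util.Collections;
import java.util.List;
import java.util.Properties;

final class SerDeColumnParsingSketch {
  static List<String> columnNames(Properties props) {
    // Assumed keys; the real code reads serdeConstants.LIST_COLUMNS / COLUMN_NAME_DELIMITER.
    String names = props.getProperty("columns");
    String delimiter = props.getProperty("column.name.delimiter", ",");
    if (names == null || names.isEmpty()) {
      throw new IllegalArgumentException("Please provide an existing table or a valid schema");
    }
    return Arrays.asList(names.split(delimiter));
  }

  static List<String> columnComments(Properties props) {
    String comments = props.getProperty("columns.comments");
    // Hive joins per-column comments with the '\0' character, hence this unusual split.
    return comments == null
        ? Collections.emptyList()
        : Arrays.asList(comments.split(Character.toString(Character.MIN_VALUE)));
  }
}
```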
*/ - package org.apache.iceberg.mr.hive; import java.io.Serializable; @@ -92,32 +91,39 @@ public void configureOutputJobProperties(TableDesc tableDesc, Map map) { - - } + public void configureTableJobProperties(TableDesc tableDesc, Map map) {} - // Override annotation commented out, since this interface method has been introduced only in Hive 3 + // Override annotation commented out, since this interface method has been introduced only in Hive + // 3 // @Override - public void configureInputJobCredentials(TableDesc tableDesc, Map secrets) { - - } + public void configureInputJobCredentials(TableDesc tableDesc, Map secrets) {} @Override public void configureJobConf(TableDesc tableDesc, JobConf jobConf) { - if (tableDesc != null && tableDesc.getProperties() != null && - tableDesc.getProperties().get(WRITE_KEY) != null) { + if (tableDesc != null + && tableDesc.getProperties() != null + && tableDesc.getProperties().get(WRITE_KEY) != null) { String tableName = tableDesc.getTableName(); - Preconditions.checkArgument(!tableName.contains(TABLE_NAME_SEPARATOR), - "Can not handle table " + tableName + ". Its name contains '" + TABLE_NAME_SEPARATOR + "'"); + Preconditions.checkArgument( + !tableName.contains(TABLE_NAME_SEPARATOR), + "Can not handle table " + + tableName + + ". Its name contains '" + + TABLE_NAME_SEPARATOR + + "'"); String tables = jobConf.get(InputFormatConfig.OUTPUT_TABLES); tables = tables == null ? tableName : tables + TABLE_NAME_SEPARATOR + tableName; jobConf.set("mapred.output.committer.class", HiveIcebergOutputCommitter.class.getName()); @@ -152,7 +158,8 @@ public String toString() { * @return Entire filter to take advantage of Hive's pruning as well as Iceberg's pruning. */ @Override - public DecomposedPredicate decomposePredicate(JobConf jobConf, Deserializer deserializer, ExprNodeDesc exprNodeDesc) { + public DecomposedPredicate decomposePredicate( + JobConf jobConf, Deserializer deserializer, ExprNodeDesc exprNodeDesc) { DecomposedPredicate predicate = new DecomposedPredicate(); predicate.residualPredicate = (ExprNodeGenericFuncDesc) exprNodeDesc; predicate.pushedPredicate = (ExprNodeGenericFuncDesc) exprNodeDesc; @@ -160,52 +167,63 @@ public DecomposedPredicate decomposePredicate(JobConf jobConf, Deserializer dese } /** - * Returns the Table serialized to the configuration based on the table name. - * If configuration is missing from the FileIO of the table, it will be populated with the input config. + * Returns the Table serialized to the configuration based on the table name. If configuration is + * missing from the FileIO of the table, it will be populated with the input config. * * @param config The configuration used to get the data from * @param name The name of the table we need as returned by TableDesc.getTableName() * @return The Table */ public static Table table(Configuration config, String name) { - Table table = SerializationUtil.deserializeFromBase64(config.get(InputFormatConfig.SERIALIZED_TABLE_PREFIX + name)); + Table table = + SerializationUtil.deserializeFromBase64( + config.get(InputFormatConfig.SERIALIZED_TABLE_PREFIX + name)); checkAndSetIoConfig(config, table); return table; } /** - * If enabled, it populates the FileIO's hadoop configuration with the input config object. - * This might be necessary when the table object was serialized without the FileIO config. + * If enabled, it populates the FileIO's hadoop configuration with the input config object. This + * might be necessary when the table object was serialized without the FileIO config. 
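checkAndSetIoConfig, whose Javadoc ends just above, re-attaches the Hadoop configuration to the table's FileIO only when config serialization was disabled and the FileIO can actually accept a configuration. A simplified model of that guard, with a hypothetical Configurable interface in place of Iceberg's HadoopConfigurable and the "disabled" flag passed in directly instead of being read from the Configuration:

```java
// Hypothetical, simplified model of the "re-attach config after deserialization" guard.
interface Configurable<C> {
  void setConf(C conf);
}

final class IoConfigSketch {
  static <C> void checkAndSetIoConfig(C conf, Object io, boolean serializationDisabled) {
    // Only needed when the config was deliberately left out of the serialized table,
    // and only possible when the FileIO implementation can take a configuration at all.
    if (serializationDisabled && io instanceof Configurable) {
      @SuppressWarnings("unchecked")
      Configurable<C> configurable = (Configurable<C>) io;
      configurable.setConf(conf);
    }
  }
}
```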
* * @param config Configuration to set for FileIO, if enabled * @param table The Iceberg table object */ public static void checkAndSetIoConfig(Configuration config, Table table) { - if (table != null && config.getBoolean(InputFormatConfig.CONFIG_SERIALIZATION_DISABLED, - InputFormatConfig.CONFIG_SERIALIZATION_DISABLED_DEFAULT) && table.io() instanceof HadoopConfigurable) { + if (table != null + && config.getBoolean( + InputFormatConfig.CONFIG_SERIALIZATION_DISABLED, + InputFormatConfig.CONFIG_SERIALIZATION_DISABLED_DEFAULT) + && table.io() instanceof HadoopConfigurable) { ((HadoopConfigurable) table.io()).setConf(config); } } /** - * If enabled, it ensures that the FileIO's hadoop configuration will not be serialized. - * This might be desirable for decreasing the overall size of serialized table objects. + * If enabled, it ensures that the FileIO's hadoop configuration will not be serialized. This + * might be desirable for decreasing the overall size of serialized table objects. * - * Note: Skipping FileIO config serialization in this fashion might in turn necessitate calling - * {@link #checkAndSetIoConfig(Configuration, Table)} on the deserializer-side to enable subsequent use of the FileIO. + *
<p>
    Note: Skipping FileIO config serialization in this fashion might in turn necessitate calling + * {@link #checkAndSetIoConfig(Configuration, Table)} on the deserializer-side to enable + * subsequent use of the FileIO. * * @param config Configuration to set for FileIO in a transient manner, if enabled * @param table The Iceberg table object */ public static void checkAndSkipIoConfigSerialization(Configuration config, Table table) { - if (table != null && config.getBoolean(InputFormatConfig.CONFIG_SERIALIZATION_DISABLED, - InputFormatConfig.CONFIG_SERIALIZATION_DISABLED_DEFAULT) && table.io() instanceof HadoopConfigurable) { - ((HadoopConfigurable) table.io()).serializeConfWith(conf -> new NonSerializingConfig(config)::get); + if (table != null + && config.getBoolean( + InputFormatConfig.CONFIG_SERIALIZATION_DISABLED, + InputFormatConfig.CONFIG_SERIALIZATION_DISABLED_DEFAULT) + && table.io() instanceof HadoopConfigurable) { + ((HadoopConfigurable) table.io()) + .serializeConfWith(conf -> new NonSerializingConfig(config)::get); } } /** * Returns the names of the output tables stored in the configuration. + * * @param config The configuration used to get the data from * @return The collection of the table names as returned by TableDesc.getTableName() */ @@ -215,6 +233,7 @@ public static Collection outputTables(Configuration config) { /** * Returns the catalog name serialized to the configuration. + * * @param config The configuration used to get the data from * @param name The name of the table we neeed as returned by TableDesc.getTableName() * @return catalog name @@ -225,6 +244,7 @@ public static String catalogName(Configuration config, String name) { /** * Returns the Table Schema serialized to the configuration. + * * @param config The configuration used to get the data from * @return The Table Schema object */ @@ -233,23 +253,25 @@ public static Schema schema(Configuration config) { } /** - * Stores the serializable table data in the configuration. - * Currently the following is handled: + * Stores the serializable table data in the configuration. Currently the following is handled: + * *
   * <ul>
-   * <li>- Table - in case the table is serializable</li>
-   * <li>- Location</li>
-   * <li>- Schema</li>
-   * <li>- Partition specification</li>
-   * <li>- FileIO for handling table files</li>
-   * <li>- Location provider used for file generation</li>
-   * <li>- Encryption manager for encryption handling</li>
+   *   <li>- Table - in case the table is serializable
+   *   <li>- Location
+   *   <li>- Schema
+   *   <li>- Partition specification
+   *   <li>- FileIO for handling table files
+   *   <li>- Location provider used for file generation
+   *   <li>- Encryption manager for encryption handling
   * </ul>
    + * * @param configuration The configuration storing the catalog information * @param tableDesc The table which we want to store to the configuration * @param map The map of the configuration properties which we append with the serialized data */ @VisibleForTesting - static void overlayTableProperties(Configuration configuration, TableDesc tableDesc, Map map) { + static void overlayTableProperties( + Configuration configuration, TableDesc tableDesc, Map map) { Properties props = tableDesc.getProperties(); Table table = Catalogs.loadTable(configuration, props); String schemaJson = SchemaParser.toJson(table.schema()); @@ -265,15 +287,19 @@ static void overlayTableProperties(Configuration configuration, TableDesc tableD // serialize table object into config Table serializableTable = SerializableTable.copyOf(table); checkAndSkipIoConfigSerialization(configuration, serializableTable); - map.put(InputFormatConfig.SERIALIZED_TABLE_PREFIX + tableDesc.getTableName(), + map.put( + InputFormatConfig.SERIALIZED_TABLE_PREFIX + tableDesc.getTableName(), SerializationUtil.serializeToBase64(serializableTable)); - // We need to remove this otherwise the job.xml will be invalid as column comments are separated with '\0' and + // We need to remove this otherwise the job.xml will be invalid as column comments are separated + // with '\0' and // the serialization utils fail to serialize this character map.remove("columns.comments"); - // save schema into table props as well to avoid repeatedly hitting the HMS during serde initializations - // this is an exception to the interface documentation, but it's a safe operation to add this property + // save schema into table props as well to avoid repeatedly hitting the HMS during serde + // initializations + // this is an exception to the interface documentation, but it's a safe operation to add this + // property props.put(InputFormatConfig.TABLE_SCHEMA, schemaJson); } @@ -287,7 +313,8 @@ private static class NonSerializingConfig implements Serializable { public Configuration get() { if (conf == null) { - throw new IllegalStateException("Configuration was not serialized on purpose but was not set manually either"); + throw new IllegalStateException( + "Configuration was not serialized on purpose but was not set manually either"); } return conf; diff --git a/mr/src/main/java/org/apache/iceberg/mr/hive/TezUtil.java b/mr/src/main/java/org/apache/iceberg/mr/hive/TezUtil.java index a69d183fcebe..a5cb2dcbc3ac 100644 --- a/mr/src/main/java/org/apache/iceberg/mr/hive/TezUtil.java +++ b/mr/src/main/java/org/apache/iceberg/mr/hive/TezUtil.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
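overlayTableProperties above ships a copy of the table to the executors by serializing it into the job configuration as a Base64 string. The sketch below shows that round trip for any Serializable value using only the JDK; it mirrors what a SerializationUtil-style helper does, but it is not Iceberg's implementation.

```java
import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.ObjectInputStream;
import java.io.ObjectOutputStream;
import java.io.Serializable;
import java.io.UncheckedIOException;
import java.util.Base64;

final class Base64SerializationSketch {
  // Serialize an object graph and encode it so it can live in a String-valued config entry.
  static String serializeToBase64(Serializable obj) {
    ByteArrayOutputStream bytes = new ByteArrayOutputStream();
    try (ObjectOutputStream out = new ObjectOutputStream(bytes)) {
      out.writeObject(obj);
    } catch (IOException e) {
      throw new UncheckedIOException(e);
    }
    return Base64.getEncoder().encodeToString(bytes.toByteArray());
  }

  // Decode and deserialize on the executor side.
  @SuppressWarnings("unchecked")
  static <T> T deserializeFromBase64(String encoded) {
    byte[] bytes = Base64.getDecoder().decode(encoded);
    try (ObjectInputStream in = new ObjectInputStream(new ByteArrayInputStream(bytes))) {
      return (T) in.readObject();
    } catch (IOException | ClassNotFoundException e) {
      throw new RuntimeException(e);
    }
  }
}
```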
*/ - package org.apache.iceberg.mr.hive; import java.util.Objects; @@ -31,14 +30,18 @@ public class TezUtil { private static final String TASK_ATTEMPT_ID_KEY = "mapred.task.id"; - // TezProcessor (Hive) propagates the vertex id under this key - available during Task commit phase + // TezProcessor (Hive) propagates the vertex id under this key - available during Task commit + // phase private static final String TEZ_VERTEX_ID_HIVE = "hive.tez.vertex.index"; - // MROutputCommitter (Tez) propagates the vertex id under this key - available during DAG/Vertex commit phase + // MROutputCommitter (Tez) propagates the vertex id under this key - available during DAG/Vertex + // commit phase private static final String TEZ_VERTEX_ID_DAG = "mapreduce.task.vertex.id"; /** - * If the Tez vertex id is present in config, creates a new jobContext by appending the Tez vertex id to the jobID. - * For the rationale behind this enrichment, please refer to point #1 in the docs of {@link TaskAttemptWrapper}. + * If the Tez vertex id is present in config, creates a new jobContext by appending the Tez vertex + * id to the jobID. For the rationale behind this enrichment, please refer to point #1 in the docs + * of {@link TaskAttemptWrapper}. + * * @param jobContext original jobContext to be enriched * @return enriched jobContext */ @@ -53,12 +56,15 @@ public static JobContext enrichContextWithVertexId(JobContext jobContext) { } /** - * Creates a new taskAttemptContext by replacing the taskAttemptID with a wrapped object. - * For the rationale behind this enrichment, please refer to point #2 in the docs of {@link TaskAttemptWrapper}. + * Creates a new taskAttemptContext by replacing the taskAttemptID with a wrapped object. For the + * rationale behind this enrichment, please refer to point #2 in the docs of {@link + * TaskAttemptWrapper}. + * * @param taskAttemptContext original taskAttemptContext to be enriched * @return enriched taskAttemptContext */ - public static TaskAttemptContext enrichContextWithAttemptWrapper(TaskAttemptContext taskAttemptContext) { + public static TaskAttemptContext enrichContextWithAttemptWrapper( + TaskAttemptContext taskAttemptContext) { TaskAttemptID wrapped = TezUtil.taskAttemptWrapper(taskAttemptContext.getTaskAttemptID()); return new TaskAttemptContextImpl(taskAttemptContext.getJobConf(), wrapped); } @@ -68,7 +74,8 @@ public static TaskAttemptID taskAttemptWrapper(TaskAttemptID attemptID) { } public static TaskAttemptID taskAttemptWrapper(JobConf jc) { - return new TaskAttemptWrapper(TaskAttemptID.forName(jc.get(TASK_ATTEMPT_ID_KEY)), jc.get(TEZ_VERTEX_ID_HIVE)); + return new TaskAttemptWrapper( + TaskAttemptID.forName(jc.get(TASK_ATTEMPT_ID_KEY)), jc.get(TEZ_VERTEX_ID_HIVE)); } private static JobID getJobIDWithVertexAppended(JobID jobID, String vertexId) { @@ -79,24 +86,27 @@ private static JobID getJobIDWithVertexAppended(JobID jobID, String vertexId) { } } - private TezUtil() { - } + private TezUtil() {} /** - * Subclasses {@link org.apache.hadoop.mapred.TaskAttemptID}. It has two main purposes: - * 1. Provide a way to append an optional vertex id to the Job ID. This is needed because there is a discrepancy - * between how the attempt ID is constructed in the {@link org.apache.tez.mapreduce.output.MROutput} (with vertex ID - * appended to the end of the Job ID) and how it's available in the mapper (without vertex ID) which creates and - * caches the HiveIcebergRecordWriter object. - * 2. 
Redefine the equals/hashcode provided by TaskAttemptID so that task type (map or reduce) does not count, and - * therefore the mapper and reducer threads can use the same attempt ID-based key to retrieve the cached - * HiveIcebergRecordWriter object. + * Subclasses {@link org.apache.hadoop.mapred.TaskAttemptID}. It has two main purposes: 1. Provide + * a way to append an optional vertex id to the Job ID. This is needed because there is a + * discrepancy between how the attempt ID is constructed in the {@link + * org.apache.tez.mapreduce.output.MROutput} (with vertex ID appended to the end of the Job ID) + * and how it's available in the mapper (without vertex ID) which creates and caches the + * HiveIcebergRecordWriter object. 2. Redefine the equals/hashcode provided by TaskAttemptID so + * that task type (map or reduce) does not count, and therefore the mapper and reducer threads can + * use the same attempt ID-based key to retrieve the cached HiveIcebergRecordWriter object. */ private static class TaskAttemptWrapper extends TaskAttemptID { TaskAttemptWrapper(TaskAttemptID attemptID, String vertexId) { - super(getJobIDWithVertexAppended(attemptID.getJobID(), vertexId).getJtIdentifier(), - attemptID.getJobID().getId(), attemptID.getTaskType(), attemptID.getTaskID().getId(), attemptID.getId()); + super( + getJobIDWithVertexAppended(attemptID.getJobID(), vertexId).getJtIdentifier(), + attemptID.getJobID().getId(), + attemptID.getTaskType(), + attemptID.getTaskID().getId(), + attemptID.getId()); } @Override @@ -108,9 +118,9 @@ public boolean equals(Object o) { return false; } TaskAttemptWrapper that = (TaskAttemptWrapper) o; - return getId() == that.getId() && - getTaskID().getId() == that.getTaskID().getId() && - Objects.equals(getJobID(), that.getJobID()); + return getId() == that.getId() + && getTaskID().getId() == that.getTaskID().getId() + && Objects.equals(getJobID(), that.getJobID()); } @Override diff --git a/mr/src/main/java/org/apache/iceberg/mr/hive/serde/objectinspector/IcebergBinaryObjectInspector.java b/mr/src/main/java/org/apache/iceberg/mr/hive/serde/objectinspector/IcebergBinaryObjectInspector.java index 61b30c72ba68..e84993baf1ef 100644 --- a/mr/src/main/java/org/apache/iceberg/mr/hive/serde/objectinspector/IcebergBinaryObjectInspector.java +++ b/mr/src/main/java/org/apache/iceberg/mr/hive/serde/objectinspector/IcebergBinaryObjectInspector.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.mr.hive.serde.objectinspector; import java.nio.ByteBuffer; @@ -60,7 +59,8 @@ public Object copyObject(Object o) { return Arrays.copyOf(bytes, bytes.length); } else if (o instanceof ByteBuffer) { ByteBuffer copy = - ByteBuffer.wrap(((ByteBuffer) o).array(), ((ByteBuffer) o).arrayOffset(), ((ByteBuffer) o).limit()); + ByteBuffer.wrap( + ((ByteBuffer) o).array(), ((ByteBuffer) o).arrayOffset(), ((ByteBuffer) o).limit()); return copy; } else { return o; @@ -71,5 +71,4 @@ public Object copyObject(Object o) { public ByteBuffer convert(Object o) { return o == null ? 
null : ByteBuffer.wrap((byte[]) o); } - } diff --git a/mr/src/main/java/org/apache/iceberg/mr/hive/serde/objectinspector/IcebergDateObjectInspector.java b/mr/src/main/java/org/apache/iceberg/mr/hive/serde/objectinspector/IcebergDateObjectInspector.java index 63af550d09ca..17a82f430208 100644 --- a/mr/src/main/java/org/apache/iceberg/mr/hive/serde/objectinspector/IcebergDateObjectInspector.java +++ b/mr/src/main/java/org/apache/iceberg/mr/hive/serde/objectinspector/IcebergDateObjectInspector.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.mr.hive.serde.objectinspector; import java.sql.Date; @@ -59,7 +58,8 @@ public Object copyObject(Object o) { if (o instanceof Date) { return new Date(((Date) o).getTime()); } else if (o instanceof LocalDate) { - return LocalDate.of(((LocalDate) o).getYear(), ((LocalDate) o).getMonth(), ((LocalDate) o).getDayOfMonth()); + return LocalDate.of( + ((LocalDate) o).getYear(), ((LocalDate) o).getMonth(), ((LocalDate) o).getDayOfMonth()); } else { return o; } diff --git a/mr/src/main/java/org/apache/iceberg/mr/hive/serde/objectinspector/IcebergDecimalObjectInspector.java b/mr/src/main/java/org/apache/iceberg/mr/hive/serde/objectinspector/IcebergDecimalObjectInspector.java index b30a3fadbc67..20d52ffa5559 100644 --- a/mr/src/main/java/org/apache/iceberg/mr/hive/serde/objectinspector/IcebergDecimalObjectInspector.java +++ b/mr/src/main/java/org/apache/iceberg/mr/hive/serde/objectinspector/IcebergDecimalObjectInspector.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.mr.hive.serde.objectinspector; import com.github.benmanes.caffeine.cache.Cache; @@ -33,9 +32,8 @@ public final class IcebergDecimalObjectInspector extends AbstractPrimitiveJavaObjectInspector implements HiveDecimalObjectInspector, WriteObjectInspector { - private static final Cache CACHE = Caffeine.newBuilder() - .expireAfterAccess(10, TimeUnit.MINUTES) - .build(); + private static final Cache CACHE = + Caffeine.newBuilder().expireAfterAccess(10, TimeUnit.MINUTES).build(); public static IcebergDecimalObjectInspector get(int precision, int scale) { Preconditions.checkArgument(scale < precision); diff --git a/mr/src/main/java/org/apache/iceberg/mr/hive/serde/objectinspector/IcebergFixedObjectInspector.java b/mr/src/main/java/org/apache/iceberg/mr/hive/serde/objectinspector/IcebergFixedObjectInspector.java index be4ce2b9020e..87dbfb1fbd84 100644 --- a/mr/src/main/java/org/apache/iceberg/mr/hive/serde/objectinspector/IcebergFixedObjectInspector.java +++ b/mr/src/main/java/org/apache/iceberg/mr/hive/serde/objectinspector/IcebergFixedObjectInspector.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.mr.hive.serde.objectinspector; import java.util.Arrays; diff --git a/mr/src/main/java/org/apache/iceberg/mr/hive/serde/objectinspector/IcebergObjectInspector.java b/mr/src/main/java/org/apache/iceberg/mr/hive/serde/objectinspector/IcebergObjectInspector.java index 039950213f92..f4014ed6f2a7 100644 --- a/mr/src/main/java/org/apache/iceberg/mr/hive/serde/objectinspector/IcebergObjectInspector.java +++ b/mr/src/main/java/org/apache/iceberg/mr/hive/serde/objectinspector/IcebergObjectInspector.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
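IcebergDecimalObjectInspector above memoizes one inspector per (precision, scale) pair in a Caffeine cache that expires entries ten minutes after their last access. A small sketch of that caching pattern; the Key class and the String "inspector" value are illustrative only.

```java
import com.github.benmanes.caffeine.cache.Cache;
import com.github.benmanes.caffeine.cache.Caffeine;
import java.util.Objects;
import java.util.concurrent.TimeUnit;

final class InspectorCacheSketch {
  // Illustrative cache key: one cached value per (precision, scale) pair.
  static final class Key {
    final int precision;
    final int scale;

    Key(int precision, int scale) {
      this.precision = precision;
      this.scale = scale;
    }

    @Override
    public boolean equals(Object o) {
      return o instanceof Key && ((Key) o).precision == precision && ((Key) o).scale == scale;
    }

    @Override
    public int hashCode() {
      return Objects.hash(precision, scale);
    }
  }

  // Entries that have not been read for 10 minutes are evicted, as in the diff above.
  private static final Cache<Key, String> CACHE =
      Caffeine.newBuilder().expireAfterAccess(10, TimeUnit.MINUTES).build();

  static String inspectorFor(int precision, int scale) {
    // get() computes the value at most once per live key; here the "inspector" is just a string.
    return CACHE.get(new Key(precision, scale), k -> "decimal(" + k.precision + "," + k.scale + ")");
  }
}
```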
*/ - package org.apache.iceberg.mr.hive.serde.objectinspector; import java.util.List; @@ -36,33 +35,31 @@ public final class IcebergObjectInspector extends TypeUtil.SchemaVisitor { // get the correct inspectors depending on whether we're working with Hive2 or Hive3 dependencies - // we need to do this because there is a breaking API change in Date/TimestampObjectInspector between Hive2 and Hive3 - private static final String DATE_INSPECTOR_CLASS = MetastoreUtil.hive3PresentOnClasspath() ? - "org.apache.iceberg.mr.hive.serde.objectinspector.IcebergDateObjectInspectorHive3" : - "org.apache.iceberg.mr.hive.serde.objectinspector.IcebergDateObjectInspector"; - - public static final ObjectInspector DATE_INSPECTOR = DynMethods.builder("get") - .impl(DATE_INSPECTOR_CLASS) - .buildStatic() - .invoke(); - - private static final String TIMESTAMP_INSPECTOR_CLASS = MetastoreUtil.hive3PresentOnClasspath() ? - "org.apache.iceberg.mr.hive.serde.objectinspector.IcebergTimestampObjectInspectorHive3" : - "org.apache.iceberg.mr.hive.serde.objectinspector.IcebergTimestampObjectInspector"; - - private static final String TIMESTAMPTZ_INSPECTOR_CLASS = MetastoreUtil.hive3PresentOnClasspath() ? - "org.apache.iceberg.mr.hive.serde.objectinspector.IcebergTimestampWithZoneObjectInspectorHive3" : - "org.apache.iceberg.mr.hive.serde.objectinspector.IcebergTimestampWithZoneObjectInspector"; - - public static final ObjectInspector TIMESTAMP_INSPECTOR = DynMethods.builder("get") - .impl(TIMESTAMP_INSPECTOR_CLASS) - .buildStatic() - .invoke(); - - public static final ObjectInspector TIMESTAMP_INSPECTOR_WITH_TZ = DynMethods.builder("get") - .impl(TIMESTAMPTZ_INSPECTOR_CLASS) - .buildStatic() - .invoke(); + // we need to do this because there is a breaking API change in Date/TimestampObjectInspector + // between Hive2 and Hive3 + private static final String DATE_INSPECTOR_CLASS = + MetastoreUtil.hive3PresentOnClasspath() + ? "org.apache.iceberg.mr.hive.serde.objectinspector.IcebergDateObjectInspectorHive3" + : "org.apache.iceberg.mr.hive.serde.objectinspector.IcebergDateObjectInspector"; + + public static final ObjectInspector DATE_INSPECTOR = + DynMethods.builder("get").impl(DATE_INSPECTOR_CLASS).buildStatic().invoke(); + + private static final String TIMESTAMP_INSPECTOR_CLASS = + MetastoreUtil.hive3PresentOnClasspath() + ? "org.apache.iceberg.mr.hive.serde.objectinspector.IcebergTimestampObjectInspectorHive3" + : "org.apache.iceberg.mr.hive.serde.objectinspector.IcebergTimestampObjectInspector"; + + private static final String TIMESTAMPTZ_INSPECTOR_CLASS = + MetastoreUtil.hive3PresentOnClasspath() + ? 
"org.apache.iceberg.mr.hive.serde.objectinspector.IcebergTimestampWithZoneObjectInspectorHive3" + : "org.apache.iceberg.mr.hive.serde.objectinspector.IcebergTimestampWithZoneObjectInspector"; + + public static final ObjectInspector TIMESTAMP_INSPECTOR = + DynMethods.builder("get").impl(TIMESTAMP_INSPECTOR_CLASS).buildStatic().invoke(); + + public static final ObjectInspector TIMESTAMP_INSPECTOR_WITH_TZ = + DynMethods.builder("get").impl(TIMESTAMPTZ_INSPECTOR_CLASS).buildStatic().invoke(); public static ObjectInspector create(@Nullable Schema schema) { if (schema == null) { @@ -87,9 +84,12 @@ public ObjectInspector list(Types.ListType listTypeInfo, ObjectInspector listObj } @Override - public ObjectInspector map(Types.MapType mapType, - ObjectInspector keyObjectInspector, ObjectInspector valueObjectInspector) { - return ObjectInspectorFactory.getStandardMapObjectInspector(keyObjectInspector, valueObjectInspector); + public ObjectInspector map( + Types.MapType mapType, + ObjectInspector keyObjectInspector, + ObjectInspector valueObjectInspector) { + return ObjectInspectorFactory.getStandardMapObjectInspector( + keyObjectInspector, valueObjectInspector); } @Override @@ -144,8 +144,8 @@ public ObjectInspector schema(Schema schema, ObjectInspector structObjectInspect } @Override - public ObjectInspector struct(Types.StructType structType, List fieldObjectInspectors) { + public ObjectInspector struct( + Types.StructType structType, List fieldObjectInspectors) { return new IcebergRecordObjectInspector(structType, fieldObjectInspectors); } - } diff --git a/mr/src/main/java/org/apache/iceberg/mr/hive/serde/objectinspector/IcebergRecordObjectInspector.java b/mr/src/main/java/org/apache/iceberg/mr/hive/serde/objectinspector/IcebergRecordObjectInspector.java index 58037f91c9b9..b5204068c726 100644 --- a/mr/src/main/java/org/apache/iceberg/mr/hive/serde/objectinspector/IcebergRecordObjectInspector.java +++ b/mr/src/main/java/org/apache/iceberg/mr/hive/serde/objectinspector/IcebergRecordObjectInspector.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.mr.hive.serde.objectinspector; import java.util.Collections; @@ -35,11 +34,12 @@ public final class IcebergRecordObjectInspector extends StructObjectInspector { private static final IcebergRecordObjectInspector EMPTY = - new IcebergRecordObjectInspector(Types.StructType.of(), Collections.emptyList()); + new IcebergRecordObjectInspector(Types.StructType.of(), Collections.emptyList()); private final List structFields; - public IcebergRecordObjectInspector(Types.StructType structType, List objectInspectors) { + public IcebergRecordObjectInspector( + Types.StructType structType, List objectInspectors) { Preconditions.checkArgument(structType.fields().size() == objectInspectors.size()); this.structFields = Lists.newArrayListWithExpectedSize(structType.fields().size()); @@ -48,9 +48,15 @@ public IcebergRecordObjectInspector(Types.StructType structType, List getStructFieldsDataAsList(Object o) { } Record record = (Record) o; - return structFields - .stream() - .map(f -> record.get(f.position())) - .collect(Collectors.toList()); + return structFields.stream().map(f -> record.get(f.position())).collect(Collectors.toList()); } @Override @@ -175,7 +178,5 @@ public boolean equals(Object o) { public int hashCode() { return Objects.hash(field, oi, position); } - } - } diff --git a/mr/src/main/java/org/apache/iceberg/mr/hive/serde/objectinspector/IcebergTimeObjectInspector.java b/mr/src/main/java/org/apache/iceberg/mr/hive/serde/objectinspector/IcebergTimeObjectInspector.java index f38ec5b30957..a2e311489fc3 100644 --- a/mr/src/main/java/org/apache/iceberg/mr/hive/serde/objectinspector/IcebergTimeObjectInspector.java +++ b/mr/src/main/java/org/apache/iceberg/mr/hive/serde/objectinspector/IcebergTimeObjectInspector.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.mr.hive.serde.objectinspector; import java.time.LocalTime; diff --git a/mr/src/main/java/org/apache/iceberg/mr/hive/serde/objectinspector/IcebergTimestampObjectInspector.java b/mr/src/main/java/org/apache/iceberg/mr/hive/serde/objectinspector/IcebergTimestampObjectInspector.java index 78f7fa9db163..08c74c9afa4a 100644 --- a/mr/src/main/java/org/apache/iceberg/mr/hive/serde/objectinspector/IcebergTimestampObjectInspector.java +++ b/mr/src/main/java/org/apache/iceberg/mr/hive/serde/objectinspector/IcebergTimestampObjectInspector.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
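IcebergRecordObjectInspector above resolves struct fields purely by position and reads each value with record.get(position). A toy version of that positional access; Record and Field here are simplified stand-ins rather than Iceberg types.

```java
import java.util.ArrayList;
import java.util.List;
import java.util.stream.Collectors;

final class PositionalStructSketch {
  // Simplified stand-in for an Iceberg Record: values addressed by field position.
  interface Record {
    Object get(int position);
  }

  static final class Field {
    final String name;
    final int position;

    Field(String name, int position) {
      this.name = name;
      this.position = position;
    }
  }

  // Mirrors getStructFieldsDataAsList: map every field to the value at its position.
  static List<Object> fieldValues(List<Field> fields, Record record) {
    return fields.stream().map(f -> record.get(f.position)).collect(Collectors.toList());
  }

  public static void main(String[] args) {
    List<Field> fields = new ArrayList<>();
    fields.add(new Field("id", 0));
    fields.add(new Field("name", 1));
    Object[] row = {42L, "alice"};
    System.out.println(fieldValues(fields, pos -> row[pos])); // prints [42, alice]
  }
}
```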
*/ - package org.apache.iceberg.mr.hive.serde.objectinspector; import java.sql.Timestamp; @@ -29,7 +28,8 @@ public class IcebergTimestampObjectInspector extends AbstractPrimitiveJavaObjectInspector implements TimestampObjectInspector, WriteObjectInspector { - private static final IcebergTimestampObjectInspector INSTANCE = new IcebergTimestampObjectInspector(); + private static final IcebergTimestampObjectInspector INSTANCE = + new IcebergTimestampObjectInspector(); public static IcebergTimestampObjectInspector get() { return INSTANCE; diff --git a/mr/src/main/java/org/apache/iceberg/mr/hive/serde/objectinspector/IcebergTimestampWithZoneObjectInspector.java b/mr/src/main/java/org/apache/iceberg/mr/hive/serde/objectinspector/IcebergTimestampWithZoneObjectInspector.java index b708e4e9c90c..f315b0b6d8ea 100644 --- a/mr/src/main/java/org/apache/iceberg/mr/hive/serde/objectinspector/IcebergTimestampWithZoneObjectInspector.java +++ b/mr/src/main/java/org/apache/iceberg/mr/hive/serde/objectinspector/IcebergTimestampWithZoneObjectInspector.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.mr.hive.serde.objectinspector; import java.sql.Timestamp; @@ -30,7 +29,8 @@ public class IcebergTimestampWithZoneObjectInspector extends AbstractPrimitiveJavaObjectInspector implements TimestampObjectInspector, WriteObjectInspector { - private static final IcebergTimestampWithZoneObjectInspector INSTANCE = new IcebergTimestampWithZoneObjectInspector(); + private static final IcebergTimestampWithZoneObjectInspector INSTANCE = + new IcebergTimestampWithZoneObjectInspector(); public static IcebergTimestampWithZoneObjectInspector get() { return INSTANCE; @@ -70,5 +70,4 @@ public Object copyObject(Object o) { return o; } } - } diff --git a/mr/src/main/java/org/apache/iceberg/mr/hive/serde/objectinspector/IcebergUUIDObjectInspector.java b/mr/src/main/java/org/apache/iceberg/mr/hive/serde/objectinspector/IcebergUUIDObjectInspector.java index e5c44f32a091..21ac71f72cfb 100644 --- a/mr/src/main/java/org/apache/iceberg/mr/hive/serde/objectinspector/IcebergUUIDObjectInspector.java +++ b/mr/src/main/java/org/apache/iceberg/mr/hive/serde/objectinspector/IcebergUUIDObjectInspector.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.mr.hive.serde.objectinspector; import java.util.UUID; diff --git a/mr/src/main/java/org/apache/iceberg/mr/hive/serde/objectinspector/WriteObjectInspector.java b/mr/src/main/java/org/apache/iceberg/mr/hive/serde/objectinspector/WriteObjectInspector.java index 2092dc03418a..a6e112335fe4 100644 --- a/mr/src/main/java/org/apache/iceberg/mr/hive/serde/objectinspector/WriteObjectInspector.java +++ b/mr/src/main/java/org/apache/iceberg/mr/hive/serde/objectinspector/WriteObjectInspector.java @@ -16,13 +16,12 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.mr.hive.serde.objectinspector; /** - * Interface for converting the Hive primitive objects for the objects which could be added to an Iceberg Record. - * If the IcebergObjectInspector does not implement this then the default Hive primitive objects will be used without - * conversion. + * Interface for converting the Hive primitive objects for the objects which could be added to an + * Iceberg Record. If the IcebergObjectInspector does not implement this then the default Hive + * primitive objects will be used without conversion. 
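WriteObjectInspector, described just above, gives an inspector a hook to convert a Hive primitive into the value Iceberg writers expect; inspectors that do not implement it pass the Hive value through unchanged. A hedged example of one such conversion, java.sql.Timestamp to LocalDateTime, which is roughly what the Hive 2 timestamp inspector in this diff does:

```java
import java.sql.Timestamp;
import java.time.LocalDateTime;

final class WriteConversionSketch {
  // Simplified version of the WriteObjectInspector contract: one conversion hook.
  interface WriteConverter {
    Object convert(Object value);
  }

  // Hive hands the SerDe a java.sql.Timestamp; Iceberg's generic writers take LocalDateTime.
  static final WriteConverter TIMESTAMP_CONVERTER =
      value -> value == null ? null : ((Timestamp) value).toLocalDateTime();

  public static void main(String[] args) {
    Object converted = TIMESTAMP_CONVERTER.convert(Timestamp.valueOf("2021-06-01 12:30:00"));
    System.out.println(converted instanceof LocalDateTime); // true
  }
}
```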
*/ public interface WriteObjectInspector { Object convert(Object value); diff --git a/mr/src/main/java/org/apache/iceberg/mr/mapred/AbstractMapredIcebergRecordReader.java b/mr/src/main/java/org/apache/iceberg/mr/mapred/AbstractMapredIcebergRecordReader.java index 022dc513fb85..69d64f12bafa 100644 --- a/mr/src/main/java/org/apache/iceberg/mr/mapred/AbstractMapredIcebergRecordReader.java +++ b/mr/src/main/java/org/apache/iceberg/mr/mapred/AbstractMapredIcebergRecordReader.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.mr.mapred; import java.io.IOException; @@ -31,8 +30,12 @@ public abstract class AbstractMapredIcebergRecordReader implements RecordRead protected final org.apache.hadoop.mapreduce.RecordReader innerReader; - public AbstractMapredIcebergRecordReader(org.apache.iceberg.mr.mapreduce.IcebergInputFormat mapreduceInputFormat, - IcebergSplit split, JobConf job, Reporter reporter) throws IOException { + public AbstractMapredIcebergRecordReader( + org.apache.iceberg.mr.mapreduce.IcebergInputFormat mapreduceInputFormat, + IcebergSplit split, + JobConf job, + Reporter reporter) + throws IOException { TaskAttemptContext context = MapredIcebergInputFormat.newTaskAttemptContext(job, reporter); try { @@ -42,7 +45,6 @@ public AbstractMapredIcebergRecordReader(org.apache.iceberg.mr.mapreduce.Iceberg Thread.currentThread().interrupt(); throw new RuntimeException(e); } - } @Override @@ -66,5 +68,4 @@ public void close() throws IOException { innerReader.close(); } } - } diff --git a/mr/src/main/java/org/apache/iceberg/mr/mapred/Container.java b/mr/src/main/java/org/apache/iceberg/mr/mapred/Container.java index fda174ec9e3a..dc518b471292 100644 --- a/mr/src/main/java/org/apache/iceberg/mr/mapred/Container.java +++ b/mr/src/main/java/org/apache/iceberg/mr/mapred/Container.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.mr.mapred; import java.io.DataInput; diff --git a/mr/src/main/java/org/apache/iceberg/mr/mapred/MapredIcebergInputFormat.java b/mr/src/main/java/org/apache/iceberg/mr/mapred/MapredIcebergInputFormat.java index 60db53bcfecc..4c329c595959 100644 --- a/mr/src/main/java/org/apache/iceberg/mr/mapred/MapredIcebergInputFormat.java +++ b/mr/src/main/java/org/apache/iceberg/mr/mapred/MapredIcebergInputFormat.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.mr.mapred; import java.io.IOException; @@ -54,8 +53,8 @@ public MapredIcebergInputFormat() { } /** - * Configures the {@code JobConf} to use the {@code MapredIcebergInputFormat} and - * returns a helper to add further configuration. + * Configures the {@code JobConf} to use the {@code MapredIcebergInputFormat} and returns a helper + * to add further configuration. 
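The mapred layer above hands rows to Hive inside a Container, because the old RecordReader API expects a mutable value object that is created once and refilled on every next() call. A minimal holder of that shape is sketched below; the real org.apache.iceberg.mr.mapred.Container additionally implements Hadoop's Writable.

```java
// Minimal value-holder sketch; not the actual Iceberg Container class.
final class ContainerSketch<T> {
  private T value;

  public T get() {
    return value;
  }

  public void set(T newValue) {
    this.value = newValue;
  }

  public static void main(String[] args) {
    ContainerSketch<String> container = new ContainerSketch<>();
    container.set("row-1"); // the reader reuses one container and only swaps the payload
    System.out.println(container.get());
  }
}
```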
* * @param job the {@code JobConf} to configure */ @@ -66,26 +65,29 @@ public static InputFormatConfig.ConfigBuilder configure(JobConf job) { @Override public InputSplit[] getSplits(JobConf job, int numSplits) throws IOException { - return innerInputFormat.getSplits(newJobContext(job)) - .stream() - .map(InputSplit.class::cast) - .toArray(InputSplit[]::new); + return innerInputFormat.getSplits(newJobContext(job)).stream() + .map(InputSplit.class::cast) + .toArray(InputSplit[]::new); } @Override - public RecordReader> getRecordReader(InputSplit split, JobConf job, - Reporter reporter) throws IOException { + public RecordReader> getRecordReader( + InputSplit split, JobConf job, Reporter reporter) throws IOException { IcebergSplit icebergSplit = ((IcebergSplitContainer) split).icebergSplit(); return new MapredIcebergRecordReader<>(innerInputFormat, icebergSplit, job, reporter); } - - private static final class MapredIcebergRecordReader extends AbstractMapredIcebergRecordReader> { + private static final class MapredIcebergRecordReader + extends AbstractMapredIcebergRecordReader> { private final long splitLength; // for getPos() - MapredIcebergRecordReader(org.apache.iceberg.mr.mapreduce.IcebergInputFormat mapreduceInputFormat, - IcebergSplit split, JobConf job, Reporter reporter) throws IOException { + MapredIcebergRecordReader( + org.apache.iceberg.mr.mapreduce.IcebergInputFormat mapreduceInputFormat, + IcebergSplit split, + JobConf job, + Reporter reporter) + throws IOException { super(mapreduceInputFormat, split, job, reporter); splitLength = split.getLength(); } @@ -114,18 +116,17 @@ public Container createValue() { public long getPos() throws IOException { return (long) (splitLength * getProgress()); } - } private static JobContext newJobContext(JobConf job) { - JobID jobID = Optional.ofNullable(JobID.forName(job.get(JobContext.ID))) - .orElseGet(JobID::new); + JobID jobID = Optional.ofNullable(JobID.forName(job.get(JobContext.ID))).orElseGet(JobID::new); return new JobContextImpl(job, jobID); } public static TaskAttemptContext newTaskAttemptContext(JobConf job, Reporter reporter) { - TaskAttemptID taskAttemptID = Optional.ofNullable(TaskAttemptID.forName(job.get(JobContext.TASK_ATTEMPT_ID))) + TaskAttemptID taskAttemptID = + Optional.ofNullable(TaskAttemptID.forName(job.get(JobContext.TASK_ATTEMPT_ID))) .orElseGet(TaskAttemptID::new); return new CompatibilityTaskAttemptContextImpl(job, taskAttemptID, reporter); @@ -136,7 +137,8 @@ public static class CompatibilityTaskAttemptContextImpl extends TaskAttemptConte private final Reporter legacyReporter; - public CompatibilityTaskAttemptContextImpl(Configuration conf, TaskAttemptID taskId, Reporter reporter) { + public CompatibilityTaskAttemptContextImpl( + Configuration conf, TaskAttemptID taskId, Reporter reporter) { super(conf, taskId, toStatusReporter(reporter)); this.legacyReporter = reporter; } diff --git a/mr/src/main/java/org/apache/iceberg/mr/mapreduce/IcebergInputFormat.java b/mr/src/main/java/org/apache/iceberg/mr/mapreduce/IcebergInputFormat.java index ccfbb6a31006..8c31723da55c 100644 --- a/mr/src/main/java/org/apache/iceberg/mr/mapreduce/IcebergInputFormat.java +++ b/mr/src/main/java/org/apache/iceberg/mr/mapreduce/IcebergInputFormat.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.mr.mapreduce; import java.io.IOException; @@ -83,12 +82,13 @@ /** * Generic Mrv2 InputFormat API for Iceberg. 
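MapredIcebergRecordReader above has no real byte offset to report for getPos(), so it estimates one as the split length scaled by the reader's progress fraction. The arithmetic is tiny but worth spelling out; the numbers below are illustrative.

```java
final class PosEstimateSketch {
  // Estimate a byte position the way the mapred reader in the diff does:
  // position ~= total split length * fraction of the split already consumed.
  static long estimatePos(long splitLength, float progress) {
    return (long) (splitLength * progress);
  }

  public static void main(String[] args) {
    // A 128 MiB split that is 25% consumed reports roughly 33554432 bytes.
    System.out.println(estimatePos(128L * 1024 * 1024, 0.25f));
  }
}
```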
* - * @param T is the in memory data model which can either be Pig tuples, Hive rows. Default is Iceberg records + * @param T is the in memory data model which can either be Pig tuples, Hive rows. Default is + * Iceberg records */ public class IcebergInputFormat extends InputFormat { /** - * Configures the {@code Job} to use the {@code IcebergInputFormat} and - * returns a helper to add further configuration. + * Configures the {@code Job} to use the {@code IcebergInputFormat} and returns a helper to add + * further configuration. * * @param job the {@code Job} to configure */ @@ -100,12 +100,17 @@ public static InputFormatConfig.ConfigBuilder configure(Job job) { @Override public List getSplits(JobContext context) { Configuration conf = context.getConfiguration(); - Table table = Optional - .ofNullable(HiveIcebergStorageHandler.table(conf, conf.get(InputFormatConfig.TABLE_IDENTIFIER))) - .orElseGet(() -> Catalogs.loadTable(conf)); - - TableScan scan = table.newScan() - .caseSensitive(conf.getBoolean(InputFormatConfig.CASE_SENSITIVE, InputFormatConfig.CASE_SENSITIVE_DEFAULT)); + Table table = + Optional.ofNullable( + HiveIcebergStorageHandler.table(conf, conf.get(InputFormatConfig.TABLE_IDENTIFIER))) + .orElseGet(() -> Catalogs.loadTable(conf)); + + TableScan scan = + table + .newScan() + .caseSensitive( + conf.getBoolean( + InputFormatConfig.CASE_SENSITIVE, InputFormatConfig.CASE_SENSITIVE_DEFAULT)); long snapshotId = conf.getLong(InputFormatConfig.SNAPSHOT_ID, -1); if (snapshotId != -1) { scan = scan.useSnapshot(snapshotId); @@ -128,32 +133,39 @@ public List getSplits(JobContext context) { } // TODO add a filter parser to get rid of Serialization - Expression filter = SerializationUtil.deserializeFromBase64(conf.get(InputFormatConfig.FILTER_EXPRESSION)); + Expression filter = + SerializationUtil.deserializeFromBase64(conf.get(InputFormatConfig.FILTER_EXPRESSION)); if (filter != null) { scan = scan.filter(filter); } List splits = Lists.newArrayList(); boolean applyResidual = !conf.getBoolean(InputFormatConfig.SKIP_RESIDUAL_FILTERING, false); - InputFormatConfig.InMemoryDataModel model = conf.getEnum(InputFormatConfig.IN_MEMORY_DATA_MODEL, - InputFormatConfig.InMemoryDataModel.GENERIC); + InputFormatConfig.InMemoryDataModel model = + conf.getEnum( + InputFormatConfig.IN_MEMORY_DATA_MODEL, InputFormatConfig.InMemoryDataModel.GENERIC); try (CloseableIterable tasksIterable = scan.planTasks()) { Table serializableTable = SerializableTable.copyOf(table); - tasksIterable.forEach(task -> { - if (applyResidual && (model == InputFormatConfig.InMemoryDataModel.HIVE || - model == InputFormatConfig.InMemoryDataModel.PIG)) { - // TODO: We do not support residual evaluation for HIVE and PIG in memory data model yet - checkResiduals(task); - } - splits.add(new IcebergSplit(serializableTable, conf, task)); - }); + tasksIterable.forEach( + task -> { + if (applyResidual + && (model == InputFormatConfig.InMemoryDataModel.HIVE + || model == InputFormatConfig.InMemoryDataModel.PIG)) { + // TODO: We do not support residual evaluation for HIVE and PIG in memory data model + // yet + checkResiduals(task); + } + splits.add(new IcebergSplit(serializableTable, conf, task)); + }); } catch (IOException e) { throw new UncheckedIOException(String.format("Failed to close table scan: %s", scan), e); } // if enabled, do not serialize FileIO hadoop config to decrease split size - // However, do not skip serialization for metatable queries, because some metadata tasks cache the IO object and we - // wouldn't be able to 
inject the config into these tasks on the deserializer-side, unlike for standard queries + // However, do not skip serialization for metatable queries, because some metadata tasks cache + // the IO object and we + // wouldn't be able to inject the config into these tasks on the deserializer-side, unlike for + // standard queries if (scan instanceof DataTableScan) { HiveIcebergStorageHandler.checkAndSkipIoConfigSerialization(conf, table); } @@ -162,15 +174,18 @@ public List getSplits(JobContext context) { } private static void checkResiduals(CombinedScanTask task) { - task.files().forEach(fileScanTask -> { - Expression residual = fileScanTask.residual(); - if (residual != null && !residual.equals(Expressions.alwaysTrue())) { - throw new UnsupportedOperationException( - String.format( - "Filter expression %s is not completely satisfied. Additional rows " + - "can be returned not satisfied by the filter expression", residual)); - } - }); + task.files() + .forEach( + fileScanTask -> { + Expression residual = fileScanTask.residual(); + if (residual != null && !residual.equals(Expressions.alwaysTrue())) { + throw new UnsupportedOperationException( + String.format( + "Filter expression %s is not completely satisfied. Additional rows " + + "can be returned not satisfied by the filter expression", + residual)); + } + }); } @Override @@ -180,18 +195,21 @@ public RecordReader createRecordReader(InputSplit split, TaskAttemptCon private static final class IcebergRecordReader extends RecordReader { - private static final String HIVE_VECTORIZED_READER_CLASS = "org.apache.iceberg.mr.hive.vector.HiveVectorizedReader"; + private static final String HIVE_VECTORIZED_READER_CLASS = + "org.apache.iceberg.mr.hive.vector.HiveVectorizedReader"; private static final DynMethods.StaticMethod HIVE_VECTORIZED_READER_BUILDER; static { if (MetastoreUtil.hive3PresentOnClasspath()) { - HIVE_VECTORIZED_READER_BUILDER = DynMethods.builder("reader") - .impl(HIVE_VECTORIZED_READER_CLASS, - InputFile.class, - FileScanTask.class, - Map.class, - TaskAttemptContext.class) - .buildStatic(); + HIVE_VECTORIZED_READER_BUILDER = + DynMethods.builder("reader") + .impl( + HIVE_VECTORIZED_READER_CLASS, + InputFile.class, + FileScanTask.class, + Map.class, + TaskAttemptContext.class) + .buildStatic(); } else { HIVE_VECTORIZED_READER_BUILDER = null; } @@ -213,7 +231,8 @@ private static final class IcebergRecordReader extends RecordReader @Override public void initialize(InputSplit split, TaskAttemptContext newContext) { Configuration conf = newContext.getConfiguration(); - // For now IcebergInputFormat does its own split planning and does not accept FileSplit instances + // For now IcebergInputFormat does its own split planning and does not accept FileSplit + // instances CombinedScanTask task = ((IcebergSplit) split).task(); this.context = newContext; Table table = ((IcebergSplit) split).table(); @@ -223,11 +242,14 @@ public void initialize(InputSplit split, TaskAttemptContext newContext) { this.tasks = task.files().iterator(); this.tableSchema = InputFormatConfig.tableSchema(conf); this.nameMapping = table.properties().get(TableProperties.DEFAULT_NAME_MAPPING); - this.caseSensitive = conf.getBoolean(InputFormatConfig.CASE_SENSITIVE, InputFormatConfig.CASE_SENSITIVE_DEFAULT); + this.caseSensitive = + conf.getBoolean( + InputFormatConfig.CASE_SENSITIVE, InputFormatConfig.CASE_SENSITIVE_DEFAULT); this.expectedSchema = readSchema(conf, tableSchema, caseSensitive); this.reuseContainers = conf.getBoolean(InputFormatConfig.REUSE_CONTAINERS, 
false); - this.inMemoryDataModel = conf.getEnum(InputFormatConfig.IN_MEMORY_DATA_MODEL, - InputFormatConfig.InMemoryDataModel.GENERIC); + this.inMemoryDataModel = + conf.getEnum( + InputFormatConfig.IN_MEMORY_DATA_MODEL, InputFormatConfig.InMemoryDataModel.GENERIC); this.currentIterator = open(tasks.next(), expectedSchema).iterator(); } @@ -259,11 +281,16 @@ public T getCurrentValue() { @Override public float getProgress() { - // TODO: We could give a more accurate progress based on records read from the file. Context.getProgress does not - // have enough information to give an accurate progress value. This isn't that easy, since we don't know how much - // of the input split has been processed and we are pushing filters into Parquet and ORC. But we do know when a - // file is opened and could count the number of rows returned, so we can estimate. And we could also add a row - // count to the readers so that we can get an accurate count of rows that have been either returned or filtered + // TODO: We could give a more accurate progress based on records read from the file. + // Context.getProgress does not + // have enough information to give an accurate progress value. This isn't that easy, since we + // don't know how much + // of the input split has been processed and we are pushing filters into Parquet and ORC. But + // we do know when a + // file is opened and could count the number of rows returned, so we can estimate. And we + // could also add a row + // count to the readers so that we can get an accurate count of rows that have been either + // returned or filtered // out. return context.getProgress(); } @@ -275,9 +302,10 @@ public void close() throws IOException { private CloseableIterable openTask(FileScanTask currentTask, Schema readSchema) { DataFile file = currentTask.file(); - InputFile inputFile = encryptionManager.decrypt(EncryptedFiles.encryptedInput( - io.newInputFile(file.path().toString()), - file.keyMetadata())); + InputFile inputFile = + encryptionManager.decrypt( + EncryptedFiles.encryptedInput( + io.newInputFile(file.path().toString()), file.keyMetadata())); CloseableIterable iterable; switch (file.format()) { @@ -315,16 +343,18 @@ private CloseableIterable open(FileScanTask currentTask, Schema readSchema) { } } - private CloseableIterable applyResidualFiltering(CloseableIterable iter, Expression residual, - Schema readSchema) { - boolean applyResidual = !context.getConfiguration().getBoolean(InputFormatConfig.SKIP_RESIDUAL_FILTERING, false); + private CloseableIterable applyResidualFiltering( + CloseableIterable iter, Expression residual, Schema readSchema) { + boolean applyResidual = + !context.getConfiguration().getBoolean(InputFormatConfig.SKIP_RESIDUAL_FILTERING, false); if (applyResidual && residual != null && residual != Expressions.alwaysTrue()) { // Date and timestamp values are not the correct type for Evaluator. // Wrapping to return the expected type. 
InternalRecordWrapper wrapper = new InternalRecordWrapper(readSchema.asStruct()); Evaluator filter = new Evaluator(readSchema.asStruct(), residual, caseSensitive); - return CloseableIterable.filter(iter, record -> filter.eval(wrapper.wrap((StructLike) record))); + return CloseableIterable.filter( + iter, record -> filter.eval(wrapper.wrap((StructLike) record))); } else { return iter; } @@ -332,9 +362,8 @@ private CloseableIterable applyResidualFiltering(CloseableIterable iter, E private CloseableIterable newAvroIterable( InputFile inputFile, FileScanTask task, Schema readSchema) { - Avro.ReadBuilder avroReadBuilder = Avro.read(inputFile) - .project(readSchema) - .split(task.start(), task.length()); + Avro.ReadBuilder avroReadBuilder = + Avro.read(inputFile).project(readSchema).split(task.start(), task.length()); if (reuseContainers) { avroReadBuilder.reuseContainers(); } @@ -346,18 +375,23 @@ private CloseableIterable newAvroIterable( case PIG: case HIVE: // TODO implement value readers for Pig and Hive - throw new UnsupportedOperationException("Avro support not yet supported for Pig and Hive"); + throw new UnsupportedOperationException( + "Avro support not yet supported for Pig and Hive"); case GENERIC: avroReadBuilder.createReaderFunc( (expIcebergSchema, expAvroSchema) -> - DataReader.create(expIcebergSchema, expAvroSchema, + DataReader.create( + expIcebergSchema, + expAvroSchema, constantsMap(task, IdentityPartitionConverters::convertConstant))); } return applyResidualFiltering(avroReadBuilder.build(), task.residual(), readSchema); } - private CloseableIterable newParquetIterable(InputFile inputFile, FileScanTask task, Schema readSchema) { - Map idToConstant = constantsMap(task, IdentityPartitionConverters::convertConstant); + private CloseableIterable newParquetIterable( + InputFile inputFile, FileScanTask task, Schema readSchema) { + Map idToConstant = + constantsMap(task, IdentityPartitionConverters::convertConstant); CloseableIterable parquetIterator = null; switch (inMemoryDataModel) { @@ -365,17 +399,20 @@ private CloseableIterable newParquetIterable(InputFile inputFile, FileScanTas throw new UnsupportedOperationException("Parquet support not yet supported for Pig"); case HIVE: if (MetastoreUtil.hive3PresentOnClasspath()) { - parquetIterator = HIVE_VECTORIZED_READER_BUILDER.invoke(inputFile, task, idToConstant, context); + parquetIterator = + HIVE_VECTORIZED_READER_BUILDER.invoke(inputFile, task, idToConstant, context); } else { - throw new UnsupportedOperationException("Vectorized read is unsupported for Hive 2 integration."); + throw new UnsupportedOperationException( + "Vectorized read is unsupported for Hive 2 integration."); } break; case GENERIC: - Parquet.ReadBuilder parquetReadBuilder = Parquet.read(inputFile) - .project(readSchema) - .filter(task.residual()) - .caseSensitive(caseSensitive) - .split(task.start(), task.length()); + Parquet.ReadBuilder parquetReadBuilder = + Parquet.read(inputFile) + .project(readSchema) + .filter(task.residual()) + .caseSensitive(caseSensitive) + .split(task.start(), task.length()); if (reuseContainers) { parquetReadBuilder.reuseContainers(); } @@ -383,17 +420,23 @@ private CloseableIterable newParquetIterable(InputFile inputFile, FileScanTas parquetReadBuilder.withNameMapping(NameMappingParser.fromJson(nameMapping)); } parquetReadBuilder.createReaderFunc( - fileSchema -> GenericParquetReaders.buildReader( - readSchema, fileSchema, constantsMap(task, IdentityPartitionConverters::convertConstant))); + fileSchema -> + 
GenericParquetReaders.buildReader( + readSchema, + fileSchema, + constantsMap(task, IdentityPartitionConverters::convertConstant))); parquetIterator = parquetReadBuilder.build(); } return applyResidualFiltering(parquetIterator, task.residual(), readSchema); } - private CloseableIterable newOrcIterable(InputFile inputFile, FileScanTask task, Schema readSchema) { - Map idToConstant = constantsMap(task, IdentityPartitionConverters::convertConstant); - Schema readSchemaWithoutConstantAndMetadataFields = TypeUtil.selectNot(readSchema, - Sets.union(idToConstant.keySet(), MetadataColumns.metadataFieldIds())); + private CloseableIterable newOrcIterable( + InputFile inputFile, FileScanTask task, Schema readSchema) { + Map idToConstant = + constantsMap(task, IdentityPartitionConverters::convertConstant); + Schema readSchemaWithoutConstantAndMetadataFields = + TypeUtil.selectNot( + readSchema, Sets.union(idToConstant.keySet(), MetadataColumns.metadataFieldIds())); CloseableIterable orcIterator = null; // ORC does not support reuse containers yet @@ -403,20 +446,22 @@ private CloseableIterable newOrcIterable(InputFile inputFile, FileScanTask ta throw new UnsupportedOperationException("ORC support not yet supported for Pig"); case HIVE: if (MetastoreUtil.hive3PresentOnClasspath()) { - orcIterator = HIVE_VECTORIZED_READER_BUILDER.invoke(inputFile, task, idToConstant, context); + orcIterator = + HIVE_VECTORIZED_READER_BUILDER.invoke(inputFile, task, idToConstant, context); } else { - throw new UnsupportedOperationException("Vectorized read is unsupported for Hive 2 integration."); + throw new UnsupportedOperationException( + "Vectorized read is unsupported for Hive 2 integration."); } break; case GENERIC: - ORC.ReadBuilder orcReadBuilder = ORC.read(inputFile) - .project(readSchemaWithoutConstantAndMetadataFields) - .filter(task.residual()) - .caseSensitive(caseSensitive) - .split(task.start(), task.length()); + ORC.ReadBuilder orcReadBuilder = + ORC.read(inputFile) + .project(readSchemaWithoutConstantAndMetadataFields) + .filter(task.residual()) + .caseSensitive(caseSensitive) + .split(task.start(), task.length()); orcReadBuilder.createReaderFunc( - fileSchema -> GenericOrcReader.buildReader( - readSchema, fileSchema, idToConstant)); + fileSchema -> GenericOrcReader.buildReader(readSchema, fileSchema, idToConstant)); if (nameMapping != null) { orcReadBuilder.withNameMapping(NameMappingParser.fromJson(nameMapping)); @@ -427,7 +472,8 @@ private CloseableIterable newOrcIterable(InputFile inputFile, FileScanTask ta return applyResidualFiltering(orcIterator, task.residual(), readSchema); } - private Map constantsMap(FileScanTask task, BiFunction converter) { + private Map constantsMap( + FileScanTask task, BiFunction converter) { PartitionSpec spec = task.spec(); Set idColumns = spec.identitySourceIds(); Schema partitionSchema = TypeUtil.select(expectedSchema, idColumns); @@ -439,7 +485,8 @@ private CloseableIterable newOrcIterable(InputFile inputFile, FileScanTask ta } } - private static Schema readSchema(Configuration conf, Schema tableSchema, boolean caseSensitive) { + private static Schema readSchema( + Configuration conf, Schema tableSchema, boolean caseSensitive) { Schema readSchema = InputFormatConfig.readSchema(conf); if (readSchema != null) { @@ -451,8 +498,9 @@ private static Schema readSchema(Configuration conf, Schema tableSchema, boolean return tableSchema; } - return caseSensitive ? 
tableSchema.select(selectedColumns) : tableSchema.caseInsensitiveSelect(selectedColumns); + return caseSensitive + ? tableSchema.select(selectedColumns) + : tableSchema.caseInsensitiveSelect(selectedColumns); } } - } diff --git a/mr/src/main/java/org/apache/iceberg/mr/mapreduce/IcebergSplit.java b/mr/src/main/java/org/apache/iceberg/mr/mapreduce/IcebergSplit.java index 4ed43d3b17d5..e4aeeaee6d3f 100644 --- a/mr/src/main/java/org/apache/iceberg/mr/mapreduce/IcebergSplit.java +++ b/mr/src/main/java/org/apache/iceberg/mr/mapreduce/IcebergSplit.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.mr.mapreduce; import java.io.DataInput; @@ -31,11 +30,13 @@ import org.apache.iceberg.mr.InputFormatConfig; import org.apache.iceberg.util.SerializationUtil; -// Since this class extends `mapreduce.InputSplit and implements `mapred.InputSplit`, it can be returned by both MR v1 +// Since this class extends `mapreduce.InputSplit and implements `mapred.InputSplit`, it can be +// returned by both MR v1 // and v2 file formats. -public class IcebergSplit extends InputSplit implements org.apache.hadoop.mapred.InputSplit, IcebergSplitContainer { +public class IcebergSplit extends InputSplit + implements org.apache.hadoop.mapred.InputSplit, IcebergSplitContainer { - private static final String[] ANYWHERE = new String[]{"*"}; + private static final String[] ANYWHERE = new String[] {"*"}; private Table table; private CombinedScanTask task; @@ -44,8 +45,7 @@ public class IcebergSplit extends InputSplit implements org.apache.hadoop.mapred private transient Configuration conf; // public no-argument constructor for deserialization - public IcebergSplit() { - } + public IcebergSplit() {} IcebergSplit(Table table, Configuration conf, CombinedScanTask task) { this.table = table; diff --git a/mr/src/main/java/org/apache/iceberg/mr/mapreduce/IcebergSplitContainer.java b/mr/src/main/java/org/apache/iceberg/mr/mapreduce/IcebergSplitContainer.java index c77543763b1a..68328cf460f7 100644 --- a/mr/src/main/java/org/apache/iceberg/mr/mapreduce/IcebergSplitContainer.java +++ b/mr/src/main/java/org/apache/iceberg/mr/mapreduce/IcebergSplitContainer.java @@ -16,11 +16,9 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.mr.mapreduce; public interface IcebergSplitContainer { IcebergSplit icebergSplit(); - } diff --git a/mr/src/test/java/org/apache/iceberg/mr/TestCatalogs.java b/mr/src/test/java/org/apache/iceberg/mr/TestCatalogs.java index 4d4a87f03c54..47b633528cf9 100644 --- a/mr/src/test/java/org/apache/iceberg/mr/TestCatalogs.java +++ b/mr/src/test/java/org/apache/iceberg/mr/TestCatalogs.java @@ -16,9 +16,10 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.mr; +import static org.apache.iceberg.types.Types.NestedField.required; + import java.io.IOException; import java.util.Collections; import java.util.Optional; @@ -46,17 +47,15 @@ import org.junit.Test; import org.junit.rules.TemporaryFolder; -import static org.apache.iceberg.types.Types.NestedField.required; - public class TestCatalogs { private static final Schema SCHEMA = new Schema(required(1, "foo", Types.StringType.get())); - private static final PartitionSpec SPEC = PartitionSpec.builderFor(SCHEMA).identity("foo").build(); + private static final PartitionSpec SPEC = + PartitionSpec.builderFor(SCHEMA).identity("foo").build(); private Configuration conf; - @Rule - public TemporaryFolder temp = new TemporaryFolder(); + @Rule public TemporaryFolder temp = new TemporaryFolder(); @Before public void before() { @@ -67,8 +66,10 @@ public void before() { public void testLoadTableFromLocation() throws IOException { conf.set(InputFormatConfig.CATALOG, Catalogs.LOCATION); AssertHelpers.assertThrows( - "Should complain about table location not set", IllegalArgumentException.class, - "location not set", () -> Catalogs.loadTable(conf)); + "Should complain about table location not set", + IllegalArgumentException.class, + "location not set", + () -> Catalogs.loadTable(conf)); HadoopTables tables = new HadoopTables(); Table hadoopTable = tables.create(SCHEMA, temp.newFolder("hadoop_tables").toString()); @@ -85,8 +86,10 @@ public void testLoadTableFromCatalog() throws IOException { setCustomCatalogProperties(defaultCatalogName, warehouseLocation); AssertHelpers.assertThrows( - "Should complain about table identifier not set", IllegalArgumentException.class, - "identifier not set", () -> Catalogs.loadTable(conf)); + "Should complain about table identifier not set", + IllegalArgumentException.class, + "identifier not set", + () -> Catalogs.loadTable(conf)); HadoopCatalog catalog = new CustomHadoopCatalog(conf, warehouseLocation); Table hadoopCatalogTable = catalog.createTable(TableIdentifier.of("table"), SCHEMA); @@ -101,15 +104,19 @@ public void testCreateDropTableToLocation() throws IOException { Properties missingSchema = new Properties(); missingSchema.put("location", temp.newFolder("hadoop_tables").toString()); AssertHelpers.assertThrows( - "Should complain about table schema not set", NullPointerException.class, - "schema not set", () -> Catalogs.createTable(conf, missingSchema)); + "Should complain about table schema not set", + NullPointerException.class, + "schema not set", + () -> Catalogs.createTable(conf, missingSchema)); conf.set(InputFormatConfig.CATALOG, Catalogs.LOCATION); Properties missingLocation = new Properties(); missingLocation.put(InputFormatConfig.TABLE_SCHEMA, SchemaParser.toJson(SCHEMA)); AssertHelpers.assertThrows( - "Should complain about table location not set", NullPointerException.class, - "location not set", () -> Catalogs.createTable(conf, missingLocation)); + "Should complain about table location not set", + NullPointerException.class, + "location not set", + () -> Catalogs.createTable(conf, missingLocation)); Properties properties = new Properties(); properties.put("location", temp.getRoot() + "/hadoop_tables"); @@ -128,16 +135,20 @@ public void testCreateDropTableToLocation() throws IOException { Assert.assertEquals(Collections.singletonMap("dummy", "test"), table.properties()); AssertHelpers.assertThrows( - "Should complain about table location not set", NullPointerException.class, - "location not set", () -> Catalogs.dropTable(conf, 
new Properties())); + "Should complain about table location not set", + NullPointerException.class, + "location not set", + () -> Catalogs.dropTable(conf, new Properties())); Properties dropProperties = new Properties(); dropProperties.put("location", temp.getRoot() + "/hadoop_tables"); Catalogs.dropTable(conf, dropProperties); AssertHelpers.assertThrows( - "Should complain about table not found", NoSuchTableException.class, - "Table does not exist", () -> Catalogs.loadTable(conf, dropProperties)); + "Should complain about table not found", + NoSuchTableException.class, + "Table does not exist", + () -> Catalogs.loadTable(conf, dropProperties)); } @Test @@ -152,15 +163,19 @@ public void testCreateDropTableToCatalog() throws IOException { missingSchema.put("name", identifier.toString()); missingSchema.put(InputFormatConfig.CATALOG_NAME, defaultCatalogName); AssertHelpers.assertThrows( - "Should complain about table schema not set", NullPointerException.class, - "schema not set", () -> Catalogs.createTable(conf, missingSchema)); + "Should complain about table schema not set", + NullPointerException.class, + "schema not set", + () -> Catalogs.createTable(conf, missingSchema)); Properties missingIdentifier = new Properties(); missingIdentifier.put(InputFormatConfig.TABLE_SCHEMA, SchemaParser.toJson(SCHEMA)); missingIdentifier.put(InputFormatConfig.CATALOG_NAME, defaultCatalogName); AssertHelpers.assertThrows( - "Should complain about table identifier not set", NullPointerException.class, - "identifier not set", () -> Catalogs.createTable(conf, missingIdentifier)); + "Should complain about table identifier not set", + NullPointerException.class, + "identifier not set", + () -> Catalogs.createTable(conf, missingIdentifier)); Properties properties = new Properties(); properties.put("name", identifier.toString()); @@ -179,8 +194,10 @@ public void testCreateDropTableToCatalog() throws IOException { Assert.assertEquals(Collections.singletonMap("dummy", "test"), table.properties()); AssertHelpers.assertThrows( - "Should complain about table identifier not set", NullPointerException.class, - "identifier not set", () -> Catalogs.dropTable(conf, new Properties())); + "Should complain about table identifier not set", + NullPointerException.class, + "identifier not set", + () -> Catalogs.dropTable(conf, new Properties())); Properties dropProperties = new Properties(); dropProperties.put("name", identifier.toString()); @@ -188,8 +205,10 @@ public void testCreateDropTableToCatalog() throws IOException { Catalogs.dropTable(conf, dropProperties); AssertHelpers.assertThrows( - "Should complain about table not found", NoSuchTableException.class, - "Table does not exist", () -> Catalogs.loadTable(conf, dropProperties)); + "Should complain about table not found", + NoSuchTableException.class, + "Table does not exist", + () -> Catalogs.loadTable(conf, dropProperties)); } @Test @@ -240,8 +259,10 @@ public void testLegacyLoadCatalogLocation() { public void testLegacyLoadCatalogUnknown() { conf.set(InputFormatConfig.CATALOG, "fooType"); AssertHelpers.assertThrows( - "should complain about catalog not supported", UnsupportedOperationException.class, - "Unknown catalog type", () -> Catalogs.loadCatalog(conf, null)); + "should complain about catalog not supported", + UnsupportedOperationException.class, + "Unknown catalog type", + () -> Catalogs.loadCatalog(conf, null)); } @Test @@ -258,7 +279,8 @@ public void testLoadCatalogDefault() { @Test public void testLoadCatalogHive() { String catalogName = "barCatalog"; - 
conf.set(InputFormatConfig.catalogPropertyConfigKey(catalogName, CatalogUtil.ICEBERG_CATALOG_TYPE), + conf.set( + InputFormatConfig.catalogPropertyConfigKey(catalogName, CatalogUtil.ICEBERG_CATALOG_TYPE), CatalogUtil.ICEBERG_CATALOG_TYPE_HIVE); Optional hiveCatalog = Catalogs.loadCatalog(conf, catalogName); Assert.assertTrue(hiveCatalog.isPresent()); @@ -271,25 +293,33 @@ public void testLoadCatalogHive() { @Test public void testLegacyLoadCustomCatalogWithHiveCatalogTypeSet() { String catalogName = "barCatalog"; - conf.set(InputFormatConfig.catalogPropertyConfigKey(catalogName, CatalogUtil.ICEBERG_CATALOG_TYPE), - CatalogUtil.ICEBERG_CATALOG_TYPE_HIVE); + conf.set( + InputFormatConfig.catalogPropertyConfigKey(catalogName, CatalogUtil.ICEBERG_CATALOG_TYPE), + CatalogUtil.ICEBERG_CATALOG_TYPE_HIVE); conf.set(InputFormatConfig.CATALOG_LOADER_CLASS, CustomHadoopCatalog.class.getName()); conf.set(InputFormatConfig.HADOOP_CATALOG_WAREHOUSE_LOCATION, "/tmp/mylocation"); - AssertHelpers.assertThrows("Should complain about both configs being set", IllegalArgumentException.class, - "both type and catalog-impl are set", () -> Catalogs.loadCatalog(conf, catalogName)); + AssertHelpers.assertThrows( + "Should complain about both configs being set", + IllegalArgumentException.class, + "both type and catalog-impl are set", + () -> Catalogs.loadCatalog(conf, catalogName)); } @Test public void testLoadCatalogHadoop() { String catalogName = "barCatalog"; - conf.set(InputFormatConfig.catalogPropertyConfigKey(catalogName, CatalogUtil.ICEBERG_CATALOG_TYPE), + conf.set( + InputFormatConfig.catalogPropertyConfigKey(catalogName, CatalogUtil.ICEBERG_CATALOG_TYPE), CatalogUtil.ICEBERG_CATALOG_TYPE_HADOOP); - conf.set(InputFormatConfig.catalogPropertyConfigKey(catalogName, CatalogProperties.WAREHOUSE_LOCATION), + conf.set( + InputFormatConfig.catalogPropertyConfigKey( + catalogName, CatalogProperties.WAREHOUSE_LOCATION), "/tmp/mylocation"); Optional hadoopCatalog = Catalogs.loadCatalog(conf, catalogName); Assert.assertTrue(hadoopCatalog.isPresent()); Assertions.assertThat(hadoopCatalog.get()).isInstanceOf(HadoopCatalog.class); - Assert.assertEquals("HadoopCatalog{name=barCatalog, location=/tmp/mylocation}", hadoopCatalog.get().toString()); + Assert.assertEquals( + "HadoopCatalog{name=barCatalog, location=/tmp/mylocation}", hadoopCatalog.get().toString()); Properties properties = new Properties(); properties.put(InputFormatConfig.CATALOG_NAME, catalogName); Assert.assertFalse(Catalogs.hiveCatalog(conf, properties)); @@ -298,13 +328,15 @@ public void testLoadCatalogHadoop() { @Test public void testLoadCatalogHadoopWithLegacyWarehouseLocation() { String catalogName = "barCatalog"; - conf.set(InputFormatConfig.catalogPropertyConfigKey(catalogName, CatalogUtil.ICEBERG_CATALOG_TYPE), + conf.set( + InputFormatConfig.catalogPropertyConfigKey(catalogName, CatalogUtil.ICEBERG_CATALOG_TYPE), CatalogUtil.ICEBERG_CATALOG_TYPE_HADOOP); conf.set(InputFormatConfig.HADOOP_CATALOG_WAREHOUSE_LOCATION, "/tmp/mylocation"); Optional hadoopCatalog = Catalogs.loadCatalog(conf, catalogName); Assert.assertTrue(hadoopCatalog.isPresent()); Assertions.assertThat(hadoopCatalog.get()).isInstanceOf(HadoopCatalog.class); - Assert.assertEquals("HadoopCatalog{name=barCatalog, location=/tmp/mylocation}", hadoopCatalog.get().toString()); + Assert.assertEquals( + "HadoopCatalog{name=barCatalog, location=/tmp/mylocation}", hadoopCatalog.get().toString()); Properties properties = new Properties(); properties.put(InputFormatConfig.CATALOG_NAME, 
catalogName); Assert.assertFalse(Catalogs.hiveCatalog(conf, properties)); @@ -313,9 +345,12 @@ public void testLoadCatalogHadoopWithLegacyWarehouseLocation() { @Test public void testLoadCatalogCustom() { String catalogName = "barCatalog"; - conf.set(InputFormatConfig.catalogPropertyConfigKey(catalogName, CatalogProperties.CATALOG_IMPL), + conf.set( + InputFormatConfig.catalogPropertyConfigKey(catalogName, CatalogProperties.CATALOG_IMPL), CustomHadoopCatalog.class.getName()); - conf.set(InputFormatConfig.catalogPropertyConfigKey(catalogName, CatalogProperties.WAREHOUSE_LOCATION), + conf.set( + InputFormatConfig.catalogPropertyConfigKey( + catalogName, CatalogProperties.WAREHOUSE_LOCATION), "/tmp/mylocation"); Optional customHadoopCatalog = Catalogs.loadCatalog(conf, catalogName); Assert.assertTrue(customHadoopCatalog.isPresent()); @@ -333,28 +368,32 @@ public void testLoadCatalogLocation() { @Test public void testLoadCatalogUnknown() { String catalogName = "barCatalog"; - conf.set(InputFormatConfig.catalogPropertyConfigKey(catalogName, CatalogUtil.ICEBERG_CATALOG_TYPE), "fooType"); + conf.set( + InputFormatConfig.catalogPropertyConfigKey(catalogName, CatalogUtil.ICEBERG_CATALOG_TYPE), + "fooType"); AssertHelpers.assertThrows( - "should complain about catalog not supported", UnsupportedOperationException.class, - "Unknown catalog type:", () -> Catalogs.loadCatalog(conf, catalogName)); + "should complain about catalog not supported", + UnsupportedOperationException.class, + "Unknown catalog type:", + () -> Catalogs.loadCatalog(conf, catalogName)); } public static class CustomHadoopCatalog extends HadoopCatalog { - public CustomHadoopCatalog() { - - } + public CustomHadoopCatalog() {} public CustomHadoopCatalog(Configuration conf, String warehouseLocation) { super(conf, warehouseLocation); } - } private void setCustomCatalogProperties(String catalogName, String warehouseLocation) { - conf.set(InputFormatConfig.catalogPropertyConfigKey(catalogName, CatalogProperties.WAREHOUSE_LOCATION), + conf.set( + InputFormatConfig.catalogPropertyConfigKey( + catalogName, CatalogProperties.WAREHOUSE_LOCATION), warehouseLocation); - conf.set(InputFormatConfig.catalogPropertyConfigKey(catalogName, CatalogProperties.CATALOG_IMPL), + conf.set( + InputFormatConfig.catalogPropertyConfigKey(catalogName, CatalogProperties.CATALOG_IMPL), CustomHadoopCatalog.class.getName()); conf.set(InputFormatConfig.CATALOG_NAME, catalogName); } diff --git a/mr/src/test/java/org/apache/iceberg/mr/TestHelper.java b/mr/src/test/java/org/apache/iceberg/mr/TestHelper.java index 363570750a88..8d877d4a3173 100644 --- a/mr/src/test/java/org/apache/iceberg/mr/TestHelper.java +++ b/mr/src/test/java/org/apache/iceberg/mr/TestHelper.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.mr; import java.io.IOException; @@ -53,8 +52,14 @@ public class TestHelper { private Table table; - public TestHelper(Configuration conf, Tables tables, String tableIdentifier, Schema schema, PartitionSpec spec, - FileFormat fileFormat, TemporaryFolder tmp) { + public TestHelper( + Configuration conf, + Tables tables, + String tableIdentifier, + Schema schema, + PartitionSpec spec, + FileFormat fileFormat, + TemporaryFolder tmp) { this.conf = conf; this.tables = tables; this.tableIdentifier = tableIdentifier; @@ -74,8 +79,11 @@ public Table table() { } public Map properties() { - return ImmutableMap.of(TableProperties.DEFAULT_FILE_FORMAT, fileFormat.name(), - TableProperties.ENGINE_HIVE_ENABLED, "true"); + return ImmutableMap.of( + TableProperties.DEFAULT_FILE_FORMAT, + fileFormat.name(), + TableProperties.ENGINE_HIVE_ENABLED, + "true"); } public Table createTable(Schema theSchema, PartitionSpec theSpec) { @@ -92,7 +100,6 @@ public Table createUnpartitionedTable() { return createTable(schema, PartitionSpec.unpartitioned()); } - public List generateRandomRecords(int num, long seed) { Preconditions.checkNotNull(table, "table not set"); return generateRandomRecords(table.schema(), num, seed); diff --git a/mr/src/test/java/org/apache/iceberg/mr/TestIcebergInputFormats.java b/mr/src/test/java/org/apache/iceberg/mr/TestIcebergInputFormats.java index 3e38c02d30cf..b8199927d052 100644 --- a/mr/src/test/java/org/apache/iceberg/mr/TestIcebergInputFormats.java +++ b/mr/src/test/java/org/apache/iceberg/mr/TestIcebergInputFormats.java @@ -16,9 +16,10 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.mr; +import static org.apache.iceberg.types.Types.NestedField.required; + import java.io.File; import java.io.IOException; import java.io.UncheckedIOException; @@ -69,29 +70,28 @@ import org.junit.runner.RunWith; import org.junit.runners.Parameterized; -import static org.apache.iceberg.types.Types.NestedField.required; - @RunWith(Parameterized.class) public class TestIcebergInputFormats { - public static final List> TESTED_INPUT_FORMATS = ImmutableList.of( + public static final List> TESTED_INPUT_FORMATS = + ImmutableList.of( TestInputFormat.newFactory("IcebergInputFormat", TestIcebergInputFormat::create), - TestInputFormat.newFactory("MapredIcebergInputFormat", TestMapredIcebergInputFormat::create)); + TestInputFormat.newFactory( + "MapredIcebergInputFormat", TestMapredIcebergInputFormat::create)); - private static final List TESTED_FILE_FORMATS = ImmutableList.of("avro", "orc", "parquet"); + private static final List TESTED_FILE_FORMATS = + ImmutableList.of("avro", "orc", "parquet"); - private static final Schema SCHEMA = new Schema( + private static final Schema SCHEMA = + new Schema( required(1, "data", Types.StringType.get()), required(2, "id", Types.LongType.get()), required(3, "date", Types.StringType.get())); - private static final PartitionSpec SPEC = PartitionSpec.builderFor(SCHEMA) - .identity("date") - .bucket("id", 1) - .build(); + private static final PartitionSpec SPEC = + PartitionSpec.builderFor(SCHEMA).identity("date").bucket("id", 1).build(); - @Rule - public TemporaryFolder temp = new TemporaryFolder(); + @Rule public TemporaryFolder temp = new TemporaryFolder(); // before variables private Configuration conf; @@ -130,7 +130,8 @@ public static Object[][] parameters() { return parameters; } - public TestIcebergInputFormats(TestInputFormat.Factory testInputFormat, String fileFormat) { + public 
TestIcebergInputFormats( + TestInputFormat.Factory testInputFormat, String fileFormat) { this.testInputFormat = testInputFormat; this.fileFormat = FileFormat.valueOf(fileFormat.toUpperCase(Locale.ENGLISH)); } @@ -163,7 +164,8 @@ public void testFilterExp() throws Exception { expectedRecords.get(1).set(2, "2020-03-20"); DataFile dataFile1 = helper.writeFile(Row.of("2020-03-20", 0), expectedRecords); - DataFile dataFile2 = helper.writeFile(Row.of("2020-03-21", 0), helper.generateRandomRecords(2, 0L)); + DataFile dataFile2 = + helper.writeFile(Row.of("2020-03-21", 0), helper.generateRandomRecords(2, 0L)); helper.appendToTable(dataFile1, dataFile2); builder.filter(Expressions.equal("date", "2020-03-20")); @@ -184,12 +186,12 @@ public void testResiduals() throws Exception { expectedRecords.add(writeRecords.get(0)); DataFile dataFile1 = helper.writeFile(Row.of("2020-03-20", 0), writeRecords); - DataFile dataFile2 = helper.writeFile(Row.of("2020-03-21", 0), helper.generateRandomRecords(2, 0L)); + DataFile dataFile2 = + helper.writeFile(Row.of("2020-03-21", 0), helper.generateRandomRecords(2, 0L)); helper.appendToTable(dataFile1, dataFile2); - builder.filter(Expressions.and( - Expressions.equal("date", "2020-03-20"), - Expressions.equal("id", 123))); + builder.filter( + Expressions.and(Expressions.equal("date", "2020-03-20"), Expressions.equal("id", 123))); testInputFormat.create(builder.conf()).validate(expectedRecords); // skip residual filtering @@ -207,21 +209,23 @@ public void testFailedResidualFiltering() throws Exception { helper.appendToTable(Row.of("2020-03-20", 0), expectedRecords); - builder.useHiveRows() - .filter(Expressions.and( - Expressions.equal("date", "2020-03-20"), - Expressions.equal("id", 0))); + builder + .useHiveRows() + .filter( + Expressions.and(Expressions.equal("date", "2020-03-20"), Expressions.equal("id", 0))); AssertHelpers.assertThrows( "Residuals are not evaluated today for Iceberg Generics In memory model of HIVE", - UnsupportedOperationException.class, "Filter expression ref(name=\"id\") == 0 is not completely satisfied.", + UnsupportedOperationException.class, + "Filter expression ref(name=\"id\") == 0 is not completely satisfied.", () -> testInputFormat.create(builder.conf())); builder.usePigTuples(); AssertHelpers.assertThrows( "Residuals are not evaluated today for Iceberg Generics In memory model of PIG", - UnsupportedOperationException.class, "Filter expression ref(name=\"id\") == 0 is not completely satisfied.", + UnsupportedOperationException.class, + "Filter expression ref(name=\"id\") == 0 is not completely satisfied.", () -> testInputFormat.create(builder.conf())); } @@ -240,15 +244,15 @@ public void testProjection() throws Exception { Assert.assertEquals(projection.asStruct(), outputRecords.get(0).struct()); } - private static final Schema LOG_SCHEMA = new Schema( + private static final Schema LOG_SCHEMA = + new Schema( Types.NestedField.optional(1, "id", Types.IntegerType.get()), Types.NestedField.optional(2, "date", Types.StringType.get()), Types.NestedField.optional(3, "level", Types.StringType.get()), - Types.NestedField.optional(4, "message", Types.StringType.get()) - ); + Types.NestedField.optional(4, "message", Types.StringType.get())); private static final PartitionSpec IDENTITY_PARTITION_SPEC = - PartitionSpec.builderFor(LOG_SCHEMA).identity("date").identity("level").build(); + PartitionSpec.builderFor(LOG_SCHEMA).identity("date").identity("level").build(); @Test public void testIdentityPartitionProjections() throws Exception { @@ -260,7 
+264,8 @@ public void testIdentityPartitionProjections() throws Exception { for (Record record : inputRecords) { record.set(1, "2020-03-2" + idx); record.set(2, idx.toString()); - append.appendFile(helper.writeFile(Row.of("2020-03-2" + idx, idx.toString()), ImmutableList.of(record))); + append.appendFile( + helper.writeFile(Row.of("2020-03-2" + idx, idx.toString()), ImmutableList.of(record))); idx += 1; } append.commit(); @@ -298,7 +303,8 @@ private static Schema withColumns(String... names) { return TypeUtil.select(LOG_SCHEMA, projectedIds); } - private void validateIdentityPartitionProjections(Schema projectedSchema, List inputRecords) { + private void validateIdentityPartitionProjections( + Schema projectedSchema, List inputRecords) { builder.project(projectedSchema); List actualRecords = testInputFormat.create(builder.conf()).getRecords(); @@ -307,11 +313,14 @@ private void validateIdentityPartitionProjections(Schema projectedSchema, List { private final List splits; @@ -396,10 +410,12 @@ public void validate(List expected) { public interface Factory { String name(); + TestInputFormat create(Configuration conf); } - public static Factory newFactory(String name, Function> function) { + public static Factory newFactory( + String name, Function> function) { return new Factory() { @Override public String name() { @@ -437,8 +453,8 @@ private static TestMapredIcebergInputFormat create(Configuration conf) { for (org.apache.hadoop.mapred.InputSplit split : splits) { iceSplits.add((IcebergSplit) split); - org.apache.hadoop.mapred.RecordReader> - reader = inputFormat.getRecordReader(split, job, Reporter.NULL); + org.apache.hadoop.mapred.RecordReader> reader = + inputFormat.getRecordReader(split, job, Reporter.NULL); try { Container container = reader.createValue(); diff --git a/mr/src/test/java/org/apache/iceberg/mr/TestInputFormatReaderDeletes.java b/mr/src/test/java/org/apache/iceberg/mr/TestInputFormatReaderDeletes.java index 2ba4e50e8aa1..45b74f897cb7 100644 --- a/mr/src/test/java/org/apache/iceberg/mr/TestInputFormatReaderDeletes.java +++ b/mr/src/test/java/org/apache/iceberg/mr/TestInputFormatReaderDeletes.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.mr; import java.io.File; @@ -53,12 +52,12 @@ public class TestInputFormatReaderDeletes extends DeleteReadTests { @Parameterized.Parameters(name = "inputFormat = {0}, fileFormat={1}") public static Object[][] parameters() { return new Object[][] { - { "IcebergInputFormat", FileFormat.PARQUET }, - { "IcebergInputFormat", FileFormat.AVRO }, - { "IcebergInputFormat", FileFormat.ORC }, - { "MapredIcebergInputFormat", FileFormat.PARQUET }, - { "MapredIcebergInputFormat", FileFormat.AVRO }, - { "MapredIcebergInputFormat", FileFormat.ORC }, + {"IcebergInputFormat", FileFormat.PARQUET}, + {"IcebergInputFormat", FileFormat.AVRO}, + {"IcebergInputFormat", FileFormat.ORC}, + {"MapredIcebergInputFormat", FileFormat.PARQUET}, + {"MapredIcebergInputFormat", FileFormat.AVRO}, + {"MapredIcebergInputFormat", FileFormat.ORC}, }; } @@ -97,17 +96,20 @@ protected void dropTable(String name) { @Override public StructLikeSet rowSet(String name, Table table, String... 
columns) { - InputFormatConfig.ConfigBuilder builder = new InputFormatConfig.ConfigBuilder(conf).readFrom(table.location()); + InputFormatConfig.ConfigBuilder builder = + new InputFormatConfig.ConfigBuilder(conf).readFrom(table.location()); Schema projected = table.schema().select(columns); StructLikeSet set = StructLikeSet.create(projected.asStruct()); - set.addAll(TestIcebergInputFormats.TESTED_INPUT_FORMATS.stream() - .filter(recordFactory -> recordFactory.name().equals(inputFormat)) - .map(recordFactory -> recordFactory.create(builder.project(projected).conf()).getRecords()) - .flatMap(List::stream) - .map(record -> new InternalRecordWrapper(projected.asStruct()).wrap(record)) - .collect(Collectors.toList()) - ); + set.addAll( + TestIcebergInputFormats.TESTED_INPUT_FORMATS.stream() + .filter(recordFactory -> recordFactory.name().equals(inputFormat)) + .map( + recordFactory -> + recordFactory.create(builder.project(projected).conf()).getRecords()) + .flatMap(List::stream) + .map(record -> new InternalRecordWrapper(projected.asStruct()).wrap(record)) + .collect(Collectors.toList())); return set; } diff --git a/mr/src/test/java/org/apache/iceberg/mr/hive/HiveIcebergStorageHandlerTestUtils.java b/mr/src/test/java/org/apache/iceberg/mr/hive/HiveIcebergStorageHandlerTestUtils.java index 649998fa718c..f20b28a35fd2 100644 --- a/mr/src/test/java/org/apache/iceberg/mr/hive/HiveIcebergStorageHandlerTestUtils.java +++ b/mr/src/test/java/org/apache/iceberg/mr/hive/HiveIcebergStorageHandlerTestUtils.java @@ -16,9 +16,10 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.mr.hive; +import static org.apache.iceberg.types.Types.NestedField.optional; + import java.io.IOException; import java.util.Collections; import java.util.List; @@ -32,25 +33,24 @@ import org.apache.orc.OrcConf; import org.junit.rules.TemporaryFolder; -import static org.apache.iceberg.types.Types.NestedField.optional; - public class HiveIcebergStorageHandlerTestUtils { static final FileFormat[] FILE_FORMATS = new FileFormat[] {FileFormat.AVRO, FileFormat.ORC, FileFormat.PARQUET}; - static final Schema CUSTOMER_SCHEMA = new Schema( + static final Schema CUSTOMER_SCHEMA = + new Schema( optional(1, "customer_id", Types.LongType.get()), optional(2, "first_name", Types.StringType.get(), "This is first name"), - optional(3, "last_name", Types.StringType.get(), "This is last name") - ); + optional(3, "last_name", Types.StringType.get(), "This is last name")); - static final Schema CUSTOMER_SCHEMA_WITH_UPPERCASE = new Schema( + static final Schema CUSTOMER_SCHEMA_WITH_UPPERCASE = + new Schema( optional(1, "CustomER_Id", Types.LongType.get()), optional(2, "First_name", Types.StringType.get()), - optional(3, "Last_name", Types.StringType.get()) - ); + optional(3, "Last_name", Types.StringType.get())); - static final List CUSTOMER_RECORDS = TestHelper.RecordsBuilder.newInstance(CUSTOMER_SCHEMA) + static final List CUSTOMER_RECORDS = + TestHelper.RecordsBuilder.newInstance(CUSTOMER_SCHEMA) .add(0L, "Alice", "Brown") .add(1L, "Bob", "Green") .add(2L, "Trudy", "Pink") @@ -69,24 +69,31 @@ static TestHiveShell shell(Map configs) { shell.setHiveConfValue("hive.notification.event.poll.interval", "-1"); shell.setHiveConfValue("hive.tez.exec.print.summary", "true"); configs.forEach((k, v) -> shell.setHiveConfValue(k, v)); - // We would like to make sure that ORC reading overrides this config, so reading Iceberg tables could work in + // We would like to make sure that ORC reading overrides 
this config, so reading Iceberg tables + // could work in // systems (like Hive 3.2 and higher) where this value is set to true explicitly. shell.setHiveConfValue(OrcConf.FORCE_POSITIONAL_EVOLUTION.getHiveConfName(), "true"); shell.start(); return shell; } - static TestTables testTables(TestHiveShell shell, TestTables.TestTableType testTableType, TemporaryFolder temp) - throws IOException { + static TestTables testTables( + TestHiveShell shell, TestTables.TestTableType testTableType, TemporaryFolder temp) + throws IOException { return testTables(shell, testTableType, temp, Catalogs.ICEBERG_DEFAULT_CATALOG_NAME); } - static TestTables testTables(TestHiveShell shell, TestTables.TestTableType testTableType, TemporaryFolder temp, - String catalogName) throws IOException { + static TestTables testTables( + TestHiveShell shell, + TestTables.TestTableType testTableType, + TemporaryFolder temp, + String catalogName) + throws IOException { return testTableType.instance(shell.metastore().hiveConf(), temp, catalogName); } - static void init(TestHiveShell shell, TestTables testTables, TemporaryFolder temp, String engine) { + static void init( + TestHiveShell shell, TestTables testTables, TemporaryFolder temp, String engine) { shell.openSession(); for (Map.Entry property : testTables.properties().entrySet()) { @@ -101,7 +108,8 @@ static void init(TestHiveShell shell, TestTables testTables, TemporaryFolder tem static void close(TestHiveShell shell) throws Exception { shell.closeSession(); shell.metastore().reset(); - // HiveServer2 thread pools are using thread local Hive -> HMSClient objects. These are not cleaned up when the + // HiveServer2 thread pools are using thread local Hive -> HMSClient objects. These are not + // cleaned up when the // HiveServer2 is stopped. Only Finalizer closes the HMS connections. System.gc(); } diff --git a/mr/src/test/java/org/apache/iceberg/mr/hive/HiveIcebergTestUtils.java b/mr/src/test/java/org/apache/iceberg/mr/hive/HiveIcebergTestUtils.java index 62e358ce7fb9..de5189f7c3ad 100644 --- a/mr/src/test/java/org/apache/iceberg/mr/hive/HiveIcebergTestUtils.java +++ b/mr/src/test/java/org/apache/iceberg/mr/hive/HiveIcebergTestUtils.java @@ -16,9 +16,10 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.mr.hive; +import static org.apache.iceberg.types.Types.NestedField.optional; + import java.io.File; import java.io.IOException; import java.math.BigDecimal; @@ -64,32 +65,43 @@ import org.apache.iceberg.util.ByteBuffers; import org.junit.Assert; -import static org.apache.iceberg.types.Types.NestedField.optional; - public class HiveIcebergTestUtils { // TODO: Can this be a constant all around the Iceberg tests? - public static final Schema FULL_SCHEMA = new Schema( - // TODO: Create tests for field case insensitivity. 
- optional(1, "boolean_type", Types.BooleanType.get()), - optional(2, "integer_type", Types.IntegerType.get()), - optional(3, "long_type", Types.LongType.get()), - optional(4, "float_type", Types.FloatType.get()), - optional(5, "double_type", Types.DoubleType.get()), - optional(6, "date_type", Types.DateType.get()), - optional(7, "tstz", Types.TimestampType.withZone()), - optional(8, "ts", Types.TimestampType.withoutZone()), - optional(9, "string_type", Types.StringType.get()), - optional(10, "fixed_type", Types.FixedType.ofLength(3)), - optional(11, "binary_type", Types.BinaryType.get()), - optional(12, "decimal_type", Types.DecimalType.of(38, 10)), - optional(13, "time_type", Types.TimeType.get()), - optional(14, "uuid_type", Types.UUIDType.get())); + public static final Schema FULL_SCHEMA = + new Schema( + // TODO: Create tests for field case insensitivity. + optional(1, "boolean_type", Types.BooleanType.get()), + optional(2, "integer_type", Types.IntegerType.get()), + optional(3, "long_type", Types.LongType.get()), + optional(4, "float_type", Types.FloatType.get()), + optional(5, "double_type", Types.DoubleType.get()), + optional(6, "date_type", Types.DateType.get()), + optional(7, "tstz", Types.TimestampType.withZone()), + optional(8, "ts", Types.TimestampType.withoutZone()), + optional(9, "string_type", Types.StringType.get()), + optional(10, "fixed_type", Types.FixedType.ofLength(3)), + optional(11, "binary_type", Types.BinaryType.get()), + optional(12, "decimal_type", Types.DecimalType.of(38, 10)), + optional(13, "time_type", Types.TimeType.get()), + optional(14, "uuid_type", Types.UUIDType.get())); public static final StandardStructObjectInspector FULL_SCHEMA_OBJECT_INSPECTOR = ObjectInspectorFactory.getStandardStructObjectInspector( - Arrays.asList("boolean_type", "integer_type", "long_type", "float_type", "double_type", - "date_type", "tstz", "ts", "string_type", "fixed_type", "binary_type", "decimal_type", - "time_type", "uuid_type"), + Arrays.asList( + "boolean_type", + "integer_type", + "long_type", + "float_type", + "double_type", + "date_type", + "tstz", + "ts", + "string_type", + "fixed_type", + "binary_type", + "decimal_type", + "time_type", + "uuid_type"), Arrays.asList( PrimitiveObjectInspectorFactory.writableBooleanObjectInspector, PrimitiveObjectInspectorFactory.writableIntObjectInspector, @@ -104,8 +116,7 @@ public class HiveIcebergTestUtils { PrimitiveObjectInspectorFactory.writableBinaryObjectInspector, PrimitiveObjectInspectorFactory.writableHiveDecimalObjectInspector, PrimitiveObjectInspectorFactory.writableStringObjectInspector, - PrimitiveObjectInspectorFactory.writableStringObjectInspector - )); + PrimitiveObjectInspectorFactory.writableStringObjectInspector)); private HiveIcebergTestUtils() { // Empty constructor for the utility class @@ -113,6 +124,7 @@ private HiveIcebergTestUtils() { /** * Generates a test record where every field has a value. 
+ * * @return Record with every field set */ public static Record getTestRecord() { @@ -127,8 +139,8 @@ public static Record getTestRecord() { record.set(6, OffsetDateTime.of(2017, 11, 22, 11, 30, 7, 0, ZoneOffset.ofHours(2))); record.set(7, LocalDateTime.of(2019, 2, 22, 9, 44, 54)); record.set(8, "kilenc"); - record.set(9, new byte[]{0, 1, 2}); - record.set(10, ByteBuffer.wrap(new byte[]{0, 1, 2, 3})); + record.set(9, new byte[] {0, 1, 2}); + record.set(10, ByteBuffer.wrap(new byte[] {0, 1, 2, 3})); record.set(11, new BigDecimal("0.0000000013")); record.set(12, LocalTime.of(11, 33)); record.set(13, UUID.fromString("73689599-d7fc-4dfb-b94e-106ff20284a5")); @@ -138,6 +150,7 @@ public static Record getTestRecord() { /** * Record with every field set to null. + * * @return Empty record */ public static Record getNullTestRecord() { @@ -152,6 +165,7 @@ public static Record getNullTestRecord() { /** * Hive values for the test record. + * * @param record The original Iceberg record * @return The Hive 'record' containing the same values */ @@ -170,28 +184,33 @@ public static List valuesForTestRecord(Record record) { new BytesWritable(ByteBuffers.toByteArray(record.get(10, ByteBuffer.class))), new HiveDecimalWritable(HiveDecimal.create(record.get(11, BigDecimal.class))), new Text(record.get(12, LocalTime.class).toString()), - new Text(record.get(13, UUID.class).toString()) - ); + new Text(record.get(13, UUID.class).toString())); } /** * Converts a list of Object arrays to a list of Iceberg records. + * * @param schema The schema of the Iceberg record * @param rows The data of the records * @return The list of the converted records */ public static List valueForRow(Schema schema, List rows) { - return rows.stream().map(row -> { - Record record = GenericRecord.create(schema); - for (int i = 0; i < row.length; ++i) { - record.set(i, row[i]); - } - return record; - }).collect(Collectors.toList()); + return rows.stream() + .map( + row -> { + Record record = GenericRecord.create(schema); + for (int i = 0; i < row.length; ++i) { + record.set(i, row[i]); + } + return record; + }) + .collect(Collectors.toList()); } /** - * Check if 2 Iceberg records are the same or not. Compares OffsetDateTimes only by the Intant they represent. + * Check if 2 Iceberg records are the same or not. Compares OffsetDateTimes only by the Intant + * they represent. + * * @param expected The expected record * @param actual The actual record */ @@ -199,7 +218,8 @@ public static void assertEquals(Record expected, Record actual) { for (int i = 0; i < expected.size(); ++i) { if (expected.get(i) instanceof OffsetDateTime) { // For OffsetDateTime we just compare the actual instant - Assert.assertEquals(((OffsetDateTime) expected.get(i)).toInstant(), + Assert.assertEquals( + ((OffsetDateTime) expected.get(i)).toInstant(), ((OffsetDateTime) actual.get(i)).toInstant()); } else if (expected.get(i) instanceof byte[]) { Assert.assertArrayEquals((byte[]) expected.get(i), (byte[]) actual.get(i)); @@ -210,14 +230,16 @@ public static void assertEquals(Record expected, Record actual) { } /** - * Validates whether the table contains the expected records. The results should be sorted by a unique key so we do - * not end up with flaky tests. + * Validates whether the table contains the expected records. The results should be sorted by a + * unique key so we do not end up with flaky tests. 
+ * * @param table The table we should read the records from * @param expected The expected list of Records * @param sortBy The column position by which we will sort * @throws IOException Exceptions when reading the table data */ - public static void validateData(Table table, List expected, int sortBy) throws IOException { + public static void validateData(Table table, List expected, int sortBy) + throws IOException { // Refresh the table, so we get the new data as well table.refresh(); List records = Lists.newArrayListWithExpectedSize(expected.size()); @@ -229,8 +251,9 @@ public static void validateData(Table table, List expected, int sortBy) } /** - * Validates whether the 2 sets of records are the same. The results should be sorted by a unique key so we do - * not end up with flaky tests. + * Validates whether the 2 sets of records are the same. The results should be sorted by a unique + * key so we do not end up with flaky tests. + * * @param expected The expected list of Records * @param actual The actual list of Records * @param sortBy The column position by which we will sort @@ -251,19 +274,23 @@ public static void validateData(List expected, List actual, int /** * Validates the number of files under a {@link Table} generated by a specific queryId and jobId. * Validates that the commit files are removed. + * * @param table The table we are checking * @param conf The configuration used for generating the job location * @param jobId The jobId which generated the files * @param dataFileNum The expected number of data files (TABLE_LOCATION/data/*) */ - public static void validateFiles(Table table, Configuration conf, JobID jobId, int dataFileNum) throws IOException { - List dataFiles = Files.walk(Paths.get(table.location() + "/data")) - .filter(Files::isRegularFile) - .filter(path -> !path.getFileName().toString().startsWith(".")) - .collect(Collectors.toList()); + public static void validateFiles(Table table, Configuration conf, JobID jobId, int dataFileNum) + throws IOException { + List dataFiles = + Files.walk(Paths.get(table.location() + "/data")) + .filter(Files::isRegularFile) + .filter(path -> !path.getFileName().toString().startsWith(".")) + .collect(Collectors.toList()); Assert.assertEquals(dataFileNum, dataFiles.size()); Assert.assertFalse( - new File(HiveIcebergOutputCommitter.generateJobLocation(table.location(), conf, jobId)).exists()); + new File(HiveIcebergOutputCommitter.generateJobLocation(table.location(), conf, jobId)) + .exists()); } } diff --git a/mr/src/test/java/org/apache/iceberg/mr/hive/TestDeserializer.java b/mr/src/test/java/org/apache/iceberg/mr/hive/TestDeserializer.java index adad32aa48a0..464f7ccc0a0c 100644 --- a/mr/src/test/java/org/apache/iceberg/mr/hive/TestDeserializer.java +++ b/mr/src/test/java/org/apache/iceberg/mr/hive/TestDeserializer.java @@ -16,9 +16,10 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.mr.hive; +import static org.apache.iceberg.types.Types.NestedField.optional; + import java.util.Arrays; import java.util.Collections; import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory; @@ -38,21 +39,18 @@ import org.junit.Assume; import org.junit.Test; -import static org.apache.iceberg.types.Types.NestedField.optional; - public class TestDeserializer { - private static final Schema CUSTOMER_SCHEMA = new Schema( - optional(1, "customer_id", Types.LongType.get()), - optional(2, "first_name", Types.StringType.get()) - ); + private static final Schema CUSTOMER_SCHEMA = + new Schema( + optional(1, "customer_id", Types.LongType.get()), + optional(2, "first_name", Types.StringType.get())); private static final StandardStructObjectInspector CUSTOMER_OBJECT_INSPECTOR = ObjectInspectorFactory.getStandardStructObjectInspector( Arrays.asList("customer_id", "first_name"), Arrays.asList( PrimitiveObjectInspectorFactory.writableLongObjectInspector, - PrimitiveObjectInspectorFactory.writableStringObjectInspector - )); + PrimitiveObjectInspectorFactory.writableStringObjectInspector)); @Test public void testSchemaDeserialize() { @@ -61,71 +59,72 @@ public void testSchemaDeserialize() { Arrays.asList("0:col1", "1:col2"), Arrays.asList( PrimitiveObjectInspectorFactory.writableLongObjectInspector, - PrimitiveObjectInspectorFactory.writableStringObjectInspector - )); + PrimitiveObjectInspectorFactory.writableStringObjectInspector)); - Deserializer deserializer = new Deserializer.Builder() - .schema(CUSTOMER_SCHEMA) - .writerInspector((StructObjectInspector) IcebergObjectInspector.create(CUSTOMER_SCHEMA)) - .sourceInspector(schemaObjectInspector) - .build(); + Deserializer deserializer = + new Deserializer.Builder() + .schema(CUSTOMER_SCHEMA) + .writerInspector((StructObjectInspector) IcebergObjectInspector.create(CUSTOMER_SCHEMA)) + .sourceInspector(schemaObjectInspector) + .build(); Record expected = GenericRecord.create(CUSTOMER_SCHEMA); expected.set(0, 1L); expected.set(1, "Bob"); - Record actual = deserializer.deserialize(new Object[] { new LongWritable(1L), new Text("Bob") }); + Record actual = deserializer.deserialize(new Object[] {new LongWritable(1L), new Text("Bob")}); Assert.assertEquals(expected, actual); } @Test public void testStructDeserialize() { - Deserializer deserializer = new Deserializer.Builder() - .schema(CUSTOMER_SCHEMA) - .writerInspector((StructObjectInspector) IcebergObjectInspector.create(CUSTOMER_SCHEMA)) - .sourceInspector(CUSTOMER_OBJECT_INSPECTOR) - .build(); + Deserializer deserializer = + new Deserializer.Builder() + .schema(CUSTOMER_SCHEMA) + .writerInspector((StructObjectInspector) IcebergObjectInspector.create(CUSTOMER_SCHEMA)) + .sourceInspector(CUSTOMER_OBJECT_INSPECTOR) + .build(); Record expected = GenericRecord.create(CUSTOMER_SCHEMA); expected.set(0, 1L); expected.set(1, "Bob"); - Record actual = deserializer.deserialize(new Object[] { new LongWritable(1L), new Text("Bob") }); + Record actual = deserializer.deserialize(new Object[] {new LongWritable(1L), new Text("Bob")}); Assert.assertEquals(expected, actual); } @Test public void testMapDeserialize() { - Schema schema = new Schema( - optional(1, "map_type", Types.MapType.ofOptional(2, 3, - Types.LongType.get(), - Types.StringType.get() - )) - ); - - StructObjectInspector inspector = ObjectInspectorFactory.getStandardStructObjectInspector( - Arrays.asList("map_type"), - Arrays.asList( - ObjectInspectorFactory.getStandardMapObjectInspector( - 
PrimitiveObjectInspectorFactory.writableLongObjectInspector, - PrimitiveObjectInspectorFactory.writableStringObjectInspector - ) - )); + Schema schema = + new Schema( + optional( + 1, + "map_type", + Types.MapType.ofOptional(2, 3, Types.LongType.get(), Types.StringType.get()))); + + StructObjectInspector inspector = + ObjectInspectorFactory.getStandardStructObjectInspector( + Arrays.asList("map_type"), + Arrays.asList( + ObjectInspectorFactory.getStandardMapObjectInspector( + PrimitiveObjectInspectorFactory.writableLongObjectInspector, + PrimitiveObjectInspectorFactory.writableStringObjectInspector))); - Deserializer deserializer = new Deserializer.Builder() - .schema(schema) - .writerInspector((StructObjectInspector) IcebergObjectInspector.create(schema)) - .sourceInspector(inspector) - .build(); + Deserializer deserializer = + new Deserializer.Builder() + .schema(schema) + .writerInspector((StructObjectInspector) IcebergObjectInspector.create(schema)) + .sourceInspector(inspector) + .build(); Record expected = GenericRecord.create(schema); expected.set(0, Collections.singletonMap(1L, "Taylor")); MapWritable map = new MapWritable(); map.put(new LongWritable(1L), new Text("Taylor")); - Object[] data = new Object[] { map }; + Object[] data = new Object[] {map}; Record actual = deserializer.deserialize(data); Assert.assertEquals(expected, actual); @@ -133,27 +132,27 @@ public void testMapDeserialize() { @Test public void testListDeserialize() { - Schema schema = new Schema( - optional(1, "list_type", Types.ListType.ofOptional(2, Types.LongType.get())) - ); - - StructObjectInspector inspector = ObjectInspectorFactory.getStandardStructObjectInspector( - Arrays.asList("list_type"), - Arrays.asList( - ObjectInspectorFactory.getStandardListObjectInspector( - PrimitiveObjectInspectorFactory.writableLongObjectInspector) - )); - - Deserializer deserializer = new Deserializer.Builder() - .schema(schema) - .writerInspector((StructObjectInspector) IcebergObjectInspector.create(schema)) - .sourceInspector(inspector) - .build(); + Schema schema = + new Schema(optional(1, "list_type", Types.ListType.ofOptional(2, Types.LongType.get()))); + + StructObjectInspector inspector = + ObjectInspectorFactory.getStandardStructObjectInspector( + Arrays.asList("list_type"), + Arrays.asList( + ObjectInspectorFactory.getStandardListObjectInspector( + PrimitiveObjectInspectorFactory.writableLongObjectInspector))); + + Deserializer deserializer = + new Deserializer.Builder() + .schema(schema) + .writerInspector((StructObjectInspector) IcebergObjectInspector.create(schema)) + .sourceInspector(inspector) + .build(); Record expected = GenericRecord.create(schema); expected.set(0, Collections.singletonList(1L)); - Object[] data = new Object[] { new Object[] { new LongWritable(1L) } }; + Object[] data = new Object[] {new Object[] {new LongWritable(1L)}}; Record actual = deserializer.deserialize(data); Assert.assertEquals(expected, actual); @@ -161,13 +160,17 @@ public void testListDeserialize() { @Test public void testDeserializeEverySupportedType() { - Assume.assumeFalse("No test yet for Hive3 (Date/Timestamp creation)", MetastoreUtil.hive3PresentOnClasspath()); - - Deserializer deserializer = new Deserializer.Builder() - .schema(HiveIcebergTestUtils.FULL_SCHEMA) - .writerInspector((StructObjectInspector) IcebergObjectInspector.create(HiveIcebergTestUtils.FULL_SCHEMA)) - .sourceInspector(HiveIcebergTestUtils.FULL_SCHEMA_OBJECT_INSPECTOR) - .build(); + Assume.assumeFalse( + "No test yet for Hive3 (Date/Timestamp creation)", 
MetastoreUtil.hive3PresentOnClasspath()); + + Deserializer deserializer = + new Deserializer.Builder() + .schema(HiveIcebergTestUtils.FULL_SCHEMA) + .writerInspector( + (StructObjectInspector) + IcebergObjectInspector.create(HiveIcebergTestUtils.FULL_SCHEMA)) + .sourceInspector(HiveIcebergTestUtils.FULL_SCHEMA_OBJECT_INSPECTOR) + .build(); Record expected = HiveIcebergTestUtils.getTestRecord(); Record actual = deserializer.deserialize(HiveIcebergTestUtils.valuesForTestRecord(expected)); @@ -177,11 +180,14 @@ public void testDeserializeEverySupportedType() { @Test public void testNullDeserialize() { - Deserializer deserializer = new Deserializer.Builder() - .schema(HiveIcebergTestUtils.FULL_SCHEMA) - .writerInspector((StructObjectInspector) IcebergObjectInspector.create(HiveIcebergTestUtils.FULL_SCHEMA)) - .sourceInspector(HiveIcebergTestUtils.FULL_SCHEMA_OBJECT_INSPECTOR) - .build(); + Deserializer deserializer = + new Deserializer.Builder() + .schema(HiveIcebergTestUtils.FULL_SCHEMA) + .writerInspector( + (StructObjectInspector) + IcebergObjectInspector.create(HiveIcebergTestUtils.FULL_SCHEMA)) + .sourceInspector(HiveIcebergTestUtils.FULL_SCHEMA_OBJECT_INSPECTOR) + .build(); Record expected = HiveIcebergTestUtils.getNullTestRecord(); diff --git a/mr/src/test/java/org/apache/iceberg/mr/hive/TestHiveIcebergFilterFactory.java b/mr/src/test/java/org/apache/iceberg/mr/hive/TestHiveIcebergFilterFactory.java index dc48501e3084..b31408d20a60 100644 --- a/mr/src/test/java/org/apache/iceberg/mr/hive/TestHiveIcebergFilterFactory.java +++ b/mr/src/test/java/org/apache/iceberg/mr/hive/TestHiveIcebergFilterFactory.java @@ -16,9 +16,10 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.mr.hive; +import static org.junit.Assert.assertEquals; + import java.math.BigDecimal; import java.sql.Date; import java.sql.Timestamp; @@ -37,17 +38,17 @@ import org.apache.iceberg.util.DateTimeUtil; import org.junit.Test; -import static org.junit.Assert.assertEquals; - public class TestHiveIcebergFilterFactory { @Test public void testEqualsOperand() { SearchArgument.Builder builder = SearchArgumentFactory.newBuilder(); - SearchArgument arg = builder.startAnd().equals("salary", PredicateLeaf.Type.LONG, 3000L).end().build(); + SearchArgument arg = + builder.startAnd().equals("salary", PredicateLeaf.Type.LONG, 3000L).end().build(); UnboundPredicate expected = Expressions.equal("salary", 3000L); - UnboundPredicate actual = (UnboundPredicate) HiveIcebergFilterFactory.generateFilterExpression(arg); + UnboundPredicate actual = + (UnboundPredicate) HiveIcebergFilterFactory.generateFilterExpression(arg); assertPredicatesMatch(expected, actual); } @@ -55,10 +56,12 @@ public void testEqualsOperand() { @Test public void testEqualsOperandRewrite() { SearchArgument.Builder builder = SearchArgumentFactory.newBuilder(); - SearchArgument arg = builder.startAnd().equals("float", PredicateLeaf.Type.FLOAT, Double.NaN).end().build(); + SearchArgument arg = + builder.startAnd().equals("float", PredicateLeaf.Type.FLOAT, Double.NaN).end().build(); UnboundPredicate expected = Expressions.isNaN("float"); - UnboundPredicate actual = (UnboundPredicate) HiveIcebergFilterFactory.generateFilterExpression(arg); + UnboundPredicate actual = + (UnboundPredicate) HiveIcebergFilterFactory.generateFilterExpression(arg); assertPredicatesMatch(expected, actual); } @@ -66,7 +69,8 @@ public void testEqualsOperandRewrite() { @Test public void testNotEqualsOperand() { SearchArgument.Builder builder = 
SearchArgumentFactory.newBuilder(); - SearchArgument arg = builder.startNot().equals("salary", PredicateLeaf.Type.LONG, 3000L).end().build(); + SearchArgument arg = + builder.startNot().equals("salary", PredicateLeaf.Type.LONG, 3000L).end().build(); Not expected = (Not) Expressions.not(Expressions.equal("salary", 3000L)); Not actual = (Not) HiveIcebergFilterFactory.generateFilterExpression(arg); @@ -83,10 +87,12 @@ public void testNotEqualsOperand() { @Test public void testLessThanOperand() { SearchArgument.Builder builder = SearchArgumentFactory.newBuilder(); - SearchArgument arg = builder.startAnd().lessThan("salary", PredicateLeaf.Type.LONG, 3000L).end().build(); + SearchArgument arg = + builder.startAnd().lessThan("salary", PredicateLeaf.Type.LONG, 3000L).end().build(); UnboundPredicate expected = Expressions.lessThan("salary", 3000L); - UnboundPredicate actual = (UnboundPredicate) HiveIcebergFilterFactory.generateFilterExpression(arg); + UnboundPredicate actual = + (UnboundPredicate) HiveIcebergFilterFactory.generateFilterExpression(arg); assertEquals(actual.op(), expected.op()); assertEquals(actual.literal(), expected.literal()); @@ -96,10 +102,12 @@ public void testLessThanOperand() { @Test public void testLessThanEqualsOperand() { SearchArgument.Builder builder = SearchArgumentFactory.newBuilder(); - SearchArgument arg = builder.startAnd().lessThanEquals("salary", PredicateLeaf.Type.LONG, 3000L).end().build(); + SearchArgument arg = + builder.startAnd().lessThanEquals("salary", PredicateLeaf.Type.LONG, 3000L).end().build(); UnboundPredicate expected = Expressions.lessThanOrEqual("salary", 3000L); - UnboundPredicate actual = (UnboundPredicate) HiveIcebergFilterFactory.generateFilterExpression(arg); + UnboundPredicate actual = + (UnboundPredicate) HiveIcebergFilterFactory.generateFilterExpression(arg); assertPredicatesMatch(expected, actual); } @@ -107,10 +115,12 @@ public void testLessThanEqualsOperand() { @Test public void testInOperand() { SearchArgument.Builder builder = SearchArgumentFactory.newBuilder(); - SearchArgument arg = builder.startAnd().in("salary", PredicateLeaf.Type.LONG, 3000L, 4000L).end().build(); + SearchArgument arg = + builder.startAnd().in("salary", PredicateLeaf.Type.LONG, 3000L, 4000L).end().build(); UnboundPredicate expected = Expressions.in("salary", 3000L, 4000L); - UnboundPredicate actual = (UnboundPredicate) HiveIcebergFilterFactory.generateFilterExpression(arg); + UnboundPredicate actual = + (UnboundPredicate) HiveIcebergFilterFactory.generateFilterExpression(arg); assertEquals(actual.op(), expected.op()); assertEquals(actual.literals(), expected.literals()); @@ -120,12 +130,14 @@ public void testInOperand() { @Test public void testBetweenOperand() { SearchArgument.Builder builder = SearchArgumentFactory.newBuilder(); - SearchArgument arg = builder - .startAnd() - .between("salary", PredicateLeaf.Type.LONG, 3000L, 4000L).end().build(); - - And expected = (And) Expressions.and(Expressions.greaterThanOrEqual("salary", 3000L), - Expressions.lessThanOrEqual("salary", 3000L)); + SearchArgument arg = + builder.startAnd().between("salary", PredicateLeaf.Type.LONG, 3000L, 4000L).end().build(); + + And expected = + (And) + Expressions.and( + Expressions.greaterThanOrEqual("salary", 3000L), + Expressions.lessThanOrEqual("salary", 3000L)); And actual = (And) HiveIcebergFilterFactory.generateFilterExpression(arg); assertEquals(actual.op(), expected.op()); @@ -139,7 +151,8 @@ public void testIsNullOperand() { SearchArgument arg = builder.startAnd().isNull("salary", 
PredicateLeaf.Type.LONG).end().build(); UnboundPredicate expected = Expressions.isNull("salary"); - UnboundPredicate actual = (UnboundPredicate) HiveIcebergFilterFactory.generateFilterExpression(arg); + UnboundPredicate actual = + (UnboundPredicate) HiveIcebergFilterFactory.generateFilterExpression(arg); assertEquals(actual.op(), expected.op()); assertEquals(actual.ref().name(), expected.ref().name()); @@ -148,14 +161,17 @@ public void testIsNullOperand() { @Test public void testAndOperand() { SearchArgument.Builder builder = SearchArgumentFactory.newBuilder(); - SearchArgument arg = builder - .startAnd() - .equals("salary", PredicateLeaf.Type.LONG, 3000L) - .equals("salary", PredicateLeaf.Type.LONG, 4000L) - .end().build(); - - And expected = (And) Expressions - .and(Expressions.equal("salary", 3000L), Expressions.equal("salary", 4000L)); + SearchArgument arg = + builder + .startAnd() + .equals("salary", PredicateLeaf.Type.LONG, 3000L) + .equals("salary", PredicateLeaf.Type.LONG, 4000L) + .end() + .build(); + + And expected = + (And) + Expressions.and(Expressions.equal("salary", 3000L), Expressions.equal("salary", 4000L)); And actual = (And) HiveIcebergFilterFactory.generateFilterExpression(arg); assertEquals(actual.op(), expected.op()); @@ -166,14 +182,16 @@ public void testAndOperand() { @Test public void testOrOperand() { SearchArgument.Builder builder = SearchArgumentFactory.newBuilder(); - SearchArgument arg = builder - .startOr() - .equals("salary", PredicateLeaf.Type.LONG, 3000L) - .equals("salary", PredicateLeaf.Type.LONG, 4000L) - .end().build(); - - Or expected = (Or) Expressions - .or(Expressions.equal("salary", 3000L), Expressions.equal("salary", 4000L)); + SearchArgument arg = + builder + .startOr() + .equals("salary", PredicateLeaf.Type.LONG, 3000L) + .equals("salary", PredicateLeaf.Type.LONG, 4000L) + .end() + .build(); + + Or expected = + (Or) Expressions.or(Expressions.equal("salary", 3000L), Expressions.equal("salary", 4000L)); Or actual = (Or) HiveIcebergFilterFactory.generateFilterExpression(arg); assertEquals(actual.op(), expected.op()); @@ -184,10 +202,12 @@ public void testOrOperand() { @Test public void testStringType() { SearchArgument.Builder builder = SearchArgumentFactory.newBuilder(); - SearchArgument arg = builder.startAnd().equals("string", PredicateLeaf.Type.STRING, "Joe").end().build(); + SearchArgument arg = + builder.startAnd().equals("string", PredicateLeaf.Type.STRING, "Joe").end().build(); UnboundPredicate expected = Expressions.equal("string", "Joe"); - UnboundPredicate actual = (UnboundPredicate) HiveIcebergFilterFactory.generateFilterExpression(arg); + UnboundPredicate actual = + (UnboundPredicate) HiveIcebergFilterFactory.generateFilterExpression(arg); assertPredicatesMatch(expected, actual); } @@ -195,10 +215,12 @@ public void testStringType() { @Test public void testFloatType() { SearchArgument.Builder builder = SearchArgumentFactory.newBuilder(); - SearchArgument arg = builder.startAnd().equals("float", PredicateLeaf.Type.FLOAT, 1200D).end().build(); + SearchArgument arg = + builder.startAnd().equals("float", PredicateLeaf.Type.FLOAT, 1200D).end().build(); UnboundPredicate expected = Expressions.equal("float", 1200D); - UnboundPredicate actual = (UnboundPredicate) HiveIcebergFilterFactory.generateFilterExpression(arg); + UnboundPredicate actual = + (UnboundPredicate) HiveIcebergFilterFactory.generateFilterExpression(arg); assertPredicatesMatch(expected, actual); } @@ -206,10 +228,12 @@ public void testFloatType() { @Test public void 
testBooleanType() { SearchArgument.Builder builder = SearchArgumentFactory.newBuilder(); - SearchArgument arg = builder.startAnd().equals("boolean", PredicateLeaf.Type.BOOLEAN, true).end().build(); + SearchArgument arg = + builder.startAnd().equals("boolean", PredicateLeaf.Type.BOOLEAN, true).end().build(); UnboundPredicate expected = Expressions.equal("boolean", true); - UnboundPredicate actual = (UnboundPredicate) HiveIcebergFilterFactory.generateFilterExpression(arg); + UnboundPredicate actual = + (UnboundPredicate) HiveIcebergFilterFactory.generateFilterExpression(arg); assertPredicatesMatch(expected, actual); } @@ -218,25 +242,31 @@ public void testBooleanType() { public void testDateType() { SearchArgument.Builder builder = SearchArgumentFactory.newBuilder(); Date gmtDate = Date.valueOf(LocalDate.of(2015, 11, 12)); - SearchArgument arg = builder.startAnd().equals("date", PredicateLeaf.Type.DATE, gmtDate).end().build(); + SearchArgument arg = + builder.startAnd().equals("date", PredicateLeaf.Type.DATE, gmtDate).end().build(); - UnboundPredicate expected = Expressions.equal("date", Literal.of("2015-11-12").to(Types.DateType.get()).value()); - UnboundPredicate actual = (UnboundPredicate) HiveIcebergFilterFactory.generateFilterExpression(arg); + UnboundPredicate expected = + Expressions.equal("date", Literal.of("2015-11-12").to(Types.DateType.get()).value()); + UnboundPredicate actual = + (UnboundPredicate) HiveIcebergFilterFactory.generateFilterExpression(arg); assertPredicatesMatch(expected, actual); } @Test public void testTimestampType() { - Literal timestampLiteral = Literal.of("2012-10-02T05:16:17.123456").to(Types.TimestampType.withoutZone()); + Literal timestampLiteral = + Literal.of("2012-10-02T05:16:17.123456").to(Types.TimestampType.withoutZone()); long timestampMicros = timestampLiteral.value(); Timestamp ts = Timestamp.valueOf(DateTimeUtil.timestampFromMicros(timestampMicros)); SearchArgument.Builder builder = SearchArgumentFactory.newBuilder(); - SearchArgument arg = builder.startAnd().equals("timestamp", PredicateLeaf.Type.TIMESTAMP, ts).end().build(); + SearchArgument arg = + builder.startAnd().equals("timestamp", PredicateLeaf.Type.TIMESTAMP, ts).end().build(); UnboundPredicate expected = Expressions.equal("timestamp", timestampMicros); - UnboundPredicate actual = (UnboundPredicate) HiveIcebergFilterFactory.generateFilterExpression(arg); + UnboundPredicate actual = + (UnboundPredicate) HiveIcebergFilterFactory.generateFilterExpression(arg); assertPredicatesMatch(expected, actual); } @@ -244,11 +274,16 @@ public void testTimestampType() { @Test public void testDecimalType() { SearchArgument.Builder builder = SearchArgumentFactory.newBuilder(); - SearchArgument arg = builder.startAnd().equals("decimal", PredicateLeaf.Type.DECIMAL, - new HiveDecimalWritable("20.12")).end().build(); + SearchArgument arg = + builder + .startAnd() + .equals("decimal", PredicateLeaf.Type.DECIMAL, new HiveDecimalWritable("20.12")) + .end() + .build(); UnboundPredicate expected = Expressions.equal("decimal", new BigDecimal("20.12")); - UnboundPredicate actual = (UnboundPredicate) HiveIcebergFilterFactory.generateFilterExpression(arg); + UnboundPredicate actual = + (UnboundPredicate) HiveIcebergFilterFactory.generateFilterExpression(arg); assertPredicatesMatch(expected, actual); } diff --git a/mr/src/test/java/org/apache/iceberg/mr/hive/TestHiveIcebergOutputCommitter.java b/mr/src/test/java/org/apache/iceberg/mr/hive/TestHiveIcebergOutputCommitter.java index a33b9f781472..1818d3f08f63 100644 --- 
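As context for the filter-factory tests above: they all exercise the same conversion, from a Hive/ORC SearchArgument (the form in which Hive pushes predicates down to the storage handler) into an Iceberg Expression. A rough sketch of the happy path, illustrative only and not part of the patch, assuming the same test package so the factory resolves without an import:

package org.apache.iceberg.mr.hive; // same package as the tests above

import org.apache.hadoop.hive.ql.io.sarg.PredicateLeaf;
import org.apache.hadoop.hive.ql.io.sarg.SearchArgument;
import org.apache.hadoop.hive.ql.io.sarg.SearchArgumentFactory;
import org.apache.iceberg.expressions.Expression;

class FilterFactorySketch {

  static Expression salaryEquals3000() {
    // Hive hands the pushed-down predicate over as a SearchArgument tree ...
    SearchArgument arg =
        SearchArgumentFactory.newBuilder()
            .startAnd()
            .equals("salary", PredicateLeaf.Type.LONG, 3000L)
            .end()
            .build();

    // ... and HiveIcebergFilterFactory rewrites it into Iceberg's expression model,
    // equivalent to Expressions.equal("salary", 3000L) in the assertions above.
    return HiveIcebergFilterFactory.generateFilterExpression(arg);
  }
}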
a/mr/src/test/java/org/apache/iceberg/mr/hive/TestHiveIcebergOutputCommitter.java +++ b/mr/src/test/java/org/apache/iceberg/mr/hive/TestHiveIcebergOutputCommitter.java @@ -16,9 +16,11 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.mr.hive; +import static org.apache.iceberg.mr.hive.HiveIcebergRecordWriter.getWriters; +import static org.apache.iceberg.types.Types.NestedField.required; + import java.io.IOException; import java.util.Collections; import java.util.List; @@ -59,9 +61,6 @@ import org.mockito.ArgumentCaptor; import org.mockito.Mockito; -import static org.apache.iceberg.mr.hive.HiveIcebergRecordWriter.getWriters; -import static org.apache.iceberg.types.Types.NestedField.required; - public class TestHiveIcebergOutputCommitter { private static final long TARGET_FILE_SIZE = 128 * 1024 * 1024; private static final int RECORD_NUM = 5; @@ -72,16 +71,15 @@ public class TestHiveIcebergOutputCommitter { private static final TaskAttemptID REDUCE_TASK_ID = new TaskAttemptID(JOB_ID.getJtIdentifier(), JOB_ID.getId(), TaskType.REDUCE, 0, 0); - private static final Schema CUSTOMER_SCHEMA = new Schema( - required(1, "customer_id", Types.LongType.get()), - required(2, "first_name", Types.StringType.get()) - ); + private static final Schema CUSTOMER_SCHEMA = + new Schema( + required(1, "customer_id", Types.LongType.get()), + required(2, "first_name", Types.StringType.get())); private static final PartitionSpec PARTITIONED_SPEC = PartitionSpec.builderFor(CUSTOMER_SCHEMA).bucket("customer_id", 3).build(); - @Rule - public TemporaryFolder temp = new TemporaryFolder(); + @Rule public TemporaryFolder temp = new TemporaryFolder(); @Test public void testNeedsTaskCommit() { @@ -92,15 +90,18 @@ public void testNeedsTaskCommit() { mapOnlyJobConf.setNumReduceTasks(0); // Map only job should commit map tasks - Assert.assertTrue(committer.needsTaskCommit(new TaskAttemptContextImpl(mapOnlyJobConf, MAP_TASK_ID))); + Assert.assertTrue( + committer.needsTaskCommit(new TaskAttemptContextImpl(mapOnlyJobConf, MAP_TASK_ID))); JobConf mapReduceJobConf = new JobConf(); mapReduceJobConf.setNumMapTasks(10); mapReduceJobConf.setNumReduceTasks(10); // MapReduce job should not commit map tasks, but should commit reduce tasks - Assert.assertFalse(committer.needsTaskCommit(new TaskAttemptContextImpl(mapReduceJobConf, MAP_TASK_ID))); - Assert.assertTrue(committer.needsTaskCommit(new TaskAttemptContextImpl(mapReduceJobConf, REDUCE_TASK_ID))); + Assert.assertFalse( + committer.needsTaskCommit(new TaskAttemptContextImpl(mapReduceJobConf, MAP_TASK_ID))); + Assert.assertTrue( + committer.needsTaskCommit(new TaskAttemptContextImpl(mapReduceJobConf, REDUCE_TASK_ID))); } @Test @@ -192,10 +193,12 @@ public void testAbortJob() throws IOException { public void writerIsClosedAfterTaskCommitFailure() throws IOException { HiveIcebergOutputCommitter committer = new HiveIcebergOutputCommitter(); HiveIcebergOutputCommitter failingCommitter = Mockito.spy(committer); - ArgumentCaptor argumentCaptor = ArgumentCaptor.forClass(TaskAttemptContextImpl.class); + ArgumentCaptor argumentCaptor = + ArgumentCaptor.forClass(TaskAttemptContextImpl.class); String exceptionMessage = "Commit task failed!"; Mockito.doThrow(new RuntimeException(exceptionMessage)) - .when(failingCommitter).commitTask(argumentCaptor.capture()); + .when(failingCommitter) + .commitTask(argumentCaptor.capture()); Table table = table(temp.getRoot().getPath(), false); JobConf conf = jobConf(table, 1); @@ -207,7 +210,8 @@ 
public void writerIsClosedAfterTaskCommitFailure() throws IOException { } Assert.assertEquals(1, argumentCaptor.getAllValues().size()); - TaskAttemptID capturedId = TezUtil.taskAttemptWrapper(argumentCaptor.getValue().getTaskAttemptID()); + TaskAttemptID capturedId = + TezUtil.taskAttemptWrapper(argumentCaptor.getValue().getTaskAttemptID()); // writer is still in the map after commitTask failure Assert.assertNotNull(getWriters(capturedId)); failingCommitter.abortTask(new TaskAttemptContextImpl(conf, capturedId)); @@ -218,8 +222,11 @@ public void writerIsClosedAfterTaskCommitFailure() throws IOException { private Table table(String location, boolean partitioned) { HadoopTables tables = new HadoopTables(); - return tables.create(CUSTOMER_SCHEMA, partitioned ? PARTITIONED_SPEC : PartitionSpec.unpartitioned(), - ImmutableMap.of(InputFormatConfig.CATALOG_NAME, Catalogs.ICEBERG_HADOOP_TABLE_NAME), location); + return tables.create( + CUSTOMER_SCHEMA, + partitioned ? PARTITIONED_SPEC : PartitionSpec.unpartitioned(), + ImmutableMap.of(InputFormatConfig.CATALOG_NAME, Catalogs.ICEBERG_HADOOP_TABLE_NAME), + location); } private JobConf jobConf(Table table, int taskNum) { @@ -228,38 +235,51 @@ private JobConf jobConf(Table table, int taskNum) { conf.setNumReduceTasks(0); conf.set(HiveConf.ConfVars.HIVEQUERYID.varname, QUERY_ID); conf.set(InputFormatConfig.OUTPUT_TABLES, table.name()); - conf.set(InputFormatConfig.TABLE_CATALOG_PREFIX + table.name(), - table.properties().get(InputFormatConfig.CATALOG_NAME)); - conf.set(InputFormatConfig.SERIALIZED_TABLE_PREFIX + table.name(), SerializationUtil.serializeToBase64(table)); + conf.set( + InputFormatConfig.TABLE_CATALOG_PREFIX + table.name(), + table.properties().get(InputFormatConfig.CATALOG_NAME)); + conf.set( + InputFormatConfig.SERIALIZED_TABLE_PREFIX + table.name(), + SerializationUtil.serializeToBase64(table)); Map propMap = Maps.newHashMap(); TableDesc tableDesc = new TableDesc(); tableDesc.setProperties(new Properties()); tableDesc.getProperties().setProperty(Catalogs.NAME, table.name()); tableDesc.getProperties().setProperty(Catalogs.LOCATION, table.location()); - tableDesc.getProperties().setProperty(InputFormatConfig.CATALOG_NAME, table.properties() - .get(InputFormatConfig.CATALOG_NAME)); + tableDesc + .getProperties() + .setProperty( + InputFormatConfig.CATALOG_NAME, table.properties().get(InputFormatConfig.CATALOG_NAME)); HiveIcebergStorageHandler.overlayTableProperties(conf, tableDesc, propMap); propMap.forEach((key, value) -> conf.set(key, value)); return conf; } /** - * Write random records to the given table using separate {@link HiveIcebergOutputCommitter} and - * a separate {@link HiveIcebergRecordWriter} for every task. + * Write random records to the given table using separate {@link HiveIcebergOutputCommitter} and a + * separate {@link HiveIcebergRecordWriter} for every task. 
+ * * @param name The name of the table to get the table object from the conf * @param taskNum The number of tasks in the job handled by the committer * @param attemptNum The id used for attempt number generation * @param commitTasks If true the tasks will be committed - * @param abortTasks If true the tasks will be aborted - needed so we can simulate no commit/no abort - * situation + * @param abortTasks If true the tasks will be aborted - needed so we can simulate no + * commit/no abort situation * @param conf The job configuration * @param committer The output committer that should be used for committing/aborting the tasks * @return The random generated records which were appended to the table * @throws IOException Propagating {@link HiveIcebergRecordWriter} exceptions */ - private List writeRecords(String name, int taskNum, int attemptNum, boolean commitTasks, boolean abortTasks, - JobConf conf, OutputCommitter committer) throws IOException { + private List writeRecords( + String name, + int taskNum, + int attemptNum, + boolean commitTasks, + boolean abortTasks, + JobConf conf, + OutputCommitter committer) + throws IOException { List expected = Lists.newArrayListWithExpectedSize(RECORD_NUM * taskNum); Table table = HiveIcebergStorageHandler.table(conf, name); @@ -269,17 +289,27 @@ private List writeRecords(String name, int taskNum, int attemptNum, bool for (int i = 0; i < taskNum; ++i) { List records = TestHelper.generateRandomRecords(schema, RECORD_NUM, i + attemptNum); - TaskAttemptID taskId = new TaskAttemptID(JOB_ID.getJtIdentifier(), JOB_ID.getId(), TaskType.MAP, i, attemptNum); + TaskAttemptID taskId = + new TaskAttemptID(JOB_ID.getJtIdentifier(), JOB_ID.getId(), TaskType.MAP, i, attemptNum); int partitionId = taskId.getTaskID().getId(); String operationId = QUERY_ID + "-" + JOB_ID; FileFormat fileFormat = FileFormat.PARQUET; - OutputFileFactory outputFileFactory = OutputFileFactory.builderFor(table, partitionId, attemptNum) - .format(fileFormat) - .operationId(operationId) - .build(); - HiveIcebergRecordWriter testWriter = new HiveIcebergRecordWriter(schema, spec, fileFormat, - new GenericAppenderFactory(schema), outputFileFactory, io, TARGET_FILE_SIZE, - TezUtil.taskAttemptWrapper(taskId), conf.get(Catalogs.NAME)); + OutputFileFactory outputFileFactory = + OutputFileFactory.builderFor(table, partitionId, attemptNum) + .format(fileFormat) + .operationId(operationId) + .build(); + HiveIcebergRecordWriter testWriter = + new HiveIcebergRecordWriter( + schema, + spec, + fileFormat, + new GenericAppenderFactory(schema), + outputFileFactory, + io, + TARGET_FILE_SIZE, + TezUtil.taskAttemptWrapper(taskId), + conf.get(Catalogs.NAME)); Container container = new Container<>(); @@ -300,8 +330,15 @@ private List writeRecords(String name, int taskNum, int attemptNum, bool return expected; } - private List writeRecords(String name, int taskNum, int attemptNum, boolean commitTasks, boolean abortTasks, - JobConf conf) throws IOException { - return writeRecords(name, taskNum, attemptNum, commitTasks, abortTasks, conf, new HiveIcebergOutputCommitter()); + private List writeRecords( + String name, + int taskNum, + int attemptNum, + boolean commitTasks, + boolean abortTasks, + JobConf conf) + throws IOException { + return writeRecords( + name, taskNum, attemptNum, commitTasks, abortTasks, conf, new HiveIcebergOutputCommitter()); } } diff --git a/mr/src/test/java/org/apache/iceberg/mr/hive/TestHiveIcebergSerDe.java b/mr/src/test/java/org/apache/iceberg/mr/hive/TestHiveIcebergSerDe.java index 
7e043a83b535..889c441c28fc 100644 --- a/mr/src/test/java/org/apache/iceberg/mr/hive/TestHiveIcebergSerDe.java +++ b/mr/src/test/java/org/apache/iceberg/mr/hive/TestHiveIcebergSerDe.java @@ -16,9 +16,10 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.mr.hive; +import static org.apache.iceberg.types.Types.NestedField.required; + import java.io.File; import java.io.IOException; import java.util.Properties; @@ -38,14 +39,12 @@ import org.junit.Test; import org.junit.rules.TemporaryFolder; -import static org.apache.iceberg.types.Types.NestedField.required; - public class TestHiveIcebergSerDe { - private static final Schema schema = new Schema(required(1, "string_field", Types.StringType.get())); + private static final Schema schema = + new Schema(required(1, "string_field", Types.StringType.get())); - @Rule - public TemporaryFolder tmp = new TemporaryFolder(); + @Rule public TemporaryFolder tmp = new TemporaryFolder(); @Test public void testInitialize() throws IOException, SerDeException { @@ -77,5 +76,4 @@ public void testDeserialize() { Assert.assertEquals(record, serDe.deserialize(container)); } - } diff --git a/mr/src/test/java/org/apache/iceberg/mr/hive/TestHiveIcebergStorageHandlerLocalScan.java b/mr/src/test/java/org/apache/iceberg/mr/hive/TestHiveIcebergStorageHandlerLocalScan.java index feaf121570bd..5aeb825e7ba5 100644 --- a/mr/src/test/java/org/apache/iceberg/mr/hive/TestHiveIcebergStorageHandlerLocalScan.java +++ b/mr/src/test/java/org/apache/iceberg/mr/hive/TestHiveIcebergStorageHandlerLocalScan.java @@ -16,9 +16,12 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.mr.hive; +import static org.apache.iceberg.types.Types.NestedField.required; +import static org.junit.runners.Parameterized.Parameter; +import static org.junit.runners.Parameterized.Parameters; + import java.io.IOException; import java.math.BigDecimal; import java.util.Collection; @@ -53,10 +56,6 @@ import org.junit.runner.RunWith; import org.junit.runners.Parameterized; -import static org.apache.iceberg.types.Types.NestedField.required; -import static org.junit.runners.Parameterized.Parameter; -import static org.junit.runners.Parameterized.Parameters; - @RunWith(Parameterized.class) public class TestHiveIcebergStorageHandlerLocalScan { @@ -69,10 +68,11 @@ public static Collection parameters() { testParams.add(new Object[] {fileFormat, TestTables.TestTableType.HIVE_CATALOG}); } - // Run tests for every Catalog for a single FileFormat (PARQUET) - skip HiveCatalog tests as they are added before + // Run tests for every Catalog for a single FileFormat (PARQUET) - skip HiveCatalog tests as + // they are added before for (TestTables.TestTableType testTableType : TestTables.ALL_TABLE_TYPES) { if (!TestTables.TestTableType.HIVE_CATALOG.equals(testTableType)) { - testParams.add(new Object[]{FileFormat.PARQUET, testTableType}); + testParams.add(new Object[] {FileFormat.PARQUET, testTableType}); } } @@ -89,8 +89,7 @@ public static Collection parameters() { @Parameter(1) public TestTables.TestTableType testTableType; - @Rule - public TemporaryFolder temp = new TemporaryFolder(); + @Rule public TemporaryFolder temp = new TemporaryFolder(); @BeforeClass public static void beforeClass() { @@ -105,7 +104,8 @@ public static void afterClass() throws Exception { @Before public void before() throws IOException { testTables = HiveIcebergStorageHandlerTestUtils.testTables(shell, testTableType, temp); - // Uses 
spark as an engine so we can detect if we unintentionally try to use any execution engines + // Uses spark as an engine so we can detect if we unintentionally try to use any execution + // engines HiveIcebergStorageHandlerTestUtils.init(shell, testTables, temp, "spark"); } @@ -125,7 +125,11 @@ public void testScanEmptyTable() throws IOException { @Test public void testScanTable() throws IOException { - testTables.createTable(shell, "customers", HiveIcebergStorageHandlerTestUtils.CUSTOMER_SCHEMA, fileFormat, + testTables.createTable( + shell, + "customers", + HiveIcebergStorageHandlerTestUtils.CUSTOMER_SCHEMA, + fileFormat, HiveIcebergStorageHandlerTestUtils.CUSTOMER_RECORDS); // Single fetch task: no MR job. @@ -139,9 +143,12 @@ public void testScanTable() throws IOException { @Test public void testScanTableCaseInsensitive() throws IOException { - testTables.createTable(shell, "customers", - HiveIcebergStorageHandlerTestUtils.CUSTOMER_SCHEMA_WITH_UPPERCASE, fileFormat, - HiveIcebergStorageHandlerTestUtils.CUSTOMER_RECORDS); + testTables.createTable( + shell, + "customers", + HiveIcebergStorageHandlerTestUtils.CUSTOMER_SCHEMA_WITH_UPPERCASE, + fileFormat, + HiveIcebergStorageHandlerTestUtils.CUSTOMER_RECORDS); List rows = shell.executeStatement("SELECT * FROM default.customers"); @@ -150,19 +157,21 @@ public void testScanTableCaseInsensitive() throws IOException { Assert.assertArrayEquals(new Object[] {1L, "Bob", "Green"}, rows.get(1)); Assert.assertArrayEquals(new Object[] {2L, "Trudy", "Pink"}, rows.get(2)); - rows = shell.executeStatement("SELECT * FROM default.customers where CustomER_Id < 2 " + - "and first_name in ('Alice', 'Bob')"); + rows = + shell.executeStatement( + "SELECT * FROM default.customers where CustomER_Id < 2 " + + "and first_name in ('Alice', 'Bob')"); Assert.assertEquals(2, rows.size()); Assert.assertArrayEquals(new Object[] {0L, "Alice", "Brown"}, rows.get(0)); Assert.assertArrayEquals(new Object[] {1L, "Bob", "Green"}, rows.get(1)); } - @Test public void testDecimalTableWithPredicateLiterals() throws IOException { Schema schema = new Schema(required(1, "decimal_field", Types.DecimalType.of(7, 2))); - List records = TestHelper.RecordsBuilder.newInstance(schema) + List records = + TestHelper.RecordsBuilder.newInstance(schema) .add(new BigDecimal("85.00")) .add(new BigDecimal("100.56")) .add(new BigDecimal("100.57")) @@ -170,7 +179,8 @@ public void testDecimalTableWithPredicateLiterals() throws IOException { testTables.createTable(shell, "dec_test", schema, fileFormat, records); // Use integer literal in predicate - List rows = shell.executeStatement("SELECT * FROM default.dec_test where decimal_field >= 85"); + List rows = + shell.executeStatement("SELECT * FROM default.dec_test where decimal_field >= 85"); Assert.assertEquals(3, rows.size()); Assert.assertArrayEquals(new Object[] {"85.00"}, rows.get(0)); Assert.assertArrayEquals(new Object[] {"100.56"}, rows.get(1)); @@ -194,32 +204,39 @@ public void testDecimalTableWithPredicateLiterals() throws IOException { @Test public void testColumnSelection() throws IOException { - testTables.createTable(shell, "customers", HiveIcebergStorageHandlerTestUtils.CUSTOMER_SCHEMA, fileFormat, + testTables.createTable( + shell, + "customers", + HiveIcebergStorageHandlerTestUtils.CUSTOMER_SCHEMA, + fileFormat, HiveIcebergStorageHandlerTestUtils.CUSTOMER_RECORDS); - List outOfOrderColumns = shell - .executeStatement("SELECT first_name, customer_id, last_name FROM default.customers"); + List outOfOrderColumns = + 
shell.executeStatement("SELECT first_name, customer_id, last_name FROM default.customers"); Assert.assertEquals(3, outOfOrderColumns.size()); Assert.assertArrayEquals(new Object[] {"Alice", 0L, "Brown"}, outOfOrderColumns.get(0)); Assert.assertArrayEquals(new Object[] {"Bob", 1L, "Green"}, outOfOrderColumns.get(1)); Assert.assertArrayEquals(new Object[] {"Trudy", 2L, "Pink"}, outOfOrderColumns.get(2)); - List allButFirstColumn = shell.executeStatement("SELECT first_name, last_name FROM default.customers"); + List allButFirstColumn = + shell.executeStatement("SELECT first_name, last_name FROM default.customers"); Assert.assertEquals(3, allButFirstColumn.size()); Assert.assertArrayEquals(new Object[] {"Alice", "Brown"}, allButFirstColumn.get(0)); Assert.assertArrayEquals(new Object[] {"Bob", "Green"}, allButFirstColumn.get(1)); Assert.assertArrayEquals(new Object[] {"Trudy", "Pink"}, allButFirstColumn.get(2)); - List allButMiddleColumn = shell.executeStatement("SELECT customer_id, last_name FROM default.customers"); + List allButMiddleColumn = + shell.executeStatement("SELECT customer_id, last_name FROM default.customers"); Assert.assertEquals(3, allButMiddleColumn.size()); Assert.assertArrayEquals(new Object[] {0L, "Brown"}, allButMiddleColumn.get(0)); Assert.assertArrayEquals(new Object[] {1L, "Green"}, allButMiddleColumn.get(1)); Assert.assertArrayEquals(new Object[] {2L, "Pink"}, allButMiddleColumn.get(2)); - List allButLastColumn = shell.executeStatement("SELECT customer_id, first_name FROM default.customers"); + List allButLastColumn = + shell.executeStatement("SELECT customer_id, first_name FROM default.customers"); Assert.assertEquals(3, allButLastColumn.size()); Assert.assertArrayEquals(new Object[] {0L, "Alice"}, allButLastColumn.get(0)); @@ -229,10 +246,15 @@ public void testColumnSelection() throws IOException { @Test public void selectSameColumnTwice() throws IOException { - testTables.createTable(shell, "customers", HiveIcebergStorageHandlerTestUtils.CUSTOMER_SCHEMA, fileFormat, + testTables.createTable( + shell, + "customers", + HiveIcebergStorageHandlerTestUtils.CUSTOMER_SCHEMA, + fileFormat, HiveIcebergStorageHandlerTestUtils.CUSTOMER_RECORDS); - List columns = shell.executeStatement("SELECT first_name, first_name FROM default.customers"); + List columns = + shell.executeStatement("SELECT first_name, first_name FROM default.customers"); Assert.assertEquals(3, columns.size()); Assert.assertArrayEquals(new Object[] {"Alice", "Alice"}, columns.get(0)); @@ -245,82 +267,143 @@ public void testCreateTableWithColumnSpecification() throws IOException { TableIdentifier identifier = TableIdentifier.of("default", "customers"); Map> data = Maps.newHashMapWithExpectedSize(1); data.put(null, HiveIcebergStorageHandlerTestUtils.CUSTOMER_RECORDS); - String createSql = "CREATE EXTERNAL TABLE " + identifier + - " (customer_id BIGINT, first_name STRING COMMENT 'This is first name', " + - "last_name STRING COMMENT 'This is last name')" + - " STORED BY 'org.apache.iceberg.mr.hive.HiveIcebergStorageHandler' " + - testTables.locationForCreateTableSQL(identifier) + - testTables.propertiesForCreateTableSQL(ImmutableMap.of()); - runCreateAndReadTest(identifier, createSql, HiveIcebergStorageHandlerTestUtils.CUSTOMER_SCHEMA, - PartitionSpec.unpartitioned(), data); + String createSql = + "CREATE EXTERNAL TABLE " + + identifier + + " (customer_id BIGINT, first_name STRING COMMENT 'This is first name', " + + "last_name STRING COMMENT 'This is last name')" + + " STORED BY 
'org.apache.iceberg.mr.hive.HiveIcebergStorageHandler' " + + testTables.locationForCreateTableSQL(identifier) + + testTables.propertiesForCreateTableSQL(ImmutableMap.of()); + runCreateAndReadTest( + identifier, + createSql, + HiveIcebergStorageHandlerTestUtils.CUSTOMER_SCHEMA, + PartitionSpec.unpartitioned(), + data); } @Test public void testCreateTableWithColumnSpecificationPartitioned() throws IOException { TableIdentifier identifier = TableIdentifier.of("default", "customers"); PartitionSpec spec = - PartitionSpec.builderFor(HiveIcebergStorageHandlerTestUtils.CUSTOMER_SCHEMA).identity("last_name").build(); - Map> data = ImmutableMap.of( - Row.of("Brown"), Collections.singletonList(HiveIcebergStorageHandlerTestUtils.CUSTOMER_RECORDS.get(0)), - Row.of("Green"), Collections.singletonList(HiveIcebergStorageHandlerTestUtils.CUSTOMER_RECORDS.get(1)), - Row.of("Pink"), Collections.singletonList(HiveIcebergStorageHandlerTestUtils.CUSTOMER_RECORDS.get(2))); - String createSql = "CREATE EXTERNAL TABLE " + identifier + - " (customer_id BIGINT, first_name STRING COMMENT 'This is first name') " + - "PARTITIONED BY (last_name STRING COMMENT 'This is last name') STORED BY " + - "'org.apache.iceberg.mr.hive.HiveIcebergStorageHandler' " + - testTables.locationForCreateTableSQL(identifier) + - testTables.propertiesForCreateTableSQL(ImmutableMap.of()); - runCreateAndReadTest(identifier, createSql, HiveIcebergStorageHandlerTestUtils.CUSTOMER_SCHEMA, spec, data); + PartitionSpec.builderFor(HiveIcebergStorageHandlerTestUtils.CUSTOMER_SCHEMA) + .identity("last_name") + .build(); + Map> data = + ImmutableMap.of( + Row.of("Brown"), + Collections.singletonList( + HiveIcebergStorageHandlerTestUtils.CUSTOMER_RECORDS.get(0)), + Row.of("Green"), + Collections.singletonList( + HiveIcebergStorageHandlerTestUtils.CUSTOMER_RECORDS.get(1)), + Row.of("Pink"), + Collections.singletonList( + HiveIcebergStorageHandlerTestUtils.CUSTOMER_RECORDS.get(2))); + String createSql = + "CREATE EXTERNAL TABLE " + + identifier + + " (customer_id BIGINT, first_name STRING COMMENT 'This is first name') " + + "PARTITIONED BY (last_name STRING COMMENT 'This is last name') STORED BY " + + "'org.apache.iceberg.mr.hive.HiveIcebergStorageHandler' " + + testTables.locationForCreateTableSQL(identifier) + + testTables.propertiesForCreateTableSQL(ImmutableMap.of()); + runCreateAndReadTest( + identifier, createSql, HiveIcebergStorageHandlerTestUtils.CUSTOMER_SCHEMA, spec, data); } @Test public void testCreatePartitionedTableByProperty() throws IOException { TableIdentifier identifier = TableIdentifier.of("default", "customers"); PartitionSpec spec = - PartitionSpec.builderFor(HiveIcebergStorageHandlerTestUtils.CUSTOMER_SCHEMA).identity("last_name").build(); - Map> data = ImmutableMap.of( - Row.of("Brown"), Collections.singletonList(HiveIcebergStorageHandlerTestUtils.CUSTOMER_RECORDS.get(0)), - Row.of("Green"), Collections.singletonList(HiveIcebergStorageHandlerTestUtils.CUSTOMER_RECORDS.get(1)), - Row.of("Pink"), Collections.singletonList(HiveIcebergStorageHandlerTestUtils.CUSTOMER_RECORDS.get(2))); - String createSql = "CREATE EXTERNAL TABLE " + identifier + - " STORED BY 'org.apache.iceberg.mr.hive.HiveIcebergStorageHandler' " + - testTables.locationForCreateTableSQL(identifier) + - "TBLPROPERTIES ('" + InputFormatConfig.PARTITION_SPEC + "'='" + PartitionSpecParser.toJson(spec) + "', " + - "'" + InputFormatConfig.TABLE_SCHEMA + "'='" + - SchemaParser.toJson(HiveIcebergStorageHandlerTestUtils.CUSTOMER_SCHEMA) + "', " + - "'" + 
InputFormatConfig.CATALOG_NAME + "'='" + testTables.catalogName() + "')"; - runCreateAndReadTest(identifier, createSql, HiveIcebergStorageHandlerTestUtils.CUSTOMER_SCHEMA, spec, data); + PartitionSpec.builderFor(HiveIcebergStorageHandlerTestUtils.CUSTOMER_SCHEMA) + .identity("last_name") + .build(); + Map> data = + ImmutableMap.of( + Row.of("Brown"), + Collections.singletonList( + HiveIcebergStorageHandlerTestUtils.CUSTOMER_RECORDS.get(0)), + Row.of("Green"), + Collections.singletonList( + HiveIcebergStorageHandlerTestUtils.CUSTOMER_RECORDS.get(1)), + Row.of("Pink"), + Collections.singletonList( + HiveIcebergStorageHandlerTestUtils.CUSTOMER_RECORDS.get(2))); + String createSql = + "CREATE EXTERNAL TABLE " + + identifier + + " STORED BY 'org.apache.iceberg.mr.hive.HiveIcebergStorageHandler' " + + testTables.locationForCreateTableSQL(identifier) + + "TBLPROPERTIES ('" + + InputFormatConfig.PARTITION_SPEC + + "'='" + + PartitionSpecParser.toJson(spec) + + "', " + + "'" + + InputFormatConfig.TABLE_SCHEMA + + "'='" + + SchemaParser.toJson(HiveIcebergStorageHandlerTestUtils.CUSTOMER_SCHEMA) + + "', " + + "'" + + InputFormatConfig.CATALOG_NAME + + "'='" + + testTables.catalogName() + + "')"; + runCreateAndReadTest( + identifier, createSql, HiveIcebergStorageHandlerTestUtils.CUSTOMER_SCHEMA, spec, data); } @Test public void testCreateTableWithColumnSpecificationMultilevelPartitioned() throws IOException { TableIdentifier identifier = TableIdentifier.of("default", "customers"); - PartitionSpec spec = PartitionSpec.builderFor(HiveIcebergStorageHandlerTestUtils.CUSTOMER_SCHEMA) - .identity("first_name").identity("last_name").build(); - Map> data = ImmutableMap.of( - Row.of("Alice", "Brown"), Collections.singletonList(HiveIcebergStorageHandlerTestUtils.CUSTOMER_RECORDS.get(0)), - Row.of("Bob", "Green"), Collections.singletonList(HiveIcebergStorageHandlerTestUtils.CUSTOMER_RECORDS.get(1)), - Row.of("Trudy", "Pink"), Collections.singletonList(HiveIcebergStorageHandlerTestUtils.CUSTOMER_RECORDS.get(2))); - String createSql = "CREATE EXTERNAL TABLE " + identifier + " (customer_id BIGINT) " + - "PARTITIONED BY (first_name STRING COMMENT 'This is first name', " + - "last_name STRING COMMENT 'This is last name') " + - "STORED BY 'org.apache.iceberg.mr.hive.HiveIcebergStorageHandler' " + - testTables.locationForCreateTableSQL(identifier) + - testTables.propertiesForCreateTableSQL(ImmutableMap.of()); - runCreateAndReadTest(identifier, createSql, HiveIcebergStorageHandlerTestUtils.CUSTOMER_SCHEMA, spec, data); + PartitionSpec spec = + PartitionSpec.builderFor(HiveIcebergStorageHandlerTestUtils.CUSTOMER_SCHEMA) + .identity("first_name") + .identity("last_name") + .build(); + Map> data = + ImmutableMap.of( + Row.of("Alice", "Brown"), + Collections.singletonList( + HiveIcebergStorageHandlerTestUtils.CUSTOMER_RECORDS.get(0)), + Row.of("Bob", "Green"), + Collections.singletonList( + HiveIcebergStorageHandlerTestUtils.CUSTOMER_RECORDS.get(1)), + Row.of("Trudy", "Pink"), + Collections.singletonList( + HiveIcebergStorageHandlerTestUtils.CUSTOMER_RECORDS.get(2))); + String createSql = + "CREATE EXTERNAL TABLE " + + identifier + + " (customer_id BIGINT) " + + "PARTITIONED BY (first_name STRING COMMENT 'This is first name', " + + "last_name STRING COMMENT 'This is last name') " + + "STORED BY 'org.apache.iceberg.mr.hive.HiveIcebergStorageHandler' " + + testTables.locationForCreateTableSQL(identifier) + + testTables.propertiesForCreateTableSQL(ImmutableMap.of()); + runCreateAndReadTest( + identifier, createSql, 
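Reading aid, not part of the patch: once the string concatenation above is resolved, the DDL issued by testCreateTableWithColumnSpecification reads roughly as

CREATE EXTERNAL TABLE default.customers (
  customer_id BIGINT,
  first_name STRING COMMENT 'This is first name',
  last_name STRING COMMENT 'This is last name')
STORED BY 'org.apache.iceberg.mr.hive.HiveIcebergStorageHandler'
<location clause from testTables.locationForCreateTableSQL(identifier)>
<TBLPROPERTIES from testTables.propertiesForCreateTableSQL(...)>

while testCreatePartitionedTableByProperty declares no columns and instead carries the schema and partition spec as JSON in TBLPROPERTIES:

CREATE EXTERNAL TABLE default.customers
STORED BY 'org.apache.iceberg.mr.hive.HiveIcebergStorageHandler'
<location clause>
TBLPROPERTIES (
  '<InputFormatConfig.PARTITION_SPEC>' = '<PartitionSpecParser.toJson(spec)>',
  '<InputFormatConfig.TABLE_SCHEMA>' = '<SchemaParser.toJson(schema)>',
  '<InputFormatConfig.CATALOG_NAME>' = '<testTables.catalogName()>')

The angle-bracketed parts stand for values supplied at runtime by the test fixtures and vary with the catalog under test.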
HiveIcebergStorageHandlerTestUtils.CUSTOMER_SCHEMA, spec, data); } @Test public void testArrayOfPrimitivesInTable() throws IOException { Schema schema = - new Schema(required(1, "arrayofprimitives", Types.ListType.ofRequired(2, Types.IntegerType.get()))); - List records = testTables.createTableWithGeneratedRecords(shell, "arraytable", schema, fileFormat, 1); + new Schema( + required( + 1, "arrayofprimitives", Types.ListType.ofRequired(2, Types.IntegerType.get()))); + List records = + testTables.createTableWithGeneratedRecords(shell, "arraytable", schema, fileFormat, 1); // access a single element from the array for (int i = 0; i < records.size(); i++) { List expectedList = (List) records.get(i).getField("arrayofprimitives"); for (int j = 0; j < expectedList.size(); j++) { - List queryResult = shell.executeStatement( - String.format("SELECT arrayofprimitives[%d] FROM default.arraytable " + "LIMIT 1 OFFSET %d", j, i)); + List queryResult = + shell.executeStatement( + String.format( + "SELECT arrayofprimitives[%d] FROM default.arraytable " + "LIMIT 1 OFFSET %d", + j, i)); Assert.assertEquals(expectedList.get(j), queryResult.get(0)[0]); } } @@ -329,19 +412,24 @@ public void testArrayOfPrimitivesInTable() throws IOException { @Test public void testArrayOfArraysInTable() throws IOException { Schema schema = - new Schema( - required(1, "arrayofarrays", - Types.ListType.ofRequired(2, Types.ListType.ofRequired(3, Types.DateType.get())))); - List records = testTables.createTableWithGeneratedRecords(shell, "arraytable", schema, fileFormat, 1); + new Schema( + required( + 1, + "arrayofarrays", + Types.ListType.ofRequired(2, Types.ListType.ofRequired(3, Types.DateType.get())))); + List records = + testTables.createTableWithGeneratedRecords(shell, "arraytable", schema, fileFormat, 1); // access an element from a matrix for (int i = 0; i < records.size(); i++) { List expectedList = (List) records.get(i).getField("arrayofarrays"); for (int j = 0; j < expectedList.size(); j++) { List expectedInnerList = (List) expectedList.get(j); for (int k = 0; k < expectedInnerList.size(); k++) { - List queryResult = shell.executeStatement( - String.format("SELECT arrayofarrays[%d][%d] FROM default.arraytable " + "LIMIT 1 OFFSET %d", - j, k, i)); + List queryResult = + shell.executeStatement( + String.format( + "SELECT arrayofarrays[%d][%d] FROM default.arraytable " + "LIMIT 1 OFFSET %d", + j, k, i)); Assert.assertEquals(expectedInnerList.get(k).toString(), queryResult.get(0)[0]); } } @@ -351,19 +439,27 @@ public void testArrayOfArraysInTable() throws IOException { @Test public void testArrayOfMapsInTable() throws IOException { Schema schema = - new Schema(required(1, "arrayofmaps", Types.ListType - .ofRequired(2, Types.MapType.ofRequired(3, 4, Types.StringType.get(), - Types.BooleanType.get())))); - List records = testTables.createTableWithGeneratedRecords(shell, "arraytable", schema, fileFormat, 1); + new Schema( + required( + 1, + "arrayofmaps", + Types.ListType.ofRequired( + 2, + Types.MapType.ofRequired( + 3, 4, Types.StringType.get(), Types.BooleanType.get())))); + List records = + testTables.createTableWithGeneratedRecords(shell, "arraytable", schema, fileFormat, 1); // access an element from a map in an array for (int i = 0; i < records.size(); i++) { List expectedList = (List) records.get(i).getField("arrayofmaps"); for (int j = 0; j < expectedList.size(); j++) { Map expectedMap = (Map) expectedList.get(j); for (Map.Entry entry : expectedMap.entrySet()) { - List queryResult = shell.executeStatement(String - 
.format("SELECT arrayofmaps[%d][\"%s\"] FROM default.arraytable LIMIT 1 OFFSET %d", j, - entry.getKey(), i)); + List queryResult = + shell.executeStatement( + String.format( + "SELECT arrayofmaps[%d][\"%s\"] FROM default.arraytable LIMIT 1 OFFSET %d", + j, entry.getKey(), i)); Assert.assertEquals(entry.getValue(), queryResult.get(0)[0]); } } @@ -373,18 +469,29 @@ public void testArrayOfMapsInTable() throws IOException { @Test public void testArrayOfStructsInTable() throws IOException { Schema schema = - new Schema( - required(1, "arrayofstructs", Types.ListType.ofRequired(2, Types.StructType - .of(required(3, "something", Types.DoubleType.get()), required(4, "someone", - Types.LongType.get()), required(5, "somewhere", Types.StringType.get()))))); - List records = testTables.createTableWithGeneratedRecords(shell, "arraytable", schema, fileFormat, 1); + new Schema( + required( + 1, + "arrayofstructs", + Types.ListType.ofRequired( + 2, + Types.StructType.of( + required(3, "something", Types.DoubleType.get()), + required(4, "someone", Types.LongType.get()), + required(5, "somewhere", Types.StringType.get()))))); + List records = + testTables.createTableWithGeneratedRecords(shell, "arraytable", schema, fileFormat, 1); // access an element from a struct in an array for (int i = 0; i < records.size(); i++) { List expectedList = (List) records.get(i).getField("arrayofstructs"); for (int j = 0; j < expectedList.size(); j++) { - List queryResult = shell.executeStatement(String.format("SELECT arrayofstructs[%d].something, " + - "arrayofstructs[%d].someone, arrayofstructs[%d].somewhere FROM default.arraytable LIMIT 1 " + - "OFFSET %d", j, j, j, i)); + List queryResult = + shell.executeStatement( + String.format( + "SELECT arrayofstructs[%d].something, " + + "arrayofstructs[%d].someone, arrayofstructs[%d].somewhere FROM default.arraytable LIMIT 1 " + + "OFFSET %d", + j, j, j, i)); GenericRecord genericRecord = (GenericRecord) expectedList.get(j); Assert.assertEquals(genericRecord.getField("something"), queryResult.get(0)[0]); Assert.assertEquals(genericRecord.getField("someone"), queryResult.get(0)[1]); @@ -395,17 +502,23 @@ public void testArrayOfStructsInTable() throws IOException { @Test public void testMapOfPrimitivesInTable() throws IOException { - Schema schema = new Schema( - required(1, "mapofprimitives", Types.MapType.ofRequired(2, 3, Types.StringType.get(), - Types.IntegerType.get()))); - List records = testTables.createTableWithGeneratedRecords(shell, "maptable", schema, fileFormat, 1); + Schema schema = + new Schema( + required( + 1, + "mapofprimitives", + Types.MapType.ofRequired(2, 3, Types.StringType.get(), Types.IntegerType.get()))); + List records = + testTables.createTableWithGeneratedRecords(shell, "maptable", schema, fileFormat, 1); // access a single value from the map for (int i = 0; i < records.size(); i++) { Map expectedMap = (Map) records.get(i).getField("mapofprimitives"); for (Map.Entry entry : expectedMap.entrySet()) { - List queryResult = shell.executeStatement(String - .format("SELECT mapofprimitives[\"%s\"] " + "FROM default.maptable LIMIT 1 OFFSET %d", entry.getKey(), - i)); + List queryResult = + shell.executeStatement( + String.format( + "SELECT mapofprimitives[\"%s\"] " + "FROM default.maptable LIMIT 1 OFFSET %d", + entry.getKey(), i)); Assert.assertEquals(entry.getValue(), queryResult.get(0)[0]); } } @@ -413,19 +526,29 @@ public void testMapOfPrimitivesInTable() throws IOException { @Test public void testMapOfArraysInTable() throws IOException { - Schema schema = new 
Schema( - required(1, "mapofarrays", - Types.MapType.ofRequired(2, 3, Types.StringType.get(), Types.ListType.ofRequired(4, - Types.DateType.get())))); - List records = testTables.createTableWithGeneratedRecords(shell, "maptable", schema, fileFormat, 1); + Schema schema = + new Schema( + required( + 1, + "mapofarrays", + Types.MapType.ofRequired( + 2, + 3, + Types.StringType.get(), + Types.ListType.ofRequired(4, Types.DateType.get())))); + List records = + testTables.createTableWithGeneratedRecords(shell, "maptable", schema, fileFormat, 1); // access a single element from a list in a map for (int i = 0; i < records.size(); i++) { Map expectedMap = (Map) records.get(i).getField("mapofarrays"); for (Map.Entry entry : expectedMap.entrySet()) { List expectedList = (List) entry.getValue(); for (int j = 0; j < expectedList.size(); j++) { - List queryResult = shell.executeStatement(String - .format("SELECT mapofarrays[\"%s\"]" + "[%d] FROM maptable LIMIT 1 OFFSET %d", entry.getKey(), j, i)); + List queryResult = + shell.executeStatement( + String.format( + "SELECT mapofarrays[\"%s\"]" + "[%d] FROM maptable LIMIT 1 OFFSET %d", + entry.getKey(), j, i)); Assert.assertEquals(expectedList.get(j).toString(), queryResult.get(0)[0]); } } @@ -434,19 +557,30 @@ public void testMapOfArraysInTable() throws IOException { @Test public void testMapOfMapsInTable() throws IOException { - Schema schema = new Schema( - required(1, "mapofmaps", Types.MapType.ofRequired(2, 3, Types.StringType.get(), - Types.MapType.ofRequired(4, 5, Types.StringType.get(), Types.StringType.get())))); - List records = testTables.createTableWithGeneratedRecords(shell, "maptable", schema, fileFormat, 1); + Schema schema = + new Schema( + required( + 1, + "mapofmaps", + Types.MapType.ofRequired( + 2, + 3, + Types.StringType.get(), + Types.MapType.ofRequired( + 4, 5, Types.StringType.get(), Types.StringType.get())))); + List records = + testTables.createTableWithGeneratedRecords(shell, "maptable", schema, fileFormat, 1); // access a single element from a map in a map for (int i = 0; i < records.size(); i++) { Map expectedMap = (Map) records.get(i).getField("mapofmaps"); for (Map.Entry entry : expectedMap.entrySet()) { Map expectedInnerMap = (Map) entry.getValue(); for (Map.Entry innerEntry : expectedInnerMap.entrySet()) { - List queryResult = shell.executeStatement(String - .format("SELECT mapofmaps[\"%s\"]" + "[\"%s\"] FROM maptable LIMIT 1 OFFSET %d", entry.getKey(), - innerEntry.getKey(), i)); + List queryResult = + shell.executeStatement( + String.format( + "SELECT mapofmaps[\"%s\"]" + "[\"%s\"] FROM maptable LIMIT 1 OFFSET %d", + entry.getKey(), innerEntry.getKey(), i)); Assert.assertEquals(innerEntry.getValue(), queryResult.get(0)[0]); } } @@ -455,19 +589,32 @@ public void testMapOfMapsInTable() throws IOException { @Test public void testMapOfStructsInTable() throws IOException { - Schema schema = new Schema( - required(1, "mapofstructs", Types.MapType.ofRequired(2, 3, Types.StringType.get(), - Types.StructType.of(required(4, "something", Types.DoubleType.get()), - required(5, "someone", Types.LongType.get()), - required(6, "somewhere", Types.StringType.get()))))); - List records = testTables.createTableWithGeneratedRecords(shell, "maptable", schema, fileFormat, 1); + Schema schema = + new Schema( + required( + 1, + "mapofstructs", + Types.MapType.ofRequired( + 2, + 3, + Types.StringType.get(), + Types.StructType.of( + required(4, "something", Types.DoubleType.get()), + required(5, "someone", Types.LongType.get()), + required(6, 
"somewhere", Types.StringType.get()))))); + List records = + testTables.createTableWithGeneratedRecords(shell, "maptable", schema, fileFormat, 1); // access a single element from a struct in a map for (int i = 0; i < records.size(); i++) { Map expectedMap = (Map) records.get(i).getField("mapofstructs"); for (Map.Entry entry : expectedMap.entrySet()) { - List queryResult = shell.executeStatement(String.format("SELECT mapofstructs[\"%s\"].something, " + - "mapofstructs[\"%s\"].someone, mapofstructs[\"%s\"].somewhere FROM default.maptable LIMIT 1 " + - "OFFSET %d", entry.getKey(), entry.getKey(), entry.getKey(), i)); + List queryResult = + shell.executeStatement( + String.format( + "SELECT mapofstructs[\"%s\"].something, " + + "mapofstructs[\"%s\"].someone, mapofstructs[\"%s\"].somewhere FROM default.maptable LIMIT 1 " + + "OFFSET %d", + entry.getKey(), entry.getKey(), entry.getKey(), i)); GenericRecord genericRecord = (GenericRecord) entry.getValue(); Assert.assertEquals(genericRecord.getField("something"), queryResult.get(0)[0]); Assert.assertEquals(genericRecord.getField("someone"), queryResult.get(0)[1]); @@ -478,15 +625,24 @@ public void testMapOfStructsInTable() throws IOException { @Test public void testStructOfPrimitivesInTable() throws IOException { - Schema schema = new Schema(required(1, "structofprimitives", - Types.StructType.of(required(2, "key", Types.StringType.get()), required(3, "value", - Types.IntegerType.get())))); - List records = testTables.createTableWithGeneratedRecords(shell, "structtable", schema, fileFormat, 1); + Schema schema = + new Schema( + required( + 1, + "structofprimitives", + Types.StructType.of( + required(2, "key", Types.StringType.get()), + required(3, "value", Types.IntegerType.get())))); + List records = + testTables.createTableWithGeneratedRecords(shell, "structtable", schema, fileFormat, 1); // access a single value in a struct for (int i = 0; i < records.size(); i++) { GenericRecord expectedStruct = (GenericRecord) records.get(i).getField("structofprimitives"); - List queryResult = shell.executeStatement(String.format( - "SELECT structofprimitives.key, structofprimitives.value FROM default.structtable LIMIT 1 OFFSET %d", i)); + List queryResult = + shell.executeStatement( + String.format( + "SELECT structofprimitives.key, structofprimitives.value FROM default.structtable LIMIT 1 OFFSET %d", + i)); Assert.assertEquals(expectedStruct.getField("key"), queryResult.get(0)[0]); Assert.assertEquals(expectedStruct.getField("value"), queryResult.get(0)[1]); } @@ -494,25 +650,35 @@ public void testStructOfPrimitivesInTable() throws IOException { @Test public void testStructOfArraysInTable() throws IOException { - Schema schema = new Schema( - required(1, "structofarrays", Types.StructType - .of(required(2, "names", Types.ListType.ofRequired(3, Types.StringType.get())), - required(4, "birthdays", Types.ListType.ofRequired(5, - Types.DateType.get()))))); - List records = testTables.createTableWithGeneratedRecords(shell, "structtable", schema, fileFormat, 1); + Schema schema = + new Schema( + required( + 1, + "structofarrays", + Types.StructType.of( + required(2, "names", Types.ListType.ofRequired(3, Types.StringType.get())), + required(4, "birthdays", Types.ListType.ofRequired(5, Types.DateType.get()))))); + List records = + testTables.createTableWithGeneratedRecords(shell, "structtable", schema, fileFormat, 1); // access an element of an array inside a struct for (int i = 0; i < records.size(); i++) { GenericRecord expectedStruct = (GenericRecord) 
records.get(i).getField("structofarrays"); List expectedList = (List) expectedStruct.getField("names"); for (int j = 0; j < expectedList.size(); j++) { - List queryResult = shell.executeStatement( - String.format("SELECT structofarrays.names[%d] FROM default.structtable LIMIT 1 OFFSET %d", j, i)); + List queryResult = + shell.executeStatement( + String.format( + "SELECT structofarrays.names[%d] FROM default.structtable LIMIT 1 OFFSET %d", + j, i)); Assert.assertEquals(expectedList.get(j), queryResult.get(0)[0]); } expectedList = (List) expectedStruct.getField("birthdays"); for (int j = 0; j < expectedList.size(); j++) { - List queryResult = shell.executeStatement( - String.format("SELECT structofarrays.birthdays[%d] FROM default.structtable LIMIT 1 OFFSET %d", j, i)); + List queryResult = + shell.executeStatement( + String.format( + "SELECT structofarrays.birthdays[%d] FROM default.structtable LIMIT 1 OFFSET %d", + j, i)); Assert.assertEquals(expectedList.get(j).toString(), queryResult.get(0)[0]); } } @@ -520,28 +686,43 @@ public void testStructOfArraysInTable() throws IOException { @Test public void testStructOfMapsInTable() throws IOException { - Schema schema = new Schema( - required(1, "structofmaps", Types.StructType - .of(required(2, "map1", Types.MapType.ofRequired(3, 4, - Types.StringType.get(), Types.StringType.get())), required(5, "map2", - Types.MapType.ofRequired(6, 7, Types.StringType.get(), - Types.IntegerType.get()))))); - List records = testTables.createTableWithGeneratedRecords(shell, "structtable", schema, fileFormat, 1); + Schema schema = + new Schema( + required( + 1, + "structofmaps", + Types.StructType.of( + required( + 2, + "map1", + Types.MapType.ofRequired( + 3, 4, Types.StringType.get(), Types.StringType.get())), + required( + 5, + "map2", + Types.MapType.ofRequired( + 6, 7, Types.StringType.get(), Types.IntegerType.get()))))); + List records = + testTables.createTableWithGeneratedRecords(shell, "structtable", schema, fileFormat, 1); // access a map entry inside a struct for (int i = 0; i < records.size(); i++) { GenericRecord expectedStruct = (GenericRecord) records.get(i).getField("structofmaps"); Map expectedMap = (Map) expectedStruct.getField("map1"); for (Map.Entry entry : expectedMap.entrySet()) { - List queryResult = shell.executeStatement(String - .format("SELECT structofmaps.map1[\"%s\"] from default.structtable LIMIT 1 OFFSET %d", entry.getKey(), - i)); + List queryResult = + shell.executeStatement( + String.format( + "SELECT structofmaps.map1[\"%s\"] from default.structtable LIMIT 1 OFFSET %d", + entry.getKey(), i)); Assert.assertEquals(entry.getValue(), queryResult.get(0)[0]); } expectedMap = (Map) expectedStruct.getField("map2"); for (Map.Entry entry : expectedMap.entrySet()) { - List queryResult = shell.executeStatement(String - .format("SELECT structofmaps.map2[\"%s\"] from default.structtable LIMIT 1 OFFSET %d", entry.getKey(), - i)); + List queryResult = + shell.executeStatement( + String.format( + "SELECT structofmaps.map2[\"%s\"] from default.structtable LIMIT 1 OFFSET %d", + entry.getKey(), i)); Assert.assertEquals(entry.getValue(), queryResult.get(0)[0]); } } @@ -549,25 +730,42 @@ public void testStructOfMapsInTable() throws IOException { @Test public void testStructOfStructsInTable() throws IOException { - Schema schema = new Schema( - required(1, "structofstructs", Types.StructType.of(required(2, "struct1", Types.StructType - .of(required(3, "key", Types.StringType.get()), required(4, "value", - Types.IntegerType.get())))))); - List records 
= testTables.createTableWithGeneratedRecords(shell, "structtable", schema, fileFormat, 1); + Schema schema = + new Schema( + required( + 1, + "structofstructs", + Types.StructType.of( + required( + 2, + "struct1", + Types.StructType.of( + required(3, "key", Types.StringType.get()), + required(4, "value", Types.IntegerType.get())))))); + List records = + testTables.createTableWithGeneratedRecords(shell, "structtable", schema, fileFormat, 1); // access a struct element inside a struct for (int i = 0; i < records.size(); i++) { GenericRecord expectedStruct = (GenericRecord) records.get(i).getField("structofstructs"); GenericRecord expectedInnerStruct = (GenericRecord) expectedStruct.getField("struct1"); - List queryResult = shell.executeStatement(String.format( - "SELECT structofstructs.struct1.key, structofstructs.struct1.value FROM default.structtable " + - "LIMIT 1 OFFSET %d", i)); + List queryResult = + shell.executeStatement( + String.format( + "SELECT structofstructs.struct1.key, structofstructs.struct1.value FROM default.structtable " + + "LIMIT 1 OFFSET %d", + i)); Assert.assertEquals(expectedInnerStruct.getField("key"), queryResult.get(0)[0]); Assert.assertEquals(expectedInnerStruct.getField("value"), queryResult.get(0)[1]); } } - private void runCreateAndReadTest(TableIdentifier identifier, String createSQL, Schema expectedSchema, - PartitionSpec expectedSpec, Map> data) throws IOException { + private void runCreateAndReadTest( + TableIdentifier identifier, + String createSQL, + Schema expectedSchema, + PartitionSpec expectedSpec, + Map> data) + throws IOException { shell.executeStatement(createSQL); org.apache.iceberg.Table icebergTable = testTables.loadTable(identifier); @@ -576,7 +774,8 @@ private void runCreateAndReadTest(TableIdentifier identifier, String createSQL, List expected = Lists.newArrayList(); for (StructLike partition : data.keySet()) { - testTables.appendIcebergTable(shell.getHiveConf(), icebergTable, fileFormat, partition, data.get(partition)); + testTables.appendIcebergTable( + shell.getHiveConf(), icebergTable, fileFormat, partition, data.get(partition)); expected.addAll(data.get(partition)); } diff --git a/mr/src/test/java/org/apache/iceberg/mr/hive/TestHiveIcebergStorageHandlerNoScan.java b/mr/src/test/java/org/apache/iceberg/mr/hive/TestHiveIcebergStorageHandlerNoScan.java index 7a28a9536b6b..0326ef4e7e21 100644 --- a/mr/src/test/java/org/apache/iceberg/mr/hive/TestHiveIcebergStorageHandlerNoScan.java +++ b/mr/src/test/java/org/apache/iceberg/mr/hive/TestHiveIcebergStorageHandlerNoScan.java @@ -16,9 +16,13 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.mr.hive; +import static org.apache.iceberg.TableProperties.GC_ENABLED; +import static org.apache.iceberg.types.Types.NestedField.optional; +import static org.junit.runners.Parameterized.Parameter; +import static org.junit.runners.Parameterized.Parameters; + import java.io.IOException; import java.util.Collection; import java.util.Collections; @@ -70,47 +74,57 @@ import org.junit.runner.RunWith; import org.junit.runners.Parameterized; -import static org.apache.iceberg.TableProperties.GC_ENABLED; -import static org.apache.iceberg.types.Types.NestedField.optional; -import static org.junit.runners.Parameterized.Parameter; -import static org.junit.runners.Parameterized.Parameters; - @RunWith(Parameterized.class) public class TestHiveIcebergStorageHandlerNoScan { private static final PartitionSpec SPEC = PartitionSpec.unpartitioned(); - private static final Schema COMPLEX_SCHEMA = new Schema( - optional(1, "id", Types.LongType.get()), - optional(2, "name", Types.StringType.get()), - optional(3, "employee_info", Types.StructType.of( - optional(7, "employer", Types.StringType.get()), - optional(8, "id", Types.LongType.get()), - optional(9, "address", Types.StringType.get()) - )), - optional(4, "places_lived", Types.ListType.ofOptional(10, Types.StructType.of( - optional(11, "street", Types.StringType.get()), - optional(12, "city", Types.StringType.get()), - optional(13, "country", Types.StringType.get()) - ))), - optional(5, "memorable_moments", Types.MapType.ofOptional(14, 15, - Types.StringType.get(), - Types.StructType.of( - optional(16, "year", Types.IntegerType.get()), - optional(17, "place", Types.StringType.get()), - optional(18, "details", Types.StringType.get()) - ))), - optional(6, "current_address", Types.StructType.of( - optional(19, "street_address", Types.StructType.of( - optional(22, "street_number", Types.IntegerType.get()), - optional(23, "street_name", Types.StringType.get()), - optional(24, "street_type", Types.StringType.get()) - )), - optional(20, "country", Types.StringType.get()), - optional(21, "postal_code", Types.StringType.get()) - )) - ); - - private static final Set IGNORED_PARAMS = ImmutableSet.of("bucketing_version", "numFilesErasureCoded"); + private static final Schema COMPLEX_SCHEMA = + new Schema( + optional(1, "id", Types.LongType.get()), + optional(2, "name", Types.StringType.get()), + optional( + 3, + "employee_info", + Types.StructType.of( + optional(7, "employer", Types.StringType.get()), + optional(8, "id", Types.LongType.get()), + optional(9, "address", Types.StringType.get()))), + optional( + 4, + "places_lived", + Types.ListType.ofOptional( + 10, + Types.StructType.of( + optional(11, "street", Types.StringType.get()), + optional(12, "city", Types.StringType.get()), + optional(13, "country", Types.StringType.get())))), + optional( + 5, + "memorable_moments", + Types.MapType.ofOptional( + 14, + 15, + Types.StringType.get(), + Types.StructType.of( + optional(16, "year", Types.IntegerType.get()), + optional(17, "place", Types.StringType.get()), + optional(18, "details", Types.StringType.get())))), + optional( + 6, + "current_address", + Types.StructType.of( + optional( + 19, + "street_address", + Types.StructType.of( + optional(22, "street_number", Types.IntegerType.get()), + optional(23, "street_name", Types.StringType.get()), + optional(24, "street_type", Types.StringType.get()))), + optional(20, "country", Types.StringType.get()), + optional(21, "postal_code", Types.StringType.get())))); + + private static final Set 
IGNORED_PARAMS = + ImmutableSet.of("bucketing_version", "numFilesErasureCoded"); @Parameters(name = "catalog={0}") public static Collection parameters() { @@ -129,8 +143,7 @@ public static Collection parameters() { @Parameter(0) public TestTables.TestTableType testTableType; - @Rule - public TemporaryFolder temp = new TemporaryFolder(); + @Rule public TemporaryFolder temp = new TemporaryFolder(); @BeforeClass public static void beforeClass() { @@ -145,7 +158,8 @@ public static void afterClass() throws Exception { @Before public void before() throws IOException { testTables = HiveIcebergStorageHandlerTestUtils.testTables(shell, testTableType, temp); - // Uses spark as an engine so we can detect if we unintentionally try to use any execution engines + // Uses spark as an engine so we can detect if we unintentionally try to use any execution + // engines HiveIcebergStorageHandlerTestUtils.init(shell, testTables, temp, "spark"); } @@ -158,36 +172,51 @@ public void after() throws Exception { public void testCreateDropTable() throws TException, IOException, InterruptedException { TableIdentifier identifier = TableIdentifier.of("default", "customers"); - shell.executeStatement("CREATE EXTERNAL TABLE customers " + - "STORED BY 'org.apache.iceberg.mr.hive.HiveIcebergStorageHandler' " + - testTables.locationForCreateTableSQL(identifier) + - "TBLPROPERTIES ('" + InputFormatConfig.TABLE_SCHEMA + "'='" + - SchemaParser.toJson(HiveIcebergStorageHandlerTestUtils.CUSTOMER_SCHEMA) + "', " + - "'" + InputFormatConfig.PARTITION_SPEC + "'='" + - PartitionSpecParser.toJson(PartitionSpec.unpartitioned()) + "', " + - "'dummy'='test', " + - "'" + InputFormatConfig.CATALOG_NAME + "'='" + testTables.catalogName() + "')"); + shell.executeStatement( + "CREATE EXTERNAL TABLE customers " + + "STORED BY 'org.apache.iceberg.mr.hive.HiveIcebergStorageHandler' " + + testTables.locationForCreateTableSQL(identifier) + + "TBLPROPERTIES ('" + + InputFormatConfig.TABLE_SCHEMA + + "'='" + + SchemaParser.toJson(HiveIcebergStorageHandlerTestUtils.CUSTOMER_SCHEMA) + + "', " + + "'" + + InputFormatConfig.PARTITION_SPEC + + "'='" + + PartitionSpecParser.toJson(PartitionSpec.unpartitioned()) + + "', " + + "'dummy'='test', " + + "'" + + InputFormatConfig.CATALOG_NAME + + "'='" + + testTables.catalogName() + + "')"); // Check the Iceberg table data org.apache.iceberg.Table icebergTable = testTables.loadTable(identifier); - Assert.assertEquals(HiveIcebergStorageHandlerTestUtils.CUSTOMER_SCHEMA.asStruct(), + Assert.assertEquals( + HiveIcebergStorageHandlerTestUtils.CUSTOMER_SCHEMA.asStruct(), icebergTable.schema().asStruct()); Assert.assertEquals(PartitionSpec.unpartitioned(), icebergTable.spec()); - org.apache.hadoop.hive.metastore.api.Table hmsTable = shell.metastore().getTable("default", "customers"); + org.apache.hadoop.hive.metastore.api.Table hmsTable = + shell.metastore().getTable("default", "customers"); Properties tableProperties = new Properties(); hmsTable.getParameters().entrySet().stream() - .filter(e -> !IGNORED_PARAMS.contains(e.getKey())) - .forEach(e -> tableProperties.put(e.getKey(), e.getValue())); + .filter(e -> !IGNORED_PARAMS.contains(e.getKey())) + .forEach(e -> tableProperties.put(e.getKey(), e.getValue())); if (!Catalogs.hiveCatalog(shell.getHiveConf(), tableProperties)) { shell.executeStatement("DROP TABLE customers"); // Check if the table was really dropped even from the Catalog - AssertHelpers.assertThrows("should throw exception", NoSuchTableException.class, - "Table does not exist", () -> { + 
AssertHelpers.assertThrows( + "should throw exception", + NoSuchTableException.class, + "Table does not exist", + () -> { testTables.loadTable(identifier); - } - ); + }); } else { Path hmsTableLocation = new Path(hmsTable.getSd().getLocation()); @@ -195,17 +224,20 @@ public void testCreateDropTable() throws TException, IOException, InterruptedExc shell.executeStatement("DROP TABLE customers"); // Check if we drop an exception when trying to load the table - AssertHelpers.assertThrows("should throw exception", NoSuchTableException.class, - "Table does not exist", () -> { + AssertHelpers.assertThrows( + "should throw exception", + NoSuchTableException.class, + "Table does not exist", + () -> { testTables.loadTable(identifier); - } - ); + }); // Check if the files are removed FileSystem fs = Util.getFs(hmsTableLocation, shell.getHiveConf()); if (fs.exists(hmsTableLocation)) { // if table directory has been deleted, we're good. This is the expected behavior in Hive4. - // if table directory exists, its contents should have been cleaned up, save for an empty metadata dir (Hive3). + // if table directory exists, its contents should have been cleaned up, save for an empty + // metadata dir (Hive3). Assert.assertEquals(1, fs.listStatus(hmsTableLocation).length); Assert.assertEquals(0, fs.listStatus(new Path(hmsTableLocation, "metadata")).length); } @@ -216,39 +248,56 @@ public void testCreateDropTable() throws TException, IOException, InterruptedExc public void testCreateDropTableNonDefaultCatalog() throws TException, InterruptedException { TableIdentifier identifier = TableIdentifier.of("default", "customers"); String catalogName = "nondefaultcatalog"; - testTables.properties().entrySet() - .forEach(e -> shell.setHiveSessionValue(e.getKey().replace(testTables.catalog, catalogName), e.getValue())); - String createSql = "CREATE EXTERNAL TABLE " + identifier + - " (customer_id BIGINT, first_name STRING COMMENT 'This is first name'," + - " last_name STRING COMMENT 'This is last name')" + - " STORED BY 'org.apache.iceberg.mr.hive.HiveIcebergStorageHandler' " + - testTables.locationForCreateTableSQL(identifier) + - testTables.propertiesForCreateTableSQL(ImmutableMap.of()); + testTables + .properties() + .entrySet() + .forEach( + e -> + shell.setHiveSessionValue( + e.getKey().replace(testTables.catalog, catalogName), e.getValue())); + String createSql = + "CREATE EXTERNAL TABLE " + + identifier + + " (customer_id BIGINT, first_name STRING COMMENT 'This is first name'," + + " last_name STRING COMMENT 'This is last name')" + + " STORED BY 'org.apache.iceberg.mr.hive.HiveIcebergStorageHandler' " + + testTables.locationForCreateTableSQL(identifier) + + testTables.propertiesForCreateTableSQL(ImmutableMap.of()); shell.executeStatement(createSql); Table icebergTable = testTables.loadTable(identifier); - Assert.assertEquals(HiveIcebergStorageHandlerTestUtils.CUSTOMER_SCHEMA.asStruct(), + Assert.assertEquals( + HiveIcebergStorageHandlerTestUtils.CUSTOMER_SCHEMA.asStruct(), icebergTable.schema().asStruct()); shell.executeStatement("DROP TABLE default.customers"); // Check if the table was really dropped even from the Catalog - AssertHelpers.assertThrows("should throw exception", NoSuchTableException.class, - "Table does not exist", () -> { + AssertHelpers.assertThrows( + "should throw exception", + NoSuchTableException.class, + "Table does not exist", + () -> { testTables.loadTable(identifier); - } - ); + }); } @Test public void testCreateTableWithoutSpec() { TableIdentifier identifier = 
TableIdentifier.of("default", "customers"); - shell.executeStatement("CREATE EXTERNAL TABLE customers " + - "STORED BY 'org.apache.iceberg.mr.hive.HiveIcebergStorageHandler' " + - testTables.locationForCreateTableSQL(identifier) + - "TBLPROPERTIES ('" + InputFormatConfig.TABLE_SCHEMA + "'='" + - SchemaParser.toJson(HiveIcebergStorageHandlerTestUtils.CUSTOMER_SCHEMA) + "','" + - InputFormatConfig.CATALOG_NAME + "'='" + testTables.catalogName() + "')"); + shell.executeStatement( + "CREATE EXTERNAL TABLE customers " + + "STORED BY 'org.apache.iceberg.mr.hive.HiveIcebergStorageHandler' " + + testTables.locationForCreateTableSQL(identifier) + + "TBLPROPERTIES ('" + + InputFormatConfig.TABLE_SCHEMA + + "'='" + + SchemaParser.toJson(HiveIcebergStorageHandlerTestUtils.CUSTOMER_SCHEMA) + + "','" + + InputFormatConfig.CATALOG_NAME + + "'='" + + testTables.catalogName() + + "')"); // Check the Iceberg table partition data org.apache.iceberg.Table icebergTable = testTables.loadTable(identifier); @@ -259,14 +308,25 @@ public void testCreateTableWithoutSpec() { public void testCreateTableWithUnpartitionedSpec() { TableIdentifier identifier = TableIdentifier.of("default", "customers"); // We need the location for HadoopTable based tests only - shell.executeStatement("CREATE EXTERNAL TABLE customers " + - "STORED BY 'org.apache.iceberg.mr.hive.HiveIcebergStorageHandler' " + - testTables.locationForCreateTableSQL(identifier) + - "TBLPROPERTIES ('" + InputFormatConfig.TABLE_SCHEMA + "'='" + - SchemaParser.toJson(HiveIcebergStorageHandlerTestUtils.CUSTOMER_SCHEMA) + "', " + - "'" + InputFormatConfig.PARTITION_SPEC + "'='" + - PartitionSpecParser.toJson(PartitionSpec.unpartitioned()) + "', " + - "'" + InputFormatConfig.CATALOG_NAME + "'='" + testTables.catalogName() + "')"); + shell.executeStatement( + "CREATE EXTERNAL TABLE customers " + + "STORED BY 'org.apache.iceberg.mr.hive.HiveIcebergStorageHandler' " + + testTables.locationForCreateTableSQL(identifier) + + "TBLPROPERTIES ('" + + InputFormatConfig.TABLE_SCHEMA + + "'='" + + SchemaParser.toJson(HiveIcebergStorageHandlerTestUtils.CUSTOMER_SCHEMA) + + "', " + + "'" + + InputFormatConfig.PARTITION_SPEC + + "'='" + + PartitionSpecParser.toJson(PartitionSpec.unpartitioned()) + + "', " + + "'" + + InputFormatConfig.CATALOG_NAME + + "'='" + + testTables.catalogName() + + "')"); // Check the Iceberg table partition data org.apache.iceberg.Table icebergTable = testTables.loadTable(identifier); @@ -277,39 +337,67 @@ public void testCreateTableWithUnpartitionedSpec() { public void testCreateTableWithFormatV2ThroughTableProperty() { TableIdentifier identifier = TableIdentifier.of("default", "customers"); // We need the location for HadoopTable based tests only - shell.executeStatement("CREATE EXTERNAL TABLE customers " + - "STORED BY 'org.apache.iceberg.mr.hive.HiveIcebergStorageHandler' " + - testTables.locationForCreateTableSQL(identifier) + - "TBLPROPERTIES ('" + InputFormatConfig.TABLE_SCHEMA + "'='" + - SchemaParser.toJson(HiveIcebergStorageHandlerTestUtils.CUSTOMER_SCHEMA) + "', " + - "'" + InputFormatConfig.PARTITION_SPEC + "'='" + - PartitionSpecParser.toJson(PartitionSpec.unpartitioned()) + "', " + - "'" + InputFormatConfig.CATALOG_NAME + "'='" + testTables.catalogName() + "', " + - "'" + TableProperties.FORMAT_VERSION + "'='" + 2 + "')"); + shell.executeStatement( + "CREATE EXTERNAL TABLE customers " + + "STORED BY 'org.apache.iceberg.mr.hive.HiveIcebergStorageHandler' " + + testTables.locationForCreateTableSQL(identifier) + + "TBLPROPERTIES ('" + + 
InputFormatConfig.TABLE_SCHEMA + + "'='" + + SchemaParser.toJson(HiveIcebergStorageHandlerTestUtils.CUSTOMER_SCHEMA) + + "', " + + "'" + + InputFormatConfig.PARTITION_SPEC + + "'='" + + PartitionSpecParser.toJson(PartitionSpec.unpartitioned()) + + "', " + + "'" + + InputFormatConfig.CATALOG_NAME + + "'='" + + testTables.catalogName() + + "', " + + "'" + + TableProperties.FORMAT_VERSION + + "'='" + + 2 + + "')"); // Check the Iceberg table partition data org.apache.iceberg.Table icebergTable = testTables.loadTable(identifier); - Assert.assertEquals("should create table using format v2", - 2, ((BaseTable) icebergTable).operations().current().formatVersion()); + Assert.assertEquals( + "should create table using format v2", + 2, + ((BaseTable) icebergTable).operations().current().formatVersion()); } @Test public void testDeleteBackingTable() throws TException, IOException, InterruptedException { TableIdentifier identifier = TableIdentifier.of("default", "customers"); - shell.executeStatement("CREATE EXTERNAL TABLE customers " + - "STORED BY 'org.apache.iceberg.mr.hive.HiveIcebergStorageHandler' " + - testTables.locationForCreateTableSQL(identifier) + - "TBLPROPERTIES ('" + InputFormatConfig.TABLE_SCHEMA + "'='" + - SchemaParser.toJson(HiveIcebergStorageHandlerTestUtils.CUSTOMER_SCHEMA) + "', " + - "'" + InputFormatConfig.EXTERNAL_TABLE_PURGE + "'='FALSE', " + - "'" + InputFormatConfig.CATALOG_NAME + "'='" + testTables.catalogName() + "')"); - - org.apache.hadoop.hive.metastore.api.Table hmsTable = shell.metastore().getTable("default", "customers"); + shell.executeStatement( + "CREATE EXTERNAL TABLE customers " + + "STORED BY 'org.apache.iceberg.mr.hive.HiveIcebergStorageHandler' " + + testTables.locationForCreateTableSQL(identifier) + + "TBLPROPERTIES ('" + + InputFormatConfig.TABLE_SCHEMA + + "'='" + + SchemaParser.toJson(HiveIcebergStorageHandlerTestUtils.CUSTOMER_SCHEMA) + + "', " + + "'" + + InputFormatConfig.EXTERNAL_TABLE_PURGE + + "'='FALSE', " + + "'" + + InputFormatConfig.CATALOG_NAME + + "'='" + + testTables.catalogName() + + "')"); + + org.apache.hadoop.hive.metastore.api.Table hmsTable = + shell.metastore().getTable("default", "customers"); Properties tableProperties = new Properties(); hmsTable.getParameters().entrySet().stream() - .filter(e -> !IGNORED_PARAMS.contains(e.getKey())) - .forEach(e -> tableProperties.put(e.getKey(), e.getValue())); + .filter(e -> !IGNORED_PARAMS.contains(e.getKey())) + .forEach(e -> tableProperties.put(e.getKey(), e.getValue())); if (!Catalogs.hiveCatalog(shell.getHiveConf(), tableProperties)) { shell.executeStatement("DROP TABLE customers"); @@ -323,11 +411,13 @@ public void testDeleteBackingTable() throws TException, IOException, Interrupted shell.executeStatement("DROP TABLE customers"); // Check if we drop an exception when trying to drop the table - AssertHelpers.assertThrows("should throw exception", NoSuchTableException.class, - "Table does not exist", () -> { + AssertHelpers.assertThrows( + "should throw exception", + NoSuchTableException.class, + "Table does not exist", + () -> { testTables.loadTable(identifier); - } - ); + }); // Check if the files are kept FileSystem fs = Util.getFs(hmsTableLocation, shell.getHiveConf()); @@ -337,31 +427,44 @@ public void testDeleteBackingTable() throws TException, IOException, Interrupted } @Test - public void testDropTableWithCorruptedMetadata() throws TException, IOException, InterruptedException { - Assume.assumeTrue("Only HiveCatalog attempts to load the Iceberg table prior to dropping it.", + public 
void testDropTableWithCorruptedMetadata() + throws TException, IOException, InterruptedException { + Assume.assumeTrue( + "Only HiveCatalog attempts to load the Iceberg table prior to dropping it.", testTableType == TestTables.TestTableType.HIVE_CATALOG); // create test table TableIdentifier identifier = TableIdentifier.of("default", "customers"); - testTables.createTable(shell, identifier.name(), - HiveIcebergStorageHandlerTestUtils.CUSTOMER_SCHEMA, FileFormat.PARQUET, ImmutableList.of()); + testTables.createTable( + shell, + identifier.name(), + HiveIcebergStorageHandlerTestUtils.CUSTOMER_SCHEMA, + FileFormat.PARQUET, + ImmutableList.of()); // enable data purging (this should set external.table.purge=true on the HMS table) Table table = testTables.loadTable(identifier); table.updateProperties().set(GC_ENABLED, "true").commit(); - // delete its current snapshot file (i.e. corrupt the metadata to make the Iceberg table unloadable) - String metadataLocation = shell.metastore().getTable(identifier) - .getParameters().get(BaseMetastoreTableOperations.METADATA_LOCATION_PROP); + // delete its current snapshot file (i.e. corrupt the metadata to make the Iceberg table + // unloadable) + String metadataLocation = + shell + .metastore() + .getTable(identifier) + .getParameters() + .get(BaseMetastoreTableOperations.METADATA_LOCATION_PROP); table.io().deleteFile(metadataLocation); // check if HMS table is nonetheless still droppable shell.executeStatement(String.format("DROP TABLE %s", identifier)); - AssertHelpers.assertThrows("should throw exception", NoSuchTableException.class, - "Table does not exist", () -> { + AssertHelpers.assertThrows( + "should throw exception", + NoSuchTableException.class, + "Table does not exist", + () -> { testTables.loadTable(identifier); - } - ); + }); } @Test @@ -369,97 +472,141 @@ public void testCreateTableError() { TableIdentifier identifier = TableIdentifier.of("default", "withShell2"); // Wrong schema - AssertHelpers.assertThrows("should throw exception", IllegalArgumentException.class, - "Unrecognized token 'WrongSchema'", () -> { - shell.executeStatement("CREATE EXTERNAL TABLE withShell2 " + - "STORED BY 'org.apache.iceberg.mr.hive.HiveIcebergStorageHandler' " + - testTables.locationForCreateTableSQL(identifier) + - "TBLPROPERTIES ('" + InputFormatConfig.TABLE_SCHEMA + "'='WrongSchema'" + - ",'" + InputFormatConfig.CATALOG_NAME + "'='" + testTables.catalogName() + "')"); - } - ); + AssertHelpers.assertThrows( + "should throw exception", + IllegalArgumentException.class, + "Unrecognized token 'WrongSchema'", + () -> { + shell.executeStatement( + "CREATE EXTERNAL TABLE withShell2 " + + "STORED BY 'org.apache.iceberg.mr.hive.HiveIcebergStorageHandler' " + + testTables.locationForCreateTableSQL(identifier) + + "TBLPROPERTIES ('" + + InputFormatConfig.TABLE_SCHEMA + + "'='WrongSchema'" + + ",'" + + InputFormatConfig.CATALOG_NAME + + "'='" + + testTables.catalogName() + + "')"); + }); // Missing schema, we try to get the schema from the table and fail - AssertHelpers.assertThrows("should throw exception", IllegalArgumentException.class, - "Please provide ", () -> { - shell.executeStatement("CREATE EXTERNAL TABLE withShell2 " + - "STORED BY 'org.apache.iceberg.mr.hive.HiveIcebergStorageHandler' " + - testTables.locationForCreateTableSQL(identifier) + - testTables.propertiesForCreateTableSQL(ImmutableMap.of())); - } - ); + AssertHelpers.assertThrows( + "should throw exception", + IllegalArgumentException.class, + "Please provide ", + () -> { + shell.executeStatement( + 
"CREATE EXTERNAL TABLE withShell2 " + + "STORED BY 'org.apache.iceberg.mr.hive.HiveIcebergStorageHandler' " + + testTables.locationForCreateTableSQL(identifier) + + testTables.propertiesForCreateTableSQL(ImmutableMap.of())); + }); if (!testTables.locationForCreateTableSQL(identifier).isEmpty()) { // Only test this if the location is required - AssertHelpers.assertThrows("should throw exception", IllegalArgumentException.class, - "Table location not set", () -> { - shell.executeStatement("CREATE EXTERNAL TABLE withShell2 " + - "STORED BY 'org.apache.iceberg.mr.hive.HiveIcebergStorageHandler' " + - "TBLPROPERTIES ('" + InputFormatConfig.TABLE_SCHEMA + "'='" + - SchemaParser.toJson(HiveIcebergStorageHandlerTestUtils.CUSTOMER_SCHEMA) + "','" + - InputFormatConfig.CATALOG_NAME + "'='" + testTables.catalogName() + "')"); - } - ); + AssertHelpers.assertThrows( + "should throw exception", + IllegalArgumentException.class, + "Table location not set", + () -> { + shell.executeStatement( + "CREATE EXTERNAL TABLE withShell2 " + + "STORED BY 'org.apache.iceberg.mr.hive.HiveIcebergStorageHandler' " + + "TBLPROPERTIES ('" + + InputFormatConfig.TABLE_SCHEMA + + "'='" + + SchemaParser.toJson(HiveIcebergStorageHandlerTestUtils.CUSTOMER_SCHEMA) + + "','" + + InputFormatConfig.CATALOG_NAME + + "'='" + + testTables.catalogName() + + "')"); + }); } } @Test public void testCreateTableAboveExistingTable() throws IOException { // Create the Iceberg table - testTables.createIcebergTable(shell.getHiveConf(), "customers", COMPLEX_SCHEMA, FileFormat.PARQUET, + testTables.createIcebergTable( + shell.getHiveConf(), + "customers", + COMPLEX_SCHEMA, + FileFormat.PARQUET, Collections.emptyList()); if (testTableType == TestTables.TestTableType.HIVE_CATALOG) { // In HiveCatalog we just expect an exception since the table is already exists - AssertHelpers.assertThrows("should throw exception", IllegalArgumentException.class, - "customers already exists", () -> { - shell.executeStatement("CREATE EXTERNAL TABLE customers " + - "STORED BY 'org.apache.iceberg.mr.hive.HiveIcebergStorageHandler' " + - "TBLPROPERTIES ('" + InputFormatConfig.TABLE_SCHEMA + "'='" + - SchemaParser.toJson(HiveIcebergStorageHandlerTestUtils.CUSTOMER_SCHEMA) + "',' " + - InputFormatConfig.CATALOG_NAME + "'='" + testTables.catalogName() + "')"); - } - ); + AssertHelpers.assertThrows( + "should throw exception", + IllegalArgumentException.class, + "customers already exists", + () -> { + shell.executeStatement( + "CREATE EXTERNAL TABLE customers " + + "STORED BY 'org.apache.iceberg.mr.hive.HiveIcebergStorageHandler' " + + "TBLPROPERTIES ('" + + InputFormatConfig.TABLE_SCHEMA + + "'='" + + SchemaParser.toJson(HiveIcebergStorageHandlerTestUtils.CUSTOMER_SCHEMA) + + "',' " + + InputFormatConfig.CATALOG_NAME + + "'='" + + testTables.catalogName() + + "')"); + }); } else { // With other catalogs, table creation should succeed - shell.executeStatement("CREATE EXTERNAL TABLE customers " + - "STORED BY 'org.apache.iceberg.mr.hive.HiveIcebergStorageHandler' " + - testTables.locationForCreateTableSQL(TableIdentifier.of("default", "customers")) + - testTables.propertiesForCreateTableSQL(ImmutableMap.of())); + shell.executeStatement( + "CREATE EXTERNAL TABLE customers " + + "STORED BY 'org.apache.iceberg.mr.hive.HiveIcebergStorageHandler' " + + testTables.locationForCreateTableSQL(TableIdentifier.of("default", "customers")) + + testTables.propertiesForCreateTableSQL(ImmutableMap.of())); } } @Test public void 
testCreatePartitionedTableWithPropertiesAndWithColumnSpecification() { PartitionSpec spec = - PartitionSpec.builderFor(HiveIcebergStorageHandlerTestUtils.CUSTOMER_SCHEMA).identity("last_name").build(); - - AssertHelpers.assertThrows("should throw exception", IllegalArgumentException.class, - "Provide only one of the following", () -> { - shell.executeStatement("CREATE EXTERNAL TABLE customers (customer_id BIGINT) " + - "PARTITIONED BY (first_name STRING) " + - "STORED BY 'org.apache.iceberg.mr.hive.HiveIcebergStorageHandler' " + - testTables.locationForCreateTableSQL(TableIdentifier.of("default", "customers")) + - " TBLPROPERTIES ('" + InputFormatConfig.PARTITION_SPEC + "'='" + - PartitionSpecParser.toJson(spec) + "')"); - } - ); + PartitionSpec.builderFor(HiveIcebergStorageHandlerTestUtils.CUSTOMER_SCHEMA) + .identity("last_name") + .build(); + + AssertHelpers.assertThrows( + "should throw exception", + IllegalArgumentException.class, + "Provide only one of the following", + () -> { + shell.executeStatement( + "CREATE EXTERNAL TABLE customers (customer_id BIGINT) " + + "PARTITIONED BY (first_name STRING) " + + "STORED BY 'org.apache.iceberg.mr.hive.HiveIcebergStorageHandler' " + + testTables.locationForCreateTableSQL(TableIdentifier.of("default", "customers")) + + " TBLPROPERTIES ('" + + InputFormatConfig.PARTITION_SPEC + + "'='" + + PartitionSpecParser.toJson(spec) + + "')"); + }); } @Test public void testCreateTableWithColumnSpecificationHierarchy() { TableIdentifier identifier = TableIdentifier.of("default", "customers"); - shell.executeStatement("CREATE EXTERNAL TABLE customers (" + - "id BIGINT, name STRING, " + - "employee_info STRUCT < employer: STRING, id: BIGINT, address: STRING >, " + - "places_lived ARRAY < STRUCT >, " + - "memorable_moments MAP < STRING, STRUCT < year: INT, place: STRING, details: STRING >>, " + - "current_address STRUCT < street_address: STRUCT " + - ", country: STRING, postal_code: STRING >) " + - "STORED BY 'org.apache.iceberg.mr.hive.HiveIcebergStorageHandler' " + - testTables.locationForCreateTableSQL(identifier) + - testTables.propertiesForCreateTableSQL(ImmutableMap.of())); + shell.executeStatement( + "CREATE EXTERNAL TABLE customers (" + + "id BIGINT, name STRING, " + + "employee_info STRUCT < employer: STRING, id: BIGINT, address: STRING >, " + + "places_lived ARRAY < STRUCT >, " + + "memorable_moments MAP < STRING, STRUCT < year: INT, place: STRING, details: STRING >>, " + + "current_address STRUCT < street_address: STRUCT " + + ", country: STRING, postal_code: STRING >) " + + "STORED BY 'org.apache.iceberg.mr.hive.HiveIcebergStorageHandler' " + + testTables.locationForCreateTableSQL(identifier) + + testTables.propertiesForCreateTableSQL(ImmutableMap.of())); // Check the Iceberg table data org.apache.iceberg.Table icebergTable = testTables.loadTable(identifier); @@ -469,26 +616,27 @@ public void testCreateTableWithColumnSpecificationHierarchy() { @Test public void testCreateTableWithAllSupportedTypes() { TableIdentifier identifier = TableIdentifier.of("default", "all_types"); - Schema allSupportedSchema = new Schema( - optional(1, "t_float", Types.FloatType.get()), - optional(2, "t_double", Types.DoubleType.get()), - optional(3, "t_boolean", Types.BooleanType.get()), - optional(4, "t_int", Types.IntegerType.get()), - optional(5, "t_bigint", Types.LongType.get()), - optional(6, "t_binary", Types.BinaryType.get()), - optional(7, "t_string", Types.StringType.get()), - optional(8, "t_timestamp", Types.TimestampType.withoutZone()), - optional(9, "t_date", 
Types.DateType.get()), - optional(10, "t_decimal", Types.DecimalType.of(3, 2)) - ); + Schema allSupportedSchema = + new Schema( + optional(1, "t_float", Types.FloatType.get()), + optional(2, "t_double", Types.DoubleType.get()), + optional(3, "t_boolean", Types.BooleanType.get()), + optional(4, "t_int", Types.IntegerType.get()), + optional(5, "t_bigint", Types.LongType.get()), + optional(6, "t_binary", Types.BinaryType.get()), + optional(7, "t_string", Types.StringType.get()), + optional(8, "t_timestamp", Types.TimestampType.withoutZone()), + optional(9, "t_date", Types.DateType.get()), + optional(10, "t_decimal", Types.DecimalType.of(3, 2))); // Intentionally adding some mixed letters to test that we handle them correctly - shell.executeStatement("CREATE EXTERNAL TABLE all_types (" + - "t_Float FLOaT, t_dOuble DOUBLE, t_boolean BOOLEAN, t_int INT, t_bigint BIGINT, t_binary BINARY, " + - "t_string STRING, t_timestamp TIMESTAMP, t_date DATE, t_decimal DECIMAL(3,2)) " + - "STORED BY 'org.apache.iceberg.mr.hive.HiveIcebergStorageHandler' " + - testTables.locationForCreateTableSQL(identifier) + - testTables.propertiesForCreateTableSQL(ImmutableMap.of())); + shell.executeStatement( + "CREATE EXTERNAL TABLE all_types (" + + "t_Float FLOaT, t_dOuble DOUBLE, t_boolean BOOLEAN, t_int INT, t_bigint BIGINT, t_binary BINARY, " + + "t_string STRING, t_timestamp TIMESTAMP, t_date DATE, t_decimal DECIMAL(3,2)) " + + "STORED BY 'org.apache.iceberg.mr.hive.HiveIcebergStorageHandler' " + + testTables.locationForCreateTableSQL(identifier) + + testTables.propertiesForCreateTableSQL(ImmutableMap.of())); // Check the Iceberg table data org.apache.iceberg.Table icebergTable = testTables.loadTable(identifier); @@ -499,22 +647,28 @@ public void testCreateTableWithAllSupportedTypes() { public void testCreateTableWithNotSupportedTypes() { TableIdentifier identifier = TableIdentifier.of("default", "not_supported_types"); // Can not create INTERVAL types from normal create table, so leave them out from this test - Map notSupportedTypes = ImmutableMap.of( - "TINYINT", Types.IntegerType.get(), - "SMALLINT", Types.IntegerType.get(), - "VARCHAR(1)", Types.StringType.get(), - "CHAR(1)", Types.StringType.get()); + Map notSupportedTypes = + ImmutableMap.of( + "TINYINT", Types.IntegerType.get(), + "SMALLINT", Types.IntegerType.get(), + "VARCHAR(1)", Types.StringType.get(), + "CHAR(1)", Types.StringType.get()); for (String notSupportedType : notSupportedTypes.keySet()) { - AssertHelpers.assertThrows("should throw exception", IllegalArgumentException.class, - "Unsupported Hive type", () -> { - shell.executeStatement("CREATE EXTERNAL TABLE not_supported_types " + - "(not_supported " + notSupportedType + ") " + - "STORED BY 'org.apache.iceberg.mr.hive.HiveIcebergStorageHandler' " + - testTables.locationForCreateTableSQL(identifier) + - testTables.propertiesForCreateTableSQL(ImmutableMap.of())); - } - ); + AssertHelpers.assertThrows( + "should throw exception", + IllegalArgumentException.class, + "Unsupported Hive type", + () -> { + shell.executeStatement( + "CREATE EXTERNAL TABLE not_supported_types " + + "(not_supported " + + notSupportedType + + ") " + + "STORED BY 'org.apache.iceberg.mr.hive.HiveIcebergStorageHandler' " + + testTables.locationForCreateTableSQL(identifier) + + testTables.propertiesForCreateTableSQL(ImmutableMap.of())); + }); } } @@ -522,22 +676,31 @@ public void testCreateTableWithNotSupportedTypes() { public void testCreateTableWithNotSupportedTypesWithAutoConversion() { TableIdentifier identifier = 
TableIdentifier.of("default", "not_supported_types"); // Can not create INTERVAL types from normal create table, so leave them out from this test - Map notSupportedTypes = ImmutableMap.of( - "TINYINT", Types.IntegerType.get(), - "SMALLINT", Types.IntegerType.get(), - "VARCHAR(1)", Types.StringType.get(), - "CHAR(1)", Types.StringType.get()); + Map notSupportedTypes = + ImmutableMap.of( + "TINYINT", + Types.IntegerType.get(), + "SMALLINT", + Types.IntegerType.get(), + "VARCHAR(1)", + Types.StringType.get(), + "CHAR(1)", + Types.StringType.get()); shell.setHiveSessionValue(InputFormatConfig.SCHEMA_AUTO_CONVERSION, "true"); for (String notSupportedType : notSupportedTypes.keySet()) { - shell.executeStatement("CREATE EXTERNAL TABLE not_supported_types (not_supported " + notSupportedType + ") " + - "STORED BY 'org.apache.iceberg.mr.hive.HiveIcebergStorageHandler' " + - testTables.locationForCreateTableSQL(identifier) + - testTables.propertiesForCreateTableSQL(ImmutableMap.of())); + shell.executeStatement( + "CREATE EXTERNAL TABLE not_supported_types (not_supported " + + notSupportedType + + ") " + + "STORED BY 'org.apache.iceberg.mr.hive.HiveIcebergStorageHandler' " + + testTables.locationForCreateTableSQL(identifier) + + testTables.propertiesForCreateTableSQL(ImmutableMap.of())); org.apache.iceberg.Table icebergTable = testTables.loadTable(identifier); - Assert.assertEquals(notSupportedTypes.get(notSupportedType), icebergTable.schema().columns().get(0).type()); + Assert.assertEquals( + notSupportedTypes.get(notSupportedType), icebergTable.schema().columns().get(0).type()); shell.executeStatement("DROP TABLE not_supported_types"); } } @@ -545,33 +708,40 @@ public void testCreateTableWithNotSupportedTypesWithAutoConversion() { @Test public void testCreateTableWithColumnComments() { TableIdentifier identifier = TableIdentifier.of("default", "comment_table"); - shell.executeStatement("CREATE EXTERNAL TABLE comment_table (" + - "t_int INT COMMENT 'int column', " + - "t_string STRING COMMENT 'string column', " + - "t_string_2 STRING) " + - "STORED BY 'org.apache.iceberg.mr.hive.HiveIcebergStorageHandler' " + - testTables.locationForCreateTableSQL(identifier) + - testTables.propertiesForCreateTableSQL(ImmutableMap.of())); + shell.executeStatement( + "CREATE EXTERNAL TABLE comment_table (" + + "t_int INT COMMENT 'int column', " + + "t_string STRING COMMENT 'string column', " + + "t_string_2 STRING) " + + "STORED BY 'org.apache.iceberg.mr.hive.HiveIcebergStorageHandler' " + + testTables.locationForCreateTableSQL(identifier) + + testTables.propertiesForCreateTableSQL(ImmutableMap.of())); org.apache.iceberg.Table icebergTable = testTables.loadTable(identifier); List rows = shell.executeStatement("DESCRIBE default.comment_table"); Assert.assertEquals(icebergTable.schema().columns().size(), rows.size()); for (int i = 0; i < icebergTable.schema().columns().size(); i++) { Types.NestedField field = icebergTable.schema().columns().get(i); - Assert.assertArrayEquals(new Object[] {field.name(), HiveSchemaUtil.convert(field.type()).getTypeName(), - (field.doc() != null ? field.doc() : "from deserializer")}, rows.get(i)); + Assert.assertArrayEquals( + new Object[] { + field.name(), + HiveSchemaUtil.convert(field.type()).getTypeName(), + (field.doc() != null ? 
field.doc() : "from deserializer") + }, + rows.get(i)); } } @Test public void testCreateTableWithoutColumnComments() { TableIdentifier identifier = TableIdentifier.of("default", "without_comment_table"); - shell.executeStatement("CREATE EXTERNAL TABLE without_comment_table (" + - "t_int INT, " + - "t_string STRING) " + - "STORED BY 'org.apache.iceberg.mr.hive.HiveIcebergStorageHandler' " + - testTables.locationForCreateTableSQL(identifier) + - testTables.propertiesForCreateTableSQL(ImmutableMap.of())); + shell.executeStatement( + "CREATE EXTERNAL TABLE without_comment_table (" + + "t_int INT, " + + "t_string STRING) " + + "STORED BY 'org.apache.iceberg.mr.hive.HiveIcebergStorageHandler' " + + testTables.locationForCreateTableSQL(identifier) + + testTables.propertiesForCreateTableSQL(ImmutableMap.of())); org.apache.iceberg.Table icebergTable = testTables.loadTable(identifier); List rows = shell.executeStatement("DESCRIBE default.without_comment_table"); @@ -579,8 +749,11 @@ public void testCreateTableWithoutColumnComments() { for (int i = 0; i < icebergTable.schema().columns().size(); i++) { Types.NestedField field = icebergTable.schema().columns().get(i); Assert.assertNull(field.doc()); - Assert.assertArrayEquals(new Object[] {field.name(), HiveSchemaUtil.convert(field.type()).getTypeName(), - "from deserializer"}, rows.get(i)); + Assert.assertArrayEquals( + new Object[] { + field.name(), HiveSchemaUtil.convert(field.type()).getTypeName(), "from deserializer" + }, + rows.get(i)); } } @@ -588,15 +761,21 @@ public void testCreateTableWithoutColumnComments() { public void testIcebergAndHmsTableProperties() throws Exception { TableIdentifier identifier = TableIdentifier.of("default", "customers"); - shell.executeStatement(String.format("CREATE EXTERNAL TABLE default.customers " + - "STORED BY 'org.apache.iceberg.mr.hive.HiveIcebergStorageHandler' %s" + - "TBLPROPERTIES ('%s'='%s', '%s'='%s', '%s'='%s', '%s'='%s')", - testTables.locationForCreateTableSQL(identifier), // we need the location for HadoopTable based tests only - InputFormatConfig.TABLE_SCHEMA, SchemaParser.toJson(HiveIcebergStorageHandlerTestUtils.CUSTOMER_SCHEMA), - InputFormatConfig.PARTITION_SPEC, PartitionSpecParser.toJson(SPEC), - "custom_property", "initial_val", - InputFormatConfig.CATALOG_NAME, testTables.catalogName())); - + shell.executeStatement( + String.format( + "CREATE EXTERNAL TABLE default.customers " + + "STORED BY 'org.apache.iceberg.mr.hive.HiveIcebergStorageHandler' %s" + + "TBLPROPERTIES ('%s'='%s', '%s'='%s', '%s'='%s', '%s'='%s')", + testTables.locationForCreateTableSQL( + identifier), // we need the location for HadoopTable based tests only + InputFormatConfig.TABLE_SCHEMA, + SchemaParser.toJson(HiveIcebergStorageHandlerTestUtils.CUSTOMER_SCHEMA), + InputFormatConfig.PARTITION_SPEC, + PartitionSpecParser.toJson(SPEC), + "custom_property", + "initial_val", + InputFormatConfig.CATALOG_NAME, + testTables.catalogName())); // Check the Iceberg table parameters org.apache.iceberg.Table icebergTable = testTables.loadTable(identifier); @@ -607,9 +786,10 @@ public void testIcebergAndHmsTableProperties() throws Exception { expectedIcebergProperties.put("storage_handler", HiveIcebergStorageHandler.class.getName()); // Check the HMS table parameters - org.apache.hadoop.hive.metastore.api.Table hmsTable = shell.metastore().getTable("default", "customers"); - Map hmsParams = hmsTable.getParameters() - .entrySet().stream() + org.apache.hadoop.hive.metastore.api.Table hmsTable = + shell.metastore().getTable("default", 
"customers"); + Map hmsParams = + hmsTable.getParameters().entrySet().stream() .filter(e -> !IGNORED_PARAMS.contains(e.getKey())) .collect(Collectors.toMap(Map.Entry::getKey, Map.Entry::getValue)); Properties tableProperties = new Properties(); @@ -629,13 +809,17 @@ public void testIcebergAndHmsTableProperties() throws Exception { Assert.assertEquals("TRUE", hmsParams.get(InputFormatConfig.EXTERNAL_TABLE_PURGE)); Assert.assertEquals("TRUE", hmsParams.get("EXTERNAL")); Assert.assertEquals("true", hmsParams.get(TableProperties.ENGINE_HIVE_ENABLED)); - Assert.assertEquals(HiveIcebergStorageHandler.class.getName(), + Assert.assertEquals( + HiveIcebergStorageHandler.class.getName(), hmsParams.get(hive_metastoreConstants.META_TABLE_STORAGE)); - Assert.assertEquals(BaseMetastoreTableOperations.ICEBERG_TABLE_TYPE_VALUE.toUpperCase(), + Assert.assertEquals( + BaseMetastoreTableOperations.ICEBERG_TABLE_TYPE_VALUE.toUpperCase(), hmsParams.get(BaseMetastoreTableOperations.TABLE_TYPE_PROP)); - Assert.assertEquals(hmsParams.get(BaseMetastoreTableOperations.METADATA_LOCATION_PROP), - getCurrentSnapshotForHiveCatalogTable(icebergTable)); - Assert.assertNull(hmsParams.get(BaseMetastoreTableOperations.PREVIOUS_METADATA_LOCATION_PROP)); + Assert.assertEquals( + hmsParams.get(BaseMetastoreTableOperations.METADATA_LOCATION_PROP), + getCurrentSnapshotForHiveCatalogTable(icebergTable)); + Assert.assertNull( + hmsParams.get(BaseMetastoreTableOperations.PREVIOUS_METADATA_LOCATION_PROP)); Assert.assertNotNull(hmsParams.get(hive_metastoreConstants.DDL_TIME)); Assert.assertNotNull(hmsParams.get(InputFormatConfig.PARTITION_SPEC)); } else { @@ -645,42 +829,46 @@ public void testIcebergAndHmsTableProperties() throws Exception { // Check HMS inputformat/outputformat/serde Assert.assertEquals(HiveIcebergInputFormat.class.getName(), hmsTable.getSd().getInputFormat()); - Assert.assertEquals(HiveIcebergOutputFormat.class.getName(), hmsTable.getSd().getOutputFormat()); - Assert.assertEquals(HiveIcebergSerDe.class.getName(), hmsTable.getSd().getSerdeInfo().getSerializationLib()); + Assert.assertEquals( + HiveIcebergOutputFormat.class.getName(), hmsTable.getSd().getOutputFormat()); + Assert.assertEquals( + HiveIcebergSerDe.class.getName(), hmsTable.getSd().getSerdeInfo().getSerializationLib()); // Add two new properties to the Iceberg table and update an existing one - icebergTable.updateProperties() + icebergTable + .updateProperties() .set("new_prop_1", "true") .set("new_prop_2", "false") .set("custom_property", "new_val") .commit(); // Refresh the HMS table to see if new Iceberg properties got synced into HMS - hmsParams = shell.metastore().getTable("default", "customers").getParameters() - .entrySet().stream() - .filter(e -> !IGNORED_PARAMS.contains(e.getKey())) - .collect(Collectors.toMap(Map.Entry::getKey, Map.Entry::getValue)); + hmsParams = + shell.metastore().getTable("default", "customers").getParameters().entrySet().stream() + .filter(e -> !IGNORED_PARAMS.contains(e.getKey())) + .collect(Collectors.toMap(Map.Entry::getKey, Map.Entry::getValue)); if (Catalogs.hiveCatalog(shell.getHiveConf(), tableProperties)) { - Assert.assertEquals(16, hmsParams.size()); // 2 newly-added properties + previous_metadata_location prop + Assert.assertEquals( + 16, hmsParams.size()); // 2 newly-added properties + previous_metadata_location prop Assert.assertEquals("true", hmsParams.get("new_prop_1")); Assert.assertEquals("false", hmsParams.get("new_prop_2")); Assert.assertEquals("new_val", hmsParams.get("custom_property")); String 
prevSnapshot = getCurrentSnapshotForHiveCatalogTable(icebergTable); icebergTable.refresh(); String newSnapshot = getCurrentSnapshotForHiveCatalogTable(icebergTable); - Assert.assertEquals(hmsParams.get(BaseMetastoreTableOperations.PREVIOUS_METADATA_LOCATION_PROP), prevSnapshot); - Assert.assertEquals(hmsParams.get(BaseMetastoreTableOperations.METADATA_LOCATION_PROP), newSnapshot); + Assert.assertEquals( + hmsParams.get(BaseMetastoreTableOperations.PREVIOUS_METADATA_LOCATION_PROP), + prevSnapshot); + Assert.assertEquals( + hmsParams.get(BaseMetastoreTableOperations.METADATA_LOCATION_PROP), newSnapshot); } else { Assert.assertEquals(8, hmsParams.size()); } // Remove some Iceberg props and see if they're removed from HMS table props as well if (Catalogs.hiveCatalog(shell.getHiveConf(), tableProperties)) { - icebergTable.updateProperties() - .remove("custom_property") - .remove("new_prop_1") - .commit(); + icebergTable.updateProperties().remove("custom_property").remove("new_prop_1").commit(); hmsParams = shell.metastore().getTable("default", "customers").getParameters(); Assert.assertFalse(hmsParams.containsKey("custom_property")); Assert.assertFalse(hmsParams.containsKey("new_prop_1")); @@ -690,29 +878,42 @@ public void testIcebergAndHmsTableProperties() throws Exception { // append some data and check whether HMS stats are aligned with snapshot summary if (Catalogs.hiveCatalog(shell.getHiveConf(), tableProperties)) { List records = HiveIcebergStorageHandlerTestUtils.CUSTOMER_RECORDS; - testTables.appendIcebergTable(shell.getHiveConf(), icebergTable, FileFormat.PARQUET, null, records); + testTables.appendIcebergTable( + shell.getHiveConf(), icebergTable, FileFormat.PARQUET, null, records); hmsParams = shell.metastore().getTable("default", "customers").getParameters(); Map summary = icebergTable.currentSnapshot().summary(); - Assert.assertEquals(summary.get(SnapshotSummary.TOTAL_DATA_FILES_PROP), hmsParams.get(StatsSetupConst.NUM_FILES)); - Assert.assertEquals(summary.get(SnapshotSummary.TOTAL_RECORDS_PROP), hmsParams.get(StatsSetupConst.ROW_COUNT)); - Assert.assertEquals(summary.get(SnapshotSummary.TOTAL_FILE_SIZE_PROP), hmsParams.get(StatsSetupConst.TOTAL_SIZE)); + Assert.assertEquals( + summary.get(SnapshotSummary.TOTAL_DATA_FILES_PROP), + hmsParams.get(StatsSetupConst.NUM_FILES)); + Assert.assertEquals( + summary.get(SnapshotSummary.TOTAL_RECORDS_PROP), + hmsParams.get(StatsSetupConst.ROW_COUNT)); + Assert.assertEquals( + summary.get(SnapshotSummary.TOTAL_FILE_SIZE_PROP), + hmsParams.get(StatsSetupConst.TOTAL_SIZE)); } } @Test public void testIcebergHMSPropertiesTranslation() throws Exception { - Assume.assumeTrue("Iceberg - HMS property translation is only relevant for HiveCatalog", + Assume.assumeTrue( + "Iceberg - HMS property translation is only relevant for HiveCatalog", testTableType == TestTables.TestTableType.HIVE_CATALOG); TableIdentifier identifier = TableIdentifier.of("default", "customers"); // Create HMS table with with a property to be translated - shell.executeStatement(String.format("CREATE EXTERNAL TABLE default.customers " + - "STORED BY 'org.apache.iceberg.mr.hive.HiveIcebergStorageHandler'" + - "TBLPROPERTIES ('%s'='%s', '%s'='%s', '%s'='%s')", - InputFormatConfig.TABLE_SCHEMA, SchemaParser.toJson(HiveIcebergStorageHandlerTestUtils.CUSTOMER_SCHEMA), - InputFormatConfig.PARTITION_SPEC, PartitionSpecParser.toJson(SPEC), - InputFormatConfig.EXTERNAL_TABLE_PURGE, "false")); + shell.executeStatement( + String.format( + "CREATE EXTERNAL TABLE default.customers " + + "STORED 
BY 'org.apache.iceberg.mr.hive.HiveIcebergStorageHandler'" + + "TBLPROPERTIES ('%s'='%s', '%s'='%s', '%s'='%s')", + InputFormatConfig.TABLE_SCHEMA, + SchemaParser.toJson(HiveIcebergStorageHandlerTestUtils.CUSTOMER_SCHEMA), + InputFormatConfig.PARTITION_SPEC, + PartitionSpecParser.toJson(SPEC), + InputFormatConfig.EXTERNAL_TABLE_PURGE, + "false")); // Check that HMS table prop was translated to equivalent Iceberg prop (purge -> gc.enabled) org.apache.iceberg.Table icebergTable = testTables.loadTable(identifier); @@ -720,12 +921,11 @@ public void testIcebergHMSPropertiesTranslation() throws Exception { Assert.assertNull(icebergTable.properties().get(InputFormatConfig.EXTERNAL_TABLE_PURGE)); // Change Iceberg prop - icebergTable.updateProperties() - .set(GC_ENABLED, "true") - .commit(); + icebergTable.updateProperties().set(GC_ENABLED, "true").commit(); // Check that Iceberg prop was translated to equivalent HMS prop (gc.enabled -> purge) - Map hmsParams = shell.metastore().getTable("default", "customers").getParameters(); + Map hmsParams = + shell.metastore().getTable("default", "customers").getParameters(); Assert.assertEquals("true", hmsParams.get(InputFormatConfig.EXTERNAL_TABLE_PURGE)); Assert.assertNull(hmsParams.get(GC_ENABLED)); } @@ -734,11 +934,20 @@ public void testIcebergHMSPropertiesTranslation() throws Exception { public void testDropTableWithAppendedData() throws IOException { TableIdentifier identifier = TableIdentifier.of("default", "customers"); - testTables.createTable(shell, identifier.name(), HiveIcebergStorageHandlerTestUtils.CUSTOMER_SCHEMA, SPEC, - FileFormat.PARQUET, ImmutableList.of()); + testTables.createTable( + shell, + identifier.name(), + HiveIcebergStorageHandlerTestUtils.CUSTOMER_SCHEMA, + SPEC, + FileFormat.PARQUET, + ImmutableList.of()); org.apache.iceberg.Table icebergTable = testTables.loadTable(identifier); - testTables.appendIcebergTable(shell.getHiveConf(), icebergTable, FileFormat.PARQUET, null, + testTables.appendIcebergTable( + shell.getHiveConf(), + icebergTable, + FileFormat.PARQUET, + null, HiveIcebergStorageHandlerTestUtils.CUSTOMER_RECORDS); shell.executeStatement("DROP TABLE customers"); @@ -746,19 +955,24 @@ public void testDropTableWithAppendedData() throws IOException { @Test public void testDropHiveTableWithoutUnderlyingTable() throws IOException { - Assume.assumeFalse("Not relevant for HiveCatalog", - testTableType.equals(TestTables.TestTableType.HIVE_CATALOG)); + Assume.assumeFalse( + "Not relevant for HiveCatalog", + testTableType.equals(TestTables.TestTableType.HIVE_CATALOG)); TableIdentifier identifier = TableIdentifier.of("default", "customers"); // Create the Iceberg table in non-HiveCatalog - testTables.createIcebergTable(shell.getHiveConf(), identifier.name(), - HiveIcebergStorageHandlerTestUtils.CUSTOMER_SCHEMA, FileFormat.PARQUET, + testTables.createIcebergTable( + shell.getHiveConf(), + identifier.name(), + HiveIcebergStorageHandlerTestUtils.CUSTOMER_SCHEMA, + FileFormat.PARQUET, HiveIcebergStorageHandlerTestUtils.CUSTOMER_RECORDS); // Create Hive table on top String tableLocation = testTables.locationForCreateTableSQL(identifier); - shell.executeStatement(testTables.createHiveTableSQL(identifier, - ImmutableMap.of(InputFormatConfig.EXTERNAL_TABLE_PURGE, "TRUE"))); + shell.executeStatement( + testTables.createHiveTableSQL( + identifier, ImmutableMap.of(InputFormatConfig.EXTERNAL_TABLE_PURGE, "TRUE"))); // Drop the Iceberg table Properties properties = new Properties(); @@ -771,6 +985,7 @@ public void 
testDropHiveTableWithoutUnderlyingTable() throws IOException { } private String getCurrentSnapshotForHiveCatalogTable(org.apache.iceberg.Table icebergTable) { - return ((BaseMetastoreTableOperations) ((BaseTable) icebergTable).operations()).currentMetadataLocation(); + return ((BaseMetastoreTableOperations) ((BaseTable) icebergTable).operations()) + .currentMetadataLocation(); } } diff --git a/mr/src/test/java/org/apache/iceberg/mr/hive/TestHiveIcebergStorageHandlerTimezone.java b/mr/src/test/java/org/apache/iceberg/mr/hive/TestHiveIcebergStorageHandlerTimezone.java index 677be087c1af..86e3baf8e759 100644 --- a/mr/src/test/java/org/apache/iceberg/mr/hive/TestHiveIcebergStorageHandlerTimezone.java +++ b/mr/src/test/java/org/apache/iceberg/mr/hive/TestHiveIcebergStorageHandlerTimezone.java @@ -16,9 +16,12 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.mr.hive; +import static org.apache.iceberg.types.Types.NestedField.optional; +import static org.junit.runners.Parameterized.Parameter; +import static org.junit.runners.Parameterized.Parameters; + import java.io.IOException; import java.text.DateFormat; import java.time.LocalDate; @@ -47,33 +50,32 @@ import org.junit.runner.RunWith; import org.junit.runners.Parameterized; -import static org.apache.iceberg.types.Types.NestedField.optional; -import static org.junit.runners.Parameterized.Parameter; -import static org.junit.runners.Parameterized.Parameters; - @RunWith(Parameterized.class) public class TestHiveIcebergStorageHandlerTimezone { private static final Optional> dateFormat = - Optional.ofNullable((ThreadLocal) DynFields.builder() - .hiddenImpl(TimestampWritable.class, "threadLocalDateFormat") - .defaultAlwaysNull() - .buildStatic() - .get()); + Optional.ofNullable( + (ThreadLocal) + DynFields.builder() + .hiddenImpl(TimestampWritable.class, "threadLocalDateFormat") + .defaultAlwaysNull() + .buildStatic() + .get()); private static final Optional> localTimeZone = - Optional.ofNullable((ThreadLocal) DynFields.builder() - .hiddenImpl(DateWritable.class, "LOCAL_TIMEZONE") - .defaultAlwaysNull() - .buildStatic() - .get()); + Optional.ofNullable( + (ThreadLocal) + DynFields.builder() + .hiddenImpl(DateWritable.class, "LOCAL_TIMEZONE") + .defaultAlwaysNull() + .buildStatic() + .get()); @Parameters(name = "timezone={0}") public static Collection parameters() { return ImmutableList.of( new String[] {"America/New_York"}, new String[] {"Asia/Kolkata"}, - new String[] {"UTC/Greenwich"} - ); + new String[] {"UTC/Greenwich"}); } private static TestHiveShell shell; @@ -83,8 +85,7 @@ public static Collection parameters() { @Parameter(0) public String timezoneString; - @Rule - public TemporaryFolder temp = new TemporaryFolder(); + @Rule public TemporaryFolder temp = new TemporaryFolder(); @BeforeClass public static void beforeClass() { @@ -100,13 +101,17 @@ public static void afterClass() throws Exception { public void before() throws IOException { TimeZone.setDefault(TimeZone.getTimeZone(timezoneString)); - // Magic to clean cached date format and local timezone for Hive where the default timezone is used/stored in the + // Magic to clean cached date format and local timezone for Hive where the default timezone is + // used/stored in the // cached object dateFormat.ifPresent(ThreadLocal::remove); localTimeZone.ifPresent(ThreadLocal::remove); - this.testTables = HiveIcebergStorageHandlerTestUtils.testTables(shell, TestTables.TestTableType.HIVE_CATALOG, temp); - // Uses spark as an engine so 
we can detect if we unintentionally try to use any execution engines + this.testTables = + HiveIcebergStorageHandlerTestUtils.testTables( + shell, TestTables.TestTableType.HIVE_CATALOG, temp); + // Uses spark as an engine so we can detect if we unintentionally try to use any execution + // engines HiveIcebergStorageHandlerTestUtils.init(shell, testTables, temp, "spark"); } @@ -119,18 +124,22 @@ public void after() throws Exception { public void testDateQuery() throws IOException { Schema dateSchema = new Schema(optional(1, "d_date", Types.DateType.get())); - List records = TestHelper.RecordsBuilder.newInstance(dateSchema) - .add(LocalDate.of(2020, 1, 21)) - .add(LocalDate.of(2020, 1, 24)) - .build(); + List records = + TestHelper.RecordsBuilder.newInstance(dateSchema) + .add(LocalDate.of(2020, 1, 21)) + .add(LocalDate.of(2020, 1, 24)) + .build(); testTables.createTable(shell, "date_test", dateSchema, FileFormat.PARQUET, records); - List result = shell.executeStatement("SELECT * from date_test WHERE d_date='2020-01-21'"); + List result = + shell.executeStatement("SELECT * from date_test WHERE d_date='2020-01-21'"); Assert.assertEquals(1, result.size()); Assert.assertEquals("2020-01-21", result.get(0)[0]); - result = shell.executeStatement("SELECT * from date_test WHERE d_date in ('2020-01-21', '2020-01-22')"); + result = + shell.executeStatement( + "SELECT * from date_test WHERE d_date in ('2020-01-21', '2020-01-22')"); Assert.assertEquals(1, result.size()); Assert.assertEquals("2020-01-21", result.get(0)[0]); @@ -146,23 +155,27 @@ public void testDateQuery() throws IOException { public void testTimestampQuery() throws IOException { Schema timestampSchema = new Schema(optional(1, "d_ts", Types.TimestampType.withoutZone())); - List records = TestHelper.RecordsBuilder.newInstance(timestampSchema) - .add(LocalDateTime.of(2019, 1, 22, 9, 44, 54, 100000000)) - .add(LocalDateTime.of(2019, 2, 22, 9, 44, 54, 200000000)) - .build(); + List records = + TestHelper.RecordsBuilder.newInstance(timestampSchema) + .add(LocalDateTime.of(2019, 1, 22, 9, 44, 54, 100000000)) + .add(LocalDateTime.of(2019, 2, 22, 9, 44, 54, 200000000)) + .build(); testTables.createTable(shell, "ts_test", timestampSchema, FileFormat.PARQUET, records); - List result = shell.executeStatement("SELECT d_ts FROM ts_test WHERE d_ts='2019-02-22 09:44:54.2'"); + List result = + shell.executeStatement("SELECT d_ts FROM ts_test WHERE d_ts='2019-02-22 09:44:54.2'"); Assert.assertEquals(1, result.size()); Assert.assertEquals("2019-02-22 09:44:54.2", result.get(0)[0]); - result = shell.executeStatement( - "SELECT * FROM ts_test WHERE d_ts in ('2017-01-01 22:30:57.1', '2019-02-22 09:44:54.2')"); + result = + shell.executeStatement( + "SELECT * FROM ts_test WHERE d_ts in ('2017-01-01 22:30:57.1', '2019-02-22 09:44:54.2')"); Assert.assertEquals(1, result.size()); Assert.assertEquals("2019-02-22 09:44:54.2", result.get(0)[0]); - result = shell.executeStatement("SELECT d_ts FROM ts_test WHERE d_ts < '2019-02-22 09:44:54.2'"); + result = + shell.executeStatement("SELECT d_ts FROM ts_test WHERE d_ts < '2019-02-22 09:44:54.2'"); Assert.assertEquals(1, result.size()); Assert.assertEquals("2019-01-22 09:44:54.1", result.get(0)[0]); diff --git a/mr/src/test/java/org/apache/iceberg/mr/hive/TestHiveIcebergStorageHandlerWithEngine.java b/mr/src/test/java/org/apache/iceberg/mr/hive/TestHiveIcebergStorageHandlerWithEngine.java index 38570b879613..4b74ad4fc6ca 100644 --- 
a/mr/src/test/java/org/apache/iceberg/mr/hive/TestHiveIcebergStorageHandlerWithEngine.java +++ b/mr/src/test/java/org/apache/iceberg/mr/hive/TestHiveIcebergStorageHandlerWithEngine.java @@ -16,9 +16,14 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.mr.hive; +import static org.apache.iceberg.types.Types.NestedField.optional; +import static org.apache.iceberg.types.Types.NestedField.required; +import static org.junit.Assume.assumeTrue; +import static org.junit.runners.Parameterized.Parameter; +import static org.junit.runners.Parameterized.Parameters; + import java.io.IOException; import java.util.Collection; import java.util.List; @@ -58,48 +63,54 @@ import org.junit.runner.RunWith; import org.junit.runners.Parameterized; -import static org.apache.iceberg.types.Types.NestedField.optional; -import static org.apache.iceberg.types.Types.NestedField.required; -import static org.junit.Assume.assumeTrue; -import static org.junit.runners.Parameterized.Parameter; -import static org.junit.runners.Parameterized.Parameters; - @RunWith(Parameterized.class) public class TestHiveIcebergStorageHandlerWithEngine { private static final String[] EXECUTION_ENGINES = new String[] {"tez", "mr"}; - private static final Schema ORDER_SCHEMA = new Schema( + private static final Schema ORDER_SCHEMA = + new Schema( required(1, "order_id", Types.LongType.get()), required(2, "customer_id", Types.LongType.get()), required(3, "total", Types.DoubleType.get()), - required(4, "product_id", Types.LongType.get()) - ); + required(4, "product_id", Types.LongType.get())); - private static final List ORDER_RECORDS = TestHelper.RecordsBuilder.newInstance(ORDER_SCHEMA) + private static final List ORDER_RECORDS = + TestHelper.RecordsBuilder.newInstance(ORDER_SCHEMA) .add(100L, 0L, 11.11d, 1L) .add(101L, 0L, 22.22d, 2L) .add(102L, 1L, 33.33d, 3L) .build(); - private static final Schema PRODUCT_SCHEMA = new Schema( + private static final Schema PRODUCT_SCHEMA = + new Schema( optional(1, "id", Types.LongType.get()), optional(2, "name", Types.StringType.get()), - optional(3, "price", Types.DoubleType.get()) - ); + optional(3, "price", Types.DoubleType.get())); - private static final List PRODUCT_RECORDS = TestHelper.RecordsBuilder.newInstance(PRODUCT_SCHEMA) + private static final List PRODUCT_RECORDS = + TestHelper.RecordsBuilder.newInstance(PRODUCT_SCHEMA) .add(1L, "skirt", 11.11d) .add(2L, "tee", 22.22d) .add(3L, "watch", 33.33d) .build(); private static final List SUPPORTED_TYPES = - ImmutableList.of(Types.BooleanType.get(), Types.IntegerType.get(), Types.LongType.get(), - Types.FloatType.get(), Types.DoubleType.get(), Types.DateType.get(), Types.TimestampType.withZone(), - Types.TimestampType.withoutZone(), Types.StringType.get(), Types.BinaryType.get(), - Types.DecimalType.of(3, 1), Types.UUIDType.get(), Types.FixedType.ofLength(5), - Types.TimeType.get()); + ImmutableList.of( + Types.BooleanType.get(), + Types.IntegerType.get(), + Types.LongType.get(), + Types.FloatType.get(), + Types.DoubleType.get(), + Types.DateType.get(), + Types.TimestampType.withZone(), + Types.TimestampType.withoutZone(), + Types.StringType.get(), + Types.BinaryType.get(), + Types.DecimalType.of(3, 1), + Types.UUIDType.get(), + Types.FixedType.ofLength(5), + Types.TimeType.get()); @Parameters(name = "fileFormat={0}, engine={1}, catalog={2}, isVectorized={3}") public static Collection parameters() { @@ -111,11 +122,14 @@ public static Collection parameters() { for (String engine : 
EXECUTION_ENGINES) { // include Tez tests only for Java 8 if (javaVersion.equals("1.8") || "mr".equals(engine)) { - testParams.add(new Object[] {fileFormat, engine, TestTables.TestTableType.HIVE_CATALOG, false}); + testParams.add( + new Object[] {fileFormat, engine, TestTables.TestTableType.HIVE_CATALOG, false}); // test for vectorization=ON in case of ORC format and Tez engine - if ((fileFormat == FileFormat.PARQUET || fileFormat == FileFormat.ORC) && - "tez".equals(engine) && MetastoreUtil.hive3PresentOnClasspath()) { - testParams.add(new Object[] {fileFormat, engine, TestTables.TestTableType.HIVE_CATALOG, true}); + if ((fileFormat == FileFormat.PARQUET || fileFormat == FileFormat.ORC) + && "tez".equals(engine) + && MetastoreUtil.hive3PresentOnClasspath()) { + testParams.add( + new Object[] {fileFormat, engine, TestTables.TestTableType.HIVE_CATALOG, true}); } } } @@ -125,7 +139,7 @@ public static Collection parameters() { // skip HiveCatalog tests as they are added before for (TestTables.TestTableType testTableType : TestTables.ALL_TABLE_TYPES) { if (!TestTables.TestTableType.HIVE_CATALOG.equals(testTableType)) { - testParams.add(new Object[]{FileFormat.PARQUET, "mr", testTableType, false}); + testParams.add(new Object[] {FileFormat.PARQUET, "mr", testTableType, false}); } } @@ -148,11 +162,9 @@ public static Collection parameters() { @Parameter(3) public boolean isVectorized; - @Rule - public TemporaryFolder temp = new TemporaryFolder(); + @Rule public TemporaryFolder temp = new TemporaryFolder(); - @Rule - public Timeout timeout = new Timeout(200_000, TimeUnit.MILLISECONDS); + @Rule public Timeout timeout = new Timeout(200_000, TimeUnit.MILLISECONDS); @BeforeClass public static void beforeClass() { @@ -168,7 +180,8 @@ public static void afterClass() throws Exception { public void before() throws IOException { testTables = HiveIcebergStorageHandlerTestUtils.testTables(shell, testTableType, temp); HiveIcebergStorageHandlerTestUtils.init(shell, testTables, temp, executionEngine); - HiveConf.setBoolVar(shell.getHiveConf(), HiveConf.ConfVars.HIVE_VECTORIZATION_ENABLED, isVectorized); + HiveConf.setBoolVar( + shell.getHiveConf(), HiveConf.ConfVars.HIVE_VECTORIZATION_ENABLED, isVectorized); if (isVectorized) { HiveConf.setVar(shell.getHiveConf(), HiveConf.ConfVars.HIVEFETCHTASKCONVERSION, "none"); } else { @@ -179,21 +192,30 @@ public void before() throws IOException { @After public void after() throws Exception { HiveIcebergStorageHandlerTestUtils.close(shell); - // Mixing mr and tez jobs within the same JVM can cause problems. Mr jobs set the ExecMapper status to done=false - // at the beginning and to done=true at the end. However, tez jobs also rely on this value to see if they should - // proceed, but they do not reset it to done=false at the beginning. Therefore, without calling this after each test - // case, any tez job that follows a completed mr job will erroneously read done=true and will not proceed. + // Mixing mr and tez jobs within the same JVM can cause problems. Mr jobs set the ExecMapper + // status to done=false + // at the beginning and to done=true at the end. However, tez jobs also rely on this value to + // see if they should + // proceed, but they do not reset it to done=false at the beginning. Therefore, without calling + // this after each test + // case, any tez job that follows a completed mr job will erroneously read done=true and will + // not proceed. 
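The parameters() method above enumerates a cross product of file format, execution engine, catalog type and vectorization flag. A stripped-down, self-contained sketch of the same JUnit 4 pattern (hypothetical class, not part of the patch):

import java.util.ArrayList;
import java.util.Collection;
import org.apache.iceberg.FileFormat;
import org.junit.Test;
import org.junit.runner.RunWith;
import org.junit.runners.Parameterized;
import org.junit.runners.Parameterized.Parameter;
import org.junit.runners.Parameterized.Parameters;

@RunWith(Parameterized.class)
public class CrossProductParametersSketch {

  @Parameters(name = "fileFormat={0}, engine={1}")
  public static Collection<Object[]> parameters() {
    Collection<Object[]> testParams = new ArrayList<>();
    for (FileFormat fileFormat : new FileFormat[] {FileFormat.PARQUET, FileFormat.ORC}) {
      for (String engine : new String[] {"tez", "mr"}) {
        testParams.add(new Object[] {fileFormat, engine});
      }
    }
    return testParams;
  }

  @Parameter(0) public FileFormat fileFormat;

  @Parameter(1) public String executionEngine;

  @Test
  public void runsOncePerCombination() {
    // JUnit instantiates this class once per Object[] returned by parameters().
  }
}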
ExecMapper.setDone(false); } @Test public void testScanTable() throws IOException { - testTables.createTable(shell, "customers", HiveIcebergStorageHandlerTestUtils.CUSTOMER_SCHEMA, fileFormat, + testTables.createTable( + shell, + "customers", + HiveIcebergStorageHandlerTestUtils.CUSTOMER_SCHEMA, + fileFormat, HiveIcebergStorageHandlerTestUtils.CUSTOMER_RECORDS); // Adding the ORDER BY clause will cause Hive to spawn a local MR job this time. List descRows = - shell.executeStatement("SELECT first_name, customer_id FROM default.customers ORDER BY customer_id DESC"); + shell.executeStatement( + "SELECT first_name, customer_id FROM default.customers ORDER BY customer_id DESC"); Assert.assertEquals(3, descRows.size()); Assert.assertArrayEquals(new Object[] {"Trudy", 2L}, descRows.get(0)); @@ -208,10 +230,10 @@ public void testCBOWithSelectedColumnsNonOverlapJoin() throws IOException { testTables.createTable(shell, "products", PRODUCT_SCHEMA, fileFormat, PRODUCT_RECORDS); testTables.createTable(shell, "orders", ORDER_SCHEMA, fileFormat, ORDER_RECORDS); - List rows = shell.executeStatement( - "SELECT o.order_id, o.customer_id, o.total, p.name " + - "FROM default.orders o JOIN default.products p ON o.product_id = p.id ORDER BY o.order_id" - ); + List rows = + shell.executeStatement( + "SELECT o.order_id, o.customer_id, o.total, p.name " + + "FROM default.orders o JOIN default.products p ON o.product_id = p.id ORDER BY o.order_id"); Assert.assertEquals(3, rows.size()); Assert.assertArrayEquals(new Object[] {100L, 0L, 11.11d, "skirt"}, rows.get(0)); @@ -221,30 +243,40 @@ public void testCBOWithSelectedColumnsNonOverlapJoin() throws IOException { @Test public void testDescribeTable() throws IOException { - testTables.createTable(shell, "customers", HiveIcebergStorageHandlerTestUtils.CUSTOMER_SCHEMA, fileFormat, - HiveIcebergStorageHandlerTestUtils.CUSTOMER_RECORDS); + testTables.createTable( + shell, + "customers", + HiveIcebergStorageHandlerTestUtils.CUSTOMER_SCHEMA, + fileFormat, + HiveIcebergStorageHandlerTestUtils.CUSTOMER_RECORDS); List rows = shell.executeStatement("DESCRIBE default.customers"); - Assert.assertEquals(HiveIcebergStorageHandlerTestUtils.CUSTOMER_SCHEMA.columns().size(), rows.size()); + Assert.assertEquals( + HiveIcebergStorageHandlerTestUtils.CUSTOMER_SCHEMA.columns().size(), rows.size()); for (int i = 0; i < HiveIcebergStorageHandlerTestUtils.CUSTOMER_SCHEMA.columns().size(); i++) { Types.NestedField field = HiveIcebergStorageHandlerTestUtils.CUSTOMER_SCHEMA.columns().get(i); String comment = field.doc() == null ? 
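The comment at the top of after() describes a static-state leak between MR and Tez jobs running in the same JVM. Shown in isolation below; the import path and class scaffolding are assumptions, the call itself is the one from the patch:

import org.apache.hadoop.hive.ql.exec.mr.ExecMapper;
import org.junit.After;

public class ExecMapperResetSketch {
  @After
  public void resetExecMapperDoneFlag() {
    // MR jobs flip this static flag to true when they finish; Tez jobs consult it but
    // never reset it, so clear it after every test to keep later Tez jobs runnable.
    ExecMapper.setDone(false);
  }
}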
"from deserializer" : field.doc(); - Assert.assertArrayEquals(new Object[] {field.name(), HiveSchemaUtil.convert(field.type()).getTypeName(), - comment}, rows.get(i)); + Assert.assertArrayEquals( + new Object[] {field.name(), HiveSchemaUtil.convert(field.type()).getTypeName(), comment}, + rows.get(i)); } } @Test public void testCBOWithSelectedColumnsOverlapJoin() throws IOException { shell.setHiveSessionValue("hive.cbo.enable", true); - testTables.createTable(shell, "customers", HiveIcebergStorageHandlerTestUtils.CUSTOMER_SCHEMA, fileFormat, - HiveIcebergStorageHandlerTestUtils.CUSTOMER_RECORDS); + testTables.createTable( + shell, + "customers", + HiveIcebergStorageHandlerTestUtils.CUSTOMER_SCHEMA, + fileFormat, + HiveIcebergStorageHandlerTestUtils.CUSTOMER_RECORDS); testTables.createTable(shell, "orders", ORDER_SCHEMA, fileFormat, ORDER_RECORDS); - List rows = shell.executeStatement( - "SELECT c.first_name, o.order_id " + - "FROM default.orders o JOIN default.customers c ON o.customer_id = c.customer_id " + - "ORDER BY o.order_id DESC" - ); + List rows = + shell.executeStatement( + "SELECT c.first_name, o.order_id " + + "FROM default.orders o JOIN default.customers c ON o.customer_id = c.customer_id " + + "ORDER BY o.order_id DESC"); Assert.assertEquals(3, rows.size()); Assert.assertArrayEquals(new Object[] {"Bob", 102L}, rows.get(0)); @@ -258,10 +290,10 @@ public void testCBOWithSelfJoin() throws IOException { testTables.createTable(shell, "orders", ORDER_SCHEMA, fileFormat, ORDER_RECORDS); - List rows = shell.executeStatement( - "SELECT o1.order_id, o1.customer_id, o1.total " + - "FROM default.orders o1 JOIN default.orders o2 ON o1.order_id = o2.order_id ORDER BY o1.order_id" - ); + List rows = + shell.executeStatement( + "SELECT o1.order_id, o1.customer_id, o1.total " + + "FROM default.orders o1 JOIN default.orders o2 ON o1.order_id = o2.order_id ORDER BY o1.order_id"); Assert.assertEquals(3, rows.size()); Assert.assertArrayEquals(new Object[] {100L, 0L, 11.11d}, rows.get(0)); @@ -287,11 +319,24 @@ public void testJoinTablesSupportedTypes() throws IOException { List records = TestHelper.generateRandomRecords(schema, 1, 0L); testTables.createTable(shell, tableName, schema, fileFormat, records); - List queryResult = shell.executeStatement("select s." + columnName + ", h." + columnName + - " from default." + tableName + " s join default." + tableName + " h on h." + columnName + "=s." + - columnName); - Assert.assertEquals("Non matching record count for table " + tableName + " with type " + type, - 1, queryResult.size()); + List queryResult = + shell.executeStatement( + "select s." + + columnName + + ", h." + + columnName + + " from default." + + tableName + + " s join default." + + tableName + + " h on h." + + columnName + + "=s." + + columnName); + Assert.assertEquals( + "Non matching record count for table " + tableName + " with type " + type, + 1, + queryResult.size()); } } @@ -311,10 +356,12 @@ public void testSelectDistinctFromTable() throws IOException { Schema schema = new Schema(required(1, columnName, type)); List records = TestHelper.generateRandomRecords(schema, 4, 0L); - int size = records.stream().map(r -> r.getField(columnName)).collect(Collectors.toSet()).size(); + int size = + records.stream().map(r -> r.getField(columnName)).collect(Collectors.toSet()).size(); testTables.createTable(shell, tableName, schema, fileFormat, records); - List queryResult = shell.executeStatement("select count(distinct(" + columnName + - ")) from default." 
+ tableName); + List queryResult = + shell.executeStatement( + "select count(distinct(" + columnName + ")) from default." + tableName); int distinctIds = ((Long) queryResult.get(0)[0]).intValue(); Assert.assertEquals(tableName, size, distinctIds); } @@ -324,21 +371,33 @@ public void testSelectDistinctFromTable() throws IOException { public void testInsert() throws IOException { Assume.assumeTrue("Tez write is not implemented yet", executionEngine.equals("mr")); - Table table = testTables.createTable(shell, "customers", HiveIcebergStorageHandlerTestUtils.CUSTOMER_SCHEMA, - fileFormat, ImmutableList.of()); + Table table = + testTables.createTable( + shell, + "customers", + HiveIcebergStorageHandlerTestUtils.CUSTOMER_SCHEMA, + fileFormat, + ImmutableList.of()); // The expected query is like // INSERT INTO customers VALUES (0, 'Alice'), (1, 'Bob'), (2, 'Trudy') StringBuilder query = new StringBuilder().append("INSERT INTO customers VALUES "); - HiveIcebergStorageHandlerTestUtils.CUSTOMER_RECORDS.forEach(record -> query.append("(") - .append(record.get(0)).append(",'") - .append(record.get(1)).append("','") - .append(record.get(2)).append("'),")); + HiveIcebergStorageHandlerTestUtils.CUSTOMER_RECORDS.forEach( + record -> + query + .append("(") + .append(record.get(0)) + .append(",'") + .append(record.get(1)) + .append("','") + .append(record.get(2)) + .append("'),")); query.setLength(query.length() - 1); shell.executeStatement(query.toString()); - HiveIcebergTestUtils.validateData(table, HiveIcebergStorageHandlerTestUtils.CUSTOMER_RECORDS, 0); + HiveIcebergTestUtils.validateData( + table, HiveIcebergStorageHandlerTestUtils.CUSTOMER_RECORDS, 0); } @Test @@ -356,11 +415,18 @@ public void testInsertSupportedTypes() throws IOException { } String columnName = type.typeId().toString().toLowerCase() + "_column"; - Schema schema = new Schema(required(1, "id", Types.LongType.get()), required(2, columnName, type)); + Schema schema = + new Schema(required(1, "id", Types.LongType.get()), required(2, columnName, type)); List expected = TestHelper.generateRandomRecords(schema, 5, 0L); - Table table = testTables.createTable(shell, type.typeId().toString().toLowerCase() + "_table_" + i, - schema, PartitionSpec.unpartitioned(), fileFormat, expected); + Table table = + testTables.createTable( + shell, + type.typeId().toString().toLowerCase() + "_table_" + i, + schema, + PartitionSpec.unpartitioned(), + fileFormat, + expected); HiveIcebergTestUtils.validateData(table, expected, 0); } @@ -368,14 +434,20 @@ public void testInsertSupportedTypes() throws IOException { /** * Testing map only inserts. + * * @throws IOException If there is an underlying IOException */ @Test public void testInsertFromSelect() throws IOException { Assume.assumeTrue("Tez write is not implemented yet", executionEngine.equals("mr")); - Table table = testTables.createTable(shell, "customers", HiveIcebergStorageHandlerTestUtils.CUSTOMER_SCHEMA, - fileFormat, HiveIcebergStorageHandlerTestUtils.CUSTOMER_RECORDS); + Table table = + testTables.createTable( + shell, + "customers", + HiveIcebergStorageHandlerTestUtils.CUSTOMER_SCHEMA, + fileFormat, + HiveIcebergStorageHandlerTestUtils.CUSTOMER_RECORDS); shell.executeStatement("INSERT INTO customers SELECT * FROM customers"); @@ -387,14 +459,20 @@ public void testInsertFromSelect() throws IOException { /** * Testing map-reduce inserts. 
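testInsert() above assembles a multi-row VALUES clause by appending one parenthesised group per record and trimming the trailing comma. A self-contained sketch of the same pattern, using the three customer rows that recur throughout these tests (hypothetical standalone class):

import java.util.Arrays;
import java.util.List;

public class InsertValuesBuilderSketch {
  public static void main(String[] args) {
    List<Object[]> customers =
        Arrays.asList(
            new Object[] {0L, "Alice", "Brown"},
            new Object[] {1L, "Bob", "Green"},
            new Object[] {2L, "Trudy", "Pink"});

    StringBuilder query = new StringBuilder("INSERT INTO customers VALUES ");
    customers.forEach(
        c ->
            query
                .append("(")
                .append(c[0])
                .append(",'")
                .append(c[1])
                .append("','")
                .append(c[2])
                .append("'),"));
    query.setLength(query.length() - 1); // drop the trailing comma

    // INSERT INTO customers VALUES (0,'Alice','Brown'),(1,'Bob','Green'),(2,'Trudy','Pink')
    System.out.println(query);
  }
}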
+ * * @throws IOException If there is an underlying IOException */ @Test public void testInsertFromSelectWithOrderBy() throws IOException { Assume.assumeTrue("Tez write is not implemented yet", executionEngine.equals("mr")); - Table table = testTables.createTable(shell, "customers", HiveIcebergStorageHandlerTestUtils.CUSTOMER_SCHEMA, - fileFormat, HiveIcebergStorageHandlerTestUtils.CUSTOMER_RECORDS); + Table table = + testTables.createTable( + shell, + "customers", + HiveIcebergStorageHandlerTestUtils.CUSTOMER_SCHEMA, + fileFormat, + HiveIcebergStorageHandlerTestUtils.CUSTOMER_RECORDS); // We expect that there will be Mappers and Reducers here shell.executeStatement("INSERT INTO customers SELECT * FROM customers ORDER BY customer_id"); @@ -409,17 +487,23 @@ public void testInsertFromSelectWithOrderBy() throws IOException { public void testInsertFromSelectWithProjection() throws IOException { Assume.assumeTrue("Tez write is not implemented yet", executionEngine.equals("mr")); - Table table = testTables.createTable(shell, "customers", HiveIcebergStorageHandlerTestUtils.CUSTOMER_SCHEMA, - fileFormat, ImmutableList.of()); + Table table = + testTables.createTable( + shell, + "customers", + HiveIcebergStorageHandlerTestUtils.CUSTOMER_SCHEMA, + fileFormat, + ImmutableList.of()); testTables.createTable(shell, "orders", ORDER_SCHEMA, fileFormat, ORDER_RECORDS); shell.executeStatement( "INSERT INTO customers (customer_id, last_name) SELECT distinct(customer_id), 'test' FROM orders"); - List expected = TestHelper.RecordsBuilder.newInstance(HiveIcebergStorageHandlerTestUtils.CUSTOMER_SCHEMA) - .add(0L, null, "test") - .add(1L, null, "test") - .build(); + List expected = + TestHelper.RecordsBuilder.newInstance(HiveIcebergStorageHandlerTestUtils.CUSTOMER_SCHEMA) + .add(0L, null, "test") + .add(1L, null, "test") + .build(); HiveIcebergTestUtils.validateData(table, expected, 0); } @@ -429,23 +513,40 @@ public void testInsertUsingSourceTableWithSharedColumnsNames() throws IOExceptio Assume.assumeTrue("Tez write is not implemented yet", executionEngine.equals("mr")); List records = HiveIcebergStorageHandlerTestUtils.CUSTOMER_RECORDS; - PartitionSpec spec = PartitionSpec.builderFor(HiveIcebergStorageHandlerTestUtils.CUSTOMER_SCHEMA) - .identity("last_name").build(); - testTables.createTable(shell, "source_customers", HiveIcebergStorageHandlerTestUtils.CUSTOMER_SCHEMA, - spec, fileFormat, records); - Table table = testTables.createTable(shell, "target_customers", - HiveIcebergStorageHandlerTestUtils.CUSTOMER_SCHEMA, spec, fileFormat, ImmutableList.of()); - - // Below select from source table should produce: "hive.io.file.readcolumn.names=customer_id,last_name". - // Inserting into the target table should not fail because first_name is not selected from the source table - shell.executeStatement("INSERT INTO target_customers SELECT customer_id, 'Sam', last_name FROM source_customers"); + PartitionSpec spec = + PartitionSpec.builderFor(HiveIcebergStorageHandlerTestUtils.CUSTOMER_SCHEMA) + .identity("last_name") + .build(); + testTables.createTable( + shell, + "source_customers", + HiveIcebergStorageHandlerTestUtils.CUSTOMER_SCHEMA, + spec, + fileFormat, + records); + Table table = + testTables.createTable( + shell, + "target_customers", + HiveIcebergStorageHandlerTestUtils.CUSTOMER_SCHEMA, + spec, + fileFormat, + ImmutableList.of()); + + // Below select from source table should produce: + // "hive.io.file.readcolumn.names=customer_id,last_name". 
+ // Inserting into the target table should not fail because first_name is not selected from the + // source table + shell.executeStatement( + "INSERT INTO target_customers SELECT customer_id, 'Sam', last_name FROM source_customers"); List expected = Lists.newArrayListWithExpectedSize(records.size()); - records.forEach(r -> { - Record copy = r.copy(); - copy.setField("first_name", "Sam"); - expected.add(copy); - }); + records.forEach( + r -> { + Record copy = r.copy(); + copy.setField("first_name", "Sam"); + expected.add(copy); + }); HiveIcebergTestUtils.validateData(table, expected, 0); } @@ -453,27 +554,48 @@ public void testInsertUsingSourceTableWithSharedColumnsNames() throws IOExceptio public void testInsertFromJoiningTwoIcebergTables() throws IOException { Assume.assumeTrue("Tez write is not implemented yet", executionEngine.equals("mr")); - PartitionSpec spec = PartitionSpec.builderFor(HiveIcebergStorageHandlerTestUtils.CUSTOMER_SCHEMA) - .identity("last_name").build(); - testTables.createTable(shell, "source_customers_1", HiveIcebergStorageHandlerTestUtils.CUSTOMER_SCHEMA, - spec, fileFormat, HiveIcebergStorageHandlerTestUtils.CUSTOMER_RECORDS); - testTables.createTable(shell, "source_customers_2", HiveIcebergStorageHandlerTestUtils.CUSTOMER_SCHEMA, - spec, fileFormat, HiveIcebergStorageHandlerTestUtils.CUSTOMER_RECORDS); - Table table = testTables.createTable(shell, "target_customers", - HiveIcebergStorageHandlerTestUtils.CUSTOMER_SCHEMA, spec, fileFormat, ImmutableList.of()); + PartitionSpec spec = + PartitionSpec.builderFor(HiveIcebergStorageHandlerTestUtils.CUSTOMER_SCHEMA) + .identity("last_name") + .build(); + testTables.createTable( + shell, + "source_customers_1", + HiveIcebergStorageHandlerTestUtils.CUSTOMER_SCHEMA, + spec, + fileFormat, + HiveIcebergStorageHandlerTestUtils.CUSTOMER_RECORDS); + testTables.createTable( + shell, + "source_customers_2", + HiveIcebergStorageHandlerTestUtils.CUSTOMER_SCHEMA, + spec, + fileFormat, + HiveIcebergStorageHandlerTestUtils.CUSTOMER_RECORDS); + Table table = + testTables.createTable( + shell, + "target_customers", + HiveIcebergStorageHandlerTestUtils.CUSTOMER_SCHEMA, + spec, + fileFormat, + ImmutableList.of()); - shell.executeStatement("INSERT INTO target_customers SELECT a.customer_id, b.first_name, a.last_name FROM " + - "source_customers_1 a JOIN source_customers_2 b ON a.last_name = b.last_name"); + shell.executeStatement( + "INSERT INTO target_customers SELECT a.customer_id, b.first_name, a.last_name FROM " + + "source_customers_1 a JOIN source_customers_2 b ON a.last_name = b.last_name"); - HiveIcebergTestUtils.validateData(table, HiveIcebergStorageHandlerTestUtils.CUSTOMER_RECORDS, 0); + HiveIcebergTestUtils.validateData( + table, HiveIcebergStorageHandlerTestUtils.CUSTOMER_RECORDS, 0); } @Test public void testWriteArrayOfPrimitivesInTable() throws IOException { Assume.assumeTrue("Tez write is not implemented yet", executionEngine.equals("mr")); - Schema schema = new Schema(required(1, "id", Types.LongType.get()), - required(2, "arrayofprimitives", - Types.ListType.ofRequired(3, Types.StringType.get()))); + Schema schema = + new Schema( + required(1, "id", Types.LongType.get()), + required(2, "arrayofprimitives", Types.ListType.ofRequired(3, Types.StringType.get()))); List records = TestHelper.generateRandomRecords(schema, 5, 0L); testComplexTypeWrite(schema, records); } @@ -484,8 +606,11 @@ public void testWriteArrayOfArraysInTable() throws IOException { Schema schema = new Schema( required(1, "id", Types.LongType.get()), - 
required(2, "arrayofarrays", - Types.ListType.ofRequired(3, Types.ListType.ofRequired(4, Types.StringType.get())))); + required( + 2, + "arrayofarrays", + Types.ListType.ofRequired( + 3, Types.ListType.ofRequired(4, Types.StringType.get())))); List records = TestHelper.generateRandomRecords(schema, 3, 1L); testComplexTypeWrite(schema, records); } @@ -494,10 +619,15 @@ public void testWriteArrayOfArraysInTable() throws IOException { public void testWriteArrayOfMapsInTable() throws IOException { Assume.assumeTrue("Tez write is not implemented yet", executionEngine.equals("mr")); Schema schema = - new Schema(required(1, "id", Types.LongType.get()), - required(2, "arrayofmaps", Types.ListType - .ofRequired(3, Types.MapType.ofRequired(4, 5, Types.StringType.get(), - Types.StringType.get())))); + new Schema( + required(1, "id", Types.LongType.get()), + required( + 2, + "arrayofmaps", + Types.ListType.ofRequired( + 3, + Types.MapType.ofRequired( + 4, 5, Types.StringType.get(), Types.StringType.get())))); List records = TestHelper.generateRandomRecords(schema, 5, 1L); testComplexTypeWrite(schema, records); } @@ -506,10 +636,17 @@ public void testWriteArrayOfMapsInTable() throws IOException { public void testWriteArrayOfStructsInTable() throws IOException { Assume.assumeTrue("Tez write is not implemented yet", executionEngine.equals("mr")); Schema schema = - new Schema(required(1, "id", Types.LongType.get()), - required(2, "arrayofstructs", Types.ListType.ofRequired(3, Types.StructType - .of(required(4, "something", Types.StringType.get()), required(5, "someone", - Types.StringType.get()), required(6, "somewhere", Types.StringType.get()))))); + new Schema( + required(1, "id", Types.LongType.get()), + required( + 2, + "arrayofstructs", + Types.ListType.ofRequired( + 3, + Types.StructType.of( + required(4, "something", Types.StringType.get()), + required(5, "someone", Types.StringType.get()), + required(6, "somewhere", Types.StringType.get()))))); List records = TestHelper.generateRandomRecords(schema, 5, 0L); testComplexTypeWrite(schema, records); } @@ -517,9 +654,13 @@ public void testWriteArrayOfStructsInTable() throws IOException { @Test public void testWriteMapOfPrimitivesInTable() throws IOException { Assume.assumeTrue("Tez write is not implemented yet", executionEngine.equals("mr")); - Schema schema = new Schema(required(1, "id", Types.LongType.get()), - required(2, "mapofprimitives", Types.MapType.ofRequired(3, 4, Types.StringType.get(), - Types.StringType.get()))); + Schema schema = + new Schema( + required(1, "id", Types.LongType.get()), + required( + 2, + "mapofprimitives", + Types.MapType.ofRequired(3, 4, Types.StringType.get(), Types.StringType.get()))); List records = TestHelper.generateRandomRecords(schema, 5, 0L); testComplexTypeWrite(schema, records); } @@ -527,10 +668,17 @@ public void testWriteMapOfPrimitivesInTable() throws IOException { @Test public void testWriteMapOfArraysInTable() throws IOException { Assume.assumeTrue("Tez write is not implemented yet", executionEngine.equals("mr")); - Schema schema = new Schema(required(1, "id", Types.LongType.get()), - required(2, "mapofarrays", - Types.MapType.ofRequired(3, 4, Types.StringType.get(), Types.ListType.ofRequired(5, - Types.StringType.get())))); + Schema schema = + new Schema( + required(1, "id", Types.LongType.get()), + required( + 2, + "mapofarrays", + Types.MapType.ofRequired( + 3, + 4, + Types.StringType.get(), + Types.ListType.ofRequired(5, Types.StringType.get())))); List records = 
TestHelper.generateRandomRecords(schema, 5, 0L); testComplexTypeWrite(schema, records); } @@ -538,9 +686,18 @@ public void testWriteMapOfArraysInTable() throws IOException { @Test public void testWriteMapOfMapsInTable() throws IOException { Assume.assumeTrue("Tez write is not implemented yet", executionEngine.equals("mr")); - Schema schema = new Schema(required(1, "id", Types.LongType.get()), - required(2, "mapofmaps", Types.MapType.ofRequired(3, 4, Types.StringType.get(), - Types.MapType.ofRequired(5, 6, Types.StringType.get(), Types.StringType.get())))); + Schema schema = + new Schema( + required(1, "id", Types.LongType.get()), + required( + 2, + "mapofmaps", + Types.MapType.ofRequired( + 3, + 4, + Types.StringType.get(), + Types.MapType.ofRequired( + 5, 6, Types.StringType.get(), Types.StringType.get())))); List records = TestHelper.generateRandomRecords(schema, 5, 0L); testComplexTypeWrite(schema, records); } @@ -548,11 +705,20 @@ public void testWriteMapOfMapsInTable() throws IOException { @Test public void testWriteMapOfStructsInTable() throws IOException { Assume.assumeTrue("Tez write is not implemented yet", executionEngine.equals("mr")); - Schema schema = new Schema(required(1, "id", Types.LongType.get()), - required(2, "mapofstructs", Types.MapType.ofRequired(3, 4, Types.StringType.get(), - Types.StructType.of(required(5, "something", Types.StringType.get()), - required(6, "someone", Types.StringType.get()), - required(7, "somewhere", Types.StringType.get()))))); + Schema schema = + new Schema( + required(1, "id", Types.LongType.get()), + required( + 2, + "mapofstructs", + Types.MapType.ofRequired( + 3, + 4, + Types.StringType.get(), + Types.StructType.of( + required(5, "something", Types.StringType.get()), + required(6, "someone", Types.StringType.get()), + required(7, "somewhere", Types.StringType.get()))))); List records = TestHelper.generateRandomRecords(schema, 5, 0L); testComplexTypeWrite(schema, records); } @@ -560,10 +726,15 @@ public void testWriteMapOfStructsInTable() throws IOException { @Test public void testWriteStructOfPrimitivesInTable() throws IOException { Assume.assumeTrue("Tez write is not implemented yet", executionEngine.equals("mr")); - Schema schema = new Schema(required(1, "id", Types.LongType.get()), - required(2, "structofprimitives", - Types.StructType.of(required(3, "key", Types.StringType.get()), required(4, "value", - Types.StringType.get())))); + Schema schema = + new Schema( + required(1, "id", Types.LongType.get()), + required( + 2, + "structofprimitives", + Types.StructType.of( + required(3, "key", Types.StringType.get()), + required(4, "value", Types.StringType.get())))); List records = TestHelper.generateRandomRecords(schema, 5, 0L); testComplexTypeWrite(schema, records); } @@ -571,11 +742,16 @@ public void testWriteStructOfPrimitivesInTable() throws IOException { @Test public void testWriteStructOfArraysInTable() throws IOException { Assume.assumeTrue("Tez write is not implemented yet", executionEngine.equals("mr")); - Schema schema = new Schema(required(1, "id", Types.LongType.get()), - required(2, "structofarrays", Types.StructType - .of(required(3, "names", Types.ListType.ofRequired(4, Types.StringType.get())), - required(5, "birthdays", Types.ListType.ofRequired(6, - Types.StringType.get()))))); + Schema schema = + new Schema( + required(1, "id", Types.LongType.get()), + required( + 2, + "structofarrays", + Types.StructType.of( + required(3, "names", Types.ListType.ofRequired(4, Types.StringType.get())), + required( + 5, "birthdays", 
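The nested schemas are the interesting part of these write tests; the DESCRIBE test earlier in this class renders such types through HiveSchemaUtil.convert(...).getTypeName(). A small sketch of that mapping for the map-of-structs shape (the org.apache.iceberg.hive package for HiveSchemaUtil is an assumption; the convert/getTypeName calls are the ones used in the patch):

import static org.apache.iceberg.types.Types.NestedField.required;

import org.apache.iceberg.hive.HiveSchemaUtil;
import org.apache.iceberg.types.Types;

public class NestedTypeNameSketch {
  public static void main(String[] args) {
    Types.MapType mapOfStructs =
        Types.MapType.ofRequired(
            3,
            4,
            Types.StringType.get(),
            Types.StructType.of(
                required(5, "something", Types.StringType.get()),
                required(6, "someone", Types.StringType.get()),
                required(7, "somewhere", Types.StringType.get())));

    // Expected to print a Hive type string along the lines of
    // map<string,struct<something:string,someone:string,somewhere:string>>
    System.out.println(HiveSchemaUtil.convert(mapOfStructs).getTypeName());
  }
}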
Types.ListType.ofRequired(6, Types.StringType.get()))))); List records = TestHelper.generateRandomRecords(schema, 5, 1L); testComplexTypeWrite(schema, records); } @@ -583,12 +759,23 @@ public void testWriteStructOfArraysInTable() throws IOException { @Test public void testWriteStructOfMapsInTable() throws IOException { Assume.assumeTrue("Tez write is not implemented yet", executionEngine.equals("mr")); - Schema schema = new Schema(required(1, "id", Types.LongType.get()), - required(2, "structofmaps", Types.StructType - .of(required(3, "map1", Types.MapType.ofRequired(4, 5, - Types.StringType.get(), Types.StringType.get())), required(6, "map2", - Types.MapType.ofRequired(7, 8, Types.StringType.get(), - Types.StringType.get()))))); + Schema schema = + new Schema( + required(1, "id", Types.LongType.get()), + required( + 2, + "structofmaps", + Types.StructType.of( + required( + 3, + "map1", + Types.MapType.ofRequired( + 4, 5, Types.StringType.get(), Types.StringType.get())), + required( + 6, + "map2", + Types.MapType.ofRequired( + 7, 8, Types.StringType.get(), Types.StringType.get()))))); List records = TestHelper.generateRandomRecords(schema, 5, 0L); testComplexTypeWrite(schema, records); } @@ -596,10 +783,19 @@ public void testWriteStructOfMapsInTable() throws IOException { @Test public void testWriteStructOfStructsInTable() throws IOException { Assume.assumeTrue("Tez write is not implemented yet", executionEngine.equals("mr")); - Schema schema = new Schema(required(1, "id", Types.LongType.get()), - required(2, "structofstructs", Types.StructType.of(required(3, "struct1", Types.StructType - .of(required(4, "key", Types.StringType.get()), required(5, "value", - Types.StringType.get())))))); + Schema schema = + new Schema( + required(1, "id", Types.LongType.get()), + required( + 2, + "structofstructs", + Types.StructType.of( + required( + 3, + "struct1", + Types.StructType.of( + required(4, "key", Types.StringType.get()), + required(5, "value", Types.StringType.get())))))); List records = TestHelper.generateRandomRecords(schema, 5, 0L); testComplexTypeWrite(schema, records); } @@ -608,14 +804,22 @@ public void testWriteStructOfStructsInTable() throws IOException { public void testPartitionedWrite() throws IOException { Assume.assumeTrue("Tez write is not implemented yet", executionEngine.equals("mr")); - PartitionSpec spec = PartitionSpec.builderFor(HiveIcebergStorageHandlerTestUtils.CUSTOMER_SCHEMA) - .bucket("customer_id", 3) - .build(); + PartitionSpec spec = + PartitionSpec.builderFor(HiveIcebergStorageHandlerTestUtils.CUSTOMER_SCHEMA) + .bucket("customer_id", 3) + .build(); - List records = TestHelper.generateRandomRecords(HiveIcebergStorageHandlerTestUtils.CUSTOMER_SCHEMA, 4, 0L); + List records = + TestHelper.generateRandomRecords(HiveIcebergStorageHandlerTestUtils.CUSTOMER_SCHEMA, 4, 0L); - Table table = testTables.createTable(shell, "partitioned_customers", - HiveIcebergStorageHandlerTestUtils.CUSTOMER_SCHEMA, spec, fileFormat, records); + Table table = + testTables.createTable( + shell, + "partitioned_customers", + HiveIcebergStorageHandlerTestUtils.CUSTOMER_SCHEMA, + spec, + fileFormat, + records); HiveIcebergTestUtils.validateData(table, records, 0); } @@ -624,14 +828,22 @@ public void testPartitionedWrite() throws IOException { public void testIdentityPartitionedWrite() throws IOException { Assume.assumeTrue("Tez write is not implemented yet", executionEngine.equals("mr")); - PartitionSpec spec = PartitionSpec.builderFor(HiveIcebergStorageHandlerTestUtils.CUSTOMER_SCHEMA) - 
.identity("customer_id") - .build(); + PartitionSpec spec = + PartitionSpec.builderFor(HiveIcebergStorageHandlerTestUtils.CUSTOMER_SCHEMA) + .identity("customer_id") + .build(); - List records = TestHelper.generateRandomRecords(HiveIcebergStorageHandlerTestUtils.CUSTOMER_SCHEMA, 4, 0L); + List records = + TestHelper.generateRandomRecords(HiveIcebergStorageHandlerTestUtils.CUSTOMER_SCHEMA, 4, 0L); - Table table = testTables.createTable(shell, "partitioned_customers", - HiveIcebergStorageHandlerTestUtils.CUSTOMER_SCHEMA, spec, fileFormat, records); + Table table = + testTables.createTable( + shell, + "partitioned_customers", + HiveIcebergStorageHandlerTestUtils.CUSTOMER_SCHEMA, + spec, + fileFormat, + records); HiveIcebergTestUtils.validateData(table, records, 0); } @@ -640,15 +852,23 @@ public void testIdentityPartitionedWrite() throws IOException { public void testMultilevelIdentityPartitionedWrite() throws IOException { Assume.assumeTrue("Tez write is not implemented yet", executionEngine.equals("mr")); - PartitionSpec spec = PartitionSpec.builderFor(HiveIcebergStorageHandlerTestUtils.CUSTOMER_SCHEMA) - .identity("customer_id") - .identity("last_name") - .build(); + PartitionSpec spec = + PartitionSpec.builderFor(HiveIcebergStorageHandlerTestUtils.CUSTOMER_SCHEMA) + .identity("customer_id") + .identity("last_name") + .build(); - List records = TestHelper.generateRandomRecords(HiveIcebergStorageHandlerTestUtils.CUSTOMER_SCHEMA, 4, 0L); + List records = + TestHelper.generateRandomRecords(HiveIcebergStorageHandlerTestUtils.CUSTOMER_SCHEMA, 4, 0L); - Table table = testTables.createTable(shell, "partitioned_customers", - HiveIcebergStorageHandlerTestUtils.CUSTOMER_SCHEMA, spec, fileFormat, records); + Table table = + testTables.createTable( + shell, + "partitioned_customers", + HiveIcebergStorageHandlerTestUtils.CUSTOMER_SCHEMA, + spec, + fileFormat, + records); HiveIcebergTestUtils.validateData(table, records, 0); } @@ -657,38 +877,47 @@ public void testMultilevelIdentityPartitionedWrite() throws IOException { public void testMultiTableInsert() throws IOException { Assume.assumeTrue("Tez write is not implemented yet", executionEngine.equals("mr")); - testTables.createTable(shell, "customers", HiveIcebergStorageHandlerTestUtils.CUSTOMER_SCHEMA, - fileFormat, HiveIcebergStorageHandlerTestUtils.CUSTOMER_RECORDS); - - Schema target1Schema = new Schema( - optional(1, "customer_id", Types.LongType.get()), - optional(2, "first_name", Types.StringType.get()) - ); - - Schema target2Schema = new Schema( - optional(1, "last_name", Types.StringType.get()), - optional(2, "customer_id", Types.LongType.get()) - ); - - List target1Records = TestHelper.RecordsBuilder.newInstance(target1Schema) - .add(0L, "Alice") - .add(1L, "Bob") - .add(2L, "Trudy") - .build(); + testTables.createTable( + shell, + "customers", + HiveIcebergStorageHandlerTestUtils.CUSTOMER_SCHEMA, + fileFormat, + HiveIcebergStorageHandlerTestUtils.CUSTOMER_RECORDS); - List target2Records = TestHelper.RecordsBuilder.newInstance(target2Schema) - .add("Brown", 0L) - .add("Green", 1L) - .add("Pink", 2L) - .build(); + Schema target1Schema = + new Schema( + optional(1, "customer_id", Types.LongType.get()), + optional(2, "first_name", Types.StringType.get())); - Table target1 = testTables.createTable(shell, "target1", target1Schema, fileFormat, ImmutableList.of()); - Table target2 = testTables.createTable(shell, "target2", target2Schema, fileFormat, ImmutableList.of()); + Schema target2Schema = + new Schema( + optional(1, "last_name", 
Types.StringType.get()), + optional(2, "customer_id", Types.LongType.get())); + + List target1Records = + TestHelper.RecordsBuilder.newInstance(target1Schema) + .add(0L, "Alice") + .add(1L, "Bob") + .add(2L, "Trudy") + .build(); + + List target2Records = + TestHelper.RecordsBuilder.newInstance(target2Schema) + .add("Brown", 0L) + .add("Green", 1L) + .add("Pink", 2L) + .build(); + + Table target1 = + testTables.createTable(shell, "target1", target1Schema, fileFormat, ImmutableList.of()); + Table target2 = + testTables.createTable(shell, "target2", target2Schema, fileFormat, ImmutableList.of()); // simple insert: should create a single vertex writing to both target tables - shell.executeStatement("FROM customers " + - "INSERT INTO target1 SELECT customer_id, first_name " + - "INSERT INTO target2 SELECT last_name, customer_id"); + shell.executeStatement( + "FROM customers " + + "INSERT INTO target1 SELECT customer_id, first_name " + + "INSERT INTO target2 SELECT last_name, customer_id"); // Check that everything is as expected HiveIcebergTestUtils.validateData(target1, target1Records, 0); @@ -699,9 +928,10 @@ public void testMultiTableInsert() throws IOException { testTables.truncateIcebergTable(target2); // complex insert: should use a different vertex for each target table - shell.executeStatement("FROM customers " + - "INSERT INTO target1 SELECT customer_id, first_name ORDER BY first_name " + - "INSERT INTO target2 SELECT last_name, customer_id ORDER BY last_name"); + shell.executeStatement( + "FROM customers " + + "INSERT INTO target1 SELECT customer_id, first_name ORDER BY first_name " + + "INSERT INTO target2 SELECT last_name, customer_id ORDER BY last_name"); // Check that everything is as expected HiveIcebergTestUtils.validateData(target1, target1Records, 0); @@ -709,47 +939,57 @@ public void testMultiTableInsert() throws IOException { } /** - * Fix vectorized parquet - * issue-4403. - **/ + * Fix vectorized parquet issue-4403. 
+ */ @Test public void testStructMapWithNull() throws IOException { - Assume.assumeTrue("Vectorized parquet throw class cast exception see : issue 4403", + Assume.assumeTrue( + "Vectorized parquet throw class cast exception see : issue 4403", !("PARQUET".equals(fileFormat.name()) && isVectorized)); - Schema schema = new Schema(required(1, "id", Types.LongType.get()), - required(2, "mapofstructs", Types.MapType.ofRequired(3, 4, Types.StringType.get(), - Types.StructType.of(required(5, "something", Types.StringType.get()), - required(6, "someone", Types.StringType.get()), - required(7, "somewhere", Types.StringType.get()) - )) - ) - ); - - List records = TestHelper.RecordsBuilder.newInstance(schema) - .add(0L, ImmutableMap.of()) - .build(); + Schema schema = + new Schema( + required(1, "id", Types.LongType.get()), + required( + 2, + "mapofstructs", + Types.MapType.ofRequired( + 3, + 4, + Types.StringType.get(), + Types.StructType.of( + required(5, "something", Types.StringType.get()), + required(6, "someone", Types.StringType.get()), + required(7, "somewhere", Types.StringType.get()))))); + + List records = + TestHelper.RecordsBuilder.newInstance(schema).add(0L, ImmutableMap.of()).build(); testTables.createTable(shell, "mapwithnull", schema, fileFormat, records); - List results = shell.executeStatement("select mapofstructs['context'].someone FROM mapwithnull"); + List results = + shell.executeStatement("select mapofstructs['context'].someone FROM mapwithnull"); Assert.assertEquals(1, results.size()); Assert.assertNull(results.get(0)[0]); } @Test public void testWriteWithDefaultWriteFormat() { - Assume.assumeTrue("Testing the default file format is enough for a single scenario.", - executionEngine.equals("mr") && testTableType == TestTables.TestTableType.HIVE_CATALOG && - fileFormat == FileFormat.ORC); + Assume.assumeTrue( + "Testing the default file format is enough for a single scenario.", + executionEngine.equals("mr") + && testTableType == TestTables.TestTableType.HIVE_CATALOG + && fileFormat == FileFormat.ORC); TableIdentifier identifier = TableIdentifier.of("default", "customers"); // create Iceberg table without specifying a write format in the tbl properties // it should fall back to using the default file format - shell.executeStatement(String.format("CREATE EXTERNAL TABLE %s (id bigint, name string) STORED BY '%s' %s", - identifier, - HiveIcebergStorageHandler.class.getName(), - testTables.locationForCreateTableSQL(identifier))); + shell.executeStatement( + String.format( + "CREATE EXTERNAL TABLE %s (id bigint, name string) STORED BY '%s' %s", + identifier, + HiveIcebergStorageHandler.class.getName(), + testTables.locationForCreateTableSQL(identifier))); shell.executeStatement(String.format("INSERT INTO %s VALUES (10, 'Linda')", identifier)); List results = shell.executeStatement(String.format("SELECT * FROM %s", identifier)); @@ -761,16 +1001,30 @@ public void testWriteWithDefaultWriteFormat() { @Test public void testInsertEmptyResultSet() throws IOException { - Table source = testTables.createTable(shell, "source", HiveIcebergStorageHandlerTestUtils.CUSTOMER_SCHEMA, - fileFormat, ImmutableList.of()); - Table target = testTables.createTable(shell, "target", HiveIcebergStorageHandlerTestUtils.CUSTOMER_SCHEMA, - fileFormat, ImmutableList.of()); + Table source = + testTables.createTable( + shell, + "source", + HiveIcebergStorageHandlerTestUtils.CUSTOMER_SCHEMA, + fileFormat, + ImmutableList.of()); + Table target = + testTables.createTable( + shell, + "target", + 
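For readers tracing the String.format in testWriteWithDefaultWriteFormat(), the statement it produces looks roughly like the sketch below; the location clause really comes from testTables.locationForCreateTableSQL(identifier) and varies per catalog, so a placeholder path is used here:

public class CreateTableDdlSketch {
  public static void main(String[] args) {
    // Placeholder location; the real suffix is catalog-specific.
    String locationClause = "LOCATION 'file:///tmp/hive/customers'";

    String ddl =
        String.format(
            "CREATE EXTERNAL TABLE %s (id bigint, name string) STORED BY '%s' %s",
            "default.customers",
            "org.apache.iceberg.mr.hive.HiveIcebergStorageHandler",
            locationClause);

    // CREATE EXTERNAL TABLE default.customers (id bigint, name string)
    //   STORED BY 'org.apache.iceberg.mr.hive.HiveIcebergStorageHandler'
    //   LOCATION 'file:///tmp/hive/customers'
    System.out.println(ddl);
  }
}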
HiveIcebergStorageHandlerTestUtils.CUSTOMER_SCHEMA, + fileFormat, + ImmutableList.of()); shell.executeStatement("INSERT INTO target SELECT * FROM source"); HiveIcebergTestUtils.validateData(target, ImmutableList.of(), 0); - testTables.appendIcebergTable(shell.getHiveConf(), source, fileFormat, null, - HiveIcebergStorageHandlerTestUtils.CUSTOMER_RECORDS); + testTables.appendIcebergTable( + shell.getHiveConf(), + source, + fileFormat, + null, + HiveIcebergStorageHandlerTestUtils.CUSTOMER_RECORDS); shell.executeStatement("INSERT INTO target SELECT * FROM source WHERE first_name = 'Nobody'"); HiveIcebergTestUtils.validateData(target, ImmutableList.of(), 0); } @@ -778,44 +1032,77 @@ public void testInsertEmptyResultSet() throws IOException { @Test public void testStatsPopulation() throws Exception { Assume.assumeTrue("Tez write is not implemented yet", executionEngine.equals("mr")); - Assume.assumeTrue("Only HiveCatalog can remove stats which become obsolete", + Assume.assumeTrue( + "Only HiveCatalog can remove stats which become obsolete", testTableType == TestTables.TestTableType.HIVE_CATALOG); shell.setHiveSessionValue(HiveConf.ConfVars.HIVESTATSAUTOGATHER.varname, true); // create the table using a catalog which supports updating Hive stats (KEEP_HIVE_STATS is true) shell.setHiveSessionValue(ConfigProperties.KEEP_HIVE_STATS, true); TableIdentifier identifier = TableIdentifier.of("default", "customers"); - testTables.createTable(shell, identifier.name(), HiveIcebergStorageHandlerTestUtils.CUSTOMER_SCHEMA, - PartitionSpec.unpartitioned(), fileFormat, ImmutableList.of()); + testTables.createTable( + shell, + identifier.name(), + HiveIcebergStorageHandlerTestUtils.CUSTOMER_SCHEMA, + PartitionSpec.unpartitioned(), + fileFormat, + ImmutableList.of()); // insert some data and check the stats are up-to-date - String insert = testTables.getInsertQuery(HiveIcebergStorageHandlerTestUtils.CUSTOMER_RECORDS, identifier, false); + String insert = + testTables.getInsertQuery( + HiveIcebergStorageHandlerTestUtils.CUSTOMER_RECORDS, identifier, false); shell.executeStatement(insert); - String stats = shell.metastore().getTable(identifier).getParameters().get(StatsSetupConst.COLUMN_STATS_ACCURATE); - Assert.assertTrue(stats.startsWith("{\"BASIC_STATS\":\"true\"")); // it's followed by column stats in Hive3 + String stats = + shell + .metastore() + .getTable(identifier) + .getParameters() + .get(StatsSetupConst.COLUMN_STATS_ACCURATE); + Assert.assertTrue( + stats.startsWith("{\"BASIC_STATS\":\"true\"")); // it's followed by column stats in Hive3 // Create a Catalog where the KEEP_HIVE_STATS is false shell.metastore().hiveConf().set(ConfigProperties.KEEP_HIVE_STATS, StatsSetupConst.FALSE); - TestTables nonHiveTestTables = HiveIcebergStorageHandlerTestUtils.testTables(shell, testTableType, temp); + TestTables nonHiveTestTables = + HiveIcebergStorageHandlerTestUtils.testTables(shell, testTableType, temp); Table nonHiveTable = nonHiveTestTables.loadTable(identifier); - // Append data to the table through a non-Hive engine (in this case, via the java API) -> should remove stats - nonHiveTestTables.appendIcebergTable(shell.getHiveConf(), nonHiveTable, fileFormat, null, + // Append data to the table through a non-Hive engine (in this case, via the java API) -> should + // remove stats + nonHiveTestTables.appendIcebergTable( + shell.getHiveConf(), + nonHiveTable, + fileFormat, + null, HiveIcebergStorageHandlerTestUtils.CUSTOMER_RECORDS); - stats = 
shell.metastore().getTable(identifier).getParameters().get(StatsSetupConst.COLUMN_STATS_ACCURATE); + stats = + shell + .metastore() + .getTable(identifier) + .getParameters() + .get(StatsSetupConst.COLUMN_STATS_ACCURATE); Assert.assertNull(stats); // insert some data again using Hive catalog, and check the stats are back shell.executeStatement(insert); - stats = shell.metastore().getTable(identifier).getParameters().get(StatsSetupConst.COLUMN_STATS_ACCURATE); - Assert.assertTrue(stats.startsWith("{\"BASIC_STATS\":\"true\"")); // it's followed by column stats in Hive3 + stats = + shell + .metastore() + .getTable(identifier) + .getParameters() + .get(StatsSetupConst.COLUMN_STATS_ACCURATE); + Assert.assertTrue( + stats.startsWith("{\"BASIC_STATS\":\"true\"")); // it's followed by column stats in Hive3 } /** - * Tests that vectorized ORC reading code path correctly handles when the same ORC file is split into multiple parts. - * Although the split offsets and length will not always include the file tail that contains the metadata, the - * vectorized reader needs to make sure to handle the tail reading regardless of the offsets. If this is not done - * correctly, the last SELECT query will fail. + * Tests that vectorized ORC reading code path correctly handles when the same ORC file is split + * into multiple parts. Although the split offsets and length will not always include the file + * tail that contains the metadata, the vectorized reader needs to make sure to handle the tail + * reading regardless of the offsets. If this is not done correctly, the last SELECT query will + * fail. + * * @throws Exception - any test error */ @Test @@ -823,15 +1110,22 @@ public void testVectorizedOrcMultipleSplits() throws Exception { assumeTrue(isVectorized && FileFormat.ORC.equals(fileFormat)); // This data will be held by a ~870kB ORC file - List records = TestHelper.generateRandomRecords(HiveIcebergStorageHandlerTestUtils.CUSTOMER_SCHEMA, - 20000, 0L); - - // To support splitting the ORC file, we need to specify the stripe size to a small value. It looks like the min - // value is about 220kB, no smaller stripes are written by ORC. Anyway, this setting will produce 4 stripes. + List records = + TestHelper.generateRandomRecords( + HiveIcebergStorageHandlerTestUtils.CUSTOMER_SCHEMA, 20000, 0L); + + // To support splitting the ORC file, we need to specify the stripe size to a small value. It + // looks like the min + // value is about 220kB, no smaller stripes are written by ORC. Anyway, this setting will + // produce 4 stripes. shell.setHiveSessionValue("orc.stripe.size", "210000"); - testTables.createTable(shell, "targettab", HiveIcebergStorageHandlerTestUtils.CUSTOMER_SCHEMA, - fileFormat, records); + testTables.createTable( + shell, + "targettab", + HiveIcebergStorageHandlerTestUtils.CUSTOMER_SCHEMA, + fileFormat, + records); // Will request 4 splits, separated on the exact stripe boundaries within the ORC file. // (Would request 5 if ORC split generation wouldn't be split (aka stripe) offset aware). @@ -839,57 +1133,86 @@ public void testVectorizedOrcMultipleSplits() throws Exception { List result = shell.executeStatement("SELECT * FROM targettab ORDER BY last_name"); Assert.assertEquals(20000, result.size()); - } @Test public void testRemoveAndAddBackColumnFromIcebergTable() throws IOException { assumeTrue(isVectorized && FileFormat.PARQUET.equals(fileFormat)); - // Create an Iceberg table with the columns customer_id, first_name and last_name with some initial data. 
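Put concretely: a roughly 870 kB data file written with a minimum stripe size of about 210-220 kB can hold three full stripes plus a smaller trailing one (870 / 220 ≈ 3.95), which is why the comment above expects exactly 4 stripes and, with stripe-aligned split generation, exactly 4 splits rather than 5.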
- Table icebergTable = testTables.createTable(shell, "customers", HiveIcebergStorageHandlerTestUtils.CUSTOMER_SCHEMA, - fileFormat, HiveIcebergStorageHandlerTestUtils.CUSTOMER_RECORDS); + // Create an Iceberg table with the columns customer_id, first_name and last_name with some + // initial data. + Table icebergTable = + testTables.createTable( + shell, + "customers", + HiveIcebergStorageHandlerTestUtils.CUSTOMER_SCHEMA, + fileFormat, + HiveIcebergStorageHandlerTestUtils.CUSTOMER_RECORDS); // Remove the first_name column icebergTable.updateSchema().deleteColumn("first_name").commit(); // Add a new column with the name first_name - icebergTable.updateSchema().addColumn("first_name", Types.StringType.get(), "This is new first name").commit(); + icebergTable + .updateSchema() + .addColumn("first_name", Types.StringType.get(), "This is new first name") + .commit(); // Add new data to the table with the new first_name column filled. icebergTable = testTables.loadTable(TableIdentifier.of("default", "customers")); - Schema customerSchemaWithNewFirstName = new Schema(optional(1, "customer_id", Types.LongType.get()), - optional(2, "last_name", Types.StringType.get(), "This is last name"), - optional(3, "first_name", Types.StringType.get(), "This is the newly added first name")); + Schema customerSchemaWithNewFirstName = + new Schema( + optional(1, "customer_id", Types.LongType.get()), + optional(2, "last_name", Types.StringType.get(), "This is last name"), + optional( + 3, "first_name", Types.StringType.get(), "This is the newly added first name")); List newCustomersWithNewFirstName = - TestHelper.RecordsBuilder.newInstance(customerSchemaWithNewFirstName).add(3L, "Red", "James").build(); - testTables.appendIcebergTable(shell.getHiveConf(), icebergTable, fileFormat, null, newCustomersWithNewFirstName); + TestHelper.RecordsBuilder.newInstance(customerSchemaWithNewFirstName) + .add(3L, "Red", "James") + .build(); + testTables.appendIcebergTable( + shell.getHiveConf(), icebergTable, fileFormat, null, newCustomersWithNewFirstName); TestHelper.RecordsBuilder customersWithNewFirstNameBuilder = - TestHelper.RecordsBuilder.newInstance(customerSchemaWithNewFirstName).add(0L, "Brown", null) - .add(1L, "Green", null).add(2L, "Pink", null).add(3L, "Red", "James"); + TestHelper.RecordsBuilder.newInstance(customerSchemaWithNewFirstName) + .add(0L, "Brown", null) + .add(1L, "Green", null) + .add(2L, "Pink", null) + .add(3L, "Red", "James"); List customersWithNewFirstName = customersWithNewFirstNameBuilder.build(); // Run a 'select *' from Hive and check if the first_name column is returned. - // It should be null for the old data and should be filled in the entry added after the column addition. + // It should be null for the old data and should be filled in the entry added after the column + // addition. 
List rows = shell.executeStatement("SELECT * FROM default.customers"); - HiveIcebergTestUtils.validateData(customersWithNewFirstName, - HiveIcebergTestUtils.valueForRow(customerSchemaWithNewFirstName, rows), 0); - - Schema customerSchemaWithNewFirstNameOnly = new Schema(optional(1, "customer_id", Types.LongType.get()), - optional(3, "first_name", Types.StringType.get(), "This is the newly added first name")); + HiveIcebergTestUtils.validateData( + customersWithNewFirstName, + HiveIcebergTestUtils.valueForRow(customerSchemaWithNewFirstName, rows), + 0); - TestHelper.RecordsBuilder customersWithNewFirstNameOnlyBuilder = TestHelper.RecordsBuilder - .newInstance(customerSchemaWithNewFirstNameOnly).add(0L, null).add(1L, null).add(2L, null).add(3L, "James"); + Schema customerSchemaWithNewFirstNameOnly = + new Schema( + optional(1, "customer_id", Types.LongType.get()), + optional( + 3, "first_name", Types.StringType.get(), "This is the newly added first name")); + + TestHelper.RecordsBuilder customersWithNewFirstNameOnlyBuilder = + TestHelper.RecordsBuilder.newInstance(customerSchemaWithNewFirstNameOnly) + .add(0L, null) + .add(1L, null) + .add(2L, null) + .add(3L, "James"); List customersWithNewFirstNameOnly = customersWithNewFirstNameOnlyBuilder.build(); // Run a 'select first_name' from Hive to check if the new first-name column can be queried. rows = shell.executeStatement("SELECT customer_id, first_name FROM default.customers"); - HiveIcebergTestUtils.validateData(customersWithNewFirstNameOnly, - HiveIcebergTestUtils.valueForRow(customerSchemaWithNewFirstNameOnly, rows), 0); - + HiveIcebergTestUtils.validateData( + customersWithNewFirstNameOnly, + HiveIcebergTestUtils.valueForRow(customerSchemaWithNewFirstNameOnly, rows), + 0); } /** * Checks if the certain type is an unsupported vectorized types in Hive 3.1.2 + * * @param type - data type * @return - true if unsupported */ @@ -899,12 +1222,11 @@ private boolean isUnsupportedVectorizedTypeForHive(Type type) { } switch (fileFormat) { case PARQUET: - return Types.DecimalType.of(3, 1).equals(type) || - type == Types.TimestampType.withoutZone() || - type == Types.TimeType.get(); + return Types.DecimalType.of(3, 1).equals(type) + || type == Types.TimestampType.withoutZone() + || type == Types.TimeType.get(); case ORC: - return type == Types.TimestampType.withZone() || - type == Types.TimeType.get(); + return type == Types.TimestampType.withZone() || type == Types.TimeType.get(); default: return false; } @@ -912,18 +1234,27 @@ private boolean isUnsupportedVectorizedTypeForHive(Type type) { private void testComplexTypeWrite(Schema schema, List records) throws IOException { String tableName = "complex_table"; - Table table = testTables.createTable(shell, "complex_table", schema, fileFormat, ImmutableList.of()); + Table table = + testTables.createTable(shell, "complex_table", schema, fileFormat, ImmutableList.of()); String dummyTableName = "dummy"; shell.executeStatement("CREATE TABLE default." + dummyTableName + "(a int)"); shell.executeStatement("INSERT INTO TABLE default." 
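The drop-and-re-add sequence is the core of testRemoveAndAddBackColumnFromIcebergTable(): the re-added first_name column receives a fresh field id, so rows written before the change resolve it to null. The two metadata commits, shown in isolation (the wrapper method is hypothetical; the API calls are the ones from the patch):

import org.apache.iceberg.Table;
import org.apache.iceberg.types.Types;

public class DropAndReAddColumnSketch {
  static void dropAndReAddFirstName(Table icebergTable) {
    // Remove the column; existing data files are untouched, only table metadata changes.
    icebergTable.updateSchema().deleteColumn("first_name").commit();

    // Re-adding a column with the same name assigns a new field id, so old rows
    // read back as null in this column.
    icebergTable
        .updateSchema()
        .addColumn("first_name", Types.StringType.get(), "This is new first name")
        .commit();
  }
}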
+ dummyTableName + " VALUES(1)"); - records.forEach(r -> shell.executeStatement(insertQueryForComplexType(tableName, dummyTableName, schema, r))); + records.forEach( + r -> + shell.executeStatement( + insertQueryForComplexType(tableName, dummyTableName, schema, r))); HiveIcebergTestUtils.validateData(table, records, 0); } - private String insertQueryForComplexType(String tableName, String dummyTableName, Schema schema, Record record) { - StringBuilder query = new StringBuilder("INSERT INTO TABLE ").append(tableName).append(" SELECT ") - .append(record.get(0)).append(", "); + private String insertQueryForComplexType( + String tableName, String dummyTableName, Schema schema, Record record) { + StringBuilder query = + new StringBuilder("INSERT INTO TABLE ") + .append(tableName) + .append(" SELECT ") + .append(record.get(0)) + .append(", "); Type type = schema.asStruct().fields().get(1).type(); query.append(buildComplexTypeInnerQuery(record.get(1), type)); query.setLength(query.length() - 1); @@ -949,16 +1280,27 @@ private StringBuilder buildComplexTypeInnerQuery(Object field, Type type) { Type keyType = ((Types.MapType) type).fields().get(0).type(); Type valueType = ((Types.MapType) type).fields().get(1).type(); if (!entries.isEmpty()) { - entries.entrySet().forEach(e -> query.append(buildComplexTypeInnerQuery(e.getKey(), keyType) - .append(buildComplexTypeInnerQuery(e.getValue(), valueType)))); + entries + .entrySet() + .forEach( + e -> + query.append( + buildComplexTypeInnerQuery(e.getKey(), keyType) + .append(buildComplexTypeInnerQuery(e.getValue(), valueType)))); query.setLength(query.length() - 1); } query.append("),"); } else if (type instanceof Types.StructType) { query.append("named_struct("); - ((GenericRecord) field).struct().fields().stream() - .forEach(f -> query.append(buildComplexTypeInnerQuery(f.name(), Types.StringType.get())) - .append(buildComplexTypeInnerQuery(((GenericRecord) field).getField(f.name()), f.type()))); + ((GenericRecord) field) + .struct().fields().stream() + .forEach( + f -> + query + .append(buildComplexTypeInnerQuery(f.name(), Types.StringType.get())) + .append( + buildComplexTypeInnerQuery( + ((GenericRecord) field).getField(f.name()), f.type()))); query.setLength(query.length() - 1); query.append("),"); } else if (type instanceof Types.StringType) { diff --git a/mr/src/test/java/org/apache/iceberg/mr/hive/TestHiveIcebergStorageHandlerWithMultipleCatalogs.java b/mr/src/test/java/org/apache/iceberg/mr/hive/TestHiveIcebergStorageHandlerWithMultipleCatalogs.java index b9601d2e465a..b24959bbe8e7 100644 --- a/mr/src/test/java/org/apache/iceberg/mr/hive/TestHiveIcebergStorageHandlerWithMultipleCatalogs.java +++ b/mr/src/test/java/org/apache/iceberg/mr/hive/TestHiveIcebergStorageHandlerWithMultipleCatalogs.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.mr.hive; import java.io.IOException; @@ -42,33 +41,40 @@ @RunWith(Parameterized.class) public class TestHiveIcebergStorageHandlerWithMultipleCatalogs { - private static final String[] EXECUTION_ENGINES = new String[] { "tez", "mr" }; + private static final String[] EXECUTION_ENGINES = new String[] {"tez", "mr"}; private static final String HIVECATALOGNAME = "table1_catalog"; private static final String OTHERCATALOGNAME = "table2_catalog"; private static TestHiveShell shell; @Parameterized.Parameter(0) public FileFormat fileFormat1; + @Parameterized.Parameter(1) public FileFormat fileFormat2; + @Parameterized.Parameter(2) public String executionEngine; + @Parameterized.Parameter(3) public TestTables.TestTableType testTableType1; + @Parameterized.Parameter(4) public String table1CatalogName; + @Parameterized.Parameter(5) public TestTables.TestTableType testTableType2; + @Parameterized.Parameter(6) public String table2CatalogName; - @Rule - public TemporaryFolder temp = new TemporaryFolder(); + @Rule public TemporaryFolder temp = new TemporaryFolder(); private TestTables testTables1; private TestTables testTables2; - @Parameterized.Parameters(name = "fileFormat1={0}, fileFormat2={1}, engine={2}, tableType1={3}, catalogName1={4}, " + - "tableType2={5}, catalogName2={6}") + @Parameterized.Parameters( + name = + "fileFormat1={0}, fileFormat2={1}, engine={2}, tableType1={3}, catalogName1={4}, " + + "tableType2={5}, catalogName2={6}") public static Collection parameters() { Collection testParams = Lists.newArrayList(); String javaVersion = System.getProperty("java.specification.version"); @@ -79,8 +85,16 @@ public static Collection parameters() { if (javaVersion.equals("1.8") || "mr".equals(engine)) { for (TestTables.TestTableType testTableType : TestTables.ALL_TABLE_TYPES) { if (!TestTables.TestTableType.HIVE_CATALOG.equals(testTableType)) { - testParams.add(new Object[]{FileFormat.PARQUET, FileFormat.ORC, engine, - TestTables.TestTableType.HIVE_CATALOG, HIVECATALOGNAME, testTableType, OTHERCATALOGNAME}); + testParams.add( + new Object[] { + FileFormat.PARQUET, + FileFormat.ORC, + engine, + TestTables.TestTableType.HIVE_CATALOG, + HIVECATALOGNAME, + testTableType, + OTHERCATALOGNAME + }); } } } @@ -100,12 +114,22 @@ public static void afterClass() throws Exception { @Before public void before() throws IOException { - testTables1 = HiveIcebergStorageHandlerTestUtils.testTables(shell, testTableType1, temp, table1CatalogName); + testTables1 = + HiveIcebergStorageHandlerTestUtils.testTables( + shell, testTableType1, temp, table1CatalogName); HiveIcebergStorageHandlerTestUtils.init(shell, testTables1, temp, executionEngine); - testTables1.properties().entrySet().forEach(e -> shell.setHiveSessionValue(e.getKey(), e.getValue())); - - testTables2 = HiveIcebergStorageHandlerTestUtils.testTables(shell, testTableType2, temp, table2CatalogName); - testTables2.properties().entrySet().forEach(e -> shell.setHiveSessionValue(e.getKey(), e.getValue())); + testTables1 + .properties() + .entrySet() + .forEach(e -> shell.setHiveSessionValue(e.getKey(), e.getValue())); + + testTables2 = + HiveIcebergStorageHandlerTestUtils.testTables( + shell, testTableType2, temp, table2CatalogName); + testTables2 + .properties() + .entrySet() + .forEach(e -> shell.setHiveSessionValue(e.getKey(), e.getValue())); } @After @@ -115,29 +139,48 @@ public void after() throws Exception { @Test public void testJoinTablesFromDifferentCatalogs() throws IOException { - createAndAddRecords(testTables1, 
fileFormat1, TableIdentifier.of("default", "customers1"), + createAndAddRecords( + testTables1, + fileFormat1, + TableIdentifier.of("default", "customers1"), HiveIcebergStorageHandlerTestUtils.CUSTOMER_RECORDS); - createAndAddRecords(testTables2, fileFormat2, TableIdentifier.of("default", "customers2"), + createAndAddRecords( + testTables2, + fileFormat2, + TableIdentifier.of("default", "customers2"), HiveIcebergStorageHandlerTestUtils.CUSTOMER_RECORDS); - List rows = shell.executeStatement("SELECT c2.customer_id, c2.first_name, c2.last_name " + - "FROM default.customers2 c2 JOIN default.customers1 c1 ON c2.customer_id = c1.customer_id " + - "ORDER BY c2.customer_id"); + List rows = + shell.executeStatement( + "SELECT c2.customer_id, c2.first_name, c2.last_name " + + "FROM default.customers2 c2 JOIN default.customers1 c1 ON c2.customer_id = c1.customer_id " + + "ORDER BY c2.customer_id"); Assert.assertEquals(HiveIcebergStorageHandlerTestUtils.CUSTOMER_RECORDS.size(), rows.size()); - HiveIcebergTestUtils.validateData(Lists.newArrayList(HiveIcebergStorageHandlerTestUtils.CUSTOMER_RECORDS), - HiveIcebergTestUtils.valueForRow(HiveIcebergStorageHandlerTestUtils.CUSTOMER_SCHEMA, rows), 0); + HiveIcebergTestUtils.validateData( + Lists.newArrayList(HiveIcebergStorageHandlerTestUtils.CUSTOMER_RECORDS), + HiveIcebergTestUtils.valueForRow(HiveIcebergStorageHandlerTestUtils.CUSTOMER_SCHEMA, rows), + 0); } - private void createAndAddRecords(TestTables testTables, FileFormat fileFormat, TableIdentifier identifier, - List records) throws IOException { + private void createAndAddRecords( + TestTables testTables, + FileFormat fileFormat, + TableIdentifier identifier, + List records) + throws IOException { String createSql = - "CREATE EXTERNAL TABLE " + identifier + " (customer_id BIGINT, first_name STRING, last_name STRING)" + - " STORED BY 'org.apache.iceberg.mr.hive.HiveIcebergStorageHandler' " + - testTables.locationForCreateTableSQL(identifier) + - " TBLPROPERTIES ('" + InputFormatConfig.CATALOG_NAME + "'='" + testTables.catalogName() + "')"; + "CREATE EXTERNAL TABLE " + + identifier + + " (customer_id BIGINT, first_name STRING, last_name STRING)" + + " STORED BY 'org.apache.iceberg.mr.hive.HiveIcebergStorageHandler' " + + testTables.locationForCreateTableSQL(identifier) + + " TBLPROPERTIES ('" + + InputFormatConfig.CATALOG_NAME + + "'='" + + testTables.catalogName() + + "')"; shell.executeStatement(createSql); Table icebergTable = testTables.loadTable(identifier); testTables.appendIcebergTable(shell.getHiveConf(), icebergTable, fileFormat, null, records); } - } diff --git a/mr/src/test/java/org/apache/iceberg/mr/hive/TestHiveShell.java b/mr/src/test/java/org/apache/iceberg/mr/hive/TestHiveShell.java index 5ad3caa62a39..7396350613da 100644 --- a/mr/src/test/java/org/apache/iceberg/mr/hive/TestHiveShell.java +++ b/mr/src/test/java/org/apache/iceberg/mr/hive/TestHiveShell.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.mr.hive; import java.util.Collections; @@ -39,11 +38,13 @@ /** * Test class for running HiveQL queries, essentially acting like a Beeline shell in tests. * - * It spins up both an HS2 and a Metastore instance to work with. The shell will only accept - * queries if it has been previously initialized via {@link #start()}, and a session has been opened via - * {@link #openSession()}. 
Prior to calling {@link #start()}, the shell should first be configured with props that apply - * across all test cases by calling {@link #setHiveConfValue(String, String)} ()}. On the other hand, session-level conf - * can be applied anytime via {@link #setHiveSessionValue(String, String)} ()}, once we've opened an active session. + *
    It spins up both an HS2 and a Metastore instance to work with. The shell will only accept + * queries if it has been previously initialized via {@link #start()}, and a session has been opened + * via {@link #openSession()}. Prior to calling {@link #start()}, the shell should first be + * configured with props that apply across all test cases by calling {@link + * #setHiveConfValue(String, String)} ()}. On the other hand, session-level conf can be applied + * anytime via {@link #setHiveSessionValue(String, String)} ()}, once we've opened an active + * session. */ public class TestHiveShell { @@ -61,7 +62,8 @@ public TestHiveShell() { } public void setHiveConfValue(String key, String value) { - Preconditions.checkState(!started, "TestHiveShell has already been started. Cannot set Hive conf anymore."); + Preconditions.checkState( + !started, "TestHiveShell has already been started. Cannot set Hive conf anymore."); hs2Conf.verifyAndSet(key, value); } @@ -81,8 +83,11 @@ public void setHiveSessionValue(String key, boolean value) { public void start() { // Create a copy of the HiveConf for the metastore metastore.start(new HiveConf(hs2Conf), 10); - hs2Conf.setVar(HiveConf.ConfVars.METASTOREURIS, metastore.hiveConf().getVar(HiveConf.ConfVars.METASTOREURIS)); - hs2Conf.setVar(HiveConf.ConfVars.METASTOREWAREHOUSE, + hs2Conf.setVar( + HiveConf.ConfVars.METASTOREURIS, + metastore.hiveConf().getVar(HiveConf.ConfVars.METASTOREURIS)); + hs2Conf.setVar( + HiveConf.ConfVars.METASTOREWAREHOUSE, metastore.hiveConf().getVar(HiveConf.ConfVars.METASTOREWAREHOUSE)); // Initializing RpcMetrics in a single JVM multiple times can cause issues @@ -90,7 +95,8 @@ public void start() { hs2.init(hs2Conf); hs2.start(); - client = hs2.getServices().stream() + client = + hs2.getServices().stream() .filter(CLIService.class::isInstance) .findFirst() .map(CLIService.class::cast) @@ -112,10 +118,13 @@ public TestHiveMetastore metastore() { } public void openSession() { - Preconditions.checkState(started, "You have to start TestHiveShell first, before opening a session."); + Preconditions.checkState( + started, "You have to start TestHiveShell first, before opening a session."); try { - SessionHandle sessionHandle = client.getSessionManager().openSession( - CLIService.SERVER_VERSION, "", "", "127.0.0.1", Collections.emptyMap()); + SessionHandle sessionHandle = + client + .getSessionManager() + .openSession(CLIService.SERVER_VERSION, "", "", "127.0.0.1", Collections.emptyMap()); session = client.getSessionManager().getSession(sessionHandle); } catch (Exception e) { throw new RuntimeException("Unable to open new Hive session: ", e); @@ -133,10 +142,12 @@ public void closeSession() { } public List executeStatement(String statement) { - Preconditions.checkState(session != null, - "You have to start TestHiveShell and open a session first, before running a query."); + Preconditions.checkState( + session != null, + "You have to start TestHiveShell and open a session first, before running a query."); try { - OperationHandle handle = client.executeStatement(session.getSessionHandle(), statement, Collections.emptyMap()); + OperationHandle handle = + client.executeStatement(session.getSessionHandle(), statement, Collections.emptyMap()); List resultSet = Lists.newArrayList(); if (handle.hasResultSet()) { RowSet rowSet; @@ -149,7 +160,8 @@ public List executeStatement(String statement) { } return resultSet; } catch (HiveSQLException e) { - throw new IllegalArgumentException("Failed to execute Hive query '" + statement + "': " + 
e.getMessage(), e); + throw new IllegalArgumentException( + "Failed to execute Hive query '" + statement + "': " + e.getMessage(), e); } } diff --git a/mr/src/test/java/org/apache/iceberg/mr/hive/TestTables.java b/mr/src/test/java/org/apache/iceberg/mr/hive/TestTables.java index 547df22c499a..ec166b6dc62c 100644 --- a/mr/src/test/java/org/apache/iceberg/mr/hive/TestTables.java +++ b/mr/src/test/java/org/apache/iceberg/mr/hive/TestTables.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.mr.hive; import java.io.File; @@ -66,12 +65,13 @@ // Helper class for setting up and testing various catalog implementations abstract class TestTables { - public static final TestTableType[] ALL_TABLE_TYPES = new TestTableType[] { - TestTableType.HADOOP_TABLE, - TestTableType.HADOOP_CATALOG, - TestTableType.CUSTOM_CATALOG, - TestTableType.HIVE_CATALOG - }; + public static final TestTableType[] ALL_TABLE_TYPES = + new TestTableType[] { + TestTableType.HADOOP_TABLE, + TestTableType.HADOOP_CATALOG, + TestTableType.CUSTOM_CATALOG, + TestTableType.HIVE_CATALOG + }; private final Tables tables; protected final TemporaryFolder temp; @@ -105,46 +105,57 @@ public String catalogName() { } /** - * The location string needed to be provided for CREATE TABLE ... commands, - * like "LOCATION 'file:///tmp/warehouse/default/tablename'. Empty ("") if LOCATION is not needed. + * The location string needed to be provided for CREATE TABLE ... commands, like "LOCATION + * 'file:///tmp/warehouse/default/tablename'. Empty ("") if LOCATION is not needed. + * * @param identifier The table identifier * @return The location string for create table operation */ public abstract String locationForCreateTableSQL(TableIdentifier identifier); /** - * The table properties string needed for the CREATE TABLE ... commands, - * like {@code TBLPROPERTIES('iceberg.catalog'='mycatalog')} - * @return the tables properties string, such as {@code TBLPROPERTIES('iceberg.catalog'='mycatalog')} + * The table properties string needed for the CREATE TABLE ... commands, like {@code + * TBLPROPERTIES('iceberg.catalog'='mycatalog')} + * + * @return the tables properties string, such as {@code + * TBLPROPERTIES('iceberg.catalog'='mycatalog')} */ public String propertiesForCreateTableSQL(Map tableProperties) { Map properties = Maps.newHashMap(tableProperties); properties.putIfAbsent(InputFormatConfig.CATALOG_NAME, catalog); - String props = properties.entrySet().stream() + String props = + properties.entrySet().stream() .map(entry -> String.format("'%s'='%s'", entry.getKey(), entry.getValue())) .collect(Collectors.joining(",")); return " TBLPROPERTIES (" + props + ")"; } /** - * If an independent Hive table creation is needed for the given Catalog then this should return the Hive SQL - * string which we have to execute. Overridden for HiveCatalog where the Hive table is immediately created - * during the Iceberg table creation so no extra sql execution is required. + * If an independent Hive table creation is needed for the given Catalog then this should return + * the Hive SQL string which we have to execute. Overridden for HiveCatalog where the Hive table + * is immediately created during the Iceberg table creation so no extra sql execution is required. 
+ * * @param identifier The table identifier (the namespace should be non-empty and single level) * @param tableProps Optional map of table properties * @return The SQL string - which should be executed, null - if it is not needed. */ public String createHiveTableSQL(TableIdentifier identifier, Map tableProps) { Preconditions.checkArgument(!identifier.namespace().isEmpty(), "Namespace should not be empty"); - Preconditions.checkArgument(identifier.namespace().levels().length == 1, "Namespace should be single level"); - return String.format("CREATE TABLE %s.%s STORED BY '%s' %s %s", identifier.namespace(), identifier.name(), - HiveIcebergStorageHandler.class.getName(), locationForCreateTableSQL(identifier), - propertiesForCreateTableSQL(tableProps)); + Preconditions.checkArgument( + identifier.namespace().levels().length == 1, "Namespace should be single level"); + return String.format( + "CREATE TABLE %s.%s STORED BY '%s' %s %s", + identifier.namespace(), + identifier.name(), + HiveIcebergStorageHandler.class.getName(), + locationForCreateTableSQL(identifier), + propertiesForCreateTableSQL(tableProps)); } /** - * Loads the given table from the actual catalog. Overridden by HadoopTables, since the parameter of the - * {@link Tables#load(String)} should be the full path of the table metadata directory + * Loads the given table from the actual catalog. Overridden by HadoopTables, since the parameter + * of the {@link Tables#load(String)} should be the full path of the table metadata directory + * * @param identifier The table we want to load * @return The Table loaded from the Catalog */ @@ -153,9 +164,10 @@ public Table loadTable(TableIdentifier identifier) { } /** - * Creates an non partitioned Hive test table. Creates the Iceberg table/data and creates the corresponding Hive - * table as well when needed. The table will be in the 'default' database. The table will be populated with the - * provided List of {@link Record}s. + * Creates an non partitioned Hive test table. Creates the Iceberg table/data and creates the + * corresponding Hive table as well when needed. The table will be in the 'default' database. The + * table will be populated with the provided List of {@link Record}s. + * * @param shell The HiveShell used for Hive table creation * @param tableName The name of the test table * @param schema The schema used for the table creation @@ -164,10 +176,16 @@ public Table loadTable(TableIdentifier identifier) { * @return The created table * @throws IOException If there is an error writing data */ - public Table createTable(TestHiveShell shell, String tableName, Schema schema, FileFormat fileFormat, - List records) throws IOException { + public Table createTable( + TestHiveShell shell, + String tableName, + Schema schema, + FileFormat fileFormat, + List records) + throws IOException { Table table = createIcebergTable(shell.getHiveConf(), tableName, schema, fileFormat, records); - String createHiveSQL = createHiveTableSQL(TableIdentifier.of("default", tableName), ImmutableMap.of()); + String createHiveSQL = + createHiveTableSQL(TableIdentifier.of("default", tableName), ImmutableMap.of()); if (createHiveSQL != null) { shell.executeStatement(createHiveSQL); } @@ -176,8 +194,10 @@ public Table createTable(TestHiveShell shell, String tableName, Schema schema, F } /** - * Creates a partitioned Hive test table using Hive SQL. The table will be in the 'default' database. - * The table will be populated with the provided List of {@link Record}s using a Hive insert statement. 
+ * Creates a partitioned Hive test table using Hive SQL. The table will be in the 'default' + * database. The table will be populated with the provided List of {@link Record}s using a Hive + * insert statement. + * * @param shell The HiveShell used for Hive table creation * @param tableName The name of the test table * @param schema The schema used for the table creation @@ -187,29 +207,56 @@ public Table createTable(TestHiveShell shell, String tableName, Schema schema, F * @return The created table * @throws IOException If there is an error writing data */ - public Table createTable(TestHiveShell shell, String tableName, Schema schema, PartitionSpec spec, - FileFormat fileFormat, List records) { + public Table createTable( + TestHiveShell shell, + String tableName, + Schema schema, + PartitionSpec spec, + FileFormat fileFormat, + List records) { TableIdentifier identifier = TableIdentifier.of("default", tableName); - shell.executeStatement("CREATE EXTERNAL TABLE " + identifier + - " STORED BY '" + HiveIcebergStorageHandler.class.getName() + "' " + - locationForCreateTableSQL(identifier) + - "TBLPROPERTIES ('" + InputFormatConfig.TABLE_SCHEMA + "'='" + - SchemaParser.toJson(schema) + "', " + - "'" + InputFormatConfig.PARTITION_SPEC + "'='" + - PartitionSpecParser.toJson(spec) + "', " + - "'" + TableProperties.DEFAULT_FILE_FORMAT + "'='" + fileFormat + "', " + - "'" + InputFormatConfig.CATALOG_NAME + "'='" + catalogName() + "')"); + shell.executeStatement( + "CREATE EXTERNAL TABLE " + + identifier + + " STORED BY '" + + HiveIcebergStorageHandler.class.getName() + + "' " + + locationForCreateTableSQL(identifier) + + "TBLPROPERTIES ('" + + InputFormatConfig.TABLE_SCHEMA + + "'='" + + SchemaParser.toJson(schema) + + "', " + + "'" + + InputFormatConfig.PARTITION_SPEC + + "'='" + + PartitionSpecParser.toJson(spec) + + "', " + + "'" + + TableProperties.DEFAULT_FILE_FORMAT + + "'='" + + fileFormat + + "', " + + "'" + + InputFormatConfig.CATALOG_NAME + + "'='" + + catalogName() + + "')"); if (records != null && !records.isEmpty()) { StringBuilder query = new StringBuilder().append("INSERT INTO " + identifier + " VALUES "); - records.forEach(record -> { - query.append("("); - query.append(record.struct().fields().stream() - .map(field -> getStringValueForInsert(record.getField(field.name()), field.type())) - .collect(Collectors.joining(","))); - query.append("),"); - }); + records.forEach( + record -> { + query.append("("); + query.append( + record.struct().fields().stream() + .map( + field -> + getStringValueForInsert(record.getField(field.name()), field.type())) + .collect(Collectors.joining(","))); + query.append("),"); + }); query.setLength(query.length() - 1); shell.executeStatement(query.toString()); @@ -218,25 +265,32 @@ public Table createTable(TestHiveShell shell, String tableName, Schema schema, P return loadTable(identifier); } - public String getInsertQuery(List records, TableIdentifier identifier, boolean isOverwrite) { - StringBuilder query = new StringBuilder(String.format("INSERT %s %s VALUES ", - isOverwrite ? 
"OVERWRITE TABLE" : "INTO", identifier)); - - records.forEach(record -> { - query.append("("); - query.append(record.struct().fields().stream() - .map(field -> getStringValueForInsert(record.getField(field.name()), field.type())) - .collect(Collectors.joining(","))); - query.append("),"); - }); + public String getInsertQuery( + List records, TableIdentifier identifier, boolean isOverwrite) { + StringBuilder query = + new StringBuilder( + String.format( + "INSERT %s %s VALUES ", isOverwrite ? "OVERWRITE TABLE" : "INTO", identifier)); + + records.forEach( + record -> { + query.append("("); + query.append( + record.struct().fields().stream() + .map( + field -> getStringValueForInsert(record.getField(field.name()), field.type())) + .collect(Collectors.joining(","))); + query.append("),"); + }); query.setLength(query.length() - 1); return query.toString(); } /** - * Creates a Hive test table. Creates the Iceberg table/data and creates the corresponding Hive table as well when - * needed. The table will be in the 'default' database. The table will be populated with the provided with randomly - * generated {@link Record}s. + * Creates a Hive test table. Creates the Iceberg table/data and creates the corresponding Hive + * table as well when needed. The table will be in the 'default' database. The table will be + * populated with the provided with randomly generated {@link Record}s. + * * @param shell The HiveShell used for Hive table creation * @param tableName The name of the test table * @param schema The schema used for the table creation @@ -244,16 +298,18 @@ public String getInsertQuery(List records, TableIdentifier identifier, b * @param numRecords The number of records should be generated and stored in the table * @throws IOException If there is an error writing data */ - public List createTableWithGeneratedRecords(TestHiveShell shell, String tableName, Schema schema, - FileFormat fileFormat, int numRecords) throws IOException { + public List createTableWithGeneratedRecords( + TestHiveShell shell, String tableName, Schema schema, FileFormat fileFormat, int numRecords) + throws IOException { List records = TestHelper.generateRandomRecords(schema, numRecords, 0L); createTable(shell, tableName, schema, fileFormat, records); return records; } /** - * Creates an Iceberg table/data without creating the corresponding Hive table. The table will be in the 'default' - * namespace. + * Creates an Iceberg table/data without creating the corresponding Hive table. The table will be + * in the 'default' namespace. + * * @param configuration The configuration used during the table creation * @param tableName The name of the test table * @param schema The schema used for the table creation @@ -262,11 +318,23 @@ public List createTableWithGeneratedRecords(TestHiveShell shell, String * @return The create table * @throws IOException If there is an error writing data */ - public Table createIcebergTable(Configuration configuration, String tableName, Schema schema, FileFormat fileFormat, - List records) throws IOException { + public Table createIcebergTable( + Configuration configuration, + String tableName, + Schema schema, + FileFormat fileFormat, + List records) + throws IOException { String identifier = identifier("default." 
+ tableName); - TestHelper helper = new TestHelper(new Configuration(configuration), tables(), identifier, schema, - PartitionSpec.unpartitioned(), fileFormat, temp); + TestHelper helper = + new TestHelper( + new Configuration(configuration), + tables(), + identifier, + schema, + PartitionSpec.unpartitioned(), + fileFormat, + temp); Table table = helper.createTable(); if (records != null && !records.isEmpty()) { @@ -278,6 +346,7 @@ public Table createIcebergTable(Configuration configuration, String tableName, S /** * Append more data to the table. + * * @param configuration The configuration used during the table creation * @param table The table to append * @param format The file format used for writing the data @@ -285,10 +354,14 @@ public Table createIcebergTable(Configuration configuration, String tableName, S * @param records The records with which should be added to the table * @throws IOException If there is an error writing data */ - public void appendIcebergTable(Configuration configuration, Table table, FileFormat format, StructLike partition, - List records) throws IOException { - TestHelper helper = new TestHelper( - configuration, null, null, null, null, format, temp); + public void appendIcebergTable( + Configuration configuration, + Table table, + FileFormat format, + StructLike partition, + List records) + throws IOException { + TestHelper helper = new TestHelper(configuration, null, null, null, null, format, temp); helper.setTable(table); if (!records.isEmpty()) { @@ -298,6 +371,7 @@ public void appendIcebergTable(Configuration configuration, Table table, FileFor /** * Truncates an Iceberg table. + * * @param table The iceberg table to truncate */ public void truncateIcebergTable(Table table) { @@ -313,10 +387,15 @@ private CatalogToTables(Catalog catalog) { } @Override - public Table create(Schema schema, PartitionSpec spec, SortOrder sortOrder, - Map properties, String tableIdentifier) { + public Table create( + Schema schema, + PartitionSpec spec, + SortOrder sortOrder, + Map properties, + String tableIdentifier) { TableIdentifier tableIdent = TableIdentifier.parse(tableIdentifier); - return catalog.buildTable(tableIdent, schema) + return catalog + .buildTable(tableIdent, schema) .withPartitionSpec(spec) .withSortOrder(sortOrder) .withProperties(properties) @@ -338,12 +417,18 @@ static class CustomCatalogTestTables extends TestTables { private final String warehouseLocation; - CustomCatalogTestTables(Configuration conf, TemporaryFolder temp, String catalogName) throws IOException { - this(conf, temp, (MetastoreUtil.hive3PresentOnClasspath() ? "file:" : "") + - temp.newFolder("custom", "warehouse").toString(), catalogName); + CustomCatalogTestTables(Configuration conf, TemporaryFolder temp, String catalogName) + throws IOException { + this( + conf, + temp, + (MetastoreUtil.hive3PresentOnClasspath() ? 
"file:" : "") + + temp.newFolder("custom", "warehouse").toString(), + catalogName); } - CustomCatalogTestTables(Configuration conf, TemporaryFolder temp, String warehouseLocation, String catalogName) { + CustomCatalogTestTables( + Configuration conf, TemporaryFolder temp, String warehouseLocation, String catalogName) { super(new TestCatalogs.CustomHadoopCatalog(conf, warehouseLocation), temp, catalogName); this.warehouseLocation = warehouseLocation; } @@ -351,30 +436,34 @@ static class CustomCatalogTestTables extends TestTables { @Override public Map properties() { return ImmutableMap.of( - InputFormatConfig.catalogPropertyConfigKey(catalog, CatalogProperties.CATALOG_IMPL), - TestCatalogs.CustomHadoopCatalog.class.getName(), - InputFormatConfig.catalogPropertyConfigKey(catalog, CatalogProperties.WAREHOUSE_LOCATION), - warehouseLocation - ); + InputFormatConfig.catalogPropertyConfigKey(catalog, CatalogProperties.CATALOG_IMPL), + TestCatalogs.CustomHadoopCatalog.class.getName(), + InputFormatConfig.catalogPropertyConfigKey(catalog, CatalogProperties.WAREHOUSE_LOCATION), + warehouseLocation); } @Override public String locationForCreateTableSQL(TableIdentifier identifier) { return "LOCATION '" + warehouseLocation + TestTables.tablePath(identifier) + "' "; } - } static class HadoopCatalogTestTables extends TestTables { private final String warehouseLocation; - HadoopCatalogTestTables(Configuration conf, TemporaryFolder temp, String catalogName) throws IOException { - this(conf, temp, (MetastoreUtil.hive3PresentOnClasspath() ? "file:" : "") + - temp.newFolder("hadoop", "warehouse").toString(), catalogName); + HadoopCatalogTestTables(Configuration conf, TemporaryFolder temp, String catalogName) + throws IOException { + this( + conf, + temp, + (MetastoreUtil.hive3PresentOnClasspath() ? 
"file:" : "") + + temp.newFolder("hadoop", "warehouse").toString(), + catalogName); } - HadoopCatalogTestTables(Configuration conf, TemporaryFolder temp, String warehouseLocation, String catalogName) { + HadoopCatalogTestTables( + Configuration conf, TemporaryFolder temp, String warehouseLocation, String catalogName) { super(new HadoopCatalog(conf, warehouseLocation), temp, catalogName); this.warehouseLocation = warehouseLocation; } @@ -385,8 +474,7 @@ public Map properties() { InputFormatConfig.catalogPropertyConfigKey(catalog, CatalogUtil.ICEBERG_CATALOG_TYPE), CatalogUtil.ICEBERG_CATALOG_TYPE_HADOOP, InputFormatConfig.catalogPropertyConfigKey(catalog, CatalogProperties.WAREHOUSE_LOCATION), - warehouseLocation - ); + warehouseLocation); } @Override @@ -406,7 +494,8 @@ public String identifier(String tableIdentifier) { try { TableIdentifier identifier = TableIdentifier.parse(tableIdentifier); - location = temp.newFolder(ObjectArrays.concat(identifier.namespace().levels(), identifier.name())); + location = + temp.newFolder(ObjectArrays.concat(identifier.namespace().levels(), identifier.name())); } catch (IOException ioe) { throw new UncheckedIOException(ioe); } @@ -424,19 +513,25 @@ public String locationForCreateTableSQL(TableIdentifier identifier) { public Table loadTable(TableIdentifier identifier) { return tables().load(temp.getRoot().getPath() + TestTables.tablePath(identifier)); } - } static class HiveTestTables extends TestTables { HiveTestTables(Configuration conf, TemporaryFolder temp, String catalogName) { - super(CatalogUtil.loadCatalog(HiveCatalog.class.getName(), CatalogUtil.ICEBERG_CATALOG_TYPE_HIVE, - ImmutableMap.of(), conf), temp, catalogName); + super( + CatalogUtil.loadCatalog( + HiveCatalog.class.getName(), + CatalogUtil.ICEBERG_CATALOG_TYPE_HIVE, + ImmutableMap.of(), + conf), + temp, + catalogName); } @Override public Map properties() { - return ImmutableMap.of(InputFormatConfig.catalogPropertyConfigKey(catalog, CatalogUtil.ICEBERG_CATALOG_TYPE), + return ImmutableMap.of( + InputFormatConfig.catalogPropertyConfigKey(catalog, CatalogUtil.ICEBERG_CATALOG_TYPE), CatalogUtil.ICEBERG_CATALOG_TYPE_HIVE); } @@ -460,9 +555,11 @@ private String getStringValueForInsert(Object value, Type type) { if (type.equals(Types.TimestampType.withoutZone())) { return String.format(template, Timestamp.valueOf((LocalDateTime) value).toString()); } else if (type.equals(Types.TimestampType.withZone())) { - return String.format(template, Timestamp.from(((OffsetDateTime) value).toInstant()).toString()); + return String.format( + template, Timestamp.from(((OffsetDateTime) value).toInstant()).toString()); } else if (type.equals(Types.BooleanType.get())) { - // in hive2 boolean type values must not be surrounded in apostrophes. Otherwise the value is translated to true. + // in hive2 boolean type values must not be surrounded in apostrophes. Otherwise the value is + // translated to true. 
return value.toString(); } else { return String.format(template, value.toString()); @@ -472,32 +569,36 @@ private String getStringValueForInsert(Object value, Type type) { enum TestTableType { HADOOP_TABLE { @Override - public TestTables instance(Configuration conf, TemporaryFolder temporaryFolder, String catalogName) { + public TestTables instance( + Configuration conf, TemporaryFolder temporaryFolder, String catalogName) { return new HadoopTestTables(conf, temporaryFolder); } }, HADOOP_CATALOG { @Override - public TestTables instance(Configuration conf, TemporaryFolder temporaryFolder, String catalogName) + public TestTables instance( + Configuration conf, TemporaryFolder temporaryFolder, String catalogName) throws IOException { return new HadoopCatalogTestTables(conf, temporaryFolder, catalogName); } }, CUSTOM_CATALOG { @Override - public TestTables instance(Configuration conf, TemporaryFolder temporaryFolder, String catalogName) + public TestTables instance( + Configuration conf, TemporaryFolder temporaryFolder, String catalogName) throws IOException { return new CustomCatalogTestTables(conf, temporaryFolder, catalogName); } }, HIVE_CATALOG { @Override - public TestTables instance(Configuration conf, TemporaryFolder temporaryFolder, String catalogName) { + public TestTables instance( + Configuration conf, TemporaryFolder temporaryFolder, String catalogName) { return new HiveTestTables(conf, temporaryFolder, catalogName); } }; - public abstract TestTables instance(Configuration conf, TemporaryFolder temporaryFolder, String catalogName) - throws IOException; + public abstract TestTables instance( + Configuration conf, TemporaryFolder temporaryFolder, String catalogName) throws IOException; } } diff --git a/mr/src/test/java/org/apache/iceberg/mr/hive/serde/objectinspector/TestIcebergBinaryObjectInspector.java b/mr/src/test/java/org/apache/iceberg/mr/hive/serde/objectinspector/TestIcebergBinaryObjectInspector.java index 9d47d74d5366..5db84e5aa4b9 100644 --- a/mr/src/test/java/org/apache/iceberg/mr/hive/serde/objectinspector/TestIcebergBinaryObjectInspector.java +++ b/mr/src/test/java/org/apache/iceberg/mr/hive/serde/objectinspector/TestIcebergBinaryObjectInspector.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.mr.hive.serde.objectinspector; import java.nio.ByteBuffer; @@ -35,7 +34,8 @@ public void testIcebergByteBufferObjectInspector() { BinaryObjectInspector oi = IcebergBinaryObjectInspector.get(); Assert.assertEquals(ObjectInspector.Category.PRIMITIVE, oi.getCategory()); - Assert.assertEquals(PrimitiveObjectInspector.PrimitiveCategory.BINARY, oi.getPrimitiveCategory()); + Assert.assertEquals( + PrimitiveObjectInspector.PrimitiveCategory.BINARY, oi.getPrimitiveCategory()); Assert.assertEquals(TypeInfoFactory.binaryTypeInfo, oi.getTypeInfo()); Assert.assertEquals(TypeInfoFactory.binaryTypeInfo.getTypeName(), oi.getTypeName()); @@ -68,5 +68,4 @@ public void testIcebergByteBufferObjectInspector() { Assert.assertFalse(oi.preferWritable()); } - } diff --git a/mr/src/test/java/org/apache/iceberg/mr/hive/serde/objectinspector/TestIcebergDateObjectInspector.java b/mr/src/test/java/org/apache/iceberg/mr/hive/serde/objectinspector/TestIcebergDateObjectInspector.java index a263a2b503b6..73681fec5799 100644 --- a/mr/src/test/java/org/apache/iceberg/mr/hive/serde/objectinspector/TestIcebergDateObjectInspector.java +++ b/mr/src/test/java/org/apache/iceberg/mr/hive/serde/objectinspector/TestIcebergDateObjectInspector.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.mr.hive.serde.objectinspector; import java.sql.Date; @@ -61,5 +60,4 @@ public void testIcebergDateObjectInspector() { Assert.assertFalse(oi.preferWritable()); } - } diff --git a/mr/src/test/java/org/apache/iceberg/mr/hive/serde/objectinspector/TestIcebergDecimalObjectInspector.java b/mr/src/test/java/org/apache/iceberg/mr/hive/serde/objectinspector/TestIcebergDecimalObjectInspector.java index 4d04eee1622d..1c4734c77f4c 100644 --- a/mr/src/test/java/org/apache/iceberg/mr/hive/serde/objectinspector/TestIcebergDecimalObjectInspector.java +++ b/mr/src/test/java/org/apache/iceberg/mr/hive/serde/objectinspector/TestIcebergDecimalObjectInspector.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.mr.hive.serde.objectinspector; import java.math.BigDecimal; @@ -46,7 +45,8 @@ public void testIcebergDecimalObjectInspector() { HiveDecimalObjectInspector oi = IcebergDecimalObjectInspector.get(38, 18); Assert.assertEquals(ObjectInspector.Category.PRIMITIVE, oi.getCategory()); - Assert.assertEquals(PrimitiveObjectInspector.PrimitiveCategory.DECIMAL, oi.getPrimitiveCategory()); + Assert.assertEquals( + PrimitiveObjectInspector.PrimitiveCategory.DECIMAL, oi.getPrimitiveCategory()); Assert.assertEquals(new DecimalTypeInfo(38, 18), oi.getTypeInfo()); Assert.assertEquals(TypeInfoFactory.decimalTypeInfo.getTypeName(), oi.getTypeName()); @@ -64,7 +64,8 @@ public void testIcebergDecimalObjectInspector() { HiveDecimal one = HiveDecimal.create(BigDecimal.ONE); Assert.assertEquals(one, oi.getPrimitiveJavaObject(BigDecimal.ONE)); - Assert.assertEquals(new HiveDecimalWritable(one), oi.getPrimitiveWritableObject(BigDecimal.ONE)); + Assert.assertEquals( + new HiveDecimalWritable(one), oi.getPrimitiveWritableObject(BigDecimal.ONE)); HiveDecimal copy = (HiveDecimal) oi.copyObject(one); @@ -73,5 +74,4 @@ public void testIcebergDecimalObjectInspector() { Assert.assertFalse(oi.preferWritable()); } - } diff --git a/mr/src/test/java/org/apache/iceberg/mr/hive/serde/objectinspector/TestIcebergFixedObjectInspector.java b/mr/src/test/java/org/apache/iceberg/mr/hive/serde/objectinspector/TestIcebergFixedObjectInspector.java index 82bfa5e162a4..a902d027269d 100644 --- a/mr/src/test/java/org/apache/iceberg/mr/hive/serde/objectinspector/TestIcebergFixedObjectInspector.java +++ b/mr/src/test/java/org/apache/iceberg/mr/hive/serde/objectinspector/TestIcebergFixedObjectInspector.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.mr.hive.serde.objectinspector; import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector; @@ -33,7 +32,8 @@ public void testIcebergFixedObjectInspector() { IcebergFixedObjectInspector oi = IcebergFixedObjectInspector.get(); Assert.assertEquals(ObjectInspector.Category.PRIMITIVE, oi.getCategory()); - Assert.assertEquals(PrimitiveObjectInspector.PrimitiveCategory.BINARY, oi.getPrimitiveCategory()); + Assert.assertEquals( + PrimitiveObjectInspector.PrimitiveCategory.BINARY, oi.getPrimitiveCategory()); Assert.assertEquals(TypeInfoFactory.binaryTypeInfo, oi.getTypeInfo()); Assert.assertEquals(TypeInfoFactory.binaryTypeInfo.getTypeName(), oi.getTypeName()); @@ -46,7 +46,7 @@ public void testIcebergFixedObjectInspector() { Assert.assertNull(oi.getPrimitiveWritableObject(null)); Assert.assertNull(oi.convert(null)); - byte[] bytes = new byte[] { 0, 1 }; + byte[] bytes = new byte[] {0, 1}; BytesWritable bytesWritable = new BytesWritable(bytes); Assert.assertArrayEquals(bytes, oi.getPrimitiveJavaObject(bytes)); @@ -60,5 +60,4 @@ public void testIcebergFixedObjectInspector() { Assert.assertFalse(oi.preferWritable()); } - } diff --git a/mr/src/test/java/org/apache/iceberg/mr/hive/serde/objectinspector/TestIcebergObjectInspector.java b/mr/src/test/java/org/apache/iceberg/mr/hive/serde/objectinspector/TestIcebergObjectInspector.java index 593af9a53ba2..0641ee2b6349 100644 --- a/mr/src/test/java/org/apache/iceberg/mr/hive/serde/objectinspector/TestIcebergObjectInspector.java +++ b/mr/src/test/java/org/apache/iceberg/mr/hive/serde/objectinspector/TestIcebergObjectInspector.java @@ -16,9 +16,10 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.mr.hive.serde.objectinspector; +import static org.apache.iceberg.types.Types.NestedField.required; + import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector; import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory; import org.apache.hadoop.hive.serde2.objectinspector.StructField; @@ -33,12 +34,10 @@ import org.junit.Assert; import org.junit.Test; -import static org.apache.iceberg.types.Types.NestedField.required; - - public class TestIcebergObjectInspector { - private final Schema schema = new Schema( + private final Schema schema = + new Schema( required(1, "binary_field", Types.BinaryType.get(), "binary comment"), required(2, "boolean_field", Types.BooleanType.get(), "boolean comment"), required(3, "date_field", Types.DateType.get(), "date comment"), @@ -52,17 +51,24 @@ public class TestIcebergObjectInspector { required(11, "timestamp_field", Types.TimestampType.withoutZone(), "timestamp comment"), required(12, "timestamptz_field", Types.TimestampType.withZone(), "timestamptz comment"), required(13, "uuid_field", Types.UUIDType.get(), "uuid comment"), - required(14, "list_field", - Types.ListType.ofRequired(15, Types.StringType.get()), "list comment"), - required(16, "map_field", - Types.MapType.ofRequired(17, 18, Types.StringType.get(), Types.IntegerType.get()), - "map comment"), - required(19, "struct_field", Types.StructType.of( - Types.NestedField.required(20, "nested_field", Types.StringType.get(), "nested field comment")), - "struct comment" - ), - required(21, "time_field", Types.TimeType.get(), "time comment") - ); + required( + 14, + "list_field", + Types.ListType.ofRequired(15, Types.StringType.get()), + "list comment"), + required( + 16, + "map_field", + Types.MapType.ofRequired(17, 18, Types.StringType.get(), Types.IntegerType.get()), + "map comment"), + required( + 19, + "struct_field", + Types.StructType.of( + Types.NestedField.required( + 20, "nested_field", Types.StringType.get(), "nested field comment")), + "struct comment"), + required(21, "time_field", Types.TimeType.get(), "time comment")); @SuppressWarnings("MethodLength") @Test @@ -85,7 +91,8 @@ public void testIcebergObjectInspector() { Assert.assertEquals(2, booleanField.getFieldID()); Assert.assertEquals("boolean_field", booleanField.getFieldName()); Assert.assertEquals("boolean comment", booleanField.getFieldComment()); - Assert.assertEquals(getPrimitiveObjectInspector(boolean.class), booleanField.getFieldObjectInspector()); + Assert.assertEquals( + getPrimitiveObjectInspector(boolean.class), booleanField.getFieldObjectInspector()); // date StructField dateField = soi.getStructFieldRef("date_field"); @@ -93,11 +100,13 @@ public void testIcebergObjectInspector() { Assert.assertEquals("date_field", dateField.getFieldName()); Assert.assertEquals("date comment", dateField.getFieldComment()); if (MetastoreUtil.hive3PresentOnClasspath()) { - Assert.assertEquals("org.apache.iceberg.mr.hive.serde.objectinspector.IcebergDateObjectInspectorHive3", - dateField.getFieldObjectInspector().getClass().getName()); + Assert.assertEquals( + "org.apache.iceberg.mr.hive.serde.objectinspector.IcebergDateObjectInspectorHive3", + dateField.getFieldObjectInspector().getClass().getName()); } else { - Assert.assertEquals("org.apache.iceberg.mr.hive.serde.objectinspector.IcebergDateObjectInspector", - dateField.getFieldObjectInspector().getClass().getName()); + Assert.assertEquals( + "org.apache.iceberg.mr.hive.serde.objectinspector.IcebergDateObjectInspector", + 
dateField.getFieldObjectInspector().getClass().getName()); } // decimal @@ -105,14 +114,16 @@ public void testIcebergObjectInspector() { Assert.assertEquals(4, decimalField.getFieldID()); Assert.assertEquals("decimal_field", decimalField.getFieldName()); Assert.assertEquals("decimal comment", decimalField.getFieldComment()); - Assert.assertEquals(IcebergDecimalObjectInspector.get(38, 18), decimalField.getFieldObjectInspector()); + Assert.assertEquals( + IcebergDecimalObjectInspector.get(38, 18), decimalField.getFieldObjectInspector()); // double StructField doubleField = soi.getStructFieldRef("double_field"); Assert.assertEquals(5, doubleField.getFieldID()); Assert.assertEquals("double_field", doubleField.getFieldName()); Assert.assertEquals("double comment", doubleField.getFieldComment()); - Assert.assertEquals(getPrimitiveObjectInspector(double.class), doubleField.getFieldObjectInspector()); + Assert.assertEquals( + getPrimitiveObjectInspector(double.class), doubleField.getFieldObjectInspector()); // fixed StructField fixedField = soi.getStructFieldRef("fixed_field"); @@ -126,28 +137,32 @@ public void testIcebergObjectInspector() { Assert.assertEquals(7, floatField.getFieldID()); Assert.assertEquals("float_field", floatField.getFieldName()); Assert.assertEquals("float comment", floatField.getFieldComment()); - Assert.assertEquals(getPrimitiveObjectInspector(float.class), floatField.getFieldObjectInspector()); + Assert.assertEquals( + getPrimitiveObjectInspector(float.class), floatField.getFieldObjectInspector()); // integer StructField integerField = soi.getStructFieldRef("integer_field"); Assert.assertEquals(8, integerField.getFieldID()); Assert.assertEquals("integer_field", integerField.getFieldName()); Assert.assertEquals("integer comment", integerField.getFieldComment()); - Assert.assertEquals(getPrimitiveObjectInspector(int.class), integerField.getFieldObjectInspector()); + Assert.assertEquals( + getPrimitiveObjectInspector(int.class), integerField.getFieldObjectInspector()); // long StructField longField = soi.getStructFieldRef("long_field"); Assert.assertEquals(9, longField.getFieldID()); Assert.assertEquals("long_field", longField.getFieldName()); Assert.assertEquals("long comment", longField.getFieldComment()); - Assert.assertEquals(getPrimitiveObjectInspector(long.class), longField.getFieldObjectInspector()); + Assert.assertEquals( + getPrimitiveObjectInspector(long.class), longField.getFieldObjectInspector()); // string StructField stringField = soi.getStructFieldRef("string_field"); Assert.assertEquals(10, stringField.getFieldID()); Assert.assertEquals("string_field", stringField.getFieldName()); Assert.assertEquals("string comment", stringField.getFieldComment()); - Assert.assertEquals(getPrimitiveObjectInspector(String.class), stringField.getFieldObjectInspector()); + Assert.assertEquals( + getPrimitiveObjectInspector(String.class), stringField.getFieldObjectInspector()); // timestamp without tz StructField timestampField = soi.getStructFieldRef("timestamp_field"); @@ -155,10 +170,12 @@ public void testIcebergObjectInspector() { Assert.assertEquals("timestamp_field", timestampField.getFieldName()); Assert.assertEquals("timestamp comment", timestampField.getFieldComment()); if (MetastoreUtil.hive3PresentOnClasspath()) { - Assert.assertEquals("IcebergTimestampObjectInspectorHive3", + Assert.assertEquals( + "IcebergTimestampObjectInspectorHive3", timestampField.getFieldObjectInspector().getClass().getSimpleName()); } else { - 
Assert.assertEquals(IcebergTimestampObjectInspector.get(), timestampField.getFieldObjectInspector()); + Assert.assertEquals( + IcebergTimestampObjectInspector.get(), timestampField.getFieldObjectInspector()); } // timestamp with tz @@ -167,10 +184,13 @@ public void testIcebergObjectInspector() { Assert.assertEquals("timestamptz_field", timestampTzField.getFieldName()); Assert.assertEquals("timestamptz comment", timestampTzField.getFieldComment()); if (MetastoreUtil.hive3PresentOnClasspath()) { - Assert.assertEquals("IcebergTimestampWithZoneObjectInspectorHive3", + Assert.assertEquals( + "IcebergTimestampWithZoneObjectInspectorHive3", timestampTzField.getFieldObjectInspector().getClass().getSimpleName()); } else { - Assert.assertEquals(IcebergTimestampWithZoneObjectInspector.get(), timestampTzField.getFieldObjectInspector()); + Assert.assertEquals( + IcebergTimestampWithZoneObjectInspector.get(), + timestampTzField.getFieldObjectInspector()); } // UUID @@ -192,7 +212,8 @@ public void testIcebergObjectInspector() { Assert.assertEquals(16, mapField.getFieldID()); Assert.assertEquals("map_field", mapField.getFieldName()); Assert.assertEquals("map comment", mapField.getFieldComment()); - Assert.assertEquals(getMapObjectInspector(String.class, int.class), mapField.getFieldObjectInspector()); + Assert.assertEquals( + getMapObjectInspector(String.class, int.class), mapField.getFieldObjectInspector()); // struct StructField structField = soi.getStructFieldRef("struct_field"); @@ -200,8 +221,10 @@ public void testIcebergObjectInspector() { Assert.assertEquals("struct_field", structField.getFieldName()); Assert.assertEquals("struct comment", structField.getFieldComment()); - ObjectInspector expectedObjectInspector = new IcebergRecordObjectInspector( - (Types.StructType) schema.findType(19), ImmutableList.of(getPrimitiveObjectInspector(String.class))); + ObjectInspector expectedObjectInspector = + new IcebergRecordObjectInspector( + (Types.StructType) schema.findType(19), + ImmutableList.of(getPrimitiveObjectInspector(String.class))); Assert.assertEquals(expectedObjectInspector, structField.getFieldObjectInspector()); // time @@ -213,17 +236,18 @@ public void testIcebergObjectInspector() { } private static ObjectInspector getPrimitiveObjectInspector(Class clazz) { - PrimitiveTypeInfo typeInfo = (PrimitiveTypeInfo) TypeInfoFactory.getPrimitiveTypeInfoFromJavaPrimitive(clazz); + PrimitiveTypeInfo typeInfo = + (PrimitiveTypeInfo) TypeInfoFactory.getPrimitiveTypeInfoFromJavaPrimitive(clazz); return PrimitiveObjectInspectorFactory.getPrimitiveJavaObjectInspector(typeInfo); } private static ObjectInspector getListObjectInspector(Class clazz) { - return ObjectInspectorFactory.getStandardListObjectInspector(getPrimitiveObjectInspector(clazz)); + return ObjectInspectorFactory.getStandardListObjectInspector( + getPrimitiveObjectInspector(clazz)); } private static ObjectInspector getMapObjectInspector(Class keyClazz, Class valueClazz) { return ObjectInspectorFactory.getStandardMapObjectInspector( - getPrimitiveObjectInspector(keyClazz), getPrimitiveObjectInspector(valueClazz)); + getPrimitiveObjectInspector(keyClazz), getPrimitiveObjectInspector(valueClazz)); } - } diff --git a/mr/src/test/java/org/apache/iceberg/mr/hive/serde/objectinspector/TestIcebergRecordObjectInspector.java b/mr/src/test/java/org/apache/iceberg/mr/hive/serde/objectinspector/TestIcebergRecordObjectInspector.java index a07415361c9c..4ed358c116fb 100644 --- 
a/mr/src/test/java/org/apache/iceberg/mr/hive/serde/objectinspector/TestIcebergRecordObjectInspector.java +++ b/mr/src/test/java/org/apache/iceberg/mr/hive/serde/objectinspector/TestIcebergRecordObjectInspector.java @@ -16,9 +16,10 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.mr.hive.serde.objectinspector; +import static org.apache.iceberg.types.Types.NestedField.required; + import org.apache.hadoop.hive.serde2.objectinspector.StructField; import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector; import org.apache.iceberg.Schema; @@ -29,23 +30,25 @@ import org.junit.Assert; import org.junit.Test; -import static org.apache.iceberg.types.Types.NestedField.required; - public class TestIcebergRecordObjectInspector { @Test public void testIcebergRecordObjectInspector() { - Schema schema = new Schema( + Schema schema = + new Schema( required(1, "integer_field", Types.IntegerType.get()), - required(2, "struct_field", Types.StructType.of( - Types.NestedField.required(3, "string_field", Types.StringType.get()))) - ); + required( + 2, + "struct_field", + Types.StructType.of( + Types.NestedField.required(3, "string_field", Types.StringType.get())))); Record record = RandomGenericData.generate(schema, 1, 0L).get(0); Record innerRecord = record.get(1, Record.class); StructObjectInspector soi = (StructObjectInspector) IcebergObjectInspector.create(schema); - Assert.assertEquals(ImmutableList.of(record.get(0), record.get(1)), soi.getStructFieldsDataAsList(record)); + Assert.assertEquals( + ImmutableList.of(record.get(0), record.get(1)), soi.getStructFieldsDataAsList(record)); StructField integerField = soi.getStructFieldRef("integer_field"); Assert.assertEquals(record.get(0), soi.getStructFieldData(record, integerField)); @@ -57,21 +60,24 @@ public void testIcebergRecordObjectInspector() { StructObjectInspector innerSoi = (StructObjectInspector) structField.getFieldObjectInspector(); StructField stringField = innerSoi.getStructFieldRef("string_field"); - Assert.assertEquals(ImmutableList.of(innerRecord.get(0)), innerSoi.getStructFieldsDataAsList(innerRecord)); + Assert.assertEquals( + ImmutableList.of(innerRecord.get(0)), innerSoi.getStructFieldsDataAsList(innerRecord)); Assert.assertEquals(innerRecord.get(0), innerSoi.getStructFieldData(innerData, stringField)); } @Test public void testIcebergRecordObjectInspectorWithRowNull() { - Schema schema = new Schema( - required(1, "integer_field", Types.IntegerType.get()), - required(2, "struct_field", Types.StructType.of( - Types.NestedField.required(3, "string_field", Types.StringType.get()))) - ); + Schema schema = + new Schema( + required(1, "integer_field", Types.IntegerType.get()), + required( + 2, + "struct_field", + Types.StructType.of( + Types.NestedField.required(3, "string_field", Types.StringType.get())))); StructObjectInspector soi = (StructObjectInspector) IcebergObjectInspector.create(schema); Assert.assertNull(soi.getStructFieldsDataAsList(null)); StructField integerField = soi.getStructFieldRef("integer_field"); Assert.assertNull(soi.getStructFieldData(null, integerField)); } - } diff --git a/mr/src/test/java/org/apache/iceberg/mr/hive/serde/objectinspector/TestIcebergTimeObjectInspector.java b/mr/src/test/java/org/apache/iceberg/mr/hive/serde/objectinspector/TestIcebergTimeObjectInspector.java index c635920ed4ca..04c3c710f27d 100644 --- a/mr/src/test/java/org/apache/iceberg/mr/hive/serde/objectinspector/TestIcebergTimeObjectInspector.java +++ 
b/mr/src/test/java/org/apache/iceberg/mr/hive/serde/objectinspector/TestIcebergTimeObjectInspector.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.mr.hive.serde.objectinspector; import java.time.LocalTime; @@ -35,7 +34,8 @@ public void testIcebergTimeObjectInspector() { IcebergTimeObjectInspector oi = IcebergTimeObjectInspector.get(); Assert.assertEquals(ObjectInspector.Category.PRIMITIVE, oi.getCategory()); - Assert.assertEquals(PrimitiveObjectInspector.PrimitiveCategory.STRING, oi.getPrimitiveCategory()); + Assert.assertEquals( + PrimitiveObjectInspector.PrimitiveCategory.STRING, oi.getPrimitiveCategory()); Assert.assertEquals(TypeInfoFactory.stringTypeInfo, oi.getTypeInfo()); Assert.assertEquals(TypeInfoFactory.stringTypeInfo.getTypeName(), oi.getTypeName()); diff --git a/mr/src/test/java/org/apache/iceberg/mr/hive/serde/objectinspector/TestIcebergTimestampObjectInspector.java b/mr/src/test/java/org/apache/iceberg/mr/hive/serde/objectinspector/TestIcebergTimestampObjectInspector.java index 85fe3730e129..9205d7c0d7f8 100644 --- a/mr/src/test/java/org/apache/iceberg/mr/hive/serde/objectinspector/TestIcebergTimestampObjectInspector.java +++ b/mr/src/test/java/org/apache/iceberg/mr/hive/serde/objectinspector/TestIcebergTimestampObjectInspector.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.mr.hive.serde.objectinspector; import java.sql.Timestamp; @@ -35,7 +34,8 @@ public void testIcebergTimestampObjectInspector() { IcebergTimestampObjectInspector oi = IcebergTimestampObjectInspector.get(); Assert.assertEquals(ObjectInspector.Category.PRIMITIVE, oi.getCategory()); - Assert.assertEquals(PrimitiveObjectInspector.PrimitiveCategory.TIMESTAMP, oi.getPrimitiveCategory()); + Assert.assertEquals( + PrimitiveObjectInspector.PrimitiveCategory.TIMESTAMP, oi.getPrimitiveCategory()); Assert.assertEquals(TypeInfoFactory.timestampTypeInfo, oi.getTypeInfo()); Assert.assertEquals(TypeInfoFactory.timestampTypeInfo.getTypeName(), oi.getTypeName()); @@ -63,5 +63,4 @@ public void testIcebergTimestampObjectInspector() { Assert.assertEquals(local, oi.convert(ts)); } - } diff --git a/mr/src/test/java/org/apache/iceberg/mr/hive/serde/objectinspector/TestIcebergTimestampWithZoneObjectInspector.java b/mr/src/test/java/org/apache/iceberg/mr/hive/serde/objectinspector/TestIcebergTimestampWithZoneObjectInspector.java index 2aef6d712e59..20caec44f7bd 100644 --- a/mr/src/test/java/org/apache/iceberg/mr/hive/serde/objectinspector/TestIcebergTimestampWithZoneObjectInspector.java +++ b/mr/src/test/java/org/apache/iceberg/mr/hive/serde/objectinspector/TestIcebergTimestampWithZoneObjectInspector.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.mr.hive.serde.objectinspector; import java.sql.Timestamp; @@ -37,7 +36,8 @@ public void testIcebergTimestampObjectInspectorWithUTCAdjustment() { IcebergTimestampWithZoneObjectInspector oi = IcebergTimestampWithZoneObjectInspector.get(); Assert.assertEquals(ObjectInspector.Category.PRIMITIVE, oi.getCategory()); - Assert.assertEquals(PrimitiveObjectInspector.PrimitiveCategory.TIMESTAMP, oi.getPrimitiveCategory()); + Assert.assertEquals( + PrimitiveObjectInspector.PrimitiveCategory.TIMESTAMP, oi.getPrimitiveCategory()); Assert.assertEquals(TypeInfoFactory.timestampTypeInfo, oi.getTypeInfo()); Assert.assertEquals(TypeInfoFactory.timestampTypeInfo.getTypeName(), oi.getTypeName()); @@ -64,11 +64,12 @@ public void testIcebergTimestampObjectInspectorWithUTCAdjustment() { Assert.assertFalse(oi.preferWritable()); - Assert.assertEquals(OffsetDateTime.ofInstant(local.toInstant(ZoneOffset.ofHours(-5)), ZoneOffset.UTC), - oi.convert(ts)); + Assert.assertEquals( + OffsetDateTime.ofInstant(local.toInstant(ZoneOffset.ofHours(-5)), ZoneOffset.UTC), + oi.convert(ts)); - Assert.assertEquals(offsetDateTime.withOffsetSameInstant(ZoneOffset.UTC), - oi.convert(Timestamp.from(offsetDateTime.toInstant()))); + Assert.assertEquals( + offsetDateTime.withOffsetSameInstant(ZoneOffset.UTC), + oi.convert(Timestamp.from(offsetDateTime.toInstant()))); } - } diff --git a/mr/src/test/java/org/apache/iceberg/mr/hive/serde/objectinspector/TestIcebergUUIDObjectInspector.java b/mr/src/test/java/org/apache/iceberg/mr/hive/serde/objectinspector/TestIcebergUUIDObjectInspector.java index cbf55e3f3883..303cabc1cc15 100644 --- a/mr/src/test/java/org/apache/iceberg/mr/hive/serde/objectinspector/TestIcebergUUIDObjectInspector.java +++ b/mr/src/test/java/org/apache/iceberg/mr/hive/serde/objectinspector/TestIcebergUUIDObjectInspector.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.mr.hive.serde.objectinspector; import java.util.UUID; @@ -34,7 +33,8 @@ public void testIcebergUUIDObjectInspector() { IcebergUUIDObjectInspector oi = IcebergUUIDObjectInspector.get(); Assert.assertEquals(ObjectInspector.Category.PRIMITIVE, oi.getCategory()); - Assert.assertEquals(PrimitiveObjectInspector.PrimitiveCategory.STRING, oi.getPrimitiveCategory()); + Assert.assertEquals( + PrimitiveObjectInspector.PrimitiveCategory.STRING, oi.getPrimitiveCategory()); Assert.assertEquals(TypeInfoFactory.stringTypeInfo, oi.getTypeInfo()); Assert.assertEquals(TypeInfoFactory.stringTypeInfo.getTypeName(), oi.getTypeName()); diff --git a/nessie/src/main/java/org/apache/iceberg/nessie/NessieCatalog.java b/nessie/src/main/java/org/apache/iceberg/nessie/NessieCatalog.java index f5509395c223..b48509becc73 100644 --- a/nessie/src/main/java/org/apache/iceberg/nessie/NessieCatalog.java +++ b/nessie/src/main/java/org/apache/iceberg/nessie/NessieCatalog.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.nessie; import java.io.IOException; @@ -55,13 +54,13 @@ /** * Nessie implementation of Iceberg Catalog. * - *
    - * A note on namespaces: Nessie namespaces are implicit and do not need to be explicitly created or deleted. - * The create and delete namespace methods are no-ops for the NessieCatalog. One can still list namespaces that have - * objects stored in them to assist with namespace-centric catalog exploration. - *
+ *
    A note on namespaces: Nessie namespaces are implicit and do not need to be explicitly created + * or deleted. The create and delete namespace methods are no-ops for the NessieCatalog. One can + * still list namespaces that have objects stored in them to assist with namespace-centric catalog + * exploration. */ -public class NessieCatalog extends BaseMetastoreCatalog implements AutoCloseable, SupportsNamespaces, Configurable { +public class NessieCatalog extends BaseMetastoreCatalog + implements AutoCloseable, SupportsNamespaces, Configurable { private static final Logger LOG = LoggerFactory.getLogger(NessieCatalog.class); private static final Joiner SLASH = Joiner.on("/"); @@ -73,8 +72,7 @@ public class NessieCatalog extends BaseMetastoreCatalog implements AutoCloseable private Map catalogOptions; private CloseableGroup closeableGroup; - public NessieCatalog() { - } + public NessieCatalog() {} @SuppressWarnings("checkstyle:HiddenField") @Override @@ -82,33 +80,44 @@ public void initialize(String name, Map options) { Map catalogOptions = ImmutableMap.copyOf(options); String fileIOImpl = options.get(CatalogProperties.FILE_IO_IMPL); // remove nessie prefix - final Function removePrefix = x -> x.replace(NessieUtil.NESSIE_CONFIG_PREFIX, ""); - final String requestedRef = options.get(removePrefix.apply(NessieConfigConstants.CONF_NESSIE_REF)); - String requestedHash = options.get(removePrefix.apply(NessieConfigConstants.CONF_NESSIE_REF_HASH)); - NessieApiV1 api = createNessieClientBuilder(options.get(NessieConfigConstants.CONF_NESSIE_CLIENT_BUILDER_IMPL)) - .fromConfig(x -> options.get(removePrefix.apply(x))) - .build(NessieApiV1.class); + final Function removePrefix = + x -> x.replace(NessieUtil.NESSIE_CONFIG_PREFIX, ""); + final String requestedRef = + options.get(removePrefix.apply(NessieConfigConstants.CONF_NESSIE_REF)); + String requestedHash = + options.get(removePrefix.apply(NessieConfigConstants.CONF_NESSIE_REF_HASH)); + NessieApiV1 api = + createNessieClientBuilder( + options.get(NessieConfigConstants.CONF_NESSIE_CLIENT_BUILDER_IMPL)) + .fromConfig(x -> options.get(removePrefix.apply(x))) + .build(NessieApiV1.class); - initialize(name, + initialize( + name, new NessieIcebergClient(api, requestedRef, requestedHash, catalogOptions), - fileIOImpl == null ? new HadoopFileIO(config) : CatalogUtil.loadFileIO(fileIOImpl, options, config), + fileIOImpl == null + ? new HadoopFileIO(config) + : CatalogUtil.loadFileIO(fileIOImpl, options, config), catalogOptions); } /** - * An alternative way to initialize the catalog using a pre-configured {@link NessieIcebergClient} and {@link FileIO} - * instance. + * An alternative way to initialize the catalog using a pre-configured {@link NessieIcebergClient} + * and {@link FileIO} instance. + * * @param name The name of the catalog, defaults to "nessie" if null * @param client The pre-configured {@link NessieIcebergClient} instance to use * @param fileIO The {@link FileIO} instance to use * @param catalogOptions The catalog options to use */ @SuppressWarnings("checkstyle:HiddenField") - public void initialize(String name, NessieIcebergClient client, FileIO fileIO, Map catalogOptions) { + public void initialize( + String name, NessieIcebergClient client, FileIO fileIO, Map catalogOptions) { this.name = name == null ? 
"nessie" : name; this.client = Preconditions.checkNotNull(client, "client must be non-null"); this.fileIO = Preconditions.checkNotNull(fileIO, "fileIO must be non-null"); - this.catalogOptions = Preconditions.checkNotNull(catalogOptions, "catalogOptions must be non-null"); + this.catalogOptions = + Preconditions.checkNotNull(catalogOptions, "catalogOptions must be non-null"); this.warehouseLocation = validateWarehouseLocation(name, catalogOptions); this.closeableGroup = new CloseableGroup(); closeableGroup.addCloseable(client); @@ -120,11 +129,13 @@ public void initialize(String name, NessieIcebergClient client, FileIO fileIO, M private String validateWarehouseLocation(String name, Map catalogOptions) { String warehouseLocation = catalogOptions.get(CatalogProperties.WAREHOUSE_LOCATION); if (warehouseLocation == null) { - // Explicitly log a warning, otherwise the thrown exception can get list in the "silent-ish catch" + // Explicitly log a warning, otherwise the thrown exception can get list in the "silent-ish + // catch" // in o.a.i.spark.Spark3Util.catalogAndIdentifier(o.a.s.sql.SparkSession, List, // o.a.s.sql.connector.catalog.CatalogPlugin) // in the code block - // Pair catalogIdentifier = SparkUtil.catalogAndIdentifier(nameParts, + // Pair catalogIdentifier = + // SparkUtil.catalogAndIdentifier(nameParts, // catalogName -> { // try { // return catalogManager.catalog(catalogName); @@ -136,8 +147,11 @@ private String validateWarehouseLocation(String name, Map catalo // defaultCatalog, // currentNamespace // ); - LOG.warn("Catalog creation for inputName={} and options {} failed, because parameter " + - "'warehouse' is not set, Nessie can't store data.", name, catalogOptions); + LOG.warn( + "Catalog creation for inputName={} and options {} failed, because parameter " + + "'warehouse' is not set, Nessie can't store data.", + name, + catalogOptions); throw new IllegalStateException("Parameter 'warehouse' not set, Nessie can't store data."); } return warehouseLocation; @@ -147,9 +161,11 @@ private static NessieClientBuilder createNessieClientBuilder(String customBui NessieClientBuilder clientBuilder; if (customBuilder != null) { try { - clientBuilder = DynMethods.builder("builder").impl(customBuilder).build().asStatic().invoke(); + clientBuilder = + DynMethods.builder("builder").impl(customBuilder).build().asStatic().invoke(); } catch (Exception e) { - throw new RuntimeException(String.format("Failed to use custom NessieClientBuilder '%s'.", customBuilder), e); + throw new RuntimeException( + String.format("Failed to use custom NessieClientBuilder '%s'.", customBuilder), e); } } else { clientBuilder = HttpClientBuilder.builder(); @@ -173,7 +189,9 @@ public String name() { protected TableOperations newTableOps(TableIdentifier tableIdentifier) { TableReference tr = parseTableReference(tableIdentifier); return new NessieTableOperations( - ContentKey.of(org.projectnessie.model.Namespace.of(tableIdentifier.namespace().levels()), tr.getName()), + ContentKey.of( + org.projectnessie.model.Namespace.of(tableIdentifier.namespace().levels()), + tr.getName()), client.withReference(tr.getReference(), tr.getHash()), fileIO, catalogOptions); @@ -195,7 +213,8 @@ public List listTables(Namespace namespace) { @Override public boolean dropTable(TableIdentifier identifier, boolean purge) { TableReference tableReference = parseTableReference(identifier); - return client.withReference(tableReference.getReference(), tableReference.getHash()) + return client + .withReference(tableReference.getReference(), 
tableReference.getHash()) .dropTable(identifierWithoutTableReference(identifier, tableReference), purge); } @@ -204,16 +223,25 @@ public void renameTable(TableIdentifier from, TableIdentifier to) { TableReference fromTableReference = parseTableReference(from); TableReference toTableReference = parseTableReference(to); String fromReference = - fromTableReference.hasReference() ? fromTableReference.getReference() : client.getRef().getName(); + fromTableReference.hasReference() + ? fromTableReference.getReference() + : client.getRef().getName(); String toReference = - toTableReference.hasReference() ? toTableReference.getReference() : client.getRef().getName(); + toTableReference.hasReference() + ? toTableReference.getReference() + : client.getRef().getName(); Preconditions.checkArgument( fromReference.equalsIgnoreCase(toReference), - "from: %s and to: %s reference name must be same", fromReference, toReference); + "from: %s and to: %s reference name must be same", + fromReference, + toReference); - client.withReference(fromTableReference.getReference(), fromTableReference.getHash()).renameTable( - identifierWithoutTableReference(from, fromTableReference), - NessieUtil.removeCatalogName(identifierWithoutTableReference(to, toTableReference), name())); + client + .withReference(fromTableReference.getReference(), fromTableReference.getHash()) + .renameTable( + identifierWithoutTableReference(from, fromTableReference), + NessieUtil.removeCatalogName( + identifierWithoutTableReference(to, toTableReference), name())); } @Override @@ -227,14 +255,16 @@ public List listNamespaces(Namespace namespace) throws NoSuchNamespac } /** - * Load the given namespace but return an empty map because namespace properties are currently not supported. + * Load the given namespace but return an empty map because namespace properties are currently not + * supported. * * @param namespace a namespace. 
{@link Namespace} * @return an empty map * @throws NoSuchNamespaceException If the namespace does not exist */ @Override - public Map loadNamespaceMetadata(Namespace namespace) throws NoSuchNamespaceException { + public Map loadNamespaceMetadata(Namespace namespace) + throws NoSuchNamespaceException { return client.loadNamespaceMetadata(namespace); } @@ -280,12 +310,15 @@ FileIO fileIO() { private TableReference parseTableReference(TableIdentifier tableIdentifier) { TableReference tr = TableReference.parse(tableIdentifier.name()); - Preconditions.checkArgument(!tr.hasTimestamp(), "Invalid table name: # is only allowed for hashes (reference by " + - "timestamp is not supported)"); + Preconditions.checkArgument( + !tr.hasTimestamp(), + "Invalid table name: # is only allowed for hashes (reference by " + + "timestamp is not supported)"); return tr; } - private TableIdentifier identifierWithoutTableReference(TableIdentifier identifier, TableReference tableReference) { + private TableIdentifier identifierWithoutTableReference( + TableIdentifier identifier, TableReference tableReference) { if (tableReference.hasReference()) { return TableIdentifier.of(identifier.namespace(), tableReference.getName()); } diff --git a/nessie/src/main/java/org/apache/iceberg/nessie/NessieIcebergClient.java b/nessie/src/main/java/org/apache/iceberg/nessie/NessieIcebergClient.java index 39adeba671e1..2794c02ccbb0 100644 --- a/nessie/src/main/java/org/apache/iceberg/nessie/NessieIcebergClient.java +++ b/nessie/src/main/java/org/apache/iceberg/nessie/NessieIcebergClient.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.nessie; import java.util.Arrays; @@ -68,7 +67,10 @@ public class NessieIcebergClient implements AutoCloseable { private final Map catalogOptions; public NessieIcebergClient( - NessieApiV1 api, String requestedRef, String requestedHash, Map catalogOptions) { + NessieApiV1 api, + String requestedRef, + String requestedHash, + Map catalogOptions) { this.api = api; this.catalogOptions = catalogOptions; this.reference = Suppliers.memoize(() -> loadReference(requestedRef, requestedHash)); @@ -87,8 +89,9 @@ public void refresh() throws NessieNotFoundException { } public NessieIcebergClient withReference(String requestedRef, String hash) { - if (null == requestedRef || - (getRef().getReference().getName().equals(requestedRef) && getRef().getHash().equals(hash))) { + if (null == requestedRef + || (getRef().getReference().getName().equals(requestedRef) + && getRef().getHash().equals(hash))) { return this; } return new NessieIcebergClient(getApi(), requestedRef, hash, catalogOptions); @@ -97,7 +100,9 @@ public NessieIcebergClient withReference(String requestedRef, String hash) { private UpdateableReference loadReference(String requestedRef, String hash) { try { Reference ref = - requestedRef == null ? api.getDefaultBranch() : api.getReference().refName(requestedRef).get(); + requestedRef == null + ? 
api.getDefaultBranch() + : api.getReference().refName(requestedRef).get(); if (hash != null) { if (ref instanceof Branch) { ref = Branch.of(ref.getName(), hash); @@ -108,28 +113,29 @@ private UpdateableReference loadReference(String requestedRef, String hash) { return new UpdateableReference(ref, hash != null); } catch (NessieNotFoundException ex) { if (requestedRef != null) { - throw new IllegalArgumentException(String.format("Nessie ref '%s' does not exist", requestedRef), ex); + throw new IllegalArgumentException( + String.format("Nessie ref '%s' does not exist", requestedRef), ex); } - throw new IllegalArgumentException(String.format("Nessie does not have an existing default branch. " + - "Either configure an alternative ref via '%s' or create the default branch on the server.", - NessieConfigConstants.CONF_NESSIE_REF), ex); + throw new IllegalArgumentException( + String.format( + "Nessie does not have an existing default branch. " + + "Either configure an alternative ref via '%s' or create the default branch on the server.", + NessieConfigConstants.CONF_NESSIE_REF), + ex); } } public List listTables(Namespace namespace) { try { - return api.getEntries() - .reference(getRef().getReference()) - .get() - .getEntries() - .stream() + return api.getEntries().reference(getRef().getReference()).get().getEntries().stream() .filter(namespacePredicate(namespace)) .filter(e -> Content.Type.ICEBERG_TABLE == e.getType()) .map(this::toIdentifier) .collect(Collectors.toList()); } catch (NessieNotFoundException ex) { - throw new NoSuchNamespaceException(ex, "Unable to list tables due to missing ref '%s'", getRef().getName()); + throw new NoSuchNamespaceException( + ex, "Unable to list tables due to missing ref '%s'", getRef().getName()); } } @@ -168,7 +174,8 @@ public IcebergTable table(TableIdentifier tableIdentifier) { public void createNamespace(Namespace namespace, Map metadata) { try { getRef().checkMutable(); - getApi().createNamespace() + getApi() + .createNamespace() .reference(getRef().getReference()) .namespace(org.projectnessie.model.Namespace.of(namespace.levels())) .properties(metadata) @@ -177,31 +184,39 @@ public void createNamespace(Namespace namespace, Map metadata) { } catch (NessieNamespaceAlreadyExistsException e) { throw new AlreadyExistsException(e, "Namespace already exists: %s", namespace); } catch (NessieNotFoundException e) { - throw new RuntimeException(String.format("Cannot create Namespace '%s': " + - "ref '%s' is no longer valid.", namespace, getRef().getName()), e); + throw new RuntimeException( + String.format( + "Cannot create Namespace '%s': " + "ref '%s' is no longer valid.", + namespace, getRef().getName()), + e); } } public List listNamespaces(Namespace namespace) throws NoSuchNamespaceException { try { - GetNamespacesResponse response = getApi().getMultipleNamespaces() - .reference(getRef().getReference()) - .namespace(org.projectnessie.model.Namespace.of(namespace.levels())) - .get(); + GetNamespacesResponse response = + getApi() + .getMultipleNamespaces() + .reference(getRef().getReference()) + .namespace(org.projectnessie.model.Namespace.of(namespace.levels())) + .get(); return response.getNamespaces().stream() .map(ns -> Namespace.of(ns.getElements().toArray(new String[0]))) .collect(Collectors.toList()); } catch (NessieReferenceNotFoundException e) { throw new RuntimeException( - String.format("Cannot list Namespaces starting from '%s': " + - "ref '%s' is no longer valid.", namespace, getRef().getName()), e); + String.format( + "Cannot list Namespaces 
starting from '%s': " + "ref '%s' is no longer valid.", + namespace, getRef().getName()), + e); } } public boolean dropNamespace(Namespace namespace) throws NamespaceNotEmptyException { try { getRef().checkMutable(); - getApi().deleteNamespace() + getApi() + .deleteNamespace() .reference(getRef().getReference()) .namespace(org.projectnessie.model.Namespace.of(namespace.levels())) .delete(); @@ -210,16 +225,23 @@ public boolean dropNamespace(Namespace namespace) throws NamespaceNotEmptyExcept } catch (NessieNamespaceNotFoundException e) { return false; } catch (NessieNotFoundException e) { - LOG.error("Cannot drop Namespace '{}': ref '{}' is no longer valid.", namespace, getRef().getName(), e); + LOG.error( + "Cannot drop Namespace '{}': ref '{}' is no longer valid.", + namespace, + getRef().getName(), + e); return false; } catch (NessieNamespaceNotEmptyException e) { - throw new NamespaceNotEmptyException(e, "Namespace '%s' is not empty. One or more tables exist.", namespace); + throw new NamespaceNotEmptyException( + e, "Namespace '%s' is not empty. One or more tables exist.", namespace); } } - public Map loadNamespaceMetadata(Namespace namespace) throws NoSuchNamespaceException { + public Map loadNamespaceMetadata(Namespace namespace) + throws NoSuchNamespaceException { try { - return getApi().getNamespace() + return getApi() + .getNamespace() .reference(getRef().getReference()) .namespace(org.projectnessie.model.Namespace.of(namespace.levels())) .get() @@ -227,8 +249,11 @@ public Map loadNamespaceMetadata(Namespace namespace) throws NoS } catch (NessieNamespaceNotFoundException e) { throw new NoSuchNamespaceException(e, "Namespace does not exist: %s", namespace); } catch (NessieReferenceNotFoundException e) { - throw new RuntimeException(String.format("Cannot load Namespace '%s': " + - "ref '%s' is no longer valid.", namespace, getRef().getName()), e); + throw new RuntimeException( + String.format( + "Cannot load Namespace '%s': " + "ref '%s' is no longer valid.", + namespace, getRef().getName()), + e); } } @@ -247,8 +272,10 @@ public boolean setProperties(Namespace namespace, Map properties throw new NoSuchNamespaceException(e, "Namespace does not exist: %s", namespace); } catch (NessieNotFoundException e) { throw new RuntimeException( - String.format("Cannot update properties on Namespace '%s': ref '%s' is no longer valid.", - namespace, getRef().getName()), e); + String.format( + "Cannot update properties on Namespace '%s': ref '%s' is no longer valid.", + namespace, getRef().getName()), + e); } } @@ -267,8 +294,10 @@ public boolean removeProperties(Namespace namespace, Set properties) { throw new NoSuchNamespaceException(e, "Namespace does not exist: %s", namespace); } catch (NessieNotFoundException e) { throw new RuntimeException( - String.format("Cannot remove properties from Namespace '%s': ref '%s' is no longer valid.", - namespace, getRef().getName()), e); + String.format( + "Cannot remove properties from Namespace '%s': ref '%s' is no longer valid.", + namespace, getRef().getName()), + e); } } @@ -284,11 +313,15 @@ public void renameTable(TableIdentifier from, TableIdentifier to) { throw new AlreadyExistsException("Table already exists: %s", to.name()); } - CommitMultipleOperationsBuilder operations = getApi().commitMultipleOperations() - .commitMeta(NessieUtil.buildCommitMetadata(String.format("Iceberg rename table from '%s' to '%s'", - from, to), catalogOptions)) - .operation(Operation.Put.of(NessieUtil.toKey(to), existingFromTable, existingFromTable)) - 
.operation(Operation.Delete.of(NessieUtil.toKey(from))); + CommitMultipleOperationsBuilder operations = + getApi() + .commitMultipleOperations() + .commitMeta( + NessieUtil.buildCommitMetadata( + String.format("Iceberg rename table from '%s' to '%s'", from, to), + catalogOptions)) + .operation(Operation.Put.of(NessieUtil.toKey(to), existingFromTable, existingFromTable)) + .operation(Operation.Delete.of(NessieUtil.toKey(from))); try { Tasks.foreach(operations) @@ -296,22 +329,31 @@ public void renameTable(TableIdentifier from, TableIdentifier to) { .stopRetryOn(NessieNotFoundException.class) .throwFailureWhenFinished() .onFailure((o, exception) -> refresh()) - .run(ops -> { - Branch branch = ops - .branch(getRef().getAsBranch()) - .commit(); - getRef().updateReference(branch); - }, BaseNessieClientServerException.class); + .run( + ops -> { + Branch branch = ops.branch(getRef().getAsBranch()).commit(); + getRef().updateReference(branch); + }, + BaseNessieClientServerException.class); } catch (NessieNotFoundException e) { - // important note: the NotFoundException refers to the ref only. If a table was not found it would imply that the - // another commit has deleted the table from underneath us. This would arise as a Conflict exception as opposed to - // a not found exception. This is analogous to a merge conflict in git when a table has been changed by one user + // important note: the NotFoundException refers to the ref only. If a table was not found it + // would imply that the + // another commit has deleted the table from underneath us. This would arise as a Conflict + // exception as opposed to + // a not found exception. This is analogous to a merge conflict in git when a table has been + // changed by one user // and removed by another. - throw new RuntimeException(String.format("Cannot rename table '%s' to '%s': " + - "ref '%s' no longer exists.", from.name(), to.name(), getRef().getName()), e); + throw new RuntimeException( + String.format( + "Cannot rename table '%s' to '%s': " + "ref '%s' no longer exists.", + from.name(), to.name(), getRef().getName()), + e); } catch (BaseNessieClientServerException e) { - throw new CommitFailedException(e, "Cannot rename table '%s' to '%s': " + - "the current reference is not up to date.", from.name(), to.name()); + throw new CommitFailedException( + e, + "Cannot rename table '%s' to '%s': " + "the current reference is not up to date.", + from.name(), + to.name()); } catch (HttpClientException ex) { // Intentionally catch all nessie-client-exceptions here and not just the "timeout" variant // to catch all kinds of network errors (e.g. connection reset). Network code implementation @@ -319,7 +361,8 @@ public void renameTable(TableIdentifier from, TableIdentifier to) { // safe than sorry. throw new CommitStateUnknownException(ex); } - // Intentionally just "throw through" Nessie's HttpClientException here and do not "special case" + // Intentionally just "throw through" Nessie's HttpClientException here and do not "special + // case" // just the "timeout" variant to propagate all kinds of network errors (e.g. connection reset). // Network code implementation details and all kinds of network devices can induce unexpected // behavior. So better be safe than sorry. 
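The comment above states the error-handling contract these commit paths follow: a NessieConflictException means the commit definitely did not apply and can be reported as a clean failure, while any HttpClientException (timeout, connection reset, or another network fault) leaves the outcome unknown and is surfaced as a CommitStateUnknownException so that callers do not treat possibly committed metadata as failed. A condensed sketch of that classification, using only exception types already referenced in this patch; the commitOrClassify helper and the CommitAction interface are illustrative stand-ins for the commit builders above, not code from the patch:

// Illustrative sketch, not part of this patch.
import org.apache.iceberg.exceptions.CommitFailedException;
import org.apache.iceberg.exceptions.CommitStateUnknownException;
import org.projectnessie.client.http.HttpClientException;
import org.projectnessie.error.NessieConflictException;
import org.projectnessie.model.Branch;

class CommitOutcomeSketch {

  /** Stand-in for the commit builders above: runs one commit attempt against a branch head. */
  interface CommitAction {
    Branch run() throws NessieConflictException;
  }

  /**
   * Classifies one Nessie commit attempt into the three outcomes used in this patch:
   * success, definite failure (conflict on the expected hash), and unknown state (network fault).
   */
  static Branch commitOrClassify(CommitAction action, String refName) {
    try {
      return action.run();
    } catch (NessieConflictException e) {
      // The expected hash is out of date: the commit definitely did not apply.
      throw new CommitFailedException(e, "Reference '%s' is out of date, update and retry", refName);
    } catch (HttpClientException e) {
      // Timeout, connection reset, etc.: the server may or may not have applied the commit,
      // so report an unknown state rather than a failure.
      throw new CommitStateUnknownException(e);
    }
  }
}

Conflicts, by contrast, are safe to retry after a refresh or to report as CommitFailedException, which is exactly what the rename and drop paths in this file do.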
@@ -337,10 +380,13 @@ public boolean dropTable(TableIdentifier identifier, boolean purge) { LOG.info("Purging data for table {} was set to true but is ignored", identifier.toString()); } - CommitMultipleOperationsBuilder commitBuilderBase = getApi().commitMultipleOperations() - .commitMeta(NessieUtil.buildCommitMetadata(String.format("Iceberg delete table %s", identifier), - catalogOptions)) - .operation(Operation.Delete.of(NessieUtil.toKey(identifier))); + CommitMultipleOperationsBuilder commitBuilderBase = + getApi() + .commitMultipleOperations() + .commitMeta( + NessieUtil.buildCommitMetadata( + String.format("Iceberg delete table %s", identifier), catalogOptions)) + .operation(Operation.Delete.of(NessieUtil.toKey(identifier))); // We try to drop the table. Simple retry after ref update. boolean threw = true; @@ -350,15 +396,18 @@ public boolean dropTable(TableIdentifier identifier, boolean purge) { .stopRetryOn(NessieNotFoundException.class) .throwFailureWhenFinished() .onFailure((o, exception) -> refresh()) - .run(commitBuilder -> { - Branch branch = commitBuilder - .branch(getRef().getAsBranch()) - .commit(); - getRef().updateReference(branch); - }, BaseNessieClientServerException.class); + .run( + commitBuilder -> { + Branch branch = commitBuilder.branch(getRef().getAsBranch()).commit(); + getRef().updateReference(branch); + }, + BaseNessieClientServerException.class); threw = false; } catch (NessieConflictException e) { - LOG.error("Cannot drop table: failed after retry (update ref '{}' and retry)", getRef().getName(), e); + LOG.error( + "Cannot drop table: failed after retry (update ref '{}' and retry)", + getRef().getName(), + e); } catch (NessieNotFoundException e) { LOG.error("Cannot drop table: ref '{}' is no longer valid.", getRef().getName(), e); } catch (BaseNessieClientServerException e) { diff --git a/nessie/src/main/java/org/apache/iceberg/nessie/NessieTableOperations.java b/nessie/src/main/java/org/apache/iceberg/nessie/NessieTableOperations.java index 2cbe974bdd1a..2263d8151218 100644 --- a/nessie/src/main/java/org/apache/iceberg/nessie/NessieTableOperations.java +++ b/nessie/src/main/java/org/apache/iceberg/nessie/NessieTableOperations.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.nessie; import com.fasterxml.jackson.databind.JsonNode; @@ -45,9 +44,7 @@ import org.slf4j.Logger; import org.slf4j.LoggerFactory; -/** - * Nessie implementation of Iceberg TableOperations. - */ +/** Nessie implementation of Iceberg TableOperations. */ public class NessieTableOperations extends BaseMetastoreTableOperations { private static final Logger LOG = LoggerFactory.getLogger(NessieTableOperations.class); @@ -64,9 +61,7 @@ public class NessieTableOperations extends BaseMetastoreTableOperations { private final FileIO fileIO; private final Map catalogOptions; - /** - * Create a nessie table operations given a table identifier. - */ + /** Create a nessie table operations given a table identifier. */ NessieTableOperations( ContentKey key, NessieIcebergClient client, @@ -85,21 +80,23 @@ protected String tableName() { private TableMetadata loadTableMetadata(String metadataLocation, Reference reference) { // Update the TableMetadata with the Content of NessieTableState. 
- TableMetadata deserialized = NessieUtil.tableMetadataFromIcebergTable(io(), table, metadataLocation); + TableMetadata deserialized = + NessieUtil.tableMetadataFromIcebergTable(io(), table, metadataLocation); Map newProperties = Maps.newHashMap(deserialized.properties()); newProperties.put(NESSIE_COMMIT_ID_PROPERTY, reference.getHash()); - TableMetadata.Builder builder = TableMetadata.buildFrom(deserialized) - .setPreviousFileLocation(null) - .setCurrentSchema(table.getSchemaId()) - .setDefaultSortOrder(table.getSortOrderId()) - .setDefaultPartitionSpec(table.getSpecId()) - .withMetadataLocation(metadataLocation) - .setProperties(newProperties); + TableMetadata.Builder builder = + TableMetadata.buildFrom(deserialized) + .setPreviousFileLocation(null) + .setCurrentSchema(table.getSchemaId()) + .setDefaultSortOrder(table.getSortOrderId()) + .setDefaultPartitionSpec(table.getSpecId()) + .withMetadataLocation(metadataLocation) + .setProperties(newProperties); if (table.getSnapshotId() != -1) { builder.setBranchSnapshot(table.getSnapshotId(), SnapshotRef.MAIN_BRANCH); } - LOG.info("loadTableMetadata for '{}' from location '{}' at '{}'", key, metadataLocation, - reference); + LOG.info( + "loadTableMetadata for '{}' from location '{}' at '{}'", key, metadataLocation, reference); return builder.discardChanges().build(); } @@ -109,24 +106,31 @@ protected void doRefresh() { try { client.refresh(); } catch (NessieNotFoundException e) { - throw new RuntimeException(String.format("Failed to refresh as ref '%s' " + - "is no longer valid.", client.getRef().getName()), e); + throw new RuntimeException( + String.format( + "Failed to refresh as ref '%s' " + "is no longer valid.", client.getRef().getName()), + e); } String metadataLocation = null; Reference reference = client.getRef().getReference(); try { - Content content = client.getApi().getContent().key(key).reference(reference).get() - .get(key); + Content content = client.getApi().getContent().key(key).reference(reference).get().get(key); LOG.debug("Content '{}' at '{}': {}", key, reference, content); if (content == null) { if (currentMetadataLocation() != null) { throw new NoSuchTableException("No such table '%s' in '%s'", key, reference); } } else { - this.table = content.unwrap(IcebergTable.class) - .orElseThrow( - () -> new IllegalStateException(String.format("Cannot refresh iceberg table: " + - "Nessie points to a non-Iceberg object for path: %s.", key))); + this.table = + content + .unwrap(IcebergTable.class) + .orElseThrow( + () -> + new IllegalStateException( + String.format( + "Cannot refresh iceberg table: " + + "Nessie points to a non-Iceberg object for path: %s.", + key))); metadataLocation = table.getMetadataLocation(); } } catch (NessieNotFoundException ex) { @@ -152,8 +156,10 @@ protected void doCommit(TableMetadata base, TableMetadata metadata) { } } - String newMetadataLocation = (base == null) && (metadata.metadataFileLocation() != null) ? - metadata.metadataFileLocation() : writeNewMetadata(metadata, currentVersion() + 1); + String newMetadataLocation = + (base == null) && (metadata.metadataFileLocation() != null) + ? metadata.metadataFileLocation() + : writeNewMetadata(metadata, currentVersion() + 1); boolean delete = true; try { @@ -165,36 +171,50 @@ protected void doCommit(TableMetadata base, TableMetadata metadata) { long snapshotId = snapshot != null ? 
snapshot.snapshotId() : -1L; JsonNode newMetadata = NessieUtil.tableMetadataAsJsonNode(metadata); - IcebergTable newTable = newTableBuilder - .snapshotId(snapshotId) - .schemaId(metadata.currentSchemaId()) - .specId(metadata.defaultSpecId()) - .sortOrderId(metadata.defaultSortOrderId()) - .metadataLocation(newMetadataLocation) - .metadata( - GenericMetadata.of("org.apache:iceberg:" + metadata.formatVersion(), newMetadata)) - .build(); - - LOG.debug("Committing '{}' against '{}', current is '{}': {}", key, expectedHead, - current.getHash(), newTable); + IcebergTable newTable = + newTableBuilder + .snapshotId(snapshotId) + .schemaId(metadata.currentSchemaId()) + .specId(metadata.defaultSpecId()) + .sortOrderId(metadata.defaultSortOrderId()) + .metadataLocation(newMetadataLocation) + .metadata( + GenericMetadata.of("org.apache:iceberg:" + metadata.formatVersion(), newMetadata)) + .build(); + + LOG.debug( + "Committing '{}' against '{}', current is '{}': {}", + key, + expectedHead, + current.getHash(), + newTable); ImmutableCommitMeta.Builder builder = ImmutableCommitMeta.builder(); builder.message(buildCommitMsg(base, metadata)); if (isSnapshotOperation(base, metadata)) { builder.putProperties("iceberg.operation", snapshot.operation()); } - Branch branch = client.getApi().commitMultipleOperations() - .operation(Operation.Put.of(key, newTable, table)) - .commitMeta(NessieUtil.catalogOptions(builder, catalogOptions).build()) - .branch(expectedHead) - .commit(); - LOG.info("Committed '{}' against '{}', expected commit-id was '{}'", key, branch, + Branch branch = + client + .getApi() + .commitMultipleOperations() + .operation(Operation.Put.of(key, newTable, table)) + .commitMeta(NessieUtil.catalogOptions(builder, catalogOptions).build()) + .branch(expectedHead) + .commit(); + LOG.info( + "Committed '{}' against '{}', expected commit-id was '{}'", + key, + branch, expectedHead.getHash()); updateableReference.updateReference(branch); delete = false; } catch (NessieConflictException ex) { - throw new CommitFailedException(ex, "Cannot commit: Reference hash is out of date. " + - "Update the reference '%s' and try again", updateableReference.getName()); + throw new CommitFailedException( + ex, + "Cannot commit: Reference hash is out of date. " + + "Update the reference '%s' and try again", + updateableReference.getName()); } catch (HttpClientException ex) { // Intentionally catch all nessie-client-exceptions here and not just the "timeout" variant // to catch all kinds of network errors (e.g. connection reset). 
Network code implementation @@ -204,8 +224,9 @@ protected void doCommit(TableMetadata base, TableMetadata metadata) { throw new CommitStateUnknownException(ex); } catch (NessieNotFoundException ex) { throw new RuntimeException( - String.format("Cannot commit: Reference '%s' no longer exists", - updateableReference.getName()), ex); + String.format( + "Cannot commit: Reference '%s' no longer exists", updateableReference.getName()), + ex); } finally { if (delete) { io().deleteFile(newMetadataLocation); @@ -215,14 +236,16 @@ protected void doCommit(TableMetadata base, TableMetadata metadata) { private boolean isSnapshotOperation(TableMetadata base, TableMetadata metadata) { Snapshot snapshot = metadata.currentSnapshot(); - return snapshot != null && (base == null || base.currentSnapshot() == null || - snapshot.snapshotId() != base.currentSnapshot().snapshotId()); + return snapshot != null + && (base == null + || base.currentSnapshot() == null + || snapshot.snapshotId() != base.currentSnapshot().snapshotId()); } private String buildCommitMsg(TableMetadata base, TableMetadata metadata) { if (isSnapshotOperation(base, metadata)) { - return String.format("Iceberg %s against %s", metadata.currentSnapshot().operation(), - tableName()); + return String.format( + "Iceberg %s against %s", metadata.currentSnapshot().operation(), tableName()); } else if (base != null && metadata.currentSchemaId() != base.currentSchemaId()) { return String.format("Iceberg schema change against %s", tableName()); } diff --git a/nessie/src/main/java/org/apache/iceberg/nessie/NessieUtil.java b/nessie/src/main/java/org/apache/iceberg/nessie/NessieUtil.java index d53d34c12272..46f6c54f2f02 100644 --- a/nessie/src/main/java/org/apache/iceberg/nessie/NessieUtil.java +++ b/nessie/src/main/java/org/apache/iceberg/nessie/NessieUtil.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.nessie; import com.fasterxml.jackson.core.JsonGenerator; @@ -48,8 +47,7 @@ public final class NessieUtil { public static final String NESSIE_CONFIG_PREFIX = "nessie."; static final String APPLICATION_TYPE = "application-type"; - private NessieUtil() { - } + private NessieUtil() {} static TableIdentifier removeCatalogName(TableIdentifier to, String name) { @@ -78,21 +76,23 @@ static CommitMeta buildCommitMetadata(String commitMsg, Map cata return catalogOptions(CommitMeta.builder().message(commitMsg), catalogOptions).build(); } - static ImmutableCommitMeta.Builder catalogOptions(ImmutableCommitMeta.Builder commitMetaBuilder, - Map catalogOptions) { + static ImmutableCommitMeta.Builder catalogOptions( + ImmutableCommitMeta.Builder commitMetaBuilder, Map catalogOptions) { Preconditions.checkArgument(null != catalogOptions, "catalogOptions must not be null"); commitMetaBuilder.author(NessieUtil.commitAuthor(catalogOptions)); commitMetaBuilder.putProperties(APPLICATION_TYPE, "iceberg"); if (catalogOptions.containsKey(CatalogProperties.APP_ID)) { - commitMetaBuilder.putProperties(CatalogProperties.APP_ID, catalogOptions.get(CatalogProperties.APP_ID)); + commitMetaBuilder.putProperties( + CatalogProperties.APP_ID, catalogOptions.get(CatalogProperties.APP_ID)); } return commitMetaBuilder; } /** * @param catalogOptions The options where to look for the user - * @return The author that can be used for a commit, which is either the user from the given - * catalogOptions or the logged in user as defined in the user.name JVM properties. 
+ * @return The author that can be used for a commit, which is either the user from the + * given catalogOptions or the logged in user as defined in the user.name + * JVM properties. */ @Nullable private static String commitAuthor(Map catalogOptions) { @@ -100,7 +100,8 @@ private static String commitAuthor(Map catalogOptions) { .orElseGet(() -> System.getProperty("user.name")); } - static TableMetadata tableMetadataFromIcebergTable(FileIO io, IcebergTable table, String metadataLocation) { + static TableMetadata tableMetadataFromIcebergTable( + FileIO io, IcebergTable table, String metadataLocation) { TableMetadata deserialized; if (table.getMetadata() != null) { String jsonString; diff --git a/nessie/src/main/java/org/apache/iceberg/nessie/UpdateableReference.java b/nessie/src/main/java/org/apache/iceberg/nessie/UpdateableReference.java index 20f43433a6fa..7e49457981bf 100644 --- a/nessie/src/main/java/org/apache/iceberg/nessie/UpdateableReference.java +++ b/nessie/src/main/java/org/apache/iceberg/nessie/UpdateableReference.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.nessie; import org.apache.iceberg.relocated.com.google.common.base.Preconditions; @@ -31,8 +30,8 @@ class UpdateableReference { private final boolean mutable; /** - * Construct a new {@link UpdateableReference} using a Nessie reference object and a flag - * whether an explicit hash was used to create the reference object. + * Construct a new {@link UpdateableReference} using a Nessie reference object and a flag whether + * an explicit hash was used to create the reference object. */ UpdateableReference(Reference reference, boolean hashReference) { this.reference = reference; @@ -73,7 +72,8 @@ public Reference getReference() { } public void checkMutable() { - Preconditions.checkArgument(mutable, "You can only mutate tables when using a branch without a hash or timestamp."); + Preconditions.checkArgument( + mutable, "You can only mutate tables when using a branch without a hash or timestamp."); } public String getName() { diff --git a/nessie/src/test/java/org/apache/iceberg/nessie/BaseTestIceberg.java b/nessie/src/test/java/org/apache/iceberg/nessie/BaseTestIceberg.java index 98b0a272cffc..e39516a1c068 100644 --- a/nessie/src/test/java/org/apache/iceberg/nessie/BaseTestIceberg.java +++ b/nessie/src/test/java/org/apache/iceberg/nessie/BaseTestIceberg.java @@ -16,9 +16,10 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.nessie; +import static org.apache.iceberg.types.Types.NestedField.required; + import java.io.IOException; import java.net.URI; import java.nio.file.Path; @@ -66,8 +67,6 @@ import org.slf4j.Logger; import org.slf4j.LoggerFactory; -import static org.apache.iceberg.types.Types.NestedField.required; - @ExtendWith(DatabaseAdapterExtension.class) @NessieDbAdapterName(InmemoryDatabaseAdapterFactory.NAME) @NessieExternalDatabase(InmemoryTestConnectionProviderSource.class) @@ -75,13 +74,13 @@ public abstract class BaseTestIceberg { @NessieDbAdapter(storeWorker = TableCommitMetaStoreWorker.class) static DatabaseAdapter databaseAdapter; + @RegisterExtension static NessieJaxRsExtension server = new NessieJaxRsExtension(() -> databaseAdapter); private static final Logger LOG = LoggerFactory.getLogger(BaseTestIceberg.class); - @TempDir - public Path temp; + @TempDir public Path temp; protected NessieCatalog catalog; protected NessieApiV1 api; @@ -136,11 +135,12 @@ NessieCatalog initCatalog(String ref) { NessieCatalog initCatalog(String ref, String hash) { NessieCatalog newCatalog = new NessieCatalog(); newCatalog.setConf(hadoopConfig); - ImmutableMap.Builder options = ImmutableMap.builder() - .put("ref", ref) - .put(CatalogProperties.URI, uri) - .put("auth-type", "NONE") - .put(CatalogProperties.WAREHOUSE_LOCATION, temp.toUri().toString()); + ImmutableMap.Builder options = + ImmutableMap.builder() + .put("ref", ref) + .put(CatalogProperties.URI, uri) + .put("auth-type", "NONE") + .put(CatalogProperties.WAREHOUSE_LOCATION, temp.toUri().toString()); if (null != hash) { options.put("ref.hash", hash); } @@ -204,15 +204,12 @@ static String metadataLocation(NessieCatalog catalog, TableIdentifier tableIdent return icebergOps.currentMetadataLocation(); } - static String writeRecordsToFile(Table table, Schema schema, String filename, - List records) - throws IOException { - String fileLocation = table.location().replace("file:", "") + - String.format("/data/%s.avro", filename); - try (FileAppender writer = Avro.write(Files.localOutput(fileLocation)) - .schema(schema) - .named("test") - .build()) { + static String writeRecordsToFile( + Table table, Schema schema, String filename, List records) throws IOException { + String fileLocation = + table.location().replace("file:", "") + String.format("/data/%s.avro", filename); + try (FileAppender writer = + Avro.write(Files.localOutput(fileLocation)).schema(schema).named("test").build()) { for (Record rec : records) { writer.add(rec); } diff --git a/nessie/src/test/java/org/apache/iceberg/nessie/TestBranchVisibility.java b/nessie/src/test/java/org/apache/iceberg/nessie/TestBranchVisibility.java index 3ca769e73ffd..7db904ef9233 100644 --- a/nessie/src/test/java/org/apache/iceberg/nessie/TestBranchVisibility.java +++ b/nessie/src/test/java/org/apache/iceberg/nessie/TestBranchVisibility.java @@ -16,9 +16,10 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.nessie; +import static org.apache.iceberg.types.Types.NestedField.required; + import java.util.Collections; import java.util.Map; import org.apache.avro.generic.GenericRecordBuilder; @@ -50,8 +51,6 @@ import org.projectnessie.model.IcebergTable; import org.projectnessie.model.Reference; -import static org.apache.iceberg.types.Types.NestedField.required; - public class TestBranchVisibility extends BaseTestIceberg { private final TableIdentifier tableIdentifier1 = TableIdentifier.of("test-ns", "table1"); @@ -85,17 +84,17 @@ public void after() throws NessieNotFoundException, NessieConflictException { @Test public void testBranchNoChange() { - testCatalogEquality(catalog, testCatalog, true, true, () -> { }); + testCatalogEquality(catalog, testCatalog, true, true, () -> {}); } /** Ensure catalogs can't see each others updates. */ @Test public void testUpdateCatalogs() { - testCatalogEquality(catalog, testCatalog, false, true, - () -> updateSchema(catalog, tableIdentifier1)); + testCatalogEquality( + catalog, testCatalog, false, true, () -> updateSchema(catalog, tableIdentifier1)); - testCatalogEquality(catalog, testCatalog, false, false, - () -> updateSchema(catalog, tableIdentifier2)); + testCatalogEquality( + catalog, testCatalog, false, false, () -> updateSchema(catalog, tableIdentifier2)); } @Test @@ -105,11 +104,11 @@ public void testCatalogOnReference() { // catalog created with ref points to same catalog as above NessieCatalog refCatalog = initCatalog("test"); - testCatalogEquality(refCatalog, testCatalog, true, true, () -> { }); + testCatalogEquality(refCatalog, testCatalog, true, true, () -> {}); // catalog created with hash points to same catalog as above NessieCatalog refHashCatalog = initCatalog("main"); - testCatalogEquality(refHashCatalog, catalog, true, true, () -> { }); + testCatalogEquality(refHashCatalog, catalog, true, true, () -> {}); } @Test @@ -126,7 +125,8 @@ public void testCatalogWithTableNames() { // Earlier versions used "table1@" + tree.getReferenceByName("main").getHash() before, but since // Nessie 0.8.2 the branch name became mandatory and specifying a hash within a branch is not // possible. - Assertions.assertThat(metadataLocation(catalog, TableIdentifier.of("test-ns", "table1@" + mainName))) + Assertions.assertThat( + metadataLocation(catalog, TableIdentifier.of("test-ns", "table1@" + mainName))) .isEqualTo(metadataLocation(testCatalog, tableIdentifier1)); } @@ -134,7 +134,8 @@ public void testCatalogWithTableNames() { public void testConcurrentChanges() { NessieCatalog emptyTestCatalog = initCatalog("test"); updateSchema(testCatalog, tableIdentifier1); - // Updating table with out of date hash. We expect this to succeed because of retry despite the conflict. + // Updating table with out of date hash. We expect this to succeed because of retry despite the + // conflict. 
updateSchema(emptyTestCatalog, tableIdentifier1); } @@ -146,16 +147,16 @@ public void testSchemaSnapshot() throws Exception { String branch2 = "branch-2"; NessieCatalog catalog = initCatalog(branchTest); - String metadataOnTest = addRow(catalog, tableIdentifier1, "initial-data", - ImmutableMap.of("id0", 4L)); + String metadataOnTest = + addRow(catalog, tableIdentifier1, "initial-data", ImmutableMap.of("id0", 4L)); long snapshotIdOnTest = snapshotIdFromMetadata(catalog, metadataOnTest); String hashOnTest = catalog.currentHash(); createBranch(branch1, hashOnTest, branchTest); createBranch(branch2, hashOnTest, branchTest); - String metadataOnTest2 = addRow(catalog, tableIdentifier1, "added-data-on-test", - ImmutableMap.of("id0", 5L)); + String metadataOnTest2 = + addRow(catalog, tableIdentifier1, "added-data-on-test", ImmutableMap.of("id0", 5L)); Assertions.assertThat(metadataOnTest2).isNotEqualTo(metadataOnTest); long snapshotIdOnTest2 = snapshotIdFromMetadata(catalog, metadataOnTest2); verifyRefState(catalog, tableIdentifier1, snapshotIdOnTest2, 0); @@ -163,15 +164,23 @@ public void testSchemaSnapshot() throws Exception { NessieCatalog catalogBranch1 = initCatalog(branch1); updateSchema(catalogBranch1, tableIdentifier1, Types.StringType.get()); verifyRefState(catalogBranch1, tableIdentifier1, snapshotIdOnTest, 1); - String metadataOn1 = addRow(catalogBranch1, tableIdentifier1, "testSchemaSnapshot-in-1", - ImmutableMap.of("id0", 42L, "id1", "world")); + String metadataOn1 = + addRow( + catalogBranch1, + tableIdentifier1, + "testSchemaSnapshot-in-1", + ImmutableMap.of("id0", 42L, "id1", "world")); Assertions.assertThat(metadataOn1).isNotEqualTo(metadataOnTest).isNotEqualTo(metadataOnTest2); NessieCatalog catalogBranch2 = initCatalog(branch2); updateSchema(catalogBranch2, tableIdentifier1, Types.IntegerType.get()); verifyRefState(catalogBranch2, tableIdentifier1, snapshotIdOnTest, 1); - String metadataOn2 = addRow(catalogBranch2, tableIdentifier1, "testSchemaSnapshot-in-2", - ImmutableMap.of("id0", 43L, "id2", 666)); + String metadataOn2 = + addRow( + catalogBranch2, + tableIdentifier1, + "testSchemaSnapshot-in-2", + ImmutableMap.of("id0", 43L, "id2", 666)); Assertions.assertThat(metadataOn2).isNotEqualTo(metadataOnTest).isNotEqualTo(metadataOnTest2); } @@ -182,22 +191,25 @@ public void testMetadataLocation() throws Exception { // commit on tableIdentifier1 on branch1 NessieCatalog catalog = initCatalog(branch1); - String metadataLocationOfCommit1 = addRow(catalog, tableIdentifier1, "initial-data", - ImmutableMap.of("id0", 4L)); + String metadataLocationOfCommit1 = + addRow(catalog, tableIdentifier1, "initial-data", ImmutableMap.of("id0", 4L)); createBranch(branch2, catalog.currentHash(), branch1); // commit on tableIdentifier1 on branch2 catalog = initCatalog(branch2); - String metadataLocationOfCommit2 = addRow(catalog, tableIdentifier1, "some-more-data", - ImmutableMap.of("id0", 42L)); - Assertions.assertThat(metadataLocationOfCommit2).isNotNull().isNotEqualTo(metadataLocationOfCommit1); + String metadataLocationOfCommit2 = + addRow(catalog, tableIdentifier1, "some-more-data", ImmutableMap.of("id0", 42L)); + Assertions.assertThat(metadataLocationOfCommit2) + .isNotNull() + .isNotEqualTo(metadataLocationOfCommit1); catalog = initCatalog(branch1); // load tableIdentifier1 on branch1 BaseTable table = (BaseTable) catalog.loadTable(tableIdentifier1); // branch1's tableIdentifier1's metadata location must not have changed Assertions.assertThat(table.operations().current().metadataFileLocation()) 
- .isNotNull().isNotEqualTo(metadataLocationOfCommit2); + .isNotNull() + .isNotEqualTo(metadataLocationOfCommit2); } /** @@ -219,8 +231,8 @@ public void testStateTrackingOnMultipleBranches() throws Exception { verifyRefState(catalog, tableIdentifier1, -1L, 0); // Add a row and verify that the - String metadataOnTest = addRow(catalog, tableIdentifier1, "initial-data", - Collections.singletonMap("id0", 1L)); + String metadataOnTest = + addRow(catalog, tableIdentifier1, "initial-data", Collections.singletonMap("id0", 1L)); Assertions.assertThat(metadataOnTest).isNotEqualTo(initialLocation); long snapshotIdOnTest = snapshotIdFromMetadata(catalog, metadataOnTest); verifyRefState(catalog, tableIdentifier1, snapshotIdOnTest, 0); @@ -238,8 +250,12 @@ public void testStateTrackingOnMultipleBranches() throws Exception { verifySchema(catalogBranchA, tableIdentifier1, Types.LongType.get(), Types.StringType.get()); verifyRefState(catalog, tableIdentifier1, snapshotIdOnTest, 0); - String metadataOnA1 = addRow(catalogBranchA, tableIdentifier1, "branch-a-1", - ImmutableMap.of("id0", 2L, "id1", "hello")); + String metadataOnA1 = + addRow( + catalogBranchA, + tableIdentifier1, + "branch-a-1", + ImmutableMap.of("id0", 2L, "id1", "hello")); // addRow() must produce a new metadata Assertions.assertThat(metadataOnA1).isNotEqualTo(metadataOnTest); long snapshotIdOnA1 = snapshotIdFromMetadata(catalogBranchA, metadataOnA1); @@ -258,21 +274,24 @@ public void testStateTrackingOnMultipleBranches() throws Exception { verifySchema(catalogBranchB, tableIdentifier1, Types.LongType.get(), Types.LongType.get()); verifyRefState(catalog, tableIdentifier1, snapshotIdOnTest, 0); - String metadataOnB1 = addRow(catalogBranchB, tableIdentifier1, "branch-b-1", - ImmutableMap.of("id0", 3L, "id2", 42L)); + String metadataOnB1 = + addRow( + catalogBranchB, tableIdentifier1, "branch-b-1", ImmutableMap.of("id0", 3L, "id2", 42L)); long snapshotIdOnB1 = snapshotIdFromMetadata(catalogBranchB, metadataOnB1); // addRow() must produce a new metadata - Assertions.assertThat(metadataOnB1) - .isNotEqualTo(metadataOnA1) - .isNotEqualTo(metadataOnTest); + Assertions.assertThat(metadataOnB1).isNotEqualTo(metadataOnA1).isNotEqualTo(metadataOnTest); verifyRefState(catalogBranchB, tableIdentifier1, snapshotIdOnB1, 1); verifyRefState(catalog, tableIdentifier1, snapshotIdOnTest, 0); // repeat addRow() against branchA catalogBranchA = initCatalog(branchA); verifySchema(catalogBranchA, tableIdentifier1, Types.LongType.get(), Types.StringType.get()); - String metadataOnA2 = addRow(catalogBranchA, tableIdentifier1, "branch-a-2", - ImmutableMap.of("id0", 4L, "id1", "hello")); + String metadataOnA2 = + addRow( + catalogBranchA, + tableIdentifier1, + "branch-a-2", + ImmutableMap.of("id0", 4L, "id1", "hello")); long snapshotIdOnA2 = snapshotIdFromMetadata(catalogBranchA, metadataOnA2); Assertions.assertThat(metadataOnA2) .isNotEqualTo(metadataOnA1) @@ -283,11 +302,16 @@ public void testStateTrackingOnMultipleBranches() throws Exception { // repeat addRow() against branchB catalogBranchB = initCatalog(branchB); verifySchema(catalogBranchB, tableIdentifier1, Types.LongType.get(), Types.LongType.get()); - String metadataOnB2 = addRow(catalogBranchB, tableIdentifier1, "branch-b-2", - ImmutableMap.of("id0", 5L, "id2", 666L)); + String metadataOnB2 = + addRow( + catalogBranchB, + tableIdentifier1, + "branch-b-2", + ImmutableMap.of("id0", 5L, "id2", 666L)); long snapshotIdOnB2 = snapshotIdFromMetadata(catalogBranchA, metadataOnB2); Assertions.assertThat(metadataOnB2) 
- .isNotEqualTo(metadataOnA1).isNotEqualTo(metadataOnA2) + .isNotEqualTo(metadataOnA1) + .isNotEqualTo(metadataOnA2) .isNotEqualTo(metadataOnB1) .isNotEqualTo(metadataOnTest); verifyRefState(catalogBranchB, tableIdentifier1, snapshotIdOnB2, 1); @@ -296,7 +320,8 @@ public void testStateTrackingOnMultipleBranches() throws Exception { verifyRefState(catalog, tableIdentifier1, snapshotIdOnTest, 0); } - private void verifyRefState(NessieCatalog catalog, TableIdentifier identifier, long snapshotId, int schemaId) + private void verifyRefState( + NessieCatalog catalog, TableIdentifier identifier, long snapshotId, int schemaId) throws Exception { IcebergTable icebergTable = loadIcebergTable(catalog, identifier); Assertions.assertThat(icebergTable) @@ -304,33 +329,41 @@ private void verifyRefState(NessieCatalog catalog, TableIdentifier identifier, l .containsExactly(snapshotId, schemaId); } - private long snapshotIdFromNessie(NessieCatalog catalog, TableIdentifier identifier) throws Exception { + private long snapshotIdFromNessie(NessieCatalog catalog, TableIdentifier identifier) + throws Exception { IcebergTable icebergTable = loadIcebergTable(catalog, identifier); return icebergTable.getSnapshotId(); } private long snapshotIdFromMetadata(NessieCatalog catalog, String metadataLocation) { - Snapshot snapshot = TableMetadataParser.read(catalog.fileIO(), metadataLocation).currentSnapshot(); + Snapshot snapshot = + TableMetadataParser.read(catalog.fileIO(), metadataLocation).currentSnapshot(); return snapshot != null ? snapshot.snapshotId() : -1; } private IcebergTable loadIcebergTable(NessieCatalog catalog, TableIdentifier identifier) throws NessieNotFoundException { ContentKey key = NessieUtil.toKey(identifier); - return api.getContent().refName(catalog.currentRefName()).key(key) - .get().get(key).unwrap(IcebergTable.class) + return api.getContent() + .refName(catalog.currentRefName()) + .key(key) + .get() + .get(key) + .unwrap(IcebergTable.class) .orElseThrow(NullPointerException::new); } - private String addRow(NessieCatalog catalog, TableIdentifier identifier, String fileName, Map data) + private String addRow( + NessieCatalog catalog, TableIdentifier identifier, String fileName, Map data) throws Exception { Table table = catalog.loadTable(identifier); GenericRecordBuilder recordBuilder = new GenericRecordBuilder(AvroSchemaUtil.convert(table.schema(), table.name())); data.forEach(recordBuilder::set); - String fileLocation = writeRecordsToFile(table, table.schema(), fileName, - Collections.singletonList(recordBuilder.build())); + String fileLocation = + writeRecordsToFile( + table, table.schema(), fileName, Collections.singletonList(recordBuilder.build())); DataFile dataFile = makeDataFile(table, fileLocation); // Run via `Transaction` to exercise the whole code path ran via Spark (Spark SQL) @@ -360,7 +393,10 @@ private void updateSchema(NessieCatalog catalog, TableIdentifier identifier, Typ } private void testCatalogEquality( - NessieCatalog catalog, NessieCatalog compareCatalog, boolean table1Equal, boolean table2Equal, + NessieCatalog catalog, + NessieCatalog compareCatalog, + boolean table1Equal, + boolean table2Equal, ThrowingCallable callable) { String testTable1 = metadataLocation(compareCatalog, tableIdentifier1); String testTable2 = metadataLocation(compareCatalog, tableIdentifier2); @@ -376,29 +412,33 @@ private void testCatalogEquality( String table1 = metadataLocation(catalog, tableIdentifier1); String table2 = metadataLocation(catalog, tableIdentifier2); - AbstractStringAssert 
assertion = Assertions.assertThat(table1) - .describedAs("Table %s on ref %s should%s be equal to table %s on ref %s", - tableIdentifier1.name(), - catalog.currentRefName(), - table1Equal ? "" : " not", - tableIdentifier1.name(), - compareCatalog.currentRefName()); + AbstractStringAssert assertion = + Assertions.assertThat(table1) + .describedAs( + "Table %s on ref %s should%s be equal to table %s on ref %s", + tableIdentifier1.name(), + catalog.currentRefName(), + table1Equal ? "" : " not", + tableIdentifier1.name(), + compareCatalog.currentRefName()); if (table1Equal) { assertion.isEqualTo(testTable1); - } else { + } else { assertion.isNotEqualTo(testTable1); } - assertion = Assertions.assertThat(table2) - .describedAs("Table %s on ref %s should%s be equal to table %s on ref %s", - tableIdentifier2.name(), - catalog.currentRefName(), - table2Equal ? "" : " not", - tableIdentifier2.name(), - compareCatalog.currentRefName()); + assertion = + Assertions.assertThat(table2) + .describedAs( + "Table %s on ref %s should%s be equal to table %s on ref %s", + tableIdentifier2.name(), + catalog.currentRefName(), + table2Equal ? "" : " not", + tableIdentifier2.name(), + compareCatalog.currentRefName()); if (table2Equal) { assertion.isEqualTo(testTable2); - } else { + } else { assertion.isNotEqualTo(testTable2); } } @@ -407,7 +447,8 @@ private void testCatalogEquality( public void testWithRefAndHash() throws NessieConflictException, NessieNotFoundException { String testBranch = "testBranch"; createBranch(testBranch, null); - Schema schema = new Schema(Types.StructType.of(required(1, "id", Types.LongType.get())).fields()); + Schema schema = + new Schema(Types.StructType.of(required(1, "id", Types.LongType.get())).fields()); NessieCatalog nessieCatalog = initCatalog(testBranch); String hashBeforeNamespaceCreation = api.getReference().refName(testBranch).get().getHash(); @@ -433,8 +474,8 @@ public void testWithRefAndHash() throws NessieConflictException, NessieNotFoundE // updates should not be possible Assertions.assertThatThrownBy(() -> catalogAtHash2.createTable(identifier, schema)) - .isInstanceOf(IllegalArgumentException.class) - .hasMessage("You can only mutate tables when using a branch without a hash or timestamp."); + .isInstanceOf(IllegalArgumentException.class) + .hasMessage("You can only mutate tables when using a branch without a hash or timestamp."); Assertions.assertThat(catalogAtHash2.listTables(namespace)).isEmpty(); // updates should be still possible here diff --git a/nessie/src/test/java/org/apache/iceberg/nessie/TestCustomNessieClient.java b/nessie/src/test/java/org/apache/iceberg/nessie/TestCustomNessieClient.java index 058702486eca..2a3c0c85e000 100644 --- a/nessie/src/test/java/org/apache/iceberg/nessie/TestCustomNessieClient.java +++ b/nessie/src/test/java/org/apache/iceberg/nessie/TestCustomNessieClient.java @@ -16,9 +16,10 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.nessie; +import static org.assertj.core.api.Assertions.assertThatThrownBy; + import java.net.URI; import java.util.function.Function; import org.apache.iceberg.CatalogProperties; @@ -32,8 +33,6 @@ import org.projectnessie.client.auth.NessieAuthentication; import org.projectnessie.client.http.HttpClientBuilder; -import static org.assertj.core.api.Assertions.assertThatThrownBy; - public class TestCustomNessieClient extends BaseTestIceberg { public TestCustomNessieClient() { @@ -43,43 +42,64 @@ public TestCustomNessieClient() { @Test public void testNoCustomClient() { NessieCatalog catalog = new NessieCatalog(); - catalog.initialize("nessie", - ImmutableMap.of(CatalogProperties.WAREHOUSE_LOCATION, temp.toUri().toString(), - CatalogProperties.URI, uri)); + catalog.initialize( + "nessie", + ImmutableMap.of( + CatalogProperties.WAREHOUSE_LOCATION, + temp.toUri().toString(), + CatalogProperties.URI, + uri)); } @Test public void testUnnecessaryDefaultCustomClient() { NessieCatalog catalog = new NessieCatalog(); - catalog.initialize("nessie", - ImmutableMap.of(CatalogProperties.WAREHOUSE_LOCATION, temp.toUri().toString(), - CatalogProperties.URI, uri, - NessieConfigConstants.CONF_NESSIE_CLIENT_BUILDER_IMPL, HttpClientBuilder.class.getName())); + catalog.initialize( + "nessie", + ImmutableMap.of( + CatalogProperties.WAREHOUSE_LOCATION, + temp.toUri().toString(), + CatalogProperties.URI, + uri, + NessieConfigConstants.CONF_NESSIE_CLIENT_BUILDER_IMPL, + HttpClientBuilder.class.getName())); } @Test public void testNonExistentCustomClient() { String nonExistingClass = "non.existent.ClientBuilderImpl"; - assertThatThrownBy(() -> { - NessieCatalog catalog = new NessieCatalog(); - catalog.initialize("nessie", - ImmutableMap.of(CatalogProperties.WAREHOUSE_LOCATION, temp.toUri().toString(), - CatalogProperties.URI, uri, - NessieConfigConstants.CONF_NESSIE_CLIENT_BUILDER_IMPL, nonExistingClass)); - }) + assertThatThrownBy( + () -> { + NessieCatalog catalog = new NessieCatalog(); + catalog.initialize( + "nessie", + ImmutableMap.of( + CatalogProperties.WAREHOUSE_LOCATION, + temp.toUri().toString(), + CatalogProperties.URI, + uri, + NessieConfigConstants.CONF_NESSIE_CLIENT_BUILDER_IMPL, + nonExistingClass)); + }) .isInstanceOf(RuntimeException.class) .hasMessageContaining(nonExistingClass); } @Test public void testCustomClient() { - assertThatThrownBy(() -> { - NessieCatalog catalog = new NessieCatalog(); - catalog.initialize("nessie", - ImmutableMap.of(CatalogProperties.WAREHOUSE_LOCATION, temp.toUri().toString(), - CatalogProperties.URI, uri, - NessieConfigConstants.CONF_NESSIE_CLIENT_BUILDER_IMPL, DummyClientBuilderImpl.class.getName())); - }) + assertThatThrownBy( + () -> { + NessieCatalog catalog = new NessieCatalog(); + catalog.initialize( + "nessie", + ImmutableMap.of( + CatalogProperties.WAREHOUSE_LOCATION, + temp.toUri().toString(), + CatalogProperties.URI, + uri, + NessieConfigConstants.CONF_NESSIE_CLIENT_BUILDER_IMPL, + DummyClientBuilderImpl.class.getName())); + }) .isInstanceOf(RuntimeException.class) .hasMessage("BUILD CALLED"); } @@ -117,8 +137,7 @@ public NessieClientBuilder fromSystemProperties() { } @Override - public NessieClientBuilder withAuthentication( - NessieAuthentication authentication) { + public NessieClientBuilder withAuthentication(NessieAuthentication authentication) { return this; } diff --git a/nessie/src/test/java/org/apache/iceberg/nessie/TestNamespace.java b/nessie/src/test/java/org/apache/iceberg/nessie/TestNamespace.java index 
ce11f8afbe49..431f1a276fd3 100644 --- a/nessie/src/test/java/org/apache/iceberg/nessie/TestNamespace.java +++ b/nessie/src/test/java/org/apache/iceberg/nessie/TestNamespace.java @@ -16,9 +16,10 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.nessie; +import static org.apache.iceberg.types.Types.NestedField.required; + import java.util.Arrays; import java.util.List; import java.util.Map; @@ -37,8 +38,6 @@ import org.projectnessie.model.ContentKey; import org.projectnessie.model.IcebergTable; -import static org.apache.iceberg.types.Types.NestedField.required; - public class TestNamespace extends BaseTestIceberg { private static final String BRANCH = "test-namespace"; @@ -90,15 +89,17 @@ public void testCreatingAndDroppingNamespaceWithContent() throws NessieNotFoundE Assertions.assertThat(catalog.namespaceExists(namespace)).isTrue(); TableIdentifier identifier = TableIdentifier.of(namespace, "tbl"); - Schema schema = new Schema(Types.StructType.of(required(1, "id", Types.LongType.get())).fields()); + Schema schema = + new Schema(Types.StructType.of(required(1, "id", Types.LongType.get())).fields()); Assertions.assertThat(catalog.createTable(identifier, schema)).isNotNull(); ContentKey key = NessieUtil.toKey(identifier); - Assertions.assertThat(api.getContent().key(key).refName(BRANCH).get().get(key).unwrap(IcebergTable.class)) + Assertions.assertThat( + api.getContent().key(key).refName(BRANCH).get().get(key).unwrap(IcebergTable.class)) .isPresent(); Assertions.assertThatThrownBy(() -> catalog.dropNamespace(namespace)) - .isInstanceOf(NamespaceNotEmptyException.class) + .isInstanceOf(NamespaceNotEmptyException.class) .hasMessage("Namespace 'test' is not empty. One or more tables exist."); catalog.dropTable(identifier, true); @@ -114,11 +115,13 @@ public void testSettingProperties() { Assertions.assertThat(catalog.namespaceExists(namespace)).isTrue(); Assertions.assertThat(catalog.loadNamespaceMetadata(namespace)).isEqualTo(properties); - ImmutableMap updatedProperties = ImmutableMap.of("prop2", "val2", "prop", "new_val"); + ImmutableMap updatedProperties = + ImmutableMap.of("prop2", "val2", "prop", "new_val"); catalog.setProperties(namespace, updatedProperties); Assertions.assertThat(catalog.loadNamespaceMetadata(namespace)).isEqualTo(updatedProperties); - Assertions.assertThatThrownBy(() -> catalog.setProperties(Namespace.of("unknown"), updatedProperties)) + Assertions.assertThatThrownBy( + () -> catalog.setProperties(Namespace.of("unknown"), updatedProperties)) .isInstanceOf(NoSuchNamespaceException.class) .hasMessage("Namespace does not exist: unknown"); } diff --git a/nessie/src/test/java/org/apache/iceberg/nessie/TestNessieCatalog.java b/nessie/src/test/java/org/apache/iceberg/nessie/TestNessieCatalog.java index 115b482a12de..ba9a3b4c6521 100644 --- a/nessie/src/test/java/org/apache/iceberg/nessie/TestNessieCatalog.java +++ b/nessie/src/test/java/org/apache/iceberg/nessie/TestNessieCatalog.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.nessie; import java.io.IOException; diff --git a/nessie/src/test/java/org/apache/iceberg/nessie/TestNessieIcebergClient.java b/nessie/src/test/java/org/apache/iceberg/nessie/TestNessieIcebergClient.java index 1a469b9b7d6b..d6643be91e24 100644 --- a/nessie/src/test/java/org/apache/iceberg/nessie/TestNessieIcebergClient.java +++ b/nessie/src/test/java/org/apache/iceberg/nessie/TestNessieIcebergClient.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.nessie; import org.apache.iceberg.catalog.Namespace; @@ -67,7 +66,8 @@ public void testWithReference() throws NessieNotFoundException { } @Test - public void testWithReferenceAfterRecreatingBranch() throws NessieConflictException, NessieNotFoundException { + public void testWithReferenceAfterRecreatingBranch() + throws NessieConflictException, NessieNotFoundException { String branch = "branchToBeDropped"; createBranch(branch, null); NessieIcebergClient client = new NessieIcebergClient(api, branch, null, ImmutableMap.of()); @@ -76,12 +76,17 @@ public void testWithReferenceAfterRecreatingBranch() throws NessieConflictExcept Namespace namespace = Namespace.of("a"); client.createNamespace(namespace, ImmutableMap.of()); Assertions.assertThat(client.listNamespaces(namespace)).isNotNull(); - client.getApi().deleteBranch().branch((Branch) client.getApi().getReference().refName(branch).get()).delete(); + client + .getApi() + .deleteBranch() + .branch((Branch) client.getApi().getReference().refName(branch).get()) + .delete(); createBranch(branch, null); // make sure the client uses the re-created branch Reference ref = client.getApi().getReference().refName(branch).get(); - Assertions.assertThat(client.withReference(branch, null).getRef().getReference()).isEqualTo(ref); + Assertions.assertThat(client.withReference(branch, null).getRef().getReference()) + .isEqualTo(ref); Assertions.assertThat(client.withReference(branch, null)).isNotEqualTo(client); } } diff --git a/nessie/src/test/java/org/apache/iceberg/nessie/TestNessieTable.java b/nessie/src/test/java/org/apache/iceberg/nessie/TestNessieTable.java index 9f8bda77580f..97ded6094ef9 100644 --- a/nessie/src/test/java/org/apache/iceberg/nessie/TestNessieTable.java +++ b/nessie/src/test/java/org/apache/iceberg/nessie/TestNessieTable.java @@ -16,9 +16,12 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.nessie; +import static org.apache.iceberg.TableMetadataParser.getFileExtension; +import static org.apache.iceberg.types.Types.NestedField.optional; +import static org.apache.iceberg.types.Types.NestedField.required; + import java.io.File; import java.io.IOException; import java.net.URI; @@ -60,10 +63,6 @@ import org.projectnessie.model.Operation; import org.projectnessie.model.Tag; -import static org.apache.iceberg.TableMetadataParser.getFileExtension; -import static org.apache.iceberg.types.Types.NestedField.optional; -import static org.apache.iceberg.types.Types.NestedField.required; - public class TestNessieTable extends BaseTestIceberg { private static final String BRANCH = "iceberg-table-test"; @@ -72,11 +71,14 @@ public class TestNessieTable extends BaseTestIceberg { private static final String TABLE_NAME = "tbl"; private static final TableIdentifier TABLE_IDENTIFIER = TableIdentifier.of(DB_NAME, TABLE_NAME); private static final ContentKey KEY = ContentKey.of(DB_NAME, TABLE_NAME); - private static final Schema schema = new Schema(Types.StructType.of( - required(1, "id", Types.LongType.get())).fields()); - private static final Schema altered = new Schema(Types.StructType.of( - required(1, "id", Types.LongType.get()), - optional(2, "data", Types.LongType.get())).fields()); + private static final Schema schema = + new Schema(Types.StructType.of(required(1, "id", Types.LongType.get())).fields()); + private static final Schema altered = + new Schema( + Types.StructType.of( + required(1, "id", Types.LongType.get()), + optional(2, "data", Types.LongType.get())) + .fields()); private Path tableLocation; @@ -103,19 +105,15 @@ public void afterEach() throws Exception { super.afterEach(); } - private IcebergTable getTable(ContentKey key) - throws NessieNotFoundException { + private IcebergTable getTable(ContentKey key) throws NessieNotFoundException { return getTable(BRANCH, key); } - private IcebergTable getTable(String ref, ContentKey key) - throws NessieNotFoundException { + private IcebergTable getTable(String ref, ContentKey key) throws NessieNotFoundException { return api.getContent().key(key).refName(ref).get().get(key).unwrap(IcebergTable.class).get(); } - /** - * Verify that Nessie always returns the globally-current global-content w/ only DMLs. - */ + /** Verify that Nessie always returns the globally-current global-content w/ only DMLs. */ @Test public void verifyStateMovesForDML() throws Exception { // 1. initialize table @@ -124,8 +122,10 @@ public void verifyStateMovesForDML() throws Exception { // 2. 
create 2nd branch String testCaseBranch = "verify-global-moving"; - api.createReference().sourceRefName(BRANCH) - .reference(Branch.of(testCaseBranch, catalog.currentHash())).create(); + api.createReference() + .sourceRefName(BRANCH) + .reference(Branch.of(testCaseBranch, catalog.currentHash())) + .create(); try (NessieCatalog ignore = initCatalog(testCaseBranch)) { IcebergTable contentInitialMain = getTable(BRANCH, KEY); @@ -211,9 +211,8 @@ public void testCreate() throws IOException { @Test public void testRename() throws NessieNotFoundException { String renamedTableName = "rename_table_name"; - TableIdentifier renameTableIdentifier = TableIdentifier.of( - TABLE_IDENTIFIER.namespace(), - renamedTableName); + TableIdentifier renameTableIdentifier = + TableIdentifier.of(TABLE_IDENTIFIER.namespace(), renamedTableName); Table original = catalog.loadTable(TABLE_IDENTIFIER); @@ -236,16 +235,23 @@ public void testRename() throws NessieNotFoundException { @Test public void testRenameWithTableReference() throws NessieNotFoundException { String renamedTableName = "rename_table_name"; - TableIdentifier renameTableIdentifier = TableIdentifier.of(TABLE_IDENTIFIER.namespace(), renamedTableName); + TableIdentifier renameTableIdentifier = + TableIdentifier.of(TABLE_IDENTIFIER.namespace(), renamedTableName); ImmutableTableReference fromTableReference = - ImmutableTableReference.builder().reference(catalog.currentRefName()).name(TABLE_IDENTIFIER.name()).build(); + ImmutableTableReference.builder() + .reference(catalog.currentRefName()) + .name(TABLE_IDENTIFIER.name()) + .build(); ImmutableTableReference toTableReference = ImmutableTableReference.builder() .reference(catalog.currentRefName()) - .name(renameTableIdentifier.name()).build(); - TableIdentifier fromIdentifier = TableIdentifier.of(TABLE_IDENTIFIER.namespace(), fromTableReference.toString()); - TableIdentifier toIdentifier = TableIdentifier.of(TABLE_IDENTIFIER.namespace(), toTableReference.toString()); + .name(renameTableIdentifier.name()) + .build(); + TableIdentifier fromIdentifier = + TableIdentifier.of(TABLE_IDENTIFIER.namespace(), fromTableReference.toString()); + TableIdentifier toIdentifier = + TableIdentifier.of(TABLE_IDENTIFIER.namespace(), toTableReference.toString()); Table original = catalog.loadTable(fromIdentifier); @@ -268,29 +274,42 @@ public void testRenameWithTableReference() throws NessieNotFoundException { @Test public void testRenameWithTableReferenceInvalidCase() throws NessieNotFoundException { String renamedTableName = "rename_table_name"; - TableIdentifier renameTableIdentifier = TableIdentifier.of(TABLE_IDENTIFIER.namespace(), renamedTableName); + TableIdentifier renameTableIdentifier = + TableIdentifier.of(TABLE_IDENTIFIER.namespace(), renamedTableName); ImmutableTableReference fromTableReference = - ImmutableTableReference.builder().reference("Something").name(TABLE_IDENTIFIER.name()).build(); + ImmutableTableReference.builder() + .reference("Something") + .name(TABLE_IDENTIFIER.name()) + .build(); ImmutableTableReference toTableReference = ImmutableTableReference.builder() .reference(catalog.currentRefName()) - .name(renameTableIdentifier.name()).build(); - TableIdentifier fromIdentifier = TableIdentifier.of(TABLE_IDENTIFIER.namespace(), fromTableReference.toString()); - TableIdentifier toIdentifier = TableIdentifier.of(TABLE_IDENTIFIER.namespace(), toTableReference.toString()); + .name(renameTableIdentifier.name()) + .build(); + TableIdentifier fromIdentifier = + TableIdentifier.of(TABLE_IDENTIFIER.namespace(), 
fromTableReference.toString()); + TableIdentifier toIdentifier = + TableIdentifier.of(TABLE_IDENTIFIER.namespace(), toTableReference.toString()); Assertions.assertThatThrownBy(() -> catalog.renameTable(fromIdentifier, toIdentifier)) .isInstanceOf(IllegalArgumentException.class) .hasMessage("from: Something and to: iceberg-table-test reference name must be same"); fromTableReference = - ImmutableTableReference.builder().reference(catalog.currentRefName()).name(TABLE_IDENTIFIER.name()).build(); + ImmutableTableReference.builder() + .reference(catalog.currentRefName()) + .name(TABLE_IDENTIFIER.name()) + .build(); toTableReference = ImmutableTableReference.builder() .reference("Something") - .name(renameTableIdentifier.name()).build(); - TableIdentifier fromIdentifierNew = TableIdentifier.of(TABLE_IDENTIFIER.namespace(), fromTableReference.toString()); - TableIdentifier toIdentifierNew = TableIdentifier.of(TABLE_IDENTIFIER.namespace(), toTableReference.toString()); + .name(renameTableIdentifier.name()) + .build(); + TableIdentifier fromIdentifierNew = + TableIdentifier.of(TABLE_IDENTIFIER.namespace(), fromTableReference.toString()); + TableIdentifier toIdentifierNew = + TableIdentifier.of(TABLE_IDENTIFIER.namespace(), toTableReference.toString()); Assertions.assertThatThrownBy(() -> catalog.renameTable(fromIdentifierNew, toIdentifierNew)) .isInstanceOf(IllegalArgumentException.class) @@ -301,14 +320,17 @@ private void verifyCommitMetadata() throws NessieNotFoundException { // check that the author is properly set List log = api.getCommitLog().refName(BRANCH).get().getLogEntries(); Assertions.assertThat(log) - .isNotNull().isNotEmpty() - .allSatisfy(logEntry -> { - CommitMeta commit = logEntry.getCommitMeta(); - Assertions.assertThat(commit.getAuthor()).isNotNull().isNotEmpty(); - Assertions.assertThat(commit.getAuthor()).isEqualTo(System.getProperty("user.name")); - Assertions.assertThat(commit.getProperties().get(NessieUtil.APPLICATION_TYPE)).isEqualTo("iceberg"); - Assertions.assertThat(commit.getMessage()).startsWith("Iceberg"); - }); + .isNotNull() + .isNotEmpty() + .allSatisfy( + logEntry -> { + CommitMeta commit = logEntry.getCommitMeta(); + Assertions.assertThat(commit.getAuthor()).isNotNull().isNotEmpty(); + Assertions.assertThat(commit.getAuthor()).isEqualTo(System.getProperty("user.name")); + Assertions.assertThat(commit.getProperties().get(NessieUtil.APPLICATION_TYPE)) + .isEqualTo("iceberg"); + Assertions.assertThat(commit.getMessage()).startsWith("Iceberg"); + }); } @Test @@ -322,8 +344,12 @@ public void testDrop() throws NessieNotFoundException { @Test public void testDropWithTableReference() throws NessieNotFoundException { ImmutableTableReference tableReference = - ImmutableTableReference.builder().reference(catalog.currentRefName()).name(TABLE_IDENTIFIER.name()).build(); - TableIdentifier identifier = TableIdentifier.of(TABLE_IDENTIFIER.namespace(), tableReference.toString()); + ImmutableTableReference.builder() + .reference(catalog.currentRefName()) + .name(TABLE_IDENTIFIER.name()) + .build(); + TableIdentifier identifier = + TableIdentifier.of(TABLE_IDENTIFIER.namespace(), tableReference.toString()); Assertions.assertThat(catalog.tableExists(identifier)).isTrue(); Assertions.assertThat(catalog.dropTable(identifier)).isTrue(); Assertions.assertThat(catalog.tableExists(identifier)).isFalse(); @@ -382,14 +408,14 @@ public void testDropTable() throws IOException { } TableOperations ops = ((HasTableOperations) table).operations(); String metadataLocation = 
((NessieTableOperations) ops).currentMetadataLocation(); - Assertions.assertThat(new File(metadataLocation.replace("file:", ""))) - .exists(); + Assertions.assertThat(new File(metadataLocation.replace("file:", ""))).exists(); verifyCommitMetadata(); } private void validateRegister(TableIdentifier identifier, String metadataVersionFiles) { - Assertions.assertThat(catalog.registerTable(identifier, "file:" + metadataVersionFiles)).isNotNull(); + Assertions.assertThat(catalog.registerTable(identifier, "file:" + metadataVersionFiles)) + .isNotNull(); Table newTable = catalog.loadTable(identifier); Assertions.assertThat(newTable).isNotNull(); TableOperations ops = ((HasTableOperations) newTable).operations(); @@ -409,20 +435,22 @@ public void testRegisterTableWithGivenBranch() { } @Test - public void testRegisterTableFailureScenarios() throws NessieConflictException, NessieNotFoundException { + public void testRegisterTableFailureScenarios() + throws NessieConflictException, NessieNotFoundException { List metadataVersionFiles = metadataVersionFiles(TABLE_NAME); Assertions.assertThat(1).isEqualTo(metadataVersionFiles.size()); // Case 1: Branch does not exist ImmutableTableReference defaultTableReference = ImmutableTableReference.builder().reference("default").name(TABLE_NAME).build(); - TableIdentifier defaultIdentifier = TableIdentifier.of(DB_NAME, defaultTableReference.toString()); + TableIdentifier defaultIdentifier = + TableIdentifier.of(DB_NAME, defaultTableReference.toString()); Assertions.assertThatThrownBy( - () -> catalog.registerTable( - defaultIdentifier, "file:" + metadataVersionFiles.get(0))) + () -> catalog.registerTable(defaultIdentifier, "file:" + metadataVersionFiles.get(0))) .isInstanceOf(IllegalArgumentException.class) .hasMessage("Nessie ref 'default' does not exist"); // Case 2: Table Already Exists - Assertions.assertThatThrownBy(() -> catalog.registerTable(TABLE_IDENTIFIER, "file:" + metadataVersionFiles.get(0))) + Assertions.assertThatThrownBy( + () -> catalog.registerTable(TABLE_IDENTIFIER, "file:" + metadataVersionFiles.get(0))) .isInstanceOf(AlreadyExistsException.class) .hasMessage("Table already exists: db.tbl"); // Case 3: Registering using a tag @@ -436,17 +464,19 @@ public void testRegisterTableFailureScenarios() throws NessieConflictException, ImmutableTableReference.builder().reference("tag_1").name(TABLE_NAME).build(); TableIdentifier tagIdentifier = TableIdentifier.of(DB_NAME, tagTableReference.toString()); Assertions.assertThatThrownBy( - () -> catalog.registerTable( - tagIdentifier, "file:" + metadataVersionFiles.get(0))) + () -> catalog.registerTable(tagIdentifier, "file:" + metadataVersionFiles.get(0))) .isInstanceOf(IllegalArgumentException.class) .hasMessage("You can only mutate tables when using a branch without a hash or timestamp."); // Case 4: non-null metadata path with null metadata location Assertions.assertThatThrownBy( - () -> catalog.registerTable(TABLE_IDENTIFIER, "file:" + metadataVersionFiles.get(0) + "invalidName")) + () -> + catalog.registerTable( + TABLE_IDENTIFIER, "file:" + metadataVersionFiles.get(0) + "invalidName")) .isInstanceOf(NotFoundException.class); // Case 5: null identifier Assertions.assertThatThrownBy( - () -> catalog.registerTable(null, "file:" + metadataVersionFiles.get(0) + "invalidName")) + () -> + catalog.registerTable(null, "file:" + metadataVersionFiles.get(0) + "invalidName")) .isInstanceOf(IllegalArgumentException.class) .hasMessage("Invalid identifier: null"); } @@ -493,12 +523,14 @@ public void testFailure() 
throws NessieNotFoundException, NessieConflictExceptio IcebergTable table = getTable(BRANCH, KEY); IcebergTable value = IcebergTable.of("dummytable.metadata.json", 42, 42, 42, 42, "cid"); - api.commitMultipleOperations().branch(branch) + api.commitMultipleOperations() + .branch(branch) .operation(Operation.Put.of(KEY, value)) .commitMeta(CommitMeta.fromMessage("")) .commit(); - Assertions.assertThatThrownBy(() -> icebergTable.updateSchema().addColumn("data", Types.LongType.get()).commit()) + Assertions.assertThatThrownBy( + () -> icebergTable.updateSchema().addColumn("data", Types.LongType.get()).commit()) .isInstanceOf(CommitFailedException.class) .hasMessage( "Cannot commit: Reference hash is out of date. Update the reference 'iceberg-table-test' and try again"); @@ -507,12 +539,10 @@ public void testFailure() throws NessieNotFoundException, NessieConflictExceptio @Test public void testListTables() { List tableIdents = catalog.listTables(TABLE_IDENTIFIER.namespace()); - List expectedIdents = tableIdents.stream() - .filter(t -> t.namespace() - .level(0) - .equals(DB_NAME) && - t.name().equals(TABLE_NAME)) - .collect(Collectors.toList()); + List expectedIdents = + tableIdents.stream() + .filter(t -> t.namespace().level(0).equals(DB_NAME) && t.name().equals(TABLE_NAME)) + .collect(Collectors.toList()); Assertions.assertThat(expectedIdents).hasSize(1); Assertions.assertThat(catalog.tableExists(TABLE_IDENTIFIER)).isTrue(); @@ -535,7 +565,8 @@ private String metadataLocation(String tableName) { return Paths.get(getTableBasePath(tableName), "metadata").toString(); } - @SuppressWarnings("RegexpSinglelineJava") // respecting this rule requires a lot more lines of code + @SuppressWarnings( + "RegexpSinglelineJava") // respecting this rule requires a lot more lines of code private List metadataFiles(String tableName) { return Arrays.stream(Objects.requireNonNull(new File(metadataLocation(tableName)).listFiles())) .map(File::getAbsolutePath) @@ -551,8 +582,7 @@ protected List manifestFiles(String tableName) { } private List filterByExtension(String tableName, String extension) { - return metadataFiles(tableName) - .stream() + return metadataFiles(tableName).stream() .filter(f -> f.endsWith(extension)) .collect(Collectors.toList()); } @@ -567,5 +597,4 @@ private static String addRecordsToFile(Table table, String filename) throws IOEx return writeRecordsToFile(table, schema, filename, records); } - } diff --git a/nessie/src/test/java/org/apache/iceberg/nessie/TestNessieUtil.java b/nessie/src/test/java/org/apache/iceberg/nessie/TestNessieUtil.java index 54b54e258828..6d7b4e825f52 100644 --- a/nessie/src/test/java/org/apache/iceberg/nessie/TestNessieUtil.java +++ b/nessie/src/test/java/org/apache/iceberg/nessie/TestNessieUtil.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.nessie; import com.fasterxml.jackson.databind.JsonNode; @@ -47,9 +46,13 @@ public void testTableMetadataJsonRoundtrip() { // Construct a dummy TableMetadata object Map properties = Collections.singletonMap("property-key", "property-value"); String location = "obj://foo/bar/baz"; - TableMetadata tableMetadata = TableMetadata.newTableMetadata( - new Schema(1, NestedField.of(1, false, "column", StringType.get())), - PartitionSpec.unpartitioned(), SortOrder.unsorted(), location, properties); + TableMetadata tableMetadata = + TableMetadata.newTableMetadata( + new Schema(1, NestedField.of(1, false, "column", StringType.get())), + PartitionSpec.unpartitioned(), + SortOrder.unsorted(), + location, + properties); // Produce a generic JsonNode from the TableMetadata JsonNode jsonNode = NessieUtil.tableMetadataAsJsonNode(tableMetadata); @@ -59,19 +62,19 @@ public void testTableMetadataJsonRoundtrip() { n -> n.get("format-version").asLong(-2L), n -> n.get("location").asText("x"), n -> n.get("properties").get("property-key").asText()) - .containsExactly( - 1L, - location, - "property-value"); + .containsExactly(1L, location, "property-value"); // Create a Nessie IcebergTable object with the JsonNode as the metadata IcebergTable icebergTableNoMetadata = IcebergTable.of(location, 0L, 1, 2, 3, "cid"); - IcebergTable icebergTable = ImmutableIcebergTable.builder().from(icebergTableNoMetadata) - .metadata(GenericMetadata.of("iceberg", jsonNode)).build(); + IcebergTable icebergTable = + ImmutableIcebergTable.builder() + .from(icebergTableNoMetadata) + .metadata(GenericMetadata.of("iceberg", jsonNode)) + .build(); // Deserialize the TableMetadata from Nessie IcebergTable - TableMetadata deserializedMetadata = NessieUtil.tableMetadataFromIcebergTable(null, - icebergTable, location); + TableMetadata deserializedMetadata = + NessieUtil.tableMetadataFromIcebergTable(null, icebergTable, location); // (Could compare tableMetadata against deserializedMetadata, but TableMetadata has no equals()) @@ -87,9 +90,12 @@ public void testTableMetadataFromFileIO() { IcebergTable icebergTableNoMetadata = IcebergTable.of(location, 0L, 1, 2, 3, "cid"); // Check that newInputFile() is called when IcebergTable has no metadata - Mockito.when(fileIoMock.newInputFile(location)).thenThrow(new RuntimeException("newInputFile called")); - Assertions.assertThatThrownBy(() -> - NessieUtil.tableMetadataFromIcebergTable(fileIoMock, icebergTableNoMetadata, location)) + Mockito.when(fileIoMock.newInputFile(location)) + .thenThrow(new RuntimeException("newInputFile called")); + Assertions.assertThatThrownBy( + () -> + NessieUtil.tableMetadataFromIcebergTable( + fileIoMock, icebergTableNoMetadata, location)) .isInstanceOf(RuntimeException.class) .hasMessage("newInputFile called"); } @@ -106,14 +112,17 @@ public void testSparkAppIdAndUserIsSetOnCommitMetadata() { String commitMsg = "commit msg"; String appId = "SPARK_ID_123"; String user = "sparkUser"; - CommitMeta commitMeta = NessieUtil.buildCommitMetadata( - commitMsg, - ImmutableMap.of(CatalogProperties.APP_ID, appId, CatalogProperties.USER, user)); + CommitMeta commitMeta = + NessieUtil.buildCommitMetadata( + commitMsg, + ImmutableMap.of(CatalogProperties.APP_ID, appId, CatalogProperties.USER, user)); Assertions.assertThat(commitMeta.getMessage()).isEqualTo(commitMsg); Assertions.assertThat(commitMeta.getAuthor()).isEqualTo(user); Assertions.assertThat(commitMeta.getProperties()).hasSize(2); - 
Assertions.assertThat(commitMeta.getProperties().get(NessieUtil.APPLICATION_TYPE)).isEqualTo("iceberg"); - Assertions.assertThat(commitMeta.getProperties().get(CatalogProperties.APP_ID)).isEqualTo(appId); + Assertions.assertThat(commitMeta.getProperties().get(NessieUtil.APPLICATION_TYPE)) + .isEqualTo("iceberg"); + Assertions.assertThat(commitMeta.getProperties().get(CatalogProperties.APP_ID)) + .isEqualTo(appId); } @Test @@ -123,7 +132,8 @@ public void testAuthorIsSetOnCommitMetadata() { Assertions.assertThat(commitMeta.getMessage()).isEqualTo(commitMsg); Assertions.assertThat(commitMeta.getAuthor()).isEqualTo(System.getProperty("user.name")); Assertions.assertThat(commitMeta.getProperties()).hasSize(1); - Assertions.assertThat(commitMeta.getProperties().get(NessieUtil.APPLICATION_TYPE)).isEqualTo("iceberg"); + Assertions.assertThat(commitMeta.getProperties().get(NessieUtil.APPLICATION_TYPE)) + .isEqualTo("iceberg"); } @Test diff --git a/orc/src/main/java/org/apache/iceberg/data/orc/GenericOrcReader.java b/orc/src/main/java/org/apache/iceberg/data/orc/GenericOrcReader.java index 893d01465ddb..f4d816baf2ba 100644 --- a/orc/src/main/java/org/apache/iceberg/data/orc/GenericOrcReader.java +++ b/orc/src/main/java/org/apache/iceberg/data/orc/GenericOrcReader.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.data.orc; import java.util.Collections; @@ -39,10 +38,13 @@ public class GenericOrcReader implements OrcRowReader { public GenericOrcReader( Schema expectedSchema, TypeDescription readOrcSchema, Map idToConstant) { - this.reader = OrcSchemaWithTypeVisitor.visit(expectedSchema, readOrcSchema, new ReadBuilder(idToConstant)); + this.reader = + OrcSchemaWithTypeVisitor.visit( + expectedSchema, readOrcSchema, new ReadBuilder(idToConstant)); } - public static OrcRowReader buildReader(Schema expectedSchema, TypeDescription fileSchema) { + public static OrcRowReader buildReader( + Schema expectedSchema, TypeDescription fileSchema) { return new GenericOrcReader(expectedSchema, fileSchema, Collections.emptyMap()); } @@ -70,18 +72,25 @@ private ReadBuilder(Map idToConstant) { @Override public OrcValueReader record( - Types.StructType expected, TypeDescription record, List names, List> fields) { + Types.StructType expected, + TypeDescription record, + List names, + List> fields) { return GenericOrcReaders.struct(fields, expected, idToConstant); } @Override - public OrcValueReader list(Types.ListType iList, TypeDescription array, OrcValueReader elementReader) { + public OrcValueReader list( + Types.ListType iList, TypeDescription array, OrcValueReader elementReader) { return GenericOrcReaders.array(elementReader); } @Override public OrcValueReader map( - Types.MapType iMap, TypeDescription map, OrcValueReader keyReader, OrcValueReader valueReader) { + Types.MapType iMap, + TypeDescription map, + OrcValueReader keyReader, + OrcValueReader valueReader) { return GenericOrcReaders.map(keyReader, valueReader); } @@ -104,7 +113,9 @@ public OrcValueReader primitive(Type.PrimitiveType iPrimitive, TypeDescriptio return OrcValueReaders.longs(); default: throw new IllegalStateException( - String.format("Invalid iceberg type %s corresponding to ORC type %s", iPrimitive, primitive)); + String.format( + "Invalid iceberg type %s corresponding to ORC type %s", + iPrimitive, primitive)); } case FLOAT: @@ -133,7 +144,9 @@ public OrcValueReader primitive(Type.PrimitiveType iPrimitive, TypeDescriptio return GenericOrcReaders.bytes(); 
default: throw new IllegalStateException( - String.format("Invalid iceberg type %s corresponding to ORC type %s", iPrimitive, primitive)); + String.format( + "Invalid iceberg type %s corresponding to ORC type %s", + iPrimitive, primitive)); } default: throw new IllegalArgumentException("Unhandled type " + primitive); diff --git a/orc/src/main/java/org/apache/iceberg/data/orc/GenericOrcReaders.java b/orc/src/main/java/org/apache/iceberg/data/orc/GenericOrcReaders.java index ee22713a78cb..18ce07ac8b74 100644 --- a/orc/src/main/java/org/apache/iceberg/data/orc/GenericOrcReaders.java +++ b/orc/src/main/java/org/apache/iceberg/data/orc/GenericOrcReaders.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.data.orc; import java.math.BigDecimal; @@ -48,11 +47,9 @@ import org.apache.orc.storage.ql.exec.vector.MapColumnVector; import org.apache.orc.storage.ql.exec.vector.TimestampColumnVector; - public class GenericOrcReaders { - private GenericOrcReaders() { - } + private GenericOrcReaders() {} public static OrcValueReader struct( List> readers, Types.StructType struct, Map idToConstant) { @@ -63,7 +60,8 @@ public static OrcValueReader> array(OrcValueReader elementReader) { return new ListReader(elementReader); } - public static OrcValueReader> map(OrcValueReader keyReader, OrcValueReader valueReader) { + public static OrcValueReader> map( + OrcValueReader keyReader, OrcValueReader valueReader) { return new MapReader(keyReader, valueReader); } @@ -102,21 +100,20 @@ public static OrcValueReader timestamps() { private static class TimestampTzReader implements OrcValueReader { public static final OrcValueReader INSTANCE = new TimestampTzReader(); - private TimestampTzReader() { - } + private TimestampTzReader() {} @Override public OffsetDateTime nonNullRead(ColumnVector vector, int row) { TimestampColumnVector tcv = (TimestampColumnVector) vector; - return Instant.ofEpochSecond(Math.floorDiv(tcv.time[row], 1_000), tcv.nanos[row]).atOffset(ZoneOffset.UTC); + return Instant.ofEpochSecond(Math.floorDiv(tcv.time[row], 1_000), tcv.nanos[row]) + .atOffset(ZoneOffset.UTC); } } private static class TimeReader implements OrcValueReader { public static final OrcValueReader INSTANCE = new TimeReader(); - private TimeReader() { - } + private TimeReader() {} @Override public LocalTime nonNullRead(ColumnVector vector, int row) { @@ -127,8 +124,7 @@ public LocalTime nonNullRead(ColumnVector vector, int row) { private static class DateReader implements OrcValueReader { public static final OrcValueReader INSTANCE = new DateReader(); - private DateReader() { - } + private DateReader() {} @Override public LocalDate nonNullRead(ColumnVector vector, int row) { @@ -139,22 +135,21 @@ public LocalDate nonNullRead(ColumnVector vector, int row) { private static class TimestampReader implements OrcValueReader { public static final OrcValueReader INSTANCE = new TimestampReader(); - private TimestampReader() { - } + private TimestampReader() {} @Override public LocalDateTime nonNullRead(ColumnVector vector, int row) { TimestampColumnVector tcv = (TimestampColumnVector) vector; - return Instant.ofEpochSecond(Math.floorDiv(tcv.time[row], 1_000), tcv.nanos[row]).atOffset(ZoneOffset.UTC) - .toLocalDateTime(); + return Instant.ofEpochSecond(Math.floorDiv(tcv.time[row], 1_000), tcv.nanos[row]) + .atOffset(ZoneOffset.UTC) + .toLocalDateTime(); } } private static class DecimalReader implements OrcValueReader { public static final OrcValueReader INSTANCE = 
new DecimalReader(); - private DecimalReader() { - } + private DecimalReader() {} @Override public BigDecimal nonNullRead(ColumnVector vector, int row) { @@ -166,27 +161,29 @@ public BigDecimal nonNullRead(ColumnVector vector, int row) { private static class StringReader implements OrcValueReader { public static final OrcValueReader INSTANCE = new StringReader(); - private StringReader() { - } + private StringReader() {} @Override public String nonNullRead(ColumnVector vector, int row) { BytesColumnVector bytesVector = (BytesColumnVector) vector; - return new String(bytesVector.vector[row], bytesVector.start[row], bytesVector.length[row], - StandardCharsets.UTF_8); + return new String( + bytesVector.vector[row], + bytesVector.start[row], + bytesVector.length[row], + StandardCharsets.UTF_8); } } private static class UUIDReader implements OrcValueReader { public static final OrcValueReader INSTANCE = new UUIDReader(); - private UUIDReader() { - } + private UUIDReader() {} @Override public UUID nonNullRead(ColumnVector vector, int row) { BytesColumnVector bytesVector = (BytesColumnVector) vector; - ByteBuffer buf = ByteBuffer.wrap(bytesVector.vector[row], bytesVector.start[row], bytesVector.length[row]); + ByteBuffer buf = + ByteBuffer.wrap(bytesVector.vector[row], bytesVector.start[row], bytesVector.length[row]); return UUIDUtil.convert(buf); } } @@ -194,27 +191,31 @@ public UUID nonNullRead(ColumnVector vector, int row) { private static class BytesReader implements OrcValueReader { public static final OrcValueReader INSTANCE = new BytesReader(); - private BytesReader() { - } + private BytesReader() {} @Override public ByteBuffer nonNullRead(ColumnVector vector, int row) { BytesColumnVector bytesVector = (BytesColumnVector) vector; - return ByteBuffer.wrap(bytesVector.vector[row], bytesVector.start[row], bytesVector.length[row]); + return ByteBuffer.wrap( + bytesVector.vector[row], bytesVector.start[row], bytesVector.length[row]); } } private static class StructReader extends OrcValueReaders.StructReader { private final GenericRecord template; - protected StructReader(List> readers, Types.StructType structType, Map idToConstant) { + protected StructReader( + List> readers, + Types.StructType structType, + Map idToConstant) { super(readers, structType, idToConstant); this.template = structType != null ? GenericRecord.create(structType) : null; } @Override protected Record create() { - // GenericRecord.copy() is more performant then GenericRecord.create(StructType) since NAME_MAP_CACHE access + // GenericRecord.copy() is more performant then GenericRecord.create(StructType) since + // NAME_MAP_CACHE access // is eliminated. Using copy here to gain performance. return template.copy(); } diff --git a/orc/src/main/java/org/apache/iceberg/data/orc/GenericOrcWriter.java b/orc/src/main/java/org/apache/iceberg/data/orc/GenericOrcWriter.java index a136085cfd42..93815ca5604c 100644 --- a/orc/src/main/java/org/apache/iceberg/data/orc/GenericOrcWriter.java +++ b/orc/src/main/java/org/apache/iceberg/data/orc/GenericOrcWriter.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.data.orc; import java.util.List; @@ -38,35 +37,41 @@ public class GenericOrcWriter implements OrcRowWriter { private final RecordWriter writer; private GenericOrcWriter(Schema expectedSchema, TypeDescription orcSchema) { - Preconditions.checkArgument(orcSchema.getCategory() == TypeDescription.Category.STRUCT, + Preconditions.checkArgument( + orcSchema.getCategory() == TypeDescription.Category.STRUCT, "Top level must be a struct " + orcSchema); - writer = (RecordWriter) OrcSchemaWithTypeVisitor.visit(expectedSchema, orcSchema, new WriteBuilder()); + writer = + (RecordWriter) + OrcSchemaWithTypeVisitor.visit(expectedSchema, orcSchema, new WriteBuilder()); } - public static OrcRowWriter buildWriter(Schema expectedSchema, TypeDescription fileSchema) { + public static OrcRowWriter buildWriter( + Schema expectedSchema, TypeDescription fileSchema) { return new GenericOrcWriter(expectedSchema, fileSchema); } private static class WriteBuilder extends OrcSchemaWithTypeVisitor> { - private WriteBuilder() { - } + private WriteBuilder() {} @Override - public OrcValueWriter record(Types.StructType iStruct, TypeDescription record, - List names, List> fields) { + public OrcValueWriter record( + Types.StructType iStruct, + TypeDescription record, + List names, + List> fields) { return new RecordWriter(fields); } @Override - public OrcValueWriter list(Types.ListType iList, TypeDescription array, - OrcValueWriter element) { + public OrcValueWriter list( + Types.ListType iList, TypeDescription array, OrcValueWriter element) { return GenericOrcWriters.list(element); } @Override - public OrcValueWriter map(Types.MapType iMap, TypeDescription map, - OrcValueWriter key, OrcValueWriter value) { + public OrcValueWriter map( + Types.MapType iMap, TypeDescription map, OrcValueWriter key, OrcValueWriter value) { return GenericOrcWriters.map(key, value); } @@ -106,8 +111,9 @@ public OrcValueWriter primitive(Type.PrimitiveType iPrimitive, TypeDescriptio Types.DecimalType decimalType = (Types.DecimalType) iPrimitive; return GenericOrcWriters.decimal(decimalType.precision(), decimalType.scale()); default: - throw new IllegalArgumentException(String.format("Invalid iceberg type %s corresponding to ORC type %s", - iPrimitive, primitive)); + throw new IllegalArgumentException( + String.format( + "Invalid iceberg type %s corresponding to ORC type %s", iPrimitive, primitive)); } } } diff --git a/orc/src/main/java/org/apache/iceberg/data/orc/GenericOrcWriters.java b/orc/src/main/java/org/apache/iceberg/data/orc/GenericOrcWriters.java index feb3778cc49b..5e12a828b7f0 100644 --- a/orc/src/main/java/org/apache/iceberg/data/orc/GenericOrcWriters.java +++ b/orc/src/main/java/org/apache/iceberg/data/orc/GenericOrcWriters.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.data.orc; import java.io.IOException; @@ -60,8 +59,7 @@ public class GenericOrcWriters { private static final OffsetDateTime EPOCH = Instant.ofEpochSecond(0).atOffset(ZoneOffset.UTC); private static final LocalDate EPOCH_DAY = EPOCH.toLocalDate(); - private GenericOrcWriters() { - } + private GenericOrcWriters() {} public static OrcValueWriter booleans() { return BooleanWriter.INSTANCE; @@ -137,12 +135,13 @@ public static OrcValueWriter> list(OrcValueWriter element) { return new ListWriter<>(element); } - public static OrcValueWriter> map(OrcValueWriter key, OrcValueWriter value) { + public static OrcValueWriter> map( + OrcValueWriter key, OrcValueWriter value) { return new MapWriter<>(key, value); } - public static OrcRowWriter> positionDelete(OrcRowWriter writer, - Function pathTransformFunc) { + public static OrcRowWriter> positionDelete( + OrcRowWriter writer, Function pathTransformFunc) { return new PositionDeleteStructWriter<>(writer, pathTransformFunc); } @@ -222,10 +221,14 @@ public void nullWrite() { @Override public Stream> metrics() { FieldMetrics metricsWithoutNullCount = floatFieldMetricsBuilder.build(); - return Stream.of(new FieldMetrics<>(metricsWithoutNullCount.id(), - metricsWithoutNullCount.valueCount() + nullValueCount, - nullValueCount, metricsWithoutNullCount.nanValueCount(), - metricsWithoutNullCount.lowerBound(), metricsWithoutNullCount.upperBound())); + return Stream.of( + new FieldMetrics<>( + metricsWithoutNullCount.id(), + metricsWithoutNullCount.valueCount() + nullValueCount, + nullValueCount, + metricsWithoutNullCount.nanValueCount(), + metricsWithoutNullCount.lowerBound(), + metricsWithoutNullCount.upperBound())); } } @@ -251,10 +254,14 @@ public void nullWrite() { @Override public Stream> metrics() { FieldMetrics metricsWithoutNullCount = doubleFieldMetricsBuilder.build(); - return Stream.of(new FieldMetrics<>(metricsWithoutNullCount.id(), - metricsWithoutNullCount.valueCount() + nullValueCount, - nullValueCount, metricsWithoutNullCount.nanValueCount(), - metricsWithoutNullCount.lowerBound(), metricsWithoutNullCount.upperBound())); + return Stream.of( + new FieldMetrics<>( + metricsWithoutNullCount.id(), + metricsWithoutNullCount.valueCount() + nullValueCount, + nullValueCount, + metricsWithoutNullCount.nanValueCount(), + metricsWithoutNullCount.lowerBound(), + metricsWithoutNullCount.upperBound())); } } @@ -274,8 +281,8 @@ private static class ByteBufferWriter implements OrcValueWriter { @Override public void nonNullWrite(int rowId, ByteBuffer data, ColumnVector output) { if (data.hasArray()) { - ((BytesColumnVector) output).setRef(rowId, data.array(), - data.arrayOffset() + data.position(), data.remaining()); + ((BytesColumnVector) output) + .setRef(rowId, data.array(), data.arrayOffset() + data.position(), data.remaining()); } else { byte[] rawData = ByteBuffers.toByteArray(data); ((BytesColumnVector) output).setRef(rowId, rawData, 0, rawData.length); @@ -337,7 +344,8 @@ public void nonNullWrite(int rowId, LocalDateTime data, ColumnVector output) { TimestampColumnVector cv = (TimestampColumnVector) output; cv.setIsUTC(true); cv.time[rowId] = data.toInstant(ZoneOffset.UTC).toEpochMilli(); // millis - cv.nanos[rowId] = (data.getNano() / 1_000) * 1_000; // truncate nanos to only keep microsecond precision + cv.nanos[rowId] = + (data.getNano() / 1_000) * 1_000; // truncate nanos to only keep microsecond precision } } @@ -352,13 +360,21 @@ private static class Decimal18Writer implements OrcValueWriter { @Override public void 
nonNullWrite(int rowId, BigDecimal data, ColumnVector output) { - Preconditions.checkArgument(data.scale() == scale, - "Cannot write value as decimal(%s,%s), wrong scale: %s", precision, scale, data); - Preconditions.checkArgument(data.precision() <= precision, - "Cannot write value as decimal(%s,%s), invalid precision: %s", precision, scale, data); + Preconditions.checkArgument( + data.scale() == scale, + "Cannot write value as decimal(%s,%s), wrong scale: %s", + precision, + scale, + data); + Preconditions.checkArgument( + data.precision() <= precision, + "Cannot write value as decimal(%s,%s), invalid precision: %s", + precision, + scale, + data); - ((DecimalColumnVector) output).vector[rowId] - .setFromLongAndScale(data.unscaledValue().longValueExact(), scale); + ((DecimalColumnVector) output) + .vector[rowId].setFromLongAndScale(data.unscaledValue().longValueExact(), scale); } } @@ -373,10 +389,18 @@ private static class Decimal38Writer implements OrcValueWriter { @Override public void nonNullWrite(int rowId, BigDecimal data, ColumnVector output) { - Preconditions.checkArgument(data.scale() == scale, - "Cannot write value as decimal(%s,%s), wrong scale: %s", precision, scale, data); - Preconditions.checkArgument(data.precision() <= precision, - "Cannot write value as decimal(%s,%s), invalid precision: %s", precision, scale, data); + Preconditions.checkArgument( + data.scale() == scale, + "Cannot write value as decimal(%s,%s), wrong scale: %s", + precision, + scale, + data); + Preconditions.checkArgument( + data.precision() <= precision, + "Cannot write value as decimal(%s,%s), invalid precision: %s", + precision, + scale, + data); ((DecimalColumnVector) output).vector[rowId].set(HiveDecimal.create(data, false)); } @@ -493,7 +517,8 @@ private static class PositionDeleteStructWriter extends StructWriter> { private final Function pathTransformFunc; - PositionDeleteStructWriter(OrcRowWriter replacedWriter, Function pathTransformFunc) { + PositionDeleteStructWriter( + OrcRowWriter replacedWriter, Function pathTransformFunc) { super(replacedWriter.writers()); this.pathTransformFunc = pathTransformFunc; } @@ -514,7 +539,8 @@ protected Object get(PositionDelete delete, int index) { @Override public void write(PositionDelete row, VectorizedRowBatch output) throws IOException { Preconditions.checkArgument(row != null, "value must not be null"); - Preconditions.checkArgument(writers().size() == 2 || row.row() != null, + Preconditions.checkArgument( + writers().size() == 2 || row.row() != null, "The row in PositionDelete must not be null because it was set row schema in position delete."); writeRow(row, output); } diff --git a/orc/src/main/java/org/apache/iceberg/orc/ApplyNameMapping.java b/orc/src/main/java/org/apache/iceberg/orc/ApplyNameMapping.java index 619a8c33f3ce..9c78556d34ed 100644 --- a/orc/src/main/java/org/apache/iceberg/orc/ApplyNameMapping.java +++ b/orc/src/main/java/org/apache/iceberg/orc/ApplyNameMapping.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.orc; import java.util.List; @@ -55,7 +54,8 @@ TypeDescription setId(TypeDescription type, MappedField mappedField) { } @Override - public TypeDescription record(TypeDescription record, List names, List fields) { + public TypeDescription record( + TypeDescription record, List names, List fields) { Preconditions.checkArgument(names.size() == fields.size(), "All fields must have names"); MappedField field = nameMapping.find(currentPath()); TypeDescription structType = TypeDescription.createStruct(); @@ -81,7 +81,8 @@ public TypeDescription list(TypeDescription array, TypeDescription element) { @Override public TypeDescription map(TypeDescription map, TypeDescription key, TypeDescription value) { - Preconditions.checkArgument(key != null && value != null, "Map type must have both key and value types"); + Preconditions.checkArgument( + key != null && value != null, "Map type must have both key and value types"); MappedField field = nameMapping.find(currentPath()); TypeDescription mapType = TypeDescription.createMap(key, value); diff --git a/orc/src/main/java/org/apache/iceberg/orc/EstimateOrcAvgWidthVisitor.java b/orc/src/main/java/org/apache/iceberg/orc/EstimateOrcAvgWidthVisitor.java index af0c3b6a2f77..2e7c3ab38835 100644 --- a/orc/src/main/java/org/apache/iceberg/orc/EstimateOrcAvgWidthVisitor.java +++ b/orc/src/main/java/org/apache/iceberg/orc/EstimateOrcAvgWidthVisitor.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.orc; import java.util.List; diff --git a/orc/src/main/java/org/apache/iceberg/orc/ExpressionToSearchArgument.java b/orc/src/main/java/org/apache/iceberg/orc/ExpressionToSearchArgument.java index 54ed24a04e5c..ba05d966c827 100644 --- a/orc/src/main/java/org/apache/iceberg/orc/ExpressionToSearchArgument.java +++ b/orc/src/main/java/org/apache/iceberg/orc/ExpressionToSearchArgument.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.orc; import java.math.BigDecimal; @@ -42,29 +41,29 @@ import org.apache.orc.storage.ql.io.sarg.SearchArgumentFactory; import org.apache.orc.storage.serde2.io.HiveDecimalWritable; -class ExpressionToSearchArgument extends ExpressionVisitors.BoundVisitor { +class ExpressionToSearchArgument + extends ExpressionVisitors.BoundVisitor { static SearchArgument convert(Expression expr, TypeDescription readSchema) { - Map idToColumnName = ORCSchemaUtil.idToOrcName(ORCSchemaUtil.convert(readSchema)); + Map idToColumnName = + ORCSchemaUtil.idToOrcName(ORCSchemaUtil.convert(readSchema)); SearchArgument.Builder builder = SearchArgumentFactory.newBuilder(); - ExpressionVisitors.visit(expr, new ExpressionToSearchArgument(builder, idToColumnName)).invoke(); + ExpressionVisitors.visit(expr, new ExpressionToSearchArgument(builder, idToColumnName)) + .invoke(); return builder.build(); } - // Currently every predicate in ORC requires a PredicateLeaf.Type field which is not available for these Iceberg types - private static final Set UNSUPPORTED_TYPES = ImmutableSet.of( - TypeID.BINARY, - TypeID.FIXED, - TypeID.UUID, - TypeID.STRUCT, - TypeID.MAP, - TypeID.LIST - ); + // Currently every predicate in ORC requires a PredicateLeaf.Type field which is not available for + // these Iceberg types + private static final Set UNSUPPORTED_TYPES = + ImmutableSet.of( + TypeID.BINARY, TypeID.FIXED, TypeID.UUID, TypeID.STRUCT, TypeID.MAP, TypeID.LIST); private SearchArgument.Builder builder; private Map idToColumnName; - private ExpressionToSearchArgument(SearchArgument.Builder builder, Map idToColumnName) { + private ExpressionToSearchArgument( + SearchArgument.Builder builder, Map idToColumnName) { this.builder = builder; this.idToColumnName = idToColumnName; } @@ -110,24 +109,26 @@ public Action or(Action leftChild, Action rightChild) { @Override public Action isNull(Bound expr) { - return () -> this.builder.isNull(idToColumnName.get(expr.ref().fieldId()), - type(expr.ref().type())); + return () -> + this.builder.isNull(idToColumnName.get(expr.ref().fieldId()), type(expr.ref().type())); } @Override public Action notNull(Bound expr) { - return () -> this.builder.startNot() - .isNull(idToColumnName.get(expr.ref().fieldId()), - type(expr.ref().type())) - .end(); + return () -> + this.builder + .startNot() + .isNull(idToColumnName.get(expr.ref().fieldId()), type(expr.ref().type())) + .end(); } @Override public Action isNaN(Bound expr) { - return () -> this.builder.equals( - idToColumnName.get(expr.ref().fieldId()), - type(expr.ref().type()), - literal(expr.ref().type(), getNaNForType(expr.ref().type()))); + return () -> + this.builder.equals( + idToColumnName.get(expr.ref().fieldId()), + type(expr.ref().type()), + literal(expr.ref().type(), getNaNForType(expr.ref().type()))); } private Object getNaNForType(Type type) { @@ -155,45 +156,57 @@ public Action notNaN(Bound expr) { @Override public Action lt(Bound expr, Literal lit) { - return () -> this.builder.lessThan(idToColumnName.get(expr.ref().fieldId()), - type(expr.ref().type()), - literal(expr.ref().type(), lit.value())); + return () -> + this.builder.lessThan( + idToColumnName.get(expr.ref().fieldId()), + type(expr.ref().type()), + literal(expr.ref().type(), lit.value())); } @Override public Action ltEq(Bound expr, Literal lit) { - return () -> this.builder.lessThanEquals(idToColumnName.get(expr.ref().fieldId()), - type(expr.ref().type()), - literal(expr.ref().type(), lit.value())); + return () -> + this.builder.lessThanEquals( + 
idToColumnName.get(expr.ref().fieldId()), + type(expr.ref().type()), + literal(expr.ref().type(), lit.value())); } @Override public Action gt(Bound expr, Literal lit) { // ORC SearchArguments do not have a greaterThan predicate, so we use not(lessThanOrEquals) // e.g. x > 5 => not(x <= 5) - return () -> this.builder.startNot() - .lessThanEquals(idToColumnName.get(expr.ref().fieldId()), - type(expr.ref().type()), - literal(expr.ref().type(), lit.value())) - .end(); + return () -> + this.builder + .startNot() + .lessThanEquals( + idToColumnName.get(expr.ref().fieldId()), + type(expr.ref().type()), + literal(expr.ref().type(), lit.value())) + .end(); } @Override public Action gtEq(Bound expr, Literal lit) { // ORC SearchArguments do not have a greaterThanOrEquals predicate, so we use not(lessThan) // e.g. x >= 5 => not(x < 5) - return () -> this.builder.startNot() - .lessThan(idToColumnName.get(expr.ref().fieldId()), - type(expr.ref().type()), - literal(expr.ref().type(), lit.value())) - .end(); + return () -> + this.builder + .startNot() + .lessThan( + idToColumnName.get(expr.ref().fieldId()), + type(expr.ref().type()), + literal(expr.ref().type(), lit.value())) + .end(); } @Override public Action eq(Bound expr, Literal lit) { - return () -> this.builder.equals(idToColumnName.get(expr.ref().fieldId()), - type(expr.ref().type()), - literal(expr.ref().type(), lit.value())); + return () -> + this.builder.equals( + idToColumnName.get(expr.ref().fieldId()), + type(expr.ref().type()), + literal(expr.ref().type(), lit.value())); } @Override @@ -215,10 +228,11 @@ public Action notEq(Bound expr, Literal lit) { @Override public Action in(Bound expr, Set literalSet) { - return () -> this.builder.in( - idToColumnName.get(expr.ref().fieldId()), - type(expr.ref().type()), - literalSet.stream().map(lit -> literal(expr.ref().type(), lit)).toArray(Object[]::new)); + return () -> + this.builder.in( + idToColumnName.get(expr.ref().fieldId()), + type(expr.ref().type()), + literalSet.stream().map(lit -> literal(expr.ref().type(), lit)).toArray(Object[]::new)); } @Override @@ -240,14 +254,16 @@ public Action notIn(Bound expr, Set literalSet) { @Override public Action startsWith(Bound expr, Literal lit) { - // Cannot push down STARTS_WITH operator to ORC, so return TruthValue.YES_NO_NULL which signifies + // Cannot push down STARTS_WITH operator to ORC, so return TruthValue.YES_NO_NULL which + // signifies // that this predicate cannot help with filtering return () -> this.builder.literal(TruthValue.YES_NO_NULL); } @Override public Action notStartsWith(Bound expr, Literal lit) { - // Cannot push down NOT_STARTS_WITH operator to ORC, so return TruthValue.YES_NO_NULL which signifies + // Cannot push down NOT_STARTS_WITH operator to ORC, so return TruthValue.YES_NO_NULL which + // signifies // that this predicate cannot help with filtering return () -> this.builder.literal(TruthValue.YES_NO_NULL); } @@ -255,7 +271,8 @@ public Action notStartsWith(Bound expr, Literal lit) { @Override public Action predicate(BoundPredicate pred) { if (UNSUPPORTED_TYPES.contains(pred.ref().type().typeId())) { - // Cannot push down predicates for types which cannot be represented in PredicateLeaf.Type, so return + // Cannot push down predicates for types which cannot be represented in PredicateLeaf.Type, so + // return // TruthValue.YES_NO_NULL which signifies that this predicate cannot help with filtering return () -> this.builder.literal(TruthValue.YES_NO_NULL); } else { @@ -288,7 +305,8 @@ private PredicateLeaf.Type type(Type 
icebergType) { case DECIMAL: return PredicateLeaf.Type.DECIMAL; default: - throw new UnsupportedOperationException("Type " + icebergType + " not supported in ORC SearchArguments"); + throw new UnsupportedOperationException( + "Type " + icebergType + " not supported in ORC SearchArguments"); } } @@ -309,14 +327,15 @@ private Object literal(Type icebergType, T icebergLiteral) { return Date.valueOf(LocalDate.ofEpochDay((Integer) icebergLiteral)); case TIMESTAMP: long microsFromEpoch = (Long) icebergLiteral; - return Timestamp.from(Instant.ofEpochSecond( - Math.floorDiv(microsFromEpoch, 1_000_000), - Math.floorMod(microsFromEpoch, 1_000_000) * 1_000 - )); + return Timestamp.from( + Instant.ofEpochSecond( + Math.floorDiv(microsFromEpoch, 1_000_000), + Math.floorMod(microsFromEpoch, 1_000_000) * 1_000)); case DECIMAL: return new HiveDecimalWritable(HiveDecimal.create((BigDecimal) icebergLiteral, false)); default: - throw new UnsupportedOperationException("Type " + icebergType + " not supported in ORC SearchArguments"); + throw new UnsupportedOperationException( + "Type " + icebergType + " not supported in ORC SearchArguments"); } } } diff --git a/orc/src/main/java/org/apache/iceberg/orc/HasIds.java b/orc/src/main/java/org/apache/iceberg/orc/HasIds.java index 833e1d977d44..849f24a3a877 100644 --- a/orc/src/main/java/org/apache/iceberg/orc/HasIds.java +++ b/orc/src/main/java/org/apache/iceberg/orc/HasIds.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.orc; import java.util.List; @@ -27,7 +26,8 @@ class HasIds extends OrcSchemaVisitor { @Override public Boolean record(TypeDescription record, List names, List fields) { - return ORCSchemaUtil.icebergID(record).isPresent() || fields.stream().anyMatch(Predicate.isEqual(true)); + return ORCSchemaUtil.icebergID(record).isPresent() + || fields.stream().anyMatch(Predicate.isEqual(true)); } @Override diff --git a/orc/src/main/java/org/apache/iceberg/orc/IdToOrcName.java b/orc/src/main/java/org/apache/iceberg/orc/IdToOrcName.java index e9e48e1993fc..1fb11d7c9e61 100644 --- a/orc/src/main/java/org/apache/iceberg/orc/IdToOrcName.java +++ b/orc/src/main/java/org/apache/iceberg/orc/IdToOrcName.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.orc; import java.util.Deque; @@ -33,9 +32,11 @@ /** * Generates mapping from field IDs to ORC qualified names. - *
    - * <p>
    - * This visitor also enclose column names in backticks i.e. ` so that ORC can correctly parse column names with
    - * special characters. A comparison of ORC convention with Iceberg convention is provided below
    - * <pre>
    - * {@code
    + *
    + * <p>This visitor also enclose column names in backticks i.e. ` so that ORC can correctly parse
    + * column names with special characters. A comparison of ORC convention with Iceberg convention is
    + * provided below
    + *
    + * <pre>{@code
      *                                      Iceberg           ORC
      * field                                field             field
    @@ -100,7 +101,8 @@ public Map schema(Schema schema, Map structRes
       }
     
       @Override
    -  public Map struct(Types.StructType struct, List> fieldResults) {
    +  public Map struct(
    +      Types.StructType struct, List> fieldResults) {
         return idToName;
       }
     
    @@ -117,7 +119,8 @@ public Map list(Types.ListType list, Map eleme
       }
     
       @Override
    -  public Map map(Types.MapType map, Map keyResult, Map valueResult) {
    +  public Map map(
    +      Types.MapType map, Map keyResult, Map valueResult) {
         addField("_key", map.keyId());
         addField("_value", map.valueId());
         return idToName;
    @@ -135,7 +138,8 @@ private void addField(String name, int fieldId) {
       }
     
       private String quoteName(String name) {
    -    String escapedName = name.replace("`", "``"); // if the column name contains ` then escape it with another `
    +    String escapedName =
    +        name.replace("`", "``"); // if the column name contains ` then escape it with another `
         return "`" + escapedName + "`";
       }
     }
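
For reference, the quoteName hunk above only re-wraps the inline comment; the escaping rule itself is unchanged. Below is a minimal, self-contained sketch of that rule; the class name and sample column names are illustrative and not taken from the change.

// Any backtick inside the name is doubled, then the whole name is wrapped in backticks so
// ORC can parse column names containing dots or other special characters.
public class QuoteNameSketch {

  static String quoteName(String name) {
    String escapedName = name.replace("`", "``"); // escape embedded backticks by doubling them
    return "`" + escapedName + "`";
  }

  public static void main(String[] args) {
    System.out.println(quoteName("a.b"));        // prints `a.b`
    System.out.println(quoteName("weird`name")); // prints `weird``name`
  }
}
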
    diff --git a/orc/src/main/java/org/apache/iceberg/orc/ORC.java b/orc/src/main/java/org/apache/iceberg/orc/ORC.java
    index 6621502af92f..79c83aec90b8 100644
    --- a/orc/src/main/java/org/apache/iceberg/orc/ORC.java
    +++ b/orc/src/main/java/org/apache/iceberg/orc/ORC.java
    @@ -16,9 +16,24 @@
      * specific language governing permissions and limitations
      * under the License.
      */
    -
     package org.apache.iceberg.orc;
     
    +import static org.apache.iceberg.TableProperties.DELETE_ORC_BLOCK_SIZE_BYTES;
    +import static org.apache.iceberg.TableProperties.DELETE_ORC_COMPRESSION;
    +import static org.apache.iceberg.TableProperties.DELETE_ORC_COMPRESSION_STRATEGY;
    +import static org.apache.iceberg.TableProperties.DELETE_ORC_STRIPE_SIZE_BYTES;
    +import static org.apache.iceberg.TableProperties.DELETE_ORC_WRITE_BATCH_SIZE;
    +import static org.apache.iceberg.TableProperties.ORC_BLOCK_SIZE_BYTES;
    +import static org.apache.iceberg.TableProperties.ORC_BLOCK_SIZE_BYTES_DEFAULT;
    +import static org.apache.iceberg.TableProperties.ORC_COMPRESSION;
    +import static org.apache.iceberg.TableProperties.ORC_COMPRESSION_DEFAULT;
    +import static org.apache.iceberg.TableProperties.ORC_COMPRESSION_STRATEGY;
    +import static org.apache.iceberg.TableProperties.ORC_COMPRESSION_STRATEGY_DEFAULT;
    +import static org.apache.iceberg.TableProperties.ORC_STRIPE_SIZE_BYTES;
    +import static org.apache.iceberg.TableProperties.ORC_STRIPE_SIZE_BYTES_DEFAULT;
    +import static org.apache.iceberg.TableProperties.ORC_WRITE_BATCH_SIZE;
    +import static org.apache.iceberg.TableProperties.ORC_WRITE_BATCH_SIZE_DEFAULT;
    +
     import java.io.IOException;
     import java.nio.charset.StandardCharsets;
     import java.util.List;
    @@ -68,33 +83,13 @@
     import org.apache.orc.TypeDescription;
     import org.apache.orc.storage.ql.exec.vector.VectorizedRowBatch;
     
    -import static org.apache.iceberg.TableProperties.DELETE_ORC_BLOCK_SIZE_BYTES;
    -import static org.apache.iceberg.TableProperties.DELETE_ORC_COMPRESSION;
    -import static org.apache.iceberg.TableProperties.DELETE_ORC_COMPRESSION_STRATEGY;
    -import static org.apache.iceberg.TableProperties.DELETE_ORC_STRIPE_SIZE_BYTES;
    -import static org.apache.iceberg.TableProperties.DELETE_ORC_WRITE_BATCH_SIZE;
    -import static org.apache.iceberg.TableProperties.ORC_BLOCK_SIZE_BYTES;
    -import static org.apache.iceberg.TableProperties.ORC_BLOCK_SIZE_BYTES_DEFAULT;
    -import static org.apache.iceberg.TableProperties.ORC_COMPRESSION;
    -import static org.apache.iceberg.TableProperties.ORC_COMPRESSION_DEFAULT;
    -import static org.apache.iceberg.TableProperties.ORC_COMPRESSION_STRATEGY;
    -import static org.apache.iceberg.TableProperties.ORC_COMPRESSION_STRATEGY_DEFAULT;
    -import static org.apache.iceberg.TableProperties.ORC_STRIPE_SIZE_BYTES;
    -import static org.apache.iceberg.TableProperties.ORC_STRIPE_SIZE_BYTES_DEFAULT;
    -import static org.apache.iceberg.TableProperties.ORC_WRITE_BATCH_SIZE;
    -import static org.apache.iceberg.TableProperties.ORC_WRITE_BATCH_SIZE_DEFAULT;
    -
     @SuppressWarnings("checkstyle:AbbreviationAsWordInName")
     public class ORC {
     
    -  /**
    -   * @deprecated use {@link TableProperties#ORC_WRITE_BATCH_SIZE} instead
    -   */
    -  @Deprecated
    -  private static final String VECTOR_ROW_BATCH_SIZE = "iceberg.orc.vectorbatch.size";
    +  /** @deprecated use {@link TableProperties#ORC_WRITE_BATCH_SIZE} instead */
    +  @Deprecated private static final String VECTOR_ROW_BATCH_SIZE = "iceberg.orc.vectorbatch.size";
     
    -  private ORC() {
    -  }
    +  private ORC() {}
     
       public static WriteBuilder write(OutputFile file) {
         return new WriteBuilder(file);
    @@ -134,6 +129,7 @@ public WriteBuilder metadata(String property, String value) {
     
         /**
          * Setting a specific configuration value for the writer.
    +     *
          * @param property The property to set
          * @param value The value to set
          * @return The resulting builder for chaining purposes
    @@ -149,7 +145,8 @@ public WriteBuilder set(String property, String value) {
           return this;
         }
     
    -    public WriteBuilder createWriterFunc(BiFunction> writerFunction) {
    +    public WriteBuilder createWriterFunc(
    +        BiFunction> writerFunction) {
           this.createWriterFunc = writerFunction;
           return this;
         }
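
For context on how the createWriterFunc signature above is typically supplied, here is a hedged usage sketch of the WriteBuilder. It assumes the generic writer from the iceberg-data module (GenericOrcWriter) and an illustrative local output path; none of this appears in the diff itself.

import java.io.File;
import java.io.IOException;
import org.apache.iceberg.Files;
import org.apache.iceberg.Schema;
import org.apache.iceberg.data.GenericRecord;
import org.apache.iceberg.data.Record;
import org.apache.iceberg.data.orc.GenericOrcWriter;
import org.apache.iceberg.io.FileAppender;
import org.apache.iceberg.orc.ORC;
import org.apache.iceberg.types.Types;

public class OrcWriteBuilderSketch {
  public static void main(String[] args) throws IOException {
    Schema schema =
        new Schema(
            Types.NestedField.required(1, "id", Types.LongType.get()),
            Types.NestedField.optional(2, "data", Types.StringType.get()));

    Record record = GenericRecord.create(schema);
    record.setField("id", 1L);
    record.setField("data", "a");

    // schema + createWriterFunc are the two required pieces of the WriteBuilder.
    try (FileAppender<Record> appender =
        ORC.write(Files.localOutput(new File("/tmp/sample.orc"))) // path is illustrative
            .schema(schema)
            .createWriterFunc(GenericOrcWriter::buildWriter)
            .overwrite()
            .build()) {
      appender.add(record);
    }
  }
}
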
    @@ -179,7 +176,8 @@ public WriteBuilder metricsConfig(MetricsConfig newMetricsConfig) {
         }
     
         // supposed to always be a private method used strictly by data and delete write builders
    -    private WriteBuilder createContextFunc(Function, Context> newCreateContextFunc) {
    +    private WriteBuilder createContextFunc(
    +        Function, Context> newCreateContextFunc) {
           this.createContextFunc = newCreateContextFunc;
           return this;
         }
    @@ -205,9 +203,14 @@ public  FileAppender build() {
           OrcConf.COMPRESSION_STRATEGY.setString(conf, context.compressionStrategy().name());
           OrcConf.OVERWRITE_OUTPUT_FILE.setBoolean(conf, overwrite);
     
    -      return new OrcFileAppender<>(schema,
    -          this.file, createWriterFunc, conf, metadata,
    -          context.vectorizedRowBatchSize(), metricsConfig);
    +      return new OrcFileAppender<>(
    +          schema,
    +          this.file,
    +          createWriterFunc,
    +          conf,
    +          metadata,
    +          context.vectorizedRowBatchSize(),
    +          metricsConfig);
         }
     
         private static class Context {
    @@ -237,8 +240,12 @@ public CompressionStrategy compressionStrategy() {
             return compressionStrategy;
           }
     
    -      private Context(long stripeSize, long blockSize, int vectorizedRowBatchSize,
    -          CompressionKind compressionKind, CompressionStrategy compressionStrategy) {
    +      private Context(
    +          long stripeSize,
    +          long blockSize,
    +          int vectorizedRowBatchSize,
    +          CompressionKind compressionKind,
    +          CompressionStrategy compressionStrategy) {
             this.stripeSize = stripeSize;
             this.blockSize = blockSize;
             this.vectorizedRowBatchSize = vectorizedRowBatchSize;
    @@ -247,52 +254,71 @@ private Context(long stripeSize, long blockSize, int vectorizedRowBatchSize,
           }
     
           static Context dataContext(Map config) {
    -        long stripeSize = PropertyUtil.propertyAsLong(config, OrcConf.STRIPE_SIZE.getAttribute(),
    -            ORC_STRIPE_SIZE_BYTES_DEFAULT);
    +        long stripeSize =
    +            PropertyUtil.propertyAsLong(
    +                config, OrcConf.STRIPE_SIZE.getAttribute(), ORC_STRIPE_SIZE_BYTES_DEFAULT);
             stripeSize = PropertyUtil.propertyAsLong(config, ORC_STRIPE_SIZE_BYTES, stripeSize);
             Preconditions.checkArgument(stripeSize > 0, "Stripe size must be > 0");
     
    -        long blockSize = PropertyUtil.propertyAsLong(config, OrcConf.BLOCK_SIZE.getAttribute(),
    -            ORC_BLOCK_SIZE_BYTES_DEFAULT);
    +        long blockSize =
    +            PropertyUtil.propertyAsLong(
    +                config, OrcConf.BLOCK_SIZE.getAttribute(), ORC_BLOCK_SIZE_BYTES_DEFAULT);
             blockSize = PropertyUtil.propertyAsLong(config, ORC_BLOCK_SIZE_BYTES, blockSize);
             Preconditions.checkArgument(blockSize > 0, "Block size must be > 0");
     
    -        int vectorizedRowBatchSize = PropertyUtil.propertyAsInt(config,
    -            ORC_WRITE_BATCH_SIZE, ORC_WRITE_BATCH_SIZE_DEFAULT);
    -        Preconditions.checkArgument(vectorizedRowBatchSize > 0, "VectorizedRow batch size must be > 0");
    +        int vectorizedRowBatchSize =
    +            PropertyUtil.propertyAsInt(config, ORC_WRITE_BATCH_SIZE, ORC_WRITE_BATCH_SIZE_DEFAULT);
    +        Preconditions.checkArgument(
    +            vectorizedRowBatchSize > 0, "VectorizedRow batch size must be > 0");
     
    -        String codecAsString = PropertyUtil.propertyAsString(config, OrcConf.COMPRESS.getAttribute(),
    -            ORC_COMPRESSION_DEFAULT);
    +        String codecAsString =
    +            PropertyUtil.propertyAsString(
    +                config, OrcConf.COMPRESS.getAttribute(), ORC_COMPRESSION_DEFAULT);
             codecAsString = PropertyUtil.propertyAsString(config, ORC_COMPRESSION, codecAsString);
             CompressionKind compressionKind = toCompressionKind(codecAsString);
     
    -        String strategyAsString = PropertyUtil.propertyAsString(config, OrcConf.COMPRESSION_STRATEGY.getAttribute(),
    -            ORC_COMPRESSION_STRATEGY_DEFAULT);
    -        strategyAsString = PropertyUtil.propertyAsString(config, ORC_COMPRESSION_STRATEGY, strategyAsString);
    +        String strategyAsString =
    +            PropertyUtil.propertyAsString(
    +                config,
    +                OrcConf.COMPRESSION_STRATEGY.getAttribute(),
    +                ORC_COMPRESSION_STRATEGY_DEFAULT);
    +        strategyAsString =
    +            PropertyUtil.propertyAsString(config, ORC_COMPRESSION_STRATEGY, strategyAsString);
             CompressionStrategy compressionStrategy = toCompressionStrategy(strategyAsString);
     
    -        return new Context(stripeSize, blockSize, vectorizedRowBatchSize, compressionKind, compressionStrategy);
    +        return new Context(
    +            stripeSize, blockSize, vectorizedRowBatchSize, compressionKind, compressionStrategy);
           }
     
           static Context deleteContext(Map config) {
             Context dataContext = dataContext(config);
     
    -        long stripeSize = PropertyUtil.propertyAsLong(config, DELETE_ORC_STRIPE_SIZE_BYTES, dataContext.stripeSize());
    +        long stripeSize =
    +            PropertyUtil.propertyAsLong(
    +                config, DELETE_ORC_STRIPE_SIZE_BYTES, dataContext.stripeSize());
     
    -        long blockSize = PropertyUtil.propertyAsLong(config, DELETE_ORC_BLOCK_SIZE_BYTES, dataContext.blockSize());
    +        long blockSize =
    +            PropertyUtil.propertyAsLong(
    +                config, DELETE_ORC_BLOCK_SIZE_BYTES, dataContext.blockSize());
     
    -        int vectorizedRowBatchSize = PropertyUtil.propertyAsInt(config,
    -            DELETE_ORC_WRITE_BATCH_SIZE, dataContext.vectorizedRowBatchSize());
    +        int vectorizedRowBatchSize =
    +            PropertyUtil.propertyAsInt(
    +                config, DELETE_ORC_WRITE_BATCH_SIZE, dataContext.vectorizedRowBatchSize());
     
             String codecAsString = config.get(DELETE_ORC_COMPRESSION);
    -        CompressionKind compressionKind = codecAsString != null ? toCompressionKind(codecAsString) :
    -            dataContext.compressionKind();
    +        CompressionKind compressionKind =
    +            codecAsString != null
    +                ? toCompressionKind(codecAsString)
    +                : dataContext.compressionKind();
     
             String strategyAsString = config.get(DELETE_ORC_COMPRESSION_STRATEGY);
             CompressionStrategy compressionStrategy =
    -            strategyAsString != null ? toCompressionStrategy(strategyAsString) : dataContext.compressionStrategy();
    +            strategyAsString != null
    +                ? toCompressionStrategy(strategyAsString)
    +                : dataContext.compressionStrategy();
     
    -        return new Context(stripeSize, blockSize, vectorizedRowBatchSize, compressionKind, compressionStrategy);
    +        return new Context(
    +            stripeSize, blockSize, vectorizedRowBatchSize, compressionKind, compressionStrategy);
           }
     
           private static CompressionKind toCompressionKind(String codecAsString) {
    @@ -307,7 +333,8 @@ private static CompressionStrategy toCompressionStrategy(String strategyAsString
             try {
               return CompressionStrategy.valueOf(strategyAsString.toUpperCase(Locale.ENGLISH));
             } catch (IllegalArgumentException e) {
    -          throw new IllegalArgumentException("Unsupported compression strategy: " + strategyAsString);
    +          throw new IllegalArgumentException(
    +              "Unsupported compression strategy: " + strategyAsString);
             }
           }
         }
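
A self-contained sketch of the parse-and-rethrow pattern used by toCompressionKind and toCompressionStrategy above; the enum here is only a stand-in for ORC's CompressionKind so the example compiles without ORC on the classpath.

import java.util.Locale;

public class CompressionParseSketch {

  // Stand-in for org.apache.orc.CompressionKind; values are illustrative.
  enum Codec {
    NONE,
    ZLIB,
    SNAPPY,
    LZ4,
    ZSTD
  }

  static Codec toCodec(String codecAsString) {
    try {
      // Locale.ENGLISH keeps the upper-casing stable regardless of the JVM default locale.
      return Codec.valueOf(codecAsString.toUpperCase(Locale.ENGLISH));
    } catch (IllegalArgumentException e) {
      // Rethrow with a clearer message, mirroring the pattern in the hunk above.
      throw new IllegalArgumentException("Unsupported compression codec: " + codecAsString);
    }
  }

  public static void main(String[] args) {
    System.out.println(toCodec("zstd")); // ZSTD
    // toCodec("brotli") would throw: Unsupported compression codec: brotli
  }
}
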
    @@ -372,7 +399,8 @@ public DataWriteBuilder metricsConfig(MetricsConfig newMetricsConfig) {
           return this;
         }
     
    -    public DataWriteBuilder createWriterFunc(BiFunction> writerFunction) {
    +    public DataWriteBuilder createWriterFunc(
    +        BiFunction> writerFunction) {
           appenderBuilder.createWriterFunc(writerFunction);
           return this;
         }
    @@ -399,11 +427,13 @@ public DataWriteBuilder withSortOrder(SortOrder newSortOrder) {
     
         public  DataWriter build() {
           Preconditions.checkArgument(spec != null, "Cannot create data writer without spec");
    -      Preconditions.checkArgument(spec.isUnpartitioned() || partition != null,
    +      Preconditions.checkArgument(
    +          spec.isUnpartitioned() || partition != null,
               "Partition must not be null when creating data writer for partitioned spec");
     
           FileAppender fileAppender = appenderBuilder.build();
    -      return new DataWriter<>(fileAppender, FileFormat.ORC, location, spec, partition, keyMetadata, sortOrder);
    +      return new DataWriter<>(
    +          fileAppender, FileFormat.ORC, location, spec, partition, keyMetadata, sortOrder);
         }
       }
     
    @@ -465,7 +495,8 @@ public DeleteWriteBuilder metricsConfig(MetricsConfig newMetricsConfig) {
           return this;
         }
     
    -    public DeleteWriteBuilder createWriterFunc(BiFunction> newWriterFunc) {
    +    public DeleteWriteBuilder createWriterFunc(
    +        BiFunction> newWriterFunc) {
           this.createWriterFunc = newWriterFunc;
           return this;
         }
    @@ -511,18 +542,25 @@ public DeleteWriteBuilder withSortOrder(SortOrder newSortOrder) {
         }
     
         public  EqualityDeleteWriter buildEqualityWriter() {
    -      Preconditions.checkState(rowSchema != null, "Cannot create equality delete file without a schema");
    -      Preconditions.checkState(equalityFieldIds != null, "Cannot create equality delete file without delete field ids");
    -      Preconditions.checkState(createWriterFunc != null,
    +      Preconditions.checkState(
    +          rowSchema != null, "Cannot create equality delete file without a schema");
    +      Preconditions.checkState(
    +          equalityFieldIds != null, "Cannot create equality delete file without delete field ids");
    +      Preconditions.checkState(
    +          createWriterFunc != null,
               "Cannot create equality delete file unless createWriterFunc is set");
    -      Preconditions.checkArgument(spec != null, "Spec must not be null when creating equality delete writer");
    -      Preconditions.checkArgument(spec.isUnpartitioned() || partition != null,
    +      Preconditions.checkArgument(
    +          spec != null, "Spec must not be null when creating equality delete writer");
    +      Preconditions.checkArgument(
    +          spec.isUnpartitioned() || partition != null,
               "Partition must not be null for partitioned writes");
     
           meta("delete-type", "equality");
    -      meta("delete-field-ids", IntStream.of(equalityFieldIds)
    -          .mapToObj(Objects::toString)
    -          .collect(Collectors.joining(", ")));
    +      meta(
    +          "delete-field-ids",
    +          IntStream.of(equalityFieldIds)
    +              .mapToObj(Objects::toString)
    +              .collect(Collectors.joining(", ")));
     
           // the appender uses the row schema without extra columns
           appenderBuilder.schema(rowSchema);
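
A small sketch of how the "delete-field-ids" metadata value a few lines above is assembled from the equality field ids; the ids used here are made up.

import java.util.Objects;
import java.util.stream.Collectors;
import java.util.stream.IntStream;

public class DeleteFieldIdsSketch {
  public static void main(String[] args) {
    int[] equalityFieldIds = {1, 4, 7}; // made-up field ids

    // Same joining logic as the meta("delete-field-ids", ...) call above.
    String value =
        IntStream.of(equalityFieldIds)
            .mapToObj(Objects::toString)
            .collect(Collectors.joining(", "));

    System.out.println(value); // prints: 1, 4, 7
  }
}
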
    @@ -530,16 +568,26 @@ public  EqualityDeleteWriter buildEqualityWriter() {
           appenderBuilder.createContextFunc(WriteBuilder.Context::deleteContext);
     
           return new EqualityDeleteWriter<>(
    -          appenderBuilder.build(), FileFormat.ORC, location, spec, partition, keyMetadata,
    -          sortOrder, equalityFieldIds);
    +          appenderBuilder.build(),
    +          FileFormat.ORC,
    +          location,
    +          spec,
    +          partition,
    +          keyMetadata,
    +          sortOrder,
    +          equalityFieldIds);
         }
     
         public  PositionDeleteWriter buildPositionWriter() {
    -      Preconditions.checkState(equalityFieldIds == null, "Cannot create position delete file using delete field ids");
    -      Preconditions.checkArgument(spec != null, "Spec must not be null when creating position delete writer");
    -      Preconditions.checkArgument(spec.isUnpartitioned() || partition != null,
    +      Preconditions.checkState(
    +          equalityFieldIds == null, "Cannot create position delete file using delete field ids");
    +      Preconditions.checkArgument(
    +          spec != null, "Spec must not be null when creating position delete writer");
    +      Preconditions.checkArgument(
    +          spec.isUnpartitioned() || partition != null,
               "Partition must not be null for partitioned writes");
    -      Preconditions.checkArgument(rowSchema == null || createWriterFunc != null,
    +      Preconditions.checkArgument(
    +          rowSchema == null || createWriterFunc != null,
               "Create function should be provided if we write row data");
     
           meta("delete-type", "position");
    @@ -548,14 +596,19 @@ public  PositionDeleteWriter buildPositionWriter() {
             Schema deleteSchema = DeleteSchemaUtil.posDeleteSchema(rowSchema);
             appenderBuilder.schema(deleteSchema);
     
    -        appenderBuilder.createWriterFunc((schema, typeDescription) ->
    -            GenericOrcWriters.positionDelete(createWriterFunc.apply(deleteSchema, typeDescription), pathTransformFunc));
    +        appenderBuilder.createWriterFunc(
    +            (schema, typeDescription) ->
    +                GenericOrcWriters.positionDelete(
    +                    createWriterFunc.apply(deleteSchema, typeDescription), pathTransformFunc));
           } else {
             appenderBuilder.schema(DeleteSchemaUtil.pathPosSchema());
     
    -        // We ignore the 'createWriterFunc' and 'rowSchema' even if is provided, since we do not write row data itself
    -        appenderBuilder.createWriterFunc((schema, typeDescription) -> GenericOrcWriters.positionDelete(
    -                GenericOrcWriter.buildWriter(schema, typeDescription), Function.identity()));
+        // We ignore the 'createWriterFunc' and 'rowSchema' even if they are provided, since we
+        // do not write row data itself
    +        appenderBuilder.createWriterFunc(
    +            (schema, typeDescription) ->
    +                GenericOrcWriters.positionDelete(
    +                    GenericOrcWriter.buildWriter(schema, typeDescription), Function.identity()));
           }
     
           appenderBuilder.createContextFunc(WriteBuilder.Context::deleteContext);
    @@ -592,7 +645,8 @@ private ReadBuilder(InputFile file) {
             this.conf = new Configuration();
           }
     
    -      // We need to turn positional schema evolution off since we use column name based schema evolution for projection
    +      // We need to turn positional schema evolution off since we use column name based schema
    +      // evolution for projection
           this.conf.setBoolean(OrcConf.FORCE_POSITIONAL_EVOLUTION.getHiveConfName(), false);
         }
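
A minimal sketch of the reader configuration set in the constructor above: Iceberg projects ORC columns by name (via the Iceberg field-id attributes), so positional schema evolution is switched off. The imports assume the same Hadoop and ORC dependencies this module already uses.

import org.apache.hadoop.conf.Configuration;
import org.apache.orc.OrcConf;

public class ReaderConfSketch {
  public static void main(String[] args) {
    Configuration conf = new Configuration();

    // Disable ORC's positional schema evolution; Iceberg resolves columns by name instead.
    conf.setBoolean(OrcConf.FORCE_POSITIONAL_EVOLUTION.getHiveConfName(), false);

    System.out.println(conf.get(OrcConf.FORCE_POSITIONAL_EVOLUTION.getHiveConfName())); // false
  }
}
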
     
    @@ -626,7 +680,8 @@ public ReadBuilder config(String property, String value) {
         }
     
         public ReadBuilder createReaderFunc(Function> readerFunction) {
    -      Preconditions.checkArgument(this.batchedReaderFunc == null,
    +      Preconditions.checkArgument(
    +          this.batchedReaderFunc == null,
               "Reader function cannot be set since the batched version is already set");
           this.readerFunc = readerFunction;
           return this;
    @@ -637,8 +692,10 @@ public ReadBuilder filter(Expression newFilter) {
           return this;
         }
     
    -    public ReadBuilder createBatchedReaderFunc(Function> batchReaderFunction) {
    -      Preconditions.checkArgument(this.readerFunc == null,
    +    public ReadBuilder createBatchedReaderFunc(
    +        Function> batchReaderFunction) {
    +      Preconditions.checkArgument(
    +          this.readerFunc == null,
               "Batched reader function cannot be set since the non-batched version is already set");
           this.batchedReaderFunc = batchReaderFunction;
           return this;
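
A hedged usage sketch of the ReadBuilder, showing the row-based path guarded by the preconditions above (createReaderFunc and createBatchedReaderFunc are mutually exclusive). It assumes the generic reader from the iceberg-data module (GenericOrcReader) and an illustrative local input path; none of this appears in the diff itself.

import java.io.File;
import java.io.IOException;
import org.apache.iceberg.Files;
import org.apache.iceberg.Schema;
import org.apache.iceberg.data.Record;
import org.apache.iceberg.data.orc.GenericOrcReader;
import org.apache.iceberg.expressions.Expressions;
import org.apache.iceberg.io.CloseableIterable;
import org.apache.iceberg.orc.ORC;
import org.apache.iceberg.types.Types;

public class OrcReadBuilderSketch {
  public static void main(String[] args) throws IOException {
    Schema schema =
        new Schema(
            Types.NestedField.required(1, "id", Types.LongType.get()),
            Types.NestedField.optional(2, "data", Types.StringType.get()));

    // Row-based reading; a batched reader function would be set instead for vectorized reads.
    try (CloseableIterable<Record> rows =
        ORC.read(Files.localInput(new File("/tmp/sample.orc"))) // path is illustrative
            .project(schema)
            .createReaderFunc(fileSchema -> GenericOrcReader.buildReader(schema, fileSchema))
            .filter(Expressions.greaterThan("id", 0L))
            .build()) {
      rows.forEach(row -> System.out.println(row.getField("id")));
    }
  }
}
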
    @@ -656,8 +713,18 @@ public ReadBuilder withNameMapping(NameMapping newNameMapping) {
     
         public  CloseableIterable build() {
           Preconditions.checkNotNull(schema, "Schema is required");
    -      return new OrcIterable<>(file, conf, schema, nameMapping, start, length, readerFunc, caseSensitive, filter,
    -          batchedReaderFunc, recordsPerBatch);
    +      return new OrcIterable<>(
    +          file,
    +          conf,
    +          schema,
    +          nameMapping,
    +          start,
    +          length,
    +          readerFunc,
    +          caseSensitive,
    +          filter,
    +          batchedReaderFunc,
    +          recordsPerBatch);
         }
       }
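
A simplified, self-contained sketch of the layered property resolution performed by Context.dataContext and Context.deleteContext above: a delete-specific table property falls back to the data value, which itself falls back to a default. PropertyUtil is mimicked with a tiny helper, and the property keys and default shown are illustrative.

import java.util.HashMap;
import java.util.Map;

public class OrcContextFallbackSketch {

  // Tiny stand-in for PropertyUtil.propertyAsLong.
  static long propertyAsLong(Map<String, String> props, String key, long defaultValue) {
    String value = props.get(key);
    return value != null ? Long.parseLong(value) : defaultValue;
  }

  public static void main(String[] args) {
    Map<String, String> config = new HashMap<>();
    config.put("write.orc.stripe-size-bytes", "67108864"); // key and value are illustrative

    // Data context: the table property overrides a built-in default.
    long dataStripeSize =
        propertyAsLong(config, "write.orc.stripe-size-bytes", 64L * 1024 * 1024);

    // Delete context: a delete-specific property wins when present, otherwise the data value.
    long deleteStripeSize =
        propertyAsLong(config, "write.delete.orc.stripe-size-bytes", dataStripeSize);

    System.out.println(dataStripeSize + " " + deleteStripeSize); // 67108864 67108864
  }
}
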
     
    diff --git a/orc/src/main/java/org/apache/iceberg/orc/ORCSchemaUtil.java b/orc/src/main/java/org/apache/iceberg/orc/ORCSchemaUtil.java
    index 4356f6b48c87..fae1a76c3706 100644
    --- a/orc/src/main/java/org/apache/iceberg/orc/ORCSchemaUtil.java
    +++ b/orc/src/main/java/org/apache/iceberg/orc/ORCSchemaUtil.java
    @@ -16,7 +16,6 @@
      * specific language governing permissions and limitations
      * under the License.
      */
    -
     package org.apache.iceberg.orc;
     
     import java.util.List;
    @@ -34,17 +33,18 @@
     import org.apache.iceberg.types.Types;
     import org.apache.orc.TypeDescription;
     
    -/**
    - * Utilities for mapping Iceberg to ORC schemas.
    - */
    +/** Utilities for mapping Iceberg to ORC schemas. */
     public final class ORCSchemaUtil {
     
       public enum BinaryType {
    -    UUID, FIXED, BINARY
    +    UUID,
    +    FIXED,
    +    BINARY
       }
     
       public enum LongType {
    -    TIME, LONG
    +    TIME,
    +    LONG
       }
     
       private static class OrcField {
    @@ -69,15 +69,16 @@ public TypeDescription type() {
       static final String ICEBERG_REQUIRED_ATTRIBUTE = "iceberg.required";
     
       /**
    -   * The name of the ORC {@link TypeDescription} attribute indicating the Iceberg type corresponding to an
    -   * ORC binary type. The values for this attribute are denoted in {@code BinaryType}.
    +   * The name of the ORC {@link TypeDescription} attribute indicating the Iceberg type corresponding
    +   * to an ORC binary type. The values for this attribute are denoted in {@code BinaryType}.
        */
       public static final String ICEBERG_BINARY_TYPE_ATTRIBUTE = "iceberg.binary-type";
       /**
    -   * The name of the ORC {@link TypeDescription} attribute indicating the Iceberg type corresponding to an
    -   * ORC long type. The values for this attribute are denoted in {@code LongType}.
    +   * The name of the ORC {@link TypeDescription} attribute indicating the Iceberg type corresponding
    +   * to an ORC long type. The values for this attribute are denoted in {@code LongType}.
        */
       public static final String ICEBERG_LONG_TYPE_ATTRIBUTE = "iceberg.long-type";
    +
       static final String ICEBERG_FIELD_LENGTH = "iceberg.length";
     
       private static final ImmutableMultimap TYPE_MAPPING =
    @@ -100,8 +101,7 @@ public TypeDescription type() {
               .put(Type.TypeID.DECIMAL, TypeDescription.Category.DECIMAL)
               .build();
     
    -  private ORCSchemaUtil() {
    -  }
    +  private ORCSchemaUtil() {}
     
       public static TypeDescription convert(Schema schema) {
         final TypeDescription root = TypeDescription.createStruct();
    @@ -158,41 +158,48 @@ private static TypeDescription convert(Integer fieldId, Type type, boolean isReq
           case FIXED:
             orcType = TypeDescription.createBinary();
             orcType.setAttribute(ICEBERG_BINARY_TYPE_ATTRIBUTE, BinaryType.FIXED.toString());
    -        orcType.setAttribute(ICEBERG_FIELD_LENGTH, Integer.toString(((Types.FixedType) type).length()));
    +        orcType.setAttribute(
    +            ICEBERG_FIELD_LENGTH, Integer.toString(((Types.FixedType) type).length()));
             break;
           case BINARY:
             orcType = TypeDescription.createBinary();
             orcType.setAttribute(ICEBERG_BINARY_TYPE_ATTRIBUTE, BinaryType.BINARY.toString());
             break;
    -      case DECIMAL: {
    -        Types.DecimalType decimal = (Types.DecimalType) type;
    -        orcType = TypeDescription.createDecimal()
    -            .withScale(decimal.scale())
    -            .withPrecision(decimal.precision());
    -        break;
    -      }
    -      case STRUCT: {
    -        orcType = TypeDescription.createStruct();
    -        for (Types.NestedField field : type.asStructType().fields()) {
    -          TypeDescription childType = convert(field.fieldId(), field.type(), field.isRequired());
    -          orcType.addField(field.name(), childType);
    +      case DECIMAL:
    +        {
    +          Types.DecimalType decimal = (Types.DecimalType) type;
    +          orcType =
    +              TypeDescription.createDecimal()
    +                  .withScale(decimal.scale())
    +                  .withPrecision(decimal.precision());
    +          break;
    +        }
    +      case STRUCT:
    +        {
    +          orcType = TypeDescription.createStruct();
    +          for (Types.NestedField field : type.asStructType().fields()) {
    +            TypeDescription childType = convert(field.fieldId(), field.type(), field.isRequired());
    +            orcType.addField(field.name(), childType);
    +          }
    +          break;
    +        }
    +      case LIST:
    +        {
    +          Types.ListType list = (Types.ListType) type;
    +          TypeDescription elementType =
    +              convert(list.elementId(), list.elementType(), list.isElementRequired());
    +          orcType = TypeDescription.createList(elementType);
    +          break;
    +        }
    +      case MAP:
    +        {
    +          Types.MapType map = (Types.MapType) type;
    +          TypeDescription keyType = convert(map.keyId(), map.keyType(), true);
    +          TypeDescription valueType =
    +              convert(map.valueId(), map.valueType(), map.isValueRequired());
    +          orcType = TypeDescription.createMap(keyType, valueType);
    +          break;
             }
    -        break;
    -      }
    -      case LIST: {
    -        Types.ListType list = (Types.ListType) type;
    -        TypeDescription elementType = convert(list.elementId(), list.elementType(),
    -            list.isElementRequired());
    -        orcType = TypeDescription.createList(elementType);
    -        break;
    -      }
    -      case MAP: {
    -        Types.MapType map = (Types.MapType) type;
    -        TypeDescription keyType = convert(map.keyId(), map.keyType(), true);
    -        TypeDescription valueType = convert(map.valueId(), map.valueType(), map.isValueRequired());
    -        orcType = TypeDescription.createMap(keyType, valueType);
    -        break;
    -      }
           default:
             throw new IllegalArgumentException("Unhandled type " + type.typeId());
         }
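
A minimal sketch of the Iceberg-to-ORC shape mapping that convert() above performs, written directly against the ORC TypeDescription builders. The field names and the decimal(9, 2) type are made up, and the real conversion also attaches the iceberg.id and iceberg.required attributes, which this sketch omits.

import org.apache.orc.TypeDescription;

public class TypeMappingSketch {
  public static void main(String[] args) {
    // decimal(9, 2): same withScale/withPrecision order as the hunk above.
    TypeDescription decimal = TypeDescription.createDecimal().withScale(2).withPrecision(9);

    TypeDescription struct = TypeDescription.createStruct();
    struct.addField("id", TypeDescription.createLong());
    struct.addField("amount", decimal);

    TypeDescription list = TypeDescription.createList(TypeDescription.createString());
    TypeDescription map = TypeDescription.createMap(TypeDescription.createString(), list);

    System.out.println(struct); // e.g. struct<id:bigint,amount:decimal(9,2)>
    System.out.println(map);    // e.g. map<string,array<string>>
  }
}
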
    @@ -204,9 +211,9 @@ private static TypeDescription convert(Integer fieldId, Type type, boolean isReq
       }
     
       /**
    -   * Convert an ORC schema to an Iceberg schema. This method handles the convertion from the original
    -   * Iceberg column mapping IDs if present in the ORC column attributes, otherwise, ORC columns with no
    -   * Iceberg IDs will be ignored and skipped in the conversion.
+   * Convert an ORC schema to an Iceberg schema. This method handles the conversion from the
+   * original Iceberg column mapping IDs if present in the ORC column attributes; otherwise, ORC
+   * columns with no Iceberg IDs will be ignored and skipped in the conversion.
        *
        * @return the Iceberg schema
        * @throws IllegalArgumentException if ORC schema has no columns with Iceberg ID attributes
    @@ -214,12 +221,16 @@ private static TypeDescription convert(Integer fieldId, Type type, boolean isReq
       public static Schema convert(TypeDescription orcSchema) {
         List children = orcSchema.getChildren();
         List childrenNames = orcSchema.getFieldNames();
    -    Preconditions.checkState(children.size() == childrenNames.size(),
    +    Preconditions.checkState(
    +        children.size() == childrenNames.size(),
             "Error in ORC file, children fields and names do not match.");
     
         OrcToIcebergVisitor schemaConverter = new OrcToIcebergVisitor();
    -    List fields = OrcToIcebergVisitor.visitSchema(orcSchema, schemaConverter).stream()
    -        .filter(Optional::isPresent).map(Optional::get).collect(Collectors.toList());
    +    List fields =
    +        OrcToIcebergVisitor.visitSchema(orcSchema, schemaConverter).stream()
    +            .filter(Optional::isPresent)
    +            .map(Optional::get)
    +            .collect(Collectors.toList());
     
         if (fields.size() == 0) {
           throw new IllegalArgumentException("ORC schema does not contain Iceberg IDs");
    @@ -229,22 +240,17 @@ public static Schema convert(TypeDescription orcSchema) {
       }
     
       /**
    -   * Converts an Iceberg schema to a corresponding ORC schema within the context of an existing
    -   * ORC file schema.
    -   * This method also handles schema evolution from the original ORC file schema
    -   * to the given Iceberg schema. It builds the desired reader schema with the schema
    -   * evolution rules and pass that down to the ORC reader,
    -   * which would then use its schema evolution to map that to the writer’s schema.
    +   * Converts an Iceberg schema to a corresponding ORC schema within the context of an existing ORC
    +   * file schema. This method also handles schema evolution from the original ORC file schema to the
    +   * given Iceberg schema. It builds the desired reader schema with the schema evolution rules and
+   * passes that down to the ORC reader, which then uses its schema evolution to map that to the
    +   * writer’s schema.
        *
    -   * Example:
    -   * 
    +   * 

    Example: * Iceberg writer ORC writer * struct<a (1): int, b (2): string> struct<a: int, b: string> * struct<a (1): struct<b (2): string, c (3): date>> struct<a: struct<b:string, c:date>> - * - * - * Iceberg reader ORC reader - * + * Iceberg reader ORC reader * struct<a (2): string, c (3): date> struct<b: string, c: date> * struct<aa (1): struct<cc (3): date, bb (2): string>> struct<a: struct<c:date, b:string>> * @@ -253,14 +259,14 @@ public static Schema convert(TypeDescription orcSchema) { * @param originalOrcSchema an existing ORC file schema * @return the resulting ORC schema */ - public static TypeDescription buildOrcProjection(Schema schema, - TypeDescription originalOrcSchema) { + public static TypeDescription buildOrcProjection( + Schema schema, TypeDescription originalOrcSchema) { final Map icebergToOrc = icebergToOrcMapping("root", originalOrcSchema); return buildOrcProjection(Integer.MIN_VALUE, schema.asStruct(), true, icebergToOrc); } - private static TypeDescription buildOrcProjection(Integer fieldId, Type type, boolean isRequired, - Map mapping) { + private static TypeDescription buildOrcProjection( + Integer fieldId, Type type, boolean isRequired, Map mapping) { final TypeDescription orcType; switch (type.typeId()) { @@ -270,25 +276,36 @@ private static TypeDescription buildOrcProjection(Integer fieldId, Type type, bo // Using suffix _r to avoid potential underlying issues in ORC reader // with reused column names between ORC and Iceberg; // e.g. renaming column c -> d and adding new column d - String name = Optional.ofNullable(mapping.get(nestedField.fieldId())) - .map(OrcField::name) - .orElseGet(() -> nestedField.name() + "_r" + nestedField.fieldId()); - TypeDescription childType = buildOrcProjection(nestedField.fieldId(), nestedField.type(), - isRequired && nestedField.isRequired(), mapping); + String name = + Optional.ofNullable(mapping.get(nestedField.fieldId())) + .map(OrcField::name) + .orElseGet(() -> nestedField.name() + "_r" + nestedField.fieldId()); + TypeDescription childType = + buildOrcProjection( + nestedField.fieldId(), + nestedField.type(), + isRequired && nestedField.isRequired(), + mapping); orcType.addField(name, childType); } break; case LIST: Types.ListType list = (Types.ListType) type; - TypeDescription elementType = buildOrcProjection(list.elementId(), list.elementType(), - isRequired && list.isElementRequired(), mapping); + TypeDescription elementType = + buildOrcProjection( + list.elementId(), + list.elementType(), + isRequired && list.isElementRequired(), + mapping); orcType = TypeDescription.createList(elementType); break; case MAP: Types.MapType map = (Types.MapType) type; - TypeDescription keyType = buildOrcProjection(map.keyId(), map.keyType(), isRequired, mapping); - TypeDescription valueType = buildOrcProjection(map.valueId(), map.valueType(), - isRequired && map.isValueRequired(), mapping); + TypeDescription keyType = + buildOrcProjection(map.keyId(), map.keyType(), isRequired, mapping); + TypeDescription valueType = + buildOrcProjection( + map.valueId(), map.valueType(), isRequired && map.isValueRequired(), mapping); orcType = TypeDescription.createMap(keyType, valueType); break; default: @@ -299,15 +316,19 @@ private static TypeDescription buildOrcProjection(Integer fieldId, Type type, bo if (promotedType.isPresent()) { orcType = promotedType.get(); } else { - Preconditions.checkArgument(isSameType(originalType, type), + Preconditions.checkArgument( + isSameType(originalType, type), "Can not promote %s type to %s", - 
originalType.getCategory(), type.typeId().name()); + originalType.getCategory(), + type.typeId().name()); orcType = originalType.clone(); } } else { if (isRequired) { throw new IllegalArgumentException( - String.format("Field %d of type %s is required and was not found.", fieldId, type.toString())); + String.format( + "Field %d of type %s is required and was not found.", + fieldId, type.toString())); } orcType = convert(fieldId, type, false); @@ -345,27 +366,27 @@ private static Map icebergToOrcMapping(String name, TypeDescr return icebergToOrc; } - - private static Optional getPromotedType(Type icebergType, - TypeDescription originalOrcType) { + private static Optional getPromotedType( + Type icebergType, TypeDescription originalOrcType) { TypeDescription promotedOrcType = null; - if (Type.TypeID.LONG.equals(icebergType.typeId()) && - TypeDescription.Category.INT.equals(originalOrcType.getCategory())) { + if (Type.TypeID.LONG.equals(icebergType.typeId()) + && TypeDescription.Category.INT.equals(originalOrcType.getCategory())) { // Promote: int to long promotedOrcType = TypeDescription.createLong(); - } else if (Type.TypeID.DOUBLE.equals(icebergType.typeId()) && - TypeDescription.Category.FLOAT.equals(originalOrcType.getCategory())) { + } else if (Type.TypeID.DOUBLE.equals(icebergType.typeId()) + && TypeDescription.Category.FLOAT.equals(originalOrcType.getCategory())) { // Promote: float to double promotedOrcType = TypeDescription.createDouble(); - } else if (Type.TypeID.DECIMAL.equals(icebergType.typeId()) && - TypeDescription.Category.DECIMAL.equals(originalOrcType.getCategory())) { + } else if (Type.TypeID.DECIMAL.equals(icebergType.typeId()) + && TypeDescription.Category.DECIMAL.equals(originalOrcType.getCategory())) { // Promote: decimal(P, S) to decimal(P', S) if P' > P Types.DecimalType newDecimal = (Types.DecimalType) icebergType; - if (newDecimal.scale() == originalOrcType.getScale() && - newDecimal.precision() > originalOrcType.getPrecision()) { - promotedOrcType = TypeDescription.createDecimal() - .withScale(newDecimal.scale()) - .withPrecision(newDecimal.precision()); + if (newDecimal.scale() == originalOrcType.getScale() + && newDecimal.precision() > originalOrcType.getPrecision()) { + promotedOrcType = + TypeDescription.createDecimal() + .withScale(newDecimal.scale()) + .withPrecision(newDecimal.precision()); } } return Optional.ofNullable(promotedOrcType); @@ -375,7 +396,9 @@ private static boolean isSameType(TypeDescription orcType, Type icebergType) { if (icebergType.typeId() == Type.TypeID.TIMESTAMP) { Types.TimestampType tsType = (Types.TimestampType) icebergType; return Objects.equals( - tsType.shouldAdjustToUTC() ? TypeDescription.Category.TIMESTAMP_INSTANT : TypeDescription.Category.TIMESTAMP, + tsType.shouldAdjustToUTC() + ? TypeDescription.Category.TIMESTAMP_INSTANT + : TypeDescription.Category.TIMESTAMP, orcType.getCategory()); } else { return TYPE_MAPPING.containsEntry(icebergType.typeId(), orcType.getCategory()); diff --git a/orc/src/main/java/org/apache/iceberg/orc/OrcBatchReader.java b/orc/src/main/java/org/apache/iceberg/orc/OrcBatchReader.java index 2b67157a8c55..b1f04dc476f9 100644 --- a/orc/src/main/java/org/apache/iceberg/orc/OrcBatchReader.java +++ b/orc/src/main/java/org/apache/iceberg/orc/OrcBatchReader.java @@ -16,19 +16,14 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.orc; import org.apache.orc.storage.ql.exec.vector.VectorizedRowBatch; -/** - * Used for implementing ORC batch readers. - */ +/** Used for implementing ORC batch readers. */ public interface OrcBatchReader { - /** - * Reads a row batch. - */ + /** Reads a row batch. */ T read(VectorizedRowBatch batch); void setBatchContext(long batchOffsetInFile); diff --git a/orc/src/main/java/org/apache/iceberg/orc/OrcFileAppender.java b/orc/src/main/java/org/apache/iceberg/orc/OrcFileAppender.java index b3b0b4f38200..f407bdcf436c 100644 --- a/orc/src/main/java/org/apache/iceberg/orc/OrcFileAppender.java +++ b/orc/src/main/java/org/apache/iceberg/orc/OrcFileAppender.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.orc; import java.io.IOException; @@ -46,9 +45,7 @@ import org.slf4j.Logger; import org.slf4j.LoggerFactory; -/** - * Create a file appender for ORC. - */ +/** Create a file appender for ORC. */ class OrcFileAppender implements FileAppender { private static final Logger LOG = LoggerFactory.getLogger(OrcFileAppender.class); @@ -62,10 +59,14 @@ class OrcFileAppender implements FileAppender { private final Configuration conf; private final MetricsConfig metricsConfig; - OrcFileAppender(Schema schema, OutputFile file, - BiFunction> createWriterFunc, - Configuration conf, Map metadata, - int batchSize, MetricsConfig metricsConfig) { + OrcFileAppender( + Schema schema, + OutputFile file, + BiFunction> createWriterFunc, + Configuration conf, + Map metadata, + int batchSize, + MetricsConfig metricsConfig) { this.conf = conf; this.file = file; this.batchSize = batchSize; @@ -74,7 +75,8 @@ class OrcFileAppender implements FileAppender { TypeDescription orcSchema = ORCSchemaUtil.convert(schema); this.avgRowByteSize = - OrcSchemaVisitor.visitSchema(orcSchema, new EstimateOrcAvgWidthVisitor()).stream().reduce(Integer::sum) + OrcSchemaVisitor.visitSchema(orcSchema, new EstimateOrcAvgWidthVisitor()).stream() + .reduce(Integer::sum) .orElse(0); if (avgRowByteSize == 0) { LOG.warn("The average length of the rows appears to be zero."); @@ -101,14 +103,14 @@ public void add(D datum) { batch.reset(); } } catch (IOException ioe) { - throw new UncheckedIOException(String.format("Problem writing to ORC file %s", file.location()), ioe); + throw new UncheckedIOException( + String.format("Problem writing to ORC file %s", file.location()), ioe); } } @Override public Metrics metrics() { - Preconditions.checkState(isClosed, - "Cannot return metrics while appending to an open file."); + Preconditions.checkState(isClosed, "Cannot return metrics while appending to an open file."); return OrcMetrics.fromWriter(writer, valueWriter.metrics(), metricsConfig); } @@ -125,15 +127,21 @@ public long length() { List stripes = writer.getStripes(); if (!stripes.isEmpty()) { StripeInformation stripeInformation = stripes.get(stripes.size() - 1); - dataLength = stripeInformation != null ? stripeInformation.getOffset() + stripeInformation.getLength() : 0; + dataLength = + stripeInformation != null + ? stripeInformation.getOffset() + stripeInformation.getLength() + : 0; } } catch (IOException e) { - throw new UncheckedIOException(String.format("Can't get Stripe's length from the file writer with path: %s.", - file.location()), e); + throw new UncheckedIOException( + String.format( + "Can't get Stripe's length from the file writer with path: %s.", file.location()), + e); } // This value is estimated, not actual. 
- return (long) Math.ceil(dataLength + (estimateMemory + (long) batch.size * avgRowByteSize) * 0.2); + return (long) + Math.ceil(dataLength + (estimateMemory + (long) batch.size * avgRowByteSize) * 0.2); } @Override @@ -162,8 +170,8 @@ public void close() throws IOException { } } - private static Writer newOrcWriter(OutputFile file, - OrcFile.WriterOptions options, Map metadata) { + private static Writer newOrcWriter( + OutputFile file, OrcFile.WriterOptions options, Map metadata) { final Path locPath = new Path(file.location()); final Writer writer; @@ -179,10 +187,10 @@ private static Writer newOrcWriter(OutputFile file, } @SuppressWarnings("unchecked") - private static OrcRowWriter newOrcRowWriter(Schema schema, - TypeDescription orcSchema, - BiFunction> - createWriterFunc) { + private static OrcRowWriter newOrcRowWriter( + Schema schema, + TypeDescription orcSchema, + BiFunction> createWriterFunc) { return (OrcRowWriter) createWriterFunc.apply(schema, orcSchema); } } diff --git a/orc/src/main/java/org/apache/iceberg/orc/OrcIterable.java b/orc/src/main/java/org/apache/iceberg/orc/OrcIterable.java index d3eaa410fdbd..58cf5d1f9655 100644 --- a/orc/src/main/java/org/apache/iceberg/orc/OrcIterable.java +++ b/orc/src/main/java/org/apache/iceberg/orc/OrcIterable.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.orc; import java.io.IOException; @@ -39,9 +38,7 @@ import org.apache.orc.storage.ql.exec.vector.VectorizedRowBatch; import org.apache.orc.storage.ql.io.sarg.SearchArgument; -/** - * Iterable used to read rows from ORC. - */ +/** Iterable used to read rows from ORC. */ class OrcIterable extends CloseableGroup implements CloseableIterable { private final Configuration config; private final Schema schema; @@ -55,10 +52,18 @@ class OrcIterable extends CloseableGroup implements CloseableIterable { private final int recordsPerBatch; private NameMapping nameMapping; - OrcIterable(InputFile file, Configuration config, Schema schema, - NameMapping nameMapping, Long start, Long length, - Function> readerFunction, boolean caseSensitive, Expression filter, - Function> batchReaderFunction, int recordsPerBatch) { + OrcIterable( + InputFile file, + Configuration config, + Schema schema, + NameMapping nameMapping, + Long start, + Long length, + Function> readerFunction, + boolean caseSensitive, + Expression filter, + Function> batchReaderFunction, + int recordsPerBatch) { this.schema = schema; this.readerFunction = readerFunction; this.file = file; @@ -96,34 +101,40 @@ public CloseableIterator iterator() { sarg = ExpressionToSearchArgument.convert(boundFilter, readOrcSchema); } - VectorizedRowBatchIterator rowBatchIterator = newOrcIterator(file, readOrcSchema, start, length, orcFileReader, - sarg, recordsPerBatch); + VectorizedRowBatchIterator rowBatchIterator = + newOrcIterator(file, readOrcSchema, start, length, orcFileReader, sarg, recordsPerBatch); if (batchReaderFunction != null) { OrcBatchReader batchReader = (OrcBatchReader) batchReaderFunction.apply(readOrcSchema); - return CloseableIterator.transform(rowBatchIterator, pair -> { - batchReader.setBatchContext(pair.second()); - return batchReader.read(pair.first()); - }); + return CloseableIterator.transform( + rowBatchIterator, + pair -> { + batchReader.setBatchContext(pair.second()); + return batchReader.read(pair.first()); + }); } else { - return new OrcRowIterator<>(rowBatchIterator, (OrcRowReader) readerFunction.apply(readOrcSchema)); + return new 
OrcRowIterator<>( + rowBatchIterator, (OrcRowReader) readerFunction.apply(readOrcSchema)); } } - private static VectorizedRowBatchIterator newOrcIterator(InputFile file, - TypeDescription readerSchema, - Long start, Long length, - Reader orcFileReader, SearchArgument sarg, - int recordsPerBatch) { + private static VectorizedRowBatchIterator newOrcIterator( + InputFile file, + TypeDescription readerSchema, + Long start, + Long length, + Reader orcFileReader, + SearchArgument sarg, + int recordsPerBatch) { final Reader.Options options = orcFileReader.options(); if (start != null) { options.range(start, length); } options.schema(readerSchema); - options.searchArgument(sarg, new String[]{}); + options.searchArgument(sarg, new String[] {}); try { - return new VectorizedRowBatchIterator(file.location(), readerSchema, orcFileReader.rows(options), - recordsPerBatch); + return new VectorizedRowBatchIterator( + file.location(), readerSchema, orcFileReader.rows(options), recordsPerBatch); } catch (IOException ioe) { throw new RuntimeIOException(ioe, "Failed to get ORC rows for file: %s", file); } diff --git a/orc/src/main/java/org/apache/iceberg/orc/OrcMetrics.java b/orc/src/main/java/org/apache/iceberg/orc/OrcMetrics.java index ad63d8c277fc..972591d53d03 100644 --- a/orc/src/main/java/org/apache/iceberg/orc/OrcMetrics.java +++ b/orc/src/main/java/org/apache/iceberg/orc/OrcMetrics.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.orc; import java.io.IOException; @@ -65,11 +64,11 @@ public class OrcMetrics { private enum Bound { - LOWER, UPPER + LOWER, + UPPER } - private OrcMetrics() { - } + private OrcMetrics() {} public static Metrics fromInputFile(InputFile file) { return fromInputFile(file, MetricsConfig.getDefault()); @@ -79,73 +78,88 @@ public static Metrics fromInputFile(InputFile file, MetricsConfig metricsConfig) return fromInputFile(file, metricsConfig, null); } - public static Metrics fromInputFile(InputFile file, MetricsConfig metricsConfig, NameMapping mapping) { - final Configuration config = (file instanceof HadoopInputFile) ? - ((HadoopInputFile) file).getConf() : new Configuration(); + public static Metrics fromInputFile( + InputFile file, MetricsConfig metricsConfig, NameMapping mapping) { + final Configuration config = + (file instanceof HadoopInputFile) + ? 
((HadoopInputFile) file).getConf() + : new Configuration(); return fromInputFile(file, config, metricsConfig, mapping); } - static Metrics fromInputFile(InputFile file, Configuration config, MetricsConfig metricsConfig, NameMapping mapping) { + static Metrics fromInputFile( + InputFile file, Configuration config, MetricsConfig metricsConfig, NameMapping mapping) { try (Reader orcReader = ORC.newFileReader(file, config)) { - return buildOrcMetrics(orcReader.getNumberOfRows(), orcReader.getSchema(), orcReader.getStatistics(), - Stream.empty(), metricsConfig, mapping); + return buildOrcMetrics( + orcReader.getNumberOfRows(), + orcReader.getSchema(), + orcReader.getStatistics(), + Stream.empty(), + metricsConfig, + mapping); } catch (IOException ioe) { throw new RuntimeIOException(ioe, "Failed to open file: %s", file.location()); } } - static Metrics fromWriter(Writer writer, Stream> fieldMetricsStream, MetricsConfig metricsConfig) { + static Metrics fromWriter( + Writer writer, Stream> fieldMetricsStream, MetricsConfig metricsConfig) { try { - return buildOrcMetrics(writer.getNumberOfRows(), writer.getSchema(), writer.getStatistics(), - fieldMetricsStream, metricsConfig, null); + return buildOrcMetrics( + writer.getNumberOfRows(), + writer.getSchema(), + writer.getStatistics(), + fieldMetricsStream, + metricsConfig, + null); } catch (IOException ioe) { throw new RuntimeIOException(ioe, "Failed to get statistics from writer"); } } - private static Metrics buildOrcMetrics(final long numOfRows, final TypeDescription orcSchema, - final ColumnStatistics[] colStats, - final Stream> fieldMetricsStream, - final MetricsConfig metricsConfig, - final NameMapping mapping) { - final TypeDescription orcSchemaWithIds = (!ORCSchemaUtil.hasIds(orcSchema) && mapping != null) ? - ORCSchemaUtil.applyNameMapping(orcSchema, mapping) : orcSchema; + private static Metrics buildOrcMetrics( + final long numOfRows, + final TypeDescription orcSchema, + final ColumnStatistics[] colStats, + final Stream> fieldMetricsStream, + final MetricsConfig metricsConfig, + final NameMapping mapping) { + final TypeDescription orcSchemaWithIds = + (!ORCSchemaUtil.hasIds(orcSchema) && mapping != null) + ? 
ORCSchemaUtil.applyNameMapping(orcSchema, mapping) + : orcSchema; final Set statsColumns = statsColumns(orcSchemaWithIds); - final MetricsConfig effectiveMetricsConfig = Optional.ofNullable(metricsConfig) - .orElseGet(MetricsConfig::getDefault); + final MetricsConfig effectiveMetricsConfig = + Optional.ofNullable(metricsConfig).orElseGet(MetricsConfig::getDefault); Map columnSizes = Maps.newHashMapWithExpectedSize(colStats.length); Map valueCounts = Maps.newHashMapWithExpectedSize(colStats.length); Map nullCounts = Maps.newHashMapWithExpectedSize(colStats.length); if (!ORCSchemaUtil.hasIds(orcSchemaWithIds)) { - return new Metrics(numOfRows, - columnSizes, - valueCounts, - nullCounts, - null, - null, - null); + return new Metrics(numOfRows, columnSizes, valueCounts, nullCounts, null, null, null); } final Schema schema = ORCSchemaUtil.convert(orcSchemaWithIds); Map lowerBounds = Maps.newHashMap(); Map upperBounds = Maps.newHashMap(); - Map> fieldMetricsMap = Optional.ofNullable(fieldMetricsStream) - .map(stream -> stream.collect(Collectors.toMap(FieldMetrics::id, Function.identity()))) - .orElseGet(Maps::newHashMap); + Map> fieldMetricsMap = + Optional.ofNullable(fieldMetricsStream) + .map(stream -> stream.collect(Collectors.toMap(FieldMetrics::id, Function.identity()))) + .orElseGet(Maps::newHashMap); for (int i = 0; i < colStats.length; i++) { final ColumnStatistics colStat = colStats[i]; final TypeDescription orcCol = orcSchemaWithIds.findSubtype(i); - final Optional icebergColOpt = ORCSchemaUtil.icebergID(orcCol) - .map(schema::findField); + final Optional icebergColOpt = + ORCSchemaUtil.icebergID(orcCol).map(schema::findField); if (icebergColOpt.isPresent()) { final Types.NestedField icebergCol = icebergColOpt.get(); final int fieldId = icebergCol.fieldId(); - final MetricsMode metricsMode = MetricsUtil.metricsMode(schema, effectiveMetricsConfig, icebergCol.fieldId()); + final MetricsMode metricsMode = + MetricsUtil.metricsMode(schema, effectiveMetricsConfig, icebergCol.fieldId()); columnSizes.put(fieldId, colStat.getBytesOnDisk()); if (metricsMode == MetricsModes.None.get()) { @@ -165,28 +179,39 @@ private static Metrics buildOrcMetrics(final long numOfRows, final TypeDescripti valueCounts.put(fieldId, colStat.getNumberOfValues() + nullCounts.get(fieldId)); if (metricsMode != MetricsModes.Counts.get()) { - Optional orcMin = (colStat.getNumberOfValues() > 0) ? - fromOrcMin(icebergCol.type(), colStat, metricsMode, fieldMetricsMap.get(fieldId)) : Optional.empty(); + Optional orcMin = + (colStat.getNumberOfValues() > 0) + ? fromOrcMin( + icebergCol.type(), colStat, metricsMode, fieldMetricsMap.get(fieldId)) + : Optional.empty(); orcMin.ifPresent(byteBuffer -> lowerBounds.put(icebergCol.fieldId(), byteBuffer)); - Optional orcMax = (colStat.getNumberOfValues() > 0) ? - fromOrcMax(icebergCol.type(), colStat, metricsMode, fieldMetricsMap.get(fieldId)) : Optional.empty(); + Optional orcMax = + (colStat.getNumberOfValues() > 0) + ? 
fromOrcMax( + icebergCol.type(), colStat, metricsMode, fieldMetricsMap.get(fieldId)) + : Optional.empty(); orcMax.ifPresent(byteBuffer -> upperBounds.put(icebergCol.fieldId(), byteBuffer)); } } } } - return new Metrics(numOfRows, + return new Metrics( + numOfRows, columnSizes, valueCounts, nullCounts, - MetricsUtil.createNanValueCounts(fieldMetricsMap.values().stream(), effectiveMetricsConfig, schema), + MetricsUtil.createNanValueCounts( + fieldMetricsMap.values().stream(), effectiveMetricsConfig, schema), lowerBounds, upperBounds); } - private static Optional fromOrcMin(Type type, ColumnStatistics columnStats, - MetricsMode metricsMode, FieldMetrics fieldMetrics) { + private static Optional fromOrcMin( + Type type, + ColumnStatistics columnStats, + MetricsMode metricsMode, + FieldMetrics fieldMetrics) { Object min = null; if (columnStats instanceof IntegerColumnStatistics) { min = ((IntegerColumnStatistics) columnStats).getMinimum(); @@ -195,12 +220,17 @@ private static Optional fromOrcMin(Type type, ColumnStatistics colum } } else if (columnStats instanceof DoubleColumnStatistics) { if (fieldMetrics != null) { - // since Orc includes NaN for upper/lower bounds of floating point columns, and we don't want this behavior, - // we have tracked metrics for such columns ourselves and thus do not need to rely on Orc's column statistics. + // since Orc includes NaN for upper/lower bounds of floating point columns, and we don't + // want this behavior, + // we have tracked metrics for such columns ourselves and thus do not need to rely on Orc's + // column statistics. min = fieldMetrics.lowerBound(); } else { - // imported files will not have metrics that were tracked by Iceberg, so fall back to the file's metrics. - min = replaceNaN(((DoubleColumnStatistics) columnStats).getMinimum(), Double.NEGATIVE_INFINITY); + // imported files will not have metrics that were tracked by Iceberg, so fall back to the + // file's metrics. 
+ min = + replaceNaN( + ((DoubleColumnStatistics) columnStats).getMinimum(), Double.NEGATIVE_INFINITY); if (type.typeId() == Type.TypeID.FLOAT) { min = ((Double) min).floatValue(); } @@ -208,29 +238,35 @@ private static Optional fromOrcMin(Type type, ColumnStatistics colum } else if (columnStats instanceof StringColumnStatistics) { min = ((StringColumnStatistics) columnStats).getMinimum(); } else if (columnStats instanceof DecimalColumnStatistics) { - min = Optional - .ofNullable(((DecimalColumnStatistics) columnStats).getMinimum()) - .map(minStats -> minStats.bigDecimalValue() - .setScale(((Types.DecimalType) type).scale())) - .orElse(null); + min = + Optional.ofNullable(((DecimalColumnStatistics) columnStats).getMinimum()) + .map( + minStats -> + minStats.bigDecimalValue().setScale(((Types.DecimalType) type).scale())) + .orElse(null); } else if (columnStats instanceof DateColumnStatistics) { min = (int) ((DateColumnStatistics) columnStats).getMinimumDayOfEpoch(); } else if (columnStats instanceof TimestampColumnStatistics) { TimestampColumnStatistics tColStats = (TimestampColumnStatistics) columnStats; Timestamp minValue = tColStats.getMinimumUTC(); - min = Optional.ofNullable(minValue) - .map(v -> DateTimeUtil.microsFromInstant(v.toInstant())) - .orElse(null); + min = + Optional.ofNullable(minValue) + .map(v -> DateTimeUtil.microsFromInstant(v.toInstant())) + .orElse(null); } else if (columnStats instanceof BooleanColumnStatistics) { BooleanColumnStatistics booleanStats = (BooleanColumnStatistics) columnStats; min = booleanStats.getFalseCount() <= 0; } - return Optional.ofNullable(Conversions.toByteBuffer(type, truncateIfNeeded(Bound.LOWER, type, min, metricsMode))); + return Optional.ofNullable( + Conversions.toByteBuffer(type, truncateIfNeeded(Bound.LOWER, type, min, metricsMode))); } - private static Optional fromOrcMax(Type type, ColumnStatistics columnStats, - MetricsMode metricsMode, FieldMetrics fieldMetrics) { + private static Optional fromOrcMax( + Type type, + ColumnStatistics columnStats, + MetricsMode metricsMode, + FieldMetrics fieldMetrics) { Object max = null; if (columnStats instanceof IntegerColumnStatistics) { max = ((IntegerColumnStatistics) columnStats).getMaximum(); @@ -239,12 +275,17 @@ private static Optional fromOrcMax(Type type, ColumnStatistics colum } } else if (columnStats instanceof DoubleColumnStatistics) { if (fieldMetrics != null) { - // since Orc includes NaN for upper/lower bounds of floating point columns, and we don't want this behavior, - // we have tracked metrics for such columns ourselves and thus do not need to rely on Orc's column statistics. + // since Orc includes NaN for upper/lower bounds of floating point columns, and we don't + // want this behavior, + // we have tracked metrics for such columns ourselves and thus do not need to rely on Orc's + // column statistics. max = fieldMetrics.upperBound(); } else { - // imported files will not have metrics that were tracked by Iceberg, so fall back to the file's metrics. - max = replaceNaN(((DoubleColumnStatistics) columnStats).getMaximum(), Double.POSITIVE_INFINITY); + // imported files will not have metrics that were tracked by Iceberg, so fall back to the + // file's metrics. 
+ max = + replaceNaN( + ((DoubleColumnStatistics) columnStats).getMaximum(), Double.POSITIVE_INFINITY); if (type.typeId() == Type.TypeID.FLOAT) { max = ((Double) max).floatValue(); } @@ -252,34 +293,41 @@ private static Optional fromOrcMax(Type type, ColumnStatistics colum } else if (columnStats instanceof StringColumnStatistics) { max = ((StringColumnStatistics) columnStats).getMaximum(); } else if (columnStats instanceof DecimalColumnStatistics) { - max = Optional - .ofNullable(((DecimalColumnStatistics) columnStats).getMaximum()) - .map(maxStats -> maxStats.bigDecimalValue() - .setScale(((Types.DecimalType) type).scale())) - .orElse(null); + max = + Optional.ofNullable(((DecimalColumnStatistics) columnStats).getMaximum()) + .map( + maxStats -> + maxStats.bigDecimalValue().setScale(((Types.DecimalType) type).scale())) + .orElse(null); } else if (columnStats instanceof DateColumnStatistics) { max = (int) ((DateColumnStatistics) columnStats).getMaximumDayOfEpoch(); } else if (columnStats instanceof TimestampColumnStatistics) { TimestampColumnStatistics tColStats = (TimestampColumnStatistics) columnStats; Timestamp maxValue = tColStats.getMaximumUTC(); - max = Optional.ofNullable(maxValue) - .map(v -> DateTimeUtil.microsFromInstant(v.toInstant())) - .orElse(null); + max = + Optional.ofNullable(maxValue) + .map(v -> DateTimeUtil.microsFromInstant(v.toInstant())) + .orElse(null); } else if (columnStats instanceof BooleanColumnStatistics) { BooleanColumnStatistics booleanStats = (BooleanColumnStatistics) columnStats; max = booleanStats.getTrueCount() > 0; } - return Optional.ofNullable(Conversions.toByteBuffer(type, truncateIfNeeded(Bound.UPPER, type, max, metricsMode))); + return Optional.ofNullable( + Conversions.toByteBuffer(type, truncateIfNeeded(Bound.UPPER, type, max, metricsMode))); } private static Object replaceNaN(double value, double replacement) { return Double.isNaN(value) ? replacement : value; } - private static Object truncateIfNeeded(Bound bound, Type type, Object value, MetricsMode metricsMode) { - // Out of the two types which could be truncated, string or binary, ORC only supports string bounds. + private static Object truncateIfNeeded( + Bound bound, Type type, Object value, MetricsMode metricsMode) { + // Out of the two types which could be truncated, string or binary, ORC only supports string + // bounds. // Therefore, truncation will be applied if needed only on string type. 
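// --- Editorial aside, not part of this patch: a rough sketch of the string-only
// truncation noted above, assuming a hypothetical truncate(4) metrics mode.
//   CharSequence value = "iceberg";                               // hypothetical column value
//   CharSequence lower = UnicodeUtil.truncateStringMin(Literal.of(value), 4).value();
//   // lower == "iceb"; for the upper bound, truncateStringMax is used instead so the
//   // truncated value still sorts at or above the original. Binary columns are left
//   // untouched because ORC does not expose binary bounds.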
- if (!(metricsMode instanceof MetricsModes.Truncate) || type.typeId() != Type.TypeID.STRING || value == null) { + if (!(metricsMode instanceof MetricsModes.Truncate) + || type.typeId() != Type.TypeID.STRING + || value == null) { return value; } @@ -289,8 +337,10 @@ private static Object truncateIfNeeded(Bound bound, Type type, Object value, Met switch (bound) { case UPPER: - return Optional.ofNullable(UnicodeUtil.truncateStringMax(Literal.of(charSequence), truncateLength)) - .map(Literal::value).orElse(charSequence); + return Optional.ofNullable( + UnicodeUtil.truncateStringMax(Literal.of(charSequence), truncateLength)) + .map(Literal::value) + .orElse(charSequence); case LOWER: return UnicodeUtil.truncateStringMin(Literal.of(charSequence), truncateLength).value(); default: @@ -304,11 +354,15 @@ private static Set statsColumns(TypeDescription schema) { private static class StatsColumnsVisitor extends OrcSchemaVisitor> { @Override - public Set record(TypeDescription record, List names, List> fields) { + public Set record( + TypeDescription record, List names, List> fields) { ImmutableSet.Builder result = ImmutableSet.builder(); fields.stream().filter(Objects::nonNull).forEach(result::addAll); - record.getChildren().stream().map(ORCSchemaUtil::icebergID).filter(Optional::isPresent) - .map(Optional::get).forEach(result::add); + record.getChildren().stream() + .map(ORCSchemaUtil::icebergID) + .filter(Optional::isPresent) + .map(Optional::get) + .forEach(result::add); return result.build(); } } diff --git a/orc/src/main/java/org/apache/iceberg/orc/OrcRowReader.java b/orc/src/main/java/org/apache/iceberg/orc/OrcRowReader.java index 68a140873b3c..a43a8856b2a5 100644 --- a/orc/src/main/java/org/apache/iceberg/orc/OrcRowReader.java +++ b/orc/src/main/java/org/apache/iceberg/orc/OrcRowReader.java @@ -16,19 +16,14 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.orc; import org.apache.orc.storage.ql.exec.vector.VectorizedRowBatch; -/** - * Used for implementing ORC row readers. - */ +/** Used for implementing ORC row readers. */ public interface OrcRowReader { - /** - * Reads a row. - */ + /** Reads a row. */ T read(VectorizedRowBatch batch, int row); void setBatchContext(long batchOffsetInFile); diff --git a/orc/src/main/java/org/apache/iceberg/orc/OrcRowWriter.java b/orc/src/main/java/org/apache/iceberg/orc/OrcRowWriter.java index 0487b9c5f965..8182a5054c65 100644 --- a/orc/src/main/java/org/apache/iceberg/orc/OrcRowWriter.java +++ b/orc/src/main/java/org/apache/iceberg/orc/OrcRowWriter.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.orc; import java.io.IOException; @@ -25,15 +24,13 @@ import org.apache.iceberg.FieldMetrics; import org.apache.orc.storage.ql.exec.vector.VectorizedRowBatch; -/** - * Write data value of a schema. - */ +/** Write data value of a schema. */ public interface OrcRowWriter { /** * Writes or appends a row to ORC's VectorizedRowBatch. * - * @param row the row data value to write. + * @param row the row data value to write. * @param output the VectorizedRowBatch to which the output will be written. * @throws IOException if there's any IO error while writing the data value. */ @@ -41,9 +38,7 @@ public interface OrcRowWriter { List> writers(); - /** - * Returns a stream of {@link FieldMetrics} that this OrcRowWriter keeps track of. - */ + /** Returns a stream of {@link FieldMetrics} that this OrcRowWriter keeps track of. 
*/ default Stream> metrics() { return Stream.empty(); } diff --git a/orc/src/main/java/org/apache/iceberg/orc/OrcSchemaVisitor.java b/orc/src/main/java/org/apache/iceberg/orc/OrcSchemaVisitor.java index 778037b8ce51..455aca15ffc4 100644 --- a/orc/src/main/java/org/apache/iceberg/orc/OrcSchemaVisitor.java +++ b/orc/src/main/java/org/apache/iceberg/orc/OrcSchemaVisitor.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.orc; import java.util.Deque; @@ -25,9 +24,7 @@ import org.apache.iceberg.relocated.com.google.common.collect.Lists; import org.apache.orc.TypeDescription; -/** - * Generic visitor of an ORC Schema. - */ +/** Generic visitor of an ORC Schema. */ public abstract class OrcSchemaVisitor { private final Deque fieldNames = Lists.newLinkedList(); @@ -87,9 +84,10 @@ public static T visit(TypeDescription schema, OrcSchemaVisitor visitor) { } } - private static List visitFields(List fields, List names, - OrcSchemaVisitor visitor) { - Preconditions.checkArgument(fields.size() == names.size(), "Not all fields have names in ORC struct"); + private static List visitFields( + List fields, List names, OrcSchemaVisitor visitor) { + Preconditions.checkArgument( + fields.size() == names.size(), "Not all fields have names in ORC struct"); List results = Lists.newArrayListWithExpectedSize(fields.size()); for (int i = 0; i < fields.size(); i++) { diff --git a/orc/src/main/java/org/apache/iceberg/orc/OrcSchemaWithTypeVisitor.java b/orc/src/main/java/org/apache/iceberg/orc/OrcSchemaWithTypeVisitor.java index 53b0c9f2fdeb..fd37283a86f8 100644 --- a/orc/src/main/java/org/apache/iceberg/orc/OrcSchemaWithTypeVisitor.java +++ b/orc/src/main/java/org/apache/iceberg/orc/OrcSchemaWithTypeVisitor.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.orc; import java.util.List; @@ -25,14 +24,16 @@ import org.apache.iceberg.types.Types; import org.apache.orc.TypeDescription; - public abstract class OrcSchemaWithTypeVisitor { public static T visit( - org.apache.iceberg.Schema iSchema, TypeDescription schema, OrcSchemaWithTypeVisitor visitor) { + org.apache.iceberg.Schema iSchema, + TypeDescription schema, + OrcSchemaWithTypeVisitor visitor) { return visit(iSchema.asStruct(), schema, visitor); } - public static T visit(Type iType, TypeDescription schema, OrcSchemaWithTypeVisitor visitor) { + public static T visit( + Type iType, TypeDescription schema, OrcSchemaWithTypeVisitor visitor) { switch (schema.getCategory()) { case STRUCT: return visitRecord(iType != null ? iType.asStructType() : null, schema, visitor); @@ -43,13 +44,15 @@ public static T visit(Type iType, TypeDescription schema, OrcSchemaWithTypeV case LIST: Types.ListType list = iType != null ? iType.asListType() : null; return visitor.list( - list, schema, + list, + schema, visit(list != null ? list.elementType() : null, schema.getChildren().get(0), visitor)); case MAP: Types.MapType map = iType != null ? iType.asMapType() : null; return visitor.map( - map, schema, + map, + schema, visit(map != null ? map.keyType() : null, schema.getChildren().get(0), visitor), visit(map != null ? 
map.valueType() : null, schema.getChildren().get(1), visitor)); @@ -71,7 +74,8 @@ private static T visitRecord( return visitor.record(struct, record, names, results); } - public T record(Types.StructType iStruct, TypeDescription record, List names, List fields) { + public T record( + Types.StructType iStruct, TypeDescription record, List names, List fields) { return null; } diff --git a/orc/src/main/java/org/apache/iceberg/orc/OrcToIcebergVisitor.java b/orc/src/main/java/org/apache/iceberg/orc/OrcToIcebergVisitor.java index d00060eb34e6..6992f88b870b 100644 --- a/orc/src/main/java/org/apache/iceberg/orc/OrcToIcebergVisitor.java +++ b/orc/src/main/java/org/apache/iceberg/orc/OrcToIcebergVisitor.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.orc; import java.util.List; @@ -25,28 +24,31 @@ import org.apache.iceberg.types.Types; import org.apache.orc.TypeDescription; -/** - * Converts an ORC schema to Iceberg. - */ +/** Converts an ORC schema to Iceberg. */ class OrcToIcebergVisitor extends OrcSchemaVisitor> { @Override - public Optional record(TypeDescription record, List names, - List> fields) { + public Optional record( + TypeDescription record, List names, List> fields) { boolean isOptional = ORCSchemaUtil.isOptional(record); Optional icebergIdOpt = ORCSchemaUtil.icebergID(record); if (!icebergIdOpt.isPresent() || fields.stream().noneMatch(Optional::isPresent)) { return Optional.empty(); } - Types.StructType structType = Types.StructType.of( - fields.stream().filter(Optional::isPresent).map(Optional::get).collect(Collectors.toList())); - return Optional.of(Types.NestedField.of(icebergIdOpt.get(), isOptional, currentFieldName(), structType)); + Types.StructType structType = + Types.StructType.of( + fields.stream() + .filter(Optional::isPresent) + .map(Optional::get) + .collect(Collectors.toList())); + return Optional.of( + Types.NestedField.of(icebergIdOpt.get(), isOptional, currentFieldName(), structType)); } @Override - public Optional list(TypeDescription array, - Optional element) { + public Optional list( + TypeDescription array, Optional element) { boolean isOptional = ORCSchemaUtil.isOptional(array); Optional icebergIdOpt = ORCSchemaUtil.icebergID(array); @@ -55,16 +57,18 @@ public Optional list(TypeDescription array, } Types.NestedField foundElement = element.get(); - Types.ListType listTypeWithElem = ORCSchemaUtil.isOptional(array.getChildren().get(0)) ? - Types.ListType.ofOptional(foundElement.fieldId(), foundElement.type()) : - Types.ListType.ofRequired(foundElement.fieldId(), foundElement.type()); + Types.ListType listTypeWithElem = + ORCSchemaUtil.isOptional(array.getChildren().get(0)) + ? 
Types.ListType.ofOptional(foundElement.fieldId(), foundElement.type()) + : Types.ListType.ofRequired(foundElement.fieldId(), foundElement.type()); - return Optional.of(Types.NestedField.of(icebergIdOpt.get(), isOptional, currentFieldName(), listTypeWithElem)); + return Optional.of( + Types.NestedField.of(icebergIdOpt.get(), isOptional, currentFieldName(), listTypeWithElem)); } @Override - public Optional map(TypeDescription map, Optional key, - Optional value) { + public Optional map( + TypeDescription map, Optional key, Optional value) { boolean isOptional = ORCSchemaUtil.isOptional(map); Optional icebergIdOpt = ORCSchemaUtil.icebergID(map); @@ -74,11 +78,15 @@ public Optional map(TypeDescription map, Optional primitive(TypeDescription primitive) { foundField = Types.NestedField.of(icebergID, isOptional, name, Types.IntegerType.get()); break; case LONG: - String longAttributeValue = primitive.getAttributeValue(ORCSchemaUtil.ICEBERG_LONG_TYPE_ATTRIBUTE); - ORCSchemaUtil.LongType longType = longAttributeValue == null ? - ORCSchemaUtil.LongType.LONG : ORCSchemaUtil.LongType.valueOf(longAttributeValue); + String longAttributeValue = + primitive.getAttributeValue(ORCSchemaUtil.ICEBERG_LONG_TYPE_ATTRIBUTE); + ORCSchemaUtil.LongType longType = + longAttributeValue == null + ? ORCSchemaUtil.LongType.LONG + : ORCSchemaUtil.LongType.valueOf(longAttributeValue); switch (longType) { case TIME: foundField = Types.NestedField.of(icebergID, isOptional, name, Types.TimeType.get()); @@ -129,16 +140,22 @@ public Optional primitive(TypeDescription primitive) { foundField = Types.NestedField.of(icebergID, isOptional, name, Types.StringType.get()); break; case BINARY: - String binaryAttributeValue = primitive.getAttributeValue(ORCSchemaUtil.ICEBERG_BINARY_TYPE_ATTRIBUTE); - ORCSchemaUtil.BinaryType binaryType = binaryAttributeValue == null ? ORCSchemaUtil.BinaryType.BINARY : - ORCSchemaUtil.BinaryType.valueOf(binaryAttributeValue); + String binaryAttributeValue = + primitive.getAttributeValue(ORCSchemaUtil.ICEBERG_BINARY_TYPE_ATTRIBUTE); + ORCSchemaUtil.BinaryType binaryType = + binaryAttributeValue == null + ? 
ORCSchemaUtil.BinaryType.BINARY + : ORCSchemaUtil.BinaryType.valueOf(binaryAttributeValue); switch (binaryType) { case UUID: foundField = Types.NestedField.of(icebergID, isOptional, name, Types.UUIDType.get()); break; case FIXED: - int fixedLength = Integer.parseInt(primitive.getAttributeValue(ORCSchemaUtil.ICEBERG_FIELD_LENGTH)); - foundField = Types.NestedField.of(icebergID, isOptional, name, Types.FixedType.ofLength(fixedLength)); + int fixedLength = + Integer.parseInt(primitive.getAttributeValue(ORCSchemaUtil.ICEBERG_FIELD_LENGTH)); + foundField = + Types.NestedField.of( + icebergID, isOptional, name, Types.FixedType.ofLength(fixedLength)); break; case BINARY: foundField = Types.NestedField.of(icebergID, isOptional, name, Types.BinaryType.get()); @@ -151,14 +168,20 @@ public Optional primitive(TypeDescription primitive) { foundField = Types.NestedField.of(icebergID, isOptional, name, Types.DateType.get()); break; case TIMESTAMP: - foundField = Types.NestedField.of(icebergID, isOptional, name, Types.TimestampType.withoutZone()); + foundField = + Types.NestedField.of(icebergID, isOptional, name, Types.TimestampType.withoutZone()); break; case TIMESTAMP_INSTANT: - foundField = Types.NestedField.of(icebergID, isOptional, name, Types.TimestampType.withZone()); + foundField = + Types.NestedField.of(icebergID, isOptional, name, Types.TimestampType.withZone()); break; case DECIMAL: - foundField = Types.NestedField.of(icebergID, isOptional, name, - Types.DecimalType.of(primitive.getPrecision(), primitive.getScale())); + foundField = + Types.NestedField.of( + icebergID, + isOptional, + name, + Types.DecimalType.of(primitive.getPrecision(), primitive.getScale())); break; default: throw new IllegalArgumentException("Can't handle " + primitive); diff --git a/orc/src/main/java/org/apache/iceberg/orc/OrcValueReader.java b/orc/src/main/java/org/apache/iceberg/orc/OrcValueReader.java index 4c2772fb24a6..d4aab5a63cbc 100644 --- a/orc/src/main/java/org/apache/iceberg/orc/OrcValueReader.java +++ b/orc/src/main/java/org/apache/iceberg/orc/OrcValueReader.java @@ -16,12 +16,10 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.orc; import org.apache.orc.storage.ql.exec.vector.ColumnVector; - public interface OrcValueReader { default T read(ColumnVector vector, int row) { int rowIndex = vector.isRepeating ? 0 : row; @@ -34,6 +32,5 @@ default T read(ColumnVector vector, int row) { T nonNullRead(ColumnVector vector, int row); - default void setBatchContext(long batchOffsetInFile) { - } + default void setBatchContext(long batchOffsetInFile) {} } diff --git a/orc/src/main/java/org/apache/iceberg/orc/OrcValueReaders.java b/orc/src/main/java/org/apache/iceberg/orc/OrcValueReaders.java index ecb1c725eb4e..18fef2940380 100644 --- a/orc/src/main/java/org/apache/iceberg/orc/OrcValueReaders.java +++ b/orc/src/main/java/org/apache/iceberg/orc/OrcValueReaders.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.orc; import java.util.Arrays; @@ -30,10 +29,8 @@ import org.apache.orc.storage.ql.exec.vector.LongColumnVector; import org.apache.orc.storage.ql.exec.vector.StructColumnVector; - public class OrcValueReaders { - private OrcValueReaders() { - } + private OrcValueReaders() {} public static OrcValueReader booleans() { return BooleanReader.INSTANCE; @@ -66,8 +63,7 @@ public static OrcValueReader constants(C constant) { private static class BooleanReader implements OrcValueReader { static final BooleanReader INSTANCE = new BooleanReader(); - private BooleanReader() { - } + private BooleanReader() {} @Override public Boolean nonNullRead(ColumnVector vector, int row) { @@ -78,8 +74,7 @@ public Boolean nonNullRead(ColumnVector vector, int row) { private static class IntegerReader implements OrcValueReader { static final IntegerReader INSTANCE = new IntegerReader(); - private IntegerReader() { - } + private IntegerReader() {} @Override public Integer nonNullRead(ColumnVector vector, int row) { @@ -90,8 +85,7 @@ public Integer nonNullRead(ColumnVector vector, int row) { private static class LongReader implements OrcValueReader { static final LongReader INSTANCE = new LongReader(); - private LongReader() { - } + private LongReader() {} @Override public Long nonNullRead(ColumnVector vector, int row) { @@ -102,8 +96,7 @@ public Long nonNullRead(ColumnVector vector, int row) { private static class FloatReader implements OrcValueReader { private static final FloatReader INSTANCE = new FloatReader(); - private FloatReader() { - } + private FloatReader() {} @Override public Float nonNullRead(ColumnVector vector, int row) { @@ -114,8 +107,7 @@ public Float nonNullRead(ColumnVector vector, int row) { private static class DoubleReader implements OrcValueReader { private static final DoubleReader INSTANCE = new DoubleReader(); - private DoubleReader() { - } + private DoubleReader() {} @Override public Double nonNullRead(ColumnVector vector, int row) { @@ -126,15 +118,16 @@ public Double nonNullRead(ColumnVector vector, int row) { private static class BytesReader implements OrcValueReader { private static final BytesReader INSTANCE = new BytesReader(); - private BytesReader() { - } + private BytesReader() {} @Override public byte[] nonNullRead(ColumnVector vector, int row) { BytesColumnVector bytesVector = (BytesColumnVector) vector; return Arrays.copyOfRange( - bytesVector.vector[row], bytesVector.start[row], bytesVector.start[row] + bytesVector.length[row]); + bytesVector.vector[row], + bytesVector.start[row], + bytesVector.start[row] + bytesVector.length[row]); } } @@ -142,7 +135,8 @@ public abstract static class StructReader implements OrcValueReader { private final OrcValueReader[] readers; private final boolean[] isConstantOrMetadataField; - protected StructReader(List> readers, Types.StructType struct, Map idToConstant) { + protected StructReader( + List> readers, Types.StructType struct, Map idToConstant) { List fields = struct.fields(); this.readers = new OrcValueReader[fields.size()]; this.isConstantOrMetadataField = new boolean[fields.size()]; diff --git a/orc/src/main/java/org/apache/iceberg/orc/OrcValueWriter.java b/orc/src/main/java/org/apache/iceberg/orc/OrcValueWriter.java index d8c27ac30879..f58971641228 100644 --- a/orc/src/main/java/org/apache/iceberg/orc/OrcValueWriter.java +++ b/orc/src/main/java/org/apache/iceberg/orc/OrcValueWriter.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.orc; import java.util.stream.Stream; @@ -28,8 +27,8 @@ public interface OrcValueWriter { /** * Take a value from the data value and add it to the ORC output. * - * @param rowId the row in the ColumnVector - * @param data the data value to write. + * @param rowId the row in the ColumnVector + * @param data the data value to write. * @param output the ColumnVector to put the value into */ default void write(int rowId, T data, ColumnVector output) { @@ -49,9 +48,7 @@ default void nullWrite() { // no op } - /** - * Returns a stream of {@link FieldMetrics} that this OrcValueWriter keeps track of. - */ + /** Returns a stream of {@link FieldMetrics} that this OrcValueWriter keeps track of. */ default Stream> metrics() { return Stream.empty(); } diff --git a/orc/src/main/java/org/apache/iceberg/orc/RemoveIds.java b/orc/src/main/java/org/apache/iceberg/orc/RemoveIds.java index bfa56deb54dd..a4f0a296803f 100644 --- a/orc/src/main/java/org/apache/iceberg/orc/RemoveIds.java +++ b/orc/src/main/java/org/apache/iceberg/orc/RemoveIds.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.orc; import java.util.List; @@ -26,7 +25,8 @@ class RemoveIds extends OrcSchemaVisitor { @Override - public TypeDescription record(TypeDescription record, List names, List fields) { + public TypeDescription record( + TypeDescription record, List names, List fields) { Preconditions.checkArgument(names.size() == fields.size(), "All fields must have names."); TypeDescription struct = TypeDescription.createStruct(); diff --git a/orc/src/main/java/org/apache/iceberg/orc/VectorizedRowBatchIterator.java b/orc/src/main/java/org/apache/iceberg/orc/VectorizedRowBatchIterator.java index daba2c457302..86e701aba3ae 100644 --- a/orc/src/main/java/org/apache/iceberg/orc/VectorizedRowBatchIterator.java +++ b/orc/src/main/java/org/apache/iceberg/orc/VectorizedRowBatchIterator.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.orc; import java.io.IOException; @@ -28,18 +27,20 @@ import org.apache.orc.storage.ql.exec.vector.VectorizedRowBatch; /** - * An adaptor so that the ORC RecordReader can be used as an Iterator. - * Because the same VectorizedRowBatch is reused on each call to next, - * it gets changed when hasNext or next is called. + * An adaptor so that the ORC RecordReader can be used as an Iterator. Because the same + * VectorizedRowBatch is reused on each call to next, it gets changed when hasNext or next is + * called. 
*/ -public class VectorizedRowBatchIterator implements CloseableIterator> { +public class VectorizedRowBatchIterator + implements CloseableIterator> { private final String fileLocation; private final RecordReader rows; private final VectorizedRowBatch batch; private boolean advanced = false; private long batchOffsetInFile = 0; - VectorizedRowBatchIterator(String fileLocation, TypeDescription schema, RecordReader rows, int recordsPerBatch) { + VectorizedRowBatchIterator( + String fileLocation, TypeDescription schema, RecordReader rows, int recordsPerBatch) { this.fileLocation = fileLocation; this.rows = rows; this.batch = schema.createRowBatch(recordsPerBatch); diff --git a/orc/src/test/java/org/apache/iceberg/orc/TestBuildOrcProjection.java b/orc/src/test/java/org/apache/iceberg/orc/TestBuildOrcProjection.java index 12b74aff9b74..1fee1ef12c73 100644 --- a/orc/src/test/java/org/apache/iceberg/orc/TestBuildOrcProjection.java +++ b/orc/src/test/java/org/apache/iceberg/orc/TestBuildOrcProjection.java @@ -16,9 +16,12 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.orc; +import static org.apache.iceberg.types.Types.NestedField.optional; +import static org.apache.iceberg.types.Types.NestedField.required; +import static org.junit.Assert.assertEquals; + import org.apache.iceberg.Schema; import org.apache.iceberg.types.Types; import org.apache.orc.TypeDescription; @@ -26,21 +29,14 @@ import org.junit.Test; import org.junit.rules.ExpectedException; -import static org.apache.iceberg.types.Types.NestedField.optional; -import static org.apache.iceberg.types.Types.NestedField.required; -import static org.junit.Assert.assertEquals; - -/** - * Test projections on ORC types. - */ +/** Test projections on ORC types. 
*/ public class TestBuildOrcProjection { @Test public void testProjectionPrimitiveNoOp() { - Schema originalSchema = new Schema( - optional(1, "a", Types.IntegerType.get()), - optional(2, "b", Types.StringType.get()) - ); + Schema originalSchema = + new Schema( + optional(1, "a", Types.IntegerType.get()), optional(2, "b", Types.StringType.get())); // Original mapping (stored in ORC) TypeDescription orcSchema = ORCSchemaUtil.convert(originalSchema); @@ -53,19 +49,19 @@ public void testProjectionPrimitiveNoOp() { @Test public void testProjectionPrimitive() { - Schema originalSchema = new Schema( - optional(1, "a", Types.IntegerType.get()), - optional(2, "b", Types.StringType.get()) - ); + Schema originalSchema = + new Schema( + optional(1, "a", Types.IntegerType.get()), optional(2, "b", Types.StringType.get())); // Original mapping (stored in ORC) TypeDescription orcSchema = ORCSchemaUtil.convert(originalSchema); // Evolve schema - Schema evolveSchema = new Schema( - optional(2, "a", Types.StringType.get()), - optional(3, "c", Types.DateType.get()) // will produce ORC column c_r3 (new) - ); + Schema evolveSchema = + new Schema( + optional(2, "a", Types.StringType.get()), + optional(3, "c", Types.DateType.get()) // will produce ORC column c_r3 (new) + ); TypeDescription newOrcSchema = ORCSchemaUtil.buildOrcProjection(evolveSchema, orcSchema); assertEquals(2, newOrcSchema.getChildren().size()); @@ -77,13 +73,10 @@ public void testProjectionPrimitive() { @Test public void testProjectionNestedNoOp() { - Types.StructType nestedStructType = Types.StructType.of( - optional(2, "b", Types.StringType.get()), - optional(3, "c", Types.DateType.get()) - ); - Schema originalSchema = new Schema( - optional(1, "a", nestedStructType) - ); + Types.StructType nestedStructType = + Types.StructType.of( + optional(2, "b", Types.StringType.get()), optional(3, "c", Types.DateType.get())); + Schema originalSchema = new Schema(optional(1, "a", nestedStructType)); // Original mapping (stored in ORC) TypeDescription orcSchema = ORCSchemaUtil.convert(originalSchema); @@ -100,25 +93,19 @@ public void testProjectionNestedNoOp() { @Test public void testProjectionNested() { - Types.StructType nestedStructType = Types.StructType.of( - optional(2, "b", Types.StringType.get()), - optional(3, "c", Types.DateType.get()) - ); - Schema originalSchema = new Schema( - optional(1, "a", nestedStructType) - ); + Types.StructType nestedStructType = + Types.StructType.of( + optional(2, "b", Types.StringType.get()), optional(3, "c", Types.DateType.get())); + Schema originalSchema = new Schema(optional(1, "a", nestedStructType)); // Original mapping (stored in ORC) TypeDescription orcSchema = ORCSchemaUtil.convert(originalSchema); // Evolve schema - Types.StructType newNestedStructType = Types.StructType.of( - optional(3, "cc", Types.DateType.get()), - optional(2, "bb", Types.StringType.get()) - ); - Schema evolveSchema = new Schema( - optional(1, "aa", newNestedStructType) - ); + Types.StructType newNestedStructType = + Types.StructType.of( + optional(3, "cc", Types.DateType.get()), optional(2, "bb", Types.StringType.get())); + Schema evolveSchema = new Schema(optional(1, "aa", newNestedStructType)); TypeDescription newOrcSchema = ORCSchemaUtil.buildOrcProjection(evolveSchema, orcSchema); assertEquals(1, newOrcSchema.getChildren().size()); @@ -132,17 +119,13 @@ public void testProjectionNested() { @Test public void testEvolutionAddContainerField() { - Schema baseSchema = new Schema( - required(1, "a", Types.IntegerType.get()) - ); + 
Schema baseSchema = new Schema(required(1, "a", Types.IntegerType.get())); TypeDescription baseOrcSchema = ORCSchemaUtil.convert(baseSchema); - Schema evolvedSchema = new Schema( - required(1, "a", Types.IntegerType.get()), - optional(2, "b", Types.StructType.of( - required(3, "c", Types.LongType.get()) - )) - ); + Schema evolvedSchema = + new Schema( + required(1, "a", Types.IntegerType.get()), + optional(2, "b", Types.StructType.of(required(3, "c", Types.LongType.get())))); TypeDescription newOrcSchema = ORCSchemaUtil.buildOrcProjection(evolvedSchema, baseOrcSchema); assertEquals(2, newOrcSchema.getChildren().size()); @@ -154,29 +137,28 @@ public void testEvolutionAddContainerField() { assertEquals(TypeDescription.Category.LONG, nestedCol.findSubtype("c_r3").getCategory()); } - @Rule - public ExpectedException exceptionRule = ExpectedException.none(); + @Rule public ExpectedException exceptionRule = ExpectedException.none(); @Test public void testRequiredNestedFieldMissingInFile() { exceptionRule.expect(IllegalArgumentException.class); exceptionRule.expectMessage("Field 4 of type long is required and was not found"); - Schema baseSchema = new Schema( - required(1, "a", Types.IntegerType.get()), - required(2, "b", Types.StructType.of( - required(3, "c", Types.LongType.get()) - )) - ); + Schema baseSchema = + new Schema( + required(1, "a", Types.IntegerType.get()), + required(2, "b", Types.StructType.of(required(3, "c", Types.LongType.get())))); TypeDescription baseOrcSchema = ORCSchemaUtil.convert(baseSchema); - Schema evolvedSchema = new Schema( - required(1, "a", Types.IntegerType.get()), - required(2, "b", Types.StructType.of( - required(3, "c", Types.LongType.get()), - required(4, "d", Types.LongType.get()) - )) - ); + Schema evolvedSchema = + new Schema( + required(1, "a", Types.IntegerType.get()), + required( + 2, + "b", + Types.StructType.of( + required(3, "c", Types.LongType.get()), + required(4, "d", Types.LongType.get())))); ORCSchemaUtil.buildOrcProjection(evolvedSchema, baseOrcSchema); } diff --git a/orc/src/test/java/org/apache/iceberg/orc/TestEstimateOrcAvgWidthVisitor.java b/orc/src/test/java/org/apache/iceberg/orc/TestEstimateOrcAvgWidthVisitor.java index aca95efd4959..46736bcf11c9 100644 --- a/orc/src/test/java/org/apache/iceberg/orc/TestEstimateOrcAvgWidthVisitor.java +++ b/orc/src/test/java/org/apache/iceberg/orc/TestEstimateOrcAvgWidthVisitor.java @@ -16,50 +16,62 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.orc; +import static org.apache.iceberg.types.Types.NestedField.optional; +import static org.apache.iceberg.types.Types.NestedField.required; + import org.apache.iceberg.Schema; import org.apache.iceberg.types.Types; import org.apache.orc.TypeDescription; import org.junit.Assert; import org.junit.Test; -import static org.apache.iceberg.types.Types.NestedField.optional; -import static org.apache.iceberg.types.Types.NestedField.required; - public class TestEstimateOrcAvgWidthVisitor { // all supported fields protected static final Types.NestedField ID_FIELD = required(1, "id", Types.IntegerType.get()); protected static final Types.NestedField DATA_FIELD = optional(2, "data", Types.StringType.get()); - protected static final Types.NestedField FLOAT_FIELD = required(3, "float", Types.FloatType.get()); - protected static final Types.NestedField DOUBLE_FIELD = optional(4, "double", Types.DoubleType.get()); - protected static final Types.NestedField DECIMAL_FIELD = optional(5, "decimal", Types.DecimalType.of(5, 3)); - protected static final Types.NestedField FIXED_FIELD = optional(7, "fixed", Types.FixedType.ofLength(4)); - protected static final Types.NestedField BINARY_FIELD = optional(8, "binary", Types.BinaryType.get()); - protected static final Types.NestedField FLOAT_LIST_FIELD = optional(9, "floatList", - Types.ListType.ofRequired(10, Types.FloatType.get())); + protected static final Types.NestedField FLOAT_FIELD = + required(3, "float", Types.FloatType.get()); + protected static final Types.NestedField DOUBLE_FIELD = + optional(4, "double", Types.DoubleType.get()); + protected static final Types.NestedField DECIMAL_FIELD = + optional(5, "decimal", Types.DecimalType.of(5, 3)); + protected static final Types.NestedField FIXED_FIELD = + optional(7, "fixed", Types.FixedType.ofLength(4)); + protected static final Types.NestedField BINARY_FIELD = + optional(8, "binary", Types.BinaryType.get()); + protected static final Types.NestedField FLOAT_LIST_FIELD = + optional(9, "floatList", Types.ListType.ofRequired(10, Types.FloatType.get())); protected static final Types.NestedField LONG_FIELD = optional(11, "long", Types.LongType.get()); - protected static final Types.NestedField BOOLEAN_FIELD = optional(12, "boolean", Types.BooleanType.get()); - protected static final Types.NestedField TIMESTAMP_ZONE_FIELD = optional(13, "timestampZone", - Types.TimestampType.withZone()); - protected static final Types.NestedField TIMESTAMP_FIELD = optional(14, "timestamp", - Types.TimestampType.withoutZone()); + protected static final Types.NestedField BOOLEAN_FIELD = + optional(12, "boolean", Types.BooleanType.get()); + protected static final Types.NestedField TIMESTAMP_ZONE_FIELD = + optional(13, "timestampZone", Types.TimestampType.withZone()); + protected static final Types.NestedField TIMESTAMP_FIELD = + optional(14, "timestamp", Types.TimestampType.withoutZone()); protected static final Types.NestedField DATE_FIELD = optional(15, "date", Types.DateType.get()); protected static final Types.NestedField UUID_FIELD = required(16, "uuid", Types.UUIDType.get()); - protected static final Types.NestedField MAP_FIELD_1 = optional(17, "map1", - Types.MapType.ofOptional(18, 19, Types.FloatType.get(), Types.StringType.get()) - ); - protected static final Types.NestedField MAP_FIELD_2 = optional(20, "map2", - Types.MapType.ofOptional(21, 22, Types.IntegerType.get(), Types.DoubleType.get()) - ); - protected static final Types.NestedField STRUCT_FIELD = optional(23, "struct", 
Types.StructType.of( - required(24, "booleanField", Types.BooleanType.get()), - optional(25, "date", Types.DateType.get()), - optional(27, "timestamp", Types.TimestampType.withZone()) - )); + protected static final Types.NestedField MAP_FIELD_1 = + optional( + 17, + "map1", + Types.MapType.ofOptional(18, 19, Types.FloatType.get(), Types.StringType.get())); + protected static final Types.NestedField MAP_FIELD_2 = + optional( + 20, + "map2", + Types.MapType.ofOptional(21, 22, Types.IntegerType.get(), Types.DoubleType.get())); + protected static final Types.NestedField STRUCT_FIELD = + optional( + 23, + "struct", + Types.StructType.of( + required(24, "booleanField", Types.BooleanType.get()), + optional(25, "date", Types.DateType.get()), + optional(27, "timestamp", Types.TimestampType.withZone()))); @Test public void testEstimateIntegerWidth() { @@ -146,7 +158,8 @@ public void testEstimateTimestampWidth() { Schema timestampZoneSchema = new Schema(TIMESTAMP_ZONE_FIELD); TypeDescription timestampZoneOrcSchema = ORCSchemaUtil.convert(timestampZoneSchema); long estimateLength = getEstimateLength(timestampZoneOrcSchema); - Assert.assertEquals("Estimated average length of timestamps with zone must be 12.", 12, estimateLength); + Assert.assertEquals( + "Estimated average length of timestamps with zone must be 12.", 12, estimateLength); Schema timestampSchema = new Schema(TIMESTAMP_FIELD); TypeDescription timestampOrcSchema = ORCSchemaUtil.convert(timestampSchema); @@ -188,8 +201,20 @@ public void testEstimateStructWidth() { @Test public void testEstimateFullWidth() { - Schema fullSchema = new Schema(ID_FIELD, DATA_FIELD, FLOAT_FIELD, DOUBLE_FIELD, DECIMAL_FIELD, FIXED_FIELD, - BINARY_FIELD, FLOAT_LIST_FIELD, LONG_FIELD, MAP_FIELD_1, MAP_FIELD_2, STRUCT_FIELD); + Schema fullSchema = + new Schema( + ID_FIELD, + DATA_FIELD, + FLOAT_FIELD, + DOUBLE_FIELD, + DECIMAL_FIELD, + FIXED_FIELD, + BINARY_FIELD, + FLOAT_LIST_FIELD, + LONG_FIELD, + MAP_FIELD_1, + MAP_FIELD_2, + STRUCT_FIELD); TypeDescription fullOrcSchema = ORCSchemaUtil.convert(fullSchema); long estimateLength = getEstimateLength(fullOrcSchema); Assert.assertEquals("Estimated average length of the row must be 611.", 611, estimateLength); @@ -197,6 +222,8 @@ public void testEstimateFullWidth() { private Integer getEstimateLength(TypeDescription orcSchemaWithDate) { return OrcSchemaVisitor.visitSchema(orcSchemaWithDate, new EstimateOrcAvgWidthVisitor()) - .stream().reduce(Integer::sum).orElse(0); + .stream() + .reduce(Integer::sum) + .orElse(0); } } diff --git a/orc/src/test/java/org/apache/iceberg/orc/TestExpressionToSearchArgument.java b/orc/src/test/java/org/apache/iceberg/orc/TestExpressionToSearchArgument.java index a7c77b111be4..4af8768786bf 100644 --- a/orc/src/test/java/org/apache/iceberg/orc/TestExpressionToSearchArgument.java +++ b/orc/src/test/java/org/apache/iceberg/orc/TestExpressionToSearchArgument.java @@ -16,9 +16,24 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.orc; +import static org.apache.iceberg.expressions.Expressions.and; +import static org.apache.iceberg.expressions.Expressions.equal; +import static org.apache.iceberg.expressions.Expressions.greaterThan; +import static org.apache.iceberg.expressions.Expressions.greaterThanOrEqual; +import static org.apache.iceberg.expressions.Expressions.in; +import static org.apache.iceberg.expressions.Expressions.isNaN; +import static org.apache.iceberg.expressions.Expressions.isNull; +import static org.apache.iceberg.expressions.Expressions.lessThan; +import static org.apache.iceberg.expressions.Expressions.lessThanOrEqual; +import static org.apache.iceberg.expressions.Expressions.notEqual; +import static org.apache.iceberg.expressions.Expressions.notIn; +import static org.apache.iceberg.expressions.Expressions.notNaN; +import static org.apache.iceberg.expressions.Expressions.notNull; +import static org.apache.iceberg.types.Types.NestedField.optional; +import static org.apache.iceberg.types.Types.NestedField.required; + import java.math.BigDecimal; import java.nio.ByteBuffer; import java.sql.Date; @@ -46,70 +61,79 @@ import org.junit.Assert; import org.junit.Test; -import static org.apache.iceberg.expressions.Expressions.and; -import static org.apache.iceberg.expressions.Expressions.equal; -import static org.apache.iceberg.expressions.Expressions.greaterThan; -import static org.apache.iceberg.expressions.Expressions.greaterThanOrEqual; -import static org.apache.iceberg.expressions.Expressions.in; -import static org.apache.iceberg.expressions.Expressions.isNaN; -import static org.apache.iceberg.expressions.Expressions.isNull; -import static org.apache.iceberg.expressions.Expressions.lessThan; -import static org.apache.iceberg.expressions.Expressions.lessThanOrEqual; -import static org.apache.iceberg.expressions.Expressions.notEqual; -import static org.apache.iceberg.expressions.Expressions.notIn; -import static org.apache.iceberg.expressions.Expressions.notNaN; -import static org.apache.iceberg.expressions.Expressions.notNull; -import static org.apache.iceberg.types.Types.NestedField.optional; -import static org.apache.iceberg.types.Types.NestedField.required; - public class TestExpressionToSearchArgument { @Test public void testPrimitiveTypes() { - Schema schema = new Schema( - required(1, "int", Types.IntegerType.get()), - required(2, "long", Types.LongType.get()), - required(3, "float", Types.FloatType.get()), - required(4, "double", Types.DoubleType.get()), - required(5, "boolean", Types.BooleanType.get()), - required(6, "string", Types.StringType.get()), - required(7, "date", Types.DateType.get()), - required(8, "time", Types.TimeType.get()), - required(9, "tsTz", Types.TimestampType.withZone()), - required(10, "ts", Types.TimestampType.withoutZone()), - required(11, "decimal", Types.DecimalType.of(38, 2)), - required(12, "float2", Types.FloatType.get()), - required(13, "double2", Types.DoubleType.get()) - ); - - Expression expr = and( - and( - and(lessThan("int", 1), lessThanOrEqual("long", 100)), - and(greaterThan("float", 5.0), greaterThanOrEqual("double", 500.0)) - ), + Schema schema = + new Schema( + required(1, "int", Types.IntegerType.get()), + required(2, "long", Types.LongType.get()), + required(3, "float", Types.FloatType.get()), + required(4, "double", Types.DoubleType.get()), + required(5, "boolean", Types.BooleanType.get()), + required(6, "string", Types.StringType.get()), + required(7, "date", Types.DateType.get()), + required(8, "time", 
Types.TimeType.get()), + required(9, "tsTz", Types.TimestampType.withZone()), + required(10, "ts", Types.TimestampType.withoutZone()), + required(11, "decimal", Types.DecimalType.of(38, 2)), + required(12, "float2", Types.FloatType.get()), + required(13, "double2", Types.DoubleType.get())); + + Expression expr = and( - and(equal("boolean", true), notEqual("string", "test")), - and(in("decimal", BigDecimal.valueOf(-12345, 2), BigDecimal.valueOf(12345, 2)), notIn("time", 100L, 200L)) - ), - and(isNaN("float2"), notNaN("double2")) - ); + and( + and(lessThan("int", 1), lessThanOrEqual("long", 100)), + and(greaterThan("float", 5.0), greaterThanOrEqual("double", 500.0))), + and( + and(equal("boolean", true), notEqual("string", "test")), + and( + in("decimal", BigDecimal.valueOf(-12345, 2), BigDecimal.valueOf(12345, 2)), + notIn("time", 100L, 200L))), + and(isNaN("float2"), notNaN("double2"))); Expression boundFilter = Binder.bind(schema.asStruct(), expr, true); - SearchArgument expected = SearchArgumentFactory.newBuilder() - .startAnd() - .lessThan("`int`", Type.LONG, 1L) - .lessThanEquals("`long`", Type.LONG, 100L) - .startNot().lessThanEquals("`float`", Type.FLOAT, 5.0).end() - .startNot().lessThan("`double`", Type.FLOAT, 500.0).end() - .equals("`boolean`", Type.BOOLEAN, true) - .startOr().isNull("`string`", Type.STRING).startNot().equals("`string`", Type.STRING, "test").end().end() - .in("`decimal`", Type.DECIMAL, new HiveDecimalWritable("-123.45"), new HiveDecimalWritable("123.45")) - .startOr().isNull("`time`", Type.LONG).startNot().in("`time`", Type.LONG, 100L, 200L).end().end() - .equals("`float2`", Type.FLOAT, Double.NaN) - .startOr().isNull("`double2`", Type.FLOAT).startNot().equals("`double2`", Type.FLOAT, Double.NaN).end().end() - .end() - .build(); - - SearchArgument actual = ExpressionToSearchArgument.convert(boundFilter, ORCSchemaUtil.convert(schema)); + SearchArgument expected = + SearchArgumentFactory.newBuilder() + .startAnd() + .lessThan("`int`", Type.LONG, 1L) + .lessThanEquals("`long`", Type.LONG, 100L) + .startNot() + .lessThanEquals("`float`", Type.FLOAT, 5.0) + .end() + .startNot() + .lessThan("`double`", Type.FLOAT, 500.0) + .end() + .equals("`boolean`", Type.BOOLEAN, true) + .startOr() + .isNull("`string`", Type.STRING) + .startNot() + .equals("`string`", Type.STRING, "test") + .end() + .end() + .in( + "`decimal`", + Type.DECIMAL, + new HiveDecimalWritable("-123.45"), + new HiveDecimalWritable("123.45")) + .startOr() + .isNull("`time`", Type.LONG) + .startNot() + .in("`time`", Type.LONG, 100L, 200L) + .end() + .end() + .equals("`float2`", Type.FLOAT, Double.NaN) + .startOr() + .isNull("`double2`", Type.FLOAT) + .startNot() + .equals("`double2`", Type.FLOAT, Double.NaN) + .end() + .end() + .end() + .build(); + + SearchArgument actual = + ExpressionToSearchArgument.convert(boundFilter, ORCSchemaUtil.convert(schema)); Assert.assertEquals(expected.toString(), actual.toString()); } @@ -117,32 +141,39 @@ public void testPrimitiveTypes() { public void testTimezoneSensitiveTypes() { TimeZone currentTz = TimeZone.getDefault(); try { - for (String timezone : new String[]{"America/New_York", "Asia/Kolkata", "UTC/Greenwich"}) { + for (String timezone : new String[] {"America/New_York", "Asia/Kolkata", "UTC/Greenwich"}) { TimeZone.setDefault(TimeZone.getTimeZone(timezone)); OffsetDateTime tsTzPredicate = OffsetDateTime.parse("2019-10-02T00:47:28.207366Z"); OffsetDateTime tsPredicate = OffsetDateTime.parse("1968-01-16T13:07:59.048625Z"); OffsetDateTime epoch = 
Instant.ofEpochSecond(0).atOffset(ZoneOffset.UTC); - Schema schema = new Schema( - required(1, "date", Types.DateType.get()), - required(2, "tsTz", Types.TimestampType.withZone()), - required(3, "ts", Types.TimestampType.withoutZone()) - ); + Schema schema = + new Schema( + required(1, "date", Types.DateType.get()), + required(2, "tsTz", Types.TimestampType.withZone()), + required(3, "ts", Types.TimestampType.withoutZone())); - Expression expr = and( - and(equal("date", 10L), equal("tsTz", ChronoUnit.MICROS.between(epoch, tsTzPredicate))), - equal("ts", ChronoUnit.MICROS.between(epoch, tsPredicate)) - ); + Expression expr = + and( + and( + equal("date", 10L), + equal("tsTz", ChronoUnit.MICROS.between(epoch, tsTzPredicate))), + equal("ts", ChronoUnit.MICROS.between(epoch, tsPredicate))); Expression boundFilter = Binder.bind(schema.asStruct(), expr, true); - SearchArgument expected = SearchArgumentFactory.newBuilder() - .startAnd() - .equals("`date`", Type.DATE, Date.valueOf(LocalDate.parse("1970-01-11", DateTimeFormatter.ISO_LOCAL_DATE))) - .equals("`tsTz`", Type.TIMESTAMP, Timestamp.from(tsTzPredicate.toInstant())) - .equals("`ts`", Type.TIMESTAMP, Timestamp.from(tsPredicate.toInstant())) - .end() - .build(); - - SearchArgument actual = ExpressionToSearchArgument.convert(boundFilter, ORCSchemaUtil.convert(schema)); + SearchArgument expected = + SearchArgumentFactory.newBuilder() + .startAnd() + .equals( + "`date`", + Type.DATE, + Date.valueOf(LocalDate.parse("1970-01-11", DateTimeFormatter.ISO_LOCAL_DATE))) + .equals("`tsTz`", Type.TIMESTAMP, Timestamp.from(tsTzPredicate.toInstant())) + .equals("`ts`", Type.TIMESTAMP, Timestamp.from(tsPredicate.toInstant())) + .end() + .build(); + + SearchArgument actual = + ExpressionToSearchArgument.convert(boundFilter, ORCSchemaUtil.convert(schema)); Assert.assertEquals(expected.toString(), actual.toString()); } } finally { @@ -152,130 +183,147 @@ public void testTimezoneSensitiveTypes() { @Test public void testUnsupportedTypes() { - Schema schema = new Schema( - required(1, "binary", Types.BinaryType.get()), - required(2, "fixed", Types.FixedType.ofLength(5)), - required(3, "uuid", Types.UUIDType.get()), - // use optional fields for performing isNull checks because Iceberg itself resolves them for required fields - optional(4, "struct", Types.StructType.of( - required(5, "long", Types.LongType.get()) - )), - optional(6, "list", Types.ListType.ofRequired(7, Types.LongType.get())), - optional(8, "map", Types.MapType.ofRequired(9, 10, Types.LongType.get(), Types.LongType.get())) - ); + Schema schema = + new Schema( + required(1, "binary", Types.BinaryType.get()), + required(2, "fixed", Types.FixedType.ofLength(5)), + required(3, "uuid", Types.UUIDType.get()), + // use optional fields for performing isNull checks because Iceberg itself resolves them + // for required fields + optional(4, "struct", Types.StructType.of(required(5, "long", Types.LongType.get()))), + optional(6, "list", Types.ListType.ofRequired(7, Types.LongType.get())), + optional( + 8, + "map", + Types.MapType.ofRequired(9, 10, Types.LongType.get(), Types.LongType.get()))); // all operations for these types should resolve to YES_NO_NULL - Expression expr = and( + Expression expr = and( - and(equal("binary", ByteBuffer.allocate(10)), notEqual("fixed", ByteBuffer.allocate(5))), - and(greaterThan("uuid", UUID.fromString("1-2-3-4-5")), isNull("struct")) - ), - and(notNull("list"), isNull("map")) - ); + and( + and( + equal("binary", ByteBuffer.allocate(10)), + notEqual("fixed", 
ByteBuffer.allocate(5))), + and(greaterThan("uuid", UUID.fromString("1-2-3-4-5")), isNull("struct"))), + and(notNull("list"), isNull("map"))); Expression boundFilter = Binder.bind(schema.asStruct(), expr, true); - SearchArgument expected = SearchArgumentFactory.newBuilder() - .literal(TruthValue.YES_NO_NULL) - .build(); + SearchArgument expected = + SearchArgumentFactory.newBuilder().literal(TruthValue.YES_NO_NULL).build(); - SearchArgument actual = ExpressionToSearchArgument.convert(boundFilter, ORCSchemaUtil.convert(schema)); + SearchArgument actual = + ExpressionToSearchArgument.convert(boundFilter, ORCSchemaUtil.convert(schema)); Assert.assertEquals(expected.toString(), actual.toString()); } @Test public void testNestedPrimitives() { - Schema schema = new Schema( - optional(1, "struct", Types.StructType.of( - required(2, "long", Types.LongType.get()), - required(11, "float", Types.FloatType.get()) - )), - optional(3, "list", Types.ListType.ofRequired(4, Types.LongType.get())), - optional(5, "map", Types.MapType.ofRequired(6, 7, Types.LongType.get(), Types.DoubleType.get())), - optional(8, "listOfStruct", Types.ListType.ofRequired(9, Types.StructType.of( - required(10, "long", Types.LongType.get())))) - ); - - Expression expr = and( - and(equal("struct.long", 1), equal("list.element", 2)), - and(equal("map.key", 3), equal("listOfStruct.long", 4)), - and(isNaN("map.value"), notNaN("struct.float")) - ); + Schema schema = + new Schema( + optional( + 1, + "struct", + Types.StructType.of( + required(2, "long", Types.LongType.get()), + required(11, "float", Types.FloatType.get()))), + optional(3, "list", Types.ListType.ofRequired(4, Types.LongType.get())), + optional( + 5, + "map", + Types.MapType.ofRequired(6, 7, Types.LongType.get(), Types.DoubleType.get())), + optional( + 8, + "listOfStruct", + Types.ListType.ofRequired( + 9, Types.StructType.of(required(10, "long", Types.LongType.get()))))); + + Expression expr = + and( + and(equal("struct.long", 1), equal("list.element", 2)), + and(equal("map.key", 3), equal("listOfStruct.long", 4)), + and(isNaN("map.value"), notNaN("struct.float"))); Expression boundFilter = Binder.bind(schema.asStruct(), expr, true); - SearchArgument expected = SearchArgumentFactory.newBuilder() - .startAnd() - .equals("`struct`.`long`", Type.LONG, 1L) - .equals("`list`.`_elem`", Type.LONG, 2L) - .equals("`map`.`_key`", Type.LONG, 3L) - .equals("`listOfStruct`.`_elem`.`long`", Type.LONG, 4L) - .equals("`map`.`_value`", Type.FLOAT, Double.NaN) - .startOr() - .isNull("`struct`.`float`", Type.FLOAT) - .startNot().equals("`struct`.`float`", Type.FLOAT, Double.NaN) - .end() // not - .end() // or - .end() // and - .build(); - - SearchArgument actual = ExpressionToSearchArgument.convert(boundFilter, ORCSchemaUtil.convert(schema)); + SearchArgument expected = + SearchArgumentFactory.newBuilder() + .startAnd() + .equals("`struct`.`long`", Type.LONG, 1L) + .equals("`list`.`_elem`", Type.LONG, 2L) + .equals("`map`.`_key`", Type.LONG, 3L) + .equals("`listOfStruct`.`_elem`.`long`", Type.LONG, 4L) + .equals("`map`.`_value`", Type.FLOAT, Double.NaN) + .startOr() + .isNull("`struct`.`float`", Type.FLOAT) + .startNot() + .equals("`struct`.`float`", Type.FLOAT, Double.NaN) + .end() // not + .end() // or + .end() // and + .build(); + + SearchArgument actual = + ExpressionToSearchArgument.convert(boundFilter, ORCSchemaUtil.convert(schema)); Assert.assertEquals(expected.toString(), actual.toString()); } @Test public void testSpecialCharacters() { - Schema schema = new Schema( - required(1, 
"col.with.dots", Types.StructType.of( - required(2, "inner.col.with.dots", Types.LongType.get()) - )), - required(3, "colW!th$peci@lCh@rs", Types.LongType.get()), - required(4, "colWith`Quotes`", Types.LongType.get()) - ); - - Expression expr = and( - equal("col.with.dots.inner.col.with.dots", 1), - and(equal("colW!th$peci@lCh@rs", 2), equal("colWith`Quotes`", 3)) - ); + Schema schema = + new Schema( + required( + 1, + "col.with.dots", + Types.StructType.of(required(2, "inner.col.with.dots", Types.LongType.get()))), + required(3, "colW!th$peci@lCh@rs", Types.LongType.get()), + required(4, "colWith`Quotes`", Types.LongType.get())); + + Expression expr = + and( + equal("col.with.dots.inner.col.with.dots", 1), + and(equal("colW!th$peci@lCh@rs", 2), equal("colWith`Quotes`", 3))); Expression boundFilter = Binder.bind(schema.asStruct(), expr, true); - SearchArgument expected = SearchArgumentFactory.newBuilder() - .startAnd() - .equals("`col.with.dots`.`inner.col.with.dots`", Type.LONG, 1L) - .equals("`colW!th$peci@lCh@rs`", Type.LONG, 2L) - .equals("`colWith``Quotes```", Type.LONG, 3L) - .end() - .build(); - - SearchArgument actual = ExpressionToSearchArgument.convert(boundFilter, ORCSchemaUtil.convert(schema)); + SearchArgument expected = + SearchArgumentFactory.newBuilder() + .startAnd() + .equals("`col.with.dots`.`inner.col.with.dots`", Type.LONG, 1L) + .equals("`colW!th$peci@lCh@rs`", Type.LONG, 2L) + .equals("`colWith``Quotes```", Type.LONG, 3L) + .end() + .build(); + + SearchArgument actual = + ExpressionToSearchArgument.convert(boundFilter, ORCSchemaUtil.convert(schema)); Assert.assertEquals(expected.toString(), actual.toString()); } @Test public void testEvolvedSchema() { - Schema fileSchema = new Schema( - required(1, "int", Types.IntegerType.get()), - optional(2, "long_to_be_dropped", Types.LongType.get()) - ); + Schema fileSchema = + new Schema( + required(1, "int", Types.IntegerType.get()), + optional(2, "long_to_be_dropped", Types.LongType.get())); - Schema evolvedSchema = new Schema( - required(1, "int_renamed", Types.IntegerType.get()), - optional(3, "float_added", Types.FloatType.get()) - ); + Schema evolvedSchema = + new Schema( + required(1, "int_renamed", Types.IntegerType.get()), + optional(3, "float_added", Types.FloatType.get())); - TypeDescription readSchema = ORCSchemaUtil.buildOrcProjection(evolvedSchema, ORCSchemaUtil.convert(fileSchema)); + TypeDescription readSchema = + ORCSchemaUtil.buildOrcProjection(evolvedSchema, ORCSchemaUtil.convert(fileSchema)); Expression expr = equal("int_renamed", 1); Expression boundFilter = Binder.bind(evolvedSchema.asStruct(), expr, true); - SearchArgument expected = SearchArgumentFactory.newBuilder() - .equals("`int`", Type.LONG, 1L) - .build(); + SearchArgument expected = + SearchArgumentFactory.newBuilder().equals("`int`", Type.LONG, 1L).build(); SearchArgument actual = ExpressionToSearchArgument.convert(boundFilter, readSchema); Assert.assertEquals(expected.toString(), actual.toString()); // for columns not in the file, buildOrcProjection will append field names with _r - // this will be passed down to ORC, but ORC will handle such cases and return a TruthValue during evaluation + // this will be passed down to ORC, but ORC will handle such cases and return a TruthValue + // during evaluation expr = equal("float_added", 1); boundFilter = Binder.bind(evolvedSchema.asStruct(), expr, true); - expected = SearchArgumentFactory.newBuilder() - .equals("`float_added_r3`", Type.FLOAT, 1.0) - .build(); + expected = + 
SearchArgumentFactory.newBuilder().equals("`float_added_r3`", Type.FLOAT, 1.0).build(); actual = ExpressionToSearchArgument.convert(boundFilter, readSchema); Assert.assertEquals(expected.toString(), actual.toString()); @@ -283,23 +331,25 @@ public void testEvolvedSchema() { @Test public void testOriginalSchemaNameMapping() { - Schema originalSchema = new Schema( - required(1, "int", Types.IntegerType.get()), - optional(2, "long", Types.LongType.get()) - ); + Schema originalSchema = + new Schema( + required(1, "int", Types.IntegerType.get()), optional(2, "long", Types.LongType.get())); - TypeDescription orcSchemaWithoutIds = ORCSchemaUtil.removeIds(ORCSchemaUtil.convert(originalSchema)); + TypeDescription orcSchemaWithoutIds = + ORCSchemaUtil.removeIds(ORCSchemaUtil.convert(originalSchema)); NameMapping nameMapping = MappingUtil.create(originalSchema); - TypeDescription readSchema = ORCSchemaUtil.buildOrcProjection(originalSchema, - ORCSchemaUtil.applyNameMapping(orcSchemaWithoutIds, nameMapping)); + TypeDescription readSchema = + ORCSchemaUtil.buildOrcProjection( + originalSchema, ORCSchemaUtil.applyNameMapping(orcSchemaWithoutIds, nameMapping)); Expression expr = and(equal("int", 1), equal("long", 1)); Expression boundFilter = Binder.bind(originalSchema.asStruct(), expr, true); - SearchArgument expected = SearchArgumentFactory.newBuilder() - .equals("`int`", Type.LONG, 1L) - .equals("`long`", Type.LONG, 1L) - .build(); + SearchArgument expected = + SearchArgumentFactory.newBuilder() + .equals("`int`", Type.LONG, 1L) + .equals("`long`", Type.LONG, 1L) + .build(); SearchArgument actual = ExpressionToSearchArgument.convert(boundFilter, readSchema); Assert.assertEquals(expected.toString(), actual.toString()); @@ -307,36 +357,37 @@ public void testOriginalSchemaNameMapping() { @Test public void testModifiedSimpleSchemaNameMapping() { - Schema originalSchema = new Schema( - required(1, "int", Types.IntegerType.get()), - optional(2, "long_to_be_dropped", Types.LongType.get()) - ); - Schema mappingSchema = new Schema( - required(1, "int", Types.IntegerType.get()), - optional(3, "new_float_field", Types.FloatType.get()) - ); - TypeDescription orcSchemaWithoutIds = ORCSchemaUtil.removeIds(ORCSchemaUtil.convert(originalSchema)); + Schema originalSchema = + new Schema( + required(1, "int", Types.IntegerType.get()), + optional(2, "long_to_be_dropped", Types.LongType.get())); + Schema mappingSchema = + new Schema( + required(1, "int", Types.IntegerType.get()), + optional(3, "new_float_field", Types.FloatType.get())); + TypeDescription orcSchemaWithoutIds = + ORCSchemaUtil.removeIds(ORCSchemaUtil.convert(originalSchema)); NameMapping nameMapping = MappingUtil.create(mappingSchema); - TypeDescription readSchema = ORCSchemaUtil.buildOrcProjection(mappingSchema, - ORCSchemaUtil.applyNameMapping(orcSchemaWithoutIds, nameMapping)); + TypeDescription readSchema = + ORCSchemaUtil.buildOrcProjection( + mappingSchema, ORCSchemaUtil.applyNameMapping(orcSchemaWithoutIds, nameMapping)); Expression expr = equal("int", 1); Expression boundFilter = Binder.bind(mappingSchema.asStruct(), expr, true); - SearchArgument expected = SearchArgumentFactory.newBuilder() - .equals("`int`", Type.LONG, 1L) - .build(); + SearchArgument expected = + SearchArgumentFactory.newBuilder().equals("`int`", Type.LONG, 1L).build(); SearchArgument actual = ExpressionToSearchArgument.convert(boundFilter, readSchema); Assert.assertEquals(expected.toString(), actual.toString()); // for columns not in the file, buildOrcProjection will append 
field names with _r - // this will be passed down to ORC, but ORC will handle such cases and return a TruthValue during evaluation + // this will be passed down to ORC, but ORC will handle such cases and return a TruthValue + // during evaluation expr = equal("new_float_field", 1); boundFilter = Binder.bind(mappingSchema.asStruct(), expr, true); - expected = SearchArgumentFactory.newBuilder() - .equals("`new_float_field_r3`", Type.FLOAT, 1.0) - .build(); + expected = + SearchArgumentFactory.newBuilder().equals("`new_float_field_r3`", Type.FLOAT, 1.0).build(); actual = ExpressionToSearchArgument.convert(boundFilter, readSchema); Assert.assertEquals(expected.toString(), actual.toString()); @@ -344,67 +395,83 @@ public void testModifiedSimpleSchemaNameMapping() { @Test public void testModifiedComplexSchemaNameMapping() { - Schema originalSchema = new Schema( - optional(1, "struct", Types.StructType.of( - required(2, "long", Types.LongType.get()) - )), - optional(3, "list", Types.ListType.ofRequired(4, Types.LongType.get())), - optional(5, "map", Types.MapType.ofRequired(6, 7, Types.LongType.get(), Types.LongType.get())), - optional(8, "listOfStruct", Types.ListType.ofRequired(9, Types.StructType.of( - required(10, "long", Types.LongType.get())))), - optional(11, "listOfPeople", Types.ListType.ofRequired(12, Types.StructType.of( - required(13, "name", Types.StringType.get()), - required(14, "birth_date", Types.DateType.get())))) - ); - Schema mappingSchema = new Schema( - optional(1, "struct", Types.StructType.of( - required(2, "int", Types.LongType.get()) - )), - optional(3, "list", Types.ListType.ofRequired(4, Types.LongType.get())), - optional(5, "newMap", Types.MapType.ofRequired(6, 7, Types.StringType.get(), Types.LongType.get())), - optional(8, "listOfStruct", Types.ListType.ofRequired(9, Types.StructType.of( - required(10, "newLong", Types.LongType.get())))), - optional(11, "listOfPeople", Types.ListType.ofRequired(12, Types.StructType.of( - required(13, "name", Types.StringType.get()), - required(14, "age", Types.IntegerType.get())))) - ); - TypeDescription orcSchemaWithoutIds = ORCSchemaUtil.removeIds(ORCSchemaUtil.convert(originalSchema)); + Schema originalSchema = + new Schema( + optional(1, "struct", Types.StructType.of(required(2, "long", Types.LongType.get()))), + optional(3, "list", Types.ListType.ofRequired(4, Types.LongType.get())), + optional( + 5, + "map", + Types.MapType.ofRequired(6, 7, Types.LongType.get(), Types.LongType.get())), + optional( + 8, + "listOfStruct", + Types.ListType.ofRequired( + 9, Types.StructType.of(required(10, "long", Types.LongType.get())))), + optional( + 11, + "listOfPeople", + Types.ListType.ofRequired( + 12, + Types.StructType.of( + required(13, "name", Types.StringType.get()), + required(14, "birth_date", Types.DateType.get()))))); + Schema mappingSchema = + new Schema( + optional(1, "struct", Types.StructType.of(required(2, "int", Types.LongType.get()))), + optional(3, "list", Types.ListType.ofRequired(4, Types.LongType.get())), + optional( + 5, + "newMap", + Types.MapType.ofRequired(6, 7, Types.StringType.get(), Types.LongType.get())), + optional( + 8, + "listOfStruct", + Types.ListType.ofRequired( + 9, Types.StructType.of(required(10, "newLong", Types.LongType.get())))), + optional( + 11, + "listOfPeople", + Types.ListType.ofRequired( + 12, + Types.StructType.of( + required(13, "name", Types.StringType.get()), + required(14, "age", Types.IntegerType.get()))))); + TypeDescription orcSchemaWithoutIds = + 
ORCSchemaUtil.removeIds(ORCSchemaUtil.convert(originalSchema)); NameMapping nameMapping = MappingUtil.create(mappingSchema); - TypeDescription readSchema = ORCSchemaUtil.buildOrcProjection(mappingSchema, - ORCSchemaUtil.applyNameMapping(orcSchemaWithoutIds, nameMapping)); + TypeDescription readSchema = + ORCSchemaUtil.buildOrcProjection( + mappingSchema, ORCSchemaUtil.applyNameMapping(orcSchemaWithoutIds, nameMapping)); - Expression expr = and( + Expression expr = and( - equal("struct.int", 1), and( - lessThanOrEqual("list.element", 5), - equal("newMap.key", "country") - ), and( - equal("listOfStruct.newLong", 100L), - notEqual("listOfPeople.name", "Bob") - ) - - ), - lessThan("listOfPeople.age", 30) - ); + equal("struct.int", 1), + and(lessThanOrEqual("list.element", 5), equal("newMap.key", "country")), + and(equal("listOfStruct.newLong", 100L), notEqual("listOfPeople.name", "Bob"))), + lessThan("listOfPeople.age", 30)); Expression boundFilter = Binder.bind(mappingSchema.asStruct(), expr, true); - SearchArgument expected = SearchArgumentFactory.newBuilder() - .startAnd() - // Drops struct.long - .equals("`struct`.`int_r2`", Type.LONG, 1L) - .lessThanEquals("`list`.`_elem`", Type.LONG, 5L) - // Drops map - .equals("`newMap_r5`.`_key`", Type.STRING, "country") - // Drops listOfStruct.long - .equals("`listOfStruct`.`_elem`.`newLong_r10`", Type.LONG, 100L) - .startOr() - .isNull("`listOfPeople`.`_elem`.`name`", Type.STRING) - .startNot().equals("`listOfPeople`.`_elem`.`name`", Type.STRING, "Bob").end() - .end() - .lessThan("`listOfPeople`.`_elem`.`age_r14`", Type.LONG, 30L) - .end() - .build(); + SearchArgument expected = + SearchArgumentFactory.newBuilder() + .startAnd() + // Drops struct.long + .equals("`struct`.`int_r2`", Type.LONG, 1L) + .lessThanEquals("`list`.`_elem`", Type.LONG, 5L) + // Drops map + .equals("`newMap_r5`.`_key`", Type.STRING, "country") + // Drops listOfStruct.long + .equals("`listOfStruct`.`_elem`.`newLong_r10`", Type.LONG, 100L) + .startOr() + .isNull("`listOfPeople`.`_elem`.`name`", Type.STRING) + .startNot() + .equals("`listOfPeople`.`_elem`.`name`", Type.STRING, "Bob") + .end() + .end() + .lessThan("`listOfPeople`.`_elem`.`age_r14`", Type.LONG, 30L) + .end() + .build(); SearchArgument actual = ExpressionToSearchArgument.convert(boundFilter, readSchema); Assert.assertEquals(expected.toString(), actual.toString()); diff --git a/orc/src/test/java/org/apache/iceberg/orc/TestIdToOrcName.java b/orc/src/test/java/org/apache/iceberg/orc/TestIdToOrcName.java index f4867343a3aa..8d1ada5bd668 100644 --- a/orc/src/test/java/org/apache/iceberg/orc/TestIdToOrcName.java +++ b/orc/src/test/java/org/apache/iceberg/orc/TestIdToOrcName.java @@ -16,45 +16,58 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.orc; +import static org.apache.iceberg.types.Types.NestedField.required; + import java.util.Map; import org.apache.iceberg.Schema; import org.apache.iceberg.types.Types; import org.junit.Assert; import org.junit.Test; -import static org.apache.iceberg.types.Types.NestedField.required; - public class TestIdToOrcName { @Test public void testIdToQuotedColumnName() { - Schema schema = new Schema( - required(1, "long", Types.LongType.get()), - required(2, "struct", Types.StructType.of( - required(3, "long", Types.LongType.get()) - )), - required(4, "listOfLongs", Types.ListType.ofRequired(5, Types.LongType.get())), - required(6, "listOfStructs", Types.ListType.ofRequired(7, Types.StructType.of( - required(8, "long", Types.LongType.get()) - ))), - required(9, "map", Types.MapType.ofRequired(10, 11, Types.LongType.get(), Types.LongType.get())), - required(12, "mapOfStructs", Types.MapType.ofRequired(13, 14, - Types.StructType.of(required(15, "long", Types.LongType.get())), - Types.StructType.of(required(16, "long", Types.LongType.get())) - )), - required(17, "listOfMapsOfStruct", Types.ListType.ofRequired(18, Types.MapType.ofRequired(19, 20, - Types.StructType.of(required(21, "long", Types.LongType.get())), - Types.StructType.of(required(22, "long", Types.LongType.get())) - ))), - required(23, "col.with.dots", Types.StructType.of( - required(24, "inner.col.with.dots", Types.LongType.get()) - )), - required(25, "colW!th$peci@lCh@rs", Types.LongType.get()), - required(26, "colWith`Quotes`", Types.LongType.get()) - ); + Schema schema = + new Schema( + required(1, "long", Types.LongType.get()), + required(2, "struct", Types.StructType.of(required(3, "long", Types.LongType.get()))), + required(4, "listOfLongs", Types.ListType.ofRequired(5, Types.LongType.get())), + required( + 6, + "listOfStructs", + Types.ListType.ofRequired( + 7, Types.StructType.of(required(8, "long", Types.LongType.get())))), + required( + 9, + "map", + Types.MapType.ofRequired(10, 11, Types.LongType.get(), Types.LongType.get())), + required( + 12, + "mapOfStructs", + Types.MapType.ofRequired( + 13, + 14, + Types.StructType.of(required(15, "long", Types.LongType.get())), + Types.StructType.of(required(16, "long", Types.LongType.get())))), + required( + 17, + "listOfMapsOfStruct", + Types.ListType.ofRequired( + 18, + Types.MapType.ofRequired( + 19, + 20, + Types.StructType.of(required(21, "long", Types.LongType.get())), + Types.StructType.of(required(22, "long", Types.LongType.get()))))), + required( + 23, + "col.with.dots", + Types.StructType.of(required(24, "inner.col.with.dots", Types.LongType.get()))), + required(25, "colW!th$peci@lCh@rs", Types.LongType.get()), + required(26, "colWith`Quotes`", Types.LongType.get())); Map actual = ORCSchemaUtil.idToOrcName(schema); Assert.assertEquals("`long`", actual.get(1)); diff --git a/orc/src/test/java/org/apache/iceberg/orc/TestORCSchemaUtil.java b/orc/src/test/java/org/apache/iceberg/orc/TestORCSchemaUtil.java index d80671666f00..d64e2dd610e8 100644 --- a/orc/src/test/java/org/apache/iceberg/orc/TestORCSchemaUtil.java +++ b/orc/src/test/java/org/apache/iceberg/orc/TestORCSchemaUtil.java @@ -16,9 +16,17 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.orc; +import static org.apache.iceberg.AssertHelpers.assertThrows; +import static org.apache.iceberg.orc.ORCSchemaUtil.ICEBERG_ID_ATTRIBUTE; +import static org.apache.iceberg.orc.ORCSchemaUtil.ICEBERG_REQUIRED_ATTRIBUTE; +import static org.apache.iceberg.types.Types.NestedField.optional; +import static org.apache.iceberg.types.Types.NestedField.required; +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertFalse; +import static org.junit.Assert.assertTrue; + import java.util.Collections; import java.util.List; import java.util.Optional; @@ -31,36 +39,28 @@ import org.apache.orc.TypeDescription; import org.junit.Test; -import static org.apache.iceberg.AssertHelpers.assertThrows; -import static org.apache.iceberg.orc.ORCSchemaUtil.ICEBERG_ID_ATTRIBUTE; -import static org.apache.iceberg.orc.ORCSchemaUtil.ICEBERG_REQUIRED_ATTRIBUTE; -import static org.apache.iceberg.types.Types.NestedField.optional; -import static org.apache.iceberg.types.Types.NestedField.required; -import static org.junit.Assert.assertEquals; -import static org.junit.Assert.assertFalse; -import static org.junit.Assert.assertTrue; - public class TestORCSchemaUtil { - private static final Types.StructType SUPPORTED_PRIMITIVES = Types.StructType.of( - optional(1, "intCol", Types.IntegerType.get()), - optional(3, "longCol", Types.LongType.get()), - optional(6, "intCol2", Types.IntegerType.get()), - optional(20, "intCol3", Types.IntegerType.get()), - required(9, "doubleCol", Types.DoubleType.get()), - required(10, "uuidCol", Types.UUIDType.get()), - optional(2, "booleanCol", Types.BooleanType.get()), - optional(21, "fixedCol", Types.FixedType.ofLength(4096)), - required(22, "binaryCol", Types.BinaryType.get()), - required(23, "stringCol", Types.StringType.get()), - required(25, "floatCol", Types.FloatType.get()), - optional(30, "dateCol", Types.DateType.get()), - required(32, "timeCol", Types.TimeType.get()), - required(34, "timestampCol", Types.TimestampType.withZone()), - required(114, "dec_9_0", Types.DecimalType.of(9, 0)), - required(115, "dec_11_2", Types.DecimalType.of(11, 2)), - required(116, "dec_38_10", Types.DecimalType.of(38, 10)) // spark's maximum precision - ); + private static final Types.StructType SUPPORTED_PRIMITIVES = + Types.StructType.of( + optional(1, "intCol", Types.IntegerType.get()), + optional(3, "longCol", Types.LongType.get()), + optional(6, "intCol2", Types.IntegerType.get()), + optional(20, "intCol3", Types.IntegerType.get()), + required(9, "doubleCol", Types.DoubleType.get()), + required(10, "uuidCol", Types.UUIDType.get()), + optional(2, "booleanCol", Types.BooleanType.get()), + optional(21, "fixedCol", Types.FixedType.ofLength(4096)), + required(22, "binaryCol", Types.BinaryType.get()), + required(23, "stringCol", Types.StringType.get()), + required(25, "floatCol", Types.FloatType.get()), + optional(30, "dateCol", Types.DateType.get()), + required(32, "timeCol", Types.TimeType.get()), + required(34, "timestampCol", Types.TimestampType.withZone()), + required(114, "dec_9_0", Types.DecimalType.of(9, 0)), + required(115, "dec_11_2", Types.DecimalType.of(11, 2)), + required(116, "dec_38_10", Types.DecimalType.of(38, 10)) // spark's maximum precision + ); @Test public void testRoundtripConversionPrimitive() { @@ -70,129 +70,147 @@ public void testRoundtripConversionPrimitive() { @Test public void testRoundtripConversionNested() { - Types.StructType leafStructType = Types.StructType.of( - optional(6, "leafLongCol", Types.LongType.get()), - 
optional(7, "leafBinaryCol", Types.BinaryType.get()) - ); - Types.StructType nestedStructType = Types.StructType.of( - optional(4, "longCol", Types.LongType.get()), - optional(5, "leafStructCol", leafStructType) - ); - Types.StructType structPrimTypeForList = Types.StructType.of( - optional(506, "leafLongCol", Types.LongType.get()), - optional(507, "leafBinaryCol", Types.BinaryType.get()) - ); - Types.StructType leafStructTypeForList = Types.StructType.of( - optional(516, "leafLongCol", Types.LongType.get()), - optional(517, "leafBinaryCol", Types.BinaryType.get()) - ); - Types.StructType nestedStructTypeForList = Types.StructType.of( - optional(504, "longCol", Types.LongType.get()), - optional(505, "leafStructCol", leafStructTypeForList) - ); - Types.StructType structPrimTypeForMap = Types.StructType.of( - optional(606, "leafLongCol", Types.LongType.get()), - optional(607, "leafBinaryCol", Types.BinaryType.get()) - ); - Types.StructType leafStructTypeForMap = Types.StructType.of( - optional(616, "leafLongCol", Types.LongType.get()), - optional(617, "leafBinaryCol", Types.BinaryType.get()) - ); - Types.StructType nestedStructTypeForMap = Types.StructType.of( - optional(604, "longCol", Types.LongType.get()), - optional(605, "leafStructCol", leafStructTypeForMap) - ); - Types.StructType leafStructTypeForStruct = Types.StructType.of( - optional(716, "leafLongCol", Types.LongType.get()), - optional(717, "leafBinaryCol", Types.BinaryType.get()) - ); - Types.StructType nestedStructTypeForStruct = Types.StructType.of( - optional(704, "longCol", Types.LongType.get()), - optional(705, "leafStructCol", leafStructTypeForStruct) - ); + Types.StructType leafStructType = + Types.StructType.of( + optional(6, "leafLongCol", Types.LongType.get()), + optional(7, "leafBinaryCol", Types.BinaryType.get())); + Types.StructType nestedStructType = + Types.StructType.of( + optional(4, "longCol", Types.LongType.get()), + optional(5, "leafStructCol", leafStructType)); + Types.StructType structPrimTypeForList = + Types.StructType.of( + optional(506, "leafLongCol", Types.LongType.get()), + optional(507, "leafBinaryCol", Types.BinaryType.get())); + Types.StructType leafStructTypeForList = + Types.StructType.of( + optional(516, "leafLongCol", Types.LongType.get()), + optional(517, "leafBinaryCol", Types.BinaryType.get())); + Types.StructType nestedStructTypeForList = + Types.StructType.of( + optional(504, "longCol", Types.LongType.get()), + optional(505, "leafStructCol", leafStructTypeForList)); + Types.StructType structPrimTypeForMap = + Types.StructType.of( + optional(606, "leafLongCol", Types.LongType.get()), + optional(607, "leafBinaryCol", Types.BinaryType.get())); + Types.StructType leafStructTypeForMap = + Types.StructType.of( + optional(616, "leafLongCol", Types.LongType.get()), + optional(617, "leafBinaryCol", Types.BinaryType.get())); + Types.StructType nestedStructTypeForMap = + Types.StructType.of( + optional(604, "longCol", Types.LongType.get()), + optional(605, "leafStructCol", leafStructTypeForMap)); + Types.StructType leafStructTypeForStruct = + Types.StructType.of( + optional(716, "leafLongCol", Types.LongType.get()), + optional(717, "leafBinaryCol", Types.BinaryType.get())); + Types.StructType nestedStructTypeForStruct = + Types.StructType.of( + optional(704, "longCol", Types.LongType.get()), + optional(705, "leafStructCol", leafStructTypeForStruct)); // all fields in expected iceberg schema will be optional since we don't have a column mapping - Schema expectedSchema = new Schema( - optional(1, 
"intCol", Types.IntegerType.get()), - optional(2, "longCol", Types.LongType.get()), - optional(3, "nestedStructCol", nestedStructType), - optional(8, "intCol3", Types.IntegerType.get()), - optional(9, "doubleCol", Types.DoubleType.get()), - required(10, "uuidCol", Types.UUIDType.get()), - optional(20, "booleanCol", Types.BooleanType.get()), - optional(21, "fixedCol", Types.FixedType.ofLength(4096)), - required(22, "binaryCol", Types.BinaryType.get()), - required(23, "stringCol", Types.StringType.get()), - required(24, "decimalCol", Types.DecimalType.of(15, 3)), - required(25, "floatCol", Types.FloatType.get()), - optional(30, "dateCol", Types.DateType.get()), - required(32, "timeCol", Types.TimeType.get()), - required(34, "timestampCol", Types.TimestampType.withZone()), - required(35, "listPrimCol", - Types.ListType.ofRequired(135, Types.LongType.get())), - required(36, "listPrimNestCol", - Types.ListType.ofRequired(136, structPrimTypeForList)), - required(37, "listNestedCol", - Types.ListType.ofRequired(137, nestedStructTypeForList)), - optional(38, "mapPrimCol", - Types.MapType.ofRequired(138, 238, Types.StringType.get(), Types.FixedType.ofLength(4096))), - required(39, "mapPrimNestCol", - Types.MapType.ofRequired(139, 239, Types.StringType.get(), structPrimTypeForMap)), - required(40, "mapNestedCol", - Types.MapType.ofRequired(140, 240, Types.StringType.get(), nestedStructTypeForMap)), - required(41, "structListNestCol", - Types.ListType.ofRequired(241, - Types.StructType.of( - optional(816, "leafLongCol", Types.LongType.get()), - optional(817, "leafBinaryCol", Types.BinaryType.get()) - )) - ), - required(42, "structMapNestCol", - Types.MapType.ofRequired(242, 342, Types.StringType.get(), + Schema expectedSchema = + new Schema( + optional(1, "intCol", Types.IntegerType.get()), + optional(2, "longCol", Types.LongType.get()), + optional(3, "nestedStructCol", nestedStructType), + optional(8, "intCol3", Types.IntegerType.get()), + optional(9, "doubleCol", Types.DoubleType.get()), + required(10, "uuidCol", Types.UUIDType.get()), + optional(20, "booleanCol", Types.BooleanType.get()), + optional(21, "fixedCol", Types.FixedType.ofLength(4096)), + required(22, "binaryCol", Types.BinaryType.get()), + required(23, "stringCol", Types.StringType.get()), + required(24, "decimalCol", Types.DecimalType.of(15, 3)), + required(25, "floatCol", Types.FloatType.get()), + optional(30, "dateCol", Types.DateType.get()), + required(32, "timeCol", Types.TimeType.get()), + required(34, "timestampCol", Types.TimestampType.withZone()), + required(35, "listPrimCol", Types.ListType.ofRequired(135, Types.LongType.get())), + required(36, "listPrimNestCol", Types.ListType.ofRequired(136, structPrimTypeForList)), + required(37, "listNestedCol", Types.ListType.ofRequired(137, nestedStructTypeForList)), + optional( + 38, + "mapPrimCol", + Types.MapType.ofRequired( + 138, 238, Types.StringType.get(), Types.FixedType.ofLength(4096))), + required( + 39, + "mapPrimNestCol", + Types.MapType.ofRequired(139, 239, Types.StringType.get(), structPrimTypeForMap)), + required( + 40, + "mapNestedCol", + Types.MapType.ofRequired(140, 240, Types.StringType.get(), nestedStructTypeForMap)), + required( + 41, + "structListNestCol", + Types.ListType.ofRequired( + 241, + Types.StructType.of( + optional(816, "leafLongCol", Types.LongType.get()), + optional(817, "leafBinaryCol", Types.BinaryType.get())))), + required( + 42, + "structMapNestCol", + Types.MapType.ofRequired( + 242, + 342, + Types.StringType.get(), + Types.StructType.of( + 
optional(916, "leafLongCol", Types.LongType.get()), + optional(917, "leafBinaryCol", Types.BinaryType.get())))), + required( + 43, + "structStructNestCol", Types.StructType.of( - optional(916, "leafLongCol", Types.LongType.get()), - optional(917, "leafBinaryCol", Types.BinaryType.get()) - ) - )), - required(43, "structStructNestCol", - Types.StructType.of(required(243, "innerStructNest", - Types.StructType.of( - optional(1016, "leafLongCol", Types.LongType.get()), - optional(1017, "leafBinaryCol", Types.BinaryType.get()) - )) - )), - required(44, "structStructComplexNestCol", - Types.StructType.of(required(244, "innerStructNest", + required( + 243, + "innerStructNest", + Types.StructType.of( + optional(1016, "leafLongCol", Types.LongType.get()), + optional(1017, "leafBinaryCol", Types.BinaryType.get()))))), + required( + 44, + "structStructComplexNestCol", Types.StructType.of( - optional(1116, "leafLongCol", Types.LongType.get()), - optional(1117, "leftMapOfListStructCol", - Types.MapType.ofRequired(1150, 1151, - Types.StringType.get(), - Types.ListType.ofRequired(1250, nestedStructTypeForStruct)) - ) - )) - )) - ); + required( + 244, + "innerStructNest", + Types.StructType.of( + optional(1116, "leafLongCol", Types.LongType.get()), + optional( + 1117, + "leftMapOfListStructCol", + Types.MapType.ofRequired( + 1150, + 1151, + Types.StringType.get(), + Types.ListType.ofRequired( + 1250, nestedStructTypeForStruct)))))))); TypeDescription orcSchema = ORCSchemaUtil.convert(expectedSchema); assertEquals(expectedSchema.asStruct(), ORCSchemaUtil.convert(orcSchema).asStruct()); } @Test public void testTypePromotions() { - Schema originalSchema = new Schema( - optional(1, "a", Types.IntegerType.get()), - optional(2, "b", Types.FloatType.get()), - optional(3, "c", Types.DecimalType.of(10, 2)) - ); + Schema originalSchema = + new Schema( + optional(1, "a", Types.IntegerType.get()), + optional(2, "b", Types.FloatType.get()), + optional(3, "c", Types.DecimalType.of(10, 2))); // Original mapping (stored in ORC) TypeDescription orcSchema = ORCSchemaUtil.convert(originalSchema); // Evolve schema - Schema evolveSchema = new Schema( - optional(1, "a", Types.LongType.get()), - optional(2, "b", Types.DoubleType.get()), - optional(3, "c", Types.DecimalType.of(15, 2)) - ); + Schema evolveSchema = + new Schema( + optional(1, "a", Types.LongType.get()), + optional(2, "b", Types.DoubleType.get()), + optional(3, "c", Types.DecimalType.of(15, 2))); TypeDescription newOrcSchema = ORCSchemaUtil.buildOrcProjection(evolveSchema, orcSchema); assertEquals(3, newOrcSchema.getChildren().size()); @@ -209,17 +227,16 @@ public void testTypePromotions() { @Test public void testInvalidTypePromotions() { - Schema originalSchema = new Schema( - optional(1, "a", Types.LongType.get()) - ); + Schema originalSchema = new Schema(optional(1, "a", Types.LongType.get())); TypeDescription orcSchema = ORCSchemaUtil.convert(originalSchema); - Schema evolveSchema = new Schema( - optional(1, "a", Types.IntegerType.get()) - ); + Schema evolveSchema = new Schema(optional(1, "a", Types.IntegerType.get())); - assertThrows("Should not allow invalid type promotion", - IllegalArgumentException.class, "Can not promote", () -> { + assertThrows( + "Should not allow invalid type promotion", + IllegalArgumentException.class, + "Can not promote", + () -> { ORCSchemaUtil.buildOrcProjection(evolveSchema, orcSchema); }); } @@ -230,8 +247,10 @@ public void testSkipNonIcebergColumns() { TypeDescription intCol = TypeDescription.createInt(); 
intCol.setAttribute(ICEBERG_ID_ATTRIBUTE, "1"); intCol.setAttribute(ICEBERG_REQUIRED_ATTRIBUTE, "true"); - TypeDescription listCol = TypeDescription - .createList(TypeDescription.createMap(TypeDescription.createString(), TypeDescription.createDate())); + TypeDescription listCol = + TypeDescription.createList( + TypeDescription.createMap( + TypeDescription.createString(), TypeDescription.createDate())); listCol.setAttribute(ICEBERG_ID_ATTRIBUTE, "2"); schema.addField("intCol", intCol); schema.addField("listCol", listCol); @@ -244,12 +263,14 @@ public void testSkipNonIcebergColumns() { schema.addField("mapCol", mapCol); Schema icebergSchema = ORCSchemaUtil.convert(schema); - Schema expectedSchema = new Schema( - required(1, "intCol", Types.IntegerType.get()), - // Skipped listCol since element has no Iceberg ID - optional(5, "mapCol", Types.MapType.ofOptional(3, 4, - Types.StringType.get(), Types.BooleanType.get())) - ); + Schema expectedSchema = + new Schema( + required(1, "intCol", Types.IntegerType.get()), + // Skipped listCol since element has no Iceberg ID + optional( + 5, + "mapCol", + Types.MapType.ofOptional(3, 4, Types.StringType.get(), Types.BooleanType.get()))); assertEquals("Schemas must match.", expectedSchema.asStruct(), icebergSchema.asStruct()); TypeDescription structCol = TypeDescription.createStruct(); @@ -269,34 +290,46 @@ public void testSkipNonIcebergColumns() { schema.addField("mapCol2", mapCol2); Schema icebergSchema2 = ORCSchemaUtil.convert(schema); - Schema expectedSchema2 = new Schema( - required(1, "intCol", Types.IntegerType.get()), - optional(5, "mapCol", Types.MapType.ofOptional(3, 4, - Types.StringType.get(), Types.BooleanType.get())), - required(7, "structCol", Types.StructType.of( - // Skipped binaryCol - required(6, "doubleCol", Types.DoubleType.get()) - // Skipped mapCol2 since value has no Iceberg ID - )) - ); + Schema expectedSchema2 = + new Schema( + required(1, "intCol", Types.IntegerType.get()), + optional( + 5, + "mapCol", + Types.MapType.ofOptional(3, 4, Types.StringType.get(), Types.BooleanType.get())), + required( + 7, + "structCol", + Types.StructType.of( + // Skipped binaryCol + required(6, "doubleCol", Types.DoubleType.get()) + // Skipped mapCol2 since value has no Iceberg ID + ))); assertEquals("Schemas must match.", expectedSchema2.asStruct(), icebergSchema2.asStruct()); } @Test public void testHasIds() { - Schema schema = new Schema( - optional(1, "data", Types.StructType.of( - optional(10, "entries", Types.MapType.ofOptional(11, 12, Types.StringType.get(), Types.DateType.get())) - )), - optional(2, "intCol", Types.IntegerType.get()), - optional(3, "longCol", Types.LongType.get()), - optional(4, "listCol", Types.ListType.ofOptional(40, Types.DoubleType.get())) - ); + Schema schema = + new Schema( + optional( + 1, + "data", + Types.StructType.of( + optional( + 10, + "entries", + Types.MapType.ofOptional( + 11, 12, Types.StringType.get(), Types.DateType.get())))), + optional(2, "intCol", Types.IntegerType.get()), + optional(3, "longCol", Types.LongType.get()), + optional(4, "listCol", Types.ListType.ofOptional(40, Types.DoubleType.get()))); TypeDescription orcSchema = ORCSchemaUtil.removeIds(ORCSchemaUtil.convert(schema)); assertFalse("Should not have Ids", ORCSchemaUtil.hasIds(orcSchema)); - TypeDescription map2Col = TypeDescription.createMap(TypeDescription.createString(), TypeDescription.createBinary()); + TypeDescription map2Col = + TypeDescription.createMap(TypeDescription.createString(), TypeDescription.createBinary()); 
map2Col.setAttribute(ICEBERG_ID_ATTRIBUTE, "4"); orcSchema.addField("map2Col", map2Col); assertTrue("Should have Ids after adding one type with Id", ORCSchemaUtil.hasIds(orcSchema)); @@ -304,97 +337,137 @@ public void testHasIds() { @Test public void testAssignIdsByNameMapping() { - Types.StructType structType = Types.StructType.of( - required(0, "id", Types.LongType.get()), - optional(1, "list_of_maps", - Types.ListType.ofOptional(2, Types.MapType.ofOptional(3, 4, - Types.StringType.get(), - SUPPORTED_PRIMITIVES))), - optional(5, "map_of_lists", - Types.MapType.ofOptional(6, 7, - Types.StringType.get(), - Types.ListType.ofOptional(8, SUPPORTED_PRIMITIVES))), - required(9, "list_of_lists", - Types.ListType.ofOptional(10, Types.ListType.ofOptional(11, SUPPORTED_PRIMITIVES))), - required(12, "map_of_maps", - Types.MapType.ofOptional(13, 14, - Types.StringType.get(), - Types.MapType.ofOptional(15, 16, + Types.StructType structType = + Types.StructType.of( + required(0, "id", Types.LongType.get()), + optional( + 1, + "list_of_maps", + Types.ListType.ofOptional( + 2, + Types.MapType.ofOptional(3, 4, Types.StringType.get(), SUPPORTED_PRIMITIVES))), + optional( + 5, + "map_of_lists", + Types.MapType.ofOptional( + 6, + 7, Types.StringType.get(), - SUPPORTED_PRIMITIVES))), - required(17, "list_of_struct_of_nested_types", Types.ListType.ofOptional(19, Types.StructType.of( - Types.NestedField.required(20, "m1", Types.MapType.ofOptional(21, 22, - Types.StringType.get(), - SUPPORTED_PRIMITIVES)), - Types.NestedField.optional(23, "l1", Types.ListType.ofRequired(24, SUPPORTED_PRIMITIVES)), - Types.NestedField.required(25, "l2", Types.ListType.ofRequired(26, SUPPORTED_PRIMITIVES)), - Types.NestedField.optional(27, "m2", Types.MapType.ofOptional(28, 29, - Types.StringType.get(), - SUPPORTED_PRIMITIVES)) - ))) - ); - - Schema schema = new Schema(TypeUtil.assignFreshIds(structType, new AtomicInteger(0)::incrementAndGet) - .asStructType().fields()); + Types.ListType.ofOptional(8, SUPPORTED_PRIMITIVES))), + required( + 9, + "list_of_lists", + Types.ListType.ofOptional(10, Types.ListType.ofOptional(11, SUPPORTED_PRIMITIVES))), + required( + 12, + "map_of_maps", + Types.MapType.ofOptional( + 13, + 14, + Types.StringType.get(), + Types.MapType.ofOptional( + 15, 16, Types.StringType.get(), SUPPORTED_PRIMITIVES))), + required( + 17, + "list_of_struct_of_nested_types", + Types.ListType.ofOptional( + 19, + Types.StructType.of( + Types.NestedField.required( + 20, + "m1", + Types.MapType.ofOptional( + 21, 22, Types.StringType.get(), SUPPORTED_PRIMITIVES)), + Types.NestedField.optional( + 23, "l1", Types.ListType.ofRequired(24, SUPPORTED_PRIMITIVES)), + Types.NestedField.required( + 25, "l2", Types.ListType.ofRequired(26, SUPPORTED_PRIMITIVES)), + Types.NestedField.optional( + 27, + "m2", + Types.MapType.ofOptional( + 28, 29, Types.StringType.get(), SUPPORTED_PRIMITIVES)))))); + + Schema schema = + new Schema( + TypeUtil.assignFreshIds(structType, new AtomicInteger(0)::incrementAndGet) + .asStructType() + .fields()); NameMapping nameMapping = MappingUtil.create(schema); TypeDescription typeDescriptionWithIds = ORCSchemaUtil.convert(schema); - TypeDescription typeDescriptionWithIdsFromNameMapping = ORCSchemaUtil - .applyNameMapping(ORCSchemaUtil.removeIds(typeDescriptionWithIds), nameMapping); + TypeDescription typeDescriptionWithIdsFromNameMapping = + ORCSchemaUtil.applyNameMapping( + ORCSchemaUtil.removeIds(typeDescriptionWithIds), nameMapping); - assertTrue("TypeDescription schemas should be equal, including IDs", + 
assertTrue( + "TypeDescription schemas should be equal, including IDs", equalsWithIds(typeDescriptionWithIds, typeDescriptionWithIdsFromNameMapping)); } @Test public void testAssignIdsByNameMappingAndProject() { - Types.StructType structType = Types.StructType.of( - required(1, "id", Types.LongType.get()), - optional(2, "list_of_structs", - Types.ListType.ofOptional(3, Types.StructType.of( - required(4, "entry", Types.LongType.get()), - required(5, "data", Types.BinaryType.get()) - )) - ), - optional(6, "map", - Types.MapType.ofOptional(7, 8, Types.StringType.get(), Types.DoubleType.get()) - ), - optional(12, "map_of_structs", - Types.MapType.ofOptional(13, 14, Types.StringType.get(), Types.StructType.of( - required(20, "field", Types.LongType.get()))) - ), - required(30, "struct", Types.StructType.of( - required(31, "lat", Types.DoubleType.get()), - required(32, "long", Types.DoubleType.get()) - ) - ) - ); - - TypeDescription fileSchema = ORCSchemaUtil.removeIds( - ORCSchemaUtil.convert(new Schema(structType.asStructType().fields()))); - - Schema mappingSchema = new Schema(Types.StructType.of( - optional(1, "new_id", Types.LongType.get()), - optional(2, "list_of_structs", - Types.ListType.ofOptional(3, Types.StructType.of( - required(5, "data", Types.BinaryType.get()) - )) - ), - optional(6, "map", - Types.MapType.ofOptional(7, 8, Types.StringType.get(), Types.DoubleType.get()) - ), - optional(30, "struct", + Types.StructType structType = + Types.StructType.of( + required(1, "id", Types.LongType.get()), + optional( + 2, + "list_of_structs", + Types.ListType.ofOptional( + 3, + Types.StructType.of( + required(4, "entry", Types.LongType.get()), + required(5, "data", Types.BinaryType.get())))), + optional( + 6, + "map", + Types.MapType.ofOptional(7, 8, Types.StringType.get(), Types.DoubleType.get())), + optional( + 12, + "map_of_structs", + Types.MapType.ofOptional( + 13, + 14, + Types.StringType.get(), + Types.StructType.of(required(20, "field", Types.LongType.get())))), + required( + 30, + "struct", + Types.StructType.of( + required(31, "lat", Types.DoubleType.get()), + required(32, "long", Types.DoubleType.get())))); + + TypeDescription fileSchema = + ORCSchemaUtil.removeIds( + ORCSchemaUtil.convert(new Schema(structType.asStructType().fields()))); + + Schema mappingSchema = + new Schema( Types.StructType.of( - optional(31, "latitude", Types.DoubleType.get()), - optional(32, "longitude", Types.DoubleType.get()) - ) - ), - optional(40, "long", Types.LongType.get()) - ).asStructType().fields()); + optional(1, "new_id", Types.LongType.get()), + optional( + 2, + "list_of_structs", + Types.ListType.ofOptional( + 3, Types.StructType.of(required(5, "data", Types.BinaryType.get())))), + optional( + 6, + "map", + Types.MapType.ofOptional( + 7, 8, Types.StringType.get(), Types.DoubleType.get())), + optional( + 30, + "struct", + Types.StructType.of( + optional(31, "latitude", Types.DoubleType.get()), + optional(32, "longitude", Types.DoubleType.get()))), + optional(40, "long", Types.LongType.get())) + .asStructType() + .fields()); NameMapping nameMapping = MappingUtil.create(mappingSchema); - TypeDescription typeDescriptionWithIdsFromNameMapping = ORCSchemaUtil - .applyNameMapping(fileSchema, nameMapping); + TypeDescription typeDescriptionWithIdsFromNameMapping = + ORCSchemaUtil.applyNameMapping(fileSchema, nameMapping); TypeDescription expected = TypeDescription.createStruct(); // new field @@ -435,13 +508,14 @@ public void testAssignIdsByNameMappingAndProject() { 
longField.setAttribute(ICEBERG_ID_ATTRIBUTE, "40"); expected.addField("long_r40", longField); - assertTrue("ORC Schema must have the same structure, but one has Iceberg IDs", + assertTrue( + "ORC Schema must have the same structure, but one has Iceberg IDs", typeDescriptionWithIdsFromNameMapping.equals(fileSchema, false)); - TypeDescription projectedOrcSchema = ORCSchemaUtil.buildOrcProjection(mappingSchema, - typeDescriptionWithIdsFromNameMapping); - assertTrue("Schema should be the prunned by projection", - equalsWithIds(expected, projectedOrcSchema)); + TypeDescription projectedOrcSchema = + ORCSchemaUtil.buildOrcProjection(mappingSchema, typeDescriptionWithIdsFromNameMapping); + assertTrue( + "Schema should be the prunned by projection", equalsWithIds(expected, projectedOrcSchema)); } private static boolean equalsWithIds(TypeDescription first, TypeDescription second) { @@ -455,19 +529,23 @@ private static boolean equalsWithIds(TypeDescription first, TypeDescription seco // check the ID attribute on non-root TypeDescriptions if (first.getId() > 0 && second.getId() > 0) { - if (first.getAttributeValue(ICEBERG_ID_ATTRIBUTE) == null || - second.getAttributeValue(ICEBERG_ID_ATTRIBUTE) == null) { + if (first.getAttributeValue(ICEBERG_ID_ATTRIBUTE) == null + || second.getAttributeValue(ICEBERG_ID_ATTRIBUTE) == null) { return false; } - if (!first.getAttributeValue(ICEBERG_ID_ATTRIBUTE).equals(second.getAttributeValue(ICEBERG_ID_ATTRIBUTE))) { + if (!first + .getAttributeValue(ICEBERG_ID_ATTRIBUTE) + .equals(second.getAttributeValue(ICEBERG_ID_ATTRIBUTE))) { return false; } } // check the children - List firstChildren = Optional.ofNullable(first.getChildren()).orElse(Collections.emptyList()); - List secondChildren = Optional.ofNullable(second.getChildren()).orElse(Collections.emptyList()); + List firstChildren = + Optional.ofNullable(first.getChildren()).orElse(Collections.emptyList()); + List secondChildren = + Optional.ofNullable(second.getChildren()).orElse(Collections.emptyList()); if (firstChildren.size() != secondChildren.size()) { return false; } diff --git a/orc/src/test/java/org/apache/iceberg/orc/TestOrcDeleteWriters.java b/orc/src/test/java/org/apache/iceberg/orc/TestOrcDeleteWriters.java index 5de06a0ef4a3..bbce6f723472 100644 --- a/orc/src/test/java/org/apache/iceberg/orc/TestOrcDeleteWriters.java +++ b/orc/src/test/java/org/apache/iceberg/orc/TestOrcDeleteWriters.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.orc; import java.io.File; @@ -49,14 +48,14 @@ import org.junit.rules.TemporaryFolder; public class TestOrcDeleteWriters { - private static final Schema SCHEMA = new Schema( - Types.NestedField.required(1, "id", Types.LongType.get()), - Types.NestedField.optional(2, "data", Types.StringType.get())); + private static final Schema SCHEMA = + new Schema( + Types.NestedField.required(1, "id", Types.LongType.get()), + Types.NestedField.optional(2, "data", Types.StringType.get())); private List records; - @Rule - public TemporaryFolder temp = new TemporaryFolder(); + @Rule public TemporaryFolder temp = new TemporaryFolder(); @Before public void createDeleteRecords() { @@ -77,13 +76,14 @@ public void testEqualityDeleteWriter() throws IOException { File deleteFile = temp.newFile(); OutputFile out = Files.localOutput(deleteFile); - EqualityDeleteWriter deleteWriter = ORC.writeDeletes(out) - .createWriterFunc(GenericOrcWriter::buildWriter) - .overwrite() - .rowSchema(SCHEMA) - .withSpec(PartitionSpec.unpartitioned()) - .equalityFieldIds(1) - .buildEqualityWriter(); + EqualityDeleteWriter deleteWriter = + ORC.writeDeletes(out) + .createWriterFunc(GenericOrcWriter::buildWriter) + .overwrite() + .rowSchema(SCHEMA) + .withSpec(PartitionSpec.unpartitioned()) + .equalityFieldIds(1) + .buildEqualityWriter(); try (EqualityDeleteWriter writer = deleteWriter) { writer.deleteAll(records); @@ -91,16 +91,18 @@ public void testEqualityDeleteWriter() throws IOException { DeleteFile metadata = deleteWriter.toDeleteFile(); Assert.assertEquals("Format should be ORC", FileFormat.ORC, metadata.format()); - Assert.assertEquals("Should be equality deletes", FileContent.EQUALITY_DELETES, metadata.content()); + Assert.assertEquals( + "Should be equality deletes", FileContent.EQUALITY_DELETES, metadata.content()); Assert.assertEquals("Record count should be correct", records.size(), metadata.recordCount()); Assert.assertEquals("Partition should be empty", 0, metadata.partition().size()); Assert.assertNull("Key metadata should be null", metadata.keyMetadata()); List deletedRecords; - try (CloseableIterable reader = ORC.read(out.toInputFile()) - .project(SCHEMA) - .createReaderFunc(fileSchema -> GenericOrcReader.buildReader(SCHEMA, fileSchema)) - .build()) { + try (CloseableIterable reader = + ORC.read(out.toInputFile()) + .project(SCHEMA) + .createReaderFunc(fileSchema -> GenericOrcReader.buildReader(SCHEMA, fileSchema)) + .build()) { deletedRecords = Lists.newArrayList(reader); } @@ -111,22 +113,25 @@ public void testEqualityDeleteWriter() throws IOException { public void testPositionDeleteWriter() throws IOException { File deleteFile = temp.newFile(); - Schema deleteSchema = new Schema( - MetadataColumns.DELETE_FILE_PATH, - MetadataColumns.DELETE_FILE_POS, - Types.NestedField.optional(MetadataColumns.DELETE_FILE_ROW_FIELD_ID, "row", SCHEMA.asStruct())); + Schema deleteSchema = + new Schema( + MetadataColumns.DELETE_FILE_PATH, + MetadataColumns.DELETE_FILE_POS, + Types.NestedField.optional( + MetadataColumns.DELETE_FILE_ROW_FIELD_ID, "row", SCHEMA.asStruct())); String deletePath = "s3://bucket/path/file.orc"; GenericRecord posDelete = GenericRecord.create(deleteSchema); List expectedDeleteRecords = Lists.newArrayList(); OutputFile out = Files.localOutput(deleteFile); - PositionDeleteWriter deleteWriter = ORC.writeDeletes(out) - .createWriterFunc(GenericOrcWriter::buildWriter) - .overwrite() - .rowSchema(SCHEMA) - .withSpec(PartitionSpec.unpartitioned()) - .buildPositionWriter(); + 
PositionDeleteWriter deleteWriter = + ORC.writeDeletes(out) + .createWriterFunc(GenericOrcWriter::buildWriter) + .overwrite() + .rowSchema(SCHEMA) + .withSpec(PartitionSpec.unpartitioned()) + .buildPositionWriter(); PositionDelete positionDelete = PositionDelete.create(); try (PositionDeleteWriter writer = deleteWriter) { @@ -134,52 +139,56 @@ public void testPositionDeleteWriter() throws IOException { int pos = i * 3 + 2; positionDelete.set(deletePath, pos, records.get(i)); writer.write(positionDelete); - expectedDeleteRecords.add(posDelete.copy(ImmutableMap.of( - "file_path", deletePath, - "pos", (long) pos, - "row", records.get(i)))); + expectedDeleteRecords.add( + posDelete.copy( + ImmutableMap.of( + "file_path", deletePath, "pos", (long) pos, "row", records.get(i)))); } } DeleteFile metadata = deleteWriter.toDeleteFile(); Assert.assertEquals("Format should be ORC", FileFormat.ORC, metadata.format()); - Assert.assertEquals("Should be position deletes", FileContent.POSITION_DELETES, metadata.content()); + Assert.assertEquals( + "Should be position deletes", FileContent.POSITION_DELETES, metadata.content()); Assert.assertEquals("Record count should be correct", records.size(), metadata.recordCount()); Assert.assertEquals("Partition should be empty", 0, metadata.partition().size()); Assert.assertNull("Key metadata should be null", metadata.keyMetadata()); List deletedRecords; - try (CloseableIterable reader = ORC.read(out.toInputFile()) - .project(deleteSchema) - .createReaderFunc(fileSchema -> GenericOrcReader.buildReader(deleteSchema, fileSchema)) - .build()) { + try (CloseableIterable reader = + ORC.read(out.toInputFile()) + .project(deleteSchema) + .createReaderFunc(fileSchema -> GenericOrcReader.buildReader(deleteSchema, fileSchema)) + .build()) { deletedRecords = Lists.newArrayList(reader); } - Assert.assertEquals("Deleted records should match expected", expectedDeleteRecords, deletedRecords); + Assert.assertEquals( + "Deleted records should match expected", expectedDeleteRecords, deletedRecords); } @Test public void testPositionDeleteWriterWithEmptyRow() throws IOException { File deleteFile = temp.newFile(); - Schema deleteSchema = new Schema( - MetadataColumns.DELETE_FILE_PATH, - MetadataColumns.DELETE_FILE_POS); + Schema deleteSchema = + new Schema(MetadataColumns.DELETE_FILE_PATH, MetadataColumns.DELETE_FILE_POS); String deletePath = "s3://bucket/path/file.orc"; GenericRecord posDelete = GenericRecord.create(deleteSchema); List expectedDeleteRecords = Lists.newArrayList(); OutputFile out = Files.localOutput(deleteFile); - PositionDeleteWriter deleteWriter = ORC.writeDeletes(out) - .createWriterFunc(GenericOrcWriter::buildWriter) - .overwrite() - .withSpec(PartitionSpec.unpartitioned()) - .transformPaths(path -> { - throw new RuntimeException("Should not be called for performance reasons"); - }) - .buildPositionWriter(); + PositionDeleteWriter deleteWriter = + ORC.writeDeletes(out) + .createWriterFunc(GenericOrcWriter::buildWriter) + .overwrite() + .withSpec(PartitionSpec.unpartitioned()) + .transformPaths( + path -> { + throw new RuntimeException("Should not be called for performance reasons"); + }) + .buildPositionWriter(); PositionDelete positionDelete = PositionDelete.create(); try (PositionDeleteWriter writer = deleteWriter) { @@ -187,27 +196,29 @@ public void testPositionDeleteWriterWithEmptyRow() throws IOException { int pos = i * 3 + 2; positionDelete.set(deletePath, pos, null); writer.write(positionDelete); - expectedDeleteRecords.add(posDelete.copy(ImmutableMap.of( - 
"file_path", deletePath, - "pos", (long) pos))); + expectedDeleteRecords.add( + posDelete.copy(ImmutableMap.of("file_path", deletePath, "pos", (long) pos))); } } DeleteFile metadata = deleteWriter.toDeleteFile(); Assert.assertEquals("Format should be ORC", FileFormat.ORC, metadata.format()); - Assert.assertEquals("Should be position deletes", FileContent.POSITION_DELETES, metadata.content()); + Assert.assertEquals( + "Should be position deletes", FileContent.POSITION_DELETES, metadata.content()); Assert.assertEquals("Record count should be correct", records.size(), metadata.recordCount()); Assert.assertEquals("Partition should be empty", 0, metadata.partition().size()); Assert.assertNull("Key metadata should be null", metadata.keyMetadata()); List deletedRecords; - try (CloseableIterable reader = ORC.read(out.toInputFile()) - .project(deleteSchema) - .createReaderFunc(fileSchema -> GenericOrcReader.buildReader(deleteSchema, fileSchema)) - .build()) { + try (CloseableIterable reader = + ORC.read(out.toInputFile()) + .project(deleteSchema) + .createReaderFunc(fileSchema -> GenericOrcReader.buildReader(deleteSchema, fileSchema)) + .build()) { deletedRecords = Lists.newArrayList(reader); } - Assert.assertEquals("Deleted records should match expected", expectedDeleteRecords, deletedRecords); + Assert.assertEquals( + "Deleted records should match expected", expectedDeleteRecords, deletedRecords); } } diff --git a/orc/src/test/java/org/apache/iceberg/orc/TestTableProperties.java b/orc/src/test/java/org/apache/iceberg/orc/TestTableProperties.java index aba77f224aa0..09fe8ff915e2 100644 --- a/orc/src/test/java/org/apache/iceberg/orc/TestTableProperties.java +++ b/orc/src/test/java/org/apache/iceberg/orc/TestTableProperties.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.orc; import java.io.File; @@ -46,13 +45,12 @@ public class TestTableProperties { - public static final Schema SCHEMA = new Schema( - Types.NestedField.optional(1, "id", Types.IntegerType.get()), - Types.NestedField.optional(2, "data", Types.StringType.get()) - ); + public static final Schema SCHEMA = + new Schema( + Types.NestedField.optional(1, "id", Types.IntegerType.get()), + Types.NestedField.optional(2, "data", Types.StringType.get())); - @ClassRule - public static final TemporaryFolder TEMPORARY_FOLDER = new TemporaryFolder(); + @ClassRule public static final TemporaryFolder TEMPORARY_FOLDER = new TemporaryFolder(); @Test public void testOrcTableProperties() throws Exception { @@ -65,12 +63,13 @@ public void testOrcTableProperties() throws Exception { String codecAsString = CompressionKind.values()[random.nextInt(numOfCodecs)].name(); String strategyAsString = CompressionStrategy.values()[random.nextInt(numOfStrategies)].name(); - ImmutableMap properties = ImmutableMap.of( - TableProperties.ORC_STRIPE_SIZE_BYTES, String.valueOf(stripeSizeBytes), - TableProperties.ORC_BLOCK_SIZE_BYTES, String.valueOf(blockSizeBytes), - TableProperties.ORC_COMPRESSION, codecAsString, - TableProperties.ORC_COMPRESSION_STRATEGY, strategyAsString, - TableProperties.DEFAULT_FILE_FORMAT, FileFormat.ORC.name()); + ImmutableMap properties = + ImmutableMap.of( + TableProperties.ORC_STRIPE_SIZE_BYTES, String.valueOf(stripeSizeBytes), + TableProperties.ORC_BLOCK_SIZE_BYTES, String.valueOf(blockSizeBytes), + TableProperties.ORC_COMPRESSION, codecAsString, + TableProperties.ORC_COMPRESSION_STRATEGY, strategyAsString, + TableProperties.DEFAULT_FILE_FORMAT, FileFormat.ORC.name()); File folder = TEMPORARY_FOLDER.newFolder(); @@ -84,10 +83,11 @@ public void testOrcTableProperties() throws Exception { File testFile = TEMPORARY_FOLDER.newFile(); Assert.assertTrue(testFile.delete()); - FileAppender writer = ORC.write(Files.localOutput(testFile)) - .forTable(table) - .createWriterFunc(GenericOrcWriter::buildWriter) - .build(); + FileAppender writer = + ORC.write(Files.localOutput(testFile)) + .forTable(table) + .createWriterFunc(GenericOrcWriter::buildWriter) + .build(); DynFields.BoundField confField = DynFields.builder().hiddenImpl(writer.getClass(), "conf").build(writer); @@ -97,7 +97,8 @@ public void testOrcTableProperties() throws Exception { Assert.assertEquals(stripeSizeBytes, OrcConf.STRIPE_SIZE.getLong(configuration)); Assert.assertEquals(codecAsString, OrcConf.COMPRESS.getString(configuration)); Assert.assertEquals(strategyAsString, OrcConf.COMPRESSION_STRATEGY.getString(configuration)); - Assert.assertEquals(FileFormat.ORC.name(), configuration.get(TableProperties.DEFAULT_FILE_FORMAT)); + Assert.assertEquals( + FileFormat.ORC.name(), configuration.get(TableProperties.DEFAULT_FILE_FORMAT)); } @Test @@ -111,12 +112,13 @@ public void testOrcTableDeleteProperties() throws Exception { String codecAsString = CompressionKind.values()[random.nextInt(numOfCodecs)].name(); String strategyAsString = CompressionStrategy.values()[random.nextInt(numOfStrategies)].name(); - ImmutableMap properties = ImmutableMap.of( - TableProperties.DELETE_ORC_STRIPE_SIZE_BYTES, String.valueOf(stripeSizeBytes), - TableProperties.DELETE_ORC_BLOCK_SIZE_BYTES, String.valueOf(blockSizeBytes), - TableProperties.DELETE_ORC_COMPRESSION, codecAsString, - TableProperties.DELETE_ORC_COMPRESSION_STRATEGY, strategyAsString, - TableProperties.DEFAULT_FILE_FORMAT, FileFormat.ORC.name()); + ImmutableMap properties = + 
ImmutableMap.of( + TableProperties.DELETE_ORC_STRIPE_SIZE_BYTES, String.valueOf(stripeSizeBytes), + TableProperties.DELETE_ORC_BLOCK_SIZE_BYTES, String.valueOf(blockSizeBytes), + TableProperties.DELETE_ORC_COMPRESSION, codecAsString, + TableProperties.DELETE_ORC_COMPRESSION_STRATEGY, strategyAsString, + TableProperties.DEFAULT_FILE_FORMAT, FileFormat.ORC.name()); File folder = TEMPORARY_FOLDER.newFolder(); @@ -130,11 +132,12 @@ public void testOrcTableDeleteProperties() throws Exception { File testFile = TEMPORARY_FOLDER.newFile(); Assert.assertTrue(testFile.delete()); - EqualityDeleteWriter deleteWriter = ORC.writeDeletes(Files.localOutput(testFile)) - .forTable(table) - .equalityFieldIds(1) - .createWriterFunc(GenericOrcWriter::buildWriter) - .buildEqualityWriter(); + EqualityDeleteWriter deleteWriter = + ORC.writeDeletes(Files.localOutput(testFile)) + .forTable(table) + .equalityFieldIds(1) + .createWriterFunc(GenericOrcWriter::buildWriter) + .buildEqualityWriter(); DynFields.BoundField> writer = DynFields.builder().hiddenImpl(deleteWriter.getClass(), "appender").build(deleteWriter); @@ -148,6 +151,7 @@ public void testOrcTableDeleteProperties() throws Exception { Assert.assertEquals(stripeSizeBytes, OrcConf.STRIPE_SIZE.getLong(configuration)); Assert.assertEquals(codecAsString, OrcConf.COMPRESS.getString(configuration)); Assert.assertEquals(strategyAsString, OrcConf.COMPRESSION_STRATEGY.getString(configuration)); - Assert.assertEquals(FileFormat.ORC.name(), configuration.get(TableProperties.DEFAULT_FILE_FORMAT)); + Assert.assertEquals( + FileFormat.ORC.name(), configuration.get(TableProperties.DEFAULT_FILE_FORMAT)); } } diff --git a/parquet/src/main/java/org/apache/iceberg/data/parquet/BaseParquetReaders.java b/parquet/src/main/java/org/apache/iceberg/data/parquet/BaseParquetReaders.java index d9694f97a1bf..c96074ebfdb1 100644 --- a/parquet/src/main/java/org/apache/iceberg/data/parquet/BaseParquetReaders.java +++ b/parquet/src/main/java/org/apache/iceberg/data/parquet/BaseParquetReaders.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.data.parquet; import java.nio.ByteBuffer; @@ -50,32 +49,30 @@ import org.apache.parquet.schema.Type; public abstract class BaseParquetReaders { - protected BaseParquetReaders() { - } + protected BaseParquetReaders() {} - protected ParquetValueReader createReader(Schema expectedSchema, - MessageType fileSchema) { + protected ParquetValueReader createReader(Schema expectedSchema, MessageType fileSchema) { return createReader(expectedSchema, fileSchema, ImmutableMap.of()); } @SuppressWarnings("unchecked") - protected ParquetValueReader createReader(Schema expectedSchema, - MessageType fileSchema, - Map idToConstant) { + protected ParquetValueReader createReader( + Schema expectedSchema, MessageType fileSchema, Map idToConstant) { if (ParquetSchemaUtil.hasIds(fileSchema)) { return (ParquetValueReader) - TypeWithSchemaVisitor.visit(expectedSchema.asStruct(), fileSchema, - new ReadBuilder(fileSchema, idToConstant)); + TypeWithSchemaVisitor.visit( + expectedSchema.asStruct(), fileSchema, new ReadBuilder(fileSchema, idToConstant)); } else { return (ParquetValueReader) - TypeWithSchemaVisitor.visit(expectedSchema.asStruct(), fileSchema, + TypeWithSchemaVisitor.visit( + expectedSchema.asStruct(), + fileSchema, new FallbackReadBuilder(fileSchema, idToConstant)); } } - protected abstract ParquetValueReader createStructReader(List types, - List> fieldReaders, - Types.StructType structType); + protected abstract ParquetValueReader createStructReader( + List types, List> fieldReaders, Types.StructType structType); private class FallbackReadBuilder extends ReadBuilder { private FallbackReadBuilder(MessageType type, Map idToConstant) { @@ -83,18 +80,18 @@ private FallbackReadBuilder(MessageType type, Map idToConstant) { } @Override - public ParquetValueReader message(Types.StructType expected, MessageType message, - List> fieldReaders) { + public ParquetValueReader message( + Types.StructType expected, MessageType message, List> fieldReaders) { // the top level matches by ID, but the remaining IDs are missing return super.struct(expected, message, fieldReaders); } @Override - public ParquetValueReader struct(Types.StructType expected, GroupType struct, - List> fieldReaders) { + public ParquetValueReader struct( + Types.StructType expected, GroupType struct, List> fieldReaders) { // the expected struct is ignored because nested fields are never found when the - List> newFields = Lists.newArrayListWithExpectedSize( - fieldReaders.size()); + List> newFields = + Lists.newArrayListWithExpectedSize(fieldReaders.size()); List types = Lists.newArrayListWithExpectedSize(fieldReaders.size()); List fields = struct.getFields(); for (int i = 0; i < fields.size(); i += 1) { @@ -121,14 +118,14 @@ private ReadBuilder(MessageType type, Map idToConstant) { } @Override - public ParquetValueReader message(Types.StructType expected, MessageType message, - List> fieldReaders) { + public ParquetValueReader message( + Types.StructType expected, MessageType message, List> fieldReaders) { return struct(expected, message.asGroupType(), fieldReaders); } @Override - public ParquetValueReader struct(Types.StructType expected, GroupType struct, - List> fieldReaders) { + public ParquetValueReader struct( + Types.StructType expected, GroupType struct, List> fieldReaders) { // match the expected struct's order Map> readersById = Maps.newHashMap(); Map typesById = Maps.newHashMap(); @@ -144,10 +141,10 @@ public ParquetValueReader struct(Types.StructType expected, GroupType struct, } } - List expectedFields = 
expected != null ? - expected.fields() : ImmutableList.of(); - List> reorderedFields = Lists.newArrayListWithExpectedSize( - expectedFields.size()); + List expectedFields = + expected != null ? expected.fields() : ImmutableList.of(); + List> reorderedFields = + Lists.newArrayListWithExpectedSize(expectedFields.size()); List types = Lists.newArrayListWithExpectedSize(expectedFields.size()); for (Types.NestedField field : expectedFields) { int id = field.fieldId(); @@ -177,8 +174,8 @@ public ParquetValueReader struct(Types.StructType expected, GroupType struct, } @Override - public ParquetValueReader list(Types.ListType expectedList, GroupType array, - ParquetValueReader elementReader) { + public ParquetValueReader list( + Types.ListType expectedList, GroupType array, ParquetValueReader elementReader) { if (expectedList == null) { return null; } @@ -191,14 +188,16 @@ public ParquetValueReader list(Types.ListType expectedList, GroupType array, Type elementType = ParquetSchemaUtil.determineListElementType(array); int elementD = type.getMaxDefinitionLevel(path(elementType.getName())) - 1; - return new ParquetValueReaders.ListReader<>(repeatedD, repeatedR, - ParquetValueReaders.option(elementType, elementD, elementReader)); + return new ParquetValueReaders.ListReader<>( + repeatedD, repeatedR, ParquetValueReaders.option(elementType, elementD, elementReader)); } @Override - public ParquetValueReader map(Types.MapType expectedMap, GroupType map, - ParquetValueReader keyReader, - ParquetValueReader valueReader) { + public ParquetValueReader map( + Types.MapType expectedMap, + GroupType map, + ParquetValueReader keyReader, + ParquetValueReader valueReader) { if (expectedMap == null) { return null; } @@ -214,15 +213,17 @@ public ParquetValueReader map(Types.MapType expectedMap, GroupType map, Type valueType = repeatedKeyValue.getType(1); int valueD = type.getMaxDefinitionLevel(path(valueType.getName())) - 1; - return new ParquetValueReaders.MapReader<>(repeatedD, repeatedR, + return new ParquetValueReaders.MapReader<>( + repeatedD, + repeatedR, ParquetValueReaders.option(keyType, keyD, keyReader), ParquetValueReaders.option(valueType, valueD, valueReader)); } @Override @SuppressWarnings("checkstyle:CyclomaticComplexity") - public ParquetValueReader primitive(org.apache.iceberg.types.Type.PrimitiveType expected, - PrimitiveType primitive) { + public ParquetValueReader primitive( + org.apache.iceberg.types.Type.PrimitiveType expected, PrimitiveType primitive) { if (expected == null) { return null; } @@ -291,7 +292,8 @@ public ParquetValueReader primitive(org.apache.iceberg.types.Type.PrimitiveTy case FIXED_LEN_BYTE_ARRAY: return new FixedReader(desc); case BINARY: - if (expected != null && expected.typeId() == org.apache.iceberg.types.Type.TypeID.STRING) { + if (expected != null + && expected.typeId() == org.apache.iceberg.types.Type.TypeID.STRING) { return new ParquetValueReaders.StringReader(desc); } else { return new ParquetValueReaders.BytesReader(desc); @@ -303,7 +305,8 @@ public ParquetValueReader primitive(org.apache.iceberg.types.Type.PrimitiveTy return new ParquetValueReaders.UnboxedReader<>(desc); } case FLOAT: - if (expected != null && expected.typeId() == org.apache.iceberg.types.Type.TypeID.DOUBLE) { + if (expected != null + && expected.typeId() == org.apache.iceberg.types.Type.TypeID.DOUBLE) { return new ParquetValueReaders.FloatAsDoubleReader(desc); } else { return new ParquetValueReaders.UnboxedReader<>(desc); @@ -351,7 +354,8 @@ public LocalDateTime read(LocalDateTime reuse) { } } - 
private static class TimestampMillisReader extends ParquetValueReaders.PrimitiveReader { + private static class TimestampMillisReader + extends ParquetValueReaders.PrimitiveReader { private TimestampMillisReader(ColumnDescriptor desc) { super(desc); } @@ -362,7 +366,8 @@ public LocalDateTime read(LocalDateTime reuse) { } } - private static class TimestampInt96Reader extends ParquetValueReaders.PrimitiveReader { + private static class TimestampInt96Reader + extends ParquetValueReaders.PrimitiveReader { private static final long UNIX_EPOCH_JULIAN = 2_440_588L; private TimestampInt96Reader(ColumnDescriptor desc) { @@ -371,17 +376,19 @@ private TimestampInt96Reader(ColumnDescriptor desc) { @Override public OffsetDateTime read(OffsetDateTime reuse) { - final ByteBuffer byteBuffer = column.nextBinary().toByteBuffer().order(ByteOrder.LITTLE_ENDIAN); + final ByteBuffer byteBuffer = + column.nextBinary().toByteBuffer().order(ByteOrder.LITTLE_ENDIAN); final long timeOfDayNanos = byteBuffer.getLong(); final int julianDay = byteBuffer.getInt(); - return Instant - .ofEpochMilli(TimeUnit.DAYS.toMillis(julianDay - UNIX_EPOCH_JULIAN)) - .plusNanos(timeOfDayNanos).atOffset(ZoneOffset.UTC); + return Instant.ofEpochMilli(TimeUnit.DAYS.toMillis(julianDay - UNIX_EPOCH_JULIAN)) + .plusNanos(timeOfDayNanos) + .atOffset(ZoneOffset.UTC); } } - private static class TimestamptzReader extends ParquetValueReaders.PrimitiveReader { + private static class TimestamptzReader + extends ParquetValueReaders.PrimitiveReader { private TimestamptzReader(ColumnDescriptor desc) { super(desc); } @@ -392,7 +399,8 @@ public OffsetDateTime read(OffsetDateTime reuse) { } } - private static class TimestamptzMillisReader extends ParquetValueReaders.PrimitiveReader { + private static class TimestamptzMillisReader + extends ParquetValueReaders.PrimitiveReader { private TimestamptzMillisReader(ColumnDescriptor desc) { super(desc); } diff --git a/parquet/src/main/java/org/apache/iceberg/data/parquet/BaseParquetWriter.java b/parquet/src/main/java/org/apache/iceberg/data/parquet/BaseParquetWriter.java index ee8d7e043658..470f95e8bc99 100644 --- a/parquet/src/main/java/org/apache/iceberg/data/parquet/BaseParquetWriter.java +++ b/parquet/src/main/java/org/apache/iceberg/data/parquet/BaseParquetWriter.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.data.parquet; import java.time.Instant; @@ -48,7 +47,8 @@ protected ParquetValueWriter createWriter(MessageType type) { return (ParquetValueWriter) ParquetTypeVisitor.visit(type, new WriteBuilder(type)); } - protected abstract ParquetValueWriters.StructWriter createStructWriter(List> writers); + protected abstract ParquetValueWriters.StructWriter createStructWriter( + List> writers); private class WriteBuilder extends ParquetTypeVisitor> { private final MessageType type; @@ -58,14 +58,15 @@ private WriteBuilder(MessageType type) { } @Override - public ParquetValueWriter message(MessageType message, List> fieldWriters) { + public ParquetValueWriter message( + MessageType message, List> fieldWriters) { return struct(message.asGroupType(), fieldWriters); } @Override - public ParquetValueWriter struct(GroupType struct, - List> fieldWriters) { + public ParquetValueWriter struct( + GroupType struct, List> fieldWriters) { List fields = struct.getFields(); List> writers = Lists.newArrayListWithExpectedSize(fieldWriters.size()); for (int i = 0; i < fields.size(); i += 1) { @@ -88,14 +89,13 @@ public ParquetValueWriter list(GroupType array, ParquetValueWriter element Type elementType = repeated.getType(0); int elementD = type.getMaxDefinitionLevel(path(elementType.getName())); - return ParquetValueWriters.collections(repeatedD, repeatedR, - ParquetValueWriters.option(elementType, elementD, elementWriter)); + return ParquetValueWriters.collections( + repeatedD, repeatedR, ParquetValueWriters.option(elementType, elementD, elementWriter)); } @Override - public ParquetValueWriter map(GroupType map, - ParquetValueWriter keyWriter, - ParquetValueWriter valueWriter) { + public ParquetValueWriter map( + GroupType map, ParquetValueWriter keyWriter, ParquetValueWriter valueWriter) { GroupType repeatedKeyValue = map.getFields().get(0).asGroupType(); String[] repeatedPath = currentPath(); @@ -107,7 +107,9 @@ public ParquetValueWriter map(GroupType map, Type valueType = repeatedKeyValue.getType(1); int valueD = type.getMaxDefinitionLevel(path(valueType.getName())); - return ParquetValueWriters.maps(repeatedD, repeatedR, + return ParquetValueWriters.maps( + repeatedD, + repeatedR, ParquetValueWriters.option(keyType, keyD, keyWriter), ParquetValueWriters.option(valueType, valueD, valueWriter)); } @@ -145,8 +147,9 @@ public ParquetValueWriter primitive(PrimitiveType primitive) { } } - private static class LogicalTypeWriterVisitor implements - LogicalTypeAnnotation.LogicalTypeAnnotationVisitor> { + private static class LogicalTypeWriterVisitor + implements LogicalTypeAnnotation.LogicalTypeAnnotationVisitor< + ParquetValueWriters.PrimitiveWriter> { private final ColumnDescriptor desc; private LogicalTypeWriterVisitor(ColumnDescriptor desc) { @@ -170,15 +173,18 @@ public Optional> visit( LogicalTypeAnnotation.DecimalLogicalTypeAnnotation decimalType) { switch (desc.getPrimitiveType().getPrimitiveTypeName()) { case INT32: - return Optional.of(ParquetValueWriters.decimalAsInteger( - desc, decimalType.getPrecision(), decimalType.getScale())); + return Optional.of( + ParquetValueWriters.decimalAsInteger( + desc, decimalType.getPrecision(), decimalType.getScale())); case INT64: - return Optional.of(ParquetValueWriters.decimalAsLong( - desc, decimalType.getPrecision(), decimalType.getScale())); + return Optional.of( + ParquetValueWriters.decimalAsLong( + desc, decimalType.getPrecision(), decimalType.getScale())); case BINARY: case FIXED_LEN_BYTE_ARRAY: - return 
Optional.of(ParquetValueWriters.decimalAsFixed( - desc, decimalType.getPrecision(), decimalType.getScale())); + return Optional.of( + ParquetValueWriters.decimalAsFixed( + desc, decimalType.getPrecision(), decimalType.getScale())); } return Optional.empty(); } @@ -198,8 +204,10 @@ public Optional> visit( @Override public Optional> visit( LogicalTypeAnnotation.TimestampLogicalTypeAnnotation timestampType) { - Preconditions.checkArgument(LogicalTypeAnnotation.TimeUnit.MICROS.equals(timestampType.getUnit()), - "Cannot write timestamp in %s, only MICROS is supported", timestampType.getUnit()); + Preconditions.checkArgument( + LogicalTypeAnnotation.TimeUnit.MICROS.equals(timestampType.getUnit()), + "Cannot write timestamp in %s, only MICROS is supported", + timestampType.getUnit()); if (timestampType.isAdjustedToUTC()) { return Optional.of(new TimestamptzWriter(desc)); } else { @@ -210,7 +218,8 @@ public Optional> visit( @Override public Optional> visit( LogicalTypeAnnotation.IntLogicalTypeAnnotation intType) { - Preconditions.checkArgument(intType.isSigned() || intType.getBitWidth() < 64, + Preconditions.checkArgument( + intType.isSigned() || intType.getBitWidth() < 64, "Cannot read uint64: not a supported Java type"); if (intType.getBitWidth() < 64) { return Optional.of(ParquetValueWriters.ints(desc)); @@ -264,12 +273,13 @@ private TimestampWriter(ColumnDescriptor desc) { @Override public void write(int repetitionLevel, LocalDateTime value) { - column.writeLong(repetitionLevel, - ChronoUnit.MICROS.between(EPOCH, value.atOffset(ZoneOffset.UTC))); + column.writeLong( + repetitionLevel, ChronoUnit.MICROS.between(EPOCH, value.atOffset(ZoneOffset.UTC))); } } - private static class TimestamptzWriter extends ParquetValueWriters.PrimitiveWriter { + private static class TimestamptzWriter + extends ParquetValueWriters.PrimitiveWriter { private TimestamptzWriter(ColumnDescriptor desc) { super(desc); } diff --git a/parquet/src/main/java/org/apache/iceberg/data/parquet/GenericParquetReaders.java b/parquet/src/main/java/org/apache/iceberg/data/parquet/GenericParquetReaders.java index 4e13fc62e361..99b8c9baad64 100644 --- a/parquet/src/main/java/org/apache/iceberg/data/parquet/GenericParquetReaders.java +++ b/parquet/src/main/java/org/apache/iceberg/data/parquet/GenericParquetReaders.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.data.parquet; import java.util.List; @@ -34,30 +33,28 @@ public class GenericParquetReaders extends BaseParquetReaders { private static final GenericParquetReaders INSTANCE = new GenericParquetReaders(); - private GenericParquetReaders() { - } + private GenericParquetReaders() {} - public static ParquetValueReader buildReader(Schema expectedSchema, MessageType fileSchema) { + public static ParquetValueReader buildReader( + Schema expectedSchema, MessageType fileSchema) { return INSTANCE.createReader(expectedSchema, fileSchema); } - public static ParquetValueReader buildReader(Schema expectedSchema, MessageType fileSchema, - Map idToConstant) { + public static ParquetValueReader buildReader( + Schema expectedSchema, MessageType fileSchema, Map idToConstant) { return INSTANCE.createReader(expectedSchema, fileSchema, idToConstant); } @Override - protected ParquetValueReader createStructReader(List types, List> fieldReaders, - StructType structType) { + protected ParquetValueReader createStructReader( + List types, List> fieldReaders, StructType structType) { return new RecordReader(types, fieldReaders, structType); } private static class RecordReader extends StructReader { private final GenericRecord template; - RecordReader(List types, - List> readers, - StructType struct) { + RecordReader(List types, List> readers, StructType struct) { super(types, readers); this.template = struct != null ? GenericRecord.create(struct) : null; } @@ -67,7 +64,8 @@ protected Record newStructData(Record reuse) { if (reuse != null) { return reuse; } else { - // GenericRecord.copy() is more performant then GenericRecord.create(StructType) since NAME_MAP_CACHE access + // GenericRecord.copy() is more performant then GenericRecord.create(StructType) since + // NAME_MAP_CACHE access // is eliminated. Using copy here to gain performance. return template.copy(); } diff --git a/parquet/src/main/java/org/apache/iceberg/data/parquet/GenericParquetWriter.java b/parquet/src/main/java/org/apache/iceberg/data/parquet/GenericParquetWriter.java index 5bb878f536ba..7f2c107b8dc8 100644 --- a/parquet/src/main/java/org/apache/iceberg/data/parquet/GenericParquetWriter.java +++ b/parquet/src/main/java/org/apache/iceberg/data/parquet/GenericParquetWriter.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.data.parquet; import java.util.List; @@ -28,8 +27,7 @@ public class GenericParquetWriter extends BaseParquetWriter { private static final GenericParquetWriter INSTANCE = new GenericParquetWriter(); - private GenericParquetWriter() { - } + private GenericParquetWriter() {} public static ParquetValueWriter buildWriter(MessageType type) { return INSTANCE.createWriter(type); diff --git a/parquet/src/main/java/org/apache/iceberg/parquet/ApplyNameMapping.java b/parquet/src/main/java/org/apache/iceberg/parquet/ApplyNameMapping.java index 363631c8c04d..3ad11ab6e523 100644 --- a/parquet/src/main/java/org/apache/iceberg/parquet/ApplyNameMapping.java +++ b/parquet/src/main/java/org/apache/iceberg/parquet/ApplyNameMapping.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.parquet; import java.util.Deque; @@ -64,14 +63,13 @@ public Type struct(GroupType struct, List types) { @Override public Type list(GroupType list, Type elementType) { - Preconditions.checkArgument(elementType != null, - "List type must have element field"); + Preconditions.checkArgument(elementType != null, "List type must have element field"); Type listElement = ParquetSchemaUtil.determineListElementType(list); MappedField field = nameMapping.find(currentPath()); - Types.GroupBuilder listBuilder = Types.buildGroup(list.getRepetition()) - .as(LogicalTypeAnnotation.listType()); + Types.GroupBuilder listBuilder = + Types.buildGroup(list.getRepetition()).as(LogicalTypeAnnotation.listType()); if (listElement.isRepetition(Type.Repetition.REPEATED)) { listBuilder.addFields(elementType); } else { @@ -84,14 +82,17 @@ public Type list(GroupType list, Type elementType) { @Override public Type map(GroupType map, Type keyType, Type valueType) { - Preconditions.checkArgument(keyType != null && valueType != null, - "Map type must have both key field and value field"); + Preconditions.checkArgument( + keyType != null && valueType != null, "Map type must have both key field and value field"); MappedField field = nameMapping.find(currentPath()); - Type mapType = Types.buildGroup(map.getRepetition()) - .as(LogicalTypeAnnotation.mapType()) - .repeatedGroup().addFields(keyType, valueType).named(map.getFieldName(0)) - .named(map.getName()); + Type mapType = + Types.buildGroup(map.getRepetition()) + .as(LogicalTypeAnnotation.mapType()) + .repeatedGroup() + .addFields(keyType, valueType) + .named(map.getFieldName(0)) + .named(map.getName()); return field == null ? mapType : mapType.withId(field.id()); } @@ -114,7 +115,8 @@ public void afterField(Type type) { @Override public void beforeElementField(Type element) { - // normalize the name to "element" so that the mapping will match structures with alternative names + // normalize the name to "element" so that the mapping will match structures with alternative + // names fieldNames.push(LIST_ELEMENT_NAME); } @@ -126,7 +128,8 @@ public void beforeKeyField(Type key) { @Override public void beforeValueField(Type key) { - // normalize the name to "value" so that the mapping will match structures with alternative names + // normalize the name to "value" so that the mapping will match structures with alternative + // names fieldNames.push(MAP_VALUE_NAME); } diff --git a/parquet/src/main/java/org/apache/iceberg/parquet/BaseColumnIterator.java b/parquet/src/main/java/org/apache/iceberg/parquet/BaseColumnIterator.java index 3edc20ff54a5..647397fad670 100644 --- a/parquet/src/main/java/org/apache/iceberg/parquet/BaseColumnIterator.java +++ b/parquet/src/main/java/org/apache/iceberg/parquet/BaseColumnIterator.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.parquet; import org.apache.parquet.column.ColumnDescriptor; @@ -71,5 +70,4 @@ protected void advance() { public boolean hasNext() { return triplesRead < triplesCount; } - } diff --git a/parquet/src/main/java/org/apache/iceberg/parquet/BasePageIterator.java b/parquet/src/main/java/org/apache/iceberg/parquet/BasePageIterator.java index 87cf33d60c4c..75989e8f649b 100644 --- a/parquet/src/main/java/org/apache/iceberg/parquet/BasePageIterator.java +++ b/parquet/src/main/java/org/apache/iceberg/parquet/BasePageIterator.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.parquet; import java.io.IOException; @@ -72,13 +71,15 @@ protected void reset() { this.hasNext = false; } - protected abstract void initDataReader(Encoding dataEncoding, ByteBufferInputStream in, int valueCount); + protected abstract void initDataReader( + Encoding dataEncoding, ByteBufferInputStream in, int valueCount); - protected abstract void initDefinitionLevelsReader(DataPageV1 dataPageV1, ColumnDescriptor descriptor, - ByteBufferInputStream in, int count) throws IOException; + protected abstract void initDefinitionLevelsReader( + DataPageV1 dataPageV1, ColumnDescriptor descriptor, ByteBufferInputStream in, int count) + throws IOException; - protected abstract void initDefinitionLevelsReader(DataPageV2 dataPageV2, ColumnDescriptor descriptor) - throws IOException; + protected abstract void initDefinitionLevelsReader( + DataPageV2 dataPageV2, ColumnDescriptor descriptor) throws IOException; public int currentPageCount() { return triplesCount; @@ -91,26 +92,28 @@ public boolean hasNext() { public void setPage(DataPage page) { Preconditions.checkNotNull(page, "Cannot read from null page"); this.page = page; - this.page.accept(new DataPage.Visitor() { - @Override - public ValuesReader visit(DataPageV1 dataPageV1) { - initFromPage(dataPageV1); - return null; - } - - @Override - public ValuesReader visit(DataPageV2 dataPageV2) { - initFromPage(dataPageV2); - return null; - } - }); + this.page.accept( + new DataPage.Visitor() { + @Override + public ValuesReader visit(DataPageV1 dataPageV1) { + initFromPage(dataPageV1); + return null; + } + + @Override + public ValuesReader visit(DataPageV2 dataPageV2) { + initFromPage(dataPageV2); + return null; + } + }); this.triplesRead = 0; this.hasNext = triplesRead < triplesCount; } protected void initFromPage(DataPageV1 initPage) { this.triplesCount = initPage.getValueCount(); - ValuesReader rlReader = initPage.getRlEncoding().getValuesReader(desc, ValuesType.REPETITION_LEVEL); + ValuesReader rlReader = + initPage.getRlEncoding().getValuesReader(desc, ValuesType.REPETITION_LEVEL); this.repetitionLevels = new ValuesReaderIntIterator(rlReader); try { BytesInput bytes = initPage.getBytes(); @@ -129,7 +132,8 @@ protected void initFromPage(DataPageV1 initPage) { protected void initFromPage(DataPageV2 initPage) { this.triplesCount = initPage.getValueCount(); - this.repetitionLevels = newRLEIterator(desc.getMaxRepetitionLevel(), initPage.getRepetitionLevels()); + this.repetitionLevels = + newRLEIterator(desc.getMaxRepetitionLevel(), initPage.getRepetitionLevels()); try { initDefinitionLevelsReader(initPage, desc); LOG.debug("page data size {} bytes and {} records", initPage.getData().size(), triplesCount); @@ -167,8 +171,7 @@ IntIterator newRLEIterator(int maxLevel, BytesInput bytes) { } return new RLEIntIterator( new RunLengthBitPackingHybridDecoder( - 
BytesUtils.getWidthFromMaxInt(maxLevel), - bytes.toInputStream())); + BytesUtils.getWidthFromMaxInt(maxLevel), bytes.toInputStream())); } catch (IOException e) { throw new ParquetDecodingException("could not read levels in page for col " + desc, e); } @@ -197,5 +200,4 @@ int nextInt() { return 0; } } - } diff --git a/parquet/src/main/java/org/apache/iceberg/parquet/ColumnIterator.java b/parquet/src/main/java/org/apache/iceberg/parquet/ColumnIterator.java index 99a7453cf0b2..1c0ea4829eb8 100644 --- a/parquet/src/main/java/org/apache/iceberg/parquet/ColumnIterator.java +++ b/parquet/src/main/java/org/apache/iceberg/parquet/ColumnIterator.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.parquet; import org.apache.parquet.column.ColumnDescriptor; @@ -27,58 +26,65 @@ public abstract class ColumnIterator extends BaseColumnIterator implements Tr static ColumnIterator newIterator(ColumnDescriptor desc, String writerVersion) { switch (desc.getPrimitiveType().getPrimitiveTypeName()) { case BOOLEAN: - return (ColumnIterator) new ColumnIterator(desc, writerVersion) { - @Override - public Boolean next() { - return nextBoolean(); - } - }; + return (ColumnIterator) + new ColumnIterator(desc, writerVersion) { + @Override + public Boolean next() { + return nextBoolean(); + } + }; case INT32: - return (ColumnIterator) new ColumnIterator(desc, writerVersion) { - @Override - public Integer next() { - return nextInteger(); - } - }; + return (ColumnIterator) + new ColumnIterator(desc, writerVersion) { + @Override + public Integer next() { + return nextInteger(); + } + }; case INT64: - return (ColumnIterator) new ColumnIterator(desc, writerVersion) { - @Override - public Long next() { - return nextLong(); - } - }; + return (ColumnIterator) + new ColumnIterator(desc, writerVersion) { + @Override + public Long next() { + return nextLong(); + } + }; case INT96: - return (ColumnIterator) new ColumnIterator(desc, writerVersion) { - @Override - public Binary next() { - return nextBinary(); - } - }; + return (ColumnIterator) + new ColumnIterator(desc, writerVersion) { + @Override + public Binary next() { + return nextBinary(); + } + }; case FLOAT: - return (ColumnIterator) new ColumnIterator(desc, writerVersion) { - @Override - public Float next() { - return nextFloat(); - } - }; + return (ColumnIterator) + new ColumnIterator(desc, writerVersion) { + @Override + public Float next() { + return nextFloat(); + } + }; case DOUBLE: - return (ColumnIterator) new ColumnIterator(desc, writerVersion) { - @Override - public Double next() { - return nextDouble(); - } - }; + return (ColumnIterator) + new ColumnIterator(desc, writerVersion) { + @Override + public Double next() { + return nextDouble(); + } + }; case FIXED_LEN_BYTE_ARRAY: case BINARY: - return (ColumnIterator) new ColumnIterator(desc, writerVersion) { - @Override - public Binary next() { - return nextBinary(); - } - }; + return (ColumnIterator) + new ColumnIterator(desc, writerVersion) { + @Override + public Binary next() { + return nextBinary(); + } + }; default: - throw new UnsupportedOperationException("Unsupported primitive type: " + - desc.getPrimitiveType().getPrimitiveTypeName()); + throw new UnsupportedOperationException( + "Unsupported primitive type: " + desc.getPrimitiveType().getPrimitiveTypeName()); } } @@ -154,5 +160,4 @@ public N nextNull() { protected BasePageIterator pageIterator() { return pageIterator; } - } diff --git 
a/parquet/src/main/java/org/apache/iceberg/parquet/ColumnWriter.java b/parquet/src/main/java/org/apache/iceberg/parquet/ColumnWriter.java index 06f3575a6bac..57a7b5800a9c 100644 --- a/parquet/src/main/java/org/apache/iceberg/parquet/ColumnWriter.java +++ b/parquet/src/main/java/org/apache/iceberg/parquet/ColumnWriter.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.parquet; import org.apache.parquet.column.ColumnDescriptor; @@ -28,51 +27,57 @@ public abstract class ColumnWriter implements TripleWriter { static ColumnWriter newWriter(ColumnDescriptor desc) { switch (desc.getPrimitiveType().getPrimitiveTypeName()) { case BOOLEAN: - return (ColumnWriter) new ColumnWriter(desc) { - @Override - public void write(int rl, Boolean value) { - writeBoolean(rl, value); - } - }; + return (ColumnWriter) + new ColumnWriter(desc) { + @Override + public void write(int rl, Boolean value) { + writeBoolean(rl, value); + } + }; case INT32: - return (ColumnWriter) new ColumnWriter(desc) { - @Override - public void write(int rl, Integer value) { - writeInteger(rl, value); - } - }; + return (ColumnWriter) + new ColumnWriter(desc) { + @Override + public void write(int rl, Integer value) { + writeInteger(rl, value); + } + }; case INT64: - return (ColumnWriter) new ColumnWriter(desc) { - @Override - public void write(int rl, Long value) { - writeLong(rl, value); - } - }; + return (ColumnWriter) + new ColumnWriter(desc) { + @Override + public void write(int rl, Long value) { + writeLong(rl, value); + } + }; case FLOAT: - return (ColumnWriter) new ColumnWriter(desc) { - @Override - public void write(int rl, Float value) { - writeFloat(rl, value); - } - }; + return (ColumnWriter) + new ColumnWriter(desc) { + @Override + public void write(int rl, Float value) { + writeFloat(rl, value); + } + }; case DOUBLE: - return (ColumnWriter) new ColumnWriter(desc) { - @Override - public void write(int rl, Double value) { - writeDouble(rl, value); - } - }; + return (ColumnWriter) + new ColumnWriter(desc) { + @Override + public void write(int rl, Double value) { + writeDouble(rl, value); + } + }; case FIXED_LEN_BYTE_ARRAY: case BINARY: - return (ColumnWriter) new ColumnWriter(desc) { - @Override - public void write(int rl, Binary value) { - writeBinary(rl, value); - } - }; + return (ColumnWriter) + new ColumnWriter(desc) { + @Override + public void write(int rl, Binary value) { + writeBinary(rl, value); + } + }; default: - throw new UnsupportedOperationException("Unsupported primitive type: " + - desc.getPrimitiveType().getPrimitiveTypeName()); + throw new UnsupportedOperationException( + "Unsupported primitive type: " + desc.getPrimitiveType().getPrimitiveTypeName()); } } diff --git a/parquet/src/main/java/org/apache/iceberg/parquet/MessageTypeToType.java b/parquet/src/main/java/org/apache/iceberg/parquet/MessageTypeToType.java index e1476a06c877..26ef6e468ede 100644 --- a/parquet/src/main/java/org/apache/iceberg/parquet/MessageTypeToType.java +++ b/parquet/src/main/java/org/apache/iceberg/parquet/MessageTypeToType.java @@ -16,9 +16,11 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.parquet; +import static org.apache.iceberg.types.Types.NestedField.optional; +import static org.apache.iceberg.types.Types.NestedField.required; + import java.util.List; import java.util.Map; import java.util.Optional; @@ -36,13 +38,10 @@ import org.apache.parquet.schema.PrimitiveType; import org.apache.parquet.schema.Type.Repetition; -import static org.apache.iceberg.types.Types.NestedField.optional; -import static org.apache.iceberg.types.Types.NestedField.required; - /** * A visitor that converts a {@link MessageType} to a {@link Type} in Iceberg. - *

    - * Fields we could not determine IDs for will be pruned. + *
    + * <p>
    Fields we could not determine IDs for will be pruned. */ class MessageTypeToType extends ParquetTypeVisitor { private static final Joiner DOT = Joiner.on("."); @@ -74,7 +73,8 @@ public Type struct(GroupType struct, List fieldTypes) { Preconditions.checkArgument( !field.isRepetition(Repetition.REPEATED), - "Fields cannot have repetition REPEATED: %s", field); + "Fields cannot have repetition REPEATED: %s", + field); Integer fieldId = getId(field); Type fieldType = fieldTypes.get(i); @@ -122,12 +122,14 @@ public Type map(GroupType map, Type keyType, Type valueType) { Preconditions.checkArgument( !value.isRepetition(Repetition.REPEATED), - "Values cannot have repetition REPEATED: %s", value); + "Values cannot have repetition REPEATED: %s", + value); Integer keyFieldId = getId(key); Integer valueFieldId = getId(value); - // keep the map if its key and values have ids and were not pruned (i.e. their types are not null) + // keep the map if its key and values have ids and were not pruned (i.e. their types are not + // null) if (keyFieldId != null && valueFieldId != null && keyType != null && valueType != null) { addAlias(key.getName(), keyFieldId); addAlias(value.getName(), valueFieldId); @@ -174,11 +176,11 @@ public Type primitive(PrimitiveType primitive) { return Types.BinaryType.get(); } - throw new UnsupportedOperationException( - "Cannot convert unknown primitive type: " + primitive); + throw new UnsupportedOperationException("Cannot convert unknown primitive type: " + primitive); } - private static class ParquetLogicalTypeVisitor implements LogicalTypeAnnotation.LogicalTypeAnnotationVisitor { + private static class ParquetLogicalTypeVisitor + implements LogicalTypeAnnotation.LogicalTypeAnnotationVisitor { private static final ParquetLogicalTypeVisitor INSTANCE = new ParquetLogicalTypeVisitor(); private static ParquetLogicalTypeVisitor get() { @@ -211,13 +213,16 @@ public Optional visit(LogicalTypeAnnotation.TimeLogicalTypeAnnotation time } @Override - public Optional visit(LogicalTypeAnnotation.TimestampLogicalTypeAnnotation timestampType) { - return Optional.of(timestampType.isAdjustedToUTC() ? TimestampType.withZone() : TimestampType.withoutZone()); + public Optional visit( + LogicalTypeAnnotation.TimestampLogicalTypeAnnotation timestampType) { + return Optional.of( + timestampType.isAdjustedToUTC() ? TimestampType.withZone() : TimestampType.withoutZone()); } @Override public Optional visit(LogicalTypeAnnotation.IntLogicalTypeAnnotation intType) { - Preconditions.checkArgument(intType.isSigned() || intType.getBitWidth() < 64, + Preconditions.checkArgument( + intType.isSigned() || intType.getBitWidth() < 64, "Cannot use uint64: not a supported Java type"); if (intType.getBitWidth() < 32) { return Optional.of(Types.IntegerType.get()); diff --git a/parquet/src/main/java/org/apache/iceberg/parquet/PageIterator.java b/parquet/src/main/java/org/apache/iceberg/parquet/PageIterator.java index cee5e9ad72e7..34383352bf68 100644 --- a/parquet/src/main/java/org/apache/iceberg/parquet/PageIterator.java +++ b/parquet/src/main/java/org/apache/iceberg/parquet/PageIterator.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.parquet; import java.io.IOException; @@ -39,58 +38,65 @@ abstract class PageIterator extends BasePageIterator implements TripleIterato static PageIterator newIterator(ColumnDescriptor desc, String writerVersion) { switch (desc.getPrimitiveType().getPrimitiveTypeName()) { case BOOLEAN: - return (PageIterator) new PageIterator(desc, writerVersion) { - @Override - public Boolean next() { - return nextBoolean(); - } - }; + return (PageIterator) + new PageIterator(desc, writerVersion) { + @Override + public Boolean next() { + return nextBoolean(); + } + }; case INT32: - return (PageIterator) new PageIterator(desc, writerVersion) { - @Override - public Integer next() { - return nextInteger(); - } - }; + return (PageIterator) + new PageIterator(desc, writerVersion) { + @Override + public Integer next() { + return nextInteger(); + } + }; case INT64: - return (PageIterator) new PageIterator(desc, writerVersion) { - @Override - public Long next() { - return nextLong(); - } - }; + return (PageIterator) + new PageIterator(desc, writerVersion) { + @Override + public Long next() { + return nextLong(); + } + }; case INT96: - return (PageIterator) new PageIterator(desc, writerVersion) { - @Override - public Binary next() { - return nextBinary(); - } - }; + return (PageIterator) + new PageIterator(desc, writerVersion) { + @Override + public Binary next() { + return nextBinary(); + } + }; case FLOAT: - return (PageIterator) new PageIterator(desc, writerVersion) { - @Override - public Float next() { - return nextFloat(); - } - }; + return (PageIterator) + new PageIterator(desc, writerVersion) { + @Override + public Float next() { + return nextFloat(); + } + }; case DOUBLE: - return (PageIterator) new PageIterator(desc, writerVersion) { - @Override - public Double next() { - return nextDouble(); - } - }; + return (PageIterator) + new PageIterator(desc, writerVersion) { + @Override + public Double next() { + return nextDouble(); + } + }; case FIXED_LEN_BYTE_ARRAY: case BINARY: - return (PageIterator) new PageIterator(desc, writerVersion) { - @Override - public Binary next() { - return nextBinary(); - } - }; + return (PageIterator) + new PageIterator(desc, writerVersion) { + @Override + public Binary next() { + return nextBinary(); + } + }; default: - throw new UnsupportedOperationException("Unsupported primitive type: " + - desc.getPrimitiveType().getPrimitiveTypeName()); + throw new UnsupportedOperationException( + "Unsupported primitive type: " + desc.getPrimitiveType().getPrimitiveTypeName()); } } @@ -112,7 +118,7 @@ public int currentDefinitionLevel() { @Override public int currentRepetitionLevel() { -// Preconditions.checkArgument(currentDL >= 0, "Should not read repetition, past page end"); + // Preconditions.checkArgument(currentDL >= 0, "Should not read repetition, past page end"); return currentRL; } @@ -197,21 +203,23 @@ private void advance() { } RuntimeException handleRuntimeException(RuntimeException exception) { - if (CorruptDeltaByteArrays.requiresSequentialReads(writerVersion, valueEncoding) && - exception instanceof ArrayIndexOutOfBoundsException) { + if (CorruptDeltaByteArrays.requiresSequentialReads(writerVersion, valueEncoding) + && exception instanceof ArrayIndexOutOfBoundsException) { // this is probably PARQUET-246, which may happen if reading data with // MR because this can't be detected without reading all footers - throw new ParquetDecodingException("Read failure possibly due to " + - "PARQUET-246: try setting parquet.split.files to false", + throw new 
ParquetDecodingException( + "Read failure possibly due to " + "PARQUET-246: try setting parquet.split.files to false", new ParquetDecodingException( - String.format("Can't read value in column %s at value %d out of %d in current page. " + - "repetition level: %d, definition level: %d", + String.format( + "Can't read value in column %s at value %d out of %d in current page. " + + "repetition level: %d, definition level: %d", desc, triplesRead, triplesCount, currentRL, currentDL), exception)); } throw new ParquetDecodingException( - String.format("Can't read value in column %s at value %d out of %d in current page. " + - "repetition level: %d, definition level: %d", + String.format( + "Can't read value in column %s at value %d out of %d in current page. " + + "repetition level: %d, definition level: %d", desc, triplesRead, triplesCount, currentRL, currentDL), exception); } @@ -228,18 +236,22 @@ protected void initDataReader(Encoding dataEncoding, ByteBufferInputStream in, i if (dataEncoding.usesDictionary()) { if (dictionary == null) { throw new ParquetDecodingException( - "could not read page in col " + desc + " as the dictionary was missing for encoding " + dataEncoding); + "could not read page in col " + + desc + + " as the dictionary was missing for encoding " + + dataEncoding); } - this.values = dataEncoding.getDictionaryBasedValuesReader(desc, ValuesType.VALUES, dictionary); + this.values = + dataEncoding.getDictionaryBasedValuesReader(desc, ValuesType.VALUES, dictionary); } else { this.values = dataEncoding.getValuesReader(desc, ValuesType.VALUES); } -// if (dataEncoding.usesDictionary() && converter.hasDictionarySupport()) { -// bindToDictionary(dictionary); -// } else { -// bind(path.getType()); -// } + // if (dataEncoding.usesDictionary() && converter.hasDictionarySupport()) { + // bindToDictionary(dictionary); + // } else { + // bind(path.getType()); + // } try { values.initFromPage(valueCount, in); @@ -247,24 +259,26 @@ protected void initDataReader(Encoding dataEncoding, ByteBufferInputStream in, i throw new ParquetDecodingException("could not read page in col " + desc, e); } - if (CorruptDeltaByteArrays.requiresSequentialReads(writerVersion, dataEncoding) && - previousReader instanceof RequiresPreviousReader) { + if (CorruptDeltaByteArrays.requiresSequentialReads(writerVersion, dataEncoding) + && previousReader instanceof RequiresPreviousReader) { // previous reader can only be set if reading sequentially ((RequiresPreviousReader) values).setPreviousReader(previousReader); } } @Override - protected void initDefinitionLevelsReader(DataPageV1 dataPageV1, ColumnDescriptor desc, ByteBufferInputStream in, - int triplesCount) throws IOException { - ValuesReader dlReader = dataPageV1.getDlEncoding().getValuesReader(desc, ValuesType.DEFINITION_LEVEL); + protected void initDefinitionLevelsReader( + DataPageV1 dataPageV1, ColumnDescriptor desc, ByteBufferInputStream in, int triplesCount) + throws IOException { + ValuesReader dlReader = + dataPageV1.getDlEncoding().getValuesReader(desc, ValuesType.DEFINITION_LEVEL); this.definitionLevels = new ValuesReaderIntIterator(dlReader); dlReader.initFromPage(triplesCount, in); } @Override protected void initDefinitionLevelsReader(DataPageV2 dataPageV2, ColumnDescriptor desc) { - this.definitionLevels = newRLEIterator(desc.getMaxDefinitionLevel(), dataPageV2.getDefinitionLevels()); + this.definitionLevels = + newRLEIterator(desc.getMaxDefinitionLevel(), dataPageV2.getDefinitionLevels()); } - } diff --git 
a/parquet/src/main/java/org/apache/iceberg/parquet/Parquet.java b/parquet/src/main/java/org/apache/iceberg/parquet/Parquet.java index 1561283febcf..ac8856b92345 100644 --- a/parquet/src/main/java/org/apache/iceberg/parquet/Parquet.java +++ b/parquet/src/main/java/org/apache/iceberg/parquet/Parquet.java @@ -16,9 +16,33 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.parquet; +import static org.apache.iceberg.TableProperties.DELETE_PARQUET_COMPRESSION; +import static org.apache.iceberg.TableProperties.DELETE_PARQUET_COMPRESSION_LEVEL; +import static org.apache.iceberg.TableProperties.DELETE_PARQUET_DICT_SIZE_BYTES; +import static org.apache.iceberg.TableProperties.DELETE_PARQUET_PAGE_SIZE_BYTES; +import static org.apache.iceberg.TableProperties.DELETE_PARQUET_ROW_GROUP_CHECK_MAX_RECORD_COUNT; +import static org.apache.iceberg.TableProperties.DELETE_PARQUET_ROW_GROUP_CHECK_MIN_RECORD_COUNT; +import static org.apache.iceberg.TableProperties.DELETE_PARQUET_ROW_GROUP_SIZE_BYTES; +import static org.apache.iceberg.TableProperties.PARQUET_BLOOM_FILTER_COLUMN_ENABLED_PREFIX; +import static org.apache.iceberg.TableProperties.PARQUET_BLOOM_FILTER_MAX_BYTES; +import static org.apache.iceberg.TableProperties.PARQUET_BLOOM_FILTER_MAX_BYTES_DEFAULT; +import static org.apache.iceberg.TableProperties.PARQUET_COMPRESSION; +import static org.apache.iceberg.TableProperties.PARQUET_COMPRESSION_DEFAULT; +import static org.apache.iceberg.TableProperties.PARQUET_COMPRESSION_LEVEL; +import static org.apache.iceberg.TableProperties.PARQUET_COMPRESSION_LEVEL_DEFAULT; +import static org.apache.iceberg.TableProperties.PARQUET_DICT_SIZE_BYTES; +import static org.apache.iceberg.TableProperties.PARQUET_DICT_SIZE_BYTES_DEFAULT; +import static org.apache.iceberg.TableProperties.PARQUET_PAGE_SIZE_BYTES; +import static org.apache.iceberg.TableProperties.PARQUET_PAGE_SIZE_BYTES_DEFAULT; +import static org.apache.iceberg.TableProperties.PARQUET_ROW_GROUP_CHECK_MAX_RECORD_COUNT; +import static org.apache.iceberg.TableProperties.PARQUET_ROW_GROUP_CHECK_MAX_RECORD_COUNT_DEFAULT; +import static org.apache.iceberg.TableProperties.PARQUET_ROW_GROUP_CHECK_MIN_RECORD_COUNT; +import static org.apache.iceberg.TableProperties.PARQUET_ROW_GROUP_CHECK_MIN_RECORD_COUNT_DEFAULT; +import static org.apache.iceberg.TableProperties.PARQUET_ROW_GROUP_SIZE_BYTES; +import static org.apache.iceberg.TableProperties.PARQUET_ROW_GROUP_SIZE_BYTES_DEFAULT; + import java.io.File; import java.io.IOException; import java.util.Collection; @@ -78,37 +102,14 @@ import org.apache.parquet.hadoop.metadata.CompressionCodecName; import org.apache.parquet.schema.MessageType; -import static org.apache.iceberg.TableProperties.DELETE_PARQUET_COMPRESSION; -import static org.apache.iceberg.TableProperties.DELETE_PARQUET_COMPRESSION_LEVEL; -import static org.apache.iceberg.TableProperties.DELETE_PARQUET_DICT_SIZE_BYTES; -import static org.apache.iceberg.TableProperties.DELETE_PARQUET_PAGE_SIZE_BYTES; -import static org.apache.iceberg.TableProperties.DELETE_PARQUET_ROW_GROUP_CHECK_MAX_RECORD_COUNT; -import static org.apache.iceberg.TableProperties.DELETE_PARQUET_ROW_GROUP_CHECK_MIN_RECORD_COUNT; -import static org.apache.iceberg.TableProperties.DELETE_PARQUET_ROW_GROUP_SIZE_BYTES; -import static org.apache.iceberg.TableProperties.PARQUET_BLOOM_FILTER_COLUMN_ENABLED_PREFIX; -import static org.apache.iceberg.TableProperties.PARQUET_BLOOM_FILTER_MAX_BYTES; -import static 
org.apache.iceberg.TableProperties.PARQUET_BLOOM_FILTER_MAX_BYTES_DEFAULT; -import static org.apache.iceberg.TableProperties.PARQUET_COMPRESSION; -import static org.apache.iceberg.TableProperties.PARQUET_COMPRESSION_DEFAULT; -import static org.apache.iceberg.TableProperties.PARQUET_COMPRESSION_LEVEL; -import static org.apache.iceberg.TableProperties.PARQUET_COMPRESSION_LEVEL_DEFAULT; -import static org.apache.iceberg.TableProperties.PARQUET_DICT_SIZE_BYTES; -import static org.apache.iceberg.TableProperties.PARQUET_DICT_SIZE_BYTES_DEFAULT; -import static org.apache.iceberg.TableProperties.PARQUET_PAGE_SIZE_BYTES; -import static org.apache.iceberg.TableProperties.PARQUET_PAGE_SIZE_BYTES_DEFAULT; -import static org.apache.iceberg.TableProperties.PARQUET_ROW_GROUP_CHECK_MAX_RECORD_COUNT; -import static org.apache.iceberg.TableProperties.PARQUET_ROW_GROUP_CHECK_MAX_RECORD_COUNT_DEFAULT; -import static org.apache.iceberg.TableProperties.PARQUET_ROW_GROUP_CHECK_MIN_RECORD_COUNT; -import static org.apache.iceberg.TableProperties.PARQUET_ROW_GROUP_CHECK_MIN_RECORD_COUNT_DEFAULT; -import static org.apache.iceberg.TableProperties.PARQUET_ROW_GROUP_SIZE_BYTES; -import static org.apache.iceberg.TableProperties.PARQUET_ROW_GROUP_SIZE_BYTES_DEFAULT; - public class Parquet { - private Parquet() { - } + private Parquet() {} - private static final Collection READ_PROPERTIES_TO_REMOVE = Sets.newHashSet( - "parquet.read.filter", "parquet.private.read.filter.predicate", "parquet.read.support.class"); + private static final Collection READ_PROPERTIES_TO_REMOVE = + Sets.newHashSet( + "parquet.read.filter", + "parquet.private.read.filter.predicate", + "parquet.read.support.class"); public static WriteBuilder write(OutputFile file) { return new WriteBuilder(file); @@ -174,7 +175,8 @@ public WriteBuilder meta(String property, String value) { return this; } - public WriteBuilder createWriterFunc(Function> newCreateWriterFunc) { + public WriteBuilder createWriterFunc( + Function> newCreateWriterFunc) { this.createWriterFunc = newCreateWriterFunc; return this; } @@ -220,7 +222,8 @@ WriteBuilder withWriterVersion(WriterVersion version) { } // supposed to always be a private method used strictly by data and delete write builders - private WriteBuilder createContextFunc(Function, Context> newCreateContextFunc) { + private WriteBuilder createContextFunc( + Function, Context> newCreateContextFunc) { this.createContextFunc = newCreateContextFunc; return this; } @@ -267,20 +270,21 @@ public FileAppender build() throws IOException { MessageType type = ParquetSchemaUtil.convert(schema, name); if (createWriterFunc != null) { - Preconditions.checkArgument(writeSupport == null, - "Cannot write with both write support and Parquet value writer"); + Preconditions.checkArgument( + writeSupport == null, "Cannot write with both write support and Parquet value writer"); for (Map.Entry entry : config.entrySet()) { conf.set(entry.getKey(), entry.getValue()); } - ParquetProperties.Builder propsBuilder = ParquetProperties.builder() - .withWriterVersion(writerVersion) - .withPageSize(pageSize) - .withDictionaryPageSize(dictionaryPageSize) - .withMinRowCountForPageSizeCheck(rowGroupCheckMinRecordCount) - .withMaxRowCountForPageSizeCheck(rowGroupCheckMaxRecordCount) - .withMaxBloomFilterBytes(bloomFilterMaxBytes); + ParquetProperties.Builder propsBuilder = + ParquetProperties.builder() + .withWriterVersion(writerVersion) + .withPageSize(pageSize) + .withDictionaryPageSize(dictionaryPageSize) + 
.withMinRowCountForPageSizeCheck(rowGroupCheckMinRecordCount) + .withMaxRowCountForPageSizeCheck(rowGroupCheckMaxRecordCount) + .withMaxBloomFilterBytes(bloomFilterMaxBytes); for (Map.Entry entry : columnBloomFilterEnabled.entrySet()) { String colPath = entry.getKey(); @@ -291,20 +295,29 @@ public FileAppender build() throws IOException { ParquetProperties parquetProperties = propsBuilder.build(); return new org.apache.iceberg.parquet.ParquetWriter<>( - conf, file, schema, rowGroupSize, metadata, createWriterFunc, codec, - parquetProperties, metricsConfig, writeMode); + conf, + file, + schema, + rowGroupSize, + metadata, + createWriterFunc, + codec, + parquetProperties, + metricsConfig, + writeMode); } else { - ParquetWriteBuilder parquetWriteBuilder = new ParquetWriteBuilder(ParquetIO.file(file)) - .withWriterVersion(writerVersion) - .setType(type) - .setConfig(config) - .setKeyValueMetadata(metadata) - .setWriteSupport(getWriteSupport(type)) - .withCompressionCodec(codec) - .withWriteMode(writeMode) - .withRowGroupSize(rowGroupSize) - .withPageSize(pageSize) - .withDictionaryPageSize(dictionaryPageSize); + ParquetWriteBuilder parquetWriteBuilder = + new ParquetWriteBuilder(ParquetIO.file(file)) + .withWriterVersion(writerVersion) + .setType(type) + .setConfig(config) + .setKeyValueMetadata(metadata) + .setWriteSupport(getWriteSupport(type)) + .withCompressionCodec(codec) + .withWriteMode(writeMode) + .withRowGroupSize(rowGroupSize) + .withPageSize(pageSize) + .withDictionaryPageSize(dictionaryPageSize); for (Map.Entry entry : columnBloomFilterEnabled.entrySet()) { String colPath = entry.getKey(); @@ -327,11 +340,16 @@ private static class Context { private final int bloomFilterMaxBytes; private final Map columnBloomFilterEnabled; - private Context(int rowGroupSize, int pageSize, int dictionaryPageSize, - CompressionCodecName codec, String compressionLevel, - int rowGroupCheckMinRecordCount, int rowGroupCheckMaxRecordCount, - int bloomFilterMaxBytes, - Map columnBloomFilterEnabled) { + private Context( + int rowGroupSize, + int pageSize, + int dictionaryPageSize, + CompressionCodecName codec, + String compressionLevel, + int rowGroupCheckMinRecordCount, + int rowGroupCheckMaxRecordCount, + int bloomFilterMaxBytes, + Map columnBloomFilterEnabled) { this.rowGroupSize = rowGroupSize; this.pageSize = pageSize; this.dictionaryPageSize = dictionaryPageSize; @@ -344,44 +362,64 @@ private Context(int rowGroupSize, int pageSize, int dictionaryPageSize, } static Context dataContext(Map config) { - int rowGroupSize = PropertyUtil.propertyAsInt(config, - PARQUET_ROW_GROUP_SIZE_BYTES, PARQUET_ROW_GROUP_SIZE_BYTES_DEFAULT); + int rowGroupSize = + PropertyUtil.propertyAsInt( + config, PARQUET_ROW_GROUP_SIZE_BYTES, PARQUET_ROW_GROUP_SIZE_BYTES_DEFAULT); Preconditions.checkArgument(rowGroupSize > 0, "Row group size must be > 0"); - int pageSize = PropertyUtil.propertyAsInt(config, - PARQUET_PAGE_SIZE_BYTES, PARQUET_PAGE_SIZE_BYTES_DEFAULT); + int pageSize = + PropertyUtil.propertyAsInt( + config, PARQUET_PAGE_SIZE_BYTES, PARQUET_PAGE_SIZE_BYTES_DEFAULT); Preconditions.checkArgument(pageSize > 0, "Page size must be > 0"); - int dictionaryPageSize = PropertyUtil.propertyAsInt(config, - PARQUET_DICT_SIZE_BYTES, PARQUET_DICT_SIZE_BYTES_DEFAULT); + int dictionaryPageSize = + PropertyUtil.propertyAsInt( + config, PARQUET_DICT_SIZE_BYTES, PARQUET_DICT_SIZE_BYTES_DEFAULT); Preconditions.checkArgument(dictionaryPageSize > 0, "Dictionary page size must be > 0"); - String codecAsString = 
config.getOrDefault(PARQUET_COMPRESSION, PARQUET_COMPRESSION_DEFAULT); + String codecAsString = + config.getOrDefault(PARQUET_COMPRESSION, PARQUET_COMPRESSION_DEFAULT); CompressionCodecName codec = toCodec(codecAsString); - String compressionLevel = config.getOrDefault(PARQUET_COMPRESSION_LEVEL, PARQUET_COMPRESSION_LEVEL_DEFAULT); - - int rowGroupCheckMinRecordCount = PropertyUtil.propertyAsInt(config, - PARQUET_ROW_GROUP_CHECK_MIN_RECORD_COUNT, PARQUET_ROW_GROUP_CHECK_MIN_RECORD_COUNT_DEFAULT); - Preconditions.checkArgument(rowGroupCheckMinRecordCount > 0, - "Row group check minimal record count must be > 0"); - - int rowGroupCheckMaxRecordCount = PropertyUtil.propertyAsInt(config, - PARQUET_ROW_GROUP_CHECK_MAX_RECORD_COUNT, PARQUET_ROW_GROUP_CHECK_MAX_RECORD_COUNT_DEFAULT); - Preconditions.checkArgument(rowGroupCheckMaxRecordCount > 0, - "Row group check maximum record count must be > 0"); - Preconditions.checkArgument(rowGroupCheckMaxRecordCount >= rowGroupCheckMinRecordCount, + String compressionLevel = + config.getOrDefault(PARQUET_COMPRESSION_LEVEL, PARQUET_COMPRESSION_LEVEL_DEFAULT); + + int rowGroupCheckMinRecordCount = + PropertyUtil.propertyAsInt( + config, + PARQUET_ROW_GROUP_CHECK_MIN_RECORD_COUNT, + PARQUET_ROW_GROUP_CHECK_MIN_RECORD_COUNT_DEFAULT); + Preconditions.checkArgument( + rowGroupCheckMinRecordCount > 0, "Row group check minimal record count must be > 0"); + + int rowGroupCheckMaxRecordCount = + PropertyUtil.propertyAsInt( + config, + PARQUET_ROW_GROUP_CHECK_MAX_RECORD_COUNT, + PARQUET_ROW_GROUP_CHECK_MAX_RECORD_COUNT_DEFAULT); + Preconditions.checkArgument( + rowGroupCheckMaxRecordCount > 0, "Row group check maximum record count must be > 0"); + Preconditions.checkArgument( + rowGroupCheckMaxRecordCount >= rowGroupCheckMinRecordCount, "Row group check maximum record count must be >= minimal record count"); - int bloomFilterMaxBytes = PropertyUtil.propertyAsInt(config, PARQUET_BLOOM_FILTER_MAX_BYTES, - PARQUET_BLOOM_FILTER_MAX_BYTES_DEFAULT); + int bloomFilterMaxBytes = + PropertyUtil.propertyAsInt( + config, PARQUET_BLOOM_FILTER_MAX_BYTES, PARQUET_BLOOM_FILTER_MAX_BYTES_DEFAULT); Preconditions.checkArgument(bloomFilterMaxBytes > 0, "bloom Filter Max Bytes must be > 0"); Map columnBloomFilterEnabled = PropertyUtil.propertiesWithPrefix(config, PARQUET_BLOOM_FILTER_COLUMN_ENABLED_PREFIX); - return new Context(rowGroupSize, pageSize, dictionaryPageSize, codec, compressionLevel, - rowGroupCheckMinRecordCount, rowGroupCheckMaxRecordCount, bloomFilterMaxBytes, + return new Context( + rowGroupSize, + pageSize, + dictionaryPageSize, + codec, + compressionLevel, + rowGroupCheckMinRecordCount, + rowGroupCheckMaxRecordCount, + bloomFilterMaxBytes, columnBloomFilterEnabled); } @@ -389,44 +427,64 @@ static Context deleteContext(Map config) { // default delete config using data config Context dataContext = dataContext(config); - int rowGroupSize = PropertyUtil.propertyAsInt(config, - DELETE_PARQUET_ROW_GROUP_SIZE_BYTES, dataContext.rowGroupSize()); + int rowGroupSize = + PropertyUtil.propertyAsInt( + config, DELETE_PARQUET_ROW_GROUP_SIZE_BYTES, dataContext.rowGroupSize()); Preconditions.checkArgument(rowGroupSize > 0, "Row group size must be > 0"); - int pageSize = PropertyUtil.propertyAsInt(config, - DELETE_PARQUET_PAGE_SIZE_BYTES, dataContext.pageSize()); + int pageSize = + PropertyUtil.propertyAsInt( + config, DELETE_PARQUET_PAGE_SIZE_BYTES, dataContext.pageSize()); Preconditions.checkArgument(pageSize > 0, "Page size must be > 0"); - int dictionaryPageSize = 
PropertyUtil.propertyAsInt(config, - DELETE_PARQUET_DICT_SIZE_BYTES, dataContext.dictionaryPageSize()); + int dictionaryPageSize = + PropertyUtil.propertyAsInt( + config, DELETE_PARQUET_DICT_SIZE_BYTES, dataContext.dictionaryPageSize()); Preconditions.checkArgument(dictionaryPageSize > 0, "Dictionary page size must be > 0"); String codecAsString = config.get(DELETE_PARQUET_COMPRESSION); - CompressionCodecName codec = codecAsString != null ? toCodec(codecAsString) : dataContext.codec(); - - String compressionLevel = config.getOrDefault(DELETE_PARQUET_COMPRESSION_LEVEL, dataContext.compressionLevel()); - - int rowGroupCheckMinRecordCount = PropertyUtil.propertyAsInt(config, - DELETE_PARQUET_ROW_GROUP_CHECK_MIN_RECORD_COUNT, dataContext.rowGroupCheckMinRecordCount()); - Preconditions.checkArgument(rowGroupCheckMinRecordCount > 0, - "Row group check minimal record count must be > 0"); - - int rowGroupCheckMaxRecordCount = PropertyUtil.propertyAsInt(config, - DELETE_PARQUET_ROW_GROUP_CHECK_MAX_RECORD_COUNT, dataContext.rowGroupCheckMaxRecordCount()); - Preconditions.checkArgument(rowGroupCheckMaxRecordCount > 0, - "Row group check maximum record count must be > 0"); - Preconditions.checkArgument(rowGroupCheckMaxRecordCount >= rowGroupCheckMinRecordCount, + CompressionCodecName codec = + codecAsString != null ? toCodec(codecAsString) : dataContext.codec(); + + String compressionLevel = + config.getOrDefault(DELETE_PARQUET_COMPRESSION_LEVEL, dataContext.compressionLevel()); + + int rowGroupCheckMinRecordCount = + PropertyUtil.propertyAsInt( + config, + DELETE_PARQUET_ROW_GROUP_CHECK_MIN_RECORD_COUNT, + dataContext.rowGroupCheckMinRecordCount()); + Preconditions.checkArgument( + rowGroupCheckMinRecordCount > 0, "Row group check minimal record count must be > 0"); + + int rowGroupCheckMaxRecordCount = + PropertyUtil.propertyAsInt( + config, + DELETE_PARQUET_ROW_GROUP_CHECK_MAX_RECORD_COUNT, + dataContext.rowGroupCheckMaxRecordCount()); + Preconditions.checkArgument( + rowGroupCheckMaxRecordCount > 0, "Row group check maximum record count must be > 0"); + Preconditions.checkArgument( + rowGroupCheckMaxRecordCount >= rowGroupCheckMinRecordCount, "Row group check maximum record count must be >= minimal record count"); - int bloomFilterMaxBytes = PropertyUtil.propertyAsInt(config, PARQUET_BLOOM_FILTER_MAX_BYTES, - PARQUET_BLOOM_FILTER_MAX_BYTES_DEFAULT); + int bloomFilterMaxBytes = + PropertyUtil.propertyAsInt( + config, PARQUET_BLOOM_FILTER_MAX_BYTES, PARQUET_BLOOM_FILTER_MAX_BYTES_DEFAULT); Preconditions.checkArgument(bloomFilterMaxBytes > 0, "bloom Filter Max Bytes must be > 0"); Map columnBloomFilterEnabled = PropertyUtil.propertiesWithPrefix(config, PARQUET_BLOOM_FILTER_COLUMN_ENABLED_PREFIX); - return new Context(rowGroupSize, pageSize, dictionaryPageSize, codec, compressionLevel, - rowGroupCheckMinRecordCount, rowGroupCheckMaxRecordCount, bloomFilterMaxBytes, + return new Context( + rowGroupSize, + pageSize, + dictionaryPageSize, + codec, + compressionLevel, + rowGroupCheckMinRecordCount, + rowGroupCheckMaxRecordCount, + bloomFilterMaxBytes, columnBloomFilterEnabled); } @@ -535,7 +593,8 @@ public DataWriteBuilder metricsConfig(MetricsConfig newMetricsConfig) { return this; } - public DataWriteBuilder createWriterFunc(Function> newCreateWriterFunc) { + public DataWriteBuilder createWriterFunc( + Function> newCreateWriterFunc) { appenderBuilder.createWriterFunc(newCreateWriterFunc); return this; } @@ -562,11 +621,13 @@ public DataWriteBuilder withSortOrder(SortOrder newSortOrder) { public 
DataWriter build() throws IOException { Preconditions.checkArgument(spec != null, "Cannot create data writer without spec"); - Preconditions.checkArgument(spec.isUnpartitioned() || partition != null, + Preconditions.checkArgument( + spec.isUnpartitioned() || partition != null, "Partition must not be null when creating data writer for partitioned spec"); FileAppender fileAppender = appenderBuilder.build(); - return new DataWriter<>(fileAppender, FileFormat.PARQUET, location, spec, partition, keyMetadata, sortOrder); + return new DataWriter<>( + fileAppender, FileFormat.PARQUET, location, spec, partition, keyMetadata, sortOrder); } } @@ -628,7 +689,8 @@ public DeleteWriteBuilder metricsConfig(MetricsConfig newMetricsConfig) { return this; } - public DeleteWriteBuilder createWriterFunc(Function> newCreateWriterFunc) { + public DeleteWriteBuilder createWriterFunc( + Function> newCreateWriterFunc) { this.createWriterFunc = newCreateWriterFunc; return this; } @@ -674,19 +736,25 @@ public DeleteWriteBuilder withSortOrder(SortOrder newSortOrder) { } public EqualityDeleteWriter buildEqualityWriter() throws IOException { - Preconditions.checkState(rowSchema != null, "Cannot create equality delete file without a schema"); - Preconditions.checkState(equalityFieldIds != null, "Cannot create equality delete file without delete field ids"); - Preconditions.checkState(createWriterFunc != null, + Preconditions.checkState( + rowSchema != null, "Cannot create equality delete file without a schema"); + Preconditions.checkState( + equalityFieldIds != null, "Cannot create equality delete file without delete field ids"); + Preconditions.checkState( + createWriterFunc != null, "Cannot create equality delete file unless createWriterFunc is set"); - Preconditions.checkArgument(spec != null, - "Spec must not be null when creating equality delete writer"); - Preconditions.checkArgument(spec.isUnpartitioned() || partition != null, + Preconditions.checkArgument( + spec != null, "Spec must not be null when creating equality delete writer"); + Preconditions.checkArgument( + spec.isUnpartitioned() || partition != null, "Partition must not be null for partitioned writes"); meta("delete-type", "equality"); - meta("delete-field-ids", IntStream.of(equalityFieldIds) - .mapToObj(Objects::toString) - .collect(Collectors.joining(", "))); + meta( + "delete-field-ids", + IntStream.of(equalityFieldIds) + .mapToObj(Objects::toString) + .collect(Collectors.joining(", "))); // the appender uses the row schema without extra columns appenderBuilder.schema(rowSchema); @@ -694,16 +762,26 @@ public EqualityDeleteWriter buildEqualityWriter() throws IOException { appenderBuilder.createContextFunc(WriteBuilder.Context::deleteContext); return new EqualityDeleteWriter<>( - appenderBuilder.build(), FileFormat.PARQUET, location, spec, partition, keyMetadata, - sortOrder, equalityFieldIds); + appenderBuilder.build(), + FileFormat.PARQUET, + location, + spec, + partition, + keyMetadata, + sortOrder, + equalityFieldIds); } public PositionDeleteWriter buildPositionWriter() throws IOException { - Preconditions.checkState(equalityFieldIds == null, "Cannot create position delete file using delete field ids"); - Preconditions.checkArgument(spec != null, "Spec must not be null when creating position delete writer"); - Preconditions.checkArgument(spec.isUnpartitioned() || partition != null, + Preconditions.checkState( + equalityFieldIds == null, "Cannot create position delete file using delete field ids"); + Preconditions.checkArgument( + spec != null, 
"Spec must not be null when creating position delete writer"); + Preconditions.checkArgument( + spec.isUnpartitioned() || partition != null, "Partition must not be null for partitioned writes"); - Preconditions.checkArgument(rowSchema == null || createWriterFunc != null, + Preconditions.checkArgument( + rowSchema == null || createWriterFunc != null, "Create function should be provided if we write row data"); meta("delete-type", "position"); @@ -712,22 +790,28 @@ public PositionDeleteWriter buildPositionWriter() throws IOException { // the appender uses the row schema wrapped with position fields appenderBuilder.schema(DeleteSchemaUtil.posDeleteSchema(rowSchema)); - appenderBuilder.createWriterFunc(parquetSchema -> { - ParquetValueWriter writer = createWriterFunc.apply(parquetSchema); - if (writer instanceof StructWriter) { - return new PositionDeleteStructWriter((StructWriter) writer, pathTransformFunc); - } else { - throw new UnsupportedOperationException("Cannot wrap writer for position deletes: " + writer.getClass()); - } - }); + appenderBuilder.createWriterFunc( + parquetSchema -> { + ParquetValueWriter writer = createWriterFunc.apply(parquetSchema); + if (writer instanceof StructWriter) { + return new PositionDeleteStructWriter( + (StructWriter) writer, pathTransformFunc); + } else { + throw new UnsupportedOperationException( + "Cannot wrap writer for position deletes: " + writer.getClass()); + } + }); } else { appenderBuilder.schema(DeleteSchemaUtil.pathPosSchema()); - // We ignore the 'createWriterFunc' and 'rowSchema' even if is provided, since we do not write row data itself - appenderBuilder.createWriterFunc(parquetSchema -> - new PositionDeleteStructWriter((StructWriter) GenericParquetWriter.buildWriter(parquetSchema), - Function.identity())); + // We ignore the 'createWriterFunc' and 'rowSchema' even if is provided, since we do not + // write row data itself + appenderBuilder.createWriterFunc( + parquetSchema -> + new PositionDeleteStructWriter( + (StructWriter) GenericParquetWriter.buildWriter(parquetSchema), + Function.identity())); } appenderBuilder.createContextFunc(WriteBuilder.Context::deleteContext); @@ -737,7 +821,8 @@ public PositionDeleteWriter buildPositionWriter() throws IOException { } } - private static class ParquetWriteBuilder extends ParquetWriter.Builder> { + private static class ParquetWriteBuilder + extends ParquetWriter.Builder> { private Map keyValueMetadata = Maps.newHashMap(); private Map config = Maps.newHashMap(); private MessageType type; @@ -848,15 +933,18 @@ public ReadBuilder readSupport(ReadSupport newFilterSupport) { return this; } - public ReadBuilder createReaderFunc(Function> newReaderFunction) { - Preconditions.checkArgument(this.batchedReaderFunc == null, + public ReadBuilder createReaderFunc( + Function> newReaderFunction) { + Preconditions.checkArgument( + this.batchedReaderFunc == null, "Reader function cannot be set since the batched version is already set"); this.readerFunc = newReaderFunction; return this; } public ReadBuilder createBatchedReaderFunc(Function> func) { - Preconditions.checkArgument(this.readerFunc == null, + Preconditions.checkArgument( + this.readerFunc == null, "Batched reader function cannot be set since the non-batched version is already set"); this.batchedReaderFunc = func; return this; @@ -913,11 +1001,26 @@ public CloseableIterable build() { ParquetReadOptions options = optionsBuilder.build(); if (batchedReaderFunc != null) { - return new VectorizedParquetReader<>(file, schema, options, batchedReaderFunc, 
nameMapping, filter, - reuseContainers, caseSensitive, maxRecordsPerBatch); + return new VectorizedParquetReader<>( + file, + schema, + options, + batchedReaderFunc, + nameMapping, + filter, + reuseContainers, + caseSensitive, + maxRecordsPerBatch); } else { return new org.apache.iceberg.parquet.ParquetReader<>( - file, schema, options, readerFunc, nameMapping, filter, reuseContainers, caseSensitive); + file, + schema, + options, + readerFunc, + nameMapping, + filter, + reuseContainers, + caseSensitive); } } @@ -932,9 +1035,12 @@ public CloseableIterable build() { } // default options for readers - builder.set("parquet.strict.typing", "false") // allow type promotion + builder + .set("parquet.strict.typing", "false") // allow type promotion .set("parquet.avro.compatible", "false") // use the new RecordReader with Utf8 support - .set("parquet.avro.add-list-element-records", "false"); // assume that lists use a 3-level schema + .set( + "parquet.avro.add-list-element-records", + "false"); // assume that lists use a 3-level schema for (Map.Entry entry : properties.entrySet()) { builder.set(entry.getKey(), entry.getValue()); @@ -950,14 +1056,16 @@ public CloseableIterable build() { throw new RuntimeIOException(e); } Schema fileSchema = ParquetSchemaUtil.convert(type); - builder.useStatsFilter() + builder + .useStatsFilter() .useDictionaryFilter() .useRecordFilter(filterRecords) .useBloomFilter() .withFilter(ParquetFilters.convert(fileSchema, filter, caseSensitive)); } else { // turn off filtering - builder.useStatsFilter(false) + builder + .useStatsFilter(false) .useDictionaryFilter(false) .useBloomFilter(false) .useRecordFilter(false); @@ -1018,19 +1126,28 @@ protected ReadSupport getReadSupport() { /** * Combines several files into one * - * @param inputFiles an {@link Iterable} of parquet files. The order of iteration determines the order in which - * content of files are read and written to the {@code outputFile} - * @param outputFile the output parquet file containing all the data from {@code inputFiles} + * @param inputFiles an {@link Iterable} of parquet files. 
The order of iteration determines the + * order in which content of files are read and written to the {@code outputFile} + * @param outputFile the output parquet file containing all the data from {@code inputFiles} * @param rowGroupSize the row group size to use when writing the {@code outputFile} - * @param schema the schema of the data - * @param metadata extraMetadata to write at the footer of the {@code outputFile} + * @param schema the schema of the data + * @param metadata extraMetadata to write at the footer of the {@code outputFile} */ - public static void concat(Iterable inputFiles, File outputFile, int rowGroupSize, Schema schema, - Map metadata) throws IOException { + public static void concat( + Iterable inputFiles, + File outputFile, + int rowGroupSize, + Schema schema, + Map metadata) + throws IOException { OutputFile file = Files.localOutput(outputFile); - ParquetFileWriter writer = new ParquetFileWriter( - ParquetIO.file(file), ParquetSchemaUtil.convert(schema, "table"), - ParquetFileWriter.Mode.CREATE, rowGroupSize, 0); + ParquetFileWriter writer = + new ParquetFileWriter( + ParquetIO.file(file), + ParquetSchemaUtil.convert(schema, "table"), + ParquetFileWriter.Mode.CREATE, + rowGroupSize, + 0); writer.start(); for (File inputFile : inputFiles) { writer.appendFile(ParquetIO.file(Files.localInput(inputFile))); diff --git a/parquet/src/main/java/org/apache/iceberg/parquet/ParquetAvro.java b/parquet/src/main/java/org/apache/iceberg/parquet/ParquetAvro.java index 1cb7ee4a5114..f985d979d7bc 100644 --- a/parquet/src/main/java/org/apache/iceberg/parquet/ParquetAvro.java +++ b/parquet/src/main/java/org/apache/iceberg/parquet/ParquetAvro.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.parquet; import java.math.BigDecimal; @@ -42,8 +41,7 @@ class ParquetAvro { - private ParquetAvro() { - } + private ParquetAvro() {} static Schema parquetAvroSchema(Schema avroSchema) { return AvroSchemaVisitor.visit(avroSchema, new ParquetDecimalSchemaConverter()); @@ -87,12 +85,12 @@ public void validate(Schema schema) { super.validate(schema); switch (schema.getType()) { case INT: - Preconditions.checkArgument(precision <= 9, - "Int cannot hold decimal precision: %s", precision); + Preconditions.checkArgument( + precision <= 9, "Int cannot hold decimal precision: %s", precision); break; case LONG: - Preconditions.checkArgument(precision <= 18, - "Long cannot hold decimal precision: %s", precision); + Preconditions.checkArgument( + precision <= 18, "Long cannot hold decimal precision: %s", precision); break; case FIXED: break; @@ -100,17 +98,19 @@ public void validate(Schema schema) { throw new IllegalArgumentException("Invalid base type for decimal: " + schema); } Preconditions.checkArgument(scale >= 0, "Scale %s cannot be negative", scale); - Preconditions.checkArgument(scale <= precision, - "Scale %s cannot be less than precision %s", scale, precision); + Preconditions.checkArgument( + scale <= precision, "Scale %s cannot be less than precision %s", scale, precision); } } static { - LogicalTypes.register(ParquetDecimal.NAME, schema -> { - int precision = Integer.parseInt(schema.getProp("precision")); - int scale = Integer.parseInt(schema.getProp("scale")); - return new ParquetDecimal(precision, scale); - }); + LogicalTypes.register( + ParquetDecimal.NAME, + schema -> { + int precision = Integer.parseInt(schema.getProp("precision")); + int scale = Integer.parseInt(schema.getProp("scale")); + return new 
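
A short usage sketch for the concat helper documented above, assuming two local files that were written with the same schema; the paths and the single-column schema are placeholders.

    import java.io.File;
    import java.io.IOException;
    import java.util.Arrays;
    import java.util.Collections;
    import java.util.List;
    import org.apache.iceberg.Schema;
    import org.apache.iceberg.parquet.Parquet;
    import org.apache.iceberg.types.Types;

    class ConcatSketch {
      static void combine() throws IOException {
        Schema schema = new Schema(Types.NestedField.required(1, "id", Types.LongType.get()));
        List<File> inputs =
            Arrays.asList(new File("/tmp/part-0.parquet"), new File("/tmp/part-1.parquet"));
        // row groups are appended rather than rewritten, so all inputs must share this schema
        Parquet.concat(inputs, new File("/tmp/combined.parquet"), 128 * 1024 * 1024, schema,
            Collections.emptyMap());
      }
    }
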
ParquetDecimal(precision, scale); + }); } private static class IntDecimalConversion extends Conversion { @@ -179,66 +179,71 @@ public BigDecimal fromFixed(GenericFixed value, Schema schema, LogicalType type) public GenericFixed toFixed(BigDecimal value, Schema schema, LogicalType type) { ParquetDecimal dec = (ParquetDecimal) type; Pair key = new Pair<>(dec.precision(), dec.scale()); - return super.toFixed(value, schema, - decimalsByScale.computeIfAbsent(key, k -> LogicalTypes.decimal(k.getFirst(), k.getSecond()))); + return super.toFixed( + value, + schema, + decimalsByScale.computeIfAbsent( + key, k -> LogicalTypes.decimal(k.getFirst(), k.getSecond()))); } } - static final GenericData DEFAULT_MODEL = new SpecificData() { - private final Conversion fixedDecimalConversion = new FixedDecimalConversion(); - private final Conversion intDecimalConversion = new IntDecimalConversion(); - private final Conversion longDecimalConversion = new LongDecimalConversion(); - private final Conversion uuidConversion = new UUIDConversion(); - - { - addLogicalTypeConversion(fixedDecimalConversion); - addLogicalTypeConversion(uuidConversion); - } - - @Override - @SuppressWarnings("unchecked") - public Conversion getConversionByClass(Class datumClass, LogicalType logicalType) { - if (logicalType == null) { - return null; - } + static final GenericData DEFAULT_MODEL = + new SpecificData() { + private final Conversion fixedDecimalConversion = new FixedDecimalConversion(); + private final Conversion intDecimalConversion = new IntDecimalConversion(); + private final Conversion longDecimalConversion = new LongDecimalConversion(); + private final Conversion uuidConversion = new UUIDConversion(); - if (logicalType instanceof ParquetDecimal) { - ParquetDecimal decimal = (ParquetDecimal) logicalType; - if (decimal.precision() <= 9) { - return (Conversion) intDecimalConversion; - } else if (decimal.precision() <= 18) { - return (Conversion) longDecimalConversion; - } else { - return (Conversion) fixedDecimalConversion; + { + addLogicalTypeConversion(fixedDecimalConversion); + addLogicalTypeConversion(uuidConversion); } - } else if ("uuid".equals(logicalType.getName())) { - return (Conversion) uuidConversion; - } - return super.getConversionByClass(datumClass, logicalType); - } - @Override - @SuppressWarnings("unchecked") - public Conversion getConversionFor(LogicalType logicalType) { - if (logicalType == null) { - return null; - } + @Override + @SuppressWarnings("unchecked") + public Conversion getConversionByClass( + Class datumClass, LogicalType logicalType) { + if (logicalType == null) { + return null; + } + + if (logicalType instanceof ParquetDecimal) { + ParquetDecimal decimal = (ParquetDecimal) logicalType; + if (decimal.precision() <= 9) { + return (Conversion) intDecimalConversion; + } else if (decimal.precision() <= 18) { + return (Conversion) longDecimalConversion; + } else { + return (Conversion) fixedDecimalConversion; + } + } else if ("uuid".equals(logicalType.getName())) { + return (Conversion) uuidConversion; + } + return super.getConversionByClass(datumClass, logicalType); + } - if (logicalType instanceof LogicalTypes.Decimal) { - LogicalTypes.Decimal decimal = (LogicalTypes.Decimal) logicalType; - if (decimal.getPrecision() <= 9) { - return (Conversion) intDecimalConversion; - } else if (decimal.getPrecision() <= 18) { - return (Conversion) longDecimalConversion; - } else { - return (Conversion) fixedDecimalConversion; + @Override + @SuppressWarnings("unchecked") + public Conversion 
getConversionFor(LogicalType logicalType) { + if (logicalType == null) { + return null; + } + + if (logicalType instanceof LogicalTypes.Decimal) { + LogicalTypes.Decimal decimal = (LogicalTypes.Decimal) logicalType; + if (decimal.getPrecision() <= 9) { + return (Conversion) intDecimalConversion; + } else if (decimal.getPrecision() <= 18) { + return (Conversion) longDecimalConversion; + } else { + return (Conversion) fixedDecimalConversion; + } + } else if ("uuid".equals(logicalType.getName())) { + return (Conversion) uuidConversion; + } + return super.getConversionFor(logicalType); } - } else if ("uuid".equals(logicalType.getName())) { - return (Conversion) uuidConversion; - } - return super.getConversionFor(logicalType); - } - }; + }; private static class ParquetDecimalSchemaConverter extends AvroSchemaVisitor { @Override @@ -309,8 +314,12 @@ public Schema primitive(Schema primitive) { } else { return new ParquetDecimal(decimal.getPrecision(), decimal.getScale()) - .addToSchema(Schema.createFixed(primitive.getName(), - null, null, TypeUtil.decimalRequiredBytes(decimal.getPrecision()))); + .addToSchema( + Schema.createFixed( + primitive.getName(), + null, + null, + TypeUtil.decimalRequiredBytes(decimal.getPrecision()))); } } @@ -333,8 +342,13 @@ private boolean isIdentical(List types, List replacements) { } private static Schema copyRecord(Schema record, List newFields) { - Schema copy = Schema.createRecord(record.getName(), - record.getDoc(), record.getNamespace(), record.isError(), newFields); + Schema copy = + Schema.createRecord( + record.getName(), + record.getDoc(), + record.getNamespace(), + record.isError(), + newFields); for (Map.Entry prop : record.getObjectProps().entrySet()) { copy.addProp(prop.getKey(), prop.getValue()); @@ -344,8 +358,8 @@ private static Schema copyRecord(Schema record, List newFields) { } private static Schema.Field copyField(Schema.Field field, Schema newSchema) { - Schema.Field copy = new Schema.Field(field.name(), - newSchema, field.doc(), field.defaultVal(), field.order()); + Schema.Field copy = + new Schema.Field(field.name(), newSchema, field.doc(), field.defaultVal(), field.order()); for (Map.Entry prop : field.getObjectProps().entrySet()) { copy.addProp(prop.getKey(), prop.getValue()); diff --git a/parquet/src/main/java/org/apache/iceberg/parquet/ParquetAvroReader.java b/parquet/src/main/java/org/apache/iceberg/parquet/ParquetAvroReader.java index 9d3f93d02f0d..adf376c9036a 100644 --- a/parquet/src/main/java/org/apache/iceberg/parquet/ParquetAvroReader.java +++ b/parquet/src/main/java/org/apache/iceberg/parquet/ParquetAvroReader.java @@ -16,8 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.parquet; -public class ParquetAvroReader { -} +public class ParquetAvroReader {} diff --git a/parquet/src/main/java/org/apache/iceberg/parquet/ParquetAvroValueReaders.java b/parquet/src/main/java/org/apache/iceberg/parquet/ParquetAvroValueReaders.java index e71f65178f0f..b3eccdc31d97 100644 --- a/parquet/src/main/java/org/apache/iceberg/parquet/ParquetAvroValueReaders.java +++ b/parquet/src/main/java/org/apache/iceberg/parquet/ParquetAvroValueReaders.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.parquet; import java.math.BigDecimal; @@ -53,15 +52,14 @@ import org.apache.parquet.schema.Type; public class ParquetAvroValueReaders { - private ParquetAvroValueReaders() { - } + private ParquetAvroValueReaders() {} @SuppressWarnings("unchecked") - public static ParquetValueReader buildReader(org.apache.iceberg.Schema expectedSchema, - MessageType fileSchema) { + public static ParquetValueReader buildReader( + org.apache.iceberg.Schema expectedSchema, MessageType fileSchema) { return (ParquetValueReader) - TypeWithSchemaVisitor.visit(expectedSchema.asStruct(), fileSchema, - new ReadBuilder(expectedSchema, fileSchema)); + TypeWithSchemaVisitor.visit( + expectedSchema.asStruct(), fileSchema, new ReadBuilder(expectedSchema, fileSchema)); } private static class ReadBuilder extends TypeWithSchemaVisitor> { @@ -76,14 +74,14 @@ private static class ReadBuilder extends TypeWithSchemaVisitor message(Types.StructType expected, MessageType message, - List> fieldReaders) { + public ParquetValueReader message( + Types.StructType expected, MessageType message, List> fieldReaders) { return struct(expected, message.asGroupType(), fieldReaders); } @Override - public ParquetValueReader struct(Types.StructType expected, GroupType struct, - List> fieldReaders) { + public ParquetValueReader struct( + Types.StructType expected, GroupType struct, List> fieldReaders) { Schema avroSchema = avroSchemas.get(expected); // match the expected struct's order @@ -98,10 +96,10 @@ public ParquetValueReader struct(Types.StructType expected, GroupType struct, typesById.put(id, fieldType); } - List expectedFields = expected != null ? - expected.fields() : ImmutableList.of(); - List> reorderedFields = Lists.newArrayListWithExpectedSize( - expectedFields.size()); + List expectedFields = + expected != null ? 
expected.fields() : ImmutableList.of(); + List> reorderedFields = + Lists.newArrayListWithExpectedSize(expectedFields.size()); List types = Lists.newArrayListWithExpectedSize(expectedFields.size()); for (Types.NestedField field : expectedFields) { int id = field.fieldId(); @@ -119,8 +117,8 @@ public ParquetValueReader struct(Types.StructType expected, GroupType struct, } @Override - public ParquetValueReader list(Types.ListType expectedList, GroupType array, - ParquetValueReader elementReader) { + public ParquetValueReader list( + Types.ListType expectedList, GroupType array, ParquetValueReader elementReader) { String[] repeatedPath = currentPath(); int repeatedD = type.getMaxDefinitionLevel(repeatedPath) - 1; @@ -129,13 +127,16 @@ public ParquetValueReader list(Types.ListType expectedList, GroupType array, Type elementType = ParquetSchemaUtil.determineListElementType(array); int elementD = type.getMaxDefinitionLevel(path(elementType.getName())) - 1; - return new ListReader<>(repeatedD, repeatedR, ParquetValueReaders.option(elementType, elementD, elementReader)); + return new ListReader<>( + repeatedD, repeatedR, ParquetValueReaders.option(elementType, elementD, elementReader)); } @Override - public ParquetValueReader map(Types.MapType expectedMap, GroupType map, - ParquetValueReader keyReader, - ParquetValueReader valueReader) { + public ParquetValueReader map( + Types.MapType expectedMap, + GroupType map, + ParquetValueReader keyReader, + ParquetValueReader valueReader) { GroupType repeatedKeyValue = map.getFields().get(0).asGroupType(); String[] repeatedPath = currentPath(); @@ -147,14 +148,16 @@ public ParquetValueReader map(Types.MapType expectedMap, GroupType map, Type valueType = repeatedKeyValue.getType(1); int valueD = type.getMaxDefinitionLevel(path(valueType.getName())) - 1; - return new MapReader<>(repeatedD, repeatedR, + return new MapReader<>( + repeatedD, + repeatedR, ParquetValueReaders.option(keyType, keyD, keyReader), ParquetValueReaders.option(valueType, valueD, valueReader)); } @Override - public ParquetValueReader primitive(org.apache.iceberg.types.Type.PrimitiveType expected, - PrimitiveType primitive) { + public ParquetValueReader primitive( + org.apache.iceberg.types.Type.PrimitiveType expected, PrimitiveType primitive) { ColumnDescriptor desc = type.getColumnDescription(currentPath()); boolean isMapKey = fieldNames.contains("key"); @@ -342,9 +345,7 @@ public long readLong() { static class RecordReader extends StructReader { private final Schema schema; - RecordReader(List types, - List> readers, - Schema schema) { + RecordReader(List types, List> readers, Schema schema) { super(types, readers); this.schema = schema; } diff --git a/parquet/src/main/java/org/apache/iceberg/parquet/ParquetAvroWriter.java b/parquet/src/main/java/org/apache/iceberg/parquet/ParquetAvroWriter.java index c2489edf34a3..1c314a235a62 100644 --- a/parquet/src/main/java/org/apache/iceberg/parquet/ParquetAvroWriter.java +++ b/parquet/src/main/java/org/apache/iceberg/parquet/ParquetAvroWriter.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.parquet; import java.util.List; @@ -34,8 +33,7 @@ import org.apache.parquet.schema.Type; public class ParquetAvroWriter { - private ParquetAvroWriter() { - } + private ParquetAvroWriter() {} @SuppressWarnings("unchecked") public static ParquetValueWriter buildWriter(MessageType type) { @@ -50,14 +48,14 @@ private static class WriteBuilder extends ParquetTypeVisitor message(MessageType message, - List> fieldWriters) { + public ParquetValueWriter message( + MessageType message, List> fieldWriters) { return struct(message.asGroupType(), fieldWriters); } @Override - public ParquetValueWriter struct(GroupType struct, - List> fieldWriters) { + public ParquetValueWriter struct( + GroupType struct, List> fieldWriters) { List fields = struct.getFields(); List> writers = Lists.newArrayListWithExpectedSize(fieldWriters.size()); for (int i = 0; i < fields.size(); i += 1) { @@ -80,14 +78,13 @@ public ParquetValueWriter list(GroupType array, ParquetValueWriter element Type elementType = repeated.getType(0); int elementD = type.getMaxDefinitionLevel(path(elementType.getName())); - return ParquetValueWriters.collections(repeatedD, repeatedR, - ParquetValueWriters.option(elementType, elementD, elementWriter)); + return ParquetValueWriters.collections( + repeatedD, repeatedR, ParquetValueWriters.option(elementType, elementD, elementWriter)); } @Override - public ParquetValueWriter map(GroupType map, - ParquetValueWriter keyWriter, - ParquetValueWriter valueWriter) { + public ParquetValueWriter map( + GroupType map, ParquetValueWriter keyWriter, ParquetValueWriter valueWriter) { GroupType repeatedKeyValue = map.getFields().get(0).asGroupType(); String[] repeatedPath = currentPath(); @@ -99,7 +96,9 @@ public ParquetValueWriter map(GroupType map, Type valueType = repeatedKeyValue.getType(1); int valueD = type.getMaxDefinitionLevel(path(valueType.getName())); - return ParquetValueWriters.maps(repeatedD, repeatedR, + return ParquetValueWriters.maps( + repeatedD, + repeatedR, ParquetValueWriters.option(keyType, keyD, keyWriter), ParquetValueWriters.option(valueType, valueD, valueWriter)); } diff --git a/parquet/src/main/java/org/apache/iceberg/parquet/ParquetBloomRowGroupFilter.java b/parquet/src/main/java/org/apache/iceberg/parquet/ParquetBloomRowGroupFilter.java index 21a5d4562ca7..60f07f0054c4 100644 --- a/parquet/src/main/java/org/apache/iceberg/parquet/ParquetBloomRowGroupFilter.java +++ b/parquet/src/main/java/org/apache/iceberg/parquet/ParquetBloomRowGroupFilter.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.parquet; import java.math.BigDecimal; @@ -68,12 +67,13 @@ public ParquetBloomRowGroupFilter(Schema schema, Expression unbound, boolean cas /** * Tests whether the bloom for a row group may contain records that match the expression. * - * @param fileSchema schema for the Parquet file + * @param fileSchema schema for the Parquet file * @param rowGroup metadata for a row group * @param bloomReader a bloom filter reader * @return false if the file cannot contain rows that match the expression, true otherwise. 
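
To show where shouldRead above sits in a scan, here is a hedged sketch that checks each row group of a local file against a simple equality predicate. The parquet-mr calls (ParquetFileReader.open, getRowGroups, getBloomFilterDataReader) and the Iceberg schema, column name, and path are assumptions for illustration, not part of this patch.

    import java.io.IOException;
    import org.apache.hadoop.conf.Configuration;
    import org.apache.hadoop.fs.Path;
    import org.apache.iceberg.Schema;
    import org.apache.iceberg.expressions.Expressions;
    import org.apache.iceberg.parquet.ParquetBloomRowGroupFilter;
    import org.apache.iceberg.types.Types;
    import org.apache.parquet.hadoop.ParquetFileReader;
    import org.apache.parquet.hadoop.metadata.BlockMetaData;
    import org.apache.parquet.hadoop.util.HadoopInputFile;
    import org.apache.parquet.schema.MessageType;

    class BloomFilterSketch {
      static void scan() throws IOException {
        Schema schema = new Schema(Types.NestedField.required(1, "id", Types.LongType.get()));
        ParquetBloomRowGroupFilter filter =
            new ParquetBloomRowGroupFilter(schema, Expressions.equal("id", 5L), true /* caseSensitive */);
        try (ParquetFileReader reader = ParquetFileReader.open(
            HadoopInputFile.fromPath(new Path("/tmp/data.parquet"), new Configuration()))) {
          MessageType fileSchema = reader.getFileMetaData().getSchema();
          for (BlockMetaData rowGroup : reader.getRowGroups()) {
            boolean mightMatch =
                filter.shouldRead(fileSchema, rowGroup, reader.getBloomFilterDataReader(rowGroup));
            if (!mightMatch) {
              // the bloom filters prove no row in this group can match, so the group is skipped
              continue;
            }
            // otherwise the row group has to be scanned
          }
        }
      }
    }
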
*/ - public boolean shouldRead(MessageType fileSchema, BlockMetaData rowGroup, BloomFilterReader bloomReader) { + public boolean shouldRead( + MessageType fileSchema, BlockMetaData rowGroup, BloomFilterReader bloomReader) { return new BloomEvalVisitor().eval(fileSchema, rowGroup, bloomReader); } @@ -88,7 +88,8 @@ private class BloomEvalVisitor extends BoundExpressionVisitor { private Map parquetPrimitiveTypes = null; private Map types = null; - private boolean eval(MessageType fileSchema, BlockMetaData rowGroup, BloomFilterReader bloomFilterReader) { + private boolean eval( + MessageType fileSchema, BlockMetaData rowGroup, BloomFilterReader bloomFilterReader) { this.bloomReader = bloomFilterReader; this.fieldsWithBloomFilter = Sets.newHashSet(); this.columnMetaMap = Maps.newHashMap(); @@ -110,8 +111,10 @@ private boolean eval(MessageType fileSchema, BlockMetaData rowGroup, BloomFilter } } - Set filterRefs = Binder.boundReferences(schema.asStruct(), ImmutableList.of(expr), caseSensitive); - // If the filter's column set doesn't overlap with any bloom filter columns, exit early with ROWS_MIGHT_MATCH + Set filterRefs = + Binder.boundReferences(schema.asStruct(), ImmutableList.of(expr), caseSensitive); + // If the filter's column set doesn't overlap with any bloom filter columns, exit early with + // ROWS_MIGHT_MATCH if (filterRefs.size() > 0 && Sets.intersection(fieldsWithBloomFilter, filterRefs).isEmpty()) { return ROWS_MIGHT_MATCH; } @@ -263,7 +266,8 @@ private BloomFilter loadBloomFilter(int id) { } } - private boolean shouldRead(PrimitiveType primitiveType, T value, BloomFilter bloom, Type type) { + private boolean shouldRead( + PrimitiveType primitiveType, T value, BloomFilter bloom, Type type) { long hashValue = 0; switch (primitiveType.getPrimitiveTypeName()) { case INT32: @@ -314,7 +318,9 @@ private boolean shouldRead(PrimitiveType primitiveType, T value, BloomFilter int scale = metadata.getScale(); int precision = metadata.getPrecision(); byte[] requiredBytes = new byte[TypeUtil.decimalRequiredBytes(precision)]; - byte[] binary = DecimalUtil.toReusedFixLengthBytes(precision, scale, (BigDecimal) value, requiredBytes); + byte[] binary = + DecimalUtil.toReusedFixLengthBytes( + precision, scale, (BigDecimal) value, requiredBytes); hashValue = bloom.hash(Binary.fromConstantByteArray(binary)); return bloom.findHash(hashValue); case UUID: diff --git a/parquet/src/main/java/org/apache/iceberg/parquet/ParquetConversions.java b/parquet/src/main/java/org/apache/iceberg/parquet/ParquetConversions.java index 78b3a31cebfd..c7bb2fa5d739 100644 --- a/parquet/src/main/java/org/apache/iceberg/parquet/ParquetConversions.java +++ b/parquet/src/main/java/org/apache/iceberg/parquet/ParquetConversions.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.parquet; import java.math.BigDecimal; @@ -31,8 +30,7 @@ import org.apache.parquet.schema.PrimitiveType; class ParquetConversions { - private ParquetConversions() { - } + private ParquetConversions() {} @SuppressWarnings("unchecked") static Literal fromParquetPrimitive(Type type, PrimitiveType parquetType, Object value) { @@ -68,14 +66,15 @@ static Literal fromParquetPrimitive(Type type, PrimitiveType parquetType, } } - static Function converterFromParquet(PrimitiveType parquetType, Type icebergType) { + static Function converterFromParquet( + PrimitiveType parquetType, Type icebergType) { Function fromParquet = converterFromParquet(parquetType); if (icebergType != null) { - if (icebergType.typeId() == Type.TypeID.LONG && - parquetType.getPrimitiveTypeName() == PrimitiveType.PrimitiveTypeName.INT32) { + if (icebergType.typeId() == Type.TypeID.LONG + && parquetType.getPrimitiveTypeName() == PrimitiveType.PrimitiveTypeName.INT32) { return value -> ((Integer) fromParquet.apply(value)).longValue(); - } else if (icebergType.typeId() == Type.TypeID.DOUBLE && - parquetType.getPrimitiveTypeName() == PrimitiveType.PrimitiveTypeName.FLOAT) { + } else if (icebergType.typeId() == Type.TypeID.DOUBLE + && parquetType.getPrimitiveTypeName() == PrimitiveType.PrimitiveTypeName.FLOAT) { return value -> ((Float) fromParquet.apply(value)).doubleValue(); } } diff --git a/parquet/src/main/java/org/apache/iceberg/parquet/ParquetDictionaryRowGroupFilter.java b/parquet/src/main/java/org/apache/iceberg/parquet/ParquetDictionaryRowGroupFilter.java index 5dc0c598f319..88339e211149 100644 --- a/parquet/src/main/java/org/apache/iceberg/parquet/ParquetDictionaryRowGroupFilter.java +++ b/parquet/src/main/java/org/apache/iceberg/parquet/ParquetDictionaryRowGroupFilter.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.parquet; import java.io.IOException; @@ -70,8 +69,8 @@ public ParquetDictionaryRowGroupFilter(Schema schema, Expression unbound, boolea * @param dictionaries a dictionary page read store * @return false if the file cannot contain rows that match the expression, true otherwise. */ - public boolean shouldRead(MessageType fileSchema, BlockMetaData rowGroup, - DictionaryPageReadStore dictionaries) { + public boolean shouldRead( + MessageType fileSchema, BlockMetaData rowGroup, DictionaryPageReadStore dictionaries) { return new EvalVisitor().eval(fileSchema, rowGroup, dictionaries); } @@ -86,8 +85,10 @@ private class EvalVisitor extends BoundExpressionVisitor { private Map cols = null; private Map> conversions = null; - private boolean eval(MessageType fileSchema, BlockMetaData rowGroup, - DictionaryPageReadStore dictionaryReadStore) { + private boolean eval( + MessageType fileSchema, + BlockMetaData rowGroup, + DictionaryPageReadStore dictionaryReadStore) { this.dictionaries = dictionaryReadStore; this.dictCache = Maps.newHashMap(); this.isFallback = Maps.newHashMap(); @@ -181,7 +182,8 @@ public Boolean notNaN(BoundReference ref) { } private Comparator comparatorForNaNPredicate(BoundReference ref) { - // Construct the same comparator as in ComparableLiteral.comparator, ignoring null value order as dictionary + // Construct the same comparator as in ComparableLiteral.comparator, ignoring null value order + // as dictionary // cannot contain null values. // No need to check type: incompatible types will be handled during expression binding. 
return Comparators.forType(ref.type().asPrimitiveType()); @@ -354,8 +356,11 @@ public Boolean notIn(BoundReference ref, Set literalSet) { return ROWS_MIGHT_MATCH; } - // ROWS_CANNOT_MATCH if no values in the dictionary that are not also in the set (the difference is empty) - return Sets.difference(dictionary, literalSet).isEmpty() ? ROWS_CANNOT_MATCH : ROWS_MIGHT_MATCH; + // ROWS_CANNOT_MATCH if no values in the dictionary that are not also in the set (the + // difference is empty) + return Sets.difference(dictionary, literalSet).isEmpty() + ? ROWS_CANNOT_MATCH + : ROWS_MIGHT_MATCH; } @Override @@ -425,21 +430,28 @@ private Set dict(int id, Comparator comparator) { for (int i = 0; i <= dict.getMaxId(); i++) { switch (col.getPrimitiveType().getPrimitiveTypeName()) { - case FIXED_LEN_BYTE_ARRAY: dictSet.add((T) conversion.apply(dict.decodeToBinary(i))); + case FIXED_LEN_BYTE_ARRAY: + dictSet.add((T) conversion.apply(dict.decodeToBinary(i))); break; - case BINARY: dictSet.add((T) conversion.apply(dict.decodeToBinary(i))); + case BINARY: + dictSet.add((T) conversion.apply(dict.decodeToBinary(i))); break; - case INT32: dictSet.add((T) conversion.apply(dict.decodeToInt(i))); + case INT32: + dictSet.add((T) conversion.apply(dict.decodeToInt(i))); break; - case INT64: dictSet.add((T) conversion.apply(dict.decodeToLong(i))); + case INT64: + dictSet.add((T) conversion.apply(dict.decodeToLong(i))); break; - case FLOAT: dictSet.add((T) conversion.apply(dict.decodeToFloat(i))); + case FLOAT: + dictSet.add((T) conversion.apply(dict.decodeToFloat(i))); break; - case DOUBLE: dictSet.add((T) conversion.apply(dict.decodeToDouble(i))); + case DOUBLE: + dictSet.add((T) conversion.apply(dict.decodeToDouble(i))); break; default: throw new IllegalArgumentException( - "Cannot decode dictionary of type: " + col.getPrimitiveType().getPrimitiveTypeName()); + "Cannot decode dictionary of type: " + + col.getPrimitiveType().getPrimitiveTypeName()); } } @@ -452,5 +464,4 @@ private Set dict(int id, Comparator comparator) { private static boolean mayContainNull(ColumnChunkMetaData meta) { return meta.getStatistics() == null || meta.getStatistics().getNumNulls() != 0; } - } diff --git a/parquet/src/main/java/org/apache/iceberg/parquet/ParquetFilters.java b/parquet/src/main/java/org/apache/iceberg/parquet/ParquetFilters.java index fa9387535f59..fc6febe19438 100644 --- a/parquet/src/main/java/org/apache/iceberg/parquet/ParquetFilters.java +++ b/parquet/src/main/java/org/apache/iceberg/parquet/ParquetFilters.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
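
The notIn branch above can only prune a row group when the dictionary proves that every stored value is excluded. A tiny self-contained illustration of that set check, using plain Guava and made-up values:

    import com.google.common.collect.ImmutableSet;
    import com.google.common.collect.Sets;
    import java.util.Set;

    class NotInPruningSketch {
      static boolean rowsCannotMatch() {
        Set<Long> dictionary = ImmutableSet.of(1L, 2L, 3L);      // every distinct value in the row group
        Set<Long> literalSet = ImmutableSet.of(1L, 2L, 3L, 4L);  // values excluded by the notIn predicate
        // empty difference: every stored value is in the excluded set, so no row can match
        return Sets.difference(dictionary, literalSet).isEmpty();
      }
    }
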
*/ - package org.apache.iceberg.parquet; import java.nio.ByteBuffer; @@ -38,11 +37,11 @@ class ParquetFilters { - private ParquetFilters() { - } + private ParquetFilters() {} static FilterCompat.Filter convert(Schema schema, Expression expr, boolean caseSensitive) { - FilterPredicate pred = ExpressionVisitors.visit(expr, new ConvertFilterToParquet(schema, caseSensitive)); + FilterPredicate pred = + ExpressionVisitors.visit(expr, new ConvertFilterToParquet(schema, caseSensitive)); // TODO: handle AlwaysFalse.INSTANCE if (pred != null && pred != AlwaysTrue.INSTANCE) { // FilterCompat will apply LogicalInverseRewriter @@ -112,7 +111,8 @@ protected Expression bind(UnboundPredicate pred) { @Override public FilterPredicate predicate(BoundPredicate pred) { if (!(pred.term() instanceof BoundReference)) { - throw new UnsupportedOperationException("Cannot convert non-reference to Parquet filter: " + pred.term()); + throw new UnsupportedOperationException( + "Cannot convert non-reference to Parquet filter: " + pred.term()); } Operation op = pred.op(); diff --git a/parquet/src/main/java/org/apache/iceberg/parquet/ParquetIO.java b/parquet/src/main/java/org/apache/iceberg/parquet/ParquetIO.java index d65b8d638edf..8b2a6e242b4d 100644 --- a/parquet/src/main/java/org/apache/iceberg/parquet/ParquetIO.java +++ b/parquet/src/main/java/org/apache/iceberg/parquet/ParquetIO.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.parquet; import java.io.IOException; @@ -38,12 +37,9 @@ import org.apache.parquet.io.PositionOutputStream; import org.apache.parquet.io.SeekableInputStream; -/** - * Methods in this class translate from the IO API to Parquet's IO API. - */ +/** Methods in this class translate from the IO API to Parquet's IO API. */ class ParquetIO { - private ParquetIO() { - } + private ParquetIO() {} static InputFile file(org.apache.iceberg.io.InputFile file) { // TODO: use reflection to avoid depending on classes from iceberg-hadoop @@ -51,7 +47,8 @@ static InputFile file(org.apache.iceberg.io.InputFile file) { if (file instanceof HadoopInputFile) { HadoopInputFile hfile = (HadoopInputFile) file; try { - return org.apache.parquet.hadoop.util.HadoopInputFile.fromStatus(hfile.getStat(), hfile.getConf()); + return org.apache.parquet.hadoop.util.HadoopInputFile.fromStatus( + hfile.getStat(), hfile.getConf()); } catch (IOException e) { throw new RuntimeIOException(e, "Failed to create Parquet input file for %s", file); } @@ -63,7 +60,8 @@ static OutputFile file(org.apache.iceberg.io.OutputFile file) { if (file instanceof HadoopOutputFile) { HadoopOutputFile hfile = (HadoopOutputFile) file; try { - return org.apache.parquet.hadoop.util.HadoopOutputFile.fromPath(hfile.getPath(), hfile.getConf()); + return org.apache.parquet.hadoop.util.HadoopOutputFile.fromPath( + hfile.getPath(), hfile.getConf()); } catch (IOException e) { throw new RuntimeIOException(e, "Failed to create Parquet output file for %s", file); } diff --git a/parquet/src/main/java/org/apache/iceberg/parquet/ParquetIterable.java b/parquet/src/main/java/org/apache/iceberg/parquet/ParquetIterable.java index 5ec5b59f9170..ac4e5c1f97ed 100644 --- a/parquet/src/main/java/org/apache/iceberg/parquet/ParquetIterable.java +++ b/parquet/src/main/java/org/apache/iceberg/parquet/ParquetIterable.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.parquet; import java.io.IOException; diff --git a/parquet/src/main/java/org/apache/iceberg/parquet/ParquetMetricsRowGroupFilter.java b/parquet/src/main/java/org/apache/iceberg/parquet/ParquetMetricsRowGroupFilter.java index d99d99f7a260..fdf697a78a2a 100644 --- a/parquet/src/main/java/org/apache/iceberg/parquet/ParquetMetricsRowGroupFilter.java +++ b/parquet/src/main/java/org/apache/iceberg/parquet/ParquetMetricsRowGroupFilter.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.parquet; import java.nio.ByteBuffer; @@ -406,14 +405,22 @@ public Boolean in(BoundReference ref, Set literalSet) { } T lower = min(colStats, id); - literals = literals.stream().filter(v -> ref.comparator().compare(lower, v) <= 0).collect(Collectors.toList()); - if (literals.isEmpty()) { // if all values are less than lower bound, rows cannot match. + literals = + literals.stream() + .filter(v -> ref.comparator().compare(lower, v) <= 0) + .collect(Collectors.toList()); + if (literals.isEmpty()) { // if all values are less than lower bound, rows cannot match. return ROWS_CANNOT_MATCH; } T upper = max(colStats, id); - literals = literals.stream().filter(v -> ref.comparator().compare(upper, v) >= 0).collect(Collectors.toList()); - if (literals.isEmpty()) { // if all remaining values are greater than upper bound, rows cannot match. + literals = + literals.stream() + .filter(v -> ref.comparator().compare(upper, v) >= 0) + .collect(Collectors.toList()); + if (literals + .isEmpty()) { // if all remaining values are greater than upper bound, rows cannot + // match. return ROWS_CANNOT_MATCH; } } @@ -456,7 +463,9 @@ public Boolean startsWith(BoundReference ref, Literal lit) { Binary lower = colStats.genericGetMin(); // truncate lower bound so that its length in bytes is not greater than the length of prefix int lowerLength = Math.min(prefixAsBytes.remaining(), lower.length()); - int lowerCmp = comparator.compare(BinaryUtil.truncateBinary(lower.toByteBuffer(), lowerLength), prefixAsBytes); + int lowerCmp = + comparator.compare( + BinaryUtil.truncateBinary(lower.toByteBuffer(), lowerLength), prefixAsBytes); if (lowerCmp > 0) { return ROWS_CANNOT_MATCH; } @@ -464,7 +473,9 @@ public Boolean startsWith(BoundReference ref, Literal lit) { Binary upper = colStats.genericGetMax(); // truncate upper bound so that its length in bytes is not greater than the length of prefix int upperLength = Math.min(prefixAsBytes.remaining(), upper.length()); - int upperCmp = comparator.compare(BinaryUtil.truncateBinary(upper.toByteBuffer(), upperLength), prefixAsBytes); + int upperCmp = + comparator.compare( + BinaryUtil.truncateBinary(upper.toByteBuffer(), upperLength), prefixAsBytes); if (upperCmp < 0) { return ROWS_CANNOT_MATCH; } @@ -497,7 +508,8 @@ public Boolean notStartsWith(BoundReference ref, Literal lit) { Binary lower = colStats.genericGetMin(); Binary upper = colStats.genericGetMax(); - // notStartsWith will match unless all values must start with the prefix. this happens when the lower and upper + // notStartsWith will match unless all values must start with the prefix. this happens when + // the lower and upper // bounds both start with the prefix. 
if (lower != null && upper != null) { ByteBuffer prefix = lit.toByteBuffer(); @@ -509,7 +521,9 @@ public Boolean notStartsWith(BoundReference ref, Literal lit) { } // truncate lower bound to the prefix and check for equality - int cmp = comparator.compare(BinaryUtil.truncateBinary(lower.toByteBuffer(), prefix.remaining()), prefix); + int cmp = + comparator.compare( + BinaryUtil.truncateBinary(lower.toByteBuffer(), prefix.remaining()), prefix); if (cmp == 0) { // the lower bound starts with the prefix; check the upper bound // if upper is shorter than the prefix, it can't start with the prefix @@ -517,10 +531,14 @@ public Boolean notStartsWith(BoundReference ref, Literal lit) { return ROWS_MIGHT_MATCH; } - // truncate upper bound so that its length in bytes is not greater than the length of prefix - cmp = comparator.compare(BinaryUtil.truncateBinary(upper.toByteBuffer(), prefix.remaining()), prefix); + // truncate upper bound so that its length in bytes is not greater than the length of + // prefix + cmp = + comparator.compare( + BinaryUtil.truncateBinary(upper.toByteBuffer(), prefix.remaining()), prefix); if (cmp == 0) { - // both bounds match the prefix, so all rows must match the prefix and none do not match + // both bounds match the prefix, so all rows must match the prefix and none do not + // match return ROWS_CANNOT_MATCH; } } @@ -542,23 +560,24 @@ private T max(Statistics statistics, int id) { } /** - * Checks against older versions of Parquet statistics which may have a null count but undefined min and max - * statistics. Returns true if nonNull values exist in the row group but no further statistics are available. - *

-   * We can't use {@code statistics.hasNonNullValue()} because it is inaccurate with older files and will return
-   * false if min and max are not set.
-   * <p>
-   * This is specifically for 1.5.0-CDH Parquet builds and later which contain the different unusual hasNonNull
-   * behavior. OSS Parquet builds are not effected because PARQUET-251 prohibits the reading of these statistics
-   * from versions of Parquet earlier than 1.8.0.
+   * Checks against older versions of Parquet statistics which may have a null count but undefined
+   * min and max statistics. Returns true if nonNull values exist in the row group but no further
+   * statistics are available.
+   *
+   * <p>We can't use {@code statistics.hasNonNullValue()} because it is inaccurate with older files
+   * and will return false if min and max are not set.
+   *
+   * <p>
    This is specifically for 1.5.0-CDH Parquet builds and later which contain the different + * unusual hasNonNull behavior. OSS Parquet builds are not effected because PARQUET-251 prohibits + * the reading of these statistics from versions of Parquet earlier than 1.8.0. * * @param statistics Statistics to check * @param valueCount Number of values in the row group * @return true if nonNull values exist and no other stats can be used */ static boolean hasNonNullButNoMinMax(Statistics statistics, long valueCount) { - return statistics.getNumNulls() < valueCount && - (statistics.getMaxBytes() == null || statistics.getMinBytes() == null); + return statistics.getNumNulls() < valueCount + && (statistics.getMaxBytes() == null || statistics.getMinBytes() == null); } private static boolean mayContainNull(Statistics statistics) { diff --git a/parquet/src/main/java/org/apache/iceberg/parquet/ParquetReadSupport.java b/parquet/src/main/java/org/apache/iceberg/parquet/ParquetReadSupport.java index 645b3fa0d1fb..b1172147f80a 100644 --- a/parquet/src/main/java/org/apache/iceberg/parquet/ParquetReadSupport.java +++ b/parquet/src/main/java/org/apache/iceberg/parquet/ParquetReadSupport.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.parquet; import java.util.Map; @@ -44,7 +43,11 @@ class ParquetReadSupport extends ReadSupport { private final boolean callInit; private final NameMapping nameMapping; - ParquetReadSupport(Schema expectedSchema, ReadSupport readSupport, boolean callInit, NameMapping nameMapping) { + ParquetReadSupport( + Schema expectedSchema, + ReadSupport readSupport, + boolean callInit, + NameMapping nameMapping) { this.expectedSchema = expectedSchema; this.wrapped = readSupport; this.callInit = callInit; @@ -53,7 +56,8 @@ class ParquetReadSupport extends ReadSupport { @Override @SuppressWarnings("deprecation") - public ReadContext init(Configuration configuration, Map keyValueMetaData, MessageType fileSchema) { + public ReadContext init( + Configuration configuration, Map keyValueMetaData, MessageType fileSchema) { // Columns are selected from the Parquet file by taking the read context's message type and // matching to the file's columns by full path, so this must select columns by using the path // in the file's schema. 
@@ -74,11 +78,13 @@ public ReadContext init(Configuration configuration, Map keyValu configuration.set("parquet.avro.write-old-list-structure", "false"); // set Avro schemas in case the reader is Avro - AvroReadSupport.setRequestedProjection(configuration, - AvroSchemaUtil.convert(expectedSchema, projection.getName())); - org.apache.avro.Schema avroReadSchema = AvroSchemaUtil.buildAvroProjection( - AvroSchemaUtil.convert(ParquetSchemaUtil.convert(projection), projection.getName()), - expectedSchema, ImmutableMap.of()); + AvroReadSupport.setRequestedProjection( + configuration, AvroSchemaUtil.convert(expectedSchema, projection.getName())); + org.apache.avro.Schema avroReadSchema = + AvroSchemaUtil.buildAvroProjection( + AvroSchemaUtil.convert(ParquetSchemaUtil.convert(projection), projection.getName()), + expectedSchema, + ImmutableMap.of()); AvroReadSupport.setAvroReadSchema(configuration, ParquetAvro.parquetAvroSchema(avroReadSchema)); // let the context set up read support metadata, but always use the correct projection @@ -88,20 +94,22 @@ public ReadContext init(Configuration configuration, Map keyValu context = wrapped.init(configuration, keyValueMetaData, projection); } catch (UnsupportedOperationException e) { // try the InitContext version - context = wrapped.init(new InitContext( - configuration, makeMultimap(keyValueMetaData), projection)); + context = + wrapped.init( + new InitContext(configuration, makeMultimap(keyValueMetaData), projection)); } } - return new ReadContext(projection, - context != null ? context.getReadSupportMetadata() : ImmutableMap.of()); + return new ReadContext( + projection, context != null ? context.getReadSupportMetadata() : ImmutableMap.of()); } @Override - public RecordMaterializer prepareForRead(Configuration configuration, - Map fileMetadata, - MessageType fileMessageType, - ReadContext readContext) { + public RecordMaterializer prepareForRead( + Configuration configuration, + Map fileMetadata, + MessageType fileMessageType, + ReadContext readContext) { // This is the type created in init that was based on the file's schema. The schema that this // will pass to the wrapped ReadSupport needs to match the expected schema's names. Rather than // renaming the file's schema, convert the expected schema to Parquet. This relies on writing diff --git a/parquet/src/main/java/org/apache/iceberg/parquet/ParquetReader.java b/parquet/src/main/java/org/apache/iceberg/parquet/ParquetReader.java index 29e980def260..c1d8b0ccbbad 100644 --- a/parquet/src/main/java/org/apache/iceberg/parquet/ParquetReader.java +++ b/parquet/src/main/java/org/apache/iceberg/parquet/ParquetReader.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.parquet; import java.io.IOException; @@ -45,9 +44,15 @@ public class ParquetReader extends CloseableGroup implements CloseableIterabl private final boolean caseSensitive; private final NameMapping nameMapping; - public ParquetReader(InputFile input, Schema expectedSchema, ParquetReadOptions options, - Function> readerFunc, NameMapping nameMapping, - Expression filter, boolean reuseContainers, boolean caseSensitive) { + public ParquetReader( + InputFile input, + Schema expectedSchema, + ParquetReadOptions options, + Function> readerFunc, + NameMapping nameMapping, + Expression filter, + boolean reuseContainers, + boolean caseSensitive) { this.input = input; this.expectedSchema = expectedSchema; this.options = options; @@ -63,9 +68,18 @@ public ParquetReader(InputFile input, Schema expectedSchema, ParquetReadOptions private ReadConf init() { if (conf == null) { - ReadConf readConf = new ReadConf<>( - input, options, expectedSchema, filter, readerFunc, null, nameMapping, reuseContainers, - caseSensitive, null); + ReadConf readConf = + new ReadConf<>( + input, + options, + expectedSchema, + filter, + readerFunc, + null, + nameMapping, + reuseContainers, + caseSensitive, + null); this.conf = readConf.copy(); return readConf; } diff --git a/parquet/src/main/java/org/apache/iceberg/parquet/ParquetSchemaUtil.java b/parquet/src/main/java/org/apache/iceberg/parquet/ParquetSchemaUtil.java index 0370f8c02ec7..27ce35882458 100644 --- a/parquet/src/main/java/org/apache/iceberg/parquet/ParquetSchemaUtil.java +++ b/parquet/src/main/java/org/apache/iceberg/parquet/ParquetSchemaUtil.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.parquet; import java.util.List; @@ -36,15 +35,15 @@ public class ParquetSchemaUtil { - private ParquetSchemaUtil() { - } + private ParquetSchemaUtil() {} public static MessageType convert(Schema schema, String name) { return new TypeToMessageType().convert(schema, name); } /** - * Converts a Parquet schema to an Iceberg schema. Fields without IDs are kept and assigned fallback IDs. + * Converts a Parquet schema to an Iceberg schema. Fields without IDs are kept and assigned + * fallback IDs. * * @param parquetSchema a Parquet schema * @return a matching Iceberg schema for the provided Parquet schema @@ -52,7 +51,8 @@ public static MessageType convert(Schema schema, String name) { public static Schema convert(MessageType parquetSchema) { // if the Parquet schema does not contain ids, we assign fallback ids to top-level fields // all remaining fields will get ids >= 1000 to avoid pruning columns without ids - MessageType parquetSchemaWithIds = hasIds(parquetSchema) ? parquetSchema : addFallbackIds(parquetSchema); + MessageType parquetSchemaWithIds = + hasIds(parquetSchema) ? 
parquetSchema : addFallbackIds(parquetSchema); AtomicInteger nextId = new AtomicInteger(1000); return convertInternal(parquetSchemaWithIds, name -> nextId.getAndIncrement()); } @@ -67,7 +67,8 @@ public static Schema convertAndPrune(MessageType parquetSchema) { return convertInternal(parquetSchema, name -> null); } - private static Schema convertInternal(MessageType parquetSchema, Function nameToIdFunc) { + private static Schema convertInternal( + MessageType parquetSchema, Function nameToIdFunc) { MessageTypeToType converter = new MessageTypeToType(nameToIdFunc); return new Schema( ParquetTypeVisitor.visit(parquetSchema, converter).asNestedType().fields(), @@ -82,11 +83,11 @@ public static MessageType pruneColumns(MessageType fileSchema, Schema expectedSc /** * Prunes columns from a Parquet file schema that was written without field ids. - *

-   * Files that were written without field ids are read assuming that schema evolution preserved
+   *
+   * <p>Files that were written without field ids are read assuming that schema evolution preserved
   * column order. Deleting columns was not allowed.
-   * <p>
-   * The order of columns in the resulting Parquet schema matches the Parquet file.
+   *
+   * <p>
    The order of columns in the resulting Parquet schema matches the Parquet file. * * @param fileSchema schema from a Parquet file that does not have field ids. * @param expectedSchema expected schema @@ -178,14 +179,15 @@ static boolean isOldListElementType(GroupType list) { String parentName = list.getName(); return - // For legacy 2-level list types with primitive element type, e.g.: - // - // // ARRAY (nullable list, non-null elements) - // optional group my_list (LIST) { - // repeated int32 element; - // } - // - repeatedType.isPrimitive() || + // For legacy 2-level list types with primitive element type, e.g.: + // + // // ARRAY (nullable list, non-null elements) + // optional group my_list (LIST) { + // repeated int32 element; + // } + // + repeatedType.isPrimitive() + || // For legacy 2-level list types whose element type is a group type with 2 or more fields, // e.g.: // @@ -197,7 +199,8 @@ static boolean isOldListElementType(GroupType list) { // }; // } // - repeatedType.asGroupType().getFieldCount() > 1 || + repeatedType.asGroupType().getFieldCount() > 1 + || // For legacy 2-level list types generated by parquet-avro (Parquet version < 1.6.0), e.g.: // // // ARRAY> (nullable list, non-null elements) @@ -206,7 +209,8 @@ static boolean isOldListElementType(GroupType list) { // required binary str (UTF8); // }; // } - repeatedType.getName().equals("array") || + repeatedType.getName().equals("array") + || // For Parquet data generated by parquet-thrift, e.g.: // // // ARRAY> (nullable list, non-null elements) diff --git a/parquet/src/main/java/org/apache/iceberg/parquet/ParquetTypeVisitor.java b/parquet/src/main/java/org/apache/iceberg/parquet/ParquetTypeVisitor.java index 89483560012d..1f6e8554c463 100644 --- a/parquet/src/main/java/org/apache/iceberg/parquet/ParquetTypeVisitor.java +++ b/parquet/src/main/java/org/apache/iceberg/parquet/ParquetTypeVisitor.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
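
To make the fallback-id behavior of ParquetSchemaUtil.convert described above concrete, here is a small sketch that feeds it a Parquet schema written without field ids; the schema string is an invented example.

    import org.apache.iceberg.Schema;
    import org.apache.iceberg.parquet.ParquetSchemaUtil;
    import org.apache.parquet.schema.MessageType;
    import org.apache.parquet.schema.MessageTypeParser;

    class FallbackIdSketch {
      static Schema convertWithoutIds() {
        // no "= <id>" annotations, so this schema carries no field ids
        MessageType fileType = MessageTypeParser.parseMessageType(
            "message table { required int64 id; optional binary data (UTF8); }");
        // convert() keeps every field and assigns fallback ids;
        // convertAndPrune() would instead prune fields that have no ids
        return ParquetSchemaUtil.convert(fileType);
      }
    }
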
*/ - package org.apache.iceberg.parquet; import java.util.Deque; @@ -34,8 +33,7 @@ public class ParquetTypeVisitor { public static T visit(Type type, ParquetTypeVisitor visitor) { if (type instanceof MessageType) { - return visitor.message((MessageType) type, - visitFields(type.asGroupType(), visitor)); + return visitor.message((MessageType) type, visitFields(type.asGroupType(), visitor)); } else if (type.isPrimitive()) { return visitor.primitive(type.asPrimitiveType()); @@ -61,11 +59,14 @@ public static T visit(Type type, ParquetTypeVisitor visitor) { } private static T visitList(GroupType list, ParquetTypeVisitor visitor) { - Preconditions.checkArgument(list.getFieldCount() == 1, - "Invalid list: does not contain single repeated field: %s", list); + Preconditions.checkArgument( + list.getFieldCount() == 1, + "Invalid list: does not contain single repeated field: %s", + list); Type repeatedElement = list.getFields().get(0); - Preconditions.checkArgument(repeatedElement.isRepetition(Type.Repetition.REPEATED), + Preconditions.checkArgument( + repeatedElement.isRepetition(Type.Repetition.REPEATED), "Invalid list: inner group is not repeated"); Type listElement = ParquetSchemaUtil.determineListElementType(list); @@ -102,15 +103,19 @@ private static T visitListElement(Type listElement, ParquetTypeVisitor vi } private static T visitMap(GroupType map, ParquetTypeVisitor visitor) { - Preconditions.checkArgument(!map.isRepetition(Type.Repetition.REPEATED), - "Invalid map: top-level group is repeated: %s", map); - Preconditions.checkArgument(map.getFieldCount() == 1, - "Invalid map: does not contain single repeated field: %s", map); + Preconditions.checkArgument( + !map.isRepetition(Type.Repetition.REPEATED), + "Invalid map: top-level group is repeated: %s", + map); + Preconditions.checkArgument( + map.getFieldCount() == 1, "Invalid map: does not contain single repeated field: %s", map); GroupType repeatedKeyValue = map.getType(0).asGroupType(); - Preconditions.checkArgument(repeatedKeyValue.isRepetition(Type.Repetition.REPEATED), + Preconditions.checkArgument( + repeatedKeyValue.isRepetition(Type.Repetition.REPEATED), "Invalid map: inner group is not repeated"); - Preconditions.checkArgument(repeatedKeyValue.getFieldCount() <= 2, + Preconditions.checkArgument( + repeatedKeyValue.getFieldCount() <= 2, "Invalid map: repeated group does not have 2 fields"); visitor.beforeRepeatedKeyValue(repeatedKeyValue); diff --git a/parquet/src/main/java/org/apache/iceberg/parquet/ParquetUtil.java b/parquet/src/main/java/org/apache/iceberg/parquet/ParquetUtil.java index 4d5ba2eba8f1..b6d57d073e0e 100644 --- a/parquet/src/main/java/org/apache/iceberg/parquet/ParquetUtil.java +++ b/parquet/src/main/java/org/apache/iceberg/parquet/ParquetUtil.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.parquet; import java.io.IOException; @@ -67,14 +66,14 @@ public class ParquetUtil { // not meant to be instantiated - private ParquetUtil() { - } + private ParquetUtil() {} public static Metrics fileMetrics(InputFile file, MetricsConfig metricsConfig) { return fileMetrics(file, metricsConfig, null); } - public static Metrics fileMetrics(InputFile file, MetricsConfig metricsConfig, NameMapping nameMapping) { + public static Metrics fileMetrics( + InputFile file, MetricsConfig metricsConfig, NameMapping nameMapping) { try (ParquetFileReader reader = ParquetFileReader.open(ParquetIO.file(file))) { return footerMetrics(reader.getFooter(), Stream.empty(), metricsConfig, nameMapping); } catch (IOException e) { @@ -82,14 +81,17 @@ public static Metrics fileMetrics(InputFile file, MetricsConfig metricsConfig, N } } - public static Metrics footerMetrics(ParquetMetadata metadata, Stream> fieldMetrics, - MetricsConfig metricsConfig) { + public static Metrics footerMetrics( + ParquetMetadata metadata, Stream> fieldMetrics, MetricsConfig metricsConfig) { return footerMetrics(metadata, fieldMetrics, metricsConfig, null); } @SuppressWarnings("checkstyle:CyclomaticComplexity") - public static Metrics footerMetrics(ParquetMetadata metadata, Stream> fieldMetrics, - MetricsConfig metricsConfig, NameMapping nameMapping) { + public static Metrics footerMetrics( + ParquetMetadata metadata, + Stream> fieldMetrics, + MetricsConfig metricsConfig, + NameMapping nameMapping) { Preconditions.checkNotNull(fieldMetrics, "fieldMetrics should not be null"); long rowCount = 0; @@ -104,8 +106,8 @@ public static Metrics footerMetrics(ParquetMetadata metadata, Stream> fieldMetricsMap = fieldMetrics.collect( - Collectors.toMap(FieldMetrics::id, Function.identity())); + Map> fieldMetricsMap = + fieldMetrics.collect(Collectors.toMap(FieldMetrics::id, Function.identity())); List blocks = metadata.getBlocks(); for (BlockMetaData block : blocks) { @@ -138,11 +140,13 @@ public static Metrics footerMetrics(ParquetMetadata metadata, Stream min = ParquetConversions.fromParquetPrimitive( - field.type(), column.getPrimitiveType(), stats.genericGetMin()); + Literal min = + ParquetConversions.fromParquetPrimitive( + field.type(), column.getPrimitiveType(), stats.genericGetMin()); updateMin(lowerBounds, fieldId, field.type(), min, metricsMode); - Literal max = ParquetConversions.fromParquetPrimitive( - field.type(), column.getPrimitiveType(), stats.genericGetMax()); + Literal max = + ParquetConversions.fromParquetPrimitive( + field.type(), column.getPrimitiveType(), stats.genericGetMax()); updateMax(upperBounds, fieldId, field.type(), max, metricsMode); } } @@ -159,39 +163,52 @@ public static Metrics footerMetrics(ParquetMetadata metadata, Stream> idToFieldMetricsMap, MetricsConfig metricsConfig, Schema schema, - Map> lowerBounds, Map> upperBounds) { - idToFieldMetricsMap.entrySet().forEach(entry -> { - int fieldId = entry.getKey(); - FieldMetrics metrics = entry.getValue(); - MetricsMode metricsMode = MetricsUtil.metricsMode(schema, metricsConfig, fieldId); - - // only check for MetricsModes.None, since we don't truncate float/double values. 
- if (metricsMode != MetricsModes.None.get()) { - if (!metrics.hasBounds()) { - lowerBounds.remove(fieldId); - upperBounds.remove(fieldId); - } else if (metrics.upperBound() instanceof Float) { - lowerBounds.put(fieldId, Literal.of((Float) metrics.lowerBound())); - upperBounds.put(fieldId, Literal.of((Float) metrics.upperBound())); - } else if (metrics.upperBound() instanceof Double) { - lowerBounds.put(fieldId, Literal.of((Double) metrics.lowerBound())); - upperBounds.put(fieldId, Literal.of((Double) metrics.upperBound())); - } else { - throw new UnsupportedOperationException("Expected only float or double column metrics"); - } - } - }); + Map> idToFieldMetricsMap, + MetricsConfig metricsConfig, + Schema schema, + Map> lowerBounds, + Map> upperBounds) { + idToFieldMetricsMap + .entrySet() + .forEach( + entry -> { + int fieldId = entry.getKey(); + FieldMetrics metrics = entry.getValue(); + MetricsMode metricsMode = MetricsUtil.metricsMode(schema, metricsConfig, fieldId); + + // only check for MetricsModes.None, since we don't truncate float/double values. + if (metricsMode != MetricsModes.None.get()) { + if (!metrics.hasBounds()) { + lowerBounds.remove(fieldId); + upperBounds.remove(fieldId); + } else if (metrics.upperBound() instanceof Float) { + lowerBounds.put(fieldId, Literal.of((Float) metrics.lowerBound())); + upperBounds.put(fieldId, Literal.of((Float) metrics.upperBound())); + } else if (metrics.upperBound() instanceof Double) { + lowerBounds.put(fieldId, Literal.of((Double) metrics.lowerBound())); + upperBounds.put(fieldId, Literal.of((Double) metrics.upperBound())); + } else { + throw new UnsupportedOperationException( + "Expected only float or double column metrics"); + } + } + }); } - private static MessageType getParquetTypeWithIds(ParquetMetadata metadata, NameMapping nameMapping) { + private static MessageType getParquetTypeWithIds( + ParquetMetadata metadata, NameMapping nameMapping) { MessageType type = metadata.getFileMetaData().getSchema(); if (ParquetSchemaUtil.hasIds(type)) { @@ -206,7 +223,8 @@ private static MessageType getParquetTypeWithIds(ParquetMetadata metadata, NameM } /** - * Returns a list of offsets in ascending order determined by the starting position of the row groups. + * Returns a list of offsets in ascending order determined by the starting position of the row + * groups. 
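
A hedged sketch of producing the input for the getSplitOffsets helper documented above from a local file; the parquet-mr reader calls are assumed and the path is a placeholder.

    import java.io.IOException;
    import java.util.List;
    import org.apache.hadoop.conf.Configuration;
    import org.apache.hadoop.fs.Path;
    import org.apache.iceberg.parquet.ParquetUtil;
    import org.apache.parquet.hadoop.ParquetFileReader;
    import org.apache.parquet.hadoop.metadata.ParquetMetadata;
    import org.apache.parquet.hadoop.util.HadoopInputFile;

    class SplitOffsetsSketch {
      static List<Long> splitOffsets() throws IOException {
        try (ParquetFileReader reader = ParquetFileReader.open(
            HadoopInputFile.fromPath(new Path("/tmp/data.parquet"), new Configuration()))) {
          ParquetMetadata footer = reader.getFooter();
          // one entry per row group, ascending by starting position; useful for split planning
          return ParquetUtil.getSplitOffsets(footer);
        }
      }
    }
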
*/ public static List getSplitOffsets(ParquetMetadata md) { List splitOffsets = Lists.newArrayListWithExpectedSize(md.getBlocks().size()); @@ -250,8 +268,12 @@ private static void increment(Map columns, int fieldId, long amou } @SuppressWarnings("unchecked") - private static void updateMin(Map> lowerBounds, int id, Type type, - Literal min, MetricsMode metricsMode) { + private static void updateMin( + Map> lowerBounds, + int id, + Type type, + Literal min, + MetricsMode metricsMode) { Literal currentMin = (Literal) lowerBounds.get(id); if (currentMin == null || min.comparator().compare(min.value(), currentMin.value()) < 0) { if (metricsMode == MetricsModes.Full.get()) { @@ -261,11 +283,13 @@ private static void updateMin(Map> lowerBounds, int id, int truncateLength = truncateMode.length(); switch (type.typeId()) { case STRING: - lowerBounds.put(id, UnicodeUtil.truncateStringMin((Literal) min, truncateLength)); + lowerBounds.put( + id, UnicodeUtil.truncateStringMin((Literal) min, truncateLength)); break; case FIXED: case BINARY: - lowerBounds.put(id, BinaryUtil.truncateBinaryMin((Literal) min, truncateLength)); + lowerBounds.put( + id, BinaryUtil.truncateBinaryMin((Literal) min, truncateLength)); break; default: lowerBounds.put(id, min); @@ -275,8 +299,12 @@ private static void updateMin(Map> lowerBounds, int id, } @SuppressWarnings("unchecked") - private static void updateMax(Map> upperBounds, int id, Type type, - Literal max, MetricsMode metricsMode) { + private static void updateMax( + Map> upperBounds, + int id, + Type type, + Literal max, + MetricsMode metricsMode) { Literal currentMax = (Literal) upperBounds.get(id); if (currentMax == null || max.comparator().compare(max.value(), currentMax.value()) > 0) { if (metricsMode == MetricsModes.Full.get()) { @@ -286,16 +314,16 @@ private static void updateMax(Map> upperBounds, int id, int truncateLength = truncateMode.length(); switch (type.typeId()) { case STRING: - Literal truncatedMaxString = UnicodeUtil.truncateStringMax((Literal) max, - truncateLength); + Literal truncatedMaxString = + UnicodeUtil.truncateStringMax((Literal) max, truncateLength); if (truncatedMaxString != null) { upperBounds.put(id, truncatedMaxString); } break; case FIXED: case BINARY: - Literal truncatedMaxBinary = BinaryUtil.truncateBinaryMax((Literal) max, - truncateLength); + Literal truncatedMaxBinary = + BinaryUtil.truncateBinaryMax((Literal) max, truncateLength); if (truncatedMaxBinary != null) { upperBounds.put(id, truncatedMaxBinary); } @@ -310,7 +338,8 @@ private static void updateMax(Map> upperBounds, int id, private static Map toBufferMap(Schema schema, Map> map) { Map bufferMap = Maps.newHashMap(); for (Map.Entry> entry : map.entrySet()) { - bufferMap.put(entry.getKey(), + bufferMap.put( + entry.getKey(), Conversions.toByteBuffer(schema.findType(entry.getKey()), entry.getValue().value())); } return bufferMap; diff --git a/parquet/src/main/java/org/apache/iceberg/parquet/ParquetValueReader.java b/parquet/src/main/java/org/apache/iceberg/parquet/ParquetValueReader.java index a556849d3519..b6c2b5b70303 100644 --- a/parquet/src/main/java/org/apache/iceberg/parquet/ParquetValueReader.java +++ b/parquet/src/main/java/org/apache/iceberg/parquet/ParquetValueReader.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.parquet; import java.util.List; diff --git a/parquet/src/main/java/org/apache/iceberg/parquet/ParquetValueReaders.java b/parquet/src/main/java/org/apache/iceberg/parquet/ParquetValueReaders.java index baa80393a5ef..b995e1707112 100644 --- a/parquet/src/main/java/org/apache/iceberg/parquet/ParquetValueReaders.java +++ b/parquet/src/main/java/org/apache/iceberg/parquet/ParquetValueReaders.java @@ -16,9 +16,10 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.parquet; +import static java.util.Collections.emptyIterator; + import java.lang.reflect.Array; import java.math.BigDecimal; import java.math.BigInteger; @@ -34,14 +35,11 @@ import org.apache.parquet.io.api.Binary; import org.apache.parquet.schema.Type; -import static java.util.Collections.emptyIterator; - public class ParquetValueReaders { - private ParquetValueReaders() { - } + private ParquetValueReaders() {} - public static ParquetValueReader option(Type type, int definitionLevel, - ParquetValueReader reader) { + public static ParquetValueReader option( + Type type, int definitionLevel, ParquetValueReader reader) { if (type.isRepetition(Type.Repetition.OPTIONAL)) { return new OptionReader<>(definitionLevel, reader); } @@ -64,35 +62,35 @@ public static ParquetValueReader position() { private static class NullReader implements ParquetValueReader { private static final NullReader INSTANCE = new NullReader<>(); private static final ImmutableList> COLUMNS = ImmutableList.of(); - private static final TripleIterator NULL_COLUMN = new TripleIterator() { - @Override - public int currentDefinitionLevel() { - return 0; - } + private static final TripleIterator NULL_COLUMN = + new TripleIterator() { + @Override + public int currentDefinitionLevel() { + return 0; + } - @Override - public int currentRepetitionLevel() { - return 0; - } + @Override + public int currentRepetitionLevel() { + return 0; + } - @Override - public N nextNull() { - return null; - } + @Override + public N nextNull() { + return null; + } - @Override - public boolean hasNext() { - return false; - } + @Override + public boolean hasNext() { + return false; + } - @Override - public Object next() { - return null; - } - }; + @Override + public Object next() { + return null; + } + }; - private NullReader() { - } + private NullReader() {} @Override public T read(T reuse) { @@ -110,8 +108,7 @@ public List> columns() { } @Override - public void setPageSource(PageReadStore pageStore, long rowPosition) { - } + public void setPageSource(PageReadStore pageStore, long rowPosition) {} } static class ConstantReader implements ParquetValueReader { @@ -137,8 +134,7 @@ public List> columns() { } @Override - public void setPageSource(PageReadStore pageStore, long rowPosition) { - } + public void setPageSource(PageReadStore pageStore, long rowPosition) {} } static class PositionReader implements ParquetValueReader { @@ -170,8 +166,10 @@ public void setPageSource(PageReadStore pageStore, long rowPosition) { public abstract static class PrimitiveReader implements ParquetValueReader { private final ColumnDescriptor desc; + @SuppressWarnings("checkstyle:VisibilityModifier") protected final ColumnIterator column; + private final List> children; protected PrimitiveReader(ColumnDescriptor desc) { @@ -400,7 +398,8 @@ public abstract static class RepeatedReader implements ParquetValueRead private final TripleIterator column; private final List> children; - protected RepeatedReader(int definitionLevel, int 
repetitionLevel, ParquetValueReader reader) { + protected RepeatedReader( + int definitionLevel, int repetitionLevel, ParquetValueReader reader) { this.definitionLevel = definitionLevel; this.repetitionLevel = repetitionLevel; this.reader = reader; @@ -457,8 +456,7 @@ public static class ListReader extends RepeatedReader, List, E> { private List lastList = null; private Iterator elements = null; - public ListReader(int definitionLevel, int repetitionLevel, - ParquetValueReader reader) { + public ListReader(int definitionLevel, int repetitionLevel, ParquetValueReader reader) { super(definitionLevel, repetitionLevel, reader); } @@ -511,17 +509,21 @@ public abstract static class RepeatedKeyValueReader implements Parqu private final TripleIterator column; private final List> children; - protected RepeatedKeyValueReader(int definitionLevel, int repetitionLevel, - ParquetValueReader keyReader, ParquetValueReader valueReader) { + protected RepeatedKeyValueReader( + int definitionLevel, + int repetitionLevel, + ParquetValueReader keyReader, + ParquetValueReader valueReader) { this.definitionLevel = definitionLevel; this.repetitionLevel = repetitionLevel; this.keyReader = keyReader; this.valueReader = valueReader; this.column = keyReader.column(); - this.children = ImmutableList.>builder() - .addAll(keyReader.columns()) - .addAll(valueReader.columns()) - .build(); + this.children = + ImmutableList.>builder() + .addAll(keyReader.columns()) + .addAll(valueReader.columns()) + .build(); } @Override @@ -576,9 +578,11 @@ public static class MapReader extends RepeatedKeyValueReader, Ma private Map lastMap = null; private Iterator> pairs = null; - public MapReader(int definitionLevel, int repetitionLevel, - ParquetValueReader keyReader, - ParquetValueReader valueReader) { + public MapReader( + int definitionLevel, + int repetitionLevel, + ParquetValueReader keyReader, + ParquetValueReader valueReader) { super(definitionLevel, repetitionLevel, keyReader, valueReader); } @@ -627,9 +631,9 @@ public static class ReusableEntry implements Map.Entry { private K key = null; private V value = null; - public void set(K newKey, V newValue) { + public void set(K newKey, V newValue) { this.key = newKey; - this.value = newValue; + this.value = newValue; } @Override @@ -661,9 +665,10 @@ private interface Setter { @SuppressWarnings("unchecked") protected StructReader(List types, List> readers) { - this.readers = (ParquetValueReader[]) Array.newInstance( - ParquetValueReader.class, readers.size()); - TripleIterator[] columns = (TripleIterator[]) Array.newInstance(TripleIterator.class, readers.size()); + this.readers = + (ParquetValueReader[]) Array.newInstance(ParquetValueReader.class, readers.size()); + TripleIterator[] columns = + (TripleIterator[]) Array.newInstance(TripleIterator.class, readers.size()); Setter[] setters = (Setter[]) Array.newInstance(Setter.class, readers.size()); ImmutableList.Builder> columnsBuilder = ImmutableList.builder(); @@ -711,7 +716,7 @@ public List> columns() { @SuppressWarnings("unchecked") private Setter newSetter(ParquetValueReader reader, Type type) { if (reader instanceof UnboxedReader && type.isPrimitive()) { - UnboxedReader unboxed = (UnboxedReader) reader; + UnboxedReader unboxed = (UnboxedReader) reader; switch (type.asPrimitiveType().getPrimitiveTypeName()) { case BOOLEAN: return (record, pos, ignored) -> setBoolean(record, pos, unboxed.readBoolean()); @@ -756,8 +761,8 @@ private E get(I intermediate, int pos) { /** * Used to set a struct value by position. - *
<p>
    - * To avoid boxing, override {@link #setInteger(Object, int, int)} and similar methods. + * + * <p>
    To avoid boxing, override {@link #setInteger(Object, int, int)} and similar methods. * * @param struct a struct object created by {@link #newStructData(Object)} * @param pos the position in the struct to set diff --git a/parquet/src/main/java/org/apache/iceberg/parquet/ParquetValueWriter.java b/parquet/src/main/java/org/apache/iceberg/parquet/ParquetValueWriter.java index fa9dcb7d0237..8a61879b5246 100644 --- a/parquet/src/main/java/org/apache/iceberg/parquet/ParquetValueWriter.java +++ b/parquet/src/main/java/org/apache/iceberg/parquet/ParquetValueWriter.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.parquet; import java.util.List; @@ -31,11 +30,8 @@ public interface ParquetValueWriter { void setColumnStore(ColumnWriteStore columnStore); - /** - * Returns a stream of {@link FieldMetrics} that this ParquetValueWriter keeps track of. - */ + /** Returns a stream of {@link FieldMetrics} that this ParquetValueWriter keeps track of. */ default Stream> metrics() { return Stream.empty(); } } - diff --git a/parquet/src/main/java/org/apache/iceberg/parquet/ParquetValueWriters.java b/parquet/src/main/java/org/apache/iceberg/parquet/ParquetValueWriters.java index 1911d40467c6..b2d91c99ef44 100644 --- a/parquet/src/main/java/org/apache/iceberg/parquet/ParquetValueWriters.java +++ b/parquet/src/main/java/org/apache/iceberg/parquet/ParquetValueWriters.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.parquet; import java.lang.reflect.Array; @@ -45,12 +44,10 @@ import org.apache.parquet.schema.Type; public class ParquetValueWriters { - private ParquetValueWriters() { - } + private ParquetValueWriters() {} - public static ParquetValueWriter option(Type type, - int definitionLevel, - ParquetValueWriter writer) { + public static ParquetValueWriter option( + Type type, int definitionLevel, ParquetValueWriter writer) { if (type.isRepetition(Type.Repetition.OPTIONAL)) { return new OptionWriter<>(definitionLevel, writer); } @@ -90,18 +87,18 @@ public static PrimitiveWriter strings(ColumnDescriptor desc) { return new StringWriter(desc); } - public static PrimitiveWriter decimalAsInteger(ColumnDescriptor desc, - int precision, int scale) { + public static PrimitiveWriter decimalAsInteger( + ColumnDescriptor desc, int precision, int scale) { return new IntegerDecimalWriter(desc, precision, scale); } - public static PrimitiveWriter decimalAsLong(ColumnDescriptor desc, - int precision, int scale) { + public static PrimitiveWriter decimalAsLong( + ColumnDescriptor desc, int precision, int scale) { return new LongDecimalWriter(desc, precision, scale); } - public static PrimitiveWriter decimalAsFixed(ColumnDescriptor desc, - int precision, int scale) { + public static PrimitiveWriter decimalAsFixed( + ColumnDescriptor desc, int precision, int scale) { return new FixedDecimalWriter(desc, precision, scale); } @@ -113,15 +110,15 @@ public static CollectionWriter collections(int dl, int rl, ParquetValueWr return new CollectionWriter<>(dl, rl, writer); } - public static MapWriter maps(int dl, int rl, - ParquetValueWriter keyWriter, - ParquetValueWriter valueWriter) { + public static MapWriter maps( + int dl, int rl, ParquetValueWriter keyWriter, ParquetValueWriter valueWriter) { return new MapWriter<>(dl, rl, keyWriter, valueWriter); } public abstract static class PrimitiveWriter implements ParquetValueWriter { 
@SuppressWarnings("checkstyle:VisibilityModifier") protected final ColumnWriter column; + private final List> children; protected PrimitiveWriter(ColumnDescriptor desc) { @@ -158,7 +155,7 @@ public void writeInteger(int repetitionLevel, int value) { column.writeInteger(repetitionLevel, value); } - public void writeLong(int repetitionLevel, long value) { + public void writeLong(int repetitionLevel, long value) { column.writeLong(repetitionLevel, value); } @@ -247,10 +244,18 @@ private IntegerDecimalWriter(ColumnDescriptor desc, int precision, int scale) { @Override public void write(int repetitionLevel, BigDecimal decimal) { - Preconditions.checkArgument(decimal.scale() == scale, - "Cannot write value as decimal(%s,%s), wrong scale: %s", precision, scale, decimal); - Preconditions.checkArgument(decimal.precision() <= precision, - "Cannot write value as decimal(%s,%s), too large: %s", precision, scale, decimal); + Preconditions.checkArgument( + decimal.scale() == scale, + "Cannot write value as decimal(%s,%s), wrong scale: %s", + precision, + scale, + decimal); + Preconditions.checkArgument( + decimal.precision() <= precision, + "Cannot write value as decimal(%s,%s), too large: %s", + precision, + scale, + decimal); column.writeInteger(repetitionLevel, decimal.unscaledValue().intValue()); } @@ -268,10 +273,18 @@ private LongDecimalWriter(ColumnDescriptor desc, int precision, int scale) { @Override public void write(int repetitionLevel, BigDecimal decimal) { - Preconditions.checkArgument(decimal.scale() == scale, - "Cannot write value as decimal(%s,%s), wrong scale: %s", precision, scale, decimal); - Preconditions.checkArgument(decimal.precision() <= precision, - "Cannot write value as decimal(%s,%s), too large: %s", precision, scale, decimal); + Preconditions.checkArgument( + decimal.scale() == scale, + "Cannot write value as decimal(%s,%s), wrong scale: %s", + precision, + scale, + decimal); + Preconditions.checkArgument( + decimal.precision() <= precision, + "Cannot write value as decimal(%s,%s), too large: %s", + precision, + scale, + decimal); column.writeLong(repetitionLevel, decimal.unscaledValue().longValue()); } @@ -286,7 +299,8 @@ private FixedDecimalWriter(ColumnDescriptor desc, int precision, int scale) { super(desc); this.precision = precision; this.scale = scale; - this.bytes = ThreadLocal.withInitial(() -> new byte[TypeUtil.decimalRequiredBytes(precision)]); + this.bytes = + ThreadLocal.withInitial(() -> new byte[TypeUtil.decimalRequiredBytes(precision)]); } @Override @@ -316,8 +330,8 @@ private StringWriter(ColumnDescriptor desc) { public void write(int repetitionLevel, CharSequence value) { if (value instanceof Utf8) { Utf8 utf8 = (Utf8) value; - column.writeBinary(repetitionLevel, - Binary.fromReusedByteArray(utf8.getBytes(), 0, utf8.getByteLength())); + column.writeBinary( + repetitionLevel, Binary.fromReusedByteArray(utf8.getBytes(), 0, utf8.getByteLength())); } else { column.writeBinary(repetitionLevel, Binary.fromString(value.toString())); } @@ -362,7 +376,8 @@ public void setColumnStore(ColumnWriteStore columnStore) { @Override public Stream> metrics() { if (writer instanceof PrimitiveWriter) { - List> fieldMetricsFromWriter = writer.metrics().collect(Collectors.toList()); + List> fieldMetricsFromWriter = + writer.metrics().collect(Collectors.toList()); if (fieldMetricsFromWriter.size() == 0) { // we are not tracking field metrics for this type ourselves @@ -370,19 +385,24 @@ public Stream> metrics() { } else if (fieldMetricsFromWriter.size() == 1) { FieldMetrics 
metrics = fieldMetricsFromWriter.get(0); return Stream.of( - new FieldMetrics<>(metrics.id(), - metrics.valueCount() + nullValueCount, nullValueCount, - metrics.nanValueCount(), metrics.lowerBound(), metrics.upperBound()) - ); + new FieldMetrics<>( + metrics.id(), + metrics.valueCount() + nullValueCount, + nullValueCount, + metrics.nanValueCount(), + metrics.lowerBound(), + metrics.upperBound())); } else { - throw new IllegalStateException(String.format( - "OptionWriter should only expect at most one field metric from a primitive writer." + - "Current number of fields: %s, primitive writer type: %s", - fieldMetricsFromWriter.size(), writer.getClass().getSimpleName())); + throw new IllegalStateException( + String.format( + "OptionWriter should only expect at most one field metric from a primitive writer." + + "Current number of fields: %s, primitive writer type: %s", + fieldMetricsFromWriter.size(), writer.getClass().getSimpleName())); } } - // skipping updating null stats for non-primitive types since we don't use them today, to avoid unnecessary work + // skipping updating null stats for non-primitive types since we don't use them today, to + // avoid unnecessary work return writer.metrics(); } } @@ -393,8 +413,8 @@ public abstract static class RepeatedWriter implements ParquetValueWriter< private final ParquetValueWriter writer; private final List> children; - protected RepeatedWriter(int definitionLevel, int repetitionLevel, - ParquetValueWriter writer) { + protected RepeatedWriter( + int definitionLevel, int repetitionLevel, ParquetValueWriter writer) { this.definitionLevel = definitionLevel; this.repetitionLevel = repetitionLevel; this.writer = writer; @@ -447,8 +467,8 @@ public Stream> metrics() { } private static class CollectionWriter extends RepeatedWriter, E> { - private CollectionWriter(int definitionLevel, int repetitionLevel, - ParquetValueWriter writer) { + private CollectionWriter( + int definitionLevel, int repetitionLevel, ParquetValueWriter writer) { super(definitionLevel, repetitionLevel, writer); } @@ -465,17 +485,20 @@ public abstract static class RepeatedKeyValueWriter implements ParquetV private final ParquetValueWriter valueWriter; private final List> children; - protected RepeatedKeyValueWriter(int definitionLevel, int repetitionLevel, - ParquetValueWriter keyWriter, - ParquetValueWriter valueWriter) { + protected RepeatedKeyValueWriter( + int definitionLevel, + int repetitionLevel, + ParquetValueWriter keyWriter, + ParquetValueWriter valueWriter) { this.definitionLevel = definitionLevel; this.repetitionLevel = repetitionLevel; this.keyWriter = keyWriter; this.valueWriter = valueWriter; - this.children = ImmutableList.>builder() - .addAll(keyWriter.columns()) - .addAll(valueWriter.columns()) - .build(); + this.children = + ImmutableList.>builder() + .addAll(keyWriter.columns()) + .addAll(valueWriter.columns()) + .build(); } @Override @@ -525,8 +548,11 @@ public Stream> metrics() { } private static class MapWriter extends RepeatedKeyValueWriter, K, V> { - private MapWriter(int definitionLevel, int repetitionLevel, - ParquetValueWriter keyWriter, ParquetValueWriter valueWriter) { + private MapWriter( + int definitionLevel, + int repetitionLevel, + ParquetValueWriter keyWriter, + ParquetValueWriter valueWriter) { super(definitionLevel, repetitionLevel, keyWriter, valueWriter); } @@ -542,8 +568,9 @@ public abstract static class StructWriter implements ParquetValueWriter { @SuppressWarnings("unchecked") protected StructWriter(List> writers) { - this.writers = 
(ParquetValueWriter[]) Array.newInstance( - ParquetValueWriter.class, writers.size()); + this.writers = + (ParquetValueWriter[]) + Array.newInstance(ParquetValueWriter.class, writers.size()); ImmutableList.Builder> columnsBuilder = ImmutableList.builder(); for (int i = 0; i < writers.size(); i += 1) { @@ -586,7 +613,8 @@ public Stream> metrics() { public static class PositionDeleteStructWriter extends StructWriter> { private final Function pathTransformFunc; - public PositionDeleteStructWriter(StructWriter replacedWriter, Function pathTransformFunc) { + public PositionDeleteStructWriter( + StructWriter replacedWriter, Function pathTransformFunc) { super(Arrays.asList(replacedWriter.writers)); this.pathTransformFunc = pathTransformFunc; } diff --git a/parquet/src/main/java/org/apache/iceberg/parquet/ParquetWriteAdapter.java b/parquet/src/main/java/org/apache/iceberg/parquet/ParquetWriteAdapter.java index 981c0af3a0ca..049fed0decde 100644 --- a/parquet/src/main/java/org/apache/iceberg/parquet/ParquetWriteAdapter.java +++ b/parquet/src/main/java/org/apache/iceberg/parquet/ParquetWriteAdapter.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.parquet; import java.io.IOException; @@ -31,8 +30,9 @@ import org.apache.parquet.hadoop.metadata.ParquetMetadata; /** - * Parquet writer that wraps around hadoop's {@link ParquetWriter}. - * It shouldn't be used in production; {@link org.apache.iceberg.parquet.ParquetWriter} is a better alternative. + * Parquet writer that wraps around hadoop's {@link ParquetWriter}. It shouldn't be used in + * production; {@link org.apache.iceberg.parquet.ParquetWriter} is a better alternative. + * * @deprecated use {@link org.apache.iceberg.parquet.ParquetWriter} */ @Deprecated @@ -60,7 +60,7 @@ public Metrics metrics() { Preconditions.checkState(footer != null, "Cannot produce metrics until closed"); // Note: Metrics reported by this method do not contain a full set of available metrics. // Specifically, it lacks metrics not included in Parquet file's footer (e.g. NaN count) - return ParquetUtil.footerMetrics(footer, Stream.empty(), metricsConfig); + return ParquetUtil.footerMetrics(footer, Stream.empty(), metricsConfig); } @Override diff --git a/parquet/src/main/java/org/apache/iceberg/parquet/ParquetWriteSupport.java b/parquet/src/main/java/org/apache/iceberg/parquet/ParquetWriteSupport.java index 2990bd11356a..90bcea28a219 100644 --- a/parquet/src/main/java/org/apache/iceberg/parquet/ParquetWriteSupport.java +++ b/parquet/src/main/java/org/apache/iceberg/parquet/ParquetWriteSupport.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.parquet; import java.util.Map; @@ -31,7 +30,8 @@ class ParquetWriteSupport extends WriteSupport { private final Map keyValueMetadata; private final WriteSupport wrapped; - ParquetWriteSupport(MessageType type, Map keyValueMetadata, WriteSupport writeSupport) { + ParquetWriteSupport( + MessageType type, Map keyValueMetadata, WriteSupport writeSupport) { this.type = type; this.keyValueMetadata = keyValueMetadata; this.wrapped = writeSupport; @@ -40,10 +40,11 @@ class ParquetWriteSupport extends WriteSupport { @Override public WriteContext init(Configuration configuration) { WriteContext wrappedContext = wrapped.init(configuration); - Map metadata = ImmutableMap.builder() - .putAll(keyValueMetadata) - .putAll(wrappedContext.getExtraMetaData()) - .build(); + Map metadata = + ImmutableMap.builder() + .putAll(keyValueMetadata) + .putAll(wrappedContext.getExtraMetaData()) + .build(); return new WriteContext(type, metadata); } diff --git a/parquet/src/main/java/org/apache/iceberg/parquet/ParquetWriter.java b/parquet/src/main/java/org/apache/iceberg/parquet/ParquetWriter.java index 93e01a83c684..db49efe61dff 100644 --- a/parquet/src/main/java/org/apache/iceberg/parquet/ParquetWriter.java +++ b/parquet/src/main/java/org/apache/iceberg/parquet/ParquetWriter.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.parquet; import java.io.Closeable; @@ -49,19 +48,21 @@ class ParquetWriter implements FileAppender, Closeable { private static final Metrics EMPTY_METRICS = new Metrics(0L, null, null, null, null); - private static final DynConstructors.Ctor pageStoreCtorParquet = DynConstructors - .builder(PageWriteStore.class) - .hiddenImpl("org.apache.parquet.hadoop.ColumnChunkPageWriteStore", - CodecFactory.BytesCompressor.class, - MessageType.class, - ByteBufferAllocator.class, - int.class) - .build(); + private static final DynConstructors.Ctor pageStoreCtorParquet = + DynConstructors.builder(PageWriteStore.class) + .hiddenImpl( + "org.apache.parquet.hadoop.ColumnChunkPageWriteStore", + CodecFactory.BytesCompressor.class, + MessageType.class, + ByteBufferAllocator.class, + int.class) + .build(); - private static final DynMethods.UnboundMethod flushToWriter = DynMethods - .builder("flushToFileWriter") - .hiddenImpl("org.apache.parquet.hadoop.ColumnChunkPageWriteStore", ParquetFileWriter.class) - .build(); + private static final DynMethods.UnboundMethod flushToWriter = + DynMethods.builder("flushToFileWriter") + .hiddenImpl( + "org.apache.parquet.hadoop.ColumnChunkPageWriteStore", ParquetFileWriter.class) + .build(); private final long targetRowGroupSize; private final Map metadata; @@ -86,13 +87,17 @@ class ParquetWriter implements FileAppender, Closeable { private static final int DEFAULT_COLUMN_INDEX_TRUNCATE_LENGTH = 64; @SuppressWarnings("unchecked") - ParquetWriter(Configuration conf, OutputFile output, Schema schema, long rowGroupSize, - Map metadata, - Function> createWriterFunc, - CompressionCodecName codec, - ParquetProperties properties, - MetricsConfig metricsConfig, - ParquetFileWriter.Mode writeMode) { + ParquetWriter( + Configuration conf, + OutputFile output, + Schema schema, + long rowGroupSize, + Map metadata, + Function> createWriterFunc, + CompressionCodecName codec, + ParquetProperties properties, + MetricsConfig metricsConfig, + ParquetFileWriter.Mode writeMode) { this.targetRowGroupSize = rowGroupSize; this.props = properties; this.metadata = ImmutableMap.copyOf(metadata); 
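One note on the ParquetWriteSupport hunk above: its init method merges Iceberg's own key/value metadata with the metadata returned by the wrapped WriteSupport through Guava's ImmutableMap builder. The minimal, self-contained sketch below only illustrates that builder pattern in isolation; the class name and metadata keys are invented for the example and are not part of this change.

import com.google.common.collect.ImmutableMap;
import java.util.Map;

public class MetadataMergeSketch {
  public static void main(String[] args) {
    // Hypothetical metadata maps; in the real write path these would come from Iceberg's
    // key/value metadata and the wrapped WriteSupport's WriteContext, respectively.
    Map<String, String> fileMetadata = ImmutableMap.of("example.table.owner", "analytics");
    Map<String, String> wrappedMetadata = ImmutableMap.of("example.writer.version", "1");

    // ImmutableMap.Builder rejects duplicate keys at build() time, so the two sources are
    // expected to contribute disjoint keys when combined this way.
    Map<String, String> merged =
        ImmutableMap.<String, String>builder()
            .putAll(fileMetadata)
            .putAll(wrappedMetadata)
            .build();

    merged.forEach((key, value) -> System.out.println(key + " = " + value));
  }
}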
@@ -100,7 +105,8 @@ class ParquetWriter implements FileAppender, Closeable { this.parquetSchema = ParquetSchemaUtil.convert(schema, "table"); this.model = (ParquetValueWriter) createWriterFunc.apply(parquetSchema); this.metricsConfig = metricsConfig; - this.columnIndexTruncateLength = conf.getInt(COLUMN_INDEX_TRUNCATE_LENGTH, DEFAULT_COLUMN_INDEX_TRUNCATE_LENGTH); + this.columnIndexTruncateLength = + conf.getInt(COLUMN_INDEX_TRUNCATE_LENGTH, DEFAULT_COLUMN_INDEX_TRUNCATE_LENGTH); this.writeMode = writeMode; this.output = output; this.conf = conf; @@ -111,8 +117,9 @@ class ParquetWriter implements FileAppender, Closeable { private void ensureWriterInitialized() { if (writer == null) { try { - this.writer = new ParquetFileWriter( - ParquetIO.file(output, conf), parquetSchema, writeMode, targetRowGroupSize, 0); + this.writer = + new ParquetFileWriter( + ParquetIO.file(output, conf), parquetSchema, writeMode, targetRowGroupSize, 0); } catch (IOException e) { throw new UncheckedIOException("Failed to create Parquet file", e); } @@ -144,12 +151,12 @@ public Metrics metrics() { /** * Returns the approximate length of the output file produced by this writer. - *
<p>
    - * Prior to calling {@link ParquetWriter#close}, the result is approximate. After calling close, the length is - * exact. * - * @return the approximate length of the output file produced by this writer or the exact length if this writer is - * closed. + *
<p>
    Prior to calling {@link ParquetWriter#close}, the result is approximate. After calling + * close, the length is exact. + * + * @return the approximate length of the output file produced by this writer or the exact length + * if this writer is closed. */ @Override public long length() { @@ -161,7 +168,8 @@ public long length() { } if (!closed && recordCount > 0) { - // recordCount > 0 when there are records in the write store that have not been flushed to the Parquet file + // recordCount > 0 when there are records in the write store that have not been flushed to + // the Parquet file length += writeStore.getBufferedSize(); } @@ -190,8 +198,11 @@ private void checkSize() { } else { long remainingSpace = targetRowGroupSize - bufferedSize; long remainingRecords = (long) (remainingSpace / avgRecordSize); - this.nextCheckRecordCount = recordCount + Math.min(Math.max(remainingRecords / 2, - props.getMinRowCountForPageSizeCheck()), props.getMaxRowCountForPageSizeCheck()); + this.nextCheckRecordCount = + recordCount + + Math.min( + Math.max(remainingRecords / 2, props.getMinRowCountForPageSizeCheck()), + props.getMaxRowCountForPageSizeCheck()); } } } @@ -217,15 +228,19 @@ private void flushRowGroup(boolean finished) { private void startRowGroup() { Preconditions.checkState(!closed, "Writer is closed"); - this.nextCheckRecordCount = Math.min(Math.max(recordCount / 2, - props.getMinRowCountForPageSizeCheck()), props.getMaxRowCountForPageSizeCheck()); + this.nextCheckRecordCount = + Math.min( + Math.max(recordCount / 2, props.getMinRowCountForPageSizeCheck()), + props.getMaxRowCountForPageSizeCheck()); this.recordCount = 0; - PageWriteStore pageStore = pageStoreCtorParquet.newInstance( - compressor, parquetSchema, props.getAllocator(), this.columnIndexTruncateLength); + PageWriteStore pageStore = + pageStoreCtorParquet.newInstance( + compressor, parquetSchema, props.getAllocator(), this.columnIndexTruncateLength); this.flushPageStoreToWriter = flushToWriter.bind(pageStore); - this.writeStore = props.newColumnWriteStore(parquetSchema, pageStore, (BloomFilterWriteStore) pageStore); + this.writeStore = + props.newColumnWriteStore(parquetSchema, pageStore, (BloomFilterWriteStore) pageStore); model.setColumnStore(writeStore); } diff --git a/parquet/src/main/java/org/apache/iceberg/parquet/PruneColumns.java b/parquet/src/main/java/org/apache/iceberg/parquet/PruneColumns.java index 2232d9a75ef4..f4f50f1f3efa 100644 --- a/parquet/src/main/java/org/apache/iceberg/parquet/PruneColumns.java +++ b/parquet/src/main/java/org/apache/iceberg/parquet/PruneColumns.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.parquet; import java.util.Collections; @@ -137,7 +136,8 @@ public Type map(GroupType map, Type key, Type value) { Integer keyId = getId(originalKey); Integer valueId = getId(originalValue); - if ((keyId != null && selectedIds.contains(keyId)) || (valueId != null && selectedIds.contains(valueId))) { + if ((keyId != null && selectedIds.contains(keyId)) + || (valueId != null && selectedIds.contains(valueId))) { return map; } else if (value != null) { if (!Objects.equal(value, originalValue)) { @@ -164,8 +164,8 @@ private boolean isStruct(Type field) { } else { GroupType groupType = field.asGroupType(); LogicalTypeAnnotation logicalTypeAnnotation = groupType.getLogicalTypeAnnotation(); - return !LogicalTypeAnnotation.mapType().equals(logicalTypeAnnotation) && - !LogicalTypeAnnotation.listType().equals(logicalTypeAnnotation); + return !LogicalTypeAnnotation.mapType().equals(logicalTypeAnnotation) + && !LogicalTypeAnnotation.listType().equals(logicalTypeAnnotation); } } } diff --git a/parquet/src/main/java/org/apache/iceberg/parquet/ReadConf.java b/parquet/src/main/java/org/apache/iceberg/parquet/ReadConf.java index 9153ce0fdfa0..7bb89a30f8e9 100644 --- a/parquet/src/main/java/org/apache/iceberg/parquet/ReadConf.java +++ b/parquet/src/main/java/org/apache/iceberg/parquet/ReadConf.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.parquet; import java.io.IOException; @@ -65,10 +64,17 @@ class ReadConf { private final List> columnChunkMetaDataForRowGroups; @SuppressWarnings("unchecked") - ReadConf(InputFile file, ParquetReadOptions options, Schema expectedSchema, Expression filter, - Function> readerFunc, Function> batchedReaderFunc, NameMapping nameMapping, boolean reuseContainers, - boolean caseSensitive, Integer bSize) { + ReadConf( + InputFile file, + ParquetReadOptions options, + Schema expectedSchema, + Expression filter, + Function> readerFunc, + Function> batchedReaderFunc, + NameMapping nameMapping, + boolean reuseContainers, + boolean caseSensitive, + Integer bSize) { this.file = file; this.options = options; this.reader = newReader(file, options); @@ -105,11 +111,15 @@ class ReadConf { long computedTotalValues = 0L; for (int i = 0; i < shouldSkip.length; i += 1) { BlockMetaData rowGroup = rowGroups.get(i); - startRowPositions[i] = offsetToStartPos == null ? 0 : offsetToStartPos.get(rowGroup.getStartingPos()); - boolean shouldRead = filter == null || ( - statsFilter.shouldRead(typeWithIds, rowGroup) && - dictFilter.shouldRead(typeWithIds, rowGroup, reader.getDictionaryReader(rowGroup)) && - bloomFilter.shouldRead(typeWithIds, rowGroup, reader.getBloomFilterDataReader(rowGroup))); + startRowPositions[i] = + offsetToStartPos == null ? 
0 : offsetToStartPos.get(rowGroup.getStartingPos()); + boolean shouldRead = + filter == null + || (statsFilter.shouldRead(typeWithIds, rowGroup) + && dictFilter.shouldRead( + typeWithIds, rowGroup, reader.getDictionaryReader(rowGroup)) + && bloomFilter.shouldRead( + typeWithIds, rowGroup, reader.getBloomFilterDataReader(rowGroup))); this.shouldSkip[i] = !shouldRead; if (shouldRead) { computedTotalValues += rowGroup.getRowCount(); @@ -225,16 +235,21 @@ private static ParquetFileReader newReader(InputFile file, ParquetReadOptions op } private List> getColumnChunkMetadataForRowGroups() { - Set projectedColumns = projection.getColumns().stream() - .map(columnDescriptor -> ColumnPath.get(columnDescriptor.getPath())).collect(Collectors.toSet()); - ImmutableList.Builder> listBuilder = ImmutableList.builder(); + Set projectedColumns = + projection.getColumns().stream() + .map(columnDescriptor -> ColumnPath.get(columnDescriptor.getPath())) + .collect(Collectors.toSet()); + ImmutableList.Builder> listBuilder = + ImmutableList.builder(); for (int i = 0; i < rowGroups.size(); i++) { if (!shouldSkip[i]) { BlockMetaData blockMetaData = rowGroups.get(i); ImmutableMap.Builder mapBuilder = ImmutableMap.builder(); blockMetaData.getColumns().stream() .filter(columnChunkMetaData -> projectedColumns.contains(columnChunkMetaData.getPath())) - .forEach(columnChunkMetaData -> mapBuilder.put(columnChunkMetaData.getPath(), columnChunkMetaData)); + .forEach( + columnChunkMetaData -> + mapBuilder.put(columnChunkMetaData.getPath(), columnChunkMetaData)); listBuilder.add(mapBuilder.build()); } else { listBuilder.add(ImmutableMap.of()); diff --git a/parquet/src/main/java/org/apache/iceberg/parquet/RemoveIds.java b/parquet/src/main/java/org/apache/iceberg/parquet/RemoveIds.java index 6096f0f5b1dc..21479e7ae381 100644 --- a/parquet/src/main/java/org/apache/iceberg/parquet/RemoveIds.java +++ b/parquet/src/main/java/org/apache/iceberg/parquet/RemoveIds.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.parquet; import java.util.List; @@ -48,17 +47,12 @@ public Type struct(GroupType struct, List fields) { @Override public Type list(GroupType array, Type item) { - return Types.list(array.getRepetition()) - .element(item) - .named(array.getName()); + return Types.list(array.getRepetition()).element(item).named(array.getName()); } @Override public Type map(GroupType map, Type key, Type value) { - return Types.map(map.getRepetition()) - .key(key) - .value(value) - .named(map.getName()); + return Types.map(map.getRepetition()).key(key).value(value).named(map.getName()); } @Override @@ -72,5 +66,4 @@ public Type primitive(PrimitiveType primitive) { public static MessageType removeIds(MessageType type) { return (MessageType) ParquetTypeVisitor.visit(type, new RemoveIds()); } - } diff --git a/parquet/src/main/java/org/apache/iceberg/parquet/TripleIterator.java b/parquet/src/main/java/org/apache/iceberg/parquet/TripleIterator.java index db8d68a549c9..5a833d4c4447 100644 --- a/parquet/src/main/java/org/apache/iceberg/parquet/TripleIterator.java +++ b/parquet/src/main/java/org/apache/iceberg/parquet/TripleIterator.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.parquet; import java.util.Iterator; @@ -25,8 +24,8 @@ interface TripleIterator extends Iterator { /** * Returns the definition level from the current triple. - *
<p>
    - * This method does not advance this iterator. + * + * <p>
    This method does not advance this iterator. * * @return the definition level of the current triple. * @throws java.util.NoSuchElementException if there are no more elements @@ -35,8 +34,8 @@ interface TripleIterator extends Iterator { /** * Returns the repetition level from the current triple or 0 if there are no more elements. - * <p>
    - * This method does not advance this iterator. + * + * <p>
    This method does not advance this iterator. * * @return the repetition level of the current triple, or 0 if there is no current triple. * @throws java.util.NoSuchElementException if there are no more elements @@ -45,8 +44,8 @@ interface TripleIterator extends Iterator { /** * Returns the next value as an un-boxed boolean. - * <p>
    - * This method has the same behavior as {@link #next()} and will advance this iterator. + * + * <p>
    This method has the same behavior as {@link #next()} and will advance this iterator. * * @return the next value as an un-boxed boolean * @throws java.util.NoSuchElementException if there are no more elements @@ -58,8 +57,8 @@ default boolean nextBoolean() { /** * Returns the next value as an un-boxed int. - * <p>
    - * This method has the same behavior as {@link #next()} and will advance this iterator. + * + * <p>
    This method has the same behavior as {@link #next()} and will advance this iterator. * * @return the next value as an un-boxed int * @throws java.util.NoSuchElementException if there are no more elements @@ -71,8 +70,8 @@ default int nextInteger() { /** * Returns the next value as an un-boxed long. - * <p>
    - * This method has the same behavior as {@link #next()} and will advance this iterator. + * + * <p>
    This method has the same behavior as {@link #next()} and will advance this iterator. * * @return the next value as an un-boxed long * @throws java.util.NoSuchElementException if there are no more elements @@ -84,8 +83,8 @@ default long nextLong() { /** * Returns the next value as an un-boxed float. - * <p>
    - * This method has the same behavior as {@link #next()} and will advance this iterator. + * + * <p>
    This method has the same behavior as {@link #next()} and will advance this iterator. * * @return the next value as an un-boxed float * @throws java.util.NoSuchElementException if there are no more elements @@ -97,8 +96,8 @@ default float nextFloat() { /** * Returns the next value as an un-boxed double. - * <p>
    - * This method has the same behavior as {@link #next()} and will advance this iterator. + * + * <p>
    This method has the same behavior as {@link #next()} and will advance this iterator. * * @return the next value as an un-boxed double * @throws java.util.NoSuchElementException if there are no more elements @@ -110,8 +109,8 @@ default double nextDouble() { /** * Returns the next value as a Binary. - * <p>
    - * This method has the same behavior as {@link #next()} and will advance this iterator. + * + * <p>
    This method has the same behavior as {@link #next()} and will advance this iterator. * * @return the next value as a Binary * @throws java.util.NoSuchElementException if there are no more elements @@ -123,8 +122,8 @@ default Binary nextBinary() { /** * Returns null and advances the iterator. - * <p>
    - * This method has the same behavior as {@link #next()} and will advance this iterator. + * + * <p>
    This method has the same behavior as {@link #next()} and will advance this iterator. * * @return null * @throws java.util.NoSuchElementException if there are no more elements diff --git a/parquet/src/main/java/org/apache/iceberg/parquet/TripleWriter.java b/parquet/src/main/java/org/apache/iceberg/parquet/TripleWriter.java index 8fbe90c06ce5..99cca80e1f43 100644 --- a/parquet/src/main/java/org/apache/iceberg/parquet/TripleWriter.java +++ b/parquet/src/main/java/org/apache/iceberg/parquet/TripleWriter.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.parquet; import org.apache.parquet.io.api.Binary; diff --git a/parquet/src/main/java/org/apache/iceberg/parquet/TypeToMessageType.java b/parquet/src/main/java/org/apache/iceberg/parquet/TypeToMessageType.java index d1d60d2307a1..54f11500489b 100644 --- a/parquet/src/main/java/org/apache/iceberg/parquet/TypeToMessageType.java +++ b/parquet/src/main/java/org/apache/iceberg/parquet/TypeToMessageType.java @@ -16,9 +16,16 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.parquet; +import static org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName.BINARY; +import static org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName.BOOLEAN; +import static org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName.DOUBLE; +import static org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName.FIXED_LEN_BYTE_ARRAY; +import static org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName.FLOAT; +import static org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName.INT32; +import static org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName.INT64; + import org.apache.iceberg.Schema; import org.apache.iceberg.avro.AvroSchemaUtil; import org.apache.iceberg.types.Type.NestedType; @@ -38,25 +45,17 @@ import org.apache.parquet.schema.Type; import org.apache.parquet.schema.Types; -import static org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName.BINARY; -import static org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName.BOOLEAN; -import static org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName.DOUBLE; -import static org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName.FIXED_LEN_BYTE_ARRAY; -import static org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName.FLOAT; -import static org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName.INT32; -import static org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName.INT64; - public class TypeToMessageType { public static final int DECIMAL_INT32_MAX_DIGITS = 9; public static final int DECIMAL_INT64_MAX_DIGITS = 18; private static final LogicalTypeAnnotation STRING = LogicalTypeAnnotation.stringType(); private static final LogicalTypeAnnotation DATE = LogicalTypeAnnotation.dateType(); - private static final LogicalTypeAnnotation TIME_MICROS = LogicalTypeAnnotation - .timeType(false /* not adjusted to UTC */, TimeUnit.MICROS); - private static final LogicalTypeAnnotation TIMESTAMP_MICROS = LogicalTypeAnnotation - .timestampType(false /* not adjusted to UTC */, TimeUnit.MICROS); - private static final LogicalTypeAnnotation TIMESTAMPTZ_MICROS = LogicalTypeAnnotation - .timestampType(true /* adjusted to UTC */, TimeUnit.MICROS); + private static final LogicalTypeAnnotation TIME_MICROS = + LogicalTypeAnnotation.timeType(false /* not adjusted to UTC */, TimeUnit.MICROS); + private static final LogicalTypeAnnotation TIMESTAMP_MICROS = + 
LogicalTypeAnnotation.timestampType(false /* not adjusted to UTC */, TimeUnit.MICROS); + private static final LogicalTypeAnnotation TIMESTAMPTZ_MICROS = + LogicalTypeAnnotation.timestampType(true /* adjusted to UTC */, TimeUnit.MICROS); public MessageType convert(Schema schema, String name) { Types.MessageTypeBuilder builder = Types.buildMessage(); @@ -79,8 +78,8 @@ public GroupType struct(StructType struct, Type.Repetition repetition, int id, S } public Type field(NestedField field) { - Type.Repetition repetition = field.isOptional() ? - Type.Repetition.OPTIONAL : Type.Repetition.REQUIRED; + Type.Repetition repetition = + field.isOptional() ? Type.Repetition.OPTIONAL : Type.Repetition.REQUIRED; int id = field.fieldId(); String name = field.name(); @@ -118,7 +117,8 @@ public GroupType map(MapType map, Type.Repetition repetition, int id, String nam .named(AvroSchemaUtil.makeCompatibleName(name)); } - public Type primitive(PrimitiveType primitive, Type.Repetition repetition, int id, String originalName) { + public Type primitive( + PrimitiveType primitive, Type.Repetition repetition, int id, String originalName) { String name = AvroSchemaUtil.makeCompatibleName(originalName); switch (primitive.typeId()) { case BOOLEAN: @@ -148,7 +148,8 @@ public Type primitive(PrimitiveType primitive, Type.Repetition repetition, int i case FIXED: FixedType fixed = (FixedType) primitive; - return Types.primitive(FIXED_LEN_BYTE_ARRAY, repetition).length(fixed.length()) + return Types.primitive(FIXED_LEN_BYTE_ARRAY, repetition) + .length(fixed.length()) .id(id) .named(name); @@ -172,17 +173,19 @@ public Type primitive(PrimitiveType primitive, Type.Repetition repetition, int i } else { // store as a fixed-length array int minLength = TypeUtil.decimalRequiredBytes(decimal.precision()); - return Types.primitive(FIXED_LEN_BYTE_ARRAY, repetition).length(minLength) + return Types.primitive(FIXED_LEN_BYTE_ARRAY, repetition) + .length(minLength) .as(decimalAnnotation(decimal.precision(), decimal.scale())) .id(id) .named(name); } case UUID: - return Types.primitive(FIXED_LEN_BYTE_ARRAY, repetition).length(16) - .as(LogicalTypeAnnotation.uuidType()) - .id(id) - .named(name); + return Types.primitive(FIXED_LEN_BYTE_ARRAY, repetition) + .length(16) + .as(LogicalTypeAnnotation.uuidType()) + .id(id) + .named(name); default: throw new UnsupportedOperationException("Unsupported type for Parquet: " + primitive); diff --git a/parquet/src/main/java/org/apache/iceberg/parquet/TypeWithSchemaVisitor.java b/parquet/src/main/java/org/apache/iceberg/parquet/TypeWithSchemaVisitor.java index 0081bc333e9e..e0c07d31755e 100644 --- a/parquet/src/main/java/org/apache/iceberg/parquet/TypeWithSchemaVisitor.java +++ b/parquet/src/main/java/org/apache/iceberg/parquet/TypeWithSchemaVisitor.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.parquet; import java.util.ArrayDeque; @@ -40,15 +39,16 @@ public class TypeWithSchemaVisitor { protected ArrayDeque fieldNames = new ArrayDeque<>(); @SuppressWarnings("checkstyle:CyclomaticComplexity") - public static T visit(org.apache.iceberg.types.Type iType, Type type, TypeWithSchemaVisitor visitor) { + public static T visit( + org.apache.iceberg.types.Type iType, Type type, TypeWithSchemaVisitor visitor) { if (type instanceof MessageType) { Types.StructType struct = iType != null ? 
iType.asStructType() : null; - return visitor.message(struct, (MessageType) type, - visitFields(struct, type.asGroupType(), visitor)); + return visitor.message( + struct, (MessageType) type, visitFields(struct, type.asGroupType(), visitor)); } else if (type.isPrimitive()) { - org.apache.iceberg.types.Type.PrimitiveType iPrimitive = iType != null ? - iType.asPrimitiveType() : null; + org.apache.iceberg.types.Type.PrimitiveType iPrimitive = + iType != null ? iType.asPrimitiveType() : null; return visitor.primitive(iPrimitive, type.asPrimitiveType()); } else { @@ -58,11 +58,14 @@ public static T visit(org.apache.iceberg.types.Type iType, Type type, TypeWi if (annotation != null) { switch (annotation) { case LIST: - Preconditions.checkArgument(group.getFieldCount() == 1, - "Invalid list: does not contain single repeated field: %s", group); + Preconditions.checkArgument( + group.getFieldCount() == 1, + "Invalid list: does not contain single repeated field: %s", + group); Type repeatedElement = group.getFields().get(0); - Preconditions.checkArgument(repeatedElement.isRepetition(Type.Repetition.REPEATED), + Preconditions.checkArgument( + repeatedElement.isRepetition(Type.Repetition.REPEATED), "Invalid list: inner group is not repeated"); Type listElement = ParquetSchemaUtil.determineListElementType(group); @@ -80,15 +83,21 @@ public static T visit(org.apache.iceberg.types.Type iType, Type type, TypeWi } case MAP: - Preconditions.checkArgument(!group.isRepetition(Type.Repetition.REPEATED), - "Invalid map: top-level group is repeated: %s", group); - Preconditions.checkArgument(group.getFieldCount() == 1, - "Invalid map: does not contain single repeated field: %s", group); + Preconditions.checkArgument( + !group.isRepetition(Type.Repetition.REPEATED), + "Invalid map: top-level group is repeated: %s", + group); + Preconditions.checkArgument( + group.getFieldCount() == 1, + "Invalid map: does not contain single repeated field: %s", + group); GroupType repeatedKeyValue = group.getType(0).asGroupType(); - Preconditions.checkArgument(repeatedKeyValue.isRepetition(Type.Repetition.REPEATED), + Preconditions.checkArgument( + repeatedKeyValue.isRepetition(Type.Repetition.REPEATED), "Invalid map: inner group is not repeated"); - Preconditions.checkArgument(repeatedKeyValue.getFieldCount() <= 2, + Preconditions.checkArgument( + repeatedKeyValue.getFieldCount() <= 2, "Invalid map: repeated group does not have 2 fields"); Types.MapType map = null; @@ -141,14 +150,20 @@ public static T visit(org.apache.iceberg.types.Type iType, Type type, TypeWi } private static T visitTwoLevelList( - Types.ListType iListType, Types.NestedField iListElement, GroupType pListType, Type pListElement, + Types.ListType iListType, + Types.NestedField iListElement, + GroupType pListType, + Type pListElement, TypeWithSchemaVisitor visitor) { T elementResult = visitField(iListElement, pListElement, visitor); return visitor.list(iListType, pListType, elementResult); } private static T visitThreeLevelList( - Types.ListType iListType, Types.NestedField iListElement, GroupType pListType, Type pListElement, + Types.ListType iListType, + Types.NestedField iListElement, + GroupType pListType, + Type pListElement, TypeWithSchemaVisitor visitor) { visitor.fieldNames.push(pListType.getFieldName(0)); @@ -161,7 +176,8 @@ private static T visitThreeLevelList( } } - private static T visitField(Types.NestedField iField, Type field, TypeWithSchemaVisitor visitor) { + private static T visitField( + Types.NestedField iField, Type field, 
TypeWithSchemaVisitor visitor) { visitor.fieldNames.push(field.getName()); try { return visit(iField != null ? iField.type() : null, field, visitor); @@ -170,7 +186,8 @@ private static T visitField(Types.NestedField iField, Type field, TypeWithSc } } - private static List visitFields(Types.StructType struct, GroupType group, TypeWithSchemaVisitor visitor) { + private static List visitFields( + Types.StructType struct, GroupType group, TypeWithSchemaVisitor visitor) { List results = Lists.newArrayListWithExpectedSize(group.getFieldCount()); for (Type field : group.getFields()) { int id = -1; @@ -200,8 +217,8 @@ public T map(Types.MapType iMap, GroupType map, T key, T value) { return null; } - public T primitive(org.apache.iceberg.types.Type.PrimitiveType iPrimitive, - PrimitiveType primitive) { + public T primitive( + org.apache.iceberg.types.Type.PrimitiveType iPrimitive, PrimitiveType primitive) { return null; } diff --git a/parquet/src/main/java/org/apache/iceberg/parquet/ValuesAsBytesReader.java b/parquet/src/main/java/org/apache/iceberg/parquet/ValuesAsBytesReader.java index a759802793b8..71e10247af37 100644 --- a/parquet/src/main/java/org/apache/iceberg/parquet/ValuesAsBytesReader.java +++ b/parquet/src/main/java/org/apache/iceberg/parquet/ValuesAsBytesReader.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.parquet; import java.io.IOException; @@ -27,8 +26,8 @@ import org.apache.parquet.io.ParquetDecodingException; /** - * Implements a {@link ValuesReader} specifically to read given number of bytes from the underlying {@link - * ByteBufferInputStream}. + * Implements a {@link ValuesReader} specifically to read given number of bytes from the underlying + * {@link ByteBufferInputStream}. */ public class ValuesAsBytesReader extends ValuesReader { private ByteBufferInputStream valuesInputStream = null; @@ -36,8 +35,7 @@ public class ValuesAsBytesReader extends ValuesReader { private int bitOffset; private byte currentByte = 0; - public ValuesAsBytesReader() { - } + public ValuesAsBytesReader() {} @Override public void initFromPage(int valueCount, ByteBufferInputStream in) { @@ -91,9 +89,7 @@ public final boolean readBoolean() { return value; } - /** - * Returns 1 if true, 0 otherwise. - */ + /** Returns 1 if true, 0 otherwise. */ public final int readBooleanAsInt() { if (bitOffset == 0) { currentByte = getByte(); diff --git a/parquet/src/main/java/org/apache/iceberg/parquet/VectorizedParquetReader.java b/parquet/src/main/java/org/apache/iceberg/parquet/VectorizedParquetReader.java index d3e08480c500..773e0f7a85d0 100644 --- a/parquet/src/main/java/org/apache/iceberg/parquet/VectorizedParquetReader.java +++ b/parquet/src/main/java/org/apache/iceberg/parquet/VectorizedParquetReader.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.parquet; import java.io.IOException; @@ -52,9 +51,15 @@ public class VectorizedParquetReader extends CloseableGroup implements Closea private final NameMapping nameMapping; public VectorizedParquetReader( - InputFile input, Schema expectedSchema, ParquetReadOptions options, - Function> readerFunc, NameMapping nameMapping, Expression filter, - boolean reuseContainers, boolean caseSensitive, int maxRecordsPerBatch) { + InputFile input, + Schema expectedSchema, + ParquetReadOptions options, + Function> readerFunc, + NameMapping nameMapping, + Expression filter, + boolean reuseContainers, + boolean caseSensitive, + int maxRecordsPerBatch) { this.input = input; this.expectedSchema = expectedSchema; this.options = options; @@ -71,9 +76,18 @@ public VectorizedParquetReader( private ReadConf init() { if (conf == null) { - ReadConf readConf = new ReadConf( - input, options, expectedSchema, filter, null, batchReaderFunc, nameMapping, reuseContainers, - caseSensitive, batchSize); + ReadConf readConf = + new ReadConf( + input, + options, + expectedSchema, + filter, + null, + batchReaderFunc, + nameMapping, + reuseContainers, + caseSensitive, + batchSize); this.conf = readConf.copy(); return readConf; } @@ -113,7 +127,6 @@ private static class FileIterator implements CloseableIterator { this.rowGroupsStartRowPos = conf.startRowPositions(); } - @Override public boolean hasNext() { return valuesRead < totalValues; diff --git a/parquet/src/main/java/org/apache/iceberg/parquet/VectorizedReader.java b/parquet/src/main/java/org/apache/iceberg/parquet/VectorizedReader.java index aa8582dda755..72b1e39e9634 100644 --- a/parquet/src/main/java/org/apache/iceberg/parquet/VectorizedReader.java +++ b/parquet/src/main/java/org/apache/iceberg/parquet/VectorizedReader.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.parquet; import java.util.Map; @@ -24,15 +23,13 @@ import org.apache.parquet.hadoop.metadata.ColumnChunkMetaData; import org.apache.parquet.hadoop.metadata.ColumnPath; -/** - * Interface for vectorized Iceberg readers. - */ +/** Interface for vectorized Iceberg readers. */ public interface VectorizedReader { /** * Reads a batch of type @param <T> and of size numRows * - * @param reuse container for the last batch to be reused for next batch + * @param reuse container for the last batch to be reused for next batch * @param numRows number of rows to read * @return batch of records of type @param <T> */ @@ -43,14 +40,13 @@ public interface VectorizedReader { /** * Sets the row group information to be used with this reader * - * @param pages row group information for all the columns + * @param pages row group information for all the columns * @param metadata map of {@link ColumnPath} -> {@link ColumnChunkMetaData} for the row group * @param rowPosition the row group's row offset in the parquet file */ - void setRowGroupInfo(PageReadStore pages, Map metadata, long rowPosition); + void setRowGroupInfo( + PageReadStore pages, Map metadata, long rowPosition); - /** - * Release any resources allocated. - */ + /** Release any resources allocated. 
*/ void close(); } diff --git a/parquet/src/test/java/org/apache/iceberg/TestHelpers.java b/parquet/src/test/java/org/apache/iceberg/TestHelpers.java index 38cd2f2df1af..be6ebe93d59e 100644 --- a/parquet/src/test/java/org/apache/iceberg/TestHelpers.java +++ b/parquet/src/test/java/org/apache/iceberg/TestHelpers.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg; import java.util.concurrent.Callable; @@ -27,24 +26,23 @@ public class TestHelpers { - private TestHelpers() { - } + private TestHelpers() {} /** * A convenience method to avoid a large number of @Test(expected=...) tests + * * @param message A String message to describe this assertion * @param expected An Exception class that the Runnable should throw - * @param containedInMessage A String that should be contained by the thrown - * exception's message + * @param containedInMessage A String that should be contained by the thrown exception's message * @param callable A Callable that is expected to throw the exception */ - public static void assertThrows(String message, - Class expected, - String containedInMessage, - Callable callable) { - AbstractThrowableAssert check = Assertions.assertThatThrownBy(callable::call) - .as(message) - .isInstanceOf(expected); + public static void assertThrows( + String message, + Class expected, + String containedInMessage, + Callable callable) { + AbstractThrowableAssert check = + Assertions.assertThatThrownBy(callable::call).as(message).isInstanceOf(expected); if (null != containedInMessage) { check.hasMessageContaining(containedInMessage); } @@ -52,19 +50,19 @@ public static void assertThrows(String message, /** * A convenience method to avoid a large number of @Test(expected=...) tests + * * @param message A String message to describe this assertion * @param expected An Exception class that the Runnable should throw - * @param containedInMessage A String that should be contained by the thrown - * exception's message + * @param containedInMessage A String that should be contained by the thrown exception's message * @param runnable A Runnable that is expected to throw the runtime exception */ - public static void assertThrows(String message, - Class expected, - String containedInMessage, - Runnable runnable) { - AbstractThrowableAssert check = Assertions.assertThatThrownBy(runnable::run) - .as(message) - .isInstanceOf(expected); + public static void assertThrows( + String message, + Class expected, + String containedInMessage, + Runnable runnable) { + AbstractThrowableAssert check = + Assertions.assertThatThrownBy(runnable::run).as(message).isInstanceOf(expected); if (null != containedInMessage) { check.hasMessageContaining(containedInMessage); } @@ -72,6 +70,7 @@ public static void assertThrows(String message, /** * A convenience method to assert if an Avro field is empty + * * @param record The record to read from * @param field The name of the field */ diff --git a/parquet/src/test/java/org/apache/iceberg/avro/TestParquetReadProjection.java b/parquet/src/test/java/org/apache/iceberg/avro/TestParquetReadProjection.java index d9ab284d264c..13bf1a37a119 100644 --- a/parquet/src/test/java/org/apache/iceberg/avro/TestParquetReadProjection.java +++ b/parquet/src/test/java/org/apache/iceberg/avro/TestParquetReadProjection.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.avro; import java.io.File; @@ -30,24 +29,19 @@ public class TestParquetReadProjection extends TestReadProjection { @Override - protected GenericData.Record writeAndRead(String desc, - Schema writeSchema, - Schema readSchema, - GenericData.Record record) + protected GenericData.Record writeAndRead( + String desc, Schema writeSchema, Schema readSchema, GenericData.Record record) throws IOException { File file = temp.newFile(desc + ".parquet"); file.delete(); - try (FileAppender appender = Parquet.write(Files.localOutput(file)) - .schema(writeSchema) - .build()) { + try (FileAppender appender = + Parquet.write(Files.localOutput(file)).schema(writeSchema).build()) { appender.add(record); } - Iterable records = Parquet.read(Files.localInput(file)) - .project(readSchema) - .callInit() - .build(); + Iterable records = + Parquet.read(Files.localInput(file)).project(readSchema).callInit().build(); return Iterables.getOnlyElement(records); } diff --git a/parquet/src/test/java/org/apache/iceberg/avro/TestReadProjection.java b/parquet/src/test/java/org/apache/iceberg/avro/TestReadProjection.java index 685d4ee88f19..06e92b57ecfc 100644 --- a/parquet/src/test/java/org/apache/iceberg/avro/TestReadProjection.java +++ b/parquet/src/test/java/org/apache/iceberg/avro/TestReadProjection.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.avro; import java.io.IOException; @@ -37,20 +36,17 @@ import org.junit.rules.TemporaryFolder; public abstract class TestReadProjection { - protected abstract Record writeAndRead(String desc, - Schema writeSchema, - Schema readSchema, - Record record) throws IOException; + protected abstract Record writeAndRead( + String desc, Schema writeSchema, Schema readSchema, Record record) throws IOException; - @Rule - public TemporaryFolder temp = new TemporaryFolder(); + @Rule public TemporaryFolder temp = new TemporaryFolder(); @Test public void testFullProjection() throws Exception { - Schema schema = new Schema( - Types.NestedField.required(0, "id", Types.LongType.get()), - Types.NestedField.optional(1, "data", Types.StringType.get()) - ); + Schema schema = + new Schema( + Types.NestedField.required(0, "id", Types.LongType.get()), + Types.NestedField.optional(1, "data", Types.StringType.get())); Record record = new Record(AvroSchemaUtil.convert(schema, "table")); record.put("id", 34L); @@ -60,26 +56,25 @@ public void testFullProjection() throws Exception { Assert.assertEquals("Should contain the correct id value", 34L, (long) projected.get("id")); - int cmp = Comparators.charSequences() - .compare("test", (CharSequence) projected.get("data")); + int cmp = Comparators.charSequences().compare("test", (CharSequence) projected.get("data")); Assert.assertTrue("Should contain the correct data value", cmp == 0); } @Test public void testReorderedFullProjection() throws Exception { - Schema schema = new Schema( - Types.NestedField.required(0, "id", Types.LongType.get()), - Types.NestedField.optional(1, "data", Types.StringType.get()) - ); + Schema schema = + new Schema( + Types.NestedField.required(0, "id", Types.LongType.get()), + Types.NestedField.optional(1, "data", Types.StringType.get())); Record record = new Record(AvroSchemaUtil.convert(schema, "table")); record.put("id", 34L); record.put("data", "test"); - Schema reordered = new Schema( - Types.NestedField.optional(1, "data", Types.StringType.get()), - Types.NestedField.required(0, "id", Types.LongType.get()) - ); + 
Schema reordered = + new Schema( + Types.NestedField.optional(1, "data", Types.StringType.get()), + Types.NestedField.required(0, "id", Types.LongType.get())); Record projected = writeAndRead("full_projection", schema, reordered, record); @@ -89,20 +84,20 @@ public void testReorderedFullProjection() throws Exception { @Test public void testReorderedProjection() throws Exception { - Schema schema = new Schema( - Types.NestedField.required(0, "id", Types.LongType.get()), - Types.NestedField.optional(1, "data", Types.StringType.get()) - ); + Schema schema = + new Schema( + Types.NestedField.required(0, "id", Types.LongType.get()), + Types.NestedField.optional(1, "data", Types.StringType.get())); Record record = new Record(AvroSchemaUtil.convert(schema, "table")); record.put("id", 34L); record.put("data", "test"); - Schema reordered = new Schema( - Types.NestedField.optional(2, "missing_1", Types.StringType.get()), - Types.NestedField.optional(1, "data", Types.StringType.get()), - Types.NestedField.optional(3, "missing_2", Types.LongType.get()) - ); + Schema reordered = + new Schema( + Types.NestedField.optional(2, "missing_1", Types.StringType.get()), + Types.NestedField.optional(1, "data", Types.StringType.get()), + Types.NestedField.optional(3, "missing_2", Types.LongType.get())); Record projected = writeAndRead("full_projection", schema, reordered, record); @@ -113,10 +108,10 @@ public void testReorderedProjection() throws Exception { @Test public void testEmptyProjection() throws Exception { - Schema schema = new Schema( - Types.NestedField.required(0, "id", Types.LongType.get()), - Types.NestedField.optional(1, "data", Types.StringType.get()) - ); + Schema schema = + new Schema( + Types.NestedField.required(0, "id", Types.LongType.get()), + Types.NestedField.optional(1, "data", Types.StringType.get())); Record record = new Record(AvroSchemaUtil.convert(schema, "table")); record.put("id", 34L); @@ -135,131 +130,129 @@ public void testEmptyProjection() throws Exception { @Test public void testBasicProjection() throws Exception { - Schema writeSchema = new Schema( - Types.NestedField.required(0, "id", Types.LongType.get()), - Types.NestedField.optional(1, "data", Types.StringType.get()) - ); + Schema writeSchema = + new Schema( + Types.NestedField.required(0, "id", Types.LongType.get()), + Types.NestedField.optional(1, "data", Types.StringType.get())); Record record = new Record(AvroSchemaUtil.convert(writeSchema, "table")); record.put("id", 34L); record.put("data", "test"); - Schema idOnly = new Schema( - Types.NestedField.required(0, "id", Types.LongType.get()) - ); + Schema idOnly = new Schema(Types.NestedField.required(0, "id", Types.LongType.get())); Record projected = writeAndRead("basic_projection_id", writeSchema, idOnly, record); TestHelpers.assertEmptyAvroField(projected, "data"); Assert.assertEquals("Should contain the correct id value", 34L, (long) projected.get("id")); - Schema dataOnly = new Schema( - Types.NestedField.optional(1, "data", Types.StringType.get()) - ); + Schema dataOnly = new Schema(Types.NestedField.optional(1, "data", Types.StringType.get())); projected = writeAndRead("basic_projection_data", writeSchema, dataOnly, record); TestHelpers.assertEmptyAvroField(projected, "id"); - int cmp = Comparators.charSequences() - .compare("test", (CharSequence) projected.get("data")); + int cmp = Comparators.charSequences().compare("test", (CharSequence) projected.get("data")); Assert.assertEquals("Should contain the correct data value", 0, cmp); } @Test public void 
testRename() throws Exception { - Schema writeSchema = new Schema( - Types.NestedField.required(0, "id", Types.LongType.get()), - Types.NestedField.optional(1, "data", Types.StringType.get()) - ); + Schema writeSchema = + new Schema( + Types.NestedField.required(0, "id", Types.LongType.get()), + Types.NestedField.optional(1, "data", Types.StringType.get())); Record record = new Record(AvroSchemaUtil.convert(writeSchema, "table")); record.put("id", 34L); record.put("data", "test"); - Schema readSchema = new Schema( - Types.NestedField.required(0, "id", Types.LongType.get()), - Types.NestedField.optional(1, "renamed", Types.StringType.get()) - ); + Schema readSchema = + new Schema( + Types.NestedField.required(0, "id", Types.LongType.get()), + Types.NestedField.optional(1, "renamed", Types.StringType.get())); Record projected = writeAndRead("project_and_rename", writeSchema, readSchema, record); Assert.assertEquals("Should contain the correct id value", 34L, (long) projected.get("id")); - int cmp = Comparators.charSequences() - .compare("test", (CharSequence) projected.get("renamed")); + int cmp = Comparators.charSequences().compare("test", (CharSequence) projected.get("renamed")); Assert.assertTrue("Should contain the correct data/renamed value", cmp == 0); } @Test public void testNestedStructProjection() throws Exception { - Schema writeSchema = new Schema( - Types.NestedField.required(0, "id", Types.LongType.get()), - Types.NestedField.optional(3, "location", Types.StructType.of( - Types.NestedField.required(1, "lat", Types.FloatType.get()), - Types.NestedField.required(2, "long", Types.FloatType.get()) - )) - ); + Schema writeSchema = + new Schema( + Types.NestedField.required(0, "id", Types.LongType.get()), + Types.NestedField.optional( + 3, + "location", + Types.StructType.of( + Types.NestedField.required(1, "lat", Types.FloatType.get()), + Types.NestedField.required(2, "long", Types.FloatType.get())))); Record record = new Record(AvroSchemaUtil.convert(writeSchema, "table")); record.put("id", 34L); - Record location = new Record( - AvroSchemaUtil.fromOption(record.getSchema().getField("location").schema())); + Record location = + new Record(AvroSchemaUtil.fromOption(record.getSchema().getField("location").schema())); location.put("lat", 52.995143f); location.put("long", -1.539054f); record.put("location", location); - Schema idOnly = new Schema( - Types.NestedField.required(0, "id", Types.LongType.get()) - ); + Schema idOnly = new Schema(Types.NestedField.required(0, "id", Types.LongType.get())); Record projected = writeAndRead("id_only", writeSchema, idOnly, record); TestHelpers.assertEmptyAvroField(projected, "location"); Assert.assertEquals("Should contain the correct id value", 34L, (long) projected.get("id")); - Schema latOnly = new Schema( - Types.NestedField.optional(3, "location", Types.StructType.of( - Types.NestedField.required(1, "lat", Types.FloatType.get()) - )) - ); + Schema latOnly = + new Schema( + Types.NestedField.optional( + 3, + "location", + Types.StructType.of(Types.NestedField.required(1, "lat", Types.FloatType.get())))); projected = writeAndRead("latitude_only", writeSchema, latOnly, record); Record projectedLocation = (Record) projected.get("location"); TestHelpers.assertEmptyAvroField(projected, "id"); Assert.assertNotNull("Should project location", projected.get("location")); TestHelpers.assertEmptyAvroField(projectedLocation, "long"); - Assert.assertEquals("Should project latitude", - 52.995143f, (float) projectedLocation.get("lat"), 0.000001f); + 
Assert.assertEquals( + "Should project latitude", 52.995143f, (float) projectedLocation.get("lat"), 0.000001f); - Schema longOnly = new Schema( - Types.NestedField.optional(3, "location", Types.StructType.of( - Types.NestedField.required(2, "long", Types.FloatType.get()) - )) - ); + Schema longOnly = + new Schema( + Types.NestedField.optional( + 3, + "location", + Types.StructType.of(Types.NestedField.required(2, "long", Types.FloatType.get())))); projected = writeAndRead("longitude_only", writeSchema, longOnly, record); projectedLocation = (Record) projected.get("location"); TestHelpers.assertEmptyAvroField(projected, "id"); Assert.assertNotNull("Should project location", projected.get("location")); TestHelpers.assertEmptyAvroField(projectedLocation, "lat"); - Assert.assertEquals("Should project longitude", - -1.539054f, (float) projectedLocation.get("long"), 0.000001f); + Assert.assertEquals( + "Should project longitude", -1.539054f, (float) projectedLocation.get("long"), 0.000001f); Schema locationOnly = writeSchema.select("location"); projected = writeAndRead("location_only", writeSchema, locationOnly, record); projectedLocation = (Record) projected.get("location"); TestHelpers.assertEmptyAvroField(projected, "id"); Assert.assertNotNull("Should project location", projected.get("location")); - Assert.assertEquals("Should project latitude", - 52.995143f, (float) projectedLocation.get("lat"), 0.000001f); - Assert.assertEquals("Should project longitude", - -1.539054f, (float) projectedLocation.get("long"), 0.000001f); + Assert.assertEquals( + "Should project latitude", 52.995143f, (float) projectedLocation.get("lat"), 0.000001f); + Assert.assertEquals( + "Should project longitude", -1.539054f, (float) projectedLocation.get("long"), 0.000001f); } @Test public void testMapProjection() throws IOException { - Schema writeSchema = new Schema( - Types.NestedField.required(0, "id", Types.LongType.get()), - Types.NestedField.optional(5, "properties", - Types.MapType.ofOptional(6, 7, Types.StringType.get(), Types.StringType.get())) - ); + Schema writeSchema = + new Schema( + Types.NestedField.required(0, "id", Types.LongType.get()), + Types.NestedField.optional( + 5, + "properties", + Types.MapType.ofOptional(6, 7, Types.StringType.get(), Types.StringType.get()))); Map properties = ImmutableMap.of("a", "A", "b", "B"); @@ -267,9 +260,7 @@ public void testMapProjection() throws IOException { record.put("id", 34L); record.put("properties", properties); - Schema idOnly = new Schema( - Types.NestedField.required(0, "id", Types.LongType.get()) - ); + Schema idOnly = new Schema(Types.NestedField.required(0, "id", Types.LongType.get())); Record projected = writeAndRead("id_only", writeSchema, idOnly, record); Assert.assertEquals("Should contain the correct id value", 34L, (long) projected.get("id")); @@ -278,20 +269,20 @@ public void testMapProjection() throws IOException { Schema keyOnly = writeSchema.select("properties.key"); projected = writeAndRead("key_only", writeSchema, keyOnly, record); TestHelpers.assertEmptyAvroField(projected, "id"); - Assert.assertEquals("Should project entire map", - properties, toStringMap((Map) projected.get("properties"))); + Assert.assertEquals( + "Should project entire map", properties, toStringMap((Map) projected.get("properties"))); Schema valueOnly = writeSchema.select("properties.value"); projected = writeAndRead("value_only", writeSchema, valueOnly, record); TestHelpers.assertEmptyAvroField(projected, "id"); - Assert.assertEquals("Should project entire map", - 
properties, toStringMap((Map) projected.get("properties"))); + Assert.assertEquals( + "Should project entire map", properties, toStringMap((Map) projected.get("properties"))); Schema mapOnly = writeSchema.select("properties"); projected = writeAndRead("map_only", writeSchema, mapOnly, record); TestHelpers.assertEmptyAvroField(projected, "id"); - Assert.assertEquals("Should project entire map", - properties, toStringMap((Map) projected.get("properties"))); + Assert.assertEquals( + "Should project entire map", properties, toStringMap((Map) projected.get("properties"))); } private Map toStringMap(Map map) { @@ -308,22 +299,27 @@ public void testMapProjection() throws IOException { @Test public void testMapOfStructsProjection() throws IOException { - Schema writeSchema = new Schema( - Types.NestedField.required(0, "id", Types.LongType.get()), - Types.NestedField.optional(5, "locations", Types.MapType.ofOptional(6, 7, - Types.StringType.get(), - Types.StructType.of( - Types.NestedField.required(1, "lat", Types.FloatType.get()), - Types.NestedField.required(2, "long", Types.FloatType.get()) - ) - )) - ); + Schema writeSchema = + new Schema( + Types.NestedField.required(0, "id", Types.LongType.get()), + Types.NestedField.optional( + 5, + "locations", + Types.MapType.ofOptional( + 6, + 7, + Types.StringType.get(), + Types.StructType.of( + Types.NestedField.required(1, "lat", Types.FloatType.get()), + Types.NestedField.required(2, "long", Types.FloatType.get()))))); Record record = new Record(AvroSchemaUtil.convert(writeSchema, "table")); record.put("id", 34L); - Record l1 = new Record(AvroSchemaUtil.fromOption( - AvroSchemaUtil.fromOption(record.getSchema().getField("locations").schema()) - .getValueType())); + Record l1 = + new Record( + AvroSchemaUtil.fromOption( + AvroSchemaUtil.fromOption(record.getSchema().getField("locations").schema()) + .getValueType())); l1.put("lat", 53.992811f); l1.put("long", -1.542616f); Record l2 = new Record(l1.getSchema()); @@ -331,9 +327,7 @@ public void testMapOfStructsProjection() throws IOException { l2.put("long", -1.539054f); record.put("locations", ImmutableMap.of("L1", l1, "L2", l2)); - Schema idOnly = new Schema( - Types.NestedField.required(0, "id", Types.LongType.get()) - ); + Schema idOnly = new Schema(Types.NestedField.required(0, "id", Types.LongType.get())); Record projected = writeAndRead("id_only", writeSchema, idOnly, record); Assert.assertEquals("Should contain the correct id value", 34L, (long) projected.get("id")); @@ -341,81 +335,85 @@ public void testMapOfStructsProjection() throws IOException { projected = writeAndRead("all_locations", writeSchema, writeSchema.select("locations"), record); TestHelpers.assertEmptyAvroField(projected, "id"); - Assert.assertEquals("Should project locations map", - record.get("locations"), toStringMap((Map) projected.get("locations"))); + Assert.assertEquals( + "Should project locations map", + record.get("locations"), + toStringMap((Map) projected.get("locations"))); - projected = writeAndRead("lat_only", - writeSchema, writeSchema.select("locations.lat"), record); + projected = writeAndRead("lat_only", writeSchema, writeSchema.select("locations.lat"), record); TestHelpers.assertEmptyAvroField(projected, "id"); Map locations = toStringMap((Map) projected.get("locations")); Assert.assertNotNull("Should project locations map", locations); - Assert.assertEquals("Should contain L1 and L2", - Sets.newHashSet("L1", "L2"), locations.keySet()); + Assert.assertEquals( + "Should contain L1 and L2", Sets.newHashSet("L1", 
"L2"), locations.keySet()); Record projectedL1 = (Record) locations.get("L1"); Assert.assertNotNull("L1 should not be null", projectedL1); - Assert.assertEquals("L1 should contain lat", - 53.992811f, (float) projectedL1.get("lat"), 0.000001); + Assert.assertEquals( + "L1 should contain lat", 53.992811f, (float) projectedL1.get("lat"), 0.000001); TestHelpers.assertEmptyAvroField(projectedL1, "long"); Record projectedL2 = (Record) locations.get("L2"); Assert.assertNotNull("L2 should not be null", projectedL2); - Assert.assertEquals("L2 should contain lat", - 52.995143f, (float) projectedL2.get("lat"), 0.000001); + Assert.assertEquals( + "L2 should contain lat", 52.995143f, (float) projectedL2.get("lat"), 0.000001); TestHelpers.assertEmptyAvroField(projectedL2, "long"); - projected = writeAndRead("long_only", - writeSchema, writeSchema.select("locations.long"), record); + projected = + writeAndRead("long_only", writeSchema, writeSchema.select("locations.long"), record); TestHelpers.assertEmptyAvroField(projected, "id"); locations = toStringMap((Map) projected.get("locations")); Assert.assertNotNull("Should project locations map", locations); - Assert.assertEquals("Should contain L1 and L2", - Sets.newHashSet("L1", "L2"), locations.keySet()); + Assert.assertEquals( + "Should contain L1 and L2", Sets.newHashSet("L1", "L2"), locations.keySet()); projectedL1 = (Record) locations.get("L1"); Assert.assertNotNull("L1 should not be null", projectedL1); TestHelpers.assertEmptyAvroField(projectedL1, "lat"); - Assert.assertEquals("L1 should contain long", - -1.542616f, (float) projectedL1.get("long"), 0.000001); + Assert.assertEquals( + "L1 should contain long", -1.542616f, (float) projectedL1.get("long"), 0.000001); projectedL2 = (Record) locations.get("L2"); Assert.assertNotNull("L2 should not be null", projectedL2); TestHelpers.assertEmptyAvroField(projectedL2, "lat"); - Assert.assertEquals("L2 should contain long", - -1.539054f, (float) projectedL2.get("long"), 0.000001); - - Schema latitiudeRenamed = new Schema( - Types.NestedField.optional(5, "locations", Types.MapType.ofOptional(6, 7, - Types.StringType.get(), - Types.StructType.of( - Types.NestedField.required(1, "latitude", Types.FloatType.get()) - ) - )) - ); + Assert.assertEquals( + "L2 should contain long", -1.539054f, (float) projectedL2.get("long"), 0.000001); + + Schema latitiudeRenamed = + new Schema( + Types.NestedField.optional( + 5, + "locations", + Types.MapType.ofOptional( + 6, + 7, + Types.StringType.get(), + Types.StructType.of( + Types.NestedField.required(1, "latitude", Types.FloatType.get()))))); projected = writeAndRead("latitude_renamed", writeSchema, latitiudeRenamed, record); TestHelpers.assertEmptyAvroField(projected, "id"); locations = toStringMap((Map) projected.get("locations")); Assert.assertNotNull("Should project locations map", locations); - Assert.assertEquals("Should contain L1 and L2", - Sets.newHashSet("L1", "L2"), locations.keySet()); + Assert.assertEquals( + "Should contain L1 and L2", Sets.newHashSet("L1", "L2"), locations.keySet()); projectedL1 = (Record) locations.get("L1"); Assert.assertNotNull("L1 should not be null", projectedL1); - Assert.assertEquals("L1 should contain latitude", - 53.992811f, (float) projectedL1.get("latitude"), 0.000001); + Assert.assertEquals( + "L1 should contain latitude", 53.992811f, (float) projectedL1.get("latitude"), 0.000001); TestHelpers.assertEmptyAvroField(projectedL1, "lat"); TestHelpers.assertEmptyAvroField(projectedL1, "long"); projectedL2 = (Record) 
locations.get("L2"); Assert.assertNotNull("L2 should not be null", projectedL2); - Assert.assertEquals("L2 should contain latitude", - 52.995143f, (float) projectedL2.get("latitude"), 0.000001); + Assert.assertEquals( + "L2 should contain latitude", 52.995143f, (float) projectedL2.get("latitude"), 0.000001); TestHelpers.assertEmptyAvroField(projectedL2, "lat"); TestHelpers.assertEmptyAvroField(projectedL2, "long"); } @Test public void testListProjection() throws IOException { - Schema writeSchema = new Schema( - Types.NestedField.required(0, "id", Types.LongType.get()), - Types.NestedField.optional(10, "values", - Types.ListType.ofOptional(11, Types.LongType.get())) - ); + Schema writeSchema = + new Schema( + Types.NestedField.required(0, "id", Types.LongType.get()), + Types.NestedField.optional( + 10, "values", Types.ListType.ofOptional(11, Types.LongType.get()))); List values = ImmutableList.of(56L, 57L, 58L); @@ -423,9 +421,7 @@ public void testListProjection() throws IOException { record.put("id", 34L); record.put("values", values); - Schema idOnly = new Schema( - Types.NestedField.required(0, "id", Types.LongType.get()) - ); + Schema idOnly = new Schema(Types.NestedField.required(0, "id", Types.LongType.get())); Record projected = writeAndRead("id_only", writeSchema, idOnly, record); Assert.assertEquals("Should contain the correct id value", 34L, (long) projected.get("id")); @@ -445,21 +441,25 @@ public void testListProjection() throws IOException { @Test @SuppressWarnings("unchecked") public void testListOfStructsProjection() throws IOException { - Schema writeSchema = new Schema( - Types.NestedField.required(0, "id", Types.LongType.get()), - Types.NestedField.optional(22, "points", - Types.ListType.ofOptional(21, Types.StructType.of( - Types.NestedField.required(19, "x", Types.IntegerType.get()), - Types.NestedField.optional(18, "y", Types.IntegerType.get()) - )) - ) - ); + Schema writeSchema = + new Schema( + Types.NestedField.required(0, "id", Types.LongType.get()), + Types.NestedField.optional( + 22, + "points", + Types.ListType.ofOptional( + 21, + Types.StructType.of( + Types.NestedField.required(19, "x", Types.IntegerType.get()), + Types.NestedField.optional(18, "y", Types.IntegerType.get()))))); Record record = new Record(AvroSchemaUtil.convert(writeSchema, "table")); record.put("id", 34L); - Record p1 = new Record(AvroSchemaUtil.fromOption( - AvroSchemaUtil.fromOption(record.getSchema().getField("points").schema()) - .getElementType())); + Record p1 = + new Record( + AvroSchemaUtil.fromOption( + AvroSchemaUtil.fromOption(record.getSchema().getField("points").schema()) + .getElementType())); p1.put("x", 1); p1.put("y", 2); Record p2 = new Record(p1.getSchema()); @@ -467,9 +467,7 @@ public void testListOfStructsProjection() throws IOException { p2.put("y", null); record.put("points", ImmutableList.of(p1, p2)); - Schema idOnly = new Schema( - Types.NestedField.required(0, "id", Types.LongType.get()) - ); + Schema idOnly = new Schema(Types.NestedField.required(0, "id", Types.LongType.get())); Record projected = writeAndRead("id_only", writeSchema, idOnly, record); Assert.assertEquals("Should contain the correct id value", 34L, (long) projected.get("id")); @@ -477,8 +475,8 @@ public void testListOfStructsProjection() throws IOException { projected = writeAndRead("all_points", writeSchema, writeSchema.select("points"), record); TestHelpers.assertEmptyAvroField(projected, "id"); - Assert.assertEquals("Should project points list", - record.get("points"), projected.get("points")); + 
Assert.assertEquals( + "Should project points list", record.get("points"), projected.get("points")); projected = writeAndRead("x_only", writeSchema, writeSchema.select("points.x"), record); TestHelpers.assertEmptyAvroField(projected, "id"); @@ -504,13 +502,15 @@ public void testListOfStructsProjection() throws IOException { TestHelpers.assertEmptyAvroField(projectedP2, "x"); Assert.assertNull("Should project null y", projectedP2.get("y")); - Schema yRenamed = new Schema( - Types.NestedField.optional(22, "points", - Types.ListType.ofOptional(21, Types.StructType.of( - Types.NestedField.optional(18, "z", Types.IntegerType.get()) - )) - ) - ); + Schema yRenamed = + new Schema( + Types.NestedField.optional( + 22, + "points", + Types.ListType.ofOptional( + 21, + Types.StructType.of( + Types.NestedField.optional(18, "z", Types.IntegerType.get()))))); projected = writeAndRead("y_renamed", writeSchema, yRenamed, record); TestHelpers.assertEmptyAvroField(projected, "id"); diff --git a/parquet/src/test/java/org/apache/iceberg/parquet/ParquetWritingTestUtils.java b/parquet/src/test/java/org/apache/iceberg/parquet/ParquetWritingTestUtils.java index 24effa7496a5..58463bbb1edc 100644 --- a/parquet/src/test/java/org/apache/iceberg/parquet/ParquetWritingTestUtils.java +++ b/parquet/src/test/java/org/apache/iceberg/parquet/ParquetWritingTestUtils.java @@ -16,9 +16,10 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.parquet; +import static org.apache.iceberg.Files.localOutput; + import java.io.Closeable; import java.io.File; import java.io.IOException; @@ -34,42 +35,49 @@ import org.apache.parquet.schema.MessageType; import org.junit.rules.TemporaryFolder; -import static org.apache.iceberg.Files.localOutput; - -/** - * Utilities for tests that need to write Parquet files. - */ +/** Utilities for tests that need to write Parquet files. */ class ParquetWritingTestUtils { - private ParquetWritingTestUtils() { - } + private ParquetWritingTestUtils() {} - static File writeRecords(TemporaryFolder temp, Schema schema, GenericData.Record... records) throws IOException { + static File writeRecords(TemporaryFolder temp, Schema schema, GenericData.Record... records) + throws IOException { return writeRecords(temp, schema, Collections.emptyMap(), null, records); } - static File writeRecords(TemporaryFolder temp, Schema schema, - Map properties, GenericData.Record... records) throws IOException { + static File writeRecords( + TemporaryFolder temp, + Schema schema, + Map properties, + GenericData.Record... records) + throws IOException { return writeRecords(temp, schema, properties, null, records); } static File writeRecords( - TemporaryFolder temp, - Schema schema, Map properties, - Function> createWriterFunc, - GenericData.Record... records) throws IOException { + TemporaryFolder temp, + Schema schema, + Map properties, + Function> createWriterFunc, + GenericData.Record... records) + throws IOException { File file = createTempFile(temp); write(file, schema, properties, createWriterFunc, records); return file; } - static long write(File file, Schema schema, Map properties, - Function> createWriterFunc, - GenericData.Record... records) throws IOException { + static long write( + File file, + Schema schema, + Map properties, + Function> createWriterFunc, + GenericData.Record... 
records) + throws IOException { long len = 0; - FileAppender writer = Parquet.write(localOutput(file)) + FileAppender writer = + Parquet.write(localOutput(file)) .schema(schema) .setAll(properties) .createWriterFunc(createWriterFunc) @@ -77,7 +85,10 @@ static long write(File file, Schema schema, Map properties, try (Closeable toClose = writer) { writer.addAll(Lists.newArrayList(records)); - len = writer.length(); // in deprecated adapter we need to get the length first and then close the writer + len = + writer + .length(); // in deprecated adapter we need to get the length first and then close the + // writer } if (writer instanceof ParquetWriter) { diff --git a/parquet/src/test/java/org/apache/iceberg/parquet/TestBloomRowGroupFilter.java b/parquet/src/test/java/org/apache/iceberg/parquet/TestBloomRowGroupFilter.java index a8d690fff776..e3e938e45034 100644 --- a/parquet/src/test/java/org/apache/iceberg/parquet/TestBloomRowGroupFilter.java +++ b/parquet/src/test/java/org/apache/iceberg/parquet/TestBloomRowGroupFilter.java @@ -16,9 +16,29 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.parquet; +import static org.apache.iceberg.TableProperties.PARQUET_BLOOM_FILTER_COLUMN_ENABLED_PREFIX; +import static org.apache.iceberg.avro.AvroSchemaUtil.convert; +import static org.apache.iceberg.expressions.Expressions.and; +import static org.apache.iceberg.expressions.Expressions.equal; +import static org.apache.iceberg.expressions.Expressions.greaterThan; +import static org.apache.iceberg.expressions.Expressions.greaterThanOrEqual; +import static org.apache.iceberg.expressions.Expressions.in; +import static org.apache.iceberg.expressions.Expressions.isNaN; +import static org.apache.iceberg.expressions.Expressions.isNull; +import static org.apache.iceberg.expressions.Expressions.lessThan; +import static org.apache.iceberg.expressions.Expressions.lessThanOrEqual; +import static org.apache.iceberg.expressions.Expressions.not; +import static org.apache.iceberg.expressions.Expressions.notEqual; +import static org.apache.iceberg.expressions.Expressions.notIn; +import static org.apache.iceberg.expressions.Expressions.notNaN; +import static org.apache.iceberg.expressions.Expressions.notNull; +import static org.apache.iceberg.expressions.Expressions.or; +import static org.apache.iceberg.expressions.Expressions.startsWith; +import static org.apache.iceberg.types.Types.NestedField.optional; +import static org.apache.iceberg.types.Types.NestedField.required; + import java.io.File; import java.io.IOException; import java.math.BigDecimal; @@ -59,90 +79,69 @@ import org.junit.Test; import org.junit.rules.TemporaryFolder; -import static org.apache.iceberg.TableProperties.PARQUET_BLOOM_FILTER_COLUMN_ENABLED_PREFIX; -import static org.apache.iceberg.avro.AvroSchemaUtil.convert; -import static org.apache.iceberg.expressions.Expressions.and; -import static org.apache.iceberg.expressions.Expressions.equal; -import static org.apache.iceberg.expressions.Expressions.greaterThan; -import static org.apache.iceberg.expressions.Expressions.greaterThanOrEqual; -import static org.apache.iceberg.expressions.Expressions.in; -import static org.apache.iceberg.expressions.Expressions.isNaN; -import static org.apache.iceberg.expressions.Expressions.isNull; -import static org.apache.iceberg.expressions.Expressions.lessThan; -import static org.apache.iceberg.expressions.Expressions.lessThanOrEqual; -import static org.apache.iceberg.expressions.Expressions.not; -import static 
org.apache.iceberg.expressions.Expressions.notEqual; -import static org.apache.iceberg.expressions.Expressions.notIn; -import static org.apache.iceberg.expressions.Expressions.notNaN; -import static org.apache.iceberg.expressions.Expressions.notNull; -import static org.apache.iceberg.expressions.Expressions.or; -import static org.apache.iceberg.expressions.Expressions.startsWith; -import static org.apache.iceberg.types.Types.NestedField.optional; -import static org.apache.iceberg.types.Types.NestedField.required; - public class TestBloomRowGroupFilter { private static final Types.StructType structFieldType = Types.StructType.of(Types.NestedField.required(16, "int_field", IntegerType.get())); - private static final Schema SCHEMA = new Schema( - required(1, "id", IntegerType.get()), - required(2, "long", LongType.get()), - required(3, "double", DoubleType.get()), - required(4, "float", FloatType.get()), - required(5, "string", StringType.get()), - required(6, "uuid", UUIDType.get()), - required(7, "required", StringType.get()), - optional(8, "non_bloom", StringType.get()), - optional(9, "all_nulls", LongType.get()), - optional(10, "some_nulls", StringType.get()), - optional(11, "no_nulls", StringType.get()), - optional(12, "all_nans", DoubleType.get()), - optional(13, "some_nans", FloatType.get()), - optional(14, "no_nans", DoubleType.get()), - optional(15, "struct_not_null", structFieldType), - optional(17, "not_in_file", FloatType.get()), - optional(18, "no_stats", StringType.get()), - optional(19, "boolean", Types.BooleanType.get()), - optional(20, "time", Types.TimeType.get()), - optional(21, "date", Types.DateType.get()), - optional(22, "timestamp", Types.TimestampType.withoutZone()), - optional(23, "timestamptz", Types.TimestampType.withZone()), - optional(24, "binary", Types.BinaryType.get()), - optional(25, "int_decimal", Types.DecimalType.of(8, 2)), - optional(26, "long_decimal", Types.DecimalType.of(14, 2)), - optional(27, "fixed_decimal", Types.DecimalType.of(31, 2)) - ); + private static final Schema SCHEMA = + new Schema( + required(1, "id", IntegerType.get()), + required(2, "long", LongType.get()), + required(3, "double", DoubleType.get()), + required(4, "float", FloatType.get()), + required(5, "string", StringType.get()), + required(6, "uuid", UUIDType.get()), + required(7, "required", StringType.get()), + optional(8, "non_bloom", StringType.get()), + optional(9, "all_nulls", LongType.get()), + optional(10, "some_nulls", StringType.get()), + optional(11, "no_nulls", StringType.get()), + optional(12, "all_nans", DoubleType.get()), + optional(13, "some_nans", FloatType.get()), + optional(14, "no_nans", DoubleType.get()), + optional(15, "struct_not_null", structFieldType), + optional(17, "not_in_file", FloatType.get()), + optional(18, "no_stats", StringType.get()), + optional(19, "boolean", Types.BooleanType.get()), + optional(20, "time", Types.TimeType.get()), + optional(21, "date", Types.DateType.get()), + optional(22, "timestamp", Types.TimestampType.withoutZone()), + optional(23, "timestamptz", Types.TimestampType.withZone()), + optional(24, "binary", Types.BinaryType.get()), + optional(25, "int_decimal", Types.DecimalType.of(8, 2)), + optional(26, "long_decimal", Types.DecimalType.of(14, 2)), + optional(27, "fixed_decimal", Types.DecimalType.of(31, 2))); private static final Types.StructType _structFieldType = Types.StructType.of(Types.NestedField.required(16, "_int_field", IntegerType.get())); - private static final Schema FILE_SCHEMA = new Schema( - required(1, "_id", 
IntegerType.get()), - required(2, "_long", LongType.get()), - required(3, "_double", DoubleType.get()), - required(4, "_float", FloatType.get()), - required(5, "_string", StringType.get()), - required(6, "_uuid", UUIDType.get()), - required(7, "_required", StringType.get()), - required(8, "_non_bloom", StringType.get()), - optional(9, "_all_nulls", LongType.get()), - optional(10, "_some_nulls", StringType.get()), - optional(11, "_no_nulls", StringType.get()), - optional(12, "_all_nans", DoubleType.get()), - optional(13, "_some_nans", FloatType.get()), - optional(14, "_no_nans", DoubleType.get()), - optional(15, "_struct_not_null", _structFieldType), - optional(18, "_no_stats", StringType.get()), - optional(19, "_boolean", Types.BooleanType.get()), - optional(20, "_time", Types.TimeType.get()), - optional(21, "_date", Types.DateType.get()), - optional(22, "_timestamp", Types.TimestampType.withoutZone()), - optional(23, "_timestamptz", Types.TimestampType.withZone()), - optional(24, "_binary", Types.BinaryType.get()), - optional(25, "_int_decimal", Types.DecimalType.of(8, 2)), - optional(26, "_long_decimal", Types.DecimalType.of(14, 2)), - optional(27, "_fixed_decimal", Types.DecimalType.of(31, 2)) - ); + private static final Schema FILE_SCHEMA = + new Schema( + required(1, "_id", IntegerType.get()), + required(2, "_long", LongType.get()), + required(3, "_double", DoubleType.get()), + required(4, "_float", FloatType.get()), + required(5, "_string", StringType.get()), + required(6, "_uuid", UUIDType.get()), + required(7, "_required", StringType.get()), + required(8, "_non_bloom", StringType.get()), + optional(9, "_all_nulls", LongType.get()), + optional(10, "_some_nulls", StringType.get()), + optional(11, "_no_nulls", StringType.get()), + optional(12, "_all_nans", DoubleType.get()), + optional(13, "_some_nans", FloatType.get()), + optional(14, "_no_nans", DoubleType.get()), + optional(15, "_struct_not_null", _structFieldType), + optional(18, "_no_stats", StringType.get()), + optional(19, "_boolean", Types.BooleanType.get()), + optional(20, "_time", Types.TimeType.get()), + optional(21, "_date", Types.DateType.get()), + optional(22, "_timestamp", Types.TimestampType.withoutZone()), + optional(23, "_timestamptz", Types.TimestampType.withZone()), + optional(24, "_binary", Types.BinaryType.get()), + optional(25, "_int_decimal", Types.DecimalType.of(8, 2)), + optional(26, "_long_decimal", Types.DecimalType.of(14, 2)), + optional(27, "_fixed_decimal", Types.DecimalType.of(31, 2))); private static final String TOO_LONG_FOR_STATS; @@ -184,8 +183,7 @@ public class TestBloomRowGroupFilter { private BlockMetaData rowGroupMetadata = null; private BloomFilterReader bloomStore = null; - @Rule - public TemporaryFolder temp = new TemporaryFolder(); + @Rule public TemporaryFolder temp = new TemporaryFolder(); @Before public void createInputFile() throws IOException { @@ -196,42 +194,47 @@ public void createInputFile() throws IOException { org.apache.avro.Schema structSchema = AvroSchemaUtil.convert(_structFieldType); OutputFile outFile = Files.localOutput(parquetFile); - try (FileAppender appender = Parquet.write(outFile) - .schema(FILE_SCHEMA) - .set(PARQUET_BLOOM_FILTER_COLUMN_ENABLED_PREFIX + "_id", "true") - .set(PARQUET_BLOOM_FILTER_COLUMN_ENABLED_PREFIX + "_long", "true") - .set(PARQUET_BLOOM_FILTER_COLUMN_ENABLED_PREFIX + "_double", "true") - .set(PARQUET_BLOOM_FILTER_COLUMN_ENABLED_PREFIX + "_float", "true") - .set(PARQUET_BLOOM_FILTER_COLUMN_ENABLED_PREFIX + "_string", "true") - 
.set(PARQUET_BLOOM_FILTER_COLUMN_ENABLED_PREFIX + "_uuid", "true") - .set(PARQUET_BLOOM_FILTER_COLUMN_ENABLED_PREFIX + "_required", "true") - .set(PARQUET_BLOOM_FILTER_COLUMN_ENABLED_PREFIX + "_all_nulls", "true") - .set(PARQUET_BLOOM_FILTER_COLUMN_ENABLED_PREFIX + "_some_nulls", "true") - .set(PARQUET_BLOOM_FILTER_COLUMN_ENABLED_PREFIX + "_no_nulls", "true") - .set(PARQUET_BLOOM_FILTER_COLUMN_ENABLED_PREFIX + "_all_nans", "true") - .set(PARQUET_BLOOM_FILTER_COLUMN_ENABLED_PREFIX + "_some_nans", "true") - .set(PARQUET_BLOOM_FILTER_COLUMN_ENABLED_PREFIX + "_no_nans", "true") - .set(PARQUET_BLOOM_FILTER_COLUMN_ENABLED_PREFIX + "_struct_not_null._int_field", "true") - .set(PARQUET_BLOOM_FILTER_COLUMN_ENABLED_PREFIX + "_not_in_file", "true") - .set(PARQUET_BLOOM_FILTER_COLUMN_ENABLED_PREFIX + "_no_stats", "true") - .set(PARQUET_BLOOM_FILTER_COLUMN_ENABLED_PREFIX + "_boolean", "true") - .set(PARQUET_BLOOM_FILTER_COLUMN_ENABLED_PREFIX + "_time", "true") - .set(PARQUET_BLOOM_FILTER_COLUMN_ENABLED_PREFIX + "_date", "true") - .set(PARQUET_BLOOM_FILTER_COLUMN_ENABLED_PREFIX + "_timestamp", "true") - .set(PARQUET_BLOOM_FILTER_COLUMN_ENABLED_PREFIX + "_timestamptz", "true") - .set(PARQUET_BLOOM_FILTER_COLUMN_ENABLED_PREFIX + "_binary", "true") - .set(PARQUET_BLOOM_FILTER_COLUMN_ENABLED_PREFIX + "_int_decimal", "true") - .set(PARQUET_BLOOM_FILTER_COLUMN_ENABLED_PREFIX + "_long_decimal", "true") - .set(PARQUET_BLOOM_FILTER_COLUMN_ENABLED_PREFIX + "_fixed_decimal", "true") - .build()) { + try (FileAppender appender = + Parquet.write(outFile) + .schema(FILE_SCHEMA) + .set(PARQUET_BLOOM_FILTER_COLUMN_ENABLED_PREFIX + "_id", "true") + .set(PARQUET_BLOOM_FILTER_COLUMN_ENABLED_PREFIX + "_long", "true") + .set(PARQUET_BLOOM_FILTER_COLUMN_ENABLED_PREFIX + "_double", "true") + .set(PARQUET_BLOOM_FILTER_COLUMN_ENABLED_PREFIX + "_float", "true") + .set(PARQUET_BLOOM_FILTER_COLUMN_ENABLED_PREFIX + "_string", "true") + .set(PARQUET_BLOOM_FILTER_COLUMN_ENABLED_PREFIX + "_uuid", "true") + .set(PARQUET_BLOOM_FILTER_COLUMN_ENABLED_PREFIX + "_required", "true") + .set(PARQUET_BLOOM_FILTER_COLUMN_ENABLED_PREFIX + "_all_nulls", "true") + .set(PARQUET_BLOOM_FILTER_COLUMN_ENABLED_PREFIX + "_some_nulls", "true") + .set(PARQUET_BLOOM_FILTER_COLUMN_ENABLED_PREFIX + "_no_nulls", "true") + .set(PARQUET_BLOOM_FILTER_COLUMN_ENABLED_PREFIX + "_all_nans", "true") + .set(PARQUET_BLOOM_FILTER_COLUMN_ENABLED_PREFIX + "_some_nans", "true") + .set(PARQUET_BLOOM_FILTER_COLUMN_ENABLED_PREFIX + "_no_nans", "true") + .set(PARQUET_BLOOM_FILTER_COLUMN_ENABLED_PREFIX + "_struct_not_null._int_field", "true") + .set(PARQUET_BLOOM_FILTER_COLUMN_ENABLED_PREFIX + "_not_in_file", "true") + .set(PARQUET_BLOOM_FILTER_COLUMN_ENABLED_PREFIX + "_no_stats", "true") + .set(PARQUET_BLOOM_FILTER_COLUMN_ENABLED_PREFIX + "_boolean", "true") + .set(PARQUET_BLOOM_FILTER_COLUMN_ENABLED_PREFIX + "_time", "true") + .set(PARQUET_BLOOM_FILTER_COLUMN_ENABLED_PREFIX + "_date", "true") + .set(PARQUET_BLOOM_FILTER_COLUMN_ENABLED_PREFIX + "_timestamp", "true") + .set(PARQUET_BLOOM_FILTER_COLUMN_ENABLED_PREFIX + "_timestamptz", "true") + .set(PARQUET_BLOOM_FILTER_COLUMN_ENABLED_PREFIX + "_binary", "true") + .set(PARQUET_BLOOM_FILTER_COLUMN_ENABLED_PREFIX + "_int_decimal", "true") + .set(PARQUET_BLOOM_FILTER_COLUMN_ENABLED_PREFIX + "_long_decimal", "true") + .set(PARQUET_BLOOM_FILTER_COLUMN_ENABLED_PREFIX + "_fixed_decimal", "true") + .build()) { GenericRecordBuilder builder = new GenericRecordBuilder(convert(FILE_SCHEMA, "table")); // create 50 records for (int i = 0; i < 
INT_VALUE_COUNT; i += 1) { builder.set("_id", INT_MIN_VALUE + i); // min=30, max=79, num-nulls=0 builder.set("_long", LONG_BASE + INT_MIN_VALUE + i); // min=130L, max=179L, num-nulls=0 - builder.set("_double", DOUBLE_BASE + INT_MIN_VALUE + i); // min=1030D, max=1079D, num-nulls=0 - builder.set("_float", FLOAT_BASE + INT_MIN_VALUE + i); // min=10030F, max=10079F, num-nulls=0 - builder.set("_string", BINARY_PREFIX + (INT_MIN_VALUE + i)); // min=BINARY测试_30, max=BINARY测试_79, num-nulls=0 + builder.set( + "_double", DOUBLE_BASE + INT_MIN_VALUE + i); // min=1030D, max=1079D, num-nulls=0 + builder.set( + "_float", FLOAT_BASE + INT_MIN_VALUE + i); // min=10030F, max=10079F, num-nulls=0 + builder.set( + "_string", + BINARY_PREFIX + (INT_MIN_VALUE + i)); // min=BINARY测试_30, max=BINARY测试_79, num-nulls=0 builder.set("_uuid", RANDOM_UUIDS.get(i)); // required, random uuid, always non-null builder.set("_required", "req"); // required, always non-null builder.set("_non_bloom", RANDOM_UUIDS.get(i)); // bloom filter not enabled @@ -271,239 +274,314 @@ public void createInputFile() throws IOException { @Test public void testNotNull() { - boolean shouldRead = new ParquetBloomRowGroupFilter(SCHEMA, notNull("all_nulls")) - .shouldRead(parquetSchema, rowGroupMetadata, bloomStore); + boolean shouldRead = + new ParquetBloomRowGroupFilter(SCHEMA, notNull("all_nulls")) + .shouldRead(parquetSchema, rowGroupMetadata, bloomStore); Assert.assertTrue("Should read: bloom filter doesn't help", shouldRead); - shouldRead = new ParquetBloomRowGroupFilter(SCHEMA, notNull("some_nulls")) - .shouldRead(parquetSchema, rowGroupMetadata, bloomStore); + shouldRead = + new ParquetBloomRowGroupFilter(SCHEMA, notNull("some_nulls")) + .shouldRead(parquetSchema, rowGroupMetadata, bloomStore); Assert.assertTrue("Should read: bloom filter doesn't help", shouldRead); - shouldRead = new ParquetBloomRowGroupFilter(SCHEMA, notNull("no_nulls")) - .shouldRead(parquetSchema, rowGroupMetadata, bloomStore); + shouldRead = + new ParquetBloomRowGroupFilter(SCHEMA, notNull("no_nulls")) + .shouldRead(parquetSchema, rowGroupMetadata, bloomStore); Assert.assertTrue("Should read: bloom filter doesn't help", shouldRead); - shouldRead = new ParquetBloomRowGroupFilter(SCHEMA, notNull("struct_not_null.int_field")) - .shouldRead(parquetSchema, rowGroupMetadata, bloomStore); + shouldRead = + new ParquetBloomRowGroupFilter(SCHEMA, notNull("struct_not_null.int_field")) + .shouldRead(parquetSchema, rowGroupMetadata, bloomStore); Assert.assertTrue("Should read: this field is required and are always not-null", shouldRead); } @Test public void testIsNull() { - boolean shouldRead = new ParquetBloomRowGroupFilter(SCHEMA, isNull("all_nulls")) - .shouldRead(parquetSchema, rowGroupMetadata, bloomStore); + boolean shouldRead = + new ParquetBloomRowGroupFilter(SCHEMA, isNull("all_nulls")) + .shouldRead(parquetSchema, rowGroupMetadata, bloomStore); Assert.assertTrue("Should read: bloom filter doesn't help", shouldRead); - shouldRead = new ParquetBloomRowGroupFilter(SCHEMA, isNull("some_nulls")) - .shouldRead(parquetSchema, rowGroupMetadata, bloomStore); + shouldRead = + new ParquetBloomRowGroupFilter(SCHEMA, isNull("some_nulls")) + .shouldRead(parquetSchema, rowGroupMetadata, bloomStore); Assert.assertTrue("Should read: bloom filter doesn't help", shouldRead); - shouldRead = new ParquetBloomRowGroupFilter(SCHEMA, isNull("no_nulls")) - .shouldRead(parquetSchema, rowGroupMetadata, bloomStore); + shouldRead = + new ParquetBloomRowGroupFilter(SCHEMA, isNull("no_nulls")) + 
.shouldRead(parquetSchema, rowGroupMetadata, bloomStore); Assert.assertTrue("Should read: bloom filter doesn't help", shouldRead); - shouldRead = new ParquetBloomRowGroupFilter(SCHEMA, isNull("struct_not_null.int_field")) - .shouldRead(parquetSchema, rowGroupMetadata, bloomStore); + shouldRead = + new ParquetBloomRowGroupFilter(SCHEMA, isNull("struct_not_null.int_field")) + .shouldRead(parquetSchema, rowGroupMetadata, bloomStore); Assert.assertFalse("Should skip: this field is required and are always not-null", shouldRead); } @Test public void testRequiredColumn() { - boolean shouldRead = new ParquetBloomRowGroupFilter(SCHEMA, notNull("required")) - .shouldRead(parquetSchema, rowGroupMetadata, bloomStore); + boolean shouldRead = + new ParquetBloomRowGroupFilter(SCHEMA, notNull("required")) + .shouldRead(parquetSchema, rowGroupMetadata, bloomStore); Assert.assertTrue("Should read: required columns are always non-null", shouldRead); - shouldRead = new ParquetBloomRowGroupFilter(SCHEMA, isNull("required")) - .shouldRead(parquetSchema, rowGroupMetadata, bloomStore); + shouldRead = + new ParquetBloomRowGroupFilter(SCHEMA, isNull("required")) + .shouldRead(parquetSchema, rowGroupMetadata, bloomStore); Assert.assertFalse("Should skip: required columns are always non-null", shouldRead); } @Test public void testIsNaNs() { - boolean shouldRead = new ParquetBloomRowGroupFilter(SCHEMA, isNaN("all_nans")) - .shouldRead(parquetSchema, rowGroupMetadata, bloomStore); + boolean shouldRead = + new ParquetBloomRowGroupFilter(SCHEMA, isNaN("all_nans")) + .shouldRead(parquetSchema, rowGroupMetadata, bloomStore); Assert.assertTrue("Should read: bloom filter doesn't help", shouldRead); - shouldRead = new ParquetBloomRowGroupFilter(SCHEMA, isNaN("some_nans")) - .shouldRead(parquetSchema, rowGroupMetadata, bloomStore); + shouldRead = + new ParquetBloomRowGroupFilter(SCHEMA, isNaN("some_nans")) + .shouldRead(parquetSchema, rowGroupMetadata, bloomStore); Assert.assertTrue("Should read: bloom filter doesn't help", shouldRead); - shouldRead = new ParquetBloomRowGroupFilter(SCHEMA, isNaN("no_nans")) - .shouldRead(parquetSchema, rowGroupMetadata, bloomStore); + shouldRead = + new ParquetBloomRowGroupFilter(SCHEMA, isNaN("no_nans")) + .shouldRead(parquetSchema, rowGroupMetadata, bloomStore); Assert.assertTrue("Should read: bloom filter doesn't help", shouldRead); } @Test public void testNotNaNs() { - boolean shouldRead = new ParquetBloomRowGroupFilter(SCHEMA, notNaN("all_nans")) - .shouldRead(parquetSchema, rowGroupMetadata, bloomStore); + boolean shouldRead = + new ParquetBloomRowGroupFilter(SCHEMA, notNaN("all_nans")) + .shouldRead(parquetSchema, rowGroupMetadata, bloomStore); Assert.assertTrue("Should read: bloom filter doesn't help", shouldRead); - shouldRead = new ParquetBloomRowGroupFilter(SCHEMA, notNaN("some_nans")) - .shouldRead(parquetSchema, rowGroupMetadata, bloomStore); + shouldRead = + new ParquetBloomRowGroupFilter(SCHEMA, notNaN("some_nans")) + .shouldRead(parquetSchema, rowGroupMetadata, bloomStore); Assert.assertTrue("Should read: bloom filter doesn't help", shouldRead); - shouldRead = new ParquetBloomRowGroupFilter(SCHEMA, notNaN("no_nans")) - .shouldRead(parquetSchema, rowGroupMetadata, bloomStore); + shouldRead = + new ParquetBloomRowGroupFilter(SCHEMA, notNaN("no_nans")) + .shouldRead(parquetSchema, rowGroupMetadata, bloomStore); Assert.assertTrue("Should read: bloom filter doesn't help", shouldRead); } @Test public void testStartsWith() { - boolean shouldRead = new 
ParquetBloomRowGroupFilter(SCHEMA, startsWith("non_bloom", "re")) - .shouldRead(parquetSchema, rowGroupMetadata, bloomStore); + boolean shouldRead = + new ParquetBloomRowGroupFilter(SCHEMA, startsWith("non_bloom", "re")) + .shouldRead(parquetSchema, rowGroupMetadata, bloomStore); Assert.assertTrue("Should read: no bloom", shouldRead); - shouldRead = new ParquetBloomRowGroupFilter(SCHEMA, startsWith("required", "re")) - .shouldRead(parquetSchema, rowGroupMetadata, bloomStore); + shouldRead = + new ParquetBloomRowGroupFilter(SCHEMA, startsWith("required", "re")) + .shouldRead(parquetSchema, rowGroupMetadata, bloomStore); Assert.assertTrue("Should read: bloom filter doesn't help", shouldRead); - shouldRead = new ParquetBloomRowGroupFilter(SCHEMA, startsWith("required", "req")) - .shouldRead(parquetSchema, rowGroupMetadata, bloomStore); + shouldRead = + new ParquetBloomRowGroupFilter(SCHEMA, startsWith("required", "req")) + .shouldRead(parquetSchema, rowGroupMetadata, bloomStore); Assert.assertTrue("Should read: bloom filter doesn't help", shouldRead); - shouldRead = new ParquetBloomRowGroupFilter(SCHEMA, startsWith("some_nulls", "so")) - .shouldRead(parquetSchema, rowGroupMetadata, bloomStore); + shouldRead = + new ParquetBloomRowGroupFilter(SCHEMA, startsWith("some_nulls", "so")) + .shouldRead(parquetSchema, rowGroupMetadata, bloomStore); Assert.assertTrue("Should read: bloom filter doesn't help", shouldRead); - shouldRead = new ParquetBloomRowGroupFilter(SCHEMA, startsWith("required", "reqs")) - .shouldRead(parquetSchema, rowGroupMetadata, bloomStore); + shouldRead = + new ParquetBloomRowGroupFilter(SCHEMA, startsWith("required", "reqs")) + .shouldRead(parquetSchema, rowGroupMetadata, bloomStore); Assert.assertTrue("Should read: bloom filter doesn't help", shouldRead); - shouldRead = new ParquetBloomRowGroupFilter(SCHEMA, startsWith("some_nulls", "somex")) - .shouldRead(parquetSchema, rowGroupMetadata, bloomStore); + shouldRead = + new ParquetBloomRowGroupFilter(SCHEMA, startsWith("some_nulls", "somex")) + .shouldRead(parquetSchema, rowGroupMetadata, bloomStore); Assert.assertTrue("Should read: bloom filter doesn't help", shouldRead); - shouldRead = new ParquetBloomRowGroupFilter(SCHEMA, startsWith("no_nulls", "xxx")) - .shouldRead(parquetSchema, rowGroupMetadata, bloomStore); + shouldRead = + new ParquetBloomRowGroupFilter(SCHEMA, startsWith("no_nulls", "xxx")) + .shouldRead(parquetSchema, rowGroupMetadata, bloomStore); Assert.assertTrue("Should read: bloom filter doesn't help", shouldRead); } @Test public void testMissingColumn() { - TestHelpers.assertThrows("Should complain about missing column in expression", - ValidationException.class, "Cannot find field 'missing'", - () -> new ParquetBloomRowGroupFilter(SCHEMA, lessThan("missing", 5)) - .shouldRead(parquetSchema, rowGroupMetadata, bloomStore)); + TestHelpers.assertThrows( + "Should complain about missing column in expression", + ValidationException.class, + "Cannot find field 'missing'", + () -> + new ParquetBloomRowGroupFilter(SCHEMA, lessThan("missing", 5)) + .shouldRead(parquetSchema, rowGroupMetadata, bloomStore)); } @Test public void testColumnNotInFile() { - Expression[] exprs = new Expression[]{ - lessThan("not_in_file", 1.0f), lessThanOrEqual("not_in_file", 1.0f), - equal("not_in_file", 1.0f), greaterThan("not_in_file", 1.0f), - greaterThanOrEqual("not_in_file", 1.0f), notNull("not_in_file"), - isNull("not_in_file"), notEqual("not_in_file", 1.0f), in("not_in_file", 1.0f, 2.0f) - }; + Expression[] exprs = + new Expression[] { 
+ lessThan("not_in_file", 1.0f), + lessThanOrEqual("not_in_file", 1.0f), + equal("not_in_file", 1.0f), + greaterThan("not_in_file", 1.0f), + greaterThanOrEqual("not_in_file", 1.0f), + notNull("not_in_file"), + isNull("not_in_file"), + notEqual("not_in_file", 1.0f), + in("not_in_file", 1.0f, 2.0f) + }; for (Expression expr : exprs) { - boolean shouldRead = new ParquetBloomRowGroupFilter(SCHEMA, expr) - .shouldRead(parquetSchema, rowGroupMetadata, bloomStore); + boolean shouldRead = + new ParquetBloomRowGroupFilter(SCHEMA, expr) + .shouldRead(parquetSchema, rowGroupMetadata, bloomStore); Assert.assertTrue("Should read: bloom filter cannot be found: " + expr, shouldRead); } } @Test public void testColumnNotBloomFilterEnabled() { - Expression[] exprs = new Expression[]{ - lessThan("non_bloom", "a"), lessThanOrEqual("non_bloom", "a"), equal("non_bloom", "a"), - greaterThan("non_bloom", "a"), greaterThanOrEqual("non_bloom", "a"), notNull("non_bloom"), - isNull("non_bloom"), notEqual("non_bloom", "a"), in("non_bloom", "a", "test1", "test2") - }; + Expression[] exprs = + new Expression[] { + lessThan("non_bloom", "a"), lessThanOrEqual("non_bloom", "a"), equal("non_bloom", "a"), + greaterThan("non_bloom", "a"), greaterThanOrEqual("non_bloom", "a"), notNull("non_bloom"), + isNull("non_bloom"), notEqual("non_bloom", "a"), in("non_bloom", "a", "test1", "test2") + }; for (Expression expr : exprs) { - boolean shouldRead = new ParquetBloomRowGroupFilter(SCHEMA, expr) - .shouldRead(parquetSchema, rowGroupMetadata, bloomStore); + boolean shouldRead = + new ParquetBloomRowGroupFilter(SCHEMA, expr) + .shouldRead(parquetSchema, rowGroupMetadata, bloomStore); Assert.assertTrue("Should read: bloom filter cannot be found: " + expr, shouldRead); } } @Test public void testMissingStats() { - boolean shouldRead = new ParquetBloomRowGroupFilter(SCHEMA, equal("no_stats", "a")) - .shouldRead(parquetSchema, rowGroupMetadata, bloomStore); + boolean shouldRead = + new ParquetBloomRowGroupFilter(SCHEMA, equal("no_stats", "a")) + .shouldRead(parquetSchema, rowGroupMetadata, bloomStore); Assert.assertFalse("Should skip: stats are missing but bloom filter is present", shouldRead); } @Test public void testNot() { - // this test case must use a real predicate, not alwaysTrue(), otherwise binding will simplify it out + // this test case must use a real predicate, not alwaysTrue(), otherwise binding will simplify + // it out for (int i = INT_MIN_VALUE - 20; i < INT_MAX_VALUE + 20; i++) { - boolean shouldRead = new ParquetBloomRowGroupFilter(SCHEMA, not(equal("id", i))) - .shouldRead(parquetSchema, rowGroupMetadata, bloomStore); + boolean shouldRead = + new ParquetBloomRowGroupFilter(SCHEMA, not(equal("id", i))) + .shouldRead(parquetSchema, rowGroupMetadata, bloomStore); Assert.assertTrue("Should read: bloom filter doesn't help", shouldRead); } } @Test public void testAnd() { - // this test case must use a real predicate, not alwaysTrue(), otherwise binding will simplify it out + // this test case must use a real predicate, not alwaysTrue(), otherwise binding will simplify + // it out boolean shouldRead = - new ParquetBloomRowGroupFilter(SCHEMA, and(equal("id", INT_MIN_VALUE - 25), equal("id", INT_MIN_VALUE + 30))) + new ParquetBloomRowGroupFilter( + SCHEMA, and(equal("id", INT_MIN_VALUE - 25), equal("id", INT_MIN_VALUE + 30))) .shouldRead(parquetSchema, rowGroupMetadata, bloomStore); Assert.assertFalse("Should skip: and(false, true)", shouldRead); shouldRead = - new ParquetBloomRowGroupFilter(SCHEMA, and(equal("id", INT_MIN_VALUE 
- 25), equal("id", INT_MAX_VALUE + 1))) + new ParquetBloomRowGroupFilter( + SCHEMA, and(equal("id", INT_MIN_VALUE - 25), equal("id", INT_MAX_VALUE + 1))) .shouldRead(parquetSchema, rowGroupMetadata, bloomStore); Assert.assertFalse("Should skip: and(false, false)", shouldRead); shouldRead = - new ParquetBloomRowGroupFilter(SCHEMA, and(equal("id", INT_MIN_VALUE + 25), equal("id", INT_MIN_VALUE))) + new ParquetBloomRowGroupFilter( + SCHEMA, and(equal("id", INT_MIN_VALUE + 25), equal("id", INT_MIN_VALUE))) .shouldRead(parquetSchema, rowGroupMetadata, bloomStore); Assert.assertTrue("Should read: and(true, true)", shouldRead); // AND filters that refer different columns ("id", "long", "binary") - shouldRead = new ParquetBloomRowGroupFilter(SCHEMA, and(equal("id", INT_MIN_VALUE + 25), - equal("long", LONG_BASE + 30), equal("binary", RANDOM_BYTES.get(30)))) + shouldRead = + new ParquetBloomRowGroupFilter( + SCHEMA, + and( + equal("id", INT_MIN_VALUE + 25), + equal("long", LONG_BASE + 30), + equal("binary", RANDOM_BYTES.get(30)))) .shouldRead(parquetSchema, rowGroupMetadata, bloomStore); Assert.assertTrue("Should read: and(true, true, true)", shouldRead); // AND filters that refer different columns ("id", "long", "binary") - shouldRead = new ParquetBloomRowGroupFilter(SCHEMA, and(equal("id", INT_MIN_VALUE - 25), - equal("long", LONG_BASE + 30), equal("binary", RANDOM_BYTES.get(30)))) + shouldRead = + new ParquetBloomRowGroupFilter( + SCHEMA, + and( + equal("id", INT_MIN_VALUE - 25), + equal("long", LONG_BASE + 30), + equal("binary", RANDOM_BYTES.get(30)))) .shouldRead(parquetSchema, rowGroupMetadata, bloomStore); Assert.assertFalse("Should skip: and(false, true, true)", shouldRead); // In And, one of the filter's column doesn't have bloom filter - shouldRead = new ParquetBloomRowGroupFilter(SCHEMA, and(equal("id", INT_MIN_VALUE + 25), - equal("long", LONG_BASE + 30), equal("binary", RANDOM_BYTES.get(30)), - equal("non_bloom", "a"))) - .shouldRead(parquetSchema, rowGroupMetadata, bloomStore); + shouldRead = + new ParquetBloomRowGroupFilter( + SCHEMA, + and( + equal("id", INT_MIN_VALUE + 25), + equal("long", LONG_BASE + 30), + equal("binary", RANDOM_BYTES.get(30)), + equal("non_bloom", "a"))) + .shouldRead(parquetSchema, rowGroupMetadata, bloomStore); Assert.assertTrue("Should read: and(true, true, true, true)", shouldRead); // In And, one of the filter's column doesn't have bloom filter - shouldRead = new ParquetBloomRowGroupFilter(SCHEMA, and(equal("id", INT_MIN_VALUE - 25), - equal("long", LONG_BASE + 30), equal("binary", RANDOM_BYTES.get(30)), - equal("non_bloom", "a"))) - .shouldRead(parquetSchema, rowGroupMetadata, bloomStore); + shouldRead = + new ParquetBloomRowGroupFilter( + SCHEMA, + and( + equal("id", INT_MIN_VALUE - 25), + equal("long", LONG_BASE + 30), + equal("binary", RANDOM_BYTES.get(30)), + equal("non_bloom", "a"))) + .shouldRead(parquetSchema, rowGroupMetadata, bloomStore); Assert.assertFalse("Should skip: and(false, true, true, true)", shouldRead); // In And, one of the filter's column is not in the file - shouldRead = new ParquetBloomRowGroupFilter(SCHEMA, and(equal("id", INT_MIN_VALUE + 25), - equal("long", LONG_BASE + 30), equal("binary", RANDOM_BYTES.get(30)), - equal("not_in_file", 1.0f))) - .shouldRead(parquetSchema, rowGroupMetadata, bloomStore); + shouldRead = + new ParquetBloomRowGroupFilter( + SCHEMA, + and( + equal("id", INT_MIN_VALUE + 25), + equal("long", LONG_BASE + 30), + equal("binary", RANDOM_BYTES.get(30)), + equal("not_in_file", 1.0f))) + 
.shouldRead(parquetSchema, rowGroupMetadata, bloomStore); Assert.assertTrue("Should read: and(true, true, true, true)", shouldRead); // In And, one of the filter's column is not in the file - shouldRead = new ParquetBloomRowGroupFilter(SCHEMA, and(equal("id", INT_MIN_VALUE - 25), - equal("long", LONG_BASE + 30), equal("binary", RANDOM_BYTES.get(30)), - equal("not_in_file", 1.0f))) - .shouldRead(parquetSchema, rowGroupMetadata, bloomStore); + shouldRead = + new ParquetBloomRowGroupFilter( + SCHEMA, + and( + equal("id", INT_MIN_VALUE - 25), + equal("long", LONG_BASE + 30), + equal("binary", RANDOM_BYTES.get(30)), + equal("not_in_file", 1.0f))) + .shouldRead(parquetSchema, rowGroupMetadata, bloomStore); Assert.assertFalse("Should skip: and(false, true, true, true)", shouldRead); } @Test public void testOr() { - // this test case must use a real predicate, not alwaysTrue(), otherwise binding will simplify it out + // this test case must use a real predicate, not alwaysTrue(), otherwise binding will simplify + // it out boolean shouldRead = - new ParquetBloomRowGroupFilter(SCHEMA, or(equal("id", INT_MIN_VALUE - 25), equal("id", INT_MAX_VALUE + 1))) + new ParquetBloomRowGroupFilter( + SCHEMA, or(equal("id", INT_MIN_VALUE - 25), equal("id", INT_MAX_VALUE + 1))) .shouldRead(parquetSchema, rowGroupMetadata, bloomStore); Assert.assertFalse("Should skip: or(false, false)", shouldRead); shouldRead = - new ParquetBloomRowGroupFilter(SCHEMA, or(equal("id", INT_MIN_VALUE - 25), equal("id", INT_MAX_VALUE - 19))) + new ParquetBloomRowGroupFilter( + SCHEMA, or(equal("id", INT_MIN_VALUE - 25), equal("id", INT_MAX_VALUE - 19))) .shouldRead(parquetSchema, rowGroupMetadata, bloomStore); Assert.assertTrue("Should read: or(false, true)", shouldRead); } @@ -511,8 +589,9 @@ public void testOr() { @Test public void testIntegerLt() { for (int i = INT_MIN_VALUE - 20; i < INT_MAX_VALUE + 20; i++) { - boolean shouldRead = new ParquetBloomRowGroupFilter(SCHEMA, lessThan("id", i)) - .shouldRead(parquetSchema, rowGroupMetadata, bloomStore); + boolean shouldRead = + new ParquetBloomRowGroupFilter(SCHEMA, lessThan("id", i)) + .shouldRead(parquetSchema, rowGroupMetadata, bloomStore); Assert.assertTrue("Should read: bloom filter doesn't help", shouldRead); } } @@ -520,8 +599,9 @@ public void testIntegerLt() { @Test public void testIntegerLtEq() { for (int i = INT_MIN_VALUE - 20; i < INT_MAX_VALUE + 20; i++) { - boolean shouldRead = new ParquetBloomRowGroupFilter(SCHEMA, lessThanOrEqual("id", i)) - .shouldRead(parquetSchema, rowGroupMetadata, bloomStore); + boolean shouldRead = + new ParquetBloomRowGroupFilter(SCHEMA, lessThanOrEqual("id", i)) + .shouldRead(parquetSchema, rowGroupMetadata, bloomStore); Assert.assertTrue("Should read: bloom filter doesn't help", shouldRead); } } @@ -529,8 +609,9 @@ public void testIntegerLtEq() { @Test public void testIntegerGt() { for (int i = INT_MIN_VALUE - 20; i < INT_MAX_VALUE + 20; i++) { - boolean shouldRead = new ParquetBloomRowGroupFilter(SCHEMA, greaterThan("id", i)) - .shouldRead(parquetSchema, rowGroupMetadata, bloomStore); + boolean shouldRead = + new ParquetBloomRowGroupFilter(SCHEMA, greaterThan("id", i)) + .shouldRead(parquetSchema, rowGroupMetadata, bloomStore); Assert.assertTrue("Should read: bloom filter doesn't help", shouldRead); } } @@ -538,8 +619,9 @@ public void testIntegerGt() { @Test public void testIntegerGtEq() { for (int i = INT_MIN_VALUE - 20; i < INT_MAX_VALUE + 20; i++) { - boolean shouldRead = new ParquetBloomRowGroupFilter(SCHEMA, greaterThanOrEqual("id", i)) - 
.shouldRead(parquetSchema, rowGroupMetadata, bloomStore); + boolean shouldRead = + new ParquetBloomRowGroupFilter(SCHEMA, greaterThanOrEqual("id", i)) + .shouldRead(parquetSchema, rowGroupMetadata, bloomStore); Assert.assertTrue("Should read: bloom filter doesn't help", shouldRead); } } @@ -547,8 +629,9 @@ public void testIntegerGtEq() { @Test public void testIntegerEq() { for (int i = INT_MIN_VALUE - 20; i < INT_MAX_VALUE + 20; i++) { - boolean shouldRead = new ParquetBloomRowGroupFilter(SCHEMA, equal("id", i)) - .shouldRead(parquetSchema, rowGroupMetadata, bloomStore); + boolean shouldRead = + new ParquetBloomRowGroupFilter(SCHEMA, equal("id", i)) + .shouldRead(parquetSchema, rowGroupMetadata, bloomStore); if (i >= INT_MIN_VALUE && i <= INT_MAX_VALUE) { Assert.assertTrue("Should read: integer within range", shouldRead); } else { @@ -560,8 +643,9 @@ public void testIntegerEq() { @Test public void testLongEq() { for (int i = INT_MIN_VALUE - 20; i < INT_MAX_VALUE + 20; i++) { - boolean shouldRead = new ParquetBloomRowGroupFilter(SCHEMA, equal("long", LONG_BASE + i)) - .shouldRead(parquetSchema, rowGroupMetadata, bloomStore); + boolean shouldRead = + new ParquetBloomRowGroupFilter(SCHEMA, equal("long", LONG_BASE + i)) + .shouldRead(parquetSchema, rowGroupMetadata, bloomStore); if (i >= INT_MIN_VALUE && i <= INT_MAX_VALUE) { Assert.assertTrue("Should read: long within range", shouldRead); } else { @@ -573,8 +657,9 @@ public void testLongEq() { @Test public void testBytesEq() { for (int i = 0; i < INT_VALUE_COUNT; i++) { - boolean shouldRead = new ParquetBloomRowGroupFilter(SCHEMA, equal("binary", RANDOM_BYTES.get(i))) - .shouldRead(parquetSchema, rowGroupMetadata, bloomStore); + boolean shouldRead = + new ParquetBloomRowGroupFilter(SCHEMA, equal("binary", RANDOM_BYTES.get(i))) + .shouldRead(parquetSchema, rowGroupMetadata, bloomStore); Assert.assertTrue("Should read: binary within range", shouldRead); } @@ -582,8 +667,9 @@ public void testBytesEq() { for (int i = 1; i <= 10; i += 1) { byte[] byteArray = new byte[i]; rd.nextBytes(byteArray); - boolean shouldRead = new ParquetBloomRowGroupFilter(SCHEMA, equal("binary", byteArray)) - .shouldRead(parquetSchema, rowGroupMetadata, bloomStore); + boolean shouldRead = + new ParquetBloomRowGroupFilter(SCHEMA, equal("binary", byteArray)) + .shouldRead(parquetSchema, rowGroupMetadata, bloomStore); Assert.assertFalse("Should not read: cannot match a new generated binary", shouldRead); } } @@ -591,53 +677,57 @@ public void testBytesEq() { @Test public void testIntDeciamlEq() { for (int i = 0; i < INT_VALUE_COUNT; i++) { - boolean shouldRead = new ParquetBloomRowGroupFilter(SCHEMA, - equal("int_decimal", new BigDecimal(String.valueOf(77.77 + i)))) - .shouldRead(parquetSchema, rowGroupMetadata, bloomStore); + boolean shouldRead = + new ParquetBloomRowGroupFilter( + SCHEMA, equal("int_decimal", new BigDecimal(String.valueOf(77.77 + i)))) + .shouldRead(parquetSchema, rowGroupMetadata, bloomStore); Assert.assertTrue("Should read: decimal within range", shouldRead); } - boolean shouldRead = new ParquetBloomRowGroupFilter(SCHEMA, - equal("int_decimal", new BigDecimal("1234.56"))) - .shouldRead(parquetSchema, rowGroupMetadata, bloomStore); + boolean shouldRead = + new ParquetBloomRowGroupFilter(SCHEMA, equal("int_decimal", new BigDecimal("1234.56"))) + .shouldRead(parquetSchema, rowGroupMetadata, bloomStore); Assert.assertFalse("Should not read: decimal outside range", shouldRead); } @Test - public void testLongDeciamlEq() { + public void testLongDeciamlEq() { 
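The equality cases in this hunk (integer, long, binary, decimal) all funnel through the same call pattern: build a ParquetBloomRowGroupFilter from the Iceberg schema and an expression, then ask shouldRead() whether the row group can possibly contain a match. A minimal sketch of that usage outside the test fixture follows; BloomPruneSketch and mayContainId are hypothetical helpers, and the Schema, MessageType, BlockMetaData, and BloomFilterReader arguments stand in for objects a caller would normally obtain from ParquetFileReader rather than anything defined in this patch.

import org.apache.iceberg.Schema;
import org.apache.iceberg.expressions.Expression;
import org.apache.iceberg.expressions.Expressions;
import org.apache.iceberg.parquet.ParquetBloomRowGroupFilter;
import org.apache.parquet.hadoop.BloomFilterReader;
import org.apache.parquet.hadoop.metadata.BlockMetaData;
import org.apache.parquet.schema.MessageType;

class BloomPruneSketch {
  // Hypothetical helper: true means the row group may contain id == value;
  // false means the bloom filter proves it cannot, so the row group's pages
  // can be skipped without being read.
  static boolean mayContainId(
      Schema icebergSchema,
      MessageType fileSchema,
      BlockMetaData rowGroup,
      BloomFilterReader bloomReader,
      int value) {
    Expression expr = Expressions.equal("id", value);
    return new ParquetBloomRowGroupFilter(icebergSchema, expr, /* caseSensitive= */ true)
        .shouldRead(fileSchema, rowGroup, bloomReader);
  }
}

As the assertions around this point show, the filter only ever skips on equality-style predicates; range predicates and notEqual always come back as "may match" because a bloom filter can only prove absence of specific values.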
for (int i = 0; i < INT_VALUE_COUNT; i++) { - boolean shouldRead = new ParquetBloomRowGroupFilter(SCHEMA, - equal("long_decimal", new BigDecimal(String.valueOf(88.88 + i)))) - .shouldRead(parquetSchema, rowGroupMetadata, bloomStore); + boolean shouldRead = + new ParquetBloomRowGroupFilter( + SCHEMA, equal("long_decimal", new BigDecimal(String.valueOf(88.88 + i)))) + .shouldRead(parquetSchema, rowGroupMetadata, bloomStore); Assert.assertTrue("Should read: decimal within range", shouldRead); } - boolean shouldRead = new ParquetBloomRowGroupFilter(SCHEMA, - equal("long_decimal", new BigDecimal("1234.56"))) - .shouldRead(parquetSchema, rowGroupMetadata, bloomStore); + boolean shouldRead = + new ParquetBloomRowGroupFilter(SCHEMA, equal("long_decimal", new BigDecimal("1234.56"))) + .shouldRead(parquetSchema, rowGroupMetadata, bloomStore); Assert.assertFalse("Should not read: decimal outside range", shouldRead); } @Test public void testFixedDeciamlEq() { for (int i = 0; i < INT_VALUE_COUNT; i++) { - boolean shouldRead = new ParquetBloomRowGroupFilter(SCHEMA, - equal("fixed_decimal", new BigDecimal(String.valueOf(99.99 + i)))) - .shouldRead(parquetSchema, rowGroupMetadata, bloomStore); + boolean shouldRead = + new ParquetBloomRowGroupFilter( + SCHEMA, equal("fixed_decimal", new BigDecimal(String.valueOf(99.99 + i)))) + .shouldRead(parquetSchema, rowGroupMetadata, bloomStore); Assert.assertTrue("Should read: decimal within range", shouldRead); } - boolean shouldRead = new ParquetBloomRowGroupFilter(SCHEMA, - equal("fixed_decimal", new BigDecimal("1234.56"))) - .shouldRead(parquetSchema, rowGroupMetadata, bloomStore); + boolean shouldRead = + new ParquetBloomRowGroupFilter(SCHEMA, equal("fixed_decimal", new BigDecimal("1234.56"))) + .shouldRead(parquetSchema, rowGroupMetadata, bloomStore); Assert.assertFalse("Should not read: decimal outside range", shouldRead); } @Test public void testDoubleEq() { for (int i = INT_MIN_VALUE - 20; i < INT_MAX_VALUE + 20; i++) { - boolean shouldRead = new ParquetBloomRowGroupFilter(SCHEMA, equal("double", DOUBLE_BASE + i)) - .shouldRead(parquetSchema, rowGroupMetadata, bloomStore); + boolean shouldRead = + new ParquetBloomRowGroupFilter(SCHEMA, equal("double", DOUBLE_BASE + i)) + .shouldRead(parquetSchema, rowGroupMetadata, bloomStore); if (i >= INT_MIN_VALUE && i <= INT_MAX_VALUE) { Assert.assertTrue("Should read: double within range", shouldRead); } else { @@ -649,8 +739,9 @@ public void testDoubleEq() { @Test public void testFloatEq() { for (int i = INT_MIN_VALUE - 20; i < INT_MAX_VALUE + 20; i++) { - boolean shouldRead = new ParquetBloomRowGroupFilter(SCHEMA, equal("float", FLOAT_BASE + i)) - .shouldRead(parquetSchema, rowGroupMetadata, bloomStore); + boolean shouldRead = + new ParquetBloomRowGroupFilter(SCHEMA, equal("float", FLOAT_BASE + i)) + .shouldRead(parquetSchema, rowGroupMetadata, bloomStore); if (i >= INT_MIN_VALUE && i <= INT_MAX_VALUE) { Assert.assertTrue("Should read: float within range", shouldRead); } else { @@ -662,8 +753,9 @@ public void testFloatEq() { @Test public void testStringEq() { for (int i = INT_MIN_VALUE - 20; i < INT_MAX_VALUE + 20; i++) { - boolean shouldRead = new ParquetBloomRowGroupFilter(SCHEMA, equal("string", BINARY_PREFIX + i)) - .shouldRead(parquetSchema, rowGroupMetadata, bloomStore); + boolean shouldRead = + new ParquetBloomRowGroupFilter(SCHEMA, equal("string", BINARY_PREFIX + i)) + .shouldRead(parquetSchema, rowGroupMetadata, bloomStore); if (i >= INT_MIN_VALUE && i <= INT_MAX_VALUE) { Assert.assertTrue("Should read: 
string within range", shouldRead); } else { @@ -675,26 +767,30 @@ public void testStringEq() { @Test public void testRandomBinaryEq() { for (int i = 0; i < INT_VALUE_COUNT; i++) { - boolean shouldRead = new ParquetBloomRowGroupFilter(SCHEMA, equal("uuid", RANDOM_UUIDS.get(i))) - .shouldRead(parquetSchema, rowGroupMetadata, bloomStore); + boolean shouldRead = + new ParquetBloomRowGroupFilter(SCHEMA, equal("uuid", RANDOM_UUIDS.get(i))) + .shouldRead(parquetSchema, rowGroupMetadata, bloomStore); Assert.assertTrue("Should read: uuid within range", shouldRead); } Random rd = new Random(1357); - boolean shouldRead = new ParquetBloomRowGroupFilter(SCHEMA, - equal("uuid", new UUID(rd.nextLong(), rd.nextLong()).toString())) - .shouldRead(parquetSchema, rowGroupMetadata, bloomStore); + boolean shouldRead = + new ParquetBloomRowGroupFilter( + SCHEMA, equal("uuid", new UUID(rd.nextLong(), rd.nextLong()).toString())) + .shouldRead(parquetSchema, rowGroupMetadata, bloomStore); Assert.assertFalse("Should not read: cannot match a new generated random uuid", shouldRead); } @Test public void testBooleanEq() { - boolean shouldRead = new ParquetBloomRowGroupFilter(SCHEMA, equal("boolean", true)) - .shouldRead(parquetSchema, rowGroupMetadata, bloomStore); + boolean shouldRead = + new ParquetBloomRowGroupFilter(SCHEMA, equal("boolean", true)) + .shouldRead(parquetSchema, rowGroupMetadata, bloomStore); Assert.assertTrue("Should read: bloom filter is not supported for Boolean", shouldRead); - shouldRead = new ParquetBloomRowGroupFilter(SCHEMA, equal("boolean", false)) - .shouldRead(parquetSchema, rowGroupMetadata, bloomStore); + shouldRead = + new ParquetBloomRowGroupFilter(SCHEMA, equal("boolean", false)) + .shouldRead(parquetSchema, rowGroupMetadata, bloomStore); Assert.assertTrue("Should read: bloom filter is not supported for Boolean", shouldRead); } @@ -702,8 +798,9 @@ public void testBooleanEq() { public void testTimeEq() { for (int i = -20; i < INT_VALUE_COUNT + 20; i++) { Instant ins = instant.plusSeconds(i * 86400); - boolean shouldRead = new ParquetBloomRowGroupFilter(SCHEMA, equal("time", ins.toEpochMilli())) - .shouldRead(parquetSchema, rowGroupMetadata, bloomStore); + boolean shouldRead = + new ParquetBloomRowGroupFilter(SCHEMA, equal("time", ins.toEpochMilli())) + .shouldRead(parquetSchema, rowGroupMetadata, bloomStore); if (i >= 0 && i < INT_VALUE_COUNT) { Assert.assertTrue("Should read: time within range", shouldRead); } else { @@ -716,8 +813,9 @@ public void testTimeEq() { public void testDateEq() { for (int i = -20; i < INT_VALUE_COUNT + 20; i++) { Instant ins = instant.plusSeconds(i * 86400); - boolean shouldRead = new ParquetBloomRowGroupFilter(SCHEMA, equal("date", ins.getEpochSecond())) - .shouldRead(parquetSchema, rowGroupMetadata, bloomStore); + boolean shouldRead = + new ParquetBloomRowGroupFilter(SCHEMA, equal("date", ins.getEpochSecond())) + .shouldRead(parquetSchema, rowGroupMetadata, bloomStore); if (i >= 0 && i < INT_VALUE_COUNT) { Assert.assertTrue("Should read: date within range", shouldRead); } else { @@ -730,8 +828,9 @@ public void testDateEq() { public void testTimestampEq() { for (int i = -20; i < INT_VALUE_COUNT + 20; i++) { Instant ins = instant.plusSeconds(i * 86400); - boolean shouldRead = new ParquetBloomRowGroupFilter(SCHEMA, equal("timestamp", ins.toEpochMilli())) - .shouldRead(parquetSchema, rowGroupMetadata, bloomStore); + boolean shouldRead = + new ParquetBloomRowGroupFilter(SCHEMA, equal("timestamp", ins.toEpochMilli())) + .shouldRead(parquetSchema, rowGroupMetadata, 
bloomStore); if (i >= 0 && i < INT_VALUE_COUNT) { Assert.assertTrue("Should read: timestamp within range", shouldRead); } else { @@ -744,8 +843,9 @@ public void testTimestampEq() { public void testTimestamptzEq() { for (int i = -20; i < INT_VALUE_COUNT + 20; i++) { Instant ins = instant.plusSeconds(i * 86400); - boolean shouldRead = new ParquetBloomRowGroupFilter(SCHEMA, equal("timestamptz", ins.toEpochMilli())) - .shouldRead(parquetSchema, rowGroupMetadata, bloomStore); + boolean shouldRead = + new ParquetBloomRowGroupFilter(SCHEMA, equal("timestamptz", ins.toEpochMilli())) + .shouldRead(parquetSchema, rowGroupMetadata, bloomStore); if (i >= 0 && i < INT_VALUE_COUNT) { Assert.assertTrue("Should read: timestamptz within range", shouldRead); } else { @@ -757,8 +857,9 @@ public void testTimestamptzEq() { @Test public void testIntegerNotEq() { for (int i = INT_MIN_VALUE - 20; i < INT_MAX_VALUE + 20; i++) { - boolean shouldRead = new ParquetBloomRowGroupFilter(SCHEMA, notEqual("id", i)) - .shouldRead(parquetSchema, rowGroupMetadata, bloomStore); + boolean shouldRead = + new ParquetBloomRowGroupFilter(SCHEMA, notEqual("id", i)) + .shouldRead(parquetSchema, rowGroupMetadata, bloomStore); Assert.assertTrue("Should read: bloom filter doesn't help", shouldRead); } } @@ -766,28 +867,32 @@ public void testIntegerNotEq() { @Test public void testIntegerNotEqRewritten() { for (int i = INT_MIN_VALUE - 20; i < INT_MAX_VALUE + 20; i++) { - boolean shouldRead = new ParquetBloomRowGroupFilter(SCHEMA, not(equal("id", i))) - .shouldRead(parquetSchema, rowGroupMetadata, bloomStore); + boolean shouldRead = + new ParquetBloomRowGroupFilter(SCHEMA, not(equal("id", i))) + .shouldRead(parquetSchema, rowGroupMetadata, bloomStore); Assert.assertTrue("Should read: bloom filter doesn't help", shouldRead); } } @Test public void testStringNotEq() { - boolean shouldRead = new ParquetBloomRowGroupFilter(SCHEMA, notEqual("some_nulls", "some")) - .shouldRead(parquetSchema, rowGroupMetadata, bloomStore); + boolean shouldRead = + new ParquetBloomRowGroupFilter(SCHEMA, notEqual("some_nulls", "some")) + .shouldRead(parquetSchema, rowGroupMetadata, bloomStore); Assert.assertTrue("Should read: bloom filter doesn't help", shouldRead); - shouldRead = new ParquetBloomRowGroupFilter(SCHEMA, notEqual("no_nulls", "")) - .shouldRead(parquetSchema, rowGroupMetadata, bloomStore); + shouldRead = + new ParquetBloomRowGroupFilter(SCHEMA, notEqual("no_nulls", "")) + .shouldRead(parquetSchema, rowGroupMetadata, bloomStore); Assert.assertTrue("Should read: bloom filter doesn't help", shouldRead); } @Test public void testStructFieldLt() { for (int i = INT_MIN_VALUE - 20; i < INT_MAX_VALUE + 20; i++) { - boolean shouldRead = new ParquetBloomRowGroupFilter(SCHEMA, lessThan("struct_not_null.int_field", i)) - .shouldRead(parquetSchema, rowGroupMetadata, bloomStore); + boolean shouldRead = + new ParquetBloomRowGroupFilter(SCHEMA, lessThan("struct_not_null.int_field", i)) + .shouldRead(parquetSchema, rowGroupMetadata, bloomStore); Assert.assertTrue("Should read: bloom filter doesn't help", shouldRead); } } @@ -795,8 +900,9 @@ public void testStructFieldLt() { @Test public void testStructFieldLtEq() { for (int i = INT_MIN_VALUE - 20; i < INT_MAX_VALUE + 20; i++) { - boolean shouldRead = new ParquetBloomRowGroupFilter(SCHEMA, lessThanOrEqual("struct_not_null.int_field", i)) - .shouldRead(parquetSchema, rowGroupMetadata, bloomStore); + boolean shouldRead = + new ParquetBloomRowGroupFilter(SCHEMA, lessThanOrEqual("struct_not_null.int_field", i)) + 
.shouldRead(parquetSchema, rowGroupMetadata, bloomStore); Assert.assertTrue("Should read: bloom filter doesn't help", shouldRead); } } @@ -804,8 +910,9 @@ public void testStructFieldLtEq() { @Test public void testStructFieldGt() { for (int i = INT_MIN_VALUE - 20; i < INT_MAX_VALUE + 20; i++) { - boolean shouldRead = new ParquetBloomRowGroupFilter(SCHEMA, greaterThan("struct_not_null.int_field", i)) - .shouldRead(parquetSchema, rowGroupMetadata, bloomStore); + boolean shouldRead = + new ParquetBloomRowGroupFilter(SCHEMA, greaterThan("struct_not_null.int_field", i)) + .shouldRead(parquetSchema, rowGroupMetadata, bloomStore); Assert.assertTrue("Should read: bloom filter doesn't help", shouldRead); } } @@ -813,8 +920,9 @@ public void testStructFieldGt() { @Test public void testStructFieldGtEq() { for (int i = INT_MIN_VALUE - 20; i < INT_MAX_VALUE + 20; i++) { - boolean shouldRead = new ParquetBloomRowGroupFilter(SCHEMA, greaterThanOrEqual("struct_not_null.int_field", i)) - .shouldRead(parquetSchema, rowGroupMetadata, bloomStore); + boolean shouldRead = + new ParquetBloomRowGroupFilter(SCHEMA, greaterThanOrEqual("struct_not_null.int_field", i)) + .shouldRead(parquetSchema, rowGroupMetadata, bloomStore); Assert.assertTrue("Should read: bloom filter doesn't help", shouldRead); } } @@ -822,8 +930,9 @@ public void testStructFieldGtEq() { @Test public void testStructFieldEq() { for (int i = INT_MIN_VALUE - 20; i < INT_MAX_VALUE + 20; i++) { - boolean shouldRead = new ParquetBloomRowGroupFilter(SCHEMA, equal("struct_not_null.int_field", i)) - .shouldRead(parquetSchema, rowGroupMetadata, bloomStore); + boolean shouldRead = + new ParquetBloomRowGroupFilter(SCHEMA, equal("struct_not_null.int_field", i)) + .shouldRead(parquetSchema, rowGroupMetadata, bloomStore); if (i >= INT_MIN_VALUE && i <= INT_MAX_VALUE) { Assert.assertTrue("Should read: value within range", shouldRead); } else { @@ -835,26 +944,35 @@ public void testStructFieldEq() { @Test public void testStructFieldNotEq() { for (int i = INT_MIN_VALUE - 20; i < INT_MAX_VALUE + 20; i++) { - boolean shouldRead = new ParquetBloomRowGroupFilter(SCHEMA, notEqual("struct_not_null.int_field", i)) - .shouldRead(parquetSchema, rowGroupMetadata, bloomStore); + boolean shouldRead = + new ParquetBloomRowGroupFilter(SCHEMA, notEqual("struct_not_null.int_field", i)) + .shouldRead(parquetSchema, rowGroupMetadata, bloomStore); Assert.assertTrue("Should read: bloom filter doesn't help", shouldRead); } } @Test public void testCaseInsensitive() { - // the column name is required. If setting caseSentitive to true, ValidationException: Cannot find field 'Required' - boolean shouldRead = new ParquetBloomRowGroupFilter(SCHEMA, equal("Required", "Req"), false) - .shouldRead(parquetSchema, rowGroupMetadata, bloomStore); + // the column name is required. 
If setting caseSentitive to true, ValidationException: Cannot + // find field 'Required' + boolean shouldRead = + new ParquetBloomRowGroupFilter(SCHEMA, equal("Required", "Req"), false) + .shouldRead(parquetSchema, rowGroupMetadata, bloomStore); Assert.assertFalse("Should skip: contains only 'req'", shouldRead); } @Test public void testMissingBloomFilterForColumn() { - TestHelpers.assertThrows("Should complain about missing bloom filter", - IllegalStateException.class, "Failed to read required bloom filter for id: 10", - () -> new ParquetBloomRowGroupFilter(SCHEMA, equal("some_nulls", "some")) - .shouldRead(parquetSchema, rowGroupMetadata, new DummyBloomFilterReader(null, rowGroupMetadata))); + TestHelpers.assertThrows( + "Should complain about missing bloom filter", + IllegalStateException.class, + "Failed to read required bloom filter for id: 10", + () -> + new ParquetBloomRowGroupFilter(SCHEMA, equal("some_nulls", "some")) + .shouldRead( + parquetSchema, + rowGroupMetadata, + new DummyBloomFilterReader(null, rowGroupMetadata))); } private static class DummyBloomFilterReader extends BloomFilterReader { @@ -872,130 +990,177 @@ public BloomFilter readBloomFilter(ColumnChunkMetaData meta) { public void testIntegerIn() { // only one value is present for (int i = 0; i < INT_VALUE_COUNT; i++) { - boolean shouldRead = new ParquetBloomRowGroupFilter( - SCHEMA, - in("id", INT_MIN_VALUE - 3 * i, INT_MIN_VALUE + i, INT_MAX_VALUE + 3 * i) - ).shouldRead(parquetSchema, rowGroupMetadata, bloomStore); + boolean shouldRead = + new ParquetBloomRowGroupFilter( + SCHEMA, in("id", INT_MIN_VALUE - 3 * i, INT_MIN_VALUE + i, INT_MAX_VALUE + 3 * i)) + .shouldRead(parquetSchema, rowGroupMetadata, bloomStore); Assert.assertTrue("Should read: integer within range", shouldRead); } // all values are present - boolean shouldRead = new ParquetBloomRowGroupFilter( - SCHEMA, - in("id", IntStream.range(INT_MIN_VALUE - 10, INT_MAX_VALUE + 10).boxed().collect(Collectors.toList())) - ).shouldRead(parquetSchema, rowGroupMetadata, bloomStore); + boolean shouldRead = + new ParquetBloomRowGroupFilter( + SCHEMA, + in( + "id", + IntStream.range(INT_MIN_VALUE - 10, INT_MAX_VALUE + 10) + .boxed() + .collect(Collectors.toList()))) + .shouldRead(parquetSchema, rowGroupMetadata, bloomStore); Assert.assertTrue("Should read: the bloom is a subset of the in set", shouldRead); // all values are present - shouldRead = new ParquetBloomRowGroupFilter( - SCHEMA, - in("id", IntStream.range(INT_MIN_VALUE, INT_MAX_VALUE).boxed().collect(Collectors.toList())) - ).shouldRead(parquetSchema, rowGroupMetadata, bloomStore); + shouldRead = + new ParquetBloomRowGroupFilter( + SCHEMA, + in( + "id", + IntStream.range(INT_MIN_VALUE, INT_MAX_VALUE) + .boxed() + .collect(Collectors.toList()))) + .shouldRead(parquetSchema, rowGroupMetadata, bloomStore); Assert.assertTrue("Should read: the bloom is equal to the in set", shouldRead); // no values are present - shouldRead = new ParquetBloomRowGroupFilter( - SCHEMA, - in("id", IntStream.range(INT_MIN_VALUE - 10, INT_MIN_VALUE - 1).boxed().collect(Collectors.toList())) - ).shouldRead(parquetSchema, rowGroupMetadata, bloomStore); + shouldRead = + new ParquetBloomRowGroupFilter( + SCHEMA, + in( + "id", + IntStream.range(INT_MIN_VALUE - 10, INT_MIN_VALUE - 1) + .boxed() + .collect(Collectors.toList()))) + .shouldRead(parquetSchema, rowGroupMetadata, bloomStore); Assert.assertFalse("Should not read: value outside range", shouldRead); } @Test public void testOtherTypesIn() { - boolean shouldRead = new 
ParquetBloomRowGroupFilter(SCHEMA, in("all_nulls", 1, 2)) - .shouldRead(parquetSchema, rowGroupMetadata, bloomStore); + boolean shouldRead = + new ParquetBloomRowGroupFilter(SCHEMA, in("all_nulls", 1, 2)) + .shouldRead(parquetSchema, rowGroupMetadata, bloomStore); Assert.assertFalse("Should not read: in on all nulls column (bloom is empty) ", shouldRead); - shouldRead = new ParquetBloomRowGroupFilter(SCHEMA, in("some_nulls", "aaa", "some")) - .shouldRead(parquetSchema, rowGroupMetadata, bloomStore); + shouldRead = + new ParquetBloomRowGroupFilter(SCHEMA, in("some_nulls", "aaa", "some")) + .shouldRead(parquetSchema, rowGroupMetadata, bloomStore); Assert.assertTrue("Should read: in on some nulls column", shouldRead); - shouldRead = new ParquetBloomRowGroupFilter(SCHEMA, in("some_nulls", "aaa", "bbb")) - .shouldRead(parquetSchema, rowGroupMetadata, bloomStore); + shouldRead = + new ParquetBloomRowGroupFilter(SCHEMA, in("some_nulls", "aaa", "bbb")) + .shouldRead(parquetSchema, rowGroupMetadata, bloomStore); Assert.assertFalse("Should not read: some_nulls values are not within the set", shouldRead); - shouldRead = new ParquetBloomRowGroupFilter(SCHEMA, in("no_nulls", "aaa", "bbb")) - .shouldRead(parquetSchema, rowGroupMetadata, bloomStore); - Assert.assertFalse("Should not read: in on no nulls column (empty string is not within the set)", shouldRead); + shouldRead = + new ParquetBloomRowGroupFilter(SCHEMA, in("no_nulls", "aaa", "bbb")) + .shouldRead(parquetSchema, rowGroupMetadata, bloomStore); + Assert.assertFalse( + "Should not read: in on no nulls column (empty string is not within the set)", shouldRead); - shouldRead = new ParquetBloomRowGroupFilter(SCHEMA, in("no_nulls", "aaa", "")) - .shouldRead(parquetSchema, rowGroupMetadata, bloomStore); - Assert.assertTrue("Should read: in on no nulls column (empty string is within the set)", shouldRead); + shouldRead = + new ParquetBloomRowGroupFilter(SCHEMA, in("no_nulls", "aaa", "")) + .shouldRead(parquetSchema, rowGroupMetadata, bloomStore); + Assert.assertTrue( + "Should read: in on no nulls column (empty string is within the set)", shouldRead); } @Test public void testIntegerNotIn() { // only one value is present for (int i = 0; i < INT_VALUE_COUNT; i++) { - boolean shouldRead = new ParquetBloomRowGroupFilter( - SCHEMA, - notIn("id", INT_MIN_VALUE - 3 * i, INT_MIN_VALUE + i, INT_MAX_VALUE + 3 * i)) - .shouldRead(parquetSchema, rowGroupMetadata, bloomStore); + boolean shouldRead = + new ParquetBloomRowGroupFilter( + SCHEMA, + notIn("id", INT_MIN_VALUE - 3 * i, INT_MIN_VALUE + i, INT_MAX_VALUE + 3 * i)) + .shouldRead(parquetSchema, rowGroupMetadata, bloomStore); Assert.assertTrue("Should read: bloom filter doesn't help", shouldRead); } // all values are present - boolean shouldRead = new ParquetBloomRowGroupFilter( - SCHEMA, - notIn("id", IntStream.range(INT_MIN_VALUE - 10, INT_MAX_VALUE + 10).boxed().collect(Collectors.toList())) - ).shouldRead(parquetSchema, rowGroupMetadata, bloomStore); + boolean shouldRead = + new ParquetBloomRowGroupFilter( + SCHEMA, + notIn( + "id", + IntStream.range(INT_MIN_VALUE - 10, INT_MAX_VALUE + 10) + .boxed() + .collect(Collectors.toList()))) + .shouldRead(parquetSchema, rowGroupMetadata, bloomStore); Assert.assertTrue("Should read: bloom filter doesn't help", shouldRead); // all values are present - shouldRead = new ParquetBloomRowGroupFilter( - SCHEMA, - notIn("id", IntStream.range(INT_MIN_VALUE, INT_MAX_VALUE).boxed().collect(Collectors.toList())) - ).shouldRead(parquetSchema, rowGroupMetadata, bloomStore); + 
shouldRead = + new ParquetBloomRowGroupFilter( + SCHEMA, + notIn( + "id", + IntStream.range(INT_MIN_VALUE, INT_MAX_VALUE) + .boxed() + .collect(Collectors.toList()))) + .shouldRead(parquetSchema, rowGroupMetadata, bloomStore); Assert.assertTrue("Should read: bloom filter doesn't help", shouldRead); // no values are present - shouldRead = new ParquetBloomRowGroupFilter( - SCHEMA, - notIn("id", IntStream.range(INT_MIN_VALUE - 10, INT_MIN_VALUE - 1).boxed().collect(Collectors.toList())) - ).shouldRead(parquetSchema, rowGroupMetadata, bloomStore); + shouldRead = + new ParquetBloomRowGroupFilter( + SCHEMA, + notIn( + "id", + IntStream.range(INT_MIN_VALUE - 10, INT_MIN_VALUE - 1) + .boxed() + .collect(Collectors.toList()))) + .shouldRead(parquetSchema, rowGroupMetadata, bloomStore); Assert.assertTrue("Should read: bloom filter doesn't help", shouldRead); } @Test public void testOtherTypesNotIn() { - boolean shouldRead = new ParquetBloomRowGroupFilter(SCHEMA, notIn("all_nulls", 1, 2)) - .shouldRead(parquetSchema, rowGroupMetadata, bloomStore); + boolean shouldRead = + new ParquetBloomRowGroupFilter(SCHEMA, notIn("all_nulls", 1, 2)) + .shouldRead(parquetSchema, rowGroupMetadata, bloomStore); Assert.assertTrue("Should read: bloom filter doesn't help", shouldRead); - shouldRead = new ParquetBloomRowGroupFilter(SCHEMA, notIn("some_nulls", "aaa", "bbb")) - .shouldRead(parquetSchema, rowGroupMetadata, bloomStore); + shouldRead = + new ParquetBloomRowGroupFilter(SCHEMA, notIn("some_nulls", "aaa", "bbb")) + .shouldRead(parquetSchema, rowGroupMetadata, bloomStore); Assert.assertTrue("Should read: bloom filter doesn't help", shouldRead); - shouldRead = new ParquetBloomRowGroupFilter(SCHEMA, notIn("no_nulls", "aaa", "bbb")) - .shouldRead(parquetSchema, rowGroupMetadata, bloomStore); + shouldRead = + new ParquetBloomRowGroupFilter(SCHEMA, notIn("no_nulls", "aaa", "bbb")) + .shouldRead(parquetSchema, rowGroupMetadata, bloomStore); Assert.assertTrue("Should read: bloom filter doesn't help", shouldRead); - shouldRead = new ParquetBloomRowGroupFilter(SCHEMA, notIn("no_nulls", "aaa", "")) - .shouldRead(parquetSchema, rowGroupMetadata, bloomStore); + shouldRead = + new ParquetBloomRowGroupFilter(SCHEMA, notIn("no_nulls", "aaa", "")) + .shouldRead(parquetSchema, rowGroupMetadata, bloomStore); Assert.assertTrue("Should read: bloom filter doesn't help", shouldRead); } @Test public void testTypeConversions() { - boolean shouldRead = new ParquetBloomRowGroupFilter(SCHEMA, equal("long", LONG_BASE + INT_MIN_VALUE + 1), true) - .shouldRead(parquetSchema, rowGroupMetadata, bloomStore); + boolean shouldRead = + new ParquetBloomRowGroupFilter(SCHEMA, equal("long", LONG_BASE + INT_MIN_VALUE + 1), true) + .shouldRead(parquetSchema, rowGroupMetadata, bloomStore); Assert.assertTrue("Should read: Integer value promoted", shouldRead); - shouldRead = new ParquetBloomRowGroupFilter(SCHEMA, equal("long", LONG_BASE + INT_MIN_VALUE - 1), true) - .shouldRead(parquetSchema, rowGroupMetadata, bloomStore); + shouldRead = + new ParquetBloomRowGroupFilter(SCHEMA, equal("long", LONG_BASE + INT_MIN_VALUE - 1), true) + .shouldRead(parquetSchema, rowGroupMetadata, bloomStore); Assert.assertFalse("Should not read: Integer value promoted", shouldRead); - shouldRead = new ParquetBloomRowGroupFilter(SCHEMA, equal("id", (long) (INT_MIN_VALUE + 1)), true) - .shouldRead(parquetSchema, rowGroupMetadata, bloomStore); + shouldRead = + new ParquetBloomRowGroupFilter(SCHEMA, equal("id", (long) (INT_MIN_VALUE + 1)), true) + .shouldRead(parquetSchema, 
rowGroupMetadata, bloomStore); Assert.assertTrue("Should read: Long value truncated", shouldRead); - shouldRead = new ParquetBloomRowGroupFilter(SCHEMA, equal("id", (long) (INT_MIN_VALUE - 1)), true) - .shouldRead(parquetSchema, rowGroupMetadata, bloomStore); + shouldRead = + new ParquetBloomRowGroupFilter(SCHEMA, equal("id", (long) (INT_MIN_VALUE - 1)), true) + .shouldRead(parquetSchema, rowGroupMetadata, bloomStore); Assert.assertFalse("Should not read: Long value truncated", shouldRead); - shouldRead = new ParquetBloomRowGroupFilter(SCHEMA, equal("id", ((long) Integer.MAX_VALUE) + 1), true) - .shouldRead(parquetSchema, rowGroupMetadata, bloomStore); + shouldRead = + new ParquetBloomRowGroupFilter(SCHEMA, equal("id", ((long) Integer.MAX_VALUE) + 1), true) + .shouldRead(parquetSchema, rowGroupMetadata, bloomStore); Assert.assertFalse("Should not read: Long value outside Integer range", shouldRead); } } diff --git a/parquet/src/test/java/org/apache/iceberg/parquet/TestCDHParquetStatistics.java b/parquet/src/test/java/org/apache/iceberg/parquet/TestCDHParquetStatistics.java index 6484e7b75953..1ceca74d215c 100644 --- a/parquet/src/test/java/org/apache/iceberg/parquet/TestCDHParquetStatistics.java +++ b/parquet/src/test/java/org/apache/iceberg/parquet/TestCDHParquetStatistics.java @@ -16,19 +16,18 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.parquet; +import static org.mockito.Mockito.mock; +import static org.mockito.Mockito.when; + import org.apache.parquet.column.statistics.Statistics; import org.junit.Assert; import org.junit.Test; -import static org.mockito.Mockito.mock; -import static org.mockito.Mockito.when; - /** - * Tests for Parquet 1.5.0-Stats which cannot be evaluated like later versions of Parquet stats. They are intercepted - * by the hasNonNullButNoMinMax function which always returns ROWS_MAY_MATCH + * Tests for Parquet 1.5.0-Stats which cannot be evaluated like later versions of Parquet stats. 
+ * They are intercepted by the hasNonNullButNoMinMax function which always returns ROWS_MAY_MATCH */ public class TestCDHParquetStatistics { @@ -38,7 +37,8 @@ public void testCDHParquetStatistcs() { when(cdhBinaryColumnStats.getMaxBytes()).thenReturn(null); when(cdhBinaryColumnStats.getMinBytes()).thenReturn(null); when(cdhBinaryColumnStats.getNumNulls()).thenReturn(0L); - Assert.assertTrue(ParquetMetricsRowGroupFilter.hasNonNullButNoMinMax(cdhBinaryColumnStats, 50L)); + Assert.assertTrue( + ParquetMetricsRowGroupFilter.hasNonNullButNoMinMax(cdhBinaryColumnStats, 50L)); } @Test @@ -47,7 +47,8 @@ public void testCDHParquetStatisticsNullNotSet() { when(cdhBinaryColumnStats.getMaxBytes()).thenReturn(null); when(cdhBinaryColumnStats.getMinBytes()).thenReturn(null); when(cdhBinaryColumnStats.getNumNulls()).thenReturn(-1L); - Assert.assertTrue(ParquetMetricsRowGroupFilter.hasNonNullButNoMinMax(cdhBinaryColumnStats, 50L)); + Assert.assertTrue( + ParquetMetricsRowGroupFilter.hasNonNullButNoMinMax(cdhBinaryColumnStats, 50L)); } @Test @@ -56,7 +57,8 @@ public void testCDHParquetStatistcsAllNull() { when(cdhBinaryColumnStats.getMaxBytes()).thenReturn(null); when(cdhBinaryColumnStats.getMinBytes()).thenReturn(null); when(cdhBinaryColumnStats.getNumNulls()).thenReturn(50L); - Assert.assertFalse(ParquetMetricsRowGroupFilter.hasNonNullButNoMinMax(cdhBinaryColumnStats, 50L)); + Assert.assertFalse( + ParquetMetricsRowGroupFilter.hasNonNullButNoMinMax(cdhBinaryColumnStats, 50L)); } @Test @@ -65,7 +67,8 @@ public void testNonCDHParquetStatistics() { when(normalBinaryColumnStats.getMaxBytes()).thenReturn(new byte[2]); when(normalBinaryColumnStats.getMinBytes()).thenReturn(new byte[2]); when(normalBinaryColumnStats.getNumNulls()).thenReturn(0L); - Assert.assertFalse(ParquetMetricsRowGroupFilter.hasNonNullButNoMinMax(normalBinaryColumnStats, 50L)); + Assert.assertFalse( + ParquetMetricsRowGroupFilter.hasNonNullButNoMinMax(normalBinaryColumnStats, 50L)); } @Test @@ -74,7 +77,8 @@ public void testNonCDHParquetStatisticsNullNotSet() { when(normalBinaryColumnStats.getMaxBytes()).thenReturn(new byte[2]); when(normalBinaryColumnStats.getMinBytes()).thenReturn(new byte[2]); when(normalBinaryColumnStats.getNumNulls()).thenReturn(-1L); - Assert.assertFalse(ParquetMetricsRowGroupFilter.hasNonNullButNoMinMax(normalBinaryColumnStats, 50L)); + Assert.assertFalse( + ParquetMetricsRowGroupFilter.hasNonNullButNoMinMax(normalBinaryColumnStats, 50L)); } @Test @@ -83,6 +87,7 @@ public void testNonCDHParquetStatisticsAllNull() { when(normalBinaryColumnStats.getMaxBytes()).thenReturn(new byte[2]); when(normalBinaryColumnStats.getMinBytes()).thenReturn(new byte[2]); when(normalBinaryColumnStats.getNumNulls()).thenReturn(50L); - Assert.assertFalse(ParquetMetricsRowGroupFilter.hasNonNullButNoMinMax(normalBinaryColumnStats, 50L)); + Assert.assertFalse( + ParquetMetricsRowGroupFilter.hasNonNullButNoMinMax(normalBinaryColumnStats, 50L)); } } diff --git a/parquet/src/test/java/org/apache/iceberg/parquet/TestDictionaryRowGroupFilter.java b/parquet/src/test/java/org/apache/iceberg/parquet/TestDictionaryRowGroupFilter.java index 178a258e4ff6..f2e086396514 100644 --- a/parquet/src/test/java/org/apache/iceberg/parquet/TestDictionaryRowGroupFilter.java +++ b/parquet/src/test/java/org/apache/iceberg/parquet/TestDictionaryRowGroupFilter.java @@ -16,9 +16,31 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.parquet; +import static org.apache.iceberg.avro.AvroSchemaUtil.convert; +import static org.apache.iceberg.expressions.Expressions.and; +import static org.apache.iceberg.expressions.Expressions.equal; +import static org.apache.iceberg.expressions.Expressions.greaterThan; +import static org.apache.iceberg.expressions.Expressions.greaterThanOrEqual; +import static org.apache.iceberg.expressions.Expressions.in; +import static org.apache.iceberg.expressions.Expressions.isNaN; +import static org.apache.iceberg.expressions.Expressions.isNull; +import static org.apache.iceberg.expressions.Expressions.lessThan; +import static org.apache.iceberg.expressions.Expressions.lessThanOrEqual; +import static org.apache.iceberg.expressions.Expressions.not; +import static org.apache.iceberg.expressions.Expressions.notEqual; +import static org.apache.iceberg.expressions.Expressions.notIn; +import static org.apache.iceberg.expressions.Expressions.notNaN; +import static org.apache.iceberg.expressions.Expressions.notNull; +import static org.apache.iceberg.expressions.Expressions.notStartsWith; +import static org.apache.iceberg.expressions.Expressions.or; +import static org.apache.iceberg.expressions.Expressions.startsWith; +import static org.apache.iceberg.types.Types.NestedField.optional; +import static org.apache.iceberg.types.Types.NestedField.required; +import static org.apache.parquet.column.ParquetProperties.WriterVersion.PARQUET_1_0; +import static org.apache.parquet.column.ParquetProperties.WriterVersion.PARQUET_2_0; + import java.io.File; import java.io.IOException; import java.math.BigDecimal; @@ -64,68 +86,53 @@ import org.junit.runner.RunWith; import org.junit.runners.Parameterized; -import static org.apache.iceberg.avro.AvroSchemaUtil.convert; -import static org.apache.iceberg.expressions.Expressions.and; -import static org.apache.iceberg.expressions.Expressions.equal; -import static org.apache.iceberg.expressions.Expressions.greaterThan; -import static org.apache.iceberg.expressions.Expressions.greaterThanOrEqual; -import static org.apache.iceberg.expressions.Expressions.in; -import static org.apache.iceberg.expressions.Expressions.isNaN; -import static org.apache.iceberg.expressions.Expressions.isNull; -import static org.apache.iceberg.expressions.Expressions.lessThan; -import static org.apache.iceberg.expressions.Expressions.lessThanOrEqual; -import static org.apache.iceberg.expressions.Expressions.not; -import static org.apache.iceberg.expressions.Expressions.notEqual; -import static org.apache.iceberg.expressions.Expressions.notIn; -import static org.apache.iceberg.expressions.Expressions.notNaN; -import static org.apache.iceberg.expressions.Expressions.notNull; -import static org.apache.iceberg.expressions.Expressions.notStartsWith; -import static org.apache.iceberg.expressions.Expressions.or; -import static org.apache.iceberg.expressions.Expressions.startsWith; -import static org.apache.iceberg.types.Types.NestedField.optional; -import static org.apache.iceberg.types.Types.NestedField.required; -import static org.apache.parquet.column.ParquetProperties.WriterVersion.PARQUET_1_0; -import static org.apache.parquet.column.ParquetProperties.WriterVersion.PARQUET_2_0; - @RunWith(Parameterized.class) public class TestDictionaryRowGroupFilter { private static final Types.StructType structFieldType = - Types.StructType.of(Types.NestedField.required(9, "int_field", IntegerType.get())); - - private static final Schema SCHEMA = new Schema( - required(1, "id", 
IntegerType.get()), - optional(2, "no_stats", StringType.get()), - required(3, "required", StringType.get()), - optional(4, "all_nulls", LongType.get()), - optional(5, "some_nulls", StringType.get()), - optional(6, "no_nulls", StringType.get()), - optional(7, "non_dict", StringType.get()), - optional(8, "struct_not_null", structFieldType), - optional(10, "not_in_file", FloatType.get()), - optional(11, "all_nans", DoubleType.get()), - optional(12, "some_nans", FloatType.get()), - optional(13, "no_nans", DoubleType.get()), - optional(14, "decimal_fixed", DecimalType.of(20, 10)) // >18 precision to enforce FIXED_LEN_BYTE_ARRAY - ); + Types.StructType.of(Types.NestedField.required(9, "int_field", IntegerType.get())); + + private static final Schema SCHEMA = + new Schema( + required(1, "id", IntegerType.get()), + optional(2, "no_stats", StringType.get()), + required(3, "required", StringType.get()), + optional(4, "all_nulls", LongType.get()), + optional(5, "some_nulls", StringType.get()), + optional(6, "no_nulls", StringType.get()), + optional(7, "non_dict", StringType.get()), + optional(8, "struct_not_null", structFieldType), + optional(10, "not_in_file", FloatType.get()), + optional(11, "all_nans", DoubleType.get()), + optional(12, "some_nans", FloatType.get()), + optional(13, "no_nans", DoubleType.get()), + optional( + 14, + "decimal_fixed", + DecimalType.of(20, 10)) // >18 precision to enforce FIXED_LEN_BYTE_ARRAY + ); private static final Types.StructType _structFieldType = - Types.StructType.of(Types.NestedField.required(9, "_int_field", IntegerType.get())); - - private static final Schema FILE_SCHEMA = new Schema( - required(1, "_id", IntegerType.get()), - optional(2, "_no_stats", StringType.get()), - required(3, "_required", StringType.get()), - optional(4, "_all_nulls", LongType.get()), - optional(5, "_some_nulls", StringType.get()), - optional(6, "_no_nulls", StringType.get()), - optional(7, "_non_dict", StringType.get()), - optional(8, "_struct_not_null", _structFieldType), - optional(11, "_all_nans", DoubleType.get()), - optional(12, "_some_nans", FloatType.get()), - optional(13, "_no_nans", DoubleType.get()), - optional(14, "_decimal_fixed", DecimalType.of(20, 10)) // >18 precision to enforce FIXED_LEN_BYTE_ARRAY - ); + Types.StructType.of(Types.NestedField.required(9, "_int_field", IntegerType.get())); + + private static final Schema FILE_SCHEMA = + new Schema( + required(1, "_id", IntegerType.get()), + optional(2, "_no_stats", StringType.get()), + required(3, "_required", StringType.get()), + optional(4, "_all_nulls", LongType.get()), + optional(5, "_some_nulls", StringType.get()), + optional(6, "_no_nulls", StringType.get()), + optional(7, "_non_dict", StringType.get()), + optional(8, "_struct_not_null", _structFieldType), + optional(11, "_all_nans", DoubleType.get()), + optional(12, "_some_nans", FloatType.get()), + optional(13, "_no_nans", DoubleType.get()), + optional( + 14, + "_decimal_fixed", + DecimalType.of(20, 10)) // >18 precision to enforce FIXED_LEN_BYTE_ARRAY + ); private static final String TOO_LONG_FOR_STATS; @@ -140,16 +147,17 @@ public class TestDictionaryRowGroupFilter { private static final int INT_MIN_VALUE = 30; private static final int INT_MAX_VALUE = 79; private static final BigDecimal DECIMAL_MIN_VALUE = new BigDecimal("-1234567890.0987654321"); - private static final BigDecimal DECIMAL_STEP = new BigDecimal("1234567890.0987654321").subtract(DECIMAL_MIN_VALUE) - .divide(new BigDecimal(INT_MAX_VALUE - INT_MIN_VALUE), RoundingMode.HALF_UP); + private static 
final BigDecimal DECIMAL_STEP = + new BigDecimal("1234567890.0987654321") + .subtract(DECIMAL_MIN_VALUE) + .divide(new BigDecimal(INT_MAX_VALUE - INT_MIN_VALUE), RoundingMode.HALF_UP); private MessageType parquetSchema = null; private BlockMetaData rowGroupMetadata = null; private DictionaryPageReadStore dictionaryStore = null; private final WriterVersion writerVersion; - @Rule - public TemporaryFolder temp = new TemporaryFolder(); + @Rule public TemporaryFolder temp = new TemporaryFolder(); @Parameterized.Parameters public static List writerVersions() { @@ -169,17 +177,16 @@ public void createInputFile() throws IOException { org.apache.avro.Schema structSchema = AvroSchemaUtil.convert(_structFieldType); OutputFile outFile = Files.localOutput(parquetFile); - try (FileAppender appender = Parquet.write(outFile) - .schema(FILE_SCHEMA) - .withWriterVersion(writerVersion) - .build()) { + try (FileAppender appender = + Parquet.write(outFile).schema(FILE_SCHEMA).withWriterVersion(writerVersion).build()) { GenericRecordBuilder builder = new GenericRecordBuilder(convert(FILE_SCHEMA, "table")); // create 20 copies of each record to ensure dictionary-encoding for (int copy = 0; copy < 20; copy += 1) { // create 50 records for (int i = 0; i < INT_MAX_VALUE - INT_MIN_VALUE + 1; i += 1) { builder.set("_id", INT_MIN_VALUE + i); // min=30, max=79, num-nulls=0 - builder.set("_no_stats", TOO_LONG_FOR_STATS); // value longer than 4k will produce no stats + builder.set( + "_no_stats", TOO_LONG_FOR_STATS); // value longer than 4k will produce no stats builder.set("_required", "req"); // required, always non-null builder.set("_all_nulls", null); // never non-null builder.set("_some_nulls", (i % 10 == 0) ? null : "some"); // includes some null values @@ -189,8 +196,10 @@ public void createInputFile() throws IOException { builder.set("_some_nans", (i % 10 == 0) ? Float.NaN : 2F); // includes some nan values builder.set("_no_nans", 3D); // optional, but always non-nan - // min=-1234567890.0987654321, max~=1234567890.0987654321 (depending on rounding), num-nulls=0 - builder.set("_decimal_fixed", DECIMAL_MIN_VALUE.add(DECIMAL_STEP.multiply(new BigDecimal(i)))); + // min=-1234567890.0987654321, max~=1234567890.0987654321 (depending on rounding), + // num-nulls=0 + builder.set( + "_decimal_fixed", DECIMAL_MIN_VALUE.add(DECIMAL_STEP.multiply(new BigDecimal(i)))); Record structNotNull = new Record(structSchema); structNotNull.put("_int_field", INT_MIN_VALUE + i); @@ -214,35 +223,43 @@ public void createInputFile() throws IOException { @Test public void testAssumptions() { // this case validates that other cases don't need to test expressions with null literals. 
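The dictionary-based checks that follow rely on every page of a column being dictionary-encoded, which is why the fixture above writes 20 copies of each record. When the complete dictionary for a row group is available, predicates such as startsWith can be evaluated against the full set of distinct values, something the bloom filter tests earlier in this patch cannot do. A minimal sketch of that call path is below; DictionaryPruneSketch and mayMatchPrefix are hypothetical helpers, and the MessageType, BlockMetaData, and DictionaryPageReadStore arguments stand in for objects a reader would obtain from ParquetFileReader.

import org.apache.iceberg.Schema;
import org.apache.iceberg.expressions.Expressions;
import org.apache.iceberg.parquet.ParquetDictionaryRowGroupFilter;
import org.apache.parquet.column.page.DictionaryPageReadStore;
import org.apache.parquet.hadoop.metadata.BlockMetaData;
import org.apache.parquet.schema.MessageType;

class DictionaryPruneSketch {
  // Hypothetical helper: false means the row group can be skipped because the
  // column was fully dictionary-encoded and no dictionary entry starts with
  // the given prefix; true means the row group must still be read.
  static boolean mayMatchPrefix(
      Schema icebergSchema,
      MessageType fileSchema,
      BlockMetaData rowGroup,
      DictionaryPageReadStore dictionaries,
      String prefix) {
    return new ParquetDictionaryRowGroupFilter(
            icebergSchema, Expressions.startsWith("some_nulls", prefix))
        .shouldRead(fileSchema, rowGroup, dictionaries);
  }
}

This mirrors the assertions in the hunks that follow: columns without a dictionary (such as non_dict) always read, while a complete dictionary with no matching entry lets the filter skip the row group.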
- TestHelpers.assertThrows("Should reject null literal in equal expression", + TestHelpers.assertThrows( + "Should reject null literal in equal expression", NullPointerException.class, "Cannot create expression literal from null", () -> equal("col", null)); - TestHelpers.assertThrows("Should reject null literal in notEqual expression", + TestHelpers.assertThrows( + "Should reject null literal in notEqual expression", NullPointerException.class, "Cannot create expression literal from null", () -> notEqual("col", null)); - TestHelpers.assertThrows("Should reject null literal in lessThan expression", + TestHelpers.assertThrows( + "Should reject null literal in lessThan expression", NullPointerException.class, "Cannot create expression literal from null", () -> lessThan("col", null)); - TestHelpers.assertThrows("Should reject null literal in lessThanOrEqual expression", + TestHelpers.assertThrows( + "Should reject null literal in lessThanOrEqual expression", NullPointerException.class, "Cannot create expression literal from null", () -> lessThanOrEqual("col", null)); - TestHelpers.assertThrows("Should reject null literal in greaterThan expression", + TestHelpers.assertThrows( + "Should reject null literal in greaterThan expression", NullPointerException.class, "Cannot create expression literal from null", () -> greaterThan("col", null)); - TestHelpers.assertThrows("Should reject null literal in greaterThanOrEqual expression", + TestHelpers.assertThrows( + "Should reject null literal in greaterThanOrEqual expression", NullPointerException.class, "Cannot create expression literal from null", () -> greaterThanOrEqual("col", null)); - TestHelpers.assertThrows("Should reject null literal in startsWith expression", + TestHelpers.assertThrows( + "Should reject null literal in startsWith expression", NullPointerException.class, "Cannot create expression literal from null", () -> startsWith("col", null)); - TestHelpers.assertThrows("Should reject null literal in notStartsWith expression", + TestHelpers.assertThrows( + "Should reject null literal in notStartsWith expression", NullPointerException.class, "Cannot create expression literal from null", () -> notStartsWith("col", null)); @@ -250,744 +267,967 @@ public void testAssumptions() { @Test public void testAllNulls() { - boolean shouldRead = new ParquetDictionaryRowGroupFilter(SCHEMA, notNull("all_nulls")) - .shouldRead(parquetSchema, rowGroupMetadata, dictionaryStore); + boolean shouldRead = + new ParquetDictionaryRowGroupFilter(SCHEMA, notNull("all_nulls")) + .shouldRead(parquetSchema, rowGroupMetadata, dictionaryStore); Assert.assertTrue("Should read: dictionary filter doesn't help", shouldRead); - shouldRead = new ParquetDictionaryRowGroupFilter(SCHEMA, notNull("some_nulls")) - .shouldRead(parquetSchema, rowGroupMetadata, dictionaryStore); + shouldRead = + new ParquetDictionaryRowGroupFilter(SCHEMA, notNull("some_nulls")) + .shouldRead(parquetSchema, rowGroupMetadata, dictionaryStore); Assert.assertTrue("Should read: dictionary filter doesn't help", shouldRead); - shouldRead = new ParquetDictionaryRowGroupFilter(SCHEMA, notNull("no_nulls")) - .shouldRead(parquetSchema, rowGroupMetadata, dictionaryStore); + shouldRead = + new ParquetDictionaryRowGroupFilter(SCHEMA, notNull("no_nulls")) + .shouldRead(parquetSchema, rowGroupMetadata, dictionaryStore); Assert.assertTrue("Should read: dictionary filter doesn't help", shouldRead); - shouldRead = new ParquetDictionaryRowGroupFilter(SCHEMA, notNull("struct_not_null")) - .shouldRead(parquetSchema, 
rowGroupMetadata, dictionaryStore); + shouldRead = + new ParquetDictionaryRowGroupFilter(SCHEMA, notNull("struct_not_null")) + .shouldRead(parquetSchema, rowGroupMetadata, dictionaryStore); Assert.assertTrue("Should read: dictionary filter doesn't help", shouldRead); } @Test public void testNoNulls() { - boolean shouldRead = new ParquetDictionaryRowGroupFilter(SCHEMA, isNull("all_nulls")) - .shouldRead(parquetSchema, rowGroupMetadata, dictionaryStore); + boolean shouldRead = + new ParquetDictionaryRowGroupFilter(SCHEMA, isNull("all_nulls")) + .shouldRead(parquetSchema, rowGroupMetadata, dictionaryStore); Assert.assertTrue("Should read: dictionary filter doesn't help", shouldRead); - shouldRead = new ParquetDictionaryRowGroupFilter(SCHEMA, isNull("some_nulls")) - .shouldRead(parquetSchema, rowGroupMetadata, dictionaryStore); + shouldRead = + new ParquetDictionaryRowGroupFilter(SCHEMA, isNull("some_nulls")) + .shouldRead(parquetSchema, rowGroupMetadata, dictionaryStore); Assert.assertTrue("Should read: dictionary filter doesn't help", shouldRead); - shouldRead = new ParquetDictionaryRowGroupFilter(SCHEMA, isNull("no_nulls")) - .shouldRead(parquetSchema, rowGroupMetadata, dictionaryStore); + shouldRead = + new ParquetDictionaryRowGroupFilter(SCHEMA, isNull("no_nulls")) + .shouldRead(parquetSchema, rowGroupMetadata, dictionaryStore); Assert.assertTrue("Should read: dictionary filter doesn't help", shouldRead); - shouldRead = new ParquetDictionaryRowGroupFilter(SCHEMA, isNull("struct_not_null")) + shouldRead = + new ParquetDictionaryRowGroupFilter(SCHEMA, isNull("struct_not_null")) .shouldRead(parquetSchema, rowGroupMetadata, dictionaryStore); Assert.assertTrue("Should read: dictionary filter doesn't help", shouldRead); } @Test public void testRequiredColumn() { - boolean shouldRead = new ParquetDictionaryRowGroupFilter(SCHEMA, notNull("required")) - .shouldRead(parquetSchema, rowGroupMetadata, dictionaryStore); + boolean shouldRead = + new ParquetDictionaryRowGroupFilter(SCHEMA, notNull("required")) + .shouldRead(parquetSchema, rowGroupMetadata, dictionaryStore); Assert.assertTrue("Should read: required columns are always non-null", shouldRead); - shouldRead = new ParquetDictionaryRowGroupFilter(SCHEMA, isNull("required")) - .shouldRead(parquetSchema, rowGroupMetadata, dictionaryStore); + shouldRead = + new ParquetDictionaryRowGroupFilter(SCHEMA, isNull("required")) + .shouldRead(parquetSchema, rowGroupMetadata, dictionaryStore); Assert.assertFalse("Should skip: required columns are always non-null", shouldRead); } @Test public void testIsNaNs() { - boolean shouldRead = new ParquetDictionaryRowGroupFilter(SCHEMA, isNaN("all_nans")) - .shouldRead(parquetSchema, rowGroupMetadata, dictionaryStore); + boolean shouldRead = + new ParquetDictionaryRowGroupFilter(SCHEMA, isNaN("all_nans")) + .shouldRead(parquetSchema, rowGroupMetadata, dictionaryStore); Assert.assertTrue("Should read: all_nans column will contain NaN", shouldRead); - shouldRead = new ParquetDictionaryRowGroupFilter(SCHEMA, isNaN("some_nans")) - .shouldRead(parquetSchema, rowGroupMetadata, dictionaryStore); + shouldRead = + new ParquetDictionaryRowGroupFilter(SCHEMA, isNaN("some_nans")) + .shouldRead(parquetSchema, rowGroupMetadata, dictionaryStore); Assert.assertTrue("Should read: some_nans column will contain NaN", shouldRead); - shouldRead = new ParquetDictionaryRowGroupFilter(SCHEMA, isNaN("no_nans")) - .shouldRead(parquetSchema, rowGroupMetadata, dictionaryStore); + shouldRead = + new ParquetDictionaryRowGroupFilter(SCHEMA, 
isNaN("no_nans")) + .shouldRead(parquetSchema, rowGroupMetadata, dictionaryStore); Assert.assertFalse("Should skip: no_nans column will not contain NaN", shouldRead); } @Test public void testNotNaNs() { - boolean shouldRead = new ParquetDictionaryRowGroupFilter(SCHEMA, notNaN("all_nans")) - .shouldRead(parquetSchema, rowGroupMetadata, dictionaryStore); + boolean shouldRead = + new ParquetDictionaryRowGroupFilter(SCHEMA, notNaN("all_nans")) + .shouldRead(parquetSchema, rowGroupMetadata, dictionaryStore); Assert.assertFalse("Should skip: all_nans column will not contain non-NaN", shouldRead); - shouldRead = new ParquetDictionaryRowGroupFilter(SCHEMA, notNaN("some_nans")) - .shouldRead(parquetSchema, rowGroupMetadata, dictionaryStore); + shouldRead = + new ParquetDictionaryRowGroupFilter(SCHEMA, notNaN("some_nans")) + .shouldRead(parquetSchema, rowGroupMetadata, dictionaryStore); Assert.assertTrue("Should read: some_nans column will contain non-NaN", shouldRead); - shouldRead = new ParquetDictionaryRowGroupFilter(SCHEMA, notNaN("no_nans")) - .shouldRead(parquetSchema, rowGroupMetadata, dictionaryStore); + shouldRead = + new ParquetDictionaryRowGroupFilter(SCHEMA, notNaN("no_nans")) + .shouldRead(parquetSchema, rowGroupMetadata, dictionaryStore); Assert.assertTrue("Should read: no_nans column will contain non-NaN", shouldRead); } @Test public void testStartsWith() { - boolean shouldRead = new ParquetDictionaryRowGroupFilter(SCHEMA, startsWith("non_dict", "re")) - .shouldRead(parquetSchema, rowGroupMetadata, dictionaryStore); + boolean shouldRead = + new ParquetDictionaryRowGroupFilter(SCHEMA, startsWith("non_dict", "re")) + .shouldRead(parquetSchema, rowGroupMetadata, dictionaryStore); Assert.assertTrue("Should read: no dictionary", shouldRead); - shouldRead = new ParquetDictionaryRowGroupFilter(SCHEMA, startsWith("required", "re")) - .shouldRead(parquetSchema, rowGroupMetadata, dictionaryStore); + shouldRead = + new ParquetDictionaryRowGroupFilter(SCHEMA, startsWith("required", "re")) + .shouldRead(parquetSchema, rowGroupMetadata, dictionaryStore); Assert.assertTrue("Should read: dictionary contains a matching entry", shouldRead); - shouldRead = new ParquetDictionaryRowGroupFilter(SCHEMA, startsWith("required", "req")) - .shouldRead(parquetSchema, rowGroupMetadata, dictionaryStore); + shouldRead = + new ParquetDictionaryRowGroupFilter(SCHEMA, startsWith("required", "req")) + .shouldRead(parquetSchema, rowGroupMetadata, dictionaryStore); Assert.assertTrue("Should read: dictionary contains a matching entry", shouldRead); - shouldRead = new ParquetDictionaryRowGroupFilter(SCHEMA, startsWith("some_nulls", "so")) - .shouldRead(parquetSchema, rowGroupMetadata, dictionaryStore); + shouldRead = + new ParquetDictionaryRowGroupFilter(SCHEMA, startsWith("some_nulls", "so")) + .shouldRead(parquetSchema, rowGroupMetadata, dictionaryStore); Assert.assertTrue("Should read: dictionary contains a matching entry", shouldRead); - shouldRead = new ParquetDictionaryRowGroupFilter(SCHEMA, startsWith("no_stats", UUID.randomUUID().toString())) - .shouldRead(parquetSchema, rowGroupMetadata, dictionaryStore); + shouldRead = + new ParquetDictionaryRowGroupFilter( + SCHEMA, startsWith("no_stats", UUID.randomUUID().toString())) + .shouldRead(parquetSchema, rowGroupMetadata, dictionaryStore); Assert.assertFalse("Should skip: no stats but dictionary is present", shouldRead); - shouldRead = new ParquetDictionaryRowGroupFilter(SCHEMA, startsWith("required", "reqs")) - .shouldRead(parquetSchema, rowGroupMetadata, 
dictionaryStore); + shouldRead = + new ParquetDictionaryRowGroupFilter(SCHEMA, startsWith("required", "reqs")) + .shouldRead(parquetSchema, rowGroupMetadata, dictionaryStore); Assert.assertFalse("Should skip: no match in dictionary", shouldRead); - shouldRead = new ParquetDictionaryRowGroupFilter(SCHEMA, startsWith("some_nulls", "somex")) - .shouldRead(parquetSchema, rowGroupMetadata, dictionaryStore); + shouldRead = + new ParquetDictionaryRowGroupFilter(SCHEMA, startsWith("some_nulls", "somex")) + .shouldRead(parquetSchema, rowGroupMetadata, dictionaryStore); Assert.assertFalse("Should skip: no match in dictionary", shouldRead); - shouldRead = new ParquetDictionaryRowGroupFilter(SCHEMA, startsWith("no_nulls", "xxx")) - .shouldRead(parquetSchema, rowGroupMetadata, dictionaryStore); + shouldRead = + new ParquetDictionaryRowGroupFilter(SCHEMA, startsWith("no_nulls", "xxx")) + .shouldRead(parquetSchema, rowGroupMetadata, dictionaryStore); Assert.assertFalse("Should skip: no match in dictionary", shouldRead); } @Test public void testNotStartsWith() { - boolean shouldRead = new ParquetDictionaryRowGroupFilter(SCHEMA, notStartsWith("non_dict", "re")) - .shouldRead(parquetSchema, rowGroupMetadata, dictionaryStore); + boolean shouldRead = + new ParquetDictionaryRowGroupFilter(SCHEMA, notStartsWith("non_dict", "re")) + .shouldRead(parquetSchema, rowGroupMetadata, dictionaryStore); Assert.assertTrue("Should read: no dictionary", shouldRead); - shouldRead = new ParquetDictionaryRowGroupFilter(SCHEMA, notStartsWith("required", "re")) - .shouldRead(parquetSchema, rowGroupMetadata, dictionaryStore); + shouldRead = + new ParquetDictionaryRowGroupFilter(SCHEMA, notStartsWith("required", "re")) + .shouldRead(parquetSchema, rowGroupMetadata, dictionaryStore); Assert.assertFalse("Should skip: no match in dictionary", shouldRead); - shouldRead = new ParquetDictionaryRowGroupFilter(SCHEMA, notStartsWith("required", "req")) - .shouldRead(parquetSchema, rowGroupMetadata, dictionaryStore); + shouldRead = + new ParquetDictionaryRowGroupFilter(SCHEMA, notStartsWith("required", "req")) + .shouldRead(parquetSchema, rowGroupMetadata, dictionaryStore); Assert.assertFalse("Should skip: no match in dictionary", shouldRead); - shouldRead = new ParquetDictionaryRowGroupFilter(SCHEMA, notStartsWith("some_nulls", "s!")) - .shouldRead(parquetSchema, rowGroupMetadata, dictionaryStore); + shouldRead = + new ParquetDictionaryRowGroupFilter(SCHEMA, notStartsWith("some_nulls", "s!")) + .shouldRead(parquetSchema, rowGroupMetadata, dictionaryStore); Assert.assertTrue("Should read: dictionary contains a matching entry", shouldRead); - shouldRead = new ParquetDictionaryRowGroupFilter(SCHEMA, notStartsWith("no_stats", UUID.randomUUID().toString())) - .shouldRead(parquetSchema, rowGroupMetadata, dictionaryStore); + shouldRead = + new ParquetDictionaryRowGroupFilter( + SCHEMA, notStartsWith("no_stats", UUID.randomUUID().toString())) + .shouldRead(parquetSchema, rowGroupMetadata, dictionaryStore); Assert.assertTrue("Should read: no stats but dictionary is present", shouldRead); - shouldRead = new ParquetDictionaryRowGroupFilter(SCHEMA, notStartsWith("required", "reqs")) - .shouldRead(parquetSchema, rowGroupMetadata, dictionaryStore); + shouldRead = + new ParquetDictionaryRowGroupFilter(SCHEMA, notStartsWith("required", "reqs")) + .shouldRead(parquetSchema, rowGroupMetadata, dictionaryStore); Assert.assertTrue("Should read: dictionary contains a matching entry", shouldRead); - shouldRead = new ParquetDictionaryRowGroupFilter(SCHEMA, 
notStartsWith("some_nulls", "somex")) - .shouldRead(parquetSchema, rowGroupMetadata, dictionaryStore); + shouldRead = + new ParquetDictionaryRowGroupFilter(SCHEMA, notStartsWith("some_nulls", "somex")) + .shouldRead(parquetSchema, rowGroupMetadata, dictionaryStore); Assert.assertTrue("Should read: dictionary contains a matching entry", shouldRead); - shouldRead = new ParquetDictionaryRowGroupFilter(SCHEMA, notStartsWith("some_nulls", "some")) - .shouldRead(parquetSchema, rowGroupMetadata, dictionaryStore); + shouldRead = + new ParquetDictionaryRowGroupFilter(SCHEMA, notStartsWith("some_nulls", "some")) + .shouldRead(parquetSchema, rowGroupMetadata, dictionaryStore); Assert.assertFalse("Should skip: no match in dictionary", shouldRead); - shouldRead = new ParquetDictionaryRowGroupFilter(SCHEMA, notStartsWith("no_nulls", "xxx")) - .shouldRead(parquetSchema, rowGroupMetadata, dictionaryStore); + shouldRead = + new ParquetDictionaryRowGroupFilter(SCHEMA, notStartsWith("no_nulls", "xxx")) + .shouldRead(parquetSchema, rowGroupMetadata, dictionaryStore); Assert.assertTrue("Should read: dictionary contains a matching entry", shouldRead); } - @Test public void testMissingColumn() { - TestHelpers.assertThrows("Should complain about missing column in expression", - ValidationException.class, "Cannot find field 'missing'", - () -> new ParquetDictionaryRowGroupFilter(SCHEMA, lessThan("missing", 5)) - .shouldRead(parquetSchema, rowGroupMetadata, dictionaryStore)); + TestHelpers.assertThrows( + "Should complain about missing column in expression", + ValidationException.class, + "Cannot find field 'missing'", + () -> + new ParquetDictionaryRowGroupFilter(SCHEMA, lessThan("missing", 5)) + .shouldRead(parquetSchema, rowGroupMetadata, dictionaryStore)); } @Test public void testColumnNotInFile() { - Expression[] exprs = new Expression[] { - lessThan("not_in_file", 1.0f), lessThanOrEqual("not_in_file", 1.0f), - equal("not_in_file", 1.0f), greaterThan("not_in_file", 1.0f), - greaterThanOrEqual("not_in_file", 1.0f), notNull("not_in_file"), - isNull("not_in_file"), notEqual("not_in_file", 1.0f) - }; + Expression[] exprs = + new Expression[] { + lessThan("not_in_file", 1.0f), lessThanOrEqual("not_in_file", 1.0f), + equal("not_in_file", 1.0f), greaterThan("not_in_file", 1.0f), + greaterThanOrEqual("not_in_file", 1.0f), notNull("not_in_file"), + isNull("not_in_file"), notEqual("not_in_file", 1.0f) + }; for (Expression expr : exprs) { - boolean shouldRead = new ParquetDictionaryRowGroupFilter(SCHEMA, expr) - .shouldRead(parquetSchema, rowGroupMetadata, dictionaryStore); + boolean shouldRead = + new ParquetDictionaryRowGroupFilter(SCHEMA, expr) + .shouldRead(parquetSchema, rowGroupMetadata, dictionaryStore); Assert.assertTrue("Should read: dictionary cannot be found: " + expr, shouldRead); } } @Test public void testColumnFallbackOrNotDictionaryEncoded() { - Expression[] exprs = new Expression[] { - lessThan("non_dict", "a"), lessThanOrEqual("non_dict", "a"), equal("non_dict", "a"), - greaterThan("non_dict", "a"), greaterThanOrEqual("non_dict", "a"), notNull("non_dict"), - isNull("non_dict"), notEqual("non_dict", "a") - }; + Expression[] exprs = + new Expression[] { + lessThan("non_dict", "a"), lessThanOrEqual("non_dict", "a"), equal("non_dict", "a"), + greaterThan("non_dict", "a"), greaterThanOrEqual("non_dict", "a"), notNull("non_dict"), + isNull("non_dict"), notEqual("non_dict", "a") + }; for (Expression expr : exprs) { - boolean shouldRead = new ParquetDictionaryRowGroupFilter(SCHEMA, expr) - 
.shouldRead(parquetSchema, rowGroupMetadata, dictionaryStore); + boolean shouldRead = + new ParquetDictionaryRowGroupFilter(SCHEMA, expr) + .shouldRead(parquetSchema, rowGroupMetadata, dictionaryStore); Assert.assertTrue("Should read: dictionary cannot be found: " + expr, shouldRead); } } @Test public void testMissingStats() { - boolean shouldRead = new ParquetDictionaryRowGroupFilter(SCHEMA, equal("no_stats", "a")) - .shouldRead(parquetSchema, rowGroupMetadata, dictionaryStore); + boolean shouldRead = + new ParquetDictionaryRowGroupFilter(SCHEMA, equal("no_stats", "a")) + .shouldRead(parquetSchema, rowGroupMetadata, dictionaryStore); Assert.assertFalse("Should skip: stats are missing but dictionary is present", shouldRead); } @Test public void testNot() { // this test case must use a real predicate, not alwaysTrue(), or binding will simplify it out - boolean shouldRead = new ParquetDictionaryRowGroupFilter(SCHEMA, not(lessThan("id", INT_MIN_VALUE - 25))) - .shouldRead(parquetSchema, rowGroupMetadata, dictionaryStore); + boolean shouldRead = + new ParquetDictionaryRowGroupFilter(SCHEMA, not(lessThan("id", INT_MIN_VALUE - 25))) + .shouldRead(parquetSchema, rowGroupMetadata, dictionaryStore); Assert.assertTrue("Should read: not(false)", shouldRead); - shouldRead = new ParquetDictionaryRowGroupFilter(SCHEMA, not(greaterThan("id", INT_MIN_VALUE - 25))) - .shouldRead(parquetSchema, rowGroupMetadata, dictionaryStore); + shouldRead = + new ParquetDictionaryRowGroupFilter(SCHEMA, not(greaterThan("id", INT_MIN_VALUE - 25))) + .shouldRead(parquetSchema, rowGroupMetadata, dictionaryStore); Assert.assertFalse("Should skip: not(true)", shouldRead); } @Test public void testAnd() { // this test case must use a real predicate, not alwaysTrue(), or binding will simplify it out - boolean shouldRead = new ParquetDictionaryRowGroupFilter(SCHEMA, - and(lessThan("id", INT_MIN_VALUE - 25), greaterThanOrEqual("id", INT_MIN_VALUE - 30))) - .shouldRead(parquetSchema, rowGroupMetadata, dictionaryStore); + boolean shouldRead = + new ParquetDictionaryRowGroupFilter( + SCHEMA, + and( + lessThan("id", INT_MIN_VALUE - 25), + greaterThanOrEqual("id", INT_MIN_VALUE - 30))) + .shouldRead(parquetSchema, rowGroupMetadata, dictionaryStore); Assert.assertFalse("Should skip: and(false, true)", shouldRead); - shouldRead = new ParquetDictionaryRowGroupFilter(SCHEMA, - and(lessThan("id", INT_MIN_VALUE - 25), greaterThanOrEqual("id", INT_MAX_VALUE + 1))) - .shouldRead(parquetSchema, rowGroupMetadata, dictionaryStore); + shouldRead = + new ParquetDictionaryRowGroupFilter( + SCHEMA, + and( + lessThan("id", INT_MIN_VALUE - 25), + greaterThanOrEqual("id", INT_MAX_VALUE + 1))) + .shouldRead(parquetSchema, rowGroupMetadata, dictionaryStore); Assert.assertFalse("Should skip: and(false, false)", shouldRead); - shouldRead = new ParquetDictionaryRowGroupFilter(SCHEMA, - and(greaterThan("id", INT_MIN_VALUE - 25), lessThanOrEqual("id", INT_MIN_VALUE))) - .shouldRead(parquetSchema, rowGroupMetadata, dictionaryStore); + shouldRead = + new ParquetDictionaryRowGroupFilter( + SCHEMA, + and(greaterThan("id", INT_MIN_VALUE - 25), lessThanOrEqual("id", INT_MIN_VALUE))) + .shouldRead(parquetSchema, rowGroupMetadata, dictionaryStore); Assert.assertTrue("Should read: and(true, true)", shouldRead); } @Test public void testOr() { // this test case must use a real predicate, not alwaysTrue(), or binding will simplify it out - boolean shouldRead = new ParquetDictionaryRowGroupFilter(SCHEMA, - or(lessThan("id", INT_MIN_VALUE - 25), greaterThanOrEqual("id", 
INT_MAX_VALUE + 1))) - .shouldRead(parquetSchema, rowGroupMetadata, dictionaryStore); + boolean shouldRead = + new ParquetDictionaryRowGroupFilter( + SCHEMA, + or(lessThan("id", INT_MIN_VALUE - 25), greaterThanOrEqual("id", INT_MAX_VALUE + 1))) + .shouldRead(parquetSchema, rowGroupMetadata, dictionaryStore); Assert.assertFalse("Should skip: or(false, false)", shouldRead); - shouldRead = new ParquetDictionaryRowGroupFilter(SCHEMA, - or(lessThan("id", INT_MIN_VALUE - 25), greaterThanOrEqual("id", INT_MAX_VALUE - 19))) - .shouldRead(parquetSchema, rowGroupMetadata, dictionaryStore); + shouldRead = + new ParquetDictionaryRowGroupFilter( + SCHEMA, + or( + lessThan("id", INT_MIN_VALUE - 25), + greaterThanOrEqual("id", INT_MAX_VALUE - 19))) + .shouldRead(parquetSchema, rowGroupMetadata, dictionaryStore); Assert.assertTrue("Should read: or(false, true)", shouldRead); } @Test public void testIntegerLt() { - boolean shouldRead = new ParquetDictionaryRowGroupFilter(SCHEMA, lessThan("id", INT_MIN_VALUE - 25)) - .shouldRead(parquetSchema, rowGroupMetadata, dictionaryStore); + boolean shouldRead = + new ParquetDictionaryRowGroupFilter(SCHEMA, lessThan("id", INT_MIN_VALUE - 25)) + .shouldRead(parquetSchema, rowGroupMetadata, dictionaryStore); Assert.assertFalse("Should not read: id range below lower bound (5 < 30)", shouldRead); - shouldRead = new ParquetDictionaryRowGroupFilter(SCHEMA, lessThan("id", INT_MIN_VALUE)) - .shouldRead(parquetSchema, rowGroupMetadata, dictionaryStore); + shouldRead = + new ParquetDictionaryRowGroupFilter(SCHEMA, lessThan("id", INT_MIN_VALUE)) + .shouldRead(parquetSchema, rowGroupMetadata, dictionaryStore); Assert.assertFalse("Should not read: id range below lower bound (30 is not < 30)", shouldRead); - shouldRead = new ParquetDictionaryRowGroupFilter(SCHEMA, lessThan("id", INT_MIN_VALUE + 1)) - .shouldRead(parquetSchema, rowGroupMetadata, dictionaryStore); + shouldRead = + new ParquetDictionaryRowGroupFilter(SCHEMA, lessThan("id", INT_MIN_VALUE + 1)) + .shouldRead(parquetSchema, rowGroupMetadata, dictionaryStore); Assert.assertTrue("Should read: one possible id", shouldRead); - shouldRead = new ParquetDictionaryRowGroupFilter(SCHEMA, lessThan("id", INT_MAX_VALUE)) - .shouldRead(parquetSchema, rowGroupMetadata, dictionaryStore); + shouldRead = + new ParquetDictionaryRowGroupFilter(SCHEMA, lessThan("id", INT_MAX_VALUE)) + .shouldRead(parquetSchema, rowGroupMetadata, dictionaryStore); Assert.assertTrue("Should read: may possible ids", shouldRead); } @Test public void testIntegerLtEq() { - boolean shouldRead = new ParquetDictionaryRowGroupFilter(SCHEMA, lessThanOrEqual("id", INT_MIN_VALUE - 25)) - .shouldRead(parquetSchema, rowGroupMetadata, dictionaryStore); + boolean shouldRead = + new ParquetDictionaryRowGroupFilter(SCHEMA, lessThanOrEqual("id", INT_MIN_VALUE - 25)) + .shouldRead(parquetSchema, rowGroupMetadata, dictionaryStore); Assert.assertFalse("Should not read: id range below lower bound (5 < 30)", shouldRead); - shouldRead = new ParquetDictionaryRowGroupFilter(SCHEMA, lessThanOrEqual("id", INT_MIN_VALUE - 1)) - .shouldRead(parquetSchema, rowGroupMetadata, dictionaryStore); + shouldRead = + new ParquetDictionaryRowGroupFilter(SCHEMA, lessThanOrEqual("id", INT_MIN_VALUE - 1)) + .shouldRead(parquetSchema, rowGroupMetadata, dictionaryStore); Assert.assertFalse("Should not read: id range below lower bound (29 < 30)", shouldRead); - shouldRead = new ParquetDictionaryRowGroupFilter(SCHEMA, lessThanOrEqual("id", INT_MIN_VALUE)) - .shouldRead(parquetSchema, rowGroupMetadata, 
dictionaryStore); + shouldRead = + new ParquetDictionaryRowGroupFilter(SCHEMA, lessThanOrEqual("id", INT_MIN_VALUE)) + .shouldRead(parquetSchema, rowGroupMetadata, dictionaryStore); Assert.assertTrue("Should read: one possible id", shouldRead); - shouldRead = new ParquetDictionaryRowGroupFilter(SCHEMA, lessThanOrEqual("id", INT_MAX_VALUE)) - .shouldRead(parquetSchema, rowGroupMetadata, dictionaryStore); + shouldRead = + new ParquetDictionaryRowGroupFilter(SCHEMA, lessThanOrEqual("id", INT_MAX_VALUE)) + .shouldRead(parquetSchema, rowGroupMetadata, dictionaryStore); Assert.assertTrue("Should read: many possible ids", shouldRead); } @Test public void testIntegerGt() { - boolean shouldRead = new ParquetDictionaryRowGroupFilter(SCHEMA, greaterThan("id", INT_MAX_VALUE + 6)) - .shouldRead(parquetSchema, rowGroupMetadata, dictionaryStore); + boolean shouldRead = + new ParquetDictionaryRowGroupFilter(SCHEMA, greaterThan("id", INT_MAX_VALUE + 6)) + .shouldRead(parquetSchema, rowGroupMetadata, dictionaryStore); Assert.assertFalse("Should not read: id range above upper bound (85 < 79)", shouldRead); - shouldRead = new ParquetDictionaryRowGroupFilter(SCHEMA, greaterThan("id", INT_MAX_VALUE)) - .shouldRead(parquetSchema, rowGroupMetadata, dictionaryStore); + shouldRead = + new ParquetDictionaryRowGroupFilter(SCHEMA, greaterThan("id", INT_MAX_VALUE)) + .shouldRead(parquetSchema, rowGroupMetadata, dictionaryStore); Assert.assertFalse("Should not read: id range above upper bound (79 is not > 79)", shouldRead); - shouldRead = new ParquetDictionaryRowGroupFilter(SCHEMA, greaterThan("id", INT_MAX_VALUE - 1)) - .shouldRead(parquetSchema, rowGroupMetadata, dictionaryStore); + shouldRead = + new ParquetDictionaryRowGroupFilter(SCHEMA, greaterThan("id", INT_MAX_VALUE - 1)) + .shouldRead(parquetSchema, rowGroupMetadata, dictionaryStore); Assert.assertTrue("Should read: one possible id", shouldRead); - shouldRead = new ParquetDictionaryRowGroupFilter(SCHEMA, greaterThan("id", INT_MAX_VALUE - 4)) - .shouldRead(parquetSchema, rowGroupMetadata, dictionaryStore); + shouldRead = + new ParquetDictionaryRowGroupFilter(SCHEMA, greaterThan("id", INT_MAX_VALUE - 4)) + .shouldRead(parquetSchema, rowGroupMetadata, dictionaryStore); Assert.assertTrue("Should read: may possible ids", shouldRead); } @Test public void testIntegerGtEq() { - boolean shouldRead = new ParquetDictionaryRowGroupFilter(SCHEMA, greaterThanOrEqual("id", INT_MAX_VALUE + 6)) - .shouldRead(parquetSchema, rowGroupMetadata, dictionaryStore); + boolean shouldRead = + new ParquetDictionaryRowGroupFilter(SCHEMA, greaterThanOrEqual("id", INT_MAX_VALUE + 6)) + .shouldRead(parquetSchema, rowGroupMetadata, dictionaryStore); Assert.assertFalse("Should not read: id range above upper bound (85 < 79)", shouldRead); - shouldRead = new ParquetDictionaryRowGroupFilter(SCHEMA, greaterThanOrEqual("id", INT_MAX_VALUE + 1)) - .shouldRead(parquetSchema, rowGroupMetadata, dictionaryStore); + shouldRead = + new ParquetDictionaryRowGroupFilter(SCHEMA, greaterThanOrEqual("id", INT_MAX_VALUE + 1)) + .shouldRead(parquetSchema, rowGroupMetadata, dictionaryStore); Assert.assertFalse("Should not read: id range above upper bound (80 > 79)", shouldRead); - shouldRead = new ParquetDictionaryRowGroupFilter(SCHEMA, greaterThanOrEqual("id", INT_MAX_VALUE)) - .shouldRead(parquetSchema, rowGroupMetadata, dictionaryStore); + shouldRead = + new ParquetDictionaryRowGroupFilter(SCHEMA, greaterThanOrEqual("id", INT_MAX_VALUE)) + .shouldRead(parquetSchema, rowGroupMetadata, dictionaryStore); 
Assert.assertTrue("Should read: one possible id", shouldRead); - shouldRead = new ParquetDictionaryRowGroupFilter(SCHEMA, greaterThanOrEqual("id", INT_MAX_VALUE - 4)) - .shouldRead(parquetSchema, rowGroupMetadata, dictionaryStore); + shouldRead = + new ParquetDictionaryRowGroupFilter(SCHEMA, greaterThanOrEqual("id", INT_MAX_VALUE - 4)) + .shouldRead(parquetSchema, rowGroupMetadata, dictionaryStore); Assert.assertTrue("Should read: may possible ids", shouldRead); } @Test public void testIntegerEq() { - boolean shouldRead = new ParquetDictionaryRowGroupFilter(SCHEMA, equal("id", INT_MIN_VALUE - 25)) - .shouldRead(parquetSchema, rowGroupMetadata, dictionaryStore); + boolean shouldRead = + new ParquetDictionaryRowGroupFilter(SCHEMA, equal("id", INT_MIN_VALUE - 25)) + .shouldRead(parquetSchema, rowGroupMetadata, dictionaryStore); Assert.assertFalse("Should not read: id below lower bound", shouldRead); - shouldRead = new ParquetDictionaryRowGroupFilter(SCHEMA, equal("id", INT_MIN_VALUE - 1)) - .shouldRead(parquetSchema, rowGroupMetadata, dictionaryStore); + shouldRead = + new ParquetDictionaryRowGroupFilter(SCHEMA, equal("id", INT_MIN_VALUE - 1)) + .shouldRead(parquetSchema, rowGroupMetadata, dictionaryStore); Assert.assertFalse("Should not read: id below lower bound", shouldRead); - shouldRead = new ParquetDictionaryRowGroupFilter(SCHEMA, equal("id", INT_MIN_VALUE)) - .shouldRead(parquetSchema, rowGroupMetadata, dictionaryStore); + shouldRead = + new ParquetDictionaryRowGroupFilter(SCHEMA, equal("id", INT_MIN_VALUE)) + .shouldRead(parquetSchema, rowGroupMetadata, dictionaryStore); Assert.assertTrue("Should read: id equal to lower bound", shouldRead); - shouldRead = new ParquetDictionaryRowGroupFilter(SCHEMA, equal("id", INT_MAX_VALUE - 4)) - .shouldRead(parquetSchema, rowGroupMetadata, dictionaryStore); + shouldRead = + new ParquetDictionaryRowGroupFilter(SCHEMA, equal("id", INT_MAX_VALUE - 4)) + .shouldRead(parquetSchema, rowGroupMetadata, dictionaryStore); Assert.assertTrue("Should read: id between lower and upper bounds", shouldRead); - shouldRead = new ParquetDictionaryRowGroupFilter(SCHEMA, equal("id", INT_MAX_VALUE)) - .shouldRead(parquetSchema, rowGroupMetadata, dictionaryStore); + shouldRead = + new ParquetDictionaryRowGroupFilter(SCHEMA, equal("id", INT_MAX_VALUE)) + .shouldRead(parquetSchema, rowGroupMetadata, dictionaryStore); Assert.assertTrue("Should read: id equal to upper bound", shouldRead); - shouldRead = new ParquetDictionaryRowGroupFilter(SCHEMA, equal("id", INT_MAX_VALUE + 1)) - .shouldRead(parquetSchema, rowGroupMetadata, dictionaryStore); + shouldRead = + new ParquetDictionaryRowGroupFilter(SCHEMA, equal("id", INT_MAX_VALUE + 1)) + .shouldRead(parquetSchema, rowGroupMetadata, dictionaryStore); Assert.assertFalse("Should not read: id above upper bound", shouldRead); - shouldRead = new ParquetDictionaryRowGroupFilter(SCHEMA, equal("id", INT_MAX_VALUE + 6)) - .shouldRead(parquetSchema, rowGroupMetadata, dictionaryStore); + shouldRead = + new ParquetDictionaryRowGroupFilter(SCHEMA, equal("id", INT_MAX_VALUE + 6)) + .shouldRead(parquetSchema, rowGroupMetadata, dictionaryStore); Assert.assertFalse("Should not read: id above upper bound", shouldRead); } @Test public void testIntegerNotEq() { - boolean shouldRead = new ParquetDictionaryRowGroupFilter(SCHEMA, notEqual("id", INT_MIN_VALUE - 25)) - .shouldRead(parquetSchema, rowGroupMetadata, dictionaryStore); + boolean shouldRead = + new ParquetDictionaryRowGroupFilter(SCHEMA, notEqual("id", INT_MIN_VALUE - 25)) + 
.shouldRead(parquetSchema, rowGroupMetadata, dictionaryStore); Assert.assertTrue("Should read: id below lower bound", shouldRead); - shouldRead = new ParquetDictionaryRowGroupFilter(SCHEMA, notEqual("id", INT_MIN_VALUE - 1)) - .shouldRead(parquetSchema, rowGroupMetadata, dictionaryStore); + shouldRead = + new ParquetDictionaryRowGroupFilter(SCHEMA, notEqual("id", INT_MIN_VALUE - 1)) + .shouldRead(parquetSchema, rowGroupMetadata, dictionaryStore); Assert.assertTrue("Should read: id below lower bound", shouldRead); - shouldRead = new ParquetDictionaryRowGroupFilter(SCHEMA, notEqual("id", INT_MIN_VALUE)) - .shouldRead(parquetSchema, rowGroupMetadata, dictionaryStore); + shouldRead = + new ParquetDictionaryRowGroupFilter(SCHEMA, notEqual("id", INT_MIN_VALUE)) + .shouldRead(parquetSchema, rowGroupMetadata, dictionaryStore); Assert.assertTrue("Should read: id equal to lower bound", shouldRead); - shouldRead = new ParquetDictionaryRowGroupFilter(SCHEMA, notEqual("id", INT_MAX_VALUE - 4)) - .shouldRead(parquetSchema, rowGroupMetadata, dictionaryStore); + shouldRead = + new ParquetDictionaryRowGroupFilter(SCHEMA, notEqual("id", INT_MAX_VALUE - 4)) + .shouldRead(parquetSchema, rowGroupMetadata, dictionaryStore); Assert.assertTrue("Should read: id between lower and upper bounds", shouldRead); - shouldRead = new ParquetDictionaryRowGroupFilter(SCHEMA, notEqual("id", INT_MAX_VALUE)) - .shouldRead(parquetSchema, rowGroupMetadata, dictionaryStore); + shouldRead = + new ParquetDictionaryRowGroupFilter(SCHEMA, notEqual("id", INT_MAX_VALUE)) + .shouldRead(parquetSchema, rowGroupMetadata, dictionaryStore); Assert.assertTrue("Should read: id equal to upper bound", shouldRead); - shouldRead = new ParquetDictionaryRowGroupFilter(SCHEMA, notEqual("id", INT_MAX_VALUE + 1)) - .shouldRead(parquetSchema, rowGroupMetadata, dictionaryStore); + shouldRead = + new ParquetDictionaryRowGroupFilter(SCHEMA, notEqual("id", INT_MAX_VALUE + 1)) + .shouldRead(parquetSchema, rowGroupMetadata, dictionaryStore); Assert.assertTrue("Should read: id above upper bound", shouldRead); - shouldRead = new ParquetDictionaryRowGroupFilter(SCHEMA, notEqual("id", INT_MAX_VALUE + 6)) - .shouldRead(parquetSchema, rowGroupMetadata, dictionaryStore); + shouldRead = + new ParquetDictionaryRowGroupFilter(SCHEMA, notEqual("id", INT_MAX_VALUE + 6)) + .shouldRead(parquetSchema, rowGroupMetadata, dictionaryStore); Assert.assertTrue("Should read: id above upper bound", shouldRead); } @Test public void testIntegerNotEqRewritten() { - boolean shouldRead = new ParquetDictionaryRowGroupFilter(SCHEMA, not(equal("id", INT_MIN_VALUE - 25))) - .shouldRead(parquetSchema, rowGroupMetadata, dictionaryStore); + boolean shouldRead = + new ParquetDictionaryRowGroupFilter(SCHEMA, not(equal("id", INT_MIN_VALUE - 25))) + .shouldRead(parquetSchema, rowGroupMetadata, dictionaryStore); Assert.assertTrue("Should read: id below lower bound", shouldRead); - shouldRead = new ParquetDictionaryRowGroupFilter(SCHEMA, not(equal("id", INT_MIN_VALUE - 1))) - .shouldRead(parquetSchema, rowGroupMetadata, dictionaryStore); + shouldRead = + new ParquetDictionaryRowGroupFilter(SCHEMA, not(equal("id", INT_MIN_VALUE - 1))) + .shouldRead(parquetSchema, rowGroupMetadata, dictionaryStore); Assert.assertTrue("Should read: id below lower bound", shouldRead); - shouldRead = new ParquetDictionaryRowGroupFilter(SCHEMA, not(equal("id", INT_MIN_VALUE))) - .shouldRead(parquetSchema, rowGroupMetadata, dictionaryStore); + shouldRead = + new ParquetDictionaryRowGroupFilter(SCHEMA, not(equal("id", 
INT_MIN_VALUE))) + .shouldRead(parquetSchema, rowGroupMetadata, dictionaryStore); Assert.assertTrue("Should read: id equal to lower bound", shouldRead); - shouldRead = new ParquetDictionaryRowGroupFilter(SCHEMA, not(equal("id", INT_MAX_VALUE - 4))) - .shouldRead(parquetSchema, rowGroupMetadata, dictionaryStore); + shouldRead = + new ParquetDictionaryRowGroupFilter(SCHEMA, not(equal("id", INT_MAX_VALUE - 4))) + .shouldRead(parquetSchema, rowGroupMetadata, dictionaryStore); Assert.assertTrue("Should read: id between lower and upper bounds", shouldRead); - shouldRead = new ParquetDictionaryRowGroupFilter(SCHEMA, not(equal("id", INT_MAX_VALUE))) - .shouldRead(parquetSchema, rowGroupMetadata, dictionaryStore); + shouldRead = + new ParquetDictionaryRowGroupFilter(SCHEMA, not(equal("id", INT_MAX_VALUE))) + .shouldRead(parquetSchema, rowGroupMetadata, dictionaryStore); Assert.assertTrue("Should read: id equal to upper bound", shouldRead); - shouldRead = new ParquetDictionaryRowGroupFilter(SCHEMA, not(equal("id", INT_MAX_VALUE + 1))) - .shouldRead(parquetSchema, rowGroupMetadata, dictionaryStore); + shouldRead = + new ParquetDictionaryRowGroupFilter(SCHEMA, not(equal("id", INT_MAX_VALUE + 1))) + .shouldRead(parquetSchema, rowGroupMetadata, dictionaryStore); Assert.assertTrue("Should read: id above upper bound", shouldRead); - shouldRead = new ParquetDictionaryRowGroupFilter(SCHEMA, not(equal("id", INT_MAX_VALUE + 6))) - .shouldRead(parquetSchema, rowGroupMetadata, dictionaryStore); + shouldRead = + new ParquetDictionaryRowGroupFilter(SCHEMA, not(equal("id", INT_MAX_VALUE + 6))) + .shouldRead(parquetSchema, rowGroupMetadata, dictionaryStore); Assert.assertTrue("Should read: id above upper bound", shouldRead); } @Test public void testStringNotEq() { - boolean shouldRead = new ParquetDictionaryRowGroupFilter(SCHEMA, notEqual("some_nulls", "some")) - .shouldRead(parquetSchema, rowGroupMetadata, dictionaryStore); + boolean shouldRead = + new ParquetDictionaryRowGroupFilter(SCHEMA, notEqual("some_nulls", "some")) + .shouldRead(parquetSchema, rowGroupMetadata, dictionaryStore); Assert.assertTrue("Should read: contains null != 'some'", shouldRead); - shouldRead = new ParquetDictionaryRowGroupFilter(SCHEMA, notEqual("no_nulls", "")) - .shouldRead(parquetSchema, rowGroupMetadata, dictionaryStore); + shouldRead = + new ParquetDictionaryRowGroupFilter(SCHEMA, notEqual("no_nulls", "")) + .shouldRead(parquetSchema, rowGroupMetadata, dictionaryStore); Assert.assertFalse("Should skip: contains only ''", shouldRead); } @Test public void testStructFieldLt() { - boolean shouldRead = new ParquetDictionaryRowGroupFilter(SCHEMA, - lessThan("struct_not_null.int_field", INT_MIN_VALUE - 25) - ).shouldRead(parquetSchema, rowGroupMetadata, dictionaryStore); + boolean shouldRead = + new ParquetDictionaryRowGroupFilter( + SCHEMA, lessThan("struct_not_null.int_field", INT_MIN_VALUE - 25)) + .shouldRead(parquetSchema, rowGroupMetadata, dictionaryStore); Assert.assertFalse("Should not read: id range below lower bound (5 < 30)", shouldRead); - shouldRead = new ParquetDictionaryRowGroupFilter(SCHEMA, lessThan("struct_not_null.int_field", INT_MIN_VALUE)) + shouldRead = + new ParquetDictionaryRowGroupFilter( + SCHEMA, lessThan("struct_not_null.int_field", INT_MIN_VALUE)) .shouldRead(parquetSchema, rowGroupMetadata, dictionaryStore); Assert.assertFalse("Should not read: id range below lower bound (30 is not < 30)", shouldRead); - shouldRead = new ParquetDictionaryRowGroupFilter(SCHEMA, lessThan("struct_not_null.int_field", 
INT_MIN_VALUE + 1)) + shouldRead = + new ParquetDictionaryRowGroupFilter( + SCHEMA, lessThan("struct_not_null.int_field", INT_MIN_VALUE + 1)) .shouldRead(parquetSchema, rowGroupMetadata, dictionaryStore); Assert.assertTrue("Should read: one possible id", shouldRead); - shouldRead = new ParquetDictionaryRowGroupFilter(SCHEMA, lessThan("struct_not_null.int_field", INT_MAX_VALUE)) + shouldRead = + new ParquetDictionaryRowGroupFilter( + SCHEMA, lessThan("struct_not_null.int_field", INT_MAX_VALUE)) .shouldRead(parquetSchema, rowGroupMetadata, dictionaryStore); Assert.assertTrue("Should read: may possible ids", shouldRead); } @Test public void testStructFieldLtEq() { - boolean shouldRead = new ParquetDictionaryRowGroupFilter(SCHEMA, - lessThanOrEqual("struct_not_null.int_field", INT_MIN_VALUE - 25) - ).shouldRead(parquetSchema, rowGroupMetadata, dictionaryStore); + boolean shouldRead = + new ParquetDictionaryRowGroupFilter( + SCHEMA, lessThanOrEqual("struct_not_null.int_field", INT_MIN_VALUE - 25)) + .shouldRead(parquetSchema, rowGroupMetadata, dictionaryStore); Assert.assertFalse("Should not read: id range below lower bound (5 < 30)", shouldRead); - shouldRead = new ParquetDictionaryRowGroupFilter(SCHEMA, - lessThanOrEqual("struct_not_null.int_field", INT_MIN_VALUE - 1) - ).shouldRead(parquetSchema, rowGroupMetadata, dictionaryStore); + shouldRead = + new ParquetDictionaryRowGroupFilter( + SCHEMA, lessThanOrEqual("struct_not_null.int_field", INT_MIN_VALUE - 1)) + .shouldRead(parquetSchema, rowGroupMetadata, dictionaryStore); Assert.assertFalse("Should not read: id range below lower bound (29 < 30)", shouldRead); - shouldRead = new ParquetDictionaryRowGroupFilter(SCHEMA, - lessThanOrEqual("struct_not_null.int_field", INT_MIN_VALUE) - ).shouldRead(parquetSchema, rowGroupMetadata, dictionaryStore); + shouldRead = + new ParquetDictionaryRowGroupFilter( + SCHEMA, lessThanOrEqual("struct_not_null.int_field", INT_MIN_VALUE)) + .shouldRead(parquetSchema, rowGroupMetadata, dictionaryStore); Assert.assertTrue("Should read: one possible id", shouldRead); - shouldRead = new ParquetDictionaryRowGroupFilter(SCHEMA, - lessThanOrEqual("struct_not_null.int_field", INT_MAX_VALUE) - ).shouldRead(parquetSchema, rowGroupMetadata, dictionaryStore); + shouldRead = + new ParquetDictionaryRowGroupFilter( + SCHEMA, lessThanOrEqual("struct_not_null.int_field", INT_MAX_VALUE)) + .shouldRead(parquetSchema, rowGroupMetadata, dictionaryStore); Assert.assertTrue("Should read: many possible ids", shouldRead); - } @Test public void testStructFieldGt() { - boolean shouldRead = new ParquetDictionaryRowGroupFilter(SCHEMA, - greaterThan("struct_not_null.int_field", INT_MAX_VALUE + 6) - ).shouldRead(parquetSchema, rowGroupMetadata, dictionaryStore); + boolean shouldRead = + new ParquetDictionaryRowGroupFilter( + SCHEMA, greaterThan("struct_not_null.int_field", INT_MAX_VALUE + 6)) + .shouldRead(parquetSchema, rowGroupMetadata, dictionaryStore); Assert.assertFalse("Should not read: id range above upper bound (85 < 79)", shouldRead); - shouldRead = new ParquetDictionaryRowGroupFilter(SCHEMA, - greaterThan("struct_not_null.int_field", INT_MAX_VALUE) - ).shouldRead(parquetSchema, rowGroupMetadata, dictionaryStore); + shouldRead = + new ParquetDictionaryRowGroupFilter( + SCHEMA, greaterThan("struct_not_null.int_field", INT_MAX_VALUE)) + .shouldRead(parquetSchema, rowGroupMetadata, dictionaryStore); Assert.assertFalse("Should not read: id range above upper bound (79 is not > 79)", shouldRead); - shouldRead = new 
ParquetDictionaryRowGroupFilter(SCHEMA, - greaterThan("struct_not_null.int_field", INT_MAX_VALUE - 1) - ).shouldRead(parquetSchema, rowGroupMetadata, dictionaryStore); + shouldRead = + new ParquetDictionaryRowGroupFilter( + SCHEMA, greaterThan("struct_not_null.int_field", INT_MAX_VALUE - 1)) + .shouldRead(parquetSchema, rowGroupMetadata, dictionaryStore); Assert.assertTrue("Should read: one possible id", shouldRead); - shouldRead = new ParquetDictionaryRowGroupFilter(SCHEMA, - greaterThan("struct_not_null.int_field", INT_MAX_VALUE - 4) - ).shouldRead(parquetSchema, rowGroupMetadata, dictionaryStore); + shouldRead = + new ParquetDictionaryRowGroupFilter( + SCHEMA, greaterThan("struct_not_null.int_field", INT_MAX_VALUE - 4)) + .shouldRead(parquetSchema, rowGroupMetadata, dictionaryStore); Assert.assertTrue("Should read: may possible ids", shouldRead); } @Test public void testStructFieldGtEq() { - boolean shouldRead = new ParquetDictionaryRowGroupFilter(SCHEMA, - greaterThanOrEqual("struct_not_null.int_field", INT_MAX_VALUE + 6) - ).shouldRead(parquetSchema, rowGroupMetadata, dictionaryStore); + boolean shouldRead = + new ParquetDictionaryRowGroupFilter( + SCHEMA, greaterThanOrEqual("struct_not_null.int_field", INT_MAX_VALUE + 6)) + .shouldRead(parquetSchema, rowGroupMetadata, dictionaryStore); Assert.assertFalse("Should not read: id range above upper bound (85 < 79)", shouldRead); - shouldRead = new ParquetDictionaryRowGroupFilter(SCHEMA, - greaterThanOrEqual("struct_not_null.int_field", INT_MAX_VALUE + 1) - ).shouldRead(parquetSchema, rowGroupMetadata, dictionaryStore); + shouldRead = + new ParquetDictionaryRowGroupFilter( + SCHEMA, greaterThanOrEqual("struct_not_null.int_field", INT_MAX_VALUE + 1)) + .shouldRead(parquetSchema, rowGroupMetadata, dictionaryStore); Assert.assertFalse("Should not read: id range above upper bound (80 > 79)", shouldRead); - shouldRead = new ParquetDictionaryRowGroupFilter(SCHEMA, - greaterThanOrEqual("struct_not_null.int_field", INT_MAX_VALUE) - ).shouldRead(parquetSchema, rowGroupMetadata, dictionaryStore); + shouldRead = + new ParquetDictionaryRowGroupFilter( + SCHEMA, greaterThanOrEqual("struct_not_null.int_field", INT_MAX_VALUE)) + .shouldRead(parquetSchema, rowGroupMetadata, dictionaryStore); Assert.assertTrue("Should read: one possible id", shouldRead); - shouldRead = new ParquetDictionaryRowGroupFilter(SCHEMA, - greaterThanOrEqual("struct_not_null.int_field", INT_MAX_VALUE - 4) - ).shouldRead(parquetSchema, rowGroupMetadata, dictionaryStore); + shouldRead = + new ParquetDictionaryRowGroupFilter( + SCHEMA, greaterThanOrEqual("struct_not_null.int_field", INT_MAX_VALUE - 4)) + .shouldRead(parquetSchema, rowGroupMetadata, dictionaryStore); Assert.assertTrue("Should read: may possible ids", shouldRead); } @Test public void testStructFieldEq() { - boolean shouldRead = new ParquetDictionaryRowGroupFilter(SCHEMA, - equal("struct_not_null.int_field", INT_MIN_VALUE - 25) - ).shouldRead(parquetSchema, rowGroupMetadata, dictionaryStore); + boolean shouldRead = + new ParquetDictionaryRowGroupFilter( + SCHEMA, equal("struct_not_null.int_field", INT_MIN_VALUE - 25)) + .shouldRead(parquetSchema, rowGroupMetadata, dictionaryStore); Assert.assertFalse("Should not read: id below lower bound", shouldRead); - shouldRead = new ParquetDictionaryRowGroupFilter(SCHEMA, equal("struct_not_null.int_field", INT_MIN_VALUE - 1)) + shouldRead = + new ParquetDictionaryRowGroupFilter( + SCHEMA, equal("struct_not_null.int_field", INT_MIN_VALUE - 1)) .shouldRead(parquetSchema, 
rowGroupMetadata, dictionaryStore); Assert.assertFalse("Should not read: id below lower bound", shouldRead); - shouldRead = new ParquetDictionaryRowGroupFilter(SCHEMA, equal("struct_not_null.int_field", INT_MIN_VALUE)) + shouldRead = + new ParquetDictionaryRowGroupFilter( + SCHEMA, equal("struct_not_null.int_field", INT_MIN_VALUE)) .shouldRead(parquetSchema, rowGroupMetadata, dictionaryStore); Assert.assertTrue("Should read: id equal to lower bound", shouldRead); - shouldRead = new ParquetDictionaryRowGroupFilter(SCHEMA, equal("struct_not_null.int_field", INT_MAX_VALUE - 4)) + shouldRead = + new ParquetDictionaryRowGroupFilter( + SCHEMA, equal("struct_not_null.int_field", INT_MAX_VALUE - 4)) .shouldRead(parquetSchema, rowGroupMetadata, dictionaryStore); Assert.assertTrue("Should read: id between lower and upper bounds", shouldRead); - shouldRead = new ParquetDictionaryRowGroupFilter(SCHEMA, equal("struct_not_null.int_field", INT_MAX_VALUE)) + shouldRead = + new ParquetDictionaryRowGroupFilter( + SCHEMA, equal("struct_not_null.int_field", INT_MAX_VALUE)) .shouldRead(parquetSchema, rowGroupMetadata, dictionaryStore); Assert.assertTrue("Should read: id equal to upper bound", shouldRead); - shouldRead = new ParquetDictionaryRowGroupFilter(SCHEMA, equal("struct_not_null.int_field", INT_MAX_VALUE + 1)) + shouldRead = + new ParquetDictionaryRowGroupFilter( + SCHEMA, equal("struct_not_null.int_field", INT_MAX_VALUE + 1)) .shouldRead(parquetSchema, rowGroupMetadata, dictionaryStore); Assert.assertFalse("Should not read: id above upper bound", shouldRead); - shouldRead = new ParquetDictionaryRowGroupFilter(SCHEMA, equal("struct_not_null.int_field", INT_MAX_VALUE + 6)) + shouldRead = + new ParquetDictionaryRowGroupFilter( + SCHEMA, equal("struct_not_null.int_field", INT_MAX_VALUE + 6)) .shouldRead(parquetSchema, rowGroupMetadata, dictionaryStore); Assert.assertFalse("Should not read: id above upper bound", shouldRead); } @Test public void testStructFieldNotEq() { - boolean shouldRead = new ParquetDictionaryRowGroupFilter(SCHEMA, - notEqual("struct_not_null.int_field", INT_MIN_VALUE - 25) - ).shouldRead(parquetSchema, rowGroupMetadata, dictionaryStore); + boolean shouldRead = + new ParquetDictionaryRowGroupFilter( + SCHEMA, notEqual("struct_not_null.int_field", INT_MIN_VALUE - 25)) + .shouldRead(parquetSchema, rowGroupMetadata, dictionaryStore); Assert.assertTrue("Should read: id below lower bound", shouldRead); - shouldRead = new ParquetDictionaryRowGroupFilter(SCHEMA, notEqual("struct_not_null.int_field", INT_MIN_VALUE - 1)) - .shouldRead(parquetSchema, rowGroupMetadata, dictionaryStore); + shouldRead = + new ParquetDictionaryRowGroupFilter( + SCHEMA, notEqual("struct_not_null.int_field", INT_MIN_VALUE - 1)) + .shouldRead(parquetSchema, rowGroupMetadata, dictionaryStore); Assert.assertTrue("Should read: id below lower bound", shouldRead); - shouldRead = new ParquetDictionaryRowGroupFilter(SCHEMA, notEqual("struct_not_null.int_field", INT_MIN_VALUE)) - .shouldRead(parquetSchema, rowGroupMetadata, dictionaryStore); + shouldRead = + new ParquetDictionaryRowGroupFilter( + SCHEMA, notEqual("struct_not_null.int_field", INT_MIN_VALUE)) + .shouldRead(parquetSchema, rowGroupMetadata, dictionaryStore); Assert.assertTrue("Should read: id equal to lower bound", shouldRead); - shouldRead = new ParquetDictionaryRowGroupFilter(SCHEMA, notEqual("struct_not_null.int_field", INT_MAX_VALUE - 4)) - .shouldRead(parquetSchema, rowGroupMetadata, dictionaryStore); + shouldRead = + new ParquetDictionaryRowGroupFilter( + 
SCHEMA, notEqual("struct_not_null.int_field", INT_MAX_VALUE - 4)) + .shouldRead(parquetSchema, rowGroupMetadata, dictionaryStore); Assert.assertTrue("Should read: id between lower and upper bounds", shouldRead); - shouldRead = new ParquetDictionaryRowGroupFilter(SCHEMA, notEqual("struct_not_null.int_field", INT_MAX_VALUE)) - .shouldRead(parquetSchema, rowGroupMetadata, dictionaryStore); + shouldRead = + new ParquetDictionaryRowGroupFilter( + SCHEMA, notEqual("struct_not_null.int_field", INT_MAX_VALUE)) + .shouldRead(parquetSchema, rowGroupMetadata, dictionaryStore); Assert.assertTrue("Should read: id equal to upper bound", shouldRead); - shouldRead = new ParquetDictionaryRowGroupFilter(SCHEMA, notEqual("id", INT_MAX_VALUE + 1)) - .shouldRead(parquetSchema, rowGroupMetadata, dictionaryStore); + shouldRead = + new ParquetDictionaryRowGroupFilter(SCHEMA, notEqual("id", INT_MAX_VALUE + 1)) + .shouldRead(parquetSchema, rowGroupMetadata, dictionaryStore); Assert.assertTrue("Should read: id above upper bound", shouldRead); - shouldRead = new ParquetDictionaryRowGroupFilter(SCHEMA, notEqual("struct_not_null.int_field", INT_MAX_VALUE + 6)) - .shouldRead(parquetSchema, rowGroupMetadata, dictionaryStore); + shouldRead = + new ParquetDictionaryRowGroupFilter( + SCHEMA, notEqual("struct_not_null.int_field", INT_MAX_VALUE + 6)) + .shouldRead(parquetSchema, rowGroupMetadata, dictionaryStore); Assert.assertTrue("Should read: id above upper bound", shouldRead); } @Test public void testCaseInsensitive() { - boolean shouldRead = new ParquetDictionaryRowGroupFilter(SCHEMA, notEqual("no_Nulls", ""), false) - .shouldRead(parquetSchema, rowGroupMetadata, dictionaryStore); + boolean shouldRead = + new ParquetDictionaryRowGroupFilter(SCHEMA, notEqual("no_Nulls", ""), false) + .shouldRead(parquetSchema, rowGroupMetadata, dictionaryStore); Assert.assertFalse("Should skip: contains only ''", shouldRead); } @Test public void testMissingDictionaryPageForColumn() { - TestHelpers.assertThrows("Should complain about missing dictionary", - IllegalStateException.class, "Failed to read required dictionary page for id: 5", - () -> new ParquetDictionaryRowGroupFilter(SCHEMA, notEqual("some_nulls", "some")) - .shouldRead(parquetSchema, rowGroupMetadata, descriptor -> null)); + TestHelpers.assertThrows( + "Should complain about missing dictionary", + IllegalStateException.class, + "Failed to read required dictionary page for id: 5", + () -> + new ParquetDictionaryRowGroupFilter(SCHEMA, notEqual("some_nulls", "some")) + .shouldRead(parquetSchema, rowGroupMetadata, descriptor -> null)); } @Test public void testIntegerIn() { - boolean shouldRead = new ParquetDictionaryRowGroupFilter(SCHEMA, in("id", INT_MIN_VALUE - 25, INT_MIN_VALUE - 24)) - .shouldRead(parquetSchema, rowGroupMetadata, dictionaryStore); - Assert.assertFalse("Should not read: id below lower bound (5 < 30, 6 < 30). The two sets are disjoint.", + boolean shouldRead = + new ParquetDictionaryRowGroupFilter( + SCHEMA, in("id", INT_MIN_VALUE - 25, INT_MIN_VALUE - 24)) + .shouldRead(parquetSchema, rowGroupMetadata, dictionaryStore); + Assert.assertFalse( + "Should not read: id below lower bound (5 < 30, 6 < 30). The two sets are disjoint.", shouldRead); - shouldRead = new ParquetDictionaryRowGroupFilter(SCHEMA, in("id", INT_MIN_VALUE - 2, INT_MIN_VALUE - 1)) - .shouldRead(parquetSchema, rowGroupMetadata, dictionaryStore); - Assert.assertFalse("Should not read: id below lower bound (28 < 30, 29 < 30). 
The two sets are disjoint.", + shouldRead = + new ParquetDictionaryRowGroupFilter(SCHEMA, in("id", INT_MIN_VALUE - 2, INT_MIN_VALUE - 1)) + .shouldRead(parquetSchema, rowGroupMetadata, dictionaryStore); + Assert.assertFalse( + "Should not read: id below lower bound (28 < 30, 29 < 30). The two sets are disjoint.", shouldRead); - shouldRead = new ParquetDictionaryRowGroupFilter(SCHEMA, in("id", INT_MIN_VALUE - 1, INT_MIN_VALUE)) - .shouldRead(parquetSchema, rowGroupMetadata, dictionaryStore); + shouldRead = + new ParquetDictionaryRowGroupFilter(SCHEMA, in("id", INT_MIN_VALUE - 1, INT_MIN_VALUE)) + .shouldRead(parquetSchema, rowGroupMetadata, dictionaryStore); Assert.assertTrue("Should read: id equal to lower bound (30 == 30)", shouldRead); - shouldRead = new ParquetDictionaryRowGroupFilter(SCHEMA, in("id", INT_MAX_VALUE - 4, INT_MAX_VALUE - 3)) - .shouldRead(parquetSchema, rowGroupMetadata, dictionaryStore); + shouldRead = + new ParquetDictionaryRowGroupFilter(SCHEMA, in("id", INT_MAX_VALUE - 4, INT_MAX_VALUE - 3)) + .shouldRead(parquetSchema, rowGroupMetadata, dictionaryStore); Assert.assertTrue("Should read: in set is a subset of the dictionary", shouldRead); - shouldRead = new ParquetDictionaryRowGroupFilter(SCHEMA, in("id", INT_MAX_VALUE, INT_MAX_VALUE + 1)) - .shouldRead(parquetSchema, rowGroupMetadata, dictionaryStore); + shouldRead = + new ParquetDictionaryRowGroupFilter(SCHEMA, in("id", INT_MAX_VALUE, INT_MAX_VALUE + 1)) + .shouldRead(parquetSchema, rowGroupMetadata, dictionaryStore); Assert.assertTrue("Should read: id equal to upper bound (79 == 79)", shouldRead); - shouldRead = new ParquetDictionaryRowGroupFilter(SCHEMA, in("id", INT_MAX_VALUE + 1, INT_MAX_VALUE + 2)) - .shouldRead(parquetSchema, rowGroupMetadata, dictionaryStore); + shouldRead = + new ParquetDictionaryRowGroupFilter(SCHEMA, in("id", INT_MAX_VALUE + 1, INT_MAX_VALUE + 2)) + .shouldRead(parquetSchema, rowGroupMetadata, dictionaryStore); Assert.assertFalse("Should not read: id above upper bound (80 > 79, 81 > 79)", shouldRead); - shouldRead = new ParquetDictionaryRowGroupFilter(SCHEMA, in("id", INT_MAX_VALUE + 6, INT_MAX_VALUE + 7)) - .shouldRead(parquetSchema, rowGroupMetadata, dictionaryStore); - Assert.assertFalse("Should not read: id above upper bound (85 > 79, 86 > 79). The two sets are disjoint.", + shouldRead = + new ParquetDictionaryRowGroupFilter(SCHEMA, in("id", INT_MAX_VALUE + 6, INT_MAX_VALUE + 7)) + .shouldRead(parquetSchema, rowGroupMetadata, dictionaryStore); + Assert.assertFalse( + "Should not read: id above upper bound (85 > 79, 86 > 79). 
The two sets are disjoint.", shouldRead); - shouldRead = new ParquetDictionaryRowGroupFilter(SCHEMA, - in("id", IntStream.range(INT_MIN_VALUE - 10, INT_MAX_VALUE + 10).boxed().collect(Collectors.toList())) - ).shouldRead(parquetSchema, rowGroupMetadata, dictionaryStore); + shouldRead = + new ParquetDictionaryRowGroupFilter( + SCHEMA, + in( + "id", + IntStream.range(INT_MIN_VALUE - 10, INT_MAX_VALUE + 10) + .boxed() + .collect(Collectors.toList()))) + .shouldRead(parquetSchema, rowGroupMetadata, dictionaryStore); Assert.assertTrue("Should read: the dictionary is a subset of the in set", shouldRead); - shouldRead = new ParquetDictionaryRowGroupFilter(SCHEMA, - in("id", IntStream.range(INT_MIN_VALUE, INT_MAX_VALUE + 1).boxed().collect(Collectors.toList())) - ).shouldRead(parquetSchema, rowGroupMetadata, dictionaryStore); + shouldRead = + new ParquetDictionaryRowGroupFilter( + SCHEMA, + in( + "id", + IntStream.range(INT_MIN_VALUE, INT_MAX_VALUE + 1) + .boxed() + .collect(Collectors.toList()))) + .shouldRead(parquetSchema, rowGroupMetadata, dictionaryStore); Assert.assertTrue("Should read: the dictionary is equal to the in set", shouldRead); - shouldRead = new ParquetDictionaryRowGroupFilter(SCHEMA, in("all_nulls", 1, 2)) - .shouldRead(parquetSchema, rowGroupMetadata, dictionaryStore); + shouldRead = + new ParquetDictionaryRowGroupFilter(SCHEMA, in("all_nulls", 1, 2)) + .shouldRead(parquetSchema, rowGroupMetadata, dictionaryStore); Assert.assertTrue("Should read: in on all nulls column (isFallback to be true) ", shouldRead); - shouldRead = new ParquetDictionaryRowGroupFilter(SCHEMA, in("some_nulls", "aaa", "some")) - .shouldRead(parquetSchema, rowGroupMetadata, dictionaryStore); + shouldRead = + new ParquetDictionaryRowGroupFilter(SCHEMA, in("some_nulls", "aaa", "some")) + .shouldRead(parquetSchema, rowGroupMetadata, dictionaryStore); Assert.assertTrue("Should read: in on some nulls column", shouldRead); - shouldRead = new ParquetDictionaryRowGroupFilter(SCHEMA, in("some_nulls", "aaa", "bbb")) - .shouldRead(parquetSchema, rowGroupMetadata, dictionaryStore); + shouldRead = + new ParquetDictionaryRowGroupFilter(SCHEMA, in("some_nulls", "aaa", "bbb")) + .shouldRead(parquetSchema, rowGroupMetadata, dictionaryStore); Assert.assertFalse("Should not read: some_nulls values are not within the set", shouldRead); - shouldRead = new ParquetDictionaryRowGroupFilter(SCHEMA, in("no_nulls", "aaa", "bbb")) - .shouldRead(parquetSchema, rowGroupMetadata, dictionaryStore); - Assert.assertFalse("Should not read: in on no nulls column (empty string is not within the set)", shouldRead); + shouldRead = + new ParquetDictionaryRowGroupFilter(SCHEMA, in("no_nulls", "aaa", "bbb")) + .shouldRead(parquetSchema, rowGroupMetadata, dictionaryStore); + Assert.assertFalse( + "Should not read: in on no nulls column (empty string is not within the set)", shouldRead); - shouldRead = new ParquetDictionaryRowGroupFilter(SCHEMA, in("no_nulls", "aaa", "")) - .shouldRead(parquetSchema, rowGroupMetadata, dictionaryStore); - Assert.assertTrue("Should read: in on no nulls column (empty string is within the set)", shouldRead); + shouldRead = + new ParquetDictionaryRowGroupFilter(SCHEMA, in("no_nulls", "aaa", "")) + .shouldRead(parquetSchema, rowGroupMetadata, dictionaryStore); + Assert.assertTrue( + "Should read: in on no nulls column (empty string is within the set)", shouldRead); } @Test public void testIntegerNotIn() { - boolean shouldRead = new ParquetDictionaryRowGroupFilter(SCHEMA, - notIn("id", INT_MIN_VALUE - 25, INT_MIN_VALUE - 
24) - ).shouldRead(parquetSchema, rowGroupMetadata, dictionaryStore); - Assert.assertTrue("Should read: id below lower bound (5 < 30, 6 < 30). The two sets are disjoint.", shouldRead); + boolean shouldRead = + new ParquetDictionaryRowGroupFilter( + SCHEMA, notIn("id", INT_MIN_VALUE - 25, INT_MIN_VALUE - 24)) + .shouldRead(parquetSchema, rowGroupMetadata, dictionaryStore); + Assert.assertTrue( + "Should read: id below lower bound (5 < 30, 6 < 30). The two sets are disjoint.", + shouldRead); - shouldRead = new ParquetDictionaryRowGroupFilter(SCHEMA, notIn("id", INT_MIN_VALUE - 2, INT_MIN_VALUE - 1)) - .shouldRead(parquetSchema, rowGroupMetadata, dictionaryStore); - Assert.assertTrue("Should read: id below lower bound (28 < 30, 29 < 30). The two sets are disjoint.", shouldRead); + shouldRead = + new ParquetDictionaryRowGroupFilter( + SCHEMA, notIn("id", INT_MIN_VALUE - 2, INT_MIN_VALUE - 1)) + .shouldRead(parquetSchema, rowGroupMetadata, dictionaryStore); + Assert.assertTrue( + "Should read: id below lower bound (28 < 30, 29 < 30). The two sets are disjoint.", + shouldRead); - shouldRead = new ParquetDictionaryRowGroupFilter(SCHEMA, notIn("id", INT_MIN_VALUE - 1, INT_MIN_VALUE)) - .shouldRead(parquetSchema, rowGroupMetadata, dictionaryStore); + shouldRead = + new ParquetDictionaryRowGroupFilter(SCHEMA, notIn("id", INT_MIN_VALUE - 1, INT_MIN_VALUE)) + .shouldRead(parquetSchema, rowGroupMetadata, dictionaryStore); Assert.assertTrue("Should read: id equal to lower bound (30 == 30)", shouldRead); - shouldRead = new ParquetDictionaryRowGroupFilter(SCHEMA, notIn("id", INT_MAX_VALUE - 4, INT_MAX_VALUE - 3)) - .shouldRead(parquetSchema, rowGroupMetadata, dictionaryStore); + shouldRead = + new ParquetDictionaryRowGroupFilter( + SCHEMA, notIn("id", INT_MAX_VALUE - 4, INT_MAX_VALUE - 3)) + .shouldRead(parquetSchema, rowGroupMetadata, dictionaryStore); Assert.assertTrue("Should read: the notIn set is a subset of the dictionary", shouldRead); - shouldRead = new ParquetDictionaryRowGroupFilter(SCHEMA, notIn("id", INT_MAX_VALUE, INT_MAX_VALUE + 1)) - .shouldRead(parquetSchema, rowGroupMetadata, dictionaryStore); + shouldRead = + new ParquetDictionaryRowGroupFilter(SCHEMA, notIn("id", INT_MAX_VALUE, INT_MAX_VALUE + 1)) + .shouldRead(parquetSchema, rowGroupMetadata, dictionaryStore); Assert.assertTrue("Should read: id equal to upper bound (79 == 79)", shouldRead); - shouldRead = new ParquetDictionaryRowGroupFilter(SCHEMA, notIn("id", INT_MAX_VALUE + 1, INT_MAX_VALUE + 2)) - .shouldRead(parquetSchema, rowGroupMetadata, dictionaryStore); - Assert.assertTrue("Should read: id above upper bound (80 > 79, 81 > 79). The two sets are disjoint.", shouldRead); + shouldRead = + new ParquetDictionaryRowGroupFilter( + SCHEMA, notIn("id", INT_MAX_VALUE + 1, INT_MAX_VALUE + 2)) + .shouldRead(parquetSchema, rowGroupMetadata, dictionaryStore); + Assert.assertTrue( + "Should read: id above upper bound (80 > 79, 81 > 79). The two sets are disjoint.", + shouldRead); - shouldRead = new ParquetDictionaryRowGroupFilter(SCHEMA, notIn("id", INT_MAX_VALUE + 6, INT_MAX_VALUE + 7)) - .shouldRead(parquetSchema, rowGroupMetadata, dictionaryStore); - Assert.assertTrue("Should read: id above upper bound (85 > 79, 86 > 79). The two sets are disjoint.", shouldRead); + shouldRead = + new ParquetDictionaryRowGroupFilter( + SCHEMA, notIn("id", INT_MAX_VALUE + 6, INT_MAX_VALUE + 7)) + .shouldRead(parquetSchema, rowGroupMetadata, dictionaryStore); + Assert.assertTrue( + "Should read: id above upper bound (85 > 79, 86 > 79). 
The two sets are disjoint.", + shouldRead); - shouldRead = new ParquetDictionaryRowGroupFilter(SCHEMA, - notIn("id", IntStream.range(INT_MIN_VALUE - 10, INT_MAX_VALUE + 10).boxed().collect(Collectors.toList())) - ).shouldRead(parquetSchema, rowGroupMetadata, dictionaryStore); + shouldRead = + new ParquetDictionaryRowGroupFilter( + SCHEMA, + notIn( + "id", + IntStream.range(INT_MIN_VALUE - 10, INT_MAX_VALUE + 10) + .boxed() + .collect(Collectors.toList()))) + .shouldRead(parquetSchema, rowGroupMetadata, dictionaryStore); Assert.assertFalse("Should not read: the dictionary is a subset of the notIn set", shouldRead); - shouldRead = new ParquetDictionaryRowGroupFilter(SCHEMA, - notIn("id", IntStream.range(INT_MIN_VALUE, INT_MAX_VALUE + 1).boxed().collect(Collectors.toList())) - ).shouldRead(parquetSchema, rowGroupMetadata, dictionaryStore); + shouldRead = + new ParquetDictionaryRowGroupFilter( + SCHEMA, + notIn( + "id", + IntStream.range(INT_MIN_VALUE, INT_MAX_VALUE + 1) + .boxed() + .collect(Collectors.toList()))) + .shouldRead(parquetSchema, rowGroupMetadata, dictionaryStore); Assert.assertFalse("Should not read: the dictionary is equal to the notIn set", shouldRead); - shouldRead = new ParquetDictionaryRowGroupFilter(SCHEMA, notIn("all_nulls", 1, 2)) - .shouldRead(parquetSchema, rowGroupMetadata, dictionaryStore); + shouldRead = + new ParquetDictionaryRowGroupFilter(SCHEMA, notIn("all_nulls", 1, 2)) + .shouldRead(parquetSchema, rowGroupMetadata, dictionaryStore); Assert.assertTrue("Should read: notIn on all nulls column", shouldRead); - shouldRead = new ParquetDictionaryRowGroupFilter(SCHEMA, notIn("some_nulls", "aaa", "bbb")) - .shouldRead(parquetSchema, rowGroupMetadata, dictionaryStore); - Assert.assertTrue("Should read: notIn on some nulls column (any null matches the notIn)", shouldRead); + shouldRead = + new ParquetDictionaryRowGroupFilter(SCHEMA, notIn("some_nulls", "aaa", "bbb")) + .shouldRead(parquetSchema, rowGroupMetadata, dictionaryStore); + Assert.assertTrue( + "Should read: notIn on some nulls column (any null matches the notIn)", shouldRead); - shouldRead = new ParquetDictionaryRowGroupFilter(SCHEMA, notIn("no_nulls", "aaa", "bbb")) - .shouldRead(parquetSchema, rowGroupMetadata, dictionaryStore); - Assert.assertTrue("Should read: notIn on no nulls column (empty string is not within the set)", shouldRead); + shouldRead = + new ParquetDictionaryRowGroupFilter(SCHEMA, notIn("no_nulls", "aaa", "bbb")) + .shouldRead(parquetSchema, rowGroupMetadata, dictionaryStore); + Assert.assertTrue( + "Should read: notIn on no nulls column (empty string is not within the set)", shouldRead); - shouldRead = new ParquetDictionaryRowGroupFilter(SCHEMA, notIn("no_nulls", "aaa", "")) - .shouldRead(parquetSchema, rowGroupMetadata, dictionaryStore); - Assert.assertFalse("Should not read: notIn on no nulls column (empty string is within the set)", shouldRead); + shouldRead = + new ParquetDictionaryRowGroupFilter(SCHEMA, notIn("no_nulls", "aaa", "")) + .shouldRead(parquetSchema, rowGroupMetadata, dictionaryStore); + Assert.assertFalse( + "Should not read: notIn on no nulls column (empty string is within the set)", shouldRead); } @Test public void testTypePromotion() { Schema promotedSchema = new Schema(required(1, "id", LongType.get())); - boolean shouldRead = new ParquetDictionaryRowGroupFilter(promotedSchema, equal("id", INT_MIN_VALUE + 1), true) - .shouldRead(parquetSchema, rowGroupMetadata, dictionaryStore); + boolean shouldRead = + new ParquetDictionaryRowGroupFilter(promotedSchema, equal("id", 
INT_MIN_VALUE + 1), true) + .shouldRead(parquetSchema, rowGroupMetadata, dictionaryStore); Assert.assertTrue("Should succeed with promoted schema", shouldRead); } @Test public void testFixedLenByteArray() { - // This test is to validate the handling of FIXED_LEN_BYTE_ARRAY Parquet type being dictionary encoded. + // This test is to validate the handling of FIXED_LEN_BYTE_ARRAY Parquet type being dictionary + // encoded. // (No need to validate all the possible predicates) - Assume.assumeTrue("decimal_fixed is not dictionary encoded in case of writer version " + writerVersion, - getColumnForName(rowGroupMetadata, "_decimal_fixed").getEncodings().contains(Encoding.RLE_DICTIONARY)); + Assume.assumeTrue( + "decimal_fixed is not dictionary encoded in case of writer version " + writerVersion, + getColumnForName(rowGroupMetadata, "_decimal_fixed") + .getEncodings() + .contains(Encoding.RLE_DICTIONARY)); - boolean shouldRead = new ParquetDictionaryRowGroupFilter(SCHEMA, - greaterThanOrEqual("decimal_fixed", BigDecimal.ZERO)).shouldRead(parquetSchema, rowGroupMetadata, - dictionaryStore); - Assert.assertTrue("Should read: Half of the decimal_fixed values are greater than 0", shouldRead); + boolean shouldRead = + new ParquetDictionaryRowGroupFilter( + SCHEMA, greaterThanOrEqual("decimal_fixed", BigDecimal.ZERO)) + .shouldRead(parquetSchema, rowGroupMetadata, dictionaryStore); + Assert.assertTrue( + "Should read: Half of the decimal_fixed values are greater than 0", shouldRead); - shouldRead = new ParquetDictionaryRowGroupFilter(SCHEMA, lessThan("decimal_fixed", DECIMAL_MIN_VALUE)) - .shouldRead(parquetSchema, rowGroupMetadata, dictionaryStore); - Assert.assertFalse("Should not read: No decimal_fixed values less than -1234567890.0987654321", shouldRead); + shouldRead = + new ParquetDictionaryRowGroupFilter(SCHEMA, lessThan("decimal_fixed", DECIMAL_MIN_VALUE)) + .shouldRead(parquetSchema, rowGroupMetadata, dictionaryStore); + Assert.assertFalse( + "Should not read: No decimal_fixed values less than -1234567890.0987654321", shouldRead); } private ColumnChunkMetaData getColumnForName(BlockMetaData rowGroup, String columnName) { diff --git a/parquet/src/test/java/org/apache/iceberg/parquet/TestParquet.java b/parquet/src/test/java/org/apache/iceberg/parquet/TestParquet.java index 88cdfd57f7f6..1762802fd527 100644 --- a/parquet/src/test/java/org/apache/iceberg/parquet/TestParquet.java +++ b/parquet/src/test/java/org/apache/iceberg/parquet/TestParquet.java @@ -16,9 +16,16 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.parquet; +import static org.apache.iceberg.Files.localInput; +import static org.apache.iceberg.TableProperties.PARQUET_ROW_GROUP_CHECK_MAX_RECORD_COUNT; +import static org.apache.iceberg.TableProperties.PARQUET_ROW_GROUP_CHECK_MIN_RECORD_COUNT; +import static org.apache.iceberg.TableProperties.PARQUET_ROW_GROUP_SIZE_BYTES; +import static org.apache.iceberg.parquet.ParquetWritingTestUtils.createTempFile; +import static org.apache.iceberg.parquet.ParquetWritingTestUtils.write; +import static org.apache.iceberg.types.Types.NestedField.optional; + import java.io.File; import java.io.IOException; import java.nio.ByteBuffer; @@ -47,18 +54,9 @@ import org.junit.Test; import org.junit.rules.TemporaryFolder; -import static org.apache.iceberg.Files.localInput; -import static org.apache.iceberg.TableProperties.PARQUET_ROW_GROUP_CHECK_MAX_RECORD_COUNT; -import static org.apache.iceberg.TableProperties.PARQUET_ROW_GROUP_CHECK_MIN_RECORD_COUNT; -import static org.apache.iceberg.TableProperties.PARQUET_ROW_GROUP_SIZE_BYTES; -import static org.apache.iceberg.parquet.ParquetWritingTestUtils.createTempFile; -import static org.apache.iceberg.parquet.ParquetWritingTestUtils.write; -import static org.apache.iceberg.types.Types.NestedField.optional; - public class TestParquet { - @Rule - public TemporaryFolder temp = new TemporaryFolder(); + @Rule public TemporaryFolder temp = new TemporaryFolder(); @Test public void testRowGroupSizeConfigurable() throws IOException { @@ -68,7 +66,8 @@ public void testRowGroupSizeConfigurable() throws IOException { // as default PARQUET_ROW_GROUP_CHECK_MIN_RECORD_COUNT is 100. File parquetFile = generateFile(null, 101, 4 * Integer.BYTES, null, null).first(); - try (ParquetFileReader reader = ParquetFileReader.open(ParquetIO.file(localInput(parquetFile)))) { + try (ParquetFileReader reader = + ParquetFileReader.open(ParquetIO.file(localInput(parquetFile)))) { Assert.assertEquals(2, reader.getRowGroups().size()); } } @@ -79,18 +78,18 @@ public void testRowGroupSizeConfigurableWithWriter() throws IOException { // and PARQUET_ROW_GROUP_CHECK_MAX_RECORD_COUNT configs. // We should just need to write 5 integers (20 bytes) // to create two row groups with row group size configured at 16 bytes. 
- File parquetFile = generateFile(ParquetAvroWriter::buildWriter, 5, 4 * Integer.BYTES, 1, 2).first(); + File parquetFile = + generateFile(ParquetAvroWriter::buildWriter, 5, 4 * Integer.BYTES, 1, 2).first(); - try (ParquetFileReader reader = ParquetFileReader.open(ParquetIO.file(localInput(parquetFile)))) { + try (ParquetFileReader reader = + ParquetFileReader.open(ParquetIO.file(localInput(parquetFile)))) { Assert.assertEquals(2, reader.getRowGroups().size()); } } @Test public void testNumberOfBytesWritten() throws IOException { - Schema schema = new Schema( - optional(1, "intCol", IntegerType.get()) - ); + Schema schema = new Schema(optional(1, "intCol", IntegerType.get())); // this value was specifically derived to reproduce iss1980 // record count grow factor is 10000 (hardcoded) @@ -108,8 +107,13 @@ public void testNumberOfBytesWritten() throws IOException { records.add(record); } - long actualSize = write(file, schema, Collections.emptyMap(), ParquetAvroWriter::buildWriter, - records.toArray(new GenericData.Record[]{})); + long actualSize = + write( + file, + schema, + Collections.emptyMap(), + ParquetAvroWriter::buildWriter, + records.toArray(new GenericData.Record[] {})); long expectedSize = ParquetIO.file(localInput(file)).getLength(); Assert.assertEquals(expectedSize, actualSize); @@ -117,21 +121,22 @@ public void testNumberOfBytesWritten() throws IOException { @Test public void testTwoLevelList() throws IOException { - Schema schema = new Schema( - optional(1, "arraybytes", Types.ListType.ofRequired(3, Types.BinaryType.get())), - optional(2, "topbytes", Types.BinaryType.get()) - ); + Schema schema = + new Schema( + optional(1, "arraybytes", Types.ListType.ofRequired(3, Types.BinaryType.get())), + optional(2, "topbytes", Types.BinaryType.get())); org.apache.avro.Schema avroSchema = AvroSchemaUtil.convert(schema.asStruct()); File testFile = temp.newFile(); Assert.assertTrue(testFile.delete()); - ParquetWriter writer = AvroParquetWriter.builder(new Path(testFile.toURI())) - .withDataModel(GenericData.get()) - .withSchema(avroSchema) - .config("parquet.avro.add-list-element-records", "true") - .config("parquet.avro.write-old-list-structure", "true") - .build(); + ParquetWriter writer = + AvroParquetWriter.builder(new Path(testFile.toURI())) + .withDataModel(GenericData.get()) + .withSchema(avroSchema) + .config("parquet.avro.add-list-element-records", "true") + .config("parquet.avro.write-old-list-structure", "true") + .build(); GenericRecordBuilder recordBuilder = new GenericRecordBuilder(avroSchema); List expectedByteList = Lists.newArrayList(); @@ -145,32 +150,34 @@ public void testTwoLevelList() throws IOException { writer.write(expectedRecord); writer.close(); - GenericData.Record recordRead = Iterables.getOnlyElement(Parquet.read(Files.localInput(testFile)) - .project(schema) - .callInit() - .build()); + GenericData.Record recordRead = + Iterables.getOnlyElement( + Parquet.read(Files.localInput(testFile)).project(schema).callInit().build()); Assert.assertEquals(expectedByteList, recordRead.get("arraybytes")); Assert.assertEquals(expectedBinary, recordRead.get("topbytes")); } private Pair generateFile( - Function> createWriterFunc, int desiredRecordCount, - Integer rowGroupSizeBytes, Integer minCheckRecordCount, Integer maxCheckRecordCount) + Function> createWriterFunc, + int desiredRecordCount, + Integer rowGroupSizeBytes, + Integer minCheckRecordCount, + Integer maxCheckRecordCount) throws IOException { - Schema schema = new Schema( - optional(1, "intCol", IntegerType.get()) 
- ); + Schema schema = new Schema(optional(1, "intCol", IntegerType.get())); ImmutableMap.Builder propsBuilder = ImmutableMap.builder(); if (rowGroupSizeBytes != null) { propsBuilder.put(PARQUET_ROW_GROUP_SIZE_BYTES, Integer.toString(rowGroupSizeBytes)); } if (minCheckRecordCount != null) { - propsBuilder.put(PARQUET_ROW_GROUP_CHECK_MIN_RECORD_COUNT, Integer.toString(minCheckRecordCount)); + propsBuilder.put( + PARQUET_ROW_GROUP_CHECK_MIN_RECORD_COUNT, Integer.toString(minCheckRecordCount)); } if (maxCheckRecordCount != null) { - propsBuilder.put(PARQUET_ROW_GROUP_CHECK_MAX_RECORD_COUNT, Integer.toString(maxCheckRecordCount)); + propsBuilder.put( + PARQUET_ROW_GROUP_CHECK_MAX_RECORD_COUNT, Integer.toString(maxCheckRecordCount)); } List records = Lists.newArrayListWithCapacity(desiredRecordCount); @@ -182,11 +189,13 @@ private Pair generateFile( } File file = createTempFile(temp); - long size = write(file, - schema, - propsBuilder.build(), - createWriterFunc, - records.toArray(new GenericData.Record[]{})); + long size = + write( + file, + schema, + propsBuilder.build(), + createWriterFunc, + records.toArray(new GenericData.Record[] {})); return Pair.of(file, size); } } diff --git a/parquet/src/test/java/org/apache/iceberg/parquet/TestParquetDataWriter.java b/parquet/src/test/java/org/apache/iceberg/parquet/TestParquetDataWriter.java index 1db7da732c30..728488329585 100644 --- a/parquet/src/test/java/org/apache/iceberg/parquet/TestParquetDataWriter.java +++ b/parquet/src/test/java/org/apache/iceberg/parquet/TestParquetDataWriter.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.parquet; import java.io.IOException; @@ -51,15 +50,15 @@ import org.junit.rules.TemporaryFolder; public class TestParquetDataWriter { - private static final Schema SCHEMA = new Schema( - Types.NestedField.required(1, "id", Types.LongType.get()), - Types.NestedField.optional(2, "data", Types.StringType.get()), - Types.NestedField.optional(3, "binary", Types.BinaryType.get())); + private static final Schema SCHEMA = + new Schema( + Types.NestedField.required(1, "id", Types.LongType.get()), + Types.NestedField.optional(2, "data", Types.StringType.get()), + Types.NestedField.optional(3, "binary", Types.BinaryType.get())); private List records; - @Rule - public TemporaryFolder temp = new TemporaryFolder(); + @Rule public TemporaryFolder temp = new TemporaryFolder(); @Before public void createRecords() { @@ -79,18 +78,16 @@ public void createRecords() { public void testDataWriter() throws IOException { OutputFile file = Files.localOutput(temp.newFile()); - SortOrder sortOrder = SortOrder.builderFor(SCHEMA) - .withOrderId(10) - .asc("id") - .build(); + SortOrder sortOrder = SortOrder.builderFor(SCHEMA).withOrderId(10).asc("id").build(); - DataWriter dataWriter = Parquet.writeData(file) - .schema(SCHEMA) - .createWriterFunc(GenericParquetWriter::buildWriter) - .overwrite() - .withSpec(PartitionSpec.unpartitioned()) - .withSortOrder(sortOrder) - .build(); + DataWriter dataWriter = + Parquet.writeData(file) + .schema(SCHEMA) + .createWriterFunc(GenericParquetWriter::buildWriter) + .overwrite() + .withSpec(PartitionSpec.unpartitioned()) + .withSortOrder(sortOrder) + .build(); try { for (Record record : records) { @@ -106,14 +103,16 @@ public void testDataWriter() throws IOException { Assert.assertEquals("Should be data file", FileContent.DATA, dataFile.content()); Assert.assertEquals("Record count should match", records.size(), 
dataFile.recordCount()); Assert.assertEquals("Partition should be empty", 0, dataFile.partition().size()); - Assert.assertEquals("Sort order should match", sortOrder.orderId(), (int) dataFile.sortOrderId()); + Assert.assertEquals( + "Sort order should match", sortOrder.orderId(), (int) dataFile.sortOrderId()); Assert.assertNull("Key metadata should be null", dataFile.keyMetadata()); List writtenRecords; - try (CloseableIterable reader = Parquet.read(file.toInputFile()) - .project(SCHEMA) - .createReaderFunc(fileSchema -> GenericParquetReaders.buildReader(SCHEMA, fileSchema)) - .build()) { + try (CloseableIterable reader = + Parquet.read(file.toInputFile()) + .project(SCHEMA) + .createReaderFunc(fileSchema -> GenericParquetReaders.buildReader(SCHEMA, fileSchema)) + .build()) { writtenRecords = Lists.newArrayList(reader); } @@ -125,19 +124,27 @@ public void testDataWriter() throws IOException { public void testInvalidUpperBoundString() throws Exception { OutputFile file = Files.localOutput(temp.newFile()); - Table testTable = TestTables.create(temp.newFile(), "test_invalid_string_bound", - SCHEMA, PartitionSpec.unpartitioned(), SortOrder.unsorted(), 2); - testTable.updateProperties() + Table testTable = + TestTables.create( + temp.newFile(), + "test_invalid_string_bound", + SCHEMA, + PartitionSpec.unpartitioned(), + SortOrder.unsorted(), + 2); + testTable + .updateProperties() .set(TableProperties.DEFAULT_WRITE_METRICS_MODE, "truncate(16)") .commit(); - DataWriter dataWriter = Parquet.writeData(file) - .metricsConfig(MetricsConfig.forTable(testTable)) - .schema(SCHEMA) - .createWriterFunc(GenericParquetWriter::buildWriter) - .overwrite() - .withSpec(PartitionSpec.unpartitioned()) - .build(); + DataWriter dataWriter = + Parquet.writeData(file) + .metricsConfig(MetricsConfig.forTable(testTable)) + .schema(SCHEMA) + .createWriterFunc(GenericParquetWriter::buildWriter) + .overwrite() + .withSpec(PartitionSpec.unpartitioned()) + .build(); // These high code points cause an overflow GenericRecord genericRecord = GenericRecord.create(SCHEMA); @@ -162,15 +169,17 @@ public void testInvalidUpperBoundString() throws Exception { Assert.assertEquals("Format should be Parquet", FileFormat.PARQUET, dataFile.format()); Assert.assertEquals("Should be data file", FileContent.DATA, dataFile.content()); - Assert.assertEquals("Record count should match", overflowRecords.size(), dataFile.recordCount()); + Assert.assertEquals( + "Record count should match", overflowRecords.size(), dataFile.recordCount()); Assert.assertEquals("Partition should be empty", 0, dataFile.partition().size()); Assert.assertNull("Key metadata should be null", dataFile.keyMetadata()); List writtenRecords; - try (CloseableIterable reader = Parquet.read(file.toInputFile()) - .project(SCHEMA) - .createReaderFunc(fileSchema -> GenericParquetReaders.buildReader(SCHEMA, fileSchema)) - .build()) { + try (CloseableIterable reader = + Parquet.read(file.toInputFile()) + .project(SCHEMA) + .createReaderFunc(fileSchema -> GenericParquetReaders.buildReader(SCHEMA, fileSchema)) + .build()) { writtenRecords = Lists.newArrayList(reader); } @@ -187,19 +196,27 @@ public void testInvalidUpperBoundString() throws Exception { public void testInvalidUpperBoundBinary() throws Exception { OutputFile file = Files.localOutput(temp.newFile()); - Table testTable = TestTables.create(temp.newFile(), "test_invalid_binary_bound", - SCHEMA, PartitionSpec.unpartitioned(), SortOrder.unsorted(), 2); - testTable.updateProperties() + Table testTable = + TestTables.create( + 
temp.newFile(), + "test_invalid_binary_bound", + SCHEMA, + PartitionSpec.unpartitioned(), + SortOrder.unsorted(), + 2); + testTable + .updateProperties() .set(TableProperties.DEFAULT_WRITE_METRICS_MODE, "truncate(16)") .commit(); - DataWriter dataWriter = Parquet.writeData(file) - .metricsConfig(MetricsConfig.forTable(testTable)) - .schema(SCHEMA) - .createWriterFunc(GenericParquetWriter::buildWriter) - .overwrite() - .withSpec(PartitionSpec.unpartitioned()) - .build(); + DataWriter dataWriter = + Parquet.writeData(file) + .metricsConfig(MetricsConfig.forTable(testTable)) + .schema(SCHEMA) + .createWriterFunc(GenericParquetWriter::buildWriter) + .overwrite() + .withSpec(PartitionSpec.unpartitioned()) + .build(); // This max binary value causes an overflow GenericRecord genericRecord = GenericRecord.create(SCHEMA); @@ -208,8 +225,7 @@ public void testInvalidUpperBoundBinary() throws Exception { for (int i = 0; i < 17; i++) { bytes.put(i, (byte) 0xff); } - builder.add(genericRecord.copy(ImmutableMap.of("id", 1L, "binary", - bytes))); + builder.add(genericRecord.copy(ImmutableMap.of("id", 1L, "binary", bytes))); List overflowRecords = builder.build(); try { @@ -224,15 +240,17 @@ public void testInvalidUpperBoundBinary() throws Exception { Assert.assertEquals("Format should be Parquet", FileFormat.PARQUET, dataFile.format()); Assert.assertEquals("Should be data file", FileContent.DATA, dataFile.content()); - Assert.assertEquals("Record count should match", overflowRecords.size(), dataFile.recordCount()); + Assert.assertEquals( + "Record count should match", overflowRecords.size(), dataFile.recordCount()); Assert.assertEquals("Partition should be empty", 0, dataFile.partition().size()); Assert.assertNull("Key metadata should be null", dataFile.keyMetadata()); List writtenRecords; - try (CloseableIterable reader = Parquet.read(file.toInputFile()) - .project(SCHEMA) - .createReaderFunc(fileSchema -> GenericParquetReaders.buildReader(SCHEMA, fileSchema)) - .build()) { + try (CloseableIterable reader = + Parquet.read(file.toInputFile()) + .project(SCHEMA) + .createReaderFunc(fileSchema -> GenericParquetReaders.buildReader(SCHEMA, fileSchema)) + .build()) { writtenRecords = Lists.newArrayList(reader); } diff --git a/parquet/src/test/java/org/apache/iceberg/parquet/TestParquetDeleteWriters.java b/parquet/src/test/java/org/apache/iceberg/parquet/TestParquetDeleteWriters.java index 9af07199b7d7..df10f7fac048 100644 --- a/parquet/src/test/java/org/apache/iceberg/parquet/TestParquetDeleteWriters.java +++ b/parquet/src/test/java/org/apache/iceberg/parquet/TestParquetDeleteWriters.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.parquet; import java.io.File; @@ -49,14 +48,14 @@ import org.junit.rules.TemporaryFolder; public class TestParquetDeleteWriters { - private static final Schema SCHEMA = new Schema( - NestedField.required(1, "id", Types.LongType.get()), - NestedField.optional(2, "data", Types.StringType.get())); + private static final Schema SCHEMA = + new Schema( + NestedField.required(1, "id", Types.LongType.get()), + NestedField.optional(2, "data", Types.StringType.get())); private List records; - @Rule - public TemporaryFolder temp = new TemporaryFolder(); + @Rule public TemporaryFolder temp = new TemporaryFolder(); @Before public void createDeleteRecords() { @@ -77,13 +76,14 @@ public void testEqualityDeleteWriter() throws IOException { File deleteFile = temp.newFile(); OutputFile out = Files.localOutput(deleteFile); - EqualityDeleteWriter deleteWriter = Parquet.writeDeletes(out) - .createWriterFunc(GenericParquetWriter::buildWriter) - .overwrite() - .rowSchema(SCHEMA) - .withSpec(PartitionSpec.unpartitioned()) - .equalityFieldIds(1) - .buildEqualityWriter(); + EqualityDeleteWriter deleteWriter = + Parquet.writeDeletes(out) + .createWriterFunc(GenericParquetWriter::buildWriter) + .overwrite() + .rowSchema(SCHEMA) + .withSpec(PartitionSpec.unpartitioned()) + .equalityFieldIds(1) + .buildEqualityWriter(); try (EqualityDeleteWriter writer = deleteWriter) { writer.deleteAll(records); @@ -91,16 +91,18 @@ public void testEqualityDeleteWriter() throws IOException { DeleteFile metadata = deleteWriter.toDeleteFile(); Assert.assertEquals("Format should be Parquet", FileFormat.PARQUET, metadata.format()); - Assert.assertEquals("Should be equality deletes", FileContent.EQUALITY_DELETES, metadata.content()); + Assert.assertEquals( + "Should be equality deletes", FileContent.EQUALITY_DELETES, metadata.content()); Assert.assertEquals("Record count should be correct", records.size(), metadata.recordCount()); Assert.assertEquals("Partition should be empty", 0, metadata.partition().size()); Assert.assertNull("Key metadata should be null", metadata.keyMetadata()); List deletedRecords; - try (CloseableIterable reader = Parquet.read(out.toInputFile()) - .project(SCHEMA) - .createReaderFunc(fileSchema -> GenericParquetReaders.buildReader(SCHEMA, fileSchema)) - .build()) { + try (CloseableIterable reader = + Parquet.read(out.toInputFile()) + .project(SCHEMA) + .createReaderFunc(fileSchema -> GenericParquetReaders.buildReader(SCHEMA, fileSchema)) + .build()) { deletedRecords = Lists.newArrayList(reader); } @@ -111,99 +113,110 @@ public void testEqualityDeleteWriter() throws IOException { public void testPositionDeleteWriter() throws IOException { File deleteFile = temp.newFile(); - Schema deleteSchema = new Schema( - MetadataColumns.DELETE_FILE_PATH, - MetadataColumns.DELETE_FILE_POS, - NestedField.optional(MetadataColumns.DELETE_FILE_ROW_FIELD_ID, "row", SCHEMA.asStruct())); + Schema deleteSchema = + new Schema( + MetadataColumns.DELETE_FILE_PATH, + MetadataColumns.DELETE_FILE_POS, + NestedField.optional( + MetadataColumns.DELETE_FILE_ROW_FIELD_ID, "row", SCHEMA.asStruct())); String deletePath = "s3://bucket/path/file.parquet"; GenericRecord posDelete = GenericRecord.create(deleteSchema); List expectedDeleteRecords = Lists.newArrayList(); OutputFile out = Files.localOutput(deleteFile); - PositionDeleteWriter deleteWriter = Parquet.writeDeletes(out) - .createWriterFunc(GenericParquetWriter::buildWriter) - .overwrite() - .rowSchema(SCHEMA) - .withSpec(PartitionSpec.unpartitioned()) - 
.buildPositionWriter(); + PositionDeleteWriter deleteWriter = + Parquet.writeDeletes(out) + .createWriterFunc(GenericParquetWriter::buildWriter) + .overwrite() + .rowSchema(SCHEMA) + .withSpec(PartitionSpec.unpartitioned()) + .buildPositionWriter(); try (PositionDeleteWriter writer = deleteWriter) { for (int i = 0; i < records.size(); i += 1) { int pos = i * 3 + 2; writer.delete(deletePath, pos, records.get(i)); - expectedDeleteRecords.add(posDelete.copy(ImmutableMap.of( - "file_path", deletePath, - "pos", (long) pos, - "row", records.get(i)))); + expectedDeleteRecords.add( + posDelete.copy( + ImmutableMap.of( + "file_path", deletePath, "pos", (long) pos, "row", records.get(i)))); } } DeleteFile metadata = deleteWriter.toDeleteFile(); Assert.assertEquals("Format should be Parquet", FileFormat.PARQUET, metadata.format()); - Assert.assertEquals("Should be position deletes", FileContent.POSITION_DELETES, metadata.content()); + Assert.assertEquals( + "Should be position deletes", FileContent.POSITION_DELETES, metadata.content()); Assert.assertEquals("Record count should be correct", records.size(), metadata.recordCount()); Assert.assertEquals("Partition should be empty", 0, metadata.partition().size()); Assert.assertNull("Key metadata should be null", metadata.keyMetadata()); List deletedRecords; - try (CloseableIterable reader = Parquet.read(out.toInputFile()) - .project(deleteSchema) - .createReaderFunc(fileSchema -> GenericParquetReaders.buildReader(deleteSchema, fileSchema)) - .build()) { + try (CloseableIterable reader = + Parquet.read(out.toInputFile()) + .project(deleteSchema) + .createReaderFunc( + fileSchema -> GenericParquetReaders.buildReader(deleteSchema, fileSchema)) + .build()) { deletedRecords = Lists.newArrayList(reader); } - Assert.assertEquals("Deleted records should match expected", expectedDeleteRecords, deletedRecords); + Assert.assertEquals( + "Deleted records should match expected", expectedDeleteRecords, deletedRecords); } @Test public void testPositionDeleteWriterWithEmptyRow() throws IOException { File deleteFile = temp.newFile(); - Schema deleteSchema = new Schema( - MetadataColumns.DELETE_FILE_PATH, - MetadataColumns.DELETE_FILE_POS); + Schema deleteSchema = + new Schema(MetadataColumns.DELETE_FILE_PATH, MetadataColumns.DELETE_FILE_POS); String deletePath = "s3://bucket/path/file.parquet"; GenericRecord posDelete = GenericRecord.create(deleteSchema); List expectedDeleteRecords = Lists.newArrayList(); OutputFile out = Files.localOutput(deleteFile); - PositionDeleteWriter deleteWriter = Parquet.writeDeletes(out) - .createWriterFunc(GenericParquetWriter::buildWriter) - .overwrite() - .withSpec(PartitionSpec.unpartitioned()) - .transformPaths(path -> { - throw new RuntimeException("Should not be called for performance reasons"); - }) - .buildPositionWriter(); + PositionDeleteWriter deleteWriter = + Parquet.writeDeletes(out) + .createWriterFunc(GenericParquetWriter::buildWriter) + .overwrite() + .withSpec(PartitionSpec.unpartitioned()) + .transformPaths( + path -> { + throw new RuntimeException("Should not be called for performance reasons"); + }) + .buildPositionWriter(); try (PositionDeleteWriter writer = deleteWriter) { for (int i = 0; i < records.size(); i += 1) { int pos = i * 3 + 2; writer.delete(deletePath, pos, null); - expectedDeleteRecords.add(posDelete.copy(ImmutableMap.of( - "file_path", deletePath, - "pos", (long) pos))); + expectedDeleteRecords.add( + posDelete.copy(ImmutableMap.of("file_path", deletePath, "pos", (long) pos))); } } DeleteFile metadata = 
deleteWriter.toDeleteFile(); Assert.assertEquals("Format should be Parquet", FileFormat.PARQUET, metadata.format()); - Assert.assertEquals("Should be position deletes", FileContent.POSITION_DELETES, metadata.content()); + Assert.assertEquals( + "Should be position deletes", FileContent.POSITION_DELETES, metadata.content()); Assert.assertEquals("Record count should be correct", records.size(), metadata.recordCount()); Assert.assertEquals("Partition should be empty", 0, metadata.partition().size()); Assert.assertNull("Key metadata should be null", metadata.keyMetadata()); List deletedRecords; - try (CloseableIterable reader = Parquet.read(out.toInputFile()) - .project(deleteSchema) - .createReaderFunc(fileSchema -> GenericParquetReaders.buildReader(deleteSchema, fileSchema)) - .build()) { + try (CloseableIterable reader = + Parquet.read(out.toInputFile()) + .project(deleteSchema) + .createReaderFunc( + fileSchema -> GenericParquetReaders.buildReader(deleteSchema, fileSchema)) + .build()) { deletedRecords = Lists.newArrayList(reader); } - Assert.assertEquals("Deleted records should match expected", expectedDeleteRecords, deletedRecords); + Assert.assertEquals( + "Deleted records should match expected", expectedDeleteRecords, deletedRecords); } } diff --git a/parquet/src/test/java/org/apache/iceberg/parquet/TestParquetSchemaUtil.java b/parquet/src/test/java/org/apache/iceberg/parquet/TestParquetSchemaUtil.java index 6922111b7f60..c669cf02a2be 100644 --- a/parquet/src/test/java/org/apache/iceberg/parquet/TestParquetSchemaUtil.java +++ b/parquet/src/test/java/org/apache/iceberg/parquet/TestParquetSchemaUtil.java @@ -16,9 +16,11 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.parquet; +import static org.apache.iceberg.types.Types.NestedField.optional; +import static org.apache.iceberg.types.Types.NestedField.required; + import java.util.concurrent.atomic.AtomicInteger; import org.apache.iceberg.Schema; import org.apache.iceberg.mapping.MappingUtil; @@ -39,144 +41,222 @@ import org.junit.Assert; import org.junit.Test; -import static org.apache.iceberg.types.Types.NestedField.optional; -import static org.apache.iceberg.types.Types.NestedField.required; - public class TestParquetSchemaUtil { - private static final Types.StructType SUPPORTED_PRIMITIVES = Types.StructType.of( - required(100, "id", Types.LongType.get()), - optional(101, "data", Types.StringType.get()), - required(102, "b", Types.BooleanType.get()), - optional(103, "i", Types.IntegerType.get()), - required(104, "l", Types.LongType.get()), - optional(105, "f", Types.FloatType.get()), - required(106, "d", Types.DoubleType.get()), - optional(107, "date", Types.DateType.get()), - required(108, "ts", Types.TimestampType.withZone()), - required(110, "s", Types.StringType.get()), - required(112, "fixed", Types.FixedType.ofLength(7)), - optional(113, "bytes", Types.BinaryType.get()), - required(114, "dec_9_0", Types.DecimalType.of(9, 0)), - required(115, "dec_11_2", Types.DecimalType.of(11, 2)), - required(116, "dec_38_10", Types.DecimalType.of(38, 10)) // spark's maximum precision - ); + private static final Types.StructType SUPPORTED_PRIMITIVES = + Types.StructType.of( + required(100, "id", Types.LongType.get()), + optional(101, "data", Types.StringType.get()), + required(102, "b", Types.BooleanType.get()), + optional(103, "i", Types.IntegerType.get()), + required(104, "l", Types.LongType.get()), + optional(105, "f", Types.FloatType.get()), + required(106, "d", 
Types.DoubleType.get()), + optional(107, "date", Types.DateType.get()), + required(108, "ts", Types.TimestampType.withZone()), + required(110, "s", Types.StringType.get()), + required(112, "fixed", Types.FixedType.ofLength(7)), + optional(113, "bytes", Types.BinaryType.get()), + required(114, "dec_9_0", Types.DecimalType.of(9, 0)), + required(115, "dec_11_2", Types.DecimalType.of(11, 2)), + required(116, "dec_38_10", Types.DecimalType.of(38, 10)) // spark's maximum precision + ); @Test public void testAssignIdsByNameMapping() { - Types.StructType structType = Types.StructType.of( - required(0, "id", Types.LongType.get()), - optional(1, "list_of_maps", - Types.ListType.ofOptional(2, Types.MapType.ofOptional(3, 4, - Types.StringType.get(), - SUPPORTED_PRIMITIVES))), - optional(5, "map_of_lists", - Types.MapType.ofOptional(6, 7, - Types.StringType.get(), - Types.ListType.ofOptional(8, SUPPORTED_PRIMITIVES))), - required(9, "list_of_lists", - Types.ListType.ofOptional(10, Types.ListType.ofOptional(11, SUPPORTED_PRIMITIVES))), - required(12, "map_of_maps", - Types.MapType.ofOptional(13, 14, - Types.StringType.get(), - Types.MapType.ofOptional(15, 16, + Types.StructType structType = + Types.StructType.of( + required(0, "id", Types.LongType.get()), + optional( + 1, + "list_of_maps", + Types.ListType.ofOptional( + 2, + Types.MapType.ofOptional(3, 4, Types.StringType.get(), SUPPORTED_PRIMITIVES))), + optional( + 5, + "map_of_lists", + Types.MapType.ofOptional( + 6, + 7, + Types.StringType.get(), + Types.ListType.ofOptional(8, SUPPORTED_PRIMITIVES))), + required( + 9, + "list_of_lists", + Types.ListType.ofOptional(10, Types.ListType.ofOptional(11, SUPPORTED_PRIMITIVES))), + required( + 12, + "map_of_maps", + Types.MapType.ofOptional( + 13, + 14, Types.StringType.get(), - SUPPORTED_PRIMITIVES))), - required(17, "list_of_struct_of_nested_types", Types.ListType.ofOptional(19, Types.StructType.of( - Types.NestedField.required(20, "m1", Types.MapType.ofOptional(21, 22, - Types.StringType.get(), - SUPPORTED_PRIMITIVES)), - Types.NestedField.optional(23, "l1", Types.ListType.ofRequired(24, SUPPORTED_PRIMITIVES)), - Types.NestedField.required(25, "l2", Types.ListType.ofRequired(26, SUPPORTED_PRIMITIVES)), - Types.NestedField.optional(27, "m2", Types.MapType.ofOptional(28, 29, - Types.StringType.get(), - SUPPORTED_PRIMITIVES)) - ))) - ); + Types.MapType.ofOptional( + 15, 16, Types.StringType.get(), SUPPORTED_PRIMITIVES))), + required( + 17, + "list_of_struct_of_nested_types", + Types.ListType.ofOptional( + 19, + Types.StructType.of( + Types.NestedField.required( + 20, + "m1", + Types.MapType.ofOptional( + 21, 22, Types.StringType.get(), SUPPORTED_PRIMITIVES)), + Types.NestedField.optional( + 23, "l1", Types.ListType.ofRequired(24, SUPPORTED_PRIMITIVES)), + Types.NestedField.required( + 25, "l2", Types.ListType.ofRequired(26, SUPPORTED_PRIMITIVES)), + Types.NestedField.optional( + 27, + "m2", + Types.MapType.ofOptional( + 28, 29, Types.StringType.get(), SUPPORTED_PRIMITIVES)))))); - Schema schema = new Schema(TypeUtil.assignFreshIds(structType, new AtomicInteger(0)::incrementAndGet) - .asStructType().fields()); + Schema schema = + new Schema( + TypeUtil.assignFreshIds(structType, new AtomicInteger(0)::incrementAndGet) + .asStructType() + .fields()); NameMapping nameMapping = MappingUtil.create(schema); MessageType messageTypeWithIds = ParquetSchemaUtil.convert(schema, "parquet_type"); - MessageType messageTypeWithIdsFromNameMapping = ParquetSchemaUtil - 
.applyNameMapping(RemoveIds.removeIds(messageTypeWithIds), nameMapping); + MessageType messageTypeWithIdsFromNameMapping = + ParquetSchemaUtil.applyNameMapping(RemoveIds.removeIds(messageTypeWithIds), nameMapping); Assert.assertEquals(messageTypeWithIds, messageTypeWithIdsFromNameMapping); } @Test public void testSchemaConversionWithoutAssigningIds() { - MessageType messageType = new MessageType("test", - primitive(1, "int_col", PrimitiveTypeName.INT32, Repetition.REQUIRED), - primitive(2, "double_col", PrimitiveTypeName.DOUBLE, Repetition.OPTIONAL), - primitive(null, "long_col", PrimitiveTypeName.INT64, Repetition.OPTIONAL), - struct(3, "struct_col_1", Repetition.REQUIRED, - primitive(4, "n1", PrimitiveTypeName.INT32, Repetition.REQUIRED), - primitive(null, "n2", PrimitiveTypeName.INT64, Repetition.OPTIONAL), - primitive(5, "n3", PrimitiveTypeName.INT64, Repetition.OPTIONAL)), - struct(6, "struct_col_2", Repetition.OPTIONAL, - primitive(null, "n1", PrimitiveTypeName.INT32, Repetition.REQUIRED), - primitive(null, "n2", PrimitiveTypeName.INT64, Repetition.OPTIONAL), - primitive(null, "n3", PrimitiveTypeName.INT64, Repetition.OPTIONAL)), - list(null, "list_col_1", Repetition.REQUIRED, - primitive(7, "i", PrimitiveTypeName.INT32, Repetition.OPTIONAL)), - list(8, "list_col_2", Repetition.REQUIRED, - primitive(null, "i", PrimitiveTypeName.INT32, Repetition.OPTIONAL)), - list(9, "list_col_3", Repetition.OPTIONAL, - struct(null, "s", Repetition.REQUIRED, - primitive(10, "n1", PrimitiveTypeName.INT32, Repetition.REQUIRED), - primitive(11, "n2", PrimitiveTypeName.INT64, Repetition.OPTIONAL))), - list(12, "list_col_4", Repetition.REQUIRED, - struct(13, "s", Repetition.REQUIRED, + MessageType messageType = + new MessageType( + "test", + primitive(1, "int_col", PrimitiveTypeName.INT32, Repetition.REQUIRED), + primitive(2, "double_col", PrimitiveTypeName.DOUBLE, Repetition.OPTIONAL), + primitive(null, "long_col", PrimitiveTypeName.INT64, Repetition.OPTIONAL), + struct( + 3, + "struct_col_1", + Repetition.REQUIRED, + primitive(4, "n1", PrimitiveTypeName.INT32, Repetition.REQUIRED), + primitive(null, "n2", PrimitiveTypeName.INT64, Repetition.OPTIONAL), + primitive(5, "n3", PrimitiveTypeName.INT64, Repetition.OPTIONAL)), + struct( + 6, + "struct_col_2", + Repetition.OPTIONAL, primitive(null, "n1", PrimitiveTypeName.INT32, Repetition.REQUIRED), - primitive(null, "n2", PrimitiveTypeName.INT64, Repetition.OPTIONAL))), - list(14, "list_col_5", Repetition.OPTIONAL, - struct(15, "s", Repetition.REQUIRED, - primitive(16, "n1", PrimitiveTypeName.INT32, Repetition.REQUIRED), - primitive(17, "n2", PrimitiveTypeName.INT64, Repetition.OPTIONAL))), - map(null, "map_col_1", Repetition.REQUIRED, - primitive(18, "k", PrimitiveTypeName.INT32, Repetition.REQUIRED), - primitive(19, "v", PrimitiveTypeName.INT32, Repetition.REQUIRED)), - map(20, "map_col_2", Repetition.OPTIONAL, - primitive(null, "k", PrimitiveTypeName.INT32, Repetition.REQUIRED), - primitive(21, "v", PrimitiveTypeName.INT32, Repetition.REQUIRED)), - map(22, "map_col_3", Repetition.REQUIRED, - primitive(null, "k", PrimitiveTypeName.INT32, Repetition.REQUIRED), - primitive(null, "v", PrimitiveTypeName.INT32, Repetition.REQUIRED)), - map(23, "map_col_4", Repetition.OPTIONAL, - primitive(24, "k", PrimitiveTypeName.INT32, Repetition.REQUIRED), - struct(25, "s", Repetition.REQUIRED, - primitive(null, "n1", PrimitiveTypeName.INT32, Repetition.REQUIRED), - primitive(26, "n2", PrimitiveTypeName.INT64, Repetition.OPTIONAL), - primitive(null, "n3", 
PrimitiveTypeName.INT64, Repetition.OPTIONAL))), - map(27, "map_col_5", Repetition.REQUIRED, - primitive(28, "k", PrimitiveTypeName.INT32, Repetition.REQUIRED), - primitive(29, "v", PrimitiveTypeName.INT32, Repetition.REQUIRED)) - ); + primitive(null, "n2", PrimitiveTypeName.INT64, Repetition.OPTIONAL), + primitive(null, "n3", PrimitiveTypeName.INT64, Repetition.OPTIONAL)), + list( + null, + "list_col_1", + Repetition.REQUIRED, + primitive(7, "i", PrimitiveTypeName.INT32, Repetition.OPTIONAL)), + list( + 8, + "list_col_2", + Repetition.REQUIRED, + primitive(null, "i", PrimitiveTypeName.INT32, Repetition.OPTIONAL)), + list( + 9, + "list_col_3", + Repetition.OPTIONAL, + struct( + null, + "s", + Repetition.REQUIRED, + primitive(10, "n1", PrimitiveTypeName.INT32, Repetition.REQUIRED), + primitive(11, "n2", PrimitiveTypeName.INT64, Repetition.OPTIONAL))), + list( + 12, + "list_col_4", + Repetition.REQUIRED, + struct( + 13, + "s", + Repetition.REQUIRED, + primitive(null, "n1", PrimitiveTypeName.INT32, Repetition.REQUIRED), + primitive(null, "n2", PrimitiveTypeName.INT64, Repetition.OPTIONAL))), + list( + 14, + "list_col_5", + Repetition.OPTIONAL, + struct( + 15, + "s", + Repetition.REQUIRED, + primitive(16, "n1", PrimitiveTypeName.INT32, Repetition.REQUIRED), + primitive(17, "n2", PrimitiveTypeName.INT64, Repetition.OPTIONAL))), + map( + null, + "map_col_1", + Repetition.REQUIRED, + primitive(18, "k", PrimitiveTypeName.INT32, Repetition.REQUIRED), + primitive(19, "v", PrimitiveTypeName.INT32, Repetition.REQUIRED)), + map( + 20, + "map_col_2", + Repetition.OPTIONAL, + primitive(null, "k", PrimitiveTypeName.INT32, Repetition.REQUIRED), + primitive(21, "v", PrimitiveTypeName.INT32, Repetition.REQUIRED)), + map( + 22, + "map_col_3", + Repetition.REQUIRED, + primitive(null, "k", PrimitiveTypeName.INT32, Repetition.REQUIRED), + primitive(null, "v", PrimitiveTypeName.INT32, Repetition.REQUIRED)), + map( + 23, + "map_col_4", + Repetition.OPTIONAL, + primitive(24, "k", PrimitiveTypeName.INT32, Repetition.REQUIRED), + struct( + 25, + "s", + Repetition.REQUIRED, + primitive(null, "n1", PrimitiveTypeName.INT32, Repetition.REQUIRED), + primitive(26, "n2", PrimitiveTypeName.INT64, Repetition.OPTIONAL), + primitive(null, "n3", PrimitiveTypeName.INT64, Repetition.OPTIONAL))), + map( + 27, + "map_col_5", + Repetition.REQUIRED, + primitive(28, "k", PrimitiveTypeName.INT32, Repetition.REQUIRED), + primitive(29, "v", PrimitiveTypeName.INT32, Repetition.REQUIRED))); - Schema expectedSchema = new Schema( - required(1, "int_col", Types.IntegerType.get()), - optional(2, "double_col", Types.DoubleType.get()), - required(3, "struct_col_1", Types.StructType.of( - required(4, "n1", Types.IntegerType.get()), - optional(5, "n3", Types.LongType.get()) - )), - optional(14, "list_col_5", Types.ListType.ofRequired(15, - Types.StructType.of( - required(16, "n1", Types.IntegerType.get()), - optional(17, "n2", Types.LongType.get()) - ) - )), - optional(23, "map_col_4", Types.MapType.ofRequired(24, 25, - Types.IntegerType.get(), - Types.StructType.of( - optional(26, "n2", Types.LongType.get()) - ) - )), - required(27, "map_col_5", Types.MapType.ofRequired(28, 29, - Types.IntegerType.get(), - Types.IntegerType.get() - )) - ); + Schema expectedSchema = + new Schema( + required(1, "int_col", Types.IntegerType.get()), + optional(2, "double_col", Types.DoubleType.get()), + required( + 3, + "struct_col_1", + Types.StructType.of( + required(4, "n1", Types.IntegerType.get()), + optional(5, "n3", Types.LongType.get()))), + optional( + 14, + 
"list_col_5", + Types.ListType.ofRequired( + 15, + Types.StructType.of( + required(16, "n1", Types.IntegerType.get()), + optional(17, "n2", Types.LongType.get())))), + optional( + 23, + "map_col_4", + Types.MapType.ofRequired( + 24, + 25, + Types.IntegerType.get(), + Types.StructType.of(optional(26, "n2", Types.LongType.get())))), + required( + 27, + "map_col_5", + Types.MapType.ofRequired( + 28, 29, Types.IntegerType.get(), Types.IntegerType.get()))); Schema actualSchema = ParquetSchemaUtil.convertAndPrune(messageType); Assert.assertEquals("Schema must match", expectedSchema.asStruct(), actualSchema.asStruct()); @@ -185,19 +265,24 @@ public void testSchemaConversionWithoutAssigningIds() { @Test public void testSchemaConversionForHiveStyleLists() { String parquetSchemaString = - "message spark_schema {\n" + - " optional group col1 (LIST) {\n" + - " repeated group bag {\n" + - " optional group array {\n" + - " required int32 col2;\n" + - " }\n" + - " }\n" + - " }\n" + - "}\n"; + "message spark_schema {\n" + + " optional group col1 (LIST) {\n" + + " repeated group bag {\n" + + " optional group array {\n" + + " required int32 col2;\n" + + " }\n" + + " }\n" + + " }\n" + + "}\n"; MessageType messageType = MessageTypeParser.parseMessageType(parquetSchemaString); - Schema expectedSchema = new Schema(optional(1, "col1", Types.ListType.ofOptional( - 2, Types.StructType.of(required(3, "col2", Types.IntegerType.get()))))); + Schema expectedSchema = + new Schema( + optional( + 1, + "col1", + Types.ListType.ofOptional( + 2, Types.StructType.of(required(3, "col2", Types.IntegerType.get()))))); NameMapping nameMapping = MappingUtil.create(expectedSchema); MessageType messageTypeWithIds = ParquetSchemaUtil.applyNameMapping(messageType, nameMapping); Schema actualSchema = ParquetSchemaUtil.convertAndPrune(messageTypeWithIds); @@ -207,16 +292,16 @@ public void testSchemaConversionForHiveStyleLists() { @Test public void testLegacyTwoLevelListTypeWithPrimitiveElement() { String parquetSchemaString = - "message spark_schema {\n" + - " optional group arraybytes (LIST) {\n" + - " repeated binary array;\n" + - " }\n" + - "}\n"; + "message spark_schema {\n" + + " optional group arraybytes (LIST) {\n" + + " repeated binary array;\n" + + " }\n" + + "}\n"; MessageType messageType = MessageTypeParser.parseMessageType(parquetSchemaString); - Schema expectedSchema = new Schema( - optional(1, "arraybytes", Types.ListType.ofRequired(1000, Types.BinaryType.get())) - ); + Schema expectedSchema = + new Schema( + optional(1, "arraybytes", Types.ListType.ofRequired(1000, Types.BinaryType.get()))); Schema actualSchema = ParquetSchemaUtil.convert(messageType); Assert.assertEquals("Schema must match", expectedSchema.asStruct(), actualSchema.asStruct()); @@ -225,29 +310,32 @@ public void testLegacyTwoLevelListTypeWithPrimitiveElement() { @Test public void testLegacyTwoLevelListTypeWithGroupTypeElementWithTwoFields() { String messageType = - "message root {" + - " required group f0 {" + - " required group f00 (LIST) {" + - " repeated group element {" + - " required int32 f000;" + - " optional int64 f001;" + - " }" + - " }" + - " }" + - "}"; + "message root {" + + " required group f0 {" + + " required group f00 (LIST) {" + + " repeated group element {" + + " required int32 f000;" + + " optional int64 f001;" + + " }" + + " }" + + " }" + + "}"; MessageType parquetScehma = MessageTypeParser.parseMessageType(messageType); - Schema expectedSchema = new Schema( - required(1, "f0", Types.StructType.of( - required(1003, "f00", 
Types.ListType.ofRequired( - 1002, + Schema expectedSchema = + new Schema( + required( + 1, + "f0", Types.StructType.of( - required(1000, "f000", Types.IntegerType.get()), - optional(1001, "f001", Types.LongType.get()) - ) - )) - )) - ); + required( + 1003, + "f00", + Types.ListType.ofRequired( + 1002, + Types.StructType.of( + required(1000, "f000", Types.IntegerType.get()), + optional(1001, "f001", Types.LongType.get()))))))); Schema actualSchema = ParquetSchemaUtil.convert(parquetScehma); Assert.assertEquals("Schema must match", expectedSchema.asStruct(), actualSchema.asStruct()); @@ -256,23 +344,22 @@ public void testLegacyTwoLevelListTypeWithGroupTypeElementWithTwoFields() { @Test public void testLegacyTwoLevelListGenByParquetAvro() { String messageType = - "message root {" + - " optional group my_list (LIST) {" + - " repeated group array {" + - " required binary str (UTF8);" + - " }" + - " }" + - "}"; + "message root {" + + " optional group my_list (LIST) {" + + " repeated group array {" + + " required binary str (UTF8);" + + " }" + + " }" + + "}"; MessageType parquetScehma = MessageTypeParser.parseMessageType(messageType); - Schema expectedSchema = new Schema( - optional(1, "my_list", Types.ListType.ofRequired( - 1001, - Types.StructType.of( - required(1000, "str", Types.StringType.get()) - ) - )) - ); + Schema expectedSchema = + new Schema( + optional( + 1, + "my_list", + Types.ListType.ofRequired( + 1001, Types.StructType.of(required(1000, "str", Types.StringType.get()))))); Schema actualSchema = ParquetSchemaUtil.convert(parquetScehma); Assert.assertEquals("Schema must match", expectedSchema.asStruct(), actualSchema.asStruct()); @@ -281,23 +368,22 @@ public void testLegacyTwoLevelListGenByParquetAvro() { @Test public void testLegacyTwoLevelListGenByParquetThrift() { String messageType = - "message root {" + - " optional group my_list (LIST) {" + - " repeated group my_list_tuple {" + - " required binary str (UTF8);" + - " }" + - " }" + - "}"; + "message root {" + + " optional group my_list (LIST) {" + + " repeated group my_list_tuple {" + + " required binary str (UTF8);" + + " }" + + " }" + + "}"; MessageType parquetScehma = MessageTypeParser.parseMessageType(messageType); - Schema expectedSchema = new Schema( - optional(1, "my_list", Types.ListType.ofRequired( - 1001, - Types.StructType.of( - required(1000, "str", Types.StringType.get()) - ) - )) - ); + Schema expectedSchema = + new Schema( + optional( + 1, + "my_list", + Types.ListType.ofRequired( + 1001, Types.StructType.of(required(1000, "str", Types.StringType.get()))))); Schema actualSchema = ParquetSchemaUtil.convert(parquetScehma); Assert.assertEquals("Schema must match", expectedSchema.asStruct(), actualSchema.asStruct()); @@ -306,30 +392,31 @@ public void testLegacyTwoLevelListGenByParquetThrift() { @Test public void testLegacyTwoLevelListGenByParquetThrift1() { String messageType = - "message root {" + - " optional group my_list (LIST) {" + - " repeated group my_list_tuple (LIST) {" + - " repeated int32 my_list_tuple_tuple;" + - " }" + - " }" + - "}"; + "message root {" + + " optional group my_list (LIST) {" + + " repeated group my_list_tuple (LIST) {" + + " repeated int32 my_list_tuple_tuple;" + + " }" + + " }" + + "}"; MessageType parquetScehma = MessageTypeParser.parseMessageType(messageType); - Schema expectedSchema = new Schema( - optional(1, "my_list", Types.ListType.ofRequired( - 1001, - Types.ListType.ofRequired( - 1000, Types.IntegerType.get() - ) - )) - ); + Schema expectedSchema = + new Schema( + optional( + 
1, + "my_list", + Types.ListType.ofRequired( + 1001, Types.ListType.ofRequired(1000, Types.IntegerType.get())))); Schema actualSchema = ParquetSchemaUtil.convert(parquetScehma); Assert.assertEquals("Schema must match", expectedSchema.asStruct(), actualSchema.asStruct()); } - private Type primitive(Integer id, String name, PrimitiveTypeName typeName, Repetition repetition) { - PrimitiveBuilder builder = org.apache.parquet.schema.Types.primitive(typeName, repetition); + private Type primitive( + Integer id, String name, PrimitiveTypeName typeName, Repetition repetition) { + PrimitiveBuilder builder = + org.apache.parquet.schema.Types.primitive(typeName, repetition); if (id != null) { builder.id(id); } diff --git a/parquet/src/test/java/org/apache/iceberg/parquet/TestPruneColumns.java b/parquet/src/test/java/org/apache/iceberg/parquet/TestPruneColumns.java index 7d6f3baf3053..9e33ffa314ed 100644 --- a/parquet/src/test/java/org/apache/iceberg/parquet/TestPruneColumns.java +++ b/parquet/src/test/java/org/apache/iceberg/parquet/TestPruneColumns.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.parquet; import org.apache.iceberg.Schema; @@ -37,53 +36,86 @@ public class TestPruneColumns { @Test public void testMapKeyValueName() { - MessageType fileSchema = Types.buildMessage() - .addField(Types.buildGroup(Type.Repetition.OPTIONAL) - .addField(Types.buildGroup(Type.Repetition.REPEATED) - .addField(Types.primitive(PrimitiveTypeName.BINARY, Type.Repetition.REQUIRED) - .as(LogicalTypeAnnotation.stringType()) - .id(2) - .named("key")) - .addField(Types.buildGroup(Type.Repetition.OPTIONAL) - .addField(Types.primitive(PrimitiveTypeName.DOUBLE, Type.Repetition.REQUIRED).id(4).named("x")) - .addField(Types.primitive(PrimitiveTypeName.DOUBLE, Type.Repetition.REQUIRED).id(5).named("y")) - .addField(Types.primitive(PrimitiveTypeName.DOUBLE, Type.Repetition.REQUIRED).id(6).named("z")) - .id(3) - .named("value")) - .named("custom_key_value_name")) - .as(LogicalTypeAnnotation.mapType()) - .id(1) - .named("m")) - .named("table"); + MessageType fileSchema = + Types.buildMessage() + .addField( + Types.buildGroup(Type.Repetition.OPTIONAL) + .addField( + Types.buildGroup(Type.Repetition.REPEATED) + .addField( + Types.primitive(PrimitiveTypeName.BINARY, Type.Repetition.REQUIRED) + .as(LogicalTypeAnnotation.stringType()) + .id(2) + .named("key")) + .addField( + Types.buildGroup(Type.Repetition.OPTIONAL) + .addField( + Types.primitive( + PrimitiveTypeName.DOUBLE, Type.Repetition.REQUIRED) + .id(4) + .named("x")) + .addField( + Types.primitive( + PrimitiveTypeName.DOUBLE, Type.Repetition.REQUIRED) + .id(5) + .named("y")) + .addField( + Types.primitive( + PrimitiveTypeName.DOUBLE, Type.Repetition.REQUIRED) + .id(6) + .named("z")) + .id(3) + .named("value")) + .named("custom_key_value_name")) + .as(LogicalTypeAnnotation.mapType()) + .id(1) + .named("m")) + .named("table"); // project map.value.x and map.value.y - Schema projection = new Schema( - NestedField.optional(1, "m", MapType.ofOptional(2, 3, - StringType.get(), - StructType.of( - NestedField.required(4, "x", DoubleType.get()), - NestedField.required(5, "y", DoubleType.get()) - ) - )) - ); + Schema projection = + new Schema( + NestedField.optional( + 1, + "m", + MapType.ofOptional( + 2, + 3, + StringType.get(), + StructType.of( + NestedField.required(4, "x", DoubleType.get()), + NestedField.required(5, "y", DoubleType.get()))))); - MessageType expected = Types.buildMessage() - 
.addField(Types.buildGroup(Type.Repetition.OPTIONAL) - .addField(Types.buildGroup(Type.Repetition.REPEATED) - .addField(Types.primitive(PrimitiveTypeName.BINARY, Type.Repetition.REQUIRED) - .as(LogicalTypeAnnotation.stringType()) - .id(2) - .named("key")) - .addField(Types.buildGroup(Type.Repetition.OPTIONAL) - .addField(Types.primitive(PrimitiveTypeName.DOUBLE, Type.Repetition.REQUIRED).id(4).named("x")) - .addField(Types.primitive(PrimitiveTypeName.DOUBLE, Type.Repetition.REQUIRED).id(5).named("y")) - .id(3) - .named("value")) - .named("custom_key_value_name")) - .as(LogicalTypeAnnotation.mapType()) - .id(1) - .named("m")) - .named("table"); + MessageType expected = + Types.buildMessage() + .addField( + Types.buildGroup(Type.Repetition.OPTIONAL) + .addField( + Types.buildGroup(Type.Repetition.REPEATED) + .addField( + Types.primitive(PrimitiveTypeName.BINARY, Type.Repetition.REQUIRED) + .as(LogicalTypeAnnotation.stringType()) + .id(2) + .named("key")) + .addField( + Types.buildGroup(Type.Repetition.OPTIONAL) + .addField( + Types.primitive( + PrimitiveTypeName.DOUBLE, Type.Repetition.REQUIRED) + .id(4) + .named("x")) + .addField( + Types.primitive( + PrimitiveTypeName.DOUBLE, Type.Repetition.REQUIRED) + .id(5) + .named("y")) + .id(3) + .named("value")) + .named("custom_key_value_name")) + .as(LogicalTypeAnnotation.mapType()) + .id(1) + .named("m")) + .named("table"); MessageType actual = ParquetSchemaUtil.pruneColumns(fileSchema, projection); Assert.assertEquals("Pruned schema should not rename repeated struct", expected, actual); @@ -91,44 +123,74 @@ public void testMapKeyValueName() { @Test public void testListElementName() { - MessageType fileSchema = Types.buildMessage() - .addField(Types.buildGroup(Type.Repetition.OPTIONAL) - .addField(Types.buildGroup(Type.Repetition.REPEATED) - .addField(Types.buildGroup(Type.Repetition.OPTIONAL) - .addField(Types.primitive(PrimitiveTypeName.DOUBLE, Type.Repetition.REQUIRED).id(4).named("x")) - .addField(Types.primitive(PrimitiveTypeName.DOUBLE, Type.Repetition.REQUIRED).id(5).named("y")) - .addField(Types.primitive(PrimitiveTypeName.DOUBLE, Type.Repetition.REQUIRED).id(6).named("z")) - .id(3) - .named("custom_element_name")) - .named("custom_repeated_name")) - .as(LogicalTypeAnnotation.listType()) - .id(1) - .named("m")) - .named("table"); + MessageType fileSchema = + Types.buildMessage() + .addField( + Types.buildGroup(Type.Repetition.OPTIONAL) + .addField( + Types.buildGroup(Type.Repetition.REPEATED) + .addField( + Types.buildGroup(Type.Repetition.OPTIONAL) + .addField( + Types.primitive( + PrimitiveTypeName.DOUBLE, Type.Repetition.REQUIRED) + .id(4) + .named("x")) + .addField( + Types.primitive( + PrimitiveTypeName.DOUBLE, Type.Repetition.REQUIRED) + .id(5) + .named("y")) + .addField( + Types.primitive( + PrimitiveTypeName.DOUBLE, Type.Repetition.REQUIRED) + .id(6) + .named("z")) + .id(3) + .named("custom_element_name")) + .named("custom_repeated_name")) + .as(LogicalTypeAnnotation.listType()) + .id(1) + .named("m")) + .named("table"); // project map.value.x and map.value.y - Schema projection = new Schema( - NestedField.optional(1, "m", ListType.ofOptional(3, - StructType.of( - NestedField.required(4, "x", DoubleType.get()), - NestedField.required(5, "y", DoubleType.get()) - ) - )) - ); + Schema projection = + new Schema( + NestedField.optional( + 1, + "m", + ListType.ofOptional( + 3, + StructType.of( + NestedField.required(4, "x", DoubleType.get()), + NestedField.required(5, "y", DoubleType.get()))))); - MessageType expected = 
Types.buildMessage() - .addField(Types.buildGroup(Type.Repetition.OPTIONAL) - .addField(Types.buildGroup(Type.Repetition.REPEATED) - .addField(Types.buildGroup(Type.Repetition.OPTIONAL) - .addField(Types.primitive(PrimitiveTypeName.DOUBLE, Type.Repetition.REQUIRED).id(4).named("x")) - .addField(Types.primitive(PrimitiveTypeName.DOUBLE, Type.Repetition.REQUIRED).id(5).named("y")) - .id(3) - .named("custom_element_name")) - .named("custom_repeated_name")) - .as(LogicalTypeAnnotation.listType()) - .id(1) - .named("m")) - .named("table"); + MessageType expected = + Types.buildMessage() + .addField( + Types.buildGroup(Type.Repetition.OPTIONAL) + .addField( + Types.buildGroup(Type.Repetition.REPEATED) + .addField( + Types.buildGroup(Type.Repetition.OPTIONAL) + .addField( + Types.primitive( + PrimitiveTypeName.DOUBLE, Type.Repetition.REQUIRED) + .id(4) + .named("x")) + .addField( + Types.primitive( + PrimitiveTypeName.DOUBLE, Type.Repetition.REQUIRED) + .id(5) + .named("y")) + .id(3) + .named("custom_element_name")) + .named("custom_repeated_name")) + .as(LogicalTypeAnnotation.listType()) + .id(1) + .named("m")) + .named("table"); MessageType actual = ParquetSchemaUtil.pruneColumns(fileSchema, projection); Assert.assertEquals("Pruned schema should not rename repeated struct", expected, actual); @@ -136,41 +198,73 @@ public void testListElementName() { @Test public void testStructElementName() { - MessageType fileSchema = Types.buildMessage() - .addField(Types.primitive(PrimitiveTypeName.DOUBLE, Type.Repetition.REQUIRED).id(1).named("id")) - .addField(Types.buildGroup(Type.Repetition.OPTIONAL) - .addField(Types.primitive(PrimitiveTypeName.DOUBLE, Type.Repetition.REQUIRED).id(3).named("x")) - .addField(Types.primitive(PrimitiveTypeName.DOUBLE, Type.Repetition.REQUIRED).id(4).named("y")) - .addField(Types.primitive(PrimitiveTypeName.DOUBLE, Type.Repetition.REQUIRED).id(5).named("z")) - .id(2) - .named("struct_name_1")) - .addField(Types.buildGroup(Type.Repetition.OPTIONAL) - .addField(Types.primitive(PrimitiveTypeName.DOUBLE, Type.Repetition.REQUIRED).id(7).named("x")) - .addField(Types.primitive(PrimitiveTypeName.DOUBLE, Type.Repetition.REQUIRED).id(8).named("y")) - .addField(Types.primitive(PrimitiveTypeName.DOUBLE, Type.Repetition.REQUIRED).id(9).named("z")) - .id(6) - .named("struct_name_2")) - .named("table"); + MessageType fileSchema = + Types.buildMessage() + .addField( + Types.primitive(PrimitiveTypeName.DOUBLE, Type.Repetition.REQUIRED) + .id(1) + .named("id")) + .addField( + Types.buildGroup(Type.Repetition.OPTIONAL) + .addField( + Types.primitive(PrimitiveTypeName.DOUBLE, Type.Repetition.REQUIRED) + .id(3) + .named("x")) + .addField( + Types.primitive(PrimitiveTypeName.DOUBLE, Type.Repetition.REQUIRED) + .id(4) + .named("y")) + .addField( + Types.primitive(PrimitiveTypeName.DOUBLE, Type.Repetition.REQUIRED) + .id(5) + .named("z")) + .id(2) + .named("struct_name_1")) + .addField( + Types.buildGroup(Type.Repetition.OPTIONAL) + .addField( + Types.primitive(PrimitiveTypeName.DOUBLE, Type.Repetition.REQUIRED) + .id(7) + .named("x")) + .addField( + Types.primitive(PrimitiveTypeName.DOUBLE, Type.Repetition.REQUIRED) + .id(8) + .named("y")) + .addField( + Types.primitive(PrimitiveTypeName.DOUBLE, Type.Repetition.REQUIRED) + .id(9) + .named("z")) + .id(6) + .named("struct_name_2")) + .named("table"); // project map.value.x and map.value.y - Schema projection = new Schema( - NestedField.optional(2, "struct_name_1", StructType.of( - NestedField.required(4, "y", DoubleType.get()), - 
NestedField.required(5, "z", DoubleType.get()) - )), - NestedField.optional(6, "struct_name_2", StructType.of()) - ); + Schema projection = + new Schema( + NestedField.optional( + 2, + "struct_name_1", + StructType.of( + NestedField.required(4, "y", DoubleType.get()), + NestedField.required(5, "z", DoubleType.get()))), + NestedField.optional(6, "struct_name_2", StructType.of())); - MessageType expected = Types.buildMessage() - .addField(Types.buildGroup(Type.Repetition.OPTIONAL) - .addField(Types.primitive(PrimitiveTypeName.DOUBLE, Type.Repetition.REQUIRED).id(4).named("y")) - .addField(Types.primitive(PrimitiveTypeName.DOUBLE, Type.Repetition.REQUIRED).id(5).named("z")) - .id(2) - .named("struct_name_1")) - .addField(Types.buildGroup(Type.Repetition.OPTIONAL) - .id(6) - .named("struct_name_2")) - .named("table"); + MessageType expected = + Types.buildMessage() + .addField( + Types.buildGroup(Type.Repetition.OPTIONAL) + .addField( + Types.primitive(PrimitiveTypeName.DOUBLE, Type.Repetition.REQUIRED) + .id(4) + .named("y")) + .addField( + Types.primitive(PrimitiveTypeName.DOUBLE, Type.Repetition.REQUIRED) + .id(5) + .named("z")) + .id(2) + .named("struct_name_1")) + .addField(Types.buildGroup(Type.Repetition.OPTIONAL).id(6).named("struct_name_2")) + .named("table"); MessageType actual = ParquetSchemaUtil.pruneColumns(fileSchema, projection); Assert.assertEquals("Pruned schema should be matched", expected, actual); diff --git a/pig/src/main/java/org/apache/iceberg/pig/IcebergPigInputFormat.java b/pig/src/main/java/org/apache/iceberg/pig/IcebergPigInputFormat.java index e4bf5983d15b..0d5a6dd9271f 100644 --- a/pig/src/main/java/org/apache/iceberg/pig/IcebergPigInputFormat.java +++ b/pig/src/main/java/org/apache/iceberg/pig/IcebergPigInputFormat.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.pig; import java.io.DataInput; @@ -87,7 +86,9 @@ public List getSplits(JobContext context) throws IOException { // Apply Filters Expression filterExpression = - (Expression) ObjectSerializer.deserialize(context.getConfiguration().get(scope(ICEBERG_FILTER_EXPRESSION))); + (Expression) + ObjectSerializer.deserialize( + context.getConfiguration().get(scope(ICEBERG_FILTER_EXPRESSION))); LOG.info("[{}]: iceberg filter expressions: {}", signature, filterExpression); if (filterExpression != null) { @@ -109,7 +110,7 @@ public RecordReader createRecordReader(InputSplit split, TaskAttemptCon } private static class IcebergSplit extends InputSplit implements Writable { - private static final String[] ANYWHERE = new String[] { "*" }; + private static final String[] ANYWHERE = new String[] {"*"}; private CombinedScanTask task; @@ -178,14 +179,19 @@ private boolean advance() throws IOException { FileScanTask currentTask = tasks.next(); - Schema tableSchema = (Schema) ObjectSerializer.deserialize(context.getConfiguration().get(scope(ICEBERG_SCHEMA))); + Schema tableSchema = + (Schema) + ObjectSerializer.deserialize(context.getConfiguration().get(scope(ICEBERG_SCHEMA))); LOG.debug("[{}]: Task table schema: {}", signature, tableSchema); List projectedFields = - (List) ObjectSerializer.deserialize(context.getConfiguration().get(scope(ICEBERG_PROJECTED_FIELDS))); + (List) + ObjectSerializer.deserialize( + context.getConfiguration().get(scope(ICEBERG_PROJECTED_FIELDS))); LOG.debug("[{}]: Task projected fields: {}", signature, projectedFields); - Schema projectedSchema = projectedFields != null ? 
SchemaUtil.project(tableSchema, projectedFields) : tableSchema; + Schema projectedSchema = + projectedFields != null ? SchemaUtil.project(tableSchema, projectedFields) : tableSchema; PartitionSpec spec = currentTask.asFileScanTask().spec(); DataFile file = currentTask.file(); @@ -214,24 +220,31 @@ private boolean advance() throws IOException { int partitionIndex = partitionSpecFieldIndexMap.get(field.name()); Object partitionValue = file.partition().get(partitionIndex, Object.class); - partitionValueMap.put(field.fieldId(), convertPartitionValue(field.type(), partitionValue)); + partitionValueMap.put( + field.fieldId(), convertPartitionValue(field.type(), partitionValue)); } - reader = Parquet.read(inputFile) - .project(readSchema) - .split(currentTask.start(), currentTask.length()) - .filter(currentTask.residual()) - .createReaderFunc( - fileSchema -> PigParquetReader.buildReader(fileSchema, projectedSchema, partitionValueMap)) - .build(); + reader = + Parquet.read(inputFile) + .project(readSchema) + .split(currentTask.start(), currentTask.length()) + .filter(currentTask.residual()) + .createReaderFunc( + fileSchema -> + PigParquetReader.buildReader( + fileSchema, projectedSchema, partitionValueMap)) + .build(); } else { - reader = Parquet.read(inputFile) - .project(projectedSchema) - .split(currentTask.start(), currentTask.length()) - .filter(currentTask.residual()) - .createReaderFunc( - fileSchema -> PigParquetReader.buildReader(fileSchema, projectedSchema, partitionValueMap)) - .build(); + reader = + Parquet.read(inputFile) + .project(projectedSchema) + .split(currentTask.start(), currentTask.length()) + .filter(currentTask.residual()) + .createReaderFunc( + fileSchema -> + PigParquetReader.buildReader( + fileSchema, projectedSchema, partitionValueMap)) + .build(); } recordIterator = reader.iterator(); @@ -285,8 +298,6 @@ public float getProgress() { } @Override - public void close() { - - } + public void close() {} } } diff --git a/pig/src/main/java/org/apache/iceberg/pig/IcebergStorage.java b/pig/src/main/java/org/apache/iceberg/pig/IcebergStorage.java index 1daa33afedf6..51221666a02b 100644 --- a/pig/src/main/java/org/apache/iceberg/pig/IcebergStorage.java +++ b/pig/src/main/java/org/apache/iceberg/pig/IcebergStorage.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.pig; import java.io.IOException; @@ -65,7 +64,8 @@ import org.slf4j.Logger; import org.slf4j.LoggerFactory; -public class IcebergStorage extends LoadFunc implements LoadMetadata, LoadPredicatePushdown, LoadPushDown { +public class IcebergStorage extends LoadFunc + implements LoadMetadata, LoadPredicatePushdown, LoadPushDown { private static final Logger LOG = LoggerFactory.getLogger(IcebergStorage.class); public static final String PIG_ICEBERG_TABLES_IMPL = "pig.iceberg.tables.impl"; @@ -124,7 +124,6 @@ public ResourceSchema getSchema(String location, Job job) throws IOException { return SchemaUtil.convert(schema); } - @Override public ResourceStatistics getStatistics(String location, Job job) { LOG.info("[{}]: getStatistics() -> : {}", signature, location); @@ -167,8 +166,19 @@ public List getPredicateFields(String location, Job job) throws IOExcept @Override public ImmutableList getSupportedExpressionTypes() { LOG.info("[{}]: getSupportedExpressionTypes()", signature); - return ImmutableList.of(OpType.OP_AND, OpType.OP_OR, OpType.OP_EQ, OpType.OP_NE, OpType.OP_NOT, OpType.OP_GE, - OpType.OP_GT, OpType.OP_LE, OpType.OP_LT, OpType.OP_BETWEEN, OpType.OP_IN, OpType.OP_NULL); + return ImmutableList.of( + OpType.OP_AND, + OpType.OP_OR, + OpType.OP_EQ, + OpType.OP_NE, + OpType.OP_NOT, + OpType.OP_GE, + OpType.OP_GT, + OpType.OP_LE, + OpType.OP_LT, + OpType.OP_BETWEEN, + OpType.OP_IN, + OpType.OP_NULL); } @Override @@ -183,7 +193,8 @@ public void setPushdownPredicate(Expression predicate) throws IOException { storeInUDFContext(IcebergPigInputFormat.ICEBERG_FILTER_EXPRESSION, icebergExpression); } - private org.apache.iceberg.expressions.Expression convert(Expression expression) throws IOException { + private org.apache.iceberg.expressions.Expression convert(Expression expression) + throws IOException { OpType op = expression.getOpType(); if (expression instanceof BinaryExpression) { @@ -199,12 +210,12 @@ private org.apache.iceberg.expressions.Expression convert(Expression expression) BetweenExpression between = (BetweenExpression) rhs; return Expressions.and( convert(OpType.OP_GE, (Column) lhs, (Const) between.getLower()), - convert(OpType.OP_LE, (Column) lhs, (Const) between.getUpper()) - ); + convert(OpType.OP_LE, (Column) lhs, (Const) between.getUpper())); case OP_IN: - return ((InExpression) rhs).getValues().stream() - .map(value -> convert(OpType.OP_EQ, (Column) lhs, (Const) value)) - .reduce(Expressions.alwaysFalse(), Expressions::or); + return ((InExpression) rhs) + .getValues().stream() + .map(value -> convert(OpType.OP_EQ, (Column) lhs, (Const) value)) + .reduce(Expressions.alwaysFalse(), Expressions::or); default: if (lhs instanceof Column && rhs instanceof Const) { return convert(op, (Column) lhs, (Const) rhs); @@ -217,9 +228,12 @@ private org.apache.iceberg.expressions.Expression convert(Expression expression) Expression unary = ((UnaryExpression) expression).getExpression(); switch (op) { - case OP_NOT: return Expressions.not(convert(unary)); - case OP_NULL: return Expressions.isNull(((Column) unary).getName()); - default: throw new FrontendException("Unsupported unary operator" + op); + case OP_NOT: + return Expressions.not(convert(unary)); + case OP_NULL: + return Expressions.isNull(((Column) unary).getName()); + default: + throw new FrontendException("Unsupported unary operator" + op); } } @@ -231,16 +245,23 @@ private org.apache.iceberg.expressions.Expression convert(OpType op, Column col, Object value = constant.getValue(); switch (op) { - case 
OP_GE: return Expressions.greaterThanOrEqual(name, value); - case OP_GT: return Expressions.greaterThan(name, value); - case OP_LE: return Expressions.lessThanOrEqual(name, value); - case OP_LT: return Expressions.lessThan(name, value); - case OP_EQ: return NaNUtil.isNaN(value) ? Expressions.isNaN(name) : Expressions.equal(name, value); - case OP_NE: return NaNUtil.isNaN(value) ? Expressions.notNaN(name) : Expressions.notEqual(name, value); + case OP_GE: + return Expressions.greaterThanOrEqual(name, value); + case OP_GT: + return Expressions.greaterThan(name, value); + case OP_LE: + return Expressions.lessThanOrEqual(name, value); + case OP_LT: + return Expressions.lessThan(name, value); + case OP_EQ: + return NaNUtil.isNaN(value) ? Expressions.isNaN(name) : Expressions.equal(name, value); + case OP_NE: + return NaNUtil.isNaN(value) ? Expressions.notNaN(name) : Expressions.notEqual(name, value); } throw new RuntimeException( - String.format("[%s]: Failed to pushdown expression: %s %s %s", signature, col, op, constant)); + String.format( + "[%s]: Failed to pushdown expression: %s %s %s", signature, col, op, constant)); } @Override @@ -253,8 +274,10 @@ public RequiredFieldResponse pushProjection(RequiredFieldList requiredFieldList) LOG.info("[{}]: pushProjection() -> {}", signature, requiredFieldList); try { - List projection = requiredFieldList.getFields() - .stream().map(RequiredField::getAlias).collect(Collectors.toList()); + List projection = + requiredFieldList.getFields().stream() + .map(RequiredField::getAlias) + .collect(Collectors.toList()); storeInUDFContext(IcebergPigInputFormat.ICEBERG_PROJECTED_FIELDS, (Serializable) projection); } catch (IOException e) { @@ -270,14 +293,17 @@ public void setUDFContextSignature(String newSignature) { } private void storeInUDFContext(String key, Serializable value) throws IOException { - Properties properties = UDFContext.getUDFContext().getUDFProperties(this.getClass(), new String[]{signature}); + Properties properties = + UDFContext.getUDFContext().getUDFProperties(this.getClass(), new String[] {signature}); properties.setProperty(key, ObjectSerializer.serialize(value)); } private void copyUDFContextToScopedConfiguration(Configuration conf, String key) { - String value = UDFContext.getUDFContext() - .getUDFProperties(this.getClass(), new String[]{signature}).getProperty(key); + String value = + UDFContext.getUDFContext() + .getUDFProperties(this.getClass(), new String[] {signature}) + .getProperty(key); if (value != null) { conf.set(key + '.' 
+ signature, value); @@ -291,7 +317,8 @@ public String relativeToAbsolutePath(String location, Path curDir) throws IOExce private Table load(String location, Job job) throws IOException { if (iceberg == null) { - Class tablesImpl = job.getConfiguration().getClass(PIG_ICEBERG_TABLES_IMPL, HadoopTables.class); + Class tablesImpl = + job.getConfiguration().getClass(PIG_ICEBERG_TABLES_IMPL, HadoopTables.class); LOG.info("Initializing iceberg tables implementation: {}", tablesImpl); iceberg = (Tables) ReflectionUtils.newInstance(tablesImpl, job.getConfiguration()); } @@ -310,5 +337,4 @@ private Table load(String location, Job job) throws IOException { return result; } - } diff --git a/pig/src/main/java/org/apache/iceberg/pig/PigParquetReader.java b/pig/src/main/java/org/apache/iceberg/pig/PigParquetReader.java index 9fddba474b3c..4c9e582e4c1f 100644 --- a/pig/src/main/java/org/apache/iceberg/pig/PigParquetReader.java +++ b/pig/src/main/java/org/apache/iceberg/pig/PigParquetReader.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.pig; import java.time.Instant; @@ -62,8 +61,7 @@ import org.apache.pig.data.TupleFactory; public class PigParquetReader { - private PigParquetReader() { - } + private PigParquetReader() {} @SuppressWarnings("unchecked") public static ParquetValueReader buildReader( @@ -71,11 +69,13 @@ public static ParquetValueReader buildReader( if (ParquetSchemaUtil.hasIds(fileSchema)) { return (ParquetValueReader) - TypeWithSchemaVisitor.visit(expectedSchema.asStruct(), fileSchema, - new ReadBuilder(fileSchema, partitionValues)); + TypeWithSchemaVisitor.visit( + expectedSchema.asStruct(), fileSchema, new ReadBuilder(fileSchema, partitionValues)); } else { return (ParquetValueReader) - TypeWithSchemaVisitor.visit(expectedSchema.asStruct(), fileSchema, + TypeWithSchemaVisitor.visit( + expectedSchema.asStruct(), + fileSchema, new FallbackReadBuilder(fileSchema, partitionValues)); } } @@ -96,8 +96,8 @@ public ParquetValueReader message( public ParquetValueReader struct( Types.StructType ignored, GroupType struct, List> fieldReaders) { // the expected struct is ignored because nested fields are never found when the - List> newFields = Lists.newArrayListWithExpectedSize( - fieldReaders.size()); + List> newFields = + Lists.newArrayListWithExpectedSize(fieldReaders.size()); List types = Lists.newArrayListWithExpectedSize(fieldReaders.size()); List fields = struct.getFields(); for (int i = 0; i < fields.size(); i += 1) { @@ -145,10 +145,10 @@ public ParquetValueReader struct( typesById.put(id, fieldType); } - List expectedFields = expected != null ? - expected.fields() : ImmutableList.of(); - List> reorderedFields = Lists.newArrayListWithExpectedSize( - expectedFields.size()); + List expectedFields = + expected != null ? 
expected.fields() : ImmutableList.of(); + List> reorderedFields = + Lists.newArrayListWithExpectedSize(expectedFields.size()); List types = Lists.newArrayListWithExpectedSize(expectedFields.size()); for (Types.NestedField field : expectedFields) { int id = field.fieldId(); @@ -182,12 +182,16 @@ public ParquetValueReader list( Type elementType = ParquetSchemaUtil.determineListElementType(array); int elementD = type.getMaxDefinitionLevel(path(elementType.getName())) - 1; - return new ArrayReader<>(repeatedD, repeatedR, ParquetValueReaders.option(elementType, elementD, elementReader)); + return new ArrayReader<>( + repeatedD, repeatedR, ParquetValueReaders.option(elementType, elementD, elementReader)); } @Override public ParquetValueReader map( - Types.MapType expectedMap, GroupType map, ParquetValueReader keyReader, ParquetValueReader valueReader) { + Types.MapType expectedMap, + GroupType map, + ParquetValueReader keyReader, + ParquetValueReader valueReader) { GroupType repeatedKeyValue = map.getFields().get(0).asGroupType(); String[] repeatedPath = currentPath(); @@ -199,7 +203,9 @@ public ParquetValueReader map( Type valueType = repeatedKeyValue.getType(1); int valueD = type.getMaxDefinitionLevel(path(valueType.getName())) - 1; - return new MapReader<>(repeatedD, repeatedR, + return new MapReader<>( + repeatedD, + repeatedR, ParquetValueReaders.option(keyType, keyD, keyReader), ParquetValueReaders.option(valueType, valueD, valueReader)); } @@ -225,9 +231,12 @@ public ParquetValueReader primitive( } else { return new UnboxedReader(desc); } - case INT_64: return new UnboxedReader<>(desc); - case TIMESTAMP_MILLIS: return new TimestampMillisReader(desc); - case TIMESTAMP_MICROS: return new TimestampMicrosReader(desc); + case INT_64: + return new UnboxedReader<>(desc); + case TIMESTAMP_MILLIS: + return new TimestampMillisReader(desc); + case TIMESTAMP_MICROS: + return new TimestampMicrosReader(desc); case DECIMAL: DecimalMetadata decimal = primitive.getDecimalMetadata(); switch (primitive.getPrimitiveTypeName()) { @@ -243,7 +252,8 @@ public ParquetValueReader primitive( "Unsupported base type for decimal: " + primitive.getPrimitiveTypeName()); } default: - throw new UnsupportedOperationException("Unsupported type: " + primitive.getOriginalType()); + throw new UnsupportedOperationException( + "Unsupported type: " + primitive.getOriginalType()); } } @@ -283,7 +293,8 @@ private static class DateReader extends PrimitiveReader { @Override public String read(String reuse) { OffsetDateTime day = EPOCH.plusDays(column.nextInteger()); - return String.format("%04d-%02d-%02d", day.getYear(), day.getMonth().getValue(), day.getDayOfMonth()); + return String.format( + "%04d-%02d-%02d", day.getYear(), day.getMonth().getValue(), day.getDayOfMonth()); } } @@ -328,8 +339,11 @@ public String read(String ignored) { private static class MapReader extends RepeatedKeyValueReader, Map, K, V> { private final ReusableEntry nullEntry = new ReusableEntry<>(); - MapReader(int definitionLevel, int repetitionLevel, - ParquetValueReader keyReader, ParquetValueReader valueReader) { + MapReader( + int definitionLevel, + int repetitionLevel, + ParquetValueReader keyReader, + ParquetValueReader valueReader) { super(definitionLevel, repetitionLevel, keyReader, valueReader); } @@ -412,7 +426,8 @@ protected void set(Tuple tuple, int pos, Object value) { try { tuple.set(pos, value); } catch (ExecException e) { - throw new RuntimeException(String.format("Error setting tuple value for pos: %d, value: %s", pos, value), e); + throw new 
RuntimeException( + String.format("Error setting tuple value for pos: %d, value: %s", pos, value), e); } } } diff --git a/pig/src/main/java/org/apache/iceberg/pig/SchemaUtil.java b/pig/src/main/java/org/apache/iceberg/pig/SchemaUtil.java index 108ee2da1f3e..e02cfaf677b8 100644 --- a/pig/src/main/java/org/apache/iceberg/pig/SchemaUtil.java +++ b/pig/src/main/java/org/apache/iceberg/pig/SchemaUtil.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.pig; import java.io.IOException; @@ -32,8 +31,7 @@ public class SchemaUtil { - private SchemaUtil() { - } + private SchemaUtil() {} public static ResourceSchema convert(Schema icebergSchema) throws IOException { ResourceSchema result = new ResourceSchema(); @@ -60,7 +58,8 @@ private static ResourceFieldSchema convert(Type type) throws IOException { return result; } - private static ResourceFieldSchema [] convertFields(List fields) throws IOException { + private static ResourceFieldSchema[] convertFields(List fields) + throws IOException { List result = Lists.newArrayList(); for (Types.NestedField nf : fields) { @@ -72,20 +71,34 @@ private static ResourceFieldSchema convert(Type type) throws IOException { private static byte convertType(Type type) throws IOException { switch (type.typeId()) { - case BOOLEAN: return DataType.BOOLEAN; - case INTEGER: return DataType.INTEGER; - case LONG: return DataType.LONG; - case FLOAT: return DataType.FLOAT; - case DOUBLE: return DataType.DOUBLE; - case TIMESTAMP: return DataType.CHARARRAY; - case DATE: return DataType.CHARARRAY; - case STRING: return DataType.CHARARRAY; - case FIXED: return DataType.BYTEARRAY; - case BINARY: return DataType.BYTEARRAY; - case DECIMAL: return DataType.BIGDECIMAL; - case STRUCT: return DataType.TUPLE; - case LIST: return DataType.BAG; - case MAP: return DataType.MAP; + case BOOLEAN: + return DataType.BOOLEAN; + case INTEGER: + return DataType.INTEGER; + case LONG: + return DataType.LONG; + case FLOAT: + return DataType.FLOAT; + case DOUBLE: + return DataType.DOUBLE; + case TIMESTAMP: + return DataType.CHARARRAY; + case DATE: + return DataType.CHARARRAY; + case STRING: + return DataType.CHARARRAY; + case FIXED: + return DataType.BYTEARRAY; + case BINARY: + return DataType.BYTEARRAY; + case DECIMAL: + return DataType.BIGDECIMAL; + case STRUCT: + return DataType.TUPLE; + case LIST: + return DataType.BAG; + case MAP: + return DataType.MAP; default: throw new FrontendException("Unsupported primitive type:" + type); } @@ -110,7 +123,8 @@ private static ResourceSchema convertComplex(Type type) throws IOException { case LIST: Types.ListType listType = type.asListType(); - ResourceFieldSchema [] elementFieldSchemas = new ResourceFieldSchema[]{convert(listType.elementType())}; + ResourceFieldSchema[] elementFieldSchemas = + new ResourceFieldSchema[] {convert(listType.elementType())}; if (listType.elementType().isStructType()) { result.setFields(elementFieldSchemas); @@ -123,7 +137,7 @@ private static ResourceSchema convertComplex(Type type) throws IOException { tupleSchema.setType(DataType.TUPLE); tupleSchema.setSchema(elementSchema); - result.setFields(new ResourceFieldSchema[]{tupleSchema}); + result.setFields(new ResourceFieldSchema[] {tupleSchema}); } return result; @@ -133,7 +147,7 @@ private static ResourceSchema convertComplex(Type type) throws IOException { if (mapType.keyType().typeId() != Type.TypeID.STRING) { throw new FrontendException("Unsupported map key type: " + mapType.keyType()); } - 
result.setFields(new ResourceFieldSchema[]{convert(mapType.valueType())}); + result.setFields(new ResourceFieldSchema[] {convert(mapType.valueType())}); return result; default: @@ -150,5 +164,4 @@ public static Schema project(Schema schema, List requiredFields) { return new Schema(columns); } - } diff --git a/pig/src/test/java/org/apache/iceberg/pig/SchemaUtilTest.java b/pig/src/test/java/org/apache/iceberg/pig/SchemaUtilTest.java index bb946ae166c6..bddfe4951d7c 100644 --- a/pig/src/test/java/org/apache/iceberg/pig/SchemaUtilTest.java +++ b/pig/src/test/java/org/apache/iceberg/pig/SchemaUtilTest.java @@ -16,9 +16,12 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.pig; +import static org.apache.iceberg.types.Types.NestedField.optional; +import static org.apache.iceberg.types.Types.NestedField.required; +import static org.junit.Assert.assertEquals; + import java.io.IOException; import org.apache.iceberg.Schema; import org.apache.iceberg.types.Types.BinaryType; @@ -37,28 +40,25 @@ import org.assertj.core.api.Assertions; import org.junit.Test; -import static org.apache.iceberg.types.Types.NestedField.optional; -import static org.apache.iceberg.types.Types.NestedField.required; -import static org.junit.Assert.assertEquals; - public class SchemaUtilTest { @Test public void testPrimitive() throws IOException { - Schema icebergSchema = new Schema( - optional(1, "b", BooleanType.get()), - optional(2, "i", IntegerType.get()), - optional(3, "l", LongType.get()), - optional(4, "f", FloatType.get()), - optional(5, "d", DoubleType.get()), - optional(6, "dec", DecimalType.of(0, 2)), - optional(7, "s", StringType.get()), - optional(8, "bi", BinaryType.get()) - ); + Schema icebergSchema = + new Schema( + optional(1, "b", BooleanType.get()), + optional(2, "i", IntegerType.get()), + optional(3, "l", LongType.get()), + optional(4, "f", FloatType.get()), + optional(5, "d", DoubleType.get()), + optional(6, "dec", DecimalType.of(0, 2)), + optional(7, "s", StringType.get()), + optional(8, "bi", BinaryType.get())); ResourceSchema pigSchema = SchemaUtil.convert(icebergSchema); assertEquals( - "b:boolean,i:int,l:long,f:float,d:double,dec:bigdecimal,s:chararray,bi:bytearray", pigSchema.toString()); + "b:boolean,i:int,l:long,f:float,d:double,dec:bigdecimal,s:chararray,bi:bytearray", + pigSchema.toString()); } @Test @@ -67,86 +67,140 @@ public void testComplex() throws IOException { new Schema( optional(1, "bag", ListType.ofOptional(2, BooleanType.get())), optional(3, "map", MapType.ofOptional(4, 5, StringType.get(), DoubleType.get())), - optional(6, "tuple", - StructType.of(optional(7, "i", IntegerType.get()), optional(8, "f", FloatType.get()))) - ), "bag:{(boolean)},map:[double],tuple:(i:int,f:float)", null - ); + optional( + 6, + "tuple", + StructType.of( + optional(7, "i", IntegerType.get()), optional(8, "f", FloatType.get())))), + "bag:{(boolean)},map:[double],tuple:(i:int,f:float)", + null); } @Test public void invalidMap() { - Assertions.assertThatThrownBy(() -> convertToPigSchema(new Schema( - optional(1, "invalid", MapType.ofOptional(2, 3, IntegerType.get(), DoubleType.get())) - ), "", "")) - .isInstanceOf(FrontendException.class) - .hasMessageContaining("Unsupported map key type: int"); + Assertions.assertThatThrownBy( + () -> + convertToPigSchema( + new Schema( + optional( + 1, + "invalid", + MapType.ofOptional(2, 3, IntegerType.get(), DoubleType.get()))), + "", + "")) + .isInstanceOf(FrontendException.class) + 
.hasMessageContaining("Unsupported map key type: int"); } @Test public void nestedMaps() throws IOException { - convertToPigSchema(new Schema( - optional(1, "nested", - MapType.ofOptional(2, 3, StringType.get(), - MapType.ofOptional(4, 5, StringType.get(), - MapType.ofOptional(6, 7, StringType.get(), DecimalType.of(10, 2))))) - ), "nested:[[[bigdecimal]]]", ""); + convertToPigSchema( + new Schema( + optional( + 1, + "nested", + MapType.ofOptional( + 2, + 3, + StringType.get(), + MapType.ofOptional( + 4, + 5, + StringType.get(), + MapType.ofOptional(6, 7, StringType.get(), DecimalType.of(10, 2)))))), + "nested:[[[bigdecimal]]]", + ""); } @Test public void nestedBags() throws IOException { - convertToPigSchema(new Schema( - optional(1, "nested", - ListType.ofOptional(2, - ListType.ofOptional(3, - ListType.ofOptional(4, DoubleType.get())))) - ), "nested:{({({(double)})})}", ""); + convertToPigSchema( + new Schema( + optional( + 1, + "nested", + ListType.ofOptional( + 2, ListType.ofOptional(3, ListType.ofOptional(4, DoubleType.get()))))), + "nested:{({({(double)})})}", + ""); } @Test public void nestedTuples() throws IOException { - convertToPigSchema(new Schema( - optional(1, "first", StructType.of( - optional(2, "second", StructType.of( - optional(3, "third", StructType.of( - optional(4, "val", StringType.get()) - )) - )) - )) - ), "first:(second:(third:(val:chararray)))", ""); + convertToPigSchema( + new Schema( + optional( + 1, + "first", + StructType.of( + optional( + 2, + "second", + StructType.of( + optional( + 3, + "third", + StructType.of(optional(4, "val", StringType.get())))))))), + "first:(second:(third:(val:chararray)))", + ""); } @Test public void complexNested() throws IOException { - convertToPigSchema(new Schema( - optional(1, "t", StructType.of( - optional(2, "b", ListType.ofOptional(3, StructType.of( - optional(4, "i", IntegerType.get()), - optional(5, "s", StringType.get()) - ))) - )), - optional(6, "m1", MapType.ofOptional(7, 8, StringType.get(), StructType.of( - optional(9, "b", ListType.ofOptional(10, BinaryType.get())), - optional(11, "m2", MapType.ofOptional(12, 13, StringType.get(), IntegerType.get())) - ))), - optional(14, "b1", ListType.ofOptional(15, - MapType.ofOptional(16, 17, StringType.get(), - ListType.ofOptional(18, FloatType.get())))) - ), "t:(b:{(i:int,s:chararray)}),m1:[(b:{(bytearray)},m2:[int])],b1:{([{(float)}])}", ""); + convertToPigSchema( + new Schema( + optional( + 1, + "t", + StructType.of( + optional( + 2, + "b", + ListType.ofOptional( + 3, + StructType.of( + optional(4, "i", IntegerType.get()), + optional(5, "s", StringType.get())))))), + optional( + 6, + "m1", + MapType.ofOptional( + 7, + 8, + StringType.get(), + StructType.of( + optional(9, "b", ListType.ofOptional(10, BinaryType.get())), + optional( + 11, + "m2", + MapType.ofOptional(12, 13, StringType.get(), IntegerType.get()))))), + optional( + 14, + "b1", + ListType.ofOptional( + 15, + MapType.ofOptional( + 16, 17, StringType.get(), ListType.ofOptional(18, FloatType.get()))))), + "t:(b:{(i:int,s:chararray)}),m1:[(b:{(bytearray)},m2:[int])],b1:{([{(float)}])}", + ""); } - @Test public void mapConversions() throws IOException { - // consistent behavior for maps conversions. The below test case, correctly does not specify map key types + // consistent behavior for maps conversions. 
The below test case, correctly does not specify map + // key types convertToPigSchema( new Schema( required( - 1, "a", + 1, + "a", MapType.ofRequired( - 2, 3, + 2, + 3, StringType.get(), ListType.ofRequired( - 4, StructType.of( + 4, + StructType.of( required(5, "b", LongType.get()), required(6, "c", StringType.get())))))), "a:[{(b:long,c:chararray)}]", @@ -157,27 +211,35 @@ public void mapConversions() throws IOException { convertToPigSchema( new Schema( StructType.of( - required(1, "a", MapType.ofRequired( - 2, 3, - StringType.get(), - MapType.ofRequired(4, 5, StringType.get(), DoubleType.get()))) - ).fields()), + required( + 1, + "a", + MapType.ofRequired( + 2, + 3, + StringType.get(), + MapType.ofRequired(4, 5, StringType.get(), DoubleType.get())))) + .fields()), "a:[[double]]", "A map key type does not need to be specified"); } @Test public void testTupleInMap() throws IOException { - Schema icebergSchema = new Schema( - optional( - 1, "nested_list", - MapType.ofOptional( - 2, 3, - StringType.get(), - ListType.ofOptional( - 4, StructType.of( - required(5, "id", LongType.get()), - optional(6, "data", StringType.get())))))); + Schema icebergSchema = + new Schema( + optional( + 1, + "nested_list", + MapType.ofOptional( + 2, + 3, + StringType.get(), + ListType.ofOptional( + 4, + StructType.of( + required(5, "id", LongType.get()), + optional(6, "data", StringType.get())))))); ResourceSchema pigSchema = SchemaUtil.convert(icebergSchema); // The output should contain a nested struct within a list within a map, I think. @@ -186,13 +248,13 @@ public void testTupleInMap() throws IOException { @Test public void testLongInBag() throws IOException { - Schema icebergSchema = new Schema( - optional( - 1, "nested_list", - MapType.ofOptional( - 2, 3, - StringType.get(), - ListType.ofRequired(5, LongType.get())))); + Schema icebergSchema = + new Schema( + optional( + 1, + "nested_list", + MapType.ofOptional( + 2, 3, StringType.get(), ListType.ofRequired(5, LongType.get())))); SchemaUtil.convert(icebergSchema); } @@ -202,21 +264,23 @@ public void doubleWrappingTuples() throws IOException { convertToPigSchema( new Schema( StructType.of( - required(1, "a", - ListType.ofRequired(2, StructType.of(required(3, "b", StringType.get())))) - ).fields()), + required( + 1, + "a", + ListType.ofRequired(2, StructType.of(required(3, "b", StringType.get()))))) + .fields()), "a:{(b:chararray)}", "A tuple inside a bag should not be double wrapped"); // struct> -> "(a:{(boolean)}) convertToPigSchema( - new Schema(StructType.of(required(1, "a", ListType.ofRequired(2, BooleanType.get()))).fields()), + new Schema( + StructType.of(required(1, "a", ListType.ofRequired(2, BooleanType.get()))).fields()), "a:{(boolean)}", - "boolean (or anything non-tuple) element inside a bag should be wrapped inside a tuple" - ); + "boolean (or anything non-tuple) element inside a bag should be wrapped inside a tuple"); } - private static void convertToPigSchema(Schema icebergSchema, String expectedPigSchema, String assertMessage) - throws IOException { + private static void convertToPigSchema( + Schema icebergSchema, String expectedPigSchema, String assertMessage) throws IOException { ResourceSchema pigSchema = SchemaUtil.convert(icebergSchema); assertEquals(assertMessage, expectedPigSchema, pigSchema.toString()); } diff --git a/spark/v2.4/spark/src/jmh/java/org/apache/iceberg/spark/SparkBenchmarkUtil.java b/spark/v2.4/spark/src/jmh/java/org/apache/iceberg/spark/SparkBenchmarkUtil.java index 17df7d2cf9d7..d6b0e9c94258 100644 --- 
a/spark/v2.4/spark/src/jmh/java/org/apache/iceberg/spark/SparkBenchmarkUtil.java +++ b/spark/v2.4/spark/src/jmh/java/org/apache/iceberg/spark/SparkBenchmarkUtil.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark; import java.util.List; @@ -32,13 +31,13 @@ public class SparkBenchmarkUtil { - private SparkBenchmarkUtil() { - } + private SparkBenchmarkUtil() {} public static UnsafeProjection projection(Schema expectedSchema, Schema actualSchema) { StructType struct = SparkSchemaUtil.convert(actualSchema); - List refs = JavaConverters.seqAsJavaListConverter(struct.toAttributes()).asJava(); + List refs = + JavaConverters.seqAsJavaListConverter(struct.toAttributes()).asJava(); List attrs = Lists.newArrayListWithExpectedSize(struct.fields().length); List exprs = Lists.newArrayListWithExpectedSize(struct.fields().length); diff --git a/spark/v2.4/spark/src/jmh/java/org/apache/iceberg/spark/data/parquet/SparkParquetReadersFlatDataBenchmark.java b/spark/v2.4/spark/src/jmh/java/org/apache/iceberg/spark/data/parquet/SparkParquetReadersFlatDataBenchmark.java index 00c48e2e247a..9b0cc5c5e27c 100644 --- a/spark/v2.4/spark/src/jmh/java/org/apache/iceberg/spark/data/parquet/SparkParquetReadersFlatDataBenchmark.java +++ b/spark/v2.4/spark/src/jmh/java/org/apache/iceberg/spark/data/parquet/SparkParquetReadersFlatDataBenchmark.java @@ -16,9 +16,11 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.data.parquet; +import static org.apache.iceberg.types.Types.NestedField.optional; +import static org.apache.iceberg.types.Types.NestedField.required; + import java.io.File; import java.io.IOException; import java.util.List; @@ -52,15 +54,11 @@ import org.openjdk.jmh.annotations.Warmup; import org.openjdk.jmh.infra.Blackhole; -import static org.apache.iceberg.types.Types.NestedField.optional; -import static org.apache.iceberg.types.Types.NestedField.required; - /** * A benchmark that evaluates the performance of reading Parquet data with a flat schema using * Iceberg and Spark Parquet readers. * - * To run this benchmark for spark-2.4: - * + *
    To run this benchmark for spark-2.4: * ./gradlew -DsparkVersions=2.4 :iceberg-spark:iceberg-spark-2.4:jmh * -PjmhIncludeRegex=SparkParquetReadersFlatDataBenchmark * -PjmhOutputPath=benchmark/spark-parquet-readers-flat-data-benchmark-result.txt @@ -73,22 +71,23 @@ @BenchmarkMode(Mode.SingleShotTime) public class SparkParquetReadersFlatDataBenchmark { - private static final DynMethods.UnboundMethod APPLY_PROJECTION = DynMethods.builder("apply") - .impl(UnsafeProjection.class, InternalRow.class) - .build(); - private static final Schema SCHEMA = new Schema( - required(1, "longCol", Types.LongType.get()), - required(2, "intCol", Types.IntegerType.get()), - required(3, "floatCol", Types.FloatType.get()), - optional(4, "doubleCol", Types.DoubleType.get()), - optional(5, "decimalCol", Types.DecimalType.of(20, 5)), - optional(6, "dateCol", Types.DateType.get()), - optional(7, "timestampCol", Types.TimestampType.withZone()), - optional(8, "stringCol", Types.StringType.get())); - private static final Schema PROJECTED_SCHEMA = new Schema( - required(1, "longCol", Types.LongType.get()), - optional(5, "decimalCol", Types.DecimalType.of(20, 5)), - optional(8, "stringCol", Types.StringType.get())); + private static final DynMethods.UnboundMethod APPLY_PROJECTION = + DynMethods.builder("apply").impl(UnsafeProjection.class, InternalRow.class).build(); + private static final Schema SCHEMA = + new Schema( + required(1, "longCol", Types.LongType.get()), + required(2, "intCol", Types.IntegerType.get()), + required(3, "floatCol", Types.FloatType.get()), + optional(4, "doubleCol", Types.DoubleType.get()), + optional(5, "decimalCol", Types.DecimalType.of(20, 5)), + optional(6, "dateCol", Types.DateType.get()), + optional(7, "timestampCol", Types.TimestampType.withZone()), + optional(8, "stringCol", Types.StringType.get())); + private static final Schema PROJECTED_SCHEMA = + new Schema( + required(1, "longCol", Types.LongType.get()), + optional(5, "decimalCol", Types.DecimalType.of(20, 5)), + optional(8, "stringCol", Types.StringType.get())); private static final int NUM_RECORDS = 10000000; private File dataFile; @@ -97,10 +96,8 @@ public void setupBenchmark() throws IOException { dataFile = File.createTempFile("parquet-flat-data-benchmark", ".parquet"); dataFile.delete(); List records = RandomData.generateList(SCHEMA, NUM_RECORDS, 0L); - try (FileAppender writer = Parquet.write(Files.localOutput(dataFile)) - .schema(SCHEMA) - .named("benchmark") - .build()) { + try (FileAppender writer = + Parquet.write(Files.localOutput(dataFile)).schema(SCHEMA).named("benchmark").build()) { writer.addAll(records); } } @@ -115,10 +112,11 @@ public void tearDownBenchmark() { @Benchmark @Threads(1) public void readUsingIcebergReader(Blackhole blackHole) throws IOException { - try (CloseableIterable rows = Parquet.read(Files.localInput(dataFile)) - .project(SCHEMA) - .createReaderFunc(type -> SparkParquetReaders.buildReader(SCHEMA, type)) - .build()) { + try (CloseableIterable rows = + Parquet.read(Files.localInput(dataFile)) + .project(SCHEMA) + .createReaderFunc(type -> SparkParquetReaders.buildReader(SCHEMA, type)) + .build()) { for (InternalRow row : rows) { blackHole.consume(row); @@ -129,14 +127,15 @@ public void readUsingIcebergReader(Blackhole blackHole) throws IOException { @Benchmark @Threads(1) public void readUsingIcebergReaderUnsafe(Blackhole blackhole) throws IOException { - try (CloseableIterable rows = Parquet.read(Files.localInput(dataFile)) - .project(SCHEMA) - .createReaderFunc(type -> 
SparkParquetReaders.buildReader(SCHEMA, type)) - .build()) { + try (CloseableIterable rows = + Parquet.read(Files.localInput(dataFile)) + .project(SCHEMA) + .createReaderFunc(type -> SparkParquetReaders.buildReader(SCHEMA, type)) + .build()) { - Iterable unsafeRows = Iterables.transform( - rows, - APPLY_PROJECTION.bind(SparkBenchmarkUtil.projection(SCHEMA, SCHEMA))::invoke); + Iterable unsafeRows = + Iterables.transform( + rows, APPLY_PROJECTION.bind(SparkBenchmarkUtil.projection(SCHEMA, SCHEMA))::invoke); for (InternalRow row : unsafeRows) { blackhole.consume(row); @@ -148,14 +147,15 @@ public void readUsingIcebergReaderUnsafe(Blackhole blackhole) throws IOException @Threads(1) public void readUsingSparkReader(Blackhole blackhole) throws IOException { StructType sparkSchema = SparkSchemaUtil.convert(SCHEMA); - try (CloseableIterable rows = Parquet.read(Files.localInput(dataFile)) - .project(SCHEMA) - .readSupport(new ParquetReadSupport()) - .set("org.apache.spark.sql.parquet.row.requested_schema", sparkSchema.json()) - .set("spark.sql.parquet.binaryAsString", "false") - .set("spark.sql.parquet.int96AsTimestamp", "false") - .callInit() - .build()) { + try (CloseableIterable rows = + Parquet.read(Files.localInput(dataFile)) + .project(SCHEMA) + .readSupport(new ParquetReadSupport()) + .set("org.apache.spark.sql.parquet.row.requested_schema", sparkSchema.json()) + .set("spark.sql.parquet.binaryAsString", "false") + .set("spark.sql.parquet.int96AsTimestamp", "false") + .callInit() + .build()) { for (InternalRow row : rows) { blackhole.consume(row); @@ -166,10 +166,11 @@ public void readUsingSparkReader(Blackhole blackhole) throws IOException { @Benchmark @Threads(1) public void readWithProjectionUsingIcebergReader(Blackhole blackhole) throws IOException { - try (CloseableIterable rows = Parquet.read(Files.localInput(dataFile)) - .project(PROJECTED_SCHEMA) - .createReaderFunc(type -> SparkParquetReaders.buildReader(PROJECTED_SCHEMA, type)) - .build()) { + try (CloseableIterable rows = + Parquet.read(Files.localInput(dataFile)) + .project(PROJECTED_SCHEMA) + .createReaderFunc(type -> SparkParquetReaders.buildReader(PROJECTED_SCHEMA, type)) + .build()) { for (InternalRow row : rows) { blackhole.consume(row); @@ -180,14 +181,18 @@ public void readWithProjectionUsingIcebergReader(Blackhole blackhole) throws IOE @Benchmark @Threads(1) public void readWithProjectionUsingIcebergReaderUnsafe(Blackhole blackhole) throws IOException { - try (CloseableIterable rows = Parquet.read(Files.localInput(dataFile)) - .project(PROJECTED_SCHEMA) - .createReaderFunc(type -> SparkParquetReaders.buildReader(PROJECTED_SCHEMA, type)) - .build()) { - - Iterable unsafeRows = Iterables.transform( - rows, - APPLY_PROJECTION.bind(SparkBenchmarkUtil.projection(PROJECTED_SCHEMA, PROJECTED_SCHEMA))::invoke); + try (CloseableIterable rows = + Parquet.read(Files.localInput(dataFile)) + .project(PROJECTED_SCHEMA) + .createReaderFunc(type -> SparkParquetReaders.buildReader(PROJECTED_SCHEMA, type)) + .build()) { + + Iterable unsafeRows = + Iterables.transform( + rows, + APPLY_PROJECTION.bind( + SparkBenchmarkUtil.projection(PROJECTED_SCHEMA, PROJECTED_SCHEMA)) + ::invoke); for (InternalRow row : unsafeRows) { blackhole.consume(row); @@ -199,14 +204,15 @@ public void readWithProjectionUsingIcebergReaderUnsafe(Blackhole blackhole) thro @Threads(1) public void readWithProjectionUsingSparkReader(Blackhole blackhole) throws IOException { StructType sparkSchema = SparkSchemaUtil.convert(PROJECTED_SCHEMA); - try (CloseableIterable rows = 
Parquet.read(Files.localInput(dataFile)) - .project(PROJECTED_SCHEMA) - .readSupport(new ParquetReadSupport()) - .set("org.apache.spark.sql.parquet.row.requested_schema", sparkSchema.json()) - .set("spark.sql.parquet.binaryAsString", "false") - .set("spark.sql.parquet.int96AsTimestamp", "false") - .callInit() - .build()) { + try (CloseableIterable rows = + Parquet.read(Files.localInput(dataFile)) + .project(PROJECTED_SCHEMA) + .readSupport(new ParquetReadSupport()) + .set("org.apache.spark.sql.parquet.row.requested_schema", sparkSchema.json()) + .set("spark.sql.parquet.binaryAsString", "false") + .set("spark.sql.parquet.int96AsTimestamp", "false") + .callInit() + .build()) { for (InternalRow row : rows) { blackhole.consume(row); diff --git a/spark/v2.4/spark/src/jmh/java/org/apache/iceberg/spark/data/parquet/SparkParquetReadersNestedDataBenchmark.java b/spark/v2.4/spark/src/jmh/java/org/apache/iceberg/spark/data/parquet/SparkParquetReadersNestedDataBenchmark.java index 5473c758fbb4..eafa60b826cc 100644 --- a/spark/v2.4/spark/src/jmh/java/org/apache/iceberg/spark/data/parquet/SparkParquetReadersNestedDataBenchmark.java +++ b/spark/v2.4/spark/src/jmh/java/org/apache/iceberg/spark/data/parquet/SparkParquetReadersNestedDataBenchmark.java @@ -16,9 +16,11 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.data.parquet; +import static org.apache.iceberg.types.Types.NestedField.optional; +import static org.apache.iceberg.types.Types.NestedField.required; + import java.io.File; import java.io.IOException; import java.util.List; @@ -52,15 +54,11 @@ import org.openjdk.jmh.annotations.Warmup; import org.openjdk.jmh.infra.Blackhole; -import static org.apache.iceberg.types.Types.NestedField.optional; -import static org.apache.iceberg.types.Types.NestedField.required; - /** - * A benchmark that evaluates the performance of reading nested Parquet data using - * Iceberg and Spark Parquet readers. + * A benchmark that evaluates the performance of reading nested Parquet data using Iceberg and Spark + * Parquet readers. * - * To run this benchmark for spark-2.4: - * + *
    To run this benchmark for spark-2.4: * ./gradlew -DsparkVersions=2.4 :iceberg-spark:iceberg-spark-2.4:jmh * -PjmhIncludeRegex=SparkParquetReadersNestedDataBenchmark * -PjmhOutputPath=benchmark/spark-parquet-readers-nested-data-benchmark-result.txt @@ -73,22 +71,21 @@ @BenchmarkMode(Mode.SingleShotTime) public class SparkParquetReadersNestedDataBenchmark { - private static final DynMethods.UnboundMethod APPLY_PROJECTION = DynMethods.builder("apply") - .impl(UnsafeProjection.class, InternalRow.class) - .build(); - private static final Schema SCHEMA = new Schema( - required(0, "id", Types.LongType.get()), - optional(4, "nested", Types.StructType.of( - required(1, "col1", Types.StringType.get()), - required(2, "col2", Types.DoubleType.get()), - required(3, "col3", Types.LongType.get()) - )) - ); - private static final Schema PROJECTED_SCHEMA = new Schema( - optional(4, "nested", Types.StructType.of( - required(1, "col1", Types.StringType.get()) - )) - ); + private static final DynMethods.UnboundMethod APPLY_PROJECTION = + DynMethods.builder("apply").impl(UnsafeProjection.class, InternalRow.class).build(); + private static final Schema SCHEMA = + new Schema( + required(0, "id", Types.LongType.get()), + optional( + 4, + "nested", + Types.StructType.of( + required(1, "col1", Types.StringType.get()), + required(2, "col2", Types.DoubleType.get()), + required(3, "col3", Types.LongType.get())))); + private static final Schema PROJECTED_SCHEMA = + new Schema( + optional(4, "nested", Types.StructType.of(required(1, "col1", Types.StringType.get())))); private static final int NUM_RECORDS = 10000000; private File dataFile; @@ -97,10 +94,8 @@ public void setupBenchmark() throws IOException { dataFile = File.createTempFile("parquet-nested-data-benchmark", ".parquet"); dataFile.delete(); List records = RandomData.generateList(SCHEMA, NUM_RECORDS, 0L); - try (FileAppender writer = Parquet.write(Files.localOutput(dataFile)) - .schema(SCHEMA) - .named("benchmark") - .build()) { + try (FileAppender writer = + Parquet.write(Files.localOutput(dataFile)).schema(SCHEMA).named("benchmark").build()) { writer.addAll(records); } } @@ -115,10 +110,11 @@ public void tearDownBenchmark() { @Benchmark @Threads(1) public void readUsingIcebergReader(Blackhole blackhole) throws IOException { - try (CloseableIterable rows = Parquet.read(Files.localInput(dataFile)) - .project(SCHEMA) - .createReaderFunc(type -> SparkParquetReaders.buildReader(SCHEMA, type)) - .build()) { + try (CloseableIterable rows = + Parquet.read(Files.localInput(dataFile)) + .project(SCHEMA) + .createReaderFunc(type -> SparkParquetReaders.buildReader(SCHEMA, type)) + .build()) { for (InternalRow row : rows) { blackhole.consume(row); @@ -129,14 +125,15 @@ public void readUsingIcebergReader(Blackhole blackhole) throws IOException { @Benchmark @Threads(1) public void readUsingIcebergReaderUnsafe(Blackhole blackhole) throws IOException { - try (CloseableIterable rows = Parquet.read(Files.localInput(dataFile)) - .project(SCHEMA) - .createReaderFunc(type -> SparkParquetReaders.buildReader(SCHEMA, type)) - .build()) { + try (CloseableIterable rows = + Parquet.read(Files.localInput(dataFile)) + .project(SCHEMA) + .createReaderFunc(type -> SparkParquetReaders.buildReader(SCHEMA, type)) + .build()) { - Iterable unsafeRows = Iterables.transform( - rows, - APPLY_PROJECTION.bind(SparkBenchmarkUtil.projection(SCHEMA, SCHEMA))::invoke); + Iterable unsafeRows = + Iterables.transform( + rows, APPLY_PROJECTION.bind(SparkBenchmarkUtil.projection(SCHEMA, SCHEMA))::invoke); 
for (InternalRow row : unsafeRows) { blackhole.consume(row); @@ -148,14 +145,15 @@ public void readUsingIcebergReaderUnsafe(Blackhole blackhole) throws IOException @Threads(1) public void readUsingSparkReader(Blackhole blackhole) throws IOException { StructType sparkSchema = SparkSchemaUtil.convert(SCHEMA); - try (CloseableIterable rows = Parquet.read(Files.localInput(dataFile)) - .project(SCHEMA) - .readSupport(new ParquetReadSupport()) - .set("org.apache.spark.sql.parquet.row.requested_schema", sparkSchema.json()) - .set("spark.sql.parquet.binaryAsString", "false") - .set("spark.sql.parquet.int96AsTimestamp", "false") - .callInit() - .build()) { + try (CloseableIterable rows = + Parquet.read(Files.localInput(dataFile)) + .project(SCHEMA) + .readSupport(new ParquetReadSupport()) + .set("org.apache.spark.sql.parquet.row.requested_schema", sparkSchema.json()) + .set("spark.sql.parquet.binaryAsString", "false") + .set("spark.sql.parquet.int96AsTimestamp", "false") + .callInit() + .build()) { for (InternalRow row : rows) { blackhole.consume(row); @@ -166,10 +164,11 @@ public void readUsingSparkReader(Blackhole blackhole) throws IOException { @Benchmark @Threads(1) public void readWithProjectionUsingIcebergReader(Blackhole blackhole) throws IOException { - try (CloseableIterable rows = Parquet.read(Files.localInput(dataFile)) - .project(PROJECTED_SCHEMA) - .createReaderFunc(type -> SparkParquetReaders.buildReader(PROJECTED_SCHEMA, type)) - .build()) { + try (CloseableIterable rows = + Parquet.read(Files.localInput(dataFile)) + .project(PROJECTED_SCHEMA) + .createReaderFunc(type -> SparkParquetReaders.buildReader(PROJECTED_SCHEMA, type)) + .build()) { for (InternalRow row : rows) { blackhole.consume(row); @@ -180,14 +179,18 @@ public void readWithProjectionUsingIcebergReader(Blackhole blackhole) throws IOE @Benchmark @Threads(1) public void readWithProjectionUsingIcebergReaderUnsafe(Blackhole blackhole) throws IOException { - try (CloseableIterable rows = Parquet.read(Files.localInput(dataFile)) - .project(PROJECTED_SCHEMA) - .createReaderFunc(type -> SparkParquetReaders.buildReader(PROJECTED_SCHEMA, type)) - .build()) { - - Iterable unsafeRows = Iterables.transform( - rows, - APPLY_PROJECTION.bind(SparkBenchmarkUtil.projection(PROJECTED_SCHEMA, PROJECTED_SCHEMA))::invoke); + try (CloseableIterable rows = + Parquet.read(Files.localInput(dataFile)) + .project(PROJECTED_SCHEMA) + .createReaderFunc(type -> SparkParquetReaders.buildReader(PROJECTED_SCHEMA, type)) + .build()) { + + Iterable unsafeRows = + Iterables.transform( + rows, + APPLY_PROJECTION.bind( + SparkBenchmarkUtil.projection(PROJECTED_SCHEMA, PROJECTED_SCHEMA)) + ::invoke); for (InternalRow row : unsafeRows) { blackhole.consume(row); @@ -199,14 +202,15 @@ public void readWithProjectionUsingIcebergReaderUnsafe(Blackhole blackhole) thro @Threads(1) public void readWithProjectionUsingSparkReader(Blackhole blackhole) throws IOException { StructType sparkSchema = SparkSchemaUtil.convert(PROJECTED_SCHEMA); - try (CloseableIterable rows = Parquet.read(Files.localInput(dataFile)) - .project(PROJECTED_SCHEMA) - .readSupport(new ParquetReadSupport()) - .set("org.apache.spark.sql.parquet.row.requested_schema", sparkSchema.json()) - .set("spark.sql.parquet.binaryAsString", "false") - .set("spark.sql.parquet.int96AsTimestamp", "false") - .callInit() - .build()) { + try (CloseableIterable rows = + Parquet.read(Files.localInput(dataFile)) + .project(PROJECTED_SCHEMA) + .readSupport(new ParquetReadSupport()) + 
.set("org.apache.spark.sql.parquet.row.requested_schema", sparkSchema.json()) + .set("spark.sql.parquet.binaryAsString", "false") + .set("spark.sql.parquet.int96AsTimestamp", "false") + .callInit() + .build()) { for (InternalRow row : rows) { blackhole.consume(row); diff --git a/spark/v2.4/spark/src/jmh/java/org/apache/iceberg/spark/data/parquet/SparkParquetWritersFlatDataBenchmark.java b/spark/v2.4/spark/src/jmh/java/org/apache/iceberg/spark/data/parquet/SparkParquetWritersFlatDataBenchmark.java index d0b13f0f7f94..c711bfad1a57 100644 --- a/spark/v2.4/spark/src/jmh/java/org/apache/iceberg/spark/data/parquet/SparkParquetWritersFlatDataBenchmark.java +++ b/spark/v2.4/spark/src/jmh/java/org/apache/iceberg/spark/data/parquet/SparkParquetWritersFlatDataBenchmark.java @@ -16,9 +16,11 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.data.parquet; +import static org.apache.iceberg.types.Types.NestedField.optional; +import static org.apache.iceberg.types.Types.NestedField.required; + import java.io.File; import java.io.IOException; import org.apache.iceberg.Files; @@ -45,15 +47,11 @@ import org.openjdk.jmh.annotations.Threads; import org.openjdk.jmh.annotations.Warmup; -import static org.apache.iceberg.types.Types.NestedField.optional; -import static org.apache.iceberg.types.Types.NestedField.required; - /** * A benchmark that evaluates the performance of writing Parquet data with a flat schema using * Iceberg and Spark Parquet writers. * - * To run this benchmark for spark-2.4: - * + *
    To run this benchmark for spark-2.4: * ./gradlew -DsparkVersions=2.4 :iceberg-spark:iceberg-spark-2.4:jmh * -PjmhIncludeRegex=SparkParquetWritersFlatDataBenchmark * -PjmhOutputPath=benchmark/spark-parquet-writers-flat-data-benchmark-result.txt @@ -66,15 +64,16 @@ @BenchmarkMode(Mode.SingleShotTime) public class SparkParquetWritersFlatDataBenchmark { - private static final Schema SCHEMA = new Schema( - required(1, "longCol", Types.LongType.get()), - required(2, "intCol", Types.IntegerType.get()), - required(3, "floatCol", Types.FloatType.get()), - optional(4, "doubleCol", Types.DoubleType.get()), - optional(5, "decimalCol", Types.DecimalType.of(20, 5)), - optional(6, "dateCol", Types.DateType.get()), - optional(7, "timestampCol", Types.TimestampType.withZone()), - optional(8, "stringCol", Types.StringType.get())); + private static final Schema SCHEMA = + new Schema( + required(1, "longCol", Types.LongType.get()), + required(2, "intCol", Types.IntegerType.get()), + required(3, "floatCol", Types.FloatType.get()), + optional(4, "doubleCol", Types.DoubleType.get()), + optional(5, "decimalCol", Types.DecimalType.of(20, 5)), + optional(6, "dateCol", Types.DateType.get()), + optional(7, "timestampCol", Types.TimestampType.withZone()), + optional(8, "stringCol", Types.StringType.get())); private static final int NUM_RECORDS = 1000000; private Iterable rows; private File dataFile; @@ -96,10 +95,13 @@ public void tearDownBenchmark() { @Benchmark @Threads(1) public void writeUsingIcebergWriter() throws IOException { - try (FileAppender writer = Parquet.write(Files.localOutput(dataFile)) - .createWriterFunc(msgType -> SparkParquetWriters.buildWriter(SparkSchemaUtil.convert(SCHEMA), msgType)) - .schema(SCHEMA) - .build()) { + try (FileAppender writer = + Parquet.write(Files.localOutput(dataFile)) + .createWriterFunc( + msgType -> + SparkParquetWriters.buildWriter(SparkSchemaUtil.convert(SCHEMA), msgType)) + .schema(SCHEMA) + .build()) { writer.addAll(rows); } @@ -109,15 +111,16 @@ public void writeUsingIcebergWriter() throws IOException { @Threads(1) public void writeUsingSparkWriter() throws IOException { StructType sparkSchema = SparkSchemaUtil.convert(SCHEMA); - try (FileAppender writer = Parquet.write(Files.localOutput(dataFile)) - .writeSupport(new ParquetWriteSupport()) - .set("org.apache.spark.sql.parquet.row.attributes", sparkSchema.json()) - .set("spark.sql.parquet.writeLegacyFormat", "false") - .set("spark.sql.parquet.binaryAsString", "false") - .set("spark.sql.parquet.int96AsTimestamp", "false") - .set("spark.sql.parquet.outputTimestampType", "TIMESTAMP_MICROS") - .schema(SCHEMA) - .build()) { + try (FileAppender writer = + Parquet.write(Files.localOutput(dataFile)) + .writeSupport(new ParquetWriteSupport()) + .set("org.apache.spark.sql.parquet.row.attributes", sparkSchema.json()) + .set("spark.sql.parquet.writeLegacyFormat", "false") + .set("spark.sql.parquet.binaryAsString", "false") + .set("spark.sql.parquet.int96AsTimestamp", "false") + .set("spark.sql.parquet.outputTimestampType", "TIMESTAMP_MICROS") + .schema(SCHEMA) + .build()) { writer.addAll(rows); } diff --git a/spark/v2.4/spark/src/jmh/java/org/apache/iceberg/spark/data/parquet/SparkParquetWritersNestedDataBenchmark.java b/spark/v2.4/spark/src/jmh/java/org/apache/iceberg/spark/data/parquet/SparkParquetWritersNestedDataBenchmark.java index bae5df1da169..794444d9728e 100644 --- a/spark/v2.4/spark/src/jmh/java/org/apache/iceberg/spark/data/parquet/SparkParquetWritersNestedDataBenchmark.java +++ 
b/spark/v2.4/spark/src/jmh/java/org/apache/iceberg/spark/data/parquet/SparkParquetWritersNestedDataBenchmark.java @@ -16,9 +16,11 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.data.parquet; +import static org.apache.iceberg.types.Types.NestedField.optional; +import static org.apache.iceberg.types.Types.NestedField.required; + import java.io.File; import java.io.IOException; import org.apache.iceberg.Files; @@ -45,15 +47,11 @@ import org.openjdk.jmh.annotations.Threads; import org.openjdk.jmh.annotations.Warmup; -import static org.apache.iceberg.types.Types.NestedField.optional; -import static org.apache.iceberg.types.Types.NestedField.required; - /** - * A benchmark that evaluates the performance of writing nested Parquet data using - * Iceberg and Spark Parquet writers. + * A benchmark that evaluates the performance of writing nested Parquet data using Iceberg and Spark + * Parquet writers. * - * To run this benchmark for spark-2.4: - * + *
    To run this benchmark for spark-2.4: * ./gradlew -DsparkVersions=2.4 :iceberg-spark:iceberg-spark-2.4:jmh * -PjmhIncludeRegex=SparkParquetWritersNestedDataBenchmark * -PjmhOutputPath=benchmark/spark-parquet-writers-nested-data-benchmark-result.txt @@ -66,14 +64,16 @@ @BenchmarkMode(Mode.SingleShotTime) public class SparkParquetWritersNestedDataBenchmark { - private static final Schema SCHEMA = new Schema( - required(0, "id", Types.LongType.get()), - optional(4, "nested", Types.StructType.of( - required(1, "col1", Types.StringType.get()), - required(2, "col2", Types.DoubleType.get()), - required(3, "col3", Types.LongType.get()) - )) - ); + private static final Schema SCHEMA = + new Schema( + required(0, "id", Types.LongType.get()), + optional( + 4, + "nested", + Types.StructType.of( + required(1, "col1", Types.StringType.get()), + required(2, "col2", Types.DoubleType.get()), + required(3, "col3", Types.LongType.get())))); private static final int NUM_RECORDS = 1000000; private Iterable rows; private File dataFile; @@ -95,10 +95,13 @@ public void tearDownBenchmark() { @Benchmark @Threads(1) public void writeUsingIcebergWriter() throws IOException { - try (FileAppender writer = Parquet.write(Files.localOutput(dataFile)) - .createWriterFunc(msgType -> SparkParquetWriters.buildWriter(SparkSchemaUtil.convert(SCHEMA), msgType)) - .schema(SCHEMA) - .build()) { + try (FileAppender writer = + Parquet.write(Files.localOutput(dataFile)) + .createWriterFunc( + msgType -> + SparkParquetWriters.buildWriter(SparkSchemaUtil.convert(SCHEMA), msgType)) + .schema(SCHEMA) + .build()) { writer.addAll(rows); } @@ -108,15 +111,16 @@ public void writeUsingIcebergWriter() throws IOException { @Threads(1) public void writeUsingSparkWriter() throws IOException { StructType sparkSchema = SparkSchemaUtil.convert(SCHEMA); - try (FileAppender writer = Parquet.write(Files.localOutput(dataFile)) - .writeSupport(new ParquetWriteSupport()) - .set("org.apache.spark.sql.parquet.row.attributes", sparkSchema.json()) - .set("spark.sql.parquet.writeLegacyFormat", "false") - .set("spark.sql.parquet.binaryAsString", "false") - .set("spark.sql.parquet.int96AsTimestamp", "false") - .set("spark.sql.parquet.outputTimestampType", "TIMESTAMP_MICROS") - .schema(SCHEMA) - .build()) { + try (FileAppender writer = + Parquet.write(Files.localOutput(dataFile)) + .writeSupport(new ParquetWriteSupport()) + .set("org.apache.spark.sql.parquet.row.attributes", sparkSchema.json()) + .set("spark.sql.parquet.writeLegacyFormat", "false") + .set("spark.sql.parquet.binaryAsString", "false") + .set("spark.sql.parquet.int96AsTimestamp", "false") + .set("spark.sql.parquet.outputTimestampType", "TIMESTAMP_MICROS") + .schema(SCHEMA) + .build()) { writer.addAll(rows); } diff --git a/spark/v2.4/spark/src/jmh/java/org/apache/iceberg/spark/source/Action.java b/spark/v2.4/spark/src/jmh/java/org/apache/iceberg/spark/source/Action.java index 1820a801b2fb..0dbf07285060 100644 --- a/spark/v2.4/spark/src/jmh/java/org/apache/iceberg/spark/source/Action.java +++ b/spark/v2.4/spark/src/jmh/java/org/apache/iceberg/spark/source/Action.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
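For orientation, both SparkParquetWriters benchmarks above exercise the same write path: an Iceberg FileAppender built via Parquet.write(...) that streams Spark InternalRow records into a local file. A minimal sketch of that path, with the file, schema, and rows taken as inputs (the helper class and method names here are illustrative only):

import java.io.File;
import java.io.IOException;
import org.apache.iceberg.Files;
import org.apache.iceberg.Schema;
import org.apache.iceberg.io.FileAppender;
import org.apache.iceberg.parquet.Parquet;
import org.apache.iceberg.spark.SparkSchemaUtil;
import org.apache.iceberg.spark.data.SparkParquetWriters;
import org.apache.spark.sql.catalyst.InternalRow;

class ParquetWriteSketch {
  // Writes the given rows to a local Parquet file through Iceberg's Spark writer,
  // mirroring writeUsingIcebergWriter() in the benchmarks above.
  static void writeRows(File dataFile, Schema schema, Iterable<InternalRow> rows)
      throws IOException {
    try (FileAppender<InternalRow> writer =
        Parquet.write(Files.localOutput(dataFile))
            .createWriterFunc(
                msgType ->
                    SparkParquetWriters.buildWriter(SparkSchemaUtil.convert(schema), msgType))
            .schema(schema)
            .build()) {
      writer.addAll(rows);
    }
  }
}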
*/ - package org.apache.iceberg.spark.source; @FunctionalInterface diff --git a/spark/v2.4/spark/src/jmh/java/org/apache/iceberg/spark/source/IcebergSourceBenchmark.java b/spark/v2.4/spark/src/jmh/java/org/apache/iceberg/spark/source/IcebergSourceBenchmark.java index 0ceedfd0e20d..19bcdd672157 100644 --- a/spark/v2.4/spark/src/jmh/java/org/apache/iceberg/spark/source/IcebergSourceBenchmark.java +++ b/spark/v2.4/spark/src/jmh/java/org/apache/iceberg/spark/source/IcebergSourceBenchmark.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.source; import java.io.IOException; @@ -79,7 +78,8 @@ protected String newTableLocation() { protected String dataLocation() { Map properties = table.properties(); - return properties.getOrDefault(TableProperties.WRITE_DATA_LOCATION, String.format("%s/data", table.location())); + return properties.getOrDefault( + TableProperties.WRITE_DATA_LOCATION, String.format("%s/data", table.location())); } protected void cleanupFiles() throws IOException { @@ -92,12 +92,12 @@ protected void cleanupFiles() throws IOException { } protected void setupSpark(boolean enableDictionaryEncoding) { - SparkSession.Builder builder = SparkSession.builder() - .config("spark.ui.enabled", false); + SparkSession.Builder builder = SparkSession.builder().config("spark.ui.enabled", false); if (!enableDictionaryEncoding) { - builder.config("parquet.dictionary.page.size", "1") - .config("parquet.enable.dictionary", false) - .config(TableProperties.PARQUET_DICT_SIZE_BYTES, "1"); + builder + .config("parquet.dictionary.page.size", "1") + .config("parquet.enable.dictionary", false) + .config(TableProperties.PARQUET_DICT_SIZE_BYTES, "1"); } builder.master("local"); spark = builder.getOrCreate(); @@ -114,13 +114,14 @@ protected void tearDownSpark() { } protected void materialize(Dataset ds) { - ds.queryExecution().toRdd().toJavaRDD().foreach(record -> { }); + ds.queryExecution().toRdd().toJavaRDD().foreach(record -> {}); } protected void appendAsFile(Dataset ds) { // ensure the schema is precise (including nullability) StructType sparkSchema = SparkSchemaUtil.convert(table.schema()); - spark.createDataFrame(ds.rdd(), sparkSchema) + spark + .createDataFrame(ds.rdd(), sparkSchema) .coalesce(1) .write() .format("iceberg") @@ -132,42 +133,49 @@ protected void withSQLConf(Map conf, Action action) { SQLConf sqlConf = SQLConf.get(); Map currentConfValues = Maps.newHashMap(); - conf.keySet().forEach(confKey -> { - if (sqlConf.contains(confKey)) { - String currentConfValue = sqlConf.getConfString(confKey); - currentConfValues.put(confKey, currentConfValue); - } - }); - - conf.forEach((confKey, confValue) -> { - if (SQLConf.staticConfKeys().contains(confKey)) { - throw new RuntimeException("Cannot modify the value of a static config: " + confKey); - } - sqlConf.setConfString(confKey, confValue); - }); + conf.keySet() + .forEach( + confKey -> { + if (sqlConf.contains(confKey)) { + String currentConfValue = sqlConf.getConfString(confKey); + currentConfValues.put(confKey, currentConfValue); + } + }); + + conf.forEach( + (confKey, confValue) -> { + if (SQLConf.staticConfKeys().contains(confKey)) { + throw new RuntimeException("Cannot modify the value of a static config: " + confKey); + } + sqlConf.setConfString(confKey, confValue); + }); try { action.invoke(); } finally { - conf.forEach((confKey, confValue) -> { - if (currentConfValues.containsKey(confKey)) { - sqlConf.setConfString(confKey, currentConfValues.get(confKey)); 
- } else { - sqlConf.unsetConf(confKey); - } - }); + conf.forEach( + (confKey, confValue) -> { + if (currentConfValues.containsKey(confKey)) { + sqlConf.setConfString(confKey, currentConfValues.get(confKey)); + } else { + sqlConf.unsetConf(confKey); + } + }); } } protected void withTableProperties(Map props, Action action) { Map tableProps = table.properties(); Map currentPropValues = Maps.newHashMap(); - props.keySet().forEach(propKey -> { - if (tableProps.containsKey(propKey)) { - String currentPropValue = tableProps.get(propKey); - currentPropValues.put(propKey, currentPropValue); - } - }); + props + .keySet() + .forEach( + propKey -> { + if (tableProps.containsKey(propKey)) { + String currentPropValue = tableProps.get(propKey); + currentPropValues.put(propKey, currentPropValue); + } + }); UpdateProperties updateProperties = table.updateProperties(); props.forEach(updateProperties::set); @@ -177,13 +185,14 @@ protected void withTableProperties(Map props, Action action) { action.invoke(); } finally { UpdateProperties restoreProperties = table.updateProperties(); - props.forEach((propKey, propValue) -> { - if (currentPropValues.containsKey(propKey)) { - restoreProperties.set(propKey, currentPropValues.get(propKey)); - } else { - restoreProperties.remove(propKey); - } - }); + props.forEach( + (propKey, propValue) -> { + if (currentPropValues.containsKey(propKey)) { + restoreProperties.set(propKey, currentPropValues.get(propKey)); + } else { + restoreProperties.remove(propKey); + } + }); restoreProperties.commit(); } } diff --git a/spark/v2.4/spark/src/jmh/java/org/apache/iceberg/spark/source/IcebergSourceFlatDataBenchmark.java b/spark/v2.4/spark/src/jmh/java/org/apache/iceberg/spark/source/IcebergSourceFlatDataBenchmark.java index 9e206321a540..59e6230350d9 100644 --- a/spark/v2.4/spark/src/jmh/java/org/apache/iceberg/spark/source/IcebergSourceFlatDataBenchmark.java +++ b/spark/v2.4/spark/src/jmh/java/org/apache/iceberg/spark/source/IcebergSourceFlatDataBenchmark.java @@ -16,9 +16,11 @@ * specific language governing permissions and limitations * under the License. 
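Both withSQLConf and withTableProperties in the hunk above follow one idiom: remember the current values, apply the overrides, run the action, and restore or unset the keys in a finally block. A stripped-down sketch of that idiom, using a plain Runnable instead of this module's Action interface and omitting the static-conf check:

import java.util.Map;
import org.apache.iceberg.relocated.com.google.common.collect.Maps;
import org.apache.spark.sql.internal.SQLConf;

class SqlConfOverrideSketch {
  // Temporarily applies the given SQLConf overrides around an action, then restores
  // the previous values (or unsets keys that were not set before).
  static void withSqlConf(Map<String, String> overrides, Runnable action) {
    SQLConf sqlConf = SQLConf.get();
    Map<String, String> previous = Maps.newHashMap();
    overrides.forEach(
        (key, value) -> {
          if (sqlConf.contains(key)) {
            previous.put(key, sqlConf.getConfString(key));
          }
          sqlConf.setConfString(key, value);
        });
    try {
      action.run();
    } finally {
      overrides.forEach(
          (key, value) -> {
            if (previous.containsKey(key)) {
              sqlConf.setConfString(key, previous.get(key));
            } else {
              sqlConf.unsetConf(key);
            }
          });
    }
  }
}

Restoring in the finally block keeps a failed benchmark run from leaking overridden session settings into later benchmark methods.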
*/ - package org.apache.iceberg.spark.source; +import static org.apache.iceberg.types.Types.NestedField.optional; +import static org.apache.iceberg.types.Types.NestedField.required; + import java.util.Map; import org.apache.hadoop.conf.Configuration; import org.apache.iceberg.PartitionSpec; @@ -29,9 +31,6 @@ import org.apache.iceberg.relocated.com.google.common.collect.Maps; import org.apache.iceberg.types.Types; -import static org.apache.iceberg.types.Types.NestedField.optional; -import static org.apache.iceberg.types.Types.NestedField.required; - public abstract class IcebergSourceFlatDataBenchmark extends IcebergSourceBenchmark { @Override @@ -41,15 +40,16 @@ protected Configuration initHadoopConf() { @Override protected final Table initTable() { - Schema schema = new Schema( - required(1, "longCol", Types.LongType.get()), - required(2, "intCol", Types.IntegerType.get()), - required(3, "floatCol", Types.FloatType.get()), - optional(4, "doubleCol", Types.DoubleType.get()), - optional(5, "decimalCol", Types.DecimalType.of(20, 5)), - optional(6, "dateCol", Types.DateType.get()), - optional(7, "timestampCol", Types.TimestampType.withZone()), - optional(8, "stringCol", Types.StringType.get())); + Schema schema = + new Schema( + required(1, "longCol", Types.LongType.get()), + required(2, "intCol", Types.IntegerType.get()), + required(3, "floatCol", Types.FloatType.get()), + optional(4, "doubleCol", Types.DoubleType.get()), + optional(5, "decimalCol", Types.DecimalType.of(20, 5)), + optional(6, "dateCol", Types.DateType.get()), + optional(7, "timestampCol", Types.TimestampType.withZone()), + optional(8, "stringCol", Types.StringType.get())); PartitionSpec partitionSpec = PartitionSpec.unpartitioned(); HadoopTables tables = new HadoopTables(hadoopConf()); Map properties = Maps.newHashMap(); diff --git a/spark/v2.4/spark/src/jmh/java/org/apache/iceberg/spark/source/IcebergSourceNestedDataBenchmark.java b/spark/v2.4/spark/src/jmh/java/org/apache/iceberg/spark/source/IcebergSourceNestedDataBenchmark.java index 5a0d9359ec6b..a1c61b9b4de0 100644 --- a/spark/v2.4/spark/src/jmh/java/org/apache/iceberg/spark/source/IcebergSourceNestedDataBenchmark.java +++ b/spark/v2.4/spark/src/jmh/java/org/apache/iceberg/spark/source/IcebergSourceNestedDataBenchmark.java @@ -16,9 +16,11 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.spark.source; +import static org.apache.iceberg.types.Types.NestedField.optional; +import static org.apache.iceberg.types.Types.NestedField.required; + import java.util.Map; import org.apache.hadoop.conf.Configuration; import org.apache.iceberg.PartitionSpec; @@ -29,9 +31,6 @@ import org.apache.iceberg.relocated.com.google.common.collect.Maps; import org.apache.iceberg.types.Types; -import static org.apache.iceberg.types.Types.NestedField.optional; -import static org.apache.iceberg.types.Types.NestedField.required; - public abstract class IcebergSourceNestedDataBenchmark extends IcebergSourceBenchmark { @Override @@ -41,14 +40,16 @@ protected Configuration initHadoopConf() { @Override protected final Table initTable() { - Schema schema = new Schema( - required(0, "id", Types.LongType.get()), - optional(4, "nested", Types.StructType.of( - required(1, "col1", Types.StringType.get()), - required(2, "col2", Types.DoubleType.get()), - required(3, "col3", Types.LongType.get()) - )) - ); + Schema schema = + new Schema( + required(0, "id", Types.LongType.get()), + optional( + 4, + "nested", + Types.StructType.of( + required(1, "col1", Types.StringType.get()), + required(2, "col2", Types.DoubleType.get()), + required(3, "col3", Types.LongType.get())))); PartitionSpec partitionSpec = PartitionSpec.unpartitioned(); HadoopTables tables = new HadoopTables(hadoopConf()); Map properties = Maps.newHashMap(); diff --git a/spark/v2.4/spark/src/jmh/java/org/apache/iceberg/spark/source/IcebergSourceNestedListDataBenchmark.java b/spark/v2.4/spark/src/jmh/java/org/apache/iceberg/spark/source/IcebergSourceNestedListDataBenchmark.java index 369a1507b648..f68b587735dd 100644 --- a/spark/v2.4/spark/src/jmh/java/org/apache/iceberg/spark/source/IcebergSourceNestedListDataBenchmark.java +++ b/spark/v2.4/spark/src/jmh/java/org/apache/iceberg/spark/source/IcebergSourceNestedListDataBenchmark.java @@ -16,9 +16,11 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.spark.source; +import static org.apache.iceberg.types.Types.NestedField.optional; +import static org.apache.iceberg.types.Types.NestedField.required; + import java.util.Map; import org.apache.hadoop.conf.Configuration; import org.apache.iceberg.PartitionSpec; @@ -29,9 +31,6 @@ import org.apache.iceberg.relocated.com.google.common.collect.Maps; import org.apache.iceberg.types.Types; -import static org.apache.iceberg.types.Types.NestedField.optional; -import static org.apache.iceberg.types.Types.NestedField.required; - public abstract class IcebergSourceNestedListDataBenchmark extends IcebergSourceBenchmark { @Override @@ -41,12 +40,19 @@ protected Configuration initHadoopConf() { @Override protected final Table initTable() { - Schema schema = new Schema( - required(0, "id", Types.LongType.get()), - optional(1, "outerlist", Types.ListType.ofOptional(2, - Types.StructType.of(required(3, "innerlist", Types.ListType.ofRequired(4, Types.StringType.get()))) - )) - ); + Schema schema = + new Schema( + required(0, "id", Types.LongType.get()), + optional( + 1, + "outerlist", + Types.ListType.ofOptional( + 2, + Types.StructType.of( + required( + 3, + "innerlist", + Types.ListType.ofRequired(4, Types.StringType.get())))))); PartitionSpec partitionSpec = PartitionSpec.unpartitioned(); HadoopTables tables = new HadoopTables(hadoopConf()); Map properties = Maps.newHashMap(); diff --git a/spark/v2.4/spark/src/jmh/java/org/apache/iceberg/spark/source/WritersBenchmark.java b/spark/v2.4/spark/src/jmh/java/org/apache/iceberg/spark/source/WritersBenchmark.java index 06e00e3ebab7..eace9d3e44a7 100644 --- a/spark/v2.4/spark/src/jmh/java/org/apache/iceberg/spark/source/WritersBenchmark.java +++ b/spark/v2.4/spark/src/jmh/java/org/apache/iceberg/spark/source/WritersBenchmark.java @@ -16,9 +16,11 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.spark.source; +import static org.apache.iceberg.types.Types.NestedField.optional; +import static org.apache.iceberg.types.Types.NestedField.required; + import java.io.IOException; import java.util.Comparator; import java.util.List; @@ -57,23 +59,20 @@ import org.openjdk.jmh.annotations.Threads; import org.openjdk.jmh.infra.Blackhole; -import static org.apache.iceberg.types.Types.NestedField.optional; -import static org.apache.iceberg.types.Types.NestedField.required; - public abstract class WritersBenchmark extends IcebergSourceBenchmark { private static final int NUM_ROWS = 2500000; private static final long TARGET_FILE_SIZE_IN_BYTES = 50L * 1024 * 1024; - private static final Schema SCHEMA = new Schema( - required(1, "longCol", Types.LongType.get()), - required(2, "intCol", Types.IntegerType.get()), - required(3, "floatCol", Types.FloatType.get()), - optional(4, "doubleCol", Types.DoubleType.get()), - optional(5, "decimalCol", Types.DecimalType.of(20, 5)), - optional(6, "timestampCol", Types.TimestampType.withZone()), - optional(7, "stringCol", Types.StringType.get()) - ); + private static final Schema SCHEMA = + new Schema( + required(1, "longCol", Types.LongType.get()), + required(2, "intCol", Types.IntegerType.get()), + required(3, "floatCol", Types.FloatType.get()), + optional(4, "doubleCol", Types.DoubleType.get()), + optional(5, "decimalCol", Types.DecimalType.of(20, 5)), + optional(6, "timestampCol", Types.TimestampType.withZone()), + optional(7, "stringCol", Types.StringType.get())); private Iterable rows; private Iterable positionDeleteRows; @@ -91,7 +90,8 @@ public void setupBenchmark() { data.sort(Comparator.comparingInt(row -> transform.apply(row.getInt(1)))); this.rows = data; - this.positionDeleteRows = RandomData.generateSpark(DeleteSchemaUtil.pathPosSchema(), NUM_ROWS, 0L); + this.positionDeleteRows = + RandomData.generateSpark(DeleteSchemaUtil.pathPosSchema(), NUM_ROWS, 0L); this.unpartitionedSpec = table().specs().get(0); Preconditions.checkArgument(unpartitionedSpec.isUnpartitioned()); @@ -117,9 +117,7 @@ protected final Table initTable() { Table table = tables.create(SCHEMA, spec, properties, newTableLocation()); // add a partitioned spec to the table - table.updateSpec() - .addField(Expressions.bucket("intCol", 32)) - .commit(); + table.updateSpec().addField(Expressions.bucket("intCol", 32)).commit(); return table; } @@ -130,13 +128,14 @@ public void writeUnpartitionedClusteredDataWriter(Blackhole blackhole) throws IO FileIO io = table().io(); OutputFileFactory fileFactory = newFileFactory(); - SparkFileWriterFactory writerFactory = SparkFileWriterFactory.builderFor(table()) - .dataFileFormat(fileFormat()) - .dataSchema(table().schema()) - .build(); + SparkFileWriterFactory writerFactory = + SparkFileWriterFactory.builderFor(table()) + .dataFileFormat(fileFormat()) + .dataSchema(table().schema()) + .build(); - ClusteredDataWriter writer = new ClusteredDataWriter<>( - writerFactory, fileFactory, io, TARGET_FILE_SIZE_IN_BYTES); + ClusteredDataWriter writer = + new ClusteredDataWriter<>(writerFactory, fileFactory, io, TARGET_FILE_SIZE_IN_BYTES); try (ClusteredDataWriter closeableWriter = writer) { for (InternalRow row : rows) { @@ -156,13 +155,14 @@ public void writeUnpartitionedLegacyDataWriter(Blackhole blackhole) throws IOExc Schema writeSchema = table().schema(); StructType sparkWriteType = SparkSchemaUtil.convert(writeSchema); - SparkAppenderFactory appenders = SparkAppenderFactory.builderFor(table(), writeSchema, sparkWriteType) - 
.spec(unpartitionedSpec) - .build(); + SparkAppenderFactory appenders = + SparkAppenderFactory.builderFor(table(), writeSchema, sparkWriteType) + .spec(unpartitionedSpec) + .build(); - TaskWriter writer = new UnpartitionedWriter<>( - unpartitionedSpec, fileFormat(), appenders, - fileFactory, io, TARGET_FILE_SIZE_IN_BYTES); + TaskWriter writer = + new UnpartitionedWriter<>( + unpartitionedSpec, fileFormat(), appenders, fileFactory, io, TARGET_FILE_SIZE_IN_BYTES); try (TaskWriter closableWriter = writer) { for (InternalRow row : rows) { @@ -179,13 +179,14 @@ public void writePartitionedClusteredDataWriter(Blackhole blackhole) throws IOEx FileIO io = table().io(); OutputFileFactory fileFactory = newFileFactory(); - SparkFileWriterFactory writerFactory = SparkFileWriterFactory.builderFor(table()) - .dataFileFormat(fileFormat()) - .dataSchema(table().schema()) - .build(); + SparkFileWriterFactory writerFactory = + SparkFileWriterFactory.builderFor(table()) + .dataFileFormat(fileFormat()) + .dataSchema(table().schema()) + .build(); - ClusteredDataWriter writer = new ClusteredDataWriter<>( - writerFactory, fileFactory, io, TARGET_FILE_SIZE_IN_BYTES); + ClusteredDataWriter writer = + new ClusteredDataWriter<>(writerFactory, fileFactory, io, TARGET_FILE_SIZE_IN_BYTES); PartitionKey partitionKey = new PartitionKey(partitionedSpec, table().schema()); StructType dataSparkType = SparkSchemaUtil.convert(table().schema()); @@ -210,14 +211,21 @@ public void writePartitionedLegacyDataWriter(Blackhole blackhole) throws IOExcep Schema writeSchema = table().schema(); StructType sparkWriteType = SparkSchemaUtil.convert(writeSchema); - SparkAppenderFactory appenders = SparkAppenderFactory.builderFor(table(), writeSchema, sparkWriteType) - .spec(partitionedSpec) - .build(); - - TaskWriter writer = new SparkPartitionedWriter( - partitionedSpec, fileFormat(), appenders, - fileFactory, io, TARGET_FILE_SIZE_IN_BYTES, - writeSchema, sparkWriteType); + SparkAppenderFactory appenders = + SparkAppenderFactory.builderFor(table(), writeSchema, sparkWriteType) + .spec(partitionedSpec) + .build(); + + TaskWriter writer = + new SparkPartitionedWriter( + partitionedSpec, + fileFormat(), + appenders, + fileFactory, + io, + TARGET_FILE_SIZE_IN_BYTES, + writeSchema, + sparkWriteType); try (TaskWriter closableWriter = writer) { for (InternalRow row : rows) { @@ -234,13 +242,14 @@ public void writePartitionedFanoutDataWriter(Blackhole blackhole) throws IOExcep FileIO io = table().io(); OutputFileFactory fileFactory = newFileFactory(); - SparkFileWriterFactory writerFactory = SparkFileWriterFactory.builderFor(table()) - .dataFileFormat(fileFormat()) - .dataSchema(table().schema()) - .build(); + SparkFileWriterFactory writerFactory = + SparkFileWriterFactory.builderFor(table()) + .dataFileFormat(fileFormat()) + .dataSchema(table().schema()) + .build(); - FanoutDataWriter writer = new FanoutDataWriter<>( - writerFactory, fileFactory, io, TARGET_FILE_SIZE_IN_BYTES); + FanoutDataWriter writer = + new FanoutDataWriter<>(writerFactory, fileFactory, io, TARGET_FILE_SIZE_IN_BYTES); PartitionKey partitionKey = new PartitionKey(partitionedSpec, table().schema()); StructType dataSparkType = SparkSchemaUtil.convert(table().schema()); @@ -265,14 +274,21 @@ public void writePartitionedLegacyFanoutDataWriter(Blackhole blackhole) throws I Schema writeSchema = table().schema(); StructType sparkWriteType = SparkSchemaUtil.convert(writeSchema); - SparkAppenderFactory appenders = SparkAppenderFactory.builderFor(table(), writeSchema, sparkWriteType) - 
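The data-writer benchmarks reformatted above all build a SparkFileWriterFactory plus an OutputFileFactory from the table and then push InternalRow records through a writer such as ClusteredDataWriter. A condensed sketch of the unpartitioned clustered path; the write(row, spec, partition) call is assumed from the surrounding benchmark code, and the helper name is illustrative:

package org.apache.iceberg.spark.source; // same package as the benchmark, so SparkFileWriterFactory is accessible

import java.io.IOException;
import org.apache.iceberg.FileFormat;
import org.apache.iceberg.PartitionSpec;
import org.apache.iceberg.Table;
import org.apache.iceberg.io.ClusteredDataWriter;
import org.apache.iceberg.io.OutputFileFactory;
import org.apache.spark.sql.catalyst.InternalRow;

class ClusteredWriteSketch {
  // Writes rows against the table's unpartitioned spec through a ClusteredDataWriter,
  // mirroring writeUnpartitionedClusteredDataWriter() above.
  static void writeUnpartitioned(
      Table table,
      FileFormat format,
      Iterable<InternalRow> rows,
      PartitionSpec unpartitionedSpec,
      long targetFileSizeInBytes)
      throws IOException {
    SparkFileWriterFactory writerFactory =
        SparkFileWriterFactory.builderFor(table)
            .dataFileFormat(format)
            .dataSchema(table.schema())
            .build();
    OutputFileFactory fileFactory =
        OutputFileFactory.builderFor(table, 1, 1).format(format).build();
    ClusteredDataWriter<InternalRow> writer =
        new ClusteredDataWriter<>(writerFactory, fileFactory, table.io(), targetFileSizeInBytes);
    try (ClusteredDataWriter<InternalRow> closeableWriter = writer) {
      for (InternalRow row : rows) {
        closeableWriter.write(row, unpartitionedSpec, null); // unpartitioned: no partition key
      }
    }
  }
}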
.spec(partitionedSpec) - .build(); - - TaskWriter writer = new SparkPartitionedFanoutWriter( - partitionedSpec, fileFormat(), appenders, - fileFactory, io, TARGET_FILE_SIZE_IN_BYTES, - writeSchema, sparkWriteType); + SparkAppenderFactory appenders = + SparkAppenderFactory.builderFor(table(), writeSchema, sparkWriteType) + .spec(partitionedSpec) + .build(); + + TaskWriter writer = + new SparkPartitionedFanoutWriter( + partitionedSpec, + fileFormat(), + appenders, + fileFactory, + io, + TARGET_FILE_SIZE_IN_BYTES, + writeSchema, + sparkWriteType); try (TaskWriter closableWriter = writer) { for (InternalRow row : rows) { @@ -285,20 +301,23 @@ partitionedSpec, fileFormat(), appenders, @Benchmark @Threads(1) - public void writePartitionedClusteredEqualityDeleteWriter(Blackhole blackhole) throws IOException { + public void writePartitionedClusteredEqualityDeleteWriter(Blackhole blackhole) + throws IOException { FileIO io = table().io(); int equalityFieldId = table().schema().findField("longCol").fieldId(); OutputFileFactory fileFactory = newFileFactory(); - SparkFileWriterFactory writerFactory = SparkFileWriterFactory.builderFor(table()) - .dataFileFormat(fileFormat()) - .equalityDeleteRowSchema(table().schema()) - .equalityFieldIds(new int[]{equalityFieldId}) - .build(); + SparkFileWriterFactory writerFactory = + SparkFileWriterFactory.builderFor(table()) + .dataFileFormat(fileFormat()) + .equalityDeleteRowSchema(table().schema()) + .equalityFieldIds(new int[] {equalityFieldId}) + .build(); - ClusteredEqualityDeleteWriter writer = new ClusteredEqualityDeleteWriter<>( - writerFactory, fileFactory, io, TARGET_FILE_SIZE_IN_BYTES); + ClusteredEqualityDeleteWriter writer = + new ClusteredEqualityDeleteWriter<>( + writerFactory, fileFactory, io, TARGET_FILE_SIZE_IN_BYTES); PartitionKey partitionKey = new PartitionKey(partitionedSpec, table().schema()); StructType deleteSparkType = SparkSchemaUtil.convert(table().schema()); @@ -316,16 +335,17 @@ public void writePartitionedClusteredEqualityDeleteWriter(Blackhole blackhole) t @Benchmark @Threads(1) - public void writeUnpartitionedClusteredPositionDeleteWriter(Blackhole blackhole) throws IOException { + public void writeUnpartitionedClusteredPositionDeleteWriter(Blackhole blackhole) + throws IOException { FileIO io = table().io(); OutputFileFactory fileFactory = newFileFactory(); - SparkFileWriterFactory writerFactory = SparkFileWriterFactory.builderFor(table()) - .dataFileFormat(fileFormat()) - .build(); + SparkFileWriterFactory writerFactory = + SparkFileWriterFactory.builderFor(table()).dataFileFormat(fileFormat()).build(); - ClusteredPositionDeleteWriter writer = new ClusteredPositionDeleteWriter<>( - writerFactory, fileFactory, io, TARGET_FILE_SIZE_IN_BYTES); + ClusteredPositionDeleteWriter writer = + new ClusteredPositionDeleteWriter<>( + writerFactory, fileFactory, io, TARGET_FILE_SIZE_IN_BYTES); PositionDelete positionDelete = PositionDelete.create(); try (ClusteredPositionDeleteWriter closeableWriter = writer) { @@ -341,8 +361,6 @@ public void writeUnpartitionedClusteredPositionDeleteWriter(Blackhole blackhole) } private OutputFileFactory newFileFactory() { - return OutputFileFactory.builderFor(table(), 1, 1) - .format(fileFormat()) - .build(); + return OutputFileFactory.builderFor(table(), 1, 1).format(fileFormat()).build(); } } diff --git a/spark/v2.4/spark/src/jmh/java/org/apache/iceberg/spark/source/avro/AvroWritersBenchmark.java b/spark/v2.4/spark/src/jmh/java/org/apache/iceberg/spark/source/avro/AvroWritersBenchmark.java index 
c2909d944079..3cdde8d652e4 100644 --- a/spark/v2.4/spark/src/jmh/java/org/apache/iceberg/spark/source/avro/AvroWritersBenchmark.java +++ b/spark/v2.4/spark/src/jmh/java/org/apache/iceberg/spark/source/avro/AvroWritersBenchmark.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.source.avro; import org.apache.iceberg.FileFormat; @@ -25,8 +24,7 @@ /** * A benchmark that evaluates the performance of various Iceberg writers for Avro data. * - * To run this benchmark for spark-2.4: - * + *
    To run this benchmark for spark-2.4: * ./gradlew -DsparkVersions=2.4 :iceberg-spark:iceberg-spark-2.4:jmh * -PjmhIncludeRegex=AvroWritersBenchmark * -PjmhOutputPath=benchmark/avro-writers-benchmark-result.txt diff --git a/spark/v2.4/spark/src/jmh/java/org/apache/iceberg/spark/source/avro/IcebergSourceFlatAvroDataReadBenchmark.java b/spark/v2.4/spark/src/jmh/java/org/apache/iceberg/spark/source/avro/IcebergSourceFlatAvroDataReadBenchmark.java index ec09149c50fa..fa4c97ce6229 100644 --- a/spark/v2.4/spark/src/jmh/java/org/apache/iceberg/spark/source/avro/IcebergSourceFlatAvroDataReadBenchmark.java +++ b/spark/v2.4/spark/src/jmh/java/org/apache/iceberg/spark/source/avro/IcebergSourceFlatAvroDataReadBenchmark.java @@ -16,9 +16,14 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.source.avro; +import static org.apache.iceberg.TableProperties.DEFAULT_FILE_FORMAT; +import static org.apache.iceberg.TableProperties.SPLIT_OPEN_FILE_COST; +import static org.apache.spark.sql.functions.current_date; +import static org.apache.spark.sql.functions.date_add; +import static org.apache.spark.sql.functions.expr; + import java.io.IOException; import java.util.Map; import org.apache.iceberg.relocated.com.google.common.collect.Maps; @@ -31,18 +36,11 @@ import org.openjdk.jmh.annotations.TearDown; import org.openjdk.jmh.annotations.Threads; -import static org.apache.iceberg.TableProperties.DEFAULT_FILE_FORMAT; -import static org.apache.iceberg.TableProperties.SPLIT_OPEN_FILE_COST; -import static org.apache.spark.sql.functions.current_date; -import static org.apache.spark.sql.functions.date_add; -import static org.apache.spark.sql.functions.expr; - /** - * A benchmark that evaluates the performance of reading Avro data with a flat schema - * using Iceberg and the built-in file source in Spark. + * A benchmark that evaluates the performance of reading Avro data with a flat schema using Iceberg + * and the built-in file source in Spark. * - * To run this benchmark for spark-2.4: - * + *
    To run this benchmark for spark-2.4: * ./gradlew -DsparkVersions=2.4 :iceberg-spark:iceberg-spark-2.4:jmh * -PjmhIncludeRegex=IcebergSourceFlatAvroDataReadBenchmark * -PjmhOutputPath=benchmark/iceberg-source-flat-avro-data-read-benchmark-result.txt @@ -70,11 +68,13 @@ public void tearDownBenchmark() throws IOException { public void readIceberg() { Map tableProperties = Maps.newHashMap(); tableProperties.put(SPLIT_OPEN_FILE_COST, Integer.toString(128 * 1024 * 1024)); - withTableProperties(tableProperties, () -> { - String tableLocation = table().location(); - Dataset df = spark().read().format("iceberg").load(tableLocation); - materialize(df); - }); + withTableProperties( + tableProperties, + () -> { + String tableLocation = table().location(); + Dataset df = spark().read().format("iceberg").load(tableLocation); + materialize(df); + }); } @Benchmark @@ -82,10 +82,12 @@ public void readIceberg() { public void readFileSource() { Map conf = Maps.newHashMap(); conf.put(SQLConf.FILES_OPEN_COST_IN_BYTES().key(), Integer.toString(128 * 1024 * 1024)); - withSQLConf(conf, () -> { - Dataset df = spark().read().format("avro").load(dataLocation()); - materialize(df); - }); + withSQLConf( + conf, + () -> { + Dataset df = spark().read().format("avro").load(dataLocation()); + materialize(df); + }); } @Benchmark @@ -93,11 +95,13 @@ public void readFileSource() { public void readWithProjectionIceberg() { Map tableProperties = Maps.newHashMap(); tableProperties.put(SPLIT_OPEN_FILE_COST, Integer.toString(128 * 1024 * 1024)); - withTableProperties(tableProperties, () -> { - String tableLocation = table().location(); - Dataset df = spark().read().format("iceberg").load(tableLocation).select("longCol"); - materialize(df); - }); + withTableProperties( + tableProperties, + () -> { + String tableLocation = table().location(); + Dataset df = spark().read().format("iceberg").load(tableLocation).select("longCol"); + materialize(df); + }); } @Benchmark @@ -105,28 +109,34 @@ public void readWithProjectionIceberg() { public void readWithProjectionFileSource() { Map conf = Maps.newHashMap(); conf.put(SQLConf.FILES_OPEN_COST_IN_BYTES().key(), Integer.toString(128 * 1024 * 1024)); - withSQLConf(conf, () -> { - Dataset df = spark().read().format("avro").load(dataLocation()).select("longCol"); - materialize(df); - }); + withSQLConf( + conf, + () -> { + Dataset df = spark().read().format("avro").load(dataLocation()).select("longCol"); + materialize(df); + }); } private void appendData() { Map tableProperties = Maps.newHashMap(); tableProperties.put(DEFAULT_FILE_FORMAT, "avro"); - withTableProperties(tableProperties, () -> { - for (int fileNum = 1; fileNum <= NUM_FILES; fileNum++) { - Dataset df = spark().range(NUM_ROWS) - .withColumnRenamed("id", "longCol") - .withColumn("intCol", expr("CAST(longCol AS INT)")) - .withColumn("floatCol", expr("CAST(longCol AS FLOAT)")) - .withColumn("doubleCol", expr("CAST(longCol AS DOUBLE)")) - .withColumn("decimalCol", expr("CAST(longCol AS DECIMAL(20, 5))")) - .withColumn("dateCol", date_add(current_date(), fileNum)) - .withColumn("timestampCol", expr("TO_TIMESTAMP(dateCol)")) - .withColumn("stringCol", expr("CAST(dateCol AS STRING)")); - appendAsFile(df); - } - }); + withTableProperties( + tableProperties, + () -> { + for (int fileNum = 1; fileNum <= NUM_FILES; fileNum++) { + Dataset df = + spark() + .range(NUM_ROWS) + .withColumnRenamed("id", "longCol") + .withColumn("intCol", expr("CAST(longCol AS INT)")) + .withColumn("floatCol", expr("CAST(longCol AS FLOAT)")) + 
.withColumn("doubleCol", expr("CAST(longCol AS DOUBLE)")) + .withColumn("decimalCol", expr("CAST(longCol AS DECIMAL(20, 5))")) + .withColumn("dateCol", date_add(current_date(), fileNum)) + .withColumn("timestampCol", expr("TO_TIMESTAMP(dateCol)")) + .withColumn("stringCol", expr("CAST(dateCol AS STRING)")); + appendAsFile(df); + } + }); } } diff --git a/spark/v2.4/spark/src/jmh/java/org/apache/iceberg/spark/source/avro/IcebergSourceNestedAvroDataReadBenchmark.java b/spark/v2.4/spark/src/jmh/java/org/apache/iceberg/spark/source/avro/IcebergSourceNestedAvroDataReadBenchmark.java index 2f2da308afd7..c08fa5c50cd4 100644 --- a/spark/v2.4/spark/src/jmh/java/org/apache/iceberg/spark/source/avro/IcebergSourceNestedAvroDataReadBenchmark.java +++ b/spark/v2.4/spark/src/jmh/java/org/apache/iceberg/spark/source/avro/IcebergSourceNestedAvroDataReadBenchmark.java @@ -16,9 +16,14 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.source.avro; +import static org.apache.iceberg.TableProperties.DEFAULT_FILE_FORMAT; +import static org.apache.iceberg.TableProperties.SPLIT_OPEN_FILE_COST; +import static org.apache.spark.sql.functions.expr; +import static org.apache.spark.sql.functions.lit; +import static org.apache.spark.sql.functions.struct; + import java.io.IOException; import java.util.Map; import org.apache.iceberg.relocated.com.google.common.collect.Maps; @@ -31,19 +36,11 @@ import org.openjdk.jmh.annotations.TearDown; import org.openjdk.jmh.annotations.Threads; -import static org.apache.iceberg.TableProperties.DEFAULT_FILE_FORMAT; -import static org.apache.iceberg.TableProperties.SPLIT_OPEN_FILE_COST; -import static org.apache.spark.sql.functions.expr; -import static org.apache.spark.sql.functions.lit; -import static org.apache.spark.sql.functions.struct; - - /** - * A benchmark that evaluates the performance of reading Avro data with a flat schema - * using Iceberg and the built-in file source in Spark. + * A benchmark that evaluates the performance of reading Avro data with a flat schema using Iceberg + * and the built-in file source in Spark. * - * To run this benchmark for spark-2.4: - * + *
    To run this benchmark for spark-2.4: * ./gradlew -DsparkVersions=2.4 :iceberg-spark:iceberg-spark-2.4:jmh * -PjmhIncludeRegex=IcebergSourceNestedAvroDataReadBenchmark * -PjmhOutputPath=benchmark/iceberg-source-nested-avro-data-read-benchmark-result.txt @@ -71,11 +68,13 @@ public void tearDownBenchmark() throws IOException { public void readIceberg() { Map tableProperties = Maps.newHashMap(); tableProperties.put(SPLIT_OPEN_FILE_COST, Integer.toString(128 * 1024 * 1024)); - withTableProperties(tableProperties, () -> { - String tableLocation = table().location(); - Dataset df = spark().read().format("iceberg").load(tableLocation); - materialize(df); - }); + withTableProperties( + tableProperties, + () -> { + String tableLocation = table().location(); + Dataset df = spark().read().format("iceberg").load(tableLocation); + materialize(df); + }); } @Benchmark @@ -83,10 +82,12 @@ public void readIceberg() { public void readFileSource() { Map conf = Maps.newHashMap(); conf.put(SQLConf.FILES_OPEN_COST_IN_BYTES().key(), Integer.toString(128 * 1024 * 1024)); - withSQLConf(conf, () -> { - Dataset df = spark().read().format("avro").load(dataLocation()); - materialize(df); - }); + withSQLConf( + conf, + () -> { + Dataset df = spark().read().format("avro").load(dataLocation()); + materialize(df); + }); } @Benchmark @@ -94,11 +95,14 @@ public void readFileSource() { public void readWithProjectionIceberg() { Map tableProperties = Maps.newHashMap(); tableProperties.put(SPLIT_OPEN_FILE_COST, Integer.toString(128 * 1024 * 1024)); - withTableProperties(tableProperties, () -> { - String tableLocation = table().location(); - Dataset df = spark().read().format("iceberg").load(tableLocation).select("nested.col3"); - materialize(df); - }); + withTableProperties( + tableProperties, + () -> { + String tableLocation = table().location(); + Dataset df = + spark().read().format("iceberg").load(tableLocation).select("nested.col3"); + materialize(df); + }); } @Benchmark @@ -106,27 +110,33 @@ public void readWithProjectionIceberg() { public void readWithProjectionFileSource() { Map conf = Maps.newHashMap(); conf.put(SQLConf.FILES_OPEN_COST_IN_BYTES().key(), Integer.toString(128 * 1024 * 1024)); - withSQLConf(conf, () -> { - Dataset df = spark().read().format("avro").load(dataLocation()).select("nested.col3"); - materialize(df); - }); + withSQLConf( + conf, + () -> { + Dataset df = + spark().read().format("avro").load(dataLocation()).select("nested.col3"); + materialize(df); + }); } private void appendData() { Map tableProperties = Maps.newHashMap(); tableProperties.put(DEFAULT_FILE_FORMAT, "avro"); - withTableProperties(tableProperties, () -> { - for (int fileNum = 1; fileNum <= NUM_FILES; fileNum++) { - Dataset df = spark().range(NUM_ROWS) - .withColumn( - "nested", - struct( - expr("CAST(id AS string) AS col1"), - expr("CAST(id AS double) AS col2"), - lit(fileNum).cast("long").as("col3") - )); - appendAsFile(df); - } - }); + withTableProperties( + tableProperties, + () -> { + for (int fileNum = 1; fileNum <= NUM_FILES; fileNum++) { + Dataset df = + spark() + .range(NUM_ROWS) + .withColumn( + "nested", + struct( + expr("CAST(id AS string) AS col1"), + expr("CAST(id AS double) AS col2"), + lit(fileNum).cast("long").as("col3"))); + appendAsFile(df); + } + }); } } diff --git a/spark/v2.4/spark/src/jmh/java/org/apache/iceberg/spark/source/orc/IcebergSourceFlatORCDataBenchmark.java b/spark/v2.4/spark/src/jmh/java/org/apache/iceberg/spark/source/orc/IcebergSourceFlatORCDataBenchmark.java index 329c9ffe7738..d0fdd8915780 
100644 --- a/spark/v2.4/spark/src/jmh/java/org/apache/iceberg/spark/source/orc/IcebergSourceFlatORCDataBenchmark.java +++ b/spark/v2.4/spark/src/jmh/java/org/apache/iceberg/spark/source/orc/IcebergSourceFlatORCDataBenchmark.java @@ -16,9 +16,11 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.source.orc; +import static org.apache.iceberg.types.Types.NestedField.optional; +import static org.apache.iceberg.types.Types.NestedField.required; + import java.util.Map; import org.apache.hadoop.conf.Configuration; import org.apache.iceberg.PartitionSpec; @@ -30,13 +32,10 @@ import org.apache.iceberg.spark.source.IcebergSourceBenchmark; import org.apache.iceberg.types.Types; -import static org.apache.iceberg.types.Types.NestedField.optional; -import static org.apache.iceberg.types.Types.NestedField.required; - - /** - * Same as {@link org.apache.iceberg.spark.source.IcebergSourceFlatDataBenchmark} but we disable the Timestamp with - * zone type for ORC performance tests as Spark native reader does not support ORC's TIMESTAMP_INSTANT type + * Same as {@link org.apache.iceberg.spark.source.IcebergSourceFlatDataBenchmark} but we disable the + * Timestamp with zone type for ORC performance tests as Spark native reader does not support ORC's + * TIMESTAMP_INSTANT type */ public abstract class IcebergSourceFlatORCDataBenchmark extends IcebergSourceBenchmark { @@ -47,17 +46,19 @@ protected Configuration initHadoopConf() { @Override protected final Table initTable() { - Schema schema = new Schema( - required(1, "longCol", Types.LongType.get()), - required(2, "intCol", Types.IntegerType.get()), - required(3, "floatCol", Types.FloatType.get()), - optional(4, "doubleCol", Types.DoubleType.get()), - optional(5, "decimalCol", Types.DecimalType.of(20, 5)), - optional(6, "dateCol", Types.DateType.get()), - // Disable timestamp column for ORC performance tests as Spark native reader does not support ORC's - // TIMESTAMP_INSTANT type - // optional(7, "timestampCol", Types.TimestampType.withZone()), - optional(8, "stringCol", Types.StringType.get())); + Schema schema = + new Schema( + required(1, "longCol", Types.LongType.get()), + required(2, "intCol", Types.IntegerType.get()), + required(3, "floatCol", Types.FloatType.get()), + optional(4, "doubleCol", Types.DoubleType.get()), + optional(5, "decimalCol", Types.DecimalType.of(20, 5)), + optional(6, "dateCol", Types.DateType.get()), + // Disable timestamp column for ORC performance tests as Spark native reader does not + // support ORC's + // TIMESTAMP_INSTANT type + // optional(7, "timestampCol", Types.TimestampType.withZone()), + optional(8, "stringCol", Types.StringType.get())); PartitionSpec partitionSpec = PartitionSpec.unpartitioned(); HadoopTables tables = new HadoopTables(hadoopConf()); Map properties = Maps.newHashMap(); diff --git a/spark/v2.4/spark/src/jmh/java/org/apache/iceberg/spark/source/orc/IcebergSourceFlatORCDataReadBenchmark.java b/spark/v2.4/spark/src/jmh/java/org/apache/iceberg/spark/source/orc/IcebergSourceFlatORCDataReadBenchmark.java index 52ef50dfdc9e..12accf7b76ed 100644 --- a/spark/v2.4/spark/src/jmh/java/org/apache/iceberg/spark/source/orc/IcebergSourceFlatORCDataReadBenchmark.java +++ b/spark/v2.4/spark/src/jmh/java/org/apache/iceberg/spark/source/orc/IcebergSourceFlatORCDataReadBenchmark.java @@ -16,9 +16,14 @@ * specific language governing permissions and limitations * under the License. 
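Each of these source benchmarks materializes its table through initTable(), creating an unpartitioned HadoopTables table from a schema, as in the hunks above. A minimal sketch of that setup, with the configuration, schema, and location treated as inputs (the helper name is illustrative):

import java.util.Map;
import org.apache.hadoop.conf.Configuration;
import org.apache.iceberg.PartitionSpec;
import org.apache.iceberg.Schema;
import org.apache.iceberg.Table;
import org.apache.iceberg.hadoop.HadoopTables;
import org.apache.iceberg.relocated.com.google.common.collect.Maps;

class InitTableSketch {
  // Creates an unpartitioned table at the given location, mirroring initTable()
  // in the benchmark base classes above.
  static Table createUnpartitioned(Configuration conf, Schema schema, String tableLocation) {
    PartitionSpec spec = PartitionSpec.unpartitioned();
    HadoopTables tables = new HadoopTables(conf);
    Map<String, String> properties = Maps.newHashMap();
    return tables.create(schema, spec, properties, tableLocation);
  }
}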
*/ - package org.apache.iceberg.spark.source.orc; +import static org.apache.iceberg.TableProperties.DEFAULT_FILE_FORMAT; +import static org.apache.iceberg.TableProperties.SPLIT_OPEN_FILE_COST; +import static org.apache.spark.sql.functions.current_date; +import static org.apache.spark.sql.functions.date_add; +import static org.apache.spark.sql.functions.expr; + import java.io.IOException; import java.util.Map; import org.apache.iceberg.relocated.com.google.common.collect.Maps; @@ -31,18 +36,11 @@ import org.openjdk.jmh.annotations.TearDown; import org.openjdk.jmh.annotations.Threads; -import static org.apache.iceberg.TableProperties.DEFAULT_FILE_FORMAT; -import static org.apache.iceberg.TableProperties.SPLIT_OPEN_FILE_COST; -import static org.apache.spark.sql.functions.current_date; -import static org.apache.spark.sql.functions.date_add; -import static org.apache.spark.sql.functions.expr; - /** - * A benchmark that evaluates the performance of reading ORC data with a flat schema - * using Iceberg and the built-in file source in Spark. + * A benchmark that evaluates the performance of reading ORC data with a flat schema using Iceberg + * and the built-in file source in Spark. * - * To run this benchmark for spark-2.4: - * + *
    To run this benchmark for spark-2.4: * ./gradlew -DsparkVersions=2.4 :iceberg-spark:iceberg-spark-2.4:jmh * -PjmhIncludeRegex=IcebergSourceFlatORCDataReadBenchmark * -PjmhOutputPath=benchmark/iceberg-source-flat-orc-data-read-benchmark-result.txt @@ -70,11 +68,13 @@ public void tearDownBenchmark() throws IOException { public void readIcebergNonVectorized() { Map tableProperties = Maps.newHashMap(); tableProperties.put(SPLIT_OPEN_FILE_COST, Integer.toString(128 * 1024 * 1024)); - withTableProperties(tableProperties, () -> { - String tableLocation = table().location(); - Dataset df = spark().read().format("iceberg").load(tableLocation); - materialize(df); - }); + withTableProperties( + tableProperties, + () -> { + String tableLocation = table().location(); + Dataset df = spark().read().format("iceberg").load(tableLocation); + materialize(df); + }); } @Benchmark @@ -82,12 +82,18 @@ public void readIcebergNonVectorized() { public void readIcebergVectorized() { Map tableProperties = Maps.newHashMap(); tableProperties.put(SPLIT_OPEN_FILE_COST, Integer.toString(128 * 1024 * 1024)); - withTableProperties(tableProperties, () -> { - String tableLocation = table().location(); - Dataset df = spark().read().option(SparkReadOptions.VECTORIZATION_ENABLED, "true") - .format("iceberg").load(tableLocation); - materialize(df); - }); + withTableProperties( + tableProperties, + () -> { + String tableLocation = table().location(); + Dataset df = + spark() + .read() + .option(SparkReadOptions.VECTORIZATION_ENABLED, "true") + .format("iceberg") + .load(tableLocation); + materialize(df); + }); } @Benchmark @@ -96,10 +102,12 @@ public void readFileSourceVectorized() { Map conf = Maps.newHashMap(); conf.put(SQLConf.ORC_VECTORIZED_READER_ENABLED().key(), "true"); conf.put(SQLConf.FILES_OPEN_COST_IN_BYTES().key(), Integer.toString(128 * 1024 * 1024)); - withSQLConf(conf, () -> { - Dataset df = spark().read().orc(dataLocation()); - materialize(df); - }); + withSQLConf( + conf, + () -> { + Dataset df = spark().read().orc(dataLocation()); + materialize(df); + }); } @Benchmark @@ -108,10 +116,12 @@ public void readFileSourceNonVectorized() { Map conf = Maps.newHashMap(); conf.put(SQLConf.ORC_VECTORIZED_READER_ENABLED().key(), "false"); conf.put(SQLConf.FILES_OPEN_COST_IN_BYTES().key(), Integer.toString(128 * 1024 * 1024)); - withSQLConf(conf, () -> { - Dataset df = spark().read().orc(dataLocation()); - materialize(df); - }); + withSQLConf( + conf, + () -> { + Dataset df = spark().read().orc(dataLocation()); + materialize(df); + }); } @Benchmark @@ -119,11 +129,13 @@ public void readFileSourceNonVectorized() { public void readWithProjectionIcebergNonVectorized() { Map tableProperties = Maps.newHashMap(); tableProperties.put(SPLIT_OPEN_FILE_COST, Integer.toString(128 * 1024 * 1024)); - withTableProperties(tableProperties, () -> { - String tableLocation = table().location(); - Dataset df = spark().read().format("iceberg").load(tableLocation).select("longCol"); - materialize(df); - }); + withTableProperties( + tableProperties, + () -> { + String tableLocation = table().location(); + Dataset df = spark().read().format("iceberg").load(tableLocation).select("longCol"); + materialize(df); + }); } @Benchmark @@ -131,25 +143,33 @@ public void readWithProjectionIcebergNonVectorized() { public void readWithProjectionIcebergVectorized() { Map tableProperties = Maps.newHashMap(); tableProperties.put(SPLIT_OPEN_FILE_COST, Integer.toString(128 * 1024 * 1024)); - withTableProperties(tableProperties, () -> { - String tableLocation = 
table().location(); - Dataset df = spark().read().option(SparkReadOptions.VECTORIZATION_ENABLED, "true") - .format("iceberg").load(tableLocation).select("longCol"); - materialize(df); - }); + withTableProperties( + tableProperties, + () -> { + String tableLocation = table().location(); + Dataset df = + spark() + .read() + .option(SparkReadOptions.VECTORIZATION_ENABLED, "true") + .format("iceberg") + .load(tableLocation) + .select("longCol"); + materialize(df); + }); } - @Benchmark @Threads(1) public void readWithProjectionFileSourceVectorized() { Map conf = Maps.newHashMap(); conf.put(SQLConf.ORC_VECTORIZED_READER_ENABLED().key(), "true"); conf.put(SQLConf.FILES_OPEN_COST_IN_BYTES().key(), Integer.toString(128 * 1024 * 1024)); - withSQLConf(conf, () -> { - Dataset df = spark().read().orc(dataLocation()).select("longCol"); - materialize(df); - }); + withSQLConf( + conf, + () -> { + Dataset df = spark().read().orc(dataLocation()).select("longCol"); + materialize(df); + }); } @Benchmark @@ -158,27 +178,33 @@ public void readWithProjectionFileSourceNonVectorized() { Map conf = Maps.newHashMap(); conf.put(SQLConf.ORC_VECTORIZED_READER_ENABLED().key(), "false"); conf.put(SQLConf.FILES_OPEN_COST_IN_BYTES().key(), Integer.toString(128 * 1024 * 1024)); - withSQLConf(conf, () -> { - Dataset df = spark().read().orc(dataLocation()).select("longCol"); - materialize(df); - }); + withSQLConf( + conf, + () -> { + Dataset df = spark().read().orc(dataLocation()).select("longCol"); + materialize(df); + }); } private void appendData() { Map tableProperties = Maps.newHashMap(); tableProperties.put(DEFAULT_FILE_FORMAT, "orc"); - withTableProperties(tableProperties, () -> { - for (int fileNum = 1; fileNum <= NUM_FILES; fileNum++) { - Dataset df = spark().range(NUM_ROWS) - .withColumnRenamed("id", "longCol") - .withColumn("intCol", expr("CAST(longCol AS INT)")) - .withColumn("floatCol", expr("CAST(longCol AS FLOAT)")) - .withColumn("doubleCol", expr("CAST(longCol AS DOUBLE)")) - .withColumn("decimalCol", expr("CAST(longCol AS DECIMAL(20, 5))")) - .withColumn("dateCol", date_add(current_date(), fileNum)) - .withColumn("stringCol", expr("CAST(dateCol AS STRING)")); - appendAsFile(df); - } - }); + withTableProperties( + tableProperties, + () -> { + for (int fileNum = 1; fileNum <= NUM_FILES; fileNum++) { + Dataset df = + spark() + .range(NUM_ROWS) + .withColumnRenamed("id", "longCol") + .withColumn("intCol", expr("CAST(longCol AS INT)")) + .withColumn("floatCol", expr("CAST(longCol AS FLOAT)")) + .withColumn("doubleCol", expr("CAST(longCol AS DOUBLE)")) + .withColumn("decimalCol", expr("CAST(longCol AS DECIMAL(20, 5))")) + .withColumn("dateCol", date_add(current_date(), fileNum)) + .withColumn("stringCol", expr("CAST(dateCol AS STRING)")); + appendAsFile(df); + } + }); } } diff --git a/spark/v2.4/spark/src/jmh/java/org/apache/iceberg/spark/source/orc/IcebergSourceNestedListORCDataWriteBenchmark.java b/spark/v2.4/spark/src/jmh/java/org/apache/iceberg/spark/source/orc/IcebergSourceNestedListORCDataWriteBenchmark.java index fd1c5059c406..f57d6764c8ac 100644 --- a/spark/v2.4/spark/src/jmh/java/org/apache/iceberg/spark/source/orc/IcebergSourceNestedListORCDataWriteBenchmark.java +++ b/spark/v2.4/spark/src/jmh/java/org/apache/iceberg/spark/source/orc/IcebergSourceNestedListORCDataWriteBenchmark.java @@ -16,9 +16,12 @@ * specific language governing permissions and limitations * under the License. 
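The ORC read benchmarks above switch Iceberg's vectorized reader on through a read option rather than a Spark SQL conf. A short sketch of that read, assuming an active SparkSession and a table location (names are illustrative):

import org.apache.iceberg.spark.SparkReadOptions;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SparkSession;

class VectorizedReadSketch {
  // Loads an Iceberg table with vectorized reads enabled via a per-read option,
  // as in readIcebergVectorized() above.
  static Dataset<Row> readVectorized(SparkSession spark, String tableLocation) {
    return spark
        .read()
        .option(SparkReadOptions.VECTORIZATION_ENABLED, "true")
        .format("iceberg")
        .load(tableLocation);
  }
}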
*/ - package org.apache.iceberg.spark.source.orc; +import static org.apache.spark.sql.functions.array_repeat; +import static org.apache.spark.sql.functions.expr; +import static org.apache.spark.sql.functions.struct; + import java.io.IOException; import java.util.Map; import org.apache.iceberg.relocated.com.google.common.collect.Maps; @@ -32,22 +35,18 @@ import org.openjdk.jmh.annotations.TearDown; import org.openjdk.jmh.annotations.Threads; -import static org.apache.spark.sql.functions.array_repeat; -import static org.apache.spark.sql.functions.expr; -import static org.apache.spark.sql.functions.struct; - /** - * A benchmark that evaluates the performance of writing nested Parquet data using Iceberg - * and the built-in file source in Spark. + * A benchmark that evaluates the performance of writing nested Parquet data using Iceberg and the + * built-in file source in Spark. * - * To run this benchmark for spark-2.4: - * + *
    To run this benchmark for spark-2.4: * ./gradlew -DsparkVersions=2.4 :iceberg-spark:iceberg-spark-2.4:jmh * -PjmhIncludeRegex=IcebergSourceNestedListORCDataWriteBenchmark * -PjmhOutputPath=benchmark/iceberg-source-nested-list-orc-data-write-benchmark-result.txt * */ -public class IcebergSourceNestedListORCDataWriteBenchmark extends IcebergSourceNestedListDataBenchmark { +public class IcebergSourceNestedListORCDataWriteBenchmark + extends IcebergSourceNestedListDataBenchmark { @Setup public void setupBenchmark() { @@ -67,8 +66,12 @@ public void tearDownBenchmark() throws IOException { @Threads(1) public void writeIceberg() { String tableLocation = table().location(); - benchmarkData().write().format("iceberg").option("write-format", "orc") - .mode(SaveMode.Append).save(tableLocation); + benchmarkData() + .write() + .format("iceberg") + .option("write-format", "orc") + .mode(SaveMode.Append) + .save(tableLocation); } @Benchmark @@ -76,11 +79,17 @@ public void writeIceberg() { public void writeIcebergDictionaryOff() { Map tableProperties = Maps.newHashMap(); tableProperties.put("orc.dictionary.key.threshold", "0"); - withTableProperties(tableProperties, () -> { - String tableLocation = table().location(); - benchmarkData().write().format("iceberg").option("write-format", "orc") - .mode(SaveMode.Append).save(tableLocation); - }); + withTableProperties( + tableProperties, + () -> { + String tableLocation = table().location(); + benchmarkData() + .write() + .format("iceberg") + .option("write-format", "orc") + .mode(SaveMode.Append) + .save(tableLocation); + }); } @Benchmark @@ -90,10 +99,11 @@ public void writeFileSource() { } private Dataset benchmarkData() { - return spark().range(numRows) - .withColumn("outerlist", array_repeat(struct( - expr("array_repeat(CAST(id AS string), 1000) AS innerlist")), - 10)) + return spark() + .range(numRows) + .withColumn( + "outerlist", + array_repeat(struct(expr("array_repeat(CAST(id AS string), 1000) AS innerlist")), 10)) .coalesce(1); } } diff --git a/spark/v2.4/spark/src/jmh/java/org/apache/iceberg/spark/source/orc/IcebergSourceNestedORCDataReadBenchmark.java b/spark/v2.4/spark/src/jmh/java/org/apache/iceberg/spark/source/orc/IcebergSourceNestedORCDataReadBenchmark.java index fc70219627c5..d0fe63484f7e 100644 --- a/spark/v2.4/spark/src/jmh/java/org/apache/iceberg/spark/source/orc/IcebergSourceNestedORCDataReadBenchmark.java +++ b/spark/v2.4/spark/src/jmh/java/org/apache/iceberg/spark/source/orc/IcebergSourceNestedORCDataReadBenchmark.java @@ -16,9 +16,14 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.spark.source.orc; +import static org.apache.iceberg.TableProperties.DEFAULT_FILE_FORMAT; +import static org.apache.iceberg.TableProperties.SPLIT_OPEN_FILE_COST; +import static org.apache.spark.sql.functions.expr; +import static org.apache.spark.sql.functions.lit; +import static org.apache.spark.sql.functions.struct; + import java.io.IOException; import java.util.Map; import org.apache.iceberg.relocated.com.google.common.collect.Maps; @@ -32,19 +37,11 @@ import org.openjdk.jmh.annotations.TearDown; import org.openjdk.jmh.annotations.Threads; -import static org.apache.iceberg.TableProperties.DEFAULT_FILE_FORMAT; -import static org.apache.iceberg.TableProperties.SPLIT_OPEN_FILE_COST; -import static org.apache.spark.sql.functions.expr; -import static org.apache.spark.sql.functions.lit; -import static org.apache.spark.sql.functions.struct; - - /** - * A benchmark that evaluates the performance of reading ORC data with a flat schema - * using Iceberg and the built-in file source in Spark. + * A benchmark that evaluates the performance of reading ORC data with a flat schema using Iceberg + * and the built-in file source in Spark. * - * To run this benchmark for spark-2.4: - * + *
    To run this benchmark for spark-2.4: * ./gradlew -DsparkVersions=2.4 :iceberg-spark:iceberg-spark-2.4:jmh * -PjmhIncludeRegex=IcebergSourceNestedORCDataReadBenchmark * -PjmhOutputPath=benchmark/iceberg-source-nested-orc-data-read-benchmark-result.txt @@ -72,11 +69,13 @@ public void tearDownBenchmark() throws IOException { public void readIcebergNonVectorized() { Map tableProperties = Maps.newHashMap(); tableProperties.put(SPLIT_OPEN_FILE_COST, Integer.toString(128 * 1024 * 1024)); - withTableProperties(tableProperties, () -> { - String tableLocation = table().location(); - Dataset df = spark().read().format("iceberg").load(tableLocation); - materialize(df); - }); + withTableProperties( + tableProperties, + () -> { + String tableLocation = table().location(); + Dataset df = spark().read().format("iceberg").load(tableLocation); + materialize(df); + }); } @Benchmark @@ -84,12 +83,18 @@ public void readIcebergNonVectorized() { public void readIcebergVectorized() { Map tableProperties = Maps.newHashMap(); tableProperties.put(SPLIT_OPEN_FILE_COST, Integer.toString(128 * 1024 * 1024)); - withTableProperties(tableProperties, () -> { - String tableLocation = table().location(); - Dataset df = spark().read().option(SparkReadOptions.VECTORIZATION_ENABLED, "true") - .format("iceberg").load(tableLocation); - materialize(df); - }); + withTableProperties( + tableProperties, + () -> { + String tableLocation = table().location(); + Dataset df = + spark() + .read() + .option(SparkReadOptions.VECTORIZATION_ENABLED, "true") + .format("iceberg") + .load(tableLocation); + materialize(df); + }); } @Benchmark @@ -98,10 +103,12 @@ public void readFileSourceNonVectorized() { Map conf = Maps.newHashMap(); conf.put(SQLConf.ORC_VECTORIZED_READER_ENABLED().key(), "false"); conf.put(SQLConf.FILES_OPEN_COST_IN_BYTES().key(), Integer.toString(128 * 1024 * 1024)); - withSQLConf(conf, () -> { - Dataset df = spark().read().orc(dataLocation()); - materialize(df); - }); + withSQLConf( + conf, + () -> { + Dataset df = spark().read().orc(dataLocation()); + materialize(df); + }); } @Benchmark @@ -109,11 +116,14 @@ public void readFileSourceNonVectorized() { public void readWithProjectionIcebergNonVectorized() { Map tableProperties = Maps.newHashMap(); tableProperties.put(SPLIT_OPEN_FILE_COST, Integer.toString(128 * 1024 * 1024)); - withTableProperties(tableProperties, () -> { - String tableLocation = table().location(); - Dataset df = spark().read().format("iceberg").load(tableLocation).selectExpr("nested.col3"); - materialize(df); - }); + withTableProperties( + tableProperties, + () -> { + String tableLocation = table().location(); + Dataset df = + spark().read().format("iceberg").load(tableLocation).selectExpr("nested.col3"); + materialize(df); + }); } @Benchmark @@ -121,12 +131,19 @@ public void readWithProjectionIcebergNonVectorized() { public void readWithProjectionIcebergVectorized() { Map tableProperties = Maps.newHashMap(); tableProperties.put(SPLIT_OPEN_FILE_COST, Integer.toString(128 * 1024 * 1024)); - withTableProperties(tableProperties, () -> { - String tableLocation = table().location(); - Dataset df = spark().read().option(SparkReadOptions.VECTORIZATION_ENABLED, "true") - .format("iceberg").load(tableLocation).selectExpr("nested.col3"); - materialize(df); - }); + withTableProperties( + tableProperties, + () -> { + String tableLocation = table().location(); + Dataset df = + spark() + .read() + .option(SparkReadOptions.VECTORIZATION_ENABLED, "true") + .format("iceberg") + .load(tableLocation) + 
.selectExpr("nested.col3"); + materialize(df); + }); } @Benchmark @@ -135,27 +152,32 @@ public void readWithProjectionFileSourceNonVectorized() { Map conf = Maps.newHashMap(); conf.put(SQLConf.ORC_VECTORIZED_READER_ENABLED().key(), "false"); conf.put(SQLConf.FILES_OPEN_COST_IN_BYTES().key(), Integer.toString(128 * 1024 * 1024)); - withSQLConf(conf, () -> { - Dataset df = spark().read().orc(dataLocation()).selectExpr("nested.col3"); - materialize(df); - }); + withSQLConf( + conf, + () -> { + Dataset df = spark().read().orc(dataLocation()).selectExpr("nested.col3"); + materialize(df); + }); } private void appendData() { Map tableProperties = Maps.newHashMap(); tableProperties.put(DEFAULT_FILE_FORMAT, "orc"); - withTableProperties(tableProperties, () -> { - for (int fileNum = 0; fileNum < NUM_FILES; fileNum++) { - Dataset df = spark().range(NUM_ROWS) - .withColumn( - "nested", - struct( - expr("CAST(id AS string) AS col1"), - expr("CAST(id AS double) AS col2"), - lit(fileNum).cast("long").as("col3") - )); - appendAsFile(df); - } - }); + withTableProperties( + tableProperties, + () -> { + for (int fileNum = 0; fileNum < NUM_FILES; fileNum++) { + Dataset df = + spark() + .range(NUM_ROWS) + .withColumn( + "nested", + struct( + expr("CAST(id AS string) AS col1"), + expr("CAST(id AS double) AS col2"), + lit(fileNum).cast("long").as("col3"))); + appendAsFile(df); + } + }); } } diff --git a/spark/v2.4/spark/src/jmh/java/org/apache/iceberg/spark/source/parquet/IcebergSourceFlatParquetDataFilterBenchmark.java b/spark/v2.4/spark/src/jmh/java/org/apache/iceberg/spark/source/parquet/IcebergSourceFlatParquetDataFilterBenchmark.java index ea9dadb59d52..2642c481f8e7 100644 --- a/spark/v2.4/spark/src/jmh/java/org/apache/iceberg/spark/source/parquet/IcebergSourceFlatParquetDataFilterBenchmark.java +++ b/spark/v2.4/spark/src/jmh/java/org/apache/iceberg/spark/source/parquet/IcebergSourceFlatParquetDataFilterBenchmark.java @@ -16,9 +16,13 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.source.parquet; +import static org.apache.iceberg.TableProperties.SPLIT_OPEN_FILE_COST; +import static org.apache.spark.sql.functions.current_date; +import static org.apache.spark.sql.functions.date_add; +import static org.apache.spark.sql.functions.expr; + import java.io.IOException; import java.util.Map; import org.apache.iceberg.relocated.com.google.common.collect.Maps; @@ -31,21 +35,15 @@ import org.openjdk.jmh.annotations.TearDown; import org.openjdk.jmh.annotations.Threads; -import static org.apache.iceberg.TableProperties.SPLIT_OPEN_FILE_COST; -import static org.apache.spark.sql.functions.current_date; -import static org.apache.spark.sql.functions.date_add; -import static org.apache.spark.sql.functions.expr; - /** * A benchmark that evaluates the file skipping capabilities in the Spark data source for Iceberg. * - * This class uses a dataset with a flat schema, where the records are clustered according to the + *
    This class uses a dataset with a flat schema, where the records are clustered according to the * column used in the filter predicate. * - * The performance is compared to the built-in file source in Spark. + *
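For readers skimming the patch, a minimal sketch of the two read paths these filter benchmarks compare. The SparkSession, file locations, and filter expression are placeholders, and the benchmark itself additionally raises the table's SPLIT_OPEN_FILE_COST property to 128 MB through its withTableProperties helper, which the sketch leaves out:

    import org.apache.spark.sql.Dataset;
    import org.apache.spark.sql.Row;
    import org.apache.spark.sql.SparkSession;

    public class FilterReadPathsSketch {
      public static void main(String[] args) {
        SparkSession spark = SparkSession.builder().master("local[*]").getOrCreate();

        // Iceberg path: load the table by location and apply a benchmark-style filter.
        Dataset<Row> iceberg =
            spark.read().format("iceberg").load("/tmp/iceberg/table").filter("longCol = 0");
        iceberg.count(); // forces a full scan, similar in spirit to materialize(df)

        // Built-in file source path: read the raw Parquet files with the SQL conf keys
        // used in these hunks (PARQUET_VECTORIZED_READER_ENABLED, FILES_OPEN_COST_IN_BYTES).
        spark.conf().set("spark.sql.parquet.enableVectorizedReader", "true");
        spark.conf().set("spark.sql.files.openCostInBytes", Integer.toString(128 * 1024 * 1024));
        Dataset<Row> fileSource =
            spark.read().parquet("/tmp/parquet/data").filter("longCol = 0");
        fileSource.count();
      }
    }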
    The performance is compared to the built-in file source in Spark. * - * To run this benchmark for spark-2.4: - * + *
    To run this benchmark for spark-2.4: * ./gradlew -DsparkVersions=2.4 :iceberg-spark:iceberg-spark-2.4:jmhs * -PjmhIncludeRegex=IcebergSourceFlatParquetDataFilterBenchmark * -PjmhOutputPath=benchmark/iceberg-source-flat-parquet-data-filter-benchmark-result.txt @@ -74,11 +72,14 @@ public void tearDownBenchmark() throws IOException { public void readWithFilterIceberg() { Map tableProperties = Maps.newHashMap(); tableProperties.put(SPLIT_OPEN_FILE_COST, Integer.toString(128 * 1024 * 1024)); - withTableProperties(tableProperties, () -> { - String tableLocation = table().location(); - Dataset df = spark().read().format("iceberg").load(tableLocation).filter(FILTER_COND); - materialize(df); - }); + withTableProperties( + tableProperties, + () -> { + String tableLocation = table().location(); + Dataset df = + spark().read().format("iceberg").load(tableLocation).filter(FILTER_COND); + materialize(df); + }); } @Benchmark @@ -87,10 +88,12 @@ public void readWithFilterFileSourceVectorized() { Map conf = Maps.newHashMap(); conf.put(SQLConf.PARQUET_VECTORIZED_READER_ENABLED().key(), "true"); conf.put(SQLConf.FILES_OPEN_COST_IN_BYTES().key(), Integer.toString(128 * 1024 * 1024)); - withSQLConf(conf, () -> { - Dataset df = spark().read().parquet(dataLocation()).filter(FILTER_COND); - materialize(df); - }); + withSQLConf( + conf, + () -> { + Dataset df = spark().read().parquet(dataLocation()).filter(FILTER_COND); + materialize(df); + }); } @Benchmark @@ -99,23 +102,27 @@ public void readWithFilterFileSourceNonVectorized() { Map conf = Maps.newHashMap(); conf.put(SQLConf.PARQUET_VECTORIZED_READER_ENABLED().key(), "false"); conf.put(SQLConf.FILES_OPEN_COST_IN_BYTES().key(), Integer.toString(128 * 1024 * 1024)); - withSQLConf(conf, () -> { - Dataset df = spark().read().parquet(dataLocation()).filter(FILTER_COND); - materialize(df); - }); + withSQLConf( + conf, + () -> { + Dataset df = spark().read().parquet(dataLocation()).filter(FILTER_COND); + materialize(df); + }); } private void appendData() { for (int fileNum = 1; fileNum < NUM_FILES; fileNum++) { - Dataset df = spark().range(NUM_ROWS) - .withColumnRenamed("id", "longCol") - .withColumn("intCol", expr("CAST(longCol AS INT)")) - .withColumn("floatCol", expr("CAST(longCol AS FLOAT)")) - .withColumn("doubleCol", expr("CAST(longCol AS DOUBLE)")) - .withColumn("decimalCol", expr("CAST(longCol AS DECIMAL(20, 5))")) - .withColumn("dateCol", date_add(current_date(), fileNum)) - .withColumn("timestampCol", expr("TO_TIMESTAMP(dateCol)")) - .withColumn("stringCol", expr("CAST(dateCol AS STRING)")); + Dataset df = + spark() + .range(NUM_ROWS) + .withColumnRenamed("id", "longCol") + .withColumn("intCol", expr("CAST(longCol AS INT)")) + .withColumn("floatCol", expr("CAST(longCol AS FLOAT)")) + .withColumn("doubleCol", expr("CAST(longCol AS DOUBLE)")) + .withColumn("decimalCol", expr("CAST(longCol AS DECIMAL(20, 5))")) + .withColumn("dateCol", date_add(current_date(), fileNum)) + .withColumn("timestampCol", expr("TO_TIMESTAMP(dateCol)")) + .withColumn("stringCol", expr("CAST(dateCol AS STRING)")); appendAsFile(df); } } diff --git a/spark/v2.4/spark/src/jmh/java/org/apache/iceberg/spark/source/parquet/IcebergSourceFlatParquetDataReadBenchmark.java b/spark/v2.4/spark/src/jmh/java/org/apache/iceberg/spark/source/parquet/IcebergSourceFlatParquetDataReadBenchmark.java index 06633bda8773..484572f9a541 100644 --- a/spark/v2.4/spark/src/jmh/java/org/apache/iceberg/spark/source/parquet/IcebergSourceFlatParquetDataReadBenchmark.java +++ 
b/spark/v2.4/spark/src/jmh/java/org/apache/iceberg/spark/source/parquet/IcebergSourceFlatParquetDataReadBenchmark.java @@ -16,9 +16,13 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.source.parquet; +import static org.apache.iceberg.TableProperties.SPLIT_OPEN_FILE_COST; +import static org.apache.spark.sql.functions.current_date; +import static org.apache.spark.sql.functions.date_add; +import static org.apache.spark.sql.functions.expr; + import java.io.IOException; import java.util.Map; import org.apache.iceberg.relocated.com.google.common.collect.Maps; @@ -31,17 +35,11 @@ import org.openjdk.jmh.annotations.TearDown; import org.openjdk.jmh.annotations.Threads; -import static org.apache.iceberg.TableProperties.SPLIT_OPEN_FILE_COST; -import static org.apache.spark.sql.functions.current_date; -import static org.apache.spark.sql.functions.date_add; -import static org.apache.spark.sql.functions.expr; - /** - * A benchmark that evaluates the performance of reading Parquet data with a flat schema - * using Iceberg and the built-in file source in Spark. + * A benchmark that evaluates the performance of reading Parquet data with a flat schema using + * Iceberg and the built-in file source in Spark. * - * To run this benchmark for spark-2.4: - * + *
    To run this benchmark for spark-2.4: * ./gradlew -DsparkVersions=2.4 :iceberg-spark:iceberg-spark-2.4:jmh * -PjmhIncludeRegex=IcebergSourceFlatParquetDataReadBenchmark * -PjmhOutputPath=benchmark/iceberg-source-flat-parquet-data-read-benchmark-result.txt @@ -69,11 +67,13 @@ public void tearDownBenchmark() throws IOException { public void readIceberg() { Map tableProperties = Maps.newHashMap(); tableProperties.put(SPLIT_OPEN_FILE_COST, Integer.toString(128 * 1024 * 1024)); - withTableProperties(tableProperties, () -> { - String tableLocation = table().location(); - Dataset df = spark().read().format("iceberg").load(tableLocation); - materialize(df); - }); + withTableProperties( + tableProperties, + () -> { + String tableLocation = table().location(); + Dataset df = spark().read().format("iceberg").load(tableLocation); + materialize(df); + }); } @Benchmark @@ -82,10 +82,12 @@ public void readFileSourceVectorized() { Map conf = Maps.newHashMap(); conf.put(SQLConf.PARQUET_VECTORIZED_READER_ENABLED().key(), "true"); conf.put(SQLConf.FILES_OPEN_COST_IN_BYTES().key(), Integer.toString(128 * 1024 * 1024)); - withSQLConf(conf, () -> { - Dataset df = spark().read().parquet(dataLocation()); - materialize(df); - }); + withSQLConf( + conf, + () -> { + Dataset df = spark().read().parquet(dataLocation()); + materialize(df); + }); } @Benchmark @@ -94,10 +96,12 @@ public void readFileSourceNonVectorized() { Map conf = Maps.newHashMap(); conf.put(SQLConf.PARQUET_VECTORIZED_READER_ENABLED().key(), "false"); conf.put(SQLConf.FILES_OPEN_COST_IN_BYTES().key(), Integer.toString(128 * 1024 * 1024)); - withSQLConf(conf, () -> { - Dataset df = spark().read().parquet(dataLocation()); - materialize(df); - }); + withSQLConf( + conf, + () -> { + Dataset df = spark().read().parquet(dataLocation()); + materialize(df); + }); } @Benchmark @@ -105,11 +109,13 @@ public void readFileSourceNonVectorized() { public void readWithProjectionIceberg() { Map tableProperties = Maps.newHashMap(); tableProperties.put(SPLIT_OPEN_FILE_COST, Integer.toString(128 * 1024 * 1024)); - withTableProperties(tableProperties, () -> { - String tableLocation = table().location(); - Dataset df = spark().read().format("iceberg").load(tableLocation).select("longCol"); - materialize(df); - }); + withTableProperties( + tableProperties, + () -> { + String tableLocation = table().location(); + Dataset df = spark().read().format("iceberg").load(tableLocation).select("longCol"); + materialize(df); + }); } @Benchmark @@ -118,10 +124,12 @@ public void readWithProjectionFileSourceVectorized() { Map conf = Maps.newHashMap(); conf.put(SQLConf.PARQUET_VECTORIZED_READER_ENABLED().key(), "true"); conf.put(SQLConf.FILES_OPEN_COST_IN_BYTES().key(), Integer.toString(128 * 1024 * 1024)); - withSQLConf(conf, () -> { - Dataset df = spark().read().parquet(dataLocation()).select("longCol"); - materialize(df); - }); + withSQLConf( + conf, + () -> { + Dataset df = spark().read().parquet(dataLocation()).select("longCol"); + materialize(df); + }); } @Benchmark @@ -130,23 +138,27 @@ public void readWithProjectionFileSourceNonVectorized() { Map conf = Maps.newHashMap(); conf.put(SQLConf.PARQUET_VECTORIZED_READER_ENABLED().key(), "false"); conf.put(SQLConf.FILES_OPEN_COST_IN_BYTES().key(), Integer.toString(128 * 1024 * 1024)); - withSQLConf(conf, () -> { - Dataset df = spark().read().parquet(dataLocation()).select("longCol"); - materialize(df); - }); + withSQLConf( + conf, + () -> { + Dataset df = spark().read().parquet(dataLocation()).select("longCol"); + materialize(df); + 
}); } private void appendData() { for (int fileNum = 1; fileNum <= NUM_FILES; fileNum++) { - Dataset df = spark().range(NUM_ROWS) - .withColumnRenamed("id", "longCol") - .withColumn("intCol", expr("CAST(longCol AS INT)")) - .withColumn("floatCol", expr("CAST(longCol AS FLOAT)")) - .withColumn("doubleCol", expr("CAST(longCol AS DOUBLE)")) - .withColumn("decimalCol", expr("CAST(longCol AS DECIMAL(20, 5))")) - .withColumn("dateCol", date_add(current_date(), fileNum)) - .withColumn("timestampCol", expr("TO_TIMESTAMP(dateCol)")) - .withColumn("stringCol", expr("CAST(dateCol AS STRING)")); + Dataset df = + spark() + .range(NUM_ROWS) + .withColumnRenamed("id", "longCol") + .withColumn("intCol", expr("CAST(longCol AS INT)")) + .withColumn("floatCol", expr("CAST(longCol AS FLOAT)")) + .withColumn("doubleCol", expr("CAST(longCol AS DOUBLE)")) + .withColumn("decimalCol", expr("CAST(longCol AS DECIMAL(20, 5))")) + .withColumn("dateCol", date_add(current_date(), fileNum)) + .withColumn("timestampCol", expr("TO_TIMESTAMP(dateCol)")) + .withColumn("stringCol", expr("CAST(dateCol AS STRING)")); appendAsFile(df); } } diff --git a/spark/v2.4/spark/src/jmh/java/org/apache/iceberg/spark/source/parquet/IcebergSourceFlatParquetDataWriteBenchmark.java b/spark/v2.4/spark/src/jmh/java/org/apache/iceberg/spark/source/parquet/IcebergSourceFlatParquetDataWriteBenchmark.java index e531b86a1cac..6e94e2d98750 100644 --- a/spark/v2.4/spark/src/jmh/java/org/apache/iceberg/spark/source/parquet/IcebergSourceFlatParquetDataWriteBenchmark.java +++ b/spark/v2.4/spark/src/jmh/java/org/apache/iceberg/spark/source/parquet/IcebergSourceFlatParquetDataWriteBenchmark.java @@ -16,9 +16,10 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.source.parquet; +import static org.apache.spark.sql.functions.expr; + import java.io.IOException; import java.util.Map; import org.apache.iceberg.relocated.com.google.common.collect.Maps; @@ -32,14 +33,11 @@ import org.openjdk.jmh.annotations.TearDown; import org.openjdk.jmh.annotations.Threads; -import static org.apache.spark.sql.functions.expr; - /** - * A benchmark that evaluates the performance of writing Parquet data with a flat schema - * using Iceberg and the built-in file source in Spark. + * A benchmark that evaluates the performance of writing Parquet data with a flat schema using + * Iceberg and the built-in file source in Spark. * - * To run this benchmark for spark-2.4: - * + *
    To run this benchmark for spark-2.4: * ./gradlew -DsparkVersions=2.4 :iceberg-spark:iceberg-spark-2.4:jmh * -PjmhIncludeRegex=IcebergSourceFlatParquetDataWriteBenchmark * -PjmhOutputPath=benchmark/iceberg-source-flat-parquet-data-write-benchmark-result.txt @@ -76,7 +74,8 @@ public void writeFileSource() { } private Dataset benchmarkData() { - return spark().range(NUM_ROWS) + return spark() + .range(NUM_ROWS) .withColumnRenamed("id", "longCol") .withColumn("intCol", expr("CAST(longCol AS INT)")) .withColumn("floatCol", expr("CAST(longCol AS FLOAT)")) diff --git a/spark/v2.4/spark/src/jmh/java/org/apache/iceberg/spark/source/parquet/IcebergSourceNestedListParquetDataWriteBenchmark.java b/spark/v2.4/spark/src/jmh/java/org/apache/iceberg/spark/source/parquet/IcebergSourceNestedListParquetDataWriteBenchmark.java index 016c210c7283..099be4c01b54 100644 --- a/spark/v2.4/spark/src/jmh/java/org/apache/iceberg/spark/source/parquet/IcebergSourceNestedListParquetDataWriteBenchmark.java +++ b/spark/v2.4/spark/src/jmh/java/org/apache/iceberg/spark/source/parquet/IcebergSourceNestedListParquetDataWriteBenchmark.java @@ -16,9 +16,12 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.source.parquet; +import static org.apache.spark.sql.functions.array_repeat; +import static org.apache.spark.sql.functions.expr; +import static org.apache.spark.sql.functions.struct; + import java.io.IOException; import java.util.Map; import org.apache.iceberg.relocated.com.google.common.collect.Maps; @@ -33,22 +36,18 @@ import org.openjdk.jmh.annotations.TearDown; import org.openjdk.jmh.annotations.Threads; -import static org.apache.spark.sql.functions.array_repeat; -import static org.apache.spark.sql.functions.expr; -import static org.apache.spark.sql.functions.struct; - /** - * A benchmark that evaluates the performance of writing nested Parquet data using Iceberg - * and the built-in file source in Spark. + * A benchmark that evaluates the performance of writing nested Parquet data using Iceberg and the + * built-in file source in Spark. * - * To run this benchmark for spark-2.4: - * + *
    To run this benchmark for spark-2.4: * ./gradlew -DsparkVersions=2.4 :iceberg-spark:iceberg-spark-2.4:jmh * -PjmhIncludeRegex=IcebergSourceNestedListParquetDataWriteBenchmark * -PjmhOutputPath=benchmark/iceberg-source-nested-list-parquet-data-write-benchmark-result.txt * */ -public class IcebergSourceNestedListParquetDataWriteBenchmark extends IcebergSourceNestedListDataBenchmark { +public class IcebergSourceNestedListParquetDataWriteBenchmark + extends IcebergSourceNestedListDataBenchmark { @Setup public void setupBenchmark() { @@ -80,10 +79,11 @@ public void writeFileSource() { } private Dataset benchmarkData() { - return spark().range(numRows) - .withColumn("outerlist", array_repeat(struct( - expr("array_repeat(CAST(id AS string), 1000) AS innerlist")), - 10)) + return spark() + .range(numRows) + .withColumn( + "outerlist", + array_repeat(struct(expr("array_repeat(CAST(id AS string), 1000) AS innerlist")), 10)) .coalesce(1); } } diff --git a/spark/v2.4/spark/src/jmh/java/org/apache/iceberg/spark/source/parquet/IcebergSourceNestedParquetDataFilterBenchmark.java b/spark/v2.4/spark/src/jmh/java/org/apache/iceberg/spark/source/parquet/IcebergSourceNestedParquetDataFilterBenchmark.java index a3530173bdfc..eeb84b8efcd9 100644 --- a/spark/v2.4/spark/src/jmh/java/org/apache/iceberg/spark/source/parquet/IcebergSourceNestedParquetDataFilterBenchmark.java +++ b/spark/v2.4/spark/src/jmh/java/org/apache/iceberg/spark/source/parquet/IcebergSourceNestedParquetDataFilterBenchmark.java @@ -16,9 +16,13 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.source.parquet; +import static org.apache.iceberg.TableProperties.SPLIT_OPEN_FILE_COST; +import static org.apache.spark.sql.functions.expr; +import static org.apache.spark.sql.functions.lit; +import static org.apache.spark.sql.functions.struct; + import java.io.IOException; import java.util.Map; import org.apache.iceberg.relocated.com.google.common.collect.Maps; @@ -31,27 +35,22 @@ import org.openjdk.jmh.annotations.TearDown; import org.openjdk.jmh.annotations.Threads; -import static org.apache.iceberg.TableProperties.SPLIT_OPEN_FILE_COST; -import static org.apache.spark.sql.functions.expr; -import static org.apache.spark.sql.functions.lit; -import static org.apache.spark.sql.functions.struct; - /** * A benchmark that evaluates the file skipping capabilities in the Spark data source for Iceberg. * - * This class uses a dataset with nested data, where the records are clustered according to the + *
    This class uses a dataset with nested data, where the records are clustered according to the * column used in the filter predicate. * - * The performance is compared to the built-in file source in Spark. + *
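For context, the nested rows these benchmarks cluster and then prune on are produced with Spark's struct/expr/lit helpers, as in the appendData hunks further down; a small sketch, assuming an existing SparkSession and treating the row count and batch number as placeholders:

    import static org.apache.spark.sql.functions.expr;
    import static org.apache.spark.sql.functions.lit;
    import static org.apache.spark.sql.functions.struct;

    import org.apache.spark.sql.Dataset;
    import org.apache.spark.sql.Row;
    import org.apache.spark.sql.SparkSession;

    final class NestedRowsSketch {
      // Rows shaped like the benchmark data: a long id plus a nested struct whose col3
      // is constant per generated batch, which is the column the filter prunes on.
      static Dataset<Row> nestedRows(SparkSession spark, long numRows, int batchNum) {
        return spark
            .range(numRows)
            .withColumn(
                "nested",
                struct(
                    expr("CAST(id AS string) AS col1"),
                    expr("CAST(id AS double) AS col2"),
                    lit(batchNum).cast("long").as("col3")));
      }
    }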
    The performance is compared to the built-in file source in Spark. * - * To run this benchmark for spark-2.4: - * + *
    To run this benchmark for spark-2.4: * ./gradlew -DsparkVersions=2.4 :iceberg-spark:iceberg-spark-2.4:jmh * -PjmhIncludeRegex=IcebergSourceNestedParquetDataFilterBenchmark * -PjmhOutputPath=benchmark/iceberg-source-nested-parquet-data-filter-benchmark-result.txt * */ -public class IcebergSourceNestedParquetDataFilterBenchmark extends IcebergSourceNestedDataBenchmark { +public class IcebergSourceNestedParquetDataFilterBenchmark + extends IcebergSourceNestedDataBenchmark { private static final String FILTER_COND = "nested.col3 == 0"; private static final int NUM_FILES = 500; @@ -74,11 +73,14 @@ public void tearDownBenchmark() throws IOException { public void readWithFilterIceberg() { Map tableProperties = Maps.newHashMap(); tableProperties.put(SPLIT_OPEN_FILE_COST, Integer.toString(128 * 1024 * 1024)); - withTableProperties(tableProperties, () -> { - String tableLocation = table().location(); - Dataset df = spark().read().format("iceberg").load(tableLocation).filter(FILTER_COND); - materialize(df); - }); + withTableProperties( + tableProperties, + () -> { + String tableLocation = table().location(); + Dataset df = + spark().read().format("iceberg").load(tableLocation).filter(FILTER_COND); + materialize(df); + }); } @Benchmark @@ -87,10 +89,12 @@ public void readWithFilterFileSourceVectorized() { Map conf = Maps.newHashMap(); conf.put(SQLConf.PARQUET_VECTORIZED_READER_ENABLED().key(), "true"); conf.put(SQLConf.FILES_OPEN_COST_IN_BYTES().key(), Integer.toString(128 * 1024 * 1024)); - withSQLConf(conf, () -> { - Dataset df = spark().read().parquet(dataLocation()).filter(FILTER_COND); - materialize(df); - }); + withSQLConf( + conf, + () -> { + Dataset df = spark().read().parquet(dataLocation()).filter(FILTER_COND); + materialize(df); + }); } @Benchmark @@ -99,22 +103,25 @@ public void readWithFilterFileSourceNonVectorized() { Map conf = Maps.newHashMap(); conf.put(SQLConf.PARQUET_VECTORIZED_READER_ENABLED().key(), "false"); conf.put(SQLConf.FILES_OPEN_COST_IN_BYTES().key(), Integer.toString(128 * 1024 * 1024)); - withSQLConf(conf, () -> { - Dataset df = spark().read().parquet(dataLocation()).filter(FILTER_COND); - materialize(df); - }); + withSQLConf( + conf, + () -> { + Dataset df = spark().read().parquet(dataLocation()).filter(FILTER_COND); + materialize(df); + }); } private void appendData() { for (int fileNum = 1; fileNum <= NUM_FILES; fileNum++) { - Dataset df = spark().range(NUM_ROWS) - .withColumn( - "nested", - struct( - expr("CAST(id AS string) AS col1"), - expr("CAST(id AS double) AS col2"), - lit(fileNum).cast("long").as("col3") - )); + Dataset df = + spark() + .range(NUM_ROWS) + .withColumn( + "nested", + struct( + expr("CAST(id AS string) AS col1"), + expr("CAST(id AS double) AS col2"), + lit(fileNum).cast("long").as("col3"))); appendAsFile(df); } } diff --git a/spark/v2.4/spark/src/jmh/java/org/apache/iceberg/spark/source/parquet/IcebergSourceNestedParquetDataReadBenchmark.java b/spark/v2.4/spark/src/jmh/java/org/apache/iceberg/spark/source/parquet/IcebergSourceNestedParquetDataReadBenchmark.java index 61f841c41ef0..c369c75321fd 100644 --- a/spark/v2.4/spark/src/jmh/java/org/apache/iceberg/spark/source/parquet/IcebergSourceNestedParquetDataReadBenchmark.java +++ b/spark/v2.4/spark/src/jmh/java/org/apache/iceberg/spark/source/parquet/IcebergSourceNestedParquetDataReadBenchmark.java @@ -16,9 +16,13 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.spark.source.parquet; +import static org.apache.iceberg.TableProperties.SPLIT_OPEN_FILE_COST; +import static org.apache.spark.sql.functions.expr; +import static org.apache.spark.sql.functions.lit; +import static org.apache.spark.sql.functions.struct; + import java.io.IOException; import java.util.Map; import org.apache.iceberg.relocated.com.google.common.collect.Maps; @@ -31,17 +35,11 @@ import org.openjdk.jmh.annotations.TearDown; import org.openjdk.jmh.annotations.Threads; -import static org.apache.iceberg.TableProperties.SPLIT_OPEN_FILE_COST; -import static org.apache.spark.sql.functions.expr; -import static org.apache.spark.sql.functions.lit; -import static org.apache.spark.sql.functions.struct; - /** - * A benchmark that evaluates the performance of reading nested Parquet data using Iceberg - * and the built-in file source in Spark. + * A benchmark that evaluates the performance of reading nested Parquet data using Iceberg and the + * built-in file source in Spark. * - * To run this benchmark for spark-2.4: - * + *
    To run this benchmark for spark-2.4: * ./gradlew -DsparkVersions=2.4 :iceberg-spark:iceberg-spark-2.4:jmh * -PjmhIncludeRegex=IcebergSourceNestedParquetDataReadBenchmark * -PjmhOutputPath=benchmark/iceberg-source-nested-parquet-data-read-benchmark-result.txt @@ -69,11 +67,13 @@ public void tearDownBenchmark() throws IOException { public void readIceberg() { Map tableProperties = Maps.newHashMap(); tableProperties.put(SPLIT_OPEN_FILE_COST, Integer.toString(128 * 1024 * 1024)); - withTableProperties(tableProperties, () -> { - String tableLocation = table().location(); - Dataset df = spark().read().format("iceberg").load(tableLocation); - materialize(df); - }); + withTableProperties( + tableProperties, + () -> { + String tableLocation = table().location(); + Dataset df = spark().read().format("iceberg").load(tableLocation); + materialize(df); + }); } @Benchmark @@ -82,10 +82,12 @@ public void readFileSourceVectorized() { Map conf = Maps.newHashMap(); conf.put(SQLConf.PARQUET_VECTORIZED_READER_ENABLED().key(), "true"); conf.put(SQLConf.FILES_OPEN_COST_IN_BYTES().key(), Integer.toString(128 * 1024 * 1024)); - withSQLConf(conf, () -> { - Dataset df = spark().read().parquet(dataLocation()); - materialize(df); - }); + withSQLConf( + conf, + () -> { + Dataset df = spark().read().parquet(dataLocation()); + materialize(df); + }); } @Benchmark @@ -94,10 +96,12 @@ public void readFileSourceNonVectorized() { Map conf = Maps.newHashMap(); conf.put(SQLConf.PARQUET_VECTORIZED_READER_ENABLED().key(), "false"); conf.put(SQLConf.FILES_OPEN_COST_IN_BYTES().key(), Integer.toString(128 * 1024 * 1024)); - withSQLConf(conf, () -> { - Dataset df = spark().read().parquet(dataLocation()); - materialize(df); - }); + withSQLConf( + conf, + () -> { + Dataset df = spark().read().parquet(dataLocation()); + materialize(df); + }); } @Benchmark @@ -105,11 +109,14 @@ public void readFileSourceNonVectorized() { public void readWithProjectionIceberg() { Map tableProperties = Maps.newHashMap(); tableProperties.put(SPLIT_OPEN_FILE_COST, Integer.toString(128 * 1024 * 1024)); - withTableProperties(tableProperties, () -> { - String tableLocation = table().location(); - Dataset df = spark().read().format("iceberg").load(tableLocation).selectExpr("nested.col3"); - materialize(df); - }); + withTableProperties( + tableProperties, + () -> { + String tableLocation = table().location(); + Dataset df = + spark().read().format("iceberg").load(tableLocation).selectExpr("nested.col3"); + materialize(df); + }); } @Benchmark @@ -119,10 +126,12 @@ public void readWithProjectionFileSourceVectorized() { conf.put(SQLConf.PARQUET_VECTORIZED_READER_ENABLED().key(), "true"); conf.put(SQLConf.FILES_OPEN_COST_IN_BYTES().key(), Integer.toString(128 * 1024 * 1024)); conf.put(SQLConf.NESTED_SCHEMA_PRUNING_ENABLED().key(), "true"); - withSQLConf(conf, () -> { - Dataset df = spark().read().parquet(dataLocation()).selectExpr("nested.col3"); - materialize(df); - }); + withSQLConf( + conf, + () -> { + Dataset df = spark().read().parquet(dataLocation()).selectExpr("nested.col3"); + materialize(df); + }); } @Benchmark @@ -132,22 +141,25 @@ public void readWithProjectionFileSourceNonVectorized() { conf.put(SQLConf.PARQUET_VECTORIZED_READER_ENABLED().key(), "false"); conf.put(SQLConf.FILES_OPEN_COST_IN_BYTES().key(), Integer.toString(128 * 1024 * 1024)); conf.put(SQLConf.NESTED_SCHEMA_PRUNING_ENABLED().key(), "true"); - withSQLConf(conf, () -> { - Dataset df = spark().read().parquet(dataLocation()).selectExpr("nested.col3"); - materialize(df); - }); + 
withSQLConf( + conf, + () -> { + Dataset df = spark().read().parquet(dataLocation()).selectExpr("nested.col3"); + materialize(df); + }); } private void appendData() { for (int fileNum = 0; fileNum < NUM_FILES; fileNum++) { - Dataset df = spark().range(NUM_ROWS) - .withColumn( - "nested", - struct( - expr("CAST(id AS string) AS col1"), - expr("CAST(id AS double) AS col2"), - lit(fileNum).cast("long").as("col3") - )); + Dataset df = + spark() + .range(NUM_ROWS) + .withColumn( + "nested", + struct( + expr("CAST(id AS string) AS col1"), + expr("CAST(id AS double) AS col2"), + lit(fileNum).cast("long").as("col3"))); appendAsFile(df); } } diff --git a/spark/v2.4/spark/src/jmh/java/org/apache/iceberg/spark/source/parquet/IcebergSourceNestedParquetDataWriteBenchmark.java b/spark/v2.4/spark/src/jmh/java/org/apache/iceberg/spark/source/parquet/IcebergSourceNestedParquetDataWriteBenchmark.java index e70e04f73fdf..450ecb709092 100644 --- a/spark/v2.4/spark/src/jmh/java/org/apache/iceberg/spark/source/parquet/IcebergSourceNestedParquetDataWriteBenchmark.java +++ b/spark/v2.4/spark/src/jmh/java/org/apache/iceberg/spark/source/parquet/IcebergSourceNestedParquetDataWriteBenchmark.java @@ -16,9 +16,11 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.source.parquet; +import static org.apache.spark.sql.functions.expr; +import static org.apache.spark.sql.functions.struct; + import java.io.IOException; import java.util.Map; import org.apache.iceberg.relocated.com.google.common.collect.Maps; @@ -32,15 +34,11 @@ import org.openjdk.jmh.annotations.TearDown; import org.openjdk.jmh.annotations.Threads; -import static org.apache.spark.sql.functions.expr; -import static org.apache.spark.sql.functions.struct; - /** - * A benchmark that evaluates the performance of writing nested Parquet data using Iceberg - * and the built-in file source in Spark. + * A benchmark that evaluates the performance of writing nested Parquet data using Iceberg and the + * built-in file source in Spark. * - * To run this benchmark for spark-2.4: - * + *
    To run this benchmark for spark-2.4: * ./gradlew -DsparkVersions=2.4 :iceberg-spark:iceberg-spark-2.4:jmh * -PjmhIncludeRegex=IcebergSourceNestedParquetDataWriteBenchmark * -PjmhOutputPath=benchmark/iceberg-source-nested-parquet-data-write-benchmark-result.txt @@ -77,14 +75,14 @@ public void writeFileSource() { } private Dataset benchmarkData() { - return spark().range(NUM_ROWS) + return spark() + .range(NUM_ROWS) .withColumn( "nested", struct( expr("CAST(id AS string) AS col1"), expr("CAST(id AS double) AS col2"), - expr("id AS col3") - )) + expr("id AS col3"))) .coalesce(1); } } diff --git a/spark/v2.4/spark/src/jmh/java/org/apache/iceberg/spark/source/parquet/ParquetWritersBenchmark.java b/spark/v2.4/spark/src/jmh/java/org/apache/iceberg/spark/source/parquet/ParquetWritersBenchmark.java index 663f4e151e58..d06409e129be 100644 --- a/spark/v2.4/spark/src/jmh/java/org/apache/iceberg/spark/source/parquet/ParquetWritersBenchmark.java +++ b/spark/v2.4/spark/src/jmh/java/org/apache/iceberg/spark/source/parquet/ParquetWritersBenchmark.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.source.parquet; import org.apache.iceberg.FileFormat; @@ -25,8 +24,7 @@ /** * A benchmark that evaluates the performance of various Iceberg writers for Parquet data. * - * To run this benchmark for spark-2.4: - * + *
    To run this benchmark for spark-2.4: * ./gradlew -DsparkVersions=2.4 :iceberg-spark:iceberg-spark-2.4:jmh * -PjmhIncludeRegex=ParquetWritersBenchmark * -PjmhOutputPath=benchmark/parquet-writers-benchmark-result.txt diff --git a/spark/v2.4/spark/src/jmh/java/org/apache/iceberg/spark/source/parquet/vectorized/VectorizedReadDictionaryEncodedFlatParquetDataBenchmark.java b/spark/v2.4/spark/src/jmh/java/org/apache/iceberg/spark/source/parquet/vectorized/VectorizedReadDictionaryEncodedFlatParquetDataBenchmark.java index eaeb64a7f6ca..63f35578d14a 100644 --- a/spark/v2.4/spark/src/jmh/java/org/apache/iceberg/spark/source/parquet/vectorized/VectorizedReadDictionaryEncodedFlatParquetDataBenchmark.java +++ b/spark/v2.4/spark/src/jmh/java/org/apache/iceberg/spark/source/parquet/vectorized/VectorizedReadDictionaryEncodedFlatParquetDataBenchmark.java @@ -16,9 +16,14 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.source.parquet.vectorized; +import static org.apache.spark.sql.functions.col; +import static org.apache.spark.sql.functions.lit; +import static org.apache.spark.sql.functions.pmod; +import static org.apache.spark.sql.functions.to_date; +import static org.apache.spark.sql.functions.to_timestamp; + import java.math.BigDecimal; import java.math.BigInteger; import java.util.Map; @@ -33,31 +38,26 @@ import org.apache.spark.sql.types.DataTypes; import org.openjdk.jmh.annotations.Setup; -import static org.apache.spark.sql.functions.col; -import static org.apache.spark.sql.functions.lit; -import static org.apache.spark.sql.functions.pmod; -import static org.apache.spark.sql.functions.to_date; -import static org.apache.spark.sql.functions.to_timestamp; - /** - * Benchmark to compare performance of reading Parquet dictionary encoded data with a flat schema using vectorized - * Iceberg read path and the built-in file source in Spark. - *
    - * To run this benchmark for spark-2.4: - * + * Benchmark to compare performance of reading Parquet dictionary encoded data with a flat schema + * using vectorized Iceberg read path and the built-in file source in Spark. + * + *
    To run this benchmark for spark-2.4: * ./gradlew -DsparkVersions=2.4 :iceberg-spark:iceberg-spark-2.4:jmh * -PjmhIncludeRegex=VectorizedReadDictionaryEncodedFlatParquetDataBenchmark * -PjmhOutputPath=benchmark/results.txt * */ -public class VectorizedReadDictionaryEncodedFlatParquetDataBenchmark extends VectorizedReadFlatParquetDataBenchmark { +public class VectorizedReadDictionaryEncodedFlatParquetDataBenchmark + extends VectorizedReadFlatParquetDataBenchmark { @Setup @Override public void setupBenchmark() { setupSpark(true); appendData(); - // Allow unsafe memory access to avoid the costly check arrow does to check if index is within bounds + // Allow unsafe memory access to avoid the costly check arrow does to check if index is within + // bounds System.setProperty("arrow.enable_unsafe_memory_access", "true"); // Disable expensive null check for every get(index) call. // Iceberg manages nullability checks itself instead of relying on arrow. @@ -83,9 +83,7 @@ void appendData() { df = withTimestampColumnDictEncoded(df); df = withStringColumnDictEncoded(df); df = df.drop("id"); - df.write().format("iceberg") - .mode(SaveMode.Append) - .save(table().location()); + df.write().format("iceberg").mode(SaveMode.Append).save(table().location()); } private static Column modColumn() { @@ -110,7 +108,6 @@ private static Dataset withIntColumnDictEncoded(Dataset df) { private static Dataset withFloatColumnDictEncoded(Dataset df) { return df.withColumn("floatCol", modColumn().cast(DataTypes.FloatType)); - } private static Dataset withDoubleColumnDictEncoded(Dataset df) { @@ -129,7 +126,8 @@ private static Dataset withDateColumnDictEncoded(Dataset df) { private static Dataset withTimestampColumnDictEncoded(Dataset df) { Column days = modColumn().cast(DataTypes.ShortType); - return df.withColumn("timestampCol", to_timestamp(date_add(to_date(lit("04/12/2019"), "MM/dd/yyyy"), days))); + return df.withColumn( + "timestampCol", to_timestamp(date_add(to_date(lit("04/12/2019"), "MM/dd/yyyy"), days))); } private static Dataset withStringColumnDictEncoded(Dataset df) { diff --git a/spark/v2.4/spark/src/jmh/java/org/apache/iceberg/spark/source/parquet/vectorized/VectorizedReadFlatParquetDataBenchmark.java b/spark/v2.4/spark/src/jmh/java/org/apache/iceberg/spark/source/parquet/vectorized/VectorizedReadFlatParquetDataBenchmark.java index 782a5cda4d19..feb6c6d5d9eb 100644 --- a/spark/v2.4/spark/src/jmh/java/org/apache/iceberg/spark/source/parquet/vectorized/VectorizedReadFlatParquetDataBenchmark.java +++ b/spark/v2.4/spark/src/jmh/java/org/apache/iceberg/spark/source/parquet/vectorized/VectorizedReadFlatParquetDataBenchmark.java @@ -16,9 +16,17 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.spark.source.parquet.vectorized; +import static org.apache.iceberg.types.Types.NestedField.optional; +import static org.apache.spark.sql.functions.col; +import static org.apache.spark.sql.functions.current_date; +import static org.apache.spark.sql.functions.date_add; +import static org.apache.spark.sql.functions.expr; +import static org.apache.spark.sql.functions.lit; +import static org.apache.spark.sql.functions.pmod; +import static org.apache.spark.sql.functions.when; + import java.io.IOException; import java.util.Map; import org.apache.hadoop.conf.Configuration; @@ -38,21 +46,11 @@ import org.openjdk.jmh.annotations.TearDown; import org.openjdk.jmh.annotations.Threads; -import static org.apache.iceberg.types.Types.NestedField.optional; -import static org.apache.spark.sql.functions.col; -import static org.apache.spark.sql.functions.current_date; -import static org.apache.spark.sql.functions.date_add; -import static org.apache.spark.sql.functions.expr; -import static org.apache.spark.sql.functions.lit; -import static org.apache.spark.sql.functions.pmod; -import static org.apache.spark.sql.functions.when; - /** - * Benchmark to compare performance of reading Parquet data with a flat schema using vectorized Iceberg read path and - * the built-in file source in Spark. - *
    - * To run this benchmark for spark-2.4: - * + * Benchmark to compare performance of reading Parquet data with a flat schema using vectorized + * Iceberg read path and the built-in file source in Spark. + * + *
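For context, the vectorized-read setup these hunks re-wrap boils down to relaxing Arrow's bounds checks and enabling Iceberg's vectorized reader. A hedged sketch follows; the table location is a placeholder and the read-option string is assumed to match SparkReadOptions.VECTORIZATION_ENABLED:

    import org.apache.spark.sql.Dataset;
    import org.apache.spark.sql.Row;
    import org.apache.spark.sql.SparkSession;

    public class VectorizedReadSketch {
      public static void main(String[] args) {
        // Matches the setupBenchmark() hunks: avoid Arrow's per-access bounds check.
        System.setProperty("arrow.enable_unsafe_memory_access", "true");

        SparkSession spark = SparkSession.builder().master("local[*]").getOrCreate();
        Dataset<Row> df =
            spark
                .read()
                .option("vectorization-enabled", "true") // SparkReadOptions.VECTORIZATION_ENABLED
                .format("iceberg")
                .load("/tmp/iceberg/table")
                .select("intCol");
        df.count(); // forces the scan, like materialize(df) in the benchmark
      }
    }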
    To run this benchmark for spark-2.4: * ./gradlew -DsparkVersions=2.4 :iceberg-spark:iceberg-spark-2.4:jmh * -PjmhIncludeRegex=VectorizedReadFlatParquetDataBenchmark * -PjmhOutputPath=benchmark/results.txt @@ -67,7 +65,8 @@ public class VectorizedReadFlatParquetDataBenchmark extends IcebergSourceBenchma public void setupBenchmark() { setupSpark(); appendData(); - // Allow unsafe memory access to avoid the costly check arrow does to check if index is within bounds + // Allow unsafe memory access to avoid the costly check arrow does to check if index is within + // bounds System.setProperty("arrow.enable_unsafe_memory_access", "true"); // Disable expensive null check for every get(index) call. // Iceberg manages nullability checks itself instead of relying on arrow. @@ -87,15 +86,16 @@ protected Configuration initHadoopConf() { @Override protected Table initTable() { - Schema schema = new Schema( - optional(1, "longCol", Types.LongType.get()), - optional(2, "intCol", Types.IntegerType.get()), - optional(3, "floatCol", Types.FloatType.get()), - optional(4, "doubleCol", Types.DoubleType.get()), - optional(5, "decimalCol", Types.DecimalType.of(20, 5)), - optional(6, "dateCol", Types.DateType.get()), - optional(7, "timestampCol", Types.TimestampType.withZone()), - optional(8, "stringCol", Types.StringType.get())); + Schema schema = + new Schema( + optional(1, "longCol", Types.LongType.get()), + optional(2, "intCol", Types.IntegerType.get()), + optional(3, "floatCol", Types.FloatType.get()), + optional(4, "doubleCol", Types.DoubleType.get()), + optional(5, "decimalCol", Types.DecimalType.of(20, 5)), + optional(6, "dateCol", Types.DateType.get()), + optional(7, "timestampCol", Types.TimestampType.withZone()), + optional(8, "stringCol", Types.StringType.get())); PartitionSpec partitionSpec = PartitionSpec.unpartitioned(); HadoopTables tables = new HadoopTables(hadoopConf()); Map properties = parquetWriteProps(); @@ -111,19 +111,20 @@ Map parquetWriteProps() { void appendData() { for (int fileNum = 1; fileNum <= NUM_FILES; fileNum++) { - Dataset df = spark().range(NUM_ROWS_PER_FILE) - .withColumn( - "longCol", - when(pmod(col("id"), lit(10)).equalTo(lit(0)), lit(null)) - .otherwise(col("id"))) - .drop("id") - .withColumn("intCol", expr("CAST(longCol AS INT)")) - .withColumn("floatCol", expr("CAST(longCol AS FLOAT)")) - .withColumn("doubleCol", expr("CAST(longCol AS DOUBLE)")) - .withColumn("decimalCol", expr("CAST(longCol AS DECIMAL(20, 5))")) - .withColumn("dateCol", date_add(current_date(), fileNum)) - .withColumn("timestampCol", expr("TO_TIMESTAMP(dateCol)")) - .withColumn("stringCol", expr("CAST(longCol AS STRING)")); + Dataset df = + spark() + .range(NUM_ROWS_PER_FILE) + .withColumn( + "longCol", + when(pmod(col("id"), lit(10)).equalTo(lit(0)), lit(null)).otherwise(col("id"))) + .drop("id") + .withColumn("intCol", expr("CAST(longCol AS INT)")) + .withColumn("floatCol", expr("CAST(longCol AS FLOAT)")) + .withColumn("doubleCol", expr("CAST(longCol AS DOUBLE)")) + .withColumn("decimalCol", expr("CAST(longCol AS DECIMAL(20, 5))")) + .withColumn("dateCol", date_add(current_date(), fileNum)) + .withColumn("timestampCol", expr("TO_TIMESTAMP(dateCol)")) + .withColumn("stringCol", expr("CAST(longCol AS STRING)")); appendAsFile(df); } } @@ -131,161 +132,189 @@ void appendData() { @Benchmark @Threads(1) public void readIntegersIcebergVectorized5k() { - withTableProperties(tablePropsWithVectorizationEnabled(5000), () -> { - String tableLocation = table().location(); - Dataset df = 
spark().read().format("iceberg") - .load(tableLocation).select("intCol"); - materialize(df); - }); + withTableProperties( + tablePropsWithVectorizationEnabled(5000), + () -> { + String tableLocation = table().location(); + Dataset df = spark().read().format("iceberg").load(tableLocation).select("intCol"); + materialize(df); + }); } @Benchmark @Threads(1) public void readIntegersSparkVectorized5k() { - withSQLConf(sparkConfWithVectorizationEnabled(5000), () -> { - Dataset df = spark().read().parquet(dataLocation()).select("intCol"); - materialize(df); - }); + withSQLConf( + sparkConfWithVectorizationEnabled(5000), + () -> { + Dataset df = spark().read().parquet(dataLocation()).select("intCol"); + materialize(df); + }); } @Benchmark @Threads(1) public void readLongsIcebergVectorized5k() { - withTableProperties(tablePropsWithVectorizationEnabled(5000), () -> { - String tableLocation = table().location(); - Dataset df = spark().read().format("iceberg") - .load(tableLocation).select("longCol"); - materialize(df); - }); + withTableProperties( + tablePropsWithVectorizationEnabled(5000), + () -> { + String tableLocation = table().location(); + Dataset df = spark().read().format("iceberg").load(tableLocation).select("longCol"); + materialize(df); + }); } @Benchmark @Threads(1) public void readLongsSparkVectorized5k() { - withSQLConf(sparkConfWithVectorizationEnabled(5000), () -> { - Dataset df = spark().read().parquet(dataLocation()).select("longCol"); - materialize(df); - }); + withSQLConf( + sparkConfWithVectorizationEnabled(5000), + () -> { + Dataset df = spark().read().parquet(dataLocation()).select("longCol"); + materialize(df); + }); } @Benchmark @Threads(1) public void readFloatsIcebergVectorized5k() { - withTableProperties(tablePropsWithVectorizationEnabled(5000), () -> { - String tableLocation = table().location(); - Dataset df = spark().read().format("iceberg") - .load(tableLocation).select("floatCol"); - materialize(df); - }); + withTableProperties( + tablePropsWithVectorizationEnabled(5000), + () -> { + String tableLocation = table().location(); + Dataset df = spark().read().format("iceberg").load(tableLocation).select("floatCol"); + materialize(df); + }); } @Benchmark @Threads(1) public void readFloatsSparkVectorized5k() { - withSQLConf(sparkConfWithVectorizationEnabled(5000), () -> { - Dataset df = spark().read().parquet(dataLocation()).select("floatCol"); - materialize(df); - }); + withSQLConf( + sparkConfWithVectorizationEnabled(5000), + () -> { + Dataset df = spark().read().parquet(dataLocation()).select("floatCol"); + materialize(df); + }); } @Benchmark @Threads(1) public void readDoublesIcebergVectorized5k() { - withTableProperties(tablePropsWithVectorizationEnabled(5000), () -> { - String tableLocation = table().location(); - Dataset df = spark().read().format("iceberg") - .load(tableLocation).select("doubleCol"); - materialize(df); - }); + withTableProperties( + tablePropsWithVectorizationEnabled(5000), + () -> { + String tableLocation = table().location(); + Dataset df = + spark().read().format("iceberg").load(tableLocation).select("doubleCol"); + materialize(df); + }); } @Benchmark @Threads(1) public void readDoublesSparkVectorized5k() { - withSQLConf(sparkConfWithVectorizationEnabled(5000), () -> { - Dataset df = spark().read().parquet(dataLocation()).select("doubleCol"); - materialize(df); - }); + withSQLConf( + sparkConfWithVectorizationEnabled(5000), + () -> { + Dataset df = spark().read().parquet(dataLocation()).select("doubleCol"); + materialize(df); + }); } @Benchmark 
@Threads(1) public void readDecimalsIcebergVectorized5k() { - withTableProperties(tablePropsWithVectorizationEnabled(5000), () -> { - String tableLocation = table().location(); - Dataset df = spark().read().format("iceberg") - .load(tableLocation).select("decimalCol"); - materialize(df); - }); + withTableProperties( + tablePropsWithVectorizationEnabled(5000), + () -> { + String tableLocation = table().location(); + Dataset df = + spark().read().format("iceberg").load(tableLocation).select("decimalCol"); + materialize(df); + }); } @Benchmark @Threads(1) public void readDecimalsSparkVectorized5k() { - withSQLConf(sparkConfWithVectorizationEnabled(5000), () -> { - Dataset df = spark().read().parquet(dataLocation()).select("decimalCol"); - materialize(df); - }); + withSQLConf( + sparkConfWithVectorizationEnabled(5000), + () -> { + Dataset df = spark().read().parquet(dataLocation()).select("decimalCol"); + materialize(df); + }); } @Benchmark @Threads(1) public void readDatesIcebergVectorized5k() { - withTableProperties(tablePropsWithVectorizationEnabled(5000), () -> { - String tableLocation = table().location(); - Dataset df = spark().read().format("iceberg") - .load(tableLocation).select("dateCol"); - materialize(df); - }); + withTableProperties( + tablePropsWithVectorizationEnabled(5000), + () -> { + String tableLocation = table().location(); + Dataset df = spark().read().format("iceberg").load(tableLocation).select("dateCol"); + materialize(df); + }); } @Benchmark @Threads(1) public void readDatesSparkVectorized5k() { - withSQLConf(sparkConfWithVectorizationEnabled(5000), () -> { - Dataset df = spark().read().parquet(dataLocation()).select("dateCol"); - materialize(df); - }); + withSQLConf( + sparkConfWithVectorizationEnabled(5000), + () -> { + Dataset df = spark().read().parquet(dataLocation()).select("dateCol"); + materialize(df); + }); } @Benchmark @Threads(1) public void readTimestampsIcebergVectorized5k() { - withTableProperties(tablePropsWithVectorizationEnabled(5000), () -> { - String tableLocation = table().location(); - Dataset df = spark().read().format("iceberg") - .load(tableLocation).select("timestampCol"); - materialize(df); - }); + withTableProperties( + tablePropsWithVectorizationEnabled(5000), + () -> { + String tableLocation = table().location(); + Dataset df = + spark().read().format("iceberg").load(tableLocation).select("timestampCol"); + materialize(df); + }); } @Benchmark @Threads(1) public void readTimestampsSparkVectorized5k() { - withSQLConf(sparkConfWithVectorizationEnabled(5000), () -> { - Dataset df = spark().read().parquet(dataLocation()).select("timestampCol"); - materialize(df); - }); + withSQLConf( + sparkConfWithVectorizationEnabled(5000), + () -> { + Dataset df = spark().read().parquet(dataLocation()).select("timestampCol"); + materialize(df); + }); } @Benchmark @Threads(1) public void readStringsIcebergVectorized5k() { - withTableProperties(tablePropsWithVectorizationEnabled(5000), () -> { - String tableLocation = table().location(); - Dataset df = spark().read().format("iceberg") - .load(tableLocation).select("stringCol"); - materialize(df); - }); + withTableProperties( + tablePropsWithVectorizationEnabled(5000), + () -> { + String tableLocation = table().location(); + Dataset df = + spark().read().format("iceberg").load(tableLocation).select("stringCol"); + materialize(df); + }); } @Benchmark @Threads(1) public void readStringsSparkVectorized5k() { - withSQLConf(sparkConfWithVectorizationEnabled(5000), () -> { - Dataset df = 
spark().read().parquet(dataLocation()).select("stringCol"); - materialize(df); - }); + withSQLConf( + sparkConfWithVectorizationEnabled(5000), + () -> { + Dataset df = spark().read().parquet(dataLocation()).select("stringCol"); + materialize(df); + }); } private static Map tablePropsWithVectorizationEnabled(int batchSize) { diff --git a/spark/v2.4/spark/src/main/java/org/apache/iceberg/actions/Actions.java b/spark/v2.4/spark/src/main/java/org/apache/iceberg/actions/Actions.java index 73fd3f33c9e2..ee55bbf766a5 100644 --- a/spark/v2.4/spark/src/main/java/org/apache/iceberg/actions/Actions.java +++ b/spark/v2.4/spark/src/main/java/org/apache/iceberg/actions/Actions.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.actions; import org.apache.iceberg.Table; @@ -26,8 +25,9 @@ /** * An API for interacting with actions in Spark. * - * @deprecated since 0.12.0, used for supporting {@link RewriteDataFilesAction} in Spark 2.4 for backward compatibility. - * This implementation is no longer maintained, the new implementation is available with Spark 3.x + * @deprecated since 0.12.0, used for supporting {@link RewriteDataFilesAction} in Spark 2.4 for + * backward compatibility. This implementation is no longer maintained, the new implementation + * is available with Spark 3.x */ @Deprecated public class Actions { @@ -46,9 +46,12 @@ private static DynConstructors.Ctor actionConstructor() { String className = implClass(); try { implConstructor = - DynConstructors.builder().hiddenImpl(className, SparkSession.class, Table.class).buildChecked(); + DynConstructors.builder() + .hiddenImpl(className, SparkSession.class, Table.class) + .buildChecked(); } catch (NoSuchMethodException e) { - throw new IllegalArgumentException("Cannot find appropriate Actions implementation on the classpath.", e); + throw new IllegalArgumentException( + "Cannot find appropriate Actions implementation on the classpath.", e); } } return implConstructor; @@ -63,9 +66,9 @@ protected Actions(SparkSession spark, Table table) { } /** - * @deprecated since 0.12.0, used for supporting {@link RewriteDataFilesAction} - * in Spark 2.4 for backward compatibility. - * This implementation is no longer maintained, the new implementation is available with Spark 3.x + * @deprecated since 0.12.0, used for supporting {@link RewriteDataFilesAction} in Spark 2.4 for + * backward compatibility. This implementation is no longer maintained, the new implementation + * is available with Spark 3.x */ @Deprecated public static Actions forTable(SparkSession spark, Table table) { @@ -73,9 +76,9 @@ public static Actions forTable(SparkSession spark, Table table) { } /** - * @deprecated since 0.12.0, used for supporting {@link RewriteDataFilesAction} - * in Spark 2.4 for backward compatibility. - * This implementation is no longer maintained, the new implementation is available with Spark 3.x + * @deprecated since 0.12.0, used for supporting {@link RewriteDataFilesAction} in Spark 2.4 for + * backward compatibility. This implementation is no longer maintained, the new implementation + * is available with Spark 3.x */ @Deprecated public static Actions forTable(Table table) { @@ -83,9 +86,9 @@ public static Actions forTable(Table table) { } /** - * @deprecated since 0.12.0, used for supporting {@link RewriteDataFilesAction} - * in Spark 2.4 for backward compatibility. 
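For readers unfamiliar with this deprecated Spark 2.4 entry point, its typical use is sketched below, assuming an existing SparkSession and Table and the action's default options:

    import org.apache.iceberg.Table;
    import org.apache.iceberg.actions.Actions;
    import org.apache.spark.sql.SparkSession;

    final class RewriteSketch {
      // Compact data files of the given table using the deprecated 2.4 Actions facade.
      static void rewrite(SparkSession spark, Table table) {
        Actions.forTable(spark, table)
            .rewriteDataFiles()
            .execute();
      }
    }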
- * This implementation is no longer maintained, the new implementation is available with Spark 3.x + * @deprecated since 0.12.0, used for supporting {@link RewriteDataFilesAction} in Spark 2.4 for + * backward compatibility. This implementation is no longer maintained, the new implementation + * is available with Spark 3.x */ @Deprecated public RewriteDataFilesAction rewriteDataFiles() { @@ -99,5 +102,4 @@ protected SparkSession spark() { protected Table table() { return table; } - } diff --git a/spark/v2.4/spark/src/main/java/org/apache/iceberg/actions/RewriteDataFilesAction.java b/spark/v2.4/spark/src/main/java/org/apache/iceberg/actions/RewriteDataFilesAction.java index 0418a9cb10ff..bbbb57adf569 100644 --- a/spark/v2.4/spark/src/main/java/org/apache/iceberg/actions/RewriteDataFilesAction.java +++ b/spark/v2.4/spark/src/main/java/org/apache/iceberg/actions/RewriteDataFilesAction.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.actions; import java.util.List; @@ -33,12 +32,11 @@ import org.apache.spark.sql.SparkSession; /** - * @deprecated since 0.12.0, keeping this in Spark 2.4 for backward compatibility. - * This implementation is no longer maintained, the new implementation is available with Spark 3.x + * @deprecated since 0.12.0, keeping this in Spark 2.4 for backward compatibility. This + * implementation is no longer maintained, the new implementation is available with Spark 3.x */ @Deprecated -public class RewriteDataFilesAction - extends BaseRewriteDataFilesAction { +public class RewriteDataFilesAction extends BaseRewriteDataFilesAction { private final JavaSparkContext sparkContext; private FileIO fileIO; @@ -63,7 +61,8 @@ protected FileIO fileIO() { @Override protected List rewriteDataForTasks(List combinedScanTasks) { - JavaRDD taskRDD = sparkContext.parallelize(combinedScanTasks, combinedScanTasks.size()); + JavaRDD taskRDD = + sparkContext.parallelize(combinedScanTasks, combinedScanTasks.size()); Broadcast
  • tableBroadcast = sparkContext.broadcast(SerializableTable.copyOf(table())); RowDataRewriter rowDataRewriter = new RowDataRewriter(tableBroadcast, spec(), caseSensitive()); return rowDataRewriter.rewriteDataForTasks(taskRDD); diff --git a/spark/v2.4/spark/src/main/java/org/apache/iceberg/actions/SparkActions.java b/spark/v2.4/spark/src/main/java/org/apache/iceberg/actions/SparkActions.java index 97ec74b7eb8d..9985c26214db 100644 --- a/spark/v2.4/spark/src/main/java/org/apache/iceberg/actions/SparkActions.java +++ b/spark/v2.4/spark/src/main/java/org/apache/iceberg/actions/SparkActions.java @@ -16,15 +16,15 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.actions; import org.apache.iceberg.Table; import org.apache.spark.sql.SparkSession; /** - * @deprecated since 0.12.0, used for supporting {@link RewriteDataFilesAction} in Spark 2.4 for backward compatibility. - * This implementation is no longer maintained, the new implementation is available with Spark 3.x + * @deprecated since 0.12.0, used for supporting {@link RewriteDataFilesAction} in Spark 2.4 for + * backward compatibility. This implementation is no longer maintained, the new implementation + * is available with Spark 3.x */ @Deprecated class SparkActions extends Actions { diff --git a/spark/v2.4/spark/src/main/java/org/apache/iceberg/spark/IcebergSpark.java b/spark/v2.4/spark/src/main/java/org/apache/iceberg/spark/IcebergSpark.java index 862626d0cd6d..87de0a98b934 100644 --- a/spark/v2.4/spark/src/main/java/org/apache/iceberg/spark/IcebergSpark.java +++ b/spark/v2.4/spark/src/main/java/org/apache/iceberg/spark/IcebergSpark.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark; import org.apache.iceberg.transforms.Transform; @@ -27,14 +26,18 @@ import org.apache.spark.sql.types.DataTypes; public class IcebergSpark { - private IcebergSpark() { - } + private IcebergSpark() {} - public static void registerBucketUDF(SparkSession session, String funcName, DataType sourceType, int numBuckets) { + public static void registerBucketUDF( + SparkSession session, String funcName, DataType sourceType, int numBuckets) { SparkTypeToType typeConverter = new SparkTypeToType(); Type sourceIcebergType = typeConverter.atomic(sourceType); Transform bucket = Transforms.bucket(sourceIcebergType, numBuckets); - session.udf().register(funcName, - value -> bucket.apply(SparkValueConverter.convert(sourceIcebergType, value)), DataTypes.IntegerType); + session + .udf() + .register( + funcName, + value -> bucket.apply(SparkValueConverter.convert(sourceIcebergType, value)), + DataTypes.IntegerType); } } diff --git a/spark/v2.4/spark/src/main/java/org/apache/iceberg/spark/JobGroupInfo.java b/spark/v2.4/spark/src/main/java/org/apache/iceberg/spark/JobGroupInfo.java index a35808fd8ce6..c0756d924e2f 100644 --- a/spark/v2.4/spark/src/main/java/org/apache/iceberg/spark/JobGroupInfo.java +++ b/spark/v2.4/spark/src/main/java/org/apache/iceberg/spark/JobGroupInfo.java @@ -16,13 +16,9 @@ * specific language governing permissions and limitations * under the License. 
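For context, the registerBucketUDF helper reformatted above is typically used as in this sketch; the function name, bucket count, and query are illustrative only:

    import org.apache.iceberg.spark.IcebergSpark;
    import org.apache.spark.sql.SparkSession;
    import org.apache.spark.sql.types.DataTypes;

    final class BucketUdfSketch {
      // Registers a 16-bucket transform for LONG values and calls it from SQL.
      static void register(SparkSession spark) {
        IcebergSpark.registerBucketUDF(spark, "iceberg_bucket16", DataTypes.LongType, 16);
        spark.sql("SELECT id, iceberg_bucket16(id) AS bucket FROM source_view").show();
      }
    }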
*/ - package org.apache.iceberg.spark; -/** - * Captures information about the current job - * which is used for displaying on the UI - */ +/** Captures information about the current job which is used for displaying on the UI */ public class JobGroupInfo { private String groupId; private String description; diff --git a/spark/v2.4/spark/src/main/java/org/apache/iceberg/spark/JobGroupUtils.java b/spark/v2.4/spark/src/main/java/org/apache/iceberg/spark/JobGroupUtils.java index 155dce707701..dc8ba69d40a8 100644 --- a/spark/v2.4/spark/src/main/java/org/apache/iceberg/spark/JobGroupUtils.java +++ b/spark/v2.4/spark/src/main/java/org/apache/iceberg/spark/JobGroupUtils.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark; import org.apache.spark.SparkContext; @@ -26,10 +25,10 @@ public class JobGroupUtils { private static final String JOB_GROUP_ID = SparkContext$.MODULE$.SPARK_JOB_GROUP_ID(); private static final String JOB_GROUP_DESC = SparkContext$.MODULE$.SPARK_JOB_DESCRIPTION(); - private static final String JOB_INTERRUPT_ON_CANCEL = SparkContext$.MODULE$.SPARK_JOB_INTERRUPT_ON_CANCEL(); + private static final String JOB_INTERRUPT_ON_CANCEL = + SparkContext$.MODULE$.SPARK_JOB_INTERRUPT_ON_CANCEL(); - private JobGroupUtils() { - } + private JobGroupUtils() {} public static JobGroupInfo getJobGroupInfo(SparkContext sparkContext) { String groupId = sparkContext.getLocalProperty(JOB_GROUP_ID); @@ -41,6 +40,7 @@ public static JobGroupInfo getJobGroupInfo(SparkContext sparkContext) { public static void setJobGroupInfo(SparkContext sparkContext, JobGroupInfo info) { sparkContext.setLocalProperty(JOB_GROUP_ID, info.groupId()); sparkContext.setLocalProperty(JOB_GROUP_DESC, info.description()); - sparkContext.setLocalProperty(JOB_INTERRUPT_ON_CANCEL, String.valueOf(info.interruptOnCancel())); + sparkContext.setLocalProperty( + JOB_INTERRUPT_ON_CANCEL, String.valueOf(info.interruptOnCancel())); } } diff --git a/spark/v2.4/spark/src/main/java/org/apache/iceberg/spark/PruneColumnsWithReordering.java b/spark/v2.4/spark/src/main/java/org/apache/iceberg/spark/PruneColumnsWithReordering.java index 3bdf984ed219..3c111d3b44cb 100644 --- a/spark/v2.4/spark/src/main/java/org/apache/iceberg/spark/PruneColumnsWithReordering.java +++ b/spark/v2.4/spark/src/main/java/org/apache/iceberg/spark/PruneColumnsWithReordering.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark; import java.util.List; @@ -70,7 +69,8 @@ public Type schema(Schema schema, Supplier structResult) { @Override public Type struct(Types.StructType struct, Iterable fieldResults) { - Preconditions.checkNotNull(struct, "Cannot prune null struct. Pruning must start with a schema."); + Preconditions.checkNotNull( + struct, "Cannot prune null struct. 
Pruning must start with a schema."); Preconditions.checkArgument(current instanceof StructType, "Not a struct: %s", current); StructType requestedStruct = (StructType) current; @@ -92,13 +92,13 @@ public Type struct(Types.StructType struct, Iterable fieldResults) { } else if (field.isOptional()) { changed = true; - projectedFields.put(field.name(), - Types.NestedField.optional(field.fieldId(), field.name(), type)); + projectedFields.put( + field.name(), Types.NestedField.optional(field.fieldId(), field.name(), type)); } else { changed = true; - projectedFields.put(field.name(), - Types.NestedField.required(field.fieldId(), field.name(), type)); + projectedFields.put( + field.name(), Types.NestedField.required(field.fieldId(), field.name(), type)); } } @@ -145,8 +145,10 @@ public Type field(Types.NestedField field, Supplier fieldResult) { int fieldIndex = requestedStruct.fieldIndex(field.name()); StructField requestedField = requestedStruct.fields()[fieldIndex]; - Preconditions.checkArgument(requestedField.nullable() || field.isRequired(), - "Cannot project an optional field as non-null: %s", field.name()); + Preconditions.checkArgument( + requestedField.nullable() || field.isRequired(), + "Cannot project an optional field as non-null: %s", + field.name()); this.current = requestedField.dataType(); try { @@ -164,8 +166,10 @@ public Type list(Types.ListType list, Supplier elementResult) { Preconditions.checkArgument(current instanceof ArrayType, "Not an array: %s", current); ArrayType requestedArray = (ArrayType) current; - Preconditions.checkArgument(requestedArray.containsNull() || !list.isElementOptional(), - "Cannot project an array of optional elements as required elements: %s", requestedArray); + Preconditions.checkArgument( + requestedArray.containsNull() || !list.isElementOptional(), + "Cannot project an array of optional elements as required elements: %s", + requestedArray); this.current = requestedArray.elementType(); try { @@ -190,10 +194,14 @@ public Type map(Types.MapType map, Supplier keyResult, Supplier valu Preconditions.checkArgument(current instanceof MapType, "Not a map: %s", current); MapType requestedMap = (MapType) current; - Preconditions.checkArgument(requestedMap.valueContainsNull() || !map.isValueOptional(), - "Cannot project a map of optional values as required values: %s", map); - Preconditions.checkArgument(StringType.class.isInstance(requestedMap.keyType()), - "Invalid map key type (not string): %s", requestedMap.keyType()); + Preconditions.checkArgument( + requestedMap.valueContainsNull() || !map.isValueOptional(), + "Cannot project a map of optional values as required values: %s", + map); + Preconditions.checkArgument( + StringType.class.isInstance(requestedMap.keyType()), + "Invalid map key type (not string): %s", + requestedMap.keyType()); this.current = requestedMap.valueType(); try { @@ -215,23 +223,32 @@ public Type map(Types.MapType map, Supplier keyResult, Supplier valu @Override public Type primitive(Type.PrimitiveType primitive) { Class expectedType = TYPES.get(primitive.typeId()); - Preconditions.checkArgument(expectedType != null && expectedType.isInstance(current), - "Cannot project %s to incompatible type: %s", primitive, current); + Preconditions.checkArgument( + expectedType != null && expectedType.isInstance(current), + "Cannot project %s to incompatible type: %s", + primitive, + current); // additional checks based on type switch (primitive.typeId()) { case DECIMAL: Types.DecimalType decimal = (Types.DecimalType) primitive; DecimalType 
requestedDecimal = (DecimalType) current; - Preconditions.checkArgument(requestedDecimal.scale() == decimal.scale(), - "Cannot project decimal with incompatible scale: %s != %s", requestedDecimal.scale(), decimal.scale()); - Preconditions.checkArgument(requestedDecimal.precision() >= decimal.precision(), + Preconditions.checkArgument( + requestedDecimal.scale() == decimal.scale(), + "Cannot project decimal with incompatible scale: %s != %s", + requestedDecimal.scale(), + decimal.scale()); + Preconditions.checkArgument( + requestedDecimal.precision() >= decimal.precision(), "Cannot project decimal with incompatible precision: %s < %s", - requestedDecimal.precision(), decimal.precision()); + requestedDecimal.precision(), + decimal.precision()); break; case TIMESTAMP: Types.TimestampType timestamp = (Types.TimestampType) primitive; - Preconditions.checkArgument(timestamp.shouldAdjustToUTC(), + Preconditions.checkArgument( + timestamp.shouldAdjustToUTC(), "Cannot project timestamp (without time zone) as timestamptz (with time zone)"); break; default: @@ -240,19 +257,19 @@ public Type primitive(Type.PrimitiveType primitive) { return primitive; } - private static final ImmutableMap> TYPES = ImmutableMap - .>builder() - .put(TypeID.BOOLEAN, BooleanType.class) - .put(TypeID.INTEGER, IntegerType.class) - .put(TypeID.LONG, LongType.class) - .put(TypeID.FLOAT, FloatType.class) - .put(TypeID.DOUBLE, DoubleType.class) - .put(TypeID.DATE, DateType.class) - .put(TypeID.TIMESTAMP, TimestampType.class) - .put(TypeID.DECIMAL, DecimalType.class) - .put(TypeID.UUID, StringType.class) - .put(TypeID.STRING, StringType.class) - .put(TypeID.FIXED, BinaryType.class) - .put(TypeID.BINARY, BinaryType.class) - .build(); + private static final ImmutableMap> TYPES = + ImmutableMap.>builder() + .put(TypeID.BOOLEAN, BooleanType.class) + .put(TypeID.INTEGER, IntegerType.class) + .put(TypeID.LONG, LongType.class) + .put(TypeID.FLOAT, FloatType.class) + .put(TypeID.DOUBLE, DoubleType.class) + .put(TypeID.DATE, DateType.class) + .put(TypeID.TIMESTAMP, TimestampType.class) + .put(TypeID.DECIMAL, DecimalType.class) + .put(TypeID.UUID, StringType.class) + .put(TypeID.STRING, StringType.class) + .put(TypeID.FIXED, BinaryType.class) + .put(TypeID.BINARY, BinaryType.class) + .build(); } diff --git a/spark/v2.4/spark/src/main/java/org/apache/iceberg/spark/PruneColumnsWithoutReordering.java b/spark/v2.4/spark/src/main/java/org/apache/iceberg/spark/PruneColumnsWithoutReordering.java index c6984e2fe8cd..61a215b938c5 100644 --- a/spark/v2.4/spark/src/main/java/org/apache/iceberg/spark/PruneColumnsWithoutReordering.java +++ b/spark/v2.4/spark/src/main/java/org/apache/iceberg/spark/PruneColumnsWithoutReordering.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark; import java.util.List; @@ -68,7 +67,8 @@ public Type schema(Schema schema, Supplier structResult) { @Override public Type struct(Types.StructType struct, Iterable fieldResults) { - Preconditions.checkNotNull(struct, "Cannot prune null struct. Pruning must start with a schema."); + Preconditions.checkNotNull( + struct, "Cannot prune null struct. 
Pruning must start with a schema."); Preconditions.checkArgument(current instanceof StructType, "Not a struct: %s", current); List fields = struct.fields(); @@ -120,8 +120,10 @@ public Type field(Types.NestedField field, Supplier fieldResult) { int fieldIndex = requestedStruct.fieldIndex(field.name()); StructField requestedField = requestedStruct.fields()[fieldIndex]; - Preconditions.checkArgument(requestedField.nullable() || field.isRequired(), - "Cannot project an optional field as non-null: %s", field.name()); + Preconditions.checkArgument( + requestedField.nullable() || field.isRequired(), + "Cannot project an optional field as non-null: %s", + field.name()); this.current = requestedField.dataType(); try { @@ -139,8 +141,10 @@ public Type list(Types.ListType list, Supplier elementResult) { Preconditions.checkArgument(current instanceof ArrayType, "Not an array: %s", current); ArrayType requestedArray = (ArrayType) current; - Preconditions.checkArgument(requestedArray.containsNull() || !list.isElementOptional(), - "Cannot project an array of optional elements as required elements: %s", requestedArray); + Preconditions.checkArgument( + requestedArray.containsNull() || !list.isElementOptional(), + "Cannot project an array of optional elements as required elements: %s", + requestedArray); this.current = requestedArray.elementType(); try { @@ -165,8 +169,10 @@ public Type map(Types.MapType map, Supplier keyResult, Supplier valu Preconditions.checkArgument(current instanceof MapType, "Not a map: %s", current); MapType requestedMap = (MapType) current; - Preconditions.checkArgument(requestedMap.valueContainsNull() || !map.isValueOptional(), - "Cannot project a map of optional values as required values: %s", map); + Preconditions.checkArgument( + requestedMap.valueContainsNull() || !map.isValueOptional(), + "Cannot project a map of optional values as required values: %s", + map); this.current = requestedMap.valueType(); try { @@ -188,19 +194,27 @@ public Type map(Types.MapType map, Supplier keyResult, Supplier valu @Override public Type primitive(Type.PrimitiveType primitive) { Class expectedType = TYPES.get(primitive.typeId()); - Preconditions.checkArgument(expectedType != null && expectedType.isInstance(current), - "Cannot project %s to incompatible type: %s", primitive, current); + Preconditions.checkArgument( + expectedType != null && expectedType.isInstance(current), + "Cannot project %s to incompatible type: %s", + primitive, + current); // additional checks based on type switch (primitive.typeId()) { case DECIMAL: Types.DecimalType decimal = (Types.DecimalType) primitive; DecimalType requestedDecimal = (DecimalType) current; - Preconditions.checkArgument(requestedDecimal.scale() == decimal.scale(), - "Cannot project decimal with incompatible scale: %s != %s", requestedDecimal.scale(), decimal.scale()); - Preconditions.checkArgument(requestedDecimal.precision() >= decimal.precision(), + Preconditions.checkArgument( + requestedDecimal.scale() == decimal.scale(), + "Cannot project decimal with incompatible scale: %s != %s", + requestedDecimal.scale(), + decimal.scale()); + Preconditions.checkArgument( + requestedDecimal.precision() >= decimal.precision(), "Cannot project decimal with incompatible precision: %s < %s", - requestedDecimal.precision(), decimal.precision()); + requestedDecimal.precision(), + decimal.precision()); break; default: } @@ -208,19 +222,19 @@ public Type primitive(Type.PrimitiveType primitive) { return primitive; } - private static final ImmutableMap> TYPES = 
ImmutableMap - .>builder() - .put(TypeID.BOOLEAN, BooleanType.class) - .put(TypeID.INTEGER, IntegerType.class) - .put(TypeID.LONG, LongType.class) - .put(TypeID.FLOAT, FloatType.class) - .put(TypeID.DOUBLE, DoubleType.class) - .put(TypeID.DATE, DateType.class) - .put(TypeID.TIMESTAMP, TimestampType.class) - .put(TypeID.DECIMAL, DecimalType.class) - .put(TypeID.UUID, StringType.class) - .put(TypeID.STRING, StringType.class) - .put(TypeID.FIXED, BinaryType.class) - .put(TypeID.BINARY, BinaryType.class) - .build(); + private static final ImmutableMap> TYPES = + ImmutableMap.>builder() + .put(TypeID.BOOLEAN, BooleanType.class) + .put(TypeID.INTEGER, IntegerType.class) + .put(TypeID.LONG, LongType.class) + .put(TypeID.FLOAT, FloatType.class) + .put(TypeID.DOUBLE, DoubleType.class) + .put(TypeID.DATE, DateType.class) + .put(TypeID.TIMESTAMP, TimestampType.class) + .put(TypeID.DECIMAL, DecimalType.class) + .put(TypeID.UUID, StringType.class) + .put(TypeID.STRING, StringType.class) + .put(TypeID.FIXED, BinaryType.class) + .put(TypeID.BINARY, BinaryType.class) + .build(); } diff --git a/spark/v2.4/spark/src/main/java/org/apache/iceberg/spark/SparkConfParser.java b/spark/v2.4/spark/src/main/java/org/apache/iceberg/spark/SparkConfParser.java index 79051b12625e..33e5ca936800 100644 --- a/spark/v2.4/spark/src/main/java/org/apache/iceberg/spark/SparkConfParser.java +++ b/spark/v2.4/spark/src/main/java/org/apache/iceberg/spark/SparkConfParser.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark; import java.util.Locale; @@ -159,7 +158,8 @@ public ThisT tableProperty(String name) { protected T parse(Function conversion, T defaultValue) { if (optionName != null) { - // use lower case comparison as DataSourceOptions.asMap() in Spark 2 returns a lower case map + // use lower case comparison as DataSourceOptions.asMap() in Spark 2 returns a lower case + // map String optionValue = options.get(optionName.toLowerCase(Locale.ROOT)); if (optionValue != null) { return conversion.apply(optionValue); diff --git a/spark/v2.4/spark/src/main/java/org/apache/iceberg/spark/SparkDataFile.java b/spark/v2.4/spark/src/main/java/org/apache/iceberg/spark/SparkDataFile.java index a6390d39c575..87e831872472 100644 --- a/spark/v2.4/spark/src/main/java/org/apache/iceberg/spark/SparkDataFile.java +++ b/spark/v2.4/spark/src/main/java/org/apache/iceberg/spark/SparkDataFile.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark; import java.nio.ByteBuffer; @@ -62,10 +61,12 @@ public SparkDataFile(Types.StructType type, StructType sparkType) { this.wrappedPartition = new SparkStructLike(type.fieldType("partition").asStructType()); Map positions = Maps.newHashMap(); - type.fields().forEach(field -> { - String fieldName = field.name(); - positions.put(fieldName, fieldPosition(fieldName, sparkType)); - }); + type.fields() + .forEach( + field -> { + String fieldName = field.name(); + positions.put(fieldName, fieldPosition(fieldName, sparkType)); + }); filePathPosition = positions.get("file_path"); fileFormatPosition = positions.get("file_format"); @@ -139,23 +140,29 @@ public Map valueCounts() { @Override public Map nullValueCounts() { - return wrapped.isNullAt(nullValueCountsPosition) ? null : wrapped.getJavaMap(nullValueCountsPosition); + return wrapped.isNullAt(nullValueCountsPosition) + ? 
null + : wrapped.getJavaMap(nullValueCountsPosition); } @Override public Map nanValueCounts() { - return wrapped.isNullAt(nanValueCountsPosition) ? null : wrapped.getJavaMap(nanValueCountsPosition); + return wrapped.isNullAt(nanValueCountsPosition) + ? null + : wrapped.getJavaMap(nanValueCountsPosition); } @Override public Map lowerBounds() { - Map lowerBounds = wrapped.isNullAt(lowerBoundsPosition) ? null : wrapped.getJavaMap(lowerBoundsPosition); + Map lowerBounds = + wrapped.isNullAt(lowerBoundsPosition) ? null : wrapped.getJavaMap(lowerBoundsPosition); return convert(lowerBoundsType, lowerBounds); } @Override public Map upperBounds() { - Map upperBounds = wrapped.isNullAt(upperBoundsPosition) ? null : wrapped.getJavaMap(upperBoundsPosition); + Map upperBounds = + wrapped.isNullAt(upperBoundsPosition) ? null : wrapped.getJavaMap(upperBoundsPosition); return convert(upperBoundsType, upperBounds); } diff --git a/spark/v2.4/spark/src/main/java/org/apache/iceberg/spark/SparkExceptionUtil.java b/spark/v2.4/spark/src/main/java/org/apache/iceberg/spark/SparkExceptionUtil.java index 2eb53baa688e..5c6fe3e0ff96 100644 --- a/spark/v2.4/spark/src/main/java/org/apache/iceberg/spark/SparkExceptionUtil.java +++ b/spark/v2.4/spark/src/main/java/org/apache/iceberg/spark/SparkExceptionUtil.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark; import com.google.errorprone.annotations.FormatMethod; @@ -29,8 +28,7 @@ public class SparkExceptionUtil { - private SparkExceptionUtil() { - } + private SparkExceptionUtil() {} /** * Converts checked exceptions to unchecked exceptions. @@ -41,8 +39,8 @@ private SparkExceptionUtil() { * @return unchecked exception. */ @FormatMethod - public static RuntimeException toUncheckedException(final Throwable cause, final String message, - final Object... args) { + public static RuntimeException toUncheckedException( + final Throwable cause, final String message, final Object... args) { // Parameters are required to be final to help @FormatMethod do static analysis if (cause instanceof RuntimeException) { return (RuntimeException) cause; diff --git a/spark/v2.4/spark/src/main/java/org/apache/iceberg/spark/SparkFilters.java b/spark/v2.4/spark/src/main/java/org/apache/iceberg/spark/SparkFilters.java index 0703688b9773..fae12544e78a 100644 --- a/spark/v2.4/spark/src/main/java/org/apache/iceberg/spark/SparkFilters.java +++ b/spark/v2.4/spark/src/main/java/org/apache/iceberg/spark/SparkFilters.java @@ -16,9 +16,22 @@ * specific language governing permissions and limitations * under the License. 
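The SparkExceptionUtil.toUncheckedException helper above is used throughout this module to rethrow checked exceptions with a formatted message; a small, hedged sketch of the pattern (the closeOrThrow helper and its arguments are made up for illustration):

// Sketch only: rethrowing a checked exception via toUncheckedException(Throwable, String, Object...).
import java.io.Closeable;
import java.io.IOException;
import org.apache.iceberg.spark.SparkExceptionUtil;

public class UncheckedExample {
  static void closeOrThrow(Closeable closeable, String name) {
    try {
      closeable.close();
    } catch (IOException e) {
      // Returns a RuntimeException wrapping the cause (or the cause itself if it is already one).
      throw SparkExceptionUtil.toUncheckedException(e, "Unable to close resource: %s", name);
    }
  }
}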
*/ - package org.apache.iceberg.spark; +import static org.apache.iceberg.expressions.Expressions.and; +import static org.apache.iceberg.expressions.Expressions.equal; +import static org.apache.iceberg.expressions.Expressions.greaterThan; +import static org.apache.iceberg.expressions.Expressions.greaterThanOrEqual; +import static org.apache.iceberg.expressions.Expressions.in; +import static org.apache.iceberg.expressions.Expressions.isNaN; +import static org.apache.iceberg.expressions.Expressions.isNull; +import static org.apache.iceberg.expressions.Expressions.lessThan; +import static org.apache.iceberg.expressions.Expressions.lessThanOrEqual; +import static org.apache.iceberg.expressions.Expressions.not; +import static org.apache.iceberg.expressions.Expressions.notNull; +import static org.apache.iceberg.expressions.Expressions.or; +import static org.apache.iceberg.expressions.Expressions.startsWith; + import java.sql.Date; import java.sql.Timestamp; import java.util.Objects; @@ -45,40 +58,25 @@ import org.apache.spark.sql.sources.Or; import org.apache.spark.sql.sources.StringStartsWith; -import static org.apache.iceberg.expressions.Expressions.and; -import static org.apache.iceberg.expressions.Expressions.equal; -import static org.apache.iceberg.expressions.Expressions.greaterThan; -import static org.apache.iceberg.expressions.Expressions.greaterThanOrEqual; -import static org.apache.iceberg.expressions.Expressions.in; -import static org.apache.iceberg.expressions.Expressions.isNaN; -import static org.apache.iceberg.expressions.Expressions.isNull; -import static org.apache.iceberg.expressions.Expressions.lessThan; -import static org.apache.iceberg.expressions.Expressions.lessThanOrEqual; -import static org.apache.iceberg.expressions.Expressions.not; -import static org.apache.iceberg.expressions.Expressions.notNull; -import static org.apache.iceberg.expressions.Expressions.or; -import static org.apache.iceberg.expressions.Expressions.startsWith; - public class SparkFilters { - private SparkFilters() { - } - - private static final ImmutableMap, Operation> FILTERS = ImmutableMap - ., Operation>builder() - .put(EqualTo.class, Operation.EQ) - .put(EqualNullSafe.class, Operation.EQ) - .put(GreaterThan.class, Operation.GT) - .put(GreaterThanOrEqual.class, Operation.GT_EQ) - .put(LessThan.class, Operation.LT) - .put(LessThanOrEqual.class, Operation.LT_EQ) - .put(In.class, Operation.IN) - .put(IsNull.class, Operation.IS_NULL) - .put(IsNotNull.class, Operation.NOT_NULL) - .put(And.class, Operation.AND) - .put(Or.class, Operation.OR) - .put(Not.class, Operation.NOT) - .put(StringStartsWith.class, Operation.STARTS_WITH) - .build(); + private SparkFilters() {} + + private static final ImmutableMap, Operation> FILTERS = + ImmutableMap., Operation>builder() + .put(EqualTo.class, Operation.EQ) + .put(EqualNullSafe.class, Operation.EQ) + .put(GreaterThan.class, Operation.GT) + .put(GreaterThanOrEqual.class, Operation.GT_EQ) + .put(LessThan.class, Operation.LT) + .put(LessThanOrEqual.class, Operation.LT_EQ) + .put(In.class, Operation.IN) + .put(IsNull.class, Operation.IS_NULL) + .put(IsNotNull.class, Operation.NOT_NULL) + .put(And.class, Operation.AND) + .put(Or.class, Operation.OR) + .put(Not.class, Operation.NOT) + .put(StringStartsWith.class, Operation.STARTS_WITH) + .build(); public static Expression convert(Filter filter) { // avoid using a chain of if instanceof statements by mapping to the expression enum. 
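Following the FILTERS mapping above, a hedged sketch of converting a Spark pushdown filter tree into an Iceberg Expression; the column names category and id are invented for the example:

// Sketch only: SparkFilters.convert(Filter) maps Spark source filters onto Iceberg expressions
// and returns null for filters it cannot translate, so callers must handle that case.
import org.apache.iceberg.expressions.Expression;
import org.apache.iceberg.spark.SparkFilters;
import org.apache.spark.sql.sources.And;
import org.apache.spark.sql.sources.EqualTo;
import org.apache.spark.sql.sources.IsNotNull;

public class FilterConversionExample {
  public static void main(String[] args) {
    Expression expr =
        SparkFilters.convert(new And(new EqualTo("category", "books"), new IsNotNull("id")));
    System.out.println(expr); // expected shape: an AND of equal on "category" and notNull on "id"
  }
}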
@@ -113,8 +111,8 @@ public static Expression convert(Filter filter) { if (filter instanceof EqualTo) { EqualTo eq = (EqualTo) filter; // comparison with null in normal equality is always null. this is probably a mistake. - Preconditions.checkNotNull(eq.value(), - "Expression is always false (eq is not null-safe): %s", filter); + Preconditions.checkNotNull( + eq.value(), "Expression is always false (eq is not null-safe): %s", filter); return handleEqual(eq.attribute(), eq.value()); } else { EqualNullSafe eq = (EqualNullSafe) filter; @@ -127,7 +125,8 @@ public static Expression convert(Filter filter) { case IN: In inFilter = (In) filter; - return in(inFilter.attribute(), + return in( + inFilter.attribute(), Stream.of(inFilter.values()) .filter(Objects::nonNull) .map(SparkFilters::convertLiteral) @@ -141,30 +140,33 @@ public static Expression convert(Filter filter) { } return null; - case AND: { - And andFilter = (And) filter; - Expression left = convert(andFilter.left()); - Expression right = convert(andFilter.right()); - if (left != null && right != null) { - return and(left, right); + case AND: + { + And andFilter = (And) filter; + Expression left = convert(andFilter.left()); + Expression right = convert(andFilter.right()); + if (left != null && right != null) { + return and(left, right); + } + return null; } - return null; - } - - case OR: { - Or orFilter = (Or) filter; - Expression left = convert(orFilter.left()); - Expression right = convert(orFilter.right()); - if (left != null && right != null) { - return or(left, right); + + case OR: + { + Or orFilter = (Or) filter; + Expression left = convert(orFilter.left()); + Expression right = convert(orFilter.right()); + if (left != null && right != null) { + return or(left, right); + } + return null; } - return null; - } - case STARTS_WITH: { - StringStartsWith stringStartsWith = (StringStartsWith) filter; - return startsWith(stringStartsWith.attribute(), stringStartsWith.value()); - } + case STARTS_WITH: + { + StringStartsWith stringStartsWith = (StringStartsWith) filter; + return startsWith(stringStartsWith.attribute(), stringStartsWith.value()); + } } } diff --git a/spark/v2.4/spark/src/main/java/org/apache/iceberg/spark/SparkFixupTimestampType.java b/spark/v2.4/spark/src/main/java/org/apache/iceberg/spark/SparkFixupTimestampType.java index d4dd53d34a97..b35213501aef 100644 --- a/spark/v2.4/spark/src/main/java/org/apache/iceberg/spark/SparkFixupTimestampType.java +++ b/spark/v2.4/spark/src/main/java/org/apache/iceberg/spark/SparkFixupTimestampType.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark; import org.apache.iceberg.Schema; @@ -27,9 +26,10 @@ /** * By default Spark type {@link org.apache.iceberg.types.Types.TimestampType} should be converted to - * {@link Types.TimestampType#withZone()} iceberg type. But we also can convert - * {@link org.apache.iceberg.types.Types.TimestampType} to {@link Types.TimestampType#withoutZone()} iceberg type - * by setting {@link SparkSQLProperties#USE_TIMESTAMP_WITHOUT_TIME_ZONE_IN_NEW_TABLES} to 'true' + * {@link Types.TimestampType#withZone()} iceberg type. 
But we also can convert {@link + * org.apache.iceberg.types.Types.TimestampType} to {@link Types.TimestampType#withoutZone()} + * iceberg type by setting {@link SparkSQLProperties#USE_TIMESTAMP_WITHOUT_TIME_ZONE_IN_NEW_TABLES} + * to 'true' */ class SparkFixupTimestampType extends FixupTypes { @@ -38,8 +38,8 @@ private SparkFixupTimestampType(Schema referenceSchema) { } static Schema fixup(Schema schema) { - return new Schema(TypeUtil.visit(schema, - new SparkFixupTimestampType(schema)).asStructType().fields()); + return new Schema( + TypeUtil.visit(schema, new SparkFixupTimestampType(schema)).asStructType().fields()); } @Override diff --git a/spark/v2.4/spark/src/main/java/org/apache/iceberg/spark/SparkFixupTypes.java b/spark/v2.4/spark/src/main/java/org/apache/iceberg/spark/SparkFixupTypes.java index 5508965af249..6c4ec39b20f1 100644 --- a/spark/v2.4/spark/src/main/java/org/apache/iceberg/spark/SparkFixupTypes.java +++ b/spark/v2.4/spark/src/main/java/org/apache/iceberg/spark/SparkFixupTypes.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark; import org.apache.iceberg.Schema; @@ -25,8 +24,8 @@ import org.apache.iceberg.types.TypeUtil; /** - * Some types, like binary and fixed, are converted to the same Spark type. Conversion back - * can produce only one, which may not be correct. + * Some types, like binary and fixed, are converted to the same Spark type. Conversion back can + * produce only one, which may not be correct. */ class SparkFixupTypes extends FixupTypes { @@ -35,8 +34,8 @@ private SparkFixupTypes(Schema referenceSchema) { } static Schema fixup(Schema schema, Schema referenceSchema) { - return new Schema(TypeUtil.visit(schema, - new SparkFixupTypes(referenceSchema)).asStructType().fields()); + return new Schema( + TypeUtil.visit(schema, new SparkFixupTypes(referenceSchema)).asStructType().fields()); } @Override diff --git a/spark/v2.4/spark/src/main/java/org/apache/iceberg/spark/SparkReadConf.java b/spark/v2.4/spark/src/main/java/org/apache/iceberg/spark/SparkReadConf.java index 777b3594768c..425158492c78 100644 --- a/spark/v2.4/spark/src/main/java/org/apache/iceberg/spark/SparkReadConf.java +++ b/spark/v2.4/spark/src/main/java/org/apache/iceberg/spark/SparkReadConf.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark; import java.util.Map; @@ -31,18 +30,21 @@ /** * A class for common Iceberg configs for Spark reads. - *

- * If a config is set at multiple levels, the following order of precedence is used (top to bottom):
- * <ol>
- *   <li>Read options</li>
- *   <li>Session configuration</li>
- *   <li>Table metadata</li>
- * </ol>
- * The most specific value is set in read options and takes precedence over all other configs.
- * If no read option is provided, this class checks the session configuration for any overrides.
- * If no applicable value is found in the session configuration, this class uses the table metadata.
- * <p>
- * Note this class is NOT meant to be serialized and sent to executors.
+ *
+ * <p>If a config is set at multiple levels, the following order of precedence is used (top to
+ * bottom):
+ *
+ * <ol>
+ *   <li>Read options
+ *   <li>Session configuration
+ *   <li>Table metadata
+ * </ol>
+ *
+ * The most specific value is set in read options and takes precedence over all other configs. If no
+ * read option is provided, this class checks the session configuration for any overrides. If no
+ * applicable value is found in the session configuration, this class uses the table metadata.
+ *
+ * <p>

    Note this class is NOT meant to be serialized and sent to executors. */ public class SparkReadConf { @@ -64,41 +66,31 @@ public boolean localityEnabled() { if (file instanceof HadoopInputFile) { String scheme = ((HadoopInputFile) file).getFileSystem().getScheme(); boolean defaultValue = LOCALITY_WHITELIST_FS.contains(scheme); - return PropertyUtil.propertyAsBoolean( - readOptions, - SparkReadOptions.LOCALITY, - defaultValue); + return PropertyUtil.propertyAsBoolean(readOptions, SparkReadOptions.LOCALITY, defaultValue); } return false; } public Long snapshotId() { - return confParser.longConf() - .option(SparkReadOptions.SNAPSHOT_ID) - .parseOptional(); + return confParser.longConf().option(SparkReadOptions.SNAPSHOT_ID).parseOptional(); } public Long asOfTimestamp() { - return confParser.longConf() - .option(SparkReadOptions.AS_OF_TIMESTAMP) - .parseOptional(); + return confParser.longConf().option(SparkReadOptions.AS_OF_TIMESTAMP).parseOptional(); } public Long startSnapshotId() { - return confParser.longConf() - .option(SparkReadOptions.START_SNAPSHOT_ID) - .parseOptional(); + return confParser.longConf().option(SparkReadOptions.START_SNAPSHOT_ID).parseOptional(); } public Long endSnapshotId() { - return confParser.longConf() - .option(SparkReadOptions.END_SNAPSHOT_ID) - .parseOptional(); + return confParser.longConf().option(SparkReadOptions.END_SNAPSHOT_ID).parseOptional(); } public boolean parquetVectorizationEnabled() { - return confParser.booleanConf() + return confParser + .booleanConf() .option(SparkReadOptions.VECTORIZATION_ENABLED) .sessionConf(SparkSQLProperties.VECTORIZATION_ENABLED) .tableProperty(TableProperties.PARQUET_VECTORIZATION_ENABLED) @@ -107,7 +99,8 @@ public boolean parquetVectorizationEnabled() { } public int parquetBatchSize() { - return confParser.intConf() + return confParser + .intConf() .option(SparkReadOptions.VECTORIZATION_BATCH_SIZE) .tableProperty(TableProperties.PARQUET_BATCH_SIZE) .defaultValue(TableProperties.PARQUET_BATCH_SIZE_DEFAULT) @@ -115,7 +108,8 @@ public int parquetBatchSize() { } public boolean orcVectorizationEnabled() { - return confParser.booleanConf() + return confParser + .booleanConf() .option(SparkReadOptions.VECTORIZATION_ENABLED) .sessionConf(SparkSQLProperties.VECTORIZATION_ENABLED) .tableProperty(TableProperties.ORC_VECTORIZATION_ENABLED) @@ -124,7 +118,8 @@ public boolean orcVectorizationEnabled() { } public int orcBatchSize() { - return confParser.intConf() + return confParser + .intConf() .option(SparkReadOptions.VECTORIZATION_BATCH_SIZE) .tableProperty(TableProperties.ORC_BATCH_SIZE) .defaultValue(TableProperties.ORC_BATCH_SIZE_DEFAULT) @@ -132,7 +127,8 @@ public int orcBatchSize() { } public long splitSize() { - return confParser.longConf() + return confParser + .longConf() .option(SparkReadOptions.SPLIT_SIZE) .tableProperty(TableProperties.SPLIT_SIZE) .defaultValue(TableProperties.SPLIT_SIZE_DEFAULT) @@ -140,7 +136,8 @@ public long splitSize() { } public int splitLookback() { - return confParser.intConf() + return confParser + .intConf() .option(SparkReadOptions.LOOKBACK) .tableProperty(TableProperties.SPLIT_LOOKBACK) .defaultValue(TableProperties.SPLIT_LOOKBACK_DEFAULT) @@ -148,7 +145,8 @@ public int splitLookback() { } public long splitOpenFileCost() { - return confParser.longConf() + return confParser + .longConf() .option(SparkReadOptions.FILE_OPEN_COST) .tableProperty(TableProperties.SPLIT_OPEN_FILE_COST) .defaultValue(TableProperties.SPLIT_OPEN_FILE_COST_DEFAULT) @@ -157,18 +155,20 @@ public long splitOpenFileCost() 
{ /** * Enables reading a timestamp without time zone as a timestamp with time zone.
- * <p>
- * Generally, this is not safe as a timestamp without time zone is supposed to represent the wall-clock time,
- * i.e. no matter the reader/writer timezone 3PM should always be read as 3PM,
- * but a timestamp with time zone represents instant semantics, i.e. the timestamp
- * is adjusted so that the corresponding time in the reader timezone is displayed.
- * <p>
- * When set to false (default), an exception must be thrown while reading a timestamp without time zone.
+ *
+ * <p>Generally, this is not safe as a timestamp without time zone is supposed to represent the
+ * wall-clock time, i.e. no matter the reader/writer timezone 3PM should always be read as 3PM,
+ * but a timestamp with time zone represents instant semantics, i.e. the timestamp is adjusted so
+ * that the corresponding time in the reader timezone is displayed.
+ *
+ * <p>
    When set to false (default), an exception must be thrown while reading a timestamp without + * time zone. * * @return boolean indicating if reading timestamps without timezone is allowed */ public boolean handleTimestampWithoutZone() { - return confParser.booleanConf() + return confParser + .booleanConf() .option(SparkReadOptions.HANDLE_TIMESTAMP_WITHOUT_TIMEZONE) .sessionConf(SparkSQLProperties.HANDLE_TIMESTAMP_WITHOUT_TIMEZONE) .defaultValue(SparkSQLProperties.HANDLE_TIMESTAMP_WITHOUT_TIMEZONE_DEFAULT) diff --git a/spark/v2.4/spark/src/main/java/org/apache/iceberg/spark/SparkReadOptions.java b/spark/v2.4/spark/src/main/java/org/apache/iceberg/spark/SparkReadOptions.java index 571a2d59e70d..0b5c5902fa99 100644 --- a/spark/v2.4/spark/src/main/java/org/apache/iceberg/spark/SparkReadOptions.java +++ b/spark/v2.4/spark/src/main/java/org/apache/iceberg/spark/SparkReadOptions.java @@ -16,16 +16,12 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark; -/** - * Spark DF read options - */ +/** Spark DF read options */ public class SparkReadOptions { - private SparkReadOptions() { - } + private SparkReadOptions() {} // Snapshot ID of the table snapshot to read public static final String SNAPSHOT_ID = "snapshot-id"; @@ -61,7 +57,8 @@ private SparkReadOptions() { public static final String STREAMING_SKIP_DELETE_SNAPSHOTS = "streaming-skip-delete-snapshots"; // Controls whether to allow reading timestamps without zone info - public static final String HANDLE_TIMESTAMP_WITHOUT_TIMEZONE = "handle-timestamp-without-timezone"; + public static final String HANDLE_TIMESTAMP_WITHOUT_TIMEZONE = + "handle-timestamp-without-timezone"; // Controls whether to report locality information to Spark while allocating input partitions public static final String LOCALITY = "locality"; diff --git a/spark/v2.4/spark/src/main/java/org/apache/iceberg/spark/SparkSQLProperties.java b/spark/v2.4/spark/src/main/java/org/apache/iceberg/spark/SparkSQLProperties.java index f2dcc13bece0..fa8bd719f391 100644 --- a/spark/v2.4/spark/src/main/java/org/apache/iceberg/spark/SparkSQLProperties.java +++ b/spark/v2.4/spark/src/main/java/org/apache/iceberg/spark/SparkSQLProperties.java @@ -16,19 +16,18 @@ * specific language governing permissions and limitations * under the License. 
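To make the SparkReadConf precedence above concrete, a hedged sketch where the same setting is supplied at both the session and the read-option level; the table identifier db.tbl and the snapshot id are placeholders:

// Sketch only: per SparkReadConf, a read option overrides the session configuration, which in
// turn overrides table metadata. Option keys are taken from SparkReadOptions/SparkSQLProperties.
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SparkSession;

public class ReadConfPrecedenceExample {
  public static void main(String[] args) {
    SparkSession spark =
        SparkSession.builder().master("local[2]").appName("read-conf-example").getOrCreate();
    // Session-level default (SparkSQLProperties.HANDLE_TIMESTAMP_WITHOUT_TIMEZONE).
    spark.conf().set("spark.sql.iceberg.handle-timestamp-without-timezone", "false");
    Dataset<Row> df =
        spark
            .read()
            .format("iceberg")
            // Read-option level wins (SparkReadOptions.HANDLE_TIMESTAMP_WITHOUT_TIMEZONE).
            .option("handle-timestamp-without-timezone", "true")
            // Pin the read to a snapshot (SparkReadOptions.SNAPSHOT_ID); the id is a placeholder.
            .option("snapshot-id", 1234567890L)
            .load("db.tbl");
    df.show();
  }
}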
*/ - package org.apache.iceberg.spark; public class SparkSQLProperties { - private SparkSQLProperties() { - } + private SparkSQLProperties() {} // Controls whether vectorized reads are enabled public static final String VECTORIZATION_ENABLED = "spark.sql.iceberg.vectorization.enabled"; // Controls whether reading/writing timestamps without timezones is allowed - public static final String HANDLE_TIMESTAMP_WITHOUT_TIMEZONE = "spark.sql.iceberg.handle-timestamp-without-timezone"; + public static final String HANDLE_TIMESTAMP_WITHOUT_TIMEZONE = + "spark.sql.iceberg.handle-timestamp-without-timezone"; public static final boolean HANDLE_TIMESTAMP_WITHOUT_TIMEZONE_DEFAULT = false; // Controls whether timestamp types for new tables should be stored with timezone info diff --git a/spark/v2.4/spark/src/main/java/org/apache/iceberg/spark/SparkSchemaUtil.java b/spark/v2.4/spark/src/main/java/org/apache/iceberg/spark/SparkSchemaUtil.java index 321050dceb74..653987e654aa 100644 --- a/spark/v2.4/spark/src/main/java/org/apache/iceberg/spark/SparkSchemaUtil.java +++ b/spark/v2.4/spark/src/main/java/org/apache/iceberg/spark/SparkSchemaUtil.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark; import java.util.Collection; @@ -40,17 +39,14 @@ import org.apache.spark.sql.types.DataType; import org.apache.spark.sql.types.StructType; -/** - * Helper methods for working with Spark/Hive metadata. - */ +/** Helper methods for working with Spark/Hive metadata. */ public class SparkSchemaUtil { - private SparkSchemaUtil() { - } + private SparkSchemaUtil() {} /** * Returns a {@link Schema} for the given table with fresh field ids. - *

    - * This creates a Schema for an existing table by looking up the table's schema with Spark and + * + *

    This creates a Schema for an existing table by looking up the table's schema with Spark and * converting that schema. Spark/Hive partition columns are included in the schema. * * @param spark a Spark session @@ -65,8 +61,8 @@ public static Schema schemaForTable(SparkSession spark, String name) { /** * Returns a {@link PartitionSpec} for the given table. - *

    - * This creates a partition spec for an existing table by looking up the table's schema and + * + *

    This creates a partition spec for an existing table by looking up the table's schema and * creating a spec with identity partitions for each partition column. * * @param spark a Spark session @@ -74,14 +70,15 @@ public static Schema schemaForTable(SparkSession spark, String name) { * @return a PartitionSpec for the table * @throws AnalysisException if thrown by the Spark catalog */ - public static PartitionSpec specForTable(SparkSession spark, String name) throws AnalysisException { + public static PartitionSpec specForTable(SparkSession spark, String name) + throws AnalysisException { List parts = Lists.newArrayList(Splitter.on('.').limit(2).split(name)); String db = parts.size() == 1 ? "default" : parts.get(0); String table = parts.get(parts.size() == 1 ? 0 : 1); - PartitionSpec spec = identitySpec( - schemaForTable(spark, name), - spark.catalog().listColumns(db, table).collectAsList()); + PartitionSpec spec = + identitySpec( + schemaForTable(spark, name), spark.catalog().listColumns(db, table).collectAsList()); return spec == null ? PartitionSpec.unpartitioned() : spec; } @@ -109,13 +106,14 @@ public static DataType convert(Type type) { /** * Convert a Spark {@link StructType struct} to a {@link Schema} with new field ids. - *

    - * This conversion assigns fresh ids. - *

    - * Some data types are represented as the same Spark type. These are converted to a default type. - *

    - * To convert using a reference schema for field ids and ambiguous types, use - * {@link #convert(Schema, StructType)}. + * + *

    This conversion assigns fresh ids. + * + *

    Some data types are represented as the same Spark type. These are converted to a default + * type. + * + *

    To convert using a reference schema for field ids and ambiguous types, use {@link + * #convert(Schema, StructType)}. * * @param sparkType a Spark StructType * @return the equivalent Schema @@ -127,16 +125,18 @@ public static Schema convert(StructType sparkType) { /** * Convert a Spark {@link StructType struct} to a {@link Schema} with new field ids. - *

    - * This conversion assigns fresh ids. - *

    - * Some data types are represented as the same Spark type. These are converted to a default type. - *

    - * To convert using a reference schema for field ids and ambiguous types, use - * {@link #convert(Schema, StructType)}. + * + *

    This conversion assigns fresh ids. + * + *

    Some data types are represented as the same Spark type. These are converted to a default + * type. + * + *

    To convert using a reference schema for field ids and ambiguous types, use {@link + * #convert(Schema, StructType)}. * * @param sparkType a Spark StructType - * @param useTimestampWithoutZone boolean flag indicates that timestamp should be stored without timezone + * @param useTimestampWithoutZone boolean flag indicates that timestamp should be stored without + * timezone * @return the equivalent Schema * @throws IllegalArgumentException if the type cannot be converted */ @@ -151,13 +151,14 @@ public static Schema convert(StructType sparkType, boolean useTimestampWithoutZo /** * Convert a Spark {@link DataType struct} to a {@link Type} with new field ids. - *

    - * This conversion assigns fresh ids. - *

    - * Some data types are represented as the same Spark type. These are converted to a default type. - *

    - * To convert using a reference schema for field ids and ambiguous types, use - * {@link #convert(Schema, StructType)}. + * + *

    This conversion assigns fresh ids. + * + *

    Some data types are represented as the same Spark type. These are converted to a default + * type. + * + *

    To convert using a reference schema for field ids and ambiguous types, use {@link + * #convert(Schema, StructType)}. * * @param sparkType a Spark DataType * @return the equivalent Type @@ -169,11 +170,11 @@ public static Type convert(DataType sparkType) { /** * Convert a Spark {@link StructType struct} to a {@link Schema} based on the given schema. - *

    - * This conversion does not assign new ids; it uses ids from the base schema. - *

    - * Data types, field order, and nullability will match the spark type. This conversion may return - * a schema that is not compatible with base schema. + * + *

    This conversion does not assign new ids; it uses ids from the base schema. + * + *

    Data types, field order, and nullability will match the spark type. This conversion may + * return a schema that is not compatible with base schema. * * @param baseSchema a Schema on which conversion is based * @param sparkType a Spark StructType @@ -182,7 +183,8 @@ public static Type convert(DataType sparkType) { */ public static Schema convert(Schema baseSchema, StructType sparkType) { // convert to a type with fresh ids - Types.StructType struct = SparkTypeVisitor.visit(sparkType, new SparkTypeToType(sparkType)).asStructType(); + Types.StructType struct = + SparkTypeVisitor.visit(sparkType, new SparkTypeToType(sparkType)).asStructType(); // reassign ids to match the base schema Schema schema = TypeUtil.reassignIds(new Schema(struct.fields()), baseSchema); // fix types that can't be represented in Spark (UUID and Fixed) @@ -191,8 +193,8 @@ public static Schema convert(Schema baseSchema, StructType sparkType) { /** * Prune columns from a {@link Schema} using a {@link StructType Spark type} projection. - *

    - * This requires that the Spark type is a projection of the Schema. Nullability and types must + * + *

    This requires that the Spark type is a projection of the Schema. Nullability and types must * match. * * @param schema a Schema @@ -201,19 +203,20 @@ public static Schema convert(Schema baseSchema, StructType sparkType) { * @throws IllegalArgumentException if the Spark type does not match the Schema */ public static Schema prune(Schema schema, StructType requestedType) { - return new Schema(TypeUtil.visit(schema, new PruneColumnsWithoutReordering(requestedType, ImmutableSet.of())) - .asNestedType() - .asStructType() - .fields()); + return new Schema( + TypeUtil.visit(schema, new PruneColumnsWithoutReordering(requestedType, ImmutableSet.of())) + .asNestedType() + .asStructType() + .fields()); } /** * Prune columns from a {@link Schema} using a {@link StructType Spark type} projection. - *

    - * This requires that the Spark type is a projection of the Schema. Nullability and types must + * + *

    This requires that the Spark type is a projection of the Schema. Nullability and types must * match. - *

    - * The filters list of {@link Expression} is used to ensure that columns referenced by filters + * + *

    The filters list of {@link Expression} is used to ensure that columns referenced by filters * are projected. * * @param schema a Schema @@ -224,19 +227,20 @@ public static Schema prune(Schema schema, StructType requestedType) { */ public static Schema prune(Schema schema, StructType requestedType, List filters) { Set filterRefs = Binder.boundReferences(schema.asStruct(), filters, true); - return new Schema(TypeUtil.visit(schema, new PruneColumnsWithoutReordering(requestedType, filterRefs)) - .asNestedType() - .asStructType() - .fields()); + return new Schema( + TypeUtil.visit(schema, new PruneColumnsWithoutReordering(requestedType, filterRefs)) + .asNestedType() + .asStructType() + .fields()); } /** * Prune columns from a {@link Schema} using a {@link StructType Spark type} projection. - *

    - * This requires that the Spark type is a projection of the Schema. Nullability and types must + * + *

    This requires that the Spark type is a projection of the Schema. Nullability and types must * match. - *

    - * The filters list of {@link Expression} is used to ensure that columns referenced by filters + * + *

    The filters list of {@link Expression} is used to ensure that columns referenced by filters * are projected. * * @param schema a Schema @@ -245,14 +249,16 @@ public static Schema prune(Schema schema, StructType requestedType, List filterRefs = Binder.boundReferences(schema.asStruct(), Collections.singletonList(filter), caseSensitive); - return new Schema(TypeUtil.visit(schema, new PruneColumnsWithoutReordering(requestedType, filterRefs)) - .asNestedType() - .asStructType() - .fields()); + return new Schema( + TypeUtil.visit(schema, new PruneColumnsWithoutReordering(requestedType, filterRefs)) + .asNestedType() + .asStructType() + .fields()); } private static PartitionSpec identitySpec(Schema schema, Collection columns) { @@ -282,7 +288,7 @@ private static PartitionSpec identitySpec(Schema schema, List partitionN /** * Estimate approximate table size based on Spark schema and total records. * - * @param tableSchema Spark schema + * @param tableSchema Spark schema * @param totalRecords total records in the table * @return approximate size based on table schema */ diff --git a/spark/v2.4/spark/src/main/java/org/apache/iceberg/spark/SparkStructLike.java b/spark/v2.4/spark/src/main/java/org/apache/iceberg/spark/SparkStructLike.java index 30509e3381dc..77cfa0f34c63 100644 --- a/spark/v2.4/spark/src/main/java/org/apache/iceberg/spark/SparkStructLike.java +++ b/spark/v2.4/spark/src/main/java/org/apache/iceberg/spark/SparkStructLike.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark; import org.apache.iceberg.StructLike; diff --git a/spark/v2.4/spark/src/main/java/org/apache/iceberg/spark/SparkTableUtil.java b/spark/v2.4/spark/src/main/java/org/apache/iceberg/spark/SparkTableUtil.java index f731d6089a97..584468fa006c 100644 --- a/spark/v2.4/spark/src/main/java/org/apache/iceberg/spark/SparkTableUtil.java +++ b/spark/v2.4/spark/src/main/java/org/apache/iceberg/spark/SparkTableUtil.java @@ -16,9 +16,10 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark; +import static org.apache.spark.sql.functions.col; + import java.io.IOException; import java.io.Serializable; import java.net.URI; @@ -94,8 +95,6 @@ import scala.collection.Seq; import scala.runtime.AbstractPartialFunction; -import static org.apache.spark.sql.functions.col; - /** * Java version of the original SparkTableUtil.scala * https://github.com/apache/iceberg/blob/apache-iceberg-0.8.0-incubating/spark/src/main/scala/org/apache/iceberg/spark/SparkTableUtil.scala @@ -109,20 +108,19 @@ public class SparkTableUtil { private static final PathFilter HIDDEN_PATH_FILTER = p -> !p.getName().startsWith("_") && !p.getName().startsWith("."); - private static final String duplicateFileMessage = "Cannot complete import because data files " + - "to be imported already exist within the target table: %s. " + - "This is disabled by default as Iceberg is not designed for mulitple references to the same file" + - " within the same table. If you are sure, you may set 'check_duplicate_files' to false to force the import."; - + private static final String duplicateFileMessage = + "Cannot complete import because data files " + + "to be imported already exist within the target table: %s. " + + "This is disabled by default as Iceberg is not designed for mulitple references to the same file" + + " within the same table. 
If you are sure, you may set 'check_duplicate_files' to false to force the import."; - private SparkTableUtil() { - } + private SparkTableUtil() {} /** * Returns a DataFrame with a row for each partition in the table. * - * The DataFrame has 3 columns, partition key (a=1/b=2), partition location, and format - * (avro or parquet). + *

    The DataFrame has 3 columns, partition key (a=1/b=2), partition location, and format (avro + * or parquet). * * @param spark a Spark session * @param table a table name and (optional) database @@ -130,7 +128,9 @@ private SparkTableUtil() { */ public static Dataset partitionDF(SparkSession spark, String table) { List partitions = getPartitions(spark, table); - return spark.createDataFrame(partitions, SparkPartition.class).toDF("partition", "uri", "format"); + return spark + .createDataFrame(partitions, SparkPartition.class) + .toDF("partition", "uri", "format"); } /** @@ -141,9 +141,12 @@ public static Dataset partitionDF(SparkSession spark, String table) { * @param expression The expression whose matching partitions are returned. * @return a DataFrame of the table partitions. */ - public static Dataset partitionDFByFilter(SparkSession spark, String table, String expression) { + public static Dataset partitionDFByFilter( + SparkSession spark, String table, String expression) { List partitions = getPartitionsByFilter(spark, table, expression); - return spark.createDataFrame(partitions, SparkPartition.class).toDF("partition", "uri", "format"); + return spark + .createDataFrame(partitions, SparkPartition.class) + .toDF("partition", "uri", "format"); } /** @@ -158,7 +161,8 @@ public static List getPartitions(SparkSession spark, String tabl TableIdentifier tableIdent = spark.sessionState().sqlParser().parseTableIdentifier(table); return getPartitions(spark, tableIdent, null); } catch (ParseException e) { - throw SparkExceptionUtil.toUncheckedException(e, "Unable to parse table identifier: %s", table); + throw SparkExceptionUtil.toUncheckedException( + e, "Unable to parse table identifier: %s", table); } } @@ -170,30 +174,33 @@ public static List getPartitions(SparkSession spark, String tabl * @param partitionFilter partition filter, or null if no filter * @return all table's partitions */ - public static List getPartitions(SparkSession spark, TableIdentifier tableIdent, - Map partitionFilter) { + public static List getPartitions( + SparkSession spark, TableIdentifier tableIdent, Map partitionFilter) { try { SessionCatalog catalog = spark.sessionState().catalog(); CatalogTable catalogTable = catalog.getTableMetadata(tableIdent); Option> scalaPartitionFilter; if (partitionFilter != null && !partitionFilter.isEmpty()) { - scalaPartitionFilter = Option.apply(JavaConverters.mapAsScalaMapConverter(partitionFilter).asScala() - .toMap(Predef.conforms())); + scalaPartitionFilter = + Option.apply( + JavaConverters.mapAsScalaMapConverter(partitionFilter) + .asScala() + .toMap(Predef.conforms())); } else { scalaPartitionFilter = Option.empty(); } - Seq partitions = catalog.listPartitions(tableIdent, scalaPartitionFilter); - return JavaConverters - .seqAsJavaListConverter(partitions) - .asJava() - .stream() + Seq partitions = + catalog.listPartitions(tableIdent, scalaPartitionFilter); + return JavaConverters.seqAsJavaListConverter(partitions).asJava().stream() .map(catalogPartition -> toSparkPartition(catalogPartition, catalogTable)) .collect(Collectors.toList()); } catch (NoSuchDatabaseException e) { - throw SparkExceptionUtil.toUncheckedException(e, "Unknown table: %s. Database not found in catalog.", tableIdent); + throw SparkExceptionUtil.toUncheckedException( + e, "Unknown table: %s. Database not found in catalog.", tableIdent); } catch (NoSuchTableException e) { - throw SparkExceptionUtil.toUncheckedException(e, "Unknown table: %s. 
Table not found in catalog.", tableIdent); + throw SparkExceptionUtil.toUncheckedException( + e, "Unknown table: %s. Table not found in catalog.", tableIdent); } } @@ -205,19 +212,22 @@ public static List getPartitions(SparkSession spark, TableIdenti * @param predicate a predicate on partition columns * @return matching table's partitions */ - public static List getPartitionsByFilter(SparkSession spark, String table, String predicate) { + public static List getPartitionsByFilter( + SparkSession spark, String table, String predicate) { TableIdentifier tableIdent; try { tableIdent = spark.sessionState().sqlParser().parseTableIdentifier(table); } catch (ParseException e) { - throw SparkExceptionUtil.toUncheckedException(e, "Unable to parse the table identifier: %s", table); + throw SparkExceptionUtil.toUncheckedException( + e, "Unable to parse the table identifier: %s", table); } Expression unresolvedPredicateExpr; try { unresolvedPredicateExpr = spark.sessionState().sqlParser().parseExpression(predicate); } catch (ParseException e) { - throw SparkExceptionUtil.toUncheckedException(e, "Unable to parse the predicate expression: %s", predicate); + throw SparkExceptionUtil.toUncheckedException( + e, "Unable to parse the predicate expression: %s", predicate); } Expression resolvedPredicateExpr = resolveAttrs(spark, table, unresolvedPredicateExpr); @@ -232,8 +242,8 @@ public static List getPartitionsByFilter(SparkSession spark, Str * @param predicateExpr a predicate expression on partition columns * @return matching table's partitions */ - public static List getPartitionsByFilter(SparkSession spark, TableIdentifier tableIdent, - Expression predicateExpr) { + public static List getPartitionsByFilter( + SparkSession spark, TableIdentifier tableIdent, Expression predicateExpr) { try { SessionCatalog catalog = spark.sessionState().catalog(); CatalogTable catalogTable = catalog.getTableMetadata(tableIdent); @@ -244,111 +254,131 @@ public static List getPartitionsByFilter(SparkSession spark, Tab } else { resolvedPredicateExpr = predicateExpr; } - Seq predicates = JavaConverters - .collectionAsScalaIterableConverter(ImmutableList.of(resolvedPredicateExpr)) - .asScala().toSeq(); + Seq predicates = + JavaConverters.collectionAsScalaIterableConverter(ImmutableList.of(resolvedPredicateExpr)) + .asScala() + .toSeq(); - Seq partitions = catalog.listPartitionsByFilter(tableIdent, predicates); + Seq partitions = + catalog.listPartitionsByFilter(tableIdent, predicates); - return JavaConverters - .seqAsJavaListConverter(partitions) - .asJava() - .stream() + return JavaConverters.seqAsJavaListConverter(partitions).asJava().stream() .map(catalogPartition -> toSparkPartition(catalogPartition, catalogTable)) .collect(Collectors.toList()); } catch (NoSuchDatabaseException e) { - throw SparkExceptionUtil.toUncheckedException(e, "Unknown table: %s. Database not found in catalog.", tableIdent); + throw SparkExceptionUtil.toUncheckedException( + e, "Unknown table: %s. Database not found in catalog.", tableIdent); } catch (NoSuchTableException e) { - throw SparkExceptionUtil.toUncheckedException(e, "Unknown table: %s. Table not found in catalog.", tableIdent); + throw SparkExceptionUtil.toUncheckedException( + e, "Unknown table: %s. Table not found in catalog.", tableIdent); } } /** * Returns the data files in a partition by listing the partition location. * - * For Parquet and ORC partitions, this will read metrics from the file footer. For Avro partitions, - * metrics are set to null. + *

    For Parquet and ORC partitions, this will read metrics from the file footer. For Avro + * partitions, metrics are set to null. * * @param partition a partition * @param conf a serializable Hadoop conf * @param metricsConfig a metrics conf * @return a List of DataFile - * @deprecated use {@link TableMigrationUtil#listPartition(Map, String, String, PartitionSpec, Configuration, - * MetricsConfig, NameMapping)} + * @deprecated use {@link TableMigrationUtil#listPartition(Map, String, String, PartitionSpec, + * Configuration, MetricsConfig, NameMapping)} */ @Deprecated - public static List listPartition(SparkPartition partition, PartitionSpec spec, - SerializableConfiguration conf, MetricsConfig metricsConfig) { + public static List listPartition( + SparkPartition partition, + PartitionSpec spec, + SerializableConfiguration conf, + MetricsConfig metricsConfig) { return listPartition(partition, spec, conf, metricsConfig, null); } /** * Returns the data files in a partition by listing the partition location. * - * For Parquet and ORC partitions, this will read metrics from the file footer. For Avro partitions, - * metrics are set to null. + *

    For Parquet and ORC partitions, this will read metrics from the file footer. For Avro + * partitions, metrics are set to null. * * @param partition a partition * @param conf a serializable Hadoop conf * @param metricsConfig a metrics conf * @param mapping a name mapping * @return a List of DataFile - * @deprecated use {@link TableMigrationUtil#listPartition(Map, String, String, PartitionSpec, Configuration, - * MetricsConfig, NameMapping)} + * @deprecated use {@link TableMigrationUtil#listPartition(Map, String, String, PartitionSpec, + * Configuration, MetricsConfig, NameMapping)} */ @Deprecated - public static List listPartition(SparkPartition partition, PartitionSpec spec, - SerializableConfiguration conf, MetricsConfig metricsConfig, - NameMapping mapping) { - return TableMigrationUtil.listPartition(partition.values, partition.uri, partition.format, spec, conf.get(), - metricsConfig, mapping); + public static List listPartition( + SparkPartition partition, + PartitionSpec spec, + SerializableConfiguration conf, + MetricsConfig metricsConfig, + NameMapping mapping) { + return TableMigrationUtil.listPartition( + partition.values, + partition.uri, + partition.format, + spec, + conf.get(), + metricsConfig, + mapping); } - - private static SparkPartition toSparkPartition(CatalogTablePartition partition, CatalogTable table) { + private static SparkPartition toSparkPartition( + CatalogTablePartition partition, CatalogTable table) { Option locationUri = partition.storage().locationUri(); Option serde = partition.storage().serde(); Preconditions.checkArgument(locationUri.nonEmpty(), "Partition URI should be defined"); - Preconditions.checkArgument(serde.nonEmpty() || table.provider().nonEmpty(), - "Partition format should be defined"); + Preconditions.checkArgument( + serde.nonEmpty() || table.provider().nonEmpty(), "Partition format should be defined"); String uri = Util.uriToString(locationUri.get()); String format = serde.nonEmpty() ? 
serde.get() : table.provider().get(); - Map partitionSpec = JavaConverters.mapAsJavaMapConverter(partition.spec()).asJava(); + Map partitionSpec = + JavaConverters.mapAsJavaMapConverter(partition.spec()).asJava(); return new SparkPartition(partitionSpec, uri, format); } private static Expression resolveAttrs(SparkSession spark, String table, Expression expr) { Function2 resolver = spark.sessionState().analyzer().resolver(); LogicalPlan plan = spark.table(table).queryExecution().analyzed(); - return expr.transform(new AbstractPartialFunction() { - @Override - public Expression apply(Expression attr) { - UnresolvedAttribute unresolvedAttribute = (UnresolvedAttribute) attr; - Option namedExpressionOption = plan.resolve(unresolvedAttribute.nameParts(), resolver); - if (namedExpressionOption.isDefined()) { - return (Expression) namedExpressionOption.get(); - } else { - throw new IllegalArgumentException( - String.format("Could not resolve %s using columns: %s", attr, plan.output())); - } - } - - @Override - public boolean isDefinedAt(Expression attr) { - return attr instanceof UnresolvedAttribute; - } - }); + return expr.transform( + new AbstractPartialFunction() { + @Override + public Expression apply(Expression attr) { + UnresolvedAttribute unresolvedAttribute = (UnresolvedAttribute) attr; + Option namedExpressionOption = + plan.resolve(unresolvedAttribute.nameParts(), resolver); + if (namedExpressionOption.isDefined()) { + return (Expression) namedExpressionOption.get(); + } else { + throw new IllegalArgumentException( + String.format("Could not resolve %s using columns: %s", attr, plan.output())); + } + } + + @Override + public boolean isDefinedAt(Expression attr) { + return attr instanceof UnresolvedAttribute; + } + }); } - private static Iterator buildManifest(SerializableConfiguration conf, PartitionSpec spec, - String basePath, Iterator> fileTuples) { + private static Iterator buildManifest( + SerializableConfiguration conf, + PartitionSpec spec, + String basePath, + Iterator> fileTuples) { if (fileTuples.hasNext()) { FileIO io = new HadoopFileIO(conf.get()); TaskContext ctx = TaskContext.get(); - String suffix = String.format("stage-%d-task-%d-manifest", ctx.stageId(), ctx.taskAttemptId()); + String suffix = + String.format("stage-%d-task-%d-manifest", ctx.stageId(), ctx.taskAttemptId()); Path location = new Path(basePath, suffix); String outputPath = FileFormat.AVRO.addExtension(location.toString()); OutputFile outputFile = io.newOutputFile(outputPath); @@ -357,7 +387,8 @@ private static Iterator buildManifest(SerializableConfiguration co try (ManifestWriter writerRef = writer) { fileTuples.forEachRemaining(fileTuple -> writerRef.add(fileTuple._2)); } catch (IOException e) { - throw SparkExceptionUtil.toUncheckedException(e, "Unable to close the manifest writer: %s", outputPath); + throw SparkExceptionUtil.toUncheckedException( + e, "Unable to close the manifest writer: %s", outputPath); } ManifestFile manifestFile = writer.toManifestFile(); @@ -370,42 +401,54 @@ private static Iterator buildManifest(SerializableConfiguration co /** * Import files from an existing Spark table to an Iceberg table. * - * The import uses the Spark session to get table metadata. It assumes no - * operation is going on the original and target table and thus is not - * thread-safe. + *
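For reference, a minimal sketch of calling the reformatted getPartitionsByFilter helper from user code; the table name db.logs, the partition column ds, and the app name are placeholders, and a Hive-enabled session is assumed:

import java.util.List;
import org.apache.iceberg.spark.SparkTableUtil;
import org.apache.iceberg.spark.SparkTableUtil.SparkPartition;
import org.apache.spark.sql.SparkSession;

public class PartitionFilterExample {
  public static void main(String[] args) {
    // Assumes a Hive-backed table db.logs partitioned by ds (placeholder names).
    SparkSession spark =
        SparkSession.builder().appName("partition-filter").enableHiveSupport().getOrCreate();

    // Parses the predicate with Spark's SQL parser, resolves it against the table's columns,
    // and returns only the partitions that match.
    List<SparkPartition> partitions =
        SparkTableUtil.getPartitionsByFilter(spark, "db.logs", "ds >= '2021-01-01'");

    partitions.forEach(p -> System.out.println(p));
  }
}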

    The import uses the Spark session to get table metadata. It assumes no operation is going on + * the original and target table and thus is not thread-safe. * * @param spark a Spark session * @param sourceTableIdent an identifier of the source Spark table * @param targetTable an Iceberg table where to import the data * @param stagingDir a staging directory to store temporary manifest files - * @param partitionFilter only import partitions whose values match those in the map, can be partially defined + * @param partitionFilter only import partitions whose values match those in the map, can be + * partially defined * @param checkDuplicateFiles if true, throw exception if import results in a duplicate data file */ - public static void importSparkTable(SparkSession spark, TableIdentifier sourceTableIdent, Table targetTable, - String stagingDir, Map partitionFilter, - boolean checkDuplicateFiles) { + public static void importSparkTable( + SparkSession spark, + TableIdentifier sourceTableIdent, + Table targetTable, + String stagingDir, + Map partitionFilter, + boolean checkDuplicateFiles) { SessionCatalog catalog = spark.sessionState().catalog(); - String db = sourceTableIdent.database().nonEmpty() ? - sourceTableIdent.database().get() : - catalog.getCurrentDatabase(); - TableIdentifier sourceTableIdentWithDB = new TableIdentifier(sourceTableIdent.table(), Some.apply(db)); + String db = + sourceTableIdent.database().nonEmpty() + ? sourceTableIdent.database().get() + : catalog.getCurrentDatabase(); + TableIdentifier sourceTableIdentWithDB = + new TableIdentifier(sourceTableIdent.table(), Some.apply(db)); if (!catalog.tableExists(sourceTableIdentWithDB)) { - throw new org.apache.iceberg.exceptions.NoSuchTableException("Table %s does not exist", sourceTableIdentWithDB); + throw new org.apache.iceberg.exceptions.NoSuchTableException( + "Table %s does not exist", sourceTableIdentWithDB); } try { - PartitionSpec spec = SparkSchemaUtil.specForTable(spark, sourceTableIdentWithDB.unquotedString()); + PartitionSpec spec = + SparkSchemaUtil.specForTable(spark, sourceTableIdentWithDB.unquotedString()); if (Objects.equal(spec, PartitionSpec.unpartitioned())) { - importUnpartitionedSparkTable(spark, sourceTableIdentWithDB, targetTable, checkDuplicateFiles); + importUnpartitionedSparkTable( + spark, sourceTableIdentWithDB, targetTable, checkDuplicateFiles); } else { - List sourceTablePartitions = getPartitions(spark, sourceTableIdent, - partitionFilter); - Preconditions.checkArgument(!sourceTablePartitions.isEmpty(), - "Cannot find any partitions in table %s", sourceTableIdent); - importSparkPartitions(spark, sourceTablePartitions, targetTable, spec, stagingDir, checkDuplicateFiles); + List sourceTablePartitions = + getPartitions(spark, sourceTableIdent, partitionFilter); + Preconditions.checkArgument( + !sourceTablePartitions.isEmpty(), + "Cannot find any partitions in table %s", + sourceTableIdent); + importSparkPartitions( + spark, sourceTablePartitions, targetTable, spec, stagingDir, checkDuplicateFiles); } } catch (AnalysisException e) { throw SparkExceptionUtil.toUncheckedException( @@ -416,9 +459,8 @@ public static void importSparkTable(SparkSession spark, TableIdentifier sourceTa /** * Import files from an existing Spark table to an Iceberg table. * - * The import uses the Spark session to get table metadata. It assumes no - * operation is going on the original and target table and thus is not - * thread-safe. + *

    The import uses the Spark session to get table metadata. It assumes no operation is going on + * the original and target table and thus is not thread-safe. * * @param spark a Spark session * @param sourceTableIdent an identifier of the source Spark table @@ -426,33 +468,49 @@ public static void importSparkTable(SparkSession spark, TableIdentifier sourceTa * @param stagingDir a staging directory to store temporary manifest files * @param checkDuplicateFiles if true, throw exception if import results in a duplicate data file */ - public static void importSparkTable(SparkSession spark, TableIdentifier sourceTableIdent, Table targetTable, - String stagingDir, boolean checkDuplicateFiles) { - importSparkTable(spark, sourceTableIdent, targetTable, stagingDir, Collections.emptyMap(), checkDuplicateFiles); + public static void importSparkTable( + SparkSession spark, + TableIdentifier sourceTableIdent, + Table targetTable, + String stagingDir, + boolean checkDuplicateFiles) { + importSparkTable( + spark, + sourceTableIdent, + targetTable, + stagingDir, + Collections.emptyMap(), + checkDuplicateFiles); } /** * Import files from an existing Spark table to an Iceberg table. * - * The import uses the Spark session to get table metadata. It assumes no - * operation is going on the original and target table and thus is not - * thread-safe. + *

    The import uses the Spark session to get table metadata. It assumes no operation is going on + * the original and target table and thus is not thread-safe. + * * @param spark a Spark session * @param sourceTableIdent an identifier of the source Spark table * @param targetTable an Iceberg table where to import the data * @param stagingDir a staging directory to store temporary manifest files */ - public static void importSparkTable(SparkSession spark, TableIdentifier sourceTableIdent, Table targetTable, - String stagingDir) { - importSparkTable(spark, sourceTableIdent, targetTable, stagingDir, Collections.emptyMap(), false); + public static void importSparkTable( + SparkSession spark, TableIdentifier sourceTableIdent, Table targetTable, String stagingDir) { + importSparkTable( + spark, sourceTableIdent, targetTable, stagingDir, Collections.emptyMap(), false); } - private static void importUnpartitionedSparkTable(SparkSession spark, TableIdentifier sourceTableIdent, - Table targetTable, boolean checkDuplicateFiles) { + private static void importUnpartitionedSparkTable( + SparkSession spark, + TableIdentifier sourceTableIdent, + Table targetTable, + boolean checkDuplicateFiles) { try { CatalogTable sourceTable = spark.sessionState().catalog().getTableMetadata(sourceTableIdent); Option format = - sourceTable.storage().serde().nonEmpty() ? sourceTable.storage().serde() : sourceTable.provider(); + sourceTable.storage().serde().nonEmpty() + ? sourceTable.storage().serde() + : sourceTable.provider(); Preconditions.checkArgument(format.nonEmpty(), "Could not determine table format"); Map partition = Collections.emptyMap(); @@ -460,20 +518,34 @@ private static void importUnpartitionedSparkTable(SparkSession spark, TableIdent Configuration conf = spark.sessionState().newHadoopConf(); MetricsConfig metricsConfig = MetricsConfig.forTable(targetTable); String nameMappingString = targetTable.properties().get(TableProperties.DEFAULT_NAME_MAPPING); - NameMapping nameMapping = nameMappingString != null ? NameMappingParser.fromJson(nameMappingString) : null; - - List files = TableMigrationUtil.listPartition( - partition, Util.uriToString(sourceTable.location()), format.get(), spec, conf, metricsConfig, nameMapping); + NameMapping nameMapping = + nameMappingString != null ? 
NameMappingParser.fromJson(nameMappingString) : null; + + List files = + TableMigrationUtil.listPartition( + partition, + Util.uriToString(sourceTable.location()), + format.get(), + spec, + conf, + metricsConfig, + nameMapping); if (checkDuplicateFiles) { - Dataset importedFiles = spark.createDataset( - Lists.transform(files, f -> f.path().toString()), Encoders.STRING()).toDF("file_path"); - Dataset existingFiles = loadMetadataTable(spark, targetTable, MetadataTableType.ENTRIES); - Column joinCond = existingFiles.col("data_file.file_path").equalTo(importedFiles.col("file_path")); - Dataset duplicates = importedFiles.join(existingFiles, joinCond) - .select("file_path").as(Encoders.STRING()); - Preconditions.checkState(duplicates.isEmpty(), - String.format(duplicateFileMessage, Joiner.on(",").join((String[]) duplicates.take(10)))); + Dataset importedFiles = + spark + .createDataset(Lists.transform(files, f -> f.path().toString()), Encoders.STRING()) + .toDF("file_path"); + Dataset existingFiles = + loadMetadataTable(spark, targetTable, MetadataTableType.ENTRIES); + Column joinCond = + existingFiles.col("data_file.file_path").equalTo(importedFiles.col("file_path")); + Dataset duplicates = + importedFiles.join(existingFiles, joinCond).select("file_path").as(Encoders.STRING()); + Preconditions.checkState( + duplicates.isEmpty(), + String.format( + duplicateFileMessage, Joiner.on(",").join((String[]) duplicates.take(10)))); } AppendFiles append = targetTable.newAppend(); @@ -498,55 +570,72 @@ private static void importUnpartitionedSparkTable(SparkSession spark, TableIdent * @param stagingDir a staging directory to store temporary manifest files * @param checkDuplicateFiles if true, throw exception if import results in a duplicate data file */ - public static void importSparkPartitions(SparkSession spark, List partitions, Table targetTable, - PartitionSpec spec, String stagingDir, boolean checkDuplicateFiles) { + public static void importSparkPartitions( + SparkSession spark, + List partitions, + Table targetTable, + PartitionSpec spec, + String stagingDir, + boolean checkDuplicateFiles) { Configuration conf = spark.sessionState().newHadoopConf(); SerializableConfiguration serializableConf = new SerializableConfiguration(conf); - int parallelism = Math.min(partitions.size(), spark.sessionState().conf().parallelPartitionDiscoveryParallelism()); + int parallelism = + Math.min( + partitions.size(), spark.sessionState().conf().parallelPartitionDiscoveryParallelism()); int numShufflePartitions = spark.sessionState().conf().numShufflePartitions(); MetricsConfig metricsConfig = MetricsConfig.fromProperties(targetTable.properties()); String nameMappingString = targetTable.properties().get(TableProperties.DEFAULT_NAME_MAPPING); - NameMapping nameMapping = nameMappingString != null ? NameMappingParser.fromJson(nameMappingString) : null; + NameMapping nameMapping = + nameMappingString != null ? 
NameMappingParser.fromJson(nameMappingString) : null; JavaSparkContext sparkContext = JavaSparkContext.fromSparkContext(spark.sparkContext()); JavaRDD partitionRDD = sparkContext.parallelize(partitions, parallelism); - Dataset partitionDS = spark.createDataset( - partitionRDD.rdd(), - Encoders.javaSerialization(SparkPartition.class)); + Dataset partitionDS = + spark.createDataset(partitionRDD.rdd(), Encoders.javaSerialization(SparkPartition.class)); - Dataset filesToImport = partitionDS - .flatMap((FlatMapFunction) sparkPartition -> - listPartition(sparkPartition, spec, serializableConf, metricsConfig, nameMapping).iterator(), + Dataset filesToImport = + partitionDS.flatMap( + (FlatMapFunction) + sparkPartition -> + listPartition( + sparkPartition, spec, serializableConf, metricsConfig, nameMapping) + .iterator(), Encoders.javaSerialization(DataFile.class)); if (checkDuplicateFiles) { - Dataset importedFiles = filesToImport.map(f -> f.path().toString(), Encoders.STRING()).toDF("file_path"); + Dataset importedFiles = + filesToImport.map(f -> f.path().toString(), Encoders.STRING()).toDF("file_path"); Dataset existingFiles = loadMetadataTable(spark, targetTable, MetadataTableType.ENTRIES); - Column joinCond = existingFiles.col("data_file.file_path").equalTo(importedFiles.col("file_path")); - Dataset duplicates = importedFiles.join(existingFiles, joinCond) - .select("file_path").as(Encoders.STRING()); - Preconditions.checkState(duplicates.isEmpty(), + Column joinCond = + existingFiles.col("data_file.file_path").equalTo(importedFiles.col("file_path")); + Dataset duplicates = + importedFiles.join(existingFiles, joinCond).select("file_path").as(Encoders.STRING()); + Preconditions.checkState( + duplicates.isEmpty(), String.format(duplicateFileMessage, Joiner.on(",").join((String[]) duplicates.take(10)))); } - List manifests = filesToImport - .repartition(numShufflePartitions) - .map((MapFunction>) file -> - Tuple2.apply(file.path().toString(), file), - Encoders.tuple(Encoders.STRING(), Encoders.javaSerialization(DataFile.class))) - .orderBy(col("_1")) - .mapPartitions( - (MapPartitionsFunction, ManifestFile>) fileTuple -> - buildManifest(serializableConf, spec, stagingDir, fileTuple), - Encoders.javaSerialization(ManifestFile.class)) - .collectAsList(); + List manifests = + filesToImport + .repartition(numShufflePartitions) + .map( + (MapFunction>) + file -> Tuple2.apply(file.path().toString(), file), + Encoders.tuple(Encoders.STRING(), Encoders.javaSerialization(DataFile.class))) + .orderBy(col("_1")) + .mapPartitions( + (MapPartitionsFunction, ManifestFile>) + fileTuple -> buildManifest(serializableConf, spec, stagingDir, fileTuple), + Encoders.javaSerialization(ManifestFile.class)) + .collectAsList(); try { - boolean snapshotIdInheritanceEnabled = PropertyUtil.propertyAsBoolean( - targetTable.properties(), - TableProperties.SNAPSHOT_ID_INHERITANCE_ENABLED, - TableProperties.SNAPSHOT_ID_INHERITANCE_ENABLED_DEFAULT); + boolean snapshotIdInheritanceEnabled = + PropertyUtil.propertyAsBoolean( + targetTable.properties(), + TableProperties.SNAPSHOT_ID_INHERITANCE_ENABLED, + TableProperties.SNAPSHOT_ID_INHERITANCE_ENABLED_DEFAULT); AppendFiles append = targetTable.newAppend(); manifests.forEach(append::appendManifest); @@ -571,13 +660,17 @@ public static void importSparkPartitions(SparkSession spark, List partitions, Table targetTable, - PartitionSpec spec, String stagingDir) { + public static void importSparkPartitions( + SparkSession spark, + List partitions, + Table targetTable, + PartitionSpec 
spec, + String stagingDir) { importSparkPartitions(spark, partitions, targetTable, spec, stagingDir, false); } - public static List filterPartitions(List partitions, - Map partitionFilter) { + public static List filterPartitions( + List partitions, Map partitionFilter) { if (partitionFilter.isEmpty()) { return partitions; } else { @@ -595,17 +688,25 @@ private static void deleteManifests(FileIO io, List manifests) { } // Attempt to use Spark3 Catalog resolution if available on the path - private static final DynMethods.UnboundMethod LOAD_METADATA_TABLE = DynMethods.builder("loadMetadataTable") - .hiddenImpl("org.apache.iceberg.spark.Spark3Util", SparkSession.class, Table.class, MetadataTableType.class) - .orNoop() - .build(); - - public static Dataset loadCatalogMetadataTable(SparkSession spark, Table table, MetadataTableType type) { - Preconditions.checkArgument(!LOAD_METADATA_TABLE.isNoop(), "Cannot find Spark3Util class but Spark3 is in use"); + private static final DynMethods.UnboundMethod LOAD_METADATA_TABLE = + DynMethods.builder("loadMetadataTable") + .hiddenImpl( + "org.apache.iceberg.spark.Spark3Util", + SparkSession.class, + Table.class, + MetadataTableType.class) + .orNoop() + .build(); + + public static Dataset loadCatalogMetadataTable( + SparkSession spark, Table table, MetadataTableType type) { + Preconditions.checkArgument( + !LOAD_METADATA_TABLE.isNoop(), "Cannot find Spark3Util class but Spark3 is in use"); return LOAD_METADATA_TABLE.asStatic().invoke(spark, table, type); } - public static Dataset loadMetadataTable(SparkSession spark, Table table, MetadataTableType type) { + public static Dataset loadMetadataTable( + SparkSession spark, Table table, MetadataTableType type) { if (spark.version().startsWith("3")) { // construct the metadata table instance directly Dataset catalogMetadataTable = loadCatalogMetadataTable(spark, table, type); @@ -631,14 +732,12 @@ public static Dataset loadMetadataTable(SparkSession spark, Table table, Me // Try loading by name as a Hive table without Catalog return dataFrameReader.load(tableName.replaceFirst("hive\\.", "") + "." + type); } else { - throw new IllegalArgumentException(String.format( - "Cannot find the metadata table for %s of type %s", tableName, type)); + throw new IllegalArgumentException( + String.format("Cannot find the metadata table for %s of type %s", tableName, type)); } } - /** - * Class representing a table partition. - */ + /** Class representing a table partition. */ public static class SparkPartition implements Serializable { private final Map values; private final String uri; @@ -680,9 +779,9 @@ public boolean equals(Object o) { return false; } SparkPartition that = (SparkPartition) o; - return Objects.equal(values, that.values) && - Objects.equal(uri, that.uri) && - Objects.equal(format, that.format); + return Objects.equal(values, that.values) + && Objects.equal(uri, that.uri) + && Objects.equal(format, that.format); } @Override diff --git a/spark/v2.4/spark/src/main/java/org/apache/iceberg/spark/SparkTypeToType.java b/spark/v2.4/spark/src/main/java/org/apache/iceberg/spark/SparkTypeToType.java index f0b8b2a9762b..17499736fbeb 100644 --- a/spark/v2.4/spark/src/main/java/org/apache/iceberg/spark/SparkTypeToType.java +++ b/spark/v2.4/spark/src/main/java/org/apache/iceberg/spark/SparkTypeToType.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
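A hedged sketch of the four-argument importSparkTable overload shown above; the source table db.events, the target Iceberg table handle, and the staging path are placeholders:

import org.apache.iceberg.Table;
import org.apache.iceberg.spark.SparkTableUtil;
import org.apache.spark.sql.SparkSession;
import org.apache.spark.sql.catalyst.TableIdentifier;
import scala.Some;

public class ImportExample {
  // Imports all files of the Hive table db.events into an existing Iceberg table.
  public static void importEvents(SparkSession spark, Table targetTable) {
    TableIdentifier source = new TableIdentifier("events", Some.apply("db"));
    // Lists the source partitions, writes temporary manifests under the staging directory,
    // and appends the discovered data files to the Iceberg table in one commit.
    SparkTableUtil.importSparkTable(spark, source, targetTable, "/tmp/iceberg-staging");
  }
}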
*/ - package org.apache.iceberg.spark; import java.util.List; @@ -70,7 +69,7 @@ public Type struct(StructType struct, List types) { List newFields = Lists.newArrayListWithExpectedSize(fields.length); boolean isRoot = root == struct; for (int i = 0; i < fields.length; i += 1) { - StructField field = fields[i]; + StructField field = fields[i]; Type type = types.get(i); int id; @@ -122,10 +121,9 @@ public Type atomic(DataType atomic) { if (atomic instanceof BooleanType) { return Types.BooleanType.get(); - } else if ( - atomic instanceof IntegerType || - atomic instanceof ShortType || - atomic instanceof ByteType) { + } else if (atomic instanceof IntegerType + || atomic instanceof ShortType + || atomic instanceof ByteType) { return Types.IntegerType.get(); } else if (atomic instanceof LongType) { @@ -137,10 +135,9 @@ public Type atomic(DataType atomic) { } else if (atomic instanceof DoubleType) { return Types.DoubleType.get(); - } else if ( - atomic instanceof StringType || - atomic instanceof CharType || - atomic instanceof VarcharType) { + } else if (atomic instanceof StringType + || atomic instanceof CharType + || atomic instanceof VarcharType) { return Types.StringType.get(); } else if (atomic instanceof DateType) { @@ -151,13 +148,11 @@ public Type atomic(DataType atomic) { } else if (atomic instanceof DecimalType) { return Types.DecimalType.of( - ((DecimalType) atomic).precision(), - ((DecimalType) atomic).scale()); + ((DecimalType) atomic).precision(), ((DecimalType) atomic).scale()); } else if (atomic instanceof BinaryType) { return Types.BinaryType.get(); } - throw new UnsupportedOperationException( - "Not a supported type: " + atomic.catalogString()); + throw new UnsupportedOperationException("Not a supported type: " + atomic.catalogString()); } } diff --git a/spark/v2.4/spark/src/main/java/org/apache/iceberg/spark/SparkTypeVisitor.java b/spark/v2.4/spark/src/main/java/org/apache/iceberg/spark/SparkTypeVisitor.java index 83b31940711e..1ef694263fa4 100644 --- a/spark/v2.4/spark/src/main/java/org/apache/iceberg/spark/SparkTypeVisitor.java +++ b/spark/v2.4/spark/src/main/java/org/apache/iceberg/spark/SparkTypeVisitor.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.spark; import java.util.List; @@ -35,26 +34,22 @@ static T visit(DataType type, SparkTypeVisitor visitor) { List fieldResults = Lists.newArrayListWithExpectedSize(fields.length); for (StructField field : fields) { - fieldResults.add(visitor.field( - field, - visit(field.dataType(), visitor))); + fieldResults.add(visitor.field(field, visit(field.dataType(), visitor))); } return visitor.struct((StructType) type, fieldResults); } else if (type instanceof MapType) { - return visitor.map((MapType) type, + return visitor.map( + (MapType) type, visit(((MapType) type).keyType(), visitor), visit(((MapType) type).valueType(), visitor)); } else if (type instanceof ArrayType) { - return visitor.array( - (ArrayType) type, - visit(((ArrayType) type).elementType(), visitor)); + return visitor.array((ArrayType) type, visit(((ArrayType) type).elementType(), visitor)); } else if (type instanceof UserDefinedType) { - throw new UnsupportedOperationException( - "User-defined types are not supported"); + throw new UnsupportedOperationException("User-defined types are not supported"); } else { return visitor.atomic(type); diff --git a/spark/v2.4/spark/src/main/java/org/apache/iceberg/spark/SparkUtil.java b/spark/v2.4/spark/src/main/java/org/apache/iceberg/spark/SparkUtil.java index 06f74d4fda06..2cdec2b0629c 100644 --- a/spark/v2.4/spark/src/main/java/org/apache/iceberg/spark/SparkUtil.java +++ b/spark/v2.4/spark/src/main/java/org/apache/iceberg/spark/SparkUtil.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark; import java.util.List; @@ -42,26 +41,33 @@ public class SparkUtil { - public static final String TIMESTAMP_WITHOUT_TIMEZONE_ERROR = String.format("Cannot handle timestamp without" + - " timezone fields in Spark. Spark does not natively support this type but if you would like to handle all" + - " timestamps as timestamp with timezone set '%s' to true. This will not change the underlying values stored" + - " but will change their displayed values in Spark. For more information please see" + - " https://docs.databricks.com/spark/latest/dataframes-datasets/dates-timestamps.html#ansi-sql-and" + - "-spark-sql-timestamps", SparkSQLProperties.HANDLE_TIMESTAMP_WITHOUT_TIMEZONE); + public static final String TIMESTAMP_WITHOUT_TIMEZONE_ERROR = + String.format( + "Cannot handle timestamp without" + + " timezone fields in Spark. Spark does not natively support this type but if you would like to handle all" + + " timestamps as timestamp with timezone set '%s' to true. This will not change the underlying values stored" + + " but will change their displayed values in Spark. For more information please see" + + " https://docs.databricks.com/spark/latest/dataframes-datasets/dates-timestamps.html#ansi-sql-and" + + "-spark-sql-timestamps", + SparkSQLProperties.HANDLE_TIMESTAMP_WITHOUT_TIMEZONE); private static final String SPARK_CATALOG_CONF_PREFIX = "spark.sql.catalog"; - // Format string used as the prefix for spark configuration keys to override hadoop configuration values - // for Iceberg tables from a given catalog. These keys can be specified as `spark.sql.catalog.$catalogName.hadoop.*`, - // similar to using `spark.hadoop.*` to override hadoop configurations globally for a given spark session. 
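The SparkTypeToType and SparkTypeVisitor hunks above are formatting-only; for context, a small sketch of the public conversion path that drives these visitors (SparkSchemaUtil.convert is assumed to be the usual entry point, and the column names are placeholders):

import org.apache.iceberg.Schema;
import org.apache.iceberg.spark.SparkSchemaUtil;
import org.apache.spark.sql.types.DataTypes;
import org.apache.spark.sql.types.Metadata;
import org.apache.spark.sql.types.StructField;
import org.apache.spark.sql.types.StructType;

public class SchemaConversionExample {
  public static void main(String[] args) {
    // A Spark schema with a long and a timestamp column (placeholder names).
    StructType sparkSchema =
        new StructType(
            new StructField[] {
              new StructField("id", DataTypes.LongType, false, Metadata.empty()),
              new StructField("event_time", DataTypes.TimestampType, true, Metadata.empty())
            });

    // Walks the Spark types with the visitors above and assigns fresh Iceberg field ids.
    Schema icebergSchema = SparkSchemaUtil.convert(sparkSchema);
    System.out.println(icebergSchema.asStruct());
  }
}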
- private static final String SPARK_CATALOG_HADOOP_CONF_OVERRIDE_FMT_STR = SPARK_CATALOG_CONF_PREFIX + ".%s.hadoop."; + // Format string used as the prefix for spark configuration keys to override hadoop configuration + // values + // for Iceberg tables from a given catalog. These keys can be specified as + // `spark.sql.catalog.$catalogName.hadoop.*`, + // similar to using `spark.hadoop.*` to override hadoop configurations globally for a given spark + // session. + private static final String SPARK_CATALOG_HADOOP_CONF_OVERRIDE_FMT_STR = + SPARK_CATALOG_CONF_PREFIX + ".%s.hadoop."; - private SparkUtil() { - } + private SparkUtil() {} public static FileIO serializableFileIO(Table table) { if (table.io() instanceof HadoopConfigurable) { // we need to use Spark's SerializableConfiguration to avoid issues with Kryo serialization - ((HadoopConfigurable) table.io()).serializeConfWith(conf -> new SerializableConfiguration(conf)::value); + ((HadoopConfigurable) table.io()) + .serializeConfWith(conf -> new SerializableConfiguration(conf)::value); } return table.io(); @@ -75,11 +81,12 @@ public static FileIO serializableFileIO(Table table) { */ public static void validatePartitionTransforms(PartitionSpec spec) { if (spec.fields().stream().anyMatch(field -> field.transform() instanceof UnknownTransform)) { - String unsupported = spec.fields().stream() - .map(PartitionField::transform) - .filter(transform -> transform instanceof UnknownTransform) - .map(Transform::toString) - .collect(Collectors.joining(", ")); + String unsupported = + spec.fields().stream() + .map(PartitionField::transform) + .filter(transform -> transform instanceof UnknownTransform) + .map(Transform::toString) + .collect(Collectors.joining(", ")); throw new UnsupportedOperationException( String.format("Cannot write using unsupported transforms: %s", unsupported)); @@ -87,18 +94,20 @@ public static void validatePartitionTransforms(PartitionSpec spec) { } /** - * A modified version of Spark's LookupCatalog.CatalogAndIdentifier.unapply - * Attempts to find the catalog and identifier a multipart identifier represents + * A modified version of Spark's LookupCatalog.CatalogAndIdentifier.unapply Attempts to find the + * catalog and identifier a multipart identifier represents + * * @param nameParts Multipart identifier representing a table * @return The CatalogPlugin and Identifier for the table */ - public static Pair catalogAndIdentifier(List nameParts, - Function catalogProvider, - BiFunction identiferProvider, - C currentCatalog, - String[] currentNamespace) { - Preconditions.checkArgument(!nameParts.isEmpty(), - "Cannot determine catalog and identifier from empty name"); + public static Pair catalogAndIdentifier( + List nameParts, + Function catalogProvider, + BiFunction identiferProvider, + C currentCatalog, + String[] currentNamespace) { + Preconditions.checkArgument( + !nameParts.isEmpty(), "Cannot determine catalog and identifier from empty name"); int lastElementIndex = nameParts.size() - 1; String name = nameParts.get(lastElementIndex); @@ -110,7 +119,7 @@ public static Pair catalogAndIdentifier(List nameParts, C catalog = catalogProvider.apply(nameParts.get(0)); if (catalog == null) { // The first element was not a valid catalog, treat it like part of the namespace - String[] namespace = nameParts.subList(0, lastElementIndex).toArray(new String[0]); + String[] namespace = nameParts.subList(0, lastElementIndex).toArray(new String[0]); return Pair.of(currentCatalog, identiferProvider.apply(namespace, name)); } else { // Assume 
the first element is a valid catalog @@ -122,6 +131,7 @@ public static Pair catalogAndIdentifier(List nameParts, /** * Responsible for checking if the table schema has a timestamp without timezone column + * * @param schema table schema to check if it contains a timestamp without timezone column * @return boolean indicating if the schema passed in has a timestamp field without a timezone */ @@ -131,15 +141,17 @@ public static boolean hasTimestampWithoutZone(Schema schema) { /** * Checks whether timestamp types for new tables should be stored with timezone info. - *

    - * The default value is false and all timestamp fields are stored as {@link Types.TimestampType#withZone()}. - * If enabled, all timestamp fields in new tables will be stored as {@link Types.TimestampType#withoutZone()}. + * + *

    The default value is false and all timestamp fields are stored as {@link + * Types.TimestampType#withZone()}. If enabled, all timestamp fields in new tables will be stored + * as {@link Types.TimestampType#withoutZone()}. * * @param sessionConf a Spark runtime config * @return true if timestamp types for new tables should be stored with timezone info */ public static boolean useTimestampWithoutZoneInNewTables(RuntimeConfig sessionConf) { - String sessionConfValue = sessionConf.get(SparkSQLProperties.USE_TIMESTAMP_WITHOUT_TIME_ZONE_IN_NEW_TABLES, null); + String sessionConfValue = + sessionConf.get(SparkSQLProperties.USE_TIMESTAMP_WITHOUT_TIME_ZONE_IN_NEW_TABLES, null); if (sessionConfValue != null) { return Boolean.parseBoolean(sessionConfValue); } @@ -147,32 +159,40 @@ public static boolean useTimestampWithoutZoneInNewTables(RuntimeConfig sessionCo } /** - * Pulls any Catalog specific overrides for the Hadoop conf from the current SparkSession, which can be - * set via `spark.sql.catalog.$catalogName.hadoop.*` + * Pulls any Catalog specific overrides for the Hadoop conf from the current SparkSession, which + * can be set via `spark.sql.catalog.$catalogName.hadoop.*` * - * Mirrors the override of hadoop configurations for a given spark session using `spark.hadoop.*`. + *

    Mirrors the override of hadoop configurations for a given spark session using + * `spark.hadoop.*`. * - * The SparkCatalog allows for hadoop configurations to be overridden per catalog, by setting + *

    The SparkCatalog allows for hadoop configurations to be overridden per catalog, by setting * them on the SQLConf, where the following will add the property "fs.default.name" with value - * "hdfs://hanksnamenode:8020" to the catalog's hadoop configuration. - * SparkSession.builder() - * .config(s"spark.sql.catalog.$catalogName.hadoop.fs.default.name", "hdfs://hanksnamenode:8020") - * .getOrCreate() + * "hdfs://hanksnamenode:8020" to the catalog's hadoop configuration. SparkSession.builder() + * .config(s"spark.sql.catalog.$catalogName.hadoop.fs.default.name", "hdfs://hanksnamenode:8020") + * .getOrCreate() + * * @param spark The current Spark session * @param catalogName Name of the catalog to find overrides for. - * @return the Hadoop Configuration that should be used for this catalog, with catalog specific overrides applied. + * @return the Hadoop Configuration that should be used for this catalog, with catalog specific + * overrides applied. */ public static Configuration hadoopConfCatalogOverrides(SparkSession spark, String catalogName) { // Find keys for the catalog intended to be hadoop configurations final String hadoopConfCatalogPrefix = hadoopConfPrefixForCatalog(catalogName); final Configuration conf = spark.sessionState().newHadoopConf(); - spark.sqlContext().conf().settings().forEach((k, v) -> { - // These checks are copied from `spark.sessionState().newHadoopConfWithOptions()`, which we - // avoid using to not have to convert back and forth between scala / java map types. - if (v != null && k != null && k.startsWith(hadoopConfCatalogPrefix)) { - conf.set(k.substring(hadoopConfCatalogPrefix.length()), v); - } - }); + spark + .sqlContext() + .conf() + .settings() + .forEach( + (k, v) -> { + // These checks are copied from `spark.sessionState().newHadoopConfWithOptions()`, + // which we + // avoid using to not have to convert back and forth between scala / java map types. + if (v != null && k != null && k.startsWith(hadoopConfCatalogPrefix)) { + conf.set(k.substring(hadoopConfCatalogPrefix.length()), v); + } + }); return conf; } diff --git a/spark/v2.4/spark/src/main/java/org/apache/iceberg/spark/SparkValueConverter.java b/spark/v2.4/spark/src/main/java/org/apache/iceberg/spark/SparkValueConverter.java index 150ef9ad21f1..741ef00619ea 100644 --- a/spark/v2.4/spark/src/main/java/org/apache/iceberg/spark/SparkValueConverter.java +++ b/spark/v2.4/spark/src/main/java/org/apache/iceberg/spark/SparkValueConverter.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark; import java.nio.ByteBuffer; @@ -34,13 +33,10 @@ import org.apache.spark.sql.Row; import org.apache.spark.sql.catalyst.util.DateTimeUtils; -/** - * A utility class that converts Spark values to Iceberg's internal representation. - */ +/** A utility class that converts Spark values to Iceberg's internal representation. 
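The catalog-scoped Hadoop override described in the SparkUtil javadoc above can be exercised roughly as follows; the catalog name my_catalog and the namenode address are placeholders:

import org.apache.hadoop.conf.Configuration;
import org.apache.iceberg.spark.SparkUtil;
import org.apache.spark.sql.SparkSession;

public class CatalogConfOverrideExample {
  public static void main(String[] args) {
    SparkSession spark =
        SparkSession.builder()
            .master("local[2]")
            // Applies only to tables loaded through the catalog named my_catalog.
            .config("spark.sql.catalog.my_catalog.hadoop.fs.default.name", "hdfs://namenode:8020")
            .getOrCreate();

    // Copies every spark.sql.catalog.my_catalog.hadoop.* entry into a Hadoop Configuration
    // with the prefix stripped, on top of the session's base Hadoop conf.
    Configuration conf = SparkUtil.hadoopConfCatalogOverrides(spark, "my_catalog");
    System.out.println(conf.get("fs.default.name"));
  }
}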
*/ public class SparkValueConverter { - private SparkValueConverter() { - } + private SparkValueConverter() {} public static Record convert(Schema schema, Row row) { return convert(schema.asStruct(), row); diff --git a/spark/v2.4/spark/src/main/java/org/apache/iceberg/spark/SparkWriteConf.java b/spark/v2.4/spark/src/main/java/org/apache/iceberg/spark/SparkWriteConf.java index 756f4197b736..08b3fbee7590 100644 --- a/spark/v2.4/spark/src/main/java/org/apache/iceberg/spark/SparkWriteConf.java +++ b/spark/v2.4/spark/src/main/java/org/apache/iceberg/spark/SparkWriteConf.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark; import java.util.Locale; @@ -31,18 +30,21 @@ /** * A class for common Iceberg configs for Spark writes. - *

    - * If a config is set at multiple levels, the following order of precedence is used (top to bottom): + * + *

    If a config is set at multiple levels, the following order of precedence is used (top to + * bottom): + * *

      - *
    1. Write options
    2. - *
    3. Session configuration
    4. - *
    5. Table metadata
    6. + *
    7. Write options + *
    8. Session configuration + *
    9. Table metadata *
    - * The most specific value is set in write options and takes precedence over all other configs. - * If no write option is provided, this class checks the session configuration for any overrides. - * If no applicable value is found in the session configuration, this class uses the table metadata. - *

    - * Note this class is NOT meant to be serialized and sent to executors. + * + * The most specific value is set in write options and takes precedence over all other configs. If + * no write option is provided, this class checks the session configuration for any overrides. If no + * applicable value is found in the session configuration, this class uses the table metadata. + * + *

    Note this class is NOT meant to be serialized and sent to executors. */ public class SparkWriteConf { @@ -57,7 +59,8 @@ public SparkWriteConf(SparkSession spark, Table table, Map write } public boolean checkNullability() { - return confParser.booleanConf() + return confParser + .booleanConf() .option(SparkWriteOptions.CHECK_NULLABILITY) .sessionConf(SparkSQLProperties.CHECK_NULLABILITY) .defaultValue(SparkSQLProperties.CHECK_NULLABILITY_DEFAULT) @@ -65,7 +68,8 @@ public boolean checkNullability() { } public boolean checkOrdering() { - return confParser.booleanConf() + return confParser + .booleanConf() .option(SparkWriteOptions.CHECK_ORDERING) .sessionConf(SparkSQLProperties.CHECK_ORDERING) .defaultValue(SparkSQLProperties.CHECK_ORDERING_DEFAULT) @@ -74,18 +78,20 @@ public boolean checkOrdering() { /** * Enables writing a timestamp with time zone as a timestamp without time zone. - *

    - * Generally, this is not safe as a timestamp without time zone is supposed to represent the wall-clock time, - * i.e. no matter the reader/writer timezone 3PM should always be read as 3PM, - * but a timestamp with time zone represents instant semantics, i.e. the timestamp - * is adjusted so that the corresponding time in the reader timezone is displayed. - *

    - * When set to false (default), an exception must be thrown if the table contains a timestamp without time zone. + * + *

    Generally, this is not safe as a timestamp without time zone is supposed to represent the + * wall-clock time, i.e. no matter the reader/writer timezone 3PM should always be read as 3PM, + * but a timestamp with time zone represents instant semantics, i.e. the timestamp is adjusted so + * that the corresponding time in the reader timezone is displayed. + * + *

    When set to false (default), an exception must be thrown if the table contains a timestamp + * without time zone. * * @return boolean indicating if writing timestamps without timezone is allowed */ public boolean handleTimestampWithoutZone() { - return confParser.booleanConf() + return confParser + .booleanConf() .option(SparkWriteOptions.HANDLE_TIMESTAMP_WITHOUT_TIMEZONE) .sessionConf(SparkSQLProperties.HANDLE_TIMESTAMP_WITHOUT_TIMEZONE) .defaultValue(SparkSQLProperties.HANDLE_TIMESTAMP_WITHOUT_TIMEZONE_DEFAULT) @@ -102,16 +108,19 @@ public String wapId() { } public FileFormat dataFileFormat() { - String valueAsString = confParser.stringConf() - .option(SparkWriteOptions.WRITE_FORMAT) - .tableProperty(TableProperties.DEFAULT_FILE_FORMAT) - .defaultValue(TableProperties.DEFAULT_FILE_FORMAT_DEFAULT) - .parse(); + String valueAsString = + confParser + .stringConf() + .option(SparkWriteOptions.WRITE_FORMAT) + .tableProperty(TableProperties.DEFAULT_FILE_FORMAT) + .defaultValue(TableProperties.DEFAULT_FILE_FORMAT_DEFAULT) + .parse(); return FileFormat.valueOf(valueAsString.toUpperCase(Locale.ENGLISH)); } public long targetDataFileSize() { - return confParser.longConf() + return confParser + .longConf() .option(SparkWriteOptions.TARGET_FILE_SIZE_BYTES) .tableProperty(TableProperties.WRITE_TARGET_FILE_SIZE_BYTES) .defaultValue(TableProperties.WRITE_TARGET_FILE_SIZE_BYTES_DEFAULT) @@ -119,7 +128,8 @@ public long targetDataFileSize() { } public boolean fanoutWriterEnabled() { - return confParser.booleanConf() + return confParser + .booleanConf() .option(SparkWriteOptions.FANOUT_ENABLED) .tableProperty(TableProperties.SPARK_WRITE_PARTITIONED_FANOUT_ENABLED) .defaultValue(TableProperties.SPARK_WRITE_PARTITIONED_FANOUT_ENABLED_DEFAULT) @@ -129,11 +139,13 @@ public boolean fanoutWriterEnabled() { public Map extraSnapshotMetadata() { Map extraSnapshotMetadata = Maps.newHashMap(); - writeOptions.forEach((key, value) -> { - if (key.startsWith(SnapshotSummary.EXTRA_METADATA_PREFIX)) { - extraSnapshotMetadata.put(key.substring(SnapshotSummary.EXTRA_METADATA_PREFIX.length()), value); - } - }); + writeOptions.forEach( + (key, value) -> { + if (key.startsWith(SnapshotSummary.EXTRA_METADATA_PREFIX)) { + extraSnapshotMetadata.put( + key.substring(SnapshotSummary.EXTRA_METADATA_PREFIX.length()), value); + } + }); return extraSnapshotMetadata; } diff --git a/spark/v2.4/spark/src/main/java/org/apache/iceberg/spark/SparkWriteOptions.java b/spark/v2.4/spark/src/main/java/org/apache/iceberg/spark/SparkWriteOptions.java index 38574d364b20..0ba435ae7429 100644 --- a/spark/v2.4/spark/src/main/java/org/apache/iceberg/spark/SparkWriteOptions.java +++ b/spark/v2.4/spark/src/main/java/org/apache/iceberg/spark/SparkWriteOptions.java @@ -16,16 +16,12 @@ * specific language governing permissions and limitations * under the License. 
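A short sketch of the write-option precedence that SparkWriteConf and SparkWriteOptions implement; the table path is a placeholder and the option keys are the string constants shown above:

import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SaveMode;

public class WriteOptionsExample {
  // Appends df to an existing Iceberg table at a Hadoop path (placeholder), overriding the
  // table's write.format.default for this write only.
  public static void append(Dataset<Row> df) {
    df.write()
        .format("iceberg")
        .mode(SaveMode.Append)
        .option("write-format", "avro") // SparkWriteOptions.WRITE_FORMAT
        .option("handle-timestamp-without-timezone", "true") // see handleTimestampWithoutZone()
        .save("hdfs://namenode:8020/warehouse/db/events");
  }
}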
*/ - package org.apache.iceberg.spark; -/** - * Spark DF write options - */ +/** Spark DF write options */ public class SparkWriteOptions { - private SparkWriteOptions() { - } + private SparkWriteOptions() {} // Fileformat for write operations(default: Table write.format.default ) public static final String WRITE_FORMAT = "write-format"; @@ -52,5 +48,6 @@ private SparkWriteOptions() { public static final String REWRITTEN_FILE_SCAN_TASK_SET_ID = "rewritten-file-scan-task-set-id"; // Controls whether to allow writing timestamps without zone info - public static final String HANDLE_TIMESTAMP_WITHOUT_TIMEZONE = "handle-timestamp-without-timezone"; + public static final String HANDLE_TIMESTAMP_WITHOUT_TIMEZONE = + "handle-timestamp-without-timezone"; } diff --git a/spark/v2.4/spark/src/main/java/org/apache/iceberg/spark/TypeToSparkType.java b/spark/v2.4/spark/src/main/java/org/apache/iceberg/spark/TypeToSparkType.java index 6a8be60eb078..1e4b0f2f4e3d 100644 --- a/spark/v2.4/spark/src/main/java/org/apache/iceberg/spark/TypeToSparkType.java +++ b/spark/v2.4/spark/src/main/java/org/apache/iceberg/spark/TypeToSparkType.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark; import java.util.List; @@ -43,8 +42,7 @@ import org.apache.spark.sql.types.TimestampType$; class TypeToSparkType extends TypeUtil.SchemaVisitor { - TypeToSparkType() { - } + TypeToSparkType() {} @Override public DataType schema(Schema schema, DataType structType) { @@ -59,8 +57,8 @@ public DataType struct(Types.StructType struct, List fieldResults) { for (int i = 0; i < fields.size(); i += 1) { Types.NestedField field = fields.get(i); DataType type = fieldResults.get(i); - StructField sparkField = StructField.apply( - field.name(), type, field.isOptional(), Metadata.empty()); + StructField sparkField = + StructField.apply(field.name(), type, field.isOptional(), Metadata.empty()); if (field.doc() != null) { sparkField = sparkField.withComment(field.doc()); } @@ -101,8 +99,7 @@ public DataType primitive(Type.PrimitiveType primitive) { case DATE: return DateType$.MODULE$; case TIME: - throw new UnsupportedOperationException( - "Spark does not support time fields"); + throw new UnsupportedOperationException("Spark does not support time fields"); case TIMESTAMP: return TimestampType$.MODULE$; case STRING: diff --git a/spark/v2.4/spark/src/main/java/org/apache/iceberg/spark/actions/BaseDeleteOrphanFilesSparkAction.java b/spark/v2.4/spark/src/main/java/org/apache/iceberg/spark/actions/BaseDeleteOrphanFilesSparkAction.java index e316dfb81c11..a79f075ef442 100644 --- a/spark/v2.4/spark/src/main/java/org/apache/iceberg/spark/actions/BaseDeleteOrphanFilesSparkAction.java +++ b/spark/v2.4/spark/src/main/java/org/apache/iceberg/spark/actions/BaseDeleteOrphanFilesSparkAction.java @@ -16,9 +16,11 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.spark.actions; +import static org.apache.iceberg.TableProperties.GC_ENABLED; +import static org.apache.iceberg.TableProperties.GC_ENABLED_DEFAULT; + import java.io.File; import java.io.IOException; import java.util.Iterator; @@ -57,35 +59,37 @@ import org.slf4j.Logger; import org.slf4j.LoggerFactory; -import static org.apache.iceberg.TableProperties.GC_ENABLED; -import static org.apache.iceberg.TableProperties.GC_ENABLED_DEFAULT; - /** * An action that removes orphan metadata and data files by listing a given location and comparing * the actual files in that location with data and metadata files referenced by all valid snapshots. * The location must be accessible for listing via the Hadoop {@link FileSystem}. - *

    - * By default, this action cleans up the table location returned by {@link Table#location()} and - * removes unreachable files that are older than 3 days using {@link Table#io()}. The behavior can be modified - * by passing a custom location to {@link #location} and a custom timestamp to {@link #olderThan(long)}. - * For example, someone might point this action to the data folder to clean up only orphan data files. - * In addition, there is a way to configure an alternative delete method via {@link #deleteWith(Consumer)}. - *

    - * Note: It is dangerous to call this action with a short retention interval as it might corrupt - * the state of the table if another operation is writing at the same time. + * + *

    By default, this action cleans up the table location returned by {@link Table#location()} and + * removes unreachable files that are older than 3 days using {@link Table#io()}. The behavior can + * be modified by passing a custom location to {@link #location} and a custom timestamp to {@link + * #olderThan(long)}. For example, someone might point this action to the data folder to clean up + * only orphan data files. In addition, there is a way to configure an alternative delete method via + * {@link #deleteWith(Consumer)}. + * + *

    Note: It is dangerous to call this action with a short retention interval as it might + * corrupt the state of the table if another operation is writing at the same time. */ public class BaseDeleteOrphanFilesSparkAction - extends BaseSparkAction implements DeleteOrphanFiles { + extends BaseSparkAction + implements DeleteOrphanFiles { private static final Logger LOG = LoggerFactory.getLogger(BaseDeleteOrphanFilesSparkAction.class); - private static final UserDefinedFunction filenameUDF = functions.udf((String path) -> { - int lastIndex = path.lastIndexOf(File.separator); - if (lastIndex == -1) { - return path; - } else { - return path.substring(lastIndex + 1); - } - }, DataTypes.StringType); + private static final UserDefinedFunction filenameUDF = + functions.udf( + (String path) -> { + int lastIndex = path.lastIndexOf(File.separator); + if (lastIndex == -1) { + return path; + } else { + return path.substring(lastIndex + 1); + } + }, + DataTypes.StringType); private static final ExecutorService DEFAULT_DELETE_EXECUTOR_SERVICE = null; @@ -95,12 +99,13 @@ public class BaseDeleteOrphanFilesSparkAction private String location = null; private long olderThanTimestamp = System.currentTimeMillis() - TimeUnit.DAYS.toMillis(3); - private Consumer deleteFunc = new Consumer() { - @Override - public void accept(String file) { - table.io().deleteFile(file); - } - }; + private Consumer deleteFunc = + new Consumer() { + @Override + public void accept(String file) { + table.io().deleteFile(file); + } + }; private ExecutorService deleteExecutorService = DEFAULT_DELETE_EXECUTOR_SERVICE; @@ -108,7 +113,8 @@ public BaseDeleteOrphanFilesSparkAction(SparkSession spark, Table table) { super(spark); this.hadoopConf = new SerializableConfiguration(spark.sessionState().newHadoopConf()); - this.partitionDiscoveryParallelism = spark.sessionState().conf().parallelPartitionDiscoveryParallelism(); + this.partitionDiscoveryParallelism = + spark.sessionState().conf().parallelPartitionDiscoveryParallelism(); this.table = table; this.location = table.location(); @@ -158,7 +164,8 @@ private String jobDesc() { if (location != null) { options.add("location=" + location); } - return String.format("Removing orphan files (%s) from %s", Joiner.on(',').join(options), table.name()); + return String.format( + "Removing orphan files (%s) from %s", Joiner.on(',').join(options), table.name()); } private DeleteOrphanFiles.Result doExecute() { @@ -172,9 +179,8 @@ private DeleteOrphanFiles.Result doExecute() { Column nameEqual = actualFileName.equalTo(validFileName); Column actualContains = actualFileDF.col("file_path").contains(validFileDF.col("file_path")); Column joinCond = nameEqual.and(actualContains); - List orphanFiles = actualFileDF.join(validFileDF, joinCond, "leftanti") - .as(Encoders.STRING()) - .collectAsList(); + List orphanFiles = + actualFileDF.join(validFileDF, joinCond, "leftanti").as(Encoders.STRING()).collectAsList(); Tasks.foreach(orphanFiles) .noRetry() @@ -205,15 +211,23 @@ private Dataset buildActualFileDF() { JavaRDD subDirRDD = sparkContext().parallelize(subDirs, parallelism); Broadcast conf = sparkContext().broadcast(hadoopConf); - JavaRDD matchingLeafFileRDD = subDirRDD.mapPartitions(listDirsRecursively(conf, olderThanTimestamp)); + JavaRDD matchingLeafFileRDD = + subDirRDD.mapPartitions(listDirsRecursively(conf, olderThanTimestamp)); JavaRDD completeMatchingFileRDD = matchingFileRDD.union(matchingLeafFileRDD); - return spark().createDataset(completeMatchingFileRDD.rdd(), Encoders.STRING()).toDF("file_path"); + 
return spark() + .createDataset(completeMatchingFileRDD.rdd(), Encoders.STRING()) + .toDF("file_path"); } private static void listDirRecursively( - String dir, Predicate predicate, Configuration conf, int maxDepth, - int maxDirectSubDirs, List remainingSubDirs, List matchingFiles) { + String dir, + Predicate predicate, + Configuration conf, + int maxDepth, + int maxDirectSubDirs, + List remainingSubDirs, + List matchingFiles) { // stop listing whenever we reach the max depth if (maxDepth <= 0) { @@ -242,7 +256,14 @@ private static void listDirRecursively( } for (String subDir : subDirs) { - listDirRecursively(subDir, predicate, conf, maxDepth - 1, maxDirectSubDirs, remainingSubDirs, matchingFiles); + listDirRecursively( + subDir, + predicate, + conf, + maxDepth - 1, + maxDirectSubDirs, + remainingSubDirs, + matchingFiles); } } catch (IOException e) { throw new RuntimeIOException(e); @@ -250,8 +271,7 @@ private static void listDirRecursively( } private static FlatMapFunction, String> listDirsRecursively( - Broadcast conf, - long olderThanTimestamp) { + Broadcast conf, long olderThanTimestamp) { return dirs -> { List subDirs = Lists.newArrayList(); @@ -262,12 +282,15 @@ private static FlatMapFunction, String> listDirsRecursively( int maxDepth = 2000; int maxDirectSubDirs = Integer.MAX_VALUE; - dirs.forEachRemaining(dir -> { - listDirRecursively(dir, predicate, conf.value().value(), maxDepth, maxDirectSubDirs, subDirs, files); - }); + dirs.forEachRemaining( + dir -> { + listDirRecursively( + dir, predicate, conf.value().value(), maxDepth, maxDirectSubDirs, subDirs, files); + }); if (!subDirs.isEmpty()) { - throw new RuntimeException("Could not list subdirectories, reached maximum subdirectory depth: " + maxDepth); + throw new RuntimeException( + "Could not list subdirectories, reached maximum subdirectory depth: " + maxDepth); } return files.iterator(); diff --git a/spark/v2.4/spark/src/main/java/org/apache/iceberg/spark/actions/BaseDeleteReachableFilesSparkAction.java b/spark/v2.4/spark/src/main/java/org/apache/iceberg/spark/actions/BaseDeleteReachableFilesSparkAction.java index 6534617d2dec..1431ae5d78ec 100644 --- a/spark/v2.4/spark/src/main/java/org/apache/iceberg/spark/actions/BaseDeleteReachableFilesSparkAction.java +++ b/spark/v2.4/spark/src/main/java/org/apache/iceberg/spark/actions/BaseDeleteReachableFilesSparkAction.java @@ -16,9 +16,11 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.actions; +import static org.apache.iceberg.TableProperties.GC_ENABLED; +import static org.apache.iceberg.TableProperties.GC_ENABLED_DEFAULT; + import java.util.Iterator; import java.util.List; import java.util.concurrent.ExecutorService; @@ -47,17 +49,16 @@ import org.slf4j.Logger; import org.slf4j.LoggerFactory; -import static org.apache.iceberg.TableProperties.GC_ENABLED; -import static org.apache.iceberg.TableProperties.GC_ENABLED_DEFAULT; - /** - * An implementation of {@link DeleteReachableFiles} that uses metadata tables in Spark - * to determine which files should be deleted. + * An implementation of {@link DeleteReachableFiles} that uses metadata tables in Spark to determine + * which files should be deleted. 
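A hedged sketch of driving the orphan-file cleanup above through its public constructor (an entry point such as SparkActions may also be available); the seven-day retention window is illustrative:

import java.util.concurrent.TimeUnit;
import org.apache.iceberg.Table;
import org.apache.iceberg.actions.DeleteOrphanFiles;
import org.apache.iceberg.spark.actions.BaseDeleteOrphanFilesSparkAction;
import org.apache.spark.sql.SparkSession;

public class OrphanCleanupExample {
  public static void clean(SparkSession spark, Table table) {
    // Lists the table location, anti-joins against files referenced by valid snapshots,
    // and deletes anything unreferenced that is older than the retention window.
    DeleteOrphanFiles.Result result =
        new BaseDeleteOrphanFilesSparkAction(spark, table)
            .olderThan(System.currentTimeMillis() - TimeUnit.DAYS.toMillis(7))
            .execute();
    result.orphanFileLocations().forEach(f -> System.out.println("Removed: " + f));
  }
}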
*/ @SuppressWarnings("UnnecessaryAnonymousClass") public class BaseDeleteReachableFilesSparkAction - extends BaseSparkAction implements DeleteReachableFiles { - private static final Logger LOG = LoggerFactory.getLogger(BaseDeleteReachableFilesSparkAction.class); + extends BaseSparkAction + implements DeleteReachableFiles { + private static final Logger LOG = + LoggerFactory.getLogger(BaseDeleteReachableFilesSparkAction.class); private static final String DATA_FILE = "Data File"; private static final String MANIFEST = "Manifest"; @@ -71,12 +72,13 @@ public class BaseDeleteReachableFilesSparkAction private final TableMetadata tableMetadata; - private final Consumer defaultDelete = new Consumer() { - @Override - public void accept(String file) { - io.deleteFile(file); - } - }; + private final Consumer defaultDelete = + new Consumer() { + @Override + public void accept(String file) { + io.deleteFile(file); + } + }; private Consumer removeFunc = defaultDelete; private ExecutorService removeExecutorService = DEFAULT_DELETE_EXECUTOR_SERVICE; @@ -105,7 +107,6 @@ public DeleteReachableFiles io(FileIO fileIO) { public DeleteReachableFiles deleteWith(Consumer deleteFunc) { this.removeFunc = deleteFunc; return this; - } @Override @@ -117,7 +118,8 @@ public DeleteReachableFiles executeDeleteWith(ExecutorService executorService) { @Override public Result execute() { Preconditions.checkArgument(io != null, "File IO cannot be null"); - String msg = String.format("Removing files reachable from %s", tableMetadata.metadataFileLocation()); + String msg = + String.format("Removing files reachable from %s", tableMetadata.metadataFileLocation()); JobGroupInfo info = newJobGroupInfo("REMOVE-FILES", msg); return withJobGroupInfo(info, this::doExecute); } @@ -165,40 +167,45 @@ private BaseDeleteReachableFilesActionResult deleteFiles(Iterator deleted) AtomicLong otherFilesCount = new AtomicLong(0L); Tasks.foreach(deleted) - .retry(3).stopRetryOn(NotFoundException.class).suppressFailureWhenFinished() + .retry(3) + .stopRetryOn(NotFoundException.class) + .suppressFailureWhenFinished() .executeWith(removeExecutorService) - .onFailure((fileInfo, exc) -> { - String file = fileInfo.getString(0); - String type = fileInfo.getString(1); - LOG.warn("Delete failed for {}: {}", type, file, exc); - }) - .run(fileInfo -> { - String file = fileInfo.getString(0); - String type = fileInfo.getString(1); - removeFunc.accept(file); - switch (type) { - case DATA_FILE: - dataFileCount.incrementAndGet(); - LOG.trace("Deleted Data File: {}", file); - break; - case MANIFEST: - manifestCount.incrementAndGet(); - LOG.debug("Deleted Manifest: {}", file); - break; - case MANIFEST_LIST: - manifestListCount.incrementAndGet(); - LOG.debug("Deleted Manifest List: {}", file); - break; - case OTHERS: - otherFilesCount.incrementAndGet(); - LOG.debug("Others: {}", file); - break; - } - }); - - long filesCount = dataFileCount.get() + manifestCount.get() + manifestListCount.get() + otherFilesCount.get(); + .onFailure( + (fileInfo, exc) -> { + String file = fileInfo.getString(0); + String type = fileInfo.getString(1); + LOG.warn("Delete failed for {}: {}", type, file, exc); + }) + .run( + fileInfo -> { + String file = fileInfo.getString(0); + String type = fileInfo.getString(1); + removeFunc.accept(file); + switch (type) { + case DATA_FILE: + dataFileCount.incrementAndGet(); + LOG.trace("Deleted Data File: {}", file); + break; + case MANIFEST: + manifestCount.incrementAndGet(); + LOG.debug("Deleted Manifest: {}", file); + break; + case MANIFEST_LIST: + 
manifestListCount.incrementAndGet(); + LOG.debug("Deleted Manifest List: {}", file); + break; + case OTHERS: + otherFilesCount.incrementAndGet(); + LOG.debug("Others: {}", file); + break; + } + }); + + long filesCount = + dataFileCount.get() + manifestCount.get() + manifestListCount.get() + otherFilesCount.get(); LOG.info("Total files removed: {}", filesCount); - return new BaseDeleteReachableFilesActionResult(dataFileCount.get(), manifestCount.get(), manifestListCount.get(), - otherFilesCount.get()); + return new BaseDeleteReachableFilesActionResult( + dataFileCount.get(), manifestCount.get(), manifestListCount.get(), otherFilesCount.get()); } } diff --git a/spark/v2.4/spark/src/main/java/org/apache/iceberg/spark/actions/BaseExpireSnapshotsSparkAction.java b/spark/v2.4/spark/src/main/java/org/apache/iceberg/spark/actions/BaseExpireSnapshotsSparkAction.java index 88589bca5cab..2e1f0c079eca 100644 --- a/spark/v2.4/spark/src/main/java/org/apache/iceberg/spark/actions/BaseExpireSnapshotsSparkAction.java +++ b/spark/v2.4/spark/src/main/java/org/apache/iceberg/spark/actions/BaseExpireSnapshotsSparkAction.java @@ -16,9 +16,11 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.actions; +import static org.apache.iceberg.TableProperties.GC_ENABLED; +import static org.apache.iceberg.TableProperties.GC_ENABLED_DEFAULT; + import java.util.Iterator; import java.util.List; import java.util.Set; @@ -48,22 +50,20 @@ import org.slf4j.Logger; import org.slf4j.LoggerFactory; -import static org.apache.iceberg.TableProperties.GC_ENABLED; -import static org.apache.iceberg.TableProperties.GC_ENABLED_DEFAULT; - /** - * An action that performs the same operation as {@link org.apache.iceberg.ExpireSnapshots} but uses Spark - * to determine the delta in files between the pre and post-expiration table metadata. All of the same - * restrictions of {@link org.apache.iceberg.ExpireSnapshots} also apply to this action. - *
<p>
    - * This action first leverages {@link org.apache.iceberg.ExpireSnapshots} to expire snapshots and then - * uses metadata tables to find files that can be safely deleted. This is done by anti-joining two Datasets - * that contain all manifest and data files before and after the expiration. The snapshot expiration - * will be fully committed before any deletes are issued. - * <p>
    - * This operation performs a shuffle so the parallelism can be controlled through 'spark.sql.shuffle.partitions'. - * <p>
    - * Deletes are still performed locally after retrieving the results from the Spark executors. + * An action that performs the same operation as {@link org.apache.iceberg.ExpireSnapshots} but uses + * Spark to determine the delta in files between the pre and post-expiration table metadata. All of + * the same restrictions of {@link org.apache.iceberg.ExpireSnapshots} also apply to this action. + * + * <p>This action first leverages {@link org.apache.iceberg.ExpireSnapshots} to expire snapshots and + * then uses metadata tables to find files that can be safely deleted. This is done by anti-joining + * two Datasets that contain all manifest and data files before and after the expiration. The + * snapshot expiration will be fully committed before any deletes are issued. + * + * <p>This operation performs a shuffle so the parallelism can be controlled through + * 'spark.sql.shuffle.partitions'. + * + * <p>
    Deletes are still performed locally after retrieving the results from the Spark executors. */ @SuppressWarnings("UnnecessaryAnonymousClass") public class BaseExpireSnapshotsSparkAction @@ -81,12 +81,13 @@ public class BaseExpireSnapshotsSparkAction private final Table table; private final TableOperations ops; - private final Consumer defaultDelete = new Consumer() { - @Override - public void accept(String file) { - ops.io().deleteFile(file); - } - }; + private final Consumer defaultDelete = + new Consumer() { + @Override + public void accept(String file) { + ops.io().deleteFile(file); + } + }; private final Set expiredSnapshotIds = Sets.newHashSet(); private Long expireOlderThanValue = null; @@ -130,8 +131,10 @@ public BaseExpireSnapshotsSparkAction expireOlderThan(long timestampMillis) { @Override public BaseExpireSnapshotsSparkAction retainLast(int numSnapshots) { - Preconditions.checkArgument(1 <= numSnapshots, - "Number of snapshots to retain must be at least 1, cannot be: %s", numSnapshots); + Preconditions.checkArgument( + 1 <= numSnapshots, + "Number of snapshots to retain must be at least 1, cannot be: %s", + numSnapshots); this.retainLastValue = numSnapshots; return this; } @@ -144,10 +147,11 @@ public BaseExpireSnapshotsSparkAction deleteWith(Consumer newDeleteFunc) /** * Expires snapshots and commits the changes to the table, returning a Dataset of files to delete. - *
<p>
    - * This does not delete data files. To delete data files, run {@link #execute()}. - * <p>
    - * This may be called before or after {@link #execute()} is called to return the expired file list. + * + * <p>This does not delete data files. To delete data files, run {@link #execute()}. + * + * <p>
    This may be called before or after {@link #execute()} is called to return the expired file + * list. * * @return a Dataset of files that are no longer referenced by the table */ @@ -157,7 +161,8 @@ public Dataset expire() { Dataset originalFiles = buildValidFileDF(ops.current()); // perform expiration - org.apache.iceberg.ExpireSnapshots expireSnapshots = table.expireSnapshots().cleanExpiredFiles(false); + org.apache.iceberg.ExpireSnapshots expireSnapshots = + table.expireSnapshots().cleanExpiredFiles(false); for (long id : expiredSnapshotIds) { expireSnapshots = expireSnapshots.expireSnapshotId(id); } @@ -202,13 +207,15 @@ private String jobDesc() { if (!expiredSnapshotIds.isEmpty()) { Long first = expiredSnapshotIds.stream().findFirst().get(); if (expiredSnapshotIds.size() > 1) { - options.add(String.format("snapshot_ids: %s (%s more...)", first, expiredSnapshotIds.size() - 1)); + options.add( + String.format("snapshot_ids: %s (%s more...)", first, expiredSnapshotIds.size() - 1)); } else { options.add(String.format("snapshot_id: %s", first)); } } - return String.format("Expiring snapshots (%s) in %s", Joiner.on(',').join(options), table.name()); + return String.format( + "Expiring snapshots (%s) in %s", Joiner.on(',').join(options), table.name()); } private ExpireSnapshots.Result doExecute() { @@ -243,34 +250,41 @@ private BaseExpireSnapshotsActionResult deleteFiles(Iterator expired) { AtomicLong manifestListCount = new AtomicLong(0L); Tasks.foreach(expired) - .retry(3).stopRetryOn(NotFoundException.class).suppressFailureWhenFinished() + .retry(3) + .stopRetryOn(NotFoundException.class) + .suppressFailureWhenFinished() .executeWith(deleteExecutorService) - .onFailure((fileInfo, exc) -> { - String file = fileInfo.getString(0); - String type = fileInfo.getString(1); - LOG.warn("Delete failed for {}: {}", type, file, exc); - }) - .run(fileInfo -> { - String file = fileInfo.getString(0); - String type = fileInfo.getString(1); - deleteFunc.accept(file); - switch (type) { - case DATA_FILE: - dataFileCount.incrementAndGet(); - LOG.trace("Deleted Data File: {}", file); - break; - case MANIFEST: - manifestCount.incrementAndGet(); - LOG.debug("Deleted Manifest: {}", file); - break; - case MANIFEST_LIST: - manifestListCount.incrementAndGet(); - LOG.debug("Deleted Manifest List: {}", file); - break; - } - }); - - LOG.info("Deleted {} total files", dataFileCount.get() + manifestCount.get() + manifestListCount.get()); - return new BaseExpireSnapshotsActionResult(dataFileCount.get(), manifestCount.get(), manifestListCount.get()); + .onFailure( + (fileInfo, exc) -> { + String file = fileInfo.getString(0); + String type = fileInfo.getString(1); + LOG.warn("Delete failed for {}: {}", type, file, exc); + }) + .run( + fileInfo -> { + String file = fileInfo.getString(0); + String type = fileInfo.getString(1); + deleteFunc.accept(file); + switch (type) { + case DATA_FILE: + dataFileCount.incrementAndGet(); + LOG.trace("Deleted Data File: {}", file); + break; + case MANIFEST: + manifestCount.incrementAndGet(); + LOG.debug("Deleted Manifest: {}", file); + break; + case MANIFEST_LIST: + manifestListCount.incrementAndGet(); + LOG.debug("Deleted Manifest List: {}", file); + break; + } + }); + + LOG.info( + "Deleted {} total files", + dataFileCount.get() + manifestCount.get() + manifestListCount.get()); + return new BaseExpireSnapshotsActionResult( + dataFileCount.get(), manifestCount.get(), manifestListCount.get()); } } diff --git 
a/spark/v2.4/spark/src/main/java/org/apache/iceberg/spark/actions/BaseRewriteManifestsSparkAction.java b/spark/v2.4/spark/src/main/java/org/apache/iceberg/spark/actions/BaseRewriteManifestsSparkAction.java index c446d42ca062..b0d8fcf74809 100644 --- a/spark/v2.4/spark/src/main/java/org/apache/iceberg/spark/actions/BaseRewriteManifestsSparkAction.java +++ b/spark/v2.4/spark/src/main/java/org/apache/iceberg/spark/actions/BaseRewriteManifestsSparkAction.java @@ -16,9 +16,10 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.actions; +import static org.apache.iceberg.MetadataTableType.ENTRIES; + import java.io.IOException; import java.util.Collections; import java.util.List; @@ -68,15 +69,13 @@ import org.slf4j.Logger; import org.slf4j.LoggerFactory; -import static org.apache.iceberg.MetadataTableType.ENTRIES; - /** * An action that rewrites manifests in a distributed manner and co-locates metadata for partitions. - *
<p>
    - * By default, this action rewrites all manifests for the current partition spec and writes the result - * to the metadata folder. The behavior can be modified by passing a custom predicate to {@link #rewriteIf(Predicate)} - * and a custom spec id to {@link #specId(int)}. In addition, there is a way to configure a custom location - * for new manifests via {@link #stagingLocation}. + * + * <p>
    By default, this action rewrites all manifests for the current partition spec and writes the + * result to the metadata folder. The behavior can be modified by passing a custom predicate to + * {@link #rewriteIf(Predicate)} and a custom spec id to {@link #specId(int)}. In addition, there is + * a way to configure a custom location for new manifests via {@link #stagingLocation}. */ public class BaseRewriteManifestsSparkAction extends BaseSnapshotUpdateSparkAction @@ -102,10 +101,11 @@ public BaseRewriteManifestsSparkAction(SparkSession spark, Table table) { this.manifestEncoder = Encoders.javaSerialization(ManifestFile.class); this.table = table; this.spec = table.spec(); - this.targetManifestSizeBytes = PropertyUtil.propertyAsLong( - table.properties(), - TableProperties.MANIFEST_TARGET_SIZE_BYTES, - TableProperties.MANIFEST_TARGET_SIZE_BYTES_DEFAULT); + this.targetManifestSizeBytes = + PropertyUtil.propertyAsLong( + table.properties(), + TableProperties.MANIFEST_TARGET_SIZE_BYTES, + TableProperties.MANIFEST_TARGET_SIZE_BYTES_DEFAULT); this.fileIO = SparkUtil.serializableFileIO(table); // default the staging location to the metadata location @@ -143,7 +143,9 @@ public RewriteManifests stagingLocation(String newStagingLocation) { @Override public RewriteManifests.Result execute() { - String desc = String.format("Rewriting manifests (staging location=%s) of %s", stagingLocation, table.name()); + String desc = + String.format( + "Rewriting manifests (staging location=%s) of %s", stagingLocation, table.name()); JobGroupInfo info = newJobGroupInfo("REWRITE-MANIFESTS", desc); return withJobGroupInfo(info, this::doExecute); } @@ -158,10 +160,12 @@ private RewriteManifests.Result doExecute() { int numEntries = 0; for (ManifestFile manifest : matchingManifests) { - ValidationException.check(hasFileCounts(manifest), "No file counts in manifest: %s", manifest.path()); + ValidationException.check( + hasFileCounts(manifest), "No file counts in manifest: %s", manifest.path()); totalSizeBytes += manifest.length(); - numEntries += manifest.addedFilesCount() + manifest.existingFilesCount() + manifest.deletedFilesCount(); + numEntries += + manifest.addedFilesCount() + manifest.existingFilesCount() + manifest.deletedFilesCount(); } int targetNumManifests = targetNumManifests(totalSizeBytes); @@ -173,7 +177,9 @@ private RewriteManifests.Result doExecute() { if (spec.fields().size() < 1) { newManifests = writeManifestsForUnpartitionedTable(manifestEntryDF, targetNumManifests); } else { - newManifests = writeManifestsForPartitionedTable(manifestEntryDF, targetNumManifests, targetNumManifestEntries); + newManifests = + writeManifestsForPartitionedTable( + manifestEntryDF, targetNumManifests, targetNumManifestEntries); } replaceManifests(matchingManifests, newManifests); @@ -182,13 +188,16 @@ private RewriteManifests.Result doExecute() { } private Dataset buildManifestEntryDF(List manifests) { - Dataset manifestDF = spark() - .createDataset(Lists.transform(manifests, ManifestFile::path), Encoders.STRING()) - .toDF("manifest"); + Dataset manifestDF = + spark() + .createDataset(Lists.transform(manifests, ManifestFile::path), Encoders.STRING()) + .toDF("manifest"); - Dataset manifestEntryDF = loadMetadataTable(table, ENTRIES) - .filter("status < 2") // select only live entries - .selectExpr("input_file_name() as manifest", "snapshot_id", "sequence_number", "data_file"); + Dataset manifestEntryDF = + loadMetadataTable(table, ENTRIES) + .filter("status < 2") // select only live entries + .selectExpr( + 
"input_file_name() as manifest", "snapshot_id", "sequence_number", "data_file"); Column joinCond = manifestDF.col("manifest").equalTo(manifestEntryDF.col("manifest")); return manifestEntryDF @@ -196,7 +205,8 @@ private Dataset buildManifestEntryDF(List manifests) { .select("snapshot_id", "sequence_number", "data_file"); } - private List writeManifestsForUnpartitionedTable(Dataset manifestEntryDF, int numManifests) { + private List writeManifestsForUnpartitionedTable( + Dataset manifestEntryDF, int numManifests) { Broadcast io = sparkContext().broadcast(fileIO); StructType sparkType = (StructType) manifestEntryDF.schema().apply("data_file").dataType(); @@ -208,41 +218,44 @@ private List writeManifestsForUnpartitionedTable(Dataset mani .repartition(numManifests) .mapPartitions( toManifests(io, maxNumManifestEntries, stagingLocation, formatVersion, spec, sparkType), - manifestEncoder - ) + manifestEncoder) .collectAsList(); } private List writeManifestsForPartitionedTable( - Dataset manifestEntryDF, int numManifests, - int targetNumManifestEntries) { + Dataset manifestEntryDF, int numManifests, int targetNumManifestEntries) { Broadcast io = sparkContext().broadcast(fileIO); StructType sparkType = (StructType) manifestEntryDF.schema().apply("data_file").dataType(); - // we allow the actual size of manifests to be 10% higher if the estimation is not precise enough + // we allow the actual size of manifests to be 10% higher if the estimation is not precise + // enough long maxNumManifestEntries = (long) (1.1 * targetNumManifestEntries); - return withReusableDS(manifestEntryDF, df -> { - Column partitionColumn = df.col("data_file.partition"); - return df.repartitionByRange(numManifests, partitionColumn) - .sortWithinPartitions(partitionColumn) - .mapPartitions( - toManifests(io, maxNumManifestEntries, stagingLocation, formatVersion, spec, sparkType), - manifestEncoder - ) - .collectAsList(); - }); + return withReusableDS( + manifestEntryDF, + df -> { + Column partitionColumn = df.col("data_file.partition"); + return df.repartitionByRange(numManifests, partitionColumn) + .sortWithinPartitions(partitionColumn) + .mapPartitions( + toManifests( + io, maxNumManifestEntries, stagingLocation, formatVersion, spec, sparkType), + manifestEncoder) + .collectAsList(); + }); } private U withReusableDS(Dataset ds, Function, U> func) { Dataset reusableDS; - boolean useCaching = PropertyUtil.propertyAsBoolean(options(), USE_CACHING, USE_CACHING_DEFAULT); + boolean useCaching = + PropertyUtil.propertyAsBoolean(options(), USE_CACHING, USE_CACHING_DEFAULT); if (useCaching) { reusableDS = ds.cache(); } else { int parallelism = SQLConf.get().numShufflePartitions(); - reusableDS = ds.repartition(parallelism).map((MapFunction) value -> value, ds.exprEnc()); + reusableDS = + ds.repartition(parallelism).map((MapFunction) value -> value, ds.exprEnc()); } try { @@ -275,17 +288,19 @@ private int targetNumManifestEntries(int numEntries, int numManifests) { } private boolean hasFileCounts(ManifestFile manifest) { - return manifest.addedFilesCount() != null && - manifest.existingFilesCount() != null && - manifest.deletedFilesCount() != null; + return manifest.addedFilesCount() != null + && manifest.existingFilesCount() != null + && manifest.deletedFilesCount() != null; } - private void replaceManifests(Iterable deletedManifests, Iterable addedManifests) { + private void replaceManifests( + Iterable deletedManifests, Iterable addedManifests) { try { - boolean snapshotIdInheritanceEnabled = PropertyUtil.propertyAsBoolean( - 
table.properties(), - TableProperties.SNAPSHOT_ID_INHERITANCE_ENABLED, - TableProperties.SNAPSHOT_ID_INHERITANCE_ENABLED_DEFAULT); + boolean snapshotIdInheritanceEnabled = + PropertyUtil.propertyAsBoolean( + table.properties(), + TableProperties.SNAPSHOT_ID_INHERITANCE_ENABLED, + TableProperties.SNAPSHOT_ID_INHERITANCE_ENABLED_DEFAULT); org.apache.iceberg.RewriteManifests rewriteManifests = table.rewriteManifests(); deletedManifests.forEach(rewriteManifests::deleteManifest); @@ -315,12 +330,20 @@ private void deleteFiles(Iterable locations) { } private static ManifestFile writeManifest( - List rows, int startIndex, int endIndex, Broadcast io, - String location, int format, PartitionSpec spec, StructType sparkType) throws IOException { + List rows, + int startIndex, + int endIndex, + Broadcast io, + String location, + int format, + PartitionSpec spec, + StructType sparkType) + throws IOException { String manifestName = "optimized-m-" + UUID.randomUUID(); Path manifestPath = new Path(location, manifestName); - OutputFile outputFile = io.value().newOutputFile(FileFormat.AVRO.addExtension(manifestPath.toString())); + OutputFile outputFile = + io.value().newOutputFile(FileFormat.AVRO.addExtension(manifestPath.toString())); Types.StructType dataFileType = DataFile.getType(spec.partitionType()); SparkDataFile wrapper = new SparkDataFile(dataFileType, sparkType); @@ -343,8 +366,12 @@ private static ManifestFile writeManifest( } private static MapPartitionsFunction toManifests( - Broadcast io, long maxNumManifestEntries, String location, - int format, PartitionSpec spec, StructType sparkType) { + Broadcast io, + long maxNumManifestEntries, + String location, + int format, + PartitionSpec spec, + StructType sparkType) { return rows -> { List rowsAsList = Lists.newArrayList(rows); @@ -355,11 +382,15 @@ private static MapPartitionsFunction toManifests( List manifests = Lists.newArrayList(); if (rowsAsList.size() <= maxNumManifestEntries) { - manifests.add(writeManifest(rowsAsList, 0, rowsAsList.size(), io, location, format, spec, sparkType)); + manifests.add( + writeManifest(rowsAsList, 0, rowsAsList.size(), io, location, format, spec, sparkType)); } else { int midIndex = rowsAsList.size() / 2; - manifests.add(writeManifest(rowsAsList, 0, midIndex, io, location, format, spec, sparkType)); - manifests.add(writeManifest(rowsAsList, midIndex, rowsAsList.size(), io, location, format, spec, sparkType)); + manifests.add( + writeManifest(rowsAsList, 0, midIndex, io, location, format, spec, sparkType)); + manifests.add( + writeManifest( + rowsAsList, midIndex, rowsAsList.size(), io, location, format, spec, sparkType)); } return manifests.iterator(); diff --git a/spark/v2.4/spark/src/main/java/org/apache/iceberg/spark/actions/BaseSnapshotUpdateSparkAction.java b/spark/v2.4/spark/src/main/java/org/apache/iceberg/spark/actions/BaseSnapshotUpdateSparkAction.java index 53fa06bbb5dc..f68fb4e97e78 100644 --- a/spark/v2.4/spark/src/main/java/org/apache/iceberg/spark/actions/BaseSnapshotUpdateSparkAction.java +++ b/spark/v2.4/spark/src/main/java/org/apache/iceberg/spark/actions/BaseSnapshotUpdateSparkAction.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.spark.actions; import java.util.Map; @@ -24,8 +23,8 @@ import org.apache.iceberg.relocated.com.google.common.collect.Maps; import org.apache.spark.sql.SparkSession; -abstract class BaseSnapshotUpdateSparkAction - extends BaseSparkAction implements SnapshotUpdate { +abstract class BaseSnapshotUpdateSparkAction extends BaseSparkAction + implements SnapshotUpdate { private final Map summary = Maps.newHashMap(); diff --git a/spark/v2.4/spark/src/main/java/org/apache/iceberg/spark/actions/BaseSparkAction.java b/spark/v2.4/spark/src/main/java/org/apache/iceberg/spark/actions/BaseSparkAction.java index 42c54679b669..c9d93ce9de5f 100644 --- a/spark/v2.4/spark/src/main/java/org/apache/iceberg/spark/actions/BaseSparkAction.java +++ b/spark/v2.4/spark/src/main/java/org/apache/iceberg/spark/actions/BaseSparkAction.java @@ -16,9 +16,10 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.actions; +import static org.apache.iceberg.MetadataTableType.ALL_MANIFESTS; + import java.util.Iterator; import java.util.List; import java.util.Map; @@ -49,8 +50,6 @@ import org.apache.spark.sql.Row; import org.apache.spark.sql.SparkSession; -import static org.apache.iceberg.MetadataTableType.ALL_MANIFESTS; - abstract class BaseSparkAction implements Action { private static final AtomicInteger JOB_COUNTER = new AtomicInteger(); @@ -115,11 +114,20 @@ protected Dataset buildValidDataFileDF(Table table) { JavaSparkContext context = JavaSparkContext.fromSparkContext(spark.sparkContext()); Broadcast ioBroadcast = context.broadcast(SparkUtil.serializableFileIO(table)); - Dataset allManifests = loadMetadataTable(table, ALL_MANIFESTS) - .selectExpr("path", "length", "partition_spec_id as partitionSpecId", "added_snapshot_id as addedSnapshotId") - .dropDuplicates("path") - .repartition(spark.sessionState().conf().numShufflePartitions()) // avoid adaptive execution combining tasks - .as(Encoders.bean(ManifestFileBean.class)); + Dataset allManifests = + loadMetadataTable(table, ALL_MANIFESTS) + .selectExpr( + "path", + "length", + "partition_spec_id as partitionSpecId", + "added_snapshot_id as addedSnapshotId") + .dropDuplicates("path") + .repartition( + spark + .sessionState() + .conf() + .numShufflePartitions()) // avoid adaptive execution combining tasks + .as(Encoders.bean(ManifestFileBean.class)); return allManifests.flatMap(new ReadManifest(ioBroadcast), Encoders.STRING()).toDF("file_path"); } diff --git a/spark/v2.4/spark/src/main/java/org/apache/iceberg/spark/actions/BaseSparkActions.java b/spark/v2.4/spark/src/main/java/org/apache/iceberg/spark/actions/BaseSparkActions.java index 58b57177cf73..bec51944f222 100644 --- a/spark/v2.4/spark/src/main/java/org/apache/iceberg/spark/actions/BaseSparkActions.java +++ b/spark/v2.4/spark/src/main/java/org/apache/iceberg/spark/actions/BaseSparkActions.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.spark.actions; import org.apache.iceberg.Table; diff --git a/spark/v2.4/spark/src/main/java/org/apache/iceberg/spark/actions/ManifestFileBean.java b/spark/v2.4/spark/src/main/java/org/apache/iceberg/spark/actions/ManifestFileBean.java index 0837fb7d39e4..3660b870c63f 100644 --- a/spark/v2.4/spark/src/main/java/org/apache/iceberg/spark/actions/ManifestFileBean.java +++ b/spark/v2.4/spark/src/main/java/org/apache/iceberg/spark/actions/ManifestFileBean.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.actions; import java.nio.ByteBuffer; diff --git a/spark/v2.4/spark/src/main/java/org/apache/iceberg/spark/actions/SparkActions.java b/spark/v2.4/spark/src/main/java/org/apache/iceberg/spark/actions/SparkActions.java index 66e2aa579a5b..4b1ea37c2169 100644 --- a/spark/v2.4/spark/src/main/java/org/apache/iceberg/spark/actions/SparkActions.java +++ b/spark/v2.4/spark/src/main/java/org/apache/iceberg/spark/actions/SparkActions.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.actions; import org.apache.iceberg.actions.ActionsProvider; @@ -24,9 +23,9 @@ /** * An implementation of {@link ActionsProvider} for Spark. - *
<p>
    - * This class is the primary API for interacting with actions in Spark that users should use - * to instantiate particular actions. + * + * <p>
    This class is the primary API for interacting with actions in Spark that users should use to + * instantiate particular actions. */ public class SparkActions extends BaseSparkActions { diff --git a/spark/v2.4/spark/src/main/java/org/apache/iceberg/spark/data/AvroWithSparkSchemaVisitor.java b/spark/v2.4/spark/src/main/java/org/apache/iceberg/spark/data/AvroWithSparkSchemaVisitor.java index 40ed05b4ce65..74454fc1e466 100644 --- a/spark/v2.4/spark/src/main/java/org/apache/iceberg/spark/data/AvroWithSparkSchemaVisitor.java +++ b/spark/v2.4/spark/src/main/java/org/apache/iceberg/spark/data/AvroWithSparkSchemaVisitor.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.data; import org.apache.iceberg.avro.AvroWithPartnerByStructureVisitor; @@ -30,7 +29,8 @@ import org.apache.spark.sql.types.StructField; import org.apache.spark.sql.types.StructType; -public abstract class AvroWithSparkSchemaVisitor extends AvroWithPartnerByStructureVisitor { +public abstract class AvroWithSparkSchemaVisitor + extends AvroWithPartnerByStructureVisitor { @Override protected boolean isStringType(DataType dataType) { @@ -44,7 +44,8 @@ protected boolean isMapType(DataType dataType) { @Override protected DataType arrayElementType(DataType arrayType) { - Preconditions.checkArgument(arrayType instanceof ArrayType, "Invalid array: %s is not an array", arrayType); + Preconditions.checkArgument( + arrayType instanceof ArrayType, "Invalid array: %s is not an array", arrayType); return ((ArrayType) arrayType).elementType(); } @@ -62,7 +63,8 @@ protected DataType mapValueType(DataType mapType) { @Override protected Pair fieldNameAndType(DataType structType, int pos) { - Preconditions.checkArgument(structType instanceof StructType, "Invalid struct: %s is not a struct", structType); + Preconditions.checkArgument( + structType instanceof StructType, "Invalid struct: %s is not a struct", structType); StructField field = ((StructType) structType).apply(pos); return Pair.of(field.name(), field.dataType()); } diff --git a/spark/v2.4/spark/src/main/java/org/apache/iceberg/spark/data/ParquetWithSparkSchemaVisitor.java b/spark/v2.4/spark/src/main/java/org/apache/iceberg/spark/data/ParquetWithSparkSchemaVisitor.java index 924cc3e2325a..d74a76f94e87 100644 --- a/spark/v2.4/spark/src/main/java/org/apache/iceberg/spark/data/ParquetWithSparkSchemaVisitor.java +++ b/spark/v2.4/spark/src/main/java/org/apache/iceberg/spark/data/ParquetWithSparkSchemaVisitor.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.spark.data; import java.util.Deque; @@ -48,9 +47,11 @@ public class ParquetWithSparkSchemaVisitor { public static T visit(DataType sType, Type type, ParquetWithSparkSchemaVisitor visitor) { Preconditions.checkArgument(sType != null, "Invalid DataType: null"); if (type instanceof MessageType) { - Preconditions.checkArgument(sType instanceof StructType, "Invalid struct: %s is not a struct", sType); + Preconditions.checkArgument( + sType instanceof StructType, "Invalid struct: %s is not a struct", sType); StructType struct = (StructType) sType; - return visitor.message(struct, (MessageType) type, visitFields(struct, type.asGroupType(), visitor)); + return visitor.message( + struct, (MessageType) type, visitFields(struct, type.asGroupType(), visitor)); } else if (type.isPrimitive()) { return visitor.primitive(sType, type.asPrimitiveType()); @@ -62,21 +63,30 @@ public static T visit(DataType sType, Type type, ParquetWithSparkSchemaVisit if (annotation != null) { switch (annotation) { case LIST: - Preconditions.checkArgument(!group.isRepetition(Repetition.REPEATED), - "Invalid list: top-level group is repeated: %s", group); - Preconditions.checkArgument(group.getFieldCount() == 1, - "Invalid list: does not contain single repeated field: %s", group); + Preconditions.checkArgument( + !group.isRepetition(Repetition.REPEATED), + "Invalid list: top-level group is repeated: %s", + group); + Preconditions.checkArgument( + group.getFieldCount() == 1, + "Invalid list: does not contain single repeated field: %s", + group); GroupType repeatedElement = group.getFields().get(0).asGroupType(); - Preconditions.checkArgument(repeatedElement.isRepetition(Repetition.REPEATED), + Preconditions.checkArgument( + repeatedElement.isRepetition(Repetition.REPEATED), "Invalid list: inner group is not repeated"); - Preconditions.checkArgument(repeatedElement.getFieldCount() <= 1, - "Invalid list: repeated group is not a single field: %s", group); + Preconditions.checkArgument( + repeatedElement.getFieldCount() <= 1, + "Invalid list: repeated group is not a single field: %s", + group); - Preconditions.checkArgument(sType instanceof ArrayType, "Invalid list: %s is not an array", sType); + Preconditions.checkArgument( + sType instanceof ArrayType, "Invalid list: %s is not an array", sType); ArrayType array = (ArrayType) sType; - StructField element = new StructField( - "element", array.elementType(), array.containsNull(), Metadata.empty()); + StructField element = + new StructField( + "element", array.elementType(), array.containsNull(), Metadata.empty()); visitor.fieldNames.push(repeatedElement.getName()); try { @@ -92,22 +102,30 @@ public static T visit(DataType sType, Type type, ParquetWithSparkSchemaVisit } case MAP: - Preconditions.checkArgument(!group.isRepetition(Repetition.REPEATED), - "Invalid map: top-level group is repeated: %s", group); - Preconditions.checkArgument(group.getFieldCount() == 1, - "Invalid map: does not contain single repeated field: %s", group); + Preconditions.checkArgument( + !group.isRepetition(Repetition.REPEATED), + "Invalid map: top-level group is repeated: %s", + group); + Preconditions.checkArgument( + group.getFieldCount() == 1, + "Invalid map: does not contain single repeated field: %s", + group); GroupType repeatedKeyValue = group.getType(0).asGroupType(); - Preconditions.checkArgument(repeatedKeyValue.isRepetition(Repetition.REPEATED), + Preconditions.checkArgument( + repeatedKeyValue.isRepetition(Repetition.REPEATED), "Invalid map: inner group is not 
repeated"); - Preconditions.checkArgument(repeatedKeyValue.getFieldCount() <= 2, + Preconditions.checkArgument( + repeatedKeyValue.getFieldCount() <= 2, "Invalid map: repeated group does not have 2 fields"); - Preconditions.checkArgument(sType instanceof MapType, "Invalid map: %s is not a map", sType); + Preconditions.checkArgument( + sType instanceof MapType, "Invalid map: %s is not a map", sType); MapType map = (MapType) sType; StructField keyField = new StructField("key", map.keyType(), false, Metadata.empty()); - StructField valueField = new StructField( - "value", map.valueType(), map.valueContainsNull(), Metadata.empty()); + StructField valueField = + new StructField( + "value", map.valueType(), map.valueContainsNull(), Metadata.empty()); visitor.fieldNames.push(repeatedKeyValue.getName()); try { @@ -144,13 +162,15 @@ public static T visit(DataType sType, Type type, ParquetWithSparkSchemaVisit } } - Preconditions.checkArgument(sType instanceof StructType, "Invalid struct: %s is not a struct", sType); + Preconditions.checkArgument( + sType instanceof StructType, "Invalid struct: %s is not a struct", sType); StructType struct = (StructType) sType; return visitor.struct(struct, group, visitFields(struct, group, visitor)); } } - private static T visitField(StructField sField, Type field, ParquetWithSparkSchemaVisitor visitor) { + private static T visitField( + StructField sField, Type field, ParquetWithSparkSchemaVisitor visitor) { visitor.fieldNames.push(field.getName()); try { return visit(sField.dataType(), field, visitor); @@ -159,17 +179,20 @@ private static T visitField(StructField sField, Type field, ParquetWithSpark } } - private static List visitFields(StructType struct, GroupType group, - ParquetWithSparkSchemaVisitor visitor) { + private static List visitFields( + StructType struct, GroupType group, ParquetWithSparkSchemaVisitor visitor) { StructField[] sFields = struct.fields(); - Preconditions.checkArgument(sFields.length == group.getFieldCount(), - "Structs do not match: %s and %s", struct, group); + Preconditions.checkArgument( + sFields.length == group.getFieldCount(), "Structs do not match: %s and %s", struct, group); List results = Lists.newArrayListWithExpectedSize(group.getFieldCount()); for (int i = 0; i < sFields.length; i += 1) { Type field = group.getFields().get(i); StructField sField = sFields[i]; - Preconditions.checkArgument(field.getName().equals(AvroSchemaUtil.makeCompatibleName(sField.name())), - "Structs do not match: field %s != %s", field.getName(), sField.name()); + Preconditions.checkArgument( + field.getName().equals(AvroSchemaUtil.makeCompatibleName(sField.name())), + "Structs do not match: field %s != %s", + field.getName(), + sField.name()); results.add(visitField(sField, field, visitor)); } diff --git a/spark/v2.4/spark/src/main/java/org/apache/iceberg/spark/data/SparkAvroReader.java b/spark/v2.4/spark/src/main/java/org/apache/iceberg/spark/data/SparkAvroReader.java index c693e2e2c057..4622d2928ac4 100644 --- a/spark/v2.4/spark/src/main/java/org/apache/iceberg/spark/data/SparkAvroReader.java +++ b/spark/v2.4/spark/src/main/java/org/apache/iceberg/spark/data/SparkAvroReader.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.spark.data; import java.io.IOException; @@ -38,7 +37,6 @@ import org.apache.iceberg.types.Types; import org.apache.spark.sql.catalyst.InternalRow; - public class SparkAvroReader implements DatumReader, SupportsRowPosition { private final Schema readSchema; @@ -50,10 +48,12 @@ public SparkAvroReader(org.apache.iceberg.Schema expectedSchema, Schema readSche } @SuppressWarnings("unchecked") - public SparkAvroReader(org.apache.iceberg.Schema expectedSchema, Schema readSchema, Map constants) { + public SparkAvroReader( + org.apache.iceberg.Schema expectedSchema, Schema readSchema, Map constants) { this.readSchema = readSchema; - this.reader = (ValueReader) AvroSchemaWithTypeVisitor - .visit(expectedSchema, readSchema, new ReadBuilder(constants)); + this.reader = + (ValueReader) + AvroSchemaWithTypeVisitor.visit(expectedSchema, readSchema, new ReadBuilder(constants)); } @Override @@ -81,8 +81,8 @@ private ReadBuilder(Map idToConstant) { } @Override - public ValueReader record(Types.StructType expected, Schema record, List names, - List> fields) { + public ValueReader record( + Types.StructType expected, Schema record, List names, List> fields) { return SparkValueReaders.struct(fields, expected, idToConstant); } @@ -92,13 +92,14 @@ public ValueReader union(Type expected, Schema union, List> op } @Override - public ValueReader array(Types.ListType expected, Schema array, ValueReader elementReader) { + public ValueReader array( + Types.ListType expected, Schema array, ValueReader elementReader) { return SparkValueReaders.array(elementReader); } @Override - public ValueReader map(Types.MapType expected, Schema map, - ValueReader keyReader, ValueReader valueReader) { + public ValueReader map( + Types.MapType expected, Schema map, ValueReader keyReader, ValueReader valueReader) { return SparkValueReaders.arrayMap(keyReader, valueReader); } diff --git a/spark/v2.4/spark/src/main/java/org/apache/iceberg/spark/data/SparkAvroWriter.java b/spark/v2.4/spark/src/main/java/org/apache/iceberg/spark/data/SparkAvroWriter.java index 7582125128a7..15465568c231 100644 --- a/spark/v2.4/spark/src/main/java/org/apache/iceberg/spark/data/SparkAvroWriter.java +++ b/spark/v2.4/spark/src/main/java/org/apache/iceberg/spark/data/SparkAvroWriter.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.spark.data; import java.io.IOException; @@ -50,8 +49,9 @@ public SparkAvroWriter(StructType dsSchema) { @Override @SuppressWarnings("unchecked") public void setSchema(Schema schema) { - this.writer = (ValueWriter) AvroWithSparkSchemaVisitor - .visit(dsSchema, schema, new WriteBuilder()); + this.writer = + (ValueWriter) + AvroWithSparkSchemaVisitor.visit(dsSchema, schema, new WriteBuilder()); } @Override @@ -66,17 +66,23 @@ public Stream metrics() { private static class WriteBuilder extends AvroWithSparkSchemaVisitor> { @Override - public ValueWriter record(DataType struct, Schema record, List names, List> fields) { - return SparkValueWriters.struct(fields, IntStream.range(0, names.size()) - .mapToObj(i -> fieldNameAndType(struct, i).second()).collect(Collectors.toList())); + public ValueWriter record( + DataType struct, Schema record, List names, List> fields) { + return SparkValueWriters.struct( + fields, + IntStream.range(0, names.size()) + .mapToObj(i -> fieldNameAndType(struct, i).second()) + .collect(Collectors.toList())); } @Override public ValueWriter union(DataType type, Schema union, List> options) { - Preconditions.checkArgument(options.contains(ValueWriters.nulls()), - "Cannot create writer for non-option union: %s", union); - Preconditions.checkArgument(options.size() == 2, - "Cannot create writer for non-option union: %s", union); + Preconditions.checkArgument( + options.contains(ValueWriters.nulls()), + "Cannot create writer for non-option union: %s", + union); + Preconditions.checkArgument( + options.size() == 2, "Cannot create writer for non-option union: %s", union); if (union.getTypes().get(0).getType() == Schema.Type.NULL) { return ValueWriters.option(0, options.get(1)); } else { @@ -91,12 +97,15 @@ public ValueWriter array(DataType sArray, Schema array, ValueWriter elemen @Override public ValueWriter map(DataType sMap, Schema map, ValueWriter valueReader) { - return SparkValueWriters.map(SparkValueWriters.strings(), mapKeyType(sMap), valueReader, mapValueType(sMap)); + return SparkValueWriters.map( + SparkValueWriters.strings(), mapKeyType(sMap), valueReader, mapValueType(sMap)); } @Override - public ValueWriter map(DataType sMap, Schema map, ValueWriter keyWriter, ValueWriter valueWriter) { - return SparkValueWriters.arrayMap(keyWriter, mapKeyType(sMap), valueWriter, mapValueType(sMap)); + public ValueWriter map( + DataType sMap, Schema map, ValueWriter keyWriter, ValueWriter valueWriter) { + return SparkValueWriters.arrayMap( + keyWriter, mapKeyType(sMap), valueWriter, mapValueType(sMap)); } @Override diff --git a/spark/v2.4/spark/src/main/java/org/apache/iceberg/spark/data/SparkOrcReader.java b/spark/v2.4/spark/src/main/java/org/apache/iceberg/spark/data/SparkOrcReader.java index 4ed6420a9aa4..78db137054bc 100644 --- a/spark/v2.4/spark/src/main/java/org/apache/iceberg/spark/data/SparkOrcReader.java +++ b/spark/v2.4/spark/src/main/java/org/apache/iceberg/spark/data/SparkOrcReader.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.data; import java.util.List; @@ -34,10 +33,9 @@ import org.apache.spark.sql.catalyst.InternalRow; /** - * Converts the OrcIterator, which returns ORC's VectorizedRowBatch to a - * set of Spark's UnsafeRows. + * Converts the OrcIterator, which returns ORC's VectorizedRowBatch to a set of Spark's UnsafeRows. * - * It minimizes allocations by reusing most of the objects in the implementation. + *
<p>
    It minimizes allocations by reusing most of the objects in the implementation. */ public class SparkOrcReader implements OrcRowReader { private final OrcValueReader reader; @@ -48,8 +46,12 @@ public SparkOrcReader(org.apache.iceberg.Schema expectedSchema, TypeDescription @SuppressWarnings("unchecked") public SparkOrcReader( - org.apache.iceberg.Schema expectedSchema, TypeDescription readOrcSchema, Map idToConstant) { - this.reader = OrcSchemaWithTypeVisitor.visit(expectedSchema, readOrcSchema, new ReadBuilder(idToConstant)); + org.apache.iceberg.Schema expectedSchema, + TypeDescription readOrcSchema, + Map idToConstant) { + this.reader = + OrcSchemaWithTypeVisitor.visit( + expectedSchema, readOrcSchema, new ReadBuilder(idToConstant)); } @Override @@ -71,18 +73,25 @@ private ReadBuilder(Map idToConstant) { @Override public OrcValueReader record( - Types.StructType expected, TypeDescription record, List names, List> fields) { + Types.StructType expected, + TypeDescription record, + List names, + List> fields) { return SparkOrcValueReaders.struct(fields, expected, idToConstant); } @Override - public OrcValueReader list(Types.ListType iList, TypeDescription array, OrcValueReader elementReader) { + public OrcValueReader list( + Types.ListType iList, TypeDescription array, OrcValueReader elementReader) { return SparkOrcValueReaders.array(elementReader); } @Override public OrcValueReader map( - Types.MapType iMap, TypeDescription map, OrcValueReader keyReader, OrcValueReader valueReader) { + Types.MapType iMap, + TypeDescription map, + OrcValueReader keyReader, + OrcValueReader valueReader) { return SparkOrcValueReaders.map(keyReader, valueReader); } diff --git a/spark/v2.4/spark/src/main/java/org/apache/iceberg/spark/data/SparkOrcValueReaders.java b/spark/v2.4/spark/src/main/java/org/apache/iceberg/spark/data/SparkOrcValueReaders.java index f35ab7a17c63..9e9b3e53bbcc 100644 --- a/spark/v2.4/spark/src/main/java/org/apache/iceberg/spark/data/SparkOrcValueReaders.java +++ b/spark/v2.4/spark/src/main/java/org/apache/iceberg/spark/data/SparkOrcValueReaders.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.spark.data; import java.math.BigDecimal; @@ -44,8 +43,7 @@ import org.apache.spark.unsafe.types.UTF8String; public class SparkOrcValueReaders { - private SparkOrcValueReaders() { - } + private SparkOrcValueReaders() {} public static OrcValueReader utf8String() { return StringReader.INSTANCE; @@ -125,8 +123,7 @@ public MapData nonNullRead(ColumnVector vector, int row) { } return new ArrayBasedMapData( - new GenericArrayData(keys.toArray()), - new GenericArrayData(values.toArray())); + new GenericArrayData(keys.toArray()), new GenericArrayData(values.toArray())); } @Override @@ -139,7 +136,8 @@ public void setBatchContext(long batchOffsetInFile) { static class StructReader extends OrcValueReaders.StructReader { private final int numFields; - protected StructReader(List> readers, Types.StructType struct, Map idToConstant) { + protected StructReader( + List> readers, Types.StructType struct, Map idToConstant) { super(readers, struct, idToConstant); this.numFields = struct.fields().size(); } @@ -162,21 +160,20 @@ protected void set(InternalRow struct, int pos, Object value) { private static class StringReader implements OrcValueReader { private static final StringReader INSTANCE = new StringReader(); - private StringReader() { - } + private StringReader() {} @Override public UTF8String nonNullRead(ColumnVector vector, int row) { BytesColumnVector bytesVector = (BytesColumnVector) vector; - return UTF8String.fromBytes(bytesVector.vector[row], bytesVector.start[row], bytesVector.length[row]); + return UTF8String.fromBytes( + bytesVector.vector[row], bytesVector.start[row], bytesVector.length[row]); } } private static class TimestampTzReader implements OrcValueReader { private static final TimestampTzReader INSTANCE = new TimestampTzReader(); - private TimestampTzReader() { - } + private TimestampTzReader() {} @Override public Long nonNullRead(ColumnVector vector, int row) { @@ -198,12 +195,20 @@ private static class Decimal18Reader implements OrcValueReader { public Decimal nonNullRead(ColumnVector vector, int row) { HiveDecimalWritable value = ((DecimalColumnVector) vector).vector[row]; - // The scale of decimal read from hive ORC file may be not equals to the expected scale. For data type - // decimal(10,3) and the value 10.100, the hive ORC writer will remove its trailing zero and store it - // as 101*10^(-1), its scale will adjust from 3 to 1. So here we could not assert that value.scale() == scale. - // we also need to convert the hive orc decimal to a decimal with expected precision and scale. - Preconditions.checkArgument(value.precision() <= precision, - "Cannot read value as decimal(%s,%s), too large: %s", precision, scale, value); + // The scale of decimal read from hive ORC file may be not equals to the expected scale. For + // data type + // decimal(10,3) and the value 10.100, the hive ORC writer will remove its trailing zero and + // store it + // as 101*10^(-1), its scale will adjust from 3 to 1. So here we could not assert that + // value.scale() == scale. + // we also need to convert the hive orc decimal to a decimal with expected precision and + // scale. 
+ Preconditions.checkArgument( + value.precision() <= precision, + "Cannot read value as decimal(%s,%s), too large: %s", + precision, + scale, + value); return new Decimal().set(value.serialize64(scale), precision, scale); } @@ -220,11 +225,15 @@ private static class Decimal38Reader implements OrcValueReader { @Override public Decimal nonNullRead(ColumnVector vector, int row) { - BigDecimal value = ((DecimalColumnVector) vector).vector[row] - .getHiveDecimal().bigDecimalValue(); - - Preconditions.checkArgument(value.precision() <= precision, - "Cannot read value as decimal(%s,%s), too large: %s", precision, scale, value); + BigDecimal value = + ((DecimalColumnVector) vector).vector[row].getHiveDecimal().bigDecimalValue(); + + Preconditions.checkArgument( + value.precision() <= precision, + "Cannot read value as decimal(%s,%s), too large: %s", + precision, + scale, + value); return new Decimal().set(new scala.math.BigDecimal(value), precision, scale); } diff --git a/spark/v2.4/spark/src/main/java/org/apache/iceberg/spark/data/SparkOrcValueWriters.java b/spark/v2.4/spark/src/main/java/org/apache/iceberg/spark/data/SparkOrcValueWriters.java index abb12dffc050..780090f99109 100644 --- a/spark/v2.4/spark/src/main/java/org/apache/iceberg/spark/data/SparkOrcValueWriters.java +++ b/spark/v2.4/spark/src/main/java/org/apache/iceberg/spark/data/SparkOrcValueWriters.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.data; import java.util.List; @@ -37,8 +36,7 @@ import org.apache.spark.unsafe.types.UTF8String; class SparkOrcValueWriters { - private SparkOrcValueWriters() { - } + private SparkOrcValueWriters() {} static OrcValueWriter strings() { return StringWriter.INSTANCE; @@ -60,8 +58,8 @@ static OrcValueWriter list(OrcValueWriter element, List o return new ListWriter<>(element, orcType); } - static OrcValueWriter map(OrcValueWriter keyWriter, OrcValueWriter valueWriter, - List orcTypes) { + static OrcValueWriter map( + OrcValueWriter keyWriter, OrcValueWriter valueWriter, List orcTypes) { return new MapWriter<>(keyWriter, valueWriter, orcTypes); } @@ -73,7 +71,6 @@ public void nonNullWrite(int rowId, UTF8String data, ColumnVector output) { byte[] value = data.getBytes(); ((BytesColumnVector) output).setRef(rowId, value, 0, value.length); } - } private static class TimestampTzWriter implements OrcValueWriter { @@ -85,7 +82,6 @@ public void nonNullWrite(int rowId, Long micros, ColumnVector output) { cv.time[rowId] = Math.floorDiv(micros, 1_000); // millis cv.nanos[rowId] = (int) Math.floorMod(micros, 1_000_000) * 1_000; // nanos } - } private static class Decimal18Writer implements OrcValueWriter { @@ -97,20 +93,18 @@ private static class Decimal18Writer implements OrcValueWriter { @Override public void nonNullWrite(int rowId, Decimal decimal, ColumnVector output) { - ((DecimalColumnVector) output).vector[rowId].setFromLongAndScale( - decimal.toUnscaledLong(), scale); + ((DecimalColumnVector) output) + .vector[rowId].setFromLongAndScale(decimal.toUnscaledLong(), scale); } - } private static class Decimal38Writer implements OrcValueWriter { @Override public void nonNullWrite(int rowId, Decimal decimal, ColumnVector output) { - ((DecimalColumnVector) output).vector[rowId].set( - HiveDecimal.create(decimal.toJavaBigDecimal())); + ((DecimalColumnVector) output) + .vector[rowId].set(HiveDecimal.create(decimal.toJavaBigDecimal())); } - } private static class ListWriter implements OrcValueWriter { @@ -120,10 +114,12 @@ 
private static class ListWriter implements OrcValueWriter { @SuppressWarnings("unchecked") ListWriter(OrcValueWriter writer, List orcTypes) { if (orcTypes.size() != 1) { - throw new IllegalArgumentException("Expected one (and same) ORC type for list elements, got: " + orcTypes); + throw new IllegalArgumentException( + "Expected one (and same) ORC type for list elements, got: " + orcTypes); } this.writer = writer; - this.fieldGetter = (SparkOrcWriter.FieldGetter) SparkOrcWriter.createFieldGetter(orcTypes.get(0)); + this.fieldGetter = + (SparkOrcWriter.FieldGetter) SparkOrcWriter.createFieldGetter(orcTypes.get(0)); } @Override @@ -145,7 +141,6 @@ public void nonNullWrite(int rowId, ArrayData value, ColumnVector output) { public Stream> metrics() { return writer.metrics(); } - } private static class MapWriter implements OrcValueWriter { @@ -155,14 +150,20 @@ private static class MapWriter implements OrcValueWriter { private final SparkOrcWriter.FieldGetter valueFieldGetter; @SuppressWarnings("unchecked") - MapWriter(OrcValueWriter keyWriter, OrcValueWriter valueWriter, List orcTypes) { + MapWriter( + OrcValueWriter keyWriter, + OrcValueWriter valueWriter, + List orcTypes) { if (orcTypes.size() != 2) { - throw new IllegalArgumentException("Expected two ORC type descriptions for a map, got: " + orcTypes); + throw new IllegalArgumentException( + "Expected two ORC type descriptions for a map, got: " + orcTypes); } this.keyWriter = keyWriter; this.valueWriter = valueWriter; - this.keyFieldGetter = (SparkOrcWriter.FieldGetter) SparkOrcWriter.createFieldGetter(orcTypes.get(0)); - this.valueFieldGetter = (SparkOrcWriter.FieldGetter) SparkOrcWriter.createFieldGetter(orcTypes.get(1)); + this.keyFieldGetter = + (SparkOrcWriter.FieldGetter) SparkOrcWriter.createFieldGetter(orcTypes.get(0)); + this.valueFieldGetter = + (SparkOrcWriter.FieldGetter) SparkOrcWriter.createFieldGetter(orcTypes.get(1)); } @Override @@ -189,7 +190,6 @@ public void nonNullWrite(int rowId, MapData map, ColumnVector output) { public Stream> metrics() { return Stream.concat(keyWriter.metrics(), valueWriter.metrics()); } - } private static void growColumnVector(ColumnVector cv, int requestedSize) { diff --git a/spark/v2.4/spark/src/main/java/org/apache/iceberg/spark/data/SparkOrcWriter.java b/spark/v2.4/spark/src/main/java/org/apache/iceberg/spark/data/SparkOrcWriter.java index 9c7f3a6eb01d..6a8c7f1d3c88 100644 --- a/spark/v2.4/spark/src/main/java/org/apache/iceberg/spark/data/SparkOrcWriter.java +++ b/spark/v2.4/spark/src/main/java/org/apache/iceberg/spark/data/SparkOrcWriter.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.data; import java.io.Serializable; @@ -39,19 +38,18 @@ import org.apache.spark.sql.catalyst.InternalRow; import org.apache.spark.sql.catalyst.expressions.SpecializedGetters; -/** - * This class acts as an adaptor from an OrcFileAppender to a - * FileAppender<InternalRow>. - */ +/** This class acts as an adaptor from an OrcFileAppender to a FileAppender<InternalRow>. 
*/ public class SparkOrcWriter implements OrcRowWriter { private final InternalRowWriter writer; public SparkOrcWriter(Schema iSchema, TypeDescription orcSchema) { - Preconditions.checkArgument(orcSchema.getCategory() == TypeDescription.Category.STRUCT, + Preconditions.checkArgument( + orcSchema.getCategory() == TypeDescription.Category.STRUCT, "Top level must be a struct " + orcSchema); - writer = (InternalRowWriter) OrcSchemaWithTypeVisitor.visit(iSchema, orcSchema, new WriteBuilder()); + writer = + (InternalRowWriter) OrcSchemaWithTypeVisitor.visit(iSchema, orcSchema, new WriteBuilder()); } @Override @@ -71,24 +69,26 @@ public Stream> metrics() { } private static class WriteBuilder extends OrcSchemaWithTypeVisitor> { - private WriteBuilder() { - } + private WriteBuilder() {} @Override - public OrcValueWriter record(Types.StructType iStruct, TypeDescription record, - List names, List> fields) { + public OrcValueWriter record( + Types.StructType iStruct, + TypeDescription record, + List names, + List> fields) { return new InternalRowWriter(fields, record.getChildren()); } @Override - public OrcValueWriter list(Types.ListType iList, TypeDescription array, - OrcValueWriter element) { + public OrcValueWriter list( + Types.ListType iList, TypeDescription array, OrcValueWriter element) { return SparkOrcValueWriters.list(element, array.getChildren()); } @Override - public OrcValueWriter map(Types.MapType iMap, TypeDescription map, - OrcValueWriter key, OrcValueWriter value) { + public OrcValueWriter map( + Types.MapType iMap, TypeDescription map, OrcValueWriter key, OrcValueWriter value) { return SparkOrcValueWriters.map(key, value, map.getChildren()); } @@ -178,8 +178,9 @@ static FieldGetter createFieldGetter(TypeDescription fieldType) { // being changed behind our back. break; case DECIMAL: - fieldGetter = (row, ordinal) -> - row.getDecimal(ordinal, fieldType.getPrecision(), fieldType.getScale()); + fieldGetter = + (row, ordinal) -> + row.getDecimal(ordinal, fieldType.getPrecision(), fieldType.getScale()); break; case STRING: case CHAR: @@ -196,7 +197,8 @@ static FieldGetter createFieldGetter(TypeDescription fieldType) { fieldGetter = SpecializedGetters::getMap; break; default: - throw new IllegalArgumentException("Encountered an unsupported ORC type during a write from Spark."); + throw new IllegalArgumentException( + "Encountered an unsupported ORC type during a write from Spark."); } return (row, ordinal) -> { @@ -210,10 +212,12 @@ static FieldGetter createFieldGetter(TypeDescription fieldType) { interface FieldGetter extends Serializable { /** - * Returns a value from a complex Spark data holder such ArrayData, InternalRow, etc... - * Calls the appropriate getter for the expected data type. + * Returns a value from a complex Spark data holder such ArrayData, InternalRow, etc... Calls + * the appropriate getter for the expected data type. + * * @param row Spark's data representation - * @param ordinal index in the data structure (e.g. column index for InterRow, list index in ArrayData, etc..) + * @param ordinal index in the data structure (e.g. column index for InterRow, list index in + * ArrayData, etc..) 
* @return field value at ordinal */ @Nullable diff --git a/spark/v2.4/spark/src/main/java/org/apache/iceberg/spark/data/SparkParquetReaders.java b/spark/v2.4/spark/src/main/java/org/apache/iceberg/spark/data/SparkParquetReaders.java index 8abee4a575e1..8c4c3dce226a 100644 --- a/spark/v2.4/spark/src/main/java/org/apache/iceberg/spark/data/SparkParquetReaders.java +++ b/spark/v2.4/spark/src/main/java/org/apache/iceberg/spark/data/SparkParquetReaders.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.data; import java.math.BigDecimal; @@ -66,25 +65,25 @@ import org.apache.spark.unsafe.types.UTF8String; public class SparkParquetReaders { - private SparkParquetReaders() { - } + private SparkParquetReaders() {} - public static ParquetValueReader buildReader(Schema expectedSchema, - MessageType fileSchema) { + public static ParquetValueReader buildReader( + Schema expectedSchema, MessageType fileSchema) { return buildReader(expectedSchema, fileSchema, ImmutableMap.of()); } @SuppressWarnings("unchecked") - public static ParquetValueReader buildReader(Schema expectedSchema, - MessageType fileSchema, - Map idToConstant) { + public static ParquetValueReader buildReader( + Schema expectedSchema, MessageType fileSchema, Map idToConstant) { if (ParquetSchemaUtil.hasIds(fileSchema)) { return (ParquetValueReader) - TypeWithSchemaVisitor.visit(expectedSchema.asStruct(), fileSchema, - new ReadBuilder(fileSchema, idToConstant)); + TypeWithSchemaVisitor.visit( + expectedSchema.asStruct(), fileSchema, new ReadBuilder(fileSchema, idToConstant)); } else { return (ParquetValueReader) - TypeWithSchemaVisitor.visit(expectedSchema.asStruct(), fileSchema, + TypeWithSchemaVisitor.visit( + expectedSchema.asStruct(), + fileSchema, new FallbackReadBuilder(fileSchema, idToConstant)); } } @@ -95,18 +94,18 @@ private static class FallbackReadBuilder extends ReadBuilder { } @Override - public ParquetValueReader message(Types.StructType expected, MessageType message, - List> fieldReaders) { + public ParquetValueReader message( + Types.StructType expected, MessageType message, List> fieldReaders) { // the top level matches by ID, but the remaining IDs are missing return super.struct(expected, message, fieldReaders); } @Override - public ParquetValueReader struct(Types.StructType ignored, GroupType struct, - List> fieldReaders) { + public ParquetValueReader struct( + Types.StructType ignored, GroupType struct, List> fieldReaders) { // the expected struct is ignored because nested fields are never found when the - List> newFields = Lists.newArrayListWithExpectedSize( - fieldReaders.size()); + List> newFields = + Lists.newArrayListWithExpectedSize(fieldReaders.size()); List types = Lists.newArrayListWithExpectedSize(fieldReaders.size()); List fields = struct.getFields(); for (int i = 0; i < fields.size(); i += 1) { @@ -130,14 +129,14 @@ private static class ReadBuilder extends TypeWithSchemaVisitor message(Types.StructType expected, MessageType message, - List> fieldReaders) { + public ParquetValueReader message( + Types.StructType expected, MessageType message, List> fieldReaders) { return struct(expected, message.asGroupType(), fieldReaders); } @Override - public ParquetValueReader struct(Types.StructType expected, GroupType struct, - List> fieldReaders) { + public ParquetValueReader struct( + Types.StructType expected, GroupType struct, List> fieldReaders) { // match the expected struct's order Map> readersById = Maps.newHashMap(); Map 
typesById = Maps.newHashMap(); @@ -152,10 +151,10 @@ public ParquetValueReader struct(Types.StructType expected, GroupType struct, } } - List expectedFields = expected != null ? - expected.fields() : ImmutableList.of(); - List> reorderedFields = Lists.newArrayListWithExpectedSize( - expectedFields.size()); + List expectedFields = + expected != null ? expected.fields() : ImmutableList.of(); + List> reorderedFields = + Lists.newArrayListWithExpectedSize(expectedFields.size()); List types = Lists.newArrayListWithExpectedSize(expectedFields.size()); for (Types.NestedField field : expectedFields) { int id = field.fieldId(); @@ -185,8 +184,8 @@ public ParquetValueReader struct(Types.StructType expected, GroupType struct, } @Override - public ParquetValueReader list(Types.ListType expectedList, GroupType array, - ParquetValueReader elementReader) { + public ParquetValueReader list( + Types.ListType expectedList, GroupType array, ParquetValueReader elementReader) { GroupType repeated = array.getFields().get(0).asGroupType(); String[] repeatedPath = currentPath(); @@ -196,13 +195,16 @@ public ParquetValueReader list(Types.ListType expectedList, GroupType array, Type elementType = repeated.getType(0); int elementD = type.getMaxDefinitionLevel(path(elementType.getName())) - 1; - return new ArrayReader<>(repeatedD, repeatedR, ParquetValueReaders.option(elementType, elementD, elementReader)); + return new ArrayReader<>( + repeatedD, repeatedR, ParquetValueReaders.option(elementType, elementD, elementReader)); } @Override - public ParquetValueReader map(Types.MapType expectedMap, GroupType map, - ParquetValueReader keyReader, - ParquetValueReader valueReader) { + public ParquetValueReader map( + Types.MapType expectedMap, + GroupType map, + ParquetValueReader keyReader, + ParquetValueReader valueReader) { GroupType repeatedKeyValue = map.getFields().get(0).asGroupType(); String[] repeatedPath = currentPath(); @@ -214,14 +216,16 @@ public ParquetValueReader map(Types.MapType expectedMap, GroupType map, Type valueType = repeatedKeyValue.getType(1); int valueD = type.getMaxDefinitionLevel(path(valueType.getName())) - 1; - return new MapReader<>(repeatedD, repeatedR, + return new MapReader<>( + repeatedD, + repeatedR, ParquetValueReaders.option(keyType, keyD, keyReader), ParquetValueReaders.option(valueType, valueD, valueReader)); } @Override - public ParquetValueReader primitive(org.apache.iceberg.types.Type.PrimitiveType expected, - PrimitiveType primitive) { + public ParquetValueReader primitive( + org.apache.iceberg.types.Type.PrimitiveType expected, PrimitiveType primitive) { ColumnDescriptor desc = type.getColumnDescription(currentPath()); if (primitive.getOriginalType() != null) { @@ -377,12 +381,13 @@ public Long read(Long ignored) { @Override public long readLong() { - final ByteBuffer byteBuffer = column.nextBinary().toByteBuffer().order(ByteOrder.LITTLE_ENDIAN); + final ByteBuffer byteBuffer = + column.nextBinary().toByteBuffer().order(ByteOrder.LITTLE_ENDIAN); final long timeOfDayNanos = byteBuffer.getLong(); final int julianDay = byteBuffer.getInt(); - return TimeUnit.DAYS.toMicros(julianDay - UNIX_EPOCH_JULIAN) + - TimeUnit.NANOSECONDS.toMicros(timeOfDayNanos); + return TimeUnit.DAYS.toMicros(julianDay - UNIX_EPOCH_JULIAN) + + TimeUnit.NANOSECONDS.toMicros(timeOfDayNanos); } } @@ -456,15 +461,19 @@ protected ArrayData buildList(ReusableArrayData list) { } } - private static class MapReader extends RepeatedKeyValueReader { + private static class MapReader + extends RepeatedKeyValueReader { 
private int readPos = 0; private int writePos = 0; private final ReusableEntry entry = new ReusableEntry<>(); private final ReusableEntry nullEntry = new ReusableEntry<>(); - MapReader(int definitionLevel, int repetitionLevel, - ParquetValueReader keyReader, ParquetValueReader valueReader) { + MapReader( + int definitionLevel, + int repetitionLevel, + ParquetValueReader keyReader, + ParquetValueReader valueReader) { super(definitionLevel, repetitionLevel, keyReader, valueReader); } diff --git a/spark/v2.4/spark/src/main/java/org/apache/iceberg/spark/data/SparkParquetWriters.java b/spark/v2.4/spark/src/main/java/org/apache/iceberg/spark/data/SparkParquetWriters.java index 5e268d26ed9c..c7622678c74d 100644 --- a/spark/v2.4/spark/src/main/java/org/apache/iceberg/spark/data/SparkParquetWriters.java +++ b/spark/v2.4/spark/src/main/java/org/apache/iceberg/spark/data/SparkParquetWriters.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.data; import java.util.Iterator; @@ -54,12 +53,12 @@ import org.apache.spark.unsafe.types.UTF8String; public class SparkParquetWriters { - private SparkParquetWriters() { - } + private SparkParquetWriters() {} @SuppressWarnings("unchecked") public static ParquetValueWriter buildWriter(StructType dfSchema, MessageType type) { - return (ParquetValueWriter) ParquetWithSparkSchemaVisitor.visit(dfSchema, type, new WriteBuilder(type)); + return (ParquetValueWriter) + ParquetWithSparkSchemaVisitor.visit(dfSchema, type, new WriteBuilder(type)); } private static class WriteBuilder extends ParquetWithSparkSchemaVisitor> { @@ -70,14 +69,14 @@ private static class WriteBuilder extends ParquetWithSparkSchemaVisitor message(StructType sStruct, MessageType message, - List> fieldWriters) { + public ParquetValueWriter message( + StructType sStruct, MessageType message, List> fieldWriters) { return struct(sStruct, message.asGroupType(), fieldWriters); } @Override - public ParquetValueWriter struct(StructType sStruct, GroupType struct, - List> fieldWriters) { + public ParquetValueWriter struct( + StructType sStruct, GroupType struct, List> fieldWriters) { List fields = struct.getFields(); StructField[] sparkFields = sStruct.fields(); List> writers = Lists.newArrayListWithExpectedSize(fieldWriters.size()); @@ -91,31 +90,40 @@ public ParquetValueWriter struct(StructType sStruct, GroupType struct, } @Override - public ParquetValueWriter list(ArrayType sArray, GroupType array, ParquetValueWriter elementWriter) { + public ParquetValueWriter list( + ArrayType sArray, GroupType array, ParquetValueWriter elementWriter) { GroupType repeated = array.getFields().get(0).asGroupType(); String[] repeatedPath = currentPath(); int repeatedD = type.getMaxDefinitionLevel(repeatedPath); int repeatedR = type.getMaxRepetitionLevel(repeatedPath); - return new ArrayDataWriter<>(repeatedD, repeatedR, + return new ArrayDataWriter<>( + repeatedD, + repeatedR, newOption(repeated.getType(0), elementWriter), sArray.elementType()); } @Override - public ParquetValueWriter map(MapType sMap, GroupType map, - ParquetValueWriter keyWriter, ParquetValueWriter valueWriter) { + public ParquetValueWriter map( + MapType sMap, + GroupType map, + ParquetValueWriter keyWriter, + ParquetValueWriter valueWriter) { GroupType repeatedKeyValue = map.getFields().get(0).asGroupType(); String[] repeatedPath = currentPath(); int repeatedD = type.getMaxDefinitionLevel(repeatedPath); int repeatedR = type.getMaxRepetitionLevel(repeatedPath); - 
return new MapDataWriter<>(repeatedD, repeatedR, + return new MapDataWriter<>( + repeatedD, + repeatedR, newOption(repeatedKeyValue.getType(0), keyWriter), newOption(repeatedKeyValue.getType(1), valueWriter), - sMap.keyType(), sMap.valueType()); + sMap.keyType(), + sMap.valueType()); } private ParquetValueWriter newOption(Type fieldType, ParquetValueWriter writer) { @@ -197,18 +205,18 @@ private static PrimitiveWriter utf8Strings(ColumnDescriptor desc) { return new UTF8StringWriter(desc); } - private static PrimitiveWriter decimalAsInteger(ColumnDescriptor desc, - int precision, int scale) { + private static PrimitiveWriter decimalAsInteger( + ColumnDescriptor desc, int precision, int scale) { return new IntegerDecimalWriter(desc, precision, scale); } - private static PrimitiveWriter decimalAsLong(ColumnDescriptor desc, - int precision, int scale) { + private static PrimitiveWriter decimalAsLong( + ColumnDescriptor desc, int precision, int scale) { return new LongDecimalWriter(desc, precision, scale); } - private static PrimitiveWriter decimalAsFixed(ColumnDescriptor desc, - int precision, int scale) { + private static PrimitiveWriter decimalAsFixed( + ColumnDescriptor desc, int precision, int scale) { return new FixedDecimalWriter(desc, precision, scale); } @@ -239,10 +247,18 @@ private IntegerDecimalWriter(ColumnDescriptor desc, int precision, int scale) { @Override public void write(int repetitionLevel, Decimal decimal) { - Preconditions.checkArgument(decimal.scale() == scale, - "Cannot write value as decimal(%s,%s), wrong scale: %s", precision, scale, decimal); - Preconditions.checkArgument(decimal.precision() <= precision, - "Cannot write value as decimal(%s,%s), too large: %s", precision, scale, decimal); + Preconditions.checkArgument( + decimal.scale() == scale, + "Cannot write value as decimal(%s,%s), wrong scale: %s", + precision, + scale, + decimal); + Preconditions.checkArgument( + decimal.precision() <= precision, + "Cannot write value as decimal(%s,%s), too large: %s", + precision, + scale, + decimal); column.writeInteger(repetitionLevel, (int) decimal.toUnscaledLong()); } @@ -260,10 +276,18 @@ private LongDecimalWriter(ColumnDescriptor desc, int precision, int scale) { @Override public void write(int repetitionLevel, Decimal decimal) { - Preconditions.checkArgument(decimal.scale() == scale, - "Cannot write value as decimal(%s,%s), wrong scale: %s", precision, scale, decimal); - Preconditions.checkArgument(decimal.precision() <= precision, - "Cannot write value as decimal(%s,%s), too large: %s", precision, scale, decimal); + Preconditions.checkArgument( + decimal.scale() == scale, + "Cannot write value as decimal(%s,%s), wrong scale: %s", + precision, + scale, + decimal); + Preconditions.checkArgument( + decimal.precision() <= precision, + "Cannot write value as decimal(%s,%s), too large: %s", + precision, + scale, + decimal); column.writeLong(repetitionLevel, decimal.toUnscaledLong()); } @@ -278,12 +302,15 @@ private FixedDecimalWriter(ColumnDescriptor desc, int precision, int scale) { super(desc); this.precision = precision; this.scale = scale; - this.bytes = ThreadLocal.withInitial(() -> new byte[TypeUtil.decimalRequiredBytes(precision)]); + this.bytes = + ThreadLocal.withInitial(() -> new byte[TypeUtil.decimalRequiredBytes(precision)]); } @Override public void write(int repetitionLevel, Decimal decimal) { - byte[] binary = DecimalUtil.toReusedFixLengthBytes(precision, scale, decimal.toJavaBigDecimal(), bytes.get()); + byte[] binary = + DecimalUtil.toReusedFixLengthBytes( + 
precision, scale, decimal.toJavaBigDecimal(), bytes.get()); column.writeBinary(repetitionLevel, Binary.fromReusedByteArray(binary)); } } @@ -302,8 +329,11 @@ public void write(int repetitionLevel, byte[] bytes) { private static class ArrayDataWriter extends RepeatedWriter { private final DataType elementType; - private ArrayDataWriter(int definitionLevel, int repetitionLevel, - ParquetValueWriter writer, DataType elementType) { + private ArrayDataWriter( + int definitionLevel, + int repetitionLevel, + ParquetValueWriter writer, + DataType elementType) { super(definitionLevel, repetitionLevel, writer); this.elementType = elementType; } @@ -354,9 +384,13 @@ private static class MapDataWriter extends RepeatedKeyValueWriter keyWriter, ParquetValueWriter valueWriter, - DataType keyType, DataType valueType) { + private MapDataWriter( + int definitionLevel, + int repetitionLevel, + ParquetValueWriter keyWriter, + ParquetValueWriter valueWriter, + DataType keyType, + DataType valueType) { super(definitionLevel, repetitionLevel, keyWriter, valueWriter); this.keyType = keyType; this.valueType = valueType; diff --git a/spark/v2.4/spark/src/main/java/org/apache/iceberg/spark/data/SparkValueReaders.java b/spark/v2.4/spark/src/main/java/org/apache/iceberg/spark/data/SparkValueReaders.java index 0d3ce2b28d0b..11655c72d857 100644 --- a/spark/v2.4/spark/src/main/java/org/apache/iceberg/spark/data/SparkValueReaders.java +++ b/spark/v2.4/spark/src/main/java/org/apache/iceberg/spark/data/SparkValueReaders.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.data; import java.io.IOException; @@ -44,8 +43,7 @@ public class SparkValueReaders { - private SparkValueReaders() { - } + private SparkValueReaders() {} static ValueReader strings() { return StringReader.INSTANCE; @@ -67,8 +65,8 @@ static ValueReader array(ValueReader elementReader) { return new ArrayReader(elementReader); } - static ValueReader arrayMap(ValueReader keyReader, - ValueReader valueReader) { + static ValueReader arrayMap( + ValueReader keyReader, ValueReader valueReader) { return new ArrayMapReader(keyReader, valueReader); } @@ -76,16 +74,15 @@ static ValueReader map(ValueReader keyReader, ValueReader< return new MapReader(keyReader, valueReader); } - static ValueReader struct(List> readers, Types.StructType struct, - Map idToConstant) { + static ValueReader struct( + List> readers, Types.StructType struct, Map idToConstant) { return new StructReader(readers, struct, idToConstant); } private static class StringReader implements ValueReader { private static final StringReader INSTANCE = new StringReader(); - private StringReader() { - } + private StringReader() {} @Override public UTF8String read(Decoder decoder, Object reuse) throws IOException { @@ -97,10 +94,10 @@ public UTF8String read(Decoder decoder, Object reuse) throws IOException { Utf8 string = decoder.readString(utf8); return UTF8String.fromBytes(string.getBytes(), 0, string.getByteLength()); -// int length = decoder.readInt(); -// byte[] bytes = new byte[length]; -// decoder.readFixed(bytes, 0, length); -// return UTF8String.fromBytes(bytes); + // int length = decoder.readInt(); + // byte[] bytes = new byte[length]; + // decoder.readFixed(bytes, 0, length); + // return UTF8String.fromBytes(bytes); } } @@ -122,16 +119,17 @@ public UTF8String read(Decoder decoder, Object ignore) throws IOException { } private static class UUIDReader implements ValueReader { - private static final ThreadLocal 
BUFFER = ThreadLocal.withInitial(() -> { - ByteBuffer buffer = ByteBuffer.allocate(16); - buffer.order(ByteOrder.BIG_ENDIAN); - return buffer; - }); + private static final ThreadLocal BUFFER = + ThreadLocal.withInitial( + () -> { + ByteBuffer buffer = ByteBuffer.allocate(16); + buffer.order(ByteOrder.BIG_ENDIAN); + return buffer; + }); private static final UUIDReader INSTANCE = new UUIDReader(); - private UUIDReader() { - } + private UUIDReader() {} @Override @SuppressWarnings("ByteBufferBackingArray") @@ -258,14 +256,16 @@ public ArrayBasedMapData read(Decoder decoder, Object reuse) throws IOException static class StructReader extends ValueReaders.StructReader { private final int numFields; - protected StructReader(List> readers, Types.StructType struct, Map idToConstant) { + protected StructReader( + List> readers, Types.StructType struct, Map idToConstant) { super(readers, struct, idToConstant); this.numFields = readers.size(); } @Override protected InternalRow reuseOrCreate(Object reuse) { - if (reuse instanceof GenericInternalRow && ((GenericInternalRow) reuse).numFields() == numFields) { + if (reuse instanceof GenericInternalRow + && ((GenericInternalRow) reuse).numFields() == numFields) { return (InternalRow) reuse; } return new GenericInternalRow(numFields); diff --git a/spark/v2.4/spark/src/main/java/org/apache/iceberg/spark/data/SparkValueWriters.java b/spark/v2.4/spark/src/main/java/org/apache/iceberg/spark/data/SparkValueWriters.java index 24a69c1d7f11..5f2e2c054888 100644 --- a/spark/v2.4/spark/src/main/java/org/apache/iceberg/spark/data/SparkValueWriters.java +++ b/spark/v2.4/spark/src/main/java/org/apache/iceberg/spark/data/SparkValueWriters.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.spark.data; import java.io.IOException; @@ -39,8 +38,7 @@ public class SparkValueWriters { - private SparkValueWriters() { - } + private SparkValueWriters() {} static ValueWriter strings() { return StringWriter.INSTANCE; @@ -75,8 +73,7 @@ static ValueWriter struct(List> writers, List { private static final StringWriter INSTANCE = new StringWriter(); - private StringWriter() { - } + private StringWriter() {} @Override public void write(UTF8String s, Encoder encoder) throws IOException { @@ -88,16 +85,17 @@ public void write(UTF8String s, Encoder encoder) throws IOException { } private static class UUIDWriter implements ValueWriter { - private static final ThreadLocal BUFFER = ThreadLocal.withInitial(() -> { - ByteBuffer buffer = ByteBuffer.allocate(16); - buffer.order(ByteOrder.BIG_ENDIAN); - return buffer; - }); + private static final ThreadLocal BUFFER = + ThreadLocal.withInitial( + () -> { + ByteBuffer buffer = ByteBuffer.allocate(16); + buffer.order(ByteOrder.BIG_ENDIAN); + return buffer; + }); private static final UUIDWriter INSTANCE = new UUIDWriter(); - private UUIDWriter() { - } + private UUIDWriter() {} @Override @SuppressWarnings("ByteBufferBackingArray") @@ -120,12 +118,14 @@ private static class DecimalWriter implements ValueWriter { private DecimalWriter(int precision, int scale) { this.precision = precision; this.scale = scale; - this.bytes = ThreadLocal.withInitial(() -> new byte[TypeUtil.decimalRequiredBytes(precision)]); + this.bytes = + ThreadLocal.withInitial(() -> new byte[TypeUtil.decimalRequiredBytes(precision)]); } @Override public void write(Decimal d, Encoder encoder) throws IOException { - encoder.writeFixed(DecimalUtil.toReusedFixLengthBytes(precision, scale, d.toJavaBigDecimal(), bytes.get())); + encoder.writeFixed( + DecimalUtil.toReusedFixLengthBytes(precision, scale, d.toJavaBigDecimal(), bytes.get())); } } @@ -158,8 +158,11 @@ private static class ArrayMapWriter implements ValueWriter { private final DataType keyType; private final DataType valueType; - private ArrayMapWriter(ValueWriter keyWriter, DataType keyType, - ValueWriter valueWriter, DataType valueType) { + private ArrayMapWriter( + ValueWriter keyWriter, + DataType keyType, + ValueWriter valueWriter, + DataType valueType) { this.keyWriter = keyWriter; this.keyType = keyType; this.valueWriter = valueWriter; @@ -189,8 +192,11 @@ private static class MapWriter implements ValueWriter { private final DataType keyType; private final DataType valueType; - private MapWriter(ValueWriter keyWriter, DataType keyType, - ValueWriter valueWriter, DataType valueType) { + private MapWriter( + ValueWriter keyWriter, + DataType keyType, + ValueWriter valueWriter, + DataType valueType) { this.keyWriter = keyWriter; this.keyType = keyType; this.valueWriter = valueWriter; diff --git a/spark/v2.4/spark/src/main/java/org/apache/iceberg/spark/data/vectorized/ArrowVectorAccessorFactory.java b/spark/v2.4/spark/src/main/java/org/apache/iceberg/spark/data/vectorized/ArrowVectorAccessorFactory.java index 505ace508352..e32ebcb02bbc 100644 --- a/spark/v2.4/spark/src/main/java/org/apache/iceberg/spark/data/vectorized/ArrowVectorAccessorFactory.java +++ b/spark/v2.4/spark/src/main/java/org/apache/iceberg/spark/data/vectorized/ArrowVectorAccessorFactory.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.spark.data.vectorized; import java.math.BigDecimal; @@ -32,10 +31,12 @@ import org.apache.spark.unsafe.types.UTF8String; final class ArrowVectorAccessorFactory - extends GenericArrowVectorAccessorFactory { + extends GenericArrowVectorAccessorFactory< + Decimal, UTF8String, ColumnarArray, ArrowColumnVector> { ArrowVectorAccessorFactory() { - super(DecimalFactoryImpl::new, + super( + DecimalFactoryImpl::new, StringFactoryImpl::new, StructChildFactoryImpl::new, ArrayFactoryImpl::new); @@ -70,9 +71,7 @@ public UTF8String ofRow(VarCharVector vector, int rowId) { int end = vector.getEndOffset(rowId); return UTF8String.fromAddress( - null, - vector.getDataBuffer().memoryAddress() + start, - end - start); + null, vector.getDataBuffer().memoryAddress() + start, end - start); } @Override @@ -84,7 +83,9 @@ public UTF8String ofBytes(byte[] bytes) { public UTF8String ofByteBuffer(ByteBuffer byteBuffer) { if (byteBuffer.hasArray()) { return UTF8String.fromBytes( - byteBuffer.array(), byteBuffer.arrayOffset() + byteBuffer.position(), byteBuffer.remaining()); + byteBuffer.array(), + byteBuffer.arrayOffset() + byteBuffer.position(), + byteBuffer.remaining()); } byte[] bytes = new byte[byteBuffer.remaining()]; byteBuffer.get(bytes); @@ -92,7 +93,8 @@ public UTF8String ofByteBuffer(ByteBuffer byteBuffer) { } } - private static final class ArrayFactoryImpl implements ArrayFactory { + private static final class ArrayFactoryImpl + implements ArrayFactory { @Override public ArrowColumnVector ofChild(ValueVector childVector) { return new ArrowColumnVector(childVector); @@ -108,7 +110,8 @@ public ColumnarArray ofRow(ValueVector vector, ArrowColumnVector childData, int } } - private static final class StructChildFactoryImpl implements StructChildFactory { + private static final class StructChildFactoryImpl + implements StructChildFactory { @Override public Class getGenericClass() { return ArrowColumnVector.class; diff --git a/spark/v2.4/spark/src/main/java/org/apache/iceberg/spark/data/vectorized/ArrowVectorAccessors.java b/spark/v2.4/spark/src/main/java/org/apache/iceberg/spark/data/vectorized/ArrowVectorAccessors.java index f3b3377af2b4..810fef81b5bb 100644 --- a/spark/v2.4/spark/src/main/java/org/apache/iceberg/spark/data/vectorized/ArrowVectorAccessors.java +++ b/spark/v2.4/spark/src/main/java/org/apache/iceberg/spark/data/vectorized/ArrowVectorAccessors.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.data.vectorized; import org.apache.iceberg.arrow.vectorized.ArrowVectorAccessor; @@ -35,6 +34,5 @@ public class ArrowVectorAccessors { return factory.getVectorAccessor(holder); } - private ArrowVectorAccessors() { - } + private ArrowVectorAccessors() {} } diff --git a/spark/v2.4/spark/src/main/java/org/apache/iceberg/spark/data/vectorized/ColumnarBatchReader.java b/spark/v2.4/spark/src/main/java/org/apache/iceberg/spark/data/vectorized/ColumnarBatchReader.java index f71a6968099c..f761b2eb551b 100644 --- a/spark/v2.4/spark/src/main/java/org/apache/iceberg/spark/data/vectorized/ColumnarBatchReader.java +++ b/spark/v2.4/spark/src/main/java/org/apache/iceberg/spark/data/vectorized/ColumnarBatchReader.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.spark.data.vectorized; import java.util.List; @@ -28,9 +27,9 @@ import org.apache.spark.sql.vectorized.ColumnarBatch; /** - * {@link VectorizedReader} that returns Spark's {@link ColumnarBatch} to support Spark's vectorized read path. The - * {@link ColumnarBatch} returned is created by passing in the Arrow vectors populated via delegated read calls to - * {@linkplain VectorizedArrowReader VectorReader(s)}. + * {@link VectorizedReader} that returns Spark's {@link ColumnarBatch} to support Spark's vectorized + * read path. The {@link ColumnarBatch} returned is created by passing in the Arrow vectors + * populated via delegated read calls to {@linkplain VectorizedArrowReader VectorReader(s)}. */ public class ColumnarBatchReader extends BaseBatchReader { @@ -40,7 +39,8 @@ public ColumnarBatchReader(List> readers) { @Override public final ColumnarBatch read(ColumnarBatch reuse, int numRowsToRead) { - Preconditions.checkArgument(numRowsToRead > 0, "Invalid number of rows to read: %s", numRowsToRead); + Preconditions.checkArgument( + numRowsToRead > 0, "Invalid number of rows to read: %s", numRowsToRead); ColumnVector[] arrowColumnVectors = new ColumnVector[readers.length]; if (reuse == null) { @@ -52,10 +52,10 @@ public final ColumnarBatch read(ColumnarBatch reuse, int numRowsToRead) { int numRowsInVector = vectorHolders[i].numValues(); Preconditions.checkState( numRowsInVector == numRowsToRead, - "Number of rows in the vector %s didn't match expected %s ", numRowsInVector, + "Number of rows in the vector %s didn't match expected %s ", + numRowsInVector, numRowsToRead); - arrowColumnVectors[i] = - IcebergArrowColumnVector.forHolder(vectorHolders[i], numRowsInVector); + arrowColumnVectors[i] = IcebergArrowColumnVector.forHolder(vectorHolders[i], numRowsInVector); } ColumnarBatch batch = new ColumnarBatch(arrowColumnVectors); batch.setNumRows(numRowsToRead); diff --git a/spark/v2.4/spark/src/main/java/org/apache/iceberg/spark/data/vectorized/ConstantColumnVector.java b/spark/v2.4/spark/src/main/java/org/apache/iceberg/spark/data/vectorized/ConstantColumnVector.java index 7e3d94a97375..8a0b329ebd52 100644 --- a/spark/v2.4/spark/src/main/java/org/apache/iceberg/spark/data/vectorized/ConstantColumnVector.java +++ b/spark/v2.4/spark/src/main/java/org/apache/iceberg/spark/data/vectorized/ConstantColumnVector.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.data.vectorized; import org.apache.iceberg.spark.SparkSchemaUtil; @@ -39,8 +38,7 @@ class ConstantColumnVector extends ColumnVector { } @Override - public void close() { - } + public void close() {} @Override public boolean hasNull() { diff --git a/spark/v2.4/spark/src/main/java/org/apache/iceberg/spark/data/vectorized/IcebergArrowColumnVector.java b/spark/v2.4/spark/src/main/java/org/apache/iceberg/spark/data/vectorized/IcebergArrowColumnVector.java index 514eec84fe82..33c1a5284818 100644 --- a/spark/v2.4/spark/src/main/java/org/apache/iceberg/spark/data/vectorized/IcebergArrowColumnVector.java +++ b/spark/v2.4/spark/src/main/java/org/apache/iceberg/spark/data/vectorized/IcebergArrowColumnVector.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.spark.data.vectorized; import org.apache.iceberg.arrow.vectorized.ArrowVectorAccessor; @@ -33,9 +32,10 @@ import org.apache.spark.unsafe.types.UTF8String; /** - * Implementation of Spark's {@link ColumnVector} interface. The code for this class is heavily inspired from Spark's - * {@link ArrowColumnVector} The main difference is in how nullability checks are made in this class by relying on - * {@link NullabilityHolder} instead of the validity vector in the Arrow vector. + * Implementation of Spark's {@link ColumnVector} interface. The code for this class is heavily + * inspired from Spark's {@link ArrowColumnVector} The main difference is in how nullability checks + * are made in this class by relying on {@link NullabilityHolder} instead of the validity vector in + * the Arrow vector. */ public class IcebergArrowColumnVector extends ColumnVector { @@ -146,12 +146,14 @@ public ArrowColumnVector getChild(int ordinal) { } static ColumnVector forHolder(VectorHolder holder, int numRows) { - return holder.isDummy() ? - new ConstantColumnVector(Types.IntegerType.get(), numRows, ((ConstantVectorHolder) holder).getConstant()) : - new IcebergArrowColumnVector(holder); + return holder.isDummy() + ? new ConstantColumnVector( + Types.IntegerType.get(), numRows, ((ConstantVectorHolder) holder).getConstant()) + : new IcebergArrowColumnVector(holder); } - public ArrowVectorAccessor vectorAccessor() { + public ArrowVectorAccessor + vectorAccessor() { return accessor; } } diff --git a/spark/v2.4/spark/src/main/java/org/apache/iceberg/spark/data/vectorized/RowPositionColumnVector.java b/spark/v2.4/spark/src/main/java/org/apache/iceberg/spark/data/vectorized/RowPositionColumnVector.java index f0d35c665b6b..a4d878b63569 100644 --- a/spark/v2.4/spark/src/main/java/org/apache/iceberg/spark/data/vectorized/RowPositionColumnVector.java +++ b/spark/v2.4/spark/src/main/java/org/apache/iceberg/spark/data/vectorized/RowPositionColumnVector.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.data.vectorized; import org.apache.iceberg.spark.SparkSchemaUtil; @@ -37,8 +36,7 @@ public class RowPositionColumnVector extends ColumnVector { } @Override - public void close() { - } + public void close() {} @Override public boolean hasNull() { diff --git a/spark/v2.4/spark/src/main/java/org/apache/iceberg/spark/data/vectorized/VectorizedSparkOrcReaders.java b/spark/v2.4/spark/src/main/java/org/apache/iceberg/spark/data/vectorized/VectorizedSparkOrcReaders.java index 418c25993a7e..7c3b825a62e7 100644 --- a/spark/v2.4/spark/src/main/java/org/apache/iceberg/spark/data/vectorized/VectorizedSparkOrcReaders.java +++ b/spark/v2.4/spark/src/main/java/org/apache/iceberg/spark/data/vectorized/VectorizedSparkOrcReaders.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.spark.data.vectorized; import java.util.List; @@ -47,23 +46,27 @@ public class VectorizedSparkOrcReaders { - private VectorizedSparkOrcReaders() { - } + private VectorizedSparkOrcReaders() {} - public static OrcBatchReader buildReader(Schema expectedSchema, TypeDescription fileSchema, - Map idToConstant) { - Converter converter = OrcSchemaWithTypeVisitor.visit(expectedSchema, fileSchema, new ReadBuilder(idToConstant)); + public static OrcBatchReader buildReader( + Schema expectedSchema, TypeDescription fileSchema, Map idToConstant) { + Converter converter = + OrcSchemaWithTypeVisitor.visit(expectedSchema, fileSchema, new ReadBuilder(idToConstant)); return new OrcBatchReader() { private long batchOffsetInFile; @Override public ColumnarBatch read(VectorizedRowBatch batch) { - BaseOrcColumnVector cv = (BaseOrcColumnVector) converter.convert(new StructColumnVector(batch.size, batch.cols), - batch.size, batchOffsetInFile); - ColumnarBatch columnarBatch = new ColumnarBatch(IntStream.range(0, expectedSchema.columns().size()) - .mapToObj(cv::getChild) - .toArray(ColumnVector[]::new)); + BaseOrcColumnVector cv = + (BaseOrcColumnVector) + converter.convert( + new StructColumnVector(batch.size, batch.cols), batch.size, batchOffsetInFile); + ColumnarBatch columnarBatch = + new ColumnarBatch( + IntStream.range(0, expectedSchema.columns().size()) + .mapToObj(cv::getChild) + .toArray(ColumnVector[]::new)); columnarBatch.setNumRows(batch.size); return columnarBatch; } @@ -76,8 +79,10 @@ public void setBatchContext(long batchOffsetInFile) { } private interface Converter { - ColumnVector convert(org.apache.orc.storage.ql.exec.vector.ColumnVector columnVector, int batchSize, - long batchOffsetInFile); + ColumnVector convert( + org.apache.orc.storage.ql.exec.vector.ColumnVector columnVector, + int batchSize, + long batchOffsetInFile); } private static class ReadBuilder extends OrcSchemaWithTypeVisitor { @@ -88,8 +93,11 @@ private ReadBuilder(Map idToConstant) { } @Override - public Converter record(Types.StructType iStruct, TypeDescription record, List names, - List fields) { + public Converter record( + Types.StructType iStruct, + TypeDescription record, + List names, + List fields) { return new StructConverter(iStruct, fields, idToConstant); } @@ -132,7 +140,8 @@ public Converter primitive(Type.PrimitiveType iPrimitive, TypeDescription primit primitiveValueReader = SparkOrcValueReaders.timestampTzs(); break; case DECIMAL: - primitiveValueReader = SparkOrcValueReaders.decimals(primitive.getPrecision(), primitive.getScale()); + primitiveValueReader = + SparkOrcValueReaders.decimals(primitive.getPrecision(), primitive.getScale()); break; case CHAR: case VARCHAR: @@ -146,7 +155,8 @@ public Converter primitive(Type.PrimitiveType iPrimitive, TypeDescription primit throw new IllegalArgumentException("Unhandled type " + primitive); } return (columnVector, batchSize, batchOffsetInFile) -> - new PrimitiveOrcColumnVector(iPrimitive, batchSize, columnVector, primitiveValueReader, batchOffsetInFile); + new PrimitiveOrcColumnVector( + iPrimitive, batchSize, columnVector, primitiveValueReader, batchOffsetInFile); } } @@ -155,15 +165,15 @@ private abstract static class BaseOrcColumnVector extends ColumnVector { private final int batchSize; private Integer numNulls; - BaseOrcColumnVector(Type type, int batchSize, org.apache.orc.storage.ql.exec.vector.ColumnVector vector) { + BaseOrcColumnVector( + Type type, int batchSize, org.apache.orc.storage.ql.exec.vector.ColumnVector vector) { 
super(SparkSchemaUtil.convert(type)); this.vector = vector; this.batchSize = batchSize; } @Override - public void close() { - } + public void close() {} @Override public boolean hasNull() { @@ -278,8 +288,12 @@ private static class PrimitiveOrcColumnVector extends BaseOrcColumnVector { private final OrcValueReader primitiveValueReader; private final long batchOffsetInFile; - PrimitiveOrcColumnVector(Type type, int batchSize, org.apache.orc.storage.ql.exec.vector.ColumnVector vector, - OrcValueReader primitiveValueReader, long batchOffsetInFile) { + PrimitiveOrcColumnVector( + Type type, + int batchSize, + org.apache.orc.storage.ql.exec.vector.ColumnVector vector, + OrcValueReader primitiveValueReader, + long batchOffsetInFile) { super(type, batchSize, vector); this.vector = vector; this.primitiveValueReader = primitiveValueReader; @@ -313,7 +327,8 @@ public double getDouble(int rowId) { @Override public Decimal getDecimal(int rowId, int precision, int scale) { - // TODO: Is it okay to assume that (precision,scale) parameters == (precision,scale) of the decimal type + // TODO: Is it okay to assume that (precision,scale) parameters == (precision,scale) of the + // decimal type // and return a Decimal with (precision,scale) of the decimal type? return (Decimal) primitiveValueReader.read(vector, rowId); } @@ -339,16 +354,20 @@ private ArrayConverter(Types.ListType listType, Converter elementConverter) { } @Override - public ColumnVector convert(org.apache.orc.storage.ql.exec.vector.ColumnVector vector, int batchSize, - long batchOffsetInFile) { + public ColumnVector convert( + org.apache.orc.storage.ql.exec.vector.ColumnVector vector, + int batchSize, + long batchOffsetInFile) { ListColumnVector listVector = (ListColumnVector) vector; - ColumnVector elementVector = elementConverter.convert(listVector.child, batchSize, batchOffsetInFile); + ColumnVector elementVector = + elementConverter.convert(listVector.child, batchSize, batchOffsetInFile); return new BaseOrcColumnVector(listType, batchSize, vector) { @Override public ColumnarArray getArray(int rowId) { int index = getRowIndex(rowId); - return new ColumnarArray(elementVector, (int) listVector.offsets[index], (int) listVector.lengths[index]); + return new ColumnarArray( + elementVector, (int) listVector.offsets[index], (int) listVector.lengths[index]); } }; } @@ -366,17 +385,23 @@ private MapConverter(Types.MapType mapType, Converter keyConverter, Converter va } @Override - public ColumnVector convert(org.apache.orc.storage.ql.exec.vector.ColumnVector vector, int batchSize, - long batchOffsetInFile) { + public ColumnVector convert( + org.apache.orc.storage.ql.exec.vector.ColumnVector vector, + int batchSize, + long batchOffsetInFile) { MapColumnVector mapVector = (MapColumnVector) vector; ColumnVector keyVector = keyConverter.convert(mapVector.keys, batchSize, batchOffsetInFile); - ColumnVector valueVector = valueConverter.convert(mapVector.values, batchSize, batchOffsetInFile); + ColumnVector valueVector = + valueConverter.convert(mapVector.values, batchSize, batchOffsetInFile); return new BaseOrcColumnVector(mapType, batchSize, vector) { @Override public ColumnarMap getMap(int rowId) { int index = getRowIndex(rowId); - return new ColumnarMap(keyVector, valueVector, (int) mapVector.offsets[index], + return new ColumnarMap( + keyVector, + valueVector, + (int) mapVector.offsets[index], (int) mapVector.lengths[index]); } }; @@ -388,30 +413,37 @@ private static class StructConverter implements Converter { private final List fieldConverters; 
private final Map idToConstant; - private StructConverter(Types.StructType structType, List fieldConverters, - Map idToConstant) { + private StructConverter( + Types.StructType structType, + List fieldConverters, + Map idToConstant) { this.structType = structType; this.fieldConverters = fieldConverters; this.idToConstant = idToConstant; } @Override - public ColumnVector convert(org.apache.orc.storage.ql.exec.vector.ColumnVector vector, int batchSize, - long batchOffsetInFile) { + public ColumnVector convert( + org.apache.orc.storage.ql.exec.vector.ColumnVector vector, + int batchSize, + long batchOffsetInFile) { StructColumnVector structVector = (StructColumnVector) vector; List fields = structType.fields(); List fieldVectors = Lists.newArrayListWithExpectedSize(fields.size()); for (int pos = 0, vectorIndex = 0; pos < fields.size(); pos += 1) { Types.NestedField field = fields.get(pos); if (idToConstant.containsKey(field.fieldId())) { - fieldVectors.add(new ConstantColumnVector(field.type(), batchSize, idToConstant.get(field.fieldId()))); + fieldVectors.add( + new ConstantColumnVector(field.type(), batchSize, idToConstant.get(field.fieldId()))); } else if (field.equals(MetadataColumns.ROW_POSITION)) { fieldVectors.add(new RowPositionColumnVector(batchOffsetInFile)); } else if (field.equals(MetadataColumns.IS_DELETED)) { fieldVectors.add(new ConstantColumnVector(field.type(), batchSize, false)); } else { - fieldVectors.add(fieldConverters.get(vectorIndex) - .convert(structVector.fields[vectorIndex], batchSize, batchOffsetInFile)); + fieldVectors.add( + fieldConverters + .get(vectorIndex) + .convert(structVector.fields[vectorIndex], batchSize, batchOffsetInFile)); vectorIndex++; } } diff --git a/spark/v2.4/spark/src/main/java/org/apache/iceberg/spark/data/vectorized/VectorizedSparkParquetReaders.java b/spark/v2.4/spark/src/main/java/org/apache/iceberg/spark/data/vectorized/VectorizedSparkParquetReaders.java index b2d582352d74..bbb63e077bc6 100644 --- a/spark/v2.4/spark/src/main/java/org/apache/iceberg/spark/data/vectorized/VectorizedSparkParquetReaders.java +++ b/spark/v2.4/spark/src/main/java/org/apache/iceberg/spark/data/vectorized/VectorizedSparkParquetReaders.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.spark.data.vectorized; import java.util.Map; @@ -28,13 +27,10 @@ public class VectorizedSparkParquetReaders { - private VectorizedSparkParquetReaders() { - } + private VectorizedSparkParquetReaders() {} public static ColumnarBatchReader buildReader( - Schema expectedSchema, - MessageType fileSchema, - boolean setArrowValidityVector) { + Schema expectedSchema, MessageType fileSchema, boolean setArrowValidityVector) { return buildReader(expectedSchema, fileSchema, setArrowValidityVector, Maps.newHashMap()); } @@ -44,9 +40,14 @@ public static ColumnarBatchReader buildReader( boolean setArrowValidityVector, Map idToConstant) { return (ColumnarBatchReader) - TypeWithSchemaVisitor.visit(expectedSchema.asStruct(), fileSchema, + TypeWithSchemaVisitor.visit( + expectedSchema.asStruct(), + fileSchema, new VectorizedReaderBuilder( - expectedSchema, fileSchema, setArrowValidityVector, - idToConstant, ColumnarBatchReader::new)); + expectedSchema, + fileSchema, + setArrowValidityVector, + idToConstant, + ColumnarBatchReader::new)); } } diff --git a/spark/v2.4/spark/src/main/java/org/apache/iceberg/spark/source/BaseDataReader.java b/spark/v2.4/spark/src/main/java/org/apache/iceberg/spark/source/BaseDataReader.java index b58745c7a00d..2cab8ee238e0 100644 --- a/spark/v2.4/spark/src/main/java/org/apache/iceberg/spark/source/BaseDataReader.java +++ b/spark/v2.4/spark/src/main/java/org/apache/iceberg/spark/source/BaseDataReader.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.source; import java.io.Closeable; @@ -76,10 +75,16 @@ abstract class BaseDataReader implements Closeable { this.tasks = task.files().iterator(); Map keyMetadata = Maps.newHashMap(); task.files().stream() - .flatMap(fileScanTask -> Stream.concat(Stream.of(fileScanTask.file()), fileScanTask.deletes().stream())) + .flatMap( + fileScanTask -> + Stream.concat(Stream.of(fileScanTask.file()), fileScanTask.deletes().stream())) .forEach(file -> keyMetadata.put(file.path().toString(), file.keyMetadata())); - Stream encrypted = keyMetadata.entrySet().stream() - .map(entry -> EncryptedFiles.encryptedInput(table.io().newInputFile(entry.getKey()), entry.getValue())); + Stream encrypted = + keyMetadata.entrySet().stream() + .map( + entry -> + EncryptedFiles.encryptedInput( + table.io().newInputFile(entry.getKey()), entry.getValue())); // decrypt with the batch call to avoid multiple RPCs to a key server, if possible Iterable decryptedFiles = table.encryption().decrypt(encrypted::iterator); @@ -188,7 +193,8 @@ protected static Object convertConstant(Type type, Object value) { for (int index = 0; index < fields.size(); index++) { NestedField field = fields.get(index); Type fieldType = field.type(); - values[index] = convertConstant(fieldType, struct.get(index, fieldType.typeId().javaClass())); + values[index] = + convertConstant(fieldType, struct.get(index, fieldType.typeId().javaClass())); } return new GenericInternalRow(values); diff --git a/spark/v2.4/spark/src/main/java/org/apache/iceberg/spark/source/BatchDataReader.java b/spark/v2.4/spark/src/main/java/org/apache/iceberg/spark/source/BatchDataReader.java index e4bd3ceba6ce..d620faa979f6 100644 --- a/spark/v2.4/spark/src/main/java/org/apache/iceberg/spark/source/BatchDataReader.java +++ b/spark/v2.4/spark/src/main/java/org/apache/iceberg/spark/source/BatchDataReader.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.spark.source; import java.util.Map; @@ -50,7 +49,8 @@ class BatchDataReader extends BaseDataReader { private final boolean caseSensitive; private final int batchSize; - BatchDataReader(CombinedScanTask task, Table table, Schema expectedSchema, boolean caseSensitive, int size) { + BatchDataReader( + CombinedScanTask task, Table table, Schema expectedSchema, boolean caseSensitive, int size) { super(table, task); this.expectedSchema = expectedSchema; this.nameMapping = table.properties().get(TableProperties.DEFAULT_NAME_MAPPING); @@ -71,18 +71,26 @@ CloseableIterator open(FileScanTask task) { InputFile location = getInputFile(task); Preconditions.checkNotNull(location, "Could not find InputFile associated with FileScanTask"); if (task.file().format() == FileFormat.PARQUET) { - Parquet.ReadBuilder builder = Parquet.read(location) - .project(expectedSchema) - .split(task.start(), task.length()) - .createBatchedReaderFunc(fileSchema -> VectorizedSparkParquetReaders.buildReader(expectedSchema, - fileSchema, /* setArrowValidityVector */ NullCheckingForGet.NULL_CHECKING_ENABLED, idToConstant)) - .recordsPerBatch(batchSize) - .filter(task.residual()) - .caseSensitive(caseSensitive) - // Spark eagerly consumes the batches. So the underlying memory allocated could be reused - // without worrying about subsequent reads clobbering over each other. This improves - // read performance as every batch read doesn't have to pay the cost of allocating memory. - .reuseContainers(); + Parquet.ReadBuilder builder = + Parquet.read(location) + .project(expectedSchema) + .split(task.start(), task.length()) + .createBatchedReaderFunc( + fileSchema -> + VectorizedSparkParquetReaders.buildReader( + expectedSchema, + fileSchema, /* setArrowValidityVector */ + NullCheckingForGet.NULL_CHECKING_ENABLED, + idToConstant)) + .recordsPerBatch(batchSize) + .filter(task.residual()) + .caseSensitive(caseSensitive) + // Spark eagerly consumes the batches. So the underlying memory allocated could be + // reused + // without worrying about subsequent reads clobbering over each other. This improves + // read performance as every batch read doesn't have to pay the cost of allocating + // memory. 
+ .reuseContainers(); if (nameMapping != null) { builder.withNameMapping(NameMappingParser.fromJson(nameMapping)); @@ -92,16 +100,21 @@ CloseableIterator open(FileScanTask task) { } else if (task.file().format() == FileFormat.ORC) { Set constantFieldIds = idToConstant.keySet(); Set metadataFieldIds = MetadataColumns.metadataFieldIds(); - Sets.SetView constantAndMetadataFieldIds = Sets.union(constantFieldIds, metadataFieldIds); - Schema schemaWithoutConstantAndMetadataFields = TypeUtil.selectNot(expectedSchema, constantAndMetadataFieldIds); - ORC.ReadBuilder builder = ORC.read(location) - .project(schemaWithoutConstantAndMetadataFields) - .split(task.start(), task.length()) - .createBatchedReaderFunc(fileSchema -> VectorizedSparkOrcReaders.buildReader(expectedSchema, fileSchema, - idToConstant)) - .recordsPerBatch(batchSize) - .filter(task.residual()) - .caseSensitive(caseSensitive); + Sets.SetView constantAndMetadataFieldIds = + Sets.union(constantFieldIds, metadataFieldIds); + Schema schemaWithoutConstantAndMetadataFields = + TypeUtil.selectNot(expectedSchema, constantAndMetadataFieldIds); + ORC.ReadBuilder builder = + ORC.read(location) + .project(schemaWithoutConstantAndMetadataFields) + .split(task.start(), task.length()) + .createBatchedReaderFunc( + fileSchema -> + VectorizedSparkOrcReaders.buildReader( + expectedSchema, fileSchema, idToConstant)) + .recordsPerBatch(batchSize) + .filter(task.residual()) + .caseSensitive(caseSensitive); if (nameMapping != null) { builder.withNameMapping(NameMappingParser.fromJson(nameMapping)); diff --git a/spark/v2.4/spark/src/main/java/org/apache/iceberg/spark/source/CustomCatalogs.java b/spark/v2.4/spark/src/main/java/org/apache/iceberg/spark/source/CustomCatalogs.java index 44f2851fd258..9eda420f76ea 100644 --- a/spark/v2.4/spark/src/main/java/org/apache/iceberg/spark/source/CustomCatalogs.java +++ b/spark/v2.4/spark/src/main/java/org/apache/iceberg/spark/source/CustomCatalogs.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.source; import com.github.benmanes.caffeine.cache.Cache; @@ -39,22 +38,20 @@ import org.apache.spark.sql.SparkSession; public final class CustomCatalogs { - private static final Cache, Catalog> CATALOG_CACHE = Caffeine.newBuilder().build(); + private static final Cache, Catalog> CATALOG_CACHE = + Caffeine.newBuilder().build(); public static final String ICEBERG_DEFAULT_CATALOG = "default_catalog"; public static final String ICEBERG_CATALOG_PREFIX = "spark.sql.catalog"; - private CustomCatalogs() { - } + private CustomCatalogs() {} /** * Build an Iceberg {@link Catalog} to be used by this Spark source adapter. * - *
<p>
    - * The cache is to facilitate reuse of catalogs, especially if wrapped in CachingCatalog. For non-Hive catalogs all - * custom parameters passed to the catalog are considered in the cache key. Hive catalogs only cache based on - * the Metastore URIs as per previous behaviour. - * + *
<p>
    The cache is to facilitate reuse of catalogs, especially if wrapped in CachingCatalog. For + * non-Hive catalogs all custom parameters passed to the catalog are considered in the cache key. + * Hive catalogs only cache based on the Metastore URIs as per previous behaviour. * * @param spark Spark Session * @param name Catalog Name @@ -71,15 +68,15 @@ private static Catalog buildCatalog(Pair sparkAndName) { Configuration conf = SparkUtil.hadoopConfCatalogOverrides(spark, name); String catalogPrefix = String.format("%s.%s", ICEBERG_CATALOG_PREFIX, name); - if (!name.equals(ICEBERG_DEFAULT_CATALOG) && - !sparkConf.contains(catalogPrefix)) { + if (!name.equals(ICEBERG_DEFAULT_CATALOG) && !sparkConf.contains(catalogPrefix)) { // we return null if spark.sql.catalog. is not the Spark Catalog // and we aren't looking for the default catalog return null; } - Map options = Arrays.stream(sparkConf.getAllWithPrefix(catalogPrefix + ".")) - .collect(Collectors.toMap(x -> x._1, x -> x._2)); + Map options = + Arrays.stream(sparkConf.getAllWithPrefix(catalogPrefix + ".")) + .collect(Collectors.toMap(x -> x._1, x -> x._2)); return CatalogUtil.buildIcebergCatalog(name, options, conf); } @@ -89,10 +86,12 @@ public static Table table(SparkSession spark, String path) { return catalogAndTableIdentifier.first().loadTable(catalogAndTableIdentifier.second()); } - private static Pair catalogAndIdentifier(SparkSession spark, String path) { - String[] currentNamespace = new String[]{spark.catalog().currentDatabase()}; + private static Pair catalogAndIdentifier( + SparkSession spark, String path) { + String[] currentNamespace = new String[] {spark.catalog().currentDatabase()}; List nameParts = Splitter.on(".").splitToList(path); - return SparkUtil.catalogAndIdentifier(nameParts, + return SparkUtil.catalogAndIdentifier( + nameParts, s -> loadCatalog(spark, s), (n, t) -> TableIdentifier.of(Namespace.of(n), t), loadCatalog(spark, ICEBERG_DEFAULT_CATALOG), diff --git a/spark/v2.4/spark/src/main/java/org/apache/iceberg/spark/source/EqualityDeleteRowReader.java b/spark/v2.4/spark/src/main/java/org/apache/iceberg/spark/source/EqualityDeleteRowReader.java index ce2226f4f75e..1c55e1b8ebe2 100644 --- a/spark/v2.4/spark/src/main/java/org/apache/iceberg/spark/source/EqualityDeleteRowReader.java +++ b/spark/v2.4/spark/src/main/java/org/apache/iceberg/spark/source/EqualityDeleteRowReader.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.source; import java.util.Map; @@ -32,7 +31,8 @@ public class EqualityDeleteRowReader extends RowDataReader { private final Schema expectedSchema; - public EqualityDeleteRowReader(CombinedScanTask task, Table table, Schema expectedSchema, boolean caseSensitive) { + public EqualityDeleteRowReader( + CombinedScanTask task, Table table, Schema expectedSchema, boolean caseSensitive) { super(task, table, table.schema(), caseSensitive); this.expectedSchema = expectedSchema; } diff --git a/spark/v2.4/spark/src/main/java/org/apache/iceberg/spark/source/IcebergSource.java b/spark/v2.4/spark/src/main/java/org/apache/iceberg/spark/source/IcebergSource.java index 8976d666f168..f3ceee176a94 100644 --- a/spark/v2.4/spark/src/main/java/org/apache/iceberg/spark/source/IcebergSource.java +++ b/spark/v2.4/spark/src/main/java/org/apache/iceberg/spark/source/IcebergSource.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.spark.source; import java.util.Map; @@ -45,7 +44,8 @@ import org.apache.spark.sql.streaming.OutputMode; import org.apache.spark.sql.types.StructType; -public class IcebergSource implements DataSourceV2, ReadSupport, WriteSupport, DataSourceRegister, StreamWriteSupport { +public class IcebergSource + implements DataSourceV2, ReadSupport, WriteSupport, DataSourceRegister, StreamWriteSupport { private SparkSession lazySpark = null; private Configuration lazyConf = null; @@ -66,7 +66,8 @@ public DataSourceReader createReader(StructType readSchema, DataSourceOptions op Table table = getTableAndResolveHadoopConfiguration(options, conf); String caseSensitive = lazySparkSession().conf().get("spark.sql.caseSensitive"); - Reader reader = new Reader(lazySparkSession(), table, Boolean.parseBoolean(caseSensitive), options); + Reader reader = + new Reader(lazySparkSession(), table, Boolean.parseBoolean(caseSensitive), options); if (readSchema != null) { // convert() will fail if readSchema contains fields not in reader.snapshotSchema() SparkSchemaUtil.convert(reader.snapshotSchema(), readSchema); @@ -77,52 +78,69 @@ public DataSourceReader createReader(StructType readSchema, DataSourceOptions op } @Override - public Optional createWriter(String jobId, StructType dsStruct, SaveMode mode, - DataSourceOptions options) { - Preconditions.checkArgument(mode == SaveMode.Append || mode == SaveMode.Overwrite, - "Save mode %s is not supported", mode); + public Optional createWriter( + String jobId, StructType dsStruct, SaveMode mode, DataSourceOptions options) { + Preconditions.checkArgument( + mode == SaveMode.Append || mode == SaveMode.Overwrite, + "Save mode %s is not supported", + mode); Configuration conf = new Configuration(lazyBaseConf()); Table table = getTableAndResolveHadoopConfiguration(options, conf); SparkWriteConf writeConf = new SparkWriteConf(lazySparkSession(), table, options.asMap()); Preconditions.checkArgument( - writeConf.handleTimestampWithoutZone() || !SparkUtil.hasTimestampWithoutZone(table.schema()), + writeConf.handleTimestampWithoutZone() + || !SparkUtil.hasTimestampWithoutZone(table.schema()), SparkUtil.TIMESTAMP_WITHOUT_TIMEZONE_ERROR); Schema writeSchema = SparkSchemaUtil.convert(table.schema(), dsStruct); - TypeUtil.validateWriteSchema(table.schema(), writeSchema, writeConf.checkNullability(), writeConf.checkOrdering()); + TypeUtil.validateWriteSchema( + table.schema(), writeSchema, writeConf.checkNullability(), writeConf.checkOrdering()); SparkUtil.validatePartitionTransforms(table.spec()); String appId = lazySparkSession().sparkContext().applicationId(); String wapId = writeConf.wapId(); boolean replacePartitions = mode == SaveMode.Overwrite; - return Optional.of(new Writer( - lazySparkSession(), table, writeConf, replacePartitions, appId, wapId, writeSchema, dsStruct)); + return Optional.of( + new Writer( + lazySparkSession(), + table, + writeConf, + replacePartitions, + appId, + wapId, + writeSchema, + dsStruct)); } @Override - public StreamWriter createStreamWriter(String runId, StructType dsStruct, - OutputMode mode, DataSourceOptions options) { + public StreamWriter createStreamWriter( + String runId, StructType dsStruct, OutputMode mode, DataSourceOptions options) { Preconditions.checkArgument( mode == OutputMode.Append() || mode == OutputMode.Complete(), - "Output mode %s is not supported", mode); + "Output mode %s is not supported", + mode); Configuration conf = new Configuration(lazyBaseConf()); Table table = 
getTableAndResolveHadoopConfiguration(options, conf); SparkWriteConf writeConf = new SparkWriteConf(lazySparkSession(), table, options.asMap()); Preconditions.checkArgument( - writeConf.handleTimestampWithoutZone() || !SparkUtil.hasTimestampWithoutZone(table.schema()), + writeConf.handleTimestampWithoutZone() + || !SparkUtil.hasTimestampWithoutZone(table.schema()), SparkUtil.TIMESTAMP_WITHOUT_TIMEZONE_ERROR); Schema writeSchema = SparkSchemaUtil.convert(table.schema(), dsStruct); - TypeUtil.validateWriteSchema(table.schema(), writeSchema, writeConf.checkNullability(), writeConf.checkOrdering()); + TypeUtil.validateWriteSchema( + table.schema(), writeSchema, writeConf.checkNullability(), writeConf.checkOrdering()); SparkUtil.validatePartitionTransforms(table.spec()); // Spark 2.4.x passes runId to createStreamWriter instead of real queryId, // so we fetch it directly from sparkContext to make writes idempotent - String queryId = lazySparkSession().sparkContext().getLocalProperty(StreamExecution.QUERY_ID_KEY()); + String queryId = + lazySparkSession().sparkContext().getLocalProperty(StreamExecution.QUERY_ID_KEY()); String appId = lazySparkSession().sparkContext().applicationId(); - return new StreamingWriter(lazySparkSession(), table, writeConf, queryId, mode, appId, writeSchema, dsStruct); + return new StreamingWriter( + lazySparkSession(), table, writeConf, queryId, mode, appId, writeSchema, dsStruct); } protected Table findTable(DataSourceOptions options, Configuration conf) { @@ -163,8 +181,7 @@ private Table getTableAndResolveHadoopConfiguration( return table; } - private static void mergeIcebergHadoopConfs( - Configuration baseConf, Map options) { + private static void mergeIcebergHadoopConfs(Configuration baseConf, Map options) { options.keySet().stream() .filter(key -> key.startsWith("hadoop.")) .forEach(key -> baseConf.set(key.replaceFirst("hadoop.", ""), options.get(key))); diff --git a/spark/v2.4/spark/src/main/java/org/apache/iceberg/spark/source/InternalRowWrapper.java b/spark/v2.4/spark/src/main/java/org/apache/iceberg/spark/source/InternalRowWrapper.java index ef1eb08d873c..524266f6f83a 100644 --- a/spark/v2.4/spark/src/main/java/org/apache/iceberg/spark/source/InternalRowWrapper.java +++ b/spark/v2.4/spark/src/main/java/org/apache/iceberg/spark/source/InternalRowWrapper.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.spark.source; import java.nio.ByteBuffer; @@ -32,8 +31,8 @@ import org.apache.spark.sql.types.StructType; /** - * Class to adapt a Spark {@code InternalRow} to Iceberg {@link StructLike} for uses like - * {@link org.apache.iceberg.PartitionKey#partition(StructLike)} + * Class to adapt a Spark {@code InternalRow} to Iceberg {@link StructLike} for uses like {@link + * org.apache.iceberg.PartitionKey#partition(StructLike)} */ class InternalRowWrapper implements StructLike { private final DataType[] types; @@ -42,12 +41,8 @@ class InternalRowWrapper implements StructLike { @SuppressWarnings("unchecked") InternalRowWrapper(StructType rowType) { - this.types = Stream.of(rowType.fields()) - .map(StructField::dataType) - .toArray(DataType[]::new); - this.getters = Stream.of(types) - .map(InternalRowWrapper::getter) - .toArray(BiFunction[]::new); + this.types = Stream.of(rowType.fields()).map(StructField::dataType).toArray(DataType[]::new); + this.getters = Stream.of(types).map(InternalRowWrapper::getter).toArray(BiFunction[]::new); } InternalRowWrapper wrap(InternalRow internalRow) { diff --git a/spark/v2.4/spark/src/main/java/org/apache/iceberg/spark/source/Reader.java b/spark/v2.4/spark/src/main/java/org/apache/iceberg/spark/source/Reader.java index b1c6e4d780fb..94396d218304 100644 --- a/spark/v2.4/spark/src/main/java/org/apache/iceberg/spark/source/Reader.java +++ b/spark/v2.4/spark/src/main/java/org/apache/iceberg/spark/source/Reader.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.source; import java.io.IOException; @@ -76,8 +75,12 @@ import org.slf4j.Logger; import org.slf4j.LoggerFactory; -class Reader implements DataSourceReader, SupportsScanColumnarBatch, SupportsPushDownFilters, - SupportsPushDownRequiredColumns, SupportsReportStatistics { +class Reader + implements DataSourceReader, + SupportsScanColumnarBatch, + SupportsPushDownFilters, + SupportsPushDownRequiredColumns, + SupportsReportStatistics { private static final Logger LOG = LoggerFactory.getLogger(Reader.class); private static final Filter[] NO_FILTERS = new Filter[0]; @@ -122,8 +125,11 @@ class Reader implements DataSourceReader, SupportsScanColumnarBatch, SupportsPus LOG.warn("Failed to get Hadoop Filesystem", ioe); } String scheme = fsscheme; // Makes an effectively final version of scheme - this.localityPreferred = options.get("locality").map(Boolean::parseBoolean) - .orElseGet(() -> LOCALITY_WHITELIST_FS.contains(scheme)); + this.localityPreferred = + options + .get("locality") + .map(Boolean::parseBoolean) + .orElseGet(() -> LOCALITY_WHITELIST_FS.contains(scheme)); } else { this.localityPreferred = false; } @@ -138,15 +144,16 @@ private void validateOptions( "Cannot scan using both snapshot-id and as-of-timestamp to select the table snapshot"); } - if ((snapshotId != null || asOfTimestamp != null) && - (startSnapshotId != null || endSnapshotId != null)) { + if ((snapshotId != null || asOfTimestamp != null) + && (startSnapshotId != null || endSnapshotId != null)) { throw new IllegalArgumentException( - "Cannot specify start-snapshot-id and end-snapshot-id to do incremental scan when either snapshot-id or " + - "as-of-timestamp is specified"); + "Cannot specify start-snapshot-id and end-snapshot-id to do incremental scan when either snapshot-id or " + + "as-of-timestamp is specified"); } if (startSnapshotId == null && endSnapshotId != null) { - throw new IllegalArgumentException("Cannot only specify 
option end-snapshot-id to do incremental scan"); + throw new IllegalArgumentException( + "Cannot only specify option end-snapshot-id to do incremental scan"); } } @@ -181,12 +188,14 @@ private TableScan configureBaseScan(boolean caseSensitive, DataSourceOptions opt scan = scan.option(TableProperties.SPLIT_SIZE, splitSize.toString()); } - Integer splitLookback = options.get(SparkReadOptions.LOOKBACK).map(Integer::parseInt).orElse(null); + Integer splitLookback = + options.get(SparkReadOptions.LOOKBACK).map(Integer::parseInt).orElse(null); if (splitLookback != null) { scan = scan.option(TableProperties.SPLIT_LOOKBACK, splitLookback.toString()); } - Long splitOpenFileCost = options.get(SparkReadOptions.FILE_OPEN_COST).map(Long::parseLong).orElse(null); + Long splitOpenFileCost = + options.get(SparkReadOptions.FILE_OPEN_COST).map(Long::parseLong).orElse(null); if (splitOpenFileCost != null) { scan = scan.option(TableProperties.SPLIT_OPEN_FILE_COST, splitOpenFileCost.toString()); } @@ -201,9 +210,11 @@ protected Schema snapshotSchema() { private Schema lazySchema() { if (schema == null) { if (requestedSchema != null) { - // the projection should include all columns that will be returned, including those only used in filters - this.schema = SparkSchemaUtil.prune( - baseScan.schema(), requestedSchema, filterExpression(), baseScan.isCaseSensitive()); + // the projection should include all columns that will be returned, including those only + // used in filters + this.schema = + SparkSchemaUtil.prune( + baseScan.schema(), requestedSchema, filterExpression(), baseScan.isCaseSensitive()); } else { this.schema = baseScan.schema(); } @@ -220,8 +231,9 @@ private Expression filterExpression() { private StructType lazyType() { if (type == null) { - Preconditions.checkArgument(readTimestampWithoutZone || !SparkUtil.hasTimestampWithoutZone(lazySchema()), - SparkUtil.TIMESTAMP_WITHOUT_TIMEZONE_ERROR); + Preconditions.checkArgument( + readTimestampWithoutZone || !SparkUtil.hasTimestampWithoutZone(lazySchema()), + SparkUtil.TIMESTAMP_WITHOUT_TIMEZONE_ERROR); this.type = SparkSchemaUtil.convert(lazySchema()); } return type; @@ -241,8 +253,10 @@ public List> planBatchInputPartitions() { Preconditions.checkState(batchSize > 0, "Invalid batch size"); String expectedSchemaString = SchemaParser.toJson(lazySchema()); - ValidationException.check(tasks().stream().noneMatch(TableScanUtil::hasDeletes), - "Cannot scan table %s: cannot apply required delete files", table); + ValidationException.check( + tasks().stream().noneMatch(TableScanUtil::hasDeletes), + "Cannot scan table %s: cannot apply required delete files", + table); // broadcast the table metadata as input partitions will be sent to executors Broadcast

    tableBroadcast = sparkContext.broadcast(SerializableTable.copyOf(table)); @@ -254,17 +268,22 @@ public List> planBatchInputPartitions() { Tasks.range(readTasks.length) .stopOnFailure() .executeWith(localityPreferred ? ThreadPools.getWorkerPool() : null) - .run(index -> readTasks[index] = new ReadTask<>( - scanTasks.get(index), tableBroadcast, expectedSchemaString, caseSensitive, - localityPreferred, new BatchReaderFactory(batchSize))); + .run( + index -> + readTasks[index] = + new ReadTask<>( + scanTasks.get(index), + tableBroadcast, + expectedSchemaString, + caseSensitive, + localityPreferred, + new BatchReaderFactory(batchSize))); LOG.info("Batching input partitions with {} tasks.", readTasks.length); return Arrays.asList(readTasks); } - /** - * This is called in the Spark Driver when data is to be materialized into {@link InternalRow} - */ + /** This is called in the Spark Driver when data is to be materialized into {@link InternalRow} */ @Override public List> planInputPartitions() { String expectedSchemaString = SchemaParser.toJson(lazySchema()); @@ -279,9 +298,16 @@ public List> planInputPartitions() { Tasks.range(readTasks.length) .stopOnFailure() .executeWith(localityPreferred ? ThreadPools.getWorkerPool() : null) - .run(index -> readTasks[index] = new ReadTask<>( - scanTasks.get(index), tableBroadcast, expectedSchemaString, caseSensitive, - localityPreferred, InternalRowReaderFactory.INSTANCE)); + .run( + index -> + readTasks[index] = + new ReadTask<>( + scanTasks.get(index), + tableBroadcast, + expectedSchemaString, + caseSensitive, + localityPreferred, + InternalRowReaderFactory.INSTANCE)); return Arrays.asList(readTasks); } @@ -334,10 +360,14 @@ public Statistics estimateStatistics() { return new Stats(0L, 0L); } - // estimate stats using snapshot summary only for partitioned tables (metadata tables are unpartitioned) + // estimate stats using snapshot summary only for partitioned tables (metadata tables are + // unpartitioned) if (!table.spec().isUnpartitioned() && filterExpression() == Expressions.alwaysTrue()) { - long totalRecords = PropertyUtil.propertyAsLong(table.currentSnapshot().summary(), - SnapshotSummary.TOTAL_RECORDS_PROP, Long.MAX_VALUE); + long totalRecords = + PropertyUtil.propertyAsLong( + table.currentSnapshot().summary(), + SnapshotSummary.TOTAL_RECORDS_PROP, + Long.MAX_VALUE); return new Stats(SparkSchemaUtil.estimateSize(lazyType(), totalRecords), totalRecords); } @@ -360,28 +390,38 @@ public boolean enableBatchRead() { if (readUsingBatch == null) { boolean allParquetFileScanTasks = tasks().stream() - .allMatch(combinedScanTask -> !combinedScanTask.isDataTask() && combinedScanTask.files() - .stream() - .allMatch(fileScanTask -> fileScanTask.file().format().equals( - FileFormat.PARQUET))); + .allMatch( + combinedScanTask -> + !combinedScanTask.isDataTask() + && combinedScanTask.files().stream() + .allMatch( + fileScanTask -> + fileScanTask.file().format().equals(FileFormat.PARQUET))); boolean allOrcFileScanTasks = tasks().stream() - .allMatch(combinedScanTask -> !combinedScanTask.isDataTask() && combinedScanTask.files() - .stream() - .allMatch(fileScanTask -> fileScanTask.file().format().equals( - FileFormat.ORC))); + .allMatch( + combinedScanTask -> + !combinedScanTask.isDataTask() + && combinedScanTask.files().stream() + .allMatch( + fileScanTask -> + fileScanTask.file().format().equals(FileFormat.ORC))); boolean atLeastOneColumn = lazySchema().columns().size() > 0; - boolean onlyPrimitives = lazySchema().columns().stream().allMatch(c -> 
c.type().isPrimitiveType()); + boolean onlyPrimitives = + lazySchema().columns().stream().allMatch(c -> c.type().isPrimitiveType()); boolean hasNoDeleteFiles = tasks().stream().noneMatch(TableScanUtil::hasDeletes); boolean batchReadsEnabled = batchReadsEnabled(allParquetFileScanTasks, allOrcFileScanTasks); - this.readUsingBatch = batchReadsEnabled && hasNoDeleteFiles && (allOrcFileScanTasks || - (allParquetFileScanTasks && atLeastOneColumn && onlyPrimitives)); + this.readUsingBatch = + batchReadsEnabled + && hasNoDeleteFiles + && (allOrcFileScanTasks + || (allParquetFileScanTasks && atLeastOneColumn && onlyPrimitives)); if (readUsingBatch) { this.batchSize = batchSize(allParquetFileScanTasks, allOrcFileScanTasks); @@ -410,8 +450,7 @@ private int batchSize(boolean isParquetOnly, boolean isOrcOnly) { } } - private static void mergeIcebergHadoopConfs( - Configuration baseConf, Map options) { + private static void mergeIcebergHadoopConfs(Configuration baseConf, Map options) { options.keySet().stream() .filter(key -> key.startsWith("hadoop.")) .forEach(key -> baseConf.set(key.replaceFirst("hadoop.", ""), options.get(key))); @@ -455,8 +494,13 @@ private static class ReadTask implements Serializable, InputPartition { private transient Schema expectedSchema = null; private transient String[] preferredLocations = null; - private ReadTask(CombinedScanTask task, Broadcast
    tableBroadcast, String expectedSchemaString, - boolean caseSensitive, boolean localityPreferred, ReaderFactory<T> readerFactory) { + private ReadTask( + CombinedScanTask task, + Broadcast<Table>
    tableBroadcast, + String expectedSchemaString, + boolean caseSensitive, + boolean localityPreferred, + ReaderFactory readerFactory) { this.task = task; this.tableBroadcast = tableBroadcast; this.expectedSchemaString = expectedSchemaString; @@ -496,18 +540,18 @@ private String[] getPreferredLocations() { } private interface ReaderFactory extends Serializable { - InputPartitionReader create(CombinedScanTask task, Table table, Schema expectedSchema, boolean caseSensitive); + InputPartitionReader create( + CombinedScanTask task, Table table, Schema expectedSchema, boolean caseSensitive); } private static class InternalRowReaderFactory implements ReaderFactory { private static final InternalRowReaderFactory INSTANCE = new InternalRowReaderFactory(); - private InternalRowReaderFactory() { - } + private InternalRowReaderFactory() {} @Override - public InputPartitionReader create(CombinedScanTask task, Table table, - Schema expectedSchema, boolean caseSensitive) { + public InputPartitionReader create( + CombinedScanTask task, Table table, Schema expectedSchema, boolean caseSensitive) { return new RowReader(task, table, expectedSchema, caseSensitive); } } @@ -520,20 +564,27 @@ private static class BatchReaderFactory implements ReaderFactory } @Override - public InputPartitionReader create(CombinedScanTask task, Table table, - Schema expectedSchema, boolean caseSensitive) { + public InputPartitionReader create( + CombinedScanTask task, Table table, Schema expectedSchema, boolean caseSensitive) { return new BatchReader(task, table, expectedSchema, caseSensitive, batchSize); } } - private static class RowReader extends RowDataReader implements InputPartitionReader { + private static class RowReader extends RowDataReader + implements InputPartitionReader { RowReader(CombinedScanTask task, Table table, Schema expectedSchema, boolean caseSensitive) { super(task, table, expectedSchema, caseSensitive); } } - private static class BatchReader extends BatchDataReader implements InputPartitionReader { - BatchReader(CombinedScanTask task, Table table, Schema expectedSchema, boolean caseSensitive, int size) { + private static class BatchReader extends BatchDataReader + implements InputPartitionReader { + BatchReader( + CombinedScanTask task, + Table table, + Schema expectedSchema, + boolean caseSensitive, + int size) { super(task, table, expectedSchema, caseSensitive, size); } } diff --git a/spark/v2.4/spark/src/main/java/org/apache/iceberg/spark/source/RowDataReader.java b/spark/v2.4/spark/src/main/java/org/apache/iceberg/spark/source/RowDataReader.java index 4f5962494feb..f206149da30e 100644 --- a/spark/v2.4/spark/src/main/java/org/apache/iceberg/spark/source/RowDataReader.java +++ b/spark/v2.4/spark/src/main/java/org/apache/iceberg/spark/source/RowDataReader.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.spark.source; import java.util.Map; @@ -81,7 +80,8 @@ protected Schema tableSchema() { return tableSchema; } - protected CloseableIterable open(FileScanTask task, Schema readSchema, Map idToConstant) { + protected CloseableIterable open( + FileScanTask task, Schema readSchema, Map idToConstant) { CloseableIterable iter; if (task.isDataTask()) { iter = newDataIterable(task.asDataTask(), readSchema); @@ -112,15 +112,14 @@ protected CloseableIterable open(FileScanTask task, Schema readSche } private CloseableIterable newAvroIterable( - InputFile location, - FileScanTask task, - Schema projection, - Map idToConstant) { - Avro.ReadBuilder builder = Avro.read(location) - .reuseContainers() - .project(projection) - .split(task.start(), task.length()) - .createReaderFunc(readSchema -> new SparkAvroReader(projection, readSchema, idToConstant)); + InputFile location, FileScanTask task, Schema projection, Map idToConstant) { + Avro.ReadBuilder builder = + Avro.read(location) + .reuseContainers() + .project(projection) + .split(task.start(), task.length()) + .createReaderFunc( + readSchema -> new SparkAvroReader(projection, readSchema, idToConstant)); if (nameMapping != null) { builder.withNameMapping(NameMappingParser.fromJson(nameMapping)); @@ -130,17 +129,16 @@ private CloseableIterable newAvroIterable( } private CloseableIterable newParquetIterable( - InputFile location, - FileScanTask task, - Schema readSchema, - Map idToConstant) { - Parquet.ReadBuilder builder = Parquet.read(location) - .reuseContainers() - .split(task.start(), task.length()) - .project(readSchema) - .createReaderFunc(fileSchema -> SparkParquetReaders.buildReader(readSchema, fileSchema, idToConstant)) - .filter(task.residual()) - .caseSensitive(caseSensitive); + InputFile location, FileScanTask task, Schema readSchema, Map idToConstant) { + Parquet.ReadBuilder builder = + Parquet.read(location) + .reuseContainers() + .split(task.start(), task.length()) + .project(readSchema) + .createReaderFunc( + fileSchema -> SparkParquetReaders.buildReader(readSchema, fileSchema, idToConstant)) + .filter(task.residual()) + .caseSensitive(caseSensitive); if (nameMapping != null) { builder.withNameMapping(NameMappingParser.fromJson(nameMapping)); @@ -150,19 +148,19 @@ private CloseableIterable newParquetIterable( } private CloseableIterable newOrcIterable( - InputFile location, - FileScanTask task, - Schema readSchema, - Map idToConstant) { - Schema readSchemaWithoutConstantAndMetadataFields = TypeUtil.selectNot(readSchema, - Sets.union(idToConstant.keySet(), MetadataColumns.metadataFieldIds())); - - ORC.ReadBuilder builder = ORC.read(location) - .project(readSchemaWithoutConstantAndMetadataFields) - .split(task.start(), task.length()) - .createReaderFunc(readOrcSchema -> new SparkOrcReader(readSchema, readOrcSchema, idToConstant)) - .filter(task.residual()) - .caseSensitive(caseSensitive); + InputFile location, FileScanTask task, Schema readSchema, Map idToConstant) { + Schema readSchemaWithoutConstantAndMetadataFields = + TypeUtil.selectNot( + readSchema, Sets.union(idToConstant.keySet(), MetadataColumns.metadataFieldIds())); + + ORC.ReadBuilder builder = + ORC.read(location) + .project(readSchemaWithoutConstantAndMetadataFields) + .split(task.start(), task.length()) + .createReaderFunc( + readOrcSchema -> new SparkOrcReader(readSchema, readOrcSchema, idToConstant)) + .filter(task.residual()) + .caseSensitive(caseSensitive); if (nameMapping != null) { 
builder.withNameMapping(NameMappingParser.fromJson(nameMapping)); @@ -173,8 +171,8 @@ private CloseableIterable newOrcIterable( private CloseableIterable newDataIterable(DataTask task, Schema readSchema) { StructInternalRow row = new StructInternalRow(readSchema.asStruct()); - CloseableIterable asSparkRows = CloseableIterable.transform( - task.asDataTask().rows(), row::setStruct); + CloseableIterable asSparkRows = + CloseableIterable.transform(task.asDataTask().rows(), row::setStruct); return asSparkRows; } diff --git a/spark/v2.4/spark/src/main/java/org/apache/iceberg/spark/source/RowDataRewriter.java b/spark/v2.4/spark/src/main/java/org/apache/iceberg/spark/source/RowDataRewriter.java index 63cc3a466c1a..aee0d4f0586b 100644 --- a/spark/v2.4/spark/src/main/java/org/apache/iceberg/spark/source/RowDataRewriter.java +++ b/spark/v2.4/spark/src/main/java/org/apache/iceberg/spark/source/RowDataRewriter.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.source; import java.io.Serializable; @@ -55,23 +54,25 @@ public class RowDataRewriter implements Serializable { private final FileFormat format; private final boolean caseSensitive; - public RowDataRewriter(Broadcast
    tableBroadcast, PartitionSpec spec, boolean caseSensitive) { + public RowDataRewriter( + Broadcast<Table>
    tableBroadcast, PartitionSpec spec, boolean caseSensitive) { this.tableBroadcast = tableBroadcast; this.spec = spec; this.caseSensitive = caseSensitive; Table table = tableBroadcast.value(); - String formatString = table.properties().getOrDefault( - TableProperties.DEFAULT_FILE_FORMAT, TableProperties.DEFAULT_FILE_FORMAT_DEFAULT); + String formatString = + table + .properties() + .getOrDefault( + TableProperties.DEFAULT_FILE_FORMAT, TableProperties.DEFAULT_FILE_FORMAT_DEFAULT); this.format = FileFormat.valueOf(formatString.toUpperCase(Locale.ENGLISH)); } public List rewriteDataForTasks(JavaRDD taskRDD) { JavaRDD> dataFilesRDD = taskRDD.map(this::rewriteDataForTask); - return dataFilesRDD.collect().stream() - .flatMap(Collection::stream) - .collect(Collectors.toList()); + return dataFilesRDD.collect().stream().flatMap(Collection::stream).collect(Collectors.toList()); } private List rewriteDataForTask(CombinedScanTask task) throws Exception { @@ -86,28 +87,44 @@ private List rewriteDataForTask(CombinedScanTask task) throws Exceptio RowDataReader dataReader = new RowDataReader(task, table, schema, caseSensitive); StructType structType = SparkSchemaUtil.convert(schema); - SparkAppenderFactory appenderFactory = SparkAppenderFactory.builderFor(table, schema, structType) - .spec(spec) - .build(); - OutputFileFactory fileFactory = OutputFileFactory.builderFor(table, partitionId, taskId) - .defaultSpec(spec) - .format(format) - .build(); + SparkAppenderFactory appenderFactory = + SparkAppenderFactory.builderFor(table, schema, structType).spec(spec).build(); + OutputFileFactory fileFactory = + OutputFileFactory.builderFor(table, partitionId, taskId) + .defaultSpec(spec) + .format(format) + .build(); TaskWriter writer; if (spec.isUnpartitioned()) { - writer = new UnpartitionedWriter<>(spec, format, appenderFactory, fileFactory, table.io(), - Long.MAX_VALUE); - } else if (PropertyUtil.propertyAsBoolean(properties, + writer = + new UnpartitionedWriter<>( + spec, format, appenderFactory, fileFactory, table.io(), Long.MAX_VALUE); + } else if (PropertyUtil.propertyAsBoolean( + properties, TableProperties.SPARK_WRITE_PARTITIONED_FANOUT_ENABLED, TableProperties.SPARK_WRITE_PARTITIONED_FANOUT_ENABLED_DEFAULT)) { - writer = new SparkPartitionedFanoutWriter( - spec, format, appenderFactory, fileFactory, table.io(), Long.MAX_VALUE, schema, - structType); + writer = + new SparkPartitionedFanoutWriter( + spec, + format, + appenderFactory, + fileFactory, + table.io(), + Long.MAX_VALUE, + schema, + structType); } else { - writer = new SparkPartitionedWriter( - spec, format, appenderFactory, fileFactory, table.io(), Long.MAX_VALUE, schema, - structType); + writer = + new SparkPartitionedWriter( + spec, + format, + appenderFactory, + fileFactory, + table.io(), + Long.MAX_VALUE, + schema, + structType); } try { @@ -127,14 +144,24 @@ private List rewriteDataForTask(CombinedScanTask task) throws Exceptio LOG.error("Aborting task", originalThrowable); context.markTaskFailed(originalThrowable); - LOG.error("Aborting commit for partition {} (task {}, attempt {}, stage {}.{})", - partitionId, taskId, context.attemptNumber(), context.stageId(), context.stageAttemptNumber()); + LOG.error( + "Aborting commit for partition {} (task {}, attempt {}, stage {}.{})", + partitionId, + taskId, + context.attemptNumber(), + context.stageId(), + context.stageAttemptNumber()); if (dataReader != null) { dataReader.close(); } writer.abort(); - LOG.error("Aborted commit for partition {} (task {}, attempt {}, stage {}.{})", - partitionId, 
taskId, context.taskAttemptId(), context.stageId(), context.stageAttemptNumber()); + LOG.error( + "Aborted commit for partition {} (task {}, attempt {}, stage {}.{})", + partitionId, + taskId, + context.taskAttemptId(), + context.stageId(), + context.stageAttemptNumber()); } catch (Throwable inner) { if (originalThrowable != inner) { diff --git a/spark/v2.4/spark/src/main/java/org/apache/iceberg/spark/source/SparkAppenderFactory.java b/spark/v2.4/spark/src/main/java/org/apache/iceberg/spark/source/SparkAppenderFactory.java index 4becf666ed3e..6372edde0782 100644 --- a/spark/v2.4/spark/src/main/java/org/apache/iceberg/spark/source/SparkAppenderFactory.java +++ b/spark/v2.4/spark/src/main/java/org/apache/iceberg/spark/source/SparkAppenderFactory.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.source; import java.io.IOException; @@ -61,8 +60,14 @@ class SparkAppenderFactory implements FileAppenderFactory { private StructType eqDeleteSparkType = null; private StructType posDeleteSparkType = null; - SparkAppenderFactory(Map properties, Schema writeSchema, StructType dsSchema, PartitionSpec spec, - int[] equalityFieldIds, Schema eqDeleteRowSchema, Schema posDeleteRowSchema) { + SparkAppenderFactory( + Map properties, + Schema writeSchema, + StructType dsSchema, + PartitionSpec spec, + int[] equalityFieldIds, + Schema eqDeleteRowSchema, + Schema posDeleteRowSchema) { this.properties = properties; this.writeSchema = writeSchema; this.dsSchema = dsSchema; @@ -85,7 +90,6 @@ static class Builder { private Schema eqDeleteRowSchema; private Schema posDeleteRowSchema; - Builder(Table table, Schema writeSchema, StructType dsSchema) { this.table = table; this.spec = table.spec(); @@ -118,16 +122,24 @@ SparkAppenderFactory build() { Preconditions.checkNotNull(writeSchema, "Write Schema must not be null"); Preconditions.checkNotNull(dsSchema, "DS Schema must not be null"); if (equalityFieldIds != null) { - Preconditions.checkNotNull(eqDeleteRowSchema, "Equality Field Ids and Equality Delete Row Schema" + - " must be set together"); + Preconditions.checkNotNull( + eqDeleteRowSchema, + "Equality Field Ids and Equality Delete Row Schema" + " must be set together"); } if (eqDeleteRowSchema != null) { - Preconditions.checkNotNull(equalityFieldIds, "Equality Field Ids and Equality Delete Row Schema" + - " must be set together"); + Preconditions.checkNotNull( + equalityFieldIds, + "Equality Field Ids and Equality Delete Row Schema" + " must be set together"); } - return new SparkAppenderFactory(table.properties(), writeSchema, dsSchema, spec, equalityFieldIds, - eqDeleteRowSchema, posDeleteRowSchema); + return new SparkAppenderFactory( + table.properties(), + writeSchema, + dsSchema, + spec, + equalityFieldIds, + eqDeleteRowSchema, + posDeleteRowSchema); } } @@ -141,7 +153,8 @@ private StructType lazyEqDeleteSparkType() { private StructType lazyPosDeleteSparkType() { if (posDeleteSparkType == null) { - Preconditions.checkNotNull(posDeleteRowSchema, "Position delete row schema shouldn't be null"); + Preconditions.checkNotNull( + posDeleteRowSchema, "Position delete row schema shouldn't be null"); this.posDeleteSparkType = SparkSchemaUtil.convert(posDeleteRowSchema); } return posDeleteSparkType; @@ -187,24 +200,33 @@ public FileAppender newAppender(OutputFile file, FileFormat fileFor } @Override - public DataWriter newDataWriter(EncryptedOutputFile file, FileFormat format, StructLike partition) { - return new 
DataWriter<>(newAppender(file.encryptingOutputFile(), format), format, - file.encryptingOutputFile().location(), spec, partition, file.keyMetadata()); + public DataWriter newDataWriter( + EncryptedOutputFile file, FileFormat format, StructLike partition) { + return new DataWriter<>( + newAppender(file.encryptingOutputFile(), format), + format, + file.encryptingOutputFile().location(), + spec, + partition, + file.keyMetadata()); } @Override - public EqualityDeleteWriter newEqDeleteWriter(EncryptedOutputFile file, FileFormat format, - StructLike partition) { - Preconditions.checkState(equalityFieldIds != null && equalityFieldIds.length > 0, + public EqualityDeleteWriter newEqDeleteWriter( + EncryptedOutputFile file, FileFormat format, StructLike partition) { + Preconditions.checkState( + equalityFieldIds != null && equalityFieldIds.length > 0, "Equality field ids shouldn't be null or empty when creating equality-delete writer"); - Preconditions.checkNotNull(eqDeleteRowSchema, + Preconditions.checkNotNull( + eqDeleteRowSchema, "Equality delete row schema shouldn't be null when creating equality-delete writer"); try { switch (format) { case PARQUET: return Parquet.writeDeletes(file.encryptingOutputFile()) - .createWriterFunc(msgType -> SparkParquetWriters.buildWriter(lazyEqDeleteSparkType(), msgType)) + .createWriterFunc( + msgType -> SparkParquetWriters.buildWriter(lazyEqDeleteSparkType(), msgType)) .overwrite() .rowSchema(eqDeleteRowSchema) .withSpec(spec) @@ -245,15 +267,16 @@ public EqualityDeleteWriter newEqDeleteWriter(EncryptedOutputFile f } @Override - public PositionDeleteWriter newPosDeleteWriter(EncryptedOutputFile file, FileFormat format, - StructLike partition) { + public PositionDeleteWriter newPosDeleteWriter( + EncryptedOutputFile file, FileFormat format, StructLike partition) { try { switch (format) { case PARQUET: StructType sparkPosDeleteSchema = SparkSchemaUtil.convert(DeleteSchemaUtil.posDeleteSchema(posDeleteRowSchema)); return Parquet.writeDeletes(file.encryptingOutputFile()) - .createWriterFunc(msgType -> SparkParquetWriters.buildWriter(sparkPosDeleteSchema, msgType)) + .createWriterFunc( + msgType -> SparkParquetWriters.buildWriter(sparkPosDeleteSchema, msgType)) .overwrite() .rowSchema(posDeleteRowSchema) .withSpec(spec) diff --git a/spark/v2.4/spark/src/main/java/org/apache/iceberg/spark/source/SparkFileWriterFactory.java b/spark/v2.4/spark/src/main/java/org/apache/iceberg/spark/source/SparkFileWriterFactory.java index beaa7c295024..a8c894bfc50c 100644 --- a/spark/v2.4/spark/src/main/java/org/apache/iceberg/spark/source/SparkFileWriterFactory.java +++ b/spark/v2.4/spark/src/main/java/org/apache/iceberg/spark/source/SparkFileWriterFactory.java @@ -16,9 +16,13 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.spark.source; +import static org.apache.iceberg.MetadataColumns.DELETE_FILE_ROW_FIELD_NAME; +import static org.apache.iceberg.TableProperties.DEFAULT_FILE_FORMAT; +import static org.apache.iceberg.TableProperties.DEFAULT_FILE_FORMAT_DEFAULT; +import static org.apache.iceberg.TableProperties.DELETE_DEFAULT_FILE_FORMAT; + import java.util.Locale; import java.util.Map; import org.apache.iceberg.FileFormat; @@ -40,24 +44,35 @@ import org.apache.spark.sql.types.StructType; import org.apache.spark.unsafe.types.UTF8String; -import static org.apache.iceberg.MetadataColumns.DELETE_FILE_ROW_FIELD_NAME; -import static org.apache.iceberg.TableProperties.DEFAULT_FILE_FORMAT; -import static org.apache.iceberg.TableProperties.DEFAULT_FILE_FORMAT_DEFAULT; -import static org.apache.iceberg.TableProperties.DELETE_DEFAULT_FILE_FORMAT; - class SparkFileWriterFactory extends BaseFileWriterFactory { private StructType dataSparkType; private StructType equalityDeleteSparkType; private StructType positionDeleteSparkType; - SparkFileWriterFactory(Table table, FileFormat dataFileFormat, Schema dataSchema, StructType dataSparkType, - SortOrder dataSortOrder, FileFormat deleteFileFormat, - int[] equalityFieldIds, Schema equalityDeleteRowSchema, StructType equalityDeleteSparkType, - SortOrder equalityDeleteSortOrder, Schema positionDeleteRowSchema, - StructType positionDeleteSparkType) { - - super(table, dataFileFormat, dataSchema, dataSortOrder, deleteFileFormat, equalityFieldIds, - equalityDeleteRowSchema, equalityDeleteSortOrder, positionDeleteRowSchema); + SparkFileWriterFactory( + Table table, + FileFormat dataFileFormat, + Schema dataSchema, + StructType dataSparkType, + SortOrder dataSortOrder, + FileFormat deleteFileFormat, + int[] equalityFieldIds, + Schema equalityDeleteRowSchema, + StructType equalityDeleteSparkType, + SortOrder equalityDeleteSortOrder, + Schema positionDeleteRowSchema, + StructType positionDeleteSparkType) { + + super( + table, + dataFileFormat, + dataSchema, + dataSortOrder, + deleteFileFormat, + equalityFieldIds, + equalityDeleteRowSchema, + equalityDeleteSortOrder, + positionDeleteRowSchema); this.dataSparkType = dataSparkType; this.equalityDeleteSparkType = equalityDeleteSparkType; @@ -80,7 +95,8 @@ protected void configureEqualityDelete(Avro.DeleteWriteBuilder builder) { @Override protected void configurePositionDelete(Avro.DeleteWriteBuilder builder) { - boolean withRow = positionDeleteSparkType().getFieldIndex(DELETE_FILE_ROW_FIELD_NAME).isDefined(); + boolean withRow = + positionDeleteSparkType().getFieldIndex(DELETE_FILE_ROW_FIELD_NAME).isDefined(); if (withRow) { // SparkAvroWriter accepts just the Spark type of the row ignoring the path and pos StructField rowField = positionDeleteSparkType().apply(DELETE_FILE_ROW_FIELD_NAME); @@ -96,12 +112,14 @@ protected void configureDataWrite(Parquet.DataWriteBuilder builder) { @Override protected void configureEqualityDelete(Parquet.DeleteWriteBuilder builder) { - builder.createWriterFunc(msgType -> SparkParquetWriters.buildWriter(equalityDeleteSparkType(), msgType)); + builder.createWriterFunc( + msgType -> SparkParquetWriters.buildWriter(equalityDeleteSparkType(), msgType)); } @Override protected void configurePositionDelete(Parquet.DeleteWriteBuilder builder) { - builder.createWriterFunc(msgType -> SparkParquetWriters.buildWriter(positionDeleteSparkType(), msgType)); + builder.createWriterFunc( + msgType -> SparkParquetWriters.buildWriter(positionDeleteSparkType(), msgType)); builder.transformPaths(path 
-> UTF8String.fromString(path.toString())); } @@ -132,7 +150,8 @@ private StructType dataSparkType() { private StructType equalityDeleteSparkType() { if (equalityDeleteSparkType == null) { - Preconditions.checkNotNull(equalityDeleteRowSchema(), "Equality delete schema must not be null"); + Preconditions.checkNotNull( + equalityDeleteRowSchema(), "Equality delete schema must not be null"); this.equalityDeleteSparkType = SparkSchemaUtil.convert(equalityDeleteRowSchema()); } @@ -141,7 +160,8 @@ private StructType equalityDeleteSparkType() { private StructType positionDeleteSparkType() { if (positionDeleteSparkType == null) { - // wrap the optional row schema into the position delete schema that contains path and position + // wrap the optional row schema into the position delete schema that contains path and + // position Schema positionDeleteSchema = DeleteSchemaUtil.posDeleteSchema(positionDeleteRowSchema()); this.positionDeleteSparkType = SparkSchemaUtil.convert(positionDeleteSchema); } @@ -168,10 +188,12 @@ static class Builder { Map properties = table.properties(); - String dataFileFormatName = properties.getOrDefault(DEFAULT_FILE_FORMAT, DEFAULT_FILE_FORMAT_DEFAULT); + String dataFileFormatName = + properties.getOrDefault(DEFAULT_FILE_FORMAT, DEFAULT_FILE_FORMAT_DEFAULT); this.dataFileFormat = FileFormat.valueOf(dataFileFormatName.toUpperCase(Locale.ENGLISH)); - String deleteFileFormatName = properties.getOrDefault(DELETE_DEFAULT_FILE_FORMAT, dataFileFormatName); + String deleteFileFormatName = + properties.getOrDefault(DELETE_DEFAULT_FILE_FORMAT, dataFileFormatName); this.deleteFileFormat = FileFormat.valueOf(deleteFileFormatName.toUpperCase(Locale.ENGLISH)); } @@ -233,13 +255,23 @@ Builder positionDeleteSparkType(StructType newPositionDeleteSparkType) { SparkFileWriterFactory build() { boolean noEqualityDeleteConf = equalityFieldIds == null && equalityDeleteRowSchema == null; boolean fullEqualityDeleteConf = equalityFieldIds != null && equalityDeleteRowSchema != null; - Preconditions.checkArgument(noEqualityDeleteConf || fullEqualityDeleteConf, + Preconditions.checkArgument( + noEqualityDeleteConf || fullEqualityDeleteConf, "Equality field IDs and equality delete row schema must be set together"); return new SparkFileWriterFactory( - table, dataFileFormat, dataSchema, dataSparkType, dataSortOrder, deleteFileFormat, - equalityFieldIds, equalityDeleteRowSchema, equalityDeleteSparkType, equalityDeleteSortOrder, - positionDeleteRowSchema, positionDeleteSparkType); + table, + dataFileFormat, + dataSchema, + dataSparkType, + dataSortOrder, + deleteFileFormat, + equalityFieldIds, + equalityDeleteRowSchema, + equalityDeleteSparkType, + equalityDeleteSortOrder, + positionDeleteRowSchema, + positionDeleteSparkType); } } } diff --git a/spark/v2.4/spark/src/main/java/org/apache/iceberg/spark/source/SparkPartitionedFanoutWriter.java b/spark/v2.4/spark/src/main/java/org/apache/iceberg/spark/source/SparkPartitionedFanoutWriter.java index d38ae2f40316..f17cd260f928 100644 --- a/spark/v2.4/spark/src/main/java/org/apache/iceberg/spark/source/SparkPartitionedFanoutWriter.java +++ b/spark/v2.4/spark/src/main/java/org/apache/iceberg/spark/source/SparkPartitionedFanoutWriter.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.spark.source; import org.apache.iceberg.FileFormat; @@ -34,10 +33,15 @@ public class SparkPartitionedFanoutWriter extends PartitionedFanoutWriter appenderFactory, - OutputFileFactory fileFactory, FileIO io, long targetFileSize, - Schema schema, StructType sparkSchema) { + OutputFileFactory fileFactory, + FileIO io, + long targetFileSize, + Schema schema, + StructType sparkSchema) { super(spec, format, appenderFactory, fileFactory, io, targetFileSize); this.partitionKey = new PartitionKey(spec, schema); this.internalRowWrapper = new InternalRowWrapper(sparkSchema); diff --git a/spark/v2.4/spark/src/main/java/org/apache/iceberg/spark/source/SparkPartitionedWriter.java b/spark/v2.4/spark/src/main/java/org/apache/iceberg/spark/source/SparkPartitionedWriter.java index f81a09926d85..a86091644360 100644 --- a/spark/v2.4/spark/src/main/java/org/apache/iceberg/spark/source/SparkPartitionedWriter.java +++ b/spark/v2.4/spark/src/main/java/org/apache/iceberg/spark/source/SparkPartitionedWriter.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.source; import org.apache.iceberg.FileFormat; @@ -34,10 +33,15 @@ public class SparkPartitionedWriter extends PartitionedWriter { private final PartitionKey partitionKey; private final InternalRowWrapper internalRowWrapper; - public SparkPartitionedWriter(PartitionSpec spec, FileFormat format, - FileAppenderFactory appenderFactory, - OutputFileFactory fileFactory, FileIO io, long targetFileSize, - Schema schema, StructType sparkSchema) { + public SparkPartitionedWriter( + PartitionSpec spec, + FileFormat format, + FileAppenderFactory appenderFactory, + OutputFileFactory fileFactory, + FileIO io, + long targetFileSize, + Schema schema, + StructType sparkSchema) { super(spec, format, appenderFactory, fileFactory, io, targetFileSize); this.partitionKey = new PartitionKey(spec, schema); this.internalRowWrapper = new InternalRowWrapper(sparkSchema); diff --git a/spark/v2.4/spark/src/main/java/org/apache/iceberg/spark/source/Stats.java b/spark/v2.4/spark/src/main/java/org/apache/iceberg/spark/source/Stats.java index 76119c186944..c4142011b294 100644 --- a/spark/v2.4/spark/src/main/java/org/apache/iceberg/spark/source/Stats.java +++ b/spark/v2.4/spark/src/main/java/org/apache/iceberg/spark/source/Stats.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.source; import java.util.OptionalLong; diff --git a/spark/v2.4/spark/src/main/java/org/apache/iceberg/spark/source/StreamingOffset.java b/spark/v2.4/spark/src/main/java/org/apache/iceberg/spark/source/StreamingOffset.java index f15c5c6536e9..9fa4c63dda2b 100644 --- a/spark/v2.4/spark/src/main/java/org/apache/iceberg/spark/source/StreamingOffset.java +++ b/spark/v2.4/spark/src/main/java/org/apache/iceberg/spark/source/StreamingOffset.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.source; import com.fasterxml.jackson.core.JsonGenerator; @@ -46,10 +45,10 @@ class StreamingOffset extends Offset { * An implementation of Spark Structured Streaming Offset, to track the current processed files of * Iceberg table. * - * @param snapshotId The current processed snapshot id. - * @param position The position of last scanned file in snapshot. 
- * @param scanAllFiles whether to scan all files in a snapshot; for example, to read - * all data when starting a stream. + * @param snapshotId The current processed snapshot id. + * @param position The position of last scanned file in snapshot. + * @param scanAllFiles whether to scan all files in a snapshot; for example, to read all data when + * starting a stream. */ StreamingOffset(long snapshotId, long position, boolean scanAllFiles) { this.snapshotId = snapshotId; @@ -65,8 +64,10 @@ static StreamingOffset fromJson(String json) { // The version of StreamingOffset. The offset was created with a version number // used to validate when deserializing from json string. int version = JsonUtil.getInt(VERSION, node); - Preconditions.checkArgument(version == CURR_VERSION, - "Cannot parse offset JSON: offset version %s is not supported", version); + Preconditions.checkArgument( + version == CURR_VERSION, + "Cannot parse offset JSON: offset version %s is not supported", + version); long snapshotId = JsonUtil.getLong(SNAPSHOT_ID, node); int position = JsonUtil.getInt(POSITION, node); @@ -74,7 +75,8 @@ static StreamingOffset fromJson(String json) { return new StreamingOffset(snapshotId, position, shouldScanAllFiles); } catch (IOException e) { - throw new IllegalArgumentException(String.format("Failed to parse StreamingOffset from JSON string %s", json), e); + throw new IllegalArgumentException( + String.format("Failed to parse StreamingOffset from JSON string %s", json), e); } } @@ -114,9 +116,9 @@ boolean shouldScanAllFiles() { public boolean equals(Object obj) { if (obj instanceof StreamingOffset) { StreamingOffset offset = (StreamingOffset) obj; - return offset.snapshotId == snapshotId && - offset.position == position && - offset.scanAllFiles == scanAllFiles; + return offset.snapshotId == snapshotId + && offset.position == position + && offset.scanAllFiles == scanAllFiles; } else { return false; } @@ -129,7 +131,8 @@ public int hashCode() { @Override public String toString() { - return String.format("Streaming Offset[%d: position (%d) scan_all_files (%b)]", - snapshotId, position, scanAllFiles); + return String.format( + "Streaming Offset[%d: position (%d) scan_all_files (%b)]", + snapshotId, position, scanAllFiles); } } diff --git a/spark/v2.4/spark/src/main/java/org/apache/iceberg/spark/source/StreamingWriter.java b/spark/v2.4/spark/src/main/java/org/apache/iceberg/spark/source/StreamingWriter.java index dfac043b71c2..64ad7e672866 100644 --- a/spark/v2.4/spark/src/main/java/org/apache/iceberg/spark/source/StreamingWriter.java +++ b/spark/v2.4/spark/src/main/java/org/apache/iceberg/spark/source/StreamingWriter.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.spark.source; import java.util.Map; @@ -46,8 +45,15 @@ public class StreamingWriter extends Writer implements StreamWriter { private final String queryId; private final OutputMode mode; - StreamingWriter(SparkSession spark, Table table, SparkWriteConf writeConf, String queryId, - OutputMode mode, String applicationId, Schema writeSchema, StructType dsSchema) { + StreamingWriter( + SparkSession spark, + Table table, + SparkWriteConf writeConf, + String queryId, + OutputMode mode, + String applicationId, + Schema writeSchema, + StructType dsSchema) { super(spark, table, writeConf, false, applicationId, writeSchema, dsSchema); this.queryId = queryId; this.mode = mode; @@ -84,7 +90,8 @@ public void commit(long epochId, WriterCommitMessage[] messages) { } } - private void commit(SnapshotUpdate snapshotUpdate, long epochId, int numFiles, String description) { + private void commit( + SnapshotUpdate snapshotUpdate, long epochId, int numFiles, String description) { snapshotUpdate.set(QUERY_ID_PROPERTY, queryId); snapshotUpdate.set(EPOCH_ID_PROPERTY, Long.toString(epochId)); commitOperation(snapshotUpdate, numFiles, description); diff --git a/spark/v2.4/spark/src/main/java/org/apache/iceberg/spark/source/StructInternalRow.java b/spark/v2.4/spark/src/main/java/org/apache/iceberg/spark/source/StructInternalRow.java index a2288ef3edd7..3c7ebabeab3d 100644 --- a/spark/v2.4/spark/src/main/java/org/apache/iceberg/spark/source/StructInternalRow.java +++ b/spark/v2.4/spark/src/main/java/org/apache/iceberg/spark/source/StructInternalRow.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.source; import java.math.BigDecimal; @@ -128,7 +127,8 @@ public int getInt(int ordinal) { } else if (integer instanceof LocalDate) { return (int) ((LocalDate) integer).toEpochDay(); } else { - throw new IllegalStateException("Unknown type for int field. Type name: " + integer.getClass().getName()); + throw new IllegalStateException( + "Unknown type for int field. Type name: " + integer.getClass().getName()); } } @@ -143,7 +143,8 @@ public long getLong(int ordinal) { } else if (longVal instanceof LocalDate) { return ((LocalDate) longVal).toEpochDay(); } else { - throw new IllegalStateException("Unknown type for long field. Type name: " + longVal.getClass().getName()); + throw new IllegalStateException( + "Unknown type for long field. Type name: " + longVal.getClass().getName()); } } @@ -190,7 +191,8 @@ private byte[] getBinaryInternal(int ordinal) { } else if (bytes instanceof byte[]) { return (byte[]) bytes; } else { - throw new IllegalStateException("Unknown type for binary field. Type name: " + bytes.getClass().getName()); + throw new IllegalStateException( + "Unknown type for binary field. 
Type name: " + bytes.getClass().getName()); } } @@ -206,8 +208,7 @@ public InternalRow getStruct(int ordinal, int numFields) { private InternalRow getStructInternal(int ordinal, int numFields) { return new StructInternalRow( - type.fields().get(ordinal).type().asStructType(), - struct.get(ordinal, StructLike.class)); + type.fields().get(ordinal).type().asStructType(), struct.get(ordinal, StructLike.class)); } @Override @@ -227,7 +228,8 @@ public MapData getMap(int ordinal) { } private MapData getMapInternal(int ordinal) { - return mapToMapData(type.fields().get(ordinal).type().asMapType(), struct.get(ordinal, Map.class)); + return mapToMapData( + type.fields().get(ordinal).type().asMapType(), struct.get(ordinal, Map.class)); } @Override @@ -292,31 +294,52 @@ private ArrayData collectionToArrayData(Type elementType, Collection values) case DOUBLE: return fillArray(values, array -> (pos, value) -> array[pos] = value); case STRING: - return fillArray(values, array -> - (BiConsumer) (pos, seq) -> array[pos] = UTF8String.fromString(seq.toString())); + return fillArray( + values, + array -> + (BiConsumer) + (pos, seq) -> array[pos] = UTF8String.fromString(seq.toString())); case FIXED: case BINARY: - return fillArray(values, array -> - (BiConsumer) (pos, buf) -> array[pos] = ByteBuffers.toByteArray(buf)); + return fillArray( + values, + array -> + (BiConsumer) + (pos, buf) -> array[pos] = ByteBuffers.toByteArray(buf)); case DECIMAL: - return fillArray(values, array -> - (BiConsumer) (pos, dec) -> array[pos] = Decimal.apply(dec)); + return fillArray( + values, + array -> + (BiConsumer) (pos, dec) -> array[pos] = Decimal.apply(dec)); case STRUCT: - return fillArray(values, array -> (BiConsumer) (pos, tuple) -> - array[pos] = new StructInternalRow(elementType.asStructType(), tuple)); + return fillArray( + values, + array -> + (BiConsumer) + (pos, tuple) -> + array[pos] = new StructInternalRow(elementType.asStructType(), tuple)); case LIST: - return fillArray(values, array -> (BiConsumer>) (pos, list) -> - array[pos] = collectionToArrayData(elementType.asListType().elementType(), list)); + return fillArray( + values, + array -> + (BiConsumer>) + (pos, list) -> + array[pos] = + collectionToArrayData(elementType.asListType().elementType(), list)); case MAP: - return fillArray(values, array -> (BiConsumer>) (pos, map) -> - array[pos] = mapToMapData(elementType.asMapType(), map)); + return fillArray( + values, + array -> + (BiConsumer>) + (pos, map) -> array[pos] = mapToMapData(elementType.asMapType(), map)); default: throw new UnsupportedOperationException("Unsupported array element type: " + elementType); } } @SuppressWarnings("unchecked") - private GenericArrayData fillArray(Collection values, Function> makeSetter) { + private GenericArrayData fillArray( + Collection values, Function> makeSetter) { Object[] array = new Object[values.size()]; BiConsumer setter = makeSetter.apply(array); diff --git a/spark/v2.4/spark/src/main/java/org/apache/iceberg/spark/source/Writer.java b/spark/v2.4/spark/src/main/java/org/apache/iceberg/spark/source/Writer.java index c4259a9bf36e..721603a9009b 100644 --- a/spark/v2.4/spark/src/main/java/org/apache/iceberg/spark/source/Writer.java +++ b/spark/v2.4/spark/src/main/java/org/apache/iceberg/spark/source/Writer.java @@ -16,9 +16,17 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.spark.source; +import static org.apache.iceberg.TableProperties.COMMIT_MAX_RETRY_WAIT_MS; +import static org.apache.iceberg.TableProperties.COMMIT_MAX_RETRY_WAIT_MS_DEFAULT; +import static org.apache.iceberg.TableProperties.COMMIT_MIN_RETRY_WAIT_MS; +import static org.apache.iceberg.TableProperties.COMMIT_MIN_RETRY_WAIT_MS_DEFAULT; +import static org.apache.iceberg.TableProperties.COMMIT_NUM_RETRIES; +import static org.apache.iceberg.TableProperties.COMMIT_NUM_RETRIES_DEFAULT; +import static org.apache.iceberg.TableProperties.COMMIT_TOTAL_RETRY_TIME_MS; +import static org.apache.iceberg.TableProperties.COMMIT_TOTAL_RETRY_TIME_MS_DEFAULT; + import java.io.IOException; import java.util.Arrays; import java.util.Map; @@ -54,15 +62,6 @@ import org.slf4j.Logger; import org.slf4j.LoggerFactory; -import static org.apache.iceberg.TableProperties.COMMIT_MAX_RETRY_WAIT_MS; -import static org.apache.iceberg.TableProperties.COMMIT_MAX_RETRY_WAIT_MS_DEFAULT; -import static org.apache.iceberg.TableProperties.COMMIT_MIN_RETRY_WAIT_MS; -import static org.apache.iceberg.TableProperties.COMMIT_MIN_RETRY_WAIT_MS_DEFAULT; -import static org.apache.iceberg.TableProperties.COMMIT_NUM_RETRIES; -import static org.apache.iceberg.TableProperties.COMMIT_NUM_RETRIES_DEFAULT; -import static org.apache.iceberg.TableProperties.COMMIT_TOTAL_RETRY_TIME_MS; -import static org.apache.iceberg.TableProperties.COMMIT_TOTAL_RETRY_TIME_MS_DEFAULT; - // TODO: parameterize DataSourceWriter with subclass of WriterCommitMessage class Writer implements DataSourceWriter { private static final Logger LOG = LoggerFactory.getLogger(Writer.class); @@ -81,13 +80,26 @@ class Writer implements DataSourceWriter { private boolean cleanupOnAbort = true; - Writer(SparkSession spark, Table table, SparkWriteConf writeConf, boolean replacePartitions, - String applicationId, Schema writeSchema, StructType dsSchema) { + Writer( + SparkSession spark, + Table table, + SparkWriteConf writeConf, + boolean replacePartitions, + String applicationId, + Schema writeSchema, + StructType dsSchema) { this(spark, table, writeConf, replacePartitions, applicationId, null, writeSchema, dsSchema); } - Writer(SparkSession spark, Table table, SparkWriteConf writeConf, boolean replacePartitions, - String applicationId, String wapId, Schema writeSchema, StructType dsSchema) { + Writer( + SparkSession spark, + Table table, + SparkWriteConf writeConf, + boolean replacePartitions, + String applicationId, + String wapId, + Schema writeSchema, + StructType dsSchema) { this.sparkContext = JavaSparkContext.fromSparkContext(spark.sparkContext()); this.table = table; this.format = writeConf.dataFileFormat(); @@ -102,15 +114,20 @@ class Writer implements DataSourceWriter { } private boolean isWapTable() { - return Boolean.parseBoolean(table.properties().getOrDefault( - TableProperties.WRITE_AUDIT_PUBLISH_ENABLED, TableProperties.WRITE_AUDIT_PUBLISH_ENABLED_DEFAULT)); + return Boolean.parseBoolean( + table + .properties() + .getOrDefault( + TableProperties.WRITE_AUDIT_PUBLISH_ENABLED, + TableProperties.WRITE_AUDIT_PUBLISH_ENABLED_DEFAULT)); } @Override public DataWriterFactory createWriterFactory() { // broadcast the table metadata as the writer factory will be sent to executors Broadcast
    tableBroadcast = sparkContext.broadcast(SerializableTable.copyOf(table)); - return new WriterFactory(tableBroadcast, format, targetFileSize, writeSchema, dsSchema, partitionedFanoutEnabled); + return new WriterFactory( + tableBroadcast, format, targetFileSize, writeSchema, dsSchema, partitionedFanoutEnabled); } @Override @@ -188,16 +205,21 @@ public void abort(WriterCommitMessage[] messages) { Tasks.foreach(files(messages)) .retry(PropertyUtil.propertyAsInt(props, COMMIT_NUM_RETRIES, COMMIT_NUM_RETRIES_DEFAULT)) .exponentialBackoff( - PropertyUtil.propertyAsInt(props, COMMIT_MIN_RETRY_WAIT_MS, COMMIT_MIN_RETRY_WAIT_MS_DEFAULT), - PropertyUtil.propertyAsInt(props, COMMIT_MAX_RETRY_WAIT_MS, COMMIT_MAX_RETRY_WAIT_MS_DEFAULT), - PropertyUtil.propertyAsInt(props, COMMIT_TOTAL_RETRY_TIME_MS, COMMIT_TOTAL_RETRY_TIME_MS_DEFAULT), + PropertyUtil.propertyAsInt( + props, COMMIT_MIN_RETRY_WAIT_MS, COMMIT_MIN_RETRY_WAIT_MS_DEFAULT), + PropertyUtil.propertyAsInt( + props, COMMIT_MAX_RETRY_WAIT_MS, COMMIT_MAX_RETRY_WAIT_MS_DEFAULT), + PropertyUtil.propertyAsInt( + props, COMMIT_TOTAL_RETRY_TIME_MS, COMMIT_TOTAL_RETRY_TIME_MS_DEFAULT), 2.0 /* exponential */) .throwFailureWhenFinished() - .run(file -> { - table.io().deleteFile(file.path().toString()); - }); + .run( + file -> { + table.io().deleteFile(file.path().toString()); + }); } else { - LOG.warn("Skipping cleaning up of data files because Iceberg was unable to determine the final commit state"); + LOG.warn( + "Skipping cleaning up of data files because Iceberg was unable to determine the final commit state"); } } @@ -207,9 +229,13 @@ protected Table table() { protected Iterable files(WriterCommitMessage[] messages) { if (messages.length > 0) { - return Iterables.concat(Iterables.transform(Arrays.asList(messages), message -> message != null ? - ImmutableList.copyOf(((TaskCommit) message).files()) : - ImmutableList.of())); + return Iterables.concat( + Iterables.transform( + Arrays.asList(messages), + message -> + message != null + ? ImmutableList.copyOf(((TaskCommit) message).files()) + : ImmutableList.of())); } return ImmutableList.of(); } @@ -239,8 +265,13 @@ static class WriterFactory implements DataWriterFactory { private final StructType dsSchema; private final boolean partitionedFanoutEnabled; - WriterFactory(Broadcast
    tableBroadcast, FileFormat format, long targetFileSize, - Schema writeSchema, StructType dsSchema, boolean partitionedFanoutEnabled) { + WriterFactory( + Broadcast<Table>
    tableBroadcast, + FileFormat format, + long targetFileSize, + Schema writeSchema, + StructType dsSchema, + boolean partitionedFanoutEnabled) { this.tableBroadcast = tableBroadcast; this.format = format; this.targetFileSize = targetFileSize; @@ -253,28 +284,36 @@ static class WriterFactory implements DataWriterFactory { public DataWriter createDataWriter(int partitionId, long taskId, long epochId) { Table table = tableBroadcast.value(); - OutputFileFactory fileFactory = OutputFileFactory.builderFor(table, partitionId, taskId).format(format).build(); - SparkAppenderFactory appenderFactory = SparkAppenderFactory.builderFor(table, writeSchema, dsSchema).build(); + OutputFileFactory fileFactory = + OutputFileFactory.builderFor(table, partitionId, taskId).format(format).build(); + SparkAppenderFactory appenderFactory = + SparkAppenderFactory.builderFor(table, writeSchema, dsSchema).build(); PartitionSpec spec = table.spec(); FileIO io = table.io(); if (spec.isUnpartitioned()) { - return new Unpartitioned24Writer(spec, format, appenderFactory, fileFactory, io, targetFileSize); + return new Unpartitioned24Writer( + spec, format, appenderFactory, fileFactory, io, targetFileSize); } else if (partitionedFanoutEnabled) { - return new PartitionedFanout24Writer(spec, format, appenderFactory, fileFactory, io, targetFileSize, - writeSchema, dsSchema); + return new PartitionedFanout24Writer( + spec, format, appenderFactory, fileFactory, io, targetFileSize, writeSchema, dsSchema); } else { - return new Partitioned24Writer(spec, format, appenderFactory, fileFactory, io, targetFileSize, - writeSchema, dsSchema); + return new Partitioned24Writer( + spec, format, appenderFactory, fileFactory, io, targetFileSize, writeSchema, dsSchema); } } } private static class Unpartitioned24Writer extends UnpartitionedWriter implements DataWriter { - Unpartitioned24Writer(PartitionSpec spec, FileFormat format, SparkAppenderFactory appenderFactory, - OutputFileFactory fileFactory, FileIO fileIo, long targetFileSize) { + Unpartitioned24Writer( + PartitionSpec spec, + FileFormat format, + SparkAppenderFactory appenderFactory, + OutputFileFactory fileFactory, + FileIO fileIo, + long targetFileSize) { super(spec, format, appenderFactory, fileFactory, fileIo, targetFileSize); } @@ -286,12 +325,20 @@ public WriterCommitMessage commit() throws IOException { } } - private static class Partitioned24Writer extends SparkPartitionedWriter implements DataWriter { + private static class Partitioned24Writer extends SparkPartitionedWriter + implements DataWriter { - Partitioned24Writer(PartitionSpec spec, FileFormat format, SparkAppenderFactory appenderFactory, - OutputFileFactory fileFactory, FileIO fileIo, long targetFileSize, - Schema schema, StructType sparkSchema) { - super(spec, format, appenderFactory, fileFactory, fileIo, targetFileSize, schema, sparkSchema); + Partitioned24Writer( + PartitionSpec spec, + FileFormat format, + SparkAppenderFactory appenderFactory, + OutputFileFactory fileFactory, + FileIO fileIo, + long targetFileSize, + Schema schema, + StructType sparkSchema) { + super( + spec, format, appenderFactory, fileFactory, fileIo, targetFileSize, schema, sparkSchema); } @Override @@ -305,12 +352,17 @@ public WriterCommitMessage commit() throws IOException { private static class PartitionedFanout24Writer extends SparkPartitionedFanoutWriter implements DataWriter { - PartitionedFanout24Writer(PartitionSpec spec, FileFormat format, - SparkAppenderFactory appenderFactory, - OutputFileFactory fileFactory, FileIO fileIo, long 
targetFileSize, - Schema schema, StructType sparkSchema) { - super(spec, format, appenderFactory, fileFactory, fileIo, targetFileSize, schema, - sparkSchema); + PartitionedFanout24Writer( + PartitionSpec spec, + FileFormat format, + SparkAppenderFactory appenderFactory, + OutputFileFactory fileFactory, + FileIO fileIo, + long targetFileSize, + Schema schema, + StructType sparkSchema) { + super( + spec, format, appenderFactory, fileFactory, fileIo, targetFileSize, schema, sparkSchema); } @Override diff --git a/spark/v2.4/spark/src/test/java/org/apache/iceberg/KryoHelpers.java b/spark/v2.4/spark/src/test/java/org/apache/iceberg/KryoHelpers.java index ee0f0a73959a..6d88aaa11813 100644 --- a/spark/v2.4/spark/src/test/java/org/apache/iceberg/KryoHelpers.java +++ b/spark/v2.4/spark/src/test/java/org/apache/iceberg/KryoHelpers.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg; import com.esotericsoftware.kryo.Kryo; @@ -32,8 +31,7 @@ public class KryoHelpers { - private KryoHelpers() { - } + private KryoHelpers() {} @SuppressWarnings("unchecked") public static T roundTripSerialize(T obj) throws IOException { @@ -45,7 +43,8 @@ public static T roundTripSerialize(T obj) throws IOException { kryo.writeClassAndObject(out, obj); } - try (Input in = new Input(new ObjectInputStream(new ByteArrayInputStream(bytes.toByteArray())))) { + try (Input in = + new Input(new ObjectInputStream(new ByteArrayInputStream(bytes.toByteArray())))) { return (T) kryo.readClassAndObject(in); } } diff --git a/spark/v2.4/spark/src/test/java/org/apache/iceberg/TaskCheckHelper.java b/spark/v2.4/spark/src/test/java/org/apache/iceberg/TaskCheckHelper.java index 99396647ee3e..235cf69ef449 100644 --- a/spark/v2.4/spark/src/test/java/org/apache/iceberg/TaskCheckHelper.java +++ b/spark/v2.4/spark/src/test/java/org/apache/iceberg/TaskCheckHelper.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg; import java.util.Comparator; @@ -25,15 +24,14 @@ import org.junit.Assert; public final class TaskCheckHelper { - private TaskCheckHelper() { - } + private TaskCheckHelper() {} public static void assertEquals(BaseCombinedScanTask expected, BaseCombinedScanTask actual) { List expectedTasks = getFileScanTasksInFilePathOrder(expected); List actualTasks = getFileScanTasksInFilePathOrder(actual); - Assert.assertEquals("The number of file scan tasks should match", - expectedTasks.size(), actualTasks.size()); + Assert.assertEquals( + "The number of file scan tasks should match", expectedTasks.size(), actualTasks.size()); for (int i = 0; i < expectedTasks.size(); i++) { FileScanTask expectedTask = expectedTasks.get(i); @@ -50,38 +48,56 @@ public static void assertEquals(FileScanTask expected, FileScanTask actual) { Assert.assertEquals("starting position doesn't match", expected.start(), actual.start()); - Assert.assertEquals("the number of bytes to scan doesn't match", expected.start(), actual.start()); + Assert.assertEquals( + "the number of bytes to scan doesn't match", expected.start(), actual.start()); // simplify comparison on residual expression via comparing toString - Assert.assertEquals("Residual expression doesn't match", - expected.residual().toString(), actual.residual().toString()); + Assert.assertEquals( + "Residual expression doesn't match", + expected.residual().toString(), + actual.residual().toString()); } public static void assertEquals(DataFile expected, DataFile actual) { - Assert.assertEquals("Should match the serialized record path", - expected.path(), actual.path()); - Assert.assertEquals("Should match the serialized record format", - expected.format(), actual.format()); - Assert.assertEquals("Should match the serialized record partition", - expected.partition().get(0, Object.class), actual.partition().get(0, Object.class)); - Assert.assertEquals("Should match the serialized record count", - expected.recordCount(), actual.recordCount()); - Assert.assertEquals("Should match the serialized record size", - expected.fileSizeInBytes(), actual.fileSizeInBytes()); - Assert.assertEquals("Should match the serialized record value counts", - expected.valueCounts(), actual.valueCounts()); - Assert.assertEquals("Should match the serialized record null value counts", - expected.nullValueCounts(), actual.nullValueCounts()); - Assert.assertEquals("Should match the serialized record lower bounds", - expected.lowerBounds(), actual.lowerBounds()); - Assert.assertEquals("Should match the serialized record upper bounds", - expected.upperBounds(), actual.upperBounds()); - Assert.assertEquals("Should match the serialized record key metadata", - expected.keyMetadata(), actual.keyMetadata()); - Assert.assertEquals("Should match the serialized record offsets", - expected.splitOffsets(), actual.splitOffsets()); - Assert.assertEquals("Should match the serialized record offsets", - expected.keyMetadata(), actual.keyMetadata()); + Assert.assertEquals("Should match the serialized record path", expected.path(), actual.path()); + Assert.assertEquals( + "Should match the serialized record format", expected.format(), actual.format()); + Assert.assertEquals( + "Should match the serialized record partition", + expected.partition().get(0, Object.class), + actual.partition().get(0, Object.class)); + Assert.assertEquals( + "Should match the serialized record count", expected.recordCount(), actual.recordCount()); + Assert.assertEquals( + "Should match the serialized record size", + 
expected.fileSizeInBytes(), + actual.fileSizeInBytes()); + Assert.assertEquals( + "Should match the serialized record value counts", + expected.valueCounts(), + actual.valueCounts()); + Assert.assertEquals( + "Should match the serialized record null value counts", + expected.nullValueCounts(), + actual.nullValueCounts()); + Assert.assertEquals( + "Should match the serialized record lower bounds", + expected.lowerBounds(), + actual.lowerBounds()); + Assert.assertEquals( + "Should match the serialized record upper bounds", + expected.upperBounds(), + actual.upperBounds()); + Assert.assertEquals( + "Should match the serialized record key metadata", + expected.keyMetadata(), + actual.keyMetadata()); + Assert.assertEquals( + "Should match the serialized record offsets", + expected.splitOffsets(), + actual.splitOffsets()); + Assert.assertEquals( + "Should match the serialized record offsets", expected.keyMetadata(), actual.keyMetadata()); } private static List getFileScanTasksInFilePathOrder(BaseCombinedScanTask task) { diff --git a/spark/v2.4/spark/src/test/java/org/apache/iceberg/TestDataFileSerialization.java b/spark/v2.4/spark/src/test/java/org/apache/iceberg/TestDataFileSerialization.java index 12fa8b2fc539..33b5316b72b7 100644 --- a/spark/v2.4/spark/src/test/java/org/apache/iceberg/TestDataFileSerialization.java +++ b/spark/v2.4/spark/src/test/java/org/apache/iceberg/TestDataFileSerialization.java @@ -16,9 +16,12 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg; +import static org.apache.iceberg.TaskCheckHelper.assertEquals; +import static org.apache.iceberg.types.Types.NestedField.optional; +import static org.apache.iceberg.types.Types.NestedField.required; + import com.esotericsoftware.kryo.Kryo; import com.esotericsoftware.kryo.io.Input; import com.esotericsoftware.kryo.io.Output; @@ -51,22 +54,17 @@ import org.junit.Test; import org.junit.rules.TemporaryFolder; -import static org.apache.iceberg.TaskCheckHelper.assertEquals; -import static org.apache.iceberg.types.Types.NestedField.optional; -import static org.apache.iceberg.types.Types.NestedField.required; - public class TestDataFileSerialization { - private static final Schema DATE_SCHEMA = new Schema( - required(1, "id", Types.LongType.get()), - optional(2, "data", Types.StringType.get()), - required(3, "date", Types.StringType.get()), - optional(4, "double", Types.DoubleType.get())); + private static final Schema DATE_SCHEMA = + new Schema( + required(1, "id", Types.LongType.get()), + optional(2, "data", Types.StringType.get()), + required(3, "date", Types.StringType.get()), + optional(4, "double", Types.DoubleType.get())); - private static final PartitionSpec PARTITION_SPEC = PartitionSpec - .builderFor(DATE_SCHEMA) - .identity("date") - .build(); + private static final PartitionSpec PARTITION_SPEC = + PartitionSpec.builderFor(DATE_SCHEMA).identity("date").build(); private static final Map VALUE_COUNTS = Maps.newHashMap(); private static final Map NULL_VALUE_COUNTS = Maps.newHashMap(); @@ -85,20 +83,26 @@ public class TestDataFileSerialization { UPPER_BOUNDS.put(1, longToBuffer(4L)); } - private static final DataFile DATA_FILE = DataFiles - .builder(PARTITION_SPEC) - .withPath("/path/to/data-1.parquet") - .withFileSizeInBytes(1234) - .withPartitionPath("date=2018-06-08") - .withMetrics(new Metrics( - 5L, null, VALUE_COUNTS, NULL_VALUE_COUNTS, NAN_VALUE_COUNTS, LOWER_BOUNDS, UPPER_BOUNDS)) - .withSplitOffsets(ImmutableList.of(4L)) - 
.withEncryptionKeyMetadata(ByteBuffer.allocate(4).putInt(34)) - .withSortOrder(SortOrder.unsorted()) - .build(); - - @Rule - public TemporaryFolder temp = new TemporaryFolder(); + private static final DataFile DATA_FILE = + DataFiles.builder(PARTITION_SPEC) + .withPath("/path/to/data-1.parquet") + .withFileSizeInBytes(1234) + .withPartitionPath("date=2018-06-08") + .withMetrics( + new Metrics( + 5L, + null, + VALUE_COUNTS, + NULL_VALUE_COUNTS, + NAN_VALUE_COUNTS, + LOWER_BOUNDS, + UPPER_BOUNDS)) + .withSplitOffsets(ImmutableList.of(4L)) + .withEncryptionKeyMetadata(ByteBuffer.allocate(4).putInt(34)) + .withSortOrder(SortOrder.unsorted()) + .build(); + + @Rule public TemporaryFolder temp = new TemporaryFolder(); @Test public void testDataFileKryoSerialization() throws Exception { @@ -128,7 +132,8 @@ public void testDataFileJavaSerialization() throws Exception { out.writeObject(DATA_FILE.copy()); } - try (ObjectInputStream in = new ObjectInputStream(new ByteArrayInputStream(bytes.toByteArray()))) { + try (ObjectInputStream in = + new ObjectInputStream(new ByteArrayInputStream(bytes.toByteArray()))) { for (int i = 0; i < 2; i += 1) { Object obj = in.readObject(); Assertions.assertThat(obj).as("Should be a DataFile").isInstanceOf(DataFile.class); @@ -140,13 +145,14 @@ public void testDataFileJavaSerialization() throws Exception { @Test public void testParquetWriterSplitOffsets() throws IOException { Iterable records = RandomData.generateSpark(DATE_SCHEMA, 1, 33L); - File parquetFile = new File( - temp.getRoot(), - FileFormat.PARQUET.addExtension(UUID.randomUUID().toString())); + File parquetFile = + new File(temp.getRoot(), FileFormat.PARQUET.addExtension(UUID.randomUUID().toString())); FileAppender writer = Parquet.write(Files.localOutput(parquetFile)) .schema(DATE_SCHEMA) - .createWriterFunc(msgType -> SparkParquetWriters.buildWriter(SparkSchemaUtil.convert(DATE_SCHEMA), msgType)) + .createWriterFunc( + msgType -> + SparkParquetWriters.buildWriter(SparkSchemaUtil.convert(DATE_SCHEMA), msgType)) .build(); try { writer.addAll(records); diff --git a/spark/v2.4/spark/src/test/java/org/apache/iceberg/TestFileIOSerialization.java b/spark/v2.4/spark/src/test/java/org/apache/iceberg/TestFileIOSerialization.java index 7c14485ff9a0..b44e6cbb8d4c 100644 --- a/spark/v2.4/spark/src/test/java/org/apache/iceberg/TestFileIOSerialization.java +++ b/spark/v2.4/spark/src/test/java/org/apache/iceberg/TestFileIOSerialization.java @@ -16,9 +16,11 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg; +import static org.apache.iceberg.types.Types.NestedField.optional; +import static org.apache.iceberg.types.Types.NestedField.required; + import java.io.File; import java.io.IOException; import java.util.Map; @@ -35,36 +37,29 @@ import org.junit.Test; import org.junit.rules.TemporaryFolder; -import static org.apache.iceberg.types.Types.NestedField.optional; -import static org.apache.iceberg.types.Types.NestedField.required; - public class TestFileIOSerialization { private static final Configuration CONF = new Configuration(); private static final HadoopTables TABLES = new HadoopTables(CONF); - private static final Schema SCHEMA = new Schema( - required(1, "id", Types.LongType.get()), - optional(2, "data", Types.StringType.get()), - required(3, "date", Types.StringType.get()), - optional(4, "double", Types.DoubleType.get())); + private static final Schema SCHEMA = + new Schema( + required(1, "id", Types.LongType.get()), + optional(2, "data", Types.StringType.get()), + required(3, "date", Types.StringType.get()), + optional(4, "double", Types.DoubleType.get())); - private static final PartitionSpec SPEC = PartitionSpec - .builderFor(SCHEMA) - .identity("date") - .build(); + private static final PartitionSpec SPEC = + PartitionSpec.builderFor(SCHEMA).identity("date").build(); - private static final SortOrder SORT_ORDER = SortOrder.builderFor(SCHEMA) - .asc("id") - .build(); + private static final SortOrder SORT_ORDER = SortOrder.builderFor(SCHEMA).asc("id").build(); static { CONF.set("k1", "v1"); CONF.set("k2", "v2"); } - @Rule - public TemporaryFolder temp = new TemporaryFolder(); + @Rule public TemporaryFolder temp = new TemporaryFolder(); private Table table; @Before diff --git a/spark/v2.4/spark/src/test/java/org/apache/iceberg/TestManifestFileSerialization.java b/spark/v2.4/spark/src/test/java/org/apache/iceberg/TestManifestFileSerialization.java index 25004aa110e4..a20b2d9f05de 100644 --- a/spark/v2.4/spark/src/test/java/org/apache/iceberg/TestManifestFileSerialization.java +++ b/spark/v2.4/spark/src/test/java/org/apache/iceberg/TestManifestFileSerialization.java @@ -16,9 +16,11 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg; +import static org.apache.iceberg.types.Types.NestedField.optional; +import static org.apache.iceberg.types.Types.NestedField.required; + import com.esotericsoftware.kryo.Kryo; import com.esotericsoftware.kryo.io.Input; import com.esotericsoftware.kryo.io.Output; @@ -47,56 +49,57 @@ import org.junit.Test; import org.junit.rules.TemporaryFolder; -import static org.apache.iceberg.types.Types.NestedField.optional; -import static org.apache.iceberg.types.Types.NestedField.required; - public class TestManifestFileSerialization { - private static final Schema SCHEMA = new Schema( - required(1, "id", Types.LongType.get()), - optional(2, "data", Types.StringType.get()), - required(3, "date", Types.StringType.get()), - required(4, "double", Types.DoubleType.get())); - - private static final PartitionSpec SPEC = PartitionSpec - .builderFor(SCHEMA) - .identity("double") - .build(); - - private static final DataFile FILE_A = DataFiles.builder(SPEC) - .withPath("/path/to/data-1.parquet") - .withFileSizeInBytes(0) - .withPartition(TestHelpers.Row.of(1D)) - .withPartitionPath("double=1") - .withMetrics(new Metrics(5L, - null, // no column sizes - ImmutableMap.of(1, 5L, 2, 3L), // value count - ImmutableMap.of(1, 0L, 2, 2L), // null count - ImmutableMap.of(), // nan count - ImmutableMap.of(1, longToBuffer(0L)), // lower bounds - ImmutableMap.of(1, longToBuffer(4L)) // upper bounds - )) - .build(); - - private static final DataFile FILE_B = DataFiles.builder(SPEC) - .withPath("/path/to/data-2.parquet") - .withFileSizeInBytes(0) - .withPartition(TestHelpers.Row.of(Double.NaN)) - .withPartitionPath("double=NaN") - .withMetrics(new Metrics(1L, - null, // no column sizes - ImmutableMap.of(1, 1L, 4, 1L), // value count - ImmutableMap.of(1, 0L, 2, 0L), // null count - ImmutableMap.of(4, 1L), // nan count - ImmutableMap.of(1, longToBuffer(0L)), // lower bounds - ImmutableMap.of(1, longToBuffer(1L)) // upper bounds - )) - .build(); + private static final Schema SCHEMA = + new Schema( + required(1, "id", Types.LongType.get()), + optional(2, "data", Types.StringType.get()), + required(3, "date", Types.StringType.get()), + required(4, "double", Types.DoubleType.get())); + + private static final PartitionSpec SPEC = + PartitionSpec.builderFor(SCHEMA).identity("double").build(); + + private static final DataFile FILE_A = + DataFiles.builder(SPEC) + .withPath("/path/to/data-1.parquet") + .withFileSizeInBytes(0) + .withPartition(TestHelpers.Row.of(1D)) + .withPartitionPath("double=1") + .withMetrics( + new Metrics( + 5L, + null, // no column sizes + ImmutableMap.of(1, 5L, 2, 3L), // value count + ImmutableMap.of(1, 0L, 2, 2L), // null count + ImmutableMap.of(), // nan count + ImmutableMap.of(1, longToBuffer(0L)), // lower bounds + ImmutableMap.of(1, longToBuffer(4L)) // upper bounds + )) + .build(); + + private static final DataFile FILE_B = + DataFiles.builder(SPEC) + .withPath("/path/to/data-2.parquet") + .withFileSizeInBytes(0) + .withPartition(TestHelpers.Row.of(Double.NaN)) + .withPartitionPath("double=NaN") + .withMetrics( + new Metrics( + 1L, + null, // no column sizes + ImmutableMap.of(1, 1L, 4, 1L), // value count + ImmutableMap.of(1, 0L, 2, 0L), // null count + ImmutableMap.of(4, 1L), // nan count + ImmutableMap.of(1, longToBuffer(0L)), // lower bounds + ImmutableMap.of(1, longToBuffer(1L)) // upper bounds + )) + .build(); private static final FileIO FILE_IO = new HadoopFileIO(new Configuration()); - @Rule - public TemporaryFolder temp = new TemporaryFolder(); + @Rule public 
TemporaryFolder temp = new TemporaryFolder(); @Test public void testManifestFileKryoSerialization() throws IOException { @@ -134,7 +137,8 @@ public void testManifestFileJavaSerialization() throws Exception { out.writeObject(GenericManifestFile.copyOf(manifest).build()); } - try (ObjectInputStream in = new ObjectInputStream(new ByteArrayInputStream(bytes.toByteArray()))) { + try (ObjectInputStream in = + new ObjectInputStream(new ByteArrayInputStream(bytes.toByteArray()))) { for (int i = 0; i < 3; i += 1) { Object obj = in.readObject(); Assertions.assertThat(obj).as("Should be a ManifestFile").isInstanceOf(ManifestFile.class); @@ -148,27 +152,46 @@ private void checkManifestFile(ManifestFile expected, ManifestFile actual) { Assert.assertEquals("Length must match", expected.length(), actual.length()); Assert.assertEquals("Spec id must match", expected.partitionSpecId(), actual.partitionSpecId()); Assert.assertEquals("Snapshot id must match", expected.snapshotId(), actual.snapshotId()); - Assert.assertEquals("Added files flag must match", expected.hasAddedFiles(), actual.hasAddedFiles()); - Assert.assertEquals("Added files count must match", expected.addedFilesCount(), actual.addedFilesCount()); - Assert.assertEquals("Added rows count must match", expected.addedRowsCount(), actual.addedRowsCount()); - Assert.assertEquals("Existing files flag must match", expected.hasExistingFiles(), actual.hasExistingFiles()); - Assert.assertEquals("Existing files count must match", expected.existingFilesCount(), actual.existingFilesCount()); - Assert.assertEquals("Existing rows count must match", expected.existingRowsCount(), actual.existingRowsCount()); - Assert.assertEquals("Deleted files flag must match", expected.hasDeletedFiles(), actual.hasDeletedFiles()); - Assert.assertEquals("Deleted files count must match", expected.deletedFilesCount(), actual.deletedFilesCount()); - Assert.assertEquals("Deleted rows count must match", expected.deletedRowsCount(), actual.deletedRowsCount()); + Assert.assertEquals( + "Added files flag must match", expected.hasAddedFiles(), actual.hasAddedFiles()); + Assert.assertEquals( + "Added files count must match", expected.addedFilesCount(), actual.addedFilesCount()); + Assert.assertEquals( + "Added rows count must match", expected.addedRowsCount(), actual.addedRowsCount()); + Assert.assertEquals( + "Existing files flag must match", expected.hasExistingFiles(), actual.hasExistingFiles()); + Assert.assertEquals( + "Existing files count must match", + expected.existingFilesCount(), + actual.existingFilesCount()); + Assert.assertEquals( + "Existing rows count must match", expected.existingRowsCount(), actual.existingRowsCount()); + Assert.assertEquals( + "Deleted files flag must match", expected.hasDeletedFiles(), actual.hasDeletedFiles()); + Assert.assertEquals( + "Deleted files count must match", expected.deletedFilesCount(), actual.deletedFilesCount()); + Assert.assertEquals( + "Deleted rows count must match", expected.deletedRowsCount(), actual.deletedRowsCount()); PartitionFieldSummary expectedPartition = expected.partitions().get(0); PartitionFieldSummary actualPartition = actual.partitions().get(0); - Assert.assertEquals("Null flag in partition must match", - expectedPartition.containsNull(), actualPartition.containsNull()); - Assert.assertEquals("NaN flag in partition must match", - expectedPartition.containsNaN(), actualPartition.containsNaN()); - Assert.assertEquals("Lower bounds in partition must match", - expectedPartition.lowerBound(), actualPartition.lowerBound()); 
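Pieced together from the reformatted hunks above, the Java-serialization round trip that testManifestFileJavaSerialization exercises reads roughly as the sketch below. It leans on the test class's own members: manifest stands for a ManifestFile produced by writeManifest(FILE_A, FILE_B), and only a single write/read pair is shown rather than the test's full loop.

ByteArrayOutputStream bytes = new ByteArrayOutputStream();
try (ObjectOutputStream out = new ObjectOutputStream(bytes)) {
  // GenericManifestFile.copyOf(...) builds a copy of the manifest metadata to serialize
  out.writeObject(GenericManifestFile.copyOf(manifest).build());
}

try (ObjectInputStream in =
    new ObjectInputStream(new ByteArrayInputStream(bytes.toByteArray()))) {
  Object obj = in.readObject();
  Assertions.assertThat(obj).as("Should be a ManifestFile").isInstanceOf(ManifestFile.class);
  checkManifestFile(manifest, (ManifestFile) obj);
}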
- Assert.assertEquals("Upper bounds in partition must match", - expectedPartition.upperBound(), actualPartition.upperBound()); + Assert.assertEquals( + "Null flag in partition must match", + expectedPartition.containsNull(), + actualPartition.containsNull()); + Assert.assertEquals( + "NaN flag in partition must match", + expectedPartition.containsNaN(), + actualPartition.containsNaN()); + Assert.assertEquals( + "Lower bounds in partition must match", + expectedPartition.lowerBound(), + actualPartition.lowerBound()); + Assert.assertEquals( + "Upper bounds in partition must match", + expectedPartition.upperBound(), + actualPartition.upperBound()); } private ManifestFile writeManifest(DataFile... files) throws IOException { diff --git a/spark/v2.4/spark/src/test/java/org/apache/iceberg/TestScanTaskSerialization.java b/spark/v2.4/spark/src/test/java/org/apache/iceberg/TestScanTaskSerialization.java index e234ee2617aa..4dd34f7a7611 100644 --- a/spark/v2.4/spark/src/test/java/org/apache/iceberg/TestScanTaskSerialization.java +++ b/spark/v2.4/spark/src/test/java/org/apache/iceberg/TestScanTaskSerialization.java @@ -16,9 +16,10 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg; +import static org.apache.iceberg.types.Types.NestedField.optional; + import com.esotericsoftware.kryo.Kryo; import com.esotericsoftware.kryo.io.Input; import com.esotericsoftware.kryo.io.Output; @@ -50,19 +51,16 @@ import org.junit.Test; import org.junit.rules.TemporaryFolder; -import static org.apache.iceberg.types.Types.NestedField.optional; - public class TestScanTaskSerialization extends SparkTestBase { private static final HadoopTables TABLES = new HadoopTables(new Configuration()); - private static final Schema SCHEMA = new Schema( - optional(1, "c1", Types.IntegerType.get()), - optional(2, "c2", Types.StringType.get()), - optional(3, "c3", Types.StringType.get()) - ); + private static final Schema SCHEMA = + new Schema( + optional(1, "c1", Types.IntegerType.get()), + optional(2, "c2", Types.StringType.get()), + optional(3, "c3", Types.StringType.get())); - @Rule - public TemporaryFolder temp = new TemporaryFolder(); + @Rule public TemporaryFolder temp = new TemporaryFolder(); private String tableLocation = null; @@ -86,7 +84,9 @@ public void testBaseCombinedScanTaskKryoSerialization() throws Exception { try (Input in = new Input(new FileInputStream(data))) { Object obj = kryo.readClassAndObject(in); - Assertions.assertThat(obj).as("Should be a BaseCombinedScanTask").isInstanceOf(BaseCombinedScanTask.class); + Assertions.assertThat(obj) + .as("Should be a BaseCombinedScanTask") + .isInstanceOf(BaseCombinedScanTask.class); TaskCheckHelper.assertEquals(scanTask, (BaseCombinedScanTask) obj); } } @@ -100,9 +100,12 @@ public void testBaseCombinedScanTaskJavaSerialization() throws Exception { out.writeObject(scanTask); } - try (ObjectInputStream in = new ObjectInputStream(new ByteArrayInputStream(bytes.toByteArray()))) { + try (ObjectInputStream in = + new ObjectInputStream(new ByteArrayInputStream(bytes.toByteArray()))) { Object obj = in.readObject(); - Assertions.assertThat(obj).as("Should be a BaseCombinedScanTask").isInstanceOf(BaseCombinedScanTask.class); + Assertions.assertThat(obj) + .as("Should be a BaseCombinedScanTask") + .isInstanceOf(BaseCombinedScanTask.class); TaskCheckHelper.assertEquals(scanTask, (BaseCombinedScanTask) obj); } } @@ -112,16 +115,15 @@ private BaseCombinedScanTask prepareBaseCombinedScanTaskForSerDeTest() { Map options = 
Maps.newHashMap(); Table table = TABLES.create(SCHEMA, spec, options, tableLocation); - List records1 = Lists.newArrayList( - new ThreeColumnRecord(1, null, "AAAA"), - new ThreeColumnRecord(1, "BBBBBBBBBB", "BBBB") - ); + List records1 = + Lists.newArrayList( + new ThreeColumnRecord(1, null, "AAAA"), new ThreeColumnRecord(1, "BBBBBBBBBB", "BBBB")); writeRecords(records1); - List records2 = Lists.newArrayList( - new ThreeColumnRecord(2, "CCCCCCCCCC", "CCCC"), - new ThreeColumnRecord(2, "DDDDDDDDDD", "DDDD") - ); + List records2 = + Lists.newArrayList( + new ThreeColumnRecord(2, "CCCCCCCCCC", "CCCC"), + new ThreeColumnRecord(2, "DDDDDDDDDD", "DDDD")); writeRecords(records2); table.refresh(); @@ -136,10 +138,6 @@ private void writeRecords(List records) { } private void writeDF(Dataset df) { - df.select("c1", "c2", "c3") - .write() - .format("iceberg") - .mode("append") - .save(tableLocation); + df.select("c1", "c2", "c3").write().format("iceberg").mode("append").save(tableLocation); } } diff --git a/spark/v2.4/spark/src/test/java/org/apache/iceberg/TestTableSerialization.java b/spark/v2.4/spark/src/test/java/org/apache/iceberg/TestTableSerialization.java index 88e30c0ece68..a4b86752cf3b 100644 --- a/spark/v2.4/spark/src/test/java/org/apache/iceberg/TestTableSerialization.java +++ b/spark/v2.4/spark/src/test/java/org/apache/iceberg/TestTableSerialization.java @@ -16,9 +16,11 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg; +import static org.apache.iceberg.types.Types.NestedField.optional; +import static org.apache.iceberg.types.Types.NestedField.required; + import java.io.File; import java.io.IOException; import java.util.Map; @@ -31,30 +33,23 @@ import org.junit.Test; import org.junit.rules.TemporaryFolder; -import static org.apache.iceberg.types.Types.NestedField.optional; -import static org.apache.iceberg.types.Types.NestedField.required; - public class TestTableSerialization { private static final HadoopTables TABLES = new HadoopTables(); - private static final Schema SCHEMA = new Schema( - required(1, "id", Types.LongType.get()), - optional(2, "data", Types.StringType.get()), - required(3, "date", Types.StringType.get()), - optional(4, "double", Types.DoubleType.get())); + private static final Schema SCHEMA = + new Schema( + required(1, "id", Types.LongType.get()), + optional(2, "data", Types.StringType.get()), + required(3, "date", Types.StringType.get()), + optional(4, "double", Types.DoubleType.get())); - private static final PartitionSpec SPEC = PartitionSpec - .builderFor(SCHEMA) - .identity("date") - .build(); + private static final PartitionSpec SPEC = + PartitionSpec.builderFor(SCHEMA).identity("date").build(); - private static final SortOrder SORT_ORDER = SortOrder.builderFor(SCHEMA) - .asc("id") - .build(); + private static final SortOrder SORT_ORDER = SortOrder.builderFor(SCHEMA).asc("id").build(); - @Rule - public TemporaryFolder temp = new TemporaryFolder(); + @Rule public TemporaryFolder temp = new TemporaryFolder(); private Table table; @Before @@ -70,19 +65,20 @@ public void initTable() throws IOException { @Test public void testSerializableTableKryoSerialization() throws IOException { Table serializableTable = SerializableTable.copyOf(table); - TestHelpers.assertSerializedAndLoadedMetadata(table, KryoHelpers.roundTripSerialize(serializableTable)); + TestHelpers.assertSerializedAndLoadedMetadata( + table, KryoHelpers.roundTripSerialize(serializableTable)); } @Test public void 
testSerializableMetadataTableKryoSerialization() throws IOException { for (MetadataTableType type : MetadataTableType.values()) { TableOperations ops = ((HasTableOperations) table).operations(); - Table metadataTable = MetadataTableUtils.createMetadataTableInstance(ops, table.name(), "meta", type); + Table metadataTable = + MetadataTableUtils.createMetadataTableInstance(ops, table.name(), "meta", type); Table serializableMetadataTable = SerializableTable.copyOf(metadataTable); TestHelpers.assertSerializedAndLoadedMetadata( - metadataTable, - KryoHelpers.roundTripSerialize(serializableMetadataTable)); + metadataTable, KryoHelpers.roundTripSerialize(serializableMetadataTable)); } } @@ -90,13 +86,12 @@ public void testSerializableMetadataTableKryoSerialization() throws IOException public void testSerializableTransactionTableKryoSerialization() throws IOException { Transaction txn = table.newTransaction(); - txn.updateProperties() - .set("k1", "v1") - .commit(); + txn.updateProperties().set("k1", "v1").commit(); Table txnTable = txn.table(); Table serializableTxnTable = SerializableTable.copyOf(txnTable); - TestHelpers.assertSerializedMetadata(txnTable, KryoHelpers.roundTripSerialize(serializableTxnTable)); + TestHelpers.assertSerializedMetadata( + txnTable, KryoHelpers.roundTripSerialize(serializableTxnTable)); } } diff --git a/spark/v2.4/spark/src/test/java/org/apache/iceberg/actions/TestRewriteDataFilesAction.java b/spark/v2.4/spark/src/test/java/org/apache/iceberg/actions/TestRewriteDataFilesAction.java index 91b50cf1f029..226cc897856f 100644 --- a/spark/v2.4/spark/src/test/java/org/apache/iceberg/actions/TestRewriteDataFilesAction.java +++ b/spark/v2.4/spark/src/test/java/org/apache/iceberg/actions/TestRewriteDataFilesAction.java @@ -16,9 +16,10 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.actions; +import static org.apache.iceberg.types.Types.NestedField.optional; + import java.io.File; import java.util.Collections; import java.util.Comparator; @@ -50,19 +51,16 @@ import org.junit.Test; import org.junit.rules.TemporaryFolder; -import static org.apache.iceberg.types.Types.NestedField.optional; - public class TestRewriteDataFilesAction extends SparkTestBase { private static final HadoopTables TABLES = new HadoopTables(new Configuration()); - private static final Schema SCHEMA = new Schema( - optional(1, "c1", Types.IntegerType.get()), - optional(2, "c2", Types.StringType.get()), - optional(3, "c3", Types.StringType.get()) - ); + private static final Schema SCHEMA = + new Schema( + optional(1, "c1", Types.IntegerType.get()), + optional(2, "c2", Types.StringType.get()), + optional(3, "c3", Types.StringType.get())); - @Rule - public TemporaryFolder temp = new TemporaryFolder(); + @Rule public TemporaryFolder temp = new TemporaryFolder(); private String tableLocation = null; @@ -93,22 +91,22 @@ public void testRewriteDataFilesUnpartitionedTable() { Map options = Maps.newHashMap(); Table table = TABLES.create(SCHEMA, spec, options, tableLocation); - List records1 = Lists.newArrayList( - new ThreeColumnRecord(1, null, "AAAA"), - new ThreeColumnRecord(1, "BBBBBBBBBB", "BBBB") - ); + List records1 = + Lists.newArrayList( + new ThreeColumnRecord(1, null, "AAAA"), new ThreeColumnRecord(1, "BBBBBBBBBB", "BBBB")); writeRecords(records1); - List records2 = Lists.newArrayList( - new ThreeColumnRecord(2, "CCCCCCCCCC", "CCCC"), - new ThreeColumnRecord(2, "DDDDDDDDDD", "DDDD") - ); + List records2 = + Lists.newArrayList( + new ThreeColumnRecord(2, "CCCCCCCCCC", "CCCC"), + new ThreeColumnRecord(2, "DDDDDDDDDD", "DDDD")); writeRecords(records2); table.refresh(); CloseableIterable tasks = table.newScan().planFiles(); - List dataFiles = Lists.newArrayList(CloseableIterable.transform(tasks, FileScanTask::file)); + List dataFiles = + Lists.newArrayList(CloseableIterable.transform(tasks, FileScanTask::file)); Assert.assertEquals("Should have 4 data files before rewrite", 4, dataFiles.size()); Actions actions = Actions.forTable(table); @@ -120,7 +118,8 @@ public void testRewriteDataFilesUnpartitionedTable() { table.refresh(); CloseableIterable tasks1 = table.newScan().planFiles(); - List dataFiles1 = Lists.newArrayList(CloseableIterable.transform(tasks1, FileScanTask::file)); + List dataFiles1 = + Lists.newArrayList(CloseableIterable.transform(tasks1, FileScanTask::file)); Assert.assertEquals("Should have 1 data files before rewrite", 1, dataFiles1.size()); List expectedRecords = Lists.newArrayList(); @@ -128,50 +127,47 @@ public void testRewriteDataFilesUnpartitionedTable() { expectedRecords.addAll(records2); Dataset resultDF = spark.read().format("iceberg").load(tableLocation); - List actualRecords = resultDF.sort("c1", "c2") - .as(Encoders.bean(ThreeColumnRecord.class)) - .collectAsList(); + List actualRecords = + resultDF.sort("c1", "c2").as(Encoders.bean(ThreeColumnRecord.class)).collectAsList(); Assert.assertEquals("Rows must match", expectedRecords, actualRecords); } @Test public void testRewriteDataFilesPartitionedTable() { - PartitionSpec spec = PartitionSpec.builderFor(SCHEMA) - .identity("c1") - .truncate("c2", 2) - .build(); + PartitionSpec spec = PartitionSpec.builderFor(SCHEMA).identity("c1").truncate("c2", 2).build(); Map options = Maps.newHashMap(); Table table = TABLES.create(SCHEMA, spec, options, tableLocation); - List records1 = Lists.newArrayList( 
- new ThreeColumnRecord(1, "AAAAAAAAAA", "AAAA"), - new ThreeColumnRecord(1, "AAAAAAAAAA", "CCCC") - ); + List records1 = + Lists.newArrayList( + new ThreeColumnRecord(1, "AAAAAAAAAA", "AAAA"), + new ThreeColumnRecord(1, "AAAAAAAAAA", "CCCC")); writeRecords(records1); - List records2 = Lists.newArrayList( - new ThreeColumnRecord(1, "BBBBBBBBBB", "BBBB"), - new ThreeColumnRecord(1, "BBBBBBBBBB", "DDDD") - ); + List records2 = + Lists.newArrayList( + new ThreeColumnRecord(1, "BBBBBBBBBB", "BBBB"), + new ThreeColumnRecord(1, "BBBBBBBBBB", "DDDD")); writeRecords(records2); - List records3 = Lists.newArrayList( - new ThreeColumnRecord(2, "AAAAAAAAAA", "EEEE"), - new ThreeColumnRecord(2, "AAAAAAAAAA", "GGGG") - ); + List records3 = + Lists.newArrayList( + new ThreeColumnRecord(2, "AAAAAAAAAA", "EEEE"), + new ThreeColumnRecord(2, "AAAAAAAAAA", "GGGG")); writeRecords(records3); - List records4 = Lists.newArrayList( - new ThreeColumnRecord(2, "BBBBBBBBBB", "FFFF"), - new ThreeColumnRecord(2, "BBBBBBBBBB", "HHHH") - ); + List records4 = + Lists.newArrayList( + new ThreeColumnRecord(2, "BBBBBBBBBB", "FFFF"), + new ThreeColumnRecord(2, "BBBBBBBBBB", "HHHH")); writeRecords(records4); table.refresh(); CloseableIterable tasks = table.newScan().planFiles(); - List dataFiles = Lists.newArrayList(CloseableIterable.transform(tasks, FileScanTask::file)); + List dataFiles = + Lists.newArrayList(CloseableIterable.transform(tasks, FileScanTask::file)); Assert.assertEquals("Should have 8 data files before rewrite", 8, dataFiles.size()); Actions actions = Actions.forTable(table); @@ -183,7 +179,8 @@ public void testRewriteDataFilesPartitionedTable() { table.refresh(); CloseableIterable tasks1 = table.newScan().planFiles(); - List dataFiles1 = Lists.newArrayList(CloseableIterable.transform(tasks1, FileScanTask::file)); + List dataFiles1 = + Lists.newArrayList(CloseableIterable.transform(tasks1, FileScanTask::file)); Assert.assertEquals("Should have 4 data files before rewrite", 4, dataFiles1.size()); List expectedRecords = Lists.newArrayList(); @@ -193,66 +190,65 @@ public void testRewriteDataFilesPartitionedTable() { expectedRecords.addAll(records4); Dataset resultDF = spark.read().format("iceberg").load(tableLocation); - List actualRecords = resultDF.sort("c1", "c2", "c3") - .as(Encoders.bean(ThreeColumnRecord.class)) - .collectAsList(); + List actualRecords = + resultDF.sort("c1", "c2", "c3").as(Encoders.bean(ThreeColumnRecord.class)).collectAsList(); Assert.assertEquals("Rows must match", expectedRecords, actualRecords); } @Test public void testRewriteDataFilesWithFilter() { - PartitionSpec spec = PartitionSpec.builderFor(SCHEMA) - .identity("c1") - .truncate("c2", 2) - .build(); + PartitionSpec spec = PartitionSpec.builderFor(SCHEMA).identity("c1").truncate("c2", 2).build(); Map options = Maps.newHashMap(); Table table = TABLES.create(SCHEMA, spec, options, tableLocation); - List records1 = Lists.newArrayList( - new ThreeColumnRecord(1, "AAAAAAAAAA", "AAAA"), - new ThreeColumnRecord(1, "AAAAAAAAAA", "CCCC") - ); + List records1 = + Lists.newArrayList( + new ThreeColumnRecord(1, "AAAAAAAAAA", "AAAA"), + new ThreeColumnRecord(1, "AAAAAAAAAA", "CCCC")); writeRecords(records1); - List records2 = Lists.newArrayList( - new ThreeColumnRecord(1, "BBBBBBBBBB", "BBBB"), - new ThreeColumnRecord(1, "BBBBBBBBBB", "DDDD") - ); + List records2 = + Lists.newArrayList( + new ThreeColumnRecord(1, "BBBBBBBBBB", "BBBB"), + new ThreeColumnRecord(1, "BBBBBBBBBB", "DDDD")); writeRecords(records2); - List records3 = 
Lists.newArrayList( - new ThreeColumnRecord(2, "AAAAAAAAAA", "EEEE"), - new ThreeColumnRecord(2, "AAAAAAAAAA", "GGGG") - ); + List records3 = + Lists.newArrayList( + new ThreeColumnRecord(2, "AAAAAAAAAA", "EEEE"), + new ThreeColumnRecord(2, "AAAAAAAAAA", "GGGG")); writeRecords(records3); - List records4 = Lists.newArrayList( - new ThreeColumnRecord(2, "BBBBBBBBBB", "FFFF"), - new ThreeColumnRecord(2, "BBBBBBBBBB", "HHHH") - ); + List records4 = + Lists.newArrayList( + new ThreeColumnRecord(2, "BBBBBBBBBB", "FFFF"), + new ThreeColumnRecord(2, "BBBBBBBBBB", "HHHH")); writeRecords(records4); table.refresh(); CloseableIterable tasks = table.newScan().planFiles(); - List dataFiles = Lists.newArrayList(CloseableIterable.transform(tasks, FileScanTask::file)); + List dataFiles = + Lists.newArrayList(CloseableIterable.transform(tasks, FileScanTask::file)); Assert.assertEquals("Should have 8 data files before rewrite", 8, dataFiles.size()); Actions actions = Actions.forTable(table); - RewriteDataFilesActionResult result = actions - .rewriteDataFiles() - .filter(Expressions.equal("c1", 1)) - .filter(Expressions.startsWith("c2", "AA")) - .execute(); + RewriteDataFilesActionResult result = + actions + .rewriteDataFiles() + .filter(Expressions.equal("c1", 1)) + .filter(Expressions.startsWith("c2", "AA")) + .execute(); Assert.assertEquals("Action should rewrite 2 data files", 2, result.deletedDataFiles().size()); Assert.assertEquals("Action should add 1 data file", 1, result.addedDataFiles().size()); table.refresh(); CloseableIterable tasks1 = table.newScan().planFiles(); - List dataFiles1 = Lists.newArrayList(CloseableIterable.transform(tasks1, FileScanTask::file)); + List dataFiles1 = + Lists.newArrayList(CloseableIterable.transform(tasks1, FileScanTask::file)); Assert.assertEquals("Should have 7 data files before rewrite", 7, dataFiles1.size()); List expectedRecords = Lists.newArrayList(); @@ -262,9 +258,8 @@ public void testRewriteDataFilesWithFilter() { expectedRecords.addAll(records4); Dataset resultDF = spark.read().format("iceberg").load(tableLocation); - List actualRecords = resultDF.sort("c1", "c2", "c3") - .as(Encoders.bean(ThreeColumnRecord.class)) - .collectAsList(); + List actualRecords = + resultDF.sort("c1", "c2", "c3").as(Encoders.bean(ThreeColumnRecord.class)).collectAsList(); Assert.assertEquals("Rows must match", expectedRecords, actualRecords); } @@ -286,31 +281,27 @@ public void testRewriteLargeTableHasResiduals() { table.refresh(); - CloseableIterable tasks = table.newScan() - .ignoreResiduals() - .filter(Expressions.equal("c3", "0")) - .planFiles(); + CloseableIterable tasks = + table.newScan().ignoreResiduals().filter(Expressions.equal("c3", "0")).planFiles(); for (FileScanTask task : tasks) { Assert.assertEquals("Residuals must be ignored", Expressions.alwaysTrue(), task.residual()); } - List dataFiles = Lists.newArrayList(CloseableIterable.transform(tasks, FileScanTask::file)); + List dataFiles = + Lists.newArrayList(CloseableIterable.transform(tasks, FileScanTask::file)); Assert.assertEquals("Should have 2 data files before rewrite", 2, dataFiles.size()); Actions actions = Actions.forTable(table); - RewriteDataFilesActionResult result = actions - .rewriteDataFiles() - .filter(Expressions.equal("c3", "0")) - .execute(); + RewriteDataFilesActionResult result = + actions.rewriteDataFiles().filter(Expressions.equal("c3", "0")).execute(); Assert.assertEquals("Action should rewrite 2 data files", 2, result.deletedDataFiles().size()); Assert.assertEquals("Action should add 1 data 
file", 1, result.addedDataFiles().size()); table.refresh(); Dataset resultDF = spark.read().format("iceberg").load(tableLocation); - List actualRecords = resultDF.sort("c1") - .as(Encoders.bean(ThreeColumnRecord.class)) - .collectAsList(); + List actualRecords = + resultDF.sort("c1").as(Encoders.bean(ThreeColumnRecord.class)).collectAsList(); Assert.assertEquals("Rows must match", records, actualRecords); } @@ -324,21 +315,24 @@ public void testRewriteDataFilesForLargeFile() throws AnalysisException { List records1 = Lists.newArrayList(); - IntStream.range(0, 2000).forEach(i -> records1.add(new ThreeColumnRecord(i, "foo" + i, "bar" + i))); + IntStream.range(0, 2000) + .forEach(i -> records1.add(new ThreeColumnRecord(i, "foo" + i, "bar" + i))); Dataset df = spark.createDataFrame(records1, ThreeColumnRecord.class).repartition(1); writeDF(df); - List records2 = Lists.newArrayList( - new ThreeColumnRecord(1, "BBBBBBBBBB", "BBBB"), - new ThreeColumnRecord(1, "DDDDDDDDDD", "DDDD") - ); + List records2 = + Lists.newArrayList( + new ThreeColumnRecord(1, "BBBBBBBBBB", "BBBB"), + new ThreeColumnRecord(1, "DDDDDDDDDD", "DDDD")); writeRecords(records2); table.refresh(); CloseableIterable tasks = table.newScan().planFiles(); - List dataFiles = Lists.newArrayList(CloseableIterable.transform(tasks, FileScanTask::file)); - DataFile maxSizeFile = Collections.max(dataFiles, Comparator.comparingLong(DataFile::fileSizeInBytes)); + List dataFiles = + Lists.newArrayList(CloseableIterable.transform(tasks, FileScanTask::file)); + DataFile maxSizeFile = + Collections.max(dataFiles, Comparator.comparingLong(DataFile::fileSizeInBytes)); Assert.assertEquals("Should have 3 files before rewrite", 3, dataFiles.size()); spark.read().format("iceberg").load(tableLocation).createTempView("origin"); @@ -348,11 +342,12 @@ public void testRewriteDataFilesForLargeFile() throws AnalysisException { Actions actions = Actions.forTable(table); long targetSizeInBytes = maxSizeFile.fileSizeInBytes() - 10; - RewriteDataFilesActionResult result = actions - .rewriteDataFiles() - .targetSizeInBytes(targetSizeInBytes) - .splitOpenFileCost(1) - .execute(); + RewriteDataFilesActionResult result = + actions + .rewriteDataFiles() + .targetSizeInBytes(targetSizeInBytes) + .splitOpenFileCost(1) + .execute(); Assert.assertEquals("Action should delete 3 data files", 3, result.deletedDataFiles().size()); Assert.assertEquals("Action should add 2 data files", 2, result.addedDataFiles().size()); @@ -371,18 +366,12 @@ private void writeRecords(List records) { } private void writeDF(Dataset df) { - df.select("c1", "c2", "c3") - .write() - .format("iceberg") - .mode("append") - .save(tableLocation); + df.select("c1", "c2", "c3").write().format("iceberg").mode("append").save(tableLocation); } @Test public void testRewriteToOutputPartitionSpec() { - PartitionSpec spec = PartitionSpec.builderFor(SCHEMA) - .identity("c1") - .build(); + PartitionSpec spec = PartitionSpec.builderFor(SCHEMA).identity("c1").build(); Map options = Maps.newHashMap(); Table table = TABLES.create(SCHEMA, spec, options, tableLocation); @@ -390,48 +379,48 @@ public void testRewriteToOutputPartitionSpec() { Assert.assertEquals("Should have 2 partitions specs", 2, table.specs().size()); - List records1 = Lists.newArrayList( - new ThreeColumnRecord(1, "AAAAAAAAAA", "AAAA"), - new ThreeColumnRecord(1, "AAAAAAAAAA", "CCCC") - ); + List records1 = + Lists.newArrayList( + new ThreeColumnRecord(1, "AAAAAAAAAA", "AAAA"), + new ThreeColumnRecord(1, "AAAAAAAAAA", "CCCC")); 
writeRecords(records1); - List records2 = Lists.newArrayList( - new ThreeColumnRecord(1, "BBBBBBBBBB", "BBBB"), - new ThreeColumnRecord(1, "BBBBBBBBBB", "DDDD") - ); + List records2 = + Lists.newArrayList( + new ThreeColumnRecord(1, "BBBBBBBBBB", "BBBB"), + new ThreeColumnRecord(1, "BBBBBBBBBB", "DDDD")); writeRecords(records2); - List records3 = Lists.newArrayList( - new ThreeColumnRecord(2, "AAAAAAAAAA", "EEEE"), - new ThreeColumnRecord(2, "AAAAAAAAAA", "GGGG") - ); + List records3 = + Lists.newArrayList( + new ThreeColumnRecord(2, "AAAAAAAAAA", "EEEE"), + new ThreeColumnRecord(2, "AAAAAAAAAA", "GGGG")); writeRecords(records3); - List records4 = Lists.newArrayList( - new ThreeColumnRecord(2, "BBBBBBBBBB", "FFFF"), - new ThreeColumnRecord(2, "BBBBBBBBBB", "HHHH") - ); + List records4 = + Lists.newArrayList( + new ThreeColumnRecord(2, "BBBBBBBBBB", "FFFF"), + new ThreeColumnRecord(2, "BBBBBBBBBB", "HHHH")); writeRecords(records4); table.refresh(); CloseableIterable tasks = table.newScan().planFiles(); - List dataFiles = Lists.newArrayList(CloseableIterable.transform(tasks, FileScanTask::file)); + List dataFiles = + Lists.newArrayList(CloseableIterable.transform(tasks, FileScanTask::file)); Assert.assertEquals("Should have 8 data files before rewrite", 8, dataFiles.size()); Dataset beforeResultDF = spark.read().format("iceberg").load(tableLocation); - List beforeActualFilteredRecords = beforeResultDF.sort("c1", "c2", "c3") - .filter("c1 = 1 AND c2 = 'BBBBBBBBBB'") - .as(Encoders.bean(ThreeColumnRecord.class)) - .collectAsList(); + List beforeActualFilteredRecords = + beforeResultDF + .sort("c1", "c2", "c3") + .filter("c1 = 1 AND c2 = 'BBBBBBBBBB'") + .as(Encoders.bean(ThreeColumnRecord.class)) + .collectAsList(); Assert.assertEquals("Rows must match", records2, beforeActualFilteredRecords); Actions actions = Actions.forTable(table); - RewriteDataFilesActionResult result = actions - .rewriteDataFiles() - .outputSpecId(0) - .execute(); + RewriteDataFilesActionResult result = actions.rewriteDataFiles().outputSpecId(0).execute(); Assert.assertEquals("Action should rewrite 8 data files", 8, result.deletedDataFiles().size()); Assert.assertEquals("Action should add 2 data file", 2, result.addedDataFiles().size()); @@ -441,7 +430,8 @@ public void testRewriteToOutputPartitionSpec() { table.refresh(); CloseableIterable tasks2 = table.newScan().planFiles(); - List dataFiles2 = Lists.newArrayList(CloseableIterable.transform(tasks2, FileScanTask::file)); + List dataFiles2 = + Lists.newArrayList(CloseableIterable.transform(tasks2, FileScanTask::file)); Assert.assertEquals("Should have 2 data files after rewrite", 2, dataFiles2.size()); // Should still have all the same data @@ -452,27 +442,27 @@ public void testRewriteToOutputPartitionSpec() { expectedRecords.addAll(records4); Dataset resultDF = spark.read().format("iceberg").load(tableLocation); - List actualRecords = resultDF.sort("c1", "c2", "c3") - .as(Encoders.bean(ThreeColumnRecord.class)) - .collectAsList(); + List actualRecords = + resultDF.sort("c1", "c2", "c3").as(Encoders.bean(ThreeColumnRecord.class)).collectAsList(); Assert.assertEquals("Rows must match", expectedRecords, actualRecords); - List actualFilteredRecords = resultDF.sort("c1", "c2", "c3") - .filter("c1 = 1 AND c2 = 'BBBBBBBBBB'") - .as(Encoders.bean(ThreeColumnRecord.class)) - .collectAsList(); + List actualFilteredRecords = + resultDF + .sort("c1", "c2", "c3") + .filter("c1 = 1 AND c2 = 'BBBBBBBBBB'") + .as(Encoders.bean(ThreeColumnRecord.class)) + .collectAsList(); 
Assert.assertEquals("Rows must match", records2, actualFilteredRecords); - List records5 = Lists.newArrayList( - new ThreeColumnRecord(3, "CCCCCCCCCC", "FFFF"), - new ThreeColumnRecord(3, "CCCCCCCCCC", "HHHH") - ); + List records5 = + Lists.newArrayList( + new ThreeColumnRecord(3, "CCCCCCCCCC", "FFFF"), + new ThreeColumnRecord(3, "CCCCCCCCCC", "HHHH")); writeRecords(records5); expectedRecords.addAll(records5); - actualRecords = resultDF.sort("c1", "c2", "c3") - .as(Encoders.bean(ThreeColumnRecord.class)) - .collectAsList(); + actualRecords = + resultDF.sort("c1", "c2", "c3").as(Encoders.bean(ThreeColumnRecord.class)).collectAsList(); Assert.assertEquals("Rows must match", expectedRecords, actualRecords); } diff --git a/spark/v2.4/spark/src/test/java/org/apache/iceberg/examples/ConcurrencyTest.java b/spark/v2.4/spark/src/test/java/org/apache/iceberg/examples/ConcurrencyTest.java index aea3a4ded112..715b953b443a 100644 --- a/spark/v2.4/spark/src/test/java/org/apache/iceberg/examples/ConcurrencyTest.java +++ b/spark/v2.4/spark/src/test/java/org/apache/iceberg/examples/ConcurrencyTest.java @@ -16,9 +16,10 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.examples; +import static org.apache.iceberg.types.Types.NestedField.optional; + import java.io.File; import java.io.IOException; import java.nio.file.Files; @@ -41,19 +42,14 @@ import org.slf4j.Logger; import org.slf4j.LoggerFactory; -import static org.apache.iceberg.types.Types.NestedField.optional; - -/** - * This class tests how Iceberg handles concurrency when reading and writing at the same time - */ +/** This class tests how Iceberg handles concurrency when reading and writing at the same time */ public class ConcurrencyTest { private static final Logger log = LoggerFactory.getLogger(ConcurrencyTest.class); - private Schema schema = new Schema( - optional(1, "key", Types.LongType.get()), - optional(2, "value", Types.StringType.get()) - ); + private Schema schema = + new Schema( + optional(1, "key", Types.LongType.get()), optional(2, "value", Types.StringType.get())); private SparkSession spark; private File tableLocation; private Table table; @@ -78,8 +74,8 @@ public void before() throws IOException { } /** - * The test creates 500 read tasks and one really long write (writing 1 mil rows) - * and uses threading to call the tasks concurrently. + * The test creates 500 read tasks and one really long write (writing 1 mil rows) and uses + * threading to call the tasks concurrently. 
*/ @Test public void writingAndReadingConcurrently() throws InterruptedException { @@ -102,17 +98,17 @@ public void writingAndReadingConcurrently() throws InterruptedException { } private Void readTable() { - Dataset results = spark.read() - .format("iceberg") - .load(tableLocation.toString()); + Dataset results = spark.read().format("iceberg").load(tableLocation.toString()); log.info("" + results.count()); return null; } + private Void writeToTable(List writeData) { log.info("WRITING!"); Dataset df = spark.createDataFrame(writeData, SimpleRecord.class); - df.select("key", "value").write() + df.select("key", "value") + .write() .format("iceberg") .mode("append") .save(tableLocation.toString()); diff --git a/spark/v2.4/spark/src/test/java/org/apache/iceberg/examples/ReadAndWriteTablesTest.java b/spark/v2.4/spark/src/test/java/org/apache/iceberg/examples/ReadAndWriteTablesTest.java index 3f5ff68284de..0b74d49f44bf 100644 --- a/spark/v2.4/spark/src/test/java/org/apache/iceberg/examples/ReadAndWriteTablesTest.java +++ b/spark/v2.4/spark/src/test/java/org/apache/iceberg/examples/ReadAndWriteTablesTest.java @@ -16,9 +16,10 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.examples; +import static org.apache.iceberg.types.Types.NestedField.optional; + import java.io.File; import java.io.IOException; import java.nio.file.Files; @@ -38,11 +39,7 @@ import org.junit.Before; import org.junit.Test; -import static org.apache.iceberg.types.Types.NestedField.optional; - -/** - * This test class uses Spark to create partitioned and unpartitioned tables locally. - */ +/** This test class uses Spark to create partitioned and unpartitioned tables locally. */ public class ReadAndWriteTablesTest { private SparkSession spark; @@ -58,72 +55,65 @@ public void before() throws IOException { pathToTable = Files.createTempDirectory("temp").toFile(); tables = new HadoopTables(spark.sessionState().newHadoopConf()); - schema = new Schema( - optional(1, "id", Types.IntegerType.get()), - optional(2, "data", Types.StringType.get()) - ); + schema = + new Schema( + optional(1, "id", Types.IntegerType.get()), + optional(2, "data", Types.StringType.get())); } @Test public void createUnpartitionedTable() { table = tables.create(schema, pathToTable.toString()); - List expected = Lists.newArrayList( - new SimpleRecord(1, "a"), - new SimpleRecord(2, "b"), - new SimpleRecord(3, "c") - ); + List expected = + Lists.newArrayList( + new SimpleRecord(1, "a"), new SimpleRecord(2, "b"), new SimpleRecord(3, "c")); Dataset df = spark.createDataFrame(expected, SimpleRecord.class); - df.select("id", "data").write() - .format("iceberg") - .mode("append") - .save(pathToTable.toString()); + df.select("id", "data").write().format("iceberg").mode("append").save(pathToTable.toString()); table.refresh(); } @Test public void createPartitionedTable() { - PartitionSpec spec = PartitionSpec.builderFor(schema) - .identity("id") - .build(); + PartitionSpec spec = PartitionSpec.builderFor(schema).identity("id").build(); table = tables.create(schema, spec, pathToTable.toString()); - List expected = Lists.newArrayList( - new SimpleRecord(1, "a"), - new SimpleRecord(2, "b"), - new SimpleRecord(3, "c") - ); + List expected = + Lists.newArrayList( + new SimpleRecord(1, "a"), new SimpleRecord(2, "b"), new SimpleRecord(3, "c")); Dataset df = spark.createDataFrame(expected, SimpleRecord.class); - df.select("id", "data").write() - .format("iceberg") - .mode("append") - .save(pathToTable.toString()); + 
df.select("id", "data").write().format("iceberg").mode("append").save(pathToTable.toString()); table.refresh(); } @Test public void writeDataFromJsonFile() { - Schema bookSchema = new Schema( - optional(1, "title", Types.StringType.get()), - optional(2, "price", Types.LongType.get()), - optional(3, "author", Types.StringType.get()), - optional(4, "published", Types.TimestampType.withZone()), - optional(5, "genre", Types.StringType.get()) - ); + Schema bookSchema = + new Schema( + optional(1, "title", Types.StringType.get()), + optional(2, "price", Types.LongType.get()), + optional(3, "author", Types.StringType.get()), + optional(4, "published", Types.TimestampType.withZone()), + optional(5, "genre", Types.StringType.get())); table = tables.create(bookSchema, pathToTable.toString()); Dataset df = spark.read().json("src/test/resources/data/books.json"); - df.select(df.col("title"), df.col("price"), df.col("author"), - df.col("published").cast(DataTypes.TimestampType), df.col("genre")).write() + df.select( + df.col("title"), + df.col("price"), + df.col("author"), + df.col("published").cast(DataTypes.TimestampType), + df.col("genre")) + .write() .format("iceberg") .mode("append") .save(pathToTable.toString()); @@ -135,9 +125,7 @@ public void writeDataFromJsonFile() { public void readFromIcebergTableWithSpark() { table = tables.create(schema, pathToTable.toString()); - Dataset results = spark.read() - .format("iceberg") - .load(pathToTable.toString()); + Dataset results = spark.read().format("iceberg").load(pathToTable.toString()); results.createOrReplaceTempView("table"); spark.sql("select * from table").show(); @@ -147,10 +135,8 @@ public void readFromIcebergTableWithSpark() { public void readFromPartitionedTableWithFilter() { table = tables.create(schema, pathToTable.toString()); - Dataset results = spark.read() - .format("iceberg") - .load(pathToTable.toString()) - .filter("data != \"b\""); + Dataset results = + spark.read().format("iceberg").load(pathToTable.toString()).filter("data != \"b\""); results.createOrReplaceTempView("table"); spark.sql("SELECT * FROM table").show(); diff --git a/spark/v2.4/spark/src/test/java/org/apache/iceberg/examples/SchemaEvolutionTest.java b/spark/v2.4/spark/src/test/java/org/apache/iceberg/examples/SchemaEvolutionTest.java index a23954ba3f66..da0ad897a354 100644 --- a/spark/v2.4/spark/src/test/java/org/apache/iceberg/examples/SchemaEvolutionTest.java +++ b/spark/v2.4/spark/src/test/java/org/apache/iceberg/examples/SchemaEvolutionTest.java @@ -16,9 +16,10 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.examples; +import static org.apache.iceberg.types.Types.NestedField.optional; + import java.io.File; import java.io.IOException; import java.nio.file.Files; @@ -47,11 +48,9 @@ import org.slf4j.Logger; import org.slf4j.LoggerFactory; -import static org.apache.iceberg.types.Types.NestedField.optional; - /** - * This class tests how you can evolve your table schema with Iceberg. - * This includes things like adding, deleting, renaming columns and type promotions. + * This class tests how you can evolve your table schema with Iceberg. This includes things like + * adding, deleting, renaming columns and type promotions. 
*/ public class SchemaEvolutionTest { @@ -62,7 +61,6 @@ public class SchemaEvolutionTest { private File tableLocation; private final String dataLocation = "src/test/resources/data/"; - @BeforeClass public static void beforeAll() { spark = SparkSession.builder().master("local[2]").getOrCreate(); @@ -71,25 +69,27 @@ public static void beforeAll() { @Before public void before() throws IOException { tableLocation = Files.createTempDirectory("temp").toFile(); - Schema schema = new Schema( - optional(1, "title", Types.StringType.get()), - optional(2, "price", Types.IntegerType.get()), - optional(3, "author", Types.StringType.get()), - optional(4, "published", Types.TimestampType.withZone()), - optional(5, "genre", Types.StringType.get()) - ); - PartitionSpec spec = PartitionSpec.builderFor(schema) - .year("published") - .build(); + Schema schema = + new Schema( + optional(1, "title", Types.StringType.get()), + optional(2, "price", Types.IntegerType.get()), + optional(3, "author", Types.StringType.get()), + optional(4, "published", Types.TimestampType.withZone()), + optional(5, "genre", Types.StringType.get())); + PartitionSpec spec = PartitionSpec.builderFor(schema).year("published").build(); HadoopTables tables = new HadoopTables(spark.sessionState().newHadoopConf()); table = tables.create(schema, spec, tableLocation.toString()); Dataset df = spark.read().json(dataLocation + "/books.json"); - df.select(df.col("title"), df.col("price").cast(DataTypes.IntegerType), - df.col("author"), df.col("published").cast(DataTypes.TimestampType), - df.col("genre")).write() + df.select( + df.col("title"), + df.col("price").cast(DataTypes.IntegerType), + df.col("author"), + df.col("published").cast(DataTypes.TimestampType), + df.col("genre")) + .write() .format("iceberg") .mode("append") .save(tableLocation.toString()); @@ -106,9 +106,14 @@ public void addColumnToSchema() { table.updateSchema().addColumn(fieldName, Types.StringType.get()).commit(); Dataset df2 = spark.read().json(dataLocation + "new-books.json"); - df2.select(df2.col("title"), df2.col("price").cast(DataTypes.IntegerType), - df2.col("author"), df2.col("published").cast(DataTypes.TimestampType), - df2.col("genre"), df2.col("publisher")).write() + df2.select( + df2.col("title"), + df2.col("price").cast(DataTypes.IntegerType), + df2.col("author"), + df2.col("published").cast(DataTypes.TimestampType), + df2.col("genre"), + df2.col("publisher")) + .write() .format("iceberg") .mode("append") .save(tableLocation.toString()); @@ -119,9 +124,7 @@ public void deleteColumnFromSchema() { table.updateSchema().deleteColumn("genre").commit(); table.refresh(); - Dataset results = spark.read() - .format("iceberg") - .load(tableLocation.toString()); + Dataset results = spark.read().format("iceberg").load(tableLocation.toString()); results.createOrReplaceTempView("table"); spark.sql("select * from table").show(); @@ -133,45 +136,44 @@ public void renameColumn() { table.updateSchema().renameColumn("author", "writer").commit(); table.refresh(); - Dataset results = spark.read() - .format("iceberg") - .load(tableLocation.toString()); + Dataset results = spark.read().format("iceberg").load(tableLocation.toString()); results.createOrReplaceTempView("table"); spark.sql("select * from table").show(); List fields = Arrays.asList(spark.sql("select * from table").schema().names()); Assert.assertTrue(fields.contains("writer")); Assert.assertFalse(fields.contains("author")); - } @Test public void updateColumnTypeIntToLong() { table.updateSchema().updateColumn("price", 
Types.LongType.get()).commit(); - Dataset results = spark.read() - .format("iceberg") - .load(tableLocation.toString()); + Dataset results = spark.read().format("iceberg").load(tableLocation.toString()); - Stream structFieldStream = Arrays.stream(results.schema().fields()) - .filter(field -> field.name().equalsIgnoreCase("price")); + Stream structFieldStream = + Arrays.stream(results.schema().fields()) + .filter(field -> field.name().equalsIgnoreCase("price")); Optional first = structFieldStream.findFirst(); - Assert.assertTrue("Unable to change datatype from Long to Int", first.isPresent() && - first.get().dataType() == LongType$.MODULE$); + Assert.assertTrue( + "Unable to change datatype from Long to Int", + first.isPresent() && first.get().dataType() == LongType$.MODULE$); } @Test public void updateColumnTypeIntToString() { - Assertions.assertThatThrownBy(() -> table.updateSchema().updateColumn("price", Types.StringType.get()).commit()) - .isInstanceOf(IllegalArgumentException.class) - .hasMessage("Cannot change column type: price: int -> string"); + Assertions.assertThatThrownBy( + () -> table.updateSchema().updateColumn("price", Types.StringType.get()).commit()) + .isInstanceOf(IllegalArgumentException.class) + .hasMessage("Cannot change column type: price: int -> string"); } @Test public void updateColumnTypeStringToInt() { - Assertions.assertThatThrownBy(() -> table.updateSchema().updateColumn("author", Types.IntegerType.get()).commit()) - .isInstanceOf(IllegalArgumentException.class) - .hasMessage("Cannot change column type: author: string -> int"); + Assertions.assertThatThrownBy( + () -> table.updateSchema().updateColumn("author", Types.IntegerType.get()).commit()) + .isInstanceOf(IllegalArgumentException.class) + .hasMessage("Cannot change column type: author: string -> int"); } @Test diff --git a/spark/v2.4/spark/src/test/java/org/apache/iceberg/examples/SimpleRecord.java b/spark/v2.4/spark/src/test/java/org/apache/iceberg/examples/SimpleRecord.java index af8a5a765c9a..8d4e4c7e1668 100644 --- a/spark/v2.4/spark/src/test/java/org/apache/iceberg/examples/SimpleRecord.java +++ b/spark/v2.4/spark/src/test/java/org/apache/iceberg/examples/SimpleRecord.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.examples; import org.apache.iceberg.relocated.com.google.common.base.Objects; @@ -25,8 +24,7 @@ public class SimpleRecord { private Integer id; private String data; - public SimpleRecord() { - } + public SimpleRecord() {} SimpleRecord(Integer id, String data) { this.id = id; diff --git a/spark/v2.4/spark/src/test/java/org/apache/iceberg/examples/SnapshotFunctionalityTest.java b/spark/v2.4/spark/src/test/java/org/apache/iceberg/examples/SnapshotFunctionalityTest.java index 54ea5d725e69..3f3ca13c28ae 100644 --- a/spark/v2.4/spark/src/test/java/org/apache/iceberg/examples/SnapshotFunctionalityTest.java +++ b/spark/v2.4/spark/src/test/java/org/apache/iceberg/examples/SnapshotFunctionalityTest.java @@ -16,9 +16,10 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.examples; +import static org.apache.iceberg.types.Types.NestedField.optional; + import java.io.File; import java.io.IOException; import java.nio.file.Files; @@ -43,11 +44,9 @@ import org.slf4j.Logger; import org.slf4j.LoggerFactory; -import static org.apache.iceberg.types.Types.NestedField.optional; - /** - * This class tests the snapshot functionality available with Iceberg. 
- * This includes things like time-travel, rollback and retrieving metadata. + * This class tests the snapshot functionality available with Iceberg. This includes things like + * time-travel, rollback and retrieving metadata. */ public class SnapshotFunctionalityTest { @@ -59,10 +58,10 @@ public class SnapshotFunctionalityTest { @Before public void before() throws IOException { - Schema schema = new Schema( - optional(1, "id", Types.IntegerType.get()), - optional(2, "data", Types.StringType.get()) - ); + Schema schema = + new Schema( + optional(1, "id", Types.IntegerType.get()), + optional(2, "data", Types.StringType.get())); spark = SparkSession.builder().master("local[2]").getOrCreate(); @@ -72,16 +71,15 @@ public void before() throws IOException { PartitionSpec spec = PartitionSpec.unpartitioned(); table = tables.create(schema, spec, tableLocation.toString()); - List expected = Lists.newArrayList( - new SimpleRecord(1, "a"), - new SimpleRecord(2, "b"), - new SimpleRecord(3, "c") - ); + List expected = + Lists.newArrayList( + new SimpleRecord(1, "a"), new SimpleRecord(2, "b"), new SimpleRecord(3, "c")); Dataset df = spark.createDataFrame(expected, SimpleRecord.class); for (int i = 0; i < 5; i++) { - df.select("id", "data").write() + df.select("id", "data") + .write() .format("iceberg") .mode("append") .save(tableLocation.toString()); @@ -96,9 +94,7 @@ public void rollbackToPreviousSnapshotAndReadData() { table.rollback().toSnapshotId(oldId).commit(); table.refresh(); - Dataset results = spark.read() - .format("iceberg") - .load(tableLocation.toString()); + Dataset results = spark.read().format("iceberg").load(tableLocation.toString()); results.createOrReplaceTempView("table"); spark.sql("select * from table").show(); @@ -115,9 +111,7 @@ public void expireOldSnapshotWithSnapshotID() { List snapshots = IteratorUtils.toList(iterator); } - /** - * Expires anything older than a given timestamp, NOT including that timestamp. - */ + /** Expires anything older than a given timestamp, NOT including that timestamp. */ @Test public void retireAllSnapshotsOlderThanTimestamp() { long secondLatestTimestamp = table.history().get(2).timestampMillis(); diff --git a/spark/v2.4/spark/src/test/java/org/apache/iceberg/spark/SparkTestBase.java b/spark/v2.4/spark/src/test/java/org/apache/iceberg/spark/SparkTestBase.java index f943663a7418..95fc7e1bb707 100644 --- a/spark/v2.4/spark/src/test/java/org/apache/iceberg/spark/SparkTestBase.java +++ b/spark/v2.4/spark/src/test/java/org/apache/iceberg/spark/SparkTestBase.java @@ -16,9 +16,10 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark; +import static org.apache.hadoop.hive.conf.HiveConf.ConfVars.METASTOREURIS; + import java.util.List; import java.util.Map; import java.util.stream.Collectors; @@ -40,8 +41,6 @@ import org.junit.Assert; import org.junit.BeforeClass; -import static org.apache.hadoop.hive.conf.HiveConf.ConfVars.METASTOREURIS; - public abstract class SparkTestBase { protected static final Object ANY = new Object(); @@ -57,15 +56,18 @@ public static void startMetastoreAndSpark() { metastore.start(); SparkTestBase.hiveConf = metastore.hiveConf(); - SparkTestBase.spark = SparkSession.builder() - .master("local[2]") - .config(SQLConf.PARTITION_OVERWRITE_MODE().key(), "dynamic") - .config("spark.hadoop." 
+ METASTOREURIS.varname, hiveConf.get(METASTOREURIS.varname)) - .enableHiveSupport() - .getOrCreate(); + SparkTestBase.spark = + SparkSession.builder() + .master("local[2]") + .config(SQLConf.PARTITION_OVERWRITE_MODE().key(), "dynamic") + .config("spark.hadoop." + METASTOREURIS.varname, hiveConf.get(METASTOREURIS.varname)) + .enableHiveSupport() + .getOrCreate(); - SparkTestBase.catalog = (HiveCatalog) - CatalogUtil.loadCatalog(HiveCatalog.class.getName(), "hive", ImmutableMap.of(), hiveConf); + SparkTestBase.catalog = + (HiveCatalog) + CatalogUtil.loadCatalog( + HiveCatalog.class.getName(), "hive", ImmutableMap.of(), hiveConf); try { catalog.createNamespace(Namespace.of("default")); @@ -106,22 +108,23 @@ protected List rowsToJava(List rows) { private Object[] toJava(Row row) { return IntStream.range(0, row.size()) - .mapToObj(pos -> { - if (row.isNullAt(pos)) { - return null; - } - - Object value = row.get(pos); - if (value instanceof Row) { - return toJava((Row) value); - } else if (value instanceof scala.collection.Seq) { - return row.getList(pos); - } else if (value instanceof scala.collection.Map) { - return row.getJavaMap(pos); - } else { - return value; - } - }) + .mapToObj( + pos -> { + if (row.isNullAt(pos)) { + return null; + } + + Object value = row.get(pos); + if (value instanceof Row) { + return toJava((Row) value); + } else if (value instanceof scala.collection.Seq) { + return row.getList(pos); + } else if (value instanceof scala.collection.Map) { + return row.getJavaMap(pos); + } else { + return value; + } + }) .toArray(Object[]::new); } @@ -137,8 +140,10 @@ protected Object[] row(Object... values) { return values; } - protected void assertEquals(String context, List expectedRows, List actualRows) { - Assert.assertEquals(context + ": number of results should match", expectedRows.size(), actualRows.size()); + protected void assertEquals( + String context, List expectedRows, List actualRows) { + Assert.assertEquals( + context + ": number of results should match", expectedRows.size(), actualRows.size()); for (int row = 0; row < expectedRows.size(); row += 1) { Object[] expected = expectedRows.get(row); Object[] actual = actualRows.get(row); @@ -172,30 +177,34 @@ protected void withSQLConf(Map conf, Action action) { SQLConf sqlConf = SQLConf.get(); Map currentConfValues = Maps.newHashMap(); - conf.keySet().forEach(confKey -> { - if (sqlConf.contains(confKey)) { - String currentConfValue = sqlConf.getConfString(confKey); - currentConfValues.put(confKey, currentConfValue); - } - }); - - conf.forEach((confKey, confValue) -> { - if (SQLConf.staticConfKeys().contains(confKey)) { - throw new RuntimeException("Cannot modify the value of a static config: " + confKey); - } - sqlConf.setConfString(confKey, confValue); - }); + conf.keySet() + .forEach( + confKey -> { + if (sqlConf.contains(confKey)) { + String currentConfValue = sqlConf.getConfString(confKey); + currentConfValues.put(confKey, currentConfValue); + } + }); + + conf.forEach( + (confKey, confValue) -> { + if (SQLConf.staticConfKeys().contains(confKey)) { + throw new RuntimeException("Cannot modify the value of a static config: " + confKey); + } + sqlConf.setConfString(confKey, confValue); + }); try { action.invoke(); } finally { - conf.forEach((confKey, confValue) -> { - if (currentConfValues.containsKey(confKey)) { - sqlConf.setConfString(confKey, currentConfValues.get(confKey)); - } else { - sqlConf.unsetConf(confKey); - } - }); + conf.forEach( + (confKey, confValue) -> { + if (currentConfValues.containsKey(confKey)) { + 
sqlConf.setConfString(confKey, currentConfValues.get(confKey)); + } else { + sqlConf.unsetConf(confKey); + } + }); } } diff --git a/spark/v2.4/spark/src/test/java/org/apache/iceberg/spark/TestSparkSchemaUtil.java b/spark/v2.4/spark/src/test/java/org/apache/iceberg/spark/TestSparkSchemaUtil.java index 8bb32c969842..4e6331982d85 100644 --- a/spark/v2.4/spark/src/test/java/org/apache/iceberg/spark/TestSparkSchemaUtil.java +++ b/spark/v2.4/spark/src/test/java/org/apache/iceberg/spark/TestSparkSchemaUtil.java @@ -16,34 +16,33 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark; +import static org.apache.iceberg.types.Types.NestedField.optional; + import java.io.IOException; import org.apache.iceberg.Schema; import org.apache.iceberg.types.Types; import org.junit.Assert; import org.junit.Test; -import static org.apache.iceberg.types.Types.NestedField.optional; - public class TestSparkSchemaUtil { - private static final Schema TEST_SCHEMA = new Schema( - optional(1, "id", Types.IntegerType.get()), - optional(2, "data", Types.StringType.get()) - ); + private static final Schema TEST_SCHEMA = + new Schema( + optional(1, "id", Types.IntegerType.get()), optional(2, "data", Types.StringType.get())); @Test public void testEstiamteSizeMaxValue() throws IOException { - Assert.assertEquals("estimateSize returns Long max value", Long.MAX_VALUE, - SparkSchemaUtil.estimateSize( - null, - Long.MAX_VALUE)); + Assert.assertEquals( + "estimateSize returns Long max value", + Long.MAX_VALUE, + SparkSchemaUtil.estimateSize(null, Long.MAX_VALUE)); } @Test public void testEstiamteSizeWithOverflow() throws IOException { - long tableSize = SparkSchemaUtil.estimateSize(SparkSchemaUtil.convert(TEST_SCHEMA), Long.MAX_VALUE - 1); + long tableSize = + SparkSchemaUtil.estimateSize(SparkSchemaUtil.convert(TEST_SCHEMA), Long.MAX_VALUE - 1); Assert.assertEquals("estimateSize handles overflow", Long.MAX_VALUE, tableSize); } diff --git a/spark/v2.4/spark/src/test/java/org/apache/iceberg/spark/TestSparkValueConverter.java b/spark/v2.4/spark/src/test/java/org/apache/iceberg/spark/TestSparkValueConverter.java index 57941b8c7940..7f00c7edd8a9 100644 --- a/spark/v2.4/spark/src/test/java/org/apache/iceberg/spark/TestSparkValueConverter.java +++ b/spark/v2.4/spark/src/test/java/org/apache/iceberg/spark/TestSparkValueConverter.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.spark; import org.apache.iceberg.Schema; @@ -31,51 +30,55 @@ public class TestSparkValueConverter { @Test public void testSparkNullMapConvert() { - Schema schema = new Schema( - Types.NestedField.required(0, "id", Types.LongType.get()), - Types.NestedField.optional(5, "locations", Types.MapType.ofOptional(6, 7, - Types.StringType.get(), - Types.StructType.of( - Types.NestedField.required(1, "lat", Types.FloatType.get()), - Types.NestedField.required(2, "long", Types.FloatType.get()) - ) - )) - ); + Schema schema = + new Schema( + Types.NestedField.required(0, "id", Types.LongType.get()), + Types.NestedField.optional( + 5, + "locations", + Types.MapType.ofOptional( + 6, + 7, + Types.StringType.get(), + Types.StructType.of( + Types.NestedField.required(1, "lat", Types.FloatType.get()), + Types.NestedField.required(2, "long", Types.FloatType.get()))))); assertCorrectNullConversion(schema); } @Test public void testSparkNullListConvert() { - Schema schema = new Schema( - Types.NestedField.required(0, "id", Types.LongType.get()), - Types.NestedField.optional(5, "locations", - Types.ListType.ofOptional(6, Types.StringType.get()) - ) - ); + Schema schema = + new Schema( + Types.NestedField.required(0, "id", Types.LongType.get()), + Types.NestedField.optional( + 5, "locations", Types.ListType.ofOptional(6, Types.StringType.get()))); assertCorrectNullConversion(schema); } @Test public void testSparkNullStructConvert() { - Schema schema = new Schema( - Types.NestedField.required(0, "id", Types.LongType.get()), - Types.NestedField.optional(5, "location", Types.StructType.of( - Types.NestedField.required(1, "lat", Types.FloatType.get()), - Types.NestedField.required(2, "long", Types.FloatType.get()) - )) - ); + Schema schema = + new Schema( + Types.NestedField.required(0, "id", Types.LongType.get()), + Types.NestedField.optional( + 5, + "location", + Types.StructType.of( + Types.NestedField.required(1, "lat", Types.FloatType.get()), + Types.NestedField.required(2, "long", Types.FloatType.get())))); assertCorrectNullConversion(schema); } @Test public void testSparkNullPrimitiveConvert() { - Schema schema = new Schema( - Types.NestedField.required(0, "id", Types.LongType.get()), - Types.NestedField.optional(5, "location", Types.StringType.get()) - ); + Schema schema = + new Schema( + Types.NestedField.required(0, "id", Types.LongType.get()), + Types.NestedField.optional(5, "location", Types.StringType.get())); assertCorrectNullConversion(schema); } @@ -83,7 +86,8 @@ private void assertCorrectNullConversion(Schema schema) { Row sparkRow = RowFactory.create(1, null); Record record = GenericRecord.create(schema); record.set(0, 1); - Assert.assertEquals("Round-trip conversion should produce original value", + Assert.assertEquals( + "Round-trip conversion should produce original value", record, SparkValueConverter.convert(schema, sparkRow)); } diff --git a/spark/v2.4/spark/src/test/java/org/apache/iceberg/spark/actions/TestDeleteReachableFilesAction.java b/spark/v2.4/spark/src/test/java/org/apache/iceberg/spark/actions/TestDeleteReachableFilesAction.java index f4b296562c94..7124c51ddd3d 100644 --- a/spark/v2.4/spark/src/test/java/org/apache/iceberg/spark/actions/TestDeleteReachableFilesAction.java +++ b/spark/v2.4/spark/src/test/java/org/apache/iceberg/spark/actions/TestDeleteReachableFilesAction.java @@ -16,9 +16,10 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.spark.actions; +import static org.apache.iceberg.types.Types.NestedField.optional; + import java.io.File; import java.util.Set; import java.util.concurrent.ConcurrentHashMap; @@ -53,46 +54,47 @@ import org.junit.Test; import org.junit.rules.TemporaryFolder; -import static org.apache.iceberg.types.Types.NestedField.optional; - public class TestDeleteReachableFilesAction extends SparkTestBase { private static final HadoopTables TABLES = new HadoopTables(new Configuration()); - private static final Schema SCHEMA = new Schema( - optional(1, "c1", Types.IntegerType.get()), - optional(2, "c2", Types.StringType.get()), - optional(3, "c3", Types.StringType.get()) - ); + private static final Schema SCHEMA = + new Schema( + optional(1, "c1", Types.IntegerType.get()), + optional(2, "c2", Types.StringType.get()), + optional(3, "c3", Types.StringType.get())); private static final int SHUFFLE_PARTITIONS = 2; private static final PartitionSpec SPEC = PartitionSpec.builderFor(SCHEMA).identity("c1").build(); - static final DataFile FILE_A = DataFiles.builder(SPEC) - .withPath("/path/to/data-a.parquet") - .withFileSizeInBytes(10) - .withPartition(TestHelpers.Row.of(0)) - .withRecordCount(1) - .build(); - static final DataFile FILE_B = DataFiles.builder(SPEC) - .withPath("/path/to/data-b.parquet") - .withFileSizeInBytes(10) - .withPartition(TestHelpers.Row.of(1)) - .withRecordCount(1) - .build(); - static final DataFile FILE_C = DataFiles.builder(SPEC) - .withPath("/path/to/data-c.parquet") - .withFileSizeInBytes(10) - .withPartition(TestHelpers.Row.of(2)) - .withRecordCount(1) - .build(); - static final DataFile FILE_D = DataFiles.builder(SPEC) - .withPath("/path/to/data-d.parquet") - .withFileSizeInBytes(10) - .withPartition(TestHelpers.Row.of(3)) - .withRecordCount(1) - .build(); - - @Rule - public TemporaryFolder temp = new TemporaryFolder(); + static final DataFile FILE_A = + DataFiles.builder(SPEC) + .withPath("/path/to/data-a.parquet") + .withFileSizeInBytes(10) + .withPartition(TestHelpers.Row.of(0)) + .withRecordCount(1) + .build(); + static final DataFile FILE_B = + DataFiles.builder(SPEC) + .withPath("/path/to/data-b.parquet") + .withFileSizeInBytes(10) + .withPartition(TestHelpers.Row.of(1)) + .withRecordCount(1) + .build(); + static final DataFile FILE_C = + DataFiles.builder(SPEC) + .withPath("/path/to/data-c.parquet") + .withFileSizeInBytes(10) + .withPartition(TestHelpers.Row.of(2)) + .withRecordCount(1) + .build(); + static final DataFile FILE_D = + DataFiles.builder(SPEC) + .withPath("/path/to/data-d.parquet") + .withFileSizeInBytes(10) + .withPartition(TestHelpers.Row.of(3)) + .withRecordCount(1) + .build(); + + @Rule public TemporaryFolder temp = new TemporaryFolder(); private Table table; @@ -104,62 +106,76 @@ public void setupTableLocation() throws Exception { spark.conf().set("spark.sql.shuffle.partitions", SHUFFLE_PARTITIONS); } - private void checkRemoveFilesResults(long expectedDatafiles, long expectedManifestsDeleted, - long expectedManifestListsDeleted, long expectedOtherFilesDeleted, - DeleteReachableFiles.Result results) { - Assert.assertEquals("Incorrect number of manifest files deleted", - expectedManifestsDeleted, results.deletedManifestsCount()); - Assert.assertEquals("Incorrect number of datafiles deleted", - expectedDatafiles, results.deletedDataFilesCount()); - Assert.assertEquals("Incorrect number of manifest lists deleted", - expectedManifestListsDeleted, results.deletedManifestListsCount()); - Assert.assertEquals("Incorrect number of 
other lists deleted", - expectedOtherFilesDeleted, results.deletedOtherFilesCount()); + private void checkRemoveFilesResults( + long expectedDatafiles, + long expectedManifestsDeleted, + long expectedManifestListsDeleted, + long expectedOtherFilesDeleted, + DeleteReachableFiles.Result results) { + Assert.assertEquals( + "Incorrect number of manifest files deleted", + expectedManifestsDeleted, + results.deletedManifestsCount()); + Assert.assertEquals( + "Incorrect number of datafiles deleted", + expectedDatafiles, + results.deletedDataFilesCount()); + Assert.assertEquals( + "Incorrect number of manifest lists deleted", + expectedManifestListsDeleted, + results.deletedManifestListsCount()); + Assert.assertEquals( + "Incorrect number of other lists deleted", + expectedOtherFilesDeleted, + results.deletedOtherFilesCount()); } @Test public void dataFilesCleanupWithParallelTasks() { - table.newFastAppend() - .appendFile(FILE_A) - .commit(); + table.newFastAppend().appendFile(FILE_A).commit(); - table.newFastAppend() - .appendFile(FILE_B) - .commit(); + table.newFastAppend().appendFile(FILE_B).commit(); - table.newRewrite() - .rewriteFiles(ImmutableSet.of(FILE_B), ImmutableSet.of(FILE_D)) - .commit(); + table.newRewrite().rewriteFiles(ImmutableSet.of(FILE_B), ImmutableSet.of(FILE_D)).commit(); - table.newRewrite() - .rewriteFiles(ImmutableSet.of(FILE_A), ImmutableSet.of(FILE_C)) - .commit(); + table.newRewrite().rewriteFiles(ImmutableSet.of(FILE_A), ImmutableSet.of(FILE_C)).commit(); Set deletedFiles = ConcurrentHashMap.newKeySet(); Set deleteThreads = ConcurrentHashMap.newKeySet(); AtomicInteger deleteThreadsIndex = new AtomicInteger(0); - DeleteReachableFiles.Result result = sparkActions().deleteReachableFiles(metadataLocation(table)) - .io(table.io()) - .executeDeleteWith(Executors.newFixedThreadPool(4, runnable -> { - Thread thread = new Thread(runnable); - thread.setName("remove-files-" + deleteThreadsIndex.getAndIncrement()); - thread.setDaemon(true); // daemon threads will be terminated abruptly when the JVM exits - return thread; - })) - .deleteWith(s -> { - deleteThreads.add(Thread.currentThread().getName()); - deletedFiles.add(s); - }) - .execute(); - - // Verifies that the delete methods ran in the threads created by the provided ExecutorService ThreadFactory - Assert.assertEquals(deleteThreads, + DeleteReachableFiles.Result result = + sparkActions() + .deleteReachableFiles(metadataLocation(table)) + .io(table.io()) + .executeDeleteWith( + Executors.newFixedThreadPool( + 4, + runnable -> { + Thread thread = new Thread(runnable); + thread.setName("remove-files-" + deleteThreadsIndex.getAndIncrement()); + thread.setDaemon( + true); // daemon threads will be terminated abruptly when the JVM exits + return thread; + })) + .deleteWith( + s -> { + deleteThreads.add(Thread.currentThread().getName()); + deletedFiles.add(s); + }) + .execute(); + + // Verifies that the delete methods ran in the threads created by the provided ExecutorService + // ThreadFactory + Assert.assertEquals( + deleteThreads, Sets.newHashSet("remove-files-0", "remove-files-1", "remove-files-2", "remove-files-3")); - Lists.newArrayList(FILE_A, FILE_B, FILE_C, FILE_D).forEach(file -> - Assert.assertTrue("FILE_A should be deleted", deletedFiles.contains(FILE_A.path().toString())) - ); + Lists.newArrayList(FILE_A, FILE_B, FILE_C, FILE_D) + .forEach( + file -> + Assert.assertTrue( + "FILE_A should be deleted", deletedFiles.contains(FILE_A.path().toString()))); checkRemoveFilesResults(4L, 6L, 4L, 6, result); } @@ -167,64 
+183,43 @@ public void dataFilesCleanupWithParallelTasks() { public void testWithExpiringDanglingStageCommit() { table.location(); // `A` commit - table.newAppend() - .appendFile(FILE_A) - .commit(); + table.newAppend().appendFile(FILE_A).commit(); // `B` staged commit - table.newAppend() - .appendFile(FILE_B) - .stageOnly() - .commit(); + table.newAppend().appendFile(FILE_B).stageOnly().commit(); // `C` commit - table.newAppend() - .appendFile(FILE_C) - .commit(); + table.newAppend().appendFile(FILE_C).commit(); - DeleteReachableFiles.Result result = sparkActions().deleteReachableFiles(metadataLocation(table)) - .io(table.io()) - .execute(); + DeleteReachableFiles.Result result = + sparkActions().deleteReachableFiles(metadataLocation(table)).io(table.io()).execute(); checkRemoveFilesResults(3L, 3L, 3L, 5, result); } @Test public void testRemoveFileActionOnEmptyTable() { - DeleteReachableFiles.Result result = sparkActions().deleteReachableFiles(metadataLocation(table)) - .io(table.io()) - .execute(); + DeleteReachableFiles.Result result = + sparkActions().deleteReachableFiles(metadataLocation(table)).io(table.io()).execute(); checkRemoveFilesResults(0, 0, 0, 2, result); } @Test public void testRemoveFilesActionWithReducedVersionsTable() { - table.updateProperties() - .set(TableProperties.METADATA_PREVIOUS_VERSIONS_MAX, "2").commit(); - table.newAppend() - .appendFile(FILE_A) - .commit(); - - table.newAppend() - .appendFile(FILE_B) - .commit(); - - table.newAppend() - .appendFile(FILE_B) - .commit(); - - table.newAppend() - .appendFile(FILE_C) - .commit(); - - table.newAppend() - .appendFile(FILE_D) - .commit(); - - DeleteReachableFiles baseRemoveFilesSparkAction = sparkActions() - .deleteReachableFiles(metadataLocation(table)) - .io(table.io()); + table.updateProperties().set(TableProperties.METADATA_PREVIOUS_VERSIONS_MAX, "2").commit(); + table.newAppend().appendFile(FILE_A).commit(); + + table.newAppend().appendFile(FILE_B).commit(); + + table.newAppend().appendFile(FILE_B).commit(); + + table.newAppend().appendFile(FILE_C).commit(); + + table.newAppend().appendFile(FILE_D).commit(); + + DeleteReachableFiles baseRemoveFilesSparkAction = + sparkActions().deleteReachableFiles(metadataLocation(table)).io(table.io()); DeleteReachableFiles.Result result = baseRemoveFilesSparkAction.execute(); checkRemoveFilesResults(4, 5, 5, 8, result); @@ -232,57 +227,44 @@ public void testRemoveFilesActionWithReducedVersionsTable() { @Test public void testRemoveFilesAction() { - table.newAppend() - .appendFile(FILE_A) - .commit(); - - table.newAppend() - .appendFile(FILE_B) - .commit(); - - DeleteReachableFiles baseRemoveFilesSparkAction = sparkActions() - .deleteReachableFiles(metadataLocation(table)) - .io(table.io()); - checkRemoveFilesResults(2, 2, 2, 4, baseRemoveFilesSparkAction.execute()); + table.newAppend().appendFile(FILE_A).commit(); + + table.newAppend().appendFile(FILE_B).commit(); + + DeleteReachableFiles baseRemoveFilesSparkAction = + sparkActions().deleteReachableFiles(metadataLocation(table)).io(table.io()); + checkRemoveFilesResults(2, 2, 2, 4, baseRemoveFilesSparkAction.execute()); } @Test public void testRemoveFilesActionWithDefaultIO() { - table.newAppend() - .appendFile(FILE_A) - .commit(); + table.newAppend().appendFile(FILE_A).commit(); - table.newAppend() - .appendFile(FILE_B) - .commit(); + table.newAppend().appendFile(FILE_B).commit(); // IO not set explicitly on removeReachableFiles action // IO defaults to HadoopFileIO - DeleteReachableFiles baseRemoveFilesSparkAction = 
sparkActions() - .deleteReachableFiles(metadataLocation(table)); - checkRemoveFilesResults(2, 2, 2, 4, baseRemoveFilesSparkAction.execute()); + DeleteReachableFiles baseRemoveFilesSparkAction = + sparkActions().deleteReachableFiles(metadataLocation(table)); + checkRemoveFilesResults(2, 2, 2, 4, baseRemoveFilesSparkAction.execute()); } @Test public void testUseLocalIterator() { - table.newFastAppend() - .appendFile(FILE_A) - .commit(); + table.newFastAppend().appendFile(FILE_A).commit(); - table.newOverwrite() - .deleteFile(FILE_A) - .addFile(FILE_B) - .commit(); + table.newOverwrite().deleteFile(FILE_A).addFile(FILE_B).commit(); - table.newFastAppend() - .appendFile(FILE_C) - .commit(); + table.newFastAppend().appendFile(FILE_C).commit(); int jobsBefore = spark.sparkContext().dagScheduler().nextJobId().get(); - DeleteReachableFiles.Result results = sparkActions().deleteReachableFiles(metadataLocation(table)) - .io(table.io()) - .option("stream-results", "true").execute(); + DeleteReachableFiles.Result results = + sparkActions() + .deleteReachableFiles(metadataLocation(table)) + .io(table.io()) + .option("stream-results", "true") + .execute(); int jobsAfter = spark.sparkContext().dagScheduler().nextJobId().get(); int totalJobsRun = jobsAfter - jobsBefore; @@ -290,52 +272,52 @@ public void testUseLocalIterator() { checkRemoveFilesResults(3L, 4L, 3L, 5, results); Assert.assertEquals( - "Expected total jobs to be equal to total number of shuffle partitions", totalJobsRun, SHUFFLE_PARTITIONS); + "Expected total jobs to be equal to total number of shuffle partitions", + totalJobsRun, + SHUFFLE_PARTITIONS); } @Test public void testIgnoreMetadataFilesNotFound() { - table.updateProperties() - .set(TableProperties.METADATA_PREVIOUS_VERSIONS_MAX, "1").commit(); + table.updateProperties().set(TableProperties.METADATA_PREVIOUS_VERSIONS_MAX, "1").commit(); - table.newAppend() - .appendFile(FILE_A) - .commit(); + table.newAppend().appendFile(FILE_A).commit(); // There are three metadata json files at this point DeleteOrphanFiles.Result result = sparkActions().deleteOrphanFiles(table).olderThan(System.currentTimeMillis()).execute(); Assert.assertEquals("Should delete 1 file", 1, Iterables.size(result.orphanFileLocations())); - Assert.assertTrue("Should remove v1 file", + Assert.assertTrue( + "Should remove v1 file", StreamSupport.stream(result.orphanFileLocations().spliterator(), false) .anyMatch(file -> file.contains("v1.metadata.json"))); - DeleteReachableFiles baseRemoveFilesSparkAction = sparkActions() - .deleteReachableFiles(metadataLocation(table)) - .io(table.io()); + DeleteReachableFiles baseRemoveFilesSparkAction = + sparkActions().deleteReachableFiles(metadataLocation(table)).io(table.io()); DeleteReachableFiles.Result res = baseRemoveFilesSparkAction.execute(); - checkRemoveFilesResults(1, 1, 1, 4, res); + checkRemoveFilesResults(1, 1, 1, 4, res); } @Test public void testEmptyIOThrowsException() { - DeleteReachableFiles baseRemoveFilesSparkAction = sparkActions() - .deleteReachableFiles(metadataLocation(table)) - .io(null); - AssertHelpers.assertThrows("FileIO needs to be set to use RemoveFiles action", - IllegalArgumentException.class, "File IO cannot be null", + DeleteReachableFiles baseRemoveFilesSparkAction = + sparkActions().deleteReachableFiles(metadataLocation(table)).io(null); + AssertHelpers.assertThrows( + "FileIO needs to be set to use RemoveFiles action", + IllegalArgumentException.class, + "File IO cannot be null", baseRemoveFilesSparkAction::execute); } @Test public void 
testRemoveFilesActionWhenGarbageCollectionDisabled() { - table.updateProperties() - .set(TableProperties.GC_ENABLED, "false") - .commit(); + table.updateProperties().set(TableProperties.GC_ENABLED, "false").commit(); - AssertHelpers.assertThrows("Should complain about removing files when GC is disabled", - ValidationException.class, "Cannot remove files: GC is disabled (deleting files may corrupt other tables)", + AssertHelpers.assertThrows( + "Should complain about removing files when GC is disabled", + ValidationException.class, + "Cannot remove files: GC is disabled (deleting files may corrupt other tables)", () -> sparkActions().deleteReachableFiles(metadataLocation(table))); } diff --git a/spark/v2.4/spark/src/test/java/org/apache/iceberg/spark/actions/TestExpireSnapshotsAction.java b/spark/v2.4/spark/src/test/java/org/apache/iceberg/spark/actions/TestExpireSnapshotsAction.java index 87b89185c4a9..84ac1e2bb8ce 100644 --- a/spark/v2.4/spark/src/test/java/org/apache/iceberg/spark/actions/TestExpireSnapshotsAction.java +++ b/spark/v2.4/spark/src/test/java/org/apache/iceberg/spark/actions/TestExpireSnapshotsAction.java @@ -16,9 +16,10 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.actions; +import static org.apache.iceberg.types.Types.NestedField.optional; + import java.io.File; import java.io.IOException; import java.util.List; @@ -56,46 +57,47 @@ import org.junit.Test; import org.junit.rules.TemporaryFolder; -import static org.apache.iceberg.types.Types.NestedField.optional; - public class TestExpireSnapshotsAction extends SparkTestBase { private static final HadoopTables TABLES = new HadoopTables(new Configuration()); - private static final Schema SCHEMA = new Schema( - optional(1, "c1", Types.IntegerType.get()), - optional(2, "c2", Types.StringType.get()), - optional(3, "c3", Types.StringType.get()) - ); + private static final Schema SCHEMA = + new Schema( + optional(1, "c1", Types.IntegerType.get()), + optional(2, "c2", Types.StringType.get()), + optional(3, "c3", Types.StringType.get())); private static final int SHUFFLE_PARTITIONS = 2; private static final PartitionSpec SPEC = PartitionSpec.builderFor(SCHEMA).identity("c1").build(); - static final DataFile FILE_A = DataFiles.builder(SPEC) - .withPath("/path/to/data-a.parquet") - .withFileSizeInBytes(10) - .withPartitionPath("c1=0") // easy way to set partition data for now - .withRecordCount(1) - .build(); - static final DataFile FILE_B = DataFiles.builder(SPEC) - .withPath("/path/to/data-b.parquet") - .withFileSizeInBytes(10) - .withPartitionPath("c1=1") // easy way to set partition data for now - .withRecordCount(1) - .build(); - static final DataFile FILE_C = DataFiles.builder(SPEC) - .withPath("/path/to/data-c.parquet") - .withFileSizeInBytes(10) - .withPartitionPath("c1=2") // easy way to set partition data for now - .withRecordCount(1) - .build(); - static final DataFile FILE_D = DataFiles.builder(SPEC) - .withPath("/path/to/data-d.parquet") - .withFileSizeInBytes(10) - .withPartitionPath("c1=3") // easy way to set partition data for now - .withRecordCount(1) - .build(); - - @Rule - public TemporaryFolder temp = new TemporaryFolder(); + static final DataFile FILE_A = + DataFiles.builder(SPEC) + .withPath("/path/to/data-a.parquet") + .withFileSizeInBytes(10) + .withPartitionPath("c1=0") // easy way to set partition data for now + .withRecordCount(1) + .build(); + static final DataFile FILE_B = + DataFiles.builder(SPEC) + 
.withPath("/path/to/data-b.parquet") + .withFileSizeInBytes(10) + .withPartitionPath("c1=1") // easy way to set partition data for now + .withRecordCount(1) + .build(); + static final DataFile FILE_C = + DataFiles.builder(SPEC) + .withPath("/path/to/data-c.parquet") + .withFileSizeInBytes(10) + .withPartitionPath("c1=2") // easy way to set partition data for now + .withRecordCount(1) + .build(); + static final DataFile FILE_D = + DataFiles.builder(SPEC) + .withPath("/path/to/data-d.parquet") + .withFileSizeInBytes(10) + .withPartitionPath("c1=3") // easy way to set partition data for now + .withRecordCount(1) + .build(); + + @Rule public TemporaryFolder temp = new TemporaryFolder(); private File tableDir; private String tableLocation; @@ -121,37 +123,41 @@ private Long rightAfterSnapshot(long snapshotId) { return end; } - private void checkExpirationResults(long expectedDatafiles, long expectedManifestsDeleted, - long expectedManifestListsDeleted, ExpireSnapshots.Result results) { - - Assert.assertEquals("Incorrect number of manifest files deleted", - expectedManifestsDeleted, results.deletedManifestsCount()); - Assert.assertEquals("Incorrect number of datafiles deleted", - expectedDatafiles, results.deletedDataFilesCount()); - Assert.assertEquals("Incorrect number of manifest lists deleted", - expectedManifestListsDeleted, results.deletedManifestListsCount()); + private void checkExpirationResults( + long expectedDatafiles, + long expectedManifestsDeleted, + long expectedManifestListsDeleted, + ExpireSnapshots.Result results) { + + Assert.assertEquals( + "Incorrect number of manifest files deleted", + expectedManifestsDeleted, + results.deletedManifestsCount()); + Assert.assertEquals( + "Incorrect number of datafiles deleted", + expectedDatafiles, + results.deletedDataFilesCount()); + Assert.assertEquals( + "Incorrect number of manifest lists deleted", + expectedManifestListsDeleted, + results.deletedManifestListsCount()); } @Test public void testFilesCleaned() throws Exception { - table.newFastAppend() - .appendFile(FILE_A) - .commit(); + table.newFastAppend().appendFile(FILE_A).commit(); - table.newOverwrite() - .deleteFile(FILE_A) - .addFile(FILE_B) - .commit(); + table.newOverwrite().deleteFile(FILE_A).addFile(FILE_B).commit(); - table.newFastAppend() - .appendFile(FILE_C) - .commit(); + table.newFastAppend().appendFile(FILE_C).commit(); long end = rightAfterSnapshot(); - ExpireSnapshots.Result results = SparkActions.get().expireSnapshots(table).expireOlderThan(end).execute(); + ExpireSnapshots.Result results = + SparkActions.get().expireSnapshots(table).expireOlderThan(end).execute(); - Assert.assertEquals("Table does not have 1 snapshot after expiration", 1, Iterables.size(table.snapshots())); + Assert.assertEquals( + "Table does not have 1 snapshot after expiration", 1, Iterables.size(table.snapshots())); checkExpirationResults(1L, 1L, 2L, results); } @@ -159,21 +165,13 @@ public void testFilesCleaned() throws Exception { @Test public void dataFilesCleanupWithParallelTasks() throws IOException { - table.newFastAppend() - .appendFile(FILE_A) - .commit(); + table.newFastAppend().appendFile(FILE_A).commit(); - table.newFastAppend() - .appendFile(FILE_B) - .commit(); + table.newFastAppend().appendFile(FILE_B).commit(); - table.newRewrite() - .rewriteFiles(ImmutableSet.of(FILE_B), ImmutableSet.of(FILE_D)) - .commit(); + table.newRewrite().rewriteFiles(ImmutableSet.of(FILE_B), ImmutableSet.of(FILE_D)).commit(); - table.newRewrite() - .rewriteFiles(ImmutableSet.of(FILE_A), 
ImmutableSet.of(FILE_C)) - .commit(); + table.newRewrite().rewriteFiles(ImmutableSet.of(FILE_A), ImmutableSet.of(FILE_C)).commit(); long t4 = rightAfterSnapshot(); @@ -181,23 +179,33 @@ public void dataFilesCleanupWithParallelTasks() throws IOException { Set deleteThreads = ConcurrentHashMap.newKeySet(); AtomicInteger deleteThreadsIndex = new AtomicInteger(0); - ExpireSnapshots.Result result = SparkActions.get().expireSnapshots(table) - .executeDeleteWith(Executors.newFixedThreadPool(4, runnable -> { - Thread thread = new Thread(runnable); - thread.setName("remove-snapshot-" + deleteThreadsIndex.getAndIncrement()); - thread.setDaemon(true); // daemon threads will be terminated abruptly when the JVM exits - return thread; - })) - .expireOlderThan(t4) - .deleteWith(s -> { - deleteThreads.add(Thread.currentThread().getName()); - deletedFiles.add(s); - }) - .execute(); - - // Verifies that the delete methods ran in the threads created by the provided ExecutorService ThreadFactory - Assert.assertEquals(deleteThreads, - Sets.newHashSet("remove-snapshot-0", "remove-snapshot-1", "remove-snapshot-2", "remove-snapshot-3")); + ExpireSnapshots.Result result = + SparkActions.get() + .expireSnapshots(table) + .executeDeleteWith( + Executors.newFixedThreadPool( + 4, + runnable -> { + Thread thread = new Thread(runnable); + thread.setName("remove-snapshot-" + deleteThreadsIndex.getAndIncrement()); + thread.setDaemon( + true); // daemon threads will be terminated abruptly when the JVM exits + return thread; + })) + .expireOlderThan(t4) + .deleteWith( + s -> { + deleteThreads.add(Thread.currentThread().getName()); + deletedFiles.add(s); + }) + .execute(); + + // Verifies that the delete methods ran in the threads created by the provided ExecutorService + // ThreadFactory + Assert.assertEquals( + deleteThreads, + Sets.newHashSet( + "remove-snapshot-0", "remove-snapshot-1", "remove-snapshot-2", "remove-snapshot-3")); Assert.assertTrue("FILE_A should be deleted", deletedFiles.contains(FILE_A.path().toString())); Assert.assertTrue("FILE_B should be deleted", deletedFiles.contains(FILE_B.path().toString())); @@ -207,9 +215,7 @@ public void dataFilesCleanupWithParallelTasks() throws IOException { @Test public void testNoFilesDeletedWhenNoSnapshotsExpired() throws Exception { - table.newFastAppend() - .appendFile(FILE_A) - .commit(); + table.newFastAppend().appendFile(FILE_A).commit(); ExpireSnapshots.Result results = SparkActions.get().expireSnapshots(table).execute(); checkExpirationResults(0L, 0L, 0L, results); @@ -217,30 +223,24 @@ public void testNoFilesDeletedWhenNoSnapshotsExpired() throws Exception { @Test public void testCleanupRepeatedOverwrites() throws Exception { - table.newFastAppend() - .appendFile(FILE_A) - .commit(); + table.newFastAppend().appendFile(FILE_A).commit(); for (int i = 0; i < 10; i++) { - table.newOverwrite() - .deleteFile(FILE_A) - .addFile(FILE_B) - .commit(); - - table.newOverwrite() - .deleteFile(FILE_B) - .addFile(FILE_A) - .commit(); + table.newOverwrite().deleteFile(FILE_A).addFile(FILE_B).commit(); + + table.newOverwrite().deleteFile(FILE_B).addFile(FILE_A).commit(); } long end = rightAfterSnapshot(); - ExpireSnapshots.Result results = SparkActions.get().expireSnapshots(table).expireOlderThan(end).execute(); + ExpireSnapshots.Result results = + SparkActions.get().expireSnapshots(table).expireOlderThan(end).execute(); checkExpirationResults(1L, 39L, 20L, results); } @Test public void testRetainLastWithExpireOlderThan() { - table.newAppend() + table + .newAppend() 
.appendFile(FILE_A) // data_bucket=0 .commit(); long firstSnapshotId = table.currentSnapshot().snapshotId(); @@ -249,217 +249,256 @@ public void testRetainLastWithExpireOlderThan() { t1 = System.currentTimeMillis(); } - table.newAppend() + table + .newAppend() .appendFile(FILE_B) // data_bucket=1 .commit(); - table.newAppend() + table + .newAppend() .appendFile(FILE_C) // data_bucket=2 .commit(); long t3 = rightAfterSnapshot(); // Retain last 2 snapshots - SparkActions.get().expireSnapshots(table) - .expireOlderThan(t3) - .retainLast(2) - .execute(); + SparkActions.get().expireSnapshots(table).expireOlderThan(t3).retainLast(2).execute(); - Assert.assertEquals("Should have two snapshots.", 2, Lists.newArrayList(table.snapshots()).size()); - Assert.assertEquals("First snapshot should not present.", null, table.snapshot(firstSnapshotId)); + Assert.assertEquals( + "Should have two snapshots.", 2, Lists.newArrayList(table.snapshots()).size()); + Assert.assertEquals( + "First snapshot should not present.", null, table.snapshot(firstSnapshotId)); } @Test public void testExpireTwoSnapshotsById() throws Exception { - table.newAppend() + table + .newAppend() .appendFile(FILE_A) // data_bucket=0 .commit(); long firstSnapshotId = table.currentSnapshot().snapshotId(); - table.newAppend() + table + .newAppend() .appendFile(FILE_B) // data_bucket=1 .commit(); long secondSnapshotID = table.currentSnapshot().snapshotId(); - table.newAppend() + table + .newAppend() .appendFile(FILE_C) // data_bucket=2 .commit(); // Retain last 2 snapshots - ExpireSnapshots.Result result = SparkActions.get().expireSnapshots(table) - .expireSnapshotId(firstSnapshotId) - .expireSnapshotId(secondSnapshotID) - .execute(); - - Assert.assertEquals("Should have one snapshots.", 1, Lists.newArrayList(table.snapshots()).size()); - Assert.assertEquals("First snapshot should not present.", null, table.snapshot(firstSnapshotId)); - Assert.assertEquals("Second snapshot should not be present.", null, table.snapshot(secondSnapshotID)); + ExpireSnapshots.Result result = + SparkActions.get() + .expireSnapshots(table) + .expireSnapshotId(firstSnapshotId) + .expireSnapshotId(secondSnapshotID) + .execute(); + + Assert.assertEquals( + "Should have one snapshots.", 1, Lists.newArrayList(table.snapshots()).size()); + Assert.assertEquals( + "First snapshot should not present.", null, table.snapshot(firstSnapshotId)); + Assert.assertEquals( + "Second snapshot should not be present.", null, table.snapshot(secondSnapshotID)); checkExpirationResults(0L, 0L, 2L, result); } @Test public void testRetainLastWithExpireById() { - table.newAppend() + table + .newAppend() .appendFile(FILE_A) // data_bucket=0 .commit(); long firstSnapshotId = table.currentSnapshot().snapshotId(); - table.newAppend() + table + .newAppend() .appendFile(FILE_B) // data_bucket=1 .commit(); - table.newAppend() + table + .newAppend() .appendFile(FILE_C) // data_bucket=2 .commit(); // Retain last 3 snapshots, but explicitly remove the first snapshot - ExpireSnapshots.Result result = SparkActions.get().expireSnapshots(table) - .expireSnapshotId(firstSnapshotId) - .retainLast(3) - .execute(); - - Assert.assertEquals("Should have two snapshots.", 2, Lists.newArrayList(table.snapshots()).size()); - Assert.assertEquals("First snapshot should not present.", null, table.snapshot(firstSnapshotId)); + ExpireSnapshots.Result result = + SparkActions.get() + .expireSnapshots(table) + .expireSnapshotId(firstSnapshotId) + .retainLast(3) + .execute(); + + Assert.assertEquals( + "Should have two 
snapshots.", 2, Lists.newArrayList(table.snapshots()).size()); + Assert.assertEquals( + "First snapshot should not present.", null, table.snapshot(firstSnapshotId)); checkExpirationResults(0L, 0L, 1L, result); } @Test public void testRetainLastWithTooFewSnapshots() { - table.newAppend() + table + .newAppend() .appendFile(FILE_A) // data_bucket=0 .appendFile(FILE_B) // data_bucket=1 .commit(); long firstSnapshotId = table.currentSnapshot().snapshotId(); - table.newAppend() + table + .newAppend() .appendFile(FILE_C) // data_bucket=2 .commit(); long t2 = rightAfterSnapshot(); // Retain last 3 snapshots - ExpireSnapshots.Result result = SparkActions.get().expireSnapshots(table) - .expireOlderThan(t2) - .retainLast(3) - .execute(); - - Assert.assertEquals("Should have two snapshots", 2, Lists.newArrayList(table.snapshots()).size()); - Assert.assertEquals("First snapshot should still present", - firstSnapshotId, table.snapshot(firstSnapshotId).snapshotId()); + ExpireSnapshots.Result result = + SparkActions.get().expireSnapshots(table).expireOlderThan(t2).retainLast(3).execute(); + + Assert.assertEquals( + "Should have two snapshots", 2, Lists.newArrayList(table.snapshots()).size()); + Assert.assertEquals( + "First snapshot should still present", + firstSnapshotId, + table.snapshot(firstSnapshotId).snapshotId()); checkExpirationResults(0L, 0L, 0L, result); } @Test public void testRetainLastKeepsExpiringSnapshot() { - table.newAppend() + table + .newAppend() .appendFile(FILE_A) // data_bucket=0 .commit(); - table.newAppend() + table + .newAppend() .appendFile(FILE_B) // data_bucket=1 .commit(); Snapshot secondSnapshot = table.currentSnapshot(); - table.newAppend() + table + .newAppend() .appendFile(FILE_C) // data_bucket=2 .commit(); - table.newAppend() + table + .newAppend() .appendFile(FILE_D) // data_bucket=3 .commit(); // Retain last 2 snapshots and expire older than t3 - ExpireSnapshots.Result result = SparkActions.get().expireSnapshots(table) - .expireOlderThan(secondSnapshot.timestampMillis()) - .retainLast(2) - .execute(); - - Assert.assertEquals("Should have three snapshots.", 3, Lists.newArrayList(table.snapshots()).size()); - Assert.assertNotNull("Second snapshot should present.", table.snapshot(secondSnapshot.snapshotId())); + ExpireSnapshots.Result result = + SparkActions.get() + .expireSnapshots(table) + .expireOlderThan(secondSnapshot.timestampMillis()) + .retainLast(2) + .execute(); + + Assert.assertEquals( + "Should have three snapshots.", 3, Lists.newArrayList(table.snapshots()).size()); + Assert.assertNotNull( + "Second snapshot should present.", table.snapshot(secondSnapshot.snapshotId())); checkExpirationResults(0L, 0L, 1L, result); } @Test public void testExpireSnapshotsWithDisabledGarbageCollection() { - table.updateProperties() - .set(TableProperties.GC_ENABLED, "false") - .commit(); + table.updateProperties().set(TableProperties.GC_ENABLED, "false").commit(); - table.newAppend() - .appendFile(FILE_A) - .commit(); + table.newAppend().appendFile(FILE_A).commit(); - AssertHelpers.assertThrows("Should complain about expiring snapshots", - ValidationException.class, "Cannot expire snapshots: GC is disabled", + AssertHelpers.assertThrows( + "Should complain about expiring snapshots", + ValidationException.class, + "Cannot expire snapshots: GC is disabled", () -> SparkActions.get().expireSnapshots(table)); } @Test public void testExpireOlderThanMultipleCalls() { - table.newAppend() + table + .newAppend() .appendFile(FILE_A) // data_bucket=0 .commit(); - table.newAppend() + table 
+ .newAppend() .appendFile(FILE_B) // data_bucket=1 .commit(); Snapshot secondSnapshot = table.currentSnapshot(); - table.newAppend() + table + .newAppend() .appendFile(FILE_C) // data_bucket=2 .commit(); Snapshot thirdSnapshot = table.currentSnapshot(); // Retain last 2 snapshots and expire older than t3 - ExpireSnapshots.Result result = SparkActions.get().expireSnapshots(table) - .expireOlderThan(secondSnapshot.timestampMillis()) - .expireOlderThan(thirdSnapshot.timestampMillis()) - .execute(); - - Assert.assertEquals("Should have one snapshots.", 1, Lists.newArrayList(table.snapshots()).size()); - Assert.assertNull("Second snapshot should not present.", table.snapshot(secondSnapshot.snapshotId())); + ExpireSnapshots.Result result = + SparkActions.get() + .expireSnapshots(table) + .expireOlderThan(secondSnapshot.timestampMillis()) + .expireOlderThan(thirdSnapshot.timestampMillis()) + .execute(); + + Assert.assertEquals( + "Should have one snapshots.", 1, Lists.newArrayList(table.snapshots()).size()); + Assert.assertNull( + "Second snapshot should not present.", table.snapshot(secondSnapshot.snapshotId())); checkExpirationResults(0L, 0L, 2L, result); } @Test public void testRetainLastMultipleCalls() { - table.newAppend() + table + .newAppend() .appendFile(FILE_A) // data_bucket=0 .commit(); - table.newAppend() + table + .newAppend() .appendFile(FILE_B) // data_bucket=1 .commit(); Snapshot secondSnapshot = table.currentSnapshot(); - table.newAppend() + table + .newAppend() .appendFile(FILE_C) // data_bucket=2 .commit(); long t3 = rightAfterSnapshot(); // Retain last 2 snapshots and expire older than t3 - ExpireSnapshots.Result result = SparkActions.get().expireSnapshots(table) - .expireOlderThan(t3) - .retainLast(2) - .retainLast(1) - .execute(); - - Assert.assertEquals("Should have one snapshots.", 1, Lists.newArrayList(table.snapshots()).size()); - Assert.assertNull("Second snapshot should not present.", table.snapshot(secondSnapshot.snapshotId())); + ExpireSnapshots.Result result = + SparkActions.get() + .expireSnapshots(table) + .expireOlderThan(t3) + .retainLast(2) + .retainLast(1) + .execute(); + + Assert.assertEquals( + "Should have one snapshots.", 1, Lists.newArrayList(table.snapshots()).size()); + Assert.assertNull( + "Second snapshot should not present.", table.snapshot(secondSnapshot.snapshotId())); checkExpirationResults(0L, 0L, 2L, result); } @Test public void testRetainZeroSnapshots() { - AssertHelpers.assertThrows("Should fail retain 0 snapshots " + - "because number of snapshots to retain cannot be zero", + AssertHelpers.assertThrows( + "Should fail retain 0 snapshots " + "because number of snapshots to retain cannot be zero", IllegalArgumentException.class, "Number of snapshots to retain must be at least 1, cannot be: 0", () -> SparkActions.get().expireSnapshots(table).retainLast(0).execute()); @@ -467,28 +506,22 @@ public void testRetainZeroSnapshots() { @Test public void testScanExpiredManifestInValidSnapshotAppend() { - table.newAppend() - .appendFile(FILE_A) - .appendFile(FILE_B) - .commit(); + table.newAppend().appendFile(FILE_A).appendFile(FILE_B).commit(); - table.newOverwrite() - .addFile(FILE_C) - .deleteFile(FILE_A) - .commit(); + table.newOverwrite().addFile(FILE_C).deleteFile(FILE_A).commit(); - table.newAppend() - .appendFile(FILE_D) - .commit(); + table.newAppend().appendFile(FILE_D).commit(); long t3 = rightAfterSnapshot(); Set deletedFiles = Sets.newHashSet(); - ExpireSnapshots.Result result = SparkActions.get().expireSnapshots(table) - 
.expireOlderThan(t3) - .deleteWith(deletedFiles::add) - .execute(); + ExpireSnapshots.Result result = + SparkActions.get() + .expireSnapshots(table) + .expireOlderThan(t3) + .deleteWith(deletedFiles::add) + .execute(); Assert.assertTrue("FILE_A should be deleted", deletedFiles.contains(FILE_A.path().toString())); checkExpirationResults(1L, 1L, 2L, result); @@ -496,72 +529,61 @@ public void testScanExpiredManifestInValidSnapshotAppend() { @Test public void testScanExpiredManifestInValidSnapshotFastAppend() { - table.updateProperties() + table + .updateProperties() .set(TableProperties.MANIFEST_MERGE_ENABLED, "true") .set(TableProperties.MANIFEST_MIN_MERGE_COUNT, "1") .commit(); - table.newAppend() - .appendFile(FILE_A) - .appendFile(FILE_B) - .commit(); + table.newAppend().appendFile(FILE_A).appendFile(FILE_B).commit(); - table.newOverwrite() - .addFile(FILE_C) - .deleteFile(FILE_A) - .commit(); + table.newOverwrite().addFile(FILE_C).deleteFile(FILE_A).commit(); - table.newFastAppend() - .appendFile(FILE_D) - .commit(); + table.newFastAppend().appendFile(FILE_D).commit(); long t3 = rightAfterSnapshot(); Set deletedFiles = Sets.newHashSet(); - ExpireSnapshots.Result result = SparkActions.get().expireSnapshots(table) - .expireOlderThan(t3) - .deleteWith(deletedFiles::add) - .execute(); + ExpireSnapshots.Result result = + SparkActions.get() + .expireSnapshots(table) + .expireOlderThan(t3) + .deleteWith(deletedFiles::add) + .execute(); Assert.assertTrue("FILE_A should be deleted", deletedFiles.contains(FILE_A.path().toString())); checkExpirationResults(1L, 1L, 2L, result); } /** - * Test on table below, and expiring the staged commit `B` using `expireOlderThan` API. - * Table: A - C - * ` B (staged) + * Test on table below, and expiring the staged commit `B` using `expireOlderThan` API. Table: A - + * C ` B (staged) */ @Test public void testWithExpiringDanglingStageCommit() { // `A` commit - table.newAppend() - .appendFile(FILE_A) - .commit(); + table.newAppend().appendFile(FILE_A).commit(); // `B` staged commit - table.newAppend() - .appendFile(FILE_B) - .stageOnly() - .commit(); + table.newAppend().appendFile(FILE_B).stageOnly().commit(); TableMetadata base = ((BaseTable) table).operations().current(); Snapshot snapshotA = base.snapshots().get(0); Snapshot snapshotB = base.snapshots().get(1); // `C` commit - table.newAppend() - .appendFile(FILE_C) - .commit(); + table.newAppend().appendFile(FILE_C).commit(); Set deletedFiles = Sets.newHashSet(); // Expire all commits including dangling staged snapshot. - ExpireSnapshots.Result result = SparkActions.get().expireSnapshots(table) - .deleteWith(deletedFiles::add) - .expireOlderThan(snapshotB.timestampMillis() + 1) - .execute(); + ExpireSnapshots.Result result = + SparkActions.get() + .expireSnapshots(table) + .deleteWith(deletedFiles::add) + .expireOlderThan(snapshotB.timestampMillis() + 1) + .execute(); checkExpirationResults(1L, 1L, 2L, result); @@ -569,122 +591,107 @@ public void testWithExpiringDanglingStageCommit() { expectedDeletes.add(snapshotA.manifestListLocation()); // Files should be deleted of dangling staged snapshot - snapshotB.addedFiles().forEach(i -> { - expectedDeletes.add(i.path().toString()); - }); + snapshotB + .addedFiles() + .forEach( + i -> { + expectedDeletes.add(i.path().toString()); + }); // ManifestList should be deleted too expectedDeletes.add(snapshotB.manifestListLocation()); - snapshotB.dataManifests().forEach(file -> { - // Only the manifest of B should be deleted. 
- if (file.snapshotId() == snapshotB.snapshotId()) { - expectedDeletes.add(file.path()); - } - }); - Assert.assertSame("Files deleted count should be expected", expectedDeletes.size(), deletedFiles.size()); + snapshotB + .dataManifests() + .forEach( + file -> { + // Only the manifest of B should be deleted. + if (file.snapshotId() == snapshotB.snapshotId()) { + expectedDeletes.add(file.path()); + } + }); + Assert.assertSame( + "Files deleted count should be expected", expectedDeletes.size(), deletedFiles.size()); // Take the diff expectedDeletes.removeAll(deletedFiles); Assert.assertTrue("Exactly same files should be deleted", expectedDeletes.isEmpty()); } /** - * Expire cherry-pick the commit as shown below, when `B` is in table's current state - * Table: - * A - B - C <--current snapshot - * `- D (source=B) + * Expire cherry-pick the commit as shown below, when `B` is in table's current state Table: A - B + * - C <--current snapshot `- D (source=B) */ @Test public void testWithCherryPickTableSnapshot() { // `A` commit - table.newAppend() - .appendFile(FILE_A) - .commit(); + table.newAppend().appendFile(FILE_A).commit(); Snapshot snapshotA = table.currentSnapshot(); // `B` commit Set deletedAFiles = Sets.newHashSet(); - table.newOverwrite() - .addFile(FILE_B) - .deleteFile(FILE_A) - .deleteWith(deletedAFiles::add) - .commit(); + table.newOverwrite().addFile(FILE_B).deleteFile(FILE_A).deleteWith(deletedAFiles::add).commit(); Assert.assertTrue("No files should be physically deleted", deletedAFiles.isEmpty()); // pick the snapshot 'B` Snapshot snapshotB = table.currentSnapshot(); // `C` commit to let cherry-pick take effect, and avoid fast-forward of `B` with cherry-pick - table.newAppend() - .appendFile(FILE_C) - .commit(); + table.newAppend().appendFile(FILE_C).commit(); Snapshot snapshotC = table.currentSnapshot(); // Move the table back to `A` - table.manageSnapshots() - .setCurrentSnapshot(snapshotA.snapshotId()) - .commit(); + table.manageSnapshots().setCurrentSnapshot(snapshotA.snapshotId()).commit(); // Generate A -> `D (B)` - table.manageSnapshots() - .cherrypick(snapshotB.snapshotId()) - .commit(); + table.manageSnapshots().cherrypick(snapshotB.snapshotId()).commit(); Snapshot snapshotD = table.currentSnapshot(); // Move the table back to `C` - table.manageSnapshots() - .setCurrentSnapshot(snapshotC.snapshotId()) - .commit(); + table.manageSnapshots().setCurrentSnapshot(snapshotC.snapshotId()).commit(); List deletedFiles = Lists.newArrayList(); // Expire `C` - ExpireSnapshots.Result result = SparkActions.get().expireSnapshots(table) - .deleteWith(deletedFiles::add) - .expireOlderThan(snapshotC.timestampMillis() + 1) - .execute(); + ExpireSnapshots.Result result = + SparkActions.get() + .expireSnapshots(table) + .deleteWith(deletedFiles::add) + .expireOlderThan(snapshotC.timestampMillis() + 1) + .execute(); // Make sure no dataFiles are deleted for the B, C, D snapshot - Lists.newArrayList(snapshotB, snapshotC, snapshotD).forEach(i -> { - i.addedFiles().forEach(item -> { - Assert.assertFalse(deletedFiles.contains(item.path().toString())); - }); - }); + Lists.newArrayList(snapshotB, snapshotC, snapshotD) + .forEach( + i -> { + i.addedFiles() + .forEach( + item -> { + Assert.assertFalse(deletedFiles.contains(item.path().toString())); + }); + }); checkExpirationResults(1L, 2L, 2L, result); } /** - * Test on table below, and expiring `B` which is not in current table state. 
- * 1) Expire `B` - * 2) All commit - * Table: A - C - D (B) - * ` B (staged) + * Test on table below, and expiring `B` which is not in current table state. 1) Expire `B` 2) All + * commit Table: A - C - D (B) ` B (staged) */ @Test public void testWithExpiringStagedThenCherrypick() { // `A` commit - table.newAppend() - .appendFile(FILE_A) - .commit(); + table.newAppend().appendFile(FILE_A).commit(); // `B` commit - table.newAppend() - .appendFile(FILE_B) - .stageOnly() - .commit(); + table.newAppend().appendFile(FILE_B).stageOnly().commit(); // pick the snapshot that's staged but not committed TableMetadata base = ((BaseTable) table).operations().current(); Snapshot snapshotB = base.snapshots().get(1); // `C` commit to let cherry-pick take effect, and avoid fast-forward of `B` with cherry-pick - table.newAppend() - .appendFile(FILE_C) - .commit(); + table.newAppend().appendFile(FILE_C).commit(); // `D (B)` cherry-pick commit - table.manageSnapshots() - .cherrypick(snapshotB.snapshotId()) - .commit(); + table.manageSnapshots().cherrypick(snapshotB.snapshotId()).commit(); base = ((BaseTable) table).operations().current(); Snapshot snapshotD = base.snapshots().get(3); @@ -692,47 +699,55 @@ public void testWithExpiringStagedThenCherrypick() { List deletedFiles = Lists.newArrayList(); // Expire `B` commit. - ExpireSnapshots.Result firstResult = SparkActions.get().expireSnapshots(table) - .deleteWith(deletedFiles::add) - .expireSnapshotId(snapshotB.snapshotId()) - .execute(); + ExpireSnapshots.Result firstResult = + SparkActions.get() + .expireSnapshots(table) + .deleteWith(deletedFiles::add) + .expireSnapshotId(snapshotB.snapshotId()) + .execute(); // Make sure no dataFiles are deleted for the staged snapshot - Lists.newArrayList(snapshotB).forEach(i -> { - i.addedFiles().forEach(item -> { - Assert.assertFalse(deletedFiles.contains(item.path().toString())); - }); - }); + Lists.newArrayList(snapshotB) + .forEach( + i -> { + i.addedFiles() + .forEach( + item -> { + Assert.assertFalse(deletedFiles.contains(item.path().toString())); + }); + }); checkExpirationResults(0L, 1L, 1L, firstResult); // Expire all snapshots including cherry-pick - ExpireSnapshots.Result secondResult = SparkActions.get().expireSnapshots(table) - .deleteWith(deletedFiles::add) - .expireOlderThan(table.currentSnapshot().timestampMillis() + 1) - .execute(); + ExpireSnapshots.Result secondResult = + SparkActions.get() + .expireSnapshots(table) + .deleteWith(deletedFiles::add) + .expireOlderThan(table.currentSnapshot().timestampMillis() + 1) + .execute(); // Make sure no dataFiles are deleted for the staged and cherry-pick - Lists.newArrayList(snapshotB, snapshotD).forEach(i -> { - i.addedFiles().forEach(item -> { - Assert.assertFalse(deletedFiles.contains(item.path().toString())); - }); - }); + Lists.newArrayList(snapshotB, snapshotD) + .forEach( + i -> { + i.addedFiles() + .forEach( + item -> { + Assert.assertFalse(deletedFiles.contains(item.path().toString())); + }); + }); checkExpirationResults(0L, 0L, 2L, secondResult); } @Test public void testExpireOlderThan() { - table.newAppend() - .appendFile(FILE_A) - .commit(); + table.newAppend().appendFile(FILE_A).commit(); Snapshot firstSnapshot = table.currentSnapshot(); rightAfterSnapshot(); - table.newAppend() - .appendFile(FILE_B) - .commit(); + table.newAppend().appendFile(FILE_B).commit(); long snapshotId = table.currentSnapshot().snapshotId(); @@ -740,42 +755,45 @@ public void testExpireOlderThan() { Set deletedFiles = Sets.newHashSet(); - ExpireSnapshots.Result result = 
SparkActions.get().expireSnapshots(table) - .expireOlderThan(tAfterCommits) - .deleteWith(deletedFiles::add) - .execute(); - - Assert.assertEquals("Expire should not change current snapshot", snapshotId, table.currentSnapshot().snapshotId()); - Assert.assertNull("Expire should remove the oldest snapshot", table.snapshot(firstSnapshot.snapshotId())); - Assert.assertEquals("Should remove only the expired manifest list location", - Sets.newHashSet(firstSnapshot.manifestListLocation()), deletedFiles); + ExpireSnapshots.Result result = + SparkActions.get() + .expireSnapshots(table) + .expireOlderThan(tAfterCommits) + .deleteWith(deletedFiles::add) + .execute(); + + Assert.assertEquals( + "Expire should not change current snapshot", + snapshotId, + table.currentSnapshot().snapshotId()); + Assert.assertNull( + "Expire should remove the oldest snapshot", table.snapshot(firstSnapshot.snapshotId())); + Assert.assertEquals( + "Should remove only the expired manifest list location", + Sets.newHashSet(firstSnapshot.manifestListLocation()), + deletedFiles); checkExpirationResults(0, 0, 1, result); } @Test public void testExpireOlderThanWithDelete() { - table.newAppend() - .appendFile(FILE_A) - .commit(); + table.newAppend().appendFile(FILE_A).commit(); Snapshot firstSnapshot = table.currentSnapshot(); - Assert.assertEquals("Should create one manifest", - 1, firstSnapshot.allManifests().size()); + Assert.assertEquals("Should create one manifest", 1, firstSnapshot.allManifests().size()); rightAfterSnapshot(); - table.newDelete() - .deleteFile(FILE_A) - .commit(); + table.newDelete().deleteFile(FILE_A).commit(); Snapshot secondSnapshot = table.currentSnapshot(); - Assert.assertEquals("Should create replace manifest with a rewritten manifest", - 1, secondSnapshot.allManifests().size()); + Assert.assertEquals( + "Should create replace manifest with a rewritten manifest", + 1, + secondSnapshot.allManifests().size()); - table.newAppend() - .appendFile(FILE_B) - .commit(); + table.newAppend().appendFile(FILE_B).commit(); rightAfterSnapshot(); @@ -785,21 +803,33 @@ public void testExpireOlderThanWithDelete() { Set deletedFiles = Sets.newHashSet(); - ExpireSnapshots.Result result = SparkActions.get().expireSnapshots(table) - .expireOlderThan(tAfterCommits) - .deleteWith(deletedFiles::add) - .execute(); - - Assert.assertEquals("Expire should not change current snapshot", snapshotId, table.currentSnapshot().snapshotId()); - Assert.assertNull("Expire should remove the oldest snapshot", table.snapshot(firstSnapshot.snapshotId())); - Assert.assertNull("Expire should remove the second oldest snapshot", table.snapshot(secondSnapshot.snapshotId())); - - Assert.assertEquals("Should remove expired manifest lists and deleted data file", + ExpireSnapshots.Result result = + SparkActions.get() + .expireSnapshots(table) + .expireOlderThan(tAfterCommits) + .deleteWith(deletedFiles::add) + .execute(); + + Assert.assertEquals( + "Expire should not change current snapshot", + snapshotId, + table.currentSnapshot().snapshotId()); + Assert.assertNull( + "Expire should remove the oldest snapshot", table.snapshot(firstSnapshot.snapshotId())); + Assert.assertNull( + "Expire should remove the second oldest snapshot", + table.snapshot(secondSnapshot.snapshotId())); + + Assert.assertEquals( + "Should remove expired manifest lists and deleted data file", Sets.newHashSet( firstSnapshot.manifestListLocation(), // snapshot expired firstSnapshot.allManifests().get(0).path(), // manifest was rewritten for delete 
secondSnapshot.manifestListLocation(), // snapshot expired - secondSnapshot.allManifests().get(0).path(), // manifest contained only deletes, was dropped + secondSnapshot + .allManifests() + .get(0) + .path(), // manifest contained only deletes, was dropped FILE_A.path()), // deleted deletedFiles); @@ -809,30 +839,28 @@ public void testExpireOlderThanWithDelete() { @Test public void testExpireOlderThanWithDeleteInMergedManifests() { // merge every commit - table.updateProperties() - .set(TableProperties.MANIFEST_MIN_MERGE_COUNT, "0") - .commit(); + table.updateProperties().set(TableProperties.MANIFEST_MIN_MERGE_COUNT, "0").commit(); - table.newAppend() - .appendFile(FILE_A) - .appendFile(FILE_B) - .commit(); + table.newAppend().appendFile(FILE_A).appendFile(FILE_B).commit(); Snapshot firstSnapshot = table.currentSnapshot(); - Assert.assertEquals("Should create one manifest", - 1, firstSnapshot.allManifests().size()); + Assert.assertEquals("Should create one manifest", 1, firstSnapshot.allManifests().size()); rightAfterSnapshot(); - table.newDelete() + table + .newDelete() .deleteFile(FILE_A) // FILE_B is still in the dataset .commit(); Snapshot secondSnapshot = table.currentSnapshot(); - Assert.assertEquals("Should replace manifest with a rewritten manifest", - 1, secondSnapshot.allManifests().size()); + Assert.assertEquals( + "Should replace manifest with a rewritten manifest", + 1, + secondSnapshot.allManifests().size()); - table.newFastAppend() // do not merge to keep the last snapshot's manifest valid + table + .newFastAppend() // do not merge to keep the last snapshot's manifest valid .appendFile(FILE_C) .commit(); @@ -844,16 +872,25 @@ public void testExpireOlderThanWithDeleteInMergedManifests() { Set deletedFiles = Sets.newHashSet(); - ExpireSnapshots.Result result = SparkActions.get().expireSnapshots(table) - .expireOlderThan(tAfterCommits) - .deleteWith(deletedFiles::add) - .execute(); - - Assert.assertEquals("Expire should not change current snapshot", snapshotId, table.currentSnapshot().snapshotId()); - Assert.assertNull("Expire should remove the oldest snapshot", table.snapshot(firstSnapshot.snapshotId())); - Assert.assertNull("Expire should remove the second oldest snapshot", table.snapshot(secondSnapshot.snapshotId())); - - Assert.assertEquals("Should remove expired manifest lists and deleted data file", + ExpireSnapshots.Result result = + SparkActions.get() + .expireSnapshots(table) + .expireOlderThan(tAfterCommits) + .deleteWith(deletedFiles::add) + .execute(); + + Assert.assertEquals( + "Expire should not change current snapshot", + snapshotId, + table.currentSnapshot().snapshotId()); + Assert.assertNull( + "Expire should remove the oldest snapshot", table.snapshot(firstSnapshot.snapshotId())); + Assert.assertNull( + "Expire should remove the second oldest snapshot", + table.snapshot(secondSnapshot.snapshotId())); + + Assert.assertEquals( + "Should remove expired manifest lists and deleted data file", Sets.newHashSet( firstSnapshot.manifestListLocation(), // snapshot expired firstSnapshot.allManifests().get(0).path(), // manifest was rewritten for delete @@ -867,33 +904,24 @@ public void testExpireOlderThanWithDeleteInMergedManifests() { @Test public void testExpireOlderThanWithRollback() { // merge every commit - table.updateProperties() - .set(TableProperties.MANIFEST_MIN_MERGE_COUNT, "0") - .commit(); + table.updateProperties().set(TableProperties.MANIFEST_MIN_MERGE_COUNT, "0").commit(); - table.newAppend() - .appendFile(FILE_A) - .appendFile(FILE_B) - .commit(); + 
table.newAppend().appendFile(FILE_A).appendFile(FILE_B).commit(); Snapshot firstSnapshot = table.currentSnapshot(); - Assert.assertEquals("Should create one manifest", - 1, firstSnapshot.allManifests().size()); + Assert.assertEquals("Should create one manifest", 1, firstSnapshot.allManifests().size()); rightAfterSnapshot(); - table.newDelete() - .deleteFile(FILE_B) - .commit(); + table.newDelete().deleteFile(FILE_B).commit(); Snapshot secondSnapshot = table.currentSnapshot(); Set secondSnapshotManifests = Sets.newHashSet(secondSnapshot.allManifests()); secondSnapshotManifests.removeAll(firstSnapshot.allManifests()); - Assert.assertEquals("Should add one new manifest for append", 1, secondSnapshotManifests.size()); + Assert.assertEquals( + "Should add one new manifest for append", 1, secondSnapshotManifests.size()); - table.manageSnapshots() - .rollbackTo(firstSnapshot.snapshotId()) - .commit(); + table.manageSnapshots().rollbackTo(firstSnapshot.snapshotId()).commit(); long tAfterCommits = rightAfterSnapshot(secondSnapshot.snapshotId()); @@ -901,19 +929,29 @@ public void testExpireOlderThanWithRollback() { Set deletedFiles = Sets.newHashSet(); - ExpireSnapshots.Result result = SparkActions.get().expireSnapshots(table) - .expireOlderThan(tAfterCommits) - .deleteWith(deletedFiles::add) - .execute(); - - Assert.assertEquals("Expire should not change current snapshot", snapshotId, table.currentSnapshot().snapshotId()); - Assert.assertNotNull("Expire should keep the oldest snapshot, current", table.snapshot(firstSnapshot.snapshotId())); - Assert.assertNull("Expire should remove the orphaned snapshot", table.snapshot(secondSnapshot.snapshotId())); - - Assert.assertEquals("Should remove expired manifest lists and reverted appended data file", + ExpireSnapshots.Result result = + SparkActions.get() + .expireSnapshots(table) + .expireOlderThan(tAfterCommits) + .deleteWith(deletedFiles::add) + .execute(); + + Assert.assertEquals( + "Expire should not change current snapshot", + snapshotId, + table.currentSnapshot().snapshotId()); + Assert.assertNotNull( + "Expire should keep the oldest snapshot, current", + table.snapshot(firstSnapshot.snapshotId())); + Assert.assertNull( + "Expire should remove the orphaned snapshot", table.snapshot(secondSnapshot.snapshotId())); + + Assert.assertEquals( + "Should remove expired manifest lists and reverted appended data file", Sets.newHashSet( secondSnapshot.manifestListLocation(), // snapshot expired - Iterables.getOnlyElement(secondSnapshotManifests).path()), // manifest is no longer referenced + Iterables.getOnlyElement(secondSnapshotManifests) + .path()), // manifest is no longer referenced deletedFiles); checkExpirationResults(0, 1, 1, result); @@ -921,28 +959,22 @@ public void testExpireOlderThanWithRollback() { @Test public void testExpireOlderThanWithRollbackAndMergedManifests() { - table.newAppend() - .appendFile(FILE_A) - .commit(); + table.newAppend().appendFile(FILE_A).commit(); Snapshot firstSnapshot = table.currentSnapshot(); - Assert.assertEquals("Should create one manifest", - 1, firstSnapshot.allManifests().size()); + Assert.assertEquals("Should create one manifest", 1, firstSnapshot.allManifests().size()); rightAfterSnapshot(); - table.newAppend() - .appendFile(FILE_B) - .commit(); + table.newAppend().appendFile(FILE_B).commit(); Snapshot secondSnapshot = table.currentSnapshot(); Set secondSnapshotManifests = Sets.newHashSet(secondSnapshot.allManifests()); secondSnapshotManifests.removeAll(firstSnapshot.allManifests()); - Assert.assertEquals("Should 
add one new manifest for append", 1, secondSnapshotManifests.size()); + Assert.assertEquals( + "Should add one new manifest for append", 1, secondSnapshotManifests.size()); - table.manageSnapshots() - .rollbackTo(firstSnapshot.snapshotId()) - .commit(); + table.manageSnapshots().rollbackTo(firstSnapshot.snapshotId()).commit(); long tAfterCommits = rightAfterSnapshot(secondSnapshot.snapshotId()); @@ -950,19 +982,29 @@ public void testExpireOlderThanWithRollbackAndMergedManifests() { Set deletedFiles = Sets.newHashSet(); - ExpireSnapshots.Result result = SparkActions.get().expireSnapshots(table) - .expireOlderThan(tAfterCommits) - .deleteWith(deletedFiles::add) - .execute(); - - Assert.assertEquals("Expire should not change current snapshot", snapshotId, table.currentSnapshot().snapshotId()); - Assert.assertNotNull("Expire should keep the oldest snapshot, current", table.snapshot(firstSnapshot.snapshotId())); - Assert.assertNull("Expire should remove the orphaned snapshot", table.snapshot(secondSnapshot.snapshotId())); - - Assert.assertEquals("Should remove expired manifest lists and reverted appended data file", + ExpireSnapshots.Result result = + SparkActions.get() + .expireSnapshots(table) + .expireOlderThan(tAfterCommits) + .deleteWith(deletedFiles::add) + .execute(); + + Assert.assertEquals( + "Expire should not change current snapshot", + snapshotId, + table.currentSnapshot().snapshotId()); + Assert.assertNotNull( + "Expire should keep the oldest snapshot, current", + table.snapshot(firstSnapshot.snapshotId())); + Assert.assertNull( + "Expire should remove the orphaned snapshot", table.snapshot(secondSnapshot.snapshotId())); + + Assert.assertEquals( + "Should remove expired manifest lists and reverted appended data file", Sets.newHashSet( secondSnapshot.manifestListLocation(), // snapshot expired - Iterables.getOnlyElement(secondSnapshotManifests).path(), // manifest is no longer referenced + Iterables.getOnlyElement(secondSnapshotManifests) + .path(), // manifest is no longer referenced FILE_B.path()), // added, but rolled back deletedFiles); @@ -974,27 +1016,25 @@ public void testExpireOnEmptyTable() { Set deletedFiles = Sets.newHashSet(); // table has no data, testing ExpireSnapshots should not fail with no snapshot - ExpireSnapshots.Result result = SparkActions.get().expireSnapshots(table) - .expireOlderThan(System.currentTimeMillis()) - .deleteWith(deletedFiles::add) - .execute(); + ExpireSnapshots.Result result = + SparkActions.get() + .expireSnapshots(table) + .expireOlderThan(System.currentTimeMillis()) + .deleteWith(deletedFiles::add) + .execute(); checkExpirationResults(0, 0, 0, result); } @Test public void testExpireAction() { - table.newAppend() - .appendFile(FILE_A) - .commit(); + table.newAppend().appendFile(FILE_A).commit(); Snapshot firstSnapshot = table.currentSnapshot(); rightAfterSnapshot(); - table.newAppend() - .appendFile(FILE_B) - .commit(); + table.newAppend().appendFile(FILE_B).commit(); long snapshotId = table.currentSnapshot().snapshotId(); @@ -1002,51 +1042,58 @@ public void testExpireAction() { Set deletedFiles = Sets.newHashSet(); - BaseExpireSnapshotsSparkAction action = (BaseExpireSnapshotsSparkAction) SparkActions.get().expireSnapshots(table) - .expireOlderThan(tAfterCommits) - .deleteWith(deletedFiles::add); + BaseExpireSnapshotsSparkAction action = + (BaseExpireSnapshotsSparkAction) + SparkActions.get() + .expireSnapshots(table) + .expireOlderThan(tAfterCommits) + .deleteWith(deletedFiles::add); Dataset pendingDeletes = action.expire(); List pending = 
pendingDeletes.collectAsList(); - Assert.assertEquals("Should not change current snapshot", snapshotId, table.currentSnapshot().snapshotId()); - Assert.assertNull("Should remove the oldest snapshot", table.snapshot(firstSnapshot.snapshotId())); + Assert.assertEquals( + "Should not change current snapshot", snapshotId, table.currentSnapshot().snapshotId()); + Assert.assertNull( + "Should remove the oldest snapshot", table.snapshot(firstSnapshot.snapshotId())); Assert.assertEquals("Pending deletes should contain one row", 1, pending.size()); - Assert.assertEquals("Pending delete should be the expired manifest list location", - firstSnapshot.manifestListLocation(), pending.get(0).getString(0)); - Assert.assertEquals("Pending delete should be a manifest list", - "Manifest List", pending.get(0).getString(1)); + Assert.assertEquals( + "Pending delete should be the expired manifest list location", + firstSnapshot.manifestListLocation(), + pending.get(0).getString(0)); + Assert.assertEquals( + "Pending delete should be a manifest list", "Manifest List", pending.get(0).getString(1)); Assert.assertEquals("Should not delete any files", 0, deletedFiles.size()); - Assert.assertSame("Multiple calls to expire should return the same deleted files", - pendingDeletes, action.expire()); + Assert.assertSame( + "Multiple calls to expire should return the same deleted files", + pendingDeletes, + action.expire()); } @Test public void testUseLocalIterator() { - table.newFastAppend() - .appendFile(FILE_A) - .commit(); + table.newFastAppend().appendFile(FILE_A).commit(); - table.newOverwrite() - .deleteFile(FILE_A) - .addFile(FILE_B) - .commit(); + table.newOverwrite().deleteFile(FILE_A).addFile(FILE_B).commit(); - table.newFastAppend() - .appendFile(FILE_C) - .commit(); + table.newFastAppend().appendFile(FILE_C).commit(); long end = rightAfterSnapshot(); int jobsBefore = spark.sparkContext().dagScheduler().nextJobId().get(); ExpireSnapshots.Result results = - SparkActions.get().expireSnapshots(table).expireOlderThan(end).option("stream-results", "true").execute(); + SparkActions.get() + .expireSnapshots(table) + .expireOlderThan(end) + .option("stream-results", "true") + .execute(); - Assert.assertEquals("Table does not have 1 snapshot after expiration", 1, Iterables.size(table.snapshots())); + Assert.assertEquals( + "Table does not have 1 snapshot after expiration", 1, Iterables.size(table.snapshots())); int jobsAfter = spark.sparkContext().dagScheduler().nextJobId().get(); int totalJobsRun = jobsAfter - jobsBefore; @@ -1054,7 +1101,9 @@ public void testUseLocalIterator() { checkExpirationResults(1L, 1L, 2L, results); Assert.assertTrue( - String.format("Expected more than %d jobs when using local iterator, ran %d", SHUFFLE_PARTITIONS, totalJobsRun), + String.format( + "Expected more than %d jobs when using local iterator, ran %d", + SHUFFLE_PARTITIONS, totalJobsRun), totalJobsRun > SHUFFLE_PARTITIONS); } } diff --git a/spark/v2.4/spark/src/test/java/org/apache/iceberg/spark/actions/TestRemoveOrphanFilesAction.java b/spark/v2.4/spark/src/test/java/org/apache/iceberg/spark/actions/TestRemoveOrphanFilesAction.java index 7cf2e1f295a5..e42923c0bf23 100644 --- a/spark/v2.4/spark/src/test/java/org/apache/iceberg/spark/actions/TestRemoveOrphanFilesAction.java +++ b/spark/v2.4/spark/src/test/java/org/apache/iceberg/spark/actions/TestRemoveOrphanFilesAction.java @@ -16,9 +16,10 @@ * specific language governing permissions and limitations * under the License. 
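// ---------------------------------------------------------------------------------------------
// Editor's note, not part of the patch: the TestExpireSnapshotsAction hunks above also show two
// less common ExpireSnapshots patterns: casting to BaseExpireSnapshotsSparkAction to inspect
// pending deletes as a Dataset, and the "stream-results" option, which (as the name and
// job-count check in testUseLocalIterator suggest) streams deletes through a local iterator
// rather than collecting them on the driver. A hedged sketch of the option only; the class name
// and import paths are assumptions.
import org.apache.iceberg.Table;
import org.apache.iceberg.actions.ExpireSnapshots;
import org.apache.iceberg.spark.actions.SparkActions;

class StreamingExpireSketch {
  static ExpireSnapshots.Result expireWithStreaming(Table table, long olderThanMillis) {
    return SparkActions.get()
        .expireSnapshots(table)
        .expireOlderThan(olderThanMillis)
        .option("stream-results", "true") // string-valued option, exactly as in the test above
        .execute();
  }
}
// ---------------------------------------------------------------------------------------------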
*/ - package org.apache.iceberg.spark.actions; +import static org.apache.iceberg.types.Types.NestedField.optional; + import java.io.File; import java.io.IOException; import java.util.Arrays; @@ -64,23 +65,18 @@ import org.junit.Test; import org.junit.rules.TemporaryFolder; -import static org.apache.iceberg.types.Types.NestedField.optional; - public class TestRemoveOrphanFilesAction extends SparkTestBase { private static final HadoopTables TABLES = new HadoopTables(new Configuration()); - protected static final Schema SCHEMA = new Schema( - optional(1, "c1", Types.IntegerType.get()), - optional(2, "c2", Types.StringType.get()), - optional(3, "c3", Types.StringType.get()) - ); - protected static final PartitionSpec SPEC = PartitionSpec.builderFor(SCHEMA) - .truncate("c2", 2) - .identity("c3") - .build(); - - @Rule - public TemporaryFolder temp = new TemporaryFolder(); + protected static final Schema SCHEMA = + new Schema( + optional(1, "c1", Types.IntegerType.get()), + optional(2, "c2", Types.StringType.get()), + optional(3, "c3", Types.StringType.get())); + protected static final PartitionSpec SPEC = + PartitionSpec.builderFor(SCHEMA).truncate("c2", 2).identity("c3").build(); + + @Rule public TemporaryFolder temp = new TemporaryFolder(); private File tableDir = null; protected String tableLocation = null; @@ -92,41 +88,37 @@ public void setupTableLocation() throws Exception { @Test public void testDryRun() throws IOException, InterruptedException { - Table table = TABLES.create(SCHEMA, PartitionSpec.unpartitioned(), Maps.newHashMap(), tableLocation); + Table table = + TABLES.create(SCHEMA, PartitionSpec.unpartitioned(), Maps.newHashMap(), tableLocation); - List records = Lists.newArrayList( - new ThreeColumnRecord(1, "AAAAAAAAAA", "AAAA") - ); + List records = + Lists.newArrayList(new ThreeColumnRecord(1, "AAAAAAAAAA", "AAAA")); Dataset df = spark.createDataFrame(records, ThreeColumnRecord.class).coalesce(1); - df.select("c1", "c2", "c3") - .write() - .format("iceberg") - .mode("append") - .save(tableLocation); + df.select("c1", "c2", "c3").write().format("iceberg").mode("append").save(tableLocation); - df.select("c1", "c2", "c3") - .write() - .format("iceberg") - .mode("append") - .save(tableLocation); + df.select("c1", "c2", "c3").write().format("iceberg").mode("append").save(tableLocation); - List validFiles = spark.read().format("iceberg") - .load(tableLocation + "#files") - .select("file_path") - .as(Encoders.STRING()) - .collectAsList(); + List validFiles = + spark + .read() + .format("iceberg") + .load(tableLocation + "#files") + .select("file_path") + .as(Encoders.STRING()) + .collectAsList(); Assert.assertEquals("Should be 2 valid files", 2, validFiles.size()); df.write().mode("append").parquet(tableLocation + "/data"); Path dataPath = new Path(tableLocation + "/data"); FileSystem fs = dataPath.getFileSystem(spark.sessionState().newHadoopConf()); - List allFiles = Arrays.stream(fs.listStatus(dataPath, HiddenPathFilter.get())) - .filter(FileStatus::isFile) - .map(file -> file.getPath().toString()) - .collect(Collectors.toList()); + List allFiles = + Arrays.stream(fs.listStatus(dataPath, HiddenPathFilter.get())) + .filter(FileStatus::isFile) + .map(file -> file.getPath().toString()) + .collect(Collectors.toList()); Assert.assertEquals("Should be 3 files", 3, allFiles.size()); List invalidFiles = Lists.newArrayList(allFiles); @@ -138,32 +130,34 @@ public void testDryRun() throws IOException, InterruptedException { SparkActions actions = SparkActions.get(); - DeleteOrphanFiles.Result 
result1 = actions.deleteOrphanFiles(table) - .deleteWith(s -> { }) - .execute(); - Assert.assertTrue("Default olderThan interval should be safe", Iterables.isEmpty(result1.orphanFileLocations())); + DeleteOrphanFiles.Result result1 = + actions.deleteOrphanFiles(table).deleteWith(s -> {}).execute(); + Assert.assertTrue( + "Default olderThan interval should be safe", + Iterables.isEmpty(result1.orphanFileLocations())); - DeleteOrphanFiles.Result result2 = actions.deleteOrphanFiles(table) - .olderThan(System.currentTimeMillis()) - .deleteWith(s -> { }) - .execute(); + DeleteOrphanFiles.Result result2 = + actions + .deleteOrphanFiles(table) + .olderThan(System.currentTimeMillis()) + .deleteWith(s -> {}) + .execute(); Assert.assertEquals("Action should find 1 file", invalidFiles, result2.orphanFileLocations()); Assert.assertTrue("Invalid file should be present", fs.exists(new Path(invalidFiles.get(0)))); - DeleteOrphanFiles.Result result3 = actions.deleteOrphanFiles(table) - .olderThan(System.currentTimeMillis()) - .execute(); + DeleteOrphanFiles.Result result3 = + actions.deleteOrphanFiles(table).olderThan(System.currentTimeMillis()).execute(); Assert.assertEquals("Action should delete 1 file", invalidFiles, result3.orphanFileLocations()); - Assert.assertFalse("Invalid file should not be present", fs.exists(new Path(invalidFiles.get(0)))); + Assert.assertFalse( + "Invalid file should not be present", fs.exists(new Path(invalidFiles.get(0)))); List expectedRecords = Lists.newArrayList(); expectedRecords.addAll(records); expectedRecords.addAll(records); Dataset resultDF = spark.read().format("iceberg").load(tableLocation); - List actualRecords = resultDF - .as(Encoders.bean(ThreeColumnRecord.class)) - .collectAsList(); + List actualRecords = + resultDF.as(Encoders.bean(ThreeColumnRecord.class)).collectAsList(); Assert.assertEquals("Rows must match", expectedRecords, actualRecords); } @@ -171,36 +165,22 @@ public void testDryRun() throws IOException, InterruptedException { public void testAllValidFilesAreKept() throws IOException, InterruptedException { Table table = TABLES.create(SCHEMA, SPEC, Maps.newHashMap(), tableLocation); - List records1 = Lists.newArrayList( - new ThreeColumnRecord(1, "AAAAAAAAAA", "AAAA") - ); + List records1 = + Lists.newArrayList(new ThreeColumnRecord(1, "AAAAAAAAAA", "AAAA")); Dataset df1 = spark.createDataFrame(records1, ThreeColumnRecord.class).coalesce(1); // original append - df1.select("c1", "c2", "c3") - .write() - .format("iceberg") - .mode("append") - .save(tableLocation); + df1.select("c1", "c2", "c3").write().format("iceberg").mode("append").save(tableLocation); - List records2 = Lists.newArrayList( - new ThreeColumnRecord(2, "AAAAAAAAAA", "AAAA") - ); + List records2 = + Lists.newArrayList(new ThreeColumnRecord(2, "AAAAAAAAAA", "AAAA")); Dataset df2 = spark.createDataFrame(records2, ThreeColumnRecord.class).coalesce(1); // dynamic partition overwrite - df2.select("c1", "c2", "c3") - .write() - .format("iceberg") - .mode("overwrite") - .save(tableLocation); + df2.select("c1", "c2", "c3").write().format("iceberg").mode("overwrite").save(tableLocation); // second append - df2.select("c1", "c2", "c3") - .write() - .format("iceberg") - .mode("append") - .save(tableLocation); + df2.select("c1", "c2", "c3").write().format("iceberg").mode("append").save(tableLocation); List snapshots = Lists.newArrayList(table.snapshots()); @@ -223,9 +203,8 @@ public void testAllValidFilesAreKept() throws IOException, InterruptedException SparkActions actions = SparkActions.get(); 
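// ---------------------------------------------------------------------------------------------
// Editor's note, not part of the patch: the TestRemoveOrphanFilesAction hunks center on the
// DeleteOrphanFiles action. A minimal sketch of the two modes these tests rely on, a dry run
// that only reports candidates and a real delete; the class name and import paths are
// assumptions for illustration.
import org.apache.iceberg.Table;
import org.apache.iceberg.actions.DeleteOrphanFiles;
import org.apache.iceberg.spark.actions.SparkActions;

class DeleteOrphanFilesSketch {
  // Dry run: a no-op deleteWith consumer means nothing is removed, but orphanFileLocations()
  // still reports what would have been deleted (the pattern used in testDryRun above).
  static DeleteOrphanFiles.Result dryRun(Table table) {
    return SparkActions.get()
        .deleteOrphanFiles(table)
        .olderThan(System.currentTimeMillis())
        .deleteWith(s -> {})
        .execute();
  }

  // Real run: without deleteWith, files under the table location that are older than the
  // timestamp and not referenced by table metadata are actually deleted.
  static DeleteOrphanFiles.Result delete(Table table, long olderThanMillis) {
    return SparkActions.get().deleteOrphanFiles(table).olderThan(olderThanMillis).execute();
  }
}
// ---------------------------------------------------------------------------------------------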
- DeleteOrphanFiles.Result result = actions.deleteOrphanFiles(table) - .olderThan(System.currentTimeMillis()) - .execute(); + DeleteOrphanFiles.Result result = + actions.deleteOrphanFiles(table).olderThan(System.currentTimeMillis()).execute(); Assert.assertEquals("Should delete 4 files", 4, Iterables.size(result.orphanFileLocations())); @@ -249,36 +228,22 @@ public void testAllValidFilesAreKept() throws IOException, InterruptedException public void orphanedFileRemovedWithParallelTasks() throws InterruptedException, IOException { Table table = TABLES.create(SCHEMA, SPEC, Maps.newHashMap(), tableLocation); - List records1 = Lists.newArrayList( - new ThreeColumnRecord(1, "AAAAAAAAAA", "AAAA") - ); + List records1 = + Lists.newArrayList(new ThreeColumnRecord(1, "AAAAAAAAAA", "AAAA")); Dataset df1 = spark.createDataFrame(records1, ThreeColumnRecord.class).coalesce(1); // original append - df1.select("c1", "c2", "c3") - .write() - .format("iceberg") - .mode("append") - .save(tableLocation); + df1.select("c1", "c2", "c3").write().format("iceberg").mode("append").save(tableLocation); - List records2 = Lists.newArrayList( - new ThreeColumnRecord(2, "AAAAAAAAAA", "AAAA") - ); + List records2 = + Lists.newArrayList(new ThreeColumnRecord(2, "AAAAAAAAAA", "AAAA")); Dataset df2 = spark.createDataFrame(records2, ThreeColumnRecord.class).coalesce(1); // dynamic partition overwrite - df2.select("c1", "c2", "c3") - .write() - .format("iceberg") - .mode("overwrite") - .save(tableLocation); + df2.select("c1", "c2", "c3").write().format("iceberg").mode("overwrite").save(tableLocation); // second append - df2.select("c1", "c2", "c3") - .write() - .format("iceberg") - .mode("append") - .save(tableLocation); + df2.select("c1", "c2", "c3").write().format("iceberg").mode("append").save(tableLocation); df2.coalesce(1).write().mode("append").parquet(tableLocation + "/data"); df2.coalesce(1).write().mode("append").parquet(tableLocation + "/data/c2_trunc=AA"); @@ -292,25 +257,34 @@ public void orphanedFileRemovedWithParallelTasks() throws InterruptedException, Set deleteThreads = ConcurrentHashMap.newKeySet(); AtomicInteger deleteThreadsIndex = new AtomicInteger(0); - ExecutorService executorService = Executors.newFixedThreadPool(4, runnable -> { - Thread thread = new Thread(runnable); - thread.setName("remove-orphan-" + deleteThreadsIndex.getAndIncrement()); - thread.setDaemon(true); - return thread; - }); - - DeleteOrphanFiles.Result result = SparkActions.get().deleteOrphanFiles(table) + ExecutorService executorService = + Executors.newFixedThreadPool( + 4, + runnable -> { + Thread thread = new Thread(runnable); + thread.setName("remove-orphan-" + deleteThreadsIndex.getAndIncrement()); + thread.setDaemon(true); + return thread; + }); + + DeleteOrphanFiles.Result result = + SparkActions.get() + .deleteOrphanFiles(table) .executeDeleteWith(executorService) .olderThan(System.currentTimeMillis()) - .deleteWith(file -> { - deleteThreads.add(Thread.currentThread().getName()); - deletedFiles.add(file); - }) + .deleteWith( + file -> { + deleteThreads.add(Thread.currentThread().getName()); + deletedFiles.add(file); + }) .execute(); - // Verifies that the delete methods ran in the threads created by the provided ExecutorService ThreadFactory - Assert.assertEquals(deleteThreads, - Sets.newHashSet("remove-orphan-0", "remove-orphan-1", "remove-orphan-2", "remove-orphan-3")); + // Verifies that the delete methods ran in the threads created by the provided ExecutorService + // ThreadFactory + Assert.assertEquals( + deleteThreads, + 
Sets.newHashSet( + "remove-orphan-0", "remove-orphan-1", "remove-orphan-2", "remove-orphan-3")); Assert.assertEquals("Should delete 4 files", 4, deletedFiles.size()); } @@ -321,31 +295,21 @@ public void testWapFilesAreKept() throws InterruptedException { props.put(TableProperties.WRITE_AUDIT_PUBLISH_ENABLED, "true"); Table table = TABLES.create(SCHEMA, SPEC, props, tableLocation); - List records = Lists.newArrayList( - new ThreeColumnRecord(1, "AAAAAAAAAA", "AAAA") - ); + List records = + Lists.newArrayList(new ThreeColumnRecord(1, "AAAAAAAAAA", "AAAA")); Dataset df = spark.createDataFrame(records, ThreeColumnRecord.class); // normal write - df.select("c1", "c2", "c3") - .write() - .format("iceberg") - .mode("append") - .save(tableLocation); + df.select("c1", "c2", "c3").write().format("iceberg").mode("append").save(tableLocation); spark.conf().set("spark.wap.id", "1"); // wap write - df.select("c1", "c2", "c3") - .write() - .format("iceberg") - .mode("append") - .save(tableLocation); + df.select("c1", "c2", "c3").write().format("iceberg").mode("append").save(tableLocation); Dataset resultDF = spark.read().format("iceberg").load(tableLocation); - List actualRecords = resultDF - .as(Encoders.bean(ThreeColumnRecord.class)) - .collectAsList(); + List actualRecords = + resultDF.as(Encoders.bean(ThreeColumnRecord.class)).collectAsList(); Assert.assertEquals("Should not return data from the staged snapshot", records, actualRecords); // sleep for 1 second to unsure files will be old enough @@ -353,11 +317,11 @@ public void testWapFilesAreKept() throws InterruptedException { SparkActions actions = SparkActions.get(); - DeleteOrphanFiles.Result result = actions.deleteOrphanFiles(table) - .olderThan(System.currentTimeMillis()) - .execute(); + DeleteOrphanFiles.Result result = + actions.deleteOrphanFiles(table).olderThan(System.currentTimeMillis()).execute(); - Assert.assertTrue("Should not delete any files", Iterables.isEmpty(result.orphanFileLocations())); + Assert.assertTrue( + "Should not delete any files", Iterables.isEmpty(result.orphanFileLocations())); } @Test @@ -367,16 +331,11 @@ public void testMetadataFolderIsIntact() throws InterruptedException { props.put(TableProperties.WRITE_DATA_LOCATION, tableLocation); Table table = TABLES.create(SCHEMA, SPEC, props, tableLocation); - List records = Lists.newArrayList( - new ThreeColumnRecord(1, "AAAAAAAAAA", "AAAA") - ); + List records = + Lists.newArrayList(new ThreeColumnRecord(1, "AAAAAAAAAA", "AAAA")); Dataset df = spark.createDataFrame(records, ThreeColumnRecord.class).coalesce(1); - df.select("c1", "c2", "c3") - .write() - .format("iceberg") - .mode("append") - .save(tableLocation); + df.select("c1", "c2", "c3").write().format("iceberg").mode("append").save(tableLocation); df.write().mode("append").parquet(tableLocation + "/c2_trunc=AA/c3=AAAA"); @@ -385,16 +344,14 @@ public void testMetadataFolderIsIntact() throws InterruptedException { SparkActions actions = SparkActions.get(); - DeleteOrphanFiles.Result result = actions.deleteOrphanFiles(table) - .olderThan(System.currentTimeMillis()) - .execute(); + DeleteOrphanFiles.Result result = + actions.deleteOrphanFiles(table).olderThan(System.currentTimeMillis()).execute(); Assert.assertEquals("Should delete 1 file", 1, Iterables.size(result.orphanFileLocations())); Dataset resultDF = spark.read().format("iceberg").load(tableLocation); - List actualRecords = resultDF - .as(Encoders.bean(ThreeColumnRecord.class)) - .collectAsList(); + List actualRecords = + 
resultDF.as(Encoders.bean(ThreeColumnRecord.class)).collectAsList(); Assert.assertEquals("Rows must match", records, actualRecords); } @@ -402,16 +359,11 @@ public void testMetadataFolderIsIntact() throws InterruptedException { public void testOlderThanTimestamp() throws InterruptedException { Table table = TABLES.create(SCHEMA, SPEC, Maps.newHashMap(), tableLocation); - List records = Lists.newArrayList( - new ThreeColumnRecord(1, "AAAAAAAAAA", "AAAA") - ); + List records = + Lists.newArrayList(new ThreeColumnRecord(1, "AAAAAAAAAA", "AAAA")); Dataset df = spark.createDataFrame(records, ThreeColumnRecord.class).coalesce(1); - df.select("c1", "c2", "c3") - .write() - .format("iceberg") - .mode("append") - .save(tableLocation); + df.select("c1", "c2", "c3").write().format("iceberg").mode("append").save(tableLocation); df.write().mode("append").parquet(tableLocation + "/data/c2_trunc=AA/c3=AAAA"); df.write().mode("append").parquet(tableLocation + "/data/c2_trunc=AA/c3=AAAA"); @@ -426,11 +378,11 @@ public void testOlderThanTimestamp() throws InterruptedException { SparkActions actions = SparkActions.get(); - DeleteOrphanFiles.Result result = actions.deleteOrphanFiles(table) - .olderThan(timestamp) - .execute(); + DeleteOrphanFiles.Result result = + actions.deleteOrphanFiles(table).olderThan(timestamp).execute(); - Assert.assertEquals("Should delete only 2 files", 2, Iterables.size(result.orphanFileLocations())); + Assert.assertEquals( + "Should delete only 2 files", 2, Iterables.size(result.orphanFileLocations())); } @Test @@ -440,34 +392,26 @@ public void testRemoveUnreachableMetadataVersionFiles() throws InterruptedExcept props.put(TableProperties.METADATA_PREVIOUS_VERSIONS_MAX, "1"); Table table = TABLES.create(SCHEMA, SPEC, props, tableLocation); - List records = Lists.newArrayList( - new ThreeColumnRecord(1, "AAAAAAAAAA", "AAAA") - ); + List records = + Lists.newArrayList(new ThreeColumnRecord(1, "AAAAAAAAAA", "AAAA")); Dataset df = spark.createDataFrame(records, ThreeColumnRecord.class); - df.select("c1", "c2", "c3") - .write() - .format("iceberg") - .mode("append") - .save(tableLocation); + df.select("c1", "c2", "c3").write().format("iceberg").mode("append").save(tableLocation); - df.select("c1", "c2", "c3") - .write() - .format("iceberg") - .mode("append") - .save(tableLocation); + df.select("c1", "c2", "c3").write().format("iceberg").mode("append").save(tableLocation); // sleep for 1 second to unsure files will be old enough Thread.sleep(1000); SparkActions actions = SparkActions.get(); - DeleteOrphanFiles.Result result = actions.deleteOrphanFiles(table) - .olderThan(System.currentTimeMillis()) - .execute(); + DeleteOrphanFiles.Result result = + actions.deleteOrphanFiles(table).olderThan(System.currentTimeMillis()).execute(); Assert.assertEquals("Should delete 1 file", 1, Iterables.size(result.orphanFileLocations())); - Assert.assertTrue("Should remove v1 file", StreamSupport.stream(result.orphanFileLocations().spliterator(), false) + Assert.assertTrue( + "Should remove v1 file", + StreamSupport.stream(result.orphanFileLocations().spliterator(), false) .anyMatch(file -> file.contains("v1.metadata.json"))); List expectedRecords = Lists.newArrayList(); @@ -475,9 +419,8 @@ public void testRemoveUnreachableMetadataVersionFiles() throws InterruptedExcept expectedRecords.addAll(records); Dataset resultDF = spark.read().format("iceberg").load(tableLocation); - List actualRecords = resultDF - .as(Encoders.bean(ThreeColumnRecord.class)) - .collectAsList(); + List actualRecords = + 
resultDF.as(Encoders.bean(ThreeColumnRecord.class)).collectAsList(); Assert.assertEquals("Rows must match", expectedRecords, actualRecords); } @@ -492,27 +435,22 @@ public void testManyTopLevelPartitions() throws InterruptedException { Dataset df = spark.createDataFrame(records, ThreeColumnRecord.class); - df.select("c1", "c2", "c3") - .write() - .format("iceberg") - .mode("append") - .save(tableLocation); + df.select("c1", "c2", "c3").write().format("iceberg").mode("append").save(tableLocation); // sleep for 1 second to unsure files will be old enough Thread.sleep(1000); SparkActions actions = SparkActions.get(); - DeleteOrphanFiles.Result result = actions.deleteOrphanFiles(table) - .olderThan(System.currentTimeMillis()) - .execute(); + DeleteOrphanFiles.Result result = + actions.deleteOrphanFiles(table).olderThan(System.currentTimeMillis()).execute(); - Assert.assertTrue("Should not delete any files", Iterables.isEmpty(result.orphanFileLocations())); + Assert.assertTrue( + "Should not delete any files", Iterables.isEmpty(result.orphanFileLocations())); Dataset resultDF = spark.read().format("iceberg").load(tableLocation); - List actualRecords = resultDF - .as(Encoders.bean(ThreeColumnRecord.class)) - .collectAsList(); + List actualRecords = + resultDF.as(Encoders.bean(ThreeColumnRecord.class)).collectAsList(); Assert.assertEquals("Rows must match", records, actualRecords); } @@ -527,32 +465,29 @@ public void testManyLeafPartitions() throws InterruptedException { Dataset df = spark.createDataFrame(records, ThreeColumnRecord.class); - df.select("c1", "c2", "c3") - .write() - .format("iceberg") - .mode("append") - .save(tableLocation); + df.select("c1", "c2", "c3").write().format("iceberg").mode("append").save(tableLocation); // sleep for 1 second to unsure files will be old enough Thread.sleep(1000); SparkActions actions = SparkActions.get(); - DeleteOrphanFiles.Result result = actions.deleteOrphanFiles(table) - .olderThan(System.currentTimeMillis()) - .execute(); + DeleteOrphanFiles.Result result = + actions.deleteOrphanFiles(table).olderThan(System.currentTimeMillis()).execute(); - Assert.assertTrue("Should not delete any files", Iterables.isEmpty(result.orphanFileLocations())); + Assert.assertTrue( + "Should not delete any files", Iterables.isEmpty(result.orphanFileLocations())); Dataset resultDF = spark.read().format("iceberg").load(tableLocation); - List actualRecords = resultDF - .as(Encoders.bean(ThreeColumnRecord.class)) - .collectAsList(); + List actualRecords = + resultDF.as(Encoders.bean(ThreeColumnRecord.class)).collectAsList(); Assert.assertEquals("Rows must match", records, actualRecords); } private List snapshotFiles(long snapshotId) { - return spark.read().format("iceberg") + return spark + .read() + .format("iceberg") .option("snapshot-id", snapshotId) .load(tableLocation + "#files") .select("file_path") @@ -562,11 +497,12 @@ private List snapshotFiles(long snapshotId) { @Test public void testRemoveOrphanFilesWithRelativeFilePath() throws IOException, InterruptedException { - Table table = TABLES.create(SCHEMA, PartitionSpec.unpartitioned(), Maps.newHashMap(), tableDir.getAbsolutePath()); + Table table = + TABLES.create( + SCHEMA, PartitionSpec.unpartitioned(), Maps.newHashMap(), tableDir.getAbsolutePath()); - List records = Lists.newArrayList( - new ThreeColumnRecord(1, "AAAAAAAAAA", "AAAA") - ); + List records = + Lists.newArrayList(new ThreeColumnRecord(1, "AAAAAAAAAA", "AAAA")); Dataset df = spark.createDataFrame(records, ThreeColumnRecord.class).coalesce(1); @@ 
-576,11 +512,14 @@ public void testRemoveOrphanFilesWithRelativeFilePath() throws IOException, Inte .mode("append") .save(tableDir.getAbsolutePath()); - List validFiles = spark.read().format("iceberg") - .load(tableLocation + "#files") - .select("file_path") - .as(Encoders.STRING()) - .collectAsList(); + List validFiles = + spark + .read() + .format("iceberg") + .load(tableLocation + "#files") + .select("file_path") + .as(Encoders.STRING()) + .collectAsList(); Assert.assertEquals("Should be 1 valid files", 1, validFiles.size()); String validFile = validFiles.get(0); @@ -588,10 +527,11 @@ public void testRemoveOrphanFilesWithRelativeFilePath() throws IOException, Inte Path dataPath = new Path(tableLocation + "/data"); FileSystem fs = dataPath.getFileSystem(spark.sessionState().newHadoopConf()); - List allFiles = Arrays.stream(fs.listStatus(dataPath, HiddenPathFilter.get())) - .filter(FileStatus::isFile) - .map(file -> file.getPath().toString()) - .collect(Collectors.toList()); + List allFiles = + Arrays.stream(fs.listStatus(dataPath, HiddenPathFilter.get())) + .filter(FileStatus::isFile) + .map(file -> file.getPath().toString()) + .collect(Collectors.toList()); Assert.assertEquals("Should be 2 files", 2, allFiles.size()); List invalidFiles = Lists.newArrayList(allFiles); @@ -602,10 +542,12 @@ public void testRemoveOrphanFilesWithRelativeFilePath() throws IOException, Inte Thread.sleep(1000); SparkActions actions = SparkActions.get(); - DeleteOrphanFiles.Result result = actions.deleteOrphanFiles(table) - .olderThan(System.currentTimeMillis()) - .deleteWith(s -> { }) - .execute(); + DeleteOrphanFiles.Result result = + actions + .deleteOrphanFiles(table) + .olderThan(System.currentTimeMillis()) + .deleteWith(s -> {}) + .execute(); Assert.assertEquals("Action should find 1 file", invalidFiles, result.orphanFileLocations()); Assert.assertTrue("Invalid file should be present", fs.exists(new Path(invalidFiles.get(0)))); } @@ -618,18 +560,15 @@ public void testRemoveOrphanFilesWithHadoopCatalog() throws InterruptedException Namespace namespace = Namespace.of(namespaceName); TableIdentifier tableIdentifier = TableIdentifier.of(namespace, tableName); - Table table = catalog.createTable(tableIdentifier, SCHEMA, PartitionSpec.unpartitioned(), Maps.newHashMap()); + Table table = + catalog.createTable( + tableIdentifier, SCHEMA, PartitionSpec.unpartitioned(), Maps.newHashMap()); - List records = Lists.newArrayList( - new ThreeColumnRecord(1, "AAAAAAAAAA", "AAAA") - ); + List records = + Lists.newArrayList(new ThreeColumnRecord(1, "AAAAAAAAAA", "AAAA")); Dataset df = spark.createDataFrame(records, ThreeColumnRecord.class).coalesce(1); - df.select("c1", "c2", "c3") - .write() - .format("iceberg") - .mode("append") - .save(table.location()); + df.select("c1", "c2", "c3").write().format("iceberg").mode("append").save(table.location()); df.write().mode("append").parquet(table.location() + "/data"); @@ -638,28 +577,30 @@ public void testRemoveOrphanFilesWithHadoopCatalog() throws InterruptedException table.refresh(); - DeleteOrphanFiles.Result result = SparkActions.get() - .deleteOrphanFiles(table) - .olderThan(System.currentTimeMillis()) - .execute(); + DeleteOrphanFiles.Result result = + SparkActions.get().deleteOrphanFiles(table).olderThan(System.currentTimeMillis()).execute(); - Assert.assertEquals("Should delete only 1 files", 1, Iterables.size(result.orphanFileLocations())); + Assert.assertEquals( + "Should delete only 1 files", 1, Iterables.size(result.orphanFileLocations())); Dataset resultDF = 
spark.read().format("iceberg").load(table.location()); - List actualRecords = resultDF - .as(Encoders.bean(ThreeColumnRecord.class)) - .collectAsList(); + List actualRecords = + resultDF.as(Encoders.bean(ThreeColumnRecord.class)).collectAsList(); Assert.assertEquals("Rows must match", records, actualRecords); } @Test public void testHiveCatalogTable() throws IOException { - Table table = catalog.createTable(TableIdentifier.of("default", "hivetestorphan"), SCHEMA, SPEC, tableLocation, - Maps.newHashMap()); + Table table = + catalog.createTable( + TableIdentifier.of("default", "hivetestorphan"), + SCHEMA, + SPEC, + tableLocation, + Maps.newHashMap()); - List records = Lists.newArrayList( - new ThreeColumnRecord(1, "AAAAAAAAAA", "AAAA") - ); + List records = + Lists.newArrayList(new ThreeColumnRecord(1, "AAAAAAAAAA", "AAAA")); Dataset df = spark.createDataFrame(records, ThreeColumnRecord.class).coalesce(1); @@ -672,35 +613,35 @@ public void testHiveCatalogTable() throws IOException { String location = table.location().replaceFirst("file:", ""); new File(location + "/data/trashfile").createNewFile(); - DeleteOrphanFiles.Result result = SparkActions.get().deleteOrphanFiles(table) - .olderThan(System.currentTimeMillis() + 1000).execute(); - Assert.assertTrue("trash file should be removed", + DeleteOrphanFiles.Result result = + SparkActions.get() + .deleteOrphanFiles(table) + .olderThan(System.currentTimeMillis() + 1000) + .execute(); + Assert.assertTrue( + "trash file should be removed", StreamSupport.stream(result.orphanFileLocations().spliterator(), false) .anyMatch(file -> file.contains("file:" + location + "data/trashfile"))); } @Test public void testGarbageCollectionDisabled() { - Table table = TABLES.create(SCHEMA, PartitionSpec.unpartitioned(), Maps.newHashMap(), tableLocation); + Table table = + TABLES.create(SCHEMA, PartitionSpec.unpartitioned(), Maps.newHashMap(), tableLocation); - List records = Lists.newArrayList( - new ThreeColumnRecord(1, "AAAAAAAAAA", "AAAA") - ); + List records = + Lists.newArrayList(new ThreeColumnRecord(1, "AAAAAAAAAA", "AAAA")); Dataset df = spark.createDataFrame(records, ThreeColumnRecord.class).coalesce(1); - df.select("c1", "c2", "c3") - .write() - .format("iceberg") - .mode("append") - .save(tableLocation); + df.select("c1", "c2", "c3").write().format("iceberg").mode("append").save(tableLocation); - table.updateProperties() - .set(TableProperties.GC_ENABLED, "false") - .commit(); + table.updateProperties().set(TableProperties.GC_ENABLED, "false").commit(); - AssertHelpers.assertThrows("Should complain about removing orphan files", - ValidationException.class, "Cannot remove orphan files: GC is disabled", + AssertHelpers.assertThrows( + "Should complain about removing orphan files", + ValidationException.class, + "Cannot remove orphan files: GC is disabled", () -> SparkActions.get().deleteOrphanFiles(table).execute()); } } diff --git a/spark/v2.4/spark/src/test/java/org/apache/iceberg/spark/actions/TestRewriteManifestsAction.java b/spark/v2.4/spark/src/test/java/org/apache/iceberg/spark/actions/TestRewriteManifestsAction.java index 40adb7d4c918..b98c717a4aad 100644 --- a/spark/v2.4/spark/src/test/java/org/apache/iceberg/spark/actions/TestRewriteManifestsAction.java +++ b/spark/v2.4/spark/src/test/java/org/apache/iceberg/spark/actions/TestRewriteManifestsAction.java @@ -16,9 +16,13 @@ * specific language governing permissions and limitations * under the License. 
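// ---------------------------------------------------------------------------------------------
// Editor's note, not part of the patch: the next file, TestRewriteManifestsAction, exercises the
// RewriteManifests Spark action. A hedged sketch of the fluent calls those tests use (rewriteIf,
// stagingLocation, and the "use-caching" option); the class name, the staging-path parameter,
// and the import paths are assumptions for illustration.
import org.apache.iceberg.Table;
import org.apache.iceberg.actions.RewriteManifests;
import org.apache.iceberg.spark.actions.SparkActions;

class RewriteManifestsSketch {
  static RewriteManifests.Result rewriteAllManifests(Table table, String stagingLocation) {
    return SparkActions.get()
        .rewriteManifests(table)
        .rewriteIf(manifest -> true) // rewrite everything; the tests also filter on manifest.path()
        .stagingLocation(stagingLocation) // where rewritten manifests are written before the commit
        .option("use-caching", "false") // option shown in testRewriteManifestsWithPredicate
        .execute();
  }
}
// ---------------------------------------------------------------------------------------------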
*/ - package org.apache.iceberg.spark.actions; +import static org.apache.iceberg.types.Types.NestedField.optional; +import static org.mockito.Mockito.doAnswer; +import static org.mockito.Mockito.spy; +import static org.mockito.Mockito.when; + import java.io.File; import java.io.IOException; import java.util.List; @@ -53,28 +57,22 @@ import org.junit.runner.RunWith; import org.junit.runners.Parameterized; -import static org.apache.iceberg.types.Types.NestedField.optional; -import static org.mockito.Mockito.doAnswer; -import static org.mockito.Mockito.spy; -import static org.mockito.Mockito.when; - @RunWith(Parameterized.class) public class TestRewriteManifestsAction extends SparkTestBase { private static final HadoopTables TABLES = new HadoopTables(new Configuration()); - private static final Schema SCHEMA = new Schema( - optional(1, "c1", Types.IntegerType.get()), - optional(2, "c2", Types.StringType.get()), - optional(3, "c3", Types.StringType.get()) - ); + private static final Schema SCHEMA = + new Schema( + optional(1, "c1", Types.IntegerType.get()), + optional(2, "c2", Types.StringType.get()), + optional(3, "c3", Types.StringType.get())); @Parameterized.Parameters(name = "snapshotIdInheritanceEnabled = {0}") public static Object[] parameters() { - return new Object[] { "true", "false" }; + return new Object[] {"true", "false"}; } - @Rule - public TemporaryFolder temp = new TemporaryFolder(); + @Rule public TemporaryFolder temp = new TemporaryFolder(); private final String snapshotIdInheritanceEnabled; private String tableLocation = null; @@ -100,7 +98,8 @@ public void testRewriteManifestsEmptyTable() throws IOException { SparkActions actions = SparkActions.get(); - actions.rewriteManifests(table) + actions + .rewriteManifests(table) .rewriteIf(manifest -> true) .stagingLocation(temp.newFolder().toString()) .execute(); @@ -115,16 +114,15 @@ public void testRewriteSmallManifestsNonPartitionedTable() { options.put(TableProperties.SNAPSHOT_ID_INHERITANCE_ENABLED, snapshotIdInheritanceEnabled); Table table = TABLES.create(SCHEMA, spec, options, tableLocation); - List records1 = Lists.newArrayList( - new ThreeColumnRecord(1, null, "AAAA"), - new ThreeColumnRecord(1, "BBBBBBBBBB", "BBBB") - ); + List records1 = + Lists.newArrayList( + new ThreeColumnRecord(1, null, "AAAA"), new ThreeColumnRecord(1, "BBBBBBBBBB", "BBBB")); writeRecords(records1); - List records2 = Lists.newArrayList( - new ThreeColumnRecord(2, "CCCCCCCCCC", "CCCC"), - new ThreeColumnRecord(2, "DDDDDDDDDD", "DDDD") - ); + List records2 = + Lists.newArrayList( + new ThreeColumnRecord(2, "CCCCCCCCCC", "CCCC"), + new ThreeColumnRecord(2, "DDDDDDDDDD", "DDDD")); writeRecords(records2); table.refresh(); @@ -134,12 +132,13 @@ public void testRewriteSmallManifestsNonPartitionedTable() { SparkActions actions = SparkActions.get(); - RewriteManifests.Result result = actions.rewriteManifests(table) - .rewriteIf(manifest -> true) - .execute(); + RewriteManifests.Result result = + actions.rewriteManifests(table).rewriteIf(manifest -> true).execute(); - Assert.assertEquals("Action should rewrite 2 manifests", 2, Iterables.size(result.rewrittenManifests())); - Assert.assertEquals("Action should add 1 manifests", 1, Iterables.size(result.addedManifests())); + Assert.assertEquals( + "Action should rewrite 2 manifests", 2, Iterables.size(result.rewrittenManifests())); + Assert.assertEquals( + "Action should add 1 manifests", 1, Iterables.size(result.addedManifests())); table.refresh(); @@ -155,9 +154,8 @@ public void 
testRewriteSmallManifestsNonPartitionedTable() { expectedRecords.addAll(records2); Dataset resultDF = spark.read().format("iceberg").load(tableLocation); - List actualRecords = resultDF.sort("c1", "c2") - .as(Encoders.bean(ThreeColumnRecord.class)) - .collectAsList(); + List actualRecords = + resultDF.sort("c1", "c2").as(Encoders.bean(ThreeColumnRecord.class)).collectAsList(); Assert.assertEquals("Rows must match", expectedRecords, actualRecords); } @@ -169,16 +167,15 @@ public void testRewriteManifestsWithCommitStateUnknownException() { options.put(TableProperties.SNAPSHOT_ID_INHERITANCE_ENABLED, snapshotIdInheritanceEnabled); Table table = TABLES.create(SCHEMA, spec, options, tableLocation); - List records1 = Lists.newArrayList( - new ThreeColumnRecord(1, null, "AAAA"), - new ThreeColumnRecord(1, "BBBBBBBBBB", "BBBB") - ); + List records1 = + Lists.newArrayList( + new ThreeColumnRecord(1, null, "AAAA"), new ThreeColumnRecord(1, "BBBBBBBBBB", "BBBB")); writeRecords(records1); - List records2 = Lists.newArrayList( - new ThreeColumnRecord(2, "CCCCCCCCCC", "CCCC"), - new ThreeColumnRecord(2, "DDDDDDDDDD", "DDDD") - ); + List records2 = + Lists.newArrayList( + new ThreeColumnRecord(2, "CCCCCCCCCC", "CCCC"), + new ThreeColumnRecord(2, "DDDDDDDDDD", "DDDD")); writeRecords(records2); table.refresh(); @@ -191,15 +188,19 @@ public void testRewriteManifestsWithCommitStateUnknownException() { // create a spy which would throw a CommitStateUnknownException after successful commit. org.apache.iceberg.RewriteManifests newRewriteManifests = table.rewriteManifests(); org.apache.iceberg.RewriteManifests spyNewRewriteManifests = spy(newRewriteManifests); - doAnswer(invocation -> { - newRewriteManifests.commit(); - throw new CommitStateUnknownException(new RuntimeException("Datacenter on Fire")); - }).when(spyNewRewriteManifests).commit(); + doAnswer( + invocation -> { + newRewriteManifests.commit(); + throw new CommitStateUnknownException(new RuntimeException("Datacenter on Fire")); + }) + .when(spyNewRewriteManifests) + .commit(); Table spyTable = spy(table); when(spyTable.rewriteManifests()).thenReturn(spyNewRewriteManifests); - AssertHelpers.assertThrowsCause("Should throw a Commit State Unknown Exception", + AssertHelpers.assertThrowsCause( + "Should throw a Commit State Unknown Exception", RuntimeException.class, "Datacenter on Fire", () -> actions.rewriteManifests(spyTable).rewriteIf(manifest -> true).execute()); @@ -219,45 +220,40 @@ public void testRewriteManifestsWithCommitStateUnknownException() { expectedRecords.addAll(records2); Dataset resultDF = spark.read().format("iceberg").load(tableLocation); - List actualRecords = resultDF.sort("c1", "c2") - .as(Encoders.bean(ThreeColumnRecord.class)) - .collectAsList(); + List actualRecords = + resultDF.sort("c1", "c2").as(Encoders.bean(ThreeColumnRecord.class)).collectAsList(); Assert.assertEquals("Rows must match", expectedRecords, actualRecords); } @Test public void testRewriteSmallManifestsPartitionedTable() { - PartitionSpec spec = PartitionSpec.builderFor(SCHEMA) - .identity("c1") - .truncate("c2", 2) - .build(); + PartitionSpec spec = PartitionSpec.builderFor(SCHEMA).identity("c1").truncate("c2", 2).build(); Map options = Maps.newHashMap(); options.put(TableProperties.SNAPSHOT_ID_INHERITANCE_ENABLED, snapshotIdInheritanceEnabled); Table table = TABLES.create(SCHEMA, spec, options, tableLocation); - List records1 = Lists.newArrayList( - new ThreeColumnRecord(1, null, "AAAA"), - new ThreeColumnRecord(1, "BBBBBBBBBB", "BBBB") - ); + List records1 = 
+ Lists.newArrayList( + new ThreeColumnRecord(1, null, "AAAA"), new ThreeColumnRecord(1, "BBBBBBBBBB", "BBBB")); writeRecords(records1); - List records2 = Lists.newArrayList( - new ThreeColumnRecord(2, "CCCCCCCCCC", "CCCC"), - new ThreeColumnRecord(2, "DDDDDDDDDD", "DDDD") - ); + List records2 = + Lists.newArrayList( + new ThreeColumnRecord(2, "CCCCCCCCCC", "CCCC"), + new ThreeColumnRecord(2, "DDDDDDDDDD", "DDDD")); writeRecords(records2); - List records3 = Lists.newArrayList( - new ThreeColumnRecord(3, "EEEEEEEEEE", "EEEE"), - new ThreeColumnRecord(3, "FFFFFFFFFF", "FFFF") - ); + List records3 = + Lists.newArrayList( + new ThreeColumnRecord(3, "EEEEEEEEEE", "EEEE"), + new ThreeColumnRecord(3, "FFFFFFFFFF", "FFFF")); writeRecords(records3); - List records4 = Lists.newArrayList( - new ThreeColumnRecord(4, "GGGGGGGGGG", "GGGG"), - new ThreeColumnRecord(4, "HHHHHHHHHG", "HHHH") - ); + List records4 = + Lists.newArrayList( + new ThreeColumnRecord(4, "GGGGGGGGGG", "GGGG"), + new ThreeColumnRecord(4, "HHHHHHHHHG", "HHHH")); writeRecords(records4); table.refresh(); @@ -271,16 +267,18 @@ public void testRewriteSmallManifestsPartitionedTable() { long manifestEntrySizeBytes = computeManifestEntrySizeBytes(manifests); long targetManifestSizeBytes = (long) (1.05 * 4 * manifestEntrySizeBytes); - table.updateProperties() + table + .updateProperties() .set(TableProperties.MANIFEST_TARGET_SIZE_BYTES, String.valueOf(targetManifestSizeBytes)) .commit(); - RewriteManifests.Result result = actions.rewriteManifests(table) - .rewriteIf(manifest -> true) - .execute(); + RewriteManifests.Result result = + actions.rewriteManifests(table).rewriteIf(manifest -> true).execute(); - Assert.assertEquals("Action should rewrite 4 manifests", 4, Iterables.size(result.rewrittenManifests())); - Assert.assertEquals("Action should add 2 manifests", 2, Iterables.size(result.addedManifests())); + Assert.assertEquals( + "Action should rewrite 4 manifests", 4, Iterables.size(result.rewrittenManifests())); + Assert.assertEquals( + "Action should add 2 manifests", 2, Iterables.size(result.addedManifests())); table.refresh(); @@ -302,32 +300,29 @@ public void testRewriteSmallManifestsPartitionedTable() { expectedRecords.addAll(records4); Dataset resultDF = spark.read().format("iceberg").load(tableLocation); - List actualRecords = resultDF.sort("c1", "c2") - .as(Encoders.bean(ThreeColumnRecord.class)) - .collectAsList(); + List actualRecords = + resultDF.sort("c1", "c2").as(Encoders.bean(ThreeColumnRecord.class)).collectAsList(); Assert.assertEquals("Rows must match", expectedRecords, actualRecords); } @Test public void testRewriteImportedManifests() throws IOException { - PartitionSpec spec = PartitionSpec.builderFor(SCHEMA) - .identity("c3") - .build(); + PartitionSpec spec = PartitionSpec.builderFor(SCHEMA).identity("c3").build(); Map options = Maps.newHashMap(); options.put(TableProperties.SNAPSHOT_ID_INHERITANCE_ENABLED, snapshotIdInheritanceEnabled); Table table = TABLES.create(SCHEMA, spec, options, tableLocation); - List records = Lists.newArrayList( - new ThreeColumnRecord(1, null, "AAAA"), - new ThreeColumnRecord(1, "BBBBBBBBBB", "BBBB") - ); + List records = + Lists.newArrayList( + new ThreeColumnRecord(1, null, "AAAA"), new ThreeColumnRecord(1, "BBBBBBBBBB", "BBBB")); File parquetTableDir = temp.newFolder("parquet_table"); String parquetTableLocation = parquetTableDir.toURI().toString(); try { Dataset inputDF = spark.createDataFrame(records, ThreeColumnRecord.class); - inputDF.select("c1", "c2", "c3") + inputDF + 
.select("c1", "c2", "c3") .write() .format("parquet") .mode("overwrite") @@ -336,19 +331,26 @@ public void testRewriteImportedManifests() throws IOException { .saveAsTable("parquet_table"); File stagingDir = temp.newFolder("staging-dir"); - SparkTableUtil.importSparkTable(spark, new TableIdentifier("parquet_table"), table, stagingDir.toString()); + SparkTableUtil.importSparkTable( + spark, new TableIdentifier("parquet_table"), table, stagingDir.toString()); Snapshot snapshot = table.currentSnapshot(); SparkActions actions = SparkActions.get(); - RewriteManifests.Result result = actions.rewriteManifests(table) - .rewriteIf(manifest -> true) - .stagingLocation(temp.newFolder().toString()) - .execute(); + RewriteManifests.Result result = + actions + .rewriteManifests(table) + .rewriteIf(manifest -> true) + .stagingLocation(temp.newFolder().toString()) + .execute(); - Assert.assertEquals("Action should rewrite all manifests", snapshot.allManifests(), result.rewrittenManifests()); - Assert.assertEquals("Action should add 1 manifest", 1, Iterables.size(result.addedManifests())); + Assert.assertEquals( + "Action should rewrite all manifests", + snapshot.allManifests(), + result.rewrittenManifests()); + Assert.assertEquals( + "Action should add 1 manifest", 1, Iterables.size(result.addedManifests())); } finally { spark.sql("DROP TABLE parquet_table"); @@ -357,9 +359,7 @@ public void testRewriteImportedManifests() throws IOException { @Test public void testRewriteLargeManifestsPartitionedTable() throws IOException { - PartitionSpec spec = PartitionSpec.builderFor(SCHEMA) - .identity("c3") - .build(); + PartitionSpec spec = PartitionSpec.builderFor(SCHEMA).identity("c3").build(); Map options = Maps.newHashMap(); options.put(TableProperties.SNAPSHOT_ID_INHERITANCE_ENABLED, snapshotIdInheritanceEnabled); Table table = TABLES.create(SCHEMA, spec, options, tableLocation); @@ -379,19 +379,26 @@ public void testRewriteLargeManifestsPartitionedTable() throws IOException { Assert.assertEquals("Should have 1 manifests before rewrite", 1, manifests.size()); // set the target manifest size to a small value to force splitting records into multiple files - table.updateProperties() - .set(TableProperties.MANIFEST_TARGET_SIZE_BYTES, String.valueOf(manifests.get(0).length() / 2)) + table + .updateProperties() + .set( + TableProperties.MANIFEST_TARGET_SIZE_BYTES, + String.valueOf(manifests.get(0).length() / 2)) .commit(); SparkActions actions = SparkActions.get(); - RewriteManifests.Result result = actions.rewriteManifests(table) - .rewriteIf(manifest -> true) - .stagingLocation(temp.newFolder().toString()) - .execute(); + RewriteManifests.Result result = + actions + .rewriteManifests(table) + .rewriteIf(manifest -> true) + .stagingLocation(temp.newFolder().toString()) + .execute(); - Assert.assertEquals("Action should rewrite 1 manifest", 1, Iterables.size(result.rewrittenManifests())); - Assert.assertEquals("Action should add 2 manifests", 2, Iterables.size(result.addedManifests())); + Assert.assertEquals( + "Action should rewrite 1 manifest", 1, Iterables.size(result.rewrittenManifests())); + Assert.assertEquals( + "Action should add 2 manifests", 2, Iterables.size(result.addedManifests())); table.refresh(); @@ -399,33 +406,28 @@ public void testRewriteLargeManifestsPartitionedTable() throws IOException { Assert.assertEquals("Should have 2 manifests after rewrite", 2, newManifests.size()); Dataset resultDF = spark.read().format("iceberg").load(tableLocation); - List actualRecords = resultDF.sort("c1", "c2") - 
.as(Encoders.bean(ThreeColumnRecord.class)) - .collectAsList(); + List actualRecords = + resultDF.sort("c1", "c2").as(Encoders.bean(ThreeColumnRecord.class)).collectAsList(); Assert.assertEquals("Rows must match", records, actualRecords); } @Test public void testRewriteManifestsWithPredicate() throws IOException { - PartitionSpec spec = PartitionSpec.builderFor(SCHEMA) - .identity("c1") - .truncate("c2", 2) - .build(); + PartitionSpec spec = PartitionSpec.builderFor(SCHEMA).identity("c1").truncate("c2", 2).build(); Map options = Maps.newHashMap(); options.put(TableProperties.SNAPSHOT_ID_INHERITANCE_ENABLED, snapshotIdInheritanceEnabled); Table table = TABLES.create(SCHEMA, spec, options, tableLocation); - List records1 = Lists.newArrayList( - new ThreeColumnRecord(1, null, "AAAA"), - new ThreeColumnRecord(1, "BBBBBBBBBB", "BBBB") - ); + List records1 = + Lists.newArrayList( + new ThreeColumnRecord(1, null, "AAAA"), new ThreeColumnRecord(1, "BBBBBBBBBB", "BBBB")); writeRecords(records1); - List records2 = Lists.newArrayList( - new ThreeColumnRecord(2, "CCCCCCCCCC", "CCCC"), - new ThreeColumnRecord(2, "DDDDDDDDDD", "DDDD") - ); + List records2 = + Lists.newArrayList( + new ThreeColumnRecord(2, "CCCCCCCCCC", "CCCC"), + new ThreeColumnRecord(2, "DDDDDDDDDD", "DDDD")); writeRecords(records2); table.refresh(); @@ -436,14 +438,18 @@ public void testRewriteManifestsWithPredicate() throws IOException { SparkActions actions = SparkActions.get(); // rewrite only the first manifest without caching - RewriteManifests.Result result = actions.rewriteManifests(table) - .rewriteIf(manifest -> manifest.path().equals(manifests.get(0).path())) - .stagingLocation(temp.newFolder().toString()) - .option("use-caching", "false") - .execute(); - - Assert.assertEquals("Action should rewrite 1 manifest", 1, Iterables.size(result.rewrittenManifests())); - Assert.assertEquals("Action should add 1 manifests", 1, Iterables.size(result.addedManifests())); + RewriteManifests.Result result = + actions + .rewriteManifests(table) + .rewriteIf(manifest -> manifest.path().equals(manifests.get(0).path())) + .stagingLocation(temp.newFolder().toString()) + .option("use-caching", "false") + .execute(); + + Assert.assertEquals( + "Action should rewrite 1 manifest", 1, Iterables.size(result.rewrittenManifests())); + Assert.assertEquals( + "Action should add 1 manifests", 1, Iterables.size(result.addedManifests())); table.refresh(); @@ -451,16 +457,16 @@ public void testRewriteManifestsWithPredicate() throws IOException { Assert.assertEquals("Should have 2 manifests after rewrite", 2, newManifests.size()); Assert.assertFalse("First manifest must be rewritten", newManifests.contains(manifests.get(0))); - Assert.assertTrue("Second manifest must not be rewritten", newManifests.contains(manifests.get(1))); + Assert.assertTrue( + "Second manifest must not be rewritten", newManifests.contains(manifests.get(1))); List expectedRecords = Lists.newArrayList(); expectedRecords.addAll(records1); expectedRecords.addAll(records2); Dataset resultDF = spark.read().format("iceberg").load(tableLocation); - List actualRecords = resultDF.sort("c1", "c2") - .as(Encoders.bean(ThreeColumnRecord.class)) - .collectAsList(); + List actualRecords = + resultDF.sort("c1", "c2").as(Encoders.bean(ThreeColumnRecord.class)).collectAsList(); Assert.assertEquals("Rows must match", expectedRecords, actualRecords); } @@ -471,11 +477,7 @@ private void writeRecords(List records) { } private void writeDF(Dataset df) { - df.select("c1", "c2", "c3") - .write() - 
.format("iceberg") - .mode("append") - .save(tableLocation); + df.select("c1", "c2", "c3").write().format("iceberg").mode("append").save(tableLocation); } private long computeManifestEntrySizeBytes(List manifests) { @@ -484,7 +486,8 @@ private long computeManifestEntrySizeBytes(List manifests) { for (ManifestFile manifest : manifests) { totalSize += manifest.length(); - numEntries += manifest.addedFilesCount() + manifest.existingFilesCount() + manifest.deletedFilesCount(); + numEntries += + manifest.addedFilesCount() + manifest.existingFilesCount() + manifest.deletedFilesCount(); } return totalSize / numEntries; diff --git a/spark/v2.4/spark/src/test/java/org/apache/iceberg/spark/data/AvroDataTest.java b/spark/v2.4/spark/src/test/java/org/apache/iceberg/spark/data/AvroDataTest.java index ead159477094..2e99ca98ba16 100644 --- a/spark/v2.4/spark/src/test/java/org/apache/iceberg/spark/data/AvroDataTest.java +++ b/spark/v2.4/spark/src/test/java/org/apache/iceberg/spark/data/AvroDataTest.java @@ -16,9 +16,11 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.data; +import static org.apache.iceberg.types.Types.NestedField.optional; +import static org.apache.iceberg.types.Types.NestedField.required; + import java.io.IOException; import java.util.Map; import java.util.concurrent.atomic.AtomicInteger; @@ -38,34 +40,31 @@ import org.junit.Test; import org.junit.rules.TemporaryFolder; -import static org.apache.iceberg.types.Types.NestedField.optional; -import static org.apache.iceberg.types.Types.NestedField.required; - public abstract class AvroDataTest { protected abstract void writeAndValidate(Schema schema) throws IOException; - protected static final StructType SUPPORTED_PRIMITIVES = StructType.of( - required(100, "id", LongType.get()), - optional(101, "data", Types.StringType.get()), - required(102, "b", Types.BooleanType.get()), - optional(103, "i", Types.IntegerType.get()), - required(104, "l", LongType.get()), - optional(105, "f", Types.FloatType.get()), - required(106, "d", Types.DoubleType.get()), - optional(107, "date", Types.DateType.get()), - required(108, "ts", Types.TimestampType.withZone()), - required(110, "s", Types.StringType.get()), - // required(111, "uuid", Types.UUIDType.get()), - required(112, "fixed", Types.FixedType.ofLength(7)), - optional(113, "bytes", Types.BinaryType.get()), - required(114, "dec_9_0", Types.DecimalType.of(9, 0)), - required(115, "dec_11_2", Types.DecimalType.of(11, 2)), - required(116, "dec_38_10", Types.DecimalType.of(38, 10)) // spark's maximum precision - ); - - @Rule - public TemporaryFolder temp = new TemporaryFolder(); + protected static final StructType SUPPORTED_PRIMITIVES = + StructType.of( + required(100, "id", LongType.get()), + optional(101, "data", Types.StringType.get()), + required(102, "b", Types.BooleanType.get()), + optional(103, "i", Types.IntegerType.get()), + required(104, "l", LongType.get()), + optional(105, "f", Types.FloatType.get()), + required(106, "d", Types.DoubleType.get()), + optional(107, "date", Types.DateType.get()), + required(108, "ts", Types.TimestampType.withZone()), + required(110, "s", Types.StringType.get()), + // required(111, "uuid", Types.UUIDType.get()), + required(112, "fixed", Types.FixedType.ofLength(7)), + optional(113, "bytes", Types.BinaryType.get()), + required(114, "dec_9_0", Types.DecimalType.of(9, 0)), + required(115, "dec_11_2", Types.DecimalType.of(11, 2)), + required(116, "dec_38_10", Types.DecimalType.of(38, 10)) // spark's 
maximum precision + ); + + @Rule public TemporaryFolder temp = new TemporaryFolder(); @Test public void testSimpleStruct() throws IOException { @@ -74,162 +73,208 @@ public void testSimpleStruct() throws IOException { @Test public void testStructWithRequiredFields() throws IOException { - writeAndValidate(TypeUtil.assignIncreasingFreshIds(new Schema( - Lists.transform(SUPPORTED_PRIMITIVES.fields(), Types.NestedField::asRequired)))); + writeAndValidate( + TypeUtil.assignIncreasingFreshIds( + new Schema( + Lists.transform(SUPPORTED_PRIMITIVES.fields(), Types.NestedField::asRequired)))); } @Test public void testStructWithOptionalFields() throws IOException { - writeAndValidate(TypeUtil.assignIncreasingFreshIds(new Schema( - Lists.transform(SUPPORTED_PRIMITIVES.fields(), Types.NestedField::asOptional)))); + writeAndValidate( + TypeUtil.assignIncreasingFreshIds( + new Schema( + Lists.transform(SUPPORTED_PRIMITIVES.fields(), Types.NestedField::asOptional)))); } @Test public void testNestedStruct() throws IOException { - writeAndValidate(TypeUtil.assignIncreasingFreshIds(new Schema(required(1, "struct", SUPPORTED_PRIMITIVES)))); + writeAndValidate( + TypeUtil.assignIncreasingFreshIds(new Schema(required(1, "struct", SUPPORTED_PRIMITIVES)))); } @Test public void testArray() throws IOException { - Schema schema = new Schema( - required(0, "id", LongType.get()), - optional(1, "data", ListType.ofOptional(2, Types.StringType.get()))); + Schema schema = + new Schema( + required(0, "id", LongType.get()), + optional(1, "data", ListType.ofOptional(2, Types.StringType.get()))); writeAndValidate(schema); } @Test public void testArrayOfStructs() throws IOException { - Schema schema = TypeUtil.assignIncreasingFreshIds(new Schema( - required(0, "id", LongType.get()), - optional(1, "data", ListType.ofOptional(2, SUPPORTED_PRIMITIVES)))); + Schema schema = + TypeUtil.assignIncreasingFreshIds( + new Schema( + required(0, "id", LongType.get()), + optional(1, "data", ListType.ofOptional(2, SUPPORTED_PRIMITIVES)))); writeAndValidate(schema); } @Test public void testMap() throws IOException { - Schema schema = new Schema( - required(0, "id", LongType.get()), - optional(1, "data", MapType.ofOptional(2, 3, - Types.StringType.get(), - Types.StringType.get()))); + Schema schema = + new Schema( + required(0, "id", LongType.get()), + optional( + 1, + "data", + MapType.ofOptional(2, 3, Types.StringType.get(), Types.StringType.get()))); writeAndValidate(schema); } @Test public void testNumericMapKey() throws IOException { - Schema schema = new Schema( - required(0, "id", LongType.get()), - optional(1, "data", MapType.ofOptional(2, 3, - Types.LongType.get(), - Types.StringType.get()))); + Schema schema = + new Schema( + required(0, "id", LongType.get()), + optional( + 1, "data", MapType.ofOptional(2, 3, Types.LongType.get(), Types.StringType.get()))); writeAndValidate(schema); } @Test public void testComplexMapKey() throws IOException { - Schema schema = new Schema( - required(0, "id", LongType.get()), - optional(1, "data", MapType.ofOptional(2, 3, - Types.StructType.of( - required(4, "i", Types.IntegerType.get()), - optional(5, "s", Types.StringType.get())), - Types.StringType.get()))); + Schema schema = + new Schema( + required(0, "id", LongType.get()), + optional( + 1, + "data", + MapType.ofOptional( + 2, + 3, + Types.StructType.of( + required(4, "i", Types.IntegerType.get()), + optional(5, "s", Types.StringType.get())), + Types.StringType.get()))); writeAndValidate(schema); } @Test public void testMapOfStructs() throws 
IOException { - Schema schema = TypeUtil.assignIncreasingFreshIds(new Schema( - required(0, "id", LongType.get()), - optional(1, "data", MapType.ofOptional(2, 3, - Types.StringType.get(), - SUPPORTED_PRIMITIVES)))); + Schema schema = + TypeUtil.assignIncreasingFreshIds( + new Schema( + required(0, "id", LongType.get()), + optional( + 1, + "data", + MapType.ofOptional(2, 3, Types.StringType.get(), SUPPORTED_PRIMITIVES)))); writeAndValidate(schema); } @Test public void testMixedTypes() throws IOException { - StructType structType = StructType.of( - required(0, "id", LongType.get()), - optional(1, "list_of_maps", - ListType.ofOptional(2, MapType.ofOptional(3, 4, - Types.StringType.get(), - SUPPORTED_PRIMITIVES))), - optional(5, "map_of_lists", - MapType.ofOptional(6, 7, - Types.StringType.get(), - ListType.ofOptional(8, SUPPORTED_PRIMITIVES))), - required(9, "list_of_lists", - ListType.ofOptional(10, ListType.ofOptional(11, SUPPORTED_PRIMITIVES))), - required(12, "map_of_maps", - MapType.ofOptional(13, 14, - Types.StringType.get(), - MapType.ofOptional(15, 16, + StructType structType = + StructType.of( + required(0, "id", LongType.get()), + optional( + 1, + "list_of_maps", + ListType.ofOptional( + 2, MapType.ofOptional(3, 4, Types.StringType.get(), SUPPORTED_PRIMITIVES))), + optional( + 5, + "map_of_lists", + MapType.ofOptional( + 6, 7, Types.StringType.get(), ListType.ofOptional(8, SUPPORTED_PRIMITIVES))), + required( + 9, + "list_of_lists", + ListType.ofOptional(10, ListType.ofOptional(11, SUPPORTED_PRIMITIVES))), + required( + 12, + "map_of_maps", + MapType.ofOptional( + 13, + 14, Types.StringType.get(), - SUPPORTED_PRIMITIVES))), - required(17, "list_of_struct_of_nested_types", ListType.ofOptional(19, StructType.of( - Types.NestedField.required(20, "m1", MapType.ofOptional(21, 22, - Types.StringType.get(), - SUPPORTED_PRIMITIVES)), - Types.NestedField.optional(23, "l1", ListType.ofRequired(24, SUPPORTED_PRIMITIVES)), - Types.NestedField.required(25, "l2", ListType.ofRequired(26, SUPPORTED_PRIMITIVES)), - Types.NestedField.optional(27, "m2", MapType.ofOptional(28, 29, - Types.StringType.get(), - SUPPORTED_PRIMITIVES)) - ))) - ); - - Schema schema = new Schema(TypeUtil.assignFreshIds(structType, new AtomicInteger(0)::incrementAndGet) - .asStructType().fields()); + MapType.ofOptional(15, 16, Types.StringType.get(), SUPPORTED_PRIMITIVES))), + required( + 17, + "list_of_struct_of_nested_types", + ListType.ofOptional( + 19, + StructType.of( + Types.NestedField.required( + 20, + "m1", + MapType.ofOptional( + 21, 22, Types.StringType.get(), SUPPORTED_PRIMITIVES)), + Types.NestedField.optional( + 23, "l1", ListType.ofRequired(24, SUPPORTED_PRIMITIVES)), + Types.NestedField.required( + 25, "l2", ListType.ofRequired(26, SUPPORTED_PRIMITIVES)), + Types.NestedField.optional( + 27, + "m2", + MapType.ofOptional( + 28, 29, Types.StringType.get(), SUPPORTED_PRIMITIVES)))))); + + Schema schema = + new Schema( + TypeUtil.assignFreshIds(structType, new AtomicInteger(0)::incrementAndGet) + .asStructType() + .fields()); writeAndValidate(schema); } @Test public void testTimestampWithoutZone() throws IOException { - withSQLConf(ImmutableMap.of(SparkSQLProperties.HANDLE_TIMESTAMP_WITHOUT_TIMEZONE, "true"), () -> { - Schema schema = TypeUtil.assignIncreasingFreshIds(new Schema( - required(0, "id", LongType.get()), - optional(1, "ts_without_zone", Types.TimestampType.withoutZone()))); - - writeAndValidate(schema); - }); + withSQLConf( + ImmutableMap.of(SparkSQLProperties.HANDLE_TIMESTAMP_WITHOUT_TIMEZONE, "true"), 
+ () -> { + Schema schema = + TypeUtil.assignIncreasingFreshIds( + new Schema( + required(0, "id", LongType.get()), + optional(1, "ts_without_zone", Types.TimestampType.withoutZone()))); + + writeAndValidate(schema); + }); } protected void withSQLConf(Map conf, Action action) throws IOException { SQLConf sqlConf = SQLConf.get(); Map currentConfValues = Maps.newHashMap(); - conf.keySet().forEach(confKey -> { - if (sqlConf.contains(confKey)) { - String currentConfValue = sqlConf.getConfString(confKey); - currentConfValues.put(confKey, currentConfValue); - } - }); - - conf.forEach((confKey, confValue) -> { - if (SQLConf.staticConfKeys().contains(confKey)) { - throw new RuntimeException("Cannot modify the value of a static config: " + confKey); - } - sqlConf.setConfString(confKey, confValue); - }); + conf.keySet() + .forEach( + confKey -> { + if (sqlConf.contains(confKey)) { + String currentConfValue = sqlConf.getConfString(confKey); + currentConfValues.put(confKey, currentConfValue); + } + }); + + conf.forEach( + (confKey, confValue) -> { + if (SQLConf.staticConfKeys().contains(confKey)) { + throw new RuntimeException("Cannot modify the value of a static config: " + confKey); + } + sqlConf.setConfString(confKey, confValue); + }); try { action.invoke(); } finally { - conf.forEach((confKey, confValue) -> { - if (currentConfValues.containsKey(confKey)) { - sqlConf.setConfString(confKey, currentConfValues.get(confKey)); - } else { - sqlConf.unsetConf(confKey); - } - }); + conf.forEach( + (confKey, confValue) -> { + if (currentConfValues.containsKey(confKey)) { + sqlConf.setConfString(confKey, currentConfValues.get(confKey)); + } else { + sqlConf.unsetConf(confKey); + } + }); } } diff --git a/spark/v2.4/spark/src/test/java/org/apache/iceberg/spark/data/GenericsHelpers.java b/spark/v2.4/spark/src/test/java/org/apache/iceberg/spark/data/GenericsHelpers.java index 46c95cef112d..a96e3b1f57f5 100644 --- a/spark/v2.4/spark/src/test/java/org/apache/iceberg/spark/data/GenericsHelpers.java +++ b/spark/v2.4/spark/src/test/java/org/apache/iceberg/spark/data/GenericsHelpers.java @@ -16,9 +16,12 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.spark.data; +import static org.apache.iceberg.spark.SparkSchemaUtil.convert; +import static scala.collection.JavaConverters.mapAsJavaMapConverter; +import static scala.collection.JavaConverters.seqAsJavaListConverter; + import java.math.BigDecimal; import java.nio.ByteBuffer; import java.sql.Timestamp; @@ -48,13 +51,8 @@ import org.junit.Assert; import scala.collection.Seq; -import static org.apache.iceberg.spark.SparkSchemaUtil.convert; -import static scala.collection.JavaConverters.mapAsJavaMapConverter; -import static scala.collection.JavaConverters.seqAsJavaListConverter; - public class GenericsHelpers { - private GenericsHelpers() { - } + private GenericsHelpers() {} private static final OffsetDateTime EPOCH = Instant.ofEpochMilli(0L).atOffset(ZoneOffset.UTC); private static final LocalDate EPOCH_DAY = EPOCH.toLocalDate(); @@ -71,7 +69,8 @@ public static void assertEqualsSafe(Types.StructType struct, Record expected, Ro } } - private static void assertEqualsSafe(Types.ListType list, Collection expected, List actual) { + private static void assertEqualsSafe( + Types.ListType list, Collection expected, List actual) { Type elementType = list.elementType(); List expectedElements = Lists.newArrayList(expected); for (int i = 0; i < expectedElements.size(); i += 1) { @@ -82,11 +81,11 @@ private static void assertEqualsSafe(Types.ListType list, Collection expected } } - private static void assertEqualsSafe(Types.MapType map, - Map expected, Map actual) { + private static void assertEqualsSafe(Types.MapType map, Map expected, Map actual) { Type keyType = map.keyType(); Type valueType = map.valueType(); - Assert.assertEquals("Should have the same number of keys", expected.keySet().size(), actual.keySet().size()); + Assert.assertEquals( + "Should have the same number of keys", expected.keySet().size(), actual.keySet().size()); for (Object expectedKey : expected.keySet()) { Object matchingKey = null; @@ -120,22 +119,29 @@ private static void assertEqualsSafe(Type type, Object expected, Object actual) Assert.assertEquals("Primitive value should be equal to expected", expected, actual); break; case DATE: - Assertions.assertThat(expected).as("Should expect a LocalDate").isInstanceOf(LocalDate.class); + Assertions.assertThat(expected) + .as("Should expect a LocalDate") + .isInstanceOf(LocalDate.class); Assertions.assertThat(actual).as("Should be a Date").isInstanceOf(Date.class); - Assert.assertEquals("ISO-8601 date should be equal", expected.toString(), actual.toString()); + Assert.assertEquals( + "ISO-8601 date should be equal", expected.toString(), actual.toString()); break; case TIMESTAMP: Assertions.assertThat(actual).as("Should be a Timestamp").isInstanceOf(Timestamp.class); Timestamp ts = (Timestamp) actual; // milliseconds from nanos has already been added by getTime - OffsetDateTime actualTs = EPOCH.plusNanos( - (ts.getTime() * 1_000_000) + (ts.getNanos() % 1_000_000)); + OffsetDateTime actualTs = + EPOCH.plusNanos((ts.getTime() * 1_000_000) + (ts.getNanos() % 1_000_000)); Types.TimestampType timestampType = (Types.TimestampType) type; if (timestampType.shouldAdjustToUTC()) { - Assertions.assertThat(expected).as("Should expect an OffsetDateTime").isInstanceOf(OffsetDateTime.class); + Assertions.assertThat(expected) + .as("Should expect an OffsetDateTime") + .isInstanceOf(OffsetDateTime.class); Assert.assertEquals("Timestamp should be equal", expected, actualTs); } else { - Assertions.assertThat(expected).as("Should expect an 
LocalDateTime").isInstanceOf(LocalDateTime.class); + Assertions.assertThat(expected) + .as("Should expect an LocalDateTime") + .isInstanceOf(LocalDateTime.class); Assert.assertEquals("Timestamp should be equal", expected, actualTs.toLocalDateTime()); } break; @@ -146,23 +152,25 @@ private static void assertEqualsSafe(Type type, Object expected, Object actual) case UUID: Assertions.assertThat(expected).as("Should expect a UUID").isInstanceOf(UUID.class); Assertions.assertThat(actual).as("Should be a String").isInstanceOf(String.class); - Assert.assertEquals("UUID string representation should match", - expected.toString(), actual); + Assert.assertEquals("UUID string representation should match", expected.toString(), actual); break; case FIXED: Assertions.assertThat(expected).as("Should expect a byte[]").isInstanceOf(byte[].class); Assertions.assertThat(actual).as("Should be a byte[]").isInstanceOf(byte[].class); - Assert.assertArrayEquals("Bytes should match", - (byte[]) expected, (byte[]) actual); + Assert.assertArrayEquals("Bytes should match", (byte[]) expected, (byte[]) actual); break; case BINARY: - Assertions.assertThat(expected).as("Should expect a ByteBuffer").isInstanceOf(ByteBuffer.class); + Assertions.assertThat(expected) + .as("Should expect a ByteBuffer") + .isInstanceOf(ByteBuffer.class); Assertions.assertThat(actual).as("Should be a byte[]").isInstanceOf(byte[].class); - Assert.assertArrayEquals("Bytes should match", - ((ByteBuffer) expected).array(), (byte[]) actual); + Assert.assertArrayEquals( + "Bytes should match", ((ByteBuffer) expected).array(), (byte[]) actual); break; case DECIMAL: - Assertions.assertThat(expected).as("Should expect a BigDecimal").isInstanceOf(BigDecimal.class); + Assertions.assertThat(expected) + .as("Should expect a BigDecimal") + .isInstanceOf(BigDecimal.class); Assertions.assertThat(actual).as("Should be a BigDecimal").isInstanceOf(BigDecimal.class); Assert.assertEquals("BigDecimals should be equal", expected, actual); break; @@ -172,16 +180,20 @@ private static void assertEqualsSafe(Type type, Object expected, Object actual) assertEqualsSafe(type.asNestedType().asStructType(), (Record) expected, (Row) actual); break; case LIST: - Assertions.assertThat(expected).as("Should expect a Collection").isInstanceOf(Collection.class); + Assertions.assertThat(expected) + .as("Should expect a Collection") + .isInstanceOf(Collection.class); Assertions.assertThat(actual).as("Should be a Seq").isInstanceOf(Seq.class); List asList = seqAsJavaListConverter((Seq) actual).asJava(); assertEqualsSafe(type.asNestedType().asListType(), (Collection) expected, asList); break; case MAP: Assertions.assertThat(expected).as("Should expect a Collection").isInstanceOf(Map.class); - Assertions.assertThat(actual).as("Should be a Map").isInstanceOf(scala.collection.Map.class); - Map asMap = mapAsJavaMapConverter( - (scala.collection.Map) actual).asJava(); + Assertions.assertThat(actual) + .as("Should be a Map") + .isInstanceOf(scala.collection.Map.class); + Map asMap = + mapAsJavaMapConverter((scala.collection.Map) actual).asJava(); assertEqualsSafe(type.asNestedType().asMapType(), (Map) expected, asMap); break; case TIME: @@ -190,7 +202,8 @@ private static void assertEqualsSafe(Type type, Object expected, Object actual) } } - public static void assertEqualsUnsafe(Types.StructType struct, Record expected, InternalRow actual) { + public static void assertEqualsUnsafe( + Types.StructType struct, Record expected, InternalRow actual) { List fields = struct.fields(); for (int i = 0; i 
< fields.size(); i += 1) { Type fieldType = fields.get(i).type(); @@ -202,7 +215,8 @@ public static void assertEqualsUnsafe(Types.StructType struct, Record expected, } } - private static void assertEqualsUnsafe(Types.ListType list, Collection expected, ArrayData actual) { + private static void assertEqualsUnsafe( + Types.ListType list, Collection expected, ArrayData actual) { Type elementType = list.elementType(); List expectedElements = Lists.newArrayList(expected); for (int i = 0; i < expectedElements.size(); i += 1) { @@ -245,20 +259,29 @@ private static void assertEqualsUnsafe(Type type, Object expected, Object actual Assert.assertEquals("Primitive value should be equal to expected", expected, actual); break; case DATE: - Assertions.assertThat(expected).as("Should expect a LocalDate").isInstanceOf(LocalDate.class); + Assertions.assertThat(expected) + .as("Should expect a LocalDate") + .isInstanceOf(LocalDate.class); int expectedDays = (int) ChronoUnit.DAYS.between(EPOCH_DAY, (LocalDate) expected); Assert.assertEquals("Primitive value should be equal to expected", expectedDays, actual); break; case TIMESTAMP: Types.TimestampType timestampType = (Types.TimestampType) type; if (timestampType.shouldAdjustToUTC()) { - Assertions.assertThat(expected).as("Should expect an OffsetDateTime").isInstanceOf(OffsetDateTime.class); + Assertions.assertThat(expected) + .as("Should expect an OffsetDateTime") + .isInstanceOf(OffsetDateTime.class); long expectedMicros = ChronoUnit.MICROS.between(EPOCH, (OffsetDateTime) expected); - Assert.assertEquals("Primitive value should be equal to expected", expectedMicros, actual); + Assert.assertEquals( + "Primitive value should be equal to expected", expectedMicros, actual); } else { - Assertions.assertThat(expected).as("Should expect an LocalDateTime").isInstanceOf(LocalDateTime.class); - long expectedMicros = ChronoUnit.MICROS.between(EPOCH, ((LocalDateTime) expected).atZone(ZoneId.of("UTC"))); - Assert.assertEquals("Primitive value should be equal to expected", expectedMicros, actual); + Assertions.assertThat(expected) + .as("Should expect an LocalDateTime") + .isInstanceOf(LocalDateTime.class); + long expectedMicros = + ChronoUnit.MICROS.between(EPOCH, ((LocalDateTime) expected).atZone(ZoneId.of("UTC"))); + Assert.assertEquals( + "Primitive value should be equal to expected", expectedMicros, actual); } break; case STRING: @@ -268,8 +291,8 @@ private static void assertEqualsUnsafe(Type type, Object expected, Object actual case UUID: Assertions.assertThat(expected).as("Should expect a UUID").isInstanceOf(UUID.class); Assertions.assertThat(actual).as("Should be a UTF8String").isInstanceOf(UTF8String.class); - Assert.assertEquals("UUID string representation should match", - expected.toString(), actual.toString()); + Assert.assertEquals( + "UUID string representation should match", expected.toString(), actual.toString()); break; case FIXED: Assertions.assertThat(expected).as("Should expect a byte[]").isInstanceOf(byte[].class); @@ -277,30 +300,42 @@ private static void assertEqualsUnsafe(Type type, Object expected, Object actual Assert.assertArrayEquals("Bytes should match", (byte[]) expected, (byte[]) actual); break; case BINARY: - Assertions.assertThat(expected).as("Should expect a ByteBuffer").isInstanceOf(ByteBuffer.class); + Assertions.assertThat(expected) + .as("Should expect a ByteBuffer") + .isInstanceOf(ByteBuffer.class); Assertions.assertThat(actual).as("Should be a byte[]").isInstanceOf(byte[].class); - Assert.assertArrayEquals("Bytes should match", - 
((ByteBuffer) expected).array(), (byte[]) actual); + Assert.assertArrayEquals( + "Bytes should match", ((ByteBuffer) expected).array(), (byte[]) actual); break; case DECIMAL: - Assertions.assertThat(expected).as("Should expect a BigDecimal").isInstanceOf(BigDecimal.class); + Assertions.assertThat(expected) + .as("Should expect a BigDecimal") + .isInstanceOf(BigDecimal.class); Assertions.assertThat(actual).as("Should be a Decimal").isInstanceOf(Decimal.class); - Assert.assertEquals("BigDecimals should be equal", - expected, ((Decimal) actual).toJavaBigDecimal()); + Assert.assertEquals( + "BigDecimals should be equal", expected, ((Decimal) actual).toJavaBigDecimal()); break; case STRUCT: Assertions.assertThat(expected).as("Should expect a Record").isInstanceOf(Record.class); - Assertions.assertThat(actual).as("Should be an InternalRow").isInstanceOf(InternalRow.class); - assertEqualsUnsafe(type.asNestedType().asStructType(), (Record) expected, (InternalRow) actual); + Assertions.assertThat(actual) + .as("Should be an InternalRow") + .isInstanceOf(InternalRow.class); + assertEqualsUnsafe( + type.asNestedType().asStructType(), (Record) expected, (InternalRow) actual); break; case LIST: - Assertions.assertThat(expected).as("Should expect a Collection").isInstanceOf(Collection.class); + Assertions.assertThat(expected) + .as("Should expect a Collection") + .isInstanceOf(Collection.class); Assertions.assertThat(actual).as("Should be an ArrayData").isInstanceOf(ArrayData.class); - assertEqualsUnsafe(type.asNestedType().asListType(), (Collection) expected, (ArrayData) actual); + assertEqualsUnsafe( + type.asNestedType().asListType(), (Collection) expected, (ArrayData) actual); break; case MAP: Assertions.assertThat(expected).as("Should expect a Map").isInstanceOf(Map.class); - Assertions.assertThat(actual).as("Should be an ArrayBasedMapData").isInstanceOf(MapData.class); + Assertions.assertThat(actual) + .as("Should be an ArrayBasedMapData") + .isInstanceOf(MapData.class); assertEqualsUnsafe(type.asNestedType().asMapType(), (Map) expected, (MapData) actual); break; case TIME: diff --git a/spark/v2.4/spark/src/test/java/org/apache/iceberg/spark/data/RandomData.java b/spark/v2.4/spark/src/test/java/org/apache/iceberg/spark/data/RandomData.java index d3bffb75eb5c..1c95df8ced12 100644 --- a/spark/v2.4/spark/src/test/java/org/apache/iceberg/spark/data/RandomData.java +++ b/spark/v2.4/spark/src/test/java/org/apache/iceberg/spark/data/RandomData.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.spark.data; import java.math.BigDecimal; @@ -53,8 +52,7 @@ public class RandomData { // Default percentage of number of values that are null for optional fields public static final float DEFAULT_NULL_PERCENTAGE = 0.05f; - private RandomData() { - } + private RandomData() {} public static List generateList(Schema schema, int numRecords, long seed) { RandomDataGenerator generator = new RandomDataGenerator(schema, seed, DEFAULT_NULL_PERCENTAGE); @@ -67,63 +65,71 @@ public static List generateList(Schema schema, int numRecords, long seed } public static Iterable generateSpark(Schema schema, int numRecords, long seed) { - return () -> new Iterator() { - private SparkRandomDataGenerator generator = new SparkRandomDataGenerator(seed); - private int count = 0; - - @Override - public boolean hasNext() { - return count < numRecords; - } - - @Override - public InternalRow next() { - if (count >= numRecords) { - throw new NoSuchElementException(); - } - count += 1; - return (InternalRow) TypeUtil.visit(schema, generator); - } - }; + return () -> + new Iterator() { + private SparkRandomDataGenerator generator = new SparkRandomDataGenerator(seed); + private int count = 0; + + @Override + public boolean hasNext() { + return count < numRecords; + } + + @Override + public InternalRow next() { + if (count >= numRecords) { + throw new NoSuchElementException(); + } + count += 1; + return (InternalRow) TypeUtil.visit(schema, generator); + } + }; } public static Iterable generate(Schema schema, int numRecords, long seed) { - return newIterable(() -> new RandomDataGenerator(schema, seed, DEFAULT_NULL_PERCENTAGE), schema, numRecords); + return newIterable( + () -> new RandomDataGenerator(schema, seed, DEFAULT_NULL_PERCENTAGE), schema, numRecords); } - public static Iterable generate(Schema schema, int numRecords, long seed, float nullPercentage) { - return newIterable(() -> new RandomDataGenerator(schema, seed, nullPercentage), schema, numRecords); + public static Iterable generate( + Schema schema, int numRecords, long seed, float nullPercentage) { + return newIterable( + () -> new RandomDataGenerator(schema, seed, nullPercentage), schema, numRecords); } - public static Iterable generateFallbackData(Schema schema, int numRecords, long seed, long numDictRecords) { - return newIterable(() -> new FallbackDataGenerator(schema, seed, numDictRecords), schema, numRecords); + public static Iterable generateFallbackData( + Schema schema, int numRecords, long seed, long numDictRecords) { + return newIterable( + () -> new FallbackDataGenerator(schema, seed, numDictRecords), schema, numRecords); } public static Iterable generateDictionaryEncodableData( Schema schema, int numRecords, long seed, float nullPercentage) { - return newIterable(() -> new DictionaryEncodedDataGenerator(schema, seed, nullPercentage), schema, numRecords); + return newIterable( + () -> new DictionaryEncodedDataGenerator(schema, seed, nullPercentage), schema, numRecords); } - private static Iterable newIterable(Supplier newGenerator, - Schema schema, int numRecords) { - return () -> new Iterator() { - private int count = 0; - private RandomDataGenerator generator = newGenerator.get(); - - @Override - public boolean hasNext() { - return count < numRecords; - } - - @Override - public Record next() { - if (count >= numRecords) { - throw new NoSuchElementException(); - } - count += 1; - return (Record) TypeUtil.visit(schema, generator); - } - }; + private static Iterable newIterable( + Supplier newGenerator, Schema schema, 
int numRecords) { + return () -> + new Iterator() { + private int count = 0; + private RandomDataGenerator generator = newGenerator.get(); + + @Override + public boolean hasNext() { + return count < numRecords; + } + + @Override + public Record next() { + if (count >= numRecords) { + throw new NoSuchElementException(); + } + count += 1; + return (Record) TypeUtil.visit(schema, generator); + } + }; } private static class RandomDataGenerator extends TypeUtil.CustomOrderSchemaVisitor { @@ -218,8 +224,7 @@ public Object primitive(Type.PrimitiveType primitive) { // them here. switch (primitive.typeId()) { case FIXED: - return new GenericData.Fixed(typeToSchema.get(primitive), - (byte[]) result); + return new GenericData.Fixed(typeToSchema.get(primitive), (byte[]) result); case BINARY: return ByteBuffer.wrap((byte[]) result); case UUID: diff --git a/spark/v2.4/spark/src/test/java/org/apache/iceberg/spark/data/TestHelpers.java b/spark/v2.4/spark/src/test/java/org/apache/iceberg/spark/data/TestHelpers.java index 53d5e8763e6f..42f4c1a1ab42 100644 --- a/spark/v2.4/spark/src/test/java/org/apache/iceberg/spark/data/TestHelpers.java +++ b/spark/v2.4/spark/src/test/java/org/apache/iceberg/spark/data/TestHelpers.java @@ -16,9 +16,12 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.data; +import static org.apache.iceberg.spark.SparkSchemaUtil.convert; +import static scala.collection.JavaConverters.mapAsJavaMapConverter; +import static scala.collection.JavaConverters.seqAsJavaListConverter; + import java.math.BigDecimal; import java.nio.ByteBuffer; import java.sql.Timestamp; @@ -63,14 +66,9 @@ import org.junit.Assert; import scala.collection.Seq; -import static org.apache.iceberg.spark.SparkSchemaUtil.convert; -import static scala.collection.JavaConverters.mapAsJavaMapConverter; -import static scala.collection.JavaConverters.seqAsJavaListConverter; - public class TestHelpers { - private TestHelpers() { - } + private TestHelpers() {} public static void assertEqualsSafe(Types.StructType struct, Record rec, Row row) { List fields = struct.fields(); @@ -84,8 +82,11 @@ public static void assertEqualsSafe(Types.StructType struct, Record rec, Row row } } - public static void assertEqualsBatch(Types.StructType struct, Iterator expected, ColumnarBatch batch, - boolean checkArrowValidityVector) { + public static void assertEqualsBatch( + Types.StructType struct, + Iterator expected, + ColumnarBatch batch, + boolean checkArrowValidityVector) { for (int rowId = 0; rowId < batch.numRows(); rowId++) { List fields = struct.fields(); InternalRow row = batch.getRow(rowId); @@ -98,15 +99,16 @@ public static void assertEqualsBatch(Types.StructType struct, Iterator e if (checkArrowValidityVector) { ColumnVector columnVector = batch.column(i); - ValueVector arrowVector = ((IcebergArrowColumnVector) columnVector).vectorAccessor().getVector(); - Assert.assertFalse("Nullability doesn't match of " + columnVector.dataType(), + ValueVector arrowVector = + ((IcebergArrowColumnVector) columnVector).vectorAccessor().getVector(); + Assert.assertFalse( + "Nullability doesn't match of " + columnVector.dataType(), expectedValue == null ^ arrowVector.isNull(rowId)); } } } } - private static void assertEqualsSafe(Types.ListType list, Collection expected, List actual) { Type elementType = list.elementType(); List expectedElements = Lists.newArrayList(expected); @@ -118,8 +120,7 @@ private static void assertEqualsSafe(Types.ListType list, Collection expected } } - private 
static void assertEqualsSafe(Types.MapType map, - Map expected, Map actual) { + private static void assertEqualsSafe(Types.MapType map, Map expected, Map actual) { Type keyType = map.keyType(); Type valueType = map.valueType(); @@ -178,23 +179,28 @@ private static void assertEqualsSafe(Type type, Object expected, Object actual) case UUID: Assertions.assertThat(expected).as("Should expect a UUID").isInstanceOf(UUID.class); Assertions.assertThat(actual).as("Should be a String").isInstanceOf(String.class); - Assert.assertEquals("UUID string representation should match", - expected.toString(), actual); + Assert.assertEquals("UUID string representation should match", expected.toString(), actual); break; case FIXED: - Assertions.assertThat(expected).as("Should expect a Fixed").isInstanceOf(GenericData.Fixed.class); + Assertions.assertThat(expected) + .as("Should expect a Fixed") + .isInstanceOf(GenericData.Fixed.class); Assertions.assertThat(actual).as("Should be a byte[]").isInstanceOf(byte[].class); - Assert.assertArrayEquals("Bytes should match", - ((GenericData.Fixed) expected).bytes(), (byte[]) actual); + Assert.assertArrayEquals( + "Bytes should match", ((GenericData.Fixed) expected).bytes(), (byte[]) actual); break; case BINARY: - Assertions.assertThat(expected).as("Should expect a ByteBuffer").isInstanceOf(ByteBuffer.class); + Assertions.assertThat(expected) + .as("Should expect a ByteBuffer") + .isInstanceOf(ByteBuffer.class); Assertions.assertThat(actual).as("Should be a byte[]").isInstanceOf(byte[].class); - Assert.assertArrayEquals("Bytes should match", - ((ByteBuffer) expected).array(), (byte[]) actual); + Assert.assertArrayEquals( + "Bytes should match", ((ByteBuffer) expected).array(), (byte[]) actual); break; case DECIMAL: - Assertions.assertThat(expected).as("Should expect a BigDecimal").isInstanceOf(BigDecimal.class); + Assertions.assertThat(expected) + .as("Should expect a BigDecimal") + .isInstanceOf(BigDecimal.class); Assertions.assertThat(actual).as("Should be a BigDecimal").isInstanceOf(BigDecimal.class); Assert.assertEquals("BigDecimals should be equal", expected, actual); break; @@ -204,16 +210,20 @@ private static void assertEqualsSafe(Type type, Object expected, Object actual) assertEqualsSafe(type.asNestedType().asStructType(), (Record) expected, (Row) actual); break; case LIST: - Assertions.assertThat(expected).as("Should expect a Collection").isInstanceOf(Collection.class); + Assertions.assertThat(expected) + .as("Should expect a Collection") + .isInstanceOf(Collection.class); Assertions.assertThat(actual).as("Should be a Seq").isInstanceOf(Seq.class); List asList = seqAsJavaListConverter((Seq) actual).asJava(); assertEqualsSafe(type.asNestedType().asListType(), (Collection) expected, asList); break; case MAP: Assertions.assertThat(expected).as("Should expect a Collection").isInstanceOf(Map.class); - Assertions.assertThat(actual).as("Should be a Map").isInstanceOf(scala.collection.Map.class); - Map asMap = mapAsJavaMapConverter( - (scala.collection.Map) actual).asJava(); + Assertions.assertThat(actual) + .as("Should be a Map") + .isInstanceOf(scala.collection.Map.class); + Map asMap = + mapAsJavaMapConverter((scala.collection.Map) actual).asJava(); assertEqualsSafe(type.asNestedType().asMapType(), (Map) expected, asMap); break; case TIME: @@ -234,7 +244,8 @@ public static void assertEqualsUnsafe(Types.StructType struct, Record rec, Inter } } - private static void assertEqualsUnsafe(Types.ListType list, Collection expected, ArrayData actual) { + private static void 
assertEqualsUnsafe( + Types.ListType list, Collection expected, ArrayData actual) { Type elementType = list.elementType(); List expectedElements = Lists.newArrayList(expected); for (int i = 0; i < expectedElements.size(); i += 1) { @@ -280,8 +291,10 @@ private static void assertEqualsUnsafe(Type type, Object expected, Object actual case DOUBLE: Assertions.assertThat(actual).as("Should be a double").isInstanceOf(Double.class); if (expected instanceof Float) { - Assert.assertEquals("Values didn't match", Double.doubleToLongBits(((Number) expected).doubleValue()), - Double.doubleToLongBits((double) actual)); + Assert.assertEquals( + "Values didn't match", + Double.doubleToLongBits(((Number) expected).doubleValue()), + Double.doubleToLongBits((double) actual)); } else { Assert.assertEquals("Primitive value should be equal to expected", expected, actual); } @@ -300,40 +313,54 @@ private static void assertEqualsUnsafe(Type type, Object expected, Object actual case UUID: Assertions.assertThat(expected).as("Should expect a UUID").isInstanceOf(UUID.class); Assertions.assertThat(actual).as("Should be a UTF8String").isInstanceOf(UTF8String.class); - Assert.assertEquals("UUID string representation should match", - expected.toString(), actual.toString()); + Assert.assertEquals( + "UUID string representation should match", expected.toString(), actual.toString()); break; case FIXED: - Assertions.assertThat(expected).as("Should expect a Fixed").isInstanceOf(GenericData.Fixed.class); + Assertions.assertThat(expected) + .as("Should expect a Fixed") + .isInstanceOf(GenericData.Fixed.class); Assertions.assertThat(actual).as("Should be a byte[]").isInstanceOf(byte[].class); - Assert.assertArrayEquals("Bytes should match", - ((GenericData.Fixed) expected).bytes(), (byte[]) actual); + Assert.assertArrayEquals( + "Bytes should match", ((GenericData.Fixed) expected).bytes(), (byte[]) actual); break; case BINARY: - Assertions.assertThat(expected).as("Should expect a ByteBuffer").isInstanceOf(ByteBuffer.class); + Assertions.assertThat(expected) + .as("Should expect a ByteBuffer") + .isInstanceOf(ByteBuffer.class); Assertions.assertThat(actual).as("Should be a byte[]").isInstanceOf(byte[].class); - Assert.assertArrayEquals("Bytes should match", - ((ByteBuffer) expected).array(), (byte[]) actual); + Assert.assertArrayEquals( + "Bytes should match", ((ByteBuffer) expected).array(), (byte[]) actual); break; case DECIMAL: - Assertions.assertThat(expected).as("Should expect a BigDecimal").isInstanceOf(BigDecimal.class); + Assertions.assertThat(expected) + .as("Should expect a BigDecimal") + .isInstanceOf(BigDecimal.class); Assertions.assertThat(actual).as("Should be a Decimal").isInstanceOf(Decimal.class); - Assert.assertEquals("BigDecimals should be equal", - expected, ((Decimal) actual).toJavaBigDecimal()); + Assert.assertEquals( + "BigDecimals should be equal", expected, ((Decimal) actual).toJavaBigDecimal()); break; case STRUCT: Assertions.assertThat(expected).as("Should expect a Record").isInstanceOf(Record.class); - Assertions.assertThat(actual).as("Should be an InternalRow").isInstanceOf(InternalRow.class); - assertEqualsUnsafe(type.asNestedType().asStructType(), (Record) expected, (InternalRow) actual); + Assertions.assertThat(actual) + .as("Should be an InternalRow") + .isInstanceOf(InternalRow.class); + assertEqualsUnsafe( + type.asNestedType().asStructType(), (Record) expected, (InternalRow) actual); break; case LIST: - Assertions.assertThat(expected).as("Should expect a 
Collection").isInstanceOf(Collection.class); + Assertions.assertThat(expected) + .as("Should expect a Collection") + .isInstanceOf(Collection.class); Assertions.assertThat(actual).as("Should be an ArrayData").isInstanceOf(ArrayData.class); - assertEqualsUnsafe(type.asNestedType().asListType(), (Collection) expected, (ArrayData) actual); + assertEqualsUnsafe( + type.asNestedType().asListType(), (Collection) expected, (ArrayData) actual); break; case MAP: Assertions.assertThat(expected).as("Should expect a Map").isInstanceOf(Map.class); - Assertions.assertThat(actual).as("Should be an ArrayBasedMapData").isInstanceOf(MapData.class); + Assertions.assertThat(actual) + .as("Should be an ArrayBasedMapData") + .isInstanceOf(MapData.class); assertEqualsUnsafe(type.asNestedType().asMapType(), (Map) expected, (MapData) actual); break; case TIME: @@ -344,13 +371,14 @@ private static void assertEqualsUnsafe(Type type, Object expected, Object actual /** * Check that the given InternalRow is equivalent to the Row. + * * @param prefix context for error messages * @param type the type of the row * @param expected the expected value of the row * @param actual the actual value of the row */ - public static void assertEquals(String prefix, Types.StructType type, - InternalRow expected, Row actual) { + public static void assertEquals( + String prefix, Types.StructType type, InternalRow expected, Row actual) { if (expected == null || actual == null) { Assert.assertEquals(prefix, expected, actual); } else { @@ -368,30 +396,41 @@ public static void assertEquals(String prefix, Types.StructType type, case DECIMAL: case DATE: case TIMESTAMP: - Assert.assertEquals(prefix + "." + fieldName + " - " + childType, + Assert.assertEquals( + prefix + "." + fieldName + " - " + childType, getValue(expected, c, childType), getPrimitiveValue(actual, c, childType)); break; case UUID: case FIXED: case BINARY: - assertEqualBytes(prefix + "." + fieldName, + assertEqualBytes( + prefix + "." + fieldName, (byte[]) getValue(expected, c, childType), (byte[]) actual.get(c)); break; - case STRUCT: { - Types.StructType st = (Types.StructType) childType; - assertEquals(prefix + "." + fieldName, st, - expected.getStruct(c, st.fields().size()), actual.getStruct(c)); - break; - } + case STRUCT: + { + Types.StructType st = (Types.StructType) childType; + assertEquals( + prefix + "." + fieldName, + st, + expected.getStruct(c, st.fields().size()), + actual.getStruct(c)); + break; + } case LIST: - assertEqualsLists(prefix + "." + fieldName, childType.asListType(), + assertEqualsLists( + prefix + "." + fieldName, + childType.asListType(), expected.getArray(c), toList((Seq) actual.get(c))); break; case MAP: - assertEqualsMaps(prefix + "." + fieldName, childType.asMapType(), expected.getMap(c), + assertEqualsMaps( + prefix + "." 
+ fieldName, + childType.asMapType(), + expected.getMap(c), toJavaMap((scala.collection.Map) actual.getMap(c))); break; default: @@ -401,8 +440,8 @@ public static void assertEquals(String prefix, Types.StructType type, } } - private static void assertEqualsLists(String prefix, Types.ListType type, - ArrayData expected, List actual) { + private static void assertEqualsLists( + String prefix, Types.ListType type, ArrayData expected, List actual) { if (expected == null || actual == null) { Assert.assertEquals(prefix, expected, actual); } else { @@ -419,31 +458,42 @@ private static void assertEqualsLists(String prefix, Types.ListType type, case DECIMAL: case DATE: case TIMESTAMP: - Assert.assertEquals(prefix + ".elem " + e + " - " + childType, + Assert.assertEquals( + prefix + ".elem " + e + " - " + childType, getValue(expected, e, childType), actual.get(e)); break; case UUID: case FIXED: case BINARY: - assertEqualBytes(prefix + ".elem " + e, + assertEqualBytes( + prefix + ".elem " + e, (byte[]) getValue(expected, e, childType), (byte[]) actual.get(e)); break; - case STRUCT: { - Types.StructType st = (Types.StructType) childType; - assertEquals(prefix + ".elem " + e, st, - expected.getStruct(e, st.fields().size()), (Row) actual.get(e)); - break; - } + case STRUCT: + { + Types.StructType st = (Types.StructType) childType; + assertEquals( + prefix + ".elem " + e, + st, + expected.getStruct(e, st.fields().size()), + (Row) actual.get(e)); + break; + } case LIST: - assertEqualsLists(prefix + ".elem " + e, childType.asListType(), + assertEqualsLists( + prefix + ".elem " + e, + childType.asListType(), expected.getArray(e), toList((Seq) actual.get(e))); break; case MAP: - assertEqualsMaps(prefix + ".elem " + e, childType.asMapType(), - expected.getMap(e), toJavaMap((scala.collection.Map) actual.get(e))); + assertEqualsMaps( + prefix + ".elem " + e, + childType.asMapType(), + expected.getMap(e), + toJavaMap((scala.collection.Map) actual.get(e))); break; default: throw new IllegalArgumentException("Unhandled type " + childType); @@ -452,8 +502,8 @@ private static void assertEqualsLists(String prefix, Types.ListType type, } } - private static void assertEqualsMaps(String prefix, Types.MapType type, - MapData expected, Map actual) { + private static void assertEqualsMaps( + String prefix, Types.MapType type, MapData expected, Map actual) { if (expected == null || actual == null) { Assert.assertEquals(prefix, expected, actual); } else { @@ -466,7 +516,9 @@ private static void assertEqualsMaps(String prefix, Types.MapType type, Object expectedKey = getValue(expectedKeyArray, e, keyType); Object actualValue = actual.get(expectedKey); if (actualValue == null) { - Assert.assertEquals(prefix + ".key=" + expectedKey + " has null", true, + Assert.assertEquals( + prefix + ".key=" + expectedKey + " has null", + true, expected.valueArray().isNullAt(e)); } else { switch (valueType.typeId()) { @@ -479,32 +531,40 @@ private static void assertEqualsMaps(String prefix, Types.MapType type, case DECIMAL: case DATE: case TIMESTAMP: - Assert.assertEquals(prefix + ".key=" + expectedKey + " - " + valueType, + Assert.assertEquals( + prefix + ".key=" + expectedKey + " - " + valueType, getValue(expectedValueArray, e, valueType), actual.get(expectedKey)); break; case UUID: case FIXED: case BINARY: - assertEqualBytes(prefix + ".key=" + expectedKey, + assertEqualBytes( + prefix + ".key=" + expectedKey, (byte[]) getValue(expectedValueArray, e, valueType), (byte[]) actual.get(expectedKey)); break; - case STRUCT: { - Types.StructType 
st = (Types.StructType) valueType; - assertEquals(prefix + ".key=" + expectedKey, st, - expectedValueArray.getStruct(e, st.fields().size()), - (Row) actual.get(expectedKey)); - break; - } + case STRUCT: + { + Types.StructType st = (Types.StructType) valueType; + assertEquals( + prefix + ".key=" + expectedKey, + st, + expectedValueArray.getStruct(e, st.fields().size()), + (Row) actual.get(expectedKey)); + break; + } case LIST: - assertEqualsLists(prefix + ".key=" + expectedKey, + assertEqualsLists( + prefix + ".key=" + expectedKey, valueType.asListType(), expectedValueArray.getArray(e), toList((Seq) actual.get(expectedKey))); break; case MAP: - assertEqualsMaps(prefix + ".key=" + expectedKey, valueType.asMapType(), + assertEqualsMaps( + prefix + ".key=" + expectedKey, + valueType.asMapType(), expectedValueArray.getMap(e), toJavaMap((scala.collection.Map) actual.get(expectedKey))); break; @@ -516,8 +576,7 @@ private static void assertEqualsMaps(String prefix, Types.MapType type, } } - private static Object getValue(SpecializedGetters container, int ord, - Type type) { + private static Object getValue(SpecializedGetters container, int ord, Type type) { if (container.isNullAt(ord)) { return null; } @@ -542,10 +601,11 @@ private static Object getValue(SpecializedGetters container, int ord, return new DateWritable(container.getInt(ord)).get(); case TIMESTAMP: return DateTimeUtils.toJavaTimestamp(container.getLong(ord)); - case DECIMAL: { - Types.DecimalType dt = (Types.DecimalType) type; - return container.getDecimal(ord, dt.precision(), dt.scale()).toJavaBigDecimal(); - } + case DECIMAL: + { + Types.DecimalType dt = (Types.DecimalType) type; + return container.getDecimal(ord, dt.precision(), dt.scale()).toJavaBigDecimal(); + } case STRUCT: Types.StructType struct = type.asStructType(); InternalRow internalRow = container.getStruct(ord, struct.fields().size()); @@ -603,8 +663,7 @@ private static List toList(Seq val) { return val == null ? 
null : seqAsJavaListConverter(val).asJava(); } - private static void assertEqualBytes(String context, byte[] expected, - byte[] actual) { + private static void assertEqualBytes(String context, byte[] expected, byte[] actual) { if (expected == null || actual == null) { Assert.assertEquals(context, expected, actual); } else { @@ -622,23 +681,29 @@ private static void assertEquals(String context, DataType type, Object expected, } if (type instanceof StructType) { - Assertions.assertThat(expected).as("Expected should be an InternalRow: " + context) + Assertions.assertThat(expected) + .as("Expected should be an InternalRow: " + context) .isInstanceOf(InternalRow.class); - Assertions.assertThat(actual).as("Actual should be an InternalRow: " + context) + Assertions.assertThat(actual) + .as("Actual should be an InternalRow: " + context) .isInstanceOf(InternalRow.class); assertEquals(context, (StructType) type, (InternalRow) expected, (InternalRow) actual); } else if (type instanceof ArrayType) { - Assertions.assertThat(expected).as("Expected should be an ArrayData: " + context) + Assertions.assertThat(expected) + .as("Expected should be an ArrayData: " + context) .isInstanceOf(ArrayData.class); - Assertions.assertThat(actual).as("Actual should be an ArrayData: " + context) + Assertions.assertThat(actual) + .as("Actual should be an ArrayData: " + context) .isInstanceOf(ArrayData.class); assertEquals(context, (ArrayType) type, (ArrayData) expected, (ArrayData) actual); } else if (type instanceof MapType) { - Assertions.assertThat(expected).as("Expected should be a MapData: " + context) + Assertions.assertThat(expected) + .as("Expected should be a MapData: " + context) .isInstanceOf(MapData.class); - Assertions.assertThat(actual).as("Actual should be a MapData: " + context) + Assertions.assertThat(actual) + .as("Actual should be a MapData: " + context) .isInstanceOf(MapData.class); assertEquals(context, (MapType) type, (MapData) expected, (MapData) actual); @@ -649,32 +714,37 @@ private static void assertEquals(String context, DataType type, Object expected, } } - private static void assertEquals(String context, StructType struct, - InternalRow expected, InternalRow actual) { + private static void assertEquals( + String context, StructType struct, InternalRow expected, InternalRow actual) { Assert.assertEquals("Should have correct number of fields", struct.size(), actual.numFields()); for (int i = 0; i < actual.numFields(); i += 1) { StructField field = struct.fields()[i]; DataType type = field.dataType(); - assertEquals(context + "." + field.name(), type, + assertEquals( + context + "." + field.name(), + type, expected.isNullAt(i) ? null : expected.get(i, type), actual.isNullAt(i) ? null : actual.get(i, type)); } } - private static void assertEquals(String context, ArrayType array, ArrayData expected, ArrayData actual) { - Assert.assertEquals("Should have the same number of elements", - expected.numElements(), actual.numElements()); + private static void assertEquals( + String context, ArrayType array, ArrayData expected, ArrayData actual) { + Assert.assertEquals( + "Should have the same number of elements", expected.numElements(), actual.numElements()); DataType type = array.elementType(); for (int i = 0; i < actual.numElements(); i += 1) { - assertEquals(context + ".element", type, + assertEquals( + context + ".element", + type, expected.isNullAt(i) ? null : expected.get(i, type), actual.isNullAt(i) ? 
null : actual.get(i, type)); } } private static void assertEquals(String context, MapType map, MapData expected, MapData actual) { - Assert.assertEquals("Should have the same number of elements", - expected.numElements(), actual.numElements()); + Assert.assertEquals( + "Should have the same number of elements", expected.numElements(), actual.numElements()); DataType keyType = map.keyType(); ArrayData expectedKeys = expected.keyArray(); @@ -685,10 +755,14 @@ private static void assertEquals(String context, MapType map, MapData expected, ArrayData actualValues = actual.valueArray(); for (int i = 0; i < actual.numElements(); i += 1) { - assertEquals(context + ".key", keyType, + assertEquals( + context + ".key", + keyType, expectedKeys.isNullAt(i) ? null : expectedKeys.get(i, keyType), actualKeys.isNullAt(i) ? null : actualKeys.get(i, keyType)); - assertEquals(context + ".value", valueType, + assertEquals( + context + ".value", + valueType, expectedValues.isNullAt(i) ? null : expectedValues.get(i, valueType), actualValues.isNullAt(i) ? null : actualValues.get(i, valueType)); } diff --git a/spark/v2.4/spark/src/test/java/org/apache/iceberg/spark/data/TestOrcWrite.java b/spark/v2.4/spark/src/test/java/org/apache/iceberg/spark/data/TestOrcWrite.java index 7cf9b9c736c6..1e51a088390e 100644 --- a/spark/v2.4/spark/src/test/java/org/apache/iceberg/spark/data/TestOrcWrite.java +++ b/spark/v2.4/spark/src/test/java/org/apache/iceberg/spark/data/TestOrcWrite.java @@ -16,9 +16,10 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.data; +import static org.apache.iceberg.types.Types.NestedField.optional; + import java.io.File; import java.io.IOException; import org.apache.iceberg.Files; @@ -32,16 +33,12 @@ import org.junit.Test; import org.junit.rules.TemporaryFolder; -import static org.apache.iceberg.types.Types.NestedField.optional; - public class TestOrcWrite { - @Rule - public TemporaryFolder temp = new TemporaryFolder(); + @Rule public TemporaryFolder temp = new TemporaryFolder(); - private static final Schema SCHEMA = new Schema( - optional(1, "id", Types.IntegerType.get()), - optional(2, "data", Types.StringType.get()) - ); + private static final Schema SCHEMA = + new Schema( + optional(1, "id", Types.IntegerType.get()), optional(2, "data", Types.StringType.get())); @Test public void splitOffsets() throws IOException { @@ -49,10 +46,11 @@ public void splitOffsets() throws IOException { Assert.assertTrue("Delete should succeed", testFile.delete()); Iterable rows = RandomData.generateSpark(SCHEMA, 1, 0L); - FileAppender writer = ORC.write(Files.localOutput(testFile)) - .createWriterFunc(SparkOrcWriter::new) - .schema(SCHEMA) - .build(); + FileAppender writer = + ORC.write(Files.localOutput(testFile)) + .createWriterFunc(SparkOrcWriter::new) + .schema(SCHEMA) + .build(); writer.addAll(rows); writer.close(); diff --git a/spark/v2.4/spark/src/test/java/org/apache/iceberg/spark/data/TestParquetAvroReader.java b/spark/v2.4/spark/src/test/java/org/apache/iceberg/spark/data/TestParquetAvroReader.java index 464e3165583c..a4ffc2fea437 100644 --- a/spark/v2.4/spark/src/test/java/org/apache/iceberg/spark/data/TestParquetAvroReader.java +++ b/spark/v2.4/spark/src/test/java/org/apache/iceberg/spark/data/TestParquetAvroReader.java @@ -16,9 +16,11 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.spark.data; +import static org.apache.iceberg.types.Types.NestedField.optional; +import static org.apache.iceberg.types.Types.NestedField.required; + import java.io.File; import java.io.IOException; import java.util.Iterator; @@ -38,54 +40,68 @@ import org.junit.Test; import org.junit.rules.TemporaryFolder; -import static org.apache.iceberg.types.Types.NestedField.optional; -import static org.apache.iceberg.types.Types.NestedField.required; - public class TestParquetAvroReader { - @Rule - public TemporaryFolder temp = new TemporaryFolder(); - - private static final Schema COMPLEX_SCHEMA = new Schema( - required(1, "roots", Types.LongType.get()), - optional(3, "lime", Types.ListType.ofRequired(4, Types.DoubleType.get())), - required(5, "strict", Types.StructType.of( - required(9, "tangerine", Types.StringType.get()), - optional(6, "hopeful", Types.StructType.of( - required(7, "steel", Types.FloatType.get()), - required(8, "lantern", Types.DateType.get()) - )), - optional(10, "vehement", Types.LongType.get()) - )), - optional(11, "metamorphosis", Types.MapType.ofRequired(12, 13, - Types.StringType.get(), Types.TimestampType.withoutZone())), - required(14, "winter", Types.ListType.ofOptional(15, Types.StructType.of( - optional(16, "beet", Types.DoubleType.get()), - required(17, "stamp", Types.TimeType.get()), - optional(18, "wheeze", Types.StringType.get()) - ))), - optional(19, "renovate", Types.MapType.ofRequired(20, 21, - Types.StringType.get(), Types.StructType.of( - optional(22, "jumpy", Types.DoubleType.get()), - required(23, "koala", Types.TimeType.get()), - required(24, "couch rope", Types.IntegerType.get()) - ))), - optional(2, "slide", Types.StringType.get()) - ); + @Rule public TemporaryFolder temp = new TemporaryFolder(); + + private static final Schema COMPLEX_SCHEMA = + new Schema( + required(1, "roots", Types.LongType.get()), + optional(3, "lime", Types.ListType.ofRequired(4, Types.DoubleType.get())), + required( + 5, + "strict", + Types.StructType.of( + required(9, "tangerine", Types.StringType.get()), + optional( + 6, + "hopeful", + Types.StructType.of( + required(7, "steel", Types.FloatType.get()), + required(8, "lantern", Types.DateType.get()))), + optional(10, "vehement", Types.LongType.get()))), + optional( + 11, + "metamorphosis", + Types.MapType.ofRequired( + 12, 13, Types.StringType.get(), Types.TimestampType.withoutZone())), + required( + 14, + "winter", + Types.ListType.ofOptional( + 15, + Types.StructType.of( + optional(16, "beet", Types.DoubleType.get()), + required(17, "stamp", Types.TimeType.get()), + optional(18, "wheeze", Types.StringType.get())))), + optional( + 19, + "renovate", + Types.MapType.ofRequired( + 20, + 21, + Types.StringType.get(), + Types.StructType.of( + optional(22, "jumpy", Types.DoubleType.get()), + required(23, "koala", Types.TimeType.get()), + required(24, "couch rope", Types.IntegerType.get())))), + optional(2, "slide", Types.StringType.get())); @Ignore public void testStructSchema() throws IOException { - Schema structSchema = new Schema( - required(1, "circumvent", Types.LongType.get()), - optional(2, "antarctica", Types.StringType.get()), - optional(3, "fluent", Types.DoubleType.get()), - required(4, "quell", Types.StructType.of( - required(5, "operator", Types.BooleanType.get()), - optional(6, "fanta", Types.IntegerType.get()), - optional(7, "cable", Types.FloatType.get()) - )), - required(8, "chimney", Types.TimestampType.withZone()), - required(9, "wool", Types.DateType.get()) - ); + Schema 
structSchema = + new Schema( + required(1, "circumvent", Types.LongType.get()), + optional(2, "antarctica", Types.StringType.get()), + optional(3, "fluent", Types.DoubleType.get()), + required( + 4, + "quell", + Types.StructType.of( + required(5, "operator", Types.BooleanType.get()), + optional(6, "fanta", Types.IntegerType.get()), + optional(7, "cable", Types.FloatType.get()))), + required(8, "chimney", Types.TimestampType.withZone()), + required(9, "wool", Types.DateType.get())); File testFile = writeTestData(structSchema, 5_000_000, 1059); // RandomData uses the root record name "test", which must match for records to be equal @@ -100,11 +116,12 @@ public void testStructSchema() throws IOException { // clean up as much memory as possible to avoid a large GC during the timed run System.gc(); - try (CloseableIterable reader = Parquet.read(Files.localInput(testFile)) - .project(structSchema) - .createReaderFunc( - fileSchema -> ParquetAvroValueReaders.buildReader(structSchema, readSchema)) - .build()) { + try (CloseableIterable reader = + Parquet.read(Files.localInput(testFile)) + .project(structSchema) + .createReaderFunc( + fileSchema -> ParquetAvroValueReaders.buildReader(structSchema, readSchema)) + .build()) { long start = System.currentTimeMillis(); long val = 0; long count = 0; @@ -137,9 +154,8 @@ public void testWithOldReadPath() throws IOException { // clean up as much memory as possible to avoid a large GC during the timed run System.gc(); - try (CloseableIterable reader = Parquet.read(Files.localInput(testFile)) - .project(COMPLEX_SCHEMA) - .build()) { + try (CloseableIterable reader = + Parquet.read(Files.localInput(testFile)).project(COMPLEX_SCHEMA).build()) { long start = System.currentTimeMillis(); long val = 0; long count = 0; @@ -154,11 +170,12 @@ public void testWithOldReadPath() throws IOException { // clean up as much memory as possible to avoid a large GC during the timed run System.gc(); - try (CloseableIterable reader = Parquet.read(Files.localInput(testFile)) - .project(COMPLEX_SCHEMA) - .createReaderFunc( - fileSchema -> ParquetAvroValueReaders.buildReader(COMPLEX_SCHEMA, readSchema)) - .build()) { + try (CloseableIterable reader = + Parquet.read(Files.localInput(testFile)) + .project(COMPLEX_SCHEMA) + .createReaderFunc( + fileSchema -> ParquetAvroValueReaders.buildReader(COMPLEX_SCHEMA, readSchema)) + .build()) { long start = System.currentTimeMillis(); long val = 0; long count = 0; @@ -179,9 +196,8 @@ public void testCorrectness() throws IOException { File testFile = temp.newFile(); Assert.assertTrue("Delete should succeed", testFile.delete()); - try (FileAppender writer = Parquet.write(Files.localOutput(testFile)) - .schema(COMPLEX_SCHEMA) - .build()) { + try (FileAppender writer = + Parquet.write(Files.localOutput(testFile)).schema(COMPLEX_SCHEMA).build()) { writer.addAll(records); } @@ -189,12 +205,13 @@ public void testCorrectness() throws IOException { MessageType readSchema = ParquetSchemaUtil.convert(COMPLEX_SCHEMA, "test"); // verify that the new read path is correct - try (CloseableIterable reader = Parquet.read(Files.localInput(testFile)) - .project(COMPLEX_SCHEMA) - .createReaderFunc( - fileSchema -> ParquetAvroValueReaders.buildReader(COMPLEX_SCHEMA, readSchema)) - .reuseContainers() - .build()) { + try (CloseableIterable reader = + Parquet.read(Files.localInput(testFile)) + .project(COMPLEX_SCHEMA) + .createReaderFunc( + fileSchema -> ParquetAvroValueReaders.buildReader(COMPLEX_SCHEMA, readSchema)) + .reuseContainers() + .build()) { int recordNum = 
0; Iterator iter = records.iterator(); for (Record actual : reader) { @@ -209,9 +226,8 @@ private File writeTestData(Schema schema, int numRecords, int seed) throws IOExc File testFile = temp.newFile(); Assert.assertTrue("Delete should succeed", testFile.delete()); - try (FileAppender writer = Parquet.write(Files.localOutput(testFile)) - .schema(schema) - .build()) { + try (FileAppender writer = + Parquet.write(Files.localOutput(testFile)).schema(schema).build()) { writer.addAll(RandomData.generate(schema, numRecords, seed)); } diff --git a/spark/v2.4/spark/src/test/java/org/apache/iceberg/spark/data/TestParquetAvroWriter.java b/spark/v2.4/spark/src/test/java/org/apache/iceberg/spark/data/TestParquetAvroWriter.java index dcfc873a5a67..15c6268da478 100644 --- a/spark/v2.4/spark/src/test/java/org/apache/iceberg/spark/data/TestParquetAvroWriter.java +++ b/spark/v2.4/spark/src/test/java/org/apache/iceberg/spark/data/TestParquetAvroWriter.java @@ -16,9 +16,11 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.data; +import static org.apache.iceberg.types.Types.NestedField.optional; +import static org.apache.iceberg.types.Types.NestedField.required; + import java.io.File; import java.io.IOException; import java.util.Iterator; @@ -38,39 +40,51 @@ import org.junit.Test; import org.junit.rules.TemporaryFolder; -import static org.apache.iceberg.types.Types.NestedField.optional; -import static org.apache.iceberg.types.Types.NestedField.required; - public class TestParquetAvroWriter { - @Rule - public TemporaryFolder temp = new TemporaryFolder(); + @Rule public TemporaryFolder temp = new TemporaryFolder(); - private static final Schema COMPLEX_SCHEMA = new Schema( - required(1, "roots", Types.LongType.get()), - optional(3, "lime", Types.ListType.ofRequired(4, Types.DoubleType.get())), - required(5, "strict", Types.StructType.of( - required(9, "tangerine", Types.StringType.get()), - optional(6, "hopeful", Types.StructType.of( - required(7, "steel", Types.FloatType.get()), - required(8, "lantern", Types.DateType.get()) - )), - optional(10, "vehement", Types.LongType.get()) - )), - optional(11, "metamorphosis", Types.MapType.ofRequired(12, 13, - Types.StringType.get(), Types.TimestampType.withoutZone())), - required(14, "winter", Types.ListType.ofOptional(15, Types.StructType.of( - optional(16, "beet", Types.DoubleType.get()), - required(17, "stamp", Types.TimeType.get()), - optional(18, "wheeze", Types.StringType.get()) - ))), - optional(19, "renovate", Types.MapType.ofRequired(20, 21, - Types.StringType.get(), Types.StructType.of( - optional(22, "jumpy", Types.DoubleType.get()), - required(23, "koala", Types.TimeType.get()), - required(24, "couch rope", Types.IntegerType.get()) - ))), - optional(2, "slide", Types.StringType.get()) - ); + private static final Schema COMPLEX_SCHEMA = + new Schema( + required(1, "roots", Types.LongType.get()), + optional(3, "lime", Types.ListType.ofRequired(4, Types.DoubleType.get())), + required( + 5, + "strict", + Types.StructType.of( + required(9, "tangerine", Types.StringType.get()), + optional( + 6, + "hopeful", + Types.StructType.of( + required(7, "steel", Types.FloatType.get()), + required(8, "lantern", Types.DateType.get()))), + optional(10, "vehement", Types.LongType.get()))), + optional( + 11, + "metamorphosis", + Types.MapType.ofRequired( + 12, 13, Types.StringType.get(), Types.TimestampType.withoutZone())), + required( + 14, + "winter", + Types.ListType.ofOptional( + 15, + 
Types.StructType.of( + optional(16, "beet", Types.DoubleType.get()), + required(17, "stamp", Types.TimeType.get()), + optional(18, "wheeze", Types.StringType.get())))), + optional( + 19, + "renovate", + Types.MapType.ofRequired( + 20, + 21, + Types.StringType.get(), + Types.StructType.of( + optional(22, "jumpy", Types.DoubleType.get()), + required(23, "koala", Types.TimeType.get()), + required(24, "couch rope", Types.IntegerType.get())))), + optional(2, "slide", Types.StringType.get())); @Test public void testCorrectness() throws IOException { @@ -79,10 +93,11 @@ public void testCorrectness() throws IOException { File testFile = temp.newFile(); Assert.assertTrue("Delete should succeed", testFile.delete()); - try (FileAppender writer = Parquet.write(Files.localOutput(testFile)) - .schema(COMPLEX_SCHEMA) - .createWriterFunc(ParquetAvroWriter::buildWriter) - .build()) { + try (FileAppender writer = + Parquet.write(Files.localOutput(testFile)) + .schema(COMPLEX_SCHEMA) + .createWriterFunc(ParquetAvroWriter::buildWriter) + .build()) { writer.addAll(records); } @@ -90,11 +105,12 @@ public void testCorrectness() throws IOException { MessageType readSchema = ParquetSchemaUtil.convert(COMPLEX_SCHEMA, "test"); // verify that the new read path is correct - try (CloseableIterable reader = Parquet.read(Files.localInput(testFile)) - .project(COMPLEX_SCHEMA) - .createReaderFunc( - fileSchema -> ParquetAvroValueReaders.buildReader(COMPLEX_SCHEMA, readSchema)) - .build()) { + try (CloseableIterable reader = + Parquet.read(Files.localInput(testFile)) + .project(COMPLEX_SCHEMA) + .createReaderFunc( + fileSchema -> ParquetAvroValueReaders.buildReader(COMPLEX_SCHEMA, readSchema)) + .build()) { int recordNum = 0; Iterator iter = records.iterator(); for (Record actual : reader) { diff --git a/spark/v2.4/spark/src/test/java/org/apache/iceberg/spark/data/TestSparkAvroEnums.java b/spark/v2.4/spark/src/test/java/org/apache/iceberg/spark/data/TestSparkAvroEnums.java index 3517c32ffebb..6f05a9ed7c1f 100644 --- a/spark/v2.4/spark/src/test/java/org/apache/iceberg/spark/data/TestSparkAvroEnums.java +++ b/spark/v2.4/spark/src/test/java/org/apache/iceberg/spark/data/TestSparkAvroEnums.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.spark.data; import java.io.File; @@ -42,20 +41,20 @@ public class TestSparkAvroEnums { - @Rule - public TemporaryFolder temp = new TemporaryFolder(); + @Rule public TemporaryFolder temp = new TemporaryFolder(); @Test public void writeAndValidateEnums() throws IOException { - org.apache.avro.Schema avroSchema = SchemaBuilder.record("root") - .fields() - .name("enumCol") - .type() - .nullable() - .enumeration("testEnum") - .symbols("SYMB1", "SYMB2") - .enumDefault("SYMB2") - .endRecord(); + org.apache.avro.Schema avroSchema = + SchemaBuilder.record("root") + .fields() + .name("enumCol") + .type() + .nullable() + .enumeration("testEnum") + .symbols("SYMB1", "SYMB2") + .enumDefault("SYMB2") + .endRecord(); org.apache.avro.Schema enumSchema = avroSchema.getField("enumCol").schema().getTypes().get(0); Record enumRecord1 = new GenericData.Record(avroSchema); @@ -77,10 +76,11 @@ public void writeAndValidateEnums() throws IOException { Schema schema = new Schema(AvroSchemaUtil.convert(avroSchema).asStructType().fields()); List rows; - try (AvroIterable reader = Avro.read(Files.localInput(testFile)) - .createReaderFunc(SparkAvroReader::new) - .project(schema) - .build()) { + try (AvroIterable reader = + Avro.read(Files.localInput(testFile)) + .createReaderFunc(SparkAvroReader::new) + .project(schema) + .build()) { rows = Lists.newArrayList(reader); } @@ -88,7 +88,8 @@ public void writeAndValidateEnums() throws IOException { for (int i = 0; i < expected.size(); i += 1) { String expectedEnumString = expected.get(i).get("enumCol") == null ? null : expected.get(i).get("enumCol").toString(); - String sparkString = rows.get(i).getUTF8String(0) == null ? null : rows.get(i).getUTF8String(0).toString(); + String sparkString = + rows.get(i).getUTF8String(0) == null ? null : rows.get(i).getUTF8String(0).toString(); Assert.assertEquals(expectedEnumString, sparkString); } } diff --git a/spark/v2.4/spark/src/test/java/org/apache/iceberg/spark/data/TestSparkAvroReader.java b/spark/v2.4/spark/src/test/java/org/apache/iceberg/spark/data/TestSparkAvroReader.java index e4398df39cc8..6d1ef3db3657 100644 --- a/spark/v2.4/spark/src/test/java/org/apache/iceberg/spark/data/TestSparkAvroReader.java +++ b/spark/v2.4/spark/src/test/java/org/apache/iceberg/spark/data/TestSparkAvroReader.java @@ -16,9 +16,10 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.spark.data; +import static org.apache.iceberg.spark.data.TestHelpers.assertEqualsUnsafe; + import java.io.File; import java.io.IOException; import java.util.List; @@ -32,8 +33,6 @@ import org.apache.spark.sql.catalyst.InternalRow; import org.junit.Assert; -import static org.apache.iceberg.spark.data.TestHelpers.assertEqualsUnsafe; - public class TestSparkAvroReader extends AvroDataTest { @Override protected void writeAndValidate(Schema schema) throws IOException { @@ -42,20 +41,19 @@ protected void writeAndValidate(Schema schema) throws IOException { File testFile = temp.newFile(); Assert.assertTrue("Delete should succeed", testFile.delete()); - try (FileAppender writer = Avro.write(Files.localOutput(testFile)) - .schema(schema) - .named("test") - .build()) { + try (FileAppender writer = + Avro.write(Files.localOutput(testFile)).schema(schema).named("test").build()) { for (Record rec : expected) { writer.add(rec); } } List rows; - try (AvroIterable reader = Avro.read(Files.localInput(testFile)) - .createReaderFunc(SparkAvroReader::new) - .project(schema) - .build()) { + try (AvroIterable reader = + Avro.read(Files.localInput(testFile)) + .createReaderFunc(SparkAvroReader::new) + .project(schema) + .build()) { rows = Lists.newArrayList(reader); } diff --git a/spark/v2.4/spark/src/test/java/org/apache/iceberg/spark/data/TestSparkDateTimes.java b/spark/v2.4/spark/src/test/java/org/apache/iceberg/spark/data/TestSparkDateTimes.java index bfbb8f52202c..56f3cf3c5d8b 100644 --- a/spark/v2.4/spark/src/test/java/org/apache/iceberg/spark/data/TestSparkDateTimes.java +++ b/spark/v2.4/spark/src/test/java/org/apache/iceberg/spark/data/TestSparkDateTimes.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.data; import java.util.TimeZone; @@ -65,7 +64,7 @@ public void testSparkTimestamp() { public void checkSparkTimestamp(String timestampString, String sparkRepr) { Literal ts = Literal.of(timestampString).to(Types.TimestampType.withZone()); String sparkTimestamp = DateTimeUtils.timestampToString(ts.value()); - Assert.assertEquals("Should be the same timestamp (" + ts.value() + ")", - sparkRepr, sparkTimestamp); + Assert.assertEquals( + "Should be the same timestamp (" + ts.value() + ")", sparkRepr, sparkTimestamp); } } diff --git a/spark/v2.4/spark/src/test/java/org/apache/iceberg/spark/data/TestSparkOrcReadMetadataColumns.java b/spark/v2.4/spark/src/test/java/org/apache/iceberg/spark/data/TestSparkOrcReadMetadataColumns.java index b8ee56370edf..3c9037adc393 100644 --- a/spark/v2.4/spark/src/test/java/org/apache/iceberg/spark/data/TestSparkOrcReadMetadataColumns.java +++ b/spark/v2.4/spark/src/test/java/org/apache/iceberg/spark/data/TestSparkOrcReadMetadataColumns.java @@ -16,9 +16,10 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.spark.data; +import static org.apache.iceberg.types.Types.NestedField.required; + import java.io.File; import java.io.IOException; import java.util.Iterator; @@ -57,21 +58,18 @@ import org.junit.runner.RunWith; import org.junit.runners.Parameterized; -import static org.apache.iceberg.types.Types.NestedField.required; - @RunWith(Parameterized.class) public class TestSparkOrcReadMetadataColumns { - private static final Schema DATA_SCHEMA = new Schema( - required(100, "id", Types.LongType.get()), - required(101, "data", Types.StringType.get()) - ); - - private static final Schema PROJECTION_SCHEMA = new Schema( - required(100, "id", Types.LongType.get()), - required(101, "data", Types.StringType.get()), - MetadataColumns.ROW_POSITION, - MetadataColumns.IS_DELETED - ); + private static final Schema DATA_SCHEMA = + new Schema( + required(100, "id", Types.LongType.get()), required(101, "data", Types.StringType.get())); + + private static final Schema PROJECTION_SCHEMA = + new Schema( + required(100, "id", Types.LongType.get()), + required(101, "data", Types.StringType.get()), + MetadataColumns.ROW_POSITION, + MetadataColumns.IS_DELETED); private static final int NUM_ROWS = 1000; private static final List DATA_ROWS; @@ -99,11 +97,10 @@ public class TestSparkOrcReadMetadataColumns { @Parameterized.Parameters(name = "vectorized = {0}") public static Object[] parameters() { - return new Object[] { false, true }; + return new Object[] {false, true}; } - @Rule - public TemporaryFolder temp = new TemporaryFolder(); + @Rule public TemporaryFolder temp = new TemporaryFolder(); private boolean vectorized; private File testFile; @@ -117,14 +114,15 @@ public void writeFile() throws IOException { testFile = temp.newFile(); Assert.assertTrue("Delete should succeed", testFile.delete()); - try (FileAppender writer = ORC.write(Files.localOutput(testFile)) - .createWriterFunc(SparkOrcWriter::new) - .schema(DATA_SCHEMA) - // write in such a way that the file contains 10 stripes each with 100 rows - .set("iceberg.orc.vectorbatch.size", "100") - .set(OrcConf.ROWS_BETWEEN_CHECKS.getAttribute(), "100") - .set(OrcConf.STRIPE_SIZE.getAttribute(), "1") - .build()) { + try (FileAppender writer = + ORC.write(Files.localOutput(testFile)) + .createWriterFunc(SparkOrcWriter::new) + .schema(DATA_SCHEMA) + // write in such a way that the file contains 10 stripes each with 100 rows + .set("iceberg.orc.vectorbatch.size", "100") + .set(OrcConf.ROWS_BETWEEN_CHECKS.getAttribute(), "100") + .set(OrcConf.STRIPE_SIZE.getAttribute(), "1") + .build()) { writer.addAll(DATA_ROWS); } } @@ -136,41 +134,54 @@ public void testReadRowNumbers() throws IOException { @Test public void testReadRowNumbersWithFilter() throws IOException { - readAndValidate(Expressions.greaterThanOrEqual("id", 500), null, null, EXPECTED_ROWS.subList(500, 1000)); + readAndValidate( + Expressions.greaterThanOrEqual("id", 500), null, null, EXPECTED_ROWS.subList(500, 1000)); } @Test public void testReadRowNumbersWithSplits() throws IOException { Reader reader; try { - OrcFile.ReaderOptions readerOptions = OrcFile.readerOptions(new Configuration()).useUTCTimestamp(true); - reader = OrcFile.createReader(new Path(testFile.toString()), readerOptions); + OrcFile.ReaderOptions readerOptions = + OrcFile.readerOptions(new Configuration()).useUTCTimestamp(true); + reader = OrcFile.createReader(new Path(testFile.toString()), readerOptions); } catch (IOException ioe) { throw new RuntimeIOException(ioe, "Failed to open file: %s", testFile); } - List 
splitOffsets = reader.getStripes().stream().map(StripeInformation::getOffset) - .collect(Collectors.toList()); - List splitLengths = reader.getStripes().stream().map(StripeInformation::getLength) - .collect(Collectors.toList()); + List splitOffsets = + reader.getStripes().stream().map(StripeInformation::getOffset).collect(Collectors.toList()); + List splitLengths = + reader.getStripes().stream().map(StripeInformation::getLength).collect(Collectors.toList()); for (int i = 0; i < 10; i++) { - readAndValidate(null, splitOffsets.get(i), splitLengths.get(i), EXPECTED_ROWS.subList(i * 100, (i + 1) * 100)); + readAndValidate( + null, + splitOffsets.get(i), + splitLengths.get(i), + EXPECTED_ROWS.subList(i * 100, (i + 1) * 100)); } } - private void readAndValidate(Expression filter, Long splitStart, Long splitLength, List expected) + private void readAndValidate( + Expression filter, Long splitStart, Long splitLength, List expected) throws IOException { - Schema projectionWithoutMetadataFields = TypeUtil.selectNot(PROJECTION_SCHEMA, MetadataColumns.metadataFieldIds()); + Schema projectionWithoutMetadataFields = + TypeUtil.selectNot(PROJECTION_SCHEMA, MetadataColumns.metadataFieldIds()); CloseableIterable reader = null; try { - ORC.ReadBuilder builder = ORC.read(Files.localInput(testFile)) - .project(projectionWithoutMetadataFields); + ORC.ReadBuilder builder = + ORC.read(Files.localInput(testFile)).project(projectionWithoutMetadataFields); if (vectorized) { - builder = builder.createBatchedReaderFunc(readOrcSchema -> - VectorizedSparkOrcReaders.buildReader(PROJECTION_SCHEMA, readOrcSchema, ImmutableMap.of())); + builder = + builder.createBatchedReaderFunc( + readOrcSchema -> + VectorizedSparkOrcReaders.buildReader( + PROJECTION_SCHEMA, readOrcSchema, ImmutableMap.of())); } else { - builder = builder.createReaderFunc(readOrcSchema -> new SparkOrcReader(PROJECTION_SCHEMA, readOrcSchema)); + builder = + builder.createReaderFunc( + readOrcSchema -> new SparkOrcReader(PROJECTION_SCHEMA, readOrcSchema)); } if (filter != null) { diff --git a/spark/v2.4/spark/src/test/java/org/apache/iceberg/spark/data/TestSparkOrcReader.java b/spark/v2.4/spark/src/test/java/org/apache/iceberg/spark/data/TestSparkOrcReader.java index 5042d1cc1338..b23fe729a187 100644 --- a/spark/v2.4/spark/src/test/java/org/apache/iceberg/spark/data/TestSparkOrcReader.java +++ b/spark/v2.4/spark/src/test/java/org/apache/iceberg/spark/data/TestSparkOrcReader.java @@ -16,9 +16,11 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.spark.data; +import static org.apache.iceberg.spark.data.TestHelpers.assertEquals; +import static org.apache.iceberg.types.Types.NestedField.required; + import java.io.File; import java.io.IOException; import java.util.Collections; @@ -38,45 +40,44 @@ import org.junit.Assert; import org.junit.Test; -import static org.apache.iceberg.spark.data.TestHelpers.assertEquals; -import static org.apache.iceberg.types.Types.NestedField.required; - public class TestSparkOrcReader extends AvroDataTest { @Override protected void writeAndValidate(Schema schema) throws IOException { - final Iterable expected = RandomData - .generateSpark(schema, 100, 0L); + final Iterable expected = RandomData.generateSpark(schema, 100, 0L); writeAndValidateRecords(schema, expected); } @Test public void writeAndValidateRepeatingRecords() throws IOException { - Schema structSchema = new Schema( - required(100, "id", Types.LongType.get()), - required(101, "data", Types.StringType.get()) - ); - List expectedRepeating = Collections.nCopies(100, - RandomData.generateSpark(structSchema, 1, 0L).iterator().next()); + Schema structSchema = + new Schema( + required(100, "id", Types.LongType.get()), + required(101, "data", Types.StringType.get())); + List expectedRepeating = + Collections.nCopies(100, RandomData.generateSpark(structSchema, 1, 0L).iterator().next()); writeAndValidateRecords(structSchema, expectedRepeating); } - private void writeAndValidateRecords(Schema schema, Iterable expected) throws IOException { + private void writeAndValidateRecords(Schema schema, Iterable expected) + throws IOException { final File testFile = temp.newFile(); Assert.assertTrue("Delete should succeed", testFile.delete()); - try (FileAppender writer = ORC.write(Files.localOutput(testFile)) - .createWriterFunc(SparkOrcWriter::new) - .schema(schema) - .build()) { + try (FileAppender writer = + ORC.write(Files.localOutput(testFile)) + .createWriterFunc(SparkOrcWriter::new) + .schema(schema) + .build()) { writer.addAll(expected); } - try (CloseableIterable reader = ORC.read(Files.localInput(testFile)) - .project(schema) - .createReaderFunc(readOrcSchema -> new SparkOrcReader(schema, readOrcSchema)) - .build()) { + try (CloseableIterable reader = + ORC.read(Files.localInput(testFile)) + .project(schema) + .createReaderFunc(readOrcSchema -> new SparkOrcReader(schema, readOrcSchema)) + .build()) { final Iterator actualRows = reader.iterator(); final Iterator expectedRows = expected.iterator(); while (expectedRows.hasNext()) { @@ -86,11 +87,13 @@ private void writeAndValidateRecords(Schema schema, Iterable expect Assert.assertFalse("Should not have extra rows", actualRows.hasNext()); } - try (CloseableIterable reader = ORC.read(Files.localInput(testFile)) - .project(schema) - .createBatchedReaderFunc(readOrcSchema -> - VectorizedSparkOrcReaders.buildReader(schema, readOrcSchema, ImmutableMap.of())) - .build()) { + try (CloseableIterable reader = + ORC.read(Files.localInput(testFile)) + .project(schema) + .createBatchedReaderFunc( + readOrcSchema -> + VectorizedSparkOrcReaders.buildReader(schema, readOrcSchema, ImmutableMap.of())) + .build()) { final Iterator actualRows = batchesToRows(reader.iterator()); final Iterator expectedRows = expected.iterator(); while (expectedRows.hasNext()) { diff --git a/spark/v2.4/spark/src/test/java/org/apache/iceberg/spark/data/TestSparkParquetReadMetadataColumns.java b/spark/v2.4/spark/src/test/java/org/apache/iceberg/spark/data/TestSparkParquetReadMetadataColumns.java index 
a8a6313dbfaa..929d08f2cdb6 100644 --- a/spark/v2.4/spark/src/test/java/org/apache/iceberg/spark/data/TestSparkParquetReadMetadataColumns.java +++ b/spark/v2.4/spark/src/test/java/org/apache/iceberg/spark/data/TestSparkParquetReadMetadataColumns.java @@ -16,9 +16,10 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.data; +import static org.apache.iceberg.types.Types.NestedField.required; + import java.io.File; import java.io.IOException; import java.util.Iterator; @@ -58,20 +59,17 @@ import org.junit.runner.RunWith; import org.junit.runners.Parameterized; -import static org.apache.iceberg.types.Types.NestedField.required; - @RunWith(Parameterized.class) public class TestSparkParquetReadMetadataColumns { - private static final Schema DATA_SCHEMA = new Schema( - required(100, "id", Types.LongType.get()), - required(101, "data", Types.StringType.get()) - ); + private static final Schema DATA_SCHEMA = + new Schema( + required(100, "id", Types.LongType.get()), required(101, "data", Types.StringType.get())); - private static final Schema PROJECTION_SCHEMA = new Schema( - required(100, "id", Types.LongType.get()), - required(101, "data", Types.StringType.get()), - MetadataColumns.ROW_POSITION - ); + private static final Schema PROJECTION_SCHEMA = + new Schema( + required(100, "id", Types.LongType.get()), + required(101, "data", Types.StringType.get()), + MetadataColumns.ROW_POSITION); private static final int NUM_ROWS = 1000; private static final List DATA_ROWS; @@ -107,16 +105,12 @@ public class TestSparkParquetReadMetadataColumns { } } - @Parameterized.Parameters(name = "vectorized = {0}") + @Parameterized.Parameters(name = "vectorized = {0}") public static Object[][] parameters() { - return new Object[][] { - new Object[] { false }, - new Object[] { true } - }; + return new Object[][] {new Object[] {false}, new Object[] {true}}; } - @Rule - public TemporaryFolder temp = new TemporaryFolder(); + @Rule public TemporaryFolder temp = new TemporaryFolder(); private final boolean vectorized; private File testFile; @@ -133,28 +127,32 @@ public void writeFile() throws IOException { testFile = temp.newFile(); Assert.assertTrue("Delete should succeed", testFile.delete()); - ParquetFileWriter parquetFileWriter = new ParquetFileWriter( - conf, - ParquetSchemaUtil.convert(DATA_SCHEMA, "testSchema"), - new Path(testFile.getAbsolutePath()) - ); + ParquetFileWriter parquetFileWriter = + new ParquetFileWriter( + conf, + ParquetSchemaUtil.convert(DATA_SCHEMA, "testSchema"), + new Path(testFile.getAbsolutePath())); parquetFileWriter.start(); for (int i = 0; i < NUM_ROW_GROUPS; i += 1) { File split = temp.newFile(); Assert.assertTrue("Delete should succeed", split.delete()); fileSplits.add(new Path(split.getAbsolutePath())); - try (FileAppender writer = Parquet.write(Files.localOutput(split)) - .createWriterFunc(msgType -> SparkParquetWriters.buildWriter(struct, msgType)) - .schema(DATA_SCHEMA) - .overwrite() - .build()) { + try (FileAppender writer = + Parquet.write(Files.localOutput(split)) + .createWriterFunc(msgType -> SparkParquetWriters.buildWriter(struct, msgType)) + .schema(DATA_SCHEMA) + .overwrite() + .build()) { writer.addAll(DATA_ROWS.subList(i * ROWS_PER_SPLIT, (i + 1) * ROWS_PER_SPLIT)); } - parquetFileWriter.appendFile(HadoopInputFile.fromPath(new Path(split.getAbsolutePath()), conf)); + parquetFileWriter.appendFile( + HadoopInputFile.fromPath(new Path(split.getAbsolutePath()), conf)); } - parquetFileWriter - 
.end(ParquetFileWriter.mergeMetadataFiles(fileSplits, conf).getFileMetaData().getKeyValueMetaData()); + parquetFileWriter.end( + ParquetFileWriter.mergeMetadataFiles(fileSplits, conf) + .getFileMetaData() + .getKeyValueMetaData()); } @Test @@ -167,7 +165,8 @@ public void testReadRowNumbersWithFilter() throws IOException { // current iceberg supports row group filter. for (int i = 1; i < 5; i += 1) { readAndValidate( - Expressions.and(Expressions.lessThan("id", NUM_ROWS / 2), + Expressions.and( + Expressions.lessThan("id", NUM_ROWS / 2), Expressions.greaterThanOrEqual("id", i * ROWS_PER_SPLIT)), null, null, @@ -177,28 +176,36 @@ public void testReadRowNumbersWithFilter() throws IOException { @Test public void testReadRowNumbersWithSplits() throws IOException { - ParquetFileReader fileReader = new ParquetFileReader( - HadoopInputFile.fromPath(new Path(testFile.getAbsolutePath()), new Configuration()), - ParquetReadOptions.builder().build()); + ParquetFileReader fileReader = + new ParquetFileReader( + HadoopInputFile.fromPath(new Path(testFile.getAbsolutePath()), new Configuration()), + ParquetReadOptions.builder().build()); List rowGroups = fileReader.getRowGroups(); for (int i = 0; i < NUM_ROW_GROUPS; i += 1) { - readAndValidate(null, + readAndValidate( + null, rowGroups.get(i).getColumns().get(0).getStartingPos(), rowGroups.get(i).getCompressedSize(), EXPECTED_ROWS.subList(i * ROWS_PER_SPLIT, (i + 1) * ROWS_PER_SPLIT)); } } - private void readAndValidate(Expression filter, Long splitStart, Long splitLength, List expected) + private void readAndValidate( + Expression filter, Long splitStart, Long splitLength, List expected) throws IOException { - Parquet.ReadBuilder builder = Parquet.read(Files.localInput(testFile)).project(PROJECTION_SCHEMA); + Parquet.ReadBuilder builder = + Parquet.read(Files.localInput(testFile)).project(PROJECTION_SCHEMA); if (vectorized) { - builder.createBatchedReaderFunc(fileSchema -> VectorizedSparkParquetReaders.buildReader(PROJECTION_SCHEMA, - fileSchema, NullCheckingForGet.NULL_CHECKING_ENABLED)); + builder.createBatchedReaderFunc( + fileSchema -> + VectorizedSparkParquetReaders.buildReader( + PROJECTION_SCHEMA, fileSchema, NullCheckingForGet.NULL_CHECKING_ENABLED)); builder.recordsPerBatch(RECORDS_PER_BATCH); } else { - builder = builder.createReaderFunc(msgType -> SparkParquetReaders.buildReader(PROJECTION_SCHEMA, msgType)); + builder = + builder.createReaderFunc( + msgType -> SparkParquetReaders.buildReader(PROJECTION_SCHEMA, msgType)); } if (filter != null) { @@ -209,7 +216,8 @@ private void readAndValidate(Expression filter, Long splitStart, Long splitLengt builder = builder.split(splitStart, splitLength); } - try (CloseableIterable reader = vectorized ? batchesToRows(builder.build()) : builder.build()) { + try (CloseableIterable reader = + vectorized ? batchesToRows(builder.build()) : builder.build()) { final Iterator actualRows = reader.iterator(); for (InternalRow internalRow : expected) { diff --git a/spark/v2.4/spark/src/test/java/org/apache/iceberg/spark/data/TestSparkParquetReader.java b/spark/v2.4/spark/src/test/java/org/apache/iceberg/spark/data/TestSparkParquetReader.java index 03d234c1eca5..d4b7443e2e20 100644 --- a/spark/v2.4/spark/src/test/java/org/apache/iceberg/spark/data/TestSparkParquetReader.java +++ b/spark/v2.4/spark/src/test/java/org/apache/iceberg/spark/data/TestSparkParquetReader.java @@ -16,9 +16,11 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.spark.data; +import static org.apache.iceberg.spark.data.TestHelpers.assertEqualsUnsafe; +import static org.apache.iceberg.types.Types.NestedField.required; + import java.io.File; import java.io.IOException; import java.util.Iterator; @@ -60,31 +62,31 @@ import org.junit.Assume; import org.junit.Test; -import static org.apache.iceberg.spark.data.TestHelpers.assertEqualsUnsafe; -import static org.apache.iceberg.types.Types.NestedField.required; - public class TestSparkParquetReader extends AvroDataTest { @Override protected void writeAndValidate(Schema schema) throws IOException { - Assume.assumeTrue("Parquet Avro cannot write non-string map keys", null == TypeUtil.find(schema, - type -> type.isMapType() && type.asMapType().keyType() != Types.StringType.get())); + Assume.assumeTrue( + "Parquet Avro cannot write non-string map keys", + null + == TypeUtil.find( + schema, + type -> type.isMapType() && type.asMapType().keyType() != Types.StringType.get())); List expected = RandomData.generateList(schema, 100, 0L); File testFile = temp.newFile(); Assert.assertTrue("Delete should succeed", testFile.delete()); - try (FileAppender writer = Parquet.write(Files.localOutput(testFile)) - .schema(schema) - .named("test") - .build()) { + try (FileAppender writer = + Parquet.write(Files.localOutput(testFile)).schema(schema).named("test").build()) { writer.addAll(expected); } - try (CloseableIterable reader = Parquet.read(Files.localInput(testFile)) - .project(schema) - .createReaderFunc(type -> SparkParquetReaders.buildReader(schema, type)) - .build()) { + try (CloseableIterable reader = + Parquet.read(Files.localInput(testFile)) + .project(schema) + .createReaderFunc(type -> SparkParquetReaders.buildReader(schema, type)) + .build()) { Iterator rows = reader.iterator(); for (int i = 0; i < expected.size(); i += 1) { Assert.assertTrue("Should have expected number of rows", rows.hasNext()); @@ -129,7 +131,8 @@ protected Table tableFromInputFile(InputFile inputFile, Schema schema) throws IO @Test public void testInt96TimestampProducedBySparkIsReadCorrectly() throws IOException { - String outputFilePath = String.format("%s/%s", temp.getRoot().getAbsolutePath(), "parquet_int96.parquet"); + String outputFilePath = + String.format("%s/%s", temp.getRoot().getAbsolutePath(), "parquet_int96.parquet"); HadoopOutputFile outputFile = HadoopOutputFile.fromPath( new org.apache.hadoop.fs.Path(outputFilePath), new Configuration()); @@ -137,7 +140,7 @@ public void testInt96TimestampProducedBySparkIsReadCorrectly() throws IOExceptio StructType sparkSchema = new StructType( new StructField[] { - new StructField("ts", DataTypes.TimestampType, true, Metadata.empty()) + new StructField("ts", DataTypes.TimestampType, true, Metadata.empty()) }); List rows = Lists.newArrayList(RandomData.generateSpark(schema, 10, 0L)); @@ -164,14 +167,14 @@ public void testInt96TimestampProducedBySparkIsReadCorrectly() throws IOExceptio Assert.assertEquals(rows.size(), tableRecords.size()); - for (int i = 0; i < tableRecords.size(); i++) { + for (int i = 0; i < tableRecords.size(); i++) { GenericsHelpers.assertEqualsUnsafe(schema.asStruct(), tableRecords.get(i), rows.get(i)); } } /** - * Native Spark ParquetWriter.Builder implementation so that we can write timestamps using Spark's native - * ParquetWriteSupport. + * Native Spark ParquetWriter.Builder implementation so that we can write timestamps using Spark's + * native ParquetWriteSupport. 
*/ private static class NativeSparkWriterBuilder extends ParquetWriter.Builder { diff --git a/spark/v2.4/spark/src/test/java/org/apache/iceberg/spark/data/TestSparkParquetWriter.java b/spark/v2.4/spark/src/test/java/org/apache/iceberg/spark/data/TestSparkParquetWriter.java index c75a87abc45c..261fb8838aa4 100644 --- a/spark/v2.4/spark/src/test/java/org/apache/iceberg/spark/data/TestSparkParquetWriter.java +++ b/spark/v2.4/spark/src/test/java/org/apache/iceberg/spark/data/TestSparkParquetWriter.java @@ -16,9 +16,11 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.data; +import static org.apache.iceberg.types.Types.NestedField.optional; +import static org.apache.iceberg.types.Types.NestedField.required; + import java.io.File; import java.io.IOException; import java.util.Iterator; @@ -35,39 +37,51 @@ import org.junit.Test; import org.junit.rules.TemporaryFolder; -import static org.apache.iceberg.types.Types.NestedField.optional; -import static org.apache.iceberg.types.Types.NestedField.required; - public class TestSparkParquetWriter { - @Rule - public TemporaryFolder temp = new TemporaryFolder(); + @Rule public TemporaryFolder temp = new TemporaryFolder(); - private static final Schema COMPLEX_SCHEMA = new Schema( - required(1, "roots", Types.LongType.get()), - optional(3, "lime", Types.ListType.ofRequired(4, Types.DoubleType.get())), - required(5, "strict", Types.StructType.of( - required(9, "tangerine", Types.StringType.get()), - optional(6, "hopeful", Types.StructType.of( - required(7, "steel", Types.FloatType.get()), - required(8, "lantern", Types.DateType.get()) - )), - optional(10, "vehement", Types.LongType.get()) - )), - optional(11, "metamorphosis", Types.MapType.ofRequired(12, 13, - Types.StringType.get(), Types.TimestampType.withZone())), - required(14, "winter", Types.ListType.ofOptional(15, Types.StructType.of( - optional(16, "beet", Types.DoubleType.get()), - required(17, "stamp", Types.FloatType.get()), - optional(18, "wheeze", Types.StringType.get()) - ))), - optional(19, "renovate", Types.MapType.ofRequired(20, 21, - Types.StringType.get(), Types.StructType.of( - optional(22, "jumpy", Types.DoubleType.get()), - required(23, "koala", Types.IntegerType.get()), - required(24, "couch rope", Types.IntegerType.get()) - ))), - optional(2, "slide", Types.StringType.get()) - ); + private static final Schema COMPLEX_SCHEMA = + new Schema( + required(1, "roots", Types.LongType.get()), + optional(3, "lime", Types.ListType.ofRequired(4, Types.DoubleType.get())), + required( + 5, + "strict", + Types.StructType.of( + required(9, "tangerine", Types.StringType.get()), + optional( + 6, + "hopeful", + Types.StructType.of( + required(7, "steel", Types.FloatType.get()), + required(8, "lantern", Types.DateType.get()))), + optional(10, "vehement", Types.LongType.get()))), + optional( + 11, + "metamorphosis", + Types.MapType.ofRequired( + 12, 13, Types.StringType.get(), Types.TimestampType.withZone())), + required( + 14, + "winter", + Types.ListType.ofOptional( + 15, + Types.StructType.of( + optional(16, "beet", Types.DoubleType.get()), + required(17, "stamp", Types.FloatType.get()), + optional(18, "wheeze", Types.StringType.get())))), + optional( + 19, + "renovate", + Types.MapType.ofRequired( + 20, + 21, + Types.StringType.get(), + Types.StructType.of( + optional(22, "jumpy", Types.DoubleType.get()), + required(23, "koala", Types.IntegerType.get()), + required(24, "couch rope", Types.IntegerType.get())))), + optional(2, 
"slide", Types.StringType.get())); @Test public void testCorrectness() throws IOException { @@ -77,17 +91,22 @@ public void testCorrectness() throws IOException { File testFile = temp.newFile(); Assert.assertTrue("Delete should succeed", testFile.delete()); - try (FileAppender writer = Parquet.write(Files.localOutput(testFile)) - .schema(COMPLEX_SCHEMA) - .createWriterFunc(msgType -> SparkParquetWriters.buildWriter(SparkSchemaUtil.convert(COMPLEX_SCHEMA), msgType)) - .build()) { + try (FileAppender writer = + Parquet.write(Files.localOutput(testFile)) + .schema(COMPLEX_SCHEMA) + .createWriterFunc( + msgType -> + SparkParquetWriters.buildWriter( + SparkSchemaUtil.convert(COMPLEX_SCHEMA), msgType)) + .build()) { writer.addAll(records); } - try (CloseableIterable reader = Parquet.read(Files.localInput(testFile)) - .project(COMPLEX_SCHEMA) - .createReaderFunc(type -> SparkParquetReaders.buildReader(COMPLEX_SCHEMA, type)) - .build()) { + try (CloseableIterable reader = + Parquet.read(Files.localInput(testFile)) + .project(COMPLEX_SCHEMA) + .createReaderFunc(type -> SparkParquetReaders.buildReader(COMPLEX_SCHEMA, type)) + .build()) { Iterator expected = records.iterator(); Iterator rows = reader.iterator(); for (int i = 0; i < numRows; i += 1) { diff --git a/spark/v2.4/spark/src/test/java/org/apache/iceberg/spark/data/TestSparkRecordOrcReaderWriter.java b/spark/v2.4/spark/src/test/java/org/apache/iceberg/spark/data/TestSparkRecordOrcReaderWriter.java index 1e7430d16df7..d10e7f5a19e3 100644 --- a/spark/v2.4/spark/src/test/java/org/apache/iceberg/spark/data/TestSparkRecordOrcReaderWriter.java +++ b/spark/v2.4/spark/src/test/java/org/apache/iceberg/spark/data/TestSparkRecordOrcReaderWriter.java @@ -16,9 +16,10 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.data; +import static org.apache.iceberg.types.Types.NestedField.required; + import java.io.File; import java.io.IOException; import java.math.BigDecimal; @@ -40,8 +41,6 @@ import org.junit.Assert; import org.junit.Test; -import static org.apache.iceberg.types.Types.NestedField.required; - public class TestSparkRecordOrcReaderWriter extends AvroDataTest { private static final int NUM_RECORDS = 200; @@ -50,19 +49,21 @@ private void writeAndValidate(Schema schema, List expectedRecords) throw Assert.assertTrue("Delete should succeed", originalFile.delete()); // Write few generic records into the original test file. - try (FileAppender writer = ORC.write(Files.localOutput(originalFile)) - .createWriterFunc(GenericOrcWriter::buildWriter) - .schema(schema) - .build()) { + try (FileAppender writer = + ORC.write(Files.localOutput(originalFile)) + .createWriterFunc(GenericOrcWriter::buildWriter) + .schema(schema) + .build()) { writer.addAll(expectedRecords); } // Read into spark InternalRow from the original test file. 
List internalRows = Lists.newArrayList(); - try (CloseableIterable reader = ORC.read(Files.localInput(originalFile)) - .project(schema) - .createReaderFunc(readOrcSchema -> new SparkOrcReader(schema, readOrcSchema)) - .build()) { + try (CloseableIterable reader = + ORC.read(Files.localInput(originalFile)) + .project(schema) + .createReaderFunc(readOrcSchema -> new SparkOrcReader(schema, readOrcSchema)) + .build()) { reader.forEach(internalRows::add); assertEqualsUnsafe(schema.asStruct(), expectedRecords, reader, expectedRecords.size()); } @@ -71,26 +72,29 @@ private void writeAndValidate(Schema schema, List expectedRecords) throw Assert.assertTrue("Delete should succeed", anotherFile.delete()); // Write those spark InternalRows into a new file again. - try (FileAppender writer = ORC.write(Files.localOutput(anotherFile)) - .createWriterFunc(SparkOrcWriter::new) - .schema(schema) - .build()) { + try (FileAppender writer = + ORC.write(Files.localOutput(anotherFile)) + .createWriterFunc(SparkOrcWriter::new) + .schema(schema) + .build()) { writer.addAll(internalRows); } // Check whether the InternalRows are expected records. - try (CloseableIterable reader = ORC.read(Files.localInput(anotherFile)) - .project(schema) - .createReaderFunc(readOrcSchema -> new SparkOrcReader(schema, readOrcSchema)) - .build()) { + try (CloseableIterable reader = + ORC.read(Files.localInput(anotherFile)) + .project(schema) + .createReaderFunc(readOrcSchema -> new SparkOrcReader(schema, readOrcSchema)) + .build()) { assertEqualsUnsafe(schema.asStruct(), expectedRecords, reader, expectedRecords.size()); } // Read into iceberg GenericRecord and check again. - try (CloseableIterable reader = ORC.read(Files.localInput(anotherFile)) - .createReaderFunc(typeDesc -> GenericOrcReader.buildReader(schema, typeDesc)) - .project(schema) - .build()) { + try (CloseableIterable reader = + ORC.read(Files.localInput(anotherFile)) + .createReaderFunc(typeDesc -> GenericOrcReader.buildReader(schema, typeDesc)) + .project(schema) + .build()) { assertRecordEquals(expectedRecords, reader, expectedRecords.size()); } } @@ -103,11 +107,11 @@ protected void writeAndValidate(Schema schema) throws IOException { @Test public void testDecimalWithTrailingZero() throws IOException { - Schema schema = new Schema( - required(1, "d1", Types.DecimalType.of(10, 2)), - required(2, "d2", Types.DecimalType.of(20, 5)), - required(3, "d3", Types.DecimalType.of(38, 20)) - ); + Schema schema = + new Schema( + required(1, "d1", Types.DecimalType.of(10, 2)), + required(2, "d2", Types.DecimalType.of(20, 5)), + required(3, "d3", Types.DecimalType.of(38, 20))); List expected = Lists.newArrayList(); @@ -121,7 +125,8 @@ public void testDecimalWithTrailingZero() throws IOException { writeAndValidate(schema, expected); } - private static void assertRecordEquals(Iterable expected, Iterable actual, int size) { + private static void assertRecordEquals( + Iterable expected, Iterable actual, int size) { Iterator expectedIter = expected.iterator(); Iterator actualIter = actual.iterator(); for (int i = 0; i < size; i += 1) { @@ -133,8 +138,8 @@ private static void assertRecordEquals(Iterable expected, Iterable expected, - Iterable actual, int size) { + private static void assertEqualsUnsafe( + Types.StructType struct, Iterable expected, Iterable actual, int size) { Iterator expectedIter = expected.iterator(); Iterator actualIter = actual.iterator(); for (int i = 0; i < size; i += 1) { diff --git 
a/spark/v2.4/spark/src/test/java/org/apache/iceberg/spark/data/parquet/vectorized/TestParquetDictionaryEncodedVectorizedReads.java b/spark/v2.4/spark/src/test/java/org/apache/iceberg/spark/data/parquet/vectorized/TestParquetDictionaryEncodedVectorizedReads.java index f292df0c3bf8..756f49a2aad6 100644 --- a/spark/v2.4/spark/src/test/java/org/apache/iceberg/spark/data/parquet/vectorized/TestParquetDictionaryEncodedVectorizedReads.java +++ b/spark/v2.4/spark/src/test/java/org/apache/iceberg/spark/data/parquet/vectorized/TestParquetDictionaryEncodedVectorizedReads.java @@ -16,9 +16,10 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.data.parquet.vectorized; +import static org.apache.iceberg.TableProperties.PARQUET_ROW_GROUP_SIZE_BYTES_DEFAULT; + import java.io.File; import java.io.IOException; import org.apache.avro.generic.GenericData; @@ -35,42 +36,42 @@ import org.junit.Ignore; import org.junit.Test; -import static org.apache.iceberg.TableProperties.PARQUET_ROW_GROUP_SIZE_BYTES_DEFAULT; - public class TestParquetDictionaryEncodedVectorizedReads extends TestParquetVectorizedReads { @Override - Iterable generateData(Schema schema, int numRecords, long seed, float nullPercentage, - Function transform) { - Iterable data = RandomData.generateDictionaryEncodableData(schema, numRecords, seed, nullPercentage); + Iterable generateData( + Schema schema, + int numRecords, + long seed, + float nullPercentage, + Function transform) { + Iterable data = + RandomData.generateDictionaryEncodableData(schema, numRecords, seed, nullPercentage); return transform == IDENTITY ? data : Iterables.transform(data, transform); } @Test @Override @Ignore // Ignored since this code path is already tested in TestParquetVectorizedReads - public void testVectorizedReadsWithNewContainers() throws IOException { - - } + public void testVectorizedReadsWithNewContainers() throws IOException {} @Test public void testMixedDictionaryNonDictionaryReads() throws IOException { Schema schema = new Schema(SUPPORTED_PRIMITIVES.fields()); File dictionaryEncodedFile = temp.newFile(); Assert.assertTrue("Delete should succeed", dictionaryEncodedFile.delete()); - Iterable dictionaryEncodableData = RandomData.generateDictionaryEncodableData( - schema, - 10000, - 0L, - RandomData.DEFAULT_NULL_PERCENTAGE); - try (FileAppender writer = getParquetWriter(schema, dictionaryEncodedFile)) { + Iterable dictionaryEncodableData = + RandomData.generateDictionaryEncodableData( + schema, 10000, 0L, RandomData.DEFAULT_NULL_PERCENTAGE); + try (FileAppender writer = + getParquetWriter(schema, dictionaryEncodedFile)) { writer.addAll(dictionaryEncodableData); } File plainEncodingFile = temp.newFile(); Assert.assertTrue("Delete should succeed", plainEncodingFile.delete()); - Iterable nonDictionaryData = RandomData.generate(schema, 10000, 0L, - RandomData.DEFAULT_NULL_PERCENTAGE); + Iterable nonDictionaryData = + RandomData.generate(schema, 10000, 0L, RandomData.DEFAULT_NULL_PERCENTAGE); try (FileAppender writer = getParquetWriter(schema, plainEncodingFile)) { writer.addAll(nonDictionaryData); } @@ -78,15 +79,19 @@ public void testMixedDictionaryNonDictionaryReads() throws IOException { int rowGroupSize = PARQUET_ROW_GROUP_SIZE_BYTES_DEFAULT; File mixedFile = temp.newFile(); Assert.assertTrue("Delete should succeed", mixedFile.delete()); - Parquet.concat(ImmutableList.of(dictionaryEncodedFile, plainEncodingFile, dictionaryEncodedFile), - mixedFile, rowGroupSize, schema, ImmutableMap.of()); + 
Parquet.concat( + ImmutableList.of(dictionaryEncodedFile, plainEncodingFile, dictionaryEncodedFile), + mixedFile, + rowGroupSize, + schema, + ImmutableMap.of()); assertRecordsMatch( - schema, - 30000, - FluentIterable.concat(dictionaryEncodableData, nonDictionaryData, dictionaryEncodableData), - mixedFile, - false, - true, - BATCH_SIZE); + schema, + 30000, + FluentIterable.concat(dictionaryEncodableData, nonDictionaryData, dictionaryEncodableData), + mixedFile, + false, + true, + BATCH_SIZE); } } diff --git a/spark/v2.4/spark/src/test/java/org/apache/iceberg/spark/data/parquet/vectorized/TestParquetDictionaryFallbackToPlainEncodingVectorizedReads.java b/spark/v2.4/spark/src/test/java/org/apache/iceberg/spark/data/parquet/vectorized/TestParquetDictionaryFallbackToPlainEncodingVectorizedReads.java index 5ceac3fdb76e..42ea34936b5f 100644 --- a/spark/v2.4/spark/src/test/java/org/apache/iceberg/spark/data/parquet/vectorized/TestParquetDictionaryFallbackToPlainEncodingVectorizedReads.java +++ b/spark/v2.4/spark/src/test/java/org/apache/iceberg/spark/data/parquet/vectorized/TestParquetDictionaryFallbackToPlainEncodingVectorizedReads.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.data.parquet.vectorized; import java.io.File; @@ -33,7 +32,8 @@ import org.junit.Ignore; import org.junit.Test; -public class TestParquetDictionaryFallbackToPlainEncodingVectorizedReads extends TestParquetVectorizedReads { +public class TestParquetDictionaryFallbackToPlainEncodingVectorizedReads + extends TestParquetVectorizedReads { private static final int NUM_ROWS = 1_000_000; @Override @@ -42,15 +42,20 @@ protected int getNumRows() { } @Override - Iterable generateData(Schema schema, int numRecords, long seed, float nullPercentage, - Function transform) { + Iterable generateData( + Schema schema, + int numRecords, + long seed, + float nullPercentage, + Function transform) { // TODO: take into account nullPercentage when generating fallback encoding data Iterable data = RandomData.generateFallbackData(schema, numRecords, seed, numRecords / 20); return transform == IDENTITY ? 
data : Iterables.transform(data, transform); } @Override - FileAppender getParquetWriter(Schema schema, File testFile) throws IOException { + FileAppender getParquetWriter(Schema schema, File testFile) + throws IOException { return Parquet.write(Files.localOutput(testFile)) .schema(schema) .named("test") @@ -61,14 +66,10 @@ FileAppender getParquetWriter(Schema schema, File testFile) @Test @Override @Ignore // Fallback encoding not triggered when data is mostly null - public void testMostlyNullsForOptionalFields() { - - } + public void testMostlyNullsForOptionalFields() {} @Test @Override @Ignore // Ignored since this code path is already tested in TestParquetVectorizedReads - public void testVectorizedReadsWithNewContainers() throws IOException { - - } + public void testVectorizedReadsWithNewContainers() throws IOException {} } diff --git a/spark/v2.4/spark/src/test/java/org/apache/iceberg/spark/data/parquet/vectorized/TestParquetVectorizedReads.java b/spark/v2.4/spark/src/test/java/org/apache/iceberg/spark/data/parquet/vectorized/TestParquetVectorizedReads.java index 48dcc94a5fce..8908a23fad8f 100644 --- a/spark/v2.4/spark/src/test/java/org/apache/iceberg/spark/data/parquet/vectorized/TestParquetVectorizedReads.java +++ b/spark/v2.4/spark/src/test/java/org/apache/iceberg/spark/data/parquet/vectorized/TestParquetVectorizedReads.java @@ -16,9 +16,11 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.data.parquet.vectorized; +import static org.apache.iceberg.types.Types.NestedField.optional; +import static org.apache.iceberg.types.Types.NestedField.required; + import java.io.File; import java.io.IOException; import java.util.Iterator; @@ -49,9 +51,6 @@ import org.junit.Ignore; import org.junit.Test; -import static org.apache.iceberg.types.Types.NestedField.optional; -import static org.apache.iceberg.types.Types.NestedField.required; - public class TestParquetVectorizedReads extends AvroDataTest { private static final int NUM_ROWS = 200_000; static final int BATCH_SIZE = 10_000; @@ -64,24 +63,44 @@ protected void writeAndValidate(Schema schema) throws IOException { } private void writeAndValidate( - Schema schema, int numRecords, long seed, float nullPercentage, - boolean setAndCheckArrowValidityVector, boolean reuseContainers) - throws IOException { - writeAndValidate(schema, numRecords, seed, nullPercentage, - setAndCheckArrowValidityVector, reuseContainers, BATCH_SIZE, IDENTITY); + Schema schema, + int numRecords, + long seed, + float nullPercentage, + boolean setAndCheckArrowValidityVector, + boolean reuseContainers) + throws IOException { + writeAndValidate( + schema, + numRecords, + seed, + nullPercentage, + setAndCheckArrowValidityVector, + reuseContainers, + BATCH_SIZE, + IDENTITY); } private void writeAndValidate( - Schema schema, int numRecords, long seed, float nullPercentage, - boolean setAndCheckArrowValidityVector, boolean reuseContainers, int batchSize, - Function transform) + Schema schema, + int numRecords, + long seed, + float nullPercentage, + boolean setAndCheckArrowValidityVector, + boolean reuseContainers, + int batchSize, + Function transform) throws IOException { // Write test data - Assume.assumeTrue("Parquet Avro cannot write non-string map keys", null == TypeUtil.find( - schema, - type -> type.isMapType() && type.asMapType().keyType() != Types.StringType.get())); + Assume.assumeTrue( + "Parquet Avro cannot write non-string map keys", + null + == TypeUtil.find( + schema, + type -> type.isMapType() 
&& type.asMapType().keyType() != Types.StringType.get())); - Iterable expected = generateData(schema, numRecords, seed, nullPercentage, transform); + Iterable expected = + generateData(schema, numRecords, seed, nullPercentage, transform); // write a test parquet file using iceberg writer File testFile = temp.newFile(); @@ -90,58 +109,74 @@ private void writeAndValidate( try (FileAppender writer = getParquetWriter(schema, testFile)) { writer.addAll(expected); } - assertRecordsMatch(schema, numRecords, expected, testFile, setAndCheckArrowValidityVector, - reuseContainers, batchSize); + assertRecordsMatch( + schema, + numRecords, + expected, + testFile, + setAndCheckArrowValidityVector, + reuseContainers, + batchSize); } protected int getNumRows() { return NUM_ROWS; } - Iterable generateData(Schema schema, int numRecords, long seed, float nullPercentage, - Function transform) { - Iterable data = RandomData.generate(schema, numRecords, seed, nullPercentage); + Iterable generateData( + Schema schema, + int numRecords, + long seed, + float nullPercentage, + Function transform) { + Iterable data = + RandomData.generate(schema, numRecords, seed, nullPercentage); return transform == IDENTITY ? data : Iterables.transform(data, transform); } - FileAppender getParquetWriter(Schema schema, File testFile) throws IOException { + FileAppender getParquetWriter(Schema schema, File testFile) + throws IOException { + return Parquet.write(Files.localOutput(testFile)).schema(schema).named("test").build(); + } + + FileAppender getParquetV2Writer(Schema schema, File testFile) + throws IOException { return Parquet.write(Files.localOutput(testFile)) .schema(schema) .named("test") + .writerVersion(ParquetProperties.WriterVersion.PARQUET_2_0) .build(); } - FileAppender getParquetV2Writer(Schema schema, File testFile) throws IOException { - return Parquet.write(Files.localOutput(testFile)) - .schema(schema) - .named("test") - .writerVersion(ParquetProperties.WriterVersion.PARQUET_2_0) - .build(); - } - void assertRecordsMatch( - Schema schema, int expectedSize, Iterable expected, File testFile, - boolean setAndCheckArrowValidityBuffer, boolean reuseContainers, int batchSize) + Schema schema, + int expectedSize, + Iterable expected, + File testFile, + boolean setAndCheckArrowValidityBuffer, + boolean reuseContainers, + int batchSize) throws IOException { - Parquet.ReadBuilder readBuilder = Parquet.read(Files.localInput(testFile)) - .project(schema) - .recordsPerBatch(batchSize) - .createBatchedReaderFunc(type -> VectorizedSparkParquetReaders.buildReader( - schema, - type, - setAndCheckArrowValidityBuffer)); + Parquet.ReadBuilder readBuilder = + Parquet.read(Files.localInput(testFile)) + .project(schema) + .recordsPerBatch(batchSize) + .createBatchedReaderFunc( + type -> + VectorizedSparkParquetReaders.buildReader( + schema, type, setAndCheckArrowValidityBuffer)); if (reuseContainers) { readBuilder.reuseContainers(); } - try (CloseableIterable batchReader = - readBuilder.build()) { + try (CloseableIterable batchReader = readBuilder.build()) { Iterator expectedIter = expected.iterator(); Iterator batches = batchReader.iterator(); int numRowsRead = 0; while (batches.hasNext()) { ColumnarBatch batch = batches.next(); numRowsRead += batch.numRows(); - TestHelpers.assertEqualsBatch(schema.asStruct(), expectedIter, batch, setAndCheckArrowValidityBuffer); + TestHelpers.assertEqualsBatch( + schema.asStruct(), expectedIter, batch, setAndCheckArrowValidityBuffer); } Assert.assertEquals(expectedSize, numRowsRead); } @@ -149,38 
+184,31 @@ void assertRecordsMatch( @Test @Ignore - public void testArray() { - } + public void testArray() {} @Test @Ignore - public void testArrayOfStructs() { - } + public void testArrayOfStructs() {} @Test @Ignore - public void testMap() { - } + public void testMap() {} @Test @Ignore - public void testNumericMapKey() { - } + public void testNumericMapKey() {} @Test @Ignore - public void testComplexMapKey() { - } + public void testComplexMapKey() {} @Test @Ignore - public void testMapOfStructs() { - } + public void testMapOfStructs() {} @Test @Ignore - public void testMixedTypes() { - } + public void testMixedTypes() {} @Test @Override @@ -189,13 +217,13 @@ public void testNestedStruct() { "Vectorized reads are not supported yet for struct fields", UnsupportedOperationException.class, "Vectorized reads are not supported yet for struct fields", - () -> VectorizedSparkParquetReaders.buildReader( - TypeUtil.assignIncreasingFreshIds(new Schema(required( - 1, - "struct", - SUPPORTED_PRIMITIVES))), - new MessageType("struct", new GroupType(Type.Repetition.OPTIONAL, "struct").withId(1)), - false)); + () -> + VectorizedSparkParquetReaders.buildReader( + TypeUtil.assignIncreasingFreshIds( + new Schema(required(1, "struct", SUPPORTED_PRIMITIVES))), + new MessageType( + "struct", new GroupType(Type.Repetition.OPTIONAL, "struct").withId(1)), + false)); } @Test @@ -211,27 +239,40 @@ public void testMostlyNullsForOptionalFields() throws IOException { @Test public void testSettingArrowValidityVector() throws IOException { - writeAndValidate(new Schema( - Lists.transform(SUPPORTED_PRIMITIVES.fields(), Types.NestedField::asOptional)), - getNumRows(), 0L, RandomData.DEFAULT_NULL_PERCENTAGE, true, true); + writeAndValidate( + new Schema(Lists.transform(SUPPORTED_PRIMITIVES.fields(), Types.NestedField::asOptional)), + getNumRows(), + 0L, + RandomData.DEFAULT_NULL_PERCENTAGE, + true, + true); } @Test public void testVectorizedReadsWithNewContainers() throws IOException { - writeAndValidate(TypeUtil.assignIncreasingFreshIds(new Schema(SUPPORTED_PRIMITIVES.fields())), - getNumRows(), 0L, RandomData.DEFAULT_NULL_PERCENTAGE, true, false); + writeAndValidate( + TypeUtil.assignIncreasingFreshIds(new Schema(SUPPORTED_PRIMITIVES.fields())), + getNumRows(), + 0L, + RandomData.DEFAULT_NULL_PERCENTAGE, + true, + false); } @Test public void testVectorizedReadsWithReallocatedArrowBuffers() throws IOException { // With a batch size of 2, 256 bytes are allocated in the VarCharVector. By adding strings of // length 512, the vector will need to be reallocated for storing the batch. 
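// For context, a minimal self-contained sketch (not from this patch) of the Arrow behavior the
// comment above relies on: a VarCharVector starts with a small data buffer, and setSafe() grows it
// when a value no longer fits. Assumes arrow-memory and arrow-vector are on the classpath; the
// 16-byte initial allocation and the class/variable names below are illustrative only and are
// unrelated to the 256-byte figure used by Iceberg's vectorized reader.
import java.util.Arrays;
import org.apache.arrow.memory.RootAllocator;
import org.apache.arrow.vector.VarCharVector;

class VarCharReallocationSketch {
  public static void main(String[] args) throws Exception {
    try (RootAllocator allocator = new RootAllocator();
        VarCharVector vector = new VarCharVector("data", allocator)) {
      vector.allocateNew(16, 2); // reserve room for 2 values and only 16 data bytes
      long before = vector.getDataBuffer().capacity();

      byte[] large = new byte[512];
      Arrays.fill(large, (byte) 'a'); // a 512-byte value, like the padded strings in this test
      vector.setSafe(0, large); // setSafe reallocates the data buffer rather than overflowing it
      vector.setValueCount(1);

      long after = vector.getDataBuffer().capacity();
      System.out.printf("data buffer capacity grew from %d to %d bytes%n", before, after);
    }
  }
}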
- writeAndValidate(new Schema( + writeAndValidate( + new Schema( Lists.newArrayList( - SUPPORTED_PRIMITIVES.field("id"), - SUPPORTED_PRIMITIVES.field("data"))), - 10, 0L, RandomData.DEFAULT_NULL_PERCENTAGE, - true, true, 2, + SUPPORTED_PRIMITIVES.field("id"), SUPPORTED_PRIMITIVES.field("data"))), + 10, + 0L, + RandomData.DEFAULT_NULL_PERCENTAGE, + true, + true, + 2, record -> { if (record.get("data") != null) { record.put("data", Strings.padEnd((String) record.get("data"), 512, 'a')); @@ -244,65 +285,67 @@ record -> { @Test public void testReadsForTypePromotedColumns() throws Exception { - Schema writeSchema = new Schema( - required(100, "id", Types.LongType.get()), - optional(101, "int_data", Types.IntegerType.get()), - optional(102, "float_data", Types.FloatType.get()), - optional(103, "decimal_data", Types.DecimalType.of(10, 5)) - ); + Schema writeSchema = + new Schema( + required(100, "id", Types.LongType.get()), + optional(101, "int_data", Types.IntegerType.get()), + optional(102, "float_data", Types.FloatType.get()), + optional(103, "decimal_data", Types.DecimalType.of(10, 5))); File dataFile = temp.newFile(); Assert.assertTrue("Delete should succeed", dataFile.delete()); - Iterable data = generateData(writeSchema, 30000, 0L, - RandomData.DEFAULT_NULL_PERCENTAGE, IDENTITY); + Iterable data = + generateData(writeSchema, 30000, 0L, RandomData.DEFAULT_NULL_PERCENTAGE, IDENTITY); try (FileAppender writer = getParquetWriter(writeSchema, dataFile)) { writer.addAll(data); } - Schema readSchema = new Schema( - required(100, "id", Types.LongType.get()), - optional(101, "int_data", Types.LongType.get()), - optional(102, "float_data", Types.DoubleType.get()), - optional(103, "decimal_data", Types.DecimalType.of(25, 5)) - ); + Schema readSchema = + new Schema( + required(100, "id", Types.LongType.get()), + optional(101, "int_data", Types.LongType.get()), + optional(102, "float_data", Types.DoubleType.get()), + optional(103, "decimal_data", Types.DecimalType.of(25, 5))); - assertRecordsMatch(readSchema, 30000, data, dataFile, false, - true, BATCH_SIZE); + assertRecordsMatch(readSchema, 30000, data, dataFile, false, true, BATCH_SIZE); } @Test public void testSupportedReadsForParquetV2() throws Exception { // Only float and double column types are written using plain encoding with Parquet V2 - Schema schema = new Schema( + Schema schema = + new Schema( optional(102, "float_data", Types.FloatType.get()), optional(103, "double_data", Types.DoubleType.get())); File dataFile = temp.newFile(); Assert.assertTrue("Delete should succeed", dataFile.delete()); - Iterable data = generateData(schema, 30000, 0L, - RandomData.DEFAULT_NULL_PERCENTAGE, IDENTITY); + Iterable data = + generateData(schema, 30000, 0L, RandomData.DEFAULT_NULL_PERCENTAGE, IDENTITY); try (FileAppender writer = getParquetV2Writer(schema, dataFile)) { writer.addAll(data); } - assertRecordsMatch(schema, 30000, data, dataFile, false, - true, BATCH_SIZE); + assertRecordsMatch(schema, 30000, data, dataFile, false, true, BATCH_SIZE); } @Test public void testUnsupportedReadsForParquetV2() throws Exception { - // Longs, ints, string types etc use delta encoding and which are not supported for vectorized reads + // Longs, ints, string types etc use delta encoding and which are not supported for vectorized + // reads Schema schema = new Schema(SUPPORTED_PRIMITIVES.fields()); File dataFile = temp.newFile(); Assert.assertTrue("Delete should succeed", dataFile.delete()); - Iterable data = generateData(schema, 30000, 0L, - 
RandomData.DEFAULT_NULL_PERCENTAGE, IDENTITY); + Iterable data = + generateData(schema, 30000, 0L, RandomData.DEFAULT_NULL_PERCENTAGE, IDENTITY); try (FileAppender writer = getParquetV2Writer(schema, dataFile)) { writer.addAll(data); } - AssertHelpers.assertThrows("Vectorized reads not supported", - UnsupportedOperationException.class, "Cannot support vectorized reads for column", () -> { - assertRecordsMatch(schema, 30000, data, dataFile, false, - true, BATCH_SIZE); + AssertHelpers.assertThrows( + "Vectorized reads not supported", + UnsupportedOperationException.class, + "Cannot support vectorized reads for column", + () -> { + assertRecordsMatch(schema, 30000, data, dataFile, false, true, BATCH_SIZE); return null; }); } diff --git a/spark/v2.4/spark/src/test/java/org/apache/iceberg/spark/source/LogMessage.java b/spark/v2.4/spark/src/test/java/org/apache/iceberg/spark/source/LogMessage.java index 5e22daeb0841..53a35eec61ce 100644 --- a/spark/v2.4/spark/src/test/java/org/apache/iceberg/spark/source/LogMessage.java +++ b/spark/v2.4/spark/src/test/java/org/apache/iceberg/spark/source/LogMessage.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.source; import java.time.Instant; diff --git a/spark/v2.4/spark/src/test/java/org/apache/iceberg/spark/source/ManualSource.java b/spark/v2.4/spark/src/test/java/org/apache/iceberg/spark/source/ManualSource.java index 158d574737f6..3a8d087258a7 100644 --- a/spark/v2.4/spark/src/test/java/org/apache/iceberg/spark/source/ManualSource.java +++ b/spark/v2.4/spark/src/test/java/org/apache/iceberg/spark/source/ManualSource.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.source; import java.util.Map; @@ -48,7 +47,8 @@ public class ManualSource implements WriteSupport, DataSourceRegister, DataSourc private Configuration lazyConf = null; public static void setTable(String name, Table table) { - Preconditions.checkArgument(!tableMap.containsKey(name), "Cannot set " + name + ". It is already set"); + Preconditions.checkArgument( + !tableMap.containsKey(name), "Cannot set " + name + ". 
It is already set"); tableMap.put(name, table); } @@ -62,25 +62,35 @@ public String shortName() { } @Override - public Optional createWriter(String writeUUID, StructType dsStruct, SaveMode mode, - DataSourceOptions options) { + public Optional createWriter( + String writeUUID, StructType dsStruct, SaveMode mode, DataSourceOptions options) { Map properties = options.asMap(); - Preconditions.checkArgument(properties.containsKey(TABLE_NAME), "Missing property " + TABLE_NAME); + Preconditions.checkArgument( + properties.containsKey(TABLE_NAME), "Missing property " + TABLE_NAME); String tableName = properties.get(TABLE_NAME); Preconditions.checkArgument(tableMap.containsKey(tableName), "Table missing " + tableName); Table table = tableMap.get(tableName); SparkWriteConf writeConf = new SparkWriteConf(lazySparkSession(), table, options.asMap()); Schema writeSchema = SparkSchemaUtil.convert(table.schema(), dsStruct); - TypeUtil.validateWriteSchema(table.schema(), writeSchema, writeConf.checkNullability(), writeConf.checkOrdering()); + TypeUtil.validateWriteSchema( + table.schema(), writeSchema, writeConf.checkNullability(), writeConf.checkOrdering()); SparkUtil.validatePartitionTransforms(table.spec()); String appId = lazySparkSession().sparkContext().applicationId(); String wapId = writeConf.wapId(); boolean replacePartitions = mode == SaveMode.Overwrite; - return Optional.of(new Writer( - lazySparkSession(), table, writeConf, replacePartitions, appId, wapId, writeSchema, dsStruct)); + return Optional.of( + new Writer( + lazySparkSession(), + table, + writeConf, + replacePartitions, + appId, + wapId, + writeSchema, + dsStruct)); } private SparkSession lazySparkSession() { diff --git a/spark/v2.4/spark/src/test/java/org/apache/iceberg/spark/source/SimpleRecord.java b/spark/v2.4/spark/src/test/java/org/apache/iceberg/spark/source/SimpleRecord.java index c8b7a31b3ba0..550e20b9338e 100644 --- a/spark/v2.4/spark/src/test/java/org/apache/iceberg/spark/source/SimpleRecord.java +++ b/spark/v2.4/spark/src/test/java/org/apache/iceberg/spark/source/SimpleRecord.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.source; import org.apache.iceberg.relocated.com.google.common.base.Objects; @@ -25,8 +24,7 @@ public class SimpleRecord { private Integer id; private String data; - public SimpleRecord() { - } + public SimpleRecord() {} public SimpleRecord(Integer id, String data) { this.id = id; diff --git a/spark/v2.4/spark/src/test/java/org/apache/iceberg/spark/source/TestAvroScan.java b/spark/v2.4/spark/src/test/java/org/apache/iceberg/spark/source/TestAvroScan.java index 4d2e12229813..9491adde4605 100644 --- a/spark/v2.4/spark/src/test/java/org/apache/iceberg/spark/source/TestAvroScan.java +++ b/spark/v2.4/spark/src/test/java/org/apache/iceberg/spark/source/TestAvroScan.java @@ -16,9 +16,10 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.spark.source; +import static org.apache.iceberg.Files.localOutput; + import java.io.File; import java.io.IOException; import java.util.List; @@ -46,13 +47,10 @@ import org.junit.Rule; import org.junit.rules.TemporaryFolder; -import static org.apache.iceberg.Files.localOutput; - public class TestAvroScan extends AvroDataTest { private static final Configuration CONF = new Configuration(); - @Rule - public TemporaryFolder temp = new TemporaryFolder(); + @Rule public TemporaryFolder temp = new TemporaryFolder(); private static SparkSession spark = null; @@ -75,8 +73,8 @@ protected void writeAndValidate(Schema schema) throws IOException { File dataFolder = new File(location, "data"); dataFolder.mkdirs(); - File avroFile = new File(dataFolder, - FileFormat.AVRO.addExtension(UUID.randomUUID().toString())); + File avroFile = + new File(dataFolder, FileFormat.AVRO.addExtension(UUID.randomUUID().toString())); HadoopTables tables = new HadoopTables(CONF); Table table = tables.create(schema, PartitionSpec.unpartitioned(), location.toString()); @@ -87,23 +85,21 @@ protected void writeAndValidate(Schema schema) throws IOException { List expected = RandomData.generateList(tableSchema, 100, 1L); - try (FileAppender writer = Avro.write(localOutput(avroFile)) - .schema(tableSchema) - .build()) { + try (FileAppender writer = + Avro.write(localOutput(avroFile)).schema(tableSchema).build()) { writer.addAll(expected); } - DataFile file = DataFiles.builder(PartitionSpec.unpartitioned()) - .withRecordCount(100) - .withFileSizeInBytes(avroFile.length()) - .withPath(avroFile.toString()) - .build(); + DataFile file = + DataFiles.builder(PartitionSpec.unpartitioned()) + .withRecordCount(100) + .withFileSizeInBytes(avroFile.length()) + .withPath(avroFile.toString()) + .build(); table.newAppend().appendFile(file).commit(); - Dataset df = spark.read() - .format("iceberg") - .load(location.toString()); + Dataset df = spark.read().format("iceberg").load(location.toString()); List rows = df.collectAsList(); Assert.assertEquals("Should contain 100 rows", 100, rows.size()); diff --git a/spark/v2.4/spark/src/test/java/org/apache/iceberg/spark/source/TestCatalog.java b/spark/v2.4/spark/src/test/java/org/apache/iceberg/spark/source/TestCatalog.java index 948140a9e78f..3e0c9cbb4052 100644 --- a/spark/v2.4/spark/src/test/java/org/apache/iceberg/spark/source/TestCatalog.java +++ b/spark/v2.4/spark/src/test/java/org/apache/iceberg/spark/source/TestCatalog.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.spark.source; import java.util.List; @@ -40,8 +39,7 @@ public class TestCatalog implements Catalog, Configurable { private Configuration conf; private String warehouse; - public TestCatalog() { - } + public TestCatalog() {} @Override public String name() { @@ -51,27 +49,40 @@ public String name() { private String tablePath(TableIdentifier identifier) { return String.format("%s/%s", warehouse, identifier.name()); } + @Override public List listTables(Namespace namespace) { throw new UnsupportedOperationException(); } @Override - public Table createTable(TableIdentifier identifier, Schema schema, - PartitionSpec spec, String location, Map properties) { + public Table createTable( + TableIdentifier identifier, + Schema schema, + PartitionSpec spec, + String location, + Map properties) { return tables.create(schema, spec, properties, tablePath(identifier)); } @Override - public Transaction newCreateTableTransaction(TableIdentifier identifier, Schema schema, - PartitionSpec spec, String location, Map properties) { + public Transaction newCreateTableTransaction( + TableIdentifier identifier, + Schema schema, + PartitionSpec spec, + String location, + Map properties) { throw new UnsupportedOperationException(); } @Override - public Transaction newReplaceTableTransaction(TableIdentifier identifier, Schema schema, - PartitionSpec spec, String location, Map properties, - boolean orCreate) { + public Transaction newReplaceTableTransaction( + TableIdentifier identifier, + Schema schema, + PartitionSpec spec, + String location, + Map properties, + boolean orCreate) { throw new UnsupportedOperationException(); } @@ -94,11 +105,13 @@ public Table loadTable(TableIdentifier identifier) { public void initialize(String name, Map properties) { String uri = properties.get(CatalogProperties.URI); warehouse = properties.get("warehouse"); - Preconditions.checkArgument(uri != null, - "Cannot initialize TestCatalog. The metastore connection uri must be set."); - Preconditions.checkArgument(uri.contains("thrift"), + Preconditions.checkArgument( + uri != null, "Cannot initialize TestCatalog. The metastore connection uri must be set."); + Preconditions.checkArgument( + uri.contains("thrift"), "Cannot initialize TestCatalog. The metastore connection uri must use thrift as the scheme."); - Preconditions.checkArgument(warehouse != null, + Preconditions.checkArgument( + warehouse != null, "Cannot initialize TestCatalog. The base path for the catalog's warehouse directory must be set."); this.tables = new HadoopTables(conf); } @@ -112,5 +125,4 @@ public void setConf(Configuration conf) { public Configuration getConf() { return this.conf; } - } diff --git a/spark/v2.4/spark/src/test/java/org/apache/iceberg/spark/source/TestCustomCatalog.java b/spark/v2.4/spark/src/test/java/org/apache/iceberg/spark/source/TestCustomCatalog.java index b9021e0ca059..d003fb1f65d4 100644 --- a/spark/v2.4/spark/src/test/java/org/apache/iceberg/spark/source/TestCustomCatalog.java +++ b/spark/v2.4/spark/src/test/java/org/apache/iceberg/spark/source/TestCustomCatalog.java @@ -16,9 +16,10 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.spark.source; +import static org.apache.iceberg.types.Types.NestedField.optional; + import java.io.File; import java.util.List; import org.apache.iceberg.AssertHelpers; @@ -43,32 +44,47 @@ import org.junit.Test; import org.junit.rules.TemporaryFolder; -import static org.apache.iceberg.types.Types.NestedField.optional; - public class TestCustomCatalog { - private static final String CATALOG_IMPL = String.format("%s.%s.%s", CustomCatalogs.ICEBERG_CATALOG_PREFIX, - CustomCatalogs.ICEBERG_DEFAULT_CATALOG, CatalogProperties.CATALOG_IMPL); - private static final String WAREHOUSE = String.format("%s.%s.%s", CustomCatalogs.ICEBERG_CATALOG_PREFIX, - CustomCatalogs.ICEBERG_DEFAULT_CATALOG, CatalogProperties.WAREHOUSE_LOCATION); - private static final String URI_KEY = String.format("%s.%s.%s", CustomCatalogs.ICEBERG_CATALOG_PREFIX, - CustomCatalogs.ICEBERG_DEFAULT_CATALOG, CatalogProperties.URI); + private static final String CATALOG_IMPL = + String.format( + "%s.%s.%s", + CustomCatalogs.ICEBERG_CATALOG_PREFIX, + CustomCatalogs.ICEBERG_DEFAULT_CATALOG, + CatalogProperties.CATALOG_IMPL); + private static final String WAREHOUSE = + String.format( + "%s.%s.%s", + CustomCatalogs.ICEBERG_CATALOG_PREFIX, + CustomCatalogs.ICEBERG_DEFAULT_CATALOG, + CatalogProperties.WAREHOUSE_LOCATION); + private static final String URI_KEY = + String.format( + "%s.%s.%s", + CustomCatalogs.ICEBERG_CATALOG_PREFIX, + CustomCatalogs.ICEBERG_DEFAULT_CATALOG, + CatalogProperties.URI); private static final String TEST_CATALOG = "placeholder_catalog"; - private static final String TEST_CATALOG_IMPL = String.format("%s.%s.%s", CustomCatalogs.ICEBERG_CATALOG_PREFIX, - TEST_CATALOG, CatalogProperties.CATALOG_IMPL); - private static final String TEST_WAREHOUSE = String.format("%s.%s.%s", CustomCatalogs.ICEBERG_CATALOG_PREFIX, - TEST_CATALOG, CatalogProperties.WAREHOUSE_LOCATION); - private static final String TEST_URI_KEY = String.format("%s.%s.%s", CustomCatalogs.ICEBERG_CATALOG_PREFIX, - TEST_CATALOG, CatalogProperties.URI); + private static final String TEST_CATALOG_IMPL = + String.format( + "%s.%s.%s", + CustomCatalogs.ICEBERG_CATALOG_PREFIX, TEST_CATALOG, CatalogProperties.CATALOG_IMPL); + private static final String TEST_WAREHOUSE = + String.format( + "%s.%s.%s", + CustomCatalogs.ICEBERG_CATALOG_PREFIX, + TEST_CATALOG, + CatalogProperties.WAREHOUSE_LOCATION); + private static final String TEST_URI_KEY = + String.format( + "%s.%s.%s", CustomCatalogs.ICEBERG_CATALOG_PREFIX, TEST_CATALOG, CatalogProperties.URI); private static final String URI_VAL = "thrift://localhost:12345"; // dummy uri private static final String CATALOG_VAL = "org.apache.iceberg.spark.source.TestCatalog"; private static final TableIdentifier TABLE = TableIdentifier.of("default", "table"); - private static final Schema SCHEMA = new Schema( - optional(1, "id", Types.IntegerType.get()), - optional(2, "data", Types.StringType.get()) - ); + private static final Schema SCHEMA = + new Schema( + optional(1, "id", Types.IntegerType.get()), optional(2, "data", Types.StringType.get())); - @Rule - public TemporaryFolder temp = new TemporaryFolder(); + @Rule public TemporaryFolder temp = new TemporaryFolder(); File tableDir = null; String tableLocation = null; @@ -91,16 +107,17 @@ public static void stopMetastoreAndSpark() { public void setupTable() throws Exception { SparkConf sparkConf = spark.sparkContext().conf(); sparkConf.set( - String.format("%s.%s", CustomCatalogs.ICEBERG_CATALOG_PREFIX, CustomCatalogs.ICEBERG_DEFAULT_CATALOG), + 
String.format( + "%s.%s", CustomCatalogs.ICEBERG_CATALOG_PREFIX, CustomCatalogs.ICEBERG_DEFAULT_CATALOG), "placeholder"); sparkConf.set( - String.format("%s.%s", CustomCatalogs.ICEBERG_CATALOG_PREFIX, TEST_CATALOG), - "placeholder"); + String.format("%s.%s", CustomCatalogs.ICEBERG_CATALOG_PREFIX, TEST_CATALOG), "placeholder"); this.tables = new HadoopTables(spark.sessionState().newHadoopConf()); this.tableDir = temp.newFolder(); tableDir.delete(); // created by table create this.tableLocation = tableDir.toURI().toString(); - tables.create(SCHEMA, PartitionSpec.unpartitioned(), String.format("%s/%s", tableLocation, TABLE.name())); + tables.create( + SCHEMA, PartitionSpec.unpartitioned(), String.format("%s/%s", tableLocation, TABLE.name())); } @After @@ -121,33 +138,34 @@ public void withSparkOptions() { sparkConf.set(CATALOG_IMPL, CATALOG_VAL); sparkConf.set(URI_KEY, URI_VAL); - List expected = Lists.newArrayList( - new SimpleRecord(1, "a"), - new SimpleRecord(2, "b"), - new SimpleRecord(3, "c") - ); + List expected = + Lists.newArrayList( + new SimpleRecord(1, "a"), new SimpleRecord(2, "b"), new SimpleRecord(3, "c")); Dataset df = spark.createDataFrame(expected, SimpleRecord.class); - AssertHelpers.assertThrows("We have not set all properties", IllegalArgumentException.class, - "The base path for the catalog's warehouse directory must be set", () -> - df.select("id", "data").write() - .format("iceberg") - .mode("append") - .save(TABLE.toString()) - ); + AssertHelpers.assertThrows( + "We have not set all properties", + IllegalArgumentException.class, + "The base path for the catalog's warehouse directory must be set", + () -> + df.select("id", "data") + .write() + .format("iceberg") + .mode("append") + .save(TABLE.toString())); sparkConf.set(WAREHOUSE, tableLocation); - df.select("id", "data").write() - .format("iceberg") - .mode("append") - .save(TABLE.toString()); + df.select("id", "data").write().format("iceberg").mode("append").save(TABLE.toString()); - List dfNew = spark.read().format("iceberg") - .load(TABLE.toString()) - .orderBy("id") - .as(Encoders.bean(SimpleRecord.class)) - .collectAsList(); + List dfNew = + spark + .read() + .format("iceberg") + .load(TABLE.toString()) + .orderBy("id") + .as(Encoders.bean(SimpleRecord.class)) + .collectAsList(); Assert.assertEquals("Data should match", expected, dfNew); } @@ -160,35 +178,30 @@ public void withSparkCatalog() { sparkConf.set(TEST_CATALOG_IMPL, CATALOG_VAL); sparkConf.set(TEST_URI_KEY, URI_VAL); - List expected = Lists.newArrayList( - new SimpleRecord(1, "a"), - new SimpleRecord(2, "b"), - new SimpleRecord(3, "c") - ); + List expected = + Lists.newArrayList( + new SimpleRecord(1, "a"), new SimpleRecord(2, "b"), new SimpleRecord(3, "c")); Dataset df = spark.createDataFrame(expected, SimpleRecord.class); - AssertHelpers.assertThrows("We have not set all properties", IllegalArgumentException.class, - "The base path for the catalog's warehouse directory must be set", () -> - df.select("id", "data").write() - .format("iceberg") - .mode("append") - .save(catalogTable) - ); + AssertHelpers.assertThrows( + "We have not set all properties", + IllegalArgumentException.class, + "The base path for the catalog's warehouse directory must be set", + () -> df.select("id", "data").write().format("iceberg").mode("append").save(catalogTable)); sparkConf.set(TEST_WAREHOUSE, tableLocation); - df.select("id", "data").write() - .format("iceberg") - .mode("append") - .save(catalogTable); + df.select("id", 
"data").write().format("iceberg").mode("append").save(catalogTable); - List dfNew = spark.read().format("iceberg") - .load(catalogTable) - .orderBy("id") - .as(Encoders.bean(SimpleRecord.class)) - .collectAsList(); + List dfNew = + spark + .read() + .format("iceberg") + .load(catalogTable) + .orderBy("id") + .as(Encoders.bean(SimpleRecord.class)) + .collectAsList(); Assert.assertEquals("Data should match", expected, dfNew); } - } diff --git a/spark/v2.4/spark/src/test/java/org/apache/iceberg/spark/source/TestDataFrameWrites.java b/spark/v2.4/spark/src/test/java/org/apache/iceberg/spark/source/TestDataFrameWrites.java index 1391362823e1..af02cc8d4602 100644 --- a/spark/v2.4/spark/src/test/java/org/apache/iceberg/spark/source/TestDataFrameWrites.java +++ b/spark/v2.4/spark/src/test/java/org/apache/iceberg/spark/source/TestDataFrameWrites.java @@ -16,9 +16,12 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.source; +import static org.apache.iceberg.spark.SparkSchemaUtil.convert; +import static org.apache.iceberg.spark.data.TestHelpers.assertEqualsSafe; +import static org.apache.iceberg.spark.data.TestHelpers.assertEqualsUnsafe; + import java.io.File; import java.io.IOException; import java.net.URI; @@ -68,10 +71,6 @@ import org.junit.runner.RunWith; import org.junit.runners.Parameterized; -import static org.apache.iceberg.spark.SparkSchemaUtil.convert; -import static org.apache.iceberg.spark.data.TestHelpers.assertEqualsSafe; -import static org.apache.iceberg.spark.data.TestHelpers.assertEqualsUnsafe; - @RunWith(Parameterized.class) public class TestDataFrameWrites extends AvroDataTest { private static final Configuration CONF = new Configuration(); @@ -80,7 +79,7 @@ public class TestDataFrameWrites extends AvroDataTest { @Parameterized.Parameters(name = "format = {0}") public static Object[] parameters() { - return new Object[] { "parquet", "avro", "orc" }; + return new Object[] {"parquet", "avro", "orc"}; } public TestDataFrameWrites(String format) { @@ -92,32 +91,36 @@ public TestDataFrameWrites(String format) { private Map tableProperties; - private org.apache.spark.sql.types.StructType sparkSchema = new org.apache.spark.sql.types.StructType( - new org.apache.spark.sql.types.StructField[] { - new org.apache.spark.sql.types.StructField( - "optionalField", - org.apache.spark.sql.types.DataTypes.StringType, - true, - org.apache.spark.sql.types.Metadata.empty()), - new org.apache.spark.sql.types.StructField( - "requiredField", - org.apache.spark.sql.types.DataTypes.StringType, - false, - org.apache.spark.sql.types.Metadata.empty()) - }); - - private Schema icebergSchema = new Schema( - Types.NestedField.optional(1, "optionalField", Types.StringType.get()), - Types.NestedField.required(2, "requiredField", Types.StringType.get())); - - private List data0 = Arrays.asList( - "{\"optionalField\": \"a1\", \"requiredField\": \"bid_001\"}", - "{\"optionalField\": \"a2\", \"requiredField\": \"bid_002\"}"); - private List data1 = Arrays.asList( - "{\"optionalField\": \"d1\", \"requiredField\": \"bid_101\"}", - "{\"optionalField\": \"d2\", \"requiredField\": \"bid_102\"}", - "{\"optionalField\": \"d3\", \"requiredField\": \"bid_103\"}", - "{\"optionalField\": \"d4\", \"requiredField\": \"bid_104\"}"); + private org.apache.spark.sql.types.StructType sparkSchema = + new org.apache.spark.sql.types.StructType( + new org.apache.spark.sql.types.StructField[] { + new org.apache.spark.sql.types.StructField( + "optionalField", + 
org.apache.spark.sql.types.DataTypes.StringType, + true, + org.apache.spark.sql.types.Metadata.empty()), + new org.apache.spark.sql.types.StructField( + "requiredField", + org.apache.spark.sql.types.DataTypes.StringType, + false, + org.apache.spark.sql.types.Metadata.empty()) + }); + + private Schema icebergSchema = + new Schema( + Types.NestedField.optional(1, "optionalField", Types.StringType.get()), + Types.NestedField.required(2, "requiredField", Types.StringType.get())); + + private List data0 = + Arrays.asList( + "{\"optionalField\": \"a1\", \"requiredField\": \"bid_001\"}", + "{\"optionalField\": \"a2\", \"requiredField\": \"bid_002\"}"); + private List data1 = + Arrays.asList( + "{\"optionalField\": \"d1\", \"requiredField\": \"bid_101\"}", + "{\"optionalField\": \"d2\", \"requiredField\": \"bid_102\"}", + "{\"optionalField\": \"d3\", \"requiredField\": \"bid_103\"}", + "{\"optionalField\": \"d4\", \"requiredField\": \"bid_104\"}"); @BeforeClass public static void startSpark() { @@ -145,8 +148,10 @@ public void testWriteWithCustomDataLocation() throws IOException { File location = createTableFolder(); File tablePropertyDataLocation = temp.newFolder("test-table-property-data-dir"); Table table = createTable(new Schema(SUPPORTED_PRIMITIVES.fields()), location); - table.updateProperties().set( - TableProperties.WRITE_DATA_LOCATION, tablePropertyDataLocation.getAbsolutePath()).commit(); + table + .updateProperties() + .set(TableProperties.WRITE_DATA_LOCATION, tablePropertyDataLocation.getAbsolutePath()) + .commit(); writeAndValidateWithLocations(table, location, tablePropertyDataLocation); } @@ -162,7 +167,8 @@ private Table createTable(Schema schema, File location) { return tables.create(schema, PartitionSpec.unpartitioned(), location.toString()); } - private void writeAndValidateWithLocations(Table table, File location, File expectedDataDir) throws IOException { + private void writeAndValidateWithLocations(Table table, File location, File expectedDataDir) + throws IOException { Schema tableSchema = table.schema(); // use the table schema because ids are reassigned table.updateProperties().set(TableProperties.DEFAULT_FILE_FORMAT, format).commit(); @@ -179,47 +185,56 @@ private void writeAndValidateWithLocations(Table table, File location, File expe while (expectedIter.hasNext() && actualIter.hasNext()) { assertEqualsSafe(tableSchema.asStruct(), expectedIter.next(), actualIter.next()); } - Assert.assertEquals("Both iterators should be exhausted", expectedIter.hasNext(), actualIter.hasNext()); - - table.currentSnapshot().addedFiles().forEach(dataFile -> - Assert.assertTrue( - String.format( - "File should have the parent directory %s, but has: %s.", - expectedDataDir.getAbsolutePath(), - dataFile.path()), - URI.create(dataFile.path().toString()).getPath().startsWith(expectedDataDir.getAbsolutePath()))); + Assert.assertEquals( + "Both iterators should be exhausted", expectedIter.hasNext(), actualIter.hasNext()); + + table + .currentSnapshot() + .addedFiles() + .forEach( + dataFile -> + Assert.assertTrue( + String.format( + "File should have the parent directory %s, but has: %s.", + expectedDataDir.getAbsolutePath(), dataFile.path()), + URI.create(dataFile.path().toString()) + .getPath() + .startsWith(expectedDataDir.getAbsolutePath()))); } private List readTable(String location) { - Dataset result = spark.read() - .format("iceberg") - .load(location); + Dataset result = spark.read().format("iceberg").load(location); return result.collectAsList(); } - private void writeData(Iterable 
records, Schema schema, String location) throws IOException { + private void writeData(Iterable records, Schema schema, String location) + throws IOException { Dataset df = createDataset(records, schema); DataFrameWriter writer = df.write().format("iceberg").mode("append"); writer.save(location); } - private void writeDataWithFailOnPartition(Iterable records, Schema schema, String location) - throws IOException, SparkException { + private void writeDataWithFailOnPartition( + Iterable records, Schema schema, String location) throws IOException, SparkException { final int numPartitions = 10; final int partitionToFail = new Random().nextInt(numPartitions); - MapPartitionsFunction failOnFirstPartitionFunc = (MapPartitionsFunction) input -> { - int partitionId = TaskContext.getPartitionId(); - - if (partitionId == partitionToFail) { - throw new SparkException(String.format("Intended exception in partition %d !", partitionId)); - } - return input; - }; - - Dataset df = createDataset(records, schema) - .repartition(numPartitions) - .mapPartitions(failOnFirstPartitionFunc, RowEncoder.apply(convert(schema))); + MapPartitionsFunction failOnFirstPartitionFunc = + (MapPartitionsFunction) + input -> { + int partitionId = TaskContext.getPartitionId(); + + if (partitionId == partitionToFail) { + throw new SparkException( + String.format("Intended exception in partition %d !", partitionId)); + } + return input; + }; + + Dataset df = + createDataset(records, schema) + .repartition(numPartitions) + .mapPartitions(failOnFirstPartitionFunc, RowEncoder.apply(convert(schema))); // This trick is needed because Spark 3 handles decimal overflow in RowEncoder which "changes" // nullability of the column to "true" regardless of original nullability. // Setting "check-nullability" option to "false" doesn't help as it fails at Spark analyzer. 
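// For illustration only: the lines this comment refers to sit outside the visible hunk, so the
// sketch below shows the general shape of such a workaround, not the code in this patch.
// Rebuilding the Dataset from its RDD with an explicitly supplied StructType keeps the declared
// nullability, instead of the nullability that RowEncoder may have widened to "true". All names
// here are hypothetical.
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SparkSession;
import org.apache.spark.sql.types.StructType;

class NullabilityWorkaroundSketch {
  static Dataset<Row> withDeclaredSchema(SparkSession spark, Dataset<Row> df, StructType schema) {
    // createDataFrame applies the given schema, including its nullable flags, verbatim.
    return spark.createDataFrame(df.rdd(), schema);
  }
}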
@@ -234,10 +249,8 @@ private Dataset createDataset(Iterable records, Schema schema) thro File testFile = temp.newFile(); Assert.assertTrue("Delete should succeed", testFile.delete()); - try (FileAppender writer = Avro.write(Files.localOutput(testFile)) - .schema(schema) - .named("test") - .build()) { + try (FileAppender writer = + Avro.write(Files.localOutput(testFile)).schema(schema).named("test").build()) { for (Record rec : records) { writer.add(rec); } @@ -245,10 +258,11 @@ private Dataset createDataset(Iterable records, Schema schema) thro // make sure the dataframe matches the records before moving on List rows = Lists.newArrayList(); - try (AvroIterable reader = Avro.read(Files.localInput(testFile)) - .createReaderFunc(SparkAvroReader::new) - .project(schema) - .build()) { + try (AvroIterable reader = + Avro.read(Files.localInput(testFile)) + .createReaderFunc(SparkAvroReader::new) + .project(schema) + .build()) { Iterator recordIter = records.iterator(); Iterator readIter = reader.iterator(); @@ -257,7 +271,8 @@ private Dataset createDataset(Iterable records, Schema schema) thro assertEqualsUnsafe(schema.asStruct(), recordIter.next(), row); rows.add(row); } - Assert.assertEquals("Both iterators should be exhausted", recordIter.hasNext(), readIter.hasNext()); + Assert.assertEquals( + "Both iterators should be exhausted", recordIter.hasNext(), readIter.hasNext()); } JavaRDD rdd = sc.parallelize(rows); @@ -266,7 +281,8 @@ private Dataset createDataset(Iterable records, Schema schema) thro @Test public void testNullableWithWriteOption() throws IOException { - Assume.assumeTrue("Spark 3.0 rejects writing nulls to a required column", spark.version().startsWith("2")); + Assume.assumeTrue( + "Spark 3.0 rejects writing nulls to a required column", spark.version().startsWith("2")); File location = new File(temp.newFolder("parquet"), "test"); String sourcePath = String.format("%s/nullable_poc/sourceFolder/", location.toString()); @@ -276,9 +292,11 @@ public void testNullableWithWriteOption() throws IOException { // read this and append to iceberg dataset spark - .read().schema(sparkSchema).json( - JavaSparkContext.fromSparkContext(spark.sparkContext()).parallelize(data1)) - .write().parquet(sourcePath); + .read() + .schema(sparkSchema) + .json(JavaSparkContext.fromSparkContext(spark.sparkContext()).parallelize(data1)) + .write() + .parquet(sourcePath); // this is our iceberg dataset to which we will append data new HadoopTables(spark.sessionState().newHadoopConf()) @@ -290,15 +308,24 @@ public void testNullableWithWriteOption() throws IOException { // this is the initial data inside the iceberg dataset spark - .read().schema(sparkSchema).json( - JavaSparkContext.fromSparkContext(spark.sparkContext()).parallelize(data0)) - .write().format("iceberg").mode(SaveMode.Append).save(targetPath); + .read() + .schema(sparkSchema) + .json(JavaSparkContext.fromSparkContext(spark.sparkContext()).parallelize(data0)) + .write() + .format("iceberg") + .mode(SaveMode.Append) + .save(targetPath); // read from parquet and append to iceberg w/ nullability check disabled spark - .read().schema(SparkSchemaUtil.convert(icebergSchema)).parquet(sourcePath) - .write().format("iceberg").option(SparkWriteOptions.CHECK_NULLABILITY, false) - .mode(SaveMode.Append).save(targetPath); + .read() + .schema(SparkSchemaUtil.convert(icebergSchema)) + .parquet(sourcePath) + .write() + .format("iceberg") + .option(SparkWriteOptions.CHECK_NULLABILITY, false) + .mode(SaveMode.Append) + .save(targetPath); // read all data List rows = 
spark.read().format("iceberg").load(targetPath).collectAsList(); @@ -307,7 +334,8 @@ public void testNullableWithWriteOption() throws IOException { @Test public void testNullableWithSparkSqlOption() throws IOException { - Assume.assumeTrue("Spark 3.0 rejects writing nulls to a required column", spark.version().startsWith("2")); + Assume.assumeTrue( + "Spark 3.0 rejects writing nulls to a required column", spark.version().startsWith("2")); File location = new File(temp.newFolder("parquet"), "test"); String sourcePath = String.format("%s/nullable_poc/sourceFolder/", location.toString()); @@ -317,15 +345,18 @@ public void testNullableWithSparkSqlOption() throws IOException { // read this and append to iceberg dataset spark - .read().schema(sparkSchema).json( - JavaSparkContext.fromSparkContext(spark.sparkContext()).parallelize(data1)) - .write().parquet(sourcePath); - - SparkSession newSparkSession = SparkSession.builder() - .master("local[2]") - .appName("NullableTest") - .config(SparkSQLProperties.CHECK_NULLABILITY, false) - .getOrCreate(); + .read() + .schema(sparkSchema) + .json(JavaSparkContext.fromSparkContext(spark.sparkContext()).parallelize(data1)) + .write() + .parquet(sourcePath); + + SparkSession newSparkSession = + SparkSession.builder() + .master("local[2]") + .appName("NullableTest") + .config(SparkSQLProperties.CHECK_NULLABILITY, false) + .getOrCreate(); // this is our iceberg dataset to which we will append data new HadoopTables(newSparkSession.sessionState().newHadoopConf()) @@ -337,19 +368,27 @@ public void testNullableWithSparkSqlOption() throws IOException { // this is the initial data inside the iceberg dataset newSparkSession - .read().schema(sparkSchema).json( - JavaSparkContext.fromSparkContext(spark.sparkContext()).parallelize(data0)) - .write().format("iceberg").mode(SaveMode.Append).save(targetPath); + .read() + .schema(sparkSchema) + .json(JavaSparkContext.fromSparkContext(spark.sparkContext()).parallelize(data0)) + .write() + .format("iceberg") + .mode(SaveMode.Append) + .save(targetPath); // read from parquet and append to iceberg newSparkSession - .read().schema(SparkSchemaUtil.convert(icebergSchema)).parquet(sourcePath) - .write().format("iceberg").mode(SaveMode.Append).save(targetPath); + .read() + .schema(SparkSchemaUtil.convert(icebergSchema)) + .parquet(sourcePath) + .write() + .format("iceberg") + .mode(SaveMode.Append) + .save(targetPath); // read all data List rows = newSparkSession.read().format("iceberg").load(targetPath).collectAsList(); Assert.assertEquals("Should contain 6 rows", 6, rows.size()); - } @Test diff --git a/spark/v2.4/spark/src/test/java/org/apache/iceberg/spark/source/TestDataSourceOptions.java b/spark/v2.4/spark/src/test/java/org/apache/iceberg/spark/source/TestDataSourceOptions.java index 7655b4b82f13..0e48797480e9 100644 --- a/spark/v2.4/spark/src/test/java/org/apache/iceberg/spark/source/TestDataSourceOptions.java +++ b/spark/v2.4/spark/src/test/java/org/apache/iceberg/spark/source/TestDataSourceOptions.java @@ -16,9 +16,10 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.spark.source; +import static org.apache.iceberg.types.Types.NestedField.optional; + import java.io.IOException; import java.math.RoundingMode; import java.util.List; @@ -54,19 +55,15 @@ import org.junit.Test; import org.junit.rules.TemporaryFolder; -import static org.apache.iceberg.types.Types.NestedField.optional; - public class TestDataSourceOptions { private static final Configuration CONF = new Configuration(); - private static final Schema SCHEMA = new Schema( - optional(1, "id", Types.IntegerType.get()), - optional(2, "data", Types.StringType.get()) - ); + private static final Schema SCHEMA = + new Schema( + optional(1, "id", Types.IntegerType.get()), optional(2, "data", Types.StringType.get())); private static SparkSession spark = null; - @Rule - public TemporaryFolder temp = new TemporaryFolder(); + @Rule public TemporaryFolder temp = new TemporaryFolder(); @BeforeClass public static void startSpark() { @@ -90,23 +87,23 @@ public void testWriteFormatOptionOverridesTableProperties() throws IOException { options.put(TableProperties.DEFAULT_FILE_FORMAT, "avro"); Table table = tables.create(SCHEMA, spec, options, tableLocation); - List expectedRecords = Lists.newArrayList( - new SimpleRecord(1, "a"), - new SimpleRecord(2, "b"), - new SimpleRecord(3, "c") - ); + List expectedRecords = + Lists.newArrayList( + new SimpleRecord(1, "a"), new SimpleRecord(2, "b"), new SimpleRecord(3, "c")); Dataset df = spark.createDataFrame(expectedRecords, SimpleRecord.class); - df.select("id", "data").write() + df.select("id", "data") + .write() .format("iceberg") .option(SparkWriteOptions.WRITE_FORMAT, "parquet") .mode(SaveMode.Append) .save(tableLocation); try (CloseableIterable tasks = table.newScan().planFiles()) { - tasks.forEach(task -> { - FileFormat fileFormat = FileFormat.fromFileName(task.file().path()); - Assert.assertEquals(FileFormat.PARQUET, fileFormat); - }); + tasks.forEach( + task -> { + FileFormat fileFormat = FileFormat.fromFileName(task.file().path()); + Assert.assertEquals(FileFormat.PARQUET, fileFormat); + }); } } @@ -120,22 +117,18 @@ public void testNoWriteFormatOption() throws IOException { options.put(TableProperties.DEFAULT_FILE_FORMAT, "avro"); Table table = tables.create(SCHEMA, spec, options, tableLocation); - List expectedRecords = Lists.newArrayList( - new SimpleRecord(1, "a"), - new SimpleRecord(2, "b"), - new SimpleRecord(3, "c") - ); + List expectedRecords = + Lists.newArrayList( + new SimpleRecord(1, "a"), new SimpleRecord(2, "b"), new SimpleRecord(3, "c")); Dataset df = spark.createDataFrame(expectedRecords, SimpleRecord.class); - df.select("id", "data").write() - .format("iceberg") - .mode("append") - .save(tableLocation); + df.select("id", "data").write().format("iceberg").mode("append").save(tableLocation); try (CloseableIterable tasks = table.newScan().planFiles()) { - tasks.forEach(task -> { - FileFormat fileFormat = FileFormat.fromFileName(task.file().path()); - Assert.assertEquals(FileFormat.AVRO, fileFormat); - }); + tasks.forEach( + task -> { + FileFormat fileFormat = FileFormat.fromFileName(task.file().path()); + Assert.assertEquals(FileFormat.AVRO, fileFormat); + }); } } @@ -155,24 +148,25 @@ public void testHadoopOptions() throws IOException { // to verify that 'hadoop.' 
data source options are propagated correctly sparkHadoopConf.set("fs.default.name", "hdfs://localhost:9000"); - List expectedRecords = Lists.newArrayList( - new SimpleRecord(1, "a"), - new SimpleRecord(2, "b") - ); + List expectedRecords = + Lists.newArrayList(new SimpleRecord(1, "a"), new SimpleRecord(2, "b")); Dataset originalDf = spark.createDataFrame(expectedRecords, SimpleRecord.class); - originalDf.select("id", "data").write() + originalDf + .select("id", "data") + .write() .format("iceberg") .mode("append") .option("hadoop.fs.default.name", "file:///") .save(tableLocation); - Dataset resultDf = spark.read() - .format("iceberg") - .option("hadoop.fs.default.name", "file:///") - .load(tableLocation); - List resultRecords = resultDf.orderBy("id") - .as(Encoders.bean(SimpleRecord.class)) - .collectAsList(); + Dataset resultDf = + spark + .read() + .format("iceberg") + .option("hadoop.fs.default.name", "file:///") + .load(tableLocation); + List resultRecords = + resultDf.orderBy("id").as(Encoders.bean(SimpleRecord.class)).collectAsList(); Assert.assertEquals("Records should match", expectedRecords, resultRecords); } finally { @@ -188,15 +182,16 @@ public void testSplitOptionsOverridesTableProperties() throws IOException { PartitionSpec spec = PartitionSpec.unpartitioned(); Map options = Maps.newHashMap(); options.put(TableProperties.SPLIT_SIZE, String.valueOf(128L * 1024 * 1024)); // 128Mb - options.put(TableProperties.DEFAULT_FILE_FORMAT, String.valueOf(FileFormat.AVRO)); // Arbitrarily splittable + options.put( + TableProperties.DEFAULT_FILE_FORMAT, + String.valueOf(FileFormat.AVRO)); // Arbitrarily splittable Table icebergTable = tables.create(SCHEMA, spec, options, tableLocation); - List expectedRecords = Lists.newArrayList( - new SimpleRecord(1, "a"), - new SimpleRecord(2, "b") - ); + List expectedRecords = + Lists.newArrayList(new SimpleRecord(1, "a"), new SimpleRecord(2, "b")); Dataset originalDf = spark.createDataFrame(expectedRecords, SimpleRecord.class); - originalDf.select("id", "data") + originalDf + .select("id", "data") .repartition(1) .write() .format("iceberg") @@ -209,10 +204,12 @@ public void testSplitOptionsOverridesTableProperties() throws IOException { long fileSize = files.get(0).fileSizeInBytes(); long splitSize = LongMath.divide(fileSize, 2, RoundingMode.CEILING); - Dataset resultDf = spark.read() - .format("iceberg") - .option(SparkReadOptions.SPLIT_SIZE, String.valueOf(splitSize)) - .load(tableLocation); + Dataset resultDf = + spark + .read() + .format("iceberg") + .option(SparkReadOptions.SPLIT_SIZE, String.valueOf(splitSize)) + .load(tableLocation); Assert.assertEquals("Spark partitions should match", 2, resultDf.javaRDD().getNumPartitions()); } @@ -226,18 +223,16 @@ public void testIncrementalScanOptions() throws IOException { Map options = Maps.newHashMap(); Table table = tables.create(SCHEMA, spec, options, tableLocation); - List expectedRecords = Lists.newArrayList( - new SimpleRecord(1, "a"), - new SimpleRecord(2, "b"), - new SimpleRecord(3, "c"), - new SimpleRecord(4, "d") - ); + List expectedRecords = + Lists.newArrayList( + new SimpleRecord(1, "a"), + new SimpleRecord(2, "b"), + new SimpleRecord(3, "c"), + new SimpleRecord(4, "d")); for (SimpleRecord record : expectedRecords) { - Dataset originalDf = spark.createDataFrame(Lists.newArrayList(record), SimpleRecord.class); - originalDf.select("id", "data").write() - .format("iceberg") - .mode("append") - .save(tableLocation); + Dataset originalDf = + spark.createDataFrame(Lists.newArrayList(record), 
SimpleRecord.class); + originalDf.select("id", "data").write().format("iceberg").mode("append").save(tableLocation); } List snapshotIds = SnapshotUtil.currentAncestorIds(table); @@ -247,11 +242,13 @@ public void testIncrementalScanOptions() throws IOException { IllegalArgumentException.class, "Cannot specify start-snapshot-id and end-snapshot-id to do incremental scan", () -> { - spark.read() + spark + .read() .format("iceberg") .option("snapshot-id", snapshotIds.get(3).toString()) .option("start-snapshot-id", snapshotIds.get(3).toString()) - .load(tableLocation).explain(); + .load(tableLocation) + .explain(); }); // end-snapshot-id and as-of-timestamp are both configured. @@ -260,12 +257,15 @@ public void testIncrementalScanOptions() throws IOException { IllegalArgumentException.class, "Cannot specify start-snapshot-id and end-snapshot-id to do incremental scan", () -> { - spark.read() + spark + .read() .format("iceberg") - .option(SparkReadOptions.AS_OF_TIMESTAMP, + .option( + SparkReadOptions.AS_OF_TIMESTAMP, Long.toString(table.snapshot(snapshotIds.get(3)).timestampMillis())) .option("end-snapshot-id", snapshotIds.get(2).toString()) - .load(tableLocation).explain(); + .load(tableLocation) + .explain(); }); // only end-snapshot-id is configured. @@ -274,31 +274,37 @@ public void testIncrementalScanOptions() throws IOException { IllegalArgumentException.class, "Cannot only specify option end-snapshot-id to do incremental scan", () -> { - spark.read() + spark + .read() .format("iceberg") .option("end-snapshot-id", snapshotIds.get(2).toString()) - .load(tableLocation).explain(); + .load(tableLocation) + .explain(); }); // test (1st snapshot, current snapshot] incremental scan. - List result = spark.read() - .format("iceberg") - .option("start-snapshot-id", snapshotIds.get(3).toString()) - .load(tableLocation) - .orderBy("id") - .as(Encoders.bean(SimpleRecord.class)) - .collectAsList(); + List result = + spark + .read() + .format("iceberg") + .option("start-snapshot-id", snapshotIds.get(3).toString()) + .load(tableLocation) + .orderBy("id") + .as(Encoders.bean(SimpleRecord.class)) + .collectAsList(); Assert.assertEquals("Records should match", expectedRecords.subList(1, 4), result); // test (2nd snapshot, 3rd snapshot] incremental scan. 
- List result1 = spark.read() - .format("iceberg") - .option("start-snapshot-id", snapshotIds.get(2).toString()) - .option("end-snapshot-id", snapshotIds.get(1).toString()) - .load(tableLocation) - .orderBy("id") - .as(Encoders.bean(SimpleRecord.class)) - .collectAsList(); + List result1 = + spark + .read() + .format("iceberg") + .option("start-snapshot-id", snapshotIds.get(2).toString()) + .option("end-snapshot-id", snapshotIds.get(1).toString()) + .load(tableLocation) + .orderBy("id") + .as(Encoders.bean(SimpleRecord.class)) + .collectAsList(); Assert.assertEquals("Records should match", expectedRecords.subList(2, 3), result1); } @@ -311,41 +317,34 @@ public void testMetadataSplitSizeOptionOverrideTableProperties() throws IOExcept Map options = Maps.newHashMap(); Table table = tables.create(SCHEMA, spec, options, tableLocation); - List expectedRecords = Lists.newArrayList( - new SimpleRecord(1, "a"), - new SimpleRecord(2, "b") - ); + List expectedRecords = + Lists.newArrayList(new SimpleRecord(1, "a"), new SimpleRecord(2, "b")); Dataset originalDf = spark.createDataFrame(expectedRecords, SimpleRecord.class); // produce 1st manifest - originalDf.select("id", "data").write() - .format("iceberg") - .mode("append") - .save(tableLocation); + originalDf.select("id", "data").write().format("iceberg").mode("append").save(tableLocation); // produce 2nd manifest - originalDf.select("id", "data").write() - .format("iceberg") - .mode("append") - .save(tableLocation); + originalDf.select("id", "data").write().format("iceberg").mode("append").save(tableLocation); List manifests = table.currentSnapshot().allManifests(); Assert.assertEquals("Must be 2 manifests", 2, manifests.size()); // set the target metadata split size so each manifest ends up in a separate split - table.updateProperties() + table + .updateProperties() .set(TableProperties.METADATA_SPLIT_SIZE, String.valueOf(manifests.get(0).length())) .commit(); - Dataset entriesDf = spark.read() - .format("iceberg") - .load(tableLocation + "#entries"); + Dataset entriesDf = spark.read().format("iceberg").load(tableLocation + "#entries"); Assert.assertEquals("Num partitions must match", 2, entriesDf.javaRDD().getNumPartitions()); // override the table property using options - entriesDf = spark.read() - .format("iceberg") - .option(SparkReadOptions.SPLIT_SIZE, String.valueOf(128 * 1024 * 1024)) - .load(tableLocation + "#entries"); + entriesDf = + spark + .read() + .format("iceberg") + .option(SparkReadOptions.SPLIT_SIZE, String.valueOf(128 * 1024 * 1024)) + .load(tableLocation + "#entries"); Assert.assertEquals("Num partitions must match", 1, entriesDf.javaRDD().getNumPartitions()); } @@ -358,24 +357,26 @@ public void testDefaultMetadataSplitSize() throws IOException { Map options = Maps.newHashMap(); tables.create(SCHEMA, spec, options, tableLocation); - List expectedRecords = Lists.newArrayList( - new SimpleRecord(1, "a"), - new SimpleRecord(2, "b") - ); + List expectedRecords = + Lists.newArrayList(new SimpleRecord(1, "a"), new SimpleRecord(2, "b")); Dataset originalDf = spark.createDataFrame(expectedRecords, SimpleRecord.class); - originalDf.select("id", "data").write() - .format("iceberg") - .mode("append") - .save(tableLocation); + originalDf.select("id", "data").write().format("iceberg").mode("append").save(tableLocation); int splitSize = (int) TableProperties.METADATA_SPLIT_SIZE_DEFAULT; // 32MB split size - int expectedSplits = ((int) tables.load(tableLocation + "#entries") - .currentSnapshot().allManifests().get(0).length() + splitSize - 1) 
/ splitSize; + int expectedSplits = + ((int) + tables + .load(tableLocation + "#entries") + .currentSnapshot() + .allManifests() + .get(0) + .length() + + splitSize + - 1) + / splitSize; - Dataset metadataDf = spark.read() - .format("iceberg") - .load(tableLocation + "#entries"); + Dataset metadataDf = spark.read().format("iceberg").load(tableLocation + "#entries"); int partitionNum = metadataDf.javaRDD().getNumPartitions(); Assert.assertEquals("Spark partitions should match", expectedSplits, partitionNum); @@ -387,17 +388,17 @@ public void testExtraSnapshotMetadata() throws IOException { HadoopTables tables = new HadoopTables(CONF); tables.create(SCHEMA, PartitionSpec.unpartitioned(), Maps.newHashMap(), tableLocation); - List expectedRecords = Lists.newArrayList( - new SimpleRecord(1, "a"), - new SimpleRecord(2, "b") - ); + List expectedRecords = + Lists.newArrayList(new SimpleRecord(1, "a"), new SimpleRecord(2, "b")); Dataset originalDf = spark.createDataFrame(expectedRecords, SimpleRecord.class); - originalDf.select("id", "data").write() - .format("iceberg") - .mode("append") - .option(SparkWriteOptions.SNAPSHOT_PROPERTY_PREFIX + ".extra-key", "someValue") - .option(SparkWriteOptions.SNAPSHOT_PROPERTY_PREFIX + ".another-key", "anotherValue") - .save(tableLocation); + originalDf + .select("id", "data") + .write() + .format("iceberg") + .mode("append") + .option(SparkWriteOptions.SNAPSHOT_PROPERTY_PREFIX + ".extra-key", "someValue") + .option(SparkWriteOptions.SNAPSHOT_PROPERTY_PREFIX + ".another-key", "anotherValue") + .save(tableLocation); Table table = tables.load(tableLocation); diff --git a/spark/v2.4/spark/src/test/java/org/apache/iceberg/spark/source/TestFilteredScan.java b/spark/v2.4/spark/src/test/java/org/apache/iceberg/spark/source/TestFilteredScan.java index f006ba49f370..2dc61d943547 100644 --- a/spark/v2.4/spark/src/test/java/org/apache/iceberg/spark/source/TestFilteredScan.java +++ b/spark/v2.4/spark/src/test/java/org/apache/iceberg/spark/source/TestFilteredScan.java @@ -16,9 +16,13 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.spark.source; +import static org.apache.iceberg.Files.localOutput; +import static org.apache.spark.sql.catalyst.util.DateTimeUtils.fromJavaTimestamp; +import static org.apache.spark.sql.functions.callUDF; +import static org.apache.spark.sql.functions.column; + import java.io.File; import java.io.IOException; import java.sql.Timestamp; @@ -78,41 +82,31 @@ import org.junit.runner.RunWith; import org.junit.runners.Parameterized; -import static org.apache.iceberg.Files.localOutput; -import static org.apache.spark.sql.catalyst.util.DateTimeUtils.fromJavaTimestamp; -import static org.apache.spark.sql.functions.callUDF; -import static org.apache.spark.sql.functions.column; - @RunWith(Parameterized.class) public class TestFilteredScan { private static final Configuration CONF = new Configuration(); private static final HadoopTables TABLES = new HadoopTables(CONF); - private static final Schema SCHEMA = new Schema( - Types.NestedField.required(1, "id", Types.LongType.get()), - Types.NestedField.optional(2, "ts", Types.TimestampType.withZone()), - Types.NestedField.optional(3, "data", Types.StringType.get()) - ); + private static final Schema SCHEMA = + new Schema( + Types.NestedField.required(1, "id", Types.LongType.get()), + Types.NestedField.optional(2, "ts", Types.TimestampType.withZone()), + Types.NestedField.optional(3, "data", Types.StringType.get())); - private static final PartitionSpec BUCKET_BY_ID = PartitionSpec.builderFor(SCHEMA) - .bucket("id", 4) - .build(); + private static final PartitionSpec BUCKET_BY_ID = + PartitionSpec.builderFor(SCHEMA).bucket("id", 4).build(); - private static final PartitionSpec PARTITION_BY_DAY = PartitionSpec.builderFor(SCHEMA) - .day("ts") - .build(); + private static final PartitionSpec PARTITION_BY_DAY = + PartitionSpec.builderFor(SCHEMA).day("ts").build(); - private static final PartitionSpec PARTITION_BY_HOUR = PartitionSpec.builderFor(SCHEMA) - .hour("ts") - .build(); + private static final PartitionSpec PARTITION_BY_HOUR = + PartitionSpec.builderFor(SCHEMA).hour("ts").build(); - private static final PartitionSpec PARTITION_BY_DATA = PartitionSpec.builderFor(SCHEMA) - .identity("data") - .build(); + private static final PartitionSpec PARTITION_BY_DATA = + PartitionSpec.builderFor(SCHEMA).identity("data").build(); - private static final PartitionSpec PARTITION_BY_ID = PartitionSpec.builderFor(SCHEMA) - .identity("id") - .build(); + private static final PartitionSpec PARTITION_BY_ID = + PartitionSpec.builderFor(SCHEMA).identity("id").build(); private static SparkSession spark = null; @@ -125,14 +119,20 @@ public static void startSpark() { spark.udf().register("bucket4", (UDF1) bucket4::apply, IntegerType$.MODULE$); Transform day = Transforms.day(Types.TimestampType.withZone()); - spark.udf().register("ts_day", - (UDF1) timestamp -> day.apply((Long) fromJavaTimestamp(timestamp)), - IntegerType$.MODULE$); + spark + .udf() + .register( + "ts_day", + (UDF1) timestamp -> day.apply((Long) fromJavaTimestamp(timestamp)), + IntegerType$.MODULE$); Transform hour = Transforms.hour(Types.TimestampType.withZone()); - spark.udf().register("ts_hour", - (UDF1) timestamp -> hour.apply((Long) fromJavaTimestamp(timestamp)), - IntegerType$.MODULE$); + spark + .udf() + .register( + "ts_hour", + (UDF1) timestamp -> hour.apply((Long) fromJavaTimestamp(timestamp)), + IntegerType$.MODULE$); spark.udf().register("data_ident", (UDF1) data -> data, StringType$.MODULE$); spark.udf().register("id_ident", (UDF1) id -> id, LongType$.MODULE$); @@ -145,8 
+145,7 @@ public static void stopSpark() { currentSpark.stop(); } - @Rule - public TemporaryFolder temp = new TemporaryFolder(); + @Rule public TemporaryFolder temp = new TemporaryFolder(); private final String format; private final boolean vectorized; @@ -154,11 +153,11 @@ public static void stopSpark() { @Parameterized.Parameters(name = "format = {0}, vectorized = {1}") public static Object[][] parameters() { return new Object[][] { - { "parquet", false }, - { "parquet", true }, - { "avro", false }, - { "orc", false }, - { "orc", true } + {"parquet", false}, + {"parquet", true}, + {"avro", false}, + {"orc", false}, + {"orc", true} }; } @@ -188,25 +187,25 @@ public void writeUnpartitionedTable() throws IOException { // create records using the table's schema this.records = testRecords(tableSchema); - try (FileAppender writer = new GenericAppenderFactory(tableSchema).newAppender( - localOutput(testFile), fileFormat)) { + try (FileAppender writer = + new GenericAppenderFactory(tableSchema).newAppender(localOutput(testFile), fileFormat)) { writer.addAll(records); } - DataFile file = DataFiles.builder(PartitionSpec.unpartitioned()) - .withRecordCount(records.size()) - .withFileSizeInBytes(testFile.length()) - .withPath(testFile.toString()) - .build(); + DataFile file = + DataFiles.builder(PartitionSpec.unpartitioned()) + .withRecordCount(records.size()) + .withFileSizeInBytes(testFile.length()) + .withPath(testFile.toString()) + .build(); table.newAppend().appendFile(file).commit(); } @Test public void testUnpartitionedIDFilters() { - DataSourceOptions options = new DataSourceOptions(ImmutableMap.of( - "path", unpartitioned.toString()) - ); + DataSourceOptions options = + new DataSourceOptions(ImmutableMap.of("path", unpartitioned.toString())); IcebergSource source = new IcebergSource(); @@ -219,16 +218,15 @@ public void testUnpartitionedIDFilters() { Assert.assertEquals("Should only create one task for a small file", 1, tasks.size()); // validate row filtering - assertEqualsSafe(SCHEMA.asStruct(), expected(i), - read(unpartitioned.toString(), vectorized, "id = " + i)); + assertEqualsSafe( + SCHEMA.asStruct(), expected(i), read(unpartitioned.toString(), vectorized, "id = " + i)); } } @Test public void testUnpartitionedCaseInsensitiveIDFilters() { - DataSourceOptions options = new DataSourceOptions(ImmutableMap.of( - "path", unpartitioned.toString()) - ); + DataSourceOptions options = + new DataSourceOptions(ImmutableMap.of("path", unpartitioned.toString())); // set spark.sql.caseSensitive to false String caseSensitivityBeforeTest = TestFilteredScan.spark.conf().get("spark.sql.caseSensitive"); @@ -240,13 +238,17 @@ public void testUnpartitionedCaseInsensitiveIDFilters() { for (int i = 0; i < 10; i += 1) { DataSourceReader reader = source.createReader(options); - pushFilters(reader, EqualTo.apply("ID", i)); // note lower(ID) == lower(id), so there must be a match + pushFilters( + reader, + EqualTo.apply("ID", i)); // note lower(ID) == lower(id), so there must be a match List> tasks = reader.planInputPartitions(); Assert.assertEquals("Should only create one task for a small file", 1, tasks.size()); // validate row filtering - assertEqualsSafe(SCHEMA.asStruct(), expected(i), + assertEqualsSafe( + SCHEMA.asStruct(), + expected(i), read(unpartitioned.toString(), vectorized, "id = " + i)); } } finally { @@ -257,9 +259,8 @@ public void testUnpartitionedCaseInsensitiveIDFilters() { @Test public void testUnpartitionedTimestampFilter() { - DataSourceOptions options = new 
DataSourceOptions(ImmutableMap.of( - "path", unpartitioned.toString()) - ); + DataSourceOptions options = + new DataSourceOptions(ImmutableMap.of("path", unpartitioned.toString())); IcebergSource source = new IcebergSource(); @@ -270,22 +271,25 @@ public void testUnpartitionedTimestampFilter() { List> tasks = reader.planInputPartitions(); Assert.assertEquals("Should only create one task for a small file", 1, tasks.size()); - assertEqualsSafe(SCHEMA.asStruct(), expected(5, 6, 7, 8, 9), - read(unpartitioned.toString(), vectorized, "ts < cast('2017-12-22 00:00:00+00:00' as timestamp)")); + assertEqualsSafe( + SCHEMA.asStruct(), + expected(5, 6, 7, 8, 9), + read( + unpartitioned.toString(), + vectorized, + "ts < cast('2017-12-22 00:00:00+00:00' as timestamp)")); } @Test public void testBucketPartitionedIDFilters() { File location = buildPartitionedTable("bucketed_by_id", BUCKET_BY_ID, "bucket4", "id"); - DataSourceOptions options = new DataSourceOptions(ImmutableMap.of( - "path", location.toString()) - ); + DataSourceOptions options = new DataSourceOptions(ImmutableMap.of("path", location.toString())); IcebergSource source = new IcebergSource(); DataSourceReader unfiltered = source.createReader(options); - Assert.assertEquals("Unfiltered table should created 4 read tasks", - 4, unfiltered.planInputPartitions().size()); + Assert.assertEquals( + "Unfiltered table should created 4 read tasks", 4, unfiltered.planInputPartitions().size()); for (int i = 0; i < 10; i += 1) { DataSourceReader reader = source.createReader(options); @@ -298,7 +302,8 @@ public void testBucketPartitionedIDFilters() { Assert.assertEquals("Should create one task for a single bucket", 1, tasks.size()); // validate row filtering - assertEqualsSafe(SCHEMA.asStruct(), expected(i), read(location.toString(), vectorized, "id = " + i)); + assertEqualsSafe( + SCHEMA.asStruct(), expected(i), read(location.toString(), vectorized, "id = " + i)); } } @@ -307,14 +312,12 @@ public void testBucketPartitionedIDFilters() { public void testDayPartitionedTimestampFilters() { File location = buildPartitionedTable("partitioned_by_day", PARTITION_BY_DAY, "ts_day", "ts"); - DataSourceOptions options = new DataSourceOptions(ImmutableMap.of( - "path", location.toString()) - ); + DataSourceOptions options = new DataSourceOptions(ImmutableMap.of("path", location.toString())); IcebergSource source = new IcebergSource(); DataSourceReader unfiltered = source.createReader(options); - Assert.assertEquals("Unfiltered table should created 2 read tasks", - 2, unfiltered.planInputPartitions().size()); + Assert.assertEquals( + "Unfiltered table should created 2 read tasks", 2, unfiltered.planInputPartitions().size()); { DataSourceReader reader = source.createReader(options); @@ -324,39 +327,50 @@ public void testDayPartitionedTimestampFilters() { List> tasks = reader.planInputPartitions(); Assert.assertEquals("Should create one task for 2017-12-21", 1, tasks.size()); - assertEqualsSafe(SCHEMA.asStruct(), expected(5, 6, 7, 8, 9), - read(location.toString(), vectorized, "ts < cast('2017-12-22 00:00:00+00:00' as timestamp)")); + assertEqualsSafe( + SCHEMA.asStruct(), + expected(5, 6, 7, 8, 9), + read( + location.toString(), + vectorized, + "ts < cast('2017-12-22 00:00:00+00:00' as timestamp)")); } { DataSourceReader reader = source.createReader(options); - pushFilters(reader, And.apply( - GreaterThan.apply("ts", "2017-12-22T06:00:00+00:00"), - LessThan.apply("ts", "2017-12-22T08:00:00+00:00"))); + pushFilters( + reader, + And.apply( + GreaterThan.apply("ts", 
"2017-12-22T06:00:00+00:00"), + LessThan.apply("ts", "2017-12-22T08:00:00+00:00"))); List> tasks = reader.planInputPartitions(); Assert.assertEquals("Should create one task for 2017-12-22", 1, tasks.size()); - assertEqualsSafe(SCHEMA.asStruct(), expected(1, 2), read(location.toString(), vectorized, - "ts > cast('2017-12-22 06:00:00+00:00' as timestamp) and " + - "ts < cast('2017-12-22 08:00:00+00:00' as timestamp)")); + assertEqualsSafe( + SCHEMA.asStruct(), + expected(1, 2), + read( + location.toString(), + vectorized, + "ts > cast('2017-12-22 06:00:00+00:00' as timestamp) and " + + "ts < cast('2017-12-22 08:00:00+00:00' as timestamp)")); } } @SuppressWarnings("checkstyle:AvoidNestedBlocks") @Test public void testHourPartitionedTimestampFilters() { - File location = buildPartitionedTable("partitioned_by_hour", PARTITION_BY_HOUR, "ts_hour", "ts"); + File location = + buildPartitionedTable("partitioned_by_hour", PARTITION_BY_HOUR, "ts_hour", "ts"); - DataSourceOptions options = new DataSourceOptions(ImmutableMap.of( - "path", location.toString()) - ); + DataSourceOptions options = new DataSourceOptions(ImmutableMap.of("path", location.toString())); IcebergSource source = new IcebergSource(); DataSourceReader unfiltered = source.createReader(options); - Assert.assertEquals("Unfiltered table should created 9 read tasks", - 9, unfiltered.planInputPartitions().size()); + Assert.assertEquals( + "Unfiltered table should created 9 read tasks", 9, unfiltered.planInputPartitions().size()); { DataSourceReader reader = source.createReader(options); @@ -366,23 +380,35 @@ public void testHourPartitionedTimestampFilters() { List> tasks = reader.planInputPartitions(); Assert.assertEquals("Should create 4 tasks for 2017-12-21: 15, 17, 21, 22", 4, tasks.size()); - assertEqualsSafe(SCHEMA.asStruct(), expected(8, 9, 7, 6, 5), - read(location.toString(), vectorized, "ts < cast('2017-12-22 00:00:00+00:00' as timestamp)")); + assertEqualsSafe( + SCHEMA.asStruct(), + expected(8, 9, 7, 6, 5), + read( + location.toString(), + vectorized, + "ts < cast('2017-12-22 00:00:00+00:00' as timestamp)")); } { DataSourceReader reader = source.createReader(options); - pushFilters(reader, And.apply( - GreaterThan.apply("ts", "2017-12-22T06:00:00+00:00"), - LessThan.apply("ts", "2017-12-22T08:00:00+00:00"))); + pushFilters( + reader, + And.apply( + GreaterThan.apply("ts", "2017-12-22T06:00:00+00:00"), + LessThan.apply("ts", "2017-12-22T08:00:00+00:00"))); List> tasks = reader.planInputPartitions(); Assert.assertEquals("Should create 2 tasks for 2017-12-22: 6, 7", 2, tasks.size()); - assertEqualsSafe(SCHEMA.asStruct(), expected(2, 1), read(location.toString(), vectorized, - "ts > cast('2017-12-22 06:00:00+00:00' as timestamp) and " + - "ts < cast('2017-12-22 08:00:00+00:00' as timestamp)")); + assertEqualsSafe( + SCHEMA.asStruct(), + expected(2, 1), + read( + location.toString(), + vectorized, + "ts > cast('2017-12-22 06:00:00+00:00' as timestamp) and " + + "ts < cast('2017-12-22 08:00:00+00:00' as timestamp)")); } } @@ -396,10 +422,15 @@ public void testFilterByNonProjectedColumn() { expected.add(projectFlat(actualProjection, rec)); } - assertEqualsSafe(actualProjection.asStruct(), expected, read( - unpartitioned.toString(), vectorized, - "ts < cast('2017-12-22 00:00:00+00:00' as timestamp)", - "id", "data")); + assertEqualsSafe( + actualProjection.asStruct(), + expected, + read( + unpartitioned.toString(), + vectorized, + "ts < cast('2017-12-22 00:00:00+00:00' as timestamp)", + "id", + "data")); } { @@ -411,57 +442,63 @@ public 
void testFilterByNonProjectedColumn() { expected.add(projectFlat(actualProjection, rec)); } - assertEqualsSafe(actualProjection.asStruct(), expected, read( - unpartitioned.toString(), vectorized, - "ts > cast('2017-12-22 06:00:00+00:00' as timestamp) and " + - "ts < cast('2017-12-22 08:00:00+00:00' as timestamp)", - "id")); + assertEqualsSafe( + actualProjection.asStruct(), + expected, + read( + unpartitioned.toString(), + vectorized, + "ts > cast('2017-12-22 06:00:00+00:00' as timestamp) and " + + "ts < cast('2017-12-22 08:00:00+00:00' as timestamp)", + "id")); } } @Test public void testInFilter() { - File location = buildPartitionedTable("partitioned_by_data", PARTITION_BY_DATA, "data_ident", "data"); + File location = + buildPartitionedTable("partitioned_by_data", PARTITION_BY_DATA, "data_ident", "data"); - DataSourceOptions options = new DataSourceOptions(ImmutableMap.of( - "path", location.toString()) - ); + DataSourceOptions options = new DataSourceOptions(ImmutableMap.of("path", location.toString())); IcebergSource source = new IcebergSource(); DataSourceReader reader = source.createReader(options); - pushFilters(reader, new In("data", new String[]{"foo", "junction", "brush", null})); + pushFilters(reader, new In("data", new String[] {"foo", "junction", "brush", null})); Assert.assertEquals(2, reader.planInputPartitions().size()); } @Test public void testInFilterForTimestamp() { - File location = buildPartitionedTable("partitioned_by_hour", PARTITION_BY_HOUR, "ts_hour", "ts"); + File location = + buildPartitionedTable("partitioned_by_hour", PARTITION_BY_HOUR, "ts_hour", "ts"); - DataSourceOptions options = new DataSourceOptions(ImmutableMap.of( - "path", location.toString()) - ); + DataSourceOptions options = new DataSourceOptions(ImmutableMap.of("path", location.toString())); IcebergSource source = new IcebergSource(); DataSourceReader reader = source.createReader(options); - pushFilters(reader, new In("ts", new Timestamp[]{ - new Timestamp(instant("2017-12-22T00:00:00.123+00:00") / 1000), - new Timestamp(instant("2017-12-22T09:20:44.294+00:00") / 1000), - new Timestamp(instant("2017-12-22T00:34:00.184+00:00") / 1000), - new Timestamp(instant("2017-12-21T15:15:16.230+00:00") / 1000), - null - })); - - Assert.assertEquals("Should create 1 task for 2017-12-21: 15", 1, reader.planInputPartitions().size()); + pushFilters( + reader, + new In( + "ts", + new Timestamp[] { + new Timestamp(instant("2017-12-22T00:00:00.123+00:00") / 1000), + new Timestamp(instant("2017-12-22T09:20:44.294+00:00") / 1000), + new Timestamp(instant("2017-12-22T00:34:00.184+00:00") / 1000), + new Timestamp(instant("2017-12-21T15:15:16.230+00:00") / 1000), + null + })); + + Assert.assertEquals( + "Should create 1 task for 2017-12-21: 15", 1, reader.planInputPartitions().size()); } @Test public void testPartitionedByDataStartsWithFilter() { - File location = buildPartitionedTable("partitioned_by_data", PARTITION_BY_DATA, "data_ident", "data"); + File location = + buildPartitionedTable("partitioned_by_data", PARTITION_BY_DATA, "data_ident", "data"); - DataSourceOptions options = new DataSourceOptions(ImmutableMap.of( - "path", location.toString()) - ); + DataSourceOptions options = new DataSourceOptions(ImmutableMap.of("path", location.toString())); IcebergSource source = new IcebergSource(); DataSourceReader reader = source.createReader(options); @@ -474,9 +511,7 @@ public void testPartitionedByDataStartsWithFilter() { public void testPartitionedByIdStartsWith() { File location = 
buildPartitionedTable("partitioned_by_id", PARTITION_BY_ID, "id_ident", "id"); - DataSourceOptions options = new DataSourceOptions(ImmutableMap.of( - "path", location.toString()) - ); + DataSourceOptions options = new DataSourceOptions(ImmutableMap.of("path", location.toString())); IcebergSource source = new IcebergSource(); DataSourceReader reader = source.createReader(options); @@ -487,15 +522,15 @@ public void testPartitionedByIdStartsWith() { @Test public void testUnpartitionedStartsWith() { - Dataset df = spark.read() - .format("iceberg") - .option(SparkReadOptions.VECTORIZATION_ENABLED, String.valueOf(vectorized)) - .load(unpartitioned.toString()); + Dataset df = + spark + .read() + .format("iceberg") + .option(SparkReadOptions.VECTORIZATION_ENABLED, String.valueOf(vectorized)) + .load(unpartitioned.toString()); - List matchedData = df.select("data") - .where("data LIKE 'jun%'") - .as(Encoders.STRING()) - .collectAsList(); + List matchedData = + df.select("data").where("data LIKE 'jun%'").as(Encoders.STRING()).collectAsList(); Assert.assertEquals(1, matchedData.size()); Assert.assertEquals("junction", matchedData.get(0)); @@ -511,8 +546,8 @@ private static Record projectFlat(Schema projection, Record record) { return result; } - public static void assertEqualsUnsafe(Types.StructType struct, - List expected, List actual) { + public static void assertEqualsUnsafe( + Types.StructType struct, List expected, List actual) { // TODO: match records by ID int numRecords = Math.min(expected.size(), actual.size()); for (int i = 0; i < numRecords; i += 1) { @@ -521,8 +556,8 @@ public static void assertEqualsUnsafe(Types.StructType struct, Assert.assertEquals("Number of results should match expected", expected.size(), actual.size()); } - public static void assertEqualsSafe(Types.StructType struct, - List expected, List actual) { + public static void assertEqualsSafe( + Types.StructType struct, List expected, List actual) { // TODO: match records by ID int numRecords = Math.min(expected.size(), actual.size()); for (int i = 0; i < numRecords; i += 1) { @@ -545,7 +580,8 @@ private void pushFilters(DataSourceReader reader, Filter... 
filters) { filterable.pushFilters(filters); } - private File buildPartitionedTable(String desc, PartitionSpec spec, String udf, String partitionColumn) { + private File buildPartitionedTable( + String desc, PartitionSpec spec, String udf, String partitionColumn) { File location = new File(parent, desc); Table byId = TABLES.create(SCHEMA, spec, location.toString()); @@ -554,10 +590,12 @@ private File buildPartitionedTable(String desc, PartitionSpec spec, String udf, byId.updateProperties().set("read.split.target-size", "2048").commit(); // copy the unpartitioned table into the partitioned table to produce the partitioned data - Dataset allRows = spark.read() - .format("iceberg") - .option(SparkReadOptions.VECTORIZATION_ENABLED, String.valueOf(vectorized)) - .load(unpartitioned.toString()); + Dataset allRows = + spark + .read() + .format("iceberg") + .option(SparkReadOptions.VECTORIZATION_ENABLED, String.valueOf(vectorized)) + .load(unpartitioned.toString()); allRows .coalesce(1) // ensure only 1 file per partition is written @@ -583,19 +621,23 @@ private List testRecords(Schema schema) { record(schema, 6L, parse("2017-12-21T21:55:30.589712+00:00"), "element"), record(schema, 7L, parse("2017-12-21T17:31:14.532797+00:00"), "limited"), record(schema, 8L, parse("2017-12-21T15:21:51.237521+00:00"), "global"), - record(schema, 9L, parse("2017-12-21T15:02:15.230570+00:00"), "goldfish") - ); + record(schema, 9L, parse("2017-12-21T15:02:15.230570+00:00"), "goldfish")); } private static List read(String table, boolean vectorized, String expr) { return read(table, vectorized, expr, "*"); } - private static List read(String table, boolean vectorized, String expr, String select0, String... selectN) { - Dataset dataset = spark.read().format("iceberg") - .option(SparkReadOptions.VECTORIZATION_ENABLED, String.valueOf(vectorized)) - .load(table).filter(expr) - .select(select0, selectN); + private static List read( + String table, boolean vectorized, String expr, String select0, String... selectN) { + Dataset dataset = + spark + .read() + .format("iceberg") + .option(SparkReadOptions.VECTORIZATION_ENABLED, String.valueOf(vectorized)) + .load(table) + .filter(expr) + .select(select0, selectN); return dataset.collectAsList(); } diff --git a/spark/v2.4/spark/src/test/java/org/apache/iceberg/spark/source/TestForwardCompatibility.java b/spark/v2.4/spark/src/test/java/org/apache/iceberg/spark/source/TestForwardCompatibility.java index 4b16099bc5dd..0237e4d63bbd 100644 --- a/spark/v2.4/spark/src/test/java/org/apache/iceberg/spark/source/TestForwardCompatibility.java +++ b/spark/v2.4/spark/src/test/java/org/apache/iceberg/spark/source/TestForwardCompatibility.java @@ -16,9 +16,11 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.spark.source; +import static org.apache.iceberg.Files.localInput; +import static org.apache.iceberg.Files.localOutput; + import java.io.File; import java.io.IOException; import java.util.List; @@ -61,25 +63,26 @@ import org.junit.rules.TemporaryFolder; import scala.collection.JavaConversions; -import static org.apache.iceberg.Files.localInput; -import static org.apache.iceberg.Files.localOutput; - public class TestForwardCompatibility { private static final Configuration CONF = new Configuration(); - private static final Schema SCHEMA = new Schema( - Types.NestedField.optional(1, "id", Types.LongType.get()), - Types.NestedField.optional(2, "data", Types.StringType.get())); + private static final Schema SCHEMA = + new Schema( + Types.NestedField.optional(1, "id", Types.LongType.get()), + Types.NestedField.optional(2, "data", Types.StringType.get())); // create a spec for the schema that uses a "zero" transform that produces all 0s - private static final PartitionSpec UNKNOWN_SPEC = PartitionSpecParser.fromJson(SCHEMA, - "{ \"spec-id\": 0, \"fields\": [ { \"name\": \"id_zero\", \"transform\": \"zero\", \"source-id\": 1 } ] }"); + private static final PartitionSpec UNKNOWN_SPEC = + PartitionSpecParser.fromJson( + SCHEMA, + "{ \"spec-id\": 0, \"fields\": [ { \"name\": \"id_zero\", \"transform\": \"zero\", \"source-id\": 1 } ] }"); // create a fake spec to use to write table metadata - private static final PartitionSpec FAKE_SPEC = PartitionSpecParser.fromJson(SCHEMA, - "{ \"spec-id\": 0, \"fields\": [ { \"name\": \"id_zero\", \"transform\": \"identity\", \"source-id\": 1 } ] }"); + private static final PartitionSpec FAKE_SPEC = + PartitionSpecParser.fromJson( + SCHEMA, + "{ \"spec-id\": 0, \"fields\": [ { \"name\": \"id_zero\", \"transform\": \"identity\", \"source-id\": 1 } ] }"); - @Rule - public TemporaryFolder temp = new TemporaryFolder(); + @Rule public TemporaryFolder temp = new TemporaryFolder(); private static SparkSession spark = null; @@ -105,20 +108,22 @@ public void testSparkWriteFailsUnknownTransform() throws IOException { HadoopTables tables = new HadoopTables(CONF); tables.create(SCHEMA, UNKNOWN_SPEC, location.toString()); - List expected = Lists.newArrayList( - new SimpleRecord(1, "a"), - new SimpleRecord(2, "b"), - new SimpleRecord(3, "c") - ); + List expected = + Lists.newArrayList( + new SimpleRecord(1, "a"), new SimpleRecord(2, "b"), new SimpleRecord(3, "c")); Dataset df = spark.createDataFrame(expected, SimpleRecord.class); - AssertHelpers.assertThrows("Should reject write with unsupported transform", - UnsupportedOperationException.class, "Cannot write using unsupported transforms: zero", - () -> df.select("id", "data").write() - .format("iceberg") - .mode("append") - .save(location.toString())); + AssertHelpers.assertThrows( + "Should reject write with unsupported transform", + UnsupportedOperationException.class, + "Cannot write using unsupported transforms: zero", + () -> + df.select("id", "data") + .write() + .format("iceberg") + .mode("append") + .save(location.toString())); } @Test @@ -134,20 +139,24 @@ public void testSparkStreamingWriteFailsUnknownTransform() throws IOException { tables.create(SCHEMA, UNKNOWN_SPEC, location.toString()); MemoryStream inputStream = newMemoryStream(1, spark.sqlContext(), Encoders.INT()); - StreamingQuery query = inputStream.toDF() - .selectExpr("value AS id", "CAST (value AS STRING) AS data") - .writeStream() - .outputMode("append") - .format("iceberg") - .option("checkpointLocation", 
checkpoint.toString()) - .option("path", location.toString()) - .start(); + StreamingQuery query = + inputStream + .toDF() + .selectExpr("value AS id", "CAST (value AS STRING) AS data") + .writeStream() + .outputMode("append") + .format("iceberg") + .option("checkpointLocation", checkpoint.toString()) + .option("path", location.toString()) + .start(); List batch1 = Lists.newArrayList(1, 2); send(batch1, inputStream); - AssertHelpers.assertThrows("Should reject streaming write with unsupported transform", - StreamingQueryException.class, "Cannot write using unsupported transforms: zero", + AssertHelpers.assertThrows( + "Should reject streaming write with unsupported transform", + StreamingQueryException.class, + "Cannot write using unsupported transforms: zero", query::processAllAvailable); } @@ -166,22 +175,22 @@ public void testSparkCanReadUnknownTransform() throws IOException { List expected = RandomData.generateList(table.schema(), 100, 1L); - File parquetFile = new File(dataFolder, - FileFormat.PARQUET.addExtension(UUID.randomUUID().toString())); - FileAppender writer = Parquet.write(localOutput(parquetFile)) - .schema(table.schema()) - .build(); + File parquetFile = + new File(dataFolder, FileFormat.PARQUET.addExtension(UUID.randomUUID().toString())); + FileAppender writer = + Parquet.write(localOutput(parquetFile)).schema(table.schema()).build(); try { writer.addAll(expected); } finally { writer.close(); } - DataFile file = DataFiles.builder(FAKE_SPEC) - .withInputFile(localInput(parquetFile)) - .withMetrics(writer.metrics()) - .withPartitionPath("id_zero=0") - .build(); + DataFile file = + DataFiles.builder(FAKE_SPEC) + .withInputFile(localInput(parquetFile)) + .withMetrics(writer.metrics()) + .withPartitionPath("id_zero=0") + .build(); OutputFile manifestFile = localOutput(FileFormat.AVRO.addExtension(temp.newFile().toString())); ManifestWriter manifestWriter = ManifestFiles.write(FAKE_SPEC, manifestFile); @@ -193,9 +202,7 @@ public void testSparkCanReadUnknownTransform() throws IOException { table.newFastAppend().appendManifest(manifestWriter.toManifestFile()).commit(); - Dataset df = spark.read() - .format("iceberg") - .load(location.toString()); + Dataset df = spark.read().format("iceberg").load(location.toString()); List rows = df.collectAsList(); Assert.assertEquals("Should contain 100 rows", 100, rows.size()); diff --git a/spark/v2.4/spark/src/test/java/org/apache/iceberg/spark/source/TestIcebergSource.java b/spark/v2.4/spark/src/test/java/org/apache/iceberg/spark/source/TestIcebergSource.java index fdf1b3391825..72e7b72b508e 100644 --- a/spark/v2.4/spark/src/test/java/org/apache/iceberg/spark/source/TestIcebergSource.java +++ b/spark/v2.4/spark/src/test/java/org/apache/iceberg/spark/source/TestIcebergSource.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.source; import org.apache.hadoop.conf.Configuration; diff --git a/spark/v2.4/spark/src/test/java/org/apache/iceberg/spark/source/TestIcebergSourceHadoopTables.java b/spark/v2.4/spark/src/test/java/org/apache/iceberg/spark/source/TestIcebergSourceHadoopTables.java index f1cfc7a72e17..b55ba0e2199a 100644 --- a/spark/v2.4/spark/src/test/java/org/apache/iceberg/spark/source/TestIcebergSourceHadoopTables.java +++ b/spark/v2.4/spark/src/test/java/org/apache/iceberg/spark/source/TestIcebergSourceHadoopTables.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.spark.source; import java.io.File; diff --git a/spark/v2.4/spark/src/test/java/org/apache/iceberg/spark/source/TestIcebergSourceHiveTables.java b/spark/v2.4/spark/src/test/java/org/apache/iceberg/spark/source/TestIcebergSourceHiveTables.java index 76923d43a3bc..f6df8d495b90 100644 --- a/spark/v2.4/spark/src/test/java/org/apache/iceberg/spark/source/TestIcebergSourceHiveTables.java +++ b/spark/v2.4/spark/src/test/java/org/apache/iceberg/spark/source/TestIcebergSourceHiveTables.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.source; import java.io.IOException; @@ -62,7 +61,8 @@ public Table createTable(TableIdentifier ident, Schema schema, PartitionSpec spe @Override public Table loadTable(TableIdentifier ident, String entriesSuffix) { - TableIdentifier identifier = TableIdentifier.of(ident.namespace().level(0), ident.name(), entriesSuffix); + TableIdentifier identifier = + TableIdentifier.of(ident.namespace().level(0), ident.name(), entriesSuffix); return TestIcebergSourceHiveTables.catalog.loadTable(identifier); } diff --git a/spark/v2.4/spark/src/test/java/org/apache/iceberg/spark/source/TestIcebergSourceTablesBase.java b/spark/v2.4/spark/src/test/java/org/apache/iceberg/spark/source/TestIcebergSourceTablesBase.java index 6b62d2e9bef1..d70e3dcbf181 100644 --- a/spark/v2.4/spark/src/test/java/org/apache/iceberg/spark/source/TestIcebergSourceTablesBase.java +++ b/spark/v2.4/spark/src/test/java/org/apache/iceberg/spark/source/TestIcebergSourceTablesBase.java @@ -16,9 +16,13 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.source; +import static org.apache.iceberg.ManifestContent.DATA; +import static org.apache.iceberg.ManifestContent.DELETES; +import static org.apache.iceberg.types.Types.NestedField.optional; +import static org.apache.iceberg.types.Types.NestedField.required; + import java.io.IOException; import java.io.UncheckedIOException; import java.util.Comparator; @@ -76,33 +80,26 @@ import org.junit.Test; import org.junit.rules.TemporaryFolder; -import static org.apache.iceberg.ManifestContent.DATA; -import static org.apache.iceberg.ManifestContent.DELETES; -import static org.apache.iceberg.types.Types.NestedField.optional; -import static org.apache.iceberg.types.Types.NestedField.required; - public abstract class TestIcebergSourceTablesBase extends SparkTestBase { - private static final Schema SCHEMA = new Schema( - optional(1, "id", Types.IntegerType.get()), - optional(2, "data", Types.StringType.get()) - ); + private static final Schema SCHEMA = + new Schema( + optional(1, "id", Types.IntegerType.get()), optional(2, "data", Types.StringType.get())); - private static final Schema SCHEMA2 = new Schema( - optional(1, "id", Types.IntegerType.get()), - optional(2, "data", Types.StringType.get()), - optional(3, "category", Types.StringType.get()) - ); + private static final Schema SCHEMA2 = + new Schema( + optional(1, "id", Types.IntegerType.get()), + optional(2, "data", Types.StringType.get()), + optional(3, "category", Types.StringType.get())); - private static final Schema SCHEMA3 = new Schema( - optional(1, "id", Types.IntegerType.get()), - optional(3, "category", Types.StringType.get()) - ); + private static final Schema SCHEMA3 = + new Schema( + optional(1, "id", Types.IntegerType.get()), + optional(3, "category", Types.StringType.get())); private static final PartitionSpec SPEC = 
PartitionSpec.builderFor(SCHEMA).identity("id").build(); - @Rule - public TemporaryFolder temp = new TemporaryFolder(); + @Rule public TemporaryFolder temp = new TemporaryFolder(); public abstract Table createTable(TableIdentifier ident, Schema schema, PartitionSpec spec); @@ -117,23 +114,21 @@ public synchronized void testTablesSupport() { TableIdentifier tableIdentifier = TableIdentifier.of("db", "table"); createTable(tableIdentifier, SCHEMA, PartitionSpec.unpartitioned()); - List expectedRecords = Lists.newArrayList( - new SimpleRecord(1, "1"), - new SimpleRecord(2, "2"), - new SimpleRecord(3, "3")); + List expectedRecords = + Lists.newArrayList( + new SimpleRecord(1, "1"), new SimpleRecord(2, "2"), new SimpleRecord(3, "3")); Dataset inputDf = spark.createDataFrame(expectedRecords, SimpleRecord.class); - inputDf.select("id", "data").write() + inputDf + .select("id", "data") + .write() .format("iceberg") .mode(SaveMode.Append) .save(loadLocation(tableIdentifier)); - Dataset resultDf = spark.read() - .format("iceberg") - .load(loadLocation(tableIdentifier)); - List actualRecords = resultDf.orderBy("id") - .as(Encoders.bean(SimpleRecord.class)) - .collectAsList(); + Dataset resultDf = spark.read().format("iceberg").load(loadLocation(tableIdentifier)); + List actualRecords = + resultDf.orderBy("id").as(Encoders.bean(SimpleRecord.class)).collectAsList(); Assert.assertEquals("Records should match", expectedRecords, actualRecords); } @@ -147,17 +142,21 @@ public void testEntriesTable() throws Exception { List records = Lists.newArrayList(new SimpleRecord(1, "1")); Dataset inputDf = spark.createDataFrame(records, SimpleRecord.class); - inputDf.select("id", "data").write() + inputDf + .select("id", "data") + .write() .format("iceberg") .mode("append") .save(loadLocation(tableIdentifier)); table.refresh(); - List actual = spark.read() - .format("iceberg") - .load(loadLocation(tableIdentifier, "entries")) - .collectAsList(); + List actual = + spark + .read() + .format("iceberg") + .load(loadLocation(tableIdentifier, "entries")) + .collectAsList(); Snapshot snapshot = table.currentSnapshot(); @@ -165,14 +164,16 @@ public void testEntriesTable() throws Exception { InputFile manifest = table.io().newInputFile(snapshot.allManifests().get(0).path()); List expected = Lists.newArrayList(); - try (CloseableIterable rows = Avro.read(manifest).project(entriesTable.schema()).build()) { + try (CloseableIterable rows = + Avro.read(manifest).project(entriesTable.schema()).build()) { // each row must inherit snapshot_id and sequence_number - rows.forEach(row -> { - row.put(2, 0L); - GenericData.Record file = (GenericData.Record) row.get("data_file"); - asMetadataRecord(file); - expected.add(row); - }); + rows.forEach( + row -> { + row.put(2, 0L); + GenericData.Record file = (GenericData.Record) row.get("data_file"); + asMetadataRecord(file); + expected.add(row); + }); } Assert.assertEquals("Entries table should have one row", 1, expected.size()); @@ -188,18 +189,22 @@ public void testEntriesTablePartitionedPrune() throws Exception { List records = Lists.newArrayList(new SimpleRecord(1, "1")); Dataset inputDf = spark.createDataFrame(records, SimpleRecord.class); - inputDf.select("id", "data").write() + inputDf + .select("id", "data") + .write() .format("iceberg") .mode("append") .save(loadLocation(tableIdentifier)); table.refresh(); - List actual = spark.read() - .format("iceberg") - .load(loadLocation(tableIdentifier, "entries")) - .select("status") - .collectAsList(); + List actual = + spark + .read() + 
.format("iceberg") + .load(loadLocation(tableIdentifier, "entries")) + .select("status") + .collectAsList(); Assert.assertEquals("Results should contain only one status", 1, actual.size()); Assert.assertEquals("That status should be Added (1)", 1, actual.get(0).getInt(0)); @@ -213,7 +218,9 @@ public void testEntriesTableDataFilePrune() throws Exception { List records = Lists.newArrayList(new SimpleRecord(1, "1")); Dataset inputDf = spark.createDataFrame(records, SimpleRecord.class); - inputDf.select("id", "data").write() + inputDf + .select("id", "data") + .write() .format("iceberg") .mode("append") .save(loadLocation(tableIdentifier)); @@ -221,15 +228,19 @@ public void testEntriesTableDataFilePrune() throws Exception { table.refresh(); DataFile file = table.currentSnapshot().addedFiles().iterator().next(); - List singleActual = rowsToJava(spark.read() - .format("iceberg") - .load(loadLocation(tableIdentifier, "entries")) - .select("data_file.file_path") - .collectAsList()); + List singleActual = + rowsToJava( + spark + .read() + .format("iceberg") + .load(loadLocation(tableIdentifier, "entries")) + .select("data_file.file_path") + .collectAsList()); List singleExpected = ImmutableList.of(row(file.path())); - assertEquals("Should prune a single element from a nested struct", singleExpected, singleActual); + assertEquals( + "Should prune a single element from a nested struct", singleExpected, singleActual); } @Test @@ -240,7 +251,9 @@ public void testEntriesTableDataFilePruneMulti() throws Exception { List records = Lists.newArrayList(new SimpleRecord(1, "1")); Dataset inputDf = spark.createDataFrame(records, SimpleRecord.class); - inputDf.select("id", "data").write() + inputDf + .select("id", "data") + .write() .format("iceberg") .mode("append") .save(loadLocation(tableIdentifier)); @@ -248,14 +261,22 @@ public void testEntriesTableDataFilePruneMulti() throws Exception { table.refresh(); DataFile file = table.currentSnapshot().addedFiles().iterator().next(); - List multiActual = rowsToJava(spark.read() - .format("iceberg") - .load(loadLocation(tableIdentifier, "entries")) - .select("data_file.file_path", "data_file.value_counts", "data_file.record_count", "data_file.column_sizes") - .collectAsList()); - - List multiExpected = ImmutableList.of( - row(file.path(), file.valueCounts(), file.recordCount(), file.columnSizes())); + List multiActual = + rowsToJava( + spark + .read() + .format("iceberg") + .load(loadLocation(tableIdentifier, "entries")) + .select( + "data_file.file_path", + "data_file.value_counts", + "data_file.record_count", + "data_file.column_sizes") + .collectAsList()); + + List multiExpected = + ImmutableList.of( + row(file.path(), file.valueCounts(), file.recordCount(), file.columnSizes())); assertEquals("Should prune a single element from a nested struct", multiExpected, multiActual); } @@ -268,7 +289,9 @@ public void testFilesSelectMap() throws Exception { List records = Lists.newArrayList(new SimpleRecord(1, "1")); Dataset inputDf = spark.createDataFrame(records, SimpleRecord.class); - inputDf.select("id", "data").write() + inputDf + .select("id", "data") + .write() .format("iceberg") .mode("append") .save(loadLocation(tableIdentifier)); @@ -276,14 +299,18 @@ public void testFilesSelectMap() throws Exception { table.refresh(); DataFile file = table.currentSnapshot().addedFiles().iterator().next(); - List multiActual = rowsToJava(spark.read() - .format("iceberg") - .load(loadLocation(tableIdentifier, "files")) - .select("file_path", "value_counts", "record_count", 
"column_sizes") - .collectAsList()); + List multiActual = + rowsToJava( + spark + .read() + .format("iceberg") + .load(loadLocation(tableIdentifier, "files")) + .select("file_path", "value_counts", "record_count", "column_sizes") + .collectAsList()); - List multiExpected = ImmutableList.of( - row(file.path(), file.valueCounts(), file.recordCount(), file.columnSizes())); + List multiExpected = + ImmutableList.of( + row(file.path(), file.valueCounts(), file.recordCount(), file.columnSizes())); assertEquals("Should prune a single element from a row", multiExpected, multiActual); } @@ -294,10 +321,13 @@ public void testAllEntriesTable() throws Exception { Table table = createTable(tableIdentifier, SCHEMA, PartitionSpec.unpartitioned()); Table entriesTable = loadTable(tableIdentifier, "all_entries"); - Dataset df1 = spark.createDataFrame(Lists.newArrayList(new SimpleRecord(1, "a")), SimpleRecord.class); - Dataset df2 = spark.createDataFrame(Lists.newArrayList(new SimpleRecord(1, "b")), SimpleRecord.class); + Dataset df1 = + spark.createDataFrame(Lists.newArrayList(new SimpleRecord(1, "a")), SimpleRecord.class); + Dataset df2 = + spark.createDataFrame(Lists.newArrayList(new SimpleRecord(1, "b")), SimpleRecord.class); - df1.select("id", "data").write() + df1.select("id", "data") + .write() .format("iceberg") .mode("append") .save(loadLocation(tableIdentifier)); @@ -306,7 +336,8 @@ public void testAllEntriesTable() throws Exception { table.newDelete().deleteFromRowFilter(Expressions.equal("id", 1)).commit(); // add a second file - df2.select("id", "data").write() + df2.select("id", "data") + .write() .format("iceberg") .mode("append") .save(loadLocation(tableIdentifier)); @@ -314,23 +345,28 @@ public void testAllEntriesTable() throws Exception { // ensure table data isn't stale table.refresh(); - List actual = spark.read() - .format("iceberg") - .load(loadLocation(tableIdentifier, "all_entries")) - .orderBy("snapshot_id") - .collectAsList(); + List actual = + spark + .read() + .format("iceberg") + .load(loadLocation(tableIdentifier, "all_entries")) + .orderBy("snapshot_id") + .collectAsList(); List expected = Lists.newArrayList(); - for (ManifestFile manifest : Iterables.concat(Iterables.transform(table.snapshots(), Snapshot::allManifests))) { + for (ManifestFile manifest : + Iterables.concat(Iterables.transform(table.snapshots(), Snapshot::allManifests))) { InputFile in = table.io().newInputFile(manifest.path()); - try (CloseableIterable rows = Avro.read(in).project(entriesTable.schema()).build()) { + try (CloseableIterable rows = + Avro.read(in).project(entriesTable.schema()).build()) { // each row must inherit snapshot_id and sequence_number - rows.forEach(row -> { - row.put(2, 0L); - GenericData.Record file = (GenericData.Record) row.get("data_file"); - asMetadataRecord(file); - expected.add(row); - }); + rows.forEach( + row -> { + row.put(2, 0L); + GenericData.Record file = (GenericData.Record) row.get("data_file"); + asMetadataRecord(file); + expected.add(row); + }); } } @@ -339,7 +375,8 @@ public void testAllEntriesTable() throws Exception { Assert.assertEquals("Entries table should have 3 rows", 3, expected.size()); Assert.assertEquals("Actual results should have 3 rows", 3, actual.size()); for (int i = 0; i < expected.size(); i += 1) { - TestHelpers.assertEqualsSafe(entriesTable.schema().asStruct(), expected.get(i), actual.get(i)); + TestHelpers.assertEqualsSafe( + entriesTable.schema().asStruct(), expected.get(i), actual.get(i)); } } @@ -351,7 +388,9 @@ public void 
testCountEntriesTable() { // init load List records = Lists.newArrayList(new SimpleRecord(1, "1")); Dataset inputDf = spark.createDataFrame(records, SimpleRecord.class); - inputDf.select("id", "data").write() + inputDf + .select("id", "data") + .write() .format("iceberg") .mode("append") .save(loadLocation(tableIdentifier)); @@ -359,12 +398,16 @@ public void testCountEntriesTable() { final int expectedEntryCount = 1; // count entries - Assert.assertEquals("Count should return " + expectedEntryCount, - expectedEntryCount, spark.read().format("iceberg").load(loadLocation(tableIdentifier, "entries")).count()); + Assert.assertEquals( + "Count should return " + expectedEntryCount, + expectedEntryCount, + spark.read().format("iceberg").load(loadLocation(tableIdentifier, "entries")).count()); // count all_entries - Assert.assertEquals("Count should return " + expectedEntryCount, - expectedEntryCount, spark.read().format("iceberg").load(loadLocation(tableIdentifier, "all_entries")).count()); + Assert.assertEquals( + "Count should return " + expectedEntryCount, + expectedEntryCount, + spark.read().format("iceberg").load(loadLocation(tableIdentifier, "all_entries")).count()); } @Test @@ -374,16 +417,20 @@ public void testFilesTable() throws Exception { Table entriesTable = loadTable(tableIdentifier, "entries"); Table filesTable = loadTable(tableIdentifier, "files"); - Dataset df1 = spark.createDataFrame(Lists.newArrayList(new SimpleRecord(1, "a")), SimpleRecord.class); - Dataset df2 = spark.createDataFrame(Lists.newArrayList(new SimpleRecord(2, "b")), SimpleRecord.class); + Dataset df1 = + spark.createDataFrame(Lists.newArrayList(new SimpleRecord(1, "a")), SimpleRecord.class); + Dataset df2 = + spark.createDataFrame(Lists.newArrayList(new SimpleRecord(2, "b")), SimpleRecord.class); - df1.select("id", "data").write() + df1.select("id", "data") + .write() .format("iceberg") .mode("append") .save(loadLocation(tableIdentifier)); // add a second file - df2.select("id", "data").write() + df2.select("id", "data") + .write() .format("iceberg") .mode("append") .save(loadLocation(tableIdentifier)); @@ -391,15 +438,14 @@ public void testFilesTable() throws Exception { // delete the first file to test that only live files are listed table.newDelete().deleteFromRowFilter(Expressions.equal("id", 1)).commit(); - List actual = spark.read() - .format("iceberg") - .load(loadLocation(tableIdentifier, "files")) - .collectAsList(); + List actual = + spark.read().format("iceberg").load(loadLocation(tableIdentifier, "files")).collectAsList(); List expected = Lists.newArrayList(); for (ManifestFile manifest : table.currentSnapshot().dataManifests()) { InputFile in = table.io().newInputFile(manifest.path()); - try (CloseableIterable rows = Avro.read(in).project(entriesTable.schema()).build()) { + try (CloseableIterable rows = + Avro.read(in).project(entriesTable.schema()).build()) { for (GenericData.Record record : rows) { if ((Integer) record.get("status") < 2 /* added or existing */) { GenericData.Record file = (GenericData.Record) record.get("data_file"); @@ -421,42 +467,42 @@ public void testFilesTableWithSnapshotIdInheritance() throws Exception { TableIdentifier tableIdentifier = TableIdentifier.of("db", "files_inheritance_test"); Table table = createTable(tableIdentifier, SCHEMA, SPEC); - table.updateProperties() - .set(TableProperties.SNAPSHOT_ID_INHERITANCE_ENABLED, "true") - .commit(); + table.updateProperties().set(TableProperties.SNAPSHOT_ID_INHERITANCE_ENABLED, "true").commit(); Table entriesTable = 
loadTable(tableIdentifier, "entries"); Table filesTable = loadTable(tableIdentifier, "files"); - spark.sql(String.format( - "CREATE TABLE parquet_table (data string, id int) " + - "USING parquet PARTITIONED BY (id) LOCATION '%s'", - temp.newFolder())); + spark.sql( + String.format( + "CREATE TABLE parquet_table (data string, id int) " + + "USING parquet PARTITIONED BY (id) LOCATION '%s'", + temp.newFolder())); - List records = Lists.newArrayList( - new SimpleRecord(1, "a"), - new SimpleRecord(2, "b") - ); + List records = + Lists.newArrayList(new SimpleRecord(1, "a"), new SimpleRecord(2, "b")); Dataset inputDF = spark.createDataFrame(records, SimpleRecord.class); - inputDF.select("data", "id").write() - .mode("overwrite") - .insertInto("parquet_table"); + inputDF.select("data", "id").write().mode("overwrite").insertInto("parquet_table"); try { String stagingLocation = table.location() + "/metadata"; - SparkTableUtil.importSparkTable(spark, + SparkTableUtil.importSparkTable( + spark, new org.apache.spark.sql.catalyst.TableIdentifier("parquet_table"), - table, stagingLocation); + table, + stagingLocation); - List actual = spark.read() - .format("iceberg") - .load(loadLocation(tableIdentifier, "files")) - .collectAsList(); + List actual = + spark + .read() + .format("iceberg") + .load(loadLocation(tableIdentifier, "files")) + .collectAsList(); List expected = Lists.newArrayList(); for (ManifestFile manifest : table.currentSnapshot().dataManifests()) { InputFile in = table.io().newInputFile(manifest.path()); - try (CloseableIterable rows = Avro.read(in).project(entriesTable.schema()).build()) { + try (CloseableIterable rows = + Avro.read(in).project(entriesTable.schema()).build()) { for (GenericData.Record record : rows) { GenericData.Record file = (GenericData.Record) record.get("data_file"); asMetadataRecord(file); @@ -472,7 +518,6 @@ public void testFilesTableWithSnapshotIdInheritance() throws Exception { } finally { spark.sql("DROP TABLE parquet_table"); } - } @Test @@ -483,35 +528,35 @@ public void testEntriesTableWithSnapshotIdInheritance() throws Exception { PartitionSpec spec = SPEC; Table table = createTable(tableIdentifier, SCHEMA, spec); - table.updateProperties() - .set(TableProperties.SNAPSHOT_ID_INHERITANCE_ENABLED, "true") - .commit(); + table.updateProperties().set(TableProperties.SNAPSHOT_ID_INHERITANCE_ENABLED, "true").commit(); - spark.sql(String.format( - "CREATE TABLE parquet_table (data string, id int) " + - "USING parquet PARTITIONED BY (id) LOCATION '%s'", - temp.newFolder())); + spark.sql( + String.format( + "CREATE TABLE parquet_table (data string, id int) " + + "USING parquet PARTITIONED BY (id) LOCATION '%s'", + temp.newFolder())); - List records = Lists.newArrayList( - new SimpleRecord(1, "a"), - new SimpleRecord(2, "b") - ); + List records = + Lists.newArrayList(new SimpleRecord(1, "a"), new SimpleRecord(2, "b")); Dataset inputDF = spark.createDataFrame(records, SimpleRecord.class); - inputDF.select("data", "id").write() - .mode("overwrite") - .insertInto("parquet_table"); + inputDF.select("data", "id").write().mode("overwrite").insertInto("parquet_table"); try { String stagingLocation = table.location() + "/metadata"; SparkTableUtil.importSparkTable( - spark, new org.apache.spark.sql.catalyst.TableIdentifier("parquet_table"), table, stagingLocation); + spark, + new org.apache.spark.sql.catalyst.TableIdentifier("parquet_table"), + table, + stagingLocation); - List actual = spark.read() - .format("iceberg") - .load(loadLocation(tableIdentifier, "entries")) - 
.select("sequence_number", "snapshot_id", "data_file") - .collectAsList(); + List actual = + spark + .read() + .format("iceberg") + .load(loadLocation(tableIdentifier, "entries")) + .select("sequence_number", "snapshot_id", "data_file") + .collectAsList(); table.refresh(); @@ -534,10 +579,13 @@ public void testFilesUnpartitionedTable() throws Exception { Table entriesTable = loadTable(tableIdentifier, "entries"); Table filesTable = loadTable(tableIdentifier, "files"); - Dataset df1 = spark.createDataFrame(Lists.newArrayList(new SimpleRecord(1, "a")), SimpleRecord.class); - Dataset df2 = spark.createDataFrame(Lists.newArrayList(new SimpleRecord(2, "b")), SimpleRecord.class); + Dataset df1 = + spark.createDataFrame(Lists.newArrayList(new SimpleRecord(1, "a")), SimpleRecord.class); + Dataset df2 = + spark.createDataFrame(Lists.newArrayList(new SimpleRecord(2, "b")), SimpleRecord.class); - df1.select("id", "data").write() + df1.select("id", "data") + .write() .format("iceberg") .mode("append") .save(loadLocation(tableIdentifier)); @@ -546,7 +594,8 @@ public void testFilesUnpartitionedTable() throws Exception { DataFile toDelete = Iterables.getOnlyElement(table.currentSnapshot().addedFiles()); // add a second file - df2.select("id", "data").write() + df2.select("id", "data") + .write() .format("iceberg") .mode("append") .save(loadLocation(tableIdentifier)); @@ -554,15 +603,14 @@ public void testFilesUnpartitionedTable() throws Exception { // delete the first file to test that only live files are listed table.newDelete().deleteFile(toDelete).commit(); - List actual = spark.read() - .format("iceberg") - .load(loadLocation(tableIdentifier, "files")) - .collectAsList(); + List actual = + spark.read().format("iceberg").load(loadLocation(tableIdentifier, "files")).collectAsList(); List expected = Lists.newArrayList(); for (ManifestFile manifest : table.currentSnapshot().dataManifests()) { InputFile in = table.io().newInputFile(manifest.path()); - try (CloseableIterable rows = Avro.read(in).project(entriesTable.schema()).build()) { + try (CloseableIterable rows = + Avro.read(in).project(entriesTable.schema()).build()) { for (GenericData.Record record : rows) { if ((Integer) record.get("status") < 2 /* added or existing */) { GenericData.Record file = (GenericData.Record) record.get("data_file"); @@ -585,38 +633,49 @@ public void testAllMetadataTablesWithStagedCommits() throws Exception { table.updateProperties().set(TableProperties.WRITE_AUDIT_PUBLISH_ENABLED, "true").commit(); spark.conf().set("spark.wap.id", "1234567"); - Dataset df1 = spark.createDataFrame(Lists.newArrayList(new SimpleRecord(1, "a")), SimpleRecord.class); - Dataset df2 = spark.createDataFrame(Lists.newArrayList(new SimpleRecord(2, "b")), SimpleRecord.class); + Dataset df1 = + spark.createDataFrame(Lists.newArrayList(new SimpleRecord(1, "a")), SimpleRecord.class); + Dataset df2 = + spark.createDataFrame(Lists.newArrayList(new SimpleRecord(2, "b")), SimpleRecord.class); - df1.select("id", "data").write() + df1.select("id", "data") + .write() .format("iceberg") .mode("append") .save(loadLocation(tableIdentifier)); // add a second file - df2.select("id", "data").write() + df2.select("id", "data") + .write() .format("iceberg") .mode("append") .save(loadLocation(tableIdentifier)); - List actualAllData = spark.read() - .format("iceberg") - .load(loadLocation(tableIdentifier, "all_data_files")) - .collectAsList(); - - List actualAllManifests = spark.read() - .format("iceberg") - .load(loadLocation(tableIdentifier, "all_manifests")) - 
.collectAsList(); - - List actualAllEntries = spark.read() - .format("iceberg") - .load(loadLocation(tableIdentifier, "all_entries")) - .collectAsList(); - - Assert.assertTrue("Stage table should have some snapshots", table.snapshots().iterator().hasNext()); - Assert.assertEquals("Stage table should have null currentSnapshot", - null, table.currentSnapshot()); + List actualAllData = + spark + .read() + .format("iceberg") + .load(loadLocation(tableIdentifier, "all_data_files")) + .collectAsList(); + + List actualAllManifests = + spark + .read() + .format("iceberg") + .load(loadLocation(tableIdentifier, "all_manifests")) + .collectAsList(); + + List actualAllEntries = + spark + .read() + .format("iceberg") + .load(loadLocation(tableIdentifier, "all_entries")) + .collectAsList(); + + Assert.assertTrue( + "Stage table should have some snapshots", table.snapshots().iterator().hasNext()); + Assert.assertEquals( + "Stage table should have null currentSnapshot", null, table.currentSnapshot()); Assert.assertEquals("Actual results should have two rows", 2, actualAllData.size()); Assert.assertEquals("Actual results should have two rows", 2, actualAllManifests.size()); Assert.assertEquals("Actual results should have two rows", 2, actualAllEntries.size()); @@ -629,10 +688,13 @@ public void testAllDataFilesTable() throws Exception { Table entriesTable = loadTable(tableIdentifier, "entries"); Table filesTable = loadTable(tableIdentifier, "all_data_files"); - Dataset df1 = spark.createDataFrame(Lists.newArrayList(new SimpleRecord(1, "a")), SimpleRecord.class); - Dataset df2 = spark.createDataFrame(Lists.newArrayList(new SimpleRecord(2, "b")), SimpleRecord.class); + Dataset df1 = + spark.createDataFrame(Lists.newArrayList(new SimpleRecord(1, "a")), SimpleRecord.class); + Dataset df2 = + spark.createDataFrame(Lists.newArrayList(new SimpleRecord(2, "b")), SimpleRecord.class); - df1.select("id", "data").write() + df1.select("id", "data") + .write() .format("iceberg") .mode("append") .save(loadLocation(tableIdentifier)); @@ -641,7 +703,8 @@ public void testAllDataFilesTable() throws Exception { table.newDelete().deleteFromRowFilter(Expressions.equal("id", 1)).commit(); // add a second file - df2.select("id", "data").write() + df2.select("id", "data") + .write() .format("iceberg") .mode("append") .save(loadLocation(tableIdentifier)); @@ -649,17 +712,21 @@ public void testAllDataFilesTable() throws Exception { // ensure table data isn't stale table.refresh(); - List actual = spark.read() - .format("iceberg") - .load(loadLocation(tableIdentifier, "all_data_files")) - .orderBy("file_path") - .collectAsList(); + List actual = + spark + .read() + .format("iceberg") + .load(loadLocation(tableIdentifier, "all_data_files")) + .orderBy("file_path") + .collectAsList(); actual.sort(Comparator.comparing(o -> o.getString(1))); List expected = Lists.newArrayList(); - for (ManifestFile manifest : Iterables.concat(Iterables.transform(table.snapshots(), Snapshot::dataManifests))) { + for (ManifestFile manifest : + Iterables.concat(Iterables.transform(table.snapshots(), Snapshot::dataManifests))) { InputFile in = table.io().newInputFile(manifest.path()); - try (CloseableIterable rows = Avro.read(in).project(entriesTable.schema()).build()) { + try (CloseableIterable rows = + Avro.read(in).project(entriesTable.schema()).build()) { for (GenericData.Record record : rows) { if ((Integer) record.get("status") < 2 /* added or existing */) { GenericData.Record file = (GenericData.Record) record.get("data_file"); @@ -688,7 +755,9 @@ 
public void testHistoryTable() { List records = Lists.newArrayList(new SimpleRecord(1, "1")); Dataset inputDf = spark.createDataFrame(records, SimpleRecord.class); - inputDf.select("id", "data").write() + inputDf + .select("id", "data") + .write() .format("iceberg") .mode("append") .save(loadLocation(tableIdentifier)); @@ -697,7 +766,9 @@ public void testHistoryTable() { long firstSnapshotTimestamp = table.currentSnapshot().timestampMillis(); long firstSnapshotId = table.currentSnapshot().snapshotId(); - inputDf.select("id", "data").write() + inputDf + .select("id", "data") + .write() .format("iceberg") .mode("append") .save(loadLocation(tableIdentifier)); @@ -710,7 +781,9 @@ public void testHistoryTable() { table.rollback().toSnapshotId(firstSnapshotId).commit(); long rollbackTimestamp = Iterables.getLast(table.history()).timestampMillis(); - inputDf.select("id", "data").write() + inputDf + .select("id", "data") + .write() .format("iceberg") .mode("append") .save(loadLocation(tableIdentifier)); @@ -719,34 +792,43 @@ public void testHistoryTable() { long thirdSnapshotTimestamp = table.currentSnapshot().timestampMillis(); long thirdSnapshotId = table.currentSnapshot().snapshotId(); - List actual = spark.read() - .format("iceberg") - .load(loadLocation(tableIdentifier, "history")) - .collectAsList(); - - GenericRecordBuilder builder = new GenericRecordBuilder(AvroSchemaUtil.convert(historyTable.schema(), "history")); - List expected = Lists.newArrayList( - builder.set("made_current_at", firstSnapshotTimestamp * 1000) - .set("snapshot_id", firstSnapshotId) - .set("parent_id", null) - .set("is_current_ancestor", true) - .build(), - builder.set("made_current_at", secondSnapshotTimestamp * 1000) - .set("snapshot_id", secondSnapshotId) - .set("parent_id", firstSnapshotId) - .set("is_current_ancestor", false) // commit rolled back, not an ancestor of the current table state - .build(), - builder.set("made_current_at", rollbackTimestamp * 1000) - .set("snapshot_id", firstSnapshotId) - .set("parent_id", null) - .set("is_current_ancestor", true) - .build(), - builder.set("made_current_at", thirdSnapshotTimestamp * 1000) - .set("snapshot_id", thirdSnapshotId) - .set("parent_id", firstSnapshotId) - .set("is_current_ancestor", true) - .build() - ); + List actual = + spark + .read() + .format("iceberg") + .load(loadLocation(tableIdentifier, "history")) + .collectAsList(); + + GenericRecordBuilder builder = + new GenericRecordBuilder(AvroSchemaUtil.convert(historyTable.schema(), "history")); + List expected = + Lists.newArrayList( + builder + .set("made_current_at", firstSnapshotTimestamp * 1000) + .set("snapshot_id", firstSnapshotId) + .set("parent_id", null) + .set("is_current_ancestor", true) + .build(), + builder + .set("made_current_at", secondSnapshotTimestamp * 1000) + .set("snapshot_id", secondSnapshotId) + .set("parent_id", firstSnapshotId) + .set( + "is_current_ancestor", + false) // commit rolled back, not an ancestor of the current table state + .build(), + builder + .set("made_current_at", rollbackTimestamp * 1000) + .set("snapshot_id", firstSnapshotId) + .set("parent_id", null) + .set("is_current_ancestor", true) + .build(), + builder + .set("made_current_at", thirdSnapshotTimestamp * 1000) + .set("snapshot_id", thirdSnapshotId) + .set("parent_id", firstSnapshotId) + .set("is_current_ancestor", true) + .build()); Assert.assertEquals("History table should have a row for each commit", 4, actual.size()); TestHelpers.assertEqualsSafe(historyTable.schema().asStruct(), expected.get(0), 
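testHistoryTable's expected rows encode the effect of a rollback: the rolled-back commit stays in the table's history but is no longer an ancestor of the current state. The relevant calls, taken from the hunk above:

    table.rollback().toSnapshotId(firstSnapshotId).commit();   // move the table back to snapshot 1
    Dataset<Row> history =
        spark.read().format("iceberg").load(loadLocation(tableIdentifier, "history"));
    // rows expose made_current_at, snapshot_id, parent_id and is_current_ancestor;
    // the second snapshot is kept but reported with is_current_ancestor = false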
actual.get(0)); @@ -763,7 +845,9 @@ public void testSnapshotsTable() { List records = Lists.newArrayList(new SimpleRecord(1, "1")); Dataset inputDf = spark.createDataFrame(records, SimpleRecord.class); - inputDf.select("id", "data").write() + inputDf + .select("id", "data") + .write() .format("iceberg") .mode("append") .save(loadLocation(tableIdentifier)); @@ -782,40 +866,47 @@ public void testSnapshotsTable() { // rollback the table state to the first snapshot table.rollback().toSnapshotId(firstSnapshotId).commit(); - List actual = spark.read() - .format("iceberg") - .load(loadLocation(tableIdentifier, "snapshots")) - .collectAsList(); - - GenericRecordBuilder builder = new GenericRecordBuilder(AvroSchemaUtil.convert(snapTable.schema(), "snapshots")); - List expected = Lists.newArrayList( - builder.set("committed_at", firstSnapshotTimestamp * 1000) - .set("snapshot_id", firstSnapshotId) - .set("parent_id", null) - .set("operation", "append") - .set("manifest_list", firstManifestList) - .set("summary", ImmutableMap.of( - "added-records", "1", - "added-data-files", "1", - "changed-partition-count", "1", - "total-data-files", "1", - "total-records", "1" - )) - .build(), - builder.set("committed_at", secondSnapshotTimestamp * 1000) - .set("snapshot_id", secondSnapshotId) - .set("parent_id", firstSnapshotId) - .set("operation", "delete") - .set("manifest_list", secondManifestList) - .set("summary", ImmutableMap.of( - "deleted-records", "1", - "deleted-data-files", "1", - "changed-partition-count", "1", - "total-records", "0", - "total-data-files", "0" - )) - .build() - ); + List actual = + spark + .read() + .format("iceberg") + .load(loadLocation(tableIdentifier, "snapshots")) + .collectAsList(); + + GenericRecordBuilder builder = + new GenericRecordBuilder(AvroSchemaUtil.convert(snapTable.schema(), "snapshots")); + List expected = + Lists.newArrayList( + builder + .set("committed_at", firstSnapshotTimestamp * 1000) + .set("snapshot_id", firstSnapshotId) + .set("parent_id", null) + .set("operation", "append") + .set("manifest_list", firstManifestList) + .set( + "summary", + ImmutableMap.of( + "added-records", "1", + "added-data-files", "1", + "changed-partition-count", "1", + "total-data-files", "1", + "total-records", "1")) + .build(), + builder + .set("committed_at", secondSnapshotTimestamp * 1000) + .set("snapshot_id", secondSnapshotId) + .set("parent_id", firstSnapshotId) + .set("operation", "delete") + .set("manifest_list", secondManifestList) + .set( + "summary", + ImmutableMap.of( + "deleted-records", "1", + "deleted-data-files", "1", + "changed-partition-count", "1", + "total-records", "0", + "total-data-files", "0")) + .build()); Assert.assertEquals("Snapshots table should have a row for each snapshot", 2, actual.size()); TestHelpers.assertEqualsSafe(snapTable.schema().asStruct(), expected.get(0), actual.get(0)); @@ -830,7 +921,9 @@ public void testPrunedSnapshotsTable() { List records = Lists.newArrayList(new SimpleRecord(1, "1")); Dataset inputDf = spark.createDataFrame(records, SimpleRecord.class); - inputDf.select("id", "data").write() + inputDf + .select("id", "data") + .write() .format("iceberg") .mode("append") .save(loadLocation(tableIdentifier)); @@ -846,40 +939,47 @@ public void testPrunedSnapshotsTable() { // rollback the table state to the first snapshot table.rollback().toSnapshotId(firstSnapshotId).commit(); - Dataset actualDf = spark.read() - .format("iceberg") - .load(loadLocation(tableIdentifier, "snapshots")) - .select("operation", "committed_at", "summary", 
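A detail worth noting in the expected history/snapshots rows: the metadata tables report made_current_at and committed_at in microseconds, while Snapshot#timestampMillis() is milliseconds, which is why the builders above multiply by 1000:

    long committedAtMicros = table.currentSnapshot().timestampMillis() * 1000;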
"parent_id"); + Dataset actualDf = + spark + .read() + .format("iceberg") + .load(loadLocation(tableIdentifier, "snapshots")) + .select("operation", "committed_at", "summary", "parent_id"); Schema projectedSchema = SparkSchemaUtil.convert(actualDf.schema()); List actual = actualDf.collectAsList(); - GenericRecordBuilder builder = new GenericRecordBuilder(AvroSchemaUtil.convert(projectedSchema, "snapshots")); - List expected = Lists.newArrayList( - builder.set("committed_at", firstSnapshotTimestamp * 1000) - .set("parent_id", null) - .set("operation", "append") - .set("summary", ImmutableMap.of( - "added-records", "1", - "added-data-files", "1", - "changed-partition-count", "1", - "total-data-files", "1", - "total-records", "1" - )) - .build(), - builder.set("committed_at", secondSnapshotTimestamp * 1000) - .set("parent_id", firstSnapshotId) - .set("operation", "delete") - .set("summary", ImmutableMap.of( - "deleted-records", "1", - "deleted-data-files", "1", - "changed-partition-count", "1", - "total-records", "0", - "total-data-files", "0" - )) - .build() - ); + GenericRecordBuilder builder = + new GenericRecordBuilder(AvroSchemaUtil.convert(projectedSchema, "snapshots")); + List expected = + Lists.newArrayList( + builder + .set("committed_at", firstSnapshotTimestamp * 1000) + .set("parent_id", null) + .set("operation", "append") + .set( + "summary", + ImmutableMap.of( + "added-records", "1", + "added-data-files", "1", + "changed-partition-count", "1", + "total-data-files", "1", + "total-records", "1")) + .build(), + builder + .set("committed_at", secondSnapshotTimestamp * 1000) + .set("parent_id", firstSnapshotId) + .set("operation", "delete") + .set( + "summary", + ImmutableMap.of( + "deleted-records", "1", + "deleted-data-files", "1", + "changed-partition-count", "1", + "total-records", "0", + "total-data-files", "0")) + .build()); Assert.assertEquals("Snapshots table should have a row for each snapshot", 2, actual.size()); TestHelpers.assertEqualsSafe(projectedSchema.asStruct(), expected.get(0), actual.get(0)); @@ -891,48 +991,73 @@ public void testManifestsTable() { TableIdentifier tableIdentifier = TableIdentifier.of("db", "manifests_test"); Table table = createTable(tableIdentifier, SCHEMA, SPEC); Table manifestTable = loadTable(tableIdentifier, "manifests"); - Dataset df1 = spark.createDataFrame( - Lists.newArrayList(new SimpleRecord(1, "a"), new SimpleRecord(null, "b")), SimpleRecord.class); + Dataset df1 = + spark.createDataFrame( + Lists.newArrayList(new SimpleRecord(1, "a"), new SimpleRecord(null, "b")), + SimpleRecord.class); - df1.select("id", "data").write() + df1.select("id", "data") + .write() .format("iceberg") .mode("append") .save(loadLocation(tableIdentifier)); - List actual = spark.read() - .format("iceberg") - .load(loadLocation(tableIdentifier, "manifests")) - .collectAsList(); + List actual = + spark + .read() + .format("iceberg") + .load(loadLocation(tableIdentifier, "manifests")) + .collectAsList(); table.refresh(); - GenericRecordBuilder builder = new GenericRecordBuilder(AvroSchemaUtil.convert( - manifestTable.schema(), "manifests")); - GenericRecordBuilder summaryBuilder = new GenericRecordBuilder(AvroSchemaUtil.convert( - manifestTable.schema().findType("partition_summaries.element").asStructType(), "partition_summary")); - List expected = Lists.transform(table.currentSnapshot().allManifests(), manifest -> - builder - .set("content", manifest.content().id()) - .set("path", manifest.path()) - .set("length", manifest.length()) - .set("partition_spec_id", 
manifest.partitionSpecId()) - .set("added_snapshot_id", manifest.snapshotId()) - .set("added_data_files_count", manifest.content() == DATA ? manifest.addedFilesCount() : 0) - .set("existing_data_files_count", manifest.content() == DATA ? manifest.existingFilesCount() : 0) - .set("deleted_data_files_count", manifest.content() == DATA ? manifest.deletedFilesCount() : 0) - .set("added_delete_files_count", manifest.content() == DELETES ? manifest.addedFilesCount() : 0) - .set("existing_delete_files_count", manifest.content() == DELETES ? manifest.existingFilesCount() : 0) - .set("deleted_delete_files_count", manifest.content() == DELETES ? manifest.deletedFilesCount() : 0) - .set("partition_summaries", Lists.transform(manifest.partitions(), partition -> - summaryBuilder - .set("contains_null", true) - .set("contains_nan", false) - .set("lower_bound", "1") - .set("upper_bound", "1") - .build() - )) - .build() - ); + GenericRecordBuilder builder = + new GenericRecordBuilder(AvroSchemaUtil.convert(manifestTable.schema(), "manifests")); + GenericRecordBuilder summaryBuilder = + new GenericRecordBuilder( + AvroSchemaUtil.convert( + manifestTable.schema().findType("partition_summaries.element").asStructType(), + "partition_summary")); + List expected = + Lists.transform( + table.currentSnapshot().allManifests(), + manifest -> + builder + .set("content", manifest.content().id()) + .set("path", manifest.path()) + .set("length", manifest.length()) + .set("partition_spec_id", manifest.partitionSpecId()) + .set("added_snapshot_id", manifest.snapshotId()) + .set( + "added_data_files_count", + manifest.content() == DATA ? manifest.addedFilesCount() : 0) + .set( + "existing_data_files_count", + manifest.content() == DATA ? manifest.existingFilesCount() : 0) + .set( + "deleted_data_files_count", + manifest.content() == DATA ? manifest.deletedFilesCount() : 0) + .set( + "added_delete_files_count", + manifest.content() == DELETES ? manifest.addedFilesCount() : 0) + .set( + "existing_delete_files_count", + manifest.content() == DELETES ? manifest.existingFilesCount() : 0) + .set( + "deleted_delete_files_count", + manifest.content() == DELETES ? 
manifest.deletedFilesCount() : 0) + .set( + "partition_summaries", + Lists.transform( + manifest.partitions(), + partition -> + summaryBuilder + .set("contains_null", true) + .set("contains_nan", false) + .set("lower_bound", "1") + .set("upper_bound", "1") + .build())) + .build()); Assert.assertEquals("Manifests table should have one manifest row", 1, actual.size()); TestHelpers.assertEqualsSafe(manifestTable.schema().asStruct(), expected.get(0), actual.get(0)); @@ -943,56 +1068,77 @@ public void testPruneManifestsTable() { TableIdentifier tableIdentifier = TableIdentifier.of("db", "manifests_test"); Table table = createTable(tableIdentifier, SCHEMA, SPEC); Table manifestTable = loadTable(tableIdentifier, "manifests"); - Dataset df1 = spark.createDataFrame( - Lists.newArrayList(new SimpleRecord(1, "a"), new SimpleRecord(null, "b")), SimpleRecord.class); + Dataset df1 = + spark.createDataFrame( + Lists.newArrayList(new SimpleRecord(1, "a"), new SimpleRecord(null, "b")), + SimpleRecord.class); - df1.select("id", "data").write() + df1.select("id", "data") + .write() .format("iceberg") .mode("append") .save(loadLocation(tableIdentifier)); if (!spark.version().startsWith("2")) { // Spark 2 isn't able to actually push down nested struct projections so this will not break - AssertHelpers.assertThrows("Can't prune struct inside list", SparkException.class, + AssertHelpers.assertThrows( + "Can't prune struct inside list", + SparkException.class, "Cannot project a partial list element struct", - () -> spark.read() - .format("iceberg") - .load(loadLocation(tableIdentifier, "manifests")) - .select("partition_spec_id", "path", "partition_summaries.contains_null") - .collectAsList()); + () -> + spark + .read() + .format("iceberg") + .load(loadLocation(tableIdentifier, "manifests")) + .select("partition_spec_id", "path", "partition_summaries.contains_null") + .collectAsList()); } - Dataset actualDf = spark.read() - .format("iceberg") - .load(loadLocation(tableIdentifier, "manifests")) - .select("partition_spec_id", "path", "partition_summaries"); + Dataset actualDf = + spark + .read() + .format("iceberg") + .load(loadLocation(tableIdentifier, "manifests")) + .select("partition_spec_id", "path", "partition_summaries"); Schema projectedSchema = SparkSchemaUtil.convert(actualDf.schema()); - List actual = spark.read() - .format("iceberg") - .load(loadLocation(tableIdentifier, "manifests")) - .select("partition_spec_id", "path", "partition_summaries") - .collectAsList(); + List actual = + spark + .read() + .format("iceberg") + .load(loadLocation(tableIdentifier, "manifests")) + .select("partition_spec_id", "path", "partition_summaries") + .collectAsList(); table.refresh(); - GenericRecordBuilder builder = new GenericRecordBuilder(AvroSchemaUtil.convert(projectedSchema.asStruct())); - GenericRecordBuilder summaryBuilder = new GenericRecordBuilder(AvroSchemaUtil.convert( - projectedSchema.findType("partition_summaries.element").asStructType(), "partition_summary")); - List expected = Lists.transform(table.currentSnapshot().allManifests(), manifest -> - builder.set("partition_spec_id", manifest.partitionSpecId()) - .set("path", manifest.path()) - .set("partition_summaries", Lists.transform(manifest.partitions(), partition -> - summaryBuilder - .set("contains_null", true) - .set("contains_nan", false) - .set("lower_bound", "1") - .set("upper_bound", "1") - .build() - )) - .build() - ); + GenericRecordBuilder builder = + new GenericRecordBuilder(AvroSchemaUtil.convert(projectedSchema.asStruct())); + 
GenericRecordBuilder summaryBuilder = + new GenericRecordBuilder( + AvroSchemaUtil.convert( + projectedSchema.findType("partition_summaries.element").asStructType(), + "partition_summary")); + List expected = + Lists.transform( + table.currentSnapshot().allManifests(), + manifest -> + builder + .set("partition_spec_id", manifest.partitionSpecId()) + .set("path", manifest.path()) + .set( + "partition_summaries", + Lists.transform( + manifest.partitions(), + partition -> + summaryBuilder + .set("contains_null", true) + .set("contains_nan", false) + .set("lower_bound", "1") + .set("upper_bound", "1") + .build())) + .build()); Assert.assertEquals("Manifests table should have one manifest row", 1, actual.size()); TestHelpers.assertEqualsSafe(projectedSchema.asStruct(), expected.get(0), actual.get(0)); @@ -1003,53 +1149,62 @@ public void testAllManifestsTable() { TableIdentifier tableIdentifier = TableIdentifier.of("db", "manifests_test"); Table table = createTable(tableIdentifier, SCHEMA, SPEC); Table manifestTable = loadTable(tableIdentifier, "all_manifests"); - Dataset df1 = spark.createDataFrame(Lists.newArrayList(new SimpleRecord(1, "a")), SimpleRecord.class); + Dataset df1 = + spark.createDataFrame(Lists.newArrayList(new SimpleRecord(1, "a")), SimpleRecord.class); - df1.select("id", "data").write() + df1.select("id", "data") + .write() .format("iceberg") .mode("append") .save(loadLocation(tableIdentifier)); - table.updateProperties() - .set(TableProperties.FORMAT_VERSION, "2") - .commit(); + table.updateProperties().set(TableProperties.FORMAT_VERSION, "2").commit(); - DataFile dataFile = Iterables.getFirst(table.currentSnapshot().addedDataFiles(table.io()), null); + DataFile dataFile = + Iterables.getFirst(table.currentSnapshot().addedDataFiles(table.io()), null); PartitionSpec dataFileSpec = table.specs().get(dataFile.specId()); StructLike dataFilePartition = dataFile.partition(); PositionDelete delete = PositionDelete.create(); delete.set(dataFile.path(), 0L, null); - DeleteFile deleteFile = writePositionDeletes(table, dataFileSpec, dataFilePartition, ImmutableList.of(delete)); + DeleteFile deleteFile = + writePositionDeletes(table, dataFileSpec, dataFilePartition, ImmutableList.of(delete)); - table.newRowDelta() - .addDeletes(deleteFile) - .commit(); + table.newRowDelta().addDeletes(deleteFile).commit(); table.newDelete().deleteFromRowFilter(Expressions.alwaysTrue()).commit(); Stream> snapshotIdToManifests = StreamSupport.stream(table.snapshots().spliterator(), false) - .flatMap(snapshot -> snapshot.allManifests(table.io()).stream().map( - manifest -> Pair.of(snapshot.snapshotId(), manifest))); - - List actual = spark.read() - .format("iceberg") - .load(loadLocation(tableIdentifier, "all_manifests")) - .orderBy("path") - .collectAsList(); + .flatMap( + snapshot -> + snapshot.allManifests(table.io()).stream() + .map(manifest -> Pair.of(snapshot.snapshotId(), manifest))); + + List actual = + spark + .read() + .format("iceberg") + .load(loadLocation(tableIdentifier, "all_manifests")) + .orderBy("path") + .collectAsList(); table.refresh(); - List expected = snapshotIdToManifests - .map(snapshotManifest -> manifestRecord(manifestTable, snapshotManifest.first(), snapshotManifest.second())) - .collect(Collectors.toList()); + List expected = + snapshotIdToManifests + .map( + snapshotManifest -> + manifestRecord( + manifestTable, snapshotManifest.first(), snapshotManifest.second())) + .collect(Collectors.toList()); expected.sort(Comparator.comparing(o -> o.get("path").toString())); 
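testAllManifestsTable is the one test in this group that needs format-v2 semantics: it bumps the format version, writes a positional delete against the first data file through the test's own writePositionDeletes helper (reformatted further down in this file), and commits it as a row delta so that delete manifests show up in all_manifests. Stripped of the assertions, the hunk above does roughly:

    table.updateProperties().set(TableProperties.FORMAT_VERSION, "2").commit();  // row deltas need v2
    DataFile dataFile =
        Iterables.getFirst(table.currentSnapshot().addedDataFiles(table.io()), null);
    PositionDelete delete = PositionDelete.create();
    delete.set(dataFile.path(), 0L, null);               // mark row 0 of that data file deleted
    DeleteFile deleteFile =
        writePositionDeletes(
            table, table.specs().get(dataFile.specId()), dataFile.partition(),
            ImmutableList.of(delete));
    table.newRowDelta().addDeletes(deleteFile).commit();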
Assert.assertEquals("Manifests table should have 5 manifest rows", 5, actual.size()); for (int i = 0; i < expected.size(); i += 1) { - TestHelpers.assertEqualsSafe(manifestTable.schema().asStruct(), expected.get(i), actual.get(i)); + TestHelpers.assertEqualsSafe( + manifestTable.schema().asStruct(), expected.get(i), actual.get(i)); } } @@ -1058,33 +1213,37 @@ public void testUnpartitionedPartitionsTable() { TableIdentifier tableIdentifier = TableIdentifier.of("db", "unpartitioned_partitions_test"); createTable(tableIdentifier, SCHEMA, PartitionSpec.unpartitioned()); - Dataset df = spark.createDataFrame(Lists.newArrayList(new SimpleRecord(1, "a")), SimpleRecord.class); + Dataset df = + spark.createDataFrame(Lists.newArrayList(new SimpleRecord(1, "a")), SimpleRecord.class); - df.select("id", "data").write() + df.select("id", "data") + .write() .format("iceberg") .mode("append") .save(loadLocation(tableIdentifier)); - Types.StructType expectedSchema = Types.StructType.of( - required(2, "record_count", Types.LongType.get()), - required(3, "file_count", Types.IntegerType.get())); + Types.StructType expectedSchema = + Types.StructType.of( + required(2, "record_count", Types.LongType.get()), + required(3, "file_count", Types.IntegerType.get())); Table partitionsTable = loadTable(tableIdentifier, "partitions"); - Assert.assertEquals("Schema should not have partition field", - expectedSchema, partitionsTable.schema().asStruct()); + Assert.assertEquals( + "Schema should not have partition field", + expectedSchema, + partitionsTable.schema().asStruct()); - GenericRecordBuilder builder = new GenericRecordBuilder(AvroSchemaUtil.convert( - partitionsTable.schema(), "partitions")); - GenericData.Record expectedRow = builder - .set("record_count", 1L) - .set("file_count", 1) - .build(); + GenericRecordBuilder builder = + new GenericRecordBuilder(AvroSchemaUtil.convert(partitionsTable.schema(), "partitions")); + GenericData.Record expectedRow = builder.set("record_count", 1L).set("file_count", 1).build(); - List actual = spark.read() - .format("iceberg") - .load(loadLocation(tableIdentifier, "partitions")) - .collectAsList(); + List actual = + spark + .read() + .format("iceberg") + .load(loadLocation(tableIdentifier, "partitions")) + .collectAsList(); Assert.assertEquals("Unpartitioned partitions table should have one row", 1, actual.size()); TestHelpers.assertEqualsSafe(expectedSchema, expectedRow, actual.get(0)); @@ -1095,10 +1254,13 @@ public void testPartitionsTable() { TableIdentifier tableIdentifier = TableIdentifier.of("db", "partitions_test"); Table table = createTable(tableIdentifier, SCHEMA, SPEC); Table partitionsTable = loadTable(tableIdentifier, "partitions"); - Dataset df1 = spark.createDataFrame(Lists.newArrayList(new SimpleRecord(1, "a")), SimpleRecord.class); - Dataset df2 = spark.createDataFrame(Lists.newArrayList(new SimpleRecord(2, "b")), SimpleRecord.class); + Dataset df1 = + spark.createDataFrame(Lists.newArrayList(new SimpleRecord(1, "a")), SimpleRecord.class); + Dataset df2 = + spark.createDataFrame(Lists.newArrayList(new SimpleRecord(2, "b")), SimpleRecord.class); - df1.select("id", "data").write() + df1.select("id", "data") + .write() .format("iceberg") .mode("append") .save(loadLocation(tableIdentifier)); @@ -1107,69 +1269,86 @@ public void testPartitionsTable() { long firstCommitId = table.currentSnapshot().snapshotId(); // add a second file - df2.select("id", "data").write() + df2.select("id", "data") + .write() .format("iceberg") .mode("append") 
.save(loadLocation(tableIdentifier)); - List actual = spark.read() - .format("iceberg") - .load(loadLocation(tableIdentifier, "partitions")) - .orderBy("partition.id") - .collectAsList(); - - GenericRecordBuilder builder = new GenericRecordBuilder(AvroSchemaUtil.convert( - partitionsTable.schema(), "partitions")); - GenericRecordBuilder partitionBuilder = new GenericRecordBuilder(AvroSchemaUtil.convert( - partitionsTable.schema().findType("partition").asStructType(), "partition")); + List actual = + spark + .read() + .format("iceberg") + .load(loadLocation(tableIdentifier, "partitions")) + .orderBy("partition.id") + .collectAsList(); + + GenericRecordBuilder builder = + new GenericRecordBuilder(AvroSchemaUtil.convert(partitionsTable.schema(), "partitions")); + GenericRecordBuilder partitionBuilder = + new GenericRecordBuilder( + AvroSchemaUtil.convert( + partitionsTable.schema().findType("partition").asStructType(), "partition")); List expected = Lists.newArrayList(); - expected.add(builder - .set("partition", partitionBuilder.set("id", 1).build()) - .set("record_count", 1L) - .set("file_count", 1) - .set("spec_id", 0) - .build()); - expected.add(builder - .set("partition", partitionBuilder.set("id", 2).build()) - .set("record_count", 1L) - .set("file_count", 1) - .set("spec_id", 0) - .build()); + expected.add( + builder + .set("partition", partitionBuilder.set("id", 1).build()) + .set("record_count", 1L) + .set("file_count", 1) + .set("spec_id", 0) + .build()); + expected.add( + builder + .set("partition", partitionBuilder.set("id", 2).build()) + .set("record_count", 1L) + .set("file_count", 1) + .set("spec_id", 0) + .build()); Assert.assertEquals("Partitions table should have two rows", 2, expected.size()); Assert.assertEquals("Actual results should have two rows", 2, actual.size()); for (int i = 0; i < 2; i += 1) { - TestHelpers.assertEqualsSafe(partitionsTable.schema().asStruct(), expected.get(i), actual.get(i)); + TestHelpers.assertEqualsSafe( + partitionsTable.schema().asStruct(), expected.get(i), actual.get(i)); } // check time travel - List actualAfterFirstCommit = spark.read() - .format("iceberg") - .option(SparkReadOptions.SNAPSHOT_ID, String.valueOf(firstCommitId)) - .load(loadLocation(tableIdentifier, "partitions")) - .orderBy("partition.id") - .collectAsList(); + List actualAfterFirstCommit = + spark + .read() + .format("iceberg") + .option(SparkReadOptions.SNAPSHOT_ID, String.valueOf(firstCommitId)) + .load(loadLocation(tableIdentifier, "partitions")) + .orderBy("partition.id") + .collectAsList(); Assert.assertEquals("Actual results should have one row", 1, actualAfterFirstCommit.size()); - TestHelpers.assertEqualsSafe(partitionsTable.schema().asStruct(), expected.get(0), actualAfterFirstCommit.get(0)); + TestHelpers.assertEqualsSafe( + partitionsTable.schema().asStruct(), expected.get(0), actualAfterFirstCommit.get(0)); // check predicate push down - List filtered = spark.read() - .format("iceberg") - .load(loadLocation(tableIdentifier, "partitions")) - .filter("partition.id < 2") - .collectAsList(); + List filtered = + spark + .read() + .format("iceberg") + .load(loadLocation(tableIdentifier, "partitions")) + .filter("partition.id < 2") + .collectAsList(); Assert.assertEquals("Actual results should have one row", 1, filtered.size()); - TestHelpers.assertEqualsSafe(partitionsTable.schema().asStruct(), expected.get(0), filtered.get(0)); - - List nonFiltered = spark.read() - .format("iceberg") - .load(loadLocation(tableIdentifier, "partitions")) - .filter("partition.id < 2 or 
record_count=1") - .collectAsList(); + TestHelpers.assertEqualsSafe( + partitionsTable.schema().asStruct(), expected.get(0), filtered.get(0)); + + List nonFiltered = + spark + .read() + .format("iceberg") + .load(loadLocation(tableIdentifier, "partitions")) + .filter("partition.id < 2 or record_count=1") + .collectAsList(); Assert.assertEquals("Actual results should have one row", 2, nonFiltered.size()); for (int i = 0; i < 2; i += 1) { - TestHelpers.assertEqualsSafe(partitionsTable.schema().asStruct(), expected.get(i), actual.get(i)); + TestHelpers.assertEqualsSafe( + partitionsTable.schema().asStruct(), expected.get(i), actual.get(i)); } } @@ -1178,62 +1357,63 @@ public synchronized void testSnapshotReadAfterAddColumn() { TableIdentifier tableIdentifier = TableIdentifier.of("db", "table"); Table table = createTable(tableIdentifier, SCHEMA, PartitionSpec.unpartitioned()); - List originalRecords = Lists.newArrayList( - RowFactory.create(1, "x"), - RowFactory.create(2, "y"), - RowFactory.create(3, "z")); + List originalRecords = + Lists.newArrayList( + RowFactory.create(1, "x"), RowFactory.create(2, "y"), RowFactory.create(3, "z")); StructType originalSparkSchema = SparkSchemaUtil.convert(SCHEMA); Dataset inputDf = spark.createDataFrame(originalRecords, originalSparkSchema); - inputDf.select("id", "data").write() + inputDf + .select("id", "data") + .write() .format("iceberg") .mode(SaveMode.Append) .save(loadLocation(tableIdentifier)); table.refresh(); - Dataset resultDf = spark.read() - .format("iceberg") - .load(loadLocation(tableIdentifier)); - Assert.assertEquals("Records should match", originalRecords, - resultDf.orderBy("id").collectAsList()); + Dataset resultDf = spark.read().format("iceberg").load(loadLocation(tableIdentifier)); + Assert.assertEquals( + "Records should match", originalRecords, resultDf.orderBy("id").collectAsList()); Snapshot snapshot1 = table.currentSnapshot(); table.updateSchema().addColumn("category", Types.StringType.get()).commit(); - List newRecords = Lists.newArrayList( - RowFactory.create(4, "xy", "B"), - RowFactory.create(5, "xyz", "C")); + List newRecords = + Lists.newArrayList(RowFactory.create(4, "xy", "B"), RowFactory.create(5, "xyz", "C")); StructType newSparkSchema = SparkSchemaUtil.convert(SCHEMA2); Dataset inputDf2 = spark.createDataFrame(newRecords, newSparkSchema); - inputDf2.select("id", "data", "category").write() + inputDf2 + .select("id", "data", "category") + .write() .format("iceberg") .mode(SaveMode.Append) .save(loadLocation(tableIdentifier)); table.refresh(); - List updatedRecords = Lists.newArrayList( - RowFactory.create(1, "x", null), - RowFactory.create(2, "y", null), - RowFactory.create(3, "z", null), - RowFactory.create(4, "xy", "B"), - RowFactory.create(5, "xyz", "C")); - - Dataset resultDf2 = spark.read() - .format("iceberg") - .load(loadLocation(tableIdentifier)); - Assert.assertEquals("Records should match", updatedRecords, - resultDf2.orderBy("id").collectAsList()); - - Dataset resultDf3 = spark.read() - .format("iceberg") - .option(SparkReadOptions.SNAPSHOT_ID, snapshot1.snapshotId()) - .load(loadLocation(tableIdentifier)); - Assert.assertEquals("Records should match", originalRecords, - resultDf3.orderBy("id").collectAsList()); + List updatedRecords = + Lists.newArrayList( + RowFactory.create(1, "x", null), + RowFactory.create(2, "y", null), + RowFactory.create(3, "z", null), + RowFactory.create(4, "xy", "B"), + RowFactory.create(5, "xyz", "C")); + + Dataset resultDf2 = 
spark.read().format("iceberg").load(loadLocation(tableIdentifier)); + Assert.assertEquals( + "Records should match", updatedRecords, resultDf2.orderBy("id").collectAsList()); + + Dataset resultDf3 = + spark + .read() + .format("iceberg") + .option(SparkReadOptions.SNAPSHOT_ID, snapshot1.snapshotId()) + .load(loadLocation(tableIdentifier)); + Assert.assertEquals( + "Records should match", originalRecords, resultDf3.orderBy("id").collectAsList()); Assert.assertEquals("Schemas should match", originalSparkSchema, resultDf3.schema()); } @@ -1242,72 +1422,76 @@ public synchronized void testSnapshotReadAfterDropColumn() { TableIdentifier tableIdentifier = TableIdentifier.of("db", "table"); Table table = createTable(tableIdentifier, SCHEMA2, PartitionSpec.unpartitioned()); - List originalRecords = Lists.newArrayList( - RowFactory.create(1, "x", "A"), - RowFactory.create(2, "y", "A"), - RowFactory.create(3, "z", "B")); + List originalRecords = + Lists.newArrayList( + RowFactory.create(1, "x", "A"), + RowFactory.create(2, "y", "A"), + RowFactory.create(3, "z", "B")); StructType originalSparkSchema = SparkSchemaUtil.convert(SCHEMA2); Dataset inputDf = spark.createDataFrame(originalRecords, originalSparkSchema); - inputDf.select("id", "data", "category").write() + inputDf + .select("id", "data", "category") + .write() .format("iceberg") .mode(SaveMode.Append) .save(loadLocation(tableIdentifier)); table.refresh(); - Dataset resultDf = spark.read() - .format("iceberg") - .load(loadLocation(tableIdentifier)); - Assert.assertEquals("Records should match", originalRecords, - resultDf.orderBy("id").collectAsList()); + Dataset resultDf = spark.read().format("iceberg").load(loadLocation(tableIdentifier)); + Assert.assertEquals( + "Records should match", originalRecords, resultDf.orderBy("id").collectAsList()); long tsBeforeDropColumn = waitUntilAfter(System.currentTimeMillis()); table.updateSchema().deleteColumn("data").commit(); long tsAfterDropColumn = waitUntilAfter(System.currentTimeMillis()); - List newRecords = Lists.newArrayList( - RowFactory.create(4, "B"), - RowFactory.create(5, "C")); + List newRecords = Lists.newArrayList(RowFactory.create(4, "B"), RowFactory.create(5, "C")); StructType newSparkSchema = SparkSchemaUtil.convert(SCHEMA3); Dataset inputDf2 = spark.createDataFrame(newRecords, newSparkSchema); - inputDf2.select("id", "category").write() + inputDf2 + .select("id", "category") + .write() .format("iceberg") .mode(SaveMode.Append) .save(loadLocation(tableIdentifier)); table.refresh(); - List updatedRecords = Lists.newArrayList( - RowFactory.create(1, "A"), - RowFactory.create(2, "A"), - RowFactory.create(3, "B"), - RowFactory.create(4, "B"), - RowFactory.create(5, "C")); - - Dataset resultDf2 = spark.read() - .format("iceberg") - .load(loadLocation(tableIdentifier)); - Assert.assertEquals("Records should match", updatedRecords, - resultDf2.orderBy("id").collectAsList()); - - Dataset resultDf3 = spark.read() - .format("iceberg") - .option(SparkReadOptions.AS_OF_TIMESTAMP, tsBeforeDropColumn) - .load(loadLocation(tableIdentifier)); - Assert.assertEquals("Records should match", originalRecords, - resultDf3.orderBy("id").collectAsList()); + List updatedRecords = + Lists.newArrayList( + RowFactory.create(1, "A"), + RowFactory.create(2, "A"), + RowFactory.create(3, "B"), + RowFactory.create(4, "B"), + RowFactory.create(5, "C")); + + Dataset resultDf2 = spark.read().format("iceberg").load(loadLocation(tableIdentifier)); + Assert.assertEquals( + "Records should match", updatedRecords, 
resultDf2.orderBy("id").collectAsList()); + + Dataset resultDf3 = + spark + .read() + .format("iceberg") + .option(SparkReadOptions.AS_OF_TIMESTAMP, tsBeforeDropColumn) + .load(loadLocation(tableIdentifier)); + Assert.assertEquals( + "Records should match", originalRecords, resultDf3.orderBy("id").collectAsList()); Assert.assertEquals("Schemas should match", originalSparkSchema, resultDf3.schema()); // At tsAfterDropColumn, there has been a schema change, but no new snapshot, // so the snapshot as of tsAfterDropColumn is the same as that as of tsBeforeDropColumn. - Dataset resultDf4 = spark.read() - .format("iceberg") - .option(SparkReadOptions.AS_OF_TIMESTAMP, tsAfterDropColumn) - .load(loadLocation(tableIdentifier)); - Assert.assertEquals("Records should match", originalRecords, - resultDf4.orderBy("id").collectAsList()); + Dataset resultDf4 = + spark + .read() + .format("iceberg") + .option(SparkReadOptions.AS_OF_TIMESTAMP, tsAfterDropColumn) + .load(loadLocation(tableIdentifier)); + Assert.assertEquals( + "Records should match", originalRecords, resultDf4.orderBy("id").collectAsList()); Assert.assertEquals("Schemas should match", originalSparkSchema, resultDf4.schema()); } @@ -1316,77 +1500,77 @@ public synchronized void testSnapshotReadAfterAddAndDropColumn() { TableIdentifier tableIdentifier = TableIdentifier.of("db", "table"); Table table = createTable(tableIdentifier, SCHEMA, PartitionSpec.unpartitioned()); - List originalRecords = Lists.newArrayList( - RowFactory.create(1, "x"), - RowFactory.create(2, "y"), - RowFactory.create(3, "z")); + List originalRecords = + Lists.newArrayList( + RowFactory.create(1, "x"), RowFactory.create(2, "y"), RowFactory.create(3, "z")); StructType originalSparkSchema = SparkSchemaUtil.convert(SCHEMA); Dataset inputDf = spark.createDataFrame(originalRecords, originalSparkSchema); - inputDf.select("id", "data").write() + inputDf + .select("id", "data") + .write() .format("iceberg") .mode(SaveMode.Append) .save(loadLocation(tableIdentifier)); table.refresh(); - Dataset resultDf = spark.read() - .format("iceberg") - .load(loadLocation(tableIdentifier)); - Assert.assertEquals("Records should match", originalRecords, - resultDf.orderBy("id").collectAsList()); + Dataset resultDf = spark.read().format("iceberg").load(loadLocation(tableIdentifier)); + Assert.assertEquals( + "Records should match", originalRecords, resultDf.orderBy("id").collectAsList()); Snapshot snapshot1 = table.currentSnapshot(); table.updateSchema().addColumn("category", Types.StringType.get()).commit(); - List newRecords = Lists.newArrayList( - RowFactory.create(4, "xy", "B"), - RowFactory.create(5, "xyz", "C")); + List newRecords = + Lists.newArrayList(RowFactory.create(4, "xy", "B"), RowFactory.create(5, "xyz", "C")); StructType sparkSchemaAfterAddColumn = SparkSchemaUtil.convert(SCHEMA2); Dataset inputDf2 = spark.createDataFrame(newRecords, sparkSchemaAfterAddColumn); - inputDf2.select("id", "data", "category").write() + inputDf2 + .select("id", "data", "category") + .write() .format("iceberg") .mode(SaveMode.Append) .save(loadLocation(tableIdentifier)); table.refresh(); - List updatedRecords = Lists.newArrayList( - RowFactory.create(1, "x", null), - RowFactory.create(2, "y", null), - RowFactory.create(3, "z", null), - RowFactory.create(4, "xy", "B"), - RowFactory.create(5, "xyz", "C")); + List updatedRecords = + Lists.newArrayList( + RowFactory.create(1, "x", null), + RowFactory.create(2, "y", null), + RowFactory.create(3, "z", null), + RowFactory.create(4, "xy", "B"), + 
RowFactory.create(5, "xyz", "C")); - Dataset resultDf2 = spark.read() - .format("iceberg") - .load(loadLocation(tableIdentifier)); - Assert.assertEquals("Records should match", updatedRecords, - resultDf2.orderBy("id").collectAsList()); + Dataset resultDf2 = spark.read().format("iceberg").load(loadLocation(tableIdentifier)); + Assert.assertEquals( + "Records should match", updatedRecords, resultDf2.orderBy("id").collectAsList()); table.updateSchema().deleteColumn("data").commit(); - List recordsAfterDropColumn = Lists.newArrayList( - RowFactory.create(1, null), - RowFactory.create(2, null), - RowFactory.create(3, null), - RowFactory.create(4, "B"), - RowFactory.create(5, "C")); - - Dataset resultDf3 = spark.read() - .format("iceberg") - .load(loadLocation(tableIdentifier)); - Assert.assertEquals("Records should match", recordsAfterDropColumn, - resultDf3.orderBy("id").collectAsList()); - - Dataset resultDf4 = spark.read() - .format("iceberg") - .option(SparkReadOptions.SNAPSHOT_ID, snapshot1.snapshotId()) - .load(loadLocation(tableIdentifier)); - Assert.assertEquals("Records should match", originalRecords, - resultDf4.orderBy("id").collectAsList()); + List recordsAfterDropColumn = + Lists.newArrayList( + RowFactory.create(1, null), + RowFactory.create(2, null), + RowFactory.create(3, null), + RowFactory.create(4, "B"), + RowFactory.create(5, "C")); + + Dataset resultDf3 = spark.read().format("iceberg").load(loadLocation(tableIdentifier)); + Assert.assertEquals( + "Records should match", recordsAfterDropColumn, resultDf3.orderBy("id").collectAsList()); + + Dataset resultDf4 = + spark + .read() + .format("iceberg") + .option(SparkReadOptions.SNAPSHOT_ID, snapshot1.snapshotId()) + .load(loadLocation(tableIdentifier)); + Assert.assertEquals( + "Records should match", originalRecords, resultDf4.orderBy("id").collectAsList()); Assert.assertEquals("Schemas should match", originalSparkSchema, resultDf4.schema()); } @@ -1395,13 +1579,12 @@ public void testRemoveOrphanFilesActionSupport() throws InterruptedException { TableIdentifier tableIdentifier = TableIdentifier.of("db", "table"); Table table = createTable(tableIdentifier, SCHEMA, PartitionSpec.unpartitioned()); - List records = Lists.newArrayList( - new SimpleRecord(1, "1") - ); + List records = Lists.newArrayList(new SimpleRecord(1, "1")); Dataset df = spark.createDataFrame(records, SimpleRecord.class); - df.select("id", "data").write() + df.select("id", "data") + .write() .format("iceberg") .mode("append") .save(loadLocation(tableIdentifier)); @@ -1413,21 +1596,23 @@ public void testRemoveOrphanFilesActionSupport() throws InterruptedException { SparkActions actions = SparkActions.get(); - DeleteOrphanFiles.Result result1 = actions.deleteOrphanFiles(table) - .location(table.location() + "/metadata") - .olderThan(System.currentTimeMillis()) - .execute(); - Assert.assertTrue("Should not delete any metadata files", Iterables.isEmpty(result1.orphanFileLocations())); + DeleteOrphanFiles.Result result1 = + actions + .deleteOrphanFiles(table) + .location(table.location() + "/metadata") + .olderThan(System.currentTimeMillis()) + .execute(); + Assert.assertTrue( + "Should not delete any metadata files", Iterables.isEmpty(result1.orphanFileLocations())); - DeleteOrphanFiles.Result result2 = actions.deleteOrphanFiles(table) - .olderThan(System.currentTimeMillis()) - .execute(); - Assert.assertEquals("Should delete 1 data file", 1, Iterables.size(result2.orphanFileLocations())); + DeleteOrphanFiles.Result result2 = + 
actions.deleteOrphanFiles(table).olderThan(System.currentTimeMillis()).execute(); + Assert.assertEquals( + "Should delete 1 data file", 1, Iterables.size(result2.orphanFileLocations())); Dataset resultDF = spark.read().format("iceberg").load(loadLocation(tableIdentifier)); - List actualRecords = resultDF - .as(Encoders.bean(SimpleRecord.class)) - .collectAsList(); + List actualRecords = + resultDF.as(Encoders.bean(SimpleRecord.class)).collectAsList(); Assert.assertEquals("Rows must match", records, actualRecords); } @@ -1437,22 +1622,26 @@ public void testAllManifestTableSnapshotFiltering() throws Exception { TableIdentifier tableIdentifier = TableIdentifier.of("db", "all_manifest_snapshot_filtering"); Table table = createTable(tableIdentifier, SCHEMA, SPEC); Table manifestTable = loadTable(tableIdentifier, "all_manifests"); - Dataset df = spark.createDataFrame(Lists.newArrayList(new SimpleRecord(1, "a")), SimpleRecord.class); + Dataset df = + spark.createDataFrame(Lists.newArrayList(new SimpleRecord(1, "a")), SimpleRecord.class); List> snapshotIdToManifests = Lists.newArrayList(); - df.select("id", "data").write() + df.select("id", "data") + .write() .format("iceberg") .mode("append") .save(loadLocation(tableIdentifier)); table.refresh(); Snapshot snapshot1 = table.currentSnapshot(); - snapshotIdToManifests.addAll(snapshot1.allManifests().stream() - .map(manifest -> Pair.of(snapshot1.snapshotId(), manifest)) - .collect(Collectors.toList())); + snapshotIdToManifests.addAll( + snapshot1.allManifests().stream() + .map(manifest -> Pair.of(snapshot1.snapshotId(), manifest)) + .collect(Collectors.toList())); - df.select("id", "data").write() + df.select("id", "data") + .write() .format("iceberg") .mode("append") .save(loadLocation(tableIdentifier)); @@ -1460,16 +1649,19 @@ public void testAllManifestTableSnapshotFiltering() throws Exception { table.refresh(); Snapshot snapshot2 = table.currentSnapshot(); Assert.assertEquals("Should have two manifests", 2, snapshot2.allManifests().size()); - snapshotIdToManifests.addAll(snapshot2.allManifests().stream() - .map(manifest -> Pair.of(snapshot2.snapshotId(), manifest)) - .collect(Collectors.toList())); + snapshotIdToManifests.addAll( + snapshot2.allManifests().stream() + .map(manifest -> Pair.of(snapshot2.snapshotId(), manifest)) + .collect(Collectors.toList())); // Add manifests that will not be selected - df.select("id", "data").write() + df.select("id", "data") + .write() .format("iceberg") .mode("append") .save(loadLocation(tableIdentifier)); - df.select("id", "data").write() + df.select("id", "data") + .write() .format("iceberg") .mode("append") .save(loadLocation(tableIdentifier)); @@ -1479,30 +1671,41 @@ public void testAllManifestTableSnapshotFiltering() throws Exception { snapshotIds.add(String.valueOf(snapshot2.snapshotId())); snapshotIds.toString(); - List actual = spark.read() - .format("iceberg") - .load(loadLocation(tableIdentifier, "all_manifests")) - .filter("reference_snapshot_id in " + snapshotIds) - .orderBy("path") - .collectAsList(); + List actual = + spark + .read() + .format("iceberg") + .load(loadLocation(tableIdentifier, "all_manifests")) + .filter("reference_snapshot_id in " + snapshotIds) + .orderBy("path") + .collectAsList(); table.refresh(); - List expected = snapshotIdToManifests.stream() - .map(snapshotManifest -> manifestRecord(manifestTable, snapshotManifest.first(), snapshotManifest.second())) - .collect(Collectors.toList()); + List expected = + snapshotIdToManifests.stream() + .map( + snapshotManifest -> + 
manifestRecord( + manifestTable, snapshotManifest.first(), snapshotManifest.second())) + .collect(Collectors.toList()); expected.sort(Comparator.comparing(o -> o.get("path").toString())); Assert.assertEquals("Manifests table should have 3 manifest rows", 3, actual.size()); for (int i = 0; i < expected.size(); i += 1) { - TestHelpers.assertEqualsSafe(manifestTable.schema().asStruct(), expected.get(i), actual.get(i)); + TestHelpers.assertEqualsSafe( + manifestTable.schema().asStruct(), expected.get(i), actual.get(i)); } } - private GenericData.Record manifestRecord(Table manifestTable, Long referenceSnapshotId, ManifestFile manifest) { - GenericRecordBuilder builder = new GenericRecordBuilder(AvroSchemaUtil.convert( - manifestTable.schema(), "manifests")); - GenericRecordBuilder summaryBuilder = new GenericRecordBuilder(AvroSchemaUtil.convert( - manifestTable.schema().findType("partition_summaries.element").asStructType(), "partition_summary")); + private GenericData.Record manifestRecord( + Table manifestTable, Long referenceSnapshotId, ManifestFile manifest) { + GenericRecordBuilder builder = + new GenericRecordBuilder(AvroSchemaUtil.convert(manifestTable.schema(), "manifests")); + GenericRecordBuilder summaryBuilder = + new GenericRecordBuilder( + AvroSchemaUtil.convert( + manifestTable.schema().findType("partition_summaries.element").asStructType(), + "partition_summary")); return builder .set("content", manifest.content().id()) .set("path", manifest.path()) @@ -1510,19 +1713,32 @@ private GenericData.Record manifestRecord(Table manifestTable, Long referenceSna .set("partition_spec_id", manifest.partitionSpecId()) .set("added_snapshot_id", manifest.snapshotId()) .set("added_data_files_count", manifest.content() == DATA ? manifest.addedFilesCount() : 0) - .set("existing_data_files_count", manifest.content() == DATA ? manifest.existingFilesCount() : 0) - .set("deleted_data_files_count", manifest.content() == DATA ? manifest.deletedFilesCount() : 0) - .set("added_delete_files_count", manifest.content() == DELETES ? manifest.addedFilesCount() : 0) - .set("existing_delete_files_count", manifest.content() == DELETES ? manifest.existingFilesCount() : 0) - .set("deleted_delete_files_count", manifest.content() == DELETES ? manifest.deletedFilesCount() : 0) - .set("partition_summaries", Lists.transform(manifest.partitions(), partition -> - summaryBuilder - .set("contains_null", false) - .set("contains_nan", false) - .set("lower_bound", "1") - .set("upper_bound", "1") - .build() - )) + .set( + "existing_data_files_count", + manifest.content() == DATA ? manifest.existingFilesCount() : 0) + .set( + "deleted_data_files_count", + manifest.content() == DATA ? manifest.deletedFilesCount() : 0) + .set( + "added_delete_files_count", + manifest.content() == DELETES ? manifest.addedFilesCount() : 0) + .set( + "existing_delete_files_count", + manifest.content() == DELETES ? manifest.existingFilesCount() : 0) + .set( + "deleted_delete_files_count", + manifest.content() == DELETES ? 
manifest.deletedFilesCount() : 0) + .set( + "partition_summaries", + Lists.transform( + manifest.partitions(), + partition -> + summaryBuilder + .set("contains_null", false) + .set("contains_nan", false) + .set("lower_bound", "1") + .set("upper_bound", "1") + .build())) .set("reference_snapshot_id", referenceSnapshotId) .build(); } @@ -1532,8 +1748,8 @@ private void asMetadataRecord(GenericData.Record file) { file.put(3, 0); // specId } - private PositionDeleteWriter newPositionDeleteWriter(Table table, PartitionSpec spec, - StructLike partition) { + private PositionDeleteWriter newPositionDeleteWriter( + Table table, PartitionSpec spec, StructLike partition) { OutputFileFactory fileFactory = OutputFileFactory.builderFor(table, 0, 0).build(); EncryptedOutputFile outputFile = fileFactory.newOutputFile(spec, partition); @@ -1541,9 +1757,13 @@ private PositionDeleteWriter newPositionDeleteWriter(Table table, P return fileWriterFactory.newPositionDeleteWriter(outputFile, spec, partition); } - private DeleteFile writePositionDeletes(Table table, PartitionSpec spec, StructLike partition, - Iterable> deletes) { - PositionDeleteWriter positionDeleteWriter = newPositionDeleteWriter(table, spec, partition); + private DeleteFile writePositionDeletes( + Table table, + PartitionSpec spec, + StructLike partition, + Iterable> deletes) { + PositionDeleteWriter positionDeleteWriter = + newPositionDeleteWriter(table, spec, partition); try (PositionDeleteWriter writer = positionDeleteWriter) { for (PositionDelete delete : deletes) { diff --git a/spark/v2.4/spark/src/test/java/org/apache/iceberg/spark/source/TestIcebergSpark.java b/spark/v2.4/spark/src/test/java/org/apache/iceberg/spark/source/TestIcebergSpark.java index c275daee5f7e..559668ee31a1 100644 --- a/spark/v2.4/spark/src/test/java/org/apache/iceberg/spark/source/TestIcebergSpark.java +++ b/spark/v2.4/spark/src/test/java/org/apache/iceberg/spark/source/TestIcebergSpark.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.spark.source; import java.math.BigDecimal; @@ -61,8 +60,8 @@ public void testRegisterIntegerBucketUDF() { IcebergSpark.registerBucketUDF(spark, "iceberg_bucket_int_16", DataTypes.IntegerType, 16); List results = spark.sql("SELECT iceberg_bucket_int_16(1)").collectAsList(); Assert.assertEquals(1, results.size()); - Assert.assertEquals((int) Transforms.bucket(Types.IntegerType.get(), 16).apply(1), - results.get(0).getInt(0)); + Assert.assertEquals( + (int) Transforms.bucket(Types.IntegerType.get(), 16).apply(1), results.get(0).getInt(0)); } @Test @@ -70,8 +69,8 @@ public void testRegisterShortBucketUDF() { IcebergSpark.registerBucketUDF(spark, "iceberg_bucket_short_16", DataTypes.ShortType, 16); List results = spark.sql("SELECT iceberg_bucket_short_16(1S)").collectAsList(); Assert.assertEquals(1, results.size()); - Assert.assertEquals((int) Transforms.bucket(Types.IntegerType.get(), 16).apply(1), - results.get(0).getInt(0)); + Assert.assertEquals( + (int) Transforms.bucket(Types.IntegerType.get(), 16).apply(1), results.get(0).getInt(0)); } @Test @@ -79,8 +78,8 @@ public void testRegisterByteBucketUDF() { IcebergSpark.registerBucketUDF(spark, "iceberg_bucket_byte_16", DataTypes.ByteType, 16); List results = spark.sql("SELECT iceberg_bucket_byte_16(1Y)").collectAsList(); Assert.assertEquals(1, results.size()); - Assert.assertEquals((int) Transforms.bucket(Types.IntegerType.get(), 16).apply(1), - results.get(0).getInt(0)); + Assert.assertEquals( + (int) Transforms.bucket(Types.IntegerType.get(), 16).apply(1), results.get(0).getInt(0)); } @Test @@ -88,8 +87,8 @@ public void testRegisterLongBucketUDF() { IcebergSpark.registerBucketUDF(spark, "iceberg_bucket_long_16", DataTypes.LongType, 16); List results = spark.sql("SELECT iceberg_bucket_long_16(1L)").collectAsList(); Assert.assertEquals(1, results.size()); - Assert.assertEquals((int) Transforms.bucket(Types.LongType.get(), 16).apply(1L), - results.get(0).getInt(0)); + Assert.assertEquals( + (int) Transforms.bucket(Types.LongType.get(), 16).apply(1L), results.get(0).getInt(0)); } @Test @@ -97,7 +96,8 @@ public void testRegisterStringBucketUDF() { IcebergSpark.registerBucketUDF(spark, "iceberg_bucket_string_16", DataTypes.StringType, 16); List results = spark.sql("SELECT iceberg_bucket_string_16('hello')").collectAsList(); Assert.assertEquals(1, results.size()); - Assert.assertEquals((int) Transforms.bucket(Types.StringType.get(), 16).apply("hello"), + Assert.assertEquals( + (int) Transforms.bucket(Types.StringType.get(), 16).apply("hello"), results.get(0).getInt(0)); } @@ -106,7 +106,8 @@ public void testRegisterCharBucketUDF() { IcebergSpark.registerBucketUDF(spark, "iceberg_bucket_char_16", new CharType(5), 16); List results = spark.sql("SELECT iceberg_bucket_char_16('hello')").collectAsList(); Assert.assertEquals(1, results.size()); - Assert.assertEquals((int) Transforms.bucket(Types.StringType.get(), 16).apply("hello"), + Assert.assertEquals( + (int) Transforms.bucket(Types.StringType.get(), 16).apply("hello"), results.get(0).getInt(0)); } @@ -115,73 +116,89 @@ public void testRegisterVarCharBucketUDF() { IcebergSpark.registerBucketUDF(spark, "iceberg_bucket_varchar_16", new VarcharType(5), 16); List results = spark.sql("SELECT iceberg_bucket_varchar_16('hello')").collectAsList(); Assert.assertEquals(1, results.size()); - Assert.assertEquals((int) Transforms.bucket(Types.StringType.get(), 16).apply("hello"), + Assert.assertEquals( + (int) Transforms.bucket(Types.StringType.get(), 16).apply("hello"), 
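Each of these TestIcebergSpark cases follows the same template: register a bucket UDF for a Spark type, evaluate it through Spark SQL, and check the result against the core bucket transform. For the integer case above, that is:

    IcebergSpark.registerBucketUDF(spark, "iceberg_bucket_int_16", DataTypes.IntegerType, 16);
    int fromSql = spark.sql("SELECT iceberg_bucket_int_16(1)").collectAsList().get(0).getInt(0);
    int fromTransform = (int) Transforms.bucket(Types.IntegerType.get(), 16).apply(1);
    // fromSql == fromTransform -- the registered UDF must agree with Transforms.bucket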
results.get(0).getInt(0)); } @Test public void testRegisterDateBucketUDF() { IcebergSpark.registerBucketUDF(spark, "iceberg_bucket_date_16", DataTypes.DateType, 16); - List results = spark.sql("SELECT iceberg_bucket_date_16(DATE '2021-06-30')").collectAsList(); + List results = + spark.sql("SELECT iceberg_bucket_date_16(DATE '2021-06-30')").collectAsList(); Assert.assertEquals(1, results.size()); - Assert.assertEquals((int) Transforms.bucket(Types.DateType.get(), 16) - .apply(DateTimeUtils.fromJavaDate(Date.valueOf("2021-06-30"))), + Assert.assertEquals( + (int) + Transforms.bucket(Types.DateType.get(), 16) + .apply(DateTimeUtils.fromJavaDate(Date.valueOf("2021-06-30"))), results.get(0).getInt(0)); } @Test public void testRegisterTimestampBucketUDF() { - IcebergSpark.registerBucketUDF(spark, "iceberg_bucket_timestamp_16", DataTypes.TimestampType, 16); + IcebergSpark.registerBucketUDF( + spark, "iceberg_bucket_timestamp_16", DataTypes.TimestampType, 16); List results = - spark.sql("SELECT iceberg_bucket_timestamp_16(TIMESTAMP '2021-06-30 00:00:00.000')").collectAsList(); + spark + .sql("SELECT iceberg_bucket_timestamp_16(TIMESTAMP '2021-06-30 00:00:00.000')") + .collectAsList(); Assert.assertEquals(1, results.size()); - Assert.assertEquals((int) Transforms.bucket(Types.TimestampType.withZone(), 16) - .apply(DateTimeUtils.fromJavaTimestamp(Timestamp.valueOf("2021-06-30 00:00:00.000"))), + Assert.assertEquals( + (int) + Transforms.bucket(Types.TimestampType.withZone(), 16) + .apply( + DateTimeUtils.fromJavaTimestamp(Timestamp.valueOf("2021-06-30 00:00:00.000"))), results.get(0).getInt(0)); } @Test public void testRegisterBinaryBucketUDF() { IcebergSpark.registerBucketUDF(spark, "iceberg_bucket_binary_16", DataTypes.BinaryType, 16); - List results = - spark.sql("SELECT iceberg_bucket_binary_16(X'0020001F')").collectAsList(); + List results = spark.sql("SELECT iceberg_bucket_binary_16(X'0020001F')").collectAsList(); Assert.assertEquals(1, results.size()); - Assert.assertEquals((int) Transforms.bucket(Types.BinaryType.get(), 16) - .apply(ByteBuffer.wrap(new byte[]{0x00, 0x20, 0x00, 0x1F})), + Assert.assertEquals( + (int) + Transforms.bucket(Types.BinaryType.get(), 16) + .apply(ByteBuffer.wrap(new byte[] {0x00, 0x20, 0x00, 0x1F})), results.get(0).getInt(0)); } @Test public void testRegisterDecimalBucketUDF() { IcebergSpark.registerBucketUDF(spark, "iceberg_bucket_decimal_16", new DecimalType(4, 2), 16); - List results = - spark.sql("SELECT iceberg_bucket_decimal_16(11.11)").collectAsList(); + List results = spark.sql("SELECT iceberg_bucket_decimal_16(11.11)").collectAsList(); Assert.assertEquals(1, results.size()); - Assert.assertEquals((int) Transforms.bucket(Types.DecimalType.of(4, 2), 16) - .apply(new BigDecimal("11.11")), + Assert.assertEquals( + (int) Transforms.bucket(Types.DecimalType.of(4, 2), 16).apply(new BigDecimal("11.11")), results.get(0).getInt(0)); } @Test public void testRegisterBooleanBucketUDF() { - Assertions.assertThatThrownBy(() -> - IcebergSpark.registerBucketUDF(spark, "iceberg_bucket_boolean_16", DataTypes.BooleanType, 16)) + Assertions.assertThatThrownBy( + () -> + IcebergSpark.registerBucketUDF( + spark, "iceberg_bucket_boolean_16", DataTypes.BooleanType, 16)) .isInstanceOf(IllegalArgumentException.class) .hasMessage("Cannot bucket by type: boolean"); } @Test public void testRegisterDoubleBucketUDF() { - Assertions.assertThatThrownBy(() -> - IcebergSpark.registerBucketUDF(spark, "iceberg_bucket_double_16", DataTypes.DoubleType, 16)) + Assertions.assertThatThrownBy( + () 
-> + IcebergSpark.registerBucketUDF( + spark, "iceberg_bucket_double_16", DataTypes.DoubleType, 16)) .isInstanceOf(IllegalArgumentException.class) .hasMessage("Cannot bucket by type: double"); } @Test public void testRegisterFloatBucketUDF() { - Assertions.assertThatThrownBy(() -> - IcebergSpark.registerBucketUDF(spark, "iceberg_bucket_float_16", DataTypes.FloatType, 16)) + Assertions.assertThatThrownBy( + () -> + IcebergSpark.registerBucketUDF( + spark, "iceberg_bucket_float_16", DataTypes.FloatType, 16)) .isInstanceOf(IllegalArgumentException.class) .hasMessage("Cannot bucket by type: float"); } diff --git a/spark/v2.4/spark/src/test/java/org/apache/iceberg/spark/source/TestIdentityPartitionData.java b/spark/v2.4/spark/src/test/java/org/apache/iceberg/spark/source/TestIdentityPartitionData.java index e07798301db8..7313c18cc09d 100644 --- a/spark/v2.4/spark/src/test/java/org/apache/iceberg/spark/source/TestIdentityPartitionData.java +++ b/spark/v2.4/spark/src/test/java/org/apache/iceberg/spark/source/TestIdentityPartitionData.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.source; import java.io.File; @@ -55,11 +54,11 @@ public class TestIdentityPartitionData extends SparkTestBase { @Parameterized.Parameters(name = "format = {0}, vectorized = {1}") public static Object[][] parameters() { return new Object[][] { - { "parquet", false }, - { "parquet", true }, - { "avro", false }, - { "orc", false }, - { "orc", true }, + {"parquet", false}, + {"parquet", true}, + {"avro", false}, + {"orc", false}, + {"orc", true}, }; } @@ -71,36 +70,37 @@ public TestIdentityPartitionData(String format, boolean vectorized) { this.vectorized = vectorized; } - private static final Schema LOG_SCHEMA = new Schema( - Types.NestedField.optional(1, "id", Types.IntegerType.get()), - Types.NestedField.optional(2, "date", Types.StringType.get()), - Types.NestedField.optional(3, "level", Types.StringType.get()), - Types.NestedField.optional(4, "message", Types.StringType.get()) - ); - - private static final List LOGS = ImmutableList.of( - LogMessage.debug("2020-02-02", "debug event 1"), - LogMessage.info("2020-02-02", "info event 1"), - LogMessage.debug("2020-02-02", "debug event 2"), - LogMessage.info("2020-02-03", "info event 2"), - LogMessage.debug("2020-02-03", "debug event 3"), - LogMessage.info("2020-02-03", "info event 3"), - LogMessage.error("2020-02-03", "error event 1"), - LogMessage.debug("2020-02-04", "debug event 4"), - LogMessage.warn("2020-02-04", "warn event 1"), - LogMessage.debug("2020-02-04", "debug event 5") - ); - - @Rule - public TemporaryFolder temp = new TemporaryFolder(); - - private PartitionSpec spec = PartitionSpec.builderFor(LOG_SCHEMA).identity("date").identity("level").build(); + private static final Schema LOG_SCHEMA = + new Schema( + Types.NestedField.optional(1, "id", Types.IntegerType.get()), + Types.NestedField.optional(2, "date", Types.StringType.get()), + Types.NestedField.optional(3, "level", Types.StringType.get()), + Types.NestedField.optional(4, "message", Types.StringType.get())); + + private static final List LOGS = + ImmutableList.of( + LogMessage.debug("2020-02-02", "debug event 1"), + LogMessage.info("2020-02-02", "info event 1"), + LogMessage.debug("2020-02-02", "debug event 2"), + LogMessage.info("2020-02-03", "info event 2"), + LogMessage.debug("2020-02-03", "debug event 3"), + LogMessage.info("2020-02-03", "info event 3"), + LogMessage.error("2020-02-03", "error event 1"), + 
LogMessage.debug("2020-02-04", "debug event 4"), + LogMessage.warn("2020-02-04", "warn event 1"), + LogMessage.debug("2020-02-04", "debug event 5")); + + @Rule public TemporaryFolder temp = new TemporaryFolder(); + + private PartitionSpec spec = + PartitionSpec.builderFor(LOG_SCHEMA).identity("date").identity("level").build(); private Table table = null; private Dataset logs = null; /** - * Use the Hive Based table to make Identity Partition Columns with no duplication of the data in the underlying - * parquet files. This makes sure that if the identity mapping fails, the test will also fail. + * Use the Hive Based table to make Identity Partition Columns with no duplication of the data in + * the underlying parquet files. This makes sure that if the identity mapping fails, the test will + * also fail. */ private void setupParquet() throws Exception { File location = temp.newFolder("logs"); @@ -109,15 +109,25 @@ private void setupParquet() throws Exception { Assert.assertTrue("Temp folder should exist", location.exists()); Map properties = ImmutableMap.of(TableProperties.DEFAULT_FILE_FORMAT, format); - this.logs = spark.createDataFrame(LOGS, LogMessage.class).select("id", "date", "level", "message"); + this.logs = + spark.createDataFrame(LOGS, LogMessage.class).select("id", "date", "level", "message"); spark.sql(String.format("DROP TABLE IF EXISTS %s", hiveTable)); - logs.orderBy("date", "level", "id").write().partitionBy("date", "level").format("parquet") - .option("path", hiveLocation.toString()).saveAsTable(hiveTable); - - this.table = TABLES.create(SparkSchemaUtil.schemaForTable(spark, hiveTable), - SparkSchemaUtil.specForTable(spark, hiveTable), properties, location.toString()); - - SparkTableUtil.importSparkTable(spark, new TableIdentifier(hiveTable), table, location.toString()); + logs.orderBy("date", "level", "id") + .write() + .partitionBy("date", "level") + .format("parquet") + .option("path", hiveLocation.toString()) + .saveAsTable(hiveTable); + + this.table = + TABLES.create( + SparkSchemaUtil.schemaForTable(spark, hiveTable), + SparkSchemaUtil.specForTable(spark, hiveTable), + properties, + location.toString()); + + SparkTableUtil.importSparkTable( + spark, new TableIdentifier(hiveTable), table, location.toString()); } @Before @@ -130,56 +140,70 @@ public void setupTable() throws Exception { Map properties = ImmutableMap.of(TableProperties.DEFAULT_FILE_FORMAT, format); this.table = TABLES.create(LOG_SCHEMA, spec, properties, location.toString()); - this.logs = spark.createDataFrame(LOGS, LogMessage.class).select("id", "date", "level", "message"); + this.logs = + spark.createDataFrame(LOGS, LogMessage.class).select("id", "date", "level", "message"); - logs.orderBy("date", "level", "id").write().format("iceberg").mode("append").save(location.toString()); + logs.orderBy("date", "level", "id") + .write() + .format("iceberg") + .mode("append") + .save(location.toString()); } } @Test public void testFullProjection() { List expected = logs.orderBy("id").collectAsList(); - List actual = spark.read().format("iceberg") - .option(SparkReadOptions.VECTORIZATION_ENABLED, String.valueOf(vectorized)) - .load(table.location()).orderBy("id") - .select("id", "date", "level", "message") - .collectAsList(); + List actual = + spark + .read() + .format("iceberg") + .option(SparkReadOptions.VECTORIZATION_ENABLED, String.valueOf(vectorized)) + .load(table.location()) + .orderBy("id") + .select("id", "date", "level", "message") + .collectAsList(); Assert.assertEquals("Rows should match", expected, 
actual); } @Test public void testProjections() { - String[][] cases = new String[][] { - // individual fields - new String[] { "date" }, - new String[] { "level" }, - new String[] { "message" }, - // field pairs - new String[] { "date", "message" }, - new String[] { "level", "message" }, - new String[] { "date", "level" }, - // out-of-order pairs - new String[] { "message", "date" }, - new String[] { "message", "level" }, - new String[] { "level", "date" }, - // full projection, different orderings - new String[] { "date", "level", "message" }, - new String[] { "level", "date", "message" }, - new String[] { "date", "message", "level" }, - new String[] { "level", "message", "date" }, - new String[] { "message", "date", "level" }, - new String[] { "message", "level", "date" } - }; + String[][] cases = + new String[][] { + // individual fields + new String[] {"date"}, + new String[] {"level"}, + new String[] {"message"}, + // field pairs + new String[] {"date", "message"}, + new String[] {"level", "message"}, + new String[] {"date", "level"}, + // out-of-order pairs + new String[] {"message", "date"}, + new String[] {"message", "level"}, + new String[] {"level", "date"}, + // full projection, different orderings + new String[] {"date", "level", "message"}, + new String[] {"level", "date", "message"}, + new String[] {"date", "message", "level"}, + new String[] {"level", "message", "date"}, + new String[] {"message", "date", "level"}, + new String[] {"message", "level", "date"} + }; for (String[] ordering : cases) { List expected = logs.select("id", ordering).orderBy("id").collectAsList(); - List actual = spark.read() - .format("iceberg") - .option(SparkReadOptions.VECTORIZATION_ENABLED, String.valueOf(vectorized)) - .load(table.location()) - .select("id", ordering).orderBy("id") - .collectAsList(); - Assert.assertEquals("Rows should match for ordering: " + Arrays.toString(ordering), expected, actual); + List actual = + spark + .read() + .format("iceberg") + .option(SparkReadOptions.VECTORIZATION_ENABLED, String.valueOf(vectorized)) + .load(table.location()) + .select("id", ordering) + .orderBy("id") + .collectAsList(); + Assert.assertEquals( + "Rows should match for ordering: " + Arrays.toString(ordering), expected, actual); } } } diff --git a/spark/v2.4/spark/src/test/java/org/apache/iceberg/spark/source/TestInternalRowWrapper.java b/spark/v2.4/spark/src/test/java/org/apache/iceberg/spark/source/TestInternalRowWrapper.java index 4ab01044046f..9e75145faff9 100644 --- a/spark/v2.4/spark/src/test/java/org/apache/iceberg/spark/source/TestInternalRowWrapper.java +++ b/spark/v2.4/spark/src/test/java/org/apache/iceberg/spark/source/TestInternalRowWrapper.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
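
The projection tests above reduce to one read pattern; a compact sketch of that call chain, with the table location, column names, and SparkSession assumed rather than taken from the diff.

import java.util.List;
import org.apache.iceberg.spark.SparkReadOptions;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SparkSession;

public class VectorizedProjectionReadSketch {
  // Reads an Iceberg table with vectorization toggled and projects a subset of columns.
  static List<Row> readProjected(SparkSession spark, String tableLocation, boolean vectorized) {
    return spark
        .read()
        .format("iceberg")
        .option(SparkReadOptions.VECTORIZATION_ENABLED, String.valueOf(vectorized))
        .load(tableLocation)
        .select("id", "date", "level", "message")
        .orderBy("id")
        .collectAsList();
  }
}
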
*/ - package org.apache.iceberg.spark.source; import java.util.Iterator; @@ -68,8 +67,10 @@ protected void generateAndValidate(Schema schema, AssertMethod assertMethod) { StructLike recordStructLike = recordWrapper.wrap(actual.next()); StructLike rowStructLike = rowWrapper.wrap(expected.next()); - assertMethod.assertEquals("Should have expected StructLike values", - actualWrapper.set(recordStructLike), expectedWrapper.set(rowStructLike)); + assertMethod.assertEquals( + "Should have expected StructLike values", + actualWrapper.set(recordStructLike), + expectedWrapper.set(rowStructLike)); } Assert.assertFalse("Shouldn't have more record", actual.hasNext()); diff --git a/spark/v2.4/spark/src/test/java/org/apache/iceberg/spark/source/TestNameMappingProjection.java b/spark/v2.4/spark/src/test/java/org/apache/iceberg/spark/source/TestNameMappingProjection.java index c425fec39546..74139b16ae99 100644 --- a/spark/v2.4/spark/src/test/java/org/apache/iceberg/spark/source/TestNameMappingProjection.java +++ b/spark/v2.4/spark/src/test/java/org/apache/iceberg/spark/source/TestNameMappingProjection.java @@ -16,9 +16,12 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.source; +import static org.apache.iceberg.TableProperties.DEFAULT_NAME_MAPPING; +import static org.apache.iceberg.types.Types.NestedField.optional; +import static org.apache.iceberg.types.Types.NestedField.required; + import java.io.File; import java.io.IOException; import java.nio.charset.StandardCharsets; @@ -57,29 +60,26 @@ import org.junit.Test; import org.junit.rules.TemporaryFolder; -import static org.apache.iceberg.TableProperties.DEFAULT_NAME_MAPPING; -import static org.apache.iceberg.types.Types.NestedField.optional; -import static org.apache.iceberg.types.Types.NestedField.required; - public class TestNameMappingProjection extends HiveTableBaseTest { private static final Configuration CONF = HiveTableBaseTest.hiveConf; private static SparkSession spark = null; - @Rule - public TemporaryFolder temp = new TemporaryFolder(); + @Rule public TemporaryFolder temp = new TemporaryFolder(); @BeforeClass public static void startSpark() { String metastoreURI = CONF.get(HiveConf.ConfVars.METASTOREURIS.varname); // Create a spark session. 
- TestNameMappingProjection.spark = SparkSession.builder().master("local[2]") - .enableHiveSupport() - .config("spark.hadoop.hive.metastore.uris", metastoreURI) - .config("hive.exec.dynamic.partition", "true") - .config("hive.exec.dynamic.partition.mode", "nonstrict") - .config("spark.sql.legacy.allowCreatingManagedTableUsingNonemptyLocation", "true") - .getOrCreate(); + TestNameMappingProjection.spark = + SparkSession.builder() + .master("local[2]") + .enableHiveSupport() + .config("spark.hadoop.hive.metastore.uris", metastoreURI) + .config("hive.exec.dynamic.partition", "true") + .config("hive.exec.dynamic.partition.mode", "nonstrict") + .config("spark.sql.legacy.allowCreatingManagedTableUsingNonemptyLocation", "true") + .getOrCreate(); } @AfterClass @@ -98,8 +98,9 @@ public void testOrcReaderWithNameMapping() throws IOException { orcSchema.addField("name", TypeDescription.createString()); Path dataFilePath = new Path(orcFile.toString(), "name-mapping-data.orc"); - try (org.apache.orc.Writer writer = OrcFile.createWriter(dataFilePath, - OrcFile.writerOptions(new Configuration()).setSchema(orcSchema))) { + try (org.apache.orc.Writer writer = + OrcFile.createWriter( + dataFilePath, OrcFile.writerOptions(new Configuration()).setSchema(orcSchema))) { VectorizedRowBatch batch = orcSchema.createRowBatch(); byte[] aliceVal = "Alice".getBytes(StandardCharsets.UTF_8); byte[] bobVal = "Bob".getBytes(StandardCharsets.UTF_8); @@ -121,12 +122,13 @@ public void testOrcReaderWithNameMapping() throws IOException { } File fileWithData = new File(dataFilePath.toString()); - DataFile orcDataFile = DataFiles.builder(PartitionSpec.unpartitioned()) - .withFormat("orc") - .withFileSizeInBytes(fileWithData.length()) - .withPath(fileWithData.getAbsolutePath()) - .withRecordCount(2) - .build(); + DataFile orcDataFile = + DataFiles.builder(PartitionSpec.unpartitioned()) + .withFormat("orc") + .withFileSizeInBytes(fileWithData.length()) + .withPath(fileWithData.getAbsolutePath()) + .withRecordCount(2) + .build(); assertNameMappingProjection(orcDataFile, "orc_table"); } @@ -134,12 +136,13 @@ public void testOrcReaderWithNameMapping() throws IOException { @Test public void testAvroReaderWithNameMapping() throws IOException { File avroFile = temp.newFile(); - org.apache.avro.Schema avroSchema = SchemaBuilder.record("TestRecord") - .namespace("org.apache.iceberg.spark.data") - .fields() - .requiredInt("id") - .requiredString("name") - .endRecord(); + org.apache.avro.Schema avroSchema = + SchemaBuilder.record("TestRecord") + .namespace("org.apache.iceberg.spark.data") + .fields() + .requiredInt("id") + .requiredString("name") + .endRecord(); org.apache.avro.Schema avroSchemaWithoutIds = RemoveIds.removeIds(avroSchema); @@ -159,42 +162,46 @@ public void testAvroReaderWithNameMapping() throws IOException { dataFileWriter.append(record2); dataFileWriter.close(); - DataFile avroDataFile = DataFiles.builder(PartitionSpec.unpartitioned()) - .withFormat("avro") - .withFileSizeInBytes(avroFile.length()) - .withPath(avroFile.getAbsolutePath()) - .withRecordCount(2) - .build(); + DataFile avroDataFile = + DataFiles.builder(PartitionSpec.unpartitioned()) + .withFormat("avro") + .withFileSizeInBytes(avroFile.length()) + .withPath(avroFile.getAbsolutePath()) + .withRecordCount(2) + .build(); assertNameMappingProjection(avroDataFile, "avro_table"); } private void assertNameMappingProjection(DataFile dataFile, String tableName) { - Schema filteredSchema = new Schema( - required(1, "name", Types.StringType.get()) - ); + Schema 
filteredSchema = new Schema(required(1, "name", Types.StringType.get())); NameMapping nameMapping = MappingUtil.create(filteredSchema); - Schema tableSchema = new Schema( - required(1, "name", Types.StringType.get()), - optional(2, "id", Types.IntegerType.get()) - ); + Schema tableSchema = + new Schema( + required(1, "name", Types.StringType.get()), + optional(2, "id", Types.IntegerType.get())); - Table table = catalog.createTable( - org.apache.iceberg.catalog.TableIdentifier.of(DB_NAME, tableName), - tableSchema, - PartitionSpec.unpartitioned()); + Table table = + catalog.createTable( + org.apache.iceberg.catalog.TableIdentifier.of(DB_NAME, tableName), + tableSchema, + PartitionSpec.unpartitioned()); - table.updateProperties() + table + .updateProperties() .set(DEFAULT_NAME_MAPPING, NameMappingParser.toJson(nameMapping)) .commit(); table.newFastAppend().appendFile(dataFile).commit(); - List actual = spark.read().format("iceberg") - .load(String.format("%s.%s", DB_NAME, tableName)) - .filter("name='Alice'") - .collectAsList(); + List actual = + spark + .read() + .format("iceberg") + .load(String.format("%s.%s", DB_NAME, tableName)) + .filter("name='Alice'") + .collectAsList(); Assert.assertEquals("Should project 1 record", 1, actual.size()); Assert.assertEquals("Should equal to 'Alice'", "Alice", actual.get(0).getString(0)); diff --git a/spark/v2.4/spark/src/test/java/org/apache/iceberg/spark/source/TestParquetScan.java b/spark/v2.4/spark/src/test/java/org/apache/iceberg/spark/source/TestParquetScan.java index adfe8c7d3649..f585ed360f95 100644 --- a/spark/v2.4/spark/src/test/java/org/apache/iceberg/spark/source/TestParquetScan.java +++ b/spark/v2.4/spark/src/test/java/org/apache/iceberg/spark/source/TestParquetScan.java @@ -16,9 +16,10 @@ * specific language governing permissions and limitations * under the License. 
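
The name-mapping flow this test exercises, reduced to its core steps as a hedged sketch; the Table and DataFile are assumed to exist already, and the class name is illustrative.

import static org.apache.iceberg.TableProperties.DEFAULT_NAME_MAPPING;

import org.apache.iceberg.DataFile;
import org.apache.iceberg.Table;
import org.apache.iceberg.mapping.MappingUtil;
import org.apache.iceberg.mapping.NameMapping;
import org.apache.iceberg.mapping.NameMappingParser;

public class NameMappingSketch {
  // Derives a name-to-ID mapping from the table schema, stores it as a table property,
  // and appends a data file whose footer has column names but no Iceberg field IDs.
  static void attachNameMappingAndAppend(Table table, DataFile dataFile) {
    NameMapping nameMapping = MappingUtil.create(table.schema());

    table
        .updateProperties()
        .set(DEFAULT_NAME_MAPPING, NameMappingParser.toJson(nameMapping))
        .commit();

    table.newFastAppend().appendFile(dataFile).commit();
  }
}
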
*/ - package org.apache.iceberg.spark.source; +import static org.apache.iceberg.Files.localOutput; + import java.io.File; import java.io.IOException; import java.util.List; @@ -52,8 +53,6 @@ import org.junit.runner.RunWith; import org.junit.runners.Parameterized; -import static org.apache.iceberg.Files.localOutput; - @RunWith(Parameterized.class) public class TestParquetScan extends AvroDataTest { private static final Configuration CONF = new Configuration(); @@ -72,12 +71,11 @@ public static void stopSpark() { currentSpark.stop(); } - @Rule - public TemporaryFolder temp = new TemporaryFolder(); + @Rule public TemporaryFolder temp = new TemporaryFolder(); @Parameterized.Parameters(name = "vectorized = {0}") public static Object[] parameters() { - return new Object[] { false, true }; + return new Object[] {false, true}; } private final boolean vectorized; @@ -88,18 +86,20 @@ public TestParquetScan(boolean vectorized) { @Override protected void writeAndValidate(Schema schema) throws IOException { - Assume.assumeTrue("Cannot handle non-string map keys in parquet-avro", - null == TypeUtil.find( - schema, - type -> type.isMapType() && type.asMapType().keyType() != Types.StringType.get())); + Assume.assumeTrue( + "Cannot handle non-string map keys in parquet-avro", + null + == TypeUtil.find( + schema, + type -> type.isMapType() && type.asMapType().keyType() != Types.StringType.get())); File parent = temp.newFolder("parquet"); File location = new File(parent, "test"); File dataFolder = new File(location, "data"); dataFolder.mkdirs(); - File parquetFile = new File(dataFolder, - FileFormat.PARQUET.addExtension(UUID.randomUUID().toString())); + File parquetFile = + new File(dataFolder, FileFormat.PARQUET.addExtension(UUID.randomUUID().toString())); HadoopTables tables = new HadoopTables(CONF); Table table = tables.create(schema, PartitionSpec.unpartitioned(), location.toString()); @@ -110,24 +110,25 @@ protected void writeAndValidate(Schema schema) throws IOException { List expected = RandomData.generateList(tableSchema, 100, 1L); - try (FileAppender writer = Parquet.write(localOutput(parquetFile)) - .schema(tableSchema) - .build()) { + try (FileAppender writer = + Parquet.write(localOutput(parquetFile)).schema(tableSchema).build()) { writer.addAll(expected); } - DataFile file = DataFiles.builder(PartitionSpec.unpartitioned()) - .withFileSizeInBytes(parquetFile.length()) - .withPath(parquetFile.toString()) - .withRecordCount(100) - .build(); + DataFile file = + DataFiles.builder(PartitionSpec.unpartitioned()) + .withFileSizeInBytes(parquetFile.length()) + .withPath(parquetFile.toString()) + .withRecordCount(100) + .build(); table.newAppend().appendFile(file).commit(); - table.updateProperties().set(TableProperties.PARQUET_VECTORIZATION_ENABLED, String.valueOf(vectorized)).commit(); + table + .updateProperties() + .set(TableProperties.PARQUET_VECTORIZATION_ENABLED, String.valueOf(vectorized)) + .commit(); - Dataset df = spark.read() - .format("iceberg") - .load(location.toString()); + Dataset df = spark.read().format("iceberg").load(location.toString()); List rows = df.collectAsList(); Assert.assertEquals("Should contain 100 rows", 100, rows.size()); diff --git a/spark/v2.4/spark/src/test/java/org/apache/iceberg/spark/source/TestPartitionPruning.java b/spark/v2.4/spark/src/test/java/org/apache/iceberg/spark/source/TestPartitionPruning.java index 24f7b69e1dc5..ffe21432f00c 100644 --- a/spark/v2.4/spark/src/test/java/org/apache/iceberg/spark/source/TestPartitionPruning.java +++ 
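
The Parquet scan test registers a file written outside of Spark; that append step in isolation, sketched with an assumed Table, file, and record count.

import java.io.File;
import org.apache.iceberg.DataFile;
import org.apache.iceberg.DataFiles;
import org.apache.iceberg.PartitionSpec;
import org.apache.iceberg.Table;

public class AppendExistingParquetSketch {
  // Wraps an already-written Parquet file in a DataFile and commits it to the table.
  static void appendParquetFile(Table table, File parquetFile, long recordCount) {
    DataFile file =
        DataFiles.builder(PartitionSpec.unpartitioned())
            .withFormat("parquet")
            .withPath(parquetFile.toString())
            .withFileSizeInBytes(parquetFile.length())
            .withRecordCount(recordCount)
            .build();

    table.newAppend().appendFile(file).commit();
  }
}
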
b/spark/v2.4/spark/src/test/java/org/apache/iceberg/spark/source/TestPartitionPruning.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.source; import java.io.File; @@ -78,11 +77,11 @@ public class TestPartitionPruning { @Parameterized.Parameters(name = "format = {0}, vectorized = {1}") public static Object[][] parameters() { return new Object[][] { - { "parquet", false }, - { "parquet", true }, - { "avro", false }, - { "orc", false }, - { "orc", true } + {"parquet", false}, + {"parquet", true}, + {"avro", false}, + {"orc", false}, + {"orc", true} }; } @@ -97,9 +96,12 @@ public TestPartitionPruning(String format, boolean vectorized) { private static SparkSession spark = null; private static JavaSparkContext sparkContext = null; - private static Transform bucketTransform = Transforms.bucket(Types.IntegerType.get(), 3); - private static Transform truncateTransform = Transforms.truncate(Types.StringType.get(), 5); - private static Transform hourTransform = Transforms.hour(Types.TimestampType.withoutZone()); + private static Transform bucketTransform = + Transforms.bucket(Types.IntegerType.get(), 3); + private static Transform truncateTransform = + Transforms.truncate(Types.StringType.get(), 5); + private static Transform hourTransform = + Transforms.hour(Types.TimestampType.withoutZone()); @BeforeClass public static void startSpark() { @@ -110,12 +112,21 @@ public static void startSpark() { CONF.set(optionKey, CountOpenLocalFileSystem.class.getName()); spark.conf().set(optionKey, CountOpenLocalFileSystem.class.getName()); spark.conf().set("spark.sql.session.timeZone", "UTC"); - spark.udf().register("bucket3", (Integer num) -> bucketTransform.apply(num), DataTypes.IntegerType); - spark.udf().register("truncate5", (String str) -> truncateTransform.apply(str), DataTypes.StringType); + spark + .udf() + .register("bucket3", (Integer num) -> bucketTransform.apply(num), DataTypes.IntegerType); + spark + .udf() + .register("truncate5", (String str) -> truncateTransform.apply(str), DataTypes.StringType); // NOTE: date transforms take the type long, not Timestamp - spark.udf().register("hour", (Timestamp ts) -> hourTransform.apply( - org.apache.spark.sql.catalyst.util.DateTimeUtils.fromJavaTimestamp(ts)), - DataTypes.IntegerType); + spark + .udf() + .register( + "hour", + (Timestamp ts) -> + hourTransform.apply( + org.apache.spark.sql.catalyst.util.DateTimeUtils.fromJavaTimestamp(ts)), + DataTypes.IntegerType); } @AfterClass @@ -125,70 +136,70 @@ public static void stopSpark() { currentSpark.stop(); } - private static final Schema LOG_SCHEMA = new Schema( - Types.NestedField.optional(1, "id", Types.IntegerType.get()), - Types.NestedField.optional(2, "date", Types.StringType.get()), - Types.NestedField.optional(3, "level", Types.StringType.get()), - Types.NestedField.optional(4, "message", Types.StringType.get()), - Types.NestedField.optional(5, "timestamp", Types.TimestampType.withZone()) - ); - - private static final List LOGS = ImmutableList.of( - LogMessage.debug("2020-02-02", "debug event 1", getInstant("2020-02-02T00:00:00")), - LogMessage.info("2020-02-02", "info event 1", getInstant("2020-02-02T01:00:00")), - LogMessage.debug("2020-02-02", "debug event 2", getInstant("2020-02-02T02:00:00")), - LogMessage.info("2020-02-03", "info event 2", getInstant("2020-02-03T00:00:00")), - LogMessage.debug("2020-02-03", "debug event 3", getInstant("2020-02-03T01:00:00")), - LogMessage.info("2020-02-03", "info event 3", 
getInstant("2020-02-03T02:00:00")), - LogMessage.error("2020-02-03", "error event 1", getInstant("2020-02-03T03:00:00")), - LogMessage.debug("2020-02-04", "debug event 4", getInstant("2020-02-04T01:00:00")), - LogMessage.warn("2020-02-04", "warn event 1", getInstant("2020-02-04T02:00:00")), - LogMessage.debug("2020-02-04", "debug event 5", getInstant("2020-02-04T03:00:00")) - ); + private static final Schema LOG_SCHEMA = + new Schema( + Types.NestedField.optional(1, "id", Types.IntegerType.get()), + Types.NestedField.optional(2, "date", Types.StringType.get()), + Types.NestedField.optional(3, "level", Types.StringType.get()), + Types.NestedField.optional(4, "message", Types.StringType.get()), + Types.NestedField.optional(5, "timestamp", Types.TimestampType.withZone())); + + private static final List LOGS = + ImmutableList.of( + LogMessage.debug("2020-02-02", "debug event 1", getInstant("2020-02-02T00:00:00")), + LogMessage.info("2020-02-02", "info event 1", getInstant("2020-02-02T01:00:00")), + LogMessage.debug("2020-02-02", "debug event 2", getInstant("2020-02-02T02:00:00")), + LogMessage.info("2020-02-03", "info event 2", getInstant("2020-02-03T00:00:00")), + LogMessage.debug("2020-02-03", "debug event 3", getInstant("2020-02-03T01:00:00")), + LogMessage.info("2020-02-03", "info event 3", getInstant("2020-02-03T02:00:00")), + LogMessage.error("2020-02-03", "error event 1", getInstant("2020-02-03T03:00:00")), + LogMessage.debug("2020-02-04", "debug event 4", getInstant("2020-02-04T01:00:00")), + LogMessage.warn("2020-02-04", "warn event 1", getInstant("2020-02-04T02:00:00")), + LogMessage.debug("2020-02-04", "debug event 5", getInstant("2020-02-04T03:00:00"))); private static Instant getInstant(String timestampWithoutZone) { - Long epochMicros = (Long) Literal.of(timestampWithoutZone).to(Types.TimestampType.withoutZone()).value(); + Long epochMicros = + (Long) Literal.of(timestampWithoutZone).to(Types.TimestampType.withoutZone()).value(); return Instant.ofEpochMilli(TimeUnit.MICROSECONDS.toMillis(epochMicros)); } - @Rule - public TemporaryFolder temp = new TemporaryFolder(); + @Rule public TemporaryFolder temp = new TemporaryFolder(); - private PartitionSpec spec = PartitionSpec.builderFor(LOG_SCHEMA) - .identity("date") - .identity("level") - .bucket("id", 3) - .truncate("message", 5) - .hour("timestamp") - .build(); + private PartitionSpec spec = + PartitionSpec.builderFor(LOG_SCHEMA) + .identity("date") + .identity("level") + .bucket("id", 3) + .truncate("message", 5) + .hour("timestamp") + .build(); @Test public void testPartitionPruningIdentityString() { String filterCond = "date >= '2020-02-03' AND level = 'DEBUG'"; - Predicate partCondition = (Row r) -> { - String date = r.getString(0); - String level = r.getString(1); - return date.compareTo("2020-02-03") >= 0 && level.equals("DEBUG"); - }; + Predicate partCondition = + (Row r) -> { + String date = r.getString(0); + String level = r.getString(1); + return date.compareTo("2020-02-03") >= 0 && level.equals("DEBUG"); + }; runTest(filterCond, partCondition); } @Test public void testPartitionPruningBucketingInteger() { - final int[] ids = new int[]{ - LOGS.get(3).getId(), - LOGS.get(7).getId() - }; - String condForIds = Arrays.stream(ids).mapToObj(String::valueOf) - .collect(Collectors.joining(",", "(", ")")); + final int[] ids = new int[] {LOGS.get(3).getId(), LOGS.get(7).getId()}; + String condForIds = + Arrays.stream(ids).mapToObj(String::valueOf).collect(Collectors.joining(",", "(", ")")); String filterCond = "id in " + condForIds; 
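
A condensed sketch of the two-sided setup the pruning tests rely on: the partition spec with one field per transform, and Spark UDFs mirroring the bucket and truncate transforms. Field names follow the test schema; the class name is illustrative.

import org.apache.iceberg.PartitionSpec;
import org.apache.iceberg.Schema;
import org.apache.iceberg.transforms.Transform;
import org.apache.iceberg.transforms.Transforms;
import org.apache.iceberg.types.Types;
import org.apache.spark.sql.SparkSession;
import org.apache.spark.sql.types.DataTypes;

public class PruningSetupSketch {
  // Table side: one partition field per transform kind.
  static PartitionSpec buildSpec(Schema schema) {
    return PartitionSpec.builderFor(schema)
        .identity("date")
        .identity("level")
        .bucket("id", 3)
        .truncate("message", 5)
        .hour("timestamp")
        .build();
  }

  // Query side: UDFs that reproduce the bucket and truncate transforms for test data generation.
  static void registerUdfs(SparkSession spark) {
    Transform<Integer, Integer> bucket3 = Transforms.bucket(Types.IntegerType.get(), 3);
    Transform<String, String> truncate5 = Transforms.truncate(Types.StringType.get(), 5);

    spark.udf().register("bucket3", (Integer num) -> bucket3.apply(num), DataTypes.IntegerType);
    spark.udf().register("truncate5", (String str) -> truncate5.apply(str), DataTypes.StringType);
  }
}
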
- Predicate partCondition = (Row r) -> { - int bucketId = r.getInt(2); - Set buckets = Arrays.stream(ids).map(bucketTransform::apply) - .boxed().collect(Collectors.toSet()); - return buckets.contains(bucketId); - }; + Predicate partCondition = + (Row r) -> { + int bucketId = r.getInt(2); + Set buckets = + Arrays.stream(ids).map(bucketTransform::apply).boxed().collect(Collectors.toSet()); + return buckets.contains(bucketId); + }; runTest(filterCond, partCondition); } @@ -196,10 +207,11 @@ public void testPartitionPruningBucketingInteger() { @Test public void testPartitionPruningTruncatedString() { String filterCond = "message like 'info event%'"; - Predicate partCondition = (Row r) -> { - String truncatedMessage = r.getString(3); - return truncatedMessage.equals("info "); - }; + Predicate partCondition = + (Row r) -> { + String truncatedMessage = r.getString(3); + return truncatedMessage.equals("info "); + }; runTest(filterCond, partCondition); } @@ -207,10 +219,11 @@ public void testPartitionPruningTruncatedString() { @Test public void testPartitionPruningTruncatedStringComparingValueShorterThanPartitionValue() { String filterCond = "message like 'inf%'"; - Predicate partCondition = (Row r) -> { - String truncatedMessage = r.getString(3); - return truncatedMessage.startsWith("inf"); - }; + Predicate partCondition = + (Row r) -> { + String truncatedMessage = r.getString(3); + return truncatedMessage.startsWith("inf"); + }; runTest(filterCond, partCondition); } @@ -219,17 +232,20 @@ public void testPartitionPruningTruncatedStringComparingValueShorterThanPartitio public void testPartitionPruningHourlyPartition() { String filterCond; if (spark.version().startsWith("2")) { - // Looks like from Spark 2 we need to compare timestamp with timestamp to push down the filter. + // Looks like from Spark 2 we need to compare timestamp with timestamp to push down the + // filter. 
filterCond = "timestamp >= to_timestamp('2020-02-03T01:00:00')"; } else { filterCond = "timestamp >= '2020-02-03T01:00:00'"; } - Predicate partCondition = (Row r) -> { - int hourValue = r.getInt(4); - Instant instant = getInstant("2020-02-03T01:00:00"); - Integer hourValueToFilter = hourTransform.apply(TimeUnit.MILLISECONDS.toMicros(instant.toEpochMilli())); - return hourValue >= hourValueToFilter; - }; + Predicate partCondition = + (Row r) -> { + int hourValue = r.getInt(4); + Instant instant = getInstant("2020-02-03T01:00:00"); + Integer hourValueToFilter = + hourTransform.apply(TimeUnit.MILLISECONDS.toMicros(instant.toEpochMilli())); + return hourValue >= hourValueToFilter; + }; runTest(filterCond, partCondition); } @@ -242,24 +258,26 @@ private void runTest(String filterCond, Predicate partCondition) { Dataset logs = createTestDataset(); saveTestDatasetToTable(logs, table); - List expected = logs - .select("id", "date", "level", "message", "timestamp") - .filter(filterCond) - .orderBy("id") - .collectAsList(); + List expected = + logs.select("id", "date", "level", "message", "timestamp") + .filter(filterCond) + .orderBy("id") + .collectAsList(); Assert.assertFalse("Expected rows should be not empty", expected.isEmpty()); // remove records which may be recorded during storing to table CountOpenLocalFileSystem.resetRecordsInPathPrefix(originTableLocation.getAbsolutePath()); - List actual = spark.read() - .format("iceberg") - .option(SparkReadOptions.VECTORIZATION_ENABLED, String.valueOf(vectorized)) - .load(table.location()) - .select("id", "date", "level", "message", "timestamp") - .filter(filterCond) - .orderBy("id") - .collectAsList(); + List actual = + spark + .read() + .format("iceberg") + .option(SparkReadOptions.VECTORIZATION_ENABLED, String.valueOf(vectorized)) + .load(table.location()) + .select("id", "date", "level", "message", "timestamp") + .filter(filterCond) + .orderBy("id") + .collectAsList(); Assert.assertFalse("Actual rows should not be empty", actual.isEmpty()); Assert.assertEquals("Rows should match", expected, actual); @@ -282,40 +300,59 @@ private Table createTable(File originTableLocation) { } private Dataset createTestDataset() { - List rows = LOGS.stream().map(logMessage -> { - Object[] underlying = new Object[] { - logMessage.getId(), - UTF8String.fromString(logMessage.getDate()), - UTF8String.fromString(logMessage.getLevel()), - UTF8String.fromString(logMessage.getMessage()), - // discard the nanoseconds part to simplify - TimeUnit.MILLISECONDS.toMicros(logMessage.getTimestamp().toEpochMilli()) - }; - return new GenericInternalRow(underlying); - }).collect(Collectors.toList()); + List rows = + LOGS.stream() + .map( + logMessage -> { + Object[] underlying = + new Object[] { + logMessage.getId(), + UTF8String.fromString(logMessage.getDate()), + UTF8String.fromString(logMessage.getLevel()), + UTF8String.fromString(logMessage.getMessage()), + // discard the nanoseconds part to simplify + TimeUnit.MILLISECONDS.toMicros(logMessage.getTimestamp().toEpochMilli()) + }; + return new GenericInternalRow(underlying); + }) + .collect(Collectors.toList()); JavaRDD rdd = sparkContext.parallelize(rows); - Dataset df = spark.internalCreateDataFrame(JavaRDD.toRDD(rdd), SparkSchemaUtil.convert(LOG_SCHEMA), false); - - return df - .selectExpr("id", "date", "level", "message", "timestamp") - .selectExpr("id", "date", "level", "message", "timestamp", "bucket3(id) AS bucket_id", - "truncate5(message) AS truncated_message", "hour(timestamp) AS ts_hour"); + Dataset df = + 
spark.internalCreateDataFrame( + JavaRDD.toRDD(rdd), SparkSchemaUtil.convert(LOG_SCHEMA), false); + + return df.selectExpr("id", "date", "level", "message", "timestamp") + .selectExpr( + "id", + "date", + "level", + "message", + "timestamp", + "bucket3(id) AS bucket_id", + "truncate5(message) AS truncated_message", + "hour(timestamp) AS ts_hour"); } private void saveTestDatasetToTable(Dataset logs, Table table) { logs.orderBy("date", "level", "bucket_id", "truncated_message", "ts_hour") .select("id", "date", "level", "message", "timestamp") - .write().format("iceberg").mode("append").save(table.location()); + .write() + .format("iceberg") + .mode("append") + .save(table.location()); } - private void assertAccessOnDataFiles(File originTableLocation, Table table, Predicate partCondition) { + private void assertAccessOnDataFiles( + File originTableLocation, Table table, Predicate partCondition) { // only use files in current table location to avoid side-effects on concurrent test runs - Set readFilesInQuery = CountOpenLocalFileSystem.pathToNumOpenCalled.keySet() - .stream().filter(path -> path.startsWith(originTableLocation.getAbsolutePath())) - .collect(Collectors.toSet()); + Set readFilesInQuery = + CountOpenLocalFileSystem.pathToNumOpenCalled.keySet().stream() + .filter(path -> path.startsWith(originTableLocation.getAbsolutePath())) + .collect(Collectors.toSet()); - List files = spark.read().format("iceberg").load(table.location() + "#files").collectAsList(); + List files = + spark.read().format("iceberg").load(table.location() + "#files").collectAsList(); Set filesToRead = extractFilePathsMatchingConditionOnPartition(files, partCondition); Set filesToNotRead = extractFilePathsNotIn(files, filesToRead); @@ -325,37 +362,51 @@ private void assertAccessOnDataFiles(File originTableLocation, Table table, Pred Assert.assertFalse("The query should prune some data files.", filesToNotRead.isEmpty()); - // We don't check "all" data files bound to the condition are being read, as data files can be pruned on + // We don't check "all" data files bound to the condition are being read, as data files can be + // pruned on // other conditions like lower/upper bound of columns. - Assert.assertFalse("Some of data files in partition range should be read. " + - "Read files in query: " + readFilesInQuery + " / data files in partition range: " + filesToRead, + Assert.assertFalse( + "Some of data files in partition range should be read. " + + "Read files in query: " + + readFilesInQuery + + " / data files in partition range: " + + filesToRead, Sets.intersection(filesToRead, readFilesInQuery).isEmpty()); // Data files which aren't bound to the condition shouldn't be read. - Assert.assertTrue("Data files outside of partition range should not be read. " + - "Read files in query: " + readFilesInQuery + " / data files outside of partition range: " + filesToNotRead, + Assert.assertTrue( + "Data files outside of partition range should not be read. 
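
The file-access assertions resolve expected paths from the "#files" metadata table; that read on its own, as a sketch. The column index follows the test's own note that file_path sits at index 1.

import java.util.List;
import java.util.Set;
import java.util.stream.Collectors;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SparkSession;

public class FilesMetadataTableSketch {
  // Lists data file paths referenced by the table, via the "#files" metadata table.
  static Set<String> dataFilePaths(SparkSession spark, String tableLocation) {
    List<Row> files =
        spark.read().format("iceberg").load(tableLocation + "#files").collectAsList();

    // file_path column index taken from the test's comment (idx 1: file_path).
    return files.stream().map(r -> r.getString(1)).collect(Collectors.toSet());
  }
}
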
" + + "Read files in query: " + + readFilesInQuery + + " / data files outside of partition range: " + + filesToNotRead, Sets.intersection(filesToNotRead, readFilesInQuery).isEmpty()); } - private Set extractFilePathsMatchingConditionOnPartition(List files, Predicate condition) { + private Set extractFilePathsMatchingConditionOnPartition( + List files, Predicate condition) { // idx 1: file_path, idx 3: partition return files.stream() - .filter(r -> { - Row partition = r.getStruct(4); - return condition.test(partition); - }).map(r -> CountOpenLocalFileSystem.stripScheme(r.getString(1))) + .filter( + r -> { + Row partition = r.getStruct(4); + return condition.test(partition); + }) + .map(r -> CountOpenLocalFileSystem.stripScheme(r.getString(1))) .collect(Collectors.toSet()); } private Set extractFilePathsNotIn(List files, Set filePaths) { - Set allFilePaths = files.stream().map(r -> CountOpenLocalFileSystem.stripScheme(r.getString(1))) - .collect(Collectors.toSet()); + Set allFilePaths = + files.stream() + .map(r -> CountOpenLocalFileSystem.stripScheme(r.getString(1))) + .collect(Collectors.toSet()); return Sets.newHashSet(Sets.symmetricDifference(allFilePaths, filePaths)); } public static class CountOpenLocalFileSystem extends RawLocalFileSystem { - public static String scheme = String.format("TestIdentityPartitionData%dfs", - new Random().nextInt()); + public static String scheme = + String.format("TestIdentityPartitionData%dfs", new Random().nextInt()); public static Map pathToNumOpenCalled = Maps.newConcurrentMap(); public static String convertPath(String absPath) { @@ -401,13 +452,15 @@ public String getScheme() { @Override public FSDataInputStream open(Path f, int bufferSize) throws IOException { String path = f.toUri().getPath(); - pathToNumOpenCalled.compute(path, (ignored, v) -> { - if (v == null) { - return 1L; - } else { - return v + 1; - } - }); + pathToNumOpenCalled.compute( + path, + (ignored, v) -> { + if (v == null) { + return 1L; + } else { + return v + 1; + } + }); return super.open(f, bufferSize); } } diff --git a/spark/v2.4/spark/src/test/java/org/apache/iceberg/spark/source/TestPartitionValues.java b/spark/v2.4/spark/src/test/java/org/apache/iceberg/spark/source/TestPartitionValues.java index f63181766852..fedac9aee3ac 100644 --- a/spark/v2.4/spark/src/test/java/org/apache/iceberg/spark/source/TestPartitionValues.java +++ b/spark/v2.4/spark/src/test/java/org/apache/iceberg/spark/source/TestPartitionValues.java @@ -16,9 +16,11 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.spark.source; +import static org.apache.iceberg.types.Types.NestedField.optional; +import static org.apache.iceberg.types.Types.NestedField.required; + import java.io.File; import java.util.List; import org.apache.avro.generic.GenericData; @@ -56,46 +58,43 @@ import org.junit.runner.RunWith; import org.junit.runners.Parameterized; -import static org.apache.iceberg.types.Types.NestedField.optional; -import static org.apache.iceberg.types.Types.NestedField.required; - @RunWith(Parameterized.class) public class TestPartitionValues { @Parameterized.Parameters(name = "format = {0}, vectorized = {1}") public static Object[][] parameters() { return new Object[][] { - { "parquet", false }, - { "parquet", true }, - { "avro", false }, - { "orc", false }, - { "orc", true } + {"parquet", false}, + {"parquet", true}, + {"avro", false}, + {"orc", false}, + {"orc", true} }; } - private static final Schema SUPPORTED_PRIMITIVES = new Schema( - required(100, "id", Types.LongType.get()), - required(101, "data", Types.StringType.get()), - required(102, "b", Types.BooleanType.get()), - required(103, "i", Types.IntegerType.get()), - required(104, "l", Types.LongType.get()), - required(105, "f", Types.FloatType.get()), - required(106, "d", Types.DoubleType.get()), - required(107, "date", Types.DateType.get()), - required(108, "ts", Types.TimestampType.withZone()), - required(110, "s", Types.StringType.get()), - required(113, "bytes", Types.BinaryType.get()), - required(114, "dec_9_0", Types.DecimalType.of(9, 0)), - required(115, "dec_11_2", Types.DecimalType.of(11, 2)), - required(116, "dec_38_10", Types.DecimalType.of(38, 10)) // spark's maximum precision - ); - - private static final Schema SIMPLE_SCHEMA = new Schema( - optional(1, "id", Types.IntegerType.get()), - optional(2, "data", Types.StringType.get())); - - private static final PartitionSpec SPEC = PartitionSpec.builderFor(SIMPLE_SCHEMA) - .identity("data") - .build(); + private static final Schema SUPPORTED_PRIMITIVES = + new Schema( + required(100, "id", Types.LongType.get()), + required(101, "data", Types.StringType.get()), + required(102, "b", Types.BooleanType.get()), + required(103, "i", Types.IntegerType.get()), + required(104, "l", Types.LongType.get()), + required(105, "f", Types.FloatType.get()), + required(106, "d", Types.DoubleType.get()), + required(107, "date", Types.DateType.get()), + required(108, "ts", Types.TimestampType.withZone()), + required(110, "s", Types.StringType.get()), + required(113, "bytes", Types.BinaryType.get()), + required(114, "dec_9_0", Types.DecimalType.of(9, 0)), + required(115, "dec_11_2", Types.DecimalType.of(11, 2)), + required(116, "dec_38_10", Types.DecimalType.of(38, 10)) // spark's maximum precision + ); + + private static final Schema SIMPLE_SCHEMA = + new Schema( + optional(1, "id", Types.IntegerType.get()), optional(2, "data", Types.StringType.get())); + + private static final PartitionSpec SPEC = + PartitionSpec.builderFor(SIMPLE_SCHEMA).identity("data").build(); private static SparkSession spark = null; @@ -111,8 +110,7 @@ public static void stopSpark() { currentSpark.stop(); } - @Rule - public TemporaryFolder temp = new TemporaryFolder(); + @Rule public TemporaryFolder temp = new TemporaryFolder(); private final String format; private final boolean vectorized; @@ -134,29 +132,30 @@ public void testNullPartitionValue() throws Exception { Table table = tables.create(SIMPLE_SCHEMA, SPEC, location.toString()); table.updateProperties().set(TableProperties.DEFAULT_FILE_FORMAT, 
format).commit(); - List expected = Lists.newArrayList( - new SimpleRecord(1, "a"), - new SimpleRecord(2, "b"), - new SimpleRecord(3, "c"), - new SimpleRecord(4, null) - ); + List expected = + Lists.newArrayList( + new SimpleRecord(1, "a"), + new SimpleRecord(2, "b"), + new SimpleRecord(3, "c"), + new SimpleRecord(4, null)); Dataset df = spark.createDataFrame(expected, SimpleRecord.class); - df.select("id", "data").write() + df.select("id", "data") + .write() .format("iceberg") .mode(SaveMode.Append) .save(location.toString()); - Dataset result = spark.read() - .format("iceberg") - .option(SparkReadOptions.VECTORIZATION_ENABLED, String.valueOf(vectorized)) - .load(location.toString()); + Dataset result = + spark + .read() + .format("iceberg") + .option(SparkReadOptions.VECTORIZATION_ENABLED, String.valueOf(vectorized)) + .load(location.toString()); - List actual = result - .orderBy("id") - .as(Encoders.bean(SimpleRecord.class)) - .collectAsList(); + List actual = + result.orderBy("id").as(Encoders.bean(SimpleRecord.class)).collectAsList(); Assert.assertEquals("Number of rows should match", expected.size(), actual.size()); Assert.assertEquals("Result rows should match", expected, actual); @@ -174,29 +173,28 @@ public void testReorderedColumns() throws Exception { Table table = tables.create(SIMPLE_SCHEMA, SPEC, location.toString()); table.updateProperties().set(TableProperties.DEFAULT_FILE_FORMAT, format).commit(); - List expected = Lists.newArrayList( - new SimpleRecord(1, "a"), - new SimpleRecord(2, "b"), - new SimpleRecord(3, "c") - ); + List expected = + Lists.newArrayList( + new SimpleRecord(1, "a"), new SimpleRecord(2, "b"), new SimpleRecord(3, "c")); Dataset df = spark.createDataFrame(expected, SimpleRecord.class); - df.select("data", "id").write() - .format("iceberg") - .mode(SaveMode.Append) - .option(SparkWriteOptions.CHECK_ORDERING, "false") - .save(location.toString()); + df.select("data", "id") + .write() + .format("iceberg") + .mode(SaveMode.Append) + .option(SparkWriteOptions.CHECK_ORDERING, "false") + .save(location.toString()); - Dataset result = spark.read() + Dataset result = + spark + .read() .format("iceberg") .option(SparkReadOptions.VECTORIZATION_ENABLED, String.valueOf(vectorized)) .load(location.toString()); - List actual = result - .orderBy("id") - .as(Encoders.bean(SimpleRecord.class)) - .collectAsList(); + List actual = + result.orderBy("id").as(Encoders.bean(SimpleRecord.class)).collectAsList(); Assert.assertEquals("Number of rows should match", expected.size(), actual.size()); Assert.assertEquals("Result rows should match", expected, actual); @@ -214,30 +212,29 @@ public void testReorderedColumnsNoNullability() throws Exception { Table table = tables.create(SIMPLE_SCHEMA, SPEC, location.toString()); table.updateProperties().set(TableProperties.DEFAULT_FILE_FORMAT, format).commit(); - List expected = Lists.newArrayList( - new SimpleRecord(1, "a"), - new SimpleRecord(2, "b"), - new SimpleRecord(3, "c") - ); + List expected = + Lists.newArrayList( + new SimpleRecord(1, "a"), new SimpleRecord(2, "b"), new SimpleRecord(3, "c")); Dataset df = spark.createDataFrame(expected, SimpleRecord.class); - df.select("data", "id").write() - .format("iceberg") - .mode(SaveMode.Append) - .option(SparkWriteOptions.CHECK_ORDERING, "false") - .option(SparkWriteOptions.CHECK_NULLABILITY, "false") - .save(location.toString()); + df.select("data", "id") + .write() + .format("iceberg") + .mode(SaveMode.Append) + .option(SparkWriteOptions.CHECK_ORDERING, "false") + 
.option(SparkWriteOptions.CHECK_NULLABILITY, "false") + .save(location.toString()); - Dataset result = spark.read() + Dataset result = + spark + .read() .format("iceberg") .option(SparkReadOptions.VECTORIZATION_ENABLED, String.valueOf(vectorized)) .load(location.toString()); - List actual = result - .orderBy("id") - .as(Encoders.bean(SimpleRecord.class)) - .collectAsList(); + List actual = + result.orderBy("id").as(Encoders.bean(SimpleRecord.class)).collectAsList(); Assert.assertEquals("Number of rows should match", expected.size(), actual.size()); Assert.assertEquals("Result rows should match", expected, actual); @@ -245,9 +242,10 @@ public void testReorderedColumnsNoNullability() throws Exception { @Test public void testPartitionValueTypes() throws Exception { - String[] columnNames = new String[] { - "b", "i", "l", "f", "d", "date", "ts", "s", "bytes", "dec_9_0", "dec_11_2", "dec_38_10" - }; + String[] columnNames = + new String[] { + "b", "i", "l", "f", "d", "date", "ts", "s", "bytes", "dec_9_0", "dec_11_2", "dec_38_10" + }; HadoopTables tables = new HadoopTables(spark.sessionState().newHadoopConf()); @@ -259,23 +257,27 @@ public void testPartitionValueTypes() throws Exception { List expected = RandomData.generateList(source.schema(), 2, 128735L); File avroData = temp.newFile("data.avro"); Assert.assertTrue(avroData.delete()); - try (FileAppender appender = Avro.write(Files.localOutput(avroData)) - .schema(source.schema()) - .build()) { + try (FileAppender appender = + Avro.write(Files.localOutput(avroData)).schema(source.schema()).build()) { appender.addAll(expected); } // add the Avro data file to the source table - source.newAppend() - .appendFile(DataFiles.builder(PartitionSpec.unpartitioned()) - .withRecordCount(10) - .withInputFile(Files.localInput(avroData)) - .build()) + source + .newAppend() + .appendFile( + DataFiles.builder(PartitionSpec.unpartitioned()) + .withRecordCount(10) + .withInputFile(Files.localInput(avroData)) + .build()) .commit(); - Dataset sourceDF = spark.read().format("iceberg") - .option(SparkReadOptions.VECTORIZATION_ENABLED, String.valueOf(vectorized)) - .load(sourceLocation); + Dataset sourceDF = + spark + .read() + .format("iceberg") + .option(SparkReadOptions.VECTORIZATION_ENABLED, String.valueOf(vectorized)) + .load(sourceLocation); for (String column : columnNames) { String desc = "partition_by_" + SUPPORTED_PRIMITIVES.findType(column).toString(); @@ -290,16 +292,15 @@ public void testPartitionValueTypes() throws Exception { Table table = tables.create(SUPPORTED_PRIMITIVES, spec, location.toString()); table.updateProperties().set(TableProperties.DEFAULT_FILE_FORMAT, format).commit(); - sourceDF.write() - .format("iceberg") - .mode(SaveMode.Append) - .save(location.toString()); + sourceDF.write().format("iceberg").mode(SaveMode.Append).save(location.toString()); - List actual = spark.read() - .format("iceberg") - .option(SparkReadOptions.VECTORIZATION_ENABLED, String.valueOf(vectorized)) - .load(location.toString()) - .collectAsList(); + List actual = + spark + .read() + .format("iceberg") + .option(SparkReadOptions.VECTORIZATION_ENABLED, String.valueOf(vectorized)) + .load(location.toString()) + .collectAsList(); Assert.assertEquals("Number of rows should match", expected.size(), actual.size()); @@ -312,9 +313,10 @@ public void testPartitionValueTypes() throws Exception { @Test public void testNestedPartitionValues() throws Exception { - String[] columnNames = new String[] { - "b", "i", "l", "f", "d", "date", "ts", "s", "bytes", "dec_9_0", "dec_11_2", 
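
The reordered-column tests rely on two Spark write options; the write call in isolation, sketched with an assumed DataFrame, column names, and table location.

import org.apache.iceberg.spark.SparkWriteOptions;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SaveMode;

public class RelaxedWriteSketch {
  // Appends a DataFrame whose column order and nullability differ from the table schema,
  // with both Spark-side checks disabled via write options.
  static void appendReordered(Dataset<Row> df, String tableLocation) {
    df.select("data", "id")
        .write()
        .format("iceberg")
        .mode(SaveMode.Append)
        .option(SparkWriteOptions.CHECK_ORDERING, "false")
        .option(SparkWriteOptions.CHECK_NULLABILITY, "false")
        .save(tableLocation);
  }
}
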
"dec_38_10" - }; + String[] columnNames = + new String[] { + "b", "i", "l", "f", "d", "date", "ts", "s", "bytes", "dec_9_0", "dec_11_2", "dec_38_10" + }; HadoopTables tables = new HadoopTables(spark.sessionState().newHadoopConf()); Schema nestedSchema = new Schema(optional(1, "nested", SUPPORTED_PRIMITIVES.asStruct())); @@ -327,23 +329,27 @@ public void testNestedPartitionValues() throws Exception { List expected = RandomData.generateList(source.schema(), 2, 128735L); File avroData = temp.newFile("data.avro"); Assert.assertTrue(avroData.delete()); - try (FileAppender appender = Avro.write(Files.localOutput(avroData)) - .schema(source.schema()) - .build()) { + try (FileAppender appender = + Avro.write(Files.localOutput(avroData)).schema(source.schema()).build()) { appender.addAll(expected); } // add the Avro data file to the source table - source.newAppend() - .appendFile(DataFiles.builder(PartitionSpec.unpartitioned()) - .withRecordCount(10) - .withInputFile(Files.localInput(avroData)) - .build()) + source + .newAppend() + .appendFile( + DataFiles.builder(PartitionSpec.unpartitioned()) + .withRecordCount(10) + .withInputFile(Files.localInput(avroData)) + .build()) .commit(); - Dataset sourceDF = spark.read().format("iceberg") - .option(SparkReadOptions.VECTORIZATION_ENABLED, String.valueOf(vectorized)) - .load(sourceLocation); + Dataset sourceDF = + spark + .read() + .format("iceberg") + .option(SparkReadOptions.VECTORIZATION_ENABLED, String.valueOf(vectorized)) + .load(sourceLocation); for (String column : columnNames) { String desc = "partition_by_" + SUPPORTED_PRIMITIVES.findType(column).toString(); @@ -353,45 +359,46 @@ public void testNestedPartitionValues() throws Exception { File dataFolder = new File(location, "data"); Assert.assertTrue("mkdirs should succeed", dataFolder.mkdirs()); - PartitionSpec spec = PartitionSpec.builderFor(nestedSchema).identity("nested." + column).build(); + PartitionSpec spec = + PartitionSpec.builderFor(nestedSchema).identity("nested." + column).build(); Table table = tables.create(nestedSchema, spec, location.toString()); table.updateProperties().set(TableProperties.DEFAULT_FILE_FORMAT, format).commit(); - sourceDF.write() - .format("iceberg") - .mode(SaveMode.Append) - .save(location.toString()); + sourceDF.write().format("iceberg").mode(SaveMode.Append).save(location.toString()); - List actual = spark.read() - .format("iceberg") - .option(SparkReadOptions.VECTORIZATION_ENABLED, String.valueOf(vectorized)) - .load(location.toString()) - .collectAsList(); + List actual = + spark + .read() + .format("iceberg") + .option(SparkReadOptions.VECTORIZATION_ENABLED, String.valueOf(vectorized)) + .load(location.toString()) + .collectAsList(); Assert.assertEquals("Number of rows should match", expected.size(), actual.size()); for (int i = 0; i < expected.size(); i += 1) { - TestHelpers.assertEqualsSafe( - nestedSchema.asStruct(), expected.get(i), actual.get(i)); + TestHelpers.assertEqualsSafe(nestedSchema.asStruct(), expected.get(i), actual.get(i)); } } } /** * To verify if WrappedPositionAccessor is generated against a string field within a nested field, - * rather than a Position2Accessor. - * Or when building the partition path, a ClassCastException is thrown with the message like: - * Cannot cast org.apache.spark.unsafe.types.UTF8String to java.lang.CharSequence + * rather than a Position2Accessor. 
Or when building the partition path, a ClassCastException is + * thrown with the message like: Cannot cast org.apache.spark.unsafe.types.UTF8String to + * java.lang.CharSequence */ @Test public void testPartitionedByNestedString() throws Exception { // schema and partition spec - Schema nestedSchema = new Schema( - Types.NestedField.required(1, "struct", - Types.StructType.of(Types.NestedField.required(2, "string", Types.StringType.get())) - ) - ); + Schema nestedSchema = + new Schema( + Types.NestedField.required( + 1, + "struct", + Types.StructType.of( + Types.NestedField.required(2, "string", Types.StringType.get())))); PartitionSpec spec = PartitionSpec.builderFor(nestedSchema).identity("struct.string").build(); // create table @@ -401,14 +408,14 @@ public void testPartitionedByNestedString() throws Exception { // input data frame StructField[] structFields = { - new StructField("struct", - DataTypes.createStructType( - new StructField[] { - new StructField("string", DataTypes.StringType, false, Metadata.empty()) - } - ), - false, Metadata.empty() - ) + new StructField( + "struct", + DataTypes.createStructType( + new StructField[] { + new StructField("string", DataTypes.StringType, false, Metadata.empty()) + }), + false, + Metadata.empty()) }; List rows = Lists.newArrayList(); @@ -416,17 +423,16 @@ public void testPartitionedByNestedString() throws Exception { Dataset sourceDF = spark.createDataFrame(rows, new StructType(structFields)); // write into iceberg - sourceDF.write() - .format("iceberg") - .mode(SaveMode.Append) - .save(baseLocation); + sourceDF.write().format("iceberg").mode(SaveMode.Append).save(baseLocation); // verify - List actual = spark.read() - .format("iceberg") - .option(SparkReadOptions.VECTORIZATION_ENABLED, String.valueOf(vectorized)) - .load(baseLocation) - .collectAsList(); + List actual = + spark + .read() + .format("iceberg") + .option(SparkReadOptions.VECTORIZATION_ENABLED, String.valueOf(vectorized)) + .load(baseLocation) + .collectAsList(); Assert.assertEquals("Number of rows should match", rows.size(), actual.size()); } diff --git a/spark/v2.4/spark/src/test/java/org/apache/iceberg/spark/source/TestReadProjection.java b/spark/v2.4/spark/src/test/java/org/apache/iceberg/spark/source/TestReadProjection.java index 8d65b64cab6d..cfc746f6e932 100644 --- a/spark/v2.4/spark/src/test/java/org/apache/iceberg/spark/source/TestReadProjection.java +++ b/spark/v2.4/spark/src/test/java/org/apache/iceberg/spark/source/TestReadProjection.java @@ -16,9 +16,10 @@ * specific language governing permissions and limitations * under the License. 
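
The nested-partition test hinges on a dotted field name in the partition spec; a sketch of that table setup with HadoopTables, where the field IDs and location are illustrative and the SparkSession is assumed.

import static org.apache.iceberg.types.Types.NestedField.required;

import org.apache.iceberg.PartitionSpec;
import org.apache.iceberg.Schema;
import org.apache.iceberg.Table;
import org.apache.iceberg.hadoop.HadoopTables;
import org.apache.iceberg.types.Types;
import org.apache.spark.sql.SparkSession;

public class NestedIdentityPartitionSketch {
  // Creates a table partitioned by a string nested inside a struct, addressed by a dotted name.
  static Table createNestedPartitionedTable(SparkSession spark, String location) {
    Schema nestedSchema =
        new Schema(
            required(
                1,
                "struct",
                Types.StructType.of(required(2, "string", Types.StringType.get()))));

    PartitionSpec spec =
        PartitionSpec.builderFor(nestedSchema).identity("struct.string").build();

    HadoopTables tables = new HadoopTables(spark.sessionState().newHadoopConf());
    return tables.create(nestedSchema, spec, location);
  }
}
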
*/ - package org.apache.iceberg.spark.source; +import static org.apache.avro.Schema.Type.UNION; + import java.io.IOException; import java.util.List; import java.util.Map; @@ -37,8 +38,6 @@ import org.junit.Test; import org.junit.rules.TemporaryFolder; -import static org.apache.avro.Schema.Type.UNION; - public abstract class TestReadProjection { final String format; @@ -46,20 +45,17 @@ public abstract class TestReadProjection { this.format = format; } - protected abstract Record writeAndRead(String desc, - Schema writeSchema, - Schema readSchema, - Record record) throws IOException; + protected abstract Record writeAndRead( + String desc, Schema writeSchema, Schema readSchema, Record record) throws IOException; - @Rule - public TemporaryFolder temp = new TemporaryFolder(); + @Rule public TemporaryFolder temp = new TemporaryFolder(); @Test public void testFullProjection() throws Exception { - Schema schema = new Schema( - Types.NestedField.required(0, "id", Types.LongType.get()), - Types.NestedField.optional(1, "data", Types.StringType.get()) - ); + Schema schema = + new Schema( + Types.NestedField.required(0, "id", Types.LongType.get()), + Types.NestedField.optional(1, "data", Types.StringType.get())); Record record = GenericRecord.create(schema); record.setField("id", 34L); @@ -67,32 +63,33 @@ public void testFullProjection() throws Exception { Record projected = writeAndRead("full_projection", schema, schema, record); - Assert.assertEquals("Should contain the correct id value", 34L, (long) projected.getField("id")); + Assert.assertEquals( + "Should contain the correct id value", 34L, (long) projected.getField("id")); - int cmp = Comparators.charSequences() - .compare("test", (CharSequence) projected.getField("data")); + int cmp = + Comparators.charSequences().compare("test", (CharSequence) projected.getField("data")); Assert.assertEquals("Should contain the correct data value", 0, cmp); } @Test public void testReorderedFullProjection() throws Exception { -// Assume.assumeTrue( -// "Spark's Parquet read support does not support reordered columns", -// !format.equalsIgnoreCase("parquet")); + // Assume.assumeTrue( + // "Spark's Parquet read support does not support reordered columns", + // !format.equalsIgnoreCase("parquet")); - Schema schema = new Schema( - Types.NestedField.required(0, "id", Types.LongType.get()), - Types.NestedField.optional(1, "data", Types.StringType.get()) - ); + Schema schema = + new Schema( + Types.NestedField.required(0, "id", Types.LongType.get()), + Types.NestedField.optional(1, "data", Types.StringType.get())); Record record = GenericRecord.create(schema); record.setField("id", 34L); record.setField("data", "test"); - Schema reordered = new Schema( - Types.NestedField.optional(1, "data", Types.StringType.get()), - Types.NestedField.required(0, "id", Types.LongType.get()) - ); + Schema reordered = + new Schema( + Types.NestedField.optional(1, "data", Types.StringType.get()), + Types.NestedField.required(0, "id", Types.LongType.get())); Record projected = writeAndRead("reordered_full_projection", schema, reordered, record); @@ -102,24 +99,24 @@ public void testReorderedFullProjection() throws Exception { @Test public void testReorderedProjection() throws Exception { -// Assume.assumeTrue( -// "Spark's Parquet read support does not support reordered columns", -// !format.equalsIgnoreCase("parquet")); + // Assume.assumeTrue( + // "Spark's Parquet read support does not support reordered columns", + // !format.equalsIgnoreCase("parquet")); - Schema schema = new Schema( 
- Types.NestedField.required(0, "id", Types.LongType.get()), - Types.NestedField.optional(1, "data", Types.StringType.get()) - ); + Schema schema = + new Schema( + Types.NestedField.required(0, "id", Types.LongType.get()), + Types.NestedField.optional(1, "data", Types.StringType.get())); Record record = GenericRecord.create(schema); record.setField("id", 34L); record.setField("data", "test"); - Schema reordered = new Schema( - Types.NestedField.optional(2, "missing_1", Types.StringType.get()), - Types.NestedField.optional(1, "data", Types.StringType.get()), - Types.NestedField.optional(3, "missing_2", Types.LongType.get()) - ); + Schema reordered = + new Schema( + Types.NestedField.optional(2, "missing_1", Types.StringType.get()), + Types.NestedField.optional(1, "data", Types.StringType.get()), + Types.NestedField.optional(3, "missing_2", Types.LongType.get())); Record projected = writeAndRead("reordered_projection", schema, reordered, record); @@ -130,10 +127,10 @@ public void testReorderedProjection() throws Exception { @Test public void testEmptyProjection() throws Exception { - Schema schema = new Schema( - Types.NestedField.required(0, "id", Types.LongType.get()), - Types.NestedField.optional(1, "data", Types.StringType.get()) - ); + Schema schema = + new Schema( + Types.NestedField.required(0, "id", Types.LongType.get()), + Types.NestedField.optional(1, "data", Types.StringType.get())); Record record = GenericRecord.create(schema); record.setField("id", 34L); @@ -152,68 +149,68 @@ public void testEmptyProjection() throws Exception { @Test public void testBasicProjection() throws Exception { - Schema writeSchema = new Schema( - Types.NestedField.required(0, "id", Types.LongType.get()), - Types.NestedField.optional(1, "data", Types.StringType.get()) - ); + Schema writeSchema = + new Schema( + Types.NestedField.required(0, "id", Types.LongType.get()), + Types.NestedField.optional(1, "data", Types.StringType.get())); Record record = GenericRecord.create(writeSchema); record.setField("id", 34L); record.setField("data", "test"); - Schema idOnly = new Schema( - Types.NestedField.required(0, "id", Types.LongType.get()) - ); + Schema idOnly = new Schema(Types.NestedField.required(0, "id", Types.LongType.get())); Record projected = writeAndRead("basic_projection_id", writeSchema, idOnly, record); Assert.assertNull("Should not project data", projected.getField("data")); - Assert.assertEquals("Should contain the correct id value", 34L, (long) projected.getField("id")); + Assert.assertEquals( + "Should contain the correct id value", 34L, (long) projected.getField("id")); - Schema dataOnly = new Schema( - Types.NestedField.optional(1, "data", Types.StringType.get()) - ); + Schema dataOnly = new Schema(Types.NestedField.optional(1, "data", Types.StringType.get())); projected = writeAndRead("basic_projection_data", writeSchema, dataOnly, record); Assert.assertNull("Should not project id", projected.getField("id")); - int cmp = Comparators.charSequences() - .compare("test", (CharSequence) projected.getField("data")); + int cmp = + Comparators.charSequences().compare("test", (CharSequence) projected.getField("data")); Assert.assertEquals("Should contain the correct data value", 0, cmp); } @Test public void testRename() throws Exception { - Schema writeSchema = new Schema( - Types.NestedField.required(0, "id", Types.LongType.get()), - Types.NestedField.optional(1, "data", Types.StringType.get()) - ); + Schema writeSchema = + new Schema( + Types.NestedField.required(0, "id", Types.LongType.get()), + 
Types.NestedField.optional(1, "data", Types.StringType.get())); Record record = GenericRecord.create(writeSchema); record.setField("id", 34L); record.setField("data", "test"); - Schema readSchema = new Schema( - Types.NestedField.required(0, "id", Types.LongType.get()), - Types.NestedField.optional(1, "renamed", Types.StringType.get()) - ); + Schema readSchema = + new Schema( + Types.NestedField.required(0, "id", Types.LongType.get()), + Types.NestedField.optional(1, "renamed", Types.StringType.get())); Record projected = writeAndRead("project_and_rename", writeSchema, readSchema, record); - Assert.assertEquals("Should contain the correct id value", 34L, (long) projected.getField("id")); - int cmp = Comparators.charSequences() - .compare("test", (CharSequence) projected.getField("renamed")); + Assert.assertEquals( + "Should contain the correct id value", 34L, (long) projected.getField("id")); + int cmp = + Comparators.charSequences().compare("test", (CharSequence) projected.getField("renamed")); Assert.assertEquals("Should contain the correct data/renamed value", 0, cmp); } @Test public void testNestedStructProjection() throws Exception { - Schema writeSchema = new Schema( - Types.NestedField.required(0, "id", Types.LongType.get()), - Types.NestedField.optional(3, "location", Types.StructType.of( - Types.NestedField.required(1, "lat", Types.FloatType.get()), - Types.NestedField.required(2, "long", Types.FloatType.get()) - )) - ); + Schema writeSchema = + new Schema( + Types.NestedField.required(0, "id", Types.LongType.get()), + Types.NestedField.optional( + 3, + "location", + Types.StructType.of( + Types.NestedField.required(1, "lat", Types.FloatType.get()), + Types.NestedField.required(2, "long", Types.FloatType.get())))); Record record = GenericRecord.create(writeSchema); record.setField("id", 34L); @@ -222,61 +219,76 @@ public void testNestedStructProjection() throws Exception { location.setField("long", -1.539054f); record.setField("location", location); - Schema idOnly = new Schema( - Types.NestedField.required(0, "id", Types.LongType.get()) - ); + Schema idOnly = new Schema(Types.NestedField.required(0, "id", Types.LongType.get())); Record projected = writeAndRead("id_only", writeSchema, idOnly, record); Record projectedLocation = (Record) projected.getField("location"); - Assert.assertEquals("Should contain the correct id value", 34L, (long) projected.getField("id")); + Assert.assertEquals( + "Should contain the correct id value", 34L, (long) projected.getField("id")); Assert.assertNull("Should not project location", projectedLocation); - Schema latOnly = new Schema( - Types.NestedField.optional(3, "location", Types.StructType.of( - Types.NestedField.required(1, "lat", Types.FloatType.get()) - )) - ); + Schema latOnly = + new Schema( + Types.NestedField.optional( + 3, + "location", + Types.StructType.of(Types.NestedField.required(1, "lat", Types.FloatType.get())))); projected = writeAndRead("latitude_only", writeSchema, latOnly, record); projectedLocation = (Record) projected.getField("location"); Assert.assertNull("Should not project id", projected.getField("id")); Assert.assertNotNull("Should project location", projected.getField("location")); Assert.assertNull("Should not project longitude", projectedLocation.getField("long")); - Assert.assertEquals("Should project latitude", - 52.995143f, (float) projectedLocation.getField("lat"), 0.000001f); - - Schema longOnly = new Schema( - Types.NestedField.optional(3, "location", Types.StructType.of( - Types.NestedField.required(2, "long", 
Types.FloatType.get()) - )) - ); + Assert.assertEquals( + "Should project latitude", + 52.995143f, + (float) projectedLocation.getField("lat"), + 0.000001f); + + Schema longOnly = + new Schema( + Types.NestedField.optional( + 3, + "location", + Types.StructType.of(Types.NestedField.required(2, "long", Types.FloatType.get())))); projected = writeAndRead("longitude_only", writeSchema, longOnly, record); projectedLocation = (Record) projected.getField("location"); Assert.assertNull("Should not project id", projected.getField("id")); Assert.assertNotNull("Should project location", projected.getField("location")); Assert.assertNull("Should not project latitutde", projectedLocation.getField("lat")); - Assert.assertEquals("Should project longitude", - -1.539054f, (float) projectedLocation.getField("long"), 0.000001f); + Assert.assertEquals( + "Should project longitude", + -1.539054f, + (float) projectedLocation.getField("long"), + 0.000001f); Schema locationOnly = writeSchema.select("location"); projected = writeAndRead("location_only", writeSchema, locationOnly, record); projectedLocation = (Record) projected.getField("location"); Assert.assertNull("Should not project id", projected.getField("id")); Assert.assertNotNull("Should project location", projected.getField("location")); - Assert.assertEquals("Should project latitude", - 52.995143f, (float) projectedLocation.getField("lat"), 0.000001f); - Assert.assertEquals("Should project longitude", - -1.539054f, (float) projectedLocation.getField("long"), 0.000001f); + Assert.assertEquals( + "Should project latitude", + 52.995143f, + (float) projectedLocation.getField("lat"), + 0.000001f); + Assert.assertEquals( + "Should project longitude", + -1.539054f, + (float) projectedLocation.getField("long"), + 0.000001f); } @Test public void testMapProjection() throws IOException { - Schema writeSchema = new Schema( - Types.NestedField.required(0, "id", Types.LongType.get()), - Types.NestedField.optional(5, "properties", - Types.MapType.ofOptional(6, 7, Types.StringType.get(), Types.StringType.get())) - ); + Schema writeSchema = + new Schema( + Types.NestedField.required(0, "id", Types.LongType.get()), + Types.NestedField.optional( + 5, + "properties", + Types.MapType.ofOptional(6, 7, Types.StringType.get(), Types.StringType.get()))); Map properties = ImmutableMap.of("a", "A", "b", "B"); @@ -284,31 +296,36 @@ public void testMapProjection() throws IOException { record.setField("id", 34L); record.setField("properties", properties); - Schema idOnly = new Schema( - Types.NestedField.required(0, "id", Types.LongType.get()) - ); + Schema idOnly = new Schema(Types.NestedField.required(0, "id", Types.LongType.get())); Record projected = writeAndRead("id_only", writeSchema, idOnly, record); - Assert.assertEquals("Should contain the correct id value", 34L, (long) projected.getField("id")); + Assert.assertEquals( + "Should contain the correct id value", 34L, (long) projected.getField("id")); Assert.assertNull("Should not project properties map", projected.getField("properties")); Schema keyOnly = writeSchema.select("properties.key"); projected = writeAndRead("key_only", writeSchema, keyOnly, record); Assert.assertNull("Should not project id", projected.getField("id")); - Assert.assertEquals("Should project entire map", - properties, toStringMap((Map) projected.getField("properties"))); + Assert.assertEquals( + "Should project entire map", + properties, + toStringMap((Map) projected.getField("properties"))); Schema valueOnly = writeSchema.select("properties.value"); 
projected = writeAndRead("value_only", writeSchema, valueOnly, record); Assert.assertNull("Should not project id", projected.getField("id")); - Assert.assertEquals("Should project entire map", - properties, toStringMap((Map) projected.getField("properties"))); + Assert.assertEquals( + "Should project entire map", + properties, + toStringMap((Map) projected.getField("properties"))); Schema mapOnly = writeSchema.select("properties"); projected = writeAndRead("map_only", writeSchema, mapOnly, record); Assert.assertNull("Should not project id", projected.getField("id")); - Assert.assertEquals("Should project entire map", - properties, toStringMap((Map) projected.getField("properties"))); + Assert.assertEquals( + "Should project entire map", + properties, + toStringMap((Map) projected.getField("properties"))); } private Map toStringMap(Map map) { @@ -325,16 +342,19 @@ public void testMapProjection() throws IOException { @Test public void testMapOfStructsProjection() throws IOException { - Schema writeSchema = new Schema( - Types.NestedField.required(0, "id", Types.LongType.get()), - Types.NestedField.optional(5, "locations", Types.MapType.ofOptional(6, 7, - Types.StringType.get(), - Types.StructType.of( - Types.NestedField.required(1, "lat", Types.FloatType.get()), - Types.NestedField.required(2, "long", Types.FloatType.get()) - ) - )) - ); + Schema writeSchema = + new Schema( + Types.NestedField.required(0, "id", Types.LongType.get()), + Types.NestedField.optional( + 5, + "locations", + Types.MapType.ofOptional( + 6, + 7, + Types.StringType.get(), + Types.StructType.of( + Types.NestedField.required(1, "lat", Types.FloatType.get()), + Types.NestedField.required(2, "long", Types.FloatType.get()))))); Record record = GenericRecord.create(writeSchema); record.setField("id", 34L); @@ -346,91 +366,100 @@ public void testMapOfStructsProjection() throws IOException { l2.setField("long", -1.539054f); record.setField("locations", ImmutableMap.of("L1", l1, "L2", l2)); - Schema idOnly = new Schema( - Types.NestedField.required(0, "id", Types.LongType.get()) - ); + Schema idOnly = new Schema(Types.NestedField.required(0, "id", Types.LongType.get())); Record projected = writeAndRead("id_only", writeSchema, idOnly, record); - Assert.assertEquals("Should contain the correct id value", 34L, (long) projected.getField("id")); + Assert.assertEquals( + "Should contain the correct id value", 34L, (long) projected.getField("id")); Assert.assertNull("Should not project locations map", projected.getField("locations")); projected = writeAndRead("all_locations", writeSchema, writeSchema.select("locations"), record); Assert.assertNull("Should not project id", projected.getField("id")); - Assert.assertEquals("Should project locations map", - record.getField("locations"), toStringMap((Map) projected.getField("locations"))); + Assert.assertEquals( + "Should project locations map", + record.getField("locations"), + toStringMap((Map) projected.getField("locations"))); - projected = writeAndRead("lat_only", - writeSchema, writeSchema.select("locations.lat"), record); + projected = writeAndRead("lat_only", writeSchema, writeSchema.select("locations.lat"), record); Assert.assertNull("Should not project id", projected.getField("id")); Map locations = toStringMap((Map) projected.getField("locations")); Assert.assertNotNull("Should project locations map", locations); - Assert.assertEquals("Should contain L1 and L2", - Sets.newHashSet("L1", "L2"), locations.keySet()); + Assert.assertEquals( + "Should contain L1 and L2", 
Sets.newHashSet("L1", "L2"), locations.keySet()); Record projectedL1 = (Record) locations.get("L1"); Assert.assertNotNull("L1 should not be null", projectedL1); - Assert.assertEquals("L1 should contain lat", - 53.992811f, (float) projectedL1.getField("lat"), 0.000001); + Assert.assertEquals( + "L1 should contain lat", 53.992811f, (float) projectedL1.getField("lat"), 0.000001); Assert.assertNull("L1 should not contain long", projectedL1.getField("long")); Record projectedL2 = (Record) locations.get("L2"); Assert.assertNotNull("L2 should not be null", projectedL2); - Assert.assertEquals("L2 should contain lat", - 52.995143f, (float) projectedL2.getField("lat"), 0.000001); + Assert.assertEquals( + "L2 should contain lat", 52.995143f, (float) projectedL2.getField("lat"), 0.000001); Assert.assertNull("L2 should not contain long", projectedL2.getField("long")); - projected = writeAndRead("long_only", - writeSchema, writeSchema.select("locations.long"), record); + projected = + writeAndRead("long_only", writeSchema, writeSchema.select("locations.long"), record); Assert.assertNull("Should not project id", projected.getField("id")); locations = toStringMap((Map) projected.getField("locations")); Assert.assertNotNull("Should project locations map", locations); - Assert.assertEquals("Should contain L1 and L2", - Sets.newHashSet("L1", "L2"), locations.keySet()); + Assert.assertEquals( + "Should contain L1 and L2", Sets.newHashSet("L1", "L2"), locations.keySet()); projectedL1 = (Record) locations.get("L1"); Assert.assertNotNull("L1 should not be null", projectedL1); Assert.assertNull("L1 should not contain lat", projectedL1.getField("lat")); - Assert.assertEquals("L1 should contain long", - -1.542616f, (float) projectedL1.getField("long"), 0.000001); + Assert.assertEquals( + "L1 should contain long", -1.542616f, (float) projectedL1.getField("long"), 0.000001); projectedL2 = (Record) locations.get("L2"); Assert.assertNotNull("L2 should not be null", projectedL2); Assert.assertNull("L2 should not contain lat", projectedL2.getField("lat")); - Assert.assertEquals("L2 should contain long", - -1.539054f, (float) projectedL2.getField("long"), 0.000001); - - Schema latitiudeRenamed = new Schema( - Types.NestedField.optional(5, "locations", Types.MapType.ofOptional(6, 7, - Types.StringType.get(), - Types.StructType.of( - Types.NestedField.required(1, "latitude", Types.FloatType.get()) - ) - )) - ); + Assert.assertEquals( + "L2 should contain long", -1.539054f, (float) projectedL2.getField("long"), 0.000001); + + Schema latitiudeRenamed = + new Schema( + Types.NestedField.optional( + 5, + "locations", + Types.MapType.ofOptional( + 6, + 7, + Types.StringType.get(), + Types.StructType.of( + Types.NestedField.required(1, "latitude", Types.FloatType.get()))))); projected = writeAndRead("latitude_renamed", writeSchema, latitiudeRenamed, record); Assert.assertNull("Should not project id", projected.getField("id")); locations = toStringMap((Map) projected.getField("locations")); Assert.assertNotNull("Should project locations map", locations); - Assert.assertEquals("Should contain L1 and L2", - Sets.newHashSet("L1", "L2"), locations.keySet()); + Assert.assertEquals( + "Should contain L1 and L2", Sets.newHashSet("L1", "L2"), locations.keySet()); projectedL1 = (Record) locations.get("L1"); Assert.assertNotNull("L1 should not be null", projectedL1); - Assert.assertEquals("L1 should contain latitude", - 53.992811f, (float) projectedL1.getField("latitude"), 0.000001); + Assert.assertEquals( + "L1 should contain latitude", + 
53.992811f, + (float) projectedL1.getField("latitude"), + 0.000001); Assert.assertNull("L1 should not contain lat", projectedL1.getField("lat")); Assert.assertNull("L1 should not contain long", projectedL1.getField("long")); projectedL2 = (Record) locations.get("L2"); Assert.assertNotNull("L2 should not be null", projectedL2); - Assert.assertEquals("L2 should contain latitude", - 52.995143f, (float) projectedL2.getField("latitude"), 0.000001); + Assert.assertEquals( + "L2 should contain latitude", + 52.995143f, + (float) projectedL2.getField("latitude"), + 0.000001); Assert.assertNull("L2 should not contain lat", projectedL2.getField("lat")); Assert.assertNull("L2 should not contain long", projectedL2.getField("long")); } @Test public void testListProjection() throws IOException { - Schema writeSchema = new Schema( - Types.NestedField.required(0, "id", Types.LongType.get()), - Types.NestedField.optional(10, "values", - Types.ListType.ofOptional(11, Types.LongType.get())) - ); + Schema writeSchema = + new Schema( + Types.NestedField.required(0, "id", Types.LongType.get()), + Types.NestedField.optional( + 10, "values", Types.ListType.ofOptional(11, Types.LongType.get()))); List values = ImmutableList.of(56L, 57L, 58L); @@ -438,12 +467,11 @@ public void testListProjection() throws IOException { record.setField("id", 34L); record.setField("values", values); - Schema idOnly = new Schema( - Types.NestedField.required(0, "id", Types.LongType.get()) - ); + Schema idOnly = new Schema(Types.NestedField.required(0, "id", Types.LongType.get())); Record projected = writeAndRead("id_only", writeSchema, idOnly, record); - Assert.assertEquals("Should contain the correct id value", 34L, (long) projected.getField("id")); + Assert.assertEquals( + "Should contain the correct id value", 34L, (long) projected.getField("id")); Assert.assertNull("Should not project values list", projected.getField("values")); Schema elementOnly = writeSchema.select("values.element"); @@ -460,15 +488,17 @@ public void testListProjection() throws IOException { @Test @SuppressWarnings("unchecked") public void testListOfStructsProjection() throws IOException { - Schema writeSchema = new Schema( - Types.NestedField.required(0, "id", Types.LongType.get()), - Types.NestedField.optional(22, "points", - Types.ListType.ofOptional(21, Types.StructType.of( - Types.NestedField.required(19, "x", Types.IntegerType.get()), - Types.NestedField.optional(18, "y", Types.IntegerType.get()) - )) - ) - ); + Schema writeSchema = + new Schema( + Types.NestedField.required(0, "id", Types.LongType.get()), + Types.NestedField.optional( + 22, + "points", + Types.ListType.ofOptional( + 21, + Types.StructType.of( + Types.NestedField.required(19, "x", Types.IntegerType.get()), + Types.NestedField.optional(18, "y", Types.IntegerType.get()))))); Record record = GenericRecord.create(writeSchema); record.setField("id", 34L); @@ -480,18 +510,17 @@ public void testListOfStructsProjection() throws IOException { p2.setField("y", null); record.setField("points", ImmutableList.of(p1, p2)); - Schema idOnly = new Schema( - Types.NestedField.required(0, "id", Types.LongType.get()) - ); + Schema idOnly = new Schema(Types.NestedField.required(0, "id", Types.LongType.get())); Record projected = writeAndRead("id_only", writeSchema, idOnly, record); - Assert.assertEquals("Should contain the correct id value", 34L, (long) projected.getField("id")); + Assert.assertEquals( + "Should contain the correct id value", 34L, (long) projected.getField("id")); Assert.assertNull("Should not 
project points list", projected.getField("points")); projected = writeAndRead("all_points", writeSchema, writeSchema.select("points"), record); Assert.assertNull("Should not project id", projected.getField("id")); - Assert.assertEquals("Should project points list", - record.getField("points"), projected.getField("points")); + Assert.assertEquals( + "Should project points list", record.getField("points"), projected.getField("points")); projected = writeAndRead("x_only", writeSchema, writeSchema.select("points.x"), record); Assert.assertNull("Should not project id", projected.getField("id")); @@ -517,13 +546,15 @@ public void testListOfStructsProjection() throws IOException { Assert.assertNull("Should not project x", projectedP2.getField("x")); Assert.assertNull("Should project null y", projectedP2.getField("y")); - Schema yRenamed = new Schema( - Types.NestedField.optional(22, "points", - Types.ListType.ofOptional(21, Types.StructType.of( - Types.NestedField.optional(18, "z", Types.IntegerType.get()) - )) - ) - ); + Schema yRenamed = + new Schema( + Types.NestedField.optional( + 22, + "points", + Types.ListType.ofOptional( + 21, + Types.StructType.of( + Types.NestedField.optional(18, "z", Types.IntegerType.get()))))); projected = writeAndRead("y_renamed", writeSchema, yRenamed, record); Assert.assertNull("Should not project id", projected.getField("id")); @@ -539,15 +570,17 @@ public void testListOfStructsProjection() throws IOException { Assert.assertNull("Should not project y", projectedP2.getField("y")); Assert.assertNull("Should project null z", projectedP2.getField("z")); - Schema zAdded = new Schema( - Types.NestedField.optional(22, "points", - Types.ListType.ofOptional(21, Types.StructType.of( - Types.NestedField.required(19, "x", Types.IntegerType.get()), - Types.NestedField.optional(18, "y", Types.IntegerType.get()), - Types.NestedField.optional(20, "z", Types.IntegerType.get()) - )) - ) - ); + Schema zAdded = + new Schema( + Types.NestedField.optional( + 22, + "points", + Types.ListType.ofOptional( + 21, + Types.StructType.of( + Types.NestedField.required(19, "x", Types.IntegerType.get()), + Types.NestedField.optional(18, "y", Types.IntegerType.get()), + Types.NestedField.optional(20, "z", Types.IntegerType.get()))))); projected = writeAndRead("z_added", writeSchema, zAdded, record); Assert.assertNull("Should not project id", projected.getField("id")); @@ -565,10 +598,10 @@ public void testListOfStructsProjection() throws IOException { } private static org.apache.avro.Schema fromOption(org.apache.avro.Schema schema) { - Preconditions.checkArgument(schema.getType() == UNION, - "Expected union schema but was passed: %s", schema); - Preconditions.checkArgument(schema.getTypes().size() == 2, - "Expected optional schema, but was passed: %s", schema); + Preconditions.checkArgument( + schema.getType() == UNION, "Expected union schema but was passed: %s", schema); + Preconditions.checkArgument( + schema.getTypes().size() == 2, "Expected optional schema, but was passed: %s", schema); if (schema.getTypes().get(0).getType() == org.apache.avro.Schema.Type.NULL) { return schema.getTypes().get(1); } else { diff --git a/spark/v2.4/spark/src/test/java/org/apache/iceberg/spark/source/TestSelect.java b/spark/v2.4/spark/src/test/java/org/apache/iceberg/spark/source/TestSelect.java index 5df767ecac64..2ecab364cdfe 100644 --- a/spark/v2.4/spark/src/test/java/org/apache/iceberg/spark/source/TestSelect.java +++ b/spark/v2.4/spark/src/test/java/org/apache/iceberg/spark/source/TestSelect.java @@ -16,9 
+16,10 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.source; +import static org.apache.iceberg.types.Types.NestedField.optional; + import java.io.File; import java.io.Serializable; import java.util.List; @@ -48,15 +49,13 @@ import org.junit.Test; import org.junit.rules.TemporaryFolder; -import static org.apache.iceberg.types.Types.NestedField.optional; - public class TestSelect { private static final HadoopTables TABLES = new HadoopTables(new Configuration()); - private static final Schema SCHEMA = new Schema( - optional(1, "id", Types.IntegerType.get()), - optional(2, "data", Types.StringType.get()), - optional(3, "doubleVal", Types.DoubleType.get()) - ); + private static final Schema SCHEMA = + new Schema( + optional(1, "id", Types.IntegerType.get()), + optional(2, "data", Types.StringType.get()), + optional(3, "doubleVal", Types.DoubleType.get())); private static SparkSession spark; @@ -66,17 +65,17 @@ public class TestSelect { private Table table; static { - Listeners.register(event -> { - scanEventCount += 1; - lastScanEvent = event; - }, ScanEvent.class); + Listeners.register( + event -> { + scanEventCount += 1; + lastScanEvent = event; + }, + ScanEvent.class); } @BeforeClass public static void startSpark() { - spark = SparkSession.builder() - .master("local[2]") - .getOrCreate(); + spark = SparkSession.builder().master("local[2]").getOrCreate(); } @AfterClass @@ -86,8 +85,7 @@ public static void stopSpark() { currentSpark.stop(); } - @Rule - public TemporaryFolder temp = new TemporaryFolder(); + @Rule public TemporaryFolder temp = new TemporaryFolder(); private String tableLocation = null; @@ -98,24 +96,21 @@ public void init() throws Exception { table = TABLES.create(SCHEMA, tableLocation); - List rows = Lists.newArrayList( - new Record(1, "a", 1.0), - new Record(2, "b", 2.0), - new Record(3, "c", Double.NaN) - ); + List rows = + Lists.newArrayList( + new Record(1, "a", 1.0), new Record(2, "b", 2.0), new Record(3, "c", Double.NaN)); Dataset df = spark.createDataFrame(rows, Record.class); - df.select("id", "data", "doubleVal").write() + df.select("id", "data", "doubleVal") + .write() .format("iceberg") .mode("append") .save(tableLocation); table.refresh(); - Dataset results = spark.read() - .format("iceberg") - .load(tableLocation); + Dataset results = spark.read().format("iceberg").load(tableLocation); results.createOrReplaceTempView("table"); scanEventCount = 0; @@ -124,10 +119,13 @@ public void init() throws Exception { @Test public void testSelect() { - List expected = ImmutableList.of( - new Record(1, "a", 1.0), new Record(2, "b", 2.0), new Record(3, "c", Double.NaN)); + List expected = + ImmutableList.of( + new Record(1, "a", 1.0), new Record(2, "b", 2.0), new Record(3, "c", Double.NaN)); - Assert.assertEquals("Should return all expected rows", expected, + Assert.assertEquals( + "Should return all expected rows", + expected, sql("select * from table", Encoders.bean(Record.class))); } @@ -135,7 +133,9 @@ public void testSelect() { public void testSelectRewrite() { List expected = ImmutableList.of(new Record(3, "c", Double.NaN)); - Assert.assertEquals("Should return all expected rows", expected, + Assert.assertEquals( + "Should return all expected rows", + expected, sql("SELECT * FROM table where doubleVal = double('NaN')", Encoders.bean(Record.class))); Assert.assertEquals("Should create only one scan", 1, scanEventCount); @@ -144,13 +144,15 @@ public void testSelectRewrite() { Expression left = ((And) 
filter).left(); Expression right = ((And) filter).right(); - Assert.assertEquals("Left expression should be NOT_NULL", - Expression.Operation.NOT_NULL, left.op()); - Assert.assertTrue("Left expression should contain column name 'doubleVal'", + Assert.assertEquals( + "Left expression should be NOT_NULL", Expression.Operation.NOT_NULL, left.op()); + Assert.assertTrue( + "Left expression should contain column name 'doubleVal'", left.toString().contains("doubleVal")); - Assert.assertEquals("Right expression should be IS_NAN", - Expression.Operation.IS_NAN, right.op()); - Assert.assertTrue("Right expression should contain column name 'doubleVal'", + Assert.assertEquals( + "Right expression should be IS_NAN", Expression.Operation.IS_NAN, right.op()); + Assert.assertTrue( + "Right expression should contain column name 'doubleVal'", right.toString().contains("doubleVal")); } @@ -158,11 +160,14 @@ public void testSelectRewrite() { public void testProjection() { List expected = ImmutableList.of(1, 2, 3); - Assert.assertEquals("Should return all expected rows", expected, sql("SELECT id FROM table", Encoders.INT())); + Assert.assertEquals( + "Should return all expected rows", expected, sql("SELECT id FROM table", Encoders.INT())); Assert.assertEquals("Should create only one scan", 1, scanEventCount); - Assert.assertEquals("Should not push down a filter", Expressions.alwaysTrue(), lastScanEvent.filter()); - Assert.assertEquals("Should project only the id column", + Assert.assertEquals( + "Should not push down a filter", Expressions.alwaysTrue(), lastScanEvent.filter()); + Assert.assertEquals( + "Should project only the id column", table.schema().select("id").asStruct(), lastScanEvent.projection().asStruct()); } @@ -171,11 +176,14 @@ public void testProjection() { public void testExpressionPushdown() { List expected = ImmutableList.of("b"); - Assert.assertEquals("Should return all expected rows", expected, + Assert.assertEquals( + "Should return all expected rows", + expected, sql("SELECT data FROM table WHERE id = 2", Encoders.STRING())); Assert.assertEquals("Should create only one scan", 1, scanEventCount); - Assert.assertEquals("Should project only id and data columns", + Assert.assertEquals( + "Should project only id and data columns", table.schema().select("id", "data").asStruct(), lastScanEvent.projection().asStruct()); } @@ -189,8 +197,7 @@ public static class Record implements Serializable { private String data; private Double doubleVal; - public Record() { - } + public Record() {} Record(Integer id, String data, Double doubleVal) { this.id = id; @@ -232,8 +239,9 @@ public boolean equals(Object o) { } Record record = (Record) o; - return Objects.equal(id, record.id) && Objects.equal(data, record.data) && - Objects.equal(doubleVal, record.doubleVal); + return Objects.equal(id, record.id) + && Objects.equal(data, record.data) + && Objects.equal(doubleVal, record.doubleVal); } @Override diff --git a/spark/v2.4/spark/src/test/java/org/apache/iceberg/spark/source/TestSnapshotSelection.java b/spark/v2.4/spark/src/test/java/org/apache/iceberg/spark/source/TestSnapshotSelection.java index f5e567adb3a5..26645167f6af 100644 --- a/spark/v2.4/spark/src/test/java/org/apache/iceberg/spark/source/TestSnapshotSelection.java +++ b/spark/v2.4/spark/src/test/java/org/apache/iceberg/spark/source/TestSnapshotSelection.java @@ -16,9 +16,10 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.spark.source; +import static org.apache.iceberg.types.Types.NestedField.optional; + import java.io.IOException; import java.util.List; import org.apache.hadoop.conf.Configuration; @@ -43,18 +44,14 @@ import org.junit.Test; import org.junit.rules.TemporaryFolder; -import static org.apache.iceberg.types.Types.NestedField.optional; - public class TestSnapshotSelection { private static final Configuration CONF = new Configuration(); - private static final Schema SCHEMA = new Schema( - optional(1, "id", Types.IntegerType.get()), - optional(2, "data", Types.StringType.get()) - ); + private static final Schema SCHEMA = + new Schema( + optional(1, "id", Types.IntegerType.get()), optional(2, "data", Types.StringType.get())); - @Rule - public TemporaryFolder temp = new TemporaryFolder(); + @Rule public TemporaryFolder temp = new TemporaryFolder(); private static SparkSession spark = null; @@ -79,48 +76,40 @@ public void testSnapshotSelectionById() throws IOException { Table table = tables.create(SCHEMA, spec, tableLocation); // produce the first snapshot - List firstBatchRecords = Lists.newArrayList( - new SimpleRecord(1, "a"), - new SimpleRecord(2, "b"), - new SimpleRecord(3, "c") - ); + List firstBatchRecords = + Lists.newArrayList( + new SimpleRecord(1, "a"), new SimpleRecord(2, "b"), new SimpleRecord(3, "c")); Dataset firstDf = spark.createDataFrame(firstBatchRecords, SimpleRecord.class); firstDf.select("id", "data").write().format("iceberg").mode("append").save(tableLocation); // produce the second snapshot - List secondBatchRecords = Lists.newArrayList( - new SimpleRecord(4, "d"), - new SimpleRecord(5, "e"), - new SimpleRecord(6, "f") - ); + List secondBatchRecords = + Lists.newArrayList( + new SimpleRecord(4, "d"), new SimpleRecord(5, "e"), new SimpleRecord(6, "f")); Dataset secondDf = spark.createDataFrame(secondBatchRecords, SimpleRecord.class); secondDf.select("id", "data").write().format("iceberg").mode("append").save(tableLocation); Assert.assertEquals("Expected 2 snapshots", 2, Iterables.size(table.snapshots())); // verify records in the current snapshot - Dataset currentSnapshotResult = spark.read() - .format("iceberg") - .load(tableLocation); - List currentSnapshotRecords = currentSnapshotResult.orderBy("id") - .as(Encoders.bean(SimpleRecord.class)) - .collectAsList(); + Dataset currentSnapshotResult = spark.read().format("iceberg").load(tableLocation); + List currentSnapshotRecords = + currentSnapshotResult.orderBy("id").as(Encoders.bean(SimpleRecord.class)).collectAsList(); List expectedRecords = Lists.newArrayList(); expectedRecords.addAll(firstBatchRecords); expectedRecords.addAll(secondBatchRecords); - Assert.assertEquals("Current snapshot rows should match", expectedRecords, currentSnapshotRecords); + Assert.assertEquals( + "Current snapshot rows should match", expectedRecords, currentSnapshotRecords); // verify records in the previous snapshot Snapshot currentSnapshot = table.currentSnapshot(); Long parentSnapshotId = currentSnapshot.parentId(); - Dataset previousSnapshotResult = spark.read() - .format("iceberg") - .option("snapshot-id", parentSnapshotId) - .load(tableLocation); - List previousSnapshotRecords = previousSnapshotResult.orderBy("id") - .as(Encoders.bean(SimpleRecord.class)) - .collectAsList(); - Assert.assertEquals("Previous snapshot rows should match", firstBatchRecords, previousSnapshotRecords); + Dataset previousSnapshotResult = + spark.read().format("iceberg").option("snapshot-id", parentSnapshotId).load(tableLocation); + List 
previousSnapshotRecords = + previousSnapshotResult.orderBy("id").as(Encoders.bean(SimpleRecord.class)).collectAsList(); + Assert.assertEquals( + "Previous snapshot rows should match", firstBatchRecords, previousSnapshotRecords); } @Test @@ -132,11 +121,9 @@ public void testSnapshotSelectionByTimestamp() throws IOException { Table table = tables.create(SCHEMA, spec, tableLocation); // produce the first snapshot - List firstBatchRecords = Lists.newArrayList( - new SimpleRecord(1, "a"), - new SimpleRecord(2, "b"), - new SimpleRecord(3, "c") - ); + List firstBatchRecords = + Lists.newArrayList( + new SimpleRecord(1, "a"), new SimpleRecord(2, "b"), new SimpleRecord(3, "c")); Dataset firstDf = spark.createDataFrame(firstBatchRecords, SimpleRecord.class); firstDf.select("id", "data").write().format("iceberg").mode("append").save(tableLocation); @@ -144,37 +131,35 @@ public void testSnapshotSelectionByTimestamp() throws IOException { long firstSnapshotTimestamp = System.currentTimeMillis(); // produce the second snapshot - List secondBatchRecords = Lists.newArrayList( - new SimpleRecord(4, "d"), - new SimpleRecord(5, "e"), - new SimpleRecord(6, "f") - ); + List secondBatchRecords = + Lists.newArrayList( + new SimpleRecord(4, "d"), new SimpleRecord(5, "e"), new SimpleRecord(6, "f")); Dataset secondDf = spark.createDataFrame(secondBatchRecords, SimpleRecord.class); secondDf.select("id", "data").write().format("iceberg").mode("append").save(tableLocation); Assert.assertEquals("Expected 2 snapshots", 2, Iterables.size(table.snapshots())); // verify records in the current snapshot - Dataset currentSnapshotResult = spark.read() - .format("iceberg") - .load(tableLocation); - List currentSnapshotRecords = currentSnapshotResult.orderBy("id") - .as(Encoders.bean(SimpleRecord.class)) - .collectAsList(); + Dataset currentSnapshotResult = spark.read().format("iceberg").load(tableLocation); + List currentSnapshotRecords = + currentSnapshotResult.orderBy("id").as(Encoders.bean(SimpleRecord.class)).collectAsList(); List expectedRecords = Lists.newArrayList(); expectedRecords.addAll(firstBatchRecords); expectedRecords.addAll(secondBatchRecords); - Assert.assertEquals("Current snapshot rows should match", expectedRecords, currentSnapshotRecords); + Assert.assertEquals( + "Current snapshot rows should match", expectedRecords, currentSnapshotRecords); // verify records in the previous snapshot - Dataset previousSnapshotResult = spark.read() - .format("iceberg") - .option(SparkReadOptions.AS_OF_TIMESTAMP, firstSnapshotTimestamp) - .load(tableLocation); - List previousSnapshotRecords = previousSnapshotResult.orderBy("id") - .as(Encoders.bean(SimpleRecord.class)) - .collectAsList(); - Assert.assertEquals("Previous snapshot rows should match", firstBatchRecords, previousSnapshotRecords); + Dataset previousSnapshotResult = + spark + .read() + .format("iceberg") + .option(SparkReadOptions.AS_OF_TIMESTAMP, firstSnapshotTimestamp) + .load(tableLocation); + List previousSnapshotRecords = + previousSnapshotResult.orderBy("id").as(Encoders.bean(SimpleRecord.class)).collectAsList(); + Assert.assertEquals( + "Previous snapshot rows should match", firstBatchRecords, previousSnapshotRecords); } @Test @@ -185,12 +170,10 @@ public void testSnapshotSelectionByInvalidSnapshotId() throws IOException { PartitionSpec spec = PartitionSpec.unpartitioned(); tables.create(SCHEMA, spec, tableLocation); - Assertions.assertThatThrownBy(() -> spark.read() - .format("iceberg") - .option("snapshot-id", -10) - .load(tableLocation)) - 
.isInstanceOf(IllegalArgumentException.class) - .hasMessage("Cannot find snapshot with ID -10"); + Assertions.assertThatThrownBy( + () -> spark.read().format("iceberg").option("snapshot-id", -10).load(tableLocation)) + .isInstanceOf(IllegalArgumentException.class) + .hasMessage("Cannot find snapshot with ID -10"); } @Test @@ -202,12 +185,15 @@ public void testSnapshotSelectionByInvalidTimestamp() throws IOException { PartitionSpec spec = PartitionSpec.unpartitioned(); tables.create(SCHEMA, spec, tableLocation); - Assertions.assertThatThrownBy(() -> spark.read() + Assertions.assertThatThrownBy( + () -> + spark + .read() .format("iceberg") .option(SparkReadOptions.AS_OF_TIMESTAMP, timestamp) .load(tableLocation)) - .isInstanceOf(IllegalArgumentException.class) - .hasMessageContaining("Cannot find a snapshot older than"); + .isInstanceOf(IllegalArgumentException.class) + .hasMessageContaining("Cannot find a snapshot older than"); } @Test @@ -218,23 +204,24 @@ public void testSnapshotSelectionBySnapshotIdAndTimestamp() throws IOException { PartitionSpec spec = PartitionSpec.unpartitioned(); Table table = tables.create(SCHEMA, spec, tableLocation); - List firstBatchRecords = Lists.newArrayList( - new SimpleRecord(1, "a"), - new SimpleRecord(2, "b"), - new SimpleRecord(3, "c") - ); + List firstBatchRecords = + Lists.newArrayList( + new SimpleRecord(1, "a"), new SimpleRecord(2, "b"), new SimpleRecord(3, "c")); Dataset firstDf = spark.createDataFrame(firstBatchRecords, SimpleRecord.class); firstDf.select("id", "data").write().format("iceberg").mode("append").save(tableLocation); long timestamp = System.currentTimeMillis(); long snapshotId = table.currentSnapshot().snapshotId(); - Assertions.assertThatThrownBy(() -> spark.read() + Assertions.assertThatThrownBy( + () -> + spark + .read() .format("iceberg") .option(SparkReadOptions.SNAPSHOT_ID, snapshotId) .option(SparkReadOptions.AS_OF_TIMESTAMP, timestamp) .load(tableLocation)) - .isInstanceOf(IllegalArgumentException.class) - .hasMessageContaining("Cannot scan using both snapshot-id and as-of-timestamp"); + .isInstanceOf(IllegalArgumentException.class) + .hasMessageContaining("Cannot scan using both snapshot-id and as-of-timestamp"); } } diff --git a/spark/v2.4/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkAppenderFactory.java b/spark/v2.4/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkAppenderFactory.java index bda525780d8b..3fb2a630fe81 100644 --- a/spark/v2.4/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkAppenderFactory.java +++ b/spark/v2.4/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkAppenderFactory.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.spark.source; import java.util.List; @@ -41,13 +40,13 @@ public TestSparkAppenderFactory(String fileFormat, boolean partitioned) { } @Override - protected FileAppenderFactory createAppenderFactory(List equalityFieldIds, - Schema eqDeleteSchema, - Schema posDeleteRowSchema) { + protected FileAppenderFactory createAppenderFactory( + List equalityFieldIds, Schema eqDeleteSchema, Schema posDeleteRowSchema) { return SparkAppenderFactory.builderFor(table, table.schema(), sparkType) .equalityFieldIds(ArrayUtil.toIntArray(equalityFieldIds)) .eqDeleteRowSchema(eqDeleteSchema) - .posDelRowSchema(posDeleteRowSchema).build(); + .posDelRowSchema(posDeleteRowSchema) + .build(); } @Override diff --git a/spark/v2.4/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkBaseDataReader.java b/spark/v2.4/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkBaseDataReader.java index 870be890da90..6c4239371476 100644 --- a/spark/v2.4/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkBaseDataReader.java +++ b/spark/v2.4/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkBaseDataReader.java @@ -16,9 +16,11 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.source; +import static org.apache.iceberg.FileFormat.PARQUET; +import static org.apache.iceberg.Files.localOutput; + import java.io.File; import java.io.IOException; import java.util.Iterator; @@ -48,13 +50,9 @@ import org.junit.Test; import org.junit.rules.TemporaryFolder; -import static org.apache.iceberg.FileFormat.PARQUET; -import static org.apache.iceberg.Files.localOutput; - public class TestSparkBaseDataReader { - @Rule - public TemporaryFolder temp = new TemporaryFolder(); + @Rule public TemporaryFolder temp = new TemporaryFolder(); private Table table; @@ -127,14 +125,12 @@ public void testClosureOnDataExhaustion() throws IOException { Assert.assertNotNull("Reader should return non-null value", reader.get()); } - Assert.assertEquals("Reader returned incorrect number of records", - totalTasks * recordPerTask, - countRecords - ); - tasks.forEach(t -> - Assert.assertTrue("All iterators should be closed after read exhausion", - reader.isIteratorClosed(t)) - ); + Assert.assertEquals( + "Reader returned incorrect number of records", totalTasks * recordPerTask, countRecords); + tasks.forEach( + t -> + Assert.assertTrue( + "All iterators should be closed after read exhausion", reader.isIteratorClosed(t))); } @Test @@ -150,13 +146,15 @@ public void testClosureDuringIteration() throws IOException { // Total of 2 elements Assert.assertTrue(reader.next()); - Assert.assertFalse("First iter should not be closed on its last element", - reader.isIteratorClosed(firstTask)); + Assert.assertFalse( + "First iter should not be closed on its last element", reader.isIteratorClosed(firstTask)); Assert.assertTrue(reader.next()); - Assert.assertTrue("First iter should be closed after moving to second iter", + Assert.assertTrue( + "First iter should be closed after moving to second iter", reader.isIteratorClosed(firstTask)); - Assert.assertFalse("Second iter should not be closed on its last element", + Assert.assertFalse( + "Second iter should not be closed on its last element", reader.isIteratorClosed(secondTask)); Assert.assertFalse(reader.next()); @@ -174,10 +172,10 @@ public void testClosureWithoutAnyRead() throws IOException { reader.close(); - tasks.forEach(t -> - Assert.assertFalse("Iterator should not be created eagerly for tasks", - 
reader.hasIterator(t)) - ); + tasks.forEach( + t -> + Assert.assertFalse( + "Iterator should not be created eagerly for tasks", reader.hasIterator(t))); } @Test @@ -198,12 +196,13 @@ public void testExplicitClosure() throws IOException { // Some tasks might have not been opened yet, so we don't have corresponding tracker for it. // But all that have been created must be closed. - tasks.forEach(t -> { - if (reader.hasIterator(t)) { - Assert.assertTrue("Iterator should be closed after read exhausion", - reader.isIteratorClosed(t)); - } - }); + tasks.forEach( + t -> { + if (reader.hasIterator(t)) { + Assert.assertTrue( + "Iterator should be closed after read exhausion", reader.isIteratorClosed(t)); + } + }); } @Test @@ -223,26 +222,26 @@ public void testIdempotentExplicitClosure() throws IOException { for (int closeAttempt = 0; closeAttempt < 5; closeAttempt++) { reader.close(); for (int i = 0; i < 5; i++) { - Assert.assertTrue("Iterator should be closed after read exhausion", + Assert.assertTrue( + "Iterator should be closed after read exhausion", reader.isIteratorClosed(tasks.get(i))); } for (int i = 5; i < 10; i++) { - Assert.assertFalse("Iterator should not be created eagerly for tasks", - reader.hasIterator(tasks.get(i))); + Assert.assertFalse( + "Iterator should not be created eagerly for tasks", reader.hasIterator(tasks.get(i))); } } } - private List createFileScanTasks(Integer totalTasks, Integer recordPerTask) throws IOException { + private List createFileScanTasks(Integer totalTasks, Integer recordPerTask) + throws IOException { String desc = "make_scan_tasks"; File parent = temp.newFolder(desc); File location = new File(parent, "test"); File dataFolder = new File(location, "data"); Assert.assertTrue("mkdirs should succeed", dataFolder.mkdirs()); - Schema schema = new Schema( - Types.NestedField.required(0, "id", Types.LongType.get()) - ); + Schema schema = new Schema(Types.NestedField.required(0, "id", Types.LongType.get())); try { this.table = TestTables.create(location, desc, schema, PartitionSpec.unpartitioned()); @@ -254,22 +253,21 @@ private List createFileScanTasks(Integer totalTasks, Integer recor AppendFiles appendFiles = table.newAppend(); for (int i = 0; i < totalTasks; i++) { File parquetFile = new File(dataFolder, PARQUET.addExtension(UUID.randomUUID().toString())); - try (FileAppender writer = Parquet.write(localOutput(parquetFile)) - .schema(tableSchema) - .build()) { + try (FileAppender writer = + Parquet.write(localOutput(parquetFile)).schema(tableSchema).build()) { writer.addAll(expected); } - DataFile file = DataFiles.builder(PartitionSpec.unpartitioned()) - .withFileSizeInBytes(parquetFile.length()) - .withPath(parquetFile.toString()) - .withRecordCount(recordPerTask) - .build(); + DataFile file = + DataFiles.builder(PartitionSpec.unpartitioned()) + .withFileSizeInBytes(parquetFile.length()) + .withPath(parquetFile.toString()) + .withRecordCount(recordPerTask) + .build(); appendFiles.appendFile(file); } appendFiles.commit(); - return StreamSupport - .stream(table.newScan().planFiles().spliterator(), false) + return StreamSupport.stream(table.newScan().planFiles().spliterator(), false) .collect(Collectors.toList()); } finally { TestTables.clearTables(); diff --git a/spark/v2.4/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkDataFile.java b/spark/v2.4/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkDataFile.java index f6ca5f7de9f4..80876ceb254c 100644 --- 
a/spark/v2.4/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkDataFile.java +++ b/spark/v2.4/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkDataFile.java @@ -16,9 +16,11 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.source; +import static org.apache.iceberg.types.Types.NestedField.optional; +import static org.apache.iceberg.types.Types.NestedField.required; + import java.io.File; import java.io.IOException; import java.util.Arrays; @@ -62,43 +64,42 @@ import org.junit.Test; import org.junit.rules.TemporaryFolder; -import static org.apache.iceberg.types.Types.NestedField.optional; -import static org.apache.iceberg.types.Types.NestedField.required; - public class TestSparkDataFile { private static final HadoopTables TABLES = new HadoopTables(new Configuration()); - private static final Schema SCHEMA = new Schema( - required(100, "id", Types.LongType.get()), - optional(101, "data", Types.StringType.get()), - required(102, "b", Types.BooleanType.get()), - optional(103, "i", Types.IntegerType.get()), - required(104, "l", Types.LongType.get()), - optional(105, "f", Types.FloatType.get()), - required(106, "d", Types.DoubleType.get()), - optional(107, "date", Types.DateType.get()), - required(108, "ts", Types.TimestampType.withZone()), - required(110, "s", Types.StringType.get()), - optional(113, "bytes", Types.BinaryType.get()), - required(114, "dec_9_0", Types.DecimalType.of(9, 0)), - required(115, "dec_11_2", Types.DecimalType.of(11, 2)), - required(116, "dec_38_10", Types.DecimalType.of(38, 10)) // maximum precision - ); - private static final PartitionSpec SPEC = PartitionSpec.builderFor(SCHEMA) - .identity("b") - .bucket("i", 2) - .identity("l") - .identity("f") - .identity("d") - .identity("date") - .hour("ts") - .identity("ts") - .truncate("s", 2) - .identity("bytes") - .bucket("dec_9_0", 2) - .bucket("dec_11_2", 2) - .bucket("dec_38_10", 2) - .build(); + private static final Schema SCHEMA = + new Schema( + required(100, "id", Types.LongType.get()), + optional(101, "data", Types.StringType.get()), + required(102, "b", Types.BooleanType.get()), + optional(103, "i", Types.IntegerType.get()), + required(104, "l", Types.LongType.get()), + optional(105, "f", Types.FloatType.get()), + required(106, "d", Types.DoubleType.get()), + optional(107, "date", Types.DateType.get()), + required(108, "ts", Types.TimestampType.withZone()), + required(110, "s", Types.StringType.get()), + optional(113, "bytes", Types.BinaryType.get()), + required(114, "dec_9_0", Types.DecimalType.of(9, 0)), + required(115, "dec_11_2", Types.DecimalType.of(11, 2)), + required(116, "dec_38_10", Types.DecimalType.of(38, 10)) // maximum precision + ); + private static final PartitionSpec SPEC = + PartitionSpec.builderFor(SCHEMA) + .identity("b") + .bucket("i", 2) + .identity("l") + .identity("f") + .identity("d") + .identity("date") + .hour("ts") + .identity("ts") + .truncate("s", 2) + .identity("bytes") + .bucket("dec_9_0", 2) + .bucket("dec_11_2", 2) + .bucket("dec_38_10", 2) + .build(); private static SparkSession spark; private static JavaSparkContext sparkContext = null; @@ -117,8 +118,7 @@ public static void stopSpark() { currentSpark.stop(); } - @Rule - public TemporaryFolder temp = new TemporaryFolder(); + @Rule public TemporaryFolder temp = new TemporaryFolder(); private String tableLocation = null; @Before @@ -129,7 +129,8 @@ public void setupTableLocation() throws Exception { @Test public void testValueConversion() 
throws IOException { - Table table = TABLES.create(SCHEMA, PartitionSpec.unpartitioned(), Maps.newHashMap(), tableLocation); + Table table = + TABLES.create(SCHEMA, PartitionSpec.unpartitioned(), Maps.newHashMap(), tableLocation); checkSparkDataFile(table); } @@ -150,7 +151,9 @@ public void testValueConversionWithEmptyStats() throws IOException { private void checkSparkDataFile(Table table) throws IOException { Iterable rows = RandomData.generateSpark(table.schema(), 200, 0); JavaRDD rdd = sparkContext.parallelize(Lists.newArrayList(rows)); - Dataset df = spark.internalCreateDataFrame(JavaRDD.toRDD(rdd), SparkSchemaUtil.convert(table.schema()), false); + Dataset df = + spark.internalCreateDataFrame( + JavaRDD.toRDD(rdd), SparkSchemaUtil.convert(table.schema()), false); df.write().format("iceberg").mode("append").save(tableLocation); @@ -170,16 +173,15 @@ private void checkSparkDataFile(Table table) throws IOException { Dataset dataFileDF = spark.read().format("iceberg").load(tableLocation + "#files"); // reorder columns to test arbitrary projections - List columns = Arrays.stream(dataFileDF.columns()) - .map(ColumnName::new) - .collect(Collectors.toList()); + List columns = + Arrays.stream(dataFileDF.columns()).map(ColumnName::new).collect(Collectors.toList()); Collections.shuffle(columns); - List sparkDataFiles = dataFileDF - .select(Iterables.toArray(columns, Column.class)) - .collectAsList(); + List sparkDataFiles = + dataFileDF.select(Iterables.toArray(columns, Column.class)).collectAsList(); - Assert.assertEquals("The number of files should match", dataFiles.size(), sparkDataFiles.size()); + Assert.assertEquals( + "The number of files should match", dataFiles.size(), sparkDataFiles.size()); Types.StructType dataFileType = DataFile.getType(table.spec().partitionType()); StructType sparkDataFileType = sparkDataFiles.get(0).schema(); @@ -195,9 +197,14 @@ private void checkDataFile(DataFile expected, DataFile actual) { Assert.assertEquals("Format must match", expected.format(), actual.format()); Assert.assertEquals("Record count must match", expected.recordCount(), actual.recordCount()); Assert.assertEquals("Size must match", expected.fileSizeInBytes(), actual.fileSizeInBytes()); - Assert.assertEquals("Record value counts must match", expected.valueCounts(), actual.valueCounts()); - Assert.assertEquals("Record null value counts must match", expected.nullValueCounts(), actual.nullValueCounts()); - Assert.assertEquals("Record nan value counts must match", expected.nanValueCounts(), actual.nanValueCounts()); + Assert.assertEquals( + "Record value counts must match", expected.valueCounts(), actual.valueCounts()); + Assert.assertEquals( + "Record null value counts must match", + expected.nullValueCounts(), + actual.nullValueCounts()); + Assert.assertEquals( + "Record nan value counts must match", expected.nanValueCounts(), actual.nanValueCounts()); Assert.assertEquals("Lower bounds must match", expected.lowerBounds(), actual.lowerBounds()); Assert.assertEquals("Upper bounds must match", expected.upperBounds(), actual.upperBounds()); Assert.assertEquals("Key metadata must match", expected.keyMetadata(), actual.keyMetadata()); @@ -210,7 +217,8 @@ private void checkDataFile(DataFile expected, DataFile actual) { private void checkStructLike(StructLike expected, StructLike actual) { Assert.assertEquals("Struct size should match", expected.size(), actual.size()); for (int i = 0; i < expected.size(); i++) { - Assert.assertEquals("Struct values must match", expected.get(i, Object.class), actual.get(i, 
Object.class)); + Assert.assertEquals( + "Struct values must match", expected.get(i, Object.class), actual.get(i, Object.class)); } } } diff --git a/spark/v2.4/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkDataWrite.java b/spark/v2.4/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkDataWrite.java index e7a6d16232aa..ed265c1689fe 100644 --- a/spark/v2.4/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkDataWrite.java +++ b/spark/v2.4/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkDataWrite.java @@ -16,9 +16,14 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.source; +import static org.apache.iceberg.TableProperties.SPARK_WRITE_PARTITIONED_FANOUT_ENABLED; +import static org.apache.iceberg.types.Types.NestedField.optional; +import static org.mockito.Mockito.doAnswer; +import static org.mockito.Mockito.spy; +import static org.mockito.Mockito.when; + import java.io.File; import java.io.IOException; import java.util.List; @@ -56,28 +61,20 @@ import org.junit.runner.RunWith; import org.junit.runners.Parameterized; -import static org.apache.iceberg.TableProperties.SPARK_WRITE_PARTITIONED_FANOUT_ENABLED; -import static org.apache.iceberg.types.Types.NestedField.optional; -import static org.mockito.Mockito.doAnswer; -import static org.mockito.Mockito.spy; -import static org.mockito.Mockito.when; - @RunWith(Parameterized.class) public class TestSparkDataWrite { private static final Configuration CONF = new Configuration(); private final FileFormat format; private static SparkSession spark = null; - private static final Schema SCHEMA = new Schema( - optional(1, "id", Types.IntegerType.get()), - optional(2, "data", Types.StringType.get()) - ); + private static final Schema SCHEMA = + new Schema( + optional(1, "id", Types.IntegerType.get()), optional(2, "data", Types.StringType.get())); - @Rule - public TemporaryFolder temp = new TemporaryFolder(); + @Rule public TemporaryFolder temp = new TemporaryFolder(); @Parameterized.Parameters(name = "format = {0}") public static Object[] parameters() { - return new Object[] { "parquet", "avro", "orc" }; + return new Object[] {"parquet", "avro", "orc"}; } @BeforeClass @@ -110,15 +107,14 @@ public void testBasicWrite() throws IOException { PartitionSpec spec = PartitionSpec.builderFor(SCHEMA).identity("data").build(); Table table = tables.create(SCHEMA, spec, location.toString()); - List expected = Lists.newArrayList( - new SimpleRecord(1, "a"), - new SimpleRecord(2, "b"), - new SimpleRecord(3, "c") - ); + List expected = + Lists.newArrayList( + new SimpleRecord(1, "a"), new SimpleRecord(2, "b"), new SimpleRecord(3, "c")); Dataset df = spark.createDataFrame(expected, SimpleRecord.class); // TODO: incoming columns must be ordered according to the table's schema - df.select("id", "data").write() + df.select("id", "data") + .write() .format("iceberg") .option(SparkWriteOptions.WRITE_FORMAT, format.toString()) .mode(SaveMode.Append) @@ -126,11 +122,10 @@ public void testBasicWrite() throws IOException { table.refresh(); - Dataset result = spark.read() - .format("iceberg") - .load(location.toString()); + Dataset result = spark.read().format("iceberg").load(location.toString()); - List actual = result.orderBy("id").as(Encoders.bean(SimpleRecord.class)).collectAsList(); + List actual = + result.orderBy("id").as(Encoders.bean(SimpleRecord.class)).collectAsList(); Assert.assertEquals("Number of rows should match", expected.size(), actual.size()); 
Assert.assertEquals("Result rows should match", expected, actual); for (ManifestFile manifest : table.currentSnapshot().allManifests()) { @@ -161,30 +156,31 @@ public void testAppend() throws IOException { PartitionSpec spec = PartitionSpec.builderFor(SCHEMA).identity("data").build(); Table table = tables.create(SCHEMA, spec, location.toString()); - List records = Lists.newArrayList( - new SimpleRecord(1, "a"), - new SimpleRecord(2, "b"), - new SimpleRecord(3, "c") - ); - - List expected = Lists.newArrayList( - new SimpleRecord(1, "a"), - new SimpleRecord(2, "b"), - new SimpleRecord(3, "c"), - new SimpleRecord(4, "a"), - new SimpleRecord(5, "b"), - new SimpleRecord(6, "c") - ); + List records = + Lists.newArrayList( + new SimpleRecord(1, "a"), new SimpleRecord(2, "b"), new SimpleRecord(3, "c")); + + List expected = + Lists.newArrayList( + new SimpleRecord(1, "a"), + new SimpleRecord(2, "b"), + new SimpleRecord(3, "c"), + new SimpleRecord(4, "a"), + new SimpleRecord(5, "b"), + new SimpleRecord(6, "c")); Dataset df = spark.createDataFrame(records, SimpleRecord.class); - df.select("id", "data").write() + df.select("id", "data") + .write() .format("iceberg") .option(SparkWriteOptions.WRITE_FORMAT, format.toString()) .mode(SaveMode.Append) .save(location.toString()); - df.withColumn("id", df.col("id").plus(3)).select("id", "data").write() + df.withColumn("id", df.col("id").plus(3)) + .select("id", "data") + .write() .format("iceberg") .option(SparkWriteOptions.WRITE_FORMAT, format.toString()) .mode(SaveMode.Append) @@ -192,11 +188,10 @@ public void testAppend() throws IOException { table.refresh(); - Dataset result = spark.read() - .format("iceberg") - .load(location.toString()); + Dataset result = spark.read().format("iceberg").load(location.toString()); - List actual = result.orderBy("id").as(Encoders.bean(SimpleRecord.class)).collectAsList(); + List actual = + result.orderBy("id").as(Encoders.bean(SimpleRecord.class)).collectAsList(); Assert.assertEquals("Number of rows should match", expected.size(), actual.size()); Assert.assertEquals("Result rows should match", expected, actual); } @@ -210,23 +205,24 @@ public void testEmptyOverwrite() throws IOException { PartitionSpec spec = PartitionSpec.builderFor(SCHEMA).identity("id").build(); Table table = tables.create(SCHEMA, spec, location.toString()); - List records = Lists.newArrayList( - new SimpleRecord(1, "a"), - new SimpleRecord(2, "b"), - new SimpleRecord(3, "c") - ); + List records = + Lists.newArrayList( + new SimpleRecord(1, "a"), new SimpleRecord(2, "b"), new SimpleRecord(3, "c")); List expected = records; Dataset df = spark.createDataFrame(records, SimpleRecord.class); - df.select("id", "data").write() + df.select("id", "data") + .write() .format("iceberg") .option(SparkWriteOptions.WRITE_FORMAT, format.toString()) .mode(SaveMode.Append) .save(location.toString()); Dataset empty = spark.createDataFrame(ImmutableList.of(), SimpleRecord.class); - empty.select("id", "data").write() + empty + .select("id", "data") + .write() .format("iceberg") .option(SparkWriteOptions.WRITE_FORMAT, format.toString()) .mode(SaveMode.Overwrite) @@ -235,11 +231,10 @@ public void testEmptyOverwrite() throws IOException { table.refresh(); - Dataset result = spark.read() - .format("iceberg") - .load(location.toString()); + Dataset result = spark.read().format("iceberg").load(location.toString()); - List actual = result.orderBy("id").as(Encoders.bean(SimpleRecord.class)).collectAsList(); + List actual = + 
result.orderBy("id").as(Encoders.bean(SimpleRecord.class)).collectAsList(); Assert.assertEquals("Number of rows should match", expected.size(), actual.size()); Assert.assertEquals("Result rows should match", expected, actual); } @@ -253,30 +248,31 @@ public void testOverwrite() throws IOException { PartitionSpec spec = PartitionSpec.builderFor(SCHEMA).identity("id").build(); Table table = tables.create(SCHEMA, spec, location.toString()); - List records = Lists.newArrayList( - new SimpleRecord(1, "a"), - new SimpleRecord(2, "b"), - new SimpleRecord(3, "c") - ); + List records = + Lists.newArrayList( + new SimpleRecord(1, "a"), new SimpleRecord(2, "b"), new SimpleRecord(3, "c")); - List expected = Lists.newArrayList( - new SimpleRecord(1, "a"), - new SimpleRecord(2, "a"), - new SimpleRecord(3, "c"), - new SimpleRecord(4, "b"), - new SimpleRecord(6, "c") - ); + List expected = + Lists.newArrayList( + new SimpleRecord(1, "a"), + new SimpleRecord(2, "a"), + new SimpleRecord(3, "c"), + new SimpleRecord(4, "b"), + new SimpleRecord(6, "c")); Dataset df = spark.createDataFrame(records, SimpleRecord.class); - df.select("id", "data").write() + df.select("id", "data") + .write() .format("iceberg") .option(SparkWriteOptions.WRITE_FORMAT, format.toString()) .mode(SaveMode.Append) .save(location.toString()); // overwrite with 2*id to replace record 2, append 4 and 6 - df.withColumn("id", df.col("id").multiply(2)).select("id", "data").write() + df.withColumn("id", df.col("id").multiply(2)) + .select("id", "data") + .write() .format("iceberg") .option(SparkWriteOptions.WRITE_FORMAT, format.toString()) .mode(SaveMode.Overwrite) @@ -285,11 +281,10 @@ public void testOverwrite() throws IOException { table.refresh(); - Dataset result = spark.read() - .format("iceberg") - .load(location.toString()); + Dataset result = spark.read().format("iceberg").load(location.toString()); - List actual = result.orderBy("id").as(Encoders.bean(SimpleRecord.class)).collectAsList(); + List actual = + result.orderBy("id").as(Encoders.bean(SimpleRecord.class)).collectAsList(); Assert.assertEquals("Number of rows should match", expected.size(), actual.size()); Assert.assertEquals("Result rows should match", expected, actual); } @@ -303,22 +298,22 @@ public void testUnpartitionedOverwrite() throws IOException { PartitionSpec spec = PartitionSpec.unpartitioned(); Table table = tables.create(SCHEMA, spec, location.toString()); - List expected = Lists.newArrayList( - new SimpleRecord(1, "a"), - new SimpleRecord(2, "b"), - new SimpleRecord(3, "c") - ); + List expected = + Lists.newArrayList( + new SimpleRecord(1, "a"), new SimpleRecord(2, "b"), new SimpleRecord(3, "c")); Dataset df = spark.createDataFrame(expected, SimpleRecord.class); - df.select("id", "data").write() + df.select("id", "data") + .write() .format("iceberg") .option(SparkWriteOptions.WRITE_FORMAT, format.toString()) .mode(SaveMode.Append) .save(location.toString()); // overwrite with the same data; should not produce two copies - df.select("id", "data").write() + df.select("id", "data") + .write() .format("iceberg") .option(SparkWriteOptions.WRITE_FORMAT, format.toString()) .mode(SaveMode.Overwrite) @@ -326,11 +321,10 @@ public void testUnpartitionedOverwrite() throws IOException { table.refresh(); - Dataset result = spark.read() - .format("iceberg") - .load(location.toString()); + Dataset result = spark.read().format("iceberg").load(location.toString()); - List actual = result.orderBy("id").as(Encoders.bean(SimpleRecord.class)).collectAsList(); + List actual = + 
result.orderBy("id").as(Encoders.bean(SimpleRecord.class)).collectAsList(); Assert.assertEquals("Number of rows should match", expected.size(), actual.size()); Assert.assertEquals("Result rows should match", expected, actual); } @@ -344,7 +338,8 @@ public void testUnpartitionedCreateWithTargetFileSizeViaTableProperties() throws PartitionSpec spec = PartitionSpec.unpartitioned(); Table table = tables.create(SCHEMA, spec, location.toString()); - table.updateProperties() + table + .updateProperties() .set(TableProperties.WRITE_TARGET_FILE_SIZE_BYTES, "4") // ~4 bytes; low enough to trigger .commit(); @@ -355,7 +350,8 @@ public void testUnpartitionedCreateWithTargetFileSizeViaTableProperties() throws Dataset df = spark.createDataFrame(expected, SimpleRecord.class); - df.select("id", "data").write() + df.select("id", "data") + .write() .format("iceberg") .option(SparkWriteOptions.WRITE_FORMAT, format.toString()) .mode(SaveMode.Append) @@ -363,11 +359,10 @@ public void testUnpartitionedCreateWithTargetFileSizeViaTableProperties() throws table.refresh(); - Dataset result = spark.read() - .format("iceberg") - .load(location.toString()); + Dataset result = spark.read().format("iceberg").load(location.toString()); - List actual = result.orderBy("id").as(Encoders.bean(SimpleRecord.class)).collectAsList(); + List actual = + result.orderBy("id").as(Encoders.bean(SimpleRecord.class)).collectAsList(); Assert.assertEquals("Number of rows should match", expected.size(), actual.size()); Assert.assertEquals("Result rows should match", expected, actual); @@ -379,7 +374,8 @@ public void testUnpartitionedCreateWithTargetFileSizeViaTableProperties() throws } Assert.assertEquals("Should have 4 DataFiles", 4, files.size()); - Assert.assertTrue("All DataFiles contain 1000 rows", files.stream().allMatch(d -> d.recordCount() == 1000)); + Assert.assertTrue( + "All DataFiles contain 1000 rows", files.stream().allMatch(d -> d.recordCount() == 1000)); } @Test @@ -410,15 +406,14 @@ public void testWriteProjection() throws IOException { PartitionSpec spec = PartitionSpec.unpartitioned(); Table table = tables.create(SCHEMA, spec, location.toString()); - List expected = Lists.newArrayList( - new SimpleRecord(1, null), - new SimpleRecord(2, null), - new SimpleRecord(3, null) - ); + List expected = + Lists.newArrayList( + new SimpleRecord(1, null), new SimpleRecord(2, null), new SimpleRecord(3, null)); Dataset df = spark.createDataFrame(expected, SimpleRecord.class); - df.select("id").write() // select only id column + df.select("id") + .write() // select only id column .format("iceberg") .option(SparkWriteOptions.WRITE_FORMAT, format.toString()) .mode(SaveMode.Append) @@ -426,11 +421,10 @@ public void testWriteProjection() throws IOException { table.refresh(); - Dataset result = spark.read() - .format("iceberg") - .load(location.toString()); + Dataset result = spark.read().format("iceberg").load(location.toString()); - List actual = result.orderBy("id").as(Encoders.bean(SimpleRecord.class)).collectAsList(); + List actual = + result.orderBy("id").as(Encoders.bean(SimpleRecord.class)).collectAsList(); Assert.assertEquals("Number of rows should match", expected.size(), actual.size()); Assert.assertEquals("Result rows should match", expected, actual); } @@ -446,22 +440,23 @@ public void testWriteProjectionWithMiddle() throws IOException { HadoopTables tables = new HadoopTables(CONF); PartitionSpec spec = PartitionSpec.unpartitioned(); - Schema schema = new Schema( - optional(1, "c1", Types.IntegerType.get()), - optional(2, "c2", 
Types.StringType.get()), - optional(3, "c3", Types.StringType.get()) - ); + Schema schema = + new Schema( + optional(1, "c1", Types.IntegerType.get()), + optional(2, "c2", Types.StringType.get()), + optional(3, "c3", Types.StringType.get())); Table table = tables.create(schema, spec, location.toString()); - List expected = Lists.newArrayList( - new ThreeColumnRecord(1, null, "hello"), - new ThreeColumnRecord(2, null, "world"), - new ThreeColumnRecord(3, null, null) - ); + List expected = + Lists.newArrayList( + new ThreeColumnRecord(1, null, "hello"), + new ThreeColumnRecord(2, null, "world"), + new ThreeColumnRecord(3, null, null)); Dataset df = spark.createDataFrame(expected, ThreeColumnRecord.class); - df.select("c1", "c3").write() + df.select("c1", "c3") + .write() .format("iceberg") .option(SparkWriteOptions.WRITE_FORMAT, format.toString()) .mode(SaveMode.Append) @@ -469,11 +464,10 @@ public void testWriteProjectionWithMiddle() throws IOException { table.refresh(); - Dataset result = spark.read() - .format("iceberg") - .load(location.toString()); + Dataset result = spark.read().format("iceberg").load(location.toString()); - List actual = result.orderBy("c1").as(Encoders.bean(ThreeColumnRecord.class)).collectAsList(); + List actual = + result.orderBy("c1").as(Encoders.bean(ThreeColumnRecord.class)).collectAsList(); Assert.assertEquals("Number of rows should match", expected.size(), actual.size()); Assert.assertEquals("Result rows should match", expected, actual); } @@ -487,44 +481,39 @@ public void testViewsReturnRecentResults() throws IOException { PartitionSpec spec = PartitionSpec.builderFor(SCHEMA).identity("data").build(); tables.create(SCHEMA, spec, location.toString()); - List records = Lists.newArrayList( - new SimpleRecord(1, "a"), - new SimpleRecord(2, "b"), - new SimpleRecord(3, "c") - ); + List records = + Lists.newArrayList( + new SimpleRecord(1, "a"), new SimpleRecord(2, "b"), new SimpleRecord(3, "c")); Dataset df = spark.createDataFrame(records, SimpleRecord.class); - df.select("id", "data").write() + df.select("id", "data") + .write() .format("iceberg") .option(SparkWriteOptions.WRITE_FORMAT, format.toString()) .mode(SaveMode.Append) .save(location.toString()); - Dataset query = spark.read() - .format("iceberg") - .load(location.toString()) - .where("id = 1"); + Dataset query = spark.read().format("iceberg").load(location.toString()).where("id = 1"); query.createOrReplaceTempView("tmp"); - List actual1 = spark.table("tmp").as(Encoders.bean(SimpleRecord.class)).collectAsList(); - List expected1 = Lists.newArrayList( - new SimpleRecord(1, "a") - ); + List actual1 = + spark.table("tmp").as(Encoders.bean(SimpleRecord.class)).collectAsList(); + List expected1 = Lists.newArrayList(new SimpleRecord(1, "a")); Assert.assertEquals("Number of rows should match", expected1.size(), actual1.size()); Assert.assertEquals("Result rows should match", expected1, actual1); - df.select("id", "data").write() + df.select("id", "data") + .write() .format("iceberg") .option(SparkWriteOptions.WRITE_FORMAT, format.toString()) .mode(SaveMode.Append) .save(location.toString()); - List actual2 = spark.table("tmp").as(Encoders.bean(SimpleRecord.class)).collectAsList(); - List expected2 = Lists.newArrayList( - new SimpleRecord(1, "a"), - new SimpleRecord(1, "a") - ); + List actual2 = + spark.table("tmp").as(Encoders.bean(SimpleRecord.class)).collectAsList(); + List expected2 = + Lists.newArrayList(new SimpleRecord(1, "a"), new SimpleRecord(1, "a")); Assert.assertEquals("Number of rows should match", 
expected2.size(), actual2.size()); Assert.assertEquals("Result rows should match", expected2, actual2); } @@ -550,7 +539,9 @@ public void partitionedCreateWithTargetFileSizeViaOption(IcebergOptionsType opti switch (option) { case NONE: - df.select("id", "data").sort("data").write() + df.select("id", "data") + .sort("data") + .write() .format("iceberg") .option(SparkWriteOptions.WRITE_FORMAT, format.toString()) .mode(SaveMode.Append) @@ -559,7 +550,8 @@ public void partitionedCreateWithTargetFileSizeViaOption(IcebergOptionsType opti break; case TABLE: table.updateProperties().set(SPARK_WRITE_PARTITIONED_FANOUT_ENABLED, "true").commit(); - df.select("id", "data").write() + df.select("id", "data") + .write() .format("iceberg") .option(SparkWriteOptions.WRITE_FORMAT, format.toString()) .mode(SaveMode.Append) @@ -567,7 +559,8 @@ public void partitionedCreateWithTargetFileSizeViaOption(IcebergOptionsType opti .save(location.toString()); break; case JOB: - df.select("id", "data").write() + df.select("id", "data") + .write() .format("iceberg") .option(SparkWriteOptions.WRITE_FORMAT, format.toString()) .mode(SaveMode.Append) @@ -581,11 +574,10 @@ public void partitionedCreateWithTargetFileSizeViaOption(IcebergOptionsType opti table.refresh(); - Dataset result = spark.read() - .format("iceberg") - .load(location.toString()); + Dataset result = spark.read().format("iceberg").load(location.toString()); - List actual = result.orderBy("id").as(Encoders.bean(SimpleRecord.class)).collectAsList(); + List actual = + result.orderBy("id").as(Encoders.bean(SimpleRecord.class)).collectAsList(); Assert.assertEquals("Number of rows should match", expected.size(), actual.size()); Assert.assertEquals("Result rows should match", expected, actual); @@ -597,7 +589,8 @@ public void partitionedCreateWithTargetFileSizeViaOption(IcebergOptionsType opti } Assert.assertEquals("Should have 8 DataFiles", 8, files.size()); - Assert.assertTrue("All DataFiles contain 1000 rows", files.stream().allMatch(d -> d.recordCount() == 1000)); + Assert.assertTrue( + "All DataFiles contain 1000 rows", files.stream().allMatch(d -> d.recordCount() == 1000)); } @Test @@ -609,20 +602,21 @@ public void testCommitUnknownException() throws IOException { PartitionSpec spec = PartitionSpec.builderFor(SCHEMA).identity("data").build(); Table table = tables.create(SCHEMA, spec, location.toString()); - List records = Lists.newArrayList( - new SimpleRecord(1, "a"), - new SimpleRecord(2, "b"), - new SimpleRecord(3, "c") - ); + List records = + Lists.newArrayList( + new SimpleRecord(1, "a"), new SimpleRecord(2, "b"), new SimpleRecord(3, "c")); Dataset df = spark.createDataFrame(records, SimpleRecord.class); AppendFiles append = table.newFastAppend(); AppendFiles spyAppend = spy(append); - doAnswer(invocation -> { - append.commit(); - throw new CommitStateUnknownException(new RuntimeException("Datacenter on Fire")); - }).when(spyAppend).commit(); + doAnswer( + invocation -> { + append.commit(); + throw new CommitStateUnknownException(new RuntimeException("Datacenter on Fire")); + }) + .when(spyAppend) + .commit(); Table spyTable = spy(table); when(spyTable.newAppend()).thenReturn(spyAppend); @@ -631,20 +625,25 @@ public void testCommitUnknownException() throws IOException { ManualSource.setTable(manualTableName, spyTable); // Although an exception is thrown here, write and commit have succeeded - AssertHelpers.assertThrowsWithCause("Should throw a Commit State Unknown Exception", + AssertHelpers.assertThrowsWithCause( + "Should throw a Commit State Unknown 
Exception", SparkException.class, "Writing job aborted", CommitStateUnknownException.class, "Datacenter on Fire", - () -> df.select("id", "data").sort("data").write() - .format("org.apache.iceberg.spark.source.ManualSource") - .option(ManualSource.TABLE_NAME, manualTableName) - .mode(SaveMode.Append) - .save(location.toString())); + () -> + df.select("id", "data") + .sort("data") + .write() + .format("org.apache.iceberg.spark.source.ManualSource") + .option(ManualSource.TABLE_NAME, manualTableName) + .mode(SaveMode.Append) + .save(location.toString())); // Since write and commit succeeded, the rows should be readable Dataset result = spark.read().format("iceberg").load(location.toString()); - List actual = result.orderBy("id").as(Encoders.bean(SimpleRecord.class)).collectAsList(); + List actual = + result.orderBy("id").as(Encoders.bean(SimpleRecord.class)).collectAsList(); Assert.assertEquals("Number of rows should match", records.size(), actual.size()); Assert.assertEquals("Result rows should match", records, actual); } diff --git a/spark/v2.4/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkFileWriterFactory.java b/spark/v2.4/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkFileWriterFactory.java index 702e8ab98990..4a3263e368c0 100644 --- a/spark/v2.4/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkFileWriterFactory.java +++ b/spark/v2.4/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkFileWriterFactory.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.source; import java.util.List; @@ -39,9 +38,11 @@ public TestSparkFileWriterFactory(FileFormat fileFormat, boolean partitioned) { } @Override - protected FileWriterFactory newWriterFactory(Schema dataSchema, List equalityFieldIds, - Schema equalityDeleteRowSchema, - Schema positionDeleteRowSchema) { + protected FileWriterFactory newWriterFactory( + Schema dataSchema, + List equalityFieldIds, + Schema equalityDeleteRowSchema, + Schema positionDeleteRowSchema) { return SparkFileWriterFactory.builderFor(table) .dataSchema(table.schema()) .dataFileFormat(format()) diff --git a/spark/v2.4/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkMergingMetrics.java b/spark/v2.4/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkMergingMetrics.java index be74d1c5a33b..c3bb35ca7df8 100644 --- a/spark/v2.4/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkMergingMetrics.java +++ b/spark/v2.4/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkMergingMetrics.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.spark.source; import java.io.IOException; @@ -42,26 +41,32 @@ public TestSparkMergingMetrics(FileFormat fileFormat) { @Override protected FileAppender writeAndGetAppender(List records) throws IOException { - Table testTable = new BaseTable(null, "dummy") { - @Override - public Map properties() { - return Collections.emptyMap(); - } - @Override - public SortOrder sortOrder() { - return SortOrder.unsorted(); - } - @Override - public PartitionSpec spec() { - return PartitionSpec.unpartitioned(); - } - }; + Table testTable = + new BaseTable(null, "dummy") { + @Override + public Map properties() { + return Collections.emptyMap(); + } + + @Override + public SortOrder sortOrder() { + return SortOrder.unsorted(); + } + + @Override + public PartitionSpec spec() { + return PartitionSpec.unpartitioned(); + } + }; FileAppender appender = - SparkAppenderFactory.builderFor(testTable, SCHEMA, SparkSchemaUtil.convert(SCHEMA)).build() + SparkAppenderFactory.builderFor(testTable, SCHEMA, SparkSchemaUtil.convert(SCHEMA)) + .build() .newAppender(org.apache.iceberg.Files.localOutput(temp.newFile()), fileFormat); try (FileAppender fileAppender = appender) { - records.stream().map(r -> new StructInternalRow(SCHEMA.asStruct()).setStruct(r)).forEach(fileAppender::add); + records.stream() + .map(r -> new StructInternalRow(SCHEMA.asStruct()).setStruct(r)) + .forEach(fileAppender::add); } return appender; } diff --git a/spark/v2.4/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkPartitioningWriters.java b/spark/v2.4/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkPartitioningWriters.java index 4d07cfbe86ea..276d8c632fc0 100644 --- a/spark/v2.4/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkPartitioningWriters.java +++ b/spark/v2.4/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkPartitioningWriters.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.source; import java.util.List; @@ -39,9 +38,11 @@ public TestSparkPartitioningWriters(FileFormat fileFormat) { } @Override - protected FileWriterFactory newWriterFactory(Schema dataSchema, List equalityFieldIds, - Schema equalityDeleteRowSchema, - Schema positionDeleteRowSchema) { + protected FileWriterFactory newWriterFactory( + Schema dataSchema, + List equalityFieldIds, + Schema equalityDeleteRowSchema, + Schema positionDeleteRowSchema) { return SparkFileWriterFactory.builderFor(table) .dataSchema(table.schema()) .dataFileFormat(format()) diff --git a/spark/v2.4/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkPositionDeltaWriters.java b/spark/v2.4/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkPositionDeltaWriters.java index 480448e13a8f..245c392774f5 100644 --- a/spark/v2.4/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkPositionDeltaWriters.java +++ b/spark/v2.4/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkPositionDeltaWriters.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.spark.source; import java.util.List; @@ -39,9 +38,11 @@ public TestSparkPositionDeltaWriters(FileFormat fileFormat) { } @Override - protected FileWriterFactory newWriterFactory(Schema dataSchema, List equalityFieldIds, - Schema equalityDeleteRowSchema, - Schema positionDeleteRowSchema) { + protected FileWriterFactory newWriterFactory( + Schema dataSchema, + List equalityFieldIds, + Schema equalityDeleteRowSchema, + Schema positionDeleteRowSchema) { return SparkFileWriterFactory.builderFor(table) .dataSchema(table.schema()) .dataFileFormat(format()) diff --git a/spark/v2.4/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkReadProjection.java b/spark/v2.4/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkReadProjection.java index f42b48d0e30d..7d6f0e76f78f 100644 --- a/spark/v2.4/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkReadProjection.java +++ b/spark/v2.4/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkReadProjection.java @@ -16,9 +16,12 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.source; +import static org.apache.iceberg.Files.localOutput; +import static org.apache.iceberg.types.Types.NestedField.optional; +import static org.apache.iceberg.types.Types.NestedField.required; + import java.io.File; import java.io.IOException; import java.util.List; @@ -51,10 +54,6 @@ import org.junit.runner.RunWith; import org.junit.runners.Parameterized; -import static org.apache.iceberg.Files.localOutput; -import static org.apache.iceberg.types.Types.NestedField.optional; -import static org.apache.iceberg.types.Types.NestedField.required; - @RunWith(Parameterized.class) public class TestSparkReadProjection extends TestReadProjection { @@ -63,11 +62,11 @@ public class TestSparkReadProjection extends TestReadProjection { @Parameterized.Parameters(name = "format = {0}, vectorized = {1}") public static Object[][] parameters() { return new Object[][] { - { "parquet", false }, - { "parquet", true }, - { "avro", false }, - { "orc", false }, - { "orc", true } + {"parquet", false}, + {"parquet", true}, + {"avro", false}, + {"orc", false}, + {"orc", true} }; } @@ -83,14 +82,17 @@ public TestSparkReadProjection(String format, boolean vectorized) { @BeforeClass public static void startSpark() { TestSparkReadProjection.spark = SparkSession.builder().master("local[2]").getOrCreate(); - ImmutableMap config = ImmutableMap.of( - "type", "hive", - "default-namespace", "default", - "parquet-enabled", "true", - "cache-enabled", "false" - ); - spark.conf().set("spark.sql.catalog.spark_catalog", "org.apache.iceberg.spark.source.TestSparkCatalog"); - config.forEach((key, value) -> spark.conf().set("spark.sql.catalog.spark_catalog." + key, value)); + ImmutableMap config = + ImmutableMap.of( + "type", "hive", + "default-namespace", "default", + "parquet-enabled", "true", + "cache-enabled", "false"); + spark + .conf() + .set("spark.sql.catalog.spark_catalog", "org.apache.iceberg.spark.source.TestSparkCatalog"); + config.forEach( + (key, value) -> spark.conf().set("spark.sql.catalog.spark_catalog." 
+ key, value)); } @AfterClass @@ -101,8 +103,8 @@ public static void stopSpark() { } @Override - protected Record writeAndRead(String desc, Schema writeSchema, Schema readSchema, - Record record) throws IOException { + protected Record writeAndRead(String desc, Schema writeSchema, Schema readSchema, Record record) + throws IOException { File parent = temp.newFolder(desc); File location = new File(parent, "test"); File dataFolder = new File(location, "data"); @@ -116,16 +118,17 @@ protected Record writeAndRead(String desc, Schema writeSchema, Schema readSchema // When tables are created, the column ids are reassigned. Schema tableSchema = table.schema(); - try (FileAppender writer = new GenericAppenderFactory(tableSchema).newAppender( - localOutput(testFile), format)) { + try (FileAppender writer = + new GenericAppenderFactory(tableSchema).newAppender(localOutput(testFile), format)) { writer.add(record); } - DataFile file = DataFiles.builder(PartitionSpec.unpartitioned()) - .withRecordCount(100) - .withFileSizeInBytes(testFile.length()) - .withPath(testFile.toString()) - .build(); + DataFile file = + DataFiles.builder(PartitionSpec.unpartitioned()) + .withRecordCount(100) + .withFileSizeInBytes(testFile.length()) + .withPath(testFile.toString()) + .build(); table.newAppend().appendFile(file).commit(); @@ -139,14 +142,16 @@ protected Record writeAndRead(String desc, Schema writeSchema, Schema readSchema Schema expectedSchema = reassignIds(readSchema, idMapping); // Set the schema to the expected schema directly to simulate the table schema evolving - TestTables.replaceMetadata(desc, - TestTables.readMetadata(desc).updateSchema(expectedSchema, 100)); + TestTables.replaceMetadata( + desc, TestTables.readMetadata(desc).updateSchema(expectedSchema, 100)); - Dataset df = spark.read() - .format("org.apache.iceberg.spark.source.TestIcebergSource") - .option("iceberg.table.name", desc) - .option(SparkReadOptions.VECTORIZATION_ENABLED, String.valueOf(vectorized)) - .load(); + Dataset df = + spark + .read() + .format("org.apache.iceberg.spark.source.TestIcebergSource") + .option("iceberg.table.name", desc) + .option(SparkReadOptions.VECTORIZATION_ENABLED, String.valueOf(vectorized)) + .load(); return SparkValueConverter.convert(readSchema, df.collectAsList().get(0)); @@ -157,87 +162,98 @@ protected Record writeAndRead(String desc, Schema writeSchema, Schema readSchema private List allIds(Schema schema) { List ids = Lists.newArrayList(); - TypeUtil.visit(schema, new TypeUtil.SchemaVisitor() { - @Override - public Void field(Types.NestedField field, Void fieldResult) { - ids.add(field.fieldId()); - return null; - } + TypeUtil.visit( + schema, + new TypeUtil.SchemaVisitor() { + @Override + public Void field(Types.NestedField field, Void fieldResult) { + ids.add(field.fieldId()); + return null; + } - @Override - public Void list(Types.ListType list, Void elementResult) { - ids.add(list.elementId()); - return null; - } + @Override + public Void list(Types.ListType list, Void elementResult) { + ids.add(list.elementId()); + return null; + } - @Override - public Void map(Types.MapType map, Void keyResult, Void valueResult) { - ids.add(map.keyId()); - ids.add(map.valueId()); - return null; - } - }); + @Override + public Void map(Types.MapType map, Void keyResult, Void valueResult) { + ids.add(map.keyId()); + ids.add(map.valueId()); + return null; + } + }); return ids; } private Schema reassignIds(Schema schema, Map idMapping) { - return new Schema(TypeUtil.visit(schema, new TypeUtil.SchemaVisitor() { - private 
int mapId(int id) { - if (idMapping.containsKey(id)) { - return idMapping.get(id); - } - return 1000 + id; // make sure the new IDs don't conflict with reassignment - } + return new Schema( + TypeUtil.visit( + schema, + new TypeUtil.SchemaVisitor() { + private int mapId(int id) { + if (idMapping.containsKey(id)) { + return idMapping.get(id); + } + return 1000 + id; // make sure the new IDs don't conflict with reassignment + } - @Override - public Type schema(Schema schema, Type structResult) { - return structResult; - } + @Override + public Type schema(Schema schema, Type structResult) { + return structResult; + } - @Override - public Type struct(Types.StructType struct, List fieldResults) { - List newFields = Lists.newArrayListWithExpectedSize(fieldResults.size()); - List fields = struct.fields(); - for (int i = 0; i < fields.size(); i += 1) { - Types.NestedField field = fields.get(i); - if (field.isOptional()) { - newFields.add(optional(mapId(field.fieldId()), field.name(), fieldResults.get(i))); - } else { - newFields.add(required(mapId(field.fieldId()), field.name(), fieldResults.get(i))); - } - } - return Types.StructType.of(newFields); - } + @Override + public Type struct(Types.StructType struct, List fieldResults) { + List newFields = + Lists.newArrayListWithExpectedSize(fieldResults.size()); + List fields = struct.fields(); + for (int i = 0; i < fields.size(); i += 1) { + Types.NestedField field = fields.get(i); + if (field.isOptional()) { + newFields.add( + optional(mapId(field.fieldId()), field.name(), fieldResults.get(i))); + } else { + newFields.add( + required(mapId(field.fieldId()), field.name(), fieldResults.get(i))); + } + } + return Types.StructType.of(newFields); + } - @Override - public Type field(Types.NestedField field, Type fieldResult) { - return fieldResult; - } + @Override + public Type field(Types.NestedField field, Type fieldResult) { + return fieldResult; + } - @Override - public Type list(Types.ListType list, Type elementResult) { - if (list.isElementOptional()) { - return Types.ListType.ofOptional(mapId(list.elementId()), elementResult); - } else { - return Types.ListType.ofRequired(mapId(list.elementId()), elementResult); - } - } + @Override + public Type list(Types.ListType list, Type elementResult) { + if (list.isElementOptional()) { + return Types.ListType.ofOptional(mapId(list.elementId()), elementResult); + } else { + return Types.ListType.ofRequired(mapId(list.elementId()), elementResult); + } + } - @Override - public Type map(Types.MapType map, Type keyResult, Type valueResult) { - if (map.isValueOptional()) { - return Types.MapType.ofOptional( - mapId(map.keyId()), mapId(map.valueId()), keyResult, valueResult); - } else { - return Types.MapType.ofRequired( - mapId(map.keyId()), mapId(map.valueId()), keyResult, valueResult); - } - } + @Override + public Type map(Types.MapType map, Type keyResult, Type valueResult) { + if (map.isValueOptional()) { + return Types.MapType.ofOptional( + mapId(map.keyId()), mapId(map.valueId()), keyResult, valueResult); + } else { + return Types.MapType.ofRequired( + mapId(map.keyId()), mapId(map.valueId()), keyResult, valueResult); + } + } - @Override - public Type primitive(Type.PrimitiveType primitive) { - return primitive; - } - }).asNestedType().asStructType().fields()); + @Override + public Type primitive(Type.PrimitiveType primitive) { + return primitive; + } + }) + .asNestedType() + .asStructType() + .fields()); } } diff --git 
a/spark/v2.4/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkReaderDeletes.java b/spark/v2.4/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkReaderDeletes.java index e543a408e8ce..462f34530725 100644 --- a/spark/v2.4/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkReaderDeletes.java +++ b/spark/v2.4/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkReaderDeletes.java @@ -16,9 +16,10 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.source; +import static org.apache.hadoop.hive.conf.HiveConf.ConfVars.METASTOREURIS; + import java.io.IOException; import java.util.List; import org.apache.hadoop.hive.conf.HiveConf; @@ -60,8 +61,6 @@ import org.junit.BeforeClass; import org.junit.Test; -import static org.apache.hadoop.hive.conf.HiveConf.ConfVars.METASTOREURIS; - public class TestSparkReaderDeletes extends DeleteReadTests { private static TestHiveMetastore metastore = null; @@ -74,15 +73,18 @@ public static void startMetastoreAndSpark() { metastore.start(); HiveConf hiveConf = metastore.hiveConf(); - spark = SparkSession.builder() - .master("local[2]") - .config(SQLConf.PARTITION_OVERWRITE_MODE().key(), "dynamic") - .config("spark.hadoop." + METASTOREURIS.varname, hiveConf.get(METASTOREURIS.varname)) - .enableHiveSupport() - .getOrCreate(); + spark = + SparkSession.builder() + .master("local[2]") + .config(SQLConf.PARTITION_OVERWRITE_MODE().key(), "dynamic") + .config("spark.hadoop." + METASTOREURIS.varname, hiveConf.get(METASTOREURIS.varname)) + .enableHiveSupport() + .getOrCreate(); - catalog = (HiveCatalog) - CatalogUtil.loadCatalog(HiveCatalog.class.getName(), "hive", ImmutableMap.of(), hiveConf); + catalog = + (HiveCatalog) + CatalogUtil.loadCatalog( + HiveCatalog.class.getName(), "hive", ImmutableMap.of(), hiveConf); try { catalog.createNamespace(Namespace.of("default")); @@ -117,17 +119,21 @@ protected void dropTable(String name) { @Override public StructLikeSet rowSet(String name, Table table, String... 
columns) { - Dataset df = spark.read() - .format("iceberg") - .load(TableIdentifier.of("default", name).toString()) - .selectExpr(columns); + Dataset df = + spark + .read() + .format("iceberg") + .load(TableIdentifier.of("default", name).toString()) + .selectExpr(columns); Types.StructType projection = table.schema().select(columns).asStruct(); StructLikeSet set = StructLikeSet.create(projection); - df.collectAsList().forEach(row -> { - SparkStructLike rowWrapper = new SparkStructLike(projection); - set.add(rowWrapper.wrap(row)); - }); + df.collectAsList() + .forEach( + row -> { + SparkStructLike rowWrapper = new SparkStructLike(projection); + set.add(rowWrapper.wrap(row)); + }); return set; } @@ -137,31 +143,39 @@ public void testEqualityDeleteWithFilter() throws IOException { String tableName = table.name().substring(table.name().lastIndexOf(".") + 1); Schema deleteRowSchema = table.schema().select("data"); Record dataDelete = GenericRecord.create(deleteRowSchema); - List dataDeletes = Lists.newArrayList( - dataDelete.copy("data", "a"), // id = 29 - dataDelete.copy("data", "d"), // id = 89 - dataDelete.copy("data", "g") // id = 122 - ); - - DeleteFile eqDeletes = FileHelpers.writeDeleteFile( - table, Files.localOutput(temp.newFile()), TestHelpers.Row.of(0), dataDeletes, deleteRowSchema); - - table.newRowDelta() - .addDeletes(eqDeletes) - .commit(); + List dataDeletes = + Lists.newArrayList( + dataDelete.copy("data", "a"), // id = 29 + dataDelete.copy("data", "d"), // id = 89 + dataDelete.copy("data", "g") // id = 122 + ); + + DeleteFile eqDeletes = + FileHelpers.writeDeleteFile( + table, + Files.localOutput(temp.newFile()), + TestHelpers.Row.of(0), + dataDeletes, + deleteRowSchema); + + table.newRowDelta().addDeletes(eqDeletes).commit(); Types.StructType projection = table.schema().select("*").asStruct(); - Dataset df = spark.read() - .format("iceberg") - .load(TableIdentifier.of("default", tableName).toString()) - .filter("data = 'a'") // select a deleted row - .selectExpr("*"); + Dataset df = + spark + .read() + .format("iceberg") + .load(TableIdentifier.of("default", tableName).toString()) + .filter("data = 'a'") // select a deleted row + .selectExpr("*"); StructLikeSet actual = StructLikeSet.create(projection); - df.collectAsList().forEach(row -> { - SparkStructLike rowWrapper = new SparkStructLike(projection); - actual.add(rowWrapper.wrap(row)); - }); + df.collectAsList() + .forEach( + row -> { + SparkStructLike rowWrapper = new SparkStructLike(projection); + actual.add(rowWrapper.wrap(row)); + }); Assert.assertEquals("Table should contain no rows", 0, actual.size()); } @@ -170,44 +184,57 @@ public void testEqualityDeleteWithFilter() throws IOException { public void testReadEqualityDeleteRows() throws IOException { Schema deleteSchema1 = table.schema().select("data"); Record dataDelete = GenericRecord.create(deleteSchema1); - List dataDeletes = Lists.newArrayList( - dataDelete.copy("data", "a"), // id = 29 - dataDelete.copy("data", "d") // id = 89 - ); + List dataDeletes = + Lists.newArrayList( + dataDelete.copy("data", "a"), // id = 29 + dataDelete.copy("data", "d") // id = 89 + ); Schema deleteSchema2 = table.schema().select("id"); Record idDelete = GenericRecord.create(deleteSchema2); - List idDeletes = Lists.newArrayList( - idDelete.copy("id", 121), // id = 121 - idDelete.copy("id", 122) // id = 122 - ); - - DeleteFile eqDelete1 = FileHelpers.writeDeleteFile( - table, Files.localOutput(temp.newFile()), TestHelpers.Row.of(0), dataDeletes, deleteSchema1); - - DeleteFile eqDelete2 
= FileHelpers.writeDeleteFile( - table, Files.localOutput(temp.newFile()), TestHelpers.Row.of(0), idDeletes, deleteSchema2); - - table.newRowDelta() - .addDeletes(eqDelete1) - .addDeletes(eqDelete2) - .commit(); + List idDeletes = + Lists.newArrayList( + idDelete.copy("id", 121), // id = 121 + idDelete.copy("id", 122) // id = 122 + ); + + DeleteFile eqDelete1 = + FileHelpers.writeDeleteFile( + table, + Files.localOutput(temp.newFile()), + TestHelpers.Row.of(0), + dataDeletes, + deleteSchema1); + + DeleteFile eqDelete2 = + FileHelpers.writeDeleteFile( + table, + Files.localOutput(temp.newFile()), + TestHelpers.Row.of(0), + idDeletes, + deleteSchema2); + + table.newRowDelta().addDeletes(eqDelete1).addDeletes(eqDelete2).commit(); StructLikeSet expectedRowSet = rowSetWithIds(29, 89, 121, 122); Types.StructType type = table.schema().asStruct(); StructLikeSet actualRowSet = StructLikeSet.create(type); - CloseableIterable tasks = TableScanUtil.planTasks( - table.newScan().planFiles(), - TableProperties.METADATA_SPLIT_SIZE_DEFAULT, - TableProperties.SPLIT_LOOKBACK_DEFAULT, - TableProperties.SPLIT_OPEN_FILE_COST_DEFAULT); + CloseableIterable tasks = + TableScanUtil.planTasks( + table.newScan().planFiles(), + TableProperties.METADATA_SPLIT_SIZE_DEFAULT, + TableProperties.SPLIT_LOOKBACK_DEFAULT, + TableProperties.SPLIT_OPEN_FILE_COST_DEFAULT); for (CombinedScanTask task : tasks) { - try (EqualityDeleteRowReader reader = new EqualityDeleteRowReader(task, table, table.schema(), false)) { + try (EqualityDeleteRowReader reader = + new EqualityDeleteRowReader(task, table, table.schema(), false)) { while (reader.next()) { - actualRowSet.add(new InternalRowWrapper(SparkSchemaUtil.convert(table.schema())).wrap(reader.get().copy())); + actualRowSet.add( + new InternalRowWrapper(SparkSchemaUtil.convert(table.schema())) + .wrap(reader.get().copy())); } } } diff --git a/spark/v2.4/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkRollingFileWriters.java b/spark/v2.4/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkRollingFileWriters.java index 9023195dcc6a..dcf9140a8885 100644 --- a/spark/v2.4/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkRollingFileWriters.java +++ b/spark/v2.4/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkRollingFileWriters.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.source; import java.util.List; @@ -36,9 +35,11 @@ public TestSparkRollingFileWriters(FileFormat fileFormat, boolean partitioned) { } @Override - protected FileWriterFactory newWriterFactory(Schema dataSchema, List equalityFieldIds, - Schema equalityDeleteRowSchema, - Schema positionDeleteRowSchema) { + protected FileWriterFactory newWriterFactory( + Schema dataSchema, + List equalityFieldIds, + Schema equalityDeleteRowSchema, + Schema positionDeleteRowSchema) { return SparkFileWriterFactory.builderFor(table) .dataSchema(table.schema()) .dataFileFormat(format()) diff --git a/spark/v2.4/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkSchema.java b/spark/v2.4/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkSchema.java index d3c5cc4371bc..93cc24973f6c 100644 --- a/spark/v2.4/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkSchema.java +++ b/spark/v2.4/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkSchema.java @@ -16,9 +16,10 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.spark.source; +import static org.apache.iceberg.types.Types.NestedField.optional; + import java.io.IOException; import java.util.List; import org.apache.hadoop.conf.Configuration; @@ -42,19 +43,15 @@ import org.junit.Test; import org.junit.rules.TemporaryFolder; -import static org.apache.iceberg.types.Types.NestedField.optional; - public class TestSparkSchema { private static final Configuration CONF = new Configuration(); - private static final Schema SCHEMA = new Schema( - optional(1, "id", Types.IntegerType.get()), - optional(2, "data", Types.StringType.get()) - ); + private static final Schema SCHEMA = + new Schema( + optional(1, "id", Types.IntegerType.get()), optional(2, "data", Types.StringType.get())); private static SparkSession spark = null; - @Rule - public TemporaryFolder temp = new TemporaryFolder(); + @Rule public TemporaryFolder temp = new TemporaryFolder(); @BeforeClass public static void startSpark() { @@ -76,26 +73,18 @@ public void testSparkReadSchemaIsHonored() throws IOException { PartitionSpec spec = PartitionSpec.unpartitioned(); tables.create(SCHEMA, spec, null, tableLocation); - List expectedRecords = Lists.newArrayList( - new SimpleRecord(1, "a") - ); + List expectedRecords = Lists.newArrayList(new SimpleRecord(1, "a")); Dataset originalDf = spark.createDataFrame(expectedRecords, SimpleRecord.class); - originalDf.select("id", "data").write() - .format("iceberg") - .mode("append") - .save(tableLocation); + originalDf.select("id", "data").write().format("iceberg").mode("append").save(tableLocation); StructType sparkReadSchema = new StructType( new StructField[] { - new StructField("id", DataTypes.IntegerType, true, Metadata.empty()) - } - ); + new StructField("id", DataTypes.IntegerType, true, Metadata.empty()) + }); - Dataset resultDf = spark.read() - .schema(sparkReadSchema) - .format("iceberg") - .load(tableLocation); + Dataset resultDf = + spark.read().schema(sparkReadSchema).format("iceberg").load(tableLocation); Row[] results = (Row[]) resultDf.collect(); @@ -112,30 +101,22 @@ public void testFailIfSparkReadSchemaIsOff() throws IOException { PartitionSpec spec = PartitionSpec.unpartitioned(); tables.create(SCHEMA, spec, null, tableLocation); - List expectedRecords = Lists.newArrayList( - new SimpleRecord(1, "a") - ); + List expectedRecords = Lists.newArrayList(new SimpleRecord(1, "a")); Dataset originalDf = spark.createDataFrame(expectedRecords, SimpleRecord.class); - originalDf.select("id", "data").write() - .format("iceberg") - .mode("append") - .save(tableLocation); + originalDf.select("id", "data").write().format("iceberg").mode("append").save(tableLocation); StructType sparkReadSchema = new StructType( new StructField[] { - new StructField("idd", DataTypes.IntegerType, true, Metadata.empty()) // wrong field name - } - ); - - AssertHelpers.assertThrows("Iceberg should not allow a projection that contain unknown fields", - java.lang.IllegalArgumentException.class, "Field idd not found in source schema", - () -> - spark.read() - .schema(sparkReadSchema) - .format("iceberg") - .load(tableLocation) - ); + new StructField( + "idd", DataTypes.IntegerType, true, Metadata.empty()) // wrong field name + }); + + AssertHelpers.assertThrows( + "Iceberg should not allow a projection that contain unknown fields", + java.lang.IllegalArgumentException.class, + "Field idd not found in source schema", + () -> spark.read().schema(sparkReadSchema).format("iceberg").load(tableLocation)); } @Test @@ -146,28 +127,19 @@ public void 
testSparkReadSchemaCombinedWithProjection() throws IOException { PartitionSpec spec = PartitionSpec.unpartitioned(); tables.create(SCHEMA, spec, null, tableLocation); - List expectedRecords = Lists.newArrayList( - new SimpleRecord(1, "a") - ); + List expectedRecords = Lists.newArrayList(new SimpleRecord(1, "a")); Dataset originalDf = spark.createDataFrame(expectedRecords, SimpleRecord.class); - originalDf.select("id", "data").write() - .format("iceberg") - .mode("append") - .save(tableLocation); + originalDf.select("id", "data").write().format("iceberg").mode("append").save(tableLocation); StructType sparkReadSchema = new StructType( new StructField[] { - new StructField("id", DataTypes.IntegerType, true, Metadata.empty()), - new StructField("data", DataTypes.StringType, true, Metadata.empty()) - } - ); + new StructField("id", DataTypes.IntegerType, true, Metadata.empty()), + new StructField("data", DataTypes.StringType, true, Metadata.empty()) + }); - Dataset resultDf = spark.read() - .schema(sparkReadSchema) - .format("iceberg") - .load(tableLocation) - .select("id"); + Dataset resultDf = + spark.read().schema(sparkReadSchema).format("iceberg").load(tableLocation).select("id"); Row[] results = (Row[]) resultDf.collect(); @@ -177,37 +149,34 @@ public void testSparkReadSchemaCombinedWithProjection() throws IOException { } @Test - public void testFailSparkReadSchemaCombinedWithProjectionWhenSchemaDoesNotContainProjection() throws IOException { + public void testFailSparkReadSchemaCombinedWithProjectionWhenSchemaDoesNotContainProjection() + throws IOException { String tableLocation = temp.newFolder("iceberg-table").toString(); HadoopTables tables = new HadoopTables(CONF); PartitionSpec spec = PartitionSpec.unpartitioned(); tables.create(SCHEMA, spec, null, tableLocation); - List expectedRecords = Lists.newArrayList( - new SimpleRecord(1, "a") - ); + List expectedRecords = Lists.newArrayList(new SimpleRecord(1, "a")); Dataset originalDf = spark.createDataFrame(expectedRecords, SimpleRecord.class); - originalDf.select("id", "data").write() - .format("iceberg") - .mode("append") - .save(tableLocation); + originalDf.select("id", "data").write().format("iceberg").mode("append").save(tableLocation); StructType sparkReadSchema = new StructType( new StructField[] { - new StructField("data", DataTypes.StringType, true, Metadata.empty()) - } - ); + new StructField("data", DataTypes.StringType, true, Metadata.empty()) + }); - AssertHelpers.assertThrows("Spark should not allow a projection that is not included in the read schema", - org.apache.spark.sql.AnalysisException.class, "cannot resolve '`id`' given input columns: [data]", + AssertHelpers.assertThrows( + "Spark should not allow a projection that is not included in the read schema", + org.apache.spark.sql.AnalysisException.class, + "cannot resolve '`id`' given input columns: [data]", () -> - spark.read() - .schema(sparkReadSchema) - .format("iceberg") - .load(tableLocation) - .select("id") - ); + spark + .read() + .schema(sparkReadSchema) + .format("iceberg") + .load(tableLocation) + .select("id")); } } diff --git a/spark/v2.4/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkTableUtil.java b/spark/v2.4/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkTableUtil.java index 52acbe7fc0e9..438fc0979fe3 100644 --- a/spark/v2.4/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkTableUtil.java +++ b/spark/v2.4/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkTableUtil.java @@ -16,9 +16,12 @@ * specific 
language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.source; +import static org.apache.iceberg.TableProperties.DEFAULT_NAME_MAPPING; +import static org.apache.iceberg.TableProperties.PARQUET_VECTORIZATION_ENABLED; +import static org.apache.iceberg.types.Types.NestedField.optional; + import java.io.File; import java.io.IOException; import java.util.List; @@ -64,15 +67,13 @@ import org.junit.runner.RunWith; import org.junit.runners.Parameterized; -import static org.apache.iceberg.TableProperties.DEFAULT_NAME_MAPPING; -import static org.apache.iceberg.TableProperties.PARQUET_VECTORIZATION_ENABLED; -import static org.apache.iceberg.types.Types.NestedField.optional; - @RunWith(Enclosed.class) public class TestSparkTableUtil extends HiveTableBaseTest { private static final String TABLE_NAME = "hive_table"; - private static final String QUALIFIED_TABLE_NAME = String.format("%s.%s", HiveTableBaseTest.DB_NAME, TABLE_NAME); - private static final Path TABLE_LOCATION_PATH = HiveTableBaseTest.getTableLocationPath(TABLE_NAME); + private static final String QUALIFIED_TABLE_NAME = + String.format("%s.%s", HiveTableBaseTest.DB_NAME, TABLE_NAME); + private static final Path TABLE_LOCATION_PATH = + HiveTableBaseTest.getTableLocationPath(TABLE_NAME); private static final String TABLE_LOCATION_STR = TABLE_LOCATION_PATH.toString(); private static SparkSession spark = null; @@ -81,7 +82,9 @@ public static void startSpark() { String metastoreURI = HiveTableBaseTest.hiveConf.get(HiveConf.ConfVars.METASTOREURIS.varname); // Create a spark session. - TestSparkTableUtil.spark = SparkSession.builder().master("local[2]") + TestSparkTableUtil.spark = + SparkSession.builder() + .master("local[2]") .enableHiveSupport() .config("spark.hadoop.hive.metastore.uris", metastoreURI) .config("hive.exec.dynamic.partition", "true") @@ -102,26 +105,23 @@ static void loadData(FileFormat fileFormat) { // Create a hive table. 
SQLContext sc = new SQLContext(TestSparkTableUtil.spark); - sc.sql(String.format( - "CREATE TABLE %s (\n" + - " id int COMMENT 'unique id'\n" + - ")\n" + - "PARTITIONED BY (data string)\n" + - "STORED AS %s\n" + - "LOCATION '%s'", QUALIFIED_TABLE_NAME, fileFormat, TABLE_LOCATION_STR) - ); - - List expected = Lists.newArrayList( - new SimpleRecord(1, "a"), - new SimpleRecord(2, "b"), - new SimpleRecord(3, "c") - ); + sc.sql( + String.format( + "CREATE TABLE %s (\n" + + " id int COMMENT 'unique id'\n" + + ")\n" + + "PARTITIONED BY (data string)\n" + + "STORED AS %s\n" + + "LOCATION '%s'", + QUALIFIED_TABLE_NAME, fileFormat, TABLE_LOCATION_STR)); + + List expected = + Lists.newArrayList( + new SimpleRecord(1, "a"), new SimpleRecord(2, "b"), new SimpleRecord(3, "c")); Dataset df = spark.createDataFrame(expected, SimpleRecord.class); - df.select("id", "data").orderBy("data").write() - .mode("append") - .insertInto(QUALIFIED_TABLE_NAME); + df.select("id", "data").orderBy("data").write().mode("append").insertInto(QUALIFIED_TABLE_NAME); } static void cleanupData() throws IOException { @@ -138,12 +138,11 @@ public static class TableImport { private final FileFormat format; - @Rule - public TemporaryFolder temp = new TemporaryFolder(); + @Rule public TemporaryFolder temp = new TemporaryFolder(); @Parameterized.Parameters(name = "format = {0}") public static Object[] parameters() { - return new Object[] { "parquet", "orc" }; + return new Object[] {"parquet", "orc"}; } public TableImport(String format) { @@ -163,16 +162,23 @@ public void after() throws IOException { @Test public void testImportPartitionedTable() throws Exception { File location = temp.newFolder("partitioned_table"); - spark.table(QUALIFIED_TABLE_NAME).write().mode("overwrite").partitionBy("data").format(format.toString()) + spark + .table(QUALIFIED_TABLE_NAME) + .write() + .mode("overwrite") + .partitionBy("data") + .format(format.toString()) .saveAsTable("test_partitioned_table"); - TableIdentifier source = spark.sessionState().sqlParser() - .parseTableIdentifier("test_partitioned_table"); + TableIdentifier source = + spark.sessionState().sqlParser().parseTableIdentifier("test_partitioned_table"); HadoopTables tables = new HadoopTables(spark.sessionState().newHadoopConf()); Schema tableSchema = SparkSchemaUtil.schemaForTable(spark, QUALIFIED_TABLE_NAME); - Table table = tables.create(tableSchema, - SparkSchemaUtil.specForTable(spark, QUALIFIED_TABLE_NAME), - ImmutableMap.of(), - location.getCanonicalPath()); + Table table = + tables.create( + tableSchema, + SparkSchemaUtil.specForTable(spark, QUALIFIED_TABLE_NAME), + ImmutableMap.of(), + location.getCanonicalPath()); File stagingDir = temp.newFolder("staging-dir"); SparkTableUtil.importSparkTable(spark, source, table, stagingDir.toString()); long count = spark.read().format("iceberg").load(location.toString()).count(); @@ -182,15 +188,21 @@ public void testImportPartitionedTable() throws Exception { @Test public void testImportUnpartitionedTable() throws Exception { File location = temp.newFolder("unpartitioned_table"); - spark.table(QUALIFIED_TABLE_NAME).write().mode("overwrite").format(format.toString()) + spark + .table(QUALIFIED_TABLE_NAME) + .write() + .mode("overwrite") + .format(format.toString()) .saveAsTable("test_unpartitioned_table"); - TableIdentifier source = spark.sessionState().sqlParser() - .parseTableIdentifier("test_unpartitioned_table"); + TableIdentifier source = + spark.sessionState().sqlParser().parseTableIdentifier("test_unpartitioned_table"); HadoopTables 
tables = new HadoopTables(spark.sessionState().newHadoopConf()); - Table table = tables.create(SparkSchemaUtil.schemaForTable(spark, QUALIFIED_TABLE_NAME), - SparkSchemaUtil.specForTable(spark, QUALIFIED_TABLE_NAME), - ImmutableMap.of(), - location.getCanonicalPath()); + Table table = + tables.create( + SparkSchemaUtil.schemaForTable(spark, QUALIFIED_TABLE_NAME), + SparkSchemaUtil.specForTable(spark, QUALIFIED_TABLE_NAME), + ImmutableMap.of(), + location.getCanonicalPath()); File stagingDir = temp.newFolder("staging-dir"); SparkTableUtil.importSparkTable(spark, source, table, stagingDir.toString()); long count = spark.read().format("iceberg").load(location.toString()).count(); @@ -199,31 +211,45 @@ public void testImportUnpartitionedTable() throws Exception { @Test public void testImportAsHiveTable() throws Exception { - spark.table(QUALIFIED_TABLE_NAME).write().mode("overwrite").format(format.toString()) + spark + .table(QUALIFIED_TABLE_NAME) + .write() + .mode("overwrite") + .format(format.toString()) .saveAsTable("unpartitioned_table"); TableIdentifier source = new TableIdentifier("unpartitioned_table"); org.apache.iceberg.catalog.TableIdentifier testUnpartitionedTableId = - org.apache.iceberg.catalog.TableIdentifier.of(DB_NAME, "test_unpartitioned_table_" + format); + org.apache.iceberg.catalog.TableIdentifier.of( + DB_NAME, "test_unpartitioned_table_" + format); File stagingDir = temp.newFolder("staging-dir"); - Table table = catalog.createTable( - testUnpartitionedTableId, - SparkSchemaUtil.schemaForTable(spark, "unpartitioned_table"), - SparkSchemaUtil.specForTable(spark, "unpartitioned_table")); + Table table = + catalog.createTable( + testUnpartitionedTableId, + SparkSchemaUtil.schemaForTable(spark, "unpartitioned_table"), + SparkSchemaUtil.specForTable(spark, "unpartitioned_table")); SparkTableUtil.importSparkTable(spark, source, table, stagingDir.toString()); - long count1 = spark.read().format("iceberg").load(testUnpartitionedTableId.toString()).count(); + long count1 = + spark.read().format("iceberg").load(testUnpartitionedTableId.toString()).count(); Assert.assertEquals("three values ", 3, count1); - spark.table(QUALIFIED_TABLE_NAME).write().mode("overwrite").partitionBy("data").format(format.toString()) + spark + .table(QUALIFIED_TABLE_NAME) + .write() + .mode("overwrite") + .partitionBy("data") + .format(format.toString()) .saveAsTable("partitioned_table"); source = new TableIdentifier("partitioned_table"); org.apache.iceberg.catalog.TableIdentifier testPartitionedTableId = - org.apache.iceberg.catalog.TableIdentifier.of(DB_NAME, "test_partitioned_table_" + format); - table = catalog.createTable( - testPartitionedTableId, - SparkSchemaUtil.schemaForTable(spark, "partitioned_table"), - SparkSchemaUtil.specForTable(spark, "partitioned_table")); + org.apache.iceberg.catalog.TableIdentifier.of( + DB_NAME, "test_partitioned_table_" + format); + table = + catalog.createTable( + testPartitionedTableId, + SparkSchemaUtil.schemaForTable(spark, "partitioned_table"), + SparkSchemaUtil.specForTable(spark, "partitioned_table")); SparkTableUtil.importSparkTable(spark, source, table, stagingDir.toString()); long count2 = spark.read().format("iceberg").load(testPartitionedTableId.toString()).count(); @@ -232,13 +258,15 @@ public void testImportAsHiveTable() throws Exception { @Test public void testImportWithNameMapping() throws Exception { - spark.table(QUALIFIED_TABLE_NAME).write().mode("overwrite").format(format.toString()) + spark + .table(QUALIFIED_TABLE_NAME) + .write() + 
.mode("overwrite") + .format(format.toString()) .saveAsTable("original_table"); // The field is different so that it will project with name mapping - Schema filteredSchema = new Schema( - optional(1, "data", Types.StringType.get()) - ); + Schema filteredSchema = new Schema(optional(1, "data", Types.StringType.get())); NameMapping nameMapping = MappingUtil.create(filteredSchema); @@ -246,24 +274,30 @@ public void testImportWithNameMapping() throws Exception { TableIdentifier source = new TableIdentifier("original_table"); org.apache.iceberg.catalog.TableIdentifier targetTable = org.apache.iceberg.catalog.TableIdentifier.of(DB_NAME, targetTableName); - Table table = catalog.createTable( - targetTable, - filteredSchema, - SparkSchemaUtil.specForTable(spark, "original_table")); + Table table = + catalog.createTable( + targetTable, filteredSchema, SparkSchemaUtil.specForTable(spark, "original_table")); - table.updateProperties().set(DEFAULT_NAME_MAPPING, NameMappingParser.toJson(nameMapping)).commit(); + table + .updateProperties() + .set(DEFAULT_NAME_MAPPING, NameMappingParser.toJson(nameMapping)) + .commit(); File stagingDir = temp.newFolder("staging-dir"); SparkTableUtil.importSparkTable(spark, source, table, stagingDir.toString()); // The filter invoke the metric/dictionary row group filter in which it project schema // with name mapping again to match the metric read from footer. - List actual = spark.read().format("iceberg").load(targetTable.toString()) - .select("data") - .sort("data") - .filter("data >= 'b'") - .as(Encoders.STRING()) - .collectAsList(); + List actual = + spark + .read() + .format("iceberg") + .load(targetTable.toString()) + .select("data") + .sort("data") + .filter("data >= 'b'") + .as(Encoders.STRING()) + .collectAsList(); List expected = Lists.newArrayList("b", "c"); @@ -272,25 +306,29 @@ public void testImportWithNameMapping() throws Exception { @Test public void testImportWithNameMappingForVectorizedParquetReader() throws Exception { - Assume.assumeTrue("Applies only to parquet format.", - FileFormat.PARQUET == format); - spark.table(QUALIFIED_TABLE_NAME).write().mode("overwrite").format(format.toString()) + Assume.assumeTrue("Applies only to parquet format.", FileFormat.PARQUET == format); + spark + .table(QUALIFIED_TABLE_NAME) + .write() + .mode("overwrite") + .format(format.toString()) .saveAsTable("original_table"); // The field is different so that it will project with name mapping - Schema filteredSchema = new Schema( - optional(1, "data", Types.StringType.get()) - ); + Schema filteredSchema = new Schema(optional(1, "data", Types.StringType.get())); NameMapping nameMapping = MappingUtil.create(filteredSchema); TableIdentifier source = new TableIdentifier("original_table"); - Table table = catalog.createTable( - org.apache.iceberg.catalog.TableIdentifier.of(DB_NAME, "target_table_for_vectorization"), - filteredSchema, - SparkSchemaUtil.specForTable(spark, "original_table")); - - table.updateProperties() + Table table = + catalog.createTable( + org.apache.iceberg.catalog.TableIdentifier.of( + DB_NAME, "target_table_for_vectorization"), + filteredSchema, + SparkSchemaUtil.specForTable(spark, "original_table")); + + table + .updateProperties() .set(DEFAULT_NAME_MAPPING, NameMappingParser.toJson(nameMapping)) .set(PARQUET_VECTORIZATION_ENABLED, "true") .commit(); @@ -300,13 +338,16 @@ public void testImportWithNameMappingForVectorizedParquetReader() throws Excepti // The filter invoke the metric/dictionary row group filter in which it project schema // with name 
mapping again to match the metric read from footer. - List actual = spark.read().format("iceberg") - .load(DB_NAME + ".target_table_for_vectorization") - .select("data") - .sort("data") - .filter("data >= 'b'") - .as(Encoders.STRING()) - .collectAsList(); + List actual = + spark + .read() + .format("iceberg") + .load(DB_NAME + ".target_table_for_vectorization") + .select("data") + .sort("data") + .filter("data >= 'b'") + .as(Encoders.STRING()) + .collectAsList(); List expected = Lists.newArrayList("b", "c"); @@ -323,24 +364,34 @@ public void testImportPartitionedWithWhitespace() throws Exception { File icebergLocation = temp.newFolder("partitioned_table"); - spark.createDataFrame(spacedRecords, SimpleRecord.class) + spark + .createDataFrame(spacedRecords, SimpleRecord.class) .withColumnRenamed("data", partitionCol) - .write().mode("overwrite").partitionBy(partitionCol).format(format.toString()) + .write() + .mode("overwrite") + .partitionBy(partitionCol) + .format(format.toString()) .saveAsTable(spacedTableName); - TableIdentifier source = spark.sessionState().sqlParser() - .parseTableIdentifier(spacedTableName); + TableIdentifier source = + spark.sessionState().sqlParser().parseTableIdentifier(spacedTableName); HadoopTables tables = new HadoopTables(spark.sessionState().newHadoopConf()); - Table table = tables.create(SparkSchemaUtil.schemaForTable(spark, spacedTableName), - SparkSchemaUtil.specForTable(spark, spacedTableName), - ImmutableMap.of(), - icebergLocation.getCanonicalPath()); + Table table = + tables.create( + SparkSchemaUtil.schemaForTable(spark, spacedTableName), + SparkSchemaUtil.specForTable(spark, spacedTableName), + ImmutableMap.of(), + icebergLocation.getCanonicalPath()); File stagingDir = temp.newFolder("staging-dir"); SparkTableUtil.importSparkTable(spark, source, table, stagingDir.toString()); - List results = spark.read().format("iceberg").load(icebergLocation.toString()) - .withColumnRenamed(partitionCol, "data") - .as(Encoders.bean(SimpleRecord.class)) - .collectAsList(); + List results = + spark + .read() + .format("iceberg") + .load(icebergLocation.toString()) + .withColumnRenamed(partitionCol, "data") + .as(Encoders.bean(SimpleRecord.class)) + .collectAsList(); Assert.assertEquals("Data should match", spacedRecords, results); } @@ -355,22 +406,35 @@ public void testImportUnpartitionedWithWhitespace() throws Exception { File whiteSpaceOldLocation = temp.newFolder("white space location"); File icebergLocation = temp.newFolder("partitioned_table"); - spark.createDataFrame(spacedRecords, SimpleRecord.class) - .write().mode("overwrite").format(format.toString()).save(whiteSpaceOldLocation.getPath()); + spark + .createDataFrame(spacedRecords, SimpleRecord.class) + .write() + .mode("overwrite") + .format(format.toString()) + .save(whiteSpaceOldLocation.getPath()); - spark.catalog().createExternalTable(spacedTableName, whiteSpaceOldLocation.getPath(), format.toString()); + spark + .catalog() + .createExternalTable(spacedTableName, whiteSpaceOldLocation.getPath(), format.toString()); - TableIdentifier source = spark.sessionState().sqlParser() - .parseTableIdentifier(spacedTableName); + TableIdentifier source = + spark.sessionState().sqlParser().parseTableIdentifier(spacedTableName); HadoopTables tables = new HadoopTables(spark.sessionState().newHadoopConf()); - Table table = tables.create(SparkSchemaUtil.schemaForTable(spark, spacedTableName), - SparkSchemaUtil.specForTable(spark, spacedTableName), - ImmutableMap.of(), - icebergLocation.getCanonicalPath()); + Table table 
= + tables.create( + SparkSchemaUtil.schemaForTable(spark, spacedTableName), + SparkSchemaUtil.specForTable(spark, spacedTableName), + ImmutableMap.of(), + icebergLocation.getCanonicalPath()); File stagingDir = temp.newFolder("staging-dir"); SparkTableUtil.importSparkTable(spark, source, table, stagingDir.toString()); - List results = spark.read().format("iceberg").load(icebergLocation.toString()) - .as(Encoders.bean(SimpleRecord.class)).collectAsList(); + List results = + spark + .read() + .format("iceberg") + .load(icebergLocation.toString()) + .as(Encoders.bean(SimpleRecord.class)) + .collectAsList(); Assert.assertEquals("Data should match", spacedRecords, results); } @@ -398,8 +462,7 @@ public void testSparkPartitionJavaSerialization() throws IOException, ClassNotFo public static class GetPartitions { - @Rule - public TemporaryFolder temp = new TemporaryFolder(); + @Rule public TemporaryFolder temp = new TemporaryFolder(); // This logic does not really depend on format private final FileFormat format = FileFormat.PARQUET; @@ -407,63 +470,68 @@ public static class GetPartitions { @Test public void testPartitionScan() throws Exception { - List records = Lists.newArrayList( - new ThreeColumnRecord(1, "ab", "data"), - new ThreeColumnRecord(2, "b c", "data"), - new ThreeColumnRecord(1, "b c", "data"), - new ThreeColumnRecord(2, "ab", "data")); + List records = + Lists.newArrayList( + new ThreeColumnRecord(1, "ab", "data"), + new ThreeColumnRecord(2, "b c", "data"), + new ThreeColumnRecord(1, "b c", "data"), + new ThreeColumnRecord(2, "ab", "data")); String tableName = "external_table"; - spark.createDataFrame(records, ThreeColumnRecord.class) - .write().mode("overwrite").format(format.toString()) - .partitionBy("c1", "c2").saveAsTable(tableName); - - TableIdentifier source = spark.sessionState().sqlParser() - .parseTableIdentifier(tableName); - - Map partition1 = ImmutableMap.of( - "c1", "1", - "c2", "ab"); - Map partition2 = ImmutableMap.of( - "c1", "2", - "c2", "b c"); - Map partition3 = ImmutableMap.of( - "c1", "1", - "c2", "b c"); - Map partition4 = ImmutableMap.of( - "c1", "2", - "c2", "ab"); + spark + .createDataFrame(records, ThreeColumnRecord.class) + .write() + .mode("overwrite") + .format(format.toString()) + .partitionBy("c1", "c2") + .saveAsTable(tableName); + + TableIdentifier source = spark.sessionState().sqlParser().parseTableIdentifier(tableName); + + Map partition1 = + ImmutableMap.of( + "c1", "1", + "c2", "ab"); + Map partition2 = + ImmutableMap.of( + "c1", "2", + "c2", "b c"); + Map partition3 = + ImmutableMap.of( + "c1", "1", + "c2", "b c"); + Map partition4 = + ImmutableMap.of( + "c1", "2", + "c2", "ab"); List partitionsC11 = SparkTableUtil.getPartitions(spark, source, ImmutableMap.of("c1", "1")); - Set> expectedC11 = - Sets.newHashSet(partition1, partition3); - Set> actualC11 = partitionsC11.stream().map( - p -> p.getValues()).collect(Collectors.toSet()); + Set> expectedC11 = Sets.newHashSet(partition1, partition3); + Set> actualC11 = + partitionsC11.stream().map(p -> p.getValues()).collect(Collectors.toSet()); Assert.assertEquals("Wrong partitions fetched for c1=1", expectedC11, actualC11); List partitionsC12 = SparkTableUtil.getPartitions(spark, source, ImmutableMap.of("c1", "2")); Set> expectedC12 = Sets.newHashSet(partition2, partition4); - Set> actualC12 = partitionsC12.stream().map( - p -> p.getValues()).collect(Collectors.toSet()); + Set> actualC12 = + partitionsC12.stream().map(p -> p.getValues()).collect(Collectors.toSet()); Assert.assertEquals("Wrong 
partitions fetched for c1=2", expectedC12, actualC12); List partitionsC21 = SparkTableUtil.getPartitions(spark, source, ImmutableMap.of("c2", "ab")); - Set> expectedC21 = - Sets.newHashSet(partition1, partition4); - Set> actualC21 = partitionsC21.stream().map( - p -> p.getValues()).collect(Collectors.toSet()); + Set> expectedC21 = Sets.newHashSet(partition1, partition4); + Set> actualC21 = + partitionsC21.stream().map(p -> p.getValues()).collect(Collectors.toSet()); Assert.assertEquals("Wrong partitions fetched for c2=ab", expectedC21, actualC21); List partitionsC22 = SparkTableUtil.getPartitions(spark, source, ImmutableMap.of("c2", "b c")); - Set> expectedC22 = - Sets.newHashSet(partition2, partition3); - Set> actualC22 = partitionsC22.stream().map( - p -> p.getValues()).collect(Collectors.toSet()); + Set> expectedC22 = Sets.newHashSet(partition2, partition3); + Set> actualC22 = + partitionsC22.stream().map(p -> p.getValues()).collect(Collectors.toSet()); Assert.assertEquals("Wrong partitions fetched for c2=b c", expectedC22, actualC22); } } @@ -491,10 +559,12 @@ public void testPartitionScan() { @Test public void testPartitionScanByFilter() { - List partitions = SparkTableUtil.getPartitionsByFilter(spark, QUALIFIED_TABLE_NAME, "data = 'a'"); + List partitions = + SparkTableUtil.getPartitionsByFilter(spark, QUALIFIED_TABLE_NAME, "data = 'a'"); Assert.assertEquals("There should be 1 matching partition", 1, partitions.size()); - Dataset partitionDF = SparkTableUtil.partitionDFByFilter(spark, QUALIFIED_TABLE_NAME, "data = 'a'"); + Dataset partitionDF = + SparkTableUtil.partitionDFByFilter(spark, QUALIFIED_TABLE_NAME, "data = 'a'"); Assert.assertEquals("There should be 1 matching partition", 1, partitionDF.count()); } } diff --git a/spark/v2.4/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkTableUtilWithInMemoryCatalog.java b/spark/v2.4/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkTableUtilWithInMemoryCatalog.java index 0a45c6901d3c..734dddc5a75e 100644 --- a/spark/v2.4/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkTableUtilWithInMemoryCatalog.java +++ b/spark/v2.4/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkTableUtilWithInMemoryCatalog.java @@ -16,9 +16,11 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.spark.source; +import static org.apache.iceberg.types.Types.NestedField.optional; +import static org.apache.iceberg.types.Types.NestedField.required; + import java.io.File; import java.io.IOException; import java.nio.ByteBuffer; @@ -55,27 +57,21 @@ import org.junit.Test; import org.junit.rules.TemporaryFolder; -import static org.apache.iceberg.types.Types.NestedField.optional; -import static org.apache.iceberg.types.Types.NestedField.required; - public class TestSparkTableUtilWithInMemoryCatalog { private static final HadoopTables TABLES = new HadoopTables(new Configuration()); - private static final Schema SCHEMA = new Schema( - optional(1, "id", Types.IntegerType.get()), - optional(2, "data", Types.StringType.get()) - ); - private static final PartitionSpec SPEC = PartitionSpec.builderFor(SCHEMA) - .identity("data") - .build(); + private static final Schema SCHEMA = + new Schema( + optional(1, "id", Types.IntegerType.get()), optional(2, "data", Types.StringType.get())); + private static final PartitionSpec SPEC = + PartitionSpec.builderFor(SCHEMA).identity("data").build(); private static SparkSession spark; @BeforeClass public static void startSpark() { - TestSparkTableUtilWithInMemoryCatalog.spark = SparkSession.builder() - .master("local[2]") - .getOrCreate(); + TestSparkTableUtilWithInMemoryCatalog.spark = + SparkSession.builder().master("local[2]").getOrCreate(); } @AfterClass @@ -85,8 +81,7 @@ public static void stopSpark() { currentSpark.stop(); } - @Rule - public TemporaryFolder temp = new TemporaryFolder(); + @Rule public TemporaryFolder temp = new TemporaryFolder(); private String tableLocation = null; @@ -103,32 +98,35 @@ public void testImportUnpartitionedTable() throws IOException { props.put(TableProperties.METRICS_MODE_COLUMN_CONF_PREFIX + "data", "full"); Table table = TABLES.create(SCHEMA, PartitionSpec.unpartitioned(), props, tableLocation); - List records = Lists.newArrayList( - new SimpleRecord(1, "a"), - new SimpleRecord(2, "b"), - new SimpleRecord(3, "c") - ); + List records = + Lists.newArrayList( + new SimpleRecord(1, "a"), new SimpleRecord(2, "b"), new SimpleRecord(3, "c")); File parquetTableDir = temp.newFolder("parquet_table"); String parquetTableLocation = parquetTableDir.toURI().toString(); try { Dataset inputDF = spark.createDataFrame(records, SimpleRecord.class).coalesce(1); - inputDF.select("id", "data").write() + inputDF + .select("id", "data") + .write() .format("parquet") .mode("append") .option("path", parquetTableLocation) .saveAsTable("parquet_table"); File stagingDir = temp.newFolder("staging-dir"); - SparkTableUtil.importSparkTable(spark, new TableIdentifier("parquet_table"), table, stagingDir.toString()); - - List actualRecords = spark.read() - .format("iceberg") - .load(tableLocation) - .orderBy("id") - .as(Encoders.bean(SimpleRecord.class)) - .collectAsList(); + SparkTableUtil.importSparkTable( + spark, new TableIdentifier("parquet_table"), table, stagingDir.toString()); + + List actualRecords = + spark + .read() + .format("iceberg") + .load(tableLocation) + .orderBy("id") + .as(Encoders.bean(SimpleRecord.class)) + .collectAsList(); Assert.assertEquals("Result rows should match", records, actualRecords); @@ -149,18 +147,18 @@ public void testImportPartitionedTable() throws IOException { props.put(TableProperties.METRICS_MODE_COLUMN_CONF_PREFIX + "data", "full"); Table table = TABLES.create(SCHEMA, SPEC, props, tableLocation); - List records = Lists.newArrayList( - new SimpleRecord(1, "a"), - new SimpleRecord(2, 
"b"), - new SimpleRecord(3, "c") - ); + List records = + Lists.newArrayList( + new SimpleRecord(1, "a"), new SimpleRecord(2, "b"), new SimpleRecord(3, "c")); File parquetTableDir = temp.newFolder("parquet_table"); String parquetTableLocation = parquetTableDir.toURI().toString(); try { Dataset inputDF = spark.createDataFrame(records, SimpleRecord.class); - inputDF.select("id", "data").write() + inputDF + .select("id", "data") + .write() .format("parquet") .mode("append") .option("path", parquetTableLocation) @@ -169,21 +167,26 @@ public void testImportPartitionedTable() throws IOException { Assert.assertEquals( "Should have 3 partitions", - 3, SparkTableUtil.getPartitions(spark, "parquet_table").size()); + 3, + SparkTableUtil.getPartitions(spark, "parquet_table").size()); Assert.assertEquals( "Should have 1 partition where data = 'a'", - 1, SparkTableUtil.getPartitionsByFilter(spark, "parquet_table", "data = 'a'").size()); + 1, + SparkTableUtil.getPartitionsByFilter(spark, "parquet_table", "data = 'a'").size()); File stagingDir = temp.newFolder("staging-dir"); - SparkTableUtil.importSparkTable(spark, new TableIdentifier("parquet_table"), table, stagingDir.toString()); - - List actualRecords = spark.read() - .format("iceberg") - .load(tableLocation) - .orderBy("id") - .as(Encoders.bean(SimpleRecord.class)) - .collectAsList(); + SparkTableUtil.importSparkTable( + spark, new TableIdentifier("parquet_table"), table, stagingDir.toString()); + + List actualRecords = + spark + .read() + .format("iceberg") + .load(tableLocation) + .orderBy("id") + .as(Encoders.bean(SimpleRecord.class)) + .collectAsList(); Assert.assertEquals("Result rows should match", records, actualRecords); @@ -202,18 +205,18 @@ public void testImportPartitionedTable() throws IOException { public void testImportPartitions() throws IOException { Table table = TABLES.create(SCHEMA, SPEC, tableLocation); - List records = Lists.newArrayList( - new SimpleRecord(1, "a"), - new SimpleRecord(2, "b"), - new SimpleRecord(3, "c") - ); + List records = + Lists.newArrayList( + new SimpleRecord(1, "a"), new SimpleRecord(2, "b"), new SimpleRecord(3, "c")); File parquetTableDir = temp.newFolder("parquet_table"); String parquetTableLocation = parquetTableDir.toURI().toString(); try { Dataset inputDF = spark.createDataFrame(records, SimpleRecord.class); - inputDF.select("id", "data").write() + inputDF + .select("id", "data") + .write() .format("parquet") .mode("append") .option("path", parquetTableLocation) @@ -221,17 +224,21 @@ public void testImportPartitions() throws IOException { .saveAsTable("parquet_table"); File stagingDir = temp.newFolder("staging-dir"); - List partitions = SparkTableUtil.getPartitionsByFilter(spark, "parquet_table", "data = 'a'"); - SparkTableUtil.importSparkPartitions(spark, partitions, table, table.spec(), stagingDir.toString()); + List partitions = + SparkTableUtil.getPartitionsByFilter(spark, "parquet_table", "data = 'a'"); + SparkTableUtil.importSparkPartitions( + spark, partitions, table, table.spec(), stagingDir.toString()); List expectedRecords = Lists.newArrayList(new SimpleRecord(1, "a")); - List actualRecords = spark.read() - .format("iceberg") - .load(tableLocation) - .orderBy("id") - .as(Encoders.bean(SimpleRecord.class)) - .collectAsList(); + List actualRecords = + spark + .read() + .format("iceberg") + .load(tableLocation) + .orderBy("id") + .as(Encoders.bean(SimpleRecord.class)) + .collectAsList(); Assert.assertEquals("Result rows should match", expectedRecords, actualRecords); } finally { @@ -243,22 
+250,20 @@ public void testImportPartitions() throws IOException { public void testImportPartitionsWithSnapshotInheritance() throws IOException { Table table = TABLES.create(SCHEMA, SPEC, tableLocation); - table.updateProperties() - .set(TableProperties.SNAPSHOT_ID_INHERITANCE_ENABLED, "true") - .commit(); + table.updateProperties().set(TableProperties.SNAPSHOT_ID_INHERITANCE_ENABLED, "true").commit(); - List records = Lists.newArrayList( - new SimpleRecord(1, "a"), - new SimpleRecord(2, "b"), - new SimpleRecord(3, "c") - ); + List records = + Lists.newArrayList( + new SimpleRecord(1, "a"), new SimpleRecord(2, "b"), new SimpleRecord(3, "c")); File parquetTableDir = temp.newFolder("parquet_table"); String parquetTableLocation = parquetTableDir.toURI().toString(); try { Dataset inputDF = spark.createDataFrame(records, SimpleRecord.class); - inputDF.select("id", "data").write() + inputDF + .select("id", "data") + .write() .format("parquet") .mode("append") .option("path", parquetTableLocation) @@ -266,17 +271,21 @@ public void testImportPartitionsWithSnapshotInheritance() throws IOException { .saveAsTable("parquet_table"); File stagingDir = temp.newFolder("staging-dir"); - List partitions = SparkTableUtil.getPartitionsByFilter(spark, "parquet_table", "data = 'a'"); - SparkTableUtil.importSparkPartitions(spark, partitions, table, table.spec(), stagingDir.toString()); + List partitions = + SparkTableUtil.getPartitionsByFilter(spark, "parquet_table", "data = 'a'"); + SparkTableUtil.importSparkPartitions( + spark, partitions, table, table.spec(), stagingDir.toString()); List expectedRecords = Lists.newArrayList(new SimpleRecord(1, "a")); - List actualRecords = spark.read() - .format("iceberg") - .load(tableLocation) - .orderBy("id") - .as(Encoders.bean(SimpleRecord.class)) - .collectAsList(); + List actualRecords = + spark + .read() + .format("iceberg") + .load(tableLocation) + .orderBy("id") + .as(Encoders.bean(SimpleRecord.class)) + .collectAsList(); Assert.assertEquals("Result rows should match", expectedRecords, actualRecords); } finally { @@ -290,31 +299,47 @@ public void testImportTableWithMappingForNestedData() throws IOException { String parquetTableLocation = parquetTableDir.toURI().toString(); try { - Dataset df1 = spark.range(1, 2) - .withColumn("extra_col", functions.lit(-1)) - .withColumn("struct", functions.expr("named_struct('nested_1', 'a', 'nested_2', 'd', 'nested_3', 'f')")); - Dataset df2 = spark.range(2, 3) - .withColumn("extra_col", functions.lit(-1)) - .withColumn("struct", functions.expr("named_struct('nested_1', 'b', 'nested_2', 'e', 'nested_3', 'g')")); - df1.union(df2).coalesce(1).select("id", "extra_col", "struct").write() + Dataset df1 = + spark + .range(1, 2) + .withColumn("extra_col", functions.lit(-1)) + .withColumn( + "struct", + functions.expr( + "named_struct('nested_1', 'a', 'nested_2', 'd', 'nested_3', 'f')")); + Dataset df2 = + spark + .range(2, 3) + .withColumn("extra_col", functions.lit(-1)) + .withColumn( + "struct", + functions.expr( + "named_struct('nested_1', 'b', 'nested_2', 'e', 'nested_3', 'g')")); + df1.union(df2) + .coalesce(1) + .select("id", "extra_col", "struct") + .write() .format("parquet") .mode("append") .option("path", parquetTableLocation) .saveAsTable("parquet_table"); // don't include `extra_col` and `nested_2` on purpose - Schema schema = new Schema( - optional(1, "id", Types.LongType.get()), - required(2, "struct", Types.StructType.of( - required(3, "nested_1", Types.StringType.get()), - required(4, "nested_3", Types.StringType.get()) - 
)) - ); + Schema schema = + new Schema( + optional(1, "id", Types.LongType.get()), + required( + 2, + "struct", + Types.StructType.of( + required(3, "nested_1", Types.StringType.get()), + required(4, "nested_3", Types.StringType.get())))); Table table = TABLES.create(schema, PartitionSpec.unpartitioned(), tableLocation); // assign a custom metrics config and a name mapping NameMapping nameMapping = MappingUtil.create(schema); - table.updateProperties() + table + .updateProperties() .set(TableProperties.DEFAULT_WRITE_METRICS_MODE, "counts") .set(TableProperties.METRICS_MODE_COLUMN_CONF_PREFIX + "id", "full") .set(TableProperties.METRICS_MODE_COLUMN_CONF_PREFIX + "struct.nested_3", "full") @@ -322,23 +347,32 @@ public void testImportTableWithMappingForNestedData() throws IOException { .commit(); File stagingDir = temp.newFolder("staging-dir"); - SparkTableUtil.importSparkTable(spark, new TableIdentifier("parquet_table"), table, stagingDir.toString()); + SparkTableUtil.importSparkTable( + spark, new TableIdentifier("parquet_table"), table, stagingDir.toString()); // validate we get the expected results back - List expected = spark.table("parquet_table") - .select("id", "struct.nested_1", "struct.nested_3") - .collectAsList(); - List actual = spark.read().format("iceberg").load(tableLocation) - .select("id", "struct.nested_1", "struct.nested_3") - .collectAsList(); + List expected = + spark + .table("parquet_table") + .select("id", "struct.nested_1", "struct.nested_3") + .collectAsList(); + List actual = + spark + .read() + .format("iceberg") + .load(tableLocation) + .select("id", "struct.nested_1", "struct.nested_3") + .collectAsList(); Assert.assertEquals("Rows must match", expected, actual); // validate we persisted correct metrics Dataset fileDF = spark.read().format("iceberg").load(tableLocation + "#files"); List bounds = fileDF.select("lower_bounds", "upper_bounds").collectAsList(); - Assert.assertEquals("Must have lower bounds for 2 columns", 2, bounds.get(0).getMap(0).size()); - Assert.assertEquals("Must have upper bounds for 2 columns", 2, bounds.get(0).getMap(1).size()); + Assert.assertEquals( + "Must have lower bounds for 2 columns", 2, bounds.get(0).getMap(0).size()); + Assert.assertEquals( + "Must have upper bounds for 2 columns", 2, bounds.get(0).getMap(1).size()); Types.NestedField nestedField1 = table.schema().findField("struct.nested_1"); checkFieldMetrics(fileDF, nestedField1, true); @@ -359,15 +393,26 @@ public void testImportTableWithMappingForNestedDataPartitionedTable() throws IOE String parquetTableLocation = parquetTableDir.toURI().toString(); try { - Dataset df1 = spark.range(1, 2) - .withColumn("extra_col", functions.lit(-1)) - .withColumn("struct", functions.expr("named_struct('nested_1', 'a', 'nested_2', 'd', 'nested_3', 'f')")) - .withColumn("data", functions.lit("Z")); - Dataset df2 = spark.range(2, 3) - .withColumn("extra_col", functions.lit(-1)) - .withColumn("struct", functions.expr("named_struct('nested_1', 'b', 'nested_2', 'e', 'nested_3', 'g')")) - .withColumn("data", functions.lit("Z")); - df1.union(df2).coalesce(1).select("id", "extra_col", "struct", "data").write() + Dataset df1 = + spark + .range(1, 2) + .withColumn("extra_col", functions.lit(-1)) + .withColumn( + "struct", + functions.expr("named_struct('nested_1', 'a', 'nested_2', 'd', 'nested_3', 'f')")) + .withColumn("data", functions.lit("Z")); + Dataset df2 = + spark + .range(2, 3) + .withColumn("extra_col", functions.lit(-1)) + .withColumn( + "struct", + functions.expr("named_struct('nested_1', 
'b', 'nested_2', 'e', 'nested_3', 'g')")) + .withColumn("data", functions.lit("Z")); + df1.union(df2) + .coalesce(1) + .select("id", "extra_col", "struct", "data") + .write() .format("parquet") .mode("append") .option("path", parquetTableLocation) @@ -375,22 +420,23 @@ public void testImportTableWithMappingForNestedDataPartitionedTable() throws IOE .saveAsTable("parquet_table"); // don't include `extra_col` and `nested_2` on purpose - Schema schema = new Schema( - optional(1, "id", Types.LongType.get()), - required(2, "struct", Types.StructType.of( - required(4, "nested_1", Types.StringType.get()), - required(5, "nested_3", Types.StringType.get()) - )), - required(3, "data", Types.StringType.get()) - ); - PartitionSpec spec = PartitionSpec.builderFor(schema) - .identity("data") - .build(); + Schema schema = + new Schema( + optional(1, "id", Types.LongType.get()), + required( + 2, + "struct", + Types.StructType.of( + required(4, "nested_1", Types.StringType.get()), + required(5, "nested_3", Types.StringType.get()))), + required(3, "data", Types.StringType.get())); + PartitionSpec spec = PartitionSpec.builderFor(schema).identity("data").build(); Table table = TABLES.create(schema, spec, tableLocation); // assign a custom metrics config and a name mapping NameMapping nameMapping = MappingUtil.create(schema); - table.updateProperties() + table + .updateProperties() .set(TableProperties.DEFAULT_WRITE_METRICS_MODE, "counts") .set(TableProperties.METRICS_MODE_COLUMN_CONF_PREFIX + "id", "full") .set(TableProperties.METRICS_MODE_COLUMN_CONF_PREFIX + "struct.nested_3", "full") @@ -398,23 +444,32 @@ public void testImportTableWithMappingForNestedDataPartitionedTable() throws IOE .commit(); File stagingDir = temp.newFolder("staging-dir"); - SparkTableUtil.importSparkTable(spark, new TableIdentifier("parquet_table"), table, stagingDir.toString()); + SparkTableUtil.importSparkTable( + spark, new TableIdentifier("parquet_table"), table, stagingDir.toString()); // validate we get the expected results back - List expected = spark.table("parquet_table") - .select("id", "struct.nested_1", "struct.nested_3", "data") - .collectAsList(); - List actual = spark.read().format("iceberg").load(tableLocation) - .select("id", "struct.nested_1", "struct.nested_3", "data") - .collectAsList(); + List expected = + spark + .table("parquet_table") + .select("id", "struct.nested_1", "struct.nested_3", "data") + .collectAsList(); + List actual = + spark + .read() + .format("iceberg") + .load(tableLocation) + .select("id", "struct.nested_1", "struct.nested_3", "data") + .collectAsList(); Assert.assertEquals("Rows must match", expected, actual); // validate we persisted correct metrics Dataset fileDF = spark.read().format("iceberg").load(tableLocation + "#files"); List bounds = fileDF.select("lower_bounds", "upper_bounds").collectAsList(); - Assert.assertEquals("Must have lower bounds for 2 columns", 2, bounds.get(0).getMap(0).size()); - Assert.assertEquals("Must have upper bounds for 2 columns", 2, bounds.get(0).getMap(1).size()); + Assert.assertEquals( + "Must have lower bounds for 2 columns", 2, bounds.get(0).getMap(0).size()); + Assert.assertEquals( + "Must have upper bounds for 2 columns", 2, bounds.get(0).getMap(1).size()); Types.NestedField nestedField1 = table.schema().findField("struct.nested_1"); checkFieldMetrics(fileDF, nestedField1, true); @@ -439,34 +494,40 @@ public void testImportTableWithInt96Timestamp() throws IOException { Column timestampColumn = functions.to_timestamp(functions.lit("2010-03-20 
10:40:30.1234")); Dataset df = spark.range(1, 10).withColumn("tmp_col", timestampColumn); - df.coalesce(1).select("id", "tmp_col").write() + df.coalesce(1) + .select("id", "tmp_col") + .write() .format("parquet") .mode("append") .option("path", parquetTableLocation) .saveAsTable("parquet_table"); - Schema schema = new Schema( - optional(1, "id", Types.LongType.get()), - optional(2, "tmp_col", Types.TimestampType.withZone()) - ); + Schema schema = + new Schema( + optional(1, "id", Types.LongType.get()), + optional(2, "tmp_col", Types.TimestampType.withZone())); Table table = TABLES.create(schema, PartitionSpec.unpartitioned(), tableLocation); // assign a custom metrics config and disable vectorized reads - table.updateProperties() + table + .updateProperties() .set(TableProperties.DEFAULT_WRITE_METRICS_MODE, "full") .set(TableProperties.PARQUET_VECTORIZATION_ENABLED, "false") .commit(); File stagingDir = temp.newFolder("staging-dir"); - SparkTableUtil.importSparkTable(spark, new TableIdentifier("parquet_table"), table, stagingDir.toString()); + SparkTableUtil.importSparkTable( + spark, new TableIdentifier("parquet_table"), table, stagingDir.toString()); // validate we get the expected results back - List expected = spark.table("parquet_table") - .select("id", "tmp_col") - .collectAsList(); - List actual = spark.read().format("iceberg").load(tableLocation) - .select("id", "tmp_col") - .collectAsList(); + List expected = spark.table("parquet_table").select("id", "tmp_col").collectAsList(); + List actual = + spark + .read() + .format("iceberg") + .load(tableLocation) + .select("id", "tmp_col") + .collectAsList(); Assert.assertEquals("Rows must match", expected, actual); // validate we did not persist metrics for INT96 @@ -482,32 +543,38 @@ public void testImportTableWithInt96Timestamp() throws IOException { } } - private void checkFieldMetrics(Dataset fileDF, Types.NestedField field, Object min, Object max) { - List metricRows = fileDF - .selectExpr( - String.format("lower_bounds['%d']", field.fieldId()), - String.format("upper_bounds['%d']", field.fieldId()) - ) - .collectAsList(); + private void checkFieldMetrics( + Dataset fileDF, Types.NestedField field, Object min, Object max) { + List metricRows = + fileDF + .selectExpr( + String.format("lower_bounds['%d']", field.fieldId()), + String.format("upper_bounds['%d']", field.fieldId())) + .collectAsList(); // we compare string representations not to deal with HeapCharBuffers for strings - Object actualMin = Conversions.fromByteBuffer(field.type(), ByteBuffer.wrap(metricRows.get(0).getAs(0))); + Object actualMin = + Conversions.fromByteBuffer(field.type(), ByteBuffer.wrap(metricRows.get(0).getAs(0))); Assert.assertEquals("Min value should match", min.toString(), actualMin.toString()); - Object actualMax = Conversions.fromByteBuffer(field.type(), ByteBuffer.wrap(metricRows.get(0).getAs(1))); + Object actualMax = + Conversions.fromByteBuffer(field.type(), ByteBuffer.wrap(metricRows.get(0).getAs(1))); Assert.assertEquals("Max value should match", max.toString(), actualMax.toString()); } private void checkFieldMetrics(Dataset fileDF, Types.NestedField field, boolean isNull) { - List metricRows = fileDF - .selectExpr( - String.format("lower_bounds['%d']", field.fieldId()), - String.format("upper_bounds['%d']", field.fieldId()) - ) - .collectAsList(); - - metricRows.forEach(row -> { - Assert.assertEquals("Invalid metrics for column: " + field.name(), isNull, row.isNullAt(0)); - Assert.assertEquals("Invalid metrics for column: " + field.name(), 
isNull, row.isNullAt(1)); - }); + List metricRows = + fileDF + .selectExpr( + String.format("lower_bounds['%d']", field.fieldId()), + String.format("upper_bounds['%d']", field.fieldId())) + .collectAsList(); + + metricRows.forEach( + row -> { + Assert.assertEquals( + "Invalid metrics for column: " + field.name(), isNull, row.isNullAt(0)); + Assert.assertEquals( + "Invalid metrics for column: " + field.name(), isNull, row.isNullAt(1)); + }); } } diff --git a/spark/v2.4/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkWriterMetrics.java b/spark/v2.4/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkWriterMetrics.java index 967f394faa74..06ecc20c2fc3 100644 --- a/spark/v2.4/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkWriterMetrics.java +++ b/spark/v2.4/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkWriterMetrics.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.source; import org.apache.iceberg.FileFormat; diff --git a/spark/v2.4/spark/src/test/java/org/apache/iceberg/spark/source/TestStreamingOffset.java b/spark/v2.4/spark/src/test/java/org/apache/iceberg/spark/source/TestStreamingOffset.java index 84946b4db3a5..a350bc3a44b8 100644 --- a/spark/v2.4/spark/src/test/java/org/apache/iceberg/spark/source/TestStreamingOffset.java +++ b/spark/v2.4/spark/src/test/java/org/apache/iceberg/spark/source/TestStreamingOffset.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.source; import com.fasterxml.jackson.databind.node.ObjectNode; @@ -29,21 +28,28 @@ public class TestStreamingOffset { @Test public void testJsonConversion() { - org.apache.iceberg.spark.source.StreamingOffset[] expected = new org.apache.iceberg.spark.source.StreamingOffset[]{ - new org.apache.iceberg.spark.source.StreamingOffset(System.currentTimeMillis(), 1L, false), - new org.apache.iceberg.spark.source.StreamingOffset(System.currentTimeMillis(), 2L, false), - new org.apache.iceberg.spark.source.StreamingOffset(System.currentTimeMillis(), 3L, false), - new org.apache.iceberg.spark.source.StreamingOffset(System.currentTimeMillis(), 4L, true) - }; - Assert.assertArrayEquals("StreamingOffsets should match", expected, + org.apache.iceberg.spark.source.StreamingOffset[] expected = + new org.apache.iceberg.spark.source.StreamingOffset[] { + new org.apache.iceberg.spark.source.StreamingOffset( + System.currentTimeMillis(), 1L, false), + new org.apache.iceberg.spark.source.StreamingOffset( + System.currentTimeMillis(), 2L, false), + new org.apache.iceberg.spark.source.StreamingOffset( + System.currentTimeMillis(), 3L, false), + new org.apache.iceberg.spark.source.StreamingOffset(System.currentTimeMillis(), 4L, true) + }; + Assert.assertArrayEquals( + "StreamingOffsets should match", + expected, Arrays.stream(expected) - .map(elem -> org.apache.iceberg.spark.source.StreamingOffset.fromJson(elem.json())).toArray()); + .map(elem -> org.apache.iceberg.spark.source.StreamingOffset.fromJson(elem.json())) + .toArray()); } @Test public void testToJson() throws Exception { - org.apache.iceberg.spark.source.StreamingOffset expected = new org.apache.iceberg.spark.source.StreamingOffset( - System.currentTimeMillis(), 1L, false); + org.apache.iceberg.spark.source.StreamingOffset expected = + new org.apache.iceberg.spark.source.StreamingOffset(System.currentTimeMillis(), 1L, false); ObjectNode actual = 
JsonUtil.mapper().createObjectNode(); actual.put("version", 1); actual.put("snapshot_id", expected.snapshotId()); diff --git a/spark/v2.4/spark/src/test/java/org/apache/iceberg/spark/source/TestStructuredStreaming.java b/spark/v2.4/spark/src/test/java/org/apache/iceberg/spark/source/TestStructuredStreaming.java index da13f65aaa39..56f54e9c384d 100644 --- a/spark/v2.4/spark/src/test/java/org/apache/iceberg/spark/source/TestStructuredStreaming.java +++ b/spark/v2.4/spark/src/test/java/org/apache/iceberg/spark/source/TestStructuredStreaming.java @@ -16,9 +16,10 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.source; +import static org.apache.iceberg.types.Types.NestedField.optional; + import java.io.File; import java.util.List; import org.apache.hadoop.conf.Configuration; @@ -48,28 +49,24 @@ import org.junit.rules.TemporaryFolder; import scala.collection.JavaConversions; -import static org.apache.iceberg.types.Types.NestedField.optional; - public class TestStructuredStreaming { private static final Configuration CONF = new Configuration(); - private static final Schema SCHEMA = new Schema( - optional(1, "id", Types.IntegerType.get()), - optional(2, "data", Types.StringType.get()) - ); + private static final Schema SCHEMA = + new Schema( + optional(1, "id", Types.IntegerType.get()), optional(2, "data", Types.StringType.get())); private static SparkSession spark = null; - @Rule - public TemporaryFolder temp = new TemporaryFolder(); - @Rule - public ExpectedException exceptionRule = ExpectedException.none(); + @Rule public TemporaryFolder temp = new TemporaryFolder(); + @Rule public ExpectedException exceptionRule = ExpectedException.none(); @BeforeClass public static void startSpark() { - TestStructuredStreaming.spark = SparkSession.builder() - .master("local[2]") - .config("spark.sql.shuffle.partitions", 4) - .getOrCreate(); + TestStructuredStreaming.spark = + SparkSession.builder() + .master("local[2]") + .config("spark.sql.shuffle.partitions", 4) + .getOrCreate(); } @AfterClass @@ -89,21 +86,23 @@ public void testStreamingWriteAppendMode() throws Exception { PartitionSpec spec = PartitionSpec.builderFor(SCHEMA).identity("data").build(); Table table = tables.create(SCHEMA, spec, location.toString()); - List expected = Lists.newArrayList( - new SimpleRecord(1, "1"), - new SimpleRecord(2, "2"), - new SimpleRecord(3, "3"), - new SimpleRecord(4, "4") - ); + List expected = + Lists.newArrayList( + new SimpleRecord(1, "1"), + new SimpleRecord(2, "2"), + new SimpleRecord(3, "3"), + new SimpleRecord(4, "4")); MemoryStream inputStream = newMemoryStream(1, spark.sqlContext(), Encoders.INT()); - DataStreamWriter streamWriter = inputStream.toDF() - .selectExpr("value AS id", "CAST (value AS STRING) AS data") - .writeStream() - .outputMode("append") - .format("iceberg") - .option("checkpointLocation", checkpoint.toString()) - .option("path", location.toString()); + DataStreamWriter streamWriter = + inputStream + .toDF() + .selectExpr("value AS id", "CAST (value AS STRING) AS data") + .writeStream() + .outputMode("append") + .format("iceberg") + .option("checkpointLocation", checkpoint.toString()) + .option("path", location.toString()); try { // start the original query with checkpointing @@ -125,10 +124,9 @@ public void testStreamingWriteAppendMode() throws Exception { restartedQuery.processAllAvailable(); // ensure the write was idempotent - Dataset result = spark.read() - .format("iceberg") - .load(location.toString()); - List 
actual = result.orderBy("id").as(Encoders.bean(SimpleRecord.class)).collectAsList(); + Dataset result = spark.read().format("iceberg").load(location.toString()); + List actual = + result.orderBy("id").as(Encoders.bean(SimpleRecord.class)).collectAsList(); Assert.assertEquals("Number of rows should match", expected.size(), actual.size()); Assert.assertEquals("Result rows should match", expected, actual); Assert.assertEquals("Number of snapshots should match", 2, Iterables.size(table.snapshots())); @@ -149,22 +147,22 @@ public void testStreamingWriteCompleteMode() throws Exception { PartitionSpec spec = PartitionSpec.builderFor(SCHEMA).identity("data").build(); Table table = tables.create(SCHEMA, spec, location.toString()); - List expected = Lists.newArrayList( - new SimpleRecord(2, "1"), - new SimpleRecord(3, "2"), - new SimpleRecord(1, "3") - ); + List expected = + Lists.newArrayList( + new SimpleRecord(2, "1"), new SimpleRecord(3, "2"), new SimpleRecord(1, "3")); MemoryStream inputStream = newMemoryStream(1, spark.sqlContext(), Encoders.INT()); - DataStreamWriter streamWriter = inputStream.toDF() - .groupBy("value") - .count() - .selectExpr("CAST(count AS INT) AS id", "CAST (value AS STRING) AS data") - .writeStream() - .outputMode("complete") - .format("iceberg") - .option("checkpointLocation", checkpoint.toString()) - .option("path", location.toString()); + DataStreamWriter streamWriter = + inputStream + .toDF() + .groupBy("value") + .count() + .selectExpr("CAST(count AS INT) AS id", "CAST (value AS STRING) AS data") + .writeStream() + .outputMode("complete") + .format("iceberg") + .option("checkpointLocation", checkpoint.toString()) + .option("path", location.toString()); try { // start the original query with checkpointing @@ -186,10 +184,9 @@ public void testStreamingWriteCompleteMode() throws Exception { restartedQuery.processAllAvailable(); // ensure the write was idempotent - Dataset result = spark.read() - .format("iceberg") - .load(location.toString()); - List actual = result.orderBy("data").as(Encoders.bean(SimpleRecord.class)).collectAsList(); + Dataset result = spark.read().format("iceberg").load(location.toString()); + List actual = + result.orderBy("data").as(Encoders.bean(SimpleRecord.class)).collectAsList(); Assert.assertEquals("Number of rows should match", expected.size(), actual.size()); Assert.assertEquals("Result rows should match", expected, actual); Assert.assertEquals("Number of snapshots should match", 2, Iterables.size(table.snapshots())); @@ -210,22 +207,22 @@ public void testStreamingWriteCompleteModeWithProjection() throws Exception { PartitionSpec spec = PartitionSpec.unpartitioned(); Table table = tables.create(SCHEMA, spec, location.toString()); - List expected = Lists.newArrayList( - new SimpleRecord(1, null), - new SimpleRecord(2, null), - new SimpleRecord(3, null) - ); + List expected = + Lists.newArrayList( + new SimpleRecord(1, null), new SimpleRecord(2, null), new SimpleRecord(3, null)); MemoryStream inputStream = newMemoryStream(1, spark.sqlContext(), Encoders.INT()); - DataStreamWriter streamWriter = inputStream.toDF() - .groupBy("value") - .count() - .selectExpr("CAST(count AS INT) AS id") // select only id column - .writeStream() - .outputMode("complete") - .format("iceberg") - .option("checkpointLocation", checkpoint.toString()) - .option("path", location.toString()); + DataStreamWriter streamWriter = + inputStream + .toDF() + .groupBy("value") + .count() + .selectExpr("CAST(count AS INT) AS id") // select only id column + .writeStream() + 
.outputMode("complete") + .format("iceberg") + .option("checkpointLocation", checkpoint.toString()) + .option("path", location.toString()); try { // start the original query with checkpointing @@ -247,10 +244,9 @@ public void testStreamingWriteCompleteModeWithProjection() throws Exception { restartedQuery.processAllAvailable(); // ensure the write was idempotent - Dataset result = spark.read() - .format("iceberg") - .load(location.toString()); - List actual = result.orderBy("id").as(Encoders.bean(SimpleRecord.class)).collectAsList(); + Dataset result = spark.read().format("iceberg").load(location.toString()); + List actual = + result.orderBy("id").as(Encoders.bean(SimpleRecord.class)).collectAsList(); Assert.assertEquals("Number of rows should match", expected.size(), actual.size()); Assert.assertEquals("Result rows should match", expected, actual); Assert.assertEquals("Number of snapshots should match", 2, Iterables.size(table.snapshots())); @@ -274,13 +270,15 @@ public void testStreamingWriteUpdateMode() throws Exception { tables.create(SCHEMA, spec, location.toString()); MemoryStream inputStream = newMemoryStream(1, spark.sqlContext(), Encoders.INT()); - DataStreamWriter streamWriter = inputStream.toDF() - .selectExpr("value AS id", "CAST (value AS STRING) AS data") - .writeStream() - .outputMode("update") - .format("iceberg") - .option("checkpointLocation", checkpoint.toString()) - .option("path", location.toString()); + DataStreamWriter streamWriter = + inputStream + .toDF() + .selectExpr("value AS id", "CAST (value AS STRING) AS data") + .writeStream() + .outputMode("update") + .format("iceberg") + .option("checkpointLocation", checkpoint.toString()) + .option("path", location.toString()); try { StreamingQuery query = streamWriter.start(); diff --git a/spark/v2.4/spark/src/test/java/org/apache/iceberg/spark/source/TestTables.java b/spark/v2.4/spark/src/test/java/org/apache/iceberg/spark/source/TestTables.java index e61d6ffb9e5e..ef2f73c3803c 100644 --- a/spark/v2.4/spark/src/test/java/org/apache/iceberg/spark/source/TestTables.java +++ b/spark/v2.4/spark/src/test/java/org/apache/iceberg/spark/source/TestTables.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.source; import java.io.File; @@ -42,15 +41,15 @@ // TODO: Use the copy of this from core. 
class TestTables { - private TestTables() { - } + private TestTables() {} static TestTable create(File temp, String name, Schema schema, PartitionSpec spec) { TestTableOperations ops = new TestTableOperations(name); if (ops.current() != null) { throw new AlreadyExistsException("Table %s already exists at location: %s", name, temp); } - ops.commit(null, TableMetadata.newTableMetadata(schema, spec, temp.toString(), ImmutableMap.of())); + ops.commit( + null, TableMetadata.newTableMetadata(schema, spec, temp.toString(), ImmutableMap.of())); return new TestTable(ops, name); } @@ -166,8 +165,8 @@ public FileIO io() { @Override public LocationProvider locationProvider() { - Preconditions.checkNotNull(current, - "Current metadata should not be null when locatinProvider is called"); + Preconditions.checkNotNull( + current, "Current metadata should not be null when locatinProvider is called"); return LocationProviders.locationsFor(current.location(), current.properties()); } diff --git a/spark/v2.4/spark/src/test/java/org/apache/iceberg/spark/source/TestTimestampWithoutZone.java b/spark/v2.4/spark/src/test/java/org/apache/iceberg/spark/source/TestTimestampWithoutZone.java index 20509eef7471..f6cac9e9dd82 100644 --- a/spark/v2.4/spark/src/test/java/org/apache/iceberg/spark/source/TestTimestampWithoutZone.java +++ b/spark/v2.4/spark/src/test/java/org/apache/iceberg/spark/source/TestTimestampWithoutZone.java @@ -16,9 +16,10 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.source; +import static org.apache.iceberg.Files.localOutput; + import java.io.File; import java.io.IOException; import java.time.LocalDateTime; @@ -64,18 +65,16 @@ import org.junit.runner.RunWith; import org.junit.runners.Parameterized; -import static org.apache.iceberg.Files.localOutput; - @RunWith(Parameterized.class) public class TestTimestampWithoutZone extends SparkTestBase { private static final Configuration CONF = new Configuration(); private static final HadoopTables TABLES = new HadoopTables(CONF); - private static final Schema SCHEMA = new Schema( - Types.NestedField.required(1, "id", Types.LongType.get()), - Types.NestedField.optional(2, "ts", Types.TimestampType.withoutZone()), - Types.NestedField.optional(3, "data", Types.StringType.get()) - ); + private static final Schema SCHEMA = + new Schema( + Types.NestedField.required(1, "id", Types.LongType.get()), + Types.NestedField.optional(2, "ts", Types.TimestampType.withoutZone()), + Types.NestedField.optional(3, "data", Types.StringType.get())); private static SparkSession spark = null; @@ -91,8 +90,7 @@ public static void stopSpark() { currentSpark.stop(); } - @Rule - public TemporaryFolder temp = new TemporaryFolder(); + @Rule public TemporaryFolder temp = new TemporaryFolder(); private final String format; private final boolean vectorized; @@ -100,9 +98,9 @@ public static void stopSpark() { @Parameterized.Parameters(name = "format = {0}, vectorized = {1}") public static Object[][] parameters() { return new Object[][] { - { "parquet", false }, - { "parquet", true }, - { "avro", false } + {"parquet", false}, + {"parquet", true}, + {"avro", false} }; } @@ -132,16 +130,17 @@ public void writeUnpartitionedTable() throws IOException { // create records using the table's schema this.records = testRecords(tableSchema); - try (FileAppender writer = new GenericAppenderFactory(tableSchema).newAppender( - localOutput(testFile), fileFormat)) { + try (FileAppender writer = + new 
GenericAppenderFactory(tableSchema).newAppender(localOutput(testFile), fileFormat)) { writer.addAll(records); } - DataFile file = DataFiles.builder(PartitionSpec.unpartitioned()) - .withRecordCount(records.size()) - .withFileSizeInBytes(testFile.length()) - .withPath(testFile.toString()) - .build(); + DataFile file = + DataFiles.builder(PartitionSpec.unpartitioned()) + .withRecordCount(records.size()) + .withFileSizeInBytes(testFile.length()) + .withPath(testFile.toString()) + .build(); table.newAppend().appendFile(file).commit(); } @@ -154,69 +153,89 @@ public void testUnpartitionedTimestampWithoutZone() { @Test public void testUnpartitionedTimestampWithoutZoneProjection() { Schema projection = SCHEMA.select("id", "ts"); - assertEqualsSafe(projection.asStruct(), + assertEqualsSafe( + projection.asStruct(), records.stream().map(r -> projectFlat(projection, r)).collect(Collectors.toList()), read(unpartitioned.toString(), vectorized, "id", "ts")); } - @Rule - public ExpectedException exception = ExpectedException.none(); + @Rule public ExpectedException exception = ExpectedException.none(); @Test public void testUnpartitionedTimestampWithoutZoneError() { - AssertHelpers.assertThrows(String.format("Read operation performed on a timestamp without timezone field while " + - "'%s' set to false should throw exception", - SparkReadOptions.HANDLE_TIMESTAMP_WITHOUT_TIMEZONE), + AssertHelpers.assertThrows( + String.format( + "Read operation performed on a timestamp without timezone field while " + + "'%s' set to false should throw exception", + SparkReadOptions.HANDLE_TIMESTAMP_WITHOUT_TIMEZONE), IllegalArgumentException.class, SparkUtil.TIMESTAMP_WITHOUT_TIMEZONE_ERROR, - () -> spark.read().format("iceberg") - .option(SparkReadOptions.VECTORIZATION_ENABLED, String.valueOf(vectorized)) - .option(SparkReadOptions.HANDLE_TIMESTAMP_WITHOUT_TIMEZONE, "false") - .load(unpartitioned.toString()) - .collectAsList()); + () -> + spark + .read() + .format("iceberg") + .option(SparkReadOptions.VECTORIZATION_ENABLED, String.valueOf(vectorized)) + .option(SparkReadOptions.HANDLE_TIMESTAMP_WITHOUT_TIMEZONE, "false") + .load(unpartitioned.toString()) + .collectAsList()); } @Test public void testUnpartitionedTimestampWithoutZoneAppend() { - spark.read().format("iceberg") - .option(SparkReadOptions.HANDLE_TIMESTAMP_WITHOUT_TIMEZONE, "true") - .option(SparkReadOptions.VECTORIZATION_ENABLED, String.valueOf(vectorized)) - .load(unpartitioned.toString()) - .write() - .format("iceberg") - .option(SparkWriteOptions.HANDLE_TIMESTAMP_WITHOUT_TIMEZONE, "true") - .mode(SaveMode.Append) - .save(unpartitioned.toString()); - - assertEqualsSafe(SCHEMA.asStruct(), - Stream.concat(records.stream(), records.stream()).collect(Collectors.toList()), - read(unpartitioned.toString(), vectorized)); + spark + .read() + .format("iceberg") + .option(SparkReadOptions.HANDLE_TIMESTAMP_WITHOUT_TIMEZONE, "true") + .option(SparkReadOptions.VECTORIZATION_ENABLED, String.valueOf(vectorized)) + .load(unpartitioned.toString()) + .write() + .format("iceberg") + .option(SparkWriteOptions.HANDLE_TIMESTAMP_WITHOUT_TIMEZONE, "true") + .mode(SaveMode.Append) + .save(unpartitioned.toString()); + + assertEqualsSafe( + SCHEMA.asStruct(), + Stream.concat(records.stream(), records.stream()).collect(Collectors.toList()), + read(unpartitioned.toString(), vectorized)); } @Test public void testUnpartitionedTimestampWithoutZoneWriteError() { - String errorMessage = String.format("Write operation performed on a timestamp without timezone field while " + - "'%s' set to 
false should throw exception", + String errorMessage = + String.format( + "Write operation performed on a timestamp without timezone field while " + + "'%s' set to false should throw exception", SparkSQLProperties.HANDLE_TIMESTAMP_WITHOUT_TIMEZONE); - Runnable writeOperation = () -> spark.read().format("iceberg") - .option(SparkReadOptions.HANDLE_TIMESTAMP_WITHOUT_TIMEZONE, "true") - .option(SparkReadOptions.VECTORIZATION_ENABLED, String.valueOf(vectorized)) - .load(unpartitioned.toString()) - .write() - .format("iceberg") - .option(SparkWriteOptions.HANDLE_TIMESTAMP_WITHOUT_TIMEZONE, "false") - .mode(SaveMode.Append) - .save(unpartitioned.toString()); - - AssertHelpers.assertThrows(errorMessage, IllegalArgumentException.class, - SparkUtil.TIMESTAMP_WITHOUT_TIMEZONE_ERROR, writeOperation); - + Runnable writeOperation = + () -> + spark + .read() + .format("iceberg") + .option(SparkReadOptions.HANDLE_TIMESTAMP_WITHOUT_TIMEZONE, "true") + .option(SparkReadOptions.VECTORIZATION_ENABLED, String.valueOf(vectorized)) + .load(unpartitioned.toString()) + .write() + .format("iceberg") + .option(SparkWriteOptions.HANDLE_TIMESTAMP_WITHOUT_TIMEZONE, "false") + .mode(SaveMode.Append) + .save(unpartitioned.toString()); + + AssertHelpers.assertThrows( + errorMessage, + IllegalArgumentException.class, + SparkUtil.TIMESTAMP_WITHOUT_TIMEZONE_ERROR, + writeOperation); } @Test public void testUnpartitionedTimestampWithoutZoneSessionProperties() { - withSQLConf(ImmutableMap.of(SparkSQLProperties.HANDLE_TIMESTAMP_WITHOUT_TIMEZONE, "true"), () -> { - spark.read().format("iceberg") + withSQLConf( + ImmutableMap.of(SparkSQLProperties.HANDLE_TIMESTAMP_WITHOUT_TIMEZONE, "true"), + () -> { + spark + .read() + .format("iceberg") .option(SparkReadOptions.VECTORIZATION_ENABLED, String.valueOf(vectorized)) .load(unpartitioned.toString()) .write() @@ -224,10 +243,11 @@ public void testUnpartitionedTimestampWithoutZoneSessionProperties() { .mode(SaveMode.Append) .save(unpartitioned.toString()); - assertEqualsSafe(SCHEMA.asStruct(), + assertEqualsSafe( + SCHEMA.asStruct(), Stream.concat(records.stream(), records.stream()).collect(Collectors.toList()), read(unpartitioned.toString(), vectorized)); - }); + }); } private static Record projectFlat(Schema projection, Record record) { @@ -240,8 +260,8 @@ private static Record projectFlat(Schema projection, Record record) { return result; } - public static void assertEqualsSafe(Types.StructType struct, - List expected, List actual) { + public static void assertEqualsSafe( + Types.StructType struct, List expected, List actual) { Assert.assertEquals("Number of results should match expected", expected.size(), actual.size()); for (int i = 0; i < expected.size(); i += 1) { GenericsHelpers.assertEqualsSafe(struct, expected.get(i), actual.get(i)); @@ -259,20 +279,23 @@ private List testRecords(Schema schema) { record(schema, 6L, parseToLocal("2017-12-21T21:55:30.589712"), "element"), record(schema, 7L, parseToLocal("2017-12-21T17:31:14.532797"), "limited"), record(schema, 8L, parseToLocal("2017-12-21T15:21:51.237521"), "global"), - record(schema, 9L, parseToLocal("2017-12-21T15:02:15.230570"), "goldfish") - ); + record(schema, 9L, parseToLocal("2017-12-21T15:02:15.230570"), "goldfish")); } private static List read(String table, boolean vectorized) { return read(table, vectorized, "*"); } - private static List read(String table, boolean vectorized, String select0, String... 
selectN) { - Dataset dataset = spark.read().format("iceberg") - .option(SparkReadOptions.VECTORIZATION_ENABLED, String.valueOf(vectorized)) - .option(SparkReadOptions.HANDLE_TIMESTAMP_WITHOUT_TIMEZONE, "true") - .load(table) - .select(select0, selectN); + private static List read( + String table, boolean vectorized, String select0, String... selectN) { + Dataset dataset = + spark + .read() + .format("iceberg") + .option(SparkReadOptions.VECTORIZATION_ENABLED, String.valueOf(vectorized)) + .option(SparkReadOptions.HANDLE_TIMESTAMP_WITHOUT_TIMEZONE, "true") + .load(table) + .select(select0, selectN); return dataset.collectAsList(); } diff --git a/spark/v2.4/spark/src/test/java/org/apache/iceberg/spark/source/TestWriteMetricsConfig.java b/spark/v2.4/spark/src/test/java/org/apache/iceberg/spark/source/TestWriteMetricsConfig.java index 7ed71031f3f2..9bf00f1b1365 100644 --- a/spark/v2.4/spark/src/test/java/org/apache/iceberg/spark/source/TestWriteMetricsConfig.java +++ b/spark/v2.4/spark/src/test/java/org/apache/iceberg/spark/source/TestWriteMetricsConfig.java @@ -16,9 +16,12 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.source; +import static org.apache.iceberg.spark.SparkSchemaUtil.convert; +import static org.apache.iceberg.types.Types.NestedField.optional; +import static org.apache.iceberg.types.Types.NestedField.required; + import java.io.IOException; import java.nio.ByteBuffer; import java.util.List; @@ -53,28 +56,24 @@ import org.junit.Test; import org.junit.rules.TemporaryFolder; -import static org.apache.iceberg.spark.SparkSchemaUtil.convert; -import static org.apache.iceberg.types.Types.NestedField.optional; -import static org.apache.iceberg.types.Types.NestedField.required; - public class TestWriteMetricsConfig { private static final Configuration CONF = new Configuration(); - private static final Schema SIMPLE_SCHEMA = new Schema( - optional(1, "id", Types.IntegerType.get()), - optional(2, "data", Types.StringType.get()) - ); - private static final Schema COMPLEX_SCHEMA = new Schema( - required(1, "longCol", Types.IntegerType.get()), - optional(2, "strCol", Types.StringType.get()), - required(3, "record", Types.StructType.of( - required(4, "id", Types.IntegerType.get()), - required(5, "data", Types.StringType.get()) - )) - ); - - @Rule - public TemporaryFolder temp = new TemporaryFolder(); + private static final Schema SIMPLE_SCHEMA = + new Schema( + optional(1, "id", Types.IntegerType.get()), optional(2, "data", Types.StringType.get())); + private static final Schema COMPLEX_SCHEMA = + new Schema( + required(1, "longCol", Types.IntegerType.get()), + optional(2, "strCol", Types.StringType.get()), + required( + 3, + "record", + Types.StructType.of( + required(4, "id", Types.IntegerType.get()), + required(5, "data", Types.StringType.get())))); + + @Rule public TemporaryFolder temp = new TemporaryFolder(); private static SparkSession spark = null; private static JavaSparkContext sc = null; @@ -103,11 +102,9 @@ public void testFullMetricsCollectionForParquet() throws IOException { properties.put(TableProperties.DEFAULT_WRITE_METRICS_MODE, "full"); Table table = tables.create(SIMPLE_SCHEMA, spec, properties, tableLocation); - List expectedRecords = Lists.newArrayList( - new SimpleRecord(1, "a"), - new SimpleRecord(2, "b"), - new SimpleRecord(3, "c") - ); + List expectedRecords = + Lists.newArrayList( + new SimpleRecord(1, "a"), new SimpleRecord(2, "b"), new SimpleRecord(3, "c")); Dataset df = 
spark.createDataFrame(expectedRecords, SimpleRecord.class); df.select("id", "data") .coalesce(1) @@ -136,11 +133,9 @@ public void testCountMetricsCollectionForParquet() throws IOException { properties.put(TableProperties.DEFAULT_WRITE_METRICS_MODE, "counts"); Table table = tables.create(SIMPLE_SCHEMA, spec, properties, tableLocation); - List expectedRecords = Lists.newArrayList( - new SimpleRecord(1, "a"), - new SimpleRecord(2, "b"), - new SimpleRecord(3, "c") - ); + List expectedRecords = + Lists.newArrayList( + new SimpleRecord(1, "a"), new SimpleRecord(2, "b"), new SimpleRecord(3, "c")); Dataset df = spark.createDataFrame(expectedRecords, SimpleRecord.class); df.select("id", "data") .coalesce(1) @@ -169,11 +164,9 @@ public void testNoMetricsCollectionForParquet() throws IOException { properties.put(TableProperties.DEFAULT_WRITE_METRICS_MODE, "none"); Table table = tables.create(SIMPLE_SCHEMA, spec, properties, tableLocation); - List expectedRecords = Lists.newArrayList( - new SimpleRecord(1, "a"), - new SimpleRecord(2, "b"), - new SimpleRecord(3, "c") - ); + List expectedRecords = + Lists.newArrayList( + new SimpleRecord(1, "a"), new SimpleRecord(2, "b"), new SimpleRecord(3, "c")); Dataset df = spark.createDataFrame(expectedRecords, SimpleRecord.class); df.select("id", "data") .coalesce(1) @@ -203,11 +196,9 @@ public void testCustomMetricCollectionForParquet() throws IOException { properties.put("write.metadata.metrics.column.id", "full"); Table table = tables.create(SIMPLE_SCHEMA, spec, properties, tableLocation); - List expectedRecords = Lists.newArrayList( - new SimpleRecord(1, "a"), - new SimpleRecord(2, "b"), - new SimpleRecord(3, "c") - ); + List expectedRecords = + Lists.newArrayList( + new SimpleRecord(1, "a"), new SimpleRecord(2, "b"), new SimpleRecord(3, "c")); Dataset df = spark.createDataFrame(expectedRecords, SimpleRecord.class); df.select("id", "data") .coalesce(1) @@ -240,7 +231,8 @@ public void testBadCustomMetricCollectionForParquet() throws IOException { properties.put(TableProperties.DEFAULT_WRITE_METRICS_MODE, "counts"); properties.put("write.metadata.metrics.column.ids", "full"); - AssertHelpers.assertThrows("Creating a table with invalid metrics should fail", + AssertHelpers.assertThrows( + "Creating a table with invalid metrics should fail", ValidationException.class, null, () -> tables.create(SIMPLE_SCHEMA, spec, properties, tableLocation)); @@ -251,9 +243,7 @@ public void testCustomMetricCollectionForNestedParquet() throws IOException { String tableLocation = temp.newFolder("iceberg-table").toString(); HadoopTables tables = new HadoopTables(CONF); - PartitionSpec spec = PartitionSpec.builderFor(COMPLEX_SCHEMA) - .identity("strCol") - .build(); + PartitionSpec spec = PartitionSpec.builderFor(COMPLEX_SCHEMA).identity("strCol").build(); Map properties = Maps.newHashMap(); properties.put(TableProperties.DEFAULT_WRITE_METRICS_MODE, "none"); properties.put("write.metadata.metrics.column.longCol", "counts"); @@ -263,9 +253,11 @@ public void testCustomMetricCollectionForNestedParquet() throws IOException { Iterable rows = RandomData.generateSpark(COMPLEX_SCHEMA, 10, 0); JavaRDD rdd = sc.parallelize(Lists.newArrayList(rows)); - Dataset df = spark.internalCreateDataFrame(JavaRDD.toRDD(rdd), convert(COMPLEX_SCHEMA), false); + Dataset df = + spark.internalCreateDataFrame(JavaRDD.toRDD(rdd), convert(COMPLEX_SCHEMA), false); - df.coalesce(1).write() + df.coalesce(1) + .write() .format("iceberg") .option(SparkWriteOptions.WRITE_FORMAT, "parquet") .mode(SaveMode.Append) diff 
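Aside: TestWriteMetricsConfig drives metrics collection entirely through table properties, a table-wide default mode plus optional per-column overrides. A minimal sketch of that configuration, assuming a HadoopTables catalog and a hypothetical local path in place of the test's TemporaryFolder:

import java.util.Map;
import org.apache.hadoop.conf.Configuration;
import org.apache.iceberg.PartitionSpec;
import org.apache.iceberg.Schema;
import org.apache.iceberg.Table;
import org.apache.iceberg.TableProperties;
import org.apache.iceberg.hadoop.HadoopTables;
import org.apache.iceberg.relocated.com.google.common.collect.Maps;
import org.apache.iceberg.types.Types;

public class WriteMetricsConfigSketch {
  public static void main(String[] args) {
    Schema schema =
        new Schema(
            Types.NestedField.optional(1, "id", Types.IntegerType.get()),
            Types.NestedField.optional(2, "data", Types.StringType.get()));

    Map<String, String> properties = Maps.newHashMap();
    // Table-wide default mode: none, counts, truncate(16) or full.
    properties.put(TableProperties.DEFAULT_WRITE_METRICS_MODE, "counts");
    // Per-column override, as in testCustomMetricCollectionForParquet.
    properties.put("write.metadata.metrics.column.id", "full");

    HadoopTables tables = new HadoopTables(new Configuration());
    Table table =
        tables.create(
            schema,
            PartitionSpec.unpartitioned(),
            properties,
            "/tmp/metrics-sketch"); // hypothetical location
    System.out.println(table.properties());
  }
}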
--git a/spark/v2.4/spark/src/test/java/org/apache/iceberg/spark/source/ThreeColumnRecord.java b/spark/v2.4/spark/src/test/java/org/apache/iceberg/spark/source/ThreeColumnRecord.java index 684dfbb255c7..554557df416c 100644 --- a/spark/v2.4/spark/src/test/java/org/apache/iceberg/spark/source/ThreeColumnRecord.java +++ b/spark/v2.4/spark/src/test/java/org/apache/iceberg/spark/source/ThreeColumnRecord.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.source; import java.util.Objects; @@ -26,8 +25,7 @@ public class ThreeColumnRecord { private String c2; private String c3; - public ThreeColumnRecord() { - } + public ThreeColumnRecord() {} public ThreeColumnRecord(Integer c1, String c2, String c3) { this.c1 = c1; @@ -68,9 +66,9 @@ public boolean equals(Object o) { return false; } ThreeColumnRecord that = (ThreeColumnRecord) o; - return Objects.equals(c1, that.c1) && - Objects.equals(c2, that.c2) && - Objects.equals(c3, that.c3); + return Objects.equals(c1, that.c1) + && Objects.equals(c2, that.c2) + && Objects.equals(c3, that.c3); } @Override @@ -80,10 +78,6 @@ public int hashCode() { @Override public String toString() { - return "ThreeColumnRecord{" + - "c1=" + c1 + - ", c2='" + c2 + '\'' + - ", c3='" + c3 + '\'' + - '}'; + return "ThreeColumnRecord{" + "c1=" + c1 + ", c2='" + c2 + '\'' + ", c3='" + c3 + '\'' + '}'; } } diff --git a/spark/v3.0/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/Employee.java b/spark/v3.0/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/Employee.java index 51ac57855bbe..8918dfec6584 100644 --- a/spark/v3.0/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/Employee.java +++ b/spark/v3.0/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/Employee.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.extensions; import java.util.Objects; @@ -25,8 +24,7 @@ public class Employee { private Integer id; private String dep; - public Employee() { - } + public Employee() {} public Employee(Integer id, String dep) { this.id = id; diff --git a/spark/v3.0/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/SparkExtensionsTestBase.java b/spark/v3.0/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/SparkExtensionsTestBase.java index 36ca608ccd3b..0a1cf7520463 100644 --- a/spark/v3.0/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/SparkExtensionsTestBase.java +++ b/spark/v3.0/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/SparkExtensionsTestBase.java @@ -16,9 +16,10 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.spark.extensions; +import static org.apache.hadoop.hive.conf.HiveConf.ConfVars.METASTOREURIS; + import java.util.Map; import org.apache.iceberg.CatalogUtil; import org.apache.iceberg.hive.HiveCatalog; @@ -30,11 +31,10 @@ import org.apache.spark.sql.internal.SQLConf; import org.junit.BeforeClass; -import static org.apache.hadoop.hive.conf.HiveConf.ConfVars.METASTOREURIS; - public abstract class SparkExtensionsTestBase extends SparkCatalogTestBase { - public SparkExtensionsTestBase(String catalogName, String implementation, Map config) { + public SparkExtensionsTestBase( + String catalogName, String implementation, Map config) { super(catalogName, implementation, config); } @@ -44,18 +44,21 @@ public static void startMetastoreAndSpark() { metastore.start(); SparkTestBase.hiveConf = metastore.hiveConf(); - SparkTestBase.spark = SparkSession.builder() - .master("local[2]") - .config("spark.testing", "true") - .config(SQLConf.PARTITION_OVERWRITE_MODE().key(), "dynamic") - .config("spark.sql.extensions", IcebergSparkSessionExtensions.class.getName()) - .config("spark.hadoop." + METASTOREURIS.varname, hiveConf.get(METASTOREURIS.varname)) - .config("spark.sql.shuffle.partitions", "4") - .config("spark.sql.hive.metastorePartitionPruningFallbackOnException", "true") - .enableHiveSupport() - .getOrCreate(); + SparkTestBase.spark = + SparkSession.builder() + .master("local[2]") + .config("spark.testing", "true") + .config(SQLConf.PARTITION_OVERWRITE_MODE().key(), "dynamic") + .config("spark.sql.extensions", IcebergSparkSessionExtensions.class.getName()) + .config("spark.hadoop." + METASTOREURIS.varname, hiveConf.get(METASTOREURIS.varname)) + .config("spark.sql.shuffle.partitions", "4") + .config("spark.sql.hive.metastorePartitionPruningFallbackOnException", "true") + .enableHiveSupport() + .getOrCreate(); - SparkTestBase.catalog = (HiveCatalog) - CatalogUtil.loadCatalog(HiveCatalog.class.getName(), "hive", ImmutableMap.of(), hiveConf); + SparkTestBase.catalog = + (HiveCatalog) + CatalogUtil.loadCatalog( + HiveCatalog.class.getName(), "hive", ImmutableMap.of(), hiveConf); } } diff --git a/spark/v3.0/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/SparkRowLevelOperationsTestBase.java b/spark/v3.0/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/SparkRowLevelOperationsTestBase.java index 1dc0bd4c9c77..37f6dc37d580 100644 --- a/spark/v3.0/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/SparkRowLevelOperationsTestBase.java +++ b/spark/v3.0/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/SparkRowLevelOperationsTestBase.java @@ -16,9 +16,15 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.spark.extensions; +import static org.apache.iceberg.TableProperties.DEFAULT_FILE_FORMAT; +import static org.apache.iceberg.TableProperties.PARQUET_VECTORIZATION_ENABLED; +import static org.apache.iceberg.TableProperties.WRITE_DISTRIBUTION_MODE; +import static org.apache.iceberg.TableProperties.WRITE_DISTRIBUTION_MODE_HASH; +import static org.apache.iceberg.TableProperties.WRITE_DISTRIBUTION_MODE_NONE; +import static org.apache.iceberg.TableProperties.WRITE_DISTRIBUTION_MODE_RANGE; + import java.util.Arrays; import java.util.List; import java.util.Map; @@ -39,13 +45,6 @@ import org.junit.runners.Parameterized; import org.junit.runners.Parameterized.Parameters; -import static org.apache.iceberg.TableProperties.DEFAULT_FILE_FORMAT; -import static org.apache.iceberg.TableProperties.PARQUET_VECTORIZATION_ENABLED; -import static org.apache.iceberg.TableProperties.WRITE_DISTRIBUTION_MODE; -import static org.apache.iceberg.TableProperties.WRITE_DISTRIBUTION_MODE_HASH; -import static org.apache.iceberg.TableProperties.WRITE_DISTRIBUTION_MODE_NONE; -import static org.apache.iceberg.TableProperties.WRITE_DISTRIBUTION_MODE_RANGE; - @RunWith(Parameterized.class) public abstract class SparkRowLevelOperationsTestBase extends SparkExtensionsTestBase { @@ -55,49 +54,58 @@ public abstract class SparkRowLevelOperationsTestBase extends SparkExtensionsTes protected final boolean vectorized; protected final String distributionMode; - public SparkRowLevelOperationsTestBase(String catalogName, String implementation, - Map config, String fileFormat, - boolean vectorized, - String distributionMode) { + public SparkRowLevelOperationsTestBase( + String catalogName, + String implementation, + Map config, + String fileFormat, + boolean vectorized, + String distributionMode) { super(catalogName, implementation, config); this.fileFormat = fileFormat; this.vectorized = vectorized; this.distributionMode = distributionMode; } - @Parameters(name = "catalogName = {0}, implementation = {1}, config = {2}," + - " format = {3}, vectorized = {4}, distributionMode = {5}") + @Parameters( + name = + "catalogName = {0}, implementation = {1}, config = {2}," + + " format = {3}, vectorized = {4}, distributionMode = {5}") public static Object[][] parameters() { return new Object[][] { - { "testhive", SparkCatalog.class.getName(), - ImmutableMap.of( - "type", "hive", - "default-namespace", "default" - ), - "orc", - true, - WRITE_DISTRIBUTION_MODE_NONE - }, - { "testhadoop", SparkCatalog.class.getName(), - ImmutableMap.of( - "type", "hadoop" + { + "testhive", + SparkCatalog.class.getName(), + ImmutableMap.of( + "type", "hive", + "default-namespace", "default"), + "orc", + true, + WRITE_DISTRIBUTION_MODE_NONE + }, + { + "testhadoop", + SparkCatalog.class.getName(), + ImmutableMap.of("type", "hadoop"), + "parquet", + RANDOM.nextBoolean(), + WRITE_DISTRIBUTION_MODE_HASH + }, + { + "spark_catalog", + SparkSessionCatalog.class.getName(), + ImmutableMap.of( + "type", "hive", + "default-namespace", "default", + "clients", "1", + "parquet-enabled", "false", + "cache-enabled", + "false" // Spark will delete tables using v1, leaving the cache out of sync ), - "parquet", - RANDOM.nextBoolean(), - WRITE_DISTRIBUTION_MODE_HASH - }, - { "spark_catalog", SparkSessionCatalog.class.getName(), - ImmutableMap.of( - "type", "hive", - "default-namespace", "default", - "clients", "1", - "parquet-enabled", "false", - "cache-enabled", "false" // Spark will delete tables using v1, leaving the cache out of sync - ), - "avro", - false, 
- WRITE_DISTRIBUTION_MODE_RANGE - } + "avro", + false, + WRITE_DISTRIBUTION_MODE_RANGE + } }; } @@ -105,11 +113,15 @@ public static Object[][] parameters() { protected void initTable() { sql("ALTER TABLE %s SET TBLPROPERTIES('%s' '%s')", tableName, DEFAULT_FILE_FORMAT, fileFormat); - sql("ALTER TABLE %s SET TBLPROPERTIES('%s' '%s')", tableName, WRITE_DISTRIBUTION_MODE, distributionMode); + sql( + "ALTER TABLE %s SET TBLPROPERTIES('%s' '%s')", + tableName, WRITE_DISTRIBUTION_MODE, distributionMode); switch (fileFormat) { case "parquet": - sql("ALTER TABLE %s SET TBLPROPERTIES('%s' '%b')", tableName, PARQUET_VECTORIZATION_ENABLED, vectorized); + sql( + "ALTER TABLE %s SET TBLPROPERTIES('%s' '%b')", + tableName, PARQUET_VECTORIZATION_ENABLED, vectorized); break; case "orc": Assert.assertTrue(vectorized); @@ -120,9 +132,10 @@ protected void initTable() { } Map props = extraTableProperties(); - props.forEach((prop, value) -> { - sql("ALTER TABLE %s SET TBLPROPERTIES('%s' '%s')", tableName, prop, value); - }); + props.forEach( + (prop, value) -> { + sql("ALTER TABLE %s SET TBLPROPERTIES('%s' '%s')", tableName, prop, value); + }); } protected void createAndInitTable(String schema) { @@ -170,9 +183,10 @@ protected void createOrReplaceView(String name, List data, Encoder enc } private Dataset toDS(String schema, String jsonData) { - List jsonRows = Arrays.stream(jsonData.split("\n")) - .filter(str -> str.trim().length() > 0) - .collect(Collectors.toList()); + List jsonRows = + Arrays.stream(jsonData.split("\n")) + .filter(str -> str.trim().length() > 0) + .collect(Collectors.toList()); Dataset jsonDS = spark.createDataset(jsonRows, Encoders.STRING()); if (schema != null) { @@ -182,16 +196,23 @@ private Dataset toDS(String schema, String jsonData) { } } - protected void validateSnapshot(Snapshot snapshot, String operation, String changedPartitionCount, - String deletedDataFiles, String addedDataFiles) { + protected void validateSnapshot( + Snapshot snapshot, + String operation, + String changedPartitionCount, + String deletedDataFiles, + String addedDataFiles) { Assert.assertEquals("Operation must match", operation, snapshot.operation()); - Assert.assertEquals("Changed partitions count must match", + Assert.assertEquals( + "Changed partitions count must match", changedPartitionCount, snapshot.summary().get("changed-partition-count")); - Assert.assertEquals("Deleted data files count must match", + Assert.assertEquals( + "Deleted data files count must match", deletedDataFiles, snapshot.summary().get("deleted-data-files")); - Assert.assertEquals("Added data files count must match", + Assert.assertEquals( + "Added data files count must match", addedDataFiles, snapshot.summary().get("added-data-files")); } diff --git a/spark/v3.0/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestAddFilesProcedure.java b/spark/v3.0/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestAddFilesProcedure.java index 215f15a4765a..046590cd0dbd 100644 --- a/spark/v3.0/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestAddFilesProcedure.java +++ b/spark/v3.0/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestAddFilesProcedure.java @@ -16,9 +16,10 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.spark.extensions; +import static org.apache.iceberg.types.Types.NestedField.optional; + import java.io.File; import java.io.IOException; import java.util.List; @@ -59,19 +60,17 @@ import org.junit.Test; import org.junit.rules.TemporaryFolder; -import static org.apache.iceberg.types.Types.NestedField.optional; - public class TestAddFilesProcedure extends SparkExtensionsTestBase { private final String sourceTableName = "source_table"; private File fileTableDir; - public TestAddFilesProcedure(String catalogName, String implementation, Map config) { + public TestAddFilesProcedure( + String catalogName, String implementation, Map config) { super(catalogName, implementation, config); } - @Rule - public TemporaryFolder temp = new TemporaryFolder(); + @Rule public TemporaryFolder temp = new TemporaryFolder(); @Before public void setupTempDirs() { @@ -97,12 +96,15 @@ public void addDataUnpartitioned() { sql(createIceberg, tableName); - Object result = scalarSql("CALL %s.system.add_files('%s', '`parquet`.`%s`')", - catalogName, tableName, fileTableDir.getAbsolutePath()); + Object result = + scalarSql( + "CALL %s.system.add_files('%s', '`parquet`.`%s`')", + catalogName, tableName, fileTableDir.getAbsolutePath()); Assert.assertEquals(2L, result); - assertEquals("Iceberg table contains correct data", + assertEquals( + "Iceberg table contains correct data", sql("SELECT * FROM %s ORDER BY id", sourceTableName), sql("SELECT * FROM %s ORDER BY id", tableName)); } @@ -116,12 +118,15 @@ public void addDataUnpartitionedOrc() { sql(createIceberg, tableName); - Object result = scalarSql("CALL %s.system.add_files('%s', '`orc`.`%s`')", - catalogName, tableName, fileTableDir.getAbsolutePath()); + Object result = + scalarSql( + "CALL %s.system.add_files('%s', '`orc`.`%s`')", + catalogName, tableName, fileTableDir.getAbsolutePath()); Assert.assertEquals(2L, result); - assertEquals("Iceberg table contains correct data", + assertEquals( + "Iceberg table contains correct data", sql("SELECT * FROM %s ORDER BY id", sourceTableName), sql("SELECT * FROM %s ORDER BY id", tableName)); } @@ -134,10 +139,12 @@ public void addAvroFile() throws Exception { // Create an Avro file - Schema schema = SchemaBuilder.record("record").fields() - .requiredInt("id") - .requiredString("data") - .endRecord(); + Schema schema = + SchemaBuilder.record("record") + .fields() + .requiredInt("id") + .requiredString("data") + .endRecord(); GenericRecord record1 = new GenericData.Record(schema); record1.put("id", 1L); record1.put("data", "a"); @@ -153,30 +160,30 @@ public void addAvroFile() throws Exception { dataFileWriter.append(record2); dataFileWriter.close(); - String createIceberg = - "CREATE TABLE %s (id Long, data String) USING iceberg"; + String createIceberg = "CREATE TABLE %s (id Long, data String) USING iceberg"; sql(createIceberg, tableName); - Object result = scalarSql("CALL %s.system.add_files('%s', '`avro`.`%s`')", - catalogName, tableName, outputFile.getPath()); + Object result = + scalarSql( + "CALL %s.system.add_files('%s', '`avro`.`%s`')", + catalogName, tableName, outputFile.getPath()); Assert.assertEquals(1L, result); - List expected = Lists.newArrayList( - new Object[]{1L, "a"}, - new Object[]{2L, "b"} - ); + List expected = Lists.newArrayList(new Object[] {1L, "a"}, new Object[] {2L, "b"}); - assertEquals("Iceberg table contains correct data", + assertEquals( + "Iceberg table contains correct data", expected, sql("SELECT * FROM %s ORDER BY id", tableName)); - List actualRecordCount = 
sql("select %s from %s.files", - DataFile.RECORD_COUNT.name(), - tableName); + List actualRecordCount = + sql("select %s from %s.files", DataFile.RECORD_COUNT.name(), tableName); List expectedRecordCount = Lists.newArrayList(); - expectedRecordCount.add(new Object[]{2L}); - assertEquals("Iceberg file metadata should have correct metadata count", - expectedRecordCount, actualRecordCount); + expectedRecordCount.add(new Object[] {2L}); + assertEquals( + "Iceberg file metadata should have correct metadata count", + expectedRecordCount, + actualRecordCount); } // TODO Adding spark-avro doesn't work in tests @@ -189,12 +196,15 @@ public void addDataUnpartitionedAvro() { sql(createIceberg, tableName); - Object result = scalarSql("CALL %s.system.add_files('%s', '`avro`.`%s`')", - catalogName, tableName, fileTableDir.getAbsolutePath()); + Object result = + scalarSql( + "CALL %s.system.add_files('%s', '`avro`.`%s`')", + catalogName, tableName, fileTableDir.getAbsolutePath()); Assert.assertEquals(2L, result); - assertEquals("Iceberg table contains correct data", + assertEquals( + "Iceberg table contains correct data", sql("SELECT * FROM %s ORDER BY id", sourceTableName), sql("SELECT * FROM %s ORDER BY id", tableName)); } @@ -208,12 +218,13 @@ public void addDataUnpartitionedHive() { sql(createIceberg, tableName); - Object result = scalarSql("CALL %s.system.add_files('%s', '%s')", - catalogName, tableName, sourceTableName); + Object result = + scalarSql("CALL %s.system.add_files('%s', '%s')", catalogName, tableName, sourceTableName); Assert.assertEquals(2L, result); - assertEquals("Iceberg table contains correct data", + assertEquals( + "Iceberg table contains correct data", sql("SELECT * FROM %s ORDER BY id", sourceTableName), sql("SELECT * FROM %s ORDER BY id", tableName)); } @@ -227,12 +238,15 @@ public void addDataUnpartitionedExtraCol() { sql(createIceberg, tableName); - Object result = scalarSql("CALL %s.system.add_files('%s', '`parquet`.`%s`')", - catalogName, tableName, fileTableDir.getAbsolutePath()); + Object result = + scalarSql( + "CALL %s.system.add_files('%s', '`parquet`.`%s`')", + catalogName, tableName, fileTableDir.getAbsolutePath()); Assert.assertEquals(2L, result); - assertEquals("Iceberg table contains correct data", + assertEquals( + "Iceberg table contains correct data", sql("SELECT * FROM %s ORDER BY id", sourceTableName), sql("SELECT id, name, dept, subdept FROM %s ORDER BY id", tableName)); } @@ -241,17 +255,19 @@ public void addDataUnpartitionedExtraCol() { public void addDataUnpartitionedMissingCol() { createUnpartitionedFileTable("parquet"); - String createIceberg = - "CREATE TABLE %s (id Integer, name String, dept String) USING iceberg"; + String createIceberg = "CREATE TABLE %s (id Integer, name String, dept String) USING iceberg"; sql(createIceberg, tableName); - Object result = scalarSql("CALL %s.system.add_files('%s', '`parquet`.`%s`')", - catalogName, tableName, fileTableDir.getAbsolutePath()); + Object result = + scalarSql( + "CALL %s.system.add_files('%s', '`parquet`.`%s`')", + catalogName, tableName, fileTableDir.getAbsolutePath()); Assert.assertEquals(2L, result); - assertEquals("Iceberg table contains correct data", + assertEquals( + "Iceberg table contains correct data", sql("SELECT id, name, dept FROM %s ORDER BY id", sourceTableName), sql("SELECT * FROM %s ORDER BY id", tableName)); } @@ -265,12 +281,15 @@ public void addDataPartitionedMissingCol() { sql(createIceberg, tableName); - Object result = scalarSql("CALL %s.system.add_files('%s', '`parquet`.`%s`')", 
- catalogName, tableName, fileTableDir.getAbsolutePath()); + Object result = + scalarSql( + "CALL %s.system.add_files('%s', '`parquet`.`%s`')", + catalogName, tableName, fileTableDir.getAbsolutePath()); Assert.assertEquals(8L, result); - assertEquals("Iceberg table contains correct data", + assertEquals( + "Iceberg table contains correct data", sql("SELECT id, name, dept FROM %s ORDER BY id", sourceTableName), sql("SELECT * FROM %s ORDER BY id", tableName)); } @@ -284,17 +303,20 @@ public void addDataPartitioned() { sql(createIceberg, tableName); - Object result = scalarSql("CALL %s.system.add_files('%s', '`parquet`.`%s`')", - catalogName, tableName, fileTableDir.getAbsolutePath()); + Object result = + scalarSql( + "CALL %s.system.add_files('%s', '`parquet`.`%s`')", + catalogName, tableName, fileTableDir.getAbsolutePath()); Assert.assertEquals(8L, result); - assertEquals("Iceberg table contains correct data", + assertEquals( + "Iceberg table contains correct data", sql("SELECT id, name, dept, subdept FROM %s ORDER BY id", sourceTableName), sql("SELECT id, name, dept, subdept FROM %s ORDER BY id", tableName)); } - @Ignore // TODO Classpath issues prevent us from actually writing to a Spark ORC table + @Ignore // TODO Classpath issues prevent us from actually writing to a Spark ORC table public void addDataPartitionedOrc() { createPartitionedFileTable("orc"); @@ -303,12 +325,15 @@ public void addDataPartitionedOrc() { sql(createIceberg, tableName); - Object result = scalarSql("CALL %s.system.add_files('%s', '`parquet`.`%s`')", - catalogName, tableName, fileTableDir.getAbsolutePath()); + Object result = + scalarSql( + "CALL %s.system.add_files('%s', '`parquet`.`%s`')", + catalogName, tableName, fileTableDir.getAbsolutePath()); Assert.assertEquals(8L, result); - assertEquals("Iceberg table contains correct data", + assertEquals( + "Iceberg table contains correct data", sql("SELECT id, name, dept, subdept FROM %s ORDER BY id", sourceTableName), sql("SELECT id, name, dept, subdept FROM %s ORDER BY id", tableName)); } @@ -323,12 +348,15 @@ public void addDataPartitionedAvro() { sql(createIceberg, tableName); - Object result = scalarSql("CALL %s.system.add_files('%s', '`avro`.`%s`')", - catalogName, tableName, fileTableDir.getAbsolutePath()); + Object result = + scalarSql( + "CALL %s.system.add_files('%s', '`avro`.`%s`')", + catalogName, tableName, fileTableDir.getAbsolutePath()); Assert.assertEquals(8L, result); - assertEquals("Iceberg table contains correct data", + assertEquals( + "Iceberg table contains correct data", sql("SELECT id, name, dept, subdept FROM %s ORDER BY id", sourceTableName), sql("SELECT id, name, dept, subdept FROM %s ORDER BY id", tableName)); } @@ -342,12 +370,13 @@ public void addDataPartitionedHive() { sql(createIceberg, tableName); - Object result = scalarSql("CALL %s.system.add_files('%s', '%s')", - catalogName, tableName, sourceTableName); + Object result = + scalarSql("CALL %s.system.add_files('%s', '%s')", catalogName, tableName, sourceTableName); Assert.assertEquals(8L, result); - assertEquals("Iceberg table contains correct data", + assertEquals( + "Iceberg table contains correct data", sql("SELECT id, name, dept, subdept FROM %s ORDER BY id", sourceTableName), sql("SELECT id, name, dept, subdept FROM %s ORDER BY id", tableName)); } @@ -361,12 +390,15 @@ public void addPartitionToPartitioned() { sql(createIceberg, tableName); - Object result = scalarSql("CALL %s.system.add_files('%s', '`parquet`.`%s`', map('id', 1))", - catalogName, tableName, 
fileTableDir.getAbsolutePath()); + Object result = + scalarSql( + "CALL %s.system.add_files('%s', '`parquet`.`%s`', map('id', 1))", + catalogName, tableName, fileTableDir.getAbsolutePath()); Assert.assertEquals(2L, result); - assertEquals("Iceberg table contains correct data", + assertEquals( + "Iceberg table contains correct data", sql("SELECT id, name, dept, subdept FROM %s WHERE id = 1 ORDER BY id", sourceTableName), sql("SELECT id, name, dept, subdept FROM %s ORDER BY id", tableName)); } @@ -376,17 +408,20 @@ public void addFilteredPartitionsToPartitioned() { createCompositePartitionedTable("parquet"); String createIceberg = - "CREATE TABLE %s (id Integer, name String, dept String, subdept String) USING iceberg " + - "PARTITIONED BY (id, dept)"; + "CREATE TABLE %s (id Integer, name String, dept String, subdept String) USING iceberg " + + "PARTITIONED BY (id, dept)"; sql(createIceberg, tableName); - Object result = scalarSql("CALL %s.system.add_files('%s', '`parquet`.`%s`', map('id', 1))", - catalogName, tableName, fileTableDir.getAbsolutePath()); + Object result = + scalarSql( + "CALL %s.system.add_files('%s', '`parquet`.`%s`', map('id', 1))", + catalogName, tableName, fileTableDir.getAbsolutePath()); Assert.assertEquals(2L, result); - assertEquals("Iceberg table contains correct data", + assertEquals( + "Iceberg table contains correct data", sql("SELECT id, name, dept, subdept FROM %s WHERE id = 1 ORDER BY id", sourceTableName), sql("SELECT id, name, dept, subdept FROM %s ORDER BY id", tableName)); } @@ -396,18 +431,23 @@ public void addFilteredPartitionsToPartitioned2() { createCompositePartitionedTable("parquet"); String createIceberg = - "CREATE TABLE %s (id Integer, name String, dept String, subdept String) USING iceberg " + - "PARTITIONED BY (id, dept)"; + "CREATE TABLE %s (id Integer, name String, dept String, subdept String) USING iceberg " + + "PARTITIONED BY (id, dept)"; sql(createIceberg, tableName); - Object result = scalarSql("CALL %s.system.add_files('%s', '`parquet`.`%s`', map('dept', 'hr'))", - catalogName, tableName, fileTableDir.getAbsolutePath()); + Object result = + scalarSql( + "CALL %s.system.add_files('%s', '`parquet`.`%s`', map('dept', 'hr'))", + catalogName, tableName, fileTableDir.getAbsolutePath()); Assert.assertEquals(6L, result); - assertEquals("Iceberg table contains correct data", - sql("SELECT id, name, dept, subdept FROM %s WHERE dept = 'hr' ORDER BY id", sourceTableName), + assertEquals( + "Iceberg table contains correct data", + sql( + "SELECT id, name, dept, subdept FROM %s WHERE dept = 'hr' ORDER BY id", + sourceTableName), sql("SELECT id, name, dept, subdept FROM %s ORDER BY id", tableName)); } @@ -416,17 +456,20 @@ public void addFilteredPartitionsToPartitionedWithNullValueFilteringOnId() { createCompositePartitionedTableWithNullValueInPartitionColumn("parquet"); String createIceberg = - "CREATE TABLE %s (id Integer, name String, dept String, subdept String) USING iceberg " + - "PARTITIONED BY (id, dept)"; + "CREATE TABLE %s (id Integer, name String, dept String, subdept String) USING iceberg " + + "PARTITIONED BY (id, dept)"; sql(createIceberg, tableName); - Object result = scalarSql("CALL %s.system.add_files('%s', '`parquet`.`%s`', map('id', 1))", - catalogName, tableName, fileTableDir.getAbsolutePath()); + Object result = + scalarSql( + "CALL %s.system.add_files('%s', '`parquet`.`%s`', map('id', 1))", + catalogName, tableName, fileTableDir.getAbsolutePath()); Assert.assertEquals(2L, result); - assertEquals("Iceberg table contains correct data", 
+ assertEquals( + "Iceberg table contains correct data", sql("SELECT id, name, dept, subdept FROM %s WHERE id = 1 ORDER BY id", sourceTableName), sql("SELECT id, name, dept, subdept FROM %s ORDER BY id", tableName)); } @@ -436,18 +479,23 @@ public void addFilteredPartitionsToPartitionedWithNullValueFilteringOnDept() { createCompositePartitionedTableWithNullValueInPartitionColumn("parquet"); String createIceberg = - "CREATE TABLE %s (id Integer, name String, dept String, subdept String) USING iceberg " + - "PARTITIONED BY (id, dept)"; + "CREATE TABLE %s (id Integer, name String, dept String, subdept String) USING iceberg " + + "PARTITIONED BY (id, dept)"; sql(createIceberg, tableName); - Object result = scalarSql("CALL %s.system.add_files('%s', '`parquet`.`%s`', map('dept', 'hr'))", - catalogName, tableName, fileTableDir.getAbsolutePath()); + Object result = + scalarSql( + "CALL %s.system.add_files('%s', '`parquet`.`%s`', map('dept', 'hr'))", + catalogName, tableName, fileTableDir.getAbsolutePath()); Assert.assertEquals(6L, result); - assertEquals("Iceberg table contains correct data", - sql("SELECT id, name, dept, subdept FROM %s WHERE dept = 'hr' ORDER BY id", sourceTableName), + assertEquals( + "Iceberg table contains correct data", + sql( + "SELECT id, name, dept, subdept FROM %s WHERE dept = 'hr' ORDER BY id", + sourceTableName), sql("SELECT id, name, dept, subdept FROM %s ORDER BY id", tableName)); } @@ -456,13 +504,15 @@ public void addWeirdCaseHiveTable() { createWeirdCaseTable(); String createIceberg = - "CREATE TABLE %s (id Integer, `naMe` String, dept String, subdept String) USING iceberg " + - "PARTITIONED BY (`naMe`)"; + "CREATE TABLE %s (id Integer, `naMe` String, dept String, subdept String) USING iceberg " + + "PARTITIONED BY (`naMe`)"; sql(createIceberg, tableName); - Object result = scalarSql("CALL %s.system.add_files('%s', '%s', map('naMe', 'John Doe'))", - catalogName, tableName, sourceTableName); + Object result = + scalarSql( + "CALL %s.system.add_files('%s', '%s', map('naMe', 'John Doe'))", + catalogName, tableName, sourceTableName); Assert.assertEquals(2L, result); @@ -472,22 +522,30 @@ public void addWeirdCaseHiveTable() { Spark does not actually handle this pushdown correctly for hive based tables and it returns 0 records */ List expected = - sql("SELECT id, `naMe`, dept, subdept from %s ORDER BY id", sourceTableName) - .stream() + sql("SELECT id, `naMe`, dept, subdept from %s ORDER BY id", sourceTableName).stream() .filter(r -> r[1].equals("John Doe")) .collect(Collectors.toList()); // TODO when this assert breaks Spark fixed the pushdown issue - Assert.assertEquals("If this assert breaks it means that Spark has fixed the pushdown issue", 0, - sql("SELECT id, `naMe`, dept, subdept from %s WHERE `naMe` = 'John Doe' ORDER BY id", sourceTableName) + Assert.assertEquals( + "If this assert breaks it means that Spark has fixed the pushdown issue", + 0, + sql( + "SELECT id, `naMe`, dept, subdept from %s WHERE `naMe` = 'John Doe' ORDER BY id", + sourceTableName) .size()); // Pushdown works for iceberg - Assert.assertEquals("We should be able to pushdown mixed case partition keys", 2, - sql("SELECT id, `naMe`, dept, subdept FROM %s WHERE `naMe` = 'John Doe' ORDER BY id", tableName) + Assert.assertEquals( + "We should be able to pushdown mixed case partition keys", + 2, + sql( + "SELECT id, `naMe`, dept, subdept FROM %s WHERE `naMe` = 'John Doe' ORDER BY id", + tableName) .size()); - assertEquals("Iceberg table contains correct data", + assertEquals( + "Iceberg table 
contains correct data", expected, sql("SELECT id, `naMe`, dept, subdept FROM %s ORDER BY id", tableName)); } @@ -501,12 +559,15 @@ public void addPartitionToPartitionedHive() { sql(createIceberg, tableName); - Object result = scalarSql("CALL %s.system.add_files('%s', '%s', map('id', 1))", - catalogName, tableName, sourceTableName); + Object result = + scalarSql( + "CALL %s.system.add_files('%s', '%s', map('id', 1))", + catalogName, tableName, sourceTableName); Assert.assertEquals(2L, result); - assertEquals("Iceberg table contains correct data", + assertEquals( + "Iceberg table contains correct data", sql("SELECT id, name, dept, subdept FROM %s WHERE id = 1 ORDER BY id", sourceTableName), sql("SELECT id, name, dept, subdept FROM %s ORDER BY id", tableName)); } @@ -520,19 +581,23 @@ public void invalidDataImport() { sql(createIceberg, tableName); - AssertHelpers.assertThrows("Should forbid adding of partitioned data to unpartitioned table", + AssertHelpers.assertThrows( + "Should forbid adding of partitioned data to unpartitioned table", IllegalArgumentException.class, "Cannot use partition filter with an unpartitioned table", - () -> scalarSql("CALL %s.system.add_files('%s', '`parquet`.`%s`', map('id', 1))", - catalogName, tableName, fileTableDir.getAbsolutePath()) - ); + () -> + scalarSql( + "CALL %s.system.add_files('%s', '`parquet`.`%s`', map('id', 1))", + catalogName, tableName, fileTableDir.getAbsolutePath())); - AssertHelpers.assertThrows("Should forbid adding of partitioned data to unpartitioned table", + AssertHelpers.assertThrows( + "Should forbid adding of partitioned data to unpartitioned table", IllegalArgumentException.class, "Cannot add partitioned files to an unpartitioned table", - () -> scalarSql("CALL %s.system.add_files('%s', '`parquet`.`%s`')", - catalogName, tableName, fileTableDir.getAbsolutePath()) - ); + () -> + scalarSql( + "CALL %s.system.add_files('%s', '`parquet`.`%s`')", + catalogName, tableName, fileTableDir.getAbsolutePath())); } @Test @@ -544,20 +609,25 @@ public void invalidDataImportPartitioned() { sql(createIceberg, tableName); - AssertHelpers.assertThrows("Should forbid adding with a mismatching partition spec", + AssertHelpers.assertThrows( + "Should forbid adding with a mismatching partition spec", IllegalArgumentException.class, "is greater than the number of partitioned columns", - () -> scalarSql("CALL %s.system.add_files('%s', '`parquet`.`%s`', map('x', '1', 'y', '2'))", - catalogName, tableName, fileTableDir.getAbsolutePath())); + () -> + scalarSql( + "CALL %s.system.add_files('%s', '`parquet`.`%s`', map('x', '1', 'y', '2'))", + catalogName, tableName, fileTableDir.getAbsolutePath())); - AssertHelpers.assertThrows("Should forbid adding with partition spec with incorrect columns", + AssertHelpers.assertThrows( + "Should forbid adding with partition spec with incorrect columns", IllegalArgumentException.class, "specified partition filter refers to columns that are not partitioned", - () -> scalarSql("CALL %s.system.add_files('%s', '`parquet`.`%s`', map('dept', '2'))", - catalogName, tableName, fileTableDir.getAbsolutePath())); + () -> + scalarSql( + "CALL %s.system.add_files('%s', '`parquet`.`%s`', map('dept', '2'))", + catalogName, tableName, fileTableDir.getAbsolutePath())); } - @Test public void addTwice() { createPartitionedHiveTable(); @@ -567,24 +637,30 @@ public void addTwice() { sql(createIceberg, tableName); - Object result1 = scalarSql("CALL %s.system.add_files(" + - "table => '%s', " + - "source_table => '%s', " + - "partition_filter => 
map('id', 1))", - catalogName, tableName, sourceTableName); + Object result1 = + scalarSql( + "CALL %s.system.add_files(" + + "table => '%s', " + + "source_table => '%s', " + + "partition_filter => map('id', 1))", + catalogName, tableName, sourceTableName); Assert.assertEquals(2L, result1); - Object result2 = scalarSql("CALL %s.system.add_files(" + - "table => '%s', " + - "source_table => '%s', " + - "partition_filter => map('id', 2))", - catalogName, tableName, sourceTableName); + Object result2 = + scalarSql( + "CALL %s.system.add_files(" + + "table => '%s', " + + "source_table => '%s', " + + "partition_filter => map('id', 2))", + catalogName, tableName, sourceTableName); Assert.assertEquals(2L, result2); - assertEquals("Iceberg table contains correct data", + assertEquals( + "Iceberg table contains correct data", sql("SELECT id, name, dept, subdept FROM %s WHERE id = 1 ORDER BY id", sourceTableName), sql("SELECT id, name, dept, subdept FROM %s WHERE id = 1 ORDER BY id", tableName)); - assertEquals("Iceberg table contains correct data", + assertEquals( + "Iceberg table contains correct data", sql("SELECT id, name, dept, subdept FROM %s WHERE id = 2 ORDER BY id", sourceTableName), sql("SELECT id, name, dept, subdept FROM %s WHERE id = 2 ORDER BY id", tableName)); } @@ -598,21 +674,25 @@ public void duplicateDataPartitioned() { sql(createIceberg, tableName); - scalarSql("CALL %s.system.add_files(" + - "table => '%s', " + - "source_table => '%s', " + - "partition_filter => map('id', 1))", + scalarSql( + "CALL %s.system.add_files(" + + "table => '%s', " + + "source_table => '%s', " + + "partition_filter => map('id', 1))", catalogName, tableName, sourceTableName); - AssertHelpers.assertThrows("Should not allow adding duplicate files", + AssertHelpers.assertThrows( + "Should not allow adding duplicate files", IllegalStateException.class, - "Cannot complete import because data files to be imported already" + - " exist within the target table", - () -> scalarSql("CALL %s.system.add_files(" + - "table => '%s', " + - "source_table => '%s', " + - "partition_filter => map('id', 1))", - catalogName, tableName, sourceTableName)); + "Cannot complete import because data files to be imported already" + + " exist within the target table", + () -> + scalarSql( + "CALL %s.system.add_files(" + + "table => '%s', " + + "source_table => '%s', " + + "partition_filter => map('id', 1))", + catalogName, tableName, sourceTableName)); } @Test @@ -624,27 +704,33 @@ public void duplicateDataPartitionedAllowed() { sql(createIceberg, tableName); - Object result1 = scalarSql("CALL %s.system.add_files(" + - "table => '%s', " + - "source_table => '%s', " + - "partition_filter => map('id', 1))", - catalogName, tableName, sourceTableName); + Object result1 = + scalarSql( + "CALL %s.system.add_files(" + + "table => '%s', " + + "source_table => '%s', " + + "partition_filter => map('id', 1))", + catalogName, tableName, sourceTableName); Assert.assertEquals(2L, result1); - Object result2 = scalarSql("CALL %s.system.add_files(" + - "table => '%s', " + - "source_table => '%s', " + - "partition_filter => map('id', 1)," + - "check_duplicate_files => false)", - catalogName, tableName, sourceTableName); + Object result2 = + scalarSql( + "CALL %s.system.add_files(" + + "table => '%s', " + + "source_table => '%s', " + + "partition_filter => map('id', 1)," + + "check_duplicate_files => false)", + catalogName, tableName, sourceTableName); Assert.assertEquals(2L, result2); - - assertEquals("Iceberg table contains correct data", - 
sql("SELECT id, name, dept, subdept FROM %s WHERE id = 1 UNION ALL " + - "SELECT id, name, dept, subdept FROM %s WHERE id = 1", sourceTableName, sourceTableName), + assertEquals( + "Iceberg table contains correct data", + sql( + "SELECT id, name, dept, subdept FROM %s WHERE id = 1 UNION ALL " + + "SELECT id, name, dept, subdept FROM %s WHERE id = 1", + sourceTableName, sourceTableName), sql("SELECT id, name, dept, subdept FROM %s", tableName, tableName)); } @@ -657,15 +743,16 @@ public void duplicateDataUnpartitioned() { sql(createIceberg, tableName); - scalarSql("CALL %s.system.add_files('%s', '%s')", - catalogName, tableName, sourceTableName); + scalarSql("CALL %s.system.add_files('%s', '%s')", catalogName, tableName, sourceTableName); - AssertHelpers.assertThrows("Should not allow adding duplicate files", + AssertHelpers.assertThrows( + "Should not allow adding duplicate files", IllegalStateException.class, - "Cannot complete import because data files to be imported already" + - " exist within the target table", - () -> scalarSql("CALL %s.system.add_files('%s', '%s')", - catalogName, tableName, sourceTableName)); + "Cannot complete import because data files to be imported already" + + " exist within the target table", + () -> + scalarSql( + "CALL %s.system.add_files('%s', '%s')", catalogName, tableName, sourceTableName)); } @Test @@ -677,23 +764,25 @@ public void duplicateDataUnpartitionedAllowed() { sql(createIceberg, tableName); - Object result1 = scalarSql("CALL %s.system.add_files('%s', '%s')", - catalogName, tableName, sourceTableName); + Object result1 = + scalarSql("CALL %s.system.add_files('%s', '%s')", catalogName, tableName, sourceTableName); Assert.assertEquals(2L, result1); - Object result2 = scalarSql("CALL %s.system.add_files(" + - "table => '%s', " + - "source_table => '%s'," + - "check_duplicate_files => false)", - catalogName, tableName, sourceTableName); + Object result2 = + scalarSql( + "CALL %s.system.add_files(" + + "table => '%s', " + + "source_table => '%s'," + + "check_duplicate_files => false)", + catalogName, tableName, sourceTableName); Assert.assertEquals(2L, result2); - assertEquals("Iceberg table contains correct data", - sql("SELECT * FROM (SELECT * FROM %s UNION ALL " + - "SELECT * from %s) ORDER BY id", sourceTableName, sourceTableName), + assertEquals( + "Iceberg table contains correct data", + sql( + "SELECT * FROM (SELECT * FROM %s UNION ALL " + "SELECT * from %s) ORDER BY id", + sourceTableName, sourceTableName), sql("SELECT * FROM %s ORDER BY id", tableName)); - - } @Test @@ -706,30 +795,34 @@ public void addOrcFileWithDoubleAndFloatColumns() throws Exception { File outputFile = temp.newFile("test.orc"); final int numRows = 5; List expectedRecords = createOrcFile(outputFile, numRows); - String createIceberg = - "CREATE TABLE %s (x float, y double, z long) USING iceberg"; + String createIceberg = "CREATE TABLE %s (x float, y double, z long) USING iceberg"; sql(createIceberg, tableName); - Object result = scalarSql("CALL %s.system.add_files('%s', '`orc`.`%s`')", - catalogName, tableName, outputFile.getPath()); + Object result = + scalarSql( + "CALL %s.system.add_files('%s', '`orc`.`%s`')", + catalogName, tableName, outputFile.getPath()); Assert.assertEquals(1L, result); - List expected = expectedRecords.stream() - .map(record -> new Object[]{record.get(0), record.get(1), record.get(2)}) - .collect(Collectors.toList()); + List expected = + expectedRecords.stream() + .map(record -> new Object[] {record.get(0), record.get(1), record.get(2)}) + 
.collect(Collectors.toList()); // x goes 2.00, 1.99, 1.98, ... - assertEquals("Iceberg table contains correct data", + assertEquals( + "Iceberg table contains correct data", expected, sql("SELECT * FROM %s ORDER BY x DESC", tableName)); - List actualRecordCount = sql("select %s from %s.files", - DataFile.RECORD_COUNT.name(), - tableName); + List actualRecordCount = + sql("select %s from %s.files", DataFile.RECORD_COUNT.name(), tableName); List expectedRecordCount = Lists.newArrayList(); - expectedRecordCount.add(new Object[]{(long) numRows}); - assertEquals("Iceberg file metadata should have correct metadata count", - expectedRecordCount, actualRecordCount); + expectedRecordCount.add(new Object[] {(long) numRows}); + assertEquals( + "Iceberg file metadata should have correct metadata count", + expectedRecordCount, + actualRecordCount); } @Test @@ -740,21 +833,26 @@ public void testEmptyImportDoesNotThrow() { sql(createIceberg, tableName); // Empty path based import - Object pathResult = scalarSql("CALL %s.system.add_files('%s', '`parquet`.`%s`')", - catalogName, tableName, fileTableDir.getAbsolutePath()); + Object pathResult = + scalarSql( + "CALL %s.system.add_files('%s', '`parquet`.`%s`')", + catalogName, tableName, fileTableDir.getAbsolutePath()); Assert.assertEquals(0L, pathResult); - assertEquals("Iceberg table contains no added data when importing from an empty path", + assertEquals( + "Iceberg table contains no added data when importing from an empty path", emptyQueryResult, sql("SELECT * FROM %s ORDER BY id", tableName)); // Empty table based import - String createHive = "CREATE TABLE %s (id Integer, name String, dept String, subdept String) STORED AS parquet"; + String createHive = + "CREATE TABLE %s (id Integer, name String, dept String, subdept String) STORED AS parquet"; sql(createHive, sourceTableName); - Object tableResult = scalarSql("CALL %s.system.add_files('%s', '%s')", - catalogName, tableName, sourceTableName); + Object tableResult = + scalarSql("CALL %s.system.add_files('%s', '%s')", catalogName, tableName, sourceTableName); Assert.assertEquals(0L, tableResult); - assertEquals("Iceberg table contains no added data when importing from an empty table", + assertEquals( + "Iceberg table contains no added data when importing from an empty table", emptyQueryResult, sql("SELECT * FROM %s ORDER BY id", tableName)); } @@ -765,22 +863,26 @@ public void testPartitionedImportFromEmptyPartitionDoesNotThrow() { final int emptyPartitionId = 999; // Add an empty partition to the hive table - sql("ALTER TABLE %s ADD PARTITION (id = '%d') LOCATION '%d'", sourceTableName, - emptyPartitionId, emptyPartitionId); + sql( + "ALTER TABLE %s ADD PARTITION (id = '%d') LOCATION '%d'", + sourceTableName, emptyPartitionId, emptyPartitionId); String createIceberg = "CREATE TABLE %s (id Integer, name String, dept String, subdept String) USING iceberg PARTITIONED BY (id)"; sql(createIceberg, tableName); - Object tableResult = scalarSql("CALL %s.system.add_files(" + - "table => '%s', " + - "source_table => '%s', " + - "partition_filter => map('id', %d))", - catalogName, tableName, sourceTableName, emptyPartitionId); + Object tableResult = + scalarSql( + "CALL %s.system.add_files(" + + "table => '%s', " + + "source_table => '%s', " + + "partition_filter => map('id', %d))", + catalogName, tableName, sourceTableName, emptyPartitionId); Assert.assertEquals(0L, tableResult); - assertEquals("Iceberg table contains no added data when importing from an empty table", + assertEquals( + "Iceberg table contains no 
added data when importing from an empty table", emptyQueryResult, sql("SELECT * FROM %s ORDER BY id", tableName)); } @@ -788,26 +890,28 @@ public void testPartitionedImportFromEmptyPartitionDoesNotThrow() { private static final List emptyQueryResult = Lists.newArrayList(); private static final StructField[] struct = { - new StructField("id", DataTypes.IntegerType, true, Metadata.empty()), - new StructField("name", DataTypes.StringType, true, Metadata.empty()), - new StructField("dept", DataTypes.StringType, true, Metadata.empty()), - new StructField("subdept", DataTypes.StringType, true, Metadata.empty()) + new StructField("id", DataTypes.IntegerType, true, Metadata.empty()), + new StructField("name", DataTypes.StringType, true, Metadata.empty()), + new StructField("dept", DataTypes.StringType, true, Metadata.empty()), + new StructField("subdept", DataTypes.StringType, true, Metadata.empty()) }; private static final Dataset unpartitionedDF = - spark.createDataFrame( - ImmutableList.of( - RowFactory.create(1, "John Doe", "hr", "communications"), - RowFactory.create(2, "Jane Doe", "hr", "salary"), - RowFactory.create(3, "Matt Doe", "hr", "communications"), - RowFactory.create(4, "Will Doe", "facilities", "all")), - new StructType(struct)).repartition(1); + spark + .createDataFrame( + ImmutableList.of( + RowFactory.create(1, "John Doe", "hr", "communications"), + RowFactory.create(2, "Jane Doe", "hr", "salary"), + RowFactory.create(3, "Matt Doe", "hr", "communications"), + RowFactory.create(4, "Will Doe", "facilities", "all")), + new StructType(struct)) + .repartition(1); private static final Dataset singleNullRecordDF = - spark.createDataFrame( - ImmutableList.of( - RowFactory.create(null, null, null, null)), - new StructType(struct)).repartition(1); + spark + .createDataFrame( + ImmutableList.of(RowFactory.create(null, null, null, null)), new StructType(struct)) + .repartition(1); private static final Dataset partitionedDF = unpartitionedDF.select("name", "dept", "subdept", "id"); @@ -825,8 +929,7 @@ public void testPartitionedImportFromEmptyPartitionDoesNotThrow() { unpartitionedDF.col("dept"), unpartitionedDF.col("name").as("naMe")); - - private void createUnpartitionedFileTable(String format) { + private void createUnpartitionedFileTable(String format) { String createParquet = "CREATE TABLE %s (id Integer, name String, dept String, subdept String) USING %s LOCATION '%s'"; @@ -835,10 +938,10 @@ private void createUnpartitionedFileTable(String format) { unpartitionedDF.write().insertInto(sourceTableName); } - private void createPartitionedFileTable(String format) { + private void createPartitionedFileTable(String format) { String createParquet = - "CREATE TABLE %s (id Integer, name String, dept String, subdept String) USING %s PARTITIONED BY (id) " + - "LOCATION '%s'"; + "CREATE TABLE %s (id Integer, name String, dept String, subdept String) USING %s PARTITIONED BY (id) " + + "LOCATION '%s'"; sql(createParquet, sourceTableName, format, fileTableDir.getAbsolutePath()); @@ -847,8 +950,9 @@ private void createPartitionedFileTable(String format) { } private void createCompositePartitionedTable(String format) { - String createParquet = "CREATE TABLE %s (id Integer, name String, dept String, subdept String) USING %s " + - "PARTITIONED BY (id, dept) LOCATION '%s'"; + String createParquet = + "CREATE TABLE %s (id Integer, name String, dept String, subdept String) USING %s " + + "PARTITIONED BY (id, dept) LOCATION '%s'"; sql(createParquet, sourceTableName, format, fileTableDir.getAbsolutePath()); 
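Aside: TestAddFilesProcedure exercises both invocation styles of the add_files procedure, positional arguments and named arguments, along with the partition_filter and check_duplicate_files options. A compact sketch of the statements the tests build, with hypothetical catalog, table, and path names standing in for the fixtures:

public class AddFilesCallSketch {
  public static void main(String[] args) {
    // Positional form: import every parquet file under a path-based "table".
    String positional =
        String.format(
            "CALL %s.system.add_files('%s', '`parquet`.`%s`')",
            "my_catalog", "db.target_tbl", "/data/staged");

    // Named form: import a single Hive partition and skip the duplicate-file check.
    String named =
        String.format(
            "CALL %s.system.add_files("
                + "table => '%s', "
                + "source_table => '%s', "
                + "partition_filter => map('id', 1), "
                + "check_duplicate_files => false)",
            "my_catalog", "db.target_tbl", "db.hive_source");

    // The procedure returns the number of data files added, which the tests
    // read back through scalarSql(...) and compare against a long literal.
    System.out.println(positional);
    System.out.println(named);
  }
}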
compositePartitionedDF.write().insertInto(sourceTableName); @@ -856,13 +960,16 @@ private void createCompositePartitionedTable(String format) { } private void createCompositePartitionedTableWithNullValueInPartitionColumn(String format) { - String createParquet = "CREATE TABLE %s (id Integer, name String, dept String, subdept String) USING %s " + - "PARTITIONED BY (id, dept) LOCATION '%s'"; + String createParquet = + "CREATE TABLE %s (id Integer, name String, dept String, subdept String) USING %s " + + "PARTITIONED BY (id, dept) LOCATION '%s'"; sql(createParquet, sourceTableName, format, fileTableDir.getAbsolutePath()); - Dataset unionedDF = compositePartitionedDF.unionAll(compositePartitionedNullRecordDF) - .select("name", "subdept", "id", "dept") - .repartition(1); + Dataset unionedDF = + compositePartitionedDF + .unionAll(compositePartitionedNullRecordDF) + .select("name", "subdept", "id", "dept") + .repartition(1); unionedDF.write().insertInto(sourceTableName); unionedDF.write().insertInto(sourceTableName); @@ -870,18 +977,18 @@ private void createCompositePartitionedTableWithNullValueInPartitionColumn(Strin private void createWeirdCaseTable() { String createParquet = - "CREATE TABLE %s (id Integer, subdept String, dept String) " + - "PARTITIONED BY (`naMe` String) STORED AS parquet"; + "CREATE TABLE %s (id Integer, subdept String, dept String) " + + "PARTITIONED BY (`naMe` String) STORED AS parquet"; sql(createParquet, sourceTableName); weirdColumnNamesDF.write().insertInto(sourceTableName); weirdColumnNamesDF.write().insertInto(sourceTableName); - } private void createUnpartitionedHiveTable() { - String createHive = "CREATE TABLE %s (id Integer, name String, dept String, subdept String) STORED AS parquet"; + String createHive = + "CREATE TABLE %s (id Integer, name String, dept String, subdept String) STORED AS parquet"; sql(createHive, sourceTableName); @@ -890,8 +997,9 @@ private void createUnpartitionedHiveTable() { } private void createPartitionedHiveTable() { - String createHive = "CREATE TABLE %s (name String, dept String, subdept String) " + - "PARTITIONED BY (id Integer) STORED AS parquet"; + String createHive = + "CREATE TABLE %s (name String, dept String, subdept String) " + + "PARTITIONED BY (id Integer) STORED AS parquet"; sql(createHive, sourceTableName); @@ -905,11 +1013,11 @@ public List createOrcFile(File orcFile, int numRows) throws IOException if (orcFile.exists()) { orcFile.delete(); } - final org.apache.iceberg.Schema icebergSchema = new org.apache.iceberg.Schema( - optional(1, "x", Types.FloatType.get()), - optional(2, "y", Types.DoubleType.get()), - optional(3, "z", Types.LongType.get()) - ); + final org.apache.iceberg.Schema icebergSchema = + new org.apache.iceberg.Schema( + optional(1, "x", Types.FloatType.get()), + optional(2, "y", Types.DoubleType.get()), + optional(3, "z", Types.LongType.get())); List records = Lists.newArrayListWithExpectedSize(numRows); for (int i = 0; i < numRows; i += 1) { @@ -921,11 +1029,18 @@ public List createOrcFile(File orcFile, int numRows) throws IOException } OutputFile outFile = Files.localOutput(orcFile); - try (FileAppender appender = org.apache.iceberg.orc.ORC.write(outFile) - .schema(icebergSchema) - .metricsConfig(MetricsConfig.fromProperties(ImmutableMap.of("write.metadata.metrics.default", "none"))) - .createWriterFunc(GenericOrcWriter::buildWriter) - .build()) { + try (FileAppender appender = + org.apache + .iceberg + .orc + .ORC + .write(outFile) + .schema(icebergSchema) + .metricsConfig( + MetricsConfig.fromProperties( 
+ ImmutableMap.of("write.metadata.metrics.default", "none"))) + .createWriterFunc(GenericOrcWriter::buildWriter) + .build()) { appender.addAll(records); } return records; diff --git a/spark/v3.0/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestAlterTablePartitionFields.java b/spark/v3.0/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestAlterTablePartitionFields.java index 9d630508b6e4..8aee7c97752f 100644 --- a/spark/v3.0/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestAlterTablePartitionFields.java +++ b/spark/v3.0/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestAlterTablePartitionFields.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.extensions; import java.util.Map; @@ -31,7 +30,8 @@ import org.junit.Test; public class TestAlterTablePartitionFields extends SparkExtensionsTestBase { - public TestAlterTablePartitionFields(String catalogName, String implementation, Map config) { + public TestAlterTablePartitionFields( + String catalogName, String implementation, Map config) { super(catalogName, implementation, config); } @@ -42,7 +42,9 @@ public void removeTable() { @Test public void testAddIdentityPartition() { - sql("CREATE TABLE %s (id bigint NOT NULL, category string, ts timestamp, data string) USING iceberg", tableName); + sql( + "CREATE TABLE %s (id bigint NOT NULL, category string, ts timestamp, data string) USING iceberg", + tableName); Table table = validationCatalog.loadTable(tableIdent); Assert.assertTrue("Table should start unpartitioned", table.spec().isUnpartitioned()); @@ -51,17 +53,17 @@ public void testAddIdentityPartition() { table.refresh(); - PartitionSpec expected = PartitionSpec.builderFor(table.schema()) - .withSpecId(1) - .identity("category") - .build(); + PartitionSpec expected = + PartitionSpec.builderFor(table.schema()).withSpecId(1).identity("category").build(); Assert.assertEquals("Should have new spec field", expected, table.spec()); } @Test public void testAddBucketPartition() { - sql("CREATE TABLE %s (id bigint NOT NULL, category string, ts timestamp, data string) USING iceberg", tableName); + sql( + "CREATE TABLE %s (id bigint NOT NULL, category string, ts timestamp, data string) USING iceberg", + tableName); Table table = validationCatalog.loadTable(tableIdent); Assert.assertTrue("Table should start unpartitioned", table.spec().isUnpartitioned()); @@ -70,17 +72,20 @@ public void testAddBucketPartition() { table.refresh(); - PartitionSpec expected = PartitionSpec.builderFor(table.schema()) - .withSpecId(1) - .bucket("id", 16, "id_bucket_16") - .build(); + PartitionSpec expected = + PartitionSpec.builderFor(table.schema()) + .withSpecId(1) + .bucket("id", 16, "id_bucket_16") + .build(); Assert.assertEquals("Should have new spec field", expected, table.spec()); } @Test public void testAddTruncatePartition() { - sql("CREATE TABLE %s (id bigint NOT NULL, category string, ts timestamp, data string) USING iceberg", tableName); + sql( + "CREATE TABLE %s (id bigint NOT NULL, category string, ts timestamp, data string) USING iceberg", + tableName); Table table = validationCatalog.loadTable(tableIdent); Assert.assertTrue("Table should start unpartitioned", table.spec().isUnpartitioned()); @@ -89,17 +94,20 @@ public void testAddTruncatePartition() { table.refresh(); - PartitionSpec expected = PartitionSpec.builderFor(table.schema()) - .withSpecId(1) - .truncate("data", 4, "data_trunc_4") - 
.build(); + PartitionSpec expected = + PartitionSpec.builderFor(table.schema()) + .withSpecId(1) + .truncate("data", 4, "data_trunc_4") + .build(); Assert.assertEquals("Should have new spec field", expected, table.spec()); } @Test public void testAddYearsPartition() { - sql("CREATE TABLE %s (id bigint NOT NULL, category string, ts timestamp, data string) USING iceberg", tableName); + sql( + "CREATE TABLE %s (id bigint NOT NULL, category string, ts timestamp, data string) USING iceberg", + tableName); Table table = validationCatalog.loadTable(tableIdent); Assert.assertTrue("Table should start unpartitioned", table.spec().isUnpartitioned()); @@ -108,17 +116,17 @@ public void testAddYearsPartition() { table.refresh(); - PartitionSpec expected = PartitionSpec.builderFor(table.schema()) - .withSpecId(1) - .year("ts") - .build(); + PartitionSpec expected = + PartitionSpec.builderFor(table.schema()).withSpecId(1).year("ts").build(); Assert.assertEquals("Should have new spec field", expected, table.spec()); } @Test public void testAddMonthsPartition() { - sql("CREATE TABLE %s (id bigint NOT NULL, category string, ts timestamp, data string) USING iceberg", tableName); + sql( + "CREATE TABLE %s (id bigint NOT NULL, category string, ts timestamp, data string) USING iceberg", + tableName); Table table = validationCatalog.loadTable(tableIdent); Assert.assertTrue("Table should start unpartitioned", table.spec().isUnpartitioned()); @@ -127,17 +135,17 @@ public void testAddMonthsPartition() { table.refresh(); - PartitionSpec expected = PartitionSpec.builderFor(table.schema()) - .withSpecId(1) - .month("ts") - .build(); + PartitionSpec expected = + PartitionSpec.builderFor(table.schema()).withSpecId(1).month("ts").build(); Assert.assertEquals("Should have new spec field", expected, table.spec()); } @Test public void testAddDaysPartition() { - sql("CREATE TABLE %s (id bigint NOT NULL, category string, ts timestamp, data string) USING iceberg", tableName); + sql( + "CREATE TABLE %s (id bigint NOT NULL, category string, ts timestamp, data string) USING iceberg", + tableName); Table table = validationCatalog.loadTable(tableIdent); Assert.assertTrue("Table should start unpartitioned", table.spec().isUnpartitioned()); @@ -146,17 +154,17 @@ public void testAddDaysPartition() { table.refresh(); - PartitionSpec expected = PartitionSpec.builderFor(table.schema()) - .withSpecId(1) - .day("ts") - .build(); + PartitionSpec expected = + PartitionSpec.builderFor(table.schema()).withSpecId(1).day("ts").build(); Assert.assertEquals("Should have new spec field", expected, table.spec()); } @Test public void testAddHoursPartition() { - sql("CREATE TABLE %s (id bigint NOT NULL, category string, ts timestamp, data string) USING iceberg", tableName); + sql( + "CREATE TABLE %s (id bigint NOT NULL, category string, ts timestamp, data string) USING iceberg", + tableName); Table table = validationCatalog.loadTable(tableIdent); Assert.assertTrue("Table should start unpartitioned", table.spec().isUnpartitioned()); @@ -165,17 +173,17 @@ public void testAddHoursPartition() { table.refresh(); - PartitionSpec expected = PartitionSpec.builderFor(table.schema()) - .withSpecId(1) - .hour("ts") - .build(); + PartitionSpec expected = + PartitionSpec.builderFor(table.schema()).withSpecId(1).hour("ts").build(); Assert.assertEquals("Should have new spec field", expected, table.spec()); } @Test public void testAddNamedPartition() { - sql("CREATE TABLE %s (id bigint NOT NULL, category string, ts timestamp, data string) USING iceberg", tableName); + 
sql( + "CREATE TABLE %s (id bigint NOT NULL, category string, ts timestamp, data string) USING iceberg", + tableName); Table table = validationCatalog.loadTable(tableIdent); Assert.assertTrue("Table should start unpartitioned", table.spec().isUnpartitioned()); @@ -184,77 +192,83 @@ public void testAddNamedPartition() { table.refresh(); - PartitionSpec expected = PartitionSpec.builderFor(table.schema()) - .withSpecId(1) - .bucket("id", 16, "shard") - .build(); + PartitionSpec expected = + PartitionSpec.builderFor(table.schema()).withSpecId(1).bucket("id", 16, "shard").build(); Assert.assertEquals("Should have new spec field", expected, table.spec()); } @Test public void testDropIdentityPartition() { - sql("CREATE TABLE %s (id bigint NOT NULL, category string, data string) USING iceberg PARTITIONED BY (category)", + sql( + "CREATE TABLE %s (id bigint NOT NULL, category string, data string) USING iceberg PARTITIONED BY (category)", tableName); Table table = validationCatalog.loadTable(tableIdent); - Assert.assertEquals("Table should start with 1 partition field", 1, table.spec().fields().size()); + Assert.assertEquals( + "Table should start with 1 partition field", 1, table.spec().fields().size()); sql("ALTER TABLE %s DROP PARTITION FIELD category", tableName); table.refresh(); - PartitionSpec expected = PartitionSpec.builderFor(table.schema()) - .withSpecId(1) - .alwaysNull("category", "category") - .build(); + PartitionSpec expected = + PartitionSpec.builderFor(table.schema()) + .withSpecId(1) + .alwaysNull("category", "category") + .build(); Assert.assertEquals("Should have new spec field", expected, table.spec()); } @Test public void testDropDaysPartition() { - sql("CREATE TABLE %s (id bigint NOT NULL, ts timestamp, data string) USING iceberg PARTITIONED BY (days(ts))", + sql( + "CREATE TABLE %s (id bigint NOT NULL, ts timestamp, data string) USING iceberg PARTITIONED BY (days(ts))", tableName); Table table = validationCatalog.loadTable(tableIdent); - Assert.assertEquals("Table should start with 1 partition field", 1, table.spec().fields().size()); + Assert.assertEquals( + "Table should start with 1 partition field", 1, table.spec().fields().size()); sql("ALTER TABLE %s DROP PARTITION FIELD days(ts)", tableName); table.refresh(); - PartitionSpec expected = PartitionSpec.builderFor(table.schema()) - .withSpecId(1) - .alwaysNull("ts", "ts_day") - .build(); + PartitionSpec expected = + PartitionSpec.builderFor(table.schema()).withSpecId(1).alwaysNull("ts", "ts_day").build(); Assert.assertEquals("Should have new spec field", expected, table.spec()); } @Test public void testDropBucketPartition() { - sql("CREATE TABLE %s (id bigint NOT NULL, data string) USING iceberg PARTITIONED BY (bucket(16, id))", + sql( + "CREATE TABLE %s (id bigint NOT NULL, data string) USING iceberg PARTITIONED BY (bucket(16, id))", tableName); Table table = validationCatalog.loadTable(tableIdent); - Assert.assertEquals("Table should start with 1 partition field", 1, table.spec().fields().size()); + Assert.assertEquals( + "Table should start with 1 partition field", 1, table.spec().fields().size()); sql("ALTER TABLE %s DROP PARTITION FIELD bucket(16, id)", tableName); table.refresh(); - PartitionSpec expected = PartitionSpec.builderFor(table.schema()) - .withSpecId(1) - .alwaysNull("id", "id_bucket") - .build(); + PartitionSpec expected = + PartitionSpec.builderFor(table.schema()) + .withSpecId(1) + .alwaysNull("id", "id_bucket") + .build(); Assert.assertEquals("Should have new spec field", expected, table.spec()); } 
@Test public void testDropPartitionByName() { - sql("CREATE TABLE %s (id bigint NOT NULL, category string, ts timestamp, data string) USING iceberg", tableName); + sql( + "CREATE TABLE %s (id bigint NOT NULL, category string, ts timestamp, data string) USING iceberg", + tableName); Table table = validationCatalog.loadTable(tableIdent); Assert.assertTrue("Table should start unpartitioned", table.spec().isUnpartitioned()); @@ -270,114 +284,121 @@ public void testDropPartitionByName() { table.refresh(); - PartitionSpec expected = PartitionSpec.builderFor(table.schema()) - .withSpecId(2) - .alwaysNull("id", "shard") - .build(); + PartitionSpec expected = + PartitionSpec.builderFor(table.schema()).withSpecId(2).alwaysNull("id", "shard").build(); Assert.assertEquals("Should have new spec field", expected, table.spec()); } @Test public void testReplacePartition() { - sql("CREATE TABLE %s (id bigint NOT NULL, category string, ts timestamp, data string) USING iceberg", tableName); + sql( + "CREATE TABLE %s (id bigint NOT NULL, category string, ts timestamp, data string) USING iceberg", + tableName); Table table = validationCatalog.loadTable(tableIdent); Assert.assertTrue("Table should start unpartitioned", table.spec().isUnpartitioned()); sql("ALTER TABLE %s ADD PARTITION FIELD days(ts)", tableName); table.refresh(); - PartitionSpec expected = PartitionSpec.builderFor(table.schema()) - .withSpecId(1) - .day("ts") - .build(); + PartitionSpec expected = + PartitionSpec.builderFor(table.schema()).withSpecId(1).day("ts").build(); Assert.assertEquals("Should have new spec field", expected, table.spec()); sql("ALTER TABLE %s REPLACE PARTITION FIELD days(ts) WITH hours(ts)", tableName); table.refresh(); - expected = PartitionSpec.builderFor(table.schema()) - .withSpecId(2) - .alwaysNull("ts", "ts_day") - .hour("ts") - .build(); - Assert.assertEquals("Should changed from daily to hourly partitioned field", expected, table.spec()); + expected = + PartitionSpec.builderFor(table.schema()) + .withSpecId(2) + .alwaysNull("ts", "ts_day") + .hour("ts") + .build(); + Assert.assertEquals( + "Should changed from daily to hourly partitioned field", expected, table.spec()); } @Test public void testReplacePartitionAndRename() { - sql("CREATE TABLE %s (id bigint NOT NULL, category string, ts timestamp, data string) USING iceberg", tableName); + sql( + "CREATE TABLE %s (id bigint NOT NULL, category string, ts timestamp, data string) USING iceberg", + tableName); Table table = validationCatalog.loadTable(tableIdent); Assert.assertTrue("Table should start unpartitioned", table.spec().isUnpartitioned()); sql("ALTER TABLE %s ADD PARTITION FIELD days(ts)", tableName); table.refresh(); - PartitionSpec expected = PartitionSpec.builderFor(table.schema()) - .withSpecId(1) - .day("ts") - .build(); + PartitionSpec expected = + PartitionSpec.builderFor(table.schema()).withSpecId(1).day("ts").build(); Assert.assertEquals("Should have new spec field", expected, table.spec()); sql("ALTER TABLE %s REPLACE PARTITION FIELD days(ts) WITH hours(ts) AS hour_col", tableName); table.refresh(); - expected = PartitionSpec.builderFor(table.schema()) - .withSpecId(2) - .alwaysNull("ts", "ts_day") - .hour("ts", "hour_col") - .build(); - Assert.assertEquals("Should changed from daily to hourly partitioned field", expected, table.spec()); + expected = + PartitionSpec.builderFor(table.schema()) + .withSpecId(2) + .alwaysNull("ts", "ts_day") + .hour("ts", "hour_col") + .build(); + Assert.assertEquals( + "Should changed from daily to hourly partitioned 
field", expected, table.spec()); } @Test public void testReplaceNamedPartition() { - sql("CREATE TABLE %s (id bigint NOT NULL, category string, ts timestamp, data string) USING iceberg", tableName); + sql( + "CREATE TABLE %s (id bigint NOT NULL, category string, ts timestamp, data string) USING iceberg", + tableName); Table table = validationCatalog.loadTable(tableIdent); Assert.assertTrue("Table should start unpartitioned", table.spec().isUnpartitioned()); sql("ALTER TABLE %s ADD PARTITION FIELD days(ts) AS day_col", tableName); table.refresh(); - PartitionSpec expected = PartitionSpec.builderFor(table.schema()) - .withSpecId(1) - .day("ts", "day_col") - .build(); + PartitionSpec expected = + PartitionSpec.builderFor(table.schema()).withSpecId(1).day("ts", "day_col").build(); Assert.assertEquals("Should have new spec field", expected, table.spec()); sql("ALTER TABLE %s REPLACE PARTITION FIELD day_col WITH hours(ts)", tableName); table.refresh(); - expected = PartitionSpec.builderFor(table.schema()) - .withSpecId(2) - .alwaysNull("ts", "day_col") - .hour("ts") - .build(); - Assert.assertEquals("Should changed from daily to hourly partitioned field", expected, table.spec()); + expected = + PartitionSpec.builderFor(table.schema()) + .withSpecId(2) + .alwaysNull("ts", "day_col") + .hour("ts") + .build(); + Assert.assertEquals( + "Should changed from daily to hourly partitioned field", expected, table.spec()); } @Test public void testReplaceNamedPartitionAndRenameDifferently() { - sql("CREATE TABLE %s (id bigint NOT NULL, category string, ts timestamp, data string) USING iceberg", tableName); + sql( + "CREATE TABLE %s (id bigint NOT NULL, category string, ts timestamp, data string) USING iceberg", + tableName); Table table = validationCatalog.loadTable(tableIdent); Assert.assertTrue("Table should start unpartitioned", table.spec().isUnpartitioned()); sql("ALTER TABLE %s ADD PARTITION FIELD days(ts) AS day_col", tableName); table.refresh(); - PartitionSpec expected = PartitionSpec.builderFor(table.schema()) - .withSpecId(1) - .day("ts", "day_col") - .build(); + PartitionSpec expected = + PartitionSpec.builderFor(table.schema()).withSpecId(1).day("ts", "day_col").build(); Assert.assertEquals("Should have new spec field", expected, table.spec()); sql("ALTER TABLE %s REPLACE PARTITION FIELD day_col WITH hours(ts) AS hour_col", tableName); table.refresh(); - expected = PartitionSpec.builderFor(table.schema()) - .withSpecId(2) - .alwaysNull("ts", "day_col") - .hour("ts", "hour_col") - .build(); - Assert.assertEquals("Should changed from daily to hourly partitioned field", expected, table.spec()); + expected = + PartitionSpec.builderFor(table.schema()) + .withSpecId(2) + .alwaysNull("ts", "day_col") + .hour("ts", "hour_col") + .build(); + Assert.assertEquals( + "Should changed from daily to hourly partitioned field", expected, table.spec()); } @Test public void testSparkTableAddDropPartitions() throws Exception { sql("CREATE TABLE %s (id bigint NOT NULL, ts timestamp, data string) USING iceberg", tableName); - Assert.assertEquals("spark table partition should be empty", 0, sparkTable().partitioning().length); + Assert.assertEquals( + "spark table partition should be empty", 0, sparkTable().partitioning().length); sql("ALTER TABLE %s ADD PARTITION FIELD bucket(16, id) AS shard", tableName); assertPartitioningEquals(sparkTable(), 1, "bucket(16, id)"); @@ -396,13 +417,16 @@ public void testSparkTableAddDropPartitions() throws Exception { sql("ALTER TABLE %s DROP PARTITION FIELD shard", tableName); 
sql("DESCRIBE %s", tableName); - Assert.assertEquals("spark table partition should be empty", 0, sparkTable().partitioning().length); + Assert.assertEquals( + "spark table partition should be empty", 0, sparkTable().partitioning().length); } private void assertPartitioningEquals(SparkTable table, int len, String transform) { Assert.assertEquals("spark table partition should be " + len, len, table.partitioning().length); - Assert.assertEquals("latest spark table partition transform should match", - transform, table.partitioning()[len - 1].toString()); + Assert.assertEquals( + "latest spark table partition transform should match", + transform, + table.partitioning()[len - 1].toString()); } private SparkTable sparkTable() throws Exception { diff --git a/spark/v3.0/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestAlterTableSchema.java b/spark/v3.0/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestAlterTableSchema.java index ac12953d0a7e..c993c213dc5e 100644 --- a/spark/v3.0/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestAlterTableSchema.java +++ b/spark/v3.0/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestAlterTableSchema.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.extensions; import java.util.Map; @@ -28,7 +27,8 @@ import org.junit.Test; public class TestAlterTableSchema extends SparkExtensionsTestBase { - public TestAlterTableSchema(String catalogName, String implementation, Map config) { + public TestAlterTableSchema( + String catalogName, String implementation, Map config) { super(catalogName, implementation, config); } @@ -39,20 +39,25 @@ public void removeTable() { @Test public void testSetIdentifierFields() { - sql("CREATE TABLE %s (id bigint NOT NULL, " + - "location struct NOT NULL) USING iceberg", tableName); + sql( + "CREATE TABLE %s (id bigint NOT NULL, " + + "location struct NOT NULL) USING iceberg", + tableName); Table table = validationCatalog.loadTable(tableIdent); - Assert.assertTrue("Table should start without identifier", table.schema().identifierFieldIds().isEmpty()); + Assert.assertTrue( + "Table should start without identifier", table.schema().identifierFieldIds().isEmpty()); sql("ALTER TABLE %s SET IDENTIFIER FIELDS id", tableName); table.refresh(); - Assert.assertEquals("Should have new identifier field", + Assert.assertEquals( + "Should have new identifier field", Sets.newHashSet(table.schema().findField("id").fieldId()), table.schema().identifierFieldIds()); sql("ALTER TABLE %s SET IDENTIFIER FIELDS id, location.lon", tableName); table.refresh(); - Assert.assertEquals("Should have new identifier field", + Assert.assertEquals( + "Should have new identifier field", Sets.newHashSet( table.schema().findField("id").fieldId(), table.schema().findField("location.lon").fieldId()), @@ -60,7 +65,8 @@ public void testSetIdentifierFields() { sql("ALTER TABLE %s SET IDENTIFIER FIELDS location.lon", tableName); table.refresh(); - Assert.assertEquals("Should have new identifier field", + Assert.assertEquals( + "Should have new identifier field", Sets.newHashSet(table.schema().findField("location.lon").fieldId()), table.schema().identifierFieldIds()); } @@ -69,13 +75,16 @@ public void testSetIdentifierFields() { public void testSetInvalidIdentifierFields() { sql("CREATE TABLE %s (id bigint NOT NULL, id2 bigint) USING iceberg", tableName); Table table = validationCatalog.loadTable(tableIdent); - 
Assert.assertTrue("Table should start without identifier", table.schema().identifierFieldIds().isEmpty()); - AssertHelpers.assertThrows("should not allow setting unknown fields", + Assert.assertTrue( + "Table should start without identifier", table.schema().identifierFieldIds().isEmpty()); + AssertHelpers.assertThrows( + "should not allow setting unknown fields", IllegalArgumentException.class, "not found in current schema or added columns", () -> sql("ALTER TABLE %s SET IDENTIFIER FIELDS unknown", tableName)); - AssertHelpers.assertThrows("should not allow setting optional fields", + AssertHelpers.assertThrows( + "should not allow setting optional fields", IllegalArgumentException.class, "not a required field", () -> sql("ALTER TABLE %s SET IDENTIFIER FIELDS id2", tableName)); @@ -83,14 +92,18 @@ public void testSetInvalidIdentifierFields() { @Test public void testDropIdentifierFields() { - sql("CREATE TABLE %s (id bigint NOT NULL, " + - "location struct NOT NULL) USING iceberg", tableName); + sql( + "CREATE TABLE %s (id bigint NOT NULL, " + + "location struct NOT NULL) USING iceberg", + tableName); Table table = validationCatalog.loadTable(tableIdent); - Assert.assertTrue("Table should start without identifier", table.schema().identifierFieldIds().isEmpty()); + Assert.assertTrue( + "Table should start without identifier", table.schema().identifierFieldIds().isEmpty()); sql("ALTER TABLE %s SET IDENTIFIER FIELDS id, location.lon", tableName); table.refresh(); - Assert.assertEquals("Should have new identifier fields", + Assert.assertEquals( + "Should have new identifier fields", Sets.newHashSet( table.schema().findField("id").fieldId(), table.schema().findField("location.lon").fieldId()), @@ -98,13 +111,15 @@ public void testDropIdentifierFields() { sql("ALTER TABLE %s DROP IDENTIFIER FIELDS id", tableName); table.refresh(); - Assert.assertEquals("Should removed identifier field", + Assert.assertEquals( + "Should removed identifier field", Sets.newHashSet(table.schema().findField("location.lon").fieldId()), table.schema().identifierFieldIds()); sql("ALTER TABLE %s SET IDENTIFIER FIELDS id, location.lon", tableName); table.refresh(); - Assert.assertEquals("Should have new identifier fields", + Assert.assertEquals( + "Should have new identifier fields", Sets.newHashSet( table.schema().findField("id").fieldId(), table.schema().findField("location.lon").fieldId()), @@ -112,29 +127,34 @@ public void testDropIdentifierFields() { sql("ALTER TABLE %s DROP IDENTIFIER FIELDS id, location.lon", tableName); table.refresh(); - Assert.assertEquals("Should have no identifier field", - Sets.newHashSet(), - table.schema().identifierFieldIds()); + Assert.assertEquals( + "Should have no identifier field", Sets.newHashSet(), table.schema().identifierFieldIds()); } @Test public void testDropInvalidIdentifierFields() { - sql("CREATE TABLE %s (id bigint NOT NULL, data string NOT NULL, " + - "location struct NOT NULL) USING iceberg", tableName); + sql( + "CREATE TABLE %s (id bigint NOT NULL, data string NOT NULL, " + + "location struct NOT NULL) USING iceberg", + tableName); Table table = validationCatalog.loadTable(tableIdent); - Assert.assertTrue("Table should start without identifier", table.schema().identifierFieldIds().isEmpty()); - AssertHelpers.assertThrows("should not allow dropping unknown fields", + Assert.assertTrue( + "Table should start without identifier", table.schema().identifierFieldIds().isEmpty()); + AssertHelpers.assertThrows( + "should not allow dropping unknown fields", 
IllegalArgumentException.class, "field unknown not found", () -> sql("ALTER TABLE %s DROP IDENTIFIER FIELDS unknown", tableName)); sql("ALTER TABLE %s SET IDENTIFIER FIELDS id", tableName); - AssertHelpers.assertThrows("should not allow dropping a field that is not an identifier", + AssertHelpers.assertThrows( + "should not allow dropping a field that is not an identifier", IllegalArgumentException.class, "data is not an identifier field", () -> sql("ALTER TABLE %s DROP IDENTIFIER FIELDS data", tableName)); - AssertHelpers.assertThrows("should not allow dropping a nested field that is not an identifier", + AssertHelpers.assertThrows( + "should not allow dropping a nested field that is not an identifier", IllegalArgumentException.class, "location.lon is not an identifier field", () -> sql("ALTER TABLE %s DROP IDENTIFIER FIELDS location.lon", tableName)); diff --git a/spark/v3.0/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestAncestorsOfProcedure.java b/spark/v3.0/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestAncestorsOfProcedure.java index baf464d94ad0..d676101b1076 100644 --- a/spark/v3.0/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestAncestorsOfProcedure.java +++ b/spark/v3.0/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestAncestorsOfProcedure.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.extensions; import java.util.List; @@ -30,7 +29,8 @@ public class TestAncestorsOfProcedure extends SparkExtensionsTestBase { - public TestAncestorsOfProcedure(String catalogName, String implementation, Map config) { + public TestAncestorsOfProcedure( + String catalogName, String implementation, Map config) { super(catalogName, implementation, config); } @@ -51,14 +51,12 @@ public void testAncestorOfUsingEmptyArgs() { Long preSnapshotId = table.currentSnapshot().parentId(); Long preTimeStamp = table.snapshot(table.currentSnapshot().parentId()).timestampMillis(); - List output = sql("CALL %s.system.ancestors_of('%s')", - catalogName, tableIdent); + List output = sql("CALL %s.system.ancestors_of('%s')", catalogName, tableIdent); assertEquals( "Procedure output must match", ImmutableList.of( - row(currentSnapshotId, currentTimestamp), - row(preSnapshotId, preTimeStamp)), + row(currentSnapshotId, currentTimestamp), row(preSnapshotId, preTimeStamp)), output); } @@ -77,8 +75,7 @@ public void testAncestorOfUsingSnapshotId() { assertEquals( "Procedure output must match", ImmutableList.of( - row(currentSnapshotId, currentTimestamp), - row(preSnapshotId, preTimeStamp)), + row(currentSnapshotId, currentTimestamp), row(preSnapshotId, preTimeStamp)), sql("CALL %s.system.ancestors_of('%s', %dL)", catalogName, tableIdent, currentSnapshotId)); assertEquals( @@ -105,7 +102,8 @@ public void testAncestorOfWithRollBack() { Long thirdTimestamp = table.currentSnapshot().timestampMillis(); // roll back - sql("CALL %s.system.rollback_to_snapshot('%s', %dL)", + sql( + "CALL %s.system.rollback_to_snapshot('%s', %dL)", catalogName, tableIdent, secondSnapshotId); sql("INSERT INTO TABLE %s VALUES (4, 'd')", tableName); @@ -142,22 +140,29 @@ public void testAncestorOfUsingNamedArgs() { assertEquals( "Procedure output must match", ImmutableList.of(row(firstSnapshotId, firstTimestamp)), - sql("CALL %s.system.ancestors_of(snapshot_id => %dL, table => '%s')", + sql( + "CALL %s.system.ancestors_of(snapshot_id => %dL, table => '%s')", catalogName, 
firstSnapshotId, tableIdent)); } @Test public void testInvalidAncestorOfCases() { - AssertHelpers.assertThrows("Should reject calls without all required args", - AnalysisException.class, "Missing required parameters", + AssertHelpers.assertThrows( + "Should reject calls without all required args", + AnalysisException.class, + "Missing required parameters", () -> sql("CALL %s.system.ancestors_of()", catalogName)); - AssertHelpers.assertThrows("Should reject calls with empty table identifier", - IllegalArgumentException.class, "Cannot handle an empty identifier for argument table", + AssertHelpers.assertThrows( + "Should reject calls with empty table identifier", + IllegalArgumentException.class, + "Cannot handle an empty identifier for argument table", () -> sql("CALL %s.system.ancestors_of('')", catalogName)); - AssertHelpers.assertThrows("Should reject calls with invalid arg types", - AnalysisException.class, "Wrong arg type for snapshot_id: cannot cast", + AssertHelpers.assertThrows( + "Should reject calls with invalid arg types", + AnalysisException.class, + "Wrong arg type for snapshot_id: cannot cast", () -> sql("CALL %s.system.ancestors_of('%s', 1.1)", catalogName, tableIdent)); } } diff --git a/spark/v3.0/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestCallStatementParser.java b/spark/v3.0/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestCallStatementParser.java index c8c4316a1524..7bcc0884561d 100644 --- a/spark/v3.0/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestCallStatementParser.java +++ b/spark/v3.0/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestCallStatementParser.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.spark.extensions; import java.math.BigDecimal; @@ -49,19 +48,19 @@ public class TestCallStatementParser { - @Rule - public TemporaryFolder temp = new TemporaryFolder(); + @Rule public TemporaryFolder temp = new TemporaryFolder(); private static SparkSession spark = null; private static ParserInterface parser = null; @BeforeClass public static void startSpark() { - TestCallStatementParser.spark = SparkSession.builder() - .master("local[2]") - .config("spark.sql.extensions", IcebergSparkSessionExtensions.class.getName()) - .config("spark.extra.prop", "value") - .getOrCreate(); + TestCallStatementParser.spark = + SparkSession.builder() + .master("local[2]") + .config("spark.sql.extensions", IcebergSparkSessionExtensions.class.getName()) + .config("spark.extra.prop", "value") + .getOrCreate(); TestCallStatementParser.parser = spark.sessionState().sqlParser(); } @@ -75,8 +74,10 @@ public static void stopSpark() { @Test public void testCallWithPositionalArgs() throws ParseException { - CallStatement call = (CallStatement) parser.parsePlan("CALL c.n.func(1, '2', 3L, true, 1.0D, 9.0e1, 900e-1BD)"); - Assert.assertEquals(ImmutableList.of("c", "n", "func"), JavaConverters.seqAsJavaList(call.name())); + CallStatement call = + (CallStatement) parser.parsePlan("CALL c.n.func(1, '2', 3L, true, 1.0D, 9.0e1, 900e-1BD)"); + Assert.assertEquals( + ImmutableList.of("c", "n", "func"), JavaConverters.seqAsJavaList(call.name())); Assert.assertEquals(7, call.args().size()); @@ -91,8 +92,10 @@ public void testCallWithPositionalArgs() throws ParseException { @Test public void testCallWithNamedArgs() throws ParseException { - CallStatement call = (CallStatement) parser.parsePlan("CALL cat.system.func(c1 => 1, c2 => '2', c3 => true)"); - Assert.assertEquals(ImmutableList.of("cat", "system", "func"), JavaConverters.seqAsJavaList(call.name())); + CallStatement call = + (CallStatement) parser.parsePlan("CALL cat.system.func(c1 => 1, c2 => '2', c3 => true)"); + Assert.assertEquals( + ImmutableList.of("cat", "system", "func"), JavaConverters.seqAsJavaList(call.name())); Assert.assertEquals(3, call.args().size()); @@ -104,7 +107,8 @@ public void testCallWithNamedArgs() throws ParseException { @Test public void testCallWithMixedArgs() throws ParseException { CallStatement call = (CallStatement) parser.parsePlan("CALL cat.system.func(c1 => 1, '2')"); - Assert.assertEquals(ImmutableList.of("cat", "system", "func"), JavaConverters.seqAsJavaList(call.name())); + Assert.assertEquals( + ImmutableList.of("cat", "system", "func"), JavaConverters.seqAsJavaList(call.name())); Assert.assertEquals(2, call.args().size()); @@ -114,18 +118,24 @@ public void testCallWithMixedArgs() throws ParseException { @Test public void testCallWithTimestampArg() throws ParseException { - CallStatement call = (CallStatement) parser.parsePlan("CALL cat.system.func(TIMESTAMP '2017-02-03T10:37:30.00Z')"); - Assert.assertEquals(ImmutableList.of("cat", "system", "func"), JavaConverters.seqAsJavaList(call.name())); + CallStatement call = + (CallStatement) + parser.parsePlan("CALL cat.system.func(TIMESTAMP '2017-02-03T10:37:30.00Z')"); + Assert.assertEquals( + ImmutableList.of("cat", "system", "func"), JavaConverters.seqAsJavaList(call.name())); Assert.assertEquals(1, call.args().size()); - checkArg(call, 0, Timestamp.from(Instant.parse("2017-02-03T10:37:30.00Z")), DataTypes.TimestampType); + checkArg( + call, 0, Timestamp.from(Instant.parse("2017-02-03T10:37:30.00Z")), DataTypes.TimestampType); } @Test public void 
testCallWithVarSubstitution() throws ParseException { - CallStatement call = (CallStatement) parser.parsePlan("CALL cat.system.func('${spark.extra.prop}')"); - Assert.assertEquals(ImmutableList.of("cat", "system", "func"), JavaConverters.seqAsJavaList(call.name())); + CallStatement call = + (CallStatement) parser.parsePlan("CALL cat.system.func('${spark.extra.prop}')"); + Assert.assertEquals( + ImmutableList.of("cat", "system", "func"), JavaConverters.seqAsJavaList(call.name())); Assert.assertEquals(1, call.args().size()); @@ -134,22 +144,22 @@ public void testCallWithVarSubstitution() throws ParseException { @Test public void testCallStripsComments() throws ParseException { - List callStatementsWithComments = Lists.newArrayList( - "/* bracketed comment */ CALL cat.system.func('${spark.extra.prop}')", - "/**/ CALL cat.system.func('${spark.extra.prop}')", - "-- single line comment \n CALL cat.system.func('${spark.extra.prop}')", - "-- multiple \n-- single line \n-- comments \n CALL cat.system.func('${spark.extra.prop}')", - "/* select * from multiline_comment \n where x like '%sql%'; */ CALL cat.system.func('${spark.extra.prop}')", - "/* {\"app\": \"dbt\", \"dbt_version\": \"1.0.1\", \"profile_name\": \"profile1\", \"target_name\": \"dev\", " + - "\"node_id\": \"model.profile1.stg_users\"} \n*/ CALL cat.system.func('${spark.extra.prop}')", - "/* Some multi-line comment \n" + - "*/ CALL /* inline comment */ cat.system.func('${spark.extra.prop}') -- ending comment", - "CALL -- a line ending comment\n" + - "cat.system.func('${spark.extra.prop}')" - ); + List callStatementsWithComments = + Lists.newArrayList( + "/* bracketed comment */ CALL cat.system.func('${spark.extra.prop}')", + "/**/ CALL cat.system.func('${spark.extra.prop}')", + "-- single line comment \n CALL cat.system.func('${spark.extra.prop}')", + "-- multiple \n-- single line \n-- comments \n CALL cat.system.func('${spark.extra.prop}')", + "/* select * from multiline_comment \n where x like '%sql%'; */ CALL cat.system.func('${spark.extra.prop}')", + "/* {\"app\": \"dbt\", \"dbt_version\": \"1.0.1\", \"profile_name\": \"profile1\", \"target_name\": \"dev\", " + + "\"node_id\": \"model.profile1.stg_users\"} \n*/ CALL cat.system.func('${spark.extra.prop}')", + "/* Some multi-line comment \n" + + "*/ CALL /* inline comment */ cat.system.func('${spark.extra.prop}') -- ending comment", + "CALL -- a line ending comment\n" + "cat.system.func('${spark.extra.prop}')"); for (String sqlText : callStatementsWithComments) { CallStatement call = (CallStatement) parser.parsePlan(sqlText); - Assert.assertEquals(ImmutableList.of("cat", "system", "func"), JavaConverters.seqAsJavaList(call.name())); + Assert.assertEquals( + ImmutableList.of("cat", "system", "func"), JavaConverters.seqAsJavaList(call.name())); Assert.assertEquals(1, call.args().size()); @@ -159,17 +169,24 @@ public void testCallStripsComments() throws ParseException { @Test public void testCallParseError() { - AssertHelpers.assertThrows("Should fail with a sensible parse error", IcebergParseException.class, + AssertHelpers.assertThrows( + "Should fail with a sensible parse error", + IcebergParseException.class, "missing '(' at 'radish'", () -> parser.parsePlan("CALL cat.system radish kebab")); } - private void checkArg(CallStatement call, int index, Object expectedValue, DataType expectedType) { + private void checkArg( + CallStatement call, int index, Object expectedValue, DataType expectedType) { checkArg(call, index, null, expectedValue, expectedType); } - private void 
checkArg(CallStatement call, int index, String expectedName, - Object expectedValue, DataType expectedType) { + private void checkArg( + CallStatement call, + int index, + String expectedName, + Object expectedValue, + DataType expectedType) { if (expectedName != null) { NamedArgument arg = checkCast(call.args().apply(index), NamedArgument.class); @@ -190,7 +207,8 @@ private Literal toSparkLiteral(Object value, DataType dataType) { } private T checkCast(Object value, Class expectedClass) { - Assert.assertTrue("Expected instance of " + expectedClass.getName(), expectedClass.isInstance(value)); + Assert.assertTrue( + "Expected instance of " + expectedClass.getName(), expectedClass.isInstance(value)); return expectedClass.cast(value); } } diff --git a/spark/v3.0/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestCherrypickSnapshotProcedure.java b/spark/v3.0/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestCherrypickSnapshotProcedure.java index c69964693189..7309a176b922 100644 --- a/spark/v3.0/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestCherrypickSnapshotProcedure.java +++ b/spark/v3.0/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestCherrypickSnapshotProcedure.java @@ -16,9 +16,10 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.extensions; +import static org.apache.iceberg.TableProperties.WRITE_AUDIT_PUBLISH_ENABLED; + import java.util.List; import java.util.Map; import org.apache.iceberg.AssertHelpers; @@ -34,11 +35,10 @@ import org.junit.After; import org.junit.Test; -import static org.apache.iceberg.TableProperties.WRITE_AUDIT_PUBLISH_ENABLED; - public class TestCherrypickSnapshotProcedure extends SparkExtensionsTestBase { - public TestCherrypickSnapshotProcedure(String catalogName, String implementation, Map config) { + public TestCherrypickSnapshotProcedure( + String catalogName, String implementation, Map config) { super(catalogName, implementation, config); } @@ -56,26 +56,30 @@ public void testCherrypickSnapshotUsingPositionalArgs() { sql("INSERT INTO TABLE %s VALUES (1, 'a')", tableName); - assertEquals("Should not see rows from staged snapshot", + assertEquals( + "Should not see rows from staged snapshot", ImmutableList.of(), sql("SELECT * FROM %s", tableName)); Table table = validationCatalog.loadTable(tableIdent); Snapshot wapSnapshot = Iterables.getOnlyElement(table.snapshots()); - List output = sql( - "CALL %s.system.cherrypick_snapshot('%s', %dL)", - catalogName, tableIdent, wapSnapshot.snapshotId()); + List output = + sql( + "CALL %s.system.cherrypick_snapshot('%s', %dL)", + catalogName, tableIdent, wapSnapshot.snapshotId()); table.refresh(); Snapshot currentSnapshot = table.currentSnapshot(); - assertEquals("Procedure output must match", + assertEquals( + "Procedure output must match", ImmutableList.of(row(wapSnapshot.snapshotId(), currentSnapshot.snapshotId())), output); - assertEquals("Cherrypick must be successful", + assertEquals( + "Cherrypick must be successful", ImmutableList.of(row(1L, "a")), sql("SELECT * FROM %s", tableName)); } @@ -89,26 +93,30 @@ public void testCherrypickSnapshotUsingNamedArgs() { sql("INSERT INTO TABLE %s VALUES (1, 'a')", tableName); - assertEquals("Should not see rows from staged snapshot", + assertEquals( + "Should not see rows from staged snapshot", ImmutableList.of(), sql("SELECT * FROM %s", tableName)); Table table = validationCatalog.loadTable(tableIdent); Snapshot wapSnapshot 
= Iterables.getOnlyElement(table.snapshots()); - List output = sql( - "CALL %s.system.cherrypick_snapshot(snapshot_id => %dL, table => '%s')", - catalogName, wapSnapshot.snapshotId(), tableIdent); + List output = + sql( + "CALL %s.system.cherrypick_snapshot(snapshot_id => %dL, table => '%s')", + catalogName, wapSnapshot.snapshotId(), tableIdent); table.refresh(); Snapshot currentSnapshot = table.currentSnapshot(); - assertEquals("Procedure output must match", + assertEquals( + "Procedure output must match", ImmutableList.of(row(wapSnapshot.snapshotId(), currentSnapshot.snapshotId())), output); - assertEquals("Cherrypick must be successful", + assertEquals( + "Cherrypick must be successful", ImmutableList.of(row(1L, "a")), sql("SELECT * FROM %s", tableName)); } @@ -129,17 +137,20 @@ public void testCherrypickSnapshotRefreshesRelationCache() { sql("INSERT INTO TABLE %s VALUES (1, 'a')", tableName); - assertEquals("Should not see rows from staged snapshot", + assertEquals( + "Should not see rows from staged snapshot", ImmutableList.of(), sql("SELECT * FROM %s", tableName)); Table table = validationCatalog.loadTable(tableIdent); Snapshot wapSnapshot = Iterables.getOnlyElement(table.snapshots()); - sql("CALL %s.system.cherrypick_snapshot('%s', %dL)", + sql( + "CALL %s.system.cherrypick_snapshot('%s', %dL)", catalogName, tableIdent, wapSnapshot.snapshotId()); - assertEquals("Cherrypick snapshot should be visible", + assertEquals( + "Cherrypick snapshot should be visible", ImmutableList.of(row(1L, "a")), sql("SELECT * FROM tmp")); @@ -150,31 +161,43 @@ public void testCherrypickSnapshotRefreshesRelationCache() { public void testCherrypickInvalidSnapshot() { sql("CREATE TABLE %s (id bigint NOT NULL, data string) USING iceberg", tableName); - AssertHelpers.assertThrows("Should reject invalid snapshot id", - ValidationException.class, "Cannot cherry-pick unknown snapshot ID", + AssertHelpers.assertThrows( + "Should reject invalid snapshot id", + ValidationException.class, + "Cannot cherry-pick unknown snapshot ID", () -> sql("CALL %s.system.cherrypick_snapshot('%s', -1L)", catalogName, tableIdent)); } @Test public void testInvalidCherrypickSnapshotCases() { - AssertHelpers.assertThrows("Should not allow mixed args", - AnalysisException.class, "Named and positional arguments cannot be mixed", + AssertHelpers.assertThrows( + "Should not allow mixed args", + AnalysisException.class, + "Named and positional arguments cannot be mixed", () -> sql("CALL %s.system.cherrypick_snapshot('n', table => 't', 1L)", catalogName)); - AssertHelpers.assertThrows("Should not resolve procedures in arbitrary namespaces", - NoSuchProcedureException.class, "not found", + AssertHelpers.assertThrows( + "Should not resolve procedures in arbitrary namespaces", + NoSuchProcedureException.class, + "not found", () -> sql("CALL %s.custom.cherrypick_snapshot('n', 't', 1L)", catalogName)); - AssertHelpers.assertThrows("Should reject calls without all required args", - AnalysisException.class, "Missing required parameters", + AssertHelpers.assertThrows( + "Should reject calls without all required args", + AnalysisException.class, + "Missing required parameters", () -> sql("CALL %s.system.cherrypick_snapshot('t')", catalogName)); - AssertHelpers.assertThrows("Should reject calls with empty table identifier", - IllegalArgumentException.class, "Cannot handle an empty identifier", + AssertHelpers.assertThrows( + "Should reject calls with empty table identifier", + IllegalArgumentException.class, + "Cannot handle an empty identifier", () 
-> sql("CALL %s.system.cherrypick_snapshot('', 1L)", catalogName)); - AssertHelpers.assertThrows("Should reject calls with invalid arg types", - AnalysisException.class, "Wrong arg type for snapshot_id: cannot cast", + AssertHelpers.assertThrows( + "Should reject calls with invalid arg types", + AnalysisException.class, + "Wrong arg type for snapshot_id: cannot cast", () -> sql("CALL %s.system.cherrypick_snapshot('t', 2.2)", catalogName)); } } diff --git a/spark/v3.0/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestCopyOnWriteDelete.java b/spark/v3.0/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestCopyOnWriteDelete.java index c9d15906251f..8a8a8c6ab722 100644 --- a/spark/v3.0/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestCopyOnWriteDelete.java +++ b/spark/v3.0/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestCopyOnWriteDelete.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.extensions; import java.util.Map; @@ -25,8 +24,13 @@ public class TestCopyOnWriteDelete extends TestDelete { - public TestCopyOnWriteDelete(String catalogName, String implementation, Map config, - String fileFormat, Boolean vectorized, String distributionMode) { + public TestCopyOnWriteDelete( + String catalogName, + String implementation, + Map config, + String fileFormat, + Boolean vectorized, + String distributionMode) { super(catalogName, implementation, config, fileFormat, vectorized, distributionMode); } diff --git a/spark/v3.0/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestCopyOnWriteMerge.java b/spark/v3.0/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestCopyOnWriteMerge.java index 60aba632646f..27cbd1a9d5de 100644 --- a/spark/v3.0/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestCopyOnWriteMerge.java +++ b/spark/v3.0/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestCopyOnWriteMerge.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.extensions; import java.util.Map; @@ -25,8 +24,13 @@ public class TestCopyOnWriteMerge extends TestMerge { - public TestCopyOnWriteMerge(String catalogName, String implementation, Map config, - String fileFormat, boolean vectorized, String distributionMode) { + public TestCopyOnWriteMerge( + String catalogName, + String implementation, + Map config, + String fileFormat, + boolean vectorized, + String distributionMode) { super(catalogName, implementation, config, fileFormat, vectorized, distributionMode); } diff --git a/spark/v3.0/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestCopyOnWriteUpdate.java b/spark/v3.0/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestCopyOnWriteUpdate.java index cc73ecba9ddf..3fa3f74f6a39 100644 --- a/spark/v3.0/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestCopyOnWriteUpdate.java +++ b/spark/v3.0/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestCopyOnWriteUpdate.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.spark.extensions; import java.util.Map; @@ -25,8 +24,13 @@ public class TestCopyOnWriteUpdate extends TestUpdate { - public TestCopyOnWriteUpdate(String catalogName, String implementation, Map config, - String fileFormat, boolean vectorized, String distributionMode) { + public TestCopyOnWriteUpdate( + String catalogName, + String implementation, + Map config, + String fileFormat, + boolean vectorized, + String distributionMode) { super(catalogName, implementation, config, fileFormat, vectorized, distributionMode); } diff --git a/spark/v3.0/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestDelete.java b/spark/v3.0/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestDelete.java index cd4392177b12..27435137ba85 100644 --- a/spark/v3.0/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestDelete.java +++ b/spark/v3.0/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestDelete.java @@ -16,9 +16,13 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.extensions; +import static org.apache.iceberg.TableProperties.DELETE_ISOLATION_LEVEL; +import static org.apache.iceberg.TableProperties.PARQUET_ROW_GROUP_SIZE_BYTES; +import static org.apache.iceberg.TableProperties.SPLIT_SIZE; +import static org.apache.spark.sql.functions.lit; + import java.util.Arrays; import java.util.List; import java.util.Map; @@ -51,15 +55,15 @@ import org.junit.Ignore; import org.junit.Test; -import static org.apache.iceberg.TableProperties.DELETE_ISOLATION_LEVEL; -import static org.apache.iceberg.TableProperties.PARQUET_ROW_GROUP_SIZE_BYTES; -import static org.apache.iceberg.TableProperties.SPLIT_SIZE; -import static org.apache.spark.sql.functions.lit; - public abstract class TestDelete extends SparkRowLevelOperationsTestBase { - public TestDelete(String catalogName, String implementation, Map config, - String fileFormat, Boolean vectorized, String distributionMode) { + public TestDelete( + String catalogName, + String implementation, + Map config, + String fileFormat, + Boolean vectorized, + String distributionMode) { super(catalogName, implementation, config, fileFormat, vectorized, distributionMode); } @@ -85,7 +89,8 @@ public void testDeleteFromEmptyTable() { Table table = validationCatalog.loadTable(tableIdent); Assert.assertEquals("Should have 2 snapshots", 2, Iterables.size(table.snapshots())); - assertEquals("Should have expected rows", + assertEquals( + "Should have expected rows", ImmutableList.of(), sql("SELECT * FROM %s ORDER BY id", tableName)); } @@ -103,7 +108,8 @@ public void testExplain() { Table table = validationCatalog.loadTable(tableIdent); Assert.assertEquals("Should have 1 snapshot", 1, Iterables.size(table.snapshots())); - assertEquals("Should have expected rows", + assertEquals( + "Should have expected rows", ImmutableList.of(row(1, "hr"), row(2, "hardware"), row(null, "hr")), sql("SELECT * FROM %s ORDER BY id ASC NULLS LAST", tableName)); } @@ -116,7 +122,8 @@ public void testDeleteWithAlias() { sql("DELETE FROM %s AS t WHERE t.id IS NULL", tableName); - assertEquals("Should have expected rows", + assertEquals( + "Should have expected rows", ImmutableList.of(row(1, "hr"), row(2, "hardware")), sql("SELECT * FROM %s ORDER BY id", tableName)); } @@ -136,7 +143,8 @@ public void testDeleteWithDynamicFileFiltering() throws NoSuchTableException { Snapshot currentSnapshot = table.currentSnapshot(); validateSnapshot(currentSnapshot, 
"overwrite", "1", "1", "1"); - assertEquals("Should have expected rows", + assertEquals( + "Should have expected rows", ImmutableList.of(row(1, "hardware"), row(1, "hr"), row(3, "hr")), sql("SELECT * FROM %s ORDER BY id, dep", tableName)); } @@ -155,7 +163,8 @@ public void testDeleteNonExistingRecords() { Snapshot currentSnapshot = table.currentSnapshot(); validateSnapshot(currentSnapshot, "overwrite", "0", null, null); - assertEquals("Should have expected rows", + assertEquals( + "Should have expected rows", ImmutableList.of(row(1, "hr"), row(2, "hardware"), row(null, "hr")), sql("SELECT * FROM %s ORDER BY id ASC NULLS LAST", tableName)); } @@ -177,9 +186,8 @@ public void testDeleteWithoutCondition() { Snapshot currentSnapshot = table.currentSnapshot(); validateSnapshot(currentSnapshot, "delete", "2", "3", null); - assertEquals("Should have expected rows", - ImmutableList.of(), - sql("SELECT * FROM %s", tableName)); + assertEquals( + "Should have expected rows", ImmutableList.of(), sql("SELECT * FROM %s", tableName)); } @Test @@ -199,7 +207,8 @@ public void testDeleteUsingMetadataWithComplexCondition() { Snapshot currentSnapshot = table.currentSnapshot(); validateSnapshot(currentSnapshot, "delete", "2", "2", null); - assertEquals("Should have expected rows", + assertEquals( + "Should have expected rows", ImmutableList.of(row(1, "dep1")), sql("SELECT * FROM %s", tableName)); } @@ -222,7 +231,8 @@ public void testDeleteWithArbitraryPartitionPredicates() { Snapshot currentSnapshot = table.currentSnapshot(); validateSnapshot(currentSnapshot, "overwrite", "1", "1", null); - assertEquals("Should have expected rows", + assertEquals( + "Should have expected rows", ImmutableList.of(row(1, "hr"), row(null, "hr")), sql("SELECT * FROM %s ORDER BY id ASC NULLS LAST", tableName)); } @@ -233,8 +243,10 @@ public void testDeleteWithNonDeterministicCondition() { sql("INSERT INTO TABLE %s VALUES (1, 'hr'), (2, 'hardware')", tableName); - AssertHelpers.assertThrows("Should complain about non-deterministic expressions", - AnalysisException.class, "nondeterministic expressions are only allowed", + AssertHelpers.assertThrows( + "Should complain about non-deterministic expressions", + AnalysisException.class, + "nondeterministic expressions are only allowed", () -> sql("DELETE FROM %s WHERE id = 1 AND rand() > 0.5", tableName)); } @@ -246,25 +258,29 @@ public void testDeleteWithFoldableConditions() { // should keep all rows and don't trigger execution sql("DELETE FROM %s WHERE false", tableName); - assertEquals("Should have expected rows", + assertEquals( + "Should have expected rows", ImmutableList.of(row(1, "hr"), row(2, "hardware")), sql("SELECT * FROM %s ORDER BY id", tableName)); // should keep all rows and don't trigger execution sql("DELETE FROM %s WHERE 50 <> 50", tableName); - assertEquals("Should have expected rows", + assertEquals( + "Should have expected rows", ImmutableList.of(row(1, "hr"), row(2, "hardware")), sql("SELECT * FROM %s ORDER BY id", tableName)); // should keep all rows and don't trigger execution sql("DELETE FROM %s WHERE 1 > null", tableName); - assertEquals("Should have expected rows", + assertEquals( + "Should have expected rows", ImmutableList.of(row(1, "hr"), row(2, "hardware")), sql("SELECT * FROM %s ORDER BY id", tableName)); // should remove all rows sql("DELETE FROM %s WHERE 21 = 21", tableName); - assertEquals("Should have expected rows", + assertEquals( + "Should have expected rows", ImmutableList.of(), sql("SELECT * FROM %s ORDER BY id", tableName)); @@ -276,24 +292,29 @@ 
public void testDeleteWithFoldableConditions() { public void testDeleteWithNullConditions() { createAndInitPartitionedTable(); - sql("INSERT INTO TABLE %s VALUES (0, null), (1, 'hr'), (2, 'hardware'), (null, 'hr')", tableName); + sql( + "INSERT INTO TABLE %s VALUES (0, null), (1, 'hr'), (2, 'hardware'), (null, 'hr')", + tableName); // should keep all rows as null is never equal to null sql("DELETE FROM %s WHERE dep = null", tableName); - assertEquals("Should have expected rows", + assertEquals( + "Should have expected rows", ImmutableList.of(row(0, null), row(1, "hr"), row(2, "hardware"), row(null, "hr")), sql("SELECT * FROM %s ORDER BY id ASC NULLS LAST", tableName)); // null = 'software' -> null // should delete using metadata operation only sql("DELETE FROM %s WHERE dep = 'software'", tableName); - assertEquals("Should have expected rows", + assertEquals( + "Should have expected rows", ImmutableList.of(row(0, null), row(1, "hr"), row(2, "hardware"), row(null, "hr")), sql("SELECT * FROM %s ORDER BY id ASC NULLS LAST", tableName)); // should delete using metadata operation only sql("DELETE FROM %s WHERE dep <=> NULL", tableName); - assertEquals("Should have expected rows", + assertEquals( + "Should have expected rows", ImmutableList.of(row(1, "hr"), row(2, "hardware"), row(null, "hr")), sql("SELECT * FROM %s ORDER BY id ASC NULLS LAST", tableName)); @@ -311,17 +332,20 @@ public void testDeleteWithInAndNotInConditions() { sql("INSERT INTO TABLE %s VALUES (1, 'hr'), (2, 'hardware'), (null, 'hr')", tableName); sql("DELETE FROM %s WHERE id IN (1, null)", tableName); - assertEquals("Should have expected rows", + assertEquals( + "Should have expected rows", ImmutableList.of(row(2, "hardware"), row(null, "hr")), sql("SELECT * FROM %s ORDER BY id ASC NULLS LAST", tableName)); sql("DELETE FROM %s WHERE id NOT IN (null, 1)", tableName); - assertEquals("Should have expected rows", + assertEquals( + "Should have expected rows", ImmutableList.of(row(2, "hardware"), row(null, "hr")), sql("SELECT * FROM %s ORDER BY id ASC NULLS LAST", tableName)); sql("DELETE FROM %s WHERE id NOT IN (1, 10)", tableName); - assertEquals("Should have expected rows", + assertEquals( + "Should have expected rows", ImmutableList.of(row(null, "hr")), sql("SELECT * FROM %s ORDER BY id ASC NULLS LAST", tableName)); } @@ -332,16 +356,20 @@ public void testDeleteWithMultipleRowGroupsParquet() throws NoSuchTableException createAndInitPartitionedTable(); - sql("ALTER TABLE %s SET TBLPROPERTIES('%s' '%d')", tableName, PARQUET_ROW_GROUP_SIZE_BYTES, 100); + sql( + "ALTER TABLE %s SET TBLPROPERTIES('%s' '%d')", + tableName, PARQUET_ROW_GROUP_SIZE_BYTES, 100); sql("ALTER TABLE %s SET TBLPROPERTIES('%s' '%d')", tableName, SPLIT_SIZE, 100); List ids = Lists.newArrayList(); for (int id = 1; id <= 200; id++) { ids.add(id); } - Dataset df = spark.createDataset(ids, Encoders.INT()) - .withColumnRenamed("value", "id") - .withColumn("dep", lit("hr")); + Dataset df = + spark + .createDataset(ids, Encoders.INT()) + .withColumnRenamed("value", "id") + .withColumn("dep", lit("hr")); df.coalesce(1).writeTo(tableName).append(); Assert.assertEquals(200, spark.table(tableName).count()); @@ -360,14 +388,12 @@ public void testDeleteWithConditionOnNestedColumn() { sql("INSERT INTO TABLE %s VALUES (2, named_struct(\"c1\", 2, \"c2\", \"v2\"))", tableName); sql("DELETE FROM %s WHERE complex.c1 = id + 2", tableName); - assertEquals("Should have expected rows", - ImmutableList.of(row(2)), - sql("SELECT id FROM %s", tableName)); + assertEquals( + "Should have 
expected rows", ImmutableList.of(row(2)), sql("SELECT id FROM %s", tableName)); sql("DELETE FROM %s t WHERE t.complex.c1 = id", tableName); - assertEquals("Should have expected rows", - ImmutableList.of(), - sql("SELECT id FROM %s", tableName)); + assertEquals( + "Should have expected rows", ImmutableList.of(), sql("SELECT id FROM %s", tableName)); } @Test @@ -379,28 +405,35 @@ public void testDeleteWithInSubquery() throws NoSuchTableException { createOrReplaceView("deleted_id", Arrays.asList(0, 1, null), Encoders.INT()); createOrReplaceView("deleted_dep", Arrays.asList("software", "hr"), Encoders.STRING()); - sql("DELETE FROM %s WHERE id IN (SELECT * FROM deleted_id) AND dep IN (SELECT * from deleted_dep)", tableName); - assertEquals("Should have expected rows", + sql( + "DELETE FROM %s WHERE id IN (SELECT * FROM deleted_id) AND dep IN (SELECT * from deleted_dep)", + tableName); + assertEquals( + "Should have expected rows", ImmutableList.of(row(2, "hardware"), row(null, "hr")), sql("SELECT * FROM %s ORDER BY id ASC NULLS LAST", tableName)); append(new Employee(1, "hr"), new Employee(-1, "hr")); - assertEquals("Should have expected rows", + assertEquals( + "Should have expected rows", ImmutableList.of(row(-1, "hr"), row(1, "hr"), row(2, "hardware"), row(null, "hr")), sql("SELECT * FROM %s ORDER BY id ASC NULLS LAST", tableName)); sql("DELETE FROM %s WHERE id IS NULL OR id IN (SELECT value + 2 FROM deleted_id)", tableName); - assertEquals("Should have expected rows", + assertEquals( + "Should have expected rows", ImmutableList.of(row(-1, "hr"), row(1, "hr")), sql("SELECT * FROM %s ORDER BY id", tableName)); append(new Employee(null, "hr"), new Employee(2, "hr")); - assertEquals("Should have expected rows", + assertEquals( + "Should have expected rows", ImmutableList.of(row(-1, "hr"), row(1, "hr"), row(2, "hr"), row(null, "hr")), sql("SELECT * FROM %s ORDER BY id ASC NULLS LAST", tableName)); sql("DELETE FROM %s WHERE id IN (SELECT value + 2 FROM deleted_id) AND dep = 'hr'", tableName); - assertEquals("Should have expected rows", + assertEquals( + "Should have expected rows", ImmutableList.of(row(-1, "hr"), row(1, "hr"), row(null, "hr")), sql("SELECT * FROM %s ORDER BY id ASC NULLS LAST", tableName)); } @@ -411,11 +444,13 @@ public void testDeleteWithMultiColumnInSubquery() throws NoSuchTableException { append(new Employee(1, "hr"), new Employee(2, "hardware"), new Employee(null, "hr")); - List deletedEmployees = Arrays.asList(new Employee(null, "hr"), new Employee(1, "hr")); + List deletedEmployees = + Arrays.asList(new Employee(null, "hr"), new Employee(1, "hr")); createOrReplaceView("deleted_employee", deletedEmployees, Encoders.bean(Employee.class)); sql("DELETE FROM %s WHERE (id, dep) IN (SELECT id, dep FROM deleted_employee)", tableName); - assertEquals("Should have expected rows", + assertEquals( + "Should have expected rows", ImmutableList.of(row(2, "hardware"), row(null, "hr")), sql("SELECT * FROM %s ORDER BY id ASC NULLS LAST", tableName)); } @@ -431,36 +466,50 @@ public void testDeleteWithNotInSubquery() throws NoSuchTableException { // the file filter subquery (nested loop lef-anti join) returns 0 records sql("DELETE FROM %s WHERE id NOT IN (SELECT * FROM deleted_id)", tableName); - assertEquals("Should have expected rows", + assertEquals( + "Should have expected rows", ImmutableList.of(row(1, "hr"), row(2, "hardware"), row(null, "hr")), sql("SELECT * FROM %s ORDER BY id ASC NULLS LAST", tableName)); - sql("DELETE FROM %s WHERE id NOT IN (SELECT * FROM deleted_id WHERE value IS 
NOT NULL)", tableName); - assertEquals("Should have expected rows", + sql( + "DELETE FROM %s WHERE id NOT IN (SELECT * FROM deleted_id WHERE value IS NOT NULL)", + tableName); + assertEquals( + "Should have expected rows", ImmutableList.of(row(null, "hr")), sql("SELECT * FROM %s ORDER BY id ASC NULLS LAST", tableName)); sql("INSERT INTO TABLE %s VALUES (1, 'hr'), (2, 'hardware'), (null, 'hr')", tableName); - assertEquals("Should have expected rows", + assertEquals( + "Should have expected rows", ImmutableList.of(row(1, "hr"), row(2, "hardware"), row(null, "hr"), row(null, "hr")), sql("SELECT * FROM %s ORDER BY id ASC NULLS LAST", tableName)); - sql("DELETE FROM %s WHERE id NOT IN (SELECT * FROM deleted_id) OR dep IN ('software', 'hr')", tableName); - assertEquals("Should have expected rows", + sql( + "DELETE FROM %s WHERE id NOT IN (SELECT * FROM deleted_id) OR dep IN ('software', 'hr')", + tableName); + assertEquals( + "Should have expected rows", ImmutableList.of(row(2, "hardware")), sql("SELECT * FROM %s ORDER BY id ASC NULLS LAST", tableName)); - sql("DELETE FROM %s t WHERE " + - "id NOT IN (SELECT * FROM deleted_id WHERE value IS NOT NULL) AND " + - "EXISTS (SELECT 1 FROM FROM deleted_dep WHERE t.dep = deleted_dep.value)", tableName); - assertEquals("Should have expected rows", + sql( + "DELETE FROM %s t WHERE " + + "id NOT IN (SELECT * FROM deleted_id WHERE value IS NOT NULL) AND " + + "EXISTS (SELECT 1 FROM FROM deleted_dep WHERE t.dep = deleted_dep.value)", + tableName); + assertEquals( + "Should have expected rows", ImmutableList.of(row(2, "hardware")), sql("SELECT * FROM %s ORDER BY id ASC NULLS LAST", tableName)); - sql("DELETE FROM %s t WHERE " + - "id NOT IN (SELECT * FROM deleted_id WHERE value IS NOT NULL) OR " + - "EXISTS (SELECT 1 FROM FROM deleted_dep WHERE t.dep = deleted_dep.value)", tableName); - assertEquals("Should have expected rows", + sql( + "DELETE FROM %s t WHERE " + + "id NOT IN (SELECT * FROM deleted_id WHERE value IS NOT NULL) OR " + + "EXISTS (SELECT 1 FROM FROM deleted_dep WHERE t.dep = deleted_dep.value)", + tableName); + assertEquals( + "Should have expected rows", ImmutableList.of(), sql("SELECT * FROM %s ORDER BY id ASC NULLS LAST", tableName)); } @@ -473,8 +522,10 @@ public void testDeleteWithNotInSubqueryNotSupported() throws NoSuchTableExceptio createOrReplaceView("deleted_id", Arrays.asList(-1, -2, null), Encoders.INT()); - AssertHelpers.assertThrows("Should complain about NOT IN subquery", - AnalysisException.class, "Null-aware predicate subqueries are not currently supported", + AssertHelpers.assertThrows( + "Should complain about NOT IN subquery", + AnalysisException.class, + "Null-aware predicate subqueries are not currently supported", () -> sql("DELETE FROM %s WHERE id NOT IN (SELECT * FROM deleted_id)", tableName)); } @@ -482,8 +533,10 @@ public void testDeleteWithNotInSubqueryNotSupported() throws NoSuchTableExceptio public void testDeleteOnNonIcebergTableNotSupported() throws NoSuchTableException { createOrReplaceView("testtable", "{ \"c1\": -100, \"c2\": -200 }"); - AssertHelpers.assertThrows("Delete is not supported for non iceberg table", - AnalysisException.class, "DELETE is only supported with v2 tables.", + AssertHelpers.assertThrows( + "Delete is not supported for non iceberg table", + AnalysisException.class, + "DELETE is only supported with v2 tables.", () -> sql("DELETE FROM %s WHERE c1 = -100", "testtable")); } @@ -496,25 +549,37 @@ public void testDeleteWithExistSubquery() throws NoSuchTableException { 
createOrReplaceView("deleted_id", Arrays.asList(-1, -2, null), Encoders.INT()); createOrReplaceView("deleted_dep", Arrays.asList("software", "hr"), Encoders.STRING()); - sql("DELETE FROM %s t WHERE EXISTS (SELECT 1 FROM deleted_id d WHERE t.id = d.value)", tableName); - assertEquals("Should have expected rows", + sql( + "DELETE FROM %s t WHERE EXISTS (SELECT 1 FROM deleted_id d WHERE t.id = d.value)", + tableName); + assertEquals( + "Should have expected rows", ImmutableList.of(row(1, "hr"), row(2, "hardware"), row(null, "hr")), sql("SELECT * FROM %s ORDER BY id ASC NULLS LAST", tableName)); - sql("DELETE FROM %s t WHERE EXISTS (SELECT 1 FROM deleted_id d WHERE t.id = d.value + 2)", tableName); - assertEquals("Should have expected rows", + sql( + "DELETE FROM %s t WHERE EXISTS (SELECT 1 FROM deleted_id d WHERE t.id = d.value + 2)", + tableName); + assertEquals( + "Should have expected rows", ImmutableList.of(row(2, "hardware"), row(null, "hr")), sql("SELECT * FROM %s ORDER BY id ASC NULLS LAST", tableName)); - sql("DELETE FROM %s t WHERE EXISTS (SELECT 1 FROM deleted_id d WHERE t.id = d.value) OR t.id IS NULL", tableName); - assertEquals("Should have expected rows", + sql( + "DELETE FROM %s t WHERE EXISTS (SELECT 1 FROM deleted_id d WHERE t.id = d.value) OR t.id IS NULL", + tableName); + assertEquals( + "Should have expected rows", ImmutableList.of(row(2, "hardware")), sql("SELECT * FROM %s", tableName)); - sql("DELETE FROM %s t WHERE " + - "EXISTS (SELECT 1 FROM deleted_id di WHERE t.id = di.value) AND " + - "EXISTS (SELECT 1 FROM deleted_dep dd WHERE t.dep = dd.value)", tableName); - assertEquals("Should have expected rows", + sql( + "DELETE FROM %s t WHERE " + + "EXISTS (SELECT 1 FROM deleted_id di WHERE t.id = di.value) AND " + + "EXISTS (SELECT 1 FROM deleted_dep dd WHERE t.dep = dd.value)", + tableName); + assertEquals( + "Should have expected rows", ImmutableList.of(row(2, "hardware")), sql("SELECT * FROM %s", tableName)); } @@ -528,21 +593,28 @@ public void testDeleteWithNotExistsSubquery() throws NoSuchTableException { createOrReplaceView("deleted_id", Arrays.asList(-1, -2, null), Encoders.INT()); createOrReplaceView("deleted_dep", Arrays.asList("software", "hr"), Encoders.STRING()); - sql("DELETE FROM %s t WHERE " + - "NOT EXISTS (SELECT 1 FROM deleted_id di WHERE t.id = di.value + 2) AND " + - "NOT EXISTS (SELECT 1 FROM deleted_dep dd WHERE t.dep = dd.value)", tableName); - assertEquals("Should have expected rows", + sql( + "DELETE FROM %s t WHERE " + + "NOT EXISTS (SELECT 1 FROM deleted_id di WHERE t.id = di.value + 2) AND " + + "NOT EXISTS (SELECT 1 FROM deleted_dep dd WHERE t.dep = dd.value)", + tableName); + assertEquals( + "Should have expected rows", ImmutableList.of(row(1, "hr"), row(null, "hr")), sql("SELECT * FROM %s ORDER BY id ASC NULLS LAST", tableName)); - sql("DELETE FROM %s t WHERE NOT EXISTS (SELECT 1 FROM deleted_id d WHERE t.id = d.value + 2)", tableName); - assertEquals("Should have expected rows", + sql( + "DELETE FROM %s t WHERE NOT EXISTS (SELECT 1 FROM deleted_id d WHERE t.id = d.value + 2)", + tableName); + assertEquals( + "Should have expected rows", ImmutableList.of(row(1, "hr")), sql("SELECT * FROM %s ORDER BY id ASC NULLS LAST", tableName)); String subquery = "SELECT 1 FROM deleted_id d WHERE t.id = d.value + 2"; sql("DELETE FROM %s t WHERE NOT EXISTS (%s) OR t.id = 1", tableName, subquery); - assertEquals("Should have expected rows", + assertEquals( + "Should have expected rows", ImmutableList.of(), sql("SELECT * FROM %s ORDER BY id ASC NULLS LAST", 
tableName)); } @@ -556,7 +628,8 @@ public void testDeleteWithScalarSubquery() throws NoSuchTableException { createOrReplaceView("deleted_id", Arrays.asList(1, 100, null), Encoders.INT()); sql("DELETE FROM %s t WHERE id <= (SELECT min(value) FROM deleted_id)", tableName); - assertEquals("Should have expected rows", + assertEquals( + "Should have expected rows", ImmutableList.of(row(2, "hardware"), row(null, "hr")), sql("SELECT * FROM %s ORDER BY id ASC NULLS LAST", tableName)); } @@ -591,52 +664,61 @@ public synchronized void testDeleteWithSerializableIsolation() throws Interrupte createAndInitUnpartitionedTable(); - sql("ALTER TABLE %s SET TBLPROPERTIES('%s' '%s')", tableName, DELETE_ISOLATION_LEVEL, "serializable"); + sql( + "ALTER TABLE %s SET TBLPROPERTIES('%s' '%s')", + tableName, DELETE_ISOLATION_LEVEL, "serializable"); // Pre-populate the table to force it to use the Spark Writers instead of Metadata-Only Delete // for more consistent exception stack List ids = ImmutableList.of(1, 2); - Dataset inputDF = spark.createDataset(ids, Encoders.INT()) - .withColumnRenamed("value", "id") - .withColumn("dep", lit("hr")); + Dataset inputDF = + spark + .createDataset(ids, Encoders.INT()) + .withColumnRenamed("value", "id") + .withColumn("dep", lit("hr")); try { inputDF.coalesce(1).writeTo(tableName).append(); } catch (NoSuchTableException e) { throw new RuntimeException(e); } - ExecutorService executorService = MoreExecutors.getExitingExecutorService( - (ThreadPoolExecutor) Executors.newFixedThreadPool(2)); + ExecutorService executorService = + MoreExecutors.getExitingExecutorService( + (ThreadPoolExecutor) Executors.newFixedThreadPool(2)); AtomicInteger barrier = new AtomicInteger(0); // delete thread - Future deleteFuture = executorService.submit(() -> { - for (int numOperations = 0; numOperations < Integer.MAX_VALUE; numOperations++) { - while (barrier.get() < numOperations * 2) { - sleep(10); - } - sql("DELETE FROM %s WHERE id = 1", tableName); - barrier.incrementAndGet(); - } - }); + Future deleteFuture = + executorService.submit( + () -> { + for (int numOperations = 0; numOperations < Integer.MAX_VALUE; numOperations++) { + while (barrier.get() < numOperations * 2) { + sleep(10); + } + sql("DELETE FROM %s WHERE id = 1", tableName); + barrier.incrementAndGet(); + } + }); // append thread - Future appendFuture = executorService.submit(() -> { - for (int numOperations = 0; numOperations < Integer.MAX_VALUE; numOperations++) { - while (barrier.get() < numOperations * 2) { - sleep(10); - } - - try { - inputDF.coalesce(1).writeTo(tableName).append(); - } catch (NoSuchTableException e) { - throw new RuntimeException(e); - } - - barrier.incrementAndGet(); - } - }); + Future appendFuture = + executorService.submit( + () -> { + for (int numOperations = 0; numOperations < Integer.MAX_VALUE; numOperations++) { + while (barrier.get() < numOperations * 2) { + sleep(10); + } + + try { + inputDF.coalesce(1).writeTo(tableName).append(); + } catch (NoSuchTableException e) { + throw new RuntimeException(e); + } + + barrier.incrementAndGet(); + } + }); try { deleteFuture.get(); @@ -647,7 +729,8 @@ public synchronized void testDeleteWithSerializableIsolation() throws Interrupte Throwable validationException = sparkException.getCause(); Assert.assertThat(validationException, CoreMatchers.instanceOf(ValidationException.class)); String errMsg = validationException.getMessage(); - Assert.assertThat(errMsg, CoreMatchers.containsString("Found conflicting files that can contain")); + Assert.assertThat( + errMsg, 
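
The serializable-isolation test above configures the behavior through a single table property before racing a DELETE thread against an append thread. The configuration step on its own, as a sketch with demo.db.employees as a placeholder table name:

    import static org.apache.iceberg.TableProperties.DELETE_ISOLATION_LEVEL;

    import org.apache.spark.sql.SparkSession;

    public class DeleteIsolationSketch {
      public static void main(String[] args) {
        SparkSession spark = SparkSession.builder().getOrCreate();

        // under "serializable", a concurrent append can make the DELETE commit fail with a
        // ValidationException about conflicting files, which is what the test provokes;
        // "snapshot" (used by the next test) relaxes that check
        spark.sql(String.format(
            "ALTER TABLE demo.db.employees SET TBLPROPERTIES('%s'='serializable')",
            DELETE_ISOLATION_LEVEL));
      }
    }
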
CoreMatchers.containsString("Found conflicting files that can contain")); } finally { appendFuture.cancel(true); } @@ -657,40 +740,48 @@ public synchronized void testDeleteWithSerializableIsolation() throws Interrupte } @Test - public synchronized void testDeleteWithSnapshotIsolation() throws InterruptedException, ExecutionException { + public synchronized void testDeleteWithSnapshotIsolation() + throws InterruptedException, ExecutionException { // cannot run tests with concurrency for Hadoop tables without atomic renames Assume.assumeFalse(catalogName.equalsIgnoreCase("testhadoop")); createAndInitUnpartitionedTable(); - sql("ALTER TABLE %s SET TBLPROPERTIES('%s' '%s')", tableName, DELETE_ISOLATION_LEVEL, "snapshot"); + sql( + "ALTER TABLE %s SET TBLPROPERTIES('%s' '%s')", + tableName, DELETE_ISOLATION_LEVEL, "snapshot"); - ExecutorService executorService = MoreExecutors.getExitingExecutorService( - (ThreadPoolExecutor) Executors.newFixedThreadPool(2)); + ExecutorService executorService = + MoreExecutors.getExitingExecutorService( + (ThreadPoolExecutor) Executors.newFixedThreadPool(2)); AtomicInteger barrier = new AtomicInteger(0); // delete thread - Future deleteFuture = executorService.submit(() -> { - for (int numOperations = 0; numOperations < 20; numOperations++) { - while (barrier.get() < numOperations * 2) { - sleep(10); - } - sql("DELETE FROM %s WHERE id = 1", tableName); - barrier.incrementAndGet(); - } - }); + Future deleteFuture = + executorService.submit( + () -> { + for (int numOperations = 0; numOperations < 20; numOperations++) { + while (barrier.get() < numOperations * 2) { + sleep(10); + } + sql("DELETE FROM %s WHERE id = 1", tableName); + barrier.incrementAndGet(); + } + }); // append thread - Future appendFuture = executorService.submit(() -> { - for (int numOperations = 0; numOperations < 20; numOperations++) { - while (barrier.get() < numOperations * 2) { - sleep(10); - } - sql("INSERT INTO TABLE %s VALUES (1, 'hr')", tableName); - barrier.incrementAndGet(); - } - }); + Future appendFuture = + executorService.submit( + () -> { + for (int numOperations = 0; numOperations < 20; numOperations++) { + while (barrier.get() < numOperations * 2) { + sleep(10); + } + sql("INSERT INTO TABLE %s VALUES (1, 'hr')", tableName); + barrier.incrementAndGet(); + } + }); try { deleteFuture.get(); @@ -714,7 +805,8 @@ public void testDeleteRefreshesRelationCache() throws NoSuchTableException { spark.sql("CACHE TABLE tmp"); - assertEquals("View should have correct data", + assertEquals( + "View should have correct data", ImmutableList.of(row(1, "hardware"), row(1, "hr")), sql("SELECT * FROM tmp ORDER BY id, dep")); @@ -726,11 +818,13 @@ public void testDeleteRefreshesRelationCache() throws NoSuchTableException { Snapshot currentSnapshot = table.currentSnapshot(); validateSnapshot(currentSnapshot, "overwrite", "2", "2", "2"); - assertEquals("Should have expected rows", + assertEquals( + "Should have expected rows", ImmutableList.of(row(2, "hardware"), row(3, "hr")), sql("SELECT * FROM %s ORDER BY id, dep", tableName)); - assertEquals("Should refresh the relation cache", + assertEquals( + "Should refresh the relation cache", ImmutableList.of(), sql("SELECT * FROM tmp ORDER BY id, dep")); diff --git a/spark/v3.0/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestExpireSnapshotsProcedure.java b/spark/v3.0/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestExpireSnapshotsProcedure.java index bc3e5e4230c6..7db64acf4a3f 100644 --- 
a/spark/v3.0/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestExpireSnapshotsProcedure.java +++ b/spark/v3.0/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestExpireSnapshotsProcedure.java @@ -16,9 +16,10 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.extensions; +import static org.apache.iceberg.TableProperties.GC_ENABLED; + import java.io.IOException; import java.sql.Timestamp; import java.time.Instant; @@ -37,11 +38,10 @@ import org.junit.Assert; import org.junit.Test; -import static org.apache.iceberg.TableProperties.GC_ENABLED; - public class TestExpireSnapshotsProcedure extends SparkExtensionsTestBase { - public TestExpireSnapshotsProcedure(String catalogName, String implementation, Map config) { + public TestExpireSnapshotsProcedure( + String catalogName, String implementation, Map config) { super(catalogName, implementation, config); } @@ -54,9 +54,7 @@ public void removeTables() { public void testExpireSnapshotsInEmptyTable() { sql("CREATE TABLE %s (id bigint NOT NULL, data string) USING iceberg", tableName); - List output = sql( - "CALL %s.system.expire_snapshots('%s')", - catalogName, tableIdent); + List output = sql("CALL %s.system.expire_snapshots('%s')", catalogName, tableIdent); assertEquals("Should not delete any files", ImmutableList.of(row(0L, 0L, 0L)), output); } @@ -75,17 +73,17 @@ public void testExpireSnapshotsUsingPositionalArgs() { table.refresh(); Snapshot secondSnapshot = table.currentSnapshot(); - Timestamp secondSnapshotTimestamp = Timestamp.from(Instant.ofEpochMilli(secondSnapshot.timestampMillis())); + Timestamp secondSnapshotTimestamp = + Timestamp.from(Instant.ofEpochMilli(secondSnapshot.timestampMillis())); Assert.assertEquals("Should be 2 snapshots", 2, Iterables.size(table.snapshots())); // expire without retainLast param - List output1 = sql( - "CALL %s.system.expire_snapshots('%s', TIMESTAMP '%s')", - catalogName, tableIdent, secondSnapshotTimestamp); - assertEquals("Procedure output must match", - ImmutableList.of(row(0L, 0L, 1L)), - output1); + List output1 = + sql( + "CALL %s.system.expire_snapshots('%s', TIMESTAMP '%s')", + catalogName, tableIdent, secondSnapshotTimestamp); + assertEquals("Procedure output must match", ImmutableList.of(row(0L, 0L, 1L)), output1); table.refresh(); @@ -93,7 +91,8 @@ public void testExpireSnapshotsUsingPositionalArgs() { sql("INSERT OVERWRITE %s VALUES (3, 'c')", tableName); sql("INSERT INTO TABLE %s VALUES (4, 'd')", tableName); - assertEquals("Should have expected rows", + assertEquals( + "Should have expected rows", ImmutableList.of(row(3L, "c"), row(4L, "d")), sql("SELECT * FROM %s ORDER BY id", tableName)); @@ -106,12 +105,11 @@ public void testExpireSnapshotsUsingPositionalArgs() { Assert.assertEquals("Should be 3 snapshots", 3, Iterables.size(table.snapshots())); // expire with retainLast param - List output = sql( - "CALL %s.system.expire_snapshots('%s', TIMESTAMP '%s', 2)", - catalogName, tableIdent, currentTimestamp); - assertEquals("Procedure output must match", - ImmutableList.of(row(2L, 2L, 1L)), - output); + List output = + sql( + "CALL %s.system.expire_snapshots('%s', TIMESTAMP '%s', 2)", + catalogName, tableIdent, currentTimestamp); + assertEquals("Procedure output must match", ImmutableList.of(row(2L, 2L, 1L)), output); } @Test @@ -129,15 +127,14 @@ public void testExpireSnapshotUsingNamedArgs() { Timestamp currentTimestamp = 
Timestamp.from(Instant.ofEpochMilli(System.currentTimeMillis())); - List output = sql( - "CALL %s.system.expire_snapshots(" + - "older_than => TIMESTAMP '%s'," + - "table => '%s'," + - "retain_last => 1)", - catalogName, currentTimestamp, tableIdent); - assertEquals("Procedure output must match", - ImmutableList.of(row(0L, 0L, 1L)), - output); + List output = + sql( + "CALL %s.system.expire_snapshots(" + + "older_than => TIMESTAMP '%s'," + + "table => '%s'," + + "retain_last => 1)", + catalogName, currentTimestamp, tableIdent); + assertEquals("Procedure output must match", ImmutableList.of(row(0L, 0L, 1L)), output); } @Test @@ -146,31 +143,43 @@ public void testExpireSnapshotsGCDisabled() { sql("ALTER TABLE %s SET TBLPROPERTIES ('%s' 'false')", tableName, GC_ENABLED); - AssertHelpers.assertThrows("Should reject call", - ValidationException.class, "Cannot expire snapshots: GC is disabled", + AssertHelpers.assertThrows( + "Should reject call", + ValidationException.class, + "Cannot expire snapshots: GC is disabled", () -> sql("CALL %s.system.expire_snapshots('%s')", catalogName, tableIdent)); } @Test public void testInvalidExpireSnapshotsCases() { - AssertHelpers.assertThrows("Should not allow mixed args", - AnalysisException.class, "Named and positional arguments cannot be mixed", + AssertHelpers.assertThrows( + "Should not allow mixed args", + AnalysisException.class, + "Named and positional arguments cannot be mixed", () -> sql("CALL %s.system.expire_snapshots('n', table => 't')", catalogName)); - AssertHelpers.assertThrows("Should not resolve procedures in arbitrary namespaces", - NoSuchProcedureException.class, "not found", + AssertHelpers.assertThrows( + "Should not resolve procedures in arbitrary namespaces", + NoSuchProcedureException.class, + "not found", () -> sql("CALL %s.custom.expire_snapshots('n', 't')", catalogName)); - AssertHelpers.assertThrows("Should reject calls without all required args", - AnalysisException.class, "Missing required parameters", + AssertHelpers.assertThrows( + "Should reject calls without all required args", + AnalysisException.class, + "Missing required parameters", () -> sql("CALL %s.system.expire_snapshots()", catalogName)); - AssertHelpers.assertThrows("Should reject calls with invalid arg types", - AnalysisException.class, "Wrong arg type", + AssertHelpers.assertThrows( + "Should reject calls with invalid arg types", + AnalysisException.class, + "Wrong arg type", () -> sql("CALL %s.system.expire_snapshots('n', 2.2)", catalogName)); - AssertHelpers.assertThrows("Should reject calls with empty table identifier", - IllegalArgumentException.class, "Cannot handle an empty identifier", + AssertHelpers.assertThrows( + "Should reject calls with empty table identifier", + IllegalArgumentException.class, + "Cannot handle an empty identifier", () -> sql("CALL %s.system.expire_snapshots('')", catalogName)); } @@ -179,13 +188,24 @@ public void testResolvingTableInAnotherCatalog() throws IOException { String anotherCatalog = "another_" + catalogName; spark.conf().set("spark.sql.catalog." + anotherCatalog, SparkCatalog.class.getName()); spark.conf().set("spark.sql.catalog." + anotherCatalog + ".type", "hadoop"); - spark.conf().set("spark.sql.catalog." 
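
For reference alongside these procedure tests, this is the named-argument CALL shape they exercise, issued directly; my_catalog and db.tbl are placeholders, and retaining the last snapshot mirrors the retain_last => 1 arguments above:

    import java.sql.Timestamp;
    import java.time.Instant;
    import java.util.List;

    import org.apache.spark.sql.Row;
    import org.apache.spark.sql.SparkSession;

    public class ExpireSnapshotsSketch {
      public static void main(String[] args) {
        SparkSession spark = SparkSession.builder().getOrCreate();
        Timestamp now = Timestamp.from(Instant.now());

        // expire snapshots older than "now" while always keeping the most recent one
        List<Row> output = spark.sql(String.format(
            "CALL my_catalog.system.expire_snapshots(older_than => TIMESTAMP '%s', table => 'db.tbl', retain_last => 1)",
            now)).collectAsList();

        // one row of counts, matching the row(...) triples asserted in the tests
        output.forEach(System.out::println);
      }
    }
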
+ anotherCatalog + ".warehouse", "file:" + temp.newFolder().toString()); - - sql("CREATE TABLE %s.%s (id bigint NOT NULL, data string) USING iceberg", anotherCatalog, tableIdent); - - AssertHelpers.assertThrows("Should reject calls for a table in another catalog", - IllegalArgumentException.class, "Cannot run procedure in catalog", - () -> sql("CALL %s.system.expire_snapshots('%s')", catalogName, anotherCatalog + "." + tableName)); + spark + .conf() + .set( + "spark.sql.catalog." + anotherCatalog + ".warehouse", + "file:" + temp.newFolder().toString()); + + sql( + "CREATE TABLE %s.%s (id bigint NOT NULL, data string) USING iceberg", + anotherCatalog, tableIdent); + + AssertHelpers.assertThrows( + "Should reject calls for a table in another catalog", + IllegalArgumentException.class, + "Cannot run procedure in catalog", + () -> + sql( + "CALL %s.system.expire_snapshots('%s')", + catalogName, anotherCatalog + "." + tableName)); } @Test @@ -198,31 +218,41 @@ public void testConcurrentExpireSnapshots() { sql("INSERT INTO TABLE %s VALUES (4, 'd')", tableName); Timestamp currentTimestamp = Timestamp.from(Instant.ofEpochMilli(System.currentTimeMillis())); - List output = sql( - "CALL %s.system.expire_snapshots(" + - "older_than => TIMESTAMP '%s'," + - "table => '%s'," + - "max_concurrent_deletes => %s," + - "retain_last => 1)", - catalogName, currentTimestamp, tableIdent, 4); - assertEquals("Expiring snapshots concurrently should succeed", ImmutableList.of(row(0L, 0L, 3L)), output); + List output = + sql( + "CALL %s.system.expire_snapshots(" + + "older_than => TIMESTAMP '%s'," + + "table => '%s'," + + "max_concurrent_deletes => %s," + + "retain_last => 1)", + catalogName, currentTimestamp, tableIdent, 4); + assertEquals( + "Expiring snapshots concurrently should succeed", + ImmutableList.of(row(0L, 0L, 3L)), + output); } @Test public void testConcurrentExpireSnapshotsWithInvalidInput() { sql("CREATE TABLE %s (id bigint NOT NULL, data string) USING iceberg", tableName); - AssertHelpers.assertThrows("Should throw an error when max_concurrent_deletes = 0", - IllegalArgumentException.class, "max_concurrent_deletes should have value > 0", - () -> sql("CALL %s.system.expire_snapshots(table => '%s', max_concurrent_deletes => %s)", - catalogName, tableIdent, 0)); - - AssertHelpers.assertThrows("Should throw an error when max_concurrent_deletes < 0 ", - IllegalArgumentException.class, "max_concurrent_deletes should have value > 0", - () -> sql( - "CALL %s.system.expire_snapshots(table => '%s', max_concurrent_deletes => %s)", - catalogName, tableIdent, -1)); - + AssertHelpers.assertThrows( + "Should throw an error when max_concurrent_deletes = 0", + IllegalArgumentException.class, + "max_concurrent_deletes should have value > 0", + () -> + sql( + "CALL %s.system.expire_snapshots(table => '%s', max_concurrent_deletes => %s)", + catalogName, tableIdent, 0)); + + AssertHelpers.assertThrows( + "Should throw an error when max_concurrent_deletes < 0 ", + IllegalArgumentException.class, + "max_concurrent_deletes should have value > 0", + () -> + sql( + "CALL %s.system.expire_snapshots(table => '%s', max_concurrent_deletes => %s)", + catalogName, tableIdent, -1)); } @Test @@ -240,13 +270,14 @@ public void testExpireSnapshotWithStreamResultsEnabled() { Timestamp currentTimestamp = Timestamp.from(Instant.ofEpochMilli(System.currentTimeMillis())); - List output = sql( - "CALL %s.system.expire_snapshots(" + - "older_than => TIMESTAMP '%s'," + - "table => '%s'," + - "retain_last => 1, " + - "stream_results => true)", - 
catalogName, currentTimestamp, tableIdent); + List output = + sql( + "CALL %s.system.expire_snapshots(" + + "older_than => TIMESTAMP '%s'," + + "table => '%s'," + + "retain_last => 1, " + + "stream_results => true)", + catalogName, currentTimestamp, tableIdent); assertEquals("Procedure output must match", ImmutableList.of(row(0L, 0L, 1L)), output); } @@ -268,18 +299,15 @@ public void testExpireSnapshotsProcedureWorksWithSqlComments() { Timestamp currentTimestamp = Timestamp.from(Instant.ofEpochMilli(System.currentTimeMillis())); String callStatement = - "/* CALL statement is used to expire snapshots */\n" + - "-- And we have single line comments as well \n" + - "/* And comments that span *multiple* \n" + - " lines */ CALL /* this is the actual CALL */ %s.system.expire_snapshots(" + - " older_than => TIMESTAMP '%s'," + - " table => '%s'," + - " retain_last => 1)"; - List output = sql( - callStatement, catalogName, currentTimestamp, tableIdent); - assertEquals("Procedure output must match", - ImmutableList.of(row(0L, 0L, 1L)), - output); + "/* CALL statement is used to expire snapshots */\n" + + "-- And we have single line comments as well \n" + + "/* And comments that span *multiple* \n" + + " lines */ CALL /* this is the actual CALL */ %s.system.expire_snapshots(" + + " older_than => TIMESTAMP '%s'," + + " table => '%s'," + + " retain_last => 1)"; + List output = sql(callStatement, catalogName, currentTimestamp, tableIdent); + assertEquals("Procedure output must match", ImmutableList.of(row(0L, 0L, 1L)), output); table.refresh(); diff --git a/spark/v3.0/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestIcebergExpressions.java b/spark/v3.0/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestIcebergExpressions.java index ce88814ce937..8d2e10ea17eb 100644 --- a/spark/v3.0/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestIcebergExpressions.java +++ b/spark/v3.0/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestIcebergExpressions.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.spark.extensions; import java.math.BigDecimal; @@ -31,7 +30,8 @@ public class TestIcebergExpressions extends SparkExtensionsTestBase { - public TestIcebergExpressions(String catalogName, String implementation, Map config) { + public TestIcebergExpressions( + String catalogName, String implementation, Map config) { super(catalogName, implementation, config); } @@ -44,26 +44,30 @@ public void removeTables() { @Test public void testTruncateExpressions() { - sql("CREATE TABLE %s ( " + - " int_c INT, long_c LONG, dec_c DECIMAL(4, 2), str_c STRING, binary_c BINARY " + - ") USING iceberg", tableName); + sql( + "CREATE TABLE %s ( " + + " int_c INT, long_c LONG, dec_c DECIMAL(4, 2), str_c STRING, binary_c BINARY " + + ") USING iceberg", + tableName); - sql("CREATE TEMPORARY VIEW emp " + - "AS SELECT * FROM VALUES (101, 10001, 10.65, '101-Employee', CAST('1234' AS BINARY)) " + - "AS EMP(int_c, long_c, dec_c, str_c, binary_c)"); + sql( + "CREATE TEMPORARY VIEW emp " + + "AS SELECT * FROM VALUES (101, 10001, 10.65, '101-Employee', CAST('1234' AS BINARY)) " + + "AS EMP(int_c, long_c, dec_c, str_c, binary_c)"); sql("INSERT INTO %s SELECT * FROM emp", tableName); Dataset df = spark.sql("SELECT * FROM " + tableName); df.select( - new Column(new IcebergTruncateTransform(df.col("int_c").expr(), 2)).as("int_c"), - new Column(new IcebergTruncateTransform(df.col("long_c").expr(), 2)).as("long_c"), - new Column(new IcebergTruncateTransform(df.col("dec_c").expr(), 50)).as("dec_c"), - new Column(new IcebergTruncateTransform(df.col("str_c").expr(), 2)).as("str_c"), - new Column(new IcebergTruncateTransform(df.col("binary_c").expr(), 2)).as("binary_c") - ).createOrReplaceTempView("v"); + new Column(new IcebergTruncateTransform(df.col("int_c").expr(), 2)).as("int_c"), + new Column(new IcebergTruncateTransform(df.col("long_c").expr(), 2)).as("long_c"), + new Column(new IcebergTruncateTransform(df.col("dec_c").expr(), 50)).as("dec_c"), + new Column(new IcebergTruncateTransform(df.col("str_c").expr(), 2)).as("str_c"), + new Column(new IcebergTruncateTransform(df.col("binary_c").expr(), 2)).as("binary_c")) + .createOrReplaceTempView("v"); - assertEquals("Should have expected rows", + assertEquals( + "Should have expected rows", ImmutableList.of(row(100, 10000L, new BigDecimal("10.50"), "10", "12")), sql("SELECT int_c, long_c, dec_c, str_c, CAST(binary_c AS STRING) FROM v")); } diff --git a/spark/v3.0/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestMerge.java b/spark/v3.0/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestMerge.java index 4a44cee49b5d..2d110276e2cb 100644 --- a/spark/v3.0/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestMerge.java +++ b/spark/v3.0/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestMerge.java @@ -16,9 +16,15 @@ * specific language governing permissions and limitations * under the License. 
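
The truncate assertions above reduce to width-based truncation per type. A small sketch that reproduces the expected values from the test (101 -> 100, 10001 -> 10000, 10.65 -> 10.50 with width 50, "101-Employee" -> "10"); the helper below only illustrates that arithmetic and is not Iceberg's implementation:

    import java.math.BigDecimal;
    import java.math.BigInteger;

    public class TruncateSketch {
      // truncate to a multiple of the width, rounding toward negative infinity
      static long truncate(long value, int width) {
        return value - (((value % width) + width) % width);
      }

      public static void main(String[] args) {
        System.out.println(truncate(101, 2));   // 100, as asserted for int_c
        System.out.println(truncate(10001, 2)); // 10000, as asserted for long_c

        // decimals truncate on the unscaled value: 10.65 with width 50 becomes 10.50
        BigDecimal dec = new BigDecimal("10.65");
        BigInteger unscaled = dec.unscaledValue();
        BigInteger width = BigInteger.valueOf(50);
        System.out.println(new BigDecimal(unscaled.subtract(unscaled.mod(width)), dec.scale()));

        // strings (and binary) truncate to a length prefix: width 2 keeps "10" and "12"
        System.out.println("101-Employee".substring(0, 2));
      }
    }
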
*/ - package org.apache.iceberg.spark.extensions; +import static org.apache.iceberg.TableProperties.MERGE_CARDINALITY_CHECK_ENABLED; +import static org.apache.iceberg.TableProperties.MERGE_ISOLATION_LEVEL; +import static org.apache.iceberg.TableProperties.PARQUET_ROW_GROUP_SIZE_BYTES; +import static org.apache.iceberg.TableProperties.SPLIT_SIZE; +import static org.apache.iceberg.TableProperties.WRITE_DISTRIBUTION_MODE; +import static org.apache.spark.sql.functions.lit; + import java.util.Arrays; import java.util.Collections; import java.util.List; @@ -51,17 +57,15 @@ import org.junit.BeforeClass; import org.junit.Test; -import static org.apache.iceberg.TableProperties.MERGE_CARDINALITY_CHECK_ENABLED; -import static org.apache.iceberg.TableProperties.MERGE_ISOLATION_LEVEL; -import static org.apache.iceberg.TableProperties.PARQUET_ROW_GROUP_SIZE_BYTES; -import static org.apache.iceberg.TableProperties.SPLIT_SIZE; -import static org.apache.iceberg.TableProperties.WRITE_DISTRIBUTION_MODE; -import static org.apache.spark.sql.functions.lit; - public abstract class TestMerge extends SparkRowLevelOperationsTestBase { - public TestMerge(String catalogName, String implementation, Map config, - String fileFormat, boolean vectorized, String distributionMode) { + public TestMerge( + String catalogName, + String implementation, + Map config, + String fileFormat, + boolean vectorized, + String distributionMode) { super(catalogName, implementation, config, fileFormat, vectorized, distributionMode); } @@ -82,318 +86,395 @@ public void removeTables() { public void testMergeIntoEmptyTargetInsertAllNonMatchingRows() { createAndInitTable("id INT, dep STRING"); - createOrReplaceView("source", "id INT, dep STRING", - "{ \"id\": 1, \"dep\": \"emp-id-1\" }\n" + - "{ \"id\": 2, \"dep\": \"emp-id-2\" }\n" + - "{ \"id\": 3, \"dep\": \"emp-id-3\" }"); - - sql("MERGE INTO %s AS t USING source AS s " + - "ON t.id == s.id " + - "WHEN NOT MATCHED THEN " + - " INSERT *", tableName); - - ImmutableList expectedRows = ImmutableList.of( - row(1, "emp-id-1"), // new - row(2, "emp-id-2"), // new - row(3, "emp-id-3") // new - ); - assertEquals("Should have expected rows", expectedRows, sql("SELECT * FROM %s ORDER BY id", tableName)); + createOrReplaceView( + "source", + "id INT, dep STRING", + "{ \"id\": 1, \"dep\": \"emp-id-1\" }\n" + + "{ \"id\": 2, \"dep\": \"emp-id-2\" }\n" + + "{ \"id\": 3, \"dep\": \"emp-id-3\" }"); + + sql( + "MERGE INTO %s AS t USING source AS s " + + "ON t.id == s.id " + + "WHEN NOT MATCHED THEN " + + " INSERT *", + tableName); + + ImmutableList expectedRows = + ImmutableList.of( + row(1, "emp-id-1"), // new + row(2, "emp-id-2"), // new + row(3, "emp-id-3") // new + ); + assertEquals( + "Should have expected rows", expectedRows, sql("SELECT * FROM %s ORDER BY id", tableName)); } @Test public void testMergeIntoEmptyTargetInsertOnlyMatchingRows() { createAndInitTable("id INT, dep STRING"); - createOrReplaceView("source", "id INT, dep STRING", - "{ \"id\": 1, \"dep\": \"emp-id-1\" }\n" + - "{ \"id\": 2, \"dep\": \"emp-id-2\" }\n" + - "{ \"id\": 3, \"dep\": \"emp-id-3\" }"); - - sql("MERGE INTO %s AS t USING source AS s " + - "ON t.id == s.id " + - "WHEN NOT MATCHED AND (s.id >=2) THEN " + - " INSERT *", tableName); - - ImmutableList expectedRows = ImmutableList.of( - row(2, "emp-id-2"), // new - row(3, "emp-id-3") // new - ); - assertEquals("Should have expected rows", expectedRows, sql("SELECT * FROM %s ORDER BY id", tableName)); + createOrReplaceView( + "source", + "id INT, dep STRING", + "{ \"id\": 1, 
\"dep\": \"emp-id-1\" }\n" + + "{ \"id\": 2, \"dep\": \"emp-id-2\" }\n" + + "{ \"id\": 3, \"dep\": \"emp-id-3\" }"); + + sql( + "MERGE INTO %s AS t USING source AS s " + + "ON t.id == s.id " + + "WHEN NOT MATCHED AND (s.id >=2) THEN " + + " INSERT *", + tableName); + + ImmutableList expectedRows = + ImmutableList.of( + row(2, "emp-id-2"), // new + row(3, "emp-id-3") // new + ); + assertEquals( + "Should have expected rows", expectedRows, sql("SELECT * FROM %s ORDER BY id", tableName)); } @Test public void testMergeWithOnlyUpdateClause() { - createAndInitTable("id INT, dep STRING", - "{ \"id\": 1, \"dep\": \"emp-id-one\" }\n" + - "{ \"id\": 6, \"dep\": \"emp-id-six\" }"); - - createOrReplaceView("source", "id INT, dep STRING", - "{ \"id\": 2, \"dep\": \"emp-id-2\" }\n" + - "{ \"id\": 1, \"dep\": \"emp-id-1\" }\n" + - "{ \"id\": 6, \"dep\": \"emp-id-6\" }"); - - sql("MERGE INTO %s AS t USING source AS s " + - "ON t.id == s.id " + - "WHEN MATCHED AND t.id = 1 THEN " + - " UPDATE SET *", tableName); - - ImmutableList expectedRows = ImmutableList.of( - row(1, "emp-id-1"), // updated - row(6, "emp-id-six") // kept - ); - assertEquals("Should have expected rows", expectedRows, sql("SELECT * FROM %s ORDER BY id", tableName)); + createAndInitTable( + "id INT, dep STRING", + "{ \"id\": 1, \"dep\": \"emp-id-one\" }\n" + "{ \"id\": 6, \"dep\": \"emp-id-six\" }"); + + createOrReplaceView( + "source", + "id INT, dep STRING", + "{ \"id\": 2, \"dep\": \"emp-id-2\" }\n" + + "{ \"id\": 1, \"dep\": \"emp-id-1\" }\n" + + "{ \"id\": 6, \"dep\": \"emp-id-6\" }"); + + sql( + "MERGE INTO %s AS t USING source AS s " + + "ON t.id == s.id " + + "WHEN MATCHED AND t.id = 1 THEN " + + " UPDATE SET *", + tableName); + + ImmutableList expectedRows = + ImmutableList.of( + row(1, "emp-id-1"), // updated + row(6, "emp-id-six") // kept + ); + assertEquals( + "Should have expected rows", expectedRows, sql("SELECT * FROM %s ORDER BY id", tableName)); } @Test public void testMergeWithOnlyDeleteClause() { - createAndInitTable("id INT, dep STRING", - "{ \"id\": 1, \"dep\": \"emp-id-one\" }\n" + - "{ \"id\": 6, \"dep\": \"emp-id-6\" }"); - - createOrReplaceView("source", "id INT, dep STRING", - "{ \"id\": 2, \"dep\": \"emp-id-2\" }\n" + - "{ \"id\": 1, \"dep\": \"emp-id-1\" }\n" + - "{ \"id\": 6, \"dep\": \"emp-id-6\" }"); - - sql("MERGE INTO %s AS t USING source AS s " + - "ON t.id == s.id " + - "WHEN MATCHED AND t.id = 6 THEN " + - " DELETE", tableName); - - ImmutableList expectedRows = ImmutableList.of( - row(1, "emp-id-one") // kept - ); - assertEquals("Should have expected rows", expectedRows, sql("SELECT * FROM %s ORDER BY id", tableName)); + createAndInitTable( + "id INT, dep STRING", + "{ \"id\": 1, \"dep\": \"emp-id-one\" }\n" + "{ \"id\": 6, \"dep\": \"emp-id-6\" }"); + + createOrReplaceView( + "source", + "id INT, dep STRING", + "{ \"id\": 2, \"dep\": \"emp-id-2\" }\n" + + "{ \"id\": 1, \"dep\": \"emp-id-1\" }\n" + + "{ \"id\": 6, \"dep\": \"emp-id-6\" }"); + + sql( + "MERGE INTO %s AS t USING source AS s " + + "ON t.id == s.id " + + "WHEN MATCHED AND t.id = 6 THEN " + + " DELETE", + tableName); + + ImmutableList expectedRows = + ImmutableList.of( + row(1, "emp-id-one") // kept + ); + assertEquals( + "Should have expected rows", expectedRows, sql("SELECT * FROM %s ORDER BY id", tableName)); } @Test public void testMergeWithAllCauses() { - createAndInitTable("id INT, dep STRING", - "{ \"id\": 1, \"dep\": \"emp-id-one\" }\n" + - "{ \"id\": 6, \"dep\": \"emp-id-6\" }"); - - createOrReplaceView("source", "id INT, dep 
STRING", - "{ \"id\": 2, \"dep\": \"emp-id-2\" }\n" + - "{ \"id\": 1, \"dep\": \"emp-id-1\" }\n" + - "{ \"id\": 6, \"dep\": \"emp-id-6\" }"); - - sql("MERGE INTO %s AS t USING source AS s " + - "ON t.id == s.id " + - "WHEN MATCHED AND t.id = 1 THEN " + - " UPDATE SET * " + - "WHEN MATCHED AND t.id = 6 THEN " + - " DELETE " + - "WHEN NOT MATCHED AND s.id = 2 THEN " + - " INSERT *", tableName); - - ImmutableList expectedRows = ImmutableList.of( - row(1, "emp-id-1"), // updated - row(2, "emp-id-2") // new - ); - assertEquals("Should have expected rows", expectedRows, sql("SELECT * FROM %s ORDER BY id", tableName)); + createAndInitTable( + "id INT, dep STRING", + "{ \"id\": 1, \"dep\": \"emp-id-one\" }\n" + "{ \"id\": 6, \"dep\": \"emp-id-6\" }"); + + createOrReplaceView( + "source", + "id INT, dep STRING", + "{ \"id\": 2, \"dep\": \"emp-id-2\" }\n" + + "{ \"id\": 1, \"dep\": \"emp-id-1\" }\n" + + "{ \"id\": 6, \"dep\": \"emp-id-6\" }"); + + sql( + "MERGE INTO %s AS t USING source AS s " + + "ON t.id == s.id " + + "WHEN MATCHED AND t.id = 1 THEN " + + " UPDATE SET * " + + "WHEN MATCHED AND t.id = 6 THEN " + + " DELETE " + + "WHEN NOT MATCHED AND s.id = 2 THEN " + + " INSERT *", + tableName); + + ImmutableList expectedRows = + ImmutableList.of( + row(1, "emp-id-1"), // updated + row(2, "emp-id-2") // new + ); + assertEquals( + "Should have expected rows", expectedRows, sql("SELECT * FROM %s ORDER BY id", tableName)); } @Test public void testMergeWithAllCausesWithExplicitColumnSpecification() { - createAndInitTable("id INT, dep STRING", - "{ \"id\": 1, \"dep\": \"emp-id-one\" }\n" + - "{ \"id\": 6, \"dep\": \"emp-id-6\" }"); - - createOrReplaceView("source", "id INT, dep STRING", - "{ \"id\": 2, \"dep\": \"emp-id-2\" }\n" + - "{ \"id\": 1, \"dep\": \"emp-id-1\" }\n" + - "{ \"id\": 6, \"dep\": \"emp-id-6\" }"); - - sql("MERGE INTO %s AS t USING source AS s " + - "ON t.id == s.id " + - "WHEN MATCHED AND t.id = 1 THEN " + - " UPDATE SET t.id = s.id, t.dep = s.dep " + - "WHEN MATCHED AND t.id = 6 THEN " + - " DELETE " + - "WHEN NOT MATCHED AND s.id = 2 THEN " + - " INSERT (t.id, t.dep) VALUES (s.id, s.dep)", tableName); - - ImmutableList expectedRows = ImmutableList.of( - row(1, "emp-id-1"), // updated - row(2, "emp-id-2") // new - ); - assertEquals("Should have expected rows", expectedRows, sql("SELECT * FROM %s ORDER BY id", tableName)); + createAndInitTable( + "id INT, dep STRING", + "{ \"id\": 1, \"dep\": \"emp-id-one\" }\n" + "{ \"id\": 6, \"dep\": \"emp-id-6\" }"); + + createOrReplaceView( + "source", + "id INT, dep STRING", + "{ \"id\": 2, \"dep\": \"emp-id-2\" }\n" + + "{ \"id\": 1, \"dep\": \"emp-id-1\" }\n" + + "{ \"id\": 6, \"dep\": \"emp-id-6\" }"); + + sql( + "MERGE INTO %s AS t USING source AS s " + + "ON t.id == s.id " + + "WHEN MATCHED AND t.id = 1 THEN " + + " UPDATE SET t.id = s.id, t.dep = s.dep " + + "WHEN MATCHED AND t.id = 6 THEN " + + " DELETE " + + "WHEN NOT MATCHED AND s.id = 2 THEN " + + " INSERT (t.id, t.dep) VALUES (s.id, s.dep)", + tableName); + + ImmutableList expectedRows = + ImmutableList.of( + row(1, "emp-id-1"), // updated + row(2, "emp-id-2") // new + ); + assertEquals( + "Should have expected rows", expectedRows, sql("SELECT * FROM %s ORDER BY id", tableName)); } @Test public void testMergeWithSourceCTE() { - createAndInitTable("id INT, dep STRING", - "{ \"id\": 2, \"dep\": \"emp-id-two\" }\n" + - "{ \"id\": 6, \"dep\": \"emp-id-6\" }"); - - createOrReplaceView("source", "id INT, dep STRING", - "{ \"id\": 2, \"dep\": \"emp-id-3\" }\n" + - "{ \"id\": 1, \"dep\": 
\"emp-id-2\" }\n" + - "{ \"id\": 5, \"dep\": \"emp-id-6\" }"); - - sql("WITH cte1 AS (SELECT id + 1 AS id, dep FROM source) " + - "MERGE INTO %s AS t USING cte1 AS s " + - "ON t.id == s.id " + - "WHEN MATCHED AND t.id = 2 THEN " + - " UPDATE SET * " + - "WHEN MATCHED AND t.id = 6 THEN " + - " DELETE " + - "WHEN NOT MATCHED AND s.id = 3 THEN " + - " INSERT *", tableName); - - ImmutableList expectedRows = ImmutableList.of( - row(2, "emp-id-2"), // updated - row(3, "emp-id-3") // new - ); - assertEquals("Should have expected rows", expectedRows, sql("SELECT * FROM %s ORDER BY id", tableName)); + createAndInitTable( + "id INT, dep STRING", + "{ \"id\": 2, \"dep\": \"emp-id-two\" }\n" + "{ \"id\": 6, \"dep\": \"emp-id-6\" }"); + + createOrReplaceView( + "source", + "id INT, dep STRING", + "{ \"id\": 2, \"dep\": \"emp-id-3\" }\n" + + "{ \"id\": 1, \"dep\": \"emp-id-2\" }\n" + + "{ \"id\": 5, \"dep\": \"emp-id-6\" }"); + + sql( + "WITH cte1 AS (SELECT id + 1 AS id, dep FROM source) " + + "MERGE INTO %s AS t USING cte1 AS s " + + "ON t.id == s.id " + + "WHEN MATCHED AND t.id = 2 THEN " + + " UPDATE SET * " + + "WHEN MATCHED AND t.id = 6 THEN " + + " DELETE " + + "WHEN NOT MATCHED AND s.id = 3 THEN " + + " INSERT *", + tableName); + + ImmutableList expectedRows = + ImmutableList.of( + row(2, "emp-id-2"), // updated + row(3, "emp-id-3") // new + ); + assertEquals( + "Should have expected rows", expectedRows, sql("SELECT * FROM %s ORDER BY id", tableName)); } @Test public void testMergeWithSourceFromSetOps() { - createAndInitTable("id INT, dep STRING", - "{ \"id\": 1, \"dep\": \"emp-id-one\" }\n" + - "{ \"id\": 6, \"dep\": \"emp-id-6\" }"); + createAndInitTable( + "id INT, dep STRING", + "{ \"id\": 1, \"dep\": \"emp-id-one\" }\n" + "{ \"id\": 6, \"dep\": \"emp-id-6\" }"); - createOrReplaceView("source", "id INT, dep STRING", - "{ \"id\": 2, \"dep\": \"emp-id-2\" }\n" + - "{ \"id\": 1, \"dep\": \"emp-id-1\" }\n" + - "{ \"id\": 6, \"dep\": \"emp-id-6\" }"); + createOrReplaceView( + "source", + "id INT, dep STRING", + "{ \"id\": 2, \"dep\": \"emp-id-2\" }\n" + + "{ \"id\": 1, \"dep\": \"emp-id-1\" }\n" + + "{ \"id\": 6, \"dep\": \"emp-id-6\" }"); String derivedSource = - "SELECT * FROM source WHERE id = 2 " + - "UNION ALL " + - "SELECT * FROM source WHERE id = 1 OR id = 6"; - - sql("MERGE INTO %s AS t USING (%s) AS s " + - "ON t.id == s.id " + - "WHEN MATCHED AND t.id = 1 THEN " + - " UPDATE SET * " + - "WHEN MATCHED AND t.id = 6 THEN " + - " DELETE " + - "WHEN NOT MATCHED AND s.id = 2 THEN " + - " INSERT *", tableName, derivedSource); - - ImmutableList expectedRows = ImmutableList.of( - row(1, "emp-id-1"), // updated - row(2, "emp-id-2") // new - ); - assertEquals("Should have expected rows", expectedRows, sql("SELECT * FROM %s ORDER BY id", tableName)); + "SELECT * FROM source WHERE id = 2 " + + "UNION ALL " + + "SELECT * FROM source WHERE id = 1 OR id = 6"; + + sql( + "MERGE INTO %s AS t USING (%s) AS s " + + "ON t.id == s.id " + + "WHEN MATCHED AND t.id = 1 THEN " + + " UPDATE SET * " + + "WHEN MATCHED AND t.id = 6 THEN " + + " DELETE " + + "WHEN NOT MATCHED AND s.id = 2 THEN " + + " INSERT *", + tableName, derivedSource); + + ImmutableList expectedRows = + ImmutableList.of( + row(1, "emp-id-1"), // updated + row(2, "emp-id-2") // new + ); + assertEquals( + "Should have expected rows", expectedRows, sql("SELECT * FROM %s ORDER BY id", tableName)); } @Test public void testMergeWithMultipleUpdatesForTargetRow() { - createAndInitTable("id INT, dep STRING", - "{ \"id\": 1, \"dep\": \"emp-id-one\" }\n" + 
- "{ \"id\": 6, \"dep\": \"emp-id-6\" }"); + createAndInitTable( + "id INT, dep STRING", + "{ \"id\": 1, \"dep\": \"emp-id-one\" }\n" + "{ \"id\": 6, \"dep\": \"emp-id-6\" }"); - createOrReplaceView("source", "id INT, dep STRING", - "{ \"id\": 1, \"dep\": \"emp-id-1\" }\n" + - "{ \"id\": 1, \"dep\": \"emp-id-1\" }\n" + - "{ \"id\": 2, \"dep\": \"emp-id-2\" }\n" + - "{ \"id\": 6, \"dep\": \"emp-id-6\" }"); + createOrReplaceView( + "source", + "id INT, dep STRING", + "{ \"id\": 1, \"dep\": \"emp-id-1\" }\n" + + "{ \"id\": 1, \"dep\": \"emp-id-1\" }\n" + + "{ \"id\": 2, \"dep\": \"emp-id-2\" }\n" + + "{ \"id\": 6, \"dep\": \"emp-id-6\" }"); String errorMsg = "a single row from the target table with multiple rows of the source table"; - AssertHelpers.assertThrows("Should complain non iceberg target table", - SparkException.class, errorMsg, + AssertHelpers.assertThrows( + "Should complain non iceberg target table", + SparkException.class, + errorMsg, () -> { - sql("MERGE INTO %s AS t USING source AS s " + - "ON t.id == s.id " + - "WHEN MATCHED AND t.id = 1 THEN " + - " UPDATE SET * " + - "WHEN MATCHED AND t.id = 6 THEN " + - " DELETE " + - "WHEN NOT MATCHED AND s.id = 2 THEN " + - " INSERT *", tableName); + sql( + "MERGE INTO %s AS t USING source AS s " + + "ON t.id == s.id " + + "WHEN MATCHED AND t.id = 1 THEN " + + " UPDATE SET * " + + "WHEN MATCHED AND t.id = 6 THEN " + + " DELETE " + + "WHEN NOT MATCHED AND s.id = 2 THEN " + + " INSERT *", + tableName); }); - assertEquals("Target should be unchanged", + assertEquals( + "Target should be unchanged", ImmutableList.of(row(1, "emp-id-one"), row(6, "emp-id-6")), sql("SELECT * FROM %s ORDER BY id ASC NULLS LAST", tableName)); } @Test public void testMergeWithDisabledCardinalityCheck() { - createAndInitTable("id INT, dep STRING", - "{ \"id\": 1, \"dep\": \"emp-id-one\" }\n" + - "{ \"id\": 6, \"dep\": \"emp-id-6\" }"); + createAndInitTable( + "id INT, dep STRING", + "{ \"id\": 1, \"dep\": \"emp-id-one\" }\n" + "{ \"id\": 6, \"dep\": \"emp-id-6\" }"); - createOrReplaceView("source", "id INT, dep STRING", - "{ \"id\": 1, \"dep\": \"emp-id-1\" }\n" + - "{ \"id\": 1, \"dep\": \"emp-id-1\" }\n" + - "{ \"id\": 2, \"dep\": \"emp-id-2\" }\n" + - "{ \"id\": 6, \"dep\": \"emp-id-6\" }"); + createOrReplaceView( + "source", + "id INT, dep STRING", + "{ \"id\": 1, \"dep\": \"emp-id-1\" }\n" + + "{ \"id\": 1, \"dep\": \"emp-id-1\" }\n" + + "{ \"id\": 2, \"dep\": \"emp-id-2\" }\n" + + "{ \"id\": 6, \"dep\": \"emp-id-6\" }"); try { // disable the cardinality check - sql("ALTER TABLE %s SET TBLPROPERTIES('%s' '%b')", tableName, MERGE_CARDINALITY_CHECK_ENABLED, false); - - sql("MERGE INTO %s AS t USING source AS s " + - "ON t.id == s.id " + - "WHEN MATCHED AND t.id = 1 THEN " + - " UPDATE SET * " + - "WHEN MATCHED AND t.id = 6 THEN " + - " DELETE " + - "WHEN NOT MATCHED AND s.id = 2 THEN " + - " INSERT *", tableName); + sql( + "ALTER TABLE %s SET TBLPROPERTIES('%s' '%b')", + tableName, MERGE_CARDINALITY_CHECK_ENABLED, false); + + sql( + "MERGE INTO %s AS t USING source AS s " + + "ON t.id == s.id " + + "WHEN MATCHED AND t.id = 1 THEN " + + " UPDATE SET * " + + "WHEN MATCHED AND t.id = 6 THEN " + + " DELETE " + + "WHEN NOT MATCHED AND s.id = 2 THEN " + + " INSERT *", + tableName); } finally { - sql("ALTER TABLE %s SET TBLPROPERTIES('%s' '%b')", tableName, MERGE_CARDINALITY_CHECK_ENABLED, true); + sql( + "ALTER TABLE %s SET TBLPROPERTIES('%s' '%b')", + tableName, MERGE_CARDINALITY_CHECK_ENABLED, true); } - assertEquals("Should have expected rows", + assertEquals( + 
"Should have expected rows", ImmutableList.of(row(1, "emp-id-1"), row(1, "emp-id-1"), row(2, "emp-id-2")), sql("SELECT * FROM %s ORDER BY id ASC NULLS LAST", tableName)); } @Test public void testMergeWithUnconditionalDelete() { - createAndInitTable("id INT, dep STRING", - "{ \"id\": 1, \"dep\": \"emp-id-one\" }\n" + - "{ \"id\": 6, \"dep\": \"emp-id-6\" }"); - - createOrReplaceView("source", "id INT, dep STRING", - "{ \"id\": 1, \"dep\": \"emp-id-1\" }\n" + - "{ \"id\": 1, \"dep\": \"emp-id-1\" }\n" + - "{ \"id\": 2, \"dep\": \"emp-id-2\" }\n" + - "{ \"id\": 6, \"dep\": \"emp-id-6\" }"); - - sql("MERGE INTO %s AS t USING source AS s " + - "ON t.id == s.id " + - "WHEN MATCHED THEN " + - " DELETE " + - "WHEN NOT MATCHED AND s.id = 2 THEN " + - " INSERT *", tableName); - - ImmutableList expectedRows = ImmutableList.of( - row(2, "emp-id-2") // new - ); - assertEquals("Should have expected rows", expectedRows, sql("SELECT * FROM %s ORDER BY id", tableName)); + createAndInitTable( + "id INT, dep STRING", + "{ \"id\": 1, \"dep\": \"emp-id-one\" }\n" + "{ \"id\": 6, \"dep\": \"emp-id-6\" }"); + + createOrReplaceView( + "source", + "id INT, dep STRING", + "{ \"id\": 1, \"dep\": \"emp-id-1\" }\n" + + "{ \"id\": 1, \"dep\": \"emp-id-1\" }\n" + + "{ \"id\": 2, \"dep\": \"emp-id-2\" }\n" + + "{ \"id\": 6, \"dep\": \"emp-id-6\" }"); + + sql( + "MERGE INTO %s AS t USING source AS s " + + "ON t.id == s.id " + + "WHEN MATCHED THEN " + + " DELETE " + + "WHEN NOT MATCHED AND s.id = 2 THEN " + + " INSERT *", + tableName); + + ImmutableList expectedRows = + ImmutableList.of( + row(2, "emp-id-2") // new + ); + assertEquals( + "Should have expected rows", expectedRows, sql("SELECT * FROM %s ORDER BY id", tableName)); } @Test public void testMergeWithSingleConditionalDelete() { - createAndInitTable("id INT, dep STRING", - "{ \"id\": 1, \"dep\": \"emp-id-one\" }\n" + - "{ \"id\": 6, \"dep\": \"emp-id-6\" }"); + createAndInitTable( + "id INT, dep STRING", + "{ \"id\": 1, \"dep\": \"emp-id-one\" }\n" + "{ \"id\": 6, \"dep\": \"emp-id-6\" }"); - createOrReplaceView("source", "id INT, dep STRING", - "{ \"id\": 1, \"dep\": \"emp-id-1\" }\n" + - "{ \"id\": 1, \"dep\": \"emp-id-1\" }\n" + - "{ \"id\": 2, \"dep\": \"emp-id-2\" }\n" + - "{ \"id\": 6, \"dep\": \"emp-id-6\" }"); + createOrReplaceView( + "source", + "id INT, dep STRING", + "{ \"id\": 1, \"dep\": \"emp-id-1\" }\n" + + "{ \"id\": 1, \"dep\": \"emp-id-1\" }\n" + + "{ \"id\": 2, \"dep\": \"emp-id-2\" }\n" + + "{ \"id\": 6, \"dep\": \"emp-id-6\" }"); String errorMsg = "a single row from the target table with multiple rows of the source table"; - AssertHelpers.assertThrows("Should complain non iceberg target table", - SparkException.class, errorMsg, + AssertHelpers.assertThrows( + "Should complain non iceberg target table", + SparkException.class, + errorMsg, () -> { - sql("MERGE INTO %s AS t USING source AS s " + - "ON t.id == s.id " + - "WHEN MATCHED AND t.id = 1 THEN " + - " DELETE " + - "WHEN NOT MATCHED AND s.id = 2 THEN " + - " INSERT *", tableName); + sql( + "MERGE INTO %s AS t USING source AS s " + + "ON t.id == s.id " + + "WHEN MATCHED AND t.id = 1 THEN " + + " DELETE " + + "WHEN NOT MATCHED AND s.id = 2 THEN " + + " INSERT *", + tableName); }); - assertEquals("Target should be unchanged", + assertEquals( + "Target should be unchanged", ImmutableList.of(row(1, "emp-id-one"), row(6, "emp-id-6")), sql("SELECT * FROM %s ORDER BY id ASC NULLS LAST", tableName)); } @@ -403,31 +484,41 @@ public void testMergeWithIdentityTransform() { for (DistributionMode mode : 
DistributionMode.values()) { createAndInitTable("id INT, dep STRING"); sql("ALTER TABLE %s ADD PARTITION FIELD identity(dep)", tableName); - sql("ALTER TABLE %s SET TBLPROPERTIES('%s' '%s')", tableName, WRITE_DISTRIBUTION_MODE, mode.modeName()); - - append(tableName, - "{ \"id\": 1, \"dep\": \"emp-id-one\" }\n" + - "{ \"id\": 6, \"dep\": \"emp-id-6\" }"); - - createOrReplaceView("source", "id INT, dep STRING", - "{ \"id\": 2, \"dep\": \"emp-id-2\" }\n" + - "{ \"id\": 1, \"dep\": \"emp-id-1\" }\n" + - "{ \"id\": 6, \"dep\": \"emp-id-6\" }"); - - sql("MERGE INTO %s AS t USING source AS s " + - "ON t.id == s.id " + - "WHEN MATCHED AND t.id = 1 THEN " + - " UPDATE SET * " + - "WHEN MATCHED AND t.id = 6 THEN " + - " DELETE " + - "WHEN NOT MATCHED AND s.id = 2 THEN " + - " INSERT *", tableName); - - ImmutableList expectedRows = ImmutableList.of( - row(1, "emp-id-1"), // updated - row(2, "emp-id-2") // new - ); - assertEquals("Should have expected rows", expectedRows, sql("SELECT * FROM %s ORDER BY id", tableName)); + sql( + "ALTER TABLE %s SET TBLPROPERTIES('%s' '%s')", + tableName, WRITE_DISTRIBUTION_MODE, mode.modeName()); + + append( + tableName, + "{ \"id\": 1, \"dep\": \"emp-id-one\" }\n" + "{ \"id\": 6, \"dep\": \"emp-id-6\" }"); + + createOrReplaceView( + "source", + "id INT, dep STRING", + "{ \"id\": 2, \"dep\": \"emp-id-2\" }\n" + + "{ \"id\": 1, \"dep\": \"emp-id-1\" }\n" + + "{ \"id\": 6, \"dep\": \"emp-id-6\" }"); + + sql( + "MERGE INTO %s AS t USING source AS s " + + "ON t.id == s.id " + + "WHEN MATCHED AND t.id = 1 THEN " + + " UPDATE SET * " + + "WHEN MATCHED AND t.id = 6 THEN " + + " DELETE " + + "WHEN NOT MATCHED AND s.id = 2 THEN " + + " INSERT *", + tableName); + + ImmutableList expectedRows = + ImmutableList.of( + row(1, "emp-id-1"), // updated + row(2, "emp-id-2") // new + ); + assertEquals( + "Should have expected rows", + expectedRows, + sql("SELECT * FROM %s ORDER BY id", tableName)); removeTables(); } @@ -438,31 +529,41 @@ public void testMergeWithDaysTransform() { for (DistributionMode mode : DistributionMode.values()) { createAndInitTable("id INT, ts TIMESTAMP"); sql("ALTER TABLE %s ADD PARTITION FIELD days(ts)", tableName); - sql("ALTER TABLE %s SET TBLPROPERTIES('%s' '%s')", tableName, WRITE_DISTRIBUTION_MODE, mode.modeName()); - - append(tableName, "id INT, ts TIMESTAMP", - "{ \"id\": 1, \"ts\": \"2000-01-01 00:00:00\" }\n" + - "{ \"id\": 6, \"ts\": \"2000-01-06 00:00:00\" }"); - - createOrReplaceView("source", "id INT, ts TIMESTAMP", - "{ \"id\": 2, \"ts\": \"2001-01-02 00:00:00\" }\n" + - "{ \"id\": 1, \"ts\": \"2001-01-01 00:00:00\" }\n" + - "{ \"id\": 6, \"ts\": \"2001-01-06 00:00:00\" }"); - - sql("MERGE INTO %s AS t USING source AS s " + - "ON t.id == s.id " + - "WHEN MATCHED AND t.id = 1 THEN " + - " UPDATE SET * " + - "WHEN MATCHED AND t.id = 6 THEN " + - " DELETE " + - "WHEN NOT MATCHED AND s.id = 2 THEN " + - " INSERT *", tableName); - - ImmutableList expectedRows = ImmutableList.of( - row(1, "2001-01-01 00:00:00"), // updated - row(2, "2001-01-02 00:00:00") // new - ); - assertEquals("Should have expected rows", + sql( + "ALTER TABLE %s SET TBLPROPERTIES('%s' '%s')", + tableName, WRITE_DISTRIBUTION_MODE, mode.modeName()); + + append( + tableName, + "id INT, ts TIMESTAMP", + "{ \"id\": 1, \"ts\": \"2000-01-01 00:00:00\" }\n" + + "{ \"id\": 6, \"ts\": \"2000-01-06 00:00:00\" }"); + + createOrReplaceView( + "source", + "id INT, ts TIMESTAMP", + "{ \"id\": 2, \"ts\": \"2001-01-02 00:00:00\" }\n" + + "{ \"id\": 1, \"ts\": \"2001-01-01 00:00:00\" }\n" + + "{ 
\"id\": 6, \"ts\": \"2001-01-06 00:00:00\" }"); + + sql( + "MERGE INTO %s AS t USING source AS s " + + "ON t.id == s.id " + + "WHEN MATCHED AND t.id = 1 THEN " + + " UPDATE SET * " + + "WHEN MATCHED AND t.id = 6 THEN " + + " DELETE " + + "WHEN NOT MATCHED AND s.id = 2 THEN " + + " INSERT *", + tableName); + + ImmutableList expectedRows = + ImmutableList.of( + row(1, "2001-01-01 00:00:00"), // updated + row(2, "2001-01-02 00:00:00") // new + ); + assertEquals( + "Should have expected rows", expectedRows, sql("SELECT id, CAST(ts AS STRING) FROM %s ORDER BY id", tableName)); @@ -475,31 +576,41 @@ public void testMergeWithBucketTransform() { for (DistributionMode mode : DistributionMode.values()) { createAndInitTable("id INT, dep STRING"); sql("ALTER TABLE %s ADD PARTITION FIELD bucket(2, dep)", tableName); - sql("ALTER TABLE %s SET TBLPROPERTIES('%s' '%s')", tableName, WRITE_DISTRIBUTION_MODE, mode.modeName()); - - append(tableName, - "{ \"id\": 1, \"dep\": \"emp-id-one\" }\n" + - "{ \"id\": 6, \"dep\": \"emp-id-6\" }"); - - createOrReplaceView("source", "id INT, dep STRING", - "{ \"id\": 2, \"dep\": \"emp-id-2\" }\n" + - "{ \"id\": 1, \"dep\": \"emp-id-1\" }\n" + - "{ \"id\": 6, \"dep\": \"emp-id-6\" }"); - - sql("MERGE INTO %s AS t USING source AS s " + - "ON t.id == s.id " + - "WHEN MATCHED AND t.id = 1 THEN " + - " UPDATE SET * " + - "WHEN MATCHED AND t.id = 6 THEN " + - " DELETE " + - "WHEN NOT MATCHED AND s.id = 2 THEN " + - " INSERT *", tableName); - - ImmutableList expectedRows = ImmutableList.of( - row(1, "emp-id-1"), // updated - row(2, "emp-id-2") // new - ); - assertEquals("Should have expected rows", expectedRows, sql("SELECT * FROM %s ORDER BY id", tableName)); + sql( + "ALTER TABLE %s SET TBLPROPERTIES('%s' '%s')", + tableName, WRITE_DISTRIBUTION_MODE, mode.modeName()); + + append( + tableName, + "{ \"id\": 1, \"dep\": \"emp-id-one\" }\n" + "{ \"id\": 6, \"dep\": \"emp-id-6\" }"); + + createOrReplaceView( + "source", + "id INT, dep STRING", + "{ \"id\": 2, \"dep\": \"emp-id-2\" }\n" + + "{ \"id\": 1, \"dep\": \"emp-id-1\" }\n" + + "{ \"id\": 6, \"dep\": \"emp-id-6\" }"); + + sql( + "MERGE INTO %s AS t USING source AS s " + + "ON t.id == s.id " + + "WHEN MATCHED AND t.id = 1 THEN " + + " UPDATE SET * " + + "WHEN MATCHED AND t.id = 6 THEN " + + " DELETE " + + "WHEN NOT MATCHED AND s.id = 2 THEN " + + " INSERT *", + tableName); + + ImmutableList expectedRows = + ImmutableList.of( + row(1, "emp-id-1"), // updated + row(2, "emp-id-2") // new + ); + assertEquals( + "Should have expected rows", + expectedRows, + sql("SELECT * FROM %s ORDER BY id", tableName)); removeTables(); } @@ -510,31 +621,41 @@ public void testMergeWithTruncateTransform() { for (DistributionMode mode : DistributionMode.values()) { createAndInitTable("id INT, dep STRING"); sql("ALTER TABLE %s ADD PARTITION FIELD truncate(dep, 2)", tableName); - sql("ALTER TABLE %s SET TBLPROPERTIES('%s' '%s')", tableName, WRITE_DISTRIBUTION_MODE, mode.modeName()); - - append(tableName, - "{ \"id\": 1, \"dep\": \"emp-id-one\" }\n" + - "{ \"id\": 6, \"dep\": \"emp-id-6\" }"); - - createOrReplaceView("source", "id INT, dep STRING", - "{ \"id\": 2, \"dep\": \"emp-id-2\" }\n" + - "{ \"id\": 1, \"dep\": \"emp-id-1\" }\n" + - "{ \"id\": 6, \"dep\": \"emp-id-6\" }"); - - sql("MERGE INTO %s AS t USING source AS s " + - "ON t.id == s.id " + - "WHEN MATCHED AND t.id = 1 THEN " + - " UPDATE SET * " + - "WHEN MATCHED AND t.id = 6 THEN " + - " DELETE " + - "WHEN NOT MATCHED AND s.id = 2 THEN " + - " INSERT *", tableName); - - ImmutableList 
expectedRows = ImmutableList.of( - row(1, "emp-id-1"), // updated - row(2, "emp-id-2") // new - ); - assertEquals("Should have expected rows", expectedRows, sql("SELECT * FROM %s ORDER BY id", tableName)); + sql( + "ALTER TABLE %s SET TBLPROPERTIES('%s' '%s')", + tableName, WRITE_DISTRIBUTION_MODE, mode.modeName()); + + append( + tableName, + "{ \"id\": 1, \"dep\": \"emp-id-one\" }\n" + "{ \"id\": 6, \"dep\": \"emp-id-6\" }"); + + createOrReplaceView( + "source", + "id INT, dep STRING", + "{ \"id\": 2, \"dep\": \"emp-id-2\" }\n" + + "{ \"id\": 1, \"dep\": \"emp-id-1\" }\n" + + "{ \"id\": 6, \"dep\": \"emp-id-6\" }"); + + sql( + "MERGE INTO %s AS t USING source AS s " + + "ON t.id == s.id " + + "WHEN MATCHED AND t.id = 1 THEN " + + " UPDATE SET * " + + "WHEN MATCHED AND t.id = 6 THEN " + + " DELETE " + + "WHEN NOT MATCHED AND s.id = 2 THEN " + + " INSERT *", + tableName); + + ImmutableList expectedRows = + ImmutableList.of( + row(1, "emp-id-1"), // updated + row(2, "emp-id-2") // new + ); + assertEquals( + "Should have expected rows", + expectedRows, + sql("SELECT * FROM %s ORDER BY id", tableName)); removeTables(); } @@ -546,31 +667,41 @@ public void testMergeIntoPartitionedAndOrderedTable() { createAndInitTable("id INT, dep STRING"); sql("ALTER TABLE %s ADD PARTITION FIELD dep", tableName); sql("ALTER TABLE %s WRITE ORDERED BY (id)", tableName); - sql("ALTER TABLE %s SET TBLPROPERTIES('%s' '%s')", tableName, WRITE_DISTRIBUTION_MODE, mode.modeName()); - - append(tableName, - "{ \"id\": 1, \"dep\": \"emp-id-one\" }\n" + - "{ \"id\": 6, \"dep\": \"emp-id-6\" }"); - - createOrReplaceView("source", "id INT, dep STRING", - "{ \"id\": 2, \"dep\": \"emp-id-2\" }\n" + - "{ \"id\": 1, \"dep\": \"emp-id-1\" }\n" + - "{ \"id\": 6, \"dep\": \"emp-id-6\" }"); - - sql("MERGE INTO %s AS t USING source AS s " + - "ON t.id == s.id " + - "WHEN MATCHED AND t.id = 1 THEN " + - " UPDATE SET * " + - "WHEN MATCHED AND t.id = 6 THEN " + - " DELETE " + - "WHEN NOT MATCHED AND s.id = 2 THEN " + - " INSERT *", tableName); - - ImmutableList expectedRows = ImmutableList.of( - row(1, "emp-id-1"), // updated - row(2, "emp-id-2") // new - ); - assertEquals("Should have expected rows", expectedRows, sql("SELECT * FROM %s ORDER BY id", tableName)); + sql( + "ALTER TABLE %s SET TBLPROPERTIES('%s' '%s')", + tableName, WRITE_DISTRIBUTION_MODE, mode.modeName()); + + append( + tableName, + "{ \"id\": 1, \"dep\": \"emp-id-one\" }\n" + "{ \"id\": 6, \"dep\": \"emp-id-6\" }"); + + createOrReplaceView( + "source", + "id INT, dep STRING", + "{ \"id\": 2, \"dep\": \"emp-id-2\" }\n" + + "{ \"id\": 1, \"dep\": \"emp-id-1\" }\n" + + "{ \"id\": 6, \"dep\": \"emp-id-6\" }"); + + sql( + "MERGE INTO %s AS t USING source AS s " + + "ON t.id == s.id " + + "WHEN MATCHED AND t.id = 1 THEN " + + " UPDATE SET * " + + "WHEN MATCHED AND t.id = 6 THEN " + + " DELETE " + + "WHEN NOT MATCHED AND s.id = 2 THEN " + + " INSERT *", + tableName); + + ImmutableList expectedRows = + ImmutableList.of( + row(1, "emp-id-1"), // updated + row(2, "emp-id-2") // new + ); + assertEquals( + "Should have expected rows", + expectedRows, + sql("SELECT * FROM %s ORDER BY id", tableName)); removeTables(); } @@ -578,44 +709,50 @@ public void testMergeIntoPartitionedAndOrderedTable() { @Test public void testSelfMerge() { - createAndInitTable("id INT, v STRING", - "{ \"id\": 1, \"v\": \"v1\" }\n" + - "{ \"id\": 2, \"v\": \"v2\" }"); - - sql("MERGE INTO %s t USING %s s " + - "ON t.id == s.id " + - "WHEN MATCHED AND t.id = 1 THEN " + - " UPDATE SET v = 'x' " + - "WHEN NOT 
MATCHED THEN " + - " INSERT *", tableName, tableName); - - ImmutableList expectedRows = ImmutableList.of( - row(1, "x"), // updated - row(2, "v2") // kept - ); - assertEquals("Output should match", expectedRows, sql("SELECT * FROM %s ORDER BY id", tableName)); + createAndInitTable( + "id INT, v STRING", "{ \"id\": 1, \"v\": \"v1\" }\n" + "{ \"id\": 2, \"v\": \"v2\" }"); + + sql( + "MERGE INTO %s t USING %s s " + + "ON t.id == s.id " + + "WHEN MATCHED AND t.id = 1 THEN " + + " UPDATE SET v = 'x' " + + "WHEN NOT MATCHED THEN " + + " INSERT *", + tableName, tableName); + + ImmutableList expectedRows = + ImmutableList.of( + row(1, "x"), // updated + row(2, "v2") // kept + ); + assertEquals( + "Output should match", expectedRows, sql("SELECT * FROM %s ORDER BY id", tableName)); } @Test public void testMergeWithSourceAsSelfSubquery() { - createAndInitTable("id INT, v STRING", - "{ \"id\": 1, \"v\": \"v1\" }\n" + - "{ \"id\": 2, \"v\": \"v2\" }"); + createAndInitTable( + "id INT, v STRING", "{ \"id\": 1, \"v\": \"v1\" }\n" + "{ \"id\": 2, \"v\": \"v2\" }"); createOrReplaceView("source", Arrays.asList(1, null), Encoders.INT()); - sql("MERGE INTO %s t USING (SELECT id AS value FROM %s r JOIN source ON r.id = source.value) s " + - "ON t.id == s.value " + - "WHEN MATCHED AND t.id = 1 THEN " + - " UPDATE SET v = 'x' " + - "WHEN NOT MATCHED THEN " + - " INSERT (v, id) VALUES ('invalid', -1) ", tableName, tableName); - - ImmutableList expectedRows = ImmutableList.of( - row(1, "x"), // updated - row(2, "v2") // kept - ); - assertEquals("Output should match", expectedRows, sql("SELECT * FROM %s ORDER BY id", tableName)); + sql( + "MERGE INTO %s t USING (SELECT id AS value FROM %s r JOIN source ON r.id = source.value) s " + + "ON t.id == s.value " + + "WHEN MATCHED AND t.id = 1 THEN " + + " UPDATE SET v = 'x' " + + "WHEN NOT MATCHED THEN " + + " INSERT (v, id) VALUES ('invalid', -1) ", + tableName, tableName); + + ImmutableList expectedRows = + ImmutableList.of( + row(1, "x"), // updated + row(2, "v2") // kept + ); + assertEquals( + "Output should match", expectedRows, sql("SELECT * FROM %s ORDER BY id", tableName)); } @Test @@ -626,37 +763,46 @@ public synchronized void testMergeWithSerializableIsolation() throws Interrupted createAndInitTable("id INT, dep STRING"); createOrReplaceView("source", Collections.singletonList(1), Encoders.INT()); - sql("ALTER TABLE %s SET TBLPROPERTIES('%s' '%s')", tableName, MERGE_ISOLATION_LEVEL, "serializable"); + sql( + "ALTER TABLE %s SET TBLPROPERTIES('%s' '%s')", + tableName, MERGE_ISOLATION_LEVEL, "serializable"); - ExecutorService executorService = MoreExecutors.getExitingExecutorService( - (ThreadPoolExecutor) Executors.newFixedThreadPool(2)); + ExecutorService executorService = + MoreExecutors.getExitingExecutorService( + (ThreadPoolExecutor) Executors.newFixedThreadPool(2)); AtomicInteger barrier = new AtomicInteger(0); // merge thread - Future mergeFuture = executorService.submit(() -> { - for (int numOperations = 0; numOperations < Integer.MAX_VALUE; numOperations++) { - while (barrier.get() < numOperations * 2) { - sleep(10); - } - sql("MERGE INTO %s t USING source s " + - "ON t.id == s.value " + - "WHEN MATCHED THEN " + - " UPDATE SET dep = 'x'", tableName); - barrier.incrementAndGet(); - } - }); + Future mergeFuture = + executorService.submit( + () -> { + for (int numOperations = 0; numOperations < Integer.MAX_VALUE; numOperations++) { + while (barrier.get() < numOperations * 2) { + sleep(10); + } + sql( + "MERGE INTO %s t USING source s " + + "ON t.id == 
s.value " + + "WHEN MATCHED THEN " + + " UPDATE SET dep = 'x'", + tableName); + barrier.incrementAndGet(); + } + }); // append thread - Future appendFuture = executorService.submit(() -> { - for (int numOperations = 0; numOperations < Integer.MAX_VALUE; numOperations++) { - while (barrier.get() < numOperations * 2) { - sleep(10); - } - sql("INSERT INTO TABLE %s VALUES (1, 'hr')", tableName); - barrier.incrementAndGet(); - } - }); + Future appendFuture = + executorService.submit( + () -> { + for (int numOperations = 0; numOperations < Integer.MAX_VALUE; numOperations++) { + while (barrier.get() < numOperations * 2) { + sleep(10); + } + sql("INSERT INTO TABLE %s VALUES (1, 'hr')", tableName); + barrier.incrementAndGet(); + } + }); try { mergeFuture.get(); @@ -667,7 +813,8 @@ public synchronized void testMergeWithSerializableIsolation() throws Interrupted Throwable validationException = sparkException.getCause(); Assert.assertThat(validationException, CoreMatchers.instanceOf(ValidationException.class)); String errMsg = validationException.getMessage(); - Assert.assertThat(errMsg, CoreMatchers.containsString("Found conflicting files that can contain")); + Assert.assertThat( + errMsg, CoreMatchers.containsString("Found conflicting files that can contain")); } finally { appendFuture.cancel(true); } @@ -677,44 +824,54 @@ public synchronized void testMergeWithSerializableIsolation() throws Interrupted } @Test - public synchronized void testMergeWithSnapshotIsolation() throws InterruptedException, ExecutionException { + public synchronized void testMergeWithSnapshotIsolation() + throws InterruptedException, ExecutionException { // cannot run tests with concurrency for Hadoop tables without atomic renames Assume.assumeFalse(catalogName.equalsIgnoreCase("testhadoop")); createAndInitTable("id INT, dep STRING"); createOrReplaceView("source", Collections.singletonList(1), Encoders.INT()); - sql("ALTER TABLE %s SET TBLPROPERTIES('%s' '%s')", tableName, MERGE_ISOLATION_LEVEL, "snapshot"); + sql( + "ALTER TABLE %s SET TBLPROPERTIES('%s' '%s')", + tableName, MERGE_ISOLATION_LEVEL, "snapshot"); - ExecutorService executorService = MoreExecutors.getExitingExecutorService( - (ThreadPoolExecutor) Executors.newFixedThreadPool(2)); + ExecutorService executorService = + MoreExecutors.getExitingExecutorService( + (ThreadPoolExecutor) Executors.newFixedThreadPool(2)); AtomicInteger barrier = new AtomicInteger(0); // merge thread - Future mergeFuture = executorService.submit(() -> { - for (int numOperations = 0; numOperations < 20; numOperations++) { - while (barrier.get() < numOperations * 2) { - sleep(10); - } - sql("MERGE INTO %s t USING source s " + - "ON t.id == s.value " + - "WHEN MATCHED THEN " + - " UPDATE SET dep = 'x'", tableName); - barrier.incrementAndGet(); - } - }); + Future mergeFuture = + executorService.submit( + () -> { + for (int numOperations = 0; numOperations < 20; numOperations++) { + while (barrier.get() < numOperations * 2) { + sleep(10); + } + sql( + "MERGE INTO %s t USING source s " + + "ON t.id == s.value " + + "WHEN MATCHED THEN " + + " UPDATE SET dep = 'x'", + tableName); + barrier.incrementAndGet(); + } + }); // append thread - Future appendFuture = executorService.submit(() -> { - for (int numOperations = 0; numOperations < 20; numOperations++) { - while (barrier.get() < numOperations * 2) { - sleep(10); - } - sql("INSERT INTO TABLE %s VALUES (1, 'hr')", tableName); - barrier.incrementAndGet(); - } - }); + Future appendFuture = + executorService.submit( + () -> { + for (int 
numOperations = 0; numOperations < 20; numOperations++) { + while (barrier.get() < numOperations * 2) { + sleep(10); + } + sql("INSERT INTO TABLE %s VALUES (1, 'hr')", tableName); + barrier.incrementAndGet(); + } + }); try { mergeFuture.get(); @@ -728,175 +885,195 @@ public synchronized void testMergeWithSnapshotIsolation() throws InterruptedExce @Test public void testMergeWithExtraColumnsInSource() { - createAndInitTable("id INT, v STRING", - "{ \"id\": 1, \"v\": \"v1\" }\n" + - "{ \"id\": 2, \"v\": \"v2\" }"); - createOrReplaceView("source", - "{ \"id\": 1, \"extra_col\": -1, \"v\": \"v1_1\" }\n" + - "{ \"id\": 3, \"extra_col\": -1, \"v\": \"v3\" }\n" + - "{ \"id\": 4, \"extra_col\": -1, \"v\": \"v4\" }"); - - sql("MERGE INTO %s t USING source " + - "ON t.id == source.id " + - "WHEN MATCHED THEN " + - " UPDATE SET v = source.v " + - "WHEN NOT MATCHED THEN " + - " INSERT (v, id) VALUES (source.v, source.id)", tableName); - - ImmutableList expectedRows = ImmutableList.of( - row(1, "v1_1"), // new - row(2, "v2"), // kept - row(3, "v3"), // new - row(4, "v4") // new - ); - assertEquals("Output should match", expectedRows, sql("SELECT * FROM %s ORDER BY id", tableName)); + createAndInitTable( + "id INT, v STRING", "{ \"id\": 1, \"v\": \"v1\" }\n" + "{ \"id\": 2, \"v\": \"v2\" }"); + createOrReplaceView( + "source", + "{ \"id\": 1, \"extra_col\": -1, \"v\": \"v1_1\" }\n" + + "{ \"id\": 3, \"extra_col\": -1, \"v\": \"v3\" }\n" + + "{ \"id\": 4, \"extra_col\": -1, \"v\": \"v4\" }"); + + sql( + "MERGE INTO %s t USING source " + + "ON t.id == source.id " + + "WHEN MATCHED THEN " + + " UPDATE SET v = source.v " + + "WHEN NOT MATCHED THEN " + + " INSERT (v, id) VALUES (source.v, source.id)", + tableName); + + ImmutableList expectedRows = + ImmutableList.of( + row(1, "v1_1"), // new + row(2, "v2"), // kept + row(3, "v3"), // new + row(4, "v4") // new + ); + assertEquals( + "Output should match", expectedRows, sql("SELECT * FROM %s ORDER BY id", tableName)); } @Test public void testMergeWithNullsInTargetAndSource() { - createAndInitTable("id INT, v STRING", - "{ \"id\": null, \"v\": \"v1\" }\n" + - "{ \"id\": 2, \"v\": \"v2\" }"); - - createOrReplaceView("source", - "{ \"id\": null, \"v\": \"v1_1\" }\n" + - "{ \"id\": 4, \"v\": \"v4\" }"); - - sql("MERGE INTO %s t USING source " + - "ON t.id == source.id " + - "WHEN MATCHED THEN " + - " UPDATE SET v = source.v " + - "WHEN NOT MATCHED THEN " + - " INSERT (v, id) VALUES (source.v, source.id)", tableName); - - ImmutableList expectedRows = ImmutableList.of( - row(null, "v1"), // kept - row(null, "v1_1"), // new - row(2, "v2"), // kept - row(4, "v4") // new - ); - assertEquals("Output should match", expectedRows, sql("SELECT * FROM %s ORDER BY v", tableName)); + createAndInitTable( + "id INT, v STRING", "{ \"id\": null, \"v\": \"v1\" }\n" + "{ \"id\": 2, \"v\": \"v2\" }"); + + createOrReplaceView( + "source", "{ \"id\": null, \"v\": \"v1_1\" }\n" + "{ \"id\": 4, \"v\": \"v4\" }"); + + sql( + "MERGE INTO %s t USING source " + + "ON t.id == source.id " + + "WHEN MATCHED THEN " + + " UPDATE SET v = source.v " + + "WHEN NOT MATCHED THEN " + + " INSERT (v, id) VALUES (source.v, source.id)", + tableName); + + ImmutableList expectedRows = + ImmutableList.of( + row(null, "v1"), // kept + row(null, "v1_1"), // new + row(2, "v2"), // kept + row(4, "v4") // new + ); + assertEquals( + "Output should match", expectedRows, sql("SELECT * FROM %s ORDER BY v", tableName)); } @Test public void testMergeWithNullSafeEquals() { - createAndInitTable("id INT, v STRING", - "{ 
\"id\": null, \"v\": \"v1\" }\n" + - "{ \"id\": 2, \"v\": \"v2\" }"); - - createOrReplaceView("source", - "{ \"id\": null, \"v\": \"v1_1\" }\n" + - "{ \"id\": 4, \"v\": \"v4\" }"); - - sql("MERGE INTO %s t USING source " + - "ON t.id <=> source.id " + - "WHEN MATCHED THEN " + - " UPDATE SET v = source.v " + - "WHEN NOT MATCHED THEN " + - " INSERT (v, id) VALUES (source.v, source.id)", tableName); - - ImmutableList expectedRows = ImmutableList.of( - row(null, "v1_1"), // updated - row(2, "v2"), // kept - row(4, "v4") // new - ); - assertEquals("Output should match", expectedRows, sql("SELECT * FROM %s ORDER BY v", tableName)); + createAndInitTable( + "id INT, v STRING", "{ \"id\": null, \"v\": \"v1\" }\n" + "{ \"id\": 2, \"v\": \"v2\" }"); + + createOrReplaceView( + "source", "{ \"id\": null, \"v\": \"v1_1\" }\n" + "{ \"id\": 4, \"v\": \"v4\" }"); + + sql( + "MERGE INTO %s t USING source " + + "ON t.id <=> source.id " + + "WHEN MATCHED THEN " + + " UPDATE SET v = source.v " + + "WHEN NOT MATCHED THEN " + + " INSERT (v, id) VALUES (source.v, source.id)", + tableName); + + ImmutableList expectedRows = + ImmutableList.of( + row(null, "v1_1"), // updated + row(2, "v2"), // kept + row(4, "v4") // new + ); + assertEquals( + "Output should match", expectedRows, sql("SELECT * FROM %s ORDER BY v", tableName)); } @Test public void testMergeWithNullCondition() { - createAndInitTable("id INT, v STRING", - "{ \"id\": null, \"v\": \"v1\" }\n" + - "{ \"id\": 2, \"v\": \"v2\" }"); - - createOrReplaceView("source", - "{ \"id\": null, \"v\": \"v1_1\" }\n" + - "{ \"id\": 2, \"v\": \"v2_2\" }"); - - sql("MERGE INTO %s t USING source " + - "ON t.id == source.id AND NULL " + - "WHEN MATCHED THEN " + - " UPDATE SET v = source.v " + - "WHEN NOT MATCHED THEN " + - " INSERT (v, id) VALUES (source.v, source.id)", tableName); - - ImmutableList expectedRows = ImmutableList.of( - row(null, "v1"), // kept - row(null, "v1_1"), // new - row(2, "v2"), // kept - row(2, "v2_2") // new - ); - assertEquals("Output should match", expectedRows, sql("SELECT * FROM %s ORDER BY v", tableName)); + createAndInitTable( + "id INT, v STRING", "{ \"id\": null, \"v\": \"v1\" }\n" + "{ \"id\": 2, \"v\": \"v2\" }"); + + createOrReplaceView( + "source", "{ \"id\": null, \"v\": \"v1_1\" }\n" + "{ \"id\": 2, \"v\": \"v2_2\" }"); + + sql( + "MERGE INTO %s t USING source " + + "ON t.id == source.id AND NULL " + + "WHEN MATCHED THEN " + + " UPDATE SET v = source.v " + + "WHEN NOT MATCHED THEN " + + " INSERT (v, id) VALUES (source.v, source.id)", + tableName); + + ImmutableList expectedRows = + ImmutableList.of( + row(null, "v1"), // kept + row(null, "v1_1"), // new + row(2, "v2"), // kept + row(2, "v2_2") // new + ); + assertEquals( + "Output should match", expectedRows, sql("SELECT * FROM %s ORDER BY v", tableName)); } @Test public void testMergeWithNullActionConditions() { - createAndInitTable("id INT, v STRING", - "{ \"id\": 1, \"v\": \"v1\" }\n" + - "{ \"id\": 2, \"v\": \"v2\" }"); + createAndInitTable( + "id INT, v STRING", "{ \"id\": 1, \"v\": \"v1\" }\n" + "{ \"id\": 2, \"v\": \"v2\" }"); - createOrReplaceView("source", - "{ \"id\": 1, \"v\": \"v1_1\" }\n" + - "{ \"id\": 2, \"v\": \"v2_2\" }\n" + - "{ \"id\": 3, \"v\": \"v3_3\" }"); + createOrReplaceView( + "source", + "{ \"id\": 1, \"v\": \"v1_1\" }\n" + + "{ \"id\": 2, \"v\": \"v2_2\" }\n" + + "{ \"id\": 3, \"v\": \"v3_3\" }"); // all conditions are NULL and will never match any rows - sql("MERGE INTO %s t USING source " + - "ON t.id == source.id " + - "WHEN MATCHED AND source.id = 1 AND 
NULL THEN " + - " UPDATE SET v = source.v " + - "WHEN MATCHED AND source.v = 'v1_1' AND NULL THEN " + - " DELETE " + - "WHEN NOT MATCHED AND source.id = 3 AND NULL THEN " + - " INSERT (v, id) VALUES (source.v, source.id)", tableName); - - ImmutableList expectedRows1 = ImmutableList.of( - row(1, "v1"), // kept - row(2, "v2") // kept - ); - assertEquals("Output should match", expectedRows1, sql("SELECT * FROM %s ORDER BY v", tableName)); + sql( + "MERGE INTO %s t USING source " + + "ON t.id == source.id " + + "WHEN MATCHED AND source.id = 1 AND NULL THEN " + + " UPDATE SET v = source.v " + + "WHEN MATCHED AND source.v = 'v1_1' AND NULL THEN " + + " DELETE " + + "WHEN NOT MATCHED AND source.id = 3 AND NULL THEN " + + " INSERT (v, id) VALUES (source.v, source.id)", + tableName); + + ImmutableList expectedRows1 = + ImmutableList.of( + row(1, "v1"), // kept + row(2, "v2") // kept + ); + assertEquals( + "Output should match", expectedRows1, sql("SELECT * FROM %s ORDER BY v", tableName)); // only the update and insert conditions are NULL - sql("MERGE INTO %s t USING source " + - "ON t.id == source.id " + - "WHEN MATCHED AND source.id = 1 AND NULL THEN " + - " UPDATE SET v = source.v " + - "WHEN MATCHED AND source.v = 'v1_1' THEN " + - " DELETE " + - "WHEN NOT MATCHED AND source.id = 3 AND NULL THEN " + - " INSERT (v, id) VALUES (source.v, source.id)", tableName); - - ImmutableList expectedRows2 = ImmutableList.of( - row(2, "v2") // kept - ); - assertEquals("Output should match", expectedRows2, sql("SELECT * FROM %s ORDER BY v", tableName)); + sql( + "MERGE INTO %s t USING source " + + "ON t.id == source.id " + + "WHEN MATCHED AND source.id = 1 AND NULL THEN " + + " UPDATE SET v = source.v " + + "WHEN MATCHED AND source.v = 'v1_1' THEN " + + " DELETE " + + "WHEN NOT MATCHED AND source.id = 3 AND NULL THEN " + + " INSERT (v, id) VALUES (source.v, source.id)", + tableName); + + ImmutableList expectedRows2 = + ImmutableList.of( + row(2, "v2") // kept + ); + assertEquals( + "Output should match", expectedRows2, sql("SELECT * FROM %s ORDER BY v", tableName)); } @Test public void testMergeWithMultipleMatchingActions() { - createAndInitTable("id INT, v STRING", - "{ \"id\": 1, \"v\": \"v1\" }\n" + - "{ \"id\": 2, \"v\": \"v2\" }"); + createAndInitTable( + "id INT, v STRING", "{ \"id\": 1, \"v\": \"v1\" }\n" + "{ \"id\": 2, \"v\": \"v2\" }"); - createOrReplaceView("source", - "{ \"id\": 1, \"v\": \"v1_1\" }\n" + - "{ \"id\": 2, \"v\": \"v2_2\" }"); + createOrReplaceView( + "source", "{ \"id\": 1, \"v\": \"v1_1\" }\n" + "{ \"id\": 2, \"v\": \"v2_2\" }"); // the order of match actions is important in this case - sql("MERGE INTO %s t USING source " + - "ON t.id == source.id " + - "WHEN MATCHED AND source.id = 1 THEN " + - " UPDATE SET v = source.v " + - "WHEN MATCHED AND source.v = 'v1_1' THEN " + - " DELETE " + - "WHEN NOT MATCHED THEN " + - " INSERT (v, id) VALUES (source.v, source.id)", tableName); - - ImmutableList expectedRows = ImmutableList.of( - row(1, "v1_1"), // updated (also matches the delete cond but update is first) - row(2, "v2") // kept (matches neither the update nor the delete cond) - ); - assertEquals("Output should match", expectedRows, sql("SELECT * FROM %s ORDER BY v", tableName)); + sql( + "MERGE INTO %s t USING source " + + "ON t.id == source.id " + + "WHEN MATCHED AND source.id = 1 THEN " + + " UPDATE SET v = source.v " + + "WHEN MATCHED AND source.v = 'v1_1' THEN " + + " DELETE " + + "WHEN NOT MATCHED THEN " + + " INSERT (v, id) VALUES (source.v, source.id)", + tableName); + + 
ImmutableList expectedRows = + ImmutableList.of( + row(1, "v1_1"), // updated (also matches the delete cond but update is first) + row(2, "v2") // kept (matches neither the update nor the delete cond) + ); + assertEquals( + "Output should match", expectedRows, sql("SELECT * FROM %s ORDER BY v", tableName)); } @Test @@ -906,7 +1083,9 @@ public void testMergeWithMultipleRowGroupsParquet() throws NoSuchTableException createAndInitTable("id INT, dep STRING"); sql("ALTER TABLE %s ADD PARTITION FIELD dep", tableName); - sql("ALTER TABLE %s SET TBLPROPERTIES('%s' '%d')", tableName, PARQUET_ROW_GROUP_SIZE_BYTES, 100); + sql( + "ALTER TABLE %s SET TBLPROPERTIES('%s' '%d')", + tableName, PARQUET_ROW_GROUP_SIZE_BYTES, 100); sql("ALTER TABLE %s SET TBLPROPERTIES('%s' '%d')", tableName, SPLIT_SIZE, 100); createOrReplaceView("source", Collections.singletonList(1), Encoders.INT()); @@ -915,122 +1094,150 @@ public void testMergeWithMultipleRowGroupsParquet() throws NoSuchTableException for (int id = 1; id <= 200; id++) { ids.add(id); } - Dataset df = spark.createDataset(ids, Encoders.INT()) - .withColumnRenamed("value", "id") - .withColumn("dep", lit("hr")); + Dataset df = + spark + .createDataset(ids, Encoders.INT()) + .withColumnRenamed("value", "id") + .withColumn("dep", lit("hr")); df.coalesce(1).writeTo(tableName).append(); Assert.assertEquals(200, spark.table(tableName).count()); // update a record from one of two row groups and copy over the second one - sql("MERGE INTO %s t USING source " + - "ON t.id == source.value " + - "WHEN MATCHED THEN " + - " UPDATE SET dep = 'x'", tableName); + sql( + "MERGE INTO %s t USING source " + + "ON t.id == source.value " + + "WHEN MATCHED THEN " + + " UPDATE SET dep = 'x'", + tableName); Assert.assertEquals(200, spark.table(tableName).count()); } @Test public void testMergeInsertOnly() { - createAndInitTable("id STRING, v STRING", - "{ \"id\": \"a\", \"v\": \"v1\" }\n" + - "{ \"id\": \"b\", \"v\": \"v2\" }"); - createOrReplaceView("source", - "{ \"id\": \"a\", \"v\": \"v1_1\" }\n" + - "{ \"id\": \"a\", \"v\": \"v1_2\" }\n" + - "{ \"id\": \"c\", \"v\": \"v3\" }\n" + - "{ \"id\": \"d\", \"v\": \"v4_1\" }\n" + - "{ \"id\": \"d\", \"v\": \"v4_2\" }"); - - sql("MERGE INTO %s t USING source " + - "ON t.id == source.id " + - "WHEN NOT MATCHED THEN " + - " INSERT *", tableName); - - ImmutableList expectedRows = ImmutableList.of( - row("a", "v1"), // kept - row("b", "v2"), // kept - row("c", "v3"), // new - row("d", "v4_1"), // new - row("d", "v4_2") // new - ); - assertEquals("Output should match", expectedRows, sql("SELECT * FROM %s ORDER BY id", tableName)); + createAndInitTable( + "id STRING, v STRING", + "{ \"id\": \"a\", \"v\": \"v1\" }\n" + "{ \"id\": \"b\", \"v\": \"v2\" }"); + createOrReplaceView( + "source", + "{ \"id\": \"a\", \"v\": \"v1_1\" }\n" + + "{ \"id\": \"a\", \"v\": \"v1_2\" }\n" + + "{ \"id\": \"c\", \"v\": \"v3\" }\n" + + "{ \"id\": \"d\", \"v\": \"v4_1\" }\n" + + "{ \"id\": \"d\", \"v\": \"v4_2\" }"); + + sql( + "MERGE INTO %s t USING source " + + "ON t.id == source.id " + + "WHEN NOT MATCHED THEN " + + " INSERT *", + tableName); + + ImmutableList expectedRows = + ImmutableList.of( + row("a", "v1"), // kept + row("b", "v2"), // kept + row("c", "v3"), // new + row("d", "v4_1"), // new + row("d", "v4_2") // new + ); + assertEquals( + "Output should match", expectedRows, sql("SELECT * FROM %s ORDER BY id", tableName)); } @Test public void testMergeInsertOnlyWithCondition() { createAndInitTable("id INTEGER, v INTEGER", "{ \"id\": 1, \"v\": 1 }"); - 
createOrReplaceView("source", - "{ \"id\": 1, \"v\": 11, \"is_new\": true }\n" + - "{ \"id\": 2, \"v\": 21, \"is_new\": true }\n" + - "{ \"id\": 2, \"v\": 22, \"is_new\": false }"); + createOrReplaceView( + "source", + "{ \"id\": 1, \"v\": 11, \"is_new\": true }\n" + + "{ \"id\": 2, \"v\": 21, \"is_new\": true }\n" + + "{ \"id\": 2, \"v\": 22, \"is_new\": false }"); // validate assignments are reordered to match the table attrs - sql("MERGE INTO %s t USING source s " + - "ON t.id == s.id " + - "WHEN NOT MATCHED AND is_new = TRUE THEN " + - " INSERT (v, id) VALUES (s.v + 100, s.id)", tableName); - - ImmutableList expectedRows = ImmutableList.of( - row(1, 1), // kept - row(2, 121) // new - ); - assertEquals("Output should match", expectedRows, sql("SELECT * FROM %s ORDER BY id", tableName)); + sql( + "MERGE INTO %s t USING source s " + + "ON t.id == s.id " + + "WHEN NOT MATCHED AND is_new = TRUE THEN " + + " INSERT (v, id) VALUES (s.v + 100, s.id)", + tableName); + + ImmutableList expectedRows = + ImmutableList.of( + row(1, 1), // kept + row(2, 121) // new + ); + assertEquals( + "Output should match", expectedRows, sql("SELECT * FROM %s ORDER BY id", tableName)); } @Test public void testMergeAlignsUpdateAndInsertActions() { createAndInitTable("id INT, a INT, b STRING", "{ \"id\": 1, \"a\": 2, \"b\": \"str\" }"); - createOrReplaceView("source", - "{ \"id\": 1, \"c1\": -2, \"c2\": \"new_str_1\" }\n" + - "{ \"id\": 2, \"c1\": -20, \"c2\": \"new_str_2\" }"); - - sql("MERGE INTO %s t USING source " + - "ON t.id == source.id " + - "WHEN MATCHED THEN " + - " UPDATE SET b = c2, a = c1, t.id = source.id " + - "WHEN NOT MATCHED THEN " + - " INSERT (b, a, id) VALUES (c2, c1, id)", tableName); - - assertEquals("Output should match", + createOrReplaceView( + "source", + "{ \"id\": 1, \"c1\": -2, \"c2\": \"new_str_1\" }\n" + + "{ \"id\": 2, \"c1\": -20, \"c2\": \"new_str_2\" }"); + + sql( + "MERGE INTO %s t USING source " + + "ON t.id == source.id " + + "WHEN MATCHED THEN " + + " UPDATE SET b = c2, a = c1, t.id = source.id " + + "WHEN NOT MATCHED THEN " + + " INSERT (b, a, id) VALUES (c2, c1, id)", + tableName); + + assertEquals( + "Output should match", ImmutableList.of(row(1, -2, "new_str_1"), row(2, -20, "new_str_2")), sql("SELECT * FROM %s ORDER BY id", tableName)); } @Test public void testMergeUpdatesNestedStructFields() { - createAndInitTable("id INT, s STRUCT,m:MAP>>", + createAndInitTable( + "id INT, s STRUCT,m:MAP>>", "{ \"id\": 1, \"s\": { \"c1\": 2, \"c2\": { \"a\": [1,2], \"m\": { \"a\": \"b\"} } } } }"); createOrReplaceView("source", "{ \"id\": 1, \"c1\": -2 }"); // update primitive, array, map columns inside a struct - sql("MERGE INTO %s t USING source " + - "ON t.id == source.id " + - "WHEN MATCHED THEN " + - " UPDATE SET t.s.c1 = source.c1, t.s.c2.a = array(-1, -2), t.s.c2.m = map('k', 'v')", tableName); + sql( + "MERGE INTO %s t USING source " + + "ON t.id == source.id " + + "WHEN MATCHED THEN " + + " UPDATE SET t.s.c1 = source.c1, t.s.c2.a = array(-1, -2), t.s.c2.m = map('k', 'v')", + tableName); - assertEquals("Output should match", + assertEquals( + "Output should match", ImmutableList.of(row(1, row(-2, row(ImmutableList.of(-1, -2), ImmutableMap.of("k", "v"))))), sql("SELECT * FROM %s ORDER BY id", tableName)); // set primitive, array, map columns to NULL (proper casts should be in place) - sql("MERGE INTO %s t USING source " + - "ON t.id == source.id " + - "WHEN MATCHED THEN " + - " UPDATE SET t.s.c1 = NULL, t.s.c2 = NULL", tableName); + sql( + "MERGE INTO %s t USING source " + + "ON 
t.id == source.id " + + "WHEN MATCHED THEN " + + " UPDATE SET t.s.c1 = NULL, t.s.c2 = NULL", + tableName); - assertEquals("Output should match", + assertEquals( + "Output should match", ImmutableList.of(row(1, row(null, null))), sql("SELECT * FROM %s ORDER BY id", tableName)); // update all fields in a struct - sql("MERGE INTO %s t USING source " + - "ON t.id == source.id " + - "WHEN MATCHED THEN " + - " UPDATE SET t.s = named_struct('c1', 100, 'c2', named_struct('a', array(1), 'm', map('x', 'y')))", tableName); + sql( + "MERGE INTO %s t USING source " + + "ON t.id == source.id " + + "WHEN MATCHED THEN " + + " UPDATE SET t.s = named_struct('c1', 100, 'c2', named_struct('a', array(1), 'm', map('x', 'y')))", + tableName); - assertEquals("Output should match", + assertEquals( + "Output should match", ImmutableList.of(row(1, row(100, row(ImmutableList.of(1), ImmutableMap.of("x", "y"))))), sql("SELECT * FROM %s ORDER BY id", tableName)); } @@ -1041,12 +1248,15 @@ public void testMergeWithInferredCasts() { createOrReplaceView("source", "{ \"id\": 1, \"c1\": -2}"); // -2 in source should be casted to "-2" in target - sql("MERGE INTO %s t USING source " + - "ON t.id == source.id " + - "WHEN MATCHED THEN " + - " UPDATE SET t.s = source.c1", tableName); + sql( + "MERGE INTO %s t USING source " + + "ON t.id == source.id " + + "WHEN MATCHED THEN " + + " UPDATE SET t.s = source.c1", + tableName); - assertEquals("Output should match", + assertEquals( + "Output should match", ImmutableList.of(row(1, "-2")), sql("SELECT * FROM %s ORDER BY id", tableName)); } @@ -1056,12 +1266,15 @@ public void testMergeModifiesNullStruct() { createAndInitTable("id INT, s STRUCT", "{ \"id\": 1, \"s\": null }"); createOrReplaceView("source", "{ \"id\": 1, \"n1\": -10 }"); - sql("MERGE INTO %s t USING source s " + - "ON t.id == s.id " + - "WHEN MATCHED THEN " + - " UPDATE SET t.s.n1 = s.n1", tableName); + sql( + "MERGE INTO %s t USING source s " + + "ON t.id == s.id " + + "WHEN MATCHED THEN " + + " UPDATE SET t.s.n1 = s.n1", + tableName); - assertEquals("Output should match", + assertEquals( + "Output should match", ImmutableList.of(row(1, row(-10, null))), sql("SELECT * FROM %s", tableName)); } @@ -1076,18 +1289,18 @@ public void testMergeRefreshesRelationCache() { spark.sql("CACHE TABLE tmp"); - assertEquals("View should have correct data", - ImmutableList.of(row("n1")), - sql("SELECT * FROM tmp")); + assertEquals( + "View should have correct data", ImmutableList.of(row("n1")), sql("SELECT * FROM tmp")); - sql("MERGE INTO %s t USING source s " + - "ON t.id == s.id " + - "WHEN MATCHED THEN " + - " UPDATE SET t.name = s.name", tableName); + sql( + "MERGE INTO %s t USING source s " + + "ON t.id == s.id " + + "WHEN MATCHED THEN " + + " UPDATE SET t.name = s.name", + tableName); - assertEquals("View should have correct data", - ImmutableList.of(row("n2")), - sql("SELECT * FROM tmp")); + assertEquals( + "View should have correct data", ImmutableList.of(row("n2")), sql("SELECT * FROM tmp")); spark.sql("UNCACHE TABLE tmp"); } @@ -1097,33 +1310,45 @@ public void testMergeWithNonExistingColumns() { createAndInitTable("id INT, c STRUCT>"); createOrReplaceView("source", "{ \"c1\": -100, \"c2\": -200 }"); - AssertHelpers.assertThrows("Should complain about the invalid top-level column", - AnalysisException.class, "cannot resolve '`t.invalid_col`'", + AssertHelpers.assertThrows( + "Should complain about the invalid top-level column", + AnalysisException.class, + "cannot resolve '`t.invalid_col`'", () -> { - sql("MERGE INTO %s t USING 
source s " + - "ON t.id == s.c1 " + - "WHEN MATCHED THEN " + - " UPDATE SET t.invalid_col = s.c2", tableName); + sql( + "MERGE INTO %s t USING source s " + + "ON t.id == s.c1 " + + "WHEN MATCHED THEN " + + " UPDATE SET t.invalid_col = s.c2", + tableName); }); - AssertHelpers.assertThrows("Should complain about the invalid nested column", - AnalysisException.class, "No such struct field invalid_col", + AssertHelpers.assertThrows( + "Should complain about the invalid nested column", + AnalysisException.class, + "No such struct field invalid_col", () -> { - sql("MERGE INTO %s t USING source s " + - "ON t.id == s.c1 " + - "WHEN MATCHED THEN " + - " UPDATE SET t.c.n2.invalid_col = s.c2", tableName); + sql( + "MERGE INTO %s t USING source s " + + "ON t.id == s.c1 " + + "WHEN MATCHED THEN " + + " UPDATE SET t.c.n2.invalid_col = s.c2", + tableName); }); - AssertHelpers.assertThrows("Should complain about the invalid top-level column", - AnalysisException.class, "cannot resolve '`invalid_col`'", + AssertHelpers.assertThrows( + "Should complain about the invalid top-level column", + AnalysisException.class, + "cannot resolve '`invalid_col`'", () -> { - sql("MERGE INTO %s t USING source s " + - "ON t.id == s.c1 " + - "WHEN MATCHED THEN " + - " UPDATE SET t.c.n2.dn1 = s.c2 " + - "WHEN NOT MATCHED THEN " + - " INSERT (id, invalid_col) VALUES (s.c1, null)", tableName); + sql( + "MERGE INTO %s t USING source s " + + "ON t.id == s.c1 " + + "WHEN MATCHED THEN " + + " UPDATE SET t.c.n2.dn1 = s.c2 " + + "WHEN NOT MATCHED THEN " + + " INSERT (id, invalid_col) VALUES (s.c1, null)", + tableName); }); } @@ -1132,35 +1357,47 @@ public void testMergeWithInvalidColumnsInInsert() { createAndInitTable("id INT, c STRUCT>"); createOrReplaceView("source", "{ \"c1\": -100, \"c2\": -200 }"); - AssertHelpers.assertThrows("Should complain about the nested column", - AnalysisException.class, "Nested fields are not supported inside INSERT clauses", + AssertHelpers.assertThrows( + "Should complain about the nested column", + AnalysisException.class, + "Nested fields are not supported inside INSERT clauses", () -> { - sql("MERGE INTO %s t USING source s " + - "ON t.id == s.c1 " + - "WHEN MATCHED THEN " + - " UPDATE SET t.c.n2.dn1 = s.c2 " + - "WHEN NOT MATCHED THEN " + - " INSERT (id, c.n2) VALUES (s.c1, null)", tableName); + sql( + "MERGE INTO %s t USING source s " + + "ON t.id == s.c1 " + + "WHEN MATCHED THEN " + + " UPDATE SET t.c.n2.dn1 = s.c2 " + + "WHEN NOT MATCHED THEN " + + " INSERT (id, c.n2) VALUES (s.c1, null)", + tableName); }); - AssertHelpers.assertThrows("Should complain about duplicate columns", - AnalysisException.class, "Duplicate column names inside INSERT clause", + AssertHelpers.assertThrows( + "Should complain about duplicate columns", + AnalysisException.class, + "Duplicate column names inside INSERT clause", () -> { - sql("MERGE INTO %s t USING source s " + - "ON t.id == s.c1 " + - "WHEN MATCHED THEN " + - " UPDATE SET t.c.n2.dn1 = s.c2 " + - "WHEN NOT MATCHED THEN " + - " INSERT (id, id) VALUES (s.c1, null)", tableName); + sql( + "MERGE INTO %s t USING source s " + + "ON t.id == s.c1 " + + "WHEN MATCHED THEN " + + " UPDATE SET t.c.n2.dn1 = s.c2 " + + "WHEN NOT MATCHED THEN " + + " INSERT (id, id) VALUES (s.c1, null)", + tableName); }); - AssertHelpers.assertThrows("Should complain about missing columns", - AnalysisException.class, "must provide values for all columns of the target table", + AssertHelpers.assertThrows( + "Should complain about missing columns", + AnalysisException.class, + "must provide 
values for all columns of the target table", () -> { - sql("MERGE INTO %s t USING source s " + - "ON t.id == s.c1 " + - "WHEN NOT MATCHED THEN " + - " INSERT (id) VALUES (s.c1)", tableName); + sql( + "MERGE INTO %s t USING source s " + + "ON t.id == s.c1 " + + "WHEN NOT MATCHED THEN " + + " INSERT (id) VALUES (s.c1)", + tableName); }); } @@ -1169,22 +1406,30 @@ public void testMergeWithInvalidUpdates() { createAndInitTable("id INT, a ARRAY>, m MAP"); createOrReplaceView("source", "{ \"c1\": -100, \"c2\": -200 }"); - AssertHelpers.assertThrows("Should complain about updating an array column", - AnalysisException.class, "Updating nested fields is only supported for structs", + AssertHelpers.assertThrows( + "Should complain about updating an array column", + AnalysisException.class, + "Updating nested fields is only supported for structs", () -> { - sql("MERGE INTO %s t USING source s " + - "ON t.id == s.c1 " + - "WHEN MATCHED THEN " + - " UPDATE SET t.a.c1 = s.c2", tableName); + sql( + "MERGE INTO %s t USING source s " + + "ON t.id == s.c1 " + + "WHEN MATCHED THEN " + + " UPDATE SET t.a.c1 = s.c2", + tableName); }); - AssertHelpers.assertThrows("Should complain about updating a map column", - AnalysisException.class, "Updating nested fields is only supported for structs", + AssertHelpers.assertThrows( + "Should complain about updating a map column", + AnalysisException.class, + "Updating nested fields is only supported for structs", () -> { - sql("MERGE INTO %s t USING source s " + - "ON t.id == s.c1 " + - "WHEN MATCHED THEN " + - " UPDATE SET t.m.key = 'new_key'", tableName); + sql( + "MERGE INTO %s t USING source s " + + "ON t.id == s.c1 " + + "WHEN MATCHED THEN " + + " UPDATE SET t.m.key = 'new_key'", + tableName); }); } @@ -1193,90 +1438,124 @@ public void testMergeWithConflictingUpdates() { createAndInitTable("id INT, c STRUCT>"); createOrReplaceView("source", "{ \"c1\": -100, \"c2\": -200 }"); - AssertHelpers.assertThrows("Should complain about conflicting updates to a top-level column", - AnalysisException.class, "Updates are in conflict", + AssertHelpers.assertThrows( + "Should complain about conflicting updates to a top-level column", + AnalysisException.class, + "Updates are in conflict", () -> { - sql("MERGE INTO %s t USING source s " + - "ON t.id == s.c1 " + - "WHEN MATCHED THEN " + - " UPDATE SET t.id = 1, t.c.n1 = 2, t.id = 2", tableName); + sql( + "MERGE INTO %s t USING source s " + + "ON t.id == s.c1 " + + "WHEN MATCHED THEN " + + " UPDATE SET t.id = 1, t.c.n1 = 2, t.id = 2", + tableName); }); - AssertHelpers.assertThrows("Should complain about conflicting updates to a nested column", - AnalysisException.class, "Updates are in conflict for these columns", + AssertHelpers.assertThrows( + "Should complain about conflicting updates to a nested column", + AnalysisException.class, + "Updates are in conflict for these columns", () -> { - sql("MERGE INTO %s t USING source s " + - "ON t.id == s.c1 " + - "WHEN MATCHED THEN " + - " UPDATE SET t.c.n1 = 1, t.id = 2, t.c.n1 = 2", tableName); + sql( + "MERGE INTO %s t USING source s " + + "ON t.id == s.c1 " + + "WHEN MATCHED THEN " + + " UPDATE SET t.c.n1 = 1, t.id = 2, t.c.n1 = 2", + tableName); }); - AssertHelpers.assertThrows("Should complain about conflicting updates to a nested column", - AnalysisException.class, "Updates are in conflict", + AssertHelpers.assertThrows( + "Should complain about conflicting updates to a nested column", + AnalysisException.class, + "Updates are in conflict", () -> { - sql("MERGE INTO %s t USING source s 
" + - "ON t.id == s.c1 " + - "WHEN MATCHED THEN " + - " UPDATE SET c.n1 = 1, c = named_struct('n1', 1, 'n2', named_struct('dn1', 1, 'dn2', 2))", tableName); + sql( + "MERGE INTO %s t USING source s " + + "ON t.id == s.c1 " + + "WHEN MATCHED THEN " + + " UPDATE SET c.n1 = 1, c = named_struct('n1', 1, 'n2', named_struct('dn1', 1, 'dn2', 2))", + tableName); }); } @Test public void testMergeWithInvalidAssignments() { - createAndInitTable("id INT NOT NULL, s STRUCT> NOT NULL"); + createAndInitTable( + "id INT NOT NULL, s STRUCT> NOT NULL"); createOrReplaceView( "source", "c1 INT, c2 STRUCT NOT NULL, c3 STRING NOT NULL, c4 STRUCT", "{ \"c1\": -100, \"c2\": { \"n1\" : 1 }, \"c3\" : 'str', \"c4\": { \"dn2\": 1, \"dn2\": 2 } }"); - for (String policy : new String[]{"ansi", "strict"}) { - withSQLConf(ImmutableMap.of("spark.sql.storeAssignmentPolicy", policy), () -> { - - AssertHelpers.assertThrows("Should complain about writing nulls to a top-level column", - AnalysisException.class, "Cannot write nullable values to non-null column", - () -> { - sql("MERGE INTO %s t USING source s " + - "ON t.id == s.c1 " + - "WHEN MATCHED THEN " + - " UPDATE SET t.id = NULL", tableName); - }); - - AssertHelpers.assertThrows("Should complain about writing nulls to a nested column", - AnalysisException.class, "Cannot write nullable values to non-null column", - () -> { - sql("MERGE INTO %s t USING source s " + - "ON t.id == s.c1 " + - "WHEN MATCHED THEN " + - " UPDATE SET t.s.n1 = NULL", tableName); - }); - - AssertHelpers.assertThrows("Should complain about writing missing fields in structs", - AnalysisException.class, "missing fields", - () -> { - sql("MERGE INTO %s t USING source s " + - "ON t.id == s.c1 " + - "WHEN MATCHED THEN " + - " UPDATE SET t.s = s.c2", tableName); - }); - - AssertHelpers.assertThrows("Should complain about writing invalid data types", - AnalysisException.class, "Cannot safely cast", - () -> { - sql("MERGE INTO %s t USING source s " + - "ON t.id == s.c1 " + - "WHEN MATCHED THEN " + - " UPDATE SET t.s.n1 = s.c3", tableName); - }); - - AssertHelpers.assertThrows("Should complain about writing incompatible structs", - AnalysisException.class, "field name does not match", - () -> { - sql("MERGE INTO %s t USING source s " + - "ON t.id == s.c1 " + - "WHEN MATCHED THEN " + - " UPDATE SET t.s.n2 = s.c4", tableName); - }); - }); + for (String policy : new String[] {"ansi", "strict"}) { + withSQLConf( + ImmutableMap.of("spark.sql.storeAssignmentPolicy", policy), + () -> { + AssertHelpers.assertThrows( + "Should complain about writing nulls to a top-level column", + AnalysisException.class, + "Cannot write nullable values to non-null column", + () -> { + sql( + "MERGE INTO %s t USING source s " + + "ON t.id == s.c1 " + + "WHEN MATCHED THEN " + + " UPDATE SET t.id = NULL", + tableName); + }); + + AssertHelpers.assertThrows( + "Should complain about writing nulls to a nested column", + AnalysisException.class, + "Cannot write nullable values to non-null column", + () -> { + sql( + "MERGE INTO %s t USING source s " + + "ON t.id == s.c1 " + + "WHEN MATCHED THEN " + + " UPDATE SET t.s.n1 = NULL", + tableName); + }); + + AssertHelpers.assertThrows( + "Should complain about writing missing fields in structs", + AnalysisException.class, + "missing fields", + () -> { + sql( + "MERGE INTO %s t USING source s " + + "ON t.id == s.c1 " + + "WHEN MATCHED THEN " + + " UPDATE SET t.s = s.c2", + tableName); + }); + + AssertHelpers.assertThrows( + "Should complain about writing invalid data types", + 
AnalysisException.class, + "Cannot safely cast", + () -> { + sql( + "MERGE INTO %s t USING source s " + + "ON t.id == s.c1 " + + "WHEN MATCHED THEN " + + " UPDATE SET t.s.n1 = s.c3", + tableName); + }); + + AssertHelpers.assertThrows( + "Should complain about writing incompatible structs", + AnalysisException.class, + "field name does not match", + () -> { + sql( + "MERGE INTO %s t USING source s " + + "ON t.id == s.c1 " + + "WHEN MATCHED THEN " + + " UPDATE SET t.s.n2 = s.c4", + tableName); + }); + }); } } @@ -1285,40 +1564,56 @@ public void testMergeWithNonDeterministicConditions() { createAndInitTable("id INT, c STRUCT>"); createOrReplaceView("source", "{ \"c1\": -100, \"c2\": -200 }"); - AssertHelpers.assertThrows("Should complain about non-deterministic search conditions", - AnalysisException.class, "nondeterministic expressions are only allowed in", + AssertHelpers.assertThrows( + "Should complain about non-deterministic search conditions", + AnalysisException.class, + "nondeterministic expressions are only allowed in", () -> { - sql("MERGE INTO %s t USING source s " + - "ON t.id == s.c1 AND rand() > t.id " + - "WHEN MATCHED THEN " + - " UPDATE SET t.c.n1 = -1", tableName); + sql( + "MERGE INTO %s t USING source s " + + "ON t.id == s.c1 AND rand() > t.id " + + "WHEN MATCHED THEN " + + " UPDATE SET t.c.n1 = -1", + tableName); }); - AssertHelpers.assertThrows("Should complain about non-deterministic update conditions", - AnalysisException.class, "nondeterministic expressions are only allowed in", + AssertHelpers.assertThrows( + "Should complain about non-deterministic update conditions", + AnalysisException.class, + "nondeterministic expressions are only allowed in", () -> { - sql("MERGE INTO %s t USING source s " + - "ON t.id == s.c1 " + - "WHEN MATCHED AND rand() > t.id THEN " + - " UPDATE SET t.c.n1 = -1", tableName); + sql( + "MERGE INTO %s t USING source s " + + "ON t.id == s.c1 " + + "WHEN MATCHED AND rand() > t.id THEN " + + " UPDATE SET t.c.n1 = -1", + tableName); }); - AssertHelpers.assertThrows("Should complain about non-deterministic delete conditions", - AnalysisException.class, "nondeterministic expressions are only allowed in", + AssertHelpers.assertThrows( + "Should complain about non-deterministic delete conditions", + AnalysisException.class, + "nondeterministic expressions are only allowed in", () -> { - sql("MERGE INTO %s t USING source s " + - "ON t.id == s.c1 " + - "WHEN MATCHED AND rand() > t.id THEN " + - " DELETE", tableName); + sql( + "MERGE INTO %s t USING source s " + + "ON t.id == s.c1 " + + "WHEN MATCHED AND rand() > t.id THEN " + + " DELETE", + tableName); }); - AssertHelpers.assertThrows("Should complain about non-deterministic insert conditions", - AnalysisException.class, "nondeterministic expressions are only allowed in", + AssertHelpers.assertThrows( + "Should complain about non-deterministic insert conditions", + AnalysisException.class, + "nondeterministic expressions are only allowed in", () -> { - sql("MERGE INTO %s t USING source s " + - "ON t.id == s.c1 " + - "WHEN NOT MATCHED AND rand() > c1 THEN " + - " INSERT (id, c) VALUES (1, null)", tableName); + sql( + "MERGE INTO %s t USING source s " + + "ON t.id == s.c1 " + + "WHEN NOT MATCHED AND rand() > c1 THEN " + + " INSERT (id, c) VALUES (1, null)", + tableName); }); } @@ -1327,40 +1622,56 @@ public void testMergeWithAggregateExpressions() { createAndInitTable("id INT, c STRUCT>"); createOrReplaceView("source", "{ \"c1\": -100, \"c2\": -200 }"); - AssertHelpers.assertThrows("Should complain about 
agg expressions in search conditions", - AnalysisException.class, "contains one or more unsupported", + AssertHelpers.assertThrows( + "Should complain about agg expressions in search conditions", + AnalysisException.class, + "contains one or more unsupported", () -> { - sql("MERGE INTO %s t USING source s " + - "ON t.id == s.c1 AND max(t.id) == 1 " + - "WHEN MATCHED THEN " + - " UPDATE SET t.c.n1 = -1", tableName); + sql( + "MERGE INTO %s t USING source s " + + "ON t.id == s.c1 AND max(t.id) == 1 " + + "WHEN MATCHED THEN " + + " UPDATE SET t.c.n1 = -1", + tableName); }); - AssertHelpers.assertThrows("Should complain about agg expressions in update conditions", - AnalysisException.class, "contains one or more unsupported", + AssertHelpers.assertThrows( + "Should complain about agg expressions in update conditions", + AnalysisException.class, + "contains one or more unsupported", () -> { - sql("MERGE INTO %s t USING source s " + - "ON t.id == s.c1 " + - "WHEN MATCHED AND sum(t.id) < 1 THEN " + - " UPDATE SET t.c.n1 = -1", tableName); + sql( + "MERGE INTO %s t USING source s " + + "ON t.id == s.c1 " + + "WHEN MATCHED AND sum(t.id) < 1 THEN " + + " UPDATE SET t.c.n1 = -1", + tableName); }); - AssertHelpers.assertThrows("Should complain about non-deterministic delete conditions", - AnalysisException.class, "contains one or more unsupported", + AssertHelpers.assertThrows( + "Should complain about non-deterministic delete conditions", + AnalysisException.class, + "contains one or more unsupported", () -> { - sql("MERGE INTO %s t USING source s " + - "ON t.id == s.c1 " + - "WHEN MATCHED AND sum(t.id) THEN " + - " DELETE", tableName); + sql( + "MERGE INTO %s t USING source s " + + "ON t.id == s.c1 " + + "WHEN MATCHED AND sum(t.id) THEN " + + " DELETE", + tableName); }); - AssertHelpers.assertThrows("Should complain about non-deterministic insert conditions", - AnalysisException.class, "contains one or more unsupported", + AssertHelpers.assertThrows( + "Should complain about non-deterministic insert conditions", + AnalysisException.class, + "contains one or more unsupported", () -> { - sql("MERGE INTO %s t USING source s " + - "ON t.id == s.c1 " + - "WHEN NOT MATCHED AND sum(c1) < 1 THEN " + - " INSERT (id, c) VALUES (1, null)", tableName); + sql( + "MERGE INTO %s t USING source s " + + "ON t.id == s.c1 " + + "WHEN NOT MATCHED AND sum(c1) < 1 THEN " + + " INSERT (id, c) VALUES (1, null)", + tableName); }); } @@ -1369,40 +1680,56 @@ public void testMergeWithSubqueriesInConditions() { createAndInitTable("id INT, c STRUCT>"); createOrReplaceView("source", "{ \"c1\": -100, \"c2\": -200 }"); - AssertHelpers.assertThrows("Should complain about subquery expressions", - AnalysisException.class, "Subqueries are not supported in conditions", + AssertHelpers.assertThrows( + "Should complain about subquery expressions", + AnalysisException.class, + "Subqueries are not supported in conditions", () -> { - sql("MERGE INTO %s t USING source s " + - "ON t.id == s.c1 AND t.id < (SELECT max(c2) FROM source) " + - "WHEN MATCHED THEN " + - " UPDATE SET t.c.n1 = s.c2", tableName); + sql( + "MERGE INTO %s t USING source s " + + "ON t.id == s.c1 AND t.id < (SELECT max(c2) FROM source) " + + "WHEN MATCHED THEN " + + " UPDATE SET t.c.n1 = s.c2", + tableName); }); - AssertHelpers.assertThrows("Should complain about subquery expressions", - AnalysisException.class, "Subqueries are not supported in conditions", + AssertHelpers.assertThrows( + "Should complain about subquery expressions", + AnalysisException.class, + "Subqueries 
are not supported in conditions", () -> { - sql("MERGE INTO %s t USING source s " + - "ON t.id == s.c1 " + - "WHEN MATCHED AND t.id < (SELECT max(c2) FROM source) THEN " + - " UPDATE SET t.c.n1 = s.c2", tableName); + sql( + "MERGE INTO %s t USING source s " + + "ON t.id == s.c1 " + + "WHEN MATCHED AND t.id < (SELECT max(c2) FROM source) THEN " + + " UPDATE SET t.c.n1 = s.c2", + tableName); }); - AssertHelpers.assertThrows("Should complain about subquery expressions", - AnalysisException.class, "Subqueries are not supported in conditions", + AssertHelpers.assertThrows( + "Should complain about subquery expressions", + AnalysisException.class, + "Subqueries are not supported in conditions", () -> { - sql("MERGE INTO %s t USING source s " + - "ON t.id == s.c1 " + - "WHEN MATCHED AND t.id NOT IN (SELECT c2 FROM source) THEN " + - " DELETE", tableName); + sql( + "MERGE INTO %s t USING source s " + + "ON t.id == s.c1 " + + "WHEN MATCHED AND t.id NOT IN (SELECT c2 FROM source) THEN " + + " DELETE", + tableName); }); - AssertHelpers.assertThrows("Should complain about subquery expressions", - AnalysisException.class, "Subqueries are not supported in conditions", + AssertHelpers.assertThrows( + "Should complain about subquery expressions", + AnalysisException.class, + "Subqueries are not supported in conditions", () -> { - sql("MERGE INTO %s t USING source s " + - "ON t.id == s.c1 " + - "WHEN NOT MATCHED AND s.c1 IN (SELECT c2 FROM source) THEN " + - " INSERT (id, c) VALUES (1, null)", tableName); + sql( + "MERGE INTO %s t USING source s " + + "ON t.id == s.c1 " + + "WHEN NOT MATCHED AND s.c1 IN (SELECT c2 FROM source) THEN " + + " INSERT (id, c) VALUES (1, null)", + tableName); }); } @@ -1411,13 +1738,17 @@ public void testMergeWithTargetColumnsInInsertCondtions() { createAndInitTable("id INT, c2 INT"); createOrReplaceView("source", "{ \"id\": 1, \"value\": 11 }"); - AssertHelpers.assertThrows("Should complain about the target column", - AnalysisException.class, "cannot resolve '`c2`'", + AssertHelpers.assertThrows( + "Should complain about the target column", + AnalysisException.class, + "cannot resolve '`c2`'", () -> { - sql("MERGE INTO %s t USING source s " + - "ON t.id == s.id " + - "WHEN NOT MATCHED AND c2 = 1 THEN " + - " INSERT (id, c2) VALUES (s.id, null)", tableName); + sql( + "MERGE INTO %s t USING source s " + + "ON t.id == s.id " + + "WHEN NOT MATCHED AND c2 = 1 THEN " + + " INSERT (id, c2) VALUES (s.id, null)", + tableName); }); } @@ -1426,19 +1757,22 @@ public void testMergeWithNonIcebergTargetTableNotSupported() { createOrReplaceView("target", "{ \"c1\": -100, \"c2\": -200 }"); createOrReplaceView("source", "{ \"c1\": -100, \"c2\": -200 }"); - AssertHelpers.assertThrows("Should complain non iceberg target table", - UnsupportedOperationException.class, "MERGE INTO TABLE is not supported temporarily.", + AssertHelpers.assertThrows( + "Should complain non iceberg target table", + UnsupportedOperationException.class, + "MERGE INTO TABLE is not supported temporarily.", () -> { - sql("MERGE INTO target t USING source s " + - "ON t.c1 == s.c1 " + - "WHEN MATCHED THEN " + - " UPDATE SET *"); + sql( + "MERGE INTO target t USING source s " + + "ON t.c1 == s.c1 " + + "WHEN MATCHED THEN " + + " UPDATE SET *"); }); } /** - * Tests a merge where both the source and target are evaluated to be partitioned by SingePartition at planning time - * but DynamicFileFilterExec will return an empty target. 
+ * Tests a merge where both the source and target are evaluated to be partitioned by + * SingePartition at planning time but DynamicFileFilterExec will return an empty target. */ @Test public void testMergeSinglePartitionPartitioning() { @@ -1448,19 +1782,14 @@ public void testMergeSinglePartitionPartitioning() { // Coalesce forces our source into a SinglePartition distribution spark.range(0, 5).coalesce(1).createOrReplaceTempView("source"); - sql("MERGE INTO %s t USING source s ON t.id = s.id " + - "WHEN MATCHED THEN UPDATE SET *" + - "WHEN NOT MATCHED THEN INSERT *", + sql( + "MERGE INTO %s t USING source s ON t.id = s.id " + + "WHEN MATCHED THEN UPDATE SET *" + + "WHEN NOT MATCHED THEN INSERT *", tableName); - ImmutableList expectedRows = ImmutableList.of( - row(-1), - row(0), - row(1), - row(2), - row(3), - row(4) - ); + ImmutableList expectedRows = + ImmutableList.of(row(-1), row(0), row(1), row(2), row(3), row(4)); List result = sql("SELECT * FROM %s ORDER BY id", tableName); assertEquals("Should correctly add the non-matching rows", expectedRows, result); @@ -1474,18 +1803,13 @@ public void testMergeEmptyTable() { // Coalesce forces our source into a SinglePartition distribution spark.range(0, 5).coalesce(1).createOrReplaceTempView("source"); - sql("MERGE INTO %s t USING source s ON t.id = s.id " + - "WHEN MATCHED THEN UPDATE SET *" + - "WHEN NOT MATCHED THEN INSERT *", + sql( + "MERGE INTO %s t USING source s ON t.id = s.id " + + "WHEN MATCHED THEN UPDATE SET *" + + "WHEN NOT MATCHED THEN INSERT *", tableName); - ImmutableList expectedRows = ImmutableList.of( - row(0), - row(1), - row(2), - row(3), - row(4) - ); + ImmutableList expectedRows = ImmutableList.of(row(0), row(1), row(2), row(3), row(4)); List result = sql("SELECT * FROM %s ORDER BY id", tableName); assertEquals("Should correctly add the non-matching rows", expectedRows, result); @@ -1497,16 +1821,20 @@ public void testFileFilterMetric() throws Exception { spark.sql(String.format("INSERT INTO %s VALUES (1, 'emp-id-one')", tableName)); spark.sql(String.format("INSERT INTO %s VALUES (6, 'emp-id-six')", tableName)); - createOrReplaceView("source", "id INT, dep STRING", - "{ \"id\": 6, \"dep\": \"emp-id-6\" }"); + createOrReplaceView("source", "id INT, dep STRING", "{ \"id\": 6, \"dep\": \"emp-id-6\" }"); Map expectedMetrics = Maps.newHashMap(); expectedMetrics.put("candidate files", "2"); expectedMetrics.put("matching files", "1"); - checkMetrics(() -> spark.sql(String.format( - "MERGE INTO %s AS t USING source AS s " + - "ON t.id == s.id " + - "WHEN MATCHED THEN UPDATE SET * ", tableName)), expectedMetrics); + checkMetrics( + () -> + spark.sql( + String.format( + "MERGE INTO %s AS t USING source AS s " + + "ON t.id == s.id " + + "WHEN MATCHED THEN UPDATE SET * ", + tableName)), + expectedMetrics); } } diff --git a/spark/v3.0/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestMigrateTableProcedure.java b/spark/v3.0/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestMigrateTableProcedure.java index d66e75add16f..f9c150a3b1dc 100644 --- a/spark/v3.0/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestMigrateTableProcedure.java +++ b/spark/v3.0/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestMigrateTableProcedure.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.spark.extensions; import java.io.IOException; @@ -35,12 +34,12 @@ public class TestMigrateTableProcedure extends SparkExtensionsTestBase { - public TestMigrateTableProcedure(String catalogName, String implementation, Map config) { + public TestMigrateTableProcedure( + String catalogName, String implementation, Map config) { super(catalogName, implementation, config); } - @Rule - public TemporaryFolder temp = new TemporaryFolder(); + @Rule public TemporaryFolder temp = new TemporaryFolder(); @After public void removeTables() { @@ -52,7 +51,9 @@ public void removeTables() { public void testMigrate() throws IOException { Assume.assumeTrue(catalogName.equals("spark_catalog")); String location = temp.newFolder().toString(); - sql("CREATE TABLE %s (id bigint NOT NULL, data string) USING parquet LOCATION '%s'", tableName, location); + sql( + "CREATE TABLE %s (id bigint NOT NULL, data string) USING parquet LOCATION '%s'", + tableName, location); sql("INSERT INTO TABLE %s VALUES (1, 'a')", tableName); Object result = scalarSql("CALL %s.system.migrate('%s')", catalogName, tableName); @@ -65,7 +66,8 @@ public void testMigrate() throws IOException { sql("INSERT INTO TABLE %s VALUES (1, 'a')", tableName); - assertEquals("Should have expected rows", + assertEquals( + "Should have expected rows", ImmutableList.of(row(1L, "a"), row(1L, "a")), sql("SELECT * FROM %s ORDER BY id", tableName)); @@ -76,10 +78,13 @@ public void testMigrate() throws IOException { public void testMigrateWithOptions() throws IOException { Assume.assumeTrue(catalogName.equals("spark_catalog")); String location = temp.newFolder().toString(); - sql("CREATE TABLE %s (id bigint NOT NULL, data string) USING parquet LOCATION '%s'", tableName, location); + sql( + "CREATE TABLE %s (id bigint NOT NULL, data string) USING parquet LOCATION '%s'", + tableName, location); sql("INSERT INTO TABLE %s VALUES (1, 'a')", tableName); - Object result = scalarSql("CALL %s.system.migrate('%s', map('foo', 'bar'))", catalogName, tableName); + Object result = + scalarSql("CALL %s.system.migrate('%s', map('foo', 'bar'))", catalogName, tableName); Assert.assertEquals("Should have added one file", 1L, result); @@ -93,7 +98,8 @@ public void testMigrateWithOptions() throws IOException { sql("INSERT INTO TABLE %s VALUES (1, 'a')", tableName); - assertEquals("Should have expected rows", + assertEquals( + "Should have expected rows", ImmutableList.of(row(1L, "a"), row(1L, "a")), sql("SELECT * FROM %s ORDER BY id", tableName)); @@ -105,10 +111,14 @@ public void testMigrateWithInvalidMetricsConfig() throws IOException { Assume.assumeTrue(catalogName.equals("spark_catalog")); String location = temp.newFolder().toString(); - sql("CREATE TABLE %s (id bigint NOT NULL, data string) USING parquet LOCATION '%s'", tableName, location); - - AssertHelpers.assertThrows("Should reject invalid metrics config", - ValidationException.class, "Invalid metrics config", + sql( + "CREATE TABLE %s (id bigint NOT NULL, data string) USING parquet LOCATION '%s'", + tableName, location); + + AssertHelpers.assertThrows( + "Should reject invalid metrics config", + ValidationException.class, + "Invalid metrics config", () -> { String props = "map('write.metadata.metrics.column.x', 'X')"; sql("CALL %s.system.migrate('%s', %s)", catalogName, tableName, props); @@ -120,13 +130,17 @@ public void testMigrateWithConflictingProps() throws IOException { Assume.assumeTrue(catalogName.equals("spark_catalog")); String location = temp.newFolder().toString(); - sql("CREATE 
TABLE %s (id bigint NOT NULL, data string) USING parquet LOCATION '%s'", tableName, location); + sql( + "CREATE TABLE %s (id bigint NOT NULL, data string) USING parquet LOCATION '%s'", + tableName, location); sql("INSERT INTO TABLE %s VALUES (1, 'a')", tableName); - Object result = scalarSql("CALL %s.system.migrate('%s', map('migrated', 'false'))", catalogName, tableName); + Object result = + scalarSql("CALL %s.system.migrate('%s', map('migrated', 'false'))", catalogName, tableName); Assert.assertEquals("Should have added one file", 1L, result); - assertEquals("Should have expected rows", + assertEquals( + "Should have expected rows", ImmutableList.of(row(1L, "a")), sql("SELECT * FROM %s", tableName)); @@ -136,16 +150,22 @@ public void testMigrateWithConflictingProps() throws IOException { @Test public void testInvalidMigrateCases() { - AssertHelpers.assertThrows("Should reject calls without all required args", - AnalysisException.class, "Missing required parameters", + AssertHelpers.assertThrows( + "Should reject calls without all required args", + AnalysisException.class, + "Missing required parameters", () -> sql("CALL %s.system.migrate()", catalogName)); - AssertHelpers.assertThrows("Should reject calls with invalid arg types", - AnalysisException.class, "Wrong arg type", + AssertHelpers.assertThrows( + "Should reject calls with invalid arg types", + AnalysisException.class, + "Wrong arg type", () -> sql("CALL %s.system.migrate(map('foo','bar'))", catalogName)); - AssertHelpers.assertThrows("Should reject calls with empty table identifier", - IllegalArgumentException.class, "Cannot handle an empty identifier", + AssertHelpers.assertThrows( + "Should reject calls with empty table identifier", + IllegalArgumentException.class, + "Cannot handle an empty identifier", () -> sql("CALL %s.system.migrate('')", catalogName)); } } diff --git a/spark/v3.0/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestPublishChangesProcedure.java b/spark/v3.0/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestPublishChangesProcedure.java index f8080818a1e3..2b74cd475fae 100644 --- a/spark/v3.0/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestPublishChangesProcedure.java +++ b/spark/v3.0/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestPublishChangesProcedure.java @@ -16,9 +16,10 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.spark.extensions; +import static org.apache.iceberg.TableProperties.WRITE_AUDIT_PUBLISH_ENABLED; + import java.util.List; import java.util.Map; import org.apache.iceberg.AssertHelpers; @@ -34,11 +35,10 @@ import org.junit.After; import org.junit.Test; -import static org.apache.iceberg.TableProperties.WRITE_AUDIT_PUBLISH_ENABLED; - public class TestPublishChangesProcedure extends SparkExtensionsTestBase { - public TestPublishChangesProcedure(String catalogName, String implementation, Map config) { + public TestPublishChangesProcedure( + String catalogName, String implementation, Map config) { super(catalogName, implementation, config); } @@ -57,26 +57,28 @@ public void testApplyWapChangesUsingPositionalArgs() { sql("INSERT INTO TABLE %s VALUES (1, 'a')", tableName); - assertEquals("Should not see rows from staged snapshot", + assertEquals( + "Should not see rows from staged snapshot", ImmutableList.of(), sql("SELECT * FROM %s", tableName)); Table table = validationCatalog.loadTable(tableIdent); Snapshot wapSnapshot = Iterables.getOnlyElement(table.snapshots()); - List output = sql( - "CALL %s.system.publish_changes('%s', '%s')", - catalogName, tableIdent, wapId); + List output = + sql("CALL %s.system.publish_changes('%s', '%s')", catalogName, tableIdent, wapId); table.refresh(); Snapshot currentSnapshot = table.currentSnapshot(); - assertEquals("Procedure output must match", + assertEquals( + "Procedure output must match", ImmutableList.of(row(wapSnapshot.snapshotId(), currentSnapshot.snapshotId())), output); - assertEquals("Apply of WAP changes must be successful", + assertEquals( + "Apply of WAP changes must be successful", ImmutableList.of(row(1L, "a")), sql("SELECT * FROM %s", tableName)); } @@ -91,26 +93,30 @@ public void testApplyWapChangesUsingNamedArgs() { sql("INSERT INTO TABLE %s VALUES (1, 'a')", tableName); - assertEquals("Should not see rows from staged snapshot", + assertEquals( + "Should not see rows from staged snapshot", ImmutableList.of(), sql("SELECT * FROM %s", tableName)); Table table = validationCatalog.loadTable(tableIdent); Snapshot wapSnapshot = Iterables.getOnlyElement(table.snapshots()); - List output = sql( - "CALL %s.system.publish_changes(wap_id => '%s', table => '%s')", - catalogName, wapId, tableIdent); + List output = + sql( + "CALL %s.system.publish_changes(wap_id => '%s', table => '%s')", + catalogName, wapId, tableIdent); table.refresh(); Snapshot currentSnapshot = table.currentSnapshot(); - assertEquals("Procedure output must match", + assertEquals( + "Procedure output must match", ImmutableList.of(row(wapSnapshot.snapshotId(), currentSnapshot.snapshotId())), output); - assertEquals("Apply of WAP changes must be successful", + assertEquals( + "Apply of WAP changes must be successful", ImmutableList.of(row(1L, "a")), sql("SELECT * FROM %s", tableName)); } @@ -132,14 +138,15 @@ public void testApplyWapChangesRefreshesRelationCache() { sql("INSERT INTO TABLE %s VALUES (1, 'a')", tableName); - assertEquals("Should not see rows from staged snapshot", + assertEquals( + "Should not see rows from staged snapshot", ImmutableList.of(), sql("SELECT * FROM %s", tableName)); - sql("CALL %s.system.publish_changes('%s', '%s')", - catalogName, tableIdent, wapId); + sql("CALL %s.system.publish_changes('%s', '%s')", catalogName, tableIdent, wapId); - assertEquals("Apply of WAP changes should be visible", + assertEquals( + "Apply of WAP changes should be visible", ImmutableList.of(row(1L, "a")), sql("SELECT * FROM tmp")); @@ -150,27 +157,37 
@@ public void testApplyWapChangesRefreshesRelationCache() { public void testApplyInvalidWapId() { sql("CREATE TABLE %s (id bigint NOT NULL, data string) USING iceberg", tableName); - AssertHelpers.assertThrows("Should reject invalid wap id", - ValidationException.class, "Cannot apply unknown WAP ID", + AssertHelpers.assertThrows( + "Should reject invalid wap id", + ValidationException.class, + "Cannot apply unknown WAP ID", () -> sql("CALL %s.system.publish_changes('%s', 'not_valid')", catalogName, tableIdent)); } @Test public void testInvalidApplyWapChangesCases() { - AssertHelpers.assertThrows("Should not allow mixed args", - AnalysisException.class, "Named and positional arguments cannot be mixed", + AssertHelpers.assertThrows( + "Should not allow mixed args", + AnalysisException.class, + "Named and positional arguments cannot be mixed", () -> sql("CALL %s.system.publish_changes('n', table => 't', 'not_valid')", catalogName)); - AssertHelpers.assertThrows("Should not resolve procedures in arbitrary namespaces", - NoSuchProcedureException.class, "not found", + AssertHelpers.assertThrows( + "Should not resolve procedures in arbitrary namespaces", + NoSuchProcedureException.class, + "not found", () -> sql("CALL %s.custom.publish_changes('n', 't', 'not_valid')", catalogName)); - AssertHelpers.assertThrows("Should reject calls without all required args", - AnalysisException.class, "Missing required parameters", + AssertHelpers.assertThrows( + "Should reject calls without all required args", + AnalysisException.class, + "Missing required parameters", () -> sql("CALL %s.system.publish_changes('t')", catalogName)); - AssertHelpers.assertThrows("Should reject calls with empty table identifier", - IllegalArgumentException.class, "Cannot handle an empty identifier", + AssertHelpers.assertThrows( + "Should reject calls with empty table identifier", + IllegalArgumentException.class, + "Cannot handle an empty identifier", () -> sql("CALL %s.system.publish_changes('', 'not_valid')", catalogName)); } } diff --git a/spark/v3.0/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestRemoveOrphanFilesProcedure.java b/spark/v3.0/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestRemoveOrphanFilesProcedure.java index 724e17e50a2c..fa43cf0e276c 100644 --- a/spark/v3.0/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestRemoveOrphanFilesProcedure.java +++ b/spark/v3.0/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestRemoveOrphanFilesProcedure.java @@ -16,9 +16,11 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.spark.extensions; +import static org.apache.iceberg.TableProperties.GC_ENABLED; +import static org.apache.iceberg.TableProperties.WRITE_AUDIT_PUBLISH_ENABLED; + import java.io.IOException; import java.sql.Timestamp; import java.time.Instant; @@ -36,15 +38,12 @@ import org.junit.Test; import org.junit.rules.TemporaryFolder; -import static org.apache.iceberg.TableProperties.GC_ENABLED; -import static org.apache.iceberg.TableProperties.WRITE_AUDIT_PUBLISH_ENABLED; - public class TestRemoveOrphanFilesProcedure extends SparkExtensionsTestBase { - @Rule - public TemporaryFolder temp = new TemporaryFolder(); + @Rule public TemporaryFolder temp = new TemporaryFolder(); - public TestRemoveOrphanFilesProcedure(String catalogName, String implementation, Map config) { + public TestRemoveOrphanFilesProcedure( + String catalogName, String implementation, Map config) { super(catalogName, implementation, config); } @@ -58,14 +57,11 @@ public void removeTable() { public void testRemoveOrphanFilesInEmptyTable() { sql("CREATE TABLE %s (id bigint NOT NULL, data string) USING iceberg", tableName); - List output = sql( - "CALL %s.system.remove_orphan_files('%s')", - catalogName, tableIdent); + List output = + sql("CALL %s.system.remove_orphan_files('%s')", catalogName, tableIdent); assertEquals("Should be no orphan files", ImmutableList.of(), output); - assertEquals("Should have no rows", - ImmutableList.of(), - sql("SELECT * FROM %s", tableName)); + assertEquals("Should have no rows", ImmutableList.of(), sql("SELECT * FROM %s", tableName)); } @Test @@ -75,7 +71,8 @@ public void testRemoveOrphanFilesInDataFolder() throws IOException { } else { // give a fresh location to Hive tables as Spark will not clean up the table location // correctly while dropping tables through spark_catalog - sql("CREATE TABLE %s (id bigint NOT NULL, data string) USING iceberg LOCATION '%s'", + sql( + "CREATE TABLE %s (id bigint NOT NULL, data string) USING iceberg LOCATION '%s'", tableName, temp.newFolder()); } @@ -97,31 +94,35 @@ public void testRemoveOrphanFilesInDataFolder() throws IOException { Timestamp currentTimestamp = Timestamp.from(Instant.ofEpochMilli(System.currentTimeMillis())); // check for orphans in the metadata folder - List output1 = sql( - "CALL %s.system.remove_orphan_files(" + - "table => '%s'," + - "older_than => TIMESTAMP '%s'," + - "location => '%s')", - catalogName, tableIdent, currentTimestamp, metadataLocation); + List output1 = + sql( + "CALL %s.system.remove_orphan_files(" + + "table => '%s'," + + "older_than => TIMESTAMP '%s'," + + "location => '%s')", + catalogName, tableIdent, currentTimestamp, metadataLocation); assertEquals("Should be no orphan files in the metadata folder", ImmutableList.of(), output1); // check for orphans in the table location - List output2 = sql( - "CALL %s.system.remove_orphan_files(" + - "table => '%s'," + - "older_than => TIMESTAMP '%s')", - catalogName, tableIdent, currentTimestamp); + List output2 = + sql( + "CALL %s.system.remove_orphan_files(" + + "table => '%s'," + + "older_than => TIMESTAMP '%s')", + catalogName, tableIdent, currentTimestamp); Assert.assertEquals("Should be orphan files in the data folder", 1, output2.size()); // the previous call should have deleted all orphan files - List output3 = sql( - "CALL %s.system.remove_orphan_files(" + - "table => '%s'," + - "older_than => TIMESTAMP '%s')", - catalogName, tableIdent, currentTimestamp); + List output3 = + sql( + "CALL %s.system.remove_orphan_files(" + + "table => '%s'," + + 
"older_than => TIMESTAMP '%s')", + catalogName, tableIdent, currentTimestamp); Assert.assertEquals("Should be no more orphan files in the data folder", 0, output3.size()); - assertEquals("Should have expected rows", + assertEquals( + "Should have expected rows", ImmutableList.of(row(1L, "a"), row(2L, "b")), sql("SELECT * FROM %s ORDER BY id", tableName)); } @@ -133,7 +134,8 @@ public void testRemoveOrphanFilesDryRun() throws IOException { } else { // give a fresh location to Hive tables as Spark will not clean up the table location // correctly while dropping tables through spark_catalog - sql("CREATE TABLE %s (id bigint NOT NULL, data string) USING iceberg LOCATION '%s'", + sql( + "CREATE TABLE %s (id bigint NOT NULL, data string) USING iceberg LOCATION '%s'", tableName, temp.newFolder()); } @@ -152,31 +154,35 @@ public void testRemoveOrphanFilesDryRun() throws IOException { Timestamp currentTimestamp = Timestamp.from(Instant.ofEpochMilli(System.currentTimeMillis())); // check for orphans without deleting - List output1 = sql( - "CALL %s.system.remove_orphan_files(" + - "table => '%s'," + - "older_than => TIMESTAMP '%s'," + - "dry_run => true)", - catalogName, tableIdent, currentTimestamp); + List output1 = + sql( + "CALL %s.system.remove_orphan_files(" + + "table => '%s'," + + "older_than => TIMESTAMP '%s'," + + "dry_run => true)", + catalogName, tableIdent, currentTimestamp); Assert.assertEquals("Should be one orphan files", 1, output1.size()); // actually delete orphans - List output2 = sql( - "CALL %s.system.remove_orphan_files(" + - "table => '%s'," + - "older_than => TIMESTAMP '%s')", - catalogName, tableIdent, currentTimestamp); + List output2 = + sql( + "CALL %s.system.remove_orphan_files(" + + "table => '%s'," + + "older_than => TIMESTAMP '%s')", + catalogName, tableIdent, currentTimestamp); Assert.assertEquals("Should be one orphan files", 1, output2.size()); // the previous call should have deleted all orphan files - List output3 = sql( - "CALL %s.system.remove_orphan_files(" + - "table => '%s'," + - "older_than => TIMESTAMP '%s')", - catalogName, tableIdent, currentTimestamp); + List output3 = + sql( + "CALL %s.system.remove_orphan_files(" + + "table => '%s'," + + "older_than => TIMESTAMP '%s')", + catalogName, tableIdent, currentTimestamp); Assert.assertEquals("Should be no more orphan files", 0, output3.size()); - assertEquals("Should have expected rows", + assertEquals( + "Should have expected rows", ImmutableList.of(row(1L, "a"), row(2L, "b")), sql("SELECT * FROM %s ORDER BY id", tableName)); } @@ -187,8 +193,10 @@ public void testRemoveOrphanFilesGCDisabled() { sql("ALTER TABLE %s SET TBLPROPERTIES ('%s' 'false')", tableName, GC_ENABLED); - AssertHelpers.assertThrows("Should reject call", - ValidationException.class, "Cannot remove orphan files: GC is disabled", + AssertHelpers.assertThrows( + "Should reject call", + ValidationException.class, + "Cannot remove orphan files: GC is disabled", () -> sql("CALL %s.system.remove_orphan_files('%s')", catalogName, tableIdent)); } @@ -201,35 +209,46 @@ public void testRemoveOrphanFilesWap() { sql("INSERT INTO TABLE %s VALUES (1, 'a')", tableName); - assertEquals("Should not see rows from staged snapshot", + assertEquals( + "Should not see rows from staged snapshot", ImmutableList.of(), sql("SELECT * FROM %s", tableName)); - List output = sql( - "CALL %s.system.remove_orphan_files('%s')", catalogName, tableIdent); + List output = + sql("CALL %s.system.remove_orphan_files('%s')", catalogName, tableIdent); assertEquals("Should be no 
orphan files", ImmutableList.of(), output); } @Test public void testInvalidRemoveOrphanFilesCases() { - AssertHelpers.assertThrows("Should not allow mixed args", - AnalysisException.class, "Named and positional arguments cannot be mixed", + AssertHelpers.assertThrows( + "Should not allow mixed args", + AnalysisException.class, + "Named and positional arguments cannot be mixed", () -> sql("CALL %s.system.remove_orphan_files('n', table => 't')", catalogName)); - AssertHelpers.assertThrows("Should not resolve procedures in arbitrary namespaces", - NoSuchProcedureException.class, "not found", + AssertHelpers.assertThrows( + "Should not resolve procedures in arbitrary namespaces", + NoSuchProcedureException.class, + "not found", () -> sql("CALL %s.custom.remove_orphan_files('n', 't')", catalogName)); - AssertHelpers.assertThrows("Should reject calls without all required args", - AnalysisException.class, "Missing required parameters", + AssertHelpers.assertThrows( + "Should reject calls without all required args", + AnalysisException.class, + "Missing required parameters", () -> sql("CALL %s.system.remove_orphan_files()", catalogName)); - AssertHelpers.assertThrows("Should reject calls with invalid arg types", - AnalysisException.class, "Wrong arg type", + AssertHelpers.assertThrows( + "Should reject calls with invalid arg types", + AnalysisException.class, + "Wrong arg type", () -> sql("CALL %s.system.remove_orphan_files('n', 2.2)", catalogName)); - AssertHelpers.assertThrows("Should reject calls with empty table identifier", - IllegalArgumentException.class, "Cannot handle an empty identifier", + AssertHelpers.assertThrows( + "Should reject calls with empty table identifier", + IllegalArgumentException.class, + "Cannot handle an empty identifier", () -> sql("CALL %s.system.remove_orphan_files('')", catalogName)); } @@ -240,7 +259,8 @@ public void testConcurrentRemoveOrphanFiles() throws IOException { } else { // give a fresh location to Hive tables as Spark will not clean up the table location // correctly while dropping tables through spark_catalog - sql("CREATE TABLE %s (id bigint NOT NULL, data string) USING iceberg LOCATION '%s'", + sql( + "CREATE TABLE %s (id bigint NOT NULL, data string) USING iceberg LOCATION '%s'", tableName, temp.newFolder()); } @@ -265,21 +285,23 @@ public void testConcurrentRemoveOrphanFiles() throws IOException { Timestamp currentTimestamp = Timestamp.from(Instant.ofEpochMilli(System.currentTimeMillis())); // check for orphans in the table location - List output = sql( - "CALL %s.system.remove_orphan_files(" + - "table => '%s'," + - "max_concurrent_deletes => %s," + - "older_than => TIMESTAMP '%s')", - catalogName, tableIdent, 4, currentTimestamp); + List output = + sql( + "CALL %s.system.remove_orphan_files(" + + "table => '%s'," + + "max_concurrent_deletes => %s," + + "older_than => TIMESTAMP '%s')", + catalogName, tableIdent, 4, currentTimestamp); Assert.assertEquals("Should be orphan files in the data folder", 4, output.size()); // the previous call should have deleted all orphan files - List output3 = sql( - "CALL %s.system.remove_orphan_files(" + - "table => '%s'," + - "max_concurrent_deletes => %s," + - "older_than => TIMESTAMP '%s')", - catalogName, tableIdent, 4, currentTimestamp); + List output3 = + sql( + "CALL %s.system.remove_orphan_files(" + + "table => '%s'," + + "max_concurrent_deletes => %s," + + "older_than => TIMESTAMP '%s')", + catalogName, tableIdent, 4, currentTimestamp); Assert.assertEquals("Should be no more orphan files in the data folder", 
0, output3.size()); assertEquals( @@ -292,15 +314,22 @@ public void testConcurrentRemoveOrphanFiles() throws IOException { public void testConcurrentRemoveOrphanFilesWithInvalidInput() { sql("CREATE TABLE %s (id bigint NOT NULL, data string) USING iceberg", tableName); - AssertHelpers.assertThrows("Should throw an error when max_concurrent_deletes = 0", - IllegalArgumentException.class, "max_concurrent_deletes should have value > 0", - () -> sql("CALL %s.system.remove_orphan_files(table => '%s', max_concurrent_deletes => %s)", - catalogName, tableIdent, 0)); - - AssertHelpers.assertThrows("Should throw an error when max_concurrent_deletes < 0 ", - IllegalArgumentException.class, "max_concurrent_deletes should have value > 0", - () -> sql( - "CALL %s.system.remove_orphan_files(table => '%s', max_concurrent_deletes => %s)", - catalogName, tableIdent, -1)); + AssertHelpers.assertThrows( + "Should throw an error when max_concurrent_deletes = 0", + IllegalArgumentException.class, + "max_concurrent_deletes should have value > 0", + () -> + sql( + "CALL %s.system.remove_orphan_files(table => '%s', max_concurrent_deletes => %s)", + catalogName, tableIdent, 0)); + + AssertHelpers.assertThrows( + "Should throw an error when max_concurrent_deletes < 0 ", + IllegalArgumentException.class, + "max_concurrent_deletes should have value > 0", + () -> + sql( + "CALL %s.system.remove_orphan_files(table => '%s', max_concurrent_deletes => %s)", + catalogName, tableIdent, -1)); } } diff --git a/spark/v3.0/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestRewriteDataFilesProcedure.java b/spark/v3.0/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestRewriteDataFilesProcedure.java index a6fa5e45eb7e..5e9ace36791f 100644 --- a/spark/v3.0/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestRewriteDataFilesProcedure.java +++ b/spark/v3.0/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestRewriteDataFilesProcedure.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.spark.extensions; import java.util.List; @@ -34,10 +33,10 @@ import org.junit.After; import org.junit.Test; - public class TestRewriteDataFilesProcedure extends SparkExtensionsTestBase { - public TestRewriteDataFilesProcedure(String catalogName, String implementation, Map config) { + public TestRewriteDataFilesProcedure( + String catalogName, String implementation, Map config) { super(catalogName, implementation, config); } @@ -49,11 +48,8 @@ public void removeTable() { @Test public void testRewriteDataFilesInEmptyTable() { createTable(); - List output = sql( - "CALL %s.system.rewrite_data_files('%s')", catalogName, tableIdent); - assertEquals("Procedure output must match", - ImmutableList.of(row(0, 0)), - output); + List output = sql("CALL %s.system.rewrite_data_files('%s')", catalogName, tableIdent); + assertEquals("Procedure output must match", ImmutableList.of(row(0, 0)), output); } @Test @@ -63,10 +59,11 @@ public void testRewriteDataFilesOnPartitionTable() { insertData(10); List expectedRecords = currentData(); - List output = sql( - "CALL %s.system.rewrite_data_files(table => '%s')", catalogName, tableIdent); + List output = + sql("CALL %s.system.rewrite_data_files(table => '%s')", catalogName, tableIdent); - assertEquals("Action should rewrite 10 data files and add 2 data files (one per partition) ", + assertEquals( + "Action should rewrite 10 data files and add 2 data files (one per partition) ", ImmutableList.of(row(10, 2)), output); @@ -81,10 +78,11 @@ public void testRewriteDataFilesOnNonPartitionTable() { insertData(10); List expectedRecords = currentData(); - List output = sql( - "CALL %s.system.rewrite_data_files(table => '%s')", catalogName, tableIdent); + List output = + sql("CALL %s.system.rewrite_data_files(table => '%s')", catalogName, tableIdent); - assertEquals("Action should rewrite 10 data files and add 1 data files", + assertEquals( + "Action should rewrite 10 data files and add 1 data files", ImmutableList.of(row(10, 1)), output); @@ -100,11 +98,13 @@ public void testRewriteDataFilesWithOptions() { List expectedRecords = currentData(); // set the min-input-files = 12, instead of default 5 to skip compacting the files. 
- List output = sql( - "CALL %s.system.rewrite_data_files(table => '%s', options => map('min-input-files','12'))", - catalogName, tableIdent); + List output = + sql( + "CALL %s.system.rewrite_data_files(table => '%s', options => map('min-input-files','12'))", + catalogName, tableIdent); - assertEquals("Action should rewrite 0 data files and add 0 data files", + assertEquals( + "Action should rewrite 0 data files and add 0 data files", ImmutableList.of(row(0, 0)), output); @@ -120,12 +120,14 @@ public void testRewriteDataFilesWithSortStrategy() { List expectedRecords = currentData(); // set sort_order = c1 DESC LAST - List output = sql( - "CALL %s.system.rewrite_data_files(table => '%s', " + - "strategy => 'sort', sort_order => 'c1 DESC NULLS LAST')", - catalogName, tableIdent); - - assertEquals("Action should rewrite 10 data files and add 1 data files", + List output = + sql( + "CALL %s.system.rewrite_data_files(table => '%s', " + + "strategy => 'sort', sort_order => 'c1 DESC NULLS LAST')", + catalogName, tableIdent); + + assertEquals( + "Action should rewrite 10 data files and add 1 data files", ImmutableList.of(row(10, 1)), output); @@ -141,11 +143,14 @@ public void testRewriteDataFilesWithFilter() { List expectedRecords = currentData(); // select only 5 files for compaction (files that may have c1 = 1) - List output = sql( - "CALL %s.system.rewrite_data_files(table => '%s'," + - " where => 'c1 = 1 and c2 is not null')", catalogName, tableIdent); - - assertEquals("Action should rewrite 5 data files (containing c1 = 1) and add 1 data files", + List output = + sql( + "CALL %s.system.rewrite_data_files(table => '%s'," + + " where => 'c1 = 1 and c2 is not null')", + catalogName, tableIdent); + + assertEquals( + "Action should rewrite 5 data files (containing c1 = 1) and add 1 data files", ImmutableList.of(row(5, 1)), output); @@ -161,12 +166,14 @@ public void testRewriteDataFilesWithFilterOnPartitionTable() { List expectedRecords = currentData(); // select only 5 files for compaction (files in the partition c2 = 'bar') - List output = sql( - "CALL %s.system.rewrite_data_files(table => '%s'," + - " where => 'c2 = \"bar\"')", catalogName, tableIdent); - - assertEquals("Action should rewrite 5 data files from single matching partition" + - "(containing c2 = bar) and add 1 data files", + List output = + sql( + "CALL %s.system.rewrite_data_files(table => '%s'," + " where => 'c2 = \"bar\"')", + catalogName, tableIdent); + + assertEquals( + "Action should rewrite 5 data files from single matching partition" + + "(containing c2 = bar) and add 1 data files", ImmutableList.of(row(5, 1)), output); @@ -182,12 +189,14 @@ public void testRewriteDataFilesWithInFilterOnPartitionTable() { List expectedRecords = currentData(); // select only 5 files for compaction (files in the partition c2 in ('bar')) - List output = sql( - "CALL %s.system.rewrite_data_files(table => '%s'," + - " where => 'c2 in (\"bar\")')", catalogName, tableIdent); - - assertEquals("Action should rewrite 5 data files from single matching partition" + - "(containing c2 = bar) and add 1 data files", + List output = + sql( + "CALL %s.system.rewrite_data_files(table => '%s'," + " where => 'c2 in (\"bar\")')", + catalogName, tableIdent); + + assertEquals( + "Action should rewrite 5 data files from single matching partition" + + "(containing c2 = bar) and add 1 data files", ImmutableList.of(row(5, 1)), output); @@ -205,43 +214,56 @@ public void testRewriteDataFilesWithAllPossibleFilters() { // So that parsing can be tested on a same dataset 
without actually compacting the files. // EqualTo - sql("CALL %s.system.rewrite_data_files(table => '%s'," + - " where => 'c1 = 3')", catalogName, tableIdent); + sql( + "CALL %s.system.rewrite_data_files(table => '%s'," + " where => 'c1 = 3')", + catalogName, tableIdent); // GreaterThan - sql("CALL %s.system.rewrite_data_files(table => '%s'," + - " where => 'c1 > 3')", catalogName, tableIdent); + sql( + "CALL %s.system.rewrite_data_files(table => '%s'," + " where => 'c1 > 3')", + catalogName, tableIdent); // GreaterThanOrEqual - sql("CALL %s.system.rewrite_data_files(table => '%s'," + - " where => 'c1 >= 3')", catalogName, tableIdent); + sql( + "CALL %s.system.rewrite_data_files(table => '%s'," + " where => 'c1 >= 3')", + catalogName, tableIdent); // LessThan - sql("CALL %s.system.rewrite_data_files(table => '%s'," + - " where => 'c1 < 0')", catalogName, tableIdent); + sql( + "CALL %s.system.rewrite_data_files(table => '%s'," + " where => 'c1 < 0')", + catalogName, tableIdent); // LessThanOrEqual - sql("CALL %s.system.rewrite_data_files(table => '%s'," + - " where => 'c1 <= 0')", catalogName, tableIdent); + sql( + "CALL %s.system.rewrite_data_files(table => '%s'," + " where => 'c1 <= 0')", + catalogName, tableIdent); // In - sql("CALL %s.system.rewrite_data_files(table => '%s'," + - " where => 'c1 in (3,4,5)')", catalogName, tableIdent); + sql( + "CALL %s.system.rewrite_data_files(table => '%s'," + " where => 'c1 in (3,4,5)')", + catalogName, tableIdent); // IsNull - sql("CALL %s.system.rewrite_data_files(table => '%s'," + - " where => 'c1 is null')", catalogName, tableIdent); + sql( + "CALL %s.system.rewrite_data_files(table => '%s'," + " where => 'c1 is null')", + catalogName, tableIdent); // IsNotNull - sql("CALL %s.system.rewrite_data_files(table => '%s'," + - " where => 'c3 is not null')", catalogName, tableIdent); + sql( + "CALL %s.system.rewrite_data_files(table => '%s'," + " where => 'c3 is not null')", + catalogName, tableIdent); // And - sql("CALL %s.system.rewrite_data_files(table => '%s'," + - " where => 'c1 = 3 and c2 = \"bar\"')", catalogName, tableIdent); + sql( + "CALL %s.system.rewrite_data_files(table => '%s'," + " where => 'c1 = 3 and c2 = \"bar\"')", + catalogName, tableIdent); // Or - sql("CALL %s.system.rewrite_data_files(table => '%s'," + - " where => 'c1 = 3 or c1 = 5')", catalogName, tableIdent); + sql( + "CALL %s.system.rewrite_data_files(table => '%s'," + " where => 'c1 = 3 or c1 = 5')", + catalogName, tableIdent); // Not - sql("CALL %s.system.rewrite_data_files(table => '%s'," + - " where => 'c1 not in (1,2)')", catalogName, tableIdent); + sql( + "CALL %s.system.rewrite_data_files(table => '%s'," + " where => 'c1 not in (1,2)')", + catalogName, tableIdent); // StringStartsWith - sql("CALL %s.system.rewrite_data_files(table => '%s'," + - " where => 'c2 like \"%s\"')", catalogName, tableIdent, "car%"); + sql( + "CALL %s.system.rewrite_data_files(table => '%s'," + " where => 'c2 like \"%s\"')", + catalogName, tableIdent, "car%"); - // TODO: Enable when org.apache.iceberg.spark.SparkFilters have implementations for StringEndsWith & StringContains + // TODO: Enable when org.apache.iceberg.spark.SparkFilters have implementations for + // StringEndsWith & StringContains // StringEndsWith // sql("CALL %s.system.rewrite_data_files(table => '%s'," + // " where => 'c2 like \"%s\"')", catalogName, tableIdent, "%car"); @@ -257,63 +279,102 @@ public void testRewriteDataFilesWithInvalidInputs() { insertData(2); // Test for invalid strategy - AssertHelpers.assertThrows("Should 
reject calls with unsupported strategy error message", - IllegalArgumentException.class, "unsupported strategy: temp. Only binpack,sort is supported", - () -> sql("CALL %s.system.rewrite_data_files(table => '%s', options => map('min-input-files','2'), " + - "strategy => 'temp')", catalogName, tableIdent)); + AssertHelpers.assertThrows( + "Should reject calls with unsupported strategy error message", + IllegalArgumentException.class, + "unsupported strategy: temp. Only binpack,sort is supported", + () -> + sql( + "CALL %s.system.rewrite_data_files(table => '%s', options => map('min-input-files','2'), " + + "strategy => 'temp')", + catalogName, tableIdent)); // Test for sort_order with binpack strategy - AssertHelpers.assertThrows("Should reject calls with error message", - IllegalArgumentException.class, "Cannot set strategy to sort, it has already been set", - () -> sql("CALL %s.system.rewrite_data_files(table => '%s', strategy => 'binpack', " + - "sort_order => 'c1 ASC NULLS FIRST')", catalogName, tableIdent)); + AssertHelpers.assertThrows( + "Should reject calls with error message", + IllegalArgumentException.class, + "Cannot set strategy to sort, it has already been set", + () -> + sql( + "CALL %s.system.rewrite_data_files(table => '%s', strategy => 'binpack', " + + "sort_order => 'c1 ASC NULLS FIRST')", + catalogName, tableIdent)); // Test for sort_order with invalid null order - AssertHelpers.assertThrows("Should reject calls with error message", - IllegalArgumentException.class, "Unable to parse sortOrder:", - () -> sql("CALL %s.system.rewrite_data_files(table => '%s', strategy => 'sort', " + - "sort_order => 'c1 ASC none')", catalogName, tableIdent)); + AssertHelpers.assertThrows( + "Should reject calls with error message", + IllegalArgumentException.class, + "Unable to parse sortOrder:", + () -> + sql( + "CALL %s.system.rewrite_data_files(table => '%s', strategy => 'sort', " + + "sort_order => 'c1 ASC none')", + catalogName, tableIdent)); // Test for sort_order with invalid sort direction - AssertHelpers.assertThrows("Should reject calls with error message", - IllegalArgumentException.class, "Unable to parse sortOrder:", - () -> sql("CALL %s.system.rewrite_data_files(table => '%s', strategy => 'sort', " + - "sort_order => 'c1 none NULLS FIRST')", catalogName, tableIdent)); + AssertHelpers.assertThrows( + "Should reject calls with error message", + IllegalArgumentException.class, + "Unable to parse sortOrder:", + () -> + sql( + "CALL %s.system.rewrite_data_files(table => '%s', strategy => 'sort', " + + "sort_order => 'c1 none NULLS FIRST')", + catalogName, tableIdent)); // Test for sort_order with invalid column name - AssertHelpers.assertThrows("Should reject calls with error message", - ValidationException.class, "Cannot find field 'col1' in struct:" + - " struct<1: c1: optional int, 2: c2: optional string, 3: c3: optional string>", - () -> sql("CALL %s.system.rewrite_data_files(table => '%s', strategy => 'sort', " + - "sort_order => 'col1 DESC NULLS FIRST')", catalogName, tableIdent)); + AssertHelpers.assertThrows( + "Should reject calls with error message", + ValidationException.class, + "Cannot find field 'col1' in struct:" + + " struct<1: c1: optional int, 2: c2: optional string, 3: c3: optional string>", + () -> + sql( + "CALL %s.system.rewrite_data_files(table => '%s', strategy => 'sort', " + + "sort_order => 'col1 DESC NULLS FIRST')", + catalogName, tableIdent)); // Test for sort_order with invalid filter column col1 - AssertHelpers.assertThrows("Should reject calls with 
error message", - IllegalArgumentException.class, "Cannot parse predicates in where option: col1 = 3", - () -> sql("CALL %s.system.rewrite_data_files(table => '%s', " + - "where => 'col1 = 3')", catalogName, tableIdent)); + AssertHelpers.assertThrows( + "Should reject calls with error message", + IllegalArgumentException.class, + "Cannot parse predicates in where option: col1 = 3", + () -> + sql( + "CALL %s.system.rewrite_data_files(table => '%s', " + "where => 'col1 = 3')", + catalogName, tableIdent)); } @Test public void testInvalidCasesForRewriteDataFiles() { - AssertHelpers.assertThrows("Should not allow mixed args", - AnalysisException.class, "Named and positional arguments cannot be mixed", + AssertHelpers.assertThrows( + "Should not allow mixed args", + AnalysisException.class, + "Named and positional arguments cannot be mixed", () -> sql("CALL %s.system.rewrite_data_files('n', table => 't')", catalogName)); - AssertHelpers.assertThrows("Should not resolve procedures in arbitrary namespaces", - NoSuchProcedureException.class, "not found", + AssertHelpers.assertThrows( + "Should not resolve procedures in arbitrary namespaces", + NoSuchProcedureException.class, + "not found", () -> sql("CALL %s.custom.rewrite_data_files('n', 't')", catalogName)); - AssertHelpers.assertThrows("Should reject calls without all required args", - AnalysisException.class, "Missing required parameters", + AssertHelpers.assertThrows( + "Should reject calls without all required args", + AnalysisException.class, + "Missing required parameters", () -> sql("CALL %s.system.rewrite_data_files()", catalogName)); - AssertHelpers.assertThrows("Should reject duplicate arg names name", - AnalysisException.class, "Duplicate procedure argument: table", + AssertHelpers.assertThrows( + "Should reject duplicate arg names name", + AnalysisException.class, + "Duplicate procedure argument: table", () -> sql("CALL %s.system.rewrite_data_files(table => 't', table => 't')", catalogName)); - AssertHelpers.assertThrows("Should reject calls with empty table identifier", - IllegalArgumentException.class, "Cannot handle an empty identifier", + AssertHelpers.assertThrows( + "Should reject calls with empty table identifier", + IllegalArgumentException.class, + "Cannot handle an empty identifier", () -> sql("CALL %s.system.rewrite_data_files('')", catalogName)); } @@ -322,7 +383,9 @@ private void createTable() { } private void createPartitionTable() { - sql("CREATE TABLE %s (c1 int, c2 string, c3 string) USING iceberg PARTITIONED BY (c2)", tableName); + sql( + "CREATE TABLE %s (c1 int, c2 string, c3 string) USING iceberg PARTITIONED BY (c2)", + tableName); } private void insertData(int filesCount) { @@ -330,12 +393,15 @@ private void insertData(int filesCount) { ThreeColumnRecord record2 = new ThreeColumnRecord(2, "bar", null); List records = Lists.newArrayList(); - IntStream.range(0, filesCount / 2).forEach(i -> { - records.add(record1); - records.add(record2); - }); - - Dataset df = spark.createDataFrame(records, ThreeColumnRecord.class).repartition(filesCount); + IntStream.range(0, filesCount / 2) + .forEach( + i -> { + records.add(record1); + records.add(record2); + }); + + Dataset df = + spark.createDataFrame(records, ThreeColumnRecord.class).repartition(filesCount); try { df.writeTo(tableName).append(); } catch (org.apache.spark.sql.catalyst.analysis.NoSuchTableException e) { @@ -344,6 +410,7 @@ private void insertData(int filesCount) { } private List currentData() { - return rowsToJava(spark.sql("SELECT * FROM " + tableName + " 
order by c1, c2, c3").collectAsList()); + return rowsToJava( + spark.sql("SELECT * FROM " + tableName + " order by c1, c2, c3").collectAsList()); } } diff --git a/spark/v3.0/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestRewriteManifestsProcedure.java b/spark/v3.0/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestRewriteManifestsProcedure.java index dcf0a2d91e3e..7c5ec1f5cf3f 100644 --- a/spark/v3.0/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestRewriteManifestsProcedure.java +++ b/spark/v3.0/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestRewriteManifestsProcedure.java @@ -16,9 +16,10 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.extensions; +import static org.apache.iceberg.TableProperties.SNAPSHOT_ID_INHERITANCE_ENABLED; + import java.util.List; import java.util.Map; import org.apache.iceberg.AssertHelpers; @@ -30,11 +31,10 @@ import org.junit.Assert; import org.junit.Test; -import static org.apache.iceberg.TableProperties.SNAPSHOT_ID_INHERITANCE_ENABLED; - public class TestRewriteManifestsProcedure extends SparkExtensionsTestBase { - public TestRewriteManifestsProcedure(String catalogName, String implementation, Map config) { + public TestRewriteManifestsProcedure( + String catalogName, String implementation, Map config) { super(catalogName, implementation, config); } @@ -46,40 +46,42 @@ public void removeTable() { @Test public void testRewriteManifestsInEmptyTable() { sql("CREATE TABLE %s (id bigint NOT NULL, data string) USING iceberg", tableName); - List output = sql( - "CALL %s.system.rewrite_manifests('%s')", catalogName, tableIdent); - assertEquals("Procedure output must match", - ImmutableList.of(row(0, 0)), - output); + List output = sql("CALL %s.system.rewrite_manifests('%s')", catalogName, tableIdent); + assertEquals("Procedure output must match", ImmutableList.of(row(0, 0)), output); } @Test public void testRewriteLargeManifests() { - sql("CREATE TABLE %s (id bigint NOT NULL, data string) USING iceberg PARTITIONED BY (data)", tableName); + sql( + "CREATE TABLE %s (id bigint NOT NULL, data string) USING iceberg PARTITIONED BY (data)", + tableName); sql("INSERT INTO TABLE %s VALUES (1, 'a'), (2, 'b'), (3, 'c'), (4, 'd')", tableName); Table table = validationCatalog.loadTable(tableIdent); - Assert.assertEquals("Must have 1 manifest", 1, table.currentSnapshot().allManifests(table.io()).size()); + Assert.assertEquals( + "Must have 1 manifest", 1, table.currentSnapshot().allManifests(table.io()).size()); sql("ALTER TABLE %s SET TBLPROPERTIES ('commit.manifest.target-size-bytes' '1')", tableName); - List output = sql( - "CALL %s.system.rewrite_manifests('%s')", catalogName, tableIdent); - assertEquals("Procedure output must match", - ImmutableList.of(row(1, 4)), - output); + List output = sql("CALL %s.system.rewrite_manifests('%s')", catalogName, tableIdent); + assertEquals("Procedure output must match", ImmutableList.of(row(1, 4)), output); table.refresh(); - Assert.assertEquals("Must have 4 manifests", 4, table.currentSnapshot().allManifests(table.io()).size()); + Assert.assertEquals( + "Must have 4 manifests", 4, table.currentSnapshot().allManifests(table.io()).size()); } @Test public void testRewriteSmallManifestsWithSnapshotIdInheritance() { - sql("CREATE TABLE %s (id bigint NOT NULL, data string) USING iceberg PARTITIONED BY (data)", tableName); + sql( + "CREATE TABLE %s (id bigint NOT NULL, data string) USING 
iceberg PARTITIONED BY (data)", + tableName); - sql("ALTER TABLE %s SET TBLPROPERTIES ('%s' '%s')", tableName, SNAPSHOT_ID_INHERITANCE_ENABLED, "true"); + sql( + "ALTER TABLE %s SET TBLPROPERTIES ('%s' '%s')", + tableName, SNAPSHOT_ID_INHERITANCE_ENABLED, "true"); sql("INSERT INTO TABLE %s VALUES (1, 'a')", tableName); sql("INSERT INTO TABLE %s VALUES (2, 'b')", tableName); @@ -88,87 +90,107 @@ public void testRewriteSmallManifestsWithSnapshotIdInheritance() { Table table = validationCatalog.loadTable(tableIdent); - Assert.assertEquals("Must have 4 manifest", 4, table.currentSnapshot().allManifests(table.io()).size()); + Assert.assertEquals( + "Must have 4 manifest", 4, table.currentSnapshot().allManifests(table.io()).size()); - List output = sql( - "CALL %s.system.rewrite_manifests(table => '%s')", catalogName, tableIdent); - assertEquals("Procedure output must match", - ImmutableList.of(row(4, 1)), - output); + List output = + sql("CALL %s.system.rewrite_manifests(table => '%s')", catalogName, tableIdent); + assertEquals("Procedure output must match", ImmutableList.of(row(4, 1)), output); table.refresh(); - Assert.assertEquals("Must have 1 manifests", 1, table.currentSnapshot().allManifests(table.io()).size()); + Assert.assertEquals( + "Must have 1 manifests", 1, table.currentSnapshot().allManifests(table.io()).size()); } @Test public void testRewriteSmallManifestsWithoutCaching() { - sql("CREATE TABLE %s (id bigint NOT NULL, data string) USING iceberg PARTITIONED BY (data)", tableName); + sql( + "CREATE TABLE %s (id bigint NOT NULL, data string) USING iceberg PARTITIONED BY (data)", + tableName); sql("INSERT INTO TABLE %s VALUES (1, 'a')", tableName); sql("INSERT INTO TABLE %s VALUES (2, 'b')", tableName); Table table = validationCatalog.loadTable(tableIdent); - Assert.assertEquals("Must have 2 manifest", 2, table.currentSnapshot().allManifests(table.io()).size()); + Assert.assertEquals( + "Must have 2 manifest", 2, table.currentSnapshot().allManifests(table.io()).size()); - List output = sql( - "CALL %s.system.rewrite_manifests(use_caching => false, table => '%s')", catalogName, tableIdent); - assertEquals("Procedure output must match", - ImmutableList.of(row(2, 1)), - output); + List output = + sql( + "CALL %s.system.rewrite_manifests(use_caching => false, table => '%s')", + catalogName, tableIdent); + assertEquals("Procedure output must match", ImmutableList.of(row(2, 1)), output); table.refresh(); - Assert.assertEquals("Must have 1 manifests", 1, table.currentSnapshot().allManifests(table.io()).size()); + Assert.assertEquals( + "Must have 1 manifests", 1, table.currentSnapshot().allManifests(table.io()).size()); } @Test public void testRewriteManifestsCaseInsensitiveArgs() { - sql("CREATE TABLE %s (id bigint NOT NULL, data string) USING iceberg PARTITIONED BY (data)", tableName); + sql( + "CREATE TABLE %s (id bigint NOT NULL, data string) USING iceberg PARTITIONED BY (data)", + tableName); sql("INSERT INTO TABLE %s VALUES (1, 'a')", tableName); sql("INSERT INTO TABLE %s VALUES (2, 'b')", tableName); Table table = validationCatalog.loadTable(tableIdent); - Assert.assertEquals("Must have 2 manifest", 2, table.currentSnapshot().allManifests(table.io()).size()); + Assert.assertEquals( + "Must have 2 manifest", 2, table.currentSnapshot().allManifests(table.io()).size()); - List output = sql( - "CALL %s.system.rewrite_manifests(usE_cAcHiNg => false, tAbLe => '%s')", catalogName, tableIdent); - assertEquals("Procedure output must match", - ImmutableList.of(row(2, 1)), - output); + List 
output = + sql( + "CALL %s.system.rewrite_manifests(usE_cAcHiNg => false, tAbLe => '%s')", + catalogName, tableIdent); + assertEquals("Procedure output must match", ImmutableList.of(row(2, 1)), output); table.refresh(); - Assert.assertEquals("Must have 1 manifests", 1, table.currentSnapshot().allManifests(table.io()).size()); + Assert.assertEquals( + "Must have 1 manifests", 1, table.currentSnapshot().allManifests(table.io()).size()); } @Test public void testInvalidRewriteManifestsCases() { - AssertHelpers.assertThrows("Should not allow mixed args", - AnalysisException.class, "Named and positional arguments cannot be mixed", + AssertHelpers.assertThrows( + "Should not allow mixed args", + AnalysisException.class, + "Named and positional arguments cannot be mixed", () -> sql("CALL %s.system.rewrite_manifests('n', table => 't')", catalogName)); - AssertHelpers.assertThrows("Should not resolve procedures in arbitrary namespaces", - NoSuchProcedureException.class, "not found", + AssertHelpers.assertThrows( + "Should not resolve procedures in arbitrary namespaces", + NoSuchProcedureException.class, + "not found", () -> sql("CALL %s.custom.rewrite_manifests('n', 't')", catalogName)); - AssertHelpers.assertThrows("Should reject calls without all required args", - AnalysisException.class, "Missing required parameters", + AssertHelpers.assertThrows( + "Should reject calls without all required args", + AnalysisException.class, + "Missing required parameters", () -> sql("CALL %s.system.rewrite_manifests()", catalogName)); - AssertHelpers.assertThrows("Should reject calls with invalid arg types", - AnalysisException.class, "Wrong arg type", + AssertHelpers.assertThrows( + "Should reject calls with invalid arg types", + AnalysisException.class, + "Wrong arg type", () -> sql("CALL %s.system.rewrite_manifests('n', 2.2)", catalogName)); - AssertHelpers.assertThrows("Should reject duplicate arg names name", - AnalysisException.class, "Duplicate procedure argument: table", + AssertHelpers.assertThrows( + "Should reject duplicate arg names name", + AnalysisException.class, + "Duplicate procedure argument: table", () -> sql("CALL %s.system.rewrite_manifests(table => 't', tAbLe => 't')", catalogName)); - AssertHelpers.assertThrows("Should reject calls with empty table identifier", - IllegalArgumentException.class, "Cannot handle an empty identifier", + AssertHelpers.assertThrows( + "Should reject calls with empty table identifier", + IllegalArgumentException.class, + "Cannot handle an empty identifier", () -> sql("CALL %s.system.rewrite_manifests('')", catalogName)); } } diff --git a/spark/v3.0/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestRollbackToSnapshotProcedure.java b/spark/v3.0/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestRollbackToSnapshotProcedure.java index d3e6bdcbc285..af94b456d02e 100644 --- a/spark/v3.0/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestRollbackToSnapshotProcedure.java +++ b/spark/v3.0/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestRollbackToSnapshotProcedure.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.spark.extensions; import java.util.List; @@ -36,7 +35,8 @@ public class TestRollbackToSnapshotProcedure extends SparkExtensionsTestBase { - public TestRollbackToSnapshotProcedure(String catalogName, String implementation, Map config) { + public TestRollbackToSnapshotProcedure( + String catalogName, String implementation, Map config) { super(catalogName, implementation, config); } @@ -55,7 +55,8 @@ public void testRollbackToSnapshotUsingPositionalArgs() { sql("INSERT INTO TABLE %s VALUES (1, 'a')", tableName); - assertEquals("Should have expected rows", + assertEquals( + "Should have expected rows", ImmutableList.of(row(1L, "a"), row(1L, "a")), sql("SELECT * FROM %s ORDER BY id", tableName)); @@ -63,15 +64,18 @@ public void testRollbackToSnapshotUsingPositionalArgs() { Snapshot secondSnapshot = table.currentSnapshot(); - List output = sql( - "CALL %s.system.rollback_to_snapshot('%s', %dL)", - catalogName, tableIdent, firstSnapshot.snapshotId()); + List output = + sql( + "CALL %s.system.rollback_to_snapshot('%s', %dL)", + catalogName, tableIdent, firstSnapshot.snapshotId()); - assertEquals("Procedure output must match", + assertEquals( + "Procedure output must match", ImmutableList.of(row(secondSnapshot.snapshotId(), firstSnapshot.snapshotId())), output); - assertEquals("Rollback must be successful", + assertEquals( + "Rollback must be successful", ImmutableList.of(row(1L, "a")), sql("SELECT * FROM %s ORDER BY id", tableName)); } @@ -86,7 +90,8 @@ public void testRollbackToSnapshotUsingNamedArgs() { sql("INSERT INTO TABLE %s VALUES (1, 'a')", tableName); - assertEquals("Should have expected rows", + assertEquals( + "Should have expected rows", ImmutableList.of(row(1L, "a"), row(1L, "a")), sql("SELECT * FROM %s ORDER BY id", tableName)); @@ -94,15 +99,18 @@ public void testRollbackToSnapshotUsingNamedArgs() { Snapshot secondSnapshot = table.currentSnapshot(); - List output = sql( - "CALL %s.system.rollback_to_snapshot(snapshot_id => %dL, table => '%s')", - catalogName, firstSnapshot.snapshotId(), tableIdent); + List output = + sql( + "CALL %s.system.rollback_to_snapshot(snapshot_id => %dL, table => '%s')", + catalogName, firstSnapshot.snapshotId(), tableIdent); - assertEquals("Procedure output must match", + assertEquals( + "Procedure output must match", ImmutableList.of(row(secondSnapshot.snapshotId(), firstSnapshot.snapshotId())), output); - assertEquals("Rollback must be successful", + assertEquals( + "Rollback must be successful", ImmutableList.of(row(1L, "a")), sql("SELECT * FROM %s ORDER BY id", tableName)); } @@ -126,21 +134,23 @@ public void testRollbackToSnapshotRefreshesRelationCache() { spark.sql("CACHE TABLE tmp"); - assertEquals("View should have expected rows", + assertEquals( + "View should have expected rows", ImmutableList.of(row(1L, "a"), row(1L, "a")), sql("SELECT * FROM tmp")); - List output = sql( - "CALL %s.system.rollback_to_snapshot(table => '%s', snapshot_id => %dL)", - catalogName, tableIdent, firstSnapshot.snapshotId()); + List output = + sql( + "CALL %s.system.rollback_to_snapshot(table => '%s', snapshot_id => %dL)", + catalogName, tableIdent, firstSnapshot.snapshotId()); - assertEquals("Procedure output must match", + assertEquals( + "Procedure output must match", ImmutableList.of(row(secondSnapshot.snapshotId(), firstSnapshot.snapshotId())), output); - assertEquals("View cache must be invalidated", - ImmutableList.of(row(1L, "a")), - sql("SELECT * FROM tmp")); + assertEquals( + "View cache must be invalidated", 
ImmutableList.of(row(1L, "a")), sql("SELECT * FROM tmp")); sql("UNCACHE TABLE tmp"); } @@ -155,7 +165,8 @@ public void testRollbackToSnapshotWithQuotedIdentifiers() { sql("INSERT INTO TABLE %s VALUES (1, 'a')", tableName); - assertEquals("Should have expected rows", + assertEquals( + "Should have expected rows", ImmutableList.of(row(1L, "a"), row(1L, "a")), sql("SELECT * FROM %s ORDER BY id", tableName)); @@ -171,15 +182,20 @@ public void testRollbackToSnapshotWithQuotedIdentifiers() { } String quotedNamespace = quotedNamespaceBuilder.toString(); - List output = sql( - "CALL %s.system.rollback_to_snapshot('%s', %d)", - catalogName, quotedNamespace + ".`" + tableIdent.name() + "`", firstSnapshot.snapshotId()); + List output = + sql( + "CALL %s.system.rollback_to_snapshot('%s', %d)", + catalogName, + quotedNamespace + ".`" + tableIdent.name() + "`", + firstSnapshot.snapshotId()); - assertEquals("Procedure output must match", + assertEquals( + "Procedure output must match", ImmutableList.of(row(secondSnapshot.snapshotId(), firstSnapshot.snapshotId())), output); - assertEquals("Rollback must be successful", + assertEquals( + "Rollback must be successful", ImmutableList.of(row(1L, "a")), sql("SELECT * FROM %s ORDER BY id", tableName)); } @@ -196,7 +212,8 @@ public void testRollbackToSnapshotWithoutExplicitCatalog() { sql("INSERT INTO TABLE %s VALUES (1, 'a')", tableName); - assertEquals("Should have expected rows", + assertEquals( + "Should have expected rows", ImmutableList.of(row(1L, "a"), row(1L, "a")), sql("SELECT * FROM %s ORDER BY id", tableName)); @@ -205,15 +222,16 @@ public void testRollbackToSnapshotWithoutExplicitCatalog() { Snapshot secondSnapshot = table.currentSnapshot(); // use camel case intentionally to test case sensitivity - List output = sql( - "CALL SyStEm.rOLlBaCk_to_SnApShOt('%s', %dL)", - tableIdent, firstSnapshot.snapshotId()); + List output = + sql("CALL SyStEm.rOLlBaCk_to_SnApShOt('%s', %dL)", tableIdent, firstSnapshot.snapshotId()); - assertEquals("Procedure output must match", + assertEquals( + "Procedure output must match", ImmutableList.of(row(secondSnapshot.snapshotId(), firstSnapshot.snapshotId())), output); - assertEquals("Rollback must be successful", + assertEquals( + "Rollback must be successful", ImmutableList.of(row(1L, "a")), sql("SELECT * FROM %s ORDER BY id", tableName)); } @@ -222,39 +240,58 @@ public void testRollbackToSnapshotWithoutExplicitCatalog() { public void testRollbackToInvalidSnapshot() { sql("CREATE TABLE %s (id bigint NOT NULL, data string) USING iceberg", tableName); - AssertHelpers.assertThrows("Should reject invalid snapshot id", - ValidationException.class, "Cannot roll back to unknown snapshot id", + AssertHelpers.assertThrows( + "Should reject invalid snapshot id", + ValidationException.class, + "Cannot roll back to unknown snapshot id", () -> sql("CALL %s.system.rollback_to_snapshot('%s', -1L)", catalogName, tableIdent)); } @Test public void testInvalidRollbackToSnapshotCases() { - AssertHelpers.assertThrows("Should not allow mixed args", - AnalysisException.class, "Named and positional arguments cannot be mixed", - () -> sql("CALL %s.system.rollback_to_snapshot(namespace => 'n1', table => 't', 1L)", catalogName)); - - AssertHelpers.assertThrows("Should not resolve procedures in arbitrary namespaces", - NoSuchProcedureException.class, "not found", + AssertHelpers.assertThrows( + "Should not allow mixed args", + AnalysisException.class, + "Named and positional arguments cannot be mixed", + () -> + sql( + "CALL 
%s.system.rollback_to_snapshot(namespace => 'n1', table => 't', 1L)", + catalogName)); + + AssertHelpers.assertThrows( + "Should not resolve procedures in arbitrary namespaces", + NoSuchProcedureException.class, + "not found", () -> sql("CALL %s.custom.rollback_to_snapshot('n', 't', 1L)", catalogName)); - AssertHelpers.assertThrows("Should reject calls without all required args", - AnalysisException.class, "Missing required parameters", + AssertHelpers.assertThrows( + "Should reject calls without all required args", + AnalysisException.class, + "Missing required parameters", () -> sql("CALL %s.system.rollback_to_snapshot('t')", catalogName)); - AssertHelpers.assertThrows("Should reject calls without all required args", - AnalysisException.class, "Missing required parameters", + AssertHelpers.assertThrows( + "Should reject calls without all required args", + AnalysisException.class, + "Missing required parameters", () -> sql("CALL %s.system.rollback_to_snapshot(1L)", catalogName)); - AssertHelpers.assertThrows("Should reject calls without all required args", - AnalysisException.class, "Missing required parameters", + AssertHelpers.assertThrows( + "Should reject calls without all required args", + AnalysisException.class, + "Missing required parameters", () -> sql("CALL %s.system.rollback_to_snapshot(table => 't')", catalogName)); - AssertHelpers.assertThrows("Should reject calls with invalid arg types", - AnalysisException.class, "Wrong arg type for snapshot_id: cannot cast", + AssertHelpers.assertThrows( + "Should reject calls with invalid arg types", + AnalysisException.class, + "Wrong arg type for snapshot_id: cannot cast", () -> sql("CALL %s.system.rollback_to_snapshot('t', 2.2)", catalogName)); - AssertHelpers.assertThrows("Should reject calls with empty table identifier", - IllegalArgumentException.class, "Cannot handle an empty identifier", + AssertHelpers.assertThrows( + "Should reject calls with empty table identifier", + IllegalArgumentException.class, + "Cannot handle an empty identifier", () -> sql("CALL %s.system.rollback_to_snapshot('', 1L)", catalogName)); } } diff --git a/spark/v3.0/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestRollbackToTimestampProcedure.java b/spark/v3.0/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestRollbackToTimestampProcedure.java index 52fc12c7d01e..6da3853bbe24 100644 --- a/spark/v3.0/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestRollbackToTimestampProcedure.java +++ b/spark/v3.0/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestRollbackToTimestampProcedure.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.spark.extensions; import java.time.LocalDateTime; @@ -36,7 +35,8 @@ public class TestRollbackToTimestampProcedure extends SparkExtensionsTestBase { - public TestRollbackToTimestampProcedure(String catalogName, String implementation, Map config) { + public TestRollbackToTimestampProcedure( + String catalogName, String implementation, Map config) { super(catalogName, implementation, config); } @@ -58,7 +58,8 @@ public void testRollbackToTimestampUsingPositionalArgs() { sql("INSERT INTO TABLE %s VALUES (1, 'a')", tableName); - assertEquals("Should have expected rows", + assertEquals( + "Should have expected rows", ImmutableList.of(row(1L, "a"), row(1L, "a")), sql("SELECT * FROM %s ORDER BY id", tableName)); @@ -66,15 +67,18 @@ public void testRollbackToTimestampUsingPositionalArgs() { Snapshot secondSnapshot = table.currentSnapshot(); - List output = sql( - "CALL %s.system.rollback_to_timestamp('%s',TIMESTAMP '%s')", - catalogName, tableIdent, firstSnapshotTimestamp); + List output = + sql( + "CALL %s.system.rollback_to_timestamp('%s',TIMESTAMP '%s')", + catalogName, tableIdent, firstSnapshotTimestamp); - assertEquals("Procedure output must match", + assertEquals( + "Procedure output must match", ImmutableList.of(row(secondSnapshot.snapshotId(), firstSnapshot.snapshotId())), output); - assertEquals("Rollback must be successful", + assertEquals( + "Rollback must be successful", ImmutableList.of(row(1L, "a")), sql("SELECT * FROM %s ORDER BY id", tableName)); } @@ -92,7 +96,8 @@ public void testRollbackToTimestampUsingNamedArgs() { sql("INSERT INTO TABLE %s VALUES (1, 'a')", tableName); - assertEquals("Should have expected rows", + assertEquals( + "Should have expected rows", ImmutableList.of(row(1L, "a"), row(1L, "a")), sql("SELECT * FROM %s ORDER BY id", tableName)); @@ -100,15 +105,18 @@ public void testRollbackToTimestampUsingNamedArgs() { Snapshot secondSnapshot = table.currentSnapshot(); - List output = sql( - "CALL %s.system.rollback_to_timestamp(timestamp => TIMESTAMP '%s', table => '%s')", - catalogName, firstSnapshotTimestamp, tableIdent); + List output = + sql( + "CALL %s.system.rollback_to_timestamp(timestamp => TIMESTAMP '%s', table => '%s')", + catalogName, firstSnapshotTimestamp, tableIdent); - assertEquals("Procedure output must match", + assertEquals( + "Procedure output must match", ImmutableList.of(row(secondSnapshot.snapshotId(), firstSnapshot.snapshotId())), output); - assertEquals("Rollback must be successful", + assertEquals( + "Rollback must be successful", ImmutableList.of(row(1L, "a")), sql("SELECT * FROM %s ORDER BY id", tableName)); } @@ -135,21 +143,23 @@ public void testRollbackToTimestampRefreshesRelationCache() { spark.sql("CACHE TABLE tmp"); - assertEquals("View should have expected rows", + assertEquals( + "View should have expected rows", ImmutableList.of(row(1L, "a"), row(1L, "a")), sql("SELECT * FROM tmp")); - List output = sql( - "CALL %s.system.rollback_to_timestamp(table => '%s', timestamp => TIMESTAMP '%s')", - catalogName, tableIdent, firstSnapshotTimestamp); + List output = + sql( + "CALL %s.system.rollback_to_timestamp(table => '%s', timestamp => TIMESTAMP '%s')", + catalogName, tableIdent, firstSnapshotTimestamp); - assertEquals("Procedure output must match", + assertEquals( + "Procedure output must match", ImmutableList.of(row(secondSnapshot.snapshotId(), firstSnapshot.snapshotId())), output); - assertEquals("View cache must be invalidated", - ImmutableList.of(row(1L, "a")), - sql("SELECT * FROM tmp")); + assertEquals( 
+ "View cache must be invalidated", ImmutableList.of(row(1L, "a")), sql("SELECT * FROM tmp")); sql("UNCACHE TABLE tmp"); } @@ -167,7 +177,8 @@ public void testRollbackToTimestampWithQuotedIdentifiers() { sql("INSERT INTO TABLE %s VALUES (1, 'a')", tableName); - assertEquals("Should have expected rows", + assertEquals( + "Should have expected rows", ImmutableList.of(row(1L, "a"), row(1L, "a")), sql("SELECT * FROM %s ORDER BY id", tableName)); @@ -183,15 +194,18 @@ public void testRollbackToTimestampWithQuotedIdentifiers() { } String quotedNamespace = quotedNamespaceBuilder.toString(); - List output = sql( - "CALL %s.system.rollback_to_timestamp('%s', TIMESTAMP '%s')", - catalogName, quotedNamespace + ".`" + tableIdent.name() + "`", firstSnapshotTimestamp); + List output = + sql( + "CALL %s.system.rollback_to_timestamp('%s', TIMESTAMP '%s')", + catalogName, quotedNamespace + ".`" + tableIdent.name() + "`", firstSnapshotTimestamp); - assertEquals("Procedure output must match", + assertEquals( + "Procedure output must match", ImmutableList.of(row(secondSnapshot.snapshotId(), firstSnapshot.snapshotId())), output); - assertEquals("Rollback must be successful", + assertEquals( + "Rollback must be successful", ImmutableList.of(row(1L, "a")), sql("SELECT * FROM %s ORDER BY id", tableName)); } @@ -211,7 +225,8 @@ public void testRollbackToTimestampWithoutExplicitCatalog() { sql("INSERT INTO TABLE %s VALUES (1, 'a')", tableName); - assertEquals("Should have expected rows", + assertEquals( + "Should have expected rows", ImmutableList.of(row(1L, "a"), row(1L, "a")), sql("SELECT * FROM %s ORDER BY id", tableName)); @@ -220,15 +235,18 @@ public void testRollbackToTimestampWithoutExplicitCatalog() { Snapshot secondSnapshot = table.currentSnapshot(); // use camel case intentionally to test case sensitivity - List output = sql( - "CALL SyStEm.rOLlBaCk_to_TiMeStaMp('%s', TIMESTAMP '%s')", - tableIdent, firstSnapshotTimestamp); + List output = + sql( + "CALL SyStEm.rOLlBaCk_to_TiMeStaMp('%s', TIMESTAMP '%s')", + tableIdent, firstSnapshotTimestamp); - assertEquals("Procedure output must match", + assertEquals( + "Procedure output must match", ImmutableList.of(row(secondSnapshot.snapshotId(), firstSnapshot.snapshotId())), output); - assertEquals("Rollback must be successful", + assertEquals( + "Rollback must be successful", ImmutableList.of(row(1L, "a")), sql("SELECT * FROM %s ORDER BY id", tableName)); } @@ -237,32 +255,50 @@ public void testRollbackToTimestampWithoutExplicitCatalog() { public void testInvalidRollbackToTimestampCases() { String timestamp = "TIMESTAMP '2007-12-03T10:15:30'"; - AssertHelpers.assertThrows("Should not allow mixed args", - AnalysisException.class, "Named and positional arguments cannot be mixed", - () -> sql("CALL %s.system.rollback_to_timestamp(namespace => 'n1', 't', %s)", catalogName, timestamp)); - - AssertHelpers.assertThrows("Should not resolve procedures in arbitrary namespaces", - NoSuchProcedureException.class, "not found", + AssertHelpers.assertThrows( + "Should not allow mixed args", + AnalysisException.class, + "Named and positional arguments cannot be mixed", + () -> + sql( + "CALL %s.system.rollback_to_timestamp(namespace => 'n1', 't', %s)", + catalogName, timestamp)); + + AssertHelpers.assertThrows( + "Should not resolve procedures in arbitrary namespaces", + NoSuchProcedureException.class, + "not found", () -> sql("CALL %s.custom.rollback_to_timestamp('n', 't', %s)", catalogName, timestamp)); - AssertHelpers.assertThrows("Should reject calls without all required 
args", - AnalysisException.class, "Missing required parameters", + AssertHelpers.assertThrows( + "Should reject calls without all required args", + AnalysisException.class, + "Missing required parameters", () -> sql("CALL %s.system.rollback_to_timestamp('t')", catalogName)); - AssertHelpers.assertThrows("Should reject calls without all required args", - AnalysisException.class, "Missing required parameters", + AssertHelpers.assertThrows( + "Should reject calls without all required args", + AnalysisException.class, + "Missing required parameters", () -> sql("CALL %s.system.rollback_to_timestamp(timestamp => %s)", catalogName, timestamp)); - AssertHelpers.assertThrows("Should reject calls without all required args", - AnalysisException.class, "Missing required parameters", + AssertHelpers.assertThrows( + "Should reject calls without all required args", + AnalysisException.class, + "Missing required parameters", () -> sql("CALL %s.system.rollback_to_timestamp(table => 't')", catalogName)); - AssertHelpers.assertThrows("Should reject calls with extra args", - AnalysisException.class, "Too many arguments", - () -> sql("CALL %s.system.rollback_to_timestamp('n', 't', %s, 1L)", catalogName, timestamp)); - - AssertHelpers.assertThrows("Should reject calls with invalid arg types", - AnalysisException.class, "Wrong arg type for timestamp: cannot cast", + AssertHelpers.assertThrows( + "Should reject calls with extra args", + AnalysisException.class, + "Too many arguments", + () -> + sql("CALL %s.system.rollback_to_timestamp('n', 't', %s, 1L)", catalogName, timestamp)); + + AssertHelpers.assertThrows( + "Should reject calls with invalid arg types", + AnalysisException.class, + "Wrong arg type for timestamp: cannot cast", () -> sql("CALL %s.system.rollback_to_timestamp('t', 2.2)", catalogName)); } } diff --git a/spark/v3.0/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestSetCurrentSnapshotProcedure.java b/spark/v3.0/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestSetCurrentSnapshotProcedure.java index 0ea8c4861e8c..8a8a974bbebe 100644 --- a/spark/v3.0/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestSetCurrentSnapshotProcedure.java +++ b/spark/v3.0/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestSetCurrentSnapshotProcedure.java @@ -16,9 +16,10 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.spark.extensions; +import static org.apache.iceberg.TableProperties.WRITE_AUDIT_PUBLISH_ENABLED; + import java.util.List; import java.util.Map; import org.apache.iceberg.AssertHelpers; @@ -34,11 +35,10 @@ import org.junit.Assume; import org.junit.Test; -import static org.apache.iceberg.TableProperties.WRITE_AUDIT_PUBLISH_ENABLED; - public class TestSetCurrentSnapshotProcedure extends SparkExtensionsTestBase { - public TestSetCurrentSnapshotProcedure(String catalogName, String implementation, Map config) { + public TestSetCurrentSnapshotProcedure( + String catalogName, String implementation, Map config) { super(catalogName, implementation, config); } @@ -57,7 +57,8 @@ public void testSetCurrentSnapshotUsingPositionalArgs() { sql("INSERT INTO TABLE %s VALUES (1, 'a')", tableName); - assertEquals("Should have expected rows", + assertEquals( + "Should have expected rows", ImmutableList.of(row(1L, "a"), row(1L, "a")), sql("SELECT * FROM %s ORDER BY id", tableName)); @@ -65,15 +66,18 @@ public void testSetCurrentSnapshotUsingPositionalArgs() { Snapshot secondSnapshot = table.currentSnapshot(); - List output = sql( - "CALL %s.system.set_current_snapshot('%s', %dL)", - catalogName, tableIdent, firstSnapshot.snapshotId()); + List output = + sql( + "CALL %s.system.set_current_snapshot('%s', %dL)", + catalogName, tableIdent, firstSnapshot.snapshotId()); - assertEquals("Procedure output must match", + assertEquals( + "Procedure output must match", ImmutableList.of(row(secondSnapshot.snapshotId(), firstSnapshot.snapshotId())), output); - assertEquals("Set must be successful", + assertEquals( + "Set must be successful", ImmutableList.of(row(1L, "a")), sql("SELECT * FROM %s ORDER BY id", tableName)); } @@ -88,7 +92,8 @@ public void testSetCurrentSnapshotUsingNamedArgs() { sql("INSERT INTO TABLE %s VALUES (1, 'a')", tableName); - assertEquals("Should have expected rows", + assertEquals( + "Should have expected rows", ImmutableList.of(row(1L, "a"), row(1L, "a")), sql("SELECT * FROM %s ORDER BY id", tableName)); @@ -96,15 +101,18 @@ public void testSetCurrentSnapshotUsingNamedArgs() { Snapshot secondSnapshot = table.currentSnapshot(); - List output = sql( - "CALL %s.system.set_current_snapshot(snapshot_id => %dL, table => '%s')", - catalogName, firstSnapshot.snapshotId(), tableIdent); + List output = + sql( + "CALL %s.system.set_current_snapshot(snapshot_id => %dL, table => '%s')", + catalogName, firstSnapshot.snapshotId(), tableIdent); - assertEquals("Procedure output must match", + assertEquals( + "Procedure output must match", ImmutableList.of(row(secondSnapshot.snapshotId(), firstSnapshot.snapshotId())), output); - assertEquals("Set must be successful", + assertEquals( + "Set must be successful", ImmutableList.of(row(1L, "a")), sql("SELECT * FROM %s ORDER BY id", tableName)); } @@ -118,22 +126,26 @@ public void testSetCurrentSnapshotWap() { sql("INSERT INTO TABLE %s VALUES (1, 'a')", tableName); - assertEquals("Should not see rows from staged snapshot", + assertEquals( + "Should not see rows from staged snapshot", ImmutableList.of(), sql("SELECT * FROM %s", tableName)); Table table = validationCatalog.loadTable(tableIdent); Snapshot wapSnapshot = Iterables.getOnlyElement(table.snapshots()); - List output = sql( - "CALL %s.system.set_current_snapshot(table => '%s', snapshot_id => %dL)", - catalogName, tableIdent, wapSnapshot.snapshotId()); + List output = + sql( + "CALL %s.system.set_current_snapshot(table => '%s', snapshot_id => %dL)", + catalogName, tableIdent, 
wapSnapshot.snapshotId()); - assertEquals("Procedure output must match", + assertEquals( + "Procedure output must match", ImmutableList.of(row(null, wapSnapshot.snapshotId())), output); - assertEquals("Current snapshot must be set correctly", + assertEquals( + "Current snapshot must be set correctly", ImmutableList.of(row(1L, "a")), sql("SELECT * FROM %s", tableName)); } @@ -150,7 +162,8 @@ public void tesSetCurrentSnapshotWithoutExplicitCatalog() { sql("INSERT INTO TABLE %s VALUES (1, 'a')", tableName); - assertEquals("Should have expected rows", + assertEquals( + "Should have expected rows", ImmutableList.of(row(1L, "a"), row(1L, "a")), sql("SELECT * FROM %s ORDER BY id", tableName)); @@ -159,15 +172,16 @@ public void tesSetCurrentSnapshotWithoutExplicitCatalog() { Snapshot secondSnapshot = table.currentSnapshot(); // use camel case intentionally to test case sensitivity - List output = sql( - "CALL SyStEm.sEt_cuRrEnT_sNaPsHot('%s', %dL)", - tableIdent, firstSnapshot.snapshotId()); + List output = + sql("CALL SyStEm.sEt_cuRrEnT_sNaPsHot('%s', %dL)", tableIdent, firstSnapshot.snapshotId()); - assertEquals("Procedure output must match", + assertEquals( + "Procedure output must match", ImmutableList.of(row(secondSnapshot.snapshotId(), firstSnapshot.snapshotId())), output); - assertEquals("Set must be successful", + assertEquals( + "Set must be successful", ImmutableList.of(row(1L, "a")), sql("SELECT * FROM %s ORDER BY id", tableName)); } @@ -179,43 +193,64 @@ public void testSetCurrentSnapshotToInvalidSnapshot() { Namespace namespace = tableIdent.namespace(); String tableName = tableIdent.name(); - AssertHelpers.assertThrows("Should reject invalid snapshot id", - ValidationException.class, "Cannot roll back to unknown snapshot id", + AssertHelpers.assertThrows( + "Should reject invalid snapshot id", + ValidationException.class, + "Cannot roll back to unknown snapshot id", () -> sql("CALL %s.system.set_current_snapshot('%s', -1L)", catalogName, tableIdent)); } @Test public void testInvalidRollbackToSnapshotCases() { - AssertHelpers.assertThrows("Should not allow mixed args", - AnalysisException.class, "Named and positional arguments cannot be mixed", - () -> sql("CALL %s.system.set_current_snapshot(namespace => 'n1', table => 't', 1L)", catalogName)); - - AssertHelpers.assertThrows("Should not resolve procedures in arbitrary namespaces", - NoSuchProcedureException.class, "not found", + AssertHelpers.assertThrows( + "Should not allow mixed args", + AnalysisException.class, + "Named and positional arguments cannot be mixed", + () -> + sql( + "CALL %s.system.set_current_snapshot(namespace => 'n1', table => 't', 1L)", + catalogName)); + + AssertHelpers.assertThrows( + "Should not resolve procedures in arbitrary namespaces", + NoSuchProcedureException.class, + "not found", () -> sql("CALL %s.custom.set_current_snapshot('n', 't', 1L)", catalogName)); - AssertHelpers.assertThrows("Should reject calls without all required args", - AnalysisException.class, "Missing required parameters", + AssertHelpers.assertThrows( + "Should reject calls without all required args", + AnalysisException.class, + "Missing required parameters", () -> sql("CALL %s.system.set_current_snapshot('t')", catalogName)); - AssertHelpers.assertThrows("Should reject calls without all required args", - AnalysisException.class, "Missing required parameters", + AssertHelpers.assertThrows( + "Should reject calls without all required args", + AnalysisException.class, + "Missing required parameters", () -> sql("CALL 
%s.system.set_current_snapshot(1L)", catalogName)); - AssertHelpers.assertThrows("Should reject calls without all required args", - AnalysisException.class, "Missing required parameters", + AssertHelpers.assertThrows( + "Should reject calls without all required args", + AnalysisException.class, + "Missing required parameters", () -> sql("CALL %s.system.set_current_snapshot(snapshot_id => 1L)", catalogName)); - AssertHelpers.assertThrows("Should reject calls without all required args", - AnalysisException.class, "Missing required parameters", + AssertHelpers.assertThrows( + "Should reject calls without all required args", + AnalysisException.class, + "Missing required parameters", () -> sql("CALL %s.system.set_current_snapshot(table => 't')", catalogName)); - AssertHelpers.assertThrows("Should reject calls with invalid arg types", - AnalysisException.class, "Wrong arg type for snapshot_id: cannot cast", + AssertHelpers.assertThrows( + "Should reject calls with invalid arg types", + AnalysisException.class, + "Wrong arg type for snapshot_id: cannot cast", () -> sql("CALL %s.system.set_current_snapshot('t', 2.2)", catalogName)); - AssertHelpers.assertThrows("Should reject calls with empty table identifier", - IllegalArgumentException.class, "Cannot handle an empty identifier", + AssertHelpers.assertThrows( + "Should reject calls with empty table identifier", + IllegalArgumentException.class, + "Cannot handle an empty identifier", () -> sql("CALL %s.system.set_current_snapshot('', 1L)", catalogName)); } } diff --git a/spark/v3.0/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestSetWriteDistributionAndOrdering.java b/spark/v3.0/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestSetWriteDistributionAndOrdering.java index 473278d25068..e7e52806792d 100644 --- a/spark/v3.0/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestSetWriteDistributionAndOrdering.java +++ b/spark/v3.0/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestSetWriteDistributionAndOrdering.java @@ -16,9 +16,10 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.spark.extensions; +import static org.apache.iceberg.expressions.Expressions.bucket; + import java.util.Map; import org.apache.iceberg.NullOrder; import org.apache.iceberg.SortOrder; @@ -28,10 +29,9 @@ import org.junit.Assert; import org.junit.Test; -import static org.apache.iceberg.expressions.Expressions.bucket; - public class TestSetWriteDistributionAndOrdering extends SparkExtensionsTestBase { - public TestSetWriteDistributionAndOrdering(String catalogName, String implementation, Map config) { + public TestSetWriteDistributionAndOrdering( + String catalogName, String implementation, Map config) { super(catalogName, implementation, config); } @@ -42,7 +42,9 @@ public void removeTable() { @Test public void testSetWriteOrderByColumn() { - sql("CREATE TABLE %s (id bigint NOT NULL, category string, ts timestamp, data string) USING iceberg", tableName); + sql( + "CREATE TABLE %s (id bigint NOT NULL, category string, ts timestamp, data string) USING iceberg", + tableName); Table table = validationCatalog.loadTable(tableIdent); Assert.assertTrue("Table should start unsorted", table.sortOrder().isUnsorted()); @@ -53,17 +55,20 @@ public void testSetWriteOrderByColumn() { String distributionMode = table.properties().get(TableProperties.WRITE_DISTRIBUTION_MODE); Assert.assertEquals("Distribution mode must match", "range", distributionMode); - SortOrder expected = SortOrder.builderFor(table.schema()) - .withOrderId(1) - .asc("category", NullOrder.NULLS_FIRST) - .asc("id", NullOrder.NULLS_FIRST) - .build(); + SortOrder expected = + SortOrder.builderFor(table.schema()) + .withOrderId(1) + .asc("category", NullOrder.NULLS_FIRST) + .asc("id", NullOrder.NULLS_FIRST) + .build(); Assert.assertEquals("Should have expected order", expected, table.sortOrder()); } @Test public void testSetWriteOrderByColumnWithDirection() { - sql("CREATE TABLE %s (id bigint NOT NULL, category string, ts timestamp, data string) USING iceberg", tableName); + sql( + "CREATE TABLE %s (id bigint NOT NULL, category string, ts timestamp, data string) USING iceberg", + tableName); Table table = validationCatalog.loadTable(tableIdent); Assert.assertTrue("Table should start unsorted", table.sortOrder().isUnsorted()); @@ -74,17 +79,20 @@ public void testSetWriteOrderByColumnWithDirection() { String distributionMode = table.properties().get(TableProperties.WRITE_DISTRIBUTION_MODE); Assert.assertEquals("Distribution mode must match", "range", distributionMode); - SortOrder expected = SortOrder.builderFor(table.schema()) - .withOrderId(1) - .asc("category", NullOrder.NULLS_FIRST) - .desc("id", NullOrder.NULLS_LAST) - .build(); + SortOrder expected = + SortOrder.builderFor(table.schema()) + .withOrderId(1) + .asc("category", NullOrder.NULLS_FIRST) + .desc("id", NullOrder.NULLS_LAST) + .build(); Assert.assertEquals("Should have expected order", expected, table.sortOrder()); } @Test public void testSetWriteOrderByColumnWithDirectionAndNullOrder() { - sql("CREATE TABLE %s (id bigint NOT NULL, category string, ts timestamp, data string) USING iceberg", tableName); + sql( + "CREATE TABLE %s (id bigint NOT NULL, category string, ts timestamp, data string) USING iceberg", + tableName); Table table = validationCatalog.loadTable(tableIdent); Assert.assertTrue("Table should start unsorted", table.sortOrder().isUnsorted()); @@ -95,17 +103,20 @@ public void testSetWriteOrderByColumnWithDirectionAndNullOrder() { String distributionMode = table.properties().get(TableProperties.WRITE_DISTRIBUTION_MODE); 
Assert.assertEquals("Distribution mode must match", "range", distributionMode); - SortOrder expected = SortOrder.builderFor(table.schema()) - .withOrderId(1) - .asc("category", NullOrder.NULLS_LAST) - .desc("id", NullOrder.NULLS_FIRST) - .build(); + SortOrder expected = + SortOrder.builderFor(table.schema()) + .withOrderId(1) + .asc("category", NullOrder.NULLS_LAST) + .desc("id", NullOrder.NULLS_FIRST) + .build(); Assert.assertEquals("Should have expected order", expected, table.sortOrder()); } @Test public void testSetWriteOrderByTransform() { - sql("CREATE TABLE %s (id bigint NOT NULL, category string, ts timestamp, data string) USING iceberg", tableName); + sql( + "CREATE TABLE %s (id bigint NOT NULL, category string, ts timestamp, data string) USING iceberg", + tableName); Table table = validationCatalog.loadTable(tableIdent); Assert.assertTrue("Table should start unsorted", table.sortOrder().isUnsorted()); @@ -116,18 +127,21 @@ public void testSetWriteOrderByTransform() { String distributionMode = table.properties().get(TableProperties.WRITE_DISTRIBUTION_MODE); Assert.assertEquals("Distribution mode must match", "range", distributionMode); - SortOrder expected = SortOrder.builderFor(table.schema()) - .withOrderId(1) - .desc("category") - .asc(bucket("id", 16)) - .asc("id") - .build(); + SortOrder expected = + SortOrder.builderFor(table.schema()) + .withOrderId(1) + .desc("category") + .asc(bucket("id", 16)) + .asc("id") + .build(); Assert.assertEquals("Should have expected order", expected, table.sortOrder()); } @Test public void testSetWriteUnordered() { - sql("CREATE TABLE %s (id bigint NOT NULL, category string, ts timestamp, data string) USING iceberg", tableName); + sql( + "CREATE TABLE %s (id bigint NOT NULL, category string, ts timestamp, data string) USING iceberg", + tableName); Table table = validationCatalog.loadTable(tableIdent); Assert.assertTrue("Table should start unsorted", table.sortOrder().isUnsorted()); @@ -152,7 +166,9 @@ public void testSetWriteUnordered() { @Test public void testSetWriteLocallyOrdered() { - sql("CREATE TABLE %s (id bigint NOT NULL, category string, ts timestamp, data string) USING iceberg", tableName); + sql( + "CREATE TABLE %s (id bigint NOT NULL, category string, ts timestamp, data string) USING iceberg", + tableName); Table table = validationCatalog.loadTable(tableIdent); Assert.assertTrue("Table should start unsorted", table.sortOrder().isUnsorted()); @@ -163,18 +179,21 @@ public void testSetWriteLocallyOrdered() { String distributionMode = table.properties().get(TableProperties.WRITE_DISTRIBUTION_MODE); Assert.assertEquals("Distribution mode must match", "none", distributionMode); - SortOrder expected = SortOrder.builderFor(table.schema()) - .withOrderId(1) - .desc("category") - .asc(bucket("id", 16)) - .asc("id") - .build(); + SortOrder expected = + SortOrder.builderFor(table.schema()) + .withOrderId(1) + .desc("category") + .asc(bucket("id", 16)) + .asc("id") + .build(); Assert.assertEquals("Sort order must match", expected, table.sortOrder()); } @Test public void testSetWriteDistributedByWithSort() { - sql("CREATE TABLE %s (id bigint NOT NULL, category string) USING iceberg PARTITIONED BY (category)", tableName); + sql( + "CREATE TABLE %s (id bigint NOT NULL, category string) USING iceberg PARTITIONED BY (category)", + tableName); Table table = validationCatalog.loadTable(tableIdent); Assert.assertTrue("Table should start unsorted", table.sortOrder().isUnsorted()); @@ -185,16 +204,15 @@ public void testSetWriteDistributedByWithSort() { 
String distributionMode = table.properties().get(TableProperties.WRITE_DISTRIBUTION_MODE); Assert.assertEquals("Distribution mode must match", "hash", distributionMode); - SortOrder expected = SortOrder.builderFor(table.schema()) - .withOrderId(1) - .asc("id") - .build(); + SortOrder expected = SortOrder.builderFor(table.schema()).withOrderId(1).asc("id").build(); Assert.assertEquals("Sort order must match", expected, table.sortOrder()); } @Test public void testSetWriteDistributedByWithLocalSort() { - sql("CREATE TABLE %s (id bigint NOT NULL, category string) USING iceberg PARTITIONED BY (category)", tableName); + sql( + "CREATE TABLE %s (id bigint NOT NULL, category string) USING iceberg PARTITIONED BY (category)", + tableName); Table table = validationCatalog.loadTable(tableIdent); Assert.assertTrue("Table should start unsorted", table.sortOrder().isUnsorted()); @@ -205,16 +223,15 @@ public void testSetWriteDistributedByWithLocalSort() { String distributionMode = table.properties().get(TableProperties.WRITE_DISTRIBUTION_MODE); Assert.assertEquals("Distribution mode must match", "hash", distributionMode); - SortOrder expected = SortOrder.builderFor(table.schema()) - .withOrderId(1) - .asc("id") - .build(); + SortOrder expected = SortOrder.builderFor(table.schema()).withOrderId(1).asc("id").build(); Assert.assertEquals("Sort order must match", expected, table.sortOrder()); } @Test public void testSetWriteDistributedByAndUnordered() { - sql("CREATE TABLE %s (id bigint NOT NULL, category string) USING iceberg PARTITIONED BY (category)", tableName); + sql( + "CREATE TABLE %s (id bigint NOT NULL, category string) USING iceberg PARTITIONED BY (category)", + tableName); Table table = validationCatalog.loadTable(tableIdent); Assert.assertTrue("Table should start unsorted", table.sortOrder().isUnsorted()); @@ -230,7 +247,9 @@ public void testSetWriteDistributedByAndUnordered() { @Test public void testSetWriteDistributedByOnly() { - sql("CREATE TABLE %s (id bigint NOT NULL, category string) USING iceberg PARTITIONED BY (category)", tableName); + sql( + "CREATE TABLE %s (id bigint NOT NULL, category string) USING iceberg PARTITIONED BY (category)", + tableName); Table table = validationCatalog.loadTable(tableIdent); Assert.assertTrue("Table should start unsorted", table.sortOrder().isUnsorted()); @@ -246,7 +265,9 @@ public void testSetWriteDistributedByOnly() { @Test public void testSetWriteDistributedAndUnorderedInverted() { - sql("CREATE TABLE %s (id bigint NOT NULL, category string) USING iceberg PARTITIONED BY (category)", tableName); + sql( + "CREATE TABLE %s (id bigint NOT NULL, category string) USING iceberg PARTITIONED BY (category)", + tableName); Table table = validationCatalog.loadTable(tableIdent); Assert.assertTrue("Table should start unsorted", table.sortOrder().isUnsorted()); @@ -262,7 +283,9 @@ public void testSetWriteDistributedAndUnorderedInverted() { @Test public void testSetWriteDistributedAndLocallyOrderedInverted() { - sql("CREATE TABLE %s (id bigint NOT NULL, category string) USING iceberg PARTITIONED BY (category)", tableName); + sql( + "CREATE TABLE %s (id bigint NOT NULL, category string) USING iceberg PARTITIONED BY (category)", + tableName); Table table = validationCatalog.loadTable(tableIdent); Assert.assertTrue("Table should start unsorted", table.sortOrder().isUnsorted()); @@ -273,10 +296,7 @@ public void testSetWriteDistributedAndLocallyOrderedInverted() { String distributionMode = table.properties().get(TableProperties.WRITE_DISTRIBUTION_MODE); 
Assert.assertEquals("Distribution mode must match", "hash", distributionMode); - SortOrder expected = SortOrder.builderFor(table.schema()) - .withOrderId(1) - .asc("id") - .build(); + SortOrder expected = SortOrder.builderFor(table.schema()).withOrderId(1).asc("id").build(); Assert.assertEquals("Sort order must match", expected, table.sortOrder()); } } diff --git a/spark/v3.0/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestSnapshotTableProcedure.java b/spark/v3.0/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestSnapshotTableProcedure.java index 66fa8e80c515..d8e918d8aadd 100644 --- a/spark/v3.0/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestSnapshotTableProcedure.java +++ b/spark/v3.0/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestSnapshotTableProcedure.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.extensions; import java.io.IOException; @@ -37,12 +36,12 @@ public class TestSnapshotTableProcedure extends SparkExtensionsTestBase { private static final String sourceName = "spark_catalog.default.source"; // Currently we can only Snapshot only out of the Spark Session Catalog - public TestSnapshotTableProcedure(String catalogName, String implementation, Map config) { + public TestSnapshotTableProcedure( + String catalogName, String implementation, Map config) { super(catalogName, implementation, config); } - @Rule - public TemporaryFolder temp = new TemporaryFolder(); + @Rule public TemporaryFolder temp = new TemporaryFolder(); @After public void removeTables() { @@ -53,9 +52,12 @@ public void removeTables() { @Test public void testSnapshot() throws IOException { String location = temp.newFolder().toString(); - sql("CREATE TABLE %s (id bigint NOT NULL, data string) USING parquet LOCATION '%s'", sourceName, location); + sql( + "CREATE TABLE %s (id bigint NOT NULL, data string) USING parquet LOCATION '%s'", + sourceName, location); sql("INSERT INTO TABLE %s VALUES (1, 'a')", sourceName); - Object result = scalarSql("CALL %s.system.snapshot('%s', '%s')", catalogName, sourceName, tableName); + Object result = + scalarSql("CALL %s.system.snapshot('%s', '%s')", catalogName, sourceName, tableName); Assert.assertEquals("Should have added one file", 1L, result); @@ -65,7 +67,8 @@ public void testSnapshot() throws IOException { sql("INSERT INTO TABLE %s VALUES (1, 'a')", tableName); - assertEquals("Should have expected rows", + assertEquals( + "Should have expected rows", ImmutableList.of(row(1L, "a"), row(1L, "a")), sql("SELECT * FROM %s ORDER BY id", tableName)); } @@ -73,11 +76,14 @@ public void testSnapshot() throws IOException { @Test public void testSnapshotWithProperties() throws IOException { String location = temp.newFolder().toString(); - sql("CREATE TABLE %s (id bigint NOT NULL, data string) USING parquet LOCATION '%s'", sourceName, location); + sql( + "CREATE TABLE %s (id bigint NOT NULL, data string) USING parquet LOCATION '%s'", + sourceName, location); sql("INSERT INTO TABLE %s VALUES (1, 'a')", sourceName); - Object result = scalarSql( - "CALL %s.system.snapshot(source_table => '%s', table => '%s', properties => map('foo','bar'))", - catalogName, sourceName, tableName); + Object result = + scalarSql( + "CALL %s.system.snapshot(source_table => '%s', table => '%s', properties => map('foo','bar'))", + catalogName, sourceName, tableName); Assert.assertEquals("Should have added one file", 1L, result); @@ 
-91,30 +97,39 @@ public void testSnapshotWithProperties() throws IOException { sql("INSERT INTO TABLE %s VALUES (1, 'a')", tableName); - assertEquals("Should have expected rows", + assertEquals( + "Should have expected rows", ImmutableList.of(row(1L, "a"), row(1L, "a")), sql("SELECT * FROM %s ORDER BY id", tableName)); } @Test public void testSnapshotWithAlternateLocation() throws IOException { - Assume.assumeTrue("No Snapshoting with Alternate locations with Hadoop Catalogs", !catalogName.contains("hadoop")); + Assume.assumeTrue( + "No Snapshoting with Alternate locations with Hadoop Catalogs", + !catalogName.contains("hadoop")); String location = temp.newFolder().toString(); String snapshotLocation = temp.newFolder().toString(); - sql("CREATE TABLE %s (id bigint NOT NULL, data string) USING parquet LOCATION '%s'", sourceName, location); + sql( + "CREATE TABLE %s (id bigint NOT NULL, data string) USING parquet LOCATION '%s'", + sourceName, location); sql("INSERT INTO TABLE %s VALUES (1, 'a')", sourceName); - Object[] result = sql( - "CALL %s.system.snapshot(source_table => '%s', table => '%s', location => '%s')", - catalogName, sourceName, tableName, snapshotLocation).get(0); + Object[] result = + sql( + "CALL %s.system.snapshot(source_table => '%s', table => '%s', location => '%s')", + catalogName, sourceName, tableName, snapshotLocation) + .get(0); Assert.assertEquals("Should have added one file", 1L, result[0]); String storageLocation = validationCatalog.loadTable(tableIdent).location(); - Assert.assertEquals("Snapshot should be made at specified location", snapshotLocation, storageLocation); + Assert.assertEquals( + "Snapshot should be made at specified location", snapshotLocation, storageLocation); sql("INSERT INTO TABLE %s VALUES (1, 'a')", tableName); - assertEquals("Should have expected rows", + assertEquals( + "Should have expected rows", ImmutableList.of(row(1L, "a"), row(1L, "a")), sql("SELECT * FROM %s ORDER BY id", tableName)); } @@ -122,19 +137,24 @@ public void testSnapshotWithAlternateLocation() throws IOException { @Test public void testDropTable() throws IOException { String location = temp.newFolder().toString(); - sql("CREATE TABLE %s (id bigint NOT NULL, data string) USING parquet LOCATION '%s'", sourceName, location); + sql( + "CREATE TABLE %s (id bigint NOT NULL, data string) USING parquet LOCATION '%s'", + sourceName, location); sql("INSERT INTO TABLE %s VALUES (1, 'a')", sourceName); - Object result = scalarSql("CALL %s.system.snapshot('%s', '%s')", catalogName, sourceName, tableName); + Object result = + scalarSql("CALL %s.system.snapshot('%s', '%s')", catalogName, sourceName, tableName); Assert.assertEquals("Should have added one file", 1L, result); - assertEquals("Should have expected rows", + assertEquals( + "Should have expected rows", ImmutableList.of(row(1L, "a")), sql("SELECT * FROM %s", tableName)); sql("DROP TABLE %s", tableName); - assertEquals("Source table should be intact", + assertEquals( + "Source table should be intact", ImmutableList.of(row(1L, "a")), sql("SELECT * FROM %s", sourceName)); } @@ -142,50 +162,70 @@ public void testDropTable() throws IOException { @Test public void testSnapshotWithConflictingProps() throws IOException { String location = temp.newFolder().toString(); - sql("CREATE TABLE %s (id bigint NOT NULL, data string) USING parquet LOCATION '%s'", sourceName, location); + sql( + "CREATE TABLE %s (id bigint NOT NULL, data string) USING parquet LOCATION '%s'", + sourceName, location); sql("INSERT INTO TABLE %s VALUES (1, 'a')", 
sourceName); - Object result = scalarSql( - "CALL %s.system.snapshot(" + - "source_table => '%s'," + - "table => '%s'," + - "properties => map('%s', 'true', 'snapshot', 'false'))", - catalogName, sourceName, tableName, TableProperties.GC_ENABLED); + Object result = + scalarSql( + "CALL %s.system.snapshot(" + + "source_table => '%s'," + + "table => '%s'," + + "properties => map('%s', 'true', 'snapshot', 'false'))", + catalogName, sourceName, tableName, TableProperties.GC_ENABLED); Assert.assertEquals("Should have added one file", 1L, result); - assertEquals("Should have expected rows", + assertEquals( + "Should have expected rows", ImmutableList.of(row(1L, "a")), sql("SELECT * FROM %s", tableName)); Table table = validationCatalog.loadTable(tableIdent); Map props = table.properties(); Assert.assertEquals("Should override user value", "true", props.get("snapshot")); - Assert.assertEquals("Should override user value", "false", props.get(TableProperties.GC_ENABLED)); + Assert.assertEquals( + "Should override user value", "false", props.get(TableProperties.GC_ENABLED)); } @Test public void testInvalidSnapshotsCases() throws IOException { String location = temp.newFolder().toString(); - sql("CREATE TABLE %s (id bigint NOT NULL, data string) USING parquet LOCATION '%s'", sourceName, location); - - AssertHelpers.assertThrows("Should reject calls without all required args", - AnalysisException.class, "Missing required parameters", + sql( + "CREATE TABLE %s (id bigint NOT NULL, data string) USING parquet LOCATION '%s'", + sourceName, location); + + AssertHelpers.assertThrows( + "Should reject calls without all required args", + AnalysisException.class, + "Missing required parameters", () -> sql("CALL %s.system.snapshot('foo')", catalogName)); - AssertHelpers.assertThrows("Should reject calls with invalid arg types", - AnalysisException.class, "Wrong arg type", + AssertHelpers.assertThrows( + "Should reject calls with invalid arg types", + AnalysisException.class, + "Wrong arg type", () -> sql("CALL %s.system.snapshot('n', 't', map('foo', 'bar'))", catalogName)); - AssertHelpers.assertThrows("Should reject calls with invalid map args", - AnalysisException.class, "cannot resolve 'map", - () -> sql("CALL %s.system.snapshot('%s', 'fable', 'loc', map(2, 1, 1))", catalogName, sourceName)); - - AssertHelpers.assertThrows("Should reject calls with empty table identifier", - IllegalArgumentException.class, "Cannot handle an empty identifier", + AssertHelpers.assertThrows( + "Should reject calls with invalid map args", + AnalysisException.class, + "cannot resolve 'map", + () -> + sql( + "CALL %s.system.snapshot('%s', 'fable', 'loc', map(2, 1, 1))", + catalogName, sourceName)); + + AssertHelpers.assertThrows( + "Should reject calls with empty table identifier", + IllegalArgumentException.class, + "Cannot handle an empty identifier", () -> sql("CALL %s.system.snapshot('', 'dest')", catalogName)); - AssertHelpers.assertThrows("Should reject calls with empty table identifier", - IllegalArgumentException.class, "Cannot handle an empty identifier", + AssertHelpers.assertThrows( + "Should reject calls with empty table identifier", + IllegalArgumentException.class, + "Cannot handle an empty identifier", () -> sql("CALL %s.system.snapshot('src', '')", catalogName)); } } diff --git a/spark/v3.0/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestUpdate.java b/spark/v3.0/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestUpdate.java index 5726ac44cc99..5e10d6795e44 100644 --- 
a/spark/v3.0/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestUpdate.java +++ b/spark/v3.0/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestUpdate.java @@ -16,9 +16,13 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.extensions; +import static org.apache.iceberg.TableProperties.PARQUET_ROW_GROUP_SIZE_BYTES; +import static org.apache.iceberg.TableProperties.SPLIT_SIZE; +import static org.apache.iceberg.TableProperties.UPDATE_ISOLATION_LEVEL; +import static org.apache.spark.sql.functions.lit; + import java.util.Arrays; import java.util.List; import java.util.Map; @@ -53,15 +57,15 @@ import org.junit.Ignore; import org.junit.Test; -import static org.apache.iceberg.TableProperties.PARQUET_ROW_GROUP_SIZE_BYTES; -import static org.apache.iceberg.TableProperties.SPLIT_SIZE; -import static org.apache.iceberg.TableProperties.UPDATE_ISOLATION_LEVEL; -import static org.apache.spark.sql.functions.lit; - public abstract class TestUpdate extends SparkRowLevelOperationsTestBase { - public TestUpdate(String catalogName, String implementation, Map config, - String fileFormat, boolean vectorized, String distributionMode) { + public TestUpdate( + String catalogName, + String implementation, + Map config, + String fileFormat, + boolean vectorized, + String distributionMode) { super(catalogName, implementation, config, fileFormat, vectorized, distributionMode); } @@ -91,7 +95,8 @@ public void testExplain() { Table table = validationCatalog.loadTable(tableIdent); Assert.assertEquals("Should have 1 snapshot", 1, Iterables.size(table.snapshots())); - assertEquals("Should have expected rows", + assertEquals( + "Should have expected rows", ImmutableList.of(row(1, "hr"), row(2, "hardware"), row(null, "hr")), sql("SELECT * FROM %s ORDER BY id ASC NULLS LAST", tableName)); } @@ -106,7 +111,8 @@ public void testUpdateEmptyTable() { Table table = validationCatalog.loadTable(tableIdent); Assert.assertEquals("Should have 2 snapshots", 2, Iterables.size(table.snapshots())); - assertEquals("Should have expected rows", + assertEquals( + "Should have expected rows", ImmutableList.of(), sql("SELECT * FROM %s ORDER BY id", tableName)); } @@ -121,7 +127,8 @@ public void testUpdateWithAlias() { Table table = validationCatalog.loadTable(tableIdent); Assert.assertEquals("Should have 2 snapshots", 2, Iterables.size(table.snapshots())); - assertEquals("Should have expected rows", + assertEquals( + "Should have expected rows", ImmutableList.of(row(1, "invalid")), sql("SELECT * FROM %s", tableName)); } @@ -134,7 +141,8 @@ public void testUpdateAlignsAssignments() { sql("UPDATE %s SET `c2` = c2 - 2, c1 = `c1` - 1 WHERE id <=> 1", tableName); - assertEquals("Should have expected rows", + assertEquals( + "Should have expected rows", ImmutableList.of(row(1, 10, 109), row(2, 22, 222)), sql("SELECT * FROM %s ORDER BY id", tableName)); } @@ -148,7 +156,8 @@ public void testUpdateWithUnsupportedPartitionPredicate() { sql("UPDATE %s t SET `t`.`id` = -1 WHERE t.dep LIKE '%%r' ", tableName); - assertEquals("Should have expected rows", + assertEquals( + "Should have expected rows", ImmutableList.of(row(-1, "hr"), row(1, "software")), sql("SELECT * FROM %s ORDER BY id", tableName)); } @@ -158,12 +167,10 @@ public void testUpdateWithDynamicFileFiltering() { createAndInitTable("id INT, dep STRING"); sql("ALTER TABLE %s ADD PARTITION FIELD dep", tableName); - append(tableName, - "{ \"id\": 1, \"dep\": \"hr\" }\n" + - "{ \"id\": 3, 
\"dep\": \"hr\" }"); - append(tableName, - "{ \"id\": 1, \"dep\": \"hardware\" }\n" + - "{ \"id\": 2, \"dep\": \"hardware\" }"); + append(tableName, "{ \"id\": 1, \"dep\": \"hr\" }\n" + "{ \"id\": 3, \"dep\": \"hr\" }"); + append( + tableName, + "{ \"id\": 1, \"dep\": \"hardware\" }\n" + "{ \"id\": 2, \"dep\": \"hardware\" }"); sql("UPDATE %s SET id = cast('-1' AS INT) WHERE id = 2", tableName); @@ -173,7 +180,8 @@ public void testUpdateWithDynamicFileFiltering() { Snapshot currentSnapshot = table.currentSnapshot(); validateSnapshot(currentSnapshot, "overwrite", "1", "1", "1"); - assertEquals("Should have expected rows", + assertEquals( + "Should have expected rows", ImmutableList.of(row(-1, "hardware"), row(1, "hardware"), row(1, "hr"), row(3, "hr")), sql("SELECT * FROM %s ORDER BY id, dep", tableName)); } @@ -193,7 +201,8 @@ public void testUpdateNonExistingRecords() { Snapshot currentSnapshot = table.currentSnapshot(); validateSnapshot(currentSnapshot, "overwrite", "0", null, null); - assertEquals("Should have expected rows", + assertEquals( + "Should have expected rows", ImmutableList.of(row(1, "hr"), row(2, "hardware"), row(null, "hr")), sql("SELECT * FROM %s ORDER BY id ASC NULLS LAST", tableName)); } @@ -217,7 +226,8 @@ public void testUpdateWithoutCondition() { Snapshot currentSnapshot = table.currentSnapshot(); validateSnapshot(currentSnapshot, "overwrite", "2", "3", "2"); - assertEquals("Should have expected rows", + assertEquals( + "Should have expected rows", ImmutableList.of(row(-1, "hardware"), row(-1, "hr"), row(-1, "hr")), sql("SELECT * FROM %s ORDER BY dep ASC", tableName)); } @@ -226,26 +236,30 @@ public void testUpdateWithoutCondition() { public void testUpdateWithNullConditions() { createAndInitTable("id INT, dep STRING"); - append(tableName, - "{ \"id\": 0, \"dep\": null }\n" + - "{ \"id\": 1, \"dep\": \"hr\" }\n" + - "{ \"id\": 2, \"dep\": \"hardware\" }"); + append( + tableName, + "{ \"id\": 0, \"dep\": null }\n" + + "{ \"id\": 1, \"dep\": \"hr\" }\n" + + "{ \"id\": 2, \"dep\": \"hardware\" }"); // should not update any rows as null is never equal to null sql("UPDATE %s SET id = -1 WHERE dep = NULL", tableName); - assertEquals("Should have expected rows", + assertEquals( + "Should have expected rows", ImmutableList.of(row(0, null), row(1, "hr"), row(2, "hardware")), sql("SELECT * FROM %s ORDER BY id", tableName)); // should not update any rows the condition does not match any records sql("UPDATE %s SET id = -1 WHERE dep = 'software'", tableName); - assertEquals("Should have expected rows", + assertEquals( + "Should have expected rows", ImmutableList.of(row(0, null), row(1, "hr"), row(2, "hardware")), sql("SELECT * FROM %s ORDER BY id", tableName)); // should update one matching row with a null-safe condition sql("UPDATE %s SET dep = 'invalid', id = -1 WHERE dep <=> NULL", tableName); - assertEquals("Should have expected rows", + assertEquals( + "Should have expected rows", ImmutableList.of(row(-1, "invalid"), row(1, "hr"), row(2, "hardware")), sql("SELECT * FROM %s ORDER BY id", tableName)); } @@ -254,23 +268,27 @@ public void testUpdateWithNullConditions() { public void testUpdateWithInAndNotInConditions() { createAndInitTable("id INT, dep STRING"); - append(tableName, - "{ \"id\": 1, \"dep\": \"hr\" }\n" + - "{ \"id\": 2, \"dep\": \"hardware\" }\n" + - "{ \"id\": null, \"dep\": \"hr\" }"); + append( + tableName, + "{ \"id\": 1, \"dep\": \"hr\" }\n" + + "{ \"id\": 2, \"dep\": \"hardware\" }\n" + + "{ \"id\": null, \"dep\": \"hr\" }"); sql("UPDATE %s SET id = -1 
WHERE id IN (1, null)", tableName); - assertEquals("Should have expected rows", + assertEquals( + "Should have expected rows", ImmutableList.of(row(-1, "hr"), row(2, "hardware"), row(null, "hr")), sql("SELECT * FROM %s ORDER BY id ASC NULLS LAST", tableName)); sql("UPDATE %s SET id = 100 WHERE id NOT IN (null, 1)", tableName); - assertEquals("Should have expected rows", + assertEquals( + "Should have expected rows", ImmutableList.of(row(-1, "hr"), row(2, "hardware"), row(null, "hr")), sql("SELECT * FROM %s ORDER BY id ASC NULLS LAST", tableName)); sql("UPDATE %s SET id = 100 WHERE id NOT IN (1, 10)", tableName); - assertEquals("Should have expected rows", + assertEquals( + "Should have expected rows", ImmutableList.of(row(100, "hardware"), row(100, "hr"), row(null, "hr")), sql("SELECT * FROM %s ORDER BY id ASC NULLS LAST, dep", tableName)); } @@ -282,16 +300,20 @@ public void testUpdateWithMultipleRowGroupsParquet() throws NoSuchTableException createAndInitTable("id INT, dep STRING"); sql("ALTER TABLE %s ADD PARTITION FIELD dep", tableName); - sql("ALTER TABLE %s SET TBLPROPERTIES('%s' '%d')", tableName, PARQUET_ROW_GROUP_SIZE_BYTES, 100); + sql( + "ALTER TABLE %s SET TBLPROPERTIES('%s' '%d')", + tableName, PARQUET_ROW_GROUP_SIZE_BYTES, 100); sql("ALTER TABLE %s SET TBLPROPERTIES('%s' '%d')", tableName, SPLIT_SIZE, 100); List ids = Lists.newArrayList(); for (int id = 1; id <= 200; id++) { ids.add(id); } - Dataset df = spark.createDataset(ids, Encoders.INT()) - .withColumnRenamed("value", "id") - .withColumn("dep", lit("hr")); + Dataset df = + spark + .createDataset(ids, Encoders.INT()) + .withColumnRenamed("value", "id") + .withColumn("dep", lit("hr")); df.coalesce(1).writeTo(tableName).append(); Assert.assertEquals(200, spark.table(tableName).count()); @@ -304,27 +326,33 @@ public void testUpdateWithMultipleRowGroupsParquet() throws NoSuchTableException @Test public void testUpdateNestedStructFields() { - createAndInitTable("id INT, s STRUCT,m:MAP>>", + createAndInitTable( + "id INT, s STRUCT,m:MAP>>", "{ \"id\": 1, \"s\": { \"c1\": 2, \"c2\": { \"a\": [1,2], \"m\": { \"a\": \"b\"} } } } }"); // update primitive, array, map columns inside a struct sql("UPDATE %s SET s.c1 = -1, s.c2.m = map('k', 'v'), s.c2.a = array(-1)", tableName); - assertEquals("Output should match", + assertEquals( + "Output should match", ImmutableList.of(row(1, row(-1, row(ImmutableList.of(-1), ImmutableMap.of("k", "v"))))), sql("SELECT * FROM %s", tableName)); // set primitive, array, map columns to NULL (proper casts should be in place) sql("UPDATE %s SET s.c1 = NULL, s.c2 = NULL WHERE id IN (1)", tableName); - assertEquals("Output should match", + assertEquals( + "Output should match", ImmutableList.of(row(1, row(null, null))), sql("SELECT * FROM %s", tableName)); // update all fields in a struct - sql("UPDATE %s SET s = named_struct('c1', 1, 'c2', named_struct('a', array(1), 'm', null))", tableName); + sql( + "UPDATE %s SET s = named_struct('c1', 1, 'c2', named_struct('a', array(1), 'm', null))", + tableName); - assertEquals("Output should match", + assertEquals( + "Output should match", ImmutableList.of(row(1, row(1, row(ImmutableList.of(1), null)))), sql("SELECT * FROM %s", tableName)); } @@ -334,29 +362,33 @@ public void testUpdateWithUserDefinedDistribution() { createAndInitTable("id INT, c2 INT, c3 INT"); sql("ALTER TABLE %s ADD PARTITION FIELD bucket(8, c3)", tableName); - append(tableName, - "{ \"id\": 1, \"c2\": 11, \"c3\": 1 }\n" + - "{ \"id\": 2, \"c2\": 22, \"c3\": 1 }\n" + - "{ \"id\": 3, \"c2\": 33, 
\"c3\": 1 }"); + append( + tableName, + "{ \"id\": 1, \"c2\": 11, \"c3\": 1 }\n" + + "{ \"id\": 2, \"c2\": 22, \"c3\": 1 }\n" + + "{ \"id\": 3, \"c2\": 33, \"c3\": 1 }"); // request a global sort sql("ALTER TABLE %s WRITE ORDERED BY c2", tableName); sql("UPDATE %s SET c2 = -22 WHERE id NOT IN (1, 3)", tableName); - assertEquals("Should have expected rows", + assertEquals( + "Should have expected rows", ImmutableList.of(row(1, 11, 1), row(2, -22, 1), row(3, 33, 1)), sql("SELECT * FROM %s ORDER BY id ASC NULLS LAST", tableName)); // request a local sort sql("ALTER TABLE %s WRITE LOCALLY ORDERED BY id", tableName); sql("UPDATE %s SET c2 = -33 WHERE id = 3", tableName); - assertEquals("Should have expected rows", + assertEquals( + "Should have expected rows", ImmutableList.of(row(1, 11, 1), row(2, -22, 1), row(3, -33, 1)), sql("SELECT * FROM %s ORDER BY id ASC NULLS LAST", tableName)); // request a hash distribution + local sort sql("ALTER TABLE %s WRITE DISTRIBUTED BY PARTITION ORDERED BY id", tableName); sql("UPDATE %s SET c2 = -11 WHERE id = 1", tableName); - assertEquals("Should have expected rows", + assertEquals( + "Should have expected rows", ImmutableList.of(row(1, -11, 1), row(2, -22, 1), row(3, -33, 1)), sql("SELECT * FROM %s ORDER BY id ASC NULLS LAST", tableName)); } @@ -368,34 +400,41 @@ public synchronized void testUpdateWithSerializableIsolation() throws Interrupte createAndInitTable("id INT, dep STRING"); - sql("ALTER TABLE %s SET TBLPROPERTIES('%s' '%s')", tableName, UPDATE_ISOLATION_LEVEL, "serializable"); + sql( + "ALTER TABLE %s SET TBLPROPERTIES('%s' '%s')", + tableName, UPDATE_ISOLATION_LEVEL, "serializable"); - ExecutorService executorService = MoreExecutors.getExitingExecutorService( - (ThreadPoolExecutor) Executors.newFixedThreadPool(2)); + ExecutorService executorService = + MoreExecutors.getExitingExecutorService( + (ThreadPoolExecutor) Executors.newFixedThreadPool(2)); AtomicInteger barrier = new AtomicInteger(0); // update thread - Future updateFuture = executorService.submit(() -> { - for (int numOperations = 0; numOperations < Integer.MAX_VALUE; numOperations++) { - while (barrier.get() < numOperations * 2) { - sleep(10); - } - sql("UPDATE %s SET id = -1 WHERE id = 1", tableName); - barrier.incrementAndGet(); - } - }); + Future updateFuture = + executorService.submit( + () -> { + for (int numOperations = 0; numOperations < Integer.MAX_VALUE; numOperations++) { + while (barrier.get() < numOperations * 2) { + sleep(10); + } + sql("UPDATE %s SET id = -1 WHERE id = 1", tableName); + barrier.incrementAndGet(); + } + }); // append thread - Future appendFuture = executorService.submit(() -> { - for (int numOperations = 0; numOperations < Integer.MAX_VALUE; numOperations++) { - while (barrier.get() < numOperations * 2) { - sleep(10); - } - sql("INSERT INTO TABLE %s VALUES (1, 'hr')", tableName); - barrier.incrementAndGet(); - } - }); + Future appendFuture = + executorService.submit( + () -> { + for (int numOperations = 0; numOperations < Integer.MAX_VALUE; numOperations++) { + while (barrier.get() < numOperations * 2) { + sleep(10); + } + sql("INSERT INTO TABLE %s VALUES (1, 'hr')", tableName); + barrier.incrementAndGet(); + } + }); try { updateFuture.get(); @@ -406,7 +445,8 @@ public synchronized void testUpdateWithSerializableIsolation() throws Interrupte Throwable validationException = sparkException.getCause(); Assert.assertThat(validationException, CoreMatchers.instanceOf(ValidationException.class)); String errMsg = validationException.getMessage(); - 
Assert.assertThat(errMsg, CoreMatchers.containsString("Found conflicting files that can contain")); + Assert.assertThat( + errMsg, CoreMatchers.containsString("Found conflicting files that can contain")); } finally { appendFuture.cancel(true); } @@ -416,40 +456,48 @@ public synchronized void testUpdateWithSerializableIsolation() throws Interrupte } @Test - public synchronized void testUpdateWithSnapshotIsolation() throws InterruptedException, ExecutionException { + public synchronized void testUpdateWithSnapshotIsolation() + throws InterruptedException, ExecutionException { // cannot run tests with concurrency for Hadoop tables without atomic renames Assume.assumeFalse(catalogName.equalsIgnoreCase("testhadoop")); createAndInitTable("id INT, dep STRING"); - sql("ALTER TABLE %s SET TBLPROPERTIES('%s' '%s')", tableName, UPDATE_ISOLATION_LEVEL, "snapshot"); + sql( + "ALTER TABLE %s SET TBLPROPERTIES('%s' '%s')", + tableName, UPDATE_ISOLATION_LEVEL, "snapshot"); - ExecutorService executorService = MoreExecutors.getExitingExecutorService( - (ThreadPoolExecutor) Executors.newFixedThreadPool(2)); + ExecutorService executorService = + MoreExecutors.getExitingExecutorService( + (ThreadPoolExecutor) Executors.newFixedThreadPool(2)); AtomicInteger barrier = new AtomicInteger(0); // update thread - Future updateFuture = executorService.submit(() -> { - for (int numOperations = 0; numOperations < 20; numOperations++) { - while (barrier.get() < numOperations * 2) { - sleep(10); - } - sql("UPDATE %s SET id = -1 WHERE id = 1", tableName); - barrier.incrementAndGet(); - } - }); + Future updateFuture = + executorService.submit( + () -> { + for (int numOperations = 0; numOperations < 20; numOperations++) { + while (barrier.get() < numOperations * 2) { + sleep(10); + } + sql("UPDATE %s SET id = -1 WHERE id = 1", tableName); + barrier.incrementAndGet(); + } + }); // append thread - Future appendFuture = executorService.submit(() -> { - for (int numOperations = 0; numOperations < 20; numOperations++) { - while (barrier.get() < numOperations * 2) { - sleep(10); - } - sql("INSERT INTO TABLE %s VALUES (1, 'hr')", tableName); - barrier.incrementAndGet(); - } - }); + Future appendFuture = + executorService.submit( + () -> { + for (int numOperations = 0; numOperations < 20; numOperations++) { + while (barrier.get() < numOperations * 2) { + sleep(10); + } + sql("INSERT INTO TABLE %s VALUES (1, 'hr')", tableName); + barrier.incrementAndGet(); + } + }); try { updateFuture.get(); @@ -467,7 +515,8 @@ public void testUpdateWithInferredCasts() { sql("UPDATE %s SET s = -1 WHERE id = 1", tableName); - assertEquals("Should have expected rows", + assertEquals( + "Should have expected rows", ImmutableList.of(row(1, "-1")), sql("SELECT * FROM %s", tableName)); } @@ -478,7 +527,8 @@ public void testUpdateModifiesNullStruct() { sql("UPDATE %s SET s.n1 = -1 WHERE id = 1", tableName); - assertEquals("Should have expected rows", + assertEquals( + "Should have expected rows", ImmutableList.of(row(1, row(-1, null))), sql("SELECT * FROM %s", tableName)); } @@ -488,20 +538,19 @@ public void testUpdateRefreshesRelationCache() { createAndInitTable("id INT, dep STRING"); sql("ALTER TABLE %s ADD PARTITION FIELD dep", tableName); - append(tableName, - "{ \"id\": 1, \"dep\": \"hr\" }\n" + - "{ \"id\": 3, \"dep\": \"hr\" }"); + append(tableName, "{ \"id\": 1, \"dep\": \"hr\" }\n" + "{ \"id\": 3, \"dep\": \"hr\" }"); - append(tableName, - "{ \"id\": 1, \"dep\": \"hardware\" }\n" + - "{ \"id\": 2, \"dep\": \"hardware\" }"); + append( + tableName, + 
"{ \"id\": 1, \"dep\": \"hardware\" }\n" + "{ \"id\": 2, \"dep\": \"hardware\" }"); Dataset query = spark.sql("SELECT * FROM " + tableName + " WHERE id = 1"); query.createOrReplaceTempView("tmp"); spark.sql("CACHE TABLE tmp"); - assertEquals("View should have correct data", + assertEquals( + "View should have correct data", ImmutableList.of(row(1, "hardware"), row(1, "hr")), sql("SELECT * FROM tmp ORDER BY id, dep")); @@ -513,11 +562,13 @@ public void testUpdateRefreshesRelationCache() { Snapshot currentSnapshot = table.currentSnapshot(); validateSnapshot(currentSnapshot, "overwrite", "2", "2", "2"); - assertEquals("Should have expected rows", + assertEquals( + "Should have expected rows", ImmutableList.of(row(-1, "hardware"), row(-1, "hr"), row(2, "hardware"), row(3, "hr")), sql("SELECT * FROM %s ORDER BY id, dep", tableName)); - assertEquals("Should refresh the relation cache", + assertEquals( + "Should refresh the relation cache", ImmutableList.of(), sql("SELECT * FROM tmp ORDER BY id, dep")); @@ -528,36 +579,47 @@ public void testUpdateRefreshesRelationCache() { public void testUpdateWithInSubquery() { createAndInitTable("id INT, dep STRING"); - append(tableName, - "{ \"id\": 1, \"dep\": \"hr\" }\n" + - "{ \"id\": 2, \"dep\": \"hardware\" }\n" + - "{ \"id\": null, \"dep\": \"hr\" }"); + append( + tableName, + "{ \"id\": 1, \"dep\": \"hr\" }\n" + + "{ \"id\": 2, \"dep\": \"hardware\" }\n" + + "{ \"id\": null, \"dep\": \"hr\" }"); createOrReplaceView("updated_id", Arrays.asList(0, 1, null), Encoders.INT()); createOrReplaceView("updated_dep", Arrays.asList("software", "hr"), Encoders.STRING()); - sql("UPDATE %s SET id = -1 WHERE " + - "id IN (SELECT * FROM updated_id) AND " + - "dep IN (SELECT * from updated_dep)", tableName); - assertEquals("Should have expected rows", + sql( + "UPDATE %s SET id = -1 WHERE " + + "id IN (SELECT * FROM updated_id) AND " + + "dep IN (SELECT * from updated_dep)", + tableName); + assertEquals( + "Should have expected rows", ImmutableList.of(row(-1, "hr"), row(2, "hardware"), row(null, "hr")), sql("SELECT * FROM %s ORDER BY id ASC NULLS LAST", tableName)); - sql("UPDATE %s SET id = 5 WHERE id IS NULL OR id IN (SELECT value + 1 FROM updated_id)", tableName); - assertEquals("Should have expected rows", + sql( + "UPDATE %s SET id = 5 WHERE id IS NULL OR id IN (SELECT value + 1 FROM updated_id)", + tableName); + assertEquals( + "Should have expected rows", ImmutableList.of(row(-1, "hr"), row(5, "hardware"), row(5, "hr")), sql("SELECT * FROM %s ORDER BY id, dep", tableName)); - append(tableName, - "{ \"id\": null, \"dep\": \"hr\" }\n" + - "{ \"id\": 2, \"dep\": \"hr\" }"); - assertEquals("Should have expected rows", - ImmutableList.of(row(-1, "hr"), row(2, "hr"), row(5, "hardware"), row(5, "hr"), row(null, "hr")), + append(tableName, "{ \"id\": null, \"dep\": \"hr\" }\n" + "{ \"id\": 2, \"dep\": \"hr\" }"); + assertEquals( + "Should have expected rows", + ImmutableList.of( + row(-1, "hr"), row(2, "hr"), row(5, "hardware"), row(5, "hr"), row(null, "hr")), sql("SELECT * FROM %s ORDER BY id ASC NULLS LAST, dep", tableName)); - sql("UPDATE %s SET id = 10 WHERE id IN (SELECT value + 2 FROM updated_id) AND dep = 'hr'", tableName); - assertEquals("Should have expected rows", - ImmutableList.of(row(-1, "hr"), row(5, "hardware"), row(5, "hr"), row(10, "hr"), row(null, "hr")), + sql( + "UPDATE %s SET id = 10 WHERE id IN (SELECT value + 2 FROM updated_id) AND dep = 'hr'", + tableName); + assertEquals( + "Should have expected rows", + ImmutableList.of( + row(-1, "hr"), row(5, 
"hardware"), row(5, "hr"), row(10, "hr"), row(null, "hr")), sql("SELECT * FROM %s ORDER BY id ASC NULLS LAST, dep", tableName)); } @@ -568,12 +630,10 @@ public void testUpdateWithInSubqueryAndDynamicFileFiltering() { sql("ALTER TABLE %s WRITE DISTRIBUTED BY PARTITION", tableName); - append(tableName, - "{ \"id\": 1, \"dep\": \"hr\" }\n" + - "{ \"id\": 3, \"dep\": \"hr\" }"); - append(tableName, - "{ \"id\": 1, \"dep\": \"hardware\" }\n" + - "{ \"id\": 2, \"dep\": \"hardware\" }"); + append(tableName, "{ \"id\": 1, \"dep\": \"hr\" }\n" + "{ \"id\": 3, \"dep\": \"hr\" }"); + append( + tableName, + "{ \"id\": 1, \"dep\": \"hardware\" }\n" + "{ \"id\": 2, \"dep\": \"hardware\" }"); createOrReplaceView("updated_id", Arrays.asList(-1, 2), Encoders.INT()); @@ -585,7 +645,8 @@ public void testUpdateWithInSubqueryAndDynamicFileFiltering() { Snapshot currentSnapshot = table.currentSnapshot(); validateSnapshot(currentSnapshot, "overwrite", "1", "1", "1"); - assertEquals("Should have expected rows", + assertEquals( + "Should have expected rows", ImmutableList.of(row(-1, "hardware"), row(1, "hardware"), row(1, "hr"), row(3, "hr")), sql("SELECT * FROM %s ORDER BY id, dep", tableName)); } @@ -594,23 +655,26 @@ public void testUpdateWithInSubqueryAndDynamicFileFiltering() { public void testUpdateWithSelfSubquery() { createAndInitTable("id INT, dep STRING"); - append(tableName, - "{ \"id\": 1, \"dep\": \"hr\" }\n" + - "{ \"id\": 2, \"dep\": \"hr\" }"); + append(tableName, "{ \"id\": 1, \"dep\": \"hr\" }\n" + "{ \"id\": 2, \"dep\": \"hr\" }"); sql("UPDATE %s SET dep = 'x' WHERE id IN (SELECT id + 1 FROM %s)", tableName, tableName); - assertEquals("Should have expected rows", + assertEquals( + "Should have expected rows", ImmutableList.of(row(1, "hr"), row(2, "x")), sql("SELECT * FROM %s ORDER BY id", tableName)); - sql("UPDATE %s SET dep = 'y' WHERE " + - "id = (SELECT count(*) FROM (SELECT DISTINCT id FROM %s) AS t)", tableName, tableName); - assertEquals("Should have expected rows", + sql( + "UPDATE %s SET dep = 'y' WHERE " + + "id = (SELECT count(*) FROM (SELECT DISTINCT id FROM %s) AS t)", + tableName, tableName); + assertEquals( + "Should have expected rows", ImmutableList.of(row(1, "hr"), row(2, "y")), sql("SELECT * FROM %s ORDER BY id", tableName)); sql("UPDATE %s SET id = (SELECT id - 2 FROM %s WHERE id = 1)", tableName, tableName); - assertEquals("Should have expected rows", + assertEquals( + "Should have expected rows", ImmutableList.of(row(-1, "hr"), row(-1, "y")), sql("SELECT * FROM %s ORDER BY id, dep", tableName)); } @@ -619,16 +683,21 @@ public void testUpdateWithSelfSubquery() { public void testUpdateWithMultiColumnInSubquery() { createAndInitTable("id INT, dep STRING"); - append(tableName, - "{ \"id\": 1, \"dep\": \"hr\" }\n" + - "{ \"id\": 2, \"dep\": \"hardware\" }\n" + - "{ \"id\": null, \"dep\": \"hr\" }"); + append( + tableName, + "{ \"id\": 1, \"dep\": \"hr\" }\n" + + "{ \"id\": 2, \"dep\": \"hardware\" }\n" + + "{ \"id\": null, \"dep\": \"hr\" }"); - List deletedEmployees = Arrays.asList(new Employee(null, "hr"), new Employee(1, "hr")); + List deletedEmployees = + Arrays.asList(new Employee(null, "hr"), new Employee(1, "hr")); createOrReplaceView("deleted_employee", deletedEmployees, Encoders.bean(Employee.class)); - sql("UPDATE %s SET dep = 'x', id = -1 WHERE (id, dep) IN (SELECT id, dep FROM deleted_employee)", tableName); - assertEquals("Should have expected rows", + sql( + "UPDATE %s SET dep = 'x', id = -1 WHERE (id, dep) IN (SELECT id, dep FROM deleted_employee)", + tableName); 
+ assertEquals( + "Should have expected rows", ImmutableList.of(row(-1, "x"), row(2, "hardware"), row(null, "hr")), sql("SELECT * FROM %s ORDER BY id ASC NULLS LAST", tableName)); } @@ -637,27 +706,35 @@ public void testUpdateWithMultiColumnInSubquery() { public void testUpdateWithNotInSubquery() { createAndInitTable("id INT, dep STRING"); - append(tableName, - "{ \"id\": 1, \"dep\": \"hr\" }\n" + - "{ \"id\": 2, \"dep\": \"hardware\" }\n" + - "{ \"id\": null, \"dep\": \"hr\" }"); + append( + tableName, + "{ \"id\": 1, \"dep\": \"hr\" }\n" + + "{ \"id\": 2, \"dep\": \"hardware\" }\n" + + "{ \"id\": null, \"dep\": \"hr\" }"); createOrReplaceView("updated_id", Arrays.asList(-1, -2, null), Encoders.INT()); createOrReplaceView("updated_dep", Arrays.asList("software", "hr"), Encoders.STRING()); // the file filter subquery (nested loop lef-anti join) returns 0 records sql("UPDATE %s SET id = -1 WHERE id NOT IN (SELECT * FROM updated_id)", tableName); - assertEquals("Should have expected rows", + assertEquals( + "Should have expected rows", ImmutableList.of(row(1, "hr"), row(2, "hardware"), row(null, "hr")), sql("SELECT * FROM %s ORDER BY id ASC NULLS LAST", tableName)); - sql("UPDATE %s SET id = -1 WHERE id NOT IN (SELECT * FROM updated_id WHERE value IS NOT NULL)", tableName); - assertEquals("Should have expected rows", + sql( + "UPDATE %s SET id = -1 WHERE id NOT IN (SELECT * FROM updated_id WHERE value IS NOT NULL)", + tableName); + assertEquals( + "Should have expected rows", ImmutableList.of(row(-1, "hardware"), row(-1, "hr"), row(null, "hr")), sql("SELECT * FROM %s ORDER BY id ASC NULLS LAST, dep", tableName)); - sql("UPDATE %s SET id = 5 WHERE id NOT IN (SELECT * FROM updated_id) OR dep IN ('software', 'hr')", tableName); - assertEquals("Should have expected rows", + sql( + "UPDATE %s SET id = 5 WHERE id NOT IN (SELECT * FROM updated_id) OR dep IN ('software', 'hr')", + tableName); + assertEquals( + "Should have expected rows", ImmutableList.of(row(-1, "hardware"), row(5, "hr"), row(5, "hr")), sql("SELECT * FROM %s ORDER BY id ASC NULLS LAST, dep", tableName)); } @@ -668,8 +745,10 @@ public void testUpdateWithNotInSubqueryNotSupported() { createOrReplaceView("updated_id", Arrays.asList(-1, -2, null), Encoders.INT()); - AssertHelpers.assertThrows("Should complain about NOT IN subquery", - AnalysisException.class, "Null-aware predicate subqueries are not currently supported", + AssertHelpers.assertThrows( + "Should complain about NOT IN subquery", + AnalysisException.class, + "Null-aware predicate subqueries are not currently supported", () -> sql("UPDATE %s SET id = -1 WHERE id NOT IN (SELECT * FROM updated_id)", tableName)); } @@ -677,36 +756,49 @@ public void testUpdateWithNotInSubqueryNotSupported() { public void testUpdateWithExistSubquery() { createAndInitTable("id INT, dep STRING"); - append(tableName, - "{ \"id\": 1, \"dep\": \"hr\" }\n" + - "{ \"id\": 2, \"dep\": \"hardware\" }\n" + - "{ \"id\": null, \"dep\": \"hr\" }"); + append( + tableName, + "{ \"id\": 1, \"dep\": \"hr\" }\n" + + "{ \"id\": 2, \"dep\": \"hardware\" }\n" + + "{ \"id\": null, \"dep\": \"hr\" }"); createOrReplaceView("updated_id", Arrays.asList(-1, -2, null), Encoders.INT()); createOrReplaceView("updated_dep", Arrays.asList("hr", null), Encoders.STRING()); - sql("UPDATE %s t SET id = -1 WHERE EXISTS (SELECT 1 FROM updated_id u WHERE t.id = u.value)", tableName); - assertEquals("Should have expected rows", + sql( + "UPDATE %s t SET id = -1 WHERE EXISTS (SELECT 1 FROM updated_id u WHERE t.id = u.value)", + 
tableName); + assertEquals( + "Should have expected rows", ImmutableList.of(row(1, "hr"), row(2, "hardware"), row(null, "hr")), sql("SELECT * FROM %s ORDER BY id ASC NULLS LAST", tableName)); - sql("UPDATE %s t SET dep = 'x', id = -1 WHERE " + - "EXISTS (SELECT 1 FROM updated_id u WHERE t.id = u.value + 2)", tableName); - assertEquals("Should have expected rows", + sql( + "UPDATE %s t SET dep = 'x', id = -1 WHERE " + + "EXISTS (SELECT 1 FROM updated_id u WHERE t.id = u.value + 2)", + tableName); + assertEquals( + "Should have expected rows", ImmutableList.of(row(-1, "x"), row(2, "hardware"), row(null, "hr")), sql("SELECT * FROM %s ORDER BY id ASC NULLS LAST", tableName)); - sql("UPDATE %s t SET id = -2 WHERE " + - "EXISTS (SELECT 1 FROM updated_id u WHERE t.id = u.value) OR " + - "t.id IS NULL", tableName); - assertEquals("Should have expected rows", + sql( + "UPDATE %s t SET id = -2 WHERE " + + "EXISTS (SELECT 1 FROM updated_id u WHERE t.id = u.value) OR " + + "t.id IS NULL", + tableName); + assertEquals( + "Should have expected rows", ImmutableList.of(row(-2, "hr"), row(-2, "x"), row(2, "hardware")), sql("SELECT * FROM %s ORDER BY id, dep", tableName)); - sql("UPDATE %s t SET id = 1 WHERE " + - "EXISTS (SELECT 1 FROM updated_id ui WHERE t.id = ui.value) AND " + - "EXISTS (SELECT 1 FROM updated_dep ud WHERE t.dep = ud.value)", tableName); - assertEquals("Should have expected rows", + sql( + "UPDATE %s t SET id = 1 WHERE " + + "EXISTS (SELECT 1 FROM updated_id ui WHERE t.id = ui.value) AND " + + "EXISTS (SELECT 1 FROM updated_dep ud WHERE t.dep = ud.value)", + tableName); + assertEquals( + "Should have expected rows", ImmutableList.of(row(-2, "x"), row(1, "hr"), row(2, "hardware")), sql("SELECT * FROM %s ORDER BY id, dep", tableName)); } @@ -715,30 +807,40 @@ public void testUpdateWithExistSubquery() { public void testUpdateWithNotExistsSubquery() { createAndInitTable("id INT, dep STRING"); - append(tableName, - "{ \"id\": 1, \"dep\": \"hr\" }\n" + - "{ \"id\": 2, \"dep\": \"hardware\" }\n" + - "{ \"id\": null, \"dep\": \"hr\" }"); + append( + tableName, + "{ \"id\": 1, \"dep\": \"hr\" }\n" + + "{ \"id\": 2, \"dep\": \"hardware\" }\n" + + "{ \"id\": null, \"dep\": \"hr\" }"); createOrReplaceView("updated_id", Arrays.asList(-1, -2, null), Encoders.INT()); createOrReplaceView("updated_dep", Arrays.asList("hr", "software"), Encoders.STRING()); - sql("UPDATE %s t SET id = -1 WHERE NOT EXISTS (SELECT 1 FROM updated_id u WHERE t.id = u.value + 2)", tableName); - assertEquals("Should have expected rows", + sql( + "UPDATE %s t SET id = -1 WHERE NOT EXISTS (SELECT 1 FROM updated_id u WHERE t.id = u.value + 2)", + tableName); + assertEquals( + "Should have expected rows", ImmutableList.of(row(-1, "hardware"), row(-1, "hr"), row(1, "hr")), sql("SELECT * FROM %s ORDER BY id, dep", tableName)); - sql("UPDATE %s t SET id = 5 WHERE " + - "NOT EXISTS (SELECT 1 FROM updated_id u WHERE t.id = u.value) OR " + - "t.id = 1", tableName); - assertEquals("Should have expected rows", + sql( + "UPDATE %s t SET id = 5 WHERE " + + "NOT EXISTS (SELECT 1 FROM updated_id u WHERE t.id = u.value) OR " + + "t.id = 1", + tableName); + assertEquals( + "Should have expected rows", ImmutableList.of(row(-1, "hardware"), row(-1, "hr"), row(5, "hr")), sql("SELECT * FROM %s ORDER BY id, dep", tableName)); - sql("UPDATE %s t SET id = 10 WHERE " + - "NOT EXISTS (SELECT 1 FROM updated_id ui WHERE t.id = ui.value) AND " + - "EXISTS (SELECT 1 FROM updated_dep ud WHERE t.dep = ud.value)", tableName); - assertEquals("Should have expected 
rows", + sql( + "UPDATE %s t SET id = 10 WHERE " + + "NOT EXISTS (SELECT 1 FROM updated_id ui WHERE t.id = ui.value) AND " + + "EXISTS (SELECT 1 FROM updated_dep ud WHERE t.dep = ud.value)", + tableName); + assertEquals( + "Should have expected rows", ImmutableList.of(row(-1, "hardware"), row(-1, "hr"), row(10, "hr")), sql("SELECT * FROM %s ORDER BY id, dep", tableName)); } @@ -747,15 +849,17 @@ public void testUpdateWithNotExistsSubquery() { public void testUpdateWithScalarSubquery() { createAndInitTable("id INT, dep STRING"); - append(tableName, - "{ \"id\": 1, \"dep\": \"hr\" }\n" + - "{ \"id\": 2, \"dep\": \"hardware\" }\n" + - "{ \"id\": null, \"dep\": \"hr\" }"); + append( + tableName, + "{ \"id\": 1, \"dep\": \"hr\" }\n" + + "{ \"id\": 2, \"dep\": \"hardware\" }\n" + + "{ \"id\": null, \"dep\": \"hr\" }"); createOrReplaceView("updated_id", Arrays.asList(1, 100, null), Encoders.INT()); sql("UPDATE %s SET id = -1 WHERE id <= (SELECT min(value) FROM updated_id)", tableName); - assertEquals("Should have expected rows", + assertEquals( + "Should have expected rows", ImmutableList.of(row(-1, "hr"), row(2, "hardware"), row(null, "hr")), sql("SELECT * FROM %s ORDER BY id ASC NULLS LAST", tableName)); } @@ -765,25 +869,29 @@ public void testUpdateThatRequiresGroupingBeforeWrite() { createAndInitTable("id INT, dep STRING"); sql("ALTER TABLE %s ADD PARTITION FIELD dep", tableName); - append(tableName, - "{ \"id\": 0, \"dep\": \"hr\" }\n" + - "{ \"id\": 1, \"dep\": \"hr\" }\n" + - "{ \"id\": 2, \"dep\": \"hr\" }"); - - append(tableName, - "{ \"id\": 0, \"dep\": \"ops\" }\n" + - "{ \"id\": 1, \"dep\": \"ops\" }\n" + - "{ \"id\": 2, \"dep\": \"ops\" }"); - - append(tableName, - "{ \"id\": 0, \"dep\": \"hr\" }\n" + - "{ \"id\": 1, \"dep\": \"hr\" }\n" + - "{ \"id\": 2, \"dep\": \"hr\" }"); - - append(tableName, - "{ \"id\": 0, \"dep\": \"ops\" }\n" + - "{ \"id\": 1, \"dep\": \"ops\" }\n" + - "{ \"id\": 2, \"dep\": \"ops\" }"); + append( + tableName, + "{ \"id\": 0, \"dep\": \"hr\" }\n" + + "{ \"id\": 1, \"dep\": \"hr\" }\n" + + "{ \"id\": 2, \"dep\": \"hr\" }"); + + append( + tableName, + "{ \"id\": 0, \"dep\": \"ops\" }\n" + + "{ \"id\": 1, \"dep\": \"ops\" }\n" + + "{ \"id\": 2, \"dep\": \"ops\" }"); + + append( + tableName, + "{ \"id\": 0, \"dep\": \"hr\" }\n" + + "{ \"id\": 1, \"dep\": \"hr\" }\n" + + "{ \"id\": 2, \"dep\": \"hr\" }"); + + append( + tableName, + "{ \"id\": 0, \"dep\": \"ops\" }\n" + + "{ \"id\": 1, \"dep\": \"ops\" }\n" + + "{ \"id\": 2, \"dep\": \"ops\" }"); createOrReplaceView("updated_id", Arrays.asList(1, 100), Encoders.INT()); @@ -803,30 +911,38 @@ public void testUpdateThatRequiresGroupingBeforeWrite() { public void testUpdateWithVectorization() { createAndInitTable("id INT, dep STRING"); - append(tableName, - "{ \"id\": 0, \"dep\": \"hr\" }\n" + - "{ \"id\": 1, \"dep\": \"hr\" }\n" + - "{ \"id\": 2, \"dep\": \"hr\" }"); + append( + tableName, + "{ \"id\": 0, \"dep\": \"hr\" }\n" + + "{ \"id\": 1, \"dep\": \"hr\" }\n" + + "{ \"id\": 2, \"dep\": \"hr\" }"); - withSQLConf(ImmutableMap.of(SparkSQLProperties.VECTORIZATION_ENABLED, "true"), () -> { - sql("UPDATE %s t SET id = -1", tableName); + withSQLConf( + ImmutableMap.of(SparkSQLProperties.VECTORIZATION_ENABLED, "true"), + () -> { + sql("UPDATE %s t SET id = -1", tableName); - assertEquals("Should have expected rows", - ImmutableList.of(row(-1, "hr"), row(-1, "hr"), row(-1, "hr")), - sql("SELECT * FROM %s ORDER BY id, dep", tableName)); - }); + assertEquals( + "Should have expected rows", + ImmutableList.of(row(-1, "hr"), 
row(-1, "hr"), row(-1, "hr")), + sql("SELECT * FROM %s ORDER BY id, dep", tableName)); + }); } @Test public void testUpdateWithInvalidUpdates() { createAndInitTable("id INT, a ARRAY>, m MAP"); - AssertHelpers.assertThrows("Should complain about updating an array column", - AnalysisException.class, "Updating nested fields is only supported for structs", + AssertHelpers.assertThrows( + "Should complain about updating an array column", + AnalysisException.class, + "Updating nested fields is only supported for structs", () -> sql("UPDATE %s SET a.c1 = 1", tableName)); - AssertHelpers.assertThrows("Should complain about updating a map column", - AnalysisException.class, "Updating nested fields is only supported for structs", + AssertHelpers.assertThrows( + "Should complain about updating a map column", + AnalysisException.class, + "Updating nested fields is only supported for structs", () -> sql("UPDATE %s SET m.key = 'new_key'", tableName)); } @@ -834,48 +950,68 @@ public void testUpdateWithInvalidUpdates() { public void testUpdateWithConflictingAssignments() { createAndInitTable("id INT, c STRUCT>"); - AssertHelpers.assertThrows("Should complain about conflicting updates to a top-level column", - AnalysisException.class, "Updates are in conflict", + AssertHelpers.assertThrows( + "Should complain about conflicting updates to a top-level column", + AnalysisException.class, + "Updates are in conflict", () -> sql("UPDATE %s t SET t.id = 1, t.c.n1 = 2, t.id = 2", tableName)); - AssertHelpers.assertThrows("Should complain about conflicting updates to a nested column", - AnalysisException.class, "Updates are in conflict for these columns", + AssertHelpers.assertThrows( + "Should complain about conflicting updates to a nested column", + AnalysisException.class, + "Updates are in conflict for these columns", () -> sql("UPDATE %s t SET t.c.n1 = 1, t.id = 2, t.c.n1 = 2", tableName)); - AssertHelpers.assertThrows("Should complain about conflicting updates to a nested column", - AnalysisException.class, "Updates are in conflict", + AssertHelpers.assertThrows( + "Should complain about conflicting updates to a nested column", + AnalysisException.class, + "Updates are in conflict", () -> { - sql("UPDATE %s SET c.n1 = 1, c = named_struct('n1', 1, 'n2', named_struct('dn1', 1, 'dn2', 2))", tableName); + sql( + "UPDATE %s SET c.n1 = 1, c = named_struct('n1', 1, 'n2', named_struct('dn1', 1, 'dn2', 2))", + tableName); }); } @Test public void testUpdateWithInvalidAssignments() { - createAndInitTable("id INT NOT NULL, s STRUCT> NOT NULL"); - - for (String policy : new String[]{"ansi", "strict"}) { - withSQLConf(ImmutableMap.of("spark.sql.storeAssignmentPolicy", policy), () -> { - - AssertHelpers.assertThrows("Should complain about writing nulls to a top-level column", - AnalysisException.class, "Cannot write nullable values to non-null column", - () -> sql("UPDATE %s t SET t.id = NULL", tableName)); - - AssertHelpers.assertThrows("Should complain about writing nulls to a nested column", - AnalysisException.class, "Cannot write nullable values to non-null column", - () -> sql("UPDATE %s t SET t.s.n1 = NULL", tableName)); - - AssertHelpers.assertThrows("Should complain about writing missing fields in structs", - AnalysisException.class, "missing fields", - () -> sql("UPDATE %s t SET t.s = named_struct('n1', 1)", tableName)); - - AssertHelpers.assertThrows("Should complain about writing invalid data types", - AnalysisException.class, "Cannot safely cast", - () -> sql("UPDATE %s t SET t.s.n1 = 'str'", tableName)); - - 
AssertHelpers.assertThrows("Should complain about writing incompatible structs", - AnalysisException.class, "field name does not match", - () -> sql("UPDATE %s t SET t.s.n2 = named_struct('dn2', 1, 'dn1', 2)", tableName)); - }); + createAndInitTable( + "id INT NOT NULL, s STRUCT> NOT NULL"); + + for (String policy : new String[] {"ansi", "strict"}) { + withSQLConf( + ImmutableMap.of("spark.sql.storeAssignmentPolicy", policy), + () -> { + AssertHelpers.assertThrows( + "Should complain about writing nulls to a top-level column", + AnalysisException.class, + "Cannot write nullable values to non-null column", + () -> sql("UPDATE %s t SET t.id = NULL", tableName)); + + AssertHelpers.assertThrows( + "Should complain about writing nulls to a nested column", + AnalysisException.class, + "Cannot write nullable values to non-null column", + () -> sql("UPDATE %s t SET t.s.n1 = NULL", tableName)); + + AssertHelpers.assertThrows( + "Should complain about writing missing fields in structs", + AnalysisException.class, + "missing fields", + () -> sql("UPDATE %s t SET t.s = named_struct('n1', 1)", tableName)); + + AssertHelpers.assertThrows( + "Should complain about writing invalid data types", + AnalysisException.class, + "Cannot safely cast", + () -> sql("UPDATE %s t SET t.s.n1 = 'str'", tableName)); + + AssertHelpers.assertThrows( + "Should complain about writing incompatible structs", + AnalysisException.class, + "field name does not match", + () -> sql("UPDATE %s t SET t.s.n2 = named_struct('dn2', 1, 'dn1', 2)", tableName)); + }); } } @@ -883,8 +1019,10 @@ public void testUpdateWithInvalidAssignments() { public void testUpdateWithNonDeterministicCondition() { createAndInitTable("id INT, dep STRING"); - AssertHelpers.assertThrows("Should complain about non-deterministic expressions", - AnalysisException.class, "nondeterministic expressions are only allowed", + AssertHelpers.assertThrows( + "Should complain about non-deterministic expressions", + AnalysisException.class, + "nondeterministic expressions are only allowed", () -> sql("UPDATE %s SET id = -1 WHERE id = 1 AND rand() > 0.5", tableName)); } @@ -892,8 +1030,10 @@ public void testUpdateWithNonDeterministicCondition() { public void testUpdateOnNonIcebergTableNotSupported() { createOrReplaceView("testtable", "{ \"c1\": -100, \"c2\": -200 }"); - AssertHelpers.assertThrows("UPDATE is not supported for non iceberg table", - UnsupportedOperationException.class, "not supported temporarily", + AssertHelpers.assertThrows( + "UPDATE is not supported for non iceberg table", + UnsupportedOperationException.class, + "not supported temporarily", () -> sql("UPDATE %s SET c1 = -1 WHERE c2 = 1", "testtable")); } } diff --git a/spark/v3.0/spark/src/jmh/java/org/apache/iceberg/spark/SparkBenchmarkUtil.java b/spark/v3.0/spark/src/jmh/java/org/apache/iceberg/spark/SparkBenchmarkUtil.java index 17df7d2cf9d7..d6b0e9c94258 100644 --- a/spark/v3.0/spark/src/jmh/java/org/apache/iceberg/spark/SparkBenchmarkUtil.java +++ b/spark/v3.0/spark/src/jmh/java/org/apache/iceberg/spark/SparkBenchmarkUtil.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.spark; import java.util.List; @@ -32,13 +31,13 @@ public class SparkBenchmarkUtil { - private SparkBenchmarkUtil() { - } + private SparkBenchmarkUtil() {} public static UnsafeProjection projection(Schema expectedSchema, Schema actualSchema) { StructType struct = SparkSchemaUtil.convert(actualSchema); - List refs = JavaConverters.seqAsJavaListConverter(struct.toAttributes()).asJava(); + List refs = + JavaConverters.seqAsJavaListConverter(struct.toAttributes()).asJava(); List attrs = Lists.newArrayListWithExpectedSize(struct.fields().length); List exprs = Lists.newArrayListWithExpectedSize(struct.fields().length); diff --git a/spark/v3.0/spark/src/jmh/java/org/apache/iceberg/spark/data/parquet/SparkParquetReadersFlatDataBenchmark.java b/spark/v3.0/spark/src/jmh/java/org/apache/iceberg/spark/data/parquet/SparkParquetReadersFlatDataBenchmark.java index 4da18dc7c421..70f1a7a52a52 100644 --- a/spark/v3.0/spark/src/jmh/java/org/apache/iceberg/spark/data/parquet/SparkParquetReadersFlatDataBenchmark.java +++ b/spark/v3.0/spark/src/jmh/java/org/apache/iceberg/spark/data/parquet/SparkParquetReadersFlatDataBenchmark.java @@ -16,9 +16,11 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.data.parquet; +import static org.apache.iceberg.types.Types.NestedField.optional; +import static org.apache.iceberg.types.Types.NestedField.required; + import java.io.File; import java.io.IOException; import java.util.List; @@ -52,15 +54,11 @@ import org.openjdk.jmh.annotations.Warmup; import org.openjdk.jmh.infra.Blackhole; -import static org.apache.iceberg.types.Types.NestedField.optional; -import static org.apache.iceberg.types.Types.NestedField.required; - /** * A benchmark that evaluates the performance of reading Parquet data with a flat schema using * Iceberg and Spark Parquet readers. * - * To run this benchmark for either spark-2 or spark-3: - * + *
    To run this benchmark for either spark-2 or spark-3: * ./gradlew :iceberg-spark:iceberg-spark[2|3]:jmh * -PjmhIncludeRegex=SparkParquetReadersFlatDataBenchmark * -PjmhOutputPath=benchmark/spark-parquet-readers-flat-data-benchmark-result.txt @@ -73,22 +71,23 @@ @BenchmarkMode(Mode.SingleShotTime) public class SparkParquetReadersFlatDataBenchmark { - private static final DynMethods.UnboundMethod APPLY_PROJECTION = DynMethods.builder("apply") - .impl(UnsafeProjection.class, InternalRow.class) - .build(); - private static final Schema SCHEMA = new Schema( - required(1, "longCol", Types.LongType.get()), - required(2, "intCol", Types.IntegerType.get()), - required(3, "floatCol", Types.FloatType.get()), - optional(4, "doubleCol", Types.DoubleType.get()), - optional(5, "decimalCol", Types.DecimalType.of(20, 5)), - optional(6, "dateCol", Types.DateType.get()), - optional(7, "timestampCol", Types.TimestampType.withZone()), - optional(8, "stringCol", Types.StringType.get())); - private static final Schema PROJECTED_SCHEMA = new Schema( - required(1, "longCol", Types.LongType.get()), - optional(5, "decimalCol", Types.DecimalType.of(20, 5)), - optional(8, "stringCol", Types.StringType.get())); + private static final DynMethods.UnboundMethod APPLY_PROJECTION = + DynMethods.builder("apply").impl(UnsafeProjection.class, InternalRow.class).build(); + private static final Schema SCHEMA = + new Schema( + required(1, "longCol", Types.LongType.get()), + required(2, "intCol", Types.IntegerType.get()), + required(3, "floatCol", Types.FloatType.get()), + optional(4, "doubleCol", Types.DoubleType.get()), + optional(5, "decimalCol", Types.DecimalType.of(20, 5)), + optional(6, "dateCol", Types.DateType.get()), + optional(7, "timestampCol", Types.TimestampType.withZone()), + optional(8, "stringCol", Types.StringType.get())); + private static final Schema PROJECTED_SCHEMA = + new Schema( + required(1, "longCol", Types.LongType.get()), + optional(5, "decimalCol", Types.DecimalType.of(20, 5)), + optional(8, "stringCol", Types.StringType.get())); private static final int NUM_RECORDS = 10000000; private File dataFile; @@ -97,10 +96,8 @@ public void setupBenchmark() throws IOException { dataFile = File.createTempFile("parquet-flat-data-benchmark", ".parquet"); dataFile.delete(); List records = RandomData.generateList(SCHEMA, NUM_RECORDS, 0L); - try (FileAppender writer = Parquet.write(Files.localOutput(dataFile)) - .schema(SCHEMA) - .named("benchmark") - .build()) { + try (FileAppender writer = + Parquet.write(Files.localOutput(dataFile)).schema(SCHEMA).named("benchmark").build()) { writer.addAll(records); } } @@ -115,10 +112,11 @@ public void tearDownBenchmark() { @Benchmark @Threads(1) public void readUsingIcebergReader(Blackhole blackHole) throws IOException { - try (CloseableIterable rows = Parquet.read(Files.localInput(dataFile)) - .project(SCHEMA) - .createReaderFunc(type -> SparkParquetReaders.buildReader(SCHEMA, type)) - .build()) { + try (CloseableIterable rows = + Parquet.read(Files.localInput(dataFile)) + .project(SCHEMA) + .createReaderFunc(type -> SparkParquetReaders.buildReader(SCHEMA, type)) + .build()) { for (InternalRow row : rows) { blackHole.consume(row); @@ -129,14 +127,15 @@ public void readUsingIcebergReader(Blackhole blackHole) throws IOException { @Benchmark @Threads(1) public void readUsingIcebergReaderUnsafe(Blackhole blackhole) throws IOException { - try (CloseableIterable rows = Parquet.read(Files.localInput(dataFile)) - .project(SCHEMA) - .createReaderFunc(type -> 
SparkParquetReaders.buildReader(SCHEMA, type)) - .build()) { + try (CloseableIterable rows = + Parquet.read(Files.localInput(dataFile)) + .project(SCHEMA) + .createReaderFunc(type -> SparkParquetReaders.buildReader(SCHEMA, type)) + .build()) { - Iterable unsafeRows = Iterables.transform( - rows, - APPLY_PROJECTION.bind(SparkBenchmarkUtil.projection(SCHEMA, SCHEMA))::invoke); + Iterable unsafeRows = + Iterables.transform( + rows, APPLY_PROJECTION.bind(SparkBenchmarkUtil.projection(SCHEMA, SCHEMA))::invoke); for (InternalRow row : unsafeRows) { blackhole.consume(row); @@ -148,14 +147,15 @@ public void readUsingIcebergReaderUnsafe(Blackhole blackhole) throws IOException @Threads(1) public void readUsingSparkReader(Blackhole blackhole) throws IOException { StructType sparkSchema = SparkSchemaUtil.convert(SCHEMA); - try (CloseableIterable rows = Parquet.read(Files.localInput(dataFile)) - .project(SCHEMA) - .readSupport(new ParquetReadSupport()) - .set("org.apache.spark.sql.parquet.row.requested_schema", sparkSchema.json()) - .set("spark.sql.parquet.binaryAsString", "false") - .set("spark.sql.parquet.int96AsTimestamp", "false") - .callInit() - .build()) { + try (CloseableIterable rows = + Parquet.read(Files.localInput(dataFile)) + .project(SCHEMA) + .readSupport(new ParquetReadSupport()) + .set("org.apache.spark.sql.parquet.row.requested_schema", sparkSchema.json()) + .set("spark.sql.parquet.binaryAsString", "false") + .set("spark.sql.parquet.int96AsTimestamp", "false") + .callInit() + .build()) { for (InternalRow row : rows) { blackhole.consume(row); @@ -166,10 +166,11 @@ public void readUsingSparkReader(Blackhole blackhole) throws IOException { @Benchmark @Threads(1) public void readWithProjectionUsingIcebergReader(Blackhole blackhole) throws IOException { - try (CloseableIterable rows = Parquet.read(Files.localInput(dataFile)) - .project(PROJECTED_SCHEMA) - .createReaderFunc(type -> SparkParquetReaders.buildReader(PROJECTED_SCHEMA, type)) - .build()) { + try (CloseableIterable rows = + Parquet.read(Files.localInput(dataFile)) + .project(PROJECTED_SCHEMA) + .createReaderFunc(type -> SparkParquetReaders.buildReader(PROJECTED_SCHEMA, type)) + .build()) { for (InternalRow row : rows) { blackhole.consume(row); @@ -180,14 +181,18 @@ public void readWithProjectionUsingIcebergReader(Blackhole blackhole) throws IOE @Benchmark @Threads(1) public void readWithProjectionUsingIcebergReaderUnsafe(Blackhole blackhole) throws IOException { - try (CloseableIterable rows = Parquet.read(Files.localInput(dataFile)) - .project(PROJECTED_SCHEMA) - .createReaderFunc(type -> SparkParquetReaders.buildReader(PROJECTED_SCHEMA, type)) - .build()) { - - Iterable unsafeRows = Iterables.transform( - rows, - APPLY_PROJECTION.bind(SparkBenchmarkUtil.projection(PROJECTED_SCHEMA, PROJECTED_SCHEMA))::invoke); + try (CloseableIterable rows = + Parquet.read(Files.localInput(dataFile)) + .project(PROJECTED_SCHEMA) + .createReaderFunc(type -> SparkParquetReaders.buildReader(PROJECTED_SCHEMA, type)) + .build()) { + + Iterable unsafeRows = + Iterables.transform( + rows, + APPLY_PROJECTION.bind( + SparkBenchmarkUtil.projection(PROJECTED_SCHEMA, PROJECTED_SCHEMA)) + ::invoke); for (InternalRow row : unsafeRows) { blackhole.consume(row); @@ -199,14 +204,15 @@ public void readWithProjectionUsingIcebergReaderUnsafe(Blackhole blackhole) thro @Threads(1) public void readWithProjectionUsingSparkReader(Blackhole blackhole) throws IOException { StructType sparkSchema = SparkSchemaUtil.convert(PROJECTED_SCHEMA); - try (CloseableIterable rows = 
Parquet.read(Files.localInput(dataFile)) - .project(PROJECTED_SCHEMA) - .readSupport(new ParquetReadSupport()) - .set("org.apache.spark.sql.parquet.row.requested_schema", sparkSchema.json()) - .set("spark.sql.parquet.binaryAsString", "false") - .set("spark.sql.parquet.int96AsTimestamp", "false") - .callInit() - .build()) { + try (CloseableIterable rows = + Parquet.read(Files.localInput(dataFile)) + .project(PROJECTED_SCHEMA) + .readSupport(new ParquetReadSupport()) + .set("org.apache.spark.sql.parquet.row.requested_schema", sparkSchema.json()) + .set("spark.sql.parquet.binaryAsString", "false") + .set("spark.sql.parquet.int96AsTimestamp", "false") + .callInit() + .build()) { for (InternalRow row : rows) { blackhole.consume(row); diff --git a/spark/v3.0/spark/src/jmh/java/org/apache/iceberg/spark/data/parquet/SparkParquetReadersNestedDataBenchmark.java b/spark/v3.0/spark/src/jmh/java/org/apache/iceberg/spark/data/parquet/SparkParquetReadersNestedDataBenchmark.java index c1c5a6610401..1e61579cadc7 100644 --- a/spark/v3.0/spark/src/jmh/java/org/apache/iceberg/spark/data/parquet/SparkParquetReadersNestedDataBenchmark.java +++ b/spark/v3.0/spark/src/jmh/java/org/apache/iceberg/spark/data/parquet/SparkParquetReadersNestedDataBenchmark.java @@ -16,9 +16,11 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.data.parquet; +import static org.apache.iceberg.types.Types.NestedField.optional; +import static org.apache.iceberg.types.Types.NestedField.required; + import java.io.File; import java.io.IOException; import java.util.List; @@ -52,15 +54,11 @@ import org.openjdk.jmh.annotations.Warmup; import org.openjdk.jmh.infra.Blackhole; -import static org.apache.iceberg.types.Types.NestedField.optional; -import static org.apache.iceberg.types.Types.NestedField.required; - /** - * A benchmark that evaluates the performance of reading nested Parquet data using - * Iceberg and Spark Parquet readers. + * A benchmark that evaluates the performance of reading nested Parquet data using Iceberg and Spark + * Parquet readers. * - * To run this benchmark for either spark-2 or spark-3: - * + *
    To run this benchmark for either spark-2 or spark-3: * ./gradlew :iceberg-spark:iceberg-spark[2|3]:jmh * -PjmhIncludeRegex=SparkParquetReadersNestedDataBenchmark * -PjmhOutputPath=benchmark/spark-parquet-readers-nested-data-benchmark-result.txt @@ -73,22 +71,21 @@ @BenchmarkMode(Mode.SingleShotTime) public class SparkParquetReadersNestedDataBenchmark { - private static final DynMethods.UnboundMethod APPLY_PROJECTION = DynMethods.builder("apply") - .impl(UnsafeProjection.class, InternalRow.class) - .build(); - private static final Schema SCHEMA = new Schema( - required(0, "id", Types.LongType.get()), - optional(4, "nested", Types.StructType.of( - required(1, "col1", Types.StringType.get()), - required(2, "col2", Types.DoubleType.get()), - required(3, "col3", Types.LongType.get()) - )) - ); - private static final Schema PROJECTED_SCHEMA = new Schema( - optional(4, "nested", Types.StructType.of( - required(1, "col1", Types.StringType.get()) - )) - ); + private static final DynMethods.UnboundMethod APPLY_PROJECTION = + DynMethods.builder("apply").impl(UnsafeProjection.class, InternalRow.class).build(); + private static final Schema SCHEMA = + new Schema( + required(0, "id", Types.LongType.get()), + optional( + 4, + "nested", + Types.StructType.of( + required(1, "col1", Types.StringType.get()), + required(2, "col2", Types.DoubleType.get()), + required(3, "col3", Types.LongType.get())))); + private static final Schema PROJECTED_SCHEMA = + new Schema( + optional(4, "nested", Types.StructType.of(required(1, "col1", Types.StringType.get())))); private static final int NUM_RECORDS = 10000000; private File dataFile; @@ -97,10 +94,8 @@ public void setupBenchmark() throws IOException { dataFile = File.createTempFile("parquet-nested-data-benchmark", ".parquet"); dataFile.delete(); List records = RandomData.generateList(SCHEMA, NUM_RECORDS, 0L); - try (FileAppender writer = Parquet.write(Files.localOutput(dataFile)) - .schema(SCHEMA) - .named("benchmark") - .build()) { + try (FileAppender writer = + Parquet.write(Files.localOutput(dataFile)).schema(SCHEMA).named("benchmark").build()) { writer.addAll(records); } } @@ -115,10 +110,11 @@ public void tearDownBenchmark() { @Benchmark @Threads(1) public void readUsingIcebergReader(Blackhole blackhole) throws IOException { - try (CloseableIterable rows = Parquet.read(Files.localInput(dataFile)) - .project(SCHEMA) - .createReaderFunc(type -> SparkParquetReaders.buildReader(SCHEMA, type)) - .build()) { + try (CloseableIterable rows = + Parquet.read(Files.localInput(dataFile)) + .project(SCHEMA) + .createReaderFunc(type -> SparkParquetReaders.buildReader(SCHEMA, type)) + .build()) { for (InternalRow row : rows) { blackhole.consume(row); @@ -129,14 +125,15 @@ public void readUsingIcebergReader(Blackhole blackhole) throws IOException { @Benchmark @Threads(1) public void readUsingIcebergReaderUnsafe(Blackhole blackhole) throws IOException { - try (CloseableIterable rows = Parquet.read(Files.localInput(dataFile)) - .project(SCHEMA) - .createReaderFunc(type -> SparkParquetReaders.buildReader(SCHEMA, type)) - .build()) { + try (CloseableIterable rows = + Parquet.read(Files.localInput(dataFile)) + .project(SCHEMA) + .createReaderFunc(type -> SparkParquetReaders.buildReader(SCHEMA, type)) + .build()) { - Iterable unsafeRows = Iterables.transform( - rows, - APPLY_PROJECTION.bind(SparkBenchmarkUtil.projection(SCHEMA, SCHEMA))::invoke); + Iterable unsafeRows = + Iterables.transform( + rows, APPLY_PROJECTION.bind(SparkBenchmarkUtil.projection(SCHEMA, SCHEMA))::invoke); 
for (InternalRow row : unsafeRows) { blackhole.consume(row); @@ -148,14 +145,15 @@ public void readUsingIcebergReaderUnsafe(Blackhole blackhole) throws IOException @Threads(1) public void readUsingSparkReader(Blackhole blackhole) throws IOException { StructType sparkSchema = SparkSchemaUtil.convert(SCHEMA); - try (CloseableIterable rows = Parquet.read(Files.localInput(dataFile)) - .project(SCHEMA) - .readSupport(new ParquetReadSupport()) - .set("org.apache.spark.sql.parquet.row.requested_schema", sparkSchema.json()) - .set("spark.sql.parquet.binaryAsString", "false") - .set("spark.sql.parquet.int96AsTimestamp", "false") - .callInit() - .build()) { + try (CloseableIterable rows = + Parquet.read(Files.localInput(dataFile)) + .project(SCHEMA) + .readSupport(new ParquetReadSupport()) + .set("org.apache.spark.sql.parquet.row.requested_schema", sparkSchema.json()) + .set("spark.sql.parquet.binaryAsString", "false") + .set("spark.sql.parquet.int96AsTimestamp", "false") + .callInit() + .build()) { for (InternalRow row : rows) { blackhole.consume(row); @@ -166,10 +164,11 @@ public void readUsingSparkReader(Blackhole blackhole) throws IOException { @Benchmark @Threads(1) public void readWithProjectionUsingIcebergReader(Blackhole blackhole) throws IOException { - try (CloseableIterable rows = Parquet.read(Files.localInput(dataFile)) - .project(PROJECTED_SCHEMA) - .createReaderFunc(type -> SparkParquetReaders.buildReader(PROJECTED_SCHEMA, type)) - .build()) { + try (CloseableIterable rows = + Parquet.read(Files.localInput(dataFile)) + .project(PROJECTED_SCHEMA) + .createReaderFunc(type -> SparkParquetReaders.buildReader(PROJECTED_SCHEMA, type)) + .build()) { for (InternalRow row : rows) { blackhole.consume(row); @@ -180,14 +179,18 @@ public void readWithProjectionUsingIcebergReader(Blackhole blackhole) throws IOE @Benchmark @Threads(1) public void readWithProjectionUsingIcebergReaderUnsafe(Blackhole blackhole) throws IOException { - try (CloseableIterable rows = Parquet.read(Files.localInput(dataFile)) - .project(PROJECTED_SCHEMA) - .createReaderFunc(type -> SparkParquetReaders.buildReader(PROJECTED_SCHEMA, type)) - .build()) { - - Iterable unsafeRows = Iterables.transform( - rows, - APPLY_PROJECTION.bind(SparkBenchmarkUtil.projection(PROJECTED_SCHEMA, PROJECTED_SCHEMA))::invoke); + try (CloseableIterable rows = + Parquet.read(Files.localInput(dataFile)) + .project(PROJECTED_SCHEMA) + .createReaderFunc(type -> SparkParquetReaders.buildReader(PROJECTED_SCHEMA, type)) + .build()) { + + Iterable unsafeRows = + Iterables.transform( + rows, + APPLY_PROJECTION.bind( + SparkBenchmarkUtil.projection(PROJECTED_SCHEMA, PROJECTED_SCHEMA)) + ::invoke); for (InternalRow row : unsafeRows) { blackhole.consume(row); @@ -199,14 +202,15 @@ public void readWithProjectionUsingIcebergReaderUnsafe(Blackhole blackhole) thro @Threads(1) public void readWithProjectionUsingSparkReader(Blackhole blackhole) throws IOException { StructType sparkSchema = SparkSchemaUtil.convert(PROJECTED_SCHEMA); - try (CloseableIterable rows = Parquet.read(Files.localInput(dataFile)) - .project(PROJECTED_SCHEMA) - .readSupport(new ParquetReadSupport()) - .set("org.apache.spark.sql.parquet.row.requested_schema", sparkSchema.json()) - .set("spark.sql.parquet.binaryAsString", "false") - .set("spark.sql.parquet.int96AsTimestamp", "false") - .callInit() - .build()) { + try (CloseableIterable rows = + Parquet.read(Files.localInput(dataFile)) + .project(PROJECTED_SCHEMA) + .readSupport(new ParquetReadSupport()) + 
.set("org.apache.spark.sql.parquet.row.requested_schema", sparkSchema.json()) + .set("spark.sql.parquet.binaryAsString", "false") + .set("spark.sql.parquet.int96AsTimestamp", "false") + .callInit() + .build()) { for (InternalRow row : rows) { blackhole.consume(row); diff --git a/spark/v3.0/spark/src/jmh/java/org/apache/iceberg/spark/data/parquet/SparkParquetWritersFlatDataBenchmark.java b/spark/v3.0/spark/src/jmh/java/org/apache/iceberg/spark/data/parquet/SparkParquetWritersFlatDataBenchmark.java index 553170fef9ec..2452a8ad678f 100644 --- a/spark/v3.0/spark/src/jmh/java/org/apache/iceberg/spark/data/parquet/SparkParquetWritersFlatDataBenchmark.java +++ b/spark/v3.0/spark/src/jmh/java/org/apache/iceberg/spark/data/parquet/SparkParquetWritersFlatDataBenchmark.java @@ -16,9 +16,11 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.data.parquet; +import static org.apache.iceberg.types.Types.NestedField.optional; +import static org.apache.iceberg.types.Types.NestedField.required; + import java.io.File; import java.io.IOException; import org.apache.iceberg.Files; @@ -45,15 +47,11 @@ import org.openjdk.jmh.annotations.Threads; import org.openjdk.jmh.annotations.Warmup; -import static org.apache.iceberg.types.Types.NestedField.optional; -import static org.apache.iceberg.types.Types.NestedField.required; - /** * A benchmark that evaluates the performance of writing Parquet data with a flat schema using * Iceberg and Spark Parquet writers. * - * To run this benchmark for either spark-2 or spark-3: - * + *
    To run this benchmark for either spark-2 or spark-3: * ./gradlew :iceberg-spark:iceberg-spark[2|3]:jmh * -PjmhIncludeRegex=SparkParquetWritersFlatDataBenchmark * -PjmhOutputPath=benchmark/spark-parquet-writers-flat-data-benchmark-result.txt @@ -66,15 +64,16 @@ @BenchmarkMode(Mode.SingleShotTime) public class SparkParquetWritersFlatDataBenchmark { - private static final Schema SCHEMA = new Schema( - required(1, "longCol", Types.LongType.get()), - required(2, "intCol", Types.IntegerType.get()), - required(3, "floatCol", Types.FloatType.get()), - optional(4, "doubleCol", Types.DoubleType.get()), - optional(5, "decimalCol", Types.DecimalType.of(20, 5)), - optional(6, "dateCol", Types.DateType.get()), - optional(7, "timestampCol", Types.TimestampType.withZone()), - optional(8, "stringCol", Types.StringType.get())); + private static final Schema SCHEMA = + new Schema( + required(1, "longCol", Types.LongType.get()), + required(2, "intCol", Types.IntegerType.get()), + required(3, "floatCol", Types.FloatType.get()), + optional(4, "doubleCol", Types.DoubleType.get()), + optional(5, "decimalCol", Types.DecimalType.of(20, 5)), + optional(6, "dateCol", Types.DateType.get()), + optional(7, "timestampCol", Types.TimestampType.withZone()), + optional(8, "stringCol", Types.StringType.get())); private static final int NUM_RECORDS = 1000000; private Iterable rows; private File dataFile; @@ -96,10 +95,13 @@ public void tearDownBenchmark() { @Benchmark @Threads(1) public void writeUsingIcebergWriter() throws IOException { - try (FileAppender writer = Parquet.write(Files.localOutput(dataFile)) - .createWriterFunc(msgType -> SparkParquetWriters.buildWriter(SparkSchemaUtil.convert(SCHEMA), msgType)) - .schema(SCHEMA) - .build()) { + try (FileAppender writer = + Parquet.write(Files.localOutput(dataFile)) + .createWriterFunc( + msgType -> + SparkParquetWriters.buildWriter(SparkSchemaUtil.convert(SCHEMA), msgType)) + .schema(SCHEMA) + .build()) { writer.addAll(rows); } @@ -109,15 +111,16 @@ public void writeUsingIcebergWriter() throws IOException { @Threads(1) public void writeUsingSparkWriter() throws IOException { StructType sparkSchema = SparkSchemaUtil.convert(SCHEMA); - try (FileAppender writer = Parquet.write(Files.localOutput(dataFile)) - .writeSupport(new ParquetWriteSupport()) - .set("org.apache.spark.sql.parquet.row.attributes", sparkSchema.json()) - .set("spark.sql.parquet.writeLegacyFormat", "false") - .set("spark.sql.parquet.binaryAsString", "false") - .set("spark.sql.parquet.int96AsTimestamp", "false") - .set("spark.sql.parquet.outputTimestampType", "TIMESTAMP_MICROS") - .schema(SCHEMA) - .build()) { + try (FileAppender writer = + Parquet.write(Files.localOutput(dataFile)) + .writeSupport(new ParquetWriteSupport()) + .set("org.apache.spark.sql.parquet.row.attributes", sparkSchema.json()) + .set("spark.sql.parquet.writeLegacyFormat", "false") + .set("spark.sql.parquet.binaryAsString", "false") + .set("spark.sql.parquet.int96AsTimestamp", "false") + .set("spark.sql.parquet.outputTimestampType", "TIMESTAMP_MICROS") + .schema(SCHEMA) + .build()) { writer.addAll(rows); } diff --git a/spark/v3.0/spark/src/jmh/java/org/apache/iceberg/spark/data/parquet/SparkParquetWritersNestedDataBenchmark.java b/spark/v3.0/spark/src/jmh/java/org/apache/iceberg/spark/data/parquet/SparkParquetWritersNestedDataBenchmark.java index 74bae80e5665..07b5968ec6a0 100644 --- a/spark/v3.0/spark/src/jmh/java/org/apache/iceberg/spark/data/parquet/SparkParquetWritersNestedDataBenchmark.java +++ 
b/spark/v3.0/spark/src/jmh/java/org/apache/iceberg/spark/data/parquet/SparkParquetWritersNestedDataBenchmark.java @@ -16,9 +16,11 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.data.parquet; +import static org.apache.iceberg.types.Types.NestedField.optional; +import static org.apache.iceberg.types.Types.NestedField.required; + import java.io.File; import java.io.IOException; import org.apache.iceberg.Files; @@ -45,15 +47,11 @@ import org.openjdk.jmh.annotations.Threads; import org.openjdk.jmh.annotations.Warmup; -import static org.apache.iceberg.types.Types.NestedField.optional; -import static org.apache.iceberg.types.Types.NestedField.required; - /** - * A benchmark that evaluates the performance of writing nested Parquet data using - * Iceberg and Spark Parquet writers. + * A benchmark that evaluates the performance of writing nested Parquet data using Iceberg and Spark + * Parquet writers. * - * To run this benchmark for either spark-2 or spark-3: - * + *
    To run this benchmark for either spark-2 or spark-3: * ./gradlew :iceberg-spark:iceberg-spark[2|3]:jmh * -PjmhIncludeRegex=SparkParquetWritersNestedDataBenchmark * -PjmhOutputPath=benchmark/spark-parquet-writers-nested-data-benchmark-result.txt @@ -66,14 +64,16 @@ @BenchmarkMode(Mode.SingleShotTime) public class SparkParquetWritersNestedDataBenchmark { - private static final Schema SCHEMA = new Schema( - required(0, "id", Types.LongType.get()), - optional(4, "nested", Types.StructType.of( - required(1, "col1", Types.StringType.get()), - required(2, "col2", Types.DoubleType.get()), - required(3, "col3", Types.LongType.get()) - )) - ); + private static final Schema SCHEMA = + new Schema( + required(0, "id", Types.LongType.get()), + optional( + 4, + "nested", + Types.StructType.of( + required(1, "col1", Types.StringType.get()), + required(2, "col2", Types.DoubleType.get()), + required(3, "col3", Types.LongType.get())))); private static final int NUM_RECORDS = 1000000; private Iterable rows; private File dataFile; @@ -95,10 +95,13 @@ public void tearDownBenchmark() { @Benchmark @Threads(1) public void writeUsingIcebergWriter() throws IOException { - try (FileAppender writer = Parquet.write(Files.localOutput(dataFile)) - .createWriterFunc(msgType -> SparkParquetWriters.buildWriter(SparkSchemaUtil.convert(SCHEMA), msgType)) - .schema(SCHEMA) - .build()) { + try (FileAppender writer = + Parquet.write(Files.localOutput(dataFile)) + .createWriterFunc( + msgType -> + SparkParquetWriters.buildWriter(SparkSchemaUtil.convert(SCHEMA), msgType)) + .schema(SCHEMA) + .build()) { writer.addAll(rows); } @@ -108,15 +111,16 @@ public void writeUsingIcebergWriter() throws IOException { @Threads(1) public void writeUsingSparkWriter() throws IOException { StructType sparkSchema = SparkSchemaUtil.convert(SCHEMA); - try (FileAppender writer = Parquet.write(Files.localOutput(dataFile)) - .writeSupport(new ParquetWriteSupport()) - .set("org.apache.spark.sql.parquet.row.attributes", sparkSchema.json()) - .set("spark.sql.parquet.writeLegacyFormat", "false") - .set("spark.sql.parquet.binaryAsString", "false") - .set("spark.sql.parquet.int96AsTimestamp", "false") - .set("spark.sql.parquet.outputTimestampType", "TIMESTAMP_MICROS") - .schema(SCHEMA) - .build()) { + try (FileAppender writer = + Parquet.write(Files.localOutput(dataFile)) + .writeSupport(new ParquetWriteSupport()) + .set("org.apache.spark.sql.parquet.row.attributes", sparkSchema.json()) + .set("spark.sql.parquet.writeLegacyFormat", "false") + .set("spark.sql.parquet.binaryAsString", "false") + .set("spark.sql.parquet.int96AsTimestamp", "false") + .set("spark.sql.parquet.outputTimestampType", "TIMESTAMP_MICROS") + .schema(SCHEMA) + .build()) { writer.addAll(rows); } diff --git a/spark/v3.0/spark/src/jmh/java/org/apache/iceberg/spark/source/Action.java b/spark/v3.0/spark/src/jmh/java/org/apache/iceberg/spark/source/Action.java index 1820a801b2fb..0dbf07285060 100644 --- a/spark/v3.0/spark/src/jmh/java/org/apache/iceberg/spark/source/Action.java +++ b/spark/v3.0/spark/src/jmh/java/org/apache/iceberg/spark/source/Action.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.spark.source; @FunctionalInterface diff --git a/spark/v3.0/spark/src/jmh/java/org/apache/iceberg/spark/source/IcebergSourceBenchmark.java b/spark/v3.0/spark/src/jmh/java/org/apache/iceberg/spark/source/IcebergSourceBenchmark.java index 0ceedfd0e20d..19bcdd672157 100644 --- a/spark/v3.0/spark/src/jmh/java/org/apache/iceberg/spark/source/IcebergSourceBenchmark.java +++ b/spark/v3.0/spark/src/jmh/java/org/apache/iceberg/spark/source/IcebergSourceBenchmark.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.source; import java.io.IOException; @@ -79,7 +78,8 @@ protected String newTableLocation() { protected String dataLocation() { Map properties = table.properties(); - return properties.getOrDefault(TableProperties.WRITE_DATA_LOCATION, String.format("%s/data", table.location())); + return properties.getOrDefault( + TableProperties.WRITE_DATA_LOCATION, String.format("%s/data", table.location())); } protected void cleanupFiles() throws IOException { @@ -92,12 +92,12 @@ protected void cleanupFiles() throws IOException { } protected void setupSpark(boolean enableDictionaryEncoding) { - SparkSession.Builder builder = SparkSession.builder() - .config("spark.ui.enabled", false); + SparkSession.Builder builder = SparkSession.builder().config("spark.ui.enabled", false); if (!enableDictionaryEncoding) { - builder.config("parquet.dictionary.page.size", "1") - .config("parquet.enable.dictionary", false) - .config(TableProperties.PARQUET_DICT_SIZE_BYTES, "1"); + builder + .config("parquet.dictionary.page.size", "1") + .config("parquet.enable.dictionary", false) + .config(TableProperties.PARQUET_DICT_SIZE_BYTES, "1"); } builder.master("local"); spark = builder.getOrCreate(); @@ -114,13 +114,14 @@ protected void tearDownSpark() { } protected void materialize(Dataset ds) { - ds.queryExecution().toRdd().toJavaRDD().foreach(record -> { }); + ds.queryExecution().toRdd().toJavaRDD().foreach(record -> {}); } protected void appendAsFile(Dataset ds) { // ensure the schema is precise (including nullability) StructType sparkSchema = SparkSchemaUtil.convert(table.schema()); - spark.createDataFrame(ds.rdd(), sparkSchema) + spark + .createDataFrame(ds.rdd(), sparkSchema) .coalesce(1) .write() .format("iceberg") @@ -132,42 +133,49 @@ protected void withSQLConf(Map conf, Action action) { SQLConf sqlConf = SQLConf.get(); Map currentConfValues = Maps.newHashMap(); - conf.keySet().forEach(confKey -> { - if (sqlConf.contains(confKey)) { - String currentConfValue = sqlConf.getConfString(confKey); - currentConfValues.put(confKey, currentConfValue); - } - }); - - conf.forEach((confKey, confValue) -> { - if (SQLConf.staticConfKeys().contains(confKey)) { - throw new RuntimeException("Cannot modify the value of a static config: " + confKey); - } - sqlConf.setConfString(confKey, confValue); - }); + conf.keySet() + .forEach( + confKey -> { + if (sqlConf.contains(confKey)) { + String currentConfValue = sqlConf.getConfString(confKey); + currentConfValues.put(confKey, currentConfValue); + } + }); + + conf.forEach( + (confKey, confValue) -> { + if (SQLConf.staticConfKeys().contains(confKey)) { + throw new RuntimeException("Cannot modify the value of a static config: " + confKey); + } + sqlConf.setConfString(confKey, confValue); + }); try { action.invoke(); } finally { - conf.forEach((confKey, confValue) -> { - if (currentConfValues.containsKey(confKey)) { - sqlConf.setConfString(confKey, currentConfValues.get(confKey)); 
- } else { - sqlConf.unsetConf(confKey); - } - }); + conf.forEach( + (confKey, confValue) -> { + if (currentConfValues.containsKey(confKey)) { + sqlConf.setConfString(confKey, currentConfValues.get(confKey)); + } else { + sqlConf.unsetConf(confKey); + } + }); } } protected void withTableProperties(Map props, Action action) { Map tableProps = table.properties(); Map currentPropValues = Maps.newHashMap(); - props.keySet().forEach(propKey -> { - if (tableProps.containsKey(propKey)) { - String currentPropValue = tableProps.get(propKey); - currentPropValues.put(propKey, currentPropValue); - } - }); + props + .keySet() + .forEach( + propKey -> { + if (tableProps.containsKey(propKey)) { + String currentPropValue = tableProps.get(propKey); + currentPropValues.put(propKey, currentPropValue); + } + }); UpdateProperties updateProperties = table.updateProperties(); props.forEach(updateProperties::set); @@ -177,13 +185,14 @@ protected void withTableProperties(Map props, Action action) { action.invoke(); } finally { UpdateProperties restoreProperties = table.updateProperties(); - props.forEach((propKey, propValue) -> { - if (currentPropValues.containsKey(propKey)) { - restoreProperties.set(propKey, currentPropValues.get(propKey)); - } else { - restoreProperties.remove(propKey); - } - }); + props.forEach( + (propKey, propValue) -> { + if (currentPropValues.containsKey(propKey)) { + restoreProperties.set(propKey, currentPropValues.get(propKey)); + } else { + restoreProperties.remove(propKey); + } + }); restoreProperties.commit(); } } diff --git a/spark/v3.0/spark/src/jmh/java/org/apache/iceberg/spark/source/IcebergSourceFlatDataBenchmark.java b/spark/v3.0/spark/src/jmh/java/org/apache/iceberg/spark/source/IcebergSourceFlatDataBenchmark.java index 9e206321a540..59e6230350d9 100644 --- a/spark/v3.0/spark/src/jmh/java/org/apache/iceberg/spark/source/IcebergSourceFlatDataBenchmark.java +++ b/spark/v3.0/spark/src/jmh/java/org/apache/iceberg/spark/source/IcebergSourceFlatDataBenchmark.java @@ -16,9 +16,11 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.spark.source; +import static org.apache.iceberg.types.Types.NestedField.optional; +import static org.apache.iceberg.types.Types.NestedField.required; + import java.util.Map; import org.apache.hadoop.conf.Configuration; import org.apache.iceberg.PartitionSpec; @@ -29,9 +31,6 @@ import org.apache.iceberg.relocated.com.google.common.collect.Maps; import org.apache.iceberg.types.Types; -import static org.apache.iceberg.types.Types.NestedField.optional; -import static org.apache.iceberg.types.Types.NestedField.required; - public abstract class IcebergSourceFlatDataBenchmark extends IcebergSourceBenchmark { @Override @@ -41,15 +40,16 @@ protected Configuration initHadoopConf() { @Override protected final Table initTable() { - Schema schema = new Schema( - required(1, "longCol", Types.LongType.get()), - required(2, "intCol", Types.IntegerType.get()), - required(3, "floatCol", Types.FloatType.get()), - optional(4, "doubleCol", Types.DoubleType.get()), - optional(5, "decimalCol", Types.DecimalType.of(20, 5)), - optional(6, "dateCol", Types.DateType.get()), - optional(7, "timestampCol", Types.TimestampType.withZone()), - optional(8, "stringCol", Types.StringType.get())); + Schema schema = + new Schema( + required(1, "longCol", Types.LongType.get()), + required(2, "intCol", Types.IntegerType.get()), + required(3, "floatCol", Types.FloatType.get()), + optional(4, "doubleCol", Types.DoubleType.get()), + optional(5, "decimalCol", Types.DecimalType.of(20, 5)), + optional(6, "dateCol", Types.DateType.get()), + optional(7, "timestampCol", Types.TimestampType.withZone()), + optional(8, "stringCol", Types.StringType.get())); PartitionSpec partitionSpec = PartitionSpec.unpartitioned(); HadoopTables tables = new HadoopTables(hadoopConf()); Map properties = Maps.newHashMap(); diff --git a/spark/v3.0/spark/src/jmh/java/org/apache/iceberg/spark/source/IcebergSourceNestedDataBenchmark.java b/spark/v3.0/spark/src/jmh/java/org/apache/iceberg/spark/source/IcebergSourceNestedDataBenchmark.java index 5a0d9359ec6b..a1c61b9b4de0 100644 --- a/spark/v3.0/spark/src/jmh/java/org/apache/iceberg/spark/source/IcebergSourceNestedDataBenchmark.java +++ b/spark/v3.0/spark/src/jmh/java/org/apache/iceberg/spark/source/IcebergSourceNestedDataBenchmark.java @@ -16,9 +16,11 @@ * specific language governing permissions and limitations * under the License. 
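The initTable() hunk above declares a flat Iceberg schema and creates the table through HadoopTables against a scratch location. The same sequence, compacted outside the benchmark harness; the /tmp path and the empty property map are placeholder choices:

import static org.apache.iceberg.types.Types.NestedField.optional;
import static org.apache.iceberg.types.Types.NestedField.required;

import java.util.HashMap;
import java.util.Map;
import org.apache.hadoop.conf.Configuration;
import org.apache.iceberg.PartitionSpec;
import org.apache.iceberg.Schema;
import org.apache.iceberg.Table;
import org.apache.iceberg.hadoop.HadoopTables;
import org.apache.iceberg.types.Types;

public class CreateFlatBenchmarkTable {

  public static void main(String[] args) {
    // Same field layout as the flat-data benchmark above.
    Schema schema =
        new Schema(
            required(1, "longCol", Types.LongType.get()),
            required(2, "intCol", Types.IntegerType.get()),
            required(3, "floatCol", Types.FloatType.get()),
            optional(4, "doubleCol", Types.DoubleType.get()),
            optional(5, "decimalCol", Types.DecimalType.of(20, 5)),
            optional(6, "dateCol", Types.DateType.get()),
            optional(7, "timestampCol", Types.TimestampType.withZone()),
            optional(8, "stringCol", Types.StringType.get()));

    PartitionSpec spec = PartitionSpec.unpartitioned();
    Map<String, String> properties = new HashMap<>();

    // HadoopTables resolves tables by path, so no catalog is required.
    HadoopTables tables = new HadoopTables(new Configuration());
    Table table = tables.create(schema, spec, properties, "/tmp/iceberg-flat-benchmark");
    System.out.println("created " + table.location());
  }
}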
*/ - package org.apache.iceberg.spark.source; +import static org.apache.iceberg.types.Types.NestedField.optional; +import static org.apache.iceberg.types.Types.NestedField.required; + import java.util.Map; import org.apache.hadoop.conf.Configuration; import org.apache.iceberg.PartitionSpec; @@ -29,9 +31,6 @@ import org.apache.iceberg.relocated.com.google.common.collect.Maps; import org.apache.iceberg.types.Types; -import static org.apache.iceberg.types.Types.NestedField.optional; -import static org.apache.iceberg.types.Types.NestedField.required; - public abstract class IcebergSourceNestedDataBenchmark extends IcebergSourceBenchmark { @Override @@ -41,14 +40,16 @@ protected Configuration initHadoopConf() { @Override protected final Table initTable() { - Schema schema = new Schema( - required(0, "id", Types.LongType.get()), - optional(4, "nested", Types.StructType.of( - required(1, "col1", Types.StringType.get()), - required(2, "col2", Types.DoubleType.get()), - required(3, "col3", Types.LongType.get()) - )) - ); + Schema schema = + new Schema( + required(0, "id", Types.LongType.get()), + optional( + 4, + "nested", + Types.StructType.of( + required(1, "col1", Types.StringType.get()), + required(2, "col2", Types.DoubleType.get()), + required(3, "col3", Types.LongType.get())))); PartitionSpec partitionSpec = PartitionSpec.unpartitioned(); HadoopTables tables = new HadoopTables(hadoopConf()); Map properties = Maps.newHashMap(); diff --git a/spark/v3.0/spark/src/jmh/java/org/apache/iceberg/spark/source/IcebergSourceNestedListDataBenchmark.java b/spark/v3.0/spark/src/jmh/java/org/apache/iceberg/spark/source/IcebergSourceNestedListDataBenchmark.java index 369a1507b648..f68b587735dd 100644 --- a/spark/v3.0/spark/src/jmh/java/org/apache/iceberg/spark/source/IcebergSourceNestedListDataBenchmark.java +++ b/spark/v3.0/spark/src/jmh/java/org/apache/iceberg/spark/source/IcebergSourceNestedListDataBenchmark.java @@ -16,9 +16,11 @@ * specific language governing permissions and limitations * under the License. 
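The nested-data benchmark above stores a struct field with col1, col2 and col3. As the nested Avro read benchmark later in this diff does, matching rows can be synthesized in Spark with range plus functions.struct; a trimmed sketch of that generation (local session, row count and the constant col3 value are placeholder choices):

import static org.apache.spark.sql.functions.expr;
import static org.apache.spark.sql.functions.lit;
import static org.apache.spark.sql.functions.struct;

import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SparkSession;

public class NestedBenchmarkData {

  public static void main(String[] args) {
    SparkSession spark =
        SparkSession.builder().master("local").appName("nested-data-sketch").getOrCreate();

    // One struct column shaped like the Iceberg "nested" field: col1 string, col2 double, col3 long.
    Dataset<Row> df =
        spark
            .range(1000)
            .withColumn(
                "nested",
                struct(
                    expr("CAST(id AS string) AS col1"),
                    expr("CAST(id AS double) AS col2"),
                    lit(1).cast("long").as("col3")));

    df.printSchema();
    df.show(5, false);
    spark.stop();
  }
}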
*/ - package org.apache.iceberg.spark.source; +import static org.apache.iceberg.types.Types.NestedField.optional; +import static org.apache.iceberg.types.Types.NestedField.required; + import java.util.Map; import org.apache.hadoop.conf.Configuration; import org.apache.iceberg.PartitionSpec; @@ -29,9 +31,6 @@ import org.apache.iceberg.relocated.com.google.common.collect.Maps; import org.apache.iceberg.types.Types; -import static org.apache.iceberg.types.Types.NestedField.optional; -import static org.apache.iceberg.types.Types.NestedField.required; - public abstract class IcebergSourceNestedListDataBenchmark extends IcebergSourceBenchmark { @Override @@ -41,12 +40,19 @@ protected Configuration initHadoopConf() { @Override protected final Table initTable() { - Schema schema = new Schema( - required(0, "id", Types.LongType.get()), - optional(1, "outerlist", Types.ListType.ofOptional(2, - Types.StructType.of(required(3, "innerlist", Types.ListType.ofRequired(4, Types.StringType.get()))) - )) - ); + Schema schema = + new Schema( + required(0, "id", Types.LongType.get()), + optional( + 1, + "outerlist", + Types.ListType.ofOptional( + 2, + Types.StructType.of( + required( + 3, + "innerlist", + Types.ListType.ofRequired(4, Types.StringType.get())))))); PartitionSpec partitionSpec = PartitionSpec.unpartitioned(); HadoopTables tables = new HadoopTables(hadoopConf()); Map properties = Maps.newHashMap(); diff --git a/spark/v3.0/spark/src/jmh/java/org/apache/iceberg/spark/source/WritersBenchmark.java b/spark/v3.0/spark/src/jmh/java/org/apache/iceberg/spark/source/WritersBenchmark.java index 06e00e3ebab7..eace9d3e44a7 100644 --- a/spark/v3.0/spark/src/jmh/java/org/apache/iceberg/spark/source/WritersBenchmark.java +++ b/spark/v3.0/spark/src/jmh/java/org/apache/iceberg/spark/source/WritersBenchmark.java @@ -16,9 +16,11 @@ * specific language governing permissions and limitations * under the License. 
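The nested-list benchmark above composes list and struct types: an optional outer list of structs, each holding a required inner list of strings. A small sketch that builds only that schema and prints it, so the element and field id bookkeeping is easier to follow (the ids match the hunk above):

import static org.apache.iceberg.types.Types.NestedField.optional;
import static org.apache.iceberg.types.Types.NestedField.required;

import org.apache.iceberg.Schema;
import org.apache.iceberg.types.Types;

public class NestedListSchemaSketch {

  public static void main(String[] args) {
    Schema schema =
        new Schema(
            required(0, "id", Types.LongType.get()),
            optional(
                1, // field id of outerlist
                "outerlist",
                Types.ListType.ofOptional(
                    2, // element id of the outer list
                    Types.StructType.of(
                        required(
                            3, // field id of innerlist inside each struct element
                            "innerlist",
                            Types.ListType.ofRequired(4, Types.StringType.get()))))));

    System.out.println(schema);
  }
}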
*/ - package org.apache.iceberg.spark.source; +import static org.apache.iceberg.types.Types.NestedField.optional; +import static org.apache.iceberg.types.Types.NestedField.required; + import java.io.IOException; import java.util.Comparator; import java.util.List; @@ -57,23 +59,20 @@ import org.openjdk.jmh.annotations.Threads; import org.openjdk.jmh.infra.Blackhole; -import static org.apache.iceberg.types.Types.NestedField.optional; -import static org.apache.iceberg.types.Types.NestedField.required; - public abstract class WritersBenchmark extends IcebergSourceBenchmark { private static final int NUM_ROWS = 2500000; private static final long TARGET_FILE_SIZE_IN_BYTES = 50L * 1024 * 1024; - private static final Schema SCHEMA = new Schema( - required(1, "longCol", Types.LongType.get()), - required(2, "intCol", Types.IntegerType.get()), - required(3, "floatCol", Types.FloatType.get()), - optional(4, "doubleCol", Types.DoubleType.get()), - optional(5, "decimalCol", Types.DecimalType.of(20, 5)), - optional(6, "timestampCol", Types.TimestampType.withZone()), - optional(7, "stringCol", Types.StringType.get()) - ); + private static final Schema SCHEMA = + new Schema( + required(1, "longCol", Types.LongType.get()), + required(2, "intCol", Types.IntegerType.get()), + required(3, "floatCol", Types.FloatType.get()), + optional(4, "doubleCol", Types.DoubleType.get()), + optional(5, "decimalCol", Types.DecimalType.of(20, 5)), + optional(6, "timestampCol", Types.TimestampType.withZone()), + optional(7, "stringCol", Types.StringType.get())); private Iterable rows; private Iterable positionDeleteRows; @@ -91,7 +90,8 @@ public void setupBenchmark() { data.sort(Comparator.comparingInt(row -> transform.apply(row.getInt(1)))); this.rows = data; - this.positionDeleteRows = RandomData.generateSpark(DeleteSchemaUtil.pathPosSchema(), NUM_ROWS, 0L); + this.positionDeleteRows = + RandomData.generateSpark(DeleteSchemaUtil.pathPosSchema(), NUM_ROWS, 0L); this.unpartitionedSpec = table().specs().get(0); Preconditions.checkArgument(unpartitionedSpec.isUnpartitioned()); @@ -117,9 +117,7 @@ protected final Table initTable() { Table table = tables.create(SCHEMA, spec, properties, newTableLocation()); // add a partitioned spec to the table - table.updateSpec() - .addField(Expressions.bucket("intCol", 32)) - .commit(); + table.updateSpec().addField(Expressions.bucket("intCol", 32)).commit(); return table; } @@ -130,13 +128,14 @@ public void writeUnpartitionedClusteredDataWriter(Blackhole blackhole) throws IO FileIO io = table().io(); OutputFileFactory fileFactory = newFileFactory(); - SparkFileWriterFactory writerFactory = SparkFileWriterFactory.builderFor(table()) - .dataFileFormat(fileFormat()) - .dataSchema(table().schema()) - .build(); + SparkFileWriterFactory writerFactory = + SparkFileWriterFactory.builderFor(table()) + .dataFileFormat(fileFormat()) + .dataSchema(table().schema()) + .build(); - ClusteredDataWriter writer = new ClusteredDataWriter<>( - writerFactory, fileFactory, io, TARGET_FILE_SIZE_IN_BYTES); + ClusteredDataWriter writer = + new ClusteredDataWriter<>(writerFactory, fileFactory, io, TARGET_FILE_SIZE_IN_BYTES); try (ClusteredDataWriter closeableWriter = writer) { for (InternalRow row : rows) { @@ -156,13 +155,14 @@ public void writeUnpartitionedLegacyDataWriter(Blackhole blackhole) throws IOExc Schema writeSchema = table().schema(); StructType sparkWriteType = SparkSchemaUtil.convert(writeSchema); - SparkAppenderFactory appenders = SparkAppenderFactory.builderFor(table(), writeSchema, sparkWriteType) - 
.spec(unpartitionedSpec) - .build(); + SparkAppenderFactory appenders = + SparkAppenderFactory.builderFor(table(), writeSchema, sparkWriteType) + .spec(unpartitionedSpec) + .build(); - TaskWriter writer = new UnpartitionedWriter<>( - unpartitionedSpec, fileFormat(), appenders, - fileFactory, io, TARGET_FILE_SIZE_IN_BYTES); + TaskWriter writer = + new UnpartitionedWriter<>( + unpartitionedSpec, fileFormat(), appenders, fileFactory, io, TARGET_FILE_SIZE_IN_BYTES); try (TaskWriter closableWriter = writer) { for (InternalRow row : rows) { @@ -179,13 +179,14 @@ public void writePartitionedClusteredDataWriter(Blackhole blackhole) throws IOEx FileIO io = table().io(); OutputFileFactory fileFactory = newFileFactory(); - SparkFileWriterFactory writerFactory = SparkFileWriterFactory.builderFor(table()) - .dataFileFormat(fileFormat()) - .dataSchema(table().schema()) - .build(); + SparkFileWriterFactory writerFactory = + SparkFileWriterFactory.builderFor(table()) + .dataFileFormat(fileFormat()) + .dataSchema(table().schema()) + .build(); - ClusteredDataWriter writer = new ClusteredDataWriter<>( - writerFactory, fileFactory, io, TARGET_FILE_SIZE_IN_BYTES); + ClusteredDataWriter writer = + new ClusteredDataWriter<>(writerFactory, fileFactory, io, TARGET_FILE_SIZE_IN_BYTES); PartitionKey partitionKey = new PartitionKey(partitionedSpec, table().schema()); StructType dataSparkType = SparkSchemaUtil.convert(table().schema()); @@ -210,14 +211,21 @@ public void writePartitionedLegacyDataWriter(Blackhole blackhole) throws IOExcep Schema writeSchema = table().schema(); StructType sparkWriteType = SparkSchemaUtil.convert(writeSchema); - SparkAppenderFactory appenders = SparkAppenderFactory.builderFor(table(), writeSchema, sparkWriteType) - .spec(partitionedSpec) - .build(); - - TaskWriter writer = new SparkPartitionedWriter( - partitionedSpec, fileFormat(), appenders, - fileFactory, io, TARGET_FILE_SIZE_IN_BYTES, - writeSchema, sparkWriteType); + SparkAppenderFactory appenders = + SparkAppenderFactory.builderFor(table(), writeSchema, sparkWriteType) + .spec(partitionedSpec) + .build(); + + TaskWriter writer = + new SparkPartitionedWriter( + partitionedSpec, + fileFormat(), + appenders, + fileFactory, + io, + TARGET_FILE_SIZE_IN_BYTES, + writeSchema, + sparkWriteType); try (TaskWriter closableWriter = writer) { for (InternalRow row : rows) { @@ -234,13 +242,14 @@ public void writePartitionedFanoutDataWriter(Blackhole blackhole) throws IOExcep FileIO io = table().io(); OutputFileFactory fileFactory = newFileFactory(); - SparkFileWriterFactory writerFactory = SparkFileWriterFactory.builderFor(table()) - .dataFileFormat(fileFormat()) - .dataSchema(table().schema()) - .build(); + SparkFileWriterFactory writerFactory = + SparkFileWriterFactory.builderFor(table()) + .dataFileFormat(fileFormat()) + .dataSchema(table().schema()) + .build(); - FanoutDataWriter writer = new FanoutDataWriter<>( - writerFactory, fileFactory, io, TARGET_FILE_SIZE_IN_BYTES); + FanoutDataWriter writer = + new FanoutDataWriter<>(writerFactory, fileFactory, io, TARGET_FILE_SIZE_IN_BYTES); PartitionKey partitionKey = new PartitionKey(partitionedSpec, table().schema()); StructType dataSparkType = SparkSchemaUtil.convert(table().schema()); @@ -265,14 +274,21 @@ public void writePartitionedLegacyFanoutDataWriter(Blackhole blackhole) throws I Schema writeSchema = table().schema(); StructType sparkWriteType = SparkSchemaUtil.convert(writeSchema); - SparkAppenderFactory appenders = SparkAppenderFactory.builderFor(table(), writeSchema, sparkWriteType) - 
.spec(partitionedSpec) - .build(); - - TaskWriter writer = new SparkPartitionedFanoutWriter( - partitionedSpec, fileFormat(), appenders, - fileFactory, io, TARGET_FILE_SIZE_IN_BYTES, - writeSchema, sparkWriteType); + SparkAppenderFactory appenders = + SparkAppenderFactory.builderFor(table(), writeSchema, sparkWriteType) + .spec(partitionedSpec) + .build(); + + TaskWriter writer = + new SparkPartitionedFanoutWriter( + partitionedSpec, + fileFormat(), + appenders, + fileFactory, + io, + TARGET_FILE_SIZE_IN_BYTES, + writeSchema, + sparkWriteType); try (TaskWriter closableWriter = writer) { for (InternalRow row : rows) { @@ -285,20 +301,23 @@ partitionedSpec, fileFormat(), appenders, @Benchmark @Threads(1) - public void writePartitionedClusteredEqualityDeleteWriter(Blackhole blackhole) throws IOException { + public void writePartitionedClusteredEqualityDeleteWriter(Blackhole blackhole) + throws IOException { FileIO io = table().io(); int equalityFieldId = table().schema().findField("longCol").fieldId(); OutputFileFactory fileFactory = newFileFactory(); - SparkFileWriterFactory writerFactory = SparkFileWriterFactory.builderFor(table()) - .dataFileFormat(fileFormat()) - .equalityDeleteRowSchema(table().schema()) - .equalityFieldIds(new int[]{equalityFieldId}) - .build(); + SparkFileWriterFactory writerFactory = + SparkFileWriterFactory.builderFor(table()) + .dataFileFormat(fileFormat()) + .equalityDeleteRowSchema(table().schema()) + .equalityFieldIds(new int[] {equalityFieldId}) + .build(); - ClusteredEqualityDeleteWriter writer = new ClusteredEqualityDeleteWriter<>( - writerFactory, fileFactory, io, TARGET_FILE_SIZE_IN_BYTES); + ClusteredEqualityDeleteWriter writer = + new ClusteredEqualityDeleteWriter<>( + writerFactory, fileFactory, io, TARGET_FILE_SIZE_IN_BYTES); PartitionKey partitionKey = new PartitionKey(partitionedSpec, table().schema()); StructType deleteSparkType = SparkSchemaUtil.convert(table().schema()); @@ -316,16 +335,17 @@ public void writePartitionedClusteredEqualityDeleteWriter(Blackhole blackhole) t @Benchmark @Threads(1) - public void writeUnpartitionedClusteredPositionDeleteWriter(Blackhole blackhole) throws IOException { + public void writeUnpartitionedClusteredPositionDeleteWriter(Blackhole blackhole) + throws IOException { FileIO io = table().io(); OutputFileFactory fileFactory = newFileFactory(); - SparkFileWriterFactory writerFactory = SparkFileWriterFactory.builderFor(table()) - .dataFileFormat(fileFormat()) - .build(); + SparkFileWriterFactory writerFactory = + SparkFileWriterFactory.builderFor(table()).dataFileFormat(fileFormat()).build(); - ClusteredPositionDeleteWriter writer = new ClusteredPositionDeleteWriter<>( - writerFactory, fileFactory, io, TARGET_FILE_SIZE_IN_BYTES); + ClusteredPositionDeleteWriter writer = + new ClusteredPositionDeleteWriter<>( + writerFactory, fileFactory, io, TARGET_FILE_SIZE_IN_BYTES); PositionDelete positionDelete = PositionDelete.create(); try (ClusteredPositionDeleteWriter closeableWriter = writer) { @@ -341,8 +361,6 @@ public void writeUnpartitionedClusteredPositionDeleteWriter(Blackhole blackhole) } private OutputFileFactory newFileFactory() { - return OutputFileFactory.builderFor(table(), 1, 1) - .format(fileFormat()) - .build(); + return OutputFileFactory.builderFor(table(), 1, 1).format(fileFormat()).build(); } } diff --git a/spark/v3.0/spark/src/jmh/java/org/apache/iceberg/spark/source/avro/AvroWritersBenchmark.java b/spark/v3.0/spark/src/jmh/java/org/apache/iceberg/spark/source/avro/AvroWritersBenchmark.java index 
f4c5389c0172..0a18ba35695c 100644 --- a/spark/v3.0/spark/src/jmh/java/org/apache/iceberg/spark/source/avro/AvroWritersBenchmark.java +++ b/spark/v3.0/spark/src/jmh/java/org/apache/iceberg/spark/source/avro/AvroWritersBenchmark.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.source.avro; import org.apache.iceberg.FileFormat; @@ -25,8 +24,7 @@ /** * A benchmark that evaluates the performance of various Iceberg writers for Avro data. * - * To run this benchmark for either spark-2 or spark-3: - * + *

    To run this benchmark for either spark-2 or spark-3: * ./gradlew :iceberg-spark:iceberg-spark[2|3]:jmh * -PjmhIncludeRegex=AvroWritersBenchmark * -PjmhOutputPath=benchmark/avro-writers-benchmark-result.txt diff --git a/spark/v3.0/spark/src/jmh/java/org/apache/iceberg/spark/source/avro/IcebergSourceFlatAvroDataReadBenchmark.java b/spark/v3.0/spark/src/jmh/java/org/apache/iceberg/spark/source/avro/IcebergSourceFlatAvroDataReadBenchmark.java index 7520d89d743d..b6dc9dfc03f8 100644 --- a/spark/v3.0/spark/src/jmh/java/org/apache/iceberg/spark/source/avro/IcebergSourceFlatAvroDataReadBenchmark.java +++ b/spark/v3.0/spark/src/jmh/java/org/apache/iceberg/spark/source/avro/IcebergSourceFlatAvroDataReadBenchmark.java @@ -16,9 +16,14 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.source.avro; +import static org.apache.iceberg.TableProperties.DEFAULT_FILE_FORMAT; +import static org.apache.iceberg.TableProperties.SPLIT_OPEN_FILE_COST; +import static org.apache.spark.sql.functions.current_date; +import static org.apache.spark.sql.functions.date_add; +import static org.apache.spark.sql.functions.expr; + import java.io.IOException; import java.util.Map; import org.apache.iceberg.relocated.com.google.common.collect.Maps; @@ -31,18 +36,11 @@ import org.openjdk.jmh.annotations.TearDown; import org.openjdk.jmh.annotations.Threads; -import static org.apache.iceberg.TableProperties.DEFAULT_FILE_FORMAT; -import static org.apache.iceberg.TableProperties.SPLIT_OPEN_FILE_COST; -import static org.apache.spark.sql.functions.current_date; -import static org.apache.spark.sql.functions.date_add; -import static org.apache.spark.sql.functions.expr; - /** - * A benchmark that evaluates the performance of reading Avro data with a flat schema - * using Iceberg and the built-in file source in Spark. + * A benchmark that evaluates the performance of reading Avro data with a flat schema using Iceberg + * and the built-in file source in Spark. * - * To run this benchmark for either spark-2 or spark-3: - * + *
    To run this benchmark for either spark-2 or spark-3: * ./gradlew :iceberg-spark:iceberg-spark[2|3]:jmh * -PjmhIncludeRegex=IcebergSourceFlatAvroDataReadBenchmark * -PjmhOutputPath=benchmark/iceberg-source-flat-avro-data-read-benchmark-result.txt @@ -70,11 +68,13 @@ public void tearDownBenchmark() throws IOException { public void readIceberg() { Map tableProperties = Maps.newHashMap(); tableProperties.put(SPLIT_OPEN_FILE_COST, Integer.toString(128 * 1024 * 1024)); - withTableProperties(tableProperties, () -> { - String tableLocation = table().location(); - Dataset df = spark().read().format("iceberg").load(tableLocation); - materialize(df); - }); + withTableProperties( + tableProperties, + () -> { + String tableLocation = table().location(); + Dataset df = spark().read().format("iceberg").load(tableLocation); + materialize(df); + }); } @Benchmark @@ -82,10 +82,12 @@ public void readIceberg() { public void readFileSource() { Map conf = Maps.newHashMap(); conf.put(SQLConf.FILES_OPEN_COST_IN_BYTES().key(), Integer.toString(128 * 1024 * 1024)); - withSQLConf(conf, () -> { - Dataset df = spark().read().format("avro").load(dataLocation()); - materialize(df); - }); + withSQLConf( + conf, + () -> { + Dataset df = spark().read().format("avro").load(dataLocation()); + materialize(df); + }); } @Benchmark @@ -93,11 +95,13 @@ public void readFileSource() { public void readWithProjectionIceberg() { Map tableProperties = Maps.newHashMap(); tableProperties.put(SPLIT_OPEN_FILE_COST, Integer.toString(128 * 1024 * 1024)); - withTableProperties(tableProperties, () -> { - String tableLocation = table().location(); - Dataset df = spark().read().format("iceberg").load(tableLocation).select("longCol"); - materialize(df); - }); + withTableProperties( + tableProperties, + () -> { + String tableLocation = table().location(); + Dataset df = spark().read().format("iceberg").load(tableLocation).select("longCol"); + materialize(df); + }); } @Benchmark @@ -105,28 +109,34 @@ public void readWithProjectionIceberg() { public void readWithProjectionFileSource() { Map conf = Maps.newHashMap(); conf.put(SQLConf.FILES_OPEN_COST_IN_BYTES().key(), Integer.toString(128 * 1024 * 1024)); - withSQLConf(conf, () -> { - Dataset df = spark().read().format("avro").load(dataLocation()).select("longCol"); - materialize(df); - }); + withSQLConf( + conf, + () -> { + Dataset df = spark().read().format("avro").load(dataLocation()).select("longCol"); + materialize(df); + }); } private void appendData() { Map tableProperties = Maps.newHashMap(); tableProperties.put(DEFAULT_FILE_FORMAT, "avro"); - withTableProperties(tableProperties, () -> { - for (int fileNum = 1; fileNum <= NUM_FILES; fileNum++) { - Dataset df = spark().range(NUM_ROWS) - .withColumnRenamed("id", "longCol") - .withColumn("intCol", expr("CAST(longCol AS INT)")) - .withColumn("floatCol", expr("CAST(longCol AS FLOAT)")) - .withColumn("doubleCol", expr("CAST(longCol AS DOUBLE)")) - .withColumn("decimalCol", expr("CAST(longCol AS DECIMAL(20, 5))")) - .withColumn("dateCol", date_add(current_date(), fileNum)) - .withColumn("timestampCol", expr("TO_TIMESTAMP(dateCol)")) - .withColumn("stringCol", expr("CAST(dateCol AS STRING)")); - appendAsFile(df); - } - }); + withTableProperties( + tableProperties, + () -> { + for (int fileNum = 1; fileNum <= NUM_FILES; fileNum++) { + Dataset df = + spark() + .range(NUM_ROWS) + .withColumnRenamed("id", "longCol") + .withColumn("intCol", expr("CAST(longCol AS INT)")) + .withColumn("floatCol", expr("CAST(longCol AS FLOAT)")) + 
.withColumn("doubleCol", expr("CAST(longCol AS DOUBLE)")) + .withColumn("decimalCol", expr("CAST(longCol AS DECIMAL(20, 5))")) + .withColumn("dateCol", date_add(current_date(), fileNum)) + .withColumn("timestampCol", expr("TO_TIMESTAMP(dateCol)")) + .withColumn("stringCol", expr("CAST(dateCol AS STRING)")); + appendAsFile(df); + } + }); } } diff --git a/spark/v3.0/spark/src/jmh/java/org/apache/iceberg/spark/source/avro/IcebergSourceNestedAvroDataReadBenchmark.java b/spark/v3.0/spark/src/jmh/java/org/apache/iceberg/spark/source/avro/IcebergSourceNestedAvroDataReadBenchmark.java index bbdb9a472296..c7473550938f 100644 --- a/spark/v3.0/spark/src/jmh/java/org/apache/iceberg/spark/source/avro/IcebergSourceNestedAvroDataReadBenchmark.java +++ b/spark/v3.0/spark/src/jmh/java/org/apache/iceberg/spark/source/avro/IcebergSourceNestedAvroDataReadBenchmark.java @@ -16,9 +16,14 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.source.avro; +import static org.apache.iceberg.TableProperties.DEFAULT_FILE_FORMAT; +import static org.apache.iceberg.TableProperties.SPLIT_OPEN_FILE_COST; +import static org.apache.spark.sql.functions.expr; +import static org.apache.spark.sql.functions.lit; +import static org.apache.spark.sql.functions.struct; + import java.io.IOException; import java.util.Map; import org.apache.iceberg.relocated.com.google.common.collect.Maps; @@ -31,19 +36,11 @@ import org.openjdk.jmh.annotations.TearDown; import org.openjdk.jmh.annotations.Threads; -import static org.apache.iceberg.TableProperties.DEFAULT_FILE_FORMAT; -import static org.apache.iceberg.TableProperties.SPLIT_OPEN_FILE_COST; -import static org.apache.spark.sql.functions.expr; -import static org.apache.spark.sql.functions.lit; -import static org.apache.spark.sql.functions.struct; - - /** - * A benchmark that evaluates the performance of reading Avro data with a flat schema - * using Iceberg and the built-in file source in Spark. + * A benchmark that evaluates the performance of reading Avro data with a flat schema using Iceberg + * and the built-in file source in Spark. * - * To run this benchmark for either spark-2 or spark-3: - * + *
    To run this benchmark for either spark-2 or spark-3: * ./gradlew :iceberg-spark:iceberg-spark[2|3]:jmh * -PjmhIncludeRegex=IcebergSourceNestedAvroDataReadBenchmark * -PjmhOutputPath=benchmark/iceberg-source-nested-avro-data-read-benchmark-result.txt @@ -71,11 +68,13 @@ public void tearDownBenchmark() throws IOException { public void readIceberg() { Map tableProperties = Maps.newHashMap(); tableProperties.put(SPLIT_OPEN_FILE_COST, Integer.toString(128 * 1024 * 1024)); - withTableProperties(tableProperties, () -> { - String tableLocation = table().location(); - Dataset df = spark().read().format("iceberg").load(tableLocation); - materialize(df); - }); + withTableProperties( + tableProperties, + () -> { + String tableLocation = table().location(); + Dataset df = spark().read().format("iceberg").load(tableLocation); + materialize(df); + }); } @Benchmark @@ -83,10 +82,12 @@ public void readIceberg() { public void readFileSource() { Map conf = Maps.newHashMap(); conf.put(SQLConf.FILES_OPEN_COST_IN_BYTES().key(), Integer.toString(128 * 1024 * 1024)); - withSQLConf(conf, () -> { - Dataset df = spark().read().format("avro").load(dataLocation()); - materialize(df); - }); + withSQLConf( + conf, + () -> { + Dataset df = spark().read().format("avro").load(dataLocation()); + materialize(df); + }); } @Benchmark @@ -94,11 +95,14 @@ public void readFileSource() { public void readWithProjectionIceberg() { Map tableProperties = Maps.newHashMap(); tableProperties.put(SPLIT_OPEN_FILE_COST, Integer.toString(128 * 1024 * 1024)); - withTableProperties(tableProperties, () -> { - String tableLocation = table().location(); - Dataset df = spark().read().format("iceberg").load(tableLocation).select("nested.col3"); - materialize(df); - }); + withTableProperties( + tableProperties, + () -> { + String tableLocation = table().location(); + Dataset df = + spark().read().format("iceberg").load(tableLocation).select("nested.col3"); + materialize(df); + }); } @Benchmark @@ -106,27 +110,33 @@ public void readWithProjectionIceberg() { public void readWithProjectionFileSource() { Map conf = Maps.newHashMap(); conf.put(SQLConf.FILES_OPEN_COST_IN_BYTES().key(), Integer.toString(128 * 1024 * 1024)); - withSQLConf(conf, () -> { - Dataset df = spark().read().format("avro").load(dataLocation()).select("nested.col3"); - materialize(df); - }); + withSQLConf( + conf, + () -> { + Dataset df = + spark().read().format("avro").load(dataLocation()).select("nested.col3"); + materialize(df); + }); } private void appendData() { Map tableProperties = Maps.newHashMap(); tableProperties.put(DEFAULT_FILE_FORMAT, "avro"); - withTableProperties(tableProperties, () -> { - for (int fileNum = 1; fileNum <= NUM_FILES; fileNum++) { - Dataset df = spark().range(NUM_ROWS) - .withColumn( - "nested", - struct( - expr("CAST(id AS string) AS col1"), - expr("CAST(id AS double) AS col2"), - lit(fileNum).cast("long").as("col3") - )); - appendAsFile(df); - } - }); + withTableProperties( + tableProperties, + () -> { + for (int fileNum = 1; fileNum <= NUM_FILES; fileNum++) { + Dataset df = + spark() + .range(NUM_ROWS) + .withColumn( + "nested", + struct( + expr("CAST(id AS string) AS col1"), + expr("CAST(id AS double) AS col2"), + lit(fileNum).cast("long").as("col3"))); + appendAsFile(df); + } + }); } } diff --git a/spark/v3.0/spark/src/jmh/java/org/apache/iceberg/spark/source/orc/IcebergSourceFlatORCDataBenchmark.java b/spark/v3.0/spark/src/jmh/java/org/apache/iceberg/spark/source/orc/IcebergSourceFlatORCDataBenchmark.java index 329c9ffe7738..d0fdd8915780 100644 
--- a/spark/v3.0/spark/src/jmh/java/org/apache/iceberg/spark/source/orc/IcebergSourceFlatORCDataBenchmark.java +++ b/spark/v3.0/spark/src/jmh/java/org/apache/iceberg/spark/source/orc/IcebergSourceFlatORCDataBenchmark.java @@ -16,9 +16,11 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.source.orc; +import static org.apache.iceberg.types.Types.NestedField.optional; +import static org.apache.iceberg.types.Types.NestedField.required; + import java.util.Map; import org.apache.hadoop.conf.Configuration; import org.apache.iceberg.PartitionSpec; @@ -30,13 +32,10 @@ import org.apache.iceberg.spark.source.IcebergSourceBenchmark; import org.apache.iceberg.types.Types; -import static org.apache.iceberg.types.Types.NestedField.optional; -import static org.apache.iceberg.types.Types.NestedField.required; - - /** - * Same as {@link org.apache.iceberg.spark.source.IcebergSourceFlatDataBenchmark} but we disable the Timestamp with - * zone type for ORC performance tests as Spark native reader does not support ORC's TIMESTAMP_INSTANT type + * Same as {@link org.apache.iceberg.spark.source.IcebergSourceFlatDataBenchmark} but we disable the + * Timestamp with zone type for ORC performance tests as Spark native reader does not support ORC's + * TIMESTAMP_INSTANT type */ public abstract class IcebergSourceFlatORCDataBenchmark extends IcebergSourceBenchmark { @@ -47,17 +46,19 @@ protected Configuration initHadoopConf() { @Override protected final Table initTable() { - Schema schema = new Schema( - required(1, "longCol", Types.LongType.get()), - required(2, "intCol", Types.IntegerType.get()), - required(3, "floatCol", Types.FloatType.get()), - optional(4, "doubleCol", Types.DoubleType.get()), - optional(5, "decimalCol", Types.DecimalType.of(20, 5)), - optional(6, "dateCol", Types.DateType.get()), - // Disable timestamp column for ORC performance tests as Spark native reader does not support ORC's - // TIMESTAMP_INSTANT type - // optional(7, "timestampCol", Types.TimestampType.withZone()), - optional(8, "stringCol", Types.StringType.get())); + Schema schema = + new Schema( + required(1, "longCol", Types.LongType.get()), + required(2, "intCol", Types.IntegerType.get()), + required(3, "floatCol", Types.FloatType.get()), + optional(4, "doubleCol", Types.DoubleType.get()), + optional(5, "decimalCol", Types.DecimalType.of(20, 5)), + optional(6, "dateCol", Types.DateType.get()), + // Disable timestamp column for ORC performance tests as Spark native reader does not + // support ORC's + // TIMESTAMP_INSTANT type + // optional(7, "timestampCol", Types.TimestampType.withZone()), + optional(8, "stringCol", Types.StringType.get())); PartitionSpec partitionSpec = PartitionSpec.unpartitioned(); HadoopTables tables = new HadoopTables(hadoopConf()); Map properties = Maps.newHashMap(); diff --git a/spark/v3.0/spark/src/jmh/java/org/apache/iceberg/spark/source/orc/IcebergSourceFlatORCDataReadBenchmark.java b/spark/v3.0/spark/src/jmh/java/org/apache/iceberg/spark/source/orc/IcebergSourceFlatORCDataReadBenchmark.java index f1b963f7f6e6..bd2a414a9eb4 100644 --- a/spark/v3.0/spark/src/jmh/java/org/apache/iceberg/spark/source/orc/IcebergSourceFlatORCDataReadBenchmark.java +++ b/spark/v3.0/spark/src/jmh/java/org/apache/iceberg/spark/source/orc/IcebergSourceFlatORCDataReadBenchmark.java @@ -16,9 +16,14 @@ * specific language governing permissions and limitations * under the License. 
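The read benchmarks in this diff all follow one shape: temporarily raise the split open-file cost table property, load the table through the iceberg source, and force execution with a no-op foreach over the underlying RDD instead of collecting rows to the driver (the materialize helper reformatted earlier). A trimmed sketch of that read-and-materialize step; it assumes an Iceberg table already exists at the placeholder path and that the iceberg-spark runtime is on the classpath:

import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SparkSession;

public class ReadAndMaterialize {

  public static void main(String[] args) {
    SparkSession spark =
        SparkSession.builder().master("local").appName("read-benchmark-sketch").getOrCreate();

    // Placeholder path; the benchmarks point this at the table created in setupBenchmark().
    String tableLocation = "/tmp/iceberg-flat-benchmark";

    Dataset<Row> df = spark.read().format("iceberg").load(tableLocation);

    // Same materialization trick as the benchmarks: touch every row without collecting it.
    df.queryExecution().toRdd().toJavaRDD().foreach(record -> {});

    spark.stop();
  }
}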
*/ - package org.apache.iceberg.spark.source.orc; +import static org.apache.iceberg.TableProperties.DEFAULT_FILE_FORMAT; +import static org.apache.iceberg.TableProperties.SPLIT_OPEN_FILE_COST; +import static org.apache.spark.sql.functions.current_date; +import static org.apache.spark.sql.functions.date_add; +import static org.apache.spark.sql.functions.expr; + import java.io.IOException; import java.util.Map; import org.apache.iceberg.relocated.com.google.common.collect.Maps; @@ -31,18 +36,11 @@ import org.openjdk.jmh.annotations.TearDown; import org.openjdk.jmh.annotations.Threads; -import static org.apache.iceberg.TableProperties.DEFAULT_FILE_FORMAT; -import static org.apache.iceberg.TableProperties.SPLIT_OPEN_FILE_COST; -import static org.apache.spark.sql.functions.current_date; -import static org.apache.spark.sql.functions.date_add; -import static org.apache.spark.sql.functions.expr; - /** - * A benchmark that evaluates the performance of reading ORC data with a flat schema - * using Iceberg and the built-in file source in Spark. + * A benchmark that evaluates the performance of reading ORC data with a flat schema using Iceberg + * and the built-in file source in Spark. * - * To run this benchmark for either spark-2 or spark-3: - * + *
    To run this benchmark for either spark-2 or spark-3: * ./gradlew :iceberg-spark:iceberg-spark[2|3]:jmh * -PjmhIncludeRegex=IcebergSourceFlatORCDataReadBenchmark * -PjmhOutputPath=benchmark/iceberg-source-flat-orc-data-read-benchmark-result.txt @@ -70,11 +68,13 @@ public void tearDownBenchmark() throws IOException { public void readIcebergNonVectorized() { Map tableProperties = Maps.newHashMap(); tableProperties.put(SPLIT_OPEN_FILE_COST, Integer.toString(128 * 1024 * 1024)); - withTableProperties(tableProperties, () -> { - String tableLocation = table().location(); - Dataset df = spark().read().format("iceberg").load(tableLocation); - materialize(df); - }); + withTableProperties( + tableProperties, + () -> { + String tableLocation = table().location(); + Dataset df = spark().read().format("iceberg").load(tableLocation); + materialize(df); + }); } @Benchmark @@ -82,12 +82,18 @@ public void readIcebergNonVectorized() { public void readIcebergVectorized() { Map tableProperties = Maps.newHashMap(); tableProperties.put(SPLIT_OPEN_FILE_COST, Integer.toString(128 * 1024 * 1024)); - withTableProperties(tableProperties, () -> { - String tableLocation = table().location(); - Dataset df = spark().read().option(SparkReadOptions.VECTORIZATION_ENABLED, "true") - .format("iceberg").load(tableLocation); - materialize(df); - }); + withTableProperties( + tableProperties, + () -> { + String tableLocation = table().location(); + Dataset df = + spark() + .read() + .option(SparkReadOptions.VECTORIZATION_ENABLED, "true") + .format("iceberg") + .load(tableLocation); + materialize(df); + }); } @Benchmark @@ -96,10 +102,12 @@ public void readFileSourceVectorized() { Map conf = Maps.newHashMap(); conf.put(SQLConf.ORC_VECTORIZED_READER_ENABLED().key(), "true"); conf.put(SQLConf.FILES_OPEN_COST_IN_BYTES().key(), Integer.toString(128 * 1024 * 1024)); - withSQLConf(conf, () -> { - Dataset df = spark().read().orc(dataLocation()); - materialize(df); - }); + withSQLConf( + conf, + () -> { + Dataset df = spark().read().orc(dataLocation()); + materialize(df); + }); } @Benchmark @@ -108,10 +116,12 @@ public void readFileSourceNonVectorized() { Map conf = Maps.newHashMap(); conf.put(SQLConf.ORC_VECTORIZED_READER_ENABLED().key(), "false"); conf.put(SQLConf.FILES_OPEN_COST_IN_BYTES().key(), Integer.toString(128 * 1024 * 1024)); - withSQLConf(conf, () -> { - Dataset df = spark().read().orc(dataLocation()); - materialize(df); - }); + withSQLConf( + conf, + () -> { + Dataset df = spark().read().orc(dataLocation()); + materialize(df); + }); } @Benchmark @@ -119,11 +129,13 @@ public void readFileSourceNonVectorized() { public void readWithProjectionIcebergNonVectorized() { Map tableProperties = Maps.newHashMap(); tableProperties.put(SPLIT_OPEN_FILE_COST, Integer.toString(128 * 1024 * 1024)); - withTableProperties(tableProperties, () -> { - String tableLocation = table().location(); - Dataset df = spark().read().format("iceberg").load(tableLocation).select("longCol"); - materialize(df); - }); + withTableProperties( + tableProperties, + () -> { + String tableLocation = table().location(); + Dataset df = spark().read().format("iceberg").load(tableLocation).select("longCol"); + materialize(df); + }); } @Benchmark @@ -131,25 +143,33 @@ public void readWithProjectionIcebergNonVectorized() { public void readWithProjectionIcebergVectorized() { Map tableProperties = Maps.newHashMap(); tableProperties.put(SPLIT_OPEN_FILE_COST, Integer.toString(128 * 1024 * 1024)); - withTableProperties(tableProperties, () -> { - String tableLocation = 
table().location(); - Dataset df = spark().read().option(SparkReadOptions.VECTORIZATION_ENABLED, "true") - .format("iceberg").load(tableLocation).select("longCol"); - materialize(df); - }); + withTableProperties( + tableProperties, + () -> { + String tableLocation = table().location(); + Dataset df = + spark() + .read() + .option(SparkReadOptions.VECTORIZATION_ENABLED, "true") + .format("iceberg") + .load(tableLocation) + .select("longCol"); + materialize(df); + }); } - @Benchmark @Threads(1) public void readWithProjectionFileSourceVectorized() { Map conf = Maps.newHashMap(); conf.put(SQLConf.ORC_VECTORIZED_READER_ENABLED().key(), "true"); conf.put(SQLConf.FILES_OPEN_COST_IN_BYTES().key(), Integer.toString(128 * 1024 * 1024)); - withSQLConf(conf, () -> { - Dataset df = spark().read().orc(dataLocation()).select("longCol"); - materialize(df); - }); + withSQLConf( + conf, + () -> { + Dataset df = spark().read().orc(dataLocation()).select("longCol"); + materialize(df); + }); } @Benchmark @@ -158,27 +178,33 @@ public void readWithProjectionFileSourceNonVectorized() { Map conf = Maps.newHashMap(); conf.put(SQLConf.ORC_VECTORIZED_READER_ENABLED().key(), "false"); conf.put(SQLConf.FILES_OPEN_COST_IN_BYTES().key(), Integer.toString(128 * 1024 * 1024)); - withSQLConf(conf, () -> { - Dataset df = spark().read().orc(dataLocation()).select("longCol"); - materialize(df); - }); + withSQLConf( + conf, + () -> { + Dataset df = spark().read().orc(dataLocation()).select("longCol"); + materialize(df); + }); } private void appendData() { Map tableProperties = Maps.newHashMap(); tableProperties.put(DEFAULT_FILE_FORMAT, "orc"); - withTableProperties(tableProperties, () -> { - for (int fileNum = 1; fileNum <= NUM_FILES; fileNum++) { - Dataset df = spark().range(NUM_ROWS) - .withColumnRenamed("id", "longCol") - .withColumn("intCol", expr("CAST(longCol AS INT)")) - .withColumn("floatCol", expr("CAST(longCol AS FLOAT)")) - .withColumn("doubleCol", expr("CAST(longCol AS DOUBLE)")) - .withColumn("decimalCol", expr("CAST(longCol AS DECIMAL(20, 5))")) - .withColumn("dateCol", date_add(current_date(), fileNum)) - .withColumn("stringCol", expr("CAST(dateCol AS STRING)")); - appendAsFile(df); - } - }); + withTableProperties( + tableProperties, + () -> { + for (int fileNum = 1; fileNum <= NUM_FILES; fileNum++) { + Dataset df = + spark() + .range(NUM_ROWS) + .withColumnRenamed("id", "longCol") + .withColumn("intCol", expr("CAST(longCol AS INT)")) + .withColumn("floatCol", expr("CAST(longCol AS FLOAT)")) + .withColumn("doubleCol", expr("CAST(longCol AS DOUBLE)")) + .withColumn("decimalCol", expr("CAST(longCol AS DECIMAL(20, 5))")) + .withColumn("dateCol", date_add(current_date(), fileNum)) + .withColumn("stringCol", expr("CAST(dateCol AS STRING)")); + appendAsFile(df); + } + }); } } diff --git a/spark/v3.0/spark/src/jmh/java/org/apache/iceberg/spark/source/orc/IcebergSourceNestedListORCDataWriteBenchmark.java b/spark/v3.0/spark/src/jmh/java/org/apache/iceberg/spark/source/orc/IcebergSourceNestedListORCDataWriteBenchmark.java index 5e02a0c425d1..a703d5b83cbf 100644 --- a/spark/v3.0/spark/src/jmh/java/org/apache/iceberg/spark/source/orc/IcebergSourceNestedListORCDataWriteBenchmark.java +++ b/spark/v3.0/spark/src/jmh/java/org/apache/iceberg/spark/source/orc/IcebergSourceNestedListORCDataWriteBenchmark.java @@ -16,9 +16,12 @@ * specific language governing permissions and limitations * under the License. 
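The ORC read benchmarks above toggle vectorization at two different levels: the Iceberg source takes it as a per-read option (SparkReadOptions.VECTORIZATION_ENABLED), while the built-in ORC source follows the session conf spark.sql.orc.enableVectorizedReader. A short sketch showing both toggles side by side; the paths are placeholders and count() stands in for the benchmark's materialize:

import org.apache.iceberg.spark.SparkReadOptions;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SparkSession;

public class VectorizationToggles {

  public static void main(String[] args) {
    SparkSession spark =
        SparkSession.builder().master("local").appName("vectorization-sketch").getOrCreate();
    String tableLocation = "/tmp/iceberg-flat-orc-benchmark"; // placeholder

    // Iceberg source: vectorization is a read option on the DataFrameReader.
    Dataset<Row> icebergVectorized =
        spark
            .read()
            .option(SparkReadOptions.VECTORIZATION_ENABLED, "true")
            .format("iceberg")
            .load(tableLocation);

    // Built-in ORC source: vectorization is controlled through the session conf instead.
    spark.conf().set("spark.sql.orc.enableVectorizedReader", "true");
    Dataset<Row> fileSourceVectorized = spark.read().orc(tableLocation + "/data");

    icebergVectorized.count();
    fileSourceVectorized.count();
    spark.stop();
  }
}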
*/ - package org.apache.iceberg.spark.source.orc; +import static org.apache.spark.sql.functions.array_repeat; +import static org.apache.spark.sql.functions.expr; +import static org.apache.spark.sql.functions.struct; + import java.io.IOException; import java.util.Map; import org.apache.iceberg.relocated.com.google.common.collect.Maps; @@ -32,22 +35,18 @@ import org.openjdk.jmh.annotations.TearDown; import org.openjdk.jmh.annotations.Threads; -import static org.apache.spark.sql.functions.array_repeat; -import static org.apache.spark.sql.functions.expr; -import static org.apache.spark.sql.functions.struct; - /** - * A benchmark that evaluates the performance of writing nested Parquet data using Iceberg - * and the built-in file source in Spark. + * A benchmark that evaluates the performance of writing nested Parquet data using Iceberg and the + * built-in file source in Spark. * - * To run this benchmark for either spark-2 or spark-3: - * + *
    To run this benchmark for either spark-2 or spark-3: * ./gradlew :iceberg-spark:iceberg-spark[2|3]:jmh * -PjmhIncludeRegex=IcebergSourceNestedListORCDataWriteBenchmark * -PjmhOutputPath=benchmark/iceberg-source-nested-list-orc-data-write-benchmark-result.txt * */ -public class IcebergSourceNestedListORCDataWriteBenchmark extends IcebergSourceNestedListDataBenchmark { +public class IcebergSourceNestedListORCDataWriteBenchmark + extends IcebergSourceNestedListDataBenchmark { @Setup public void setupBenchmark() { @@ -67,8 +66,12 @@ public void tearDownBenchmark() throws IOException { @Threads(1) public void writeIceberg() { String tableLocation = table().location(); - benchmarkData().write().format("iceberg").option("write-format", "orc") - .mode(SaveMode.Append).save(tableLocation); + benchmarkData() + .write() + .format("iceberg") + .option("write-format", "orc") + .mode(SaveMode.Append) + .save(tableLocation); } @Benchmark @@ -76,11 +79,17 @@ public void writeIceberg() { public void writeIcebergDictionaryOff() { Map tableProperties = Maps.newHashMap(); tableProperties.put("orc.dictionary.key.threshold", "0"); - withTableProperties(tableProperties, () -> { - String tableLocation = table().location(); - benchmarkData().write().format("iceberg").option("write-format", "orc") - .mode(SaveMode.Append).save(tableLocation); - }); + withTableProperties( + tableProperties, + () -> { + String tableLocation = table().location(); + benchmarkData() + .write() + .format("iceberg") + .option("write-format", "orc") + .mode(SaveMode.Append) + .save(tableLocation); + }); } @Benchmark @@ -90,10 +99,11 @@ public void writeFileSource() { } private Dataset benchmarkData() { - return spark().range(numRows) - .withColumn("outerlist", array_repeat(struct( - expr("array_repeat(CAST(id AS string), 1000) AS innerlist")), - 10)) + return spark() + .range(numRows) + .withColumn( + "outerlist", + array_repeat(struct(expr("array_repeat(CAST(id AS string), 1000) AS innerlist")), 10)) .coalesce(1); } } diff --git a/spark/v3.0/spark/src/jmh/java/org/apache/iceberg/spark/source/orc/IcebergSourceNestedORCDataReadBenchmark.java b/spark/v3.0/spark/src/jmh/java/org/apache/iceberg/spark/source/orc/IcebergSourceNestedORCDataReadBenchmark.java index 259804b4db21..bd6cb96382d1 100644 --- a/spark/v3.0/spark/src/jmh/java/org/apache/iceberg/spark/source/orc/IcebergSourceNestedORCDataReadBenchmark.java +++ b/spark/v3.0/spark/src/jmh/java/org/apache/iceberg/spark/source/orc/IcebergSourceNestedORCDataReadBenchmark.java @@ -16,9 +16,14 @@ * specific language governing permissions and limitations * under the License. 
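The benchmarkData() hunk above is a compact way to synthesize deeply nested rows: array_repeat over a struct column produces the outer list, and a SQL-level array_repeat inside expr produces the inner list of strings. A trimmed sketch with smaller, placeholder sizes so the resulting shape is easy to inspect (the benchmark uses 10 outer elements and 1000 inner strings):

import static org.apache.spark.sql.functions.array_repeat;
import static org.apache.spark.sql.functions.expr;
import static org.apache.spark.sql.functions.struct;

import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SparkSession;

public class NestedListBenchmarkData {

  public static void main(String[] args) {
    SparkSession spark =
        SparkSession.builder().master("local").appName("nested-list-sketch").getOrCreate();

    // 3 outer elements per row, each carrying an inner list of 5 strings.
    Dataset<Row> df =
        spark
            .range(10)
            .withColumn(
                "outerlist",
                array_repeat(struct(expr("array_repeat(CAST(id AS string), 5) AS innerlist")), 3))
            .coalesce(1);

    df.printSchema();
    df.show(2, false);
    spark.stop();
  }
}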
*/ - package org.apache.iceberg.spark.source.orc; +import static org.apache.iceberg.TableProperties.DEFAULT_FILE_FORMAT; +import static org.apache.iceberg.TableProperties.SPLIT_OPEN_FILE_COST; +import static org.apache.spark.sql.functions.expr; +import static org.apache.spark.sql.functions.lit; +import static org.apache.spark.sql.functions.struct; + import java.io.IOException; import java.util.Map; import org.apache.iceberg.relocated.com.google.common.collect.Maps; @@ -32,19 +37,11 @@ import org.openjdk.jmh.annotations.TearDown; import org.openjdk.jmh.annotations.Threads; -import static org.apache.iceberg.TableProperties.DEFAULT_FILE_FORMAT; -import static org.apache.iceberg.TableProperties.SPLIT_OPEN_FILE_COST; -import static org.apache.spark.sql.functions.expr; -import static org.apache.spark.sql.functions.lit; -import static org.apache.spark.sql.functions.struct; - - /** - * A benchmark that evaluates the performance of reading ORC data with a flat schema - * using Iceberg and the built-in file source in Spark. + * A benchmark that evaluates the performance of reading ORC data with a flat schema using Iceberg + * and the built-in file source in Spark. * - * To run this benchmark for either spark-2 or spark-3: - * + *
    To run this benchmark for either spark-2 or spark-3: * ./gradlew :iceberg-spark:iceberg-spark[2|3]:jmh * -PjmhIncludeRegex=IcebergSourceNestedORCDataReadBenchmark * -PjmhOutputPath=benchmark/iceberg-source-nested-orc-data-read-benchmark-result.txt @@ -72,11 +69,13 @@ public void tearDownBenchmark() throws IOException { public void readIcebergNonVectorized() { Map tableProperties = Maps.newHashMap(); tableProperties.put(SPLIT_OPEN_FILE_COST, Integer.toString(128 * 1024 * 1024)); - withTableProperties(tableProperties, () -> { - String tableLocation = table().location(); - Dataset df = spark().read().format("iceberg").load(tableLocation); - materialize(df); - }); + withTableProperties( + tableProperties, + () -> { + String tableLocation = table().location(); + Dataset df = spark().read().format("iceberg").load(tableLocation); + materialize(df); + }); } @Benchmark @@ -84,12 +83,18 @@ public void readIcebergNonVectorized() { public void readIcebergVectorized() { Map tableProperties = Maps.newHashMap(); tableProperties.put(SPLIT_OPEN_FILE_COST, Integer.toString(128 * 1024 * 1024)); - withTableProperties(tableProperties, () -> { - String tableLocation = table().location(); - Dataset df = spark().read().option(SparkReadOptions.VECTORIZATION_ENABLED, "true") - .format("iceberg").load(tableLocation); - materialize(df); - }); + withTableProperties( + tableProperties, + () -> { + String tableLocation = table().location(); + Dataset df = + spark() + .read() + .option(SparkReadOptions.VECTORIZATION_ENABLED, "true") + .format("iceberg") + .load(tableLocation); + materialize(df); + }); } @Benchmark @@ -98,10 +103,12 @@ public void readFileSourceNonVectorized() { Map conf = Maps.newHashMap(); conf.put(SQLConf.ORC_VECTORIZED_READER_ENABLED().key(), "false"); conf.put(SQLConf.FILES_OPEN_COST_IN_BYTES().key(), Integer.toString(128 * 1024 * 1024)); - withSQLConf(conf, () -> { - Dataset df = spark().read().orc(dataLocation()); - materialize(df); - }); + withSQLConf( + conf, + () -> { + Dataset df = spark().read().orc(dataLocation()); + materialize(df); + }); } @Benchmark @@ -109,11 +116,14 @@ public void readFileSourceNonVectorized() { public void readWithProjectionIcebergNonVectorized() { Map tableProperties = Maps.newHashMap(); tableProperties.put(SPLIT_OPEN_FILE_COST, Integer.toString(128 * 1024 * 1024)); - withTableProperties(tableProperties, () -> { - String tableLocation = table().location(); - Dataset df = spark().read().format("iceberg").load(tableLocation).selectExpr("nested.col3"); - materialize(df); - }); + withTableProperties( + tableProperties, + () -> { + String tableLocation = table().location(); + Dataset df = + spark().read().format("iceberg").load(tableLocation).selectExpr("nested.col3"); + materialize(df); + }); } @Benchmark @@ -121,12 +131,19 @@ public void readWithProjectionIcebergNonVectorized() { public void readWithProjectionIcebergVectorized() { Map tableProperties = Maps.newHashMap(); tableProperties.put(SPLIT_OPEN_FILE_COST, Integer.toString(128 * 1024 * 1024)); - withTableProperties(tableProperties, () -> { - String tableLocation = table().location(); - Dataset df = spark().read().option(SparkReadOptions.VECTORIZATION_ENABLED, "true") - .format("iceberg").load(tableLocation).selectExpr("nested.col3"); - materialize(df); - }); + withTableProperties( + tableProperties, + () -> { + String tableLocation = table().location(); + Dataset df = + spark() + .read() + .option(SparkReadOptions.VECTORIZATION_ENABLED, "true") + .format("iceberg") + .load(tableLocation) + 
.selectExpr("nested.col3"); + materialize(df); + }); } @Benchmark @@ -135,27 +152,32 @@ public void readWithProjectionFileSourceNonVectorized() { Map conf = Maps.newHashMap(); conf.put(SQLConf.ORC_VECTORIZED_READER_ENABLED().key(), "false"); conf.put(SQLConf.FILES_OPEN_COST_IN_BYTES().key(), Integer.toString(128 * 1024 * 1024)); - withSQLConf(conf, () -> { - Dataset df = spark().read().orc(dataLocation()).selectExpr("nested.col3"); - materialize(df); - }); + withSQLConf( + conf, + () -> { + Dataset df = spark().read().orc(dataLocation()).selectExpr("nested.col3"); + materialize(df); + }); } private void appendData() { Map tableProperties = Maps.newHashMap(); tableProperties.put(DEFAULT_FILE_FORMAT, "orc"); - withTableProperties(tableProperties, () -> { - for (int fileNum = 0; fileNum < NUM_FILES; fileNum++) { - Dataset df = spark().range(NUM_ROWS) - .withColumn( - "nested", - struct( - expr("CAST(id AS string) AS col1"), - expr("CAST(id AS double) AS col2"), - lit(fileNum).cast("long").as("col3") - )); - appendAsFile(df); - } - }); + withTableProperties( + tableProperties, + () -> { + for (int fileNum = 0; fileNum < NUM_FILES; fileNum++) { + Dataset df = + spark() + .range(NUM_ROWS) + .withColumn( + "nested", + struct( + expr("CAST(id AS string) AS col1"), + expr("CAST(id AS double) AS col2"), + lit(fileNum).cast("long").as("col3"))); + appendAsFile(df); + } + }); } } diff --git a/spark/v3.0/spark/src/jmh/java/org/apache/iceberg/spark/source/parquet/IcebergSourceFlatParquetDataFilterBenchmark.java b/spark/v3.0/spark/src/jmh/java/org/apache/iceberg/spark/source/parquet/IcebergSourceFlatParquetDataFilterBenchmark.java index 8136db5829e2..b4e98bc9e2d3 100644 --- a/spark/v3.0/spark/src/jmh/java/org/apache/iceberg/spark/source/parquet/IcebergSourceFlatParquetDataFilterBenchmark.java +++ b/spark/v3.0/spark/src/jmh/java/org/apache/iceberg/spark/source/parquet/IcebergSourceFlatParquetDataFilterBenchmark.java @@ -16,9 +16,13 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.source.parquet; +import static org.apache.iceberg.TableProperties.SPLIT_OPEN_FILE_COST; +import static org.apache.spark.sql.functions.current_date; +import static org.apache.spark.sql.functions.date_add; +import static org.apache.spark.sql.functions.expr; + import java.io.IOException; import java.util.Map; import org.apache.iceberg.relocated.com.google.common.collect.Maps; @@ -31,21 +35,15 @@ import org.openjdk.jmh.annotations.TearDown; import org.openjdk.jmh.annotations.Threads; -import static org.apache.iceberg.TableProperties.SPLIT_OPEN_FILE_COST; -import static org.apache.spark.sql.functions.current_date; -import static org.apache.spark.sql.functions.date_add; -import static org.apache.spark.sql.functions.expr; - /** * A benchmark that evaluates the file skipping capabilities in the Spark data source for Iceberg. * - * This class uses a dataset with a flat schema, where the records are clustered according to the + *
    This class uses a dataset with a flat schema, where the records are clustered according to the * column used in the filter predicate. * - * The performance is compared to the built-in file source in Spark. + *
    The performance is compared to the built-in file source in Spark. * - * To run this benchmark for either spark-2 or spark-3: - * + *
    To run this benchmark for either spark-2 or spark-3: * ./gradlew :iceberg-spark:iceberg-spark[2|3]:jmh * -PjmhIncludeRegex=IcebergSourceFlatParquetDataFilterBenchmark * -PjmhOutputPath=benchmark/iceberg-source-flat-parquet-data-filter-benchmark-result.txt @@ -74,11 +72,14 @@ public void tearDownBenchmark() throws IOException { public void readWithFilterIceberg() { Map tableProperties = Maps.newHashMap(); tableProperties.put(SPLIT_OPEN_FILE_COST, Integer.toString(128 * 1024 * 1024)); - withTableProperties(tableProperties, () -> { - String tableLocation = table().location(); - Dataset df = spark().read().format("iceberg").load(tableLocation).filter(FILTER_COND); - materialize(df); - }); + withTableProperties( + tableProperties, + () -> { + String tableLocation = table().location(); + Dataset df = + spark().read().format("iceberg").load(tableLocation).filter(FILTER_COND); + materialize(df); + }); } @Benchmark @@ -87,10 +88,12 @@ public void readWithFilterFileSourceVectorized() { Map conf = Maps.newHashMap(); conf.put(SQLConf.PARQUET_VECTORIZED_READER_ENABLED().key(), "true"); conf.put(SQLConf.FILES_OPEN_COST_IN_BYTES().key(), Integer.toString(128 * 1024 * 1024)); - withSQLConf(conf, () -> { - Dataset df = spark().read().parquet(dataLocation()).filter(FILTER_COND); - materialize(df); - }); + withSQLConf( + conf, + () -> { + Dataset df = spark().read().parquet(dataLocation()).filter(FILTER_COND); + materialize(df); + }); } @Benchmark @@ -99,23 +102,27 @@ public void readWithFilterFileSourceNonVectorized() { Map conf = Maps.newHashMap(); conf.put(SQLConf.PARQUET_VECTORIZED_READER_ENABLED().key(), "false"); conf.put(SQLConf.FILES_OPEN_COST_IN_BYTES().key(), Integer.toString(128 * 1024 * 1024)); - withSQLConf(conf, () -> { - Dataset df = spark().read().parquet(dataLocation()).filter(FILTER_COND); - materialize(df); - }); + withSQLConf( + conf, + () -> { + Dataset df = spark().read().parquet(dataLocation()).filter(FILTER_COND); + materialize(df); + }); } private void appendData() { for (int fileNum = 1; fileNum < NUM_FILES; fileNum++) { - Dataset df = spark().range(NUM_ROWS) - .withColumnRenamed("id", "longCol") - .withColumn("intCol", expr("CAST(longCol AS INT)")) - .withColumn("floatCol", expr("CAST(longCol AS FLOAT)")) - .withColumn("doubleCol", expr("CAST(longCol AS DOUBLE)")) - .withColumn("decimalCol", expr("CAST(longCol AS DECIMAL(20, 5))")) - .withColumn("dateCol", date_add(current_date(), fileNum)) - .withColumn("timestampCol", expr("TO_TIMESTAMP(dateCol)")) - .withColumn("stringCol", expr("CAST(dateCol AS STRING)")); + Dataset df = + spark() + .range(NUM_ROWS) + .withColumnRenamed("id", "longCol") + .withColumn("intCol", expr("CAST(longCol AS INT)")) + .withColumn("floatCol", expr("CAST(longCol AS FLOAT)")) + .withColumn("doubleCol", expr("CAST(longCol AS DOUBLE)")) + .withColumn("decimalCol", expr("CAST(longCol AS DECIMAL(20, 5))")) + .withColumn("dateCol", date_add(current_date(), fileNum)) + .withColumn("timestampCol", expr("TO_TIMESTAMP(dateCol)")) + .withColumn("stringCol", expr("CAST(dateCol AS STRING)")); appendAsFile(df); } } diff --git a/spark/v3.0/spark/src/jmh/java/org/apache/iceberg/spark/source/parquet/IcebergSourceFlatParquetDataReadBenchmark.java b/spark/v3.0/spark/src/jmh/java/org/apache/iceberg/spark/source/parquet/IcebergSourceFlatParquetDataReadBenchmark.java index 44f758ffec6f..f287318e1e20 100644 --- a/spark/v3.0/spark/src/jmh/java/org/apache/iceberg/spark/source/parquet/IcebergSourceFlatParquetDataReadBenchmark.java +++ 
b/spark/v3.0/spark/src/jmh/java/org/apache/iceberg/spark/source/parquet/IcebergSourceFlatParquetDataReadBenchmark.java @@ -16,9 +16,13 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.source.parquet; +import static org.apache.iceberg.TableProperties.SPLIT_OPEN_FILE_COST; +import static org.apache.spark.sql.functions.current_date; +import static org.apache.spark.sql.functions.date_add; +import static org.apache.spark.sql.functions.expr; + import java.io.IOException; import java.util.Map; import org.apache.iceberg.relocated.com.google.common.collect.Maps; @@ -31,17 +35,11 @@ import org.openjdk.jmh.annotations.TearDown; import org.openjdk.jmh.annotations.Threads; -import static org.apache.iceberg.TableProperties.SPLIT_OPEN_FILE_COST; -import static org.apache.spark.sql.functions.current_date; -import static org.apache.spark.sql.functions.date_add; -import static org.apache.spark.sql.functions.expr; - /** - * A benchmark that evaluates the performance of reading Parquet data with a flat schema - * using Iceberg and the built-in file source in Spark. + * A benchmark that evaluates the performance of reading Parquet data with a flat schema using + * Iceberg and the built-in file source in Spark. * - * To run this benchmark for either spark-2 or spark-3: - * + *
<p>
    To run this benchmark for either spark-2 or spark-3: * ./gradlew :iceberg-spark:iceberg-spark[2|3]:jmh * -PjmhIncludeRegex=IcebergSourceFlatParquetDataReadBenchmark * -PjmhOutputPath=benchmark/iceberg-source-flat-parquet-data-read-benchmark-result.txt @@ -69,11 +67,13 @@ public void tearDownBenchmark() throws IOException { public void readIceberg() { Map tableProperties = Maps.newHashMap(); tableProperties.put(SPLIT_OPEN_FILE_COST, Integer.toString(128 * 1024 * 1024)); - withTableProperties(tableProperties, () -> { - String tableLocation = table().location(); - Dataset df = spark().read().format("iceberg").load(tableLocation); - materialize(df); - }); + withTableProperties( + tableProperties, + () -> { + String tableLocation = table().location(); + Dataset df = spark().read().format("iceberg").load(tableLocation); + materialize(df); + }); } @Benchmark @@ -82,10 +82,12 @@ public void readFileSourceVectorized() { Map conf = Maps.newHashMap(); conf.put(SQLConf.PARQUET_VECTORIZED_READER_ENABLED().key(), "true"); conf.put(SQLConf.FILES_OPEN_COST_IN_BYTES().key(), Integer.toString(128 * 1024 * 1024)); - withSQLConf(conf, () -> { - Dataset df = spark().read().parquet(dataLocation()); - materialize(df); - }); + withSQLConf( + conf, + () -> { + Dataset df = spark().read().parquet(dataLocation()); + materialize(df); + }); } @Benchmark @@ -94,10 +96,12 @@ public void readFileSourceNonVectorized() { Map conf = Maps.newHashMap(); conf.put(SQLConf.PARQUET_VECTORIZED_READER_ENABLED().key(), "false"); conf.put(SQLConf.FILES_OPEN_COST_IN_BYTES().key(), Integer.toString(128 * 1024 * 1024)); - withSQLConf(conf, () -> { - Dataset df = spark().read().parquet(dataLocation()); - materialize(df); - }); + withSQLConf( + conf, + () -> { + Dataset df = spark().read().parquet(dataLocation()); + materialize(df); + }); } @Benchmark @@ -105,11 +109,13 @@ public void readFileSourceNonVectorized() { public void readWithProjectionIceberg() { Map tableProperties = Maps.newHashMap(); tableProperties.put(SPLIT_OPEN_FILE_COST, Integer.toString(128 * 1024 * 1024)); - withTableProperties(tableProperties, () -> { - String tableLocation = table().location(); - Dataset df = spark().read().format("iceberg").load(tableLocation).select("longCol"); - materialize(df); - }); + withTableProperties( + tableProperties, + () -> { + String tableLocation = table().location(); + Dataset df = spark().read().format("iceberg").load(tableLocation).select("longCol"); + materialize(df); + }); } @Benchmark @@ -118,10 +124,12 @@ public void readWithProjectionFileSourceVectorized() { Map conf = Maps.newHashMap(); conf.put(SQLConf.PARQUET_VECTORIZED_READER_ENABLED().key(), "true"); conf.put(SQLConf.FILES_OPEN_COST_IN_BYTES().key(), Integer.toString(128 * 1024 * 1024)); - withSQLConf(conf, () -> { - Dataset df = spark().read().parquet(dataLocation()).select("longCol"); - materialize(df); - }); + withSQLConf( + conf, + () -> { + Dataset df = spark().read().parquet(dataLocation()).select("longCol"); + materialize(df); + }); } @Benchmark @@ -130,23 +138,27 @@ public void readWithProjectionFileSourceNonVectorized() { Map conf = Maps.newHashMap(); conf.put(SQLConf.PARQUET_VECTORIZED_READER_ENABLED().key(), "false"); conf.put(SQLConf.FILES_OPEN_COST_IN_BYTES().key(), Integer.toString(128 * 1024 * 1024)); - withSQLConf(conf, () -> { - Dataset df = spark().read().parquet(dataLocation()).select("longCol"); - materialize(df); - }); + withSQLConf( + conf, + () -> { + Dataset df = spark().read().parquet(dataLocation()).select("longCol"); + materialize(df); + }); 
} private void appendData() { for (int fileNum = 1; fileNum <= NUM_FILES; fileNum++) { - Dataset df = spark().range(NUM_ROWS) - .withColumnRenamed("id", "longCol") - .withColumn("intCol", expr("CAST(longCol AS INT)")) - .withColumn("floatCol", expr("CAST(longCol AS FLOAT)")) - .withColumn("doubleCol", expr("CAST(longCol AS DOUBLE)")) - .withColumn("decimalCol", expr("CAST(longCol AS DECIMAL(20, 5))")) - .withColumn("dateCol", date_add(current_date(), fileNum)) - .withColumn("timestampCol", expr("TO_TIMESTAMP(dateCol)")) - .withColumn("stringCol", expr("CAST(dateCol AS STRING)")); + Dataset df = + spark() + .range(NUM_ROWS) + .withColumnRenamed("id", "longCol") + .withColumn("intCol", expr("CAST(longCol AS INT)")) + .withColumn("floatCol", expr("CAST(longCol AS FLOAT)")) + .withColumn("doubleCol", expr("CAST(longCol AS DOUBLE)")) + .withColumn("decimalCol", expr("CAST(longCol AS DECIMAL(20, 5))")) + .withColumn("dateCol", date_add(current_date(), fileNum)) + .withColumn("timestampCol", expr("TO_TIMESTAMP(dateCol)")) + .withColumn("stringCol", expr("CAST(dateCol AS STRING)")); appendAsFile(df); } } diff --git a/spark/v3.0/spark/src/jmh/java/org/apache/iceberg/spark/source/parquet/IcebergSourceFlatParquetDataWriteBenchmark.java b/spark/v3.0/spark/src/jmh/java/org/apache/iceberg/spark/source/parquet/IcebergSourceFlatParquetDataWriteBenchmark.java index 8a2dc0dfe746..f30834d129ea 100644 --- a/spark/v3.0/spark/src/jmh/java/org/apache/iceberg/spark/source/parquet/IcebergSourceFlatParquetDataWriteBenchmark.java +++ b/spark/v3.0/spark/src/jmh/java/org/apache/iceberg/spark/source/parquet/IcebergSourceFlatParquetDataWriteBenchmark.java @@ -16,9 +16,10 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.source.parquet; +import static org.apache.spark.sql.functions.expr; + import java.io.IOException; import java.util.Map; import org.apache.iceberg.relocated.com.google.common.collect.Maps; @@ -32,14 +33,11 @@ import org.openjdk.jmh.annotations.TearDown; import org.openjdk.jmh.annotations.Threads; -import static org.apache.spark.sql.functions.expr; - /** - * A benchmark that evaluates the performance of writing Parquet data with a flat schema - * using Iceberg and the built-in file source in Spark. + * A benchmark that evaluates the performance of writing Parquet data with a flat schema using + * Iceberg and the built-in file source in Spark. * - * To run this benchmark for either spark-2 or spark-3: - * + *
<p>
    To run this benchmark for either spark-2 or spark-3: * ./gradlew :iceberg-spark:iceberg-spark[2|3]:jmh * -PjmhIncludeRegex=IcebergSourceFlatParquetDataWriteBenchmark * -PjmhOutputPath=benchmark/iceberg-source-flat-parquet-data-write-benchmark-result.txt @@ -76,7 +74,8 @@ public void writeFileSource() { } private Dataset benchmarkData() { - return spark().range(NUM_ROWS) + return spark() + .range(NUM_ROWS) .withColumnRenamed("id", "longCol") .withColumn("intCol", expr("CAST(longCol AS INT)")) .withColumn("floatCol", expr("CAST(longCol AS FLOAT)")) diff --git a/spark/v3.0/spark/src/jmh/java/org/apache/iceberg/spark/source/parquet/IcebergSourceNestedListParquetDataWriteBenchmark.java b/spark/v3.0/spark/src/jmh/java/org/apache/iceberg/spark/source/parquet/IcebergSourceNestedListParquetDataWriteBenchmark.java index 6709fff9729a..061b6ea88fe4 100644 --- a/spark/v3.0/spark/src/jmh/java/org/apache/iceberg/spark/source/parquet/IcebergSourceNestedListParquetDataWriteBenchmark.java +++ b/spark/v3.0/spark/src/jmh/java/org/apache/iceberg/spark/source/parquet/IcebergSourceNestedListParquetDataWriteBenchmark.java @@ -16,9 +16,12 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.source.parquet; +import static org.apache.spark.sql.functions.array_repeat; +import static org.apache.spark.sql.functions.expr; +import static org.apache.spark.sql.functions.struct; + import java.io.IOException; import java.util.Map; import org.apache.iceberg.relocated.com.google.common.collect.Maps; @@ -33,22 +36,18 @@ import org.openjdk.jmh.annotations.TearDown; import org.openjdk.jmh.annotations.Threads; -import static org.apache.spark.sql.functions.array_repeat; -import static org.apache.spark.sql.functions.expr; -import static org.apache.spark.sql.functions.struct; - /** - * A benchmark that evaluates the performance of writing nested Parquet data using Iceberg - * and the built-in file source in Spark. + * A benchmark that evaluates the performance of writing nested Parquet data using Iceberg and the + * built-in file source in Spark. * - * To run this benchmark for either spark-2 or spark-3: - * + *
<p>
    To run this benchmark for either spark-2 or spark-3: * ./gradlew :iceberg-spark:iceberg-spark[2|3]:jmh * -PjmhIncludeRegex=IcebergSourceNestedListParquetDataWriteBenchmark * -PjmhOutputPath=benchmark/iceberg-source-nested-list-parquet-data-write-benchmark-result.txt * */ -public class IcebergSourceNestedListParquetDataWriteBenchmark extends IcebergSourceNestedListDataBenchmark { +public class IcebergSourceNestedListParquetDataWriteBenchmark + extends IcebergSourceNestedListDataBenchmark { @Setup public void setupBenchmark() { @@ -80,10 +79,11 @@ public void writeFileSource() { } private Dataset benchmarkData() { - return spark().range(numRows) - .withColumn("outerlist", array_repeat(struct( - expr("array_repeat(CAST(id AS string), 1000) AS innerlist")), - 10)) + return spark() + .range(numRows) + .withColumn( + "outerlist", + array_repeat(struct(expr("array_repeat(CAST(id AS string), 1000) AS innerlist")), 10)) .coalesce(1); } } diff --git a/spark/v3.0/spark/src/jmh/java/org/apache/iceberg/spark/source/parquet/IcebergSourceNestedParquetDataFilterBenchmark.java b/spark/v3.0/spark/src/jmh/java/org/apache/iceberg/spark/source/parquet/IcebergSourceNestedParquetDataFilterBenchmark.java index 97443a1adef0..b820021df4ec 100644 --- a/spark/v3.0/spark/src/jmh/java/org/apache/iceberg/spark/source/parquet/IcebergSourceNestedParquetDataFilterBenchmark.java +++ b/spark/v3.0/spark/src/jmh/java/org/apache/iceberg/spark/source/parquet/IcebergSourceNestedParquetDataFilterBenchmark.java @@ -16,9 +16,13 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.source.parquet; +import static org.apache.iceberg.TableProperties.SPLIT_OPEN_FILE_COST; +import static org.apache.spark.sql.functions.expr; +import static org.apache.spark.sql.functions.lit; +import static org.apache.spark.sql.functions.struct; + import java.io.IOException; import java.util.Map; import org.apache.iceberg.relocated.com.google.common.collect.Maps; @@ -31,27 +35,22 @@ import org.openjdk.jmh.annotations.TearDown; import org.openjdk.jmh.annotations.Threads; -import static org.apache.iceberg.TableProperties.SPLIT_OPEN_FILE_COST; -import static org.apache.spark.sql.functions.expr; -import static org.apache.spark.sql.functions.lit; -import static org.apache.spark.sql.functions.struct; - /** * A benchmark that evaluates the file skipping capabilities in the Spark data source for Iceberg. * - * This class uses a dataset with nested data, where the records are clustered according to the + *
<p>
    This class uses a dataset with nested data, where the records are clustered according to the * column used in the filter predicate. * - * The performance is compared to the built-in file source in Spark. + *
<p>
    The performance is compared to the built-in file source in Spark. * - * To run this benchmark for either spark-2 or spark-3: - * + *
<p>
    To run this benchmark for either spark-2 or spark-3: * ./gradlew :iceberg-spark:iceberg-spark[2|3]:jmh * -PjmhIncludeRegex=IcebergSourceNestedParquetDataFilterBenchmark * -PjmhOutputPath=benchmark/iceberg-source-nested-parquet-data-filter-benchmark-result.txt * */ -public class IcebergSourceNestedParquetDataFilterBenchmark extends IcebergSourceNestedDataBenchmark { +public class IcebergSourceNestedParquetDataFilterBenchmark + extends IcebergSourceNestedDataBenchmark { private static final String FILTER_COND = "nested.col3 == 0"; private static final int NUM_FILES = 500; @@ -74,11 +73,14 @@ public void tearDownBenchmark() throws IOException { public void readWithFilterIceberg() { Map tableProperties = Maps.newHashMap(); tableProperties.put(SPLIT_OPEN_FILE_COST, Integer.toString(128 * 1024 * 1024)); - withTableProperties(tableProperties, () -> { - String tableLocation = table().location(); - Dataset df = spark().read().format("iceberg").load(tableLocation).filter(FILTER_COND); - materialize(df); - }); + withTableProperties( + tableProperties, + () -> { + String tableLocation = table().location(); + Dataset df = + spark().read().format("iceberg").load(tableLocation).filter(FILTER_COND); + materialize(df); + }); } @Benchmark @@ -87,10 +89,12 @@ public void readWithFilterFileSourceVectorized() { Map conf = Maps.newHashMap(); conf.put(SQLConf.PARQUET_VECTORIZED_READER_ENABLED().key(), "true"); conf.put(SQLConf.FILES_OPEN_COST_IN_BYTES().key(), Integer.toString(128 * 1024 * 1024)); - withSQLConf(conf, () -> { - Dataset df = spark().read().parquet(dataLocation()).filter(FILTER_COND); - materialize(df); - }); + withSQLConf( + conf, + () -> { + Dataset df = spark().read().parquet(dataLocation()).filter(FILTER_COND); + materialize(df); + }); } @Benchmark @@ -99,22 +103,25 @@ public void readWithFilterFileSourceNonVectorized() { Map conf = Maps.newHashMap(); conf.put(SQLConf.PARQUET_VECTORIZED_READER_ENABLED().key(), "false"); conf.put(SQLConf.FILES_OPEN_COST_IN_BYTES().key(), Integer.toString(128 * 1024 * 1024)); - withSQLConf(conf, () -> { - Dataset df = spark().read().parquet(dataLocation()).filter(FILTER_COND); - materialize(df); - }); + withSQLConf( + conf, + () -> { + Dataset df = spark().read().parquet(dataLocation()).filter(FILTER_COND); + materialize(df); + }); } private void appendData() { for (int fileNum = 1; fileNum <= NUM_FILES; fileNum++) { - Dataset df = spark().range(NUM_ROWS) - .withColumn( - "nested", - struct( - expr("CAST(id AS string) AS col1"), - expr("CAST(id AS double) AS col2"), - lit(fileNum).cast("long").as("col3") - )); + Dataset df = + spark() + .range(NUM_ROWS) + .withColumn( + "nested", + struct( + expr("CAST(id AS string) AS col1"), + expr("CAST(id AS double) AS col2"), + lit(fileNum).cast("long").as("col3"))); appendAsFile(df); } } diff --git a/spark/v3.0/spark/src/jmh/java/org/apache/iceberg/spark/source/parquet/IcebergSourceNestedParquetDataReadBenchmark.java b/spark/v3.0/spark/src/jmh/java/org/apache/iceberg/spark/source/parquet/IcebergSourceNestedParquetDataReadBenchmark.java index 8d3b9e50c870..401250585a6e 100644 --- a/spark/v3.0/spark/src/jmh/java/org/apache/iceberg/spark/source/parquet/IcebergSourceNestedParquetDataReadBenchmark.java +++ b/spark/v3.0/spark/src/jmh/java/org/apache/iceberg/spark/source/parquet/IcebergSourceNestedParquetDataReadBenchmark.java @@ -16,9 +16,13 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.spark.source.parquet; +import static org.apache.iceberg.TableProperties.SPLIT_OPEN_FILE_COST; +import static org.apache.spark.sql.functions.expr; +import static org.apache.spark.sql.functions.lit; +import static org.apache.spark.sql.functions.struct; + import java.io.IOException; import java.util.Map; import org.apache.iceberg.relocated.com.google.common.collect.Maps; @@ -31,17 +35,11 @@ import org.openjdk.jmh.annotations.TearDown; import org.openjdk.jmh.annotations.Threads; -import static org.apache.iceberg.TableProperties.SPLIT_OPEN_FILE_COST; -import static org.apache.spark.sql.functions.expr; -import static org.apache.spark.sql.functions.lit; -import static org.apache.spark.sql.functions.struct; - /** - * A benchmark that evaluates the performance of reading nested Parquet data using Iceberg - * and the built-in file source in Spark. + * A benchmark that evaluates the performance of reading nested Parquet data using Iceberg and the + * built-in file source in Spark. * - * To run this benchmark for either spark-2 or spark-3: - * + *
<p>
    To run this benchmark for either spark-2 or spark-3: * ./gradlew :iceberg-spark:iceberg-spark[2|3]:jmh * -PjmhIncludeRegex=IcebergSourceNestedParquetDataReadBenchmark * -PjmhOutputPath=benchmark/iceberg-source-nested-parquet-data-read-benchmark-result.txt @@ -69,11 +67,13 @@ public void tearDownBenchmark() throws IOException { public void readIceberg() { Map tableProperties = Maps.newHashMap(); tableProperties.put(SPLIT_OPEN_FILE_COST, Integer.toString(128 * 1024 * 1024)); - withTableProperties(tableProperties, () -> { - String tableLocation = table().location(); - Dataset df = spark().read().format("iceberg").load(tableLocation); - materialize(df); - }); + withTableProperties( + tableProperties, + () -> { + String tableLocation = table().location(); + Dataset df = spark().read().format("iceberg").load(tableLocation); + materialize(df); + }); } @Benchmark @@ -82,10 +82,12 @@ public void readFileSourceVectorized() { Map conf = Maps.newHashMap(); conf.put(SQLConf.PARQUET_VECTORIZED_READER_ENABLED().key(), "true"); conf.put(SQLConf.FILES_OPEN_COST_IN_BYTES().key(), Integer.toString(128 * 1024 * 1024)); - withSQLConf(conf, () -> { - Dataset df = spark().read().parquet(dataLocation()); - materialize(df); - }); + withSQLConf( + conf, + () -> { + Dataset df = spark().read().parquet(dataLocation()); + materialize(df); + }); } @Benchmark @@ -94,10 +96,12 @@ public void readFileSourceNonVectorized() { Map conf = Maps.newHashMap(); conf.put(SQLConf.PARQUET_VECTORIZED_READER_ENABLED().key(), "false"); conf.put(SQLConf.FILES_OPEN_COST_IN_BYTES().key(), Integer.toString(128 * 1024 * 1024)); - withSQLConf(conf, () -> { - Dataset df = spark().read().parquet(dataLocation()); - materialize(df); - }); + withSQLConf( + conf, + () -> { + Dataset df = spark().read().parquet(dataLocation()); + materialize(df); + }); } @Benchmark @@ -105,11 +109,14 @@ public void readFileSourceNonVectorized() { public void readWithProjectionIceberg() { Map tableProperties = Maps.newHashMap(); tableProperties.put(SPLIT_OPEN_FILE_COST, Integer.toString(128 * 1024 * 1024)); - withTableProperties(tableProperties, () -> { - String tableLocation = table().location(); - Dataset df = spark().read().format("iceberg").load(tableLocation).selectExpr("nested.col3"); - materialize(df); - }); + withTableProperties( + tableProperties, + () -> { + String tableLocation = table().location(); + Dataset df = + spark().read().format("iceberg").load(tableLocation).selectExpr("nested.col3"); + materialize(df); + }); } @Benchmark @@ -119,10 +126,12 @@ public void readWithProjectionFileSourceVectorized() { conf.put(SQLConf.PARQUET_VECTORIZED_READER_ENABLED().key(), "true"); conf.put(SQLConf.FILES_OPEN_COST_IN_BYTES().key(), Integer.toString(128 * 1024 * 1024)); conf.put(SQLConf.NESTED_SCHEMA_PRUNING_ENABLED().key(), "true"); - withSQLConf(conf, () -> { - Dataset df = spark().read().parquet(dataLocation()).selectExpr("nested.col3"); - materialize(df); - }); + withSQLConf( + conf, + () -> { + Dataset df = spark().read().parquet(dataLocation()).selectExpr("nested.col3"); + materialize(df); + }); } @Benchmark @@ -132,22 +141,25 @@ public void readWithProjectionFileSourceNonVectorized() { conf.put(SQLConf.PARQUET_VECTORIZED_READER_ENABLED().key(), "false"); conf.put(SQLConf.FILES_OPEN_COST_IN_BYTES().key(), Integer.toString(128 * 1024 * 1024)); conf.put(SQLConf.NESTED_SCHEMA_PRUNING_ENABLED().key(), "true"); - withSQLConf(conf, () -> { - Dataset df = spark().read().parquet(dataLocation()).selectExpr("nested.col3"); - materialize(df); - }); + withSQLConf( 
+ conf, + () -> { + Dataset df = spark().read().parquet(dataLocation()).selectExpr("nested.col3"); + materialize(df); + }); } private void appendData() { for (int fileNum = 0; fileNum < NUM_FILES; fileNum++) { - Dataset df = spark().range(NUM_ROWS) - .withColumn( - "nested", - struct( - expr("CAST(id AS string) AS col1"), - expr("CAST(id AS double) AS col2"), - lit(fileNum).cast("long").as("col3") - )); + Dataset df = + spark() + .range(NUM_ROWS) + .withColumn( + "nested", + struct( + expr("CAST(id AS string) AS col1"), + expr("CAST(id AS double) AS col2"), + lit(fileNum).cast("long").as("col3"))); appendAsFile(df); } } diff --git a/spark/v3.0/spark/src/jmh/java/org/apache/iceberg/spark/source/parquet/IcebergSourceNestedParquetDataWriteBenchmark.java b/spark/v3.0/spark/src/jmh/java/org/apache/iceberg/spark/source/parquet/IcebergSourceNestedParquetDataWriteBenchmark.java index f15575ddbcd6..911ea62e2dd4 100644 --- a/spark/v3.0/spark/src/jmh/java/org/apache/iceberg/spark/source/parquet/IcebergSourceNestedParquetDataWriteBenchmark.java +++ b/spark/v3.0/spark/src/jmh/java/org/apache/iceberg/spark/source/parquet/IcebergSourceNestedParquetDataWriteBenchmark.java @@ -16,9 +16,11 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.source.parquet; +import static org.apache.spark.sql.functions.expr; +import static org.apache.spark.sql.functions.struct; + import java.io.IOException; import java.util.Map; import org.apache.iceberg.relocated.com.google.common.collect.Maps; @@ -32,15 +34,11 @@ import org.openjdk.jmh.annotations.TearDown; import org.openjdk.jmh.annotations.Threads; -import static org.apache.spark.sql.functions.expr; -import static org.apache.spark.sql.functions.struct; - /** - * A benchmark that evaluates the performance of writing nested Parquet data using Iceberg - * and the built-in file source in Spark. + * A benchmark that evaluates the performance of writing nested Parquet data using Iceberg and the + * built-in file source in Spark. * - * To run this benchmark for either spark-2 or spark-3: - * + *
<p>
    To run this benchmark for either spark-2 or spark-3: * ./gradlew :iceberg-spark:iceberg-spark[2|3]:jmh * -PjmhIncludeRegex=IcebergSourceNestedParquetDataWriteBenchmark * -PjmhOutputPath=benchmark/iceberg-source-nested-parquet-data-write-benchmark-result.txt @@ -77,14 +75,14 @@ public void writeFileSource() { } private Dataset benchmarkData() { - return spark().range(NUM_ROWS) + return spark() + .range(NUM_ROWS) .withColumn( "nested", struct( expr("CAST(id AS string) AS col1"), expr("CAST(id AS double) AS col2"), - expr("id AS col3") - )) + expr("id AS col3"))) .coalesce(1); } } diff --git a/spark/v3.0/spark/src/jmh/java/org/apache/iceberg/spark/source/parquet/ParquetWritersBenchmark.java b/spark/v3.0/spark/src/jmh/java/org/apache/iceberg/spark/source/parquet/ParquetWritersBenchmark.java index 916564140b49..6d320604a50e 100644 --- a/spark/v3.0/spark/src/jmh/java/org/apache/iceberg/spark/source/parquet/ParquetWritersBenchmark.java +++ b/spark/v3.0/spark/src/jmh/java/org/apache/iceberg/spark/source/parquet/ParquetWritersBenchmark.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.source.parquet; import org.apache.iceberg.FileFormat; @@ -25,8 +24,7 @@ /** * A benchmark that evaluates the performance of various Iceberg writers for Parquet data. * - * To run this benchmark for either spark-2 or spark-3: - * + *
<p>
    To run this benchmark for either spark-2 or spark-3: * ./gradlew :iceberg-spark:iceberg-spark[2|3]:jmh * -PjmhIncludeRegex=ParquetWritersBenchmark * -PjmhOutputPath=benchmark/parquet-writers-benchmark-result.txt diff --git a/spark/v3.0/spark/src/jmh/java/org/apache/iceberg/spark/source/parquet/vectorized/VectorizedReadDictionaryEncodedFlatParquetDataBenchmark.java b/spark/v3.0/spark/src/jmh/java/org/apache/iceberg/spark/source/parquet/vectorized/VectorizedReadDictionaryEncodedFlatParquetDataBenchmark.java index d3ecf5664d05..275af23881e0 100644 --- a/spark/v3.0/spark/src/jmh/java/org/apache/iceberg/spark/source/parquet/vectorized/VectorizedReadDictionaryEncodedFlatParquetDataBenchmark.java +++ b/spark/v3.0/spark/src/jmh/java/org/apache/iceberg/spark/source/parquet/vectorized/VectorizedReadDictionaryEncodedFlatParquetDataBenchmark.java @@ -16,9 +16,15 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.source.parquet.vectorized; +import static org.apache.spark.sql.functions.col; +import static org.apache.spark.sql.functions.date_add; +import static org.apache.spark.sql.functions.lit; +import static org.apache.spark.sql.functions.pmod; +import static org.apache.spark.sql.functions.to_date; +import static org.apache.spark.sql.functions.to_timestamp; + import java.math.BigDecimal; import java.math.BigInteger; import java.util.Map; @@ -32,32 +38,26 @@ import org.apache.spark.sql.types.DataTypes; import org.openjdk.jmh.annotations.Setup; -import static org.apache.spark.sql.functions.col; -import static org.apache.spark.sql.functions.date_add; -import static org.apache.spark.sql.functions.lit; -import static org.apache.spark.sql.functions.pmod; -import static org.apache.spark.sql.functions.to_date; -import static org.apache.spark.sql.functions.to_timestamp; - /** - * Benchmark to compare performance of reading Parquet dictionary encoded data with a flat schema using vectorized - * Iceberg read path and the built-in file source in Spark. - *
<p>
    - * To run this benchmark for either spark-2 or spark-3: - * + * Benchmark to compare performance of reading Parquet dictionary encoded data with a flat schema + * using vectorized Iceberg read path and the built-in file source in Spark. + * + *
<p>
    To run this benchmark for either spark-2 or spark-3: * ./gradlew :iceberg-spark:iceberg-spark[2|3]:jmh * -PjmhIncludeRegex=VectorizedReadDictionaryEncodedFlatParquetDataBenchmark * -PjmhOutputPath=benchmark/results.txt * */ -public class VectorizedReadDictionaryEncodedFlatParquetDataBenchmark extends VectorizedReadFlatParquetDataBenchmark { +public class VectorizedReadDictionaryEncodedFlatParquetDataBenchmark + extends VectorizedReadFlatParquetDataBenchmark { @Setup @Override public void setupBenchmark() { setupSpark(true); appendData(); - // Allow unsafe memory access to avoid the costly check arrow does to check if index is within bounds + // Allow unsafe memory access to avoid the costly check arrow does to check if index is within + // bounds System.setProperty("arrow.enable_unsafe_memory_access", "true"); // Disable expensive null check for every get(index) call. // Iceberg manages nullability checks itself instead of relying on arrow. @@ -83,9 +83,7 @@ void appendData() { df = withTimestampColumnDictEncoded(df); df = withStringColumnDictEncoded(df); df = df.drop("id"); - df.write().format("iceberg") - .mode(SaveMode.Append) - .save(table().location()); + df.write().format("iceberg").mode(SaveMode.Append).save(table().location()); } private static Column modColumn() { @@ -106,7 +104,6 @@ private static Dataset withIntColumnDictEncoded(Dataset df) { private static Dataset withFloatColumnDictEncoded(Dataset df) { return df.withColumn("floatCol", modColumn().cast(DataTypes.FloatType)); - } private static Dataset withDoubleColumnDictEncoded(Dataset df) { @@ -125,7 +122,8 @@ private static Dataset withDateColumnDictEncoded(Dataset df) { private static Dataset withTimestampColumnDictEncoded(Dataset df) { Column days = modColumn().cast(DataTypes.ShortType); - return df.withColumn("timestampCol", to_timestamp(date_add(to_date(lit("04/12/2019"), "MM/dd/yyyy"), days))); + return df.withColumn( + "timestampCol", to_timestamp(date_add(to_date(lit("04/12/2019"), "MM/dd/yyyy"), days))); } private static Dataset withStringColumnDictEncoded(Dataset df) { diff --git a/spark/v3.0/spark/src/jmh/java/org/apache/iceberg/spark/source/parquet/vectorized/VectorizedReadFlatParquetDataBenchmark.java b/spark/v3.0/spark/src/jmh/java/org/apache/iceberg/spark/source/parquet/vectorized/VectorizedReadFlatParquetDataBenchmark.java index 39ae0bc6ccf9..efe1f3c45d57 100644 --- a/spark/v3.0/spark/src/jmh/java/org/apache/iceberg/spark/source/parquet/vectorized/VectorizedReadFlatParquetDataBenchmark.java +++ b/spark/v3.0/spark/src/jmh/java/org/apache/iceberg/spark/source/parquet/vectorized/VectorizedReadFlatParquetDataBenchmark.java @@ -16,9 +16,17 @@ * specific language governing permissions and limitations * under the License. 
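The setup hunks above rely on two Arrow JVM flags: arrow.enable_unsafe_memory_access is set directly in the diff, and the neighbouring comment refers to disabling Arrow's per-get null check. A minimal sketch of both flags together follows; the second property name, arrow.enable_null_check_for_get, is Arrow's standard switch for that check and is an assumption here, since the exact line sits outside the hunk shown.

class ArrowReadFlagsSketch {
  public static void main(String[] args) {
    // Skip Arrow's bounds check on every vector access; taken verbatim from the setup above.
    System.setProperty("arrow.enable_unsafe_memory_access", "true");
    // Skip Arrow's null check on every get(index); Iceberg manages nullability itself.
    // Property name assumed (the corresponding line is outside the hunk shown above).
    System.setProperty("arrow.enable_null_check_for_get", "false");
    // ... build the SparkSession and run the vectorized Iceberg reads after this point ...
  }
}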
*/ - package org.apache.iceberg.spark.source.parquet.vectorized; +import static org.apache.iceberg.types.Types.NestedField.optional; +import static org.apache.spark.sql.functions.col; +import static org.apache.spark.sql.functions.current_date; +import static org.apache.spark.sql.functions.date_add; +import static org.apache.spark.sql.functions.expr; +import static org.apache.spark.sql.functions.lit; +import static org.apache.spark.sql.functions.pmod; +import static org.apache.spark.sql.functions.when; + import java.io.IOException; import java.util.Map; import org.apache.hadoop.conf.Configuration; @@ -38,21 +46,11 @@ import org.openjdk.jmh.annotations.TearDown; import org.openjdk.jmh.annotations.Threads; -import static org.apache.iceberg.types.Types.NestedField.optional; -import static org.apache.spark.sql.functions.col; -import static org.apache.spark.sql.functions.current_date; -import static org.apache.spark.sql.functions.date_add; -import static org.apache.spark.sql.functions.expr; -import static org.apache.spark.sql.functions.lit; -import static org.apache.spark.sql.functions.pmod; -import static org.apache.spark.sql.functions.when; - /** - * Benchmark to compare performance of reading Parquet data with a flat schema using vectorized Iceberg read path and - * the built-in file source in Spark. - *
<p>
    - * To run this benchmark for either spark-2 or spark-3: - * + * Benchmark to compare performance of reading Parquet data with a flat schema using vectorized + * Iceberg read path and the built-in file source in Spark. + * + *
<p>
    To run this benchmark for either spark-2 or spark-3: * ./gradlew :iceberg-spark:iceberg-spark[2|3]:jmh * -PjmhIncludeRegex=VectorizedReadFlatParquetDataBenchmark * -PjmhOutputPath=benchmark/results.txt @@ -67,7 +65,8 @@ public class VectorizedReadFlatParquetDataBenchmark extends IcebergSourceBenchma public void setupBenchmark() { setupSpark(); appendData(); - // Allow unsafe memory access to avoid the costly check arrow does to check if index is within bounds + // Allow unsafe memory access to avoid the costly check arrow does to check if index is within + // bounds System.setProperty("arrow.enable_unsafe_memory_access", "true"); // Disable expensive null check for every get(index) call. // Iceberg manages nullability checks itself instead of relying on arrow. @@ -87,15 +86,16 @@ protected Configuration initHadoopConf() { @Override protected Table initTable() { - Schema schema = new Schema( - optional(1, "longCol", Types.LongType.get()), - optional(2, "intCol", Types.IntegerType.get()), - optional(3, "floatCol", Types.FloatType.get()), - optional(4, "doubleCol", Types.DoubleType.get()), - optional(5, "decimalCol", Types.DecimalType.of(20, 5)), - optional(6, "dateCol", Types.DateType.get()), - optional(7, "timestampCol", Types.TimestampType.withZone()), - optional(8, "stringCol", Types.StringType.get())); + Schema schema = + new Schema( + optional(1, "longCol", Types.LongType.get()), + optional(2, "intCol", Types.IntegerType.get()), + optional(3, "floatCol", Types.FloatType.get()), + optional(4, "doubleCol", Types.DoubleType.get()), + optional(5, "decimalCol", Types.DecimalType.of(20, 5)), + optional(6, "dateCol", Types.DateType.get()), + optional(7, "timestampCol", Types.TimestampType.withZone()), + optional(8, "stringCol", Types.StringType.get())); PartitionSpec partitionSpec = PartitionSpec.unpartitioned(); HadoopTables tables = new HadoopTables(hadoopConf()); Map properties = parquetWriteProps(); @@ -111,19 +111,20 @@ Map parquetWriteProps() { void appendData() { for (int fileNum = 1; fileNum <= NUM_FILES; fileNum++) { - Dataset df = spark().range(NUM_ROWS_PER_FILE) - .withColumn( - "longCol", - when(pmod(col("id"), lit(10)).equalTo(lit(0)), lit(null)) - .otherwise(col("id"))) - .drop("id") - .withColumn("intCol", expr("CAST(longCol AS INT)")) - .withColumn("floatCol", expr("CAST(longCol AS FLOAT)")) - .withColumn("doubleCol", expr("CAST(longCol AS DOUBLE)")) - .withColumn("decimalCol", expr("CAST(longCol AS DECIMAL(20, 5))")) - .withColumn("dateCol", date_add(current_date(), fileNum)) - .withColumn("timestampCol", expr("TO_TIMESTAMP(dateCol)")) - .withColumn("stringCol", expr("CAST(longCol AS STRING)")); + Dataset df = + spark() + .range(NUM_ROWS_PER_FILE) + .withColumn( + "longCol", + when(pmod(col("id"), lit(10)).equalTo(lit(0)), lit(null)).otherwise(col("id"))) + .drop("id") + .withColumn("intCol", expr("CAST(longCol AS INT)")) + .withColumn("floatCol", expr("CAST(longCol AS FLOAT)")) + .withColumn("doubleCol", expr("CAST(longCol AS DOUBLE)")) + .withColumn("decimalCol", expr("CAST(longCol AS DECIMAL(20, 5))")) + .withColumn("dateCol", date_add(current_date(), fileNum)) + .withColumn("timestampCol", expr("TO_TIMESTAMP(dateCol)")) + .withColumn("stringCol", expr("CAST(longCol AS STRING)")); appendAsFile(df); } } @@ -131,161 +132,189 @@ void appendData() { @Benchmark @Threads(1) public void readIntegersIcebergVectorized5k() { - withTableProperties(tablePropsWithVectorizationEnabled(5000), () -> { - String tableLocation = table().location(); - Dataset df = 
spark().read().format("iceberg") - .load(tableLocation).select("intCol"); - materialize(df); - }); + withTableProperties( + tablePropsWithVectorizationEnabled(5000), + () -> { + String tableLocation = table().location(); + Dataset df = spark().read().format("iceberg").load(tableLocation).select("intCol"); + materialize(df); + }); } @Benchmark @Threads(1) public void readIntegersSparkVectorized5k() { - withSQLConf(sparkConfWithVectorizationEnabled(5000), () -> { - Dataset df = spark().read().parquet(dataLocation()).select("intCol"); - materialize(df); - }); + withSQLConf( + sparkConfWithVectorizationEnabled(5000), + () -> { + Dataset df = spark().read().parquet(dataLocation()).select("intCol"); + materialize(df); + }); } @Benchmark @Threads(1) public void readLongsIcebergVectorized5k() { - withTableProperties(tablePropsWithVectorizationEnabled(5000), () -> { - String tableLocation = table().location(); - Dataset df = spark().read().format("iceberg") - .load(tableLocation).select("longCol"); - materialize(df); - }); + withTableProperties( + tablePropsWithVectorizationEnabled(5000), + () -> { + String tableLocation = table().location(); + Dataset df = spark().read().format("iceberg").load(tableLocation).select("longCol"); + materialize(df); + }); } @Benchmark @Threads(1) public void readLongsSparkVectorized5k() { - withSQLConf(sparkConfWithVectorizationEnabled(5000), () -> { - Dataset df = spark().read().parquet(dataLocation()).select("longCol"); - materialize(df); - }); + withSQLConf( + sparkConfWithVectorizationEnabled(5000), + () -> { + Dataset df = spark().read().parquet(dataLocation()).select("longCol"); + materialize(df); + }); } @Benchmark @Threads(1) public void readFloatsIcebergVectorized5k() { - withTableProperties(tablePropsWithVectorizationEnabled(5000), () -> { - String tableLocation = table().location(); - Dataset df = spark().read().format("iceberg") - .load(tableLocation).select("floatCol"); - materialize(df); - }); + withTableProperties( + tablePropsWithVectorizationEnabled(5000), + () -> { + String tableLocation = table().location(); + Dataset df = spark().read().format("iceberg").load(tableLocation).select("floatCol"); + materialize(df); + }); } @Benchmark @Threads(1) public void readFloatsSparkVectorized5k() { - withSQLConf(sparkConfWithVectorizationEnabled(5000), () -> { - Dataset df = spark().read().parquet(dataLocation()).select("floatCol"); - materialize(df); - }); + withSQLConf( + sparkConfWithVectorizationEnabled(5000), + () -> { + Dataset df = spark().read().parquet(dataLocation()).select("floatCol"); + materialize(df); + }); } @Benchmark @Threads(1) public void readDoublesIcebergVectorized5k() { - withTableProperties(tablePropsWithVectorizationEnabled(5000), () -> { - String tableLocation = table().location(); - Dataset df = spark().read().format("iceberg") - .load(tableLocation).select("doubleCol"); - materialize(df); - }); + withTableProperties( + tablePropsWithVectorizationEnabled(5000), + () -> { + String tableLocation = table().location(); + Dataset df = + spark().read().format("iceberg").load(tableLocation).select("doubleCol"); + materialize(df); + }); } @Benchmark @Threads(1) public void readDoublesSparkVectorized5k() { - withSQLConf(sparkConfWithVectorizationEnabled(5000), () -> { - Dataset df = spark().read().parquet(dataLocation()).select("doubleCol"); - materialize(df); - }); + withSQLConf( + sparkConfWithVectorizationEnabled(5000), + () -> { + Dataset df = spark().read().parquet(dataLocation()).select("doubleCol"); + materialize(df); + }); } @Benchmark 
@Threads(1) public void readDecimalsIcebergVectorized5k() { - withTableProperties(tablePropsWithVectorizationEnabled(5000), () -> { - String tableLocation = table().location(); - Dataset df = spark().read().format("iceberg") - .load(tableLocation).select("decimalCol"); - materialize(df); - }); + withTableProperties( + tablePropsWithVectorizationEnabled(5000), + () -> { + String tableLocation = table().location(); + Dataset df = + spark().read().format("iceberg").load(tableLocation).select("decimalCol"); + materialize(df); + }); } @Benchmark @Threads(1) public void readDecimalsSparkVectorized5k() { - withSQLConf(sparkConfWithVectorizationEnabled(5000), () -> { - Dataset df = spark().read().parquet(dataLocation()).select("decimalCol"); - materialize(df); - }); + withSQLConf( + sparkConfWithVectorizationEnabled(5000), + () -> { + Dataset df = spark().read().parquet(dataLocation()).select("decimalCol"); + materialize(df); + }); } @Benchmark @Threads(1) public void readDatesIcebergVectorized5k() { - withTableProperties(tablePropsWithVectorizationEnabled(5000), () -> { - String tableLocation = table().location(); - Dataset df = spark().read().format("iceberg") - .load(tableLocation).select("dateCol"); - materialize(df); - }); + withTableProperties( + tablePropsWithVectorizationEnabled(5000), + () -> { + String tableLocation = table().location(); + Dataset df = spark().read().format("iceberg").load(tableLocation).select("dateCol"); + materialize(df); + }); } @Benchmark @Threads(1) public void readDatesSparkVectorized5k() { - withSQLConf(sparkConfWithVectorizationEnabled(5000), () -> { - Dataset df = spark().read().parquet(dataLocation()).select("dateCol"); - materialize(df); - }); + withSQLConf( + sparkConfWithVectorizationEnabled(5000), + () -> { + Dataset df = spark().read().parquet(dataLocation()).select("dateCol"); + materialize(df); + }); } @Benchmark @Threads(1) public void readTimestampsIcebergVectorized5k() { - withTableProperties(tablePropsWithVectorizationEnabled(5000), () -> { - String tableLocation = table().location(); - Dataset df = spark().read().format("iceberg") - .load(tableLocation).select("timestampCol"); - materialize(df); - }); + withTableProperties( + tablePropsWithVectorizationEnabled(5000), + () -> { + String tableLocation = table().location(); + Dataset df = + spark().read().format("iceberg").load(tableLocation).select("timestampCol"); + materialize(df); + }); } @Benchmark @Threads(1) public void readTimestampsSparkVectorized5k() { - withSQLConf(sparkConfWithVectorizationEnabled(5000), () -> { - Dataset df = spark().read().parquet(dataLocation()).select("timestampCol"); - materialize(df); - }); + withSQLConf( + sparkConfWithVectorizationEnabled(5000), + () -> { + Dataset df = spark().read().parquet(dataLocation()).select("timestampCol"); + materialize(df); + }); } @Benchmark @Threads(1) public void readStringsIcebergVectorized5k() { - withTableProperties(tablePropsWithVectorizationEnabled(5000), () -> { - String tableLocation = table().location(); - Dataset df = spark().read().format("iceberg") - .load(tableLocation).select("stringCol"); - materialize(df); - }); + withTableProperties( + tablePropsWithVectorizationEnabled(5000), + () -> { + String tableLocation = table().location(); + Dataset df = + spark().read().format("iceberg").load(tableLocation).select("stringCol"); + materialize(df); + }); } @Benchmark @Threads(1) public void readStringsSparkVectorized5k() { - withSQLConf(sparkConfWithVectorizationEnabled(5000), () -> { - Dataset df = 
spark().read().parquet(dataLocation()).select("stringCol"); - materialize(df); - }); + withSQLConf( + sparkConfWithVectorizationEnabled(5000), + () -> { + Dataset df = spark().read().parquet(dataLocation()).select("stringCol"); + materialize(df); + }); } private static Map tablePropsWithVectorizationEnabled(int batchSize) { diff --git a/spark/v3.0/spark/src/main/java/org/apache/iceberg/spark/BaseCatalog.java b/spark/v3.0/spark/src/main/java/org/apache/iceberg/spark/BaseCatalog.java index 2942a52a135e..d04f82d339f7 100644 --- a/spark/v3.0/spark/src/main/java/org/apache/iceberg/spark/BaseCatalog.java +++ b/spark/v3.0/spark/src/main/java/org/apache/iceberg/spark/BaseCatalog.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark; import org.apache.iceberg.spark.procedures.SparkProcedures; @@ -35,7 +34,8 @@ public Procedure loadProcedure(Identifier ident) throws NoSuchProcedureException String[] namespace = ident.namespace(); String name = ident.name(); - // namespace resolution is case insensitive until we have a way to configure case sensitivity in catalogs + // namespace resolution is case insensitive until we have a way to configure case sensitivity in + // catalogs if (namespace.length == 1 && namespace[0].equalsIgnoreCase("system")) { ProcedureBuilder builder = SparkProcedures.newBuilder(name); if (builder != null) { diff --git a/spark/v3.0/spark/src/main/java/org/apache/iceberg/spark/CommitMetadata.java b/spark/v3.0/spark/src/main/java/org/apache/iceberg/spark/CommitMetadata.java index 58137250003a..641b957d1176 100644 --- a/spark/v3.0/spark/src/main/java/org/apache/iceberg/spark/CommitMetadata.java +++ b/spark/v3.0/spark/src/main/java/org/apache/iceberg/spark/CommitMetadata.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
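The BaseCatalog.loadProcedure hunk above resolves the system namespace case-insensitively and builds the procedure through SparkProcedures.newBuilder(name). A minimal sketch of how such a procedure is reached from SQL; it assumes a session configured with Iceberg's SQL extensions and a catalog named my_catalog, and rollback_to_snapshot stands in for any procedure registered in SparkProcedures.

import org.apache.spark.sql.SparkSession;

class CallProcedureSketch {
  public static void main(String[] args) {
    // Assumes spark.sql.extensions and a catalog named my_catalog are configured for Iceberg.
    SparkSession spark = SparkSession.builder().master("local").getOrCreate();
    // "system" is matched case-insensitively by loadProcedure, so SYSTEM would resolve as well.
    spark.sql("CALL my_catalog.system.rollback_to_snapshot('db.tbl', 123456789)");
  }
}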
*/ - package org.apache.iceberg.spark; import java.util.Map; @@ -24,20 +23,18 @@ import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap; import org.apache.iceberg.util.ExceptionUtil; -/** - * utility class to accept thread local commit properties - */ +/** utility class to accept thread local commit properties */ public class CommitMetadata { - private CommitMetadata() { - - } + private CommitMetadata() {} - private static final ThreadLocal> COMMIT_PROPERTIES = ThreadLocal.withInitial(ImmutableMap::of); + private static final ThreadLocal> COMMIT_PROPERTIES = + ThreadLocal.withInitial(ImmutableMap::of); /** - * running the code wrapped as a caller, and any snapshot committed within the callable object will be attached with - * the metadata defined in properties + * running the code wrapped as a caller, and any snapshot committed within the callable object + * will be attached with the metadata defined in properties + * * @param properties extra commit metadata to attach to the snapshot committed within callable * @param callable the code to be executed * @param exClass the expected type of exception which would be thrown from callable diff --git a/spark/v3.0/spark/src/main/java/org/apache/iceberg/spark/FileRewriteCoordinator.java b/spark/v3.0/spark/src/main/java/org/apache/iceberg/spark/FileRewriteCoordinator.java index acd5f64d7ed6..210e861a4c16 100644 --- a/spark/v3.0/spark/src/main/java/org/apache/iceberg/spark/FileRewriteCoordinator.java +++ b/spark/v3.0/spark/src/main/java/org/apache/iceberg/spark/FileRewriteCoordinator.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark; import java.util.Map; @@ -39,22 +38,26 @@ public class FileRewriteCoordinator { private final Map, Set> resultMap = Maps.newConcurrentMap(); - private FileRewriteCoordinator() { - } + private FileRewriteCoordinator() {} public static FileRewriteCoordinator get() { return INSTANCE; } /** - * Called to persist the output of a rewrite action for a specific group. Since the write is done via a - * Spark Datasource, we have to propagate the result through this side-effect call. + * Called to persist the output of a rewrite action for a specific group. Since the write is done + * via a Spark Datasource, we have to propagate the result through this side-effect call. 
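The CommitMetadata Javadoc above describes wrapping a callable so that any snapshot committed inside it carries extra commit properties. A small usage sketch follows; the wrapper's name, withCommitProperties, is not visible in this hunk and is used here as an assumption, and the table variable and property values are placeholders.

import java.util.Map;
import org.apache.iceberg.Table;
import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap;
import org.apache.iceberg.spark.CommitMetadata;

class CommitMetadataSketch {
  static void appendWithMetadata(Table table) {
    // Properties attached to whatever snapshot the callable commits.
    Map<String, String> props = ImmutableMap.of("writer", "nightly-compaction");
    CommitMetadata.withCommitProperties(   // method name assumed, not shown in this hunk
        props,
        () -> {
          table.newAppend().commit(); // placeholder commit
          return null;
        },
        RuntimeException.class);
  }
}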
+ * * @param table table where the rewrite is occurring * @param fileSetID the id used to identify the source set of files being rewritten * @param newDataFiles the new files which have been written */ public void stageRewrite(Table table, String fileSetID, Set newDataFiles) { - LOG.debug("Staging the output for {} - fileset {} with {} files", table.name(), fileSetID, newDataFiles.size()); + LOG.debug( + "Staging the output for {} - fileset {} with {} files", + table.name(), + fileSetID, + newDataFiles.size()); Pair id = toID(table, fileSetID); resultMap.put(id, newDataFiles); } @@ -62,9 +65,8 @@ public void stageRewrite(Table table, String fileSetID, Set newDataFil public Set fetchNewDataFiles(Table table, String fileSetID) { Pair id = toID(table, fileSetID); Set result = resultMap.get(id); - ValidationException.check(result != null, - "No results for rewrite of file set %s in table %s", - fileSetID, table); + ValidationException.check( + result != null, "No results for rewrite of file set %s in table %s", fileSetID, table); return result; } diff --git a/spark/v3.0/spark/src/main/java/org/apache/iceberg/spark/FileScanTaskSetManager.java b/spark/v3.0/spark/src/main/java/org/apache/iceberg/spark/FileScanTaskSetManager.java index 827b674ca16d..4b6da39905c1 100644 --- a/spark/v3.0/spark/src/main/java/org/apache/iceberg/spark/FileScanTaskSetManager.java +++ b/spark/v3.0/spark/src/main/java/org/apache/iceberg/spark/FileScanTaskSetManager.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark; import java.util.List; @@ -37,15 +36,15 @@ public class FileScanTaskSetManager { private final Map, List> tasksMap = Maps.newConcurrentMap(); - private FileScanTaskSetManager() { - } + private FileScanTaskSetManager() {} public static FileScanTaskSetManager get() { return INSTANCE; } public void stageTasks(Table table, String setID, List tasks) { - Preconditions.checkArgument(tasks != null && tasks.size() > 0, "Cannot stage null or empty tasks"); + Preconditions.checkArgument( + tasks != null && tasks.size() > 0, "Cannot stage null or empty tasks"); Pair id = toID(table, setID); tasksMap.put(id, tasks); } diff --git a/spark/v3.0/spark/src/main/java/org/apache/iceberg/spark/IcebergSpark.java b/spark/v3.0/spark/src/main/java/org/apache/iceberg/spark/IcebergSpark.java index 862626d0cd6d..87de0a98b934 100644 --- a/spark/v3.0/spark/src/main/java/org/apache/iceberg/spark/IcebergSpark.java +++ b/spark/v3.0/spark/src/main/java/org/apache/iceberg/spark/IcebergSpark.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
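The FileScanTaskSetManager and FileRewriteCoordinator hunks above hand results around by side effect: scan tasks are staged under a file-set id before the rewrite job runs, and the rewritten files are fetched back afterwards for the commit. A minimal sketch of that flow using only the methods shown above; the table, id and task list are placeholders and the Spark job itself is elided.

import java.util.List;
import java.util.Set;
import org.apache.iceberg.DataFile;
import org.apache.iceberg.FileScanTask;
import org.apache.iceberg.Table;
import org.apache.iceberg.spark.FileRewriteCoordinator;
import org.apache.iceberg.spark.FileScanTaskSetManager;

class RewriteStagingSketch {
  static Set<DataFile> rewriteFileSet(Table table, String fileSetID, List<FileScanTask> tasks) {
    // 1. Make the source tasks visible to the Spark read that backs the rewrite.
    FileScanTaskSetManager.get().stageTasks(table, fileSetID, tasks);

    // 2. ... run the Spark rewrite job here; on the write side the data source calls
    //    FileRewriteCoordinator.get().stageRewrite(table, fileSetID, newDataFiles) ...

    // 3. Collect the rewritten files so the caller can commit them.
    return FileRewriteCoordinator.get().fetchNewDataFiles(table, fileSetID);
  }
}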
*/ - package org.apache.iceberg.spark; import org.apache.iceberg.transforms.Transform; @@ -27,14 +26,18 @@ import org.apache.spark.sql.types.DataTypes; public class IcebergSpark { - private IcebergSpark() { - } + private IcebergSpark() {} - public static void registerBucketUDF(SparkSession session, String funcName, DataType sourceType, int numBuckets) { + public static void registerBucketUDF( + SparkSession session, String funcName, DataType sourceType, int numBuckets) { SparkTypeToType typeConverter = new SparkTypeToType(); Type sourceIcebergType = typeConverter.atomic(sourceType); Transform bucket = Transforms.bucket(sourceIcebergType, numBuckets); - session.udf().register(funcName, - value -> bucket.apply(SparkValueConverter.convert(sourceIcebergType, value)), DataTypes.IntegerType); + session + .udf() + .register( + funcName, + value -> bucket.apply(SparkValueConverter.convert(sourceIcebergType, value)), + DataTypes.IntegerType); } } diff --git a/spark/v3.0/spark/src/main/java/org/apache/iceberg/spark/JobGroupInfo.java b/spark/v3.0/spark/src/main/java/org/apache/iceberg/spark/JobGroupInfo.java index a35808fd8ce6..c0756d924e2f 100644 --- a/spark/v3.0/spark/src/main/java/org/apache/iceberg/spark/JobGroupInfo.java +++ b/spark/v3.0/spark/src/main/java/org/apache/iceberg/spark/JobGroupInfo.java @@ -16,13 +16,9 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark; -/** - * Captures information about the current job - * which is used for displaying on the UI - */ +/** Captures information about the current job which is used for displaying on the UI */ public class JobGroupInfo { private String groupId; private String description; diff --git a/spark/v3.0/spark/src/main/java/org/apache/iceberg/spark/JobGroupUtils.java b/spark/v3.0/spark/src/main/java/org/apache/iceberg/spark/JobGroupUtils.java index 155dce707701..dc8ba69d40a8 100644 --- a/spark/v3.0/spark/src/main/java/org/apache/iceberg/spark/JobGroupUtils.java +++ b/spark/v3.0/spark/src/main/java/org/apache/iceberg/spark/JobGroupUtils.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
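IcebergSpark.registerBucketUDF above registers a Spark UDF that applies Iceberg's bucket transform to values of a given Spark type. A small usage sketch; the session setup and the function name iceberg_bucket16 are illustrative.

import org.apache.iceberg.spark.IcebergSpark;
import org.apache.spark.sql.SparkSession;
import org.apache.spark.sql.types.DataTypes;

class RegisterBucketUdfSketch {
  public static void main(String[] args) {
    SparkSession spark = SparkSession.builder().master("local").getOrCreate();
    // Registers a UDF that returns bucket(16) of a LONG value, matching an Iceberg partition spec.
    IcebergSpark.registerBucketUDF(spark, "iceberg_bucket16", DataTypes.LongType, 16);
    spark.sql("SELECT id, iceberg_bucket16(id) AS bucket FROM range(10)").show();
  }
}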
*/ - package org.apache.iceberg.spark; import org.apache.spark.SparkContext; @@ -26,10 +25,10 @@ public class JobGroupUtils { private static final String JOB_GROUP_ID = SparkContext$.MODULE$.SPARK_JOB_GROUP_ID(); private static final String JOB_GROUP_DESC = SparkContext$.MODULE$.SPARK_JOB_DESCRIPTION(); - private static final String JOB_INTERRUPT_ON_CANCEL = SparkContext$.MODULE$.SPARK_JOB_INTERRUPT_ON_CANCEL(); + private static final String JOB_INTERRUPT_ON_CANCEL = + SparkContext$.MODULE$.SPARK_JOB_INTERRUPT_ON_CANCEL(); - private JobGroupUtils() { - } + private JobGroupUtils() {} public static JobGroupInfo getJobGroupInfo(SparkContext sparkContext) { String groupId = sparkContext.getLocalProperty(JOB_GROUP_ID); @@ -41,6 +40,7 @@ public static JobGroupInfo getJobGroupInfo(SparkContext sparkContext) { public static void setJobGroupInfo(SparkContext sparkContext, JobGroupInfo info) { sparkContext.setLocalProperty(JOB_GROUP_ID, info.groupId()); sparkContext.setLocalProperty(JOB_GROUP_DESC, info.description()); - sparkContext.setLocalProperty(JOB_INTERRUPT_ON_CANCEL, String.valueOf(info.interruptOnCancel())); + sparkContext.setLocalProperty( + JOB_INTERRUPT_ON_CANCEL, String.valueOf(info.interruptOnCancel())); } } diff --git a/spark/v3.0/spark/src/main/java/org/apache/iceberg/spark/OrderField.java b/spark/v3.0/spark/src/main/java/org/apache/iceberg/spark/OrderField.java index 0f24ee38fceb..387f0f348c36 100644 --- a/spark/v3.0/spark/src/main/java/org/apache/iceberg/spark/OrderField.java +++ b/spark/v3.0/spark/src/main/java/org/apache/iceberg/spark/OrderField.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark; import org.apache.iceberg.NullOrder; @@ -27,40 +26,55 @@ import org.apache.spark.sql.connector.iceberg.expressions.SortOrder; class OrderField implements SortOrder { - static OrderField column(String fieldName, org.apache.iceberg.SortDirection direction, NullOrder nullOrder) { + static OrderField column( + String fieldName, org.apache.iceberg.SortDirection direction, NullOrder nullOrder) { return new OrderField(Expressions.column(fieldName), toSpark(direction), toSpark(nullOrder)); } - static OrderField bucket(String fieldName, int numBuckets, org.apache.iceberg.SortDirection direction, - NullOrder nullOrder) { - return new OrderField(Expressions.bucket(numBuckets, fieldName), toSpark(direction), toSpark(nullOrder)); + static OrderField bucket( + String fieldName, + int numBuckets, + org.apache.iceberg.SortDirection direction, + NullOrder nullOrder) { + return new OrderField( + Expressions.bucket(numBuckets, fieldName), toSpark(direction), toSpark(nullOrder)); } - static OrderField truncate(String fieldName, int width, org.apache.iceberg.SortDirection direction, - NullOrder nullOrder) { - return new OrderField(Expressions.apply( - "truncate", Expressions.column(fieldName), Expressions.literal(width)), - toSpark(direction), toSpark(nullOrder)); + static OrderField truncate( + String fieldName, + int width, + org.apache.iceberg.SortDirection direction, + NullOrder nullOrder) { + return new OrderField( + Expressions.apply("truncate", Expressions.column(fieldName), Expressions.literal(width)), + toSpark(direction), + toSpark(nullOrder)); } - static OrderField year(String fieldName, org.apache.iceberg.SortDirection direction, NullOrder nullOrder) { + static OrderField year( + String fieldName, org.apache.iceberg.SortDirection direction, NullOrder nullOrder) { return new 
OrderField(Expressions.years(fieldName), toSpark(direction), toSpark(nullOrder)); } - static OrderField month(String fieldName, org.apache.iceberg.SortDirection direction, NullOrder nullOrder) { + static OrderField month( + String fieldName, org.apache.iceberg.SortDirection direction, NullOrder nullOrder) { return new OrderField(Expressions.months(fieldName), toSpark(direction), toSpark(nullOrder)); } - static OrderField day(String fieldName, org.apache.iceberg.SortDirection direction, NullOrder nullOrder) { + static OrderField day( + String fieldName, org.apache.iceberg.SortDirection direction, NullOrder nullOrder) { return new OrderField(Expressions.days(fieldName), toSpark(direction), toSpark(nullOrder)); } - static OrderField hour(String fieldName, org.apache.iceberg.SortDirection direction, NullOrder nullOrder) { + static OrderField hour( + String fieldName, org.apache.iceberg.SortDirection direction, NullOrder nullOrder) { return new OrderField(Expressions.hours(fieldName), toSpark(direction), toSpark(nullOrder)); } private static SortDirection toSpark(org.apache.iceberg.SortDirection direction) { - return direction == org.apache.iceberg.SortDirection.ASC ? SortDirection.ASCENDING : SortDirection.DESCENDING; + return direction == org.apache.iceberg.SortDirection.ASC + ? SortDirection.ASCENDING + : SortDirection.DESCENDING; } private static NullOrdering toSpark(NullOrder nullOrder) { @@ -94,9 +108,10 @@ public NullOrdering nullOrdering() { @Override public String describe() { - return String.format("%s %s %s", - expr.describe(), - direction == SortDirection.ASCENDING ? "ASC" : "DESC", - nullOrder == NullOrdering.NULLS_FIRST ? "NULLS FIRST" : "NULLS LAST"); + return String.format( + "%s %s %s", + expr.describe(), + direction == SortDirection.ASCENDING ? "ASC" : "DESC", + nullOrder == NullOrdering.NULLS_FIRST ? "NULLS FIRST" : "NULLS LAST"); } } diff --git a/spark/v3.0/spark/src/main/java/org/apache/iceberg/spark/PathIdentifier.java b/spark/v3.0/spark/src/main/java/org/apache/iceberg/spark/PathIdentifier.java index 235097ea46cc..110af6b87de5 100644 --- a/spark/v3.0/spark/src/main/java/org/apache/iceberg/spark/PathIdentifier.java +++ b/spark/v3.0/spark/src/main/java/org/apache/iceberg/spark/PathIdentifier.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark; import java.util.List; @@ -36,9 +35,10 @@ public PathIdentifier(String location) { this.location = location; List pathParts = SPLIT.splitToList(location); name = Iterables.getLast(pathParts); - namespace = pathParts.size() > 1 ? - new String[]{JOIN.join(pathParts.subList(0, pathParts.size() - 1))} : - new String[0]; + namespace = + pathParts.size() > 1 + ? new String[] {JOIN.join(pathParts.subList(0, pathParts.size() - 1))} + : new String[0]; } @Override diff --git a/spark/v3.0/spark/src/main/java/org/apache/iceberg/spark/PruneColumnsWithReordering.java b/spark/v3.0/spark/src/main/java/org/apache/iceberg/spark/PruneColumnsWithReordering.java index 3bdf984ed219..3c111d3b44cb 100644 --- a/spark/v3.0/spark/src/main/java/org/apache/iceberg/spark/PruneColumnsWithReordering.java +++ b/spark/v3.0/spark/src/main/java/org/apache/iceberg/spark/PruneColumnsWithReordering.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.spark; import java.util.List; @@ -70,7 +69,8 @@ public Type schema(Schema schema, Supplier structResult) { @Override public Type struct(Types.StructType struct, Iterable fieldResults) { - Preconditions.checkNotNull(struct, "Cannot prune null struct. Pruning must start with a schema."); + Preconditions.checkNotNull( + struct, "Cannot prune null struct. Pruning must start with a schema."); Preconditions.checkArgument(current instanceof StructType, "Not a struct: %s", current); StructType requestedStruct = (StructType) current; @@ -92,13 +92,13 @@ public Type struct(Types.StructType struct, Iterable fieldResults) { } else if (field.isOptional()) { changed = true; - projectedFields.put(field.name(), - Types.NestedField.optional(field.fieldId(), field.name(), type)); + projectedFields.put( + field.name(), Types.NestedField.optional(field.fieldId(), field.name(), type)); } else { changed = true; - projectedFields.put(field.name(), - Types.NestedField.required(field.fieldId(), field.name(), type)); + projectedFields.put( + field.name(), Types.NestedField.required(field.fieldId(), field.name(), type)); } } @@ -145,8 +145,10 @@ public Type field(Types.NestedField field, Supplier fieldResult) { int fieldIndex = requestedStruct.fieldIndex(field.name()); StructField requestedField = requestedStruct.fields()[fieldIndex]; - Preconditions.checkArgument(requestedField.nullable() || field.isRequired(), - "Cannot project an optional field as non-null: %s", field.name()); + Preconditions.checkArgument( + requestedField.nullable() || field.isRequired(), + "Cannot project an optional field as non-null: %s", + field.name()); this.current = requestedField.dataType(); try { @@ -164,8 +166,10 @@ public Type list(Types.ListType list, Supplier elementResult) { Preconditions.checkArgument(current instanceof ArrayType, "Not an array: %s", current); ArrayType requestedArray = (ArrayType) current; - Preconditions.checkArgument(requestedArray.containsNull() || !list.isElementOptional(), - "Cannot project an array of optional elements as required elements: %s", requestedArray); + Preconditions.checkArgument( + requestedArray.containsNull() || !list.isElementOptional(), + "Cannot project an array of optional elements as required elements: %s", + requestedArray); this.current = requestedArray.elementType(); try { @@ -190,10 +194,14 @@ public Type map(Types.MapType map, Supplier keyResult, Supplier valu Preconditions.checkArgument(current instanceof MapType, "Not a map: %s", current); MapType requestedMap = (MapType) current; - Preconditions.checkArgument(requestedMap.valueContainsNull() || !map.isValueOptional(), - "Cannot project a map of optional values as required values: %s", map); - Preconditions.checkArgument(StringType.class.isInstance(requestedMap.keyType()), - "Invalid map key type (not string): %s", requestedMap.keyType()); + Preconditions.checkArgument( + requestedMap.valueContainsNull() || !map.isValueOptional(), + "Cannot project a map of optional values as required values: %s", + map); + Preconditions.checkArgument( + StringType.class.isInstance(requestedMap.keyType()), + "Invalid map key type (not string): %s", + requestedMap.keyType()); this.current = requestedMap.valueType(); try { @@ -215,23 +223,32 @@ public Type map(Types.MapType map, Supplier keyResult, Supplier valu @Override public Type primitive(Type.PrimitiveType primitive) { Class expectedType = TYPES.get(primitive.typeId()); - Preconditions.checkArgument(expectedType != null && expectedType.isInstance(current), - 
"Cannot project %s to incompatible type: %s", primitive, current); + Preconditions.checkArgument( + expectedType != null && expectedType.isInstance(current), + "Cannot project %s to incompatible type: %s", + primitive, + current); // additional checks based on type switch (primitive.typeId()) { case DECIMAL: Types.DecimalType decimal = (Types.DecimalType) primitive; DecimalType requestedDecimal = (DecimalType) current; - Preconditions.checkArgument(requestedDecimal.scale() == decimal.scale(), - "Cannot project decimal with incompatible scale: %s != %s", requestedDecimal.scale(), decimal.scale()); - Preconditions.checkArgument(requestedDecimal.precision() >= decimal.precision(), + Preconditions.checkArgument( + requestedDecimal.scale() == decimal.scale(), + "Cannot project decimal with incompatible scale: %s != %s", + requestedDecimal.scale(), + decimal.scale()); + Preconditions.checkArgument( + requestedDecimal.precision() >= decimal.precision(), "Cannot project decimal with incompatible precision: %s < %s", - requestedDecimal.precision(), decimal.precision()); + requestedDecimal.precision(), + decimal.precision()); break; case TIMESTAMP: Types.TimestampType timestamp = (Types.TimestampType) primitive; - Preconditions.checkArgument(timestamp.shouldAdjustToUTC(), + Preconditions.checkArgument( + timestamp.shouldAdjustToUTC(), "Cannot project timestamp (without time zone) as timestamptz (with time zone)"); break; default: @@ -240,19 +257,19 @@ public Type primitive(Type.PrimitiveType primitive) { return primitive; } - private static final ImmutableMap> TYPES = ImmutableMap - .>builder() - .put(TypeID.BOOLEAN, BooleanType.class) - .put(TypeID.INTEGER, IntegerType.class) - .put(TypeID.LONG, LongType.class) - .put(TypeID.FLOAT, FloatType.class) - .put(TypeID.DOUBLE, DoubleType.class) - .put(TypeID.DATE, DateType.class) - .put(TypeID.TIMESTAMP, TimestampType.class) - .put(TypeID.DECIMAL, DecimalType.class) - .put(TypeID.UUID, StringType.class) - .put(TypeID.STRING, StringType.class) - .put(TypeID.FIXED, BinaryType.class) - .put(TypeID.BINARY, BinaryType.class) - .build(); + private static final ImmutableMap> TYPES = + ImmutableMap.>builder() + .put(TypeID.BOOLEAN, BooleanType.class) + .put(TypeID.INTEGER, IntegerType.class) + .put(TypeID.LONG, LongType.class) + .put(TypeID.FLOAT, FloatType.class) + .put(TypeID.DOUBLE, DoubleType.class) + .put(TypeID.DATE, DateType.class) + .put(TypeID.TIMESTAMP, TimestampType.class) + .put(TypeID.DECIMAL, DecimalType.class) + .put(TypeID.UUID, StringType.class) + .put(TypeID.STRING, StringType.class) + .put(TypeID.FIXED, BinaryType.class) + .put(TypeID.BINARY, BinaryType.class) + .build(); } diff --git a/spark/v3.0/spark/src/main/java/org/apache/iceberg/spark/PruneColumnsWithoutReordering.java b/spark/v3.0/spark/src/main/java/org/apache/iceberg/spark/PruneColumnsWithoutReordering.java index c6984e2fe8cd..61a215b938c5 100644 --- a/spark/v3.0/spark/src/main/java/org/apache/iceberg/spark/PruneColumnsWithoutReordering.java +++ b/spark/v3.0/spark/src/main/java/org/apache/iceberg/spark/PruneColumnsWithoutReordering.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark; import java.util.List; @@ -68,7 +67,8 @@ public Type schema(Schema schema, Supplier structResult) { @Override public Type struct(Types.StructType struct, Iterable fieldResults) { - Preconditions.checkNotNull(struct, "Cannot prune null struct. 
Pruning must start with a schema."); + Preconditions.checkNotNull( + struct, "Cannot prune null struct. Pruning must start with a schema."); Preconditions.checkArgument(current instanceof StructType, "Not a struct: %s", current); List fields = struct.fields(); @@ -120,8 +120,10 @@ public Type field(Types.NestedField field, Supplier fieldResult) { int fieldIndex = requestedStruct.fieldIndex(field.name()); StructField requestedField = requestedStruct.fields()[fieldIndex]; - Preconditions.checkArgument(requestedField.nullable() || field.isRequired(), - "Cannot project an optional field as non-null: %s", field.name()); + Preconditions.checkArgument( + requestedField.nullable() || field.isRequired(), + "Cannot project an optional field as non-null: %s", + field.name()); this.current = requestedField.dataType(); try { @@ -139,8 +141,10 @@ public Type list(Types.ListType list, Supplier elementResult) { Preconditions.checkArgument(current instanceof ArrayType, "Not an array: %s", current); ArrayType requestedArray = (ArrayType) current; - Preconditions.checkArgument(requestedArray.containsNull() || !list.isElementOptional(), - "Cannot project an array of optional elements as required elements: %s", requestedArray); + Preconditions.checkArgument( + requestedArray.containsNull() || !list.isElementOptional(), + "Cannot project an array of optional elements as required elements: %s", + requestedArray); this.current = requestedArray.elementType(); try { @@ -165,8 +169,10 @@ public Type map(Types.MapType map, Supplier keyResult, Supplier valu Preconditions.checkArgument(current instanceof MapType, "Not a map: %s", current); MapType requestedMap = (MapType) current; - Preconditions.checkArgument(requestedMap.valueContainsNull() || !map.isValueOptional(), - "Cannot project a map of optional values as required values: %s", map); + Preconditions.checkArgument( + requestedMap.valueContainsNull() || !map.isValueOptional(), + "Cannot project a map of optional values as required values: %s", + map); this.current = requestedMap.valueType(); try { @@ -188,19 +194,27 @@ public Type map(Types.MapType map, Supplier keyResult, Supplier valu @Override public Type primitive(Type.PrimitiveType primitive) { Class expectedType = TYPES.get(primitive.typeId()); - Preconditions.checkArgument(expectedType != null && expectedType.isInstance(current), - "Cannot project %s to incompatible type: %s", primitive, current); + Preconditions.checkArgument( + expectedType != null && expectedType.isInstance(current), + "Cannot project %s to incompatible type: %s", + primitive, + current); // additional checks based on type switch (primitive.typeId()) { case DECIMAL: Types.DecimalType decimal = (Types.DecimalType) primitive; DecimalType requestedDecimal = (DecimalType) current; - Preconditions.checkArgument(requestedDecimal.scale() == decimal.scale(), - "Cannot project decimal with incompatible scale: %s != %s", requestedDecimal.scale(), decimal.scale()); - Preconditions.checkArgument(requestedDecimal.precision() >= decimal.precision(), + Preconditions.checkArgument( + requestedDecimal.scale() == decimal.scale(), + "Cannot project decimal with incompatible scale: %s != %s", + requestedDecimal.scale(), + decimal.scale()); + Preconditions.checkArgument( + requestedDecimal.precision() >= decimal.precision(), "Cannot project decimal with incompatible precision: %s < %s", - requestedDecimal.precision(), decimal.precision()); + requestedDecimal.precision(), + decimal.precision()); break; default: } @@ -208,19 +222,19 @@ public Type 
primitive(Type.PrimitiveType primitive) { return primitive; } - private static final ImmutableMap> TYPES = ImmutableMap - .>builder() - .put(TypeID.BOOLEAN, BooleanType.class) - .put(TypeID.INTEGER, IntegerType.class) - .put(TypeID.LONG, LongType.class) - .put(TypeID.FLOAT, FloatType.class) - .put(TypeID.DOUBLE, DoubleType.class) - .put(TypeID.DATE, DateType.class) - .put(TypeID.TIMESTAMP, TimestampType.class) - .put(TypeID.DECIMAL, DecimalType.class) - .put(TypeID.UUID, StringType.class) - .put(TypeID.STRING, StringType.class) - .put(TypeID.FIXED, BinaryType.class) - .put(TypeID.BINARY, BinaryType.class) - .build(); + private static final ImmutableMap> TYPES = + ImmutableMap.>builder() + .put(TypeID.BOOLEAN, BooleanType.class) + .put(TypeID.INTEGER, IntegerType.class) + .put(TypeID.LONG, LongType.class) + .put(TypeID.FLOAT, FloatType.class) + .put(TypeID.DOUBLE, DoubleType.class) + .put(TypeID.DATE, DateType.class) + .put(TypeID.TIMESTAMP, TimestampType.class) + .put(TypeID.DECIMAL, DecimalType.class) + .put(TypeID.UUID, StringType.class) + .put(TypeID.STRING, StringType.class) + .put(TypeID.FIXED, BinaryType.class) + .put(TypeID.BINARY, BinaryType.class) + .build(); } diff --git a/spark/v3.0/spark/src/main/java/org/apache/iceberg/spark/RollbackStagedTable.java b/spark/v3.0/spark/src/main/java/org/apache/iceberg/spark/RollbackStagedTable.java index a27d06e7a1d7..bc8a966488ee 100644 --- a/spark/v3.0/spark/src/main/java/org/apache/iceberg/spark/RollbackStagedTable.java +++ b/spark/v3.0/spark/src/main/java/org/apache/iceberg/spark/RollbackStagedTable.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark; import java.util.Map; @@ -41,22 +40,25 @@ /** * An implementation of StagedTable that mimics the behavior of Spark's non-atomic CTAS and RTAS. - *
- * A Spark catalog can implement StagingTableCatalog to support atomic operations by producing StagedTable. But if a - * catalog implements StagingTableCatalog, Spark expects the catalog to be able to produce a StagedTable for any table - * loaded by the catalog. This assumption doesn't always work, as in the case of {@link SparkSessionCatalog}, which - * supports atomic operations can produce a StagedTable for Iceberg tables, but wraps the session catalog and cannot - * necessarily produce a working StagedTable implementation for tables that it loads. - * <p>
- * The work-around is this class, which implements the StagedTable interface but does not have atomic behavior. Instead, - * the StagedTable interface is used to implement the behavior of the non-atomic SQL plans that will create a table, - * write, and will drop the table to roll back. - * <p>
- * This StagedTable implements SupportsRead, SupportsWrite, and SupportsDelete by passing the calls to the real table. - * Implementing those interfaces is safe because Spark will not use them unless the table supports them and returns the - * corresponding capabilities from {@link #capabilities()}. + * + *
<p>A Spark catalog can implement StagingTableCatalog to support atomic operations by producing + * StagedTable. But if a catalog implements StagingTableCatalog, Spark expects the catalog to be + * able to produce a StagedTable for any table loaded by the catalog. This assumption doesn't always + * work, as in the case of {@link SparkSessionCatalog}, which supports atomic operations can produce + * a StagedTable for Iceberg tables, but wraps the session catalog and cannot necessarily produce a + * working StagedTable implementation for tables that it loads. + * + *
<p>The work-around is this class, which implements the StagedTable interface but does not have + * atomic behavior. Instead, the StagedTable interface is used to implement the behavior of the + * non-atomic SQL plans that will create a table, write, and will drop the table to roll back. + * + * <p>
    This StagedTable implements SupportsRead, SupportsWrite, and SupportsDelete by passing the + * calls to the real table. Implementing those interfaces is safe because Spark will not use them + * unless the table supports them and returns the corresponding capabilities from {@link + * #capabilities()}. */ -public class RollbackStagedTable implements StagedTable, SupportsRead, SupportsWrite, SupportsDelete { +public class RollbackStagedTable + implements StagedTable, SupportsRead, SupportsWrite, SupportsDelete { private final TableCatalog catalog; private final Identifier ident; private final Table table; @@ -119,19 +121,22 @@ public WriteBuilder newWriteBuilder(LogicalWriteInfo info) { } private void call(Class requiredClass, Consumer task) { - callReturning(requiredClass, inst -> { - task.accept(inst); - return null; - }); + callReturning( + requiredClass, + inst -> { + task.accept(inst); + return null; + }); } private R callReturning(Class requiredClass, Function task) { if (requiredClass.isInstance(table)) { return task.apply(requiredClass.cast(table)); } else { - throw new UnsupportedOperationException(String.format( - "Table does not implement %s: %s (%s)", - requiredClass.getSimpleName(), table.name(), table.getClass().getName())); + throw new UnsupportedOperationException( + String.format( + "Table does not implement %s: %s (%s)", + requiredClass.getSimpleName(), table.name(), table.getClass().getName())); } } } diff --git a/spark/v3.0/spark/src/main/java/org/apache/iceberg/spark/SortOrderToSpark.java b/spark/v3.0/spark/src/main/java/org/apache/iceberg/spark/SortOrderToSpark.java index 4f67be73405a..41251eb56a86 100644 --- a/spark/v3.0/spark/src/main/java/org/apache/iceberg/spark/SortOrderToSpark.java +++ b/spark/v3.0/spark/src/main/java/org/apache/iceberg/spark/SortOrderToSpark.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark; import org.apache.iceberg.NullOrder; @@ -30,12 +29,14 @@ public OrderField field(String sourceName, int id, SortDirection direction, Null } @Override - public OrderField bucket(String sourceName, int id, int width, SortDirection direction, NullOrder nullOrder) { + public OrderField bucket( + String sourceName, int id, int width, SortDirection direction, NullOrder nullOrder) { return OrderField.bucket(sourceName, width, direction, nullOrder); } @Override - public OrderField truncate(String sourceName, int id, int width, SortDirection direction, NullOrder nullOrder) { + public OrderField truncate( + String sourceName, int id, int width, SortDirection direction, NullOrder nullOrder) { return OrderField.truncate(sourceName, width, direction, nullOrder); } @@ -59,4 +60,3 @@ public OrderField hour(String sourceName, int id, SortDirection direction, NullO return OrderField.hour(sourceName, direction, nullOrder); } } - diff --git a/spark/v3.0/spark/src/main/java/org/apache/iceberg/spark/Spark3Util.java b/spark/v3.0/spark/src/main/java/org/apache/iceberg/spark/Spark3Util.java index c2c3c6992963..a5a58922c933 100644 --- a/spark/v3.0/spark/src/main/java/org/apache/iceberg/spark/Spark3Util.java +++ b/spark/v3.0/spark/src/main/java/org/apache/iceberg/spark/Spark3Util.java @@ -16,9 +16,12 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.spark; +import static org.apache.iceberg.TableProperties.WRITE_DISTRIBUTION_MODE; +import static org.apache.iceberg.TableProperties.WRITE_DISTRIBUTION_MODE_NONE; +import static org.apache.iceberg.TableProperties.WRITE_DISTRIBUTION_MODE_RANGE; + import java.nio.ByteBuffer; import java.util.Arrays; import java.util.Collections; @@ -94,18 +97,13 @@ import scala.collection.JavaConverters; import scala.collection.Seq; -import static org.apache.iceberg.TableProperties.WRITE_DISTRIBUTION_MODE; -import static org.apache.iceberg.TableProperties.WRITE_DISTRIBUTION_MODE_NONE; -import static org.apache.iceberg.TableProperties.WRITE_DISTRIBUTION_MODE_RANGE; - public class Spark3Util { - private static final Set RESERVED_PROPERTIES = ImmutableSet.of( - TableCatalog.PROP_LOCATION, TableCatalog.PROP_PROVIDER); + private static final Set RESERVED_PROPERTIES = + ImmutableSet.of(TableCatalog.PROP_LOCATION, TableCatalog.PROP_PROVIDER); private static final Joiner DOT = Joiner.on("."); - private Spark3Util() { - } + private Spark3Util() {} public static Map rebuildCreateProperties(Map createProperties) { ImmutableMap.Builder tableProperties = ImmutableMap.builder(); @@ -134,7 +132,8 @@ public static Map rebuildCreateProperties(Map cr * @param changes a list of Spark table changes * @return the UpdateProperties operation configured with the changes */ - public static UpdateProperties applyPropertyChanges(UpdateProperties pendingUpdate, List changes) { + public static UpdateProperties applyPropertyChanges( + UpdateProperties pendingUpdate, List changes) { for (TableChange change : changes) { if (change instanceof TableChange.SetProperty) { TableChange.SetProperty set = (TableChange.SetProperty) change; @@ -159,7 +158,8 @@ public static UpdateProperties applyPropertyChanges(UpdateProperties pendingUpda * @param changes a list of Spark table changes * @return the UpdateSchema operation configured with the changes */ - public static UpdateSchema applySchemaChanges(UpdateSchema pendingUpdate, List changes) { + public static UpdateSchema applySchemaChanges( + UpdateSchema pendingUpdate, List changes) { for (TableChange change : changes) { if (change instanceof TableChange.AddColumn) { apply(pendingUpdate, (TableChange.AddColumn) change); @@ -167,8 +167,11 @@ public static UpdateSchema applySchemaChanges(UpdateSchema pendingUpdate, List transforms = PartitionSpecVisitor.visit(spec, - new PartitionSpecVisitor() { - @Override - public Transform identity(String sourceName, int sourceId) { - return Expressions.identity(sourceName); - } - - @Override - public Transform bucket(String sourceName, int sourceId, int numBuckets) { - return Expressions.bucket(numBuckets, sourceName); - } - - @Override - public Transform truncate(String sourceName, int sourceId, int width) { - return Expressions.apply("truncate", Expressions.column(sourceName), Expressions.literal(width)); - } - - @Override - public Transform year(String sourceName, int sourceId) { - return Expressions.years(sourceName); - } - - @Override - public Transform month(String sourceName, int sourceId) { - return Expressions.months(sourceName); - } - - @Override - public Transform day(String sourceName, int sourceId) { - return Expressions.days(sourceName); - } - - @Override - public Transform hour(String sourceName, int sourceId) { - return Expressions.hours(sourceName); - } - - @Override - public Transform alwaysNull(int fieldId, String sourceName, int sourceId) { - // do nothing for alwaysNull, it doesn't need to be converted to a 
transform - return null; - } - - @Override - public Transform unknown(int fieldId, String sourceName, int sourceId, String transform) { - return Expressions.apply(transform, Expressions.column(sourceName)); - } - }); + List transforms = + PartitionSpecVisitor.visit( + spec, + new PartitionSpecVisitor() { + @Override + public Transform identity(String sourceName, int sourceId) { + return Expressions.identity(sourceName); + } + + @Override + public Transform bucket(String sourceName, int sourceId, int numBuckets) { + return Expressions.bucket(numBuckets, sourceName); + } + + @Override + public Transform truncate(String sourceName, int sourceId, int width) { + return Expressions.apply( + "truncate", Expressions.column(sourceName), Expressions.literal(width)); + } + + @Override + public Transform year(String sourceName, int sourceId) { + return Expressions.years(sourceName); + } + + @Override + public Transform month(String sourceName, int sourceId) { + return Expressions.months(sourceName); + } + + @Override + public Transform day(String sourceName, int sourceId) { + return Expressions.days(sourceName); + } + + @Override + public Transform hour(String sourceName, int sourceId) { + return Expressions.hours(sourceName); + } + + @Override + public Transform alwaysNull(int fieldId, String sourceName, int sourceId) { + // do nothing for alwaysNull, it doesn't need to be converted to a transform + return null; + } + + @Override + public Transform unknown( + int fieldId, String sourceName, int sourceId, String transform) { + return Expressions.apply(transform, Expressions.column(sourceName)); + } + }); return transforms.stream().filter(Objects::nonNull).toArray(Transform[]::new); } @@ -326,7 +340,8 @@ public static Distribution buildRequiredDistribution(org.apache.iceberg.Table ta } } - public static SortOrder[] buildRequiredOrdering(Distribution distribution, org.apache.iceberg.Table table) { + public static SortOrder[] buildRequiredOrdering( + Distribution distribution, org.apache.iceberg.Table table) { if (distribution instanceof OrderedDistribution) { OrderedDistribution orderedDistribution = (OrderedDistribution) distribution; return orderedDistribution.ordering(); @@ -338,7 +353,8 @@ public static SortOrder[] buildRequiredOrdering(Distribution distribution, org.a public static DistributionMode distributionModeFor(org.apache.iceberg.Table table) { boolean isSortedTable = !table.sortOrder().isUnsorted(); - String defaultModeName = isSortedTable ? WRITE_DISTRIBUTION_MODE_RANGE : WRITE_DISTRIBUTION_MODE_NONE; + String defaultModeName = + isSortedTable ? 
WRITE_DISTRIBUTION_MODE_RANGE : WRITE_DISTRIBUTION_MODE_NONE; String modeName = table.properties().getOrDefault(WRITE_DISTRIBUTION_MODE, defaultModeName); return DistributionMode.fromName(modeName); } @@ -349,8 +365,10 @@ public static SortOrder[] convert(org.apache.iceberg.SortOrder sortOrder) { } public static Term toIcebergTerm(Transform transform) { - Preconditions.checkArgument(transform.references().length == 1, - "Cannot convert transform with more than one column reference: %s", transform); + Preconditions.checkArgument( + transform.references().length == 1, + "Cannot convert transform with more than one column reference: %s", + transform); String colName = DOT.join(transform.references()[0].fieldNames()); switch (transform.name().toLowerCase(Locale.ROOT)) { case "identity": @@ -388,8 +406,10 @@ public static PartitionSpec toPartitionSpec(Schema schema, Transform[] partition PartitionSpec.Builder builder = PartitionSpec.builderFor(schema); for (Transform transform : partitioning) { - Preconditions.checkArgument(transform.references().length == 1, - "Cannot convert transform with more than one column reference: %s", transform); + Preconditions.checkArgument( + transform.references().length == 1, + "Cannot convert transform with more than one column reference: %s", + transform); String colName = DOT.join(transform.references()[0].fieldNames()); switch (transform.name().toLowerCase(Locale.ROOT)) { case "identity": @@ -429,14 +449,16 @@ private static int findWidth(Transform transform) { if (expr instanceof Literal) { if (((Literal) expr).dataType() instanceof IntegerType) { Literal lit = (Literal) expr; - Preconditions.checkArgument(lit.value() > 0, - "Unsupported width for transform: %s", transform.describe()); + Preconditions.checkArgument( + lit.value() > 0, "Unsupported width for transform: %s", transform.describe()); return lit.value(); } else if (((Literal) expr).dataType() instanceof LongType) { Literal lit = (Literal) expr; - Preconditions.checkArgument(lit.value() > 0 && lit.value() < Integer.MAX_VALUE, - "Unsupported width for transform: %s", transform.describe()); + Preconditions.checkArgument( + lit.value() > 0 && lit.value() < Integer.MAX_VALUE, + "Unsupported width for transform: %s", + transform.describe()); if (lit.value() > Integer.MAX_VALUE) { throw new IllegalArgumentException(); } @@ -449,7 +471,8 @@ private static int findWidth(Transform transform) { } private static String leafName(String[] fieldNames) { - Preconditions.checkArgument(fieldNames.length > 0, "Invalid field name: at least one name is required"); + Preconditions.checkArgument( + fieldNames.length > 0, "Invalid field name: at least one name is required"); return fieldNames[fieldNames.length - 1]; } @@ -485,7 +508,8 @@ public static String describe(org.apache.iceberg.SortOrder order) { return Joiner.on(", ").join(SortOrderVisitor.visit(order, DescribeSortOrderVisitor.INSTANCE)); } - public static Long propertyAsLong(CaseInsensitiveStringMap options, String property, Long defaultValue) { + public static Long propertyAsLong( + CaseInsensitiveStringMap options, String property, Long defaultValue) { if (defaultValue != null) { return options.getLong(property, defaultValue); } @@ -498,7 +522,8 @@ public static Long propertyAsLong(CaseInsensitiveStringMap options, String prope return null; } - public static Integer propertyAsInt(CaseInsensitiveStringMap options, String property, Integer defaultValue) { + public static Integer propertyAsInt( + CaseInsensitiveStringMap options, String property, Integer 
defaultValue) { if (defaultValue != null) { return options.getInt(property, defaultValue); } @@ -511,7 +536,8 @@ public static Integer propertyAsInt(CaseInsensitiveStringMap options, String pro return null; } - public static Boolean propertyAsBoolean(CaseInsensitiveStringMap options, String property, Boolean defaultValue) { + public static Boolean propertyAsBoolean( + CaseInsensitiveStringMap options, String property, Boolean defaultValue) { if (defaultValue != null) { return options.getBoolean(property, defaultValue); } @@ -528,8 +554,7 @@ public static class DescribeSchemaVisitor extends TypeUtil.SchemaVisitor private static final Joiner COMMA = Joiner.on(','); private static final DescribeSchemaVisitor INSTANCE = new DescribeSchemaVisitor(); - private DescribeSchemaVisitor() { - } + private DescribeSchemaVisitor() {} @Override public String schema(Schema schema, String structResult) { @@ -589,11 +614,11 @@ public String primitive(Type.PrimitiveType primitive) { } } - private static class DescribeExpressionVisitor extends ExpressionVisitors.ExpressionVisitor { + private static class DescribeExpressionVisitor + extends ExpressionVisitors.ExpressionVisitor { private static final DescribeExpressionVisitor INSTANCE = new DescribeExpressionVisitor(); - private DescribeExpressionVisitor() { - } + private DescribeExpressionVisitor() {} @Override public String alwaysTrue() { @@ -662,7 +687,9 @@ public String predicate(UnboundPredicate pred) { } private static String sqlString(List> literals) { - return literals.stream().map(DescribeExpressionVisitor::sqlString).collect(Collectors.joining(", ")); + return literals.stream() + .map(DescribeExpressionVisitor::sqlString) + .collect(Collectors.joining(", ")); } private static String sqlString(org.apache.iceberg.expressions.Literal lit) { @@ -684,18 +711,21 @@ private static String sqlString(org.apache.iceberg.expressions.Literal lit) { * @param type the type of metadata table * @return a Dataset that will read the metadata table */ - private static Dataset loadMetadataTable(SparkSession spark, org.apache.iceberg.Table table, - MetadataTableType type) { - Table metadataTable = new SparkTable(MetadataTableUtils.createMetadataTableInstance(table, type), false); - return Dataset.ofRows(spark, DataSourceV2Relation.create(metadataTable, Some.empty(), Some.empty())); + private static Dataset loadMetadataTable( + SparkSession spark, org.apache.iceberg.Table table, MetadataTableType type) { + Table metadataTable = + new SparkTable(MetadataTableUtils.createMetadataTableInstance(table, type), false); + return Dataset.ofRows( + spark, DataSourceV2Relation.create(metadataTable, Some.empty(), Some.empty())); } /** - * Returns an Iceberg Table by its name from a Spark V2 Catalog. If cache is enabled in {@link SparkCatalog}, - * the {@link TableOperations} of the table may be stale, please refresh the table to get the latest one. + * Returns an Iceberg Table by its name from a Spark V2 Catalog. If cache is enabled in {@link + * SparkCatalog}, the {@link TableOperations} of the table may be stale, please refresh the table + * to get the latest one. 
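For context, a minimal sketch of how the helper documented above is typically called from user code; the table name "db.events", the local master, and the session setup are illustrative assumptions, not part of this change:

// Illustrative usage sketch (assumed names), not part of the diff.
import org.apache.iceberg.Table;
import org.apache.iceberg.spark.Spark3Util;
import org.apache.spark.sql.SparkSession;

public class LoadIcebergTableExample {
  public static void main(String[] args) throws Exception {
    // Local master only so the sketch runs standalone.
    SparkSession spark = SparkSession.builder().master("local[*]").getOrCreate();
    // Resolves the multipart name against the session's configured Spark V2 catalogs.
    Table table = Spark3Util.loadIcebergTable(spark, "db.events");
    // If catalog caching is enabled, refresh to avoid acting on stale TableOperations.
    table.refresh();
    System.out.println(table.currentSnapshot());
  }
}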
* * @param spark SparkSession used for looking up catalog references and tables - * @param name The multipart identifier of the Iceberg table + * @param name The multipart identifier of the Iceberg table * @return an Iceberg table */ public static org.apache.iceberg.Table loadIcebergTable(SparkSession spark, String name) @@ -707,24 +737,28 @@ public static org.apache.iceberg.Table loadIcebergTable(SparkSession spark, Stri return toIcebergTable(sparkTable); } - public static CatalogAndIdentifier catalogAndIdentifier(SparkSession spark, String name) throws ParseException { - return catalogAndIdentifier(spark, name, spark.sessionState().catalogManager().currentCatalog()); + public static CatalogAndIdentifier catalogAndIdentifier(SparkSession spark, String name) + throws ParseException { + return catalogAndIdentifier( + spark, name, spark.sessionState().catalogManager().currentCatalog()); } - public static CatalogAndIdentifier catalogAndIdentifier(SparkSession spark, String name, - CatalogPlugin defaultCatalog) throws ParseException { + public static CatalogAndIdentifier catalogAndIdentifier( + SparkSession spark, String name, CatalogPlugin defaultCatalog) throws ParseException { ParserInterface parser = spark.sessionState().sqlParser(); Seq multiPartIdentifier = parser.parseMultipartIdentifier(name); List javaMultiPartIdentifier = JavaConverters.seqAsJavaList(multiPartIdentifier); return catalogAndIdentifier(spark, javaMultiPartIdentifier, defaultCatalog); } - public static CatalogAndIdentifier catalogAndIdentifier(String description, SparkSession spark, String name) { - return catalogAndIdentifier(description, spark, name, spark.sessionState().catalogManager().currentCatalog()); + public static CatalogAndIdentifier catalogAndIdentifier( + String description, SparkSession spark, String name) { + return catalogAndIdentifier( + description, spark, name, spark.sessionState().catalogManager().currentCatalog()); } - public static CatalogAndIdentifier catalogAndIdentifier(String description, SparkSession spark, - String name, CatalogPlugin defaultCatalog) { + public static CatalogAndIdentifier catalogAndIdentifier( + String description, SparkSession spark, String name, CatalogPlugin defaultCatalog) { try { return catalogAndIdentifier(spark, name, defaultCatalog); } catch (ParseException e) { @@ -732,20 +766,23 @@ public static CatalogAndIdentifier catalogAndIdentifier(String description, Spar } } - public static CatalogAndIdentifier catalogAndIdentifier(SparkSession spark, List nameParts) { - return catalogAndIdentifier(spark, nameParts, spark.sessionState().catalogManager().currentCatalog()); + public static CatalogAndIdentifier catalogAndIdentifier( + SparkSession spark, List nameParts) { + return catalogAndIdentifier( + spark, nameParts, spark.sessionState().catalogManager().currentCatalog()); } /** - * A modified version of Spark's LookupCatalog.CatalogAndIdentifier.unapply - * Attempts to find the catalog and identifier a multipart identifier represents + * A modified version of Spark's LookupCatalog.CatalogAndIdentifier.unapply Attempts to find the + * catalog and identifier a multipart identifier represents + * * @param spark Spark session to use for resolution * @param nameParts Multipart identifier representing a table * @param defaultCatalog Catalog to use if none is specified * @return The CatalogPlugin and Identifier for the table */ - public static CatalogAndIdentifier catalogAndIdentifier(SparkSession spark, List nameParts, - CatalogPlugin defaultCatalog) { + public static 
CatalogAndIdentifier catalogAndIdentifier( + SparkSession spark, List nameParts, CatalogPlugin defaultCatalog) { CatalogManager catalogManager = spark.sessionState().catalogManager(); String[] currentNamespace; @@ -755,18 +792,19 @@ public static CatalogAndIdentifier catalogAndIdentifier(SparkSession spark, List currentNamespace = defaultCatalog.defaultNamespace(); } - Pair catalogIdentifier = SparkUtil.catalogAndIdentifier(nameParts, - catalogName -> { - try { - return catalogManager.catalog(catalogName); - } catch (Exception e) { - return null; - } - }, - Identifier::of, - defaultCatalog, - currentNamespace - ); + Pair catalogIdentifier = + SparkUtil.catalogAndIdentifier( + nameParts, + catalogName -> { + try { + return catalogManager.catalog(catalogName); + } catch (Exception e) { + return null; + } + }, + Identifier::of, + defaultCatalog, + currentNamespace); return new CatalogAndIdentifier(catalogIdentifier); } @@ -775,18 +813,17 @@ private static TableCatalog asTableCatalog(CatalogPlugin catalog) { return (TableCatalog) catalog; } - throw new IllegalArgumentException(String.format( - "Cannot use catalog %s(%s): not a TableCatalog", catalog.name(), catalog.getClass().getName())); + throw new IllegalArgumentException( + String.format( + "Cannot use catalog %s(%s): not a TableCatalog", + catalog.name(), catalog.getClass().getName())); } - /** - * This mimics a class inside of Spark which is private inside of LookupCatalog. - */ + /** This mimics a class inside of Spark which is private inside of LookupCatalog. */ public static class CatalogAndIdentifier { private final CatalogPlugin catalog; private final Identifier identifier; - public CatalogAndIdentifier(CatalogPlugin catalog, Identifier identifier) { this.catalog = catalog; this.identifier = identifier; @@ -818,49 +855,53 @@ public static TableIdentifier identifierToTableIdentifier(Identifier identifier) * @param format format of the file * @return all table's partitions */ - public static List getPartitions(SparkSession spark, Path rootPath, String format) { + public static List getPartitions( + SparkSession spark, Path rootPath, String format) { FileStatusCache fileStatusCache = FileStatusCache.getOrCreate(spark); Map emptyMap = Collections.emptyMap(); - InMemoryFileIndex fileIndex = new InMemoryFileIndex( - spark, - JavaConverters - .collectionAsScalaIterableConverter(ImmutableList.of(rootPath)) - .asScala() - .toSeq(), - JavaConverters - .mapAsScalaMapConverter(emptyMap) - .asScala() - .toMap(Predef.conforms()), - Option.empty(), - fileStatusCache, - Option.empty(), - Option.empty()); + InMemoryFileIndex fileIndex = + new InMemoryFileIndex( + spark, + JavaConverters.collectionAsScalaIterableConverter(ImmutableList.of(rootPath)) + .asScala() + .toSeq(), + JavaConverters.mapAsScalaMapConverter(emptyMap).asScala().toMap(Predef.conforms()), + Option.empty(), + fileStatusCache, + Option.empty(), + Option.empty()); org.apache.spark.sql.execution.datasources.PartitionSpec spec = fileIndex.partitionSpec(); StructType schema = spec.partitionColumns(); - return JavaConverters - .seqAsJavaListConverter(spec.partitions()) - .asJava() - .stream() - .map(partition -> { - Map values = Maps.newHashMap(); - JavaConverters.asJavaIterableConverter(schema).asJava().forEach(field -> { - int fieldIndex = schema.fieldIndex(field.name()); - Object catalystValue = partition.values().get(fieldIndex, field.dataType()); - Object value = CatalystTypeConverters.convertToScala(catalystValue, field.dataType()); - values.put(field.name(), 
String.valueOf(value)); - }); - return new SparkPartition(values, partition.path().toString(), format); - }).collect(Collectors.toList()); - } - - public static org.apache.spark.sql.catalyst.TableIdentifier toV1TableIdentifier(Identifier identifier) { + return JavaConverters.seqAsJavaListConverter(spec.partitions()).asJava().stream() + .map( + partition -> { + Map values = Maps.newHashMap(); + JavaConverters.asJavaIterableConverter(schema) + .asJava() + .forEach( + field -> { + int fieldIndex = schema.fieldIndex(field.name()); + Object catalystValue = partition.values().get(fieldIndex, field.dataType()); + Object value = + CatalystTypeConverters.convertToScala(catalystValue, field.dataType()); + values.put(field.name(), String.valueOf(value)); + }); + return new SparkPartition(values, partition.path().toString(), format); + }) + .collect(Collectors.toList()); + } + + public static org.apache.spark.sql.catalyst.TableIdentifier toV1TableIdentifier( + Identifier identifier) { String[] namespace = identifier.namespace(); - Preconditions.checkArgument(namespace.length <= 1, - "Cannot convert %s to a Spark v1 identifier, namespace contains more than 1 part", identifier); + Preconditions.checkArgument( + namespace.length <= 1, + "Cannot convert %s to a Spark v1 identifier, namespace contains more than 1 part", + identifier); String table = identifier.name(); Option database = namespace.length == 1 ? Option.apply(namespace[0]) : Option.empty(); @@ -870,54 +911,80 @@ public static org.apache.spark.sql.catalyst.TableIdentifier toV1TableIdentifier( private static class DescribeSortOrderVisitor implements SortOrderVisitor { private static final DescribeSortOrderVisitor INSTANCE = new DescribeSortOrderVisitor(); - private DescribeSortOrderVisitor() { - } + private DescribeSortOrderVisitor() {} @Override - public String field(String sourceName, int sourceId, - org.apache.iceberg.SortDirection direction, NullOrder nullOrder) { + public String field( + String sourceName, + int sourceId, + org.apache.iceberg.SortDirection direction, + NullOrder nullOrder) { return String.format("%s %s %s", sourceName, direction, nullOrder); } @Override - public String bucket(String sourceName, int sourceId, int numBuckets, - org.apache.iceberg.SortDirection direction, NullOrder nullOrder) { + public String bucket( + String sourceName, + int sourceId, + int numBuckets, + org.apache.iceberg.SortDirection direction, + NullOrder nullOrder) { return String.format("bucket(%s, %s) %s %s", numBuckets, sourceName, direction, nullOrder); } @Override - public String truncate(String sourceName, int sourceId, int width, - org.apache.iceberg.SortDirection direction, NullOrder nullOrder) { + public String truncate( + String sourceName, + int sourceId, + int width, + org.apache.iceberg.SortDirection direction, + NullOrder nullOrder) { return String.format("truncate(%s, %s) %s %s", sourceName, width, direction, nullOrder); } @Override - public String year(String sourceName, int sourceId, - org.apache.iceberg.SortDirection direction, NullOrder nullOrder) { + public String year( + String sourceName, + int sourceId, + org.apache.iceberg.SortDirection direction, + NullOrder nullOrder) { return String.format("years(%s) %s %s", sourceName, direction, nullOrder); } @Override - public String month(String sourceName, int sourceId, - org.apache.iceberg.SortDirection direction, NullOrder nullOrder) { + public String month( + String sourceName, + int sourceId, + org.apache.iceberg.SortDirection direction, + NullOrder nullOrder) { return 
String.format("months(%s) %s %s", sourceName, direction, nullOrder); } @Override - public String day(String sourceName, int sourceId, - org.apache.iceberg.SortDirection direction, NullOrder nullOrder) { + public String day( + String sourceName, + int sourceId, + org.apache.iceberg.SortDirection direction, + NullOrder nullOrder) { return String.format("days(%s) %s %s", sourceName, direction, nullOrder); } @Override - public String hour(String sourceName, int sourceId, - org.apache.iceberg.SortDirection direction, NullOrder nullOrder) { + public String hour( + String sourceName, + int sourceId, + org.apache.iceberg.SortDirection direction, + NullOrder nullOrder) { return String.format("hours(%s) %s %s", sourceName, direction, nullOrder); } @Override - public String unknown(String sourceName, int sourceId, String transform, - org.apache.iceberg.SortDirection direction, NullOrder nullOrder) { + public String unknown( + String sourceName, + int sourceId, + String transform, + org.apache.iceberg.SortDirection direction, + NullOrder nullOrder) { return String.format("%s(%s) %s %s", transform, sourceName, direction, nullOrder); } } diff --git a/spark/v3.0/spark/src/main/java/org/apache/iceberg/spark/Spark3VersionUtil.java b/spark/v3.0/spark/src/main/java/org/apache/iceberg/spark/Spark3VersionUtil.java index 984c66d22a82..526b970e8f97 100644 --- a/spark/v3.0/spark/src/main/java/org/apache/iceberg/spark/Spark3VersionUtil.java +++ b/spark/v3.0/spark/src/main/java/org/apache/iceberg/spark/Spark3VersionUtil.java @@ -16,15 +16,13 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark; import org.apache.spark.package$; public class Spark3VersionUtil { - private Spark3VersionUtil() { - } + private Spark3VersionUtil() {} public static boolean isSpark30() { return package$.MODULE$.SPARK_VERSION_SHORT().startsWith("3.0"); diff --git a/spark/v3.0/spark/src/main/java/org/apache/iceberg/spark/SparkCatalog.java b/spark/v3.0/spark/src/main/java/org/apache/iceberg/spark/SparkCatalog.java index 7bac3dd337a0..26191f8c8ee4 100644 --- a/spark/v3.0/spark/src/main/java/org/apache/iceberg/spark/SparkCatalog.java +++ b/spark/v3.0/spark/src/main/java/org/apache/iceberg/spark/SparkCatalog.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark; import java.util.Arrays; @@ -73,21 +72,22 @@ /** * A Spark TableCatalog implementation that wraps an Iceberg {@link Catalog}. - *
- * This supports the following catalog configuration options:
+ *
+ * <p>This supports the following catalog configuration options:
+ *
  * <ul>
- *   <li>type - catalog type, "hive" or "hadoop".
- *     To specify a non-hive or hadoop catalog, use the catalog-impl option.
- *   </li>
- *   <li>uri - the Hive Metastore URI (Hive catalog only)</li>
- *   <li>warehouse - the warehouse path (Hadoop catalog only)</li>
- *   <li>catalog-impl - a custom {@link Catalog} implementation to use</li>
- *   <li>default-namespace - a namespace to use as the default</li>
- *   <li>cache-enabled - whether to enable catalog cache</li>
- *   <li>cache.expiration-interval-ms - interval in millis before expiring tables from catalog cache.
- *     Refer to {@link CatalogProperties#CACHE_EXPIRATION_INTERVAL_MS} for further details and significant values.
- *   </li>
+ *   <li>type - catalog type, "hive" or "hadoop". To specify a non-hive or hadoop
+ *       catalog, use the catalog-impl option.
+ *   <li>uri - the Hive Metastore URI (Hive catalog only)
+ *   <li>warehouse - the warehouse path (Hadoop catalog only)
+ *   <li>catalog-impl - a custom {@link Catalog} implementation to use
+ *   <li>default-namespace - a namespace to use as the default
+ *   <li>cache-enabled - whether to enable catalog cache
+ *   <li>cache.expiration-interval-ms - interval in millis before expiring tables from
+ *       catalog cache. Refer to {@link CatalogProperties#CACHE_EXPIRATION_INTERVAL_MS} for further
+ *       details and significant values.
  * </ul>
+ *
  * <p>
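As a side note, a minimal sketch of wiring the options listed above into a Spark session; the catalog name "my_catalog", the local master, and the warehouse path are made-up values used only for illustration:

// Illustrative configuration sketch (assumed catalog name and paths), not part of the diff.
import org.apache.spark.sql.SparkSession;

public class SparkCatalogConfigExample {
  public static void main(String[] args) {
    SparkSession spark =
        SparkSession.builder()
            .master("local[*]") // local master only so the sketch runs standalone
            .config("spark.sql.catalog.my_catalog", "org.apache.iceberg.spark.SparkCatalog")
            .config("spark.sql.catalog.my_catalog.type", "hadoop")
            .config("spark.sql.catalog.my_catalog.warehouse", "hdfs://nn:8020/warehouse/path")
            .config("spark.sql.catalog.my_catalog.default-namespace", "db")
            .config("spark.sql.catalog.my_catalog.cache-enabled", "false")
            .getOrCreate();

    // The catalog can then be addressed by name in SQL, e.g. my_catalog.db.table.
    spark.sql("SHOW NAMESPACES IN my_catalog").show();
  }
}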

    */ public class SparkCatalog extends BaseCatalog { @@ -141,17 +141,18 @@ public SparkTable loadTable(Identifier ident) throws NoSuchTableException { } @Override - public SparkTable createTable(Identifier ident, StructType schema, - Transform[] transforms, - Map properties) throws TableAlreadyExistsException { + public SparkTable createTable( + Identifier ident, StructType schema, Transform[] transforms, Map properties) + throws TableAlreadyExistsException { Schema icebergSchema = SparkSchemaUtil.convert(schema, useTimestampsWithoutZone); try { Catalog.TableBuilder builder = newBuilder(ident, icebergSchema); - Table icebergTable = builder - .withPartitionSpec(Spark3Util.toPartitionSpec(icebergSchema, transforms)) - .withLocation(properties.get("location")) - .withProperties(Spark3Util.rebuildCreateProperties(properties)) - .create(); + Table icebergTable = + builder + .withPartitionSpec(Spark3Util.toPartitionSpec(icebergSchema, transforms)) + .withLocation(properties.get("location")) + .withProperties(Spark3Util.rebuildCreateProperties(properties)) + .create(); return new SparkTable(icebergTable, !cacheEnabled); } catch (AlreadyExistsException e) { throw new TableAlreadyExistsException(ident); @@ -159,15 +160,18 @@ public SparkTable createTable(Identifier ident, StructType schema, } @Override - public StagedTable stageCreate(Identifier ident, StructType schema, Transform[] transforms, - Map properties) throws TableAlreadyExistsException { + public StagedTable stageCreate( + Identifier ident, StructType schema, Transform[] transforms, Map properties) + throws TableAlreadyExistsException { Schema icebergSchema = SparkSchemaUtil.convert(schema, useTimestampsWithoutZone); try { Catalog.TableBuilder builder = newBuilder(ident, icebergSchema); - Transaction transaction = builder.withPartitionSpec(Spark3Util.toPartitionSpec(icebergSchema, transforms)) - .withLocation(properties.get("location")) - .withProperties(Spark3Util.rebuildCreateProperties(properties)) - .createTransaction(); + Transaction transaction = + builder + .withPartitionSpec(Spark3Util.toPartitionSpec(icebergSchema, transforms)) + .withLocation(properties.get("location")) + .withProperties(Spark3Util.rebuildCreateProperties(properties)) + .createTransaction(); return new StagedSparkTable(transaction); } catch (AlreadyExistsException e) { throw new TableAlreadyExistsException(ident); @@ -175,15 +179,18 @@ public StagedTable stageCreate(Identifier ident, StructType schema, Transform[] } @Override - public StagedTable stageReplace(Identifier ident, StructType schema, Transform[] transforms, - Map properties) throws NoSuchTableException { + public StagedTable stageReplace( + Identifier ident, StructType schema, Transform[] transforms, Map properties) + throws NoSuchTableException { Schema icebergSchema = SparkSchemaUtil.convert(schema, useTimestampsWithoutZone); try { Catalog.TableBuilder builder = newBuilder(ident, icebergSchema); - Transaction transaction = builder.withPartitionSpec(Spark3Util.toPartitionSpec(icebergSchema, transforms)) - .withLocation(properties.get("location")) - .withProperties(Spark3Util.rebuildCreateProperties(properties)) - .replaceTransaction(); + Transaction transaction = + builder + .withPartitionSpec(Spark3Util.toPartitionSpec(icebergSchema, transforms)) + .withLocation(properties.get("location")) + .withProperties(Spark3Util.rebuildCreateProperties(properties)) + .replaceTransaction(); return new StagedSparkTable(transaction); } catch (org.apache.iceberg.exceptions.NoSuchTableException e) { throw new 
NoSuchTableException(ident); @@ -191,19 +198,22 @@ public StagedTable stageReplace(Identifier ident, StructType schema, Transform[] } @Override - public StagedTable stageCreateOrReplace(Identifier ident, StructType schema, Transform[] transforms, - Map properties) { + public StagedTable stageCreateOrReplace( + Identifier ident, StructType schema, Transform[] transforms, Map properties) { Schema icebergSchema = SparkSchemaUtil.convert(schema, useTimestampsWithoutZone); Catalog.TableBuilder builder = newBuilder(ident, icebergSchema); - Transaction transaction = builder.withPartitionSpec(Spark3Util.toPartitionSpec(icebergSchema, transforms)) - .withLocation(properties.get("location")) - .withProperties(Spark3Util.rebuildCreateProperties(properties)) - .createOrReplaceTransaction(); + Transaction transaction = + builder + .withPartitionSpec(Spark3Util.toPartitionSpec(icebergSchema, transforms)) + .withLocation(properties.get("location")) + .withProperties(Spark3Util.rebuildCreateProperties(properties)) + .createOrReplaceTransaction(); return new StagedSparkTable(transaction); } @Override - public SparkTable alterTable(Identifier ident, TableChange... changes) throws NoSuchTableException { + public SparkTable alterTable(Identifier ident, TableChange... changes) + throws NoSuchTableException { SetProperty setLocation = null; SetProperty setSnapshotId = null; SetProperty pickSnapshotId = null; @@ -220,8 +230,9 @@ public SparkTable alterTable(Identifier ident, TableChange... changes) throws No } else if ("cherry-pick-snapshot-id".equalsIgnoreCase(set.property())) { pickSnapshotId = set; } else if ("sort-order".equalsIgnoreCase(set.property())) { - throw new UnsupportedOperationException("Cannot specify the 'sort-order' because it's a reserved table " + - "property. Please use the command 'ALTER TABLE ... WRITE ORDERED BY' to specify write sort-orders."); + throw new UnsupportedOperationException( + "Cannot specify the 'sort-order' because it's a reserved table " + + "property. Please use the command 'ALTER TABLE ... WRITE ORDERED BY' to specify write sort-orders."); } else { propertyChanges.add(set); } @@ -236,7 +247,8 @@ public SparkTable alterTable(Identifier ident, TableChange... changes) throws No try { Table table = load(ident).first(); - commitChanges(table, setLocation, setSnapshotId, pickSnapshotId, propertyChanges, schemaChanges); + commitChanges( + table, setLocation, setSnapshotId, pickSnapshotId, propertyChanges, schemaChanges); return new SparkTable(table, true /* refreshEagerly */); } catch (org.apache.iceberg.exceptions.NoSuchTableException e) { throw new NoSuchTableException(ident); @@ -246,16 +258,17 @@ public SparkTable alterTable(Identifier ident, TableChange... changes) throws No @Override public boolean dropTable(Identifier ident) { try { - return isPathIdentifier(ident) ? - tables.dropTable(((PathIdentifier) ident).location()) : - icebergCatalog.dropTable(buildIdentifier(ident)); + return isPathIdentifier(ident) + ? 
tables.dropTable(((PathIdentifier) ident).location()) + : icebergCatalog.dropTable(buildIdentifier(ident)); } catch (org.apache.iceberg.exceptions.NoSuchTableException e) { return false; } } @Override - public void renameTable(Identifier from, Identifier to) throws NoSuchTableException, TableAlreadyExistsException { + public void renameTable(Identifier from, Identifier to) + throws NoSuchTableException, TableAlreadyExistsException { try { checkNotPathIdentifier(from, "renameTable"); checkNotPathIdentifier(to, "renameTable"); @@ -317,7 +330,8 @@ public String[][] listNamespaces(String[] namespace) throws NoSuchNamespaceExcep } @Override - public Map loadNamespaceMetadata(String[] namespace) throws NoSuchNamespaceException { + public Map loadNamespaceMetadata(String[] namespace) + throws NoSuchNamespaceException { if (asNamespaceCatalog != null) { try { return asNamespaceCatalog.loadNamespaceMetadata(Namespace.of(namespace)); @@ -330,10 +344,12 @@ public Map loadNamespaceMetadata(String[] namespace) throws NoSu } @Override - public void createNamespace(String[] namespace, Map metadata) throws NamespaceAlreadyExistsException { + public void createNamespace(String[] namespace, Map metadata) + throws NamespaceAlreadyExistsException { if (asNamespaceCatalog != null) { try { - if (asNamespaceCatalog instanceof HadoopCatalog && DEFAULT_NS_KEYS.equals(metadata.keySet())) { + if (asNamespaceCatalog instanceof HadoopCatalog + && DEFAULT_NS_KEYS.equals(metadata.keySet())) { // Hadoop catalog will reject metadata properties, but Spark automatically adds "owner". // If only the automatic properties are present, replace metadata with an empty map. asNamespaceCatalog.createNamespace(Namespace.of(namespace), ImmutableMap.of()); @@ -344,12 +360,14 @@ public void createNamespace(String[] namespace, Map metadata) th throw new NamespaceAlreadyExistsException(namespace); } } else { - throw new UnsupportedOperationException("Namespaces are not supported by catalog: " + catalogName); + throw new UnsupportedOperationException( + "Namespaces are not supported by catalog: " + catalogName); } } @Override - public void alterNamespace(String[] namespace, NamespaceChange... changes) throws NoSuchNamespaceException { + public void alterNamespace(String[] namespace, NamespaceChange... changes) + throws NoSuchNamespaceException { if (asNamespaceCatalog != null) { Map updates = Maps.newHashMap(); Set removals = Sets.newHashSet(); @@ -360,7 +378,8 @@ public void alterNamespace(String[] namespace, NamespaceChange... 
changes) throw } else if (change instanceof NamespaceChange.RemoveProperty) { removals.add(((NamespaceChange.RemoveProperty) change).property()); } else { - throw new UnsupportedOperationException("Cannot apply unknown namespace change: " + change); + throw new UnsupportedOperationException( + "Cannot apply unknown namespace change: " + change); } } @@ -396,12 +415,15 @@ public boolean dropNamespace(String[] namespace) throws NoSuchNamespaceException @Override public final void initialize(String name, CaseInsensitiveStringMap options) { - this.cacheEnabled = PropertyUtil.propertyAsBoolean(options, - CatalogProperties.CACHE_ENABLED, CatalogProperties.CACHE_ENABLED_DEFAULT); + this.cacheEnabled = + PropertyUtil.propertyAsBoolean( + options, CatalogProperties.CACHE_ENABLED, CatalogProperties.CACHE_ENABLED_DEFAULT); - long cacheExpirationIntervalMs = PropertyUtil.propertyAsLong(options, - CatalogProperties.CACHE_EXPIRATION_INTERVAL_MS, - CatalogProperties.CACHE_EXPIRATION_INTERVAL_MS_DEFAULT); + long cacheExpirationIntervalMs = + PropertyUtil.propertyAsLong( + options, + CatalogProperties.CACHE_EXPIRATION_INTERVAL_MS, + CatalogProperties.CACHE_EXPIRATION_INTERVAL_MS_DEFAULT); // An expiration interval of 0ms effectively disables caching. // Do not wrap with CachingCatalog. @@ -413,15 +435,17 @@ public final void initialize(String name, CaseInsensitiveStringMap options) { this.catalogName = name; SparkSession sparkSession = SparkSession.active(); - this.useTimestampsWithoutZone = SparkUtil.useTimestampWithoutZoneInNewTables(sparkSession.conf()); - this.tables = new HadoopTables(SparkUtil.hadoopConfCatalogOverrides(SparkSession.active(), name)); - this.icebergCatalog = cacheEnabled ? CachingCatalog.wrap(catalog, cacheExpirationIntervalMs) : catalog; + this.useTimestampsWithoutZone = + SparkUtil.useTimestampWithoutZoneInNewTables(sparkSession.conf()); + this.tables = + new HadoopTables(SparkUtil.hadoopConfCatalogOverrides(SparkSession.active(), name)); + this.icebergCatalog = + cacheEnabled ? 
CachingCatalog.wrap(catalog, cacheExpirationIntervalMs) : catalog; if (catalog instanceof SupportsNamespaces) { this.asNamespaceCatalog = (SupportsNamespaces) catalog; if (options.containsKey("default-namespace")) { - this.defaultNamespace = Splitter.on('.') - .splitToList(options.get("default-namespace")) - .toArray(new String[0]); + this.defaultNamespace = + Splitter.on('.').splitToList(options.get("default-namespace")).toArray(new String[0]); } } } @@ -431,12 +455,18 @@ public String name() { return catalogName; } - private static void commitChanges(Table table, SetProperty setLocation, SetProperty setSnapshotId, - SetProperty pickSnapshotId, List propertyChanges, - List schemaChanges) { - // don't allow setting the snapshot and picking a commit at the same time because order is ambiguous and choosing + private static void commitChanges( + Table table, + SetProperty setLocation, + SetProperty setSnapshotId, + SetProperty pickSnapshotId, + List propertyChanges, + List schemaChanges) { + // don't allow setting the snapshot and picking a commit at the same time because order is + // ambiguous and choosing // one order leads to different results - Preconditions.checkArgument(setSnapshotId == null || pickSnapshotId == null, + Preconditions.checkArgument( + setSnapshotId == null || pickSnapshotId == null, "Cannot set the current the current snapshot ID and cherry-pick snapshot changes"); if (setSnapshotId != null) { @@ -453,9 +483,7 @@ private static void commitChanges(Table table, SetProperty setLocation, SetPrope Transaction transaction = table.newTransaction(); if (setLocation != null) { - transaction.updateLocation() - .setLocation(setLocation.value()) - .commit(); + transaction.updateLocation().setLocation(setLocation.value()).commit(); } if (!propertyChanges.isEmpty()) { @@ -475,8 +503,9 @@ private static boolean isPathIdentifier(Identifier ident) { private static void checkNotPathIdentifier(Identifier identifier, String method) { if (identifier instanceof PathIdentifier) { - throw new IllegalArgumentException(String.format("Cannot pass path based identifier to %s method. %s is a path.", - method, identifier)); + throw new IllegalArgumentException( + String.format( + "Cannot pass path based identifier to %s method. %s is a path.", method, identifier)); } } @@ -493,7 +522,8 @@ private Pair load(Identifier ident) { throw e; } - // if the original load didn't work, the identifier may be extended and include a snapshot selector + // if the original load didn't work, the identifier may be extended and include a snapshot + // selector TableIdentifier namespaceAsIdent = buildIdentifier(namespaceToIdentifier(ident.namespace())); Table table; try { @@ -557,10 +587,13 @@ private Pair loadFromPathIdentifier(PathIdentifier ident) { } } - Preconditions.checkArgument(asOfTimestamp == null || snapshotId == null, - "Cannot specify both snapshot-id and as-of-timestamp: %s", ident.location()); + Preconditions.checkArgument( + asOfTimestamp == null || snapshotId == null, + "Cannot specify both snapshot-id and as-of-timestamp: %s", + ident.location()); - Table table = tables.load(parsed.first() + (metadataTableName != null ? "#" + metadataTableName : "")); + Table table = + tables.load(parsed.first() + (metadataTableName != null ? 
"#" + metadataTableName : "")); if (snapshotId != null) { return Pair.of(table, snapshotId); @@ -572,16 +605,16 @@ private Pair loadFromPathIdentifier(PathIdentifier ident) { } private Identifier namespaceToIdentifier(String[] namespace) { - Preconditions.checkArgument(namespace.length > 0, - "Cannot convert empty namespace to identifier"); + Preconditions.checkArgument( + namespace.length > 0, "Cannot convert empty namespace to identifier"); String[] ns = Arrays.copyOf(namespace, namespace.length - 1); String name = namespace[ns.length]; return Identifier.of(ns, name); } private Catalog.TableBuilder newBuilder(Identifier ident, Schema schema) { - return isPathIdentifier(ident) ? - tables.buildTable(((PathIdentifier) ident).location(), schema) : - icebergCatalog.buildTable(buildIdentifier(ident), schema); + return isPathIdentifier(ident) + ? tables.buildTable(((PathIdentifier) ident).location(), schema) + : icebergCatalog.buildTable(buildIdentifier(ident), schema); } } diff --git a/spark/v3.0/spark/src/main/java/org/apache/iceberg/spark/SparkConfParser.java b/spark/v3.0/spark/src/main/java/org/apache/iceberg/spark/SparkConfParser.java index 79051b12625e..33e5ca936800 100644 --- a/spark/v3.0/spark/src/main/java/org/apache/iceberg/spark/SparkConfParser.java +++ b/spark/v3.0/spark/src/main/java/org/apache/iceberg/spark/SparkConfParser.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark; import java.util.Locale; @@ -159,7 +158,8 @@ public ThisT tableProperty(String name) { protected T parse(Function conversion, T defaultValue) { if (optionName != null) { - // use lower case comparison as DataSourceOptions.asMap() in Spark 2 returns a lower case map + // use lower case comparison as DataSourceOptions.asMap() in Spark 2 returns a lower case + // map String optionValue = options.get(optionName.toLowerCase(Locale.ROOT)); if (optionValue != null) { return conversion.apply(optionValue); diff --git a/spark/v3.0/spark/src/main/java/org/apache/iceberg/spark/SparkDataFile.java b/spark/v3.0/spark/src/main/java/org/apache/iceberg/spark/SparkDataFile.java index a6390d39c575..87e831872472 100644 --- a/spark/v3.0/spark/src/main/java/org/apache/iceberg/spark/SparkDataFile.java +++ b/spark/v3.0/spark/src/main/java/org/apache/iceberg/spark/SparkDataFile.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark; import java.nio.ByteBuffer; @@ -62,10 +61,12 @@ public SparkDataFile(Types.StructType type, StructType sparkType) { this.wrappedPartition = new SparkStructLike(type.fieldType("partition").asStructType()); Map positions = Maps.newHashMap(); - type.fields().forEach(field -> { - String fieldName = field.name(); - positions.put(fieldName, fieldPosition(fieldName, sparkType)); - }); + type.fields() + .forEach( + field -> { + String fieldName = field.name(); + positions.put(fieldName, fieldPosition(fieldName, sparkType)); + }); filePathPosition = positions.get("file_path"); fileFormatPosition = positions.get("file_format"); @@ -139,23 +140,29 @@ public Map valueCounts() { @Override public Map nullValueCounts() { - return wrapped.isNullAt(nullValueCountsPosition) ? null : wrapped.getJavaMap(nullValueCountsPosition); + return wrapped.isNullAt(nullValueCountsPosition) + ? null + : wrapped.getJavaMap(nullValueCountsPosition); } @Override public Map nanValueCounts() { - return wrapped.isNullAt(nanValueCountsPosition) ? 
null : wrapped.getJavaMap(nanValueCountsPosition); + return wrapped.isNullAt(nanValueCountsPosition) + ? null + : wrapped.getJavaMap(nanValueCountsPosition); } @Override public Map lowerBounds() { - Map lowerBounds = wrapped.isNullAt(lowerBoundsPosition) ? null : wrapped.getJavaMap(lowerBoundsPosition); + Map lowerBounds = + wrapped.isNullAt(lowerBoundsPosition) ? null : wrapped.getJavaMap(lowerBoundsPosition); return convert(lowerBoundsType, lowerBounds); } @Override public Map upperBounds() { - Map upperBounds = wrapped.isNullAt(upperBoundsPosition) ? null : wrapped.getJavaMap(upperBoundsPosition); + Map upperBounds = + wrapped.isNullAt(upperBoundsPosition) ? null : wrapped.getJavaMap(upperBoundsPosition); return convert(upperBoundsType, upperBounds); } diff --git a/spark/v3.0/spark/src/main/java/org/apache/iceberg/spark/SparkExceptionUtil.java b/spark/v3.0/spark/src/main/java/org/apache/iceberg/spark/SparkExceptionUtil.java index 2eb53baa688e..5c6fe3e0ff96 100644 --- a/spark/v3.0/spark/src/main/java/org/apache/iceberg/spark/SparkExceptionUtil.java +++ b/spark/v3.0/spark/src/main/java/org/apache/iceberg/spark/SparkExceptionUtil.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark; import com.google.errorprone.annotations.FormatMethod; @@ -29,8 +28,7 @@ public class SparkExceptionUtil { - private SparkExceptionUtil() { - } + private SparkExceptionUtil() {} /** * Converts checked exceptions to unchecked exceptions. @@ -41,8 +39,8 @@ private SparkExceptionUtil() { * @return unchecked exception. */ @FormatMethod - public static RuntimeException toUncheckedException(final Throwable cause, final String message, - final Object... args) { + public static RuntimeException toUncheckedException( + final Throwable cause, final String message, final Object... args) { // Parameters are required to be final to help @FormatMethod do static analysis if (cause instanceof RuntimeException) { return (RuntimeException) cause; diff --git a/spark/v3.0/spark/src/main/java/org/apache/iceberg/spark/SparkFilters.java b/spark/v3.0/spark/src/main/java/org/apache/iceberg/spark/SparkFilters.java index 2e23d968bde0..8ba75e754f7f 100644 --- a/spark/v3.0/spark/src/main/java/org/apache/iceberg/spark/SparkFilters.java +++ b/spark/v3.0/spark/src/main/java/org/apache/iceberg/spark/SparkFilters.java @@ -16,9 +16,22 @@ * specific language governing permissions and limitations * under the License. 
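For context on the SparkDataFile hunks above: the class wraps rows shaped like an Iceberg table's "files" metadata table as DataFile instances, pulling counts and bounds out of the Spark Row. A rough usage sketch follows; the table name, method name, and the wrap(Row) call reflect the existing class API and Iceberg docs rather than anything introduced by this diff, so treat it as an illustrative assumption.

    import org.apache.iceberg.DataFile;
    import org.apache.iceberg.Table;
    import org.apache.iceberg.spark.SparkDataFile;
    import org.apache.iceberg.types.Types;
    import org.apache.spark.sql.Dataset;
    import org.apache.spark.sql.Row;
    import org.apache.spark.sql.SparkSession;

    public class ListDataFiles {
      // Wrap each row of a table's "files" metadata table as an Iceberg DataFile.
      static void printDataFiles(SparkSession spark, Table table, String filesMetadataTable) {
        Dataset<Row> filesDF = spark.read().format("iceberg").load(filesMetadataTable);
        Types.StructType dataFileType = DataFile.getType(table.spec().partitionType());
        SparkDataFile wrapper = new SparkDataFile(dataFileType, filesDF.schema());
        for (Row row : filesDF.collectAsList()) {
          DataFile file = wrapper.wrap(row); // the wrapper is reused; call copy() to keep a file around
          System.out.println(file.path() + " -> " + file.recordCount() + " records");
        }
      }
    }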
*/ - package org.apache.iceberg.spark; +import static org.apache.iceberg.expressions.Expressions.and; +import static org.apache.iceberg.expressions.Expressions.equal; +import static org.apache.iceberg.expressions.Expressions.greaterThan; +import static org.apache.iceberg.expressions.Expressions.greaterThanOrEqual; +import static org.apache.iceberg.expressions.Expressions.in; +import static org.apache.iceberg.expressions.Expressions.isNaN; +import static org.apache.iceberg.expressions.Expressions.isNull; +import static org.apache.iceberg.expressions.Expressions.lessThan; +import static org.apache.iceberg.expressions.Expressions.lessThanOrEqual; +import static org.apache.iceberg.expressions.Expressions.not; +import static org.apache.iceberg.expressions.Expressions.notNull; +import static org.apache.iceberg.expressions.Expressions.or; +import static org.apache.iceberg.expressions.Expressions.startsWith; + import java.sql.Date; import java.sql.Timestamp; import java.time.Instant; @@ -51,48 +64,34 @@ import org.apache.spark.sql.sources.Or; import org.apache.spark.sql.sources.StringStartsWith; -import static org.apache.iceberg.expressions.Expressions.and; -import static org.apache.iceberg.expressions.Expressions.equal; -import static org.apache.iceberg.expressions.Expressions.greaterThan; -import static org.apache.iceberg.expressions.Expressions.greaterThanOrEqual; -import static org.apache.iceberg.expressions.Expressions.in; -import static org.apache.iceberg.expressions.Expressions.isNaN; -import static org.apache.iceberg.expressions.Expressions.isNull; -import static org.apache.iceberg.expressions.Expressions.lessThan; -import static org.apache.iceberg.expressions.Expressions.lessThanOrEqual; -import static org.apache.iceberg.expressions.Expressions.not; -import static org.apache.iceberg.expressions.Expressions.notNull; -import static org.apache.iceberg.expressions.Expressions.or; -import static org.apache.iceberg.expressions.Expressions.startsWith; - public class SparkFilters { - private SparkFilters() { - } - - private static final Map, Operation> FILTERS = ImmutableMap - ., Operation>builder() - .put(AlwaysTrue$.class, Operation.TRUE) - .put(AlwaysFalse$.class, Operation.FALSE) - .put(EqualTo.class, Operation.EQ) - .put(EqualNullSafe.class, Operation.EQ) - .put(GreaterThan.class, Operation.GT) - .put(GreaterThanOrEqual.class, Operation.GT_EQ) - .put(LessThan.class, Operation.LT) - .put(LessThanOrEqual.class, Operation.LT_EQ) - .put(In.class, Operation.IN) - .put(IsNull.class, Operation.IS_NULL) - .put(IsNotNull.class, Operation.NOT_NULL) - .put(And.class, Operation.AND) - .put(Or.class, Operation.OR) - .put(Not.class, Operation.NOT) - .put(StringStartsWith.class, Operation.STARTS_WITH) - .build(); + private SparkFilters() {} + + private static final Map, Operation> FILTERS = + ImmutableMap., Operation>builder() + .put(AlwaysTrue$.class, Operation.TRUE) + .put(AlwaysFalse$.class, Operation.FALSE) + .put(EqualTo.class, Operation.EQ) + .put(EqualNullSafe.class, Operation.EQ) + .put(GreaterThan.class, Operation.GT) + .put(GreaterThanOrEqual.class, Operation.GT_EQ) + .put(LessThan.class, Operation.LT) + .put(LessThanOrEqual.class, Operation.LT_EQ) + .put(In.class, Operation.IN) + .put(IsNull.class, Operation.IS_NULL) + .put(IsNotNull.class, Operation.NOT_NULL) + .put(And.class, Operation.AND) + .put(Or.class, Operation.OR) + .put(Not.class, Operation.NOT) + .put(StringStartsWith.class, Operation.STARTS_WITH) + .build(); public static Expression convert(Filter[] filters) { Expression expression = 
Expressions.alwaysTrue(); for (Filter filter : filters) { Expression converted = convert(filter); - Preconditions.checkArgument(converted != null, "Cannot convert filter to Iceberg: %s", filter); + Preconditions.checkArgument( + converted != null, "Cannot convert filter to Iceberg: %s", filter); expression = Expressions.and(expression, converted); } return expression; @@ -137,8 +136,8 @@ public static Expression convert(Filter filter) { if (filter instanceof EqualTo) { EqualTo eq = (EqualTo) filter; // comparison with null in normal equality is always null. this is probably a mistake. - Preconditions.checkNotNull(eq.value(), - "Expression is always false (eq is not null-safe): %s", filter); + Preconditions.checkNotNull( + eq.value(), "Expression is always false (eq is not null-safe): %s", filter); return handleEqual(eq.attribute(), eq.value()); } else { EqualNullSafe eq = (EqualNullSafe) filter; @@ -151,7 +150,8 @@ public static Expression convert(Filter filter) { case IN: In inFilter = (In) filter; - return in(inFilter.attribute(), + return in( + inFilter.attribute(), Stream.of(inFilter.values()) .filter(Objects::nonNull) .map(SparkFilters::convertLiteral) @@ -165,30 +165,33 @@ public static Expression convert(Filter filter) { } return null; - case AND: { - And andFilter = (And) filter; - Expression left = convert(andFilter.left()); - Expression right = convert(andFilter.right()); - if (left != null && right != null) { - return and(left, right); + case AND: + { + And andFilter = (And) filter; + Expression left = convert(andFilter.left()); + Expression right = convert(andFilter.right()); + if (left != null && right != null) { + return and(left, right); + } + return null; } - return null; - } - - case OR: { - Or orFilter = (Or) filter; - Expression left = convert(orFilter.left()); - Expression right = convert(orFilter.right()); - if (left != null && right != null) { - return or(left, right); + + case OR: + { + Or orFilter = (Or) filter; + Expression left = convert(orFilter.left()); + Expression right = convert(orFilter.right()); + if (left != null && right != null) { + return or(left, right); + } + return null; } - return null; - } - case STARTS_WITH: { - StringStartsWith stringStartsWith = (StringStartsWith) filter; - return startsWith(stringStartsWith.attribute(), stringStartsWith.value()); - } + case STARTS_WITH: + { + StringStartsWith stringStartsWith = (StringStartsWith) filter; + return startsWith(stringStartsWith.attribute(), stringStartsWith.value()); + } } } diff --git a/spark/v3.0/spark/src/main/java/org/apache/iceberg/spark/SparkFixupTimestampType.java b/spark/v3.0/spark/src/main/java/org/apache/iceberg/spark/SparkFixupTimestampType.java index d4dd53d34a97..b35213501aef 100644 --- a/spark/v3.0/spark/src/main/java/org/apache/iceberg/spark/SparkFixupTimestampType.java +++ b/spark/v3.0/spark/src/main/java/org/apache/iceberg/spark/SparkFixupTimestampType.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark; import org.apache.iceberg.Schema; @@ -27,9 +26,10 @@ /** * By default Spark type {@link org.apache.iceberg.types.Types.TimestampType} should be converted to - * {@link Types.TimestampType#withZone()} iceberg type. 
But we also can convert - * {@link org.apache.iceberg.types.Types.TimestampType} to {@link Types.TimestampType#withoutZone()} iceberg type - * by setting {@link SparkSQLProperties#USE_TIMESTAMP_WITHOUT_TIME_ZONE_IN_NEW_TABLES} to 'true' + * {@link Types.TimestampType#withZone()} iceberg type. But we also can convert {@link + * org.apache.iceberg.types.Types.TimestampType} to {@link Types.TimestampType#withoutZone()} + * iceberg type by setting {@link SparkSQLProperties#USE_TIMESTAMP_WITHOUT_TIME_ZONE_IN_NEW_TABLES} + * to 'true' */ class SparkFixupTimestampType extends FixupTypes { @@ -38,8 +38,8 @@ private SparkFixupTimestampType(Schema referenceSchema) { } static Schema fixup(Schema schema) { - return new Schema(TypeUtil.visit(schema, - new SparkFixupTimestampType(schema)).asStructType().fields()); + return new Schema( + TypeUtil.visit(schema, new SparkFixupTimestampType(schema)).asStructType().fields()); } @Override diff --git a/spark/v3.0/spark/src/main/java/org/apache/iceberg/spark/SparkFixupTypes.java b/spark/v3.0/spark/src/main/java/org/apache/iceberg/spark/SparkFixupTypes.java index 5508965af249..6c4ec39b20f1 100644 --- a/spark/v3.0/spark/src/main/java/org/apache/iceberg/spark/SparkFixupTypes.java +++ b/spark/v3.0/spark/src/main/java/org/apache/iceberg/spark/SparkFixupTypes.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark; import org.apache.iceberg.Schema; @@ -25,8 +24,8 @@ import org.apache.iceberg.types.TypeUtil; /** - * Some types, like binary and fixed, are converted to the same Spark type. Conversion back - * can produce only one, which may not be correct. + * Some types, like binary and fixed, are converted to the same Spark type. Conversion back can + * produce only one, which may not be correct. */ class SparkFixupTypes extends FixupTypes { @@ -35,8 +34,8 @@ private SparkFixupTypes(Schema referenceSchema) { } static Schema fixup(Schema schema, Schema referenceSchema) { - return new Schema(TypeUtil.visit(schema, - new SparkFixupTypes(referenceSchema)).asStructType().fields()); + return new Schema( + TypeUtil.visit(schema, new SparkFixupTypes(referenceSchema)).asStructType().fields()); } @Override diff --git a/spark/v3.0/spark/src/main/java/org/apache/iceberg/spark/SparkReadConf.java b/spark/v3.0/spark/src/main/java/org/apache/iceberg/spark/SparkReadConf.java index 462f44c4ae36..937c31e45960 100644 --- a/spark/v3.0/spark/src/main/java/org/apache/iceberg/spark/SparkReadConf.java +++ b/spark/v3.0/spark/src/main/java/org/apache/iceberg/spark/SparkReadConf.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark; import java.util.Map; @@ -31,18 +30,21 @@ /** * A class for common Iceberg configs for Spark reads. - *

<p>
- * If a config is set at multiple levels, the following order of precedence is used (top to bottom):
- * <ul>
- *   <li>Read options</li>
- *   <li>Session configuration</li>
- *   <li>Table metadata</li>
- * </ul>
- * <p>
- * The most specific value is set in read options and takes precedence over all other configs.
- * If no read option is provided, this class checks the session configuration for any overrides.
- * If no applicable value is found in the session configuration, this class uses the table metadata.
- * <p>
- * Note this class is NOT meant to be serialized and sent to executors.
+ *
+ * <p>If a config is set at multiple levels, the following order of precedence is used (top to
+ * bottom):
+ *
+ * <ul>
+ *   <li>Read options
+ *   <li>Session configuration
+ *   <li>Table metadata
+ * </ul>
+ *
+ * The most specific value is set in read options and takes precedence over all other configs. If no
+ * read option is provided, this class checks the session configuration for any overrides. If no
+ * applicable value is found in the session configuration, this class uses the table metadata.
+ *
+ * <p>
    Note this class is NOT meant to be serialized and sent to executors. */ public class SparkReadConf { @@ -64,55 +66,47 @@ public boolean localityEnabled() { if (file instanceof HadoopInputFile) { String scheme = ((HadoopInputFile) file).getFileSystem().getScheme(); boolean defaultValue = LOCALITY_WHITELIST_FS.contains(scheme); - return PropertyUtil.propertyAsBoolean( - readOptions, - SparkReadOptions.LOCALITY, - defaultValue); + return PropertyUtil.propertyAsBoolean(readOptions, SparkReadOptions.LOCALITY, defaultValue); } return false; } public Long snapshotId() { - return confParser.longConf() - .option(SparkReadOptions.SNAPSHOT_ID) - .parseOptional(); + return confParser.longConf().option(SparkReadOptions.SNAPSHOT_ID).parseOptional(); } public Long asOfTimestamp() { - return confParser.longConf() - .option(SparkReadOptions.AS_OF_TIMESTAMP) - .parseOptional(); + return confParser.longConf().option(SparkReadOptions.AS_OF_TIMESTAMP).parseOptional(); } public Long startSnapshotId() { - return confParser.longConf() - .option(SparkReadOptions.START_SNAPSHOT_ID) - .parseOptional(); + return confParser.longConf().option(SparkReadOptions.START_SNAPSHOT_ID).parseOptional(); } public Long endSnapshotId() { - return confParser.longConf() - .option(SparkReadOptions.END_SNAPSHOT_ID) - .parseOptional(); + return confParser.longConf().option(SparkReadOptions.END_SNAPSHOT_ID).parseOptional(); } public boolean streamingSkipDeleteSnapshots() { - return confParser.booleanConf() + return confParser + .booleanConf() .option(SparkReadOptions.STREAMING_SKIP_DELETE_SNAPSHOTS) .defaultValue(SparkReadOptions.STREAMING_SKIP_DELETE_SNAPSHOTS_DEFAULT) .parse(); } public boolean streamingSkipOverwriteSnapshots() { - return confParser.booleanConf() + return confParser + .booleanConf() .option(SparkReadOptions.STREAMING_SKIP_OVERWRITE_SNAPSHOTS) .defaultValue(SparkReadOptions.STREAMING_SKIP_OVERWRITE_SNAPSHOTS_DEFAULT) .parse(); } public boolean parquetVectorizationEnabled() { - return confParser.booleanConf() + return confParser + .booleanConf() .option(SparkReadOptions.VECTORIZATION_ENABLED) .sessionConf(SparkSQLProperties.VECTORIZATION_ENABLED) .tableProperty(TableProperties.PARQUET_VECTORIZATION_ENABLED) @@ -121,7 +115,8 @@ public boolean parquetVectorizationEnabled() { } public int parquetBatchSize() { - return confParser.intConf() + return confParser + .intConf() .option(SparkReadOptions.VECTORIZATION_BATCH_SIZE) .tableProperty(TableProperties.PARQUET_BATCH_SIZE) .defaultValue(TableProperties.PARQUET_BATCH_SIZE_DEFAULT) @@ -129,7 +124,8 @@ public int parquetBatchSize() { } public boolean orcVectorizationEnabled() { - return confParser.booleanConf() + return confParser + .booleanConf() .option(SparkReadOptions.VECTORIZATION_ENABLED) .sessionConf(SparkSQLProperties.VECTORIZATION_ENABLED) .tableProperty(TableProperties.ORC_VECTORIZATION_ENABLED) @@ -138,7 +134,8 @@ public boolean orcVectorizationEnabled() { } public int orcBatchSize() { - return confParser.intConf() + return confParser + .intConf() .option(SparkReadOptions.VECTORIZATION_BATCH_SIZE) .tableProperty(TableProperties.ORC_BATCH_SIZE) .defaultValue(TableProperties.ORC_BATCH_SIZE_DEFAULT) @@ -146,7 +143,8 @@ public int orcBatchSize() { } public long splitSize() { - return confParser.longConf() + return confParser + .longConf() .option(SparkReadOptions.SPLIT_SIZE) .tableProperty(TableProperties.SPLIT_SIZE) .defaultValue(TableProperties.SPLIT_SIZE_DEFAULT) @@ -154,7 +152,8 @@ public long splitSize() { } public int splitLookback() { - return 
confParser.intConf() + return confParser + .intConf() .option(SparkReadOptions.LOOKBACK) .tableProperty(TableProperties.SPLIT_LOOKBACK) .defaultValue(TableProperties.SPLIT_LOOKBACK_DEFAULT) @@ -162,7 +161,8 @@ public int splitLookback() { } public long splitOpenFileCost() { - return confParser.longConf() + return confParser + .longConf() .option(SparkReadOptions.FILE_OPEN_COST) .tableProperty(TableProperties.SPLIT_OPEN_FILE_COST) .defaultValue(TableProperties.SPLIT_OPEN_FILE_COST_DEFAULT) @@ -171,18 +171,20 @@ public long splitOpenFileCost() { /** * Enables reading a timestamp without time zone as a timestamp with time zone. - *

<p>
- * Generally, this is not safe as a timestamp without time zone is supposed to represent the wall-clock time,
- * i.e. no matter the reader/writer timezone 3PM should always be read as 3PM,
- * but a timestamp with time zone represents instant semantics, i.e. the timestamp
- * is adjusted so that the corresponding time in the reader timezone is displayed.
- * <p>
- * When set to false (default), an exception must be thrown while reading a timestamp without time zone.
+ *
+ * <p>Generally, this is not safe as a timestamp without time zone is supposed to represent the
+ * wall-clock time, i.e. no matter the reader/writer timezone 3PM should always be read as 3PM,
+ * but a timestamp with time zone represents instant semantics, i.e. the timestamp is adjusted so
+ * that the corresponding time in the reader timezone is displayed.
+ *
+ * <p>
    When set to false (default), an exception must be thrown while reading a timestamp without + * time zone. * * @return boolean indicating if reading timestamps without timezone is allowed */ public boolean handleTimestampWithoutZone() { - return confParser.booleanConf() + return confParser + .booleanConf() .option(SparkReadOptions.HANDLE_TIMESTAMP_WITHOUT_TIMEZONE) .sessionConf(SparkSQLProperties.HANDLE_TIMESTAMP_WITHOUT_TIMEZONE) .defaultValue(SparkSQLProperties.HANDLE_TIMESTAMP_WITHOUT_TIMEZONE_DEFAULT) @@ -190,7 +192,8 @@ public boolean handleTimestampWithoutZone() { } public Long streamFromTimestamp() { - return confParser.longConf() + return confParser + .longConf() .option(SparkReadOptions.STREAM_FROM_TIMESTAMP) .defaultValue(Long.MIN_VALUE) .parse(); diff --git a/spark/v3.0/spark/src/main/java/org/apache/iceberg/spark/SparkReadOptions.java b/spark/v3.0/spark/src/main/java/org/apache/iceberg/spark/SparkReadOptions.java index edcc2300344a..d13e80d40004 100644 --- a/spark/v3.0/spark/src/main/java/org/apache/iceberg/spark/SparkReadOptions.java +++ b/spark/v3.0/spark/src/main/java/org/apache/iceberg/spark/SparkReadOptions.java @@ -16,16 +16,12 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark; -/** - * Spark DF read options - */ +/** Spark DF read options */ public class SparkReadOptions { - private SparkReadOptions() { - } + private SparkReadOptions() {} // Snapshot ID of the table snapshot to read public static final String SNAPSHOT_ID = "snapshot-id"; @@ -62,11 +58,13 @@ private SparkReadOptions() { public static final boolean STREAMING_SKIP_DELETE_SNAPSHOTS_DEFAULT = false; // skip snapshots of type overwrite while reading stream out of iceberg table - public static final String STREAMING_SKIP_OVERWRITE_SNAPSHOTS = "streaming-skip-overwrite-snapshots"; + public static final String STREAMING_SKIP_OVERWRITE_SNAPSHOTS = + "streaming-skip-overwrite-snapshots"; public static final boolean STREAMING_SKIP_OVERWRITE_SNAPSHOTS_DEFAULT = false; // Controls whether to allow reading timestamps without zone info - public static final String HANDLE_TIMESTAMP_WITHOUT_TIMEZONE = "handle-timestamp-without-timezone"; + public static final String HANDLE_TIMESTAMP_WITHOUT_TIMEZONE = + "handle-timestamp-without-timezone"; // Controls whether to report locality information to Spark while allocating input partitions public static final String LOCALITY = "locality"; diff --git a/spark/v3.0/spark/src/main/java/org/apache/iceberg/spark/SparkSQLProperties.java b/spark/v3.0/spark/src/main/java/org/apache/iceberg/spark/SparkSQLProperties.java index f2dcc13bece0..fa8bd719f391 100644 --- a/spark/v3.0/spark/src/main/java/org/apache/iceberg/spark/SparkSQLProperties.java +++ b/spark/v3.0/spark/src/main/java/org/apache/iceberg/spark/SparkSQLProperties.java @@ -16,19 +16,18 @@ * specific language governing permissions and limitations * under the License. 
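To make the precedence order documented on SparkReadConf concrete, here is a minimal read-side sketch. The option keys come from SparkReadOptions and SparkSQLProperties in the hunks above; the table name, snapshot id, and split size are illustrative values and the table-property name in the comment is the usual Iceberg key, not something defined in this diff.

    import org.apache.iceberg.spark.SparkReadOptions;
    import org.apache.spark.sql.Dataset;
    import org.apache.spark.sql.Row;
    import org.apache.spark.sql.SparkSession;

    public class ReadWithOptions {
      static Dataset<Row> read(SparkSession spark) {
        // Session-level override, consulted only when no read option is set.
        spark.conf().set("spark.sql.iceberg.vectorization.enabled", "false");

        return spark.read()
            .format("iceberg")
            // Read options win over the session conf above and over table properties.
            .option(SparkReadOptions.VECTORIZATION_ENABLED, "true")
            .option(SparkReadOptions.SPLIT_SIZE, 134217728L) // overrides read.split.target-size
            .option(SparkReadOptions.SNAPSHOT_ID, 10963874102873L) // illustrative snapshot id
            .load("db.sample");
      }
    }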
*/ - package org.apache.iceberg.spark; public class SparkSQLProperties { - private SparkSQLProperties() { - } + private SparkSQLProperties() {} // Controls whether vectorized reads are enabled public static final String VECTORIZATION_ENABLED = "spark.sql.iceberg.vectorization.enabled"; // Controls whether reading/writing timestamps without timezones is allowed - public static final String HANDLE_TIMESTAMP_WITHOUT_TIMEZONE = "spark.sql.iceberg.handle-timestamp-without-timezone"; + public static final String HANDLE_TIMESTAMP_WITHOUT_TIMEZONE = + "spark.sql.iceberg.handle-timestamp-without-timezone"; public static final boolean HANDLE_TIMESTAMP_WITHOUT_TIMEZONE_DEFAULT = false; // Controls whether timestamp types for new tables should be stored with timezone info diff --git a/spark/v3.0/spark/src/main/java/org/apache/iceberg/spark/SparkSchemaUtil.java b/spark/v3.0/spark/src/main/java/org/apache/iceberg/spark/SparkSchemaUtil.java index 321050dceb74..653987e654aa 100644 --- a/spark/v3.0/spark/src/main/java/org/apache/iceberg/spark/SparkSchemaUtil.java +++ b/spark/v3.0/spark/src/main/java/org/apache/iceberg/spark/SparkSchemaUtil.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark; import java.util.Collection; @@ -40,17 +39,14 @@ import org.apache.spark.sql.types.DataType; import org.apache.spark.sql.types.StructType; -/** - * Helper methods for working with Spark/Hive metadata. - */ +/** Helper methods for working with Spark/Hive metadata. */ public class SparkSchemaUtil { - private SparkSchemaUtil() { - } + private SparkSchemaUtil() {} /** * Returns a {@link Schema} for the given table with fresh field ids. - *

<p>
- * This creates a Schema for an existing table by looking up the table's schema with Spark and
+ *
+ * <p>
    This creates a Schema for an existing table by looking up the table's schema with Spark and * converting that schema. Spark/Hive partition columns are included in the schema. * * @param spark a Spark session @@ -65,8 +61,8 @@ public static Schema schemaForTable(SparkSession spark, String name) { /** * Returns a {@link PartitionSpec} for the given table. - *

<p>
- * This creates a partition spec for an existing table by looking up the table's schema and
+ *
+ * <p>
    This creates a partition spec for an existing table by looking up the table's schema and * creating a spec with identity partitions for each partition column. * * @param spark a Spark session @@ -74,14 +70,15 @@ public static Schema schemaForTable(SparkSession spark, String name) { * @return a PartitionSpec for the table * @throws AnalysisException if thrown by the Spark catalog */ - public static PartitionSpec specForTable(SparkSession spark, String name) throws AnalysisException { + public static PartitionSpec specForTable(SparkSession spark, String name) + throws AnalysisException { List parts = Lists.newArrayList(Splitter.on('.').limit(2).split(name)); String db = parts.size() == 1 ? "default" : parts.get(0); String table = parts.get(parts.size() == 1 ? 0 : 1); - PartitionSpec spec = identitySpec( - schemaForTable(spark, name), - spark.catalog().listColumns(db, table).collectAsList()); + PartitionSpec spec = + identitySpec( + schemaForTable(spark, name), spark.catalog().listColumns(db, table).collectAsList()); return spec == null ? PartitionSpec.unpartitioned() : spec; } @@ -109,13 +106,14 @@ public static DataType convert(Type type) { /** * Convert a Spark {@link StructType struct} to a {@link Schema} with new field ids. - *

<p>
- * This conversion assigns fresh ids.
- * <p>
- * Some data types are represented as the same Spark type. These are converted to a default type.
- * <p>
- * To convert using a reference schema for field ids and ambiguous types, use
- * {@link #convert(Schema, StructType)}.
+ *
+ * <p>This conversion assigns fresh ids.
+ *
+ * <p>Some data types are represented as the same Spark type. These are converted to a default
+ * type.
+ *
+ * <p>
    To convert using a reference schema for field ids and ambiguous types, use {@link + * #convert(Schema, StructType)}. * * @param sparkType a Spark StructType * @return the equivalent Schema @@ -127,16 +125,18 @@ public static Schema convert(StructType sparkType) { /** * Convert a Spark {@link StructType struct} to a {@link Schema} with new field ids. - *

<p>
- * This conversion assigns fresh ids.
- * <p>
- * Some data types are represented as the same Spark type. These are converted to a default type.
- * <p>
- * To convert using a reference schema for field ids and ambiguous types, use
- * {@link #convert(Schema, StructType)}.
+ *
+ * <p>This conversion assigns fresh ids.
+ *
+ * <p>Some data types are represented as the same Spark type. These are converted to a default
+ * type.
+ *
+ * <p>
    To convert using a reference schema for field ids and ambiguous types, use {@link + * #convert(Schema, StructType)}. * * @param sparkType a Spark StructType - * @param useTimestampWithoutZone boolean flag indicates that timestamp should be stored without timezone + * @param useTimestampWithoutZone boolean flag indicates that timestamp should be stored without + * timezone * @return the equivalent Schema * @throws IllegalArgumentException if the type cannot be converted */ @@ -151,13 +151,14 @@ public static Schema convert(StructType sparkType, boolean useTimestampWithoutZo /** * Convert a Spark {@link DataType struct} to a {@link Type} with new field ids. - *

<p>
- * This conversion assigns fresh ids.
- * <p>
- * Some data types are represented as the same Spark type. These are converted to a default type.
- * <p>
- * To convert using a reference schema for field ids and ambiguous types, use
- * {@link #convert(Schema, StructType)}.
+ *
+ * <p>This conversion assigns fresh ids.
+ *
+ * <p>Some data types are represented as the same Spark type. These are converted to a default
+ * type.
+ *
+ * <p>
    To convert using a reference schema for field ids and ambiguous types, use {@link + * #convert(Schema, StructType)}. * * @param sparkType a Spark DataType * @return the equivalent Type @@ -169,11 +170,11 @@ public static Type convert(DataType sparkType) { /** * Convert a Spark {@link StructType struct} to a {@link Schema} based on the given schema. - *

<p>
- * This conversion does not assign new ids; it uses ids from the base schema.
- * <p>
- * Data types, field order, and nullability will match the spark type. This conversion may return
- * a schema that is not compatible with base schema.
+ *
+ * <p>This conversion does not assign new ids; it uses ids from the base schema.
+ *
+ * <p>
    Data types, field order, and nullability will match the spark type. This conversion may + * return a schema that is not compatible with base schema. * * @param baseSchema a Schema on which conversion is based * @param sparkType a Spark StructType @@ -182,7 +183,8 @@ public static Type convert(DataType sparkType) { */ public static Schema convert(Schema baseSchema, StructType sparkType) { // convert to a type with fresh ids - Types.StructType struct = SparkTypeVisitor.visit(sparkType, new SparkTypeToType(sparkType)).asStructType(); + Types.StructType struct = + SparkTypeVisitor.visit(sparkType, new SparkTypeToType(sparkType)).asStructType(); // reassign ids to match the base schema Schema schema = TypeUtil.reassignIds(new Schema(struct.fields()), baseSchema); // fix types that can't be represented in Spark (UUID and Fixed) @@ -191,8 +193,8 @@ public static Schema convert(Schema baseSchema, StructType sparkType) { /** * Prune columns from a {@link Schema} using a {@link StructType Spark type} projection. - *

<p>
- * This requires that the Spark type is a projection of the Schema. Nullability and types must
+ *
+ * <p>
    This requires that the Spark type is a projection of the Schema. Nullability and types must * match. * * @param schema a Schema @@ -201,19 +203,20 @@ public static Schema convert(Schema baseSchema, StructType sparkType) { * @throws IllegalArgumentException if the Spark type does not match the Schema */ public static Schema prune(Schema schema, StructType requestedType) { - return new Schema(TypeUtil.visit(schema, new PruneColumnsWithoutReordering(requestedType, ImmutableSet.of())) - .asNestedType() - .asStructType() - .fields()); + return new Schema( + TypeUtil.visit(schema, new PruneColumnsWithoutReordering(requestedType, ImmutableSet.of())) + .asNestedType() + .asStructType() + .fields()); } /** * Prune columns from a {@link Schema} using a {@link StructType Spark type} projection. - *

<p>
- * This requires that the Spark type is a projection of the Schema. Nullability and types must
+ *
+ * <p>This requires that the Spark type is a projection of the Schema. Nullability and types must
 * match.
- * <p>
- * The filters list of {@link Expression} is used to ensure that columns referenced by filters
+ *
+ * <p>
    The filters list of {@link Expression} is used to ensure that columns referenced by filters * are projected. * * @param schema a Schema @@ -224,19 +227,20 @@ public static Schema prune(Schema schema, StructType requestedType) { */ public static Schema prune(Schema schema, StructType requestedType, List filters) { Set filterRefs = Binder.boundReferences(schema.asStruct(), filters, true); - return new Schema(TypeUtil.visit(schema, new PruneColumnsWithoutReordering(requestedType, filterRefs)) - .asNestedType() - .asStructType() - .fields()); + return new Schema( + TypeUtil.visit(schema, new PruneColumnsWithoutReordering(requestedType, filterRefs)) + .asNestedType() + .asStructType() + .fields()); } /** * Prune columns from a {@link Schema} using a {@link StructType Spark type} projection. - *

<p>
- * This requires that the Spark type is a projection of the Schema. Nullability and types must
+ *
+ * <p>This requires that the Spark type is a projection of the Schema. Nullability and types must
 * match.
- * <p>
- * The filters list of {@link Expression} is used to ensure that columns referenced by filters
+ *
+ * <p>
    The filters list of {@link Expression} is used to ensure that columns referenced by filters * are projected. * * @param schema a Schema @@ -245,14 +249,16 @@ public static Schema prune(Schema schema, StructType requestedType, List filterRefs = Binder.boundReferences(schema.asStruct(), Collections.singletonList(filter), caseSensitive); - return new Schema(TypeUtil.visit(schema, new PruneColumnsWithoutReordering(requestedType, filterRefs)) - .asNestedType() - .asStructType() - .fields()); + return new Schema( + TypeUtil.visit(schema, new PruneColumnsWithoutReordering(requestedType, filterRefs)) + .asNestedType() + .asStructType() + .fields()); } private static PartitionSpec identitySpec(Schema schema, Collection columns) { @@ -282,7 +288,7 @@ private static PartitionSpec identitySpec(Schema schema, List partitionN /** * Estimate approximate table size based on Spark schema and total records. * - * @param tableSchema Spark schema + * @param tableSchema Spark schema * @param totalRecords total records in the table * @return approximate size based on table schema */ diff --git a/spark/v3.0/spark/src/main/java/org/apache/iceberg/spark/SparkSessionCatalog.java b/spark/v3.0/spark/src/main/java/org/apache/iceberg/spark/SparkSessionCatalog.java index c6310fcc9d0f..4f55bc5e289e 100644 --- a/spark/v3.0/spark/src/main/java/org/apache/iceberg/spark/SparkSessionCatalog.java +++ b/spark/v3.0/spark/src/main/java/org/apache/iceberg/spark/SparkSessionCatalog.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark; import java.util.Map; @@ -44,8 +43,8 @@ * * @param CatalogPlugin class to avoid casting to TableCatalog and SupportsNamespaces. */ -public class SparkSessionCatalog - extends BaseCatalog implements CatalogExtension { +public class SparkSessionCatalog extends BaseCatalog + implements CatalogExtension { private static final String[] DEFAULT_NAMESPACE = new String[] {"default"}; private String catalogName = null; @@ -58,8 +57,9 @@ public class SparkSessionCatalog /** * Build a {@link SparkCatalog} to be used for Iceberg operations. - *

<p>
- * The default implementation creates a new SparkCatalog with the session catalog's name and options.
+ *
+ * <p>
    The default implementation creates a new SparkCatalog with the session catalog's name and + * options. * * @param name catalog name * @param options catalog options @@ -87,17 +87,20 @@ public String[][] listNamespaces(String[] namespace) throws NoSuchNamespaceExcep } @Override - public Map loadNamespaceMetadata(String[] namespace) throws NoSuchNamespaceException { + public Map loadNamespaceMetadata(String[] namespace) + throws NoSuchNamespaceException { return getSessionCatalog().loadNamespaceMetadata(namespace); } @Override - public void createNamespace(String[] namespace, Map metadata) throws NamespaceAlreadyExistsException { + public void createNamespace(String[] namespace, Map metadata) + throws NamespaceAlreadyExistsException { getSessionCatalog().createNamespace(namespace, metadata); } @Override - public void alterNamespace(String[] namespace, NamespaceChange... changes) throws NoSuchNamespaceException { + public void alterNamespace(String[] namespace, NamespaceChange... changes) + throws NoSuchNamespaceException { getSessionCatalog().alterNamespace(namespace, changes); } @@ -130,8 +133,8 @@ public void invalidateTable(Identifier ident) { } @Override - public Table createTable(Identifier ident, StructType schema, Transform[] partitions, - Map properties) + public Table createTable( + Identifier ident, StructType schema, Transform[] partitions, Map properties) throws TableAlreadyExistsException, NoSuchNamespaceException { String provider = properties.get("provider"); if (useIceberg(provider)) { @@ -143,8 +146,8 @@ public Table createTable(Identifier ident, StructType schema, Transform[] partit } @Override - public StagedTable stageCreate(Identifier ident, StructType schema, Transform[] partitions, - Map properties) + public StagedTable stageCreate( + Identifier ident, StructType schema, Transform[] partitions, Map properties) throws TableAlreadyExistsException, NoSuchNamespaceException { String provider = properties.get("provider"); TableCatalog catalog; @@ -157,14 +160,15 @@ public StagedTable stageCreate(Identifier ident, StructType schema, Transform[] catalog = getSessionCatalog(); } - // create the table with the session catalog, then wrap it in a staged table that will delete to roll back + // create the table with the session catalog, then wrap it in a staged table that will delete to + // roll back Table table = catalog.createTable(ident, schema, partitions, properties); return new RollbackStagedTable(catalog, ident, table); } @Override - public StagedTable stageReplace(Identifier ident, StructType schema, Transform[] partitions, - Map properties) + public StagedTable stageReplace( + Identifier ident, StructType schema, Transform[] partitions, Map properties) throws NoSuchNamespaceException, NoSuchTableException { String provider = properties.get("provider"); TableCatalog catalog; @@ -183,7 +187,8 @@ public StagedTable stageReplace(Identifier ident, StructType schema, Transform[] } try { - // create the table with the session catalog, then wrap it in a staged table that will delete to roll back + // create the table with the session catalog, then wrap it in a staged table that will delete + // to roll back Table table = catalog.createTable(ident, schema, partitions, properties); return new RollbackStagedTable(catalog, ident, table); @@ -194,8 +199,9 @@ public StagedTable stageReplace(Identifier ident, StructType schema, Transform[] } @Override - public StagedTable stageCreateOrReplace(Identifier ident, StructType schema, Transform[] partitions, - Map properties) throws 
NoSuchNamespaceException { + public StagedTable stageCreateOrReplace( + Identifier ident, StructType schema, Transform[] partitions, Map properties) + throws NoSuchNamespaceException { String provider = properties.get("provider"); TableCatalog catalog; if (useIceberg(provider)) { @@ -211,7 +217,8 @@ public StagedTable stageCreateOrReplace(Identifier ident, StructType schema, Tra catalog.dropTable(ident); try { - // create the table with the session catalog, then wrap it in a staged table that will delete to roll back + // create the table with the session catalog, then wrap it in a staged table that will delete + // to roll back Table sessionCatalogTable = catalog.createTable(ident, schema, partitions, properties); return new RollbackStagedTable(catalog, ident, sessionCatalogTable); @@ -232,14 +239,17 @@ public Table alterTable(Identifier ident, TableChange... changes) throws NoSuchT @Override public boolean dropTable(Identifier ident) { - // no need to check table existence to determine which catalog to use. if a table doesn't exist then both are + // no need to check table existence to determine which catalog to use. if a table doesn't exist + // then both are // required to return false. return icebergCatalog.dropTable(ident) || getSessionCatalog().dropTable(ident); } @Override - public void renameTable(Identifier from, Identifier to) throws NoSuchTableException, TableAlreadyExistsException { - // rename is not supported by HadoopCatalog. to avoid UnsupportedOperationException for session catalog tables, + public void renameTable(Identifier from, Identifier to) + throws NoSuchTableException, TableAlreadyExistsException { + // rename is not supported by HadoopCatalog. to avoid UnsupportedOperationException for session + // catalog tables, // check table existence first to ensure that the table belongs to the Iceberg catalog. if (icebergCatalog.tableExists(from)) { icebergCatalog.renameTable(from, to); @@ -264,7 +274,8 @@ public final void initialize(String name, CaseInsensitiveStringMap options) { @Override @SuppressWarnings("unchecked") public void setDelegateCatalog(CatalogPlugin sparkSessionCatalog) { - if (sparkSessionCatalog instanceof TableCatalog && sparkSessionCatalog instanceof SupportsNamespaces) { + if (sparkSessionCatalog instanceof TableCatalog + && sparkSessionCatalog instanceof SupportsNamespaces) { this.sessionCatalog = (T) sparkSessionCatalog; } else { throw new IllegalArgumentException("Invalid session catalog: " + sparkSessionCatalog); @@ -291,8 +302,10 @@ private boolean useIceberg(String provider) { } private T getSessionCatalog() { - Preconditions.checkNotNull(sessionCatalog, "Delegated SessionCatalog is missing. " + - "Please make sure your are replacing Spark's default catalog, named 'spark_catalog'."); + Preconditions.checkNotNull( + sessionCatalog, + "Delegated SessionCatalog is missing. " + + "Please make sure your are replacing Spark's default catalog, named 'spark_catalog'."); return sessionCatalog; } } diff --git a/spark/v3.0/spark/src/main/java/org/apache/iceberg/spark/SparkStructLike.java b/spark/v3.0/spark/src/main/java/org/apache/iceberg/spark/SparkStructLike.java index 30509e3381dc..77cfa0f34c63 100644 --- a/spark/v3.0/spark/src/main/java/org/apache/iceberg/spark/SparkStructLike.java +++ b/spark/v3.0/spark/src/main/java/org/apache/iceberg/spark/SparkStructLike.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
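For context on the SparkSessionCatalog hunks above: the class only works when it replaces Spark's built-in session catalog, which is always named 'spark_catalog', so that non-Iceberg tables fall through to the delegated catalog. A minimal configuration sketch follows; the catalog type and the extensions setting are the commonly documented values, shown here as an assumption rather than something introduced by this diff.

    import org.apache.spark.sql.SparkSession;

    public class SessionCatalogSetup {
      static SparkSession create() {
        return SparkSession.builder()
            .appName("iceberg-session-catalog")
            // Replace the built-in session catalog; its name must stay spark_catalog.
            .config("spark.sql.catalog.spark_catalog", "org.apache.iceberg.spark.SparkSessionCatalog")
            .config("spark.sql.catalog.spark_catalog.type", "hive")
            // Optional: enables Iceberg SQL extensions such as ALTER TABLE ... WRITE ORDERED BY.
            .config(
                "spark.sql.extensions",
                "org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions")
            .getOrCreate();
      }
    }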
*/ - package org.apache.iceberg.spark; import org.apache.iceberg.StructLike; diff --git a/spark/v3.0/spark/src/main/java/org/apache/iceberg/spark/SparkTableUtil.java b/spark/v3.0/spark/src/main/java/org/apache/iceberg/spark/SparkTableUtil.java index d3e2e8efe08b..c0c1255d200f 100644 --- a/spark/v3.0/spark/src/main/java/org/apache/iceberg/spark/SparkTableUtil.java +++ b/spark/v3.0/spark/src/main/java/org/apache/iceberg/spark/SparkTableUtil.java @@ -16,9 +16,10 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark; +import static org.apache.spark.sql.functions.col; + import java.io.IOException; import java.io.Serializable; import java.net.URI; @@ -94,8 +95,6 @@ import scala.collection.Seq; import scala.runtime.AbstractPartialFunction; -import static org.apache.spark.sql.functions.col; - /** * Java version of the original SparkTableUtil.scala * https://github.com/apache/iceberg/blob/apache-iceberg-0.8.0-incubating/spark/src/main/scala/org/apache/iceberg/spark/SparkTableUtil.scala @@ -109,20 +108,19 @@ public class SparkTableUtil { private static final PathFilter HIDDEN_PATH_FILTER = p -> !p.getName().startsWith("_") && !p.getName().startsWith("."); - private static final String duplicateFileMessage = "Cannot complete import because data files " + - "to be imported already exist within the target table: %s. " + - "This is disabled by default as Iceberg is not designed for mulitple references to the same file" + - " within the same table. If you are sure, you may set 'check_duplicate_files' to false to force the import."; - + private static final String duplicateFileMessage = + "Cannot complete import because data files " + + "to be imported already exist within the target table: %s. " + + "This is disabled by default as Iceberg is not designed for mulitple references to the same file" + + " within the same table. If you are sure, you may set 'check_duplicate_files' to false to force the import."; - private SparkTableUtil() { - } + private SparkTableUtil() {} /** * Returns a DataFrame with a row for each partition in the table. * - * The DataFrame has 3 columns, partition key (a=1/b=2), partition location, and format - * (avro or parquet). + *

    The DataFrame has 3 columns, partition key (a=1/b=2), partition location, and format (avro + * or parquet). * * @param spark a Spark session * @param table a table name and (optional) database @@ -130,7 +128,9 @@ private SparkTableUtil() { */ public static Dataset partitionDF(SparkSession spark, String table) { List partitions = getPartitions(spark, table); - return spark.createDataFrame(partitions, SparkPartition.class).toDF("partition", "uri", "format"); + return spark + .createDataFrame(partitions, SparkPartition.class) + .toDF("partition", "uri", "format"); } /** @@ -141,9 +141,12 @@ public static Dataset partitionDF(SparkSession spark, String table) { * @param expression The expression whose matching partitions are returned. * @return a DataFrame of the table partitions. */ - public static Dataset partitionDFByFilter(SparkSession spark, String table, String expression) { + public static Dataset partitionDFByFilter( + SparkSession spark, String table, String expression) { List partitions = getPartitionsByFilter(spark, table, expression); - return spark.createDataFrame(partitions, SparkPartition.class).toDF("partition", "uri", "format"); + return spark + .createDataFrame(partitions, SparkPartition.class) + .toDF("partition", "uri", "format"); } /** @@ -158,7 +161,8 @@ public static List getPartitions(SparkSession spark, String tabl TableIdentifier tableIdent = spark.sessionState().sqlParser().parseTableIdentifier(table); return getPartitions(spark, tableIdent, null); } catch (ParseException e) { - throw SparkExceptionUtil.toUncheckedException(e, "Unable to parse table identifier: %s", table); + throw SparkExceptionUtil.toUncheckedException( + e, "Unable to parse table identifier: %s", table); } } @@ -170,30 +174,33 @@ public static List getPartitions(SparkSession spark, String tabl * @param partitionFilter partition filter, or null if no filter * @return all table's partitions */ - public static List getPartitions(SparkSession spark, TableIdentifier tableIdent, - Map partitionFilter) { + public static List getPartitions( + SparkSession spark, TableIdentifier tableIdent, Map partitionFilter) { try { SessionCatalog catalog = spark.sessionState().catalog(); CatalogTable catalogTable = catalog.getTableMetadata(tableIdent); Option> scalaPartitionFilter; if (partitionFilter != null && !partitionFilter.isEmpty()) { - scalaPartitionFilter = Option.apply(JavaConverters.mapAsScalaMapConverter(partitionFilter).asScala() - .toMap(Predef.conforms())); + scalaPartitionFilter = + Option.apply( + JavaConverters.mapAsScalaMapConverter(partitionFilter) + .asScala() + .toMap(Predef.conforms())); } else { scalaPartitionFilter = Option.empty(); } - Seq partitions = catalog.listPartitions(tableIdent, scalaPartitionFilter); - return JavaConverters - .seqAsJavaListConverter(partitions) - .asJava() - .stream() + Seq partitions = + catalog.listPartitions(tableIdent, scalaPartitionFilter); + return JavaConverters.seqAsJavaListConverter(partitions).asJava().stream() .map(catalogPartition -> toSparkPartition(catalogPartition, catalogTable)) .collect(Collectors.toList()); } catch (NoSuchDatabaseException e) { - throw SparkExceptionUtil.toUncheckedException(e, "Unknown table: %s. Database not found in catalog.", tableIdent); + throw SparkExceptionUtil.toUncheckedException( + e, "Unknown table: %s. Database not found in catalog.", tableIdent); } catch (NoSuchTableException e) { - throw SparkExceptionUtil.toUncheckedException(e, "Unknown table: %s. 
Table not found in catalog.", tableIdent); + throw SparkExceptionUtil.toUncheckedException( + e, "Unknown table: %s. Table not found in catalog.", tableIdent); } } @@ -205,19 +212,22 @@ public static List getPartitions(SparkSession spark, TableIdenti * @param predicate a predicate on partition columns * @return matching table's partitions */ - public static List getPartitionsByFilter(SparkSession spark, String table, String predicate) { + public static List getPartitionsByFilter( + SparkSession spark, String table, String predicate) { TableIdentifier tableIdent; try { tableIdent = spark.sessionState().sqlParser().parseTableIdentifier(table); } catch (ParseException e) { - throw SparkExceptionUtil.toUncheckedException(e, "Unable to parse the table identifier: %s", table); + throw SparkExceptionUtil.toUncheckedException( + e, "Unable to parse the table identifier: %s", table); } Expression unresolvedPredicateExpr; try { unresolvedPredicateExpr = spark.sessionState().sqlParser().parseExpression(predicate); } catch (ParseException e) { - throw SparkExceptionUtil.toUncheckedException(e, "Unable to parse the predicate expression: %s", predicate); + throw SparkExceptionUtil.toUncheckedException( + e, "Unable to parse the predicate expression: %s", predicate); } Expression resolvedPredicateExpr = resolveAttrs(spark, table, unresolvedPredicateExpr); @@ -232,8 +242,8 @@ public static List getPartitionsByFilter(SparkSession spark, Str * @param predicateExpr a predicate expression on partition columns * @return matching table's partitions */ - public static List getPartitionsByFilter(SparkSession spark, TableIdentifier tableIdent, - Expression predicateExpr) { + public static List getPartitionsByFilter( + SparkSession spark, TableIdentifier tableIdent, Expression predicateExpr) { try { SessionCatalog catalog = spark.sessionState().catalog(); CatalogTable catalogTable = catalog.getTableMetadata(tableIdent); @@ -244,111 +254,131 @@ public static List getPartitionsByFilter(SparkSession spark, Tab } else { resolvedPredicateExpr = predicateExpr; } - Seq predicates = JavaConverters - .collectionAsScalaIterableConverter(ImmutableList.of(resolvedPredicateExpr)) - .asScala().toSeq(); + Seq predicates = + JavaConverters.collectionAsScalaIterableConverter(ImmutableList.of(resolvedPredicateExpr)) + .asScala() + .toSeq(); - Seq partitions = catalog.listPartitionsByFilter(tableIdent, predicates); + Seq partitions = + catalog.listPartitionsByFilter(tableIdent, predicates); - return JavaConverters - .seqAsJavaListConverter(partitions) - .asJava() - .stream() + return JavaConverters.seqAsJavaListConverter(partitions).asJava().stream() .map(catalogPartition -> toSparkPartition(catalogPartition, catalogTable)) .collect(Collectors.toList()); } catch (NoSuchDatabaseException e) { - throw SparkExceptionUtil.toUncheckedException(e, "Unknown table: %s. Database not found in catalog.", tableIdent); + throw SparkExceptionUtil.toUncheckedException( + e, "Unknown table: %s. Database not found in catalog.", tableIdent); } catch (NoSuchTableException e) { - throw SparkExceptionUtil.toUncheckedException(e, "Unknown table: %s. Table not found in catalog.", tableIdent); + throw SparkExceptionUtil.toUncheckedException( + e, "Unknown table: %s. Table not found in catalog.", tableIdent); } } /** * Returns the data files in a partition by listing the partition location. * - * For Parquet and ORC partitions, this will read metrics from the file footer. For Avro partitions, - * metrics are set to null. + *

    For Parquet and ORC partitions, this will read metrics from the file footer. For Avro + * partitions, metrics are set to null. * * @param partition a partition * @param conf a serializable Hadoop conf * @param metricsConfig a metrics conf * @return a List of DataFile - * @deprecated use {@link TableMigrationUtil#listPartition(Map, String, String, PartitionSpec, Configuration, - * MetricsConfig, NameMapping)} + * @deprecated use {@link TableMigrationUtil#listPartition(Map, String, String, PartitionSpec, + * Configuration, MetricsConfig, NameMapping)} */ @Deprecated - public static List listPartition(SparkPartition partition, PartitionSpec spec, - SerializableConfiguration conf, MetricsConfig metricsConfig) { + public static List listPartition( + SparkPartition partition, + PartitionSpec spec, + SerializableConfiguration conf, + MetricsConfig metricsConfig) { return listPartition(partition, spec, conf, metricsConfig, null); } /** * Returns the data files in a partition by listing the partition location. * - * For Parquet and ORC partitions, this will read metrics from the file footer. For Avro partitions, - * metrics are set to null. + *

    For Parquet and ORC partitions, this will read metrics from the file footer. For Avro + * partitions, metrics are set to null. * * @param partition a partition * @param conf a serializable Hadoop conf * @param metricsConfig a metrics conf * @param mapping a name mapping * @return a List of DataFile - * @deprecated use {@link TableMigrationUtil#listPartition(Map, String, String, PartitionSpec, Configuration, - * MetricsConfig, NameMapping)} + * @deprecated use {@link TableMigrationUtil#listPartition(Map, String, String, PartitionSpec, + * Configuration, MetricsConfig, NameMapping)} */ @Deprecated - public static List listPartition(SparkPartition partition, PartitionSpec spec, - SerializableConfiguration conf, MetricsConfig metricsConfig, - NameMapping mapping) { - return TableMigrationUtil.listPartition(partition.values, partition.uri, partition.format, spec, conf.get(), - metricsConfig, mapping); + public static List listPartition( + SparkPartition partition, + PartitionSpec spec, + SerializableConfiguration conf, + MetricsConfig metricsConfig, + NameMapping mapping) { + return TableMigrationUtil.listPartition( + partition.values, + partition.uri, + partition.format, + spec, + conf.get(), + metricsConfig, + mapping); } - - private static SparkPartition toSparkPartition(CatalogTablePartition partition, CatalogTable table) { + private static SparkPartition toSparkPartition( + CatalogTablePartition partition, CatalogTable table) { Option locationUri = partition.storage().locationUri(); Option serde = partition.storage().serde(); Preconditions.checkArgument(locationUri.nonEmpty(), "Partition URI should be defined"); - Preconditions.checkArgument(serde.nonEmpty() || table.provider().nonEmpty(), - "Partition format should be defined"); + Preconditions.checkArgument( + serde.nonEmpty() || table.provider().nonEmpty(), "Partition format should be defined"); String uri = Util.uriToString(locationUri.get()); String format = serde.nonEmpty() ? 
serde.get() : table.provider().get(); - Map partitionSpec = JavaConverters.mapAsJavaMapConverter(partition.spec()).asJava(); + Map partitionSpec = + JavaConverters.mapAsJavaMapConverter(partition.spec()).asJava(); return new SparkPartition(partitionSpec, uri, format); } private static Expression resolveAttrs(SparkSession spark, String table, Expression expr) { Function2 resolver = spark.sessionState().analyzer().resolver(); LogicalPlan plan = spark.table(table).queryExecution().analyzed(); - return expr.transform(new AbstractPartialFunction() { - @Override - public Expression apply(Expression attr) { - UnresolvedAttribute unresolvedAttribute = (UnresolvedAttribute) attr; - Option namedExpressionOption = plan.resolve(unresolvedAttribute.nameParts(), resolver); - if (namedExpressionOption.isDefined()) { - return (Expression) namedExpressionOption.get(); - } else { - throw new IllegalArgumentException( - String.format("Could not resolve %s using columns: %s", attr, plan.output())); - } - } - - @Override - public boolean isDefinedAt(Expression attr) { - return attr instanceof UnresolvedAttribute; - } - }); + return expr.transform( + new AbstractPartialFunction() { + @Override + public Expression apply(Expression attr) { + UnresolvedAttribute unresolvedAttribute = (UnresolvedAttribute) attr; + Option namedExpressionOption = + plan.resolve(unresolvedAttribute.nameParts(), resolver); + if (namedExpressionOption.isDefined()) { + return (Expression) namedExpressionOption.get(); + } else { + throw new IllegalArgumentException( + String.format("Could not resolve %s using columns: %s", attr, plan.output())); + } + } + + @Override + public boolean isDefinedAt(Expression attr) { + return attr instanceof UnresolvedAttribute; + } + }); } - private static Iterator buildManifest(SerializableConfiguration conf, PartitionSpec spec, - String basePath, Iterator> fileTuples) { + private static Iterator buildManifest( + SerializableConfiguration conf, + PartitionSpec spec, + String basePath, + Iterator> fileTuples) { if (fileTuples.hasNext()) { FileIO io = new HadoopFileIO(conf.get()); TaskContext ctx = TaskContext.get(); - String suffix = String.format("stage-%d-task-%d-manifest", ctx.stageId(), ctx.taskAttemptId()); + String suffix = + String.format("stage-%d-task-%d-manifest", ctx.stageId(), ctx.taskAttemptId()); Path location = new Path(basePath, suffix); String outputPath = FileFormat.AVRO.addExtension(location.toString()); OutputFile outputFile = io.newOutputFile(outputPath); @@ -357,7 +387,8 @@ private static Iterator buildManifest(SerializableConfiguration co try (ManifestWriter writerRef = writer) { fileTuples.forEachRemaining(fileTuple -> writerRef.add(fileTuple._2)); } catch (IOException e) { - throw SparkExceptionUtil.toUncheckedException(e, "Unable to close the manifest writer: %s", outputPath); + throw SparkExceptionUtil.toUncheckedException( + e, "Unable to close the manifest writer: %s", outputPath); } ManifestFile manifestFile = writer.toManifestFile(); @@ -370,42 +401,54 @@ private static Iterator buildManifest(SerializableConfiguration co /** * Import files from an existing Spark table to an Iceberg table. * - * The import uses the Spark session to get table metadata. It assumes no - * operation is going on the original and target table and thus is not - * thread-safe. + *

    The import uses the Spark session to get table metadata. It assumes no operation is going on + * the original and target table and thus is not thread-safe. * * @param spark a Spark session * @param sourceTableIdent an identifier of the source Spark table * @param targetTable an Iceberg table where to import the data * @param stagingDir a staging directory to store temporary manifest files - * @param partitionFilter only import partitions whose values match those in the map, can be partially defined + * @param partitionFilter only import partitions whose values match those in the map, can be + * partially defined * @param checkDuplicateFiles if true, throw exception if import results in a duplicate data file */ - public static void importSparkTable(SparkSession spark, TableIdentifier sourceTableIdent, Table targetTable, - String stagingDir, Map partitionFilter, - boolean checkDuplicateFiles) { + public static void importSparkTable( + SparkSession spark, + TableIdentifier sourceTableIdent, + Table targetTable, + String stagingDir, + Map partitionFilter, + boolean checkDuplicateFiles) { SessionCatalog catalog = spark.sessionState().catalog(); - String db = sourceTableIdent.database().nonEmpty() ? - sourceTableIdent.database().get() : - catalog.getCurrentDatabase(); - TableIdentifier sourceTableIdentWithDB = new TableIdentifier(sourceTableIdent.table(), Some.apply(db)); + String db = + sourceTableIdent.database().nonEmpty() + ? sourceTableIdent.database().get() + : catalog.getCurrentDatabase(); + TableIdentifier sourceTableIdentWithDB = + new TableIdentifier(sourceTableIdent.table(), Some.apply(db)); if (!catalog.tableExists(sourceTableIdentWithDB)) { - throw new org.apache.iceberg.exceptions.NoSuchTableException("Table %s does not exist", sourceTableIdentWithDB); + throw new org.apache.iceberg.exceptions.NoSuchTableException( + "Table %s does not exist", sourceTableIdentWithDB); } try { - PartitionSpec spec = SparkSchemaUtil.specForTable(spark, sourceTableIdentWithDB.unquotedString()); + PartitionSpec spec = + SparkSchemaUtil.specForTable(spark, sourceTableIdentWithDB.unquotedString()); if (Objects.equal(spec, PartitionSpec.unpartitioned())) { - importUnpartitionedSparkTable(spark, sourceTableIdentWithDB, targetTable, checkDuplicateFiles); + importUnpartitionedSparkTable( + spark, sourceTableIdentWithDB, targetTable, checkDuplicateFiles); } else { - List sourceTablePartitions = getPartitions(spark, sourceTableIdent, - partitionFilter); - Preconditions.checkArgument(!sourceTablePartitions.isEmpty(), - "Cannot find any partitions in table %s", sourceTableIdent); - importSparkPartitions(spark, sourceTablePartitions, targetTable, spec, stagingDir, checkDuplicateFiles); + List sourceTablePartitions = + getPartitions(spark, sourceTableIdent, partitionFilter); + Preconditions.checkArgument( + !sourceTablePartitions.isEmpty(), + "Cannot find any partitions in table %s", + sourceTableIdent); + importSparkPartitions( + spark, sourceTablePartitions, targetTable, spec, stagingDir, checkDuplicateFiles); } } catch (AnalysisException e) { throw SparkExceptionUtil.toUncheckedException( @@ -416,9 +459,8 @@ public static void importSparkTable(SparkSession spark, TableIdentifier sourceTa /** * Import files from an existing Spark table to an Iceberg table. * - * The import uses the Spark session to get table metadata. It assumes no - * operation is going on the original and target table and thus is not - * thread-safe. + *
    The import uses the Spark session to get table metadata. It assumes no operation is going on + * the original and target table and thus is not thread-safe. * * @param spark a Spark session * @param sourceTableIdent an identifier of the source Spark table @@ -426,33 +468,49 @@ public static void importSparkTable(SparkSession spark, TableIdentifier sourceTa * @param stagingDir a staging directory to store temporary manifest files * @param checkDuplicateFiles if true, throw exception if import results in a duplicate data file */ - public static void importSparkTable(SparkSession spark, TableIdentifier sourceTableIdent, Table targetTable, - String stagingDir, boolean checkDuplicateFiles) { - importSparkTable(spark, sourceTableIdent, targetTable, stagingDir, Collections.emptyMap(), checkDuplicateFiles); + public static void importSparkTable( + SparkSession spark, + TableIdentifier sourceTableIdent, + Table targetTable, + String stagingDir, + boolean checkDuplicateFiles) { + importSparkTable( + spark, + sourceTableIdent, + targetTable, + stagingDir, + Collections.emptyMap(), + checkDuplicateFiles); } /** * Import files from an existing Spark table to an Iceberg table. * - * The import uses the Spark session to get table metadata. It assumes no - * operation is going on the original and target table and thus is not - * thread-safe. + *
    The import uses the Spark session to get table metadata. It assumes no operation is going on + * the original and target table and thus is not thread-safe. + * * @param spark a Spark session * @param sourceTableIdent an identifier of the source Spark table * @param targetTable an Iceberg table where to import the data * @param stagingDir a staging directory to store temporary manifest files */ - public static void importSparkTable(SparkSession spark, TableIdentifier sourceTableIdent, Table targetTable, - String stagingDir) { - importSparkTable(spark, sourceTableIdent, targetTable, stagingDir, Collections.emptyMap(), false); + public static void importSparkTable( + SparkSession spark, TableIdentifier sourceTableIdent, Table targetTable, String stagingDir) { + importSparkTable( + spark, sourceTableIdent, targetTable, stagingDir, Collections.emptyMap(), false); } - private static void importUnpartitionedSparkTable(SparkSession spark, TableIdentifier sourceTableIdent, - Table targetTable, boolean checkDuplicateFiles) { + private static void importUnpartitionedSparkTable( + SparkSession spark, + TableIdentifier sourceTableIdent, + Table targetTable, + boolean checkDuplicateFiles) { try { CatalogTable sourceTable = spark.sessionState().catalog().getTableMetadata(sourceTableIdent); Option format = - sourceTable.storage().serde().nonEmpty() ? sourceTable.storage().serde() : sourceTable.provider(); + sourceTable.storage().serde().nonEmpty() + ? sourceTable.storage().serde() + : sourceTable.provider(); Preconditions.checkArgument(format.nonEmpty(), "Could not determine table format"); Map partition = Collections.emptyMap(); @@ -460,20 +518,34 @@ private static void importUnpartitionedSparkTable(SparkSession spark, TableIdent Configuration conf = spark.sessionState().newHadoopConf(); MetricsConfig metricsConfig = MetricsConfig.forTable(targetTable); String nameMappingString = targetTable.properties().get(TableProperties.DEFAULT_NAME_MAPPING); - NameMapping nameMapping = nameMappingString != null ? NameMappingParser.fromJson(nameMappingString) : null; - - List files = TableMigrationUtil.listPartition( - partition, Util.uriToString(sourceTable.location()), format.get(), spec, conf, metricsConfig, nameMapping); + NameMapping nameMapping = + nameMappingString != null ? 
NameMappingParser.fromJson(nameMappingString) : null; + + List files = + TableMigrationUtil.listPartition( + partition, + Util.uriToString(sourceTable.location()), + format.get(), + spec, + conf, + metricsConfig, + nameMapping); if (checkDuplicateFiles) { - Dataset importedFiles = spark.createDataset( - Lists.transform(files, f -> f.path().toString()), Encoders.STRING()).toDF("file_path"); - Dataset existingFiles = loadMetadataTable(spark, targetTable, MetadataTableType.ENTRIES); - Column joinCond = existingFiles.col("data_file.file_path").equalTo(importedFiles.col("file_path")); - Dataset duplicates = importedFiles.join(existingFiles, joinCond) - .select("file_path").as(Encoders.STRING()); - Preconditions.checkState(duplicates.isEmpty(), - String.format(duplicateFileMessage, Joiner.on(",").join((String[]) duplicates.take(10)))); + Dataset importedFiles = + spark + .createDataset(Lists.transform(files, f -> f.path().toString()), Encoders.STRING()) + .toDF("file_path"); + Dataset existingFiles = + loadMetadataTable(spark, targetTable, MetadataTableType.ENTRIES); + Column joinCond = + existingFiles.col("data_file.file_path").equalTo(importedFiles.col("file_path")); + Dataset duplicates = + importedFiles.join(existingFiles, joinCond).select("file_path").as(Encoders.STRING()); + Preconditions.checkState( + duplicates.isEmpty(), + String.format( + duplicateFileMessage, Joiner.on(",").join((String[]) duplicates.take(10)))); } AppendFiles append = targetTable.newAppend(); @@ -498,57 +570,74 @@ private static void importUnpartitionedSparkTable(SparkSession spark, TableIdent * @param stagingDir a staging directory to store temporary manifest files * @param checkDuplicateFiles if true, throw exception if import results in a duplicate data file */ - public static void importSparkPartitions(SparkSession spark, List partitions, Table targetTable, - PartitionSpec spec, String stagingDir, boolean checkDuplicateFiles) { + public static void importSparkPartitions( + SparkSession spark, + List partitions, + Table targetTable, + PartitionSpec spec, + String stagingDir, + boolean checkDuplicateFiles) { Configuration conf = spark.sessionState().newHadoopConf(); SerializableConfiguration serializableConf = new SerializableConfiguration(conf); - int parallelism = Math.min(partitions.size(), spark.sessionState().conf().parallelPartitionDiscoveryParallelism()); + int parallelism = + Math.min( + partitions.size(), spark.sessionState().conf().parallelPartitionDiscoveryParallelism()); int numShufflePartitions = spark.sessionState().conf().numShufflePartitions(); MetricsConfig metricsConfig = MetricsConfig.fromProperties(targetTable.properties()); String nameMappingString = targetTable.properties().get(TableProperties.DEFAULT_NAME_MAPPING); - NameMapping nameMapping = nameMappingString != null ? NameMappingParser.fromJson(nameMappingString) : null; + NameMapping nameMapping = + nameMappingString != null ? 
NameMappingParser.fromJson(nameMappingString) : null; JavaSparkContext sparkContext = JavaSparkContext.fromSparkContext(spark.sparkContext()); JavaRDD partitionRDD = sparkContext.parallelize(partitions, parallelism); - Dataset partitionDS = spark.createDataset( - partitionRDD.rdd(), - Encoders.javaSerialization(SparkPartition.class)); + Dataset partitionDS = + spark.createDataset(partitionRDD.rdd(), Encoders.javaSerialization(SparkPartition.class)); - Dataset filesToImport = partitionDS - .flatMap((FlatMapFunction) sparkPartition -> - listPartition(sparkPartition, spec, serializableConf, metricsConfig, nameMapping).iterator(), + Dataset filesToImport = + partitionDS.flatMap( + (FlatMapFunction) + sparkPartition -> + listPartition( + sparkPartition, spec, serializableConf, metricsConfig, nameMapping) + .iterator(), Encoders.javaSerialization(DataFile.class)); if (checkDuplicateFiles) { - Dataset importedFiles = filesToImport - .map((MapFunction) f -> f.path().toString(), Encoders.STRING()) - .toDF("file_path"); + Dataset importedFiles = + filesToImport + .map((MapFunction) f -> f.path().toString(), Encoders.STRING()) + .toDF("file_path"); Dataset existingFiles = loadMetadataTable(spark, targetTable, MetadataTableType.ENTRIES); - Column joinCond = existingFiles.col("data_file.file_path").equalTo(importedFiles.col("file_path")); - Dataset duplicates = importedFiles.join(existingFiles, joinCond) - .select("file_path").as(Encoders.STRING()); - Preconditions.checkState(duplicates.isEmpty(), + Column joinCond = + existingFiles.col("data_file.file_path").equalTo(importedFiles.col("file_path")); + Dataset duplicates = + importedFiles.join(existingFiles, joinCond).select("file_path").as(Encoders.STRING()); + Preconditions.checkState( + duplicates.isEmpty(), String.format(duplicateFileMessage, Joiner.on(",").join((String[]) duplicates.take(10)))); } - List manifests = filesToImport - .repartition(numShufflePartitions) - .map((MapFunction>) file -> - Tuple2.apply(file.path().toString(), file), - Encoders.tuple(Encoders.STRING(), Encoders.javaSerialization(DataFile.class))) - .orderBy(col("_1")) - .mapPartitions( - (MapPartitionsFunction, ManifestFile>) fileTuple -> - buildManifest(serializableConf, spec, stagingDir, fileTuple), - Encoders.javaSerialization(ManifestFile.class)) - .collectAsList(); + List manifests = + filesToImport + .repartition(numShufflePartitions) + .map( + (MapFunction>) + file -> Tuple2.apply(file.path().toString(), file), + Encoders.tuple(Encoders.STRING(), Encoders.javaSerialization(DataFile.class))) + .orderBy(col("_1")) + .mapPartitions( + (MapPartitionsFunction, ManifestFile>) + fileTuple -> buildManifest(serializableConf, spec, stagingDir, fileTuple), + Encoders.javaSerialization(ManifestFile.class)) + .collectAsList(); try { - boolean snapshotIdInheritanceEnabled = PropertyUtil.propertyAsBoolean( - targetTable.properties(), - TableProperties.SNAPSHOT_ID_INHERITANCE_ENABLED, - TableProperties.SNAPSHOT_ID_INHERITANCE_ENABLED_DEFAULT); + boolean snapshotIdInheritanceEnabled = + PropertyUtil.propertyAsBoolean( + targetTable.properties(), + TableProperties.SNAPSHOT_ID_INHERITANCE_ENABLED, + TableProperties.SNAPSHOT_ID_INHERITANCE_ENABLED_DEFAULT); AppendFiles append = targetTable.newAppend(); manifests.forEach(append::appendManifest); @@ -573,13 +662,17 @@ public static void importSparkPartitions(SparkSession spark, List partitions, Table targetTable, - PartitionSpec spec, String stagingDir) { + public static void importSparkPartitions( + SparkSession spark, + List partitions, 
+ Table targetTable, + PartitionSpec spec, + String stagingDir) { importSparkPartitions(spark, partitions, targetTable, spec, stagingDir, false); } - public static List filterPartitions(List partitions, - Map partitionFilter) { + public static List filterPartitions( + List partitions, Map partitionFilter) { if (partitionFilter.isEmpty()) { return partitions; } else { @@ -597,17 +690,25 @@ private static void deleteManifests(FileIO io, List manifests) { } // Attempt to use Spark3 Catalog resolution if available on the path - private static final DynMethods.UnboundMethod LOAD_METADATA_TABLE = DynMethods.builder("loadMetadataTable") - .hiddenImpl("org.apache.iceberg.spark.Spark3Util", SparkSession.class, Table.class, MetadataTableType.class) - .orNoop() - .build(); - - public static Dataset loadCatalogMetadataTable(SparkSession spark, Table table, MetadataTableType type) { - Preconditions.checkArgument(!LOAD_METADATA_TABLE.isNoop(), "Cannot find Spark3Util class but Spark3 is in use"); + private static final DynMethods.UnboundMethod LOAD_METADATA_TABLE = + DynMethods.builder("loadMetadataTable") + .hiddenImpl( + "org.apache.iceberg.spark.Spark3Util", + SparkSession.class, + Table.class, + MetadataTableType.class) + .orNoop() + .build(); + + public static Dataset loadCatalogMetadataTable( + SparkSession spark, Table table, MetadataTableType type) { + Preconditions.checkArgument( + !LOAD_METADATA_TABLE.isNoop(), "Cannot find Spark3Util class but Spark3 is in use"); return LOAD_METADATA_TABLE.asStatic().invoke(spark, table, type); } - public static Dataset loadMetadataTable(SparkSession spark, Table table, MetadataTableType type) { + public static Dataset loadMetadataTable( + SparkSession spark, Table table, MetadataTableType type) { if (spark.version().startsWith("3")) { // construct the metadata table instance directly Dataset catalogMetadataTable = loadCatalogMetadataTable(spark, table, type); @@ -633,14 +734,12 @@ public static Dataset loadMetadataTable(SparkSession spark, Table table, Me // Try loading by name as a Hive table without Catalog return dataFrameReader.load(tableName.replaceFirst("hive\\.", "") + "." + type); } else { - throw new IllegalArgumentException(String.format( - "Cannot find the metadata table for %s of type %s", tableName, type)); + throw new IllegalArgumentException( + String.format("Cannot find the metadata table for %s of type %s", tableName, type)); } } - /** - * Class representing a table partition. - */ + /** Class representing a table partition. */ public static class SparkPartition implements Serializable { private final Map values; private final String uri; @@ -682,9 +781,9 @@ public boolean equals(Object o) { return false; } SparkPartition that = (SparkPartition) o; - return Objects.equal(values, that.values) && - Objects.equal(uri, that.uri) && - Objects.equal(format, that.format); + return Objects.equal(values, that.values) + && Objects.equal(uri, that.uri) + && Objects.equal(format, that.format); } @Override diff --git a/spark/v3.0/spark/src/main/java/org/apache/iceberg/spark/SparkTypeToType.java b/spark/v3.0/spark/src/main/java/org/apache/iceberg/spark/SparkTypeToType.java index f0b8b2a9762b..17499736fbeb 100644 --- a/spark/v3.0/spark/src/main/java/org/apache/iceberg/spark/SparkTypeToType.java +++ b/spark/v3.0/spark/src/main/java/org/apache/iceberg/spark/SparkTypeToType.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
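Note (not part of this diff): a hedged sketch of the partition-level entry point above. importSparkPartitions registers already-written partition directories directly, bypassing the session-catalog lookup that importSparkTable performs. The enclosing class and nested SparkPartition type are assumed to be org.apache.iceberg.spark.SparkTableUtil and SparkTableUtil.SparkPartition; paths and partition values are placeholders.

// Hedged sketch, not part of this diff.
import java.util.Collections;
import org.apache.iceberg.Table;
import org.apache.iceberg.hadoop.HadoopTables;
import org.apache.iceberg.spark.SparkTableUtil;
import org.apache.iceberg.spark.SparkTableUtil.SparkPartition;
import org.apache.spark.sql.SparkSession;

public class ImportSparkPartitionsExample {
  public static void main(String[] args) {
    SparkSession spark = SparkSession.builder().appName("iceberg-import-partitions").getOrCreate();

    Table target =
        new HadoopTables(spark.sessionState().newHadoopConf())
            .load("hdfs://nn:8020/warehouse/db/events_iceberg");

    // One partition of the source layout: values, location and format, mirroring toSparkPartition().
    SparkPartition partition =
        new SparkPartition(
            Collections.singletonMap("dt", "2021-01-01"),
            "hdfs://nn:8020/warehouse/db/events/dt=2021-01-01",
            "parquet");

    // Files are listed per partition, manifests are staged under the staging dir, and one append is committed.
    SparkTableUtil.importSparkPartitions(
        spark,
        Collections.singletonList(partition),
        target,
        target.spec(),
        "/tmp/iceberg-import-staging");
  }
}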
*/ - package org.apache.iceberg.spark; import java.util.List; @@ -70,7 +69,7 @@ public Type struct(StructType struct, List types) { List newFields = Lists.newArrayListWithExpectedSize(fields.length); boolean isRoot = root == struct; for (int i = 0; i < fields.length; i += 1) { - StructField field = fields[i]; + StructField field = fields[i]; Type type = types.get(i); int id; @@ -122,10 +121,9 @@ public Type atomic(DataType atomic) { if (atomic instanceof BooleanType) { return Types.BooleanType.get(); - } else if ( - atomic instanceof IntegerType || - atomic instanceof ShortType || - atomic instanceof ByteType) { + } else if (atomic instanceof IntegerType + || atomic instanceof ShortType + || atomic instanceof ByteType) { return Types.IntegerType.get(); } else if (atomic instanceof LongType) { @@ -137,10 +135,9 @@ public Type atomic(DataType atomic) { } else if (atomic instanceof DoubleType) { return Types.DoubleType.get(); - } else if ( - atomic instanceof StringType || - atomic instanceof CharType || - atomic instanceof VarcharType) { + } else if (atomic instanceof StringType + || atomic instanceof CharType + || atomic instanceof VarcharType) { return Types.StringType.get(); } else if (atomic instanceof DateType) { @@ -151,13 +148,11 @@ public Type atomic(DataType atomic) { } else if (atomic instanceof DecimalType) { return Types.DecimalType.of( - ((DecimalType) atomic).precision(), - ((DecimalType) atomic).scale()); + ((DecimalType) atomic).precision(), ((DecimalType) atomic).scale()); } else if (atomic instanceof BinaryType) { return Types.BinaryType.get(); } - throw new UnsupportedOperationException( - "Not a supported type: " + atomic.catalogString()); + throw new UnsupportedOperationException("Not a supported type: " + atomic.catalogString()); } } diff --git a/spark/v3.0/spark/src/main/java/org/apache/iceberg/spark/SparkTypeVisitor.java b/spark/v3.0/spark/src/main/java/org/apache/iceberg/spark/SparkTypeVisitor.java index 83b31940711e..1ef694263fa4 100644 --- a/spark/v3.0/spark/src/main/java/org/apache/iceberg/spark/SparkTypeVisitor.java +++ b/spark/v3.0/spark/src/main/java/org/apache/iceberg/spark/SparkTypeVisitor.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
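Note (not part of this diff): the visitor above backs schema conversion between Spark and Iceberg types. As a hedged illustration, SparkSchemaUtil (referenced earlier in this diff) is assumed to expose a convert(StructType) helper built on this visitor; short/byte/int all collapse to Iceberg's integer type and string/char/varchar to its string type.

// Hedged sketch, not part of this diff.
import org.apache.iceberg.Schema;
import org.apache.iceberg.spark.SparkSchemaUtil;
import org.apache.spark.sql.types.DataTypes;
import org.apache.spark.sql.types.StructType;

public class SchemaConversionExample {
  public static void main(String[] args) {
    StructType sparkSchema =
        new StructType()
            .add("id", DataTypes.ShortType)      // -> Iceberg IntegerType
            .add("name", DataTypes.StringType);  // -> Iceberg StringType

    Schema icebergSchema = SparkSchemaUtil.convert(sparkSchema);
    System.out.println(icebergSchema);
  }
}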
*/ - package org.apache.iceberg.spark; import java.util.List; @@ -35,26 +34,22 @@ static T visit(DataType type, SparkTypeVisitor visitor) { List fieldResults = Lists.newArrayListWithExpectedSize(fields.length); for (StructField field : fields) { - fieldResults.add(visitor.field( - field, - visit(field.dataType(), visitor))); + fieldResults.add(visitor.field(field, visit(field.dataType(), visitor))); } return visitor.struct((StructType) type, fieldResults); } else if (type instanceof MapType) { - return visitor.map((MapType) type, + return visitor.map( + (MapType) type, visit(((MapType) type).keyType(), visitor), visit(((MapType) type).valueType(), visitor)); } else if (type instanceof ArrayType) { - return visitor.array( - (ArrayType) type, - visit(((ArrayType) type).elementType(), visitor)); + return visitor.array((ArrayType) type, visit(((ArrayType) type).elementType(), visitor)); } else if (type instanceof UserDefinedType) { - throw new UnsupportedOperationException( - "User-defined types are not supported"); + throw new UnsupportedOperationException("User-defined types are not supported"); } else { return visitor.atomic(type); diff --git a/spark/v3.0/spark/src/main/java/org/apache/iceberg/spark/SparkUtil.java b/spark/v3.0/spark/src/main/java/org/apache/iceberg/spark/SparkUtil.java index 06f74d4fda06..2cdec2b0629c 100644 --- a/spark/v3.0/spark/src/main/java/org/apache/iceberg/spark/SparkUtil.java +++ b/spark/v3.0/spark/src/main/java/org/apache/iceberg/spark/SparkUtil.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark; import java.util.List; @@ -42,26 +41,33 @@ public class SparkUtil { - public static final String TIMESTAMP_WITHOUT_TIMEZONE_ERROR = String.format("Cannot handle timestamp without" + - " timezone fields in Spark. Spark does not natively support this type but if you would like to handle all" + - " timestamps as timestamp with timezone set '%s' to true. This will not change the underlying values stored" + - " but will change their displayed values in Spark. For more information please see" + - " https://docs.databricks.com/spark/latest/dataframes-datasets/dates-timestamps.html#ansi-sql-and" + - "-spark-sql-timestamps", SparkSQLProperties.HANDLE_TIMESTAMP_WITHOUT_TIMEZONE); + public static final String TIMESTAMP_WITHOUT_TIMEZONE_ERROR = + String.format( + "Cannot handle timestamp without" + + " timezone fields in Spark. Spark does not natively support this type but if you would like to handle all" + + " timestamps as timestamp with timezone set '%s' to true. This will not change the underlying values stored" + + " but will change their displayed values in Spark. For more information please see" + + " https://docs.databricks.com/spark/latest/dataframes-datasets/dates-timestamps.html#ansi-sql-and" + + "-spark-sql-timestamps", + SparkSQLProperties.HANDLE_TIMESTAMP_WITHOUT_TIMEZONE); private static final String SPARK_CATALOG_CONF_PREFIX = "spark.sql.catalog"; - // Format string used as the prefix for spark configuration keys to override hadoop configuration values - // for Iceberg tables from a given catalog. These keys can be specified as `spark.sql.catalog.$catalogName.hadoop.*`, - // similar to using `spark.hadoop.*` to override hadoop configurations globally for a given spark session. 
- private static final String SPARK_CATALOG_HADOOP_CONF_OVERRIDE_FMT_STR = SPARK_CATALOG_CONF_PREFIX + ".%s.hadoop."; + // Format string used as the prefix for spark configuration keys to override hadoop configuration + // values + // for Iceberg tables from a given catalog. These keys can be specified as + // `spark.sql.catalog.$catalogName.hadoop.*`, + // similar to using `spark.hadoop.*` to override hadoop configurations globally for a given spark + // session. + private static final String SPARK_CATALOG_HADOOP_CONF_OVERRIDE_FMT_STR = + SPARK_CATALOG_CONF_PREFIX + ".%s.hadoop."; - private SparkUtil() { - } + private SparkUtil() {} public static FileIO serializableFileIO(Table table) { if (table.io() instanceof HadoopConfigurable) { // we need to use Spark's SerializableConfiguration to avoid issues with Kryo serialization - ((HadoopConfigurable) table.io()).serializeConfWith(conf -> new SerializableConfiguration(conf)::value); + ((HadoopConfigurable) table.io()) + .serializeConfWith(conf -> new SerializableConfiguration(conf)::value); } return table.io(); @@ -75,11 +81,12 @@ public static FileIO serializableFileIO(Table table) { */ public static void validatePartitionTransforms(PartitionSpec spec) { if (spec.fields().stream().anyMatch(field -> field.transform() instanceof UnknownTransform)) { - String unsupported = spec.fields().stream() - .map(PartitionField::transform) - .filter(transform -> transform instanceof UnknownTransform) - .map(Transform::toString) - .collect(Collectors.joining(", ")); + String unsupported = + spec.fields().stream() + .map(PartitionField::transform) + .filter(transform -> transform instanceof UnknownTransform) + .map(Transform::toString) + .collect(Collectors.joining(", ")); throw new UnsupportedOperationException( String.format("Cannot write using unsupported transforms: %s", unsupported)); @@ -87,18 +94,20 @@ public static void validatePartitionTransforms(PartitionSpec spec) { } /** - * A modified version of Spark's LookupCatalog.CatalogAndIdentifier.unapply - * Attempts to find the catalog and identifier a multipart identifier represents + * A modified version of Spark's LookupCatalog.CatalogAndIdentifier.unapply Attempts to find the + * catalog and identifier a multipart identifier represents + * * @param nameParts Multipart identifier representing a table * @return The CatalogPlugin and Identifier for the table */ - public static Pair catalogAndIdentifier(List nameParts, - Function catalogProvider, - BiFunction identiferProvider, - C currentCatalog, - String[] currentNamespace) { - Preconditions.checkArgument(!nameParts.isEmpty(), - "Cannot determine catalog and identifier from empty name"); + public static Pair catalogAndIdentifier( + List nameParts, + Function catalogProvider, + BiFunction identiferProvider, + C currentCatalog, + String[] currentNamespace) { + Preconditions.checkArgument( + !nameParts.isEmpty(), "Cannot determine catalog and identifier from empty name"); int lastElementIndex = nameParts.size() - 1; String name = nameParts.get(lastElementIndex); @@ -110,7 +119,7 @@ public static Pair catalogAndIdentifier(List nameParts, C catalog = catalogProvider.apply(nameParts.get(0)); if (catalog == null) { // The first element was not a valid catalog, treat it like part of the namespace - String[] namespace = nameParts.subList(0, lastElementIndex).toArray(new String[0]); + String[] namespace = nameParts.subList(0, lastElementIndex).toArray(new String[0]); return Pair.of(currentCatalog, identiferProvider.apply(namespace, name)); } else { // Assume 
the first element is a valid catalog @@ -122,6 +131,7 @@ public static Pair catalogAndIdentifier(List nameParts, /** * Responsible for checking if the table schema has a timestamp without timezone column + * * @param schema table schema to check if it contains a timestamp without timezone column * @return boolean indicating if the schema passed in has a timestamp field without a timezone */ @@ -131,15 +141,17 @@ public static boolean hasTimestampWithoutZone(Schema schema) { /** * Checks whether timestamp types for new tables should be stored with timezone info. - *
    - * The default value is false and all timestamp fields are stored as {@link Types.TimestampType#withZone()}. - * If enabled, all timestamp fields in new tables will be stored as {@link Types.TimestampType#withoutZone()}. + * + *
    The default value is false and all timestamp fields are stored as {@link + * Types.TimestampType#withZone()}. If enabled, all timestamp fields in new tables will be stored + * as {@link Types.TimestampType#withoutZone()}. * * @param sessionConf a Spark runtime config * @return true if timestamp types for new tables should be stored with timezone info */ public static boolean useTimestampWithoutZoneInNewTables(RuntimeConfig sessionConf) { - String sessionConfValue = sessionConf.get(SparkSQLProperties.USE_TIMESTAMP_WITHOUT_TIME_ZONE_IN_NEW_TABLES, null); + String sessionConfValue = + sessionConf.get(SparkSQLProperties.USE_TIMESTAMP_WITHOUT_TIME_ZONE_IN_NEW_TABLES, null); if (sessionConfValue != null) { return Boolean.parseBoolean(sessionConfValue); } @@ -147,32 +159,40 @@ public static boolean useTimestampWithoutZoneInNewTables(RuntimeConfig sessionCo } /** - * Pulls any Catalog specific overrides for the Hadoop conf from the current SparkSession, which can be - * set via `spark.sql.catalog.$catalogName.hadoop.*` + * Pulls any Catalog specific overrides for the Hadoop conf from the current SparkSession, which + * can be set via `spark.sql.catalog.$catalogName.hadoop.*` * - * Mirrors the override of hadoop configurations for a given spark session using `spark.hadoop.*`. + *
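Note (not part of this diff): a hedged sketch of the session-level switch read by useTimestampWithoutZoneInNewTables. The property constant is taken from SparkSQLProperties as referenced above, assuming that class and field are publicly accessible.

// Hedged sketch, not part of this diff.
import org.apache.iceberg.spark.SparkSQLProperties;
import org.apache.iceberg.spark.SparkUtil;
import org.apache.spark.sql.SparkSession;

public class TimestampWithoutZoneConfigExample {
  public static void main(String[] args) {
    SparkSession spark = SparkSession.builder().appName("tz-config").getOrCreate();

    // Opt in: new tables store timestamp fields as Types.TimestampType.withoutZone().
    spark.conf().set(SparkSQLProperties.USE_TIMESTAMP_WITHOUT_TIME_ZONE_IN_NEW_TABLES, "true");

    boolean withoutZone = SparkUtil.useTimestampWithoutZoneInNewTables(spark.conf());
    System.out.println("store new timestamps without zone: " + withoutZone);
  }
}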
    Mirrors the override of hadoop configurations for a given spark session using + * `spark.hadoop.*`. * - * The SparkCatalog allows for hadoop configurations to be overridden per catalog, by setting + *
    The SparkCatalog allows for hadoop configurations to be overridden per catalog, by setting * them on the SQLConf, where the following will add the property "fs.default.name" with value - * "hdfs://hanksnamenode:8020" to the catalog's hadoop configuration. - * SparkSession.builder() - * .config(s"spark.sql.catalog.$catalogName.hadoop.fs.default.name", "hdfs://hanksnamenode:8020") - * .getOrCreate() + * "hdfs://hanksnamenode:8020" to the catalog's hadoop configuration. SparkSession.builder() + * .config(s"spark.sql.catalog.$catalogName.hadoop.fs.default.name", "hdfs://hanksnamenode:8020") + * .getOrCreate() + * * @param spark The current Spark session * @param catalogName Name of the catalog to find overrides for. - * @return the Hadoop Configuration that should be used for this catalog, with catalog specific overrides applied. + * @return the Hadoop Configuration that should be used for this catalog, with catalog specific + * overrides applied. */ public static Configuration hadoopConfCatalogOverrides(SparkSession spark, String catalogName) { // Find keys for the catalog intended to be hadoop configurations final String hadoopConfCatalogPrefix = hadoopConfPrefixForCatalog(catalogName); final Configuration conf = spark.sessionState().newHadoopConf(); - spark.sqlContext().conf().settings().forEach((k, v) -> { - // These checks are copied from `spark.sessionState().newHadoopConfWithOptions()`, which we - // avoid using to not have to convert back and forth between scala / java map types. - if (v != null && k != null && k.startsWith(hadoopConfCatalogPrefix)) { - conf.set(k.substring(hadoopConfCatalogPrefix.length()), v); - } - }); + spark + .sqlContext() + .conf() + .settings() + .forEach( + (k, v) -> { + // These checks are copied from `spark.sessionState().newHadoopConfWithOptions()`, + // which we + // avoid using to not have to convert back and forth between scala / java map types. + if (v != null && k != null && k.startsWith(hadoopConfCatalogPrefix)) { + conf.set(k.substring(hadoopConfCatalogPrefix.length()), v); + } + }); return conf; } diff --git a/spark/v3.0/spark/src/main/java/org/apache/iceberg/spark/SparkValueConverter.java b/spark/v3.0/spark/src/main/java/org/apache/iceberg/spark/SparkValueConverter.java index b3e6b2f48887..5a5381099c76 100644 --- a/spark/v3.0/spark/src/main/java/org/apache/iceberg/spark/SparkValueConverter.java +++ b/spark/v3.0/spark/src/main/java/org/apache/iceberg/spark/SparkValueConverter.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark; import java.nio.ByteBuffer; @@ -34,13 +33,10 @@ import org.apache.spark.sql.Row; import org.apache.spark.sql.catalyst.util.DateTimeUtils; -/** - * A utility class that converts Spark values to Iceberg's internal representation. - */ +/** A utility class that converts Spark values to Iceberg's internal representation. 
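Note (not part of this diff): a hedged sketch of the per-catalog Hadoop override described above, reusing the fs.default.name example from the javadoc; the catalog name my_catalog is a placeholder.

// Hedged sketch, not part of this diff.
import org.apache.hadoop.conf.Configuration;
import org.apache.iceberg.spark.SparkUtil;
import org.apache.spark.sql.SparkSession;

public class CatalogHadoopConfExample {
  public static void main(String[] args) {
    SparkSession spark =
        SparkSession.builder()
            .appName("catalog-conf")
            .config("spark.sql.catalog.my_catalog.hadoop.fs.default.name", "hdfs://hanksnamenode:8020")
            .getOrCreate();

    // Returns the session Hadoop conf with the catalog-scoped keys applied on top.
    Configuration conf = SparkUtil.hadoopConfCatalogOverrides(spark, "my_catalog");
    System.out.println(conf.get("fs.default.name"));  // hdfs://hanksnamenode:8020
  }
}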
*/ public class SparkValueConverter { - private SparkValueConverter() { - } + private SparkValueConverter() {} public static Record convert(Schema schema, Row row) { return convert(schema.asStruct(), row); diff --git a/spark/v3.0/spark/src/main/java/org/apache/iceberg/spark/SparkWriteConf.java b/spark/v3.0/spark/src/main/java/org/apache/iceberg/spark/SparkWriteConf.java index 756f4197b736..08b3fbee7590 100644 --- a/spark/v3.0/spark/src/main/java/org/apache/iceberg/spark/SparkWriteConf.java +++ b/spark/v3.0/spark/src/main/java/org/apache/iceberg/spark/SparkWriteConf.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark; import java.util.Locale; @@ -31,18 +30,21 @@ /** * A class for common Iceberg configs for Spark writes. - *
    - * If a config is set at multiple levels, the following order of precedence is used (top to bottom): + * + *
    If a config is set at multiple levels, the following order of precedence is used (top to + * bottom): + * *
- * <ol>
- *   <li>Write options</li>
- *   <li>Session configuration</li>
- *   <li>Table metadata</li>
- * </ol>
+ * <ol>
+ *   <li>Write options
+ *   <li>Session configuration
+ *   <li>Table metadata
+ * </ol>
    - * The most specific value is set in write options and takes precedence over all other configs. - * If no write option is provided, this class checks the session configuration for any overrides. - * If no applicable value is found in the session configuration, this class uses the table metadata. - *
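Note (not part of this diff): a hedged sketch of that precedence from the write side. An explicit write option overrides the table's own write.format.default property (and, for options that support it, any session-level override); the catalog and table names are placeholders.

// Hedged sketch, not part of this diff.
import org.apache.iceberg.spark.SparkWriteOptions;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SparkSession;

public class WriteConfPrecedenceExample {
  public static void main(String[] args) {
    SparkSession spark = SparkSession.builder().appName("write-conf").getOrCreate();
    Dataset<Row> df = spark.sql("SELECT 1 AS id, 'a' AS data");

    df.write()
        .format("iceberg")
        // Highest precedence: a per-write option, here forcing ORC regardless of the
        // table's write.format.default property.
        .option(SparkWriteOptions.WRITE_FORMAT, "orc")
        .mode("append")
        .save("db.events_iceberg");
  }
}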
    - * Note this class is NOT meant to be serialized and sent to executors. + * + * The most specific value is set in write options and takes precedence over all other configs. If + * no write option is provided, this class checks the session configuration for any overrides. If no + * applicable value is found in the session configuration, this class uses the table metadata. + * + *
    Note this class is NOT meant to be serialized and sent to executors. */ public class SparkWriteConf { @@ -57,7 +59,8 @@ public SparkWriteConf(SparkSession spark, Table table, Map write } public boolean checkNullability() { - return confParser.booleanConf() + return confParser + .booleanConf() .option(SparkWriteOptions.CHECK_NULLABILITY) .sessionConf(SparkSQLProperties.CHECK_NULLABILITY) .defaultValue(SparkSQLProperties.CHECK_NULLABILITY_DEFAULT) @@ -65,7 +68,8 @@ public boolean checkNullability() { } public boolean checkOrdering() { - return confParser.booleanConf() + return confParser + .booleanConf() .option(SparkWriteOptions.CHECK_ORDERING) .sessionConf(SparkSQLProperties.CHECK_ORDERING) .defaultValue(SparkSQLProperties.CHECK_ORDERING_DEFAULT) @@ -74,18 +78,20 @@ public boolean checkOrdering() { /** * Enables writing a timestamp with time zone as a timestamp without time zone. - *
    - * Generally, this is not safe as a timestamp without time zone is supposed to represent the wall-clock time, - * i.e. no matter the reader/writer timezone 3PM should always be read as 3PM, - * but a timestamp with time zone represents instant semantics, i.e. the timestamp - * is adjusted so that the corresponding time in the reader timezone is displayed. - *
    - * When set to false (default), an exception must be thrown if the table contains a timestamp without time zone. + * + *
    Generally, this is not safe as a timestamp without time zone is supposed to represent the + * wall-clock time, i.e. no matter the reader/writer timezone 3PM should always be read as 3PM, + * but a timestamp with time zone represents instant semantics, i.e. the timestamp is adjusted so + * that the corresponding time in the reader timezone is displayed. + * + *
    When set to false (default), an exception must be thrown if the table contains a timestamp + * without time zone. * * @return boolean indicating if writing timestamps without timezone is allowed */ public boolean handleTimestampWithoutZone() { - return confParser.booleanConf() + return confParser + .booleanConf() .option(SparkWriteOptions.HANDLE_TIMESTAMP_WITHOUT_TIMEZONE) .sessionConf(SparkSQLProperties.HANDLE_TIMESTAMP_WITHOUT_TIMEZONE) .defaultValue(SparkSQLProperties.HANDLE_TIMESTAMP_WITHOUT_TIMEZONE_DEFAULT) @@ -102,16 +108,19 @@ public String wapId() { } public FileFormat dataFileFormat() { - String valueAsString = confParser.stringConf() - .option(SparkWriteOptions.WRITE_FORMAT) - .tableProperty(TableProperties.DEFAULT_FILE_FORMAT) - .defaultValue(TableProperties.DEFAULT_FILE_FORMAT_DEFAULT) - .parse(); + String valueAsString = + confParser + .stringConf() + .option(SparkWriteOptions.WRITE_FORMAT) + .tableProperty(TableProperties.DEFAULT_FILE_FORMAT) + .defaultValue(TableProperties.DEFAULT_FILE_FORMAT_DEFAULT) + .parse(); return FileFormat.valueOf(valueAsString.toUpperCase(Locale.ENGLISH)); } public long targetDataFileSize() { - return confParser.longConf() + return confParser + .longConf() .option(SparkWriteOptions.TARGET_FILE_SIZE_BYTES) .tableProperty(TableProperties.WRITE_TARGET_FILE_SIZE_BYTES) .defaultValue(TableProperties.WRITE_TARGET_FILE_SIZE_BYTES_DEFAULT) @@ -119,7 +128,8 @@ public long targetDataFileSize() { } public boolean fanoutWriterEnabled() { - return confParser.booleanConf() + return confParser + .booleanConf() .option(SparkWriteOptions.FANOUT_ENABLED) .tableProperty(TableProperties.SPARK_WRITE_PARTITIONED_FANOUT_ENABLED) .defaultValue(TableProperties.SPARK_WRITE_PARTITIONED_FANOUT_ENABLED_DEFAULT) @@ -129,11 +139,13 @@ public boolean fanoutWriterEnabled() { public Map extraSnapshotMetadata() { Map extraSnapshotMetadata = Maps.newHashMap(); - writeOptions.forEach((key, value) -> { - if (key.startsWith(SnapshotSummary.EXTRA_METADATA_PREFIX)) { - extraSnapshotMetadata.put(key.substring(SnapshotSummary.EXTRA_METADATA_PREFIX.length()), value); - } - }); + writeOptions.forEach( + (key, value) -> { + if (key.startsWith(SnapshotSummary.EXTRA_METADATA_PREFIX)) { + extraSnapshotMetadata.put( + key.substring(SnapshotSummary.EXTRA_METADATA_PREFIX.length()), value); + } + }); return extraSnapshotMetadata; } diff --git a/spark/v3.0/spark/src/main/java/org/apache/iceberg/spark/SparkWriteOptions.java b/spark/v3.0/spark/src/main/java/org/apache/iceberg/spark/SparkWriteOptions.java index 38574d364b20..0ba435ae7429 100644 --- a/spark/v3.0/spark/src/main/java/org/apache/iceberg/spark/SparkWriteOptions.java +++ b/spark/v3.0/spark/src/main/java/org/apache/iceberg/spark/SparkWriteOptions.java @@ -16,16 +16,12 @@ * specific language governing permissions and limitations * under the License. 
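Note (not part of this diff): a hedged sketch of opting in for a single write using the handle-timestamp-without-timezone option defined below; without it, writes against a table containing timestamp-without-zone columns fail with TIMESTAMP_WITHOUT_TIMEZONE_ERROR. The table name is a placeholder.

// Hedged sketch, not part of this diff.
import org.apache.iceberg.spark.SparkWriteOptions;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SparkSession;

public class HandleTimestampWithoutZoneExample {
  public static void main(String[] args) {
    SparkSession spark = SparkSession.builder().appName("ntz-write").getOrCreate();
    Dataset<Row> df = spark.sql("SELECT TIMESTAMP '2021-01-01 00:00:00' AS ts");

    df.write()
        .format("iceberg")
        // Allow this write even though the target table has timestamp-without-zone columns.
        .option(SparkWriteOptions.HANDLE_TIMESTAMP_WITHOUT_TIMEZONE, "true")
        .mode("append")
        .save("db.events_ntz");
  }
}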
*/ - package org.apache.iceberg.spark; -/** - * Spark DF write options - */ +/** Spark DF write options */ public class SparkWriteOptions { - private SparkWriteOptions() { - } + private SparkWriteOptions() {} // Fileformat for write operations(default: Table write.format.default ) public static final String WRITE_FORMAT = "write-format"; @@ -52,5 +48,6 @@ private SparkWriteOptions() { public static final String REWRITTEN_FILE_SCAN_TASK_SET_ID = "rewritten-file-scan-task-set-id"; // Controls whether to allow writing timestamps without zone info - public static final String HANDLE_TIMESTAMP_WITHOUT_TIMEZONE = "handle-timestamp-without-timezone"; + public static final String HANDLE_TIMESTAMP_WITHOUT_TIMEZONE = + "handle-timestamp-without-timezone"; } diff --git a/spark/v3.0/spark/src/main/java/org/apache/iceberg/spark/TypeToSparkType.java b/spark/v3.0/spark/src/main/java/org/apache/iceberg/spark/TypeToSparkType.java index 6a8be60eb078..1e4b0f2f4e3d 100644 --- a/spark/v3.0/spark/src/main/java/org/apache/iceberg/spark/TypeToSparkType.java +++ b/spark/v3.0/spark/src/main/java/org/apache/iceberg/spark/TypeToSparkType.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark; import java.util.List; @@ -43,8 +42,7 @@ import org.apache.spark.sql.types.TimestampType$; class TypeToSparkType extends TypeUtil.SchemaVisitor { - TypeToSparkType() { - } + TypeToSparkType() {} @Override public DataType schema(Schema schema, DataType structType) { @@ -59,8 +57,8 @@ public DataType struct(Types.StructType struct, List fieldResults) { for (int i = 0; i < fields.size(); i += 1) { Types.NestedField field = fields.get(i); DataType type = fieldResults.get(i); - StructField sparkField = StructField.apply( - field.name(), type, field.isOptional(), Metadata.empty()); + StructField sparkField = + StructField.apply(field.name(), type, field.isOptional(), Metadata.empty()); if (field.doc() != null) { sparkField = sparkField.withComment(field.doc()); } @@ -101,8 +99,7 @@ public DataType primitive(Type.PrimitiveType primitive) { case DATE: return DateType$.MODULE$; case TIME: - throw new UnsupportedOperationException( - "Spark does not support time fields"); + throw new UnsupportedOperationException("Spark does not support time fields"); case TIMESTAMP: return TimestampType$.MODULE$; case STRING: diff --git a/spark/v3.0/spark/src/main/java/org/apache/iceberg/spark/actions/BaseDeleteOrphanFilesSparkAction.java b/spark/v3.0/spark/src/main/java/org/apache/iceberg/spark/actions/BaseDeleteOrphanFilesSparkAction.java index e316dfb81c11..a79f075ef442 100644 --- a/spark/v3.0/spark/src/main/java/org/apache/iceberg/spark/actions/BaseDeleteOrphanFilesSparkAction.java +++ b/spark/v3.0/spark/src/main/java/org/apache/iceberg/spark/actions/BaseDeleteOrphanFilesSparkAction.java @@ -16,9 +16,11 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.spark.actions; +import static org.apache.iceberg.TableProperties.GC_ENABLED; +import static org.apache.iceberg.TableProperties.GC_ENABLED_DEFAULT; + import java.io.File; import java.io.IOException; import java.util.Iterator; @@ -57,35 +59,37 @@ import org.slf4j.Logger; import org.slf4j.LoggerFactory; -import static org.apache.iceberg.TableProperties.GC_ENABLED; -import static org.apache.iceberg.TableProperties.GC_ENABLED_DEFAULT; - /** * An action that removes orphan metadata and data files by listing a given location and comparing * the actual files in that location with data and metadata files referenced by all valid snapshots. * The location must be accessible for listing via the Hadoop {@link FileSystem}. - *
    - * By default, this action cleans up the table location returned by {@link Table#location()} and - * removes unreachable files that are older than 3 days using {@link Table#io()}. The behavior can be modified - * by passing a custom location to {@link #location} and a custom timestamp to {@link #olderThan(long)}. - * For example, someone might point this action to the data folder to clean up only orphan data files. - * In addition, there is a way to configure an alternative delete method via {@link #deleteWith(Consumer)}. - *
    - * Note: It is dangerous to call this action with a short retention interval as it might corrupt - * the state of the table if another operation is writing at the same time. + * + *
    By default, this action cleans up the table location returned by {@link Table#location()} and + * removes unreachable files that are older than 3 days using {@link Table#io()}. The behavior can + * be modified by passing a custom location to {@link #location} and a custom timestamp to {@link + * #olderThan(long)}. For example, someone might point this action to the data folder to clean up + * only orphan data files. In addition, there is a way to configure an alternative delete method via + * {@link #deleteWith(Consumer)}. + * + *
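Note (not part of this diff): a hedged sketch of driving this action through the actions API. The SparkActions entry point from this package is assumed; the 3-day cutoff mirrors the default described above, and the table location is a placeholder.

// Hedged sketch, not part of this diff.
import java.util.concurrent.TimeUnit;
import org.apache.iceberg.Table;
import org.apache.iceberg.actions.DeleteOrphanFiles;
import org.apache.iceberg.hadoop.HadoopTables;
import org.apache.iceberg.spark.actions.SparkActions;
import org.apache.spark.sql.SparkSession;

public class DeleteOrphanFilesExample {
  public static void main(String[] args) {
    SparkSession spark = SparkSession.builder().appName("remove-orphans").getOrCreate();
    Table table =
        new HadoopTables(spark.sessionState().newHadoopConf())
            .load("hdfs://nn:8020/warehouse/db/events_iceberg");

    DeleteOrphanFiles.Result result =
        SparkActions.get(spark)
            .deleteOrphanFiles(table)
            .olderThan(System.currentTimeMillis() - TimeUnit.DAYS.toMillis(3))
            .execute();

    result.orphanFileLocations().forEach(f -> System.out.println("removed: " + f));
  }
}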
    Note: It is dangerous to call this action with a short retention interval as it might + * corrupt the state of the table if another operation is writing at the same time. */ public class BaseDeleteOrphanFilesSparkAction - extends BaseSparkAction implements DeleteOrphanFiles { + extends BaseSparkAction + implements DeleteOrphanFiles { private static final Logger LOG = LoggerFactory.getLogger(BaseDeleteOrphanFilesSparkAction.class); - private static final UserDefinedFunction filenameUDF = functions.udf((String path) -> { - int lastIndex = path.lastIndexOf(File.separator); - if (lastIndex == -1) { - return path; - } else { - return path.substring(lastIndex + 1); - } - }, DataTypes.StringType); + private static final UserDefinedFunction filenameUDF = + functions.udf( + (String path) -> { + int lastIndex = path.lastIndexOf(File.separator); + if (lastIndex == -1) { + return path; + } else { + return path.substring(lastIndex + 1); + } + }, + DataTypes.StringType); private static final ExecutorService DEFAULT_DELETE_EXECUTOR_SERVICE = null; @@ -95,12 +99,13 @@ public class BaseDeleteOrphanFilesSparkAction private String location = null; private long olderThanTimestamp = System.currentTimeMillis() - TimeUnit.DAYS.toMillis(3); - private Consumer deleteFunc = new Consumer() { - @Override - public void accept(String file) { - table.io().deleteFile(file); - } - }; + private Consumer deleteFunc = + new Consumer() { + @Override + public void accept(String file) { + table.io().deleteFile(file); + } + }; private ExecutorService deleteExecutorService = DEFAULT_DELETE_EXECUTOR_SERVICE; @@ -108,7 +113,8 @@ public BaseDeleteOrphanFilesSparkAction(SparkSession spark, Table table) { super(spark); this.hadoopConf = new SerializableConfiguration(spark.sessionState().newHadoopConf()); - this.partitionDiscoveryParallelism = spark.sessionState().conf().parallelPartitionDiscoveryParallelism(); + this.partitionDiscoveryParallelism = + spark.sessionState().conf().parallelPartitionDiscoveryParallelism(); this.table = table; this.location = table.location(); @@ -158,7 +164,8 @@ private String jobDesc() { if (location != null) { options.add("location=" + location); } - return String.format("Removing orphan files (%s) from %s", Joiner.on(',').join(options), table.name()); + return String.format( + "Removing orphan files (%s) from %s", Joiner.on(',').join(options), table.name()); } private DeleteOrphanFiles.Result doExecute() { @@ -172,9 +179,8 @@ private DeleteOrphanFiles.Result doExecute() { Column nameEqual = actualFileName.equalTo(validFileName); Column actualContains = actualFileDF.col("file_path").contains(validFileDF.col("file_path")); Column joinCond = nameEqual.and(actualContains); - List orphanFiles = actualFileDF.join(validFileDF, joinCond, "leftanti") - .as(Encoders.STRING()) - .collectAsList(); + List orphanFiles = + actualFileDF.join(validFileDF, joinCond, "leftanti").as(Encoders.STRING()).collectAsList(); Tasks.foreach(orphanFiles) .noRetry() @@ -205,15 +211,23 @@ private Dataset buildActualFileDF() { JavaRDD subDirRDD = sparkContext().parallelize(subDirs, parallelism); Broadcast conf = sparkContext().broadcast(hadoopConf); - JavaRDD matchingLeafFileRDD = subDirRDD.mapPartitions(listDirsRecursively(conf, olderThanTimestamp)); + JavaRDD matchingLeafFileRDD = + subDirRDD.mapPartitions(listDirsRecursively(conf, olderThanTimestamp)); JavaRDD completeMatchingFileRDD = matchingFileRDD.union(matchingLeafFileRDD); - return spark().createDataset(completeMatchingFileRDD.rdd(), Encoders.STRING()).toDF("file_path"); + 
return spark() + .createDataset(completeMatchingFileRDD.rdd(), Encoders.STRING()) + .toDF("file_path"); } private static void listDirRecursively( - String dir, Predicate predicate, Configuration conf, int maxDepth, - int maxDirectSubDirs, List remainingSubDirs, List matchingFiles) { + String dir, + Predicate predicate, + Configuration conf, + int maxDepth, + int maxDirectSubDirs, + List remainingSubDirs, + List matchingFiles) { // stop listing whenever we reach the max depth if (maxDepth <= 0) { @@ -242,7 +256,14 @@ private static void listDirRecursively( } for (String subDir : subDirs) { - listDirRecursively(subDir, predicate, conf, maxDepth - 1, maxDirectSubDirs, remainingSubDirs, matchingFiles); + listDirRecursively( + subDir, + predicate, + conf, + maxDepth - 1, + maxDirectSubDirs, + remainingSubDirs, + matchingFiles); } } catch (IOException e) { throw new RuntimeIOException(e); @@ -250,8 +271,7 @@ private static void listDirRecursively( } private static FlatMapFunction, String> listDirsRecursively( - Broadcast conf, - long olderThanTimestamp) { + Broadcast conf, long olderThanTimestamp) { return dirs -> { List subDirs = Lists.newArrayList(); @@ -262,12 +282,15 @@ private static FlatMapFunction, String> listDirsRecursively( int maxDepth = 2000; int maxDirectSubDirs = Integer.MAX_VALUE; - dirs.forEachRemaining(dir -> { - listDirRecursively(dir, predicate, conf.value().value(), maxDepth, maxDirectSubDirs, subDirs, files); - }); + dirs.forEachRemaining( + dir -> { + listDirRecursively( + dir, predicate, conf.value().value(), maxDepth, maxDirectSubDirs, subDirs, files); + }); if (!subDirs.isEmpty()) { - throw new RuntimeException("Could not list subdirectories, reached maximum subdirectory depth: " + maxDepth); + throw new RuntimeException( + "Could not list subdirectories, reached maximum subdirectory depth: " + maxDepth); } return files.iterator(); diff --git a/spark/v3.0/spark/src/main/java/org/apache/iceberg/spark/actions/BaseDeleteReachableFilesSparkAction.java b/spark/v3.0/spark/src/main/java/org/apache/iceberg/spark/actions/BaseDeleteReachableFilesSparkAction.java index 6534617d2dec..1431ae5d78ec 100644 --- a/spark/v3.0/spark/src/main/java/org/apache/iceberg/spark/actions/BaseDeleteReachableFilesSparkAction.java +++ b/spark/v3.0/spark/src/main/java/org/apache/iceberg/spark/actions/BaseDeleteReachableFilesSparkAction.java @@ -16,9 +16,11 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.actions; +import static org.apache.iceberg.TableProperties.GC_ENABLED; +import static org.apache.iceberg.TableProperties.GC_ENABLED_DEFAULT; + import java.util.Iterator; import java.util.List; import java.util.concurrent.ExecutorService; @@ -47,17 +49,16 @@ import org.slf4j.Logger; import org.slf4j.LoggerFactory; -import static org.apache.iceberg.TableProperties.GC_ENABLED; -import static org.apache.iceberg.TableProperties.GC_ENABLED_DEFAULT; - /** - * An implementation of {@link DeleteReachableFiles} that uses metadata tables in Spark - * to determine which files should be deleted. + * An implementation of {@link DeleteReachableFiles} that uses metadata tables in Spark to determine + * which files should be deleted. 
*/ @SuppressWarnings("UnnecessaryAnonymousClass") public class BaseDeleteReachableFilesSparkAction - extends BaseSparkAction implements DeleteReachableFiles { - private static final Logger LOG = LoggerFactory.getLogger(BaseDeleteReachableFilesSparkAction.class); + extends BaseSparkAction + implements DeleteReachableFiles { + private static final Logger LOG = + LoggerFactory.getLogger(BaseDeleteReachableFilesSparkAction.class); private static final String DATA_FILE = "Data File"; private static final String MANIFEST = "Manifest"; @@ -71,12 +72,13 @@ public class BaseDeleteReachableFilesSparkAction private final TableMetadata tableMetadata; - private final Consumer defaultDelete = new Consumer() { - @Override - public void accept(String file) { - io.deleteFile(file); - } - }; + private final Consumer defaultDelete = + new Consumer() { + @Override + public void accept(String file) { + io.deleteFile(file); + } + }; private Consumer removeFunc = defaultDelete; private ExecutorService removeExecutorService = DEFAULT_DELETE_EXECUTOR_SERVICE; @@ -105,7 +107,6 @@ public DeleteReachableFiles io(FileIO fileIO) { public DeleteReachableFiles deleteWith(Consumer deleteFunc) { this.removeFunc = deleteFunc; return this; - } @Override @@ -117,7 +118,8 @@ public DeleteReachableFiles executeDeleteWith(ExecutorService executorService) { @Override public Result execute() { Preconditions.checkArgument(io != null, "File IO cannot be null"); - String msg = String.format("Removing files reachable from %s", tableMetadata.metadataFileLocation()); + String msg = + String.format("Removing files reachable from %s", tableMetadata.metadataFileLocation()); JobGroupInfo info = newJobGroupInfo("REMOVE-FILES", msg); return withJobGroupInfo(info, this::doExecute); } @@ -165,40 +167,45 @@ private BaseDeleteReachableFilesActionResult deleteFiles(Iterator deleted) AtomicLong otherFilesCount = new AtomicLong(0L); Tasks.foreach(deleted) - .retry(3).stopRetryOn(NotFoundException.class).suppressFailureWhenFinished() + .retry(3) + .stopRetryOn(NotFoundException.class) + .suppressFailureWhenFinished() .executeWith(removeExecutorService) - .onFailure((fileInfo, exc) -> { - String file = fileInfo.getString(0); - String type = fileInfo.getString(1); - LOG.warn("Delete failed for {}: {}", type, file, exc); - }) - .run(fileInfo -> { - String file = fileInfo.getString(0); - String type = fileInfo.getString(1); - removeFunc.accept(file); - switch (type) { - case DATA_FILE: - dataFileCount.incrementAndGet(); - LOG.trace("Deleted Data File: {}", file); - break; - case MANIFEST: - manifestCount.incrementAndGet(); - LOG.debug("Deleted Manifest: {}", file); - break; - case MANIFEST_LIST: - manifestListCount.incrementAndGet(); - LOG.debug("Deleted Manifest List: {}", file); - break; - case OTHERS: - otherFilesCount.incrementAndGet(); - LOG.debug("Others: {}", file); - break; - } - }); - - long filesCount = dataFileCount.get() + manifestCount.get() + manifestListCount.get() + otherFilesCount.get(); + .onFailure( + (fileInfo, exc) -> { + String file = fileInfo.getString(0); + String type = fileInfo.getString(1); + LOG.warn("Delete failed for {}: {}", type, file, exc); + }) + .run( + fileInfo -> { + String file = fileInfo.getString(0); + String type = fileInfo.getString(1); + removeFunc.accept(file); + switch (type) { + case DATA_FILE: + dataFileCount.incrementAndGet(); + LOG.trace("Deleted Data File: {}", file); + break; + case MANIFEST: + manifestCount.incrementAndGet(); + LOG.debug("Deleted Manifest: {}", file); + break; + case MANIFEST_LIST: + 
manifestListCount.incrementAndGet(); + LOG.debug("Deleted Manifest List: {}", file); + break; + case OTHERS: + otherFilesCount.incrementAndGet(); + LOG.debug("Others: {}", file); + break; + } + }); + + long filesCount = + dataFileCount.get() + manifestCount.get() + manifestListCount.get() + otherFilesCount.get(); LOG.info("Total files removed: {}", filesCount); - return new BaseDeleteReachableFilesActionResult(dataFileCount.get(), manifestCount.get(), manifestListCount.get(), - otherFilesCount.get()); + return new BaseDeleteReachableFilesActionResult( + dataFileCount.get(), manifestCount.get(), manifestListCount.get(), otherFilesCount.get()); } } diff --git a/spark/v3.0/spark/src/main/java/org/apache/iceberg/spark/actions/BaseExpireSnapshotsSparkAction.java b/spark/v3.0/spark/src/main/java/org/apache/iceberg/spark/actions/BaseExpireSnapshotsSparkAction.java index 88589bca5cab..2e1f0c079eca 100644 --- a/spark/v3.0/spark/src/main/java/org/apache/iceberg/spark/actions/BaseExpireSnapshotsSparkAction.java +++ b/spark/v3.0/spark/src/main/java/org/apache/iceberg/spark/actions/BaseExpireSnapshotsSparkAction.java @@ -16,9 +16,11 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.actions; +import static org.apache.iceberg.TableProperties.GC_ENABLED; +import static org.apache.iceberg.TableProperties.GC_ENABLED_DEFAULT; + import java.util.Iterator; import java.util.List; import java.util.Set; @@ -48,22 +50,20 @@ import org.slf4j.Logger; import org.slf4j.LoggerFactory; -import static org.apache.iceberg.TableProperties.GC_ENABLED; -import static org.apache.iceberg.TableProperties.GC_ENABLED_DEFAULT; - /** - * An action that performs the same operation as {@link org.apache.iceberg.ExpireSnapshots} but uses Spark - * to determine the delta in files between the pre and post-expiration table metadata. All of the same - * restrictions of {@link org.apache.iceberg.ExpireSnapshots} also apply to this action. - *
    - * This action first leverages {@link org.apache.iceberg.ExpireSnapshots} to expire snapshots and then - * uses metadata tables to find files that can be safely deleted. This is done by anti-joining two Datasets - * that contain all manifest and data files before and after the expiration. The snapshot expiration - * will be fully committed before any deletes are issued. - *
    - * This operation performs a shuffle so the parallelism can be controlled through 'spark.sql.shuffle.partitions'. - *
    - * Deletes are still performed locally after retrieving the results from the Spark executors. + * An action that performs the same operation as {@link org.apache.iceberg.ExpireSnapshots} but uses + * Spark to determine the delta in files between the pre and post-expiration table metadata. All of + * the same restrictions of {@link org.apache.iceberg.ExpireSnapshots} also apply to this action. + * + *
    This action first leverages {@link org.apache.iceberg.ExpireSnapshots} to expire snapshots and + * then uses metadata tables to find files that can be safely deleted. This is done by anti-joining + * two Datasets that contain all manifest and data files before and after the expiration. The + * snapshot expiration will be fully committed before any deletes are issued. + * + *
    This operation performs a shuffle so the parallelism can be controlled through + * 'spark.sql.shuffle.partitions'. + * + *
    Deletes are still performed locally after retrieving the results from the Spark executors. */ @SuppressWarnings("UnnecessaryAnonymousClass") public class BaseExpireSnapshotsSparkAction @@ -81,12 +81,13 @@ public class BaseExpireSnapshotsSparkAction private final Table table; private final TableOperations ops; - private final Consumer defaultDelete = new Consumer() { - @Override - public void accept(String file) { - ops.io().deleteFile(file); - } - }; + private final Consumer defaultDelete = + new Consumer() { + @Override + public void accept(String file) { + ops.io().deleteFile(file); + } + }; private final Set expiredSnapshotIds = Sets.newHashSet(); private Long expireOlderThanValue = null; @@ -130,8 +131,10 @@ public BaseExpireSnapshotsSparkAction expireOlderThan(long timestampMillis) { @Override public BaseExpireSnapshotsSparkAction retainLast(int numSnapshots) { - Preconditions.checkArgument(1 <= numSnapshots, - "Number of snapshots to retain must be at least 1, cannot be: %s", numSnapshots); + Preconditions.checkArgument( + 1 <= numSnapshots, + "Number of snapshots to retain must be at least 1, cannot be: %s", + numSnapshots); this.retainLastValue = numSnapshots; return this; } @@ -144,10 +147,11 @@ public BaseExpireSnapshotsSparkAction deleteWith(Consumer newDeleteFunc) /** * Expires snapshots and commits the changes to the table, returning a Dataset of files to delete. - *
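Note (not part of this diff): a hedged sketch via the actions API (SparkActions entry point assumed); the expire() method documented just below can be used first to preview the Dataset of files that would become unreferenced. The retention values and table location are placeholders.

// Hedged sketch, not part of this diff.
import java.util.concurrent.TimeUnit;
import org.apache.iceberg.Table;
import org.apache.iceberg.actions.ExpireSnapshots;
import org.apache.iceberg.hadoop.HadoopTables;
import org.apache.iceberg.spark.actions.SparkActions;
import org.apache.spark.sql.SparkSession;

public class ExpireSnapshotsExample {
  public static void main(String[] args) {
    SparkSession spark = SparkSession.builder().appName("expire-snapshots").getOrCreate();
    Table table =
        new HadoopTables(spark.sessionState().newHadoopConf())
            .load("hdfs://nn:8020/warehouse/db/events_iceberg");

    // Expire snapshots older than a week while always retaining the last five.
    ExpireSnapshots.Result result =
        SparkActions.get(spark)
            .expireSnapshots(table)
            .expireOlderThan(System.currentTimeMillis() - TimeUnit.DAYS.toMillis(7))
            .retainLast(5)
            .execute();

    System.out.println("deleted data files: " + result.deletedDataFilesCount());
  }
}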
    - * This does not delete data files. To delete data files, run {@link #execute()}. - *
    - * This may be called before or after {@link #execute()} is called to return the expired file list. + * + *
    This does not delete data files. To delete data files, run {@link #execute()}. + * + *
    This may be called before or after {@link #execute()} is called to return the expired file + * list. * * @return a Dataset of files that are no longer referenced by the table */ @@ -157,7 +161,8 @@ public Dataset expire() { Dataset originalFiles = buildValidFileDF(ops.current()); // perform expiration - org.apache.iceberg.ExpireSnapshots expireSnapshots = table.expireSnapshots().cleanExpiredFiles(false); + org.apache.iceberg.ExpireSnapshots expireSnapshots = + table.expireSnapshots().cleanExpiredFiles(false); for (long id : expiredSnapshotIds) { expireSnapshots = expireSnapshots.expireSnapshotId(id); } @@ -202,13 +207,15 @@ private String jobDesc() { if (!expiredSnapshotIds.isEmpty()) { Long first = expiredSnapshotIds.stream().findFirst().get(); if (expiredSnapshotIds.size() > 1) { - options.add(String.format("snapshot_ids: %s (%s more...)", first, expiredSnapshotIds.size() - 1)); + options.add( + String.format("snapshot_ids: %s (%s more...)", first, expiredSnapshotIds.size() - 1)); } else { options.add(String.format("snapshot_id: %s", first)); } } - return String.format("Expiring snapshots (%s) in %s", Joiner.on(',').join(options), table.name()); + return String.format( + "Expiring snapshots (%s) in %s", Joiner.on(',').join(options), table.name()); } private ExpireSnapshots.Result doExecute() { @@ -243,34 +250,41 @@ private BaseExpireSnapshotsActionResult deleteFiles(Iterator expired) { AtomicLong manifestListCount = new AtomicLong(0L); Tasks.foreach(expired) - .retry(3).stopRetryOn(NotFoundException.class).suppressFailureWhenFinished() + .retry(3) + .stopRetryOn(NotFoundException.class) + .suppressFailureWhenFinished() .executeWith(deleteExecutorService) - .onFailure((fileInfo, exc) -> { - String file = fileInfo.getString(0); - String type = fileInfo.getString(1); - LOG.warn("Delete failed for {}: {}", type, file, exc); - }) - .run(fileInfo -> { - String file = fileInfo.getString(0); - String type = fileInfo.getString(1); - deleteFunc.accept(file); - switch (type) { - case DATA_FILE: - dataFileCount.incrementAndGet(); - LOG.trace("Deleted Data File: {}", file); - break; - case MANIFEST: - manifestCount.incrementAndGet(); - LOG.debug("Deleted Manifest: {}", file); - break; - case MANIFEST_LIST: - manifestListCount.incrementAndGet(); - LOG.debug("Deleted Manifest List: {}", file); - break; - } - }); - - LOG.info("Deleted {} total files", dataFileCount.get() + manifestCount.get() + manifestListCount.get()); - return new BaseExpireSnapshotsActionResult(dataFileCount.get(), manifestCount.get(), manifestListCount.get()); + .onFailure( + (fileInfo, exc) -> { + String file = fileInfo.getString(0); + String type = fileInfo.getString(1); + LOG.warn("Delete failed for {}: {}", type, file, exc); + }) + .run( + fileInfo -> { + String file = fileInfo.getString(0); + String type = fileInfo.getString(1); + deleteFunc.accept(file); + switch (type) { + case DATA_FILE: + dataFileCount.incrementAndGet(); + LOG.trace("Deleted Data File: {}", file); + break; + case MANIFEST: + manifestCount.incrementAndGet(); + LOG.debug("Deleted Manifest: {}", file); + break; + case MANIFEST_LIST: + manifestListCount.incrementAndGet(); + LOG.debug("Deleted Manifest List: {}", file); + break; + } + }); + + LOG.info( + "Deleted {} total files", + dataFileCount.get() + manifestCount.get() + manifestListCount.get()); + return new BaseExpireSnapshotsActionResult( + dataFileCount.get(), manifestCount.get(), manifestListCount.get()); } } diff --git 
a/spark/v3.0/spark/src/main/java/org/apache/iceberg/spark/actions/BaseMigrateTableSparkAction.java b/spark/v3.0/spark/src/main/java/org/apache/iceberg/spark/actions/BaseMigrateTableSparkAction.java index ef9f0d3e2583..856b67dbcd75 100644 --- a/spark/v3.0/spark/src/main/java/org/apache/iceberg/spark/actions/BaseMigrateTableSparkAction.java +++ b/spark/v3.0/spark/src/main/java/org/apache/iceberg/spark/actions/BaseMigrateTableSparkAction.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.actions; import java.util.Map; @@ -45,10 +44,9 @@ import scala.collection.JavaConverters; /** - * Takes a Spark table in the source catalog and attempts to transform it into an Iceberg - * table in the same location with the same identifier. Once complete the identifier which - * previously referred to a non-Iceberg table will refer to the newly migrated Iceberg - * table. + * Takes a Spark table in the source catalog and attempts to transform it into an Iceberg table in + * the same location with the same identifier. Once complete the identifier which previously + * referred to a non-Iceberg table will refer to the newly migrated Iceberg table. */ public class BaseMigrateTableSparkAction extends BaseTableCreationSparkAction @@ -61,7 +59,8 @@ public class BaseMigrateTableSparkAction private final Identifier destTableIdent; private final Identifier backupIdent; - public BaseMigrateTableSparkAction(SparkSession spark, CatalogPlugin sourceCatalog, Identifier sourceTableIdent) { + public BaseMigrateTableSparkAction( + SparkSession spark, CatalogPlugin sourceCatalog, Identifier sourceTableIdent) { super(spark, sourceCatalog, sourceTableIdent); this.destCatalog = checkDestinationCatalog(sourceCatalog); this.destTableIdent = sourceTableIdent; @@ -132,7 +131,8 @@ private MigrateTable.Result doExecute() { threw = false; } finally { if (threw) { - LOG.error("Failed to perform the migration, aborting table creation and restoring the original table"); + LOG.error( + "Failed to perform the migration, aborting table creation and restoring the original table"); restoreSourceTable(); @@ -147,8 +147,12 @@ private MigrateTable.Result doExecute() { } Snapshot snapshot = icebergTable.currentSnapshot(); - long migratedDataFilesCount = Long.parseLong(snapshot.summary().get(SnapshotSummary.TOTAL_DATA_FILES_PROP)); - LOG.info("Successfully loaded Iceberg metadata for {} files to {}", migratedDataFilesCount, destTableIdent()); + long migratedDataFilesCount = + Long.parseLong(snapshot.summary().get(SnapshotSummary.TOTAL_DATA_FILES_PROP)); + LOG.info( + "Successfully loaded Iceberg metadata for {} files to {}", + migratedDataFilesCount, + destTableIdent()); return new BaseMigrateTableActionResult(migratedDataFilesCount); } @@ -176,9 +180,11 @@ protected Map destTableProps() { @Override protected TableCatalog checkSourceCatalog(CatalogPlugin catalog) { // currently the import code relies on being able to look up the table in the session catalog - Preconditions.checkArgument(catalog instanceof SparkSessionCatalog, + Preconditions.checkArgument( + catalog instanceof SparkSessionCatalog, "Cannot migrate a table from a non-Iceberg Spark Session Catalog. 
Found %s of class %s as the source catalog.", - catalog.name(), catalog.getClass().getName()); + catalog.name(), + catalog.getClass().getName()); return (TableCatalog) catalog; } @@ -204,11 +210,15 @@ private void restoreSourceTable() { destCatalog().renameTable(backupIdent, sourceTableIdent()); } catch (org.apache.spark.sql.catalyst.analysis.NoSuchTableException e) { - LOG.error("Cannot restore the original table, the backup table {} cannot be found", backupIdent, e); + LOG.error( + "Cannot restore the original table, the backup table {} cannot be found", backupIdent, e); } catch (org.apache.spark.sql.catalyst.analysis.TableAlreadyExistsException e) { - LOG.error("Cannot restore the original table, a table with the original name exists. " + - "Use the backup table {} to restore the original table manually.", backupIdent, e); + LOG.error( + "Cannot restore the original table, a table with the original name exists. " + + "Use the backup table {} to restore the original table manually.", + backupIdent, + e); } } } diff --git a/spark/v3.0/spark/src/main/java/org/apache/iceberg/spark/actions/BaseRewriteDataFilesSpark3Action.java b/spark/v3.0/spark/src/main/java/org/apache/iceberg/spark/actions/BaseRewriteDataFilesSpark3Action.java index b1c08e607de8..a12ada501796 100644 --- a/spark/v3.0/spark/src/main/java/org/apache/iceberg/spark/actions/BaseRewriteDataFilesSpark3Action.java +++ b/spark/v3.0/spark/src/main/java/org/apache/iceberg/spark/actions/BaseRewriteDataFilesSpark3Action.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.actions; import org.apache.iceberg.Table; diff --git a/spark/v3.0/spark/src/main/java/org/apache/iceberg/spark/actions/BaseRewriteDataFilesSparkAction.java b/spark/v3.0/spark/src/main/java/org/apache/iceberg/spark/actions/BaseRewriteDataFilesSparkAction.java index cd12131bd04d..5d3be38f57e9 100644 --- a/spark/v3.0/spark/src/main/java/org/apache/iceberg/spark/actions/BaseRewriteDataFilesSparkAction.java +++ b/spark/v3.0/spark/src/main/java/org/apache/iceberg/spark/actions/BaseRewriteDataFilesSparkAction.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.spark.actions; import java.io.IOException; @@ -73,18 +72,19 @@ import org.slf4j.LoggerFactory; abstract class BaseRewriteDataFilesSparkAction - extends BaseSnapshotUpdateSparkAction implements RewriteDataFiles { + extends BaseSnapshotUpdateSparkAction + implements RewriteDataFiles { private static final Logger LOG = LoggerFactory.getLogger(BaseRewriteDataFilesSparkAction.class); - private static final Set VALID_OPTIONS = ImmutableSet.of( - MAX_CONCURRENT_FILE_GROUP_REWRITES, - MAX_FILE_GROUP_SIZE_BYTES, - PARTIAL_PROGRESS_ENABLED, - PARTIAL_PROGRESS_MAX_COMMITS, - TARGET_FILE_SIZE_BYTES, - USE_STARTING_SEQUENCE_NUMBER, - REWRITE_JOB_ORDER - ); + private static final Set VALID_OPTIONS = + ImmutableSet.of( + MAX_CONCURRENT_FILE_GROUP_REWRITES, + MAX_FILE_GROUP_SIZE_BYTES, + PARTIAL_PROGRESS_ENABLED, + PARTIAL_PROGRESS_MAX_COMMITS, + TARGET_FILE_SIZE_BYTES, + USE_STARTING_SEQUENCE_NUMBER, + REWRITE_JOB_ORDER); private final Table table; @@ -105,36 +105,38 @@ protected Table table() { return table; } - /** - * The framework specific {@link BinPackStrategy} - */ + /** The framework specific {@link BinPackStrategy} */ protected abstract BinPackStrategy binPackStrategy(); - /** - * The framework specific {@link SortStrategy} - */ + /** The framework specific {@link SortStrategy} */ protected abstract SortStrategy sortStrategy(); @Override public RewriteDataFiles binPack() { - Preconditions.checkArgument(this.strategy == null, - "Cannot set strategy to binpack, it has already been set", this.strategy); + Preconditions.checkArgument( + this.strategy == null, + "Cannot set strategy to binpack, it has already been set", + this.strategy); this.strategy = binPackStrategy(); return this; } @Override public RewriteDataFiles sort(SortOrder sortOrder) { - Preconditions.checkArgument(this.strategy == null, - "Cannot set strategy to sort, it has already been set to %s", this.strategy); + Preconditions.checkArgument( + this.strategy == null, + "Cannot set strategy to sort, it has already been set to %s", + this.strategy); this.strategy = sortStrategy().sortOrder(sortOrder); return this; } @Override public RewriteDataFiles sort() { - Preconditions.checkArgument(this.strategy == null, - "Cannot set strategy to sort, it has already been set to %s", this.strategy); + Preconditions.checkArgument( + this.strategy == null, + "Cannot set strategy to sort, it has already been set to %s", + this.strategy); this.strategy = sortStrategy(); return this; } @@ -160,7 +162,8 @@ public RewriteDataFiles.Result execute() { validateAndInitOptions(); - Map>> fileGroupsByPartition = planFileGroups(startingSnapshotId); + Map>> fileGroupsByPartition = + planFileGroups(startingSnapshotId); RewriteExecutionContext ctx = new RewriteExecutionContext(fileGroupsByPartition); if (ctx.totalGroupCount() == 0) { @@ -179,43 +182,52 @@ public RewriteDataFiles.Result execute() { } Map>> planFileGroups(long startingSnapshotId) { - CloseableIterable fileScanTasks = table.newScan() - .useSnapshot(startingSnapshotId) - .filter(filter) - .ignoreResiduals() - .planFiles(); + CloseableIterable fileScanTasks = + table + .newScan() + .useSnapshot(startingSnapshotId) + .filter(filter) + .ignoreResiduals() + .planFiles(); try { StructType partitionType = table.spec().partitionType(); StructLikeMap> filesByPartition = StructLikeMap.create(partitionType); StructLike emptyStruct = GenericRecord.create(partitionType); - fileScanTasks.forEach(task -> { - // If a task uses an incompatible partition spec the data inside could 
contain values which - // belong to multiple partitions in the current spec. Treating all such files as un-partitioned and - // grouping them together helps to minimize new files made. - StructLike taskPartition = task.file().specId() == table.spec().specId() ? - task.file().partition() : emptyStruct; - - List files = filesByPartition.get(taskPartition); - if (files == null) { - files = Lists.newArrayList(); - } - - files.add(task); - filesByPartition.put(taskPartition, files); - }); - - StructLikeMap>> fileGroupsByPartition = StructLikeMap.create(partitionType); - - filesByPartition.forEach((partition, tasks) -> { - Iterable filtered = strategy.selectFilesToRewrite(tasks); - Iterable> groupedTasks = strategy.planFileGroups(filtered); - List> fileGroups = ImmutableList.copyOf(groupedTasks); - if (fileGroups.size() > 0) { - fileGroupsByPartition.put(partition, fileGroups); - } - }); + fileScanTasks.forEach( + task -> { + // If a task uses an incompatible partition spec the data inside could contain values + // which + // belong to multiple partitions in the current spec. Treating all such files as + // un-partitioned and + // grouping them together helps to minimize new files made. + StructLike taskPartition = + task.file().specId() == table.spec().specId() + ? task.file().partition() + : emptyStruct; + + List files = filesByPartition.get(taskPartition); + if (files == null) { + files = Lists.newArrayList(); + } + + files.add(task); + filesByPartition.put(taskPartition, files); + }); + + StructLikeMap>> fileGroupsByPartition = + StructLikeMap.create(partitionType); + + filesByPartition.forEach( + (partition, tasks) -> { + Iterable filtered = strategy.selectFilesToRewrite(tasks); + Iterable> groupedTasks = strategy.planFileGroups(filtered); + List> fileGroups = ImmutableList.copyOf(groupedTasks); + if (fileGroups.size() > 0) { + fileGroupsByPartition.put(partition, fileGroups); + } + }); return fileGroupsByPartition; } finally { @@ -230,9 +242,10 @@ Map>> planFileGroups(long startingSnapshotId @VisibleForTesting RewriteFileGroup rewriteFiles(RewriteExecutionContext ctx, RewriteFileGroup fileGroup) { String desc = jobDesc(fileGroup, ctx); - Set addedFiles = withJobGroupInfo( - newJobGroupInfo("REWRITE-DATA-FILES", desc), - () -> strategy.rewriteFiles(fileGroup.fileScans())); + Set addedFiles = + withJobGroupInfo( + newJobGroupInfo("REWRITE-DATA-FILES", desc), + () -> strategy.rewriteFiles(fileGroup.fileScans())); fileGroup.setOutputFiles(addedFiles); LOG.info("Rewrite Files Ready to be Committed - {}", desc); @@ -241,11 +254,10 @@ RewriteFileGroup rewriteFiles(RewriteExecutionContext ctx, RewriteFileGroup file private ExecutorService rewriteService() { return MoreExecutors.getExitingExecutorService( - (ThreadPoolExecutor) Executors.newFixedThreadPool( - maxConcurrentFileGroupRewrites, - new ThreadFactoryBuilder() - .setNameFormat("Rewrite-Service-%d") - .build())); + (ThreadPoolExecutor) + Executors.newFixedThreadPool( + maxConcurrentFileGroupRewrites, + new ThreadFactoryBuilder().setNameFormat("Rewrite-Service-%d").build())); } @VisibleForTesting @@ -253,31 +265,42 @@ RewriteDataFilesCommitManager commitManager(long startingSnapshotId) { return new RewriteDataFilesCommitManager(table, startingSnapshotId, useStartingSequenceNumber); } - private Result doExecute(RewriteExecutionContext ctx, Stream groupStream, - RewriteDataFilesCommitManager commitManager) { + private Result doExecute( + RewriteExecutionContext ctx, + Stream groupStream, + RewriteDataFilesCommitManager commitManager) { 
ExecutorService rewriteService = rewriteService(); ConcurrentLinkedQueue rewrittenGroups = Queues.newConcurrentLinkedQueue(); - Tasks.Builder rewriteTaskBuilder = Tasks.foreach(groupStream) - .executeWith(rewriteService) - .stopOnFailure() - .noRetry() - .onFailure((fileGroup, exception) -> { - LOG.warn("Failure during rewrite process for group {}", fileGroup.info(), exception); - }); + Tasks.Builder rewriteTaskBuilder = + Tasks.foreach(groupStream) + .executeWith(rewriteService) + .stopOnFailure() + .noRetry() + .onFailure( + (fileGroup, exception) -> { + LOG.warn( + "Failure during rewrite process for group {}", fileGroup.info(), exception); + }); try { - rewriteTaskBuilder.run(fileGroup -> { - rewrittenGroups.add(rewriteFiles(ctx, fileGroup)); - }); + rewriteTaskBuilder.run( + fileGroup -> { + rewrittenGroups.add(rewriteFiles(ctx, fileGroup)); + }); } catch (Exception e) { // At least one rewrite group failed, clean up all completed rewrites - LOG.error("Cannot complete rewrite, {} is not enabled and one of the file set groups failed to " + - "be rewritten. This error occurred during the writing of new files, not during the commit process. This " + - "indicates something is wrong that doesn't involve conflicts with other Iceberg operations. Enabling " + - "{} may help in this case but the root cause should be investigated. Cleaning up {} groups which finished " + - "being written.", PARTIAL_PROGRESS_ENABLED, PARTIAL_PROGRESS_ENABLED, rewrittenGroups.size(), e); + LOG.error( + "Cannot complete rewrite, {} is not enabled and one of the file set groups failed to " + + "be rewritten. This error occurred during the writing of new files, not during the commit process. This " + + "indicates something is wrong that doesn't involve conflicts with other Iceberg operations. Enabling " + + "{} may help in this case but the root cause should be investigated. Cleaning up {} groups which finished " + + "being written.", + PARTIAL_PROGRESS_ENABLED, + PARTIAL_PROGRESS_ENABLED, + rewrittenGroups.size(), + e); Tasks.foreach(rewrittenGroups) .suppressFailureWhenFinished() @@ -290,30 +313,33 @@ private Result doExecute(RewriteExecutionContext ctx, Stream g try { commitManager.commitOrClean(Sets.newHashSet(rewrittenGroups)); } catch (ValidationException | CommitFailedException e) { - String errorMessage = String.format( - "Cannot commit rewrite because of a ValidationException or CommitFailedException. This usually means that " + - "this rewrite has conflicted with another concurrent Iceberg operation. To reduce the likelihood of " + - "conflicts, set %s which will break up the rewrite into multiple smaller commits controlled by %s. " + - "Separate smaller rewrite commits can succeed independently while any commits that conflict with " + - "another Iceberg operation will be ignored. This mode will create additional snapshots in the table " + - "history, one for each commit.", - PARTIAL_PROGRESS_ENABLED, PARTIAL_PROGRESS_MAX_COMMITS); + String errorMessage = + String.format( + "Cannot commit rewrite because of a ValidationException or CommitFailedException. This usually means that " + + "this rewrite has conflicted with another concurrent Iceberg operation. To reduce the likelihood of " + + "conflicts, set %s which will break up the rewrite into multiple smaller commits controlled by %s. " + + "Separate smaller rewrite commits can succeed independently while any commits that conflict with " + + "another Iceberg operation will be ignored. 
This mode will create additional snapshots in the table " + + "history, one for each commit.", + PARTIAL_PROGRESS_ENABLED, PARTIAL_PROGRESS_MAX_COMMITS); throw new RuntimeException(errorMessage, e); } - List rewriteResults = rewrittenGroups.stream() - .map(RewriteFileGroup::asResult) - .collect(Collectors.toList()); + List rewriteResults = + rewrittenGroups.stream().map(RewriteFileGroup::asResult).collect(Collectors.toList()); return new BaseRewriteDataFilesResult(rewriteResults); } - private Result doExecuteWithPartialProgress(RewriteExecutionContext ctx, Stream groupStream, - RewriteDataFilesCommitManager commitManager) { + private Result doExecuteWithPartialProgress( + RewriteExecutionContext ctx, + Stream groupStream, + RewriteDataFilesCommitManager commitManager) { ExecutorService rewriteService = rewriteService(); // Start Commit Service int groupsPerCommit = IntMath.divide(ctx.totalGroupCount(), maxCommits, RoundingMode.CEILING); - RewriteDataFilesCommitManager.CommitService commitService = commitManager.service(groupsPerCommit); + RewriteDataFilesCommitManager.CommitService commitService = + commitManager.service(groupsPerCommit); commitService.start(); // Start rewrite tasks @@ -321,7 +347,9 @@ private Result doExecuteWithPartialProgress(RewriteExecutionContext ctx, Stream< .suppressFailureWhenFinished() .executeWith(rewriteService) .noRetry() - .onFailure((fileGroup, exception) -> LOG.error("Failure during rewrite group {}", fileGroup.info(), exception)) + .onFailure( + (fileGroup, exception) -> + LOG.error("Failure during rewrite group {}", fileGroup.info(), exception)) .run(fileGroup -> commitService.offer(rewriteFiles(ctx, fileGroup))); rewriteService.shutdown(); @@ -329,31 +357,40 @@ private Result doExecuteWithPartialProgress(RewriteExecutionContext ctx, Stream< commitService.close(); List commitResults = commitService.results(); if (commitResults.size() == 0) { - LOG.error("{} is true but no rewrite commits succeeded. Check the logs to determine why the individual " + - "commits failed. If this is persistent it may help to increase {} which will break the rewrite operation " + - "into smaller commits.", PARTIAL_PROGRESS_ENABLED, PARTIAL_PROGRESS_MAX_COMMITS); + LOG.error( + "{} is true but no rewrite commits succeeded. Check the logs to determine why the individual " + + "commits failed. 
If this is persistent it may help to increase {} which will break the rewrite operation " + + "into smaller commits.", + PARTIAL_PROGRESS_ENABLED, + PARTIAL_PROGRESS_MAX_COMMITS); } - List rewriteResults = commitResults.stream() - .map(RewriteFileGroup::asResult) - .collect(Collectors.toList()); + List rewriteResults = + commitResults.stream().map(RewriteFileGroup::asResult).collect(Collectors.toList()); return new BaseRewriteDataFilesResult(rewriteResults); } - Stream toGroupStream(RewriteExecutionContext ctx, + Stream toGroupStream( + RewriteExecutionContext ctx, Map>> fileGroupsByPartition) { - Stream rewriteFileGroupStream = fileGroupsByPartition.entrySet().stream() - .flatMap(e -> { - StructLike partition = e.getKey(); - List> fileGroups = e.getValue(); - return fileGroups.stream().map(tasks -> { - int globalIndex = ctx.currentGlobalIndex(); - int partitionIndex = ctx.currentPartitionIndex(partition); - FileGroupInfo info = new BaseRewriteDataFilesFileGroupInfo(globalIndex, partitionIndex, partition); - return new RewriteFileGroup(info, tasks); - }); - }); + Stream rewriteFileGroupStream = + fileGroupsByPartition.entrySet().stream() + .flatMap( + e -> { + StructLike partition = e.getKey(); + List> fileGroups = e.getValue(); + return fileGroups.stream() + .map( + tasks -> { + int globalIndex = ctx.currentGlobalIndex(); + int partitionIndex = ctx.currentPartitionIndex(partition); + FileGroupInfo info = + new BaseRewriteDataFilesFileGroupInfo( + globalIndex, partitionIndex, partition); + return new RewriteFileGroup(info, tasks); + }); + }); return rewriteFileGroupStream.sorted(rewriteGroupComparator()); } @@ -380,53 +417,70 @@ void validateAndInitOptions() { Set invalidKeys = Sets.newHashSet(options().keySet()); invalidKeys.removeAll(validOptions); - Preconditions.checkArgument(invalidKeys.isEmpty(), + Preconditions.checkArgument( + invalidKeys.isEmpty(), "Cannot use options %s, they are not supported by the action or the strategy %s", - invalidKeys, strategy.name()); + invalidKeys, + strategy.name()); strategy = strategy.options(options()); - maxConcurrentFileGroupRewrites = PropertyUtil.propertyAsInt(options(), - MAX_CONCURRENT_FILE_GROUP_REWRITES, - MAX_CONCURRENT_FILE_GROUP_REWRITES_DEFAULT); + maxConcurrentFileGroupRewrites = + PropertyUtil.propertyAsInt( + options(), + MAX_CONCURRENT_FILE_GROUP_REWRITES, + MAX_CONCURRENT_FILE_GROUP_REWRITES_DEFAULT); - maxCommits = PropertyUtil.propertyAsInt(options(), - PARTIAL_PROGRESS_MAX_COMMITS, - PARTIAL_PROGRESS_MAX_COMMITS_DEFAULT); + maxCommits = + PropertyUtil.propertyAsInt( + options(), PARTIAL_PROGRESS_MAX_COMMITS, PARTIAL_PROGRESS_MAX_COMMITS_DEFAULT); - partialProgressEnabled = PropertyUtil.propertyAsBoolean(options(), - PARTIAL_PROGRESS_ENABLED, - PARTIAL_PROGRESS_ENABLED_DEFAULT); + partialProgressEnabled = + PropertyUtil.propertyAsBoolean( + options(), PARTIAL_PROGRESS_ENABLED, PARTIAL_PROGRESS_ENABLED_DEFAULT); - useStartingSequenceNumber = PropertyUtil.propertyAsBoolean(options(), - USE_STARTING_SEQUENCE_NUMBER, - USE_STARTING_SEQUENCE_NUMBER_DEFAULT); + useStartingSequenceNumber = + PropertyUtil.propertyAsBoolean( + options(), USE_STARTING_SEQUENCE_NUMBER, USE_STARTING_SEQUENCE_NUMBER_DEFAULT); - rewriteJobOrder = RewriteJobOrder.fromName(PropertyUtil.propertyAsString(options(), - REWRITE_JOB_ORDER, - REWRITE_JOB_ORDER_DEFAULT)); + rewriteJobOrder = + RewriteJobOrder.fromName( + PropertyUtil.propertyAsString(options(), REWRITE_JOB_ORDER, REWRITE_JOB_ORDER_DEFAULT)); - Preconditions.checkArgument(maxConcurrentFileGroupRewrites >= 
1, + Preconditions.checkArgument( + maxConcurrentFileGroupRewrites >= 1, "Cannot set %s to %s, the value must be positive.", - MAX_CONCURRENT_FILE_GROUP_REWRITES, maxConcurrentFileGroupRewrites); + MAX_CONCURRENT_FILE_GROUP_REWRITES, + maxConcurrentFileGroupRewrites); - Preconditions.checkArgument(!partialProgressEnabled || maxCommits > 0, + Preconditions.checkArgument( + !partialProgressEnabled || maxCommits > 0, "Cannot set %s to %s, the value must be positive when %s is true", - PARTIAL_PROGRESS_MAX_COMMITS, maxCommits, PARTIAL_PROGRESS_ENABLED); + PARTIAL_PROGRESS_MAX_COMMITS, + maxCommits, + PARTIAL_PROGRESS_ENABLED); } private String jobDesc(RewriteFileGroup group, RewriteExecutionContext ctx) { StructLike partition = group.info().partition(); if (partition.size() > 0) { - return String.format("Rewriting %d files (%s, file group %d/%d, %s (%d/%d)) in %s", + return String.format( + "Rewriting %d files (%s, file group %d/%d, %s (%d/%d)) in %s", group.rewrittenFiles().size(), - strategy.name(), group.info().globalIndex(), - ctx.totalGroupCount(), partition, group.info().partitionIndex(), ctx.groupsInPartition(partition), + strategy.name(), + group.info().globalIndex(), + ctx.totalGroupCount(), + partition, + group.info().partitionIndex(), + ctx.groupsInPartition(partition), table.name()); } else { - return String.format("Rewriting %d files (%s, file group %d/%d) in %s", + return String.format( + "Rewriting %d files (%s, file group %d/%d) in %s", group.rewrittenFiles().size(), - strategy.name(), group.info().globalIndex(), ctx.totalGroupCount(), + strategy.name(), + group.info().globalIndex(), + ctx.totalGroupCount(), table.name()); } } @@ -439,11 +493,10 @@ static class RewriteExecutionContext { private final AtomicInteger groupIndex; RewriteExecutionContext(Map>> fileGroupsByPartition) { - this.numGroupsByPartition = fileGroupsByPartition.entrySet().stream() - .collect(Collectors.toMap(Map.Entry::getKey, e -> e.getValue().size())); - this.totalGroupCount = numGroupsByPartition.values().stream() - .reduce(Integer::sum) - .orElse(0); + this.numGroupsByPartition = + fileGroupsByPartition.entrySet().stream() + .collect(Collectors.toMap(Map.Entry::getKey, e -> e.getValue().size())); + this.totalGroupCount = numGroupsByPartition.values().stream().reduce(Integer::sum).orElse(0); this.partitionIndexMap = Maps.newConcurrentMap(); this.groupIndex = new AtomicInteger(1); } diff --git a/spark/v3.0/spark/src/main/java/org/apache/iceberg/spark/actions/BaseRewriteManifestsSparkAction.java b/spark/v3.0/spark/src/main/java/org/apache/iceberg/spark/actions/BaseRewriteManifestsSparkAction.java index b1769f428d14..b610302b0c75 100644 --- a/spark/v3.0/spark/src/main/java/org/apache/iceberg/spark/actions/BaseRewriteManifestsSparkAction.java +++ b/spark/v3.0/spark/src/main/java/org/apache/iceberg/spark/actions/BaseRewriteManifestsSparkAction.java @@ -16,9 +16,10 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.actions; +import static org.apache.iceberg.MetadataTableType.ENTRIES; + import java.io.IOException; import java.util.Collections; import java.util.List; @@ -68,15 +69,13 @@ import org.slf4j.Logger; import org.slf4j.LoggerFactory; -import static org.apache.iceberg.MetadataTableType.ENTRIES; - /** * An action that rewrites manifests in a distributed manner and co-locates metadata for partitions. - *

    - * By default, this action rewrites all manifests for the current partition spec and writes the result - * to the metadata folder. The behavior can be modified by passing a custom predicate to {@link #rewriteIf(Predicate)} - * and a custom spec id to {@link #specId(int)}. In addition, there is a way to configure a custom location - * for new manifests via {@link #stagingLocation}. + * + *

    By default, this action rewrites all manifests for the current partition spec and writes the + * result to the metadata folder. The behavior can be modified by passing a custom predicate to + * {@link #rewriteIf(Predicate)} and a custom spec id to {@link #specId(int)}. In addition, there is + * a way to configure a custom location for new manifests via {@link #stagingLocation}. */ public class BaseRewriteManifestsSparkAction extends BaseSnapshotUpdateSparkAction @@ -102,10 +101,11 @@ public BaseRewriteManifestsSparkAction(SparkSession spark, Table table) { this.manifestEncoder = Encoders.javaSerialization(ManifestFile.class); this.table = table; this.spec = table.spec(); - this.targetManifestSizeBytes = PropertyUtil.propertyAsLong( - table.properties(), - TableProperties.MANIFEST_TARGET_SIZE_BYTES, - TableProperties.MANIFEST_TARGET_SIZE_BYTES_DEFAULT); + this.targetManifestSizeBytes = + PropertyUtil.propertyAsLong( + table.properties(), + TableProperties.MANIFEST_TARGET_SIZE_BYTES, + TableProperties.MANIFEST_TARGET_SIZE_BYTES_DEFAULT); this.fileIO = SparkUtil.serializableFileIO(table); // default the staging location to the metadata location @@ -143,7 +143,9 @@ public RewriteManifests stagingLocation(String newStagingLocation) { @Override public RewriteManifests.Result execute() { - String desc = String.format("Rewriting manifests (staging location=%s) of %s", stagingLocation, table.name()); + String desc = + String.format( + "Rewriting manifests (staging location=%s) of %s", stagingLocation, table.name()); JobGroupInfo info = newJobGroupInfo("REWRITE-MANIFESTS", desc); return withJobGroupInfo(info, this::doExecute); } @@ -158,10 +160,12 @@ private RewriteManifests.Result doExecute() { int numEntries = 0; for (ManifestFile manifest : matchingManifests) { - ValidationException.check(hasFileCounts(manifest), "No file counts in manifest: %s", manifest.path()); + ValidationException.check( + hasFileCounts(manifest), "No file counts in manifest: %s", manifest.path()); totalSizeBytes += manifest.length(); - numEntries += manifest.addedFilesCount() + manifest.existingFilesCount() + manifest.deletedFilesCount(); + numEntries += + manifest.addedFilesCount() + manifest.existingFilesCount() + manifest.deletedFilesCount(); } int targetNumManifests = targetNumManifests(totalSizeBytes); @@ -173,7 +177,9 @@ private RewriteManifests.Result doExecute() { if (spec.fields().size() < 1) { newManifests = writeManifestsForUnpartitionedTable(manifestEntryDF, targetNumManifests); } else { - newManifests = writeManifestsForPartitionedTable(manifestEntryDF, targetNumManifests, targetNumManifestEntries); + newManifests = + writeManifestsForPartitionedTable( + manifestEntryDF, targetNumManifests, targetNumManifestEntries); } replaceManifests(matchingManifests, newManifests); @@ -182,13 +188,16 @@ private RewriteManifests.Result doExecute() { } private Dataset buildManifestEntryDF(List manifests) { - Dataset manifestDF = spark() - .createDataset(Lists.transform(manifests, ManifestFile::path), Encoders.STRING()) - .toDF("manifest"); + Dataset manifestDF = + spark() + .createDataset(Lists.transform(manifests, ManifestFile::path), Encoders.STRING()) + .toDF("manifest"); - Dataset manifestEntryDF = loadMetadataTable(table, ENTRIES) - .filter("status < 2") // select only live entries - .selectExpr("input_file_name() as manifest", "snapshot_id", "sequence_number", "data_file"); + Dataset manifestEntryDF = + loadMetadataTable(table, ENTRIES) + .filter("status < 2") // select only live entries + .selectExpr( + 
"input_file_name() as manifest", "snapshot_id", "sequence_number", "data_file"); Column joinCond = manifestDF.col("manifest").equalTo(manifestEntryDF.col("manifest")); return manifestEntryDF @@ -196,7 +205,8 @@ private Dataset buildManifestEntryDF(List manifests) { .select("snapshot_id", "sequence_number", "data_file"); } - private List writeManifestsForUnpartitionedTable(Dataset manifestEntryDF, int numManifests) { + private List writeManifestsForUnpartitionedTable( + Dataset manifestEntryDF, int numManifests) { Broadcast io = sparkContext().broadcast(fileIO); StructType sparkType = (StructType) manifestEntryDF.schema().apply("data_file").dataType(); @@ -208,41 +218,44 @@ private List writeManifestsForUnpartitionedTable(Dataset mani .repartition(numManifests) .mapPartitions( toManifests(io, maxNumManifestEntries, stagingLocation, formatVersion, spec, sparkType), - manifestEncoder - ) + manifestEncoder) .collectAsList(); } private List writeManifestsForPartitionedTable( - Dataset manifestEntryDF, int numManifests, - int targetNumManifestEntries) { + Dataset manifestEntryDF, int numManifests, int targetNumManifestEntries) { Broadcast io = sparkContext().broadcast(fileIO); StructType sparkType = (StructType) manifestEntryDF.schema().apply("data_file").dataType(); - // we allow the actual size of manifests to be 10% higher if the estimation is not precise enough + // we allow the actual size of manifests to be 10% higher if the estimation is not precise + // enough long maxNumManifestEntries = (long) (1.1 * targetNumManifestEntries); - return withReusableDS(manifestEntryDF, df -> { - Column partitionColumn = df.col("data_file.partition"); - return df.repartitionByRange(numManifests, partitionColumn) - .sortWithinPartitions(partitionColumn) - .mapPartitions( - toManifests(io, maxNumManifestEntries, stagingLocation, formatVersion, spec, sparkType), - manifestEncoder - ) - .collectAsList(); - }); + return withReusableDS( + manifestEntryDF, + df -> { + Column partitionColumn = df.col("data_file.partition"); + return df.repartitionByRange(numManifests, partitionColumn) + .sortWithinPartitions(partitionColumn) + .mapPartitions( + toManifests( + io, maxNumManifestEntries, stagingLocation, formatVersion, spec, sparkType), + manifestEncoder) + .collectAsList(); + }); } private U withReusableDS(Dataset ds, Function, U> func) { Dataset reusableDS; - boolean useCaching = PropertyUtil.propertyAsBoolean(options(), USE_CACHING, USE_CACHING_DEFAULT); + boolean useCaching = + PropertyUtil.propertyAsBoolean(options(), USE_CACHING, USE_CACHING_DEFAULT); if (useCaching) { reusableDS = ds.cache(); } else { int parallelism = SQLConf.get().numShufflePartitions(); - reusableDS = ds.repartition(parallelism).map((MapFunction) value -> value, ds.exprEnc()); + reusableDS = + ds.repartition(parallelism).map((MapFunction) value -> value, ds.exprEnc()); } try { @@ -275,17 +288,19 @@ private int targetNumManifestEntries(int numEntries, int numManifests) { } private boolean hasFileCounts(ManifestFile manifest) { - return manifest.addedFilesCount() != null && - manifest.existingFilesCount() != null && - manifest.deletedFilesCount() != null; + return manifest.addedFilesCount() != null + && manifest.existingFilesCount() != null + && manifest.deletedFilesCount() != null; } - private void replaceManifests(Iterable deletedManifests, Iterable addedManifests) { + private void replaceManifests( + Iterable deletedManifests, Iterable addedManifests) { try { - boolean snapshotIdInheritanceEnabled = PropertyUtil.propertyAsBoolean( - 
table.properties(), - TableProperties.SNAPSHOT_ID_INHERITANCE_ENABLED, - TableProperties.SNAPSHOT_ID_INHERITANCE_ENABLED_DEFAULT); + boolean snapshotIdInheritanceEnabled = + PropertyUtil.propertyAsBoolean( + table.properties(), + TableProperties.SNAPSHOT_ID_INHERITANCE_ENABLED, + TableProperties.SNAPSHOT_ID_INHERITANCE_ENABLED_DEFAULT); org.apache.iceberg.RewriteManifests rewriteManifests = table.rewriteManifests(); deletedManifests.forEach(rewriteManifests::deleteManifest); @@ -315,12 +330,20 @@ private void deleteFiles(Iterable locations) { } private static ManifestFile writeManifest( - List rows, int startIndex, int endIndex, Broadcast io, - String location, int format, PartitionSpec spec, StructType sparkType) throws IOException { + List rows, + int startIndex, + int endIndex, + Broadcast io, + String location, + int format, + PartitionSpec spec, + StructType sparkType) + throws IOException { String manifestName = "optimized-m-" + UUID.randomUUID(); Path manifestPath = new Path(location, manifestName); - OutputFile outputFile = io.value().newOutputFile(FileFormat.AVRO.addExtension(manifestPath.toString())); + OutputFile outputFile = + io.value().newOutputFile(FileFormat.AVRO.addExtension(manifestPath.toString())); Types.StructType dataFileType = DataFile.getType(spec.partitionType()); SparkDataFile wrapper = new SparkDataFile(dataFileType, sparkType); @@ -343,8 +366,12 @@ private static ManifestFile writeManifest( } private static MapPartitionsFunction toManifests( - Broadcast io, long maxNumManifestEntries, String location, - int format, PartitionSpec spec, StructType sparkType) { + Broadcast io, + long maxNumManifestEntries, + String location, + int format, + PartitionSpec spec, + StructType sparkType) { return rows -> { List rowsAsList = Lists.newArrayList(rows); @@ -355,11 +382,15 @@ private static MapPartitionsFunction toManifests( List manifests = Lists.newArrayList(); if (rowsAsList.size() <= maxNumManifestEntries) { - manifests.add(writeManifest(rowsAsList, 0, rowsAsList.size(), io, location, format, spec, sparkType)); + manifests.add( + writeManifest(rowsAsList, 0, rowsAsList.size(), io, location, format, spec, sparkType)); } else { int midIndex = rowsAsList.size() / 2; - manifests.add(writeManifest(rowsAsList, 0, midIndex, io, location, format, spec, sparkType)); - manifests.add(writeManifest(rowsAsList, midIndex, rowsAsList.size(), io, location, format, spec, sparkType)); + manifests.add( + writeManifest(rowsAsList, 0, midIndex, io, location, format, spec, sparkType)); + manifests.add( + writeManifest( + rowsAsList, midIndex, rowsAsList.size(), io, location, format, spec, sparkType)); } return manifests.iterator(); diff --git a/spark/v3.0/spark/src/main/java/org/apache/iceberg/spark/actions/BaseSnapshotTableSparkAction.java b/spark/v3.0/spark/src/main/java/org/apache/iceberg/spark/actions/BaseSnapshotTableSparkAction.java index 1ccb448f1dcc..a170ca23a5ab 100644 --- a/spark/v3.0/spark/src/main/java/org/apache/iceberg/spark/actions/BaseSnapshotTableSparkAction.java +++ b/spark/v3.0/spark/src/main/java/org/apache/iceberg/spark/actions/BaseSnapshotTableSparkAction.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.actions; import java.util.Map; @@ -44,9 +43,8 @@ import scala.collection.JavaConverters; /** - * Creates a new Iceberg table based on a source Spark table. 
The new Iceberg table will - * have a different data and metadata directory allowing it to exist independently of the - * source table. + * Creates a new Iceberg table based on a source Spark table. The new Iceberg table will have a + * different data and metadata directory allowing it to exist independently of the source table. */ public class BaseSnapshotTableSparkAction extends BaseTableCreationSparkAction @@ -58,13 +56,18 @@ public class BaseSnapshotTableSparkAction private Identifier destTableIdent; private String destTableLocation = null; - BaseSnapshotTableSparkAction(SparkSession spark, CatalogPlugin sourceCatalog, Identifier sourceTableIdent) { + BaseSnapshotTableSparkAction( + SparkSession spark, CatalogPlugin sourceCatalog, Identifier sourceTableIdent) { super(spark, sourceCatalog, sourceTableIdent); } // used by the old constructor - public BaseSnapshotTableSparkAction(SparkSession spark, CatalogPlugin sourceCatalog, Identifier sourceTableIdent, - CatalogPlugin destCatalog, Identifier destTableIdent) { + public BaseSnapshotTableSparkAction( + SparkSession spark, + CatalogPlugin sourceCatalog, + Identifier sourceTableIdent, + CatalogPlugin destCatalog, + Identifier destTableIdent) { super(spark, sourceCatalog, sourceTableIdent); this.destCatalog = checkDestinationCatalog(destCatalog); this.destTableIdent = destTableIdent; @@ -89,7 +92,8 @@ protected Identifier destTableIdent() { public SnapshotTable as(String ident) { String ctx = "snapshot destination"; CatalogPlugin defaultCatalog = spark().sessionState().catalogManager().currentCatalog(); - CatalogAndIdentifier catalogAndIdent = Spark3Util.catalogAndIdentifier(ctx, spark(), ident, defaultCatalog); + CatalogAndIdentifier catalogAndIdent = + Spark3Util.catalogAndIdentifier(ctx, spark(), ident, defaultCatalog); this.destCatalog = checkDestinationCatalog(catalogAndIdent.catalog()); this.destTableIdent = catalogAndIdent.identifier(); return this; @@ -115,11 +119,13 @@ public SnapshotTable.Result execute() { } private SnapshotTable.Result doExecute() { - Preconditions.checkArgument(destCatalog() != null && destTableIdent() != null, - "The destination catalog and identifier cannot be null. " + - "Make sure to configure the action with a valid destination table identifier via the `as` method."); + Preconditions.checkArgument( + destCatalog() != null && destTableIdent() != null, + "The destination catalog and identifier cannot be null. 
" + + "Make sure to configure the action with a valid destination table identifier via the `as` method."); - LOG.info("Staging a new Iceberg table {} as a snapshot of {}", destTableIdent(), sourceTableIdent()); + LOG.info( + "Staging a new Iceberg table {} as a snapshot of {}", destTableIdent(), sourceTableIdent()); StagedSparkTable stagedTable = stageDestTable(); Table icebergTable = stagedTable.table(); @@ -151,8 +157,12 @@ private SnapshotTable.Result doExecute() { } Snapshot snapshot = icebergTable.currentSnapshot(); - long importedDataFilesCount = Long.parseLong(snapshot.summary().get(SnapshotSummary.TOTAL_DATA_FILES_PROP)); - LOG.info("Successfully loaded Iceberg metadata for {} files to {}", importedDataFilesCount, destTableIdent()); + long importedDataFilesCount = + Long.parseLong(snapshot.summary().get(SnapshotSummary.TOTAL_DATA_FILES_PROP)); + LOG.info( + "Successfully loaded Iceberg metadata for {} files to {}", + importedDataFilesCount, + destTableIdent()); return new BaseSnapshotTableActionResult(importedDataFilesCount); } @@ -190,22 +200,27 @@ protected Map destTableProps() { @Override protected TableCatalog checkSourceCatalog(CatalogPlugin catalog) { // currently the import code relies on being able to look up the table in the session catalog - Preconditions.checkArgument(catalog.name().equalsIgnoreCase("spark_catalog"), - "Cannot snapshot a table that isn't in the session catalog (i.e. spark_catalog). " + - "Found source catalog: %s.", catalog.name()); - - Preconditions.checkArgument(catalog instanceof TableCatalog, + Preconditions.checkArgument( + catalog.name().equalsIgnoreCase("spark_catalog"), + "Cannot snapshot a table that isn't in the session catalog (i.e. spark_catalog). " + + "Found source catalog: %s.", + catalog.name()); + + Preconditions.checkArgument( + catalog instanceof TableCatalog, "Cannot snapshot as catalog %s of class %s in not a table catalog", - catalog.name(), catalog.getClass().getName()); + catalog.name(), + catalog.getClass().getName()); return (TableCatalog) catalog; } @Override public SnapshotTable tableLocation(String location) { - Preconditions.checkArgument(!sourceTableLocation().equals(location), - "The snapshot table location cannot be same as the source table location. " + - "This would mix snapshot table files with original table files."); + Preconditions.checkArgument( + !sourceTableLocation().equals(location), + "The snapshot table location cannot be same as the source table location. " + + "This would mix snapshot table files with original table files."); this.destTableLocation = location; return this; } diff --git a/spark/v3.0/spark/src/main/java/org/apache/iceberg/spark/actions/BaseSnapshotUpdateSparkAction.java b/spark/v3.0/spark/src/main/java/org/apache/iceberg/spark/actions/BaseSnapshotUpdateSparkAction.java index 53fa06bbb5dc..f68fb4e97e78 100644 --- a/spark/v3.0/spark/src/main/java/org/apache/iceberg/spark/actions/BaseSnapshotUpdateSparkAction.java +++ b/spark/v3.0/spark/src/main/java/org/apache/iceberg/spark/actions/BaseSnapshotUpdateSparkAction.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.spark.actions; import java.util.Map; @@ -24,8 +23,8 @@ import org.apache.iceberg.relocated.com.google.common.collect.Maps; import org.apache.spark.sql.SparkSession; -abstract class BaseSnapshotUpdateSparkAction - extends BaseSparkAction implements SnapshotUpdate { +abstract class BaseSnapshotUpdateSparkAction extends BaseSparkAction + implements SnapshotUpdate { private final Map summary = Maps.newHashMap(); diff --git a/spark/v3.0/spark/src/main/java/org/apache/iceberg/spark/actions/BaseSparkAction.java b/spark/v3.0/spark/src/main/java/org/apache/iceberg/spark/actions/BaseSparkAction.java index 42c54679b669..c9d93ce9de5f 100644 --- a/spark/v3.0/spark/src/main/java/org/apache/iceberg/spark/actions/BaseSparkAction.java +++ b/spark/v3.0/spark/src/main/java/org/apache/iceberg/spark/actions/BaseSparkAction.java @@ -16,9 +16,10 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.actions; +import static org.apache.iceberg.MetadataTableType.ALL_MANIFESTS; + import java.util.Iterator; import java.util.List; import java.util.Map; @@ -49,8 +50,6 @@ import org.apache.spark.sql.Row; import org.apache.spark.sql.SparkSession; -import static org.apache.iceberg.MetadataTableType.ALL_MANIFESTS; - abstract class BaseSparkAction implements Action { private static final AtomicInteger JOB_COUNTER = new AtomicInteger(); @@ -115,11 +114,20 @@ protected Dataset buildValidDataFileDF(Table table) { JavaSparkContext context = JavaSparkContext.fromSparkContext(spark.sparkContext()); Broadcast ioBroadcast = context.broadcast(SparkUtil.serializableFileIO(table)); - Dataset allManifests = loadMetadataTable(table, ALL_MANIFESTS) - .selectExpr("path", "length", "partition_spec_id as partitionSpecId", "added_snapshot_id as addedSnapshotId") - .dropDuplicates("path") - .repartition(spark.sessionState().conf().numShufflePartitions()) // avoid adaptive execution combining tasks - .as(Encoders.bean(ManifestFileBean.class)); + Dataset allManifests = + loadMetadataTable(table, ALL_MANIFESTS) + .selectExpr( + "path", + "length", + "partition_spec_id as partitionSpecId", + "added_snapshot_id as addedSnapshotId") + .dropDuplicates("path") + .repartition( + spark + .sessionState() + .conf() + .numShufflePartitions()) // avoid adaptive execution combining tasks + .as(Encoders.bean(ManifestFileBean.class)); return allManifests.flatMap(new ReadManifest(ioBroadcast), Encoders.STRING()).toDF("file_path"); } diff --git a/spark/v3.0/spark/src/main/java/org/apache/iceberg/spark/actions/BaseSparkActions.java b/spark/v3.0/spark/src/main/java/org/apache/iceberg/spark/actions/BaseSparkActions.java index 58b57177cf73..bec51944f222 100644 --- a/spark/v3.0/spark/src/main/java/org/apache/iceberg/spark/actions/BaseSparkActions.java +++ b/spark/v3.0/spark/src/main/java/org/apache/iceberg/spark/actions/BaseSparkActions.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.spark.actions; import org.apache.iceberg.Table; diff --git a/spark/v3.0/spark/src/main/java/org/apache/iceberg/spark/actions/BaseTableCreationSparkAction.java b/spark/v3.0/spark/src/main/java/org/apache/iceberg/spark/actions/BaseTableCreationSparkAction.java index 6eadece65cb6..c6645fcfc8ed 100644 --- a/spark/v3.0/spark/src/main/java/org/apache/iceberg/spark/actions/BaseTableCreationSparkAction.java +++ b/spark/v3.0/spark/src/main/java/org/apache/iceberg/spark/actions/BaseTableCreationSparkAction.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.actions; import java.util.List; @@ -50,7 +49,8 @@ import org.apache.spark.sql.types.StructType; abstract class BaseTableCreationSparkAction extends BaseSparkAction { - private static final Set ALLOWED_SOURCES = ImmutableSet.of("parquet", "avro", "orc", "hive"); + private static final Set ALLOWED_SOURCES = + ImmutableSet.of("parquet", "avro", "orc", "hive"); protected static final String LOCATION = "location"; protected static final String ICEBERG_METADATA_FOLDER = "metadata"; protected static final List EXCLUDED_PROPERTIES = @@ -66,7 +66,8 @@ abstract class BaseTableCreationSparkAction extends BaseSparkAction

    additionalProperties = Maps.newHashMap(); - BaseTableCreationSparkAction(SparkSession spark, CatalogPlugin sourceCatalog, Identifier sourceTableIdent) { + BaseTableCreationSparkAction( + SparkSession spark, CatalogPlugin sourceCatalog, Identifier sourceTableIdent) { super(spark); this.sourceCatalog = checkSourceCatalog(sourceCatalog); @@ -78,12 +79,13 @@ abstract class BaseTableCreationSparkAction extends BaseSparkAction additionalProperties() { private void validateSourceTable() { String sourceTableProvider = sourceCatalogTable.provider().get().toLowerCase(Locale.ROOT); - Preconditions.checkArgument(ALLOWED_SOURCES.contains(sourceTableProvider), - "Cannot create an Iceberg table from source provider: '%s'", sourceTableProvider); - Preconditions.checkArgument(!sourceCatalogTable.storage().locationUri().isEmpty(), + Preconditions.checkArgument( + ALLOWED_SOURCES.contains(sourceTableProvider), + "Cannot create an Iceberg table from source provider: '%s'", + sourceTableProvider); + Preconditions.checkArgument( + !sourceCatalogTable.storage().locationUri().isEmpty(), "Cannot create an Iceberg table from a source without an explicit location"); } protected StagingTableCatalog checkDestinationCatalog(CatalogPlugin catalog) { - Preconditions.checkArgument(catalog instanceof SparkSessionCatalog || catalog instanceof SparkCatalog, - "Cannot create Iceberg table in non-Iceberg Catalog. " + - "Catalog '%s' was of class '%s' but '%s' or '%s' are required", - catalog.name(), catalog.getClass().getName(), SparkSessionCatalog.class.getName(), + Preconditions.checkArgument( + catalog instanceof SparkSessionCatalog || catalog instanceof SparkCatalog, + "Cannot create Iceberg table in non-Iceberg Catalog. " + + "Catalog '%s' was of class '%s' but '%s' or '%s' are required", + catalog.name(), + catalog.getClass().getName(), + SparkSessionCatalog.class.getName(), SparkCatalog.class.getName()); return (StagingTableCatalog) catalog; @@ -145,11 +153,14 @@ protected StagedSparkTable stageDestTable() { Map props = destTableProps(); StructType schema = sourceTable.schema(); Transform[] partitioning = sourceTable.partitioning(); - return (StagedSparkTable) destCatalog().stageCreate(destTableIdent(), schema, partitioning, props); + return (StagedSparkTable) + destCatalog().stageCreate(destTableIdent(), schema, partitioning, props); } catch (org.apache.spark.sql.catalyst.analysis.NoSuchNamespaceException e) { - throw new NoSuchNamespaceException("Cannot create table %s as the namespace does not exist", destTableIdent()); + throw new NoSuchNamespaceException( + "Cannot create table %s as the namespace does not exist", destTableIdent()); } catch (org.apache.spark.sql.catalyst.analysis.TableAlreadyExistsException e) { - throw new AlreadyExistsException("Cannot create table %s as it already exists", destTableIdent()); + throw new AlreadyExistsException( + "Cannot create table %s as it already exists", destTableIdent()); } } @@ -162,7 +173,10 @@ protected void ensureNameMappingPresent(Table table) { } protected String getMetadataLocation(Table table) { - return table.properties().getOrDefault(TableProperties.WRITE_METADATA_LOCATION, - table.location() + "/" + ICEBERG_METADATA_FOLDER); + return table + .properties() + .getOrDefault( + TableProperties.WRITE_METADATA_LOCATION, + table.location() + "/" + ICEBERG_METADATA_FOLDER); } } diff --git a/spark/v3.0/spark/src/main/java/org/apache/iceberg/spark/actions/ManifestFileBean.java b/spark/v3.0/spark/src/main/java/org/apache/iceberg/spark/actions/ManifestFileBean.java index 
0837fb7d39e4..3660b870c63f 100644 --- a/spark/v3.0/spark/src/main/java/org/apache/iceberg/spark/actions/ManifestFileBean.java +++ b/spark/v3.0/spark/src/main/java/org/apache/iceberg/spark/actions/ManifestFileBean.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.actions; import java.nio.ByteBuffer; diff --git a/spark/v3.0/spark/src/main/java/org/apache/iceberg/spark/actions/Spark3BinPackStrategy.java b/spark/v3.0/spark/src/main/java/org/apache/iceberg/spark/actions/Spark3BinPackStrategy.java index b2995ced3c1b..130dbe74388f 100644 --- a/spark/v3.0/spark/src/main/java/org/apache/iceberg/spark/actions/Spark3BinPackStrategy.java +++ b/spark/v3.0/spark/src/main/java/org/apache/iceberg/spark/actions/Spark3BinPackStrategy.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.actions; import java.util.List; @@ -61,14 +60,18 @@ public Set rewriteFiles(List filesToRewrite) { SparkSession cloneSession = spark.cloneSession(); cloneSession.conf().set(SQLConf.ADAPTIVE_EXECUTION_ENABLED().key(), false); - Dataset scanDF = cloneSession.read().format("iceberg") - .option(SparkReadOptions.FILE_SCAN_TASK_SET_ID, groupID) - .option(SparkReadOptions.SPLIT_SIZE, splitSize(inputFileSize(filesToRewrite))) - .option(SparkReadOptions.FILE_OPEN_COST, "0") - .load(table.name()); + Dataset scanDF = + cloneSession + .read() + .format("iceberg") + .option(SparkReadOptions.FILE_SCAN_TASK_SET_ID, groupID) + .option(SparkReadOptions.SPLIT_SIZE, splitSize(inputFileSize(filesToRewrite))) + .option(SparkReadOptions.FILE_OPEN_COST, "0") + .load(table.name()); // write the packed data into new files where each split becomes a new file - scanDF.write() + scanDF + .write() .format("iceberg") .option(SparkWriteOptions.REWRITTEN_FILE_SCAN_TASK_SET_ID, groupID) .option(SparkWriteOptions.TARGET_FILE_SIZE_BYTES, writeMaxFileSize()) diff --git a/spark/v3.0/spark/src/main/java/org/apache/iceberg/spark/actions/Spark3SortStrategy.java b/spark/v3.0/spark/src/main/java/org/apache/iceberg/spark/actions/Spark3SortStrategy.java index 2931796715c1..cfa1d66729c6 100644 --- a/spark/v3.0/spark/src/main/java/org/apache/iceberg/spark/actions/Spark3SortStrategy.java +++ b/spark/v3.0/spark/src/main/java/org/apache/iceberg/spark/actions/Spark3SortStrategy.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.actions; import java.util.List; @@ -51,14 +50,13 @@ public class Spark3SortStrategy extends SortStrategy { /** - * The number of shuffle partitions and consequently the number of output files - * created by the Spark Sort is based on the size of the input data files used - * in this rewrite operation. Due to compression, the disk file sizes may not - * accurately represent the size of files in the output. This parameter lets - * the user adjust the file size used for estimating actual output data size. A - * factor greater than 1.0 would generate more files than we would expect based - * on the on-disk file size. A value less than 1.0 would create fewer files than - * we would expect due to the on-disk size. + * The number of shuffle partitions and consequently the number of output files created by the + * Spark Sort is based on the size of the input data files used in this rewrite operation. 
Due to + * compression, the disk file sizes may not accurately represent the size of files in the output. + * This parameter lets the user adjust the file size used for estimating actual output data size. + * A factor greater than 1.0 would generate more files than we would expect based on the on-disk + * file size. A value less than 1.0 would create fewer files than we would expect due to the + * on-disk size. */ public static final String COMPRESSION_FACTOR = "compression-factor"; @@ -89,12 +87,12 @@ public Set validOptions() { @Override public RewriteStrategy options(Map options) { - sizeEstimateMultiple = PropertyUtil.propertyAsDouble(options, - COMPRESSION_FACTOR, - 1.0); + sizeEstimateMultiple = PropertyUtil.propertyAsDouble(options, COMPRESSION_FACTOR, 1.0); - Preconditions.checkArgument(sizeEstimateMultiple > 0, - "Invalid compression factor: %s (not positive)", sizeEstimateMultiple); + Preconditions.checkArgument( + sizeEstimateMultiple > 0, + "Invalid compression factor: %s (not positive)", + sizeEstimateMultiple); return super.options(options); } @@ -107,7 +105,9 @@ public Set rewriteFiles(List filesToRewrite) { SortOrder[] ordering; if (requiresRepartition) { // Build in the requirement for Partition Sorting into our sort order - ordering = Spark3Util.convert(SortOrderUtil.buildSortOrder(table.schema(), table.spec(), sortOrder())); + ordering = + Spark3Util.convert( + SortOrderUtil.buildSortOrder(table.schema(), table.spec(), sortOrder())); } else { ordering = Spark3Util.convert(sortOrder()); } @@ -122,23 +122,29 @@ public Set rewriteFiles(List filesToRewrite) { cloneSession.conf().set(SQLConf.ADAPTIVE_EXECUTION_ENABLED().key(), false); // Reset Shuffle Partitions for our sort - long numOutputFiles = numOutputFiles((long) (inputFileSize(filesToRewrite) * sizeEstimateMultiple)); + long numOutputFiles = + numOutputFiles((long) (inputFileSize(filesToRewrite) * sizeEstimateMultiple)); cloneSession.conf().set(SQLConf.SHUFFLE_PARTITIONS().key(), Math.max(1, numOutputFiles)); - Dataset scanDF = cloneSession.read().format("iceberg") - .option(SparkReadOptions.FILE_SCAN_TASK_SET_ID, groupID) - .load(table.name()); + Dataset scanDF = + cloneSession + .read() + .format("iceberg") + .option(SparkReadOptions.FILE_SCAN_TASK_SET_ID, groupID) + .load(table.name()); // write the packed data into new files where each split becomes a new file SQLConf sqlConf = cloneSession.sessionState().conf(); LogicalPlan sortPlan = sortPlan(distribution, ordering, scanDF.logicalPlan(), sqlConf); Dataset sortedDf = new Dataset<>(cloneSession, sortPlan, scanDF.encoder()); - sortedDf.write() + sortedDf + .write() .format("iceberg") .option(SparkWriteOptions.REWRITTEN_FILE_SCAN_TASK_SET_ID, groupID) .option(RewriteDataFiles.TARGET_FILE_SIZE_BYTES, writeMaxFileSize()) - .mode("append") // This will only write files without modifying the table, see SparkWrite.RewriteFiles + .mode("append") // This will only write files without modifying the table, see + // SparkWrite.RewriteFiles .save(table.name()); return rewriteCoordinator.fetchNewDataFiles(table, groupID); @@ -152,7 +158,8 @@ protected SparkSession spark() { return this.spark; } - protected LogicalPlan sortPlan(Distribution distribution, SortOrder[] ordering, LogicalPlan plan, SQLConf conf) { + protected LogicalPlan sortPlan( + Distribution distribution, SortOrder[] ordering, LogicalPlan plan, SQLConf conf) { return DistributionAndOrderingUtils$.MODULE$.prepareQuery(distribution, ordering, plan, conf); } } diff --git 
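Aside (not part of the diff): the options validated in BaseRewriteDataFilesSparkAction and the compression factor described for Spark3SortStrategy above are passed to the action as string options. A minimal sketch follows, assuming a loaded Table; the option constants are the RewriteDataFiles names referenced in this diff, and the chosen values are illustrative, not recommendations.

```java
import org.apache.iceberg.Table;
import org.apache.iceberg.actions.RewriteDataFiles;
import org.apache.iceberg.spark.actions.SparkActions;

public class RewriteDataFilesExample {

  // Compact `table` with the sort strategy (using the table's sort order) and commit
  // rewritten file groups incrementally so one failed group does not abort the whole
  // rewrite. Option names are the RewriteDataFiles constants used above; the values
  // here are illustrative.
  public static RewriteDataFiles.Result compact(Table table) {
    return SparkActions.get()
        .rewriteDataFiles(table)
        .sort()
        .option(RewriteDataFiles.PARTIAL_PROGRESS_ENABLED, "true")
        .option(RewriteDataFiles.PARTIAL_PROGRESS_MAX_COMMITS, "10")
        .option(RewriteDataFiles.MAX_CONCURRENT_FILE_GROUP_REWRITES, "4")
        .option("compression-factor", "1.2") // Spark3SortStrategy.COMPRESSION_FACTOR
        .execute();
  }
}
```

With partial progress enabled, each commit covers only a subset of file groups, so a group that conflicts with another Iceberg operation is skipped rather than failing the entire rewrite, at the cost of additional snapshots in the table history.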
a/spark/v3.0/spark/src/main/java/org/apache/iceberg/spark/actions/SparkActions.java b/spark/v3.0/spark/src/main/java/org/apache/iceberg/spark/actions/SparkActions.java index 3230728261c9..91ec18a79fe6 100644 --- a/spark/v3.0/spark/src/main/java/org/apache/iceberg/spark/actions/SparkActions.java +++ b/spark/v3.0/spark/src/main/java/org/apache/iceberg/spark/actions/SparkActions.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.actions; import org.apache.iceberg.Table; @@ -31,9 +30,9 @@ /** * An implementation of {@link ActionsProvider} for Spark. - *
    - * This class is the primary API for interacting with actions in Spark that users should use - * to instantiate particular actions. + * + *
    This class is the primary API for interacting with actions in Spark that users should use to + * instantiate particular actions. */ public class SparkActions extends BaseSparkActions { @@ -53,16 +52,20 @@ public static SparkActions get() { public SnapshotTable snapshotTable(String tableIdent) { String ctx = "snapshot source"; CatalogPlugin defaultCatalog = spark().sessionState().catalogManager().currentCatalog(); - CatalogAndIdentifier catalogAndIdent = Spark3Util.catalogAndIdentifier(ctx, spark(), tableIdent, defaultCatalog); - return new BaseSnapshotTableSparkAction(spark(), catalogAndIdent.catalog(), catalogAndIdent.identifier()); + CatalogAndIdentifier catalogAndIdent = + Spark3Util.catalogAndIdentifier(ctx, spark(), tableIdent, defaultCatalog); + return new BaseSnapshotTableSparkAction( + spark(), catalogAndIdent.catalog(), catalogAndIdent.identifier()); } @Override public MigrateTable migrateTable(String tableIdent) { String ctx = "migrate target"; CatalogPlugin defaultCatalog = spark().sessionState().catalogManager().currentCatalog(); - CatalogAndIdentifier catalogAndIdent = Spark3Util.catalogAndIdentifier(ctx, spark(), tableIdent, defaultCatalog); - return new BaseMigrateTableSparkAction(spark(), catalogAndIdent.catalog(), catalogAndIdent.identifier()); + CatalogAndIdentifier catalogAndIdent = + Spark3Util.catalogAndIdentifier(ctx, spark(), tableIdent, defaultCatalog); + return new BaseMigrateTableSparkAction( + spark(), catalogAndIdent.catalog(), catalogAndIdent.identifier()); } @Override diff --git a/spark/v3.0/spark/src/main/java/org/apache/iceberg/spark/data/AvroWithSparkSchemaVisitor.java b/spark/v3.0/spark/src/main/java/org/apache/iceberg/spark/data/AvroWithSparkSchemaVisitor.java index 40ed05b4ce65..74454fc1e466 100644 --- a/spark/v3.0/spark/src/main/java/org/apache/iceberg/spark/data/AvroWithSparkSchemaVisitor.java +++ b/spark/v3.0/spark/src/main/java/org/apache/iceberg/spark/data/AvroWithSparkSchemaVisitor.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
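As a hedged usage sketch for the SparkActions entry point described above (it assumes a running SparkSession with an Iceberg catalog configured; the `demo.db` identifiers are illustrative, not taken from this change):

import org.apache.iceberg.actions.SnapshotTable;
import org.apache.iceberg.spark.actions.SparkActions;

public class SnapshotTableUsageSketch {
  public static void main(String[] args) {
    // Requires an active SparkSession whose catalogs include the (illustrative) "demo" catalog.
    SnapshotTable.Result result =
        SparkActions.get()
            .snapshotTable("demo.db.source_table")     // source identifier (illustrative)
            .as("demo.db.source_table_snapshot")       // new Iceberg table to create (illustrative)
            .execute();
    System.out.println("Snapshot created: " + result);
  }
}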
*/ - package org.apache.iceberg.spark.data; import org.apache.iceberg.avro.AvroWithPartnerByStructureVisitor; @@ -30,7 +29,8 @@ import org.apache.spark.sql.types.StructField; import org.apache.spark.sql.types.StructType; -public abstract class AvroWithSparkSchemaVisitor extends AvroWithPartnerByStructureVisitor { +public abstract class AvroWithSparkSchemaVisitor + extends AvroWithPartnerByStructureVisitor { @Override protected boolean isStringType(DataType dataType) { @@ -44,7 +44,8 @@ protected boolean isMapType(DataType dataType) { @Override protected DataType arrayElementType(DataType arrayType) { - Preconditions.checkArgument(arrayType instanceof ArrayType, "Invalid array: %s is not an array", arrayType); + Preconditions.checkArgument( + arrayType instanceof ArrayType, "Invalid array: %s is not an array", arrayType); return ((ArrayType) arrayType).elementType(); } @@ -62,7 +63,8 @@ protected DataType mapValueType(DataType mapType) { @Override protected Pair fieldNameAndType(DataType structType, int pos) { - Preconditions.checkArgument(structType instanceof StructType, "Invalid struct: %s is not a struct", structType); + Preconditions.checkArgument( + structType instanceof StructType, "Invalid struct: %s is not a struct", structType); StructField field = ((StructType) structType).apply(pos); return Pair.of(field.name(), field.dataType()); } diff --git a/spark/v3.0/spark/src/main/java/org/apache/iceberg/spark/data/ParquetWithSparkSchemaVisitor.java b/spark/v3.0/spark/src/main/java/org/apache/iceberg/spark/data/ParquetWithSparkSchemaVisitor.java index 924cc3e2325a..d74a76f94e87 100644 --- a/spark/v3.0/spark/src/main/java/org/apache/iceberg/spark/data/ParquetWithSparkSchemaVisitor.java +++ b/spark/v3.0/spark/src/main/java/org/apache/iceberg/spark/data/ParquetWithSparkSchemaVisitor.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.spark.data; import java.util.Deque; @@ -48,9 +47,11 @@ public class ParquetWithSparkSchemaVisitor { public static T visit(DataType sType, Type type, ParquetWithSparkSchemaVisitor visitor) { Preconditions.checkArgument(sType != null, "Invalid DataType: null"); if (type instanceof MessageType) { - Preconditions.checkArgument(sType instanceof StructType, "Invalid struct: %s is not a struct", sType); + Preconditions.checkArgument( + sType instanceof StructType, "Invalid struct: %s is not a struct", sType); StructType struct = (StructType) sType; - return visitor.message(struct, (MessageType) type, visitFields(struct, type.asGroupType(), visitor)); + return visitor.message( + struct, (MessageType) type, visitFields(struct, type.asGroupType(), visitor)); } else if (type.isPrimitive()) { return visitor.primitive(sType, type.asPrimitiveType()); @@ -62,21 +63,30 @@ public static T visit(DataType sType, Type type, ParquetWithSparkSchemaVisit if (annotation != null) { switch (annotation) { case LIST: - Preconditions.checkArgument(!group.isRepetition(Repetition.REPEATED), - "Invalid list: top-level group is repeated: %s", group); - Preconditions.checkArgument(group.getFieldCount() == 1, - "Invalid list: does not contain single repeated field: %s", group); + Preconditions.checkArgument( + !group.isRepetition(Repetition.REPEATED), + "Invalid list: top-level group is repeated: %s", + group); + Preconditions.checkArgument( + group.getFieldCount() == 1, + "Invalid list: does not contain single repeated field: %s", + group); GroupType repeatedElement = group.getFields().get(0).asGroupType(); - Preconditions.checkArgument(repeatedElement.isRepetition(Repetition.REPEATED), + Preconditions.checkArgument( + repeatedElement.isRepetition(Repetition.REPEATED), "Invalid list: inner group is not repeated"); - Preconditions.checkArgument(repeatedElement.getFieldCount() <= 1, - "Invalid list: repeated group is not a single field: %s", group); + Preconditions.checkArgument( + repeatedElement.getFieldCount() <= 1, + "Invalid list: repeated group is not a single field: %s", + group); - Preconditions.checkArgument(sType instanceof ArrayType, "Invalid list: %s is not an array", sType); + Preconditions.checkArgument( + sType instanceof ArrayType, "Invalid list: %s is not an array", sType); ArrayType array = (ArrayType) sType; - StructField element = new StructField( - "element", array.elementType(), array.containsNull(), Metadata.empty()); + StructField element = + new StructField( + "element", array.elementType(), array.containsNull(), Metadata.empty()); visitor.fieldNames.push(repeatedElement.getName()); try { @@ -92,22 +102,30 @@ public static T visit(DataType sType, Type type, ParquetWithSparkSchemaVisit } case MAP: - Preconditions.checkArgument(!group.isRepetition(Repetition.REPEATED), - "Invalid map: top-level group is repeated: %s", group); - Preconditions.checkArgument(group.getFieldCount() == 1, - "Invalid map: does not contain single repeated field: %s", group); + Preconditions.checkArgument( + !group.isRepetition(Repetition.REPEATED), + "Invalid map: top-level group is repeated: %s", + group); + Preconditions.checkArgument( + group.getFieldCount() == 1, + "Invalid map: does not contain single repeated field: %s", + group); GroupType repeatedKeyValue = group.getType(0).asGroupType(); - Preconditions.checkArgument(repeatedKeyValue.isRepetition(Repetition.REPEATED), + Preconditions.checkArgument( + repeatedKeyValue.isRepetition(Repetition.REPEATED), "Invalid map: inner group is not 
repeated"); - Preconditions.checkArgument(repeatedKeyValue.getFieldCount() <= 2, + Preconditions.checkArgument( + repeatedKeyValue.getFieldCount() <= 2, "Invalid map: repeated group does not have 2 fields"); - Preconditions.checkArgument(sType instanceof MapType, "Invalid map: %s is not a map", sType); + Preconditions.checkArgument( + sType instanceof MapType, "Invalid map: %s is not a map", sType); MapType map = (MapType) sType; StructField keyField = new StructField("key", map.keyType(), false, Metadata.empty()); - StructField valueField = new StructField( - "value", map.valueType(), map.valueContainsNull(), Metadata.empty()); + StructField valueField = + new StructField( + "value", map.valueType(), map.valueContainsNull(), Metadata.empty()); visitor.fieldNames.push(repeatedKeyValue.getName()); try { @@ -144,13 +162,15 @@ public static T visit(DataType sType, Type type, ParquetWithSparkSchemaVisit } } - Preconditions.checkArgument(sType instanceof StructType, "Invalid struct: %s is not a struct", sType); + Preconditions.checkArgument( + sType instanceof StructType, "Invalid struct: %s is not a struct", sType); StructType struct = (StructType) sType; return visitor.struct(struct, group, visitFields(struct, group, visitor)); } } - private static T visitField(StructField sField, Type field, ParquetWithSparkSchemaVisitor visitor) { + private static T visitField( + StructField sField, Type field, ParquetWithSparkSchemaVisitor visitor) { visitor.fieldNames.push(field.getName()); try { return visit(sField.dataType(), field, visitor); @@ -159,17 +179,20 @@ private static T visitField(StructField sField, Type field, ParquetWithSpark } } - private static List visitFields(StructType struct, GroupType group, - ParquetWithSparkSchemaVisitor visitor) { + private static List visitFields( + StructType struct, GroupType group, ParquetWithSparkSchemaVisitor visitor) { StructField[] sFields = struct.fields(); - Preconditions.checkArgument(sFields.length == group.getFieldCount(), - "Structs do not match: %s and %s", struct, group); + Preconditions.checkArgument( + sFields.length == group.getFieldCount(), "Structs do not match: %s and %s", struct, group); List results = Lists.newArrayListWithExpectedSize(group.getFieldCount()); for (int i = 0; i < sFields.length; i += 1) { Type field = group.getFields().get(i); StructField sField = sFields[i]; - Preconditions.checkArgument(field.getName().equals(AvroSchemaUtil.makeCompatibleName(sField.name())), - "Structs do not match: field %s != %s", field.getName(), sField.name()); + Preconditions.checkArgument( + field.getName().equals(AvroSchemaUtil.makeCompatibleName(sField.name())), + "Structs do not match: field %s != %s", + field.getName(), + sField.name()); results.add(visitField(sField, field, visitor)); } diff --git a/spark/v3.0/spark/src/main/java/org/apache/iceberg/spark/data/SparkAvroReader.java b/spark/v3.0/spark/src/main/java/org/apache/iceberg/spark/data/SparkAvroReader.java index c693e2e2c057..4622d2928ac4 100644 --- a/spark/v3.0/spark/src/main/java/org/apache/iceberg/spark/data/SparkAvroReader.java +++ b/spark/v3.0/spark/src/main/java/org/apache/iceberg/spark/data/SparkAvroReader.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.spark.data; import java.io.IOException; @@ -38,7 +37,6 @@ import org.apache.iceberg.types.Types; import org.apache.spark.sql.catalyst.InternalRow; - public class SparkAvroReader implements DatumReader, SupportsRowPosition { private final Schema readSchema; @@ -50,10 +48,12 @@ public SparkAvroReader(org.apache.iceberg.Schema expectedSchema, Schema readSche } @SuppressWarnings("unchecked") - public SparkAvroReader(org.apache.iceberg.Schema expectedSchema, Schema readSchema, Map constants) { + public SparkAvroReader( + org.apache.iceberg.Schema expectedSchema, Schema readSchema, Map constants) { this.readSchema = readSchema; - this.reader = (ValueReader) AvroSchemaWithTypeVisitor - .visit(expectedSchema, readSchema, new ReadBuilder(constants)); + this.reader = + (ValueReader) + AvroSchemaWithTypeVisitor.visit(expectedSchema, readSchema, new ReadBuilder(constants)); } @Override @@ -81,8 +81,8 @@ private ReadBuilder(Map idToConstant) { } @Override - public ValueReader record(Types.StructType expected, Schema record, List names, - List> fields) { + public ValueReader record( + Types.StructType expected, Schema record, List names, List> fields) { return SparkValueReaders.struct(fields, expected, idToConstant); } @@ -92,13 +92,14 @@ public ValueReader union(Type expected, Schema union, List> op } @Override - public ValueReader array(Types.ListType expected, Schema array, ValueReader elementReader) { + public ValueReader array( + Types.ListType expected, Schema array, ValueReader elementReader) { return SparkValueReaders.array(elementReader); } @Override - public ValueReader map(Types.MapType expected, Schema map, - ValueReader keyReader, ValueReader valueReader) { + public ValueReader map( + Types.MapType expected, Schema map, ValueReader keyReader, ValueReader valueReader) { return SparkValueReaders.arrayMap(keyReader, valueReader); } diff --git a/spark/v3.0/spark/src/main/java/org/apache/iceberg/spark/data/SparkAvroWriter.java b/spark/v3.0/spark/src/main/java/org/apache/iceberg/spark/data/SparkAvroWriter.java index 7582125128a7..15465568c231 100644 --- a/spark/v3.0/spark/src/main/java/org/apache/iceberg/spark/data/SparkAvroWriter.java +++ b/spark/v3.0/spark/src/main/java/org/apache/iceberg/spark/data/SparkAvroWriter.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.spark.data; import java.io.IOException; @@ -50,8 +49,9 @@ public SparkAvroWriter(StructType dsSchema) { @Override @SuppressWarnings("unchecked") public void setSchema(Schema schema) { - this.writer = (ValueWriter) AvroWithSparkSchemaVisitor - .visit(dsSchema, schema, new WriteBuilder()); + this.writer = + (ValueWriter) + AvroWithSparkSchemaVisitor.visit(dsSchema, schema, new WriteBuilder()); } @Override @@ -66,17 +66,23 @@ public Stream metrics() { private static class WriteBuilder extends AvroWithSparkSchemaVisitor> { @Override - public ValueWriter record(DataType struct, Schema record, List names, List> fields) { - return SparkValueWriters.struct(fields, IntStream.range(0, names.size()) - .mapToObj(i -> fieldNameAndType(struct, i).second()).collect(Collectors.toList())); + public ValueWriter record( + DataType struct, Schema record, List names, List> fields) { + return SparkValueWriters.struct( + fields, + IntStream.range(0, names.size()) + .mapToObj(i -> fieldNameAndType(struct, i).second()) + .collect(Collectors.toList())); } @Override public ValueWriter union(DataType type, Schema union, List> options) { - Preconditions.checkArgument(options.contains(ValueWriters.nulls()), - "Cannot create writer for non-option union: %s", union); - Preconditions.checkArgument(options.size() == 2, - "Cannot create writer for non-option union: %s", union); + Preconditions.checkArgument( + options.contains(ValueWriters.nulls()), + "Cannot create writer for non-option union: %s", + union); + Preconditions.checkArgument( + options.size() == 2, "Cannot create writer for non-option union: %s", union); if (union.getTypes().get(0).getType() == Schema.Type.NULL) { return ValueWriters.option(0, options.get(1)); } else { @@ -91,12 +97,15 @@ public ValueWriter array(DataType sArray, Schema array, ValueWriter elemen @Override public ValueWriter map(DataType sMap, Schema map, ValueWriter valueReader) { - return SparkValueWriters.map(SparkValueWriters.strings(), mapKeyType(sMap), valueReader, mapValueType(sMap)); + return SparkValueWriters.map( + SparkValueWriters.strings(), mapKeyType(sMap), valueReader, mapValueType(sMap)); } @Override - public ValueWriter map(DataType sMap, Schema map, ValueWriter keyWriter, ValueWriter valueWriter) { - return SparkValueWriters.arrayMap(keyWriter, mapKeyType(sMap), valueWriter, mapValueType(sMap)); + public ValueWriter map( + DataType sMap, Schema map, ValueWriter keyWriter, ValueWriter valueWriter) { + return SparkValueWriters.arrayMap( + keyWriter, mapKeyType(sMap), valueWriter, mapValueType(sMap)); } @Override diff --git a/spark/v3.0/spark/src/main/java/org/apache/iceberg/spark/data/SparkOrcReader.java b/spark/v3.0/spark/src/main/java/org/apache/iceberg/spark/data/SparkOrcReader.java index 4ed6420a9aa4..78db137054bc 100644 --- a/spark/v3.0/spark/src/main/java/org/apache/iceberg/spark/data/SparkOrcReader.java +++ b/spark/v3.0/spark/src/main/java/org/apache/iceberg/spark/data/SparkOrcReader.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.data; import java.util.List; @@ -34,10 +33,9 @@ import org.apache.spark.sql.catalyst.InternalRow; /** - * Converts the OrcIterator, which returns ORC's VectorizedRowBatch to a - * set of Spark's UnsafeRows. + * Converts the OrcIterator, which returns ORC's VectorizedRowBatch to a set of Spark's UnsafeRows. * - * It minimizes allocations by reusing most of the objects in the implementation. + *
    It minimizes allocations by reusing most of the objects in the implementation. */ public class SparkOrcReader implements OrcRowReader { private final OrcValueReader reader; @@ -48,8 +46,12 @@ public SparkOrcReader(org.apache.iceberg.Schema expectedSchema, TypeDescription @SuppressWarnings("unchecked") public SparkOrcReader( - org.apache.iceberg.Schema expectedSchema, TypeDescription readOrcSchema, Map idToConstant) { - this.reader = OrcSchemaWithTypeVisitor.visit(expectedSchema, readOrcSchema, new ReadBuilder(idToConstant)); + org.apache.iceberg.Schema expectedSchema, + TypeDescription readOrcSchema, + Map idToConstant) { + this.reader = + OrcSchemaWithTypeVisitor.visit( + expectedSchema, readOrcSchema, new ReadBuilder(idToConstant)); } @Override @@ -71,18 +73,25 @@ private ReadBuilder(Map idToConstant) { @Override public OrcValueReader record( - Types.StructType expected, TypeDescription record, List names, List> fields) { + Types.StructType expected, + TypeDescription record, + List names, + List> fields) { return SparkOrcValueReaders.struct(fields, expected, idToConstant); } @Override - public OrcValueReader list(Types.ListType iList, TypeDescription array, OrcValueReader elementReader) { + public OrcValueReader list( + Types.ListType iList, TypeDescription array, OrcValueReader elementReader) { return SparkOrcValueReaders.array(elementReader); } @Override public OrcValueReader map( - Types.MapType iMap, TypeDescription map, OrcValueReader keyReader, OrcValueReader valueReader) { + Types.MapType iMap, + TypeDescription map, + OrcValueReader keyReader, + OrcValueReader valueReader) { return SparkOrcValueReaders.map(keyReader, valueReader); } diff --git a/spark/v3.0/spark/src/main/java/org/apache/iceberg/spark/data/SparkOrcValueReaders.java b/spark/v3.0/spark/src/main/java/org/apache/iceberg/spark/data/SparkOrcValueReaders.java index f35ab7a17c63..9e9b3e53bbcc 100644 --- a/spark/v3.0/spark/src/main/java/org/apache/iceberg/spark/data/SparkOrcValueReaders.java +++ b/spark/v3.0/spark/src/main/java/org/apache/iceberg/spark/data/SparkOrcValueReaders.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.spark.data; import java.math.BigDecimal; @@ -44,8 +43,7 @@ import org.apache.spark.unsafe.types.UTF8String; public class SparkOrcValueReaders { - private SparkOrcValueReaders() { - } + private SparkOrcValueReaders() {} public static OrcValueReader utf8String() { return StringReader.INSTANCE; @@ -125,8 +123,7 @@ public MapData nonNullRead(ColumnVector vector, int row) { } return new ArrayBasedMapData( - new GenericArrayData(keys.toArray()), - new GenericArrayData(values.toArray())); + new GenericArrayData(keys.toArray()), new GenericArrayData(values.toArray())); } @Override @@ -139,7 +136,8 @@ public void setBatchContext(long batchOffsetInFile) { static class StructReader extends OrcValueReaders.StructReader { private final int numFields; - protected StructReader(List> readers, Types.StructType struct, Map idToConstant) { + protected StructReader( + List> readers, Types.StructType struct, Map idToConstant) { super(readers, struct, idToConstant); this.numFields = struct.fields().size(); } @@ -162,21 +160,20 @@ protected void set(InternalRow struct, int pos, Object value) { private static class StringReader implements OrcValueReader { private static final StringReader INSTANCE = new StringReader(); - private StringReader() { - } + private StringReader() {} @Override public UTF8String nonNullRead(ColumnVector vector, int row) { BytesColumnVector bytesVector = (BytesColumnVector) vector; - return UTF8String.fromBytes(bytesVector.vector[row], bytesVector.start[row], bytesVector.length[row]); + return UTF8String.fromBytes( + bytesVector.vector[row], bytesVector.start[row], bytesVector.length[row]); } } private static class TimestampTzReader implements OrcValueReader { private static final TimestampTzReader INSTANCE = new TimestampTzReader(); - private TimestampTzReader() { - } + private TimestampTzReader() {} @Override public Long nonNullRead(ColumnVector vector, int row) { @@ -198,12 +195,20 @@ private static class Decimal18Reader implements OrcValueReader { public Decimal nonNullRead(ColumnVector vector, int row) { HiveDecimalWritable value = ((DecimalColumnVector) vector).vector[row]; - // The scale of decimal read from hive ORC file may be not equals to the expected scale. For data type - // decimal(10,3) and the value 10.100, the hive ORC writer will remove its trailing zero and store it - // as 101*10^(-1), its scale will adjust from 3 to 1. So here we could not assert that value.scale() == scale. - // we also need to convert the hive orc decimal to a decimal with expected precision and scale. - Preconditions.checkArgument(value.precision() <= precision, - "Cannot read value as decimal(%s,%s), too large: %s", precision, scale, value); + // The scale of decimal read from hive ORC file may be not equals to the expected scale. For + // data type + // decimal(10,3) and the value 10.100, the hive ORC writer will remove its trailing zero and + // store it + // as 101*10^(-1), its scale will adjust from 3 to 1. So here we could not assert that + // value.scale() == scale. + // we also need to convert the hive orc decimal to a decimal with expected precision and + // scale. 
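To make the trailing-zero point in the comment above concrete, a small self-contained sketch with java.math.BigDecimal (deliberately not the Hive/ORC writable types): the value 10.100 stored with trimmed zeros as unscaled 101 at scale 1 rescales losslessly back to the declared decimal(10,3) representation.

import java.math.BigDecimal;
import java.math.BigInteger;
import java.math.RoundingMode;

public final class DecimalScaleSketch {
  private DecimalScaleSketch() {}

  public static void main(String[] args) {
    // ORC may store 10.100 with trailing zeros trimmed: unscaled 101, scale 1.
    BigDecimal stored = new BigDecimal(BigInteger.valueOf(101), 1);
    // Rescale to the declared scale of decimal(10,3); RoundingMode.UNNECESSARY proves no precision is lost.
    BigDecimal expected = stored.setScale(3, RoundingMode.UNNECESSARY);
    System.out.println(expected + " -> unscaled " + expected.unscaledValue() + ", scale " + expected.scale());
  }
}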
+ Preconditions.checkArgument( + value.precision() <= precision, + "Cannot read value as decimal(%s,%s), too large: %s", + precision, + scale, + value); return new Decimal().set(value.serialize64(scale), precision, scale); } @@ -220,11 +225,15 @@ private static class Decimal38Reader implements OrcValueReader { @Override public Decimal nonNullRead(ColumnVector vector, int row) { - BigDecimal value = ((DecimalColumnVector) vector).vector[row] - .getHiveDecimal().bigDecimalValue(); - - Preconditions.checkArgument(value.precision() <= precision, - "Cannot read value as decimal(%s,%s), too large: %s", precision, scale, value); + BigDecimal value = + ((DecimalColumnVector) vector).vector[row].getHiveDecimal().bigDecimalValue(); + + Preconditions.checkArgument( + value.precision() <= precision, + "Cannot read value as decimal(%s,%s), too large: %s", + precision, + scale, + value); return new Decimal().set(new scala.math.BigDecimal(value), precision, scale); } diff --git a/spark/v3.0/spark/src/main/java/org/apache/iceberg/spark/data/SparkOrcValueWriters.java b/spark/v3.0/spark/src/main/java/org/apache/iceberg/spark/data/SparkOrcValueWriters.java index abb12dffc050..780090f99109 100644 --- a/spark/v3.0/spark/src/main/java/org/apache/iceberg/spark/data/SparkOrcValueWriters.java +++ b/spark/v3.0/spark/src/main/java/org/apache/iceberg/spark/data/SparkOrcValueWriters.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.data; import java.util.List; @@ -37,8 +36,7 @@ import org.apache.spark.unsafe.types.UTF8String; class SparkOrcValueWriters { - private SparkOrcValueWriters() { - } + private SparkOrcValueWriters() {} static OrcValueWriter strings() { return StringWriter.INSTANCE; @@ -60,8 +58,8 @@ static OrcValueWriter list(OrcValueWriter element, List o return new ListWriter<>(element, orcType); } - static OrcValueWriter map(OrcValueWriter keyWriter, OrcValueWriter valueWriter, - List orcTypes) { + static OrcValueWriter map( + OrcValueWriter keyWriter, OrcValueWriter valueWriter, List orcTypes) { return new MapWriter<>(keyWriter, valueWriter, orcTypes); } @@ -73,7 +71,6 @@ public void nonNullWrite(int rowId, UTF8String data, ColumnVector output) { byte[] value = data.getBytes(); ((BytesColumnVector) output).setRef(rowId, value, 0, value.length); } - } private static class TimestampTzWriter implements OrcValueWriter { @@ -85,7 +82,6 @@ public void nonNullWrite(int rowId, Long micros, ColumnVector output) { cv.time[rowId] = Math.floorDiv(micros, 1_000); // millis cv.nanos[rowId] = (int) Math.floorMod(micros, 1_000_000) * 1_000; // nanos } - } private static class Decimal18Writer implements OrcValueWriter { @@ -97,20 +93,18 @@ private static class Decimal18Writer implements OrcValueWriter { @Override public void nonNullWrite(int rowId, Decimal decimal, ColumnVector output) { - ((DecimalColumnVector) output).vector[rowId].setFromLongAndScale( - decimal.toUnscaledLong(), scale); + ((DecimalColumnVector) output) + .vector[rowId].setFromLongAndScale(decimal.toUnscaledLong(), scale); } - } private static class Decimal38Writer implements OrcValueWriter { @Override public void nonNullWrite(int rowId, Decimal decimal, ColumnVector output) { - ((DecimalColumnVector) output).vector[rowId].set( - HiveDecimal.create(decimal.toJavaBigDecimal())); + ((DecimalColumnVector) output) + .vector[rowId].set(HiveDecimal.create(decimal.toJavaBigDecimal())); } - } private static class ListWriter implements OrcValueWriter { @@ -120,10 +114,12 @@ 
private static class ListWriter implements OrcValueWriter { @SuppressWarnings("unchecked") ListWriter(OrcValueWriter writer, List orcTypes) { if (orcTypes.size() != 1) { - throw new IllegalArgumentException("Expected one (and same) ORC type for list elements, got: " + orcTypes); + throw new IllegalArgumentException( + "Expected one (and same) ORC type for list elements, got: " + orcTypes); } this.writer = writer; - this.fieldGetter = (SparkOrcWriter.FieldGetter) SparkOrcWriter.createFieldGetter(orcTypes.get(0)); + this.fieldGetter = + (SparkOrcWriter.FieldGetter) SparkOrcWriter.createFieldGetter(orcTypes.get(0)); } @Override @@ -145,7 +141,6 @@ public void nonNullWrite(int rowId, ArrayData value, ColumnVector output) { public Stream> metrics() { return writer.metrics(); } - } private static class MapWriter implements OrcValueWriter { @@ -155,14 +150,20 @@ private static class MapWriter implements OrcValueWriter { private final SparkOrcWriter.FieldGetter valueFieldGetter; @SuppressWarnings("unchecked") - MapWriter(OrcValueWriter keyWriter, OrcValueWriter valueWriter, List orcTypes) { + MapWriter( + OrcValueWriter keyWriter, + OrcValueWriter valueWriter, + List orcTypes) { if (orcTypes.size() != 2) { - throw new IllegalArgumentException("Expected two ORC type descriptions for a map, got: " + orcTypes); + throw new IllegalArgumentException( + "Expected two ORC type descriptions for a map, got: " + orcTypes); } this.keyWriter = keyWriter; this.valueWriter = valueWriter; - this.keyFieldGetter = (SparkOrcWriter.FieldGetter) SparkOrcWriter.createFieldGetter(orcTypes.get(0)); - this.valueFieldGetter = (SparkOrcWriter.FieldGetter) SparkOrcWriter.createFieldGetter(orcTypes.get(1)); + this.keyFieldGetter = + (SparkOrcWriter.FieldGetter) SparkOrcWriter.createFieldGetter(orcTypes.get(0)); + this.valueFieldGetter = + (SparkOrcWriter.FieldGetter) SparkOrcWriter.createFieldGetter(orcTypes.get(1)); } @Override @@ -189,7 +190,6 @@ public void nonNullWrite(int rowId, MapData map, ColumnVector output) { public Stream> metrics() { return Stream.concat(keyWriter.metrics(), valueWriter.metrics()); } - } private static void growColumnVector(ColumnVector cv, int requestedSize) { diff --git a/spark/v3.0/spark/src/main/java/org/apache/iceberg/spark/data/SparkOrcWriter.java b/spark/v3.0/spark/src/main/java/org/apache/iceberg/spark/data/SparkOrcWriter.java index 9c7f3a6eb01d..6a8c7f1d3c88 100644 --- a/spark/v3.0/spark/src/main/java/org/apache/iceberg/spark/data/SparkOrcWriter.java +++ b/spark/v3.0/spark/src/main/java/org/apache/iceberg/spark/data/SparkOrcWriter.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.data; import java.io.Serializable; @@ -39,19 +38,18 @@ import org.apache.spark.sql.catalyst.InternalRow; import org.apache.spark.sql.catalyst.expressions.SpecializedGetters; -/** - * This class acts as an adaptor from an OrcFileAppender to a - * FileAppender<InternalRow>. - */ +/** This class acts as an adaptor from an OrcFileAppender to a FileAppender<InternalRow>. 
*/ public class SparkOrcWriter implements OrcRowWriter { private final InternalRowWriter writer; public SparkOrcWriter(Schema iSchema, TypeDescription orcSchema) { - Preconditions.checkArgument(orcSchema.getCategory() == TypeDescription.Category.STRUCT, + Preconditions.checkArgument( + orcSchema.getCategory() == TypeDescription.Category.STRUCT, "Top level must be a struct " + orcSchema); - writer = (InternalRowWriter) OrcSchemaWithTypeVisitor.visit(iSchema, orcSchema, new WriteBuilder()); + writer = + (InternalRowWriter) OrcSchemaWithTypeVisitor.visit(iSchema, orcSchema, new WriteBuilder()); } @Override @@ -71,24 +69,26 @@ public Stream> metrics() { } private static class WriteBuilder extends OrcSchemaWithTypeVisitor> { - private WriteBuilder() { - } + private WriteBuilder() {} @Override - public OrcValueWriter record(Types.StructType iStruct, TypeDescription record, - List names, List> fields) { + public OrcValueWriter record( + Types.StructType iStruct, + TypeDescription record, + List names, + List> fields) { return new InternalRowWriter(fields, record.getChildren()); } @Override - public OrcValueWriter list(Types.ListType iList, TypeDescription array, - OrcValueWriter element) { + public OrcValueWriter list( + Types.ListType iList, TypeDescription array, OrcValueWriter element) { return SparkOrcValueWriters.list(element, array.getChildren()); } @Override - public OrcValueWriter map(Types.MapType iMap, TypeDescription map, - OrcValueWriter key, OrcValueWriter value) { + public OrcValueWriter map( + Types.MapType iMap, TypeDescription map, OrcValueWriter key, OrcValueWriter value) { return SparkOrcValueWriters.map(key, value, map.getChildren()); } @@ -178,8 +178,9 @@ static FieldGetter createFieldGetter(TypeDescription fieldType) { // being changed behind our back. break; case DECIMAL: - fieldGetter = (row, ordinal) -> - row.getDecimal(ordinal, fieldType.getPrecision(), fieldType.getScale()); + fieldGetter = + (row, ordinal) -> + row.getDecimal(ordinal, fieldType.getPrecision(), fieldType.getScale()); break; case STRING: case CHAR: @@ -196,7 +197,8 @@ static FieldGetter createFieldGetter(TypeDescription fieldType) { fieldGetter = SpecializedGetters::getMap; break; default: - throw new IllegalArgumentException("Encountered an unsupported ORC type during a write from Spark."); + throw new IllegalArgumentException( + "Encountered an unsupported ORC type during a write from Spark."); } return (row, ordinal) -> { @@ -210,10 +212,12 @@ static FieldGetter createFieldGetter(TypeDescription fieldType) { interface FieldGetter extends Serializable { /** - * Returns a value from a complex Spark data holder such ArrayData, InternalRow, etc... - * Calls the appropriate getter for the expected data type. + * Returns a value from a complex Spark data holder such ArrayData, InternalRow, etc... Calls + * the appropriate getter for the expected data type. + * * @param row Spark's data representation - * @param ordinal index in the data structure (e.g. column index for InterRow, list index in ArrayData, etc..) + * @param ordinal index in the data structure (e.g. column index for InterRow, list index in + * ArrayData, etc..) 
* @return field value at ordinal */ @Nullable diff --git a/spark/v3.0/spark/src/main/java/org/apache/iceberg/spark/data/SparkParquetReaders.java b/spark/v3.0/spark/src/main/java/org/apache/iceberg/spark/data/SparkParquetReaders.java index 8abee4a575e1..8c4c3dce226a 100644 --- a/spark/v3.0/spark/src/main/java/org/apache/iceberg/spark/data/SparkParquetReaders.java +++ b/spark/v3.0/spark/src/main/java/org/apache/iceberg/spark/data/SparkParquetReaders.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.data; import java.math.BigDecimal; @@ -66,25 +65,25 @@ import org.apache.spark.unsafe.types.UTF8String; public class SparkParquetReaders { - private SparkParquetReaders() { - } + private SparkParquetReaders() {} - public static ParquetValueReader buildReader(Schema expectedSchema, - MessageType fileSchema) { + public static ParquetValueReader buildReader( + Schema expectedSchema, MessageType fileSchema) { return buildReader(expectedSchema, fileSchema, ImmutableMap.of()); } @SuppressWarnings("unchecked") - public static ParquetValueReader buildReader(Schema expectedSchema, - MessageType fileSchema, - Map idToConstant) { + public static ParquetValueReader buildReader( + Schema expectedSchema, MessageType fileSchema, Map idToConstant) { if (ParquetSchemaUtil.hasIds(fileSchema)) { return (ParquetValueReader) - TypeWithSchemaVisitor.visit(expectedSchema.asStruct(), fileSchema, - new ReadBuilder(fileSchema, idToConstant)); + TypeWithSchemaVisitor.visit( + expectedSchema.asStruct(), fileSchema, new ReadBuilder(fileSchema, idToConstant)); } else { return (ParquetValueReader) - TypeWithSchemaVisitor.visit(expectedSchema.asStruct(), fileSchema, + TypeWithSchemaVisitor.visit( + expectedSchema.asStruct(), + fileSchema, new FallbackReadBuilder(fileSchema, idToConstant)); } } @@ -95,18 +94,18 @@ private static class FallbackReadBuilder extends ReadBuilder { } @Override - public ParquetValueReader message(Types.StructType expected, MessageType message, - List> fieldReaders) { + public ParquetValueReader message( + Types.StructType expected, MessageType message, List> fieldReaders) { // the top level matches by ID, but the remaining IDs are missing return super.struct(expected, message, fieldReaders); } @Override - public ParquetValueReader struct(Types.StructType ignored, GroupType struct, - List> fieldReaders) { + public ParquetValueReader struct( + Types.StructType ignored, GroupType struct, List> fieldReaders) { // the expected struct is ignored because nested fields are never found when the - List> newFields = Lists.newArrayListWithExpectedSize( - fieldReaders.size()); + List> newFields = + Lists.newArrayListWithExpectedSize(fieldReaders.size()); List types = Lists.newArrayListWithExpectedSize(fieldReaders.size()); List fields = struct.getFields(); for (int i = 0; i < fields.size(); i += 1) { @@ -130,14 +129,14 @@ private static class ReadBuilder extends TypeWithSchemaVisitor message(Types.StructType expected, MessageType message, - List> fieldReaders) { + public ParquetValueReader message( + Types.StructType expected, MessageType message, List> fieldReaders) { return struct(expected, message.asGroupType(), fieldReaders); } @Override - public ParquetValueReader struct(Types.StructType expected, GroupType struct, - List> fieldReaders) { + public ParquetValueReader struct( + Types.StructType expected, GroupType struct, List> fieldReaders) { // match the expected struct's order Map> readersById = Maps.newHashMap(); Map 
typesById = Maps.newHashMap(); @@ -152,10 +151,10 @@ public ParquetValueReader struct(Types.StructType expected, GroupType struct, } } - List expectedFields = expected != null ? - expected.fields() : ImmutableList.of(); - List> reorderedFields = Lists.newArrayListWithExpectedSize( - expectedFields.size()); + List expectedFields = + expected != null ? expected.fields() : ImmutableList.of(); + List> reorderedFields = + Lists.newArrayListWithExpectedSize(expectedFields.size()); List types = Lists.newArrayListWithExpectedSize(expectedFields.size()); for (Types.NestedField field : expectedFields) { int id = field.fieldId(); @@ -185,8 +184,8 @@ public ParquetValueReader struct(Types.StructType expected, GroupType struct, } @Override - public ParquetValueReader list(Types.ListType expectedList, GroupType array, - ParquetValueReader elementReader) { + public ParquetValueReader list( + Types.ListType expectedList, GroupType array, ParquetValueReader elementReader) { GroupType repeated = array.getFields().get(0).asGroupType(); String[] repeatedPath = currentPath(); @@ -196,13 +195,16 @@ public ParquetValueReader list(Types.ListType expectedList, GroupType array, Type elementType = repeated.getType(0); int elementD = type.getMaxDefinitionLevel(path(elementType.getName())) - 1; - return new ArrayReader<>(repeatedD, repeatedR, ParquetValueReaders.option(elementType, elementD, elementReader)); + return new ArrayReader<>( + repeatedD, repeatedR, ParquetValueReaders.option(elementType, elementD, elementReader)); } @Override - public ParquetValueReader map(Types.MapType expectedMap, GroupType map, - ParquetValueReader keyReader, - ParquetValueReader valueReader) { + public ParquetValueReader map( + Types.MapType expectedMap, + GroupType map, + ParquetValueReader keyReader, + ParquetValueReader valueReader) { GroupType repeatedKeyValue = map.getFields().get(0).asGroupType(); String[] repeatedPath = currentPath(); @@ -214,14 +216,16 @@ public ParquetValueReader map(Types.MapType expectedMap, GroupType map, Type valueType = repeatedKeyValue.getType(1); int valueD = type.getMaxDefinitionLevel(path(valueType.getName())) - 1; - return new MapReader<>(repeatedD, repeatedR, + return new MapReader<>( + repeatedD, + repeatedR, ParquetValueReaders.option(keyType, keyD, keyReader), ParquetValueReaders.option(valueType, valueD, valueReader)); } @Override - public ParquetValueReader primitive(org.apache.iceberg.types.Type.PrimitiveType expected, - PrimitiveType primitive) { + public ParquetValueReader primitive( + org.apache.iceberg.types.Type.PrimitiveType expected, PrimitiveType primitive) { ColumnDescriptor desc = type.getColumnDescription(currentPath()); if (primitive.getOriginalType() != null) { @@ -377,12 +381,13 @@ public Long read(Long ignored) { @Override public long readLong() { - final ByteBuffer byteBuffer = column.nextBinary().toByteBuffer().order(ByteOrder.LITTLE_ENDIAN); + final ByteBuffer byteBuffer = + column.nextBinary().toByteBuffer().order(ByteOrder.LITTLE_ENDIAN); final long timeOfDayNanos = byteBuffer.getLong(); final int julianDay = byteBuffer.getInt(); - return TimeUnit.DAYS.toMicros(julianDay - UNIX_EPOCH_JULIAN) + - TimeUnit.NANOSECONDS.toMicros(timeOfDayNanos); + return TimeUnit.DAYS.toMicros(julianDay - UNIX_EPOCH_JULIAN) + + TimeUnit.NANOSECONDS.toMicros(timeOfDayNanos); } } @@ -456,15 +461,19 @@ protected ArrayData buildList(ReusableArrayData list) { } } - private static class MapReader extends RepeatedKeyValueReader { + private static class MapReader + extends RepeatedKeyValueReader { 
private int readPos = 0; private int writePos = 0; private final ReusableEntry entry = new ReusableEntry<>(); private final ReusableEntry nullEntry = new ReusableEntry<>(); - MapReader(int definitionLevel, int repetitionLevel, - ParquetValueReader keyReader, ParquetValueReader valueReader) { + MapReader( + int definitionLevel, + int repetitionLevel, + ParquetValueReader keyReader, + ParquetValueReader valueReader) { super(definitionLevel, repetitionLevel, keyReader, valueReader); } diff --git a/spark/v3.0/spark/src/main/java/org/apache/iceberg/spark/data/SparkParquetWriters.java b/spark/v3.0/spark/src/main/java/org/apache/iceberg/spark/data/SparkParquetWriters.java index 5e268d26ed9c..c7622678c74d 100644 --- a/spark/v3.0/spark/src/main/java/org/apache/iceberg/spark/data/SparkParquetWriters.java +++ b/spark/v3.0/spark/src/main/java/org/apache/iceberg/spark/data/SparkParquetWriters.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.data; import java.util.Iterator; @@ -54,12 +53,12 @@ import org.apache.spark.unsafe.types.UTF8String; public class SparkParquetWriters { - private SparkParquetWriters() { - } + private SparkParquetWriters() {} @SuppressWarnings("unchecked") public static ParquetValueWriter buildWriter(StructType dfSchema, MessageType type) { - return (ParquetValueWriter) ParquetWithSparkSchemaVisitor.visit(dfSchema, type, new WriteBuilder(type)); + return (ParquetValueWriter) + ParquetWithSparkSchemaVisitor.visit(dfSchema, type, new WriteBuilder(type)); } private static class WriteBuilder extends ParquetWithSparkSchemaVisitor> { @@ -70,14 +69,14 @@ private static class WriteBuilder extends ParquetWithSparkSchemaVisitor message(StructType sStruct, MessageType message, - List> fieldWriters) { + public ParquetValueWriter message( + StructType sStruct, MessageType message, List> fieldWriters) { return struct(sStruct, message.asGroupType(), fieldWriters); } @Override - public ParquetValueWriter struct(StructType sStruct, GroupType struct, - List> fieldWriters) { + public ParquetValueWriter struct( + StructType sStruct, GroupType struct, List> fieldWriters) { List fields = struct.getFields(); StructField[] sparkFields = sStruct.fields(); List> writers = Lists.newArrayListWithExpectedSize(fieldWriters.size()); @@ -91,31 +90,40 @@ public ParquetValueWriter struct(StructType sStruct, GroupType struct, } @Override - public ParquetValueWriter list(ArrayType sArray, GroupType array, ParquetValueWriter elementWriter) { + public ParquetValueWriter list( + ArrayType sArray, GroupType array, ParquetValueWriter elementWriter) { GroupType repeated = array.getFields().get(0).asGroupType(); String[] repeatedPath = currentPath(); int repeatedD = type.getMaxDefinitionLevel(repeatedPath); int repeatedR = type.getMaxRepetitionLevel(repeatedPath); - return new ArrayDataWriter<>(repeatedD, repeatedR, + return new ArrayDataWriter<>( + repeatedD, + repeatedR, newOption(repeated.getType(0), elementWriter), sArray.elementType()); } @Override - public ParquetValueWriter map(MapType sMap, GroupType map, - ParquetValueWriter keyWriter, ParquetValueWriter valueWriter) { + public ParquetValueWriter map( + MapType sMap, + GroupType map, + ParquetValueWriter keyWriter, + ParquetValueWriter valueWriter) { GroupType repeatedKeyValue = map.getFields().get(0).asGroupType(); String[] repeatedPath = currentPath(); int repeatedD = type.getMaxDefinitionLevel(repeatedPath); int repeatedR = type.getMaxRepetitionLevel(repeatedPath); - 
return new MapDataWriter<>(repeatedD, repeatedR, + return new MapDataWriter<>( + repeatedD, + repeatedR, newOption(repeatedKeyValue.getType(0), keyWriter), newOption(repeatedKeyValue.getType(1), valueWriter), - sMap.keyType(), sMap.valueType()); + sMap.keyType(), + sMap.valueType()); } private ParquetValueWriter newOption(Type fieldType, ParquetValueWriter writer) { @@ -197,18 +205,18 @@ private static PrimitiveWriter utf8Strings(ColumnDescriptor desc) { return new UTF8StringWriter(desc); } - private static PrimitiveWriter decimalAsInteger(ColumnDescriptor desc, - int precision, int scale) { + private static PrimitiveWriter decimalAsInteger( + ColumnDescriptor desc, int precision, int scale) { return new IntegerDecimalWriter(desc, precision, scale); } - private static PrimitiveWriter decimalAsLong(ColumnDescriptor desc, - int precision, int scale) { + private static PrimitiveWriter decimalAsLong( + ColumnDescriptor desc, int precision, int scale) { return new LongDecimalWriter(desc, precision, scale); } - private static PrimitiveWriter decimalAsFixed(ColumnDescriptor desc, - int precision, int scale) { + private static PrimitiveWriter decimalAsFixed( + ColumnDescriptor desc, int precision, int scale) { return new FixedDecimalWriter(desc, precision, scale); } @@ -239,10 +247,18 @@ private IntegerDecimalWriter(ColumnDescriptor desc, int precision, int scale) { @Override public void write(int repetitionLevel, Decimal decimal) { - Preconditions.checkArgument(decimal.scale() == scale, - "Cannot write value as decimal(%s,%s), wrong scale: %s", precision, scale, decimal); - Preconditions.checkArgument(decimal.precision() <= precision, - "Cannot write value as decimal(%s,%s), too large: %s", precision, scale, decimal); + Preconditions.checkArgument( + decimal.scale() == scale, + "Cannot write value as decimal(%s,%s), wrong scale: %s", + precision, + scale, + decimal); + Preconditions.checkArgument( + decimal.precision() <= precision, + "Cannot write value as decimal(%s,%s), too large: %s", + precision, + scale, + decimal); column.writeInteger(repetitionLevel, (int) decimal.toUnscaledLong()); } @@ -260,10 +276,18 @@ private LongDecimalWriter(ColumnDescriptor desc, int precision, int scale) { @Override public void write(int repetitionLevel, Decimal decimal) { - Preconditions.checkArgument(decimal.scale() == scale, - "Cannot write value as decimal(%s,%s), wrong scale: %s", precision, scale, decimal); - Preconditions.checkArgument(decimal.precision() <= precision, - "Cannot write value as decimal(%s,%s), too large: %s", precision, scale, decimal); + Preconditions.checkArgument( + decimal.scale() == scale, + "Cannot write value as decimal(%s,%s), wrong scale: %s", + precision, + scale, + decimal); + Preconditions.checkArgument( + decimal.precision() <= precision, + "Cannot write value as decimal(%s,%s), too large: %s", + precision, + scale, + decimal); column.writeLong(repetitionLevel, decimal.toUnscaledLong()); } @@ -278,12 +302,15 @@ private FixedDecimalWriter(ColumnDescriptor desc, int precision, int scale) { super(desc); this.precision = precision; this.scale = scale; - this.bytes = ThreadLocal.withInitial(() -> new byte[TypeUtil.decimalRequiredBytes(precision)]); + this.bytes = + ThreadLocal.withInitial(() -> new byte[TypeUtil.decimalRequiredBytes(precision)]); } @Override public void write(int repetitionLevel, Decimal decimal) { - byte[] binary = DecimalUtil.toReusedFixLengthBytes(precision, scale, decimal.toJavaBigDecimal(), bytes.get()); + byte[] binary = + DecimalUtil.toReusedFixLengthBytes( + 
precision, scale, decimal.toJavaBigDecimal(), bytes.get()); column.writeBinary(repetitionLevel, Binary.fromReusedByteArray(binary)); } } @@ -302,8 +329,11 @@ public void write(int repetitionLevel, byte[] bytes) { private static class ArrayDataWriter extends RepeatedWriter { private final DataType elementType; - private ArrayDataWriter(int definitionLevel, int repetitionLevel, - ParquetValueWriter writer, DataType elementType) { + private ArrayDataWriter( + int definitionLevel, + int repetitionLevel, + ParquetValueWriter writer, + DataType elementType) { super(definitionLevel, repetitionLevel, writer); this.elementType = elementType; } @@ -354,9 +384,13 @@ private static class MapDataWriter extends RepeatedKeyValueWriter keyWriter, ParquetValueWriter valueWriter, - DataType keyType, DataType valueType) { + private MapDataWriter( + int definitionLevel, + int repetitionLevel, + ParquetValueWriter keyWriter, + ParquetValueWriter valueWriter, + DataType keyType, + DataType valueType) { super(definitionLevel, repetitionLevel, keyWriter, valueWriter); this.keyType = keyType; this.valueType = valueType; diff --git a/spark/v3.0/spark/src/main/java/org/apache/iceberg/spark/data/SparkValueReaders.java b/spark/v3.0/spark/src/main/java/org/apache/iceberg/spark/data/SparkValueReaders.java index 0d3ce2b28d0b..11655c72d857 100644 --- a/spark/v3.0/spark/src/main/java/org/apache/iceberg/spark/data/SparkValueReaders.java +++ b/spark/v3.0/spark/src/main/java/org/apache/iceberg/spark/data/SparkValueReaders.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.data; import java.io.IOException; @@ -44,8 +43,7 @@ public class SparkValueReaders { - private SparkValueReaders() { - } + private SparkValueReaders() {} static ValueReader strings() { return StringReader.INSTANCE; @@ -67,8 +65,8 @@ static ValueReader array(ValueReader elementReader) { return new ArrayReader(elementReader); } - static ValueReader arrayMap(ValueReader keyReader, - ValueReader valueReader) { + static ValueReader arrayMap( + ValueReader keyReader, ValueReader valueReader) { return new ArrayMapReader(keyReader, valueReader); } @@ -76,16 +74,15 @@ static ValueReader map(ValueReader keyReader, ValueReader< return new MapReader(keyReader, valueReader); } - static ValueReader struct(List> readers, Types.StructType struct, - Map idToConstant) { + static ValueReader struct( + List> readers, Types.StructType struct, Map idToConstant) { return new StructReader(readers, struct, idToConstant); } private static class StringReader implements ValueReader { private static final StringReader INSTANCE = new StringReader(); - private StringReader() { - } + private StringReader() {} @Override public UTF8String read(Decoder decoder, Object reuse) throws IOException { @@ -97,10 +94,10 @@ public UTF8String read(Decoder decoder, Object reuse) throws IOException { Utf8 string = decoder.readString(utf8); return UTF8String.fromBytes(string.getBytes(), 0, string.getByteLength()); -// int length = decoder.readInt(); -// byte[] bytes = new byte[length]; -// decoder.readFixed(bytes, 0, length); -// return UTF8String.fromBytes(bytes); + // int length = decoder.readInt(); + // byte[] bytes = new byte[length]; + // decoder.readFixed(bytes, 0, length); + // return UTF8String.fromBytes(bytes); } } @@ -122,16 +119,17 @@ public UTF8String read(Decoder decoder, Object ignore) throws IOException { } private static class UUIDReader implements ValueReader { - private static final ThreadLocal 
BUFFER = ThreadLocal.withInitial(() -> { - ByteBuffer buffer = ByteBuffer.allocate(16); - buffer.order(ByteOrder.BIG_ENDIAN); - return buffer; - }); + private static final ThreadLocal BUFFER = + ThreadLocal.withInitial( + () -> { + ByteBuffer buffer = ByteBuffer.allocate(16); + buffer.order(ByteOrder.BIG_ENDIAN); + return buffer; + }); private static final UUIDReader INSTANCE = new UUIDReader(); - private UUIDReader() { - } + private UUIDReader() {} @Override @SuppressWarnings("ByteBufferBackingArray") @@ -258,14 +256,16 @@ public ArrayBasedMapData read(Decoder decoder, Object reuse) throws IOException static class StructReader extends ValueReaders.StructReader { private final int numFields; - protected StructReader(List> readers, Types.StructType struct, Map idToConstant) { + protected StructReader( + List> readers, Types.StructType struct, Map idToConstant) { super(readers, struct, idToConstant); this.numFields = readers.size(); } @Override protected InternalRow reuseOrCreate(Object reuse) { - if (reuse instanceof GenericInternalRow && ((GenericInternalRow) reuse).numFields() == numFields) { + if (reuse instanceof GenericInternalRow + && ((GenericInternalRow) reuse).numFields() == numFields) { return (InternalRow) reuse; } return new GenericInternalRow(numFields); diff --git a/spark/v3.0/spark/src/main/java/org/apache/iceberg/spark/data/SparkValueWriters.java b/spark/v3.0/spark/src/main/java/org/apache/iceberg/spark/data/SparkValueWriters.java index 24a69c1d7f11..5f2e2c054888 100644 --- a/spark/v3.0/spark/src/main/java/org/apache/iceberg/spark/data/SparkValueWriters.java +++ b/spark/v3.0/spark/src/main/java/org/apache/iceberg/spark/data/SparkValueWriters.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.spark.data; import java.io.IOException; @@ -39,8 +38,7 @@ public class SparkValueWriters { - private SparkValueWriters() { - } + private SparkValueWriters() {} static ValueWriter strings() { return StringWriter.INSTANCE; @@ -75,8 +73,7 @@ static ValueWriter struct(List> writers, List { private static final StringWriter INSTANCE = new StringWriter(); - private StringWriter() { - } + private StringWriter() {} @Override public void write(UTF8String s, Encoder encoder) throws IOException { @@ -88,16 +85,17 @@ public void write(UTF8String s, Encoder encoder) throws IOException { } private static class UUIDWriter implements ValueWriter { - private static final ThreadLocal BUFFER = ThreadLocal.withInitial(() -> { - ByteBuffer buffer = ByteBuffer.allocate(16); - buffer.order(ByteOrder.BIG_ENDIAN); - return buffer; - }); + private static final ThreadLocal BUFFER = + ThreadLocal.withInitial( + () -> { + ByteBuffer buffer = ByteBuffer.allocate(16); + buffer.order(ByteOrder.BIG_ENDIAN); + return buffer; + }); private static final UUIDWriter INSTANCE = new UUIDWriter(); - private UUIDWriter() { - } + private UUIDWriter() {} @Override @SuppressWarnings("ByteBufferBackingArray") @@ -120,12 +118,14 @@ private static class DecimalWriter implements ValueWriter { private DecimalWriter(int precision, int scale) { this.precision = precision; this.scale = scale; - this.bytes = ThreadLocal.withInitial(() -> new byte[TypeUtil.decimalRequiredBytes(precision)]); + this.bytes = + ThreadLocal.withInitial(() -> new byte[TypeUtil.decimalRequiredBytes(precision)]); } @Override public void write(Decimal d, Encoder encoder) throws IOException { - encoder.writeFixed(DecimalUtil.toReusedFixLengthBytes(precision, scale, d.toJavaBigDecimal(), bytes.get())); + encoder.writeFixed( + DecimalUtil.toReusedFixLengthBytes(precision, scale, d.toJavaBigDecimal(), bytes.get())); } } @@ -158,8 +158,11 @@ private static class ArrayMapWriter implements ValueWriter { private final DataType keyType; private final DataType valueType; - private ArrayMapWriter(ValueWriter keyWriter, DataType keyType, - ValueWriter valueWriter, DataType valueType) { + private ArrayMapWriter( + ValueWriter keyWriter, + DataType keyType, + ValueWriter valueWriter, + DataType valueType) { this.keyWriter = keyWriter; this.keyType = keyType; this.valueWriter = valueWriter; @@ -189,8 +192,11 @@ private static class MapWriter implements ValueWriter { private final DataType keyType; private final DataType valueType; - private MapWriter(ValueWriter keyWriter, DataType keyType, - ValueWriter valueWriter, DataType valueType) { + private MapWriter( + ValueWriter keyWriter, + DataType keyType, + ValueWriter valueWriter, + DataType valueType) { this.keyWriter = keyWriter; this.keyType = keyType; this.valueWriter = valueWriter; diff --git a/spark/v3.0/spark/src/main/java/org/apache/iceberg/spark/data/vectorized/ArrowVectorAccessorFactory.java b/spark/v3.0/spark/src/main/java/org/apache/iceberg/spark/data/vectorized/ArrowVectorAccessorFactory.java index 505ace508352..e32ebcb02bbc 100644 --- a/spark/v3.0/spark/src/main/java/org/apache/iceberg/spark/data/vectorized/ArrowVectorAccessorFactory.java +++ b/spark/v3.0/spark/src/main/java/org/apache/iceberg/spark/data/vectorized/ArrowVectorAccessorFactory.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.spark.data.vectorized; import java.math.BigDecimal; @@ -32,10 +31,12 @@ import org.apache.spark.unsafe.types.UTF8String; final class ArrowVectorAccessorFactory - extends GenericArrowVectorAccessorFactory { + extends GenericArrowVectorAccessorFactory< + Decimal, UTF8String, ColumnarArray, ArrowColumnVector> { ArrowVectorAccessorFactory() { - super(DecimalFactoryImpl::new, + super( + DecimalFactoryImpl::new, StringFactoryImpl::new, StructChildFactoryImpl::new, ArrayFactoryImpl::new); @@ -70,9 +71,7 @@ public UTF8String ofRow(VarCharVector vector, int rowId) { int end = vector.getEndOffset(rowId); return UTF8String.fromAddress( - null, - vector.getDataBuffer().memoryAddress() + start, - end - start); + null, vector.getDataBuffer().memoryAddress() + start, end - start); } @Override @@ -84,7 +83,9 @@ public UTF8String ofBytes(byte[] bytes) { public UTF8String ofByteBuffer(ByteBuffer byteBuffer) { if (byteBuffer.hasArray()) { return UTF8String.fromBytes( - byteBuffer.array(), byteBuffer.arrayOffset() + byteBuffer.position(), byteBuffer.remaining()); + byteBuffer.array(), + byteBuffer.arrayOffset() + byteBuffer.position(), + byteBuffer.remaining()); } byte[] bytes = new byte[byteBuffer.remaining()]; byteBuffer.get(bytes); @@ -92,7 +93,8 @@ public UTF8String ofByteBuffer(ByteBuffer byteBuffer) { } } - private static final class ArrayFactoryImpl implements ArrayFactory { + private static final class ArrayFactoryImpl + implements ArrayFactory { @Override public ArrowColumnVector ofChild(ValueVector childVector) { return new ArrowColumnVector(childVector); @@ -108,7 +110,8 @@ public ColumnarArray ofRow(ValueVector vector, ArrowColumnVector childData, int } } - private static final class StructChildFactoryImpl implements StructChildFactory { + private static final class StructChildFactoryImpl + implements StructChildFactory { @Override public Class getGenericClass() { return ArrowColumnVector.class; diff --git a/spark/v3.0/spark/src/main/java/org/apache/iceberg/spark/data/vectorized/ArrowVectorAccessors.java b/spark/v3.0/spark/src/main/java/org/apache/iceberg/spark/data/vectorized/ArrowVectorAccessors.java index f3b3377af2b4..810fef81b5bb 100644 --- a/spark/v3.0/spark/src/main/java/org/apache/iceberg/spark/data/vectorized/ArrowVectorAccessors.java +++ b/spark/v3.0/spark/src/main/java/org/apache/iceberg/spark/data/vectorized/ArrowVectorAccessors.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.data.vectorized; import org.apache.iceberg.arrow.vectorized.ArrowVectorAccessor; @@ -35,6 +34,5 @@ public class ArrowVectorAccessors { return factory.getVectorAccessor(holder); } - private ArrowVectorAccessors() { - } + private ArrowVectorAccessors() {} } diff --git a/spark/v3.0/spark/src/main/java/org/apache/iceberg/spark/data/vectorized/ColumnarBatchReader.java b/spark/v3.0/spark/src/main/java/org/apache/iceberg/spark/data/vectorized/ColumnarBatchReader.java index f71a6968099c..f761b2eb551b 100644 --- a/spark/v3.0/spark/src/main/java/org/apache/iceberg/spark/data/vectorized/ColumnarBatchReader.java +++ b/spark/v3.0/spark/src/main/java/org/apache/iceberg/spark/data/vectorized/ColumnarBatchReader.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.spark.data.vectorized; import java.util.List; @@ -28,9 +27,9 @@ import org.apache.spark.sql.vectorized.ColumnarBatch; /** - * {@link VectorizedReader} that returns Spark's {@link ColumnarBatch} to support Spark's vectorized read path. The - * {@link ColumnarBatch} returned is created by passing in the Arrow vectors populated via delegated read calls to - * {@linkplain VectorizedArrowReader VectorReader(s)}. + * {@link VectorizedReader} that returns Spark's {@link ColumnarBatch} to support Spark's vectorized + * read path. The {@link ColumnarBatch} returned is created by passing in the Arrow vectors + * populated via delegated read calls to {@linkplain VectorizedArrowReader VectorReader(s)}. */ public class ColumnarBatchReader extends BaseBatchReader { @@ -40,7 +39,8 @@ public ColumnarBatchReader(List> readers) { @Override public final ColumnarBatch read(ColumnarBatch reuse, int numRowsToRead) { - Preconditions.checkArgument(numRowsToRead > 0, "Invalid number of rows to read: %s", numRowsToRead); + Preconditions.checkArgument( + numRowsToRead > 0, "Invalid number of rows to read: %s", numRowsToRead); ColumnVector[] arrowColumnVectors = new ColumnVector[readers.length]; if (reuse == null) { @@ -52,10 +52,10 @@ public final ColumnarBatch read(ColumnarBatch reuse, int numRowsToRead) { int numRowsInVector = vectorHolders[i].numValues(); Preconditions.checkState( numRowsInVector == numRowsToRead, - "Number of rows in the vector %s didn't match expected %s ", numRowsInVector, + "Number of rows in the vector %s didn't match expected %s ", + numRowsInVector, numRowsToRead); - arrowColumnVectors[i] = - IcebergArrowColumnVector.forHolder(vectorHolders[i], numRowsInVector); + arrowColumnVectors[i] = IcebergArrowColumnVector.forHolder(vectorHolders[i], numRowsInVector); } ColumnarBatch batch = new ColumnarBatch(arrowColumnVectors); batch.setNumRows(numRowsToRead); diff --git a/spark/v3.0/spark/src/main/java/org/apache/iceberg/spark/data/vectorized/ConstantColumnVector.java b/spark/v3.0/spark/src/main/java/org/apache/iceberg/spark/data/vectorized/ConstantColumnVector.java index 3cdea65b2877..42683ffa901e 100644 --- a/spark/v3.0/spark/src/main/java/org/apache/iceberg/spark/data/vectorized/ConstantColumnVector.java +++ b/spark/v3.0/spark/src/main/java/org/apache/iceberg/spark/data/vectorized/ConstantColumnVector.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.data.vectorized; import org.apache.iceberg.spark.SparkSchemaUtil; @@ -39,8 +38,7 @@ class ConstantColumnVector extends ColumnVector { } @Override - public void close() { - } + public void close() {} @Override public boolean hasNull() { diff --git a/spark/v3.0/spark/src/main/java/org/apache/iceberg/spark/data/vectorized/IcebergArrowColumnVector.java b/spark/v3.0/spark/src/main/java/org/apache/iceberg/spark/data/vectorized/IcebergArrowColumnVector.java index 514eec84fe82..33c1a5284818 100644 --- a/spark/v3.0/spark/src/main/java/org/apache/iceberg/spark/data/vectorized/IcebergArrowColumnVector.java +++ b/spark/v3.0/spark/src/main/java/org/apache/iceberg/spark/data/vectorized/IcebergArrowColumnVector.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
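The ColumnarBatchReader hunk above ends by wrapping the populated Arrow-backed vectors in a Spark ColumnarBatch and setting the row count explicitly. A hedged sketch of just that assembly step (the helper class below is illustrative, not part of the patch):

```java
import org.apache.spark.sql.vectorized.ColumnVector;
import org.apache.spark.sql.vectorized.ColumnarBatch;

// Sketch of the final step in ColumnarBatchReader#read: wrap the column
// vectors and record how many rows in this batch are valid.
final class BatchAssemblySketch {
  private BatchAssemblySketch() {}

  static ColumnarBatch toBatch(ColumnVector[] vectors, int numRowsToRead) {
    ColumnarBatch batch = new ColumnarBatch(vectors);
    batch.setNumRows(numRowsToRead);
    return batch;
  }
}
```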
*/ - package org.apache.iceberg.spark.data.vectorized; import org.apache.iceberg.arrow.vectorized.ArrowVectorAccessor; @@ -33,9 +32,10 @@ import org.apache.spark.unsafe.types.UTF8String; /** - * Implementation of Spark's {@link ColumnVector} interface. The code for this class is heavily inspired from Spark's - * {@link ArrowColumnVector} The main difference is in how nullability checks are made in this class by relying on - * {@link NullabilityHolder} instead of the validity vector in the Arrow vector. + * Implementation of Spark's {@link ColumnVector} interface. The code for this class is heavily + * inspired from Spark's {@link ArrowColumnVector} The main difference is in how nullability checks + * are made in this class by relying on {@link NullabilityHolder} instead of the validity vector in + * the Arrow vector. */ public class IcebergArrowColumnVector extends ColumnVector { @@ -146,12 +146,14 @@ public ArrowColumnVector getChild(int ordinal) { } static ColumnVector forHolder(VectorHolder holder, int numRows) { - return holder.isDummy() ? - new ConstantColumnVector(Types.IntegerType.get(), numRows, ((ConstantVectorHolder) holder).getConstant()) : - new IcebergArrowColumnVector(holder); + return holder.isDummy() + ? new ConstantColumnVector( + Types.IntegerType.get(), numRows, ((ConstantVectorHolder) holder).getConstant()) + : new IcebergArrowColumnVector(holder); } - public ArrowVectorAccessor vectorAccessor() { + public ArrowVectorAccessor + vectorAccessor() { return accessor; } } diff --git a/spark/v3.0/spark/src/main/java/org/apache/iceberg/spark/data/vectorized/RowPositionColumnVector.java b/spark/v3.0/spark/src/main/java/org/apache/iceberg/spark/data/vectorized/RowPositionColumnVector.java index 58db4eb55d04..a389cd8286e5 100644 --- a/spark/v3.0/spark/src/main/java/org/apache/iceberg/spark/data/vectorized/RowPositionColumnVector.java +++ b/spark/v3.0/spark/src/main/java/org/apache/iceberg/spark/data/vectorized/RowPositionColumnVector.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.data.vectorized; import org.apache.iceberg.spark.SparkSchemaUtil; @@ -37,8 +36,7 @@ public class RowPositionColumnVector extends ColumnVector { } @Override - public void close() { - } + public void close() {} @Override public boolean hasNull() { diff --git a/spark/v3.0/spark/src/main/java/org/apache/iceberg/spark/data/vectorized/VectorizedSparkOrcReaders.java b/spark/v3.0/spark/src/main/java/org/apache/iceberg/spark/data/vectorized/VectorizedSparkOrcReaders.java index 418c25993a7e..7c3b825a62e7 100644 --- a/spark/v3.0/spark/src/main/java/org/apache/iceberg/spark/data/vectorized/VectorizedSparkOrcReaders.java +++ b/spark/v3.0/spark/src/main/java/org/apache/iceberg/spark/data/vectorized/VectorizedSparkOrcReaders.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.spark.data.vectorized; import java.util.List; @@ -47,23 +46,27 @@ public class VectorizedSparkOrcReaders { - private VectorizedSparkOrcReaders() { - } + private VectorizedSparkOrcReaders() {} - public static OrcBatchReader buildReader(Schema expectedSchema, TypeDescription fileSchema, - Map idToConstant) { - Converter converter = OrcSchemaWithTypeVisitor.visit(expectedSchema, fileSchema, new ReadBuilder(idToConstant)); + public static OrcBatchReader buildReader( + Schema expectedSchema, TypeDescription fileSchema, Map idToConstant) { + Converter converter = + OrcSchemaWithTypeVisitor.visit(expectedSchema, fileSchema, new ReadBuilder(idToConstant)); return new OrcBatchReader() { private long batchOffsetInFile; @Override public ColumnarBatch read(VectorizedRowBatch batch) { - BaseOrcColumnVector cv = (BaseOrcColumnVector) converter.convert(new StructColumnVector(batch.size, batch.cols), - batch.size, batchOffsetInFile); - ColumnarBatch columnarBatch = new ColumnarBatch(IntStream.range(0, expectedSchema.columns().size()) - .mapToObj(cv::getChild) - .toArray(ColumnVector[]::new)); + BaseOrcColumnVector cv = + (BaseOrcColumnVector) + converter.convert( + new StructColumnVector(batch.size, batch.cols), batch.size, batchOffsetInFile); + ColumnarBatch columnarBatch = + new ColumnarBatch( + IntStream.range(0, expectedSchema.columns().size()) + .mapToObj(cv::getChild) + .toArray(ColumnVector[]::new)); columnarBatch.setNumRows(batch.size); return columnarBatch; } @@ -76,8 +79,10 @@ public void setBatchContext(long batchOffsetInFile) { } private interface Converter { - ColumnVector convert(org.apache.orc.storage.ql.exec.vector.ColumnVector columnVector, int batchSize, - long batchOffsetInFile); + ColumnVector convert( + org.apache.orc.storage.ql.exec.vector.ColumnVector columnVector, + int batchSize, + long batchOffsetInFile); } private static class ReadBuilder extends OrcSchemaWithTypeVisitor { @@ -88,8 +93,11 @@ private ReadBuilder(Map idToConstant) { } @Override - public Converter record(Types.StructType iStruct, TypeDescription record, List names, - List fields) { + public Converter record( + Types.StructType iStruct, + TypeDescription record, + List names, + List fields) { return new StructConverter(iStruct, fields, idToConstant); } @@ -132,7 +140,8 @@ public Converter primitive(Type.PrimitiveType iPrimitive, TypeDescription primit primitiveValueReader = SparkOrcValueReaders.timestampTzs(); break; case DECIMAL: - primitiveValueReader = SparkOrcValueReaders.decimals(primitive.getPrecision(), primitive.getScale()); + primitiveValueReader = + SparkOrcValueReaders.decimals(primitive.getPrecision(), primitive.getScale()); break; case CHAR: case VARCHAR: @@ -146,7 +155,8 @@ public Converter primitive(Type.PrimitiveType iPrimitive, TypeDescription primit throw new IllegalArgumentException("Unhandled type " + primitive); } return (columnVector, batchSize, batchOffsetInFile) -> - new PrimitiveOrcColumnVector(iPrimitive, batchSize, columnVector, primitiveValueReader, batchOffsetInFile); + new PrimitiveOrcColumnVector( + iPrimitive, batchSize, columnVector, primitiveValueReader, batchOffsetInFile); } } @@ -155,15 +165,15 @@ private abstract static class BaseOrcColumnVector extends ColumnVector { private final int batchSize; private Integer numNulls; - BaseOrcColumnVector(Type type, int batchSize, org.apache.orc.storage.ql.exec.vector.ColumnVector vector) { + BaseOrcColumnVector( + Type type, int batchSize, org.apache.orc.storage.ql.exec.vector.ColumnVector vector) { 
super(SparkSchemaUtil.convert(type)); this.vector = vector; this.batchSize = batchSize; } @Override - public void close() { - } + public void close() {} @Override public boolean hasNull() { @@ -278,8 +288,12 @@ private static class PrimitiveOrcColumnVector extends BaseOrcColumnVector { private final OrcValueReader primitiveValueReader; private final long batchOffsetInFile; - PrimitiveOrcColumnVector(Type type, int batchSize, org.apache.orc.storage.ql.exec.vector.ColumnVector vector, - OrcValueReader primitiveValueReader, long batchOffsetInFile) { + PrimitiveOrcColumnVector( + Type type, + int batchSize, + org.apache.orc.storage.ql.exec.vector.ColumnVector vector, + OrcValueReader primitiveValueReader, + long batchOffsetInFile) { super(type, batchSize, vector); this.vector = vector; this.primitiveValueReader = primitiveValueReader; @@ -313,7 +327,8 @@ public double getDouble(int rowId) { @Override public Decimal getDecimal(int rowId, int precision, int scale) { - // TODO: Is it okay to assume that (precision,scale) parameters == (precision,scale) of the decimal type + // TODO: Is it okay to assume that (precision,scale) parameters == (precision,scale) of the + // decimal type // and return a Decimal with (precision,scale) of the decimal type? return (Decimal) primitiveValueReader.read(vector, rowId); } @@ -339,16 +354,20 @@ private ArrayConverter(Types.ListType listType, Converter elementConverter) { } @Override - public ColumnVector convert(org.apache.orc.storage.ql.exec.vector.ColumnVector vector, int batchSize, - long batchOffsetInFile) { + public ColumnVector convert( + org.apache.orc.storage.ql.exec.vector.ColumnVector vector, + int batchSize, + long batchOffsetInFile) { ListColumnVector listVector = (ListColumnVector) vector; - ColumnVector elementVector = elementConverter.convert(listVector.child, batchSize, batchOffsetInFile); + ColumnVector elementVector = + elementConverter.convert(listVector.child, batchSize, batchOffsetInFile); return new BaseOrcColumnVector(listType, batchSize, vector) { @Override public ColumnarArray getArray(int rowId) { int index = getRowIndex(rowId); - return new ColumnarArray(elementVector, (int) listVector.offsets[index], (int) listVector.lengths[index]); + return new ColumnarArray( + elementVector, (int) listVector.offsets[index], (int) listVector.lengths[index]); } }; } @@ -366,17 +385,23 @@ private MapConverter(Types.MapType mapType, Converter keyConverter, Converter va } @Override - public ColumnVector convert(org.apache.orc.storage.ql.exec.vector.ColumnVector vector, int batchSize, - long batchOffsetInFile) { + public ColumnVector convert( + org.apache.orc.storage.ql.exec.vector.ColumnVector vector, + int batchSize, + long batchOffsetInFile) { MapColumnVector mapVector = (MapColumnVector) vector; ColumnVector keyVector = keyConverter.convert(mapVector.keys, batchSize, batchOffsetInFile); - ColumnVector valueVector = valueConverter.convert(mapVector.values, batchSize, batchOffsetInFile); + ColumnVector valueVector = + valueConverter.convert(mapVector.values, batchSize, batchOffsetInFile); return new BaseOrcColumnVector(mapType, batchSize, vector) { @Override public ColumnarMap getMap(int rowId) { int index = getRowIndex(rowId); - return new ColumnarMap(keyVector, valueVector, (int) mapVector.offsets[index], + return new ColumnarMap( + keyVector, + valueVector, + (int) mapVector.offsets[index], (int) mapVector.lengths[index]); } }; @@ -388,30 +413,37 @@ private static class StructConverter implements Converter { private final List fieldConverters; 
private final Map idToConstant; - private StructConverter(Types.StructType structType, List fieldConverters, - Map idToConstant) { + private StructConverter( + Types.StructType structType, + List fieldConverters, + Map idToConstant) { this.structType = structType; this.fieldConverters = fieldConverters; this.idToConstant = idToConstant; } @Override - public ColumnVector convert(org.apache.orc.storage.ql.exec.vector.ColumnVector vector, int batchSize, - long batchOffsetInFile) { + public ColumnVector convert( + org.apache.orc.storage.ql.exec.vector.ColumnVector vector, + int batchSize, + long batchOffsetInFile) { StructColumnVector structVector = (StructColumnVector) vector; List fields = structType.fields(); List fieldVectors = Lists.newArrayListWithExpectedSize(fields.size()); for (int pos = 0, vectorIndex = 0; pos < fields.size(); pos += 1) { Types.NestedField field = fields.get(pos); if (idToConstant.containsKey(field.fieldId())) { - fieldVectors.add(new ConstantColumnVector(field.type(), batchSize, idToConstant.get(field.fieldId()))); + fieldVectors.add( + new ConstantColumnVector(field.type(), batchSize, idToConstant.get(field.fieldId()))); } else if (field.equals(MetadataColumns.ROW_POSITION)) { fieldVectors.add(new RowPositionColumnVector(batchOffsetInFile)); } else if (field.equals(MetadataColumns.IS_DELETED)) { fieldVectors.add(new ConstantColumnVector(field.type(), batchSize, false)); } else { - fieldVectors.add(fieldConverters.get(vectorIndex) - .convert(structVector.fields[vectorIndex], batchSize, batchOffsetInFile)); + fieldVectors.add( + fieldConverters + .get(vectorIndex) + .convert(structVector.fields[vectorIndex], batchSize, batchOffsetInFile)); vectorIndex++; } } diff --git a/spark/v3.0/spark/src/main/java/org/apache/iceberg/spark/data/vectorized/VectorizedSparkParquetReaders.java b/spark/v3.0/spark/src/main/java/org/apache/iceberg/spark/data/vectorized/VectorizedSparkParquetReaders.java index b2d582352d74..bbb63e077bc6 100644 --- a/spark/v3.0/spark/src/main/java/org/apache/iceberg/spark/data/vectorized/VectorizedSparkParquetReaders.java +++ b/spark/v3.0/spark/src/main/java/org/apache/iceberg/spark/data/vectorized/VectorizedSparkParquetReaders.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.spark.data.vectorized; import java.util.Map; @@ -28,13 +27,10 @@ public class VectorizedSparkParquetReaders { - private VectorizedSparkParquetReaders() { - } + private VectorizedSparkParquetReaders() {} public static ColumnarBatchReader buildReader( - Schema expectedSchema, - MessageType fileSchema, - boolean setArrowValidityVector) { + Schema expectedSchema, MessageType fileSchema, boolean setArrowValidityVector) { return buildReader(expectedSchema, fileSchema, setArrowValidityVector, Maps.newHashMap()); } @@ -44,9 +40,14 @@ public static ColumnarBatchReader buildReader( boolean setArrowValidityVector, Map idToConstant) { return (ColumnarBatchReader) - TypeWithSchemaVisitor.visit(expectedSchema.asStruct(), fileSchema, + TypeWithSchemaVisitor.visit( + expectedSchema.asStruct(), + fileSchema, new VectorizedReaderBuilder( - expectedSchema, fileSchema, setArrowValidityVector, - idToConstant, ColumnarBatchReader::new)); + expectedSchema, + fileSchema, + setArrowValidityVector, + idToConstant, + ColumnarBatchReader::new)); } } diff --git a/spark/v3.0/spark/src/main/java/org/apache/iceberg/spark/procedures/AddFilesProcedure.java b/spark/v3.0/spark/src/main/java/org/apache/iceberg/spark/procedures/AddFilesProcedure.java index 6cb74aebfa09..3b29c12b7197 100644 --- a/spark/v3.0/spark/src/main/java/org/apache/iceberg/spark/procedures/AddFilesProcedure.java +++ b/spark/v3.0/spark/src/main/java/org/apache/iceberg/spark/procedures/AddFilesProcedure.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.procedures; import java.util.Collections; @@ -56,16 +55,19 @@ class AddFilesProcedure extends BaseProcedure { private static final Joiner.MapJoiner MAP_JOINER = Joiner.on(",").withKeyValueSeparator("="); - private static final ProcedureParameter[] PARAMETERS = new ProcedureParameter[]{ - ProcedureParameter.required("table", DataTypes.StringType), - ProcedureParameter.required("source_table", DataTypes.StringType), - ProcedureParameter.optional("partition_filter", STRING_MAP), - ProcedureParameter.optional("check_duplicate_files", DataTypes.BooleanType) - }; - - private static final StructType OUTPUT_TYPE = new StructType(new StructField[]{ - new StructField("added_files_count", DataTypes.LongType, false, Metadata.empty()) - }); + private static final ProcedureParameter[] PARAMETERS = + new ProcedureParameter[] { + ProcedureParameter.required("table", DataTypes.StringType), + ProcedureParameter.required("source_table", DataTypes.StringType), + ProcedureParameter.optional("partition_filter", STRING_MAP), + ProcedureParameter.optional("check_duplicate_files", DataTypes.BooleanType) + }; + + private static final StructType OUTPUT_TYPE = + new StructType( + new StructField[] { + new StructField("added_files_count", DataTypes.LongType, false, Metadata.empty()) + }); private AddFilesProcedure(TableCatalog tableCatalog) { super(tableCatalog); @@ -95,15 +97,19 @@ public InternalRow[] call(InternalRow args) { Identifier tableIdent = toIdentifier(args.getString(0), PARAMETERS[0].name()); CatalogPlugin sessionCat = spark().sessionState().catalogManager().v2SessionCatalog(); - Identifier sourceIdent = toCatalogAndIdentifier(args.getString(1), PARAMETERS[1].name(), sessionCat).identifier(); + Identifier sourceIdent = + toCatalogAndIdentifier(args.getString(1), PARAMETERS[1].name(), sessionCat).identifier(); Map partitionFilter = Maps.newHashMap(); if (!args.isNullAt(2)) { - 
args.getMap(2).foreach(DataTypes.StringType, DataTypes.StringType, - (k, v) -> { - partitionFilter.put(k.toString(), v.toString()); - return BoxedUnit.UNIT; - }); + args.getMap(2) + .foreach( + DataTypes.StringType, + DataTypes.StringType, + (k, v) -> { + partitionFilter.put(k.toString(), v.toString()); + return BoxedUnit.UNIT; + }); } boolean checkDuplicateFiles; @@ -113,36 +119,42 @@ public InternalRow[] call(InternalRow args) { checkDuplicateFiles = args.getBoolean(3); } - long addedFilesCount = importToIceberg(tableIdent, sourceIdent, partitionFilter, checkDuplicateFiles); - return new InternalRow[]{newInternalRow(addedFilesCount)}; + long addedFilesCount = + importToIceberg(tableIdent, sourceIdent, partitionFilter, checkDuplicateFiles); + return new InternalRow[] {newInternalRow(addedFilesCount)}; } private boolean isFileIdentifier(Identifier ident) { String[] namespace = ident.namespace(); - return namespace.length == 1 && - (namespace[0].equalsIgnoreCase("orc") || - namespace[0].equalsIgnoreCase("parquet") || - namespace[0].equalsIgnoreCase("avro")); + return namespace.length == 1 + && (namespace[0].equalsIgnoreCase("orc") + || namespace[0].equalsIgnoreCase("parquet") + || namespace[0].equalsIgnoreCase("avro")); } - private long importToIceberg(Identifier destIdent, Identifier sourceIdent, Map partitionFilter, - boolean checkDuplicateFiles) { - return modifyIcebergTable(destIdent, table -> { - - validatePartitionSpec(table, partitionFilter); - ensureNameMappingPresent(table); - - if (isFileIdentifier(sourceIdent)) { - Path sourcePath = new Path(sourceIdent.name()); - String format = sourceIdent.namespace()[0]; - importFileTable(table, sourcePath, format, partitionFilter, checkDuplicateFiles); - } else { - importCatalogTable(table, sourceIdent, partitionFilter, checkDuplicateFiles); - } - - Snapshot snapshot = table.currentSnapshot(); - return Long.parseLong(snapshot.summary().getOrDefault(SnapshotSummary.ADDED_FILES_PROP, "0")); - }); + private long importToIceberg( + Identifier destIdent, + Identifier sourceIdent, + Map partitionFilter, + boolean checkDuplicateFiles) { + return modifyIcebergTable( + destIdent, + table -> { + validatePartitionSpec(table, partitionFilter); + ensureNameMappingPresent(table); + + if (isFileIdentifier(sourceIdent)) { + Path sourcePath = new Path(sourceIdent.name()); + String format = sourceIdent.namespace()[0]; + importFileTable(table, sourcePath, format, partitionFilter, checkDuplicateFiles); + } else { + importCatalogTable(table, sourceIdent, partitionFilter, checkDuplicateFiles); + } + + Snapshot snapshot = table.currentSnapshot(); + return Long.parseLong( + snapshot.summary().getOrDefault(SnapshotSummary.ADDED_FILES_PROP, "0")); + }); } private static void ensureNameMappingPresent(Table table) { @@ -150,49 +162,64 @@ private static void ensureNameMappingPresent(Table table) { // Forces Name based resolution instead of position based resolution NameMapping mapping = MappingUtil.create(table.schema()); String mappingJson = NameMappingParser.toJson(mapping); - table.updateProperties() - .set(TableProperties.DEFAULT_NAME_MAPPING, mappingJson) - .commit(); + table.updateProperties().set(TableProperties.DEFAULT_NAME_MAPPING, mappingJson).commit(); } } - private void importFileTable(Table table, Path tableLocation, String format, Map partitionFilter, - boolean checkDuplicateFiles) { + private void importFileTable( + Table table, + Path tableLocation, + String format, + Map partitionFilter, + boolean checkDuplicateFiles) { // List Partitions via Spark 
InMemory file search interface List partitions = Spark3Util.getPartitions(spark(), tableLocation, format); if (table.spec().isUnpartitioned()) { - Preconditions.checkArgument(partitions.isEmpty(), "Cannot add partitioned files to an unpartitioned table"); - Preconditions.checkArgument(partitionFilter.isEmpty(), "Cannot use a partition filter when importing" + - "to an unpartitioned table"); + Preconditions.checkArgument( + partitions.isEmpty(), "Cannot add partitioned files to an unpartitioned table"); + Preconditions.checkArgument( + partitionFilter.isEmpty(), + "Cannot use a partition filter when importing" + "to an unpartitioned table"); // Build a Global Partition for the source - SparkPartition partition = new SparkPartition(Collections.emptyMap(), tableLocation.toString(), format); + SparkPartition partition = + new SparkPartition(Collections.emptyMap(), tableLocation.toString(), format); importPartitions(table, ImmutableList.of(partition), checkDuplicateFiles); } else { - Preconditions.checkArgument(!partitions.isEmpty(), - "Cannot find any partitions in table %s", partitions); - List filteredPartitions = SparkTableUtil.filterPartitions(partitions, partitionFilter); - Preconditions.checkArgument(!filteredPartitions.isEmpty(), + Preconditions.checkArgument( + !partitions.isEmpty(), "Cannot find any partitions in table %s", partitions); + List filteredPartitions = + SparkTableUtil.filterPartitions(partitions, partitionFilter); + Preconditions.checkArgument( + !filteredPartitions.isEmpty(), "Cannot find any partitions which match the given filter. Partition filter is %s", MAP_JOINER.join(partitionFilter)); importPartitions(table, filteredPartitions, checkDuplicateFiles); } } - private void importCatalogTable(Table table, Identifier sourceIdent, Map partitionFilter, - boolean checkDuplicateFiles) { + private void importCatalogTable( + Table table, + Identifier sourceIdent, + Map partitionFilter, + boolean checkDuplicateFiles) { String stagingLocation = getMetadataLocation(table); TableIdentifier sourceTableIdentifier = Spark3Util.toV1TableIdentifier(sourceIdent); - SparkTableUtil.importSparkTable(spark(), sourceTableIdentifier, table, stagingLocation, partitionFilter, + SparkTableUtil.importSparkTable( + spark(), + sourceTableIdentifier, + table, + stagingLocation, + partitionFilter, checkDuplicateFiles); } - private void importPartitions(Table table, List partitions, - boolean checkDuplicateFiles) { + private void importPartitions( + Table table, List partitions, boolean checkDuplicateFiles) { String stagingLocation = getMetadataLocation(table); - SparkTableUtil.importSparkPartitions(spark(), partitions, table, table.spec(), stagingLocation, - checkDuplicateFiles); + SparkTableUtil.importSparkPartitions( + spark(), partitions, table, table.spec(), stagingLocation, checkDuplicateFiles); } private String getMetadataLocation(Table table) { @@ -207,38 +234,51 @@ public String description() { private void validatePartitionSpec(Table table, Map partitionFilter) { List partitionFields = table.spec().fields(); - Set partitionNames = table.spec().fields().stream().map(PartitionField::name).collect(Collectors.toSet()); + Set partitionNames = + table.spec().fields().stream().map(PartitionField::name).collect(Collectors.toSet()); boolean tablePartitioned = !partitionFields.isEmpty(); boolean partitionSpecPassed = !partitionFilter.isEmpty(); // Check for any non-identity partition columns - List nonIdentityFields = partitionFields.stream() - .filter(x -> !x.transform().isIdentity()) - 
.collect(Collectors.toList()); - Preconditions.checkArgument(nonIdentityFields.isEmpty(), - "Cannot add data files to target table %s because that table is partitioned and contains non-identity" + - "partition transforms which will not be compatible. Found non-identity fields %s", - table.name(), nonIdentityFields); + List nonIdentityFields = + partitionFields.stream() + .filter(x -> !x.transform().isIdentity()) + .collect(Collectors.toList()); + Preconditions.checkArgument( + nonIdentityFields.isEmpty(), + "Cannot add data files to target table %s because that table is partitioned and contains non-identity" + + "partition transforms which will not be compatible. Found non-identity fields %s", + table.name(), + nonIdentityFields); if (tablePartitioned && partitionSpecPassed) { // Check to see there are sufficient partition columns to satisfy the filter - Preconditions.checkArgument(partitionFields.size() >= partitionFilter.size(), - "Cannot add data files to target table %s because that table is partitioned, " + - "but the number of columns in the provided partition filter (%s) " + - "is greater than the number of partitioned columns in table (%s)", - table.name(), partitionFilter.size(), partitionFields.size()); + Preconditions.checkArgument( + partitionFields.size() >= partitionFilter.size(), + "Cannot add data files to target table %s because that table is partitioned, " + + "but the number of columns in the provided partition filter (%s) " + + "is greater than the number of partitioned columns in table (%s)", + table.name(), + partitionFilter.size(), + partitionFields.size()); // Check for any filters of non existent columns - List unMatchedFilters = partitionFilter.keySet().stream() - .filter(filterName -> !partitionNames.contains(filterName)) - .collect(Collectors.toList()); - Preconditions.checkArgument(unMatchedFilters.isEmpty(), - "Cannot add files to target table %s. %s is partitioned but the specified partition filter " + - "refers to columns that are not partitioned: '%s' . Valid partition columns %s", - table.name(), table.name(), unMatchedFilters, String.join(",", partitionNames)); + List unMatchedFilters = + partitionFilter.keySet().stream() + .filter(filterName -> !partitionNames.contains(filterName)) + .collect(Collectors.toList()); + Preconditions.checkArgument( + unMatchedFilters.isEmpty(), + "Cannot add files to target table %s. %s is partitioned but the specified partition filter " + + "refers to columns that are not partitioned: '%s' . Valid partition columns %s", + table.name(), + table.name(), + unMatchedFilters, + String.join(",", partitionNames)); } else { - Preconditions.checkArgument(!partitionSpecPassed, + Preconditions.checkArgument( + !partitionSpecPassed, "Cannot use partition filter with an unpartitioned table %s", table.name()); } diff --git a/spark/v3.0/spark/src/main/java/org/apache/iceberg/spark/procedures/AncestorsOfProcedure.java b/spark/v3.0/spark/src/main/java/org/apache/iceberg/spark/procedures/AncestorsOfProcedure.java index 1ae3715dddd6..e5233bf34753 100644 --- a/spark/v3.0/spark/src/main/java/org/apache/iceberg/spark/procedures/AncestorsOfProcedure.java +++ b/spark/v3.0/spark/src/main/java/org/apache/iceberg/spark/procedures/AncestorsOfProcedure.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.spark.procedures; import java.util.List; @@ -35,15 +34,18 @@ public class AncestorsOfProcedure extends BaseProcedure { - private static final ProcedureParameter[] PARAMETERS = new ProcedureParameter[] { - ProcedureParameter.required("table", DataTypes.StringType), - ProcedureParameter.optional("snapshot_id", DataTypes.LongType), + private static final ProcedureParameter[] PARAMETERS = + new ProcedureParameter[] { + ProcedureParameter.required("table", DataTypes.StringType), + ProcedureParameter.optional("snapshot_id", DataTypes.LongType), }; - private static final StructType OUTPUT_TYPE = new StructType(new StructField[] { - new StructField("snapshot_id", DataTypes.LongType, true, Metadata.empty()), - new StructField("timestamp", DataTypes.LongType, true, Metadata.empty()) - }); + private static final StructType OUTPUT_TYPE = + new StructType( + new StructField[] { + new StructField("snapshot_id", DataTypes.LongType, true, Metadata.empty()), + new StructField("timestamp", DataTypes.LongType, true, Metadata.empty()) + }); private AncestorsOfProcedure(TableCatalog tableCatalog) { super(tableCatalog); @@ -77,11 +79,13 @@ public InternalRow[] call(InternalRow args) { Table icebergTable = sparkTable.table(); if (toSnapshotId == null) { - toSnapshotId = icebergTable.currentSnapshot() != null ? icebergTable.currentSnapshot().snapshotId() : -1; + toSnapshotId = + icebergTable.currentSnapshot() != null ? icebergTable.currentSnapshot().snapshotId() : -1; } - List snapshotIds = Lists.newArrayList( - SnapshotUtil.ancestorIdsBetween(toSnapshotId, null, icebergTable::snapshot)); + List snapshotIds = + Lists.newArrayList( + SnapshotUtil.ancestorIdsBetween(toSnapshotId, null, icebergTable::snapshot)); return toOutputRow(icebergTable, snapshotIds); } diff --git a/spark/v3.0/spark/src/main/java/org/apache/iceberg/spark/procedures/BaseProcedure.java b/spark/v3.0/spark/src/main/java/org/apache/iceberg/spark/procedures/BaseProcedure.java index bbe1fcd72cdc..90231ef6815b 100644 --- a/spark/v3.0/spark/src/main/java/org/apache/iceberg/spark/procedures/BaseProcedure.java +++ b/spark/v3.0/spark/src/main/java/org/apache/iceberg/spark/procedures/BaseProcedure.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.procedures; import java.util.function.Function; @@ -43,7 +42,8 @@ import scala.Option; abstract class BaseProcedure implements Procedure { - protected static final DataType STRING_MAP = DataTypes.createMapType(DataTypes.StringType, DataTypes.StringType); + protected static final DataType STRING_MAP = + DataTypes.createMapType(DataTypes.StringType, DataTypes.StringType); private final SparkSession spark; private final TableCatalog tableCatalog; @@ -78,7 +78,8 @@ protected T withIcebergTable(Identifier ident, Function T execute(Identifier ident, boolean refreshSparkCache, Function func) { + private T execute( + Identifier ident, boolean refreshSparkCache, Function func) { SparkTable sparkTable = loadSparkTable(ident); org.apache.iceberg.Table icebergTable = sparkTable.table(); @@ -92,38 +93,47 @@ private T execute(Identifier ident, boolean refreshSparkCache, Function - * Note: this procedure invalidates all cached Spark plans that reference the affected table. + * A procedure that applies changes in a given snapshot and creates a new snapshot which will be set + * as the current snapshot in a table. + * + *

    Note: this procedure invalidates all cached Spark plans that reference the affected + * table. * * @see org.apache.iceberg.ManageSnapshots#cherrypick(long) */ class CherrypickSnapshotProcedure extends BaseProcedure { - private static final ProcedureParameter[] PARAMETERS = new ProcedureParameter[]{ - ProcedureParameter.required("table", DataTypes.StringType), - ProcedureParameter.required("snapshot_id", DataTypes.LongType) - }; + private static final ProcedureParameter[] PARAMETERS = + new ProcedureParameter[] { + ProcedureParameter.required("table", DataTypes.StringType), + ProcedureParameter.required("snapshot_id", DataTypes.LongType) + }; - private static final StructType OUTPUT_TYPE = new StructType(new StructField[]{ - new StructField("source_snapshot_id", DataTypes.LongType, false, Metadata.empty()), - new StructField("current_snapshot_id", DataTypes.LongType, false, Metadata.empty()) - }); + private static final StructType OUTPUT_TYPE = + new StructType( + new StructField[] { + new StructField("source_snapshot_id", DataTypes.LongType, false, Metadata.empty()), + new StructField("current_snapshot_id", DataTypes.LongType, false, Metadata.empty()) + }); public static ProcedureBuilder builder() { return new BaseProcedure.Builder() { @@ -78,16 +81,16 @@ public InternalRow[] call(InternalRow args) { Identifier tableIdent = toIdentifier(args.getString(0), PARAMETERS[0].name()); long snapshotId = args.getLong(1); - return modifyIcebergTable(tableIdent, table -> { - table.manageSnapshots() - .cherrypick(snapshotId) - .commit(); + return modifyIcebergTable( + tableIdent, + table -> { + table.manageSnapshots().cherrypick(snapshotId).commit(); - Snapshot currentSnapshot = table.currentSnapshot(); + Snapshot currentSnapshot = table.currentSnapshot(); - InternalRow outputRow = newInternalRow(snapshotId, currentSnapshot.snapshotId()); - return new InternalRow[]{outputRow}; - }); + InternalRow outputRow = newInternalRow(snapshotId, currentSnapshot.snapshotId()); + return new InternalRow[] {outputRow}; + }); } @Override diff --git a/spark/v3.0/spark/src/main/java/org/apache/iceberg/spark/procedures/ExpireSnapshotsProcedure.java b/spark/v3.0/spark/src/main/java/org/apache/iceberg/spark/procedures/ExpireSnapshotsProcedure.java index 9c3a8e99c87a..ad8f6f567d88 100644 --- a/spark/v3.0/spark/src/main/java/org/apache/iceberg/spark/procedures/ExpireSnapshotsProcedure.java +++ b/spark/v3.0/spark/src/main/java/org/apache/iceberg/spark/procedures/ExpireSnapshotsProcedure.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
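As the CherrypickSnapshotProcedure hunk above shows, the procedure is a thin wrapper over the table-level snapshot management API referenced by its @see tag. A minimal sketch of that call chain, assuming an already-loaded org.apache.iceberg.Table handle (helper names are illustrative):

```java
import org.apache.iceberg.Snapshot;
import org.apache.iceberg.Table;

// Sketch of the API the procedure delegates to: cherry-pick a snapshot and
// return the id of the resulting current snapshot.
final class CherrypickSketch {
  private CherrypickSketch() {}

  static long cherrypick(Table table, long snapshotId) {
    table.manageSnapshots().cherrypick(snapshotId).commit();
    Snapshot current = table.currentSnapshot();
    return current.snapshotId();
  }
}
```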
*/ - package org.apache.iceberg.spark.procedures; import java.util.concurrent.ExecutorService; @@ -47,19 +46,24 @@ */ public class ExpireSnapshotsProcedure extends BaseProcedure { - private static final ProcedureParameter[] PARAMETERS = new ProcedureParameter[] { - ProcedureParameter.required("table", DataTypes.StringType), - ProcedureParameter.optional("older_than", DataTypes.TimestampType), - ProcedureParameter.optional("retain_last", DataTypes.IntegerType), - ProcedureParameter.optional("max_concurrent_deletes", DataTypes.IntegerType), - ProcedureParameter.optional("stream_results", DataTypes.BooleanType) - }; - - private static final StructType OUTPUT_TYPE = new StructType(new StructField[]{ - new StructField("deleted_data_files_count", DataTypes.LongType, true, Metadata.empty()), - new StructField("deleted_manifest_files_count", DataTypes.LongType, true, Metadata.empty()), - new StructField("deleted_manifest_lists_count", DataTypes.LongType, true, Metadata.empty()) - }); + private static final ProcedureParameter[] PARAMETERS = + new ProcedureParameter[] { + ProcedureParameter.required("table", DataTypes.StringType), + ProcedureParameter.optional("older_than", DataTypes.TimestampType), + ProcedureParameter.optional("retain_last", DataTypes.IntegerType), + ProcedureParameter.optional("max_concurrent_deletes", DataTypes.IntegerType), + ProcedureParameter.optional("stream_results", DataTypes.BooleanType) + }; + + private static final StructType OUTPUT_TYPE = + new StructType( + new StructField[] { + new StructField("deleted_data_files_count", DataTypes.LongType, true, Metadata.empty()), + new StructField( + "deleted_manifest_files_count", DataTypes.LongType, true, Metadata.empty()), + new StructField( + "deleted_manifest_lists_count", DataTypes.LongType, true, Metadata.empty()) + }); public static ProcedureBuilder builder() { return new BaseProcedure.Builder() { @@ -92,41 +96,45 @@ public InternalRow[] call(InternalRow args) { Integer maxConcurrentDeletes = args.isNullAt(3) ? null : args.getInt(3); Boolean streamResult = args.isNullAt(4) ? 
null : args.getBoolean(4); - Preconditions.checkArgument(maxConcurrentDeletes == null || maxConcurrentDeletes > 0, + Preconditions.checkArgument( + maxConcurrentDeletes == null || maxConcurrentDeletes > 0, "max_concurrent_deletes should have value > 0, value: " + maxConcurrentDeletes); - return modifyIcebergTable(tableIdent, table -> { - ExpireSnapshots action = actions().expireSnapshots(table); + return modifyIcebergTable( + tableIdent, + table -> { + ExpireSnapshots action = actions().expireSnapshots(table); - if (olderThanMillis != null) { - action.expireOlderThan(olderThanMillis); - } + if (olderThanMillis != null) { + action.expireOlderThan(olderThanMillis); + } - if (retainLastNum != null) { - action.retainLast(retainLastNum); - } + if (retainLastNum != null) { + action.retainLast(retainLastNum); + } - if (maxConcurrentDeletes != null && maxConcurrentDeletes > 0) { - action.executeDeleteWith(expireService(maxConcurrentDeletes)); - } + if (maxConcurrentDeletes != null && maxConcurrentDeletes > 0) { + action.executeDeleteWith(expireService(maxConcurrentDeletes)); + } - if (streamResult != null) { - action.option(BaseExpireSnapshotsSparkAction.STREAM_RESULTS, Boolean.toString(streamResult)); - } + if (streamResult != null) { + action.option( + BaseExpireSnapshotsSparkAction.STREAM_RESULTS, Boolean.toString(streamResult)); + } - ExpireSnapshots.Result result = action.execute(); + ExpireSnapshots.Result result = action.execute(); - return toOutputRows(result); - }); + return toOutputRows(result); + }); } private InternalRow[] toOutputRows(ExpireSnapshots.Result result) { - InternalRow row = newInternalRow( - result.deletedDataFilesCount(), - result.deletedManifestsCount(), - result.deletedManifestListsCount() - ); - return new InternalRow[]{row}; + InternalRow row = + newInternalRow( + result.deletedDataFilesCount(), + result.deletedManifestsCount(), + result.deletedManifestListsCount()); + return new InternalRow[] {row}; } @Override @@ -136,10 +144,9 @@ public String description() { private ExecutorService expireService(int concurrentDeletes) { return MoreExecutors.getExitingExecutorService( - (ThreadPoolExecutor) Executors.newFixedThreadPool( - concurrentDeletes, - new ThreadFactoryBuilder() - .setNameFormat("expire-snapshots-%d") - .build())); + (ThreadPoolExecutor) + Executors.newFixedThreadPool( + concurrentDeletes, + new ThreadFactoryBuilder().setNameFormat("expire-snapshots-%d").build())); } } diff --git a/spark/v3.0/spark/src/main/java/org/apache/iceberg/spark/procedures/MigrateTableProcedure.java b/spark/v3.0/spark/src/main/java/org/apache/iceberg/spark/procedures/MigrateTableProcedure.java index 2f6841924f8c..a49dd7d526b0 100644 --- a/spark/v3.0/spark/src/main/java/org/apache/iceberg/spark/procedures/MigrateTableProcedure.java +++ b/spark/v3.0/spark/src/main/java/org/apache/iceberg/spark/procedures/MigrateTableProcedure.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
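The ExpireSnapshotsProcedure hunk above simply forwards its optional arguments to the ExpireSnapshots action. A hedged sketch of the equivalent direct action call; SparkActions.get() is the entry point also used in the MigrateTableProcedure hunk below, the variable names are illustrative, and the import locations are assumed:

```java
import org.apache.iceberg.Table;
import org.apache.iceberg.actions.ExpireSnapshots;
import org.apache.iceberg.spark.actions.SparkActions;

// Sketch of the action the procedure wraps: expire snapshots older than a
// timestamp while retaining the last N, then return the deletion counts.
final class ExpireSnapshotsSketch {
  private ExpireSnapshotsSketch() {}

  static ExpireSnapshots.Result expire(Table table, long olderThanMillis, int retainLast) {
    return SparkActions.get()
        .expireSnapshots(table)
        .expireOlderThan(olderThanMillis)
        .retainLast(retainLast)
        .execute();
  }
}
```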
*/ - package org.apache.iceberg.spark.procedures; import java.util.Map; @@ -35,14 +34,17 @@ import scala.runtime.BoxedUnit; class MigrateTableProcedure extends BaseProcedure { - private static final ProcedureParameter[] PARAMETERS = new ProcedureParameter[]{ - ProcedureParameter.required("table", DataTypes.StringType), - ProcedureParameter.optional("properties", STRING_MAP) - }; + private static final ProcedureParameter[] PARAMETERS = + new ProcedureParameter[] { + ProcedureParameter.required("table", DataTypes.StringType), + ProcedureParameter.optional("properties", STRING_MAP) + }; - private static final StructType OUTPUT_TYPE = new StructType(new StructField[]{ - new StructField("migrated_files_count", DataTypes.LongType, false, Metadata.empty()) - }); + private static final StructType OUTPUT_TYPE = + new StructType( + new StructField[] { + new StructField("migrated_files_count", DataTypes.LongType, false, Metadata.empty()) + }); private MigrateTableProcedure(TableCatalog tableCatalog) { super(tableCatalog); @@ -70,19 +72,24 @@ public StructType outputType() { @Override public InternalRow[] call(InternalRow args) { String tableName = args.getString(0); - Preconditions.checkArgument(tableName != null && !tableName.isEmpty(), + Preconditions.checkArgument( + tableName != null && !tableName.isEmpty(), "Cannot handle an empty identifier for argument table"); Map properties = Maps.newHashMap(); if (!args.isNullAt(1)) { - args.getMap(1).foreach(DataTypes.StringType, DataTypes.StringType, - (k, v) -> { - properties.put(k.toString(), v.toString()); - return BoxedUnit.UNIT; - }); + args.getMap(1) + .foreach( + DataTypes.StringType, + DataTypes.StringType, + (k, v) -> { + properties.put(k.toString(), v.toString()); + return BoxedUnit.UNIT; + }); } - MigrateTable.Result result = SparkActions.get().migrateTable(tableName).tableProperties(properties).execute(); + MigrateTable.Result result = + SparkActions.get().migrateTable(tableName).tableProperties(properties).execute(); return new InternalRow[] {newInternalRow(result.migratedDataFilesCount())}; } diff --git a/spark/v3.0/spark/src/main/java/org/apache/iceberg/spark/procedures/PublishChangesProcedure.java b/spark/v3.0/spark/src/main/java/org/apache/iceberg/spark/procedures/PublishChangesProcedure.java index a7d8b344a8db..a216e5abaf7f 100644 --- a/spark/v3.0/spark/src/main/java/org/apache/iceberg/spark/procedures/PublishChangesProcedure.java +++ b/spark/v3.0/spark/src/main/java/org/apache/iceberg/spark/procedures/PublishChangesProcedure.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.procedures; import java.util.Optional; @@ -35,24 +34,28 @@ import org.apache.spark.sql.types.StructType; /** - * A procedure that applies changes in a snapshot created within a Write-Audit-Publish workflow with a wap_id and - * creates a new snapshot which will be set as the current snapshot in a table. - *

    - * Note: this procedure invalidates all cached Spark plans that reference the affected table. + * A procedure that applies changes in a snapshot created within a Write-Audit-Publish workflow with + * a wap_id and creates a new snapshot which will be set as the current snapshot in a table. + * + *

    Note: this procedure invalidates all cached Spark plans that reference the affected + * table. * * @see org.apache.iceberg.ManageSnapshots#cherrypick(long) */ class PublishChangesProcedure extends BaseProcedure { - private static final ProcedureParameter[] PARAMETERS = new ProcedureParameter[]{ - ProcedureParameter.required("table", DataTypes.StringType), - ProcedureParameter.required("wap_id", DataTypes.StringType) - }; + private static final ProcedureParameter[] PARAMETERS = + new ProcedureParameter[] { + ProcedureParameter.required("table", DataTypes.StringType), + ProcedureParameter.required("wap_id", DataTypes.StringType) + }; - private static final StructType OUTPUT_TYPE = new StructType(new StructField[]{ - new StructField("source_snapshot_id", DataTypes.LongType, false, Metadata.empty()), - new StructField("current_snapshot_id", DataTypes.LongType, false, Metadata.empty()) - }); + private static final StructType OUTPUT_TYPE = + new StructType( + new StructField[] { + new StructField("source_snapshot_id", DataTypes.LongType, false, Metadata.empty()), + new StructField("current_snapshot_id", DataTypes.LongType, false, Metadata.empty()) + }); public static ProcedureBuilder builder() { return new Builder() { @@ -82,23 +85,27 @@ public InternalRow[] call(InternalRow args) { Identifier tableIdent = toIdentifier(args.getString(0), PARAMETERS[0].name()); String wapId = args.getString(1); - return modifyIcebergTable(tableIdent, table -> { - Optional wapSnapshot = Optional.ofNullable( - Iterables.find(table.snapshots(), snapshot -> wapId.equals(WapUtil.stagedWapId(snapshot)), null)); - if (!wapSnapshot.isPresent()) { - throw new ValidationException(String.format("Cannot apply unknown WAP ID '%s'", wapId)); - } + return modifyIcebergTable( + tableIdent, + table -> { + Optional wapSnapshot = + Optional.ofNullable( + Iterables.find( + table.snapshots(), + snapshot -> wapId.equals(WapUtil.stagedWapId(snapshot)), + null)); + if (!wapSnapshot.isPresent()) { + throw new ValidationException(String.format("Cannot apply unknown WAP ID '%s'", wapId)); + } - long wapSnapshotId = wapSnapshot.get().snapshotId(); - table.manageSnapshots() - .cherrypick(wapSnapshotId) - .commit(); + long wapSnapshotId = wapSnapshot.get().snapshotId(); + table.manageSnapshots().cherrypick(wapSnapshotId).commit(); - Snapshot currentSnapshot = table.currentSnapshot(); + Snapshot currentSnapshot = table.currentSnapshot(); - InternalRow outputRow = newInternalRow(wapSnapshotId, currentSnapshot.snapshotId()); - return new InternalRow[]{outputRow}; - }); + InternalRow outputRow = newInternalRow(wapSnapshotId, currentSnapshot.snapshotId()); + return new InternalRow[] {outputRow}; + }); } @Override diff --git a/spark/v3.0/spark/src/main/java/org/apache/iceberg/spark/procedures/RemoveOrphanFilesProcedure.java b/spark/v3.0/spark/src/main/java/org/apache/iceberg/spark/procedures/RemoveOrphanFilesProcedure.java index 1f6397ae4ddd..3fd29254ff3c 100644 --- a/spark/v3.0/spark/src/main/java/org/apache/iceberg/spark/procedures/RemoveOrphanFilesProcedure.java +++ b/spark/v3.0/spark/src/main/java/org/apache/iceberg/spark/procedures/RemoveOrphanFilesProcedure.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.spark.procedures; import java.util.concurrent.ExecutorService; @@ -49,17 +48,20 @@ */ public class RemoveOrphanFilesProcedure extends BaseProcedure { - private static final ProcedureParameter[] PARAMETERS = new ProcedureParameter[]{ - ProcedureParameter.required("table", DataTypes.StringType), - ProcedureParameter.optional("older_than", DataTypes.TimestampType), - ProcedureParameter.optional("location", DataTypes.StringType), - ProcedureParameter.optional("dry_run", DataTypes.BooleanType), - ProcedureParameter.optional("max_concurrent_deletes", DataTypes.IntegerType) - }; - - private static final StructType OUTPUT_TYPE = new StructType(new StructField[]{ - new StructField("orphan_file_location", DataTypes.StringType, false, Metadata.empty()) - }); + private static final ProcedureParameter[] PARAMETERS = + new ProcedureParameter[] { + ProcedureParameter.required("table", DataTypes.StringType), + ProcedureParameter.optional("older_than", DataTypes.TimestampType), + ProcedureParameter.optional("location", DataTypes.StringType), + ProcedureParameter.optional("dry_run", DataTypes.BooleanType), + ProcedureParameter.optional("max_concurrent_deletes", DataTypes.IntegerType) + }; + + private static final StructType OUTPUT_TYPE = + new StructType( + new StructField[] { + new StructField("orphan_file_location", DataTypes.StringType, false, Metadata.empty()) + }); public static ProcedureBuilder builder() { return new BaseProcedure.Builder() { @@ -92,36 +94,39 @@ public InternalRow[] call(InternalRow args) { boolean dryRun = args.isNullAt(3) ? false : args.getBoolean(3); Integer maxConcurrentDeletes = args.isNullAt(4) ? null : args.getInt(4); - Preconditions.checkArgument(maxConcurrentDeletes == null || maxConcurrentDeletes > 0, - "max_concurrent_deletes should have value > 0, value: " + maxConcurrentDeletes); + Preconditions.checkArgument( + maxConcurrentDeletes == null || maxConcurrentDeletes > 0, + "max_concurrent_deletes should have value > 0, value: " + maxConcurrentDeletes); - return withIcebergTable(tableIdent, table -> { - DeleteOrphanFiles action = actions().deleteOrphanFiles(table); + return withIcebergTable( + tableIdent, + table -> { + DeleteOrphanFiles action = actions().deleteOrphanFiles(table); - if (olderThanMillis != null) { - boolean isTesting = Boolean.parseBoolean(spark().conf().get("spark.testing", "false")); - if (!isTesting) { - validateInterval(olderThanMillis); - } - action.olderThan(olderThanMillis); - } + if (olderThanMillis != null) { + boolean isTesting = Boolean.parseBoolean(spark().conf().get("spark.testing", "false")); + if (!isTesting) { + validateInterval(olderThanMillis); + } + action.olderThan(olderThanMillis); + } - if (location != null) { - action.location(location); - } + if (location != null) { + action.location(location); + } - if (dryRun) { - action.deleteWith(file -> { }); - } + if (dryRun) { + action.deleteWith(file -> {}); + } - if (maxConcurrentDeletes != null && maxConcurrentDeletes > 0) { - action.executeDeleteWith(removeService(maxConcurrentDeletes)); - } + if (maxConcurrentDeletes != null && maxConcurrentDeletes > 0) { + action.executeDeleteWith(removeService(maxConcurrentDeletes)); + } - DeleteOrphanFiles.Result result = action.execute(); + DeleteOrphanFiles.Result result = action.execute(); - return toOutputRows(result); - }); + return toOutputRows(result); + }); } private InternalRow[] toOutputRows(DeleteOrphanFiles.Result result) { @@ -143,21 +148,20 @@ private void validateInterval(long olderThanMillis) { long 
intervalMillis = System.currentTimeMillis() - olderThanMillis; if (intervalMillis < TimeUnit.DAYS.toMillis(1)) { throw new IllegalArgumentException( - "Cannot remove orphan files with an interval less than 24 hours. Executing this " + - "procedure with a short interval may corrupt the table if other operations are happening " + - "at the same time. If you are absolutely confident that no concurrent operations will be " + - "affected by removing orphan files with such a short interval, you can use the Action API " + - "to remove orphan files with an arbitrary interval."); + "Cannot remove orphan files with an interval less than 24 hours. Executing this " + + "procedure with a short interval may corrupt the table if other operations are happening " + + "at the same time. If you are absolutely confident that no concurrent operations will be " + + "affected by removing orphan files with such a short interval, you can use the Action API " + + "to remove orphan files with an arbitrary interval."); } } private ExecutorService removeService(int concurrentDeletes) { return MoreExecutors.getExitingExecutorService( - (ThreadPoolExecutor) Executors.newFixedThreadPool( - concurrentDeletes, - new ThreadFactoryBuilder() - .setNameFormat("remove-orphans-%d") - .build())); + (ThreadPoolExecutor) + Executors.newFixedThreadPool( + concurrentDeletes, + new ThreadFactoryBuilder().setNameFormat("remove-orphans-%d").build())); } @Override diff --git a/spark/v3.0/spark/src/main/java/org/apache/iceberg/spark/procedures/RewriteDataFilesProcedure.java b/spark/v3.0/spark/src/main/java/org/apache/iceberg/spark/procedures/RewriteDataFilesProcedure.java index b33eab6b5b3c..d703ae6d8177 100644 --- a/spark/v3.0/spark/src/main/java/org/apache/iceberg/spark/procedures/RewriteDataFilesProcedure.java +++ b/spark/v3.0/spark/src/main/java/org/apache/iceberg/spark/procedures/RewriteDataFilesProcedure.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
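The RemoveOrphanFilesProcedure hunk above configures the DeleteOrphanFiles action, including the dry-run case where deleteWith is given a no-op consumer so nothing is actually removed. A hedged sketch of that dry-run configuration (helper and variable names are illustrative, import locations assumed):

```java
import org.apache.iceberg.Table;
import org.apache.iceberg.actions.DeleteOrphanFiles;
import org.apache.iceberg.spark.actions.SparkActions;

// Sketch of a dry run: orphan files older than the timestamp are reported in
// the result, but the no-op deleteWith consumer keeps them on disk.
final class OrphanFilesSketch {
  private OrphanFilesSketch() {}

  static DeleteOrphanFiles.Result dryRun(Table table, long olderThanMillis) {
    return SparkActions.get()
        .deleteOrphanFiles(table)
        .olderThan(olderThanMillis)
        .deleteWith(file -> {})
        .execute();
  }
}
```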
*/ - package org.apache.iceberg.spark.procedures; import java.util.Map; @@ -48,19 +47,24 @@ */ class RewriteDataFilesProcedure extends BaseProcedure { - private static final ProcedureParameter[] PARAMETERS = new ProcedureParameter[]{ - ProcedureParameter.required("table", DataTypes.StringType), - ProcedureParameter.optional("strategy", DataTypes.StringType), - ProcedureParameter.optional("sort_order", DataTypes.StringType), - ProcedureParameter.optional("options", STRING_MAP), - ProcedureParameter.optional("where", DataTypes.StringType) - }; + private static final ProcedureParameter[] PARAMETERS = + new ProcedureParameter[] { + ProcedureParameter.required("table", DataTypes.StringType), + ProcedureParameter.optional("strategy", DataTypes.StringType), + ProcedureParameter.optional("sort_order", DataTypes.StringType), + ProcedureParameter.optional("options", STRING_MAP), + ProcedureParameter.optional("where", DataTypes.StringType) + }; // counts are not nullable since the action result is never null - private static final StructType OUTPUT_TYPE = new StructType(new StructField[]{ - new StructField("rewritten_data_files_count", DataTypes.IntegerType, false, Metadata.empty()), - new StructField("added_data_files_count", DataTypes.IntegerType, false, Metadata.empty()) - }); + private static final StructType OUTPUT_TYPE = + new StructType( + new StructField[] { + new StructField( + "rewritten_data_files_count", DataTypes.IntegerType, false, Metadata.empty()), + new StructField( + "added_data_files_count", DataTypes.IntegerType, false, Metadata.empty()) + }); public static ProcedureBuilder builder() { return new Builder() { @@ -89,36 +93,40 @@ public StructType outputType() { public InternalRow[] call(InternalRow args) { Identifier tableIdent = toIdentifier(args.getString(0), PARAMETERS[0].name()); - return modifyIcebergTable(tableIdent, table -> { - RewriteDataFiles action = actions().rewriteDataFiles(table); + return modifyIcebergTable( + tableIdent, + table -> { + RewriteDataFiles action = actions().rewriteDataFiles(table); - String strategy = args.isNullAt(1) ? null : args.getString(1); - String sortOrderString = args.isNullAt(2) ? null : args.getString(2); - SortOrder sortOrder = null; - if (sortOrderString != null) { - sortOrder = collectSortOrders(table, sortOrderString); - } - if (strategy != null || sortOrder != null) { - action = checkAndApplyStrategy(action, strategy, sortOrder); - } + String strategy = args.isNullAt(1) ? null : args.getString(1); + String sortOrderString = args.isNullAt(2) ? null : args.getString(2); + SortOrder sortOrder = null; + if (sortOrderString != null) { + sortOrder = collectSortOrders(table, sortOrderString); + } + if (strategy != null || sortOrder != null) { + action = checkAndApplyStrategy(action, strategy, sortOrder); + } - if (!args.isNullAt(3)) { - action = checkAndApplyOptions(args, action); - } + if (!args.isNullAt(3)) { + action = checkAndApplyOptions(args, action); + } - String where = args.isNullAt(4) ? null : args.getString(4); - action = checkAndApplyFilter(action, where, table.name()); + String where = args.isNullAt(4) ? 
null : args.getString(4); + action = checkAndApplyFilter(action, where, table.name()); - RewriteDataFiles.Result result = action.execute(); + RewriteDataFiles.Result result = action.execute(); - return toOutputRows(result); - }); + return toOutputRows(result); + }); } - private RewriteDataFiles checkAndApplyFilter(RewriteDataFiles action, String where, String tableName) { + private RewriteDataFiles checkAndApplyFilter( + RewriteDataFiles action, String where, String tableName) { if (where != null) { try { - Expression expression = SparkExpressionConverter.collectResolvedSparkExpression(spark(), tableName, where); + Expression expression = + SparkExpressionConverter.collectResolvedSparkExpression(spark(), tableName, where); return action.filter(SparkExpressionConverter.convertToIcebergExpression(expression)); } catch (AnalysisException e) { throw new IllegalArgumentException("Cannot parse predicates in where option: " + where); @@ -129,7 +137,10 @@ private RewriteDataFiles checkAndApplyFilter(RewriteDataFiles action, String whe private RewriteDataFiles checkAndApplyOptions(InternalRow args, RewriteDataFiles action) { Map options = Maps.newHashMap(); - args.getMap(3).foreach(DataTypes.StringType, DataTypes.StringType, + args.getMap(3) + .foreach( + DataTypes.StringType, + DataTypes.StringType, (k, v) -> { options.put(k.toString(), v.toString()); return BoxedUnit.UNIT; @@ -137,33 +148,38 @@ private RewriteDataFiles checkAndApplyOptions(InternalRow args, RewriteDataFiles return action.options(options); } - private RewriteDataFiles checkAndApplyStrategy(RewriteDataFiles action, String strategy, SortOrder sortOrder) { - // caller of this function ensures that between strategy and sortOrder, at least one of them is not null. + private RewriteDataFiles checkAndApplyStrategy( + RewriteDataFiles action, String strategy, SortOrder sortOrder) { + // caller of this function ensures that between strategy and sortOrder, at least one of them is + // not null. if (strategy == null || strategy.equalsIgnoreCase("sort")) { return action.sort(sortOrder); } if (strategy.equalsIgnoreCase("binpack")) { RewriteDataFiles rewriteDataFiles = action.binPack(); if (sortOrder != null) { - // calling below method to throw the error as user has set both binpack strategy and sort order + // calling below method to throw the error as user has set both binpack strategy and sort + // order return rewriteDataFiles.sort(sortOrder); } return rewriteDataFiles; } else { - throw new IllegalArgumentException("unsupported strategy: " + strategy + ". Only binpack,sort is supported"); + throw new IllegalArgumentException( + "unsupported strategy: " + strategy + ". Only binpack,sort is supported"); } } private SortOrder collectSortOrders(Table table, String sortOrderStr) { String prefix = "ALTER TABLE temp WRITE ORDERED BY "; try { - // Note: Reusing the existing Iceberg sql parser to avoid implementing the custom parser for sort orders. + // Note: Reusing the existing Iceberg sql parser to avoid implementing the custom parser for + // sort orders. // To reuse the existing parser, adding a prefix of "ALTER TABLE temp WRITE ORDERED BY" // along with input sort order and parsing it as a plan to collect the sortOrder. 
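The RewriteDataFilesProcedure hunks above translate the strategy, options, and where arguments into calls on the RewriteDataFiles action. A hedged sketch of a direct bin-pack invocation with a filter; the column name in the expression is purely illustrative and the import locations are assumed:

```java
import org.apache.iceberg.Table;
import org.apache.iceberg.actions.RewriteDataFiles;
import org.apache.iceberg.expressions.Expressions;
import org.apache.iceberg.spark.actions.SparkActions;

// Sketch of the action the procedure configures: bin-pack rewrite restricted
// to files matching an Iceberg filter expression.
final class RewriteDataFilesSketch {
  private RewriteDataFilesSketch() {}

  static RewriteDataFiles.Result rewrite(Table table) {
    return SparkActions.get()
        .rewriteDataFiles(table)
        .binPack()
        .filter(Expressions.equal("event_date", "2021-01-01"))
        .execute();
  }
}
```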
LogicalPlan logicalPlan = spark().sessionState().sqlParser().parsePlan(prefix + sortOrderStr); - return (new SortOrderParserUtil()).collectSortOrder( - table.schema(), - ((SetWriteDistributionAndOrdering) logicalPlan).sortOrder()); + return (new SortOrderParserUtil()) + .collectSortOrder( + table.schema(), ((SetWriteDistributionAndOrdering) logicalPlan).sortOrder()); } catch (AnalysisException ex) { throw new IllegalArgumentException("Unable to parse sortOrder: " + sortOrderStr); } @@ -173,7 +189,7 @@ private InternalRow[] toOutputRows(RewriteDataFiles.Result result) { int rewrittenDataFilesCount = result.rewrittenDataFilesCount(); int addedDataFilesCount = result.addedDataFilesCount(); InternalRow row = newInternalRow(rewrittenDataFilesCount, addedDataFilesCount); - return new InternalRow[]{row}; + return new InternalRow[] {row}; } @Override diff --git a/spark/v3.0/spark/src/main/java/org/apache/iceberg/spark/procedures/RewriteManifestsProcedure.java b/spark/v3.0/spark/src/main/java/org/apache/iceberg/spark/procedures/RewriteManifestsProcedure.java index eae7ce0fca97..abe287f5a746 100644 --- a/spark/v3.0/spark/src/main/java/org/apache/iceberg/spark/procedures/RewriteManifestsProcedure.java +++ b/spark/v3.0/spark/src/main/java/org/apache/iceberg/spark/procedures/RewriteManifestsProcedure.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.procedures; import org.apache.iceberg.Table; @@ -35,23 +34,28 @@ /** * A procedure that rewrites manifests in a table. - *
<p> - * Note: this procedure invalidates all cached Spark plans that reference the affected table. + * + * <p>
    Note: this procedure invalidates all cached Spark plans that reference the affected + * table. * * @see SparkActions#rewriteManifests(Table) () */ class RewriteManifestsProcedure extends BaseProcedure { - private static final ProcedureParameter[] PARAMETERS = new ProcedureParameter[]{ - ProcedureParameter.required("table", DataTypes.StringType), - ProcedureParameter.optional("use_caching", DataTypes.BooleanType) - }; + private static final ProcedureParameter[] PARAMETERS = + new ProcedureParameter[] { + ProcedureParameter.required("table", DataTypes.StringType), + ProcedureParameter.optional("use_caching", DataTypes.BooleanType) + }; // counts are not nullable since the action result is never null - private static final StructType OUTPUT_TYPE = new StructType(new StructField[]{ - new StructField("rewritten_manifests_count", DataTypes.IntegerType, false, Metadata.empty()), - new StructField("added_manifests_count", DataTypes.IntegerType, false, Metadata.empty()) - }); + private static final StructType OUTPUT_TYPE = + new StructType( + new StructField[] { + new StructField( + "rewritten_manifests_count", DataTypes.IntegerType, false, Metadata.empty()), + new StructField("added_manifests_count", DataTypes.IntegerType, false, Metadata.empty()) + }); public static ProcedureBuilder builder() { return new BaseProcedure.Builder() { @@ -81,24 +85,26 @@ public InternalRow[] call(InternalRow args) { Identifier tableIdent = toIdentifier(args.getString(0), PARAMETERS[0].name()); Boolean useCaching = args.isNullAt(1) ? null : args.getBoolean(1); - return modifyIcebergTable(tableIdent, table -> { - RewriteManifests action = actions().rewriteManifests(table); + return modifyIcebergTable( + tableIdent, + table -> { + RewriteManifests action = actions().rewriteManifests(table); - if (useCaching != null) { - action.option("use-caching", useCaching.toString()); - } + if (useCaching != null) { + action.option("use-caching", useCaching.toString()); + } - RewriteManifests.Result result = action.execute(); + RewriteManifests.Result result = action.execute(); - return toOutputRows(result); - }); + return toOutputRows(result); + }); } private InternalRow[] toOutputRows(RewriteManifests.Result result) { int rewrittenManifestsCount = Iterables.size(result.rewrittenManifests()); int addedManifestsCount = Iterables.size(result.addedManifests()); InternalRow row = newInternalRow(rewrittenManifestsCount, addedManifestsCount); - return new InternalRow[]{row}; + return new InternalRow[] {row}; } @Override diff --git a/spark/v3.0/spark/src/main/java/org/apache/iceberg/spark/procedures/RollbackToSnapshotProcedure.java b/spark/v3.0/spark/src/main/java/org/apache/iceberg/spark/procedures/RollbackToSnapshotProcedure.java index 7cf5b0c77bb2..49cc1a5ceae3 100644 --- a/spark/v3.0/spark/src/main/java/org/apache/iceberg/spark/procedures/RollbackToSnapshotProcedure.java +++ b/spark/v3.0/spark/src/main/java/org/apache/iceberg/spark/procedures/RollbackToSnapshotProcedure.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.procedures; import org.apache.iceberg.Snapshot; @@ -32,22 +31,26 @@ /** * A procedure that rollbacks a table to a specific snapshot id. - *
<p> - * Note: this procedure invalidates all cached Spark plans that reference the affected table. + * + * <p>
    Note: this procedure invalidates all cached Spark plans that reference the affected + * table. * * @see org.apache.iceberg.ManageSnapshots#rollbackTo(long) */ class RollbackToSnapshotProcedure extends BaseProcedure { - private static final ProcedureParameter[] PARAMETERS = new ProcedureParameter[]{ - ProcedureParameter.required("table", DataTypes.StringType), - ProcedureParameter.required("snapshot_id", DataTypes.LongType) - }; + private static final ProcedureParameter[] PARAMETERS = + new ProcedureParameter[] { + ProcedureParameter.required("table", DataTypes.StringType), + ProcedureParameter.required("snapshot_id", DataTypes.LongType) + }; - private static final StructType OUTPUT_TYPE = new StructType(new StructField[]{ - new StructField("previous_snapshot_id", DataTypes.LongType, false, Metadata.empty()), - new StructField("current_snapshot_id", DataTypes.LongType, false, Metadata.empty()) - }); + private static final StructType OUTPUT_TYPE = + new StructType( + new StructField[] { + new StructField("previous_snapshot_id", DataTypes.LongType, false, Metadata.empty()), + new StructField("current_snapshot_id", DataTypes.LongType, false, Metadata.empty()) + }); public static ProcedureBuilder builder() { return new BaseProcedure.Builder() { @@ -77,16 +80,16 @@ public InternalRow[] call(InternalRow args) { Identifier tableIdent = toIdentifier(args.getString(0), PARAMETERS[0].name()); long snapshotId = args.getLong(1); - return modifyIcebergTable(tableIdent, table -> { - Snapshot previousSnapshot = table.currentSnapshot(); + return modifyIcebergTable( + tableIdent, + table -> { + Snapshot previousSnapshot = table.currentSnapshot(); - table.manageSnapshots() - .rollbackTo(snapshotId) - .commit(); + table.manageSnapshots().rollbackTo(snapshotId).commit(); - InternalRow outputRow = newInternalRow(previousSnapshot.snapshotId(), snapshotId); - return new InternalRow[]{outputRow}; - }); + InternalRow outputRow = newInternalRow(previousSnapshot.snapshotId(), snapshotId); + return new InternalRow[] {outputRow}; + }); } @Override diff --git a/spark/v3.0/spark/src/main/java/org/apache/iceberg/spark/procedures/RollbackToTimestampProcedure.java b/spark/v3.0/spark/src/main/java/org/apache/iceberg/spark/procedures/RollbackToTimestampProcedure.java index 519a46c6dbb8..059725f0c152 100644 --- a/spark/v3.0/spark/src/main/java/org/apache/iceberg/spark/procedures/RollbackToTimestampProcedure.java +++ b/spark/v3.0/spark/src/main/java/org/apache/iceberg/spark/procedures/RollbackToTimestampProcedure.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.procedures; import org.apache.iceberg.Snapshot; @@ -33,22 +32,26 @@ /** * A procedure that rollbacks a table to a given point in time. - *
<p> - * Note: this procedure invalidates all cached Spark plans that reference the affected table. + * + * <p>
    Note: this procedure invalidates all cached Spark plans that reference the affected + * table. * * @see org.apache.iceberg.ManageSnapshots#rollbackToTime(long) */ class RollbackToTimestampProcedure extends BaseProcedure { - private static final ProcedureParameter[] PARAMETERS = new ProcedureParameter[]{ - ProcedureParameter.required("table", DataTypes.StringType), - ProcedureParameter.required("timestamp", DataTypes.TimestampType) - }; + private static final ProcedureParameter[] PARAMETERS = + new ProcedureParameter[] { + ProcedureParameter.required("table", DataTypes.StringType), + ProcedureParameter.required("timestamp", DataTypes.TimestampType) + }; - private static final StructType OUTPUT_TYPE = new StructType(new StructField[]{ - new StructField("previous_snapshot_id", DataTypes.LongType, false, Metadata.empty()), - new StructField("current_snapshot_id", DataTypes.LongType, false, Metadata.empty()) - }); + private static final StructType OUTPUT_TYPE = + new StructType( + new StructField[] { + new StructField("previous_snapshot_id", DataTypes.LongType, false, Metadata.empty()), + new StructField("current_snapshot_id", DataTypes.LongType, false, Metadata.empty()) + }); public static ProcedureBuilder builder() { return new BaseProcedure.Builder() { @@ -79,18 +82,19 @@ public InternalRow[] call(InternalRow args) { // timestamps in Spark have microsecond precision so this conversion is lossy long timestampMillis = DateTimeUtil.microsToMillis(args.getLong(1)); - return modifyIcebergTable(tableIdent, table -> { - Snapshot previousSnapshot = table.currentSnapshot(); + return modifyIcebergTable( + tableIdent, + table -> { + Snapshot previousSnapshot = table.currentSnapshot(); - table.manageSnapshots() - .rollbackToTime(timestampMillis) - .commit(); + table.manageSnapshots().rollbackToTime(timestampMillis).commit(); - Snapshot currentSnapshot = table.currentSnapshot(); + Snapshot currentSnapshot = table.currentSnapshot(); - InternalRow outputRow = newInternalRow(previousSnapshot.snapshotId(), currentSnapshot.snapshotId()); - return new InternalRow[]{outputRow}; - }); + InternalRow outputRow = + newInternalRow(previousSnapshot.snapshotId(), currentSnapshot.snapshotId()); + return new InternalRow[] {outputRow}; + }); } @Override diff --git a/spark/v3.0/spark/src/main/java/org/apache/iceberg/spark/procedures/SetCurrentSnapshotProcedure.java b/spark/v3.0/spark/src/main/java/org/apache/iceberg/spark/procedures/SetCurrentSnapshotProcedure.java index 274ca19fc107..f8f8049c22b6 100644 --- a/spark/v3.0/spark/src/main/java/org/apache/iceberg/spark/procedures/SetCurrentSnapshotProcedure.java +++ b/spark/v3.0/spark/src/main/java/org/apache/iceberg/spark/procedures/SetCurrentSnapshotProcedure.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.procedures; import org.apache.iceberg.Snapshot; @@ -32,22 +31,26 @@ /** * A procedure that sets the current snapshot in a table. - *
<p> - * Note: this procedure invalidates all cached Spark plans that reference the affected table. + * + * <p>
    Note: this procedure invalidates all cached Spark plans that reference the affected + * table. * * @see org.apache.iceberg.ManageSnapshots#setCurrentSnapshot(long) */ class SetCurrentSnapshotProcedure extends BaseProcedure { - private static final ProcedureParameter[] PARAMETERS = new ProcedureParameter[] { - ProcedureParameter.required("table", DataTypes.StringType), - ProcedureParameter.required("snapshot_id", DataTypes.LongType) - }; + private static final ProcedureParameter[] PARAMETERS = + new ProcedureParameter[] { + ProcedureParameter.required("table", DataTypes.StringType), + ProcedureParameter.required("snapshot_id", DataTypes.LongType) + }; - private static final StructType OUTPUT_TYPE = new StructType(new StructField[]{ - new StructField("previous_snapshot_id", DataTypes.LongType, true, Metadata.empty()), - new StructField("current_snapshot_id", DataTypes.LongType, false, Metadata.empty()) - }); + private static final StructType OUTPUT_TYPE = + new StructType( + new StructField[] { + new StructField("previous_snapshot_id", DataTypes.LongType, true, Metadata.empty()), + new StructField("current_snapshot_id", DataTypes.LongType, false, Metadata.empty()) + }); public static ProcedureBuilder builder() { return new BaseProcedure.Builder() { @@ -77,17 +80,17 @@ public InternalRow[] call(InternalRow args) { Identifier tableIdent = toIdentifier(args.getString(0), PARAMETERS[0].name()); long snapshotId = args.getLong(1); - return modifyIcebergTable(tableIdent, table -> { - Snapshot previousSnapshot = table.currentSnapshot(); - Long previousSnapshotId = previousSnapshot != null ? previousSnapshot.snapshotId() : null; + return modifyIcebergTable( + tableIdent, + table -> { + Snapshot previousSnapshot = table.currentSnapshot(); + Long previousSnapshotId = previousSnapshot != null ? previousSnapshot.snapshotId() : null; - table.manageSnapshots() - .setCurrentSnapshot(snapshotId) - .commit(); + table.manageSnapshots().setCurrentSnapshot(snapshotId).commit(); - InternalRow outputRow = newInternalRow(previousSnapshotId, snapshotId); - return new InternalRow[]{outputRow}; - }); + InternalRow outputRow = newInternalRow(previousSnapshotId, snapshotId); + return new InternalRow[] {outputRow}; + }); } @Override diff --git a/spark/v3.0/spark/src/main/java/org/apache/iceberg/spark/procedures/SnapshotTableProcedure.java b/spark/v3.0/spark/src/main/java/org/apache/iceberg/spark/procedures/SnapshotTableProcedure.java index 96e293d6b1da..7a015a51e8ed 100644 --- a/spark/v3.0/spark/src/main/java/org/apache/iceberg/spark/procedures/SnapshotTableProcedure.java +++ b/spark/v3.0/spark/src/main/java/org/apache/iceberg/spark/procedures/SnapshotTableProcedure.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.spark.procedures; import java.util.Map; @@ -34,16 +33,19 @@ import scala.runtime.BoxedUnit; class SnapshotTableProcedure extends BaseProcedure { - private static final ProcedureParameter[] PARAMETERS = new ProcedureParameter[]{ - ProcedureParameter.required("source_table", DataTypes.StringType), - ProcedureParameter.required("table", DataTypes.StringType), - ProcedureParameter.optional("location", DataTypes.StringType), - ProcedureParameter.optional("properties", STRING_MAP) - }; + private static final ProcedureParameter[] PARAMETERS = + new ProcedureParameter[] { + ProcedureParameter.required("source_table", DataTypes.StringType), + ProcedureParameter.required("table", DataTypes.StringType), + ProcedureParameter.optional("location", DataTypes.StringType), + ProcedureParameter.optional("properties", STRING_MAP) + }; - private static final StructType OUTPUT_TYPE = new StructType(new StructField[]{ - new StructField("imported_files_count", DataTypes.LongType, false, Metadata.empty()) - }); + private static final StructType OUTPUT_TYPE = + new StructType( + new StructField[] { + new StructField("imported_files_count", DataTypes.LongType, false, Metadata.empty()) + }); private SnapshotTableProcedure(TableCatalog tableCatalog) { super(tableCatalog); @@ -71,23 +73,28 @@ public StructType outputType() { @Override public InternalRow[] call(InternalRow args) { String source = args.getString(0); - Preconditions.checkArgument(source != null && !source.isEmpty(), + Preconditions.checkArgument( + source != null && !source.isEmpty(), "Cannot handle an empty identifier for argument source_table"); String dest = args.getString(1); - Preconditions.checkArgument(dest != null && !dest.isEmpty(), - "Cannot handle an empty identifier for argument table"); + Preconditions.checkArgument( + dest != null && !dest.isEmpty(), "Cannot handle an empty identifier for argument table"); String snapshotLocation = args.isNullAt(2) ? null : args.getString(2); Map properties = Maps.newHashMap(); if (!args.isNullAt(3)) { - args.getMap(3).foreach(DataTypes.StringType, DataTypes.StringType, - (k, v) -> { - properties.put(k.toString(), v.toString()); - return BoxedUnit.UNIT; - }); + args.getMap(3) + .foreach( + DataTypes.StringType, + DataTypes.StringType, + (k, v) -> { + properties.put(k.toString(), v.toString()); + return BoxedUnit.UNIT; + }); } - Preconditions.checkArgument(!source.equals(dest), + Preconditions.checkArgument( + !source.equals(dest), "Cannot create a snapshot with the same name as the source of the snapshot."); SnapshotTable action = SparkActions.get().snapshotTable(source).as(dest); @@ -103,5 +110,4 @@ public InternalRow[] call(InternalRow args) { public String description() { return "SnapshotTableProcedure"; } - } diff --git a/spark/v3.0/spark/src/main/java/org/apache/iceberg/spark/procedures/SparkProcedures.java b/spark/v3.0/spark/src/main/java/org/apache/iceberg/spark/procedures/SparkProcedures.java index d481c19d59a1..47b46cade7c0 100644 --- a/spark/v3.0/spark/src/main/java/org/apache/iceberg/spark/procedures/SparkProcedures.java +++ b/spark/v3.0/spark/src/main/java/org/apache/iceberg/spark/procedures/SparkProcedures.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.spark.procedures; import java.util.Locale; @@ -30,8 +29,7 @@ public class SparkProcedures { private static final Map> BUILDERS = initProcedureBuilders(); - private SparkProcedures() { - } + private SparkProcedures() {} public static ProcedureBuilder newBuilder(String name) { // procedure resolution is case insensitive to match the existing Spark behavior for functions @@ -59,6 +57,7 @@ private static Map> initProcedureBuilders() { public interface ProcedureBuilder { ProcedureBuilder withTableCatalog(TableCatalog tableCatalog); + Procedure build(); } } diff --git a/spark/v3.0/spark/src/main/java/org/apache/iceberg/spark/source/BaseDataReader.java b/spark/v3.0/spark/src/main/java/org/apache/iceberg/spark/source/BaseDataReader.java index b58745c7a00d..2cab8ee238e0 100644 --- a/spark/v3.0/spark/src/main/java/org/apache/iceberg/spark/source/BaseDataReader.java +++ b/spark/v3.0/spark/src/main/java/org/apache/iceberg/spark/source/BaseDataReader.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.source; import java.io.Closeable; @@ -76,10 +75,16 @@ abstract class BaseDataReader implements Closeable { this.tasks = task.files().iterator(); Map keyMetadata = Maps.newHashMap(); task.files().stream() - .flatMap(fileScanTask -> Stream.concat(Stream.of(fileScanTask.file()), fileScanTask.deletes().stream())) + .flatMap( + fileScanTask -> + Stream.concat(Stream.of(fileScanTask.file()), fileScanTask.deletes().stream())) .forEach(file -> keyMetadata.put(file.path().toString(), file.keyMetadata())); - Stream encrypted = keyMetadata.entrySet().stream() - .map(entry -> EncryptedFiles.encryptedInput(table.io().newInputFile(entry.getKey()), entry.getValue())); + Stream encrypted = + keyMetadata.entrySet().stream() + .map( + entry -> + EncryptedFiles.encryptedInput( + table.io().newInputFile(entry.getKey()), entry.getValue())); // decrypt with the batch call to avoid multiple RPCs to a key server, if possible Iterable decryptedFiles = table.encryption().decrypt(encrypted::iterator); @@ -188,7 +193,8 @@ protected static Object convertConstant(Type type, Object value) { for (int index = 0; index < fields.size(); index++) { NestedField field = fields.get(index); Type fieldType = field.type(); - values[index] = convertConstant(fieldType, struct.get(index, fieldType.typeId().javaClass())); + values[index] = + convertConstant(fieldType, struct.get(index, fieldType.typeId().javaClass())); } return new GenericInternalRow(values); diff --git a/spark/v3.0/spark/src/main/java/org/apache/iceberg/spark/source/BatchDataReader.java b/spark/v3.0/spark/src/main/java/org/apache/iceberg/spark/source/BatchDataReader.java index e4bd3ceba6ce..d620faa979f6 100644 --- a/spark/v3.0/spark/src/main/java/org/apache/iceberg/spark/source/BatchDataReader.java +++ b/spark/v3.0/spark/src/main/java/org/apache/iceberg/spark/source/BatchDataReader.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.spark.source; import java.util.Map; @@ -50,7 +49,8 @@ class BatchDataReader extends BaseDataReader { private final boolean caseSensitive; private final int batchSize; - BatchDataReader(CombinedScanTask task, Table table, Schema expectedSchema, boolean caseSensitive, int size) { + BatchDataReader( + CombinedScanTask task, Table table, Schema expectedSchema, boolean caseSensitive, int size) { super(table, task); this.expectedSchema = expectedSchema; this.nameMapping = table.properties().get(TableProperties.DEFAULT_NAME_MAPPING); @@ -71,18 +71,26 @@ CloseableIterator open(FileScanTask task) { InputFile location = getInputFile(task); Preconditions.checkNotNull(location, "Could not find InputFile associated with FileScanTask"); if (task.file().format() == FileFormat.PARQUET) { - Parquet.ReadBuilder builder = Parquet.read(location) - .project(expectedSchema) - .split(task.start(), task.length()) - .createBatchedReaderFunc(fileSchema -> VectorizedSparkParquetReaders.buildReader(expectedSchema, - fileSchema, /* setArrowValidityVector */ NullCheckingForGet.NULL_CHECKING_ENABLED, idToConstant)) - .recordsPerBatch(batchSize) - .filter(task.residual()) - .caseSensitive(caseSensitive) - // Spark eagerly consumes the batches. So the underlying memory allocated could be reused - // without worrying about subsequent reads clobbering over each other. This improves - // read performance as every batch read doesn't have to pay the cost of allocating memory. - .reuseContainers(); + Parquet.ReadBuilder builder = + Parquet.read(location) + .project(expectedSchema) + .split(task.start(), task.length()) + .createBatchedReaderFunc( + fileSchema -> + VectorizedSparkParquetReaders.buildReader( + expectedSchema, + fileSchema, /* setArrowValidityVector */ + NullCheckingForGet.NULL_CHECKING_ENABLED, + idToConstant)) + .recordsPerBatch(batchSize) + .filter(task.residual()) + .caseSensitive(caseSensitive) + // Spark eagerly consumes the batches. So the underlying memory allocated could be + // reused + // without worrying about subsequent reads clobbering over each other. This improves + // read performance as every batch read doesn't have to pay the cost of allocating + // memory. 
+ .reuseContainers(); if (nameMapping != null) { builder.withNameMapping(NameMappingParser.fromJson(nameMapping)); @@ -92,16 +100,21 @@ CloseableIterator open(FileScanTask task) { } else if (task.file().format() == FileFormat.ORC) { Set constantFieldIds = idToConstant.keySet(); Set metadataFieldIds = MetadataColumns.metadataFieldIds(); - Sets.SetView constantAndMetadataFieldIds = Sets.union(constantFieldIds, metadataFieldIds); - Schema schemaWithoutConstantAndMetadataFields = TypeUtil.selectNot(expectedSchema, constantAndMetadataFieldIds); - ORC.ReadBuilder builder = ORC.read(location) - .project(schemaWithoutConstantAndMetadataFields) - .split(task.start(), task.length()) - .createBatchedReaderFunc(fileSchema -> VectorizedSparkOrcReaders.buildReader(expectedSchema, fileSchema, - idToConstant)) - .recordsPerBatch(batchSize) - .filter(task.residual()) - .caseSensitive(caseSensitive); + Sets.SetView constantAndMetadataFieldIds = + Sets.union(constantFieldIds, metadataFieldIds); + Schema schemaWithoutConstantAndMetadataFields = + TypeUtil.selectNot(expectedSchema, constantAndMetadataFieldIds); + ORC.ReadBuilder builder = + ORC.read(location) + .project(schemaWithoutConstantAndMetadataFields) + .split(task.start(), task.length()) + .createBatchedReaderFunc( + fileSchema -> + VectorizedSparkOrcReaders.buildReader( + expectedSchema, fileSchema, idToConstant)) + .recordsPerBatch(batchSize) + .filter(task.residual()) + .caseSensitive(caseSensitive); if (nameMapping != null) { builder.withNameMapping(NameMappingParser.fromJson(nameMapping)); diff --git a/spark/v3.0/spark/src/main/java/org/apache/iceberg/spark/source/EqualityDeleteRowReader.java b/spark/v3.0/spark/src/main/java/org/apache/iceberg/spark/source/EqualityDeleteRowReader.java index ce2226f4f75e..1c55e1b8ebe2 100644 --- a/spark/v3.0/spark/src/main/java/org/apache/iceberg/spark/source/EqualityDeleteRowReader.java +++ b/spark/v3.0/spark/src/main/java/org/apache/iceberg/spark/source/EqualityDeleteRowReader.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.source; import java.util.Map; @@ -32,7 +31,8 @@ public class EqualityDeleteRowReader extends RowDataReader { private final Schema expectedSchema; - public EqualityDeleteRowReader(CombinedScanTask task, Table table, Schema expectedSchema, boolean caseSensitive) { + public EqualityDeleteRowReader( + CombinedScanTask task, Table table, Schema expectedSchema, boolean caseSensitive) { super(task, table, table.schema(), caseSensitive); this.expectedSchema = expectedSchema; } diff --git a/spark/v3.0/spark/src/main/java/org/apache/iceberg/spark/source/IcebergSource.java b/spark/v3.0/spark/src/main/java/org/apache/iceberg/spark/source/IcebergSource.java index 7aa66b2223fc..a3c46a6694f4 100644 --- a/spark/v3.0/spark/src/main/java/org/apache/iceberg/spark/source/IcebergSource.java +++ b/spark/v3.0/spark/src/main/java/org/apache/iceberg/spark/source/IcebergSource.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.source; import java.util.Arrays; @@ -43,17 +42,17 @@ /** * The IcebergSource loads/writes tables with format "iceberg". It can load paths and tables. * - * How paths/tables are loaded when using spark.read().format("iceberg").path(table) + *
<p>
    How paths/tables are loaded when using spark.read().format("iceberg").path(table) * - * table = "file:/path/to/table" -> loads a HadoopTable at given path - * table = "tablename" -> loads currentCatalog.currentNamespace.tablename - * table = "catalog.tablename" -> load "tablename" from the specified catalog. - * table = "namespace.tablename" -> load "namespace.tablename" from current catalog - * table = "catalog.namespace.tablename" -> "namespace.tablename" from the specified catalog. - * table = "namespace1.namespace2.tablename" -> load "namespace1.namespace2.tablename" from current catalog + *
<p>
    table = "file:/path/to/table" -> loads a HadoopTable at given path table = "tablename" + * -> loads currentCatalog.currentNamespace.tablename table = "catalog.tablename" -> load + * "tablename" from the specified catalog. table = "namespace.tablename" -> load + * "namespace.tablename" from current catalog table = "catalog.namespace.tablename" -> + * "namespace.tablename" from the specified catalog. table = "namespace1.namespace2.tablename" -> + * load "namespace1.namespace2.tablename" from current catalog * - * The above list is in order of priority. For example: a matching catalog will take priority over any namespace - * resolution. + *
<p>
    The above list is in order of priority. For example: a matching catalog will take priority + * over any namespace resolution. */ public class IcebergSource implements DataSourceRegister, SupportsCatalogOptions { private static final String DEFAULT_CATALOG_NAME = "default_iceberg"; @@ -83,7 +82,8 @@ public boolean supportsExternalMetadata() { @Override public Table getTable(StructType schema, Transform[] partitioning, Map options) { - Spark3Util.CatalogAndIdentifier catalogIdentifier = catalogAndIdentifier(new CaseInsensitiveStringMap(options)); + Spark3Util.CatalogAndIdentifier catalogIdentifier = + catalogAndIdentifier(new CaseInsensitiveStringMap(options)); CatalogPlugin catalog = catalogIdentifier.catalog(); Identifier ident = catalogIdentifier.identifier(); @@ -92,12 +92,16 @@ public Table getTable(StructType schema, Transform[] partitioning, Map config = ImmutableMap.of( - "type", "hive", - "default-namespace", "default", - "cache-enabled", "false" // the source should not use a cache - ); + ImmutableMap config = + ImmutableMap.of( + "type", "hive", + "default-namespace", "default", + "cache-enabled", "false" // the source should not use a cache + ); String catalogName = "org.apache.iceberg.spark.SparkCatalog"; spark.conf().set(DEFAULT_CATALOG, catalogName); config.forEach((key, value) -> spark.conf().set(DEFAULT_CATALOG + "." + key, value)); diff --git a/spark/v3.0/spark/src/main/java/org/apache/iceberg/spark/source/InternalRowWrapper.java b/spark/v3.0/spark/src/main/java/org/apache/iceberg/spark/source/InternalRowWrapper.java index ef1eb08d873c..524266f6f83a 100644 --- a/spark/v3.0/spark/src/main/java/org/apache/iceberg/spark/source/InternalRowWrapper.java +++ b/spark/v3.0/spark/src/main/java/org/apache/iceberg/spark/source/InternalRowWrapper.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.source; import java.nio.ByteBuffer; @@ -32,8 +31,8 @@ import org.apache.spark.sql.types.StructType; /** - * Class to adapt a Spark {@code InternalRow} to Iceberg {@link StructLike} for uses like - * {@link org.apache.iceberg.PartitionKey#partition(StructLike)} + * Class to adapt a Spark {@code InternalRow} to Iceberg {@link StructLike} for uses like {@link + * org.apache.iceberg.PartitionKey#partition(StructLike)} */ class InternalRowWrapper implements StructLike { private final DataType[] types; @@ -42,12 +41,8 @@ class InternalRowWrapper implements StructLike { @SuppressWarnings("unchecked") InternalRowWrapper(StructType rowType) { - this.types = Stream.of(rowType.fields()) - .map(StructField::dataType) - .toArray(DataType[]::new); - this.getters = Stream.of(types) - .map(InternalRowWrapper::getter) - .toArray(BiFunction[]::new); + this.types = Stream.of(rowType.fields()).map(StructField::dataType).toArray(DataType[]::new); + this.getters = Stream.of(types).map(InternalRowWrapper::getter).toArray(BiFunction[]::new); } InternalRowWrapper wrap(InternalRow internalRow) { diff --git a/spark/v3.0/spark/src/main/java/org/apache/iceberg/spark/source/RowDataReader.java b/spark/v3.0/spark/src/main/java/org/apache/iceberg/spark/source/RowDataReader.java index 4f5962494feb..f206149da30e 100644 --- a/spark/v3.0/spark/src/main/java/org/apache/iceberg/spark/source/RowDataReader.java +++ b/spark/v3.0/spark/src/main/java/org/apache/iceberg/spark/source/RowDataReader.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.spark.source; import java.util.Map; @@ -81,7 +80,8 @@ protected Schema tableSchema() { return tableSchema; } - protected CloseableIterable open(FileScanTask task, Schema readSchema, Map idToConstant) { + protected CloseableIterable open( + FileScanTask task, Schema readSchema, Map idToConstant) { CloseableIterable iter; if (task.isDataTask()) { iter = newDataIterable(task.asDataTask(), readSchema); @@ -112,15 +112,14 @@ protected CloseableIterable open(FileScanTask task, Schema readSche } private CloseableIterable newAvroIterable( - InputFile location, - FileScanTask task, - Schema projection, - Map idToConstant) { - Avro.ReadBuilder builder = Avro.read(location) - .reuseContainers() - .project(projection) - .split(task.start(), task.length()) - .createReaderFunc(readSchema -> new SparkAvroReader(projection, readSchema, idToConstant)); + InputFile location, FileScanTask task, Schema projection, Map idToConstant) { + Avro.ReadBuilder builder = + Avro.read(location) + .reuseContainers() + .project(projection) + .split(task.start(), task.length()) + .createReaderFunc( + readSchema -> new SparkAvroReader(projection, readSchema, idToConstant)); if (nameMapping != null) { builder.withNameMapping(NameMappingParser.fromJson(nameMapping)); @@ -130,17 +129,16 @@ private CloseableIterable newAvroIterable( } private CloseableIterable newParquetIterable( - InputFile location, - FileScanTask task, - Schema readSchema, - Map idToConstant) { - Parquet.ReadBuilder builder = Parquet.read(location) - .reuseContainers() - .split(task.start(), task.length()) - .project(readSchema) - .createReaderFunc(fileSchema -> SparkParquetReaders.buildReader(readSchema, fileSchema, idToConstant)) - .filter(task.residual()) - .caseSensitive(caseSensitive); + InputFile location, FileScanTask task, Schema readSchema, Map idToConstant) { + Parquet.ReadBuilder builder = + Parquet.read(location) + .reuseContainers() + .split(task.start(), task.length()) + .project(readSchema) + .createReaderFunc( + fileSchema -> SparkParquetReaders.buildReader(readSchema, fileSchema, idToConstant)) + .filter(task.residual()) + .caseSensitive(caseSensitive); if (nameMapping != null) { builder.withNameMapping(NameMappingParser.fromJson(nameMapping)); @@ -150,19 +148,19 @@ private CloseableIterable newParquetIterable( } private CloseableIterable newOrcIterable( - InputFile location, - FileScanTask task, - Schema readSchema, - Map idToConstant) { - Schema readSchemaWithoutConstantAndMetadataFields = TypeUtil.selectNot(readSchema, - Sets.union(idToConstant.keySet(), MetadataColumns.metadataFieldIds())); - - ORC.ReadBuilder builder = ORC.read(location) - .project(readSchemaWithoutConstantAndMetadataFields) - .split(task.start(), task.length()) - .createReaderFunc(readOrcSchema -> new SparkOrcReader(readSchema, readOrcSchema, idToConstant)) - .filter(task.residual()) - .caseSensitive(caseSensitive); + InputFile location, FileScanTask task, Schema readSchema, Map idToConstant) { + Schema readSchemaWithoutConstantAndMetadataFields = + TypeUtil.selectNot( + readSchema, Sets.union(idToConstant.keySet(), MetadataColumns.metadataFieldIds())); + + ORC.ReadBuilder builder = + ORC.read(location) + .project(readSchemaWithoutConstantAndMetadataFields) + .split(task.start(), task.length()) + .createReaderFunc( + readOrcSchema -> new SparkOrcReader(readSchema, readOrcSchema, idToConstant)) + .filter(task.residual()) + .caseSensitive(caseSensitive); if (nameMapping != null) { 
builder.withNameMapping(NameMappingParser.fromJson(nameMapping)); @@ -173,8 +171,8 @@ private CloseableIterable newOrcIterable( private CloseableIterable newDataIterable(DataTask task, Schema readSchema) { StructInternalRow row = new StructInternalRow(readSchema.asStruct()); - CloseableIterable asSparkRows = CloseableIterable.transform( - task.asDataTask().rows(), row::setStruct); + CloseableIterable asSparkRows = + CloseableIterable.transform(task.asDataTask().rows(), row::setStruct); return asSparkRows; } diff --git a/spark/v3.0/spark/src/main/java/org/apache/iceberg/spark/source/RowDataRewriter.java b/spark/v3.0/spark/src/main/java/org/apache/iceberg/spark/source/RowDataRewriter.java index 63cc3a466c1a..aee0d4f0586b 100644 --- a/spark/v3.0/spark/src/main/java/org/apache/iceberg/spark/source/RowDataRewriter.java +++ b/spark/v3.0/spark/src/main/java/org/apache/iceberg/spark/source/RowDataRewriter.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.source; import java.io.Serializable; @@ -55,23 +54,25 @@ public class RowDataRewriter implements Serializable { private final FileFormat format; private final boolean caseSensitive; - public RowDataRewriter(Broadcast tableBroadcast, PartitionSpec spec, boolean caseSensitive) { + public RowDataRewriter( + Broadcast
    tableBroadcast, PartitionSpec spec, boolean caseSensitive) { this.tableBroadcast = tableBroadcast; this.spec = spec; this.caseSensitive = caseSensitive; Table table = tableBroadcast.value(); - String formatString = table.properties().getOrDefault( - TableProperties.DEFAULT_FILE_FORMAT, TableProperties.DEFAULT_FILE_FORMAT_DEFAULT); + String formatString = + table + .properties() + .getOrDefault( + TableProperties.DEFAULT_FILE_FORMAT, TableProperties.DEFAULT_FILE_FORMAT_DEFAULT); this.format = FileFormat.valueOf(formatString.toUpperCase(Locale.ENGLISH)); } public List rewriteDataForTasks(JavaRDD taskRDD) { JavaRDD> dataFilesRDD = taskRDD.map(this::rewriteDataForTask); - return dataFilesRDD.collect().stream() - .flatMap(Collection::stream) - .collect(Collectors.toList()); + return dataFilesRDD.collect().stream().flatMap(Collection::stream).collect(Collectors.toList()); } private List rewriteDataForTask(CombinedScanTask task) throws Exception { @@ -86,28 +87,44 @@ private List rewriteDataForTask(CombinedScanTask task) throws Exceptio RowDataReader dataReader = new RowDataReader(task, table, schema, caseSensitive); StructType structType = SparkSchemaUtil.convert(schema); - SparkAppenderFactory appenderFactory = SparkAppenderFactory.builderFor(table, schema, structType) - .spec(spec) - .build(); - OutputFileFactory fileFactory = OutputFileFactory.builderFor(table, partitionId, taskId) - .defaultSpec(spec) - .format(format) - .build(); + SparkAppenderFactory appenderFactory = + SparkAppenderFactory.builderFor(table, schema, structType).spec(spec).build(); + OutputFileFactory fileFactory = + OutputFileFactory.builderFor(table, partitionId, taskId) + .defaultSpec(spec) + .format(format) + .build(); TaskWriter writer; if (spec.isUnpartitioned()) { - writer = new UnpartitionedWriter<>(spec, format, appenderFactory, fileFactory, table.io(), - Long.MAX_VALUE); - } else if (PropertyUtil.propertyAsBoolean(properties, + writer = + new UnpartitionedWriter<>( + spec, format, appenderFactory, fileFactory, table.io(), Long.MAX_VALUE); + } else if (PropertyUtil.propertyAsBoolean( + properties, TableProperties.SPARK_WRITE_PARTITIONED_FANOUT_ENABLED, TableProperties.SPARK_WRITE_PARTITIONED_FANOUT_ENABLED_DEFAULT)) { - writer = new SparkPartitionedFanoutWriter( - spec, format, appenderFactory, fileFactory, table.io(), Long.MAX_VALUE, schema, - structType); + writer = + new SparkPartitionedFanoutWriter( + spec, + format, + appenderFactory, + fileFactory, + table.io(), + Long.MAX_VALUE, + schema, + structType); } else { - writer = new SparkPartitionedWriter( - spec, format, appenderFactory, fileFactory, table.io(), Long.MAX_VALUE, schema, - structType); + writer = + new SparkPartitionedWriter( + spec, + format, + appenderFactory, + fileFactory, + table.io(), + Long.MAX_VALUE, + schema, + structType); } try { @@ -127,14 +144,24 @@ private List rewriteDataForTask(CombinedScanTask task) throws Exceptio LOG.error("Aborting task", originalThrowable); context.markTaskFailed(originalThrowable); - LOG.error("Aborting commit for partition {} (task {}, attempt {}, stage {}.{})", - partitionId, taskId, context.attemptNumber(), context.stageId(), context.stageAttemptNumber()); + LOG.error( + "Aborting commit for partition {} (task {}, attempt {}, stage {}.{})", + partitionId, + taskId, + context.attemptNumber(), + context.stageId(), + context.stageAttemptNumber()); if (dataReader != null) { dataReader.close(); } writer.abort(); - LOG.error("Aborted commit for partition {} (task {}, attempt {}, stage {}.{})", - partitionId, 
taskId, context.taskAttemptId(), context.stageId(), context.stageAttemptNumber()); + LOG.error( + "Aborted commit for partition {} (task {}, attempt {}, stage {}.{})", + partitionId, + taskId, + context.taskAttemptId(), + context.stageId(), + context.stageAttemptNumber()); } catch (Throwable inner) { if (originalThrowable != inner) { diff --git a/spark/v3.0/spark/src/main/java/org/apache/iceberg/spark/source/SerializableTableWithSize.java b/spark/v3.0/spark/src/main/java/org/apache/iceberg/spark/source/SerializableTableWithSize.java index 6275c664410f..e3b81cea7cd1 100644 --- a/spark/v3.0/spark/src/main/java/org/apache/iceberg/spark/source/SerializableTableWithSize.java +++ b/spark/v3.0/spark/src/main/java/org/apache/iceberg/spark/source/SerializableTableWithSize.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.source; import org.apache.iceberg.BaseMetadataTable; @@ -25,9 +24,9 @@ import org.apache.spark.util.KnownSizeEstimation; /** - * This class provides a serializable table with a known size estimate. Spark calls - * its SizeEstimator class when broadcasting variables and this can be an expensive - * operation, so providing a known size estimate allows that operation to be skipped. + * This class provides a serializable table with a known size estimate. Spark calls its + * SizeEstimator class when broadcasting variables and this can be an expensive operation, so + * providing a known size estimate allows that operation to be skipped. */ public class SerializableTableWithSize extends SerializableTable implements KnownSizeEstimation { diff --git a/spark/v3.0/spark/src/main/java/org/apache/iceberg/spark/source/SparkAppenderFactory.java b/spark/v3.0/spark/src/main/java/org/apache/iceberg/spark/source/SparkAppenderFactory.java index 4becf666ed3e..6372edde0782 100644 --- a/spark/v3.0/spark/src/main/java/org/apache/iceberg/spark/source/SparkAppenderFactory.java +++ b/spark/v3.0/spark/src/main/java/org/apache/iceberg/spark/source/SparkAppenderFactory.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.spark.source; import java.io.IOException; @@ -61,8 +60,14 @@ class SparkAppenderFactory implements FileAppenderFactory { private StructType eqDeleteSparkType = null; private StructType posDeleteSparkType = null; - SparkAppenderFactory(Map properties, Schema writeSchema, StructType dsSchema, PartitionSpec spec, - int[] equalityFieldIds, Schema eqDeleteRowSchema, Schema posDeleteRowSchema) { + SparkAppenderFactory( + Map properties, + Schema writeSchema, + StructType dsSchema, + PartitionSpec spec, + int[] equalityFieldIds, + Schema eqDeleteRowSchema, + Schema posDeleteRowSchema) { this.properties = properties; this.writeSchema = writeSchema; this.dsSchema = dsSchema; @@ -85,7 +90,6 @@ static class Builder { private Schema eqDeleteRowSchema; private Schema posDeleteRowSchema; - Builder(Table table, Schema writeSchema, StructType dsSchema) { this.table = table; this.spec = table.spec(); @@ -118,16 +122,24 @@ SparkAppenderFactory build() { Preconditions.checkNotNull(writeSchema, "Write Schema must not be null"); Preconditions.checkNotNull(dsSchema, "DS Schema must not be null"); if (equalityFieldIds != null) { - Preconditions.checkNotNull(eqDeleteRowSchema, "Equality Field Ids and Equality Delete Row Schema" + - " must be set together"); + Preconditions.checkNotNull( + eqDeleteRowSchema, + "Equality Field Ids and Equality Delete Row Schema" + " must be set together"); } if (eqDeleteRowSchema != null) { - Preconditions.checkNotNull(equalityFieldIds, "Equality Field Ids and Equality Delete Row Schema" + - " must be set together"); + Preconditions.checkNotNull( + equalityFieldIds, + "Equality Field Ids and Equality Delete Row Schema" + " must be set together"); } - return new SparkAppenderFactory(table.properties(), writeSchema, dsSchema, spec, equalityFieldIds, - eqDeleteRowSchema, posDeleteRowSchema); + return new SparkAppenderFactory( + table.properties(), + writeSchema, + dsSchema, + spec, + equalityFieldIds, + eqDeleteRowSchema, + posDeleteRowSchema); } } @@ -141,7 +153,8 @@ private StructType lazyEqDeleteSparkType() { private StructType lazyPosDeleteSparkType() { if (posDeleteSparkType == null) { - Preconditions.checkNotNull(posDeleteRowSchema, "Position delete row schema shouldn't be null"); + Preconditions.checkNotNull( + posDeleteRowSchema, "Position delete row schema shouldn't be null"); this.posDeleteSparkType = SparkSchemaUtil.convert(posDeleteRowSchema); } return posDeleteSparkType; @@ -187,24 +200,33 @@ public FileAppender newAppender(OutputFile file, FileFormat fileFor } @Override - public DataWriter newDataWriter(EncryptedOutputFile file, FileFormat format, StructLike partition) { - return new DataWriter<>(newAppender(file.encryptingOutputFile(), format), format, - file.encryptingOutputFile().location(), spec, partition, file.keyMetadata()); + public DataWriter newDataWriter( + EncryptedOutputFile file, FileFormat format, StructLike partition) { + return new DataWriter<>( + newAppender(file.encryptingOutputFile(), format), + format, + file.encryptingOutputFile().location(), + spec, + partition, + file.keyMetadata()); } @Override - public EqualityDeleteWriter newEqDeleteWriter(EncryptedOutputFile file, FileFormat format, - StructLike partition) { - Preconditions.checkState(equalityFieldIds != null && equalityFieldIds.length > 0, + public EqualityDeleteWriter newEqDeleteWriter( + EncryptedOutputFile file, FileFormat format, StructLike partition) { + Preconditions.checkState( + equalityFieldIds != null && equalityFieldIds.length > 0, "Equality field 
ids shouldn't be null or empty when creating equality-delete writer"); - Preconditions.checkNotNull(eqDeleteRowSchema, + Preconditions.checkNotNull( + eqDeleteRowSchema, "Equality delete row schema shouldn't be null when creating equality-delete writer"); try { switch (format) { case PARQUET: return Parquet.writeDeletes(file.encryptingOutputFile()) - .createWriterFunc(msgType -> SparkParquetWriters.buildWriter(lazyEqDeleteSparkType(), msgType)) + .createWriterFunc( + msgType -> SparkParquetWriters.buildWriter(lazyEqDeleteSparkType(), msgType)) .overwrite() .rowSchema(eqDeleteRowSchema) .withSpec(spec) @@ -245,15 +267,16 @@ public EqualityDeleteWriter newEqDeleteWriter(EncryptedOutputFile f } @Override - public PositionDeleteWriter newPosDeleteWriter(EncryptedOutputFile file, FileFormat format, - StructLike partition) { + public PositionDeleteWriter newPosDeleteWriter( + EncryptedOutputFile file, FileFormat format, StructLike partition) { try { switch (format) { case PARQUET: StructType sparkPosDeleteSchema = SparkSchemaUtil.convert(DeleteSchemaUtil.posDeleteSchema(posDeleteRowSchema)); return Parquet.writeDeletes(file.encryptingOutputFile()) - .createWriterFunc(msgType -> SparkParquetWriters.buildWriter(sparkPosDeleteSchema, msgType)) + .createWriterFunc( + msgType -> SparkParquetWriters.buildWriter(sparkPosDeleteSchema, msgType)) .overwrite() .rowSchema(posDeleteRowSchema) .withSpec(spec) diff --git a/spark/v3.0/spark/src/main/java/org/apache/iceberg/spark/source/SparkBatchQueryScan.java b/spark/v3.0/spark/src/main/java/org/apache/iceberg/spark/source/SparkBatchQueryScan.java index 356271248c35..4fcab5517d44 100644 --- a/spark/v3.0/spark/src/main/java/org/apache/iceberg/spark/source/SparkBatchQueryScan.java +++ b/spark/v3.0/spark/src/main/java/org/apache/iceberg/spark/source/SparkBatchQueryScan.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.spark.source; import java.io.IOException; @@ -49,8 +48,14 @@ class SparkBatchQueryScan extends SparkBatchScan { private List tasks = null; // lazy cache of tasks - SparkBatchQueryScan(SparkSession spark, Table table, SparkReadConf readConf, boolean caseSensitive, - Schema expectedSchema, List filters, CaseInsensitiveStringMap options) { + SparkBatchQueryScan( + SparkSession spark, + Table table, + SparkReadConf readConf, + boolean caseSensitive, + Schema expectedSchema, + List filters, + CaseInsensitiveStringMap options) { super(spark, table, readConf, caseSensitive, expectedSchema, filters, options); @@ -67,26 +72,28 @@ class SparkBatchQueryScan extends SparkBatchScan { if (snapshotId != null || asOfTimestamp != null) { if (startSnapshotId != null || endSnapshotId != null) { throw new IllegalArgumentException( - "Cannot specify start-snapshot-id and end-snapshot-id to do incremental scan when either " + - SparkReadOptions.SNAPSHOT_ID + " or " + SparkReadOptions.AS_OF_TIMESTAMP + " is specified"); + "Cannot specify start-snapshot-id and end-snapshot-id to do incremental scan when either " + + SparkReadOptions.SNAPSHOT_ID + + " or " + + SparkReadOptions.AS_OF_TIMESTAMP + + " is specified"); } } else if (startSnapshotId == null && endSnapshotId != null) { - throw new IllegalArgumentException("Cannot only specify option end-snapshot-id to do incremental scan"); + throw new IllegalArgumentException( + "Cannot only specify option end-snapshot-id to do incremental scan"); } // look for split behavior overrides in options this.splitSize = Spark3Util.propertyAsLong(options, SparkReadOptions.SPLIT_SIZE, null); this.splitLookback = Spark3Util.propertyAsInt(options, SparkReadOptions.LOOKBACK, null); - this.splitOpenFileCost = Spark3Util.propertyAsLong(options, SparkReadOptions.FILE_OPEN_COST, null); + this.splitOpenFileCost = + Spark3Util.propertyAsLong(options, SparkReadOptions.FILE_OPEN_COST, null); } @Override protected List tasks() { if (tasks == null) { - TableScan scan = table() - .newScan() - .caseSensitive(caseSensitive()) - .project(expectedSchema()); + TableScan scan = table().newScan().caseSensitive(caseSensitive()).project(expectedSchema()); if (snapshotId != null) { scan = scan.useSnapshot(snapshotId); @@ -122,7 +129,7 @@ protected List tasks() { try (CloseableIterable tasksIterable = scan.planTasks()) { this.tasks = Lists.newArrayList(tasksIterable); - } catch (IOException e) { + } catch (IOException e) { throw new RuntimeIOException(e, "Failed to close table scan: %s", scan); } } @@ -141,19 +148,25 @@ public boolean equals(Object o) { } SparkBatchQueryScan that = (SparkBatchQueryScan) o; - return table().name().equals(that.table().name()) && - readSchema().equals(that.readSchema()) && // compare Spark schemas to ignore field ids - filterExpressions().toString().equals(that.filterExpressions().toString()) && - Objects.equals(snapshotId, that.snapshotId) && - Objects.equals(startSnapshotId, that.startSnapshotId) && - Objects.equals(endSnapshotId, that.endSnapshotId) && - Objects.equals(asOfTimestamp, that.asOfTimestamp); + return table().name().equals(that.table().name()) + && readSchema().equals(that.readSchema()) + && // compare Spark schemas to ignore field ids + filterExpressions().toString().equals(that.filterExpressions().toString()) + && Objects.equals(snapshotId, that.snapshotId) + && Objects.equals(startSnapshotId, that.startSnapshotId) + && Objects.equals(endSnapshotId, that.endSnapshotId) + && Objects.equals(asOfTimestamp, that.asOfTimestamp); } 
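// Illustrative note, not part of this patch (identifiers and ids are placeholders): the snapshot
// and timestamp fields compared above come from Spark read options, e.g.
//   spark.read().format("iceberg").option("snapshot-id", "10963874102873").load("db.table");
//   spark.read().format("iceberg").option("as-of-timestamp", "1650000000000").load("db.table");
//   spark.read().format("iceberg")
//       .option("start-snapshot-id", "100").option("end-snapshot-id", "200").load("db.table");
// as validated in the constructor, the incremental pair cannot be combined with snapshot-id or
// as-of-timestamp, and end-snapshot-id cannot be supplied on its own.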
@Override public int hashCode() { return Objects.hash( - table().name(), readSchema(), filterExpressions().toString(), snapshotId, startSnapshotId, endSnapshotId, + table().name(), + readSchema(), + filterExpressions().toString(), + snapshotId, + startSnapshotId, + endSnapshotId, asOfTimestamp); } diff --git a/spark/v3.0/spark/src/main/java/org/apache/iceberg/spark/source/SparkBatchScan.java b/spark/v3.0/spark/src/main/java/org/apache/iceberg/spark/source/SparkBatchScan.java index ce5c6a219087..3b3d62d96226 100644 --- a/spark/v3.0/spark/src/main/java/org/apache/iceberg/spark/source/SparkBatchScan.java +++ b/spark/v3.0/spark/src/main/java/org/apache/iceberg/spark/source/SparkBatchScan.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.source; import java.io.Serializable; @@ -77,8 +76,14 @@ abstract class SparkBatchScan implements Scan, Batch, SupportsReportStatistics { // lazy variables private StructType readSchema = null; - SparkBatchScan(SparkSession spark, Table table, SparkReadConf readConf, boolean caseSensitive, - Schema expectedSchema, List filters, CaseInsensitiveStringMap options) { + SparkBatchScan( + SparkSession spark, + Table table, + SparkReadConf readConf, + boolean caseSensitive, + Schema expectedSchema, + List filters, + CaseInsensitiveStringMap options) { this.sparkContext = JavaSparkContext.fromSparkContext(spark.sparkContext()); this.table = table; this.readConf = readConf; @@ -122,8 +127,9 @@ public MicroBatchStream toMicroBatchStream(String checkpointLocation) { @Override public StructType readSchema() { if (readSchema == null) { - Preconditions.checkArgument(readTimestampWithoutZone || !SparkUtil.hasTimestampWithoutZone(expectedSchema), - SparkUtil.TIMESTAMP_WITHOUT_TIMEZONE_ERROR); + Preconditions.checkArgument( + readTimestampWithoutZone || !SparkUtil.hasTimestampWithoutZone(expectedSchema), + SparkUtil.TIMESTAMP_WITHOUT_TIMEZONE_ERROR); this.readSchema = SparkSchemaUtil.convert(expectedSchema); } return readSchema; @@ -134,7 +140,8 @@ public InputPartition[] planInputPartitions() { String expectedSchemaString = SchemaParser.toJson(expectedSchema); // broadcast the table metadata as input partitions will be sent to executors - Broadcast
<Table> tableBroadcast = sparkContext.broadcast(SerializableTableWithSize.copyOf(table)); + Broadcast<Table>
    tableBroadcast = + sparkContext.broadcast(SerializableTableWithSize.copyOf(table)); List scanTasks = tasks(); InputPartition[] readTasks = new InputPartition[scanTasks.size()]; @@ -142,9 +149,15 @@ public InputPartition[] planInputPartitions() { Tasks.range(readTasks.length) .stopOnFailure() .executeWith(localityPreferred ? ThreadPools.getWorkerPool() : null) - .run(index -> readTasks[index] = new ReadTask( - scanTasks.get(index), tableBroadcast, expectedSchemaString, - caseSensitive, localityPreferred)); + .run( + index -> + readTasks[index] = + new ReadTask( + scanTasks.get(index), + tableBroadcast, + expectedSchemaString, + caseSensitive, + localityPreferred)); return readTasks; } @@ -153,28 +166,38 @@ public InputPartition[] planInputPartitions() { public PartitionReaderFactory createReaderFactory() { boolean allParquetFileScanTasks = tasks().stream() - .allMatch(combinedScanTask -> !combinedScanTask.isDataTask() && combinedScanTask.files() - .stream() - .allMatch(fileScanTask -> fileScanTask.file().format().equals( - FileFormat.PARQUET))); + .allMatch( + combinedScanTask -> + !combinedScanTask.isDataTask() + && combinedScanTask.files().stream() + .allMatch( + fileScanTask -> + fileScanTask.file().format().equals(FileFormat.PARQUET))); boolean allOrcFileScanTasks = tasks().stream() - .allMatch(combinedScanTask -> !combinedScanTask.isDataTask() && combinedScanTask.files() - .stream() - .allMatch(fileScanTask -> fileScanTask.file().format().equals( - FileFormat.ORC))); + .allMatch( + combinedScanTask -> + !combinedScanTask.isDataTask() + && combinedScanTask.files().stream() + .allMatch( + fileScanTask -> + fileScanTask.file().format().equals(FileFormat.ORC))); boolean atLeastOneColumn = expectedSchema.columns().size() > 0; - boolean onlyPrimitives = expectedSchema.columns().stream().allMatch(c -> c.type().isPrimitiveType()); + boolean onlyPrimitives = + expectedSchema.columns().stream().allMatch(c -> c.type().isPrimitiveType()); boolean hasNoDeleteFiles = tasks().stream().noneMatch(TableScanUtil::hasDeletes); boolean batchReadsEnabled = batchReadsEnabled(allParquetFileScanTasks, allOrcFileScanTasks); - boolean readUsingBatch = batchReadsEnabled && hasNoDeleteFiles && (allOrcFileScanTasks || - (allParquetFileScanTasks && atLeastOneColumn && onlyPrimitives)); + boolean readUsingBatch = + batchReadsEnabled + && hasNoDeleteFiles + && (allOrcFileScanTasks + || (allParquetFileScanTasks && atLeastOneColumn && onlyPrimitives)); int batchSize = readUsingBatch ? 
batchSize(allParquetFileScanTasks, allOrcFileScanTasks) : 0; @@ -208,14 +231,16 @@ public Statistics estimateStatistics() { return new Stats(0L, 0L); } - // estimate stats using snapshot summary only for partitioned tables (metadata tables are unpartitioned) + // estimate stats using snapshot summary only for partitioned tables (metadata tables are + // unpartitioned) if (!table.spec().isUnpartitioned() && filterExpressions.isEmpty()) { LOG.debug("using table metadata to estimate table statistics"); - long totalRecords = PropertyUtil.propertyAsLong(table.currentSnapshot().summary(), - SnapshotSummary.TOTAL_RECORDS_PROP, Long.MAX_VALUE); - return new Stats( - SparkSchemaUtil.estimateSize(readSchema(), totalRecords), - totalRecords); + long totalRecords = + PropertyUtil.propertyAsLong( + table.currentSnapshot().summary(), + SnapshotSummary.TOTAL_RECORDS_PROP, + Long.MAX_VALUE); + return new Stats(SparkSchemaUtil.estimateSize(readSchema(), totalRecords), totalRecords); } long numRows = 0L; @@ -234,7 +259,8 @@ public Statistics estimateStatistics() { @Override public String description() { - String filters = filterExpressions.stream().map(Spark3Util::describe).collect(Collectors.joining(", ")); + String filters = + filterExpressions.stream().map(Spark3Util::describe).collect(Collectors.joining(", ")); return String.format("%s [filters=%s]", table, filters); } @@ -275,7 +301,8 @@ private static class RowReader extends RowDataReader implements PartitionReader< } } - private static class BatchReader extends BatchDataReader implements PartitionReader { + private static class BatchReader extends BatchDataReader + implements PartitionReader { BatchReader(ReadTask task, int batchSize) { super(task.task, task.table(), task.expectedSchema(), task.isCaseSensitive(), batchSize); } @@ -290,8 +317,12 @@ static class ReadTask implements InputPartition, Serializable { private transient Schema expectedSchema = null; private transient String[] preferredLocations = null; - ReadTask(CombinedScanTask task, Broadcast
    tableBroadcast, String expectedSchemaString, - boolean caseSensitive, boolean localityPreferred) { + ReadTask( + CombinedScanTask task, + Broadcast<Table>
    tableBroadcast, + String expectedSchemaString, + boolean caseSensitive, + boolean localityPreferred) { this.task = task; this.tableBroadcast = tableBroadcast; this.expectedSchemaString = expectedSchemaString; diff --git a/spark/v3.0/spark/src/main/java/org/apache/iceberg/spark/source/SparkFileWriterFactory.java b/spark/v3.0/spark/src/main/java/org/apache/iceberg/spark/source/SparkFileWriterFactory.java index beaa7c295024..a8c894bfc50c 100644 --- a/spark/v3.0/spark/src/main/java/org/apache/iceberg/spark/source/SparkFileWriterFactory.java +++ b/spark/v3.0/spark/src/main/java/org/apache/iceberg/spark/source/SparkFileWriterFactory.java @@ -16,9 +16,13 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.source; +import static org.apache.iceberg.MetadataColumns.DELETE_FILE_ROW_FIELD_NAME; +import static org.apache.iceberg.TableProperties.DEFAULT_FILE_FORMAT; +import static org.apache.iceberg.TableProperties.DEFAULT_FILE_FORMAT_DEFAULT; +import static org.apache.iceberg.TableProperties.DELETE_DEFAULT_FILE_FORMAT; + import java.util.Locale; import java.util.Map; import org.apache.iceberg.FileFormat; @@ -40,24 +44,35 @@ import org.apache.spark.sql.types.StructType; import org.apache.spark.unsafe.types.UTF8String; -import static org.apache.iceberg.MetadataColumns.DELETE_FILE_ROW_FIELD_NAME; -import static org.apache.iceberg.TableProperties.DEFAULT_FILE_FORMAT; -import static org.apache.iceberg.TableProperties.DEFAULT_FILE_FORMAT_DEFAULT; -import static org.apache.iceberg.TableProperties.DELETE_DEFAULT_FILE_FORMAT; - class SparkFileWriterFactory extends BaseFileWriterFactory { private StructType dataSparkType; private StructType equalityDeleteSparkType; private StructType positionDeleteSparkType; - SparkFileWriterFactory(Table table, FileFormat dataFileFormat, Schema dataSchema, StructType dataSparkType, - SortOrder dataSortOrder, FileFormat deleteFileFormat, - int[] equalityFieldIds, Schema equalityDeleteRowSchema, StructType equalityDeleteSparkType, - SortOrder equalityDeleteSortOrder, Schema positionDeleteRowSchema, - StructType positionDeleteSparkType) { - - super(table, dataFileFormat, dataSchema, dataSortOrder, deleteFileFormat, equalityFieldIds, - equalityDeleteRowSchema, equalityDeleteSortOrder, positionDeleteRowSchema); + SparkFileWriterFactory( + Table table, + FileFormat dataFileFormat, + Schema dataSchema, + StructType dataSparkType, + SortOrder dataSortOrder, + FileFormat deleteFileFormat, + int[] equalityFieldIds, + Schema equalityDeleteRowSchema, + StructType equalityDeleteSparkType, + SortOrder equalityDeleteSortOrder, + Schema positionDeleteRowSchema, + StructType positionDeleteSparkType) { + + super( + table, + dataFileFormat, + dataSchema, + dataSortOrder, + deleteFileFormat, + equalityFieldIds, + equalityDeleteRowSchema, + equalityDeleteSortOrder, + positionDeleteRowSchema); this.dataSparkType = dataSparkType; this.equalityDeleteSparkType = equalityDeleteSparkType; @@ -80,7 +95,8 @@ protected void configureEqualityDelete(Avro.DeleteWriteBuilder builder) { @Override protected void configurePositionDelete(Avro.DeleteWriteBuilder builder) { - boolean withRow = positionDeleteSparkType().getFieldIndex(DELETE_FILE_ROW_FIELD_NAME).isDefined(); + boolean withRow = + positionDeleteSparkType().getFieldIndex(DELETE_FILE_ROW_FIELD_NAME).isDefined(); if (withRow) { // SparkAvroWriter accepts just the Spark type of the row ignoring the path and pos StructField rowField = 
positionDeleteSparkType().apply(DELETE_FILE_ROW_FIELD_NAME); @@ -96,12 +112,14 @@ protected void configureDataWrite(Parquet.DataWriteBuilder builder) { @Override protected void configureEqualityDelete(Parquet.DeleteWriteBuilder builder) { - builder.createWriterFunc(msgType -> SparkParquetWriters.buildWriter(equalityDeleteSparkType(), msgType)); + builder.createWriterFunc( + msgType -> SparkParquetWriters.buildWriter(equalityDeleteSparkType(), msgType)); } @Override protected void configurePositionDelete(Parquet.DeleteWriteBuilder builder) { - builder.createWriterFunc(msgType -> SparkParquetWriters.buildWriter(positionDeleteSparkType(), msgType)); + builder.createWriterFunc( + msgType -> SparkParquetWriters.buildWriter(positionDeleteSparkType(), msgType)); builder.transformPaths(path -> UTF8String.fromString(path.toString())); } @@ -132,7 +150,8 @@ private StructType dataSparkType() { private StructType equalityDeleteSparkType() { if (equalityDeleteSparkType == null) { - Preconditions.checkNotNull(equalityDeleteRowSchema(), "Equality delete schema must not be null"); + Preconditions.checkNotNull( + equalityDeleteRowSchema(), "Equality delete schema must not be null"); this.equalityDeleteSparkType = SparkSchemaUtil.convert(equalityDeleteRowSchema()); } @@ -141,7 +160,8 @@ private StructType equalityDeleteSparkType() { private StructType positionDeleteSparkType() { if (positionDeleteSparkType == null) { - // wrap the optional row schema into the position delete schema that contains path and position + // wrap the optional row schema into the position delete schema that contains path and + // position Schema positionDeleteSchema = DeleteSchemaUtil.posDeleteSchema(positionDeleteRowSchema()); this.positionDeleteSparkType = SparkSchemaUtil.convert(positionDeleteSchema); } @@ -168,10 +188,12 @@ static class Builder { Map properties = table.properties(); - String dataFileFormatName = properties.getOrDefault(DEFAULT_FILE_FORMAT, DEFAULT_FILE_FORMAT_DEFAULT); + String dataFileFormatName = + properties.getOrDefault(DEFAULT_FILE_FORMAT, DEFAULT_FILE_FORMAT_DEFAULT); this.dataFileFormat = FileFormat.valueOf(dataFileFormatName.toUpperCase(Locale.ENGLISH)); - String deleteFileFormatName = properties.getOrDefault(DELETE_DEFAULT_FILE_FORMAT, dataFileFormatName); + String deleteFileFormatName = + properties.getOrDefault(DELETE_DEFAULT_FILE_FORMAT, dataFileFormatName); this.deleteFileFormat = FileFormat.valueOf(deleteFileFormatName.toUpperCase(Locale.ENGLISH)); } @@ -233,13 +255,23 @@ Builder positionDeleteSparkType(StructType newPositionDeleteSparkType) { SparkFileWriterFactory build() { boolean noEqualityDeleteConf = equalityFieldIds == null && equalityDeleteRowSchema == null; boolean fullEqualityDeleteConf = equalityFieldIds != null && equalityDeleteRowSchema != null; - Preconditions.checkArgument(noEqualityDeleteConf || fullEqualityDeleteConf, + Preconditions.checkArgument( + noEqualityDeleteConf || fullEqualityDeleteConf, "Equality field IDs and equality delete row schema must be set together"); return new SparkFileWriterFactory( - table, dataFileFormat, dataSchema, dataSparkType, dataSortOrder, deleteFileFormat, - equalityFieldIds, equalityDeleteRowSchema, equalityDeleteSparkType, equalityDeleteSortOrder, - positionDeleteRowSchema, positionDeleteSparkType); + table, + dataFileFormat, + dataSchema, + dataSparkType, + dataSortOrder, + deleteFileFormat, + equalityFieldIds, + equalityDeleteRowSchema, + equalityDeleteSparkType, + equalityDeleteSortOrder, + positionDeleteRowSchema, + positionDeleteSparkType); 
} } } diff --git a/spark/v3.0/spark/src/main/java/org/apache/iceberg/spark/source/SparkFilesScan.java b/spark/v3.0/spark/src/main/java/org/apache/iceberg/spark/source/SparkFilesScan.java index bdb162fb553c..4eb36b67ea40 100644 --- a/spark/v3.0/spark/src/main/java/org/apache/iceberg/spark/source/SparkFilesScan.java +++ b/spark/v3.0/spark/src/main/java/org/apache/iceberg/spark/source/SparkFilesScan.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.source; import java.util.List; @@ -43,8 +42,12 @@ class SparkFilesScan extends SparkBatchScan { private List tasks = null; // lazy cache of tasks - SparkFilesScan(SparkSession spark, Table table, SparkReadConf readConf, - boolean caseSensitive, CaseInsensitiveStringMap options) { + SparkFilesScan( + SparkSession spark, + Table table, + SparkReadConf readConf, + boolean caseSensitive, + CaseInsensitiveStringMap options) { super(spark, table, readConf, caseSensitive, table.schema(), ImmutableList.of(), options); this.taskSetID = options.get(SparkReadOptions.FILE_SCAN_TASK_SET_ID); @@ -58,16 +61,18 @@ protected List tasks() { if (tasks == null) { FileScanTaskSetManager taskSetManager = FileScanTaskSetManager.get(); List files = taskSetManager.fetchTasks(table(), taskSetID); - ValidationException.check(files != null, + ValidationException.check( + files != null, "Task set manager has no tasks for table %s with id %s", - table(), taskSetID); + table(), + taskSetID); - CloseableIterable splitFiles = TableScanUtil.splitFiles( - CloseableIterable.withNoopClose(files), - splitSize); - CloseableIterable scanTasks = TableScanUtil.planTasks( - splitFiles, splitSize, - splitLookback, splitOpenFileCost); + CloseableIterable splitFiles = + TableScanUtil.splitFiles(CloseableIterable.withNoopClose(files), splitSize); + CloseableIterable scanTasks = + TableScanUtil.planTasks( + splitFiles, splitSize, + splitLookback, splitOpenFileCost); this.tasks = Lists.newArrayList(scanTasks); } @@ -85,11 +90,11 @@ public boolean equals(Object other) { } SparkFilesScan that = (SparkFilesScan) other; - return table().name().equals(that.table().name()) && - Objects.equals(taskSetID, that.taskSetID) && - Objects.equals(splitSize, that.splitSize) && - Objects.equals(splitLookback, that.splitLookback) && - Objects.equals(splitOpenFileCost, that.splitOpenFileCost); + return table().name().equals(that.table().name()) + && Objects.equals(taskSetID, that.taskSetID) + && Objects.equals(splitSize, that.splitSize) + && Objects.equals(splitLookback, that.splitLookback) + && Objects.equals(splitOpenFileCost, that.splitOpenFileCost); } @Override diff --git a/spark/v3.0/spark/src/main/java/org/apache/iceberg/spark/source/SparkFilesScanBuilder.java b/spark/v3.0/spark/src/main/java/org/apache/iceberg/spark/source/SparkFilesScanBuilder.java index fc0414b0682f..029585caf944 100644 --- a/spark/v3.0/spark/src/main/java/org/apache/iceberg/spark/source/SparkFilesScanBuilder.java +++ b/spark/v3.0/spark/src/main/java/org/apache/iceberg/spark/source/SparkFilesScanBuilder.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.spark.source; import org.apache.iceberg.Table; diff --git a/spark/v3.0/spark/src/main/java/org/apache/iceberg/spark/source/SparkMergeBuilder.java b/spark/v3.0/spark/src/main/java/org/apache/iceberg/spark/source/SparkMergeBuilder.java index ade6c2f3ddee..24cd831567d1 100644 --- a/spark/v3.0/spark/src/main/java/org/apache/iceberg/spark/source/SparkMergeBuilder.java +++ b/spark/v3.0/spark/src/main/java/org/apache/iceberg/spark/source/SparkMergeBuilder.java @@ -16,9 +16,15 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.source; +import static org.apache.iceberg.TableProperties.DELETE_ISOLATION_LEVEL; +import static org.apache.iceberg.TableProperties.DELETE_ISOLATION_LEVEL_DEFAULT; +import static org.apache.iceberg.TableProperties.MERGE_ISOLATION_LEVEL; +import static org.apache.iceberg.TableProperties.MERGE_ISOLATION_LEVEL_DEFAULT; +import static org.apache.iceberg.TableProperties.UPDATE_ISOLATION_LEVEL; +import static org.apache.iceberg.TableProperties.UPDATE_ISOLATION_LEVEL_DEFAULT; + import java.util.Locale; import java.util.Map; import org.apache.iceberg.IsolationLevel; @@ -31,13 +37,6 @@ import org.apache.spark.sql.connector.write.LogicalWriteInfo; import org.apache.spark.sql.connector.write.WriteBuilder; -import static org.apache.iceberg.TableProperties.DELETE_ISOLATION_LEVEL; -import static org.apache.iceberg.TableProperties.DELETE_ISOLATION_LEVEL_DEFAULT; -import static org.apache.iceberg.TableProperties.MERGE_ISOLATION_LEVEL; -import static org.apache.iceberg.TableProperties.MERGE_ISOLATION_LEVEL_DEFAULT; -import static org.apache.iceberg.TableProperties.UPDATE_ISOLATION_LEVEL; -import static org.apache.iceberg.TableProperties.UPDATE_ISOLATION_LEVEL_DEFAULT; - class SparkMergeBuilder implements MergeBuilder { private final SparkSession spark; @@ -60,11 +59,14 @@ class SparkMergeBuilder implements MergeBuilder { private IsolationLevel getIsolationLevel(Map props, String operation) { String isolationLevelAsString; if (operation.equalsIgnoreCase("delete")) { - isolationLevelAsString = props.getOrDefault(DELETE_ISOLATION_LEVEL, DELETE_ISOLATION_LEVEL_DEFAULT); + isolationLevelAsString = + props.getOrDefault(DELETE_ISOLATION_LEVEL, DELETE_ISOLATION_LEVEL_DEFAULT); } else if (operation.equalsIgnoreCase("update")) { - isolationLevelAsString = props.getOrDefault(UPDATE_ISOLATION_LEVEL, UPDATE_ISOLATION_LEVEL_DEFAULT); + isolationLevelAsString = + props.getOrDefault(UPDATE_ISOLATION_LEVEL, UPDATE_ISOLATION_LEVEL_DEFAULT); } else if (operation.equalsIgnoreCase("merge")) { - isolationLevelAsString = props.getOrDefault(MERGE_ISOLATION_LEVEL, MERGE_ISOLATION_LEVEL_DEFAULT); + isolationLevelAsString = + props.getOrDefault(MERGE_ISOLATION_LEVEL, MERGE_ISOLATION_LEVEL_DEFAULT); } else { throw new IllegalArgumentException("Unsupported operation: " + operation); } @@ -78,14 +80,15 @@ public ScanBuilder asScanBuilder() { private ScanBuilder scanBuilder() { if (lazyScanBuilder == null) { - SparkScanBuilder scanBuilder = new SparkScanBuilder(spark, table, writeInfo.options()) { - @Override - public Scan build() { - Scan scan = super.buildMergeScan(); - SparkMergeBuilder.this.configuredScan = scan; - return scan; - } - }; + SparkScanBuilder scanBuilder = + new SparkScanBuilder(spark, table, writeInfo.options()) { + @Override + public Scan build() { + Scan scan = super.buildMergeScan(); + SparkMergeBuilder.this.configuredScan = scan; + return scan; + } + }; // ignore residuals to ensure we read full files 
lazyScanBuilder = scanBuilder.ignoreResiduals(); } diff --git a/spark/v3.0/spark/src/main/java/org/apache/iceberg/spark/source/SparkMergeScan.java b/spark/v3.0/spark/src/main/java/org/apache/iceberg/spark/source/SparkMergeScan.java index 924481f377c1..8bc3f7d049cf 100644 --- a/spark/v3.0/spark/src/main/java/org/apache/iceberg/spark/source/SparkMergeScan.java +++ b/spark/v3.0/spark/src/main/java/org/apache/iceberg/spark/source/SparkMergeScan.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.source; import java.io.IOException; @@ -59,9 +58,15 @@ class SparkMergeScan extends SparkBatchScan implements SupportsFileFilter { private List<CombinedScanTask> tasks = null; // lazy cache of tasks private Set<String> filteredLocations = null; - SparkMergeScan(SparkSession spark, Table table, SparkReadConf readConf, - boolean caseSensitive, boolean ignoreResiduals, - Schema expectedSchema, List<Expression> filters, CaseInsensitiveStringMap options) { + SparkMergeScan( + SparkSession spark, + Table table, + SparkReadConf readConf, + boolean caseSensitive, + boolean ignoreResiduals, + Schema expectedSchema, + List<Expression> filters, + CaseInsensitiveStringMap options) { super(spark, table, readConf, caseSensitive, expectedSchema, filters, options); @@ -72,7 +77,8 @@ class SparkMergeScan extends SparkBatchScan implements SupportsFileFilter { this.splitLookback = readConf.splitLookback(); this.splitOpenFileCost = readConf.splitOpenFileCost(); - Preconditions.checkArgument(!options.containsKey(SparkReadOptions.SNAPSHOT_ID), "Can't set snapshot-id in options"); + Preconditions.checkArgument( + !options.containsKey(SparkReadOptions.SNAPSHOT_ID), "Can't set snapshot-id in options"); Snapshot currentSnapshot = table.currentSnapshot(); this.snapshotId = currentSnapshot != null ?
currentSnapshot.snapshotId() : null; @@ -98,20 +104,22 @@ public SupportsFileFilter.FileFilterMetric filterFiles(Set locations) { tasks = null; filteredLocations = locations; List originalFile = files(); - files = originalFile.stream() - .filter(file -> filteredLocations.contains(file.file().path().toString())) - .collect(Collectors.toList()); + files = + originalFile.stream() + .filter(file -> filteredLocations.contains(file.file().path().toString())) + .collect(Collectors.toList()); return new SupportsFileFilter.FileFilterMetric(originalFile.size(), files.size()); } // should be accessible to the write synchronized List files() { if (files == null) { - TableScan scan = table - .newScan() - .caseSensitive(caseSensitive()) - .useSnapshot(snapshotId) - .project(expectedSchema); + TableScan scan = + table + .newScan() + .caseSensitive(caseSensitive()) + .useSnapshot(snapshotId) + .project(expectedSchema); for (Expression filter : filterExpressions()) { scan = scan.filter(filter); @@ -134,12 +142,12 @@ synchronized List files() { @Override protected synchronized List tasks() { if (tasks == null) { - CloseableIterable splitFiles = TableScanUtil.splitFiles( - CloseableIterable.withNoopClose(files()), - splitSize); - CloseableIterable scanTasks = TableScanUtil.planTasks( - splitFiles, splitSize, - splitLookback, splitOpenFileCost); + CloseableIterable splitFiles = + TableScanUtil.splitFiles(CloseableIterable.withNoopClose(files()), splitSize); + CloseableIterable scanTasks = + TableScanUtil.planTasks( + splitFiles, splitSize, + splitLookback, splitOpenFileCost); tasks = Lists.newArrayList(scanTasks); } @@ -157,19 +165,24 @@ public boolean equals(Object o) { } SparkMergeScan that = (SparkMergeScan) o; - return table().name().equals(that.table().name()) && - readSchema().equals(that.readSchema()) && // compare Spark schemas to ignore field ids - filterExpressions().toString().equals(that.filterExpressions().toString()) && - ignoreResiduals == that.ignoreResiduals && - Objects.equals(snapshotId, that.snapshotId) && - Objects.equals(filteredLocations, that.filteredLocations); + return table().name().equals(that.table().name()) + && readSchema().equals(that.readSchema()) + && // compare Spark schemas to ignore field ids + filterExpressions().toString().equals(that.filterExpressions().toString()) + && ignoreResiduals == that.ignoreResiduals + && Objects.equals(snapshotId, that.snapshotId) + && Objects.equals(filteredLocations, that.filteredLocations); } @Override public int hashCode() { return Objects.hash( - table().name(), readSchema(), filterExpressions().toString(), - ignoreResiduals, snapshotId, filteredLocations); + table().name(), + readSchema(), + filterExpressions().toString(), + ignoreResiduals, + snapshotId, + filteredLocations); } @Override diff --git a/spark/v3.0/spark/src/main/java/org/apache/iceberg/spark/source/SparkMicroBatchStream.java b/spark/v3.0/spark/src/main/java/org/apache/iceberg/spark/source/SparkMicroBatchStream.java index 5c536cb59299..5346b5267c1d 100644 --- a/spark/v3.0/spark/src/main/java/org/apache/iceberg/spark/source/SparkMicroBatchStream.java +++ b/spark/v3.0/spark/src/main/java/org/apache/iceberg/spark/source/SparkMicroBatchStream.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.spark.source; import java.io.BufferedWriter; @@ -79,8 +78,13 @@ public class SparkMicroBatchStream implements MicroBatchStream { private final boolean skipOverwrite; private final Long fromTimestamp; - SparkMicroBatchStream(JavaSparkContext sparkContext, Table table, SparkReadConf readConf, boolean caseSensitive, - Schema expectedSchema, String checkpointLocation) { + SparkMicroBatchStream( + JavaSparkContext sparkContext, + Table table, + SparkReadConf readConf, + boolean caseSensitive, + Schema expectedSchema, + String checkpointLocation) { this.table = table; this.caseSensitive = caseSensitive; this.expectedSchema = SchemaParser.toJson(expectedSchema); @@ -91,7 +95,8 @@ public class SparkMicroBatchStream implements MicroBatchStream { this.splitOpenFileCost = readConf.splitOpenFileCost(); this.fromTimestamp = readConf.streamFromTimestamp(); - InitialOffsetStore initialOffsetStore = new InitialOffsetStore(table, checkpointLocation, fromTimestamp); + InitialOffsetStore initialOffsetStore = + new InitialOffsetStore(table, checkpointLocation, fromTimestamp); this.initialOffset = initialOffsetStore.initialOffset(); this.skipDelete = readConf.streamingSkipDeleteSnapshots(); @@ -111,14 +116,19 @@ public Offset latestOffset() { Snapshot latestSnapshot = table.currentSnapshot(); return new StreamingOffset( - latestSnapshot.snapshotId(), Iterables.size(latestSnapshot.addedDataFiles(table.io())), false); + latestSnapshot.snapshotId(), + Iterables.size(latestSnapshot.addedDataFiles(table.io())), + false); } @Override public InputPartition[] planInputPartitions(Offset start, Offset end) { - Preconditions.checkArgument(end instanceof StreamingOffset, "Invalid end offset: %s is not a StreamingOffset", end); Preconditions.checkArgument( - start instanceof StreamingOffset, "Invalid start offset: %s is not a StreamingOffset", start); + end instanceof StreamingOffset, "Invalid end offset: %s is not a StreamingOffset", end); + Preconditions.checkArgument( + start instanceof StreamingOffset, + "Invalid start offset: %s is not a StreamingOffset", + start); if (end.equals(StreamingOffset.START_OFFSET)) { return new InputPartition[0]; @@ -129,19 +139,25 @@ public InputPartition[] planInputPartitions(Offset start, Offset end) { List fileScanTasks = planFiles(startOffset, endOffset); - CloseableIterable splitTasks = TableScanUtil.splitFiles( - CloseableIterable.withNoopClose(fileScanTasks), - splitSize); - List combinedScanTasks = Lists.newArrayList( - TableScanUtil.planTasks(splitTasks, splitSize, splitLookback, splitOpenFileCost)); + CloseableIterable splitTasks = + TableScanUtil.splitFiles(CloseableIterable.withNoopClose(fileScanTasks), splitSize); + List combinedScanTasks = + Lists.newArrayList( + TableScanUtil.planTasks(splitTasks, splitSize, splitLookback, splitOpenFileCost)); InputPartition[] readTasks = new InputPartition[combinedScanTasks.size()]; Tasks.range(readTasks.length) .stopOnFailure() .executeWith(localityPreferred ? 
ThreadPools.getWorkerPool() : null) - .run(index -> readTasks[index] = new ReadTask( - combinedScanTasks.get(index), tableBroadcast, expectedSchema, - caseSensitive, localityPreferred)); + .run( + index -> + readTasks[index] = + new ReadTask( + combinedScanTasks.get(index), + tableBroadcast, + expectedSchema, + caseSensitive, + localityPreferred)); return readTasks; } @@ -162,17 +178,17 @@ public Offset deserializeOffset(String json) { } @Override - public void commit(Offset end) { - } + public void commit(Offset end) {} @Override - public void stop() { - } + public void stop() {} private List planFiles(StreamingOffset startOffset, StreamingOffset endOffset) { List fileScanTasks = Lists.newArrayList(); - StreamingOffset batchStartOffset = StreamingOffset.START_OFFSET.equals(startOffset) ? - determineStartingOffset(table, fromTimestamp) : startOffset; + StreamingOffset batchStartOffset = + StreamingOffset.START_OFFSET.equals(startOffset) + ? determineStartingOffset(table, fromTimestamp) + : startOffset; StreamingOffset currentOffset = null; @@ -189,10 +205,12 @@ private List planFiles(StreamingOffset startOffset, StreamingOffse continue; } - MicroBatch latestMicroBatch = MicroBatches.from(table.snapshot(currentOffset.snapshotId()), table.io()) - .caseSensitive(caseSensitive) - .specsById(table.specs()) - .generate(currentOffset.position(), Long.MAX_VALUE, currentOffset.shouldScanAllFiles()); + MicroBatch latestMicroBatch = + MicroBatches.from(table.snapshot(currentOffset.snapshotId()), table.io()) + .caseSensitive(caseSensitive) + .specsById(table.specs()) + .generate( + currentOffset.position(), Long.MAX_VALUE, currentOffset.shouldScanAllFiles()); fileScanTasks.addAll(latestMicroBatch.tasks()); } while (currentOffset.snapshotId() != endOffset.snapshotId()); @@ -208,19 +226,24 @@ private boolean shouldProcess(Snapshot snapshot) { case DataOperations.REPLACE: return false; case DataOperations.DELETE: - Preconditions.checkState(skipDelete, + Preconditions.checkState( + skipDelete, "Cannot process delete snapshot: %s, to ignore deletes, set %s=true", - snapshot.snapshotId(), SparkReadOptions.STREAMING_SKIP_DELETE_SNAPSHOTS); + snapshot.snapshotId(), + SparkReadOptions.STREAMING_SKIP_DELETE_SNAPSHOTS); return false; case DataOperations.OVERWRITE: - Preconditions.checkState(skipOverwrite, + Preconditions.checkState( + skipOverwrite, "Cannot process overwrite snapshot: %s, to ignore overwrites, set %s=true", - snapshot.snapshotId(), SparkReadOptions.STREAMING_SKIP_OVERWRITE_SNAPSHOTS); + snapshot.snapshotId(), + SparkReadOptions.STREAMING_SKIP_OVERWRITE_SNAPSHOTS); return false; default: - throw new IllegalStateException(String.format( - "Cannot process unknown snapshot operation: %s (snapshot id %s)", - op.toLowerCase(Locale.ROOT), snapshot.snapshotId())); + throw new IllegalStateException( + String.format( + "Cannot process unknown snapshot operation: %s (snapshot id %s)", + op.toLowerCase(Locale.ROOT), snapshot.snapshotId())); } } @@ -281,7 +304,8 @@ public StreamingOffset initialOffset() { private void writeOffset(StreamingOffset offset, OutputFile file) { try (OutputStream outputStream = file.create()) { - BufferedWriter writer = new BufferedWriter(new OutputStreamWriter(outputStream, StandardCharsets.UTF_8)); + BufferedWriter writer = + new BufferedWriter(new OutputStreamWriter(outputStream, StandardCharsets.UTF_8)); writer.write(offset.json()); writer.flush(); } catch (IOException ioException) { diff --git 
a/spark/v3.0/spark/src/main/java/org/apache/iceberg/spark/source/SparkPartitionedFanoutWriter.java b/spark/v3.0/spark/src/main/java/org/apache/iceberg/spark/source/SparkPartitionedFanoutWriter.java index d38ae2f40316..f17cd260f928 100644 --- a/spark/v3.0/spark/src/main/java/org/apache/iceberg/spark/source/SparkPartitionedFanoutWriter.java +++ b/spark/v3.0/spark/src/main/java/org/apache/iceberg/spark/source/SparkPartitionedFanoutWriter.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.source; import org.apache.iceberg.FileFormat; @@ -34,10 +33,15 @@ public class SparkPartitionedFanoutWriter extends PartitionedFanoutWriter appenderFactory, - OutputFileFactory fileFactory, FileIO io, long targetFileSize, - Schema schema, StructType sparkSchema) { + OutputFileFactory fileFactory, + FileIO io, + long targetFileSize, + Schema schema, + StructType sparkSchema) { super(spec, format, appenderFactory, fileFactory, io, targetFileSize); this.partitionKey = new PartitionKey(spec, schema); this.internalRowWrapper = new InternalRowWrapper(sparkSchema); diff --git a/spark/v3.0/spark/src/main/java/org/apache/iceberg/spark/source/SparkPartitionedWriter.java b/spark/v3.0/spark/src/main/java/org/apache/iceberg/spark/source/SparkPartitionedWriter.java index f81a09926d85..a86091644360 100644 --- a/spark/v3.0/spark/src/main/java/org/apache/iceberg/spark/source/SparkPartitionedWriter.java +++ b/spark/v3.0/spark/src/main/java/org/apache/iceberg/spark/source/SparkPartitionedWriter.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.source; import org.apache.iceberg.FileFormat; @@ -34,10 +33,15 @@ public class SparkPartitionedWriter extends PartitionedWriter { private final PartitionKey partitionKey; private final InternalRowWrapper internalRowWrapper; - public SparkPartitionedWriter(PartitionSpec spec, FileFormat format, - FileAppenderFactory appenderFactory, - OutputFileFactory fileFactory, FileIO io, long targetFileSize, - Schema schema, StructType sparkSchema) { + public SparkPartitionedWriter( + PartitionSpec spec, + FileFormat format, + FileAppenderFactory appenderFactory, + OutputFileFactory fileFactory, + FileIO io, + long targetFileSize, + Schema schema, + StructType sparkSchema) { super(spec, format, appenderFactory, fileFactory, io, targetFileSize); this.partitionKey = new PartitionKey(spec, schema); this.internalRowWrapper = new InternalRowWrapper(sparkSchema); diff --git a/spark/v3.0/spark/src/main/java/org/apache/iceberg/spark/source/SparkRewriteBuilder.java b/spark/v3.0/spark/src/main/java/org/apache/iceberg/spark/source/SparkRewriteBuilder.java index 58373f7e6a4b..be4bd3e2c7d1 100644 --- a/spark/v3.0/spark/src/main/java/org/apache/iceberg/spark/source/SparkRewriteBuilder.java +++ b/spark/v3.0/spark/src/main/java/org/apache/iceberg/spark/source/SparkRewriteBuilder.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.spark.source; import org.apache.iceberg.Schema; @@ -54,16 +53,19 @@ public SparkRewriteBuilder(SparkSession spark, Table table, LogicalWriteInfo inf @Override public BatchWrite buildForBatch() { - Preconditions.checkArgument(handleTimestampWithoutZone || !SparkUtil.hasTimestampWithoutZone(table.schema()), + Preconditions.checkArgument( + handleTimestampWithoutZone || !SparkUtil.hasTimestampWithoutZone(table.schema()), SparkUtil.TIMESTAMP_WITHOUT_TIMEZONE_ERROR); Schema writeSchema = SparkSchemaUtil.convert(table.schema(), dsSchema); - TypeUtil.validateWriteSchema(table.schema(), writeSchema, writeConf.checkNullability(), writeConf.checkOrdering()); + TypeUtil.validateWriteSchema( + table.schema(), writeSchema, writeConf.checkNullability(), writeConf.checkOrdering()); SparkUtil.validatePartitionTransforms(table.spec()); String appId = spark.sparkContext().applicationId(); - SparkWrite write = new SparkWrite(spark, table, writeConf, writeInfo, appId, writeSchema, dsSchema); + SparkWrite write = + new SparkWrite(spark, table, writeConf, writeInfo, appId, writeSchema, dsSchema); return write.asRewrite(fileSetID); } } diff --git a/spark/v3.0/spark/src/main/java/org/apache/iceberg/spark/source/SparkScanBuilder.java b/spark/v3.0/spark/src/main/java/org/apache/iceberg/spark/source/SparkScanBuilder.java index a9b82df80151..708d4378bc1b 100644 --- a/spark/v3.0/spark/src/main/java/org/apache/iceberg/spark/source/SparkScanBuilder.java +++ b/spark/v3.0/spark/src/main/java/org/apache/iceberg/spark/source/SparkScanBuilder.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.source; import java.util.Collections; @@ -48,8 +47,11 @@ import org.apache.spark.sql.types.StructType; import org.apache.spark.sql.util.CaseInsensitiveStringMap; -public class SparkScanBuilder implements ScanBuilder, SupportsPushDownFilters, SupportsPushDownRequiredColumns, - SupportsReportStatistics { +public class SparkScanBuilder + implements ScanBuilder, + SupportsPushDownFilters, + SupportsPushDownRequiredColumns, + SupportsReportStatistics { private static final Filter[] NO_FILTERS = new Filter[0]; @@ -66,7 +68,8 @@ public class SparkScanBuilder implements ScanBuilder, SupportsPushDownFilters, S private Filter[] pushedFilters = NO_FILTERS; private boolean ignoreResiduals = false; - SparkScanBuilder(SparkSession spark, Table table, Schema schema, CaseInsensitiveStringMap options) { + SparkScanBuilder( + SparkSession spark, Table table, Schema schema, CaseInsensitiveStringMap options) { this.spark = spark; this.table = table; this.schema = schema; @@ -129,12 +132,16 @@ public Filter[] pushedFilters() { @Override public void pruneColumns(StructType requestedSchema) { - this.requestedProjection = new StructType(Stream.of(requestedSchema.fields()) - .filter(field -> MetadataColumns.nonMetadataColumn(field.name())) - .toArray(StructField[]::new)); + this.requestedProjection = + new StructType( + Stream.of(requestedSchema.fields()) + .filter(field -> MetadataColumns.nonMetadataColumn(field.name())) + .toArray(StructField[]::new)); - // the projection should include all columns that will be returned, including those only used in filters - this.schema = SparkSchemaUtil.prune(schema, requestedProjection, filterExpression(), caseSensitive); + // the projection should include all columns that will be returned, including those only used in + // filters + this.schema = + SparkSchemaUtil.prune(schema, requestedProjection, 
filterExpression(), caseSensitive); Stream.of(requestedSchema.fields()) .map(StructField::name) @@ -150,10 +157,11 @@ public SparkScanBuilder ignoreResiduals() { private Schema schemaWithMetadataColumns() { // metadata columns - List fields = metaColumns.stream() - .distinct() - .map(name -> MetadataColumns.metadataColumn(table, name)) - .collect(Collectors.toList()); + List fields = + metaColumns.stream() + .distinct() + .map(name -> MetadataColumns.metadataColumn(table, name)) + .collect(Collectors.toList()); Schema meta = new Schema(fields); // schema or rows returned by readers @@ -163,13 +171,25 @@ private Schema schemaWithMetadataColumns() { @Override public Scan build() { return new SparkBatchQueryScan( - spark, table, readConf, caseSensitive, schemaWithMetadataColumns(), filterExpressions, options); + spark, + table, + readConf, + caseSensitive, + schemaWithMetadataColumns(), + filterExpressions, + options); } public Scan buildMergeScan() { return new SparkMergeScan( - spark, table, readConf, caseSensitive, ignoreResiduals, - schemaWithMetadataColumns(), filterExpressions, options); + spark, + table, + readConf, + caseSensitive, + ignoreResiduals, + schemaWithMetadataColumns(), + filterExpressions, + options); } @Override diff --git a/spark/v3.0/spark/src/main/java/org/apache/iceberg/spark/source/SparkTable.java b/spark/v3.0/spark/src/main/java/org/apache/iceberg/spark/source/SparkTable.java index 6a1c27e6a6d3..a7b26ce7faca 100644 --- a/spark/v3.0/spark/src/main/java/org/apache/iceberg/spark/source/SparkTable.java +++ b/spark/v3.0/spark/src/main/java/org/apache/iceberg/spark/source/SparkTable.java @@ -16,9 +16,15 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.source; +import static org.apache.iceberg.TableProperties.DELETE_MODE; +import static org.apache.iceberg.TableProperties.DELETE_MODE_DEFAULT; +import static org.apache.iceberg.TableProperties.MERGE_MODE; +import static org.apache.iceberg.TableProperties.MERGE_MODE_DEFAULT; +import static org.apache.iceberg.TableProperties.UPDATE_MODE; +import static org.apache.iceberg.TableProperties.UPDATE_MODE_DEFAULT; + import java.util.Map; import java.util.Set; import org.apache.iceberg.Schema; @@ -56,27 +62,25 @@ import org.slf4j.Logger; import org.slf4j.LoggerFactory; -import static org.apache.iceberg.TableProperties.DELETE_MODE; -import static org.apache.iceberg.TableProperties.DELETE_MODE_DEFAULT; -import static org.apache.iceberg.TableProperties.MERGE_MODE; -import static org.apache.iceberg.TableProperties.MERGE_MODE_DEFAULT; -import static org.apache.iceberg.TableProperties.UPDATE_MODE; -import static org.apache.iceberg.TableProperties.UPDATE_MODE_DEFAULT; - -public class SparkTable implements org.apache.spark.sql.connector.catalog.Table, - SupportsRead, SupportsWrite, ExtendedSupportsDelete, SupportsMerge { +public class SparkTable + implements org.apache.spark.sql.connector.catalog.Table, + SupportsRead, + SupportsWrite, + ExtendedSupportsDelete, + SupportsMerge { private static final Logger LOG = LoggerFactory.getLogger(SparkTable.class); private static final Set RESERVED_PROPERTIES = ImmutableSet.of("provider", "format", "current-snapshot-id", "location", "sort-order"); - private static final Set CAPABILITIES = ImmutableSet.of( - TableCapability.BATCH_READ, - TableCapability.BATCH_WRITE, - TableCapability.MICRO_BATCH_READ, - TableCapability.STREAMING_WRITE, - TableCapability.OVERWRITE_BY_FILTER, - TableCapability.OVERWRITE_DYNAMIC); + private static final 
Set CAPABILITIES = + ImmutableSet.of( + TableCapability.BATCH_READ, + TableCapability.BATCH_WRITE, + TableCapability.MICRO_BATCH_READ, + TableCapability.STREAMING_WRITE, + TableCapability.OVERWRITE_BY_FILTER, + TableCapability.OVERWRITE_DYNAMIC); private final Table icebergTable; private final Long snapshotId; @@ -133,12 +137,17 @@ public Transform[] partitioning() { public Map properties() { ImmutableMap.Builder propsBuilder = ImmutableMap.builder(); - String fileFormat = icebergTable.properties() - .getOrDefault(TableProperties.DEFAULT_FILE_FORMAT, TableProperties.DEFAULT_FILE_FORMAT_DEFAULT); + String fileFormat = + icebergTable + .properties() + .getOrDefault( + TableProperties.DEFAULT_FILE_FORMAT, TableProperties.DEFAULT_FILE_FORMAT_DEFAULT); propsBuilder.put("format", "iceberg/" + fileFormat); propsBuilder.put("provider", "iceberg"); - String currentSnapshotId = icebergTable.currentSnapshot() != null ? - String.valueOf(icebergTable.currentSnapshot().snapshotId()) : "none"; + String currentSnapshotId = + icebergTable.currentSnapshot() != null + ? String.valueOf(icebergTable.currentSnapshot().snapshotId()) + : "none"; propsBuilder.put("current-snapshot-id", currentSnapshotId); propsBuilder.put("location", icebergTable.location()); @@ -176,8 +185,7 @@ public ScanBuilder newScanBuilder(CaseInsensitiveStringMap options) { @Override public WriteBuilder newWriteBuilder(LogicalWriteInfo info) { Preconditions.checkArgument( - snapshotId == null, - "Cannot write to table at a specific snapshot: %s", snapshotId); + snapshotId == null, "Cannot write to table at a specific snapshot: %s", snapshotId); if (info.options().containsKey(SparkWriteOptions.REWRITTEN_FILE_SCAN_TASK_SET_ID)) { // replace data files in the given file scan task set with new files @@ -190,7 +198,8 @@ public WriteBuilder newWriteBuilder(LogicalWriteInfo info) { @Override public MergeBuilder newMergeBuilder(String operation, LogicalWriteInfo info) { String mode = getRowLevelOperationMode(operation); - ValidationException.check(mode.equals("copy-on-write"), "Unsupported mode for %s: %s", operation, mode); + ValidationException.check( + mode.equals("copy-on-write"), "Unsupported mode for %s: %s", operation, mode); return new SparkMergeBuilder(sparkSession(), icebergTable, operation, info); } @@ -210,8 +219,7 @@ private String getRowLevelOperationMode(String operation) { @Override public boolean canDeleteWhere(Filter[] filters) { Preconditions.checkArgument( - snapshotId == null, - "Cannot delete from table at a specific snapshot: %s", snapshotId); + snapshotId == null, "Cannot delete from table at a specific snapshot: %s", snapshotId); if (table().specs().size() > 1) { // cannot guarantee a metadata delete will be successful if we have multiple specs @@ -223,7 +231,8 @@ public boolean canDeleteWhere(Filter[] filters) { for (Filter filter : filters) { // return false if the filter requires rewrite or if we cannot translate the filter - if (requiresRewrite(filter, schema, identitySourceIds) || SparkFilters.convert(filter) == null) { + if (requiresRewrite(filter, schema, identitySourceIds) + || SparkFilters.convert(filter) == null) { return false; } } @@ -235,18 +244,19 @@ private boolean requiresRewrite(Filter filter, Schema schema, Set ident // TODO: handle dots correctly via v2references // TODO: detect more cases that don't require rewrites Set filterRefs = Sets.newHashSet(filter.references()); - return filterRefs.stream().anyMatch(ref -> { - Types.NestedField field = schema.findField(ref); - ValidationException.check(field != 
null, "Cannot find field %s in schema", ref); - return !identitySourceIds.contains(field.fieldId()); - }); + return filterRefs.stream() + .anyMatch( + ref -> { + Types.NestedField field = schema.findField(ref); + ValidationException.check(field != null, "Cannot find field %s in schema", ref); + return !identitySourceIds.contains(field.fieldId()); + }); } @Override public void deleteWhere(Filter[] filters) { Preconditions.checkArgument( - snapshotId == null, - "Cannot delete from table at a specific snapshot: %s", snapshotId); + snapshotId == null, "Cannot delete from table at a specific snapshot: %s", snapshotId); Expression deleteExpr = SparkFilters.convert(filters); @@ -255,7 +265,8 @@ public void deleteWhere(Filter[] filters) { return; } - icebergTable.newDelete() + icebergTable + .newDelete() .set("spark.app.id", sparkSession().sparkContext().applicationId()) .deleteFromRowFilter(deleteExpr) .commit(); @@ -285,12 +296,15 @@ public int hashCode() { return icebergTable.name().hashCode(); } - private static CaseInsensitiveStringMap addSnapshotId(CaseInsensitiveStringMap options, Long snapshotId) { + private static CaseInsensitiveStringMap addSnapshotId( + CaseInsensitiveStringMap options, Long snapshotId) { if (snapshotId != null) { String snapshotIdFromOptions = options.get(SparkReadOptions.SNAPSHOT_ID); String value = snapshotId.toString(); - Preconditions.checkArgument(snapshotIdFromOptions == null || snapshotIdFromOptions.equals(value), - "Cannot override snapshot ID more than once: %s", snapshotIdFromOptions); + Preconditions.checkArgument( + snapshotIdFromOptions == null || snapshotIdFromOptions.equals(value), + "Cannot override snapshot ID more than once: %s", + snapshotIdFromOptions); Map scanOptions = Maps.newHashMap(); scanOptions.putAll(options.asCaseSensitiveMap()); diff --git a/spark/v3.0/spark/src/main/java/org/apache/iceberg/spark/source/SparkWrite.java b/spark/v3.0/spark/src/main/java/org/apache/iceberg/spark/source/SparkWrite.java index f771281848ff..3ba40bc88582 100644 --- a/spark/v3.0/spark/src/main/java/org/apache/iceberg/spark/source/SparkWrite.java +++ b/spark/v3.0/spark/src/main/java/org/apache/iceberg/spark/source/SparkWrite.java @@ -16,9 +16,19 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.spark.source; +import static org.apache.iceberg.IsolationLevel.SERIALIZABLE; +import static org.apache.iceberg.IsolationLevel.SNAPSHOT; +import static org.apache.iceberg.TableProperties.COMMIT_MAX_RETRY_WAIT_MS; +import static org.apache.iceberg.TableProperties.COMMIT_MAX_RETRY_WAIT_MS_DEFAULT; +import static org.apache.iceberg.TableProperties.COMMIT_MIN_RETRY_WAIT_MS; +import static org.apache.iceberg.TableProperties.COMMIT_MIN_RETRY_WAIT_MS_DEFAULT; +import static org.apache.iceberg.TableProperties.COMMIT_NUM_RETRIES; +import static org.apache.iceberg.TableProperties.COMMIT_NUM_RETRIES_DEFAULT; +import static org.apache.iceberg.TableProperties.COMMIT_TOTAL_RETRY_TIME_MS; +import static org.apache.iceberg.TableProperties.COMMIT_TOTAL_RETRY_TIME_MS_DEFAULT; + import java.io.IOException; import java.util.Arrays; import java.util.Collections; @@ -80,17 +90,6 @@ import org.slf4j.Logger; import org.slf4j.LoggerFactory; -import static org.apache.iceberg.IsolationLevel.SERIALIZABLE; -import static org.apache.iceberg.IsolationLevel.SNAPSHOT; -import static org.apache.iceberg.TableProperties.COMMIT_MAX_RETRY_WAIT_MS; -import static org.apache.iceberg.TableProperties.COMMIT_MAX_RETRY_WAIT_MS_DEFAULT; -import static org.apache.iceberg.TableProperties.COMMIT_MIN_RETRY_WAIT_MS; -import static org.apache.iceberg.TableProperties.COMMIT_MIN_RETRY_WAIT_MS_DEFAULT; -import static org.apache.iceberg.TableProperties.COMMIT_NUM_RETRIES; -import static org.apache.iceberg.TableProperties.COMMIT_NUM_RETRIES_DEFAULT; -import static org.apache.iceberg.TableProperties.COMMIT_TOTAL_RETRY_TIME_MS; -import static org.apache.iceberg.TableProperties.COMMIT_TOTAL_RETRY_TIME_MS_DEFAULT; - class SparkWrite { private static final Logger LOG = LoggerFactory.getLogger(SparkWrite.class); @@ -108,9 +107,14 @@ class SparkWrite { private boolean cleanupOnAbort = true; - SparkWrite(SparkSession spark, Table table, SparkWriteConf writeConf, - LogicalWriteInfo writeInfo, String applicationId, - Schema writeSchema, StructType dsSchema) { + SparkWrite( + SparkSession spark, + Table table, + SparkWriteConf writeConf, + LogicalWriteInfo writeInfo, + String applicationId, + Schema writeSchema, + StructType dsSchema) { this.sparkContext = JavaSparkContext.fromSparkContext(spark.sparkContext()); this.table = table; this.queryId = writeInfo.queryId(); @@ -153,15 +157,21 @@ StreamingWrite asStreamingOverwrite() { } private boolean isWapTable() { - return Boolean.parseBoolean(table.properties().getOrDefault( - TableProperties.WRITE_AUDIT_PUBLISH_ENABLED, TableProperties.WRITE_AUDIT_PUBLISH_ENABLED_DEFAULT)); + return Boolean.parseBoolean( + table + .properties() + .getOrDefault( + TableProperties.WRITE_AUDIT_PUBLISH_ENABLED, + TableProperties.WRITE_AUDIT_PUBLISH_ENABLED_DEFAULT)); } // the writer factory works for both batch and streaming private WriterFactory createWriterFactory() { // broadcast the table metadata as the writer factory will be sent to executors - Broadcast
    tableBroadcast = sparkContext.broadcast(SerializableTableWithSize.copyOf(table)); - return new WriterFactory(tableBroadcast, format, targetFileSize, writeSchema, dsSchema, partitionedFanoutEnabled); + Broadcast<Table>
    tableBroadcast = + sparkContext.broadcast(SerializableTableWithSize.copyOf(table)); + return new WriterFactory( + tableBroadcast, format, targetFileSize, writeSchema, dsSchema, partitionedFanoutEnabled); } private void commitOperation(SnapshotUpdate operation, String description) { @@ -202,24 +212,33 @@ private void abort(WriterCommitMessage[] messages) { Tasks.foreach(files(messages)) .retry(PropertyUtil.propertyAsInt(props, COMMIT_NUM_RETRIES, COMMIT_NUM_RETRIES_DEFAULT)) .exponentialBackoff( - PropertyUtil.propertyAsInt(props, COMMIT_MIN_RETRY_WAIT_MS, COMMIT_MIN_RETRY_WAIT_MS_DEFAULT), - PropertyUtil.propertyAsInt(props, COMMIT_MAX_RETRY_WAIT_MS, COMMIT_MAX_RETRY_WAIT_MS_DEFAULT), - PropertyUtil.propertyAsInt(props, COMMIT_TOTAL_RETRY_TIME_MS, COMMIT_TOTAL_RETRY_TIME_MS_DEFAULT), + PropertyUtil.propertyAsInt( + props, COMMIT_MIN_RETRY_WAIT_MS, COMMIT_MIN_RETRY_WAIT_MS_DEFAULT), + PropertyUtil.propertyAsInt( + props, COMMIT_MAX_RETRY_WAIT_MS, COMMIT_MAX_RETRY_WAIT_MS_DEFAULT), + PropertyUtil.propertyAsInt( + props, COMMIT_TOTAL_RETRY_TIME_MS, COMMIT_TOTAL_RETRY_TIME_MS_DEFAULT), 2.0 /* exponential */) .throwFailureWhenFinished() - .run(file -> { - table.io().deleteFile(file.path().toString()); - }); + .run( + file -> { + table.io().deleteFile(file.path().toString()); + }); } else { - LOG.warn("Skipping cleaning up of data files because Iceberg was unable to determine the final commit state"); + LOG.warn( + "Skipping cleaning up of data files because Iceberg was unable to determine the final commit state"); } } private Iterable files(WriterCommitMessage[] messages) { if (messages.length > 0) { - return Iterables.concat(Iterables.transform(Arrays.asList(messages), message -> message != null ? - ImmutableList.copyOf(((TaskCommit) message).files()) : - ImmutableList.of())); + return Iterables.concat( + Iterables.transform( + Arrays.asList(messages), + message -> + message != null + ? 
ImmutableList.copyOf(((TaskCommit) message).files()) + : ImmutableList.of())); } return ImmutableList.of(); } @@ -274,7 +293,9 @@ public void commit(WriterCommitMessage[] messages) { dynamicOverwrite.addFile(file); } - commitOperation(dynamicOverwrite, String.format("dynamic partition overwrite with %d new data files", numFiles)); + commitOperation( + dynamicOverwrite, + String.format("dynamic partition overwrite with %d new data files", numFiles)); } } @@ -296,7 +317,8 @@ public void commit(WriterCommitMessage[] messages) { overwriteFiles.addFile(file); } - String commitMsg = String.format("overwrite by filter %s with %d new data files", overwriteExpr, numFiles); + String commitMsg = + String.format("overwrite by filter %s with %d new data files", overwriteExpr, numFiles); commitOperation(overwriteFiles, commitMsg); } } @@ -352,9 +374,8 @@ public void commit(WriterCommitMessage[] messages) { } } - private void commitWithSerializableIsolation(OverwriteFiles overwriteFiles, - int numOverwrittenFiles, - int numAddedFiles) { + private void commitWithSerializableIsolation( + OverwriteFiles overwriteFiles, int numOverwrittenFiles, int numAddedFiles) { Long scanSnapshotId = scan.snapshotId(); if (scanSnapshotId != null) { overwriteFiles.validateFromSnapshot(scanSnapshotId); @@ -365,15 +386,15 @@ private void commitWithSerializableIsolation(OverwriteFiles overwriteFiles, overwriteFiles.validateNoConflictingData(); overwriteFiles.validateNoConflictingDeletes(); - String commitMsg = String.format( - "overwrite of %d data files with %d new data files, scanSnapshotId: %d, conflictDetectionFilter: %s", - numOverwrittenFiles, numAddedFiles, scanSnapshotId, conflictDetectionFilter); + String commitMsg = + String.format( + "overwrite of %d data files with %d new data files, scanSnapshotId: %d, conflictDetectionFilter: %s", + numOverwrittenFiles, numAddedFiles, scanSnapshotId, conflictDetectionFilter); commitOperation(overwriteFiles, commitMsg); } - private void commitWithSnapshotIsolation(OverwriteFiles overwriteFiles, - int numOverwrittenFiles, - int numAddedFiles) { + private void commitWithSnapshotIsolation( + OverwriteFiles overwriteFiles, int numOverwrittenFiles, int numAddedFiles) { Long scanSnapshotId = scan.snapshotId(); if (scanSnapshotId != null) { overwriteFiles.validateFromSnapshot(scanSnapshotId); @@ -383,9 +404,10 @@ private void commitWithSnapshotIsolation(OverwriteFiles overwriteFiles, overwriteFiles.conflictDetectionFilter(conflictDetectionFilter); overwriteFiles.validateNoConflictingDeletes(); - String commitMsg = String.format( - "overwrite of %d data files with %d new data files", - numOverwrittenFiles, numAddedFiles); + String commitMsg = + String.format( + "overwrite of %d data files with %d new data files", + numOverwrittenFiles, numAddedFiles); commitOperation(overwriteFiles, commitMsg); } } @@ -504,7 +526,10 @@ public void doCommit(long epochId, WriterCommitMessage[] messages) { overwriteFiles.addFile(file); numFiles++; } - commit(overwriteFiles, epochId, String.format("streaming complete overwrite with %d new data files", numFiles)); + commit( + overwriteFiles, + epochId, + String.format("streaming complete overwrite with %d new data files", numFiles)); } } @@ -546,8 +571,13 @@ private static class WriterFactory implements DataWriterFactory, StreamingDataWr private final StructType dsSchema; private final boolean partitionedFanoutEnabled; - protected WriterFactory(Broadcast
    tableBroadcast, FileFormat format, long targetFileSize, - Schema writeSchema, StructType dsSchema, boolean partitionedFanoutEnabled) { + protected WriterFactory( + Broadcast<Table>
    tableBroadcast, + FileFormat format, + long targetFileSize, + Schema writeSchema, + StructType dsSchema, + boolean partitionedFanoutEnabled) { this.tableBroadcast = tableBroadcast; this.format = format; this.targetFileSize = targetFileSize; @@ -567,21 +597,28 @@ public DataWriter createWriter(int partitionId, long taskId, long e PartitionSpec spec = table.spec(); FileIO io = table.io(); - OutputFileFactory fileFactory = OutputFileFactory.builderFor(table, partitionId, taskId) - .format(format) - .build(); - SparkFileWriterFactory writerFactory = SparkFileWriterFactory.builderFor(table) - .dataFileFormat(format) - .dataSchema(writeSchema) - .dataSparkType(dsSchema) - .build(); + OutputFileFactory fileFactory = + OutputFileFactory.builderFor(table, partitionId, taskId).format(format).build(); + SparkFileWriterFactory writerFactory = + SparkFileWriterFactory.builderFor(table) + .dataFileFormat(format) + .dataSchema(writeSchema) + .dataSparkType(dsSchema) + .build(); if (spec.isUnpartitioned()) { return new UnpartitionedDataWriter(writerFactory, fileFactory, io, spec, targetFileSize); } else { return new PartitionedDataWriter( - writerFactory, fileFactory, io, spec, writeSchema, dsSchema, targetFileSize, partitionedFanoutEnabled); + writerFactory, + fileFactory, + io, + spec, + writeSchema, + dsSchema, + targetFileSize, + partitionedFanoutEnabled); } } } @@ -597,9 +634,14 @@ private static class UnpartitionedDataWriter implements DataWriter private final FileWriter delegate; private final FileIO io; - private UnpartitionedDataWriter(SparkFileWriterFactory writerFactory, OutputFileFactory fileFactory, - FileIO io, PartitionSpec spec, long targetFileSize) { - this.delegate = new RollingDataWriter<>(writerFactory, fileFactory, io, targetFileSize, spec, null); + private UnpartitionedDataWriter( + SparkFileWriterFactory writerFactory, + OutputFileFactory fileFactory, + FileIO io, + PartitionSpec spec, + long targetFileSize) { + this.delegate = + new RollingDataWriter<>(writerFactory, fileFactory, io, targetFileSize, spec, null); this.io = io; } @@ -613,7 +655,7 @@ public WriterCommitMessage commit() throws IOException { close(); DataWriteResult result = delegate.result(); - TaskCommit taskCommit = new TaskCommit(result.dataFiles().toArray(new DataFile[0])); + TaskCommit taskCommit = new TaskCommit(result.dataFiles().toArray(new DataFile[0])); taskCommit.reportOutputMetrics(); return taskCommit; } @@ -639,9 +681,15 @@ private static class PartitionedDataWriter implements DataWriter { private final PartitionKey partitionKey; private final InternalRowWrapper internalRowWrapper; - private PartitionedDataWriter(SparkFileWriterFactory writerFactory, OutputFileFactory fileFactory, - FileIO io, PartitionSpec spec, Schema dataSchema, - StructType dataSparkType, long targetFileSize, boolean fanoutEnabled) { + private PartitionedDataWriter( + SparkFileWriterFactory writerFactory, + OutputFileFactory fileFactory, + FileIO io, + PartitionSpec spec, + Schema dataSchema, + StructType dataSparkType, + long targetFileSize, + boolean fanoutEnabled) { if (fanoutEnabled) { this.delegate = new FanoutDataWriter<>(writerFactory, fileFactory, io, targetFileSize); } else { @@ -664,7 +712,7 @@ public WriterCommitMessage commit() throws IOException { close(); DataWriteResult result = delegate.result(); - TaskCommit taskCommit = new TaskCommit(result.dataFiles().toArray(new DataFile[0])); + TaskCommit taskCommit = new TaskCommit(result.dataFiles().toArray(new DataFile[0])); taskCommit.reportOutputMetrics(); return 
taskCommit; } diff --git a/spark/v3.0/spark/src/main/java/org/apache/iceberg/spark/source/SparkWriteBuilder.java b/spark/v3.0/spark/src/main/java/org/apache/iceberg/spark/source/SparkWriteBuilder.java index 60360847c8c4..fbef4051717e 100644 --- a/spark/v3.0/spark/src/main/java/org/apache/iceberg/spark/source/SparkWriteBuilder.java +++ b/spark/v3.0/spark/src/main/java/org/apache/iceberg/spark/source/SparkWriteBuilder.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.source; import org.apache.iceberg.IsolationLevel; @@ -70,7 +69,8 @@ class SparkWriteBuilder implements WriteBuilder, SupportsDynamicOverwrite, Suppo public WriteBuilder overwriteFiles(Scan scan, IsolationLevel writeIsolationLevel) { Preconditions.checkArgument(scan instanceof SparkMergeScan, "%s is not SparkMergeScan", scan); Preconditions.checkState(!overwriteByFilter, "Cannot overwrite individual files and by filter"); - Preconditions.checkState(!overwriteDynamic, "Cannot overwrite individual files and dynamically"); + Preconditions.checkState( + !overwriteDynamic, "Cannot overwrite individual files and dynamically"); this.overwriteFiles = true; this.mergeScan = (SparkMergeScan) scan; this.isolationLevel = writeIsolationLevel; @@ -79,7 +79,8 @@ public WriteBuilder overwriteFiles(Scan scan, IsolationLevel writeIsolationLevel @Override public WriteBuilder overwriteDynamicPartitions() { - Preconditions.checkState(!overwriteByFilter, "Cannot overwrite dynamically and by filter: %s", overwriteExpr); + Preconditions.checkState( + !overwriteByFilter, "Cannot overwrite dynamically and by filter: %s", overwriteExpr); Preconditions.checkState(!overwriteFiles, "Cannot overwrite individual files and dynamically"); this.overwriteDynamic = true; return this; @@ -87,13 +88,15 @@ public WriteBuilder overwriteDynamicPartitions() { @Override public WriteBuilder overwrite(Filter[] filters) { - Preconditions.checkState(!overwriteFiles, "Cannot overwrite individual files and using filters"); + Preconditions.checkState( + !overwriteFiles, "Cannot overwrite individual files and using filters"); this.overwriteExpr = SparkFilters.convert(filters); if (overwriteExpr == Expressions.alwaysTrue() && "dynamic".equals(overwriteMode)) { // use the write option to override truncating the table. use dynamic overwrite instead. 
this.overwriteDynamic = true; } else { - Preconditions.checkState(!overwriteDynamic, "Cannot overwrite dynamically and by filter: %s", overwriteExpr); + Preconditions.checkState( + !overwriteDynamic, "Cannot overwrite dynamically and by filter: %s", overwriteExpr); this.overwriteByFilter = true; } return this; @@ -102,17 +105,20 @@ public WriteBuilder overwrite(Filter[] filters) { @Override public BatchWrite buildForBatch() { // Validate - Preconditions.checkArgument(handleTimestampWithoutZone || !SparkUtil.hasTimestampWithoutZone(table.schema()), + Preconditions.checkArgument( + handleTimestampWithoutZone || !SparkUtil.hasTimestampWithoutZone(table.schema()), SparkUtil.TIMESTAMP_WITHOUT_TIMEZONE_ERROR); Schema writeSchema = SparkSchemaUtil.convert(table.schema(), dsSchema); - TypeUtil.validateWriteSchema(table.schema(), writeSchema, writeConf.checkNullability(), writeConf.checkOrdering()); + TypeUtil.validateWriteSchema( + table.schema(), writeSchema, writeConf.checkNullability(), writeConf.checkOrdering()); SparkUtil.validatePartitionTransforms(table.spec()); // Get application id String appId = spark.sparkContext().applicationId(); - SparkWrite write = new SparkWrite(spark, table, writeConf, writeInfo, appId, writeSchema, dsSchema); + SparkWrite write = + new SparkWrite(spark, table, writeConf, writeInfo, appId, writeSchema, dsSchema); if (overwriteByFilter) { return write.asOverwriteByFilter(overwriteExpr); } else if (overwriteDynamic) { @@ -127,23 +133,28 @@ public BatchWrite buildForBatch() { @Override public StreamingWrite buildForStreaming() { // Validate - Preconditions.checkArgument(handleTimestampWithoutZone || !SparkUtil.hasTimestampWithoutZone(table.schema()), + Preconditions.checkArgument( + handleTimestampWithoutZone || !SparkUtil.hasTimestampWithoutZone(table.schema()), SparkUtil.TIMESTAMP_WITHOUT_TIMEZONE_ERROR); Schema writeSchema = SparkSchemaUtil.convert(table.schema(), dsSchema); - TypeUtil.validateWriteSchema(table.schema(), writeSchema, writeConf.checkNullability(), writeConf.checkOrdering()); + TypeUtil.validateWriteSchema( + table.schema(), writeSchema, writeConf.checkNullability(), writeConf.checkOrdering()); SparkUtil.validatePartitionTransforms(table.spec()); // Change to streaming write if it is just append - Preconditions.checkState(!overwriteDynamic, - "Unsupported streaming operation: dynamic partition overwrite"); - Preconditions.checkState(!overwriteByFilter || overwriteExpr == Expressions.alwaysTrue(), - "Unsupported streaming operation: overwrite by filter: %s", overwriteExpr); + Preconditions.checkState( + !overwriteDynamic, "Unsupported streaming operation: dynamic partition overwrite"); + Preconditions.checkState( + !overwriteByFilter || overwriteExpr == Expressions.alwaysTrue(), + "Unsupported streaming operation: overwrite by filter: %s", + overwriteExpr); // Get application id String appId = spark.sparkContext().applicationId(); - SparkWrite write = new SparkWrite(spark, table, writeConf, writeInfo, appId, writeSchema, dsSchema); + SparkWrite write = + new SparkWrite(spark, table, writeConf, writeInfo, appId, writeSchema, dsSchema); if (overwriteByFilter) { return write.asStreamingOverwrite(); } else { diff --git a/spark/v3.0/spark/src/main/java/org/apache/iceberg/spark/source/StagedSparkTable.java b/spark/v3.0/spark/src/main/java/org/apache/iceberg/spark/source/StagedSparkTable.java index 2e018cb09496..b92c02d2b536 100644 --- a/spark/v3.0/spark/src/main/java/org/apache/iceberg/spark/source/StagedSparkTable.java +++ 
b/spark/v3.0/spark/src/main/java/org/apache/iceberg/spark/source/StagedSparkTable.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.source; import org.apache.iceberg.Transaction; diff --git a/spark/v3.0/spark/src/main/java/org/apache/iceberg/spark/source/Stats.java b/spark/v3.0/spark/src/main/java/org/apache/iceberg/spark/source/Stats.java index 939b07a0af61..ddf6ca834d9b 100644 --- a/spark/v3.0/spark/src/main/java/org/apache/iceberg/spark/source/Stats.java +++ b/spark/v3.0/spark/src/main/java/org/apache/iceberg/spark/source/Stats.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.source; import java.util.OptionalLong; diff --git a/spark/v3.0/spark/src/main/java/org/apache/iceberg/spark/source/StreamingOffset.java b/spark/v3.0/spark/src/main/java/org/apache/iceberg/spark/source/StreamingOffset.java index 64277ecf3be5..f2088deb1ee3 100644 --- a/spark/v3.0/spark/src/main/java/org/apache/iceberg/spark/source/StreamingOffset.java +++ b/spark/v3.0/spark/src/main/java/org/apache/iceberg/spark/source/StreamingOffset.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.source; import com.fasterxml.jackson.core.JsonGenerator; @@ -47,10 +46,10 @@ class StreamingOffset extends Offset { * An implementation of Spark Structured Streaming Offset, to track the current processed files of * Iceberg table. * - * @param snapshotId The current processed snapshot id. - * @param position The position of last scanned file in snapshot. - * @param scanAllFiles whether to scan all files in a snapshot; for example, to read - * all data when starting a stream. + * @param snapshotId The current processed snapshot id. + * @param position The position of last scanned file in snapshot. + * @param scanAllFiles whether to scan all files in a snapshot; for example, to read all data when + * starting a stream. */ StreamingOffset(long snapshotId, long position, boolean scanAllFiles) { this.snapshotId = snapshotId; @@ -65,7 +64,8 @@ static StreamingOffset fromJson(String json) { JsonNode node = JsonUtil.mapper().readValue(json, JsonNode.class); return fromJsonNode(node); } catch (IOException e) { - throw new UncheckedIOException(String.format("Failed to parse StreamingOffset from JSON string %s", json), e); + throw new UncheckedIOException( + String.format("Failed to parse StreamingOffset from JSON string %s", json), e); } } @@ -118,9 +118,9 @@ boolean shouldScanAllFiles() { public boolean equals(Object obj) { if (obj instanceof StreamingOffset) { StreamingOffset offset = (StreamingOffset) obj; - return offset.snapshotId == snapshotId && - offset.position == position && - offset.scanAllFiles == scanAllFiles; + return offset.snapshotId == snapshotId + && offset.position == position + && offset.scanAllFiles == scanAllFiles; } else { return false; } @@ -133,17 +133,20 @@ public int hashCode() { @Override public String toString() { - return String.format("Streaming Offset[%d: position (%d) scan_all_files (%b)]", - snapshotId, position, scanAllFiles); + return String.format( + "Streaming Offset[%d: position (%d) scan_all_files (%b)]", + snapshotId, position, scanAllFiles); } private static StreamingOffset fromJsonNode(JsonNode node) { // The version of StreamingOffset. The offset was created with a version number // used to validate when deserializing from json string. 
int version = JsonUtil.getInt(VERSION, node); - Preconditions.checkArgument(version == CURR_VERSION, + Preconditions.checkArgument( + version == CURR_VERSION, "This version of Iceberg source only supports version %s. Version %s is not supported.", - CURR_VERSION, version); + CURR_VERSION, + version); long snapshotId = JsonUtil.getLong(SNAPSHOT_ID, node); int position = JsonUtil.getInt(POSITION, node); diff --git a/spark/v3.0/spark/src/main/java/org/apache/iceberg/spark/source/StructInternalRow.java b/spark/v3.0/spark/src/main/java/org/apache/iceberg/spark/source/StructInternalRow.java index a2288ef3edd7..3c7ebabeab3d 100644 --- a/spark/v3.0/spark/src/main/java/org/apache/iceberg/spark/source/StructInternalRow.java +++ b/spark/v3.0/spark/src/main/java/org/apache/iceberg/spark/source/StructInternalRow.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.source; import java.math.BigDecimal; @@ -128,7 +127,8 @@ public int getInt(int ordinal) { } else if (integer instanceof LocalDate) { return (int) ((LocalDate) integer).toEpochDay(); } else { - throw new IllegalStateException("Unknown type for int field. Type name: " + integer.getClass().getName()); + throw new IllegalStateException( + "Unknown type for int field. Type name: " + integer.getClass().getName()); } } @@ -143,7 +143,8 @@ public long getLong(int ordinal) { } else if (longVal instanceof LocalDate) { return ((LocalDate) longVal).toEpochDay(); } else { - throw new IllegalStateException("Unknown type for long field. Type name: " + longVal.getClass().getName()); + throw new IllegalStateException( + "Unknown type for long field. Type name: " + longVal.getClass().getName()); } } @@ -190,7 +191,8 @@ private byte[] getBinaryInternal(int ordinal) { } else if (bytes instanceof byte[]) { return (byte[]) bytes; } else { - throw new IllegalStateException("Unknown type for binary field. Type name: " + bytes.getClass().getName()); + throw new IllegalStateException( + "Unknown type for binary field. 
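// Aside, not part of the diff: for orientation, a serialized offset handled by this parser is
// expected to look roughly like the JSON below. The key spellings are assumptions based on the
// VERSION, SNAPSHOT_ID and POSITION constants plus a scan-all-files flag; only the overall
// structure is implied by the code shown here.
//   {"version":1,"snapshot_id":8109788862898980875,"position":2,"scan_all_files":false}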
Type name: " + bytes.getClass().getName()); } } @@ -206,8 +208,7 @@ public InternalRow getStruct(int ordinal, int numFields) { private InternalRow getStructInternal(int ordinal, int numFields) { return new StructInternalRow( - type.fields().get(ordinal).type().asStructType(), - struct.get(ordinal, StructLike.class)); + type.fields().get(ordinal).type().asStructType(), struct.get(ordinal, StructLike.class)); } @Override @@ -227,7 +228,8 @@ public MapData getMap(int ordinal) { } private MapData getMapInternal(int ordinal) { - return mapToMapData(type.fields().get(ordinal).type().asMapType(), struct.get(ordinal, Map.class)); + return mapToMapData( + type.fields().get(ordinal).type().asMapType(), struct.get(ordinal, Map.class)); } @Override @@ -292,31 +294,52 @@ private ArrayData collectionToArrayData(Type elementType, Collection values) case DOUBLE: return fillArray(values, array -> (pos, value) -> array[pos] = value); case STRING: - return fillArray(values, array -> - (BiConsumer) (pos, seq) -> array[pos] = UTF8String.fromString(seq.toString())); + return fillArray( + values, + array -> + (BiConsumer) + (pos, seq) -> array[pos] = UTF8String.fromString(seq.toString())); case FIXED: case BINARY: - return fillArray(values, array -> - (BiConsumer) (pos, buf) -> array[pos] = ByteBuffers.toByteArray(buf)); + return fillArray( + values, + array -> + (BiConsumer) + (pos, buf) -> array[pos] = ByteBuffers.toByteArray(buf)); case DECIMAL: - return fillArray(values, array -> - (BiConsumer) (pos, dec) -> array[pos] = Decimal.apply(dec)); + return fillArray( + values, + array -> + (BiConsumer) (pos, dec) -> array[pos] = Decimal.apply(dec)); case STRUCT: - return fillArray(values, array -> (BiConsumer) (pos, tuple) -> - array[pos] = new StructInternalRow(elementType.asStructType(), tuple)); + return fillArray( + values, + array -> + (BiConsumer) + (pos, tuple) -> + array[pos] = new StructInternalRow(elementType.asStructType(), tuple)); case LIST: - return fillArray(values, array -> (BiConsumer>) (pos, list) -> - array[pos] = collectionToArrayData(elementType.asListType().elementType(), list)); + return fillArray( + values, + array -> + (BiConsumer>) + (pos, list) -> + array[pos] = + collectionToArrayData(elementType.asListType().elementType(), list)); case MAP: - return fillArray(values, array -> (BiConsumer>) (pos, map) -> - array[pos] = mapToMapData(elementType.asMapType(), map)); + return fillArray( + values, + array -> + (BiConsumer>) + (pos, map) -> array[pos] = mapToMapData(elementType.asMapType(), map)); default: throw new UnsupportedOperationException("Unsupported array element type: " + elementType); } } @SuppressWarnings("unchecked") - private GenericArrayData fillArray(Collection values, Function> makeSetter) { + private GenericArrayData fillArray( + Collection values, Function> makeSetter) { Object[] array = new Object[values.size()]; BiConsumer setter = makeSetter.apply(array); diff --git a/spark/v3.0/spark/src/main/java/org/apache/spark/sql/catalyst/analysis/NoSuchProcedureException.java b/spark/v3.0/spark/src/main/java/org/apache/spark/sql/catalyst/analysis/NoSuchProcedureException.java index 1c83396f430b..f5ba22a062b0 100644 --- a/spark/v3.0/spark/src/main/java/org/apache/spark/sql/catalyst/analysis/NoSuchProcedureException.java +++ b/spark/v3.0/spark/src/main/java/org/apache/spark/sql/catalyst/analysis/NoSuchProcedureException.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.spark.sql.catalyst.analysis; import org.apache.spark.sql.AnalysisException; @@ -25,6 +24,11 @@ public class NoSuchProcedureException extends AnalysisException { public NoSuchProcedureException(Identifier ident) { - super("Procedure " + ident + " not found", Option.empty(), Option.empty(), Option.empty(), Option.empty()); + super( + "Procedure " + ident + " not found", + Option.empty(), + Option.empty(), + Option.empty(), + Option.empty()); } } diff --git a/spark/v3.0/spark/src/main/java/org/apache/spark/sql/connector/iceberg/catalog/ExtendedSupportsDelete.java b/spark/v3.0/spark/src/main/java/org/apache/spark/sql/connector/iceberg/catalog/ExtendedSupportsDelete.java index e59bfe209f6b..a5f40f04c280 100644 --- a/spark/v3.0/spark/src/main/java/org/apache/spark/sql/connector/iceberg/catalog/ExtendedSupportsDelete.java +++ b/spark/v3.0/spark/src/main/java/org/apache/spark/sql/connector/iceberg/catalog/ExtendedSupportsDelete.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.spark.sql.connector.iceberg.catalog; import org.apache.spark.sql.connector.catalog.SupportsDelete; @@ -25,14 +24,15 @@ // this should be part of SupportsDelete when merged upstream public interface ExtendedSupportsDelete extends SupportsDelete { /** - * Checks if it is possible to delete data from a data source table that matches filter expressions. - *
<p>
    - * Rows should be deleted from the data source iff all of the filter expressions match. That is, the - * expressions must be interpreted as a set of filters that are ANDed together. - * <p>
    - * Spark will call this method to check if the delete is possible without significant effort. - * Otherwise, Spark will try to rewrite the delete operation if the data source table - * supports row-level operations. + * Checks if it is possible to delete data from a data source table that matches filter + * expressions. + * + * <p>
    Rows should be deleted from the data source iff all of the filter expressions match. That + * is, the expressions must be interpreted as a set of filters that are ANDed together. + * + * <p>
    Spark will call this method to check if the delete is possible without significant effort. + * Otherwise, Spark will try to rewrite the delete operation if the data source table supports + * row-level operations. * * @param filters filter expressions, used to select rows to delete when all expressions match * @return true if the delete operation can be performed diff --git a/spark/v3.0/spark/src/main/java/org/apache/spark/sql/connector/iceberg/catalog/Procedure.java b/spark/v3.0/spark/src/main/java/org/apache/spark/sql/connector/iceberg/catalog/Procedure.java index 8f7a70b9f9fc..11f215ba040a 100644 --- a/spark/v3.0/spark/src/main/java/org/apache/spark/sql/connector/iceberg/catalog/Procedure.java +++ b/spark/v3.0/spark/src/main/java/org/apache/spark/sql/connector/iceberg/catalog/Procedure.java @@ -16,44 +16,34 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.spark.sql.connector.iceberg.catalog; import org.apache.spark.sql.catalyst.InternalRow; import org.apache.spark.sql.types.StructType; -/** - * An interface representing a stored procedure available for execution. - */ +/** An interface representing a stored procedure available for execution. */ public interface Procedure { - /** - * Returns the input parameters of this procedure. - */ + /** Returns the input parameters of this procedure. */ ProcedureParameter[] parameters(); - /** - * Returns the type of rows produced by this procedure. - */ + /** Returns the type of rows produced by this procedure. */ StructType outputType(); /** * Executes this procedure. - *
<p>
    - * Spark will align the provided arguments according to the input parameters - * defined in {@link #parameters()} either by position or by name before execution. - * <p>
    - * Implementations may provide a summary of execution by returning one or many rows - * as a result. The schema of output rows must match the defined output type - * in {@link #outputType()}. + * <p>
    Spark will align the provided arguments according to the input parameters defined in {@link + * #parameters()} either by position or by name before execution. + * <p>
    Implementations may provide a summary of execution by returning one or many rows as a + * result. The schema of output rows must match the defined output type in {@link #outputType()}. * * @param args input arguments * @return the result of executing this procedure with the given arguments */ InternalRow[] call(InternalRow args); - /** - * Returns the description of this procedure. - */ + /** Returns the description of this procedure. */ default String description() { return this.getClass().toString(); } diff --git a/spark/v3.0/spark/src/main/java/org/apache/spark/sql/connector/iceberg/catalog/ProcedureCatalog.java b/spark/v3.0/spark/src/main/java/org/apache/spark/sql/connector/iceberg/catalog/ProcedureCatalog.java index 314bd659460e..2cee97ee5938 100644 --- a/spark/v3.0/spark/src/main/java/org/apache/spark/sql/connector/iceberg/catalog/ProcedureCatalog.java +++ b/spark/v3.0/spark/src/main/java/org/apache/spark/sql/connector/iceberg/catalog/ProcedureCatalog.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.spark.sql.connector.iceberg.catalog; import org.apache.spark.sql.catalyst.analysis.NoSuchProcedureException; @@ -25,9 +24,9 @@ /** * A catalog API for working with stored procedures. - *
<p>
    - * Implementations should implement this interface if they expose stored procedures that - * can be called via CALL statements. + * <p>
    Implementations should implement this interface if they expose stored procedures that can be + * called via CALL statements. */ public interface ProcedureCatalog extends CatalogPlugin { /** diff --git a/spark/v3.0/spark/src/main/java/org/apache/spark/sql/connector/iceberg/catalog/ProcedureParameter.java b/spark/v3.0/spark/src/main/java/org/apache/spark/sql/connector/iceberg/catalog/ProcedureParameter.java index b341dc1e3282..e1e84b2597f3 100644 --- a/spark/v3.0/spark/src/main/java/org/apache/spark/sql/connector/iceberg/catalog/ProcedureParameter.java +++ b/spark/v3.0/spark/src/main/java/org/apache/spark/sql/connector/iceberg/catalog/ProcedureParameter.java @@ -16,14 +16,11 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.spark.sql.connector.iceberg.catalog; import org.apache.spark.sql.types.DataType; -/** - * An input parameter of a {@link Procedure stored procedure}. - */ +/** An input parameter of a {@link Procedure stored procedure}. */ public interface ProcedureParameter { /** @@ -48,18 +45,12 @@ static ProcedureParameter optional(String name, DataType dataType) { return new ProcedureParameterImpl(name, dataType, false); } - /** - * Returns the name of this parameter. - */ + /** Returns the name of this parameter. */ String name(); - /** - * Returns the type of this parameter. - */ + /** Returns the type of this parameter. */ DataType dataType(); - /** - * Returns true if this parameter is required. - */ + /** Returns true if this parameter is required. */ boolean required(); } diff --git a/spark/v3.0/spark/src/main/java/org/apache/spark/sql/connector/iceberg/catalog/ProcedureParameterImpl.java b/spark/v3.0/spark/src/main/java/org/apache/spark/sql/connector/iceberg/catalog/ProcedureParameterImpl.java index cea1e80f4051..c59951e24330 100644 --- a/spark/v3.0/spark/src/main/java/org/apache/spark/sql/connector/iceberg/catalog/ProcedureParameterImpl.java +++ b/spark/v3.0/spark/src/main/java/org/apache/spark/sql/connector/iceberg/catalog/ProcedureParameterImpl.java @@ -16,15 +16,12 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.spark.sql.connector.iceberg.catalog; import java.util.Objects; import org.apache.spark.sql.types.DataType; -/** - * A {@link ProcedureParameter} implementation. - */ +/** A {@link ProcedureParameter} implementation. 
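// Aside, not part of the diff: a minimal sketch of a stored procedure built against the
// Procedure and ProcedureParameter interfaces above. The class name, parameter names and
// output column are illustrative; ProcedureParameter.required is assumed to exist alongside
// the optional(...) factory shown above.
import org.apache.spark.sql.catalyst.InternalRow;
import org.apache.spark.sql.catalyst.expressions.GenericInternalRow;
import org.apache.spark.sql.connector.iceberg.catalog.Procedure;
import org.apache.spark.sql.connector.iceberg.catalog.ProcedureParameter;
import org.apache.spark.sql.types.DataTypes;
import org.apache.spark.sql.types.Metadata;
import org.apache.spark.sql.types.StructField;
import org.apache.spark.sql.types.StructType;
import org.apache.spark.unsafe.types.UTF8String;

class EchoTableProcedure implements Procedure {
  private static final ProcedureParameter[] PARAMETERS =
      new ProcedureParameter[] {
        ProcedureParameter.required("table", DataTypes.StringType),
        ProcedureParameter.optional("note", DataTypes.StringType)
      };

  private static final StructType OUTPUT_TYPE =
      new StructType(
          new StructField[] {
            new StructField("echoed_table", DataTypes.StringType, false, Metadata.empty())
          });

  @Override
  public ProcedureParameter[] parameters() {
    return PARAMETERS;
  }

  @Override
  public StructType outputType() {
    return OUTPUT_TYPE;
  }

  @Override
  public InternalRow[] call(InternalRow args) {
    // Spark aligns args to parameters() by position or name, so ordinal 0 is the table name
    UTF8String tableName = args.getUTF8String(0);
    return new InternalRow[] {new GenericInternalRow(new Object[] {tableName})};
  }

  @Override
  public String description() {
    return "Echoes the table name it was called with";
  }
}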
*/ class ProcedureParameterImpl implements ProcedureParameter { private final String name; private final DataType dataType; @@ -60,9 +57,9 @@ public boolean equals(Object other) { } ProcedureParameterImpl that = (ProcedureParameterImpl) other; - return required == that.required && - Objects.equals(name, that.name) && - Objects.equals(dataType, that.dataType); + return required == that.required + && Objects.equals(name, that.name) + && Objects.equals(dataType, that.dataType); } @Override @@ -72,6 +69,7 @@ public int hashCode() { @Override public String toString() { - return String.format("ProcedureParameter(name='%s', type=%s, required=%b)", name, dataType, required); + return String.format( + "ProcedureParameter(name='%s', type=%s, required=%b)", name, dataType, required); } } diff --git a/spark/v3.0/spark/src/main/java/org/apache/spark/sql/connector/iceberg/catalog/SupportsMerge.java b/spark/v3.0/spark/src/main/java/org/apache/spark/sql/connector/iceberg/catalog/SupportsMerge.java index d36fe926ce3d..49d9047f26ad 100644 --- a/spark/v3.0/spark/src/main/java/org/apache/spark/sql/connector/iceberg/catalog/SupportsMerge.java +++ b/spark/v3.0/spark/src/main/java/org/apache/spark/sql/connector/iceberg/catalog/SupportsMerge.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.spark.sql.connector.iceberg.catalog; import org.apache.spark.sql.connector.catalog.Table; @@ -25,14 +24,15 @@ /** * A mix-in interface for Table to indicate that it supports row-level operations. - *
<p>
    - * This adds {@link #newMergeBuilder(String, LogicalWriteInfo)} that is used to create a scan and + * <p>
    This adds {@link #newMergeBuilder(String, LogicalWriteInfo)} that is used to create a scan and * a write for a row-level operation. */ public interface SupportsMerge extends Table { /** - * Returns a {@link MergeBuilder} which can be used to create both a scan and a write for a row-level - * operation. Spark will call this method to configure each data source row-level operation. + * Returns a {@link MergeBuilder} which can be used to create both a scan and a write for a + * row-level operation. Spark will call this method to configure each data source row-level + * operation. * * @param info write info * @return a merge builder diff --git a/spark/v3.0/spark/src/main/java/org/apache/spark/sql/connector/iceberg/distributions/ClusteredDistribution.java b/spark/v3.0/spark/src/main/java/org/apache/spark/sql/connector/iceberg/distributions/ClusteredDistribution.java index 942ffac3d092..fc0b77dc6373 100644 --- a/spark/v3.0/spark/src/main/java/org/apache/spark/sql/connector/iceberg/distributions/ClusteredDistribution.java +++ b/spark/v3.0/spark/src/main/java/org/apache/spark/sql/connector/iceberg/distributions/ClusteredDistribution.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.spark.sql.connector.iceberg.distributions; import org.apache.spark.annotation.Experimental; @@ -30,8 +29,6 @@ */ @Experimental public interface ClusteredDistribution extends Distribution { - /** - * Returns clustering expressions. - */ + /** Returns clustering expressions. */ Expression[] clustering(); } diff --git a/spark/v3.0/spark/src/main/java/org/apache/spark/sql/connector/iceberg/distributions/Distribution.java b/spark/v3.0/spark/src/main/java/org/apache/spark/sql/connector/iceberg/distributions/Distribution.java index 0f3fdbf47c0a..dec9674aaa18 100644 --- a/spark/v3.0/spark/src/main/java/org/apache/spark/sql/connector/iceberg/distributions/Distribution.java +++ b/spark/v3.0/spark/src/main/java/org/apache/spark/sql/connector/iceberg/distributions/Distribution.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.spark.sql.connector.iceberg.distributions; import org.apache.spark.annotation.Experimental; @@ -27,6 +26,4 @@ * @since 3.2.0 */ @Experimental -public interface Distribution { -} - +public interface Distribution {} diff --git a/spark/v3.0/spark/src/main/java/org/apache/spark/sql/connector/iceberg/distributions/Distributions.java b/spark/v3.0/spark/src/main/java/org/apache/spark/sql/connector/iceberg/distributions/Distributions.java index 6e25fe09dc67..00860e11edc8 100644 --- a/spark/v3.0/spark/src/main/java/org/apache/spark/sql/connector/iceberg/distributions/Distributions.java +++ b/spark/v3.0/spark/src/main/java/org/apache/spark/sql/connector/iceberg/distributions/Distributions.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.spark.sql.connector.iceberg.distributions; import org.apache.spark.annotation.Experimental; @@ -33,12 +32,9 @@ */ @Experimental public class Distributions { - private Distributions() { - } + private Distributions() {} - /** - * Creates a distribution where no promises are made about co-location of data. - */ + /** Creates a distribution where no promises are made about co-location of data. 
*/ public static UnspecifiedDistribution unspecified() { return new UnspecifiedDistributionImpl(); } @@ -52,8 +48,8 @@ public static ClusteredDistribution clustered(Expression[] clustering) { } /** - * Creates a distribution where tuples have been ordered across partitions according - * to ordering expressions, but not necessarily within a given partition. + * Creates a distribution where tuples have been ordered across partitions according to ordering + * expressions, but not necessarily within a given partition. */ public static OrderedDistribution ordered(SortOrder[] ordering) { return new OrderedDistributionImpl(ordering); diff --git a/spark/v3.0/spark/src/main/java/org/apache/spark/sql/connector/iceberg/distributions/OrderedDistribution.java b/spark/v3.0/spark/src/main/java/org/apache/spark/sql/connector/iceberg/distributions/OrderedDistribution.java index 09a25f37b627..ba646ee46a45 100644 --- a/spark/v3.0/spark/src/main/java/org/apache/spark/sql/connector/iceberg/distributions/OrderedDistribution.java +++ b/spark/v3.0/spark/src/main/java/org/apache/spark/sql/connector/iceberg/distributions/OrderedDistribution.java @@ -16,22 +16,19 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.spark.sql.connector.iceberg.distributions; import org.apache.spark.annotation.Experimental; import org.apache.spark.sql.connector.iceberg.expressions.SortOrder; /** - * A distribution where tuples have been ordered across partitions according - * to ordering expressions, but not necessarily within a given partition. + * A distribution where tuples have been ordered across partitions according to ordering + * expressions, but not necessarily within a given partition. * * @since 3.2.0 */ @Experimental public interface OrderedDistribution extends Distribution { - /** - * Returns ordering expressions. - */ + /** Returns ordering expressions. */ SortOrder[] ordering(); } diff --git a/spark/v3.0/spark/src/main/java/org/apache/spark/sql/connector/iceberg/distributions/UnspecifiedDistribution.java b/spark/v3.0/spark/src/main/java/org/apache/spark/sql/connector/iceberg/distributions/UnspecifiedDistribution.java index 1bdcfc1ebbaf..0e88218c6c12 100644 --- a/spark/v3.0/spark/src/main/java/org/apache/spark/sql/connector/iceberg/distributions/UnspecifiedDistribution.java +++ b/spark/v3.0/spark/src/main/java/org/apache/spark/sql/connector/iceberg/distributions/UnspecifiedDistribution.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.spark.sql.connector.iceberg.distributions; import org.apache.spark.annotation.Experimental; diff --git a/spark/v3.0/spark/src/main/java/org/apache/spark/sql/connector/iceberg/distributions/impl/ClusterDistributionImpl.java b/spark/v3.0/spark/src/main/java/org/apache/spark/sql/connector/iceberg/distributions/impl/ClusterDistributionImpl.java index d05274668cce..3aadab1defa9 100644 --- a/spark/v3.0/spark/src/main/java/org/apache/spark/sql/connector/iceberg/distributions/impl/ClusterDistributionImpl.java +++ b/spark/v3.0/spark/src/main/java/org/apache/spark/sql/connector/iceberg/distributions/impl/ClusterDistributionImpl.java @@ -16,8 +16,6 @@ * specific language governing permissions and limitations * under the License. 
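// Aside, not part of the diff: a minimal sketch of building the distributions exposed by the
// Distributions factory above, as a connector might when describing how rows should be
// co-located for a write. The "date" column is illustrative; Expressions.identity is the
// standard Spark 3 transform factory.
import org.apache.spark.sql.connector.expressions.Expression;
import org.apache.spark.sql.connector.expressions.Expressions;
import org.apache.spark.sql.connector.iceberg.distributions.Distribution;
import org.apache.spark.sql.connector.iceberg.distributions.Distributions;

class DistributionSketch {
  // cluster rows so that all rows sharing the same date land in the same task
  static Distribution clusterByDate() {
    Expression[] clustering = new Expression[] {Expressions.identity("date")};
    return Distributions.clustered(clustering);
  }

  // make no promises about co-location at all
  static Distribution anyPlacement() {
    return Distributions.unspecified();
  }
}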
*/ - - package org.apache.spark.sql.connector.iceberg.distributions.impl; import org.apache.spark.sql.connector.expressions.Expression; diff --git a/spark/v3.0/spark/src/main/java/org/apache/spark/sql/connector/iceberg/distributions/impl/OrderedDistributionImpl.java b/spark/v3.0/spark/src/main/java/org/apache/spark/sql/connector/iceberg/distributions/impl/OrderedDistributionImpl.java index 773ace0692ba..6ae7afa49003 100644 --- a/spark/v3.0/spark/src/main/java/org/apache/spark/sql/connector/iceberg/distributions/impl/OrderedDistributionImpl.java +++ b/spark/v3.0/spark/src/main/java/org/apache/spark/sql/connector/iceberg/distributions/impl/OrderedDistributionImpl.java @@ -16,8 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - - package org.apache.spark.sql.connector.iceberg.distributions.impl; import org.apache.spark.sql.connector.iceberg.distributions.OrderedDistribution; diff --git a/spark/v3.0/spark/src/main/java/org/apache/spark/sql/connector/iceberg/distributions/impl/UnspecifiedDistributionImpl.java b/spark/v3.0/spark/src/main/java/org/apache/spark/sql/connector/iceberg/distributions/impl/UnspecifiedDistributionImpl.java index d69c912ff367..3944b7289c59 100644 --- a/spark/v3.0/spark/src/main/java/org/apache/spark/sql/connector/iceberg/distributions/impl/UnspecifiedDistributionImpl.java +++ b/spark/v3.0/spark/src/main/java/org/apache/spark/sql/connector/iceberg/distributions/impl/UnspecifiedDistributionImpl.java @@ -16,12 +16,8 @@ * specific language governing permissions and limitations * under the License. */ - - package org.apache.spark.sql.connector.iceberg.distributions.impl; import org.apache.spark.sql.connector.iceberg.distributions.UnspecifiedDistribution; -public class UnspecifiedDistributionImpl implements UnspecifiedDistribution { - -} +public class UnspecifiedDistributionImpl implements UnspecifiedDistribution {} diff --git a/spark/v3.0/spark/src/main/java/org/apache/spark/sql/connector/iceberg/expressions/NullOrdering.java b/spark/v3.0/spark/src/main/java/org/apache/spark/sql/connector/iceberg/expressions/NullOrdering.java index b7c6e1c5f414..712ea0b2bb91 100644 --- a/spark/v3.0/spark/src/main/java/org/apache/spark/sql/connector/iceberg/expressions/NullOrdering.java +++ b/spark/v3.0/spark/src/main/java/org/apache/spark/sql/connector/iceberg/expressions/NullOrdering.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.spark.sql.connector.iceberg.expressions; import org.apache.spark.annotation.Experimental; @@ -28,7 +27,8 @@ */ @Experimental public enum NullOrdering { - NULLS_FIRST, NULLS_LAST; + NULLS_FIRST, + NULLS_LAST; @Override public String toString() { diff --git a/spark/v3.0/spark/src/main/java/org/apache/spark/sql/connector/iceberg/expressions/SortDirection.java b/spark/v3.0/spark/src/main/java/org/apache/spark/sql/connector/iceberg/expressions/SortDirection.java index 211702548038..cb3d8cac93b1 100644 --- a/spark/v3.0/spark/src/main/java/org/apache/spark/sql/connector/iceberg/expressions/SortDirection.java +++ b/spark/v3.0/spark/src/main/java/org/apache/spark/sql/connector/iceberg/expressions/SortDirection.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.spark.sql.connector.iceberg.expressions; import org.apache.spark.annotation.Experimental; @@ -28,7 +27,8 @@ */ @Experimental public enum SortDirection { - ASCENDING, DESCENDING; + ASCENDING, + DESCENDING; @Override public String toString() { diff --git a/spark/v3.0/spark/src/main/java/org/apache/spark/sql/connector/iceberg/expressions/SortOrder.java b/spark/v3.0/spark/src/main/java/org/apache/spark/sql/connector/iceberg/expressions/SortOrder.java index d3345d4becec..d31fed59daa3 100644 --- a/spark/v3.0/spark/src/main/java/org/apache/spark/sql/connector/iceberg/expressions/SortOrder.java +++ b/spark/v3.0/spark/src/main/java/org/apache/spark/sql/connector/iceberg/expressions/SortOrder.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.spark.sql.connector.iceberg.expressions; import org.apache.spark.annotation.Experimental; @@ -29,18 +28,12 @@ */ @Experimental public interface SortOrder extends Expression { - /** - * Returns the sort expression. - */ + /** Returns the sort expression. */ Expression expression(); - /** - * Returns the sort direction. - */ + /** Returns the sort direction. */ SortDirection direction(); - /** - * Returns the null ordering. - */ + /** Returns the null ordering. */ NullOrdering nullOrdering(); } diff --git a/spark/v3.0/spark/src/main/java/org/apache/spark/sql/connector/iceberg/read/SupportsFileFilter.java b/spark/v3.0/spark/src/main/java/org/apache/spark/sql/connector/iceberg/read/SupportsFileFilter.java index f0b28ae7a4d4..0eb46f401a92 100644 --- a/spark/v3.0/spark/src/main/java/org/apache/spark/sql/connector/iceberg/read/SupportsFileFilter.java +++ b/spark/v3.0/spark/src/main/java/org/apache/spark/sql/connector/iceberg/read/SupportsFileFilter.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.spark.sql.connector.iceberg.read; import java.util.Set; diff --git a/spark/v3.0/spark/src/main/java/org/apache/spark/sql/connector/iceberg/write/MergeBuilder.java b/spark/v3.0/spark/src/main/java/org/apache/spark/sql/connector/iceberg/write/MergeBuilder.java index 8b1224b3628b..edb17057d832 100644 --- a/spark/v3.0/spark/src/main/java/org/apache/spark/sql/connector/iceberg/write/MergeBuilder.java +++ b/spark/v3.0/spark/src/main/java/org/apache/spark/sql/connector/iceberg/write/MergeBuilder.java @@ -16,15 +16,12 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.spark.sql.connector.iceberg.write; import org.apache.spark.sql.connector.read.ScanBuilder; import org.apache.spark.sql.connector.write.WriteBuilder; -/** - * An interface for building a scan and a write for a row-level operation. - */ +/** An interface for building a scan and a write for a row-level operation. */ public interface MergeBuilder { /** * Creates a scan builder for a row-level operation. diff --git a/spark/v3.0/spark/src/test/java/org/apache/iceberg/KryoHelpers.java b/spark/v3.0/spark/src/test/java/org/apache/iceberg/KryoHelpers.java index ee0f0a73959a..6d88aaa11813 100644 --- a/spark/v3.0/spark/src/test/java/org/apache/iceberg/KryoHelpers.java +++ b/spark/v3.0/spark/src/test/java/org/apache/iceberg/KryoHelpers.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
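// Aside, not part of the diff: a minimal sketch of the SortOrder interface shown above,
// implemented as an anonymous class that orders by a hypothetical "id" column with nulls
// first. A real connector would typically ship a small concrete implementation instead.
import org.apache.spark.sql.connector.expressions.Expression;
import org.apache.spark.sql.connector.expressions.Expressions;
import org.apache.spark.sql.connector.iceberg.expressions.NullOrdering;
import org.apache.spark.sql.connector.iceberg.expressions.SortDirection;
import org.apache.spark.sql.connector.iceberg.expressions.SortOrder;

class SortOrderSketch {
  static SortOrder ascendingById() {
    return new SortOrder() {
      @Override
      public Expression expression() {
        return Expressions.column("id");
      }

      @Override
      public SortDirection direction() {
        return SortDirection.ASCENDING;
      }

      @Override
      public NullOrdering nullOrdering() {
        return NullOrdering.NULLS_FIRST;
      }

      // human-readable form expected by the Spark Expression super-interface
      public String describe() {
        return "id ASC NULLS FIRST";
      }
    };
  }
}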
*/ - package org.apache.iceberg; import com.esotericsoftware.kryo.Kryo; @@ -32,8 +31,7 @@ public class KryoHelpers { - private KryoHelpers() { - } + private KryoHelpers() {} @SuppressWarnings("unchecked") public static T roundTripSerialize(T obj) throws IOException { @@ -45,7 +43,8 @@ public static T roundTripSerialize(T obj) throws IOException { kryo.writeClassAndObject(out, obj); } - try (Input in = new Input(new ObjectInputStream(new ByteArrayInputStream(bytes.toByteArray())))) { + try (Input in = + new Input(new ObjectInputStream(new ByteArrayInputStream(bytes.toByteArray())))) { return (T) kryo.readClassAndObject(in); } } diff --git a/spark/v3.0/spark/src/test/java/org/apache/iceberg/TaskCheckHelper.java b/spark/v3.0/spark/src/test/java/org/apache/iceberg/TaskCheckHelper.java index 99396647ee3e..235cf69ef449 100644 --- a/spark/v3.0/spark/src/test/java/org/apache/iceberg/TaskCheckHelper.java +++ b/spark/v3.0/spark/src/test/java/org/apache/iceberg/TaskCheckHelper.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg; import java.util.Comparator; @@ -25,15 +24,14 @@ import org.junit.Assert; public final class TaskCheckHelper { - private TaskCheckHelper() { - } + private TaskCheckHelper() {} public static void assertEquals(BaseCombinedScanTask expected, BaseCombinedScanTask actual) { List expectedTasks = getFileScanTasksInFilePathOrder(expected); List actualTasks = getFileScanTasksInFilePathOrder(actual); - Assert.assertEquals("The number of file scan tasks should match", - expectedTasks.size(), actualTasks.size()); + Assert.assertEquals( + "The number of file scan tasks should match", expectedTasks.size(), actualTasks.size()); for (int i = 0; i < expectedTasks.size(); i++) { FileScanTask expectedTask = expectedTasks.get(i); @@ -50,38 +48,56 @@ public static void assertEquals(FileScanTask expected, FileScanTask actual) { Assert.assertEquals("starting position doesn't match", expected.start(), actual.start()); - Assert.assertEquals("the number of bytes to scan doesn't match", expected.start(), actual.start()); + Assert.assertEquals( + "the number of bytes to scan doesn't match", expected.start(), actual.start()); // simplify comparison on residual expression via comparing toString - Assert.assertEquals("Residual expression doesn't match", - expected.residual().toString(), actual.residual().toString()); + Assert.assertEquals( + "Residual expression doesn't match", + expected.residual().toString(), + actual.residual().toString()); } public static void assertEquals(DataFile expected, DataFile actual) { - Assert.assertEquals("Should match the serialized record path", - expected.path(), actual.path()); - Assert.assertEquals("Should match the serialized record format", - expected.format(), actual.format()); - Assert.assertEquals("Should match the serialized record partition", - expected.partition().get(0, Object.class), actual.partition().get(0, Object.class)); - Assert.assertEquals("Should match the serialized record count", - expected.recordCount(), actual.recordCount()); - Assert.assertEquals("Should match the serialized record size", - expected.fileSizeInBytes(), actual.fileSizeInBytes()); - Assert.assertEquals("Should match the serialized record value counts", - expected.valueCounts(), actual.valueCounts()); - Assert.assertEquals("Should match the serialized record null value counts", - expected.nullValueCounts(), actual.nullValueCounts()); - Assert.assertEquals("Should match the serialized record lower bounds", 
- expected.lowerBounds(), actual.lowerBounds()); - Assert.assertEquals("Should match the serialized record upper bounds", - expected.upperBounds(), actual.upperBounds()); - Assert.assertEquals("Should match the serialized record key metadata", - expected.keyMetadata(), actual.keyMetadata()); - Assert.assertEquals("Should match the serialized record offsets", - expected.splitOffsets(), actual.splitOffsets()); - Assert.assertEquals("Should match the serialized record offsets", - expected.keyMetadata(), actual.keyMetadata()); + Assert.assertEquals("Should match the serialized record path", expected.path(), actual.path()); + Assert.assertEquals( + "Should match the serialized record format", expected.format(), actual.format()); + Assert.assertEquals( + "Should match the serialized record partition", + expected.partition().get(0, Object.class), + actual.partition().get(0, Object.class)); + Assert.assertEquals( + "Should match the serialized record count", expected.recordCount(), actual.recordCount()); + Assert.assertEquals( + "Should match the serialized record size", + expected.fileSizeInBytes(), + actual.fileSizeInBytes()); + Assert.assertEquals( + "Should match the serialized record value counts", + expected.valueCounts(), + actual.valueCounts()); + Assert.assertEquals( + "Should match the serialized record null value counts", + expected.nullValueCounts(), + actual.nullValueCounts()); + Assert.assertEquals( + "Should match the serialized record lower bounds", + expected.lowerBounds(), + actual.lowerBounds()); + Assert.assertEquals( + "Should match the serialized record upper bounds", + expected.upperBounds(), + actual.upperBounds()); + Assert.assertEquals( + "Should match the serialized record key metadata", + expected.keyMetadata(), + actual.keyMetadata()); + Assert.assertEquals( + "Should match the serialized record offsets", + expected.splitOffsets(), + actual.splitOffsets()); + Assert.assertEquals( + "Should match the serialized record offsets", expected.keyMetadata(), actual.keyMetadata()); } private static List getFileScanTasksInFilePathOrder(BaseCombinedScanTask task) { diff --git a/spark/v3.0/spark/src/test/java/org/apache/iceberg/TestDataFileSerialization.java b/spark/v3.0/spark/src/test/java/org/apache/iceberg/TestDataFileSerialization.java index 12fa8b2fc539..33b5316b72b7 100644 --- a/spark/v3.0/spark/src/test/java/org/apache/iceberg/TestDataFileSerialization.java +++ b/spark/v3.0/spark/src/test/java/org/apache/iceberg/TestDataFileSerialization.java @@ -16,9 +16,12 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg; +import static org.apache.iceberg.TaskCheckHelper.assertEquals; +import static org.apache.iceberg.types.Types.NestedField.optional; +import static org.apache.iceberg.types.Types.NestedField.required; + import com.esotericsoftware.kryo.Kryo; import com.esotericsoftware.kryo.io.Input; import com.esotericsoftware.kryo.io.Output; @@ -51,22 +54,17 @@ import org.junit.Test; import org.junit.rules.TemporaryFolder; -import static org.apache.iceberg.TaskCheckHelper.assertEquals; -import static org.apache.iceberg.types.Types.NestedField.optional; -import static org.apache.iceberg.types.Types.NestedField.required; - public class TestDataFileSerialization { - private static final Schema DATE_SCHEMA = new Schema( - required(1, "id", Types.LongType.get()), - optional(2, "data", Types.StringType.get()), - required(3, "date", Types.StringType.get()), - optional(4, "double", Types.DoubleType.get())); + private static final Schema DATE_SCHEMA = + new Schema( + required(1, "id", Types.LongType.get()), + optional(2, "data", Types.StringType.get()), + required(3, "date", Types.StringType.get()), + optional(4, "double", Types.DoubleType.get())); - private static final PartitionSpec PARTITION_SPEC = PartitionSpec - .builderFor(DATE_SCHEMA) - .identity("date") - .build(); + private static final PartitionSpec PARTITION_SPEC = + PartitionSpec.builderFor(DATE_SCHEMA).identity("date").build(); private static final Map VALUE_COUNTS = Maps.newHashMap(); private static final Map NULL_VALUE_COUNTS = Maps.newHashMap(); @@ -85,20 +83,26 @@ public class TestDataFileSerialization { UPPER_BOUNDS.put(1, longToBuffer(4L)); } - private static final DataFile DATA_FILE = DataFiles - .builder(PARTITION_SPEC) - .withPath("/path/to/data-1.parquet") - .withFileSizeInBytes(1234) - .withPartitionPath("date=2018-06-08") - .withMetrics(new Metrics( - 5L, null, VALUE_COUNTS, NULL_VALUE_COUNTS, NAN_VALUE_COUNTS, LOWER_BOUNDS, UPPER_BOUNDS)) - .withSplitOffsets(ImmutableList.of(4L)) - .withEncryptionKeyMetadata(ByteBuffer.allocate(4).putInt(34)) - .withSortOrder(SortOrder.unsorted()) - .build(); - - @Rule - public TemporaryFolder temp = new TemporaryFolder(); + private static final DataFile DATA_FILE = + DataFiles.builder(PARTITION_SPEC) + .withPath("/path/to/data-1.parquet") + .withFileSizeInBytes(1234) + .withPartitionPath("date=2018-06-08") + .withMetrics( + new Metrics( + 5L, + null, + VALUE_COUNTS, + NULL_VALUE_COUNTS, + NAN_VALUE_COUNTS, + LOWER_BOUNDS, + UPPER_BOUNDS)) + .withSplitOffsets(ImmutableList.of(4L)) + .withEncryptionKeyMetadata(ByteBuffer.allocate(4).putInt(34)) + .withSortOrder(SortOrder.unsorted()) + .build(); + + @Rule public TemporaryFolder temp = new TemporaryFolder(); @Test public void testDataFileKryoSerialization() throws Exception { @@ -128,7 +132,8 @@ public void testDataFileJavaSerialization() throws Exception { out.writeObject(DATA_FILE.copy()); } - try (ObjectInputStream in = new ObjectInputStream(new ByteArrayInputStream(bytes.toByteArray()))) { + try (ObjectInputStream in = + new ObjectInputStream(new ByteArrayInputStream(bytes.toByteArray()))) { for (int i = 0; i < 2; i += 1) { Object obj = in.readObject(); Assertions.assertThat(obj).as("Should be a DataFile").isInstanceOf(DataFile.class); @@ -140,13 +145,14 @@ public void testDataFileJavaSerialization() throws Exception { @Test public void testParquetWriterSplitOffsets() throws IOException { Iterable records = RandomData.generateSpark(DATE_SCHEMA, 1, 33L); - File parquetFile = new File( - temp.getRoot(), - 
FileFormat.PARQUET.addExtension(UUID.randomUUID().toString())); + File parquetFile = + new File(temp.getRoot(), FileFormat.PARQUET.addExtension(UUID.randomUUID().toString())); FileAppender writer = Parquet.write(Files.localOutput(parquetFile)) .schema(DATE_SCHEMA) - .createWriterFunc(msgType -> SparkParquetWriters.buildWriter(SparkSchemaUtil.convert(DATE_SCHEMA), msgType)) + .createWriterFunc( + msgType -> + SparkParquetWriters.buildWriter(SparkSchemaUtil.convert(DATE_SCHEMA), msgType)) .build(); try { writer.addAll(records); diff --git a/spark/v3.0/spark/src/test/java/org/apache/iceberg/TestFileIOSerialization.java b/spark/v3.0/spark/src/test/java/org/apache/iceberg/TestFileIOSerialization.java index 49a85cb68f17..c6f491ece5ad 100644 --- a/spark/v3.0/spark/src/test/java/org/apache/iceberg/TestFileIOSerialization.java +++ b/spark/v3.0/spark/src/test/java/org/apache/iceberg/TestFileIOSerialization.java @@ -16,9 +16,11 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg; +import static org.apache.iceberg.types.Types.NestedField.optional; +import static org.apache.iceberg.types.Types.NestedField.required; + import java.io.File; import java.io.IOException; import java.util.Map; @@ -36,36 +38,29 @@ import org.junit.Test; import org.junit.rules.TemporaryFolder; -import static org.apache.iceberg.types.Types.NestedField.optional; -import static org.apache.iceberg.types.Types.NestedField.required; - public class TestFileIOSerialization { private static final Configuration CONF = new Configuration(); private static final HadoopTables TABLES = new HadoopTables(CONF); - private static final Schema SCHEMA = new Schema( - required(1, "id", Types.LongType.get()), - optional(2, "data", Types.StringType.get()), - required(3, "date", Types.StringType.get()), - optional(4, "double", Types.DoubleType.get())); + private static final Schema SCHEMA = + new Schema( + required(1, "id", Types.LongType.get()), + optional(2, "data", Types.StringType.get()), + required(3, "date", Types.StringType.get()), + optional(4, "double", Types.DoubleType.get())); - private static final PartitionSpec SPEC = PartitionSpec - .builderFor(SCHEMA) - .identity("date") - .build(); + private static final PartitionSpec SPEC = + PartitionSpec.builderFor(SCHEMA).identity("date").build(); - private static final SortOrder SORT_ORDER = SortOrder.builderFor(SCHEMA) - .asc("id") - .build(); + private static final SortOrder SORT_ORDER = SortOrder.builderFor(SCHEMA).asc("id").build(); static { CONF.set("k1", "v1"); CONF.set("k2", "v2"); } - @Rule - public TemporaryFolder temp = new TemporaryFolder(); + @Rule public TemporaryFolder temp = new TemporaryFolder(); private Table table; @Before diff --git a/spark/v3.0/spark/src/test/java/org/apache/iceberg/TestManifestFileSerialization.java b/spark/v3.0/spark/src/test/java/org/apache/iceberg/TestManifestFileSerialization.java index 25004aa110e4..a20b2d9f05de 100644 --- a/spark/v3.0/spark/src/test/java/org/apache/iceberg/TestManifestFileSerialization.java +++ b/spark/v3.0/spark/src/test/java/org/apache/iceberg/TestManifestFileSerialization.java @@ -16,9 +16,11 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg; +import static org.apache.iceberg.types.Types.NestedField.optional; +import static org.apache.iceberg.types.Types.NestedField.required; + import com.esotericsoftware.kryo.Kryo; import com.esotericsoftware.kryo.io.Input; import com.esotericsoftware.kryo.io.Output; @@ -47,56 +49,57 @@ import org.junit.Test; import org.junit.rules.TemporaryFolder; -import static org.apache.iceberg.types.Types.NestedField.optional; -import static org.apache.iceberg.types.Types.NestedField.required; - public class TestManifestFileSerialization { - private static final Schema SCHEMA = new Schema( - required(1, "id", Types.LongType.get()), - optional(2, "data", Types.StringType.get()), - required(3, "date", Types.StringType.get()), - required(4, "double", Types.DoubleType.get())); - - private static final PartitionSpec SPEC = PartitionSpec - .builderFor(SCHEMA) - .identity("double") - .build(); - - private static final DataFile FILE_A = DataFiles.builder(SPEC) - .withPath("/path/to/data-1.parquet") - .withFileSizeInBytes(0) - .withPartition(TestHelpers.Row.of(1D)) - .withPartitionPath("double=1") - .withMetrics(new Metrics(5L, - null, // no column sizes - ImmutableMap.of(1, 5L, 2, 3L), // value count - ImmutableMap.of(1, 0L, 2, 2L), // null count - ImmutableMap.of(), // nan count - ImmutableMap.of(1, longToBuffer(0L)), // lower bounds - ImmutableMap.of(1, longToBuffer(4L)) // upper bounds - )) - .build(); - - private static final DataFile FILE_B = DataFiles.builder(SPEC) - .withPath("/path/to/data-2.parquet") - .withFileSizeInBytes(0) - .withPartition(TestHelpers.Row.of(Double.NaN)) - .withPartitionPath("double=NaN") - .withMetrics(new Metrics(1L, - null, // no column sizes - ImmutableMap.of(1, 1L, 4, 1L), // value count - ImmutableMap.of(1, 0L, 2, 0L), // null count - ImmutableMap.of(4, 1L), // nan count - ImmutableMap.of(1, longToBuffer(0L)), // lower bounds - ImmutableMap.of(1, longToBuffer(1L)) // upper bounds - )) - .build(); + private static final Schema SCHEMA = + new Schema( + required(1, "id", Types.LongType.get()), + optional(2, "data", Types.StringType.get()), + required(3, "date", Types.StringType.get()), + required(4, "double", Types.DoubleType.get())); + + private static final PartitionSpec SPEC = + PartitionSpec.builderFor(SCHEMA).identity("double").build(); + + private static final DataFile FILE_A = + DataFiles.builder(SPEC) + .withPath("/path/to/data-1.parquet") + .withFileSizeInBytes(0) + .withPartition(TestHelpers.Row.of(1D)) + .withPartitionPath("double=1") + .withMetrics( + new Metrics( + 5L, + null, // no column sizes + ImmutableMap.of(1, 5L, 2, 3L), // value count + ImmutableMap.of(1, 0L, 2, 2L), // null count + ImmutableMap.of(), // nan count + ImmutableMap.of(1, longToBuffer(0L)), // lower bounds + ImmutableMap.of(1, longToBuffer(4L)) // upper bounds + )) + .build(); + + private static final DataFile FILE_B = + DataFiles.builder(SPEC) + .withPath("/path/to/data-2.parquet") + .withFileSizeInBytes(0) + .withPartition(TestHelpers.Row.of(Double.NaN)) + .withPartitionPath("double=NaN") + .withMetrics( + new Metrics( + 1L, + null, // no column sizes + ImmutableMap.of(1, 1L, 4, 1L), // value count + ImmutableMap.of(1, 0L, 2, 0L), // null count + ImmutableMap.of(4, 1L), // nan count + ImmutableMap.of(1, longToBuffer(0L)), // lower bounds + ImmutableMap.of(1, longToBuffer(1L)) // upper bounds + )) + .build(); private static final FileIO FILE_IO = new HadoopFileIO(new Configuration()); - @Rule - public TemporaryFolder temp = new TemporaryFolder(); + @Rule public 
TemporaryFolder temp = new TemporaryFolder(); @Test public void testManifestFileKryoSerialization() throws IOException { @@ -134,7 +137,8 @@ public void testManifestFileJavaSerialization() throws Exception { out.writeObject(GenericManifestFile.copyOf(manifest).build()); } - try (ObjectInputStream in = new ObjectInputStream(new ByteArrayInputStream(bytes.toByteArray()))) { + try (ObjectInputStream in = + new ObjectInputStream(new ByteArrayInputStream(bytes.toByteArray()))) { for (int i = 0; i < 3; i += 1) { Object obj = in.readObject(); Assertions.assertThat(obj).as("Should be a ManifestFile").isInstanceOf(ManifestFile.class); @@ -148,27 +152,46 @@ private void checkManifestFile(ManifestFile expected, ManifestFile actual) { Assert.assertEquals("Length must match", expected.length(), actual.length()); Assert.assertEquals("Spec id must match", expected.partitionSpecId(), actual.partitionSpecId()); Assert.assertEquals("Snapshot id must match", expected.snapshotId(), actual.snapshotId()); - Assert.assertEquals("Added files flag must match", expected.hasAddedFiles(), actual.hasAddedFiles()); - Assert.assertEquals("Added files count must match", expected.addedFilesCount(), actual.addedFilesCount()); - Assert.assertEquals("Added rows count must match", expected.addedRowsCount(), actual.addedRowsCount()); - Assert.assertEquals("Existing files flag must match", expected.hasExistingFiles(), actual.hasExistingFiles()); - Assert.assertEquals("Existing files count must match", expected.existingFilesCount(), actual.existingFilesCount()); - Assert.assertEquals("Existing rows count must match", expected.existingRowsCount(), actual.existingRowsCount()); - Assert.assertEquals("Deleted files flag must match", expected.hasDeletedFiles(), actual.hasDeletedFiles()); - Assert.assertEquals("Deleted files count must match", expected.deletedFilesCount(), actual.deletedFilesCount()); - Assert.assertEquals("Deleted rows count must match", expected.deletedRowsCount(), actual.deletedRowsCount()); + Assert.assertEquals( + "Added files flag must match", expected.hasAddedFiles(), actual.hasAddedFiles()); + Assert.assertEquals( + "Added files count must match", expected.addedFilesCount(), actual.addedFilesCount()); + Assert.assertEquals( + "Added rows count must match", expected.addedRowsCount(), actual.addedRowsCount()); + Assert.assertEquals( + "Existing files flag must match", expected.hasExistingFiles(), actual.hasExistingFiles()); + Assert.assertEquals( + "Existing files count must match", + expected.existingFilesCount(), + actual.existingFilesCount()); + Assert.assertEquals( + "Existing rows count must match", expected.existingRowsCount(), actual.existingRowsCount()); + Assert.assertEquals( + "Deleted files flag must match", expected.hasDeletedFiles(), actual.hasDeletedFiles()); + Assert.assertEquals( + "Deleted files count must match", expected.deletedFilesCount(), actual.deletedFilesCount()); + Assert.assertEquals( + "Deleted rows count must match", expected.deletedRowsCount(), actual.deletedRowsCount()); PartitionFieldSummary expectedPartition = expected.partitions().get(0); PartitionFieldSummary actualPartition = actual.partitions().get(0); - Assert.assertEquals("Null flag in partition must match", - expectedPartition.containsNull(), actualPartition.containsNull()); - Assert.assertEquals("NaN flag in partition must match", - expectedPartition.containsNaN(), actualPartition.containsNaN()); - Assert.assertEquals("Lower bounds in partition must match", - expectedPartition.lowerBound(), actualPartition.lowerBound()); 
- Assert.assertEquals("Upper bounds in partition must match", - expectedPartition.upperBound(), actualPartition.upperBound()); + Assert.assertEquals( + "Null flag in partition must match", + expectedPartition.containsNull(), + actualPartition.containsNull()); + Assert.assertEquals( + "NaN flag in partition must match", + expectedPartition.containsNaN(), + actualPartition.containsNaN()); + Assert.assertEquals( + "Lower bounds in partition must match", + expectedPartition.lowerBound(), + actualPartition.lowerBound()); + Assert.assertEquals( + "Upper bounds in partition must match", + expectedPartition.upperBound(), + actualPartition.upperBound()); } private ManifestFile writeManifest(DataFile... files) throws IOException { diff --git a/spark/v3.0/spark/src/test/java/org/apache/iceberg/TestScanTaskSerialization.java b/spark/v3.0/spark/src/test/java/org/apache/iceberg/TestScanTaskSerialization.java index e234ee2617aa..4dd34f7a7611 100644 --- a/spark/v3.0/spark/src/test/java/org/apache/iceberg/TestScanTaskSerialization.java +++ b/spark/v3.0/spark/src/test/java/org/apache/iceberg/TestScanTaskSerialization.java @@ -16,9 +16,10 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg; +import static org.apache.iceberg.types.Types.NestedField.optional; + import com.esotericsoftware.kryo.Kryo; import com.esotericsoftware.kryo.io.Input; import com.esotericsoftware.kryo.io.Output; @@ -50,19 +51,16 @@ import org.junit.Test; import org.junit.rules.TemporaryFolder; -import static org.apache.iceberg.types.Types.NestedField.optional; - public class TestScanTaskSerialization extends SparkTestBase { private static final HadoopTables TABLES = new HadoopTables(new Configuration()); - private static final Schema SCHEMA = new Schema( - optional(1, "c1", Types.IntegerType.get()), - optional(2, "c2", Types.StringType.get()), - optional(3, "c3", Types.StringType.get()) - ); + private static final Schema SCHEMA = + new Schema( + optional(1, "c1", Types.IntegerType.get()), + optional(2, "c2", Types.StringType.get()), + optional(3, "c3", Types.StringType.get())); - @Rule - public TemporaryFolder temp = new TemporaryFolder(); + @Rule public TemporaryFolder temp = new TemporaryFolder(); private String tableLocation = null; @@ -86,7 +84,9 @@ public void testBaseCombinedScanTaskKryoSerialization() throws Exception { try (Input in = new Input(new FileInputStream(data))) { Object obj = kryo.readClassAndObject(in); - Assertions.assertThat(obj).as("Should be a BaseCombinedScanTask").isInstanceOf(BaseCombinedScanTask.class); + Assertions.assertThat(obj) + .as("Should be a BaseCombinedScanTask") + .isInstanceOf(BaseCombinedScanTask.class); TaskCheckHelper.assertEquals(scanTask, (BaseCombinedScanTask) obj); } } @@ -100,9 +100,12 @@ public void testBaseCombinedScanTaskJavaSerialization() throws Exception { out.writeObject(scanTask); } - try (ObjectInputStream in = new ObjectInputStream(new ByteArrayInputStream(bytes.toByteArray()))) { + try (ObjectInputStream in = + new ObjectInputStream(new ByteArrayInputStream(bytes.toByteArray()))) { Object obj = in.readObject(); - Assertions.assertThat(obj).as("Should be a BaseCombinedScanTask").isInstanceOf(BaseCombinedScanTask.class); + Assertions.assertThat(obj) + .as("Should be a BaseCombinedScanTask") + .isInstanceOf(BaseCombinedScanTask.class); TaskCheckHelper.assertEquals(scanTask, (BaseCombinedScanTask) obj); } } @@ -112,16 +115,15 @@ private BaseCombinedScanTask prepareBaseCombinedScanTaskForSerDeTest() { Map options = 
Maps.newHashMap(); Table table = TABLES.create(SCHEMA, spec, options, tableLocation); - List records1 = Lists.newArrayList( - new ThreeColumnRecord(1, null, "AAAA"), - new ThreeColumnRecord(1, "BBBBBBBBBB", "BBBB") - ); + List records1 = + Lists.newArrayList( + new ThreeColumnRecord(1, null, "AAAA"), new ThreeColumnRecord(1, "BBBBBBBBBB", "BBBB")); writeRecords(records1); - List records2 = Lists.newArrayList( - new ThreeColumnRecord(2, "CCCCCCCCCC", "CCCC"), - new ThreeColumnRecord(2, "DDDDDDDDDD", "DDDD") - ); + List records2 = + Lists.newArrayList( + new ThreeColumnRecord(2, "CCCCCCCCCC", "CCCC"), + new ThreeColumnRecord(2, "DDDDDDDDDD", "DDDD")); writeRecords(records2); table.refresh(); @@ -136,10 +138,6 @@ private void writeRecords(List records) { } private void writeDF(Dataset df) { - df.select("c1", "c2", "c3") - .write() - .format("iceberg") - .mode("append") - .save(tableLocation); + df.select("c1", "c2", "c3").write().format("iceberg").mode("append").save(tableLocation); } } diff --git a/spark/v3.0/spark/src/test/java/org/apache/iceberg/TestTableSerialization.java b/spark/v3.0/spark/src/test/java/org/apache/iceberg/TestTableSerialization.java index 8aa89b9f3199..30a167d575b1 100644 --- a/spark/v3.0/spark/src/test/java/org/apache/iceberg/TestTableSerialization.java +++ b/spark/v3.0/spark/src/test/java/org/apache/iceberg/TestTableSerialization.java @@ -16,9 +16,11 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg; +import static org.apache.iceberg.types.Types.NestedField.optional; +import static org.apache.iceberg.types.Types.NestedField.required; + import java.io.File; import java.io.IOException; import java.util.Map; @@ -32,30 +34,23 @@ import org.junit.Test; import org.junit.rules.TemporaryFolder; -import static org.apache.iceberg.types.Types.NestedField.optional; -import static org.apache.iceberg.types.Types.NestedField.required; - public class TestTableSerialization { private static final HadoopTables TABLES = new HadoopTables(); - private static final Schema SCHEMA = new Schema( - required(1, "id", Types.LongType.get()), - optional(2, "data", Types.StringType.get()), - required(3, "date", Types.StringType.get()), - optional(4, "double", Types.DoubleType.get())); + private static final Schema SCHEMA = + new Schema( + required(1, "id", Types.LongType.get()), + optional(2, "data", Types.StringType.get()), + required(3, "date", Types.StringType.get()), + optional(4, "double", Types.DoubleType.get())); - private static final PartitionSpec SPEC = PartitionSpec - .builderFor(SCHEMA) - .identity("date") - .build(); + private static final PartitionSpec SPEC = + PartitionSpec.builderFor(SCHEMA).identity("date").build(); - private static final SortOrder SORT_ORDER = SortOrder.builderFor(SCHEMA) - .asc("id") - .build(); + private static final SortOrder SORT_ORDER = SortOrder.builderFor(SCHEMA).asc("id").build(); - @Rule - public TemporaryFolder temp = new TemporaryFolder(); + @Rule public TemporaryFolder temp = new TemporaryFolder(); private Table table; @Before @@ -71,19 +66,20 @@ public void initTable() throws IOException { @Test public void testSerializableTableKryoSerialization() throws IOException { Table serializableTable = SerializableTableWithSize.copyOf(table); - TestHelpers.assertSerializedAndLoadedMetadata(table, KryoHelpers.roundTripSerialize(serializableTable)); + TestHelpers.assertSerializedAndLoadedMetadata( + table, KryoHelpers.roundTripSerialize(serializableTable)); } @Test public void 
testSerializableMetadataTableKryoSerialization() throws IOException { for (MetadataTableType type : MetadataTableType.values()) { TableOperations ops = ((HasTableOperations) table).operations(); - Table metadataTable = MetadataTableUtils.createMetadataTableInstance(ops, table.name(), "meta", type); + Table metadataTable = + MetadataTableUtils.createMetadataTableInstance(ops, table.name(), "meta", type); Table serializableMetadataTable = SerializableTableWithSize.copyOf(metadataTable); TestHelpers.assertSerializedAndLoadedMetadata( - metadataTable, - KryoHelpers.roundTripSerialize(serializableMetadataTable)); + metadataTable, KryoHelpers.roundTripSerialize(serializableMetadataTable)); } } @@ -91,13 +87,12 @@ public void testSerializableMetadataTableKryoSerialization() throws IOException public void testSerializableTransactionTableKryoSerialization() throws IOException { Transaction txn = table.newTransaction(); - txn.updateProperties() - .set("k1", "v1") - .commit(); + txn.updateProperties().set("k1", "v1").commit(); Table txnTable = txn.table(); Table serializableTxnTable = SerializableTableWithSize.copyOf(txnTable); - TestHelpers.assertSerializedMetadata(txnTable, KryoHelpers.roundTripSerialize(serializableTxnTable)); + TestHelpers.assertSerializedMetadata( + txnTable, KryoHelpers.roundTripSerialize(serializableTxnTable)); } } diff --git a/spark/v3.0/spark/src/test/java/org/apache/iceberg/spark/SparkCatalogConfig.java b/spark/v3.0/spark/src/test/java/org/apache/iceberg/spark/SparkCatalogConfig.java index 5d5dfebf9532..1006ed380ff9 100644 --- a/spark/v3.0/spark/src/test/java/org/apache/iceberg/spark/SparkCatalogConfig.java +++ b/spark/v3.0/spark/src/test/java/org/apache/iceberg/spark/SparkCatalogConfig.java @@ -16,26 +16,29 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark; import java.util.Map; import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap; public enum SparkCatalogConfig { - HIVE("testhive", SparkCatalog.class.getName(), ImmutableMap.of( - "type", "hive", - "default-namespace", "default" - )), - HADOOP("testhadoop", SparkCatalog.class.getName(), ImmutableMap.of( - "type", "hadoop" - )), - SPARK("spark_catalog", SparkSessionCatalog.class.getName(), ImmutableMap.of( - "type", "hive", - "default-namespace", "default", - "parquet-enabled", "true", - "cache-enabled", "false" // Spark will delete tables using v1, leaving the cache out of sync - )); + HIVE( + "testhive", + SparkCatalog.class.getName(), + ImmutableMap.of( + "type", "hive", + "default-namespace", "default")), + HADOOP("testhadoop", SparkCatalog.class.getName(), ImmutableMap.of("type", "hadoop")), + SPARK( + "spark_catalog", + SparkSessionCatalog.class.getName(), + ImmutableMap.of( + "type", "hive", + "default-namespace", "default", + "parquet-enabled", "true", + "cache-enabled", + "false" // Spark will delete tables using v1, leaving the cache out of sync + )); private final String catalogName; private final String implementation; diff --git a/spark/v3.0/spark/src/test/java/org/apache/iceberg/spark/SparkCatalogTestBase.java b/spark/v3.0/spark/src/test/java/org/apache/iceberg/spark/SparkCatalogTestBase.java index 774e81328b2b..89323c26100c 100644 --- a/spark/v3.0/spark/src/test/java/org/apache/iceberg/spark/SparkCatalogTestBase.java +++ b/spark/v3.0/spark/src/test/java/org/apache/iceberg/spark/SparkCatalogTestBase.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.spark; import java.util.Map; @@ -31,29 +30,33 @@ public abstract class SparkCatalogTestBase extends SparkTestBaseWithCatalog { // these parameters are broken out to avoid changes that need to modify lots of test suites @Parameterized.Parameters(name = "catalogName = {0}, implementation = {1}, config = {2}") public static Object[][] parameters() { - return new Object[][] {{ - SparkCatalogConfig.HIVE.catalogName(), - SparkCatalogConfig.HIVE.implementation(), - SparkCatalogConfig.HIVE.properties() - }, { - SparkCatalogConfig.HADOOP.catalogName(), - SparkCatalogConfig.HADOOP.implementation(), - SparkCatalogConfig.HADOOP.properties() - }, { - SparkCatalogConfig.SPARK.catalogName(), - SparkCatalogConfig.SPARK.implementation(), - SparkCatalogConfig.SPARK.properties() - }}; + return new Object[][] { + { + SparkCatalogConfig.HIVE.catalogName(), + SparkCatalogConfig.HIVE.implementation(), + SparkCatalogConfig.HIVE.properties() + }, + { + SparkCatalogConfig.HADOOP.catalogName(), + SparkCatalogConfig.HADOOP.implementation(), + SparkCatalogConfig.HADOOP.properties() + }, + { + SparkCatalogConfig.SPARK.catalogName(), + SparkCatalogConfig.SPARK.implementation(), + SparkCatalogConfig.SPARK.properties() + } + }; } - @Rule - public TemporaryFolder temp = new TemporaryFolder(); + @Rule public TemporaryFolder temp = new TemporaryFolder(); public SparkCatalogTestBase(SparkCatalogConfig config) { super(config); } - public SparkCatalogTestBase(String catalogName, String implementation, Map config) { + public SparkCatalogTestBase( + String catalogName, String implementation, Map config) { super(catalogName, implementation, config); } } diff --git a/spark/v3.0/spark/src/test/java/org/apache/iceberg/spark/SparkTestBase.java b/spark/v3.0/spark/src/test/java/org/apache/iceberg/spark/SparkTestBase.java index 51fd6017031c..9db0d6d410ee 100644 --- a/spark/v3.0/spark/src/test/java/org/apache/iceberg/spark/SparkTestBase.java +++ b/spark/v3.0/spark/src/test/java/org/apache/iceberg/spark/SparkTestBase.java @@ -16,9 +16,10 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark; +import static org.apache.hadoop.hive.conf.HiveConf.ConfVars.METASTOREURIS; + import java.util.List; import java.util.Map; import java.util.Set; @@ -46,8 +47,6 @@ import org.junit.BeforeClass; import scala.collection.JavaConverters; -import static org.apache.hadoop.hive.conf.HiveConf.ConfVars.METASTOREURIS; - public abstract class SparkTestBase { protected static final Object ANY = new Object(); @@ -63,15 +62,18 @@ public static void startMetastoreAndSpark() { metastore.start(); SparkTestBase.hiveConf = metastore.hiveConf(); - SparkTestBase.spark = SparkSession.builder() - .master("local[2]") - .config(SQLConf.PARTITION_OVERWRITE_MODE().key(), "dynamic") - .config("spark.hadoop." + METASTOREURIS.varname, hiveConf.get(METASTOREURIS.varname)) - .enableHiveSupport() - .getOrCreate(); + SparkTestBase.spark = + SparkSession.builder() + .master("local[2]") + .config(SQLConf.PARTITION_OVERWRITE_MODE().key(), "dynamic") + .config("spark.hadoop." 
+ METASTOREURIS.varname, hiveConf.get(METASTOREURIS.varname)) + .enableHiveSupport() + .getOrCreate(); - SparkTestBase.catalog = (HiveCatalog) - CatalogUtil.loadCatalog(HiveCatalog.class.getName(), "hive", ImmutableMap.of(), hiveConf); + SparkTestBase.catalog = + (HiveCatalog) + CatalogUtil.loadCatalog( + HiveCatalog.class.getName(), "hive", ImmutableMap.of(), hiveConf); try { catalog.createNamespace(Namespace.of("default")); @@ -112,22 +114,23 @@ protected List rowsToJava(List rows) { private Object[] toJava(Row row) { return IntStream.range(0, row.size()) - .mapToObj(pos -> { - if (row.isNullAt(pos)) { - return null; - } + .mapToObj( + pos -> { + if (row.isNullAt(pos)) { + return null; + } - Object value = row.get(pos); - if (value instanceof Row) { - return toJava((Row) value); - } else if (value instanceof scala.collection.Seq) { - return row.getList(pos); - } else if (value instanceof scala.collection.Map) { - return row.getJavaMap(pos); - } else { - return value; - } - }) + Object value = row.get(pos); + if (value instanceof Row) { + return toJava((Row) value); + } else if (value instanceof scala.collection.Seq) { + return row.getList(pos); + } else if (value instanceof scala.collection.Map) { + return row.getJavaMap(pos); + } else { + return value; + } + }) .toArray(Object[]::new); } @@ -143,8 +146,10 @@ protected Object[] row(Object... values) { return values; } - protected void assertEquals(String context, List expectedRows, List actualRows) { - Assert.assertEquals(context + ": number of results should match", expectedRows.size(), actualRows.size()); + protected void assertEquals( + String context, List expectedRows, List actualRows) { + Assert.assertEquals( + context + ": number of results should match", expectedRows.size(), actualRows.size()); for (int row = 0; row < expectedRows.size(); row += 1) { Object[] expected = expectedRows.get(row); Object[] actual = actualRows.get(row); @@ -178,59 +183,70 @@ protected void withSQLConf(Map conf, Action action) { SQLConf sqlConf = SQLConf.get(); Map currentConfValues = Maps.newHashMap(); - conf.keySet().forEach(confKey -> { - if (sqlConf.contains(confKey)) { - String currentConfValue = sqlConf.getConfString(confKey); - currentConfValues.put(confKey, currentConfValue); - } - }); + conf.keySet() + .forEach( + confKey -> { + if (sqlConf.contains(confKey)) { + String currentConfValue = sqlConf.getConfString(confKey); + currentConfValues.put(confKey, currentConfValue); + } + }); - conf.forEach((confKey, confValue) -> { - if (SQLConf.staticConfKeys().contains(confKey)) { - throw new RuntimeException("Cannot modify the value of a static config: " + confKey); - } - sqlConf.setConfString(confKey, confValue); - }); + conf.forEach( + (confKey, confValue) -> { + if (SQLConf.staticConfKeys().contains(confKey)) { + throw new RuntimeException("Cannot modify the value of a static config: " + confKey); + } + sqlConf.setConfString(confKey, confValue); + }); try { action.invoke(); } finally { - conf.forEach((confKey, confValue) -> { - if (currentConfValues.containsKey(confKey)) { - sqlConf.setConfString(confKey, currentConfValues.get(confKey)); - } else { - sqlConf.unsetConf(confKey); - } - }); + conf.forEach( + (confKey, confValue) -> { + if (currentConfValues.containsKey(confKey)) { + sqlConf.setConfString(confKey, currentConfValues.get(confKey)); + } else { + sqlConf.unsetConf(confKey); + } + }); } } private Map currentExecutionUIDataMap() throws TimeoutException { spark.sparkContext().listenerBus().waitUntilEmpty(10000); - return 
JavaConverters.seqAsJavaList(spark.sharedState().statusStore().executionsList()) - .stream().collect(Collectors.toMap(data -> data.executionId(), data -> data)); + return JavaConverters.seqAsJavaList(spark.sharedState().statusStore().executionsList()).stream() + .collect(Collectors.toMap(data -> data.executionId(), data -> data)); } - protected void checkMetrics(Callable sparkCallable, Map expectedMetrics) throws Exception { + protected void checkMetrics(Callable sparkCallable, Map expectedMetrics) + throws Exception { Set originalExecutionIds = currentExecutionUIDataMap().keySet(); sparkCallable.call(); Map currentExecutions = currentExecutionUIDataMap(); Set currentExecutionIds = currentExecutions.keySet(); currentExecutionIds.removeAll(originalExecutionIds); Assert.assertEquals(currentExecutionIds.size(), 1); - SQLExecutionUIData currentExecution = currentExecutions.get(currentExecutionIds.stream().findFirst().get()); + SQLExecutionUIData currentExecution = + currentExecutions.get(currentExecutionIds.stream().findFirst().get()); Map metricsIds = Maps.newHashMap(); - JavaConverters.seqAsJavaList(currentExecution.metrics()).stream().forEach(metricsDeclaration -> { - if (expectedMetrics.containsKey(metricsDeclaration.name())) { - metricsIds.put(metricsDeclaration.accumulatorId(), metricsDeclaration.name()); - } - }); - Assert.assertEquals("Expected metric name not match", - expectedMetrics.keySet(), Sets.newHashSet(metricsIds.values())); + JavaConverters.seqAsJavaList(currentExecution.metrics()).stream() + .forEach( + metricsDeclaration -> { + if (expectedMetrics.containsKey(metricsDeclaration.name())) { + metricsIds.put(metricsDeclaration.accumulatorId(), metricsDeclaration.name()); + } + }); + Assert.assertEquals( + "Expected metric name not match", + expectedMetrics.keySet(), + Sets.newHashSet(metricsIds.values())); Map currentMetrics = - JavaConverters.mapAsJavaMap(spark.sharedState().statusStore().executionMetrics(currentExecution.executionId())) + JavaConverters.mapAsJavaMap( + spark.sharedState().statusStore().executionMetrics(currentExecution.executionId())) .entrySet().stream() .filter(x -> metricsIds.containsKey(x.getKey())) .collect(Collectors.toMap(x -> metricsIds.get(x.getKey()), x -> x.getValue())); diff --git a/spark/v3.0/spark/src/test/java/org/apache/iceberg/spark/SparkTestBaseWithCatalog.java b/spark/v3.0/spark/src/test/java/org/apache/iceberg/spark/SparkTestBaseWithCatalog.java index 00dcd95ec709..857c61e068b0 100644 --- a/spark/v3.0/spark/src/test/java/org/apache/iceberg/spark/SparkTestBaseWithCatalog.java +++ b/spark/v3.0/spark/src/test/java/org/apache/iceberg/spark/SparkTestBaseWithCatalog.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark; import java.io.File; @@ -49,8 +48,7 @@ public static void dropWarehouse() { } } - @Rule - public TemporaryFolder temp = new TemporaryFolder(); + @Rule public TemporaryFolder temp = new TemporaryFolder(); protected final String catalogName; protected final Catalog validationCatalog; @@ -66,21 +64,25 @@ public SparkTestBaseWithCatalog(SparkCatalogConfig config) { this(config.catalogName(), config.implementation(), config.properties()); } - public SparkTestBaseWithCatalog(String catalogName, String implementation, Map config) { + public SparkTestBaseWithCatalog( + String catalogName, String implementation, Map config) { this.catalogName = catalogName; - this.validationCatalog = catalogName.equals("testhadoop") ? 
- new HadoopCatalog(spark.sessionState().newHadoopConf(), "file:" + warehouse) : - catalog; + this.validationCatalog = + catalogName.equals("testhadoop") + ? new HadoopCatalog(spark.sessionState().newHadoopConf(), "file:" + warehouse) + : catalog; this.validationNamespaceCatalog = (SupportsNamespaces) validationCatalog; spark.conf().set("spark.sql.catalog." + catalogName, implementation); - config.forEach((key, value) -> spark.conf().set("spark.sql.catalog." + catalogName + "." + key, value)); + config.forEach( + (key, value) -> spark.conf().set("spark.sql.catalog." + catalogName + "." + key, value)); if (config.get("type").equalsIgnoreCase("hadoop")) { spark.conf().set("spark.sql.catalog." + catalogName + ".warehouse", "file:" + warehouse); } - this.tableName = (catalogName.equals("spark_catalog") ? "" : catalogName + ".") + "default.table"; + this.tableName = + (catalogName.equals("spark_catalog") ? "" : catalogName + ".") + "default.table"; sql("CREATE NAMESPACE IF NOT EXISTS default"); } diff --git a/spark/v3.0/spark/src/test/java/org/apache/iceberg/spark/TestFileRewriteCoordinator.java b/spark/v3.0/spark/src/test/java/org/apache/iceberg/spark/TestFileRewriteCoordinator.java index 8aa5cd6faec1..2e6886d32df5 100644 --- a/spark/v3.0/spark/src/test/java/org/apache/iceberg/spark/TestFileRewriteCoordinator.java +++ b/spark/v3.0/spark/src/test/java/org/apache/iceberg/spark/TestFileRewriteCoordinator.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark; import java.io.IOException; @@ -44,7 +43,8 @@ public class TestFileRewriteCoordinator extends SparkCatalogTestBase { - public TestFileRewriteCoordinator(String catalogName, String implementation, Map config) { + public TestFileRewriteCoordinator( + String catalogName, String implementation, Map config) { super(catalogName, implementation, config); } @@ -66,7 +66,8 @@ public void testBinPackRewrite() throws NoSuchTableException, IOException { Table table = validationCatalog.loadTable(tableIdent); Assert.assertEquals("Should produce 4 snapshots", 4, Iterables.size(table.snapshots())); - Dataset fileDF = spark.read().format("iceberg").load(tableName(tableIdent.name() + ".files")); + Dataset fileDF = + spark.read().format("iceberg").load(tableName(tableIdent.name() + ".files")); List fileSizes = fileDF.select("file_size_in_bytes").as(Encoders.LONG()).collectAsList(); long avgFileSize = fileSizes.stream().mapToLong(i -> i).sum() / fileSizes.size(); @@ -77,22 +78,27 @@ public void testBinPackRewrite() throws NoSuchTableException, IOException { taskSetManager.stageTasks(table, fileSetID, Lists.newArrayList(fileScanTasks)); // read and pack original 4 files into 2 splits - Dataset scanDF = spark.read().format("iceberg") - .option(SparkReadOptions.FILE_SCAN_TASK_SET_ID, fileSetID) - .option(SparkReadOptions.SPLIT_SIZE, Long.toString(avgFileSize * 2)) - .option(SparkReadOptions.FILE_OPEN_COST, "0") - .load(tableName); + Dataset scanDF = + spark + .read() + .format("iceberg") + .option(SparkReadOptions.FILE_SCAN_TASK_SET_ID, fileSetID) + .option(SparkReadOptions.SPLIT_SIZE, Long.toString(avgFileSize * 2)) + .option(SparkReadOptions.FILE_OPEN_COST, "0") + .load(tableName); // write the packed data into new files where each split becomes a new file - scanDF.writeTo(tableName) + scanDF + .writeTo(tableName) .option(SparkWriteOptions.REWRITTEN_FILE_SCAN_TASK_SET_ID, fileSetID) .append(); // commit the rewrite FileRewriteCoordinator rewriteCoordinator = 
FileRewriteCoordinator.get(); - Set rewrittenFiles = taskSetManager.fetchTasks(table, fileSetID).stream() - .map(FileScanTask::file) - .collect(Collectors.toSet()); + Set rewrittenFiles = + taskSetManager.fetchTasks(table, fileSetID).stream() + .map(FileScanTask::file) + .collect(Collectors.toSet()); Set addedFiles = rewriteCoordinator.fetchNewDataFiles(table, fileSetID); table.newRewrite().rewriteFiles(rewrittenFiles, addedFiles).commit(); } @@ -127,34 +133,42 @@ public void testSortRewrite() throws NoSuchTableException, IOException { taskSetManager.stageTasks(table, fileSetID, Lists.newArrayList(fileScanTasks)); // read original 4 files as 4 splits - Dataset scanDF = spark.read().format("iceberg") - .option(SparkReadOptions.FILE_SCAN_TASK_SET_ID, fileSetID) - .option(SparkReadOptions.SPLIT_SIZE, "134217728") - .option(SparkReadOptions.FILE_OPEN_COST, "134217728") - .load(tableName); + Dataset scanDF = + spark + .read() + .format("iceberg") + .option(SparkReadOptions.FILE_SCAN_TASK_SET_ID, fileSetID) + .option(SparkReadOptions.SPLIT_SIZE, "134217728") + .option(SparkReadOptions.FILE_OPEN_COST, "134217728") + .load(tableName); // make sure we disable AQE and set the number of shuffle partitions as the target num files - ImmutableMap sqlConf = ImmutableMap.of( - "spark.sql.shuffle.partitions", "2", - "spark.sql.adaptive.enabled", "false" - ); - - withSQLConf(sqlConf, () -> { - try { - // write new files with sorted records - scanDF.sort("id").writeTo(tableName) - .option(SparkWriteOptions.REWRITTEN_FILE_SCAN_TASK_SET_ID, fileSetID) - .append(); - } catch (NoSuchTableException e) { - throw new RuntimeException("Could not replace files", e); - } - }); + ImmutableMap sqlConf = + ImmutableMap.of( + "spark.sql.shuffle.partitions", "2", + "spark.sql.adaptive.enabled", "false"); + + withSQLConf( + sqlConf, + () -> { + try { + // write new files with sorted records + scanDF + .sort("id") + .writeTo(tableName) + .option(SparkWriteOptions.REWRITTEN_FILE_SCAN_TASK_SET_ID, fileSetID) + .append(); + } catch (NoSuchTableException e) { + throw new RuntimeException("Could not replace files", e); + } + }); // commit the rewrite FileRewriteCoordinator rewriteCoordinator = FileRewriteCoordinator.get(); - Set rewrittenFiles = taskSetManager.fetchTasks(table, fileSetID).stream() - .map(FileScanTask::file) - .collect(Collectors.toSet()); + Set rewrittenFiles = + taskSetManager.fetchTasks(table, fileSetID).stream() + .map(FileScanTask::file) + .collect(Collectors.toSet()); Set addedFiles = rewriteCoordinator.fetchNewDataFiles(table, fileSetID); table.newRewrite().rewriteFiles(rewrittenFiles, addedFiles).commit(); } @@ -199,7 +213,8 @@ public void testCommitMultipleRewrites() throws NoSuchTableException, IOExceptio String secondFileSetID = UUID.randomUUID().toString(); - try (CloseableIterable tasks = table.newScan().appendsAfter(firstFileSetSnapshotId).planFiles()) { + try (CloseableIterable tasks = + table.newScan().appendsAfter(firstFileSetSnapshotId).planFiles()) { // stage 2 more files for compaction taskSetManager.stageTasks(table, secondFileSetID, Lists.newArrayList(tasks)); } @@ -208,26 +223,32 @@ public void testCommitMultipleRewrites() throws NoSuchTableException, IOExceptio for (String fileSetID : fileSetIDs) { // read and pack 2 files into 1 split - Dataset scanDF = spark.read().format("iceberg") - .option(SparkReadOptions.FILE_SCAN_TASK_SET_ID, fileSetID) - .option(SparkReadOptions.SPLIT_SIZE, Long.MAX_VALUE) - .load(tableName); + Dataset scanDF = + spark + .read() + .format("iceberg") + 
.option(SparkReadOptions.FILE_SCAN_TASK_SET_ID, fileSetID) + .option(SparkReadOptions.SPLIT_SIZE, Long.MAX_VALUE) + .load(tableName); // write the combined data as one file - scanDF.writeTo(tableName) + scanDF + .writeTo(tableName) .option(SparkWriteOptions.REWRITTEN_FILE_SCAN_TASK_SET_ID, fileSetID) .append(); } // commit both rewrites at the same time FileRewriteCoordinator rewriteCoordinator = FileRewriteCoordinator.get(); - Set rewrittenFiles = fileSetIDs.stream().flatMap(fileSetID -> - taskSetManager.fetchTasks(table, fileSetID).stream()) - .map(FileScanTask::file) - .collect(Collectors.toSet()); - Set addedFiles = fileSetIDs.stream() - .flatMap(fileSetID -> rewriteCoordinator.fetchNewDataFiles(table, fileSetID).stream()) - .collect(Collectors.toSet()); + Set rewrittenFiles = + fileSetIDs.stream() + .flatMap(fileSetID -> taskSetManager.fetchTasks(table, fileSetID).stream()) + .map(FileScanTask::file) + .collect(Collectors.toSet()); + Set addedFiles = + fileSetIDs.stream() + .flatMap(fileSetID -> rewriteCoordinator.fetchNewDataFiles(table, fileSetID).stream()) + .collect(Collectors.toSet()); table.newRewrite().rewriteFiles(rewrittenFiles, addedFiles).commit(); table.refresh(); diff --git a/spark/v3.0/spark/src/test/java/org/apache/iceberg/spark/TestSpark3Util.java b/spark/v3.0/spark/src/test/java/org/apache/iceberg/spark/TestSpark3Util.java index 8d57fe112033..e3191c67e263 100644 --- a/spark/v3.0/spark/src/test/java/org/apache/iceberg/spark/TestSpark3Util.java +++ b/spark/v3.0/spark/src/test/java/org/apache/iceberg/spark/TestSpark3Util.java @@ -16,9 +16,13 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark; +import static org.apache.iceberg.NullOrder.NULLS_FIRST; +import static org.apache.iceberg.NullOrder.NULLS_LAST; +import static org.apache.iceberg.types.Types.NestedField.optional; +import static org.apache.iceberg.types.Types.NestedField.required; + import org.apache.iceberg.Schema; import org.apache.iceberg.SortOrder; import org.apache.iceberg.SortOrderParser; @@ -27,54 +31,69 @@ import org.junit.Assert; import org.junit.Test; -import static org.apache.iceberg.NullOrder.NULLS_FIRST; -import static org.apache.iceberg.NullOrder.NULLS_LAST; -import static org.apache.iceberg.types.Types.NestedField.optional; -import static org.apache.iceberg.types.Types.NestedField.required; - public class TestSpark3Util extends SparkTestBase { @Test public void testDescribeSortOrder() { - Schema schema = new Schema( + Schema schema = + new Schema( required(1, "data", Types.StringType.get()), - required(2, "time", Types.TimestampType.withoutZone()) - ); + required(2, "time", Types.TimestampType.withoutZone())); - Assert.assertEquals("Sort order isn't correct.", "data DESC NULLS FIRST", + Assert.assertEquals( + "Sort order isn't correct.", + "data DESC NULLS FIRST", Spark3Util.describe(buildSortOrder("Identity", schema, 1))); - Assert.assertEquals("Sort order isn't correct.", "bucket(1, data) DESC NULLS FIRST", + Assert.assertEquals( + "Sort order isn't correct.", + "bucket(1, data) DESC NULLS FIRST", Spark3Util.describe(buildSortOrder("bucket[1]", schema, 1))); - Assert.assertEquals("Sort order isn't correct.", "truncate(data, 3) DESC NULLS FIRST", + Assert.assertEquals( + "Sort order isn't correct.", + "truncate(data, 3) DESC NULLS FIRST", Spark3Util.describe(buildSortOrder("truncate[3]", schema, 1))); - Assert.assertEquals("Sort order isn't correct.", "years(time) DESC NULLS FIRST", + Assert.assertEquals( + "Sort order isn't 
correct.", + "years(time) DESC NULLS FIRST", Spark3Util.describe(buildSortOrder("year", schema, 2))); - Assert.assertEquals("Sort order isn't correct.", "months(time) DESC NULLS FIRST", + Assert.assertEquals( + "Sort order isn't correct.", + "months(time) DESC NULLS FIRST", Spark3Util.describe(buildSortOrder("month", schema, 2))); - Assert.assertEquals("Sort order isn't correct.", "days(time) DESC NULLS FIRST", + Assert.assertEquals( + "Sort order isn't correct.", + "days(time) DESC NULLS FIRST", Spark3Util.describe(buildSortOrder("day", schema, 2))); - Assert.assertEquals("Sort order isn't correct.", "hours(time) DESC NULLS FIRST", + Assert.assertEquals( + "Sort order isn't correct.", + "hours(time) DESC NULLS FIRST", Spark3Util.describe(buildSortOrder("hour", schema, 2))); - Assert.assertEquals("Sort order isn't correct.", "unknown(data) DESC NULLS FIRST", + Assert.assertEquals( + "Sort order isn't correct.", + "unknown(data) DESC NULLS FIRST", Spark3Util.describe(buildSortOrder("unknown", schema, 1))); // multiple sort orders - SortOrder multiOrder = SortOrder.builderFor(schema) - .asc("time", NULLS_FIRST) - .asc("data", NULLS_LAST) - .build(); - Assert.assertEquals("Sort order isn't correct.", "time ASC NULLS FIRST, data ASC NULLS LAST", - Spark3Util.describe(multiOrder)); + SortOrder multiOrder = + SortOrder.builderFor(schema).asc("time", NULLS_FIRST).asc("data", NULLS_LAST).build(); + Assert.assertEquals( + "Sort order isn't correct.", + "time ASC NULLS FIRST, data ASC NULLS LAST", + Spark3Util.describe(multiOrder)); } @Test public void testDescribeSchema() { - Schema schema = new Schema( - required(1, "data", Types.ListType.ofRequired(2, Types.StringType.get())), - optional(3, "pairs", Types.MapType.ofOptional(4, 5, Types.StringType.get(), Types.LongType.get())), - required(6, "time", Types.TimestampType.withoutZone()) - ); + Schema schema = + new Schema( + required(1, "data", Types.ListType.ofRequired(2, Types.StringType.get())), + optional( + 3, + "pairs", + Types.MapType.ofOptional(4, 5, Types.StringType.get(), Types.LongType.get())), + required(6, "time", Types.TimestampType.withoutZone())); - Assert.assertEquals("Schema description isn't correct.", + Assert.assertEquals( + "Schema description isn't correct.", "struct not null,pairs: map,time: timestamp not null>", Spark3Util.describe(schema)); } @@ -93,15 +112,20 @@ public void testLoadIcebergTable() throws Exception { } private SortOrder buildSortOrder(String transform, Schema schema, int sourceId) { - String jsonString = "{\n" + - " \"order-id\" : 10,\n" + - " \"fields\" : [ {\n" + - " \"transform\" : \"" + transform + "\",\n" + - " \"source-id\" : " + sourceId + ",\n" + - " \"direction\" : \"desc\",\n" + - " \"null-order\" : \"nulls-first\"\n" + - " } ]\n" + - "}"; + String jsonString = + "{\n" + + " \"order-id\" : 10,\n" + + " \"fields\" : [ {\n" + + " \"transform\" : \"" + + transform + + "\",\n" + + " \"source-id\" : " + + sourceId + + ",\n" + + " \"direction\" : \"desc\",\n" + + " \"null-order\" : \"nulls-first\"\n" + + " } ]\n" + + "}"; return SortOrderParser.fromJson(schema, jsonString); } diff --git a/spark/v3.0/spark/src/test/java/org/apache/iceberg/spark/TestSparkCatalogOperations.java b/spark/v3.0/spark/src/test/java/org/apache/iceberg/spark/TestSparkCatalogOperations.java index 5c9f3c4cb189..0836271a7c22 100644 --- a/spark/v3.0/spark/src/test/java/org/apache/iceberg/spark/TestSparkCatalogOperations.java +++ b/spark/v3.0/spark/src/test/java/org/apache/iceberg/spark/TestSparkCatalogOperations.java @@ -16,7 +16,6 @@ 
* specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark; import java.util.Map; diff --git a/spark/v3.0/spark/src/test/java/org/apache/iceberg/spark/TestSparkFilters.java b/spark/v3.0/spark/src/test/java/org/apache/iceberg/spark/TestSparkFilters.java index abda21198360..b3f9df10b698 100644 --- a/spark/v3.0/spark/src/test/java/org/apache/iceberg/spark/TestSparkFilters.java +++ b/spark/v3.0/spark/src/test/java/org/apache/iceberg/spark/TestSparkFilters.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark; import java.sql.Date; @@ -42,10 +41,14 @@ public void testTimestampFilterConversion() { Expression timestampExpression = SparkFilters.convert(GreaterThan.apply("x", timestamp)); Expression rawExpression = Expressions.greaterThan("x", epochMicros); - Assert.assertEquals("Generated Timestamp expression should be correct", - rawExpression.toString(), timestampExpression.toString()); - Assert.assertEquals("Generated Instant expression should be correct", - rawExpression.toString(), instantExpression.toString()); + Assert.assertEquals( + "Generated Timestamp expression should be correct", + rawExpression.toString(), + timestampExpression.toString()); + Assert.assertEquals( + "Generated Instant expression should be correct", + rawExpression.toString(), + instantExpression.toString()); } @Test @@ -58,10 +61,14 @@ public void testDateFilterConversion() { Expression dateExpression = SparkFilters.convert(GreaterThan.apply("x", date)); Expression rawExpression = Expressions.greaterThan("x", epochDay); - Assert.assertEquals("Generated localdate expression should be correct", - rawExpression.toString(), localDateExpression.toString()); + Assert.assertEquals( + "Generated localdate expression should be correct", + rawExpression.toString(), + localDateExpression.toString()); - Assert.assertEquals("Generated date expression should be correct", - rawExpression.toString(), dateExpression.toString()); + Assert.assertEquals( + "Generated date expression should be correct", + rawExpression.toString(), + dateExpression.toString()); } } diff --git a/spark/v3.0/spark/src/test/java/org/apache/iceberg/spark/TestSparkSchemaUtil.java b/spark/v3.0/spark/src/test/java/org/apache/iceberg/spark/TestSparkSchemaUtil.java index 8bb32c969842..4e6331982d85 100644 --- a/spark/v3.0/spark/src/test/java/org/apache/iceberg/spark/TestSparkSchemaUtil.java +++ b/spark/v3.0/spark/src/test/java/org/apache/iceberg/spark/TestSparkSchemaUtil.java @@ -16,34 +16,33 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.spark; +import static org.apache.iceberg.types.Types.NestedField.optional; + import java.io.IOException; import org.apache.iceberg.Schema; import org.apache.iceberg.types.Types; import org.junit.Assert; import org.junit.Test; -import static org.apache.iceberg.types.Types.NestedField.optional; - public class TestSparkSchemaUtil { - private static final Schema TEST_SCHEMA = new Schema( - optional(1, "id", Types.IntegerType.get()), - optional(2, "data", Types.StringType.get()) - ); + private static final Schema TEST_SCHEMA = + new Schema( + optional(1, "id", Types.IntegerType.get()), optional(2, "data", Types.StringType.get())); @Test public void testEstiamteSizeMaxValue() throws IOException { - Assert.assertEquals("estimateSize returns Long max value", Long.MAX_VALUE, - SparkSchemaUtil.estimateSize( - null, - Long.MAX_VALUE)); + Assert.assertEquals( + "estimateSize returns Long max value", + Long.MAX_VALUE, + SparkSchemaUtil.estimateSize(null, Long.MAX_VALUE)); } @Test public void testEstiamteSizeWithOverflow() throws IOException { - long tableSize = SparkSchemaUtil.estimateSize(SparkSchemaUtil.convert(TEST_SCHEMA), Long.MAX_VALUE - 1); + long tableSize = + SparkSchemaUtil.estimateSize(SparkSchemaUtil.convert(TEST_SCHEMA), Long.MAX_VALUE - 1); Assert.assertEquals("estimateSize handles overflow", Long.MAX_VALUE, tableSize); } diff --git a/spark/v3.0/spark/src/test/java/org/apache/iceberg/spark/TestSparkTableUtil.java b/spark/v3.0/spark/src/test/java/org/apache/iceberg/spark/TestSparkTableUtil.java index 2b67fc922e0b..5c9baab4f89e 100644 --- a/spark/v3.0/spark/src/test/java/org/apache/iceberg/spark/TestSparkTableUtil.java +++ b/spark/v3.0/spark/src/test/java/org/apache/iceberg/spark/TestSparkTableUtil.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark; import java.io.IOException; diff --git a/spark/v3.0/spark/src/test/java/org/apache/iceberg/spark/TestSparkValueConverter.java b/spark/v3.0/spark/src/test/java/org/apache/iceberg/spark/TestSparkValueConverter.java index 57941b8c7940..7f00c7edd8a9 100644 --- a/spark/v3.0/spark/src/test/java/org/apache/iceberg/spark/TestSparkValueConverter.java +++ b/spark/v3.0/spark/src/test/java/org/apache/iceberg/spark/TestSparkValueConverter.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.spark; import org.apache.iceberg.Schema; @@ -31,51 +30,55 @@ public class TestSparkValueConverter { @Test public void testSparkNullMapConvert() { - Schema schema = new Schema( - Types.NestedField.required(0, "id", Types.LongType.get()), - Types.NestedField.optional(5, "locations", Types.MapType.ofOptional(6, 7, - Types.StringType.get(), - Types.StructType.of( - Types.NestedField.required(1, "lat", Types.FloatType.get()), - Types.NestedField.required(2, "long", Types.FloatType.get()) - ) - )) - ); + Schema schema = + new Schema( + Types.NestedField.required(0, "id", Types.LongType.get()), + Types.NestedField.optional( + 5, + "locations", + Types.MapType.ofOptional( + 6, + 7, + Types.StringType.get(), + Types.StructType.of( + Types.NestedField.required(1, "lat", Types.FloatType.get()), + Types.NestedField.required(2, "long", Types.FloatType.get()))))); assertCorrectNullConversion(schema); } @Test public void testSparkNullListConvert() { - Schema schema = new Schema( - Types.NestedField.required(0, "id", Types.LongType.get()), - Types.NestedField.optional(5, "locations", - Types.ListType.ofOptional(6, Types.StringType.get()) - ) - ); + Schema schema = + new Schema( + Types.NestedField.required(0, "id", Types.LongType.get()), + Types.NestedField.optional( + 5, "locations", Types.ListType.ofOptional(6, Types.StringType.get()))); assertCorrectNullConversion(schema); } @Test public void testSparkNullStructConvert() { - Schema schema = new Schema( - Types.NestedField.required(0, "id", Types.LongType.get()), - Types.NestedField.optional(5, "location", Types.StructType.of( - Types.NestedField.required(1, "lat", Types.FloatType.get()), - Types.NestedField.required(2, "long", Types.FloatType.get()) - )) - ); + Schema schema = + new Schema( + Types.NestedField.required(0, "id", Types.LongType.get()), + Types.NestedField.optional( + 5, + "location", + Types.StructType.of( + Types.NestedField.required(1, "lat", Types.FloatType.get()), + Types.NestedField.required(2, "long", Types.FloatType.get())))); assertCorrectNullConversion(schema); } @Test public void testSparkNullPrimitiveConvert() { - Schema schema = new Schema( - Types.NestedField.required(0, "id", Types.LongType.get()), - Types.NestedField.optional(5, "location", Types.StringType.get()) - ); + Schema schema = + new Schema( + Types.NestedField.required(0, "id", Types.LongType.get()), + Types.NestedField.optional(5, "location", Types.StringType.get())); assertCorrectNullConversion(schema); } @@ -83,7 +86,8 @@ private void assertCorrectNullConversion(Schema schema) { Row sparkRow = RowFactory.create(1, null); Record record = GenericRecord.create(schema); record.set(0, 1); - Assert.assertEquals("Round-trip conversion should produce original value", + Assert.assertEquals( + "Round-trip conversion should produce original value", record, SparkValueConverter.convert(schema, sparkRow)); } diff --git a/spark/v3.0/spark/src/test/java/org/apache/iceberg/spark/actions/TestCreateActions.java b/spark/v3.0/spark/src/test/java/org/apache/iceberg/spark/actions/TestCreateActions.java index 0a9ab58f5a56..544c730bfe89 100644 --- a/spark/v3.0/spark/src/test/java/org/apache/iceberg/spark/actions/TestCreateActions.java +++ b/spark/v3.0/spark/src/test/java/org/apache/iceberg/spark/actions/TestCreateActions.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.spark.actions; import java.io.File; @@ -69,45 +68,61 @@ import scala.collection.Seq; public class TestCreateActions extends SparkCatalogTestBase { - private static final String CREATE_PARTITIONED_PARQUET = "CREATE TABLE %s (id INT, data STRING) " + - "using parquet PARTITIONED BY (id) LOCATION '%s'"; - private static final String CREATE_PARQUET = "CREATE TABLE %s (id INT, data STRING) " + - "using parquet LOCATION '%s'"; - private static final String CREATE_HIVE_EXTERNAL_PARQUET = "CREATE EXTERNAL TABLE %s (data STRING) " + - "PARTITIONED BY (id INT) STORED AS parquet LOCATION '%s'"; - private static final String CREATE_HIVE_PARQUET = "CREATE TABLE %s (data STRING) " + - "PARTITIONED BY (id INT) STORED AS parquet"; + private static final String CREATE_PARTITIONED_PARQUET = + "CREATE TABLE %s (id INT, data STRING) " + "using parquet PARTITIONED BY (id) LOCATION '%s'"; + private static final String CREATE_PARQUET = + "CREATE TABLE %s (id INT, data STRING) " + "using parquet LOCATION '%s'"; + private static final String CREATE_HIVE_EXTERNAL_PARQUET = + "CREATE EXTERNAL TABLE %s (data STRING) " + + "PARTITIONED BY (id INT) STORED AS parquet LOCATION '%s'"; + private static final String CREATE_HIVE_PARQUET = + "CREATE TABLE %s (data STRING) " + "PARTITIONED BY (id INT) STORED AS parquet"; private static final String NAMESPACE = "default"; @Parameterized.Parameters(name = "Catalog Name {0} - Options {2}") public static Object[][] parameters() { return new Object[][] { - new Object[] {"spark_catalog", SparkSessionCatalog.class.getName(), ImmutableMap.of( + new Object[] { + "spark_catalog", + SparkSessionCatalog.class.getName(), + ImmutableMap.of( "type", "hive", "default-namespace", "default", "parquet-enabled", "true", - "cache-enabled", "false" // Spark will delete tables using v1, leaving the cache out of sync - )}, - new Object[] {"spark_catalog", SparkSessionCatalog.class.getName(), ImmutableMap.of( + "cache-enabled", + "false" // Spark will delete tables using v1, leaving the cache out of sync + ) + }, + new Object[] { + "spark_catalog", + SparkSessionCatalog.class.getName(), + ImmutableMap.of( "type", "hadoop", "default-namespace", "default", "parquet-enabled", "true", - "cache-enabled", "false" // Spark will delete tables using v1, leaving the cache out of sync - )}, - new Object[] { "testhive", SparkCatalog.class.getName(), ImmutableMap.of( + "cache-enabled", + "false" // Spark will delete tables using v1, leaving the cache out of sync + ) + }, + new Object[] { + "testhive", + SparkCatalog.class.getName(), + ImmutableMap.of( "type", "hive", - "default-namespace", "default" - )}, - new Object[] { "testhadoop", SparkCatalog.class.getName(), ImmutableMap.of( + "default-namespace", "default") + }, + new Object[] { + "testhadoop", + SparkCatalog.class.getName(), + ImmutableMap.of( "type", "hadoop", - "default-namespace", "default" - )} + "default-namespace", "default") + } }; } - @Rule - public TemporaryFolder temp = new TemporaryFolder(); + @Rule public TemporaryFolder temp = new TemporaryFolder(); private String baseTableName = "baseTable"; private File tableDir; @@ -115,10 +130,7 @@ public static Object[][] parameters() { private final String type; private final TableCatalog catalog; - public TestCreateActions( - String catalogName, - String implementation, - Map config) { + public TestCreateActions(String catalogName, String implementation, Map config) { super(catalogName, implementation, config); this.catalog = (TableCatalog) 
spark.sessionState().catalogManager().catalog(catalogName); this.type = config.get("type"); @@ -138,15 +150,15 @@ public void before() { spark.conf().set("spark.sql.parquet.writeLegacyFormat", false); spark.sql(String.format("DROP TABLE IF EXISTS %s", baseTableName)); - List expected = Lists.newArrayList( - new SimpleRecord(1, "a"), - new SimpleRecord(2, "b"), - new SimpleRecord(3, "c") - ); + List expected = + Lists.newArrayList( + new SimpleRecord(1, "a"), new SimpleRecord(2, "b"), new SimpleRecord(3, "c")); Dataset df = spark.createDataFrame(expected, SimpleRecord.class); - df.select("id", "data").orderBy("data").write() + df.select("id", "data") + .orderBy("data") + .write() .mode("append") .option("path", tableLocation) .saveAsTable(baseTableName); @@ -161,7 +173,8 @@ public void after() throws IOException { @Test public void testMigratePartitioned() throws Exception { Assume.assumeTrue("Cannot migrate to a hadoop based catalog", !type.equals("hadoop")); - Assume.assumeTrue("Can only migrate from Spark Session Catalog", catalog.name().equals("spark_catalog")); + Assume.assumeTrue( + "Can only migrate from Spark Session Catalog", catalog.name().equals("spark_catalog")); String source = sourceName("test_migrate_partitioned_table"); String dest = source; createSourceTable(CREATE_PARTITIONED_PARQUET, source); @@ -171,17 +184,20 @@ public void testMigratePartitioned() throws Exception { @Test public void testPartitionedTableWithUnRecoveredPartitions() throws Exception { Assume.assumeTrue("Cannot migrate to a hadoop based catalog", !type.equals("hadoop")); - Assume.assumeTrue("Can only migrate from Spark Session Catalog", catalog.name().equals("spark_catalog")); + Assume.assumeTrue( + "Can only migrate from Spark Session Catalog", catalog.name().equals("spark_catalog")); String source = sourceName("test_unrecovered_partitions"); String dest = source; File location = temp.newFolder(); sql(CREATE_PARTITIONED_PARQUET, source, location); // Data generation and partition addition - spark.range(5) + spark + .range(5) .selectExpr("id", "cast(id as STRING) as data") .write() - .partitionBy("id").mode(SaveMode.Overwrite) + .partitionBy("id") + .mode(SaveMode.Overwrite) .parquet(location.toURI().toString()); sql("ALTER TABLE %s ADD PARTITION(id=0)", source); @@ -191,7 +207,8 @@ public void testPartitionedTableWithUnRecoveredPartitions() throws Exception { @Test public void testPartitionedTableWithCustomPartitions() throws Exception { Assume.assumeTrue("Cannot migrate to a hadoop based catalog", !type.equals("hadoop")); - Assume.assumeTrue("Can only migrate from Spark Session Catalog", catalog.name().equals("spark_catalog")); + Assume.assumeTrue( + "Can only migrate from Spark Session Catalog", catalog.name().equals("spark_catalog")); String source = sourceName("test_custom_parts"); String dest = source; File tblLocation = temp.newFolder(); @@ -199,18 +216,23 @@ public void testPartitionedTableWithCustomPartitions() throws Exception { // Data generation and partition addition spark.sql(String.format(CREATE_PARTITIONED_PARQUET, source, tblLocation)); - spark.range(10) + spark + .range(10) .selectExpr("cast(id as STRING) as data") .write() - .mode(SaveMode.Overwrite).parquet(partitionDataLoc.toURI().toString()); - sql("ALTER TABLE %s ADD PARTITION(id=0) LOCATION '%s'", source, partitionDataLoc.toURI().toString()); + .mode(SaveMode.Overwrite) + .parquet(partitionDataLoc.toURI().toString()); + sql( + "ALTER TABLE %s ADD PARTITION(id=0) LOCATION '%s'", + source, partitionDataLoc.toURI().toString()); 
assertMigratedFileCount(SparkActions.get().migrateTable(source), source, dest); } @Test public void testAddColumnOnMigratedTableAtEnd() throws Exception { Assume.assumeTrue("Cannot migrate to a hadoop based catalog", !type.equals("hadoop")); - Assume.assumeTrue("Can only migrate from Spark Session Catalog", catalog.name().equals("spark_catalog")); + Assume.assumeTrue( + "Can only migrate from Spark Session Catalog", catalog.name().equals("spark_catalog")); String source = sourceName("test_add_column_migrated_table"); String dest = source; createSourceTable(CREATE_PARQUET, source); @@ -249,7 +271,8 @@ public void testAddColumnOnMigratedTableAtEnd() throws Exception { @Test public void testAddColumnOnMigratedTableAtMiddle() throws Exception { Assume.assumeTrue("Cannot migrate to a hadoop based catalog", !type.equals("hadoop")); - Assume.assumeTrue("Can only migrate from Spark Session Catalog", catalog.name().equals("spark_catalog")); + Assume.assumeTrue( + "Can only migrate from Spark Session Catalog", catalog.name().equals("spark_catalog")); String source = sourceName("test_add_column_migrated_table_middle"); String dest = source; createSourceTable(CREATE_PARQUET, source); @@ -263,7 +286,10 @@ public void testAddColumnOnMigratedTableAtMiddle() throws Exception { // test column addition on migrated table Schema beforeSchema = table.schema(); String newCol1 = "newCol"; - sparkTable.table().updateSchema().addColumn("newCol", Types.IntegerType.get()) + sparkTable + .table() + .updateSchema() + .addColumn("newCol", Types.IntegerType.get()) .moveAfter(newCol1, "id") .commit(); Schema afterSchema = table.schema(); @@ -279,16 +305,20 @@ public void testAddColumnOnMigratedTableAtMiddle() throws Exception { @Test public void removeColumnsAtEnd() throws Exception { Assume.assumeTrue("Cannot migrate to a hadoop based catalog", !type.equals("hadoop")); - Assume.assumeTrue("Can only migrate from Spark Session Catalog", catalog.name().equals("spark_catalog")); + Assume.assumeTrue( + "Can only migrate from Spark Session Catalog", catalog.name().equals("spark_catalog")); String source = sourceName("test_remove_column_migrated_table"); String dest = source; String colName1 = "newCol1"; String colName2 = "newCol2"; File location = temp.newFolder(); - spark.range(10).selectExpr("cast(id as INT)", "CAST(id as INT) " + colName1, "CAST(id as INT) " + colName2) + spark + .range(10) + .selectExpr("cast(id as INT)", "CAST(id as INT) " + colName1, "CAST(id as INT) " + colName2) .write() - .mode(SaveMode.Overwrite).saveAsTable(dest); + .mode(SaveMode.Overwrite) + .saveAsTable(dest); List expected1 = sql("select id, %s from %s order by id", colName1, source); List expected2 = sql("select id from %s order by id", source); @@ -322,13 +352,19 @@ public void removeColumnsAtEnd() throws Exception { @Test public void removeColumnFromMiddle() throws Exception { Assume.assumeTrue("Cannot migrate to a hadoop based catalog", !type.equals("hadoop")); - Assume.assumeTrue("Can only migrate from Spark Session Catalog", catalog.name().equals("spark_catalog")); + Assume.assumeTrue( + "Can only migrate from Spark Session Catalog", catalog.name().equals("spark_catalog")); String source = sourceName("test_remove_column_migrated_table_from_middle"); String dest = source; String dropColumnName = "col1"; - spark.range(10).selectExpr("cast(id as INT)", "CAST(id as INT) as " + - dropColumnName, "CAST(id as INT) as col2").write().mode(SaveMode.Overwrite).saveAsTable(dest); + spark + .range(10) + .selectExpr( + "cast(id as INT)", "CAST(id as 
INT) as " + dropColumnName, "CAST(id as INT) as col2") + .write() + .mode(SaveMode.Overwrite) + .saveAsTable(dest); List expected = sql("select id, col2 from %s order by id", source); // migrate table @@ -348,7 +384,8 @@ public void removeColumnFromMiddle() throws Exception { @Test public void testMigrateUnpartitioned() throws Exception { Assume.assumeTrue("Cannot migrate to a hadoop based catalog", !type.equals("hadoop")); - Assume.assumeTrue("Can only migrate from Spark Session Catalog", catalog.name().equals("spark_catalog")); + Assume.assumeTrue( + "Can only migrate from Spark Session Catalog", catalog.name().equals("spark_catalog")); String source = sourceName("test_migrate_unpartitioned_table"); String dest = source; createSourceTable(CREATE_PARQUET, source); @@ -357,40 +394,49 @@ public void testMigrateUnpartitioned() throws Exception { @Test public void testSnapshotPartitioned() throws Exception { - Assume.assumeTrue("Cannot snapshot with arbitrary location in a hadoop based catalog", + Assume.assumeTrue( + "Cannot snapshot with arbitrary location in a hadoop based catalog", !type.equals("hadoop")); File location = temp.newFolder(); String source = sourceName("test_snapshot_partitioned_table"); String dest = destName("iceberg_snapshot_partitioned"); createSourceTable(CREATE_PARTITIONED_PARQUET, source); assertSnapshotFileCount( - SparkActions.get().snapshotTable(source).as(dest).tableLocation(location.toString()), source, dest); + SparkActions.get().snapshotTable(source).as(dest).tableLocation(location.toString()), + source, + dest); assertIsolatedSnapshot(source, dest); } @Test public void testSnapshotUnpartitioned() throws Exception { - Assume.assumeTrue("Cannot snapshot with arbitrary location in a hadoop based catalog", + Assume.assumeTrue( + "Cannot snapshot with arbitrary location in a hadoop based catalog", !type.equals("hadoop")); File location = temp.newFolder(); String source = sourceName("test_snapshot_unpartitioned_table"); String dest = destName("iceberg_snapshot_unpartitioned"); createSourceTable(CREATE_PARQUET, source); assertSnapshotFileCount( - SparkActions.get().snapshotTable(source).as(dest).tableLocation(location.toString()), source, dest); + SparkActions.get().snapshotTable(source).as(dest).tableLocation(location.toString()), + source, + dest); assertIsolatedSnapshot(source, dest); } @Test public void testSnapshotHiveTable() throws Exception { - Assume.assumeTrue("Cannot snapshot with arbitrary location in a hadoop based catalog", + Assume.assumeTrue( + "Cannot snapshot with arbitrary location in a hadoop based catalog", !type.equals("hadoop")); File location = temp.newFolder(); String source = sourceName("snapshot_hive_table"); String dest = destName("iceberg_snapshot_hive_table"); createSourceTable(CREATE_HIVE_EXTERNAL_PARQUET, source); assertSnapshotFileCount( - SparkActions.get().snapshotTable(source).as(dest).tableLocation(location.toString()), source, dest); + SparkActions.get().snapshotTable(source).as(dest).tableLocation(location.toString()), + source, + dest); assertIsolatedSnapshot(source, dest); } @@ -411,7 +457,9 @@ public void testSnapshotManagedHiveTable() throws Exception { String dest = destName("iceberg_snapshot_managed_hive_table"); createSourceTable(CREATE_HIVE_PARQUET, source); assertSnapshotFileCount( - SparkActions.get().snapshotTable(source).as(dest).tableLocation(location.toString()), source, dest); + SparkActions.get().snapshotTable(source).as(dest).tableLocation(location.toString()), + source, + dest); assertIsolatedSnapshot(source, 
dest); } @@ -423,7 +471,9 @@ public void testMigrateManagedHiveTable() throws Exception { String dest = destName("iceberg_migrate_managed_hive_table"); createSourceTable(CREATE_HIVE_PARQUET, source); assertSnapshotFileCount( - SparkActions.get().snapshotTable(source).as(dest).tableLocation(location.toString()), source, dest); + SparkActions.get().snapshotTable(source).as(dest).tableLocation(location.toString()), + source, + dest); } @Test @@ -435,11 +485,15 @@ public void testProperties() throws Exception { props.put("note", "Jazz"); createSourceTable(CREATE_PARQUET, source); for (Map.Entry keyValue : props.entrySet()) { - spark.sql(String.format("ALTER TABLE %s SET TBLPROPERTIES (\"%s\" = \"%s\")", - source, keyValue.getKey(), keyValue.getValue())); + spark.sql( + String.format( + "ALTER TABLE %s SET TBLPROPERTIES (\"%s\" = \"%s\")", + source, keyValue.getKey(), keyValue.getValue())); } assertSnapshotFileCount( - SparkActions.get().snapshotTable(source).as(dest).tableProperty("dogs", "sundance"), source, dest); + SparkActions.get().snapshotTable(source).as(dest).tableProperty("dogs", "sundance"), + source, + dest); SparkTable table = loadTable(dest); Map expectedProps = Maps.newHashMap(); @@ -450,8 +504,10 @@ public void testProperties() throws Exception { Assert.assertTrue( "Created table missing property " + entry.getKey(), table.properties().containsKey(entry.getKey())); - Assert.assertEquals("Property value is not the expected value", - entry.getValue(), table.properties().get(entry.getKey())); + Assert.assertEquals( + "Property value is not the expected value", + entry.getValue(), + table.properties().get(entry.getKey())); } } @@ -469,14 +525,20 @@ public void testSparkTableReservedProperties() throws Exception { String[] keys = {"provider", "format", "current-snapshot-id", "location", "sort-order"}; for (String entry : keys) { - Assert.assertTrue("Created table missing reserved property " + entry, table.properties().containsKey(entry)); + Assert.assertTrue( + "Created table missing reserved property " + entry, + table.properties().containsKey(entry)); } Assert.assertEquals("Unexpected provider", "iceberg", table.properties().get("provider")); Assert.assertEquals("Unexpected format", "iceberg/parquet", table.properties().get("format")); - Assert.assertNotEquals("No current-snapshot-id found", "none", table.properties().get("current-snapshot-id")); - Assert.assertTrue("Location isn't correct", table.properties().get("location").endsWith(destTableName)); - Assert.assertEquals("Sort-order isn't correct", "id ASC NULLS FIRST, data DESC NULLS LAST", + Assert.assertNotEquals( + "No current-snapshot-id found", "none", table.properties().get("current-snapshot-id")); + Assert.assertTrue( + "Location isn't correct", table.properties().get("location").endsWith(destTableName)); + Assert.assertEquals( + "Sort-order isn't correct", + "id ASC NULLS FIRST, data DESC NULLS LAST", table.properties().get("sort-order")); } @@ -492,30 +554,37 @@ public void testSnapshotDefaultLocation() throws Exception { @Test public void schemaEvolutionTestWithSparkAPI() throws Exception { Assume.assumeTrue("Cannot migrate to a hadoop based catalog", !type.equals("hadoop")); - Assume.assumeTrue("Can only migrate from Spark Session Catalog", catalog.name().equals("spark_catalog")); + Assume.assumeTrue( + "Can only migrate from Spark Session Catalog", catalog.name().equals("spark_catalog")); File location = temp.newFolder(); String tblName = sourceName("schema_evolution_test"); // Data generation and partition addition - 
spark.range(0, 5) + spark + .range(0, 5) .selectExpr("CAST(id as INT) as col0", "CAST(id AS FLOAT) col2", "CAST(id AS LONG) col3") .write() .mode(SaveMode.Append) .parquet(location.toURI().toString()); - Dataset rowDataset = spark.range(6, 10) - .selectExpr("CAST(id as INT) as col0", "CAST(id AS STRING) col1", - "CAST(id AS FLOAT) col2", "CAST(id AS LONG) col3"); - rowDataset - .write() - .mode(SaveMode.Append) - .parquet(location.toURI().toString()); - spark.read() + Dataset rowDataset = + spark + .range(6, 10) + .selectExpr( + "CAST(id as INT) as col0", + "CAST(id AS STRING) col1", + "CAST(id AS FLOAT) col2", + "CAST(id AS LONG) col3"); + rowDataset.write().mode(SaveMode.Append).parquet(location.toURI().toString()); + spark + .read() .schema(rowDataset.schema()) - .parquet(location.toURI().toString()).write().saveAsTable(tblName); + .parquet(location.toURI().toString()) + .write() + .saveAsTable(tblName); List expectedBeforeAddColumn = sql("SELECT * FROM %s ORDER BY col0", tblName); - List expectedAfterAddColumn = sql("SELECT col0, null, col1, col2, col3 FROM %s ORDER BY col0", - tblName); + List expectedAfterAddColumn = + sql("SELECT col0, null, col1, col2, col3 FROM %s ORDER BY col0", tblName); // Migrate table SparkActions.get().migrateTable(tblName).execute(); @@ -526,7 +595,10 @@ public void schemaEvolutionTestWithSparkAPI() throws Exception { // Update schema and check output correctness SparkTable sparkTable = loadTable(tblName); - sparkTable.table().updateSchema().addColumn("newCol", Types.IntegerType.get()) + sparkTable + .table() + .updateSchema() + .addColumn("newCol", Types.IntegerType.get()) .moveAfter("newCol", "col0") .commit(); List afterMigarteAfterAddResults = sql("SELECT * FROM %s ORDER BY col0", tblName); @@ -536,23 +608,30 @@ public void schemaEvolutionTestWithSparkAPI() throws Exception { @Test public void schemaEvolutionTestWithSparkSQL() throws Exception { Assume.assumeTrue("Cannot migrate to a hadoop based catalog", !type.equals("hadoop")); - Assume.assumeTrue("Can only migrate from Spark Session Catalog", catalog.name().equals("spark_catalog")); + Assume.assumeTrue( + "Can only migrate from Spark Session Catalog", catalog.name().equals("spark_catalog")); String tblName = sourceName("schema_evolution_test_sql"); // Data generation and partition addition - spark.range(0, 5) + spark + .range(0, 5) .selectExpr("CAST(id as INT) col0", "CAST(id AS FLOAT) col1", "CAST(id AS STRING) col2") .write() .mode(SaveMode.Append) .saveAsTable(tblName); sql("ALTER TABLE %s ADD COLUMN col3 INT", tblName); - spark.range(6, 10) - .selectExpr("CAST(id AS INT) col0", "CAST(id AS FLOAT) col1", "CAST(id AS STRING) col2", "CAST(id AS INT) col3") + spark + .range(6, 10) + .selectExpr( + "CAST(id AS INT) col0", + "CAST(id AS FLOAT) col1", + "CAST(id AS STRING) col2", + "CAST(id AS INT) col3") .registerTempTable("tempdata"); sql("INSERT INTO TABLE %s SELECT * FROM tempdata", tblName); List expectedBeforeAddColumn = sql("SELECT * FROM %s ORDER BY col0", tblName); - List expectedAfterAddColumn = sql("SELECT col0, null, col1, col2, col3 FROM %s ORDER BY col0", - tblName); + List expectedAfterAddColumn = + sql("SELECT col0, null, col1, col2, col3 FROM %s ORDER BY col0", tblName); // Migrate table SparkActions.get().migrateTable(tblName).execute(); @@ -563,7 +642,10 @@ public void schemaEvolutionTestWithSparkSQL() throws Exception { // Update schema and check output correctness SparkTable sparkTable = loadTable(tblName); - sparkTable.table().updateSchema().addColumn("newCol", 
Types.IntegerType.get()) + sparkTable + .table() + .updateSchema() + .addColumn("newCol", Types.IntegerType.get()) .moveAfter("newCol", "col0") .commit(); List afterMigarteAfterAddResults = sql("SELECT * FROM %s ORDER BY col0", tblName); @@ -615,9 +697,9 @@ public void threeLevelList(boolean useLegacyMode) throws Exception { String tableName = sourceName(String.format("threeLevelList_%s", useLegacyMode)); File location = temp.newFolder(); - sql("CREATE TABLE %s (col1 ARRAY>)" + - " STORED AS parquet" + - " LOCATION '%s'", tableName, location); + sql( + "CREATE TABLE %s (col1 ARRAY>)" + " STORED AS parquet" + " LOCATION '%s'", + tableName, location); int testValue = 12345; sql("INSERT INTO %s VALUES (ARRAY(STRUCT(%s)))", tableName, testValue); @@ -635,11 +717,14 @@ public void threeLevelList(boolean useLegacyMode) throws Exception { private void threeLevelListWithNestedStruct(boolean useLegacyMode) throws Exception { spark.conf().set("spark.sql.parquet.writeLegacyFormat", useLegacyMode); - String tableName = sourceName(String.format("threeLevelListWithNestedStruct_%s", useLegacyMode)); + String tableName = + sourceName(String.format("threeLevelListWithNestedStruct_%s", useLegacyMode)); File location = temp.newFolder(); - sql("CREATE TABLE %s (col1 ARRAY>>)" + - " STORED AS parquet" + - " LOCATION '%s'", tableName, location); + sql( + "CREATE TABLE %s (col1 ARRAY>>)" + + " STORED AS parquet" + + " LOCATION '%s'", + tableName, location); int testValue = 12345; sql("INSERT INTO %s VALUES (ARRAY(STRUCT(STRUCT(%s))))", tableName, testValue); @@ -659,13 +744,16 @@ private void threeLevelLists(boolean useLegacyMode) throws Exception { String tableName = sourceName(String.format("threeLevelLists_%s", useLegacyMode)); File location = temp.newFolder(); - sql("CREATE TABLE %s (col1 ARRAY>, col3 ARRAY>)" + - " STORED AS parquet" + - " LOCATION '%s'", tableName, location); + sql( + "CREATE TABLE %s (col1 ARRAY>, col3 ARRAY>)" + + " STORED AS parquet" + + " LOCATION '%s'", + tableName, location); int testValue1 = 12345; int testValue2 = 987654; - sql("INSERT INTO %s VALUES (ARRAY(STRUCT(%s)), ARRAY(STRUCT(%s)))", + sql( + "INSERT INTO %s VALUES (ARRAY(STRUCT(%s)), ARRAY(STRUCT(%s)))", tableName, testValue1, testValue2); List expected = sql(String.format("SELECT * FROM %s", tableName)); @@ -683,13 +771,14 @@ private void structOfThreeLevelLists(boolean useLegacyMode) throws Exception { String tableName = sourceName(String.format("structOfThreeLevelLists_%s", useLegacyMode)); File location = temp.newFolder(); - sql("CREATE TABLE %s (col1 STRUCT>>)" + - " STORED AS parquet" + - " LOCATION '%s'", tableName, location); + sql( + "CREATE TABLE %s (col1 STRUCT>>)" + + " STORED AS parquet" + + " LOCATION '%s'", + tableName, location); int testValue1 = 12345; - sql("INSERT INTO %s VALUES (STRUCT(STRUCT(ARRAY(STRUCT(%s)))))", - tableName, testValue1); + sql("INSERT INTO %s VALUES (STRUCT(STRUCT(ARRAY(STRUCT(%s)))))", tableName, testValue1); List expected = sql(String.format("SELECT * FROM %s", tableName)); // migrate table @@ -701,16 +790,19 @@ private void structOfThreeLevelLists(boolean useLegacyMode) throws Exception { assertEquals("Output must match", expected, results); } - private SparkTable loadTable(String name) throws NoSuchTableException, ParseException { - return (SparkTable) catalog.loadTable(Spark3Util.catalogAndIdentifier(spark, name).identifier()); + return (SparkTable) + catalog.loadTable(Spark3Util.catalogAndIdentifier(spark, name).identifier()); } private CatalogTable loadSessionTable(String name) 
throws NoSuchTableException, NoSuchDatabaseException, ParseException { Identifier identifier = Spark3Util.catalogAndIdentifier(spark, name).identifier(); Some namespace = Some.apply(identifier.namespace()[0]); - return spark.sessionState().catalog().getTableMetadata(new TableIdentifier(identifier.name(), namespace)); + return spark + .sessionState() + .catalog() + .getTableMetadata(new TableIdentifier(identifier.name(), namespace)); } private void createSourceTable(String createStatement, String tableName) @@ -720,41 +812,57 @@ private void createSourceTable(String createStatement, String tableName) CatalogTable table = loadSessionTable(tableName); Seq partitionColumns = table.partitionColumnNames(); String format = table.provider().get(); - spark.table(baseTableName).write().mode(SaveMode.Append).format(format).partitionBy(partitionColumns) + spark + .table(baseTableName) + .write() + .mode(SaveMode.Append) + .format(format) + .partitionBy(partitionColumns) .saveAsTable(tableName); } - // Counts the number of files in the source table, makes sure the same files exist in the destination table + // Counts the number of files in the source table, makes sure the same files exist in the + // destination table private void assertMigratedFileCount(MigrateTable migrateAction, String source, String dest) throws NoSuchTableException, NoSuchDatabaseException, ParseException { long expectedFiles = expectedFilesCount(source); MigrateTable.Result migratedFiles = migrateAction.execute(); validateTables(source, dest); - Assert.assertEquals("Expected number of migrated files", - expectedFiles, migratedFiles.migratedDataFilesCount()); + Assert.assertEquals( + "Expected number of migrated files", expectedFiles, migratedFiles.migratedDataFilesCount()); } - // Counts the number of files in the source table, makes sure the same files exist in the destination table + // Counts the number of files in the source table, makes sure the same files exist in the + // destination table private void assertSnapshotFileCount(SnapshotTable snapshotTable, String source, String dest) throws NoSuchTableException, NoSuchDatabaseException, ParseException { long expectedFiles = expectedFilesCount(source); SnapshotTable.Result snapshotTableResult = snapshotTable.execute(); validateTables(source, dest); - Assert.assertEquals("Expected number of imported snapshot files", expectedFiles, + Assert.assertEquals( + "Expected number of imported snapshot files", + expectedFiles, snapshotTableResult.importedDataFilesCount()); } - private void validateTables(String source, String dest) throws NoSuchTableException, ParseException { + private void validateTables(String source, String dest) + throws NoSuchTableException, ParseException { List expected = spark.table(source).collectAsList(); SparkTable destTable = loadTable(dest); - Assert.assertEquals("Provider should be iceberg", "iceberg", + Assert.assertEquals( + "Provider should be iceberg", + "iceberg", destTable.properties().get(TableCatalog.PROP_PROVIDER)); List actual = spark.table(dest).collectAsList(); - Assert.assertTrue(String.format("Rows in migrated table did not match\nExpected :%s rows \nFound :%s", - expected, actual), expected.containsAll(actual) && actual.containsAll(expected)); + Assert.assertTrue( + String.format( + "Rows in migrated table did not match\nExpected :%s rows \nFound :%s", + expected, actual), + expected.containsAll(actual) && actual.containsAll(expected)); } - private long expectedFilesCount(String source) throws NoSuchDatabaseException, NoSuchTableException, 
ParseException { + private long expectedFilesCount(String source) + throws NoSuchDatabaseException, NoSuchTableException, ParseException { CatalogTable sourceTable = loadSessionTable(source); List uris; if (sourceTable.partitionColumnNames().size() == 0) { @@ -762,34 +870,42 @@ private long expectedFilesCount(String source) throws NoSuchDatabaseException, N uris.add(sourceTable.location()); } else { Seq catalogTablePartitionSeq = - spark.sessionState().catalog().listPartitions(sourceTable.identifier(), Option.apply(null)); - uris = JavaConverters.seqAsJavaList(catalogTablePartitionSeq) - .stream() - .map(CatalogTablePartition::location) - .collect(Collectors.toList()); + spark + .sessionState() + .catalog() + .listPartitions(sourceTable.identifier(), Option.apply(null)); + uris = + JavaConverters.seqAsJavaList(catalogTablePartitionSeq).stream() + .map(CatalogTablePartition::location) + .collect(Collectors.toList()); } return uris.stream() - .flatMap(uri -> - FileUtils.listFiles(Paths.get(uri).toFile(), - TrueFileFilter.INSTANCE, TrueFileFilter.INSTANCE).stream()) - .filter(file -> !file.toString().endsWith("crc") && !file.toString().contains("_SUCCESS")).count(); + .flatMap( + uri -> + FileUtils.listFiles( + Paths.get(uri).toFile(), TrueFileFilter.INSTANCE, TrueFileFilter.INSTANCE) + .stream()) + .filter(file -> !file.toString().endsWith("crc") && !file.toString().contains("_SUCCESS")) + .count(); } - // Insert records into the destination, makes sure those records exist and source table is unchanged + // Insert records into the destination, makes sure those records exist and source table is + // unchanged private void assertIsolatedSnapshot(String source, String dest) { List expected = spark.sql(String.format("SELECT * FROM %s", source)).collectAsList(); - List extraData = Lists.newArrayList( - new SimpleRecord(4, "d") - ); + List extraData = Lists.newArrayList(new SimpleRecord(4, "d")); Dataset df = spark.createDataFrame(extraData, SimpleRecord.class); df.write().format("iceberg").mode("append").saveAsTable(dest); List result = spark.sql(String.format("SELECT * FROM %s", source)).collectAsList(); - Assert.assertEquals("No additional rows should be added to the original table", expected.size(), - result.size()); + Assert.assertEquals( + "No additional rows should be added to the original table", expected.size(), result.size()); - List snapshot = spark.sql(String.format("SELECT * FROM %s WHERE id = 4 AND data = 'd'", dest)).collectAsList(); + List snapshot = + spark + .sql(String.format("SELECT * FROM %s WHERE id = 4 AND data = 'd'", dest)) + .collectAsList(); Assert.assertEquals("Added row not found in snapshot", 1, snapshot.size()); } diff --git a/spark/v3.0/spark/src/test/java/org/apache/iceberg/spark/actions/TestDeleteReachableFilesAction.java b/spark/v3.0/spark/src/test/java/org/apache/iceberg/spark/actions/TestDeleteReachableFilesAction.java index f4b296562c94..7124c51ddd3d 100644 --- a/spark/v3.0/spark/src/test/java/org/apache/iceberg/spark/actions/TestDeleteReachableFilesAction.java +++ b/spark/v3.0/spark/src/test/java/org/apache/iceberg/spark/actions/TestDeleteReachableFilesAction.java @@ -16,9 +16,10 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.spark.actions; +import static org.apache.iceberg.types.Types.NestedField.optional; + import java.io.File; import java.util.Set; import java.util.concurrent.ConcurrentHashMap; @@ -53,46 +54,47 @@ import org.junit.Test; import org.junit.rules.TemporaryFolder; -import static org.apache.iceberg.types.Types.NestedField.optional; - public class TestDeleteReachableFilesAction extends SparkTestBase { private static final HadoopTables TABLES = new HadoopTables(new Configuration()); - private static final Schema SCHEMA = new Schema( - optional(1, "c1", Types.IntegerType.get()), - optional(2, "c2", Types.StringType.get()), - optional(3, "c3", Types.StringType.get()) - ); + private static final Schema SCHEMA = + new Schema( + optional(1, "c1", Types.IntegerType.get()), + optional(2, "c2", Types.StringType.get()), + optional(3, "c3", Types.StringType.get())); private static final int SHUFFLE_PARTITIONS = 2; private static final PartitionSpec SPEC = PartitionSpec.builderFor(SCHEMA).identity("c1").build(); - static final DataFile FILE_A = DataFiles.builder(SPEC) - .withPath("/path/to/data-a.parquet") - .withFileSizeInBytes(10) - .withPartition(TestHelpers.Row.of(0)) - .withRecordCount(1) - .build(); - static final DataFile FILE_B = DataFiles.builder(SPEC) - .withPath("/path/to/data-b.parquet") - .withFileSizeInBytes(10) - .withPartition(TestHelpers.Row.of(1)) - .withRecordCount(1) - .build(); - static final DataFile FILE_C = DataFiles.builder(SPEC) - .withPath("/path/to/data-c.parquet") - .withFileSizeInBytes(10) - .withPartition(TestHelpers.Row.of(2)) - .withRecordCount(1) - .build(); - static final DataFile FILE_D = DataFiles.builder(SPEC) - .withPath("/path/to/data-d.parquet") - .withFileSizeInBytes(10) - .withPartition(TestHelpers.Row.of(3)) - .withRecordCount(1) - .build(); - - @Rule - public TemporaryFolder temp = new TemporaryFolder(); + static final DataFile FILE_A = + DataFiles.builder(SPEC) + .withPath("/path/to/data-a.parquet") + .withFileSizeInBytes(10) + .withPartition(TestHelpers.Row.of(0)) + .withRecordCount(1) + .build(); + static final DataFile FILE_B = + DataFiles.builder(SPEC) + .withPath("/path/to/data-b.parquet") + .withFileSizeInBytes(10) + .withPartition(TestHelpers.Row.of(1)) + .withRecordCount(1) + .build(); + static final DataFile FILE_C = + DataFiles.builder(SPEC) + .withPath("/path/to/data-c.parquet") + .withFileSizeInBytes(10) + .withPartition(TestHelpers.Row.of(2)) + .withRecordCount(1) + .build(); + static final DataFile FILE_D = + DataFiles.builder(SPEC) + .withPath("/path/to/data-d.parquet") + .withFileSizeInBytes(10) + .withPartition(TestHelpers.Row.of(3)) + .withRecordCount(1) + .build(); + + @Rule public TemporaryFolder temp = new TemporaryFolder(); private Table table; @@ -104,62 +106,76 @@ public void setupTableLocation() throws Exception { spark.conf().set("spark.sql.shuffle.partitions", SHUFFLE_PARTITIONS); } - private void checkRemoveFilesResults(long expectedDatafiles, long expectedManifestsDeleted, - long expectedManifestListsDeleted, long expectedOtherFilesDeleted, - DeleteReachableFiles.Result results) { - Assert.assertEquals("Incorrect number of manifest files deleted", - expectedManifestsDeleted, results.deletedManifestsCount()); - Assert.assertEquals("Incorrect number of datafiles deleted", - expectedDatafiles, results.deletedDataFilesCount()); - Assert.assertEquals("Incorrect number of manifest lists deleted", - expectedManifestListsDeleted, results.deletedManifestListsCount()); - Assert.assertEquals("Incorrect number of 
other lists deleted", - expectedOtherFilesDeleted, results.deletedOtherFilesCount()); + private void checkRemoveFilesResults( + long expectedDatafiles, + long expectedManifestsDeleted, + long expectedManifestListsDeleted, + long expectedOtherFilesDeleted, + DeleteReachableFiles.Result results) { + Assert.assertEquals( + "Incorrect number of manifest files deleted", + expectedManifestsDeleted, + results.deletedManifestsCount()); + Assert.assertEquals( + "Incorrect number of datafiles deleted", + expectedDatafiles, + results.deletedDataFilesCount()); + Assert.assertEquals( + "Incorrect number of manifest lists deleted", + expectedManifestListsDeleted, + results.deletedManifestListsCount()); + Assert.assertEquals( + "Incorrect number of other lists deleted", + expectedOtherFilesDeleted, + results.deletedOtherFilesCount()); } @Test public void dataFilesCleanupWithParallelTasks() { - table.newFastAppend() - .appendFile(FILE_A) - .commit(); + table.newFastAppend().appendFile(FILE_A).commit(); - table.newFastAppend() - .appendFile(FILE_B) - .commit(); + table.newFastAppend().appendFile(FILE_B).commit(); - table.newRewrite() - .rewriteFiles(ImmutableSet.of(FILE_B), ImmutableSet.of(FILE_D)) - .commit(); + table.newRewrite().rewriteFiles(ImmutableSet.of(FILE_B), ImmutableSet.of(FILE_D)).commit(); - table.newRewrite() - .rewriteFiles(ImmutableSet.of(FILE_A), ImmutableSet.of(FILE_C)) - .commit(); + table.newRewrite().rewriteFiles(ImmutableSet.of(FILE_A), ImmutableSet.of(FILE_C)).commit(); Set deletedFiles = ConcurrentHashMap.newKeySet(); Set deleteThreads = ConcurrentHashMap.newKeySet(); AtomicInteger deleteThreadsIndex = new AtomicInteger(0); - DeleteReachableFiles.Result result = sparkActions().deleteReachableFiles(metadataLocation(table)) - .io(table.io()) - .executeDeleteWith(Executors.newFixedThreadPool(4, runnable -> { - Thread thread = new Thread(runnable); - thread.setName("remove-files-" + deleteThreadsIndex.getAndIncrement()); - thread.setDaemon(true); // daemon threads will be terminated abruptly when the JVM exits - return thread; - })) - .deleteWith(s -> { - deleteThreads.add(Thread.currentThread().getName()); - deletedFiles.add(s); - }) - .execute(); - - // Verifies that the delete methods ran in the threads created by the provided ExecutorService ThreadFactory - Assert.assertEquals(deleteThreads, + DeleteReachableFiles.Result result = + sparkActions() + .deleteReachableFiles(metadataLocation(table)) + .io(table.io()) + .executeDeleteWith( + Executors.newFixedThreadPool( + 4, + runnable -> { + Thread thread = new Thread(runnable); + thread.setName("remove-files-" + deleteThreadsIndex.getAndIncrement()); + thread.setDaemon( + true); // daemon threads will be terminated abruptly when the JVM exits + return thread; + })) + .deleteWith( + s -> { + deleteThreads.add(Thread.currentThread().getName()); + deletedFiles.add(s); + }) + .execute(); + + // Verifies that the delete methods ran in the threads created by the provided ExecutorService + // ThreadFactory + Assert.assertEquals( + deleteThreads, Sets.newHashSet("remove-files-0", "remove-files-1", "remove-files-2", "remove-files-3")); - Lists.newArrayList(FILE_A, FILE_B, FILE_C, FILE_D).forEach(file -> - Assert.assertTrue("FILE_A should be deleted", deletedFiles.contains(FILE_A.path().toString())) - ); + Lists.newArrayList(FILE_A, FILE_B, FILE_C, FILE_D) + .forEach( + file -> + Assert.assertTrue( + "FILE_A should be deleted", deletedFiles.contains(FILE_A.path().toString()))); checkRemoveFilesResults(4L, 6L, 4L, 6, result); } @@ -167,64 
+183,43 @@ public void dataFilesCleanupWithParallelTasks() { public void testWithExpiringDanglingStageCommit() { table.location(); // `A` commit - table.newAppend() - .appendFile(FILE_A) - .commit(); + table.newAppend().appendFile(FILE_A).commit(); // `B` staged commit - table.newAppend() - .appendFile(FILE_B) - .stageOnly() - .commit(); + table.newAppend().appendFile(FILE_B).stageOnly().commit(); // `C` commit - table.newAppend() - .appendFile(FILE_C) - .commit(); + table.newAppend().appendFile(FILE_C).commit(); - DeleteReachableFiles.Result result = sparkActions().deleteReachableFiles(metadataLocation(table)) - .io(table.io()) - .execute(); + DeleteReachableFiles.Result result = + sparkActions().deleteReachableFiles(metadataLocation(table)).io(table.io()).execute(); checkRemoveFilesResults(3L, 3L, 3L, 5, result); } @Test public void testRemoveFileActionOnEmptyTable() { - DeleteReachableFiles.Result result = sparkActions().deleteReachableFiles(metadataLocation(table)) - .io(table.io()) - .execute(); + DeleteReachableFiles.Result result = + sparkActions().deleteReachableFiles(metadataLocation(table)).io(table.io()).execute(); checkRemoveFilesResults(0, 0, 0, 2, result); } @Test public void testRemoveFilesActionWithReducedVersionsTable() { - table.updateProperties() - .set(TableProperties.METADATA_PREVIOUS_VERSIONS_MAX, "2").commit(); - table.newAppend() - .appendFile(FILE_A) - .commit(); - - table.newAppend() - .appendFile(FILE_B) - .commit(); - - table.newAppend() - .appendFile(FILE_B) - .commit(); - - table.newAppend() - .appendFile(FILE_C) - .commit(); - - table.newAppend() - .appendFile(FILE_D) - .commit(); - - DeleteReachableFiles baseRemoveFilesSparkAction = sparkActions() - .deleteReachableFiles(metadataLocation(table)) - .io(table.io()); + table.updateProperties().set(TableProperties.METADATA_PREVIOUS_VERSIONS_MAX, "2").commit(); + table.newAppend().appendFile(FILE_A).commit(); + + table.newAppend().appendFile(FILE_B).commit(); + + table.newAppend().appendFile(FILE_B).commit(); + + table.newAppend().appendFile(FILE_C).commit(); + + table.newAppend().appendFile(FILE_D).commit(); + + DeleteReachableFiles baseRemoveFilesSparkAction = + sparkActions().deleteReachableFiles(metadataLocation(table)).io(table.io()); DeleteReachableFiles.Result result = baseRemoveFilesSparkAction.execute(); checkRemoveFilesResults(4, 5, 5, 8, result); @@ -232,57 +227,44 @@ public void testRemoveFilesActionWithReducedVersionsTable() { @Test public void testRemoveFilesAction() { - table.newAppend() - .appendFile(FILE_A) - .commit(); - - table.newAppend() - .appendFile(FILE_B) - .commit(); - - DeleteReachableFiles baseRemoveFilesSparkAction = sparkActions() - .deleteReachableFiles(metadataLocation(table)) - .io(table.io()); - checkRemoveFilesResults(2, 2, 2, 4, baseRemoveFilesSparkAction.execute()); + table.newAppend().appendFile(FILE_A).commit(); + + table.newAppend().appendFile(FILE_B).commit(); + + DeleteReachableFiles baseRemoveFilesSparkAction = + sparkActions().deleteReachableFiles(metadataLocation(table)).io(table.io()); + checkRemoveFilesResults(2, 2, 2, 4, baseRemoveFilesSparkAction.execute()); } @Test public void testRemoveFilesActionWithDefaultIO() { - table.newAppend() - .appendFile(FILE_A) - .commit(); + table.newAppend().appendFile(FILE_A).commit(); - table.newAppend() - .appendFile(FILE_B) - .commit(); + table.newAppend().appendFile(FILE_B).commit(); // IO not set explicitly on removeReachableFiles action // IO defaults to HadoopFileIO - DeleteReachableFiles baseRemoveFilesSparkAction = 
sparkActions() - .deleteReachableFiles(metadataLocation(table)); - checkRemoveFilesResults(2, 2, 2, 4, baseRemoveFilesSparkAction.execute()); + DeleteReachableFiles baseRemoveFilesSparkAction = + sparkActions().deleteReachableFiles(metadataLocation(table)); + checkRemoveFilesResults(2, 2, 2, 4, baseRemoveFilesSparkAction.execute()); } @Test public void testUseLocalIterator() { - table.newFastAppend() - .appendFile(FILE_A) - .commit(); + table.newFastAppend().appendFile(FILE_A).commit(); - table.newOverwrite() - .deleteFile(FILE_A) - .addFile(FILE_B) - .commit(); + table.newOverwrite().deleteFile(FILE_A).addFile(FILE_B).commit(); - table.newFastAppend() - .appendFile(FILE_C) - .commit(); + table.newFastAppend().appendFile(FILE_C).commit(); int jobsBefore = spark.sparkContext().dagScheduler().nextJobId().get(); - DeleteReachableFiles.Result results = sparkActions().deleteReachableFiles(metadataLocation(table)) - .io(table.io()) - .option("stream-results", "true").execute(); + DeleteReachableFiles.Result results = + sparkActions() + .deleteReachableFiles(metadataLocation(table)) + .io(table.io()) + .option("stream-results", "true") + .execute(); int jobsAfter = spark.sparkContext().dagScheduler().nextJobId().get(); int totalJobsRun = jobsAfter - jobsBefore; @@ -290,52 +272,52 @@ public void testUseLocalIterator() { checkRemoveFilesResults(3L, 4L, 3L, 5, results); Assert.assertEquals( - "Expected total jobs to be equal to total number of shuffle partitions", totalJobsRun, SHUFFLE_PARTITIONS); + "Expected total jobs to be equal to total number of shuffle partitions", + totalJobsRun, + SHUFFLE_PARTITIONS); } @Test public void testIgnoreMetadataFilesNotFound() { - table.updateProperties() - .set(TableProperties.METADATA_PREVIOUS_VERSIONS_MAX, "1").commit(); + table.updateProperties().set(TableProperties.METADATA_PREVIOUS_VERSIONS_MAX, "1").commit(); - table.newAppend() - .appendFile(FILE_A) - .commit(); + table.newAppend().appendFile(FILE_A).commit(); // There are three metadata json files at this point DeleteOrphanFiles.Result result = sparkActions().deleteOrphanFiles(table).olderThan(System.currentTimeMillis()).execute(); Assert.assertEquals("Should delete 1 file", 1, Iterables.size(result.orphanFileLocations())); - Assert.assertTrue("Should remove v1 file", + Assert.assertTrue( + "Should remove v1 file", StreamSupport.stream(result.orphanFileLocations().spliterator(), false) .anyMatch(file -> file.contains("v1.metadata.json"))); - DeleteReachableFiles baseRemoveFilesSparkAction = sparkActions() - .deleteReachableFiles(metadataLocation(table)) - .io(table.io()); + DeleteReachableFiles baseRemoveFilesSparkAction = + sparkActions().deleteReachableFiles(metadataLocation(table)).io(table.io()); DeleteReachableFiles.Result res = baseRemoveFilesSparkAction.execute(); - checkRemoveFilesResults(1, 1, 1, 4, res); + checkRemoveFilesResults(1, 1, 1, 4, res); } @Test public void testEmptyIOThrowsException() { - DeleteReachableFiles baseRemoveFilesSparkAction = sparkActions() - .deleteReachableFiles(metadataLocation(table)) - .io(null); - AssertHelpers.assertThrows("FileIO needs to be set to use RemoveFiles action", - IllegalArgumentException.class, "File IO cannot be null", + DeleteReachableFiles baseRemoveFilesSparkAction = + sparkActions().deleteReachableFiles(metadataLocation(table)).io(null); + AssertHelpers.assertThrows( + "FileIO needs to be set to use RemoveFiles action", + IllegalArgumentException.class, + "File IO cannot be null", baseRemoveFilesSparkAction::execute); } @Test public void 
testRemoveFilesActionWhenGarbageCollectionDisabled() { - table.updateProperties() - .set(TableProperties.GC_ENABLED, "false") - .commit(); + table.updateProperties().set(TableProperties.GC_ENABLED, "false").commit(); - AssertHelpers.assertThrows("Should complain about removing files when GC is disabled", - ValidationException.class, "Cannot remove files: GC is disabled (deleting files may corrupt other tables)", + AssertHelpers.assertThrows( + "Should complain about removing files when GC is disabled", + ValidationException.class, + "Cannot remove files: GC is disabled (deleting files may corrupt other tables)", () -> sparkActions().deleteReachableFiles(metadataLocation(table))); } diff --git a/spark/v3.0/spark/src/test/java/org/apache/iceberg/spark/actions/TestExpireSnapshotsAction.java b/spark/v3.0/spark/src/test/java/org/apache/iceberg/spark/actions/TestExpireSnapshotsAction.java index d411abdb8e26..faa2e2d1b80a 100644 --- a/spark/v3.0/spark/src/test/java/org/apache/iceberg/spark/actions/TestExpireSnapshotsAction.java +++ b/spark/v3.0/spark/src/test/java/org/apache/iceberg/spark/actions/TestExpireSnapshotsAction.java @@ -16,9 +16,10 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.actions; +import static org.apache.iceberg.types.Types.NestedField.optional; + import java.io.File; import java.io.IOException; import java.util.List; @@ -57,46 +58,47 @@ import org.junit.Test; import org.junit.rules.TemporaryFolder; -import static org.apache.iceberg.types.Types.NestedField.optional; - public class TestExpireSnapshotsAction extends SparkTestBase { private static final HadoopTables TABLES = new HadoopTables(new Configuration()); - private static final Schema SCHEMA = new Schema( - optional(1, "c1", Types.IntegerType.get()), - optional(2, "c2", Types.StringType.get()), - optional(3, "c3", Types.StringType.get()) - ); + private static final Schema SCHEMA = + new Schema( + optional(1, "c1", Types.IntegerType.get()), + optional(2, "c2", Types.StringType.get()), + optional(3, "c3", Types.StringType.get())); private static final int SHUFFLE_PARTITIONS = 2; private static final PartitionSpec SPEC = PartitionSpec.builderFor(SCHEMA).identity("c1").build(); - static final DataFile FILE_A = DataFiles.builder(SPEC) - .withPath("/path/to/data-a.parquet") - .withFileSizeInBytes(10) - .withPartitionPath("c1=0") // easy way to set partition data for now - .withRecordCount(1) - .build(); - static final DataFile FILE_B = DataFiles.builder(SPEC) - .withPath("/path/to/data-b.parquet") - .withFileSizeInBytes(10) - .withPartitionPath("c1=1") // easy way to set partition data for now - .withRecordCount(1) - .build(); - static final DataFile FILE_C = DataFiles.builder(SPEC) - .withPath("/path/to/data-c.parquet") - .withFileSizeInBytes(10) - .withPartitionPath("c1=2") // easy way to set partition data for now - .withRecordCount(1) - .build(); - static final DataFile FILE_D = DataFiles.builder(SPEC) - .withPath("/path/to/data-d.parquet") - .withFileSizeInBytes(10) - .withPartitionPath("c1=3") // easy way to set partition data for now - .withRecordCount(1) - .build(); - - @Rule - public TemporaryFolder temp = new TemporaryFolder(); + static final DataFile FILE_A = + DataFiles.builder(SPEC) + .withPath("/path/to/data-a.parquet") + .withFileSizeInBytes(10) + .withPartitionPath("c1=0") // easy way to set partition data for now + .withRecordCount(1) + .build(); + static final DataFile FILE_B = + DataFiles.builder(SPEC) + 
.withPath("/path/to/data-b.parquet") + .withFileSizeInBytes(10) + .withPartitionPath("c1=1") // easy way to set partition data for now + .withRecordCount(1) + .build(); + static final DataFile FILE_C = + DataFiles.builder(SPEC) + .withPath("/path/to/data-c.parquet") + .withFileSizeInBytes(10) + .withPartitionPath("c1=2") // easy way to set partition data for now + .withRecordCount(1) + .build(); + static final DataFile FILE_D = + DataFiles.builder(SPEC) + .withPath("/path/to/data-d.parquet") + .withFileSizeInBytes(10) + .withPartitionPath("c1=3") // easy way to set partition data for now + .withRecordCount(1) + .build(); + + @Rule public TemporaryFolder temp = new TemporaryFolder(); private File tableDir; private String tableLocation; @@ -122,37 +124,41 @@ private Long rightAfterSnapshot(long snapshotId) { return end; } - private void checkExpirationResults(long expectedDatafiles, long expectedManifestsDeleted, - long expectedManifestListsDeleted, ExpireSnapshots.Result results) { - - Assert.assertEquals("Incorrect number of manifest files deleted", - expectedManifestsDeleted, results.deletedManifestsCount()); - Assert.assertEquals("Incorrect number of datafiles deleted", - expectedDatafiles, results.deletedDataFilesCount()); - Assert.assertEquals("Incorrect number of manifest lists deleted", - expectedManifestListsDeleted, results.deletedManifestListsCount()); + private void checkExpirationResults( + long expectedDatafiles, + long expectedManifestsDeleted, + long expectedManifestListsDeleted, + ExpireSnapshots.Result results) { + + Assert.assertEquals( + "Incorrect number of manifest files deleted", + expectedManifestsDeleted, + results.deletedManifestsCount()); + Assert.assertEquals( + "Incorrect number of datafiles deleted", + expectedDatafiles, + results.deletedDataFilesCount()); + Assert.assertEquals( + "Incorrect number of manifest lists deleted", + expectedManifestListsDeleted, + results.deletedManifestListsCount()); } @Test public void testFilesCleaned() throws Exception { - table.newFastAppend() - .appendFile(FILE_A) - .commit(); + table.newFastAppend().appendFile(FILE_A).commit(); - table.newOverwrite() - .deleteFile(FILE_A) - .addFile(FILE_B) - .commit(); + table.newOverwrite().deleteFile(FILE_A).addFile(FILE_B).commit(); - table.newFastAppend() - .appendFile(FILE_C) - .commit(); + table.newFastAppend().appendFile(FILE_C).commit(); long end = rightAfterSnapshot(); - ExpireSnapshots.Result results = SparkActions.get().expireSnapshots(table).expireOlderThan(end).execute(); + ExpireSnapshots.Result results = + SparkActions.get().expireSnapshots(table).expireOlderThan(end).execute(); - Assert.assertEquals("Table does not have 1 snapshot after expiration", 1, Iterables.size(table.snapshots())); + Assert.assertEquals( + "Table does not have 1 snapshot after expiration", 1, Iterables.size(table.snapshots())); checkExpirationResults(1L, 1L, 2L, results); } @@ -160,21 +166,13 @@ public void testFilesCleaned() throws Exception { @Test public void dataFilesCleanupWithParallelTasks() throws IOException { - table.newFastAppend() - .appendFile(FILE_A) - .commit(); + table.newFastAppend().appendFile(FILE_A).commit(); - table.newFastAppend() - .appendFile(FILE_B) - .commit(); + table.newFastAppend().appendFile(FILE_B).commit(); - table.newRewrite() - .rewriteFiles(ImmutableSet.of(FILE_B), ImmutableSet.of(FILE_D)) - .commit(); + table.newRewrite().rewriteFiles(ImmutableSet.of(FILE_B), ImmutableSet.of(FILE_D)).commit(); - table.newRewrite() - .rewriteFiles(ImmutableSet.of(FILE_A), 
ImmutableSet.of(FILE_C)) - .commit(); + table.newRewrite().rewriteFiles(ImmutableSet.of(FILE_A), ImmutableSet.of(FILE_C)).commit(); long t4 = rightAfterSnapshot(); @@ -182,23 +180,33 @@ public void dataFilesCleanupWithParallelTasks() throws IOException { Set deleteThreads = ConcurrentHashMap.newKeySet(); AtomicInteger deleteThreadsIndex = new AtomicInteger(0); - ExpireSnapshots.Result result = SparkActions.get().expireSnapshots(table) - .executeDeleteWith(Executors.newFixedThreadPool(4, runnable -> { - Thread thread = new Thread(runnable); - thread.setName("remove-snapshot-" + deleteThreadsIndex.getAndIncrement()); - thread.setDaemon(true); // daemon threads will be terminated abruptly when the JVM exits - return thread; - })) - .expireOlderThan(t4) - .deleteWith(s -> { - deleteThreads.add(Thread.currentThread().getName()); - deletedFiles.add(s); - }) - .execute(); - - // Verifies that the delete methods ran in the threads created by the provided ExecutorService ThreadFactory - Assert.assertEquals(deleteThreads, - Sets.newHashSet("remove-snapshot-0", "remove-snapshot-1", "remove-snapshot-2", "remove-snapshot-3")); + ExpireSnapshots.Result result = + SparkActions.get() + .expireSnapshots(table) + .executeDeleteWith( + Executors.newFixedThreadPool( + 4, + runnable -> { + Thread thread = new Thread(runnable); + thread.setName("remove-snapshot-" + deleteThreadsIndex.getAndIncrement()); + thread.setDaemon( + true); // daemon threads will be terminated abruptly when the JVM exits + return thread; + })) + .expireOlderThan(t4) + .deleteWith( + s -> { + deleteThreads.add(Thread.currentThread().getName()); + deletedFiles.add(s); + }) + .execute(); + + // Verifies that the delete methods ran in the threads created by the provided ExecutorService + // ThreadFactory + Assert.assertEquals( + deleteThreads, + Sets.newHashSet( + "remove-snapshot-0", "remove-snapshot-1", "remove-snapshot-2", "remove-snapshot-3")); Assert.assertTrue("FILE_A should be deleted", deletedFiles.contains(FILE_A.path().toString())); Assert.assertTrue("FILE_B should be deleted", deletedFiles.contains(FILE_B.path().toString())); @@ -208,9 +216,7 @@ public void dataFilesCleanupWithParallelTasks() throws IOException { @Test public void testNoFilesDeletedWhenNoSnapshotsExpired() throws Exception { - table.newFastAppend() - .appendFile(FILE_A) - .commit(); + table.newFastAppend().appendFile(FILE_A).commit(); ExpireSnapshots.Result results = SparkActions.get().expireSnapshots(table).execute(); checkExpirationResults(0L, 0L, 0L, results); @@ -218,30 +224,24 @@ public void testNoFilesDeletedWhenNoSnapshotsExpired() throws Exception { @Test public void testCleanupRepeatedOverwrites() throws Exception { - table.newFastAppend() - .appendFile(FILE_A) - .commit(); + table.newFastAppend().appendFile(FILE_A).commit(); for (int i = 0; i < 10; i++) { - table.newOverwrite() - .deleteFile(FILE_A) - .addFile(FILE_B) - .commit(); - - table.newOverwrite() - .deleteFile(FILE_B) - .addFile(FILE_A) - .commit(); + table.newOverwrite().deleteFile(FILE_A).addFile(FILE_B).commit(); + + table.newOverwrite().deleteFile(FILE_B).addFile(FILE_A).commit(); } long end = rightAfterSnapshot(); - ExpireSnapshots.Result results = SparkActions.get().expireSnapshots(table).expireOlderThan(end).execute(); + ExpireSnapshots.Result results = + SparkActions.get().expireSnapshots(table).expireOlderThan(end).execute(); checkExpirationResults(1L, 39L, 20L, results); } @Test public void testRetainLastWithExpireOlderThan() { - table.newAppend() + table + .newAppend() 
.appendFile(FILE_A) // data_bucket=0 .commit(); long firstSnapshotId = table.currentSnapshot().snapshotId(); @@ -250,217 +250,256 @@ public void testRetainLastWithExpireOlderThan() { t1 = System.currentTimeMillis(); } - table.newAppend() + table + .newAppend() .appendFile(FILE_B) // data_bucket=1 .commit(); - table.newAppend() + table + .newAppend() .appendFile(FILE_C) // data_bucket=2 .commit(); long t3 = rightAfterSnapshot(); // Retain last 2 snapshots - SparkActions.get().expireSnapshots(table) - .expireOlderThan(t3) - .retainLast(2) - .execute(); + SparkActions.get().expireSnapshots(table).expireOlderThan(t3).retainLast(2).execute(); - Assert.assertEquals("Should have two snapshots.", 2, Lists.newArrayList(table.snapshots()).size()); - Assert.assertEquals("First snapshot should not present.", null, table.snapshot(firstSnapshotId)); + Assert.assertEquals( + "Should have two snapshots.", 2, Lists.newArrayList(table.snapshots()).size()); + Assert.assertEquals( + "First snapshot should not present.", null, table.snapshot(firstSnapshotId)); } @Test public void testExpireTwoSnapshotsById() throws Exception { - table.newAppend() + table + .newAppend() .appendFile(FILE_A) // data_bucket=0 .commit(); long firstSnapshotId = table.currentSnapshot().snapshotId(); - table.newAppend() + table + .newAppend() .appendFile(FILE_B) // data_bucket=1 .commit(); long secondSnapshotID = table.currentSnapshot().snapshotId(); - table.newAppend() + table + .newAppend() .appendFile(FILE_C) // data_bucket=2 .commit(); // Retain last 2 snapshots - ExpireSnapshots.Result result = SparkActions.get().expireSnapshots(table) - .expireSnapshotId(firstSnapshotId) - .expireSnapshotId(secondSnapshotID) - .execute(); - - Assert.assertEquals("Should have one snapshots.", 1, Lists.newArrayList(table.snapshots()).size()); - Assert.assertEquals("First snapshot should not present.", null, table.snapshot(firstSnapshotId)); - Assert.assertEquals("Second snapshot should not be present.", null, table.snapshot(secondSnapshotID)); + ExpireSnapshots.Result result = + SparkActions.get() + .expireSnapshots(table) + .expireSnapshotId(firstSnapshotId) + .expireSnapshotId(secondSnapshotID) + .execute(); + + Assert.assertEquals( + "Should have one snapshots.", 1, Lists.newArrayList(table.snapshots()).size()); + Assert.assertEquals( + "First snapshot should not present.", null, table.snapshot(firstSnapshotId)); + Assert.assertEquals( + "Second snapshot should not be present.", null, table.snapshot(secondSnapshotID)); checkExpirationResults(0L, 0L, 2L, result); } @Test public void testRetainLastWithExpireById() { - table.newAppend() + table + .newAppend() .appendFile(FILE_A) // data_bucket=0 .commit(); long firstSnapshotId = table.currentSnapshot().snapshotId(); - table.newAppend() + table + .newAppend() .appendFile(FILE_B) // data_bucket=1 .commit(); - table.newAppend() + table + .newAppend() .appendFile(FILE_C) // data_bucket=2 .commit(); // Retain last 3 snapshots, but explicitly remove the first snapshot - ExpireSnapshots.Result result = SparkActions.get().expireSnapshots(table) - .expireSnapshotId(firstSnapshotId) - .retainLast(3) - .execute(); - - Assert.assertEquals("Should have two snapshots.", 2, Lists.newArrayList(table.snapshots()).size()); - Assert.assertEquals("First snapshot should not present.", null, table.snapshot(firstSnapshotId)); + ExpireSnapshots.Result result = + SparkActions.get() + .expireSnapshots(table) + .expireSnapshotId(firstSnapshotId) + .retainLast(3) + .execute(); + + Assert.assertEquals( + "Should have two 
snapshots.", 2, Lists.newArrayList(table.snapshots()).size()); + Assert.assertEquals( + "First snapshot should not present.", null, table.snapshot(firstSnapshotId)); checkExpirationResults(0L, 0L, 1L, result); } @Test public void testRetainLastWithTooFewSnapshots() { - table.newAppend() + table + .newAppend() .appendFile(FILE_A) // data_bucket=0 .appendFile(FILE_B) // data_bucket=1 .commit(); long firstSnapshotId = table.currentSnapshot().snapshotId(); - table.newAppend() + table + .newAppend() .appendFile(FILE_C) // data_bucket=2 .commit(); long t2 = rightAfterSnapshot(); // Retain last 3 snapshots - ExpireSnapshots.Result result = SparkActions.get().expireSnapshots(table) - .expireOlderThan(t2) - .retainLast(3) - .execute(); - - Assert.assertEquals("Should have two snapshots", 2, Lists.newArrayList(table.snapshots()).size()); - Assert.assertEquals("First snapshot should still present", - firstSnapshotId, table.snapshot(firstSnapshotId).snapshotId()); + ExpireSnapshots.Result result = + SparkActions.get().expireSnapshots(table).expireOlderThan(t2).retainLast(3).execute(); + + Assert.assertEquals( + "Should have two snapshots", 2, Lists.newArrayList(table.snapshots()).size()); + Assert.assertEquals( + "First snapshot should still present", + firstSnapshotId, + table.snapshot(firstSnapshotId).snapshotId()); checkExpirationResults(0L, 0L, 0L, result); } @Test public void testRetainLastKeepsExpiringSnapshot() { - table.newAppend() + table + .newAppend() .appendFile(FILE_A) // data_bucket=0 .commit(); - table.newAppend() + table + .newAppend() .appendFile(FILE_B) // data_bucket=1 .commit(); Snapshot secondSnapshot = table.currentSnapshot(); - table.newAppend() + table + .newAppend() .appendFile(FILE_C) // data_bucket=2 .commit(); - table.newAppend() + table + .newAppend() .appendFile(FILE_D) // data_bucket=3 .commit(); // Retain last 2 snapshots and expire older than t3 - ExpireSnapshots.Result result = SparkActions.get().expireSnapshots(table) - .expireOlderThan(secondSnapshot.timestampMillis()) - .retainLast(2) - .execute(); - - Assert.assertEquals("Should have three snapshots.", 3, Lists.newArrayList(table.snapshots()).size()); - Assert.assertNotNull("Second snapshot should present.", table.snapshot(secondSnapshot.snapshotId())); + ExpireSnapshots.Result result = + SparkActions.get() + .expireSnapshots(table) + .expireOlderThan(secondSnapshot.timestampMillis()) + .retainLast(2) + .execute(); + + Assert.assertEquals( + "Should have three snapshots.", 3, Lists.newArrayList(table.snapshots()).size()); + Assert.assertNotNull( + "Second snapshot should present.", table.snapshot(secondSnapshot.snapshotId())); checkExpirationResults(0L, 0L, 1L, result); } @Test public void testExpireSnapshotsWithDisabledGarbageCollection() { - table.updateProperties() - .set(TableProperties.GC_ENABLED, "false") - .commit(); + table.updateProperties().set(TableProperties.GC_ENABLED, "false").commit(); - table.newAppend() - .appendFile(FILE_A) - .commit(); + table.newAppend().appendFile(FILE_A).commit(); - AssertHelpers.assertThrows("Should complain about expiring snapshots", - ValidationException.class, "Cannot expire snapshots: GC is disabled", + AssertHelpers.assertThrows( + "Should complain about expiring snapshots", + ValidationException.class, + "Cannot expire snapshots: GC is disabled", () -> SparkActions.get().expireSnapshots(table)); } @Test public void testExpireOlderThanMultipleCalls() { - table.newAppend() + table + .newAppend() .appendFile(FILE_A) // data_bucket=0 .commit(); - table.newAppend() + table 
+ .newAppend() .appendFile(FILE_B) // data_bucket=1 .commit(); Snapshot secondSnapshot = table.currentSnapshot(); - table.newAppend() + table + .newAppend() .appendFile(FILE_C) // data_bucket=2 .commit(); Snapshot thirdSnapshot = table.currentSnapshot(); // Retain last 2 snapshots and expire older than t3 - ExpireSnapshots.Result result = SparkActions.get().expireSnapshots(table) - .expireOlderThan(secondSnapshot.timestampMillis()) - .expireOlderThan(thirdSnapshot.timestampMillis()) - .execute(); - - Assert.assertEquals("Should have one snapshots.", 1, Lists.newArrayList(table.snapshots()).size()); - Assert.assertNull("Second snapshot should not present.", table.snapshot(secondSnapshot.snapshotId())); + ExpireSnapshots.Result result = + SparkActions.get() + .expireSnapshots(table) + .expireOlderThan(secondSnapshot.timestampMillis()) + .expireOlderThan(thirdSnapshot.timestampMillis()) + .execute(); + + Assert.assertEquals( + "Should have one snapshots.", 1, Lists.newArrayList(table.snapshots()).size()); + Assert.assertNull( + "Second snapshot should not present.", table.snapshot(secondSnapshot.snapshotId())); checkExpirationResults(0L, 0L, 2L, result); } @Test public void testRetainLastMultipleCalls() { - table.newAppend() + table + .newAppend() .appendFile(FILE_A) // data_bucket=0 .commit(); - table.newAppend() + table + .newAppend() .appendFile(FILE_B) // data_bucket=1 .commit(); Snapshot secondSnapshot = table.currentSnapshot(); - table.newAppend() + table + .newAppend() .appendFile(FILE_C) // data_bucket=2 .commit(); long t3 = rightAfterSnapshot(); // Retain last 2 snapshots and expire older than t3 - ExpireSnapshots.Result result = SparkActions.get().expireSnapshots(table) - .expireOlderThan(t3) - .retainLast(2) - .retainLast(1) - .execute(); - - Assert.assertEquals("Should have one snapshots.", 1, Lists.newArrayList(table.snapshots()).size()); - Assert.assertNull("Second snapshot should not present.", table.snapshot(secondSnapshot.snapshotId())); + ExpireSnapshots.Result result = + SparkActions.get() + .expireSnapshots(table) + .expireOlderThan(t3) + .retainLast(2) + .retainLast(1) + .execute(); + + Assert.assertEquals( + "Should have one snapshots.", 1, Lists.newArrayList(table.snapshots()).size()); + Assert.assertNull( + "Second snapshot should not present.", table.snapshot(secondSnapshot.snapshotId())); checkExpirationResults(0L, 0L, 2L, result); } @Test public void testRetainZeroSnapshots() { - AssertHelpers.assertThrows("Should fail retain 0 snapshots " + - "because number of snapshots to retain cannot be zero", + AssertHelpers.assertThrows( + "Should fail retain 0 snapshots " + "because number of snapshots to retain cannot be zero", IllegalArgumentException.class, "Number of snapshots to retain must be at least 1, cannot be: 0", () -> SparkActions.get().expireSnapshots(table).retainLast(0).execute()); @@ -468,28 +507,22 @@ public void testRetainZeroSnapshots() { @Test public void testScanExpiredManifestInValidSnapshotAppend() { - table.newAppend() - .appendFile(FILE_A) - .appendFile(FILE_B) - .commit(); + table.newAppend().appendFile(FILE_A).appendFile(FILE_B).commit(); - table.newOverwrite() - .addFile(FILE_C) - .deleteFile(FILE_A) - .commit(); + table.newOverwrite().addFile(FILE_C).deleteFile(FILE_A).commit(); - table.newAppend() - .appendFile(FILE_D) - .commit(); + table.newAppend().appendFile(FILE_D).commit(); long t3 = rightAfterSnapshot(); Set deletedFiles = Sets.newHashSet(); - ExpireSnapshots.Result result = SparkActions.get().expireSnapshots(table) - 
.expireOlderThan(t3) - .deleteWith(deletedFiles::add) - .execute(); + ExpireSnapshots.Result result = + SparkActions.get() + .expireSnapshots(table) + .expireOlderThan(t3) + .deleteWith(deletedFiles::add) + .execute(); Assert.assertTrue("FILE_A should be deleted", deletedFiles.contains(FILE_A.path().toString())); checkExpirationResults(1L, 1L, 2L, result); @@ -497,72 +530,61 @@ public void testScanExpiredManifestInValidSnapshotAppend() { @Test public void testScanExpiredManifestInValidSnapshotFastAppend() { - table.updateProperties() + table + .updateProperties() .set(TableProperties.MANIFEST_MERGE_ENABLED, "true") .set(TableProperties.MANIFEST_MIN_MERGE_COUNT, "1") .commit(); - table.newAppend() - .appendFile(FILE_A) - .appendFile(FILE_B) - .commit(); + table.newAppend().appendFile(FILE_A).appendFile(FILE_B).commit(); - table.newOverwrite() - .addFile(FILE_C) - .deleteFile(FILE_A) - .commit(); + table.newOverwrite().addFile(FILE_C).deleteFile(FILE_A).commit(); - table.newFastAppend() - .appendFile(FILE_D) - .commit(); + table.newFastAppend().appendFile(FILE_D).commit(); long t3 = rightAfterSnapshot(); Set deletedFiles = Sets.newHashSet(); - ExpireSnapshots.Result result = SparkActions.get().expireSnapshots(table) - .expireOlderThan(t3) - .deleteWith(deletedFiles::add) - .execute(); + ExpireSnapshots.Result result = + SparkActions.get() + .expireSnapshots(table) + .expireOlderThan(t3) + .deleteWith(deletedFiles::add) + .execute(); Assert.assertTrue("FILE_A should be deleted", deletedFiles.contains(FILE_A.path().toString())); checkExpirationResults(1L, 1L, 2L, result); } /** - * Test on table below, and expiring the staged commit `B` using `expireOlderThan` API. - * Table: A - C - * ` B (staged) + * Test on table below, and expiring the staged commit `B` using `expireOlderThan` API. Table: A - + * C ` B (staged) */ @Test public void testWithExpiringDanglingStageCommit() { // `A` commit - table.newAppend() - .appendFile(FILE_A) - .commit(); + table.newAppend().appendFile(FILE_A).commit(); // `B` staged commit - table.newAppend() - .appendFile(FILE_B) - .stageOnly() - .commit(); + table.newAppend().appendFile(FILE_B).stageOnly().commit(); TableMetadata base = ((BaseTable) table).operations().current(); Snapshot snapshotA = base.snapshots().get(0); Snapshot snapshotB = base.snapshots().get(1); // `C` commit - table.newAppend() - .appendFile(FILE_C) - .commit(); + table.newAppend().appendFile(FILE_C).commit(); Set deletedFiles = Sets.newHashSet(); // Expire all commits including dangling staged snapshot. - ExpireSnapshots.Result result = SparkActions.get().expireSnapshots(table) - .deleteWith(deletedFiles::add) - .expireOlderThan(snapshotB.timestampMillis() + 1) - .execute(); + ExpireSnapshots.Result result = + SparkActions.get() + .expireSnapshots(table) + .deleteWith(deletedFiles::add) + .expireOlderThan(snapshotB.timestampMillis() + 1) + .execute(); checkExpirationResults(1L, 1L, 2L, result); @@ -570,122 +592,107 @@ public void testWithExpiringDanglingStageCommit() { expectedDeletes.add(snapshotA.manifestListLocation()); // Files should be deleted of dangling staged snapshot - snapshotB.addedDataFiles(table.io()).forEach(i -> { - expectedDeletes.add(i.path().toString()); - }); + snapshotB + .addedDataFiles(table.io()) + .forEach( + i -> { + expectedDeletes.add(i.path().toString()); + }); // ManifestList should be deleted too expectedDeletes.add(snapshotB.manifestListLocation()); - snapshotB.dataManifests(table.io()).forEach(file -> { - // Only the manifest of B should be deleted. 
- if (file.snapshotId() == snapshotB.snapshotId()) { - expectedDeletes.add(file.path()); - } - }); - Assert.assertSame("Files deleted count should be expected", expectedDeletes.size(), deletedFiles.size()); + snapshotB + .dataManifests(table.io()) + .forEach( + file -> { + // Only the manifest of B should be deleted. + if (file.snapshotId() == snapshotB.snapshotId()) { + expectedDeletes.add(file.path()); + } + }); + Assert.assertSame( + "Files deleted count should be expected", expectedDeletes.size(), deletedFiles.size()); // Take the diff expectedDeletes.removeAll(deletedFiles); Assert.assertTrue("Exactly same files should be deleted", expectedDeletes.isEmpty()); } /** - * Expire cherry-pick the commit as shown below, when `B` is in table's current state - * Table: - * A - B - C <--current snapshot - * `- D (source=B) + * Expire cherry-pick the commit as shown below, when `B` is in table's current state Table: A - B + * - C <--current snapshot `- D (source=B) */ @Test public void testWithCherryPickTableSnapshot() { // `A` commit - table.newAppend() - .appendFile(FILE_A) - .commit(); + table.newAppend().appendFile(FILE_A).commit(); Snapshot snapshotA = table.currentSnapshot(); // `B` commit Set deletedAFiles = Sets.newHashSet(); - table.newOverwrite() - .addFile(FILE_B) - .deleteFile(FILE_A) - .deleteWith(deletedAFiles::add) - .commit(); + table.newOverwrite().addFile(FILE_B).deleteFile(FILE_A).deleteWith(deletedAFiles::add).commit(); Assert.assertTrue("No files should be physically deleted", deletedAFiles.isEmpty()); // pick the snapshot 'B` Snapshot snapshotB = table.currentSnapshot(); // `C` commit to let cherry-pick take effect, and avoid fast-forward of `B` with cherry-pick - table.newAppend() - .appendFile(FILE_C) - .commit(); + table.newAppend().appendFile(FILE_C).commit(); Snapshot snapshotC = table.currentSnapshot(); // Move the table back to `A` - table.manageSnapshots() - .setCurrentSnapshot(snapshotA.snapshotId()) - .commit(); + table.manageSnapshots().setCurrentSnapshot(snapshotA.snapshotId()).commit(); // Generate A -> `D (B)` - table.manageSnapshots() - .cherrypick(snapshotB.snapshotId()) - .commit(); + table.manageSnapshots().cherrypick(snapshotB.snapshotId()).commit(); Snapshot snapshotD = table.currentSnapshot(); // Move the table back to `C` - table.manageSnapshots() - .setCurrentSnapshot(snapshotC.snapshotId()) - .commit(); + table.manageSnapshots().setCurrentSnapshot(snapshotC.snapshotId()).commit(); List deletedFiles = Lists.newArrayList(); // Expire `C` - ExpireSnapshots.Result result = SparkActions.get().expireSnapshots(table) - .deleteWith(deletedFiles::add) - .expireOlderThan(snapshotC.timestampMillis() + 1) - .execute(); + ExpireSnapshots.Result result = + SparkActions.get() + .expireSnapshots(table) + .deleteWith(deletedFiles::add) + .expireOlderThan(snapshotC.timestampMillis() + 1) + .execute(); // Make sure no dataFiles are deleted for the B, C, D snapshot - Lists.newArrayList(snapshotB, snapshotC, snapshotD).forEach(i -> { - i.addedDataFiles(table.io()).forEach(item -> { - Assert.assertFalse(deletedFiles.contains(item.path().toString())); - }); - }); + Lists.newArrayList(snapshotB, snapshotC, snapshotD) + .forEach( + i -> { + i.addedDataFiles(table.io()) + .forEach( + item -> { + Assert.assertFalse(deletedFiles.contains(item.path().toString())); + }); + }); checkExpirationResults(1L, 2L, 2L, result); } /** - * Test on table below, and expiring `B` which is not in current table state. 
- * 1) Expire `B` - * 2) All commit - * Table: A - C - D (B) - * ` B (staged) + * Test on table below, and expiring `B` which is not in current table state. 1) Expire `B` 2) All + * commit Table: A - C - D (B) ` B (staged) */ @Test public void testWithExpiringStagedThenCherrypick() { // `A` commit - table.newAppend() - .appendFile(FILE_A) - .commit(); + table.newAppend().appendFile(FILE_A).commit(); // `B` commit - table.newAppend() - .appendFile(FILE_B) - .stageOnly() - .commit(); + table.newAppend().appendFile(FILE_B).stageOnly().commit(); // pick the snapshot that's staged but not committed TableMetadata base = ((BaseTable) table).operations().current(); Snapshot snapshotB = base.snapshots().get(1); // `C` commit to let cherry-pick take effect, and avoid fast-forward of `B` with cherry-pick - table.newAppend() - .appendFile(FILE_C) - .commit(); + table.newAppend().appendFile(FILE_C).commit(); // `D (B)` cherry-pick commit - table.manageSnapshots() - .cherrypick(snapshotB.snapshotId()) - .commit(); + table.manageSnapshots().cherrypick(snapshotB.snapshotId()).commit(); base = ((BaseTable) table).operations().current(); Snapshot snapshotD = base.snapshots().get(3); @@ -693,47 +700,55 @@ public void testWithExpiringStagedThenCherrypick() { List deletedFiles = Lists.newArrayList(); // Expire `B` commit. - ExpireSnapshots.Result firstResult = SparkActions.get().expireSnapshots(table) - .deleteWith(deletedFiles::add) - .expireSnapshotId(snapshotB.snapshotId()) - .execute(); + ExpireSnapshots.Result firstResult = + SparkActions.get() + .expireSnapshots(table) + .deleteWith(deletedFiles::add) + .expireSnapshotId(snapshotB.snapshotId()) + .execute(); // Make sure no dataFiles are deleted for the staged snapshot - Lists.newArrayList(snapshotB).forEach(i -> { - i.addedDataFiles(table.io()).forEach(item -> { - Assert.assertFalse(deletedFiles.contains(item.path().toString())); - }); - }); + Lists.newArrayList(snapshotB) + .forEach( + i -> { + i.addedDataFiles(table.io()) + .forEach( + item -> { + Assert.assertFalse(deletedFiles.contains(item.path().toString())); + }); + }); checkExpirationResults(0L, 1L, 1L, firstResult); // Expire all snapshots including cherry-pick - ExpireSnapshots.Result secondResult = SparkActions.get().expireSnapshots(table) - .deleteWith(deletedFiles::add) - .expireOlderThan(table.currentSnapshot().timestampMillis() + 1) - .execute(); + ExpireSnapshots.Result secondResult = + SparkActions.get() + .expireSnapshots(table) + .deleteWith(deletedFiles::add) + .expireOlderThan(table.currentSnapshot().timestampMillis() + 1) + .execute(); // Make sure no dataFiles are deleted for the staged and cherry-pick - Lists.newArrayList(snapshotB, snapshotD).forEach(i -> { - i.addedDataFiles(table.io()).forEach(item -> { - Assert.assertFalse(deletedFiles.contains(item.path().toString())); - }); - }); + Lists.newArrayList(snapshotB, snapshotD) + .forEach( + i -> { + i.addedDataFiles(table.io()) + .forEach( + item -> { + Assert.assertFalse(deletedFiles.contains(item.path().toString())); + }); + }); checkExpirationResults(0L, 0L, 2L, secondResult); } @Test public void testExpireOlderThan() { - table.newAppend() - .appendFile(FILE_A) - .commit(); + table.newAppend().appendFile(FILE_A).commit(); Snapshot firstSnapshot = table.currentSnapshot(); rightAfterSnapshot(); - table.newAppend() - .appendFile(FILE_B) - .commit(); + table.newAppend().appendFile(FILE_B).commit(); long snapshotId = table.currentSnapshot().snapshotId(); @@ -741,42 +756,46 @@ public void testExpireOlderThan() { Set deletedFiles = 
Sets.newHashSet(); - ExpireSnapshots.Result result = SparkActions.get().expireSnapshots(table) - .expireOlderThan(tAfterCommits) - .deleteWith(deletedFiles::add) - .execute(); - - Assert.assertEquals("Expire should not change current snapshot", snapshotId, table.currentSnapshot().snapshotId()); - Assert.assertNull("Expire should remove the oldest snapshot", table.snapshot(firstSnapshot.snapshotId())); - Assert.assertEquals("Should remove only the expired manifest list location", - Sets.newHashSet(firstSnapshot.manifestListLocation()), deletedFiles); + ExpireSnapshots.Result result = + SparkActions.get() + .expireSnapshots(table) + .expireOlderThan(tAfterCommits) + .deleteWith(deletedFiles::add) + .execute(); + + Assert.assertEquals( + "Expire should not change current snapshot", + snapshotId, + table.currentSnapshot().snapshotId()); + Assert.assertNull( + "Expire should remove the oldest snapshot", table.snapshot(firstSnapshot.snapshotId())); + Assert.assertEquals( + "Should remove only the expired manifest list location", + Sets.newHashSet(firstSnapshot.manifestListLocation()), + deletedFiles); checkExpirationResults(0, 0, 1, result); } @Test public void testExpireOlderThanWithDelete() { - table.newAppend() - .appendFile(FILE_A) - .commit(); + table.newAppend().appendFile(FILE_A).commit(); Snapshot firstSnapshot = table.currentSnapshot(); - Assert.assertEquals("Should create one manifest", - 1, firstSnapshot.allManifests(table.io()).size()); + Assert.assertEquals( + "Should create one manifest", 1, firstSnapshot.allManifests(table.io()).size()); rightAfterSnapshot(); - table.newDelete() - .deleteFile(FILE_A) - .commit(); + table.newDelete().deleteFile(FILE_A).commit(); Snapshot secondSnapshot = table.currentSnapshot(); - Assert.assertEquals("Should create replace manifest with a rewritten manifest", - 1, secondSnapshot.allManifests(table.io()).size()); + Assert.assertEquals( + "Should create replace manifest with a rewritten manifest", + 1, + secondSnapshot.allManifests(table.io()).size()); - table.newAppend() - .appendFile(FILE_B) - .commit(); + table.newAppend().appendFile(FILE_B).commit(); rightAfterSnapshot(); @@ -786,21 +805,36 @@ public void testExpireOlderThanWithDelete() { Set deletedFiles = Sets.newHashSet(); - ExpireSnapshots.Result result = SparkActions.get().expireSnapshots(table) - .expireOlderThan(tAfterCommits) - .deleteWith(deletedFiles::add) - .execute(); - - Assert.assertEquals("Expire should not change current snapshot", snapshotId, table.currentSnapshot().snapshotId()); - Assert.assertNull("Expire should remove the oldest snapshot", table.snapshot(firstSnapshot.snapshotId())); - Assert.assertNull("Expire should remove the second oldest snapshot", table.snapshot(secondSnapshot.snapshotId())); - - Assert.assertEquals("Should remove expired manifest lists and deleted data file", + ExpireSnapshots.Result result = + SparkActions.get() + .expireSnapshots(table) + .expireOlderThan(tAfterCommits) + .deleteWith(deletedFiles::add) + .execute(); + + Assert.assertEquals( + "Expire should not change current snapshot", + snapshotId, + table.currentSnapshot().snapshotId()); + Assert.assertNull( + "Expire should remove the oldest snapshot", table.snapshot(firstSnapshot.snapshotId())); + Assert.assertNull( + "Expire should remove the second oldest snapshot", + table.snapshot(secondSnapshot.snapshotId())); + + Assert.assertEquals( + "Should remove expired manifest lists and deleted data file", Sets.newHashSet( firstSnapshot.manifestListLocation(), // snapshot expired - 
firstSnapshot.allManifests(table.io()).get(0).path(), // manifest was rewritten for delete + firstSnapshot + .allManifests(table.io()) + .get(0) + .path(), // manifest was rewritten for delete secondSnapshot.manifestListLocation(), // snapshot expired - secondSnapshot.allManifests(table.io()).get(0).path(), // manifest contained only deletes, was dropped + secondSnapshot + .allManifests(table.io()) + .get(0) + .path(), // manifest contained only deletes, was dropped FILE_A.path()), // deleted deletedFiles); @@ -810,30 +844,29 @@ public void testExpireOlderThanWithDelete() { @Test public void testExpireOlderThanWithDeleteInMergedManifests() { // merge every commit - table.updateProperties() - .set(TableProperties.MANIFEST_MIN_MERGE_COUNT, "0") - .commit(); + table.updateProperties().set(TableProperties.MANIFEST_MIN_MERGE_COUNT, "0").commit(); - table.newAppend() - .appendFile(FILE_A) - .appendFile(FILE_B) - .commit(); + table.newAppend().appendFile(FILE_A).appendFile(FILE_B).commit(); Snapshot firstSnapshot = table.currentSnapshot(); - Assert.assertEquals("Should create one manifest", - 1, firstSnapshot.allManifests(table.io()).size()); + Assert.assertEquals( + "Should create one manifest", 1, firstSnapshot.allManifests(table.io()).size()); rightAfterSnapshot(); - table.newDelete() + table + .newDelete() .deleteFile(FILE_A) // FILE_B is still in the dataset .commit(); Snapshot secondSnapshot = table.currentSnapshot(); - Assert.assertEquals("Should replace manifest with a rewritten manifest", - 1, secondSnapshot.allManifests(table.io()).size()); + Assert.assertEquals( + "Should replace manifest with a rewritten manifest", + 1, + secondSnapshot.allManifests(table.io()).size()); - table.newFastAppend() // do not merge to keep the last snapshot's manifest valid + table + .newFastAppend() // do not merge to keep the last snapshot's manifest valid .appendFile(FILE_C) .commit(); @@ -845,19 +878,31 @@ public void testExpireOlderThanWithDeleteInMergedManifests() { Set deletedFiles = Sets.newHashSet(); - ExpireSnapshots.Result result = SparkActions.get().expireSnapshots(table) - .expireOlderThan(tAfterCommits) - .deleteWith(deletedFiles::add) - .execute(); - - Assert.assertEquals("Expire should not change current snapshot", snapshotId, table.currentSnapshot().snapshotId()); - Assert.assertNull("Expire should remove the oldest snapshot", table.snapshot(firstSnapshot.snapshotId())); - Assert.assertNull("Expire should remove the second oldest snapshot", table.snapshot(secondSnapshot.snapshotId())); - - Assert.assertEquals("Should remove expired manifest lists and deleted data file", + ExpireSnapshots.Result result = + SparkActions.get() + .expireSnapshots(table) + .expireOlderThan(tAfterCommits) + .deleteWith(deletedFiles::add) + .execute(); + + Assert.assertEquals( + "Expire should not change current snapshot", + snapshotId, + table.currentSnapshot().snapshotId()); + Assert.assertNull( + "Expire should remove the oldest snapshot", table.snapshot(firstSnapshot.snapshotId())); + Assert.assertNull( + "Expire should remove the second oldest snapshot", + table.snapshot(secondSnapshot.snapshotId())); + + Assert.assertEquals( + "Should remove expired manifest lists and deleted data file", Sets.newHashSet( firstSnapshot.manifestListLocation(), // snapshot expired - firstSnapshot.allManifests(table.io()).get(0).path(), // manifest was rewritten for delete + firstSnapshot + .allManifests(table.io()) + .get(0) + .path(), // manifest was rewritten for delete secondSnapshot.manifestListLocation(), // snapshot 
expired FILE_A.path()), // deleted deletedFiles); @@ -868,33 +913,26 @@ public void testExpireOlderThanWithDeleteInMergedManifests() { @Test public void testExpireOlderThanWithRollback() { // merge every commit - table.updateProperties() - .set(TableProperties.MANIFEST_MIN_MERGE_COUNT, "0") - .commit(); + table.updateProperties().set(TableProperties.MANIFEST_MIN_MERGE_COUNT, "0").commit(); - table.newAppend() - .appendFile(FILE_A) - .appendFile(FILE_B) - .commit(); + table.newAppend().appendFile(FILE_A).appendFile(FILE_B).commit(); Snapshot firstSnapshot = table.currentSnapshot(); - Assert.assertEquals("Should create one manifest", - 1, firstSnapshot.allManifests(table.io()).size()); + Assert.assertEquals( + "Should create one manifest", 1, firstSnapshot.allManifests(table.io()).size()); rightAfterSnapshot(); - table.newDelete() - .deleteFile(FILE_B) - .commit(); + table.newDelete().deleteFile(FILE_B).commit(); Snapshot secondSnapshot = table.currentSnapshot(); - Set secondSnapshotManifests = Sets.newHashSet(secondSnapshot.allManifests(table.io())); + Set secondSnapshotManifests = + Sets.newHashSet(secondSnapshot.allManifests(table.io())); secondSnapshotManifests.removeAll(firstSnapshot.allManifests(table.io())); - Assert.assertEquals("Should add one new manifest for append", 1, secondSnapshotManifests.size()); + Assert.assertEquals( + "Should add one new manifest for append", 1, secondSnapshotManifests.size()); - table.manageSnapshots() - .rollbackTo(firstSnapshot.snapshotId()) - .commit(); + table.manageSnapshots().rollbackTo(firstSnapshot.snapshotId()).commit(); long tAfterCommits = rightAfterSnapshot(secondSnapshot.snapshotId()); @@ -902,19 +940,29 @@ public void testExpireOlderThanWithRollback() { Set deletedFiles = Sets.newHashSet(); - ExpireSnapshots.Result result = SparkActions.get().expireSnapshots(table) - .expireOlderThan(tAfterCommits) - .deleteWith(deletedFiles::add) - .execute(); - - Assert.assertEquals("Expire should not change current snapshot", snapshotId, table.currentSnapshot().snapshotId()); - Assert.assertNotNull("Expire should keep the oldest snapshot, current", table.snapshot(firstSnapshot.snapshotId())); - Assert.assertNull("Expire should remove the orphaned snapshot", table.snapshot(secondSnapshot.snapshotId())); - - Assert.assertEquals("Should remove expired manifest lists and reverted appended data file", + ExpireSnapshots.Result result = + SparkActions.get() + .expireSnapshots(table) + .expireOlderThan(tAfterCommits) + .deleteWith(deletedFiles::add) + .execute(); + + Assert.assertEquals( + "Expire should not change current snapshot", + snapshotId, + table.currentSnapshot().snapshotId()); + Assert.assertNotNull( + "Expire should keep the oldest snapshot, current", + table.snapshot(firstSnapshot.snapshotId())); + Assert.assertNull( + "Expire should remove the orphaned snapshot", table.snapshot(secondSnapshot.snapshotId())); + + Assert.assertEquals( + "Should remove expired manifest lists and reverted appended data file", Sets.newHashSet( secondSnapshot.manifestListLocation(), // snapshot expired - Iterables.getOnlyElement(secondSnapshotManifests).path()), // manifest is no longer referenced + Iterables.getOnlyElement(secondSnapshotManifests) + .path()), // manifest is no longer referenced deletedFiles); checkExpirationResults(0, 1, 1, result); @@ -922,28 +970,24 @@ public void testExpireOlderThanWithRollback() { @Test public void testExpireOlderThanWithRollbackAndMergedManifests() { - table.newAppend() - .appendFile(FILE_A) - .commit(); + 
table.newAppend().appendFile(FILE_A).commit(); Snapshot firstSnapshot = table.currentSnapshot(); - Assert.assertEquals("Should create one manifest", - 1, firstSnapshot.allManifests(table.io()).size()); + Assert.assertEquals( + "Should create one manifest", 1, firstSnapshot.allManifests(table.io()).size()); rightAfterSnapshot(); - table.newAppend() - .appendFile(FILE_B) - .commit(); + table.newAppend().appendFile(FILE_B).commit(); Snapshot secondSnapshot = table.currentSnapshot(); - Set secondSnapshotManifests = Sets.newHashSet(secondSnapshot.allManifests(table.io())); + Set secondSnapshotManifests = + Sets.newHashSet(secondSnapshot.allManifests(table.io())); secondSnapshotManifests.removeAll(firstSnapshot.allManifests(table.io())); - Assert.assertEquals("Should add one new manifest for append", 1, secondSnapshotManifests.size()); + Assert.assertEquals( + "Should add one new manifest for append", 1, secondSnapshotManifests.size()); - table.manageSnapshots() - .rollbackTo(firstSnapshot.snapshotId()) - .commit(); + table.manageSnapshots().rollbackTo(firstSnapshot.snapshotId()).commit(); long tAfterCommits = rightAfterSnapshot(secondSnapshot.snapshotId()); @@ -951,19 +995,29 @@ public void testExpireOlderThanWithRollbackAndMergedManifests() { Set deletedFiles = Sets.newHashSet(); - ExpireSnapshots.Result result = SparkActions.get().expireSnapshots(table) - .expireOlderThan(tAfterCommits) - .deleteWith(deletedFiles::add) - .execute(); - - Assert.assertEquals("Expire should not change current snapshot", snapshotId, table.currentSnapshot().snapshotId()); - Assert.assertNotNull("Expire should keep the oldest snapshot, current", table.snapshot(firstSnapshot.snapshotId())); - Assert.assertNull("Expire should remove the orphaned snapshot", table.snapshot(secondSnapshot.snapshotId())); - - Assert.assertEquals("Should remove expired manifest lists and reverted appended data file", + ExpireSnapshots.Result result = + SparkActions.get() + .expireSnapshots(table) + .expireOlderThan(tAfterCommits) + .deleteWith(deletedFiles::add) + .execute(); + + Assert.assertEquals( + "Expire should not change current snapshot", + snapshotId, + table.currentSnapshot().snapshotId()); + Assert.assertNotNull( + "Expire should keep the oldest snapshot, current", + table.snapshot(firstSnapshot.snapshotId())); + Assert.assertNull( + "Expire should remove the orphaned snapshot", table.snapshot(secondSnapshot.snapshotId())); + + Assert.assertEquals( + "Should remove expired manifest lists and reverted appended data file", Sets.newHashSet( secondSnapshot.manifestListLocation(), // snapshot expired - Iterables.getOnlyElement(secondSnapshotManifests).path(), // manifest is no longer referenced + Iterables.getOnlyElement(secondSnapshotManifests) + .path(), // manifest is no longer referenced FILE_B.path()), // added, but rolled back deletedFiles); @@ -975,27 +1029,25 @@ public void testExpireOnEmptyTable() { Set deletedFiles = Sets.newHashSet(); // table has no data, testing ExpireSnapshots should not fail with no snapshot - ExpireSnapshots.Result result = SparkActions.get().expireSnapshots(table) - .expireOlderThan(System.currentTimeMillis()) - .deleteWith(deletedFiles::add) - .execute(); + ExpireSnapshots.Result result = + SparkActions.get() + .expireSnapshots(table) + .expireOlderThan(System.currentTimeMillis()) + .deleteWith(deletedFiles::add) + .execute(); checkExpirationResults(0, 0, 0, result); } @Test public void testExpireAction() { - table.newAppend() - .appendFile(FILE_A) - .commit(); + 
table.newAppend().appendFile(FILE_A).commit(); Snapshot firstSnapshot = table.currentSnapshot(); rightAfterSnapshot(); - table.newAppend() - .appendFile(FILE_B) - .commit(); + table.newAppend().appendFile(FILE_B).commit(); long snapshotId = table.currentSnapshot().snapshotId(); @@ -1003,58 +1055,68 @@ public void testExpireAction() { Set deletedFiles = Sets.newHashSet(); - BaseExpireSnapshotsSparkAction action = (BaseExpireSnapshotsSparkAction) SparkActions.get().expireSnapshots(table) - .expireOlderThan(tAfterCommits) - .deleteWith(deletedFiles::add); + BaseExpireSnapshotsSparkAction action = + (BaseExpireSnapshotsSparkAction) + SparkActions.get() + .expireSnapshots(table) + .expireOlderThan(tAfterCommits) + .deleteWith(deletedFiles::add); Dataset pendingDeletes = action.expire(); List pending = pendingDeletes.collectAsList(); - Assert.assertEquals("Should not change current snapshot", snapshotId, table.currentSnapshot().snapshotId()); - Assert.assertNull("Should remove the oldest snapshot", table.snapshot(firstSnapshot.snapshotId())); + Assert.assertEquals( + "Should not change current snapshot", snapshotId, table.currentSnapshot().snapshotId()); + Assert.assertNull( + "Should remove the oldest snapshot", table.snapshot(firstSnapshot.snapshotId())); Assert.assertEquals("Pending deletes should contain one row", 1, pending.size()); - Assert.assertEquals("Pending delete should be the expired manifest list location", - firstSnapshot.manifestListLocation(), pending.get(0).getString(0)); - Assert.assertEquals("Pending delete should be a manifest list", - "Manifest List", pending.get(0).getString(1)); + Assert.assertEquals( + "Pending delete should be the expired manifest list location", + firstSnapshot.manifestListLocation(), + pending.get(0).getString(0)); + Assert.assertEquals( + "Pending delete should be a manifest list", "Manifest List", pending.get(0).getString(1)); Assert.assertEquals("Should not delete any files", 0, deletedFiles.size()); - Assert.assertSame("Multiple calls to expire should return the same deleted files", - pendingDeletes, action.expire()); + Assert.assertSame( + "Multiple calls to expire should return the same deleted files", + pendingDeletes, + action.expire()); } @Test public void testUseLocalIterator() { - table.newFastAppend() - .appendFile(FILE_A) - .commit(); + table.newFastAppend().appendFile(FILE_A).commit(); - table.newOverwrite() - .deleteFile(FILE_A) - .addFile(FILE_B) - .commit(); + table.newOverwrite().deleteFile(FILE_A).addFile(FILE_B).commit(); - table.newFastAppend() - .appendFile(FILE_C) - .commit(); + table.newFastAppend().appendFile(FILE_C).commit(); long end = rightAfterSnapshot(); int jobsBeforeStreamResults = spark.sparkContext().dagScheduler().nextJobId().get(); - withSQLConf(ImmutableMap.of("spark.sql.adaptive.enabled", "false"), () -> { - ExpireSnapshots.Result results = SparkActions.get().expireSnapshots(table).expireOlderThan(end) - .option("stream-results", "true").execute(); - - int jobsAfterStreamResults = spark.sparkContext().dagScheduler().nextJobId().get(); - int jobsRunDuringStreamResults = jobsAfterStreamResults - jobsBeforeStreamResults; - - checkExpirationResults(1L, 1L, 2L, results); - - Assert.assertEquals("Expected total number of jobs with stream-results should match the expected number", - 5L, jobsRunDuringStreamResults); - }); + withSQLConf( + ImmutableMap.of("spark.sql.adaptive.enabled", "false"), + () -> { + ExpireSnapshots.Result results = + SparkActions.get() + .expireSnapshots(table) + .expireOlderThan(end) + 
.option("stream-results", "true") + .execute(); + + int jobsAfterStreamResults = spark.sparkContext().dagScheduler().nextJobId().get(); + int jobsRunDuringStreamResults = jobsAfterStreamResults - jobsBeforeStreamResults; + + checkExpirationResults(1L, 1L, 2L, results); + + Assert.assertEquals( + "Expected total number of jobs with stream-results should match the expected number", + 5L, + jobsRunDuringStreamResults); + }); } } diff --git a/spark/v3.0/spark/src/test/java/org/apache/iceberg/spark/actions/TestRemoveOrphanFilesAction.java b/spark/v3.0/spark/src/test/java/org/apache/iceberg/spark/actions/TestRemoveOrphanFilesAction.java index 17fb891a89c8..70f119c45aaa 100644 --- a/spark/v3.0/spark/src/test/java/org/apache/iceberg/spark/actions/TestRemoveOrphanFilesAction.java +++ b/spark/v3.0/spark/src/test/java/org/apache/iceberg/spark/actions/TestRemoveOrphanFilesAction.java @@ -16,9 +16,10 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.actions; +import static org.apache.iceberg.types.Types.NestedField.optional; + import java.io.File; import java.io.IOException; import java.util.Arrays; @@ -64,23 +65,18 @@ import org.junit.Test; import org.junit.rules.TemporaryFolder; -import static org.apache.iceberg.types.Types.NestedField.optional; - public abstract class TestRemoveOrphanFilesAction extends SparkTestBase { private static final HadoopTables TABLES = new HadoopTables(new Configuration()); - protected static final Schema SCHEMA = new Schema( - optional(1, "c1", Types.IntegerType.get()), - optional(2, "c2", Types.StringType.get()), - optional(3, "c3", Types.StringType.get()) - ); - protected static final PartitionSpec SPEC = PartitionSpec.builderFor(SCHEMA) - .truncate("c2", 2) - .identity("c3") - .build(); - - @Rule - public TemporaryFolder temp = new TemporaryFolder(); + protected static final Schema SCHEMA = + new Schema( + optional(1, "c1", Types.IntegerType.get()), + optional(2, "c2", Types.StringType.get()), + optional(3, "c3", Types.StringType.get())); + protected static final PartitionSpec SPEC = + PartitionSpec.builderFor(SCHEMA).truncate("c2", 2).identity("c3").build(); + + @Rule public TemporaryFolder temp = new TemporaryFolder(); private File tableDir = null; protected String tableLocation = null; @@ -92,41 +88,37 @@ public void setupTableLocation() throws Exception { @Test public void testDryRun() throws IOException, InterruptedException { - Table table = TABLES.create(SCHEMA, PartitionSpec.unpartitioned(), Maps.newHashMap(), tableLocation); + Table table = + TABLES.create(SCHEMA, PartitionSpec.unpartitioned(), Maps.newHashMap(), tableLocation); - List records = Lists.newArrayList( - new ThreeColumnRecord(1, "AAAAAAAAAA", "AAAA") - ); + List records = + Lists.newArrayList(new ThreeColumnRecord(1, "AAAAAAAAAA", "AAAA")); Dataset df = spark.createDataFrame(records, ThreeColumnRecord.class).coalesce(1); - df.select("c1", "c2", "c3") - .write() - .format("iceberg") - .mode("append") - .save(tableLocation); + df.select("c1", "c2", "c3").write().format("iceberg").mode("append").save(tableLocation); - df.select("c1", "c2", "c3") - .write() - .format("iceberg") - .mode("append") - .save(tableLocation); + df.select("c1", "c2", "c3").write().format("iceberg").mode("append").save(tableLocation); - List validFiles = spark.read().format("iceberg") - .load(tableLocation + "#files") - .select("file_path") - .as(Encoders.STRING()) - .collectAsList(); + List validFiles = + spark + .read() + .format("iceberg") + 
.load(tableLocation + "#files") + .select("file_path") + .as(Encoders.STRING()) + .collectAsList(); Assert.assertEquals("Should be 2 valid files", 2, validFiles.size()); df.write().mode("append").parquet(tableLocation + "/data"); Path dataPath = new Path(tableLocation + "/data"); FileSystem fs = dataPath.getFileSystem(spark.sessionState().newHadoopConf()); - List allFiles = Arrays.stream(fs.listStatus(dataPath, HiddenPathFilter.get())) - .filter(FileStatus::isFile) - .map(file -> file.getPath().toString()) - .collect(Collectors.toList()); + List allFiles = + Arrays.stream(fs.listStatus(dataPath, HiddenPathFilter.get())) + .filter(FileStatus::isFile) + .map(file -> file.getPath().toString()) + .collect(Collectors.toList()); Assert.assertEquals("Should be 3 files", 3, allFiles.size()); List invalidFiles = Lists.newArrayList(allFiles); @@ -138,32 +130,34 @@ public void testDryRun() throws IOException, InterruptedException { SparkActions actions = SparkActions.get(); - DeleteOrphanFiles.Result result1 = actions.deleteOrphanFiles(table) - .deleteWith(s -> { }) - .execute(); - Assert.assertTrue("Default olderThan interval should be safe", Iterables.isEmpty(result1.orphanFileLocations())); + DeleteOrphanFiles.Result result1 = + actions.deleteOrphanFiles(table).deleteWith(s -> {}).execute(); + Assert.assertTrue( + "Default olderThan interval should be safe", + Iterables.isEmpty(result1.orphanFileLocations())); - DeleteOrphanFiles.Result result2 = actions.deleteOrphanFiles(table) - .olderThan(System.currentTimeMillis()) - .deleteWith(s -> { }) - .execute(); + DeleteOrphanFiles.Result result2 = + actions + .deleteOrphanFiles(table) + .olderThan(System.currentTimeMillis()) + .deleteWith(s -> {}) + .execute(); Assert.assertEquals("Action should find 1 file", invalidFiles, result2.orphanFileLocations()); Assert.assertTrue("Invalid file should be present", fs.exists(new Path(invalidFiles.get(0)))); - DeleteOrphanFiles.Result result3 = actions.deleteOrphanFiles(table) - .olderThan(System.currentTimeMillis()) - .execute(); + DeleteOrphanFiles.Result result3 = + actions.deleteOrphanFiles(table).olderThan(System.currentTimeMillis()).execute(); Assert.assertEquals("Action should delete 1 file", invalidFiles, result3.orphanFileLocations()); - Assert.assertFalse("Invalid file should not be present", fs.exists(new Path(invalidFiles.get(0)))); + Assert.assertFalse( + "Invalid file should not be present", fs.exists(new Path(invalidFiles.get(0)))); List expectedRecords = Lists.newArrayList(); expectedRecords.addAll(records); expectedRecords.addAll(records); Dataset resultDF = spark.read().format("iceberg").load(tableLocation); - List actualRecords = resultDF - .as(Encoders.bean(ThreeColumnRecord.class)) - .collectAsList(); + List actualRecords = + resultDF.as(Encoders.bean(ThreeColumnRecord.class)).collectAsList(); Assert.assertEquals("Rows must match", expectedRecords, actualRecords); } @@ -171,36 +165,22 @@ public void testDryRun() throws IOException, InterruptedException { public void testAllValidFilesAreKept() throws IOException, InterruptedException { Table table = TABLES.create(SCHEMA, SPEC, Maps.newHashMap(), tableLocation); - List records1 = Lists.newArrayList( - new ThreeColumnRecord(1, "AAAAAAAAAA", "AAAA") - ); + List records1 = + Lists.newArrayList(new ThreeColumnRecord(1, "AAAAAAAAAA", "AAAA")); Dataset df1 = spark.createDataFrame(records1, ThreeColumnRecord.class).coalesce(1); // original append - df1.select("c1", "c2", "c3") - .write() - .format("iceberg") - .mode("append") - .save(tableLocation); + 
df1.select("c1", "c2", "c3").write().format("iceberg").mode("append").save(tableLocation); - List records2 = Lists.newArrayList( - new ThreeColumnRecord(2, "AAAAAAAAAA", "AAAA") - ); + List records2 = + Lists.newArrayList(new ThreeColumnRecord(2, "AAAAAAAAAA", "AAAA")); Dataset df2 = spark.createDataFrame(records2, ThreeColumnRecord.class).coalesce(1); // dynamic partition overwrite - df2.select("c1", "c2", "c3") - .write() - .format("iceberg") - .mode("overwrite") - .save(tableLocation); + df2.select("c1", "c2", "c3").write().format("iceberg").mode("overwrite").save(tableLocation); // second append - df2.select("c1", "c2", "c3") - .write() - .format("iceberg") - .mode("append") - .save(tableLocation); + df2.select("c1", "c2", "c3").write().format("iceberg").mode("append").save(tableLocation); List snapshots = Lists.newArrayList(table.snapshots()); @@ -223,9 +203,8 @@ public void testAllValidFilesAreKept() throws IOException, InterruptedException SparkActions actions = SparkActions.get(); - DeleteOrphanFiles.Result result = actions.deleteOrphanFiles(table) - .olderThan(System.currentTimeMillis()) - .execute(); + DeleteOrphanFiles.Result result = + actions.deleteOrphanFiles(table).olderThan(System.currentTimeMillis()).execute(); Assert.assertEquals("Should delete 4 files", 4, Iterables.size(result.orphanFileLocations())); @@ -249,36 +228,22 @@ public void testAllValidFilesAreKept() throws IOException, InterruptedException public void orphanedFileRemovedWithParallelTasks() throws InterruptedException, IOException { Table table = TABLES.create(SCHEMA, SPEC, Maps.newHashMap(), tableLocation); - List records1 = Lists.newArrayList( - new ThreeColumnRecord(1, "AAAAAAAAAA", "AAAA") - ); + List records1 = + Lists.newArrayList(new ThreeColumnRecord(1, "AAAAAAAAAA", "AAAA")); Dataset df1 = spark.createDataFrame(records1, ThreeColumnRecord.class).coalesce(1); // original append - df1.select("c1", "c2", "c3") - .write() - .format("iceberg") - .mode("append") - .save(tableLocation); + df1.select("c1", "c2", "c3").write().format("iceberg").mode("append").save(tableLocation); - List records2 = Lists.newArrayList( - new ThreeColumnRecord(2, "AAAAAAAAAA", "AAAA") - ); + List records2 = + Lists.newArrayList(new ThreeColumnRecord(2, "AAAAAAAAAA", "AAAA")); Dataset df2 = spark.createDataFrame(records2, ThreeColumnRecord.class).coalesce(1); // dynamic partition overwrite - df2.select("c1", "c2", "c3") - .write() - .format("iceberg") - .mode("overwrite") - .save(tableLocation); + df2.select("c1", "c2", "c3").write().format("iceberg").mode("overwrite").save(tableLocation); // second append - df2.select("c1", "c2", "c3") - .write() - .format("iceberg") - .mode("append") - .save(tableLocation); + df2.select("c1", "c2", "c3").write().format("iceberg").mode("append").save(tableLocation); df2.coalesce(1).write().mode("append").parquet(tableLocation + "/data"); df2.coalesce(1).write().mode("append").parquet(tableLocation + "/data/c2_trunc=AA"); @@ -292,25 +257,34 @@ public void orphanedFileRemovedWithParallelTasks() throws InterruptedException, Set deleteThreads = ConcurrentHashMap.newKeySet(); AtomicInteger deleteThreadsIndex = new AtomicInteger(0); - ExecutorService executorService = Executors.newFixedThreadPool(4, runnable -> { - Thread thread = new Thread(runnable); - thread.setName("remove-orphan-" + deleteThreadsIndex.getAndIncrement()); - thread.setDaemon(true); - return thread; - }); - - DeleteOrphanFiles.Result result = SparkActions.get().deleteOrphanFiles(table) + ExecutorService executorService = + 
Executors.newFixedThreadPool( + 4, + runnable -> { + Thread thread = new Thread(runnable); + thread.setName("remove-orphan-" + deleteThreadsIndex.getAndIncrement()); + thread.setDaemon(true); + return thread; + }); + + DeleteOrphanFiles.Result result = + SparkActions.get() + .deleteOrphanFiles(table) .executeDeleteWith(executorService) .olderThan(System.currentTimeMillis()) - .deleteWith(file -> { - deleteThreads.add(Thread.currentThread().getName()); - deletedFiles.add(file); - }) + .deleteWith( + file -> { + deleteThreads.add(Thread.currentThread().getName()); + deletedFiles.add(file); + }) .execute(); - // Verifies that the delete methods ran in the threads created by the provided ExecutorService ThreadFactory - Assert.assertEquals(deleteThreads, - Sets.newHashSet("remove-orphan-0", "remove-orphan-1", "remove-orphan-2", "remove-orphan-3")); + // Verifies that the delete methods ran in the threads created by the provided ExecutorService + // ThreadFactory + Assert.assertEquals( + deleteThreads, + Sets.newHashSet( + "remove-orphan-0", "remove-orphan-1", "remove-orphan-2", "remove-orphan-3")); Assert.assertEquals("Should delete 4 files", 4, deletedFiles.size()); } @@ -321,31 +295,21 @@ public void testWapFilesAreKept() throws InterruptedException { props.put(TableProperties.WRITE_AUDIT_PUBLISH_ENABLED, "true"); Table table = TABLES.create(SCHEMA, SPEC, props, tableLocation); - List records = Lists.newArrayList( - new ThreeColumnRecord(1, "AAAAAAAAAA", "AAAA") - ); + List records = + Lists.newArrayList(new ThreeColumnRecord(1, "AAAAAAAAAA", "AAAA")); Dataset df = spark.createDataFrame(records, ThreeColumnRecord.class); // normal write - df.select("c1", "c2", "c3") - .write() - .format("iceberg") - .mode("append") - .save(tableLocation); + df.select("c1", "c2", "c3").write().format("iceberg").mode("append").save(tableLocation); spark.conf().set("spark.wap.id", "1"); // wap write - df.select("c1", "c2", "c3") - .write() - .format("iceberg") - .mode("append") - .save(tableLocation); + df.select("c1", "c2", "c3").write().format("iceberg").mode("append").save(tableLocation); Dataset resultDF = spark.read().format("iceberg").load(tableLocation); - List actualRecords = resultDF - .as(Encoders.bean(ThreeColumnRecord.class)) - .collectAsList(); + List actualRecords = + resultDF.as(Encoders.bean(ThreeColumnRecord.class)).collectAsList(); Assert.assertEquals("Should not return data from the staged snapshot", records, actualRecords); // sleep for 1 second to unsure files will be old enough @@ -353,11 +317,11 @@ public void testWapFilesAreKept() throws InterruptedException { SparkActions actions = SparkActions.get(); - DeleteOrphanFiles.Result result = actions.deleteOrphanFiles(table) - .olderThan(System.currentTimeMillis()) - .execute(); + DeleteOrphanFiles.Result result = + actions.deleteOrphanFiles(table).olderThan(System.currentTimeMillis()).execute(); - Assert.assertTrue("Should not delete any files", Iterables.isEmpty(result.orphanFileLocations())); + Assert.assertTrue( + "Should not delete any files", Iterables.isEmpty(result.orphanFileLocations())); } @Test @@ -367,16 +331,11 @@ public void testMetadataFolderIsIntact() throws InterruptedException { props.put(TableProperties.WRITE_DATA_LOCATION, tableLocation); Table table = TABLES.create(SCHEMA, SPEC, props, tableLocation); - List records = Lists.newArrayList( - new ThreeColumnRecord(1, "AAAAAAAAAA", "AAAA") - ); + List records = + Lists.newArrayList(new ThreeColumnRecord(1, "AAAAAAAAAA", "AAAA")); Dataset df = spark.createDataFrame(records, 
ThreeColumnRecord.class).coalesce(1); - df.select("c1", "c2", "c3") - .write() - .format("iceberg") - .mode("append") - .save(tableLocation); + df.select("c1", "c2", "c3").write().format("iceberg").mode("append").save(tableLocation); df.write().mode("append").parquet(tableLocation + "/c2_trunc=AA/c3=AAAA"); @@ -385,16 +344,14 @@ public void testMetadataFolderIsIntact() throws InterruptedException { SparkActions actions = SparkActions.get(); - DeleteOrphanFiles.Result result = actions.deleteOrphanFiles(table) - .olderThan(System.currentTimeMillis()) - .execute(); + DeleteOrphanFiles.Result result = + actions.deleteOrphanFiles(table).olderThan(System.currentTimeMillis()).execute(); Assert.assertEquals("Should delete 1 file", 1, Iterables.size(result.orphanFileLocations())); Dataset resultDF = spark.read().format("iceberg").load(tableLocation); - List actualRecords = resultDF - .as(Encoders.bean(ThreeColumnRecord.class)) - .collectAsList(); + List actualRecords = + resultDF.as(Encoders.bean(ThreeColumnRecord.class)).collectAsList(); Assert.assertEquals("Rows must match", records, actualRecords); } @@ -402,16 +359,11 @@ public void testMetadataFolderIsIntact() throws InterruptedException { public void testOlderThanTimestamp() throws InterruptedException { Table table = TABLES.create(SCHEMA, SPEC, Maps.newHashMap(), tableLocation); - List records = Lists.newArrayList( - new ThreeColumnRecord(1, "AAAAAAAAAA", "AAAA") - ); + List records = + Lists.newArrayList(new ThreeColumnRecord(1, "AAAAAAAAAA", "AAAA")); Dataset df = spark.createDataFrame(records, ThreeColumnRecord.class).coalesce(1); - df.select("c1", "c2", "c3") - .write() - .format("iceberg") - .mode("append") - .save(tableLocation); + df.select("c1", "c2", "c3").write().format("iceberg").mode("append").save(tableLocation); df.write().mode("append").parquet(tableLocation + "/data/c2_trunc=AA/c3=AAAA"); df.write().mode("append").parquet(tableLocation + "/data/c2_trunc=AA/c3=AAAA"); @@ -426,11 +378,11 @@ public void testOlderThanTimestamp() throws InterruptedException { SparkActions actions = SparkActions.get(); - DeleteOrphanFiles.Result result = actions.deleteOrphanFiles(table) - .olderThan(timestamp) - .execute(); + DeleteOrphanFiles.Result result = + actions.deleteOrphanFiles(table).olderThan(timestamp).execute(); - Assert.assertEquals("Should delete only 2 files", 2, Iterables.size(result.orphanFileLocations())); + Assert.assertEquals( + "Should delete only 2 files", 2, Iterables.size(result.orphanFileLocations())); } @Test @@ -440,34 +392,26 @@ public void testRemoveUnreachableMetadataVersionFiles() throws InterruptedExcept props.put(TableProperties.METADATA_PREVIOUS_VERSIONS_MAX, "1"); Table table = TABLES.create(SCHEMA, SPEC, props, tableLocation); - List records = Lists.newArrayList( - new ThreeColumnRecord(1, "AAAAAAAAAA", "AAAA") - ); + List records = + Lists.newArrayList(new ThreeColumnRecord(1, "AAAAAAAAAA", "AAAA")); Dataset df = spark.createDataFrame(records, ThreeColumnRecord.class); - df.select("c1", "c2", "c3") - .write() - .format("iceberg") - .mode("append") - .save(tableLocation); + df.select("c1", "c2", "c3").write().format("iceberg").mode("append").save(tableLocation); - df.select("c1", "c2", "c3") - .write() - .format("iceberg") - .mode("append") - .save(tableLocation); + df.select("c1", "c2", "c3").write().format("iceberg").mode("append").save(tableLocation); // sleep for 1 second to unsure files will be old enough Thread.sleep(1000); SparkActions actions = SparkActions.get(); - DeleteOrphanFiles.Result result = 
actions.deleteOrphanFiles(table) - .olderThan(System.currentTimeMillis()) - .execute(); + DeleteOrphanFiles.Result result = + actions.deleteOrphanFiles(table).olderThan(System.currentTimeMillis()).execute(); Assert.assertEquals("Should delete 1 file", 1, Iterables.size(result.orphanFileLocations())); - Assert.assertTrue("Should remove v1 file", StreamSupport.stream(result.orphanFileLocations().spliterator(), false) + Assert.assertTrue( + "Should remove v1 file", + StreamSupport.stream(result.orphanFileLocations().spliterator(), false) .anyMatch(file -> file.contains("v1.metadata.json"))); List expectedRecords = Lists.newArrayList(); @@ -475,9 +419,8 @@ public void testRemoveUnreachableMetadataVersionFiles() throws InterruptedExcept expectedRecords.addAll(records); Dataset resultDF = spark.read().format("iceberg").load(tableLocation); - List actualRecords = resultDF - .as(Encoders.bean(ThreeColumnRecord.class)) - .collectAsList(); + List actualRecords = + resultDF.as(Encoders.bean(ThreeColumnRecord.class)).collectAsList(); Assert.assertEquals("Rows must match", expectedRecords, actualRecords); } @@ -492,27 +435,22 @@ public void testManyTopLevelPartitions() throws InterruptedException { Dataset df = spark.createDataFrame(records, ThreeColumnRecord.class); - df.select("c1", "c2", "c3") - .write() - .format("iceberg") - .mode("append") - .save(tableLocation); + df.select("c1", "c2", "c3").write().format("iceberg").mode("append").save(tableLocation); // sleep for 1 second to unsure files will be old enough Thread.sleep(1000); SparkActions actions = SparkActions.get(); - DeleteOrphanFiles.Result result = actions.deleteOrphanFiles(table) - .olderThan(System.currentTimeMillis()) - .execute(); + DeleteOrphanFiles.Result result = + actions.deleteOrphanFiles(table).olderThan(System.currentTimeMillis()).execute(); - Assert.assertTrue("Should not delete any files", Iterables.isEmpty(result.orphanFileLocations())); + Assert.assertTrue( + "Should not delete any files", Iterables.isEmpty(result.orphanFileLocations())); Dataset resultDF = spark.read().format("iceberg").load(tableLocation); - List actualRecords = resultDF - .as(Encoders.bean(ThreeColumnRecord.class)) - .collectAsList(); + List actualRecords = + resultDF.as(Encoders.bean(ThreeColumnRecord.class)).collectAsList(); Assert.assertEquals("Rows must match", records, actualRecords); } @@ -527,32 +465,29 @@ public void testManyLeafPartitions() throws InterruptedException { Dataset df = spark.createDataFrame(records, ThreeColumnRecord.class); - df.select("c1", "c2", "c3") - .write() - .format("iceberg") - .mode("append") - .save(tableLocation); + df.select("c1", "c2", "c3").write().format("iceberg").mode("append").save(tableLocation); // sleep for 1 second to unsure files will be old enough Thread.sleep(1000); SparkActions actions = SparkActions.get(); - DeleteOrphanFiles.Result result = actions.deleteOrphanFiles(table) - .olderThan(System.currentTimeMillis()) - .execute(); + DeleteOrphanFiles.Result result = + actions.deleteOrphanFiles(table).olderThan(System.currentTimeMillis()).execute(); - Assert.assertTrue("Should not delete any files", Iterables.isEmpty(result.orphanFileLocations())); + Assert.assertTrue( + "Should not delete any files", Iterables.isEmpty(result.orphanFileLocations())); Dataset resultDF = spark.read().format("iceberg").load(tableLocation); - List actualRecords = resultDF - .as(Encoders.bean(ThreeColumnRecord.class)) - .collectAsList(); + List actualRecords = + 
resultDF.as(Encoders.bean(ThreeColumnRecord.class)).collectAsList(); Assert.assertEquals("Rows must match", records, actualRecords); } private List snapshotFiles(long snapshotId) { - return spark.read().format("iceberg") + return spark + .read() + .format("iceberg") .option("snapshot-id", snapshotId) .load(tableLocation + "#files") .select("file_path") @@ -562,11 +497,12 @@ private List snapshotFiles(long snapshotId) { @Test public void testRemoveOrphanFilesWithRelativeFilePath() throws IOException, InterruptedException { - Table table = TABLES.create(SCHEMA, PartitionSpec.unpartitioned(), Maps.newHashMap(), tableDir.getAbsolutePath()); + Table table = + TABLES.create( + SCHEMA, PartitionSpec.unpartitioned(), Maps.newHashMap(), tableDir.getAbsolutePath()); - List records = Lists.newArrayList( - new ThreeColumnRecord(1, "AAAAAAAAAA", "AAAA") - ); + List records = + Lists.newArrayList(new ThreeColumnRecord(1, "AAAAAAAAAA", "AAAA")); Dataset df = spark.createDataFrame(records, ThreeColumnRecord.class).coalesce(1); @@ -576,11 +512,14 @@ public void testRemoveOrphanFilesWithRelativeFilePath() throws IOException, Inte .mode("append") .save(tableDir.getAbsolutePath()); - List validFiles = spark.read().format("iceberg") - .load(tableLocation + "#files") - .select("file_path") - .as(Encoders.STRING()) - .collectAsList(); + List validFiles = + spark + .read() + .format("iceberg") + .load(tableLocation + "#files") + .select("file_path") + .as(Encoders.STRING()) + .collectAsList(); Assert.assertEquals("Should be 1 valid files", 1, validFiles.size()); String validFile = validFiles.get(0); @@ -588,10 +527,11 @@ public void testRemoveOrphanFilesWithRelativeFilePath() throws IOException, Inte Path dataPath = new Path(tableLocation + "/data"); FileSystem fs = dataPath.getFileSystem(spark.sessionState().newHadoopConf()); - List allFiles = Arrays.stream(fs.listStatus(dataPath, HiddenPathFilter.get())) - .filter(FileStatus::isFile) - .map(file -> file.getPath().toString()) - .collect(Collectors.toList()); + List allFiles = + Arrays.stream(fs.listStatus(dataPath, HiddenPathFilter.get())) + .filter(FileStatus::isFile) + .map(file -> file.getPath().toString()) + .collect(Collectors.toList()); Assert.assertEquals("Should be 2 files", 2, allFiles.size()); List invalidFiles = Lists.newArrayList(allFiles); @@ -602,10 +542,12 @@ public void testRemoveOrphanFilesWithRelativeFilePath() throws IOException, Inte Thread.sleep(1000); SparkActions actions = SparkActions.get(); - DeleteOrphanFiles.Result result = actions.deleteOrphanFiles(table) - .olderThan(System.currentTimeMillis()) - .deleteWith(s -> { }) - .execute(); + DeleteOrphanFiles.Result result = + actions + .deleteOrphanFiles(table) + .olderThan(System.currentTimeMillis()) + .deleteWith(s -> {}) + .execute(); Assert.assertEquals("Action should find 1 file", invalidFiles, result.orphanFileLocations()); Assert.assertTrue("Invalid file should be present", fs.exists(new Path(invalidFiles.get(0)))); } @@ -618,18 +560,15 @@ public void testRemoveOrphanFilesWithHadoopCatalog() throws InterruptedException Namespace namespace = Namespace.of(namespaceName); TableIdentifier tableIdentifier = TableIdentifier.of(namespace, tableName); - Table table = catalog.createTable(tableIdentifier, SCHEMA, PartitionSpec.unpartitioned(), Maps.newHashMap()); + Table table = + catalog.createTable( + tableIdentifier, SCHEMA, PartitionSpec.unpartitioned(), Maps.newHashMap()); - List records = Lists.newArrayList( - new ThreeColumnRecord(1, "AAAAAAAAAA", "AAAA") - ); + List records = + 
Lists.newArrayList(new ThreeColumnRecord(1, "AAAAAAAAAA", "AAAA")); Dataset df = spark.createDataFrame(records, ThreeColumnRecord.class).coalesce(1); - df.select("c1", "c2", "c3") - .write() - .format("iceberg") - .mode("append") - .save(table.location()); + df.select("c1", "c2", "c3").write().format("iceberg").mode("append").save(table.location()); df.write().mode("append").parquet(table.location() + "/data"); @@ -638,28 +577,30 @@ public void testRemoveOrphanFilesWithHadoopCatalog() throws InterruptedException table.refresh(); - DeleteOrphanFiles.Result result = SparkActions.get() - .deleteOrphanFiles(table) - .olderThan(System.currentTimeMillis()) - .execute(); + DeleteOrphanFiles.Result result = + SparkActions.get().deleteOrphanFiles(table).olderThan(System.currentTimeMillis()).execute(); - Assert.assertEquals("Should delete only 1 files", 1, Iterables.size(result.orphanFileLocations())); + Assert.assertEquals( + "Should delete only 1 files", 1, Iterables.size(result.orphanFileLocations())); Dataset resultDF = spark.read().format("iceberg").load(table.location()); - List actualRecords = resultDF - .as(Encoders.bean(ThreeColumnRecord.class)) - .collectAsList(); + List actualRecords = + resultDF.as(Encoders.bean(ThreeColumnRecord.class)).collectAsList(); Assert.assertEquals("Rows must match", records, actualRecords); } @Test public void testHiveCatalogTable() throws IOException { - Table table = catalog.createTable(TableIdentifier.of("default", "hivetestorphan"), SCHEMA, SPEC, tableLocation, - Maps.newHashMap()); + Table table = + catalog.createTable( + TableIdentifier.of("default", "hivetestorphan"), + SCHEMA, + SPEC, + tableLocation, + Maps.newHashMap()); - List records = Lists.newArrayList( - new ThreeColumnRecord(1, "AAAAAAAAAA", "AAAA") - ); + List records = + Lists.newArrayList(new ThreeColumnRecord(1, "AAAAAAAAAA", "AAAA")); Dataset df = spark.createDataFrame(records, ThreeColumnRecord.class).coalesce(1); @@ -672,35 +613,35 @@ public void testHiveCatalogTable() throws IOException { String location = table.location().replaceFirst("file:", ""); new File(location + "/data/trashfile").createNewFile(); - DeleteOrphanFiles.Result result = SparkActions.get().deleteOrphanFiles(table) - .olderThan(System.currentTimeMillis() + 1000).execute(); - Assert.assertTrue("trash file should be removed", + DeleteOrphanFiles.Result result = + SparkActions.get() + .deleteOrphanFiles(table) + .olderThan(System.currentTimeMillis() + 1000) + .execute(); + Assert.assertTrue( + "trash file should be removed", StreamSupport.stream(result.orphanFileLocations().spliterator(), false) .anyMatch(file -> file.contains("file:" + location + "data/trashfile"))); } @Test public void testGarbageCollectionDisabled() { - Table table = TABLES.create(SCHEMA, PartitionSpec.unpartitioned(), Maps.newHashMap(), tableLocation); + Table table = + TABLES.create(SCHEMA, PartitionSpec.unpartitioned(), Maps.newHashMap(), tableLocation); - List records = Lists.newArrayList( - new ThreeColumnRecord(1, "AAAAAAAAAA", "AAAA") - ); + List records = + Lists.newArrayList(new ThreeColumnRecord(1, "AAAAAAAAAA", "AAAA")); Dataset df = spark.createDataFrame(records, ThreeColumnRecord.class).coalesce(1); - df.select("c1", "c2", "c3") - .write() - .format("iceberg") - .mode("append") - .save(tableLocation); + df.select("c1", "c2", "c3").write().format("iceberg").mode("append").save(tableLocation); - table.updateProperties() - .set(TableProperties.GC_ENABLED, "false") - .commit(); + table.updateProperties().set(TableProperties.GC_ENABLED, 
"false").commit(); - AssertHelpers.assertThrows("Should complain about removing orphan files", - ValidationException.class, "Cannot remove orphan files: GC is disabled", + AssertHelpers.assertThrows( + "Should complain about removing orphan files", + ValidationException.class, + "Cannot remove orphan files: GC is disabled", () -> SparkActions.get().deleteOrphanFiles(table).execute()); } } diff --git a/spark/v3.0/spark/src/test/java/org/apache/iceberg/spark/actions/TestRemoveOrphanFilesAction3.java b/spark/v3.0/spark/src/test/java/org/apache/iceberg/spark/actions/TestRemoveOrphanFilesAction3.java index 77eb23a6dffc..e3699eaeded1 100644 --- a/spark/v3.0/spark/src/test/java/org/apache/iceberg/spark/actions/TestRemoveOrphanFilesAction3.java +++ b/spark/v3.0/spark/src/test/java/org/apache/iceberg/spark/actions/TestRemoveOrphanFilesAction3.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.actions; import java.io.File; @@ -54,9 +53,13 @@ public void testSparkCatalogTable() throws Exception { String location = table.table().location().replaceFirst("file:", ""); new File(location + "/data/trashfile").createNewFile(); - DeleteOrphanFiles.Result results = SparkActions.get().deleteOrphanFiles(table.table()) - .olderThan(System.currentTimeMillis() + 1000).execute(); - Assert.assertTrue("trash file should be removed", + DeleteOrphanFiles.Result results = + SparkActions.get() + .deleteOrphanFiles(table.table()) + .olderThan(System.currentTimeMillis() + 1000) + .execute(); + Assert.assertTrue( + "trash file should be removed", StreamSupport.stream(results.orphanFileLocations().spliterator(), false) .anyMatch(file -> file.contains("file:" + location + "/data/trashfile"))); } @@ -80,9 +83,13 @@ public void testSparkCatalogNamedHadoopTable() throws Exception { String location = table.table().location().replaceFirst("file:", ""); new File(location + "/data/trashfile").createNewFile(); - DeleteOrphanFiles.Result results = SparkActions.get().deleteOrphanFiles(table.table()) - .olderThan(System.currentTimeMillis() + 1000).execute(); - Assert.assertTrue("trash file should be removed", + DeleteOrphanFiles.Result results = + SparkActions.get() + .deleteOrphanFiles(table.table()) + .olderThan(System.currentTimeMillis() + 1000) + .execute(); + Assert.assertTrue( + "trash file should be removed", StreamSupport.stream(results.orphanFileLocations().spliterator(), false) .anyMatch(file -> file.contains("file:" + location + "/data/trashfile"))); } @@ -106,19 +113,26 @@ public void testSparkCatalogNamedHiveTable() throws Exception { String location = table.table().location().replaceFirst("file:", ""); new File(location + "/data/trashfile").createNewFile(); - DeleteOrphanFiles.Result results = SparkActions.get().deleteOrphanFiles(table.table()) - .olderThan(System.currentTimeMillis() + 1000).execute(); - Assert.assertTrue("trash file should be removed", + DeleteOrphanFiles.Result results = + SparkActions.get() + .deleteOrphanFiles(table.table()) + .olderThan(System.currentTimeMillis() + 1000) + .execute(); + Assert.assertTrue( + "trash file should be removed", StreamSupport.stream(results.orphanFileLocations().spliterator(), false) .anyMatch(file -> file.contains("file:" + location + "/data/trashfile"))); } @Test public void testSparkSessionCatalogHadoopTable() throws Exception { - spark.conf().set("spark.sql.catalog.spark_catalog", "org.apache.iceberg.spark.SparkSessionCatalog"); + spark + .conf() + 
.set("spark.sql.catalog.spark_catalog", "org.apache.iceberg.spark.SparkSessionCatalog"); spark.conf().set("spark.sql.catalog.spark_catalog.type", "hadoop"); spark.conf().set("spark.sql.catalog.spark_catalog.warehouse", tableLocation); - SparkSessionCatalog cat = (SparkSessionCatalog) spark.sessionState().catalogManager().v2SessionCatalog(); + SparkSessionCatalog cat = + (SparkSessionCatalog) spark.sessionState().catalogManager().v2SessionCatalog(); String[] database = {"default"}; Identifier id = Identifier.of(database, "table"); @@ -132,18 +146,25 @@ public void testSparkSessionCatalogHadoopTable() throws Exception { String location = table.table().location().replaceFirst("file:", ""); new File(location + "/data/trashfile").createNewFile(); - DeleteOrphanFiles.Result results = SparkActions.get().deleteOrphanFiles(table.table()) - .olderThan(System.currentTimeMillis() + 1000).execute(); - Assert.assertTrue("trash file should be removed", + DeleteOrphanFiles.Result results = + SparkActions.get() + .deleteOrphanFiles(table.table()) + .olderThan(System.currentTimeMillis() + 1000) + .execute(); + Assert.assertTrue( + "trash file should be removed", StreamSupport.stream(results.orphanFileLocations().spliterator(), false) .anyMatch(file -> file.contains("file:" + location + "/data/trashfile"))); } @Test public void testSparkSessionCatalogHiveTable() throws Exception { - spark.conf().set("spark.sql.catalog.spark_catalog", "org.apache.iceberg.spark.SparkSessionCatalog"); + spark + .conf() + .set("spark.sql.catalog.spark_catalog", "org.apache.iceberg.spark.SparkSessionCatalog"); spark.conf().set("spark.sql.catalog.spark_catalog.type", "hive"); - SparkSessionCatalog cat = (SparkSessionCatalog) spark.sessionState().catalogManager().v2SessionCatalog(); + SparkSessionCatalog cat = + (SparkSessionCatalog) spark.sessionState().catalogManager().v2SessionCatalog(); String[] database = {"default"}; Identifier id = Identifier.of(database, "sessioncattest"); @@ -158,9 +179,13 @@ public void testSparkSessionCatalogHiveTable() throws Exception { String location = table.table().location().replaceFirst("file:", ""); new File(location + "/data/trashfile").createNewFile(); - DeleteOrphanFiles.Result results = SparkActions.get().deleteOrphanFiles(table.table()) - .olderThan(System.currentTimeMillis() + 1000).execute(); - Assert.assertTrue("trash file should be removed", + DeleteOrphanFiles.Result results = + SparkActions.get() + .deleteOrphanFiles(table.table()) + .olderThan(System.currentTimeMillis() + 1000) + .execute(); + Assert.assertTrue( + "trash file should be removed", StreamSupport.stream(results.orphanFileLocations().spliterator(), false) .anyMatch(file -> file.contains("file:" + location + "/data/trashfile"))); } @@ -171,5 +196,4 @@ public void resetSparkSessionCatalog() throws Exception { spark.conf().unset("spark.sql.catalog.spark_catalog.type"); spark.conf().unset("spark.sql.catalog.spark_catalog.warehouse"); } - } diff --git a/spark/v3.0/spark/src/test/java/org/apache/iceberg/spark/actions/TestRewriteDataFilesAction.java b/spark/v3.0/spark/src/test/java/org/apache/iceberg/spark/actions/TestRewriteDataFilesAction.java index b057b8e62ba7..3bc07093714f 100644 --- a/spark/v3.0/spark/src/test/java/org/apache/iceberg/spark/actions/TestRewriteDataFilesAction.java +++ b/spark/v3.0/spark/src/test/java/org/apache/iceberg/spark/actions/TestRewriteDataFilesAction.java @@ -16,9 +16,17 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.spark.actions; +import static org.apache.iceberg.types.Types.NestedField.optional; +import static org.mockito.ArgumentMatchers.any; +import static org.mockito.ArgumentMatchers.argThat; +import static org.mockito.Mockito.doAnswer; +import static org.mockito.Mockito.doCallRealMethod; +import static org.mockito.Mockito.doReturn; +import static org.mockito.Mockito.doThrow; +import static org.mockito.Mockito.spy; + import java.io.File; import java.io.IOException; import java.io.UncheckedIOException; @@ -95,28 +103,18 @@ import org.mockito.ArgumentMatcher; import org.mockito.Mockito; -import static org.apache.iceberg.types.Types.NestedField.optional; -import static org.mockito.ArgumentMatchers.any; -import static org.mockito.ArgumentMatchers.argThat; -import static org.mockito.Mockito.doAnswer; -import static org.mockito.Mockito.doCallRealMethod; -import static org.mockito.Mockito.doReturn; -import static org.mockito.Mockito.doThrow; -import static org.mockito.Mockito.spy; - public class TestRewriteDataFilesAction extends SparkTestBase { private static final int SCALE = 400000; private static final HadoopTables TABLES = new HadoopTables(new Configuration()); - private static final Schema SCHEMA = new Schema( - optional(1, "c1", Types.IntegerType.get()), - optional(2, "c2", Types.StringType.get()), - optional(3, "c3", Types.StringType.get()) - ); + private static final Schema SCHEMA = + new Schema( + optional(1, "c1", Types.IntegerType.get()), + optional(2, "c2", Types.StringType.get()), + optional(3, "c3", Types.StringType.get())); - @Rule - public TemporaryFolder temp = new TemporaryFolder(); + @Rule public TemporaryFolder temp = new TemporaryFolder(); private final FileRewriteCoordinator coordinator = FileRewriteCoordinator.get(); private final FileScanTaskSetManager manager = FileScanTaskSetManager.get(); @@ -185,10 +183,11 @@ public void testBinPackWithFilter() { shouldHaveFiles(table, 8); List expectedRecords = currentData(); - Result result = basicRewrite(table) - .filter(Expressions.equal("c1", 1)) - .filter(Expressions.startsWith("c2", "foo")) - .execute(); + Result result = + basicRewrite(table) + .filter(Expressions.equal("c1", 1)) + .filter(Expressions.startsWith("c2", "foo")) + .execute(); Assert.assertEquals("Action should rewrite 2 data files", 2, result.rewrittenDataFilesCount()); Assert.assertEquals("Action should add 1 data file", 1, result.addedDataFilesCount()); @@ -207,32 +206,33 @@ public void testBinPackWithDeletes() throws Exception { table.refresh(); CloseableIterable tasks = table.newScan().planFiles(); - List dataFiles = Lists.newArrayList(CloseableIterable.transform(tasks, FileScanTask::file)); + List dataFiles = + Lists.newArrayList(CloseableIterable.transform(tasks, FileScanTask::file)); int total = (int) dataFiles.stream().mapToLong(ContentFile::recordCount).sum(); RowDelta rowDelta = table.newRowDelta(); // add 1 delete file for data files 0, 1, 2 for (int i = 0; i < 3; i++) { - writePosDeletesToFile(table, dataFiles.get(i), 1) - .forEach(rowDelta::addDeletes); + writePosDeletesToFile(table, dataFiles.get(i), 1).forEach(rowDelta::addDeletes); } // add 2 delete files for data files 3, 4 for (int i = 3; i < 5; i++) { - writePosDeletesToFile(table, dataFiles.get(i), 2) - .forEach(rowDelta::addDeletes); + writePosDeletesToFile(table, dataFiles.get(i), 2).forEach(rowDelta::addDeletes); } rowDelta.commit(); table.refresh(); List expectedRecords = currentData(); - Result result = actions().rewriteDataFiles(table) - // do not include any 
file based on bin pack file size configs - .option(BinPackStrategy.MIN_FILE_SIZE_BYTES, "0") - .option(RewriteDataFiles.TARGET_FILE_SIZE_BYTES, Long.toString(Long.MAX_VALUE - 1)) - .option(BinPackStrategy.MAX_FILE_SIZE_BYTES, Long.toString(Long.MAX_VALUE)) - .option(BinPackStrategy.DELETE_FILE_THRESHOLD, "2") - .execute(); + Result result = + actions() + .rewriteDataFiles(table) + // do not include any file based on bin pack file size configs + .option(BinPackStrategy.MIN_FILE_SIZE_BYTES, "0") + .option(RewriteDataFiles.TARGET_FILE_SIZE_BYTES, Long.toString(Long.MAX_VALUE - 1)) + .option(BinPackStrategy.MAX_FILE_SIZE_BYTES, Long.toString(Long.MAX_VALUE)) + .option(BinPackStrategy.DELETE_FILE_THRESHOLD, "2") + .execute(); Assert.assertEquals("Action should rewrite 2 data files", 2, result.rewrittenDataFilesCount()); List actualRecords = currentData(); @@ -249,20 +249,22 @@ public void testBinPackWithDeleteAllData() { table.refresh(); CloseableIterable tasks = table.newScan().planFiles(); - List dataFiles = Lists.newArrayList(CloseableIterable.transform(tasks, FileScanTask::file)); + List dataFiles = + Lists.newArrayList(CloseableIterable.transform(tasks, FileScanTask::file)); int total = (int) dataFiles.stream().mapToLong(ContentFile::recordCount).sum(); RowDelta rowDelta = table.newRowDelta(); // remove all data - writePosDeletesToFile(table, dataFiles.get(0), total) - .forEach(rowDelta::addDeletes); + writePosDeletesToFile(table, dataFiles.get(0), total).forEach(rowDelta::addDeletes); rowDelta.commit(); table.refresh(); List expectedRecords = currentData(); - Result result = actions().rewriteDataFiles(table) - .option(BinPackStrategy.DELETE_FILE_THRESHOLD, "1") - .execute(); + Result result = + actions() + .rewriteDataFiles(table) + .option(BinPackStrategy.DELETE_FILE_THRESHOLD, "1") + .execute(); Assert.assertEquals("Action should rewrite 1 data files", 1, result.rewrittenDataFilesCount()); List actualRecords = currentData(); @@ -271,7 +273,8 @@ public void testBinPackWithDeleteAllData() { "Data manifest should not have existing data file", 0, (long) table.currentSnapshot().dataManifests(table.io()).get(0).existingFilesCount()); - Assert.assertEquals("Data manifest should have 1 delete data file", + Assert.assertEquals( + "Data manifest should have 1 delete data file", 1L, (long) table.currentSnapshot().dataManifests(table.io()).get(0).deletedFilesCount()); Assert.assertEquals( @@ -289,9 +292,8 @@ public void testBinPackWithStartingSequenceNumber() { table.refresh(); long oldSequenceNumber = table.currentSnapshot().sequenceNumber(); - Result result = basicRewrite(table) - .option(RewriteDataFiles.USE_STARTING_SEQUENCE_NUMBER, "true") - .execute(); + Result result = + basicRewrite(table).option(RewriteDataFiles.USE_STARTING_SEQUENCE_NUMBER, "true").execute(); Assert.assertEquals("Action should rewrite 8 data files", 8, result.rewrittenDataFilesCount()); Assert.assertEquals("Action should add 4 data file", 4, result.addedDataFilesCount()); @@ -300,13 +302,15 @@ public void testBinPackWithStartingSequenceNumber() { assertEquals("Rows must match", expectedRecords, actualRecords); table.refresh(); - Assert.assertTrue("Table sequence number should be incremented", + Assert.assertTrue( + "Table sequence number should be incremented", oldSequenceNumber < table.currentSnapshot().sequenceNumber()); Dataset rows = SparkTableUtil.loadMetadataTable(spark, table, MetadataTableType.ENTRIES); for (Row row : rows.collectAsList()) { if (row.getInt(0) == 1) { - Assert.assertEquals("Expect old sequence 
number for added entries", oldSequenceNumber, row.getLong(2)); + Assert.assertEquals( + "Expect old sequence number for added entries", oldSequenceNumber, row.getLong(2)); } } } @@ -320,9 +324,8 @@ public void testBinPackWithStartingSequenceNumberV1Compatibility() { long oldSequenceNumber = table.currentSnapshot().sequenceNumber(); Assert.assertEquals("Table sequence number should be 0", 0, oldSequenceNumber); - Result result = basicRewrite(table) - .option(RewriteDataFiles.USE_STARTING_SEQUENCE_NUMBER, "true") - .execute(); + Result result = + basicRewrite(table).option(RewriteDataFiles.USE_STARTING_SEQUENCE_NUMBER, "true").execute(); Assert.assertEquals("Action should rewrite 8 data files", 8, result.rewrittenDataFilesCount()); Assert.assertEquals("Action should add 4 data file", 4, result.addedDataFilesCount()); @@ -331,13 +334,15 @@ public void testBinPackWithStartingSequenceNumberV1Compatibility() { assertEquals("Rows must match", expectedRecords, actualRecords); table.refresh(); - Assert.assertEquals("Table sequence number should still be 0", - oldSequenceNumber, table.currentSnapshot().sequenceNumber()); + Assert.assertEquals( + "Table sequence number should still be 0", + oldSequenceNumber, + table.currentSnapshot().sequenceNumber()); Dataset rows = SparkTableUtil.loadMetadataTable(spark, table, MetadataTableType.ENTRIES); for (Row row : rows.collectAsList()) { - Assert.assertEquals("Expect sequence number 0 for all entries", - oldSequenceNumber, row.getLong(2)); + Assert.assertEquals( + "Expect sequence number 0 for all entries", oldSequenceNumber, row.getLong(2)); } } @@ -360,19 +365,15 @@ public void testRewriteLargeTableHasResiduals() { table.refresh(); - CloseableIterable tasks = table.newScan() - .ignoreResiduals() - .filter(Expressions.equal("c3", "0")) - .planFiles(); + CloseableIterable tasks = + table.newScan().ignoreResiduals().filter(Expressions.equal("c3", "0")).planFiles(); for (FileScanTask task : tasks) { Assert.assertEquals("Residuals must be ignored", Expressions.alwaysTrue(), task.residual()); } shouldHaveFiles(table, 2); - Result result = basicRewrite(table) - .filter(Expressions.equal("c3", "0")) - .execute(); + Result result = basicRewrite(table).filter(Expressions.equal("c3", "0")).execute(); Assert.assertEquals("Action should rewrite 2 data files", 2, result.rewrittenDataFilesCount()); Assert.assertEquals("Action should add 1 data file", 1, result.addedDataFilesCount()); @@ -389,10 +390,11 @@ public void testBinPackSplitLargeFile() { List expectedRecords = currentData(); long targetSize = testDataSize(table) / 2; - Result result = basicRewrite(table) - .option(RewriteDataFiles.TARGET_FILE_SIZE_BYTES, Long.toString(targetSize)) - .option(BinPackStrategy.MAX_FILE_SIZE_BYTES, Long.toString(targetSize * 2 - 2000)) - .execute(); + Result result = + basicRewrite(table) + .option(RewriteDataFiles.TARGET_FILE_SIZE_BYTES, Long.toString(targetSize)) + .option(BinPackStrategy.MAX_FILE_SIZE_BYTES, Long.toString(targetSize * 2 - 2000)) + .execute(); Assert.assertEquals("Action should delete 1 data files", 1, result.rewrittenDataFilesCount()); Assert.assertEquals("Action should add 2 data files", 2, result.addedDataFilesCount()); @@ -417,14 +419,16 @@ public void testBinPackCombineMixedFiles() { int targetSize = averageFileSize(table); - Result result = basicRewrite(table) - .option(RewriteDataFiles.TARGET_FILE_SIZE_BYTES, Integer.toString(targetSize + 1000)) - .option(BinPackStrategy.MAX_FILE_SIZE_BYTES, Integer.toString(targetSize + 80000)) - 
.option(BinPackStrategy.MIN_FILE_SIZE_BYTES, Integer.toString(targetSize - 1000)) - .execute(); + Result result = + basicRewrite(table) + .option(RewriteDataFiles.TARGET_FILE_SIZE_BYTES, Integer.toString(targetSize + 1000)) + .option(BinPackStrategy.MAX_FILE_SIZE_BYTES, Integer.toString(targetSize + 80000)) + .option(BinPackStrategy.MIN_FILE_SIZE_BYTES, Integer.toString(targetSize - 1000)) + .execute(); Assert.assertEquals("Action should delete 3 data files", 3, result.rewrittenDataFilesCount()); - // Should Split the big files into 3 pieces, one of which should be combined with the two smaller files + // Should Split the big files into 3 pieces, one of which should be combined with the two + // smaller files Assert.assertEquals("Action should add 3 data files", 3, result.addedDataFilesCount()); shouldHaveFiles(table, 3); @@ -442,11 +446,14 @@ public void testBinPackCombineMediumFiles() { int targetSize = ((int) testDataSize(table) / 3); // The test is to see if we can combine parts of files to make files of the correct size - Result result = basicRewrite(table) - .option(RewriteDataFiles.TARGET_FILE_SIZE_BYTES, Integer.toString(targetSize)) - .option(BinPackStrategy.MAX_FILE_SIZE_BYTES, Integer.toString((int) (targetSize * 1.8))) - .option(BinPackStrategy.MIN_FILE_SIZE_BYTES, Integer.toString(targetSize - 100)) // All files too small - .execute(); + Result result = + basicRewrite(table) + .option(RewriteDataFiles.TARGET_FILE_SIZE_BYTES, Integer.toString(targetSize)) + .option(BinPackStrategy.MAX_FILE_SIZE_BYTES, Integer.toString((int) (targetSize * 1.8))) + .option( + BinPackStrategy.MIN_FILE_SIZE_BYTES, + Integer.toString(targetSize - 100)) // All files too small + .execute(); Assert.assertEquals("Action should delete 4 data files", 4, result.rewrittenDataFilesCount()); Assert.assertEquals("Action should add 3 data files", 3, result.addedDataFilesCount()); @@ -468,7 +475,8 @@ public void testPartialProgressEnabled() { RewriteDataFiles.Result result = basicRewrite(table) .option(RewriteDataFiles.PARTIAL_PROGRESS_ENABLED, "true") - .option(RewriteDataFiles.MAX_FILE_GROUP_SIZE_BYTES, Integer.toString(fileSize * 2 + 1000)) + .option( + RewriteDataFiles.MAX_FILE_GROUP_SIZE_BYTES, Integer.toString(fileSize * 2 + 1000)) .option(RewriteDataFiles.PARTIAL_PROGRESS_MAX_COMMITS, "10") .execute(); @@ -493,7 +501,8 @@ public void testMultipleGroups() { // Perform a rewrite but only allow 2 files to be compacted at a time RewriteDataFiles.Result result = basicRewrite(table) - .option(RewriteDataFiles.MAX_FILE_GROUP_SIZE_BYTES, Integer.toString(fileSize * 2 + 1000)) + .option( + RewriteDataFiles.MAX_FILE_GROUP_SIZE_BYTES, Integer.toString(fileSize * 2 + 1000)) .option(BinPackStrategy.MIN_INPUT_FILES, "1") .execute(); @@ -518,7 +527,8 @@ public void testPartialProgressMaxCommits() { // Perform a rewrite but only allow 2 files to be compacted at a time RewriteDataFiles.Result result = basicRewrite(table) - .option(RewriteDataFiles.MAX_FILE_GROUP_SIZE_BYTES, Integer.toString(fileSize * 2 + 1000)) + .option( + RewriteDataFiles.MAX_FILE_GROUP_SIZE_BYTES, Integer.toString(fileSize * 2 + 1000)) .option(RewriteDataFiles.PARTIAL_PROGRESS_ENABLED, "true") .option(RewriteDataFiles.PARTIAL_PROGRESS_MAX_COMMITS, "3") .execute(); @@ -544,7 +554,9 @@ public void testSingleCommitWithRewriteFailure() { BaseRewriteDataFilesSparkAction realRewrite = (org.apache.iceberg.spark.actions.BaseRewriteDataFilesSparkAction) basicRewrite(table) - .option(RewriteDataFiles.MAX_FILE_GROUP_SIZE_BYTES, Integer.toString(fileSize * 2 + 
1000)); + .option( + RewriteDataFiles.MAX_FILE_GROUP_SIZE_BYTES, + Integer.toString(fileSize * 2 + 1000)); BaseRewriteDataFilesSparkAction spyRewrite = Mockito.spy(realRewrite); @@ -554,7 +566,9 @@ public void testSingleCommitWithRewriteFailure() { .when(spyRewrite) .rewriteFiles(any(), argThat(failGroup)); - AssertHelpers.assertThrows("Should fail entire rewrite if part fails", RuntimeException.class, + AssertHelpers.assertThrows( + "Should fail entire rewrite if part fails", + RuntimeException.class, () -> spyRewrite.execute()); table.refresh(); @@ -577,21 +591,21 @@ public void testSingleCommitWithCommitFailure() { BaseRewriteDataFilesSparkAction realRewrite = (org.apache.iceberg.spark.actions.BaseRewriteDataFilesSparkAction) basicRewrite(table) - .option(RewriteDataFiles.MAX_FILE_GROUP_SIZE_BYTES, Integer.toString(fileSize * 2 + 1000)); + .option( + RewriteDataFiles.MAX_FILE_GROUP_SIZE_BYTES, + Integer.toString(fileSize * 2 + 1000)); BaseRewriteDataFilesSparkAction spyRewrite = spy(realRewrite); RewriteDataFilesCommitManager util = spy(new RewriteDataFilesCommitManager(table)); // Fail to commit - doThrow(new RuntimeException("Commit Failure")) - .when(util) - .commitFileGroups(any()); + doThrow(new RuntimeException("Commit Failure")).when(util).commitFileGroups(any()); - doReturn(util) - .when(spyRewrite) - .commitManager(table.currentSnapshot().snapshotId()); + doReturn(util).when(spyRewrite).commitManager(table.currentSnapshot().snapshotId()); - AssertHelpers.assertThrows("Should fail entire rewrite if commit fails", RuntimeException.class, + AssertHelpers.assertThrows( + "Should fail entire rewrite if commit fails", + RuntimeException.class, () -> spyRewrite.execute()); table.refresh(); @@ -614,7 +628,9 @@ public void testParallelSingleCommitWithRewriteFailure() { BaseRewriteDataFilesSparkAction realRewrite = (org.apache.iceberg.spark.actions.BaseRewriteDataFilesSparkAction) basicRewrite(table) - .option(RewriteDataFiles.MAX_FILE_GROUP_SIZE_BYTES, Integer.toString(fileSize * 2 + 1000)) + .option( + RewriteDataFiles.MAX_FILE_GROUP_SIZE_BYTES, + Integer.toString(fileSize * 2 + 1000)) .option(RewriteDataFiles.MAX_CONCURRENT_FILE_GROUP_REWRITES, "3"); BaseRewriteDataFilesSparkAction spyRewrite = Mockito.spy(realRewrite); @@ -625,7 +641,9 @@ public void testParallelSingleCommitWithRewriteFailure() { .when(spyRewrite) .rewriteFiles(any(), argThat(failGroup)); - AssertHelpers.assertThrows("Should fail entire rewrite if part fails", RuntimeException.class, + AssertHelpers.assertThrows( + "Should fail entire rewrite if part fails", + RuntimeException.class, () -> spyRewrite.execute()); table.refresh(); @@ -648,7 +666,9 @@ public void testPartialProgressWithRewriteFailure() { BaseRewriteDataFilesSparkAction realRewrite = (org.apache.iceberg.spark.actions.BaseRewriteDataFilesSparkAction) basicRewrite(table) - .option(RewriteDataFiles.MAX_FILE_GROUP_SIZE_BYTES, Integer.toString(fileSize * 2 + 1000)) + .option( + RewriteDataFiles.MAX_FILE_GROUP_SIZE_BYTES, + Integer.toString(fileSize * 2 + 1000)) .option(RewriteDataFiles.PARTIAL_PROGRESS_ENABLED, "true") .option(RewriteDataFiles.PARTIAL_PROGRESS_MAX_COMMITS, "3"); @@ -686,7 +706,9 @@ public void testParallelPartialProgressWithRewriteFailure() { BaseRewriteDataFilesSparkAction realRewrite = (org.apache.iceberg.spark.actions.BaseRewriteDataFilesSparkAction) basicRewrite(table) - .option(RewriteDataFiles.MAX_FILE_GROUP_SIZE_BYTES, Integer.toString(fileSize * 2 + 1000)) + .option( + RewriteDataFiles.MAX_FILE_GROUP_SIZE_BYTES, + 
Integer.toString(fileSize * 2 + 1000)) .option(RewriteDataFiles.MAX_CONCURRENT_FILE_GROUP_REWRITES, "3") .option(RewriteDataFiles.PARTIAL_PROGRESS_ENABLED, "true") .option(RewriteDataFiles.PARTIAL_PROGRESS_MAX_COMMITS, "3"); @@ -725,7 +747,9 @@ public void testParallelPartialProgressWithCommitFailure() { BaseRewriteDataFilesSparkAction realRewrite = (org.apache.iceberg.spark.actions.BaseRewriteDataFilesSparkAction) basicRewrite(table) - .option(RewriteDataFiles.MAX_FILE_GROUP_SIZE_BYTES, Integer.toString(fileSize * 2 + 1000)) + .option( + RewriteDataFiles.MAX_FILE_GROUP_SIZE_BYTES, + Integer.toString(fileSize * 2 + 1000)) .option(RewriteDataFiles.MAX_CONCURRENT_FILE_GROUP_REWRITES, "3") .option(RewriteDataFiles.PARTIAL_PROGRESS_ENABLED, "true") .option(RewriteDataFiles.PARTIAL_PROGRESS_MAX_COMMITS, "3"); @@ -740,9 +764,7 @@ public void testParallelPartialProgressWithCommitFailure() { .when(util) .commitFileGroups(any()); - doReturn(util) - .when(spyRewrite) - .commitManager(table.currentSnapshot().snapshotId()); + doReturn(util).when(spyRewrite).commitManager(table.currentSnapshot().snapshotId()); RewriteDataFiles.Result result = spyRewrite.execute(); @@ -764,30 +786,32 @@ public void testParallelPartialProgressWithCommitFailure() { public void testInvalidOptions() { Table table = createTable(20); - AssertHelpers.assertThrows("No negative values for partial progress max commits", + AssertHelpers.assertThrows( + "No negative values for partial progress max commits", IllegalArgumentException.class, - () -> basicRewrite(table) - .option(RewriteDataFiles.PARTIAL_PROGRESS_ENABLED, "true") - .option(RewriteDataFiles.PARTIAL_PROGRESS_MAX_COMMITS, "-5") - .execute()); + () -> + basicRewrite(table) + .option(RewriteDataFiles.PARTIAL_PROGRESS_ENABLED, "true") + .option(RewriteDataFiles.PARTIAL_PROGRESS_MAX_COMMITS, "-5") + .execute()); - AssertHelpers.assertThrows("No negative values for max concurrent groups", + AssertHelpers.assertThrows( + "No negative values for max concurrent groups", IllegalArgumentException.class, - () -> basicRewrite(table) - .option(RewriteDataFiles.MAX_CONCURRENT_FILE_GROUP_REWRITES, "-5") - .execute()); + () -> + basicRewrite(table) + .option(RewriteDataFiles.MAX_CONCURRENT_FILE_GROUP_REWRITES, "-5") + .execute()); - AssertHelpers.assertThrows("No unknown options allowed", + AssertHelpers.assertThrows( + "No unknown options allowed", IllegalArgumentException.class, - () -> basicRewrite(table) - .option("foobarity", "-5") - .execute()); + () -> basicRewrite(table).option("foobarity", "-5").execute()); - AssertHelpers.assertThrows("Cannot set rewrite-job-order to foo", + AssertHelpers.assertThrows( + "Cannot set rewrite-job-order to foo", IllegalArgumentException.class, - () -> basicRewrite(table) - .option(RewriteDataFiles.REWRITE_JOB_ORDER, "foo") - .execute()); + () -> basicRewrite(table).option(RewriteDataFiles.REWRITE_JOB_ORDER, "foo").execute()); } @Test @@ -805,7 +829,8 @@ public void testSortMultipleGroups() { basicRewrite(table) .sort() .option(SortStrategy.REWRITE_ALL, "true") - .option(RewriteDataFiles.MAX_FILE_GROUP_SIZE_BYTES, Integer.toString(fileSize * 2 + 1000)) + .option( + RewriteDataFiles.MAX_FILE_GROUP_SIZE_BYTES, Integer.toString(fileSize * 2 + 1000)) .execute(); Assert.assertEquals("Should have 10 fileGroups", result.rewriteResults().size(), 10); @@ -833,7 +858,8 @@ public void testSimpleSort() { .sort() .option(SortStrategy.MIN_INPUT_FILES, "1") .option(SortStrategy.REWRITE_ALL, "true") - .option(RewriteDataFiles.TARGET_FILE_SIZE_BYTES, 
Integer.toString(averageFileSize(table))) + .option( + RewriteDataFiles.TARGET_FILE_SIZE_BYTES, Integer.toString(averageFileSize(table))) .execute(); Assert.assertEquals("Should have 1 fileGroups", result.rewriteResults().size(), 1); @@ -864,11 +890,14 @@ public void testSortAfterPartitionChange() { .sort() .option(SortStrategy.MIN_INPUT_FILES, "1") .option(SortStrategy.REWRITE_ALL, "true") - .option(RewriteDataFiles.TARGET_FILE_SIZE_BYTES, Integer.toString(averageFileSize(table))) + .option( + RewriteDataFiles.TARGET_FILE_SIZE_BYTES, Integer.toString(averageFileSize(table))) .execute(); - Assert.assertEquals("Should have 1 fileGroup because all files were not correctly partitioned", - result.rewriteResults().size(), 1); + Assert.assertEquals( + "Should have 1 fileGroup because all files were not correctly partitioned", + result.rewriteResults().size(), + 1); table.refresh(); @@ -893,7 +922,8 @@ public void testSortCustomSortOrder() { basicRewrite(table) .sort(SortOrder.builderFor(table.schema()).asc("c2").build()) .option(SortStrategy.REWRITE_ALL, "true") - .option(RewriteDataFiles.TARGET_FILE_SIZE_BYTES, Integer.toString(averageFileSize(table))) + .option( + RewriteDataFiles.TARGET_FILE_SIZE_BYTES, Integer.toString(averageFileSize(table))) .execute(); Assert.assertEquals("Should have 1 fileGroups", result.rewriteResults().size(), 1); @@ -928,7 +958,9 @@ public void testSortCustomSortOrderRequiresRepartition() { basicRewrite(table) .sort(SortOrder.builderFor(table.schema()).asc("c3").build()) .option(SortStrategy.REWRITE_ALL, "true") - .option(RewriteDataFiles.TARGET_FILE_SIZE_BYTES, Integer.toString(averageFileSize(table) / partitions)) + .option( + RewriteDataFiles.TARGET_FILE_SIZE_BYTES, + Integer.toString(averageFileSize(table) / partitions)) .execute(); Assert.assertEquals("Should have 1 fileGroups", result.rewriteResults().size(), 1); @@ -956,14 +988,19 @@ public void testAutoSortShuffleOutput() { RewriteDataFiles.Result result = basicRewrite(table) .sort(SortOrder.builderFor(table.schema()).asc("c2").build()) - .option(SortStrategy.MAX_FILE_SIZE_BYTES, Integer.toString((averageFileSize(table) / 2) + 2)) + .option( + SortStrategy.MAX_FILE_SIZE_BYTES, + Integer.toString((averageFileSize(table) / 2) + 2)) // Divide files in 2 - .option(RewriteDataFiles.TARGET_FILE_SIZE_BYTES, Integer.toString(averageFileSize(table) / 2)) + .option( + RewriteDataFiles.TARGET_FILE_SIZE_BYTES, + Integer.toString(averageFileSize(table) / 2)) .option(SortStrategy.MIN_INPUT_FILES, "1") .execute(); Assert.assertEquals("Should have 1 fileGroups", result.rewriteResults().size(), 1); - Assert.assertTrue("Should have written 40+ files", + Assert.assertTrue( + "Should have written 40+ files", Iterables.size(table.currentSnapshot().addedDataFiles(table.io())) >= 40); table.refresh(); @@ -988,17 +1025,20 @@ public void testCommitStateUnknownException() { BaseRewriteDataFilesSparkAction spyAction = spy(action); RewriteDataFilesCommitManager util = spy(new RewriteDataFilesCommitManager(table)); - doAnswer(invocationOnMock -> { - invocationOnMock.callRealMethod(); - throw new CommitStateUnknownException(new RuntimeException("Unknown State")); - }).when(util).commitFileGroups(any()); + doAnswer( + invocationOnMock -> { + invocationOnMock.callRealMethod(); + throw new CommitStateUnknownException(new RuntimeException("Unknown State")); + }) + .when(util) + .commitFileGroups(any()); - doReturn(util) - .when(spyAction) - .commitManager(table.currentSnapshot().snapshotId()); + 
doReturn(util).when(spyAction).commitManager(table.currentSnapshot().snapshotId()); - AssertHelpers.assertThrows("Should propagate CommitStateUnknown Exception", - CommitStateUnknownException.class, () -> spyAction.execute()); + AssertHelpers.assertThrows( + "Should propagate CommitStateUnknown Exception", + CommitStateUnknownException.class, + () -> spyAction.execute()); List postRewriteData = currentData(); assertEquals("We shouldn't have changed the data", originalData, postRewriteData); @@ -1010,14 +1050,23 @@ public void testCommitStateUnknownException() { public void testInvalidAPIUsage() { Table table = createTable(1); - AssertHelpers.assertThrows("Should be unable to set Strategy more than once", IllegalArgumentException.class, - "Cannot set strategy", () -> actions().rewriteDataFiles(table).binPack().sort()); + AssertHelpers.assertThrows( + "Should be unable to set Strategy more than once", + IllegalArgumentException.class, + "Cannot set strategy", + () -> actions().rewriteDataFiles(table).binPack().sort()); - AssertHelpers.assertThrows("Should be unable to set Strategy more than once", IllegalArgumentException.class, - "Cannot set strategy", () -> actions().rewriteDataFiles(table).sort().binPack()); + AssertHelpers.assertThrows( + "Should be unable to set Strategy more than once", + IllegalArgumentException.class, + "Cannot set strategy", + () -> actions().rewriteDataFiles(table).sort().binPack()); - AssertHelpers.assertThrows("Should be unable to set Strategy more than once", IllegalArgumentException.class, - "Cannot set strategy", () -> actions().rewriteDataFiles(table).sort(SortOrder.unsorted()).binPack()); + AssertHelpers.assertThrows( + "Should be unable to set Strategy more than once", + IllegalArgumentException.class, + "Cannot set strategy", + () -> actions().rewriteDataFiles(table).sort(SortOrder.unsorted()).binPack()); } @Test @@ -1030,21 +1079,23 @@ public void testRewriteJobOrderBytesAsc() { table.updateProperties().set(TableProperties.FORMAT_VERSION, "2").commit(); BaseRewriteDataFilesSparkAction basicRewrite = - (BaseRewriteDataFilesSparkAction) basicRewrite(table) - .binPack(); - List expected = toGroupStream(table, basicRewrite) - .mapToLong(RewriteFileGroup::sizeInBytes) - .boxed() - .collect(Collectors.toList()); + (BaseRewriteDataFilesSparkAction) basicRewrite(table).binPack(); + List expected = + toGroupStream(table, basicRewrite) + .mapToLong(RewriteFileGroup::sizeInBytes) + .boxed() + .collect(Collectors.toList()); BaseRewriteDataFilesSparkAction jobOrderRewrite = - (BaseRewriteDataFilesSparkAction) basicRewrite(table) - .option(RewriteDataFiles.REWRITE_JOB_ORDER, RewriteJobOrder.BYTES_ASC.orderName()) - .binPack(); - List actual = toGroupStream(table, jobOrderRewrite) - .mapToLong(RewriteFileGroup::sizeInBytes) - .boxed() - .collect(Collectors.toList()); + (BaseRewriteDataFilesSparkAction) + basicRewrite(table) + .option(RewriteDataFiles.REWRITE_JOB_ORDER, RewriteJobOrder.BYTES_ASC.orderName()) + .binPack(); + List actual = + toGroupStream(table, jobOrderRewrite) + .mapToLong(RewriteFileGroup::sizeInBytes) + .boxed() + .collect(Collectors.toList()); expected.sort(Comparator.naturalOrder()); Assert.assertEquals("Size in bytes order should be ascending", actual, expected); @@ -1062,21 +1113,23 @@ public void testRewriteJobOrderBytesDesc() { table.updateProperties().set(TableProperties.FORMAT_VERSION, "2").commit(); BaseRewriteDataFilesSparkAction basicRewrite = - (BaseRewriteDataFilesSparkAction) basicRewrite(table) - .binPack(); - List expected = 
toGroupStream(table, basicRewrite) - .mapToLong(RewriteFileGroup::sizeInBytes) - .boxed() - .collect(Collectors.toList()); + (BaseRewriteDataFilesSparkAction) basicRewrite(table).binPack(); + List expected = + toGroupStream(table, basicRewrite) + .mapToLong(RewriteFileGroup::sizeInBytes) + .boxed() + .collect(Collectors.toList()); BaseRewriteDataFilesSparkAction jobOrderRewrite = - (BaseRewriteDataFilesSparkAction) basicRewrite(table) - .option(RewriteDataFiles.REWRITE_JOB_ORDER, RewriteJobOrder.BYTES_DESC.orderName()) - .binPack(); - List actual = toGroupStream(table, jobOrderRewrite) - .mapToLong(RewriteFileGroup::sizeInBytes) - .boxed() - .collect(Collectors.toList()); + (BaseRewriteDataFilesSparkAction) + basicRewrite(table) + .option(RewriteDataFiles.REWRITE_JOB_ORDER, RewriteJobOrder.BYTES_DESC.orderName()) + .binPack(); + List actual = + toGroupStream(table, jobOrderRewrite) + .mapToLong(RewriteFileGroup::sizeInBytes) + .boxed() + .collect(Collectors.toList()); expected.sort(Comparator.reverseOrder()); Assert.assertEquals("Size in bytes order should be descending", actual, expected); @@ -1094,21 +1147,23 @@ public void testRewriteJobOrderFilesAsc() { table.updateProperties().set(TableProperties.FORMAT_VERSION, "2").commit(); BaseRewriteDataFilesSparkAction basicRewrite = - (BaseRewriteDataFilesSparkAction) basicRewrite(table) - .binPack(); - List expected = toGroupStream(table, basicRewrite) - .mapToLong(RewriteFileGroup::numFiles) - .boxed() - .collect(Collectors.toList()); + (BaseRewriteDataFilesSparkAction) basicRewrite(table).binPack(); + List expected = + toGroupStream(table, basicRewrite) + .mapToLong(RewriteFileGroup::numFiles) + .boxed() + .collect(Collectors.toList()); BaseRewriteDataFilesSparkAction jobOrderRewrite = - (BaseRewriteDataFilesSparkAction) basicRewrite(table) - .option(RewriteDataFiles.REWRITE_JOB_ORDER, RewriteJobOrder.FILES_ASC.orderName()) - .binPack(); - List actual = toGroupStream(table, jobOrderRewrite) - .mapToLong(RewriteFileGroup::numFiles) - .boxed() - .collect(Collectors.toList()); + (BaseRewriteDataFilesSparkAction) + basicRewrite(table) + .option(RewriteDataFiles.REWRITE_JOB_ORDER, RewriteJobOrder.FILES_ASC.orderName()) + .binPack(); + List actual = + toGroupStream(table, jobOrderRewrite) + .mapToLong(RewriteFileGroup::numFiles) + .boxed() + .collect(Collectors.toList()); expected.sort(Comparator.naturalOrder()); Assert.assertEquals("Number of files order should be ascending", actual, expected); @@ -1126,21 +1181,23 @@ public void testRewriteJobOrderFilesDesc() { table.updateProperties().set(TableProperties.FORMAT_VERSION, "2").commit(); BaseRewriteDataFilesSparkAction basicRewrite = - (BaseRewriteDataFilesSparkAction) basicRewrite(table) - .binPack(); - List expected = toGroupStream(table, basicRewrite) - .mapToLong(RewriteFileGroup::numFiles) - .boxed() - .collect(Collectors.toList()); + (BaseRewriteDataFilesSparkAction) basicRewrite(table).binPack(); + List expected = + toGroupStream(table, basicRewrite) + .mapToLong(RewriteFileGroup::numFiles) + .boxed() + .collect(Collectors.toList()); BaseRewriteDataFilesSparkAction jobOrderRewrite = - (BaseRewriteDataFilesSparkAction) basicRewrite(table) - .option(RewriteDataFiles.REWRITE_JOB_ORDER, RewriteJobOrder.FILES_DESC.orderName()) - .binPack(); - List actual = toGroupStream(table, jobOrderRewrite) - .mapToLong(RewriteFileGroup::numFiles) - .boxed() - .collect(Collectors.toList()); + (BaseRewriteDataFilesSparkAction) + basicRewrite(table) + .option(RewriteDataFiles.REWRITE_JOB_ORDER, 
RewriteJobOrder.FILES_DESC.orderName()) + .binPack(); + List actual = + toGroupStream(table, jobOrderRewrite) + .mapToLong(RewriteFileGroup::numFiles) + .boxed() + .collect(Collectors.toList()); expected.sort(Comparator.reverseOrder()); Assert.assertEquals("Number of files order should be descending", actual, expected); @@ -1148,8 +1205,8 @@ public void testRewriteJobOrderFilesDesc() { Assert.assertNotEquals("Number of files order should not be ascending", actual, expected); } - private Stream toGroupStream(Table table, - BaseRewriteDataFilesSparkAction rewrite) { + private Stream toGroupStream( + Table table, BaseRewriteDataFilesSparkAction rewrite) { rewrite.validateAndInitOptions(); Map>> fileGroupsByPartition = rewrite.planFileGroups(table.currentSnapshot().snapshotId()); @@ -1159,9 +1216,8 @@ private Stream toGroupStream(Table table, } protected List currentData() { - return rowsToJava(spark.read().format("iceberg").load(tableLocation) - .sort("c1", "c2", "c3") - .collectAsList()); + return rowsToJava( + spark.read().format("iceberg").load(tableLocation).sort("c1", "c2", "c3").collectAsList()); } protected long testDataSize(Table table) { @@ -1183,83 +1239,102 @@ protected void shouldHaveFiles(Table table, int numExpected) { protected void shouldHaveSnapshots(Table table, int expectedSnapshots) { table.refresh(); int actualSnapshots = Iterables.size(table.snapshots()); - Assert.assertEquals("Table did not have the expected number of snapshots", - expectedSnapshots, actualSnapshots); + Assert.assertEquals( + "Table did not have the expected number of snapshots", expectedSnapshots, actualSnapshots); } protected void shouldHaveNoOrphans(Table table) { - Assert.assertEquals("Should not have found any orphan files", ImmutableList.of(), - actions().deleteOrphanFiles(table) + Assert.assertEquals( + "Should not have found any orphan files", + ImmutableList.of(), + actions() + .deleteOrphanFiles(table) .olderThan(System.currentTimeMillis()) .execute() .orphanFileLocations()); } protected void shouldHaveACleanCache(Table table) { - Assert.assertEquals("Should not have any entries in cache", ImmutableSet.of(), - cacheContents(table)); + Assert.assertEquals( + "Should not have any entries in cache", ImmutableSet.of(), cacheContents(table)); } protected void shouldHaveLastCommitSorted(Table table, String column) { - List, Pair>> - overlappingFiles = checkForOverlappingFiles(table, column); + List, Pair>> overlappingFiles = checkForOverlappingFiles(table, column); Assert.assertEquals("Found overlapping files", Collections.emptyList(), overlappingFiles); } protected void shouldHaveLastCommitUnsorted(Table table, String column) { - List, Pair>> - overlappingFiles = checkForOverlappingFiles(table, column); + List, Pair>> overlappingFiles = checkForOverlappingFiles(table, column); Assert.assertNotEquals("Found no overlapping files", Collections.emptyList(), overlappingFiles); } private Pair boundsOf(DataFile file, NestedField field, Class javaClass) { int columnId = field.fieldId(); - return Pair.of(javaClass.cast(Conversions.fromByteBuffer(field.type(), file.lowerBounds().get(columnId))), + return Pair.of( + javaClass.cast(Conversions.fromByteBuffer(field.type(), file.lowerBounds().get(columnId))), javaClass.cast(Conversions.fromByteBuffer(field.type(), file.upperBounds().get(columnId)))); } - private List, Pair>> checkForOverlappingFiles(Table table, String column) { + private List, Pair>> checkForOverlappingFiles( + Table table, String column) { table.refresh(); NestedField field = 
table.schema().caseInsensitiveFindField(column); Class javaClass = (Class) field.type().typeId().javaClass(); Snapshot snapshot = table.currentSnapshot(); - Map> filesByPartition = Streams.stream(snapshot.addedDataFiles(table.io())) - .collect(Collectors.groupingBy(DataFile::partition)); + Map> filesByPartition = + Streams.stream(snapshot.addedDataFiles(table.io())) + .collect(Collectors.groupingBy(DataFile::partition)); Stream, Pair>> overlaps = - filesByPartition.entrySet().stream().flatMap(entry -> { - List datafiles = entry.getValue(); - Preconditions.checkArgument(datafiles.size() > 1, - "This test is checking for overlaps in a situation where no overlaps can actually occur because the " + - "partition %s does not contain multiple datafiles", entry.getKey()); - - List, Pair>> boundComparisons = Lists.cartesianProduct(datafiles, datafiles).stream() - .filter(tuple -> tuple.get(0) != tuple.get(1)) - .map(tuple -> Pair.of(boundsOf(tuple.get(0), field, javaClass), boundsOf(tuple.get(1), field, javaClass))) - .collect(Collectors.toList()); - - Comparator comparator = Comparators.forType(field.type().asPrimitiveType()); - - List, Pair>> overlappingFiles = boundComparisons.stream() - .filter(filePair -> { - Pair left = filePair.first(); - T lMin = left.first(); - T lMax = left.second(); - Pair right = filePair.second(); - T rMin = right.first(); - T rMax = right.second(); - boolean boundsDoNotOverlap = - // Min and Max of a range are greater than or equal to the max value of the other range - (comparator.compare(rMax, lMax) >= 0 && comparator.compare(rMin, lMax) >= 0) || - (comparator.compare(lMax, rMax) >= 0 && comparator.compare(lMin, rMax) >= 0); - - return !boundsDoNotOverlap; - }).collect(Collectors.toList()); - return overlappingFiles.stream(); - }); + filesByPartition.entrySet().stream() + .flatMap( + entry -> { + List datafiles = entry.getValue(); + Preconditions.checkArgument( + datafiles.size() > 1, + "This test is checking for overlaps in a situation where no overlaps can actually occur because the " + + "partition %s does not contain multiple datafiles", + entry.getKey()); + + List, Pair>> boundComparisons = + Lists.cartesianProduct(datafiles, datafiles).stream() + .filter(tuple -> tuple.get(0) != tuple.get(1)) + .map( + tuple -> + Pair.of( + boundsOf(tuple.get(0), field, javaClass), + boundsOf(tuple.get(1), field, javaClass))) + .collect(Collectors.toList()); + + Comparator comparator = Comparators.forType(field.type().asPrimitiveType()); + + List, Pair>> overlappingFiles = + boundComparisons.stream() + .filter( + filePair -> { + Pair left = filePair.first(); + T lMin = left.first(); + T lMax = left.second(); + Pair right = filePair.second(); + T rMin = right.first(); + T rMax = right.second(); + boolean boundsDoNotOverlap = + // Min and Max of a range are greater than or equal to the max + // value of the other range + (comparator.compare(rMax, lMax) >= 0 + && comparator.compare(rMin, lMax) >= 0) + || (comparator.compare(lMax, rMax) >= 0 + && comparator.compare(lMin, rMax) >= 0); + + return !boundsDoNotOverlap; + }) + .collect(Collectors.toList()); + return overlappingFiles.stream(); + }); return overlaps.collect(Collectors.toList()); } @@ -1268,13 +1343,17 @@ protected Table createTable() { PartitionSpec spec = PartitionSpec.unpartitioned(); Map options = Maps.newHashMap(); Table table = TABLES.create(SCHEMA, spec, options, tableLocation); - table.updateProperties().set(TableProperties.PARQUET_ROW_GROUP_SIZE_BYTES, Integer.toString(20 * 1024)).commit(); + table + 
.updateProperties() + .set(TableProperties.PARQUET_ROW_GROUP_SIZE_BYTES, Integer.toString(20 * 1024)) + .commit(); Assert.assertNull("Table must be empty", table.currentSnapshot()); return table; } /** * Create a table with a certain number of files, returns the size of a file + * * @param files number of files to create * @return the created table */ @@ -1284,12 +1363,9 @@ protected Table createTable(int files) { return table; } - protected Table createTablePartitioned(int partitions, int files, - int numRecords, Map options) { - PartitionSpec spec = PartitionSpec.builderFor(SCHEMA) - .identity("c1") - .truncate("c2", 2) - .build(); + protected Table createTablePartitioned( + int partitions, int files, int numRecords, Map options) { + PartitionSpec spec = PartitionSpec.builderFor(SCHEMA).identity("c1").truncate("c2", 2).build(); Table table = TABLES.create(SCHEMA, spec, options, tableLocation); Assert.assertNull("Table must be empty", table.currentSnapshot()); @@ -1303,7 +1379,11 @@ protected Table createTablePartitioned(int partitions, int files) { protected int averageFileSize(Table table) { table.refresh(); - return (int) Streams.stream(table.newScan().planFiles()).mapToLong(FileScanTask::length).average().getAsDouble(); + return (int) + Streams.stream(table.newScan().planFiles()) + .mapToLong(FileScanTask::length) + .average() + .getAsDouble(); } private void writeRecords(int files, int numRecords) { @@ -1314,20 +1394,21 @@ private void writeRecords(int files, int numRecords, int partitions) { List records = Lists.newArrayList(); int rowDimension = (int) Math.ceil(Math.sqrt(numRecords)); List> data = - IntStream.range(0, rowDimension).boxed().flatMap(x -> - IntStream.range(0, rowDimension).boxed().map(y -> Pair.of(x, y))) + IntStream.range(0, rowDimension) + .boxed() + .flatMap(x -> IntStream.range(0, rowDimension).boxed().map(y -> Pair.of(x, y))) .collect(Collectors.toList()); Collections.shuffle(data, new Random(42)); if (partitions > 0) { - data.forEach(i -> records.add(new ThreeColumnRecord( - i.first() % partitions, - "foo" + i.first(), - "bar" + i.second()))); + data.forEach( + i -> + records.add( + new ThreeColumnRecord( + i.first() % partitions, "foo" + i.first(), "bar" + i.second()))); } else { - data.forEach(i -> records.add(new ThreeColumnRecord( - i.first(), - "foo" + i.first(), - "bar" + i.second()))); + data.forEach( + i -> + records.add(new ThreeColumnRecord(i.first(), "foo" + i.first(), "bar" + i.second()))); } Dataset df = spark.createDataFrame(records, ThreeColumnRecord.class).repartition(files); writeDF(df); @@ -1342,24 +1423,31 @@ private void writeDF(Dataset df) { .save(tableLocation); } - private List writePosDeletesToFile(Table table, DataFile dataFile, int outputDeleteFiles) { - return writePosDeletes(table, dataFile.partition(), dataFile.path().toString(), outputDeleteFiles); + private List writePosDeletesToFile( + Table table, DataFile dataFile, int outputDeleteFiles) { + return writePosDeletes( + table, dataFile.partition(), dataFile.path().toString(), outputDeleteFiles); } - private List writePosDeletes(Table table, StructLike partition, String path, int outputDeleteFiles) { + private List writePosDeletes( + Table table, StructLike partition, String path, int outputDeleteFiles) { List results = Lists.newArrayList(); int rowPosition = 0; for (int file = 0; file < outputDeleteFiles; file++) { - OutputFile outputFile = table.io().newOutputFile( - table.locationProvider().newDataLocation(UUID.randomUUID().toString())); - EncryptedOutputFile encryptedOutputFile 
= EncryptedFiles.encryptedOutput( - outputFile, EncryptionKeyMetadata.EMPTY); - - GenericAppenderFactory appenderFactory = new GenericAppenderFactory( - table.schema(), table.spec(), null, null, null); - PositionDeleteWriter posDeleteWriter = appenderFactory - .set(TableProperties.DEFAULT_WRITE_METRICS_MODE, "full") - .newPosDeleteWriter(encryptedOutputFile, FileFormat.PARQUET, partition); + OutputFile outputFile = + table + .io() + .newOutputFile( + table.locationProvider().newDataLocation(UUID.randomUUID().toString())); + EncryptedOutputFile encryptedOutputFile = + EncryptedFiles.encryptedOutput(outputFile, EncryptionKeyMetadata.EMPTY); + + GenericAppenderFactory appenderFactory = + new GenericAppenderFactory(table.schema(), table.spec(), null, null, null); + PositionDeleteWriter posDeleteWriter = + appenderFactory + .set(TableProperties.DEFAULT_WRITE_METRICS_MODE, "full") + .newPosDeleteWriter(encryptedOutputFile, FileFormat.PARQUET, partition); posDeleteWriter.delete(path, rowPosition); try { diff --git a/spark/v3.0/spark/src/test/java/org/apache/iceberg/spark/actions/TestRewriteManifestsAction.java b/spark/v3.0/spark/src/test/java/org/apache/iceberg/spark/actions/TestRewriteManifestsAction.java index f30251e74001..4b50ea0c29f3 100644 --- a/spark/v3.0/spark/src/test/java/org/apache/iceberg/spark/actions/TestRewriteManifestsAction.java +++ b/spark/v3.0/spark/src/test/java/org/apache/iceberg/spark/actions/TestRewriteManifestsAction.java @@ -16,9 +16,13 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.actions; +import static org.apache.iceberg.types.Types.NestedField.optional; +import static org.mockito.Mockito.doAnswer; +import static org.mockito.Mockito.spy; +import static org.mockito.Mockito.when; + import java.io.File; import java.io.IOException; import java.util.List; @@ -53,28 +57,22 @@ import org.junit.runner.RunWith; import org.junit.runners.Parameterized; -import static org.apache.iceberg.types.Types.NestedField.optional; -import static org.mockito.Mockito.doAnswer; -import static org.mockito.Mockito.spy; -import static org.mockito.Mockito.when; - @RunWith(Parameterized.class) public class TestRewriteManifestsAction extends SparkTestBase { private static final HadoopTables TABLES = new HadoopTables(new Configuration()); - private static final Schema SCHEMA = new Schema( - optional(1, "c1", Types.IntegerType.get()), - optional(2, "c2", Types.StringType.get()), - optional(3, "c3", Types.StringType.get()) - ); + private static final Schema SCHEMA = + new Schema( + optional(1, "c1", Types.IntegerType.get()), + optional(2, "c2", Types.StringType.get()), + optional(3, "c3", Types.StringType.get())); @Parameterized.Parameters(name = "snapshotIdInheritanceEnabled = {0}") public static Object[] parameters() { - return new Object[] { "true", "false" }; + return new Object[] {"true", "false"}; } - @Rule - public TemporaryFolder temp = new TemporaryFolder(); + @Rule public TemporaryFolder temp = new TemporaryFolder(); private final String snapshotIdInheritanceEnabled; private String tableLocation = null; @@ -100,7 +98,8 @@ public void testRewriteManifestsEmptyTable() throws IOException { SparkActions actions = SparkActions.get(); - actions.rewriteManifests(table) + actions + .rewriteManifests(table) .rewriteIf(manifest -> true) .stagingLocation(temp.newFolder().toString()) .execute(); @@ -115,16 +114,15 @@ public void testRewriteSmallManifestsNonPartitionedTable() { 
options.put(TableProperties.SNAPSHOT_ID_INHERITANCE_ENABLED, snapshotIdInheritanceEnabled); Table table = TABLES.create(SCHEMA, spec, options, tableLocation); - List records1 = Lists.newArrayList( - new ThreeColumnRecord(1, null, "AAAA"), - new ThreeColumnRecord(1, "BBBBBBBBBB", "BBBB") - ); + List records1 = + Lists.newArrayList( + new ThreeColumnRecord(1, null, "AAAA"), new ThreeColumnRecord(1, "BBBBBBBBBB", "BBBB")); writeRecords(records1); - List records2 = Lists.newArrayList( - new ThreeColumnRecord(2, "CCCCCCCCCC", "CCCC"), - new ThreeColumnRecord(2, "DDDDDDDDDD", "DDDD") - ); + List records2 = + Lists.newArrayList( + new ThreeColumnRecord(2, "CCCCCCCCCC", "CCCC"), + new ThreeColumnRecord(2, "DDDDDDDDDD", "DDDD")); writeRecords(records2); table.refresh(); @@ -134,12 +132,13 @@ public void testRewriteSmallManifestsNonPartitionedTable() { SparkActions actions = SparkActions.get(); - RewriteManifests.Result result = actions.rewriteManifests(table) - .rewriteIf(manifest -> true) - .execute(); + RewriteManifests.Result result = + actions.rewriteManifests(table).rewriteIf(manifest -> true).execute(); - Assert.assertEquals("Action should rewrite 2 manifests", 2, Iterables.size(result.rewrittenManifests())); - Assert.assertEquals("Action should add 1 manifests", 1, Iterables.size(result.addedManifests())); + Assert.assertEquals( + "Action should rewrite 2 manifests", 2, Iterables.size(result.rewrittenManifests())); + Assert.assertEquals( + "Action should add 1 manifests", 1, Iterables.size(result.addedManifests())); table.refresh(); @@ -155,9 +154,8 @@ public void testRewriteSmallManifestsNonPartitionedTable() { expectedRecords.addAll(records2); Dataset resultDF = spark.read().format("iceberg").load(tableLocation); - List actualRecords = resultDF.sort("c1", "c2") - .as(Encoders.bean(ThreeColumnRecord.class)) - .collectAsList(); + List actualRecords = + resultDF.sort("c1", "c2").as(Encoders.bean(ThreeColumnRecord.class)).collectAsList(); Assert.assertEquals("Rows must match", expectedRecords, actualRecords); } @@ -169,16 +167,15 @@ public void testRewriteManifestsWithCommitStateUnknownException() { options.put(TableProperties.SNAPSHOT_ID_INHERITANCE_ENABLED, snapshotIdInheritanceEnabled); Table table = TABLES.create(SCHEMA, spec, options, tableLocation); - List records1 = Lists.newArrayList( - new ThreeColumnRecord(1, null, "AAAA"), - new ThreeColumnRecord(1, "BBBBBBBBBB", "BBBB") - ); + List records1 = + Lists.newArrayList( + new ThreeColumnRecord(1, null, "AAAA"), new ThreeColumnRecord(1, "BBBBBBBBBB", "BBBB")); writeRecords(records1); - List records2 = Lists.newArrayList( - new ThreeColumnRecord(2, "CCCCCCCCCC", "CCCC"), - new ThreeColumnRecord(2, "DDDDDDDDDD", "DDDD") - ); + List records2 = + Lists.newArrayList( + new ThreeColumnRecord(2, "CCCCCCCCCC", "CCCC"), + new ThreeColumnRecord(2, "DDDDDDDDDD", "DDDD")); writeRecords(records2); table.refresh(); @@ -191,15 +188,19 @@ public void testRewriteManifestsWithCommitStateUnknownException() { // create a spy which would throw a CommitStateUnknownException after successful commit. 
org.apache.iceberg.RewriteManifests newRewriteManifests = table.rewriteManifests(); org.apache.iceberg.RewriteManifests spyNewRewriteManifests = spy(newRewriteManifests); - doAnswer(invocation -> { - newRewriteManifests.commit(); - throw new CommitStateUnknownException(new RuntimeException("Datacenter on Fire")); - }).when(spyNewRewriteManifests).commit(); + doAnswer( + invocation -> { + newRewriteManifests.commit(); + throw new CommitStateUnknownException(new RuntimeException("Datacenter on Fire")); + }) + .when(spyNewRewriteManifests) + .commit(); Table spyTable = spy(table); when(spyTable.rewriteManifests()).thenReturn(spyNewRewriteManifests); - AssertHelpers.assertThrowsCause("Should throw a Commit State Unknown Exception", + AssertHelpers.assertThrowsCause( + "Should throw a Commit State Unknown Exception", RuntimeException.class, "Datacenter on Fire", () -> actions.rewriteManifests(spyTable).rewriteIf(manifest -> true).execute()); @@ -219,45 +220,40 @@ public void testRewriteManifestsWithCommitStateUnknownException() { expectedRecords.addAll(records2); Dataset resultDF = spark.read().format("iceberg").load(tableLocation); - List actualRecords = resultDF.sort("c1", "c2") - .as(Encoders.bean(ThreeColumnRecord.class)) - .collectAsList(); + List actualRecords = + resultDF.sort("c1", "c2").as(Encoders.bean(ThreeColumnRecord.class)).collectAsList(); Assert.assertEquals("Rows must match", expectedRecords, actualRecords); } @Test public void testRewriteSmallManifestsPartitionedTable() { - PartitionSpec spec = PartitionSpec.builderFor(SCHEMA) - .identity("c1") - .truncate("c2", 2) - .build(); + PartitionSpec spec = PartitionSpec.builderFor(SCHEMA).identity("c1").truncate("c2", 2).build(); Map options = Maps.newHashMap(); options.put(TableProperties.SNAPSHOT_ID_INHERITANCE_ENABLED, snapshotIdInheritanceEnabled); Table table = TABLES.create(SCHEMA, spec, options, tableLocation); - List records1 = Lists.newArrayList( - new ThreeColumnRecord(1, null, "AAAA"), - new ThreeColumnRecord(1, "BBBBBBBBBB", "BBBB") - ); + List records1 = + Lists.newArrayList( + new ThreeColumnRecord(1, null, "AAAA"), new ThreeColumnRecord(1, "BBBBBBBBBB", "BBBB")); writeRecords(records1); - List records2 = Lists.newArrayList( - new ThreeColumnRecord(2, "CCCCCCCCCC", "CCCC"), - new ThreeColumnRecord(2, "DDDDDDDDDD", "DDDD") - ); + List records2 = + Lists.newArrayList( + new ThreeColumnRecord(2, "CCCCCCCCCC", "CCCC"), + new ThreeColumnRecord(2, "DDDDDDDDDD", "DDDD")); writeRecords(records2); - List records3 = Lists.newArrayList( - new ThreeColumnRecord(3, "EEEEEEEEEE", "EEEE"), - new ThreeColumnRecord(3, "FFFFFFFFFF", "FFFF") - ); + List records3 = + Lists.newArrayList( + new ThreeColumnRecord(3, "EEEEEEEEEE", "EEEE"), + new ThreeColumnRecord(3, "FFFFFFFFFF", "FFFF")); writeRecords(records3); - List records4 = Lists.newArrayList( - new ThreeColumnRecord(4, "GGGGGGGGGG", "GGGG"), - new ThreeColumnRecord(4, "HHHHHHHHHG", "HHHH") - ); + List records4 = + Lists.newArrayList( + new ThreeColumnRecord(4, "GGGGGGGGGG", "GGGG"), + new ThreeColumnRecord(4, "HHHHHHHHHG", "HHHH")); writeRecords(records4); table.refresh(); @@ -271,16 +267,18 @@ public void testRewriteSmallManifestsPartitionedTable() { long manifestEntrySizeBytes = computeManifestEntrySizeBytes(manifests); long targetManifestSizeBytes = (long) (1.05 * 4 * manifestEntrySizeBytes); - table.updateProperties() + table + .updateProperties() .set(TableProperties.MANIFEST_TARGET_SIZE_BYTES, String.valueOf(targetManifestSizeBytes)) .commit(); - RewriteManifests.Result result = 
actions.rewriteManifests(table) - .rewriteIf(manifest -> true) - .execute(); + RewriteManifests.Result result = + actions.rewriteManifests(table).rewriteIf(manifest -> true).execute(); - Assert.assertEquals("Action should rewrite 4 manifests", 4, Iterables.size(result.rewrittenManifests())); - Assert.assertEquals("Action should add 2 manifests", 2, Iterables.size(result.addedManifests())); + Assert.assertEquals( + "Action should rewrite 4 manifests", 4, Iterables.size(result.rewrittenManifests())); + Assert.assertEquals( + "Action should add 2 manifests", 2, Iterables.size(result.addedManifests())); table.refresh(); @@ -302,32 +300,29 @@ public void testRewriteSmallManifestsPartitionedTable() { expectedRecords.addAll(records4); Dataset resultDF = spark.read().format("iceberg").load(tableLocation); - List actualRecords = resultDF.sort("c1", "c2") - .as(Encoders.bean(ThreeColumnRecord.class)) - .collectAsList(); + List actualRecords = + resultDF.sort("c1", "c2").as(Encoders.bean(ThreeColumnRecord.class)).collectAsList(); Assert.assertEquals("Rows must match", expectedRecords, actualRecords); } @Test public void testRewriteImportedManifests() throws IOException { - PartitionSpec spec = PartitionSpec.builderFor(SCHEMA) - .identity("c3") - .build(); + PartitionSpec spec = PartitionSpec.builderFor(SCHEMA).identity("c3").build(); Map options = Maps.newHashMap(); options.put(TableProperties.SNAPSHOT_ID_INHERITANCE_ENABLED, snapshotIdInheritanceEnabled); Table table = TABLES.create(SCHEMA, spec, options, tableLocation); - List records = Lists.newArrayList( - new ThreeColumnRecord(1, null, "AAAA"), - new ThreeColumnRecord(1, "BBBBBBBBBB", "BBBB") - ); + List records = + Lists.newArrayList( + new ThreeColumnRecord(1, null, "AAAA"), new ThreeColumnRecord(1, "BBBBBBBBBB", "BBBB")); File parquetTableDir = temp.newFolder("parquet_table"); String parquetTableLocation = parquetTableDir.toURI().toString(); try { Dataset inputDF = spark.createDataFrame(records, ThreeColumnRecord.class); - inputDF.select("c1", "c2", "c3") + inputDF + .select("c1", "c2", "c3") .write() .format("parquet") .mode("overwrite") @@ -336,20 +331,26 @@ public void testRewriteImportedManifests() throws IOException { .saveAsTable("parquet_table"); File stagingDir = temp.newFolder("staging-dir"); - SparkTableUtil.importSparkTable(spark, new TableIdentifier("parquet_table"), table, stagingDir.toString()); + SparkTableUtil.importSparkTable( + spark, new TableIdentifier("parquet_table"), table, stagingDir.toString()); Snapshot snapshot = table.currentSnapshot(); SparkActions actions = SparkActions.get(); - RewriteManifests.Result result = actions.rewriteManifests(table) - .rewriteIf(manifest -> true) - .stagingLocation(temp.newFolder().toString()) - .execute(); + RewriteManifests.Result result = + actions + .rewriteManifests(table) + .rewriteIf(manifest -> true) + .stagingLocation(temp.newFolder().toString()) + .execute(); - Assert.assertEquals("Action should rewrite all manifests", - snapshot.allManifests(table.io()), result.rewrittenManifests()); - Assert.assertEquals("Action should add 1 manifest", 1, Iterables.size(result.addedManifests())); + Assert.assertEquals( + "Action should rewrite all manifests", + snapshot.allManifests(table.io()), + result.rewrittenManifests()); + Assert.assertEquals( + "Action should add 1 manifest", 1, Iterables.size(result.addedManifests())); } finally { spark.sql("DROP TABLE parquet_table"); @@ -358,9 +359,7 @@ public void testRewriteImportedManifests() throws IOException { @Test public void 
testRewriteLargeManifestsPartitionedTable() throws IOException { - PartitionSpec spec = PartitionSpec.builderFor(SCHEMA) - .identity("c3") - .build(); + PartitionSpec spec = PartitionSpec.builderFor(SCHEMA).identity("c3").build(); Map options = Maps.newHashMap(); options.put(TableProperties.SNAPSHOT_ID_INHERITANCE_ENABLED, snapshotIdInheritanceEnabled); Table table = TABLES.create(SCHEMA, spec, options, tableLocation); @@ -380,19 +379,26 @@ public void testRewriteLargeManifestsPartitionedTable() throws IOException { Assert.assertEquals("Should have 1 manifests before rewrite", 1, manifests.size()); // set the target manifest size to a small value to force splitting records into multiple files - table.updateProperties() - .set(TableProperties.MANIFEST_TARGET_SIZE_BYTES, String.valueOf(manifests.get(0).length() / 2)) + table + .updateProperties() + .set( + TableProperties.MANIFEST_TARGET_SIZE_BYTES, + String.valueOf(manifests.get(0).length() / 2)) .commit(); SparkActions actions = SparkActions.get(); - RewriteManifests.Result result = actions.rewriteManifests(table) - .rewriteIf(manifest -> true) - .stagingLocation(temp.newFolder().toString()) - .execute(); + RewriteManifests.Result result = + actions + .rewriteManifests(table) + .rewriteIf(manifest -> true) + .stagingLocation(temp.newFolder().toString()) + .execute(); - Assert.assertEquals("Action should rewrite 1 manifest", 1, Iterables.size(result.rewrittenManifests())); - Assert.assertEquals("Action should add 2 manifests", 2, Iterables.size(result.addedManifests())); + Assert.assertEquals( + "Action should rewrite 1 manifest", 1, Iterables.size(result.rewrittenManifests())); + Assert.assertEquals( + "Action should add 2 manifests", 2, Iterables.size(result.addedManifests())); table.refresh(); @@ -400,33 +406,28 @@ public void testRewriteLargeManifestsPartitionedTable() throws IOException { Assert.assertEquals("Should have 2 manifests after rewrite", 2, newManifests.size()); Dataset resultDF = spark.read().format("iceberg").load(tableLocation); - List actualRecords = resultDF.sort("c1", "c2") - .as(Encoders.bean(ThreeColumnRecord.class)) - .collectAsList(); + List actualRecords = + resultDF.sort("c1", "c2").as(Encoders.bean(ThreeColumnRecord.class)).collectAsList(); Assert.assertEquals("Rows must match", records, actualRecords); } @Test public void testRewriteManifestsWithPredicate() throws IOException { - PartitionSpec spec = PartitionSpec.builderFor(SCHEMA) - .identity("c1") - .truncate("c2", 2) - .build(); + PartitionSpec spec = PartitionSpec.builderFor(SCHEMA).identity("c1").truncate("c2", 2).build(); Map options = Maps.newHashMap(); options.put(TableProperties.SNAPSHOT_ID_INHERITANCE_ENABLED, snapshotIdInheritanceEnabled); Table table = TABLES.create(SCHEMA, spec, options, tableLocation); - List records1 = Lists.newArrayList( - new ThreeColumnRecord(1, null, "AAAA"), - new ThreeColumnRecord(1, "BBBBBBBBBB", "BBBB") - ); + List records1 = + Lists.newArrayList( + new ThreeColumnRecord(1, null, "AAAA"), new ThreeColumnRecord(1, "BBBBBBBBBB", "BBBB")); writeRecords(records1); - List records2 = Lists.newArrayList( - new ThreeColumnRecord(2, "CCCCCCCCCC", "CCCC"), - new ThreeColumnRecord(2, "DDDDDDDDDD", "DDDD") - ); + List records2 = + Lists.newArrayList( + new ThreeColumnRecord(2, "CCCCCCCCCC", "CCCC"), + new ThreeColumnRecord(2, "DDDDDDDDDD", "DDDD")); writeRecords(records2); table.refresh(); @@ -437,14 +438,18 @@ public void testRewriteManifestsWithPredicate() throws IOException { SparkActions actions = SparkActions.get(); // rewrite 
only the first manifest without caching - RewriteManifests.Result result = actions.rewriteManifests(table) - .rewriteIf(manifest -> manifest.path().equals(manifests.get(0).path())) - .stagingLocation(temp.newFolder().toString()) - .option("use-caching", "false") - .execute(); - - Assert.assertEquals("Action should rewrite 1 manifest", 1, Iterables.size(result.rewrittenManifests())); - Assert.assertEquals("Action should add 1 manifests", 1, Iterables.size(result.addedManifests())); + RewriteManifests.Result result = + actions + .rewriteManifests(table) + .rewriteIf(manifest -> manifest.path().equals(manifests.get(0).path())) + .stagingLocation(temp.newFolder().toString()) + .option("use-caching", "false") + .execute(); + + Assert.assertEquals( + "Action should rewrite 1 manifest", 1, Iterables.size(result.rewrittenManifests())); + Assert.assertEquals( + "Action should add 1 manifests", 1, Iterables.size(result.addedManifests())); table.refresh(); @@ -452,16 +457,16 @@ public void testRewriteManifestsWithPredicate() throws IOException { Assert.assertEquals("Should have 2 manifests after rewrite", 2, newManifests.size()); Assert.assertFalse("First manifest must be rewritten", newManifests.contains(manifests.get(0))); - Assert.assertTrue("Second manifest must not be rewritten", newManifests.contains(manifests.get(1))); + Assert.assertTrue( + "Second manifest must not be rewritten", newManifests.contains(manifests.get(1))); List expectedRecords = Lists.newArrayList(); expectedRecords.addAll(records1); expectedRecords.addAll(records2); Dataset resultDF = spark.read().format("iceberg").load(tableLocation); - List actualRecords = resultDF.sort("c1", "c2") - .as(Encoders.bean(ThreeColumnRecord.class)) - .collectAsList(); + List actualRecords = + resultDF.sort("c1", "c2").as(Encoders.bean(ThreeColumnRecord.class)).collectAsList(); Assert.assertEquals("Rows must match", expectedRecords, actualRecords); } @@ -472,11 +477,7 @@ private void writeRecords(List records) { } private void writeDF(Dataset df) { - df.select("c1", "c2", "c3") - .write() - .format("iceberg") - .mode("append") - .save(tableLocation); + df.select("c1", "c2", "c3").write().format("iceberg").mode("append").save(tableLocation); } private long computeManifestEntrySizeBytes(List manifests) { @@ -485,7 +486,8 @@ private long computeManifestEntrySizeBytes(List manifests) { for (ManifestFile manifest : manifests) { totalSize += manifest.length(); - numEntries += manifest.addedFilesCount() + manifest.existingFilesCount() + manifest.deletedFilesCount(); + numEntries += + manifest.addedFilesCount() + manifest.existingFilesCount() + manifest.deletedFilesCount(); } return totalSize / numEntries; diff --git a/spark/v3.0/spark/src/test/java/org/apache/iceberg/spark/data/AvroDataTest.java b/spark/v3.0/spark/src/test/java/org/apache/iceberg/spark/data/AvroDataTest.java index ead159477094..2e99ca98ba16 100644 --- a/spark/v3.0/spark/src/test/java/org/apache/iceberg/spark/data/AvroDataTest.java +++ b/spark/v3.0/spark/src/test/java/org/apache/iceberg/spark/data/AvroDataTest.java @@ -16,9 +16,11 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.spark.data; +import static org.apache.iceberg.types.Types.NestedField.optional; +import static org.apache.iceberg.types.Types.NestedField.required; + import java.io.IOException; import java.util.Map; import java.util.concurrent.atomic.AtomicInteger; @@ -38,34 +40,31 @@ import org.junit.Test; import org.junit.rules.TemporaryFolder; -import static org.apache.iceberg.types.Types.NestedField.optional; -import static org.apache.iceberg.types.Types.NestedField.required; - public abstract class AvroDataTest { protected abstract void writeAndValidate(Schema schema) throws IOException; - protected static final StructType SUPPORTED_PRIMITIVES = StructType.of( - required(100, "id", LongType.get()), - optional(101, "data", Types.StringType.get()), - required(102, "b", Types.BooleanType.get()), - optional(103, "i", Types.IntegerType.get()), - required(104, "l", LongType.get()), - optional(105, "f", Types.FloatType.get()), - required(106, "d", Types.DoubleType.get()), - optional(107, "date", Types.DateType.get()), - required(108, "ts", Types.TimestampType.withZone()), - required(110, "s", Types.StringType.get()), - // required(111, "uuid", Types.UUIDType.get()), - required(112, "fixed", Types.FixedType.ofLength(7)), - optional(113, "bytes", Types.BinaryType.get()), - required(114, "dec_9_0", Types.DecimalType.of(9, 0)), - required(115, "dec_11_2", Types.DecimalType.of(11, 2)), - required(116, "dec_38_10", Types.DecimalType.of(38, 10)) // spark's maximum precision - ); - - @Rule - public TemporaryFolder temp = new TemporaryFolder(); + protected static final StructType SUPPORTED_PRIMITIVES = + StructType.of( + required(100, "id", LongType.get()), + optional(101, "data", Types.StringType.get()), + required(102, "b", Types.BooleanType.get()), + optional(103, "i", Types.IntegerType.get()), + required(104, "l", LongType.get()), + optional(105, "f", Types.FloatType.get()), + required(106, "d", Types.DoubleType.get()), + optional(107, "date", Types.DateType.get()), + required(108, "ts", Types.TimestampType.withZone()), + required(110, "s", Types.StringType.get()), + // required(111, "uuid", Types.UUIDType.get()), + required(112, "fixed", Types.FixedType.ofLength(7)), + optional(113, "bytes", Types.BinaryType.get()), + required(114, "dec_9_0", Types.DecimalType.of(9, 0)), + required(115, "dec_11_2", Types.DecimalType.of(11, 2)), + required(116, "dec_38_10", Types.DecimalType.of(38, 10)) // spark's maximum precision + ); + + @Rule public TemporaryFolder temp = new TemporaryFolder(); @Test public void testSimpleStruct() throws IOException { @@ -74,162 +73,208 @@ public void testSimpleStruct() throws IOException { @Test public void testStructWithRequiredFields() throws IOException { - writeAndValidate(TypeUtil.assignIncreasingFreshIds(new Schema( - Lists.transform(SUPPORTED_PRIMITIVES.fields(), Types.NestedField::asRequired)))); + writeAndValidate( + TypeUtil.assignIncreasingFreshIds( + new Schema( + Lists.transform(SUPPORTED_PRIMITIVES.fields(), Types.NestedField::asRequired)))); } @Test public void testStructWithOptionalFields() throws IOException { - writeAndValidate(TypeUtil.assignIncreasingFreshIds(new Schema( - Lists.transform(SUPPORTED_PRIMITIVES.fields(), Types.NestedField::asOptional)))); + writeAndValidate( + TypeUtil.assignIncreasingFreshIds( + new Schema( + Lists.transform(SUPPORTED_PRIMITIVES.fields(), Types.NestedField::asOptional)))); } @Test public void testNestedStruct() throws IOException { - writeAndValidate(TypeUtil.assignIncreasingFreshIds(new Schema(required(1, 
"struct", SUPPORTED_PRIMITIVES)))); + writeAndValidate( + TypeUtil.assignIncreasingFreshIds(new Schema(required(1, "struct", SUPPORTED_PRIMITIVES)))); } @Test public void testArray() throws IOException { - Schema schema = new Schema( - required(0, "id", LongType.get()), - optional(1, "data", ListType.ofOptional(2, Types.StringType.get()))); + Schema schema = + new Schema( + required(0, "id", LongType.get()), + optional(1, "data", ListType.ofOptional(2, Types.StringType.get()))); writeAndValidate(schema); } @Test public void testArrayOfStructs() throws IOException { - Schema schema = TypeUtil.assignIncreasingFreshIds(new Schema( - required(0, "id", LongType.get()), - optional(1, "data", ListType.ofOptional(2, SUPPORTED_PRIMITIVES)))); + Schema schema = + TypeUtil.assignIncreasingFreshIds( + new Schema( + required(0, "id", LongType.get()), + optional(1, "data", ListType.ofOptional(2, SUPPORTED_PRIMITIVES)))); writeAndValidate(schema); } @Test public void testMap() throws IOException { - Schema schema = new Schema( - required(0, "id", LongType.get()), - optional(1, "data", MapType.ofOptional(2, 3, - Types.StringType.get(), - Types.StringType.get()))); + Schema schema = + new Schema( + required(0, "id", LongType.get()), + optional( + 1, + "data", + MapType.ofOptional(2, 3, Types.StringType.get(), Types.StringType.get()))); writeAndValidate(schema); } @Test public void testNumericMapKey() throws IOException { - Schema schema = new Schema( - required(0, "id", LongType.get()), - optional(1, "data", MapType.ofOptional(2, 3, - Types.LongType.get(), - Types.StringType.get()))); + Schema schema = + new Schema( + required(0, "id", LongType.get()), + optional( + 1, "data", MapType.ofOptional(2, 3, Types.LongType.get(), Types.StringType.get()))); writeAndValidate(schema); } @Test public void testComplexMapKey() throws IOException { - Schema schema = new Schema( - required(0, "id", LongType.get()), - optional(1, "data", MapType.ofOptional(2, 3, - Types.StructType.of( - required(4, "i", Types.IntegerType.get()), - optional(5, "s", Types.StringType.get())), - Types.StringType.get()))); + Schema schema = + new Schema( + required(0, "id", LongType.get()), + optional( + 1, + "data", + MapType.ofOptional( + 2, + 3, + Types.StructType.of( + required(4, "i", Types.IntegerType.get()), + optional(5, "s", Types.StringType.get())), + Types.StringType.get()))); writeAndValidate(schema); } @Test public void testMapOfStructs() throws IOException { - Schema schema = TypeUtil.assignIncreasingFreshIds(new Schema( - required(0, "id", LongType.get()), - optional(1, "data", MapType.ofOptional(2, 3, - Types.StringType.get(), - SUPPORTED_PRIMITIVES)))); + Schema schema = + TypeUtil.assignIncreasingFreshIds( + new Schema( + required(0, "id", LongType.get()), + optional( + 1, + "data", + MapType.ofOptional(2, 3, Types.StringType.get(), SUPPORTED_PRIMITIVES)))); writeAndValidate(schema); } @Test public void testMixedTypes() throws IOException { - StructType structType = StructType.of( - required(0, "id", LongType.get()), - optional(1, "list_of_maps", - ListType.ofOptional(2, MapType.ofOptional(3, 4, - Types.StringType.get(), - SUPPORTED_PRIMITIVES))), - optional(5, "map_of_lists", - MapType.ofOptional(6, 7, - Types.StringType.get(), - ListType.ofOptional(8, SUPPORTED_PRIMITIVES))), - required(9, "list_of_lists", - ListType.ofOptional(10, ListType.ofOptional(11, SUPPORTED_PRIMITIVES))), - required(12, "map_of_maps", - MapType.ofOptional(13, 14, - Types.StringType.get(), - MapType.ofOptional(15, 16, + StructType structType = + 
StructType.of( + required(0, "id", LongType.get()), + optional( + 1, + "list_of_maps", + ListType.ofOptional( + 2, MapType.ofOptional(3, 4, Types.StringType.get(), SUPPORTED_PRIMITIVES))), + optional( + 5, + "map_of_lists", + MapType.ofOptional( + 6, 7, Types.StringType.get(), ListType.ofOptional(8, SUPPORTED_PRIMITIVES))), + required( + 9, + "list_of_lists", + ListType.ofOptional(10, ListType.ofOptional(11, SUPPORTED_PRIMITIVES))), + required( + 12, + "map_of_maps", + MapType.ofOptional( + 13, + 14, Types.StringType.get(), - SUPPORTED_PRIMITIVES))), - required(17, "list_of_struct_of_nested_types", ListType.ofOptional(19, StructType.of( - Types.NestedField.required(20, "m1", MapType.ofOptional(21, 22, - Types.StringType.get(), - SUPPORTED_PRIMITIVES)), - Types.NestedField.optional(23, "l1", ListType.ofRequired(24, SUPPORTED_PRIMITIVES)), - Types.NestedField.required(25, "l2", ListType.ofRequired(26, SUPPORTED_PRIMITIVES)), - Types.NestedField.optional(27, "m2", MapType.ofOptional(28, 29, - Types.StringType.get(), - SUPPORTED_PRIMITIVES)) - ))) - ); - - Schema schema = new Schema(TypeUtil.assignFreshIds(structType, new AtomicInteger(0)::incrementAndGet) - .asStructType().fields()); + MapType.ofOptional(15, 16, Types.StringType.get(), SUPPORTED_PRIMITIVES))), + required( + 17, + "list_of_struct_of_nested_types", + ListType.ofOptional( + 19, + StructType.of( + Types.NestedField.required( + 20, + "m1", + MapType.ofOptional( + 21, 22, Types.StringType.get(), SUPPORTED_PRIMITIVES)), + Types.NestedField.optional( + 23, "l1", ListType.ofRequired(24, SUPPORTED_PRIMITIVES)), + Types.NestedField.required( + 25, "l2", ListType.ofRequired(26, SUPPORTED_PRIMITIVES)), + Types.NestedField.optional( + 27, + "m2", + MapType.ofOptional( + 28, 29, Types.StringType.get(), SUPPORTED_PRIMITIVES)))))); + + Schema schema = + new Schema( + TypeUtil.assignFreshIds(structType, new AtomicInteger(0)::incrementAndGet) + .asStructType() + .fields()); writeAndValidate(schema); } @Test public void testTimestampWithoutZone() throws IOException { - withSQLConf(ImmutableMap.of(SparkSQLProperties.HANDLE_TIMESTAMP_WITHOUT_TIMEZONE, "true"), () -> { - Schema schema = TypeUtil.assignIncreasingFreshIds(new Schema( - required(0, "id", LongType.get()), - optional(1, "ts_without_zone", Types.TimestampType.withoutZone()))); - - writeAndValidate(schema); - }); + withSQLConf( + ImmutableMap.of(SparkSQLProperties.HANDLE_TIMESTAMP_WITHOUT_TIMEZONE, "true"), + () -> { + Schema schema = + TypeUtil.assignIncreasingFreshIds( + new Schema( + required(0, "id", LongType.get()), + optional(1, "ts_without_zone", Types.TimestampType.withoutZone()))); + + writeAndValidate(schema); + }); } protected void withSQLConf(Map conf, Action action) throws IOException { SQLConf sqlConf = SQLConf.get(); Map currentConfValues = Maps.newHashMap(); - conf.keySet().forEach(confKey -> { - if (sqlConf.contains(confKey)) { - String currentConfValue = sqlConf.getConfString(confKey); - currentConfValues.put(confKey, currentConfValue); - } - }); - - conf.forEach((confKey, confValue) -> { - if (SQLConf.staticConfKeys().contains(confKey)) { - throw new RuntimeException("Cannot modify the value of a static config: " + confKey); - } - sqlConf.setConfString(confKey, confValue); - }); + conf.keySet() + .forEach( + confKey -> { + if (sqlConf.contains(confKey)) { + String currentConfValue = sqlConf.getConfString(confKey); + currentConfValues.put(confKey, currentConfValue); + } + }); + + conf.forEach( + (confKey, confValue) -> { + if (SQLConf.staticConfKeys().contains(confKey)) 
{ + throw new RuntimeException("Cannot modify the value of a static config: " + confKey); + } + sqlConf.setConfString(confKey, confValue); + }); try { action.invoke(); } finally { - conf.forEach((confKey, confValue) -> { - if (currentConfValues.containsKey(confKey)) { - sqlConf.setConfString(confKey, currentConfValues.get(confKey)); - } else { - sqlConf.unsetConf(confKey); - } - }); + conf.forEach( + (confKey, confValue) -> { + if (currentConfValues.containsKey(confKey)) { + sqlConf.setConfString(confKey, currentConfValues.get(confKey)); + } else { + sqlConf.unsetConf(confKey); + } + }); } } diff --git a/spark/v3.0/spark/src/test/java/org/apache/iceberg/spark/data/GenericsHelpers.java b/spark/v3.0/spark/src/test/java/org/apache/iceberg/spark/data/GenericsHelpers.java index 46c95cef112d..a96e3b1f57f5 100644 --- a/spark/v3.0/spark/src/test/java/org/apache/iceberg/spark/data/GenericsHelpers.java +++ b/spark/v3.0/spark/src/test/java/org/apache/iceberg/spark/data/GenericsHelpers.java @@ -16,9 +16,12 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.data; +import static org.apache.iceberg.spark.SparkSchemaUtil.convert; +import static scala.collection.JavaConverters.mapAsJavaMapConverter; +import static scala.collection.JavaConverters.seqAsJavaListConverter; + import java.math.BigDecimal; import java.nio.ByteBuffer; import java.sql.Timestamp; @@ -48,13 +51,8 @@ import org.junit.Assert; import scala.collection.Seq; -import static org.apache.iceberg.spark.SparkSchemaUtil.convert; -import static scala.collection.JavaConverters.mapAsJavaMapConverter; -import static scala.collection.JavaConverters.seqAsJavaListConverter; - public class GenericsHelpers { - private GenericsHelpers() { - } + private GenericsHelpers() {} private static final OffsetDateTime EPOCH = Instant.ofEpochMilli(0L).atOffset(ZoneOffset.UTC); private static final LocalDate EPOCH_DAY = EPOCH.toLocalDate(); @@ -71,7 +69,8 @@ public static void assertEqualsSafe(Types.StructType struct, Record expected, Ro } } - private static void assertEqualsSafe(Types.ListType list, Collection expected, List actual) { + private static void assertEqualsSafe( + Types.ListType list, Collection expected, List actual) { Type elementType = list.elementType(); List expectedElements = Lists.newArrayList(expected); for (int i = 0; i < expectedElements.size(); i += 1) { @@ -82,11 +81,11 @@ private static void assertEqualsSafe(Types.ListType list, Collection expected } } - private static void assertEqualsSafe(Types.MapType map, - Map expected, Map actual) { + private static void assertEqualsSafe(Types.MapType map, Map expected, Map actual) { Type keyType = map.keyType(); Type valueType = map.valueType(); - Assert.assertEquals("Should have the same number of keys", expected.keySet().size(), actual.keySet().size()); + Assert.assertEquals( + "Should have the same number of keys", expected.keySet().size(), actual.keySet().size()); for (Object expectedKey : expected.keySet()) { Object matchingKey = null; @@ -120,22 +119,29 @@ private static void assertEqualsSafe(Type type, Object expected, Object actual) Assert.assertEquals("Primitive value should be equal to expected", expected, actual); break; case DATE: - Assertions.assertThat(expected).as("Should expect a LocalDate").isInstanceOf(LocalDate.class); + Assertions.assertThat(expected) + .as("Should expect a LocalDate") + .isInstanceOf(LocalDate.class); Assertions.assertThat(actual).as("Should be a Date").isInstanceOf(Date.class); - 
Assert.assertEquals("ISO-8601 date should be equal", expected.toString(), actual.toString()); + Assert.assertEquals( + "ISO-8601 date should be equal", expected.toString(), actual.toString()); break; case TIMESTAMP: Assertions.assertThat(actual).as("Should be a Timestamp").isInstanceOf(Timestamp.class); Timestamp ts = (Timestamp) actual; // milliseconds from nanos has already been added by getTime - OffsetDateTime actualTs = EPOCH.plusNanos( - (ts.getTime() * 1_000_000) + (ts.getNanos() % 1_000_000)); + OffsetDateTime actualTs = + EPOCH.plusNanos((ts.getTime() * 1_000_000) + (ts.getNanos() % 1_000_000)); Types.TimestampType timestampType = (Types.TimestampType) type; if (timestampType.shouldAdjustToUTC()) { - Assertions.assertThat(expected).as("Should expect an OffsetDateTime").isInstanceOf(OffsetDateTime.class); + Assertions.assertThat(expected) + .as("Should expect an OffsetDateTime") + .isInstanceOf(OffsetDateTime.class); Assert.assertEquals("Timestamp should be equal", expected, actualTs); } else { - Assertions.assertThat(expected).as("Should expect an LocalDateTime").isInstanceOf(LocalDateTime.class); + Assertions.assertThat(expected) + .as("Should expect an LocalDateTime") + .isInstanceOf(LocalDateTime.class); Assert.assertEquals("Timestamp should be equal", expected, actualTs.toLocalDateTime()); } break; @@ -146,23 +152,25 @@ private static void assertEqualsSafe(Type type, Object expected, Object actual) case UUID: Assertions.assertThat(expected).as("Should expect a UUID").isInstanceOf(UUID.class); Assertions.assertThat(actual).as("Should be a String").isInstanceOf(String.class); - Assert.assertEquals("UUID string representation should match", - expected.toString(), actual); + Assert.assertEquals("UUID string representation should match", expected.toString(), actual); break; case FIXED: Assertions.assertThat(expected).as("Should expect a byte[]").isInstanceOf(byte[].class); Assertions.assertThat(actual).as("Should be a byte[]").isInstanceOf(byte[].class); - Assert.assertArrayEquals("Bytes should match", - (byte[]) expected, (byte[]) actual); + Assert.assertArrayEquals("Bytes should match", (byte[]) expected, (byte[]) actual); break; case BINARY: - Assertions.assertThat(expected).as("Should expect a ByteBuffer").isInstanceOf(ByteBuffer.class); + Assertions.assertThat(expected) + .as("Should expect a ByteBuffer") + .isInstanceOf(ByteBuffer.class); Assertions.assertThat(actual).as("Should be a byte[]").isInstanceOf(byte[].class); - Assert.assertArrayEquals("Bytes should match", - ((ByteBuffer) expected).array(), (byte[]) actual); + Assert.assertArrayEquals( + "Bytes should match", ((ByteBuffer) expected).array(), (byte[]) actual); break; case DECIMAL: - Assertions.assertThat(expected).as("Should expect a BigDecimal").isInstanceOf(BigDecimal.class); + Assertions.assertThat(expected) + .as("Should expect a BigDecimal") + .isInstanceOf(BigDecimal.class); Assertions.assertThat(actual).as("Should be a BigDecimal").isInstanceOf(BigDecimal.class); Assert.assertEquals("BigDecimals should be equal", expected, actual); break; @@ -172,16 +180,20 @@ private static void assertEqualsSafe(Type type, Object expected, Object actual) assertEqualsSafe(type.asNestedType().asStructType(), (Record) expected, (Row) actual); break; case LIST: - Assertions.assertThat(expected).as("Should expect a Collection").isInstanceOf(Collection.class); + Assertions.assertThat(expected) + .as("Should expect a Collection") + .isInstanceOf(Collection.class); Assertions.assertThat(actual).as("Should be a 
Seq").isInstanceOf(Seq.class); List asList = seqAsJavaListConverter((Seq) actual).asJava(); assertEqualsSafe(type.asNestedType().asListType(), (Collection) expected, asList); break; case MAP: Assertions.assertThat(expected).as("Should expect a Collection").isInstanceOf(Map.class); - Assertions.assertThat(actual).as("Should be a Map").isInstanceOf(scala.collection.Map.class); - Map asMap = mapAsJavaMapConverter( - (scala.collection.Map) actual).asJava(); + Assertions.assertThat(actual) + .as("Should be a Map") + .isInstanceOf(scala.collection.Map.class); + Map asMap = + mapAsJavaMapConverter((scala.collection.Map) actual).asJava(); assertEqualsSafe(type.asNestedType().asMapType(), (Map) expected, asMap); break; case TIME: @@ -190,7 +202,8 @@ private static void assertEqualsSafe(Type type, Object expected, Object actual) } } - public static void assertEqualsUnsafe(Types.StructType struct, Record expected, InternalRow actual) { + public static void assertEqualsUnsafe( + Types.StructType struct, Record expected, InternalRow actual) { List fields = struct.fields(); for (int i = 0; i < fields.size(); i += 1) { Type fieldType = fields.get(i).type(); @@ -202,7 +215,8 @@ public static void assertEqualsUnsafe(Types.StructType struct, Record expected, } } - private static void assertEqualsUnsafe(Types.ListType list, Collection expected, ArrayData actual) { + private static void assertEqualsUnsafe( + Types.ListType list, Collection expected, ArrayData actual) { Type elementType = list.elementType(); List expectedElements = Lists.newArrayList(expected); for (int i = 0; i < expectedElements.size(); i += 1) { @@ -245,20 +259,29 @@ private static void assertEqualsUnsafe(Type type, Object expected, Object actual Assert.assertEquals("Primitive value should be equal to expected", expected, actual); break; case DATE: - Assertions.assertThat(expected).as("Should expect a LocalDate").isInstanceOf(LocalDate.class); + Assertions.assertThat(expected) + .as("Should expect a LocalDate") + .isInstanceOf(LocalDate.class); int expectedDays = (int) ChronoUnit.DAYS.between(EPOCH_DAY, (LocalDate) expected); Assert.assertEquals("Primitive value should be equal to expected", expectedDays, actual); break; case TIMESTAMP: Types.TimestampType timestampType = (Types.TimestampType) type; if (timestampType.shouldAdjustToUTC()) { - Assertions.assertThat(expected).as("Should expect an OffsetDateTime").isInstanceOf(OffsetDateTime.class); + Assertions.assertThat(expected) + .as("Should expect an OffsetDateTime") + .isInstanceOf(OffsetDateTime.class); long expectedMicros = ChronoUnit.MICROS.between(EPOCH, (OffsetDateTime) expected); - Assert.assertEquals("Primitive value should be equal to expected", expectedMicros, actual); + Assert.assertEquals( + "Primitive value should be equal to expected", expectedMicros, actual); } else { - Assertions.assertThat(expected).as("Should expect an LocalDateTime").isInstanceOf(LocalDateTime.class); - long expectedMicros = ChronoUnit.MICROS.between(EPOCH, ((LocalDateTime) expected).atZone(ZoneId.of("UTC"))); - Assert.assertEquals("Primitive value should be equal to expected", expectedMicros, actual); + Assertions.assertThat(expected) + .as("Should expect an LocalDateTime") + .isInstanceOf(LocalDateTime.class); + long expectedMicros = + ChronoUnit.MICROS.between(EPOCH, ((LocalDateTime) expected).atZone(ZoneId.of("UTC"))); + Assert.assertEquals( + "Primitive value should be equal to expected", expectedMicros, actual); } break; case STRING: @@ -268,8 +291,8 @@ private static void assertEqualsUnsafe(Type 
type, Object expected, Object actual case UUID: Assertions.assertThat(expected).as("Should expect a UUID").isInstanceOf(UUID.class); Assertions.assertThat(actual).as("Should be a UTF8String").isInstanceOf(UTF8String.class); - Assert.assertEquals("UUID string representation should match", - expected.toString(), actual.toString()); + Assert.assertEquals( + "UUID string representation should match", expected.toString(), actual.toString()); break; case FIXED: Assertions.assertThat(expected).as("Should expect a byte[]").isInstanceOf(byte[].class); @@ -277,30 +300,42 @@ private static void assertEqualsUnsafe(Type type, Object expected, Object actual Assert.assertArrayEquals("Bytes should match", (byte[]) expected, (byte[]) actual); break; case BINARY: - Assertions.assertThat(expected).as("Should expect a ByteBuffer").isInstanceOf(ByteBuffer.class); + Assertions.assertThat(expected) + .as("Should expect a ByteBuffer") + .isInstanceOf(ByteBuffer.class); Assertions.assertThat(actual).as("Should be a byte[]").isInstanceOf(byte[].class); - Assert.assertArrayEquals("Bytes should match", - ((ByteBuffer) expected).array(), (byte[]) actual); + Assert.assertArrayEquals( + "Bytes should match", ((ByteBuffer) expected).array(), (byte[]) actual); break; case DECIMAL: - Assertions.assertThat(expected).as("Should expect a BigDecimal").isInstanceOf(BigDecimal.class); + Assertions.assertThat(expected) + .as("Should expect a BigDecimal") + .isInstanceOf(BigDecimal.class); Assertions.assertThat(actual).as("Should be a Decimal").isInstanceOf(Decimal.class); - Assert.assertEquals("BigDecimals should be equal", - expected, ((Decimal) actual).toJavaBigDecimal()); + Assert.assertEquals( + "BigDecimals should be equal", expected, ((Decimal) actual).toJavaBigDecimal()); break; case STRUCT: Assertions.assertThat(expected).as("Should expect a Record").isInstanceOf(Record.class); - Assertions.assertThat(actual).as("Should be an InternalRow").isInstanceOf(InternalRow.class); - assertEqualsUnsafe(type.asNestedType().asStructType(), (Record) expected, (InternalRow) actual); + Assertions.assertThat(actual) + .as("Should be an InternalRow") + .isInstanceOf(InternalRow.class); + assertEqualsUnsafe( + type.asNestedType().asStructType(), (Record) expected, (InternalRow) actual); break; case LIST: - Assertions.assertThat(expected).as("Should expect a Collection").isInstanceOf(Collection.class); + Assertions.assertThat(expected) + .as("Should expect a Collection") + .isInstanceOf(Collection.class); Assertions.assertThat(actual).as("Should be an ArrayData").isInstanceOf(ArrayData.class); - assertEqualsUnsafe(type.asNestedType().asListType(), (Collection) expected, (ArrayData) actual); + assertEqualsUnsafe( + type.asNestedType().asListType(), (Collection) expected, (ArrayData) actual); break; case MAP: Assertions.assertThat(expected).as("Should expect a Map").isInstanceOf(Map.class); - Assertions.assertThat(actual).as("Should be an ArrayBasedMapData").isInstanceOf(MapData.class); + Assertions.assertThat(actual) + .as("Should be an ArrayBasedMapData") + .isInstanceOf(MapData.class); assertEqualsUnsafe(type.asNestedType().asMapType(), (Map) expected, (MapData) actual); break; case TIME: diff --git a/spark/v3.0/spark/src/test/java/org/apache/iceberg/spark/data/RandomData.java b/spark/v3.0/spark/src/test/java/org/apache/iceberg/spark/data/RandomData.java index d3bffb75eb5c..1c95df8ced12 100644 --- a/spark/v3.0/spark/src/test/java/org/apache/iceberg/spark/data/RandomData.java +++ 
b/spark/v3.0/spark/src/test/java/org/apache/iceberg/spark/data/RandomData.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.data; import java.math.BigDecimal; @@ -53,8 +52,7 @@ public class RandomData { // Default percentage of number of values that are null for optional fields public static final float DEFAULT_NULL_PERCENTAGE = 0.05f; - private RandomData() { - } + private RandomData() {} public static List generateList(Schema schema, int numRecords, long seed) { RandomDataGenerator generator = new RandomDataGenerator(schema, seed, DEFAULT_NULL_PERCENTAGE); @@ -67,63 +65,71 @@ public static List generateList(Schema schema, int numRecords, long seed } public static Iterable generateSpark(Schema schema, int numRecords, long seed) { - return () -> new Iterator() { - private SparkRandomDataGenerator generator = new SparkRandomDataGenerator(seed); - private int count = 0; - - @Override - public boolean hasNext() { - return count < numRecords; - } - - @Override - public InternalRow next() { - if (count >= numRecords) { - throw new NoSuchElementException(); - } - count += 1; - return (InternalRow) TypeUtil.visit(schema, generator); - } - }; + return () -> + new Iterator() { + private SparkRandomDataGenerator generator = new SparkRandomDataGenerator(seed); + private int count = 0; + + @Override + public boolean hasNext() { + return count < numRecords; + } + + @Override + public InternalRow next() { + if (count >= numRecords) { + throw new NoSuchElementException(); + } + count += 1; + return (InternalRow) TypeUtil.visit(schema, generator); + } + }; } public static Iterable generate(Schema schema, int numRecords, long seed) { - return newIterable(() -> new RandomDataGenerator(schema, seed, DEFAULT_NULL_PERCENTAGE), schema, numRecords); + return newIterable( + () -> new RandomDataGenerator(schema, seed, DEFAULT_NULL_PERCENTAGE), schema, numRecords); } - public static Iterable generate(Schema schema, int numRecords, long seed, float nullPercentage) { - return newIterable(() -> new RandomDataGenerator(schema, seed, nullPercentage), schema, numRecords); + public static Iterable generate( + Schema schema, int numRecords, long seed, float nullPercentage) { + return newIterable( + () -> new RandomDataGenerator(schema, seed, nullPercentage), schema, numRecords); } - public static Iterable generateFallbackData(Schema schema, int numRecords, long seed, long numDictRecords) { - return newIterable(() -> new FallbackDataGenerator(schema, seed, numDictRecords), schema, numRecords); + public static Iterable generateFallbackData( + Schema schema, int numRecords, long seed, long numDictRecords) { + return newIterable( + () -> new FallbackDataGenerator(schema, seed, numDictRecords), schema, numRecords); } public static Iterable generateDictionaryEncodableData( Schema schema, int numRecords, long seed, float nullPercentage) { - return newIterable(() -> new DictionaryEncodedDataGenerator(schema, seed, nullPercentage), schema, numRecords); + return newIterable( + () -> new DictionaryEncodedDataGenerator(schema, seed, nullPercentage), schema, numRecords); } - private static Iterable newIterable(Supplier newGenerator, - Schema schema, int numRecords) { - return () -> new Iterator() { - private int count = 0; - private RandomDataGenerator generator = newGenerator.get(); - - @Override - public boolean hasNext() { - return count < numRecords; - } - - @Override - public Record next() { - if (count >= numRecords) { - throw new 
NoSuchElementException(); - } - count += 1; - return (Record) TypeUtil.visit(schema, generator); - } - }; + private static Iterable newIterable( + Supplier newGenerator, Schema schema, int numRecords) { + return () -> + new Iterator() { + private int count = 0; + private RandomDataGenerator generator = newGenerator.get(); + + @Override + public boolean hasNext() { + return count < numRecords; + } + + @Override + public Record next() { + if (count >= numRecords) { + throw new NoSuchElementException(); + } + count += 1; + return (Record) TypeUtil.visit(schema, generator); + } + }; } private static class RandomDataGenerator extends TypeUtil.CustomOrderSchemaVisitor { @@ -218,8 +224,7 @@ public Object primitive(Type.PrimitiveType primitive) { // them here. switch (primitive.typeId()) { case FIXED: - return new GenericData.Fixed(typeToSchema.get(primitive), - (byte[]) result); + return new GenericData.Fixed(typeToSchema.get(primitive), (byte[]) result); case BINARY: return ByteBuffer.wrap((byte[]) result); case UUID: diff --git a/spark/v3.0/spark/src/test/java/org/apache/iceberg/spark/data/TestHelpers.java b/spark/v3.0/spark/src/test/java/org/apache/iceberg/spark/data/TestHelpers.java index 53d5e8763e6f..42f4c1a1ab42 100644 --- a/spark/v3.0/spark/src/test/java/org/apache/iceberg/spark/data/TestHelpers.java +++ b/spark/v3.0/spark/src/test/java/org/apache/iceberg/spark/data/TestHelpers.java @@ -16,9 +16,12 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.data; +import static org.apache.iceberg.spark.SparkSchemaUtil.convert; +import static scala.collection.JavaConverters.mapAsJavaMapConverter; +import static scala.collection.JavaConverters.seqAsJavaListConverter; + import java.math.BigDecimal; import java.nio.ByteBuffer; import java.sql.Timestamp; @@ -63,14 +66,9 @@ import org.junit.Assert; import scala.collection.Seq; -import static org.apache.iceberg.spark.SparkSchemaUtil.convert; -import static scala.collection.JavaConverters.mapAsJavaMapConverter; -import static scala.collection.JavaConverters.seqAsJavaListConverter; - public class TestHelpers { - private TestHelpers() { - } + private TestHelpers() {} public static void assertEqualsSafe(Types.StructType struct, Record rec, Row row) { List fields = struct.fields(); @@ -84,8 +82,11 @@ public static void assertEqualsSafe(Types.StructType struct, Record rec, Row row } } - public static void assertEqualsBatch(Types.StructType struct, Iterator expected, ColumnarBatch batch, - boolean checkArrowValidityVector) { + public static void assertEqualsBatch( + Types.StructType struct, + Iterator expected, + ColumnarBatch batch, + boolean checkArrowValidityVector) { for (int rowId = 0; rowId < batch.numRows(); rowId++) { List fields = struct.fields(); InternalRow row = batch.getRow(rowId); @@ -98,15 +99,16 @@ public static void assertEqualsBatch(Types.StructType struct, Iterator e if (checkArrowValidityVector) { ColumnVector columnVector = batch.column(i); - ValueVector arrowVector = ((IcebergArrowColumnVector) columnVector).vectorAccessor().getVector(); - Assert.assertFalse("Nullability doesn't match of " + columnVector.dataType(), + ValueVector arrowVector = + ((IcebergArrowColumnVector) columnVector).vectorAccessor().getVector(); + Assert.assertFalse( + "Nullability doesn't match of " + columnVector.dataType(), expectedValue == null ^ arrowVector.isNull(rowId)); } } } } - private static void assertEqualsSafe(Types.ListType list, Collection expected, List actual) { Type elementType = 
list.elementType(); List expectedElements = Lists.newArrayList(expected); @@ -118,8 +120,7 @@ private static void assertEqualsSafe(Types.ListType list, Collection expected } } - private static void assertEqualsSafe(Types.MapType map, - Map expected, Map actual) { + private static void assertEqualsSafe(Types.MapType map, Map expected, Map actual) { Type keyType = map.keyType(); Type valueType = map.valueType(); @@ -178,23 +179,28 @@ private static void assertEqualsSafe(Type type, Object expected, Object actual) case UUID: Assertions.assertThat(expected).as("Should expect a UUID").isInstanceOf(UUID.class); Assertions.assertThat(actual).as("Should be a String").isInstanceOf(String.class); - Assert.assertEquals("UUID string representation should match", - expected.toString(), actual); + Assert.assertEquals("UUID string representation should match", expected.toString(), actual); break; case FIXED: - Assertions.assertThat(expected).as("Should expect a Fixed").isInstanceOf(GenericData.Fixed.class); + Assertions.assertThat(expected) + .as("Should expect a Fixed") + .isInstanceOf(GenericData.Fixed.class); Assertions.assertThat(actual).as("Should be a byte[]").isInstanceOf(byte[].class); - Assert.assertArrayEquals("Bytes should match", - ((GenericData.Fixed) expected).bytes(), (byte[]) actual); + Assert.assertArrayEquals( + "Bytes should match", ((GenericData.Fixed) expected).bytes(), (byte[]) actual); break; case BINARY: - Assertions.assertThat(expected).as("Should expect a ByteBuffer").isInstanceOf(ByteBuffer.class); + Assertions.assertThat(expected) + .as("Should expect a ByteBuffer") + .isInstanceOf(ByteBuffer.class); Assertions.assertThat(actual).as("Should be a byte[]").isInstanceOf(byte[].class); - Assert.assertArrayEquals("Bytes should match", - ((ByteBuffer) expected).array(), (byte[]) actual); + Assert.assertArrayEquals( + "Bytes should match", ((ByteBuffer) expected).array(), (byte[]) actual); break; case DECIMAL: - Assertions.assertThat(expected).as("Should expect a BigDecimal").isInstanceOf(BigDecimal.class); + Assertions.assertThat(expected) + .as("Should expect a BigDecimal") + .isInstanceOf(BigDecimal.class); Assertions.assertThat(actual).as("Should be a BigDecimal").isInstanceOf(BigDecimal.class); Assert.assertEquals("BigDecimals should be equal", expected, actual); break; @@ -204,16 +210,20 @@ private static void assertEqualsSafe(Type type, Object expected, Object actual) assertEqualsSafe(type.asNestedType().asStructType(), (Record) expected, (Row) actual); break; case LIST: - Assertions.assertThat(expected).as("Should expect a Collection").isInstanceOf(Collection.class); + Assertions.assertThat(expected) + .as("Should expect a Collection") + .isInstanceOf(Collection.class); Assertions.assertThat(actual).as("Should be a Seq").isInstanceOf(Seq.class); List asList = seqAsJavaListConverter((Seq) actual).asJava(); assertEqualsSafe(type.asNestedType().asListType(), (Collection) expected, asList); break; case MAP: Assertions.assertThat(expected).as("Should expect a Collection").isInstanceOf(Map.class); - Assertions.assertThat(actual).as("Should be a Map").isInstanceOf(scala.collection.Map.class); - Map asMap = mapAsJavaMapConverter( - (scala.collection.Map) actual).asJava(); + Assertions.assertThat(actual) + .as("Should be a Map") + .isInstanceOf(scala.collection.Map.class); + Map asMap = + mapAsJavaMapConverter((scala.collection.Map) actual).asJava(); assertEqualsSafe(type.asNestedType().asMapType(), (Map) expected, asMap); break; case TIME: @@ -234,7 +244,8 @@ public static void 
assertEqualsUnsafe(Types.StructType struct, Record rec, Inter } } - private static void assertEqualsUnsafe(Types.ListType list, Collection expected, ArrayData actual) { + private static void assertEqualsUnsafe( + Types.ListType list, Collection expected, ArrayData actual) { Type elementType = list.elementType(); List expectedElements = Lists.newArrayList(expected); for (int i = 0; i < expectedElements.size(); i += 1) { @@ -280,8 +291,10 @@ private static void assertEqualsUnsafe(Type type, Object expected, Object actual case DOUBLE: Assertions.assertThat(actual).as("Should be a double").isInstanceOf(Double.class); if (expected instanceof Float) { - Assert.assertEquals("Values didn't match", Double.doubleToLongBits(((Number) expected).doubleValue()), - Double.doubleToLongBits((double) actual)); + Assert.assertEquals( + "Values didn't match", + Double.doubleToLongBits(((Number) expected).doubleValue()), + Double.doubleToLongBits((double) actual)); } else { Assert.assertEquals("Primitive value should be equal to expected", expected, actual); } @@ -300,40 +313,54 @@ private static void assertEqualsUnsafe(Type type, Object expected, Object actual case UUID: Assertions.assertThat(expected).as("Should expect a UUID").isInstanceOf(UUID.class); Assertions.assertThat(actual).as("Should be a UTF8String").isInstanceOf(UTF8String.class); - Assert.assertEquals("UUID string representation should match", - expected.toString(), actual.toString()); + Assert.assertEquals( + "UUID string representation should match", expected.toString(), actual.toString()); break; case FIXED: - Assertions.assertThat(expected).as("Should expect a Fixed").isInstanceOf(GenericData.Fixed.class); + Assertions.assertThat(expected) + .as("Should expect a Fixed") + .isInstanceOf(GenericData.Fixed.class); Assertions.assertThat(actual).as("Should be a byte[]").isInstanceOf(byte[].class); - Assert.assertArrayEquals("Bytes should match", - ((GenericData.Fixed) expected).bytes(), (byte[]) actual); + Assert.assertArrayEquals( + "Bytes should match", ((GenericData.Fixed) expected).bytes(), (byte[]) actual); break; case BINARY: - Assertions.assertThat(expected).as("Should expect a ByteBuffer").isInstanceOf(ByteBuffer.class); + Assertions.assertThat(expected) + .as("Should expect a ByteBuffer") + .isInstanceOf(ByteBuffer.class); Assertions.assertThat(actual).as("Should be a byte[]").isInstanceOf(byte[].class); - Assert.assertArrayEquals("Bytes should match", - ((ByteBuffer) expected).array(), (byte[]) actual); + Assert.assertArrayEquals( + "Bytes should match", ((ByteBuffer) expected).array(), (byte[]) actual); break; case DECIMAL: - Assertions.assertThat(expected).as("Should expect a BigDecimal").isInstanceOf(BigDecimal.class); + Assertions.assertThat(expected) + .as("Should expect a BigDecimal") + .isInstanceOf(BigDecimal.class); Assertions.assertThat(actual).as("Should be a Decimal").isInstanceOf(Decimal.class); - Assert.assertEquals("BigDecimals should be equal", - expected, ((Decimal) actual).toJavaBigDecimal()); + Assert.assertEquals( + "BigDecimals should be equal", expected, ((Decimal) actual).toJavaBigDecimal()); break; case STRUCT: Assertions.assertThat(expected).as("Should expect a Record").isInstanceOf(Record.class); - Assertions.assertThat(actual).as("Should be an InternalRow").isInstanceOf(InternalRow.class); - assertEqualsUnsafe(type.asNestedType().asStructType(), (Record) expected, (InternalRow) actual); + Assertions.assertThat(actual) + .as("Should be an InternalRow") + .isInstanceOf(InternalRow.class); + assertEqualsUnsafe( + 
type.asNestedType().asStructType(), (Record) expected, (InternalRow) actual); break; case LIST: - Assertions.assertThat(expected).as("Should expect a Collection").isInstanceOf(Collection.class); + Assertions.assertThat(expected) + .as("Should expect a Collection") + .isInstanceOf(Collection.class); Assertions.assertThat(actual).as("Should be an ArrayData").isInstanceOf(ArrayData.class); - assertEqualsUnsafe(type.asNestedType().asListType(), (Collection) expected, (ArrayData) actual); + assertEqualsUnsafe( + type.asNestedType().asListType(), (Collection) expected, (ArrayData) actual); break; case MAP: Assertions.assertThat(expected).as("Should expect a Map").isInstanceOf(Map.class); - Assertions.assertThat(actual).as("Should be an ArrayBasedMapData").isInstanceOf(MapData.class); + Assertions.assertThat(actual) + .as("Should be an ArrayBasedMapData") + .isInstanceOf(MapData.class); assertEqualsUnsafe(type.asNestedType().asMapType(), (Map) expected, (MapData) actual); break; case TIME: @@ -344,13 +371,14 @@ private static void assertEqualsUnsafe(Type type, Object expected, Object actual /** * Check that the given InternalRow is equivalent to the Row. + * * @param prefix context for error messages * @param type the type of the row * @param expected the expected value of the row * @param actual the actual value of the row */ - public static void assertEquals(String prefix, Types.StructType type, - InternalRow expected, Row actual) { + public static void assertEquals( + String prefix, Types.StructType type, InternalRow expected, Row actual) { if (expected == null || actual == null) { Assert.assertEquals(prefix, expected, actual); } else { @@ -368,30 +396,41 @@ public static void assertEquals(String prefix, Types.StructType type, case DECIMAL: case DATE: case TIMESTAMP: - Assert.assertEquals(prefix + "." + fieldName + " - " + childType, + Assert.assertEquals( + prefix + "." + fieldName + " - " + childType, getValue(expected, c, childType), getPrimitiveValue(actual, c, childType)); break; case UUID: case FIXED: case BINARY: - assertEqualBytes(prefix + "." + fieldName, + assertEqualBytes( + prefix + "." + fieldName, (byte[]) getValue(expected, c, childType), (byte[]) actual.get(c)); break; - case STRUCT: { - Types.StructType st = (Types.StructType) childType; - assertEquals(prefix + "." + fieldName, st, - expected.getStruct(c, st.fields().size()), actual.getStruct(c)); - break; - } + case STRUCT: + { + Types.StructType st = (Types.StructType) childType; + assertEquals( + prefix + "." + fieldName, + st, + expected.getStruct(c, st.fields().size()), + actual.getStruct(c)); + break; + } case LIST: - assertEqualsLists(prefix + "." + fieldName, childType.asListType(), + assertEqualsLists( + prefix + "." + fieldName, + childType.asListType(), expected.getArray(c), toList((Seq) actual.get(c))); break; case MAP: - assertEqualsMaps(prefix + "." + fieldName, childType.asMapType(), expected.getMap(c), + assertEqualsMaps( + prefix + "." 
+ fieldName, + childType.asMapType(), + expected.getMap(c), toJavaMap((scala.collection.Map) actual.getMap(c))); break; default: @@ -401,8 +440,8 @@ public static void assertEquals(String prefix, Types.StructType type, } } - private static void assertEqualsLists(String prefix, Types.ListType type, - ArrayData expected, List actual) { + private static void assertEqualsLists( + String prefix, Types.ListType type, ArrayData expected, List actual) { if (expected == null || actual == null) { Assert.assertEquals(prefix, expected, actual); } else { @@ -419,31 +458,42 @@ private static void assertEqualsLists(String prefix, Types.ListType type, case DECIMAL: case DATE: case TIMESTAMP: - Assert.assertEquals(prefix + ".elem " + e + " - " + childType, + Assert.assertEquals( + prefix + ".elem " + e + " - " + childType, getValue(expected, e, childType), actual.get(e)); break; case UUID: case FIXED: case BINARY: - assertEqualBytes(prefix + ".elem " + e, + assertEqualBytes( + prefix + ".elem " + e, (byte[]) getValue(expected, e, childType), (byte[]) actual.get(e)); break; - case STRUCT: { - Types.StructType st = (Types.StructType) childType; - assertEquals(prefix + ".elem " + e, st, - expected.getStruct(e, st.fields().size()), (Row) actual.get(e)); - break; - } + case STRUCT: + { + Types.StructType st = (Types.StructType) childType; + assertEquals( + prefix + ".elem " + e, + st, + expected.getStruct(e, st.fields().size()), + (Row) actual.get(e)); + break; + } case LIST: - assertEqualsLists(prefix + ".elem " + e, childType.asListType(), + assertEqualsLists( + prefix + ".elem " + e, + childType.asListType(), expected.getArray(e), toList((Seq) actual.get(e))); break; case MAP: - assertEqualsMaps(prefix + ".elem " + e, childType.asMapType(), - expected.getMap(e), toJavaMap((scala.collection.Map) actual.get(e))); + assertEqualsMaps( + prefix + ".elem " + e, + childType.asMapType(), + expected.getMap(e), + toJavaMap((scala.collection.Map) actual.get(e))); break; default: throw new IllegalArgumentException("Unhandled type " + childType); @@ -452,8 +502,8 @@ private static void assertEqualsLists(String prefix, Types.ListType type, } } - private static void assertEqualsMaps(String prefix, Types.MapType type, - MapData expected, Map actual) { + private static void assertEqualsMaps( + String prefix, Types.MapType type, MapData expected, Map actual) { if (expected == null || actual == null) { Assert.assertEquals(prefix, expected, actual); } else { @@ -466,7 +516,9 @@ private static void assertEqualsMaps(String prefix, Types.MapType type, Object expectedKey = getValue(expectedKeyArray, e, keyType); Object actualValue = actual.get(expectedKey); if (actualValue == null) { - Assert.assertEquals(prefix + ".key=" + expectedKey + " has null", true, + Assert.assertEquals( + prefix + ".key=" + expectedKey + " has null", + true, expected.valueArray().isNullAt(e)); } else { switch (valueType.typeId()) { @@ -479,32 +531,40 @@ private static void assertEqualsMaps(String prefix, Types.MapType type, case DECIMAL: case DATE: case TIMESTAMP: - Assert.assertEquals(prefix + ".key=" + expectedKey + " - " + valueType, + Assert.assertEquals( + prefix + ".key=" + expectedKey + " - " + valueType, getValue(expectedValueArray, e, valueType), actual.get(expectedKey)); break; case UUID: case FIXED: case BINARY: - assertEqualBytes(prefix + ".key=" + expectedKey, + assertEqualBytes( + prefix + ".key=" + expectedKey, (byte[]) getValue(expectedValueArray, e, valueType), (byte[]) actual.get(expectedKey)); break; - case STRUCT: { - Types.StructType 
st = (Types.StructType) valueType; - assertEquals(prefix + ".key=" + expectedKey, st, - expectedValueArray.getStruct(e, st.fields().size()), - (Row) actual.get(expectedKey)); - break; - } + case STRUCT: + { + Types.StructType st = (Types.StructType) valueType; + assertEquals( + prefix + ".key=" + expectedKey, + st, + expectedValueArray.getStruct(e, st.fields().size()), + (Row) actual.get(expectedKey)); + break; + } case LIST: - assertEqualsLists(prefix + ".key=" + expectedKey, + assertEqualsLists( + prefix + ".key=" + expectedKey, valueType.asListType(), expectedValueArray.getArray(e), toList((Seq) actual.get(expectedKey))); break; case MAP: - assertEqualsMaps(prefix + ".key=" + expectedKey, valueType.asMapType(), + assertEqualsMaps( + prefix + ".key=" + expectedKey, + valueType.asMapType(), expectedValueArray.getMap(e), toJavaMap((scala.collection.Map) actual.get(expectedKey))); break; @@ -516,8 +576,7 @@ private static void assertEqualsMaps(String prefix, Types.MapType type, } } - private static Object getValue(SpecializedGetters container, int ord, - Type type) { + private static Object getValue(SpecializedGetters container, int ord, Type type) { if (container.isNullAt(ord)) { return null; } @@ -542,10 +601,11 @@ private static Object getValue(SpecializedGetters container, int ord, return new DateWritable(container.getInt(ord)).get(); case TIMESTAMP: return DateTimeUtils.toJavaTimestamp(container.getLong(ord)); - case DECIMAL: { - Types.DecimalType dt = (Types.DecimalType) type; - return container.getDecimal(ord, dt.precision(), dt.scale()).toJavaBigDecimal(); - } + case DECIMAL: + { + Types.DecimalType dt = (Types.DecimalType) type; + return container.getDecimal(ord, dt.precision(), dt.scale()).toJavaBigDecimal(); + } case STRUCT: Types.StructType struct = type.asStructType(); InternalRow internalRow = container.getStruct(ord, struct.fields().size()); @@ -603,8 +663,7 @@ private static List toList(Seq val) { return val == null ? 
null : seqAsJavaListConverter(val).asJava(); } - private static void assertEqualBytes(String context, byte[] expected, - byte[] actual) { + private static void assertEqualBytes(String context, byte[] expected, byte[] actual) { if (expected == null || actual == null) { Assert.assertEquals(context, expected, actual); } else { @@ -622,23 +681,29 @@ private static void assertEquals(String context, DataType type, Object expected, } if (type instanceof StructType) { - Assertions.assertThat(expected).as("Expected should be an InternalRow: " + context) + Assertions.assertThat(expected) + .as("Expected should be an InternalRow: " + context) .isInstanceOf(InternalRow.class); - Assertions.assertThat(actual).as("Actual should be an InternalRow: " + context) + Assertions.assertThat(actual) + .as("Actual should be an InternalRow: " + context) .isInstanceOf(InternalRow.class); assertEquals(context, (StructType) type, (InternalRow) expected, (InternalRow) actual); } else if (type instanceof ArrayType) { - Assertions.assertThat(expected).as("Expected should be an ArrayData: " + context) + Assertions.assertThat(expected) + .as("Expected should be an ArrayData: " + context) .isInstanceOf(ArrayData.class); - Assertions.assertThat(actual).as("Actual should be an ArrayData: " + context) + Assertions.assertThat(actual) + .as("Actual should be an ArrayData: " + context) .isInstanceOf(ArrayData.class); assertEquals(context, (ArrayType) type, (ArrayData) expected, (ArrayData) actual); } else if (type instanceof MapType) { - Assertions.assertThat(expected).as("Expected should be a MapData: " + context) + Assertions.assertThat(expected) + .as("Expected should be a MapData: " + context) .isInstanceOf(MapData.class); - Assertions.assertThat(actual).as("Actual should be a MapData: " + context) + Assertions.assertThat(actual) + .as("Actual should be a MapData: " + context) .isInstanceOf(MapData.class); assertEquals(context, (MapType) type, (MapData) expected, (MapData) actual); @@ -649,32 +714,37 @@ private static void assertEquals(String context, DataType type, Object expected, } } - private static void assertEquals(String context, StructType struct, - InternalRow expected, InternalRow actual) { + private static void assertEquals( + String context, StructType struct, InternalRow expected, InternalRow actual) { Assert.assertEquals("Should have correct number of fields", struct.size(), actual.numFields()); for (int i = 0; i < actual.numFields(); i += 1) { StructField field = struct.fields()[i]; DataType type = field.dataType(); - assertEquals(context + "." + field.name(), type, + assertEquals( + context + "." + field.name(), + type, expected.isNullAt(i) ? null : expected.get(i, type), actual.isNullAt(i) ? null : actual.get(i, type)); } } - private static void assertEquals(String context, ArrayType array, ArrayData expected, ArrayData actual) { - Assert.assertEquals("Should have the same number of elements", - expected.numElements(), actual.numElements()); + private static void assertEquals( + String context, ArrayType array, ArrayData expected, ArrayData actual) { + Assert.assertEquals( + "Should have the same number of elements", expected.numElements(), actual.numElements()); DataType type = array.elementType(); for (int i = 0; i < actual.numElements(); i += 1) { - assertEquals(context + ".element", type, + assertEquals( + context + ".element", + type, expected.isNullAt(i) ? null : expected.get(i, type), actual.isNullAt(i) ? 
null : actual.get(i, type)); } } private static void assertEquals(String context, MapType map, MapData expected, MapData actual) { - Assert.assertEquals("Should have the same number of elements", - expected.numElements(), actual.numElements()); + Assert.assertEquals( + "Should have the same number of elements", expected.numElements(), actual.numElements()); DataType keyType = map.keyType(); ArrayData expectedKeys = expected.keyArray(); @@ -685,10 +755,14 @@ private static void assertEquals(String context, MapType map, MapData expected, ArrayData actualValues = actual.valueArray(); for (int i = 0; i < actual.numElements(); i += 1) { - assertEquals(context + ".key", keyType, + assertEquals( + context + ".key", + keyType, expectedKeys.isNullAt(i) ? null : expectedKeys.get(i, keyType), actualKeys.isNullAt(i) ? null : actualKeys.get(i, keyType)); - assertEquals(context + ".value", valueType, + assertEquals( + context + ".value", + valueType, expectedValues.isNullAt(i) ? null : expectedValues.get(i, valueType), actualValues.isNullAt(i) ? null : actualValues.get(i, valueType)); } diff --git a/spark/v3.0/spark/src/test/java/org/apache/iceberg/spark/data/TestOrcWrite.java b/spark/v3.0/spark/src/test/java/org/apache/iceberg/spark/data/TestOrcWrite.java index 7cf9b9c736c6..1e51a088390e 100644 --- a/spark/v3.0/spark/src/test/java/org/apache/iceberg/spark/data/TestOrcWrite.java +++ b/spark/v3.0/spark/src/test/java/org/apache/iceberg/spark/data/TestOrcWrite.java @@ -16,9 +16,10 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.data; +import static org.apache.iceberg.types.Types.NestedField.optional; + import java.io.File; import java.io.IOException; import org.apache.iceberg.Files; @@ -32,16 +33,12 @@ import org.junit.Test; import org.junit.rules.TemporaryFolder; -import static org.apache.iceberg.types.Types.NestedField.optional; - public class TestOrcWrite { - @Rule - public TemporaryFolder temp = new TemporaryFolder(); + @Rule public TemporaryFolder temp = new TemporaryFolder(); - private static final Schema SCHEMA = new Schema( - optional(1, "id", Types.IntegerType.get()), - optional(2, "data", Types.StringType.get()) - ); + private static final Schema SCHEMA = + new Schema( + optional(1, "id", Types.IntegerType.get()), optional(2, "data", Types.StringType.get())); @Test public void splitOffsets() throws IOException { @@ -49,10 +46,11 @@ public void splitOffsets() throws IOException { Assert.assertTrue("Delete should succeed", testFile.delete()); Iterable rows = RandomData.generateSpark(SCHEMA, 1, 0L); - FileAppender writer = ORC.write(Files.localOutput(testFile)) - .createWriterFunc(SparkOrcWriter::new) - .schema(SCHEMA) - .build(); + FileAppender writer = + ORC.write(Files.localOutput(testFile)) + .createWriterFunc(SparkOrcWriter::new) + .schema(SCHEMA) + .build(); writer.addAll(rows); writer.close(); diff --git a/spark/v3.0/spark/src/test/java/org/apache/iceberg/spark/data/TestParquetAvroReader.java b/spark/v3.0/spark/src/test/java/org/apache/iceberg/spark/data/TestParquetAvroReader.java index 464e3165583c..a4ffc2fea437 100644 --- a/spark/v3.0/spark/src/test/java/org/apache/iceberg/spark/data/TestParquetAvroReader.java +++ b/spark/v3.0/spark/src/test/java/org/apache/iceberg/spark/data/TestParquetAvroReader.java @@ -16,9 +16,11 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.spark.data; +import static org.apache.iceberg.types.Types.NestedField.optional; +import static org.apache.iceberg.types.Types.NestedField.required; + import java.io.File; import java.io.IOException; import java.util.Iterator; @@ -38,54 +40,68 @@ import org.junit.Test; import org.junit.rules.TemporaryFolder; -import static org.apache.iceberg.types.Types.NestedField.optional; -import static org.apache.iceberg.types.Types.NestedField.required; - public class TestParquetAvroReader { - @Rule - public TemporaryFolder temp = new TemporaryFolder(); - - private static final Schema COMPLEX_SCHEMA = new Schema( - required(1, "roots", Types.LongType.get()), - optional(3, "lime", Types.ListType.ofRequired(4, Types.DoubleType.get())), - required(5, "strict", Types.StructType.of( - required(9, "tangerine", Types.StringType.get()), - optional(6, "hopeful", Types.StructType.of( - required(7, "steel", Types.FloatType.get()), - required(8, "lantern", Types.DateType.get()) - )), - optional(10, "vehement", Types.LongType.get()) - )), - optional(11, "metamorphosis", Types.MapType.ofRequired(12, 13, - Types.StringType.get(), Types.TimestampType.withoutZone())), - required(14, "winter", Types.ListType.ofOptional(15, Types.StructType.of( - optional(16, "beet", Types.DoubleType.get()), - required(17, "stamp", Types.TimeType.get()), - optional(18, "wheeze", Types.StringType.get()) - ))), - optional(19, "renovate", Types.MapType.ofRequired(20, 21, - Types.StringType.get(), Types.StructType.of( - optional(22, "jumpy", Types.DoubleType.get()), - required(23, "koala", Types.TimeType.get()), - required(24, "couch rope", Types.IntegerType.get()) - ))), - optional(2, "slide", Types.StringType.get()) - ); + @Rule public TemporaryFolder temp = new TemporaryFolder(); + + private static final Schema COMPLEX_SCHEMA = + new Schema( + required(1, "roots", Types.LongType.get()), + optional(3, "lime", Types.ListType.ofRequired(4, Types.DoubleType.get())), + required( + 5, + "strict", + Types.StructType.of( + required(9, "tangerine", Types.StringType.get()), + optional( + 6, + "hopeful", + Types.StructType.of( + required(7, "steel", Types.FloatType.get()), + required(8, "lantern", Types.DateType.get()))), + optional(10, "vehement", Types.LongType.get()))), + optional( + 11, + "metamorphosis", + Types.MapType.ofRequired( + 12, 13, Types.StringType.get(), Types.TimestampType.withoutZone())), + required( + 14, + "winter", + Types.ListType.ofOptional( + 15, + Types.StructType.of( + optional(16, "beet", Types.DoubleType.get()), + required(17, "stamp", Types.TimeType.get()), + optional(18, "wheeze", Types.StringType.get())))), + optional( + 19, + "renovate", + Types.MapType.ofRequired( + 20, + 21, + Types.StringType.get(), + Types.StructType.of( + optional(22, "jumpy", Types.DoubleType.get()), + required(23, "koala", Types.TimeType.get()), + required(24, "couch rope", Types.IntegerType.get())))), + optional(2, "slide", Types.StringType.get())); @Ignore public void testStructSchema() throws IOException { - Schema structSchema = new Schema( - required(1, "circumvent", Types.LongType.get()), - optional(2, "antarctica", Types.StringType.get()), - optional(3, "fluent", Types.DoubleType.get()), - required(4, "quell", Types.StructType.of( - required(5, "operator", Types.BooleanType.get()), - optional(6, "fanta", Types.IntegerType.get()), - optional(7, "cable", Types.FloatType.get()) - )), - required(8, "chimney", Types.TimestampType.withZone()), - required(9, "wool", Types.DateType.get()) - ); + Schema 
structSchema = + new Schema( + required(1, "circumvent", Types.LongType.get()), + optional(2, "antarctica", Types.StringType.get()), + optional(3, "fluent", Types.DoubleType.get()), + required( + 4, + "quell", + Types.StructType.of( + required(5, "operator", Types.BooleanType.get()), + optional(6, "fanta", Types.IntegerType.get()), + optional(7, "cable", Types.FloatType.get()))), + required(8, "chimney", Types.TimestampType.withZone()), + required(9, "wool", Types.DateType.get())); File testFile = writeTestData(structSchema, 5_000_000, 1059); // RandomData uses the root record name "test", which must match for records to be equal @@ -100,11 +116,12 @@ public void testStructSchema() throws IOException { // clean up as much memory as possible to avoid a large GC during the timed run System.gc(); - try (CloseableIterable reader = Parquet.read(Files.localInput(testFile)) - .project(structSchema) - .createReaderFunc( - fileSchema -> ParquetAvroValueReaders.buildReader(structSchema, readSchema)) - .build()) { + try (CloseableIterable reader = + Parquet.read(Files.localInput(testFile)) + .project(structSchema) + .createReaderFunc( + fileSchema -> ParquetAvroValueReaders.buildReader(structSchema, readSchema)) + .build()) { long start = System.currentTimeMillis(); long val = 0; long count = 0; @@ -137,9 +154,8 @@ public void testWithOldReadPath() throws IOException { // clean up as much memory as possible to avoid a large GC during the timed run System.gc(); - try (CloseableIterable reader = Parquet.read(Files.localInput(testFile)) - .project(COMPLEX_SCHEMA) - .build()) { + try (CloseableIterable reader = + Parquet.read(Files.localInput(testFile)).project(COMPLEX_SCHEMA).build()) { long start = System.currentTimeMillis(); long val = 0; long count = 0; @@ -154,11 +170,12 @@ public void testWithOldReadPath() throws IOException { // clean up as much memory as possible to avoid a large GC during the timed run System.gc(); - try (CloseableIterable reader = Parquet.read(Files.localInput(testFile)) - .project(COMPLEX_SCHEMA) - .createReaderFunc( - fileSchema -> ParquetAvroValueReaders.buildReader(COMPLEX_SCHEMA, readSchema)) - .build()) { + try (CloseableIterable reader = + Parquet.read(Files.localInput(testFile)) + .project(COMPLEX_SCHEMA) + .createReaderFunc( + fileSchema -> ParquetAvroValueReaders.buildReader(COMPLEX_SCHEMA, readSchema)) + .build()) { long start = System.currentTimeMillis(); long val = 0; long count = 0; @@ -179,9 +196,8 @@ public void testCorrectness() throws IOException { File testFile = temp.newFile(); Assert.assertTrue("Delete should succeed", testFile.delete()); - try (FileAppender writer = Parquet.write(Files.localOutput(testFile)) - .schema(COMPLEX_SCHEMA) - .build()) { + try (FileAppender writer = + Parquet.write(Files.localOutput(testFile)).schema(COMPLEX_SCHEMA).build()) { writer.addAll(records); } @@ -189,12 +205,13 @@ public void testCorrectness() throws IOException { MessageType readSchema = ParquetSchemaUtil.convert(COMPLEX_SCHEMA, "test"); // verify that the new read path is correct - try (CloseableIterable reader = Parquet.read(Files.localInput(testFile)) - .project(COMPLEX_SCHEMA) - .createReaderFunc( - fileSchema -> ParquetAvroValueReaders.buildReader(COMPLEX_SCHEMA, readSchema)) - .reuseContainers() - .build()) { + try (CloseableIterable reader = + Parquet.read(Files.localInput(testFile)) + .project(COMPLEX_SCHEMA) + .createReaderFunc( + fileSchema -> ParquetAvroValueReaders.buildReader(COMPLEX_SCHEMA, readSchema)) + .reuseContainers() + .build()) { int recordNum = 
0; Iterator iter = records.iterator(); for (Record actual : reader) { @@ -209,9 +226,8 @@ private File writeTestData(Schema schema, int numRecords, int seed) throws IOExc File testFile = temp.newFile(); Assert.assertTrue("Delete should succeed", testFile.delete()); - try (FileAppender writer = Parquet.write(Files.localOutput(testFile)) - .schema(schema) - .build()) { + try (FileAppender writer = + Parquet.write(Files.localOutput(testFile)).schema(schema).build()) { writer.addAll(RandomData.generate(schema, numRecords, seed)); } diff --git a/spark/v3.0/spark/src/test/java/org/apache/iceberg/spark/data/TestParquetAvroWriter.java b/spark/v3.0/spark/src/test/java/org/apache/iceberg/spark/data/TestParquetAvroWriter.java index dcfc873a5a67..15c6268da478 100644 --- a/spark/v3.0/spark/src/test/java/org/apache/iceberg/spark/data/TestParquetAvroWriter.java +++ b/spark/v3.0/spark/src/test/java/org/apache/iceberg/spark/data/TestParquetAvroWriter.java @@ -16,9 +16,11 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.data; +import static org.apache.iceberg.types.Types.NestedField.optional; +import static org.apache.iceberg.types.Types.NestedField.required; + import java.io.File; import java.io.IOException; import java.util.Iterator; @@ -38,39 +40,51 @@ import org.junit.Test; import org.junit.rules.TemporaryFolder; -import static org.apache.iceberg.types.Types.NestedField.optional; -import static org.apache.iceberg.types.Types.NestedField.required; - public class TestParquetAvroWriter { - @Rule - public TemporaryFolder temp = new TemporaryFolder(); + @Rule public TemporaryFolder temp = new TemporaryFolder(); - private static final Schema COMPLEX_SCHEMA = new Schema( - required(1, "roots", Types.LongType.get()), - optional(3, "lime", Types.ListType.ofRequired(4, Types.DoubleType.get())), - required(5, "strict", Types.StructType.of( - required(9, "tangerine", Types.StringType.get()), - optional(6, "hopeful", Types.StructType.of( - required(7, "steel", Types.FloatType.get()), - required(8, "lantern", Types.DateType.get()) - )), - optional(10, "vehement", Types.LongType.get()) - )), - optional(11, "metamorphosis", Types.MapType.ofRequired(12, 13, - Types.StringType.get(), Types.TimestampType.withoutZone())), - required(14, "winter", Types.ListType.ofOptional(15, Types.StructType.of( - optional(16, "beet", Types.DoubleType.get()), - required(17, "stamp", Types.TimeType.get()), - optional(18, "wheeze", Types.StringType.get()) - ))), - optional(19, "renovate", Types.MapType.ofRequired(20, 21, - Types.StringType.get(), Types.StructType.of( - optional(22, "jumpy", Types.DoubleType.get()), - required(23, "koala", Types.TimeType.get()), - required(24, "couch rope", Types.IntegerType.get()) - ))), - optional(2, "slide", Types.StringType.get()) - ); + private static final Schema COMPLEX_SCHEMA = + new Schema( + required(1, "roots", Types.LongType.get()), + optional(3, "lime", Types.ListType.ofRequired(4, Types.DoubleType.get())), + required( + 5, + "strict", + Types.StructType.of( + required(9, "tangerine", Types.StringType.get()), + optional( + 6, + "hopeful", + Types.StructType.of( + required(7, "steel", Types.FloatType.get()), + required(8, "lantern", Types.DateType.get()))), + optional(10, "vehement", Types.LongType.get()))), + optional( + 11, + "metamorphosis", + Types.MapType.ofRequired( + 12, 13, Types.StringType.get(), Types.TimestampType.withoutZone())), + required( + 14, + "winter", + Types.ListType.ofOptional( + 15, + 
Types.StructType.of( + optional(16, "beet", Types.DoubleType.get()), + required(17, "stamp", Types.TimeType.get()), + optional(18, "wheeze", Types.StringType.get())))), + optional( + 19, + "renovate", + Types.MapType.ofRequired( + 20, + 21, + Types.StringType.get(), + Types.StructType.of( + optional(22, "jumpy", Types.DoubleType.get()), + required(23, "koala", Types.TimeType.get()), + required(24, "couch rope", Types.IntegerType.get())))), + optional(2, "slide", Types.StringType.get())); @Test public void testCorrectness() throws IOException { @@ -79,10 +93,11 @@ public void testCorrectness() throws IOException { File testFile = temp.newFile(); Assert.assertTrue("Delete should succeed", testFile.delete()); - try (FileAppender writer = Parquet.write(Files.localOutput(testFile)) - .schema(COMPLEX_SCHEMA) - .createWriterFunc(ParquetAvroWriter::buildWriter) - .build()) { + try (FileAppender writer = + Parquet.write(Files.localOutput(testFile)) + .schema(COMPLEX_SCHEMA) + .createWriterFunc(ParquetAvroWriter::buildWriter) + .build()) { writer.addAll(records); } @@ -90,11 +105,12 @@ public void testCorrectness() throws IOException { MessageType readSchema = ParquetSchemaUtil.convert(COMPLEX_SCHEMA, "test"); // verify that the new read path is correct - try (CloseableIterable reader = Parquet.read(Files.localInput(testFile)) - .project(COMPLEX_SCHEMA) - .createReaderFunc( - fileSchema -> ParquetAvroValueReaders.buildReader(COMPLEX_SCHEMA, readSchema)) - .build()) { + try (CloseableIterable reader = + Parquet.read(Files.localInput(testFile)) + .project(COMPLEX_SCHEMA) + .createReaderFunc( + fileSchema -> ParquetAvroValueReaders.buildReader(COMPLEX_SCHEMA, readSchema)) + .build()) { int recordNum = 0; Iterator iter = records.iterator(); for (Record actual : reader) { diff --git a/spark/v3.0/spark/src/test/java/org/apache/iceberg/spark/data/TestSparkAvroEnums.java b/spark/v3.0/spark/src/test/java/org/apache/iceberg/spark/data/TestSparkAvroEnums.java index 3517c32ffebb..6f05a9ed7c1f 100644 --- a/spark/v3.0/spark/src/test/java/org/apache/iceberg/spark/data/TestSparkAvroEnums.java +++ b/spark/v3.0/spark/src/test/java/org/apache/iceberg/spark/data/TestSparkAvroEnums.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.spark.data; import java.io.File; @@ -42,20 +41,20 @@ public class TestSparkAvroEnums { - @Rule - public TemporaryFolder temp = new TemporaryFolder(); + @Rule public TemporaryFolder temp = new TemporaryFolder(); @Test public void writeAndValidateEnums() throws IOException { - org.apache.avro.Schema avroSchema = SchemaBuilder.record("root") - .fields() - .name("enumCol") - .type() - .nullable() - .enumeration("testEnum") - .symbols("SYMB1", "SYMB2") - .enumDefault("SYMB2") - .endRecord(); + org.apache.avro.Schema avroSchema = + SchemaBuilder.record("root") + .fields() + .name("enumCol") + .type() + .nullable() + .enumeration("testEnum") + .symbols("SYMB1", "SYMB2") + .enumDefault("SYMB2") + .endRecord(); org.apache.avro.Schema enumSchema = avroSchema.getField("enumCol").schema().getTypes().get(0); Record enumRecord1 = new GenericData.Record(avroSchema); @@ -77,10 +76,11 @@ public void writeAndValidateEnums() throws IOException { Schema schema = new Schema(AvroSchemaUtil.convert(avroSchema).asStructType().fields()); List rows; - try (AvroIterable reader = Avro.read(Files.localInput(testFile)) - .createReaderFunc(SparkAvroReader::new) - .project(schema) - .build()) { + try (AvroIterable reader = + Avro.read(Files.localInput(testFile)) + .createReaderFunc(SparkAvroReader::new) + .project(schema) + .build()) { rows = Lists.newArrayList(reader); } @@ -88,7 +88,8 @@ public void writeAndValidateEnums() throws IOException { for (int i = 0; i < expected.size(); i += 1) { String expectedEnumString = expected.get(i).get("enumCol") == null ? null : expected.get(i).get("enumCol").toString(); - String sparkString = rows.get(i).getUTF8String(0) == null ? null : rows.get(i).getUTF8String(0).toString(); + String sparkString = + rows.get(i).getUTF8String(0) == null ? null : rows.get(i).getUTF8String(0).toString(); Assert.assertEquals(expectedEnumString, sparkString); } } diff --git a/spark/v3.0/spark/src/test/java/org/apache/iceberg/spark/data/TestSparkAvroReader.java b/spark/v3.0/spark/src/test/java/org/apache/iceberg/spark/data/TestSparkAvroReader.java index e4398df39cc8..6d1ef3db3657 100644 --- a/spark/v3.0/spark/src/test/java/org/apache/iceberg/spark/data/TestSparkAvroReader.java +++ b/spark/v3.0/spark/src/test/java/org/apache/iceberg/spark/data/TestSparkAvroReader.java @@ -16,9 +16,10 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.spark.data; +import static org.apache.iceberg.spark.data.TestHelpers.assertEqualsUnsafe; + import java.io.File; import java.io.IOException; import java.util.List; @@ -32,8 +33,6 @@ import org.apache.spark.sql.catalyst.InternalRow; import org.junit.Assert; -import static org.apache.iceberg.spark.data.TestHelpers.assertEqualsUnsafe; - public class TestSparkAvroReader extends AvroDataTest { @Override protected void writeAndValidate(Schema schema) throws IOException { @@ -42,20 +41,19 @@ protected void writeAndValidate(Schema schema) throws IOException { File testFile = temp.newFile(); Assert.assertTrue("Delete should succeed", testFile.delete()); - try (FileAppender writer = Avro.write(Files.localOutput(testFile)) - .schema(schema) - .named("test") - .build()) { + try (FileAppender writer = + Avro.write(Files.localOutput(testFile)).schema(schema).named("test").build()) { for (Record rec : expected) { writer.add(rec); } } List rows; - try (AvroIterable reader = Avro.read(Files.localInput(testFile)) - .createReaderFunc(SparkAvroReader::new) - .project(schema) - .build()) { + try (AvroIterable reader = + Avro.read(Files.localInput(testFile)) + .createReaderFunc(SparkAvroReader::new) + .project(schema) + .build()) { rows = Lists.newArrayList(reader); } diff --git a/spark/v3.0/spark/src/test/java/org/apache/iceberg/spark/data/TestSparkDateTimes.java b/spark/v3.0/spark/src/test/java/org/apache/iceberg/spark/data/TestSparkDateTimes.java index 89e3550dc0af..259f2ad1e295 100644 --- a/spark/v3.0/spark/src/test/java/org/apache/iceberg/spark/data/TestSparkDateTimes.java +++ b/spark/v3.0/spark/src/test/java/org/apache/iceberg/spark/data/TestSparkDateTimes.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.data; import java.time.ZoneId; @@ -69,7 +68,7 @@ public void checkSparkTimestamp(String timestampString, String sparkRepr) { ZoneId zoneId = DateTimeUtils.getZoneId("UTC"); TimestampFormatter formatter = TimestampFormatter.getFractionFormatter(zoneId); String sparkTimestamp = DateTimeUtils.timestampToString(formatter, ts.value()); - Assert.assertEquals("Should be the same timestamp (" + ts.value() + ")", - sparkRepr, sparkTimestamp); + Assert.assertEquals( + "Should be the same timestamp (" + ts.value() + ")", sparkRepr, sparkTimestamp); } } diff --git a/spark/v3.0/spark/src/test/java/org/apache/iceberg/spark/data/TestSparkOrcReadMetadataColumns.java b/spark/v3.0/spark/src/test/java/org/apache/iceberg/spark/data/TestSparkOrcReadMetadataColumns.java index b8ee56370edf..3c9037adc393 100644 --- a/spark/v3.0/spark/src/test/java/org/apache/iceberg/spark/data/TestSparkOrcReadMetadataColumns.java +++ b/spark/v3.0/spark/src/test/java/org/apache/iceberg/spark/data/TestSparkOrcReadMetadataColumns.java @@ -16,9 +16,10 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.spark.data; +import static org.apache.iceberg.types.Types.NestedField.required; + import java.io.File; import java.io.IOException; import java.util.Iterator; @@ -57,21 +58,18 @@ import org.junit.runner.RunWith; import org.junit.runners.Parameterized; -import static org.apache.iceberg.types.Types.NestedField.required; - @RunWith(Parameterized.class) public class TestSparkOrcReadMetadataColumns { - private static final Schema DATA_SCHEMA = new Schema( - required(100, "id", Types.LongType.get()), - required(101, "data", Types.StringType.get()) - ); - - private static final Schema PROJECTION_SCHEMA = new Schema( - required(100, "id", Types.LongType.get()), - required(101, "data", Types.StringType.get()), - MetadataColumns.ROW_POSITION, - MetadataColumns.IS_DELETED - ); + private static final Schema DATA_SCHEMA = + new Schema( + required(100, "id", Types.LongType.get()), required(101, "data", Types.StringType.get())); + + private static final Schema PROJECTION_SCHEMA = + new Schema( + required(100, "id", Types.LongType.get()), + required(101, "data", Types.StringType.get()), + MetadataColumns.ROW_POSITION, + MetadataColumns.IS_DELETED); private static final int NUM_ROWS = 1000; private static final List DATA_ROWS; @@ -99,11 +97,10 @@ public class TestSparkOrcReadMetadataColumns { @Parameterized.Parameters(name = "vectorized = {0}") public static Object[] parameters() { - return new Object[] { false, true }; + return new Object[] {false, true}; } - @Rule - public TemporaryFolder temp = new TemporaryFolder(); + @Rule public TemporaryFolder temp = new TemporaryFolder(); private boolean vectorized; private File testFile; @@ -117,14 +114,15 @@ public void writeFile() throws IOException { testFile = temp.newFile(); Assert.assertTrue("Delete should succeed", testFile.delete()); - try (FileAppender writer = ORC.write(Files.localOutput(testFile)) - .createWriterFunc(SparkOrcWriter::new) - .schema(DATA_SCHEMA) - // write in such a way that the file contains 10 stripes each with 100 rows - .set("iceberg.orc.vectorbatch.size", "100") - .set(OrcConf.ROWS_BETWEEN_CHECKS.getAttribute(), "100") - .set(OrcConf.STRIPE_SIZE.getAttribute(), "1") - .build()) { + try (FileAppender writer = + ORC.write(Files.localOutput(testFile)) + .createWriterFunc(SparkOrcWriter::new) + .schema(DATA_SCHEMA) + // write in such a way that the file contains 10 stripes each with 100 rows + .set("iceberg.orc.vectorbatch.size", "100") + .set(OrcConf.ROWS_BETWEEN_CHECKS.getAttribute(), "100") + .set(OrcConf.STRIPE_SIZE.getAttribute(), "1") + .build()) { writer.addAll(DATA_ROWS); } } @@ -136,41 +134,54 @@ public void testReadRowNumbers() throws IOException { @Test public void testReadRowNumbersWithFilter() throws IOException { - readAndValidate(Expressions.greaterThanOrEqual("id", 500), null, null, EXPECTED_ROWS.subList(500, 1000)); + readAndValidate( + Expressions.greaterThanOrEqual("id", 500), null, null, EXPECTED_ROWS.subList(500, 1000)); } @Test public void testReadRowNumbersWithSplits() throws IOException { Reader reader; try { - OrcFile.ReaderOptions readerOptions = OrcFile.readerOptions(new Configuration()).useUTCTimestamp(true); - reader = OrcFile.createReader(new Path(testFile.toString()), readerOptions); + OrcFile.ReaderOptions readerOptions = + OrcFile.readerOptions(new Configuration()).useUTCTimestamp(true); + reader = OrcFile.createReader(new Path(testFile.toString()), readerOptions); } catch (IOException ioe) { throw new RuntimeIOException(ioe, "Failed to open file: %s", testFile); } - List 
splitOffsets = reader.getStripes().stream().map(StripeInformation::getOffset) - .collect(Collectors.toList()); - List splitLengths = reader.getStripes().stream().map(StripeInformation::getLength) - .collect(Collectors.toList()); + List splitOffsets = + reader.getStripes().stream().map(StripeInformation::getOffset).collect(Collectors.toList()); + List splitLengths = + reader.getStripes().stream().map(StripeInformation::getLength).collect(Collectors.toList()); for (int i = 0; i < 10; i++) { - readAndValidate(null, splitOffsets.get(i), splitLengths.get(i), EXPECTED_ROWS.subList(i * 100, (i + 1) * 100)); + readAndValidate( + null, + splitOffsets.get(i), + splitLengths.get(i), + EXPECTED_ROWS.subList(i * 100, (i + 1) * 100)); } } - private void readAndValidate(Expression filter, Long splitStart, Long splitLength, List expected) + private void readAndValidate( + Expression filter, Long splitStart, Long splitLength, List expected) throws IOException { - Schema projectionWithoutMetadataFields = TypeUtil.selectNot(PROJECTION_SCHEMA, MetadataColumns.metadataFieldIds()); + Schema projectionWithoutMetadataFields = + TypeUtil.selectNot(PROJECTION_SCHEMA, MetadataColumns.metadataFieldIds()); CloseableIterable reader = null; try { - ORC.ReadBuilder builder = ORC.read(Files.localInput(testFile)) - .project(projectionWithoutMetadataFields); + ORC.ReadBuilder builder = + ORC.read(Files.localInput(testFile)).project(projectionWithoutMetadataFields); if (vectorized) { - builder = builder.createBatchedReaderFunc(readOrcSchema -> - VectorizedSparkOrcReaders.buildReader(PROJECTION_SCHEMA, readOrcSchema, ImmutableMap.of())); + builder = + builder.createBatchedReaderFunc( + readOrcSchema -> + VectorizedSparkOrcReaders.buildReader( + PROJECTION_SCHEMA, readOrcSchema, ImmutableMap.of())); } else { - builder = builder.createReaderFunc(readOrcSchema -> new SparkOrcReader(PROJECTION_SCHEMA, readOrcSchema)); + builder = + builder.createReaderFunc( + readOrcSchema -> new SparkOrcReader(PROJECTION_SCHEMA, readOrcSchema)); } if (filter != null) { diff --git a/spark/v3.0/spark/src/test/java/org/apache/iceberg/spark/data/TestSparkOrcReader.java b/spark/v3.0/spark/src/test/java/org/apache/iceberg/spark/data/TestSparkOrcReader.java index 5042d1cc1338..b23fe729a187 100644 --- a/spark/v3.0/spark/src/test/java/org/apache/iceberg/spark/data/TestSparkOrcReader.java +++ b/spark/v3.0/spark/src/test/java/org/apache/iceberg/spark/data/TestSparkOrcReader.java @@ -16,9 +16,11 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.spark.data; +import static org.apache.iceberg.spark.data.TestHelpers.assertEquals; +import static org.apache.iceberg.types.Types.NestedField.required; + import java.io.File; import java.io.IOException; import java.util.Collections; @@ -38,45 +40,44 @@ import org.junit.Assert; import org.junit.Test; -import static org.apache.iceberg.spark.data.TestHelpers.assertEquals; -import static org.apache.iceberg.types.Types.NestedField.required; - public class TestSparkOrcReader extends AvroDataTest { @Override protected void writeAndValidate(Schema schema) throws IOException { - final Iterable expected = RandomData - .generateSpark(schema, 100, 0L); + final Iterable expected = RandomData.generateSpark(schema, 100, 0L); writeAndValidateRecords(schema, expected); } @Test public void writeAndValidateRepeatingRecords() throws IOException { - Schema structSchema = new Schema( - required(100, "id", Types.LongType.get()), - required(101, "data", Types.StringType.get()) - ); - List expectedRepeating = Collections.nCopies(100, - RandomData.generateSpark(structSchema, 1, 0L).iterator().next()); + Schema structSchema = + new Schema( + required(100, "id", Types.LongType.get()), + required(101, "data", Types.StringType.get())); + List expectedRepeating = + Collections.nCopies(100, RandomData.generateSpark(structSchema, 1, 0L).iterator().next()); writeAndValidateRecords(structSchema, expectedRepeating); } - private void writeAndValidateRecords(Schema schema, Iterable expected) throws IOException { + private void writeAndValidateRecords(Schema schema, Iterable expected) + throws IOException { final File testFile = temp.newFile(); Assert.assertTrue("Delete should succeed", testFile.delete()); - try (FileAppender writer = ORC.write(Files.localOutput(testFile)) - .createWriterFunc(SparkOrcWriter::new) - .schema(schema) - .build()) { + try (FileAppender writer = + ORC.write(Files.localOutput(testFile)) + .createWriterFunc(SparkOrcWriter::new) + .schema(schema) + .build()) { writer.addAll(expected); } - try (CloseableIterable reader = ORC.read(Files.localInput(testFile)) - .project(schema) - .createReaderFunc(readOrcSchema -> new SparkOrcReader(schema, readOrcSchema)) - .build()) { + try (CloseableIterable reader = + ORC.read(Files.localInput(testFile)) + .project(schema) + .createReaderFunc(readOrcSchema -> new SparkOrcReader(schema, readOrcSchema)) + .build()) { final Iterator actualRows = reader.iterator(); final Iterator expectedRows = expected.iterator(); while (expectedRows.hasNext()) { @@ -86,11 +87,13 @@ private void writeAndValidateRecords(Schema schema, Iterable expect Assert.assertFalse("Should not have extra rows", actualRows.hasNext()); } - try (CloseableIterable reader = ORC.read(Files.localInput(testFile)) - .project(schema) - .createBatchedReaderFunc(readOrcSchema -> - VectorizedSparkOrcReaders.buildReader(schema, readOrcSchema, ImmutableMap.of())) - .build()) { + try (CloseableIterable reader = + ORC.read(Files.localInput(testFile)) + .project(schema) + .createBatchedReaderFunc( + readOrcSchema -> + VectorizedSparkOrcReaders.buildReader(schema, readOrcSchema, ImmutableMap.of())) + .build()) { final Iterator actualRows = batchesToRows(reader.iterator()); final Iterator expectedRows = expected.iterator(); while (expectedRows.hasNext()) { diff --git a/spark/v3.0/spark/src/test/java/org/apache/iceberg/spark/data/TestSparkParquetReadMetadataColumns.java b/spark/v3.0/spark/src/test/java/org/apache/iceberg/spark/data/TestSparkParquetReadMetadataColumns.java index 
a8a6313dbfaa..929d08f2cdb6 100644 --- a/spark/v3.0/spark/src/test/java/org/apache/iceberg/spark/data/TestSparkParquetReadMetadataColumns.java +++ b/spark/v3.0/spark/src/test/java/org/apache/iceberg/spark/data/TestSparkParquetReadMetadataColumns.java @@ -16,9 +16,10 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.data; +import static org.apache.iceberg.types.Types.NestedField.required; + import java.io.File; import java.io.IOException; import java.util.Iterator; @@ -58,20 +59,17 @@ import org.junit.runner.RunWith; import org.junit.runners.Parameterized; -import static org.apache.iceberg.types.Types.NestedField.required; - @RunWith(Parameterized.class) public class TestSparkParquetReadMetadataColumns { - private static final Schema DATA_SCHEMA = new Schema( - required(100, "id", Types.LongType.get()), - required(101, "data", Types.StringType.get()) - ); + private static final Schema DATA_SCHEMA = + new Schema( + required(100, "id", Types.LongType.get()), required(101, "data", Types.StringType.get())); - private static final Schema PROJECTION_SCHEMA = new Schema( - required(100, "id", Types.LongType.get()), - required(101, "data", Types.StringType.get()), - MetadataColumns.ROW_POSITION - ); + private static final Schema PROJECTION_SCHEMA = + new Schema( + required(100, "id", Types.LongType.get()), + required(101, "data", Types.StringType.get()), + MetadataColumns.ROW_POSITION); private static final int NUM_ROWS = 1000; private static final List DATA_ROWS; @@ -107,16 +105,12 @@ public class TestSparkParquetReadMetadataColumns { } } - @Parameterized.Parameters(name = "vectorized = {0}") + @Parameterized.Parameters(name = "vectorized = {0}") public static Object[][] parameters() { - return new Object[][] { - new Object[] { false }, - new Object[] { true } - }; + return new Object[][] {new Object[] {false}, new Object[] {true}}; } - @Rule - public TemporaryFolder temp = new TemporaryFolder(); + @Rule public TemporaryFolder temp = new TemporaryFolder(); private final boolean vectorized; private File testFile; @@ -133,28 +127,32 @@ public void writeFile() throws IOException { testFile = temp.newFile(); Assert.assertTrue("Delete should succeed", testFile.delete()); - ParquetFileWriter parquetFileWriter = new ParquetFileWriter( - conf, - ParquetSchemaUtil.convert(DATA_SCHEMA, "testSchema"), - new Path(testFile.getAbsolutePath()) - ); + ParquetFileWriter parquetFileWriter = + new ParquetFileWriter( + conf, + ParquetSchemaUtil.convert(DATA_SCHEMA, "testSchema"), + new Path(testFile.getAbsolutePath())); parquetFileWriter.start(); for (int i = 0; i < NUM_ROW_GROUPS; i += 1) { File split = temp.newFile(); Assert.assertTrue("Delete should succeed", split.delete()); fileSplits.add(new Path(split.getAbsolutePath())); - try (FileAppender writer = Parquet.write(Files.localOutput(split)) - .createWriterFunc(msgType -> SparkParquetWriters.buildWriter(struct, msgType)) - .schema(DATA_SCHEMA) - .overwrite() - .build()) { + try (FileAppender writer = + Parquet.write(Files.localOutput(split)) + .createWriterFunc(msgType -> SparkParquetWriters.buildWriter(struct, msgType)) + .schema(DATA_SCHEMA) + .overwrite() + .build()) { writer.addAll(DATA_ROWS.subList(i * ROWS_PER_SPLIT, (i + 1) * ROWS_PER_SPLIT)); } - parquetFileWriter.appendFile(HadoopInputFile.fromPath(new Path(split.getAbsolutePath()), conf)); + parquetFileWriter.appendFile( + HadoopInputFile.fromPath(new Path(split.getAbsolutePath()), conf)); } - parquetFileWriter - 
.end(ParquetFileWriter.mergeMetadataFiles(fileSplits, conf).getFileMetaData().getKeyValueMetaData()); + parquetFileWriter.end( + ParquetFileWriter.mergeMetadataFiles(fileSplits, conf) + .getFileMetaData() + .getKeyValueMetaData()); } @Test @@ -167,7 +165,8 @@ public void testReadRowNumbersWithFilter() throws IOException { // current iceberg supports row group filter. for (int i = 1; i < 5; i += 1) { readAndValidate( - Expressions.and(Expressions.lessThan("id", NUM_ROWS / 2), + Expressions.and( + Expressions.lessThan("id", NUM_ROWS / 2), Expressions.greaterThanOrEqual("id", i * ROWS_PER_SPLIT)), null, null, @@ -177,28 +176,36 @@ public void testReadRowNumbersWithFilter() throws IOException { @Test public void testReadRowNumbersWithSplits() throws IOException { - ParquetFileReader fileReader = new ParquetFileReader( - HadoopInputFile.fromPath(new Path(testFile.getAbsolutePath()), new Configuration()), - ParquetReadOptions.builder().build()); + ParquetFileReader fileReader = + new ParquetFileReader( + HadoopInputFile.fromPath(new Path(testFile.getAbsolutePath()), new Configuration()), + ParquetReadOptions.builder().build()); List rowGroups = fileReader.getRowGroups(); for (int i = 0; i < NUM_ROW_GROUPS; i += 1) { - readAndValidate(null, + readAndValidate( + null, rowGroups.get(i).getColumns().get(0).getStartingPos(), rowGroups.get(i).getCompressedSize(), EXPECTED_ROWS.subList(i * ROWS_PER_SPLIT, (i + 1) * ROWS_PER_SPLIT)); } } - private void readAndValidate(Expression filter, Long splitStart, Long splitLength, List expected) + private void readAndValidate( + Expression filter, Long splitStart, Long splitLength, List expected) throws IOException { - Parquet.ReadBuilder builder = Parquet.read(Files.localInput(testFile)).project(PROJECTION_SCHEMA); + Parquet.ReadBuilder builder = + Parquet.read(Files.localInput(testFile)).project(PROJECTION_SCHEMA); if (vectorized) { - builder.createBatchedReaderFunc(fileSchema -> VectorizedSparkParquetReaders.buildReader(PROJECTION_SCHEMA, - fileSchema, NullCheckingForGet.NULL_CHECKING_ENABLED)); + builder.createBatchedReaderFunc( + fileSchema -> + VectorizedSparkParquetReaders.buildReader( + PROJECTION_SCHEMA, fileSchema, NullCheckingForGet.NULL_CHECKING_ENABLED)); builder.recordsPerBatch(RECORDS_PER_BATCH); } else { - builder = builder.createReaderFunc(msgType -> SparkParquetReaders.buildReader(PROJECTION_SCHEMA, msgType)); + builder = + builder.createReaderFunc( + msgType -> SparkParquetReaders.buildReader(PROJECTION_SCHEMA, msgType)); } if (filter != null) { @@ -209,7 +216,8 @@ private void readAndValidate(Expression filter, Long splitStart, Long splitLengt builder = builder.split(splitStart, splitLength); } - try (CloseableIterable reader = vectorized ? batchesToRows(builder.build()) : builder.build()) { + try (CloseableIterable reader = + vectorized ? batchesToRows(builder.build()) : builder.build()) { final Iterator actualRows = reader.iterator(); for (InternalRow internalRow : expected) { diff --git a/spark/v3.0/spark/src/test/java/org/apache/iceberg/spark/data/TestSparkParquetReader.java b/spark/v3.0/spark/src/test/java/org/apache/iceberg/spark/data/TestSparkParquetReader.java index 03d234c1eca5..d4b7443e2e20 100644 --- a/spark/v3.0/spark/src/test/java/org/apache/iceberg/spark/data/TestSparkParquetReader.java +++ b/spark/v3.0/spark/src/test/java/org/apache/iceberg/spark/data/TestSparkParquetReader.java @@ -16,9 +16,11 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.spark.data; +import static org.apache.iceberg.spark.data.TestHelpers.assertEqualsUnsafe; +import static org.apache.iceberg.types.Types.NestedField.required; + import java.io.File; import java.io.IOException; import java.util.Iterator; @@ -60,31 +62,31 @@ import org.junit.Assume; import org.junit.Test; -import static org.apache.iceberg.spark.data.TestHelpers.assertEqualsUnsafe; -import static org.apache.iceberg.types.Types.NestedField.required; - public class TestSparkParquetReader extends AvroDataTest { @Override protected void writeAndValidate(Schema schema) throws IOException { - Assume.assumeTrue("Parquet Avro cannot write non-string map keys", null == TypeUtil.find(schema, - type -> type.isMapType() && type.asMapType().keyType() != Types.StringType.get())); + Assume.assumeTrue( + "Parquet Avro cannot write non-string map keys", + null + == TypeUtil.find( + schema, + type -> type.isMapType() && type.asMapType().keyType() != Types.StringType.get())); List expected = RandomData.generateList(schema, 100, 0L); File testFile = temp.newFile(); Assert.assertTrue("Delete should succeed", testFile.delete()); - try (FileAppender writer = Parquet.write(Files.localOutput(testFile)) - .schema(schema) - .named("test") - .build()) { + try (FileAppender writer = + Parquet.write(Files.localOutput(testFile)).schema(schema).named("test").build()) { writer.addAll(expected); } - try (CloseableIterable reader = Parquet.read(Files.localInput(testFile)) - .project(schema) - .createReaderFunc(type -> SparkParquetReaders.buildReader(schema, type)) - .build()) { + try (CloseableIterable reader = + Parquet.read(Files.localInput(testFile)) + .project(schema) + .createReaderFunc(type -> SparkParquetReaders.buildReader(schema, type)) + .build()) { Iterator rows = reader.iterator(); for (int i = 0; i < expected.size(); i += 1) { Assert.assertTrue("Should have expected number of rows", rows.hasNext()); @@ -129,7 +131,8 @@ protected Table tableFromInputFile(InputFile inputFile, Schema schema) throws IO @Test public void testInt96TimestampProducedBySparkIsReadCorrectly() throws IOException { - String outputFilePath = String.format("%s/%s", temp.getRoot().getAbsolutePath(), "parquet_int96.parquet"); + String outputFilePath = + String.format("%s/%s", temp.getRoot().getAbsolutePath(), "parquet_int96.parquet"); HadoopOutputFile outputFile = HadoopOutputFile.fromPath( new org.apache.hadoop.fs.Path(outputFilePath), new Configuration()); @@ -137,7 +140,7 @@ public void testInt96TimestampProducedBySparkIsReadCorrectly() throws IOExceptio StructType sparkSchema = new StructType( new StructField[] { - new StructField("ts", DataTypes.TimestampType, true, Metadata.empty()) + new StructField("ts", DataTypes.TimestampType, true, Metadata.empty()) }); List rows = Lists.newArrayList(RandomData.generateSpark(schema, 10, 0L)); @@ -164,14 +167,14 @@ public void testInt96TimestampProducedBySparkIsReadCorrectly() throws IOExceptio Assert.assertEquals(rows.size(), tableRecords.size()); - for (int i = 0; i < tableRecords.size(); i++) { + for (int i = 0; i < tableRecords.size(); i++) { GenericsHelpers.assertEqualsUnsafe(schema.asStruct(), tableRecords.get(i), rows.get(i)); } } /** - * Native Spark ParquetWriter.Builder implementation so that we can write timestamps using Spark's native - * ParquetWriteSupport. + * Native Spark ParquetWriter.Builder implementation so that we can write timestamps using Spark's + * native ParquetWriteSupport. 
*/ private static class NativeSparkWriterBuilder extends ParquetWriter.Builder { diff --git a/spark/v3.0/spark/src/test/java/org/apache/iceberg/spark/data/TestSparkParquetWriter.java b/spark/v3.0/spark/src/test/java/org/apache/iceberg/spark/data/TestSparkParquetWriter.java index c75a87abc45c..261fb8838aa4 100644 --- a/spark/v3.0/spark/src/test/java/org/apache/iceberg/spark/data/TestSparkParquetWriter.java +++ b/spark/v3.0/spark/src/test/java/org/apache/iceberg/spark/data/TestSparkParquetWriter.java @@ -16,9 +16,11 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.data; +import static org.apache.iceberg.types.Types.NestedField.optional; +import static org.apache.iceberg.types.Types.NestedField.required; + import java.io.File; import java.io.IOException; import java.util.Iterator; @@ -35,39 +37,51 @@ import org.junit.Test; import org.junit.rules.TemporaryFolder; -import static org.apache.iceberg.types.Types.NestedField.optional; -import static org.apache.iceberg.types.Types.NestedField.required; - public class TestSparkParquetWriter { - @Rule - public TemporaryFolder temp = new TemporaryFolder(); + @Rule public TemporaryFolder temp = new TemporaryFolder(); - private static final Schema COMPLEX_SCHEMA = new Schema( - required(1, "roots", Types.LongType.get()), - optional(3, "lime", Types.ListType.ofRequired(4, Types.DoubleType.get())), - required(5, "strict", Types.StructType.of( - required(9, "tangerine", Types.StringType.get()), - optional(6, "hopeful", Types.StructType.of( - required(7, "steel", Types.FloatType.get()), - required(8, "lantern", Types.DateType.get()) - )), - optional(10, "vehement", Types.LongType.get()) - )), - optional(11, "metamorphosis", Types.MapType.ofRequired(12, 13, - Types.StringType.get(), Types.TimestampType.withZone())), - required(14, "winter", Types.ListType.ofOptional(15, Types.StructType.of( - optional(16, "beet", Types.DoubleType.get()), - required(17, "stamp", Types.FloatType.get()), - optional(18, "wheeze", Types.StringType.get()) - ))), - optional(19, "renovate", Types.MapType.ofRequired(20, 21, - Types.StringType.get(), Types.StructType.of( - optional(22, "jumpy", Types.DoubleType.get()), - required(23, "koala", Types.IntegerType.get()), - required(24, "couch rope", Types.IntegerType.get()) - ))), - optional(2, "slide", Types.StringType.get()) - ); + private static final Schema COMPLEX_SCHEMA = + new Schema( + required(1, "roots", Types.LongType.get()), + optional(3, "lime", Types.ListType.ofRequired(4, Types.DoubleType.get())), + required( + 5, + "strict", + Types.StructType.of( + required(9, "tangerine", Types.StringType.get()), + optional( + 6, + "hopeful", + Types.StructType.of( + required(7, "steel", Types.FloatType.get()), + required(8, "lantern", Types.DateType.get()))), + optional(10, "vehement", Types.LongType.get()))), + optional( + 11, + "metamorphosis", + Types.MapType.ofRequired( + 12, 13, Types.StringType.get(), Types.TimestampType.withZone())), + required( + 14, + "winter", + Types.ListType.ofOptional( + 15, + Types.StructType.of( + optional(16, "beet", Types.DoubleType.get()), + required(17, "stamp", Types.FloatType.get()), + optional(18, "wheeze", Types.StringType.get())))), + optional( + 19, + "renovate", + Types.MapType.ofRequired( + 20, + 21, + Types.StringType.get(), + Types.StructType.of( + optional(22, "jumpy", Types.DoubleType.get()), + required(23, "koala", Types.IntegerType.get()), + required(24, "couch rope", Types.IntegerType.get())))), + optional(2, 
"slide", Types.StringType.get())); @Test public void testCorrectness() throws IOException { @@ -77,17 +91,22 @@ public void testCorrectness() throws IOException { File testFile = temp.newFile(); Assert.assertTrue("Delete should succeed", testFile.delete()); - try (FileAppender writer = Parquet.write(Files.localOutput(testFile)) - .schema(COMPLEX_SCHEMA) - .createWriterFunc(msgType -> SparkParquetWriters.buildWriter(SparkSchemaUtil.convert(COMPLEX_SCHEMA), msgType)) - .build()) { + try (FileAppender writer = + Parquet.write(Files.localOutput(testFile)) + .schema(COMPLEX_SCHEMA) + .createWriterFunc( + msgType -> + SparkParquetWriters.buildWriter( + SparkSchemaUtil.convert(COMPLEX_SCHEMA), msgType)) + .build()) { writer.addAll(records); } - try (CloseableIterable reader = Parquet.read(Files.localInput(testFile)) - .project(COMPLEX_SCHEMA) - .createReaderFunc(type -> SparkParquetReaders.buildReader(COMPLEX_SCHEMA, type)) - .build()) { + try (CloseableIterable reader = + Parquet.read(Files.localInput(testFile)) + .project(COMPLEX_SCHEMA) + .createReaderFunc(type -> SparkParquetReaders.buildReader(COMPLEX_SCHEMA, type)) + .build()) { Iterator expected = records.iterator(); Iterator rows = reader.iterator(); for (int i = 0; i < numRows; i += 1) { diff --git a/spark/v3.0/spark/src/test/java/org/apache/iceberg/spark/data/TestSparkRecordOrcReaderWriter.java b/spark/v3.0/spark/src/test/java/org/apache/iceberg/spark/data/TestSparkRecordOrcReaderWriter.java index 1e7430d16df7..d10e7f5a19e3 100644 --- a/spark/v3.0/spark/src/test/java/org/apache/iceberg/spark/data/TestSparkRecordOrcReaderWriter.java +++ b/spark/v3.0/spark/src/test/java/org/apache/iceberg/spark/data/TestSparkRecordOrcReaderWriter.java @@ -16,9 +16,10 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.data; +import static org.apache.iceberg.types.Types.NestedField.required; + import java.io.File; import java.io.IOException; import java.math.BigDecimal; @@ -40,8 +41,6 @@ import org.junit.Assert; import org.junit.Test; -import static org.apache.iceberg.types.Types.NestedField.required; - public class TestSparkRecordOrcReaderWriter extends AvroDataTest { private static final int NUM_RECORDS = 200; @@ -50,19 +49,21 @@ private void writeAndValidate(Schema schema, List expectedRecords) throw Assert.assertTrue("Delete should succeed", originalFile.delete()); // Write few generic records into the original test file. - try (FileAppender writer = ORC.write(Files.localOutput(originalFile)) - .createWriterFunc(GenericOrcWriter::buildWriter) - .schema(schema) - .build()) { + try (FileAppender writer = + ORC.write(Files.localOutput(originalFile)) + .createWriterFunc(GenericOrcWriter::buildWriter) + .schema(schema) + .build()) { writer.addAll(expectedRecords); } // Read into spark InternalRow from the original test file. 
List internalRows = Lists.newArrayList(); - try (CloseableIterable reader = ORC.read(Files.localInput(originalFile)) - .project(schema) - .createReaderFunc(readOrcSchema -> new SparkOrcReader(schema, readOrcSchema)) - .build()) { + try (CloseableIterable reader = + ORC.read(Files.localInput(originalFile)) + .project(schema) + .createReaderFunc(readOrcSchema -> new SparkOrcReader(schema, readOrcSchema)) + .build()) { reader.forEach(internalRows::add); assertEqualsUnsafe(schema.asStruct(), expectedRecords, reader, expectedRecords.size()); } @@ -71,26 +72,29 @@ private void writeAndValidate(Schema schema, List expectedRecords) throw Assert.assertTrue("Delete should succeed", anotherFile.delete()); // Write those spark InternalRows into a new file again. - try (FileAppender writer = ORC.write(Files.localOutput(anotherFile)) - .createWriterFunc(SparkOrcWriter::new) - .schema(schema) - .build()) { + try (FileAppender writer = + ORC.write(Files.localOutput(anotherFile)) + .createWriterFunc(SparkOrcWriter::new) + .schema(schema) + .build()) { writer.addAll(internalRows); } // Check whether the InternalRows are expected records. - try (CloseableIterable reader = ORC.read(Files.localInput(anotherFile)) - .project(schema) - .createReaderFunc(readOrcSchema -> new SparkOrcReader(schema, readOrcSchema)) - .build()) { + try (CloseableIterable reader = + ORC.read(Files.localInput(anotherFile)) + .project(schema) + .createReaderFunc(readOrcSchema -> new SparkOrcReader(schema, readOrcSchema)) + .build()) { assertEqualsUnsafe(schema.asStruct(), expectedRecords, reader, expectedRecords.size()); } // Read into iceberg GenericRecord and check again. - try (CloseableIterable reader = ORC.read(Files.localInput(anotherFile)) - .createReaderFunc(typeDesc -> GenericOrcReader.buildReader(schema, typeDesc)) - .project(schema) - .build()) { + try (CloseableIterable reader = + ORC.read(Files.localInput(anotherFile)) + .createReaderFunc(typeDesc -> GenericOrcReader.buildReader(schema, typeDesc)) + .project(schema) + .build()) { assertRecordEquals(expectedRecords, reader, expectedRecords.size()); } } @@ -103,11 +107,11 @@ protected void writeAndValidate(Schema schema) throws IOException { @Test public void testDecimalWithTrailingZero() throws IOException { - Schema schema = new Schema( - required(1, "d1", Types.DecimalType.of(10, 2)), - required(2, "d2", Types.DecimalType.of(20, 5)), - required(3, "d3", Types.DecimalType.of(38, 20)) - ); + Schema schema = + new Schema( + required(1, "d1", Types.DecimalType.of(10, 2)), + required(2, "d2", Types.DecimalType.of(20, 5)), + required(3, "d3", Types.DecimalType.of(38, 20))); List expected = Lists.newArrayList(); @@ -121,7 +125,8 @@ public void testDecimalWithTrailingZero() throws IOException { writeAndValidate(schema, expected); } - private static void assertRecordEquals(Iterable expected, Iterable actual, int size) { + private static void assertRecordEquals( + Iterable expected, Iterable actual, int size) { Iterator expectedIter = expected.iterator(); Iterator actualIter = actual.iterator(); for (int i = 0; i < size; i += 1) { @@ -133,8 +138,8 @@ private static void assertRecordEquals(Iterable expected, Iterable expected, - Iterable actual, int size) { + private static void assertEqualsUnsafe( + Types.StructType struct, Iterable expected, Iterable actual, int size) { Iterator expectedIter = expected.iterator(); Iterator actualIter = actual.iterator(); for (int i = 0; i < size; i += 1) { diff --git 
a/spark/v3.0/spark/src/test/java/org/apache/iceberg/spark/data/parquet/vectorized/TestParquetDictionaryEncodedVectorizedReads.java b/spark/v3.0/spark/src/test/java/org/apache/iceberg/spark/data/parquet/vectorized/TestParquetDictionaryEncodedVectorizedReads.java index f292df0c3bf8..756f49a2aad6 100644 --- a/spark/v3.0/spark/src/test/java/org/apache/iceberg/spark/data/parquet/vectorized/TestParquetDictionaryEncodedVectorizedReads.java +++ b/spark/v3.0/spark/src/test/java/org/apache/iceberg/spark/data/parquet/vectorized/TestParquetDictionaryEncodedVectorizedReads.java @@ -16,9 +16,10 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.data.parquet.vectorized; +import static org.apache.iceberg.TableProperties.PARQUET_ROW_GROUP_SIZE_BYTES_DEFAULT; + import java.io.File; import java.io.IOException; import org.apache.avro.generic.GenericData; @@ -35,42 +36,42 @@ import org.junit.Ignore; import org.junit.Test; -import static org.apache.iceberg.TableProperties.PARQUET_ROW_GROUP_SIZE_BYTES_DEFAULT; - public class TestParquetDictionaryEncodedVectorizedReads extends TestParquetVectorizedReads { @Override - Iterable generateData(Schema schema, int numRecords, long seed, float nullPercentage, - Function transform) { - Iterable data = RandomData.generateDictionaryEncodableData(schema, numRecords, seed, nullPercentage); + Iterable generateData( + Schema schema, + int numRecords, + long seed, + float nullPercentage, + Function transform) { + Iterable data = + RandomData.generateDictionaryEncodableData(schema, numRecords, seed, nullPercentage); return transform == IDENTITY ? data : Iterables.transform(data, transform); } @Test @Override @Ignore // Ignored since this code path is already tested in TestParquetVectorizedReads - public void testVectorizedReadsWithNewContainers() throws IOException { - - } + public void testVectorizedReadsWithNewContainers() throws IOException {} @Test public void testMixedDictionaryNonDictionaryReads() throws IOException { Schema schema = new Schema(SUPPORTED_PRIMITIVES.fields()); File dictionaryEncodedFile = temp.newFile(); Assert.assertTrue("Delete should succeed", dictionaryEncodedFile.delete()); - Iterable dictionaryEncodableData = RandomData.generateDictionaryEncodableData( - schema, - 10000, - 0L, - RandomData.DEFAULT_NULL_PERCENTAGE); - try (FileAppender writer = getParquetWriter(schema, dictionaryEncodedFile)) { + Iterable dictionaryEncodableData = + RandomData.generateDictionaryEncodableData( + schema, 10000, 0L, RandomData.DEFAULT_NULL_PERCENTAGE); + try (FileAppender writer = + getParquetWriter(schema, dictionaryEncodedFile)) { writer.addAll(dictionaryEncodableData); } File plainEncodingFile = temp.newFile(); Assert.assertTrue("Delete should succeed", plainEncodingFile.delete()); - Iterable nonDictionaryData = RandomData.generate(schema, 10000, 0L, - RandomData.DEFAULT_NULL_PERCENTAGE); + Iterable nonDictionaryData = + RandomData.generate(schema, 10000, 0L, RandomData.DEFAULT_NULL_PERCENTAGE); try (FileAppender writer = getParquetWriter(schema, plainEncodingFile)) { writer.addAll(nonDictionaryData); } @@ -78,15 +79,19 @@ public void testMixedDictionaryNonDictionaryReads() throws IOException { int rowGroupSize = PARQUET_ROW_GROUP_SIZE_BYTES_DEFAULT; File mixedFile = temp.newFile(); Assert.assertTrue("Delete should succeed", mixedFile.delete()); - Parquet.concat(ImmutableList.of(dictionaryEncodedFile, plainEncodingFile, dictionaryEncodedFile), - mixedFile, rowGroupSize, schema, ImmutableMap.of()); + 
Parquet.concat( + ImmutableList.of(dictionaryEncodedFile, plainEncodingFile, dictionaryEncodedFile), + mixedFile, + rowGroupSize, + schema, + ImmutableMap.of()); assertRecordsMatch( - schema, - 30000, - FluentIterable.concat(dictionaryEncodableData, nonDictionaryData, dictionaryEncodableData), - mixedFile, - false, - true, - BATCH_SIZE); + schema, + 30000, + FluentIterable.concat(dictionaryEncodableData, nonDictionaryData, dictionaryEncodableData), + mixedFile, + false, + true, + BATCH_SIZE); } } diff --git a/spark/v3.0/spark/src/test/java/org/apache/iceberg/spark/data/parquet/vectorized/TestParquetDictionaryFallbackToPlainEncodingVectorizedReads.java b/spark/v3.0/spark/src/test/java/org/apache/iceberg/spark/data/parquet/vectorized/TestParquetDictionaryFallbackToPlainEncodingVectorizedReads.java index 5ceac3fdb76e..42ea34936b5f 100644 --- a/spark/v3.0/spark/src/test/java/org/apache/iceberg/spark/data/parquet/vectorized/TestParquetDictionaryFallbackToPlainEncodingVectorizedReads.java +++ b/spark/v3.0/spark/src/test/java/org/apache/iceberg/spark/data/parquet/vectorized/TestParquetDictionaryFallbackToPlainEncodingVectorizedReads.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.data.parquet.vectorized; import java.io.File; @@ -33,7 +32,8 @@ import org.junit.Ignore; import org.junit.Test; -public class TestParquetDictionaryFallbackToPlainEncodingVectorizedReads extends TestParquetVectorizedReads { +public class TestParquetDictionaryFallbackToPlainEncodingVectorizedReads + extends TestParquetVectorizedReads { private static final int NUM_ROWS = 1_000_000; @Override @@ -42,15 +42,20 @@ protected int getNumRows() { } @Override - Iterable generateData(Schema schema, int numRecords, long seed, float nullPercentage, - Function transform) { + Iterable generateData( + Schema schema, + int numRecords, + long seed, + float nullPercentage, + Function transform) { // TODO: take into account nullPercentage when generating fallback encoding data Iterable data = RandomData.generateFallbackData(schema, numRecords, seed, numRecords / 20); return transform == IDENTITY ? 
data : Iterables.transform(data, transform); } @Override - FileAppender getParquetWriter(Schema schema, File testFile) throws IOException { + FileAppender getParquetWriter(Schema schema, File testFile) + throws IOException { return Parquet.write(Files.localOutput(testFile)) .schema(schema) .named("test") @@ -61,14 +66,10 @@ FileAppender getParquetWriter(Schema schema, File testFile) @Test @Override @Ignore // Fallback encoding not triggered when data is mostly null - public void testMostlyNullsForOptionalFields() { - - } + public void testMostlyNullsForOptionalFields() {} @Test @Override @Ignore // Ignored since this code path is already tested in TestParquetVectorizedReads - public void testVectorizedReadsWithNewContainers() throws IOException { - - } + public void testVectorizedReadsWithNewContainers() throws IOException {} } diff --git a/spark/v3.0/spark/src/test/java/org/apache/iceberg/spark/data/parquet/vectorized/TestParquetVectorizedReads.java b/spark/v3.0/spark/src/test/java/org/apache/iceberg/spark/data/parquet/vectorized/TestParquetVectorizedReads.java index 48dcc94a5fce..8908a23fad8f 100644 --- a/spark/v3.0/spark/src/test/java/org/apache/iceberg/spark/data/parquet/vectorized/TestParquetVectorizedReads.java +++ b/spark/v3.0/spark/src/test/java/org/apache/iceberg/spark/data/parquet/vectorized/TestParquetVectorizedReads.java @@ -16,9 +16,11 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.data.parquet.vectorized; +import static org.apache.iceberg.types.Types.NestedField.optional; +import static org.apache.iceberg.types.Types.NestedField.required; + import java.io.File; import java.io.IOException; import java.util.Iterator; @@ -49,9 +51,6 @@ import org.junit.Ignore; import org.junit.Test; -import static org.apache.iceberg.types.Types.NestedField.optional; -import static org.apache.iceberg.types.Types.NestedField.required; - public class TestParquetVectorizedReads extends AvroDataTest { private static final int NUM_ROWS = 200_000; static final int BATCH_SIZE = 10_000; @@ -64,24 +63,44 @@ protected void writeAndValidate(Schema schema) throws IOException { } private void writeAndValidate( - Schema schema, int numRecords, long seed, float nullPercentage, - boolean setAndCheckArrowValidityVector, boolean reuseContainers) - throws IOException { - writeAndValidate(schema, numRecords, seed, nullPercentage, - setAndCheckArrowValidityVector, reuseContainers, BATCH_SIZE, IDENTITY); + Schema schema, + int numRecords, + long seed, + float nullPercentage, + boolean setAndCheckArrowValidityVector, + boolean reuseContainers) + throws IOException { + writeAndValidate( + schema, + numRecords, + seed, + nullPercentage, + setAndCheckArrowValidityVector, + reuseContainers, + BATCH_SIZE, + IDENTITY); } private void writeAndValidate( - Schema schema, int numRecords, long seed, float nullPercentage, - boolean setAndCheckArrowValidityVector, boolean reuseContainers, int batchSize, - Function transform) + Schema schema, + int numRecords, + long seed, + float nullPercentage, + boolean setAndCheckArrowValidityVector, + boolean reuseContainers, + int batchSize, + Function transform) throws IOException { // Write test data - Assume.assumeTrue("Parquet Avro cannot write non-string map keys", null == TypeUtil.find( - schema, - type -> type.isMapType() && type.asMapType().keyType() != Types.StringType.get())); + Assume.assumeTrue( + "Parquet Avro cannot write non-string map keys", + null + == TypeUtil.find( + schema, + type -> type.isMapType() 
&& type.asMapType().keyType() != Types.StringType.get())); - Iterable expected = generateData(schema, numRecords, seed, nullPercentage, transform); + Iterable expected = + generateData(schema, numRecords, seed, nullPercentage, transform); // write a test parquet file using iceberg writer File testFile = temp.newFile(); @@ -90,58 +109,74 @@ private void writeAndValidate( try (FileAppender writer = getParquetWriter(schema, testFile)) { writer.addAll(expected); } - assertRecordsMatch(schema, numRecords, expected, testFile, setAndCheckArrowValidityVector, - reuseContainers, batchSize); + assertRecordsMatch( + schema, + numRecords, + expected, + testFile, + setAndCheckArrowValidityVector, + reuseContainers, + batchSize); } protected int getNumRows() { return NUM_ROWS; } - Iterable generateData(Schema schema, int numRecords, long seed, float nullPercentage, - Function transform) { - Iterable data = RandomData.generate(schema, numRecords, seed, nullPercentage); + Iterable generateData( + Schema schema, + int numRecords, + long seed, + float nullPercentage, + Function transform) { + Iterable data = + RandomData.generate(schema, numRecords, seed, nullPercentage); return transform == IDENTITY ? data : Iterables.transform(data, transform); } - FileAppender getParquetWriter(Schema schema, File testFile) throws IOException { + FileAppender getParquetWriter(Schema schema, File testFile) + throws IOException { + return Parquet.write(Files.localOutput(testFile)).schema(schema).named("test").build(); + } + + FileAppender getParquetV2Writer(Schema schema, File testFile) + throws IOException { return Parquet.write(Files.localOutput(testFile)) .schema(schema) .named("test") + .writerVersion(ParquetProperties.WriterVersion.PARQUET_2_0) .build(); } - FileAppender getParquetV2Writer(Schema schema, File testFile) throws IOException { - return Parquet.write(Files.localOutput(testFile)) - .schema(schema) - .named("test") - .writerVersion(ParquetProperties.WriterVersion.PARQUET_2_0) - .build(); - } - void assertRecordsMatch( - Schema schema, int expectedSize, Iterable expected, File testFile, - boolean setAndCheckArrowValidityBuffer, boolean reuseContainers, int batchSize) + Schema schema, + int expectedSize, + Iterable expected, + File testFile, + boolean setAndCheckArrowValidityBuffer, + boolean reuseContainers, + int batchSize) throws IOException { - Parquet.ReadBuilder readBuilder = Parquet.read(Files.localInput(testFile)) - .project(schema) - .recordsPerBatch(batchSize) - .createBatchedReaderFunc(type -> VectorizedSparkParquetReaders.buildReader( - schema, - type, - setAndCheckArrowValidityBuffer)); + Parquet.ReadBuilder readBuilder = + Parquet.read(Files.localInput(testFile)) + .project(schema) + .recordsPerBatch(batchSize) + .createBatchedReaderFunc( + type -> + VectorizedSparkParquetReaders.buildReader( + schema, type, setAndCheckArrowValidityBuffer)); if (reuseContainers) { readBuilder.reuseContainers(); } - try (CloseableIterable batchReader = - readBuilder.build()) { + try (CloseableIterable batchReader = readBuilder.build()) { Iterator expectedIter = expected.iterator(); Iterator batches = batchReader.iterator(); int numRowsRead = 0; while (batches.hasNext()) { ColumnarBatch batch = batches.next(); numRowsRead += batch.numRows(); - TestHelpers.assertEqualsBatch(schema.asStruct(), expectedIter, batch, setAndCheckArrowValidityBuffer); + TestHelpers.assertEqualsBatch( + schema.asStruct(), expectedIter, batch, setAndCheckArrowValidityBuffer); } Assert.assertEquals(expectedSize, numRowsRead); } @@ -149,38 
+184,31 @@ void assertRecordsMatch( @Test @Ignore - public void testArray() { - } + public void testArray() {} @Test @Ignore - public void testArrayOfStructs() { - } + public void testArrayOfStructs() {} @Test @Ignore - public void testMap() { - } + public void testMap() {} @Test @Ignore - public void testNumericMapKey() { - } + public void testNumericMapKey() {} @Test @Ignore - public void testComplexMapKey() { - } + public void testComplexMapKey() {} @Test @Ignore - public void testMapOfStructs() { - } + public void testMapOfStructs() {} @Test @Ignore - public void testMixedTypes() { - } + public void testMixedTypes() {} @Test @Override @@ -189,13 +217,13 @@ public void testNestedStruct() { "Vectorized reads are not supported yet for struct fields", UnsupportedOperationException.class, "Vectorized reads are not supported yet for struct fields", - () -> VectorizedSparkParquetReaders.buildReader( - TypeUtil.assignIncreasingFreshIds(new Schema(required( - 1, - "struct", - SUPPORTED_PRIMITIVES))), - new MessageType("struct", new GroupType(Type.Repetition.OPTIONAL, "struct").withId(1)), - false)); + () -> + VectorizedSparkParquetReaders.buildReader( + TypeUtil.assignIncreasingFreshIds( + new Schema(required(1, "struct", SUPPORTED_PRIMITIVES))), + new MessageType( + "struct", new GroupType(Type.Repetition.OPTIONAL, "struct").withId(1)), + false)); } @Test @@ -211,27 +239,40 @@ public void testMostlyNullsForOptionalFields() throws IOException { @Test public void testSettingArrowValidityVector() throws IOException { - writeAndValidate(new Schema( - Lists.transform(SUPPORTED_PRIMITIVES.fields(), Types.NestedField::asOptional)), - getNumRows(), 0L, RandomData.DEFAULT_NULL_PERCENTAGE, true, true); + writeAndValidate( + new Schema(Lists.transform(SUPPORTED_PRIMITIVES.fields(), Types.NestedField::asOptional)), + getNumRows(), + 0L, + RandomData.DEFAULT_NULL_PERCENTAGE, + true, + true); } @Test public void testVectorizedReadsWithNewContainers() throws IOException { - writeAndValidate(TypeUtil.assignIncreasingFreshIds(new Schema(SUPPORTED_PRIMITIVES.fields())), - getNumRows(), 0L, RandomData.DEFAULT_NULL_PERCENTAGE, true, false); + writeAndValidate( + TypeUtil.assignIncreasingFreshIds(new Schema(SUPPORTED_PRIMITIVES.fields())), + getNumRows(), + 0L, + RandomData.DEFAULT_NULL_PERCENTAGE, + true, + false); } @Test public void testVectorizedReadsWithReallocatedArrowBuffers() throws IOException { // With a batch size of 2, 256 bytes are allocated in the VarCharVector. By adding strings of // length 512, the vector will need to be reallocated for storing the batch. 
- writeAndValidate(new Schema( + writeAndValidate( + new Schema( Lists.newArrayList( - SUPPORTED_PRIMITIVES.field("id"), - SUPPORTED_PRIMITIVES.field("data"))), - 10, 0L, RandomData.DEFAULT_NULL_PERCENTAGE, - true, true, 2, + SUPPORTED_PRIMITIVES.field("id"), SUPPORTED_PRIMITIVES.field("data"))), + 10, + 0L, + RandomData.DEFAULT_NULL_PERCENTAGE, + true, + true, + 2, record -> { if (record.get("data") != null) { record.put("data", Strings.padEnd((String) record.get("data"), 512, 'a')); @@ -244,65 +285,67 @@ record -> { @Test public void testReadsForTypePromotedColumns() throws Exception { - Schema writeSchema = new Schema( - required(100, "id", Types.LongType.get()), - optional(101, "int_data", Types.IntegerType.get()), - optional(102, "float_data", Types.FloatType.get()), - optional(103, "decimal_data", Types.DecimalType.of(10, 5)) - ); + Schema writeSchema = + new Schema( + required(100, "id", Types.LongType.get()), + optional(101, "int_data", Types.IntegerType.get()), + optional(102, "float_data", Types.FloatType.get()), + optional(103, "decimal_data", Types.DecimalType.of(10, 5))); File dataFile = temp.newFile(); Assert.assertTrue("Delete should succeed", dataFile.delete()); - Iterable data = generateData(writeSchema, 30000, 0L, - RandomData.DEFAULT_NULL_PERCENTAGE, IDENTITY); + Iterable data = + generateData(writeSchema, 30000, 0L, RandomData.DEFAULT_NULL_PERCENTAGE, IDENTITY); try (FileAppender writer = getParquetWriter(writeSchema, dataFile)) { writer.addAll(data); } - Schema readSchema = new Schema( - required(100, "id", Types.LongType.get()), - optional(101, "int_data", Types.LongType.get()), - optional(102, "float_data", Types.DoubleType.get()), - optional(103, "decimal_data", Types.DecimalType.of(25, 5)) - ); + Schema readSchema = + new Schema( + required(100, "id", Types.LongType.get()), + optional(101, "int_data", Types.LongType.get()), + optional(102, "float_data", Types.DoubleType.get()), + optional(103, "decimal_data", Types.DecimalType.of(25, 5))); - assertRecordsMatch(readSchema, 30000, data, dataFile, false, - true, BATCH_SIZE); + assertRecordsMatch(readSchema, 30000, data, dataFile, false, true, BATCH_SIZE); } @Test public void testSupportedReadsForParquetV2() throws Exception { // Only float and double column types are written using plain encoding with Parquet V2 - Schema schema = new Schema( + Schema schema = + new Schema( optional(102, "float_data", Types.FloatType.get()), optional(103, "double_data", Types.DoubleType.get())); File dataFile = temp.newFile(); Assert.assertTrue("Delete should succeed", dataFile.delete()); - Iterable data = generateData(schema, 30000, 0L, - RandomData.DEFAULT_NULL_PERCENTAGE, IDENTITY); + Iterable data = + generateData(schema, 30000, 0L, RandomData.DEFAULT_NULL_PERCENTAGE, IDENTITY); try (FileAppender writer = getParquetV2Writer(schema, dataFile)) { writer.addAll(data); } - assertRecordsMatch(schema, 30000, data, dataFile, false, - true, BATCH_SIZE); + assertRecordsMatch(schema, 30000, data, dataFile, false, true, BATCH_SIZE); } @Test public void testUnsupportedReadsForParquetV2() throws Exception { - // Longs, ints, string types etc use delta encoding and which are not supported for vectorized reads + // Longs, ints, string types etc use delta encoding and which are not supported for vectorized + // reads Schema schema = new Schema(SUPPORTED_PRIMITIVES.fields()); File dataFile = temp.newFile(); Assert.assertTrue("Delete should succeed", dataFile.delete()); - Iterable data = generateData(schema, 30000, 0L, - 
RandomData.DEFAULT_NULL_PERCENTAGE, IDENTITY); + Iterable data = + generateData(schema, 30000, 0L, RandomData.DEFAULT_NULL_PERCENTAGE, IDENTITY); try (FileAppender writer = getParquetV2Writer(schema, dataFile)) { writer.addAll(data); } - AssertHelpers.assertThrows("Vectorized reads not supported", - UnsupportedOperationException.class, "Cannot support vectorized reads for column", () -> { - assertRecordsMatch(schema, 30000, data, dataFile, false, - true, BATCH_SIZE); + AssertHelpers.assertThrows( + "Vectorized reads not supported", + UnsupportedOperationException.class, + "Cannot support vectorized reads for column", + () -> { + assertRecordsMatch(schema, 30000, data, dataFile, false, true, BATCH_SIZE); return null; }); } diff --git a/spark/v3.0/spark/src/test/java/org/apache/iceberg/spark/source/LogMessage.java b/spark/v3.0/spark/src/test/java/org/apache/iceberg/spark/source/LogMessage.java index 5e22daeb0841..53a35eec61ce 100644 --- a/spark/v3.0/spark/src/test/java/org/apache/iceberg/spark/source/LogMessage.java +++ b/spark/v3.0/spark/src/test/java/org/apache/iceberg/spark/source/LogMessage.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.source; import java.time.Instant; diff --git a/spark/v3.0/spark/src/test/java/org/apache/iceberg/spark/source/ManualSource.java b/spark/v3.0/spark/src/test/java/org/apache/iceberg/spark/source/ManualSource.java index 0f8c8b3b65c6..c9c1c29ea8fc 100644 --- a/spark/v3.0/spark/src/test/java/org/apache/iceberg/spark/source/ManualSource.java +++ b/spark/v3.0/spark/src/test/java/org/apache/iceberg/spark/source/ManualSource.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.source; import java.util.Map; @@ -35,7 +34,8 @@ public class ManualSource implements TableProvider, DataSourceRegister { private static final Map tableMap = Maps.newHashMap(); public static void setTable(String name, Table table) { - Preconditions.checkArgument(!tableMap.containsKey(name), "Cannot set " + name + ". It is already set"); + Preconditions.checkArgument( + !tableMap.containsKey(name), "Cannot set " + name + ". It is already set"); tableMap.put(name, table); } @@ -61,7 +61,8 @@ public Transform[] inferPartitioning(CaseInsensitiveStringMap options) { @Override public org.apache.spark.sql.connector.catalog.Table getTable( StructType schema, Transform[] partitioning, Map properties) { - Preconditions.checkArgument(properties.containsKey(TABLE_NAME), "Missing property " + TABLE_NAME); + Preconditions.checkArgument( + properties.containsKey(TABLE_NAME), "Missing property " + TABLE_NAME); String tableName = properties.get(TABLE_NAME); Preconditions.checkArgument(tableMap.containsKey(tableName), "Table missing " + tableName); return tableMap.get(tableName); diff --git a/spark/v3.0/spark/src/test/java/org/apache/iceberg/spark/source/SimpleRecord.java b/spark/v3.0/spark/src/test/java/org/apache/iceberg/spark/source/SimpleRecord.java index c8b7a31b3ba0..550e20b9338e 100644 --- a/spark/v3.0/spark/src/test/java/org/apache/iceberg/spark/source/SimpleRecord.java +++ b/spark/v3.0/spark/src/test/java/org/apache/iceberg/spark/source/SimpleRecord.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.spark.source; import org.apache.iceberg.relocated.com.google.common.base.Objects; @@ -25,8 +24,7 @@ public class SimpleRecord { private Integer id; private String data; - public SimpleRecord() { - } + public SimpleRecord() {} public SimpleRecord(Integer id, String data) { this.id = id; diff --git a/spark/v3.0/spark/src/test/java/org/apache/iceberg/spark/source/SparkTestTable.java b/spark/v3.0/spark/src/test/java/org/apache/iceberg/spark/source/SparkTestTable.java index afb1136f4fa5..ff0afd87782d 100644 --- a/spark/v3.0/spark/src/test/java/org/apache/iceberg/spark/source/SparkTestTable.java +++ b/spark/v3.0/spark/src/test/java/org/apache/iceberg/spark/source/SparkTestTable.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.source; import org.apache.iceberg.MetadataColumns; diff --git a/spark/v3.0/spark/src/test/java/org/apache/iceberg/spark/source/TestAvroScan.java b/spark/v3.0/spark/src/test/java/org/apache/iceberg/spark/source/TestAvroScan.java index 4d2e12229813..9491adde4605 100644 --- a/spark/v3.0/spark/src/test/java/org/apache/iceberg/spark/source/TestAvroScan.java +++ b/spark/v3.0/spark/src/test/java/org/apache/iceberg/spark/source/TestAvroScan.java @@ -16,9 +16,10 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.source; +import static org.apache.iceberg.Files.localOutput; + import java.io.File; import java.io.IOException; import java.util.List; @@ -46,13 +47,10 @@ import org.junit.Rule; import org.junit.rules.TemporaryFolder; -import static org.apache.iceberg.Files.localOutput; - public class TestAvroScan extends AvroDataTest { private static final Configuration CONF = new Configuration(); - @Rule - public TemporaryFolder temp = new TemporaryFolder(); + @Rule public TemporaryFolder temp = new TemporaryFolder(); private static SparkSession spark = null; @@ -75,8 +73,8 @@ protected void writeAndValidate(Schema schema) throws IOException { File dataFolder = new File(location, "data"); dataFolder.mkdirs(); - File avroFile = new File(dataFolder, - FileFormat.AVRO.addExtension(UUID.randomUUID().toString())); + File avroFile = + new File(dataFolder, FileFormat.AVRO.addExtension(UUID.randomUUID().toString())); HadoopTables tables = new HadoopTables(CONF); Table table = tables.create(schema, PartitionSpec.unpartitioned(), location.toString()); @@ -87,23 +85,21 @@ protected void writeAndValidate(Schema schema) throws IOException { List expected = RandomData.generateList(tableSchema, 100, 1L); - try (FileAppender writer = Avro.write(localOutput(avroFile)) - .schema(tableSchema) - .build()) { + try (FileAppender writer = + Avro.write(localOutput(avroFile)).schema(tableSchema).build()) { writer.addAll(expected); } - DataFile file = DataFiles.builder(PartitionSpec.unpartitioned()) - .withRecordCount(100) - .withFileSizeInBytes(avroFile.length()) - .withPath(avroFile.toString()) - .build(); + DataFile file = + DataFiles.builder(PartitionSpec.unpartitioned()) + .withRecordCount(100) + .withFileSizeInBytes(avroFile.length()) + .withPath(avroFile.toString()) + .build(); table.newAppend().appendFile(file).commit(); - Dataset df = spark.read() - .format("iceberg") - .load(location.toString()); + Dataset df = spark.read().format("iceberg").load(location.toString()); List rows = df.collectAsList(); Assert.assertEquals("Should contain 100 rows", 100, rows.size()); diff --git 
a/spark/v3.0/spark/src/test/java/org/apache/iceberg/spark/source/TestDataFrameWrites.java b/spark/v3.0/spark/src/test/java/org/apache/iceberg/spark/source/TestDataFrameWrites.java index b0a77b72b431..9f32769379c8 100644 --- a/spark/v3.0/spark/src/test/java/org/apache/iceberg/spark/source/TestDataFrameWrites.java +++ b/spark/v3.0/spark/src/test/java/org/apache/iceberg/spark/source/TestDataFrameWrites.java @@ -16,9 +16,12 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.source; +import static org.apache.iceberg.spark.SparkSchemaUtil.convert; +import static org.apache.iceberg.spark.data.TestHelpers.assertEqualsSafe; +import static org.apache.iceberg.spark.data.TestHelpers.assertEqualsUnsafe; + import java.io.File; import java.io.IOException; import java.net.URI; @@ -68,10 +71,6 @@ import org.junit.runner.RunWith; import org.junit.runners.Parameterized; -import static org.apache.iceberg.spark.SparkSchemaUtil.convert; -import static org.apache.iceberg.spark.data.TestHelpers.assertEqualsSafe; -import static org.apache.iceberg.spark.data.TestHelpers.assertEqualsUnsafe; - @RunWith(Parameterized.class) public class TestDataFrameWrites extends AvroDataTest { private static final Configuration CONF = new Configuration(); @@ -80,7 +79,7 @@ public class TestDataFrameWrites extends AvroDataTest { @Parameterized.Parameters(name = "format = {0}") public static Object[] parameters() { - return new Object[] { "parquet", "avro", "orc" }; + return new Object[] {"parquet", "avro", "orc"}; } public TestDataFrameWrites(String format) { @@ -92,32 +91,36 @@ public TestDataFrameWrites(String format) { private Map tableProperties; - private org.apache.spark.sql.types.StructType sparkSchema = new org.apache.spark.sql.types.StructType( - new org.apache.spark.sql.types.StructField[] { - new org.apache.spark.sql.types.StructField( - "optionalField", - org.apache.spark.sql.types.DataTypes.StringType, - true, - org.apache.spark.sql.types.Metadata.empty()), - new org.apache.spark.sql.types.StructField( - "requiredField", - org.apache.spark.sql.types.DataTypes.StringType, - false, - org.apache.spark.sql.types.Metadata.empty()) - }); - - private Schema icebergSchema = new Schema( - Types.NestedField.optional(1, "optionalField", Types.StringType.get()), - Types.NestedField.required(2, "requiredField", Types.StringType.get())); - - private List data0 = Arrays.asList( - "{\"optionalField\": \"a1\", \"requiredField\": \"bid_001\"}", - "{\"optionalField\": \"a2\", \"requiredField\": \"bid_002\"}"); - private List data1 = Arrays.asList( - "{\"optionalField\": \"d1\", \"requiredField\": \"bid_101\"}", - "{\"optionalField\": \"d2\", \"requiredField\": \"bid_102\"}", - "{\"optionalField\": \"d3\", \"requiredField\": \"bid_103\"}", - "{\"optionalField\": \"d4\", \"requiredField\": \"bid_104\"}"); + private org.apache.spark.sql.types.StructType sparkSchema = + new org.apache.spark.sql.types.StructType( + new org.apache.spark.sql.types.StructField[] { + new org.apache.spark.sql.types.StructField( + "optionalField", + org.apache.spark.sql.types.DataTypes.StringType, + true, + org.apache.spark.sql.types.Metadata.empty()), + new org.apache.spark.sql.types.StructField( + "requiredField", + org.apache.spark.sql.types.DataTypes.StringType, + false, + org.apache.spark.sql.types.Metadata.empty()) + }); + + private Schema icebergSchema = + new Schema( + Types.NestedField.optional(1, "optionalField", Types.StringType.get()), + Types.NestedField.required(2, "requiredField", 
Types.StringType.get())); + + private List data0 = + Arrays.asList( + "{\"optionalField\": \"a1\", \"requiredField\": \"bid_001\"}", + "{\"optionalField\": \"a2\", \"requiredField\": \"bid_002\"}"); + private List data1 = + Arrays.asList( + "{\"optionalField\": \"d1\", \"requiredField\": \"bid_101\"}", + "{\"optionalField\": \"d2\", \"requiredField\": \"bid_102\"}", + "{\"optionalField\": \"d3\", \"requiredField\": \"bid_103\"}", + "{\"optionalField\": \"d4\", \"requiredField\": \"bid_104\"}"); @BeforeClass public static void startSpark() { @@ -145,8 +148,10 @@ public void testWriteWithCustomDataLocation() throws IOException { File location = createTableFolder(); File tablePropertyDataLocation = temp.newFolder("test-table-property-data-dir"); Table table = createTable(new Schema(SUPPORTED_PRIMITIVES.fields()), location); - table.updateProperties().set( - TableProperties.WRITE_DATA_LOCATION, tablePropertyDataLocation.getAbsolutePath()).commit(); + table + .updateProperties() + .set(TableProperties.WRITE_DATA_LOCATION, tablePropertyDataLocation.getAbsolutePath()) + .commit(); writeAndValidateWithLocations(table, location, tablePropertyDataLocation); } @@ -162,7 +167,8 @@ private Table createTable(Schema schema, File location) { return tables.create(schema, PartitionSpec.unpartitioned(), location.toString()); } - private void writeAndValidateWithLocations(Table table, File location, File expectedDataDir) throws IOException { + private void writeAndValidateWithLocations(Table table, File location, File expectedDataDir) + throws IOException { Schema tableSchema = table.schema(); // use the table schema because ids are reassigned table.updateProperties().set(TableProperties.DEFAULT_FILE_FORMAT, format).commit(); @@ -179,47 +185,56 @@ private void writeAndValidateWithLocations(Table table, File location, File expe while (expectedIter.hasNext() && actualIter.hasNext()) { assertEqualsSafe(tableSchema.asStruct(), expectedIter.next(), actualIter.next()); } - Assert.assertEquals("Both iterators should be exhausted", expectedIter.hasNext(), actualIter.hasNext()); - - table.currentSnapshot().addedDataFiles(table.io()).forEach(dataFile -> - Assert.assertTrue( - String.format( - "File should have the parent directory %s, but has: %s.", - expectedDataDir.getAbsolutePath(), - dataFile.path()), - URI.create(dataFile.path().toString()).getPath().startsWith(expectedDataDir.getAbsolutePath()))); + Assert.assertEquals( + "Both iterators should be exhausted", expectedIter.hasNext(), actualIter.hasNext()); + + table + .currentSnapshot() + .addedDataFiles(table.io()) + .forEach( + dataFile -> + Assert.assertTrue( + String.format( + "File should have the parent directory %s, but has: %s.", + expectedDataDir.getAbsolutePath(), dataFile.path()), + URI.create(dataFile.path().toString()) + .getPath() + .startsWith(expectedDataDir.getAbsolutePath()))); } private List readTable(String location) { - Dataset result = spark.read() - .format("iceberg") - .load(location); + Dataset result = spark.read().format("iceberg").load(location); return result.collectAsList(); } - private void writeData(Iterable records, Schema schema, String location) throws IOException { + private void writeData(Iterable records, Schema schema, String location) + throws IOException { Dataset df = createDataset(records, schema); DataFrameWriter writer = df.write().format("iceberg").mode("append"); writer.save(location); } - private void writeDataWithFailOnPartition(Iterable records, Schema schema, String location) - throws IOException, SparkException { 
+ private void writeDataWithFailOnPartition( + Iterable records, Schema schema, String location) throws IOException, SparkException { final int numPartitions = 10; final int partitionToFail = new Random().nextInt(numPartitions); - MapPartitionsFunction failOnFirstPartitionFunc = (MapPartitionsFunction) input -> { - int partitionId = TaskContext.getPartitionId(); - - if (partitionId == partitionToFail) { - throw new SparkException(String.format("Intended exception in partition %d !", partitionId)); - } - return input; - }; - - Dataset df = createDataset(records, schema) - .repartition(numPartitions) - .mapPartitions(failOnFirstPartitionFunc, RowEncoder.apply(convert(schema))); + MapPartitionsFunction failOnFirstPartitionFunc = + (MapPartitionsFunction) + input -> { + int partitionId = TaskContext.getPartitionId(); + + if (partitionId == partitionToFail) { + throw new SparkException( + String.format("Intended exception in partition %d !", partitionId)); + } + return input; + }; + + Dataset df = + createDataset(records, schema) + .repartition(numPartitions) + .mapPartitions(failOnFirstPartitionFunc, RowEncoder.apply(convert(schema))); // This trick is needed because Spark 3 handles decimal overflow in RowEncoder which "changes" // nullability of the column to "true" regardless of original nullability. // Setting "check-nullability" option to "false" doesn't help as it fails at Spark analyzer. @@ -234,10 +249,8 @@ private Dataset createDataset(Iterable records, Schema schema) thro File testFile = temp.newFile(); Assert.assertTrue("Delete should succeed", testFile.delete()); - try (FileAppender writer = Avro.write(Files.localOutput(testFile)) - .schema(schema) - .named("test") - .build()) { + try (FileAppender writer = + Avro.write(Files.localOutput(testFile)).schema(schema).named("test").build()) { for (Record rec : records) { writer.add(rec); } @@ -245,10 +258,11 @@ private Dataset createDataset(Iterable records, Schema schema) thro // make sure the dataframe matches the records before moving on List rows = Lists.newArrayList(); - try (AvroIterable reader = Avro.read(Files.localInput(testFile)) - .createReaderFunc(SparkAvroReader::new) - .project(schema) - .build()) { + try (AvroIterable reader = + Avro.read(Files.localInput(testFile)) + .createReaderFunc(SparkAvroReader::new) + .project(schema) + .build()) { Iterator recordIter = records.iterator(); Iterator readIter = reader.iterator(); @@ -257,7 +271,8 @@ private Dataset createDataset(Iterable records, Schema schema) thro assertEqualsUnsafe(schema.asStruct(), recordIter.next(), row); rows.add(row); } - Assert.assertEquals("Both iterators should be exhausted", recordIter.hasNext(), readIter.hasNext()); + Assert.assertEquals( + "Both iterators should be exhausted", recordIter.hasNext(), readIter.hasNext()); } JavaRDD rdd = sc.parallelize(rows); @@ -266,7 +281,8 @@ private Dataset createDataset(Iterable records, Schema schema) thro @Test public void testNullableWithWriteOption() throws IOException { - Assume.assumeTrue("Spark 3.0 rejects writing nulls to a required column", spark.version().startsWith("2")); + Assume.assumeTrue( + "Spark 3.0 rejects writing nulls to a required column", spark.version().startsWith("2")); File location = new File(temp.newFolder("parquet"), "test"); String sourcePath = String.format("%s/nullable_poc/sourceFolder/", location.toString()); @@ -276,9 +292,11 @@ public void testNullableWithWriteOption() throws IOException { // read this and append to iceberg dataset spark - .read().schema(sparkSchema).json( - 
JavaSparkContext.fromSparkContext(spark.sparkContext()).parallelize(data1)) - .write().parquet(sourcePath); + .read() + .schema(sparkSchema) + .json(JavaSparkContext.fromSparkContext(spark.sparkContext()).parallelize(data1)) + .write() + .parquet(sourcePath); // this is our iceberg dataset to which we will append data new HadoopTables(spark.sessionState().newHadoopConf()) @@ -290,15 +308,24 @@ public void testNullableWithWriteOption() throws IOException { // this is the initial data inside the iceberg dataset spark - .read().schema(sparkSchema).json( - JavaSparkContext.fromSparkContext(spark.sparkContext()).parallelize(data0)) - .write().format("iceberg").mode(SaveMode.Append).save(targetPath); + .read() + .schema(sparkSchema) + .json(JavaSparkContext.fromSparkContext(spark.sparkContext()).parallelize(data0)) + .write() + .format("iceberg") + .mode(SaveMode.Append) + .save(targetPath); // read from parquet and append to iceberg w/ nullability check disabled spark - .read().schema(SparkSchemaUtil.convert(icebergSchema)).parquet(sourcePath) - .write().format("iceberg").option(SparkWriteOptions.CHECK_NULLABILITY, false) - .mode(SaveMode.Append).save(targetPath); + .read() + .schema(SparkSchemaUtil.convert(icebergSchema)) + .parquet(sourcePath) + .write() + .format("iceberg") + .option(SparkWriteOptions.CHECK_NULLABILITY, false) + .mode(SaveMode.Append) + .save(targetPath); // read all data List rows = spark.read().format("iceberg").load(targetPath).collectAsList(); @@ -307,7 +334,8 @@ public void testNullableWithWriteOption() throws IOException { @Test public void testNullableWithSparkSqlOption() throws IOException { - Assume.assumeTrue("Spark 3.0 rejects writing nulls to a required column", spark.version().startsWith("2")); + Assume.assumeTrue( + "Spark 3.0 rejects writing nulls to a required column", spark.version().startsWith("2")); File location = new File(temp.newFolder("parquet"), "test"); String sourcePath = String.format("%s/nullable_poc/sourceFolder/", location.toString()); @@ -317,15 +345,18 @@ public void testNullableWithSparkSqlOption() throws IOException { // read this and append to iceberg dataset spark - .read().schema(sparkSchema).json( - JavaSparkContext.fromSparkContext(spark.sparkContext()).parallelize(data1)) - .write().parquet(sourcePath); - - SparkSession newSparkSession = SparkSession.builder() - .master("local[2]") - .appName("NullableTest") - .config(SparkSQLProperties.CHECK_NULLABILITY, false) - .getOrCreate(); + .read() + .schema(sparkSchema) + .json(JavaSparkContext.fromSparkContext(spark.sparkContext()).parallelize(data1)) + .write() + .parquet(sourcePath); + + SparkSession newSparkSession = + SparkSession.builder() + .master("local[2]") + .appName("NullableTest") + .config(SparkSQLProperties.CHECK_NULLABILITY, false) + .getOrCreate(); // this is our iceberg dataset to which we will append data new HadoopTables(newSparkSession.sessionState().newHadoopConf()) @@ -337,19 +368,27 @@ public void testNullableWithSparkSqlOption() throws IOException { // this is the initial data inside the iceberg dataset newSparkSession - .read().schema(sparkSchema).json( - JavaSparkContext.fromSparkContext(spark.sparkContext()).parallelize(data0)) - .write().format("iceberg").mode(SaveMode.Append).save(targetPath); + .read() + .schema(sparkSchema) + .json(JavaSparkContext.fromSparkContext(spark.sparkContext()).parallelize(data0)) + .write() + .format("iceberg") + .mode(SaveMode.Append) + .save(targetPath); // read from parquet and append to iceberg newSparkSession - 
.read().schema(SparkSchemaUtil.convert(icebergSchema)).parquet(sourcePath) - .write().format("iceberg").mode(SaveMode.Append).save(targetPath); + .read() + .schema(SparkSchemaUtil.convert(icebergSchema)) + .parquet(sourcePath) + .write() + .format("iceberg") + .mode(SaveMode.Append) + .save(targetPath); // read all data List rows = newSparkSession.read().format("iceberg").load(targetPath).collectAsList(); Assert.assertEquals("Should contain 6 rows", 6, rows.size()); - } @Test diff --git a/spark/v3.0/spark/src/test/java/org/apache/iceberg/spark/source/TestDataSourceOptions.java b/spark/v3.0/spark/src/test/java/org/apache/iceberg/spark/source/TestDataSourceOptions.java index ffcb86052074..750474564078 100644 --- a/spark/v3.0/spark/src/test/java/org/apache/iceberg/spark/source/TestDataSourceOptions.java +++ b/spark/v3.0/spark/src/test/java/org/apache/iceberg/spark/source/TestDataSourceOptions.java @@ -16,9 +16,10 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.source; +import static org.apache.iceberg.types.Types.NestedField.optional; + import java.io.IOException; import java.math.RoundingMode; import java.util.List; @@ -58,19 +59,15 @@ import org.junit.Test; import org.junit.rules.TemporaryFolder; -import static org.apache.iceberg.types.Types.NestedField.optional; - public class TestDataSourceOptions { private static final Configuration CONF = new Configuration(); - private static final Schema SCHEMA = new Schema( - optional(1, "id", Types.IntegerType.get()), - optional(2, "data", Types.StringType.get()) - ); + private static final Schema SCHEMA = + new Schema( + optional(1, "id", Types.IntegerType.get()), optional(2, "data", Types.StringType.get())); private static SparkSession spark = null; - @Rule - public TemporaryFolder temp = new TemporaryFolder(); + @Rule public TemporaryFolder temp = new TemporaryFolder(); @BeforeClass public static void startSpark() { @@ -94,23 +91,23 @@ public void testWriteFormatOptionOverridesTableProperties() throws IOException { options.put(TableProperties.DEFAULT_FILE_FORMAT, "avro"); Table table = tables.create(SCHEMA, spec, options, tableLocation); - List expectedRecords = Lists.newArrayList( - new SimpleRecord(1, "a"), - new SimpleRecord(2, "b"), - new SimpleRecord(3, "c") - ); + List expectedRecords = + Lists.newArrayList( + new SimpleRecord(1, "a"), new SimpleRecord(2, "b"), new SimpleRecord(3, "c")); Dataset df = spark.createDataFrame(expectedRecords, SimpleRecord.class); - df.select("id", "data").write() + df.select("id", "data") + .write() .format("iceberg") .option(SparkWriteOptions.WRITE_FORMAT, "parquet") .mode(SaveMode.Append) .save(tableLocation); try (CloseableIterable tasks = table.newScan().planFiles()) { - tasks.forEach(task -> { - FileFormat fileFormat = FileFormat.fromFileName(task.file().path()); - Assert.assertEquals(FileFormat.PARQUET, fileFormat); - }); + tasks.forEach( + task -> { + FileFormat fileFormat = FileFormat.fromFileName(task.file().path()); + Assert.assertEquals(FileFormat.PARQUET, fileFormat); + }); } } @@ -124,22 +121,18 @@ public void testNoWriteFormatOption() throws IOException { options.put(TableProperties.DEFAULT_FILE_FORMAT, "avro"); Table table = tables.create(SCHEMA, spec, options, tableLocation); - List expectedRecords = Lists.newArrayList( - new SimpleRecord(1, "a"), - new SimpleRecord(2, "b"), - new SimpleRecord(3, "c") - ); + List expectedRecords = + Lists.newArrayList( + new SimpleRecord(1, "a"), new SimpleRecord(2, "b"), new SimpleRecord(3, 
"c")); Dataset df = spark.createDataFrame(expectedRecords, SimpleRecord.class); - df.select("id", "data").write() - .format("iceberg") - .mode("append") - .save(tableLocation); + df.select("id", "data").write().format("iceberg").mode("append").save(tableLocation); try (CloseableIterable tasks = table.newScan().planFiles()) { - tasks.forEach(task -> { - FileFormat fileFormat = FileFormat.fromFileName(task.file().path()); - Assert.assertEquals(FileFormat.AVRO, fileFormat); - }); + tasks.forEach( + task -> { + FileFormat fileFormat = FileFormat.fromFileName(task.file().path()); + Assert.assertEquals(FileFormat.AVRO, fileFormat); + }); } } @@ -159,24 +152,25 @@ public void testHadoopOptions() throws IOException { // to verify that 'hadoop.' data source options are propagated correctly sparkHadoopConf.set("fs.default.name", "hdfs://localhost:9000"); - List expectedRecords = Lists.newArrayList( - new SimpleRecord(1, "a"), - new SimpleRecord(2, "b") - ); + List expectedRecords = + Lists.newArrayList(new SimpleRecord(1, "a"), new SimpleRecord(2, "b")); Dataset originalDf = spark.createDataFrame(expectedRecords, SimpleRecord.class); - originalDf.select("id", "data").write() + originalDf + .select("id", "data") + .write() .format("iceberg") .mode("append") .option("hadoop.fs.default.name", "file:///") .save(tableLocation); - Dataset resultDf = spark.read() - .format("iceberg") - .option("hadoop.fs.default.name", "file:///") - .load(tableLocation); - List resultRecords = resultDf.orderBy("id") - .as(Encoders.bean(SimpleRecord.class)) - .collectAsList(); + Dataset resultDf = + spark + .read() + .format("iceberg") + .option("hadoop.fs.default.name", "file:///") + .load(tableLocation); + List resultRecords = + resultDf.orderBy("id").as(Encoders.bean(SimpleRecord.class)).collectAsList(); Assert.assertEquals("Records should match", expectedRecords, resultRecords); } finally { @@ -192,31 +186,35 @@ public void testSplitOptionsOverridesTableProperties() throws IOException { PartitionSpec spec = PartitionSpec.unpartitioned(); Map options = Maps.newHashMap(); options.put(TableProperties.SPLIT_SIZE, String.valueOf(128L * 1024 * 1024)); // 128Mb - options.put(TableProperties.DEFAULT_FILE_FORMAT, String.valueOf(FileFormat.AVRO)); // Arbitrarily splittable + options.put( + TableProperties.DEFAULT_FILE_FORMAT, + String.valueOf(FileFormat.AVRO)); // Arbitrarily splittable Table icebergTable = tables.create(SCHEMA, spec, options, tableLocation); - List expectedRecords = Lists.newArrayList( - new SimpleRecord(1, "a"), - new SimpleRecord(2, "b") - ); + List expectedRecords = + Lists.newArrayList(new SimpleRecord(1, "a"), new SimpleRecord(2, "b")); Dataset originalDf = spark.createDataFrame(expectedRecords, SimpleRecord.class); - originalDf.select("id", "data") + originalDf + .select("id", "data") .repartition(1) .write() .format("iceberg") .mode("append") .save(tableLocation); - List files = Lists.newArrayList(icebergTable.currentSnapshot().addedDataFiles(icebergTable.io())); + List files = + Lists.newArrayList(icebergTable.currentSnapshot().addedDataFiles(icebergTable.io())); Assert.assertEquals("Should have written 1 file", 1, files.size()); long fileSize = files.get(0).fileSizeInBytes(); long splitSize = LongMath.divide(fileSize, 2, RoundingMode.CEILING); - Dataset resultDf = spark.read() - .format("iceberg") - .option(SparkReadOptions.SPLIT_SIZE, String.valueOf(splitSize)) - .load(tableLocation); + Dataset resultDf = + spark + .read() + .format("iceberg") + .option(SparkReadOptions.SPLIT_SIZE, 
String.valueOf(splitSize)) + .load(tableLocation); Assert.assertEquals("Spark partitions should match", 2, resultDf.javaRDD().getNumPartitions()); } @@ -230,18 +228,16 @@ public void testIncrementalScanOptions() throws IOException { Map options = Maps.newHashMap(); Table table = tables.create(SCHEMA, spec, options, tableLocation); - List expectedRecords = Lists.newArrayList( - new SimpleRecord(1, "a"), - new SimpleRecord(2, "b"), - new SimpleRecord(3, "c"), - new SimpleRecord(4, "d") - ); + List expectedRecords = + Lists.newArrayList( + new SimpleRecord(1, "a"), + new SimpleRecord(2, "b"), + new SimpleRecord(3, "c"), + new SimpleRecord(4, "d")); for (SimpleRecord record : expectedRecords) { - Dataset originalDf = spark.createDataFrame(Lists.newArrayList(record), SimpleRecord.class); - originalDf.select("id", "data").write() - .format("iceberg") - .mode("append") - .save(tableLocation); + Dataset originalDf = + spark.createDataFrame(Lists.newArrayList(record), SimpleRecord.class); + originalDf.select("id", "data").write().format("iceberg").mode("append").save(tableLocation); } List snapshotIds = SnapshotUtil.currentAncestorIds(table); @@ -251,11 +247,13 @@ public void testIncrementalScanOptions() throws IOException { IllegalArgumentException.class, "Cannot specify start-snapshot-id and end-snapshot-id to do incremental scan", () -> { - spark.read() + spark + .read() .format("iceberg") .option("snapshot-id", snapshotIds.get(3).toString()) .option("start-snapshot-id", snapshotIds.get(3).toString()) - .load(tableLocation).explain(); + .load(tableLocation) + .explain(); }); // end-snapshot-id and as-of-timestamp are both configured. @@ -264,12 +262,15 @@ public void testIncrementalScanOptions() throws IOException { IllegalArgumentException.class, "Cannot specify start-snapshot-id and end-snapshot-id to do incremental scan", () -> { - spark.read() + spark + .read() .format("iceberg") - .option(SparkReadOptions.AS_OF_TIMESTAMP, + .option( + SparkReadOptions.AS_OF_TIMESTAMP, Long.toString(table.snapshot(snapshotIds.get(3)).timestampMillis())) .option("end-snapshot-id", snapshotIds.get(2).toString()) - .load(tableLocation).explain(); + .load(tableLocation) + .explain(); }); // only end-snapshot-id is configured. @@ -278,31 +279,37 @@ public void testIncrementalScanOptions() throws IOException { IllegalArgumentException.class, "Cannot only specify option end-snapshot-id to do incremental scan", () -> { - spark.read() + spark + .read() .format("iceberg") .option("end-snapshot-id", snapshotIds.get(2).toString()) - .load(tableLocation).explain(); + .load(tableLocation) + .explain(); }); // test (1st snapshot, current snapshot] incremental scan. - List result = spark.read() - .format("iceberg") - .option("start-snapshot-id", snapshotIds.get(3).toString()) - .load(tableLocation) - .orderBy("id") - .as(Encoders.bean(SimpleRecord.class)) - .collectAsList(); + List result = + spark + .read() + .format("iceberg") + .option("start-snapshot-id", snapshotIds.get(3).toString()) + .load(tableLocation) + .orderBy("id") + .as(Encoders.bean(SimpleRecord.class)) + .collectAsList(); Assert.assertEquals("Records should match", expectedRecords.subList(1, 4), result); // test (2nd snapshot, 3rd snapshot] incremental scan. 
- List result1 = spark.read() - .format("iceberg") - .option("start-snapshot-id", snapshotIds.get(2).toString()) - .option("end-snapshot-id", snapshotIds.get(1).toString()) - .load(tableLocation) - .orderBy("id") - .as(Encoders.bean(SimpleRecord.class)) - .collectAsList(); + List result1 = + spark + .read() + .format("iceberg") + .option("start-snapshot-id", snapshotIds.get(2).toString()) + .option("end-snapshot-id", snapshotIds.get(1).toString()) + .load(tableLocation) + .orderBy("id") + .as(Encoders.bean(SimpleRecord.class)) + .collectAsList(); Assert.assertEquals("Records should match", expectedRecords.subList(2, 3), result1); } @@ -315,41 +322,34 @@ public void testMetadataSplitSizeOptionOverrideTableProperties() throws IOExcept Map options = Maps.newHashMap(); Table table = tables.create(SCHEMA, spec, options, tableLocation); - List expectedRecords = Lists.newArrayList( - new SimpleRecord(1, "a"), - new SimpleRecord(2, "b") - ); + List expectedRecords = + Lists.newArrayList(new SimpleRecord(1, "a"), new SimpleRecord(2, "b")); Dataset originalDf = spark.createDataFrame(expectedRecords, SimpleRecord.class); // produce 1st manifest - originalDf.select("id", "data").write() - .format("iceberg") - .mode("append") - .save(tableLocation); + originalDf.select("id", "data").write().format("iceberg").mode("append").save(tableLocation); // produce 2nd manifest - originalDf.select("id", "data").write() - .format("iceberg") - .mode("append") - .save(tableLocation); + originalDf.select("id", "data").write().format("iceberg").mode("append").save(tableLocation); List manifests = table.currentSnapshot().allManifests(table.io()); Assert.assertEquals("Must be 2 manifests", 2, manifests.size()); // set the target metadata split size so each manifest ends up in a separate split - table.updateProperties() + table + .updateProperties() .set(TableProperties.METADATA_SPLIT_SIZE, String.valueOf(manifests.get(0).length())) .commit(); - Dataset entriesDf = spark.read() - .format("iceberg") - .load(tableLocation + "#entries"); + Dataset entriesDf = spark.read().format("iceberg").load(tableLocation + "#entries"); Assert.assertEquals("Num partitions must match", 2, entriesDf.javaRDD().getNumPartitions()); // override the table property using options - entriesDf = spark.read() - .format("iceberg") - .option(SparkReadOptions.SPLIT_SIZE, String.valueOf(128 * 1024 * 1024)) - .load(tableLocation + "#entries"); + entriesDf = + spark + .read() + .format("iceberg") + .option(SparkReadOptions.SPLIT_SIZE, String.valueOf(128 * 1024 * 1024)) + .load(tableLocation + "#entries"); Assert.assertEquals("Num partitions must match", 1, entriesDf.javaRDD().getNumPartitions()); } @@ -362,24 +362,26 @@ public void testDefaultMetadataSplitSize() throws IOException { Map options = Maps.newHashMap(); Table icebergTable = tables.create(SCHEMA, spec, options, tableLocation); - List expectedRecords = Lists.newArrayList( - new SimpleRecord(1, "a"), - new SimpleRecord(2, "b") - ); + List expectedRecords = + Lists.newArrayList(new SimpleRecord(1, "a"), new SimpleRecord(2, "b")); Dataset originalDf = spark.createDataFrame(expectedRecords, SimpleRecord.class); - originalDf.select("id", "data").write() - .format("iceberg") - .mode("append") - .save(tableLocation); + originalDf.select("id", "data").write().format("iceberg").mode("append").save(tableLocation); int splitSize = (int) TableProperties.METADATA_SPLIT_SIZE_DEFAULT; // 32MB split size - int expectedSplits = ((int) tables.load(tableLocation + "#entries") - 
.currentSnapshot().allManifests(icebergTable.io()).get(0).length() + splitSize - 1) / splitSize; + int expectedSplits = + ((int) + tables + .load(tableLocation + "#entries") + .currentSnapshot() + .allManifests(icebergTable.io()) + .get(0) + .length() + + splitSize + - 1) + / splitSize; - Dataset metadataDf = spark.read() - .format("iceberg") - .load(tableLocation + "#entries"); + Dataset metadataDf = spark.read().format("iceberg").load(tableLocation + "#entries"); int partitionNum = metadataDf.javaRDD().getNumPartitions(); Assert.assertEquals("Spark partitions should match", expectedSplits, partitionNum); @@ -391,17 +393,17 @@ public void testExtraSnapshotMetadata() throws IOException { HadoopTables tables = new HadoopTables(CONF); tables.create(SCHEMA, PartitionSpec.unpartitioned(), Maps.newHashMap(), tableLocation); - List expectedRecords = Lists.newArrayList( - new SimpleRecord(1, "a"), - new SimpleRecord(2, "b") - ); + List expectedRecords = + Lists.newArrayList(new SimpleRecord(1, "a"), new SimpleRecord(2, "b")); Dataset originalDf = spark.createDataFrame(expectedRecords, SimpleRecord.class); - originalDf.select("id", "data").write() - .format("iceberg") - .mode("append") - .option(SparkWriteOptions.SNAPSHOT_PROPERTY_PREFIX + ".extra-key", "someValue") - .option(SparkWriteOptions.SNAPSHOT_PROPERTY_PREFIX + ".another-key", "anotherValue") - .save(tableLocation); + originalDf + .select("id", "data") + .write() + .format("iceberg") + .mode("append") + .option(SparkWriteOptions.SNAPSHOT_PROPERTY_PREFIX + ".extra-key", "someValue") + .option(SparkWriteOptions.SNAPSHOT_PROPERTY_PREFIX + ".another-key", "anotherValue") + .save(tableLocation); Table table = tables.load(tableLocation); @@ -414,26 +416,27 @@ public void testExtraSnapshotMetadataWithSQL() throws InterruptedException, IOEx String tableLocation = temp.newFolder("iceberg-table").toString(); HadoopTables tables = new HadoopTables(CONF); - Table table = tables.create(SCHEMA, PartitionSpec.unpartitioned(), Maps.newHashMap(), tableLocation); + Table table = + tables.create(SCHEMA, PartitionSpec.unpartitioned(), Maps.newHashMap(), tableLocation); - List expectedRecords = Lists.newArrayList( - new SimpleRecord(1, "a"), - new SimpleRecord(2, "b") - ); + List expectedRecords = + Lists.newArrayList(new SimpleRecord(1, "a"), new SimpleRecord(2, "b")); Dataset originalDf = spark.createDataFrame(expectedRecords, SimpleRecord.class); - originalDf.select("id", "data").write() - .format("iceberg") - .mode("append") - .save(tableLocation); + originalDf.select("id", "data").write().format("iceberg").mode("append").save(tableLocation); spark.read().format("iceberg").load(tableLocation).createOrReplaceTempView("target"); - Thread writerThread = new Thread(() -> { - Map properties = Maps.newHashMap(); - properties.put("writer-thread", String.valueOf(Thread.currentThread().getName())); - CommitMetadata.withCommitProperties(properties, () -> { - spark.sql("INSERT INTO target VALUES (3, 'c'), (4, 'd')"); - return 0; - }, RuntimeException.class); - }); + Thread writerThread = + new Thread( + () -> { + Map properties = Maps.newHashMap(); + properties.put("writer-thread", String.valueOf(Thread.currentThread().getName())); + CommitMetadata.withCommitProperties( + properties, + () -> { + spark.sql("INSERT INTO target VALUES (3, 'c'), (4, 'd')"); + return 0; + }, + RuntimeException.class); + }); writerThread.setName("test-extra-commit-message-writer-thread"); writerThread.start(); writerThread.join(); diff --git 
a/spark/v3.0/spark/src/test/java/org/apache/iceberg/spark/source/TestFilteredScan.java b/spark/v3.0/spark/src/test/java/org/apache/iceberg/spark/source/TestFilteredScan.java index d51fd3c4e8eb..b30bbf145f23 100644 --- a/spark/v3.0/spark/src/test/java/org/apache/iceberg/spark/source/TestFilteredScan.java +++ b/spark/v3.0/spark/src/test/java/org/apache/iceberg/spark/source/TestFilteredScan.java @@ -16,9 +16,13 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.source; +import static org.apache.iceberg.Files.localOutput; +import static org.apache.spark.sql.catalyst.util.DateTimeUtils.fromJavaTimestamp; +import static org.apache.spark.sql.functions.callUDF; +import static org.apache.spark.sql.functions.column; + import java.io.File; import java.io.IOException; import java.sql.Timestamp; @@ -79,41 +83,31 @@ import org.junit.runner.RunWith; import org.junit.runners.Parameterized; -import static org.apache.iceberg.Files.localOutput; -import static org.apache.spark.sql.catalyst.util.DateTimeUtils.fromJavaTimestamp; -import static org.apache.spark.sql.functions.callUDF; -import static org.apache.spark.sql.functions.column; - @RunWith(Parameterized.class) public class TestFilteredScan { private static final Configuration CONF = new Configuration(); private static final HadoopTables TABLES = new HadoopTables(CONF); - private static final Schema SCHEMA = new Schema( - Types.NestedField.required(1, "id", Types.LongType.get()), - Types.NestedField.optional(2, "ts", Types.TimestampType.withZone()), - Types.NestedField.optional(3, "data", Types.StringType.get()) - ); + private static final Schema SCHEMA = + new Schema( + Types.NestedField.required(1, "id", Types.LongType.get()), + Types.NestedField.optional(2, "ts", Types.TimestampType.withZone()), + Types.NestedField.optional(3, "data", Types.StringType.get())); - private static final PartitionSpec BUCKET_BY_ID = PartitionSpec.builderFor(SCHEMA) - .bucket("id", 4) - .build(); + private static final PartitionSpec BUCKET_BY_ID = + PartitionSpec.builderFor(SCHEMA).bucket("id", 4).build(); - private static final PartitionSpec PARTITION_BY_DAY = PartitionSpec.builderFor(SCHEMA) - .day("ts") - .build(); + private static final PartitionSpec PARTITION_BY_DAY = + PartitionSpec.builderFor(SCHEMA).day("ts").build(); - private static final PartitionSpec PARTITION_BY_HOUR = PartitionSpec.builderFor(SCHEMA) - .hour("ts") - .build(); + private static final PartitionSpec PARTITION_BY_HOUR = + PartitionSpec.builderFor(SCHEMA).hour("ts").build(); - private static final PartitionSpec PARTITION_BY_DATA = PartitionSpec.builderFor(SCHEMA) - .identity("data") - .build(); + private static final PartitionSpec PARTITION_BY_DATA = + PartitionSpec.builderFor(SCHEMA).identity("data").build(); - private static final PartitionSpec PARTITION_BY_ID = PartitionSpec.builderFor(SCHEMA) - .identity("id") - .build(); + private static final PartitionSpec PARTITION_BY_ID = + PartitionSpec.builderFor(SCHEMA).identity("id").build(); private static SparkSession spark = null; @@ -126,14 +120,20 @@ public static void startSpark() { spark.udf().register("bucket4", (UDF1) bucket4::apply, IntegerType$.MODULE$); Transform day = Transforms.day(Types.TimestampType.withZone()); - spark.udf().register("ts_day", - (UDF1) timestamp -> day.apply((Long) fromJavaTimestamp(timestamp)), - IntegerType$.MODULE$); + spark + .udf() + .register( + "ts_day", + (UDF1) timestamp -> day.apply((Long) fromJavaTimestamp(timestamp)), + IntegerType$.MODULE$); 
Transform hour = Transforms.hour(Types.TimestampType.withZone()); - spark.udf().register("ts_hour", - (UDF1) timestamp -> hour.apply((Long) fromJavaTimestamp(timestamp)), - IntegerType$.MODULE$); + spark + .udf() + .register( + "ts_hour", + (UDF1) timestamp -> hour.apply((Long) fromJavaTimestamp(timestamp)), + IntegerType$.MODULE$); spark.udf().register("data_ident", (UDF1) data -> data, StringType$.MODULE$); spark.udf().register("id_ident", (UDF1) id -> id, LongType$.MODULE$); @@ -146,8 +146,7 @@ public static void stopSpark() { currentSpark.stop(); } - @Rule - public TemporaryFolder temp = new TemporaryFolder(); + @Rule public TemporaryFolder temp = new TemporaryFolder(); private final String format; private final boolean vectorized; @@ -155,11 +154,11 @@ public static void stopSpark() { @Parameterized.Parameters(name = "format = {0}, vectorized = {1}") public static Object[][] parameters() { return new Object[][] { - { "parquet", false }, - { "parquet", true }, - { "avro", false }, - { "orc", false }, - { "orc", true } + {"parquet", false}, + {"parquet", true}, + {"avro", false}, + {"orc", false}, + {"orc", true} }; } @@ -188,26 +187,27 @@ public void writeUnpartitionedTable() throws IOException { this.records = testRecords(tableSchema); - try (FileAppender writer = new GenericAppenderFactory(tableSchema).newAppender( - localOutput(testFile), fileFormat)) { + try (FileAppender writer = + new GenericAppenderFactory(tableSchema).newAppender(localOutput(testFile), fileFormat)) { writer.addAll(records); } - DataFile file = DataFiles.builder(PartitionSpec.unpartitioned()) - .withRecordCount(records.size()) - .withFileSizeInBytes(testFile.length()) - .withPath(testFile.toString()) - .build(); + DataFile file = + DataFiles.builder(PartitionSpec.unpartitioned()) + .withRecordCount(records.size()) + .withFileSizeInBytes(testFile.length()) + .withPath(testFile.toString()) + .build(); table.newAppend().appendFile(file).commit(); } @Test public void testUnpartitionedIDFilters() { - CaseInsensitiveStringMap options = new CaseInsensitiveStringMap(ImmutableMap.of( - "path", unpartitioned.toString()) - ); - SparkScanBuilder builder = new SparkScanBuilder(spark, TABLES.load(options.get("path")), options); + CaseInsensitiveStringMap options = + new CaseInsensitiveStringMap(ImmutableMap.of("path", unpartitioned.toString())); + SparkScanBuilder builder = + new SparkScanBuilder(spark, TABLES.load(options.get("path")), options); for (int i = 0; i < 10; i += 1) { pushFilters(builder, EqualTo.apply("id", i)); @@ -217,16 +217,15 @@ public void testUnpartitionedIDFilters() { Assert.assertEquals("Should only create one task for a small file", 1, partitions.length); // validate row filtering - assertEqualsSafe(SCHEMA.asStruct(), expected(i), - read(unpartitioned.toString(), vectorized, "id = " + i)); + assertEqualsSafe( + SCHEMA.asStruct(), expected(i), read(unpartitioned.toString(), vectorized, "id = " + i)); } } @Test public void testUnpartitionedCaseInsensitiveIDFilters() { - CaseInsensitiveStringMap options = new CaseInsensitiveStringMap(ImmutableMap.of( - "path", unpartitioned.toString()) - ); + CaseInsensitiveStringMap options = + new CaseInsensitiveStringMap(ImmutableMap.of("path", unpartitioned.toString())); // set spark.sql.caseSensitive to false String caseSensitivityBeforeTest = TestFilteredScan.spark.conf().get("spark.sql.caseSensitive"); @@ -235,17 +234,22 @@ public void testUnpartitionedCaseInsensitiveIDFilters() { try { for (int i = 0; i < 10; i += 1) { - SparkScanBuilder builder = new 
SparkScanBuilder(spark, TABLES.load(options.get("path")), options) - .caseSensitive(false); + SparkScanBuilder builder = + new SparkScanBuilder(spark, TABLES.load(options.get("path")), options) + .caseSensitive(false); - pushFilters(builder, EqualTo.apply("ID", i)); // note lower(ID) == lower(id), so there must be a match + pushFilters( + builder, + EqualTo.apply("ID", i)); // note lower(ID) == lower(id), so there must be a match Batch scan = builder.build().toBatch(); InputPartition[] tasks = scan.planInputPartitions(); Assert.assertEquals("Should only create one task for a small file", 1, tasks.length); // validate row filtering - assertEqualsSafe(SCHEMA.asStruct(), expected(i), + assertEqualsSafe( + SCHEMA.asStruct(), + expected(i), read(unpartitioned.toString(), vectorized, "id = " + i)); } } finally { @@ -256,11 +260,11 @@ public void testUnpartitionedCaseInsensitiveIDFilters() { @Test public void testUnpartitionedTimestampFilter() { - CaseInsensitiveStringMap options = new CaseInsensitiveStringMap(ImmutableMap.of( - "path", unpartitioned.toString()) - ); + CaseInsensitiveStringMap options = + new CaseInsensitiveStringMap(ImmutableMap.of("path", unpartitioned.toString())); - SparkScanBuilder builder = new SparkScanBuilder(spark, TABLES.load(options.get("path")), options); + SparkScanBuilder builder = + new SparkScanBuilder(spark, TABLES.load(options.get("path")), options); pushFilters(builder, LessThan.apply("ts", "2017-12-22T00:00:00+00:00")); Batch scan = builder.build().toBatch(); @@ -268,21 +272,29 @@ public void testUnpartitionedTimestampFilter() { InputPartition[] tasks = scan.planInputPartitions(); Assert.assertEquals("Should only create one task for a small file", 1, tasks.length); - assertEqualsSafe(SCHEMA.asStruct(), expected(5, 6, 7, 8, 9), - read(unpartitioned.toString(), vectorized, "ts < cast('2017-12-22 00:00:00+00:00' as timestamp)")); + assertEqualsSafe( + SCHEMA.asStruct(), + expected(5, 6, 7, 8, 9), + read( + unpartitioned.toString(), + vectorized, + "ts < cast('2017-12-22 00:00:00+00:00' as timestamp)")); } @Test public void testBucketPartitionedIDFilters() { Table table = buildPartitionedTable("bucketed_by_id", BUCKET_BY_ID, "bucket4", "id"); - CaseInsensitiveStringMap options = new CaseInsensitiveStringMap(ImmutableMap.of("path", table.location())); + CaseInsensitiveStringMap options = + new CaseInsensitiveStringMap(ImmutableMap.of("path", table.location())); - Batch unfiltered = new SparkScanBuilder(spark, TABLES.load(options.get("path")), options).build().toBatch(); - Assert.assertEquals("Unfiltered table should created 4 read tasks", - 4, unfiltered.planInputPartitions().length); + Batch unfiltered = + new SparkScanBuilder(spark, TABLES.load(options.get("path")), options).build().toBatch(); + Assert.assertEquals( + "Unfiltered table should created 4 read tasks", 4, unfiltered.planInputPartitions().length); for (int i = 0; i < 10; i += 1) { - SparkScanBuilder builder = new SparkScanBuilder(spark, TABLES.load(options.get("path")), options); + SparkScanBuilder builder = + new SparkScanBuilder(spark, TABLES.load(options.get("path")), options); pushFilters(builder, EqualTo.apply("id", i)); Batch scan = builder.build().toBatch(); @@ -293,7 +305,8 @@ public void testBucketPartitionedIDFilters() { Assert.assertEquals("Should create one task for a single bucket", 1, tasks.length); // validate row filtering - assertEqualsSafe(SCHEMA.asStruct(), expected(i), read(table.location(), vectorized, "id = " + i)); + assertEqualsSafe( + SCHEMA.asStruct(), expected(i), 
read(table.location(), vectorized, "id = " + i)); } } @@ -301,14 +314,17 @@ public void testBucketPartitionedIDFilters() { @Test public void testDayPartitionedTimestampFilters() { Table table = buildPartitionedTable("partitioned_by_day", PARTITION_BY_DAY, "ts_day", "ts"); - CaseInsensitiveStringMap options = new CaseInsensitiveStringMap(ImmutableMap.of("path", table.location())); - Batch unfiltered = new SparkScanBuilder(spark, TABLES.load(options.get("path")), options).build().toBatch(); + CaseInsensitiveStringMap options = + new CaseInsensitiveStringMap(ImmutableMap.of("path", table.location())); + Batch unfiltered = + new SparkScanBuilder(spark, TABLES.load(options.get("path")), options).build().toBatch(); - Assert.assertEquals("Unfiltered table should created 2 read tasks", - 2, unfiltered.planInputPartitions().length); + Assert.assertEquals( + "Unfiltered table should created 2 read tasks", 2, unfiltered.planInputPartitions().length); { - SparkScanBuilder builder = new SparkScanBuilder(spark, TABLES.load(options.get("path")), options); + SparkScanBuilder builder = + new SparkScanBuilder(spark, TABLES.load(options.get("path")), options); pushFilters(builder, LessThan.apply("ts", "2017-12-22T00:00:00+00:00")); Batch scan = builder.build().toBatch(); @@ -316,24 +332,35 @@ public void testDayPartitionedTimestampFilters() { InputPartition[] tasks = scan.planInputPartitions(); Assert.assertEquals("Should create one task for 2017-12-21", 1, tasks.length); - assertEqualsSafe(SCHEMA.asStruct(), expected(5, 6, 7, 8, 9), - read(table.location(), vectorized, "ts < cast('2017-12-22 00:00:00+00:00' as timestamp)")); + assertEqualsSafe( + SCHEMA.asStruct(), + expected(5, 6, 7, 8, 9), + read( + table.location(), vectorized, "ts < cast('2017-12-22 00:00:00+00:00' as timestamp)")); } { - SparkScanBuilder builder = new SparkScanBuilder(spark, TABLES.load(options.get("path")), options); - - pushFilters(builder, And.apply( - GreaterThan.apply("ts", "2017-12-22T06:00:00+00:00"), - LessThan.apply("ts", "2017-12-22T08:00:00+00:00"))); + SparkScanBuilder builder = + new SparkScanBuilder(spark, TABLES.load(options.get("path")), options); + + pushFilters( + builder, + And.apply( + GreaterThan.apply("ts", "2017-12-22T06:00:00+00:00"), + LessThan.apply("ts", "2017-12-22T08:00:00+00:00"))); Batch scan = builder.build().toBatch(); InputPartition[] tasks = scan.planInputPartitions(); Assert.assertEquals("Should create one task for 2017-12-22", 1, tasks.length); - assertEqualsSafe(SCHEMA.asStruct(), expected(1, 2), read(table.location(), vectorized, - "ts > cast('2017-12-22 06:00:00+00:00' as timestamp) and " + - "ts < cast('2017-12-22 08:00:00+00:00' as timestamp)")); + assertEqualsSafe( + SCHEMA.asStruct(), + expected(1, 2), + read( + table.location(), + vectorized, + "ts > cast('2017-12-22 06:00:00+00:00' as timestamp) and " + + "ts < cast('2017-12-22 08:00:00+00:00' as timestamp)")); } } @@ -342,14 +369,17 @@ public void testDayPartitionedTimestampFilters() { public void testHourPartitionedTimestampFilters() { Table table = buildPartitionedTable("partitioned_by_hour", PARTITION_BY_HOUR, "ts_hour", "ts"); - CaseInsensitiveStringMap options = new CaseInsensitiveStringMap(ImmutableMap.of("path", table.location())); - Batch unfiltered = new SparkScanBuilder(spark, TABLES.load(options.get("path")), options).build().toBatch(); + CaseInsensitiveStringMap options = + new CaseInsensitiveStringMap(ImmutableMap.of("path", table.location())); + Batch unfiltered = + new SparkScanBuilder(spark, 
TABLES.load(options.get("path")), options).build().toBatch(); - Assert.assertEquals("Unfiltered table should created 9 read tasks", - 9, unfiltered.planInputPartitions().length); + Assert.assertEquals( + "Unfiltered table should created 9 read tasks", 9, unfiltered.planInputPartitions().length); { - SparkScanBuilder builder = new SparkScanBuilder(spark, TABLES.load(options.get("path")), options); + SparkScanBuilder builder = + new SparkScanBuilder(spark, TABLES.load(options.get("path")), options); pushFilters(builder, LessThan.apply("ts", "2017-12-22T00:00:00+00:00")); Batch scan = builder.build().toBatch(); @@ -357,24 +387,35 @@ public void testHourPartitionedTimestampFilters() { InputPartition[] tasks = scan.planInputPartitions(); Assert.assertEquals("Should create 4 tasks for 2017-12-21: 15, 17, 21, 22", 4, tasks.length); - assertEqualsSafe(SCHEMA.asStruct(), expected(8, 9, 7, 6, 5), - read(table.location(), vectorized, "ts < cast('2017-12-22 00:00:00+00:00' as timestamp)")); + assertEqualsSafe( + SCHEMA.asStruct(), + expected(8, 9, 7, 6, 5), + read( + table.location(), vectorized, "ts < cast('2017-12-22 00:00:00+00:00' as timestamp)")); } { - SparkScanBuilder builder = new SparkScanBuilder(spark, TABLES.load(options.get("path")), options); - - pushFilters(builder, And.apply( - GreaterThan.apply("ts", "2017-12-22T06:00:00+00:00"), - LessThan.apply("ts", "2017-12-22T08:00:00+00:00"))); + SparkScanBuilder builder = + new SparkScanBuilder(spark, TABLES.load(options.get("path")), options); + + pushFilters( + builder, + And.apply( + GreaterThan.apply("ts", "2017-12-22T06:00:00+00:00"), + LessThan.apply("ts", "2017-12-22T08:00:00+00:00"))); Batch scan = builder.build().toBatch(); InputPartition[] tasks = scan.planInputPartitions(); Assert.assertEquals("Should create 2 tasks for 2017-12-22: 6, 7", 2, tasks.length); - assertEqualsSafe(SCHEMA.asStruct(), expected(2, 1), read(table.location(), vectorized, - "ts > cast('2017-12-22 06:00:00+00:00' as timestamp) and " + - "ts < cast('2017-12-22 08:00:00+00:00' as timestamp)")); + assertEqualsSafe( + SCHEMA.asStruct(), + expected(2, 1), + read( + table.location(), + vectorized, + "ts > cast('2017-12-22 06:00:00+00:00' as timestamp) and " + + "ts < cast('2017-12-22 08:00:00+00:00' as timestamp)")); } } @@ -388,10 +429,15 @@ public void testFilterByNonProjectedColumn() { expected.add(projectFlat(actualProjection, rec)); } - assertEqualsSafe(actualProjection.asStruct(), expected, read( - unpartitioned.toString(), vectorized, - "ts < cast('2017-12-22 00:00:00+00:00' as timestamp)", - "id", "data")); + assertEqualsSafe( + actualProjection.asStruct(), + expected, + read( + unpartitioned.toString(), + vectorized, + "ts < cast('2017-12-22 00:00:00+00:00' as timestamp)", + "id", + "data")); } { @@ -403,20 +449,27 @@ public void testFilterByNonProjectedColumn() { expected.add(projectFlat(actualProjection, rec)); } - assertEqualsSafe(actualProjection.asStruct(), expected, read( - unpartitioned.toString(), vectorized, - "ts > cast('2017-12-22 06:00:00+00:00' as timestamp) and " + - "ts < cast('2017-12-22 08:00:00+00:00' as timestamp)", - "id")); + assertEqualsSafe( + actualProjection.asStruct(), + expected, + read( + unpartitioned.toString(), + vectorized, + "ts > cast('2017-12-22 06:00:00+00:00' as timestamp) and " + + "ts < cast('2017-12-22 08:00:00+00:00' as timestamp)", + "id")); } } @Test public void testPartitionedByDataStartsWithFilter() { - Table table = buildPartitionedTable("partitioned_by_data", PARTITION_BY_DATA, "data_ident", "data"); - 
CaseInsensitiveStringMap options = new CaseInsensitiveStringMap(ImmutableMap.of("path", table.location())); + Table table = + buildPartitionedTable("partitioned_by_data", PARTITION_BY_DATA, "data_ident", "data"); + CaseInsensitiveStringMap options = + new CaseInsensitiveStringMap(ImmutableMap.of("path", table.location())); - SparkScanBuilder builder = new SparkScanBuilder(spark, TABLES.load(options.get("path")), options); + SparkScanBuilder builder = + new SparkScanBuilder(spark, TABLES.load(options.get("path")), options); pushFilters(builder, new StringStartsWith("data", "junc")); Batch scan = builder.build().toBatch(); @@ -426,10 +479,13 @@ public void testPartitionedByDataStartsWithFilter() { @Test public void testPartitionedByDataNotStartsWithFilter() { - Table table = buildPartitionedTable("partitioned_by_data", PARTITION_BY_DATA, "data_ident", "data"); - CaseInsensitiveStringMap options = new CaseInsensitiveStringMap(ImmutableMap.of("path", table.location())); + Table table = + buildPartitionedTable("partitioned_by_data", PARTITION_BY_DATA, "data_ident", "data"); + CaseInsensitiveStringMap options = + new CaseInsensitiveStringMap(ImmutableMap.of("path", table.location())); - SparkScanBuilder builder = new SparkScanBuilder(spark, TABLES.load(options.get("path")), options); + SparkScanBuilder builder = + new SparkScanBuilder(spark, TABLES.load(options.get("path")), options); pushFilters(builder, new Not(new StringStartsWith("data", "junc"))); Batch scan = builder.build().toBatch(); @@ -441,11 +497,11 @@ public void testPartitionedByDataNotStartsWithFilter() { public void testPartitionedByIdStartsWith() { Table table = buildPartitionedTable("partitioned_by_id", PARTITION_BY_ID, "id_ident", "id"); - CaseInsensitiveStringMap options = new CaseInsensitiveStringMap(ImmutableMap.of( - "path", table.location()) - ); + CaseInsensitiveStringMap options = + new CaseInsensitiveStringMap(ImmutableMap.of("path", table.location())); - SparkScanBuilder builder = new SparkScanBuilder(spark, TABLES.load(options.get("path")), options); + SparkScanBuilder builder = + new SparkScanBuilder(spark, TABLES.load(options.get("path")), options); pushFilters(builder, new StringStartsWith("data", "junc")); Batch scan = builder.build().toBatch(); @@ -457,11 +513,11 @@ public void testPartitionedByIdStartsWith() { public void testPartitionedByIdNotStartsWith() { Table table = buildPartitionedTable("partitioned_by_id", PARTITION_BY_ID, "id_ident", "id"); - CaseInsensitiveStringMap options = new CaseInsensitiveStringMap(ImmutableMap.of( - "path", table.location()) - ); + CaseInsensitiveStringMap options = + new CaseInsensitiveStringMap(ImmutableMap.of("path", table.location())); - SparkScanBuilder builder = new SparkScanBuilder(spark, TABLES.load(options.get("path")), options); + SparkScanBuilder builder = + new SparkScanBuilder(spark, TABLES.load(options.get("path")), options); pushFilters(builder, new Not(new StringStartsWith("data", "junc"))); Batch scan = builder.build().toBatch(); @@ -471,15 +527,15 @@ public void testPartitionedByIdNotStartsWith() { @Test public void testUnpartitionedStartsWith() { - Dataset df = spark.read() - .format("iceberg") - .option(SparkReadOptions.VECTORIZATION_ENABLED, String.valueOf(vectorized)) - .load(unpartitioned.toString()); + Dataset df = + spark + .read() + .format("iceberg") + .option(SparkReadOptions.VECTORIZATION_ENABLED, String.valueOf(vectorized)) + .load(unpartitioned.toString()); - List matchedData = df.select("data") - .where("data LIKE 'jun%'") - .as(Encoders.STRING()) 
- .collectAsList(); + List matchedData = + df.select("data").where("data LIKE 'jun%'").as(Encoders.STRING()).collectAsList(); Assert.assertEquals(1, matchedData.size()); Assert.assertEquals("junction", matchedData.get(0)); @@ -487,20 +543,21 @@ public void testUnpartitionedStartsWith() { @Test public void testUnpartitionedNotStartsWith() { - Dataset df = spark.read() - .format("iceberg") - .option(SparkReadOptions.VECTORIZATION_ENABLED, String.valueOf(vectorized)) - .load(unpartitioned.toString()); - - List matchedData = df.select("data") - .where("data NOT LIKE 'jun%'") - .as(Encoders.STRING()) - .collectAsList(); - - List expected = testRecords(SCHEMA).stream() - .map(r -> r.getField("data").toString()) - .filter(d -> !d.startsWith("jun")) - .collect(Collectors.toList()); + Dataset df = + spark + .read() + .format("iceberg") + .option(SparkReadOptions.VECTORIZATION_ENABLED, String.valueOf(vectorized)) + .load(unpartitioned.toString()); + + List matchedData = + df.select("data").where("data NOT LIKE 'jun%'").as(Encoders.STRING()).collectAsList(); + + List expected = + testRecords(SCHEMA).stream() + .map(r -> r.getField("data").toString()) + .filter(d -> !d.startsWith("jun")) + .collect(Collectors.toList()); Assert.assertEquals(9, matchedData.size()); Assert.assertEquals(Sets.newHashSet(expected), Sets.newHashSet(matchedData)); @@ -516,8 +573,8 @@ private static Record projectFlat(Schema projection, Record record) { return result; } - public static void assertEqualsUnsafe(Types.StructType struct, - List expected, List actual) { + public static void assertEqualsUnsafe( + Types.StructType struct, List expected, List actual) { // TODO: match records by ID int numRecords = Math.min(expected.size(), actual.size()); for (int i = 0; i < numRecords; i += 1) { @@ -526,8 +583,8 @@ public static void assertEqualsUnsafe(Types.StructType struct, Assert.assertEquals("Number of results should match expected", expected.size(), actual.size()); } - public static void assertEqualsSafe(Types.StructType struct, - List expected, List actual) { + public static void assertEqualsSafe( + Types.StructType struct, List expected, List actual) { // TODO: match records by ID int numRecords = Math.min(expected.size(), actual.size()); for (int i = 0; i < numRecords; i += 1) { @@ -550,7 +607,8 @@ private void pushFilters(ScanBuilder scan, Filter... 
filters) { filterable.pushFilters(filters); } - private Table buildPartitionedTable(String desc, PartitionSpec spec, String udf, String partitionColumn) { + private Table buildPartitionedTable( + String desc, PartitionSpec spec, String udf, String partitionColumn) { File location = new File(parent, desc); Table table = TABLES.create(SCHEMA, spec, location.toString()); @@ -559,10 +617,12 @@ private Table buildPartitionedTable(String desc, PartitionSpec spec, String udf, table.updateProperties().set("read.split.target-size", "2048").commit(); // copy the unpartitioned table into the partitioned table to produce the partitioned data - Dataset allRows = spark.read() - .format("iceberg") - .option(SparkReadOptions.VECTORIZATION_ENABLED, String.valueOf(vectorized)) - .load(unpartitioned.toString()); + Dataset allRows = + spark + .read() + .format("iceberg") + .option(SparkReadOptions.VECTORIZATION_ENABLED, String.valueOf(vectorized)) + .load(unpartitioned.toString()); allRows .coalesce(1) // ensure only 1 file per partition is written @@ -590,19 +650,23 @@ private List testRecords(Schema schema) { record(schema, 6L, parse("2017-12-21T21:55:30.589712+00:00"), "element"), record(schema, 7L, parse("2017-12-21T17:31:14.532797+00:00"), "limited"), record(schema, 8L, parse("2017-12-21T15:21:51.237521+00:00"), "global"), - record(schema, 9L, parse("2017-12-21T15:02:15.230570+00:00"), "goldfish") - ); + record(schema, 9L, parse("2017-12-21T15:02:15.230570+00:00"), "goldfish")); } private static List read(String table, boolean vectorized, String expr) { return read(table, vectorized, expr, "*"); } - private static List read(String table, boolean vectorized, String expr, String select0, String... selectN) { - Dataset dataset = spark.read().format("iceberg") - .option(SparkReadOptions.VECTORIZATION_ENABLED, String.valueOf(vectorized)) - .load(table).filter(expr) - .select(select0, selectN); + private static List read( + String table, boolean vectorized, String expr, String select0, String... selectN) { + Dataset dataset = + spark + .read() + .format("iceberg") + .option(SparkReadOptions.VECTORIZATION_ENABLED, String.valueOf(vectorized)) + .load(table) + .filter(expr) + .select(select0, selectN); return dataset.collectAsList(); } diff --git a/spark/v3.0/spark/src/test/java/org/apache/iceberg/spark/source/TestForwardCompatibility.java b/spark/v3.0/spark/src/test/java/org/apache/iceberg/spark/source/TestForwardCompatibility.java index 20e1352de107..585cfc44a254 100644 --- a/spark/v3.0/spark/src/test/java/org/apache/iceberg/spark/source/TestForwardCompatibility.java +++ b/spark/v3.0/spark/src/test/java/org/apache/iceberg/spark/source/TestForwardCompatibility.java @@ -16,9 +16,11 @@ * specific language governing permissions and limitations * under the License. 
  */
-
 package org.apache.iceberg.spark.source;
 
+import static org.apache.iceberg.Files.localInput;
+import static org.apache.iceberg.Files.localOutput;
+
 import java.io.File;
 import java.io.IOException;
 import java.util.List;
@@ -63,25 +65,26 @@
 import scala.Option;
 import scala.collection.JavaConversions;
 
-import static org.apache.iceberg.Files.localInput;
-import static org.apache.iceberg.Files.localOutput;
-
 public class TestForwardCompatibility {
   private static final Configuration CONF = new Configuration();
 
-  private static final Schema SCHEMA = new Schema(
-      Types.NestedField.optional(1, "id", Types.LongType.get()),
-      Types.NestedField.optional(2, "data", Types.StringType.get()));
+  private static final Schema SCHEMA =
+      new Schema(
+          Types.NestedField.optional(1, "id", Types.LongType.get()),
+          Types.NestedField.optional(2, "data", Types.StringType.get()));
 
   // create a spec for the schema that uses a "zero" transform that produces all 0s
-  private static final PartitionSpec UNKNOWN_SPEC = PartitionSpecParser.fromJson(SCHEMA,
-      "{ \"spec-id\": 0, \"fields\": [ { \"name\": \"id_zero\", \"transform\": \"zero\", \"source-id\": 1 } ] }");
+  private static final PartitionSpec UNKNOWN_SPEC =
+      PartitionSpecParser.fromJson(
+          SCHEMA,
+          "{ \"spec-id\": 0, \"fields\": [ { \"name\": \"id_zero\", \"transform\": \"zero\", \"source-id\": 1 } ] }");
 
   // create a fake spec to use to write table metadata
-  private static final PartitionSpec FAKE_SPEC = PartitionSpecParser.fromJson(SCHEMA,
-      "{ \"spec-id\": 0, \"fields\": [ { \"name\": \"id_zero\", \"transform\": \"identity\", \"source-id\": 1 } ] }");
+  private static final PartitionSpec FAKE_SPEC =
+      PartitionSpecParser.fromJson(
+          SCHEMA,
+          "{ \"spec-id\": 0, \"fields\": [ { \"name\": \"id_zero\", \"transform\": \"identity\", \"source-id\": 1 } ] }");
 
-  @Rule
-  public TemporaryFolder temp = new TemporaryFolder();
+  @Rule public TemporaryFolder temp = new TemporaryFolder();
 
   private static SparkSession spark = null;
@@ -107,20 +110,22 @@ public void testSparkWriteFailsUnknownTransform() throws IOException {
     HadoopTables tables = new HadoopTables(CONF);
     tables.create(SCHEMA, UNKNOWN_SPEC, location.toString());
 
-    List expected = Lists.newArrayList(
-        new SimpleRecord(1, "a"),
-        new SimpleRecord(2, "b"),
-        new SimpleRecord(3, "c")
-    );
+    List expected =
+        Lists.newArrayList(
+            new SimpleRecord(1, "a"), new SimpleRecord(2, "b"), new SimpleRecord(3, "c"));
 
     Dataset df = spark.createDataFrame(expected, SimpleRecord.class);
 
-    AssertHelpers.assertThrows("Should reject write with unsupported transform",
-        UnsupportedOperationException.class, "Cannot write using unsupported transforms: zero",
-        () -> df.select("id", "data").write()
-            .format("iceberg")
-            .mode("append")
-            .save(location.toString()));
+    AssertHelpers.assertThrows(
+        "Should reject write with unsupported transform",
+        UnsupportedOperationException.class,
+        "Cannot write using unsupported transforms: zero",
+        () ->
+            df.select("id", "data")
+                .write()
+                .format("iceberg")
+                .mode("append")
+                .save(location.toString()));
   }
 
   @Test
@@ -136,20 +141,24 @@ public void testSparkStreamingWriteFailsUnknownTransform() throws IOException, T
     tables.create(SCHEMA, UNKNOWN_SPEC, location.toString());
 
     MemoryStream inputStream = newMemoryStream(1, spark.sqlContext(), Encoders.INT());
-    StreamingQuery query = inputStream.toDF()
-        .selectExpr("value AS id", "CAST (value AS STRING) AS data")
-        .writeStream()
-        .outputMode("append")
-        .format("iceberg")
-        .option("checkpointLocation", checkpoint.toString())
-        .option("path", location.toString())
-        .start();
+    StreamingQuery query =
+        inputStream
+            .toDF()
+            .selectExpr("value AS id", "CAST (value AS STRING) AS data")
+            .writeStream()
+            .outputMode("append")
+            .format("iceberg")
+            .option("checkpointLocation", checkpoint.toString())
+            .option("path", location.toString())
+            .start();
 
     List batch1 = Lists.newArrayList(1, 2);
     send(batch1, inputStream);
 
-    AssertHelpers.assertThrows("Should reject streaming write with unsupported transform",
-        StreamingQueryException.class, "Cannot write using unsupported transforms: zero",
+    AssertHelpers.assertThrows(
+        "Should reject streaming write with unsupported transform",
+        StreamingQueryException.class,
+        "Cannot write using unsupported transforms: zero",
         query::processAllAvailable);
   }
 
@@ -168,22 +177,22 @@ public void testSparkCanReadUnknownTransform() throws IOException {
 
     List expected = RandomData.generateList(table.schema(), 100, 1L);
 
-    File parquetFile = new File(dataFolder,
-        FileFormat.PARQUET.addExtension(UUID.randomUUID().toString()));
-    FileAppender writer = Parquet.write(localOutput(parquetFile))
-        .schema(table.schema())
-        .build();
+    File parquetFile =
+        new File(dataFolder, FileFormat.PARQUET.addExtension(UUID.randomUUID().toString()));
+    FileAppender writer =
+        Parquet.write(localOutput(parquetFile)).schema(table.schema()).build();
     try {
       writer.addAll(expected);
     } finally {
       writer.close();
     }
 
-    DataFile file = DataFiles.builder(FAKE_SPEC)
-        .withInputFile(localInput(parquetFile))
-        .withMetrics(writer.metrics())
-        .withPartitionPath("id_zero=0")
-        .build();
+    DataFile file =
+        DataFiles.builder(FAKE_SPEC)
+            .withInputFile(localInput(parquetFile))
+            .withMetrics(writer.metrics())
+            .withPartitionPath("id_zero=0")
+            .build();
 
     OutputFile manifestFile = localOutput(FileFormat.AVRO.addExtension(temp.newFile().toString()));
     ManifestWriter manifestWriter = ManifestFiles.write(FAKE_SPEC, manifestFile);
@@ -195,9 +204,7 @@ public void testSparkCanReadUnknownTransform() throws IOException {
 
     table.newFastAppend().appendManifest(manifestWriter.toManifestFile()).commit();
 
-    Dataset df = spark.read()
-        .format("iceberg")
-        .load(location.toString());
+    Dataset df = spark.read().format("iceberg").load(location.toString());
 
     List rows = df.collectAsList();
     Assert.assertEquals("Should contain 100 rows", 100, rows.size());
diff --git a/spark/v3.0/spark/src/test/java/org/apache/iceberg/spark/source/TestIcebergSource.java b/spark/v3.0/spark/src/test/java/org/apache/iceberg/spark/source/TestIcebergSource.java
index 42f53d585601..a850275118db 100644
--- a/spark/v3.0/spark/src/test/java/org/apache/iceberg/spark/source/TestIcebergSource.java
+++ b/spark/v3.0/spark/src/test/java/org/apache/iceberg/spark/source/TestIcebergSource.java
@@ -16,7 +16,6 @@
  * specific language governing permissions and limitations
  * under the License.
  */
-
 package org.apache.iceberg.spark.source;
 
 import org.apache.iceberg.catalog.TableIdentifier;
@@ -40,5 +39,4 @@ public Identifier extractIdentifier(CaseInsensitiveStringMap options) {
   public String extractCatalog(CaseInsensitiveStringMap options) {
     return SparkSession.active().sessionState().catalogManager().currentCatalog().name();
   }
-
 }
diff --git a/spark/v3.0/spark/src/test/java/org/apache/iceberg/spark/source/TestIcebergSourceHadoopTables.java b/spark/v3.0/spark/src/test/java/org/apache/iceberg/spark/source/TestIcebergSourceHadoopTables.java
index f1cfc7a72e17..b55ba0e2199a 100644
--- a/spark/v3.0/spark/src/test/java/org/apache/iceberg/spark/source/TestIcebergSourceHadoopTables.java
+++ b/spark/v3.0/spark/src/test/java/org/apache/iceberg/spark/source/TestIcebergSourceHadoopTables.java
@@ -16,7 +16,6 @@
 * specific language governing permissions and limitations
 * under the License.
 */
-
 package org.apache.iceberg.spark.source;
 
 import java.io.File;
diff --git a/spark/v3.0/spark/src/test/java/org/apache/iceberg/spark/source/TestIcebergSourceHiveTables.java b/spark/v3.0/spark/src/test/java/org/apache/iceberg/spark/source/TestIcebergSourceHiveTables.java
index 76923d43a3bc..f6df8d495b90 100644
--- a/spark/v3.0/spark/src/test/java/org/apache/iceberg/spark/source/TestIcebergSourceHiveTables.java
+++ b/spark/v3.0/spark/src/test/java/org/apache/iceberg/spark/source/TestIcebergSourceHiveTables.java
@@ -16,7 +16,6 @@
 * specific language governing permissions and limitations
 * under the License.
 */
-
 package org.apache.iceberg.spark.source;
 
 import java.io.IOException;
@@ -62,7 +61,8 @@ public Table createTable(TableIdentifier ident, Schema schema, PartitionSpec spe
 
   @Override
   public Table loadTable(TableIdentifier ident, String entriesSuffix) {
-    TableIdentifier identifier = TableIdentifier.of(ident.namespace().level(0), ident.name(), entriesSuffix);
+    TableIdentifier identifier =
+        TableIdentifier.of(ident.namespace().level(0), ident.name(), entriesSuffix);
     return TestIcebergSourceHiveTables.catalog.loadTable(identifier);
   }
 
diff --git a/spark/v3.0/spark/src/test/java/org/apache/iceberg/spark/source/TestIcebergSourceTablesBase.java b/spark/v3.0/spark/src/test/java/org/apache/iceberg/spark/source/TestIcebergSourceTablesBase.java
index 7258fbd9690c..386e57b7877f 100644
--- a/spark/v3.0/spark/src/test/java/org/apache/iceberg/spark/source/TestIcebergSourceTablesBase.java
+++ b/spark/v3.0/spark/src/test/java/org/apache/iceberg/spark/source/TestIcebergSourceTablesBase.java
@@ -16,9 +16,13 @@
 * specific language governing permissions and limitations
 * under the License.
*/ - package org.apache.iceberg.spark.source; +import static org.apache.iceberg.ManifestContent.DATA; +import static org.apache.iceberg.ManifestContent.DELETES; +import static org.apache.iceberg.types.Types.NestedField.optional; +import static org.apache.iceberg.types.Types.NestedField.required; + import java.io.IOException; import java.io.UncheckedIOException; import java.util.Comparator; @@ -76,33 +80,26 @@ import org.junit.Test; import org.junit.rules.TemporaryFolder; -import static org.apache.iceberg.ManifestContent.DATA; -import static org.apache.iceberg.ManifestContent.DELETES; -import static org.apache.iceberg.types.Types.NestedField.optional; -import static org.apache.iceberg.types.Types.NestedField.required; - public abstract class TestIcebergSourceTablesBase extends SparkTestBase { - private static final Schema SCHEMA = new Schema( - optional(1, "id", Types.IntegerType.get()), - optional(2, "data", Types.StringType.get()) - ); + private static final Schema SCHEMA = + new Schema( + optional(1, "id", Types.IntegerType.get()), optional(2, "data", Types.StringType.get())); - private static final Schema SCHEMA2 = new Schema( - optional(1, "id", Types.IntegerType.get()), - optional(2, "data", Types.StringType.get()), - optional(3, "category", Types.StringType.get()) - ); + private static final Schema SCHEMA2 = + new Schema( + optional(1, "id", Types.IntegerType.get()), + optional(2, "data", Types.StringType.get()), + optional(3, "category", Types.StringType.get())); - private static final Schema SCHEMA3 = new Schema( - optional(1, "id", Types.IntegerType.get()), - optional(3, "category", Types.StringType.get()) - ); + private static final Schema SCHEMA3 = + new Schema( + optional(1, "id", Types.IntegerType.get()), + optional(3, "category", Types.StringType.get())); private static final PartitionSpec SPEC = PartitionSpec.builderFor(SCHEMA).identity("id").build(); - @Rule - public TemporaryFolder temp = new TemporaryFolder(); + @Rule public TemporaryFolder temp = new TemporaryFolder(); public abstract Table createTable(TableIdentifier ident, Schema schema, PartitionSpec spec); @@ -117,23 +114,21 @@ public synchronized void testTablesSupport() { TableIdentifier tableIdentifier = TableIdentifier.of("db", "table"); createTable(tableIdentifier, SCHEMA, PartitionSpec.unpartitioned()); - List expectedRecords = Lists.newArrayList( - new SimpleRecord(1, "1"), - new SimpleRecord(2, "2"), - new SimpleRecord(3, "3")); + List expectedRecords = + Lists.newArrayList( + new SimpleRecord(1, "1"), new SimpleRecord(2, "2"), new SimpleRecord(3, "3")); Dataset inputDf = spark.createDataFrame(expectedRecords, SimpleRecord.class); - inputDf.select("id", "data").write() + inputDf + .select("id", "data") + .write() .format("iceberg") .mode(SaveMode.Append) .save(loadLocation(tableIdentifier)); - Dataset resultDf = spark.read() - .format("iceberg") - .load(loadLocation(tableIdentifier)); - List actualRecords = resultDf.orderBy("id") - .as(Encoders.bean(SimpleRecord.class)) - .collectAsList(); + Dataset resultDf = spark.read().format("iceberg").load(loadLocation(tableIdentifier)); + List actualRecords = + resultDf.orderBy("id").as(Encoders.bean(SimpleRecord.class)).collectAsList(); Assert.assertEquals("Records should match", expectedRecords, actualRecords); } @@ -147,32 +142,39 @@ public void testEntriesTable() throws Exception { List records = Lists.newArrayList(new SimpleRecord(1, "1")); Dataset inputDf = spark.createDataFrame(records, SimpleRecord.class); - inputDf.select("id", "data").write() + inputDf + 
.select("id", "data") + .write() .format("iceberg") .mode("append") .save(loadLocation(tableIdentifier)); table.refresh(); - List actual = spark.read() - .format("iceberg") - .load(loadLocation(tableIdentifier, "entries")) - .collectAsList(); + List actual = + spark + .read() + .format("iceberg") + .load(loadLocation(tableIdentifier, "entries")) + .collectAsList(); Snapshot snapshot = table.currentSnapshot(); - Assert.assertEquals("Should only contain one manifest", 1, snapshot.allManifests(table.io()).size()); + Assert.assertEquals( + "Should only contain one manifest", 1, snapshot.allManifests(table.io()).size()); InputFile manifest = table.io().newInputFile(snapshot.allManifests(table.io()).get(0).path()); List expected = Lists.newArrayList(); - try (CloseableIterable rows = Avro.read(manifest).project(entriesTable.schema()).build()) { + try (CloseableIterable rows = + Avro.read(manifest).project(entriesTable.schema()).build()) { // each row must inherit snapshot_id and sequence_number - rows.forEach(row -> { - row.put(2, 0L); - GenericData.Record file = (GenericData.Record) row.get("data_file"); - asMetadataRecord(file); - expected.add(row); - }); + rows.forEach( + row -> { + row.put(2, 0L); + GenericData.Record file = (GenericData.Record) row.get("data_file"); + asMetadataRecord(file); + expected.add(row); + }); } Assert.assertEquals("Entries table should have one row", 1, expected.size()); @@ -188,18 +190,22 @@ public void testEntriesTablePartitionedPrune() throws Exception { List records = Lists.newArrayList(new SimpleRecord(1, "1")); Dataset inputDf = spark.createDataFrame(records, SimpleRecord.class); - inputDf.select("id", "data").write() + inputDf + .select("id", "data") + .write() .format("iceberg") .mode("append") .save(loadLocation(tableIdentifier)); table.refresh(); - List actual = spark.read() - .format("iceberg") - .load(loadLocation(tableIdentifier, "entries")) - .select("status") - .collectAsList(); + List actual = + spark + .read() + .format("iceberg") + .load(loadLocation(tableIdentifier, "entries")) + .select("status") + .collectAsList(); Assert.assertEquals("Results should contain only one status", 1, actual.size()); Assert.assertEquals("That status should be Added (1)", 1, actual.get(0).getInt(0)); @@ -213,7 +219,9 @@ public void testEntriesTableDataFilePrune() throws Exception { List records = Lists.newArrayList(new SimpleRecord(1, "1")); Dataset inputDf = spark.createDataFrame(records, SimpleRecord.class); - inputDf.select("id", "data").write() + inputDf + .select("id", "data") + .write() .format("iceberg") .mode("append") .save(loadLocation(tableIdentifier)); @@ -221,15 +229,19 @@ public void testEntriesTableDataFilePrune() throws Exception { table.refresh(); DataFile file = table.currentSnapshot().addedDataFiles(table.io()).iterator().next(); - List singleActual = rowsToJava(spark.read() - .format("iceberg") - .load(loadLocation(tableIdentifier, "entries")) - .select("data_file.file_path") - .collectAsList()); + List singleActual = + rowsToJava( + spark + .read() + .format("iceberg") + .load(loadLocation(tableIdentifier, "entries")) + .select("data_file.file_path") + .collectAsList()); List singleExpected = ImmutableList.of(row(file.path())); - assertEquals("Should prune a single element from a nested struct", singleExpected, singleActual); + assertEquals( + "Should prune a single element from a nested struct", singleExpected, singleActual); } @Test @@ -240,7 +252,9 @@ public void testEntriesTableDataFilePruneMulti() throws Exception { List records = 
Lists.newArrayList(new SimpleRecord(1, "1")); Dataset inputDf = spark.createDataFrame(records, SimpleRecord.class); - inputDf.select("id", "data").write() + inputDf + .select("id", "data") + .write() .format("iceberg") .mode("append") .save(loadLocation(tableIdentifier)); @@ -248,14 +262,22 @@ public void testEntriesTableDataFilePruneMulti() throws Exception { table.refresh(); DataFile file = table.currentSnapshot().addedDataFiles(table.io()).iterator().next(); - List multiActual = rowsToJava(spark.read() - .format("iceberg") - .load(loadLocation(tableIdentifier, "entries")) - .select("data_file.file_path", "data_file.value_counts", "data_file.record_count", "data_file.column_sizes") - .collectAsList()); - - List multiExpected = ImmutableList.of( - row(file.path(), file.valueCounts(), file.recordCount(), file.columnSizes())); + List multiActual = + rowsToJava( + spark + .read() + .format("iceberg") + .load(loadLocation(tableIdentifier, "entries")) + .select( + "data_file.file_path", + "data_file.value_counts", + "data_file.record_count", + "data_file.column_sizes") + .collectAsList()); + + List multiExpected = + ImmutableList.of( + row(file.path(), file.valueCounts(), file.recordCount(), file.columnSizes())); assertEquals("Should prune a single element from a nested struct", multiExpected, multiActual); } @@ -268,7 +290,9 @@ public void testFilesSelectMap() throws Exception { List records = Lists.newArrayList(new SimpleRecord(1, "1")); Dataset inputDf = spark.createDataFrame(records, SimpleRecord.class); - inputDf.select("id", "data").write() + inputDf + .select("id", "data") + .write() .format("iceberg") .mode("append") .save(loadLocation(tableIdentifier)); @@ -276,14 +300,18 @@ public void testFilesSelectMap() throws Exception { table.refresh(); DataFile file = table.currentSnapshot().addedDataFiles(table.io()).iterator().next(); - List multiActual = rowsToJava(spark.read() - .format("iceberg") - .load(loadLocation(tableIdentifier, "files")) - .select("file_path", "value_counts", "record_count", "column_sizes") - .collectAsList()); + List multiActual = + rowsToJava( + spark + .read() + .format("iceberg") + .load(loadLocation(tableIdentifier, "files")) + .select("file_path", "value_counts", "record_count", "column_sizes") + .collectAsList()); - List multiExpected = ImmutableList.of( - row(file.path(), file.valueCounts(), file.recordCount(), file.columnSizes())); + List multiExpected = + ImmutableList.of( + row(file.path(), file.valueCounts(), file.recordCount(), file.columnSizes())); assertEquals("Should prune a single element from a row", multiExpected, multiActual); } @@ -294,10 +322,13 @@ public void testAllEntriesTable() throws Exception { Table table = createTable(tableIdentifier, SCHEMA, PartitionSpec.unpartitioned()); Table entriesTable = loadTable(tableIdentifier, "all_entries"); - Dataset df1 = spark.createDataFrame(Lists.newArrayList(new SimpleRecord(1, "a")), SimpleRecord.class); - Dataset df2 = spark.createDataFrame(Lists.newArrayList(new SimpleRecord(1, "b")), SimpleRecord.class); + Dataset df1 = + spark.createDataFrame(Lists.newArrayList(new SimpleRecord(1, "a")), SimpleRecord.class); + Dataset df2 = + spark.createDataFrame(Lists.newArrayList(new SimpleRecord(1, "b")), SimpleRecord.class); - df1.select("id", "data").write() + df1.select("id", "data") + .write() .format("iceberg") .mode("append") .save(loadLocation(tableIdentifier)); @@ -306,7 +337,8 @@ public void testAllEntriesTable() throws Exception { table.newDelete().deleteFromRowFilter(Expressions.equal("id", 
1)).commit(); // add a second file - df2.select("id", "data").write() + df2.select("id", "data") + .write() .format("iceberg") .mode("append") .save(loadLocation(tableIdentifier)); @@ -314,24 +346,28 @@ public void testAllEntriesTable() throws Exception { // ensure table data isn't stale table.refresh(); - List actual = spark.read() - .format("iceberg") - .load(loadLocation(tableIdentifier, "all_entries")) - .orderBy("snapshot_id") - .collectAsList(); + List actual = + spark + .read() + .format("iceberg") + .load(loadLocation(tableIdentifier, "all_entries")) + .orderBy("snapshot_id") + .collectAsList(); List expected = Lists.newArrayList(); - for (ManifestFile manifest : Iterables.concat( - Iterables.transform(table.snapshots(), s -> s.allManifests(table.io())))) { + for (ManifestFile manifest : + Iterables.concat(Iterables.transform(table.snapshots(), s -> s.allManifests(table.io())))) { InputFile in = table.io().newInputFile(manifest.path()); - try (CloseableIterable rows = Avro.read(in).project(entriesTable.schema()).build()) { + try (CloseableIterable rows = + Avro.read(in).project(entriesTable.schema()).build()) { // each row must inherit snapshot_id and sequence_number - rows.forEach(row -> { - row.put(2, 0L); - GenericData.Record file = (GenericData.Record) row.get("data_file"); - asMetadataRecord(file); - expected.add(row); - }); + rows.forEach( + row -> { + row.put(2, 0L); + GenericData.Record file = (GenericData.Record) row.get("data_file"); + asMetadataRecord(file); + expected.add(row); + }); } } @@ -340,7 +376,8 @@ public void testAllEntriesTable() throws Exception { Assert.assertEquals("Entries table should have 3 rows", 3, expected.size()); Assert.assertEquals("Actual results should have 3 rows", 3, actual.size()); for (int i = 0; i < expected.size(); i += 1) { - TestHelpers.assertEqualsSafe(entriesTable.schema().asStruct(), expected.get(i), actual.get(i)); + TestHelpers.assertEqualsSafe( + entriesTable.schema().asStruct(), expected.get(i), actual.get(i)); } } @@ -352,7 +389,9 @@ public void testCountEntriesTable() { // init load List records = Lists.newArrayList(new SimpleRecord(1, "1")); Dataset inputDf = spark.createDataFrame(records, SimpleRecord.class); - inputDf.select("id", "data").write() + inputDf + .select("id", "data") + .write() .format("iceberg") .mode("append") .save(loadLocation(tableIdentifier)); @@ -360,12 +399,16 @@ public void testCountEntriesTable() { final int expectedEntryCount = 1; // count entries - Assert.assertEquals("Count should return " + expectedEntryCount, - expectedEntryCount, spark.read().format("iceberg").load(loadLocation(tableIdentifier, "entries")).count()); + Assert.assertEquals( + "Count should return " + expectedEntryCount, + expectedEntryCount, + spark.read().format("iceberg").load(loadLocation(tableIdentifier, "entries")).count()); // count all_entries - Assert.assertEquals("Count should return " + expectedEntryCount, - expectedEntryCount, spark.read().format("iceberg").load(loadLocation(tableIdentifier, "all_entries")).count()); + Assert.assertEquals( + "Count should return " + expectedEntryCount, + expectedEntryCount, + spark.read().format("iceberg").load(loadLocation(tableIdentifier, "all_entries")).count()); } @Test @@ -375,16 +418,20 @@ public void testFilesTable() throws Exception { Table entriesTable = loadTable(tableIdentifier, "entries"); Table filesTable = loadTable(tableIdentifier, "files"); - Dataset df1 = spark.createDataFrame(Lists.newArrayList(new SimpleRecord(1, "a")), SimpleRecord.class); - Dataset df2 = 
spark.createDataFrame(Lists.newArrayList(new SimpleRecord(2, "b")), SimpleRecord.class); + Dataset df1 = + spark.createDataFrame(Lists.newArrayList(new SimpleRecord(1, "a")), SimpleRecord.class); + Dataset df2 = + spark.createDataFrame(Lists.newArrayList(new SimpleRecord(2, "b")), SimpleRecord.class); - df1.select("id", "data").write() + df1.select("id", "data") + .write() .format("iceberg") .mode("append") .save(loadLocation(tableIdentifier)); // add a second file - df2.select("id", "data").write() + df2.select("id", "data") + .write() .format("iceberg") .mode("append") .save(loadLocation(tableIdentifier)); @@ -392,15 +439,14 @@ public void testFilesTable() throws Exception { // delete the first file to test that only live files are listed table.newDelete().deleteFromRowFilter(Expressions.equal("id", 1)).commit(); - List actual = spark.read() - .format("iceberg") - .load(loadLocation(tableIdentifier, "files")) - .collectAsList(); + List actual = + spark.read().format("iceberg").load(loadLocation(tableIdentifier, "files")).collectAsList(); List expected = Lists.newArrayList(); for (ManifestFile manifest : table.currentSnapshot().dataManifests(table.io())) { InputFile in = table.io().newInputFile(manifest.path()); - try (CloseableIterable rows = Avro.read(in).project(entriesTable.schema()).build()) { + try (CloseableIterable rows = + Avro.read(in).project(entriesTable.schema()).build()) { for (GenericData.Record record : rows) { if ((Integer) record.get("status") < 2 /* added or existing */) { GenericData.Record file = (GenericData.Record) record.get("data_file"); @@ -422,42 +468,42 @@ public void testFilesTableWithSnapshotIdInheritance() throws Exception { TableIdentifier tableIdentifier = TableIdentifier.of("db", "files_inheritance_test"); Table table = createTable(tableIdentifier, SCHEMA, SPEC); - table.updateProperties() - .set(TableProperties.SNAPSHOT_ID_INHERITANCE_ENABLED, "true") - .commit(); + table.updateProperties().set(TableProperties.SNAPSHOT_ID_INHERITANCE_ENABLED, "true").commit(); Table entriesTable = loadTable(tableIdentifier, "entries"); Table filesTable = loadTable(tableIdentifier, "files"); - spark.sql(String.format( - "CREATE TABLE parquet_table (data string, id int) " + - "USING parquet PARTITIONED BY (id) LOCATION '%s'", - temp.newFolder())); + spark.sql( + String.format( + "CREATE TABLE parquet_table (data string, id int) " + + "USING parquet PARTITIONED BY (id) LOCATION '%s'", + temp.newFolder())); - List records = Lists.newArrayList( - new SimpleRecord(1, "a"), - new SimpleRecord(2, "b") - ); + List records = + Lists.newArrayList(new SimpleRecord(1, "a"), new SimpleRecord(2, "b")); Dataset inputDF = spark.createDataFrame(records, SimpleRecord.class); - inputDF.select("data", "id").write() - .mode("overwrite") - .insertInto("parquet_table"); + inputDF.select("data", "id").write().mode("overwrite").insertInto("parquet_table"); try { String stagingLocation = table.location() + "/metadata"; - SparkTableUtil.importSparkTable(spark, + SparkTableUtil.importSparkTable( + spark, new org.apache.spark.sql.catalyst.TableIdentifier("parquet_table"), - table, stagingLocation); + table, + stagingLocation); - List actual = spark.read() - .format("iceberg") - .load(loadLocation(tableIdentifier, "files")) - .collectAsList(); + List actual = + spark + .read() + .format("iceberg") + .load(loadLocation(tableIdentifier, "files")) + .collectAsList(); List expected = Lists.newArrayList(); for (ManifestFile manifest : table.currentSnapshot().dataManifests(table.io())) { InputFile in = 
table.io().newInputFile(manifest.path()); - try (CloseableIterable rows = Avro.read(in).project(entriesTable.schema()).build()) { + try (CloseableIterable rows = + Avro.read(in).project(entriesTable.schema()).build()) { for (GenericData.Record record : rows) { GenericData.Record file = (GenericData.Record) record.get("data_file"); asMetadataRecord(file); @@ -473,7 +519,6 @@ public void testFilesTableWithSnapshotIdInheritance() throws Exception { } finally { spark.sql("DROP TABLE parquet_table"); } - } @Test @@ -484,35 +529,35 @@ public void testEntriesTableWithSnapshotIdInheritance() throws Exception { PartitionSpec spec = SPEC; Table table = createTable(tableIdentifier, SCHEMA, spec); - table.updateProperties() - .set(TableProperties.SNAPSHOT_ID_INHERITANCE_ENABLED, "true") - .commit(); + table.updateProperties().set(TableProperties.SNAPSHOT_ID_INHERITANCE_ENABLED, "true").commit(); - spark.sql(String.format( - "CREATE TABLE parquet_table (data string, id int) " + - "USING parquet PARTITIONED BY (id) LOCATION '%s'", - temp.newFolder())); + spark.sql( + String.format( + "CREATE TABLE parquet_table (data string, id int) " + + "USING parquet PARTITIONED BY (id) LOCATION '%s'", + temp.newFolder())); - List records = Lists.newArrayList( - new SimpleRecord(1, "a"), - new SimpleRecord(2, "b") - ); + List records = + Lists.newArrayList(new SimpleRecord(1, "a"), new SimpleRecord(2, "b")); Dataset inputDF = spark.createDataFrame(records, SimpleRecord.class); - inputDF.select("data", "id").write() - .mode("overwrite") - .insertInto("parquet_table"); + inputDF.select("data", "id").write().mode("overwrite").insertInto("parquet_table"); try { String stagingLocation = table.location() + "/metadata"; SparkTableUtil.importSparkTable( - spark, new org.apache.spark.sql.catalyst.TableIdentifier("parquet_table"), table, stagingLocation); + spark, + new org.apache.spark.sql.catalyst.TableIdentifier("parquet_table"), + table, + stagingLocation); - List actual = spark.read() - .format("iceberg") - .load(loadLocation(tableIdentifier, "entries")) - .select("sequence_number", "snapshot_id", "data_file") - .collectAsList(); + List actual = + spark + .read() + .format("iceberg") + .load(loadLocation(tableIdentifier, "entries")) + .select("sequence_number", "snapshot_id", "data_file") + .collectAsList(); table.refresh(); @@ -535,19 +580,24 @@ public void testFilesUnpartitionedTable() throws Exception { Table entriesTable = loadTable(tableIdentifier, "entries"); Table filesTable = loadTable(tableIdentifier, "files"); - Dataset df1 = spark.createDataFrame(Lists.newArrayList(new SimpleRecord(1, "a")), SimpleRecord.class); - Dataset df2 = spark.createDataFrame(Lists.newArrayList(new SimpleRecord(2, "b")), SimpleRecord.class); + Dataset df1 = + spark.createDataFrame(Lists.newArrayList(new SimpleRecord(1, "a")), SimpleRecord.class); + Dataset df2 = + spark.createDataFrame(Lists.newArrayList(new SimpleRecord(2, "b")), SimpleRecord.class); - df1.select("id", "data").write() + df1.select("id", "data") + .write() .format("iceberg") .mode("append") .save(loadLocation(tableIdentifier)); table.refresh(); - DataFile toDelete = Iterables.getOnlyElement(table.currentSnapshot().addedDataFiles(table.io())); + DataFile toDelete = + Iterables.getOnlyElement(table.currentSnapshot().addedDataFiles(table.io())); // add a second file - df2.select("id", "data").write() + df2.select("id", "data") + .write() .format("iceberg") .mode("append") .save(loadLocation(tableIdentifier)); @@ -555,15 +605,14 @@ public void testFilesUnpartitionedTable() 
throws Exception { // delete the first file to test that only live files are listed table.newDelete().deleteFile(toDelete).commit(); - List actual = spark.read() - .format("iceberg") - .load(loadLocation(tableIdentifier, "files")) - .collectAsList(); + List actual = + spark.read().format("iceberg").load(loadLocation(tableIdentifier, "files")).collectAsList(); List expected = Lists.newArrayList(); for (ManifestFile manifest : table.currentSnapshot().dataManifests(table.io())) { InputFile in = table.io().newInputFile(manifest.path()); - try (CloseableIterable rows = Avro.read(in).project(entriesTable.schema()).build()) { + try (CloseableIterable rows = + Avro.read(in).project(entriesTable.schema()).build()) { for (GenericData.Record record : rows) { if ((Integer) record.get("status") < 2 /* added or existing */) { GenericData.Record file = (GenericData.Record) record.get("data_file"); @@ -586,38 +635,49 @@ public void testAllMetadataTablesWithStagedCommits() throws Exception { table.updateProperties().set(TableProperties.WRITE_AUDIT_PUBLISH_ENABLED, "true").commit(); spark.conf().set("spark.wap.id", "1234567"); - Dataset df1 = spark.createDataFrame(Lists.newArrayList(new SimpleRecord(1, "a")), SimpleRecord.class); - Dataset df2 = spark.createDataFrame(Lists.newArrayList(new SimpleRecord(2, "b")), SimpleRecord.class); + Dataset df1 = + spark.createDataFrame(Lists.newArrayList(new SimpleRecord(1, "a")), SimpleRecord.class); + Dataset df2 = + spark.createDataFrame(Lists.newArrayList(new SimpleRecord(2, "b")), SimpleRecord.class); - df1.select("id", "data").write() + df1.select("id", "data") + .write() .format("iceberg") .mode("append") .save(loadLocation(tableIdentifier)); // add a second file - df2.select("id", "data").write() + df2.select("id", "data") + .write() .format("iceberg") .mode("append") .save(loadLocation(tableIdentifier)); - List actualAllData = spark.read() - .format("iceberg") - .load(loadLocation(tableIdentifier, "all_data_files")) - .collectAsList(); - - List actualAllManifests = spark.read() - .format("iceberg") - .load(loadLocation(tableIdentifier, "all_manifests")) - .collectAsList(); - - List actualAllEntries = spark.read() - .format("iceberg") - .load(loadLocation(tableIdentifier, "all_entries")) - .collectAsList(); - - Assert.assertTrue("Stage table should have some snapshots", table.snapshots().iterator().hasNext()); - Assert.assertEquals("Stage table should have null currentSnapshot", - null, table.currentSnapshot()); + List actualAllData = + spark + .read() + .format("iceberg") + .load(loadLocation(tableIdentifier, "all_data_files")) + .collectAsList(); + + List actualAllManifests = + spark + .read() + .format("iceberg") + .load(loadLocation(tableIdentifier, "all_manifests")) + .collectAsList(); + + List actualAllEntries = + spark + .read() + .format("iceberg") + .load(loadLocation(tableIdentifier, "all_entries")) + .collectAsList(); + + Assert.assertTrue( + "Stage table should have some snapshots", table.snapshots().iterator().hasNext()); + Assert.assertEquals( + "Stage table should have null currentSnapshot", null, table.currentSnapshot()); Assert.assertEquals("Actual results should have two rows", 2, actualAllData.size()); Assert.assertEquals("Actual results should have two rows", 2, actualAllManifests.size()); Assert.assertEquals("Actual results should have two rows", 2, actualAllEntries.size()); @@ -630,10 +690,13 @@ public void testAllDataFilesTable() throws Exception { Table entriesTable = loadTable(tableIdentifier, "entries"); Table filesTable = 
loadTable(tableIdentifier, "all_data_files"); - Dataset df1 = spark.createDataFrame(Lists.newArrayList(new SimpleRecord(1, "a")), SimpleRecord.class); - Dataset df2 = spark.createDataFrame(Lists.newArrayList(new SimpleRecord(2, "b")), SimpleRecord.class); + Dataset df1 = + spark.createDataFrame(Lists.newArrayList(new SimpleRecord(1, "a")), SimpleRecord.class); + Dataset df2 = + spark.createDataFrame(Lists.newArrayList(new SimpleRecord(2, "b")), SimpleRecord.class); - df1.select("id", "data").write() + df1.select("id", "data") + .write() .format("iceberg") .mode("append") .save(loadLocation(tableIdentifier)); @@ -642,7 +705,8 @@ public void testAllDataFilesTable() throws Exception { table.newDelete().deleteFromRowFilter(Expressions.equal("id", 1)).commit(); // add a second file - df2.select("id", "data").write() + df2.select("id", "data") + .write() .format("iceberg") .mode("append") .save(loadLocation(tableIdentifier)); @@ -650,19 +714,23 @@ public void testAllDataFilesTable() throws Exception { // ensure table data isn't stale table.refresh(); - List actual = spark.read() - .format("iceberg") - .load(loadLocation(tableIdentifier, "all_data_files")) - .orderBy("file_path") - .collectAsList(); + List actual = + spark + .read() + .format("iceberg") + .load(loadLocation(tableIdentifier, "all_data_files")) + .orderBy("file_path") + .collectAsList(); actual.sort(Comparator.comparing(o -> o.getString(1))); List expected = Lists.newArrayList(); - Iterable dataManifests = Iterables.concat(Iterables.transform(table.snapshots(), - snapshot -> snapshot.dataManifests(table.io()))); + Iterable dataManifests = + Iterables.concat( + Iterables.transform(table.snapshots(), snapshot -> snapshot.dataManifests(table.io()))); for (ManifestFile manifest : dataManifests) { InputFile in = table.io().newInputFile(manifest.path()); - try (CloseableIterable rows = Avro.read(in).project(entriesTable.schema()).build()) { + try (CloseableIterable rows = + Avro.read(in).project(entriesTable.schema()).build()) { for (GenericData.Record record : rows) { if ((Integer) record.get("status") < 2 /* added or existing */) { GenericData.Record file = (GenericData.Record) record.get("data_file"); @@ -691,7 +759,9 @@ public void testHistoryTable() { List records = Lists.newArrayList(new SimpleRecord(1, "1")); Dataset inputDf = spark.createDataFrame(records, SimpleRecord.class); - inputDf.select("id", "data").write() + inputDf + .select("id", "data") + .write() .format("iceberg") .mode("append") .save(loadLocation(tableIdentifier)); @@ -700,7 +770,9 @@ public void testHistoryTable() { long firstSnapshotTimestamp = table.currentSnapshot().timestampMillis(); long firstSnapshotId = table.currentSnapshot().snapshotId(); - inputDf.select("id", "data").write() + inputDf + .select("id", "data") + .write() .format("iceberg") .mode("append") .save(loadLocation(tableIdentifier)); @@ -713,7 +785,9 @@ public void testHistoryTable() { table.rollback().toSnapshotId(firstSnapshotId).commit(); long rollbackTimestamp = Iterables.getLast(table.history()).timestampMillis(); - inputDf.select("id", "data").write() + inputDf + .select("id", "data") + .write() .format("iceberg") .mode("append") .save(loadLocation(tableIdentifier)); @@ -722,34 +796,43 @@ public void testHistoryTable() { long thirdSnapshotTimestamp = table.currentSnapshot().timestampMillis(); long thirdSnapshotId = table.currentSnapshot().snapshotId(); - List actual = spark.read() - .format("iceberg") - .load(loadLocation(tableIdentifier, "history")) - .collectAsList(); - - 
GenericRecordBuilder builder = new GenericRecordBuilder(AvroSchemaUtil.convert(historyTable.schema(), "history")); - List expected = Lists.newArrayList( - builder.set("made_current_at", firstSnapshotTimestamp * 1000) - .set("snapshot_id", firstSnapshotId) - .set("parent_id", null) - .set("is_current_ancestor", true) - .build(), - builder.set("made_current_at", secondSnapshotTimestamp * 1000) - .set("snapshot_id", secondSnapshotId) - .set("parent_id", firstSnapshotId) - .set("is_current_ancestor", false) // commit rolled back, not an ancestor of the current table state - .build(), - builder.set("made_current_at", rollbackTimestamp * 1000) - .set("snapshot_id", firstSnapshotId) - .set("parent_id", null) - .set("is_current_ancestor", true) - .build(), - builder.set("made_current_at", thirdSnapshotTimestamp * 1000) - .set("snapshot_id", thirdSnapshotId) - .set("parent_id", firstSnapshotId) - .set("is_current_ancestor", true) - .build() - ); + List actual = + spark + .read() + .format("iceberg") + .load(loadLocation(tableIdentifier, "history")) + .collectAsList(); + + GenericRecordBuilder builder = + new GenericRecordBuilder(AvroSchemaUtil.convert(historyTable.schema(), "history")); + List expected = + Lists.newArrayList( + builder + .set("made_current_at", firstSnapshotTimestamp * 1000) + .set("snapshot_id", firstSnapshotId) + .set("parent_id", null) + .set("is_current_ancestor", true) + .build(), + builder + .set("made_current_at", secondSnapshotTimestamp * 1000) + .set("snapshot_id", secondSnapshotId) + .set("parent_id", firstSnapshotId) + .set( + "is_current_ancestor", + false) // commit rolled back, not an ancestor of the current table state + .build(), + builder + .set("made_current_at", rollbackTimestamp * 1000) + .set("snapshot_id", firstSnapshotId) + .set("parent_id", null) + .set("is_current_ancestor", true) + .build(), + builder + .set("made_current_at", thirdSnapshotTimestamp * 1000) + .set("snapshot_id", thirdSnapshotId) + .set("parent_id", firstSnapshotId) + .set("is_current_ancestor", true) + .build()); Assert.assertEquals("History table should have a row for each commit", 4, actual.size()); TestHelpers.assertEqualsSafe(historyTable.schema().asStruct(), expected.get(0), actual.get(0)); @@ -766,7 +849,9 @@ public void testSnapshotsTable() { List records = Lists.newArrayList(new SimpleRecord(1, "1")); Dataset inputDf = spark.createDataFrame(records, SimpleRecord.class); - inputDf.select("id", "data").write() + inputDf + .select("id", "data") + .write() .format("iceberg") .mode("append") .save(loadLocation(tableIdentifier)); @@ -785,40 +870,47 @@ public void testSnapshotsTable() { // rollback the table state to the first snapshot table.rollback().toSnapshotId(firstSnapshotId).commit(); - List actual = spark.read() - .format("iceberg") - .load(loadLocation(tableIdentifier, "snapshots")) - .collectAsList(); - - GenericRecordBuilder builder = new GenericRecordBuilder(AvroSchemaUtil.convert(snapTable.schema(), "snapshots")); - List expected = Lists.newArrayList( - builder.set("committed_at", firstSnapshotTimestamp * 1000) - .set("snapshot_id", firstSnapshotId) - .set("parent_id", null) - .set("operation", "append") - .set("manifest_list", firstManifestList) - .set("summary", ImmutableMap.of( - "added-records", "1", - "added-data-files", "1", - "changed-partition-count", "1", - "total-data-files", "1", - "total-records", "1" - )) - .build(), - builder.set("committed_at", secondSnapshotTimestamp * 1000) - .set("snapshot_id", secondSnapshotId) - .set("parent_id", firstSnapshotId) - 
.set("operation", "delete") - .set("manifest_list", secondManifestList) - .set("summary", ImmutableMap.of( - "deleted-records", "1", - "deleted-data-files", "1", - "changed-partition-count", "1", - "total-records", "0", - "total-data-files", "0" - )) - .build() - ); + List actual = + spark + .read() + .format("iceberg") + .load(loadLocation(tableIdentifier, "snapshots")) + .collectAsList(); + + GenericRecordBuilder builder = + new GenericRecordBuilder(AvroSchemaUtil.convert(snapTable.schema(), "snapshots")); + List expected = + Lists.newArrayList( + builder + .set("committed_at", firstSnapshotTimestamp * 1000) + .set("snapshot_id", firstSnapshotId) + .set("parent_id", null) + .set("operation", "append") + .set("manifest_list", firstManifestList) + .set( + "summary", + ImmutableMap.of( + "added-records", "1", + "added-data-files", "1", + "changed-partition-count", "1", + "total-data-files", "1", + "total-records", "1")) + .build(), + builder + .set("committed_at", secondSnapshotTimestamp * 1000) + .set("snapshot_id", secondSnapshotId) + .set("parent_id", firstSnapshotId) + .set("operation", "delete") + .set("manifest_list", secondManifestList) + .set( + "summary", + ImmutableMap.of( + "deleted-records", "1", + "deleted-data-files", "1", + "changed-partition-count", "1", + "total-records", "0", + "total-data-files", "0")) + .build()); Assert.assertEquals("Snapshots table should have a row for each snapshot", 2, actual.size()); TestHelpers.assertEqualsSafe(snapTable.schema().asStruct(), expected.get(0), actual.get(0)); @@ -833,7 +925,9 @@ public void testPrunedSnapshotsTable() { List records = Lists.newArrayList(new SimpleRecord(1, "1")); Dataset inputDf = spark.createDataFrame(records, SimpleRecord.class); - inputDf.select("id", "data").write() + inputDf + .select("id", "data") + .write() .format("iceberg") .mode("append") .save(loadLocation(tableIdentifier)); @@ -849,40 +943,47 @@ public void testPrunedSnapshotsTable() { // rollback the table state to the first snapshot table.rollback().toSnapshotId(firstSnapshotId).commit(); - Dataset actualDf = spark.read() - .format("iceberg") - .load(loadLocation(tableIdentifier, "snapshots")) - .select("operation", "committed_at", "summary", "parent_id"); + Dataset actualDf = + spark + .read() + .format("iceberg") + .load(loadLocation(tableIdentifier, "snapshots")) + .select("operation", "committed_at", "summary", "parent_id"); Schema projectedSchema = SparkSchemaUtil.convert(actualDf.schema()); List actual = actualDf.collectAsList(); - GenericRecordBuilder builder = new GenericRecordBuilder(AvroSchemaUtil.convert(projectedSchema, "snapshots")); - List expected = Lists.newArrayList( - builder.set("committed_at", firstSnapshotTimestamp * 1000) - .set("parent_id", null) - .set("operation", "append") - .set("summary", ImmutableMap.of( - "added-records", "1", - "added-data-files", "1", - "changed-partition-count", "1", - "total-data-files", "1", - "total-records", "1" - )) - .build(), - builder.set("committed_at", secondSnapshotTimestamp * 1000) - .set("parent_id", firstSnapshotId) - .set("operation", "delete") - .set("summary", ImmutableMap.of( - "deleted-records", "1", - "deleted-data-files", "1", - "changed-partition-count", "1", - "total-records", "0", - "total-data-files", "0" - )) - .build() - ); + GenericRecordBuilder builder = + new GenericRecordBuilder(AvroSchemaUtil.convert(projectedSchema, "snapshots")); + List expected = + Lists.newArrayList( + builder + .set("committed_at", firstSnapshotTimestamp * 1000) + .set("parent_id", null) + 
.set("operation", "append") + .set( + "summary", + ImmutableMap.of( + "added-records", "1", + "added-data-files", "1", + "changed-partition-count", "1", + "total-data-files", "1", + "total-records", "1")) + .build(), + builder + .set("committed_at", secondSnapshotTimestamp * 1000) + .set("parent_id", firstSnapshotId) + .set("operation", "delete") + .set( + "summary", + ImmutableMap.of( + "deleted-records", "1", + "deleted-data-files", "1", + "changed-partition-count", "1", + "total-records", "0", + "total-data-files", "0")) + .build()); Assert.assertEquals("Snapshots table should have a row for each snapshot", 2, actual.size()); TestHelpers.assertEqualsSafe(projectedSchema.asStruct(), expected.get(0), actual.get(0)); @@ -894,48 +995,73 @@ public void testManifestsTable() { TableIdentifier tableIdentifier = TableIdentifier.of("db", "manifests_test"); Table table = createTable(tableIdentifier, SCHEMA, SPEC); Table manifestTable = loadTable(tableIdentifier, "manifests"); - Dataset df1 = spark.createDataFrame( - Lists.newArrayList(new SimpleRecord(1, "a"), new SimpleRecord(null, "b")), SimpleRecord.class); + Dataset df1 = + spark.createDataFrame( + Lists.newArrayList(new SimpleRecord(1, "a"), new SimpleRecord(null, "b")), + SimpleRecord.class); - df1.select("id", "data").write() + df1.select("id", "data") + .write() .format("iceberg") .mode("append") .save(loadLocation(tableIdentifier)); - List actual = spark.read() - .format("iceberg") - .load(loadLocation(tableIdentifier, "manifests")) - .collectAsList(); + List actual = + spark + .read() + .format("iceberg") + .load(loadLocation(tableIdentifier, "manifests")) + .collectAsList(); table.refresh(); - GenericRecordBuilder builder = new GenericRecordBuilder(AvroSchemaUtil.convert( - manifestTable.schema(), "manifests")); - GenericRecordBuilder summaryBuilder = new GenericRecordBuilder(AvroSchemaUtil.convert( - manifestTable.schema().findType("partition_summaries.element").asStructType(), "partition_summary")); - List expected = Lists.transform(table.currentSnapshot().allManifests(table.io()), manifest -> - builder - .set("content", manifest.content().id()) - .set("path", manifest.path()) - .set("length", manifest.length()) - .set("partition_spec_id", manifest.partitionSpecId()) - .set("added_snapshot_id", manifest.snapshotId()) - .set("added_data_files_count", manifest.content() == DATA ? manifest.addedFilesCount() : 0) - .set("existing_data_files_count", manifest.content() == DATA ? manifest.existingFilesCount() : 0) - .set("deleted_data_files_count", manifest.content() == DATA ? manifest.deletedFilesCount() : 0) - .set("added_delete_files_count", manifest.content() == DELETES ? manifest.addedFilesCount() : 0) - .set("existing_delete_files_count", manifest.content() == DELETES ? manifest.existingFilesCount() : 0) - .set("deleted_delete_files_count", manifest.content() == DELETES ? 
manifest.deletedFilesCount() : 0) - .set("partition_summaries", Lists.transform(manifest.partitions(), partition -> - summaryBuilder - .set("contains_null", true) - .set("contains_nan", false) - .set("lower_bound", "1") - .set("upper_bound", "1") - .build() - )) - .build() - ); + GenericRecordBuilder builder = + new GenericRecordBuilder(AvroSchemaUtil.convert(manifestTable.schema(), "manifests")); + GenericRecordBuilder summaryBuilder = + new GenericRecordBuilder( + AvroSchemaUtil.convert( + manifestTable.schema().findType("partition_summaries.element").asStructType(), + "partition_summary")); + List expected = + Lists.transform( + table.currentSnapshot().allManifests(table.io()), + manifest -> + builder + .set("content", manifest.content().id()) + .set("path", manifest.path()) + .set("length", manifest.length()) + .set("partition_spec_id", manifest.partitionSpecId()) + .set("added_snapshot_id", manifest.snapshotId()) + .set( + "added_data_files_count", + manifest.content() == DATA ? manifest.addedFilesCount() : 0) + .set( + "existing_data_files_count", + manifest.content() == DATA ? manifest.existingFilesCount() : 0) + .set( + "deleted_data_files_count", + manifest.content() == DATA ? manifest.deletedFilesCount() : 0) + .set( + "added_delete_files_count", + manifest.content() == DELETES ? manifest.addedFilesCount() : 0) + .set( + "existing_delete_files_count", + manifest.content() == DELETES ? manifest.existingFilesCount() : 0) + .set( + "deleted_delete_files_count", + manifest.content() == DELETES ? manifest.deletedFilesCount() : 0) + .set( + "partition_summaries", + Lists.transform( + manifest.partitions(), + partition -> + summaryBuilder + .set("contains_null", true) + .set("contains_nan", false) + .set("lower_bound", "1") + .set("upper_bound", "1") + .build())) + .build()); Assert.assertEquals("Manifests table should have one manifest row", 1, actual.size()); TestHelpers.assertEqualsSafe(manifestTable.schema().asStruct(), expected.get(0), actual.get(0)); @@ -946,56 +1072,77 @@ public void testPruneManifestsTable() { TableIdentifier tableIdentifier = TableIdentifier.of("db", "manifests_test"); Table table = createTable(tableIdentifier, SCHEMA, SPEC); Table manifestTable = loadTable(tableIdentifier, "manifests"); - Dataset df1 = spark.createDataFrame( - Lists.newArrayList(new SimpleRecord(1, "a"), new SimpleRecord(null, "b")), SimpleRecord.class); + Dataset df1 = + spark.createDataFrame( + Lists.newArrayList(new SimpleRecord(1, "a"), new SimpleRecord(null, "b")), + SimpleRecord.class); - df1.select("id", "data").write() + df1.select("id", "data") + .write() .format("iceberg") .mode("append") .save(loadLocation(tableIdentifier)); if (!spark.version().startsWith("2")) { // Spark 2 isn't able to actually push down nested struct projections so this will not break - AssertHelpers.assertThrows("Can't prune struct inside list", SparkException.class, + AssertHelpers.assertThrows( + "Can't prune struct inside list", + SparkException.class, "Cannot project a partial list element struct", - () -> spark.read() - .format("iceberg") - .load(loadLocation(tableIdentifier, "manifests")) - .select("partition_spec_id", "path", "partition_summaries.contains_null") - .collectAsList()); + () -> + spark + .read() + .format("iceberg") + .load(loadLocation(tableIdentifier, "manifests")) + .select("partition_spec_id", "path", "partition_summaries.contains_null") + .collectAsList()); } - Dataset actualDf = spark.read() - .format("iceberg") - .load(loadLocation(tableIdentifier, "manifests")) - 
.select("partition_spec_id", "path", "partition_summaries"); + Dataset actualDf = + spark + .read() + .format("iceberg") + .load(loadLocation(tableIdentifier, "manifests")) + .select("partition_spec_id", "path", "partition_summaries"); Schema projectedSchema = SparkSchemaUtil.convert(actualDf.schema()); - List actual = spark.read() - .format("iceberg") - .load(loadLocation(tableIdentifier, "manifests")) - .select("partition_spec_id", "path", "partition_summaries") - .collectAsList(); + List actual = + spark + .read() + .format("iceberg") + .load(loadLocation(tableIdentifier, "manifests")) + .select("partition_spec_id", "path", "partition_summaries") + .collectAsList(); table.refresh(); - GenericRecordBuilder builder = new GenericRecordBuilder(AvroSchemaUtil.convert(projectedSchema.asStruct())); - GenericRecordBuilder summaryBuilder = new GenericRecordBuilder(AvroSchemaUtil.convert( - projectedSchema.findType("partition_summaries.element").asStructType(), "partition_summary")); - List expected = Lists.transform(table.currentSnapshot().allManifests(table.io()), manifest -> - builder.set("partition_spec_id", manifest.partitionSpecId()) - .set("path", manifest.path()) - .set("partition_summaries", Lists.transform(manifest.partitions(), partition -> - summaryBuilder - .set("contains_null", true) - .set("contains_nan", false) - .set("lower_bound", "1") - .set("upper_bound", "1") - .build() - )) - .build() - ); + GenericRecordBuilder builder = + new GenericRecordBuilder(AvroSchemaUtil.convert(projectedSchema.asStruct())); + GenericRecordBuilder summaryBuilder = + new GenericRecordBuilder( + AvroSchemaUtil.convert( + projectedSchema.findType("partition_summaries.element").asStructType(), + "partition_summary")); + List expected = + Lists.transform( + table.currentSnapshot().allManifests(table.io()), + manifest -> + builder + .set("partition_spec_id", manifest.partitionSpecId()) + .set("path", manifest.path()) + .set( + "partition_summaries", + Lists.transform( + manifest.partitions(), + partition -> + summaryBuilder + .set("contains_null", true) + .set("contains_nan", false) + .set("lower_bound", "1") + .set("upper_bound", "1") + .build())) + .build()); Assert.assertEquals("Manifests table should have one manifest row", 1, actual.size()); TestHelpers.assertEqualsSafe(projectedSchema.asStruct(), expected.get(0), actual.get(0)); @@ -1006,53 +1153,62 @@ public void testAllManifestsTable() { TableIdentifier tableIdentifier = TableIdentifier.of("db", "manifests_test"); Table table = createTable(tableIdentifier, SCHEMA, SPEC); Table manifestTable = loadTable(tableIdentifier, "all_manifests"); - Dataset df1 = spark.createDataFrame(Lists.newArrayList(new SimpleRecord(1, "a")), SimpleRecord.class); + Dataset df1 = + spark.createDataFrame(Lists.newArrayList(new SimpleRecord(1, "a")), SimpleRecord.class); - df1.select("id", "data").write() + df1.select("id", "data") + .write() .format("iceberg") .mode("append") .save(loadLocation(tableIdentifier)); - table.updateProperties() - .set(TableProperties.FORMAT_VERSION, "2") - .commit(); + table.updateProperties().set(TableProperties.FORMAT_VERSION, "2").commit(); - DataFile dataFile = Iterables.getFirst(table.currentSnapshot().addedDataFiles(table.io()), null); + DataFile dataFile = + Iterables.getFirst(table.currentSnapshot().addedDataFiles(table.io()), null); PartitionSpec dataFileSpec = table.specs().get(dataFile.specId()); StructLike dataFilePartition = dataFile.partition(); PositionDelete delete = PositionDelete.create(); delete.set(dataFile.path(), 0L, 
null); - DeleteFile deleteFile = writePositionDeletes(table, dataFileSpec, dataFilePartition, ImmutableList.of(delete)); + DeleteFile deleteFile = + writePositionDeletes(table, dataFileSpec, dataFilePartition, ImmutableList.of(delete)); - table.newRowDelta() - .addDeletes(deleteFile) - .commit(); + table.newRowDelta().addDeletes(deleteFile).commit(); table.newDelete().deleteFromRowFilter(Expressions.alwaysTrue()).commit(); Stream> snapshotIdToManifests = StreamSupport.stream(table.snapshots().spliterator(), false) - .flatMap(snapshot -> snapshot.allManifests(table.io()).stream().map( - manifest -> Pair.of(snapshot.snapshotId(), manifest))); - - List actual = spark.read() - .format("iceberg") - .load(loadLocation(tableIdentifier, "all_manifests")) - .orderBy("path") - .collectAsList(); + .flatMap( + snapshot -> + snapshot.allManifests(table.io()).stream() + .map(manifest -> Pair.of(snapshot.snapshotId(), manifest))); + + List actual = + spark + .read() + .format("iceberg") + .load(loadLocation(tableIdentifier, "all_manifests")) + .orderBy("path") + .collectAsList(); table.refresh(); - List expected = snapshotIdToManifests - .map(snapshotManifest -> manifestRecord(manifestTable, snapshotManifest.first(), snapshotManifest.second())) - .collect(Collectors.toList()); + List expected = + snapshotIdToManifests + .map( + snapshotManifest -> + manifestRecord( + manifestTable, snapshotManifest.first(), snapshotManifest.second())) + .collect(Collectors.toList()); expected.sort(Comparator.comparing(o -> o.get("path").toString())); Assert.assertEquals("Manifests table should have 5 manifest rows", 5, actual.size()); for (int i = 0; i < expected.size(); i += 1) { - TestHelpers.assertEqualsSafe(manifestTable.schema().asStruct(), expected.get(i), actual.get(i)); + TestHelpers.assertEqualsSafe( + manifestTable.schema().asStruct(), expected.get(i), actual.get(i)); } } @@ -1061,33 +1217,37 @@ public void testUnpartitionedPartitionsTable() { TableIdentifier tableIdentifier = TableIdentifier.of("db", "unpartitioned_partitions_test"); createTable(tableIdentifier, SCHEMA, PartitionSpec.unpartitioned()); - Dataset df = spark.createDataFrame(Lists.newArrayList(new SimpleRecord(1, "a")), SimpleRecord.class); + Dataset df = + spark.createDataFrame(Lists.newArrayList(new SimpleRecord(1, "a")), SimpleRecord.class); - df.select("id", "data").write() + df.select("id", "data") + .write() .format("iceberg") .mode("append") .save(loadLocation(tableIdentifier)); - Types.StructType expectedSchema = Types.StructType.of( - required(2, "record_count", Types.LongType.get()), - required(3, "file_count", Types.IntegerType.get())); + Types.StructType expectedSchema = + Types.StructType.of( + required(2, "record_count", Types.LongType.get()), + required(3, "file_count", Types.IntegerType.get())); Table partitionsTable = loadTable(tableIdentifier, "partitions"); - Assert.assertEquals("Schema should not have partition field", - expectedSchema, partitionsTable.schema().asStruct()); + Assert.assertEquals( + "Schema should not have partition field", + expectedSchema, + partitionsTable.schema().asStruct()); - GenericRecordBuilder builder = new GenericRecordBuilder(AvroSchemaUtil.convert( - partitionsTable.schema(), "partitions")); - GenericData.Record expectedRow = builder - .set("record_count", 1L) - .set("file_count", 1) - .build(); + GenericRecordBuilder builder = + new GenericRecordBuilder(AvroSchemaUtil.convert(partitionsTable.schema(), "partitions")); + GenericData.Record expectedRow = builder.set("record_count", 
1L).set("file_count", 1).build(); - List actual = spark.read() - .format("iceberg") - .load(loadLocation(tableIdentifier, "partitions")) - .collectAsList(); + List actual = + spark + .read() + .format("iceberg") + .load(loadLocation(tableIdentifier, "partitions")) + .collectAsList(); Assert.assertEquals("Unpartitioned partitions table should have one row", 1, actual.size()); TestHelpers.assertEqualsSafe(expectedSchema, expectedRow, actual.get(0)); @@ -1098,10 +1258,13 @@ public void testPartitionsTable() { TableIdentifier tableIdentifier = TableIdentifier.of("db", "partitions_test"); Table table = createTable(tableIdentifier, SCHEMA, SPEC); Table partitionsTable = loadTable(tableIdentifier, "partitions"); - Dataset df1 = spark.createDataFrame(Lists.newArrayList(new SimpleRecord(1, "a")), SimpleRecord.class); - Dataset df2 = spark.createDataFrame(Lists.newArrayList(new SimpleRecord(2, "b")), SimpleRecord.class); + Dataset df1 = + spark.createDataFrame(Lists.newArrayList(new SimpleRecord(1, "a")), SimpleRecord.class); + Dataset df2 = + spark.createDataFrame(Lists.newArrayList(new SimpleRecord(2, "b")), SimpleRecord.class); - df1.select("id", "data").write() + df1.select("id", "data") + .write() .format("iceberg") .mode("append") .save(loadLocation(tableIdentifier)); @@ -1110,69 +1273,86 @@ public void testPartitionsTable() { long firstCommitId = table.currentSnapshot().snapshotId(); // add a second file - df2.select("id", "data").write() + df2.select("id", "data") + .write() .format("iceberg") .mode("append") .save(loadLocation(tableIdentifier)); - List actual = spark.read() - .format("iceberg") - .load(loadLocation(tableIdentifier, "partitions")) - .orderBy("partition.id") - .collectAsList(); - - GenericRecordBuilder builder = new GenericRecordBuilder(AvroSchemaUtil.convert( - partitionsTable.schema(), "partitions")); - GenericRecordBuilder partitionBuilder = new GenericRecordBuilder(AvroSchemaUtil.convert( - partitionsTable.schema().findType("partition").asStructType(), "partition")); + List actual = + spark + .read() + .format("iceberg") + .load(loadLocation(tableIdentifier, "partitions")) + .orderBy("partition.id") + .collectAsList(); + + GenericRecordBuilder builder = + new GenericRecordBuilder(AvroSchemaUtil.convert(partitionsTable.schema(), "partitions")); + GenericRecordBuilder partitionBuilder = + new GenericRecordBuilder( + AvroSchemaUtil.convert( + partitionsTable.schema().findType("partition").asStructType(), "partition")); List expected = Lists.newArrayList(); - expected.add(builder - .set("partition", partitionBuilder.set("id", 1).build()) - .set("record_count", 1L) - .set("file_count", 1) - .set("spec_id", 0) - .build()); - expected.add(builder - .set("partition", partitionBuilder.set("id", 2).build()) - .set("record_count", 1L) - .set("file_count", 1) - .set("spec_id", 0) - .build()); + expected.add( + builder + .set("partition", partitionBuilder.set("id", 1).build()) + .set("record_count", 1L) + .set("file_count", 1) + .set("spec_id", 0) + .build()); + expected.add( + builder + .set("partition", partitionBuilder.set("id", 2).build()) + .set("record_count", 1L) + .set("file_count", 1) + .set("spec_id", 0) + .build()); Assert.assertEquals("Partitions table should have two rows", 2, expected.size()); Assert.assertEquals("Actual results should have two rows", 2, actual.size()); for (int i = 0; i < 2; i += 1) { - TestHelpers.assertEqualsSafe(partitionsTable.schema().asStruct(), expected.get(i), actual.get(i)); + TestHelpers.assertEqualsSafe( + partitionsTable.schema().asStruct(), 
expected.get(i), actual.get(i)); } // check time travel - List actualAfterFirstCommit = spark.read() - .format("iceberg") - .option(SparkReadOptions.SNAPSHOT_ID, String.valueOf(firstCommitId)) - .load(loadLocation(tableIdentifier, "partitions")) - .orderBy("partition.id") - .collectAsList(); + List actualAfterFirstCommit = + spark + .read() + .format("iceberg") + .option(SparkReadOptions.SNAPSHOT_ID, String.valueOf(firstCommitId)) + .load(loadLocation(tableIdentifier, "partitions")) + .orderBy("partition.id") + .collectAsList(); Assert.assertEquals("Actual results should have one row", 1, actualAfterFirstCommit.size()); - TestHelpers.assertEqualsSafe(partitionsTable.schema().asStruct(), expected.get(0), actualAfterFirstCommit.get(0)); + TestHelpers.assertEqualsSafe( + partitionsTable.schema().asStruct(), expected.get(0), actualAfterFirstCommit.get(0)); // check predicate push down - List filtered = spark.read() - .format("iceberg") - .load(loadLocation(tableIdentifier, "partitions")) - .filter("partition.id < 2") - .collectAsList(); + List filtered = + spark + .read() + .format("iceberg") + .load(loadLocation(tableIdentifier, "partitions")) + .filter("partition.id < 2") + .collectAsList(); Assert.assertEquals("Actual results should have one row", 1, filtered.size()); - TestHelpers.assertEqualsSafe(partitionsTable.schema().asStruct(), expected.get(0), filtered.get(0)); - - List nonFiltered = spark.read() - .format("iceberg") - .load(loadLocation(tableIdentifier, "partitions")) - .filter("partition.id < 2 or record_count=1") - .collectAsList(); + TestHelpers.assertEqualsSafe( + partitionsTable.schema().asStruct(), expected.get(0), filtered.get(0)); + + List nonFiltered = + spark + .read() + .format("iceberg") + .load(loadLocation(tableIdentifier, "partitions")) + .filter("partition.id < 2 or record_count=1") + .collectAsList(); Assert.assertEquals("Actual results should have one row", 2, nonFiltered.size()); for (int i = 0; i < 2; i += 1) { - TestHelpers.assertEqualsSafe(partitionsTable.schema().asStruct(), expected.get(i), actual.get(i)); + TestHelpers.assertEqualsSafe( + partitionsTable.schema().asStruct(), expected.get(i), actual.get(i)); } } @@ -1181,62 +1361,63 @@ public synchronized void testSnapshotReadAfterAddColumn() { TableIdentifier tableIdentifier = TableIdentifier.of("db", "table"); Table table = createTable(tableIdentifier, SCHEMA, PartitionSpec.unpartitioned()); - List originalRecords = Lists.newArrayList( - RowFactory.create(1, "x"), - RowFactory.create(2, "y"), - RowFactory.create(3, "z")); + List originalRecords = + Lists.newArrayList( + RowFactory.create(1, "x"), RowFactory.create(2, "y"), RowFactory.create(3, "z")); StructType originalSparkSchema = SparkSchemaUtil.convert(SCHEMA); Dataset inputDf = spark.createDataFrame(originalRecords, originalSparkSchema); - inputDf.select("id", "data").write() + inputDf + .select("id", "data") + .write() .format("iceberg") .mode(SaveMode.Append) .save(loadLocation(tableIdentifier)); table.refresh(); - Dataset resultDf = spark.read() - .format("iceberg") - .load(loadLocation(tableIdentifier)); - Assert.assertEquals("Records should match", originalRecords, - resultDf.orderBy("id").collectAsList()); + Dataset resultDf = spark.read().format("iceberg").load(loadLocation(tableIdentifier)); + Assert.assertEquals( + "Records should match", originalRecords, resultDf.orderBy("id").collectAsList()); Snapshot snapshotBeforeAddColumn = table.currentSnapshot(); table.updateSchema().addColumn("category", Types.StringType.get()).commit(); - List 
newRecords = Lists.newArrayList( - RowFactory.create(4, "xy", "B"), - RowFactory.create(5, "xyz", "C")); + List newRecords = + Lists.newArrayList(RowFactory.create(4, "xy", "B"), RowFactory.create(5, "xyz", "C")); StructType newSparkSchema = SparkSchemaUtil.convert(SCHEMA2); Dataset inputDf2 = spark.createDataFrame(newRecords, newSparkSchema); - inputDf2.select("id", "data", "category").write() + inputDf2 + .select("id", "data", "category") + .write() .format("iceberg") .mode(SaveMode.Append) .save(loadLocation(tableIdentifier)); table.refresh(); - List updatedRecords = Lists.newArrayList( - RowFactory.create(1, "x", null), - RowFactory.create(2, "y", null), - RowFactory.create(3, "z", null), - RowFactory.create(4, "xy", "B"), - RowFactory.create(5, "xyz", "C")); - - Dataset resultDf2 = spark.read() - .format("iceberg") - .load(loadLocation(tableIdentifier)); - Assert.assertEquals("Records should match", updatedRecords, - resultDf2.orderBy("id").collectAsList()); - - Dataset resultDf3 = spark.read() - .format("iceberg") - .option(SparkReadOptions.SNAPSHOT_ID, snapshotBeforeAddColumn.snapshotId()) - .load(loadLocation(tableIdentifier)); - Assert.assertEquals("Records should match", originalRecords, - resultDf3.orderBy("id").collectAsList()); + List updatedRecords = + Lists.newArrayList( + RowFactory.create(1, "x", null), + RowFactory.create(2, "y", null), + RowFactory.create(3, "z", null), + RowFactory.create(4, "xy", "B"), + RowFactory.create(5, "xyz", "C")); + + Dataset resultDf2 = spark.read().format("iceberg").load(loadLocation(tableIdentifier)); + Assert.assertEquals( + "Records should match", updatedRecords, resultDf2.orderBy("id").collectAsList()); + + Dataset resultDf3 = + spark + .read() + .format("iceberg") + .option(SparkReadOptions.SNAPSHOT_ID, snapshotBeforeAddColumn.snapshotId()) + .load(loadLocation(tableIdentifier)); + Assert.assertEquals( + "Records should match", originalRecords, resultDf3.orderBy("id").collectAsList()); Assert.assertEquals("Schemas should match", originalSparkSchema, resultDf3.schema()); } @@ -1245,72 +1426,76 @@ public synchronized void testSnapshotReadAfterDropColumn() { TableIdentifier tableIdentifier = TableIdentifier.of("db", "table"); Table table = createTable(tableIdentifier, SCHEMA2, PartitionSpec.unpartitioned()); - List originalRecords = Lists.newArrayList( - RowFactory.create(1, "x", "A"), - RowFactory.create(2, "y", "A"), - RowFactory.create(3, "z", "B")); + List originalRecords = + Lists.newArrayList( + RowFactory.create(1, "x", "A"), + RowFactory.create(2, "y", "A"), + RowFactory.create(3, "z", "B")); StructType originalSparkSchema = SparkSchemaUtil.convert(SCHEMA2); Dataset inputDf = spark.createDataFrame(originalRecords, originalSparkSchema); - inputDf.select("id", "data", "category").write() + inputDf + .select("id", "data", "category") + .write() .format("iceberg") .mode(SaveMode.Append) .save(loadLocation(tableIdentifier)); table.refresh(); - Dataset resultDf = spark.read() - .format("iceberg") - .load(loadLocation(tableIdentifier)); - Assert.assertEquals("Records should match", originalRecords, - resultDf.orderBy("id").collectAsList()); + Dataset resultDf = spark.read().format("iceberg").load(loadLocation(tableIdentifier)); + Assert.assertEquals( + "Records should match", originalRecords, resultDf.orderBy("id").collectAsList()); long tsBeforeDropColumn = waitUntilAfter(System.currentTimeMillis()); table.updateSchema().deleteColumn("data").commit(); long tsAfterDropColumn = waitUntilAfter(System.currentTimeMillis()); - List newRecords = 
Lists.newArrayList( - RowFactory.create(4, "B"), - RowFactory.create(5, "C")); + List newRecords = Lists.newArrayList(RowFactory.create(4, "B"), RowFactory.create(5, "C")); StructType newSparkSchema = SparkSchemaUtil.convert(SCHEMA3); Dataset inputDf2 = spark.createDataFrame(newRecords, newSparkSchema); - inputDf2.select("id", "category").write() + inputDf2 + .select("id", "category") + .write() .format("iceberg") .mode(SaveMode.Append) .save(loadLocation(tableIdentifier)); table.refresh(); - List updatedRecords = Lists.newArrayList( - RowFactory.create(1, "A"), - RowFactory.create(2, "A"), - RowFactory.create(3, "B"), - RowFactory.create(4, "B"), - RowFactory.create(5, "C")); - - Dataset resultDf2 = spark.read() - .format("iceberg") - .load(loadLocation(tableIdentifier)); - Assert.assertEquals("Records should match", updatedRecords, - resultDf2.orderBy("id").collectAsList()); - - Dataset resultDf3 = spark.read() - .format("iceberg") - .option(SparkReadOptions.AS_OF_TIMESTAMP, tsBeforeDropColumn) - .load(loadLocation(tableIdentifier)); - Assert.assertEquals("Records should match", originalRecords, - resultDf3.orderBy("id").collectAsList()); + List updatedRecords = + Lists.newArrayList( + RowFactory.create(1, "A"), + RowFactory.create(2, "A"), + RowFactory.create(3, "B"), + RowFactory.create(4, "B"), + RowFactory.create(5, "C")); + + Dataset resultDf2 = spark.read().format("iceberg").load(loadLocation(tableIdentifier)); + Assert.assertEquals( + "Records should match", updatedRecords, resultDf2.orderBy("id").collectAsList()); + + Dataset resultDf3 = + spark + .read() + .format("iceberg") + .option(SparkReadOptions.AS_OF_TIMESTAMP, tsBeforeDropColumn) + .load(loadLocation(tableIdentifier)); + Assert.assertEquals( + "Records should match", originalRecords, resultDf3.orderBy("id").collectAsList()); Assert.assertEquals("Schemas should match", originalSparkSchema, resultDf3.schema()); // At tsAfterDropColumn, there has been a schema change, but no new snapshot, // so the snapshot as of tsAfterDropColumn is the same as that as of tsBeforeDropColumn. 
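    // Editorial sketch (not part of the original diff): the read below exercises Iceberg's
    // time-travel options for Spark reads. A minimal standalone example, assuming a
    // SparkSession `spark`, a table path `tableLocation`, and previously captured
    // `timestampMillis` and `snapshotId` values (all hypothetical names), might look like:
    //
    //   Dataset<Row> asOfTimestamp =
    //       spark
    //           .read()
    //           .format("iceberg")
    //           // "as-of-timestamp" resolves to the snapshot current at this time (millis)
    //           .option(SparkReadOptions.AS_OF_TIMESTAMP, timestampMillis)
    //           .load(tableLocation);
    //
    //   Dataset<Row> bySnapshotId =
    //       spark
    //           .read()
    //           .format("iceberg")
    //           // "snapshot-id" pins the read to one specific snapshot
    //           .option(SparkReadOptions.SNAPSHOT_ID, snapshotId)
    //           .load(tableLocation);
    //
    // As the assertions in this test show, such reads return both the data and the table
    // schema as of the selected snapshot, not the current schema.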
- Dataset resultDf4 = spark.read() - .format("iceberg") - .option(SparkReadOptions.AS_OF_TIMESTAMP, tsAfterDropColumn) - .load(loadLocation(tableIdentifier)); - Assert.assertEquals("Records should match", originalRecords, - resultDf4.orderBy("id").collectAsList()); + Dataset resultDf4 = + spark + .read() + .format("iceberg") + .option(SparkReadOptions.AS_OF_TIMESTAMP, tsAfterDropColumn) + .load(loadLocation(tableIdentifier)); + Assert.assertEquals( + "Records should match", originalRecords, resultDf4.orderBy("id").collectAsList()); Assert.assertEquals("Schemas should match", originalSparkSchema, resultDf4.schema()); } @@ -1319,77 +1504,77 @@ public synchronized void testSnapshotReadAfterAddAndDropColumn() { TableIdentifier tableIdentifier = TableIdentifier.of("db", "table"); Table table = createTable(tableIdentifier, SCHEMA, PartitionSpec.unpartitioned()); - List originalRecords = Lists.newArrayList( - RowFactory.create(1, "x"), - RowFactory.create(2, "y"), - RowFactory.create(3, "z")); + List originalRecords = + Lists.newArrayList( + RowFactory.create(1, "x"), RowFactory.create(2, "y"), RowFactory.create(3, "z")); StructType originalSparkSchema = SparkSchemaUtil.convert(SCHEMA); Dataset inputDf = spark.createDataFrame(originalRecords, originalSparkSchema); - inputDf.select("id", "data").write() + inputDf + .select("id", "data") + .write() .format("iceberg") .mode(SaveMode.Append) .save(loadLocation(tableIdentifier)); table.refresh(); - Dataset resultDf = spark.read() - .format("iceberg") - .load(loadLocation(tableIdentifier)); - Assert.assertEquals("Records should match", originalRecords, - resultDf.orderBy("id").collectAsList()); + Dataset resultDf = spark.read().format("iceberg").load(loadLocation(tableIdentifier)); + Assert.assertEquals( + "Records should match", originalRecords, resultDf.orderBy("id").collectAsList()); Snapshot snapshotBeforeAddColumn = table.currentSnapshot(); table.updateSchema().addColumn("category", Types.StringType.get()).commit(); - List newRecords = Lists.newArrayList( - RowFactory.create(4, "xy", "B"), - RowFactory.create(5, "xyz", "C")); + List newRecords = + Lists.newArrayList(RowFactory.create(4, "xy", "B"), RowFactory.create(5, "xyz", "C")); StructType sparkSchemaAfterAddColumn = SparkSchemaUtil.convert(SCHEMA2); Dataset inputDf2 = spark.createDataFrame(newRecords, sparkSchemaAfterAddColumn); - inputDf2.select("id", "data", "category").write() + inputDf2 + .select("id", "data", "category") + .write() .format("iceberg") .mode(SaveMode.Append) .save(loadLocation(tableIdentifier)); table.refresh(); - List updatedRecords = Lists.newArrayList( - RowFactory.create(1, "x", null), - RowFactory.create(2, "y", null), - RowFactory.create(3, "z", null), - RowFactory.create(4, "xy", "B"), - RowFactory.create(5, "xyz", "C")); + List updatedRecords = + Lists.newArrayList( + RowFactory.create(1, "x", null), + RowFactory.create(2, "y", null), + RowFactory.create(3, "z", null), + RowFactory.create(4, "xy", "B"), + RowFactory.create(5, "xyz", "C")); - Dataset resultDf2 = spark.read() - .format("iceberg") - .load(loadLocation(tableIdentifier)); - Assert.assertEquals("Records should match", updatedRecords, - resultDf2.orderBy("id").collectAsList()); + Dataset resultDf2 = spark.read().format("iceberg").load(loadLocation(tableIdentifier)); + Assert.assertEquals( + "Records should match", updatedRecords, resultDf2.orderBy("id").collectAsList()); table.updateSchema().deleteColumn("data").commit(); - List recordsAfterDropColumn = Lists.newArrayList( - RowFactory.create(1, null), - 
RowFactory.create(2, null), - RowFactory.create(3, null), - RowFactory.create(4, "B"), - RowFactory.create(5, "C")); - - Dataset resultDf3 = spark.read() - .format("iceberg") - .load(loadLocation(tableIdentifier)); - Assert.assertEquals("Records should match", recordsAfterDropColumn, - resultDf3.orderBy("id").collectAsList()); - - Dataset resultDf4 = spark.read() - .format("iceberg") - .option(SparkReadOptions.SNAPSHOT_ID, snapshotBeforeAddColumn.snapshotId()) - .load(loadLocation(tableIdentifier)); - Assert.assertEquals("Records should match", originalRecords, - resultDf4.orderBy("id").collectAsList()); + List recordsAfterDropColumn = + Lists.newArrayList( + RowFactory.create(1, null), + RowFactory.create(2, null), + RowFactory.create(3, null), + RowFactory.create(4, "B"), + RowFactory.create(5, "C")); + + Dataset resultDf3 = spark.read().format("iceberg").load(loadLocation(tableIdentifier)); + Assert.assertEquals( + "Records should match", recordsAfterDropColumn, resultDf3.orderBy("id").collectAsList()); + + Dataset resultDf4 = + spark + .read() + .format("iceberg") + .option(SparkReadOptions.SNAPSHOT_ID, snapshotBeforeAddColumn.snapshotId()) + .load(loadLocation(tableIdentifier)); + Assert.assertEquals( + "Records should match", originalRecords, resultDf4.orderBy("id").collectAsList()); Assert.assertEquals("Schemas should match", originalSparkSchema, resultDf4.schema()); } @@ -1398,13 +1583,12 @@ public void testRemoveOrphanFilesActionSupport() throws InterruptedException { TableIdentifier tableIdentifier = TableIdentifier.of("db", "table"); Table table = createTable(tableIdentifier, SCHEMA, PartitionSpec.unpartitioned()); - List records = Lists.newArrayList( - new SimpleRecord(1, "1") - ); + List records = Lists.newArrayList(new SimpleRecord(1, "1")); Dataset df = spark.createDataFrame(records, SimpleRecord.class); - df.select("id", "data").write() + df.select("id", "data") + .write() .format("iceberg") .mode("append") .save(loadLocation(tableIdentifier)); @@ -1416,21 +1600,23 @@ public void testRemoveOrphanFilesActionSupport() throws InterruptedException { SparkActions actions = SparkActions.get(); - DeleteOrphanFiles.Result result1 = actions.deleteOrphanFiles(table) - .location(table.location() + "/metadata") - .olderThan(System.currentTimeMillis()) - .execute(); - Assert.assertTrue("Should not delete any metadata files", Iterables.isEmpty(result1.orphanFileLocations())); + DeleteOrphanFiles.Result result1 = + actions + .deleteOrphanFiles(table) + .location(table.location() + "/metadata") + .olderThan(System.currentTimeMillis()) + .execute(); + Assert.assertTrue( + "Should not delete any metadata files", Iterables.isEmpty(result1.orphanFileLocations())); - DeleteOrphanFiles.Result result2 = actions.deleteOrphanFiles(table) - .olderThan(System.currentTimeMillis()) - .execute(); - Assert.assertEquals("Should delete 1 data file", 1, Iterables.size(result2.orphanFileLocations())); + DeleteOrphanFiles.Result result2 = + actions.deleteOrphanFiles(table).olderThan(System.currentTimeMillis()).execute(); + Assert.assertEquals( + "Should delete 1 data file", 1, Iterables.size(result2.orphanFileLocations())); Dataset resultDF = spark.read().format("iceberg").load(loadLocation(tableIdentifier)); - List actualRecords = resultDF - .as(Encoders.bean(SimpleRecord.class)) - .collectAsList(); + List actualRecords = + resultDF.as(Encoders.bean(SimpleRecord.class)).collectAsList(); Assert.assertEquals("Rows must match", records, actualRecords); } @@ -1438,13 +1624,18 @@ public void 
testRemoveOrphanFilesActionSupport() throws InterruptedException { @Test public void testFilesTablePartitionId() throws Exception { TableIdentifier tableIdentifier = TableIdentifier.of("db", "files_test"); - Table table = createTable(tableIdentifier, SCHEMA, PartitionSpec.builderFor(SCHEMA).identity("id").build()); + Table table = + createTable( + tableIdentifier, SCHEMA, PartitionSpec.builderFor(SCHEMA).identity("id").build()); int spec0 = table.spec().specId(); - Dataset df1 = spark.createDataFrame(Lists.newArrayList(new SimpleRecord(1, "a")), SimpleRecord.class); - Dataset df2 = spark.createDataFrame(Lists.newArrayList(new SimpleRecord(2, "b")), SimpleRecord.class); + Dataset df1 = + spark.createDataFrame(Lists.newArrayList(new SimpleRecord(1, "a")), SimpleRecord.class); + Dataset df2 = + spark.createDataFrame(Lists.newArrayList(new SimpleRecord(2, "b")), SimpleRecord.class); - df1.select("id", "data").write() + df1.select("id", "data") + .write() .format("iceberg") .mode("append") .save(loadLocation(tableIdentifier)); @@ -1455,17 +1646,17 @@ public void testFilesTablePartitionId() throws Exception { int spec1 = table.spec().specId(); // add a second file - df2.select("id", "data").write() + df2.select("id", "data") + .write() .format("iceberg") .mode("append") .save(loadLocation(tableIdentifier)); - List actual = spark.read() - .format("iceberg") - .load(loadLocation(tableIdentifier, "files")) - .sort(DataFile.SPEC_ID.name()) - .collectAsList() - .stream().map(r -> (Integer) r.getAs(DataFile.SPEC_ID.name())).collect(Collectors.toList()); + List actual = + spark.read().format("iceberg").load(loadLocation(tableIdentifier, "files")) + .sort(DataFile.SPEC_ID.name()).collectAsList().stream() + .map(r -> (Integer) r.getAs(DataFile.SPEC_ID.name())) + .collect(Collectors.toList()); Assert.assertEquals("Should have two partition specs", ImmutableList.of(spec0, spec1), actual); } @@ -1475,22 +1666,26 @@ public void testAllManifestTableSnapshotFiltering() throws Exception { TableIdentifier tableIdentifier = TableIdentifier.of("db", "all_manifest_snapshot_filtering"); Table table = createTable(tableIdentifier, SCHEMA, SPEC); Table manifestTable = loadTable(tableIdentifier, "all_manifests"); - Dataset df = spark.createDataFrame(Lists.newArrayList(new SimpleRecord(1, "a")), SimpleRecord.class); + Dataset df = + spark.createDataFrame(Lists.newArrayList(new SimpleRecord(1, "a")), SimpleRecord.class); List> snapshotIdToManifests = Lists.newArrayList(); - df.select("id", "data").write() + df.select("id", "data") + .write() .format("iceberg") .mode("append") .save(loadLocation(tableIdentifier)); table.refresh(); Snapshot snapshot1 = table.currentSnapshot(); - snapshotIdToManifests.addAll(snapshot1.allManifests().stream() - .map(manifest -> Pair.of(snapshot1.snapshotId(), manifest)) - .collect(Collectors.toList())); + snapshotIdToManifests.addAll( + snapshot1.allManifests().stream() + .map(manifest -> Pair.of(snapshot1.snapshotId(), manifest)) + .collect(Collectors.toList())); - df.select("id", "data").write() + df.select("id", "data") + .write() .format("iceberg") .mode("append") .save(loadLocation(tableIdentifier)); @@ -1498,16 +1693,19 @@ public void testAllManifestTableSnapshotFiltering() throws Exception { table.refresh(); Snapshot snapshot2 = table.currentSnapshot(); Assert.assertEquals("Should have two manifests", 2, snapshot2.allManifests().size()); - snapshotIdToManifests.addAll(snapshot2.allManifests().stream() - .map(manifest -> Pair.of(snapshot2.snapshotId(), manifest)) - 
.collect(Collectors.toList())); + snapshotIdToManifests.addAll( + snapshot2.allManifests().stream() + .map(manifest -> Pair.of(snapshot2.snapshotId(), manifest)) + .collect(Collectors.toList())); // Add manifests that will not be selected - df.select("id", "data").write() + df.select("id", "data") + .write() .format("iceberg") .mode("append") .save(loadLocation(tableIdentifier)); - df.select("id", "data").write() + df.select("id", "data") + .write() .format("iceberg") .mode("append") .save(loadLocation(tableIdentifier)); @@ -1517,30 +1715,41 @@ public void testAllManifestTableSnapshotFiltering() throws Exception { snapshotIds.add(String.valueOf(snapshot2.snapshotId())); snapshotIds.toString(); - List actual = spark.read() - .format("iceberg") - .load(loadLocation(tableIdentifier, "all_manifests")) - .filter("reference_snapshot_id in " + snapshotIds) - .orderBy("path") - .collectAsList(); + List actual = + spark + .read() + .format("iceberg") + .load(loadLocation(tableIdentifier, "all_manifests")) + .filter("reference_snapshot_id in " + snapshotIds) + .orderBy("path") + .collectAsList(); table.refresh(); - List expected = snapshotIdToManifests.stream() - .map(snapshotManifest -> manifestRecord(manifestTable, snapshotManifest.first(), snapshotManifest.second())) - .collect(Collectors.toList()); + List expected = + snapshotIdToManifests.stream() + .map( + snapshotManifest -> + manifestRecord( + manifestTable, snapshotManifest.first(), snapshotManifest.second())) + .collect(Collectors.toList()); expected.sort(Comparator.comparing(o -> o.get("path").toString())); Assert.assertEquals("Manifests table should have 3 manifest rows", 3, actual.size()); for (int i = 0; i < expected.size(); i += 1) { - TestHelpers.assertEqualsSafe(manifestTable.schema().asStruct(), expected.get(i), actual.get(i)); + TestHelpers.assertEqualsSafe( + manifestTable.schema().asStruct(), expected.get(i), actual.get(i)); } } - private GenericData.Record manifestRecord(Table manifestTable, Long referenceSnapshotId, ManifestFile manifest) { - GenericRecordBuilder builder = new GenericRecordBuilder(AvroSchemaUtil.convert( - manifestTable.schema(), "manifests")); - GenericRecordBuilder summaryBuilder = new GenericRecordBuilder(AvroSchemaUtil.convert( - manifestTable.schema().findType("partition_summaries.element").asStructType(), "partition_summary")); + private GenericData.Record manifestRecord( + Table manifestTable, Long referenceSnapshotId, ManifestFile manifest) { + GenericRecordBuilder builder = + new GenericRecordBuilder(AvroSchemaUtil.convert(manifestTable.schema(), "manifests")); + GenericRecordBuilder summaryBuilder = + new GenericRecordBuilder( + AvroSchemaUtil.convert( + manifestTable.schema().findType("partition_summaries.element").asStructType(), + "partition_summary")); return builder .set("content", manifest.content().id()) .set("path", manifest.path()) @@ -1548,19 +1757,32 @@ private GenericData.Record manifestRecord(Table manifestTable, Long referenceSna .set("partition_spec_id", manifest.partitionSpecId()) .set("added_snapshot_id", manifest.snapshotId()) .set("added_data_files_count", manifest.content() == DATA ? manifest.addedFilesCount() : 0) - .set("existing_data_files_count", manifest.content() == DATA ? manifest.existingFilesCount() : 0) - .set("deleted_data_files_count", manifest.content() == DATA ? manifest.deletedFilesCount() : 0) - .set("added_delete_files_count", manifest.content() == DELETES ? manifest.addedFilesCount() : 0) - .set("existing_delete_files_count", manifest.content() == DELETES ? 
manifest.existingFilesCount() : 0) - .set("deleted_delete_files_count", manifest.content() == DELETES ? manifest.deletedFilesCount() : 0) - .set("partition_summaries", Lists.transform(manifest.partitions(), partition -> - summaryBuilder - .set("contains_null", false) - .set("contains_nan", false) - .set("lower_bound", "1") - .set("upper_bound", "1") - .build() - )) + .set( + "existing_data_files_count", + manifest.content() == DATA ? manifest.existingFilesCount() : 0) + .set( + "deleted_data_files_count", + manifest.content() == DATA ? manifest.deletedFilesCount() : 0) + .set( + "added_delete_files_count", + manifest.content() == DELETES ? manifest.addedFilesCount() : 0) + .set( + "existing_delete_files_count", + manifest.content() == DELETES ? manifest.existingFilesCount() : 0) + .set( + "deleted_delete_files_count", + manifest.content() == DELETES ? manifest.deletedFilesCount() : 0) + .set( + "partition_summaries", + Lists.transform( + manifest.partitions(), + partition -> + summaryBuilder + .set("contains_null", false) + .set("contains_nan", false) + .set("lower_bound", "1") + .set("upper_bound", "1") + .build())) .set("reference_snapshot_id", referenceSnapshotId) .build(); } @@ -1570,8 +1792,8 @@ private void asMetadataRecord(GenericData.Record file) { file.put(3, 0); // specId } - private PositionDeleteWriter newPositionDeleteWriter(Table table, PartitionSpec spec, - StructLike partition) { + private PositionDeleteWriter newPositionDeleteWriter( + Table table, PartitionSpec spec, StructLike partition) { OutputFileFactory fileFactory = OutputFileFactory.builderFor(table, 0, 0).build(); EncryptedOutputFile outputFile = fileFactory.newOutputFile(spec, partition); @@ -1579,9 +1801,13 @@ private PositionDeleteWriter newPositionDeleteWriter(Table table, P return fileWriterFactory.newPositionDeleteWriter(outputFile, spec, partition); } - private DeleteFile writePositionDeletes(Table table, PartitionSpec spec, StructLike partition, - Iterable> deletes) { - PositionDeleteWriter positionDeleteWriter = newPositionDeleteWriter(table, spec, partition); + private DeleteFile writePositionDeletes( + Table table, + PartitionSpec spec, + StructLike partition, + Iterable> deletes) { + PositionDeleteWriter positionDeleteWriter = + newPositionDeleteWriter(table, spec, partition); try (PositionDeleteWriter writer = positionDeleteWriter) { for (PositionDelete delete : deletes) { diff --git a/spark/v3.0/spark/src/test/java/org/apache/iceberg/spark/source/TestIcebergSpark.java b/spark/v3.0/spark/src/test/java/org/apache/iceberg/spark/source/TestIcebergSpark.java index c275daee5f7e..559668ee31a1 100644 --- a/spark/v3.0/spark/src/test/java/org/apache/iceberg/spark/source/TestIcebergSpark.java +++ b/spark/v3.0/spark/src/test/java/org/apache/iceberg/spark/source/TestIcebergSpark.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.spark.source; import java.math.BigDecimal; @@ -61,8 +60,8 @@ public void testRegisterIntegerBucketUDF() { IcebergSpark.registerBucketUDF(spark, "iceberg_bucket_int_16", DataTypes.IntegerType, 16); List results = spark.sql("SELECT iceberg_bucket_int_16(1)").collectAsList(); Assert.assertEquals(1, results.size()); - Assert.assertEquals((int) Transforms.bucket(Types.IntegerType.get(), 16).apply(1), - results.get(0).getInt(0)); + Assert.assertEquals( + (int) Transforms.bucket(Types.IntegerType.get(), 16).apply(1), results.get(0).getInt(0)); } @Test @@ -70,8 +69,8 @@ public void testRegisterShortBucketUDF() { IcebergSpark.registerBucketUDF(spark, "iceberg_bucket_short_16", DataTypes.ShortType, 16); List results = spark.sql("SELECT iceberg_bucket_short_16(1S)").collectAsList(); Assert.assertEquals(1, results.size()); - Assert.assertEquals((int) Transforms.bucket(Types.IntegerType.get(), 16).apply(1), - results.get(0).getInt(0)); + Assert.assertEquals( + (int) Transforms.bucket(Types.IntegerType.get(), 16).apply(1), results.get(0).getInt(0)); } @Test @@ -79,8 +78,8 @@ public void testRegisterByteBucketUDF() { IcebergSpark.registerBucketUDF(spark, "iceberg_bucket_byte_16", DataTypes.ByteType, 16); List results = spark.sql("SELECT iceberg_bucket_byte_16(1Y)").collectAsList(); Assert.assertEquals(1, results.size()); - Assert.assertEquals((int) Transforms.bucket(Types.IntegerType.get(), 16).apply(1), - results.get(0).getInt(0)); + Assert.assertEquals( + (int) Transforms.bucket(Types.IntegerType.get(), 16).apply(1), results.get(0).getInt(0)); } @Test @@ -88,8 +87,8 @@ public void testRegisterLongBucketUDF() { IcebergSpark.registerBucketUDF(spark, "iceberg_bucket_long_16", DataTypes.LongType, 16); List results = spark.sql("SELECT iceberg_bucket_long_16(1L)").collectAsList(); Assert.assertEquals(1, results.size()); - Assert.assertEquals((int) Transforms.bucket(Types.LongType.get(), 16).apply(1L), - results.get(0).getInt(0)); + Assert.assertEquals( + (int) Transforms.bucket(Types.LongType.get(), 16).apply(1L), results.get(0).getInt(0)); } @Test @@ -97,7 +96,8 @@ public void testRegisterStringBucketUDF() { IcebergSpark.registerBucketUDF(spark, "iceberg_bucket_string_16", DataTypes.StringType, 16); List results = spark.sql("SELECT iceberg_bucket_string_16('hello')").collectAsList(); Assert.assertEquals(1, results.size()); - Assert.assertEquals((int) Transforms.bucket(Types.StringType.get(), 16).apply("hello"), + Assert.assertEquals( + (int) Transforms.bucket(Types.StringType.get(), 16).apply("hello"), results.get(0).getInt(0)); } @@ -106,7 +106,8 @@ public void testRegisterCharBucketUDF() { IcebergSpark.registerBucketUDF(spark, "iceberg_bucket_char_16", new CharType(5), 16); List results = spark.sql("SELECT iceberg_bucket_char_16('hello')").collectAsList(); Assert.assertEquals(1, results.size()); - Assert.assertEquals((int) Transforms.bucket(Types.StringType.get(), 16).apply("hello"), + Assert.assertEquals( + (int) Transforms.bucket(Types.StringType.get(), 16).apply("hello"), results.get(0).getInt(0)); } @@ -115,73 +116,89 @@ public void testRegisterVarCharBucketUDF() { IcebergSpark.registerBucketUDF(spark, "iceberg_bucket_varchar_16", new VarcharType(5), 16); List results = spark.sql("SELECT iceberg_bucket_varchar_16('hello')").collectAsList(); Assert.assertEquals(1, results.size()); - Assert.assertEquals((int) Transforms.bucket(Types.StringType.get(), 16).apply("hello"), + Assert.assertEquals( + (int) Transforms.bucket(Types.StringType.get(), 16).apply("hello"), 
results.get(0).getInt(0)); } @Test public void testRegisterDateBucketUDF() { IcebergSpark.registerBucketUDF(spark, "iceberg_bucket_date_16", DataTypes.DateType, 16); - List results = spark.sql("SELECT iceberg_bucket_date_16(DATE '2021-06-30')").collectAsList(); + List results = + spark.sql("SELECT iceberg_bucket_date_16(DATE '2021-06-30')").collectAsList(); Assert.assertEquals(1, results.size()); - Assert.assertEquals((int) Transforms.bucket(Types.DateType.get(), 16) - .apply(DateTimeUtils.fromJavaDate(Date.valueOf("2021-06-30"))), + Assert.assertEquals( + (int) + Transforms.bucket(Types.DateType.get(), 16) + .apply(DateTimeUtils.fromJavaDate(Date.valueOf("2021-06-30"))), results.get(0).getInt(0)); } @Test public void testRegisterTimestampBucketUDF() { - IcebergSpark.registerBucketUDF(spark, "iceberg_bucket_timestamp_16", DataTypes.TimestampType, 16); + IcebergSpark.registerBucketUDF( + spark, "iceberg_bucket_timestamp_16", DataTypes.TimestampType, 16); List results = - spark.sql("SELECT iceberg_bucket_timestamp_16(TIMESTAMP '2021-06-30 00:00:00.000')").collectAsList(); + spark + .sql("SELECT iceberg_bucket_timestamp_16(TIMESTAMP '2021-06-30 00:00:00.000')") + .collectAsList(); Assert.assertEquals(1, results.size()); - Assert.assertEquals((int) Transforms.bucket(Types.TimestampType.withZone(), 16) - .apply(DateTimeUtils.fromJavaTimestamp(Timestamp.valueOf("2021-06-30 00:00:00.000"))), + Assert.assertEquals( + (int) + Transforms.bucket(Types.TimestampType.withZone(), 16) + .apply( + DateTimeUtils.fromJavaTimestamp(Timestamp.valueOf("2021-06-30 00:00:00.000"))), results.get(0).getInt(0)); } @Test public void testRegisterBinaryBucketUDF() { IcebergSpark.registerBucketUDF(spark, "iceberg_bucket_binary_16", DataTypes.BinaryType, 16); - List results = - spark.sql("SELECT iceberg_bucket_binary_16(X'0020001F')").collectAsList(); + List results = spark.sql("SELECT iceberg_bucket_binary_16(X'0020001F')").collectAsList(); Assert.assertEquals(1, results.size()); - Assert.assertEquals((int) Transforms.bucket(Types.BinaryType.get(), 16) - .apply(ByteBuffer.wrap(new byte[]{0x00, 0x20, 0x00, 0x1F})), + Assert.assertEquals( + (int) + Transforms.bucket(Types.BinaryType.get(), 16) + .apply(ByteBuffer.wrap(new byte[] {0x00, 0x20, 0x00, 0x1F})), results.get(0).getInt(0)); } @Test public void testRegisterDecimalBucketUDF() { IcebergSpark.registerBucketUDF(spark, "iceberg_bucket_decimal_16", new DecimalType(4, 2), 16); - List results = - spark.sql("SELECT iceberg_bucket_decimal_16(11.11)").collectAsList(); + List results = spark.sql("SELECT iceberg_bucket_decimal_16(11.11)").collectAsList(); Assert.assertEquals(1, results.size()); - Assert.assertEquals((int) Transforms.bucket(Types.DecimalType.of(4, 2), 16) - .apply(new BigDecimal("11.11")), + Assert.assertEquals( + (int) Transforms.bucket(Types.DecimalType.of(4, 2), 16).apply(new BigDecimal("11.11")), results.get(0).getInt(0)); } @Test public void testRegisterBooleanBucketUDF() { - Assertions.assertThatThrownBy(() -> - IcebergSpark.registerBucketUDF(spark, "iceberg_bucket_boolean_16", DataTypes.BooleanType, 16)) + Assertions.assertThatThrownBy( + () -> + IcebergSpark.registerBucketUDF( + spark, "iceberg_bucket_boolean_16", DataTypes.BooleanType, 16)) .isInstanceOf(IllegalArgumentException.class) .hasMessage("Cannot bucket by type: boolean"); } @Test public void testRegisterDoubleBucketUDF() { - Assertions.assertThatThrownBy(() -> - IcebergSpark.registerBucketUDF(spark, "iceberg_bucket_double_16", DataTypes.DoubleType, 16)) + Assertions.assertThatThrownBy( + () 
-> + IcebergSpark.registerBucketUDF( + spark, "iceberg_bucket_double_16", DataTypes.DoubleType, 16)) .isInstanceOf(IllegalArgumentException.class) .hasMessage("Cannot bucket by type: double"); } @Test public void testRegisterFloatBucketUDF() { - Assertions.assertThatThrownBy(() -> - IcebergSpark.registerBucketUDF(spark, "iceberg_bucket_float_16", DataTypes.FloatType, 16)) + Assertions.assertThatThrownBy( + () -> + IcebergSpark.registerBucketUDF( + spark, "iceberg_bucket_float_16", DataTypes.FloatType, 16)) .isInstanceOf(IllegalArgumentException.class) .hasMessage("Cannot bucket by type: float"); } diff --git a/spark/v3.0/spark/src/test/java/org/apache/iceberg/spark/source/TestIdentityPartitionData.java b/spark/v3.0/spark/src/test/java/org/apache/iceberg/spark/source/TestIdentityPartitionData.java index e07798301db8..7313c18cc09d 100644 --- a/spark/v3.0/spark/src/test/java/org/apache/iceberg/spark/source/TestIdentityPartitionData.java +++ b/spark/v3.0/spark/src/test/java/org/apache/iceberg/spark/source/TestIdentityPartitionData.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.source; import java.io.File; @@ -55,11 +54,11 @@ public class TestIdentityPartitionData extends SparkTestBase { @Parameterized.Parameters(name = "format = {0}, vectorized = {1}") public static Object[][] parameters() { return new Object[][] { - { "parquet", false }, - { "parquet", true }, - { "avro", false }, - { "orc", false }, - { "orc", true }, + {"parquet", false}, + {"parquet", true}, + {"avro", false}, + {"orc", false}, + {"orc", true}, }; } @@ -71,36 +70,37 @@ public TestIdentityPartitionData(String format, boolean vectorized) { this.vectorized = vectorized; } - private static final Schema LOG_SCHEMA = new Schema( - Types.NestedField.optional(1, "id", Types.IntegerType.get()), - Types.NestedField.optional(2, "date", Types.StringType.get()), - Types.NestedField.optional(3, "level", Types.StringType.get()), - Types.NestedField.optional(4, "message", Types.StringType.get()) - ); - - private static final List LOGS = ImmutableList.of( - LogMessage.debug("2020-02-02", "debug event 1"), - LogMessage.info("2020-02-02", "info event 1"), - LogMessage.debug("2020-02-02", "debug event 2"), - LogMessage.info("2020-02-03", "info event 2"), - LogMessage.debug("2020-02-03", "debug event 3"), - LogMessage.info("2020-02-03", "info event 3"), - LogMessage.error("2020-02-03", "error event 1"), - LogMessage.debug("2020-02-04", "debug event 4"), - LogMessage.warn("2020-02-04", "warn event 1"), - LogMessage.debug("2020-02-04", "debug event 5") - ); - - @Rule - public TemporaryFolder temp = new TemporaryFolder(); - - private PartitionSpec spec = PartitionSpec.builderFor(LOG_SCHEMA).identity("date").identity("level").build(); + private static final Schema LOG_SCHEMA = + new Schema( + Types.NestedField.optional(1, "id", Types.IntegerType.get()), + Types.NestedField.optional(2, "date", Types.StringType.get()), + Types.NestedField.optional(3, "level", Types.StringType.get()), + Types.NestedField.optional(4, "message", Types.StringType.get())); + + private static final List LOGS = + ImmutableList.of( + LogMessage.debug("2020-02-02", "debug event 1"), + LogMessage.info("2020-02-02", "info event 1"), + LogMessage.debug("2020-02-02", "debug event 2"), + LogMessage.info("2020-02-03", "info event 2"), + LogMessage.debug("2020-02-03", "debug event 3"), + LogMessage.info("2020-02-03", "info event 3"), + LogMessage.error("2020-02-03", "error event 1"), + 
LogMessage.debug("2020-02-04", "debug event 4"), + LogMessage.warn("2020-02-04", "warn event 1"), + LogMessage.debug("2020-02-04", "debug event 5")); + + @Rule public TemporaryFolder temp = new TemporaryFolder(); + + private PartitionSpec spec = + PartitionSpec.builderFor(LOG_SCHEMA).identity("date").identity("level").build(); private Table table = null; private Dataset logs = null; /** - * Use the Hive Based table to make Identity Partition Columns with no duplication of the data in the underlying - * parquet files. This makes sure that if the identity mapping fails, the test will also fail. + * Use the Hive Based table to make Identity Partition Columns with no duplication of the data in + * the underlying parquet files. This makes sure that if the identity mapping fails, the test will + * also fail. */ private void setupParquet() throws Exception { File location = temp.newFolder("logs"); @@ -109,15 +109,25 @@ private void setupParquet() throws Exception { Assert.assertTrue("Temp folder should exist", location.exists()); Map properties = ImmutableMap.of(TableProperties.DEFAULT_FILE_FORMAT, format); - this.logs = spark.createDataFrame(LOGS, LogMessage.class).select("id", "date", "level", "message"); + this.logs = + spark.createDataFrame(LOGS, LogMessage.class).select("id", "date", "level", "message"); spark.sql(String.format("DROP TABLE IF EXISTS %s", hiveTable)); - logs.orderBy("date", "level", "id").write().partitionBy("date", "level").format("parquet") - .option("path", hiveLocation.toString()).saveAsTable(hiveTable); - - this.table = TABLES.create(SparkSchemaUtil.schemaForTable(spark, hiveTable), - SparkSchemaUtil.specForTable(spark, hiveTable), properties, location.toString()); - - SparkTableUtil.importSparkTable(spark, new TableIdentifier(hiveTable), table, location.toString()); + logs.orderBy("date", "level", "id") + .write() + .partitionBy("date", "level") + .format("parquet") + .option("path", hiveLocation.toString()) + .saveAsTable(hiveTable); + + this.table = + TABLES.create( + SparkSchemaUtil.schemaForTable(spark, hiveTable), + SparkSchemaUtil.specForTable(spark, hiveTable), + properties, + location.toString()); + + SparkTableUtil.importSparkTable( + spark, new TableIdentifier(hiveTable), table, location.toString()); } @Before @@ -130,56 +140,70 @@ public void setupTable() throws Exception { Map properties = ImmutableMap.of(TableProperties.DEFAULT_FILE_FORMAT, format); this.table = TABLES.create(LOG_SCHEMA, spec, properties, location.toString()); - this.logs = spark.createDataFrame(LOGS, LogMessage.class).select("id", "date", "level", "message"); + this.logs = + spark.createDataFrame(LOGS, LogMessage.class).select("id", "date", "level", "message"); - logs.orderBy("date", "level", "id").write().format("iceberg").mode("append").save(location.toString()); + logs.orderBy("date", "level", "id") + .write() + .format("iceberg") + .mode("append") + .save(location.toString()); } } @Test public void testFullProjection() { List expected = logs.orderBy("id").collectAsList(); - List actual = spark.read().format("iceberg") - .option(SparkReadOptions.VECTORIZATION_ENABLED, String.valueOf(vectorized)) - .load(table.location()).orderBy("id") - .select("id", "date", "level", "message") - .collectAsList(); + List actual = + spark + .read() + .format("iceberg") + .option(SparkReadOptions.VECTORIZATION_ENABLED, String.valueOf(vectorized)) + .load(table.location()) + .orderBy("id") + .select("id", "date", "level", "message") + .collectAsList(); Assert.assertEquals("Rows should match", expected, 
actual); } @Test public void testProjections() { - String[][] cases = new String[][] { - // individual fields - new String[] { "date" }, - new String[] { "level" }, - new String[] { "message" }, - // field pairs - new String[] { "date", "message" }, - new String[] { "level", "message" }, - new String[] { "date", "level" }, - // out-of-order pairs - new String[] { "message", "date" }, - new String[] { "message", "level" }, - new String[] { "level", "date" }, - // full projection, different orderings - new String[] { "date", "level", "message" }, - new String[] { "level", "date", "message" }, - new String[] { "date", "message", "level" }, - new String[] { "level", "message", "date" }, - new String[] { "message", "date", "level" }, - new String[] { "message", "level", "date" } - }; + String[][] cases = + new String[][] { + // individual fields + new String[] {"date"}, + new String[] {"level"}, + new String[] {"message"}, + // field pairs + new String[] {"date", "message"}, + new String[] {"level", "message"}, + new String[] {"date", "level"}, + // out-of-order pairs + new String[] {"message", "date"}, + new String[] {"message", "level"}, + new String[] {"level", "date"}, + // full projection, different orderings + new String[] {"date", "level", "message"}, + new String[] {"level", "date", "message"}, + new String[] {"date", "message", "level"}, + new String[] {"level", "message", "date"}, + new String[] {"message", "date", "level"}, + new String[] {"message", "level", "date"} + }; for (String[] ordering : cases) { List expected = logs.select("id", ordering).orderBy("id").collectAsList(); - List actual = spark.read() - .format("iceberg") - .option(SparkReadOptions.VECTORIZATION_ENABLED, String.valueOf(vectorized)) - .load(table.location()) - .select("id", ordering).orderBy("id") - .collectAsList(); - Assert.assertEquals("Rows should match for ordering: " + Arrays.toString(ordering), expected, actual); + List actual = + spark + .read() + .format("iceberg") + .option(SparkReadOptions.VECTORIZATION_ENABLED, String.valueOf(vectorized)) + .load(table.location()) + .select("id", ordering) + .orderBy("id") + .collectAsList(); + Assert.assertEquals( + "Rows should match for ordering: " + Arrays.toString(ordering), expected, actual); } } } diff --git a/spark/v3.0/spark/src/test/java/org/apache/iceberg/spark/source/TestInternalRowWrapper.java b/spark/v3.0/spark/src/test/java/org/apache/iceberg/spark/source/TestInternalRowWrapper.java index 4ab01044046f..9e75145faff9 100644 --- a/spark/v3.0/spark/src/test/java/org/apache/iceberg/spark/source/TestInternalRowWrapper.java +++ b/spark/v3.0/spark/src/test/java/org/apache/iceberg/spark/source/TestInternalRowWrapper.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.spark.source; import java.util.Iterator; @@ -68,8 +67,10 @@ protected void generateAndValidate(Schema schema, AssertMethod assertMethod) { StructLike recordStructLike = recordWrapper.wrap(actual.next()); StructLike rowStructLike = rowWrapper.wrap(expected.next()); - assertMethod.assertEquals("Should have expected StructLike values", - actualWrapper.set(recordStructLike), expectedWrapper.set(rowStructLike)); + assertMethod.assertEquals( + "Should have expected StructLike values", + actualWrapper.set(recordStructLike), + expectedWrapper.set(rowStructLike)); } Assert.assertFalse("Shouldn't have more record", actual.hasNext()); diff --git a/spark/v3.0/spark/src/test/java/org/apache/iceberg/spark/source/TestMetadataTablesWithPartitionEvolution.java b/spark/v3.0/spark/src/test/java/org/apache/iceberg/spark/source/TestMetadataTablesWithPartitionEvolution.java index ea9818cae9d9..950c1c1b40ae 100644 --- a/spark/v3.0/spark/src/test/java/org/apache/iceberg/spark/source/TestMetadataTablesWithPartitionEvolution.java +++ b/spark/v3.0/spark/src/test/java/org/apache/iceberg/spark/source/TestMetadataTablesWithPartitionEvolution.java @@ -16,9 +16,18 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.source; +import static org.apache.iceberg.FileFormat.AVRO; +import static org.apache.iceberg.FileFormat.ORC; +import static org.apache.iceberg.FileFormat.PARQUET; +import static org.apache.iceberg.MetadataTableType.ALL_DATA_FILES; +import static org.apache.iceberg.MetadataTableType.ALL_ENTRIES; +import static org.apache.iceberg.MetadataTableType.ENTRIES; +import static org.apache.iceberg.MetadataTableType.FILES; +import static org.apache.iceberg.TableProperties.DEFAULT_FILE_FORMAT; +import static org.apache.iceberg.TableProperties.FORMAT_VERSION; + import java.util.Arrays; import java.util.List; import java.util.Map; @@ -52,48 +61,42 @@ import org.junit.runners.Parameterized; import org.junit.runners.Parameterized.Parameters; -import static org.apache.iceberg.FileFormat.AVRO; -import static org.apache.iceberg.FileFormat.ORC; -import static org.apache.iceberg.FileFormat.PARQUET; -import static org.apache.iceberg.MetadataTableType.ALL_DATA_FILES; -import static org.apache.iceberg.MetadataTableType.ALL_ENTRIES; -import static org.apache.iceberg.MetadataTableType.ENTRIES; -import static org.apache.iceberg.MetadataTableType.FILES; -import static org.apache.iceberg.TableProperties.DEFAULT_FILE_FORMAT; -import static org.apache.iceberg.TableProperties.FORMAT_VERSION; - @RunWith(Parameterized.class) public class TestMetadataTablesWithPartitionEvolution extends SparkCatalogTestBase { @Parameters(name = "catalog = {0}, impl = {1}, conf = {2}, fileFormat = {3}, formatVersion = {4}") public static Object[][] parameters() { return new Object[][] { - { "testhive", SparkCatalog.class.getName(), - ImmutableMap.of( - "type", "hive", - "default-namespace", "default" - ), - ORC, - formatVersion() - }, - { "testhadoop", SparkCatalog.class.getName(), - ImmutableMap.of( - "type", "hadoop" + { + "testhive", + SparkCatalog.class.getName(), + ImmutableMap.of( + "type", "hive", + "default-namespace", "default"), + ORC, + formatVersion() + }, + { + "testhadoop", + SparkCatalog.class.getName(), + ImmutableMap.of("type", "hadoop"), + PARQUET, + formatVersion() + }, + { + "spark_catalog", + SparkSessionCatalog.class.getName(), + ImmutableMap.of( + "type", "hive", + "default-namespace", "default", + "clients", "1", + "parquet-enabled", 
"false", + "cache-enabled", + "false" // Spark will delete tables using v1, leaving the cache out of sync ), - PARQUET, - formatVersion() - }, - { "spark_catalog", SparkSessionCatalog.class.getName(), - ImmutableMap.of( - "type", "hive", - "default-namespace", "default", - "clients", "1", - "parquet-enabled", "false", - "cache-enabled", "false" // Spark will delete tables using v1, leaving the cache out of sync - ), - AVRO, - formatVersion() - } + AVRO, + formatVersion() + } }; } @@ -106,8 +109,12 @@ private static int formatVersion() { private final FileFormat fileFormat; private final int formatVersion; - public TestMetadataTablesWithPartitionEvolution(String catalogName, String implementation, Map config, - FileFormat fileFormat, int formatVersion) { + public TestMetadataTablesWithPartitionEvolution( + String catalogName, + String implementation, + Map config, + FileFormat fileFormat, + int formatVersion) { super(catalogName, implementation, config); this.fileFormat = fileFormat; this.formatVersion = formatVersion; @@ -120,7 +127,9 @@ public void removeTable() { @Test public void testFilesMetadataTable() throws ParseException { - sql("CREATE TABLE %s (id bigint NOT NULL, category string, data string) USING iceberg", tableName); + sql( + "CREATE TABLE %s (id bigint NOT NULL, category string, data string) USING iceberg", + tableName); initTable(); sql("INSERT INTO TABLE %s VALUES (1, 'a1', 'b1')", tableName); @@ -128,28 +137,23 @@ public void testFilesMetadataTable() throws ParseException { // verify the metadata tables while the current spec is still unpartitioned for (MetadataTableType tableType : Arrays.asList(FILES, ALL_DATA_FILES)) { Dataset df = loadMetadataTable(tableType); - Assert.assertTrue("Partition must be skipped", df.schema().getFieldIndex("partition").isEmpty()); + Assert.assertTrue( + "Partition must be skipped", df.schema().getFieldIndex("partition").isEmpty()); } Table table = validationCatalog.loadTable(tableIdent); - table.updateSpec() - .addField("data") - .commit(); + table.updateSpec().addField("data").commit(); sql("REFRESH TABLE %s", tableName); sql("INSERT INTO TABLE %s VALUES (1, 'a1', 'b1')", tableName); // verify the metadata tables after adding the first partition column for (MetadataTableType tableType : Arrays.asList(FILES, ALL_DATA_FILES)) { assertPartitions( - ImmutableList.of(row(new Object[]{null}), row("b1")), - "STRUCT", - tableType); + ImmutableList.of(row(new Object[] {null}), row("b1")), "STRUCT", tableType); } - table.updateSpec() - .addField(Expressions.bucket("category", 8)) - .commit(); + table.updateSpec().addField(Expressions.bucket("category", 8)).commit(); sql("REFRESH TABLE %s", tableName); sql("INSERT INTO TABLE %s VALUES (1, 'a1', 'b1')", tableName); @@ -161,9 +165,7 @@ public void testFilesMetadataTable() throws ParseException { tableType); } - table.updateSpec() - .removeField("data") - .commit(); + table.updateSpec().removeField("data").commit(); sql("REFRESH TABLE %s", tableName); sql("INSERT INTO TABLE %s VALUES (1, 'a1', 'b1')", tableName); @@ -175,9 +177,7 @@ public void testFilesMetadataTable() throws ParseException { tableType); } - table.updateSpec() - .renameField("category_bucket_8", "category_bucket_8_another_name") - .commit(); + table.updateSpec().renameField("category_bucket_8", "category_bucket_8_another_name").commit(); sql("REFRESH TABLE %s", tableName); // verify the metadata tables after renaming the second partition column @@ -191,7 +191,9 @@ public void testFilesMetadataTable() throws ParseException { @Test public 
void testEntriesMetadataTable() throws ParseException { - sql("CREATE TABLE %s (id bigint NOT NULL, category string, data string) USING iceberg", tableName); + sql( + "CREATE TABLE %s (id bigint NOT NULL, category string, data string) USING iceberg", + tableName); initTable(); sql("INSERT INTO TABLE %s VALUES (1, 'a1', 'b1')", tableName); @@ -205,23 +207,17 @@ public void testEntriesMetadataTable() throws ParseException { Table table = validationCatalog.loadTable(tableIdent); - table.updateSpec() - .addField("data") - .commit(); + table.updateSpec().addField("data").commit(); sql("REFRESH TABLE %s", tableName); sql("INSERT INTO TABLE %s VALUES (1, 'a1', 'b1')", tableName); // verify the metadata tables after adding the first partition column for (MetadataTableType tableType : Arrays.asList(ENTRIES, ALL_ENTRIES)) { assertPartitions( - ImmutableList.of(row(new Object[]{null}), row("b1")), - "STRUCT", - tableType); + ImmutableList.of(row(new Object[] {null}), row("b1")), "STRUCT", tableType); } - table.updateSpec() - .addField(Expressions.bucket("category", 8)) - .commit(); + table.updateSpec().addField(Expressions.bucket("category", 8)).commit(); sql("REFRESH TABLE %s", tableName); sql("INSERT INTO TABLE %s VALUES (1, 'a1', 'b1')", tableName); @@ -233,9 +229,7 @@ public void testEntriesMetadataTable() throws ParseException { tableType); } - table.updateSpec() - .removeField("data") - .commit(); + table.updateSpec().removeField("data").commit(); sql("REFRESH TABLE %s", tableName); sql("INSERT INTO TABLE %s VALUES (1, 'a1', 'b1')", tableName); @@ -247,9 +241,7 @@ public void testEntriesMetadataTable() throws ParseException { tableType); } - table.updateSpec() - .renameField("category_bucket_8", "category_bucket_8_another_name") - .commit(); + table.updateSpec().renameField("category_bucket_8", "category_bucket_8_another_name").commit(); sql("REFRESH TABLE %s", tableName); // verify the metadata tables after renaming the second partition column @@ -263,15 +255,19 @@ public void testEntriesMetadataTable() throws ParseException { @Test public void testMetadataTablesWithUnknownTransforms() { - sql("CREATE TABLE %s (id bigint NOT NULL, category string, data string) USING iceberg", tableName); + sql( + "CREATE TABLE %s (id bigint NOT NULL, category string, data string) USING iceberg", + tableName); initTable(); sql("INSERT INTO TABLE %s VALUES (1, 'a1', 'b1')", tableName); Table table = validationCatalog.loadTable(tableIdent); - PartitionSpec unknownSpec = PartitionSpecParser.fromJson(table.schema(), - "{ \"spec-id\": 1, \"fields\": [ { \"name\": \"id_zero\", \"transform\": \"zero\", \"source-id\": 1 } ] }"); + PartitionSpec unknownSpec = + PartitionSpecParser.fromJson( + table.schema(), + "{ \"spec-id\": 1, \"fields\": [ { \"name\": \"id_zero\", \"transform\": \"zero\", \"source-id\": 1 } ] }"); // replace the table spec to include an unknown transform TableOperations ops = ((HasTableOperations) table).operations(); @@ -281,14 +277,17 @@ public void testMetadataTablesWithUnknownTransforms() { sql("REFRESH TABLE %s", tableName); for (MetadataTableType tableType : Arrays.asList(FILES, ALL_DATA_FILES, ENTRIES, ALL_ENTRIES)) { - AssertHelpers.assertThrows("Should complain about the partition type", - ValidationException.class, "Cannot build table partition type, unknown transforms", + AssertHelpers.assertThrows( + "Should complain about the partition type", + ValidationException.class, + "Cannot build table partition type, unknown transforms", () -> loadMetadataTable(tableType)); } } - private void 
assertPartitions(List expectedPartitions, String expectedTypeAsString, - MetadataTableType tableType) throws ParseException { + private void assertPartitions( + List expectedPartitions, String expectedTypeAsString, MetadataTableType tableType) + throws ParseException { Dataset df = loadMetadataTable(tableType); DataType expectedType = spark.sessionState().sqlParser().parseDataType(expectedTypeAsString); @@ -313,18 +312,18 @@ private void assertPartitions(List expectedPartitions, String expected switch (tableType) { case FILES: case ALL_DATA_FILES: - List actualFilesPartitions = df.orderBy("partition") - .select("partition.*") - .collectAsList(); - assertEquals("Partitions must match", expectedPartitions, rowsToJava(actualFilesPartitions)); + List actualFilesPartitions = + df.orderBy("partition").select("partition.*").collectAsList(); + assertEquals( + "Partitions must match", expectedPartitions, rowsToJava(actualFilesPartitions)); break; case ENTRIES: case ALL_ENTRIES: - List actualEntriesPartitions = df.orderBy("data_file.partition") - .select("data_file.partition.*") - .collectAsList(); - assertEquals("Partitions must match", expectedPartitions, rowsToJava(actualEntriesPartitions)); + List actualEntriesPartitions = + df.orderBy("data_file.partition").select("data_file.partition.*").collectAsList(); + assertEquals( + "Partitions must match", expectedPartitions, rowsToJava(actualEntriesPartitions)); break; default: @@ -337,7 +336,9 @@ private Dataset loadMetadataTable(MetadataTableType tableType) { } private void initTable() { - sql("ALTER TABLE %s SET TBLPROPERTIES('%s' '%s')", tableName, DEFAULT_FILE_FORMAT, fileFormat.name()); + sql( + "ALTER TABLE %s SET TBLPROPERTIES('%s' '%s')", + tableName, DEFAULT_FILE_FORMAT, fileFormat.name()); sql("ALTER TABLE %s SET TBLPROPERTIES('%s' '%d')", tableName, FORMAT_VERSION, formatVersion); } } diff --git a/spark/v3.0/spark/src/test/java/org/apache/iceberg/spark/source/TestParquetScan.java b/spark/v3.0/spark/src/test/java/org/apache/iceberg/spark/source/TestParquetScan.java index adfe8c7d3649..f585ed360f95 100644 --- a/spark/v3.0/spark/src/test/java/org/apache/iceberg/spark/source/TestParquetScan.java +++ b/spark/v3.0/spark/src/test/java/org/apache/iceberg/spark/source/TestParquetScan.java @@ -16,9 +16,10 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.spark.source; +import static org.apache.iceberg.Files.localOutput; + import java.io.File; import java.io.IOException; import java.util.List; @@ -52,8 +53,6 @@ import org.junit.runner.RunWith; import org.junit.runners.Parameterized; -import static org.apache.iceberg.Files.localOutput; - @RunWith(Parameterized.class) public class TestParquetScan extends AvroDataTest { private static final Configuration CONF = new Configuration(); @@ -72,12 +71,11 @@ public static void stopSpark() { currentSpark.stop(); } - @Rule - public TemporaryFolder temp = new TemporaryFolder(); + @Rule public TemporaryFolder temp = new TemporaryFolder(); @Parameterized.Parameters(name = "vectorized = {0}") public static Object[] parameters() { - return new Object[] { false, true }; + return new Object[] {false, true}; } private final boolean vectorized; @@ -88,18 +86,20 @@ public TestParquetScan(boolean vectorized) { @Override protected void writeAndValidate(Schema schema) throws IOException { - Assume.assumeTrue("Cannot handle non-string map keys in parquet-avro", - null == TypeUtil.find( - schema, - type -> type.isMapType() && type.asMapType().keyType() != Types.StringType.get())); + Assume.assumeTrue( + "Cannot handle non-string map keys in parquet-avro", + null + == TypeUtil.find( + schema, + type -> type.isMapType() && type.asMapType().keyType() != Types.StringType.get())); File parent = temp.newFolder("parquet"); File location = new File(parent, "test"); File dataFolder = new File(location, "data"); dataFolder.mkdirs(); - File parquetFile = new File(dataFolder, - FileFormat.PARQUET.addExtension(UUID.randomUUID().toString())); + File parquetFile = + new File(dataFolder, FileFormat.PARQUET.addExtension(UUID.randomUUID().toString())); HadoopTables tables = new HadoopTables(CONF); Table table = tables.create(schema, PartitionSpec.unpartitioned(), location.toString()); @@ -110,24 +110,25 @@ protected void writeAndValidate(Schema schema) throws IOException { List expected = RandomData.generateList(tableSchema, 100, 1L); - try (FileAppender writer = Parquet.write(localOutput(parquetFile)) - .schema(tableSchema) - .build()) { + try (FileAppender writer = + Parquet.write(localOutput(parquetFile)).schema(tableSchema).build()) { writer.addAll(expected); } - DataFile file = DataFiles.builder(PartitionSpec.unpartitioned()) - .withFileSizeInBytes(parquetFile.length()) - .withPath(parquetFile.toString()) - .withRecordCount(100) - .build(); + DataFile file = + DataFiles.builder(PartitionSpec.unpartitioned()) + .withFileSizeInBytes(parquetFile.length()) + .withPath(parquetFile.toString()) + .withRecordCount(100) + .build(); table.newAppend().appendFile(file).commit(); - table.updateProperties().set(TableProperties.PARQUET_VECTORIZATION_ENABLED, String.valueOf(vectorized)).commit(); + table + .updateProperties() + .set(TableProperties.PARQUET_VECTORIZATION_ENABLED, String.valueOf(vectorized)) + .commit(); - Dataset df = spark.read() - .format("iceberg") - .load(location.toString()); + Dataset df = spark.read().format("iceberg").load(location.toString()); List rows = df.collectAsList(); Assert.assertEquals("Should contain 100 rows", 100, rows.size()); diff --git a/spark/v3.0/spark/src/test/java/org/apache/iceberg/spark/source/TestPartitionPruning.java b/spark/v3.0/spark/src/test/java/org/apache/iceberg/spark/source/TestPartitionPruning.java index 24f7b69e1dc5..ffe21432f00c 100644 --- a/spark/v3.0/spark/src/test/java/org/apache/iceberg/spark/source/TestPartitionPruning.java +++ 
b/spark/v3.0/spark/src/test/java/org/apache/iceberg/spark/source/TestPartitionPruning.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.source; import java.io.File; @@ -78,11 +77,11 @@ public class TestPartitionPruning { @Parameterized.Parameters(name = "format = {0}, vectorized = {1}") public static Object[][] parameters() { return new Object[][] { - { "parquet", false }, - { "parquet", true }, - { "avro", false }, - { "orc", false }, - { "orc", true } + {"parquet", false}, + {"parquet", true}, + {"avro", false}, + {"orc", false}, + {"orc", true} }; } @@ -97,9 +96,12 @@ public TestPartitionPruning(String format, boolean vectorized) { private static SparkSession spark = null; private static JavaSparkContext sparkContext = null; - private static Transform bucketTransform = Transforms.bucket(Types.IntegerType.get(), 3); - private static Transform truncateTransform = Transforms.truncate(Types.StringType.get(), 5); - private static Transform hourTransform = Transforms.hour(Types.TimestampType.withoutZone()); + private static Transform bucketTransform = + Transforms.bucket(Types.IntegerType.get(), 3); + private static Transform truncateTransform = + Transforms.truncate(Types.StringType.get(), 5); + private static Transform hourTransform = + Transforms.hour(Types.TimestampType.withoutZone()); @BeforeClass public static void startSpark() { @@ -110,12 +112,21 @@ public static void startSpark() { CONF.set(optionKey, CountOpenLocalFileSystem.class.getName()); spark.conf().set(optionKey, CountOpenLocalFileSystem.class.getName()); spark.conf().set("spark.sql.session.timeZone", "UTC"); - spark.udf().register("bucket3", (Integer num) -> bucketTransform.apply(num), DataTypes.IntegerType); - spark.udf().register("truncate5", (String str) -> truncateTransform.apply(str), DataTypes.StringType); + spark + .udf() + .register("bucket3", (Integer num) -> bucketTransform.apply(num), DataTypes.IntegerType); + spark + .udf() + .register("truncate5", (String str) -> truncateTransform.apply(str), DataTypes.StringType); // NOTE: date transforms take the type long, not Timestamp - spark.udf().register("hour", (Timestamp ts) -> hourTransform.apply( - org.apache.spark.sql.catalyst.util.DateTimeUtils.fromJavaTimestamp(ts)), - DataTypes.IntegerType); + spark + .udf() + .register( + "hour", + (Timestamp ts) -> + hourTransform.apply( + org.apache.spark.sql.catalyst.util.DateTimeUtils.fromJavaTimestamp(ts)), + DataTypes.IntegerType); } @AfterClass @@ -125,70 +136,70 @@ public static void stopSpark() { currentSpark.stop(); } - private static final Schema LOG_SCHEMA = new Schema( - Types.NestedField.optional(1, "id", Types.IntegerType.get()), - Types.NestedField.optional(2, "date", Types.StringType.get()), - Types.NestedField.optional(3, "level", Types.StringType.get()), - Types.NestedField.optional(4, "message", Types.StringType.get()), - Types.NestedField.optional(5, "timestamp", Types.TimestampType.withZone()) - ); - - private static final List LOGS = ImmutableList.of( - LogMessage.debug("2020-02-02", "debug event 1", getInstant("2020-02-02T00:00:00")), - LogMessage.info("2020-02-02", "info event 1", getInstant("2020-02-02T01:00:00")), - LogMessage.debug("2020-02-02", "debug event 2", getInstant("2020-02-02T02:00:00")), - LogMessage.info("2020-02-03", "info event 2", getInstant("2020-02-03T00:00:00")), - LogMessage.debug("2020-02-03", "debug event 3", getInstant("2020-02-03T01:00:00")), - LogMessage.info("2020-02-03", "info event 3", 
getInstant("2020-02-03T02:00:00")), - LogMessage.error("2020-02-03", "error event 1", getInstant("2020-02-03T03:00:00")), - LogMessage.debug("2020-02-04", "debug event 4", getInstant("2020-02-04T01:00:00")), - LogMessage.warn("2020-02-04", "warn event 1", getInstant("2020-02-04T02:00:00")), - LogMessage.debug("2020-02-04", "debug event 5", getInstant("2020-02-04T03:00:00")) - ); + private static final Schema LOG_SCHEMA = + new Schema( + Types.NestedField.optional(1, "id", Types.IntegerType.get()), + Types.NestedField.optional(2, "date", Types.StringType.get()), + Types.NestedField.optional(3, "level", Types.StringType.get()), + Types.NestedField.optional(4, "message", Types.StringType.get()), + Types.NestedField.optional(5, "timestamp", Types.TimestampType.withZone())); + + private static final List LOGS = + ImmutableList.of( + LogMessage.debug("2020-02-02", "debug event 1", getInstant("2020-02-02T00:00:00")), + LogMessage.info("2020-02-02", "info event 1", getInstant("2020-02-02T01:00:00")), + LogMessage.debug("2020-02-02", "debug event 2", getInstant("2020-02-02T02:00:00")), + LogMessage.info("2020-02-03", "info event 2", getInstant("2020-02-03T00:00:00")), + LogMessage.debug("2020-02-03", "debug event 3", getInstant("2020-02-03T01:00:00")), + LogMessage.info("2020-02-03", "info event 3", getInstant("2020-02-03T02:00:00")), + LogMessage.error("2020-02-03", "error event 1", getInstant("2020-02-03T03:00:00")), + LogMessage.debug("2020-02-04", "debug event 4", getInstant("2020-02-04T01:00:00")), + LogMessage.warn("2020-02-04", "warn event 1", getInstant("2020-02-04T02:00:00")), + LogMessage.debug("2020-02-04", "debug event 5", getInstant("2020-02-04T03:00:00"))); private static Instant getInstant(String timestampWithoutZone) { - Long epochMicros = (Long) Literal.of(timestampWithoutZone).to(Types.TimestampType.withoutZone()).value(); + Long epochMicros = + (Long) Literal.of(timestampWithoutZone).to(Types.TimestampType.withoutZone()).value(); return Instant.ofEpochMilli(TimeUnit.MICROSECONDS.toMillis(epochMicros)); } - @Rule - public TemporaryFolder temp = new TemporaryFolder(); + @Rule public TemporaryFolder temp = new TemporaryFolder(); - private PartitionSpec spec = PartitionSpec.builderFor(LOG_SCHEMA) - .identity("date") - .identity("level") - .bucket("id", 3) - .truncate("message", 5) - .hour("timestamp") - .build(); + private PartitionSpec spec = + PartitionSpec.builderFor(LOG_SCHEMA) + .identity("date") + .identity("level") + .bucket("id", 3) + .truncate("message", 5) + .hour("timestamp") + .build(); @Test public void testPartitionPruningIdentityString() { String filterCond = "date >= '2020-02-03' AND level = 'DEBUG'"; - Predicate partCondition = (Row r) -> { - String date = r.getString(0); - String level = r.getString(1); - return date.compareTo("2020-02-03") >= 0 && level.equals("DEBUG"); - }; + Predicate partCondition = + (Row r) -> { + String date = r.getString(0); + String level = r.getString(1); + return date.compareTo("2020-02-03") >= 0 && level.equals("DEBUG"); + }; runTest(filterCond, partCondition); } @Test public void testPartitionPruningBucketingInteger() { - final int[] ids = new int[]{ - LOGS.get(3).getId(), - LOGS.get(7).getId() - }; - String condForIds = Arrays.stream(ids).mapToObj(String::valueOf) - .collect(Collectors.joining(",", "(", ")")); + final int[] ids = new int[] {LOGS.get(3).getId(), LOGS.get(7).getId()}; + String condForIds = + Arrays.stream(ids).mapToObj(String::valueOf).collect(Collectors.joining(",", "(", ")")); String filterCond = "id in " + condForIds; 
- Predicate partCondition = (Row r) -> { - int bucketId = r.getInt(2); - Set buckets = Arrays.stream(ids).map(bucketTransform::apply) - .boxed().collect(Collectors.toSet()); - return buckets.contains(bucketId); - }; + Predicate partCondition = + (Row r) -> { + int bucketId = r.getInt(2); + Set buckets = + Arrays.stream(ids).map(bucketTransform::apply).boxed().collect(Collectors.toSet()); + return buckets.contains(bucketId); + }; runTest(filterCond, partCondition); } @@ -196,10 +207,11 @@ public void testPartitionPruningBucketingInteger() { @Test public void testPartitionPruningTruncatedString() { String filterCond = "message like 'info event%'"; - Predicate partCondition = (Row r) -> { - String truncatedMessage = r.getString(3); - return truncatedMessage.equals("info "); - }; + Predicate partCondition = + (Row r) -> { + String truncatedMessage = r.getString(3); + return truncatedMessage.equals("info "); + }; runTest(filterCond, partCondition); } @@ -207,10 +219,11 @@ public void testPartitionPruningTruncatedString() { @Test public void testPartitionPruningTruncatedStringComparingValueShorterThanPartitionValue() { String filterCond = "message like 'inf%'"; - Predicate partCondition = (Row r) -> { - String truncatedMessage = r.getString(3); - return truncatedMessage.startsWith("inf"); - }; + Predicate partCondition = + (Row r) -> { + String truncatedMessage = r.getString(3); + return truncatedMessage.startsWith("inf"); + }; runTest(filterCond, partCondition); } @@ -219,17 +232,20 @@ public void testPartitionPruningTruncatedStringComparingValueShorterThanPartitio public void testPartitionPruningHourlyPartition() { String filterCond; if (spark.version().startsWith("2")) { - // Looks like from Spark 2 we need to compare timestamp with timestamp to push down the filter. + // Looks like from Spark 2 we need to compare timestamp with timestamp to push down the + // filter. 
filterCond = "timestamp >= to_timestamp('2020-02-03T01:00:00')"; } else { filterCond = "timestamp >= '2020-02-03T01:00:00'"; } - Predicate partCondition = (Row r) -> { - int hourValue = r.getInt(4); - Instant instant = getInstant("2020-02-03T01:00:00"); - Integer hourValueToFilter = hourTransform.apply(TimeUnit.MILLISECONDS.toMicros(instant.toEpochMilli())); - return hourValue >= hourValueToFilter; - }; + Predicate partCondition = + (Row r) -> { + int hourValue = r.getInt(4); + Instant instant = getInstant("2020-02-03T01:00:00"); + Integer hourValueToFilter = + hourTransform.apply(TimeUnit.MILLISECONDS.toMicros(instant.toEpochMilli())); + return hourValue >= hourValueToFilter; + }; runTest(filterCond, partCondition); } @@ -242,24 +258,26 @@ private void runTest(String filterCond, Predicate partCondition) { Dataset logs = createTestDataset(); saveTestDatasetToTable(logs, table); - List expected = logs - .select("id", "date", "level", "message", "timestamp") - .filter(filterCond) - .orderBy("id") - .collectAsList(); + List expected = + logs.select("id", "date", "level", "message", "timestamp") + .filter(filterCond) + .orderBy("id") + .collectAsList(); Assert.assertFalse("Expected rows should be not empty", expected.isEmpty()); // remove records which may be recorded during storing to table CountOpenLocalFileSystem.resetRecordsInPathPrefix(originTableLocation.getAbsolutePath()); - List actual = spark.read() - .format("iceberg") - .option(SparkReadOptions.VECTORIZATION_ENABLED, String.valueOf(vectorized)) - .load(table.location()) - .select("id", "date", "level", "message", "timestamp") - .filter(filterCond) - .orderBy("id") - .collectAsList(); + List actual = + spark + .read() + .format("iceberg") + .option(SparkReadOptions.VECTORIZATION_ENABLED, String.valueOf(vectorized)) + .load(table.location()) + .select("id", "date", "level", "message", "timestamp") + .filter(filterCond) + .orderBy("id") + .collectAsList(); Assert.assertFalse("Actual rows should not be empty", actual.isEmpty()); Assert.assertEquals("Rows should match", expected, actual); @@ -282,40 +300,59 @@ private Table createTable(File originTableLocation) { } private Dataset createTestDataset() { - List rows = LOGS.stream().map(logMessage -> { - Object[] underlying = new Object[] { - logMessage.getId(), - UTF8String.fromString(logMessage.getDate()), - UTF8String.fromString(logMessage.getLevel()), - UTF8String.fromString(logMessage.getMessage()), - // discard the nanoseconds part to simplify - TimeUnit.MILLISECONDS.toMicros(logMessage.getTimestamp().toEpochMilli()) - }; - return new GenericInternalRow(underlying); - }).collect(Collectors.toList()); + List rows = + LOGS.stream() + .map( + logMessage -> { + Object[] underlying = + new Object[] { + logMessage.getId(), + UTF8String.fromString(logMessage.getDate()), + UTF8String.fromString(logMessage.getLevel()), + UTF8String.fromString(logMessage.getMessage()), + // discard the nanoseconds part to simplify + TimeUnit.MILLISECONDS.toMicros(logMessage.getTimestamp().toEpochMilli()) + }; + return new GenericInternalRow(underlying); + }) + .collect(Collectors.toList()); JavaRDD rdd = sparkContext.parallelize(rows); - Dataset df = spark.internalCreateDataFrame(JavaRDD.toRDD(rdd), SparkSchemaUtil.convert(LOG_SCHEMA), false); - - return df - .selectExpr("id", "date", "level", "message", "timestamp") - .selectExpr("id", "date", "level", "message", "timestamp", "bucket3(id) AS bucket_id", - "truncate5(message) AS truncated_message", "hour(timestamp) AS ts_hour"); + Dataset df = + 
spark.internalCreateDataFrame( + JavaRDD.toRDD(rdd), SparkSchemaUtil.convert(LOG_SCHEMA), false); + + return df.selectExpr("id", "date", "level", "message", "timestamp") + .selectExpr( + "id", + "date", + "level", + "message", + "timestamp", + "bucket3(id) AS bucket_id", + "truncate5(message) AS truncated_message", + "hour(timestamp) AS ts_hour"); } private void saveTestDatasetToTable(Dataset logs, Table table) { logs.orderBy("date", "level", "bucket_id", "truncated_message", "ts_hour") .select("id", "date", "level", "message", "timestamp") - .write().format("iceberg").mode("append").save(table.location()); + .write() + .format("iceberg") + .mode("append") + .save(table.location()); } - private void assertAccessOnDataFiles(File originTableLocation, Table table, Predicate partCondition) { + private void assertAccessOnDataFiles( + File originTableLocation, Table table, Predicate partCondition) { // only use files in current table location to avoid side-effects on concurrent test runs - Set readFilesInQuery = CountOpenLocalFileSystem.pathToNumOpenCalled.keySet() - .stream().filter(path -> path.startsWith(originTableLocation.getAbsolutePath())) - .collect(Collectors.toSet()); + Set readFilesInQuery = + CountOpenLocalFileSystem.pathToNumOpenCalled.keySet().stream() + .filter(path -> path.startsWith(originTableLocation.getAbsolutePath())) + .collect(Collectors.toSet()); - List files = spark.read().format("iceberg").load(table.location() + "#files").collectAsList(); + List files = + spark.read().format("iceberg").load(table.location() + "#files").collectAsList(); Set filesToRead = extractFilePathsMatchingConditionOnPartition(files, partCondition); Set filesToNotRead = extractFilePathsNotIn(files, filesToRead); @@ -325,37 +362,51 @@ private void assertAccessOnDataFiles(File originTableLocation, Table table, Pred Assert.assertFalse("The query should prune some data files.", filesToNotRead.isEmpty()); - // We don't check "all" data files bound to the condition are being read, as data files can be pruned on + // We don't check "all" data files bound to the condition are being read, as data files can be + // pruned on // other conditions like lower/upper bound of columns. - Assert.assertFalse("Some of data files in partition range should be read. " + - "Read files in query: " + readFilesInQuery + " / data files in partition range: " + filesToRead, + Assert.assertFalse( + "Some of data files in partition range should be read. " + + "Read files in query: " + + readFilesInQuery + + " / data files in partition range: " + + filesToRead, Sets.intersection(filesToRead, readFilesInQuery).isEmpty()); // Data files which aren't bound to the condition shouldn't be read. - Assert.assertTrue("Data files outside of partition range should not be read. " + - "Read files in query: " + readFilesInQuery + " / data files outside of partition range: " + filesToNotRead, + Assert.assertTrue( + "Data files outside of partition range should not be read. 
" + + "Read files in query: " + + readFilesInQuery + + " / data files outside of partition range: " + + filesToNotRead, Sets.intersection(filesToNotRead, readFilesInQuery).isEmpty()); } - private Set extractFilePathsMatchingConditionOnPartition(List files, Predicate condition) { + private Set extractFilePathsMatchingConditionOnPartition( + List files, Predicate condition) { // idx 1: file_path, idx 3: partition return files.stream() - .filter(r -> { - Row partition = r.getStruct(4); - return condition.test(partition); - }).map(r -> CountOpenLocalFileSystem.stripScheme(r.getString(1))) + .filter( + r -> { + Row partition = r.getStruct(4); + return condition.test(partition); + }) + .map(r -> CountOpenLocalFileSystem.stripScheme(r.getString(1))) .collect(Collectors.toSet()); } private Set extractFilePathsNotIn(List files, Set filePaths) { - Set allFilePaths = files.stream().map(r -> CountOpenLocalFileSystem.stripScheme(r.getString(1))) - .collect(Collectors.toSet()); + Set allFilePaths = + files.stream() + .map(r -> CountOpenLocalFileSystem.stripScheme(r.getString(1))) + .collect(Collectors.toSet()); return Sets.newHashSet(Sets.symmetricDifference(allFilePaths, filePaths)); } public static class CountOpenLocalFileSystem extends RawLocalFileSystem { - public static String scheme = String.format("TestIdentityPartitionData%dfs", - new Random().nextInt()); + public static String scheme = + String.format("TestIdentityPartitionData%dfs", new Random().nextInt()); public static Map pathToNumOpenCalled = Maps.newConcurrentMap(); public static String convertPath(String absPath) { @@ -401,13 +452,15 @@ public String getScheme() { @Override public FSDataInputStream open(Path f, int bufferSize) throws IOException { String path = f.toUri().getPath(); - pathToNumOpenCalled.compute(path, (ignored, v) -> { - if (v == null) { - return 1L; - } else { - return v + 1; - } - }); + pathToNumOpenCalled.compute( + path, + (ignored, v) -> { + if (v == null) { + return 1L; + } else { + return v + 1; + } + }); return super.open(f, bufferSize); } } diff --git a/spark/v3.0/spark/src/test/java/org/apache/iceberg/spark/source/TestPartitionValues.java b/spark/v3.0/spark/src/test/java/org/apache/iceberg/spark/source/TestPartitionValues.java index f63181766852..fedac9aee3ac 100644 --- a/spark/v3.0/spark/src/test/java/org/apache/iceberg/spark/source/TestPartitionValues.java +++ b/spark/v3.0/spark/src/test/java/org/apache/iceberg/spark/source/TestPartitionValues.java @@ -16,9 +16,11 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.spark.source; +import static org.apache.iceberg.types.Types.NestedField.optional; +import static org.apache.iceberg.types.Types.NestedField.required; + import java.io.File; import java.util.List; import org.apache.avro.generic.GenericData; @@ -56,46 +58,43 @@ import org.junit.runner.RunWith; import org.junit.runners.Parameterized; -import static org.apache.iceberg.types.Types.NestedField.optional; -import static org.apache.iceberg.types.Types.NestedField.required; - @RunWith(Parameterized.class) public class TestPartitionValues { @Parameterized.Parameters(name = "format = {0}, vectorized = {1}") public static Object[][] parameters() { return new Object[][] { - { "parquet", false }, - { "parquet", true }, - { "avro", false }, - { "orc", false }, - { "orc", true } + {"parquet", false}, + {"parquet", true}, + {"avro", false}, + {"orc", false}, + {"orc", true} }; } - private static final Schema SUPPORTED_PRIMITIVES = new Schema( - required(100, "id", Types.LongType.get()), - required(101, "data", Types.StringType.get()), - required(102, "b", Types.BooleanType.get()), - required(103, "i", Types.IntegerType.get()), - required(104, "l", Types.LongType.get()), - required(105, "f", Types.FloatType.get()), - required(106, "d", Types.DoubleType.get()), - required(107, "date", Types.DateType.get()), - required(108, "ts", Types.TimestampType.withZone()), - required(110, "s", Types.StringType.get()), - required(113, "bytes", Types.BinaryType.get()), - required(114, "dec_9_0", Types.DecimalType.of(9, 0)), - required(115, "dec_11_2", Types.DecimalType.of(11, 2)), - required(116, "dec_38_10", Types.DecimalType.of(38, 10)) // spark's maximum precision - ); - - private static final Schema SIMPLE_SCHEMA = new Schema( - optional(1, "id", Types.IntegerType.get()), - optional(2, "data", Types.StringType.get())); - - private static final PartitionSpec SPEC = PartitionSpec.builderFor(SIMPLE_SCHEMA) - .identity("data") - .build(); + private static final Schema SUPPORTED_PRIMITIVES = + new Schema( + required(100, "id", Types.LongType.get()), + required(101, "data", Types.StringType.get()), + required(102, "b", Types.BooleanType.get()), + required(103, "i", Types.IntegerType.get()), + required(104, "l", Types.LongType.get()), + required(105, "f", Types.FloatType.get()), + required(106, "d", Types.DoubleType.get()), + required(107, "date", Types.DateType.get()), + required(108, "ts", Types.TimestampType.withZone()), + required(110, "s", Types.StringType.get()), + required(113, "bytes", Types.BinaryType.get()), + required(114, "dec_9_0", Types.DecimalType.of(9, 0)), + required(115, "dec_11_2", Types.DecimalType.of(11, 2)), + required(116, "dec_38_10", Types.DecimalType.of(38, 10)) // spark's maximum precision + ); + + private static final Schema SIMPLE_SCHEMA = + new Schema( + optional(1, "id", Types.IntegerType.get()), optional(2, "data", Types.StringType.get())); + + private static final PartitionSpec SPEC = + PartitionSpec.builderFor(SIMPLE_SCHEMA).identity("data").build(); private static SparkSession spark = null; @@ -111,8 +110,7 @@ public static void stopSpark() { currentSpark.stop(); } - @Rule - public TemporaryFolder temp = new TemporaryFolder(); + @Rule public TemporaryFolder temp = new TemporaryFolder(); private final String format; private final boolean vectorized; @@ -134,29 +132,30 @@ public void testNullPartitionValue() throws Exception { Table table = tables.create(SIMPLE_SCHEMA, SPEC, location.toString()); table.updateProperties().set(TableProperties.DEFAULT_FILE_FORMAT, 
format).commit(); - List expected = Lists.newArrayList( - new SimpleRecord(1, "a"), - new SimpleRecord(2, "b"), - new SimpleRecord(3, "c"), - new SimpleRecord(4, null) - ); + List expected = + Lists.newArrayList( + new SimpleRecord(1, "a"), + new SimpleRecord(2, "b"), + new SimpleRecord(3, "c"), + new SimpleRecord(4, null)); Dataset df = spark.createDataFrame(expected, SimpleRecord.class); - df.select("id", "data").write() + df.select("id", "data") + .write() .format("iceberg") .mode(SaveMode.Append) .save(location.toString()); - Dataset result = spark.read() - .format("iceberg") - .option(SparkReadOptions.VECTORIZATION_ENABLED, String.valueOf(vectorized)) - .load(location.toString()); + Dataset result = + spark + .read() + .format("iceberg") + .option(SparkReadOptions.VECTORIZATION_ENABLED, String.valueOf(vectorized)) + .load(location.toString()); - List actual = result - .orderBy("id") - .as(Encoders.bean(SimpleRecord.class)) - .collectAsList(); + List actual = + result.orderBy("id").as(Encoders.bean(SimpleRecord.class)).collectAsList(); Assert.assertEquals("Number of rows should match", expected.size(), actual.size()); Assert.assertEquals("Result rows should match", expected, actual); @@ -174,29 +173,28 @@ public void testReorderedColumns() throws Exception { Table table = tables.create(SIMPLE_SCHEMA, SPEC, location.toString()); table.updateProperties().set(TableProperties.DEFAULT_FILE_FORMAT, format).commit(); - List expected = Lists.newArrayList( - new SimpleRecord(1, "a"), - new SimpleRecord(2, "b"), - new SimpleRecord(3, "c") - ); + List expected = + Lists.newArrayList( + new SimpleRecord(1, "a"), new SimpleRecord(2, "b"), new SimpleRecord(3, "c")); Dataset df = spark.createDataFrame(expected, SimpleRecord.class); - df.select("data", "id").write() - .format("iceberg") - .mode(SaveMode.Append) - .option(SparkWriteOptions.CHECK_ORDERING, "false") - .save(location.toString()); + df.select("data", "id") + .write() + .format("iceberg") + .mode(SaveMode.Append) + .option(SparkWriteOptions.CHECK_ORDERING, "false") + .save(location.toString()); - Dataset result = spark.read() + Dataset result = + spark + .read() .format("iceberg") .option(SparkReadOptions.VECTORIZATION_ENABLED, String.valueOf(vectorized)) .load(location.toString()); - List actual = result - .orderBy("id") - .as(Encoders.bean(SimpleRecord.class)) - .collectAsList(); + List actual = + result.orderBy("id").as(Encoders.bean(SimpleRecord.class)).collectAsList(); Assert.assertEquals("Number of rows should match", expected.size(), actual.size()); Assert.assertEquals("Result rows should match", expected, actual); @@ -214,30 +212,29 @@ public void testReorderedColumnsNoNullability() throws Exception { Table table = tables.create(SIMPLE_SCHEMA, SPEC, location.toString()); table.updateProperties().set(TableProperties.DEFAULT_FILE_FORMAT, format).commit(); - List expected = Lists.newArrayList( - new SimpleRecord(1, "a"), - new SimpleRecord(2, "b"), - new SimpleRecord(3, "c") - ); + List expected = + Lists.newArrayList( + new SimpleRecord(1, "a"), new SimpleRecord(2, "b"), new SimpleRecord(3, "c")); Dataset df = spark.createDataFrame(expected, SimpleRecord.class); - df.select("data", "id").write() - .format("iceberg") - .mode(SaveMode.Append) - .option(SparkWriteOptions.CHECK_ORDERING, "false") - .option(SparkWriteOptions.CHECK_NULLABILITY, "false") - .save(location.toString()); + df.select("data", "id") + .write() + .format("iceberg") + .mode(SaveMode.Append) + .option(SparkWriteOptions.CHECK_ORDERING, "false") + 
.option(SparkWriteOptions.CHECK_NULLABILITY, "false") + .save(location.toString()); - Dataset result = spark.read() + Dataset result = + spark + .read() .format("iceberg") .option(SparkReadOptions.VECTORIZATION_ENABLED, String.valueOf(vectorized)) .load(location.toString()); - List actual = result - .orderBy("id") - .as(Encoders.bean(SimpleRecord.class)) - .collectAsList(); + List actual = + result.orderBy("id").as(Encoders.bean(SimpleRecord.class)).collectAsList(); Assert.assertEquals("Number of rows should match", expected.size(), actual.size()); Assert.assertEquals("Result rows should match", expected, actual); @@ -245,9 +242,10 @@ public void testReorderedColumnsNoNullability() throws Exception { @Test public void testPartitionValueTypes() throws Exception { - String[] columnNames = new String[] { - "b", "i", "l", "f", "d", "date", "ts", "s", "bytes", "dec_9_0", "dec_11_2", "dec_38_10" - }; + String[] columnNames = + new String[] { + "b", "i", "l", "f", "d", "date", "ts", "s", "bytes", "dec_9_0", "dec_11_2", "dec_38_10" + }; HadoopTables tables = new HadoopTables(spark.sessionState().newHadoopConf()); @@ -259,23 +257,27 @@ public void testPartitionValueTypes() throws Exception { List expected = RandomData.generateList(source.schema(), 2, 128735L); File avroData = temp.newFile("data.avro"); Assert.assertTrue(avroData.delete()); - try (FileAppender appender = Avro.write(Files.localOutput(avroData)) - .schema(source.schema()) - .build()) { + try (FileAppender appender = + Avro.write(Files.localOutput(avroData)).schema(source.schema()).build()) { appender.addAll(expected); } // add the Avro data file to the source table - source.newAppend() - .appendFile(DataFiles.builder(PartitionSpec.unpartitioned()) - .withRecordCount(10) - .withInputFile(Files.localInput(avroData)) - .build()) + source + .newAppend() + .appendFile( + DataFiles.builder(PartitionSpec.unpartitioned()) + .withRecordCount(10) + .withInputFile(Files.localInput(avroData)) + .build()) .commit(); - Dataset sourceDF = spark.read().format("iceberg") - .option(SparkReadOptions.VECTORIZATION_ENABLED, String.valueOf(vectorized)) - .load(sourceLocation); + Dataset sourceDF = + spark + .read() + .format("iceberg") + .option(SparkReadOptions.VECTORIZATION_ENABLED, String.valueOf(vectorized)) + .load(sourceLocation); for (String column : columnNames) { String desc = "partition_by_" + SUPPORTED_PRIMITIVES.findType(column).toString(); @@ -290,16 +292,15 @@ public void testPartitionValueTypes() throws Exception { Table table = tables.create(SUPPORTED_PRIMITIVES, spec, location.toString()); table.updateProperties().set(TableProperties.DEFAULT_FILE_FORMAT, format).commit(); - sourceDF.write() - .format("iceberg") - .mode(SaveMode.Append) - .save(location.toString()); + sourceDF.write().format("iceberg").mode(SaveMode.Append).save(location.toString()); - List actual = spark.read() - .format("iceberg") - .option(SparkReadOptions.VECTORIZATION_ENABLED, String.valueOf(vectorized)) - .load(location.toString()) - .collectAsList(); + List actual = + spark + .read() + .format("iceberg") + .option(SparkReadOptions.VECTORIZATION_ENABLED, String.valueOf(vectorized)) + .load(location.toString()) + .collectAsList(); Assert.assertEquals("Number of rows should match", expected.size(), actual.size()); @@ -312,9 +313,10 @@ public void testPartitionValueTypes() throws Exception { @Test public void testNestedPartitionValues() throws Exception { - String[] columnNames = new String[] { - "b", "i", "l", "f", "d", "date", "ts", "s", "bytes", "dec_9_0", "dec_11_2", 
"dec_38_10" - }; + String[] columnNames = + new String[] { + "b", "i", "l", "f", "d", "date", "ts", "s", "bytes", "dec_9_0", "dec_11_2", "dec_38_10" + }; HadoopTables tables = new HadoopTables(spark.sessionState().newHadoopConf()); Schema nestedSchema = new Schema(optional(1, "nested", SUPPORTED_PRIMITIVES.asStruct())); @@ -327,23 +329,27 @@ public void testNestedPartitionValues() throws Exception { List expected = RandomData.generateList(source.schema(), 2, 128735L); File avroData = temp.newFile("data.avro"); Assert.assertTrue(avroData.delete()); - try (FileAppender appender = Avro.write(Files.localOutput(avroData)) - .schema(source.schema()) - .build()) { + try (FileAppender appender = + Avro.write(Files.localOutput(avroData)).schema(source.schema()).build()) { appender.addAll(expected); } // add the Avro data file to the source table - source.newAppend() - .appendFile(DataFiles.builder(PartitionSpec.unpartitioned()) - .withRecordCount(10) - .withInputFile(Files.localInput(avroData)) - .build()) + source + .newAppend() + .appendFile( + DataFiles.builder(PartitionSpec.unpartitioned()) + .withRecordCount(10) + .withInputFile(Files.localInput(avroData)) + .build()) .commit(); - Dataset sourceDF = spark.read().format("iceberg") - .option(SparkReadOptions.VECTORIZATION_ENABLED, String.valueOf(vectorized)) - .load(sourceLocation); + Dataset sourceDF = + spark + .read() + .format("iceberg") + .option(SparkReadOptions.VECTORIZATION_ENABLED, String.valueOf(vectorized)) + .load(sourceLocation); for (String column : columnNames) { String desc = "partition_by_" + SUPPORTED_PRIMITIVES.findType(column).toString(); @@ -353,45 +359,46 @@ public void testNestedPartitionValues() throws Exception { File dataFolder = new File(location, "data"); Assert.assertTrue("mkdirs should succeed", dataFolder.mkdirs()); - PartitionSpec spec = PartitionSpec.builderFor(nestedSchema).identity("nested." + column).build(); + PartitionSpec spec = + PartitionSpec.builderFor(nestedSchema).identity("nested." + column).build(); Table table = tables.create(nestedSchema, spec, location.toString()); table.updateProperties().set(TableProperties.DEFAULT_FILE_FORMAT, format).commit(); - sourceDF.write() - .format("iceberg") - .mode(SaveMode.Append) - .save(location.toString()); + sourceDF.write().format("iceberg").mode(SaveMode.Append).save(location.toString()); - List actual = spark.read() - .format("iceberg") - .option(SparkReadOptions.VECTORIZATION_ENABLED, String.valueOf(vectorized)) - .load(location.toString()) - .collectAsList(); + List actual = + spark + .read() + .format("iceberg") + .option(SparkReadOptions.VECTORIZATION_ENABLED, String.valueOf(vectorized)) + .load(location.toString()) + .collectAsList(); Assert.assertEquals("Number of rows should match", expected.size(), actual.size()); for (int i = 0; i < expected.size(); i += 1) { - TestHelpers.assertEqualsSafe( - nestedSchema.asStruct(), expected.get(i), actual.get(i)); + TestHelpers.assertEqualsSafe(nestedSchema.asStruct(), expected.get(i), actual.get(i)); } } } /** * To verify if WrappedPositionAccessor is generated against a string field within a nested field, - * rather than a Position2Accessor. - * Or when building the partition path, a ClassCastException is thrown with the message like: - * Cannot cast org.apache.spark.unsafe.types.UTF8String to java.lang.CharSequence + * rather than a Position2Accessor. 
Or when building the partition path, a ClassCastException is + * thrown with the message like: Cannot cast org.apache.spark.unsafe.types.UTF8String to + * java.lang.CharSequence */ @Test public void testPartitionedByNestedString() throws Exception { // schema and partition spec - Schema nestedSchema = new Schema( - Types.NestedField.required(1, "struct", - Types.StructType.of(Types.NestedField.required(2, "string", Types.StringType.get())) - ) - ); + Schema nestedSchema = + new Schema( + Types.NestedField.required( + 1, + "struct", + Types.StructType.of( + Types.NestedField.required(2, "string", Types.StringType.get())))); PartitionSpec spec = PartitionSpec.builderFor(nestedSchema).identity("struct.string").build(); // create table @@ -401,14 +408,14 @@ public void testPartitionedByNestedString() throws Exception { // input data frame StructField[] structFields = { - new StructField("struct", - DataTypes.createStructType( - new StructField[] { - new StructField("string", DataTypes.StringType, false, Metadata.empty()) - } - ), - false, Metadata.empty() - ) + new StructField( + "struct", + DataTypes.createStructType( + new StructField[] { + new StructField("string", DataTypes.StringType, false, Metadata.empty()) + }), + false, + Metadata.empty()) }; List rows = Lists.newArrayList(); @@ -416,17 +423,16 @@ public void testPartitionedByNestedString() throws Exception { Dataset sourceDF = spark.createDataFrame(rows, new StructType(structFields)); // write into iceberg - sourceDF.write() - .format("iceberg") - .mode(SaveMode.Append) - .save(baseLocation); + sourceDF.write().format("iceberg").mode(SaveMode.Append).save(baseLocation); // verify - List actual = spark.read() - .format("iceberg") - .option(SparkReadOptions.VECTORIZATION_ENABLED, String.valueOf(vectorized)) - .load(baseLocation) - .collectAsList(); + List actual = + spark + .read() + .format("iceberg") + .option(SparkReadOptions.VECTORIZATION_ENABLED, String.valueOf(vectorized)) + .load(baseLocation) + .collectAsList(); Assert.assertEquals("Number of rows should match", rows.size(), actual.size()); } diff --git a/spark/v3.0/spark/src/test/java/org/apache/iceberg/spark/source/TestPathIdentifier.java b/spark/v3.0/spark/src/test/java/org/apache/iceberg/spark/source/TestPathIdentifier.java index ff4fe22a7a8a..f58451296cef 100644 --- a/spark/v3.0/spark/src/test/java/org/apache/iceberg/spark/source/TestPathIdentifier.java +++ b/spark/v3.0/spark/src/test/java/org/apache/iceberg/spark/source/TestPathIdentifier.java @@ -16,9 +16,10 @@ * specific language governing permissions and limitations * under the License. 
  */
-
 package org.apache.iceberg.spark.source;
 
+import static org.apache.iceberg.types.Types.NestedField.required;
+
 import java.io.File;
 import java.io.IOException;
 import org.apache.iceberg.BaseTable;
@@ -42,16 +43,13 @@
 import org.junit.Test;
 import org.junit.rules.TemporaryFolder;
 
-import static org.apache.iceberg.types.Types.NestedField.required;
-
 public class TestPathIdentifier extends SparkTestBase {
 
-  private static final Schema SCHEMA = new Schema(
-      required(1, "id", Types.LongType.get()),
-      required(2, "data", Types.StringType.get()));
+  private static final Schema SCHEMA =
+      new Schema(
+          required(1, "id", Types.LongType.get()), required(2, "data", Types.StringType.get()));
 
-  @Rule
-  public TemporaryFolder temp = new TemporaryFolder();
+  @Rule public TemporaryFolder temp = new TemporaryFolder();
   private File tableLocation;
   private PathIdentifier identifier;
   private SparkCatalog sparkCatalog;
@@ -72,17 +70,16 @@ public void after() {
 
   @Test
   public void testPathIdentifier() throws TableAlreadyExistsException, NoSuchTableException {
-    SparkTable table = sparkCatalog.createTable(identifier,
-        SparkSchemaUtil.convert(SCHEMA),
-        new Transform[0],
-        ImmutableMap.of());
+    SparkTable table =
+        sparkCatalog.createTable(
+            identifier, SparkSchemaUtil.convert(SCHEMA), new Transform[0], ImmutableMap.of());
 
     Assert.assertEquals(table.table().location(), tableLocation.getAbsolutePath());
     Assertions.assertThat(table.table()).isInstanceOf(BaseTable.class);
-    Assertions.assertThat(((BaseTable) table.table()).operations()).isInstanceOf(HadoopTableOperations.class);
+    Assertions.assertThat(((BaseTable) table.table()).operations())
+        .isInstanceOf(HadoopTableOperations.class);
 
     Assert.assertEquals(sparkCatalog.loadTable(identifier), table);
     Assert.assertTrue(sparkCatalog.dropTable(identifier));
   }
 }
-
diff --git a/spark/v3.0/spark/src/test/java/org/apache/iceberg/spark/source/TestReadProjection.java b/spark/v3.0/spark/src/test/java/org/apache/iceberg/spark/source/TestReadProjection.java
index 8d65b64cab6d..cfc746f6e932 100644
--- a/spark/v3.0/spark/src/test/java/org/apache/iceberg/spark/source/TestReadProjection.java
+++ b/spark/v3.0/spark/src/test/java/org/apache/iceberg/spark/source/TestReadProjection.java
@@ -16,9 +16,10 @@
  * specific language governing permissions and limitations
  * under the License.
*/ - package org.apache.iceberg.spark.source; +import static org.apache.avro.Schema.Type.UNION; + import java.io.IOException; import java.util.List; import java.util.Map; @@ -37,8 +38,6 @@ import org.junit.Test; import org.junit.rules.TemporaryFolder; -import static org.apache.avro.Schema.Type.UNION; - public abstract class TestReadProjection { final String format; @@ -46,20 +45,17 @@ public abstract class TestReadProjection { this.format = format; } - protected abstract Record writeAndRead(String desc, - Schema writeSchema, - Schema readSchema, - Record record) throws IOException; + protected abstract Record writeAndRead( + String desc, Schema writeSchema, Schema readSchema, Record record) throws IOException; - @Rule - public TemporaryFolder temp = new TemporaryFolder(); + @Rule public TemporaryFolder temp = new TemporaryFolder(); @Test public void testFullProjection() throws Exception { - Schema schema = new Schema( - Types.NestedField.required(0, "id", Types.LongType.get()), - Types.NestedField.optional(1, "data", Types.StringType.get()) - ); + Schema schema = + new Schema( + Types.NestedField.required(0, "id", Types.LongType.get()), + Types.NestedField.optional(1, "data", Types.StringType.get())); Record record = GenericRecord.create(schema); record.setField("id", 34L); @@ -67,32 +63,33 @@ public void testFullProjection() throws Exception { Record projected = writeAndRead("full_projection", schema, schema, record); - Assert.assertEquals("Should contain the correct id value", 34L, (long) projected.getField("id")); + Assert.assertEquals( + "Should contain the correct id value", 34L, (long) projected.getField("id")); - int cmp = Comparators.charSequences() - .compare("test", (CharSequence) projected.getField("data")); + int cmp = + Comparators.charSequences().compare("test", (CharSequence) projected.getField("data")); Assert.assertEquals("Should contain the correct data value", 0, cmp); } @Test public void testReorderedFullProjection() throws Exception { -// Assume.assumeTrue( -// "Spark's Parquet read support does not support reordered columns", -// !format.equalsIgnoreCase("parquet")); + // Assume.assumeTrue( + // "Spark's Parquet read support does not support reordered columns", + // !format.equalsIgnoreCase("parquet")); - Schema schema = new Schema( - Types.NestedField.required(0, "id", Types.LongType.get()), - Types.NestedField.optional(1, "data", Types.StringType.get()) - ); + Schema schema = + new Schema( + Types.NestedField.required(0, "id", Types.LongType.get()), + Types.NestedField.optional(1, "data", Types.StringType.get())); Record record = GenericRecord.create(schema); record.setField("id", 34L); record.setField("data", "test"); - Schema reordered = new Schema( - Types.NestedField.optional(1, "data", Types.StringType.get()), - Types.NestedField.required(0, "id", Types.LongType.get()) - ); + Schema reordered = + new Schema( + Types.NestedField.optional(1, "data", Types.StringType.get()), + Types.NestedField.required(0, "id", Types.LongType.get())); Record projected = writeAndRead("reordered_full_projection", schema, reordered, record); @@ -102,24 +99,24 @@ public void testReorderedFullProjection() throws Exception { @Test public void testReorderedProjection() throws Exception { -// Assume.assumeTrue( -// "Spark's Parquet read support does not support reordered columns", -// !format.equalsIgnoreCase("parquet")); + // Assume.assumeTrue( + // "Spark's Parquet read support does not support reordered columns", + // !format.equalsIgnoreCase("parquet")); - Schema schema = new Schema( 
- Types.NestedField.required(0, "id", Types.LongType.get()), - Types.NestedField.optional(1, "data", Types.StringType.get()) - ); + Schema schema = + new Schema( + Types.NestedField.required(0, "id", Types.LongType.get()), + Types.NestedField.optional(1, "data", Types.StringType.get())); Record record = GenericRecord.create(schema); record.setField("id", 34L); record.setField("data", "test"); - Schema reordered = new Schema( - Types.NestedField.optional(2, "missing_1", Types.StringType.get()), - Types.NestedField.optional(1, "data", Types.StringType.get()), - Types.NestedField.optional(3, "missing_2", Types.LongType.get()) - ); + Schema reordered = + new Schema( + Types.NestedField.optional(2, "missing_1", Types.StringType.get()), + Types.NestedField.optional(1, "data", Types.StringType.get()), + Types.NestedField.optional(3, "missing_2", Types.LongType.get())); Record projected = writeAndRead("reordered_projection", schema, reordered, record); @@ -130,10 +127,10 @@ public void testReorderedProjection() throws Exception { @Test public void testEmptyProjection() throws Exception { - Schema schema = new Schema( - Types.NestedField.required(0, "id", Types.LongType.get()), - Types.NestedField.optional(1, "data", Types.StringType.get()) - ); + Schema schema = + new Schema( + Types.NestedField.required(0, "id", Types.LongType.get()), + Types.NestedField.optional(1, "data", Types.StringType.get())); Record record = GenericRecord.create(schema); record.setField("id", 34L); @@ -152,68 +149,68 @@ public void testEmptyProjection() throws Exception { @Test public void testBasicProjection() throws Exception { - Schema writeSchema = new Schema( - Types.NestedField.required(0, "id", Types.LongType.get()), - Types.NestedField.optional(1, "data", Types.StringType.get()) - ); + Schema writeSchema = + new Schema( + Types.NestedField.required(0, "id", Types.LongType.get()), + Types.NestedField.optional(1, "data", Types.StringType.get())); Record record = GenericRecord.create(writeSchema); record.setField("id", 34L); record.setField("data", "test"); - Schema idOnly = new Schema( - Types.NestedField.required(0, "id", Types.LongType.get()) - ); + Schema idOnly = new Schema(Types.NestedField.required(0, "id", Types.LongType.get())); Record projected = writeAndRead("basic_projection_id", writeSchema, idOnly, record); Assert.assertNull("Should not project data", projected.getField("data")); - Assert.assertEquals("Should contain the correct id value", 34L, (long) projected.getField("id")); + Assert.assertEquals( + "Should contain the correct id value", 34L, (long) projected.getField("id")); - Schema dataOnly = new Schema( - Types.NestedField.optional(1, "data", Types.StringType.get()) - ); + Schema dataOnly = new Schema(Types.NestedField.optional(1, "data", Types.StringType.get())); projected = writeAndRead("basic_projection_data", writeSchema, dataOnly, record); Assert.assertNull("Should not project id", projected.getField("id")); - int cmp = Comparators.charSequences() - .compare("test", (CharSequence) projected.getField("data")); + int cmp = + Comparators.charSequences().compare("test", (CharSequence) projected.getField("data")); Assert.assertEquals("Should contain the correct data value", 0, cmp); } @Test public void testRename() throws Exception { - Schema writeSchema = new Schema( - Types.NestedField.required(0, "id", Types.LongType.get()), - Types.NestedField.optional(1, "data", Types.StringType.get()) - ); + Schema writeSchema = + new Schema( + Types.NestedField.required(0, "id", Types.LongType.get()), + 
Types.NestedField.optional(1, "data", Types.StringType.get())); Record record = GenericRecord.create(writeSchema); record.setField("id", 34L); record.setField("data", "test"); - Schema readSchema = new Schema( - Types.NestedField.required(0, "id", Types.LongType.get()), - Types.NestedField.optional(1, "renamed", Types.StringType.get()) - ); + Schema readSchema = + new Schema( + Types.NestedField.required(0, "id", Types.LongType.get()), + Types.NestedField.optional(1, "renamed", Types.StringType.get())); Record projected = writeAndRead("project_and_rename", writeSchema, readSchema, record); - Assert.assertEquals("Should contain the correct id value", 34L, (long) projected.getField("id")); - int cmp = Comparators.charSequences() - .compare("test", (CharSequence) projected.getField("renamed")); + Assert.assertEquals( + "Should contain the correct id value", 34L, (long) projected.getField("id")); + int cmp = + Comparators.charSequences().compare("test", (CharSequence) projected.getField("renamed")); Assert.assertEquals("Should contain the correct data/renamed value", 0, cmp); } @Test public void testNestedStructProjection() throws Exception { - Schema writeSchema = new Schema( - Types.NestedField.required(0, "id", Types.LongType.get()), - Types.NestedField.optional(3, "location", Types.StructType.of( - Types.NestedField.required(1, "lat", Types.FloatType.get()), - Types.NestedField.required(2, "long", Types.FloatType.get()) - )) - ); + Schema writeSchema = + new Schema( + Types.NestedField.required(0, "id", Types.LongType.get()), + Types.NestedField.optional( + 3, + "location", + Types.StructType.of( + Types.NestedField.required(1, "lat", Types.FloatType.get()), + Types.NestedField.required(2, "long", Types.FloatType.get())))); Record record = GenericRecord.create(writeSchema); record.setField("id", 34L); @@ -222,61 +219,76 @@ public void testNestedStructProjection() throws Exception { location.setField("long", -1.539054f); record.setField("location", location); - Schema idOnly = new Schema( - Types.NestedField.required(0, "id", Types.LongType.get()) - ); + Schema idOnly = new Schema(Types.NestedField.required(0, "id", Types.LongType.get())); Record projected = writeAndRead("id_only", writeSchema, idOnly, record); Record projectedLocation = (Record) projected.getField("location"); - Assert.assertEquals("Should contain the correct id value", 34L, (long) projected.getField("id")); + Assert.assertEquals( + "Should contain the correct id value", 34L, (long) projected.getField("id")); Assert.assertNull("Should not project location", projectedLocation); - Schema latOnly = new Schema( - Types.NestedField.optional(3, "location", Types.StructType.of( - Types.NestedField.required(1, "lat", Types.FloatType.get()) - )) - ); + Schema latOnly = + new Schema( + Types.NestedField.optional( + 3, + "location", + Types.StructType.of(Types.NestedField.required(1, "lat", Types.FloatType.get())))); projected = writeAndRead("latitude_only", writeSchema, latOnly, record); projectedLocation = (Record) projected.getField("location"); Assert.assertNull("Should not project id", projected.getField("id")); Assert.assertNotNull("Should project location", projected.getField("location")); Assert.assertNull("Should not project longitude", projectedLocation.getField("long")); - Assert.assertEquals("Should project latitude", - 52.995143f, (float) projectedLocation.getField("lat"), 0.000001f); - - Schema longOnly = new Schema( - Types.NestedField.optional(3, "location", Types.StructType.of( - Types.NestedField.required(2, "long", 
Types.FloatType.get()) - )) - ); + Assert.assertEquals( + "Should project latitude", + 52.995143f, + (float) projectedLocation.getField("lat"), + 0.000001f); + + Schema longOnly = + new Schema( + Types.NestedField.optional( + 3, + "location", + Types.StructType.of(Types.NestedField.required(2, "long", Types.FloatType.get())))); projected = writeAndRead("longitude_only", writeSchema, longOnly, record); projectedLocation = (Record) projected.getField("location"); Assert.assertNull("Should not project id", projected.getField("id")); Assert.assertNotNull("Should project location", projected.getField("location")); Assert.assertNull("Should not project latitutde", projectedLocation.getField("lat")); - Assert.assertEquals("Should project longitude", - -1.539054f, (float) projectedLocation.getField("long"), 0.000001f); + Assert.assertEquals( + "Should project longitude", + -1.539054f, + (float) projectedLocation.getField("long"), + 0.000001f); Schema locationOnly = writeSchema.select("location"); projected = writeAndRead("location_only", writeSchema, locationOnly, record); projectedLocation = (Record) projected.getField("location"); Assert.assertNull("Should not project id", projected.getField("id")); Assert.assertNotNull("Should project location", projected.getField("location")); - Assert.assertEquals("Should project latitude", - 52.995143f, (float) projectedLocation.getField("lat"), 0.000001f); - Assert.assertEquals("Should project longitude", - -1.539054f, (float) projectedLocation.getField("long"), 0.000001f); + Assert.assertEquals( + "Should project latitude", + 52.995143f, + (float) projectedLocation.getField("lat"), + 0.000001f); + Assert.assertEquals( + "Should project longitude", + -1.539054f, + (float) projectedLocation.getField("long"), + 0.000001f); } @Test public void testMapProjection() throws IOException { - Schema writeSchema = new Schema( - Types.NestedField.required(0, "id", Types.LongType.get()), - Types.NestedField.optional(5, "properties", - Types.MapType.ofOptional(6, 7, Types.StringType.get(), Types.StringType.get())) - ); + Schema writeSchema = + new Schema( + Types.NestedField.required(0, "id", Types.LongType.get()), + Types.NestedField.optional( + 5, + "properties", + Types.MapType.ofOptional(6, 7, Types.StringType.get(), Types.StringType.get()))); Map properties = ImmutableMap.of("a", "A", "b", "B"); @@ -284,31 +296,36 @@ public void testMapProjection() throws IOException { record.setField("id", 34L); record.setField("properties", properties); - Schema idOnly = new Schema( - Types.NestedField.required(0, "id", Types.LongType.get()) - ); + Schema idOnly = new Schema(Types.NestedField.required(0, "id", Types.LongType.get())); Record projected = writeAndRead("id_only", writeSchema, idOnly, record); - Assert.assertEquals("Should contain the correct id value", 34L, (long) projected.getField("id")); + Assert.assertEquals( + "Should contain the correct id value", 34L, (long) projected.getField("id")); Assert.assertNull("Should not project properties map", projected.getField("properties")); Schema keyOnly = writeSchema.select("properties.key"); projected = writeAndRead("key_only", writeSchema, keyOnly, record); Assert.assertNull("Should not project id", projected.getField("id")); - Assert.assertEquals("Should project entire map", - properties, toStringMap((Map) projected.getField("properties"))); + Assert.assertEquals( + "Should project entire map", + properties, + toStringMap((Map) projected.getField("properties"))); Schema valueOnly = writeSchema.select("properties.value"); 
projected = writeAndRead("value_only", writeSchema, valueOnly, record); Assert.assertNull("Should not project id", projected.getField("id")); - Assert.assertEquals("Should project entire map", - properties, toStringMap((Map) projected.getField("properties"))); + Assert.assertEquals( + "Should project entire map", + properties, + toStringMap((Map) projected.getField("properties"))); Schema mapOnly = writeSchema.select("properties"); projected = writeAndRead("map_only", writeSchema, mapOnly, record); Assert.assertNull("Should not project id", projected.getField("id")); - Assert.assertEquals("Should project entire map", - properties, toStringMap((Map) projected.getField("properties"))); + Assert.assertEquals( + "Should project entire map", + properties, + toStringMap((Map) projected.getField("properties"))); } private Map toStringMap(Map map) { @@ -325,16 +342,19 @@ public void testMapProjection() throws IOException { @Test public void testMapOfStructsProjection() throws IOException { - Schema writeSchema = new Schema( - Types.NestedField.required(0, "id", Types.LongType.get()), - Types.NestedField.optional(5, "locations", Types.MapType.ofOptional(6, 7, - Types.StringType.get(), - Types.StructType.of( - Types.NestedField.required(1, "lat", Types.FloatType.get()), - Types.NestedField.required(2, "long", Types.FloatType.get()) - ) - )) - ); + Schema writeSchema = + new Schema( + Types.NestedField.required(0, "id", Types.LongType.get()), + Types.NestedField.optional( + 5, + "locations", + Types.MapType.ofOptional( + 6, + 7, + Types.StringType.get(), + Types.StructType.of( + Types.NestedField.required(1, "lat", Types.FloatType.get()), + Types.NestedField.required(2, "long", Types.FloatType.get()))))); Record record = GenericRecord.create(writeSchema); record.setField("id", 34L); @@ -346,91 +366,100 @@ public void testMapOfStructsProjection() throws IOException { l2.setField("long", -1.539054f); record.setField("locations", ImmutableMap.of("L1", l1, "L2", l2)); - Schema idOnly = new Schema( - Types.NestedField.required(0, "id", Types.LongType.get()) - ); + Schema idOnly = new Schema(Types.NestedField.required(0, "id", Types.LongType.get())); Record projected = writeAndRead("id_only", writeSchema, idOnly, record); - Assert.assertEquals("Should contain the correct id value", 34L, (long) projected.getField("id")); + Assert.assertEquals( + "Should contain the correct id value", 34L, (long) projected.getField("id")); Assert.assertNull("Should not project locations map", projected.getField("locations")); projected = writeAndRead("all_locations", writeSchema, writeSchema.select("locations"), record); Assert.assertNull("Should not project id", projected.getField("id")); - Assert.assertEquals("Should project locations map", - record.getField("locations"), toStringMap((Map) projected.getField("locations"))); + Assert.assertEquals( + "Should project locations map", + record.getField("locations"), + toStringMap((Map) projected.getField("locations"))); - projected = writeAndRead("lat_only", - writeSchema, writeSchema.select("locations.lat"), record); + projected = writeAndRead("lat_only", writeSchema, writeSchema.select("locations.lat"), record); Assert.assertNull("Should not project id", projected.getField("id")); Map locations = toStringMap((Map) projected.getField("locations")); Assert.assertNotNull("Should project locations map", locations); - Assert.assertEquals("Should contain L1 and L2", - Sets.newHashSet("L1", "L2"), locations.keySet()); + Assert.assertEquals( + "Should contain L1 and L2", 
Sets.newHashSet("L1", "L2"), locations.keySet()); Record projectedL1 = (Record) locations.get("L1"); Assert.assertNotNull("L1 should not be null", projectedL1); - Assert.assertEquals("L1 should contain lat", - 53.992811f, (float) projectedL1.getField("lat"), 0.000001); + Assert.assertEquals( + "L1 should contain lat", 53.992811f, (float) projectedL1.getField("lat"), 0.000001); Assert.assertNull("L1 should not contain long", projectedL1.getField("long")); Record projectedL2 = (Record) locations.get("L2"); Assert.assertNotNull("L2 should not be null", projectedL2); - Assert.assertEquals("L2 should contain lat", - 52.995143f, (float) projectedL2.getField("lat"), 0.000001); + Assert.assertEquals( + "L2 should contain lat", 52.995143f, (float) projectedL2.getField("lat"), 0.000001); Assert.assertNull("L2 should not contain long", projectedL2.getField("long")); - projected = writeAndRead("long_only", - writeSchema, writeSchema.select("locations.long"), record); + projected = + writeAndRead("long_only", writeSchema, writeSchema.select("locations.long"), record); Assert.assertNull("Should not project id", projected.getField("id")); locations = toStringMap((Map) projected.getField("locations")); Assert.assertNotNull("Should project locations map", locations); - Assert.assertEquals("Should contain L1 and L2", - Sets.newHashSet("L1", "L2"), locations.keySet()); + Assert.assertEquals( + "Should contain L1 and L2", Sets.newHashSet("L1", "L2"), locations.keySet()); projectedL1 = (Record) locations.get("L1"); Assert.assertNotNull("L1 should not be null", projectedL1); Assert.assertNull("L1 should not contain lat", projectedL1.getField("lat")); - Assert.assertEquals("L1 should contain long", - -1.542616f, (float) projectedL1.getField("long"), 0.000001); + Assert.assertEquals( + "L1 should contain long", -1.542616f, (float) projectedL1.getField("long"), 0.000001); projectedL2 = (Record) locations.get("L2"); Assert.assertNotNull("L2 should not be null", projectedL2); Assert.assertNull("L2 should not contain lat", projectedL2.getField("lat")); - Assert.assertEquals("L2 should contain long", - -1.539054f, (float) projectedL2.getField("long"), 0.000001); - - Schema latitiudeRenamed = new Schema( - Types.NestedField.optional(5, "locations", Types.MapType.ofOptional(6, 7, - Types.StringType.get(), - Types.StructType.of( - Types.NestedField.required(1, "latitude", Types.FloatType.get()) - ) - )) - ); + Assert.assertEquals( + "L2 should contain long", -1.539054f, (float) projectedL2.getField("long"), 0.000001); + + Schema latitiudeRenamed = + new Schema( + Types.NestedField.optional( + 5, + "locations", + Types.MapType.ofOptional( + 6, + 7, + Types.StringType.get(), + Types.StructType.of( + Types.NestedField.required(1, "latitude", Types.FloatType.get()))))); projected = writeAndRead("latitude_renamed", writeSchema, latitiudeRenamed, record); Assert.assertNull("Should not project id", projected.getField("id")); locations = toStringMap((Map) projected.getField("locations")); Assert.assertNotNull("Should project locations map", locations); - Assert.assertEquals("Should contain L1 and L2", - Sets.newHashSet("L1", "L2"), locations.keySet()); + Assert.assertEquals( + "Should contain L1 and L2", Sets.newHashSet("L1", "L2"), locations.keySet()); projectedL1 = (Record) locations.get("L1"); Assert.assertNotNull("L1 should not be null", projectedL1); - Assert.assertEquals("L1 should contain latitude", - 53.992811f, (float) projectedL1.getField("latitude"), 0.000001); + Assert.assertEquals( + "L1 should contain latitude", + 
53.992811f, + (float) projectedL1.getField("latitude"), + 0.000001); Assert.assertNull("L1 should not contain lat", projectedL1.getField("lat")); Assert.assertNull("L1 should not contain long", projectedL1.getField("long")); projectedL2 = (Record) locations.get("L2"); Assert.assertNotNull("L2 should not be null", projectedL2); - Assert.assertEquals("L2 should contain latitude", - 52.995143f, (float) projectedL2.getField("latitude"), 0.000001); + Assert.assertEquals( + "L2 should contain latitude", + 52.995143f, + (float) projectedL2.getField("latitude"), + 0.000001); Assert.assertNull("L2 should not contain lat", projectedL2.getField("lat")); Assert.assertNull("L2 should not contain long", projectedL2.getField("long")); } @Test public void testListProjection() throws IOException { - Schema writeSchema = new Schema( - Types.NestedField.required(0, "id", Types.LongType.get()), - Types.NestedField.optional(10, "values", - Types.ListType.ofOptional(11, Types.LongType.get())) - ); + Schema writeSchema = + new Schema( + Types.NestedField.required(0, "id", Types.LongType.get()), + Types.NestedField.optional( + 10, "values", Types.ListType.ofOptional(11, Types.LongType.get()))); List values = ImmutableList.of(56L, 57L, 58L); @@ -438,12 +467,11 @@ public void testListProjection() throws IOException { record.setField("id", 34L); record.setField("values", values); - Schema idOnly = new Schema( - Types.NestedField.required(0, "id", Types.LongType.get()) - ); + Schema idOnly = new Schema(Types.NestedField.required(0, "id", Types.LongType.get())); Record projected = writeAndRead("id_only", writeSchema, idOnly, record); - Assert.assertEquals("Should contain the correct id value", 34L, (long) projected.getField("id")); + Assert.assertEquals( + "Should contain the correct id value", 34L, (long) projected.getField("id")); Assert.assertNull("Should not project values list", projected.getField("values")); Schema elementOnly = writeSchema.select("values.element"); @@ -460,15 +488,17 @@ public void testListProjection() throws IOException { @Test @SuppressWarnings("unchecked") public void testListOfStructsProjection() throws IOException { - Schema writeSchema = new Schema( - Types.NestedField.required(0, "id", Types.LongType.get()), - Types.NestedField.optional(22, "points", - Types.ListType.ofOptional(21, Types.StructType.of( - Types.NestedField.required(19, "x", Types.IntegerType.get()), - Types.NestedField.optional(18, "y", Types.IntegerType.get()) - )) - ) - ); + Schema writeSchema = + new Schema( + Types.NestedField.required(0, "id", Types.LongType.get()), + Types.NestedField.optional( + 22, + "points", + Types.ListType.ofOptional( + 21, + Types.StructType.of( + Types.NestedField.required(19, "x", Types.IntegerType.get()), + Types.NestedField.optional(18, "y", Types.IntegerType.get()))))); Record record = GenericRecord.create(writeSchema); record.setField("id", 34L); @@ -480,18 +510,17 @@ public void testListOfStructsProjection() throws IOException { p2.setField("y", null); record.setField("points", ImmutableList.of(p1, p2)); - Schema idOnly = new Schema( - Types.NestedField.required(0, "id", Types.LongType.get()) - ); + Schema idOnly = new Schema(Types.NestedField.required(0, "id", Types.LongType.get())); Record projected = writeAndRead("id_only", writeSchema, idOnly, record); - Assert.assertEquals("Should contain the correct id value", 34L, (long) projected.getField("id")); + Assert.assertEquals( + "Should contain the correct id value", 34L, (long) projected.getField("id")); Assert.assertNull("Should not 
project points list", projected.getField("points")); projected = writeAndRead("all_points", writeSchema, writeSchema.select("points"), record); Assert.assertNull("Should not project id", projected.getField("id")); - Assert.assertEquals("Should project points list", - record.getField("points"), projected.getField("points")); + Assert.assertEquals( + "Should project points list", record.getField("points"), projected.getField("points")); projected = writeAndRead("x_only", writeSchema, writeSchema.select("points.x"), record); Assert.assertNull("Should not project id", projected.getField("id")); @@ -517,13 +546,15 @@ public void testListOfStructsProjection() throws IOException { Assert.assertNull("Should not project x", projectedP2.getField("x")); Assert.assertNull("Should project null y", projectedP2.getField("y")); - Schema yRenamed = new Schema( - Types.NestedField.optional(22, "points", - Types.ListType.ofOptional(21, Types.StructType.of( - Types.NestedField.optional(18, "z", Types.IntegerType.get()) - )) - ) - ); + Schema yRenamed = + new Schema( + Types.NestedField.optional( + 22, + "points", + Types.ListType.ofOptional( + 21, + Types.StructType.of( + Types.NestedField.optional(18, "z", Types.IntegerType.get()))))); projected = writeAndRead("y_renamed", writeSchema, yRenamed, record); Assert.assertNull("Should not project id", projected.getField("id")); @@ -539,15 +570,17 @@ public void testListOfStructsProjection() throws IOException { Assert.assertNull("Should not project y", projectedP2.getField("y")); Assert.assertNull("Should project null z", projectedP2.getField("z")); - Schema zAdded = new Schema( - Types.NestedField.optional(22, "points", - Types.ListType.ofOptional(21, Types.StructType.of( - Types.NestedField.required(19, "x", Types.IntegerType.get()), - Types.NestedField.optional(18, "y", Types.IntegerType.get()), - Types.NestedField.optional(20, "z", Types.IntegerType.get()) - )) - ) - ); + Schema zAdded = + new Schema( + Types.NestedField.optional( + 22, + "points", + Types.ListType.ofOptional( + 21, + Types.StructType.of( + Types.NestedField.required(19, "x", Types.IntegerType.get()), + Types.NestedField.optional(18, "y", Types.IntegerType.get()), + Types.NestedField.optional(20, "z", Types.IntegerType.get()))))); projected = writeAndRead("z_added", writeSchema, zAdded, record); Assert.assertNull("Should not project id", projected.getField("id")); @@ -565,10 +598,10 @@ public void testListOfStructsProjection() throws IOException { } private static org.apache.avro.Schema fromOption(org.apache.avro.Schema schema) { - Preconditions.checkArgument(schema.getType() == UNION, - "Expected union schema but was passed: %s", schema); - Preconditions.checkArgument(schema.getTypes().size() == 2, - "Expected optional schema, but was passed: %s", schema); + Preconditions.checkArgument( + schema.getType() == UNION, "Expected union schema but was passed: %s", schema); + Preconditions.checkArgument( + schema.getTypes().size() == 2, "Expected optional schema, but was passed: %s", schema); if (schema.getTypes().get(0).getType() == org.apache.avro.Schema.Type.NULL) { return schema.getTypes().get(1); } else { diff --git a/spark/v3.0/spark/src/test/java/org/apache/iceberg/spark/source/TestSnapshotSelection.java b/spark/v3.0/spark/src/test/java/org/apache/iceberg/spark/source/TestSnapshotSelection.java index 22756dd36717..9661cfe20b1c 100644 --- a/spark/v3.0/spark/src/test/java/org/apache/iceberg/spark/source/TestSnapshotSelection.java +++ 
b/spark/v3.0/spark/src/test/java/org/apache/iceberg/spark/source/TestSnapshotSelection.java @@ -16,9 +16,10 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.source; +import static org.apache.iceberg.types.Types.NestedField.optional; + import java.io.IOException; import java.util.List; import org.apache.hadoop.conf.Configuration; @@ -43,18 +44,14 @@ import org.junit.Test; import org.junit.rules.TemporaryFolder; -import static org.apache.iceberg.types.Types.NestedField.optional; - public class TestSnapshotSelection { private static final Configuration CONF = new Configuration(); - private static final Schema SCHEMA = new Schema( - optional(1, "id", Types.IntegerType.get()), - optional(2, "data", Types.StringType.get()) - ); + private static final Schema SCHEMA = + new Schema( + optional(1, "id", Types.IntegerType.get()), optional(2, "data", Types.StringType.get())); - @Rule - public TemporaryFolder temp = new TemporaryFolder(); + @Rule public TemporaryFolder temp = new TemporaryFolder(); private static SparkSession spark = null; @@ -79,48 +76,40 @@ public void testSnapshotSelectionById() throws IOException { Table table = tables.create(SCHEMA, spec, tableLocation); // produce the first snapshot - List firstBatchRecords = Lists.newArrayList( - new SimpleRecord(1, "a"), - new SimpleRecord(2, "b"), - new SimpleRecord(3, "c") - ); + List firstBatchRecords = + Lists.newArrayList( + new SimpleRecord(1, "a"), new SimpleRecord(2, "b"), new SimpleRecord(3, "c")); Dataset firstDf = spark.createDataFrame(firstBatchRecords, SimpleRecord.class); firstDf.select("id", "data").write().format("iceberg").mode("append").save(tableLocation); // produce the second snapshot - List secondBatchRecords = Lists.newArrayList( - new SimpleRecord(4, "d"), - new SimpleRecord(5, "e"), - new SimpleRecord(6, "f") - ); + List secondBatchRecords = + Lists.newArrayList( + new SimpleRecord(4, "d"), new SimpleRecord(5, "e"), new SimpleRecord(6, "f")); Dataset secondDf = spark.createDataFrame(secondBatchRecords, SimpleRecord.class); secondDf.select("id", "data").write().format("iceberg").mode("append").save(tableLocation); Assert.assertEquals("Expected 2 snapshots", 2, Iterables.size(table.snapshots())); // verify records in the current snapshot - Dataset currentSnapshotResult = spark.read() - .format("iceberg") - .load(tableLocation); - List currentSnapshotRecords = currentSnapshotResult.orderBy("id") - .as(Encoders.bean(SimpleRecord.class)) - .collectAsList(); + Dataset currentSnapshotResult = spark.read().format("iceberg").load(tableLocation); + List currentSnapshotRecords = + currentSnapshotResult.orderBy("id").as(Encoders.bean(SimpleRecord.class)).collectAsList(); List expectedRecords = Lists.newArrayList(); expectedRecords.addAll(firstBatchRecords); expectedRecords.addAll(secondBatchRecords); - Assert.assertEquals("Current snapshot rows should match", expectedRecords, currentSnapshotRecords); + Assert.assertEquals( + "Current snapshot rows should match", expectedRecords, currentSnapshotRecords); // verify records in the previous snapshot Snapshot currentSnapshot = table.currentSnapshot(); Long parentSnapshotId = currentSnapshot.parentId(); - Dataset previousSnapshotResult = spark.read() - .format("iceberg") - .option("snapshot-id", parentSnapshotId) - .load(tableLocation); - List previousSnapshotRecords = previousSnapshotResult.orderBy("id") - .as(Encoders.bean(SimpleRecord.class)) - .collectAsList(); - Assert.assertEquals("Previous snapshot rows should 
match", firstBatchRecords, previousSnapshotRecords); + Dataset previousSnapshotResult = + spark.read().format("iceberg").option("snapshot-id", parentSnapshotId).load(tableLocation); + List previousSnapshotRecords = + previousSnapshotResult.orderBy("id").as(Encoders.bean(SimpleRecord.class)).collectAsList(); + Assert.assertEquals( + "Previous snapshot rows should match", firstBatchRecords, previousSnapshotRecords); } @Test @@ -132,11 +121,9 @@ public void testSnapshotSelectionByTimestamp() throws IOException { Table table = tables.create(SCHEMA, spec, tableLocation); // produce the first snapshot - List firstBatchRecords = Lists.newArrayList( - new SimpleRecord(1, "a"), - new SimpleRecord(2, "b"), - new SimpleRecord(3, "c") - ); + List firstBatchRecords = + Lists.newArrayList( + new SimpleRecord(1, "a"), new SimpleRecord(2, "b"), new SimpleRecord(3, "c")); Dataset firstDf = spark.createDataFrame(firstBatchRecords, SimpleRecord.class); firstDf.select("id", "data").write().format("iceberg").mode("append").save(tableLocation); @@ -144,37 +131,35 @@ public void testSnapshotSelectionByTimestamp() throws IOException { long firstSnapshotTimestamp = System.currentTimeMillis(); // produce the second snapshot - List secondBatchRecords = Lists.newArrayList( - new SimpleRecord(4, "d"), - new SimpleRecord(5, "e"), - new SimpleRecord(6, "f") - ); + List secondBatchRecords = + Lists.newArrayList( + new SimpleRecord(4, "d"), new SimpleRecord(5, "e"), new SimpleRecord(6, "f")); Dataset secondDf = spark.createDataFrame(secondBatchRecords, SimpleRecord.class); secondDf.select("id", "data").write().format("iceberg").mode("append").save(tableLocation); Assert.assertEquals("Expected 2 snapshots", 2, Iterables.size(table.snapshots())); // verify records in the current snapshot - Dataset currentSnapshotResult = spark.read() - .format("iceberg") - .load(tableLocation); - List currentSnapshotRecords = currentSnapshotResult.orderBy("id") - .as(Encoders.bean(SimpleRecord.class)) - .collectAsList(); + Dataset currentSnapshotResult = spark.read().format("iceberg").load(tableLocation); + List currentSnapshotRecords = + currentSnapshotResult.orderBy("id").as(Encoders.bean(SimpleRecord.class)).collectAsList(); List expectedRecords = Lists.newArrayList(); expectedRecords.addAll(firstBatchRecords); expectedRecords.addAll(secondBatchRecords); - Assert.assertEquals("Current snapshot rows should match", expectedRecords, currentSnapshotRecords); + Assert.assertEquals( + "Current snapshot rows should match", expectedRecords, currentSnapshotRecords); // verify records in the previous snapshot - Dataset previousSnapshotResult = spark.read() - .format("iceberg") - .option(SparkReadOptions.AS_OF_TIMESTAMP, firstSnapshotTimestamp) - .load(tableLocation); - List previousSnapshotRecords = previousSnapshotResult.orderBy("id") - .as(Encoders.bean(SimpleRecord.class)) - .collectAsList(); - Assert.assertEquals("Previous snapshot rows should match", firstBatchRecords, previousSnapshotRecords); + Dataset previousSnapshotResult = + spark + .read() + .format("iceberg") + .option(SparkReadOptions.AS_OF_TIMESTAMP, firstSnapshotTimestamp) + .load(tableLocation); + List previousSnapshotRecords = + previousSnapshotResult.orderBy("id").as(Encoders.bean(SimpleRecord.class)).collectAsList(); + Assert.assertEquals( + "Previous snapshot rows should match", firstBatchRecords, previousSnapshotRecords); } @Test @@ -185,14 +170,11 @@ public void testSnapshotSelectionByInvalidSnapshotId() throws IOException { PartitionSpec spec = 
PartitionSpec.unpartitioned(); tables.create(SCHEMA, spec, tableLocation); - Dataset df = spark.read() - .format("iceberg") - .option("snapshot-id", -10) - .load(tableLocation); + Dataset df = spark.read().format("iceberg").option("snapshot-id", -10).load(tableLocation); Assertions.assertThatThrownBy(df::collectAsList) - .isInstanceOf(IllegalArgumentException.class) - .hasMessage("Cannot find snapshot with ID -10"); + .isInstanceOf(IllegalArgumentException.class) + .hasMessage("Cannot find snapshot with ID -10"); } @Test @@ -204,12 +186,15 @@ public void testSnapshotSelectionByInvalidTimestamp() throws IOException { PartitionSpec spec = PartitionSpec.unpartitioned(); tables.create(SCHEMA, spec, tableLocation); - Assertions.assertThatThrownBy(() -> spark.read() + Assertions.assertThatThrownBy( + () -> + spark + .read() .format("iceberg") .option(SparkReadOptions.AS_OF_TIMESTAMP, timestamp) .load(tableLocation)) - .isInstanceOf(IllegalArgumentException.class) - .hasMessageContaining("Cannot find a snapshot older than"); + .isInstanceOf(IllegalArgumentException.class) + .hasMessageContaining("Cannot find a snapshot older than"); } @Test @@ -220,24 +205,25 @@ public void testSnapshotSelectionBySnapshotIdAndTimestamp() throws IOException { PartitionSpec spec = PartitionSpec.unpartitioned(); Table table = tables.create(SCHEMA, spec, tableLocation); - List firstBatchRecords = Lists.newArrayList( - new SimpleRecord(1, "a"), - new SimpleRecord(2, "b"), - new SimpleRecord(3, "c") - ); + List firstBatchRecords = + Lists.newArrayList( + new SimpleRecord(1, "a"), new SimpleRecord(2, "b"), new SimpleRecord(3, "c")); Dataset firstDf = spark.createDataFrame(firstBatchRecords, SimpleRecord.class); firstDf.select("id", "data").write().format("iceberg").mode("append").save(tableLocation); long timestamp = System.currentTimeMillis(); long snapshotId = table.currentSnapshot().snapshotId(); - Assertions.assertThatThrownBy(() -> spark.read() + Assertions.assertThatThrownBy( + () -> + spark + .read() .format("iceberg") .option(SparkReadOptions.SNAPSHOT_ID, snapshotId) .option(SparkReadOptions.AS_OF_TIMESTAMP, timestamp) .load(tableLocation)) - .isInstanceOf(IllegalArgumentException.class) - .hasMessageContaining("Cannot specify both snapshot-id") - .hasMessageContaining("and as-of-timestamp"); + .isInstanceOf(IllegalArgumentException.class) + .hasMessageContaining("Cannot specify both snapshot-id") + .hasMessageContaining("and as-of-timestamp"); } } diff --git a/spark/v3.0/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkAppenderFactory.java b/spark/v3.0/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkAppenderFactory.java index bda525780d8b..3fb2a630fe81 100644 --- a/spark/v3.0/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkAppenderFactory.java +++ b/spark/v3.0/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkAppenderFactory.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.spark.source; import java.util.List; @@ -41,13 +40,13 @@ public TestSparkAppenderFactory(String fileFormat, boolean partitioned) { } @Override - protected FileAppenderFactory createAppenderFactory(List equalityFieldIds, - Schema eqDeleteSchema, - Schema posDeleteRowSchema) { + protected FileAppenderFactory createAppenderFactory( + List equalityFieldIds, Schema eqDeleteSchema, Schema posDeleteRowSchema) { return SparkAppenderFactory.builderFor(table, table.schema(), sparkType) .equalityFieldIds(ArrayUtil.toIntArray(equalityFieldIds)) .eqDeleteRowSchema(eqDeleteSchema) - .posDelRowSchema(posDeleteRowSchema).build(); + .posDelRowSchema(posDeleteRowSchema) + .build(); } @Override diff --git a/spark/v3.0/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkBaseDataReader.java b/spark/v3.0/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkBaseDataReader.java index 870be890da90..6c4239371476 100644 --- a/spark/v3.0/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkBaseDataReader.java +++ b/spark/v3.0/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkBaseDataReader.java @@ -16,9 +16,11 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.source; +import static org.apache.iceberg.FileFormat.PARQUET; +import static org.apache.iceberg.Files.localOutput; + import java.io.File; import java.io.IOException; import java.util.Iterator; @@ -48,13 +50,9 @@ import org.junit.Test; import org.junit.rules.TemporaryFolder; -import static org.apache.iceberg.FileFormat.PARQUET; -import static org.apache.iceberg.Files.localOutput; - public class TestSparkBaseDataReader { - @Rule - public TemporaryFolder temp = new TemporaryFolder(); + @Rule public TemporaryFolder temp = new TemporaryFolder(); private Table table; @@ -127,14 +125,12 @@ public void testClosureOnDataExhaustion() throws IOException { Assert.assertNotNull("Reader should return non-null value", reader.get()); } - Assert.assertEquals("Reader returned incorrect number of records", - totalTasks * recordPerTask, - countRecords - ); - tasks.forEach(t -> - Assert.assertTrue("All iterators should be closed after read exhausion", - reader.isIteratorClosed(t)) - ); + Assert.assertEquals( + "Reader returned incorrect number of records", totalTasks * recordPerTask, countRecords); + tasks.forEach( + t -> + Assert.assertTrue( + "All iterators should be closed after read exhausion", reader.isIteratorClosed(t))); } @Test @@ -150,13 +146,15 @@ public void testClosureDuringIteration() throws IOException { // Total of 2 elements Assert.assertTrue(reader.next()); - Assert.assertFalse("First iter should not be closed on its last element", - reader.isIteratorClosed(firstTask)); + Assert.assertFalse( + "First iter should not be closed on its last element", reader.isIteratorClosed(firstTask)); Assert.assertTrue(reader.next()); - Assert.assertTrue("First iter should be closed after moving to second iter", + Assert.assertTrue( + "First iter should be closed after moving to second iter", reader.isIteratorClosed(firstTask)); - Assert.assertFalse("Second iter should not be closed on its last element", + Assert.assertFalse( + "Second iter should not be closed on its last element", reader.isIteratorClosed(secondTask)); Assert.assertFalse(reader.next()); @@ -174,10 +172,10 @@ public void testClosureWithoutAnyRead() throws IOException { reader.close(); - tasks.forEach(t -> - Assert.assertFalse("Iterator should not be created eagerly for tasks", - 
reader.hasIterator(t)) - ); + tasks.forEach( + t -> + Assert.assertFalse( + "Iterator should not be created eagerly for tasks", reader.hasIterator(t))); } @Test @@ -198,12 +196,13 @@ public void testExplicitClosure() throws IOException { // Some tasks might have not been opened yet, so we don't have corresponding tracker for it. // But all that have been created must be closed. - tasks.forEach(t -> { - if (reader.hasIterator(t)) { - Assert.assertTrue("Iterator should be closed after read exhausion", - reader.isIteratorClosed(t)); - } - }); + tasks.forEach( + t -> { + if (reader.hasIterator(t)) { + Assert.assertTrue( + "Iterator should be closed after read exhausion", reader.isIteratorClosed(t)); + } + }); } @Test @@ -223,26 +222,26 @@ public void testIdempotentExplicitClosure() throws IOException { for (int closeAttempt = 0; closeAttempt < 5; closeAttempt++) { reader.close(); for (int i = 0; i < 5; i++) { - Assert.assertTrue("Iterator should be closed after read exhausion", + Assert.assertTrue( + "Iterator should be closed after read exhausion", reader.isIteratorClosed(tasks.get(i))); } for (int i = 5; i < 10; i++) { - Assert.assertFalse("Iterator should not be created eagerly for tasks", - reader.hasIterator(tasks.get(i))); + Assert.assertFalse( + "Iterator should not be created eagerly for tasks", reader.hasIterator(tasks.get(i))); } } } - private List createFileScanTasks(Integer totalTasks, Integer recordPerTask) throws IOException { + private List createFileScanTasks(Integer totalTasks, Integer recordPerTask) + throws IOException { String desc = "make_scan_tasks"; File parent = temp.newFolder(desc); File location = new File(parent, "test"); File dataFolder = new File(location, "data"); Assert.assertTrue("mkdirs should succeed", dataFolder.mkdirs()); - Schema schema = new Schema( - Types.NestedField.required(0, "id", Types.LongType.get()) - ); + Schema schema = new Schema(Types.NestedField.required(0, "id", Types.LongType.get())); try { this.table = TestTables.create(location, desc, schema, PartitionSpec.unpartitioned()); @@ -254,22 +253,21 @@ private List createFileScanTasks(Integer totalTasks, Integer recor AppendFiles appendFiles = table.newAppend(); for (int i = 0; i < totalTasks; i++) { File parquetFile = new File(dataFolder, PARQUET.addExtension(UUID.randomUUID().toString())); - try (FileAppender writer = Parquet.write(localOutput(parquetFile)) - .schema(tableSchema) - .build()) { + try (FileAppender writer = + Parquet.write(localOutput(parquetFile)).schema(tableSchema).build()) { writer.addAll(expected); } - DataFile file = DataFiles.builder(PartitionSpec.unpartitioned()) - .withFileSizeInBytes(parquetFile.length()) - .withPath(parquetFile.toString()) - .withRecordCount(recordPerTask) - .build(); + DataFile file = + DataFiles.builder(PartitionSpec.unpartitioned()) + .withFileSizeInBytes(parquetFile.length()) + .withPath(parquetFile.toString()) + .withRecordCount(recordPerTask) + .build(); appendFiles.appendFile(file); } appendFiles.commit(); - return StreamSupport - .stream(table.newScan().planFiles().spliterator(), false) + return StreamSupport.stream(table.newScan().planFiles().spliterator(), false) .collect(Collectors.toList()); } finally { TestTables.clearTables(); diff --git a/spark/v3.0/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkCatalog.java b/spark/v3.0/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkCatalog.java index 027c88cd4df6..f61545df79a0 100644 --- a/spark/v3.0/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkCatalog.java 
+++ b/spark/v3.0/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkCatalog.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.source; import org.apache.iceberg.spark.SparkSessionCatalog; @@ -26,7 +25,8 @@ import org.apache.spark.sql.connector.catalog.Table; import org.apache.spark.sql.connector.catalog.TableCatalog; -public class TestSparkCatalog extends SparkSessionCatalog { +public class TestSparkCatalog + extends SparkSessionCatalog { @Override public Table loadTable(Identifier ident) throws NoSuchTableException { diff --git a/spark/v3.0/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkCatalogCacheExpiration.java b/spark/v3.0/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkCatalogCacheExpiration.java index 96aeed65bfa7..3d668197fd51 100644 --- a/spark/v3.0/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkCatalogCacheExpiration.java +++ b/spark/v3.0/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkCatalogCacheExpiration.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.source; import java.util.Map; @@ -36,12 +35,16 @@ public class TestSparkCatalogCacheExpiration extends SparkTestBaseWithCatalog { private static final String sessionCatalogName = "spark_catalog"; private static final String sessionCatalogImpl = SparkSessionCatalog.class.getName(); - private static final Map sessionCatalogConfig = ImmutableMap.of( - "type", "hadoop", - "default-namespace", "default", - CatalogProperties.CACHE_ENABLED, "true", - CatalogProperties.CACHE_EXPIRATION_INTERVAL_MS, "3000" - ); + private static final Map sessionCatalogConfig = + ImmutableMap.of( + "type", + "hadoop", + "default-namespace", + "default", + CatalogProperties.CACHE_ENABLED, + "true", + CatalogProperties.CACHE_EXPIRATION_INTERVAL_MS, + "3000"); private static String asSqlConfCatalogKeyFor(String catalog, String configKey) { // configKey is empty when the catalog's class is being defined @@ -58,19 +61,29 @@ private static String asSqlConfCatalogKeyFor(String catalog, String configKey) { public static void beforeClass() { // Catalog - expiration_disabled: Catalog with caching on and expiration disabled. ImmutableMap.of( - "", "org.apache.iceberg.spark.SparkCatalog", - "type", "hive", - CatalogProperties.CACHE_ENABLED, "true", - CatalogProperties.CACHE_EXPIRATION_INTERVAL_MS, "-1" - ).forEach((k, v) -> spark.conf().set(asSqlConfCatalogKeyFor("expiration_disabled", k), v)); - - // Catalog - cache_disabled_implicitly: Catalog that does not cache, as the cache expiration interval is 0. + "", + "org.apache.iceberg.spark.SparkCatalog", + "type", + "hive", + CatalogProperties.CACHE_ENABLED, + "true", + CatalogProperties.CACHE_EXPIRATION_INTERVAL_MS, + "-1") + .forEach((k, v) -> spark.conf().set(asSqlConfCatalogKeyFor("expiration_disabled", k), v)); + + // Catalog - cache_disabled_implicitly: Catalog that does not cache, as the cache expiration + // interval is 0. 
ImmutableMap.of( - "", "org.apache.iceberg.spark.SparkCatalog", - "type", "hive", - CatalogProperties.CACHE_ENABLED, "true", - CatalogProperties.CACHE_EXPIRATION_INTERVAL_MS, "0" - ).forEach((k, v) -> spark.conf().set(asSqlConfCatalogKeyFor("cache_disabled_implicitly", k), v)); + "", + "org.apache.iceberg.spark.SparkCatalog", + "type", + "hive", + CatalogProperties.CACHE_ENABLED, + "true", + CatalogProperties.CACHE_EXPIRATION_INTERVAL_MS, + "0") + .forEach( + (k, v) -> spark.conf().set(asSqlConfCatalogKeyFor("cache_disabled_implicitly", k), v)); } public TestSparkCatalogCacheExpiration() { @@ -85,56 +98,55 @@ public void testSparkSessionCatalogWithExpirationEnabled() { .extracting("cacheEnabled") .isEqualTo(true); - Assertions - .assertThat(sparkCatalog) + Assertions.assertThat(sparkCatalog) .extracting("icebergCatalog") .extracting("icebergCatalog") - .isInstanceOfSatisfying(Catalog.class, icebergCatalog -> { - Assertions.assertThat(icebergCatalog) - .isExactlyInstanceOf(CachingCatalog.class) - .extracting("expirationIntervalMillis") - .isEqualTo(3000L); - }); + .isInstanceOfSatisfying( + Catalog.class, + icebergCatalog -> { + Assertions.assertThat(icebergCatalog) + .isExactlyInstanceOf(CachingCatalog.class) + .extracting("expirationIntervalMillis") + .isEqualTo(3000L); + }); } @Test public void testCacheEnabledAndExpirationDisabled() { SparkCatalog sparkCatalog = getSparkCatalog("expiration_disabled"); - Assertions.assertThat(sparkCatalog) - .extracting("cacheEnabled") - .isEqualTo(true); + Assertions.assertThat(sparkCatalog).extracting("cacheEnabled").isEqualTo(true); - Assertions - .assertThat(sparkCatalog) + Assertions.assertThat(sparkCatalog) .extracting("icebergCatalog") - .isInstanceOfSatisfying(CachingCatalog.class, icebergCatalog -> { - Assertions.assertThat(icebergCatalog) - .extracting("expirationIntervalMillis") - .isEqualTo(-1L); - }); + .isInstanceOfSatisfying( + CachingCatalog.class, + icebergCatalog -> { + Assertions.assertThat(icebergCatalog) + .extracting("expirationIntervalMillis") + .isEqualTo(-1L); + }); } @Test public void testCacheDisabledImplicitly() { SparkCatalog sparkCatalog = getSparkCatalog("cache_disabled_implicitly"); - Assertions.assertThat(sparkCatalog) - .extracting("cacheEnabled") - .isEqualTo(false); + Assertions.assertThat(sparkCatalog).extracting("cacheEnabled").isEqualTo(false); - Assertions - .assertThat(sparkCatalog) + Assertions.assertThat(sparkCatalog) .extracting("icebergCatalog") .isInstanceOfSatisfying( Catalog.class, - icebergCatalog -> Assertions.assertThat(icebergCatalog).isNotInstanceOf(CachingCatalog.class)); + icebergCatalog -> + Assertions.assertThat(icebergCatalog).isNotInstanceOf(CachingCatalog.class)); } private SparkSessionCatalog sparkSessionCatalog() { - TableCatalog catalog = (TableCatalog) spark.sessionState().catalogManager().catalog("spark_catalog"); + TableCatalog catalog = + (TableCatalog) spark.sessionState().catalogManager().catalog("spark_catalog"); return (SparkSessionCatalog) catalog; } - private SparkCatalog getSparkCatalog(String catalog) { + private SparkCatalog getSparkCatalog(String catalog) { return (SparkCatalog) spark.sessionState().catalogManager().catalog(catalog); } } diff --git a/spark/v3.0/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkCatalogHadoopOverrides.java b/spark/v3.0/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkCatalogHadoopOverrides.java index 267270308de5..607f1d45ba3a 100644 --- 
a/spark/v3.0/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkCatalogHadoopOverrides.java +++ b/spark/v3.0/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkCatalogHadoopOverrides.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.source; import java.util.Map; @@ -37,7 +36,6 @@ import org.junit.Test; import org.junit.runners.Parameterized; - public class TestSparkCatalogHadoopOverrides extends SparkCatalogTestBase { private static final String configToOverride = "fs.s3a.buffer.dir"; @@ -49,29 +47,38 @@ public class TestSparkCatalogHadoopOverrides extends SparkCatalogTestBase { @Parameterized.Parameters(name = "catalogName = {0}, implementation = {1}, config = {2}") public static Object[][] parameters() { return new Object[][] { - { "testhive", SparkCatalog.class.getName(), - ImmutableMap.of( - "type", "hive", - "default-namespace", "default", - hadoopPrefixedConfigToOverride, configOverrideValue - ) }, - { "testhadoop", SparkCatalog.class.getName(), - ImmutableMap.of( - "type", "hadoop", - hadoopPrefixedConfigToOverride, configOverrideValue - ) }, - { "spark_catalog", SparkSessionCatalog.class.getName(), - ImmutableMap.of( - "type", "hive", - "default-namespace", "default", - hadoopPrefixedConfigToOverride, configOverrideValue - ) } + { + "testhive", + SparkCatalog.class.getName(), + ImmutableMap.of( + "type", + "hive", + "default-namespace", + "default", + hadoopPrefixedConfigToOverride, + configOverrideValue) + }, + { + "testhadoop", + SparkCatalog.class.getName(), + ImmutableMap.of("type", "hadoop", hadoopPrefixedConfigToOverride, configOverrideValue) + }, + { + "spark_catalog", + SparkSessionCatalog.class.getName(), + ImmutableMap.of( + "type", + "hive", + "default-namespace", + "default", + hadoopPrefixedConfigToOverride, + configOverrideValue) + } }; } - public TestSparkCatalogHadoopOverrides(String catalogName, - String implementation, - Map config) { + public TestSparkCatalogHadoopOverrides( + String catalogName, String implementation, Map config) { super(catalogName, implementation, config); } @@ -92,7 +99,8 @@ public void testTableFromCatalogHasOverrides() throws Exception { String actualCatalogOverride = conf.get(configToOverride, "/whammies"); Assert.assertEquals( "Iceberg tables from spark should have the overridden hadoop configurations from the spark config", - configOverrideValue, actualCatalogOverride); + configOverrideValue, + actualCatalogOverride); } @Test @@ -102,16 +110,19 @@ public void ensureRoundTripSerializedTableRetainsHadoopConfig() throws Exception String actualCatalogOverride = originalConf.get(configToOverride, "/whammies"); Assert.assertEquals( "Iceberg tables from spark should have the overridden hadoop configurations from the spark config", - configOverrideValue, actualCatalogOverride); + configOverrideValue, + actualCatalogOverride); // Now convert to SerializableTable and ensure overridden property is still present. 
Table serializableTable = SerializableTableWithSize.copyOf(table); - Table kryoSerializedTable = KryoHelpers.roundTripSerialize(SerializableTableWithSize.copyOf(table)); + Table kryoSerializedTable = + KryoHelpers.roundTripSerialize(SerializableTableWithSize.copyOf(table)); Configuration configFromKryoSerde = ((Configurable) kryoSerializedTable.io()).getConf(); String kryoSerializedCatalogOverride = configFromKryoSerde.get(configToOverride, "/whammies"); Assert.assertEquals( "Tables serialized with Kryo serialization should retain overridden hadoop configuration properties", - configOverrideValue, kryoSerializedCatalogOverride); + configOverrideValue, + kryoSerializedCatalogOverride); // Do the same for Java based serde Table javaSerializedTable = TestHelpers.roundTripSerialize(serializableTable); @@ -119,14 +130,16 @@ public void ensureRoundTripSerializedTableRetainsHadoopConfig() throws Exception String javaSerializedCatalogOverride = configFromJavaSerde.get(configToOverride, "/whammies"); Assert.assertEquals( "Tables serialized with Java serialization should retain overridden hadoop configuration properties", - configOverrideValue, javaSerializedCatalogOverride); + configOverrideValue, + javaSerializedCatalogOverride); } @SuppressWarnings("ThrowSpecificity") private Table getIcebergTableFromSparkCatalog() throws Exception { Identifier identifier = Identifier.of(tableIdent.namespace().levels(), tableIdent.name()); - TableCatalog catalog = (TableCatalog) spark.sessionState().catalogManager().catalog(catalogName); - SparkTable sparkTable = (SparkTable) catalog.loadTable(identifier); + TableCatalog catalog = + (TableCatalog) spark.sessionState().catalogManager().catalog(catalogName); + SparkTable sparkTable = (SparkTable) catalog.loadTable(identifier); return sparkTable.table(); } } diff --git a/spark/v3.0/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkDataFile.java b/spark/v3.0/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkDataFile.java index cd1404766d46..b1f2082b5d9b 100644 --- a/spark/v3.0/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkDataFile.java +++ b/spark/v3.0/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkDataFile.java @@ -16,9 +16,11 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.spark.source; +import static org.apache.iceberg.types.Types.NestedField.optional; +import static org.apache.iceberg.types.Types.NestedField.required; + import java.io.File; import java.io.IOException; import java.util.Arrays; @@ -62,43 +64,42 @@ import org.junit.Test; import org.junit.rules.TemporaryFolder; -import static org.apache.iceberg.types.Types.NestedField.optional; -import static org.apache.iceberg.types.Types.NestedField.required; - public class TestSparkDataFile { private static final HadoopTables TABLES = new HadoopTables(new Configuration()); - private static final Schema SCHEMA = new Schema( - required(100, "id", Types.LongType.get()), - optional(101, "data", Types.StringType.get()), - required(102, "b", Types.BooleanType.get()), - optional(103, "i", Types.IntegerType.get()), - required(104, "l", Types.LongType.get()), - optional(105, "f", Types.FloatType.get()), - required(106, "d", Types.DoubleType.get()), - optional(107, "date", Types.DateType.get()), - required(108, "ts", Types.TimestampType.withZone()), - required(110, "s", Types.StringType.get()), - optional(113, "bytes", Types.BinaryType.get()), - required(114, "dec_9_0", Types.DecimalType.of(9, 0)), - required(115, "dec_11_2", Types.DecimalType.of(11, 2)), - required(116, "dec_38_10", Types.DecimalType.of(38, 10)) // maximum precision - ); - private static final PartitionSpec SPEC = PartitionSpec.builderFor(SCHEMA) - .identity("b") - .bucket("i", 2) - .identity("l") - .identity("f") - .identity("d") - .identity("date") - .hour("ts") - .identity("ts") - .truncate("s", 2) - .identity("bytes") - .bucket("dec_9_0", 2) - .bucket("dec_11_2", 2) - .bucket("dec_38_10", 2) - .build(); + private static final Schema SCHEMA = + new Schema( + required(100, "id", Types.LongType.get()), + optional(101, "data", Types.StringType.get()), + required(102, "b", Types.BooleanType.get()), + optional(103, "i", Types.IntegerType.get()), + required(104, "l", Types.LongType.get()), + optional(105, "f", Types.FloatType.get()), + required(106, "d", Types.DoubleType.get()), + optional(107, "date", Types.DateType.get()), + required(108, "ts", Types.TimestampType.withZone()), + required(110, "s", Types.StringType.get()), + optional(113, "bytes", Types.BinaryType.get()), + required(114, "dec_9_0", Types.DecimalType.of(9, 0)), + required(115, "dec_11_2", Types.DecimalType.of(11, 2)), + required(116, "dec_38_10", Types.DecimalType.of(38, 10)) // maximum precision + ); + private static final PartitionSpec SPEC = + PartitionSpec.builderFor(SCHEMA) + .identity("b") + .bucket("i", 2) + .identity("l") + .identity("f") + .identity("d") + .identity("date") + .hour("ts") + .identity("ts") + .truncate("s", 2) + .identity("bytes") + .bucket("dec_9_0", 2) + .bucket("dec_11_2", 2) + .bucket("dec_38_10", 2) + .build(); private static SparkSession spark; private static JavaSparkContext sparkContext = null; @@ -117,8 +118,7 @@ public static void stopSpark() { currentSpark.stop(); } - @Rule - public TemporaryFolder temp = new TemporaryFolder(); + @Rule public TemporaryFolder temp = new TemporaryFolder(); private String tableLocation = null; @Before @@ -129,7 +129,8 @@ public void setupTableLocation() throws Exception { @Test public void testValueConversion() throws IOException { - Table table = TABLES.create(SCHEMA, PartitionSpec.unpartitioned(), Maps.newHashMap(), tableLocation); + Table table = + TABLES.create(SCHEMA, PartitionSpec.unpartitioned(), Maps.newHashMap(), tableLocation); checkSparkDataFile(table); } @@ -150,7 +151,9 @@ 
public void testValueConversionWithEmptyStats() throws IOException { private void checkSparkDataFile(Table table) throws IOException { Iterable rows = RandomData.generateSpark(table.schema(), 200, 0); JavaRDD rdd = sparkContext.parallelize(Lists.newArrayList(rows)); - Dataset df = spark.internalCreateDataFrame(JavaRDD.toRDD(rdd), SparkSchemaUtil.convert(table.schema()), false); + Dataset df = + spark.internalCreateDataFrame( + JavaRDD.toRDD(rdd), SparkSchemaUtil.convert(table.schema()), false); df.write().format("iceberg").mode("append").save(tableLocation); @@ -170,16 +173,15 @@ private void checkSparkDataFile(Table table) throws IOException { Dataset dataFileDF = spark.read().format("iceberg").load(tableLocation + "#files"); // reorder columns to test arbitrary projections - List columns = Arrays.stream(dataFileDF.columns()) - .map(ColumnName::new) - .collect(Collectors.toList()); + List columns = + Arrays.stream(dataFileDF.columns()).map(ColumnName::new).collect(Collectors.toList()); Collections.shuffle(columns); - List sparkDataFiles = dataFileDF - .select(Iterables.toArray(columns, Column.class)) - .collectAsList(); + List sparkDataFiles = + dataFileDF.select(Iterables.toArray(columns, Column.class)).collectAsList(); - Assert.assertEquals("The number of files should match", dataFiles.size(), sparkDataFiles.size()); + Assert.assertEquals( + "The number of files should match", dataFiles.size(), sparkDataFiles.size()); Types.StructType dataFileType = DataFile.getType(table.spec().partitionType()); StructType sparkDataFileType = sparkDataFiles.get(0).schema(); @@ -195,9 +197,14 @@ private void checkDataFile(DataFile expected, DataFile actual) { Assert.assertEquals("Format must match", expected.format(), actual.format()); Assert.assertEquals("Record count must match", expected.recordCount(), actual.recordCount()); Assert.assertEquals("Size must match", expected.fileSizeInBytes(), actual.fileSizeInBytes()); - Assert.assertEquals("Record value counts must match", expected.valueCounts(), actual.valueCounts()); - Assert.assertEquals("Record null value counts must match", expected.nullValueCounts(), actual.nullValueCounts()); - Assert.assertEquals("Record nan value counts must match", expected.nanValueCounts(), actual.nanValueCounts()); + Assert.assertEquals( + "Record value counts must match", expected.valueCounts(), actual.valueCounts()); + Assert.assertEquals( + "Record null value counts must match", + expected.nullValueCounts(), + actual.nullValueCounts()); + Assert.assertEquals( + "Record nan value counts must match", expected.nanValueCounts(), actual.nanValueCounts()); Assert.assertEquals("Lower bounds must match", expected.lowerBounds(), actual.lowerBounds()); Assert.assertEquals("Upper bounds must match", expected.upperBounds(), actual.upperBounds()); Assert.assertEquals("Key metadata must match", expected.keyMetadata(), actual.keyMetadata()); @@ -210,7 +217,8 @@ private void checkDataFile(DataFile expected, DataFile actual) { private void checkStructLike(StructLike expected, StructLike actual) { Assert.assertEquals("Struct size should match", expected.size(), actual.size()); for (int i = 0; i < expected.size(); i++) { - Assert.assertEquals("Struct values must match", expected.get(i, Object.class), actual.get(i, Object.class)); + Assert.assertEquals( + "Struct values must match", expected.get(i, Object.class), actual.get(i, Object.class)); } } } diff --git a/spark/v3.0/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkDataWrite.java 
b/spark/v3.0/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkDataWrite.java index 5b158c518ae4..b2db853d4753 100644 --- a/spark/v3.0/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkDataWrite.java +++ b/spark/v3.0/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkDataWrite.java @@ -16,9 +16,14 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.source; +import static org.apache.iceberg.TableProperties.SPARK_WRITE_PARTITIONED_FANOUT_ENABLED; +import static org.apache.iceberg.types.Types.NestedField.optional; +import static org.mockito.Mockito.doAnswer; +import static org.mockito.Mockito.spy; +import static org.mockito.Mockito.when; + import java.io.File; import java.io.IOException; import java.util.List; @@ -56,28 +61,20 @@ import org.junit.runner.RunWith; import org.junit.runners.Parameterized; -import static org.apache.iceberg.TableProperties.SPARK_WRITE_PARTITIONED_FANOUT_ENABLED; -import static org.apache.iceberg.types.Types.NestedField.optional; -import static org.mockito.Mockito.doAnswer; -import static org.mockito.Mockito.spy; -import static org.mockito.Mockito.when; - @RunWith(Parameterized.class) public class TestSparkDataWrite { private static final Configuration CONF = new Configuration(); private final FileFormat format; private static SparkSession spark = null; - private static final Schema SCHEMA = new Schema( - optional(1, "id", Types.IntegerType.get()), - optional(2, "data", Types.StringType.get()) - ); + private static final Schema SCHEMA = + new Schema( + optional(1, "id", Types.IntegerType.get()), optional(2, "data", Types.StringType.get())); - @Rule - public TemporaryFolder temp = new TemporaryFolder(); + @Rule public TemporaryFolder temp = new TemporaryFolder(); @Parameterized.Parameters(name = "format = {0}") public static Object[] parameters() { - return new Object[] { "parquet", "avro", "orc" }; + return new Object[] {"parquet", "avro", "orc"}; } @BeforeClass @@ -110,15 +107,14 @@ public void testBasicWrite() throws IOException { PartitionSpec spec = PartitionSpec.builderFor(SCHEMA).identity("data").build(); Table table = tables.create(SCHEMA, spec, location.toString()); - List expected = Lists.newArrayList( - new SimpleRecord(1, "a"), - new SimpleRecord(2, "b"), - new SimpleRecord(3, "c") - ); + List expected = + Lists.newArrayList( + new SimpleRecord(1, "a"), new SimpleRecord(2, "b"), new SimpleRecord(3, "c")); Dataset df = spark.createDataFrame(expected, SimpleRecord.class); // TODO: incoming columns must be ordered according to the table's schema - df.select("id", "data").write() + df.select("id", "data") + .write() .format("iceberg") .option(SparkWriteOptions.WRITE_FORMAT, format.toString()) .mode(SaveMode.Append) @@ -126,11 +122,10 @@ public void testBasicWrite() throws IOException { table.refresh(); - Dataset result = spark.read() - .format("iceberg") - .load(location.toString()); + Dataset result = spark.read().format("iceberg").load(location.toString()); - List actual = result.orderBy("id").as(Encoders.bean(SimpleRecord.class)).collectAsList(); + List actual = + result.orderBy("id").as(Encoders.bean(SimpleRecord.class)).collectAsList(); Assert.assertEquals("Number of rows should match", expected.size(), actual.size()); Assert.assertEquals("Result rows should match", expected, actual); for (ManifestFile manifest : table.currentSnapshot().allManifests(table.io())) { @@ -161,30 +156,31 @@ public void testAppend() throws IOException { PartitionSpec spec = 
PartitionSpec.builderFor(SCHEMA).identity("data").build(); Table table = tables.create(SCHEMA, spec, location.toString()); - List records = Lists.newArrayList( - new SimpleRecord(1, "a"), - new SimpleRecord(2, "b"), - new SimpleRecord(3, "c") - ); - - List expected = Lists.newArrayList( - new SimpleRecord(1, "a"), - new SimpleRecord(2, "b"), - new SimpleRecord(3, "c"), - new SimpleRecord(4, "a"), - new SimpleRecord(5, "b"), - new SimpleRecord(6, "c") - ); + List records = + Lists.newArrayList( + new SimpleRecord(1, "a"), new SimpleRecord(2, "b"), new SimpleRecord(3, "c")); + + List expected = + Lists.newArrayList( + new SimpleRecord(1, "a"), + new SimpleRecord(2, "b"), + new SimpleRecord(3, "c"), + new SimpleRecord(4, "a"), + new SimpleRecord(5, "b"), + new SimpleRecord(6, "c")); Dataset df = spark.createDataFrame(records, SimpleRecord.class); - df.select("id", "data").write() + df.select("id", "data") + .write() .format("iceberg") .option(SparkWriteOptions.WRITE_FORMAT, format.toString()) .mode(SaveMode.Append) .save(location.toString()); - df.withColumn("id", df.col("id").plus(3)).select("id", "data").write() + df.withColumn("id", df.col("id").plus(3)) + .select("id", "data") + .write() .format("iceberg") .option(SparkWriteOptions.WRITE_FORMAT, format.toString()) .mode(SaveMode.Append) @@ -192,11 +188,10 @@ public void testAppend() throws IOException { table.refresh(); - Dataset result = spark.read() - .format("iceberg") - .load(location.toString()); + Dataset result = spark.read().format("iceberg").load(location.toString()); - List actual = result.orderBy("id").as(Encoders.bean(SimpleRecord.class)).collectAsList(); + List actual = + result.orderBy("id").as(Encoders.bean(SimpleRecord.class)).collectAsList(); Assert.assertEquals("Number of rows should match", expected.size(), actual.size()); Assert.assertEquals("Result rows should match", expected, actual); } @@ -210,23 +205,24 @@ public void testEmptyOverwrite() throws IOException { PartitionSpec spec = PartitionSpec.builderFor(SCHEMA).identity("id").build(); Table table = tables.create(SCHEMA, spec, location.toString()); - List records = Lists.newArrayList( - new SimpleRecord(1, "a"), - new SimpleRecord(2, "b"), - new SimpleRecord(3, "c") - ); + List records = + Lists.newArrayList( + new SimpleRecord(1, "a"), new SimpleRecord(2, "b"), new SimpleRecord(3, "c")); List expected = records; Dataset df = spark.createDataFrame(records, SimpleRecord.class); - df.select("id", "data").write() + df.select("id", "data") + .write() .format("iceberg") .option(SparkWriteOptions.WRITE_FORMAT, format.toString()) .mode(SaveMode.Append) .save(location.toString()); Dataset empty = spark.createDataFrame(ImmutableList.of(), SimpleRecord.class); - empty.select("id", "data").write() + empty + .select("id", "data") + .write() .format("iceberg") .option(SparkWriteOptions.WRITE_FORMAT, format.toString()) .mode(SaveMode.Overwrite) @@ -235,11 +231,10 @@ public void testEmptyOverwrite() throws IOException { table.refresh(); - Dataset result = spark.read() - .format("iceberg") - .load(location.toString()); + Dataset result = spark.read().format("iceberg").load(location.toString()); - List actual = result.orderBy("id").as(Encoders.bean(SimpleRecord.class)).collectAsList(); + List actual = + result.orderBy("id").as(Encoders.bean(SimpleRecord.class)).collectAsList(); Assert.assertEquals("Number of rows should match", expected.size(), actual.size()); Assert.assertEquals("Result rows should match", expected, actual); } @@ -253,30 +248,31 @@ public void testOverwrite() 
throws IOException { PartitionSpec spec = PartitionSpec.builderFor(SCHEMA).identity("id").build(); Table table = tables.create(SCHEMA, spec, location.toString()); - List records = Lists.newArrayList( - new SimpleRecord(1, "a"), - new SimpleRecord(2, "b"), - new SimpleRecord(3, "c") - ); + List records = + Lists.newArrayList( + new SimpleRecord(1, "a"), new SimpleRecord(2, "b"), new SimpleRecord(3, "c")); - List expected = Lists.newArrayList( - new SimpleRecord(1, "a"), - new SimpleRecord(2, "a"), - new SimpleRecord(3, "c"), - new SimpleRecord(4, "b"), - new SimpleRecord(6, "c") - ); + List expected = + Lists.newArrayList( + new SimpleRecord(1, "a"), + new SimpleRecord(2, "a"), + new SimpleRecord(3, "c"), + new SimpleRecord(4, "b"), + new SimpleRecord(6, "c")); Dataset df = spark.createDataFrame(records, SimpleRecord.class); - df.select("id", "data").write() + df.select("id", "data") + .write() .format("iceberg") .option(SparkWriteOptions.WRITE_FORMAT, format.toString()) .mode(SaveMode.Append) .save(location.toString()); // overwrite with 2*id to replace record 2, append 4 and 6 - df.withColumn("id", df.col("id").multiply(2)).select("id", "data").write() + df.withColumn("id", df.col("id").multiply(2)) + .select("id", "data") + .write() .format("iceberg") .option(SparkWriteOptions.WRITE_FORMAT, format.toString()) .mode(SaveMode.Overwrite) @@ -285,11 +281,10 @@ public void testOverwrite() throws IOException { table.refresh(); - Dataset result = spark.read() - .format("iceberg") - .load(location.toString()); + Dataset result = spark.read().format("iceberg").load(location.toString()); - List actual = result.orderBy("id").as(Encoders.bean(SimpleRecord.class)).collectAsList(); + List actual = + result.orderBy("id").as(Encoders.bean(SimpleRecord.class)).collectAsList(); Assert.assertEquals("Number of rows should match", expected.size(), actual.size()); Assert.assertEquals("Result rows should match", expected, actual); } @@ -303,22 +298,22 @@ public void testUnpartitionedOverwrite() throws IOException { PartitionSpec spec = PartitionSpec.unpartitioned(); Table table = tables.create(SCHEMA, spec, location.toString()); - List expected = Lists.newArrayList( - new SimpleRecord(1, "a"), - new SimpleRecord(2, "b"), - new SimpleRecord(3, "c") - ); + List expected = + Lists.newArrayList( + new SimpleRecord(1, "a"), new SimpleRecord(2, "b"), new SimpleRecord(3, "c")); Dataset df = spark.createDataFrame(expected, SimpleRecord.class); - df.select("id", "data").write() + df.select("id", "data") + .write() .format("iceberg") .option(SparkWriteOptions.WRITE_FORMAT, format.toString()) .mode(SaveMode.Append) .save(location.toString()); // overwrite with the same data; should not produce two copies - df.select("id", "data").write() + df.select("id", "data") + .write() .format("iceberg") .option(SparkWriteOptions.WRITE_FORMAT, format.toString()) .mode(SaveMode.Overwrite) @@ -326,11 +321,10 @@ public void testUnpartitionedOverwrite() throws IOException { table.refresh(); - Dataset result = spark.read() - .format("iceberg") - .load(location.toString()); + Dataset result = spark.read().format("iceberg").load(location.toString()); - List actual = result.orderBy("id").as(Encoders.bean(SimpleRecord.class)).collectAsList(); + List actual = + result.orderBy("id").as(Encoders.bean(SimpleRecord.class)).collectAsList(); Assert.assertEquals("Number of rows should match", expected.size(), actual.size()); Assert.assertEquals("Result rows should match", expected, actual); } @@ -344,7 +338,8 @@ public void 
testUnpartitionedCreateWithTargetFileSizeViaTableProperties() throws PartitionSpec spec = PartitionSpec.unpartitioned(); Table table = tables.create(SCHEMA, spec, location.toString()); - table.updateProperties() + table + .updateProperties() .set(TableProperties.WRITE_TARGET_FILE_SIZE_BYTES, "4") // ~4 bytes; low enough to trigger .commit(); @@ -355,7 +350,8 @@ public void testUnpartitionedCreateWithTargetFileSizeViaTableProperties() throws Dataset df = spark.createDataFrame(expected, SimpleRecord.class); - df.select("id", "data").write() + df.select("id", "data") + .write() .format("iceberg") .option(SparkWriteOptions.WRITE_FORMAT, format.toString()) .mode(SaveMode.Append) @@ -363,11 +359,10 @@ public void testUnpartitionedCreateWithTargetFileSizeViaTableProperties() throws table.refresh(); - Dataset result = spark.read() - .format("iceberg") - .load(location.toString()); + Dataset result = spark.read().format("iceberg").load(location.toString()); - List actual = result.orderBy("id").as(Encoders.bean(SimpleRecord.class)).collectAsList(); + List actual = + result.orderBy("id").as(Encoders.bean(SimpleRecord.class)).collectAsList(); Assert.assertEquals("Number of rows should match", expected.size(), actual.size()); Assert.assertEquals("Result rows should match", expected, actual); @@ -379,7 +374,8 @@ public void testUnpartitionedCreateWithTargetFileSizeViaTableProperties() throws } Assert.assertEquals("Should have 4 DataFiles", 4, files.size()); - Assert.assertTrue("All DataFiles contain 1000 rows", files.stream().allMatch(d -> d.recordCount() == 1000)); + Assert.assertTrue( + "All DataFiles contain 1000 rows", files.stream().allMatch(d -> d.recordCount() == 1000)); } @Test @@ -410,15 +406,14 @@ public void testWriteProjection() throws IOException { PartitionSpec spec = PartitionSpec.unpartitioned(); Table table = tables.create(SCHEMA, spec, location.toString()); - List expected = Lists.newArrayList( - new SimpleRecord(1, null), - new SimpleRecord(2, null), - new SimpleRecord(3, null) - ); + List expected = + Lists.newArrayList( + new SimpleRecord(1, null), new SimpleRecord(2, null), new SimpleRecord(3, null)); Dataset df = spark.createDataFrame(expected, SimpleRecord.class); - df.select("id").write() // select only id column + df.select("id") + .write() // select only id column .format("iceberg") .option(SparkWriteOptions.WRITE_FORMAT, format.toString()) .mode(SaveMode.Append) @@ -426,11 +421,10 @@ public void testWriteProjection() throws IOException { table.refresh(); - Dataset result = spark.read() - .format("iceberg") - .load(location.toString()); + Dataset result = spark.read().format("iceberg").load(location.toString()); - List actual = result.orderBy("id").as(Encoders.bean(SimpleRecord.class)).collectAsList(); + List actual = + result.orderBy("id").as(Encoders.bean(SimpleRecord.class)).collectAsList(); Assert.assertEquals("Number of rows should match", expected.size(), actual.size()); Assert.assertEquals("Result rows should match", expected, actual); } @@ -446,22 +440,23 @@ public void testWriteProjectionWithMiddle() throws IOException { HadoopTables tables = new HadoopTables(CONF); PartitionSpec spec = PartitionSpec.unpartitioned(); - Schema schema = new Schema( - optional(1, "c1", Types.IntegerType.get()), - optional(2, "c2", Types.StringType.get()), - optional(3, "c3", Types.StringType.get()) - ); + Schema schema = + new Schema( + optional(1, "c1", Types.IntegerType.get()), + optional(2, "c2", Types.StringType.get()), + optional(3, "c3", Types.StringType.get())); Table table = 
tables.create(schema, spec, location.toString()); - List expected = Lists.newArrayList( - new ThreeColumnRecord(1, null, "hello"), - new ThreeColumnRecord(2, null, "world"), - new ThreeColumnRecord(3, null, null) - ); + List expected = + Lists.newArrayList( + new ThreeColumnRecord(1, null, "hello"), + new ThreeColumnRecord(2, null, "world"), + new ThreeColumnRecord(3, null, null)); Dataset df = spark.createDataFrame(expected, ThreeColumnRecord.class); - df.select("c1", "c3").write() + df.select("c1", "c3") + .write() .format("iceberg") .option(SparkWriteOptions.WRITE_FORMAT, format.toString()) .mode(SaveMode.Append) @@ -469,11 +464,10 @@ public void testWriteProjectionWithMiddle() throws IOException { table.refresh(); - Dataset result = spark.read() - .format("iceberg") - .load(location.toString()); + Dataset result = spark.read().format("iceberg").load(location.toString()); - List actual = result.orderBy("c1").as(Encoders.bean(ThreeColumnRecord.class)).collectAsList(); + List actual = + result.orderBy("c1").as(Encoders.bean(ThreeColumnRecord.class)).collectAsList(); Assert.assertEquals("Number of rows should match", expected.size(), actual.size()); Assert.assertEquals("Result rows should match", expected, actual); } @@ -487,44 +481,39 @@ public void testViewsReturnRecentResults() throws IOException { PartitionSpec spec = PartitionSpec.builderFor(SCHEMA).identity("data").build(); tables.create(SCHEMA, spec, location.toString()); - List records = Lists.newArrayList( - new SimpleRecord(1, "a"), - new SimpleRecord(2, "b"), - new SimpleRecord(3, "c") - ); + List records = + Lists.newArrayList( + new SimpleRecord(1, "a"), new SimpleRecord(2, "b"), new SimpleRecord(3, "c")); Dataset df = spark.createDataFrame(records, SimpleRecord.class); - df.select("id", "data").write() + df.select("id", "data") + .write() .format("iceberg") .option(SparkWriteOptions.WRITE_FORMAT, format.toString()) .mode(SaveMode.Append) .save(location.toString()); - Dataset query = spark.read() - .format("iceberg") - .load(location.toString()) - .where("id = 1"); + Dataset query = spark.read().format("iceberg").load(location.toString()).where("id = 1"); query.createOrReplaceTempView("tmp"); - List actual1 = spark.table("tmp").as(Encoders.bean(SimpleRecord.class)).collectAsList(); - List expected1 = Lists.newArrayList( - new SimpleRecord(1, "a") - ); + List actual1 = + spark.table("tmp").as(Encoders.bean(SimpleRecord.class)).collectAsList(); + List expected1 = Lists.newArrayList(new SimpleRecord(1, "a")); Assert.assertEquals("Number of rows should match", expected1.size(), actual1.size()); Assert.assertEquals("Result rows should match", expected1, actual1); - df.select("id", "data").write() + df.select("id", "data") + .write() .format("iceberg") .option(SparkWriteOptions.WRITE_FORMAT, format.toString()) .mode(SaveMode.Append) .save(location.toString()); - List actual2 = spark.table("tmp").as(Encoders.bean(SimpleRecord.class)).collectAsList(); - List expected2 = Lists.newArrayList( - new SimpleRecord(1, "a"), - new SimpleRecord(1, "a") - ); + List actual2 = + spark.table("tmp").as(Encoders.bean(SimpleRecord.class)).collectAsList(); + List expected2 = + Lists.newArrayList(new SimpleRecord(1, "a"), new SimpleRecord(1, "a")); Assert.assertEquals("Number of rows should match", expected2.size(), actual2.size()); Assert.assertEquals("Result rows should match", expected2, actual2); } @@ -550,7 +539,9 @@ public void partitionedCreateWithTargetFileSizeViaOption(IcebergOptionsType opti switch (option) { case NONE: - df.select("id", 
"data").sort("data").write() + df.select("id", "data") + .sort("data") + .write() .format("iceberg") .option(SparkWriteOptions.WRITE_FORMAT, format.toString()) .mode(SaveMode.Append) @@ -559,7 +550,8 @@ public void partitionedCreateWithTargetFileSizeViaOption(IcebergOptionsType opti break; case TABLE: table.updateProperties().set(SPARK_WRITE_PARTITIONED_FANOUT_ENABLED, "true").commit(); - df.select("id", "data").write() + df.select("id", "data") + .write() .format("iceberg") .option(SparkWriteOptions.WRITE_FORMAT, format.toString()) .mode(SaveMode.Append) @@ -567,7 +559,8 @@ public void partitionedCreateWithTargetFileSizeViaOption(IcebergOptionsType opti .save(location.toString()); break; case JOB: - df.select("id", "data").write() + df.select("id", "data") + .write() .format("iceberg") .option(SparkWriteOptions.WRITE_FORMAT, format.toString()) .mode(SaveMode.Append) @@ -581,11 +574,10 @@ public void partitionedCreateWithTargetFileSizeViaOption(IcebergOptionsType opti table.refresh(); - Dataset result = spark.read() - .format("iceberg") - .load(location.toString()); + Dataset result = spark.read().format("iceberg").load(location.toString()); - List actual = result.orderBy("id").as(Encoders.bean(SimpleRecord.class)).collectAsList(); + List actual = + result.orderBy("id").as(Encoders.bean(SimpleRecord.class)).collectAsList(); Assert.assertEquals("Number of rows should match", expected.size(), actual.size()); Assert.assertEquals("Result rows should match", expected, actual); @@ -597,7 +589,8 @@ public void partitionedCreateWithTargetFileSizeViaOption(IcebergOptionsType opti } Assert.assertEquals("Should have 8 DataFiles", 8, files.size()); - Assert.assertTrue("All DataFiles contain 1000 rows", files.stream().allMatch(d -> d.recordCount() == 1000)); + Assert.assertTrue( + "All DataFiles contain 1000 rows", files.stream().allMatch(d -> d.recordCount() == 1000)); } @Test @@ -609,20 +602,21 @@ public void testCommitUnknownException() throws IOException { PartitionSpec spec = PartitionSpec.builderFor(SCHEMA).identity("data").build(); Table table = tables.create(SCHEMA, spec, location.toString()); - List records = Lists.newArrayList( - new SimpleRecord(1, "a"), - new SimpleRecord(2, "b"), - new SimpleRecord(3, "c") - ); + List records = + Lists.newArrayList( + new SimpleRecord(1, "a"), new SimpleRecord(2, "b"), new SimpleRecord(3, "c")); Dataset df = spark.createDataFrame(records, SimpleRecord.class); AppendFiles append = table.newFastAppend(); AppendFiles spyAppend = spy(append); - doAnswer(invocation -> { - append.commit(); - throw new CommitStateUnknownException(new RuntimeException("Datacenter on Fire")); - }).when(spyAppend).commit(); + doAnswer( + invocation -> { + append.commit(); + throw new CommitStateUnknownException(new RuntimeException("Datacenter on Fire")); + }) + .when(spyAppend) + .commit(); Table spyTable = spy(table); when(spyTable.newAppend()).thenReturn(spyAppend); @@ -632,20 +626,25 @@ public void testCommitUnknownException() throws IOException { ManualSource.setTable(manualTableName, sparkTable); // Although an exception is thrown here, write and commit have succeeded - AssertHelpers.assertThrowsWithCause("Should throw a Commit State Unknown Exception", + AssertHelpers.assertThrowsWithCause( + "Should throw a Commit State Unknown Exception", SparkException.class, "Writing job aborted", CommitStateUnknownException.class, "Datacenter on Fire", - () -> df.select("id", "data").sort("data").write() - .format("org.apache.iceberg.spark.source.ManualSource") - 
.option(ManualSource.TABLE_NAME, manualTableName) - .mode(SaveMode.Append) - .save(location.toString())); + () -> + df.select("id", "data") + .sort("data") + .write() + .format("org.apache.iceberg.spark.source.ManualSource") + .option(ManualSource.TABLE_NAME, manualTableName) + .mode(SaveMode.Append) + .save(location.toString())); // Since write and commit succeeded, the rows should be readable Dataset result = spark.read().format("iceberg").load(location.toString()); - List actual = result.orderBy("id").as(Encoders.bean(SimpleRecord.class)).collectAsList(); + List actual = + result.orderBy("id").as(Encoders.bean(SimpleRecord.class)).collectAsList(); Assert.assertEquals("Number of rows should match", records.size(), actual.size()); Assert.assertEquals("Result rows should match", records, actual); } diff --git a/spark/v3.0/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkFileWriterFactory.java b/spark/v3.0/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkFileWriterFactory.java index 702e8ab98990..4a3263e368c0 100644 --- a/spark/v3.0/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkFileWriterFactory.java +++ b/spark/v3.0/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkFileWriterFactory.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.source; import java.util.List; @@ -39,9 +38,11 @@ public TestSparkFileWriterFactory(FileFormat fileFormat, boolean partitioned) { } @Override - protected FileWriterFactory newWriterFactory(Schema dataSchema, List equalityFieldIds, - Schema equalityDeleteRowSchema, - Schema positionDeleteRowSchema) { + protected FileWriterFactory newWriterFactory( + Schema dataSchema, + List equalityFieldIds, + Schema equalityDeleteRowSchema, + Schema positionDeleteRowSchema) { return SparkFileWriterFactory.builderFor(table) .dataSchema(table.schema()) .dataFileFormat(format()) diff --git a/spark/v3.0/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkFilesScan.java b/spark/v3.0/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkFilesScan.java index 63195cfd3967..d0959d6866bc 100644 --- a/spark/v3.0/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkFilesScan.java +++ b/spark/v3.0/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkFilesScan.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
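The testCommitUnknownException hunk above wraps a table so that the append really commits but the caller is told the commit state is unknown. A minimal standalone sketch of that Mockito spy/doAnswer pattern, assuming an existing Iceberg Table instance; the helper name and failure message here are illustrative, not part of the patch:

import static org.mockito.Mockito.doAnswer;
import static org.mockito.Mockito.spy;
import static org.mockito.Mockito.when;

import org.apache.iceberg.AppendFiles;
import org.apache.iceberg.Table;
import org.apache.iceberg.exceptions.CommitStateUnknownException;

class CommitStateUnknownSketch {
  // Returns a spy of the given table whose newAppend() commits for real and then
  // throws CommitStateUnknownException, mimicking a lost commit acknowledgement.
  static Table tableWithUnknownCommitState(Table table) {
    AppendFiles append = table.newFastAppend();
    AppendFiles spyAppend = spy(append);
    doAnswer(
            invocation -> {
              append.commit(); // the real commit goes through
              throw new CommitStateUnknownException(new RuntimeException("simulated outage"));
            })
        .when(spyAppend)
        .commit();

    Table spyTable = spy(table);
    when(spyTable.newAppend()).thenReturn(spyAppend);
    return spyTable;
  }
}

A writer handed the spy still produces readable rows after the exception is caught, which is exactly what the test asserts against the reloaded DataFrame.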
*/ - package org.apache.iceberg.spark.source; import java.io.IOException; @@ -53,10 +52,8 @@ public void removeTables() { public void testTaskSetLoading() throws NoSuchTableException, IOException { sql("CREATE TABLE %s (id INT, data STRING) USING iceberg", tableName); - List records = ImmutableList.of( - new SimpleRecord(1, "a"), - new SimpleRecord(2, "b") - ); + List records = + ImmutableList.of(new SimpleRecord(1, "a"), new SimpleRecord(2, "b")); Dataset df = spark.createDataFrame(records, SimpleRecord.class); df.writeTo(tableName).append(); @@ -69,15 +66,19 @@ public void testTaskSetLoading() throws NoSuchTableException, IOException { taskSetManager.stageTasks(table, setID, ImmutableList.copyOf(fileScanTasks)); // load the staged file set - Dataset scanDF = spark.read().format("iceberg") - .option(SparkReadOptions.FILE_SCAN_TASK_SET_ID, setID) - .load(tableName); + Dataset scanDF = + spark + .read() + .format("iceberg") + .option(SparkReadOptions.FILE_SCAN_TASK_SET_ID, setID) + .load(tableName); // write the records back essentially duplicating data scanDF.writeTo(tableName).append(); } - assertEquals("Should have expected rows", + assertEquals( + "Should have expected rows", ImmutableList.of(row(1, "a"), row(1, "a"), row(2, "b"), row(2, "b")), sql("SELECT * FROM %s ORDER BY id", tableName)); } @@ -86,10 +87,8 @@ public void testTaskSetLoading() throws NoSuchTableException, IOException { public void testTaskSetPlanning() throws NoSuchTableException, IOException { sql("CREATE TABLE %s (id INT, data STRING) USING iceberg", tableName); - List records = ImmutableList.of( - new SimpleRecord(1, "a"), - new SimpleRecord(2, "b") - ); + List records = + ImmutableList.of(new SimpleRecord(1, "a"), new SimpleRecord(2, "b")); Dataset df = spark.createDataFrame(records, SimpleRecord.class); df.coalesce(1).writeTo(tableName).append(); df.coalesce(1).writeTo(tableName).append(); @@ -104,17 +103,23 @@ public void testTaskSetPlanning() throws NoSuchTableException, IOException { taskSetManager.stageTasks(table, setID, tasks); // load the staged file set and make sure each file is in a separate split - Dataset scanDF = spark.read().format("iceberg") - .option(SparkReadOptions.FILE_SCAN_TASK_SET_ID, setID) - .option(SparkReadOptions.SPLIT_SIZE, tasks.get(0).file().fileSizeInBytes()) - .load(tableName); + Dataset scanDF = + spark + .read() + .format("iceberg") + .option(SparkReadOptions.FILE_SCAN_TASK_SET_ID, setID) + .option(SparkReadOptions.SPLIT_SIZE, tasks.get(0).file().fileSizeInBytes()) + .load(tableName); Assert.assertEquals("Num partitions should match", 2, scanDF.javaRDD().getNumPartitions()); // load the staged file set and make sure we combine both files into a single split - scanDF = spark.read().format("iceberg") - .option(SparkReadOptions.FILE_SCAN_TASK_SET_ID, setID) - .option(SparkReadOptions.SPLIT_SIZE, Long.MAX_VALUE) - .load(tableName); + scanDF = + spark + .read() + .format("iceberg") + .option(SparkReadOptions.FILE_SCAN_TASK_SET_ID, setID) + .option(SparkReadOptions.SPLIT_SIZE, Long.MAX_VALUE) + .load(tableName); Assert.assertEquals("Num partitions should match", 1, scanDF.javaRDD().getNumPartitions()); } } diff --git a/spark/v3.0/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkMergingMetrics.java b/spark/v3.0/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkMergingMetrics.java index be74d1c5a33b..c3bb35ca7df8 100644 --- a/spark/v3.0/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkMergingMetrics.java +++ 
b/spark/v3.0/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkMergingMetrics.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.source; import java.io.IOException; @@ -42,26 +41,32 @@ public TestSparkMergingMetrics(FileFormat fileFormat) { @Override protected FileAppender writeAndGetAppender(List records) throws IOException { - Table testTable = new BaseTable(null, "dummy") { - @Override - public Map properties() { - return Collections.emptyMap(); - } - @Override - public SortOrder sortOrder() { - return SortOrder.unsorted(); - } - @Override - public PartitionSpec spec() { - return PartitionSpec.unpartitioned(); - } - }; + Table testTable = + new BaseTable(null, "dummy") { + @Override + public Map properties() { + return Collections.emptyMap(); + } + + @Override + public SortOrder sortOrder() { + return SortOrder.unsorted(); + } + + @Override + public PartitionSpec spec() { + return PartitionSpec.unpartitioned(); + } + }; FileAppender appender = - SparkAppenderFactory.builderFor(testTable, SCHEMA, SparkSchemaUtil.convert(SCHEMA)).build() + SparkAppenderFactory.builderFor(testTable, SCHEMA, SparkSchemaUtil.convert(SCHEMA)) + .build() .newAppender(org.apache.iceberg.Files.localOutput(temp.newFile()), fileFormat); try (FileAppender fileAppender = appender) { - records.stream().map(r -> new StructInternalRow(SCHEMA.asStruct()).setStruct(r)).forEach(fileAppender::add); + records.stream() + .map(r -> new StructInternalRow(SCHEMA.asStruct()).setStruct(r)) + .forEach(fileAppender::add); } return appender; } diff --git a/spark/v3.0/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkMetadataColumns.java b/spark/v3.0/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkMetadataColumns.java index b29d281863cb..5ee042f55e66 100644 --- a/spark/v3.0/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkMetadataColumns.java +++ b/spark/v3.0/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkMetadataColumns.java @@ -16,9 +16,13 @@ * specific language governing permissions and limitations * under the License. 
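The writeAndGetAppender hunk above builds a Spark appender against an anonymous BaseTable stub and streams the records into it inside try-with-resources. A simpler hedged sketch of the same append-then-close flow, using the generic appender factory that appears later in this patch instead of SparkAppenderFactory; the file format and names are chosen here for illustration:

import java.io.File;
import java.io.IOException;
import java.util.List;
import org.apache.iceberg.FileFormat;
import org.apache.iceberg.Files;
import org.apache.iceberg.Schema;
import org.apache.iceberg.data.GenericAppenderFactory;
import org.apache.iceberg.data.Record;
import org.apache.iceberg.io.FileAppender;

class AppenderSketch {
  // Appends the records to a local Parquet file and returns the closed appender,
  // whose metrics can then be inspected, mirroring the test's usage.
  static FileAppender<Record> writeRecords(Schema schema, File outputFile, List<Record> records)
      throws IOException {
    FileAppender<Record> appender =
        new GenericAppenderFactory(schema)
            .newAppender(Files.localOutput(outputFile), FileFormat.PARQUET);
    // try-with-resources closes the appender so its metrics are finalized
    try (FileAppender<Record> closing = appender) {
      records.forEach(closing::add);
    }
    return appender;
  }
}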
*/ - package org.apache.iceberg.spark.source; +import static org.apache.iceberg.TableProperties.DEFAULT_FILE_FORMAT; +import static org.apache.iceberg.TableProperties.FORMAT_VERSION; +import static org.apache.iceberg.TableProperties.ORC_VECTORIZATION_ENABLED; +import static org.apache.iceberg.TableProperties.PARQUET_VECTORIZATION_ENABLED; + import java.io.IOException; import java.util.List; import org.apache.iceberg.AssertHelpers; @@ -48,41 +52,37 @@ import org.junit.runner.RunWith; import org.junit.runners.Parameterized; -import static org.apache.iceberg.TableProperties.DEFAULT_FILE_FORMAT; -import static org.apache.iceberg.TableProperties.FORMAT_VERSION; -import static org.apache.iceberg.TableProperties.ORC_VECTORIZATION_ENABLED; -import static org.apache.iceberg.TableProperties.PARQUET_VECTORIZATION_ENABLED; - @RunWith(Parameterized.class) public class TestSparkMetadataColumns extends SparkTestBase { private static final String TABLE_NAME = "test_table"; - private static final Schema SCHEMA = new Schema( - Types.NestedField.required(1, "id", Types.LongType.get()), - Types.NestedField.optional(2, "category", Types.StringType.get()), - Types.NestedField.optional(3, "data", Types.StringType.get()) - ); - private static final PartitionSpec UNKNOWN_SPEC = PartitionSpecParser.fromJson(SCHEMA, - "{ \"spec-id\": 1, \"fields\": [ { \"name\": \"id_zero\", \"transform\": \"zero\", \"source-id\": 1 } ] }"); + private static final Schema SCHEMA = + new Schema( + Types.NestedField.required(1, "id", Types.LongType.get()), + Types.NestedField.optional(2, "category", Types.StringType.get()), + Types.NestedField.optional(3, "data", Types.StringType.get())); + private static final PartitionSpec UNKNOWN_SPEC = + PartitionSpecParser.fromJson( + SCHEMA, + "{ \"spec-id\": 1, \"fields\": [ { \"name\": \"id_zero\", \"transform\": \"zero\", \"source-id\": 1 } ] }"); @Parameterized.Parameters(name = "fileFormat = {0}, vectorized = {1}, formatVersion = {2}") public static Object[][] parameters() { return new Object[][] { - { FileFormat.PARQUET, false, 1}, - { FileFormat.PARQUET, true, 1}, - { FileFormat.PARQUET, false, 2}, - { FileFormat.PARQUET, true, 2}, - { FileFormat.AVRO, false, 1}, - { FileFormat.AVRO, false, 2}, - { FileFormat.ORC, false, 1}, - { FileFormat.ORC, true, 1}, - { FileFormat.ORC, false, 2}, - { FileFormat.ORC, true, 2}, + {FileFormat.PARQUET, false, 1}, + {FileFormat.PARQUET, true, 1}, + {FileFormat.PARQUET, false, 2}, + {FileFormat.PARQUET, true, 2}, + {FileFormat.AVRO, false, 1}, + {FileFormat.AVRO, false, 2}, + {FileFormat.ORC, false, 1}, + {FileFormat.ORC, true, 1}, + {FileFormat.ORC, false, 2}, + {FileFormat.ORC, true, 2}, }; } - @Rule - public TemporaryFolder temp = new TemporaryFolder(); + @Rule public TemporaryFolder temp = new TemporaryFolder(); private final FileFormat fileFormat; private final boolean vectorized; @@ -98,13 +98,16 @@ public TestSparkMetadataColumns(FileFormat fileFormat, boolean vectorized, int f @BeforeClass public static void setupSpark() { - ImmutableMap config = ImmutableMap.of( - "type", "hive", - "default-namespace", "default", - "cache-enabled", "true" - ); - spark.conf().set("spark.sql.catalog.spark_catalog", "org.apache.iceberg.spark.source.TestSparkCatalog"); - config.forEach((key, value) -> spark.conf().set("spark.sql.catalog.spark_catalog." 
+ key, value)); + ImmutableMap config = + ImmutableMap.of( + "type", "hive", + "default-namespace", "default", + "cache-enabled", "true"); + spark + .conf() + .set("spark.sql.catalog.spark_catalog", "org.apache.iceberg.spark.source.TestSparkCatalog"); + config.forEach( + (key, value) -> spark.conf().set("spark.sql.catalog.spark_catalog." + key, value)); } @Before @@ -127,36 +130,32 @@ public void testSpecAndPartitionMetadataColumns() { sql("INSERT INTO TABLE %s VALUES (1, 'a1', 'b1')", TABLE_NAME); table.refresh(); - table.updateSpec() - .addField("data") - .commit(); + table.updateSpec().addField("data").commit(); sql("INSERT INTO TABLE %s VALUES (1, 'a1', 'b1')", TABLE_NAME); table.refresh(); - table.updateSpec() - .addField(Expressions.bucket("category", 8)) - .commit(); + table.updateSpec().addField(Expressions.bucket("category", 8)).commit(); sql("INSERT INTO TABLE %s VALUES (1, 'a1', 'b1')", TABLE_NAME); table.refresh(); - table.updateSpec() - .removeField("data") - .commit(); + table.updateSpec().removeField("data").commit(); sql("INSERT INTO TABLE %s VALUES (1, 'a1', 'b1')", TABLE_NAME); table.refresh(); - table.updateSpec() - .renameField("category_bucket_8", "category_bucket_8_another_name") - .commit(); - - List expected = ImmutableList.of( - row(0, row(null, null)), - row(1, row("b1", null)), - row(2, row("b1", 2)), - row(3, row(null, 2)) - ); - assertEquals("Rows must match", expected, - sql("SELECT _spec_id, _partition FROM `%s$_spec_id,_partition` ORDER BY _spec_id", TABLE_NAME)); + table.updateSpec().renameField("category_bucket_8", "category_bucket_8_another_name").commit(); + + List expected = + ImmutableList.of( + row(0, row(null, null)), + row(1, row("b1", null)), + row(2, row("b1", 2)), + row(3, row(null, 2))); + assertEquals( + "Rows must match", + expected, + sql( + "SELECT _spec_id, _partition FROM `%s$_spec_id,_partition` ORDER BY _spec_id", + TABLE_NAME)); } @Test @@ -166,13 +165,16 @@ public void testPartitionMetadataColumnWithUnknownTransforms() { TableMetadata base = ops.current(); ops.commit(base, base.updatePartitionSpec(UNKNOWN_SPEC)); - AssertHelpers.assertThrows("Should fail to query the partition metadata column", - ValidationException.class, "Cannot build table partition type, unknown transforms", + AssertHelpers.assertThrows( + "Should fail to query the partition metadata column", + ValidationException.class, + "Cannot build table partition type, unknown transforms", () -> sql("SELECT _partition FROM `%s$_partition`", TABLE_NAME)); } private void createAndInitTable() throws IOException { - this.table = TestTables.create(temp.newFolder(), TABLE_NAME, SCHEMA, PartitionSpec.unpartitioned()); + this.table = + TestTables.create(temp.newFolder(), TABLE_NAME, SCHEMA, PartitionSpec.unpartitioned()); UpdateProperties updateProperties = table.updateProperties(); updateProperties.set(FORMAT_VERSION, String.valueOf(formatVersion)); @@ -186,7 +188,8 @@ private void createAndInitTable() throws IOException { updateProperties.set(ORC_VECTORIZATION_ENABLED, String.valueOf(vectorized)); break; default: - Preconditions.checkState(!vectorized, "File format %s does not support vectorized reads", fileFormat); + Preconditions.checkState( + !vectorized, "File format %s does not support vectorized reads", fileFormat); } updateProperties.commit(); diff --git a/spark/v3.0/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkPartitioningWriters.java b/spark/v3.0/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkPartitioningWriters.java index 
4d07cfbe86ea..276d8c632fc0 100644 --- a/spark/v3.0/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkPartitioningWriters.java +++ b/spark/v3.0/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkPartitioningWriters.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.source; import java.util.List; @@ -39,9 +38,11 @@ public TestSparkPartitioningWriters(FileFormat fileFormat) { } @Override - protected FileWriterFactory newWriterFactory(Schema dataSchema, List equalityFieldIds, - Schema equalityDeleteRowSchema, - Schema positionDeleteRowSchema) { + protected FileWriterFactory newWriterFactory( + Schema dataSchema, + List equalityFieldIds, + Schema equalityDeleteRowSchema, + Schema positionDeleteRowSchema) { return SparkFileWriterFactory.builderFor(table) .dataSchema(table.schema()) .dataFileFormat(format()) diff --git a/spark/v3.0/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkPositionDeltaWriters.java b/spark/v3.0/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkPositionDeltaWriters.java index 480448e13a8f..245c392774f5 100644 --- a/spark/v3.0/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkPositionDeltaWriters.java +++ b/spark/v3.0/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkPositionDeltaWriters.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.source; import java.util.List; @@ -39,9 +38,11 @@ public TestSparkPositionDeltaWriters(FileFormat fileFormat) { } @Override - protected FileWriterFactory newWriterFactory(Schema dataSchema, List equalityFieldIds, - Schema equalityDeleteRowSchema, - Schema positionDeleteRowSchema) { + protected FileWriterFactory newWriterFactory( + Schema dataSchema, + List equalityFieldIds, + Schema equalityDeleteRowSchema, + Schema positionDeleteRowSchema) { return SparkFileWriterFactory.builderFor(table) .dataSchema(table.schema()) .dataFileFormat(format()) diff --git a/spark/v3.0/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkReadProjection.java b/spark/v3.0/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkReadProjection.java index f42b48d0e30d..7d6f0e76f78f 100644 --- a/spark/v3.0/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkReadProjection.java +++ b/spark/v3.0/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkReadProjection.java @@ -16,9 +16,12 @@ * specific language governing permissions and limitations * under the License. 
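The startSpark()/setupSpark() hunks around this point register an Iceberg implementation for the Spark session catalog and then copy a small property map onto it. A minimal sketch of that wiring; the production SparkSessionCatalog class stands in here for the test-only TestSparkCatalog used in the patch:

import org.apache.spark.sql.SparkSession;

class SessionCatalogConfigSketch {
  static void configureIcebergSessionCatalog(SparkSession spark) {
    // Route spark_catalog through Iceberg's session catalog implementation
    spark
        .conf()
        .set("spark.sql.catalog.spark_catalog", "org.apache.iceberg.spark.SparkSessionCatalog");
    // Per-catalog properties use the "spark.sql.catalog.<name>." prefix
    spark.conf().set("spark.sql.catalog.spark_catalog.type", "hive");
    spark.conf().set("spark.sql.catalog.spark_catalog.default-namespace", "default");
    spark.conf().set("spark.sql.catalog.spark_catalog.cache-enabled", "false");
  }
}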
*/ - package org.apache.iceberg.spark.source; +import static org.apache.iceberg.Files.localOutput; +import static org.apache.iceberg.types.Types.NestedField.optional; +import static org.apache.iceberg.types.Types.NestedField.required; + import java.io.File; import java.io.IOException; import java.util.List; @@ -51,10 +54,6 @@ import org.junit.runner.RunWith; import org.junit.runners.Parameterized; -import static org.apache.iceberg.Files.localOutput; -import static org.apache.iceberg.types.Types.NestedField.optional; -import static org.apache.iceberg.types.Types.NestedField.required; - @RunWith(Parameterized.class) public class TestSparkReadProjection extends TestReadProjection { @@ -63,11 +62,11 @@ public class TestSparkReadProjection extends TestReadProjection { @Parameterized.Parameters(name = "format = {0}, vectorized = {1}") public static Object[][] parameters() { return new Object[][] { - { "parquet", false }, - { "parquet", true }, - { "avro", false }, - { "orc", false }, - { "orc", true } + {"parquet", false}, + {"parquet", true}, + {"avro", false}, + {"orc", false}, + {"orc", true} }; } @@ -83,14 +82,17 @@ public TestSparkReadProjection(String format, boolean vectorized) { @BeforeClass public static void startSpark() { TestSparkReadProjection.spark = SparkSession.builder().master("local[2]").getOrCreate(); - ImmutableMap config = ImmutableMap.of( - "type", "hive", - "default-namespace", "default", - "parquet-enabled", "true", - "cache-enabled", "false" - ); - spark.conf().set("spark.sql.catalog.spark_catalog", "org.apache.iceberg.spark.source.TestSparkCatalog"); - config.forEach((key, value) -> spark.conf().set("spark.sql.catalog.spark_catalog." + key, value)); + ImmutableMap config = + ImmutableMap.of( + "type", "hive", + "default-namespace", "default", + "parquet-enabled", "true", + "cache-enabled", "false"); + spark + .conf() + .set("spark.sql.catalog.spark_catalog", "org.apache.iceberg.spark.source.TestSparkCatalog"); + config.forEach( + (key, value) -> spark.conf().set("spark.sql.catalog.spark_catalog." + key, value)); } @AfterClass @@ -101,8 +103,8 @@ public static void stopSpark() { } @Override - protected Record writeAndRead(String desc, Schema writeSchema, Schema readSchema, - Record record) throws IOException { + protected Record writeAndRead(String desc, Schema writeSchema, Schema readSchema, Record record) + throws IOException { File parent = temp.newFolder(desc); File location = new File(parent, "test"); File dataFolder = new File(location, "data"); @@ -116,16 +118,17 @@ protected Record writeAndRead(String desc, Schema writeSchema, Schema readSchema // When tables are created, the column ids are reassigned. 
Schema tableSchema = table.schema(); - try (FileAppender writer = new GenericAppenderFactory(tableSchema).newAppender( - localOutput(testFile), format)) { + try (FileAppender writer = + new GenericAppenderFactory(tableSchema).newAppender(localOutput(testFile), format)) { writer.add(record); } - DataFile file = DataFiles.builder(PartitionSpec.unpartitioned()) - .withRecordCount(100) - .withFileSizeInBytes(testFile.length()) - .withPath(testFile.toString()) - .build(); + DataFile file = + DataFiles.builder(PartitionSpec.unpartitioned()) + .withRecordCount(100) + .withFileSizeInBytes(testFile.length()) + .withPath(testFile.toString()) + .build(); table.newAppend().appendFile(file).commit(); @@ -139,14 +142,16 @@ protected Record writeAndRead(String desc, Schema writeSchema, Schema readSchema Schema expectedSchema = reassignIds(readSchema, idMapping); // Set the schema to the expected schema directly to simulate the table schema evolving - TestTables.replaceMetadata(desc, - TestTables.readMetadata(desc).updateSchema(expectedSchema, 100)); + TestTables.replaceMetadata( + desc, TestTables.readMetadata(desc).updateSchema(expectedSchema, 100)); - Dataset df = spark.read() - .format("org.apache.iceberg.spark.source.TestIcebergSource") - .option("iceberg.table.name", desc) - .option(SparkReadOptions.VECTORIZATION_ENABLED, String.valueOf(vectorized)) - .load(); + Dataset df = + spark + .read() + .format("org.apache.iceberg.spark.source.TestIcebergSource") + .option("iceberg.table.name", desc) + .option(SparkReadOptions.VECTORIZATION_ENABLED, String.valueOf(vectorized)) + .load(); return SparkValueConverter.convert(readSchema, df.collectAsList().get(0)); @@ -157,87 +162,98 @@ protected Record writeAndRead(String desc, Schema writeSchema, Schema readSchema private List allIds(Schema schema) { List ids = Lists.newArrayList(); - TypeUtil.visit(schema, new TypeUtil.SchemaVisitor() { - @Override - public Void field(Types.NestedField field, Void fieldResult) { - ids.add(field.fieldId()); - return null; - } + TypeUtil.visit( + schema, + new TypeUtil.SchemaVisitor() { + @Override + public Void field(Types.NestedField field, Void fieldResult) { + ids.add(field.fieldId()); + return null; + } - @Override - public Void list(Types.ListType list, Void elementResult) { - ids.add(list.elementId()); - return null; - } + @Override + public Void list(Types.ListType list, Void elementResult) { + ids.add(list.elementId()); + return null; + } - @Override - public Void map(Types.MapType map, Void keyResult, Void valueResult) { - ids.add(map.keyId()); - ids.add(map.valueId()); - return null; - } - }); + @Override + public Void map(Types.MapType map, Void keyResult, Void valueResult) { + ids.add(map.keyId()); + ids.add(map.valueId()); + return null; + } + }); return ids; } private Schema reassignIds(Schema schema, Map idMapping) { - return new Schema(TypeUtil.visit(schema, new TypeUtil.SchemaVisitor() { - private int mapId(int id) { - if (idMapping.containsKey(id)) { - return idMapping.get(id); - } - return 1000 + id; // make sure the new IDs don't conflict with reassignment - } + return new Schema( + TypeUtil.visit( + schema, + new TypeUtil.SchemaVisitor() { + private int mapId(int id) { + if (idMapping.containsKey(id)) { + return idMapping.get(id); + } + return 1000 + id; // make sure the new IDs don't conflict with reassignment + } - @Override - public Type schema(Schema schema, Type structResult) { - return structResult; - } + @Override + public Type schema(Schema schema, Type structResult) { + return structResult; + } - 
@Override - public Type struct(Types.StructType struct, List fieldResults) { - List newFields = Lists.newArrayListWithExpectedSize(fieldResults.size()); - List fields = struct.fields(); - for (int i = 0; i < fields.size(); i += 1) { - Types.NestedField field = fields.get(i); - if (field.isOptional()) { - newFields.add(optional(mapId(field.fieldId()), field.name(), fieldResults.get(i))); - } else { - newFields.add(required(mapId(field.fieldId()), field.name(), fieldResults.get(i))); - } - } - return Types.StructType.of(newFields); - } + @Override + public Type struct(Types.StructType struct, List fieldResults) { + List newFields = + Lists.newArrayListWithExpectedSize(fieldResults.size()); + List fields = struct.fields(); + for (int i = 0; i < fields.size(); i += 1) { + Types.NestedField field = fields.get(i); + if (field.isOptional()) { + newFields.add( + optional(mapId(field.fieldId()), field.name(), fieldResults.get(i))); + } else { + newFields.add( + required(mapId(field.fieldId()), field.name(), fieldResults.get(i))); + } + } + return Types.StructType.of(newFields); + } - @Override - public Type field(Types.NestedField field, Type fieldResult) { - return fieldResult; - } + @Override + public Type field(Types.NestedField field, Type fieldResult) { + return fieldResult; + } - @Override - public Type list(Types.ListType list, Type elementResult) { - if (list.isElementOptional()) { - return Types.ListType.ofOptional(mapId(list.elementId()), elementResult); - } else { - return Types.ListType.ofRequired(mapId(list.elementId()), elementResult); - } - } + @Override + public Type list(Types.ListType list, Type elementResult) { + if (list.isElementOptional()) { + return Types.ListType.ofOptional(mapId(list.elementId()), elementResult); + } else { + return Types.ListType.ofRequired(mapId(list.elementId()), elementResult); + } + } - @Override - public Type map(Types.MapType map, Type keyResult, Type valueResult) { - if (map.isValueOptional()) { - return Types.MapType.ofOptional( - mapId(map.keyId()), mapId(map.valueId()), keyResult, valueResult); - } else { - return Types.MapType.ofRequired( - mapId(map.keyId()), mapId(map.valueId()), keyResult, valueResult); - } - } + @Override + public Type map(Types.MapType map, Type keyResult, Type valueResult) { + if (map.isValueOptional()) { + return Types.MapType.ofOptional( + mapId(map.keyId()), mapId(map.valueId()), keyResult, valueResult); + } else { + return Types.MapType.ofRequired( + mapId(map.keyId()), mapId(map.valueId()), keyResult, valueResult); + } + } - @Override - public Type primitive(Type.PrimitiveType primitive) { - return primitive; - } - }).asNestedType().asStructType().fields()); + @Override + public Type primitive(Type.PrimitiveType primitive) { + return primitive; + } + }) + .asNestedType() + .asStructType() + .fields()); } } diff --git a/spark/v3.0/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkReaderDeletes.java b/spark/v3.0/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkReaderDeletes.java index e543a408e8ce..462f34530725 100644 --- a/spark/v3.0/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkReaderDeletes.java +++ b/spark/v3.0/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkReaderDeletes.java @@ -16,9 +16,10 @@ * specific language governing permissions and limitations * under the License. 
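The allIds and reassignIds helpers reformatted above both rely on TypeUtil.visit with a SchemaVisitor, and any callback they do not override falls through to the visitor's defaults. A small hedged sketch of that visitor pattern, collecting only struct field ids; the class and method names here are illustrative:

import java.util.ArrayList;
import java.util.List;
import org.apache.iceberg.Schema;
import org.apache.iceberg.types.TypeUtil;
import org.apache.iceberg.types.Types;

class FieldIdCollectorSketch {
  // Walks the schema and records every struct field id; list and map ids are ignored,
  // unlike allIds() in the test, which also collects element, key, and value ids.
  static List<Integer> structFieldIds(Schema schema) {
    List<Integer> ids = new ArrayList<>();
    TypeUtil.visit(
        schema,
        new TypeUtil.SchemaVisitor<Void>() {
          @Override
          public Void field(Types.NestedField field, Void fieldResult) {
            ids.add(field.fieldId());
            return null;
          }
        });
    return ids;
  }
}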
*/ - package org.apache.iceberg.spark.source; +import static org.apache.hadoop.hive.conf.HiveConf.ConfVars.METASTOREURIS; + import java.io.IOException; import java.util.List; import org.apache.hadoop.hive.conf.HiveConf; @@ -60,8 +61,6 @@ import org.junit.BeforeClass; import org.junit.Test; -import static org.apache.hadoop.hive.conf.HiveConf.ConfVars.METASTOREURIS; - public class TestSparkReaderDeletes extends DeleteReadTests { private static TestHiveMetastore metastore = null; @@ -74,15 +73,18 @@ public static void startMetastoreAndSpark() { metastore.start(); HiveConf hiveConf = metastore.hiveConf(); - spark = SparkSession.builder() - .master("local[2]") - .config(SQLConf.PARTITION_OVERWRITE_MODE().key(), "dynamic") - .config("spark.hadoop." + METASTOREURIS.varname, hiveConf.get(METASTOREURIS.varname)) - .enableHiveSupport() - .getOrCreate(); + spark = + SparkSession.builder() + .master("local[2]") + .config(SQLConf.PARTITION_OVERWRITE_MODE().key(), "dynamic") + .config("spark.hadoop." + METASTOREURIS.varname, hiveConf.get(METASTOREURIS.varname)) + .enableHiveSupport() + .getOrCreate(); - catalog = (HiveCatalog) - CatalogUtil.loadCatalog(HiveCatalog.class.getName(), "hive", ImmutableMap.of(), hiveConf); + catalog = + (HiveCatalog) + CatalogUtil.loadCatalog( + HiveCatalog.class.getName(), "hive", ImmutableMap.of(), hiveConf); try { catalog.createNamespace(Namespace.of("default")); @@ -117,17 +119,21 @@ protected void dropTable(String name) { @Override public StructLikeSet rowSet(String name, Table table, String... columns) { - Dataset df = spark.read() - .format("iceberg") - .load(TableIdentifier.of("default", name).toString()) - .selectExpr(columns); + Dataset df = + spark + .read() + .format("iceberg") + .load(TableIdentifier.of("default", name).toString()) + .selectExpr(columns); Types.StructType projection = table.schema().select(columns).asStruct(); StructLikeSet set = StructLikeSet.create(projection); - df.collectAsList().forEach(row -> { - SparkStructLike rowWrapper = new SparkStructLike(projection); - set.add(rowWrapper.wrap(row)); - }); + df.collectAsList() + .forEach( + row -> { + SparkStructLike rowWrapper = new SparkStructLike(projection); + set.add(rowWrapper.wrap(row)); + }); return set; } @@ -137,31 +143,39 @@ public void testEqualityDeleteWithFilter() throws IOException { String tableName = table.name().substring(table.name().lastIndexOf(".") + 1); Schema deleteRowSchema = table.schema().select("data"); Record dataDelete = GenericRecord.create(deleteRowSchema); - List dataDeletes = Lists.newArrayList( - dataDelete.copy("data", "a"), // id = 29 - dataDelete.copy("data", "d"), // id = 89 - dataDelete.copy("data", "g") // id = 122 - ); - - DeleteFile eqDeletes = FileHelpers.writeDeleteFile( - table, Files.localOutput(temp.newFile()), TestHelpers.Row.of(0), dataDeletes, deleteRowSchema); - - table.newRowDelta() - .addDeletes(eqDeletes) - .commit(); + List dataDeletes = + Lists.newArrayList( + dataDelete.copy("data", "a"), // id = 29 + dataDelete.copy("data", "d"), // id = 89 + dataDelete.copy("data", "g") // id = 122 + ); + + DeleteFile eqDeletes = + FileHelpers.writeDeleteFile( + table, + Files.localOutput(temp.newFile()), + TestHelpers.Row.of(0), + dataDeletes, + deleteRowSchema); + + table.newRowDelta().addDeletes(eqDeletes).commit(); Types.StructType projection = table.schema().select("*").asStruct(); - Dataset df = spark.read() - .format("iceberg") - .load(TableIdentifier.of("default", tableName).toString()) - .filter("data = 'a'") // select a deleted row - 
.selectExpr("*"); + Dataset df = + spark + .read() + .format("iceberg") + .load(TableIdentifier.of("default", tableName).toString()) + .filter("data = 'a'") // select a deleted row + .selectExpr("*"); StructLikeSet actual = StructLikeSet.create(projection); - df.collectAsList().forEach(row -> { - SparkStructLike rowWrapper = new SparkStructLike(projection); - actual.add(rowWrapper.wrap(row)); - }); + df.collectAsList() + .forEach( + row -> { + SparkStructLike rowWrapper = new SparkStructLike(projection); + actual.add(rowWrapper.wrap(row)); + }); Assert.assertEquals("Table should contain no rows", 0, actual.size()); } @@ -170,44 +184,57 @@ public void testEqualityDeleteWithFilter() throws IOException { public void testReadEqualityDeleteRows() throws IOException { Schema deleteSchema1 = table.schema().select("data"); Record dataDelete = GenericRecord.create(deleteSchema1); - List dataDeletes = Lists.newArrayList( - dataDelete.copy("data", "a"), // id = 29 - dataDelete.copy("data", "d") // id = 89 - ); + List dataDeletes = + Lists.newArrayList( + dataDelete.copy("data", "a"), // id = 29 + dataDelete.copy("data", "d") // id = 89 + ); Schema deleteSchema2 = table.schema().select("id"); Record idDelete = GenericRecord.create(deleteSchema2); - List idDeletes = Lists.newArrayList( - idDelete.copy("id", 121), // id = 121 - idDelete.copy("id", 122) // id = 122 - ); - - DeleteFile eqDelete1 = FileHelpers.writeDeleteFile( - table, Files.localOutput(temp.newFile()), TestHelpers.Row.of(0), dataDeletes, deleteSchema1); - - DeleteFile eqDelete2 = FileHelpers.writeDeleteFile( - table, Files.localOutput(temp.newFile()), TestHelpers.Row.of(0), idDeletes, deleteSchema2); - - table.newRowDelta() - .addDeletes(eqDelete1) - .addDeletes(eqDelete2) - .commit(); + List idDeletes = + Lists.newArrayList( + idDelete.copy("id", 121), // id = 121 + idDelete.copy("id", 122) // id = 122 + ); + + DeleteFile eqDelete1 = + FileHelpers.writeDeleteFile( + table, + Files.localOutput(temp.newFile()), + TestHelpers.Row.of(0), + dataDeletes, + deleteSchema1); + + DeleteFile eqDelete2 = + FileHelpers.writeDeleteFile( + table, + Files.localOutput(temp.newFile()), + TestHelpers.Row.of(0), + idDeletes, + deleteSchema2); + + table.newRowDelta().addDeletes(eqDelete1).addDeletes(eqDelete2).commit(); StructLikeSet expectedRowSet = rowSetWithIds(29, 89, 121, 122); Types.StructType type = table.schema().asStruct(); StructLikeSet actualRowSet = StructLikeSet.create(type); - CloseableIterable tasks = TableScanUtil.planTasks( - table.newScan().planFiles(), - TableProperties.METADATA_SPLIT_SIZE_DEFAULT, - TableProperties.SPLIT_LOOKBACK_DEFAULT, - TableProperties.SPLIT_OPEN_FILE_COST_DEFAULT); + CloseableIterable tasks = + TableScanUtil.planTasks( + table.newScan().planFiles(), + TableProperties.METADATA_SPLIT_SIZE_DEFAULT, + TableProperties.SPLIT_LOOKBACK_DEFAULT, + TableProperties.SPLIT_OPEN_FILE_COST_DEFAULT); for (CombinedScanTask task : tasks) { - try (EqualityDeleteRowReader reader = new EqualityDeleteRowReader(task, table, table.schema(), false)) { + try (EqualityDeleteRowReader reader = + new EqualityDeleteRowReader(task, table, table.schema(), false)) { while (reader.next()) { - actualRowSet.add(new InternalRowWrapper(SparkSchemaUtil.convert(table.schema())).wrap(reader.get().copy())); + actualRowSet.add( + new InternalRowWrapper(SparkSchemaUtil.convert(table.schema())) + .wrap(reader.get().copy())); } } } diff --git a/spark/v3.0/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkRollingFileWriters.java 
b/spark/v3.0/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkRollingFileWriters.java index 9023195dcc6a..dcf9140a8885 100644 --- a/spark/v3.0/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkRollingFileWriters.java +++ b/spark/v3.0/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkRollingFileWriters.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.source; import java.util.List; @@ -36,9 +35,11 @@ public TestSparkRollingFileWriters(FileFormat fileFormat, boolean partitioned) { } @Override - protected FileWriterFactory newWriterFactory(Schema dataSchema, List equalityFieldIds, - Schema equalityDeleteRowSchema, - Schema positionDeleteRowSchema) { + protected FileWriterFactory newWriterFactory( + Schema dataSchema, + List equalityFieldIds, + Schema equalityDeleteRowSchema, + Schema positionDeleteRowSchema) { return SparkFileWriterFactory.builderFor(table) .dataSchema(table.schema()) .dataFileFormat(format()) diff --git a/spark/v3.0/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkTable.java b/spark/v3.0/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkTable.java index 1b4fb5f8ce58..616a196872de 100644 --- a/spark/v3.0/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkTable.java +++ b/spark/v3.0/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkTable.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.source; import java.util.Map; diff --git a/spark/v3.0/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkWriterMetrics.java b/spark/v3.0/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkWriterMetrics.java index 967f394faa74..06ecc20c2fc3 100644 --- a/spark/v3.0/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkWriterMetrics.java +++ b/spark/v3.0/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkWriterMetrics.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.source; import org.apache.iceberg.FileFormat; diff --git a/spark/v3.0/spark/src/test/java/org/apache/iceberg/spark/source/TestStreamingOffset.java b/spark/v3.0/spark/src/test/java/org/apache/iceberg/spark/source/TestStreamingOffset.java index 69302e9d24d7..17370aaa22f2 100644 --- a/spark/v3.0/spark/src/test/java/org/apache/iceberg/spark/source/TestStreamingOffset.java +++ b/spark/v3.0/spark/src/test/java/org/apache/iceberg/spark/source/TestStreamingOffset.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
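The TestStreamingOffset hunk just below asserts that offsets survive a JSON round trip. A hedged sketch of that round trip, assuming the sketch lives in the same org.apache.iceberg.spark.source package as the test; the offset values are illustrative:

package org.apache.iceberg.spark.source;

class StreamingOffsetRoundTripSketch {
  // Serializes an offset to JSON and parses it back; equals() is what the
  // test's assertArrayEquals ultimately relies on.
  static boolean roundTrips(long snapshotId, long position, boolean scanAllFiles) {
    StreamingOffset original = new StreamingOffset(snapshotId, position, scanAllFiles);
    StreamingOffset restored = StreamingOffset.fromJson(original.json());
    return original.equals(restored);
  }
}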
*/ - package org.apache.iceberg.spark.source; import com.fasterxml.jackson.databind.node.ObjectNode; @@ -29,14 +28,17 @@ public class TestStreamingOffset { @Test public void testJsonConversion() { - StreamingOffset[] expected = new StreamingOffset[]{ - new StreamingOffset(System.currentTimeMillis(), 1L, false), - new StreamingOffset(System.currentTimeMillis(), 2L, false), - new StreamingOffset(System.currentTimeMillis(), 3L, false), - new StreamingOffset(System.currentTimeMillis(), 4L, true) - }; - Assert.assertArrayEquals("StreamingOffsets should match", expected, - Arrays.stream(expected).map(elem -> StreamingOffset.fromJson(elem.json())).toArray()); + StreamingOffset[] expected = + new StreamingOffset[] { + new StreamingOffset(System.currentTimeMillis(), 1L, false), + new StreamingOffset(System.currentTimeMillis(), 2L, false), + new StreamingOffset(System.currentTimeMillis(), 3L, false), + new StreamingOffset(System.currentTimeMillis(), 4L, true) + }; + Assert.assertArrayEquals( + "StreamingOffsets should match", + expected, + Arrays.stream(expected).map(elem -> StreamingOffset.fromJson(elem.json())).toArray()); } @Test diff --git a/spark/v3.0/spark/src/test/java/org/apache/iceberg/spark/source/TestStructuredStreaming.java b/spark/v3.0/spark/src/test/java/org/apache/iceberg/spark/source/TestStructuredStreaming.java index 4225747dcab4..73e95d102875 100644 --- a/spark/v3.0/spark/src/test/java/org/apache/iceberg/spark/source/TestStructuredStreaming.java +++ b/spark/v3.0/spark/src/test/java/org/apache/iceberg/spark/source/TestStructuredStreaming.java @@ -16,9 +16,10 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.source; +import static org.apache.iceberg.types.Types.NestedField.optional; + import java.io.File; import java.util.List; import org.apache.hadoop.conf.Configuration; @@ -49,28 +50,24 @@ import scala.Option; import scala.collection.JavaConversions; -import static org.apache.iceberg.types.Types.NestedField.optional; - public class TestStructuredStreaming { private static final Configuration CONF = new Configuration(); - private static final Schema SCHEMA = new Schema( - optional(1, "id", Types.IntegerType.get()), - optional(2, "data", Types.StringType.get()) - ); + private static final Schema SCHEMA = + new Schema( + optional(1, "id", Types.IntegerType.get()), optional(2, "data", Types.StringType.get())); private static SparkSession spark = null; - @Rule - public TemporaryFolder temp = new TemporaryFolder(); - @Rule - public ExpectedException exceptionRule = ExpectedException.none(); + @Rule public TemporaryFolder temp = new TemporaryFolder(); + @Rule public ExpectedException exceptionRule = ExpectedException.none(); @BeforeClass public static void startSpark() { - TestStructuredStreaming.spark = SparkSession.builder() - .master("local[2]") - .config("spark.sql.shuffle.partitions", 4) - .getOrCreate(); + TestStructuredStreaming.spark = + SparkSession.builder() + .master("local[2]") + .config("spark.sql.shuffle.partitions", 4) + .getOrCreate(); } @AfterClass @@ -90,21 +87,23 @@ public void testStreamingWriteAppendMode() throws Exception { PartitionSpec spec = PartitionSpec.builderFor(SCHEMA).identity("data").build(); Table table = tables.create(SCHEMA, spec, location.toString()); - List expected = Lists.newArrayList( - new SimpleRecord(1, "1"), - new SimpleRecord(2, "2"), - new SimpleRecord(3, "3"), - new SimpleRecord(4, "4") - ); + List expected = + Lists.newArrayList( + new SimpleRecord(1, "1"), + new 
SimpleRecord(2, "2"), + new SimpleRecord(3, "3"), + new SimpleRecord(4, "4")); MemoryStream inputStream = newMemoryStream(1, spark.sqlContext(), Encoders.INT()); - DataStreamWriter streamWriter = inputStream.toDF() - .selectExpr("value AS id", "CAST (value AS STRING) AS data") - .writeStream() - .outputMode("append") - .format("iceberg") - .option("checkpointLocation", checkpoint.toString()) - .option("path", location.toString()); + DataStreamWriter streamWriter = + inputStream + .toDF() + .selectExpr("value AS id", "CAST (value AS STRING) AS data") + .writeStream() + .outputMode("append") + .format("iceberg") + .option("checkpointLocation", checkpoint.toString()) + .option("path", location.toString()); try { // start the original query with checkpointing @@ -126,10 +125,9 @@ public void testStreamingWriteAppendMode() throws Exception { restartedQuery.processAllAvailable(); // ensure the write was idempotent - Dataset result = spark.read() - .format("iceberg") - .load(location.toString()); - List actual = result.orderBy("id").as(Encoders.bean(SimpleRecord.class)).collectAsList(); + Dataset result = spark.read().format("iceberg").load(location.toString()); + List actual = + result.orderBy("id").as(Encoders.bean(SimpleRecord.class)).collectAsList(); Assert.assertEquals("Number of rows should match", expected.size(), actual.size()); Assert.assertEquals("Result rows should match", expected, actual); Assert.assertEquals("Number of snapshots should match", 2, Iterables.size(table.snapshots())); @@ -150,22 +148,22 @@ public void testStreamingWriteCompleteMode() throws Exception { PartitionSpec spec = PartitionSpec.builderFor(SCHEMA).identity("data").build(); Table table = tables.create(SCHEMA, spec, location.toString()); - List expected = Lists.newArrayList( - new SimpleRecord(2, "1"), - new SimpleRecord(3, "2"), - new SimpleRecord(1, "3") - ); + List expected = + Lists.newArrayList( + new SimpleRecord(2, "1"), new SimpleRecord(3, "2"), new SimpleRecord(1, "3")); MemoryStream inputStream = newMemoryStream(1, spark.sqlContext(), Encoders.INT()); - DataStreamWriter streamWriter = inputStream.toDF() - .groupBy("value") - .count() - .selectExpr("CAST(count AS INT) AS id", "CAST (value AS STRING) AS data") - .writeStream() - .outputMode("complete") - .format("iceberg") - .option("checkpointLocation", checkpoint.toString()) - .option("path", location.toString()); + DataStreamWriter streamWriter = + inputStream + .toDF() + .groupBy("value") + .count() + .selectExpr("CAST(count AS INT) AS id", "CAST (value AS STRING) AS data") + .writeStream() + .outputMode("complete") + .format("iceberg") + .option("checkpointLocation", checkpoint.toString()) + .option("path", location.toString()); try { // start the original query with checkpointing @@ -187,10 +185,9 @@ public void testStreamingWriteCompleteMode() throws Exception { restartedQuery.processAllAvailable(); // ensure the write was idempotent - Dataset result = spark.read() - .format("iceberg") - .load(location.toString()); - List actual = result.orderBy("data").as(Encoders.bean(SimpleRecord.class)).collectAsList(); + Dataset result = spark.read().format("iceberg").load(location.toString()); + List actual = + result.orderBy("data").as(Encoders.bean(SimpleRecord.class)).collectAsList(); Assert.assertEquals("Number of rows should match", expected.size(), actual.size()); Assert.assertEquals("Result rows should match", expected, actual); Assert.assertEquals("Number of snapshots should match", 2, Iterables.size(table.snapshots())); @@ -211,22 +208,22 @@ public 
void testStreamingWriteCompleteModeWithProjection() throws Exception { PartitionSpec spec = PartitionSpec.unpartitioned(); Table table = tables.create(SCHEMA, spec, location.toString()); - List expected = Lists.newArrayList( - new SimpleRecord(1, null), - new SimpleRecord(2, null), - new SimpleRecord(3, null) - ); + List expected = + Lists.newArrayList( + new SimpleRecord(1, null), new SimpleRecord(2, null), new SimpleRecord(3, null)); MemoryStream inputStream = newMemoryStream(1, spark.sqlContext(), Encoders.INT()); - DataStreamWriter streamWriter = inputStream.toDF() - .groupBy("value") - .count() - .selectExpr("CAST(count AS INT) AS id") // select only id column - .writeStream() - .outputMode("complete") - .format("iceberg") - .option("checkpointLocation", checkpoint.toString()) - .option("path", location.toString()); + DataStreamWriter streamWriter = + inputStream + .toDF() + .groupBy("value") + .count() + .selectExpr("CAST(count AS INT) AS id") // select only id column + .writeStream() + .outputMode("complete") + .format("iceberg") + .option("checkpointLocation", checkpoint.toString()) + .option("path", location.toString()); try { // start the original query with checkpointing @@ -248,10 +245,9 @@ public void testStreamingWriteCompleteModeWithProjection() throws Exception { restartedQuery.processAllAvailable(); // ensure the write was idempotent - Dataset result = spark.read() - .format("iceberg") - .load(location.toString()); - List actual = result.orderBy("id").as(Encoders.bean(SimpleRecord.class)).collectAsList(); + Dataset result = spark.read().format("iceberg").load(location.toString()); + List actual = + result.orderBy("id").as(Encoders.bean(SimpleRecord.class)).collectAsList(); Assert.assertEquals("Number of rows should match", expected.size(), actual.size()); Assert.assertEquals("Result rows should match", expected, actual); Assert.assertEquals("Number of snapshots should match", 2, Iterables.size(table.snapshots())); @@ -275,13 +271,15 @@ public void testStreamingWriteUpdateMode() throws Exception { tables.create(SCHEMA, spec, location.toString()); MemoryStream inputStream = newMemoryStream(1, spark.sqlContext(), Encoders.INT()); - DataStreamWriter streamWriter = inputStream.toDF() - .selectExpr("value AS id", "CAST (value AS STRING) AS data") - .writeStream() - .outputMode("update") - .format("iceberg") - .option("checkpointLocation", checkpoint.toString()) - .option("path", location.toString()); + DataStreamWriter streamWriter = + inputStream + .toDF() + .selectExpr("value AS id", "CAST (value AS STRING) AS data") + .writeStream() + .outputMode("update") + .format("iceberg") + .option("checkpointLocation", checkpoint.toString()) + .option("path", location.toString()); try { StreamingQuery query = streamWriter.start(); diff --git a/spark/v3.0/spark/src/test/java/org/apache/iceberg/spark/source/TestStructuredStreamingRead3.java b/spark/v3.0/spark/src/test/java/org/apache/iceberg/spark/source/TestStructuredStreamingRead3.java index e609412f8be0..23fdfb09cb83 100644 --- a/spark/v3.0/spark/src/test/java/org/apache/iceberg/spark/source/TestStructuredStreamingRead3.java +++ b/spark/v3.0/spark/src/test/java/org/apache/iceberg/spark/source/TestStructuredStreamingRead3.java @@ -16,9 +16,10 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.spark.source; +import static org.apache.iceberg.expressions.Expressions.ref; + import java.io.File; import java.util.Collections; import java.util.List; @@ -58,8 +59,6 @@ import org.junit.runner.RunWith; import org.junit.runners.Parameterized; -import static org.apache.iceberg.expressions.Expressions.ref; - @RunWith(Parameterized.class) public final class TestStructuredStreamingRead3 extends SparkCatalogTestBase { public TestStructuredStreamingRead3( @@ -70,59 +69,49 @@ public TestStructuredStreamingRead3( private Table table; /** - * test data to be used by multiple writes - * each write creates a snapshot and writes a list of records + * test data to be used by multiple writes each write creates a snapshot and writes a list of + * records */ - private static final List> TEST_DATA_MULTIPLE_SNAPSHOTS = Lists.newArrayList( - Lists.newArrayList( - new SimpleRecord(1, "one"), - new SimpleRecord(2, "two"), - new SimpleRecord(3, "three")), + private static final List> TEST_DATA_MULTIPLE_SNAPSHOTS = Lists.newArrayList( - new SimpleRecord(4, "four"), - new SimpleRecord(5, "five")), - Lists.newArrayList( - new SimpleRecord(6, "six"), - new SimpleRecord(7, "seven"))); + Lists.newArrayList( + new SimpleRecord(1, "one"), new SimpleRecord(2, "two"), new SimpleRecord(3, "three")), + Lists.newArrayList(new SimpleRecord(4, "four"), new SimpleRecord(5, "five")), + Lists.newArrayList(new SimpleRecord(6, "six"), new SimpleRecord(7, "seven"))); /** - * test data - to be used for multiple write batches - * each batch inturn will have multiple snapshots + * test data - to be used for multiple write batches each batch inturn will have multiple + * snapshots */ - private static final List>> TEST_DATA_MULTIPLE_WRITES_MULTIPLE_SNAPSHOTS = Lists.newArrayList( - Lists.newArrayList( - Lists.newArrayList( - new SimpleRecord(1, "one"), - new SimpleRecord(2, "two"), - new SimpleRecord(3, "three")), - Lists.newArrayList( - new SimpleRecord(4, "four"), - new SimpleRecord(5, "five"))), + private static final List>> TEST_DATA_MULTIPLE_WRITES_MULTIPLE_SNAPSHOTS = Lists.newArrayList( Lists.newArrayList( - new SimpleRecord(6, "six"), - new SimpleRecord(7, "seven")), + Lists.newArrayList( + new SimpleRecord(1, "one"), + new SimpleRecord(2, "two"), + new SimpleRecord(3, "three")), + Lists.newArrayList(new SimpleRecord(4, "four"), new SimpleRecord(5, "five"))), Lists.newArrayList( - new SimpleRecord(8, "eight"), - new SimpleRecord(9, "nine"))), - Lists.newArrayList( + Lists.newArrayList(new SimpleRecord(6, "six"), new SimpleRecord(7, "seven")), + Lists.newArrayList(new SimpleRecord(8, "eight"), new SimpleRecord(9, "nine"))), Lists.newArrayList( - new SimpleRecord(10, "ten"), - new SimpleRecord(11, "eleven"), - new SimpleRecord(12, "twelve")), - Lists.newArrayList( - new SimpleRecord(13, "thirteen"), - new SimpleRecord(14, "fourteen")), - Lists.newArrayList( - new SimpleRecord(15, "fifteen"), - new SimpleRecord(16, "sixteen")))); + Lists.newArrayList( + new SimpleRecord(10, "ten"), + new SimpleRecord(11, "eleven"), + new SimpleRecord(12, "twelve")), + Lists.newArrayList( + new SimpleRecord(13, "thirteen"), new SimpleRecord(14, "fourteen")), + Lists.newArrayList( + new SimpleRecord(15, "fifteen"), new SimpleRecord(16, "sixteen")))); @Before public void setupTable() { - sql("CREATE TABLE %s " + - "(id INT, data STRING) " + - "USING iceberg " + - "PARTITIONED BY (bucket(3, id))", tableName); + sql( + "CREATE TABLE %s " + + "(id INT, data STRING) " + + "USING iceberg " + + "PARTITIONED BY (bucket(3, 
id))", + tableName); this.table = validationCatalog.loadTable(tableIdent); } @@ -163,17 +152,19 @@ public void testReadStreamOnIcebergThenAddData() throws Exception { @Test public void testReadingStreamFromTimestamp() throws Exception { - List dataBeforeTimestamp = Lists.newArrayList( - new SimpleRecord(-2, "minustwo"), - new SimpleRecord(-1, "minusone"), - new SimpleRecord(0, "zero")); + List dataBeforeTimestamp = + Lists.newArrayList( + new SimpleRecord(-2, "minustwo"), + new SimpleRecord(-1, "minusone"), + new SimpleRecord(0, "zero")); appendData(dataBeforeTimestamp); table.refresh(); long streamStartTimestamp = table.currentSnapshot().timestampMillis() + 1; - StreamingQuery query = startStream(SparkReadOptions.STREAM_FROM_TIMESTAMP, Long.toString(streamStartTimestamp)); + StreamingQuery query = + startStream(SparkReadOptions.STREAM_FROM_TIMESTAMP, Long.toString(streamStartTimestamp)); List empty = rowsAvailable(query); Assertions.assertThat(empty.isEmpty()).isTrue(); @@ -190,21 +181,25 @@ public void testReadingStreamFromTimestamp() throws Exception { public void testReadingStreamFromFutureTimetsamp() throws Exception { long futureTimestamp = System.currentTimeMillis() + 10000; - StreamingQuery query = startStream(SparkReadOptions.STREAM_FROM_TIMESTAMP, Long.toString(futureTimestamp)); + StreamingQuery query = + startStream(SparkReadOptions.STREAM_FROM_TIMESTAMP, Long.toString(futureTimestamp)); List actual = rowsAvailable(query); Assertions.assertThat(actual.isEmpty()).isTrue(); - List data = Lists.newArrayList( - new SimpleRecord(-2, "minustwo"), - new SimpleRecord(-1, "minusone"), - new SimpleRecord(0, "zero")); + List data = + Lists.newArrayList( + new SimpleRecord(-2, "minustwo"), + new SimpleRecord(-1, "minusone"), + new SimpleRecord(0, "zero")); // Perform several inserts that should not show up because the fromTimestamp has not elapsed - IntStream.range(0, 3).forEach(x -> { - appendData(data); - Assertions.assertThat(rowsAvailable(query).isEmpty()).isTrue(); - }); + IntStream.range(0, 3) + .forEach( + x -> { + appendData(data); + Assertions.assertThat(rowsAvailable(query).isEmpty()).isTrue(); + }); waitUntilAfter(futureTimestamp); @@ -216,16 +211,16 @@ public void testReadingStreamFromFutureTimetsamp() throws Exception { @Test public void testReadingStreamFromTimestampFutureWithExistingSnapshots() throws Exception { - List dataBeforeTimestamp = Lists.newArrayList( - new SimpleRecord(1, "one"), - new SimpleRecord(2, "two"), - new SimpleRecord(3, "three")); + List dataBeforeTimestamp = + Lists.newArrayList( + new SimpleRecord(1, "one"), new SimpleRecord(2, "two"), new SimpleRecord(3, "three")); appendData(dataBeforeTimestamp); long streamStartTimestamp = System.currentTimeMillis() + 2000; // Start the stream with a future timestamp after the current snapshot - StreamingQuery query = startStream(SparkReadOptions.STREAM_FROM_TIMESTAMP, Long.toString(streamStartTimestamp)); + StreamingQuery query = + startStream(SparkReadOptions.STREAM_FROM_TIMESTAMP, Long.toString(streamStartTimestamp)); List actual = rowsAvailable(query); Assert.assertEquals(Collections.emptyList(), actual); @@ -233,7 +228,8 @@ public void testReadingStreamFromTimestampFutureWithExistingSnapshots() throws E waitUntilAfter(streamStartTimestamp); List> expected = TEST_DATA_MULTIPLE_SNAPSHOTS; appendDataAsMultipleSnapshots(expected); - Assertions.assertThat(rowsAvailable(query)).containsExactlyInAnyOrderElementsOf(Iterables.concat(expected)); + Assertions.assertThat(rowsAvailable(query)) + 
.containsExactlyInAnyOrderElementsOf(Iterables.concat(expected)); } @Test @@ -246,7 +242,8 @@ public void testReadingStreamFromTimestampOfExistingSnapshot() throws Exception long firstSnapshotTime = table.currentSnapshot().timestampMillis(); // Start stream giving the first Snapshot's time as the start point - StreamingQuery stream = startStream(SparkReadOptions.STREAM_FROM_TIMESTAMP, Long.toString(firstSnapshotTime)); + StreamingQuery stream = + startStream(SparkReadOptions.STREAM_FROM_TIMESTAMP, Long.toString(firstSnapshotTime)); // Append rest of expected data for (int i = 1; i < expected.size(); i++) { @@ -259,14 +256,11 @@ public void testReadingStreamFromTimestampOfExistingSnapshot() throws Exception @Test public void testReadingStreamWithExpiredSnapshotFromTimestamp() throws TimeoutException { - List firstSnapshotRecordList = Lists.newArrayList( - new SimpleRecord(1, "one")); + List firstSnapshotRecordList = Lists.newArrayList(new SimpleRecord(1, "one")); - List secondSnapshotRecordList = Lists.newArrayList( - new SimpleRecord(2, "two")); + List secondSnapshotRecordList = Lists.newArrayList(new SimpleRecord(2, "two")); - List thirdSnapshotRecordList = Lists.newArrayList( - new SimpleRecord(3, "three")); + List thirdSnapshotRecordList = Lists.newArrayList(new SimpleRecord(3, "three")); List expectedRecordList = Lists.newArrayList(); expectedRecordList.addAll(secondSnapshotRecordList); @@ -277,13 +271,14 @@ public void testReadingStreamWithExpiredSnapshotFromTimestamp() throws TimeoutEx long firstSnapshotid = table.currentSnapshot().snapshotId(); long firstSnapshotCommitTime = table.currentSnapshot().timestampMillis(); - appendData(secondSnapshotRecordList); appendData(thirdSnapshotRecordList); table.expireSnapshots().expireSnapshotId(firstSnapshotid).commit(); - StreamingQuery query = startStream(SparkReadOptions.STREAM_FROM_TIMESTAMP, String.valueOf(firstSnapshotCommitTime)); + StreamingQuery query = + startStream( + SparkReadOptions.STREAM_FROM_TIMESTAMP, String.valueOf(firstSnapshotCommitTime)); List actual = rowsAvailable(query); Assertions.assertThat(actual).containsExactlyInAnyOrderElementsOf(expectedRecordList); } @@ -294,21 +289,24 @@ public void testResumingStreamReadFromCheckpoint() throws Exception { File writerCheckpoint = new File(writerCheckpointFolder, "writer-checkpoint"); File output = temp.newFolder(); - DataStreamWriter querySource = spark.readStream() - .format("iceberg") - .load(tableName) - .writeStream() - .option("checkpointLocation", writerCheckpoint.toString()) - .format("parquet") - .queryName("checkpoint_test") - .option("path", output.getPath()); + DataStreamWriter querySource = + spark + .readStream() + .format("iceberg") + .load(tableName) + .writeStream() + .option("checkpointLocation", writerCheckpoint.toString()) + .format("parquet") + .queryName("checkpoint_test") + .option("path", output.getPath()); StreamingQuery startQuery = querySource.start(); startQuery.processAllAvailable(); startQuery.stop(); List expected = Lists.newArrayList(); - for (List> expectedCheckpoint : TEST_DATA_MULTIPLE_WRITES_MULTIPLE_SNAPSHOTS) { + for (List> expectedCheckpoint : + TEST_DATA_MULTIPLE_WRITES_MULTIPLE_SNAPSHOTS) { // New data was added while the stream was down appendDataAsMultipleSnapshots(expectedCheckpoint); expected.addAll(Lists.newArrayList(Iterables.concat(Iterables.concat(expectedCheckpoint)))); @@ -319,28 +317,23 @@ public void testResumingStreamReadFromCheckpoint() throws Exception { restartedQuery.stop(); // Read data added by the stream - List actual = 
spark.read() - .load(output.getPath()) - .as(Encoders.bean(SimpleRecord.class)) - .collectAsList(); + List actual = + spark.read().load(output.getPath()).as(Encoders.bean(SimpleRecord.class)).collectAsList(); Assertions.assertThat(actual).containsExactlyInAnyOrderElementsOf(Iterables.concat(expected)); } } @Test public void testParquetOrcAvroDataInOneTable() throws Exception { - List parquetFileRecords = Lists.newArrayList( - new SimpleRecord(1, "one"), - new SimpleRecord(2, "two"), - new SimpleRecord(3, "three")); + List parquetFileRecords = + Lists.newArrayList( + new SimpleRecord(1, "one"), new SimpleRecord(2, "two"), new SimpleRecord(3, "three")); - List orcFileRecords = Lists.newArrayList( - new SimpleRecord(4, "four"), - new SimpleRecord(5, "five")); + List orcFileRecords = + Lists.newArrayList(new SimpleRecord(4, "four"), new SimpleRecord(5, "five")); - List avroFileRecords = Lists.newArrayList( - new SimpleRecord(6, "six"), - new SimpleRecord(7, "seven")); + List avroFileRecords = + Lists.newArrayList(new SimpleRecord(6, "six"), new SimpleRecord(7, "seven")); appendData(parquetFileRecords); appendData(orcFileRecords, "orc"); @@ -348,7 +341,8 @@ public void testParquetOrcAvroDataInOneTable() throws Exception { StreamingQuery query = startStream(); Assertions.assertThat(rowsAvailable(query)) - .containsExactlyInAnyOrderElementsOf(Iterables.concat(parquetFileRecords, orcFileRecords, avroFileRecords)); + .containsExactlyInAnyOrderElementsOf( + Iterables.concat(parquetFileRecords, orcFileRecords, avroFileRecords)); } @Test @@ -371,18 +365,23 @@ public void testReadStreamWithSnapshotTypeOverwriteErrorsOut() throws Exception Schema deleteRowSchema = table.schema().select("data"); Record dataDelete = GenericRecord.create(deleteRowSchema); - List dataDeletes = Lists.newArrayList( - dataDelete.copy("data", "one") // id = 1 - ); - - DeleteFile eqDeletes = FileHelpers.writeDeleteFile( - table, Files.localOutput(temp.newFile()), TestHelpers.Row.of(0), dataDeletes, deleteRowSchema); - - table.newRowDelta() - .addDeletes(eqDeletes) - .commit(); - - // check pre-condition - that the above Delete file write - actually resulted in snapshot of type OVERWRITE + List dataDeletes = + Lists.newArrayList( + dataDelete.copy("data", "one") // id = 1 + ); + + DeleteFile eqDeletes = + FileHelpers.writeDeleteFile( + table, + Files.localOutput(temp.newFile()), + TestHelpers.Row.of(0), + dataDeletes, + deleteRowSchema); + + table.newRowDelta().addDeletes(eqDeletes).commit(); + + // check pre-condition - that the above Delete file write - actually resulted in snapshot of + // type OVERWRITE Assert.assertEquals(DataOperations.OVERWRITE, table.currentSnapshot().operation()); StreamingQuery query = startStream(); @@ -391,8 +390,7 @@ public void testReadStreamWithSnapshotTypeOverwriteErrorsOut() throws Exception "Streaming should fail with IllegalStateException, as the snapshot is not of type APPEND", IllegalStateException.class, "Cannot process overwrite snapshot", - () -> query.processAllAvailable() - ); + () -> query.processAllAvailable()); } @Test @@ -402,9 +400,7 @@ public void testReadStreamWithSnapshotTypeReplaceIgnoresReplace() throws Excepti appendDataAsMultipleSnapshots(expected); // this should create a snapshot with type Replace. 
- table.rewriteManifests() - .clusterBy(f -> 1) - .commit(); + table.rewriteManifests().clusterBy(f -> 1).commit(); // check pre-condition Assert.assertEquals(DataOperations.REPLACE, table.currentSnapshot().operation()); @@ -416,21 +412,17 @@ public void testReadStreamWithSnapshotTypeReplaceIgnoresReplace() throws Excepti @Test public void testReadStreamWithSnapshotTypeDeleteErrorsOut() throws Exception { - table.updateSpec() - .removeField("id_bucket") - .addField(ref("id")) - .commit(); + table.updateSpec().removeField("id_bucket").addField(ref("id")).commit(); // fill table with some data List> dataAcrossSnapshots = TEST_DATA_MULTIPLE_SNAPSHOTS; appendDataAsMultipleSnapshots(dataAcrossSnapshots); // this should create a snapshot with type delete. - table.newDelete() - .deleteFromRowFilter(Expressions.equal("id", 4)) - .commit(); + table.newDelete().deleteFromRowFilter(Expressions.equal("id", 4)).commit(); - // check pre-condition - that the above delete operation on table resulted in Snapshot of Type DELETE. + // check pre-condition - that the above delete operation on table resulted in Snapshot of Type + // DELETE. Assert.assertEquals(DataOperations.DELETE, table.currentSnapshot().operation()); StreamingQuery query = startStream(); @@ -439,27 +431,22 @@ public void testReadStreamWithSnapshotTypeDeleteErrorsOut() throws Exception { "Streaming should fail with IllegalStateException, as the snapshot is not of type APPEND", IllegalStateException.class, "Cannot process delete snapshot", - () -> query.processAllAvailable() - ); + () -> query.processAllAvailable()); } @Test public void testReadStreamWithSnapshotTypeDeleteAndSkipDeleteOption() throws Exception { - table.updateSpec() - .removeField("id_bucket") - .addField(ref("id")) - .commit(); + table.updateSpec().removeField("id_bucket").addField(ref("id")).commit(); // fill table with some data List> dataAcrossSnapshots = TEST_DATA_MULTIPLE_SNAPSHOTS; appendDataAsMultipleSnapshots(dataAcrossSnapshots); // this should create a snapshot with type delete. - table.newDelete() - .deleteFromRowFilter(Expressions.equal("id", 4)) - .commit(); + table.newDelete().deleteFromRowFilter(Expressions.equal("id", 4)).commit(); - // check pre-condition - that the above delete operation on table resulted in Snapshot of Type DELETE. + // check pre-condition - that the above delete operation on table resulted in Snapshot of Type + // DELETE. Assert.assertEquals(DataOperations.DELETE, table.currentSnapshot().operation()); StreamingQuery query = startStream(SparkReadOptions.STREAMING_SKIP_DELETE_SNAPSHOTS, "true"); @@ -469,21 +456,17 @@ public void testReadStreamWithSnapshotTypeDeleteAndSkipDeleteOption() throws Exc @Test public void testReadStreamWithSnapshotTypeDeleteAndSkipOverwriteOption() throws Exception { - table.updateSpec() - .removeField("id_bucket") - .addField(ref("id")) - .commit(); + table.updateSpec().removeField("id_bucket").addField(ref("id")).commit(); // fill table with some data List> dataAcrossSnapshots = TEST_DATA_MULTIPLE_SNAPSHOTS; appendDataAsMultipleSnapshots(dataAcrossSnapshots); // this should create a snapshot with type overwrite. - table.newOverwrite() - .overwriteByRowFilter(Expressions.greaterThan("id", 4)) - .commit(); + table.newOverwrite().overwriteByRowFilter(Expressions.greaterThan("id", 4)).commit(); - // check pre-condition - that the above delete operation on table resulted in Snapshot of Type OVERWRITE. + // check pre-condition - that the above delete operation on table resulted in Snapshot of Type + // OVERWRITE. 
Assert.assertEquals(DataOperations.OVERWRITE, table.currentSnapshot().operation()); StreamingQuery query = startStream(SparkReadOptions.STREAMING_SKIP_OVERWRITE_SNAPSHOTS, "true"); @@ -492,8 +475,8 @@ public void testReadStreamWithSnapshotTypeDeleteAndSkipOverwriteOption() throws } /** - * appends each list as a Snapshot on the iceberg table at the given location. - * accepts a list of lists - each list representing data per snapshot. + * appends each list as a Snapshot on the iceberg table at the given location. accepts a list of + * lists - each list representing data per snapshot. */ private void appendDataAsMultipleSnapshots(List> data) { for (List l : data) { @@ -507,7 +490,8 @@ private void appendData(List data) { private void appendData(List data, String format) { Dataset df = spark.createDataFrame(data, SimpleRecord.class); - df.select("id", "data").write() + df.select("id", "data") + .write() .format("iceberg") .option("write-format", format) .mode("append") @@ -517,7 +501,8 @@ private void appendData(List data, String format) { private static final String MEMORY_TABLE = "_stream_view_mem"; private StreamingQuery startStream(Map options) throws TimeoutException { - return spark.readStream() + return spark + .readStream() .options(options) .format("iceberg") .load(tableName) @@ -539,9 +524,9 @@ private StreamingQuery startStream(String key, String value) throws TimeoutExcep private List rowsAvailable(StreamingQuery query) { query.processAllAvailable(); - return spark.sql("select * from " + MEMORY_TABLE) + return spark + .sql("select * from " + MEMORY_TABLE) .as(Encoders.bean(SimpleRecord.class)) .collectAsList(); } - } diff --git a/spark/v3.0/spark/src/test/java/org/apache/iceberg/spark/source/TestTables.java b/spark/v3.0/spark/src/test/java/org/apache/iceberg/spark/source/TestTables.java index e61d6ffb9e5e..ef2f73c3803c 100644 --- a/spark/v3.0/spark/src/test/java/org/apache/iceberg/spark/source/TestTables.java +++ b/spark/v3.0/spark/src/test/java/org/apache/iceberg/spark/source/TestTables.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.source; import java.io.File; @@ -42,15 +41,15 @@ // TODO: Use the copy of this from core. 
class TestTables { - private TestTables() { - } + private TestTables() {} static TestTable create(File temp, String name, Schema schema, PartitionSpec spec) { TestTableOperations ops = new TestTableOperations(name); if (ops.current() != null) { throw new AlreadyExistsException("Table %s already exists at location: %s", name, temp); } - ops.commit(null, TableMetadata.newTableMetadata(schema, spec, temp.toString(), ImmutableMap.of())); + ops.commit( + null, TableMetadata.newTableMetadata(schema, spec, temp.toString(), ImmutableMap.of())); return new TestTable(ops, name); } @@ -166,8 +165,8 @@ public FileIO io() { @Override public LocationProvider locationProvider() { - Preconditions.checkNotNull(current, - "Current metadata should not be null when locatinProvider is called"); + Preconditions.checkNotNull( + current, "Current metadata should not be null when locatinProvider is called"); return LocationProviders.locationsFor(current.location(), current.properties()); } diff --git a/spark/v3.0/spark/src/test/java/org/apache/iceberg/spark/source/TestTimestampWithoutZone.java b/spark/v3.0/spark/src/test/java/org/apache/iceberg/spark/source/TestTimestampWithoutZone.java index 20509eef7471..f6cac9e9dd82 100644 --- a/spark/v3.0/spark/src/test/java/org/apache/iceberg/spark/source/TestTimestampWithoutZone.java +++ b/spark/v3.0/spark/src/test/java/org/apache/iceberg/spark/source/TestTimestampWithoutZone.java @@ -16,9 +16,10 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.source; +import static org.apache.iceberg.Files.localOutput; + import java.io.File; import java.io.IOException; import java.time.LocalDateTime; @@ -64,18 +65,16 @@ import org.junit.runner.RunWith; import org.junit.runners.Parameterized; -import static org.apache.iceberg.Files.localOutput; - @RunWith(Parameterized.class) public class TestTimestampWithoutZone extends SparkTestBase { private static final Configuration CONF = new Configuration(); private static final HadoopTables TABLES = new HadoopTables(CONF); - private static final Schema SCHEMA = new Schema( - Types.NestedField.required(1, "id", Types.LongType.get()), - Types.NestedField.optional(2, "ts", Types.TimestampType.withoutZone()), - Types.NestedField.optional(3, "data", Types.StringType.get()) - ); + private static final Schema SCHEMA = + new Schema( + Types.NestedField.required(1, "id", Types.LongType.get()), + Types.NestedField.optional(2, "ts", Types.TimestampType.withoutZone()), + Types.NestedField.optional(3, "data", Types.StringType.get())); private static SparkSession spark = null; @@ -91,8 +90,7 @@ public static void stopSpark() { currentSpark.stop(); } - @Rule - public TemporaryFolder temp = new TemporaryFolder(); + @Rule public TemporaryFolder temp = new TemporaryFolder(); private final String format; private final boolean vectorized; @@ -100,9 +98,9 @@ public static void stopSpark() { @Parameterized.Parameters(name = "format = {0}, vectorized = {1}") public static Object[][] parameters() { return new Object[][] { - { "parquet", false }, - { "parquet", true }, - { "avro", false } + {"parquet", false}, + {"parquet", true}, + {"avro", false} }; } @@ -132,16 +130,17 @@ public void writeUnpartitionedTable() throws IOException { // create records using the table's schema this.records = testRecords(tableSchema); - try (FileAppender writer = new GenericAppenderFactory(tableSchema).newAppender( - localOutput(testFile), fileFormat)) { + try (FileAppender writer = + new 
GenericAppenderFactory(tableSchema).newAppender(localOutput(testFile), fileFormat)) { writer.addAll(records); } - DataFile file = DataFiles.builder(PartitionSpec.unpartitioned()) - .withRecordCount(records.size()) - .withFileSizeInBytes(testFile.length()) - .withPath(testFile.toString()) - .build(); + DataFile file = + DataFiles.builder(PartitionSpec.unpartitioned()) + .withRecordCount(records.size()) + .withFileSizeInBytes(testFile.length()) + .withPath(testFile.toString()) + .build(); table.newAppend().appendFile(file).commit(); } @@ -154,69 +153,89 @@ public void testUnpartitionedTimestampWithoutZone() { @Test public void testUnpartitionedTimestampWithoutZoneProjection() { Schema projection = SCHEMA.select("id", "ts"); - assertEqualsSafe(projection.asStruct(), + assertEqualsSafe( + projection.asStruct(), records.stream().map(r -> projectFlat(projection, r)).collect(Collectors.toList()), read(unpartitioned.toString(), vectorized, "id", "ts")); } - @Rule - public ExpectedException exception = ExpectedException.none(); + @Rule public ExpectedException exception = ExpectedException.none(); @Test public void testUnpartitionedTimestampWithoutZoneError() { - AssertHelpers.assertThrows(String.format("Read operation performed on a timestamp without timezone field while " + - "'%s' set to false should throw exception", - SparkReadOptions.HANDLE_TIMESTAMP_WITHOUT_TIMEZONE), + AssertHelpers.assertThrows( + String.format( + "Read operation performed on a timestamp without timezone field while " + + "'%s' set to false should throw exception", + SparkReadOptions.HANDLE_TIMESTAMP_WITHOUT_TIMEZONE), IllegalArgumentException.class, SparkUtil.TIMESTAMP_WITHOUT_TIMEZONE_ERROR, - () -> spark.read().format("iceberg") - .option(SparkReadOptions.VECTORIZATION_ENABLED, String.valueOf(vectorized)) - .option(SparkReadOptions.HANDLE_TIMESTAMP_WITHOUT_TIMEZONE, "false") - .load(unpartitioned.toString()) - .collectAsList()); + () -> + spark + .read() + .format("iceberg") + .option(SparkReadOptions.VECTORIZATION_ENABLED, String.valueOf(vectorized)) + .option(SparkReadOptions.HANDLE_TIMESTAMP_WITHOUT_TIMEZONE, "false") + .load(unpartitioned.toString()) + .collectAsList()); } @Test public void testUnpartitionedTimestampWithoutZoneAppend() { - spark.read().format("iceberg") - .option(SparkReadOptions.HANDLE_TIMESTAMP_WITHOUT_TIMEZONE, "true") - .option(SparkReadOptions.VECTORIZATION_ENABLED, String.valueOf(vectorized)) - .load(unpartitioned.toString()) - .write() - .format("iceberg") - .option(SparkWriteOptions.HANDLE_TIMESTAMP_WITHOUT_TIMEZONE, "true") - .mode(SaveMode.Append) - .save(unpartitioned.toString()); - - assertEqualsSafe(SCHEMA.asStruct(), - Stream.concat(records.stream(), records.stream()).collect(Collectors.toList()), - read(unpartitioned.toString(), vectorized)); + spark + .read() + .format("iceberg") + .option(SparkReadOptions.HANDLE_TIMESTAMP_WITHOUT_TIMEZONE, "true") + .option(SparkReadOptions.VECTORIZATION_ENABLED, String.valueOf(vectorized)) + .load(unpartitioned.toString()) + .write() + .format("iceberg") + .option(SparkWriteOptions.HANDLE_TIMESTAMP_WITHOUT_TIMEZONE, "true") + .mode(SaveMode.Append) + .save(unpartitioned.toString()); + + assertEqualsSafe( + SCHEMA.asStruct(), + Stream.concat(records.stream(), records.stream()).collect(Collectors.toList()), + read(unpartitioned.toString(), vectorized)); } @Test public void testUnpartitionedTimestampWithoutZoneWriteError() { - String errorMessage = String.format("Write operation performed on a timestamp without timezone field while " + - "'%s' set to 
false should throw exception", + String errorMessage = + String.format( + "Write operation performed on a timestamp without timezone field while " + + "'%s' set to false should throw exception", SparkSQLProperties.HANDLE_TIMESTAMP_WITHOUT_TIMEZONE); - Runnable writeOperation = () -> spark.read().format("iceberg") - .option(SparkReadOptions.HANDLE_TIMESTAMP_WITHOUT_TIMEZONE, "true") - .option(SparkReadOptions.VECTORIZATION_ENABLED, String.valueOf(vectorized)) - .load(unpartitioned.toString()) - .write() - .format("iceberg") - .option(SparkWriteOptions.HANDLE_TIMESTAMP_WITHOUT_TIMEZONE, "false") - .mode(SaveMode.Append) - .save(unpartitioned.toString()); - - AssertHelpers.assertThrows(errorMessage, IllegalArgumentException.class, - SparkUtil.TIMESTAMP_WITHOUT_TIMEZONE_ERROR, writeOperation); - + Runnable writeOperation = + () -> + spark + .read() + .format("iceberg") + .option(SparkReadOptions.HANDLE_TIMESTAMP_WITHOUT_TIMEZONE, "true") + .option(SparkReadOptions.VECTORIZATION_ENABLED, String.valueOf(vectorized)) + .load(unpartitioned.toString()) + .write() + .format("iceberg") + .option(SparkWriteOptions.HANDLE_TIMESTAMP_WITHOUT_TIMEZONE, "false") + .mode(SaveMode.Append) + .save(unpartitioned.toString()); + + AssertHelpers.assertThrows( + errorMessage, + IllegalArgumentException.class, + SparkUtil.TIMESTAMP_WITHOUT_TIMEZONE_ERROR, + writeOperation); } @Test public void testUnpartitionedTimestampWithoutZoneSessionProperties() { - withSQLConf(ImmutableMap.of(SparkSQLProperties.HANDLE_TIMESTAMP_WITHOUT_TIMEZONE, "true"), () -> { - spark.read().format("iceberg") + withSQLConf( + ImmutableMap.of(SparkSQLProperties.HANDLE_TIMESTAMP_WITHOUT_TIMEZONE, "true"), + () -> { + spark + .read() + .format("iceberg") .option(SparkReadOptions.VECTORIZATION_ENABLED, String.valueOf(vectorized)) .load(unpartitioned.toString()) .write() @@ -224,10 +243,11 @@ public void testUnpartitionedTimestampWithoutZoneSessionProperties() { .mode(SaveMode.Append) .save(unpartitioned.toString()); - assertEqualsSafe(SCHEMA.asStruct(), + assertEqualsSafe( + SCHEMA.asStruct(), Stream.concat(records.stream(), records.stream()).collect(Collectors.toList()), read(unpartitioned.toString(), vectorized)); - }); + }); } private static Record projectFlat(Schema projection, Record record) { @@ -240,8 +260,8 @@ private static Record projectFlat(Schema projection, Record record) { return result; } - public static void assertEqualsSafe(Types.StructType struct, - List expected, List actual) { + public static void assertEqualsSafe( + Types.StructType struct, List expected, List actual) { Assert.assertEquals("Number of results should match expected", expected.size(), actual.size()); for (int i = 0; i < expected.size(); i += 1) { GenericsHelpers.assertEqualsSafe(struct, expected.get(i), actual.get(i)); @@ -259,20 +279,23 @@ private List testRecords(Schema schema) { record(schema, 6L, parseToLocal("2017-12-21T21:55:30.589712"), "element"), record(schema, 7L, parseToLocal("2017-12-21T17:31:14.532797"), "limited"), record(schema, 8L, parseToLocal("2017-12-21T15:21:51.237521"), "global"), - record(schema, 9L, parseToLocal("2017-12-21T15:02:15.230570"), "goldfish") - ); + record(schema, 9L, parseToLocal("2017-12-21T15:02:15.230570"), "goldfish")); } private static List read(String table, boolean vectorized) { return read(table, vectorized, "*"); } - private static List read(String table, boolean vectorized, String select0, String... 
selectN) { - Dataset dataset = spark.read().format("iceberg") - .option(SparkReadOptions.VECTORIZATION_ENABLED, String.valueOf(vectorized)) - .option(SparkReadOptions.HANDLE_TIMESTAMP_WITHOUT_TIMEZONE, "true") - .load(table) - .select(select0, selectN); + private static List read( + String table, boolean vectorized, String select0, String... selectN) { + Dataset dataset = + spark + .read() + .format("iceberg") + .option(SparkReadOptions.VECTORIZATION_ENABLED, String.valueOf(vectorized)) + .option(SparkReadOptions.HANDLE_TIMESTAMP_WITHOUT_TIMEZONE, "true") + .load(table) + .select(select0, selectN); return dataset.collectAsList(); } diff --git a/spark/v3.0/spark/src/test/java/org/apache/iceberg/spark/source/TestWriteMetricsConfig.java b/spark/v3.0/spark/src/test/java/org/apache/iceberg/spark/source/TestWriteMetricsConfig.java index 7ed71031f3f2..9bf00f1b1365 100644 --- a/spark/v3.0/spark/src/test/java/org/apache/iceberg/spark/source/TestWriteMetricsConfig.java +++ b/spark/v3.0/spark/src/test/java/org/apache/iceberg/spark/source/TestWriteMetricsConfig.java @@ -16,9 +16,12 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.source; +import static org.apache.iceberg.spark.SparkSchemaUtil.convert; +import static org.apache.iceberg.types.Types.NestedField.optional; +import static org.apache.iceberg.types.Types.NestedField.required; + import java.io.IOException; import java.nio.ByteBuffer; import java.util.List; @@ -53,28 +56,24 @@ import org.junit.Test; import org.junit.rules.TemporaryFolder; -import static org.apache.iceberg.spark.SparkSchemaUtil.convert; -import static org.apache.iceberg.types.Types.NestedField.optional; -import static org.apache.iceberg.types.Types.NestedField.required; - public class TestWriteMetricsConfig { private static final Configuration CONF = new Configuration(); - private static final Schema SIMPLE_SCHEMA = new Schema( - optional(1, "id", Types.IntegerType.get()), - optional(2, "data", Types.StringType.get()) - ); - private static final Schema COMPLEX_SCHEMA = new Schema( - required(1, "longCol", Types.IntegerType.get()), - optional(2, "strCol", Types.StringType.get()), - required(3, "record", Types.StructType.of( - required(4, "id", Types.IntegerType.get()), - required(5, "data", Types.StringType.get()) - )) - ); - - @Rule - public TemporaryFolder temp = new TemporaryFolder(); + private static final Schema SIMPLE_SCHEMA = + new Schema( + optional(1, "id", Types.IntegerType.get()), optional(2, "data", Types.StringType.get())); + private static final Schema COMPLEX_SCHEMA = + new Schema( + required(1, "longCol", Types.IntegerType.get()), + optional(2, "strCol", Types.StringType.get()), + required( + 3, + "record", + Types.StructType.of( + required(4, "id", Types.IntegerType.get()), + required(5, "data", Types.StringType.get())))); + + @Rule public TemporaryFolder temp = new TemporaryFolder(); private static SparkSession spark = null; private static JavaSparkContext sc = null; @@ -103,11 +102,9 @@ public void testFullMetricsCollectionForParquet() throws IOException { properties.put(TableProperties.DEFAULT_WRITE_METRICS_MODE, "full"); Table table = tables.create(SIMPLE_SCHEMA, spec, properties, tableLocation); - List expectedRecords = Lists.newArrayList( - new SimpleRecord(1, "a"), - new SimpleRecord(2, "b"), - new SimpleRecord(3, "c") - ); + List expectedRecords = + Lists.newArrayList( + new SimpleRecord(1, "a"), new SimpleRecord(2, "b"), new SimpleRecord(3, "c")); Dataset df = 
spark.createDataFrame(expectedRecords, SimpleRecord.class); df.select("id", "data") .coalesce(1) @@ -136,11 +133,9 @@ public void testCountMetricsCollectionForParquet() throws IOException { properties.put(TableProperties.DEFAULT_WRITE_METRICS_MODE, "counts"); Table table = tables.create(SIMPLE_SCHEMA, spec, properties, tableLocation); - List expectedRecords = Lists.newArrayList( - new SimpleRecord(1, "a"), - new SimpleRecord(2, "b"), - new SimpleRecord(3, "c") - ); + List expectedRecords = + Lists.newArrayList( + new SimpleRecord(1, "a"), new SimpleRecord(2, "b"), new SimpleRecord(3, "c")); Dataset df = spark.createDataFrame(expectedRecords, SimpleRecord.class); df.select("id", "data") .coalesce(1) @@ -169,11 +164,9 @@ public void testNoMetricsCollectionForParquet() throws IOException { properties.put(TableProperties.DEFAULT_WRITE_METRICS_MODE, "none"); Table table = tables.create(SIMPLE_SCHEMA, spec, properties, tableLocation); - List expectedRecords = Lists.newArrayList( - new SimpleRecord(1, "a"), - new SimpleRecord(2, "b"), - new SimpleRecord(3, "c") - ); + List expectedRecords = + Lists.newArrayList( + new SimpleRecord(1, "a"), new SimpleRecord(2, "b"), new SimpleRecord(3, "c")); Dataset df = spark.createDataFrame(expectedRecords, SimpleRecord.class); df.select("id", "data") .coalesce(1) @@ -203,11 +196,9 @@ public void testCustomMetricCollectionForParquet() throws IOException { properties.put("write.metadata.metrics.column.id", "full"); Table table = tables.create(SIMPLE_SCHEMA, spec, properties, tableLocation); - List expectedRecords = Lists.newArrayList( - new SimpleRecord(1, "a"), - new SimpleRecord(2, "b"), - new SimpleRecord(3, "c") - ); + List expectedRecords = + Lists.newArrayList( + new SimpleRecord(1, "a"), new SimpleRecord(2, "b"), new SimpleRecord(3, "c")); Dataset df = spark.createDataFrame(expectedRecords, SimpleRecord.class); df.select("id", "data") .coalesce(1) @@ -240,7 +231,8 @@ public void testBadCustomMetricCollectionForParquet() throws IOException { properties.put(TableProperties.DEFAULT_WRITE_METRICS_MODE, "counts"); properties.put("write.metadata.metrics.column.ids", "full"); - AssertHelpers.assertThrows("Creating a table with invalid metrics should fail", + AssertHelpers.assertThrows( + "Creating a table with invalid metrics should fail", ValidationException.class, null, () -> tables.create(SIMPLE_SCHEMA, spec, properties, tableLocation)); @@ -251,9 +243,7 @@ public void testCustomMetricCollectionForNestedParquet() throws IOException { String tableLocation = temp.newFolder("iceberg-table").toString(); HadoopTables tables = new HadoopTables(CONF); - PartitionSpec spec = PartitionSpec.builderFor(COMPLEX_SCHEMA) - .identity("strCol") - .build(); + PartitionSpec spec = PartitionSpec.builderFor(COMPLEX_SCHEMA).identity("strCol").build(); Map properties = Maps.newHashMap(); properties.put(TableProperties.DEFAULT_WRITE_METRICS_MODE, "none"); properties.put("write.metadata.metrics.column.longCol", "counts"); @@ -263,9 +253,11 @@ public void testCustomMetricCollectionForNestedParquet() throws IOException { Iterable rows = RandomData.generateSpark(COMPLEX_SCHEMA, 10, 0); JavaRDD rdd = sc.parallelize(Lists.newArrayList(rows)); - Dataset df = spark.internalCreateDataFrame(JavaRDD.toRDD(rdd), convert(COMPLEX_SCHEMA), false); + Dataset df = + spark.internalCreateDataFrame(JavaRDD.toRDD(rdd), convert(COMPLEX_SCHEMA), false); - df.coalesce(1).write() + df.coalesce(1) + .write() .format("iceberg") .option(SparkWriteOptions.WRITE_FORMAT, "parquet") .mode(SaveMode.Append) diff 
--git a/spark/v3.0/spark/src/test/java/org/apache/iceberg/spark/source/ThreeColumnRecord.java b/spark/v3.0/spark/src/test/java/org/apache/iceberg/spark/source/ThreeColumnRecord.java index 684dfbb255c7..554557df416c 100644 --- a/spark/v3.0/spark/src/test/java/org/apache/iceberg/spark/source/ThreeColumnRecord.java +++ b/spark/v3.0/spark/src/test/java/org/apache/iceberg/spark/source/ThreeColumnRecord.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.source; import java.util.Objects; @@ -26,8 +25,7 @@ public class ThreeColumnRecord { private String c2; private String c3; - public ThreeColumnRecord() { - } + public ThreeColumnRecord() {} public ThreeColumnRecord(Integer c1, String c2, String c3) { this.c1 = c1; @@ -68,9 +66,9 @@ public boolean equals(Object o) { return false; } ThreeColumnRecord that = (ThreeColumnRecord) o; - return Objects.equals(c1, that.c1) && - Objects.equals(c2, that.c2) && - Objects.equals(c3, that.c3); + return Objects.equals(c1, that.c1) + && Objects.equals(c2, that.c2) + && Objects.equals(c3, that.c3); } @Override @@ -80,10 +78,6 @@ public int hashCode() { @Override public String toString() { - return "ThreeColumnRecord{" + - "c1=" + c1 + - ", c2='" + c2 + '\'' + - ", c3='" + c3 + '\'' + - '}'; + return "ThreeColumnRecord{" + "c1=" + c1 + ", c2='" + c2 + '\'' + ", c3='" + c3 + '\'' + '}'; } } diff --git a/spark/v3.0/spark/src/test/java/org/apache/iceberg/spark/sql/TestAlterTable.java b/spark/v3.0/spark/src/test/java/org/apache/iceberg/spark/sql/TestAlterTable.java index 6172bd1fd0fe..e347cde7ba32 100644 --- a/spark/v3.0/spark/src/test/java/org/apache/iceberg/spark/sql/TestAlterTable.java +++ b/spark/v3.0/spark/src/test/java/org/apache/iceberg/spark/sql/TestAlterTable.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.spark.sql; import java.util.Map; @@ -36,7 +35,8 @@ import org.junit.Test; public class TestAlterTable extends SparkCatalogTestBase { - private final TableIdentifier renamedIdent = TableIdentifier.of(Namespace.of("default"), "table2"); + private final TableIdentifier renamedIdent = + TableIdentifier.of(Namespace.of("default"), "table2"); public TestAlterTable(String catalogName, String implementation, Map config) { super(catalogName, implementation, config); @@ -55,39 +55,53 @@ public void removeTable() { @Test public void testAddColumnNotNull() { - AssertHelpers.assertThrows("Should reject adding NOT NULL column", - SparkException.class, "Incompatible change: cannot add required column", + AssertHelpers.assertThrows( + "Should reject adding NOT NULL column", + SparkException.class, + "Incompatible change: cannot add required column", () -> sql("ALTER TABLE %s ADD COLUMN c3 INT NOT NULL", tableName)); } @Test public void testAddColumn() { - sql("ALTER TABLE %s ADD COLUMN point struct AFTER id", tableName); - - Types.StructType expectedSchema = Types.StructType.of( - NestedField.required(1, "id", Types.LongType.get()), - NestedField.optional(3, "point", Types.StructType.of( - NestedField.required(4, "x", Types.DoubleType.get()), - NestedField.required(5, "y", Types.DoubleType.get()) - )), - NestedField.optional(2, "data", Types.StringType.get())); - - Assert.assertEquals("Schema should match expected", - expectedSchema, validationCatalog.loadTable(tableIdent).schema().asStruct()); + sql( + "ALTER TABLE %s ADD COLUMN point struct AFTER id", + tableName); + + Types.StructType expectedSchema = + Types.StructType.of( + NestedField.required(1, "id", Types.LongType.get()), + NestedField.optional( + 3, + "point", + Types.StructType.of( + NestedField.required(4, "x", Types.DoubleType.get()), + NestedField.required(5, "y", Types.DoubleType.get()))), + NestedField.optional(2, "data", Types.StringType.get())); + + Assert.assertEquals( + "Schema should match expected", + expectedSchema, + validationCatalog.loadTable(tableIdent).schema().asStruct()); sql("ALTER TABLE %s ADD COLUMN point.z double COMMENT 'May be null' FIRST", tableName); - Types.StructType expectedSchema2 = Types.StructType.of( - NestedField.required(1, "id", Types.LongType.get()), - NestedField.optional(3, "point", Types.StructType.of( - NestedField.optional(6, "z", Types.DoubleType.get(), "May be null"), - NestedField.required(4, "x", Types.DoubleType.get()), - NestedField.required(5, "y", Types.DoubleType.get()) - )), - NestedField.optional(2, "data", Types.StringType.get())); - - Assert.assertEquals("Schema should match expected", - expectedSchema2, validationCatalog.loadTable(tableIdent).schema().asStruct()); + Types.StructType expectedSchema2 = + Types.StructType.of( + NestedField.required(1, "id", Types.LongType.get()), + NestedField.optional( + 3, + "point", + Types.StructType.of( + NestedField.optional(6, "z", Types.DoubleType.get(), "May be null"), + NestedField.required(4, "x", Types.DoubleType.get()), + NestedField.required(5, "y", Types.DoubleType.get()))), + NestedField.optional(2, "data", Types.StringType.get())); + + Assert.assertEquals( + "Schema should match expected", + expectedSchema2, + validationCatalog.loadTable(tableIdent).schema().asStruct()); } @Test @@ -95,19 +109,24 @@ public void testAddColumnWithArray() { sql("ALTER TABLE %s ADD COLUMN data2 array>", tableName); // use the implicit column name 'element' to access member of array and add column d to struct. 
sql("ALTER TABLE %s ADD COLUMN data2.element.d int", tableName); - Types.StructType expectedSchema = Types.StructType.of( - NestedField.required(1, "id", Types.LongType.get()), - NestedField.optional(2, "data", Types.StringType.get()), - NestedField.optional(3, "data2", Types.ListType.ofOptional( - 4, - Types.StructType.of( - NestedField.optional(5, "a", Types.IntegerType.get()), - NestedField.optional(6, "b", Types.IntegerType.get()), - NestedField.optional(7, "c", Types.IntegerType.get()), - NestedField.optional(8, "d", Types.IntegerType.get())) - ))); - Assert.assertEquals("Schema should match expected", - expectedSchema, validationCatalog.loadTable(tableIdent).schema().asStruct()); + Types.StructType expectedSchema = + Types.StructType.of( + NestedField.required(1, "id", Types.LongType.get()), + NestedField.optional(2, "data", Types.StringType.get()), + NestedField.optional( + 3, + "data2", + Types.ListType.ofOptional( + 4, + Types.StructType.of( + NestedField.optional(5, "a", Types.IntegerType.get()), + NestedField.optional(6, "b", Types.IntegerType.get()), + NestedField.optional(7, "c", Types.IntegerType.get()), + NestedField.optional(8, "d", Types.IntegerType.get()))))); + Assert.assertEquals( + "Schema should match expected", + expectedSchema, + validationCatalog.loadTable(tableIdent).schema().asStruct()); } @Test @@ -116,25 +135,31 @@ public void testAddColumnWithMap() { // use the implicit column name 'key' and 'value' to access member of map. // add column to value struct column sql("ALTER TABLE %s ADD COLUMN data2.value.c int", tableName); - Types.StructType expectedSchema = Types.StructType.of( - NestedField.required(1, "id", Types.LongType.get()), - NestedField.optional(2, "data", Types.StringType.get()), - NestedField.optional(3, "data2", Types.MapType.ofOptional( - 4, - 5, - Types.StructType.of( - NestedField.optional(6, "x", Types.IntegerType.get())), - Types.StructType.of( - NestedField.optional(7, "a", Types.IntegerType.get()), - NestedField.optional(8, "b", Types.IntegerType.get()), - NestedField.optional(9, "c", Types.IntegerType.get())) - ))); - Assert.assertEquals("Schema should match expected", - expectedSchema, validationCatalog.loadTable(tableIdent).schema().asStruct()); + Types.StructType expectedSchema = + Types.StructType.of( + NestedField.required(1, "id", Types.LongType.get()), + NestedField.optional(2, "data", Types.StringType.get()), + NestedField.optional( + 3, + "data2", + Types.MapType.ofOptional( + 4, + 5, + Types.StructType.of(NestedField.optional(6, "x", Types.IntegerType.get())), + Types.StructType.of( + NestedField.optional(7, "a", Types.IntegerType.get()), + NestedField.optional(8, "b", Types.IntegerType.get()), + NestedField.optional(9, "c", Types.IntegerType.get()))))); + Assert.assertEquals( + "Schema should match expected", + expectedSchema, + validationCatalog.loadTable(tableIdent).schema().asStruct()); // should not allow changing map key column - AssertHelpers.assertThrows("Should reject changing key of the map column", - SparkException.class, "Unsupported table change: Cannot add fields to map keys:", + AssertHelpers.assertThrows( + "Should reject changing key of the map column", + SparkException.class, + "Unsupported table change: Cannot add fields to map keys:", () -> sql("ALTER TABLE %s ADD COLUMN data2.key.y int", tableName)); } @@ -142,35 +167,43 @@ public void testAddColumnWithMap() { public void testDropColumn() { sql("ALTER TABLE %s DROP COLUMN data", tableName); - Types.StructType expectedSchema = Types.StructType.of( - 
NestedField.required(1, "id", Types.LongType.get())); + Types.StructType expectedSchema = + Types.StructType.of(NestedField.required(1, "id", Types.LongType.get())); - Assert.assertEquals("Schema should match expected", - expectedSchema, validationCatalog.loadTable(tableIdent).schema().asStruct()); + Assert.assertEquals( + "Schema should match expected", + expectedSchema, + validationCatalog.loadTable(tableIdent).schema().asStruct()); } @Test public void testRenameColumn() { sql("ALTER TABLE %s RENAME COLUMN id TO row_id", tableName); - Types.StructType expectedSchema = Types.StructType.of( - NestedField.required(1, "row_id", Types.LongType.get()), - NestedField.optional(2, "data", Types.StringType.get())); + Types.StructType expectedSchema = + Types.StructType.of( + NestedField.required(1, "row_id", Types.LongType.get()), + NestedField.optional(2, "data", Types.StringType.get())); - Assert.assertEquals("Schema should match expected", - expectedSchema, validationCatalog.loadTable(tableIdent).schema().asStruct()); + Assert.assertEquals( + "Schema should match expected", + expectedSchema, + validationCatalog.loadTable(tableIdent).schema().asStruct()); } @Test public void testAlterColumnComment() { sql("ALTER TABLE %s ALTER COLUMN id COMMENT 'Record id'", tableName); - Types.StructType expectedSchema = Types.StructType.of( - NestedField.required(1, "id", Types.LongType.get(), "Record id"), - NestedField.optional(2, "data", Types.StringType.get())); + Types.StructType expectedSchema = + Types.StructType.of( + NestedField.required(1, "id", Types.LongType.get(), "Record id"), + NestedField.optional(2, "data", Types.StringType.get())); - Assert.assertEquals("Schema should match expected", - expectedSchema, validationCatalog.loadTable(tableIdent).schema().asStruct()); + Assert.assertEquals( + "Schema should match expected", + expectedSchema, + validationCatalog.loadTable(tableIdent).schema().asStruct()); } @Test @@ -178,25 +211,31 @@ public void testAlterColumnType() { sql("ALTER TABLE %s ADD COLUMN count int", tableName); sql("ALTER TABLE %s ALTER COLUMN count TYPE bigint", tableName); - Types.StructType expectedSchema = Types.StructType.of( - NestedField.required(1, "id", Types.LongType.get()), - NestedField.optional(2, "data", Types.StringType.get()), - NestedField.optional(3, "count", Types.LongType.get())); + Types.StructType expectedSchema = + Types.StructType.of( + NestedField.required(1, "id", Types.LongType.get()), + NestedField.optional(2, "data", Types.StringType.get()), + NestedField.optional(3, "count", Types.LongType.get())); - Assert.assertEquals("Schema should match expected", - expectedSchema, validationCatalog.loadTable(tableIdent).schema().asStruct()); + Assert.assertEquals( + "Schema should match expected", + expectedSchema, + validationCatalog.loadTable(tableIdent).schema().asStruct()); } @Test public void testAlterColumnDropNotNull() { sql("ALTER TABLE %s ALTER COLUMN id DROP NOT NULL", tableName); - Types.StructType expectedSchema = Types.StructType.of( - NestedField.optional(1, "id", Types.LongType.get()), - NestedField.optional(2, "data", Types.StringType.get())); + Types.StructType expectedSchema = + Types.StructType.of( + NestedField.optional(1, "id", Types.LongType.get()), + NestedField.optional(2, "data", Types.StringType.get())); - Assert.assertEquals("Schema should match expected", - expectedSchema, validationCatalog.loadTable(tableIdent).schema().asStruct()); + Assert.assertEquals( + "Schema should match expected", + expectedSchema, + 
validationCatalog.loadTable(tableIdent).schema().asStruct()); } @Test @@ -204,15 +243,20 @@ public void testAlterColumnSetNotNull() { // no-op changes are allowed sql("ALTER TABLE %s ALTER COLUMN id SET NOT NULL", tableName); - Types.StructType expectedSchema = Types.StructType.of( - NestedField.required(1, "id", Types.LongType.get()), - NestedField.optional(2, "data", Types.StringType.get())); + Types.StructType expectedSchema = + Types.StructType.of( + NestedField.required(1, "id", Types.LongType.get()), + NestedField.optional(2, "data", Types.StringType.get())); - Assert.assertEquals("Schema should match expected", - expectedSchema, validationCatalog.loadTable(tableIdent).schema().asStruct()); + Assert.assertEquals( + "Schema should match expected", + expectedSchema, + validationCatalog.loadTable(tableIdent).schema().asStruct()); - AssertHelpers.assertThrows("Should reject adding NOT NULL constraint to an optional column", - AnalysisException.class, "Cannot change nullable column to non-nullable: data", + AssertHelpers.assertThrows( + "Should reject adding NOT NULL constraint to an optional column", + AnalysisException.class, + "Cannot change nullable column to non-nullable: data", () -> sql("ALTER TABLE %s ALTER COLUMN data SET NOT NULL", tableName)); } @@ -221,13 +265,16 @@ public void testAlterColumnPositionAfter() { sql("ALTER TABLE %s ADD COLUMN count int", tableName); sql("ALTER TABLE %s ALTER COLUMN count AFTER id", tableName); - Types.StructType expectedSchema = Types.StructType.of( - NestedField.required(1, "id", Types.LongType.get()), - NestedField.optional(3, "count", Types.IntegerType.get()), - NestedField.optional(2, "data", Types.StringType.get())); + Types.StructType expectedSchema = + Types.StructType.of( + NestedField.required(1, "id", Types.LongType.get()), + NestedField.optional(3, "count", Types.IntegerType.get()), + NestedField.optional(2, "data", Types.StringType.get())); - Assert.assertEquals("Schema should match expected", - expectedSchema, validationCatalog.loadTable(tableIdent).schema().asStruct()); + Assert.assertEquals( + "Schema should match expected", + expectedSchema, + validationCatalog.loadTable(tableIdent).schema().asStruct()); } @Test @@ -235,18 +282,22 @@ public void testAlterColumnPositionFirst() { sql("ALTER TABLE %s ADD COLUMN count int", tableName); sql("ALTER TABLE %s ALTER COLUMN count FIRST", tableName); - Types.StructType expectedSchema = Types.StructType.of( - NestedField.optional(3, "count", Types.IntegerType.get()), - NestedField.required(1, "id", Types.LongType.get()), - NestedField.optional(2, "data", Types.StringType.get())); + Types.StructType expectedSchema = + Types.StructType.of( + NestedField.optional(3, "count", Types.IntegerType.get()), + NestedField.required(1, "id", Types.LongType.get()), + NestedField.optional(2, "data", Types.StringType.get())); - Assert.assertEquals("Schema should match expected", - expectedSchema, validationCatalog.loadTable(tableIdent).schema().asStruct()); + Assert.assertEquals( + "Schema should match expected", + expectedSchema, + validationCatalog.loadTable(tableIdent).schema().asStruct()); } @Test public void testTableRename() { - Assume.assumeFalse("Hadoop catalog does not support rename", validationCatalog instanceof HadoopCatalog); + Assume.assumeFalse( + "Hadoop catalog does not support rename", validationCatalog instanceof HadoopCatalog); Assert.assertTrue("Initial name should exist", validationCatalog.tableExists(tableIdent)); Assert.assertFalse("New name should not exist", 
validationCatalog.tableExists(renamedIdent)); @@ -261,15 +312,19 @@ public void testTableRename() { public void testSetTableProperties() { sql("ALTER TABLE %s SET TBLPROPERTIES ('prop'='value')", tableName); - Assert.assertEquals("Should have the new table property", - "value", validationCatalog.loadTable(tableIdent).properties().get("prop")); + Assert.assertEquals( + "Should have the new table property", + "value", + validationCatalog.loadTable(tableIdent).properties().get("prop")); sql("ALTER TABLE %s UNSET TBLPROPERTIES ('prop')", tableName); - Assert.assertNull("Should not have the removed table property", + Assert.assertNull( + "Should not have the removed table property", validationCatalog.loadTable(tableIdent).properties().get("prop")); - AssertHelpers.assertThrows("Cannot specify the 'sort-order' because it's a reserved table property", + AssertHelpers.assertThrows( + "Cannot specify the 'sort-order' because it's a reserved table property", UnsupportedOperationException.class, () -> sql("ALTER TABLE %s SET TBLPROPERTIES ('sort-order'='value')", tableName)); } diff --git a/spark/v3.0/spark/src/test/java/org/apache/iceberg/spark/sql/TestCreateTable.java b/spark/v3.0/spark/src/test/java/org/apache/iceberg/spark/sql/TestCreateTable.java index 986098543d25..1411c83ddc65 100644 --- a/spark/v3.0/spark/src/test/java/org/apache/iceberg/spark/sql/TestCreateTable.java +++ b/spark/v3.0/spark/src/test/java/org/apache/iceberg/spark/sql/TestCreateTable.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.sql; import java.io.File; @@ -52,11 +51,15 @@ public void dropTestTable() { @Test public void testTransformIgnoreCase() { Assert.assertFalse("Table should not already exist", validationCatalog.tableExists(tableIdent)); - sql("CREATE TABLE IF NOT EXISTS %s (id BIGINT NOT NULL, ts timestamp) " + - "USING iceberg partitioned by (HOURS(ts))", tableName); + sql( + "CREATE TABLE IF NOT EXISTS %s (id BIGINT NOT NULL, ts timestamp) " + + "USING iceberg partitioned by (HOURS(ts))", + tableName); Assert.assertTrue("Table should already exist", validationCatalog.tableExists(tableIdent)); - sql("CREATE TABLE IF NOT EXISTS %s (id BIGINT NOT NULL, ts timestamp) " + - "USING iceberg partitioned by (hours(ts))", tableName); + sql( + "CREATE TABLE IF NOT EXISTS %s (id BIGINT NOT NULL, ts timestamp) " + + "USING iceberg partitioned by (hours(ts))", + tableName); Assert.assertTrue("Table should already exist", validationCatalog.tableExists(tableIdent)); } @@ -69,18 +72,22 @@ public void testCreateTable() { Table table = validationCatalog.loadTable(tableIdent); Assert.assertNotNull("Should load the new table", table); - StructType expectedSchema = StructType.of( - NestedField.required(1, "id", Types.LongType.get()), - NestedField.optional(2, "data", Types.StringType.get())); - Assert.assertEquals("Should have the expected schema", expectedSchema, table.schema().asStruct()); + StructType expectedSchema = + StructType.of( + NestedField.required(1, "id", Types.LongType.get()), + NestedField.optional(2, "data", Types.StringType.get())); + Assert.assertEquals( + "Should have the expected schema", expectedSchema, table.schema().asStruct()); Assert.assertEquals("Should not be partitioned", 0, table.spec().fields().size()); - Assert.assertNull("Should not have the default format set", + Assert.assertNull( + "Should not have the default format set", table.properties().get(TableProperties.DEFAULT_FILE_FORMAT)); } @Test public void 
testCreateTableInRootNamespace() { - Assume.assumeTrue("Hadoop has no default namespace configured", "testhadoop".equals(catalogName)); + Assume.assumeTrue( + "Hadoop has no default namespace configured", "testhadoop".equals(catalogName)); try { sql("CREATE TABLE %s.table (id bigint) USING iceberg", catalogName); @@ -102,47 +109,61 @@ public void testCreateTableUsingParquet() { Table table = validationCatalog.loadTable(tableIdent); Assert.assertNotNull("Should load the new table", table); - StructType expectedSchema = StructType.of( - NestedField.required(1, "id", Types.LongType.get()), - NestedField.optional(2, "data", Types.StringType.get())); - Assert.assertEquals("Should have the expected schema", expectedSchema, table.schema().asStruct()); + StructType expectedSchema = + StructType.of( + NestedField.required(1, "id", Types.LongType.get()), + NestedField.optional(2, "data", Types.StringType.get())); + Assert.assertEquals( + "Should have the expected schema", expectedSchema, table.schema().asStruct()); Assert.assertEquals("Should not be partitioned", 0, table.spec().fields().size()); - Assert.assertEquals("Should not have default format parquet", + Assert.assertEquals( + "Should not have default format parquet", "parquet", table.properties().get(TableProperties.DEFAULT_FILE_FORMAT)); - AssertHelpers.assertThrows("Should reject unsupported format names", - IllegalArgumentException.class, "Unsupported format in USING: crocodile", - () -> sql("CREATE TABLE %s.default.fail (id BIGINT NOT NULL, data STRING) USING crocodile", catalogName)); + AssertHelpers.assertThrows( + "Should reject unsupported format names", + IllegalArgumentException.class, + "Unsupported format in USING: crocodile", + () -> + sql( + "CREATE TABLE %s.default.fail (id BIGINT NOT NULL, data STRING) USING crocodile", + catalogName)); } @Test public void testCreateTablePartitionedBy() { Assert.assertFalse("Table should not already exist", validationCatalog.tableExists(tableIdent)); - sql("CREATE TABLE %s " + - "(id BIGINT NOT NULL, created_at TIMESTAMP, category STRING, data STRING) " + - "USING iceberg " + - "PARTITIONED BY (category, bucket(8, id), days(created_at))", tableName); + sql( + "CREATE TABLE %s " + + "(id BIGINT NOT NULL, created_at TIMESTAMP, category STRING, data STRING) " + + "USING iceberg " + + "PARTITIONED BY (category, bucket(8, id), days(created_at))", + tableName); Table table = validationCatalog.loadTable(tableIdent); Assert.assertNotNull("Should load the new table", table); - StructType expectedSchema = StructType.of( - NestedField.required(1, "id", Types.LongType.get()), - NestedField.optional(2, "created_at", Types.TimestampType.withZone()), - NestedField.optional(3, "category", Types.StringType.get()), - NestedField.optional(4, "data", Types.StringType.get())); - Assert.assertEquals("Should have the expected schema", expectedSchema, table.schema().asStruct()); - - PartitionSpec expectedSpec = PartitionSpec.builderFor(new Schema(expectedSchema.fields())) - .identity("category") - .bucket("id", 8) - .day("created_at") - .build(); + StructType expectedSchema = + StructType.of( + NestedField.required(1, "id", Types.LongType.get()), + NestedField.optional(2, "created_at", Types.TimestampType.withZone()), + NestedField.optional(3, "category", Types.StringType.get()), + NestedField.optional(4, "data", Types.StringType.get())); + Assert.assertEquals( + "Should have the expected schema", expectedSchema, table.schema().asStruct()); + + PartitionSpec expectedSpec = + PartitionSpec.builderFor(new 
Schema(expectedSchema.fields())) + .identity("category") + .bucket("id", 8) + .day("created_at") + .build(); Assert.assertEquals("Should be partitioned correctly", expectedSpec, table.spec()); - Assert.assertNull("Should not have the default format set", + Assert.assertNull( + "Should not have the default format set", table.properties().get(TableProperties.DEFAULT_FILE_FORMAT)); } @@ -150,20 +171,24 @@ public void testCreateTablePartitionedBy() { public void testCreateTableColumnComments() { Assert.assertFalse("Table should not already exist", validationCatalog.tableExists(tableIdent)); - sql("CREATE TABLE %s " + - "(id BIGINT NOT NULL COMMENT 'Unique identifier', data STRING COMMENT 'Data value') " + - "USING iceberg", + sql( + "CREATE TABLE %s " + + "(id BIGINT NOT NULL COMMENT 'Unique identifier', data STRING COMMENT 'Data value') " + + "USING iceberg", tableName); Table table = validationCatalog.loadTable(tableIdent); Assert.assertNotNull("Should load the new table", table); - StructType expectedSchema = StructType.of( - NestedField.required(1, "id", Types.LongType.get(), "Unique identifier"), - NestedField.optional(2, "data", Types.StringType.get(), "Data value")); - Assert.assertEquals("Should have the expected schema", expectedSchema, table.schema().asStruct()); + StructType expectedSchema = + StructType.of( + NestedField.required(1, "id", Types.LongType.get(), "Unique identifier"), + NestedField.optional(2, "data", Types.StringType.get(), "Data value")); + Assert.assertEquals( + "Should have the expected schema", expectedSchema, table.schema().asStruct()); Assert.assertEquals("Should not be partitioned", 0, table.spec().fields().size()); - Assert.assertNull("Should not have the default format set", + Assert.assertNull( + "Should not have the default format set", table.properties().get(TableProperties.DEFAULT_FILE_FORMAT)); } @@ -171,24 +196,30 @@ public void testCreateTableColumnComments() { public void testCreateTableComment() { Assert.assertFalse("Table should not already exist", validationCatalog.tableExists(tableIdent)); - sql("CREATE TABLE %s " + - "(id BIGINT NOT NULL, data STRING) " + - "USING iceberg " + - "COMMENT 'Table doc'", + sql( + "CREATE TABLE %s " + + "(id BIGINT NOT NULL, data STRING) " + + "USING iceberg " + + "COMMENT 'Table doc'", tableName); Table table = validationCatalog.loadTable(tableIdent); Assert.assertNotNull("Should load the new table", table); - StructType expectedSchema = StructType.of( - NestedField.required(1, "id", Types.LongType.get()), - NestedField.optional(2, "data", Types.StringType.get())); - Assert.assertEquals("Should have the expected schema", expectedSchema, table.schema().asStruct()); + StructType expectedSchema = + StructType.of( + NestedField.required(1, "id", Types.LongType.get()), + NestedField.optional(2, "data", Types.StringType.get())); + Assert.assertEquals( + "Should have the expected schema", expectedSchema, table.schema().asStruct()); Assert.assertEquals("Should not be partitioned", 0, table.spec().fields().size()); - Assert.assertNull("Should not have the default format set", + Assert.assertNull( + "Should not have the default format set", table.properties().get(TableProperties.DEFAULT_FILE_FORMAT)); - Assert.assertEquals("Should have the table comment set in properties", - "Table doc", table.properties().get(TableCatalog.PROP_COMMENT)); + Assert.assertEquals( + "Should have the table comment set in properties", + "Table doc", + table.properties().get(TableCatalog.PROP_COMMENT)); } @Test @@ -204,43 +235,49 @@ public void 
testCreateTableLocation() throws Exception { String location = "file:" + tableLocation.toString(); - sql("CREATE TABLE %s " + - "(id BIGINT NOT NULL, data STRING) " + - "USING iceberg " + - "LOCATION '%s'", + sql( + "CREATE TABLE %s " + + "(id BIGINT NOT NULL, data STRING) " + + "USING iceberg " + + "LOCATION '%s'", tableName, location); Table table = validationCatalog.loadTable(tableIdent); Assert.assertNotNull("Should load the new table", table); - StructType expectedSchema = StructType.of( - NestedField.required(1, "id", Types.LongType.get()), - NestedField.optional(2, "data", Types.StringType.get())); - Assert.assertEquals("Should have the expected schema", expectedSchema, table.schema().asStruct()); + StructType expectedSchema = + StructType.of( + NestedField.required(1, "id", Types.LongType.get()), + NestedField.optional(2, "data", Types.StringType.get())); + Assert.assertEquals( + "Should have the expected schema", expectedSchema, table.schema().asStruct()); Assert.assertEquals("Should not be partitioned", 0, table.spec().fields().size()); - Assert.assertNull("Should not have the default format set", + Assert.assertNull( + "Should not have the default format set", table.properties().get(TableProperties.DEFAULT_FILE_FORMAT)); - Assert.assertEquals("Should have a custom table location", - location, table.location()); + Assert.assertEquals("Should have a custom table location", location, table.location()); } @Test public void testCreateTableProperties() { Assert.assertFalse("Table should not already exist", validationCatalog.tableExists(tableIdent)); - sql("CREATE TABLE %s " + - "(id BIGINT NOT NULL, data STRING) " + - "USING iceberg " + - "TBLPROPERTIES (p1=2, p2='x')", + sql( + "CREATE TABLE %s " + + "(id BIGINT NOT NULL, data STRING) " + + "USING iceberg " + + "TBLPROPERTIES (p1=2, p2='x')", tableName); Table table = validationCatalog.loadTable(tableIdent); Assert.assertNotNull("Should load the new table", table); - StructType expectedSchema = StructType.of( - NestedField.required(1, "id", Types.LongType.get()), - NestedField.optional(2, "data", Types.StringType.get())); - Assert.assertEquals("Should have the expected schema", expectedSchema, table.schema().asStruct()); + StructType expectedSchema = + StructType.of( + NestedField.required(1, "id", Types.LongType.get()), + NestedField.optional(2, "data", Types.StringType.get())); + Assert.assertEquals( + "Should have the expected schema", expectedSchema, table.schema().asStruct()); Assert.assertEquals("Should not be partitioned", 0, table.spec().fields().size()); Assert.assertEquals("Should have property p1", "2", table.properties().get("p1")); Assert.assertEquals("Should have property p2", "x", table.properties().get("p2")); @@ -250,53 +287,56 @@ public void testCreateTableProperties() { public void testCreateTableWithFormatV2ThroughTableProperty() { Assert.assertFalse("Table should not already exist", validationCatalog.tableExists(tableIdent)); - sql("CREATE TABLE %s " + - "(id BIGINT NOT NULL, data STRING) " + - "USING iceberg " + - "TBLPROPERTIES ('format-version'='2')", + sql( + "CREATE TABLE %s " + + "(id BIGINT NOT NULL, data STRING) " + + "USING iceberg " + + "TBLPROPERTIES ('format-version'='2')", tableName); Table table = validationCatalog.loadTable(tableIdent); - Assert.assertEquals("should create table using format v2", - 2, ((BaseTable) table).operations().current().formatVersion()); + Assert.assertEquals( + "should create table using format v2", + 2, + ((BaseTable) table).operations().current().formatVersion()); } @Test 
public void testUpgradeTableWithFormatV2ThroughTableProperty() { Assert.assertFalse("Table should not already exist", validationCatalog.tableExists(tableIdent)); - sql("CREATE TABLE %s " + - "(id BIGINT NOT NULL, data STRING) " + - "USING iceberg " + - "TBLPROPERTIES ('format-version'='1')", + sql( + "CREATE TABLE %s " + + "(id BIGINT NOT NULL, data STRING) " + + "USING iceberg " + + "TBLPROPERTIES ('format-version'='1')", tableName); Table table = validationCatalog.loadTable(tableIdent); TableOperations ops = ((BaseTable) table).operations(); - Assert.assertEquals("should create table using format v1", - 1, ops.refresh().formatVersion()); + Assert.assertEquals("should create table using format v1", 1, ops.refresh().formatVersion()); sql("ALTER TABLE %s SET TBLPROPERTIES ('format-version'='2')", tableName); - Assert.assertEquals("should update table to use format v2", - 2, ops.refresh().formatVersion()); + Assert.assertEquals("should update table to use format v2", 2, ops.refresh().formatVersion()); } @Test public void testDowngradeTableToFormatV1ThroughTablePropertyFails() { Assert.assertFalse("Table should not already exist", validationCatalog.tableExists(tableIdent)); - sql("CREATE TABLE %s " + - "(id BIGINT NOT NULL, data STRING) " + - "USING iceberg " + - "TBLPROPERTIES ('format-version'='2')", + sql( + "CREATE TABLE %s " + + "(id BIGINT NOT NULL, data STRING) " + + "USING iceberg " + + "TBLPROPERTIES ('format-version'='2')", tableName); Table table = validationCatalog.loadTable(tableIdent); TableOperations ops = ((BaseTable) table).operations(); - Assert.assertEquals("should create table using format v2", - 2, ops.refresh().formatVersion()); + Assert.assertEquals("should create table using format v2", 2, ops.refresh().formatVersion()); - AssertHelpers.assertThrowsCause("should fail to downgrade to v1", + AssertHelpers.assertThrowsCause( + "should fail to downgrade to v1", IllegalArgumentException.class, "Cannot downgrade v2 table to v1", () -> sql("ALTER TABLE %s SET TBLPROPERTIES ('format-version'='1')", tableName)); diff --git a/spark/v3.0/spark/src/test/java/org/apache/iceberg/spark/sql/TestCreateTableAsSelect.java b/spark/v3.0/spark/src/test/java/org/apache/iceberg/spark/sql/TestCreateTableAsSelect.java index fb883d637ab5..2581c0fd3c56 100644 --- a/spark/v3.0/spark/src/test/java/org/apache/iceberg/spark/sql/TestCreateTableAsSelect.java +++ b/spark/v3.0/spark/src/test/java/org/apache/iceberg/spark/sql/TestCreateTableAsSelect.java @@ -16,9 +16,12 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.spark.sql; +import static org.apache.spark.sql.functions.col; +import static org.apache.spark.sql.functions.lit; +import static org.apache.spark.sql.functions.when; + import java.util.Map; import org.apache.iceberg.PartitionSpec; import org.apache.iceberg.Schema; @@ -30,20 +33,19 @@ import org.junit.Assert; import org.junit.Test; -import static org.apache.spark.sql.functions.col; -import static org.apache.spark.sql.functions.lit; -import static org.apache.spark.sql.functions.when; - public class TestCreateTableAsSelect extends SparkCatalogTestBase { private final String sourceName; - public TestCreateTableAsSelect(String catalogName, String implementation, Map config) { + public TestCreateTableAsSelect( + String catalogName, String implementation, Map config) { super(catalogName, implementation, config); this.sourceName = tableName("source"); - sql("CREATE TABLE IF NOT EXISTS %s (id bigint NOT NULL, data string) " + - "USING iceberg PARTITIONED BY (truncate(id, 3))", sourceName); + sql( + "CREATE TABLE IF NOT EXISTS %s (id bigint NOT NULL, data string) " + + "USING iceberg PARTITIONED BY (truncate(id, 3))", + sourceName); sql("INSERT INTO %s VALUES (1, 'a'), (2, 'b'), (3, 'c')", sourceName); } @@ -56,153 +58,178 @@ public void removeTables() { public void testUnpartitionedCTAS() { sql("CREATE TABLE %s USING iceberg AS SELECT * FROM %s", tableName, sourceName); - Schema expectedSchema = new Schema( - Types.NestedField.optional(1, "id", Types.LongType.get()), - Types.NestedField.optional(2, "data", Types.StringType.get()) - ); + Schema expectedSchema = + new Schema( + Types.NestedField.optional(1, "id", Types.LongType.get()), + Types.NestedField.optional(2, "data", Types.StringType.get())); Table ctasTable = validationCatalog.loadTable(tableIdent); - Assert.assertEquals("Should have expected nullable schema", - expectedSchema.asStruct(), ctasTable.schema().asStruct()); - Assert.assertEquals("Should be an unpartitioned table", - 0, ctasTable.spec().fields().size()); - assertEquals("Should have rows matching the source table", + Assert.assertEquals( + "Should have expected nullable schema", + expectedSchema.asStruct(), + ctasTable.schema().asStruct()); + Assert.assertEquals("Should be an unpartitioned table", 0, ctasTable.spec().fields().size()); + assertEquals( + "Should have rows matching the source table", sql("SELECT * FROM %s ORDER BY id", sourceName), sql("SELECT * FROM %s ORDER BY id", tableName)); } @Test public void testPartitionedCTAS() { - sql("CREATE TABLE %s USING iceberg PARTITIONED BY (id) AS SELECT * FROM %s ORDER BY id", tableName, sourceName); + sql( + "CREATE TABLE %s USING iceberg PARTITIONED BY (id) AS SELECT * FROM %s ORDER BY id", + tableName, sourceName); - Schema expectedSchema = new Schema( - Types.NestedField.optional(1, "id", Types.LongType.get()), - Types.NestedField.optional(2, "data", Types.StringType.get()) - ); + Schema expectedSchema = + new Schema( + Types.NestedField.optional(1, "id", Types.LongType.get()), + Types.NestedField.optional(2, "data", Types.StringType.get())); - PartitionSpec expectedSpec = PartitionSpec.builderFor(expectedSchema) - .identity("id") - .build(); + PartitionSpec expectedSpec = PartitionSpec.builderFor(expectedSchema).identity("id").build(); Table ctasTable = validationCatalog.loadTable(tableIdent); - Assert.assertEquals("Should have expected nullable schema", - expectedSchema.asStruct(), ctasTable.schema().asStruct()); - Assert.assertEquals("Should be partitioned by id", - expectedSpec, 
ctasTable.spec()); - assertEquals("Should have rows matching the source table", + Assert.assertEquals( + "Should have expected nullable schema", + expectedSchema.asStruct(), + ctasTable.schema().asStruct()); + Assert.assertEquals("Should be partitioned by id", expectedSpec, ctasTable.spec()); + assertEquals( + "Should have rows matching the source table", sql("SELECT * FROM %s ORDER BY id", sourceName), sql("SELECT * FROM %s ORDER BY id", tableName)); } @Test public void testRTAS() { - sql("CREATE TABLE %s USING iceberg TBLPROPERTIES ('prop1'='val1', 'prop2'='val2')" + - "AS SELECT * FROM %s", tableName, sourceName); + sql( + "CREATE TABLE %s USING iceberg TBLPROPERTIES ('prop1'='val1', 'prop2'='val2')" + + "AS SELECT * FROM %s", + tableName, sourceName); - assertEquals("Should have rows matching the source table", + assertEquals( + "Should have rows matching the source table", sql("SELECT * FROM %s ORDER BY id", sourceName), sql("SELECT * FROM %s ORDER BY id", tableName)); - sql("REPLACE TABLE %s USING iceberg PARTITIONED BY (part) TBLPROPERTIES ('prop1'='newval1', 'prop3'='val3') AS " + - "SELECT id, data, CASE WHEN (id %% 2) = 0 THEN 'even' ELSE 'odd' END AS part " + - "FROM %s ORDER BY 3, 1", tableName, sourceName); + sql( + "REPLACE TABLE %s USING iceberg PARTITIONED BY (part) TBLPROPERTIES ('prop1'='newval1', 'prop3'='val3') AS " + + "SELECT id, data, CASE WHEN (id %% 2) = 0 THEN 'even' ELSE 'odd' END AS part " + + "FROM %s ORDER BY 3, 1", + tableName, sourceName); - Schema expectedSchema = new Schema( - Types.NestedField.optional(1, "id", Types.LongType.get()), - Types.NestedField.optional(2, "data", Types.StringType.get()), - Types.NestedField.optional(3, "part", Types.StringType.get()) - ); + Schema expectedSchema = + new Schema( + Types.NestedField.optional(1, "id", Types.LongType.get()), + Types.NestedField.optional(2, "data", Types.StringType.get()), + Types.NestedField.optional(3, "part", Types.StringType.get())); - PartitionSpec expectedSpec = PartitionSpec.builderFor(expectedSchema) - .identity("part") - .withSpecId(1) - .build(); + PartitionSpec expectedSpec = + PartitionSpec.builderFor(expectedSchema).identity("part").withSpecId(1).build(); Table rtasTable = validationCatalog.loadTable(tableIdent); // the replacement table has a different schema and partition spec than the original - Assert.assertEquals("Should have expected nullable schema", - expectedSchema.asStruct(), rtasTable.schema().asStruct()); - Assert.assertEquals("Should be partitioned by part", - expectedSpec, rtasTable.spec()); - - assertEquals("Should have rows matching the source table", - sql("SELECT id, data, CASE WHEN (id %% 2) = 0 THEN 'even' ELSE 'odd' END AS part " + - "FROM %s ORDER BY id", sourceName), + Assert.assertEquals( + "Should have expected nullable schema", + expectedSchema.asStruct(), + rtasTable.schema().asStruct()); + Assert.assertEquals("Should be partitioned by part", expectedSpec, rtasTable.spec()); + + assertEquals( + "Should have rows matching the source table", + sql( + "SELECT id, data, CASE WHEN (id %% 2) = 0 THEN 'even' ELSE 'odd' END AS part " + + "FROM %s ORDER BY id", + sourceName), sql("SELECT * FROM %s ORDER BY id", tableName)); - Assert.assertEquals("Table should have expected snapshots", - 2, Iterables.size(rtasTable.snapshots())); + Assert.assertEquals( + "Table should have expected snapshots", 2, Iterables.size(rtasTable.snapshots())); - Assert.assertEquals("Should have updated table property", - "newval1", rtasTable.properties().get("prop1")); - 
Assert.assertEquals("Should have preserved table property", - "val2", rtasTable.properties().get("prop2")); - Assert.assertEquals("Should have new table property", - "val3", rtasTable.properties().get("prop3")); + Assert.assertEquals( + "Should have updated table property", "newval1", rtasTable.properties().get("prop1")); + Assert.assertEquals( + "Should have preserved table property", "val2", rtasTable.properties().get("prop2")); + Assert.assertEquals( + "Should have new table property", "val3", rtasTable.properties().get("prop3")); } @Test public void testCreateRTAS() { - sql("CREATE OR REPLACE TABLE %s USING iceberg PARTITIONED BY (part) AS " + - "SELECT id, data, CASE WHEN (id %% 2) = 0 THEN 'even' ELSE 'odd' END AS part " + - "FROM %s ORDER BY 3, 1", tableName, sourceName); - - assertEquals("Should have rows matching the source table", - sql("SELECT id, data, CASE WHEN (id %% 2) = 0 THEN 'even' ELSE 'odd' END AS part " + - "FROM %s ORDER BY id", sourceName), + sql( + "CREATE OR REPLACE TABLE %s USING iceberg PARTITIONED BY (part) AS " + + "SELECT id, data, CASE WHEN (id %% 2) = 0 THEN 'even' ELSE 'odd' END AS part " + + "FROM %s ORDER BY 3, 1", + tableName, sourceName); + + assertEquals( + "Should have rows matching the source table", + sql( + "SELECT id, data, CASE WHEN (id %% 2) = 0 THEN 'even' ELSE 'odd' END AS part " + + "FROM %s ORDER BY id", + sourceName), sql("SELECT * FROM %s ORDER BY id", tableName)); - sql("CREATE OR REPLACE TABLE %s USING iceberg PARTITIONED BY (part) AS " + - "SELECT 2 * id as id, data, CASE WHEN ((2 * id) %% 2) = 0 THEN 'even' ELSE 'odd' END AS part " + - "FROM %s ORDER BY 3, 1", tableName, sourceName); + sql( + "CREATE OR REPLACE TABLE %s USING iceberg PARTITIONED BY (part) AS " + + "SELECT 2 * id as id, data, CASE WHEN ((2 * id) %% 2) = 0 THEN 'even' ELSE 'odd' END AS part " + + "FROM %s ORDER BY 3, 1", + tableName, sourceName); - Schema expectedSchema = new Schema( - Types.NestedField.optional(1, "id", Types.LongType.get()), - Types.NestedField.optional(2, "data", Types.StringType.get()), - Types.NestedField.optional(3, "part", Types.StringType.get()) - ); + Schema expectedSchema = + new Schema( + Types.NestedField.optional(1, "id", Types.LongType.get()), + Types.NestedField.optional(2, "data", Types.StringType.get()), + Types.NestedField.optional(3, "part", Types.StringType.get())); - PartitionSpec expectedSpec = PartitionSpec.builderFor(expectedSchema) - .identity("part") - .withSpecId(0) // the spec is identical and should be reused - .build(); + PartitionSpec expectedSpec = + PartitionSpec.builderFor(expectedSchema) + .identity("part") + .withSpecId(0) // the spec is identical and should be reused + .build(); Table rtasTable = validationCatalog.loadTable(tableIdent); // the replacement table has a different schema and partition spec than the original - Assert.assertEquals("Should have expected nullable schema", - expectedSchema.asStruct(), rtasTable.schema().asStruct()); - Assert.assertEquals("Should be partitioned by part", - expectedSpec, rtasTable.spec()); - - assertEquals("Should have rows matching the source table", - sql("SELECT 2 * id, data, CASE WHEN ((2 * id) %% 2) = 0 THEN 'even' ELSE 'odd' END AS part " + - "FROM %s ORDER BY id", sourceName), + Assert.assertEquals( + "Should have expected nullable schema", + expectedSchema.asStruct(), + rtasTable.schema().asStruct()); + Assert.assertEquals("Should be partitioned by part", expectedSpec, rtasTable.spec()); + + assertEquals( + "Should have rows matching the source table", + sql( + "SELECT 2 
* id, data, CASE WHEN ((2 * id) %% 2) = 0 THEN 'even' ELSE 'odd' END AS part " + + "FROM %s ORDER BY id", + sourceName), sql("SELECT * FROM %s ORDER BY id", tableName)); - Assert.assertEquals("Table should have expected snapshots", - 2, Iterables.size(rtasTable.snapshots())); + Assert.assertEquals( + "Table should have expected snapshots", 2, Iterables.size(rtasTable.snapshots())); } @Test public void testDataFrameV2Create() throws Exception { spark.table(sourceName).writeTo(tableName).using("iceberg").create(); - Schema expectedSchema = new Schema( - Types.NestedField.optional(1, "id", Types.LongType.get()), - Types.NestedField.optional(2, "data", Types.StringType.get()) - ); + Schema expectedSchema = + new Schema( + Types.NestedField.optional(1, "id", Types.LongType.get()), + Types.NestedField.optional(2, "data", Types.StringType.get())); Table ctasTable = validationCatalog.loadTable(tableIdent); - Assert.assertEquals("Should have expected nullable schema", - expectedSchema.asStruct(), ctasTable.schema().asStruct()); - Assert.assertEquals("Should be an unpartitioned table", - 0, ctasTable.spec().fields().size()); - assertEquals("Should have rows matching the source table", + Assert.assertEquals( + "Should have expected nullable schema", + expectedSchema.asStruct(), + ctasTable.schema().asStruct()); + Assert.assertEquals("Should be an unpartitioned table", 0, ctasTable.spec().fields().size()); + assertEquals( + "Should have rows matching the source table", sql("SELECT * FROM %s ORDER BY id", sourceName), sql("SELECT * FROM %s ORDER BY id", tableName)); } @@ -211,11 +238,13 @@ public void testDataFrameV2Create() throws Exception { public void testDataFrameV2Replace() throws Exception { spark.table(sourceName).writeTo(tableName).using("iceberg").create(); - assertEquals("Should have rows matching the source table", + assertEquals( + "Should have rows matching the source table", sql("SELECT * FROM %s ORDER BY id", sourceName), sql("SELECT * FROM %s ORDER BY id", tableName)); - spark.table(sourceName) + spark + .table(sourceName) .select( col("id"), col("data"), @@ -226,37 +255,40 @@ public void testDataFrameV2Replace() throws Exception { .using("iceberg") .replace(); - Schema expectedSchema = new Schema( - Types.NestedField.optional(1, "id", Types.LongType.get()), - Types.NestedField.optional(2, "data", Types.StringType.get()), - Types.NestedField.optional(3, "part", Types.StringType.get()) - ); + Schema expectedSchema = + new Schema( + Types.NestedField.optional(1, "id", Types.LongType.get()), + Types.NestedField.optional(2, "data", Types.StringType.get()), + Types.NestedField.optional(3, "part", Types.StringType.get())); - PartitionSpec expectedSpec = PartitionSpec.builderFor(expectedSchema) - .identity("part") - .withSpecId(1) - .build(); + PartitionSpec expectedSpec = + PartitionSpec.builderFor(expectedSchema).identity("part").withSpecId(1).build(); Table rtasTable = validationCatalog.loadTable(tableIdent); // the replacement table has a different schema and partition spec than the original - Assert.assertEquals("Should have expected nullable schema", - expectedSchema.asStruct(), rtasTable.schema().asStruct()); - Assert.assertEquals("Should be partitioned by part", - expectedSpec, rtasTable.spec()); - - assertEquals("Should have rows matching the source table", - sql("SELECT id, data, CASE WHEN (id %% 2) = 0 THEN 'even' ELSE 'odd' END AS part " + - "FROM %s ORDER BY id", sourceName), + Assert.assertEquals( + "Should have expected nullable schema", + expectedSchema.asStruct(), + 
rtasTable.schema().asStruct()); + Assert.assertEquals("Should be partitioned by part", expectedSpec, rtasTable.spec()); + + assertEquals( + "Should have rows matching the source table", + sql( + "SELECT id, data, CASE WHEN (id %% 2) = 0 THEN 'even' ELSE 'odd' END AS part " + + "FROM %s ORDER BY id", + sourceName), sql("SELECT * FROM %s ORDER BY id", tableName)); - Assert.assertEquals("Table should have expected snapshots", - 2, Iterables.size(rtasTable.snapshots())); + Assert.assertEquals( + "Table should have expected snapshots", 2, Iterables.size(rtasTable.snapshots())); } @Test public void testDataFrameV2CreateOrReplace() { - spark.table(sourceName) + spark + .table(sourceName) .select( col("id"), col("data"), @@ -267,12 +299,16 @@ public void testDataFrameV2CreateOrReplace() { .using("iceberg") .createOrReplace(); - assertEquals("Should have rows matching the source table", - sql("SELECT id, data, CASE WHEN (id %% 2) = 0 THEN 'even' ELSE 'odd' END AS part " + - "FROM %s ORDER BY id", sourceName), + assertEquals( + "Should have rows matching the source table", + sql( + "SELECT id, data, CASE WHEN (id %% 2) = 0 THEN 'even' ELSE 'odd' END AS part " + + "FROM %s ORDER BY id", + sourceName), sql("SELECT * FROM %s ORDER BY id", tableName)); - spark.table(sourceName) + spark + .table(sourceName) .select(col("id").multiply(lit(2)).as("id"), col("data")) .select( col("id"), @@ -284,80 +320,97 @@ public void testDataFrameV2CreateOrReplace() { .using("iceberg") .createOrReplace(); - Schema expectedSchema = new Schema( - Types.NestedField.optional(1, "id", Types.LongType.get()), - Types.NestedField.optional(2, "data", Types.StringType.get()), - Types.NestedField.optional(3, "part", Types.StringType.get()) - ); + Schema expectedSchema = + new Schema( + Types.NestedField.optional(1, "id", Types.LongType.get()), + Types.NestedField.optional(2, "data", Types.StringType.get()), + Types.NestedField.optional(3, "part", Types.StringType.get())); - PartitionSpec expectedSpec = PartitionSpec.builderFor(expectedSchema) - .identity("part") - .withSpecId(0) // the spec is identical and should be reused - .build(); + PartitionSpec expectedSpec = + PartitionSpec.builderFor(expectedSchema) + .identity("part") + .withSpecId(0) // the spec is identical and should be reused + .build(); Table rtasTable = validationCatalog.loadTable(tableIdent); // the replacement table has a different schema and partition spec than the original - Assert.assertEquals("Should have expected nullable schema", - expectedSchema.asStruct(), rtasTable.schema().asStruct()); - Assert.assertEquals("Should be partitioned by part", - expectedSpec, rtasTable.spec()); - - assertEquals("Should have rows matching the source table", - sql("SELECT 2 * id, data, CASE WHEN ((2 * id) %% 2) = 0 THEN 'even' ELSE 'odd' END AS part " + - "FROM %s ORDER BY id", sourceName), + Assert.assertEquals( + "Should have expected nullable schema", + expectedSchema.asStruct(), + rtasTable.schema().asStruct()); + Assert.assertEquals("Should be partitioned by part", expectedSpec, rtasTable.spec()); + + assertEquals( + "Should have rows matching the source table", + sql( + "SELECT 2 * id, data, CASE WHEN ((2 * id) %% 2) = 0 THEN 'even' ELSE 'odd' END AS part " + + "FROM %s ORDER BY id", + sourceName), sql("SELECT * FROM %s ORDER BY id", tableName)); - Assert.assertEquals("Table should have expected snapshots", - 2, Iterables.size(rtasTable.snapshots())); + Assert.assertEquals( + "Table should have expected snapshots", 2, Iterables.size(rtasTable.snapshots())); } @Test 
public void testCreateRTASWithPartitionSpecChanging() { - sql("CREATE OR REPLACE TABLE %s USING iceberg PARTITIONED BY (part) AS " + - "SELECT id, data, CASE WHEN (id %% 2) = 0 THEN 'even' ELSE 'odd' END AS part " + - "FROM %s ORDER BY 3, 1", tableName, sourceName); + sql( + "CREATE OR REPLACE TABLE %s USING iceberg PARTITIONED BY (part) AS " + + "SELECT id, data, CASE WHEN (id %% 2) = 0 THEN 'even' ELSE 'odd' END AS part " + + "FROM %s ORDER BY 3, 1", + tableName, sourceName); Table rtasTable = validationCatalog.loadTable(tableIdent); - assertEquals("Should have rows matching the source table", - sql("SELECT id, data, CASE WHEN (id %% 2) = 0 THEN 'even' ELSE 'odd' END AS part " + - "FROM %s ORDER BY id", sourceName), - sql("SELECT * FROM %s ORDER BY id", tableName)); + assertEquals( + "Should have rows matching the source table", + sql( + "SELECT id, data, CASE WHEN (id %% 2) = 0 THEN 'even' ELSE 'odd' END AS part " + + "FROM %s ORDER BY id", + sourceName), + sql("SELECT * FROM %s ORDER BY id", tableName)); // Change the partitioning of the table rtasTable.updateSpec().removeField("part").commit(); // Spec 1 - sql("CREATE OR REPLACE TABLE %s USING iceberg PARTITIONED BY (part, id) AS " + - "SELECT 2 * id as id, data, CASE WHEN ((2 * id) %% 2) = 0 THEN 'even' ELSE 'odd' END AS part " + - "FROM %s ORDER BY 3, 1", tableName, sourceName); + sql( + "CREATE OR REPLACE TABLE %s USING iceberg PARTITIONED BY (part, id) AS " + + "SELECT 2 * id as id, data, CASE WHEN ((2 * id) %% 2) = 0 THEN 'even' ELSE 'odd' END AS part " + + "FROM %s ORDER BY 3, 1", + tableName, sourceName); - Schema expectedSchema = new Schema( + Schema expectedSchema = + new Schema( Types.NestedField.optional(1, "id", Types.LongType.get()), Types.NestedField.optional(2, "data", Types.StringType.get()), - Types.NestedField.optional(3, "part", Types.StringType.get()) - ); + Types.NestedField.optional(3, "part", Types.StringType.get())); - PartitionSpec expectedSpec = PartitionSpec.builderFor(expectedSchema) + PartitionSpec expectedSpec = + PartitionSpec.builderFor(expectedSchema) .alwaysNull("part", "part_1000") .identity("part") .identity("id") .withSpecId(2) // The Spec is new .build(); - Assert.assertEquals("Should be partitioned by part and id", - expectedSpec, rtasTable.spec()); + Assert.assertEquals("Should be partitioned by part and id", expectedSpec, rtasTable.spec()); // the replacement table has a different schema and partition spec than the original - Assert.assertEquals("Should have expected nullable schema", - expectedSchema.asStruct(), rtasTable.schema().asStruct()); - - assertEquals("Should have rows matching the source table", - sql("SELECT 2 * id, data, CASE WHEN ((2 * id) %% 2) = 0 THEN 'even' ELSE 'odd' END AS part " + - "FROM %s ORDER BY id", sourceName), - sql("SELECT * FROM %s ORDER BY id", tableName)); + Assert.assertEquals( + "Should have expected nullable schema", + expectedSchema.asStruct(), + rtasTable.schema().asStruct()); + + assertEquals( + "Should have rows matching the source table", + sql( + "SELECT 2 * id, data, CASE WHEN ((2 * id) %% 2) = 0 THEN 'even' ELSE 'odd' END AS part " + + "FROM %s ORDER BY id", + sourceName), + sql("SELECT * FROM %s ORDER BY id", tableName)); - Assert.assertEquals("Table should have expected snapshots", - 2, Iterables.size(rtasTable.snapshots())); + Assert.assertEquals( + "Table should have expected snapshots", 2, Iterables.size(rtasTable.snapshots())); } } diff --git a/spark/v3.0/spark/src/test/java/org/apache/iceberg/spark/sql/TestDeleteFrom.java 
b/spark/v3.0/spark/src/test/java/org/apache/iceberg/spark/sql/TestDeleteFrom.java index e538334103e5..cef2fd58ed71 100644 --- a/spark/v3.0/spark/src/test/java/org/apache/iceberg/spark/sql/TestDeleteFrom.java +++ b/spark/v3.0/spark/src/test/java/org/apache/iceberg/spark/sql/TestDeleteFrom.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.sql; import java.util.List; @@ -51,26 +50,29 @@ public void removeTables() { public void testDeleteFromUnpartitionedTable() throws NoSuchTableException { sql("CREATE TABLE %s (id bigint, data string) USING iceberg", tableName); - List records = Lists.newArrayList( - new SimpleRecord(1, "a"), - new SimpleRecord(2, "b"), - new SimpleRecord(3, "c") - ); + List records = + Lists.newArrayList( + new SimpleRecord(1, "a"), new SimpleRecord(2, "b"), new SimpleRecord(3, "c")); Dataset df = spark.createDataFrame(records, SimpleRecord.class); df.coalesce(1).writeTo(tableName).append(); - assertEquals("Should have expected rows", + assertEquals( + "Should have expected rows", ImmutableList.of(row(1L, "a"), row(2L, "b"), row(3L, "c")), sql("SELECT * FROM %s ORDER BY id", tableName)); - AssertHelpers.assertThrows("Should not delete when not all rows of a file match the filter", - ValidationException.class, "Cannot delete file where some, but not all, rows match filter", + AssertHelpers.assertThrows( + "Should not delete when not all rows of a file match the filter", + ValidationException.class, + "Cannot delete file where some, but not all, rows match filter", () -> sql("DELETE FROM %s WHERE id < 2", tableName)); sql("DELETE FROM %s WHERE id < 4", tableName); - Assert.assertEquals("Should have no rows after successful delete", - 0L, scalarSql("SELECT count(1) FROM %s", tableName)); + Assert.assertEquals( + "Should have no rows after successful delete", + 0L, + scalarSql("SELECT count(1) FROM %s", tableName)); } @Test @@ -81,46 +83,50 @@ public void testDeleteFromTableAtSnapshot() throws NoSuchTableException { sql("CREATE TABLE %s (id bigint, data string) USING iceberg", tableName); - List records = Lists.newArrayList( - new SimpleRecord(1, "a"), - new SimpleRecord(2, "b"), - new SimpleRecord(3, "c") - ); + List records = + Lists.newArrayList( + new SimpleRecord(1, "a"), new SimpleRecord(2, "b"), new SimpleRecord(3, "c")); Dataset df = spark.createDataFrame(records, SimpleRecord.class); df.coalesce(1).writeTo(tableName).append(); long snapshotId = validationCatalog.loadTable(tableIdent).currentSnapshot().snapshotId(); String prefix = "snapshot_id_"; - AssertHelpers.assertThrows("Should not be able to delete from a table at a specific snapshot", - IllegalArgumentException.class, "Cannot delete from table at a specific snapshot", + AssertHelpers.assertThrows( + "Should not be able to delete from a table at a specific snapshot", + IllegalArgumentException.class, + "Cannot delete from table at a specific snapshot", () -> sql("DELETE FROM %s.%s WHERE id < 4", tableName, prefix + snapshotId)); } @Test public void testDeleteFromPartitionedTable() throws NoSuchTableException { - sql("CREATE TABLE %s (id bigint, data string) " + - "USING iceberg " + - "PARTITIONED BY (truncate(id, 2))", tableName); - - List records = Lists.newArrayList( - new SimpleRecord(1, "a"), - new SimpleRecord(2, "b"), - new SimpleRecord(3, "c") - ); + sql( + "CREATE TABLE %s (id bigint, data string) " + + "USING iceberg " + + "PARTITIONED BY (truncate(id, 2))", + tableName); + + List records = + Lists.newArrayList( + 
new SimpleRecord(1, "a"), new SimpleRecord(2, "b"), new SimpleRecord(3, "c")); Dataset df = spark.createDataFrame(records, SimpleRecord.class); df.coalesce(1).writeTo(tableName).append(); - assertEquals("Should have 3 rows in 2 partitions", + assertEquals( + "Should have 3 rows in 2 partitions", ImmutableList.of(row(1L, "a"), row(2L, "b"), row(3L, "c")), sql("SELECT * FROM %s ORDER BY id", tableName)); - AssertHelpers.assertThrows("Should not delete when not all rows of a file match the filter", - ValidationException.class, "Cannot delete file where some, but not all, rows match filter", + AssertHelpers.assertThrows( + "Should not delete when not all rows of a file match the filter", + ValidationException.class, + "Cannot delete file where some, but not all, rows match filter", () -> sql("DELETE FROM %s WHERE id > 2", tableName)); sql("DELETE FROM %s WHERE id < 2", tableName); - assertEquals("Should have two rows in the second partition", + assertEquals( + "Should have two rows in the second partition", ImmutableList.of(row(2L, "b"), row(3L, "c")), sql("SELECT * FROM %s ORDER BY id", tableName)); } @@ -130,7 +136,8 @@ public void testDeleteFromWhereFalse() { sql("CREATE TABLE %s (id bigint NOT NULL, data string) USING iceberg", tableName); sql("INSERT INTO TABLE %s VALUES (1, 'a'), (2, 'b'), (3, 'c')", tableName); - assertEquals("Should have expected rows", + assertEquals( + "Should have expected rows", ImmutableList.of(row(1L, "a"), row(2L, "b"), row(3L, "c")), sql("SELECT * FROM %s ORDER BY id", tableName)); @@ -141,6 +148,7 @@ public void testDeleteFromWhereFalse() { table.refresh(); - Assert.assertEquals("Delete should not produce a new snapshot", 1, Iterables.size(table.snapshots())); + Assert.assertEquals( + "Delete should not produce a new snapshot", 1, Iterables.size(table.snapshots())); } } diff --git a/spark/v3.0/spark/src/test/java/org/apache/iceberg/spark/sql/TestNamespaceSQL.java b/spark/v3.0/spark/src/test/java/org/apache/iceberg/spark/sql/TestNamespaceSQL.java index d1eac312669a..317a95cd0140 100644 --- a/spark/v3.0/spark/src/test/java/org/apache/iceberg/spark/sql/TestNamespaceSQL.java +++ b/spark/v3.0/spark/src/test/java/org/apache/iceberg/spark/sql/TestNamespaceSQL.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.spark.sql; import java.io.File; @@ -56,7 +55,8 @@ public void cleanNamespaces() { @Test public void testCreateNamespace() { - Assert.assertFalse("Namespace should not already exist", validationNamespaceCatalog.namespaceExists(NS)); + Assert.assertFalse( + "Namespace should not already exist", validationNamespaceCatalog.namespaceExists(NS)); sql("CREATE NAMESPACE %s", fullNamespace); @@ -76,7 +76,8 @@ public void testDefaultNamespace() { @Test public void testDropEmptyNamespace() { - Assert.assertFalse("Namespace should not already exist", validationNamespaceCatalog.namespaceExists(NS)); + Assert.assertFalse( + "Namespace should not already exist", validationNamespaceCatalog.namespaceExists(NS)); sql("CREATE NAMESPACE %s", fullNamespace); @@ -84,23 +85,28 @@ public void testDropEmptyNamespace() { sql("DROP NAMESPACE %s", fullNamespace); - Assert.assertFalse("Namespace should have been dropped", validationNamespaceCatalog.namespaceExists(NS)); + Assert.assertFalse( + "Namespace should have been dropped", validationNamespaceCatalog.namespaceExists(NS)); } @Test public void testDropNonEmptyNamespace() { Assume.assumeFalse("Session catalog has flaky behavior", "spark_catalog".equals(catalogName)); - Assert.assertFalse("Namespace should not already exist", validationNamespaceCatalog.namespaceExists(NS)); + Assert.assertFalse( + "Namespace should not already exist", validationNamespaceCatalog.namespaceExists(NS)); sql("CREATE NAMESPACE %s", fullNamespace); sql("CREATE TABLE %s.table (id bigint) USING iceberg", fullNamespace); Assert.assertTrue("Namespace should exist", validationNamespaceCatalog.namespaceExists(NS)); - Assert.assertTrue("Table should exist", validationCatalog.tableExists(TableIdentifier.of(NS, "table"))); + Assert.assertTrue( + "Table should exist", validationCatalog.tableExists(TableIdentifier.of(NS, "table"))); - AssertHelpers.assertThrows("Should fail if trying to delete a non-empty namespace", - SparkException.class, "non-empty namespace", + AssertHelpers.assertThrows( + "Should fail if trying to delete a non-empty namespace", + SparkException.class, + "non-empty namespace", () -> sql("DROP NAMESPACE %s", fullNamespace)); sql("DROP TABLE %s.table", fullNamespace); @@ -108,7 +114,8 @@ public void testDropNonEmptyNamespace() { @Test public void testListTables() { - Assert.assertFalse("Namespace should not already exist", validationNamespaceCatalog.namespaceExists(NS)); + Assert.assertFalse( + "Namespace should not already exist", validationNamespaceCatalog.namespaceExists(NS)); sql("CREATE NAMESPACE %s", fullNamespace); @@ -126,7 +133,8 @@ public void testListTables() { @Test public void testListNamespace() { - Assert.assertFalse("Namespace should not already exist", validationNamespaceCatalog.namespaceExists(NS)); + Assert.assertFalse( + "Namespace should not already exist", validationNamespaceCatalog.namespaceExists(NS)); sql("CREATE NAMESPACE %s", fullNamespace); @@ -136,17 +144,23 @@ public void testListNamespace() { if (isHadoopCatalog) { Assert.assertEquals("Should have 1 namespace", 1, namespaces.size()); - Set namespaceNames = namespaces.stream().map(arr -> arr[0].toString()).collect(Collectors.toSet()); + Set namespaceNames = + namespaces.stream().map(arr -> arr[0].toString()).collect(Collectors.toSet()); Assert.assertEquals("Should have only db namespace", ImmutableSet.of("db"), namespaceNames); } else { Assert.assertEquals("Should have 2 namespaces", 2, namespaces.size()); - Set namespaceNames = namespaces.stream().map(arr -> 
arr[0].toString()).collect(Collectors.toSet()); - Assert.assertEquals("Should have default and db namespaces", ImmutableSet.of("default", "db"), namespaceNames); + Set namespaceNames = + namespaces.stream().map(arr -> arr[0].toString()).collect(Collectors.toSet()); + Assert.assertEquals( + "Should have default and db namespaces", + ImmutableSet.of("default", "db"), + namespaceNames); } List nestedNamespaces = sql("SHOW NAMESPACES IN %s", fullNamespace); - Set nestedNames = nestedNamespaces.stream().map(arr -> arr[0].toString()).collect(Collectors.toSet()); + Set nestedNames = + nestedNamespaces.stream().map(arr -> arr[0].toString()).collect(Collectors.toSet()); Assert.assertEquals("Should not have nested namespaces", ImmutableSet.of(), nestedNames); } @@ -154,7 +168,8 @@ public void testListNamespace() { public void testCreateNamespaceWithMetadata() { Assume.assumeFalse("HadoopCatalog does not support namespace metadata", isHadoopCatalog); - Assert.assertFalse("Namespace should not already exist", validationNamespaceCatalog.namespaceExists(NS)); + Assert.assertFalse( + "Namespace should not already exist", validationNamespaceCatalog.namespaceExists(NS)); sql("CREATE NAMESPACE %s WITH PROPERTIES ('prop'='value')", fullNamespace); @@ -162,14 +177,16 @@ public void testCreateNamespaceWithMetadata() { Map nsMetadata = validationNamespaceCatalog.loadNamespaceMetadata(NS); - Assert.assertEquals("Namespace should have expected prop value", "value", nsMetadata.get("prop")); + Assert.assertEquals( + "Namespace should have expected prop value", "value", nsMetadata.get("prop")); } @Test public void testCreateNamespaceWithComment() { Assume.assumeFalse("HadoopCatalog does not support namespace metadata", isHadoopCatalog); - Assert.assertFalse("Namespace should not already exist", validationNamespaceCatalog.namespaceExists(NS)); + Assert.assertFalse( + "Namespace should not already exist", validationNamespaceCatalog.namespaceExists(NS)); sql("CREATE NAMESPACE %s COMMENT 'namespace doc'", fullNamespace); @@ -177,14 +194,16 @@ public void testCreateNamespaceWithComment() { Map nsMetadata = validationNamespaceCatalog.loadNamespaceMetadata(NS); - Assert.assertEquals("Namespace should have expected comment", "namespace doc", nsMetadata.get("comment")); + Assert.assertEquals( + "Namespace should have expected comment", "namespace doc", nsMetadata.get("comment")); } @Test public void testCreateNamespaceWithLocation() throws Exception { Assume.assumeFalse("HadoopCatalog does not support namespace locations", isHadoopCatalog); - Assert.assertFalse("Namespace should not already exist", validationNamespaceCatalog.namespaceExists(NS)); + Assert.assertFalse( + "Namespace should not already exist", validationNamespaceCatalog.namespaceExists(NS)); File location = temp.newFile(); Assert.assertTrue(location.delete()); @@ -195,27 +214,32 @@ public void testCreateNamespaceWithLocation() throws Exception { Map nsMetadata = validationNamespaceCatalog.loadNamespaceMetadata(NS); - Assert.assertEquals("Namespace should have expected location", - "file:" + location.getPath(), nsMetadata.get("location")); + Assert.assertEquals( + "Namespace should have expected location", + "file:" + location.getPath(), + nsMetadata.get("location")); } @Test public void testSetProperties() { Assume.assumeFalse("HadoopCatalog does not support namespace metadata", isHadoopCatalog); - Assert.assertFalse("Namespace should not already exist", validationNamespaceCatalog.namespaceExists(NS)); + Assert.assertFalse( + "Namespace should not already 
exist", validationNamespaceCatalog.namespaceExists(NS)); sql("CREATE NAMESPACE %s", fullNamespace); Assert.assertTrue("Namespace should exist", validationNamespaceCatalog.namespaceExists(NS)); Map defaultMetadata = validationNamespaceCatalog.loadNamespaceMetadata(NS); - Assert.assertFalse("Default metadata should not have custom property", defaultMetadata.containsKey("prop")); + Assert.assertFalse( + "Default metadata should not have custom property", defaultMetadata.containsKey("prop")); sql("ALTER NAMESPACE %s SET PROPERTIES ('prop'='value')", fullNamespace); Map nsMetadata = validationNamespaceCatalog.loadNamespaceMetadata(NS); - Assert.assertEquals("Namespace should have expected prop value", "value", nsMetadata.get("prop")); + Assert.assertEquals( + "Namespace should have expected prop value", "value", nsMetadata.get("prop")); } } diff --git a/spark/v3.0/spark/src/test/java/org/apache/iceberg/spark/sql/TestPartitionedWrites.java b/spark/v3.0/spark/src/test/java/org/apache/iceberg/spark/sql/TestPartitionedWrites.java index 9223797ada32..51c56ac79d4d 100644 --- a/spark/v3.0/spark/src/test/java/org/apache/iceberg/spark/sql/TestPartitionedWrites.java +++ b/spark/v3.0/spark/src/test/java/org/apache/iceberg/spark/sql/TestPartitionedWrites.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.sql; import java.util.List; @@ -34,13 +33,16 @@ import org.junit.Test; public class TestPartitionedWrites extends SparkCatalogTestBase { - public TestPartitionedWrites(String catalogName, String implementation, Map config) { + public TestPartitionedWrites( + String catalogName, String implementation, Map config) { super(catalogName, implementation, config); } @Before public void createTables() { - sql("CREATE TABLE %s (id bigint, data string) USING iceberg PARTITIONED BY (truncate(id, 3))", tableName); + sql( + "CREATE TABLE %s (id bigint, data string) USING iceberg PARTITIONED BY (truncate(id, 3))", + tableName); sql("INSERT INTO %s VALUES (1, 'a'), (2, 'b'), (3, 'c')", tableName); } @@ -55,17 +57,14 @@ public void testInsertAppend() { sql("INSERT INTO %s VALUES (4, 'd'), (5, 'e')", tableName); - Assert.assertEquals("Should have 5 rows after insert", 5L, scalarSql("SELECT count(*) FROM %s", tableName)); + Assert.assertEquals( + "Should have 5 rows after insert", 5L, scalarSql("SELECT count(*) FROM %s", tableName)); - List expected = ImmutableList.of( - row(1L, "a"), - row(2L, "b"), - row(3L, "c"), - row(4L, "d"), - row(5L, "e") - ); + List expected = + ImmutableList.of(row(1L, "a"), row(2L, "b"), row(3L, "c"), row(4L, "d"), row(5L, "e")); - assertEquals("Row data should match expected", expected, sql("SELECT * FROM %s ORDER BY id", tableName)); + assertEquals( + "Row data should match expected", expected, sql("SELECT * FROM %s ORDER BY id", tableName)); } @Test @@ -75,88 +74,70 @@ public void testInsertOverwrite() { // 4 and 5 replace 3 in the partition (id - (id % 3)) = 3 sql("INSERT OVERWRITE %s VALUES (4, 'd'), (5, 'e')", tableName); - Assert.assertEquals("Should have 4 rows after overwrite", 4L, scalarSql("SELECT count(*) FROM %s", tableName)); + Assert.assertEquals( + "Should have 4 rows after overwrite", 4L, scalarSql("SELECT count(*) FROM %s", tableName)); - List expected = ImmutableList.of( - row(1L, "a"), - row(2L, "b"), - row(4L, "d"), - row(5L, "e") - ); + List expected = + ImmutableList.of(row(1L, "a"), row(2L, "b"), row(4L, "d"), row(5L, "e")); - assertEquals("Row data should match expected", expected, 
sql("SELECT * FROM %s ORDER BY id", tableName)); + assertEquals( + "Row data should match expected", expected, sql("SELECT * FROM %s ORDER BY id", tableName)); } @Test public void testDataFrameV2Append() throws NoSuchTableException { Assert.assertEquals("Should have 3 rows", 3L, scalarSql("SELECT count(*) FROM %s", tableName)); - List data = ImmutableList.of( - new SimpleRecord(4, "d"), - new SimpleRecord(5, "e") - ); + List data = ImmutableList.of(new SimpleRecord(4, "d"), new SimpleRecord(5, "e")); Dataset ds = spark.createDataFrame(data, SimpleRecord.class); ds.writeTo(tableName).append(); - Assert.assertEquals("Should have 5 rows after insert", 5L, scalarSql("SELECT count(*) FROM %s", tableName)); + Assert.assertEquals( + "Should have 5 rows after insert", 5L, scalarSql("SELECT count(*) FROM %s", tableName)); - List expected = ImmutableList.of( - row(1L, "a"), - row(2L, "b"), - row(3L, "c"), - row(4L, "d"), - row(5L, "e") - ); + List expected = + ImmutableList.of(row(1L, "a"), row(2L, "b"), row(3L, "c"), row(4L, "d"), row(5L, "e")); - assertEquals("Row data should match expected", expected, sql("SELECT * FROM %s ORDER BY id", tableName)); + assertEquals( + "Row data should match expected", expected, sql("SELECT * FROM %s ORDER BY id", tableName)); } @Test public void testDataFrameV2DynamicOverwrite() throws NoSuchTableException { Assert.assertEquals("Should have 3 rows", 3L, scalarSql("SELECT count(*) FROM %s", tableName)); - List data = ImmutableList.of( - new SimpleRecord(4, "d"), - new SimpleRecord(5, "e") - ); + List data = ImmutableList.of(new SimpleRecord(4, "d"), new SimpleRecord(5, "e")); Dataset ds = spark.createDataFrame(data, SimpleRecord.class); ds.writeTo(tableName).overwritePartitions(); - Assert.assertEquals("Should have 4 rows after overwrite", 4L, scalarSql("SELECT count(*) FROM %s", tableName)); + Assert.assertEquals( + "Should have 4 rows after overwrite", 4L, scalarSql("SELECT count(*) FROM %s", tableName)); - List expected = ImmutableList.of( - row(1L, "a"), - row(2L, "b"), - row(4L, "d"), - row(5L, "e") - ); + List expected = + ImmutableList.of(row(1L, "a"), row(2L, "b"), row(4L, "d"), row(5L, "e")); - assertEquals("Row data should match expected", expected, sql("SELECT * FROM %s ORDER BY id", tableName)); + assertEquals( + "Row data should match expected", expected, sql("SELECT * FROM %s ORDER BY id", tableName)); } @Test public void testDataFrameV2Overwrite() throws NoSuchTableException { Assert.assertEquals("Should have 3 rows", 3L, scalarSql("SELECT count(*) FROM %s", tableName)); - List data = ImmutableList.of( - new SimpleRecord(4, "d"), - new SimpleRecord(5, "e") - ); + List data = ImmutableList.of(new SimpleRecord(4, "d"), new SimpleRecord(5, "e")); Dataset ds = spark.createDataFrame(data, SimpleRecord.class); ds.writeTo(tableName).overwrite(functions.col("id").$less(3)); - Assert.assertEquals("Should have 3 rows after overwrite", 3L, scalarSql("SELECT count(*) FROM %s", tableName)); + Assert.assertEquals( + "Should have 3 rows after overwrite", 3L, scalarSql("SELECT count(*) FROM %s", tableName)); - List expected = ImmutableList.of( - row(3L, "c"), - row(4L, "d"), - row(5L, "e") - ); + List expected = ImmutableList.of(row(3L, "c"), row(4L, "d"), row(5L, "e")); - assertEquals("Row data should match expected", expected, sql("SELECT * FROM %s ORDER BY id", tableName)); + assertEquals( + "Row data should match expected", expected, sql("SELECT * FROM %s ORDER BY id", tableName)); } @Test @@ -166,13 +147,13 @@ public void testViewsReturnRecentResults() { 
Dataset<Row> query = spark.sql("SELECT * FROM " + tableName + " WHERE id = 1"); query.createOrReplaceTempView("tmp"); - assertEquals("View should have expected rows", - ImmutableList.of(row(1L, "a")), - sql("SELECT * FROM tmp")); + assertEquals( + "View should have expected rows", ImmutableList.of(row(1L, "a")), sql("SELECT * FROM tmp")); sql("INSERT INTO TABLE %s VALUES (1, 'a')", tableName); - assertEquals("View should have expected rows", + assertEquals( + "View should have expected rows", ImmutableList.of(row(1L, "a"), row(1L, "a")), sql("SELECT * FROM tmp")); } } diff --git a/spark/v3.0/spark/src/test/java/org/apache/iceberg/spark/sql/TestRefreshTable.java b/spark/v3.0/spark/src/test/java/org/apache/iceberg/spark/sql/TestRefreshTable.java index 7489c3963d3a..555ef440074b 100644 --- a/spark/v3.0/spark/src/test/java/org/apache/iceberg/spark/sql/TestRefreshTable.java +++ b/spark/v3.0/spark/src/test/java/org/apache/iceberg/spark/sql/TestRefreshTable.java @@ -16,8 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - - package org.apache.iceberg.spark.sql; import java.util.List; @@ -51,10 +49,12 @@ public void removeTables() { @Test public void testRefreshCommand() { - Assume.assumeFalse("Spark 3.0 Spark Session Catalog does not use V2 Catalogs so Iceberg refresh is impossible", + Assume.assumeFalse( + "Spark 3.0 Spark Session Catalog does not use V2 Catalogs so Iceberg refresh is impossible", Spark3VersionUtil.isSpark30() && catalogName.equals("spark_catalog")); - // We are not allowed to change the session catalog after it has been initialized, so build a new one + // We are not allowed to change the session catalog after it has been initialized, so build a + // new one if (catalogName.equals("spark_catalog")) { spark.conf().set("spark.sql.catalog." + catalogName + ".cache-enabled", true); spark = spark.cloneSession(); @@ -64,7 +64,8 @@ public void testRefreshCommand() { List<Object[]> originalActual = sql("SELECT * FROM %s", tableName); assertEquals("Table should start as expected", originalExpected, originalActual); - // Modify table outside of spark, it should be cached so Spark should see the same value after mutation + // Modify table outside of spark, it should be cached so Spark should see the same value after + // mutation Table table = validationCatalog.loadTable(tableIdent); DataFile file = table.currentSnapshot().addedDataFiles(table.io()).iterator().next(); table.newDelete().deleteFile(file).commit(); diff --git a/spark/v3.0/spark/src/test/java/org/apache/iceberg/spark/sql/TestSelect.java b/spark/v3.0/spark/src/test/java/org/apache/iceberg/spark/sql/TestSelect.java index 38dfe0e5afda..f20ded4c7b2a 100644 --- a/spark/v3.0/spark/src/test/java/org/apache/iceberg/spark/sql/TestSelect.java +++ b/spark/v3.0/spark/src/test/java/org/apache/iceberg/spark/sql/TestSelect.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License.
*/ - package org.apache.iceberg.spark.sql; import java.util.List; @@ -45,10 +44,12 @@ public TestSelect(String catalogName, String implementation, Map super(catalogName, implementation, config); // register a scan event listener to validate pushdown - Listeners.register(event -> { - scanEventCount += 1; - lastScanEvent = event; - }, ScanEvent.class); + Listeners.register( + event -> { + scanEventCount += 1; + lastScanEvent = event; + }, + ScanEvent.class); } @Before @@ -67,8 +68,8 @@ public void removeTables() { @Test public void testSelect() { - List expected = ImmutableList.of( - row(1L, "a", 1.0F), row(2L, "b", 2.0F), row(3L, "c", Float.NaN)); + List expected = + ImmutableList.of(row(1L, "a", 1.0F), row(2L, "b", 2.0F), row(3L, "c", Float.NaN)); assertEquals("Should return all expected rows", expected, sql("SELECT * FROM %s", tableName)); } @@ -77,11 +78,14 @@ public void testSelect() { public void testSelectRewrite() { List expected = ImmutableList.of(row(3L, "c", Float.NaN)); - assertEquals("Should return all expected rows", expected, + assertEquals( + "Should return all expected rows", + expected, sql("SELECT * FROM %s where float = float('NaN')", tableName)); Assert.assertEquals("Should create only one scan", 1, scanEventCount); - Assert.assertEquals("Should push down expected filter", + Assert.assertEquals( + "Should push down expected filter", "(float IS NOT NULL AND is_nan(float))", Spark3Util.describe(lastScanEvent.filter())); } @@ -93,8 +97,10 @@ public void testProjection() { assertEquals("Should return all expected rows", expected, sql("SELECT id FROM %s", tableName)); Assert.assertEquals("Should create only one scan", 1, scanEventCount); - Assert.assertEquals("Should not push down a filter", Expressions.alwaysTrue(), lastScanEvent.filter()); - Assert.assertEquals("Should project only the id column", + Assert.assertEquals( + "Should not push down a filter", Expressions.alwaysTrue(), lastScanEvent.filter()); + Assert.assertEquals( + "Should project only the id column", validationCatalog.loadTable(tableIdent).schema().select("id").asStruct(), lastScanEvent.projection().asStruct()); } @@ -103,13 +109,18 @@ public void testProjection() { public void testExpressionPushdown() { List expected = ImmutableList.of(row("b")); - assertEquals("Should return all expected rows", expected, sql("SELECT data FROM %s WHERE id = 2", tableName)); + assertEquals( + "Should return all expected rows", + expected, + sql("SELECT data FROM %s WHERE id = 2", tableName)); Assert.assertEquals("Should create only one scan", 1, scanEventCount); - Assert.assertEquals("Should push down expected filter", + Assert.assertEquals( + "Should push down expected filter", "(id IS NOT NULL AND id = 2)", Spark3Util.describe(lastScanEvent.filter())); - Assert.assertEquals("Should project only id and data columns", + Assert.assertEquals( + "Should project only id and data columns", validationCatalog.loadTable(tableIdent).schema().select("id", "data").asStruct(), lastScanEvent.projection().asStruct()); } @@ -120,7 +131,8 @@ public void testMetadataTables() { "Spark session catalog does not support metadata tables", "spark_catalog".equals(catalogName)); - assertEquals("Snapshot metadata table", + assertEquals( + "Snapshot metadata table", ImmutableList.of(row(ANY, ANY, null, "append", ANY, ANY)), sql("SELECT * FROM %s.snapshots", tableName)); } @@ -144,10 +156,12 @@ public void testSnapshotInTableName() { assertEquals("Snapshot at specific ID, prefix " + prefix, expected, actual); // read the table using DataFrameReader 
option - Dataset df = spark.read() - .format("iceberg") - .option(SparkReadOptions.SNAPSHOT_ID, snapshotId) - .load(tableName); + Dataset df = + spark + .read() + .format("iceberg") + .option(SparkReadOptions.SNAPSHOT_ID, snapshotId) + .load(tableName); List fromDF = rowsToJava(df.collectAsList()); assertEquals("Snapshot at specific ID " + snapshotId, expected, fromDF); } @@ -172,10 +186,12 @@ public void testTimestampInTableName() { assertEquals("Snapshot at timestamp, prefix " + prefix, expected, actual); // read the table using DataFrameReader option - Dataset df = spark.read() - .format("iceberg") - .option(SparkReadOptions.AS_OF_TIMESTAMP, timestamp) - .load(tableName); + Dataset df = + spark + .read() + .format("iceberg") + .option(SparkReadOptions.AS_OF_TIMESTAMP, timestamp) + .load(tableName); List fromDF = rowsToJava(df.collectAsList()); assertEquals("Snapshot at timestamp " + timestamp, expected, fromDF); } @@ -185,22 +201,25 @@ public void testSpecifySnapshotAndTimestamp() { // get the snapshot ID of the last write long snapshotId = validationCatalog.loadTable(tableIdent).currentSnapshot().snapshotId(); // get a timestamp just after the last write - long timestamp = validationCatalog.loadTable(tableIdent).currentSnapshot().timestampMillis() + 2; + long timestamp = + validationCatalog.loadTable(tableIdent).currentSnapshot().timestampMillis() + 2; // create a second snapshot sql("INSERT INTO %s VALUES (4, 'd', 4.0), (5, 'e', 5.0)", tableName); - AssertHelpers.assertThrows("Should not be able to specify both snapshot id and timestamp", + AssertHelpers.assertThrows( + "Should not be able to specify both snapshot id and timestamp", IllegalArgumentException.class, - String.format("Cannot specify both snapshot-id (%s) and as-of-timestamp (%s)", - snapshotId, timestamp), + String.format( + "Cannot specify both snapshot-id (%s) and as-of-timestamp (%s)", snapshotId, timestamp), () -> { - spark.read() - .format("iceberg") - .option(SparkReadOptions.SNAPSHOT_ID, snapshotId) - .option(SparkReadOptions.AS_OF_TIMESTAMP, timestamp) - .load(tableName) - .collectAsList(); + spark + .read() + .format("iceberg") + .option(SparkReadOptions.SNAPSHOT_ID, snapshotId) + .option(SparkReadOptions.AS_OF_TIMESTAMP, timestamp) + .load(tableName) + .collectAsList(); }); } } diff --git a/spark/v3.0/spark/src/test/java/org/apache/iceberg/spark/sql/TestTimestampWithoutZone.java b/spark/v3.0/spark/src/test/java/org/apache/iceberg/spark/sql/TestTimestampWithoutZone.java index ddaac5256e10..51b8d255a99b 100644 --- a/spark/v3.0/spark/src/test/java/org/apache/iceberg/spark/sql/TestTimestampWithoutZone.java +++ b/spark/v3.0/spark/src/test/java/org/apache/iceberg/spark/sql/TestTimestampWithoutZone.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.spark.sql; import java.sql.Timestamp; @@ -50,32 +49,35 @@ public class TestTimestampWithoutZone extends SparkCatalogTestBase { private static final String newTableName = "created_table"; private final Map config; - private static final Schema schema = new Schema( + private static final Schema schema = + new Schema( Types.NestedField.required(1, "id", Types.LongType.get()), Types.NestedField.required(2, "ts", Types.TimestampType.withoutZone()), - Types.NestedField.required(3, "tsz", Types.TimestampType.withZone()) - ); + Types.NestedField.required(3, "tsz", Types.TimestampType.withZone())); - private final List values = ImmutableList.of( + private final List values = + ImmutableList.of( row(1L, toTimestamp("2021-01-01T00:00:00.0"), toTimestamp("2021-02-01T00:00:00.0")), row(2L, toTimestamp("2021-01-01T00:00:00.0"), toTimestamp("2021-02-01T00:00:00.0")), - row(3L, toTimestamp("2021-01-01T00:00:00.0"), toTimestamp("2021-02-01T00:00:00.0")) - ); + row(3L, toTimestamp("2021-01-01T00:00:00.0"), toTimestamp("2021-02-01T00:00:00.0"))); @Parameterized.Parameters(name = "catalogName = {0}, implementation = {1}, config = {2}") public static Object[][] parameters() { - return new Object[][]{{"spark_catalog", - SparkSessionCatalog.class.getName(), - ImmutableMap.of( - "type", "hive", - "default-namespace", "default", - "parquet-enabled", "true", - "cache-enabled", "false" - )} + return new Object[][] { + { + "spark_catalog", + SparkSessionCatalog.class.getName(), + ImmutableMap.of( + "type", "hive", + "default-namespace", "default", + "parquet-enabled", "true", + "cache-enabled", "false") + } }; } - public TestTimestampWithoutZone(String catalogName, String implementation, Map config) { + public TestTimestampWithoutZone( + String catalogName, String implementation, Map config) { super(catalogName, implementation, config); this.config = config; } @@ -94,8 +96,10 @@ public void removeTables() { @Test public void testWriteTimestampWithoutZoneError() { AssertHelpers.assertThrows( - String.format("Write operation performed on a timestamp without timezone field while " + - "'%s' set to false should throw exception", SparkSQLProperties.HANDLE_TIMESTAMP_WITHOUT_TIMEZONE), + String.format( + "Write operation performed on a timestamp without timezone field while " + + "'%s' set to false should throw exception", + SparkSQLProperties.HANDLE_TIMESTAMP_WITHOUT_TIMEZONE), IllegalArgumentException.class, SparkUtil.TIMESTAMP_WITHOUT_TIMEZONE_ERROR, () -> sql("INSERT INTO %s VALUES %s", tableName, rowToSqlValues(values))); @@ -103,72 +107,98 @@ public void testWriteTimestampWithoutZoneError() { @Test public void testAppendTimestampWithoutZone() { - withSQLConf(ImmutableMap.of(SparkSQLProperties.HANDLE_TIMESTAMP_WITHOUT_TIMEZONE, "true"), () -> { - sql("INSERT INTO %s VALUES %s", tableName, rowToSqlValues(values)); - - Assert.assertEquals("Should have " + values.size() + " row", - (long) values.size(), scalarSql("SELECT count(*) FROM %s", tableName)); - - assertEquals("Row data should match expected", - values, sql("SELECT * FROM %s ORDER BY id", tableName)); - }); + withSQLConf( + ImmutableMap.of(SparkSQLProperties.HANDLE_TIMESTAMP_WITHOUT_TIMEZONE, "true"), + () -> { + sql("INSERT INTO %s VALUES %s", tableName, rowToSqlValues(values)); + + Assert.assertEquals( + "Should have " + values.size() + " row", + (long) values.size(), + scalarSql("SELECT count(*) FROM %s", tableName)); + + assertEquals( + "Row data should match expected", + values, + sql("SELECT * FROM %s ORDER BY id", 
tableName)); + }); } @Test public void testCreateAsSelectWithTimestampWithoutZone() { - withSQLConf(ImmutableMap.of(SparkSQLProperties.HANDLE_TIMESTAMP_WITHOUT_TIMEZONE, "true"), () -> { - sql("INSERT INTO %s VALUES %s", tableName, rowToSqlValues(values)); + withSQLConf( + ImmutableMap.of(SparkSQLProperties.HANDLE_TIMESTAMP_WITHOUT_TIMEZONE, "true"), + () -> { + sql("INSERT INTO %s VALUES %s", tableName, rowToSqlValues(values)); - sql("CREATE TABLE %s USING iceberg AS SELECT * FROM %s", newTableName, tableName); + sql("CREATE TABLE %s USING iceberg AS SELECT * FROM %s", newTableName, tableName); - Assert.assertEquals("Should have " + values.size() + " row", (long) values.size(), + Assert.assertEquals( + "Should have " + values.size() + " row", + (long) values.size(), scalarSql("SELECT count(*) FROM %s", newTableName)); - assertEquals("Row data should match expected", + assertEquals( + "Row data should match expected", sql("SELECT * FROM %s ORDER BY id", tableName), sql("SELECT * FROM %s ORDER BY id", newTableName)); - }); + }); } @Test public void testCreateNewTableShouldHaveTimestampWithZoneIcebergType() { - withSQLConf(ImmutableMap.of(SparkSQLProperties.HANDLE_TIMESTAMP_WITHOUT_TIMEZONE, "true"), () -> { - sql("INSERT INTO %s VALUES %s", tableName, rowToSqlValues(values)); + withSQLConf( + ImmutableMap.of(SparkSQLProperties.HANDLE_TIMESTAMP_WITHOUT_TIMEZONE, "true"), + () -> { + sql("INSERT INTO %s VALUES %s", tableName, rowToSqlValues(values)); - sql("CREATE TABLE %s USING iceberg AS SELECT * FROM %s", newTableName, tableName); + sql("CREATE TABLE %s USING iceberg AS SELECT * FROM %s", newTableName, tableName); - Assert.assertEquals("Should have " + values.size() + " row", (long) values.size(), + Assert.assertEquals( + "Should have " + values.size() + " row", + (long) values.size(), scalarSql("SELECT count(*) FROM %s", newTableName)); - assertEquals("Data from created table should match data from base table", + assertEquals( + "Data from created table should match data from base table", sql("SELECT * FROM %s ORDER BY id", tableName), sql("SELECT * FROM %s ORDER BY id", newTableName)); - Table createdTable = validationCatalog.loadTable(TableIdentifier.of("default", newTableName)); - assertFieldsType(createdTable.schema(), Types.TimestampType.withZone(), "ts", "tsz"); - }); + Table createdTable = + validationCatalog.loadTable(TableIdentifier.of("default", newTableName)); + assertFieldsType(createdTable.schema(), Types.TimestampType.withZone(), "ts", "tsz"); + }); } @Test public void testCreateNewTableShouldHaveTimestampWithoutZoneIcebergType() { - withSQLConf(ImmutableMap.of( + withSQLConf( + ImmutableMap.of( SparkSQLProperties.HANDLE_TIMESTAMP_WITHOUT_TIMEZONE, "true", - SparkSQLProperties.USE_TIMESTAMP_WITHOUT_TIME_ZONE_IN_NEW_TABLES, "true"), () -> { - spark.sessionState().catalogManager().currentCatalog() + SparkSQLProperties.USE_TIMESTAMP_WITHOUT_TIME_ZONE_IN_NEW_TABLES, "true"), + () -> { + spark + .sessionState() + .catalogManager() + .currentCatalog() .initialize(catalog.name(), new CaseInsensitiveStringMap(config)); - sql("INSERT INTO %s VALUES %s", tableName, rowToSqlValues(values)); + sql("INSERT INTO %s VALUES %s", tableName, rowToSqlValues(values)); - sql("CREATE TABLE %s USING iceberg AS SELECT * FROM %s", newTableName, tableName); + sql("CREATE TABLE %s USING iceberg AS SELECT * FROM %s", newTableName, tableName); - Assert.assertEquals("Should have " + values.size() + " row", (long) values.size(), + Assert.assertEquals( + "Should have " + values.size() + " row", + (long) 
values.size(), scalarSql("SELECT count(*) FROM %s", newTableName)); - assertEquals("Row data should match expected", + assertEquals( + "Row data should match expected", sql("SELECT * FROM %s ORDER BY id", tableName), sql("SELECT * FROM %s ORDER BY id", newTableName)); - Table createdTable = validationCatalog.loadTable(TableIdentifier.of("default", newTableName)); - assertFieldsType(createdTable.schema(), Types.TimestampType.withoutZone(), "ts", "tsz"); - }); + Table createdTable = + validationCatalog.loadTable(TableIdentifier.of("default", newTableName)); + assertFieldsType(createdTable.schema(), Types.TimestampType.withoutZone(), "ts", "tsz"); + }); } private Timestamp toTimestamp(String value) { @@ -176,21 +206,33 @@ private Timestamp toTimestamp(String value) { } private String rowToSqlValues(List rows) { - List rowValues = rows.stream().map(row -> { - List columns = Arrays.stream(row).map(value -> { - if (value instanceof Long) { - return value.toString(); - } else if (value instanceof Timestamp) { - return String.format("timestamp '%s'", value); - } - throw new RuntimeException("Type is not supported"); - }).collect(Collectors.toList()); - return "(" + Joiner.on(",").join(columns) + ")"; - }).collect(Collectors.toList()); + List rowValues = + rows.stream() + .map( + row -> { + List columns = + Arrays.stream(row) + .map( + value -> { + if (value instanceof Long) { + return value.toString(); + } else if (value instanceof Timestamp) { + return String.format("timestamp '%s'", value); + } + throw new RuntimeException("Type is not supported"); + }) + .collect(Collectors.toList()); + return "(" + Joiner.on(",").join(columns) + ")"; + }) + .collect(Collectors.toList()); return Joiner.on(",").join(rowValues); } private void assertFieldsType(Schema actual, Type.PrimitiveType expected, String... fields) { - actual.select(fields).asStruct().fields().forEach(field -> Assert.assertEquals(expected, field.type())); + actual + .select(fields) + .asStruct() + .fields() + .forEach(field -> Assert.assertEquals(expected, field.type())); } } diff --git a/spark/v3.0/spark/src/test/java/org/apache/iceberg/spark/sql/TestUnpartitionedWrites.java b/spark/v3.0/spark/src/test/java/org/apache/iceberg/spark/sql/TestUnpartitionedWrites.java index 7a6ea0996e22..8db2fefabdfb 100644 --- a/spark/v3.0/spark/src/test/java/org/apache/iceberg/spark/sql/TestUnpartitionedWrites.java +++ b/spark/v3.0/spark/src/test/java/org/apache/iceberg/spark/sql/TestUnpartitionedWrites.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.spark.sql; import java.util.List; @@ -36,7 +35,8 @@ import org.junit.Test; public class TestUnpartitionedWrites extends SparkCatalogTestBase { - public TestUnpartitionedWrites(String catalogName, String implementation, Map config) { + public TestUnpartitionedWrites( + String catalogName, String implementation, Map config) { super(catalogName, implementation, config); } @@ -57,17 +57,14 @@ public void testInsertAppend() { sql("INSERT INTO %s VALUES (4, 'd'), (5, 'e')", tableName); - Assert.assertEquals("Should have 5 rows after insert", 5L, scalarSql("SELECT count(*) FROM %s", tableName)); + Assert.assertEquals( + "Should have 5 rows after insert", 5L, scalarSql("SELECT count(*) FROM %s", tableName)); - List expected = ImmutableList.of( - row(1L, "a"), - row(2L, "b"), - row(3L, "c"), - row(4L, "d"), - row(5L, "e") - ); + List expected = + ImmutableList.of(row(1L, "a"), row(2L, "b"), row(3L, "c"), row(4L, "d"), row(5L, "e")); - assertEquals("Row data should match expected", expected, sql("SELECT * FROM %s ORDER BY id", tableName)); + assertEquals( + "Row data should match expected", expected, sql("SELECT * FROM %s ORDER BY id", tableName)); } @Test @@ -76,14 +73,13 @@ public void testInsertOverwrite() { sql("INSERT OVERWRITE %s VALUES (4, 'd'), (5, 'e')", tableName); - Assert.assertEquals("Should have 2 rows after overwrite", 2L, scalarSql("SELECT count(*) FROM %s", tableName)); + Assert.assertEquals( + "Should have 2 rows after overwrite", 2L, scalarSql("SELECT count(*) FROM %s", tableName)); - List expected = ImmutableList.of( - row(4L, "d"), - row(5L, "e") - ); + List expected = ImmutableList.of(row(4L, "d"), row(5L, "e")); - assertEquals("Row data should match expected", expected, sql("SELECT * FROM %s ORDER BY id", tableName)); + assertEquals( + "Row data should match expected", expected, sql("SELECT * FROM %s ORDER BY id", tableName)); } @Test @@ -94,8 +90,10 @@ public void testInsertAppendAtSnapshot() { long snapshotId = validationCatalog.loadTable(tableIdent).currentSnapshot().snapshotId(); String prefix = "snapshot_id_"; - AssertHelpers.assertThrows("Should not be able to insert into a table at a specific snapshot", - IllegalArgumentException.class, "Cannot write to table at a specific snapshot", + AssertHelpers.assertThrows( + "Should not be able to insert into a table at a specific snapshot", + IllegalArgumentException.class, + "Cannot write to table at a specific snapshot", () -> sql("INSERT INTO %s.%s VALUES (4, 'd'), (5, 'e')", tableName, prefix + snapshotId)); } @@ -107,77 +105,68 @@ public void testInsertOverwriteAtSnapshot() { long snapshotId = validationCatalog.loadTable(tableIdent).currentSnapshot().snapshotId(); String prefix = "snapshot_id_"; - AssertHelpers.assertThrows("Should not be able to insert into a table at a specific snapshot", - IllegalArgumentException.class, "Cannot write to table at a specific snapshot", - () -> sql("INSERT OVERWRITE %s.%s VALUES (4, 'd'), (5, 'e')", tableName, prefix + snapshotId)); + AssertHelpers.assertThrows( + "Should not be able to insert into a table at a specific snapshot", + IllegalArgumentException.class, + "Cannot write to table at a specific snapshot", + () -> + sql( + "INSERT OVERWRITE %s.%s VALUES (4, 'd'), (5, 'e')", + tableName, prefix + snapshotId)); } @Test public void testDataFrameV2Append() throws NoSuchTableException { Assert.assertEquals("Should have 3 rows", 3L, scalarSql("SELECT count(*) FROM %s", tableName)); - List data = ImmutableList.of( - new SimpleRecord(4, "d"), - new 
SimpleRecord(5, "e") - ); + List data = ImmutableList.of(new SimpleRecord(4, "d"), new SimpleRecord(5, "e")); Dataset ds = spark.createDataFrame(data, SimpleRecord.class); ds.writeTo(tableName).append(); - Assert.assertEquals("Should have 5 rows after insert", 5L, scalarSql("SELECT count(*) FROM %s", tableName)); + Assert.assertEquals( + "Should have 5 rows after insert", 5L, scalarSql("SELECT count(*) FROM %s", tableName)); - List expected = ImmutableList.of( - row(1L, "a"), - row(2L, "b"), - row(3L, "c"), - row(4L, "d"), - row(5L, "e") - ); + List expected = + ImmutableList.of(row(1L, "a"), row(2L, "b"), row(3L, "c"), row(4L, "d"), row(5L, "e")); - assertEquals("Row data should match expected", expected, sql("SELECT * FROM %s ORDER BY id", tableName)); + assertEquals( + "Row data should match expected", expected, sql("SELECT * FROM %s ORDER BY id", tableName)); } @Test public void testDataFrameV2DynamicOverwrite() throws NoSuchTableException { Assert.assertEquals("Should have 3 rows", 3L, scalarSql("SELECT count(*) FROM %s", tableName)); - List data = ImmutableList.of( - new SimpleRecord(4, "d"), - new SimpleRecord(5, "e") - ); + List data = ImmutableList.of(new SimpleRecord(4, "d"), new SimpleRecord(5, "e")); Dataset ds = spark.createDataFrame(data, SimpleRecord.class); ds.writeTo(tableName).overwritePartitions(); - Assert.assertEquals("Should have 2 rows after overwrite", 2L, scalarSql("SELECT count(*) FROM %s", tableName)); + Assert.assertEquals( + "Should have 2 rows after overwrite", 2L, scalarSql("SELECT count(*) FROM %s", tableName)); - List expected = ImmutableList.of( - row(4L, "d"), - row(5L, "e") - ); + List expected = ImmutableList.of(row(4L, "d"), row(5L, "e")); - assertEquals("Row data should match expected", expected, sql("SELECT * FROM %s ORDER BY id", tableName)); + assertEquals( + "Row data should match expected", expected, sql("SELECT * FROM %s ORDER BY id", tableName)); } @Test public void testDataFrameV2Overwrite() throws NoSuchTableException { Assert.assertEquals("Should have 3 rows", 3L, scalarSql("SELECT count(*) FROM %s", tableName)); - List data = ImmutableList.of( - new SimpleRecord(4, "d"), - new SimpleRecord(5, "e") - ); + List data = ImmutableList.of(new SimpleRecord(4, "d"), new SimpleRecord(5, "e")); Dataset ds = spark.createDataFrame(data, SimpleRecord.class); ds.writeTo(tableName).overwrite(functions.col("id").$less$eq(3)); - Assert.assertEquals("Should have 2 rows after overwrite", 2L, scalarSql("SELECT count(*) FROM %s", tableName)); + Assert.assertEquals( + "Should have 2 rows after overwrite", 2L, scalarSql("SELECT count(*) FROM %s", tableName)); - List expected = ImmutableList.of( - row(4L, "d"), - row(5L, "e") - ); + List expected = ImmutableList.of(row(4L, "d"), row(5L, "e")); - assertEquals("Row data should match expected", expected, sql("SELECT * FROM %s ORDER BY id", tableName)); + assertEquals( + "Row data should match expected", expected, sql("SELECT * FROM %s ORDER BY id", tableName)); } } diff --git a/spark/v3.1/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/Employee.java b/spark/v3.1/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/Employee.java index 51ac57855bbe..8918dfec6584 100644 --- a/spark/v3.1/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/Employee.java +++ b/spark/v3.1/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/Employee.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.spark.extensions; import java.util.Objects; @@ -25,8 +24,7 @@ public class Employee { private Integer id; private String dep; - public Employee() { - } + public Employee() {} public Employee(Integer id, String dep) { this.id = id; diff --git a/spark/v3.1/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/SparkExtensionsTestBase.java b/spark/v3.1/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/SparkExtensionsTestBase.java index 36ca608ccd3b..0a1cf7520463 100644 --- a/spark/v3.1/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/SparkExtensionsTestBase.java +++ b/spark/v3.1/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/SparkExtensionsTestBase.java @@ -16,9 +16,10 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.extensions; +import static org.apache.hadoop.hive.conf.HiveConf.ConfVars.METASTOREURIS; + import java.util.Map; import org.apache.iceberg.CatalogUtil; import org.apache.iceberg.hive.HiveCatalog; @@ -30,11 +31,10 @@ import org.apache.spark.sql.internal.SQLConf; import org.junit.BeforeClass; -import static org.apache.hadoop.hive.conf.HiveConf.ConfVars.METASTOREURIS; - public abstract class SparkExtensionsTestBase extends SparkCatalogTestBase { - public SparkExtensionsTestBase(String catalogName, String implementation, Map<String, String> config) { + public SparkExtensionsTestBase( + String catalogName, String implementation, Map<String, String> config) { super(catalogName, implementation, config); } @@ -44,18 +44,21 @@ public static void startMetastoreAndSpark() { metastore.start(); SparkTestBase.hiveConf = metastore.hiveConf(); - SparkTestBase.spark = SparkSession.builder() - .master("local[2]") - .config("spark.testing", "true") - .config(SQLConf.PARTITION_OVERWRITE_MODE().key(), "dynamic") - .config("spark.sql.extensions", IcebergSparkSessionExtensions.class.getName()) - .config("spark.hadoop." + METASTOREURIS.varname, hiveConf.get(METASTOREURIS.varname)) - .config("spark.sql.shuffle.partitions", "4") - .config("spark.sql.hive.metastorePartitionPruningFallbackOnException", "true") - .enableHiveSupport() - .getOrCreate(); + SparkTestBase.spark = + SparkSession.builder() + .master("local[2]") + .config("spark.testing", "true") + .config(SQLConf.PARTITION_OVERWRITE_MODE().key(), "dynamic") + .config("spark.sql.extensions", IcebergSparkSessionExtensions.class.getName()) + .config("spark.hadoop."
+ METASTOREURIS.varname, hiveConf.get(METASTOREURIS.varname)) + .config("spark.sql.shuffle.partitions", "4") + .config("spark.sql.hive.metastorePartitionPruningFallbackOnException", "true") + .enableHiveSupport() + .getOrCreate(); - SparkTestBase.catalog = (HiveCatalog) - CatalogUtil.loadCatalog(HiveCatalog.class.getName(), "hive", ImmutableMap.of(), hiveConf); + SparkTestBase.catalog = + (HiveCatalog) + CatalogUtil.loadCatalog( + HiveCatalog.class.getName(), "hive", ImmutableMap.of(), hiveConf); } } diff --git a/spark/v3.1/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/SparkRowLevelOperationsTestBase.java b/spark/v3.1/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/SparkRowLevelOperationsTestBase.java index 1dc0bd4c9c77..37f6dc37d580 100644 --- a/spark/v3.1/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/SparkRowLevelOperationsTestBase.java +++ b/spark/v3.1/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/SparkRowLevelOperationsTestBase.java @@ -16,9 +16,15 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.extensions; +import static org.apache.iceberg.TableProperties.DEFAULT_FILE_FORMAT; +import static org.apache.iceberg.TableProperties.PARQUET_VECTORIZATION_ENABLED; +import static org.apache.iceberg.TableProperties.WRITE_DISTRIBUTION_MODE; +import static org.apache.iceberg.TableProperties.WRITE_DISTRIBUTION_MODE_HASH; +import static org.apache.iceberg.TableProperties.WRITE_DISTRIBUTION_MODE_NONE; +import static org.apache.iceberg.TableProperties.WRITE_DISTRIBUTION_MODE_RANGE; + import java.util.Arrays; import java.util.List; import java.util.Map; @@ -39,13 +45,6 @@ import org.junit.runners.Parameterized; import org.junit.runners.Parameterized.Parameters; -import static org.apache.iceberg.TableProperties.DEFAULT_FILE_FORMAT; -import static org.apache.iceberg.TableProperties.PARQUET_VECTORIZATION_ENABLED; -import static org.apache.iceberg.TableProperties.WRITE_DISTRIBUTION_MODE; -import static org.apache.iceberg.TableProperties.WRITE_DISTRIBUTION_MODE_HASH; -import static org.apache.iceberg.TableProperties.WRITE_DISTRIBUTION_MODE_NONE; -import static org.apache.iceberg.TableProperties.WRITE_DISTRIBUTION_MODE_RANGE; - @RunWith(Parameterized.class) public abstract class SparkRowLevelOperationsTestBase extends SparkExtensionsTestBase { @@ -55,49 +54,58 @@ public abstract class SparkRowLevelOperationsTestBase extends SparkExtensionsTes protected final boolean vectorized; protected final String distributionMode; - public SparkRowLevelOperationsTestBase(String catalogName, String implementation, - Map config, String fileFormat, - boolean vectorized, - String distributionMode) { + public SparkRowLevelOperationsTestBase( + String catalogName, + String implementation, + Map config, + String fileFormat, + boolean vectorized, + String distributionMode) { super(catalogName, implementation, config); this.fileFormat = fileFormat; this.vectorized = vectorized; this.distributionMode = distributionMode; } - @Parameters(name = "catalogName = {0}, implementation = {1}, config = {2}," + - " format = {3}, vectorized = {4}, distributionMode = {5}") + @Parameters( + name = + "catalogName = {0}, implementation = {1}, config = {2}," + + " format = {3}, vectorized = {4}, distributionMode = {5}") public static Object[][] parameters() { return new Object[][] { - { "testhive", SparkCatalog.class.getName(), - ImmutableMap.of( - "type", "hive", - "default-namespace", "default" 
- ), - "orc", - true, - WRITE_DISTRIBUTION_MODE_NONE - }, - { "testhadoop", SparkCatalog.class.getName(), - ImmutableMap.of( - "type", "hadoop" + { + "testhive", + SparkCatalog.class.getName(), + ImmutableMap.of( + "type", "hive", + "default-namespace", "default"), + "orc", + true, + WRITE_DISTRIBUTION_MODE_NONE + }, + { + "testhadoop", + SparkCatalog.class.getName(), + ImmutableMap.of("type", "hadoop"), + "parquet", + RANDOM.nextBoolean(), + WRITE_DISTRIBUTION_MODE_HASH + }, + { + "spark_catalog", + SparkSessionCatalog.class.getName(), + ImmutableMap.of( + "type", "hive", + "default-namespace", "default", + "clients", "1", + "parquet-enabled", "false", + "cache-enabled", + "false" // Spark will delete tables using v1, leaving the cache out of sync ), - "parquet", - RANDOM.nextBoolean(), - WRITE_DISTRIBUTION_MODE_HASH - }, - { "spark_catalog", SparkSessionCatalog.class.getName(), - ImmutableMap.of( - "type", "hive", - "default-namespace", "default", - "clients", "1", - "parquet-enabled", "false", - "cache-enabled", "false" // Spark will delete tables using v1, leaving the cache out of sync - ), - "avro", - false, - WRITE_DISTRIBUTION_MODE_RANGE - } + "avro", + false, + WRITE_DISTRIBUTION_MODE_RANGE + } }; } @@ -105,11 +113,15 @@ public static Object[][] parameters() { protected void initTable() { sql("ALTER TABLE %s SET TBLPROPERTIES('%s' '%s')", tableName, DEFAULT_FILE_FORMAT, fileFormat); - sql("ALTER TABLE %s SET TBLPROPERTIES('%s' '%s')", tableName, WRITE_DISTRIBUTION_MODE, distributionMode); + sql( + "ALTER TABLE %s SET TBLPROPERTIES('%s' '%s')", + tableName, WRITE_DISTRIBUTION_MODE, distributionMode); switch (fileFormat) { case "parquet": - sql("ALTER TABLE %s SET TBLPROPERTIES('%s' '%b')", tableName, PARQUET_VECTORIZATION_ENABLED, vectorized); + sql( + "ALTER TABLE %s SET TBLPROPERTIES('%s' '%b')", + tableName, PARQUET_VECTORIZATION_ENABLED, vectorized); break; case "orc": Assert.assertTrue(vectorized); @@ -120,9 +132,10 @@ protected void initTable() { } Map props = extraTableProperties(); - props.forEach((prop, value) -> { - sql("ALTER TABLE %s SET TBLPROPERTIES('%s' '%s')", tableName, prop, value); - }); + props.forEach( + (prop, value) -> { + sql("ALTER TABLE %s SET TBLPROPERTIES('%s' '%s')", tableName, prop, value); + }); } protected void createAndInitTable(String schema) { @@ -170,9 +183,10 @@ protected void createOrReplaceView(String name, List data, Encoder enc } private Dataset toDS(String schema, String jsonData) { - List jsonRows = Arrays.stream(jsonData.split("\n")) - .filter(str -> str.trim().length() > 0) - .collect(Collectors.toList()); + List jsonRows = + Arrays.stream(jsonData.split("\n")) + .filter(str -> str.trim().length() > 0) + .collect(Collectors.toList()); Dataset jsonDS = spark.createDataset(jsonRows, Encoders.STRING()); if (schema != null) { @@ -182,16 +196,23 @@ private Dataset toDS(String schema, String jsonData) { } } - protected void validateSnapshot(Snapshot snapshot, String operation, String changedPartitionCount, - String deletedDataFiles, String addedDataFiles) { + protected void validateSnapshot( + Snapshot snapshot, + String operation, + String changedPartitionCount, + String deletedDataFiles, + String addedDataFiles) { Assert.assertEquals("Operation must match", operation, snapshot.operation()); - Assert.assertEquals("Changed partitions count must match", + Assert.assertEquals( + "Changed partitions count must match", changedPartitionCount, snapshot.summary().get("changed-partition-count")); - Assert.assertEquals("Deleted data files count must 
match", + Assert.assertEquals( + "Deleted data files count must match", deletedDataFiles, snapshot.summary().get("deleted-data-files")); - Assert.assertEquals("Added data files count must match", + Assert.assertEquals( + "Added data files count must match", addedDataFiles, snapshot.summary().get("added-data-files")); } diff --git a/spark/v3.1/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestAddFilesProcedure.java b/spark/v3.1/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestAddFilesProcedure.java index 215f15a4765a..046590cd0dbd 100644 --- a/spark/v3.1/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestAddFilesProcedure.java +++ b/spark/v3.1/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestAddFilesProcedure.java @@ -16,9 +16,10 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.extensions; +import static org.apache.iceberg.types.Types.NestedField.optional; + import java.io.File; import java.io.IOException; import java.util.List; @@ -59,19 +60,17 @@ import org.junit.Test; import org.junit.rules.TemporaryFolder; -import static org.apache.iceberg.types.Types.NestedField.optional; - public class TestAddFilesProcedure extends SparkExtensionsTestBase { private final String sourceTableName = "source_table"; private File fileTableDir; - public TestAddFilesProcedure(String catalogName, String implementation, Map config) { + public TestAddFilesProcedure( + String catalogName, String implementation, Map config) { super(catalogName, implementation, config); } - @Rule - public TemporaryFolder temp = new TemporaryFolder(); + @Rule public TemporaryFolder temp = new TemporaryFolder(); @Before public void setupTempDirs() { @@ -97,12 +96,15 @@ public void addDataUnpartitioned() { sql(createIceberg, tableName); - Object result = scalarSql("CALL %s.system.add_files('%s', '`parquet`.`%s`')", - catalogName, tableName, fileTableDir.getAbsolutePath()); + Object result = + scalarSql( + "CALL %s.system.add_files('%s', '`parquet`.`%s`')", + catalogName, tableName, fileTableDir.getAbsolutePath()); Assert.assertEquals(2L, result); - assertEquals("Iceberg table contains correct data", + assertEquals( + "Iceberg table contains correct data", sql("SELECT * FROM %s ORDER BY id", sourceTableName), sql("SELECT * FROM %s ORDER BY id", tableName)); } @@ -116,12 +118,15 @@ public void addDataUnpartitionedOrc() { sql(createIceberg, tableName); - Object result = scalarSql("CALL %s.system.add_files('%s', '`orc`.`%s`')", - catalogName, tableName, fileTableDir.getAbsolutePath()); + Object result = + scalarSql( + "CALL %s.system.add_files('%s', '`orc`.`%s`')", + catalogName, tableName, fileTableDir.getAbsolutePath()); Assert.assertEquals(2L, result); - assertEquals("Iceberg table contains correct data", + assertEquals( + "Iceberg table contains correct data", sql("SELECT * FROM %s ORDER BY id", sourceTableName), sql("SELECT * FROM %s ORDER BY id", tableName)); } @@ -134,10 +139,12 @@ public void addAvroFile() throws Exception { // Create an Avro file - Schema schema = SchemaBuilder.record("record").fields() - .requiredInt("id") - .requiredString("data") - .endRecord(); + Schema schema = + SchemaBuilder.record("record") + .fields() + .requiredInt("id") + .requiredString("data") + .endRecord(); GenericRecord record1 = new GenericData.Record(schema); record1.put("id", 1L); record1.put("data", "a"); @@ -153,30 +160,30 @@ public void addAvroFile() throws Exception { 
dataFileWriter.append(record2); dataFileWriter.close(); - String createIceberg = - "CREATE TABLE %s (id Long, data String) USING iceberg"; + String createIceberg = "CREATE TABLE %s (id Long, data String) USING iceberg"; sql(createIceberg, tableName); - Object result = scalarSql("CALL %s.system.add_files('%s', '`avro`.`%s`')", - catalogName, tableName, outputFile.getPath()); + Object result = + scalarSql( + "CALL %s.system.add_files('%s', '`avro`.`%s`')", + catalogName, tableName, outputFile.getPath()); Assert.assertEquals(1L, result); - List expected = Lists.newArrayList( - new Object[]{1L, "a"}, - new Object[]{2L, "b"} - ); + List expected = Lists.newArrayList(new Object[] {1L, "a"}, new Object[] {2L, "b"}); - assertEquals("Iceberg table contains correct data", + assertEquals( + "Iceberg table contains correct data", expected, sql("SELECT * FROM %s ORDER BY id", tableName)); - List actualRecordCount = sql("select %s from %s.files", - DataFile.RECORD_COUNT.name(), - tableName); + List actualRecordCount = + sql("select %s from %s.files", DataFile.RECORD_COUNT.name(), tableName); List expectedRecordCount = Lists.newArrayList(); - expectedRecordCount.add(new Object[]{2L}); - assertEquals("Iceberg file metadata should have correct metadata count", - expectedRecordCount, actualRecordCount); + expectedRecordCount.add(new Object[] {2L}); + assertEquals( + "Iceberg file metadata should have correct metadata count", + expectedRecordCount, + actualRecordCount); } // TODO Adding spark-avro doesn't work in tests @@ -189,12 +196,15 @@ public void addDataUnpartitionedAvro() { sql(createIceberg, tableName); - Object result = scalarSql("CALL %s.system.add_files('%s', '`avro`.`%s`')", - catalogName, tableName, fileTableDir.getAbsolutePath()); + Object result = + scalarSql( + "CALL %s.system.add_files('%s', '`avro`.`%s`')", + catalogName, tableName, fileTableDir.getAbsolutePath()); Assert.assertEquals(2L, result); - assertEquals("Iceberg table contains correct data", + assertEquals( + "Iceberg table contains correct data", sql("SELECT * FROM %s ORDER BY id", sourceTableName), sql("SELECT * FROM %s ORDER BY id", tableName)); } @@ -208,12 +218,13 @@ public void addDataUnpartitionedHive() { sql(createIceberg, tableName); - Object result = scalarSql("CALL %s.system.add_files('%s', '%s')", - catalogName, tableName, sourceTableName); + Object result = + scalarSql("CALL %s.system.add_files('%s', '%s')", catalogName, tableName, sourceTableName); Assert.assertEquals(2L, result); - assertEquals("Iceberg table contains correct data", + assertEquals( + "Iceberg table contains correct data", sql("SELECT * FROM %s ORDER BY id", sourceTableName), sql("SELECT * FROM %s ORDER BY id", tableName)); } @@ -227,12 +238,15 @@ public void addDataUnpartitionedExtraCol() { sql(createIceberg, tableName); - Object result = scalarSql("CALL %s.system.add_files('%s', '`parquet`.`%s`')", - catalogName, tableName, fileTableDir.getAbsolutePath()); + Object result = + scalarSql( + "CALL %s.system.add_files('%s', '`parquet`.`%s`')", + catalogName, tableName, fileTableDir.getAbsolutePath()); Assert.assertEquals(2L, result); - assertEquals("Iceberg table contains correct data", + assertEquals( + "Iceberg table contains correct data", sql("SELECT * FROM %s ORDER BY id", sourceTableName), sql("SELECT id, name, dept, subdept FROM %s ORDER BY id", tableName)); } @@ -241,17 +255,19 @@ public void addDataUnpartitionedExtraCol() { public void addDataUnpartitionedMissingCol() { createUnpartitionedFileTable("parquet"); - String createIceberg = - "CREATE 
TABLE %s (id Integer, name String, dept String) USING iceberg"; + String createIceberg = "CREATE TABLE %s (id Integer, name String, dept String) USING iceberg"; sql(createIceberg, tableName); - Object result = scalarSql("CALL %s.system.add_files('%s', '`parquet`.`%s`')", - catalogName, tableName, fileTableDir.getAbsolutePath()); + Object result = + scalarSql( + "CALL %s.system.add_files('%s', '`parquet`.`%s`')", + catalogName, tableName, fileTableDir.getAbsolutePath()); Assert.assertEquals(2L, result); - assertEquals("Iceberg table contains correct data", + assertEquals( + "Iceberg table contains correct data", sql("SELECT id, name, dept FROM %s ORDER BY id", sourceTableName), sql("SELECT * FROM %s ORDER BY id", tableName)); } @@ -265,12 +281,15 @@ public void addDataPartitionedMissingCol() { sql(createIceberg, tableName); - Object result = scalarSql("CALL %s.system.add_files('%s', '`parquet`.`%s`')", - catalogName, tableName, fileTableDir.getAbsolutePath()); + Object result = + scalarSql( + "CALL %s.system.add_files('%s', '`parquet`.`%s`')", + catalogName, tableName, fileTableDir.getAbsolutePath()); Assert.assertEquals(8L, result); - assertEquals("Iceberg table contains correct data", + assertEquals( + "Iceberg table contains correct data", sql("SELECT id, name, dept FROM %s ORDER BY id", sourceTableName), sql("SELECT * FROM %s ORDER BY id", tableName)); } @@ -284,17 +303,20 @@ public void addDataPartitioned() { sql(createIceberg, tableName); - Object result = scalarSql("CALL %s.system.add_files('%s', '`parquet`.`%s`')", - catalogName, tableName, fileTableDir.getAbsolutePath()); + Object result = + scalarSql( + "CALL %s.system.add_files('%s', '`parquet`.`%s`')", + catalogName, tableName, fileTableDir.getAbsolutePath()); Assert.assertEquals(8L, result); - assertEquals("Iceberg table contains correct data", + assertEquals( + "Iceberg table contains correct data", sql("SELECT id, name, dept, subdept FROM %s ORDER BY id", sourceTableName), sql("SELECT id, name, dept, subdept FROM %s ORDER BY id", tableName)); } - @Ignore // TODO Classpath issues prevent us from actually writing to a Spark ORC table + @Ignore // TODO Classpath issues prevent us from actually writing to a Spark ORC table public void addDataPartitionedOrc() { createPartitionedFileTable("orc"); @@ -303,12 +325,15 @@ public void addDataPartitionedOrc() { sql(createIceberg, tableName); - Object result = scalarSql("CALL %s.system.add_files('%s', '`parquet`.`%s`')", - catalogName, tableName, fileTableDir.getAbsolutePath()); + Object result = + scalarSql( + "CALL %s.system.add_files('%s', '`parquet`.`%s`')", + catalogName, tableName, fileTableDir.getAbsolutePath()); Assert.assertEquals(8L, result); - assertEquals("Iceberg table contains correct data", + assertEquals( + "Iceberg table contains correct data", sql("SELECT id, name, dept, subdept FROM %s ORDER BY id", sourceTableName), sql("SELECT id, name, dept, subdept FROM %s ORDER BY id", tableName)); } @@ -323,12 +348,15 @@ public void addDataPartitionedAvro() { sql(createIceberg, tableName); - Object result = scalarSql("CALL %s.system.add_files('%s', '`avro`.`%s`')", - catalogName, tableName, fileTableDir.getAbsolutePath()); + Object result = + scalarSql( + "CALL %s.system.add_files('%s', '`avro`.`%s`')", + catalogName, tableName, fileTableDir.getAbsolutePath()); Assert.assertEquals(8L, result); - assertEquals("Iceberg table contains correct data", + assertEquals( + "Iceberg table contains correct data", sql("SELECT id, name, dept, subdept FROM %s ORDER BY id", sourceTableName), 
sql("SELECT id, name, dept, subdept FROM %s ORDER BY id", tableName)); } @@ -342,12 +370,13 @@ public void addDataPartitionedHive() { sql(createIceberg, tableName); - Object result = scalarSql("CALL %s.system.add_files('%s', '%s')", - catalogName, tableName, sourceTableName); + Object result = + scalarSql("CALL %s.system.add_files('%s', '%s')", catalogName, tableName, sourceTableName); Assert.assertEquals(8L, result); - assertEquals("Iceberg table contains correct data", + assertEquals( + "Iceberg table contains correct data", sql("SELECT id, name, dept, subdept FROM %s ORDER BY id", sourceTableName), sql("SELECT id, name, dept, subdept FROM %s ORDER BY id", tableName)); } @@ -361,12 +390,15 @@ public void addPartitionToPartitioned() { sql(createIceberg, tableName); - Object result = scalarSql("CALL %s.system.add_files('%s', '`parquet`.`%s`', map('id', 1))", - catalogName, tableName, fileTableDir.getAbsolutePath()); + Object result = + scalarSql( + "CALL %s.system.add_files('%s', '`parquet`.`%s`', map('id', 1))", + catalogName, tableName, fileTableDir.getAbsolutePath()); Assert.assertEquals(2L, result); - assertEquals("Iceberg table contains correct data", + assertEquals( + "Iceberg table contains correct data", sql("SELECT id, name, dept, subdept FROM %s WHERE id = 1 ORDER BY id", sourceTableName), sql("SELECT id, name, dept, subdept FROM %s ORDER BY id", tableName)); } @@ -376,17 +408,20 @@ public void addFilteredPartitionsToPartitioned() { createCompositePartitionedTable("parquet"); String createIceberg = - "CREATE TABLE %s (id Integer, name String, dept String, subdept String) USING iceberg " + - "PARTITIONED BY (id, dept)"; + "CREATE TABLE %s (id Integer, name String, dept String, subdept String) USING iceberg " + + "PARTITIONED BY (id, dept)"; sql(createIceberg, tableName); - Object result = scalarSql("CALL %s.system.add_files('%s', '`parquet`.`%s`', map('id', 1))", - catalogName, tableName, fileTableDir.getAbsolutePath()); + Object result = + scalarSql( + "CALL %s.system.add_files('%s', '`parquet`.`%s`', map('id', 1))", + catalogName, tableName, fileTableDir.getAbsolutePath()); Assert.assertEquals(2L, result); - assertEquals("Iceberg table contains correct data", + assertEquals( + "Iceberg table contains correct data", sql("SELECT id, name, dept, subdept FROM %s WHERE id = 1 ORDER BY id", sourceTableName), sql("SELECT id, name, dept, subdept FROM %s ORDER BY id", tableName)); } @@ -396,18 +431,23 @@ public void addFilteredPartitionsToPartitioned2() { createCompositePartitionedTable("parquet"); String createIceberg = - "CREATE TABLE %s (id Integer, name String, dept String, subdept String) USING iceberg " + - "PARTITIONED BY (id, dept)"; + "CREATE TABLE %s (id Integer, name String, dept String, subdept String) USING iceberg " + + "PARTITIONED BY (id, dept)"; sql(createIceberg, tableName); - Object result = scalarSql("CALL %s.system.add_files('%s', '`parquet`.`%s`', map('dept', 'hr'))", - catalogName, tableName, fileTableDir.getAbsolutePath()); + Object result = + scalarSql( + "CALL %s.system.add_files('%s', '`parquet`.`%s`', map('dept', 'hr'))", + catalogName, tableName, fileTableDir.getAbsolutePath()); Assert.assertEquals(6L, result); - assertEquals("Iceberg table contains correct data", - sql("SELECT id, name, dept, subdept FROM %s WHERE dept = 'hr' ORDER BY id", sourceTableName), + assertEquals( + "Iceberg table contains correct data", + sql( + "SELECT id, name, dept, subdept FROM %s WHERE dept = 'hr' ORDER BY id", + sourceTableName), sql("SELECT id, name, dept, subdept FROM %s 
ORDER BY id", tableName)); } @@ -416,17 +456,20 @@ public void addFilteredPartitionsToPartitionedWithNullValueFilteringOnId() { createCompositePartitionedTableWithNullValueInPartitionColumn("parquet"); String createIceberg = - "CREATE TABLE %s (id Integer, name String, dept String, subdept String) USING iceberg " + - "PARTITIONED BY (id, dept)"; + "CREATE TABLE %s (id Integer, name String, dept String, subdept String) USING iceberg " + + "PARTITIONED BY (id, dept)"; sql(createIceberg, tableName); - Object result = scalarSql("CALL %s.system.add_files('%s', '`parquet`.`%s`', map('id', 1))", - catalogName, tableName, fileTableDir.getAbsolutePath()); + Object result = + scalarSql( + "CALL %s.system.add_files('%s', '`parquet`.`%s`', map('id', 1))", + catalogName, tableName, fileTableDir.getAbsolutePath()); Assert.assertEquals(2L, result); - assertEquals("Iceberg table contains correct data", + assertEquals( + "Iceberg table contains correct data", sql("SELECT id, name, dept, subdept FROM %s WHERE id = 1 ORDER BY id", sourceTableName), sql("SELECT id, name, dept, subdept FROM %s ORDER BY id", tableName)); } @@ -436,18 +479,23 @@ public void addFilteredPartitionsToPartitionedWithNullValueFilteringOnDept() { createCompositePartitionedTableWithNullValueInPartitionColumn("parquet"); String createIceberg = - "CREATE TABLE %s (id Integer, name String, dept String, subdept String) USING iceberg " + - "PARTITIONED BY (id, dept)"; + "CREATE TABLE %s (id Integer, name String, dept String, subdept String) USING iceberg " + + "PARTITIONED BY (id, dept)"; sql(createIceberg, tableName); - Object result = scalarSql("CALL %s.system.add_files('%s', '`parquet`.`%s`', map('dept', 'hr'))", - catalogName, tableName, fileTableDir.getAbsolutePath()); + Object result = + scalarSql( + "CALL %s.system.add_files('%s', '`parquet`.`%s`', map('dept', 'hr'))", + catalogName, tableName, fileTableDir.getAbsolutePath()); Assert.assertEquals(6L, result); - assertEquals("Iceberg table contains correct data", - sql("SELECT id, name, dept, subdept FROM %s WHERE dept = 'hr' ORDER BY id", sourceTableName), + assertEquals( + "Iceberg table contains correct data", + sql( + "SELECT id, name, dept, subdept FROM %s WHERE dept = 'hr' ORDER BY id", + sourceTableName), sql("SELECT id, name, dept, subdept FROM %s ORDER BY id", tableName)); } @@ -456,13 +504,15 @@ public void addWeirdCaseHiveTable() { createWeirdCaseTable(); String createIceberg = - "CREATE TABLE %s (id Integer, `naMe` String, dept String, subdept String) USING iceberg " + - "PARTITIONED BY (`naMe`)"; + "CREATE TABLE %s (id Integer, `naMe` String, dept String, subdept String) USING iceberg " + + "PARTITIONED BY (`naMe`)"; sql(createIceberg, tableName); - Object result = scalarSql("CALL %s.system.add_files('%s', '%s', map('naMe', 'John Doe'))", - catalogName, tableName, sourceTableName); + Object result = + scalarSql( + "CALL %s.system.add_files('%s', '%s', map('naMe', 'John Doe'))", + catalogName, tableName, sourceTableName); Assert.assertEquals(2L, result); @@ -472,22 +522,30 @@ public void addWeirdCaseHiveTable() { Spark does not actually handle this pushdown correctly for hive based tables and it returns 0 records */ List expected = - sql("SELECT id, `naMe`, dept, subdept from %s ORDER BY id", sourceTableName) - .stream() + sql("SELECT id, `naMe`, dept, subdept from %s ORDER BY id", sourceTableName).stream() .filter(r -> r[1].equals("John Doe")) .collect(Collectors.toList()); // TODO when this assert breaks Spark fixed the pushdown issue - Assert.assertEquals("If this assert 
breaks it means that Spark has fixed the pushdown issue", 0, - sql("SELECT id, `naMe`, dept, subdept from %s WHERE `naMe` = 'John Doe' ORDER BY id", sourceTableName) + Assert.assertEquals( + "If this assert breaks it means that Spark has fixed the pushdown issue", + 0, + sql( + "SELECT id, `naMe`, dept, subdept from %s WHERE `naMe` = 'John Doe' ORDER BY id", + sourceTableName) .size()); // Pushdown works for iceberg - Assert.assertEquals("We should be able to pushdown mixed case partition keys", 2, - sql("SELECT id, `naMe`, dept, subdept FROM %s WHERE `naMe` = 'John Doe' ORDER BY id", tableName) + Assert.assertEquals( + "We should be able to pushdown mixed case partition keys", + 2, + sql( + "SELECT id, `naMe`, dept, subdept FROM %s WHERE `naMe` = 'John Doe' ORDER BY id", + tableName) .size()); - assertEquals("Iceberg table contains correct data", + assertEquals( + "Iceberg table contains correct data", expected, sql("SELECT id, `naMe`, dept, subdept FROM %s ORDER BY id", tableName)); } @@ -501,12 +559,15 @@ public void addPartitionToPartitionedHive() { sql(createIceberg, tableName); - Object result = scalarSql("CALL %s.system.add_files('%s', '%s', map('id', 1))", - catalogName, tableName, sourceTableName); + Object result = + scalarSql( + "CALL %s.system.add_files('%s', '%s', map('id', 1))", + catalogName, tableName, sourceTableName); Assert.assertEquals(2L, result); - assertEquals("Iceberg table contains correct data", + assertEquals( + "Iceberg table contains correct data", sql("SELECT id, name, dept, subdept FROM %s WHERE id = 1 ORDER BY id", sourceTableName), sql("SELECT id, name, dept, subdept FROM %s ORDER BY id", tableName)); } @@ -520,19 +581,23 @@ public void invalidDataImport() { sql(createIceberg, tableName); - AssertHelpers.assertThrows("Should forbid adding of partitioned data to unpartitioned table", + AssertHelpers.assertThrows( + "Should forbid adding of partitioned data to unpartitioned table", IllegalArgumentException.class, "Cannot use partition filter with an unpartitioned table", - () -> scalarSql("CALL %s.system.add_files('%s', '`parquet`.`%s`', map('id', 1))", - catalogName, tableName, fileTableDir.getAbsolutePath()) - ); + () -> + scalarSql( + "CALL %s.system.add_files('%s', '`parquet`.`%s`', map('id', 1))", + catalogName, tableName, fileTableDir.getAbsolutePath())); - AssertHelpers.assertThrows("Should forbid adding of partitioned data to unpartitioned table", + AssertHelpers.assertThrows( + "Should forbid adding of partitioned data to unpartitioned table", IllegalArgumentException.class, "Cannot add partitioned files to an unpartitioned table", - () -> scalarSql("CALL %s.system.add_files('%s', '`parquet`.`%s`')", - catalogName, tableName, fileTableDir.getAbsolutePath()) - ); + () -> + scalarSql( + "CALL %s.system.add_files('%s', '`parquet`.`%s`')", + catalogName, tableName, fileTableDir.getAbsolutePath())); } @Test @@ -544,20 +609,25 @@ public void invalidDataImportPartitioned() { sql(createIceberg, tableName); - AssertHelpers.assertThrows("Should forbid adding with a mismatching partition spec", + AssertHelpers.assertThrows( + "Should forbid adding with a mismatching partition spec", IllegalArgumentException.class, "is greater than the number of partitioned columns", - () -> scalarSql("CALL %s.system.add_files('%s', '`parquet`.`%s`', map('x', '1', 'y', '2'))", - catalogName, tableName, fileTableDir.getAbsolutePath())); + () -> + scalarSql( + "CALL %s.system.add_files('%s', '`parquet`.`%s`', map('x', '1', 'y', '2'))", + catalogName, tableName, 
fileTableDir.getAbsolutePath())); - AssertHelpers.assertThrows("Should forbid adding with partition spec with incorrect columns", + AssertHelpers.assertThrows( + "Should forbid adding with partition spec with incorrect columns", IllegalArgumentException.class, "specified partition filter refers to columns that are not partitioned", - () -> scalarSql("CALL %s.system.add_files('%s', '`parquet`.`%s`', map('dept', '2'))", - catalogName, tableName, fileTableDir.getAbsolutePath())); + () -> + scalarSql( + "CALL %s.system.add_files('%s', '`parquet`.`%s`', map('dept', '2'))", + catalogName, tableName, fileTableDir.getAbsolutePath())); } - @Test public void addTwice() { createPartitionedHiveTable(); @@ -567,24 +637,30 @@ public void addTwice() { sql(createIceberg, tableName); - Object result1 = scalarSql("CALL %s.system.add_files(" + - "table => '%s', " + - "source_table => '%s', " + - "partition_filter => map('id', 1))", - catalogName, tableName, sourceTableName); + Object result1 = + scalarSql( + "CALL %s.system.add_files(" + + "table => '%s', " + + "source_table => '%s', " + + "partition_filter => map('id', 1))", + catalogName, tableName, sourceTableName); Assert.assertEquals(2L, result1); - Object result2 = scalarSql("CALL %s.system.add_files(" + - "table => '%s', " + - "source_table => '%s', " + - "partition_filter => map('id', 2))", - catalogName, tableName, sourceTableName); + Object result2 = + scalarSql( + "CALL %s.system.add_files(" + + "table => '%s', " + + "source_table => '%s', " + + "partition_filter => map('id', 2))", + catalogName, tableName, sourceTableName); Assert.assertEquals(2L, result2); - assertEquals("Iceberg table contains correct data", + assertEquals( + "Iceberg table contains correct data", sql("SELECT id, name, dept, subdept FROM %s WHERE id = 1 ORDER BY id", sourceTableName), sql("SELECT id, name, dept, subdept FROM %s WHERE id = 1 ORDER BY id", tableName)); - assertEquals("Iceberg table contains correct data", + assertEquals( + "Iceberg table contains correct data", sql("SELECT id, name, dept, subdept FROM %s WHERE id = 2 ORDER BY id", sourceTableName), sql("SELECT id, name, dept, subdept FROM %s WHERE id = 2 ORDER BY id", tableName)); } @@ -598,21 +674,25 @@ public void duplicateDataPartitioned() { sql(createIceberg, tableName); - scalarSql("CALL %s.system.add_files(" + - "table => '%s', " + - "source_table => '%s', " + - "partition_filter => map('id', 1))", + scalarSql( + "CALL %s.system.add_files(" + + "table => '%s', " + + "source_table => '%s', " + + "partition_filter => map('id', 1))", catalogName, tableName, sourceTableName); - AssertHelpers.assertThrows("Should not allow adding duplicate files", + AssertHelpers.assertThrows( + "Should not allow adding duplicate files", IllegalStateException.class, - "Cannot complete import because data files to be imported already" + - " exist within the target table", - () -> scalarSql("CALL %s.system.add_files(" + - "table => '%s', " + - "source_table => '%s', " + - "partition_filter => map('id', 1))", - catalogName, tableName, sourceTableName)); + "Cannot complete import because data files to be imported already" + + " exist within the target table", + () -> + scalarSql( + "CALL %s.system.add_files(" + + "table => '%s', " + + "source_table => '%s', " + + "partition_filter => map('id', 1))", + catalogName, tableName, sourceTableName)); } @Test @@ -624,27 +704,33 @@ public void duplicateDataPartitionedAllowed() { sql(createIceberg, tableName); - Object result1 = scalarSql("CALL %s.system.add_files(" + - "table => '%s', " + - 
"source_table => '%s', " + - "partition_filter => map('id', 1))", - catalogName, tableName, sourceTableName); + Object result1 = + scalarSql( + "CALL %s.system.add_files(" + + "table => '%s', " + + "source_table => '%s', " + + "partition_filter => map('id', 1))", + catalogName, tableName, sourceTableName); Assert.assertEquals(2L, result1); - Object result2 = scalarSql("CALL %s.system.add_files(" + - "table => '%s', " + - "source_table => '%s', " + - "partition_filter => map('id', 1)," + - "check_duplicate_files => false)", - catalogName, tableName, sourceTableName); + Object result2 = + scalarSql( + "CALL %s.system.add_files(" + + "table => '%s', " + + "source_table => '%s', " + + "partition_filter => map('id', 1)," + + "check_duplicate_files => false)", + catalogName, tableName, sourceTableName); Assert.assertEquals(2L, result2); - - assertEquals("Iceberg table contains correct data", - sql("SELECT id, name, dept, subdept FROM %s WHERE id = 1 UNION ALL " + - "SELECT id, name, dept, subdept FROM %s WHERE id = 1", sourceTableName, sourceTableName), + assertEquals( + "Iceberg table contains correct data", + sql( + "SELECT id, name, dept, subdept FROM %s WHERE id = 1 UNION ALL " + + "SELECT id, name, dept, subdept FROM %s WHERE id = 1", + sourceTableName, sourceTableName), sql("SELECT id, name, dept, subdept FROM %s", tableName, tableName)); } @@ -657,15 +743,16 @@ public void duplicateDataUnpartitioned() { sql(createIceberg, tableName); - scalarSql("CALL %s.system.add_files('%s', '%s')", - catalogName, tableName, sourceTableName); + scalarSql("CALL %s.system.add_files('%s', '%s')", catalogName, tableName, sourceTableName); - AssertHelpers.assertThrows("Should not allow adding duplicate files", + AssertHelpers.assertThrows( + "Should not allow adding duplicate files", IllegalStateException.class, - "Cannot complete import because data files to be imported already" + - " exist within the target table", - () -> scalarSql("CALL %s.system.add_files('%s', '%s')", - catalogName, tableName, sourceTableName)); + "Cannot complete import because data files to be imported already" + + " exist within the target table", + () -> + scalarSql( + "CALL %s.system.add_files('%s', '%s')", catalogName, tableName, sourceTableName)); } @Test @@ -677,23 +764,25 @@ public void duplicateDataUnpartitionedAllowed() { sql(createIceberg, tableName); - Object result1 = scalarSql("CALL %s.system.add_files('%s', '%s')", - catalogName, tableName, sourceTableName); + Object result1 = + scalarSql("CALL %s.system.add_files('%s', '%s')", catalogName, tableName, sourceTableName); Assert.assertEquals(2L, result1); - Object result2 = scalarSql("CALL %s.system.add_files(" + - "table => '%s', " + - "source_table => '%s'," + - "check_duplicate_files => false)", - catalogName, tableName, sourceTableName); + Object result2 = + scalarSql( + "CALL %s.system.add_files(" + + "table => '%s', " + + "source_table => '%s'," + + "check_duplicate_files => false)", + catalogName, tableName, sourceTableName); Assert.assertEquals(2L, result2); - assertEquals("Iceberg table contains correct data", - sql("SELECT * FROM (SELECT * FROM %s UNION ALL " + - "SELECT * from %s) ORDER BY id", sourceTableName, sourceTableName), + assertEquals( + "Iceberg table contains correct data", + sql( + "SELECT * FROM (SELECT * FROM %s UNION ALL " + "SELECT * from %s) ORDER BY id", + sourceTableName, sourceTableName), sql("SELECT * FROM %s ORDER BY id", tableName)); - - } @Test @@ -706,30 +795,34 @@ public void addOrcFileWithDoubleAndFloatColumns() throws Exception { File 
outputFile = temp.newFile("test.orc"); final int numRows = 5; List expectedRecords = createOrcFile(outputFile, numRows); - String createIceberg = - "CREATE TABLE %s (x float, y double, z long) USING iceberg"; + String createIceberg = "CREATE TABLE %s (x float, y double, z long) USING iceberg"; sql(createIceberg, tableName); - Object result = scalarSql("CALL %s.system.add_files('%s', '`orc`.`%s`')", - catalogName, tableName, outputFile.getPath()); + Object result = + scalarSql( + "CALL %s.system.add_files('%s', '`orc`.`%s`')", + catalogName, tableName, outputFile.getPath()); Assert.assertEquals(1L, result); - List expected = expectedRecords.stream() - .map(record -> new Object[]{record.get(0), record.get(1), record.get(2)}) - .collect(Collectors.toList()); + List expected = + expectedRecords.stream() + .map(record -> new Object[] {record.get(0), record.get(1), record.get(2)}) + .collect(Collectors.toList()); // x goes 2.00, 1.99, 1.98, ... - assertEquals("Iceberg table contains correct data", + assertEquals( + "Iceberg table contains correct data", expected, sql("SELECT * FROM %s ORDER BY x DESC", tableName)); - List actualRecordCount = sql("select %s from %s.files", - DataFile.RECORD_COUNT.name(), - tableName); + List actualRecordCount = + sql("select %s from %s.files", DataFile.RECORD_COUNT.name(), tableName); List expectedRecordCount = Lists.newArrayList(); - expectedRecordCount.add(new Object[]{(long) numRows}); - assertEquals("Iceberg file metadata should have correct metadata count", - expectedRecordCount, actualRecordCount); + expectedRecordCount.add(new Object[] {(long) numRows}); + assertEquals( + "Iceberg file metadata should have correct metadata count", + expectedRecordCount, + actualRecordCount); } @Test @@ -740,21 +833,26 @@ public void testEmptyImportDoesNotThrow() { sql(createIceberg, tableName); // Empty path based import - Object pathResult = scalarSql("CALL %s.system.add_files('%s', '`parquet`.`%s`')", - catalogName, tableName, fileTableDir.getAbsolutePath()); + Object pathResult = + scalarSql( + "CALL %s.system.add_files('%s', '`parquet`.`%s`')", + catalogName, tableName, fileTableDir.getAbsolutePath()); Assert.assertEquals(0L, pathResult); - assertEquals("Iceberg table contains no added data when importing from an empty path", + assertEquals( + "Iceberg table contains no added data when importing from an empty path", emptyQueryResult, sql("SELECT * FROM %s ORDER BY id", tableName)); // Empty table based import - String createHive = "CREATE TABLE %s (id Integer, name String, dept String, subdept String) STORED AS parquet"; + String createHive = + "CREATE TABLE %s (id Integer, name String, dept String, subdept String) STORED AS parquet"; sql(createHive, sourceTableName); - Object tableResult = scalarSql("CALL %s.system.add_files('%s', '%s')", - catalogName, tableName, sourceTableName); + Object tableResult = + scalarSql("CALL %s.system.add_files('%s', '%s')", catalogName, tableName, sourceTableName); Assert.assertEquals(0L, tableResult); - assertEquals("Iceberg table contains no added data when importing from an empty table", + assertEquals( + "Iceberg table contains no added data when importing from an empty table", emptyQueryResult, sql("SELECT * FROM %s ORDER BY id", tableName)); } @@ -765,22 +863,26 @@ public void testPartitionedImportFromEmptyPartitionDoesNotThrow() { final int emptyPartitionId = 999; // Add an empty partition to the hive table - sql("ALTER TABLE %s ADD PARTITION (id = '%d') LOCATION '%d'", sourceTableName, - emptyPartitionId, emptyPartitionId); + 
sql( + "ALTER TABLE %s ADD PARTITION (id = '%d') LOCATION '%d'", + sourceTableName, emptyPartitionId, emptyPartitionId); String createIceberg = "CREATE TABLE %s (id Integer, name String, dept String, subdept String) USING iceberg PARTITIONED BY (id)"; sql(createIceberg, tableName); - Object tableResult = scalarSql("CALL %s.system.add_files(" + - "table => '%s', " + - "source_table => '%s', " + - "partition_filter => map('id', %d))", - catalogName, tableName, sourceTableName, emptyPartitionId); + Object tableResult = + scalarSql( + "CALL %s.system.add_files(" + + "table => '%s', " + + "source_table => '%s', " + + "partition_filter => map('id', %d))", + catalogName, tableName, sourceTableName, emptyPartitionId); Assert.assertEquals(0L, tableResult); - assertEquals("Iceberg table contains no added data when importing from an empty table", + assertEquals( + "Iceberg table contains no added data when importing from an empty table", emptyQueryResult, sql("SELECT * FROM %s ORDER BY id", tableName)); } @@ -788,26 +890,28 @@ public void testPartitionedImportFromEmptyPartitionDoesNotThrow() { private static final List emptyQueryResult = Lists.newArrayList(); private static final StructField[] struct = { - new StructField("id", DataTypes.IntegerType, true, Metadata.empty()), - new StructField("name", DataTypes.StringType, true, Metadata.empty()), - new StructField("dept", DataTypes.StringType, true, Metadata.empty()), - new StructField("subdept", DataTypes.StringType, true, Metadata.empty()) + new StructField("id", DataTypes.IntegerType, true, Metadata.empty()), + new StructField("name", DataTypes.StringType, true, Metadata.empty()), + new StructField("dept", DataTypes.StringType, true, Metadata.empty()), + new StructField("subdept", DataTypes.StringType, true, Metadata.empty()) }; private static final Dataset unpartitionedDF = - spark.createDataFrame( - ImmutableList.of( - RowFactory.create(1, "John Doe", "hr", "communications"), - RowFactory.create(2, "Jane Doe", "hr", "salary"), - RowFactory.create(3, "Matt Doe", "hr", "communications"), - RowFactory.create(4, "Will Doe", "facilities", "all")), - new StructType(struct)).repartition(1); + spark + .createDataFrame( + ImmutableList.of( + RowFactory.create(1, "John Doe", "hr", "communications"), + RowFactory.create(2, "Jane Doe", "hr", "salary"), + RowFactory.create(3, "Matt Doe", "hr", "communications"), + RowFactory.create(4, "Will Doe", "facilities", "all")), + new StructType(struct)) + .repartition(1); private static final Dataset singleNullRecordDF = - spark.createDataFrame( - ImmutableList.of( - RowFactory.create(null, null, null, null)), - new StructType(struct)).repartition(1); + spark + .createDataFrame( + ImmutableList.of(RowFactory.create(null, null, null, null)), new StructType(struct)) + .repartition(1); private static final Dataset partitionedDF = unpartitionedDF.select("name", "dept", "subdept", "id"); @@ -825,8 +929,7 @@ public void testPartitionedImportFromEmptyPartitionDoesNotThrow() { unpartitionedDF.col("dept"), unpartitionedDF.col("name").as("naMe")); - - private void createUnpartitionedFileTable(String format) { + private void createUnpartitionedFileTable(String format) { String createParquet = "CREATE TABLE %s (id Integer, name String, dept String, subdept String) USING %s LOCATION '%s'"; @@ -835,10 +938,10 @@ private void createUnpartitionedFileTable(String format) { unpartitionedDF.write().insertInto(sourceTableName); } - private void createPartitionedFileTable(String format) { + private void createPartitionedFileTable(String 
format) { String createParquet = - "CREATE TABLE %s (id Integer, name String, dept String, subdept String) USING %s PARTITIONED BY (id) " + - "LOCATION '%s'"; + "CREATE TABLE %s (id Integer, name String, dept String, subdept String) USING %s PARTITIONED BY (id) " + + "LOCATION '%s'"; sql(createParquet, sourceTableName, format, fileTableDir.getAbsolutePath()); @@ -847,8 +950,9 @@ private void createPartitionedFileTable(String format) { } private void createCompositePartitionedTable(String format) { - String createParquet = "CREATE TABLE %s (id Integer, name String, dept String, subdept String) USING %s " + - "PARTITIONED BY (id, dept) LOCATION '%s'"; + String createParquet = + "CREATE TABLE %s (id Integer, name String, dept String, subdept String) USING %s " + + "PARTITIONED BY (id, dept) LOCATION '%s'"; sql(createParquet, sourceTableName, format, fileTableDir.getAbsolutePath()); compositePartitionedDF.write().insertInto(sourceTableName); @@ -856,13 +960,16 @@ private void createCompositePartitionedTable(String format) { } private void createCompositePartitionedTableWithNullValueInPartitionColumn(String format) { - String createParquet = "CREATE TABLE %s (id Integer, name String, dept String, subdept String) USING %s " + - "PARTITIONED BY (id, dept) LOCATION '%s'"; + String createParquet = + "CREATE TABLE %s (id Integer, name String, dept String, subdept String) USING %s " + + "PARTITIONED BY (id, dept) LOCATION '%s'"; sql(createParquet, sourceTableName, format, fileTableDir.getAbsolutePath()); - Dataset unionedDF = compositePartitionedDF.unionAll(compositePartitionedNullRecordDF) - .select("name", "subdept", "id", "dept") - .repartition(1); + Dataset unionedDF = + compositePartitionedDF + .unionAll(compositePartitionedNullRecordDF) + .select("name", "subdept", "id", "dept") + .repartition(1); unionedDF.write().insertInto(sourceTableName); unionedDF.write().insertInto(sourceTableName); @@ -870,18 +977,18 @@ private void createCompositePartitionedTableWithNullValueInPartitionColumn(Strin private void createWeirdCaseTable() { String createParquet = - "CREATE TABLE %s (id Integer, subdept String, dept String) " + - "PARTITIONED BY (`naMe` String) STORED AS parquet"; + "CREATE TABLE %s (id Integer, subdept String, dept String) " + + "PARTITIONED BY (`naMe` String) STORED AS parquet"; sql(createParquet, sourceTableName); weirdColumnNamesDF.write().insertInto(sourceTableName); weirdColumnNamesDF.write().insertInto(sourceTableName); - } private void createUnpartitionedHiveTable() { - String createHive = "CREATE TABLE %s (id Integer, name String, dept String, subdept String) STORED AS parquet"; + String createHive = + "CREATE TABLE %s (id Integer, name String, dept String, subdept String) STORED AS parquet"; sql(createHive, sourceTableName); @@ -890,8 +997,9 @@ private void createUnpartitionedHiveTable() { } private void createPartitionedHiveTable() { - String createHive = "CREATE TABLE %s (name String, dept String, subdept String) " + - "PARTITIONED BY (id Integer) STORED AS parquet"; + String createHive = + "CREATE TABLE %s (name String, dept String, subdept String) " + + "PARTITIONED BY (id Integer) STORED AS parquet"; sql(createHive, sourceTableName); @@ -905,11 +1013,11 @@ public List createOrcFile(File orcFile, int numRows) throws IOException if (orcFile.exists()) { orcFile.delete(); } - final org.apache.iceberg.Schema icebergSchema = new org.apache.iceberg.Schema( - optional(1, "x", Types.FloatType.get()), - optional(2, "y", Types.DoubleType.get()), - optional(3, "z", Types.LongType.get()) - 
); + final org.apache.iceberg.Schema icebergSchema = + new org.apache.iceberg.Schema( + optional(1, "x", Types.FloatType.get()), + optional(2, "y", Types.DoubleType.get()), + optional(3, "z", Types.LongType.get())); List records = Lists.newArrayListWithExpectedSize(numRows); for (int i = 0; i < numRows; i += 1) { @@ -921,11 +1029,18 @@ public List createOrcFile(File orcFile, int numRows) throws IOException } OutputFile outFile = Files.localOutput(orcFile); - try (FileAppender appender = org.apache.iceberg.orc.ORC.write(outFile) - .schema(icebergSchema) - .metricsConfig(MetricsConfig.fromProperties(ImmutableMap.of("write.metadata.metrics.default", "none"))) - .createWriterFunc(GenericOrcWriter::buildWriter) - .build()) { + try (FileAppender appender = + org.apache + .iceberg + .orc + .ORC + .write(outFile) + .schema(icebergSchema) + .metricsConfig( + MetricsConfig.fromProperties( + ImmutableMap.of("write.metadata.metrics.default", "none"))) + .createWriterFunc(GenericOrcWriter::buildWriter) + .build()) { appender.addAll(records); } return records; diff --git a/spark/v3.1/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestAlterTablePartitionFields.java b/spark/v3.1/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestAlterTablePartitionFields.java index 9d630508b6e4..8aee7c97752f 100644 --- a/spark/v3.1/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestAlterTablePartitionFields.java +++ b/spark/v3.1/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestAlterTablePartitionFields.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.extensions; import java.util.Map; @@ -31,7 +30,8 @@ import org.junit.Test; public class TestAlterTablePartitionFields extends SparkExtensionsTestBase { - public TestAlterTablePartitionFields(String catalogName, String implementation, Map config) { + public TestAlterTablePartitionFields( + String catalogName, String implementation, Map config) { super(catalogName, implementation, config); } @@ -42,7 +42,9 @@ public void removeTable() { @Test public void testAddIdentityPartition() { - sql("CREATE TABLE %s (id bigint NOT NULL, category string, ts timestamp, data string) USING iceberg", tableName); + sql( + "CREATE TABLE %s (id bigint NOT NULL, category string, ts timestamp, data string) USING iceberg", + tableName); Table table = validationCatalog.loadTable(tableIdent); Assert.assertTrue("Table should start unpartitioned", table.spec().isUnpartitioned()); @@ -51,17 +53,17 @@ public void testAddIdentityPartition() { table.refresh(); - PartitionSpec expected = PartitionSpec.builderFor(table.schema()) - .withSpecId(1) - .identity("category") - .build(); + PartitionSpec expected = + PartitionSpec.builderFor(table.schema()).withSpecId(1).identity("category").build(); Assert.assertEquals("Should have new spec field", expected, table.spec()); } @Test public void testAddBucketPartition() { - sql("CREATE TABLE %s (id bigint NOT NULL, category string, ts timestamp, data string) USING iceberg", tableName); + sql( + "CREATE TABLE %s (id bigint NOT NULL, category string, ts timestamp, data string) USING iceberg", + tableName); Table table = validationCatalog.loadTable(tableIdent); Assert.assertTrue("Table should start unpartitioned", table.spec().isUnpartitioned()); @@ -70,17 +72,20 @@ public void testAddBucketPartition() { table.refresh(); - PartitionSpec expected = PartitionSpec.builderFor(table.schema()) - 
.withSpecId(1) - .bucket("id", 16, "id_bucket_16") - .build(); + PartitionSpec expected = + PartitionSpec.builderFor(table.schema()) + .withSpecId(1) + .bucket("id", 16, "id_bucket_16") + .build(); Assert.assertEquals("Should have new spec field", expected, table.spec()); } @Test public void testAddTruncatePartition() { - sql("CREATE TABLE %s (id bigint NOT NULL, category string, ts timestamp, data string) USING iceberg", tableName); + sql( + "CREATE TABLE %s (id bigint NOT NULL, category string, ts timestamp, data string) USING iceberg", + tableName); Table table = validationCatalog.loadTable(tableIdent); Assert.assertTrue("Table should start unpartitioned", table.spec().isUnpartitioned()); @@ -89,17 +94,20 @@ public void testAddTruncatePartition() { table.refresh(); - PartitionSpec expected = PartitionSpec.builderFor(table.schema()) - .withSpecId(1) - .truncate("data", 4, "data_trunc_4") - .build(); + PartitionSpec expected = + PartitionSpec.builderFor(table.schema()) + .withSpecId(1) + .truncate("data", 4, "data_trunc_4") + .build(); Assert.assertEquals("Should have new spec field", expected, table.spec()); } @Test public void testAddYearsPartition() { - sql("CREATE TABLE %s (id bigint NOT NULL, category string, ts timestamp, data string) USING iceberg", tableName); + sql( + "CREATE TABLE %s (id bigint NOT NULL, category string, ts timestamp, data string) USING iceberg", + tableName); Table table = validationCatalog.loadTable(tableIdent); Assert.assertTrue("Table should start unpartitioned", table.spec().isUnpartitioned()); @@ -108,17 +116,17 @@ public void testAddYearsPartition() { table.refresh(); - PartitionSpec expected = PartitionSpec.builderFor(table.schema()) - .withSpecId(1) - .year("ts") - .build(); + PartitionSpec expected = + PartitionSpec.builderFor(table.schema()).withSpecId(1).year("ts").build(); Assert.assertEquals("Should have new spec field", expected, table.spec()); } @Test public void testAddMonthsPartition() { - sql("CREATE TABLE %s (id bigint NOT NULL, category string, ts timestamp, data string) USING iceberg", tableName); + sql( + "CREATE TABLE %s (id bigint NOT NULL, category string, ts timestamp, data string) USING iceberg", + tableName); Table table = validationCatalog.loadTable(tableIdent); Assert.assertTrue("Table should start unpartitioned", table.spec().isUnpartitioned()); @@ -127,17 +135,17 @@ public void testAddMonthsPartition() { table.refresh(); - PartitionSpec expected = PartitionSpec.builderFor(table.schema()) - .withSpecId(1) - .month("ts") - .build(); + PartitionSpec expected = + PartitionSpec.builderFor(table.schema()).withSpecId(1).month("ts").build(); Assert.assertEquals("Should have new spec field", expected, table.spec()); } @Test public void testAddDaysPartition() { - sql("CREATE TABLE %s (id bigint NOT NULL, category string, ts timestamp, data string) USING iceberg", tableName); + sql( + "CREATE TABLE %s (id bigint NOT NULL, category string, ts timestamp, data string) USING iceberg", + tableName); Table table = validationCatalog.loadTable(tableIdent); Assert.assertTrue("Table should start unpartitioned", table.spec().isUnpartitioned()); @@ -146,17 +154,17 @@ public void testAddDaysPartition() { table.refresh(); - PartitionSpec expected = PartitionSpec.builderFor(table.schema()) - .withSpecId(1) - .day("ts") - .build(); + PartitionSpec expected = + PartitionSpec.builderFor(table.schema()).withSpecId(1).day("ts").build(); Assert.assertEquals("Should have new spec field", expected, table.spec()); } @Test public void testAddHoursPartition() { - 
sql("CREATE TABLE %s (id bigint NOT NULL, category string, ts timestamp, data string) USING iceberg", tableName); + sql( + "CREATE TABLE %s (id bigint NOT NULL, category string, ts timestamp, data string) USING iceberg", + tableName); Table table = validationCatalog.loadTable(tableIdent); Assert.assertTrue("Table should start unpartitioned", table.spec().isUnpartitioned()); @@ -165,17 +173,17 @@ public void testAddHoursPartition() { table.refresh(); - PartitionSpec expected = PartitionSpec.builderFor(table.schema()) - .withSpecId(1) - .hour("ts") - .build(); + PartitionSpec expected = + PartitionSpec.builderFor(table.schema()).withSpecId(1).hour("ts").build(); Assert.assertEquals("Should have new spec field", expected, table.spec()); } @Test public void testAddNamedPartition() { - sql("CREATE TABLE %s (id bigint NOT NULL, category string, ts timestamp, data string) USING iceberg", tableName); + sql( + "CREATE TABLE %s (id bigint NOT NULL, category string, ts timestamp, data string) USING iceberg", + tableName); Table table = validationCatalog.loadTable(tableIdent); Assert.assertTrue("Table should start unpartitioned", table.spec().isUnpartitioned()); @@ -184,77 +192,83 @@ public void testAddNamedPartition() { table.refresh(); - PartitionSpec expected = PartitionSpec.builderFor(table.schema()) - .withSpecId(1) - .bucket("id", 16, "shard") - .build(); + PartitionSpec expected = + PartitionSpec.builderFor(table.schema()).withSpecId(1).bucket("id", 16, "shard").build(); Assert.assertEquals("Should have new spec field", expected, table.spec()); } @Test public void testDropIdentityPartition() { - sql("CREATE TABLE %s (id bigint NOT NULL, category string, data string) USING iceberg PARTITIONED BY (category)", + sql( + "CREATE TABLE %s (id bigint NOT NULL, category string, data string) USING iceberg PARTITIONED BY (category)", tableName); Table table = validationCatalog.loadTable(tableIdent); - Assert.assertEquals("Table should start with 1 partition field", 1, table.spec().fields().size()); + Assert.assertEquals( + "Table should start with 1 partition field", 1, table.spec().fields().size()); sql("ALTER TABLE %s DROP PARTITION FIELD category", tableName); table.refresh(); - PartitionSpec expected = PartitionSpec.builderFor(table.schema()) - .withSpecId(1) - .alwaysNull("category", "category") - .build(); + PartitionSpec expected = + PartitionSpec.builderFor(table.schema()) + .withSpecId(1) + .alwaysNull("category", "category") + .build(); Assert.assertEquals("Should have new spec field", expected, table.spec()); } @Test public void testDropDaysPartition() { - sql("CREATE TABLE %s (id bigint NOT NULL, ts timestamp, data string) USING iceberg PARTITIONED BY (days(ts))", + sql( + "CREATE TABLE %s (id bigint NOT NULL, ts timestamp, data string) USING iceberg PARTITIONED BY (days(ts))", tableName); Table table = validationCatalog.loadTable(tableIdent); - Assert.assertEquals("Table should start with 1 partition field", 1, table.spec().fields().size()); + Assert.assertEquals( + "Table should start with 1 partition field", 1, table.spec().fields().size()); sql("ALTER TABLE %s DROP PARTITION FIELD days(ts)", tableName); table.refresh(); - PartitionSpec expected = PartitionSpec.builderFor(table.schema()) - .withSpecId(1) - .alwaysNull("ts", "ts_day") - .build(); + PartitionSpec expected = + PartitionSpec.builderFor(table.schema()).withSpecId(1).alwaysNull("ts", "ts_day").build(); Assert.assertEquals("Should have new spec field", expected, table.spec()); } @Test public void testDropBucketPartition() { - 
sql("CREATE TABLE %s (id bigint NOT NULL, data string) USING iceberg PARTITIONED BY (bucket(16, id))", + sql( + "CREATE TABLE %s (id bigint NOT NULL, data string) USING iceberg PARTITIONED BY (bucket(16, id))", tableName); Table table = validationCatalog.loadTable(tableIdent); - Assert.assertEquals("Table should start with 1 partition field", 1, table.spec().fields().size()); + Assert.assertEquals( + "Table should start with 1 partition field", 1, table.spec().fields().size()); sql("ALTER TABLE %s DROP PARTITION FIELD bucket(16, id)", tableName); table.refresh(); - PartitionSpec expected = PartitionSpec.builderFor(table.schema()) - .withSpecId(1) - .alwaysNull("id", "id_bucket") - .build(); + PartitionSpec expected = + PartitionSpec.builderFor(table.schema()) + .withSpecId(1) + .alwaysNull("id", "id_bucket") + .build(); Assert.assertEquals("Should have new spec field", expected, table.spec()); } @Test public void testDropPartitionByName() { - sql("CREATE TABLE %s (id bigint NOT NULL, category string, ts timestamp, data string) USING iceberg", tableName); + sql( + "CREATE TABLE %s (id bigint NOT NULL, category string, ts timestamp, data string) USING iceberg", + tableName); Table table = validationCatalog.loadTable(tableIdent); Assert.assertTrue("Table should start unpartitioned", table.spec().isUnpartitioned()); @@ -270,114 +284,121 @@ public void testDropPartitionByName() { table.refresh(); - PartitionSpec expected = PartitionSpec.builderFor(table.schema()) - .withSpecId(2) - .alwaysNull("id", "shard") - .build(); + PartitionSpec expected = + PartitionSpec.builderFor(table.schema()).withSpecId(2).alwaysNull("id", "shard").build(); Assert.assertEquals("Should have new spec field", expected, table.spec()); } @Test public void testReplacePartition() { - sql("CREATE TABLE %s (id bigint NOT NULL, category string, ts timestamp, data string) USING iceberg", tableName); + sql( + "CREATE TABLE %s (id bigint NOT NULL, category string, ts timestamp, data string) USING iceberg", + tableName); Table table = validationCatalog.loadTable(tableIdent); Assert.assertTrue("Table should start unpartitioned", table.spec().isUnpartitioned()); sql("ALTER TABLE %s ADD PARTITION FIELD days(ts)", tableName); table.refresh(); - PartitionSpec expected = PartitionSpec.builderFor(table.schema()) - .withSpecId(1) - .day("ts") - .build(); + PartitionSpec expected = + PartitionSpec.builderFor(table.schema()).withSpecId(1).day("ts").build(); Assert.assertEquals("Should have new spec field", expected, table.spec()); sql("ALTER TABLE %s REPLACE PARTITION FIELD days(ts) WITH hours(ts)", tableName); table.refresh(); - expected = PartitionSpec.builderFor(table.schema()) - .withSpecId(2) - .alwaysNull("ts", "ts_day") - .hour("ts") - .build(); - Assert.assertEquals("Should changed from daily to hourly partitioned field", expected, table.spec()); + expected = + PartitionSpec.builderFor(table.schema()) + .withSpecId(2) + .alwaysNull("ts", "ts_day") + .hour("ts") + .build(); + Assert.assertEquals( + "Should changed from daily to hourly partitioned field", expected, table.spec()); } @Test public void testReplacePartitionAndRename() { - sql("CREATE TABLE %s (id bigint NOT NULL, category string, ts timestamp, data string) USING iceberg", tableName); + sql( + "CREATE TABLE %s (id bigint NOT NULL, category string, ts timestamp, data string) USING iceberg", + tableName); Table table = validationCatalog.loadTable(tableIdent); Assert.assertTrue("Table should start unpartitioned", table.spec().isUnpartitioned()); sql("ALTER TABLE %s ADD 
PARTITION FIELD days(ts)", tableName); table.refresh(); - PartitionSpec expected = PartitionSpec.builderFor(table.schema()) - .withSpecId(1) - .day("ts") - .build(); + PartitionSpec expected = + PartitionSpec.builderFor(table.schema()).withSpecId(1).day("ts").build(); Assert.assertEquals("Should have new spec field", expected, table.spec()); sql("ALTER TABLE %s REPLACE PARTITION FIELD days(ts) WITH hours(ts) AS hour_col", tableName); table.refresh(); - expected = PartitionSpec.builderFor(table.schema()) - .withSpecId(2) - .alwaysNull("ts", "ts_day") - .hour("ts", "hour_col") - .build(); - Assert.assertEquals("Should changed from daily to hourly partitioned field", expected, table.spec()); + expected = + PartitionSpec.builderFor(table.schema()) + .withSpecId(2) + .alwaysNull("ts", "ts_day") + .hour("ts", "hour_col") + .build(); + Assert.assertEquals( + "Should changed from daily to hourly partitioned field", expected, table.spec()); } @Test public void testReplaceNamedPartition() { - sql("CREATE TABLE %s (id bigint NOT NULL, category string, ts timestamp, data string) USING iceberg", tableName); + sql( + "CREATE TABLE %s (id bigint NOT NULL, category string, ts timestamp, data string) USING iceberg", + tableName); Table table = validationCatalog.loadTable(tableIdent); Assert.assertTrue("Table should start unpartitioned", table.spec().isUnpartitioned()); sql("ALTER TABLE %s ADD PARTITION FIELD days(ts) AS day_col", tableName); table.refresh(); - PartitionSpec expected = PartitionSpec.builderFor(table.schema()) - .withSpecId(1) - .day("ts", "day_col") - .build(); + PartitionSpec expected = + PartitionSpec.builderFor(table.schema()).withSpecId(1).day("ts", "day_col").build(); Assert.assertEquals("Should have new spec field", expected, table.spec()); sql("ALTER TABLE %s REPLACE PARTITION FIELD day_col WITH hours(ts)", tableName); table.refresh(); - expected = PartitionSpec.builderFor(table.schema()) - .withSpecId(2) - .alwaysNull("ts", "day_col") - .hour("ts") - .build(); - Assert.assertEquals("Should changed from daily to hourly partitioned field", expected, table.spec()); + expected = + PartitionSpec.builderFor(table.schema()) + .withSpecId(2) + .alwaysNull("ts", "day_col") + .hour("ts") + .build(); + Assert.assertEquals( + "Should changed from daily to hourly partitioned field", expected, table.spec()); } @Test public void testReplaceNamedPartitionAndRenameDifferently() { - sql("CREATE TABLE %s (id bigint NOT NULL, category string, ts timestamp, data string) USING iceberg", tableName); + sql( + "CREATE TABLE %s (id bigint NOT NULL, category string, ts timestamp, data string) USING iceberg", + tableName); Table table = validationCatalog.loadTable(tableIdent); Assert.assertTrue("Table should start unpartitioned", table.spec().isUnpartitioned()); sql("ALTER TABLE %s ADD PARTITION FIELD days(ts) AS day_col", tableName); table.refresh(); - PartitionSpec expected = PartitionSpec.builderFor(table.schema()) - .withSpecId(1) - .day("ts", "day_col") - .build(); + PartitionSpec expected = + PartitionSpec.builderFor(table.schema()).withSpecId(1).day("ts", "day_col").build(); Assert.assertEquals("Should have new spec field", expected, table.spec()); sql("ALTER TABLE %s REPLACE PARTITION FIELD day_col WITH hours(ts) AS hour_col", tableName); table.refresh(); - expected = PartitionSpec.builderFor(table.schema()) - .withSpecId(2) - .alwaysNull("ts", "day_col") - .hour("ts", "hour_col") - .build(); - Assert.assertEquals("Should changed from daily to hourly partitioned field", expected, table.spec()); + 
expected = + PartitionSpec.builderFor(table.schema()) + .withSpecId(2) + .alwaysNull("ts", "day_col") + .hour("ts", "hour_col") + .build(); + Assert.assertEquals( + "Should changed from daily to hourly partitioned field", expected, table.spec()); } @Test public void testSparkTableAddDropPartitions() throws Exception { sql("CREATE TABLE %s (id bigint NOT NULL, ts timestamp, data string) USING iceberg", tableName); - Assert.assertEquals("spark table partition should be empty", 0, sparkTable().partitioning().length); + Assert.assertEquals( + "spark table partition should be empty", 0, sparkTable().partitioning().length); sql("ALTER TABLE %s ADD PARTITION FIELD bucket(16, id) AS shard", tableName); assertPartitioningEquals(sparkTable(), 1, "bucket(16, id)"); @@ -396,13 +417,16 @@ public void testSparkTableAddDropPartitions() throws Exception { sql("ALTER TABLE %s DROP PARTITION FIELD shard", tableName); sql("DESCRIBE %s", tableName); - Assert.assertEquals("spark table partition should be empty", 0, sparkTable().partitioning().length); + Assert.assertEquals( + "spark table partition should be empty", 0, sparkTable().partitioning().length); } private void assertPartitioningEquals(SparkTable table, int len, String transform) { Assert.assertEquals("spark table partition should be " + len, len, table.partitioning().length); - Assert.assertEquals("latest spark table partition transform should match", - transform, table.partitioning()[len - 1].toString()); + Assert.assertEquals( + "latest spark table partition transform should match", + transform, + table.partitioning()[len - 1].toString()); } private SparkTable sparkTable() throws Exception { diff --git a/spark/v3.1/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestAlterTableSchema.java b/spark/v3.1/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestAlterTableSchema.java index ac12953d0a7e..c993c213dc5e 100644 --- a/spark/v3.1/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestAlterTableSchema.java +++ b/spark/v3.1/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestAlterTableSchema.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.spark.extensions; import java.util.Map; @@ -28,7 +27,8 @@ import org.junit.Test; public class TestAlterTableSchema extends SparkExtensionsTestBase { - public TestAlterTableSchema(String catalogName, String implementation, Map config) { + public TestAlterTableSchema( + String catalogName, String implementation, Map config) { super(catalogName, implementation, config); } @@ -39,20 +39,25 @@ public void removeTable() { @Test public void testSetIdentifierFields() { - sql("CREATE TABLE %s (id bigint NOT NULL, " + - "location struct NOT NULL) USING iceberg", tableName); + sql( + "CREATE TABLE %s (id bigint NOT NULL, " + + "location struct NOT NULL) USING iceberg", + tableName); Table table = validationCatalog.loadTable(tableIdent); - Assert.assertTrue("Table should start without identifier", table.schema().identifierFieldIds().isEmpty()); + Assert.assertTrue( + "Table should start without identifier", table.schema().identifierFieldIds().isEmpty()); sql("ALTER TABLE %s SET IDENTIFIER FIELDS id", tableName); table.refresh(); - Assert.assertEquals("Should have new identifier field", + Assert.assertEquals( + "Should have new identifier field", Sets.newHashSet(table.schema().findField("id").fieldId()), table.schema().identifierFieldIds()); sql("ALTER TABLE %s SET IDENTIFIER FIELDS id, location.lon", tableName); table.refresh(); - Assert.assertEquals("Should have new identifier field", + Assert.assertEquals( + "Should have new identifier field", Sets.newHashSet( table.schema().findField("id").fieldId(), table.schema().findField("location.lon").fieldId()), @@ -60,7 +65,8 @@ public void testSetIdentifierFields() { sql("ALTER TABLE %s SET IDENTIFIER FIELDS location.lon", tableName); table.refresh(); - Assert.assertEquals("Should have new identifier field", + Assert.assertEquals( + "Should have new identifier field", Sets.newHashSet(table.schema().findField("location.lon").fieldId()), table.schema().identifierFieldIds()); } @@ -69,13 +75,16 @@ public void testSetIdentifierFields() { public void testSetInvalidIdentifierFields() { sql("CREATE TABLE %s (id bigint NOT NULL, id2 bigint) USING iceberg", tableName); Table table = validationCatalog.loadTable(tableIdent); - Assert.assertTrue("Table should start without identifier", table.schema().identifierFieldIds().isEmpty()); - AssertHelpers.assertThrows("should not allow setting unknown fields", + Assert.assertTrue( + "Table should start without identifier", table.schema().identifierFieldIds().isEmpty()); + AssertHelpers.assertThrows( + "should not allow setting unknown fields", IllegalArgumentException.class, "not found in current schema or added columns", () -> sql("ALTER TABLE %s SET IDENTIFIER FIELDS unknown", tableName)); - AssertHelpers.assertThrows("should not allow setting optional fields", + AssertHelpers.assertThrows( + "should not allow setting optional fields", IllegalArgumentException.class, "not a required field", () -> sql("ALTER TABLE %s SET IDENTIFIER FIELDS id2", tableName)); @@ -83,14 +92,18 @@ public void testSetInvalidIdentifierFields() { @Test public void testDropIdentifierFields() { - sql("CREATE TABLE %s (id bigint NOT NULL, " + - "location struct NOT NULL) USING iceberg", tableName); + sql( + "CREATE TABLE %s (id bigint NOT NULL, " + + "location struct NOT NULL) USING iceberg", + tableName); Table table = validationCatalog.loadTable(tableIdent); - Assert.assertTrue("Table should start without identifier", table.schema().identifierFieldIds().isEmpty()); + Assert.assertTrue( + "Table should start without 
identifier", table.schema().identifierFieldIds().isEmpty()); sql("ALTER TABLE %s SET IDENTIFIER FIELDS id, location.lon", tableName); table.refresh(); - Assert.assertEquals("Should have new identifier fields", + Assert.assertEquals( + "Should have new identifier fields", Sets.newHashSet( table.schema().findField("id").fieldId(), table.schema().findField("location.lon").fieldId()), @@ -98,13 +111,15 @@ public void testDropIdentifierFields() { sql("ALTER TABLE %s DROP IDENTIFIER FIELDS id", tableName); table.refresh(); - Assert.assertEquals("Should removed identifier field", + Assert.assertEquals( + "Should removed identifier field", Sets.newHashSet(table.schema().findField("location.lon").fieldId()), table.schema().identifierFieldIds()); sql("ALTER TABLE %s SET IDENTIFIER FIELDS id, location.lon", tableName); table.refresh(); - Assert.assertEquals("Should have new identifier fields", + Assert.assertEquals( + "Should have new identifier fields", Sets.newHashSet( table.schema().findField("id").fieldId(), table.schema().findField("location.lon").fieldId()), @@ -112,29 +127,34 @@ public void testDropIdentifierFields() { sql("ALTER TABLE %s DROP IDENTIFIER FIELDS id, location.lon", tableName); table.refresh(); - Assert.assertEquals("Should have no identifier field", - Sets.newHashSet(), - table.schema().identifierFieldIds()); + Assert.assertEquals( + "Should have no identifier field", Sets.newHashSet(), table.schema().identifierFieldIds()); } @Test public void testDropInvalidIdentifierFields() { - sql("CREATE TABLE %s (id bigint NOT NULL, data string NOT NULL, " + - "location struct NOT NULL) USING iceberg", tableName); + sql( + "CREATE TABLE %s (id bigint NOT NULL, data string NOT NULL, " + + "location struct NOT NULL) USING iceberg", + tableName); Table table = validationCatalog.loadTable(tableIdent); - Assert.assertTrue("Table should start without identifier", table.schema().identifierFieldIds().isEmpty()); - AssertHelpers.assertThrows("should not allow dropping unknown fields", + Assert.assertTrue( + "Table should start without identifier", table.schema().identifierFieldIds().isEmpty()); + AssertHelpers.assertThrows( + "should not allow dropping unknown fields", IllegalArgumentException.class, "field unknown not found", () -> sql("ALTER TABLE %s DROP IDENTIFIER FIELDS unknown", tableName)); sql("ALTER TABLE %s SET IDENTIFIER FIELDS id", tableName); - AssertHelpers.assertThrows("should not allow dropping a field that is not an identifier", + AssertHelpers.assertThrows( + "should not allow dropping a field that is not an identifier", IllegalArgumentException.class, "data is not an identifier field", () -> sql("ALTER TABLE %s DROP IDENTIFIER FIELDS data", tableName)); - AssertHelpers.assertThrows("should not allow dropping a nested field that is not an identifier", + AssertHelpers.assertThrows( + "should not allow dropping a nested field that is not an identifier", IllegalArgumentException.class, "location.lon is not an identifier field", () -> sql("ALTER TABLE %s DROP IDENTIFIER FIELDS location.lon", tableName)); diff --git a/spark/v3.1/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestAncestorsOfProcedure.java b/spark/v3.1/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestAncestorsOfProcedure.java index baf464d94ad0..d676101b1076 100644 --- a/spark/v3.1/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestAncestorsOfProcedure.java +++ 
b/spark/v3.1/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestAncestorsOfProcedure.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.extensions; import java.util.List; @@ -30,7 +29,8 @@ public class TestAncestorsOfProcedure extends SparkExtensionsTestBase { - public TestAncestorsOfProcedure(String catalogName, String implementation, Map config) { + public TestAncestorsOfProcedure( + String catalogName, String implementation, Map config) { super(catalogName, implementation, config); } @@ -51,14 +51,12 @@ public void testAncestorOfUsingEmptyArgs() { Long preSnapshotId = table.currentSnapshot().parentId(); Long preTimeStamp = table.snapshot(table.currentSnapshot().parentId()).timestampMillis(); - List output = sql("CALL %s.system.ancestors_of('%s')", - catalogName, tableIdent); + List output = sql("CALL %s.system.ancestors_of('%s')", catalogName, tableIdent); assertEquals( "Procedure output must match", ImmutableList.of( - row(currentSnapshotId, currentTimestamp), - row(preSnapshotId, preTimeStamp)), + row(currentSnapshotId, currentTimestamp), row(preSnapshotId, preTimeStamp)), output); } @@ -77,8 +75,7 @@ public void testAncestorOfUsingSnapshotId() { assertEquals( "Procedure output must match", ImmutableList.of( - row(currentSnapshotId, currentTimestamp), - row(preSnapshotId, preTimeStamp)), + row(currentSnapshotId, currentTimestamp), row(preSnapshotId, preTimeStamp)), sql("CALL %s.system.ancestors_of('%s', %dL)", catalogName, tableIdent, currentSnapshotId)); assertEquals( @@ -105,7 +102,8 @@ public void testAncestorOfWithRollBack() { Long thirdTimestamp = table.currentSnapshot().timestampMillis(); // roll back - sql("CALL %s.system.rollback_to_snapshot('%s', %dL)", + sql( + "CALL %s.system.rollback_to_snapshot('%s', %dL)", catalogName, tableIdent, secondSnapshotId); sql("INSERT INTO TABLE %s VALUES (4, 'd')", tableName); @@ -142,22 +140,29 @@ public void testAncestorOfUsingNamedArgs() { assertEquals( "Procedure output must match", ImmutableList.of(row(firstSnapshotId, firstTimestamp)), - sql("CALL %s.system.ancestors_of(snapshot_id => %dL, table => '%s')", + sql( + "CALL %s.system.ancestors_of(snapshot_id => %dL, table => '%s')", catalogName, firstSnapshotId, tableIdent)); } @Test public void testInvalidAncestorOfCases() { - AssertHelpers.assertThrows("Should reject calls without all required args", - AnalysisException.class, "Missing required parameters", + AssertHelpers.assertThrows( + "Should reject calls without all required args", + AnalysisException.class, + "Missing required parameters", () -> sql("CALL %s.system.ancestors_of()", catalogName)); - AssertHelpers.assertThrows("Should reject calls with empty table identifier", - IllegalArgumentException.class, "Cannot handle an empty identifier for argument table", + AssertHelpers.assertThrows( + "Should reject calls with empty table identifier", + IllegalArgumentException.class, + "Cannot handle an empty identifier for argument table", () -> sql("CALL %s.system.ancestors_of('')", catalogName)); - AssertHelpers.assertThrows("Should reject calls with invalid arg types", - AnalysisException.class, "Wrong arg type for snapshot_id: cannot cast", + AssertHelpers.assertThrows( + "Should reject calls with invalid arg types", + AnalysisException.class, + "Wrong arg type for snapshot_id: cannot cast", () -> sql("CALL %s.system.ancestors_of('%s', 1.1)", catalogName, tableIdent)); } } diff --git 
a/spark/v3.1/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestCallStatementParser.java b/spark/v3.1/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestCallStatementParser.java index c8c4316a1524..7bcc0884561d 100644 --- a/spark/v3.1/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestCallStatementParser.java +++ b/spark/v3.1/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestCallStatementParser.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.extensions; import java.math.BigDecimal; @@ -49,19 +48,19 @@ public class TestCallStatementParser { - @Rule - public TemporaryFolder temp = new TemporaryFolder(); + @Rule public TemporaryFolder temp = new TemporaryFolder(); private static SparkSession spark = null; private static ParserInterface parser = null; @BeforeClass public static void startSpark() { - TestCallStatementParser.spark = SparkSession.builder() - .master("local[2]") - .config("spark.sql.extensions", IcebergSparkSessionExtensions.class.getName()) - .config("spark.extra.prop", "value") - .getOrCreate(); + TestCallStatementParser.spark = + SparkSession.builder() + .master("local[2]") + .config("spark.sql.extensions", IcebergSparkSessionExtensions.class.getName()) + .config("spark.extra.prop", "value") + .getOrCreate(); TestCallStatementParser.parser = spark.sessionState().sqlParser(); } @@ -75,8 +74,10 @@ public static void stopSpark() { @Test public void testCallWithPositionalArgs() throws ParseException { - CallStatement call = (CallStatement) parser.parsePlan("CALL c.n.func(1, '2', 3L, true, 1.0D, 9.0e1, 900e-1BD)"); - Assert.assertEquals(ImmutableList.of("c", "n", "func"), JavaConverters.seqAsJavaList(call.name())); + CallStatement call = + (CallStatement) parser.parsePlan("CALL c.n.func(1, '2', 3L, true, 1.0D, 9.0e1, 900e-1BD)"); + Assert.assertEquals( + ImmutableList.of("c", "n", "func"), JavaConverters.seqAsJavaList(call.name())); Assert.assertEquals(7, call.args().size()); @@ -91,8 +92,10 @@ public void testCallWithPositionalArgs() throws ParseException { @Test public void testCallWithNamedArgs() throws ParseException { - CallStatement call = (CallStatement) parser.parsePlan("CALL cat.system.func(c1 => 1, c2 => '2', c3 => true)"); - Assert.assertEquals(ImmutableList.of("cat", "system", "func"), JavaConverters.seqAsJavaList(call.name())); + CallStatement call = + (CallStatement) parser.parsePlan("CALL cat.system.func(c1 => 1, c2 => '2', c3 => true)"); + Assert.assertEquals( + ImmutableList.of("cat", "system", "func"), JavaConverters.seqAsJavaList(call.name())); Assert.assertEquals(3, call.args().size()); @@ -104,7 +107,8 @@ public void testCallWithNamedArgs() throws ParseException { @Test public void testCallWithMixedArgs() throws ParseException { CallStatement call = (CallStatement) parser.parsePlan("CALL cat.system.func(c1 => 1, '2')"); - Assert.assertEquals(ImmutableList.of("cat", "system", "func"), JavaConverters.seqAsJavaList(call.name())); + Assert.assertEquals( + ImmutableList.of("cat", "system", "func"), JavaConverters.seqAsJavaList(call.name())); Assert.assertEquals(2, call.args().size()); @@ -114,18 +118,24 @@ public void testCallWithMixedArgs() throws ParseException { @Test public void testCallWithTimestampArg() throws ParseException { - CallStatement call = (CallStatement) parser.parsePlan("CALL cat.system.func(TIMESTAMP '2017-02-03T10:37:30.00Z')"); - Assert.assertEquals(ImmutableList.of("cat", 
"system", "func"), JavaConverters.seqAsJavaList(call.name())); + CallStatement call = + (CallStatement) + parser.parsePlan("CALL cat.system.func(TIMESTAMP '2017-02-03T10:37:30.00Z')"); + Assert.assertEquals( + ImmutableList.of("cat", "system", "func"), JavaConverters.seqAsJavaList(call.name())); Assert.assertEquals(1, call.args().size()); - checkArg(call, 0, Timestamp.from(Instant.parse("2017-02-03T10:37:30.00Z")), DataTypes.TimestampType); + checkArg( + call, 0, Timestamp.from(Instant.parse("2017-02-03T10:37:30.00Z")), DataTypes.TimestampType); } @Test public void testCallWithVarSubstitution() throws ParseException { - CallStatement call = (CallStatement) parser.parsePlan("CALL cat.system.func('${spark.extra.prop}')"); - Assert.assertEquals(ImmutableList.of("cat", "system", "func"), JavaConverters.seqAsJavaList(call.name())); + CallStatement call = + (CallStatement) parser.parsePlan("CALL cat.system.func('${spark.extra.prop}')"); + Assert.assertEquals( + ImmutableList.of("cat", "system", "func"), JavaConverters.seqAsJavaList(call.name())); Assert.assertEquals(1, call.args().size()); @@ -134,22 +144,22 @@ public void testCallWithVarSubstitution() throws ParseException { @Test public void testCallStripsComments() throws ParseException { - List callStatementsWithComments = Lists.newArrayList( - "/* bracketed comment */ CALL cat.system.func('${spark.extra.prop}')", - "/**/ CALL cat.system.func('${spark.extra.prop}')", - "-- single line comment \n CALL cat.system.func('${spark.extra.prop}')", - "-- multiple \n-- single line \n-- comments \n CALL cat.system.func('${spark.extra.prop}')", - "/* select * from multiline_comment \n where x like '%sql%'; */ CALL cat.system.func('${spark.extra.prop}')", - "/* {\"app\": \"dbt\", \"dbt_version\": \"1.0.1\", \"profile_name\": \"profile1\", \"target_name\": \"dev\", " + - "\"node_id\": \"model.profile1.stg_users\"} \n*/ CALL cat.system.func('${spark.extra.prop}')", - "/* Some multi-line comment \n" + - "*/ CALL /* inline comment */ cat.system.func('${spark.extra.prop}') -- ending comment", - "CALL -- a line ending comment\n" + - "cat.system.func('${spark.extra.prop}')" - ); + List callStatementsWithComments = + Lists.newArrayList( + "/* bracketed comment */ CALL cat.system.func('${spark.extra.prop}')", + "/**/ CALL cat.system.func('${spark.extra.prop}')", + "-- single line comment \n CALL cat.system.func('${spark.extra.prop}')", + "-- multiple \n-- single line \n-- comments \n CALL cat.system.func('${spark.extra.prop}')", + "/* select * from multiline_comment \n where x like '%sql%'; */ CALL cat.system.func('${spark.extra.prop}')", + "/* {\"app\": \"dbt\", \"dbt_version\": \"1.0.1\", \"profile_name\": \"profile1\", \"target_name\": \"dev\", " + + "\"node_id\": \"model.profile1.stg_users\"} \n*/ CALL cat.system.func('${spark.extra.prop}')", + "/* Some multi-line comment \n" + + "*/ CALL /* inline comment */ cat.system.func('${spark.extra.prop}') -- ending comment", + "CALL -- a line ending comment\n" + "cat.system.func('${spark.extra.prop}')"); for (String sqlText : callStatementsWithComments) { CallStatement call = (CallStatement) parser.parsePlan(sqlText); - Assert.assertEquals(ImmutableList.of("cat", "system", "func"), JavaConverters.seqAsJavaList(call.name())); + Assert.assertEquals( + ImmutableList.of("cat", "system", "func"), JavaConverters.seqAsJavaList(call.name())); Assert.assertEquals(1, call.args().size()); @@ -159,17 +169,24 @@ public void testCallStripsComments() throws ParseException { @Test public void testCallParseError() { - 
AssertHelpers.assertThrows("Should fail with a sensible parse error", IcebergParseException.class, + AssertHelpers.assertThrows( + "Should fail with a sensible parse error", + IcebergParseException.class, "missing '(' at 'radish'", () -> parser.parsePlan("CALL cat.system radish kebab")); } - private void checkArg(CallStatement call, int index, Object expectedValue, DataType expectedType) { + private void checkArg( + CallStatement call, int index, Object expectedValue, DataType expectedType) { checkArg(call, index, null, expectedValue, expectedType); } - private void checkArg(CallStatement call, int index, String expectedName, - Object expectedValue, DataType expectedType) { + private void checkArg( + CallStatement call, + int index, + String expectedName, + Object expectedValue, + DataType expectedType) { if (expectedName != null) { NamedArgument arg = checkCast(call.args().apply(index), NamedArgument.class); @@ -190,7 +207,8 @@ private Literal toSparkLiteral(Object value, DataType dataType) { } private T checkCast(Object value, Class expectedClass) { - Assert.assertTrue("Expected instance of " + expectedClass.getName(), expectedClass.isInstance(value)); + Assert.assertTrue( + "Expected instance of " + expectedClass.getName(), expectedClass.isInstance(value)); return expectedClass.cast(value); } } diff --git a/spark/v3.1/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestCherrypickSnapshotProcedure.java b/spark/v3.1/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestCherrypickSnapshotProcedure.java index c69964693189..7309a176b922 100644 --- a/spark/v3.1/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestCherrypickSnapshotProcedure.java +++ b/spark/v3.1/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestCherrypickSnapshotProcedure.java @@ -16,9 +16,10 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.spark.extensions; +import static org.apache.iceberg.TableProperties.WRITE_AUDIT_PUBLISH_ENABLED; + import java.util.List; import java.util.Map; import org.apache.iceberg.AssertHelpers; @@ -34,11 +35,10 @@ import org.junit.After; import org.junit.Test; -import static org.apache.iceberg.TableProperties.WRITE_AUDIT_PUBLISH_ENABLED; - public class TestCherrypickSnapshotProcedure extends SparkExtensionsTestBase { - public TestCherrypickSnapshotProcedure(String catalogName, String implementation, Map config) { + public TestCherrypickSnapshotProcedure( + String catalogName, String implementation, Map config) { super(catalogName, implementation, config); } @@ -56,26 +56,30 @@ public void testCherrypickSnapshotUsingPositionalArgs() { sql("INSERT INTO TABLE %s VALUES (1, 'a')", tableName); - assertEquals("Should not see rows from staged snapshot", + assertEquals( + "Should not see rows from staged snapshot", ImmutableList.of(), sql("SELECT * FROM %s", tableName)); Table table = validationCatalog.loadTable(tableIdent); Snapshot wapSnapshot = Iterables.getOnlyElement(table.snapshots()); - List output = sql( - "CALL %s.system.cherrypick_snapshot('%s', %dL)", - catalogName, tableIdent, wapSnapshot.snapshotId()); + List output = + sql( + "CALL %s.system.cherrypick_snapshot('%s', %dL)", + catalogName, tableIdent, wapSnapshot.snapshotId()); table.refresh(); Snapshot currentSnapshot = table.currentSnapshot(); - assertEquals("Procedure output must match", + assertEquals( + "Procedure output must match", ImmutableList.of(row(wapSnapshot.snapshotId(), currentSnapshot.snapshotId())), output); - assertEquals("Cherrypick must be successful", + assertEquals( + "Cherrypick must be successful", ImmutableList.of(row(1L, "a")), sql("SELECT * FROM %s", tableName)); } @@ -89,26 +93,30 @@ public void testCherrypickSnapshotUsingNamedArgs() { sql("INSERT INTO TABLE %s VALUES (1, 'a')", tableName); - assertEquals("Should not see rows from staged snapshot", + assertEquals( + "Should not see rows from staged snapshot", ImmutableList.of(), sql("SELECT * FROM %s", tableName)); Table table = validationCatalog.loadTable(tableIdent); Snapshot wapSnapshot = Iterables.getOnlyElement(table.snapshots()); - List output = sql( - "CALL %s.system.cherrypick_snapshot(snapshot_id => %dL, table => '%s')", - catalogName, wapSnapshot.snapshotId(), tableIdent); + List output = + sql( + "CALL %s.system.cherrypick_snapshot(snapshot_id => %dL, table => '%s')", + catalogName, wapSnapshot.snapshotId(), tableIdent); table.refresh(); Snapshot currentSnapshot = table.currentSnapshot(); - assertEquals("Procedure output must match", + assertEquals( + "Procedure output must match", ImmutableList.of(row(wapSnapshot.snapshotId(), currentSnapshot.snapshotId())), output); - assertEquals("Cherrypick must be successful", + assertEquals( + "Cherrypick must be successful", ImmutableList.of(row(1L, "a")), sql("SELECT * FROM %s", tableName)); } @@ -129,17 +137,20 @@ public void testCherrypickSnapshotRefreshesRelationCache() { sql("INSERT INTO TABLE %s VALUES (1, 'a')", tableName); - assertEquals("Should not see rows from staged snapshot", + assertEquals( + "Should not see rows from staged snapshot", ImmutableList.of(), sql("SELECT * FROM %s", tableName)); Table table = validationCatalog.loadTable(tableIdent); Snapshot wapSnapshot = Iterables.getOnlyElement(table.snapshots()); - sql("CALL %s.system.cherrypick_snapshot('%s', %dL)", + sql( + "CALL %s.system.cherrypick_snapshot('%s', %dL)", catalogName, tableIdent, 
wapSnapshot.snapshotId()); - assertEquals("Cherrypick snapshot should be visible", + assertEquals( + "Cherrypick snapshot should be visible", ImmutableList.of(row(1L, "a")), sql("SELECT * FROM tmp")); @@ -150,31 +161,43 @@ public void testCherrypickSnapshotRefreshesRelationCache() { public void testCherrypickInvalidSnapshot() { sql("CREATE TABLE %s (id bigint NOT NULL, data string) USING iceberg", tableName); - AssertHelpers.assertThrows("Should reject invalid snapshot id", - ValidationException.class, "Cannot cherry-pick unknown snapshot ID", + AssertHelpers.assertThrows( + "Should reject invalid snapshot id", + ValidationException.class, + "Cannot cherry-pick unknown snapshot ID", () -> sql("CALL %s.system.cherrypick_snapshot('%s', -1L)", catalogName, tableIdent)); } @Test public void testInvalidCherrypickSnapshotCases() { - AssertHelpers.assertThrows("Should not allow mixed args", - AnalysisException.class, "Named and positional arguments cannot be mixed", + AssertHelpers.assertThrows( + "Should not allow mixed args", + AnalysisException.class, + "Named and positional arguments cannot be mixed", () -> sql("CALL %s.system.cherrypick_snapshot('n', table => 't', 1L)", catalogName)); - AssertHelpers.assertThrows("Should not resolve procedures in arbitrary namespaces", - NoSuchProcedureException.class, "not found", + AssertHelpers.assertThrows( + "Should not resolve procedures in arbitrary namespaces", + NoSuchProcedureException.class, + "not found", () -> sql("CALL %s.custom.cherrypick_snapshot('n', 't', 1L)", catalogName)); - AssertHelpers.assertThrows("Should reject calls without all required args", - AnalysisException.class, "Missing required parameters", + AssertHelpers.assertThrows( + "Should reject calls without all required args", + AnalysisException.class, + "Missing required parameters", () -> sql("CALL %s.system.cherrypick_snapshot('t')", catalogName)); - AssertHelpers.assertThrows("Should reject calls with empty table identifier", - IllegalArgumentException.class, "Cannot handle an empty identifier", + AssertHelpers.assertThrows( + "Should reject calls with empty table identifier", + IllegalArgumentException.class, + "Cannot handle an empty identifier", () -> sql("CALL %s.system.cherrypick_snapshot('', 1L)", catalogName)); - AssertHelpers.assertThrows("Should reject calls with invalid arg types", - AnalysisException.class, "Wrong arg type for snapshot_id: cannot cast", + AssertHelpers.assertThrows( + "Should reject calls with invalid arg types", + AnalysisException.class, + "Wrong arg type for snapshot_id: cannot cast", () -> sql("CALL %s.system.cherrypick_snapshot('t', 2.2)", catalogName)); } } diff --git a/spark/v3.1/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestCopyOnWriteDelete.java b/spark/v3.1/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestCopyOnWriteDelete.java index c9d15906251f..8a8a8c6ab722 100644 --- a/spark/v3.1/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestCopyOnWriteDelete.java +++ b/spark/v3.1/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestCopyOnWriteDelete.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.spark.extensions; import java.util.Map; @@ -25,8 +24,13 @@ public class TestCopyOnWriteDelete extends TestDelete { - public TestCopyOnWriteDelete(String catalogName, String implementation, Map config, - String fileFormat, Boolean vectorized, String distributionMode) { + public TestCopyOnWriteDelete( + String catalogName, + String implementation, + Map config, + String fileFormat, + Boolean vectorized, + String distributionMode) { super(catalogName, implementation, config, fileFormat, vectorized, distributionMode); } diff --git a/spark/v3.1/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestCopyOnWriteMerge.java b/spark/v3.1/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestCopyOnWriteMerge.java index 60aba632646f..27cbd1a9d5de 100644 --- a/spark/v3.1/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestCopyOnWriteMerge.java +++ b/spark/v3.1/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestCopyOnWriteMerge.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.extensions; import java.util.Map; @@ -25,8 +24,13 @@ public class TestCopyOnWriteMerge extends TestMerge { - public TestCopyOnWriteMerge(String catalogName, String implementation, Map config, - String fileFormat, boolean vectorized, String distributionMode) { + public TestCopyOnWriteMerge( + String catalogName, + String implementation, + Map config, + String fileFormat, + boolean vectorized, + String distributionMode) { super(catalogName, implementation, config, fileFormat, vectorized, distributionMode); } diff --git a/spark/v3.1/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestCopyOnWriteUpdate.java b/spark/v3.1/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestCopyOnWriteUpdate.java index cc73ecba9ddf..3fa3f74f6a39 100644 --- a/spark/v3.1/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestCopyOnWriteUpdate.java +++ b/spark/v3.1/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestCopyOnWriteUpdate.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.extensions; import java.util.Map; @@ -25,8 +24,13 @@ public class TestCopyOnWriteUpdate extends TestUpdate { - public TestCopyOnWriteUpdate(String catalogName, String implementation, Map config, - String fileFormat, boolean vectorized, String distributionMode) { + public TestCopyOnWriteUpdate( + String catalogName, + String implementation, + Map config, + String fileFormat, + boolean vectorized, + String distributionMode) { super(catalogName, implementation, config, fileFormat, vectorized, distributionMode); } diff --git a/spark/v3.1/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestDelete.java b/spark/v3.1/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestDelete.java index cd4392177b12..27435137ba85 100644 --- a/spark/v3.1/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestDelete.java +++ b/spark/v3.1/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestDelete.java @@ -16,9 +16,13 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.spark.extensions; +import static org.apache.iceberg.TableProperties.DELETE_ISOLATION_LEVEL; +import static org.apache.iceberg.TableProperties.PARQUET_ROW_GROUP_SIZE_BYTES; +import static org.apache.iceberg.TableProperties.SPLIT_SIZE; +import static org.apache.spark.sql.functions.lit; + import java.util.Arrays; import java.util.List; import java.util.Map; @@ -51,15 +55,15 @@ import org.junit.Ignore; import org.junit.Test; -import static org.apache.iceberg.TableProperties.DELETE_ISOLATION_LEVEL; -import static org.apache.iceberg.TableProperties.PARQUET_ROW_GROUP_SIZE_BYTES; -import static org.apache.iceberg.TableProperties.SPLIT_SIZE; -import static org.apache.spark.sql.functions.lit; - public abstract class TestDelete extends SparkRowLevelOperationsTestBase { - public TestDelete(String catalogName, String implementation, Map config, - String fileFormat, Boolean vectorized, String distributionMode) { + public TestDelete( + String catalogName, + String implementation, + Map config, + String fileFormat, + Boolean vectorized, + String distributionMode) { super(catalogName, implementation, config, fileFormat, vectorized, distributionMode); } @@ -85,7 +89,8 @@ public void testDeleteFromEmptyTable() { Table table = validationCatalog.loadTable(tableIdent); Assert.assertEquals("Should have 2 snapshots", 2, Iterables.size(table.snapshots())); - assertEquals("Should have expected rows", + assertEquals( + "Should have expected rows", ImmutableList.of(), sql("SELECT * FROM %s ORDER BY id", tableName)); } @@ -103,7 +108,8 @@ public void testExplain() { Table table = validationCatalog.loadTable(tableIdent); Assert.assertEquals("Should have 1 snapshot", 1, Iterables.size(table.snapshots())); - assertEquals("Should have expected rows", + assertEquals( + "Should have expected rows", ImmutableList.of(row(1, "hr"), row(2, "hardware"), row(null, "hr")), sql("SELECT * FROM %s ORDER BY id ASC NULLS LAST", tableName)); } @@ -116,7 +122,8 @@ public void testDeleteWithAlias() { sql("DELETE FROM %s AS t WHERE t.id IS NULL", tableName); - assertEquals("Should have expected rows", + assertEquals( + "Should have expected rows", ImmutableList.of(row(1, "hr"), row(2, "hardware")), sql("SELECT * FROM %s ORDER BY id", tableName)); } @@ -136,7 +143,8 @@ public void testDeleteWithDynamicFileFiltering() throws NoSuchTableException { Snapshot currentSnapshot = table.currentSnapshot(); validateSnapshot(currentSnapshot, "overwrite", "1", "1", "1"); - assertEquals("Should have expected rows", + assertEquals( + "Should have expected rows", ImmutableList.of(row(1, "hardware"), row(1, "hr"), row(3, "hr")), sql("SELECT * FROM %s ORDER BY id, dep", tableName)); } @@ -155,7 +163,8 @@ public void testDeleteNonExistingRecords() { Snapshot currentSnapshot = table.currentSnapshot(); validateSnapshot(currentSnapshot, "overwrite", "0", null, null); - assertEquals("Should have expected rows", + assertEquals( + "Should have expected rows", ImmutableList.of(row(1, "hr"), row(2, "hardware"), row(null, "hr")), sql("SELECT * FROM %s ORDER BY id ASC NULLS LAST", tableName)); } @@ -177,9 +186,8 @@ public void testDeleteWithoutCondition() { Snapshot currentSnapshot = table.currentSnapshot(); validateSnapshot(currentSnapshot, "delete", "2", "3", null); - assertEquals("Should have expected rows", - ImmutableList.of(), - sql("SELECT * FROM %s", tableName)); + assertEquals( + "Should have expected rows", ImmutableList.of(), sql("SELECT * FROM %s", tableName)); } @Test @@ -199,7 +207,8 @@ public void 
testDeleteUsingMetadataWithComplexCondition() { Snapshot currentSnapshot = table.currentSnapshot(); validateSnapshot(currentSnapshot, "delete", "2", "2", null); - assertEquals("Should have expected rows", + assertEquals( + "Should have expected rows", ImmutableList.of(row(1, "dep1")), sql("SELECT * FROM %s", tableName)); } @@ -222,7 +231,8 @@ public void testDeleteWithArbitraryPartitionPredicates() { Snapshot currentSnapshot = table.currentSnapshot(); validateSnapshot(currentSnapshot, "overwrite", "1", "1", null); - assertEquals("Should have expected rows", + assertEquals( + "Should have expected rows", ImmutableList.of(row(1, "hr"), row(null, "hr")), sql("SELECT * FROM %s ORDER BY id ASC NULLS LAST", tableName)); } @@ -233,8 +243,10 @@ public void testDeleteWithNonDeterministicCondition() { sql("INSERT INTO TABLE %s VALUES (1, 'hr'), (2, 'hardware')", tableName); - AssertHelpers.assertThrows("Should complain about non-deterministic expressions", - AnalysisException.class, "nondeterministic expressions are only allowed", + AssertHelpers.assertThrows( + "Should complain about non-deterministic expressions", + AnalysisException.class, + "nondeterministic expressions are only allowed", () -> sql("DELETE FROM %s WHERE id = 1 AND rand() > 0.5", tableName)); } @@ -246,25 +258,29 @@ public void testDeleteWithFoldableConditions() { // should keep all rows and don't trigger execution sql("DELETE FROM %s WHERE false", tableName); - assertEquals("Should have expected rows", + assertEquals( + "Should have expected rows", ImmutableList.of(row(1, "hr"), row(2, "hardware")), sql("SELECT * FROM %s ORDER BY id", tableName)); // should keep all rows and don't trigger execution sql("DELETE FROM %s WHERE 50 <> 50", tableName); - assertEquals("Should have expected rows", + assertEquals( + "Should have expected rows", ImmutableList.of(row(1, "hr"), row(2, "hardware")), sql("SELECT * FROM %s ORDER BY id", tableName)); // should keep all rows and don't trigger execution sql("DELETE FROM %s WHERE 1 > null", tableName); - assertEquals("Should have expected rows", + assertEquals( + "Should have expected rows", ImmutableList.of(row(1, "hr"), row(2, "hardware")), sql("SELECT * FROM %s ORDER BY id", tableName)); // should remove all rows sql("DELETE FROM %s WHERE 21 = 21", tableName); - assertEquals("Should have expected rows", + assertEquals( + "Should have expected rows", ImmutableList.of(), sql("SELECT * FROM %s ORDER BY id", tableName)); @@ -276,24 +292,29 @@ public void testDeleteWithFoldableConditions() { public void testDeleteWithNullConditions() { createAndInitPartitionedTable(); - sql("INSERT INTO TABLE %s VALUES (0, null), (1, 'hr'), (2, 'hardware'), (null, 'hr')", tableName); + sql( + "INSERT INTO TABLE %s VALUES (0, null), (1, 'hr'), (2, 'hardware'), (null, 'hr')", + tableName); // should keep all rows as null is never equal to null sql("DELETE FROM %s WHERE dep = null", tableName); - assertEquals("Should have expected rows", + assertEquals( + "Should have expected rows", ImmutableList.of(row(0, null), row(1, "hr"), row(2, "hardware"), row(null, "hr")), sql("SELECT * FROM %s ORDER BY id ASC NULLS LAST", tableName)); // null = 'software' -> null // should delete using metadata operation only sql("DELETE FROM %s WHERE dep = 'software'", tableName); - assertEquals("Should have expected rows", + assertEquals( + "Should have expected rows", ImmutableList.of(row(0, null), row(1, "hr"), row(2, "hardware"), row(null, "hr")), sql("SELECT * FROM %s ORDER BY id ASC NULLS LAST", tableName)); // should delete using 
metadata operation only sql("DELETE FROM %s WHERE dep <=> NULL", tableName); - assertEquals("Should have expected rows", + assertEquals( + "Should have expected rows", ImmutableList.of(row(1, "hr"), row(2, "hardware"), row(null, "hr")), sql("SELECT * FROM %s ORDER BY id ASC NULLS LAST", tableName)); @@ -311,17 +332,20 @@ public void testDeleteWithInAndNotInConditions() { sql("INSERT INTO TABLE %s VALUES (1, 'hr'), (2, 'hardware'), (null, 'hr')", tableName); sql("DELETE FROM %s WHERE id IN (1, null)", tableName); - assertEquals("Should have expected rows", + assertEquals( + "Should have expected rows", ImmutableList.of(row(2, "hardware"), row(null, "hr")), sql("SELECT * FROM %s ORDER BY id ASC NULLS LAST", tableName)); sql("DELETE FROM %s WHERE id NOT IN (null, 1)", tableName); - assertEquals("Should have expected rows", + assertEquals( + "Should have expected rows", ImmutableList.of(row(2, "hardware"), row(null, "hr")), sql("SELECT * FROM %s ORDER BY id ASC NULLS LAST", tableName)); sql("DELETE FROM %s WHERE id NOT IN (1, 10)", tableName); - assertEquals("Should have expected rows", + assertEquals( + "Should have expected rows", ImmutableList.of(row(null, "hr")), sql("SELECT * FROM %s ORDER BY id ASC NULLS LAST", tableName)); } @@ -332,16 +356,20 @@ public void testDeleteWithMultipleRowGroupsParquet() throws NoSuchTableException createAndInitPartitionedTable(); - sql("ALTER TABLE %s SET TBLPROPERTIES('%s' '%d')", tableName, PARQUET_ROW_GROUP_SIZE_BYTES, 100); + sql( + "ALTER TABLE %s SET TBLPROPERTIES('%s' '%d')", + tableName, PARQUET_ROW_GROUP_SIZE_BYTES, 100); sql("ALTER TABLE %s SET TBLPROPERTIES('%s' '%d')", tableName, SPLIT_SIZE, 100); List ids = Lists.newArrayList(); for (int id = 1; id <= 200; id++) { ids.add(id); } - Dataset df = spark.createDataset(ids, Encoders.INT()) - .withColumnRenamed("value", "id") - .withColumn("dep", lit("hr")); + Dataset df = + spark + .createDataset(ids, Encoders.INT()) + .withColumnRenamed("value", "id") + .withColumn("dep", lit("hr")); df.coalesce(1).writeTo(tableName).append(); Assert.assertEquals(200, spark.table(tableName).count()); @@ -360,14 +388,12 @@ public void testDeleteWithConditionOnNestedColumn() { sql("INSERT INTO TABLE %s VALUES (2, named_struct(\"c1\", 2, \"c2\", \"v2\"))", tableName); sql("DELETE FROM %s WHERE complex.c1 = id + 2", tableName); - assertEquals("Should have expected rows", - ImmutableList.of(row(2)), - sql("SELECT id FROM %s", tableName)); + assertEquals( + "Should have expected rows", ImmutableList.of(row(2)), sql("SELECT id FROM %s", tableName)); sql("DELETE FROM %s t WHERE t.complex.c1 = id", tableName); - assertEquals("Should have expected rows", - ImmutableList.of(), - sql("SELECT id FROM %s", tableName)); + assertEquals( + "Should have expected rows", ImmutableList.of(), sql("SELECT id FROM %s", tableName)); } @Test @@ -379,28 +405,35 @@ public void testDeleteWithInSubquery() throws NoSuchTableException { createOrReplaceView("deleted_id", Arrays.asList(0, 1, null), Encoders.INT()); createOrReplaceView("deleted_dep", Arrays.asList("software", "hr"), Encoders.STRING()); - sql("DELETE FROM %s WHERE id IN (SELECT * FROM deleted_id) AND dep IN (SELECT * from deleted_dep)", tableName); - assertEquals("Should have expected rows", + sql( + "DELETE FROM %s WHERE id IN (SELECT * FROM deleted_id) AND dep IN (SELECT * from deleted_dep)", + tableName); + assertEquals( + "Should have expected rows", ImmutableList.of(row(2, "hardware"), row(null, "hr")), sql("SELECT * FROM %s ORDER BY id ASC NULLS LAST", tableName)); append(new 
Employee(1, "hr"), new Employee(-1, "hr")); - assertEquals("Should have expected rows", + assertEquals( + "Should have expected rows", ImmutableList.of(row(-1, "hr"), row(1, "hr"), row(2, "hardware"), row(null, "hr")), sql("SELECT * FROM %s ORDER BY id ASC NULLS LAST", tableName)); sql("DELETE FROM %s WHERE id IS NULL OR id IN (SELECT value + 2 FROM deleted_id)", tableName); - assertEquals("Should have expected rows", + assertEquals( + "Should have expected rows", ImmutableList.of(row(-1, "hr"), row(1, "hr")), sql("SELECT * FROM %s ORDER BY id", tableName)); append(new Employee(null, "hr"), new Employee(2, "hr")); - assertEquals("Should have expected rows", + assertEquals( + "Should have expected rows", ImmutableList.of(row(-1, "hr"), row(1, "hr"), row(2, "hr"), row(null, "hr")), sql("SELECT * FROM %s ORDER BY id ASC NULLS LAST", tableName)); sql("DELETE FROM %s WHERE id IN (SELECT value + 2 FROM deleted_id) AND dep = 'hr'", tableName); - assertEquals("Should have expected rows", + assertEquals( + "Should have expected rows", ImmutableList.of(row(-1, "hr"), row(1, "hr"), row(null, "hr")), sql("SELECT * FROM %s ORDER BY id ASC NULLS LAST", tableName)); } @@ -411,11 +444,13 @@ public void testDeleteWithMultiColumnInSubquery() throws NoSuchTableException { append(new Employee(1, "hr"), new Employee(2, "hardware"), new Employee(null, "hr")); - List deletedEmployees = Arrays.asList(new Employee(null, "hr"), new Employee(1, "hr")); + List deletedEmployees = + Arrays.asList(new Employee(null, "hr"), new Employee(1, "hr")); createOrReplaceView("deleted_employee", deletedEmployees, Encoders.bean(Employee.class)); sql("DELETE FROM %s WHERE (id, dep) IN (SELECT id, dep FROM deleted_employee)", tableName); - assertEquals("Should have expected rows", + assertEquals( + "Should have expected rows", ImmutableList.of(row(2, "hardware"), row(null, "hr")), sql("SELECT * FROM %s ORDER BY id ASC NULLS LAST", tableName)); } @@ -431,36 +466,50 @@ public void testDeleteWithNotInSubquery() throws NoSuchTableException { // the file filter subquery (nested loop lef-anti join) returns 0 records sql("DELETE FROM %s WHERE id NOT IN (SELECT * FROM deleted_id)", tableName); - assertEquals("Should have expected rows", + assertEquals( + "Should have expected rows", ImmutableList.of(row(1, "hr"), row(2, "hardware"), row(null, "hr")), sql("SELECT * FROM %s ORDER BY id ASC NULLS LAST", tableName)); - sql("DELETE FROM %s WHERE id NOT IN (SELECT * FROM deleted_id WHERE value IS NOT NULL)", tableName); - assertEquals("Should have expected rows", + sql( + "DELETE FROM %s WHERE id NOT IN (SELECT * FROM deleted_id WHERE value IS NOT NULL)", + tableName); + assertEquals( + "Should have expected rows", ImmutableList.of(row(null, "hr")), sql("SELECT * FROM %s ORDER BY id ASC NULLS LAST", tableName)); sql("INSERT INTO TABLE %s VALUES (1, 'hr'), (2, 'hardware'), (null, 'hr')", tableName); - assertEquals("Should have expected rows", + assertEquals( + "Should have expected rows", ImmutableList.of(row(1, "hr"), row(2, "hardware"), row(null, "hr"), row(null, "hr")), sql("SELECT * FROM %s ORDER BY id ASC NULLS LAST", tableName)); - sql("DELETE FROM %s WHERE id NOT IN (SELECT * FROM deleted_id) OR dep IN ('software', 'hr')", tableName); - assertEquals("Should have expected rows", + sql( + "DELETE FROM %s WHERE id NOT IN (SELECT * FROM deleted_id) OR dep IN ('software', 'hr')", + tableName); + assertEquals( + "Should have expected rows", ImmutableList.of(row(2, "hardware")), sql("SELECT * FROM %s ORDER BY id ASC NULLS LAST", tableName)); - 
sql("DELETE FROM %s t WHERE " + - "id NOT IN (SELECT * FROM deleted_id WHERE value IS NOT NULL) AND " + - "EXISTS (SELECT 1 FROM FROM deleted_dep WHERE t.dep = deleted_dep.value)", tableName); - assertEquals("Should have expected rows", + sql( + "DELETE FROM %s t WHERE " + + "id NOT IN (SELECT * FROM deleted_id WHERE value IS NOT NULL) AND " + + "EXISTS (SELECT 1 FROM FROM deleted_dep WHERE t.dep = deleted_dep.value)", + tableName); + assertEquals( + "Should have expected rows", ImmutableList.of(row(2, "hardware")), sql("SELECT * FROM %s ORDER BY id ASC NULLS LAST", tableName)); - sql("DELETE FROM %s t WHERE " + - "id NOT IN (SELECT * FROM deleted_id WHERE value IS NOT NULL) OR " + - "EXISTS (SELECT 1 FROM FROM deleted_dep WHERE t.dep = deleted_dep.value)", tableName); - assertEquals("Should have expected rows", + sql( + "DELETE FROM %s t WHERE " + + "id NOT IN (SELECT * FROM deleted_id WHERE value IS NOT NULL) OR " + + "EXISTS (SELECT 1 FROM FROM deleted_dep WHERE t.dep = deleted_dep.value)", + tableName); + assertEquals( + "Should have expected rows", ImmutableList.of(), sql("SELECT * FROM %s ORDER BY id ASC NULLS LAST", tableName)); } @@ -473,8 +522,10 @@ public void testDeleteWithNotInSubqueryNotSupported() throws NoSuchTableExceptio createOrReplaceView("deleted_id", Arrays.asList(-1, -2, null), Encoders.INT()); - AssertHelpers.assertThrows("Should complain about NOT IN subquery", - AnalysisException.class, "Null-aware predicate subqueries are not currently supported", + AssertHelpers.assertThrows( + "Should complain about NOT IN subquery", + AnalysisException.class, + "Null-aware predicate subqueries are not currently supported", () -> sql("DELETE FROM %s WHERE id NOT IN (SELECT * FROM deleted_id)", tableName)); } @@ -482,8 +533,10 @@ public void testDeleteWithNotInSubqueryNotSupported() throws NoSuchTableExceptio public void testDeleteOnNonIcebergTableNotSupported() throws NoSuchTableException { createOrReplaceView("testtable", "{ \"c1\": -100, \"c2\": -200 }"); - AssertHelpers.assertThrows("Delete is not supported for non iceberg table", - AnalysisException.class, "DELETE is only supported with v2 tables.", + AssertHelpers.assertThrows( + "Delete is not supported for non iceberg table", + AnalysisException.class, + "DELETE is only supported with v2 tables.", () -> sql("DELETE FROM %s WHERE c1 = -100", "testtable")); } @@ -496,25 +549,37 @@ public void testDeleteWithExistSubquery() throws NoSuchTableException { createOrReplaceView("deleted_id", Arrays.asList(-1, -2, null), Encoders.INT()); createOrReplaceView("deleted_dep", Arrays.asList("software", "hr"), Encoders.STRING()); - sql("DELETE FROM %s t WHERE EXISTS (SELECT 1 FROM deleted_id d WHERE t.id = d.value)", tableName); - assertEquals("Should have expected rows", + sql( + "DELETE FROM %s t WHERE EXISTS (SELECT 1 FROM deleted_id d WHERE t.id = d.value)", + tableName); + assertEquals( + "Should have expected rows", ImmutableList.of(row(1, "hr"), row(2, "hardware"), row(null, "hr")), sql("SELECT * FROM %s ORDER BY id ASC NULLS LAST", tableName)); - sql("DELETE FROM %s t WHERE EXISTS (SELECT 1 FROM deleted_id d WHERE t.id = d.value + 2)", tableName); - assertEquals("Should have expected rows", + sql( + "DELETE FROM %s t WHERE EXISTS (SELECT 1 FROM deleted_id d WHERE t.id = d.value + 2)", + tableName); + assertEquals( + "Should have expected rows", ImmutableList.of(row(2, "hardware"), row(null, "hr")), sql("SELECT * FROM %s ORDER BY id ASC NULLS LAST", tableName)); - sql("DELETE FROM %s t WHERE EXISTS (SELECT 1 FROM deleted_id d 
WHERE t.id = d.value) OR t.id IS NULL", tableName); - assertEquals("Should have expected rows", + sql( + "DELETE FROM %s t WHERE EXISTS (SELECT 1 FROM deleted_id d WHERE t.id = d.value) OR t.id IS NULL", + tableName); + assertEquals( + "Should have expected rows", ImmutableList.of(row(2, "hardware")), sql("SELECT * FROM %s", tableName)); - sql("DELETE FROM %s t WHERE " + - "EXISTS (SELECT 1 FROM deleted_id di WHERE t.id = di.value) AND " + - "EXISTS (SELECT 1 FROM deleted_dep dd WHERE t.dep = dd.value)", tableName); - assertEquals("Should have expected rows", + sql( + "DELETE FROM %s t WHERE " + + "EXISTS (SELECT 1 FROM deleted_id di WHERE t.id = di.value) AND " + + "EXISTS (SELECT 1 FROM deleted_dep dd WHERE t.dep = dd.value)", + tableName); + assertEquals( + "Should have expected rows", ImmutableList.of(row(2, "hardware")), sql("SELECT * FROM %s", tableName)); } @@ -528,21 +593,28 @@ public void testDeleteWithNotExistsSubquery() throws NoSuchTableException { createOrReplaceView("deleted_id", Arrays.asList(-1, -2, null), Encoders.INT()); createOrReplaceView("deleted_dep", Arrays.asList("software", "hr"), Encoders.STRING()); - sql("DELETE FROM %s t WHERE " + - "NOT EXISTS (SELECT 1 FROM deleted_id di WHERE t.id = di.value + 2) AND " + - "NOT EXISTS (SELECT 1 FROM deleted_dep dd WHERE t.dep = dd.value)", tableName); - assertEquals("Should have expected rows", + sql( + "DELETE FROM %s t WHERE " + + "NOT EXISTS (SELECT 1 FROM deleted_id di WHERE t.id = di.value + 2) AND " + + "NOT EXISTS (SELECT 1 FROM deleted_dep dd WHERE t.dep = dd.value)", + tableName); + assertEquals( + "Should have expected rows", ImmutableList.of(row(1, "hr"), row(null, "hr")), sql("SELECT * FROM %s ORDER BY id ASC NULLS LAST", tableName)); - sql("DELETE FROM %s t WHERE NOT EXISTS (SELECT 1 FROM deleted_id d WHERE t.id = d.value + 2)", tableName); - assertEquals("Should have expected rows", + sql( + "DELETE FROM %s t WHERE NOT EXISTS (SELECT 1 FROM deleted_id d WHERE t.id = d.value + 2)", + tableName); + assertEquals( + "Should have expected rows", ImmutableList.of(row(1, "hr")), sql("SELECT * FROM %s ORDER BY id ASC NULLS LAST", tableName)); String subquery = "SELECT 1 FROM deleted_id d WHERE t.id = d.value + 2"; sql("DELETE FROM %s t WHERE NOT EXISTS (%s) OR t.id = 1", tableName, subquery); - assertEquals("Should have expected rows", + assertEquals( + "Should have expected rows", ImmutableList.of(), sql("SELECT * FROM %s ORDER BY id ASC NULLS LAST", tableName)); } @@ -556,7 +628,8 @@ public void testDeleteWithScalarSubquery() throws NoSuchTableException { createOrReplaceView("deleted_id", Arrays.asList(1, 100, null), Encoders.INT()); sql("DELETE FROM %s t WHERE id <= (SELECT min(value) FROM deleted_id)", tableName); - assertEquals("Should have expected rows", + assertEquals( + "Should have expected rows", ImmutableList.of(row(2, "hardware"), row(null, "hr")), sql("SELECT * FROM %s ORDER BY id ASC NULLS LAST", tableName)); } @@ -591,52 +664,61 @@ public synchronized void testDeleteWithSerializableIsolation() throws Interrupte createAndInitUnpartitionedTable(); - sql("ALTER TABLE %s SET TBLPROPERTIES('%s' '%s')", tableName, DELETE_ISOLATION_LEVEL, "serializable"); + sql( + "ALTER TABLE %s SET TBLPROPERTIES('%s' '%s')", + tableName, DELETE_ISOLATION_LEVEL, "serializable"); // Pre-populate the table to force it to use the Spark Writers instead of Metadata-Only Delete // for more consistent exception stack List ids = ImmutableList.of(1, 2); - Dataset inputDF = spark.createDataset(ids, Encoders.INT()) - 
.withColumnRenamed("value", "id") - .withColumn("dep", lit("hr")); + Dataset inputDF = + spark + .createDataset(ids, Encoders.INT()) + .withColumnRenamed("value", "id") + .withColumn("dep", lit("hr")); try { inputDF.coalesce(1).writeTo(tableName).append(); } catch (NoSuchTableException e) { throw new RuntimeException(e); } - ExecutorService executorService = MoreExecutors.getExitingExecutorService( - (ThreadPoolExecutor) Executors.newFixedThreadPool(2)); + ExecutorService executorService = + MoreExecutors.getExitingExecutorService( + (ThreadPoolExecutor) Executors.newFixedThreadPool(2)); AtomicInteger barrier = new AtomicInteger(0); // delete thread - Future deleteFuture = executorService.submit(() -> { - for (int numOperations = 0; numOperations < Integer.MAX_VALUE; numOperations++) { - while (barrier.get() < numOperations * 2) { - sleep(10); - } - sql("DELETE FROM %s WHERE id = 1", tableName); - barrier.incrementAndGet(); - } - }); + Future deleteFuture = + executorService.submit( + () -> { + for (int numOperations = 0; numOperations < Integer.MAX_VALUE; numOperations++) { + while (barrier.get() < numOperations * 2) { + sleep(10); + } + sql("DELETE FROM %s WHERE id = 1", tableName); + barrier.incrementAndGet(); + } + }); // append thread - Future appendFuture = executorService.submit(() -> { - for (int numOperations = 0; numOperations < Integer.MAX_VALUE; numOperations++) { - while (barrier.get() < numOperations * 2) { - sleep(10); - } - - try { - inputDF.coalesce(1).writeTo(tableName).append(); - } catch (NoSuchTableException e) { - throw new RuntimeException(e); - } - - barrier.incrementAndGet(); - } - }); + Future appendFuture = + executorService.submit( + () -> { + for (int numOperations = 0; numOperations < Integer.MAX_VALUE; numOperations++) { + while (barrier.get() < numOperations * 2) { + sleep(10); + } + + try { + inputDF.coalesce(1).writeTo(tableName).append(); + } catch (NoSuchTableException e) { + throw new RuntimeException(e); + } + + barrier.incrementAndGet(); + } + }); try { deleteFuture.get(); @@ -647,7 +729,8 @@ public synchronized void testDeleteWithSerializableIsolation() throws Interrupte Throwable validationException = sparkException.getCause(); Assert.assertThat(validationException, CoreMatchers.instanceOf(ValidationException.class)); String errMsg = validationException.getMessage(); - Assert.assertThat(errMsg, CoreMatchers.containsString("Found conflicting files that can contain")); + Assert.assertThat( + errMsg, CoreMatchers.containsString("Found conflicting files that can contain")); } finally { appendFuture.cancel(true); } @@ -657,40 +740,48 @@ public synchronized void testDeleteWithSerializableIsolation() throws Interrupte } @Test - public synchronized void testDeleteWithSnapshotIsolation() throws InterruptedException, ExecutionException { + public synchronized void testDeleteWithSnapshotIsolation() + throws InterruptedException, ExecutionException { // cannot run tests with concurrency for Hadoop tables without atomic renames Assume.assumeFalse(catalogName.equalsIgnoreCase("testhadoop")); createAndInitUnpartitionedTable(); - sql("ALTER TABLE %s SET TBLPROPERTIES('%s' '%s')", tableName, DELETE_ISOLATION_LEVEL, "snapshot"); + sql( + "ALTER TABLE %s SET TBLPROPERTIES('%s' '%s')", + tableName, DELETE_ISOLATION_LEVEL, "snapshot"); - ExecutorService executorService = MoreExecutors.getExitingExecutorService( - (ThreadPoolExecutor) Executors.newFixedThreadPool(2)); + ExecutorService executorService = + MoreExecutors.getExitingExecutorService( + (ThreadPoolExecutor) 
Executors.newFixedThreadPool(2)); AtomicInteger barrier = new AtomicInteger(0); // delete thread - Future deleteFuture = executorService.submit(() -> { - for (int numOperations = 0; numOperations < 20; numOperations++) { - while (barrier.get() < numOperations * 2) { - sleep(10); - } - sql("DELETE FROM %s WHERE id = 1", tableName); - barrier.incrementAndGet(); - } - }); + Future deleteFuture = + executorService.submit( + () -> { + for (int numOperations = 0; numOperations < 20; numOperations++) { + while (barrier.get() < numOperations * 2) { + sleep(10); + } + sql("DELETE FROM %s WHERE id = 1", tableName); + barrier.incrementAndGet(); + } + }); // append thread - Future appendFuture = executorService.submit(() -> { - for (int numOperations = 0; numOperations < 20; numOperations++) { - while (barrier.get() < numOperations * 2) { - sleep(10); - } - sql("INSERT INTO TABLE %s VALUES (1, 'hr')", tableName); - barrier.incrementAndGet(); - } - }); + Future appendFuture = + executorService.submit( + () -> { + for (int numOperations = 0; numOperations < 20; numOperations++) { + while (barrier.get() < numOperations * 2) { + sleep(10); + } + sql("INSERT INTO TABLE %s VALUES (1, 'hr')", tableName); + barrier.incrementAndGet(); + } + }); try { deleteFuture.get(); @@ -714,7 +805,8 @@ public void testDeleteRefreshesRelationCache() throws NoSuchTableException { spark.sql("CACHE TABLE tmp"); - assertEquals("View should have correct data", + assertEquals( + "View should have correct data", ImmutableList.of(row(1, "hardware"), row(1, "hr")), sql("SELECT * FROM tmp ORDER BY id, dep")); @@ -726,11 +818,13 @@ public void testDeleteRefreshesRelationCache() throws NoSuchTableException { Snapshot currentSnapshot = table.currentSnapshot(); validateSnapshot(currentSnapshot, "overwrite", "2", "2", "2"); - assertEquals("Should have expected rows", + assertEquals( + "Should have expected rows", ImmutableList.of(row(2, "hardware"), row(3, "hr")), sql("SELECT * FROM %s ORDER BY id, dep", tableName)); - assertEquals("Should refresh the relation cache", + assertEquals( + "Should refresh the relation cache", ImmutableList.of(), sql("SELECT * FROM tmp ORDER BY id, dep")); diff --git a/spark/v3.1/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestExpireSnapshotsProcedure.java b/spark/v3.1/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestExpireSnapshotsProcedure.java index 43c3ab7399ea..419b7ed0f689 100644 --- a/spark/v3.1/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestExpireSnapshotsProcedure.java +++ b/spark/v3.1/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestExpireSnapshotsProcedure.java @@ -16,9 +16,10 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.spark.extensions; +import static org.apache.iceberg.TableProperties.GC_ENABLED; + import java.io.IOException; import java.sql.Timestamp; import java.time.Instant; @@ -37,11 +38,10 @@ import org.junit.Assert; import org.junit.Test; -import static org.apache.iceberg.TableProperties.GC_ENABLED; - public class TestExpireSnapshotsProcedure extends SparkExtensionsTestBase { - public TestExpireSnapshotsProcedure(String catalogName, String implementation, Map config) { + public TestExpireSnapshotsProcedure( + String catalogName, String implementation, Map config) { super(catalogName, implementation, config); } @@ -54,9 +54,7 @@ public void removeTables() { public void testExpireSnapshotsInEmptyTable() { sql("CREATE TABLE %s (id bigint NOT NULL, data string) USING iceberg", tableName); - List output = sql( - "CALL %s.system.expire_snapshots('%s')", - catalogName, tableIdent); + List output = sql("CALL %s.system.expire_snapshots('%s')", catalogName, tableIdent); assertEquals("Should not delete any files", ImmutableList.of(row(0L, 0L, 0L)), output); } @@ -75,17 +73,17 @@ public void testExpireSnapshotsUsingPositionalArgs() { table.refresh(); Snapshot secondSnapshot = table.currentSnapshot(); - Timestamp secondSnapshotTimestamp = Timestamp.from(Instant.ofEpochMilli(secondSnapshot.timestampMillis())); + Timestamp secondSnapshotTimestamp = + Timestamp.from(Instant.ofEpochMilli(secondSnapshot.timestampMillis())); Assert.assertEquals("Should be 2 snapshots", 2, Iterables.size(table.snapshots())); // expire without retainLast param - List output1 = sql( - "CALL %s.system.expire_snapshots('%s', TIMESTAMP '%s')", - catalogName, tableIdent, secondSnapshotTimestamp); - assertEquals("Procedure output must match", - ImmutableList.of(row(0L, 0L, 1L)), - output1); + List output1 = + sql( + "CALL %s.system.expire_snapshots('%s', TIMESTAMP '%s')", + catalogName, tableIdent, secondSnapshotTimestamp); + assertEquals("Procedure output must match", ImmutableList.of(row(0L, 0L, 1L)), output1); table.refresh(); @@ -93,7 +91,8 @@ public void testExpireSnapshotsUsingPositionalArgs() { sql("INSERT OVERWRITE %s VALUES (3, 'c')", tableName); sql("INSERT INTO TABLE %s VALUES (4, 'd')", tableName); - assertEquals("Should have expected rows", + assertEquals( + "Should have expected rows", ImmutableList.of(row(3L, "c"), row(4L, "d")), sql("SELECT * FROM %s ORDER BY id", tableName)); @@ -106,12 +105,11 @@ public void testExpireSnapshotsUsingPositionalArgs() { Assert.assertEquals("Should be 3 snapshots", 3, Iterables.size(table.snapshots())); // expire with retainLast param - List output = sql( - "CALL %s.system.expire_snapshots('%s', TIMESTAMP '%s', 2)", - catalogName, tableIdent, currentTimestamp); - assertEquals("Procedure output must match", - ImmutableList.of(row(2L, 2L, 1L)), - output); + List output = + sql( + "CALL %s.system.expire_snapshots('%s', TIMESTAMP '%s', 2)", + catalogName, tableIdent, currentTimestamp); + assertEquals("Procedure output must match", ImmutableList.of(row(2L, 2L, 1L)), output); } @Test @@ -129,15 +127,14 @@ public void testExpireSnapshotUsingNamedArgs() { Timestamp currentTimestamp = Timestamp.from(Instant.ofEpochMilli(System.currentTimeMillis())); - List output = sql( - "CALL %s.system.expire_snapshots(" + - "older_than => TIMESTAMP '%s'," + - "table => '%s'," + - "retain_last => 1)", - catalogName, currentTimestamp, tableIdent); - assertEquals("Procedure output must match", - ImmutableList.of(row(0L, 0L, 1L)), - output); + List output = + sql( + "CALL 
%s.system.expire_snapshots(" + + "older_than => TIMESTAMP '%s'," + + "table => '%s'," + + "retain_last => 1)", + catalogName, currentTimestamp, tableIdent); + assertEquals("Procedure output must match", ImmutableList.of(row(0L, 0L, 1L)), output); } @Test @@ -146,31 +143,43 @@ public void testExpireSnapshotsGCDisabled() { sql("ALTER TABLE %s SET TBLPROPERTIES ('%s' 'false')", tableName, GC_ENABLED); - AssertHelpers.assertThrows("Should reject call", - ValidationException.class, "Cannot expire snapshots: GC is disabled", + AssertHelpers.assertThrows( + "Should reject call", + ValidationException.class, + "Cannot expire snapshots: GC is disabled", () -> sql("CALL %s.system.expire_snapshots('%s')", catalogName, tableIdent)); } @Test public void testInvalidExpireSnapshotsCases() { - AssertHelpers.assertThrows("Should not allow mixed args", - AnalysisException.class, "Named and positional arguments cannot be mixed", + AssertHelpers.assertThrows( + "Should not allow mixed args", + AnalysisException.class, + "Named and positional arguments cannot be mixed", () -> sql("CALL %s.system.expire_snapshots('n', table => 't')", catalogName)); - AssertHelpers.assertThrows("Should not resolve procedures in arbitrary namespaces", - NoSuchProcedureException.class, "not found", + AssertHelpers.assertThrows( + "Should not resolve procedures in arbitrary namespaces", + NoSuchProcedureException.class, + "not found", () -> sql("CALL %s.custom.expire_snapshots('n', 't')", catalogName)); - AssertHelpers.assertThrows("Should reject calls without all required args", - AnalysisException.class, "Missing required parameters", + AssertHelpers.assertThrows( + "Should reject calls without all required args", + AnalysisException.class, + "Missing required parameters", () -> sql("CALL %s.system.expire_snapshots()", catalogName)); - AssertHelpers.assertThrows("Should reject calls with invalid arg types", - AnalysisException.class, "Wrong arg type", + AssertHelpers.assertThrows( + "Should reject calls with invalid arg types", + AnalysisException.class, + "Wrong arg type", () -> sql("CALL %s.system.expire_snapshots('n', 2.2)", catalogName)); - AssertHelpers.assertThrows("Should reject calls with empty table identifier", - IllegalArgumentException.class, "Cannot handle an empty identifier", + AssertHelpers.assertThrows( + "Should reject calls with empty table identifier", + IllegalArgumentException.class, + "Cannot handle an empty identifier", () -> sql("CALL %s.system.expire_snapshots('')", catalogName)); } @@ -179,13 +188,24 @@ public void testResolvingTableInAnotherCatalog() throws IOException { String anotherCatalog = "another_" + catalogName; spark.conf().set("spark.sql.catalog." + anotherCatalog, SparkCatalog.class.getName()); spark.conf().set("spark.sql.catalog." + anotherCatalog + ".type", "hadoop"); - spark.conf().set("spark.sql.catalog." + anotherCatalog + ".warehouse", "file:" + temp.newFolder().toString()); - - sql("CREATE TABLE %s.%s (id bigint NOT NULL, data string) USING iceberg", anotherCatalog, tableIdent); - - AssertHelpers.assertThrows("Should reject calls for a table in another catalog", - IllegalArgumentException.class, "Cannot run procedure in catalog", - () -> sql("CALL %s.system.expire_snapshots('%s')", catalogName, anotherCatalog + "." + tableName)); + spark + .conf() + .set( + "spark.sql.catalog." 
+ anotherCatalog + ".warehouse", + "file:" + temp.newFolder().toString()); + + sql( + "CREATE TABLE %s.%s (id bigint NOT NULL, data string) USING iceberg", + anotherCatalog, tableIdent); + + AssertHelpers.assertThrows( + "Should reject calls for a table in another catalog", + IllegalArgumentException.class, + "Cannot run procedure in catalog", + () -> + sql( + "CALL %s.system.expire_snapshots('%s')", + catalogName, anotherCatalog + "." + tableName)); } @Test @@ -198,31 +218,41 @@ public void testConcurrentExpireSnapshots() { sql("INSERT INTO TABLE %s VALUES (4, 'd')", tableName); Timestamp currentTimestamp = Timestamp.from(Instant.ofEpochMilli(System.currentTimeMillis())); - List output = sql( - "CALL %s.system.expire_snapshots(" + - "older_than => TIMESTAMP '%s'," + - "table => '%s'," + - "max_concurrent_deletes => %s," + - "retain_last => 1)", - catalogName, currentTimestamp, tableIdent, 4); - assertEquals("Expiring snapshots concurrently should succeed", ImmutableList.of(row(0L, 0L, 3L)), output); + List output = + sql( + "CALL %s.system.expire_snapshots(" + + "older_than => TIMESTAMP '%s'," + + "table => '%s'," + + "max_concurrent_deletes => %s," + + "retain_last => 1)", + catalogName, currentTimestamp, tableIdent, 4); + assertEquals( + "Expiring snapshots concurrently should succeed", + ImmutableList.of(row(0L, 0L, 3L)), + output); } @Test public void testConcurrentExpireSnapshotsWithInvalidInput() { sql("CREATE TABLE %s (id bigint NOT NULL, data string) USING iceberg", tableName); - AssertHelpers.assertThrows("Should throw an error when max_concurrent_deletes = 0", - IllegalArgumentException.class, "max_concurrent_deletes should have value > 0", - () -> sql("CALL %s.system.expire_snapshots(table => '%s', max_concurrent_deletes => %s)", - catalogName, tableIdent, 0)); - - AssertHelpers.assertThrows("Should throw an error when max_concurrent_deletes < 0 ", - IllegalArgumentException.class, "max_concurrent_deletes should have value > 0", - () -> sql( - "CALL %s.system.expire_snapshots(table => '%s', max_concurrent_deletes => %s)", - catalogName, tableIdent, -1)); - + AssertHelpers.assertThrows( + "Should throw an error when max_concurrent_deletes = 0", + IllegalArgumentException.class, + "max_concurrent_deletes should have value > 0", + () -> + sql( + "CALL %s.system.expire_snapshots(table => '%s', max_concurrent_deletes => %s)", + catalogName, tableIdent, 0)); + + AssertHelpers.assertThrows( + "Should throw an error when max_concurrent_deletes < 0 ", + IllegalArgumentException.class, + "max_concurrent_deletes should have value > 0", + () -> + sql( + "CALL %s.system.expire_snapshots(table => '%s', max_concurrent_deletes => %s)", + catalogName, tableIdent, -1)); } @Test @@ -240,19 +270,21 @@ public void testExpireSnapshotWithStreamResultsEnabled() { Timestamp currentTimestamp = Timestamp.from(Instant.ofEpochMilli(System.currentTimeMillis())); - List output = sql( - "CALL %s.system.expire_snapshots(" + - "older_than => TIMESTAMP '%s'," + - "table => '%s'," + - "retain_last => 1, " + - "stream_results => true)", - catalogName, currentTimestamp, tableIdent); + List output = + sql( + "CALL %s.system.expire_snapshots(" + + "older_than => TIMESTAMP '%s'," + + "table => '%s'," + + "retain_last => 1, " + + "stream_results => true)", + catalogName, currentTimestamp, tableIdent); assertEquals("Procedure output must match", ImmutableList.of(row(0L, 0L, 1L)), output); } @Test public void testExpireSnapshotsProcedureWorksWithSqlComments() { - // Ensure that systems such as dbt, that add leading 
comments into the generated SQL commands, will + // Ensure that systems such as dbt, that add leading comments into the generated SQL commands, + // will // work with Iceberg-specific DDL sql("CREATE TABLE %s (id bigint NOT NULL, data string) USING iceberg", tableName); @@ -268,18 +300,15 @@ public void testExpireSnapshotsProcedureWorksWithSqlComments() { Timestamp currentTimestamp = Timestamp.from(Instant.ofEpochMilli(System.currentTimeMillis())); String callStatement = - "/* CALL statement is used to expire snapshots */\n" + - "-- And we have single line comments as well \n" + - "/* And comments that span *multiple* \n" + - " lines */ CALL /* this is the actual CALL */ %s.system.expire_snapshots(" + - " older_than => TIMESTAMP '%s'," + - " table => '%s'," + - " retain_last => 1)"; - List output = sql( - callStatement, catalogName, currentTimestamp, tableIdent); - assertEquals("Procedure output must match", - ImmutableList.of(row(0L, 0L, 1L)), - output); + "/* CALL statement is used to expire snapshots */\n" + + "-- And we have single line comments as well \n" + + "/* And comments that span *multiple* \n" + + " lines */ CALL /* this is the actual CALL */ %s.system.expire_snapshots(" + + " older_than => TIMESTAMP '%s'," + + " table => '%s'," + + " retain_last => 1)"; + List output = sql(callStatement, catalogName, currentTimestamp, tableIdent); + assertEquals("Procedure output must match", ImmutableList.of(row(0L, 0L, 1L)), output); table.refresh(); diff --git a/spark/v3.1/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestIcebergExpressions.java b/spark/v3.1/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestIcebergExpressions.java index ce88814ce937..8d2e10ea17eb 100644 --- a/spark/v3.1/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestIcebergExpressions.java +++ b/spark/v3.1/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestIcebergExpressions.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.spark.extensions; import java.math.BigDecimal; @@ -31,7 +30,8 @@ public class TestIcebergExpressions extends SparkExtensionsTestBase { - public TestIcebergExpressions(String catalogName, String implementation, Map config) { + public TestIcebergExpressions( + String catalogName, String implementation, Map config) { super(catalogName, implementation, config); } @@ -44,26 +44,30 @@ public void removeTables() { @Test public void testTruncateExpressions() { - sql("CREATE TABLE %s ( " + - " int_c INT, long_c LONG, dec_c DECIMAL(4, 2), str_c STRING, binary_c BINARY " + - ") USING iceberg", tableName); + sql( + "CREATE TABLE %s ( " + + " int_c INT, long_c LONG, dec_c DECIMAL(4, 2), str_c STRING, binary_c BINARY " + + ") USING iceberg", + tableName); - sql("CREATE TEMPORARY VIEW emp " + - "AS SELECT * FROM VALUES (101, 10001, 10.65, '101-Employee', CAST('1234' AS BINARY)) " + - "AS EMP(int_c, long_c, dec_c, str_c, binary_c)"); + sql( + "CREATE TEMPORARY VIEW emp " + + "AS SELECT * FROM VALUES (101, 10001, 10.65, '101-Employee', CAST('1234' AS BINARY)) " + + "AS EMP(int_c, long_c, dec_c, str_c, binary_c)"); sql("INSERT INTO %s SELECT * FROM emp", tableName); Dataset df = spark.sql("SELECT * FROM " + tableName); df.select( - new Column(new IcebergTruncateTransform(df.col("int_c").expr(), 2)).as("int_c"), - new Column(new IcebergTruncateTransform(df.col("long_c").expr(), 2)).as("long_c"), - new Column(new IcebergTruncateTransform(df.col("dec_c").expr(), 50)).as("dec_c"), - new Column(new IcebergTruncateTransform(df.col("str_c").expr(), 2)).as("str_c"), - new Column(new IcebergTruncateTransform(df.col("binary_c").expr(), 2)).as("binary_c") - ).createOrReplaceTempView("v"); + new Column(new IcebergTruncateTransform(df.col("int_c").expr(), 2)).as("int_c"), + new Column(new IcebergTruncateTransform(df.col("long_c").expr(), 2)).as("long_c"), + new Column(new IcebergTruncateTransform(df.col("dec_c").expr(), 50)).as("dec_c"), + new Column(new IcebergTruncateTransform(df.col("str_c").expr(), 2)).as("str_c"), + new Column(new IcebergTruncateTransform(df.col("binary_c").expr(), 2)).as("binary_c")) + .createOrReplaceTempView("v"); - assertEquals("Should have expected rows", + assertEquals( + "Should have expected rows", ImmutableList.of(row(100, 10000L, new BigDecimal("10.50"), "10", "12")), sql("SELECT int_c, long_c, dec_c, str_c, CAST(binary_c AS STRING) FROM v")); } diff --git a/spark/v3.1/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestMerge.java b/spark/v3.1/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestMerge.java index 04fefb1f5436..a185af09e3a5 100644 --- a/spark/v3.1/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestMerge.java +++ b/spark/v3.1/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestMerge.java @@ -16,9 +16,15 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.spark.extensions; +import static org.apache.iceberg.TableProperties.MERGE_CARDINALITY_CHECK_ENABLED; +import static org.apache.iceberg.TableProperties.MERGE_ISOLATION_LEVEL; +import static org.apache.iceberg.TableProperties.PARQUET_ROW_GROUP_SIZE_BYTES; +import static org.apache.iceberg.TableProperties.SPLIT_SIZE; +import static org.apache.iceberg.TableProperties.WRITE_DISTRIBUTION_MODE; +import static org.apache.spark.sql.functions.lit; + import java.util.Arrays; import java.util.Collections; import java.util.List; @@ -51,17 +57,15 @@ import org.junit.BeforeClass; import org.junit.Test; -import static org.apache.iceberg.TableProperties.MERGE_CARDINALITY_CHECK_ENABLED; -import static org.apache.iceberg.TableProperties.MERGE_ISOLATION_LEVEL; -import static org.apache.iceberg.TableProperties.PARQUET_ROW_GROUP_SIZE_BYTES; -import static org.apache.iceberg.TableProperties.SPLIT_SIZE; -import static org.apache.iceberg.TableProperties.WRITE_DISTRIBUTION_MODE; -import static org.apache.spark.sql.functions.lit; - public abstract class TestMerge extends SparkRowLevelOperationsTestBase { - public TestMerge(String catalogName, String implementation, Map config, - String fileFormat, boolean vectorized, String distributionMode) { + public TestMerge( + String catalogName, + String implementation, + Map config, + String fileFormat, + boolean vectorized, + String distributionMode) { super(catalogName, implementation, config, fileFormat, vectorized, distributionMode); } @@ -82,318 +86,395 @@ public void removeTables() { public void testMergeIntoEmptyTargetInsertAllNonMatchingRows() { createAndInitTable("id INT, dep STRING"); - createOrReplaceView("source", "id INT, dep STRING", - "{ \"id\": 1, \"dep\": \"emp-id-1\" }\n" + - "{ \"id\": 2, \"dep\": \"emp-id-2\" }\n" + - "{ \"id\": 3, \"dep\": \"emp-id-3\" }"); - - sql("MERGE INTO %s AS t USING source AS s " + - "ON t.id == s.id " + - "WHEN NOT MATCHED THEN " + - " INSERT *", tableName); - - ImmutableList expectedRows = ImmutableList.of( - row(1, "emp-id-1"), // new - row(2, "emp-id-2"), // new - row(3, "emp-id-3") // new - ); - assertEquals("Should have expected rows", expectedRows, sql("SELECT * FROM %s ORDER BY id", tableName)); + createOrReplaceView( + "source", + "id INT, dep STRING", + "{ \"id\": 1, \"dep\": \"emp-id-1\" }\n" + + "{ \"id\": 2, \"dep\": \"emp-id-2\" }\n" + + "{ \"id\": 3, \"dep\": \"emp-id-3\" }"); + + sql( + "MERGE INTO %s AS t USING source AS s " + + "ON t.id == s.id " + + "WHEN NOT MATCHED THEN " + + " INSERT *", + tableName); + + ImmutableList expectedRows = + ImmutableList.of( + row(1, "emp-id-1"), // new + row(2, "emp-id-2"), // new + row(3, "emp-id-3") // new + ); + assertEquals( + "Should have expected rows", expectedRows, sql("SELECT * FROM %s ORDER BY id", tableName)); } @Test public void testMergeIntoEmptyTargetInsertOnlyMatchingRows() { createAndInitTable("id INT, dep STRING"); - createOrReplaceView("source", "id INT, dep STRING", - "{ \"id\": 1, \"dep\": \"emp-id-1\" }\n" + - "{ \"id\": 2, \"dep\": \"emp-id-2\" }\n" + - "{ \"id\": 3, \"dep\": \"emp-id-3\" }"); - - sql("MERGE INTO %s AS t USING source AS s " + - "ON t.id == s.id " + - "WHEN NOT MATCHED AND (s.id >=2) THEN " + - " INSERT *", tableName); - - ImmutableList expectedRows = ImmutableList.of( - row(2, "emp-id-2"), // new - row(3, "emp-id-3") // new - ); - assertEquals("Should have expected rows", expectedRows, sql("SELECT * FROM %s ORDER BY id", tableName)); + createOrReplaceView( + "source", + "id INT, dep STRING", + "{ \"id\": 1, 
\"dep\": \"emp-id-1\" }\n" + + "{ \"id\": 2, \"dep\": \"emp-id-2\" }\n" + + "{ \"id\": 3, \"dep\": \"emp-id-3\" }"); + + sql( + "MERGE INTO %s AS t USING source AS s " + + "ON t.id == s.id " + + "WHEN NOT MATCHED AND (s.id >=2) THEN " + + " INSERT *", + tableName); + + ImmutableList expectedRows = + ImmutableList.of( + row(2, "emp-id-2"), // new + row(3, "emp-id-3") // new + ); + assertEquals( + "Should have expected rows", expectedRows, sql("SELECT * FROM %s ORDER BY id", tableName)); } @Test public void testMergeWithOnlyUpdateClause() { - createAndInitTable("id INT, dep STRING", - "{ \"id\": 1, \"dep\": \"emp-id-one\" }\n" + - "{ \"id\": 6, \"dep\": \"emp-id-six\" }"); - - createOrReplaceView("source", "id INT, dep STRING", - "{ \"id\": 2, \"dep\": \"emp-id-2\" }\n" + - "{ \"id\": 1, \"dep\": \"emp-id-1\" }\n" + - "{ \"id\": 6, \"dep\": \"emp-id-6\" }"); - - sql("MERGE INTO %s AS t USING source AS s " + - "ON t.id == s.id " + - "WHEN MATCHED AND t.id = 1 THEN " + - " UPDATE SET *", tableName); - - ImmutableList expectedRows = ImmutableList.of( - row(1, "emp-id-1"), // updated - row(6, "emp-id-six") // kept - ); - assertEquals("Should have expected rows", expectedRows, sql("SELECT * FROM %s ORDER BY id", tableName)); + createAndInitTable( + "id INT, dep STRING", + "{ \"id\": 1, \"dep\": \"emp-id-one\" }\n" + "{ \"id\": 6, \"dep\": \"emp-id-six\" }"); + + createOrReplaceView( + "source", + "id INT, dep STRING", + "{ \"id\": 2, \"dep\": \"emp-id-2\" }\n" + + "{ \"id\": 1, \"dep\": \"emp-id-1\" }\n" + + "{ \"id\": 6, \"dep\": \"emp-id-6\" }"); + + sql( + "MERGE INTO %s AS t USING source AS s " + + "ON t.id == s.id " + + "WHEN MATCHED AND t.id = 1 THEN " + + " UPDATE SET *", + tableName); + + ImmutableList expectedRows = + ImmutableList.of( + row(1, "emp-id-1"), // updated + row(6, "emp-id-six") // kept + ); + assertEquals( + "Should have expected rows", expectedRows, sql("SELECT * FROM %s ORDER BY id", tableName)); } @Test public void testMergeWithOnlyDeleteClause() { - createAndInitTable("id INT, dep STRING", - "{ \"id\": 1, \"dep\": \"emp-id-one\" }\n" + - "{ \"id\": 6, \"dep\": \"emp-id-6\" }"); - - createOrReplaceView("source", "id INT, dep STRING", - "{ \"id\": 2, \"dep\": \"emp-id-2\" }\n" + - "{ \"id\": 1, \"dep\": \"emp-id-1\" }\n" + - "{ \"id\": 6, \"dep\": \"emp-id-6\" }"); - - sql("MERGE INTO %s AS t USING source AS s " + - "ON t.id == s.id " + - "WHEN MATCHED AND t.id = 6 THEN " + - " DELETE", tableName); - - ImmutableList expectedRows = ImmutableList.of( - row(1, "emp-id-one") // kept - ); - assertEquals("Should have expected rows", expectedRows, sql("SELECT * FROM %s ORDER BY id", tableName)); + createAndInitTable( + "id INT, dep STRING", + "{ \"id\": 1, \"dep\": \"emp-id-one\" }\n" + "{ \"id\": 6, \"dep\": \"emp-id-6\" }"); + + createOrReplaceView( + "source", + "id INT, dep STRING", + "{ \"id\": 2, \"dep\": \"emp-id-2\" }\n" + + "{ \"id\": 1, \"dep\": \"emp-id-1\" }\n" + + "{ \"id\": 6, \"dep\": \"emp-id-6\" }"); + + sql( + "MERGE INTO %s AS t USING source AS s " + + "ON t.id == s.id " + + "WHEN MATCHED AND t.id = 6 THEN " + + " DELETE", + tableName); + + ImmutableList expectedRows = + ImmutableList.of( + row(1, "emp-id-one") // kept + ); + assertEquals( + "Should have expected rows", expectedRows, sql("SELECT * FROM %s ORDER BY id", tableName)); } @Test public void testMergeWithAllCauses() { - createAndInitTable("id INT, dep STRING", - "{ \"id\": 1, \"dep\": \"emp-id-one\" }\n" + - "{ \"id\": 6, \"dep\": \"emp-id-6\" }"); - - createOrReplaceView("source", "id INT, dep 
STRING", - "{ \"id\": 2, \"dep\": \"emp-id-2\" }\n" + - "{ \"id\": 1, \"dep\": \"emp-id-1\" }\n" + - "{ \"id\": 6, \"dep\": \"emp-id-6\" }"); - - sql("MERGE INTO %s AS t USING source AS s " + - "ON t.id == s.id " + - "WHEN MATCHED AND t.id = 1 THEN " + - " UPDATE SET * " + - "WHEN MATCHED AND t.id = 6 THEN " + - " DELETE " + - "WHEN NOT MATCHED AND s.id = 2 THEN " + - " INSERT *", tableName); - - ImmutableList expectedRows = ImmutableList.of( - row(1, "emp-id-1"), // updated - row(2, "emp-id-2") // new - ); - assertEquals("Should have expected rows", expectedRows, sql("SELECT * FROM %s ORDER BY id", tableName)); + createAndInitTable( + "id INT, dep STRING", + "{ \"id\": 1, \"dep\": \"emp-id-one\" }\n" + "{ \"id\": 6, \"dep\": \"emp-id-6\" }"); + + createOrReplaceView( + "source", + "id INT, dep STRING", + "{ \"id\": 2, \"dep\": \"emp-id-2\" }\n" + + "{ \"id\": 1, \"dep\": \"emp-id-1\" }\n" + + "{ \"id\": 6, \"dep\": \"emp-id-6\" }"); + + sql( + "MERGE INTO %s AS t USING source AS s " + + "ON t.id == s.id " + + "WHEN MATCHED AND t.id = 1 THEN " + + " UPDATE SET * " + + "WHEN MATCHED AND t.id = 6 THEN " + + " DELETE " + + "WHEN NOT MATCHED AND s.id = 2 THEN " + + " INSERT *", + tableName); + + ImmutableList expectedRows = + ImmutableList.of( + row(1, "emp-id-1"), // updated + row(2, "emp-id-2") // new + ); + assertEquals( + "Should have expected rows", expectedRows, sql("SELECT * FROM %s ORDER BY id", tableName)); } @Test public void testMergeWithAllCausesWithExplicitColumnSpecification() { - createAndInitTable("id INT, dep STRING", - "{ \"id\": 1, \"dep\": \"emp-id-one\" }\n" + - "{ \"id\": 6, \"dep\": \"emp-id-6\" }"); - - createOrReplaceView("source", "id INT, dep STRING", - "{ \"id\": 2, \"dep\": \"emp-id-2\" }\n" + - "{ \"id\": 1, \"dep\": \"emp-id-1\" }\n" + - "{ \"id\": 6, \"dep\": \"emp-id-6\" }"); - - sql("MERGE INTO %s AS t USING source AS s " + - "ON t.id == s.id " + - "WHEN MATCHED AND t.id = 1 THEN " + - " UPDATE SET t.id = s.id, t.dep = s.dep " + - "WHEN MATCHED AND t.id = 6 THEN " + - " DELETE " + - "WHEN NOT MATCHED AND s.id = 2 THEN " + - " INSERT (t.id, t.dep) VALUES (s.id, s.dep)", tableName); - - ImmutableList expectedRows = ImmutableList.of( - row(1, "emp-id-1"), // updated - row(2, "emp-id-2") // new - ); - assertEquals("Should have expected rows", expectedRows, sql("SELECT * FROM %s ORDER BY id", tableName)); + createAndInitTable( + "id INT, dep STRING", + "{ \"id\": 1, \"dep\": \"emp-id-one\" }\n" + "{ \"id\": 6, \"dep\": \"emp-id-6\" }"); + + createOrReplaceView( + "source", + "id INT, dep STRING", + "{ \"id\": 2, \"dep\": \"emp-id-2\" }\n" + + "{ \"id\": 1, \"dep\": \"emp-id-1\" }\n" + + "{ \"id\": 6, \"dep\": \"emp-id-6\" }"); + + sql( + "MERGE INTO %s AS t USING source AS s " + + "ON t.id == s.id " + + "WHEN MATCHED AND t.id = 1 THEN " + + " UPDATE SET t.id = s.id, t.dep = s.dep " + + "WHEN MATCHED AND t.id = 6 THEN " + + " DELETE " + + "WHEN NOT MATCHED AND s.id = 2 THEN " + + " INSERT (t.id, t.dep) VALUES (s.id, s.dep)", + tableName); + + ImmutableList expectedRows = + ImmutableList.of( + row(1, "emp-id-1"), // updated + row(2, "emp-id-2") // new + ); + assertEquals( + "Should have expected rows", expectedRows, sql("SELECT * FROM %s ORDER BY id", tableName)); } @Test public void testMergeWithSourceCTE() { - createAndInitTable("id INT, dep STRING", - "{ \"id\": 2, \"dep\": \"emp-id-two\" }\n" + - "{ \"id\": 6, \"dep\": \"emp-id-6\" }"); - - createOrReplaceView("source", "id INT, dep STRING", - "{ \"id\": 2, \"dep\": \"emp-id-3\" }\n" + - "{ \"id\": 1, \"dep\": 
\"emp-id-2\" }\n" + - "{ \"id\": 5, \"dep\": \"emp-id-6\" }"); - - sql("WITH cte1 AS (SELECT id + 1 AS id, dep FROM source) " + - "MERGE INTO %s AS t USING cte1 AS s " + - "ON t.id == s.id " + - "WHEN MATCHED AND t.id = 2 THEN " + - " UPDATE SET * " + - "WHEN MATCHED AND t.id = 6 THEN " + - " DELETE " + - "WHEN NOT MATCHED AND s.id = 3 THEN " + - " INSERT *", tableName); - - ImmutableList expectedRows = ImmutableList.of( - row(2, "emp-id-2"), // updated - row(3, "emp-id-3") // new - ); - assertEquals("Should have expected rows", expectedRows, sql("SELECT * FROM %s ORDER BY id", tableName)); + createAndInitTable( + "id INT, dep STRING", + "{ \"id\": 2, \"dep\": \"emp-id-two\" }\n" + "{ \"id\": 6, \"dep\": \"emp-id-6\" }"); + + createOrReplaceView( + "source", + "id INT, dep STRING", + "{ \"id\": 2, \"dep\": \"emp-id-3\" }\n" + + "{ \"id\": 1, \"dep\": \"emp-id-2\" }\n" + + "{ \"id\": 5, \"dep\": \"emp-id-6\" }"); + + sql( + "WITH cte1 AS (SELECT id + 1 AS id, dep FROM source) " + + "MERGE INTO %s AS t USING cte1 AS s " + + "ON t.id == s.id " + + "WHEN MATCHED AND t.id = 2 THEN " + + " UPDATE SET * " + + "WHEN MATCHED AND t.id = 6 THEN " + + " DELETE " + + "WHEN NOT MATCHED AND s.id = 3 THEN " + + " INSERT *", + tableName); + + ImmutableList expectedRows = + ImmutableList.of( + row(2, "emp-id-2"), // updated + row(3, "emp-id-3") // new + ); + assertEquals( + "Should have expected rows", expectedRows, sql("SELECT * FROM %s ORDER BY id", tableName)); } @Test public void testMergeWithSourceFromSetOps() { - createAndInitTable("id INT, dep STRING", - "{ \"id\": 1, \"dep\": \"emp-id-one\" }\n" + - "{ \"id\": 6, \"dep\": \"emp-id-6\" }"); + createAndInitTable( + "id INT, dep STRING", + "{ \"id\": 1, \"dep\": \"emp-id-one\" }\n" + "{ \"id\": 6, \"dep\": \"emp-id-6\" }"); - createOrReplaceView("source", "id INT, dep STRING", - "{ \"id\": 2, \"dep\": \"emp-id-2\" }\n" + - "{ \"id\": 1, \"dep\": \"emp-id-1\" }\n" + - "{ \"id\": 6, \"dep\": \"emp-id-6\" }"); + createOrReplaceView( + "source", + "id INT, dep STRING", + "{ \"id\": 2, \"dep\": \"emp-id-2\" }\n" + + "{ \"id\": 1, \"dep\": \"emp-id-1\" }\n" + + "{ \"id\": 6, \"dep\": \"emp-id-6\" }"); String derivedSource = - "SELECT * FROM source WHERE id = 2 " + - "UNION ALL " + - "SELECT * FROM source WHERE id = 1 OR id = 6"; - - sql("MERGE INTO %s AS t USING (%s) AS s " + - "ON t.id == s.id " + - "WHEN MATCHED AND t.id = 1 THEN " + - " UPDATE SET * " + - "WHEN MATCHED AND t.id = 6 THEN " + - " DELETE " + - "WHEN NOT MATCHED AND s.id = 2 THEN " + - " INSERT *", tableName, derivedSource); - - ImmutableList expectedRows = ImmutableList.of( - row(1, "emp-id-1"), // updated - row(2, "emp-id-2") // new - ); - assertEquals("Should have expected rows", expectedRows, sql("SELECT * FROM %s ORDER BY id", tableName)); + "SELECT * FROM source WHERE id = 2 " + + "UNION ALL " + + "SELECT * FROM source WHERE id = 1 OR id = 6"; + + sql( + "MERGE INTO %s AS t USING (%s) AS s " + + "ON t.id == s.id " + + "WHEN MATCHED AND t.id = 1 THEN " + + " UPDATE SET * " + + "WHEN MATCHED AND t.id = 6 THEN " + + " DELETE " + + "WHEN NOT MATCHED AND s.id = 2 THEN " + + " INSERT *", + tableName, derivedSource); + + ImmutableList expectedRows = + ImmutableList.of( + row(1, "emp-id-1"), // updated + row(2, "emp-id-2") // new + ); + assertEquals( + "Should have expected rows", expectedRows, sql("SELECT * FROM %s ORDER BY id", tableName)); } @Test public void testMergeWithMultipleUpdatesForTargetRow() { - createAndInitTable("id INT, dep STRING", - "{ \"id\": 1, \"dep\": \"emp-id-one\" }\n" + 
- "{ \"id\": 6, \"dep\": \"emp-id-6\" }"); + createAndInitTable( + "id INT, dep STRING", + "{ \"id\": 1, \"dep\": \"emp-id-one\" }\n" + "{ \"id\": 6, \"dep\": \"emp-id-6\" }"); - createOrReplaceView("source", "id INT, dep STRING", - "{ \"id\": 1, \"dep\": \"emp-id-1\" }\n" + - "{ \"id\": 1, \"dep\": \"emp-id-1\" }\n" + - "{ \"id\": 2, \"dep\": \"emp-id-2\" }\n" + - "{ \"id\": 6, \"dep\": \"emp-id-6\" }"); + createOrReplaceView( + "source", + "id INT, dep STRING", + "{ \"id\": 1, \"dep\": \"emp-id-1\" }\n" + + "{ \"id\": 1, \"dep\": \"emp-id-1\" }\n" + + "{ \"id\": 2, \"dep\": \"emp-id-2\" }\n" + + "{ \"id\": 6, \"dep\": \"emp-id-6\" }"); String errorMsg = "a single row from the target table with multiple rows of the source table"; - AssertHelpers.assertThrows("Should complain non iceberg target table", - SparkException.class, errorMsg, + AssertHelpers.assertThrows( + "Should complain non iceberg target table", + SparkException.class, + errorMsg, () -> { - sql("MERGE INTO %s AS t USING source AS s " + - "ON t.id == s.id " + - "WHEN MATCHED AND t.id = 1 THEN " + - " UPDATE SET * " + - "WHEN MATCHED AND t.id = 6 THEN " + - " DELETE " + - "WHEN NOT MATCHED AND s.id = 2 THEN " + - " INSERT *", tableName); + sql( + "MERGE INTO %s AS t USING source AS s " + + "ON t.id == s.id " + + "WHEN MATCHED AND t.id = 1 THEN " + + " UPDATE SET * " + + "WHEN MATCHED AND t.id = 6 THEN " + + " DELETE " + + "WHEN NOT MATCHED AND s.id = 2 THEN " + + " INSERT *", + tableName); }); - assertEquals("Target should be unchanged", + assertEquals( + "Target should be unchanged", ImmutableList.of(row(1, "emp-id-one"), row(6, "emp-id-6")), sql("SELECT * FROM %s ORDER BY id ASC NULLS LAST", tableName)); } @Test public void testMergeWithDisabledCardinalityCheck() { - createAndInitTable("id INT, dep STRING", - "{ \"id\": 1, \"dep\": \"emp-id-one\" }\n" + - "{ \"id\": 6, \"dep\": \"emp-id-6\" }"); + createAndInitTable( + "id INT, dep STRING", + "{ \"id\": 1, \"dep\": \"emp-id-one\" }\n" + "{ \"id\": 6, \"dep\": \"emp-id-6\" }"); - createOrReplaceView("source", "id INT, dep STRING", - "{ \"id\": 1, \"dep\": \"emp-id-1\" }\n" + - "{ \"id\": 1, \"dep\": \"emp-id-1\" }\n" + - "{ \"id\": 2, \"dep\": \"emp-id-2\" }\n" + - "{ \"id\": 6, \"dep\": \"emp-id-6\" }"); + createOrReplaceView( + "source", + "id INT, dep STRING", + "{ \"id\": 1, \"dep\": \"emp-id-1\" }\n" + + "{ \"id\": 1, \"dep\": \"emp-id-1\" }\n" + + "{ \"id\": 2, \"dep\": \"emp-id-2\" }\n" + + "{ \"id\": 6, \"dep\": \"emp-id-6\" }"); try { // disable the cardinality check - sql("ALTER TABLE %s SET TBLPROPERTIES('%s' '%b')", tableName, MERGE_CARDINALITY_CHECK_ENABLED, false); - - sql("MERGE INTO %s AS t USING source AS s " + - "ON t.id == s.id " + - "WHEN MATCHED AND t.id = 1 THEN " + - " UPDATE SET * " + - "WHEN MATCHED AND t.id = 6 THEN " + - " DELETE " + - "WHEN NOT MATCHED AND s.id = 2 THEN " + - " INSERT *", tableName); + sql( + "ALTER TABLE %s SET TBLPROPERTIES('%s' '%b')", + tableName, MERGE_CARDINALITY_CHECK_ENABLED, false); + + sql( + "MERGE INTO %s AS t USING source AS s " + + "ON t.id == s.id " + + "WHEN MATCHED AND t.id = 1 THEN " + + " UPDATE SET * " + + "WHEN MATCHED AND t.id = 6 THEN " + + " DELETE " + + "WHEN NOT MATCHED AND s.id = 2 THEN " + + " INSERT *", + tableName); } finally { - sql("ALTER TABLE %s SET TBLPROPERTIES('%s' '%b')", tableName, MERGE_CARDINALITY_CHECK_ENABLED, true); + sql( + "ALTER TABLE %s SET TBLPROPERTIES('%s' '%b')", + tableName, MERGE_CARDINALITY_CHECK_ENABLED, true); } - assertEquals("Should have expected rows", + assertEquals( + 
"Should have expected rows", ImmutableList.of(row(1, "emp-id-1"), row(1, "emp-id-1"), row(2, "emp-id-2")), sql("SELECT * FROM %s ORDER BY id ASC NULLS LAST", tableName)); } @Test public void testMergeWithUnconditionalDelete() { - createAndInitTable("id INT, dep STRING", - "{ \"id\": 1, \"dep\": \"emp-id-one\" }\n" + - "{ \"id\": 6, \"dep\": \"emp-id-6\" }"); - - createOrReplaceView("source", "id INT, dep STRING", - "{ \"id\": 1, \"dep\": \"emp-id-1\" }\n" + - "{ \"id\": 1, \"dep\": \"emp-id-1\" }\n" + - "{ \"id\": 2, \"dep\": \"emp-id-2\" }\n" + - "{ \"id\": 6, \"dep\": \"emp-id-6\" }"); - - sql("MERGE INTO %s AS t USING source AS s " + - "ON t.id == s.id " + - "WHEN MATCHED THEN " + - " DELETE " + - "WHEN NOT MATCHED AND s.id = 2 THEN " + - " INSERT *", tableName); - - ImmutableList expectedRows = ImmutableList.of( - row(2, "emp-id-2") // new - ); - assertEquals("Should have expected rows", expectedRows, sql("SELECT * FROM %s ORDER BY id", tableName)); + createAndInitTable( + "id INT, dep STRING", + "{ \"id\": 1, \"dep\": \"emp-id-one\" }\n" + "{ \"id\": 6, \"dep\": \"emp-id-6\" }"); + + createOrReplaceView( + "source", + "id INT, dep STRING", + "{ \"id\": 1, \"dep\": \"emp-id-1\" }\n" + + "{ \"id\": 1, \"dep\": \"emp-id-1\" }\n" + + "{ \"id\": 2, \"dep\": \"emp-id-2\" }\n" + + "{ \"id\": 6, \"dep\": \"emp-id-6\" }"); + + sql( + "MERGE INTO %s AS t USING source AS s " + + "ON t.id == s.id " + + "WHEN MATCHED THEN " + + " DELETE " + + "WHEN NOT MATCHED AND s.id = 2 THEN " + + " INSERT *", + tableName); + + ImmutableList expectedRows = + ImmutableList.of( + row(2, "emp-id-2") // new + ); + assertEquals( + "Should have expected rows", expectedRows, sql("SELECT * FROM %s ORDER BY id", tableName)); } @Test public void testMergeWithSingleConditionalDelete() { - createAndInitTable("id INT, dep STRING", - "{ \"id\": 1, \"dep\": \"emp-id-one\" }\n" + - "{ \"id\": 6, \"dep\": \"emp-id-6\" }"); + createAndInitTable( + "id INT, dep STRING", + "{ \"id\": 1, \"dep\": \"emp-id-one\" }\n" + "{ \"id\": 6, \"dep\": \"emp-id-6\" }"); - createOrReplaceView("source", "id INT, dep STRING", - "{ \"id\": 1, \"dep\": \"emp-id-1\" }\n" + - "{ \"id\": 1, \"dep\": \"emp-id-1\" }\n" + - "{ \"id\": 2, \"dep\": \"emp-id-2\" }\n" + - "{ \"id\": 6, \"dep\": \"emp-id-6\" }"); + createOrReplaceView( + "source", + "id INT, dep STRING", + "{ \"id\": 1, \"dep\": \"emp-id-1\" }\n" + + "{ \"id\": 1, \"dep\": \"emp-id-1\" }\n" + + "{ \"id\": 2, \"dep\": \"emp-id-2\" }\n" + + "{ \"id\": 6, \"dep\": \"emp-id-6\" }"); String errorMsg = "a single row from the target table with multiple rows of the source table"; - AssertHelpers.assertThrows("Should complain non iceberg target table", - SparkException.class, errorMsg, + AssertHelpers.assertThrows( + "Should complain non iceberg target table", + SparkException.class, + errorMsg, () -> { - sql("MERGE INTO %s AS t USING source AS s " + - "ON t.id == s.id " + - "WHEN MATCHED AND t.id = 1 THEN " + - " DELETE " + - "WHEN NOT MATCHED AND s.id = 2 THEN " + - " INSERT *", tableName); + sql( + "MERGE INTO %s AS t USING source AS s " + + "ON t.id == s.id " + + "WHEN MATCHED AND t.id = 1 THEN " + + " DELETE " + + "WHEN NOT MATCHED AND s.id = 2 THEN " + + " INSERT *", + tableName); }); - assertEquals("Target should be unchanged", + assertEquals( + "Target should be unchanged", ImmutableList.of(row(1, "emp-id-one"), row(6, "emp-id-6")), sql("SELECT * FROM %s ORDER BY id ASC NULLS LAST", tableName)); } @@ -403,31 +484,41 @@ public void testMergeWithIdentityTransform() { for (DistributionMode mode : 
DistributionMode.values()) { createAndInitTable("id INT, dep STRING"); sql("ALTER TABLE %s ADD PARTITION FIELD identity(dep)", tableName); - sql("ALTER TABLE %s SET TBLPROPERTIES('%s' '%s')", tableName, WRITE_DISTRIBUTION_MODE, mode.modeName()); - - append(tableName, - "{ \"id\": 1, \"dep\": \"emp-id-one\" }\n" + - "{ \"id\": 6, \"dep\": \"emp-id-6\" }"); - - createOrReplaceView("source", "id INT, dep STRING", - "{ \"id\": 2, \"dep\": \"emp-id-2\" }\n" + - "{ \"id\": 1, \"dep\": \"emp-id-1\" }\n" + - "{ \"id\": 6, \"dep\": \"emp-id-6\" }"); - - sql("MERGE INTO %s AS t USING source AS s " + - "ON t.id == s.id " + - "WHEN MATCHED AND t.id = 1 THEN " + - " UPDATE SET * " + - "WHEN MATCHED AND t.id = 6 THEN " + - " DELETE " + - "WHEN NOT MATCHED AND s.id = 2 THEN " + - " INSERT *", tableName); - - ImmutableList expectedRows = ImmutableList.of( - row(1, "emp-id-1"), // updated - row(2, "emp-id-2") // new - ); - assertEquals("Should have expected rows", expectedRows, sql("SELECT * FROM %s ORDER BY id", tableName)); + sql( + "ALTER TABLE %s SET TBLPROPERTIES('%s' '%s')", + tableName, WRITE_DISTRIBUTION_MODE, mode.modeName()); + + append( + tableName, + "{ \"id\": 1, \"dep\": \"emp-id-one\" }\n" + "{ \"id\": 6, \"dep\": \"emp-id-6\" }"); + + createOrReplaceView( + "source", + "id INT, dep STRING", + "{ \"id\": 2, \"dep\": \"emp-id-2\" }\n" + + "{ \"id\": 1, \"dep\": \"emp-id-1\" }\n" + + "{ \"id\": 6, \"dep\": \"emp-id-6\" }"); + + sql( + "MERGE INTO %s AS t USING source AS s " + + "ON t.id == s.id " + + "WHEN MATCHED AND t.id = 1 THEN " + + " UPDATE SET * " + + "WHEN MATCHED AND t.id = 6 THEN " + + " DELETE " + + "WHEN NOT MATCHED AND s.id = 2 THEN " + + " INSERT *", + tableName); + + ImmutableList expectedRows = + ImmutableList.of( + row(1, "emp-id-1"), // updated + row(2, "emp-id-2") // new + ); + assertEquals( + "Should have expected rows", + expectedRows, + sql("SELECT * FROM %s ORDER BY id", tableName)); removeTables(); } @@ -438,31 +529,41 @@ public void testMergeWithDaysTransform() { for (DistributionMode mode : DistributionMode.values()) { createAndInitTable("id INT, ts TIMESTAMP"); sql("ALTER TABLE %s ADD PARTITION FIELD days(ts)", tableName); - sql("ALTER TABLE %s SET TBLPROPERTIES('%s' '%s')", tableName, WRITE_DISTRIBUTION_MODE, mode.modeName()); - - append(tableName, "id INT, ts TIMESTAMP", - "{ \"id\": 1, \"ts\": \"2000-01-01 00:00:00\" }\n" + - "{ \"id\": 6, \"ts\": \"2000-01-06 00:00:00\" }"); - - createOrReplaceView("source", "id INT, ts TIMESTAMP", - "{ \"id\": 2, \"ts\": \"2001-01-02 00:00:00\" }\n" + - "{ \"id\": 1, \"ts\": \"2001-01-01 00:00:00\" }\n" + - "{ \"id\": 6, \"ts\": \"2001-01-06 00:00:00\" }"); - - sql("MERGE INTO %s AS t USING source AS s " + - "ON t.id == s.id " + - "WHEN MATCHED AND t.id = 1 THEN " + - " UPDATE SET * " + - "WHEN MATCHED AND t.id = 6 THEN " + - " DELETE " + - "WHEN NOT MATCHED AND s.id = 2 THEN " + - " INSERT *", tableName); - - ImmutableList expectedRows = ImmutableList.of( - row(1, "2001-01-01 00:00:00"), // updated - row(2, "2001-01-02 00:00:00") // new - ); - assertEquals("Should have expected rows", + sql( + "ALTER TABLE %s SET TBLPROPERTIES('%s' '%s')", + tableName, WRITE_DISTRIBUTION_MODE, mode.modeName()); + + append( + tableName, + "id INT, ts TIMESTAMP", + "{ \"id\": 1, \"ts\": \"2000-01-01 00:00:00\" }\n" + + "{ \"id\": 6, \"ts\": \"2000-01-06 00:00:00\" }"); + + createOrReplaceView( + "source", + "id INT, ts TIMESTAMP", + "{ \"id\": 2, \"ts\": \"2001-01-02 00:00:00\" }\n" + + "{ \"id\": 1, \"ts\": \"2001-01-01 00:00:00\" }\n" + + "{ 
\"id\": 6, \"ts\": \"2001-01-06 00:00:00\" }"); + + sql( + "MERGE INTO %s AS t USING source AS s " + + "ON t.id == s.id " + + "WHEN MATCHED AND t.id = 1 THEN " + + " UPDATE SET * " + + "WHEN MATCHED AND t.id = 6 THEN " + + " DELETE " + + "WHEN NOT MATCHED AND s.id = 2 THEN " + + " INSERT *", + tableName); + + ImmutableList expectedRows = + ImmutableList.of( + row(1, "2001-01-01 00:00:00"), // updated + row(2, "2001-01-02 00:00:00") // new + ); + assertEquals( + "Should have expected rows", expectedRows, sql("SELECT id, CAST(ts AS STRING) FROM %s ORDER BY id", tableName)); @@ -475,31 +576,41 @@ public void testMergeWithBucketTransform() { for (DistributionMode mode : DistributionMode.values()) { createAndInitTable("id INT, dep STRING"); sql("ALTER TABLE %s ADD PARTITION FIELD bucket(2, dep)", tableName); - sql("ALTER TABLE %s SET TBLPROPERTIES('%s' '%s')", tableName, WRITE_DISTRIBUTION_MODE, mode.modeName()); - - append(tableName, - "{ \"id\": 1, \"dep\": \"emp-id-one\" }\n" + - "{ \"id\": 6, \"dep\": \"emp-id-6\" }"); - - createOrReplaceView("source", "id INT, dep STRING", - "{ \"id\": 2, \"dep\": \"emp-id-2\" }\n" + - "{ \"id\": 1, \"dep\": \"emp-id-1\" }\n" + - "{ \"id\": 6, \"dep\": \"emp-id-6\" }"); - - sql("MERGE INTO %s AS t USING source AS s " + - "ON t.id == s.id " + - "WHEN MATCHED AND t.id = 1 THEN " + - " UPDATE SET * " + - "WHEN MATCHED AND t.id = 6 THEN " + - " DELETE " + - "WHEN NOT MATCHED AND s.id = 2 THEN " + - " INSERT *", tableName); - - ImmutableList expectedRows = ImmutableList.of( - row(1, "emp-id-1"), // updated - row(2, "emp-id-2") // new - ); - assertEquals("Should have expected rows", expectedRows, sql("SELECT * FROM %s ORDER BY id", tableName)); + sql( + "ALTER TABLE %s SET TBLPROPERTIES('%s' '%s')", + tableName, WRITE_DISTRIBUTION_MODE, mode.modeName()); + + append( + tableName, + "{ \"id\": 1, \"dep\": \"emp-id-one\" }\n" + "{ \"id\": 6, \"dep\": \"emp-id-6\" }"); + + createOrReplaceView( + "source", + "id INT, dep STRING", + "{ \"id\": 2, \"dep\": \"emp-id-2\" }\n" + + "{ \"id\": 1, \"dep\": \"emp-id-1\" }\n" + + "{ \"id\": 6, \"dep\": \"emp-id-6\" }"); + + sql( + "MERGE INTO %s AS t USING source AS s " + + "ON t.id == s.id " + + "WHEN MATCHED AND t.id = 1 THEN " + + " UPDATE SET * " + + "WHEN MATCHED AND t.id = 6 THEN " + + " DELETE " + + "WHEN NOT MATCHED AND s.id = 2 THEN " + + " INSERT *", + tableName); + + ImmutableList expectedRows = + ImmutableList.of( + row(1, "emp-id-1"), // updated + row(2, "emp-id-2") // new + ); + assertEquals( + "Should have expected rows", + expectedRows, + sql("SELECT * FROM %s ORDER BY id", tableName)); removeTables(); } @@ -510,31 +621,41 @@ public void testMergeWithTruncateTransform() { for (DistributionMode mode : DistributionMode.values()) { createAndInitTable("id INT, dep STRING"); sql("ALTER TABLE %s ADD PARTITION FIELD truncate(dep, 2)", tableName); - sql("ALTER TABLE %s SET TBLPROPERTIES('%s' '%s')", tableName, WRITE_DISTRIBUTION_MODE, mode.modeName()); - - append(tableName, - "{ \"id\": 1, \"dep\": \"emp-id-one\" }\n" + - "{ \"id\": 6, \"dep\": \"emp-id-6\" }"); - - createOrReplaceView("source", "id INT, dep STRING", - "{ \"id\": 2, \"dep\": \"emp-id-2\" }\n" + - "{ \"id\": 1, \"dep\": \"emp-id-1\" }\n" + - "{ \"id\": 6, \"dep\": \"emp-id-6\" }"); - - sql("MERGE INTO %s AS t USING source AS s " + - "ON t.id == s.id " + - "WHEN MATCHED AND t.id = 1 THEN " + - " UPDATE SET * " + - "WHEN MATCHED AND t.id = 6 THEN " + - " DELETE " + - "WHEN NOT MATCHED AND s.id = 2 THEN " + - " INSERT *", tableName); - - ImmutableList 
expectedRows = ImmutableList.of( - row(1, "emp-id-1"), // updated - row(2, "emp-id-2") // new - ); - assertEquals("Should have expected rows", expectedRows, sql("SELECT * FROM %s ORDER BY id", tableName)); + sql( + "ALTER TABLE %s SET TBLPROPERTIES('%s' '%s')", + tableName, WRITE_DISTRIBUTION_MODE, mode.modeName()); + + append( + tableName, + "{ \"id\": 1, \"dep\": \"emp-id-one\" }\n" + "{ \"id\": 6, \"dep\": \"emp-id-6\" }"); + + createOrReplaceView( + "source", + "id INT, dep STRING", + "{ \"id\": 2, \"dep\": \"emp-id-2\" }\n" + + "{ \"id\": 1, \"dep\": \"emp-id-1\" }\n" + + "{ \"id\": 6, \"dep\": \"emp-id-6\" }"); + + sql( + "MERGE INTO %s AS t USING source AS s " + + "ON t.id == s.id " + + "WHEN MATCHED AND t.id = 1 THEN " + + " UPDATE SET * " + + "WHEN MATCHED AND t.id = 6 THEN " + + " DELETE " + + "WHEN NOT MATCHED AND s.id = 2 THEN " + + " INSERT *", + tableName); + + ImmutableList expectedRows = + ImmutableList.of( + row(1, "emp-id-1"), // updated + row(2, "emp-id-2") // new + ); + assertEquals( + "Should have expected rows", + expectedRows, + sql("SELECT * FROM %s ORDER BY id", tableName)); removeTables(); } @@ -546,31 +667,41 @@ public void testMergeIntoPartitionedAndOrderedTable() { createAndInitTable("id INT, dep STRING"); sql("ALTER TABLE %s ADD PARTITION FIELD dep", tableName); sql("ALTER TABLE %s WRITE ORDERED BY (id)", tableName); - sql("ALTER TABLE %s SET TBLPROPERTIES('%s' '%s')", tableName, WRITE_DISTRIBUTION_MODE, mode.modeName()); - - append(tableName, - "{ \"id\": 1, \"dep\": \"emp-id-one\" }\n" + - "{ \"id\": 6, \"dep\": \"emp-id-6\" }"); - - createOrReplaceView("source", "id INT, dep STRING", - "{ \"id\": 2, \"dep\": \"emp-id-2\" }\n" + - "{ \"id\": 1, \"dep\": \"emp-id-1\" }\n" + - "{ \"id\": 6, \"dep\": \"emp-id-6\" }"); - - sql("MERGE INTO %s AS t USING source AS s " + - "ON t.id == s.id " + - "WHEN MATCHED AND t.id = 1 THEN " + - " UPDATE SET * " + - "WHEN MATCHED AND t.id = 6 THEN " + - " DELETE " + - "WHEN NOT MATCHED AND s.id = 2 THEN " + - " INSERT *", tableName); - - ImmutableList expectedRows = ImmutableList.of( - row(1, "emp-id-1"), // updated - row(2, "emp-id-2") // new - ); - assertEquals("Should have expected rows", expectedRows, sql("SELECT * FROM %s ORDER BY id", tableName)); + sql( + "ALTER TABLE %s SET TBLPROPERTIES('%s' '%s')", + tableName, WRITE_DISTRIBUTION_MODE, mode.modeName()); + + append( + tableName, + "{ \"id\": 1, \"dep\": \"emp-id-one\" }\n" + "{ \"id\": 6, \"dep\": \"emp-id-6\" }"); + + createOrReplaceView( + "source", + "id INT, dep STRING", + "{ \"id\": 2, \"dep\": \"emp-id-2\" }\n" + + "{ \"id\": 1, \"dep\": \"emp-id-1\" }\n" + + "{ \"id\": 6, \"dep\": \"emp-id-6\" }"); + + sql( + "MERGE INTO %s AS t USING source AS s " + + "ON t.id == s.id " + + "WHEN MATCHED AND t.id = 1 THEN " + + " UPDATE SET * " + + "WHEN MATCHED AND t.id = 6 THEN " + + " DELETE " + + "WHEN NOT MATCHED AND s.id = 2 THEN " + + " INSERT *", + tableName); + + ImmutableList expectedRows = + ImmutableList.of( + row(1, "emp-id-1"), // updated + row(2, "emp-id-2") // new + ); + assertEquals( + "Should have expected rows", + expectedRows, + sql("SELECT * FROM %s ORDER BY id", tableName)); removeTables(); } @@ -578,44 +709,50 @@ public void testMergeIntoPartitionedAndOrderedTable() { @Test public void testSelfMerge() { - createAndInitTable("id INT, v STRING", - "{ \"id\": 1, \"v\": \"v1\" }\n" + - "{ \"id\": 2, \"v\": \"v2\" }"); - - sql("MERGE INTO %s t USING %s s " + - "ON t.id == s.id " + - "WHEN MATCHED AND t.id = 1 THEN " + - " UPDATE SET v = 'x' " + - "WHEN NOT 
MATCHED THEN " + - " INSERT *", tableName, tableName); - - ImmutableList expectedRows = ImmutableList.of( - row(1, "x"), // updated - row(2, "v2") // kept - ); - assertEquals("Output should match", expectedRows, sql("SELECT * FROM %s ORDER BY id", tableName)); + createAndInitTable( + "id INT, v STRING", "{ \"id\": 1, \"v\": \"v1\" }\n" + "{ \"id\": 2, \"v\": \"v2\" }"); + + sql( + "MERGE INTO %s t USING %s s " + + "ON t.id == s.id " + + "WHEN MATCHED AND t.id = 1 THEN " + + " UPDATE SET v = 'x' " + + "WHEN NOT MATCHED THEN " + + " INSERT *", + tableName, tableName); + + ImmutableList expectedRows = + ImmutableList.of( + row(1, "x"), // updated + row(2, "v2") // kept + ); + assertEquals( + "Output should match", expectedRows, sql("SELECT * FROM %s ORDER BY id", tableName)); } @Test public void testMergeWithSourceAsSelfSubquery() { - createAndInitTable("id INT, v STRING", - "{ \"id\": 1, \"v\": \"v1\" }\n" + - "{ \"id\": 2, \"v\": \"v2\" }"); + createAndInitTable( + "id INT, v STRING", "{ \"id\": 1, \"v\": \"v1\" }\n" + "{ \"id\": 2, \"v\": \"v2\" }"); createOrReplaceView("source", Arrays.asList(1, null), Encoders.INT()); - sql("MERGE INTO %s t USING (SELECT id AS value FROM %s r JOIN source ON r.id = source.value) s " + - "ON t.id == s.value " + - "WHEN MATCHED AND t.id = 1 THEN " + - " UPDATE SET v = 'x' " + - "WHEN NOT MATCHED THEN " + - " INSERT (v, id) VALUES ('invalid', -1) ", tableName, tableName); - - ImmutableList expectedRows = ImmutableList.of( - row(1, "x"), // updated - row(2, "v2") // kept - ); - assertEquals("Output should match", expectedRows, sql("SELECT * FROM %s ORDER BY id", tableName)); + sql( + "MERGE INTO %s t USING (SELECT id AS value FROM %s r JOIN source ON r.id = source.value) s " + + "ON t.id == s.value " + + "WHEN MATCHED AND t.id = 1 THEN " + + " UPDATE SET v = 'x' " + + "WHEN NOT MATCHED THEN " + + " INSERT (v, id) VALUES ('invalid', -1) ", + tableName, tableName); + + ImmutableList expectedRows = + ImmutableList.of( + row(1, "x"), // updated + row(2, "v2") // kept + ); + assertEquals( + "Output should match", expectedRows, sql("SELECT * FROM %s ORDER BY id", tableName)); } @Test @@ -626,37 +763,46 @@ public synchronized void testMergeWithSerializableIsolation() throws Interrupted createAndInitTable("id INT, dep STRING"); createOrReplaceView("source", Collections.singletonList(1), Encoders.INT()); - sql("ALTER TABLE %s SET TBLPROPERTIES('%s' '%s')", tableName, MERGE_ISOLATION_LEVEL, "serializable"); + sql( + "ALTER TABLE %s SET TBLPROPERTIES('%s' '%s')", + tableName, MERGE_ISOLATION_LEVEL, "serializable"); - ExecutorService executorService = MoreExecutors.getExitingExecutorService( - (ThreadPoolExecutor) Executors.newFixedThreadPool(2)); + ExecutorService executorService = + MoreExecutors.getExitingExecutorService( + (ThreadPoolExecutor) Executors.newFixedThreadPool(2)); AtomicInteger barrier = new AtomicInteger(0); // merge thread - Future mergeFuture = executorService.submit(() -> { - for (int numOperations = 0; numOperations < Integer.MAX_VALUE; numOperations++) { - while (barrier.get() < numOperations * 2) { - sleep(10); - } - sql("MERGE INTO %s t USING source s " + - "ON t.id == s.value " + - "WHEN MATCHED THEN " + - " UPDATE SET dep = 'x'", tableName); - barrier.incrementAndGet(); - } - }); + Future mergeFuture = + executorService.submit( + () -> { + for (int numOperations = 0; numOperations < Integer.MAX_VALUE; numOperations++) { + while (barrier.get() < numOperations * 2) { + sleep(10); + } + sql( + "MERGE INTO %s t USING source s " + + "ON t.id == 
s.value " + + "WHEN MATCHED THEN " + + " UPDATE SET dep = 'x'", + tableName); + barrier.incrementAndGet(); + } + }); // append thread - Future appendFuture = executorService.submit(() -> { - for (int numOperations = 0; numOperations < Integer.MAX_VALUE; numOperations++) { - while (barrier.get() < numOperations * 2) { - sleep(10); - } - sql("INSERT INTO TABLE %s VALUES (1, 'hr')", tableName); - barrier.incrementAndGet(); - } - }); + Future appendFuture = + executorService.submit( + () -> { + for (int numOperations = 0; numOperations < Integer.MAX_VALUE; numOperations++) { + while (barrier.get() < numOperations * 2) { + sleep(10); + } + sql("INSERT INTO TABLE %s VALUES (1, 'hr')", tableName); + barrier.incrementAndGet(); + } + }); try { mergeFuture.get(); @@ -667,7 +813,8 @@ public synchronized void testMergeWithSerializableIsolation() throws Interrupted Throwable validationException = sparkException.getCause(); Assert.assertThat(validationException, CoreMatchers.instanceOf(ValidationException.class)); String errMsg = validationException.getMessage(); - Assert.assertThat(errMsg, CoreMatchers.containsString("Found conflicting files that can contain")); + Assert.assertThat( + errMsg, CoreMatchers.containsString("Found conflicting files that can contain")); } finally { appendFuture.cancel(true); } @@ -677,44 +824,54 @@ public synchronized void testMergeWithSerializableIsolation() throws Interrupted } @Test - public synchronized void testMergeWithSnapshotIsolation() throws InterruptedException, ExecutionException { + public synchronized void testMergeWithSnapshotIsolation() + throws InterruptedException, ExecutionException { // cannot run tests with concurrency for Hadoop tables without atomic renames Assume.assumeFalse(catalogName.equalsIgnoreCase("testhadoop")); createAndInitTable("id INT, dep STRING"); createOrReplaceView("source", Collections.singletonList(1), Encoders.INT()); - sql("ALTER TABLE %s SET TBLPROPERTIES('%s' '%s')", tableName, MERGE_ISOLATION_LEVEL, "snapshot"); + sql( + "ALTER TABLE %s SET TBLPROPERTIES('%s' '%s')", + tableName, MERGE_ISOLATION_LEVEL, "snapshot"); - ExecutorService executorService = MoreExecutors.getExitingExecutorService( - (ThreadPoolExecutor) Executors.newFixedThreadPool(2)); + ExecutorService executorService = + MoreExecutors.getExitingExecutorService( + (ThreadPoolExecutor) Executors.newFixedThreadPool(2)); AtomicInteger barrier = new AtomicInteger(0); // merge thread - Future mergeFuture = executorService.submit(() -> { - for (int numOperations = 0; numOperations < 20; numOperations++) { - while (barrier.get() < numOperations * 2) { - sleep(10); - } - sql("MERGE INTO %s t USING source s " + - "ON t.id == s.value " + - "WHEN MATCHED THEN " + - " UPDATE SET dep = 'x'", tableName); - barrier.incrementAndGet(); - } - }); + Future mergeFuture = + executorService.submit( + () -> { + for (int numOperations = 0; numOperations < 20; numOperations++) { + while (barrier.get() < numOperations * 2) { + sleep(10); + } + sql( + "MERGE INTO %s t USING source s " + + "ON t.id == s.value " + + "WHEN MATCHED THEN " + + " UPDATE SET dep = 'x'", + tableName); + barrier.incrementAndGet(); + } + }); // append thread - Future appendFuture = executorService.submit(() -> { - for (int numOperations = 0; numOperations < 20; numOperations++) { - while (barrier.get() < numOperations * 2) { - sleep(10); - } - sql("INSERT INTO TABLE %s VALUES (1, 'hr')", tableName); - barrier.incrementAndGet(); - } - }); + Future appendFuture = + executorService.submit( + () -> { + for (int 
numOperations = 0; numOperations < 20; numOperations++) { + while (barrier.get() < numOperations * 2) { + sleep(10); + } + sql("INSERT INTO TABLE %s VALUES (1, 'hr')", tableName); + barrier.incrementAndGet(); + } + }); try { mergeFuture.get(); @@ -728,175 +885,195 @@ public synchronized void testMergeWithSnapshotIsolation() throws InterruptedExce @Test public void testMergeWithExtraColumnsInSource() { - createAndInitTable("id INT, v STRING", - "{ \"id\": 1, \"v\": \"v1\" }\n" + - "{ \"id\": 2, \"v\": \"v2\" }"); - createOrReplaceView("source", - "{ \"id\": 1, \"extra_col\": -1, \"v\": \"v1_1\" }\n" + - "{ \"id\": 3, \"extra_col\": -1, \"v\": \"v3\" }\n" + - "{ \"id\": 4, \"extra_col\": -1, \"v\": \"v4\" }"); - - sql("MERGE INTO %s t USING source " + - "ON t.id == source.id " + - "WHEN MATCHED THEN " + - " UPDATE SET v = source.v " + - "WHEN NOT MATCHED THEN " + - " INSERT (v, id) VALUES (source.v, source.id)", tableName); - - ImmutableList expectedRows = ImmutableList.of( - row(1, "v1_1"), // new - row(2, "v2"), // kept - row(3, "v3"), // new - row(4, "v4") // new - ); - assertEquals("Output should match", expectedRows, sql("SELECT * FROM %s ORDER BY id", tableName)); + createAndInitTable( + "id INT, v STRING", "{ \"id\": 1, \"v\": \"v1\" }\n" + "{ \"id\": 2, \"v\": \"v2\" }"); + createOrReplaceView( + "source", + "{ \"id\": 1, \"extra_col\": -1, \"v\": \"v1_1\" }\n" + + "{ \"id\": 3, \"extra_col\": -1, \"v\": \"v3\" }\n" + + "{ \"id\": 4, \"extra_col\": -1, \"v\": \"v4\" }"); + + sql( + "MERGE INTO %s t USING source " + + "ON t.id == source.id " + + "WHEN MATCHED THEN " + + " UPDATE SET v = source.v " + + "WHEN NOT MATCHED THEN " + + " INSERT (v, id) VALUES (source.v, source.id)", + tableName); + + ImmutableList expectedRows = + ImmutableList.of( + row(1, "v1_1"), // new + row(2, "v2"), // kept + row(3, "v3"), // new + row(4, "v4") // new + ); + assertEquals( + "Output should match", expectedRows, sql("SELECT * FROM %s ORDER BY id", tableName)); } @Test public void testMergeWithNullsInTargetAndSource() { - createAndInitTable("id INT, v STRING", - "{ \"id\": null, \"v\": \"v1\" }\n" + - "{ \"id\": 2, \"v\": \"v2\" }"); - - createOrReplaceView("source", - "{ \"id\": null, \"v\": \"v1_1\" }\n" + - "{ \"id\": 4, \"v\": \"v4\" }"); - - sql("MERGE INTO %s t USING source " + - "ON t.id == source.id " + - "WHEN MATCHED THEN " + - " UPDATE SET v = source.v " + - "WHEN NOT MATCHED THEN " + - " INSERT (v, id) VALUES (source.v, source.id)", tableName); - - ImmutableList expectedRows = ImmutableList.of( - row(null, "v1"), // kept - row(null, "v1_1"), // new - row(2, "v2"), // kept - row(4, "v4") // new - ); - assertEquals("Output should match", expectedRows, sql("SELECT * FROM %s ORDER BY v", tableName)); + createAndInitTable( + "id INT, v STRING", "{ \"id\": null, \"v\": \"v1\" }\n" + "{ \"id\": 2, \"v\": \"v2\" }"); + + createOrReplaceView( + "source", "{ \"id\": null, \"v\": \"v1_1\" }\n" + "{ \"id\": 4, \"v\": \"v4\" }"); + + sql( + "MERGE INTO %s t USING source " + + "ON t.id == source.id " + + "WHEN MATCHED THEN " + + " UPDATE SET v = source.v " + + "WHEN NOT MATCHED THEN " + + " INSERT (v, id) VALUES (source.v, source.id)", + tableName); + + ImmutableList expectedRows = + ImmutableList.of( + row(null, "v1"), // kept + row(null, "v1_1"), // new + row(2, "v2"), // kept + row(4, "v4") // new + ); + assertEquals( + "Output should match", expectedRows, sql("SELECT * FROM %s ORDER BY v", tableName)); } @Test public void testMergeWithNullSafeEquals() { - createAndInitTable("id INT, v STRING", - "{ 
\"id\": null, \"v\": \"v1\" }\n" + - "{ \"id\": 2, \"v\": \"v2\" }"); - - createOrReplaceView("source", - "{ \"id\": null, \"v\": \"v1_1\" }\n" + - "{ \"id\": 4, \"v\": \"v4\" }"); - - sql("MERGE INTO %s t USING source " + - "ON t.id <=> source.id " + - "WHEN MATCHED THEN " + - " UPDATE SET v = source.v " + - "WHEN NOT MATCHED THEN " + - " INSERT (v, id) VALUES (source.v, source.id)", tableName); - - ImmutableList expectedRows = ImmutableList.of( - row(null, "v1_1"), // updated - row(2, "v2"), // kept - row(4, "v4") // new - ); - assertEquals("Output should match", expectedRows, sql("SELECT * FROM %s ORDER BY v", tableName)); + createAndInitTable( + "id INT, v STRING", "{ \"id\": null, \"v\": \"v1\" }\n" + "{ \"id\": 2, \"v\": \"v2\" }"); + + createOrReplaceView( + "source", "{ \"id\": null, \"v\": \"v1_1\" }\n" + "{ \"id\": 4, \"v\": \"v4\" }"); + + sql( + "MERGE INTO %s t USING source " + + "ON t.id <=> source.id " + + "WHEN MATCHED THEN " + + " UPDATE SET v = source.v " + + "WHEN NOT MATCHED THEN " + + " INSERT (v, id) VALUES (source.v, source.id)", + tableName); + + ImmutableList expectedRows = + ImmutableList.of( + row(null, "v1_1"), // updated + row(2, "v2"), // kept + row(4, "v4") // new + ); + assertEquals( + "Output should match", expectedRows, sql("SELECT * FROM %s ORDER BY v", tableName)); } @Test public void testMergeWithNullCondition() { - createAndInitTable("id INT, v STRING", - "{ \"id\": null, \"v\": \"v1\" }\n" + - "{ \"id\": 2, \"v\": \"v2\" }"); - - createOrReplaceView("source", - "{ \"id\": null, \"v\": \"v1_1\" }\n" + - "{ \"id\": 2, \"v\": \"v2_2\" }"); - - sql("MERGE INTO %s t USING source " + - "ON t.id == source.id AND NULL " + - "WHEN MATCHED THEN " + - " UPDATE SET v = source.v " + - "WHEN NOT MATCHED THEN " + - " INSERT (v, id) VALUES (source.v, source.id)", tableName); - - ImmutableList expectedRows = ImmutableList.of( - row(null, "v1"), // kept - row(null, "v1_1"), // new - row(2, "v2"), // kept - row(2, "v2_2") // new - ); - assertEquals("Output should match", expectedRows, sql("SELECT * FROM %s ORDER BY v", tableName)); + createAndInitTable( + "id INT, v STRING", "{ \"id\": null, \"v\": \"v1\" }\n" + "{ \"id\": 2, \"v\": \"v2\" }"); + + createOrReplaceView( + "source", "{ \"id\": null, \"v\": \"v1_1\" }\n" + "{ \"id\": 2, \"v\": \"v2_2\" }"); + + sql( + "MERGE INTO %s t USING source " + + "ON t.id == source.id AND NULL " + + "WHEN MATCHED THEN " + + " UPDATE SET v = source.v " + + "WHEN NOT MATCHED THEN " + + " INSERT (v, id) VALUES (source.v, source.id)", + tableName); + + ImmutableList expectedRows = + ImmutableList.of( + row(null, "v1"), // kept + row(null, "v1_1"), // new + row(2, "v2"), // kept + row(2, "v2_2") // new + ); + assertEquals( + "Output should match", expectedRows, sql("SELECT * FROM %s ORDER BY v", tableName)); } @Test public void testMergeWithNullActionConditions() { - createAndInitTable("id INT, v STRING", - "{ \"id\": 1, \"v\": \"v1\" }\n" + - "{ \"id\": 2, \"v\": \"v2\" }"); + createAndInitTable( + "id INT, v STRING", "{ \"id\": 1, \"v\": \"v1\" }\n" + "{ \"id\": 2, \"v\": \"v2\" }"); - createOrReplaceView("source", - "{ \"id\": 1, \"v\": \"v1_1\" }\n" + - "{ \"id\": 2, \"v\": \"v2_2\" }\n" + - "{ \"id\": 3, \"v\": \"v3_3\" }"); + createOrReplaceView( + "source", + "{ \"id\": 1, \"v\": \"v1_1\" }\n" + + "{ \"id\": 2, \"v\": \"v2_2\" }\n" + + "{ \"id\": 3, \"v\": \"v3_3\" }"); // all conditions are NULL and will never match any rows - sql("MERGE INTO %s t USING source " + - "ON t.id == source.id " + - "WHEN MATCHED AND source.id = 1 AND 
NULL THEN " + - " UPDATE SET v = source.v " + - "WHEN MATCHED AND source.v = 'v1_1' AND NULL THEN " + - " DELETE " + - "WHEN NOT MATCHED AND source.id = 3 AND NULL THEN " + - " INSERT (v, id) VALUES (source.v, source.id)", tableName); - - ImmutableList expectedRows1 = ImmutableList.of( - row(1, "v1"), // kept - row(2, "v2") // kept - ); - assertEquals("Output should match", expectedRows1, sql("SELECT * FROM %s ORDER BY v", tableName)); + sql( + "MERGE INTO %s t USING source " + + "ON t.id == source.id " + + "WHEN MATCHED AND source.id = 1 AND NULL THEN " + + " UPDATE SET v = source.v " + + "WHEN MATCHED AND source.v = 'v1_1' AND NULL THEN " + + " DELETE " + + "WHEN NOT MATCHED AND source.id = 3 AND NULL THEN " + + " INSERT (v, id) VALUES (source.v, source.id)", + tableName); + + ImmutableList expectedRows1 = + ImmutableList.of( + row(1, "v1"), // kept + row(2, "v2") // kept + ); + assertEquals( + "Output should match", expectedRows1, sql("SELECT * FROM %s ORDER BY v", tableName)); // only the update and insert conditions are NULL - sql("MERGE INTO %s t USING source " + - "ON t.id == source.id " + - "WHEN MATCHED AND source.id = 1 AND NULL THEN " + - " UPDATE SET v = source.v " + - "WHEN MATCHED AND source.v = 'v1_1' THEN " + - " DELETE " + - "WHEN NOT MATCHED AND source.id = 3 AND NULL THEN " + - " INSERT (v, id) VALUES (source.v, source.id)", tableName); - - ImmutableList expectedRows2 = ImmutableList.of( - row(2, "v2") // kept - ); - assertEquals("Output should match", expectedRows2, sql("SELECT * FROM %s ORDER BY v", tableName)); + sql( + "MERGE INTO %s t USING source " + + "ON t.id == source.id " + + "WHEN MATCHED AND source.id = 1 AND NULL THEN " + + " UPDATE SET v = source.v " + + "WHEN MATCHED AND source.v = 'v1_1' THEN " + + " DELETE " + + "WHEN NOT MATCHED AND source.id = 3 AND NULL THEN " + + " INSERT (v, id) VALUES (source.v, source.id)", + tableName); + + ImmutableList expectedRows2 = + ImmutableList.of( + row(2, "v2") // kept + ); + assertEquals( + "Output should match", expectedRows2, sql("SELECT * FROM %s ORDER BY v", tableName)); } @Test public void testMergeWithMultipleMatchingActions() { - createAndInitTable("id INT, v STRING", - "{ \"id\": 1, \"v\": \"v1\" }\n" + - "{ \"id\": 2, \"v\": \"v2\" }"); + createAndInitTable( + "id INT, v STRING", "{ \"id\": 1, \"v\": \"v1\" }\n" + "{ \"id\": 2, \"v\": \"v2\" }"); - createOrReplaceView("source", - "{ \"id\": 1, \"v\": \"v1_1\" }\n" + - "{ \"id\": 2, \"v\": \"v2_2\" }"); + createOrReplaceView( + "source", "{ \"id\": 1, \"v\": \"v1_1\" }\n" + "{ \"id\": 2, \"v\": \"v2_2\" }"); // the order of match actions is important in this case - sql("MERGE INTO %s t USING source " + - "ON t.id == source.id " + - "WHEN MATCHED AND source.id = 1 THEN " + - " UPDATE SET v = source.v " + - "WHEN MATCHED AND source.v = 'v1_1' THEN " + - " DELETE " + - "WHEN NOT MATCHED THEN " + - " INSERT (v, id) VALUES (source.v, source.id)", tableName); - - ImmutableList expectedRows = ImmutableList.of( - row(1, "v1_1"), // updated (also matches the delete cond but update is first) - row(2, "v2") // kept (matches neither the update nor the delete cond) - ); - assertEquals("Output should match", expectedRows, sql("SELECT * FROM %s ORDER BY v", tableName)); + sql( + "MERGE INTO %s t USING source " + + "ON t.id == source.id " + + "WHEN MATCHED AND source.id = 1 THEN " + + " UPDATE SET v = source.v " + + "WHEN MATCHED AND source.v = 'v1_1' THEN " + + " DELETE " + + "WHEN NOT MATCHED THEN " + + " INSERT (v, id) VALUES (source.v, source.id)", + tableName); + + 
ImmutableList expectedRows = + ImmutableList.of( + row(1, "v1_1"), // updated (also matches the delete cond but update is first) + row(2, "v2") // kept (matches neither the update nor the delete cond) + ); + assertEquals( + "Output should match", expectedRows, sql("SELECT * FROM %s ORDER BY v", tableName)); } @Test @@ -906,7 +1083,9 @@ public void testMergeWithMultipleRowGroupsParquet() throws NoSuchTableException createAndInitTable("id INT, dep STRING"); sql("ALTER TABLE %s ADD PARTITION FIELD dep", tableName); - sql("ALTER TABLE %s SET TBLPROPERTIES('%s' '%d')", tableName, PARQUET_ROW_GROUP_SIZE_BYTES, 100); + sql( + "ALTER TABLE %s SET TBLPROPERTIES('%s' '%d')", + tableName, PARQUET_ROW_GROUP_SIZE_BYTES, 100); sql("ALTER TABLE %s SET TBLPROPERTIES('%s' '%d')", tableName, SPLIT_SIZE, 100); createOrReplaceView("source", Collections.singletonList(1), Encoders.INT()); @@ -915,122 +1094,150 @@ public void testMergeWithMultipleRowGroupsParquet() throws NoSuchTableException for (int id = 1; id <= 200; id++) { ids.add(id); } - Dataset df = spark.createDataset(ids, Encoders.INT()) - .withColumnRenamed("value", "id") - .withColumn("dep", lit("hr")); + Dataset df = + spark + .createDataset(ids, Encoders.INT()) + .withColumnRenamed("value", "id") + .withColumn("dep", lit("hr")); df.coalesce(1).writeTo(tableName).append(); Assert.assertEquals(200, spark.table(tableName).count()); // update a record from one of two row groups and copy over the second one - sql("MERGE INTO %s t USING source " + - "ON t.id == source.value " + - "WHEN MATCHED THEN " + - " UPDATE SET dep = 'x'", tableName); + sql( + "MERGE INTO %s t USING source " + + "ON t.id == source.value " + + "WHEN MATCHED THEN " + + " UPDATE SET dep = 'x'", + tableName); Assert.assertEquals(200, spark.table(tableName).count()); } @Test public void testMergeInsertOnly() { - createAndInitTable("id STRING, v STRING", - "{ \"id\": \"a\", \"v\": \"v1\" }\n" + - "{ \"id\": \"b\", \"v\": \"v2\" }"); - createOrReplaceView("source", - "{ \"id\": \"a\", \"v\": \"v1_1\" }\n" + - "{ \"id\": \"a\", \"v\": \"v1_2\" }\n" + - "{ \"id\": \"c\", \"v\": \"v3\" }\n" + - "{ \"id\": \"d\", \"v\": \"v4_1\" }\n" + - "{ \"id\": \"d\", \"v\": \"v4_2\" }"); - - sql("MERGE INTO %s t USING source " + - "ON t.id == source.id " + - "WHEN NOT MATCHED THEN " + - " INSERT *", tableName); - - ImmutableList expectedRows = ImmutableList.of( - row("a", "v1"), // kept - row("b", "v2"), // kept - row("c", "v3"), // new - row("d", "v4_1"), // new - row("d", "v4_2") // new - ); - assertEquals("Output should match", expectedRows, sql("SELECT * FROM %s ORDER BY id", tableName)); + createAndInitTable( + "id STRING, v STRING", + "{ \"id\": \"a\", \"v\": \"v1\" }\n" + "{ \"id\": \"b\", \"v\": \"v2\" }"); + createOrReplaceView( + "source", + "{ \"id\": \"a\", \"v\": \"v1_1\" }\n" + + "{ \"id\": \"a\", \"v\": \"v1_2\" }\n" + + "{ \"id\": \"c\", \"v\": \"v3\" }\n" + + "{ \"id\": \"d\", \"v\": \"v4_1\" }\n" + + "{ \"id\": \"d\", \"v\": \"v4_2\" }"); + + sql( + "MERGE INTO %s t USING source " + + "ON t.id == source.id " + + "WHEN NOT MATCHED THEN " + + " INSERT *", + tableName); + + ImmutableList expectedRows = + ImmutableList.of( + row("a", "v1"), // kept + row("b", "v2"), // kept + row("c", "v3"), // new + row("d", "v4_1"), // new + row("d", "v4_2") // new + ); + assertEquals( + "Output should match", expectedRows, sql("SELECT * FROM %s ORDER BY id", tableName)); } @Test public void testMergeInsertOnlyWithCondition() { createAndInitTable("id INTEGER, v INTEGER", "{ \"id\": 1, \"v\": 1 }"); - 
createOrReplaceView("source", - "{ \"id\": 1, \"v\": 11, \"is_new\": true }\n" + - "{ \"id\": 2, \"v\": 21, \"is_new\": true }\n" + - "{ \"id\": 2, \"v\": 22, \"is_new\": false }"); + createOrReplaceView( + "source", + "{ \"id\": 1, \"v\": 11, \"is_new\": true }\n" + + "{ \"id\": 2, \"v\": 21, \"is_new\": true }\n" + + "{ \"id\": 2, \"v\": 22, \"is_new\": false }"); // validate assignments are reordered to match the table attrs - sql("MERGE INTO %s t USING source s " + - "ON t.id == s.id " + - "WHEN NOT MATCHED AND is_new = TRUE THEN " + - " INSERT (v, id) VALUES (s.v + 100, s.id)", tableName); - - ImmutableList expectedRows = ImmutableList.of( - row(1, 1), // kept - row(2, 121) // new - ); - assertEquals("Output should match", expectedRows, sql("SELECT * FROM %s ORDER BY id", tableName)); + sql( + "MERGE INTO %s t USING source s " + + "ON t.id == s.id " + + "WHEN NOT MATCHED AND is_new = TRUE THEN " + + " INSERT (v, id) VALUES (s.v + 100, s.id)", + tableName); + + ImmutableList expectedRows = + ImmutableList.of( + row(1, 1), // kept + row(2, 121) // new + ); + assertEquals( + "Output should match", expectedRows, sql("SELECT * FROM %s ORDER BY id", tableName)); } @Test public void testMergeAlignsUpdateAndInsertActions() { createAndInitTable("id INT, a INT, b STRING", "{ \"id\": 1, \"a\": 2, \"b\": \"str\" }"); - createOrReplaceView("source", - "{ \"id\": 1, \"c1\": -2, \"c2\": \"new_str_1\" }\n" + - "{ \"id\": 2, \"c1\": -20, \"c2\": \"new_str_2\" }"); - - sql("MERGE INTO %s t USING source " + - "ON t.id == source.id " + - "WHEN MATCHED THEN " + - " UPDATE SET b = c2, a = c1, t.id = source.id " + - "WHEN NOT MATCHED THEN " + - " INSERT (b, a, id) VALUES (c2, c1, id)", tableName); - - assertEquals("Output should match", + createOrReplaceView( + "source", + "{ \"id\": 1, \"c1\": -2, \"c2\": \"new_str_1\" }\n" + + "{ \"id\": 2, \"c1\": -20, \"c2\": \"new_str_2\" }"); + + sql( + "MERGE INTO %s t USING source " + + "ON t.id == source.id " + + "WHEN MATCHED THEN " + + " UPDATE SET b = c2, a = c1, t.id = source.id " + + "WHEN NOT MATCHED THEN " + + " INSERT (b, a, id) VALUES (c2, c1, id)", + tableName); + + assertEquals( + "Output should match", ImmutableList.of(row(1, -2, "new_str_1"), row(2, -20, "new_str_2")), sql("SELECT * FROM %s ORDER BY id", tableName)); } @Test public void testMergeUpdatesNestedStructFields() { - createAndInitTable("id INT, s STRUCT,m:MAP>>", + createAndInitTable( + "id INT, s STRUCT,m:MAP>>", "{ \"id\": 1, \"s\": { \"c1\": 2, \"c2\": { \"a\": [1,2], \"m\": { \"a\": \"b\"} } } } }"); createOrReplaceView("source", "{ \"id\": 1, \"c1\": -2 }"); // update primitive, array, map columns inside a struct - sql("MERGE INTO %s t USING source " + - "ON t.id == source.id " + - "WHEN MATCHED THEN " + - " UPDATE SET t.s.c1 = source.c1, t.s.c2.a = array(-1, -2), t.s.c2.m = map('k', 'v')", tableName); + sql( + "MERGE INTO %s t USING source " + + "ON t.id == source.id " + + "WHEN MATCHED THEN " + + " UPDATE SET t.s.c1 = source.c1, t.s.c2.a = array(-1, -2), t.s.c2.m = map('k', 'v')", + tableName); - assertEquals("Output should match", + assertEquals( + "Output should match", ImmutableList.of(row(1, row(-2, row(ImmutableList.of(-1, -2), ImmutableMap.of("k", "v"))))), sql("SELECT * FROM %s ORDER BY id", tableName)); // set primitive, array, map columns to NULL (proper casts should be in place) - sql("MERGE INTO %s t USING source " + - "ON t.id == source.id " + - "WHEN MATCHED THEN " + - " UPDATE SET t.s.c1 = NULL, t.s.c2 = NULL", tableName); + sql( + "MERGE INTO %s t USING source " + + "ON 
t.id == source.id " + + "WHEN MATCHED THEN " + + " UPDATE SET t.s.c1 = NULL, t.s.c2 = NULL", + tableName); - assertEquals("Output should match", + assertEquals( + "Output should match", ImmutableList.of(row(1, row(null, null))), sql("SELECT * FROM %s ORDER BY id", tableName)); // update all fields in a struct - sql("MERGE INTO %s t USING source " + - "ON t.id == source.id " + - "WHEN MATCHED THEN " + - " UPDATE SET t.s = named_struct('c1', 100, 'c2', named_struct('a', array(1), 'm', map('x', 'y')))", tableName); + sql( + "MERGE INTO %s t USING source " + + "ON t.id == source.id " + + "WHEN MATCHED THEN " + + " UPDATE SET t.s = named_struct('c1', 100, 'c2', named_struct('a', array(1), 'm', map('x', 'y')))", + tableName); - assertEquals("Output should match", + assertEquals( + "Output should match", ImmutableList.of(row(1, row(100, row(ImmutableList.of(1), ImmutableMap.of("x", "y"))))), sql("SELECT * FROM %s ORDER BY id", tableName)); } @@ -1041,12 +1248,15 @@ public void testMergeWithInferredCasts() { createOrReplaceView("source", "{ \"id\": 1, \"c1\": -2}"); // -2 in source should be casted to "-2" in target - sql("MERGE INTO %s t USING source " + - "ON t.id == source.id " + - "WHEN MATCHED THEN " + - " UPDATE SET t.s = source.c1", tableName); + sql( + "MERGE INTO %s t USING source " + + "ON t.id == source.id " + + "WHEN MATCHED THEN " + + " UPDATE SET t.s = source.c1", + tableName); - assertEquals("Output should match", + assertEquals( + "Output should match", ImmutableList.of(row(1, "-2")), sql("SELECT * FROM %s ORDER BY id", tableName)); } @@ -1056,12 +1266,15 @@ public void testMergeModifiesNullStruct() { createAndInitTable("id INT, s STRUCT", "{ \"id\": 1, \"s\": null }"); createOrReplaceView("source", "{ \"id\": 1, \"n1\": -10 }"); - sql("MERGE INTO %s t USING source s " + - "ON t.id == s.id " + - "WHEN MATCHED THEN " + - " UPDATE SET t.s.n1 = s.n1", tableName); + sql( + "MERGE INTO %s t USING source s " + + "ON t.id == s.id " + + "WHEN MATCHED THEN " + + " UPDATE SET t.s.n1 = s.n1", + tableName); - assertEquals("Output should match", + assertEquals( + "Output should match", ImmutableList.of(row(1, row(-10, null))), sql("SELECT * FROM %s", tableName)); } @@ -1076,18 +1289,18 @@ public void testMergeRefreshesRelationCache() { spark.sql("CACHE TABLE tmp"); - assertEquals("View should have correct data", - ImmutableList.of(row("n1")), - sql("SELECT * FROM tmp")); + assertEquals( + "View should have correct data", ImmutableList.of(row("n1")), sql("SELECT * FROM tmp")); - sql("MERGE INTO %s t USING source s " + - "ON t.id == s.id " + - "WHEN MATCHED THEN " + - " UPDATE SET t.name = s.name", tableName); + sql( + "MERGE INTO %s t USING source s " + + "ON t.id == s.id " + + "WHEN MATCHED THEN " + + " UPDATE SET t.name = s.name", + tableName); - assertEquals("View should have correct data", - ImmutableList.of(row("n2")), - sql("SELECT * FROM tmp")); + assertEquals( + "View should have correct data", ImmutableList.of(row("n2")), sql("SELECT * FROM tmp")); spark.sql("UNCACHE TABLE tmp"); } @@ -1097,33 +1310,45 @@ public void testMergeWithNonExistingColumns() { createAndInitTable("id INT, c STRUCT>"); createOrReplaceView("source", "{ \"c1\": -100, \"c2\": -200 }"); - AssertHelpers.assertThrows("Should complain about the invalid top-level column", - AnalysisException.class, "cannot resolve '`t.invalid_col`'", + AssertHelpers.assertThrows( + "Should complain about the invalid top-level column", + AnalysisException.class, + "cannot resolve '`t.invalid_col`'", () -> { - sql("MERGE INTO %s t USING 
source s " + - "ON t.id == s.c1 " + - "WHEN MATCHED THEN " + - " UPDATE SET t.invalid_col = s.c2", tableName); + sql( + "MERGE INTO %s t USING source s " + + "ON t.id == s.c1 " + + "WHEN MATCHED THEN " + + " UPDATE SET t.invalid_col = s.c2", + tableName); }); - AssertHelpers.assertThrows("Should complain about the invalid nested column", - AnalysisException.class, "No such struct field invalid_col", + AssertHelpers.assertThrows( + "Should complain about the invalid nested column", + AnalysisException.class, + "No such struct field invalid_col", () -> { - sql("MERGE INTO %s t USING source s " + - "ON t.id == s.c1 " + - "WHEN MATCHED THEN " + - " UPDATE SET t.c.n2.invalid_col = s.c2", tableName); + sql( + "MERGE INTO %s t USING source s " + + "ON t.id == s.c1 " + + "WHEN MATCHED THEN " + + " UPDATE SET t.c.n2.invalid_col = s.c2", + tableName); }); - AssertHelpers.assertThrows("Should complain about the invalid top-level column", - AnalysisException.class, "cannot resolve '`invalid_col`'", + AssertHelpers.assertThrows( + "Should complain about the invalid top-level column", + AnalysisException.class, + "cannot resolve '`invalid_col`'", () -> { - sql("MERGE INTO %s t USING source s " + - "ON t.id == s.c1 " + - "WHEN MATCHED THEN " + - " UPDATE SET t.c.n2.dn1 = s.c2 " + - "WHEN NOT MATCHED THEN " + - " INSERT (id, invalid_col) VALUES (s.c1, null)", tableName); + sql( + "MERGE INTO %s t USING source s " + + "ON t.id == s.c1 " + + "WHEN MATCHED THEN " + + " UPDATE SET t.c.n2.dn1 = s.c2 " + + "WHEN NOT MATCHED THEN " + + " INSERT (id, invalid_col) VALUES (s.c1, null)", + tableName); }); } @@ -1132,35 +1357,47 @@ public void testMergeWithInvalidColumnsInInsert() { createAndInitTable("id INT, c STRUCT>"); createOrReplaceView("source", "{ \"c1\": -100, \"c2\": -200 }"); - AssertHelpers.assertThrows("Should complain about the nested column", - AnalysisException.class, "Nested fields are not supported inside INSERT clauses", + AssertHelpers.assertThrows( + "Should complain about the nested column", + AnalysisException.class, + "Nested fields are not supported inside INSERT clauses", () -> { - sql("MERGE INTO %s t USING source s " + - "ON t.id == s.c1 " + - "WHEN MATCHED THEN " + - " UPDATE SET t.c.n2.dn1 = s.c2 " + - "WHEN NOT MATCHED THEN " + - " INSERT (id, c.n2) VALUES (s.c1, null)", tableName); + sql( + "MERGE INTO %s t USING source s " + + "ON t.id == s.c1 " + + "WHEN MATCHED THEN " + + " UPDATE SET t.c.n2.dn1 = s.c2 " + + "WHEN NOT MATCHED THEN " + + " INSERT (id, c.n2) VALUES (s.c1, null)", + tableName); }); - AssertHelpers.assertThrows("Should complain about duplicate columns", - AnalysisException.class, "Duplicate column names inside INSERT clause", + AssertHelpers.assertThrows( + "Should complain about duplicate columns", + AnalysisException.class, + "Duplicate column names inside INSERT clause", () -> { - sql("MERGE INTO %s t USING source s " + - "ON t.id == s.c1 " + - "WHEN MATCHED THEN " + - " UPDATE SET t.c.n2.dn1 = s.c2 " + - "WHEN NOT MATCHED THEN " + - " INSERT (id, id) VALUES (s.c1, null)", tableName); + sql( + "MERGE INTO %s t USING source s " + + "ON t.id == s.c1 " + + "WHEN MATCHED THEN " + + " UPDATE SET t.c.n2.dn1 = s.c2 " + + "WHEN NOT MATCHED THEN " + + " INSERT (id, id) VALUES (s.c1, null)", + tableName); }); - AssertHelpers.assertThrows("Should complain about missing columns", - AnalysisException.class, "must provide values for all columns of the target table", + AssertHelpers.assertThrows( + "Should complain about missing columns", + AnalysisException.class, + "must provide 
values for all columns of the target table", () -> { - sql("MERGE INTO %s t USING source s " + - "ON t.id == s.c1 " + - "WHEN NOT MATCHED THEN " + - " INSERT (id) VALUES (s.c1)", tableName); + sql( + "MERGE INTO %s t USING source s " + + "ON t.id == s.c1 " + + "WHEN NOT MATCHED THEN " + + " INSERT (id) VALUES (s.c1)", + tableName); }); } @@ -1169,22 +1406,30 @@ public void testMergeWithInvalidUpdates() { createAndInitTable("id INT, a ARRAY>, m MAP"); createOrReplaceView("source", "{ \"c1\": -100, \"c2\": -200 }"); - AssertHelpers.assertThrows("Should complain about updating an array column", - AnalysisException.class, "Updating nested fields is only supported for structs", + AssertHelpers.assertThrows( + "Should complain about updating an array column", + AnalysisException.class, + "Updating nested fields is only supported for structs", () -> { - sql("MERGE INTO %s t USING source s " + - "ON t.id == s.c1 " + - "WHEN MATCHED THEN " + - " UPDATE SET t.a.c1 = s.c2", tableName); + sql( + "MERGE INTO %s t USING source s " + + "ON t.id == s.c1 " + + "WHEN MATCHED THEN " + + " UPDATE SET t.a.c1 = s.c2", + tableName); }); - AssertHelpers.assertThrows("Should complain about updating a map column", - AnalysisException.class, "Updating nested fields is only supported for structs", + AssertHelpers.assertThrows( + "Should complain about updating a map column", + AnalysisException.class, + "Updating nested fields is only supported for structs", () -> { - sql("MERGE INTO %s t USING source s " + - "ON t.id == s.c1 " + - "WHEN MATCHED THEN " + - " UPDATE SET t.m.key = 'new_key'", tableName); + sql( + "MERGE INTO %s t USING source s " + + "ON t.id == s.c1 " + + "WHEN MATCHED THEN " + + " UPDATE SET t.m.key = 'new_key'", + tableName); }); } @@ -1193,90 +1438,124 @@ public void testMergeWithConflictingUpdates() { createAndInitTable("id INT, c STRUCT>"); createOrReplaceView("source", "{ \"c1\": -100, \"c2\": -200 }"); - AssertHelpers.assertThrows("Should complain about conflicting updates to a top-level column", - AnalysisException.class, "Updates are in conflict", + AssertHelpers.assertThrows( + "Should complain about conflicting updates to a top-level column", + AnalysisException.class, + "Updates are in conflict", () -> { - sql("MERGE INTO %s t USING source s " + - "ON t.id == s.c1 " + - "WHEN MATCHED THEN " + - " UPDATE SET t.id = 1, t.c.n1 = 2, t.id = 2", tableName); + sql( + "MERGE INTO %s t USING source s " + + "ON t.id == s.c1 " + + "WHEN MATCHED THEN " + + " UPDATE SET t.id = 1, t.c.n1 = 2, t.id = 2", + tableName); }); - AssertHelpers.assertThrows("Should complain about conflicting updates to a nested column", - AnalysisException.class, "Updates are in conflict for these columns", + AssertHelpers.assertThrows( + "Should complain about conflicting updates to a nested column", + AnalysisException.class, + "Updates are in conflict for these columns", () -> { - sql("MERGE INTO %s t USING source s " + - "ON t.id == s.c1 " + - "WHEN MATCHED THEN " + - " UPDATE SET t.c.n1 = 1, t.id = 2, t.c.n1 = 2", tableName); + sql( + "MERGE INTO %s t USING source s " + + "ON t.id == s.c1 " + + "WHEN MATCHED THEN " + + " UPDATE SET t.c.n1 = 1, t.id = 2, t.c.n1 = 2", + tableName); }); - AssertHelpers.assertThrows("Should complain about conflicting updates to a nested column", - AnalysisException.class, "Updates are in conflict", + AssertHelpers.assertThrows( + "Should complain about conflicting updates to a nested column", + AnalysisException.class, + "Updates are in conflict", () -> { - sql("MERGE INTO %s t USING source s 
" + - "ON t.id == s.c1 " + - "WHEN MATCHED THEN " + - " UPDATE SET c.n1 = 1, c = named_struct('n1', 1, 'n2', named_struct('dn1', 1, 'dn2', 2))", tableName); + sql( + "MERGE INTO %s t USING source s " + + "ON t.id == s.c1 " + + "WHEN MATCHED THEN " + + " UPDATE SET c.n1 = 1, c = named_struct('n1', 1, 'n2', named_struct('dn1', 1, 'dn2', 2))", + tableName); }); } @Test public void testMergeWithInvalidAssignments() { - createAndInitTable("id INT NOT NULL, s STRUCT> NOT NULL"); + createAndInitTable( + "id INT NOT NULL, s STRUCT> NOT NULL"); createOrReplaceView( "source", "c1 INT, c2 STRUCT NOT NULL, c3 STRING NOT NULL, c4 STRUCT", "{ \"c1\": -100, \"c2\": { \"n1\" : 1 }, \"c3\" : 'str', \"c4\": { \"dn2\": 1, \"dn2\": 2 } }"); - for (String policy : new String[]{"ansi", "strict"}) { - withSQLConf(ImmutableMap.of("spark.sql.storeAssignmentPolicy", policy), () -> { - - AssertHelpers.assertThrows("Should complain about writing nulls to a top-level column", - AnalysisException.class, "Cannot write nullable values to non-null column", - () -> { - sql("MERGE INTO %s t USING source s " + - "ON t.id == s.c1 " + - "WHEN MATCHED THEN " + - " UPDATE SET t.id = NULL", tableName); - }); - - AssertHelpers.assertThrows("Should complain about writing nulls to a nested column", - AnalysisException.class, "Cannot write nullable values to non-null column", - () -> { - sql("MERGE INTO %s t USING source s " + - "ON t.id == s.c1 " + - "WHEN MATCHED THEN " + - " UPDATE SET t.s.n1 = NULL", tableName); - }); - - AssertHelpers.assertThrows("Should complain about writing missing fields in structs", - AnalysisException.class, "missing fields", - () -> { - sql("MERGE INTO %s t USING source s " + - "ON t.id == s.c1 " + - "WHEN MATCHED THEN " + - " UPDATE SET t.s = s.c2", tableName); - }); - - AssertHelpers.assertThrows("Should complain about writing invalid data types", - AnalysisException.class, "Cannot safely cast", - () -> { - sql("MERGE INTO %s t USING source s " + - "ON t.id == s.c1 " + - "WHEN MATCHED THEN " + - " UPDATE SET t.s.n1 = s.c3", tableName); - }); - - AssertHelpers.assertThrows("Should complain about writing incompatible structs", - AnalysisException.class, "field name does not match", - () -> { - sql("MERGE INTO %s t USING source s " + - "ON t.id == s.c1 " + - "WHEN MATCHED THEN " + - " UPDATE SET t.s.n2 = s.c4", tableName); - }); - }); + for (String policy : new String[] {"ansi", "strict"}) { + withSQLConf( + ImmutableMap.of("spark.sql.storeAssignmentPolicy", policy), + () -> { + AssertHelpers.assertThrows( + "Should complain about writing nulls to a top-level column", + AnalysisException.class, + "Cannot write nullable values to non-null column", + () -> { + sql( + "MERGE INTO %s t USING source s " + + "ON t.id == s.c1 " + + "WHEN MATCHED THEN " + + " UPDATE SET t.id = NULL", + tableName); + }); + + AssertHelpers.assertThrows( + "Should complain about writing nulls to a nested column", + AnalysisException.class, + "Cannot write nullable values to non-null column", + () -> { + sql( + "MERGE INTO %s t USING source s " + + "ON t.id == s.c1 " + + "WHEN MATCHED THEN " + + " UPDATE SET t.s.n1 = NULL", + tableName); + }); + + AssertHelpers.assertThrows( + "Should complain about writing missing fields in structs", + AnalysisException.class, + "missing fields", + () -> { + sql( + "MERGE INTO %s t USING source s " + + "ON t.id == s.c1 " + + "WHEN MATCHED THEN " + + " UPDATE SET t.s = s.c2", + tableName); + }); + + AssertHelpers.assertThrows( + "Should complain about writing invalid data types", + 
AnalysisException.class, + "Cannot safely cast", + () -> { + sql( + "MERGE INTO %s t USING source s " + + "ON t.id == s.c1 " + + "WHEN MATCHED THEN " + + " UPDATE SET t.s.n1 = s.c3", + tableName); + }); + + AssertHelpers.assertThrows( + "Should complain about writing incompatible structs", + AnalysisException.class, + "field name does not match", + () -> { + sql( + "MERGE INTO %s t USING source s " + + "ON t.id == s.c1 " + + "WHEN MATCHED THEN " + + " UPDATE SET t.s.n2 = s.c4", + tableName); + }); + }); } } @@ -1285,40 +1564,56 @@ public void testMergeWithNonDeterministicConditions() { createAndInitTable("id INT, c STRUCT>"); createOrReplaceView("source", "{ \"c1\": -100, \"c2\": -200 }"); - AssertHelpers.assertThrows("Should complain about non-deterministic search conditions", - AnalysisException.class, "nondeterministic expressions are only allowed in", + AssertHelpers.assertThrows( + "Should complain about non-deterministic search conditions", + AnalysisException.class, + "nondeterministic expressions are only allowed in", () -> { - sql("MERGE INTO %s t USING source s " + - "ON t.id == s.c1 AND rand() > t.id " + - "WHEN MATCHED THEN " + - " UPDATE SET t.c.n1 = -1", tableName); + sql( + "MERGE INTO %s t USING source s " + + "ON t.id == s.c1 AND rand() > t.id " + + "WHEN MATCHED THEN " + + " UPDATE SET t.c.n1 = -1", + tableName); }); - AssertHelpers.assertThrows("Should complain about non-deterministic update conditions", - AnalysisException.class, "nondeterministic expressions are only allowed in", + AssertHelpers.assertThrows( + "Should complain about non-deterministic update conditions", + AnalysisException.class, + "nondeterministic expressions are only allowed in", () -> { - sql("MERGE INTO %s t USING source s " + - "ON t.id == s.c1 " + - "WHEN MATCHED AND rand() > t.id THEN " + - " UPDATE SET t.c.n1 = -1", tableName); + sql( + "MERGE INTO %s t USING source s " + + "ON t.id == s.c1 " + + "WHEN MATCHED AND rand() > t.id THEN " + + " UPDATE SET t.c.n1 = -1", + tableName); }); - AssertHelpers.assertThrows("Should complain about non-deterministic delete conditions", - AnalysisException.class, "nondeterministic expressions are only allowed in", + AssertHelpers.assertThrows( + "Should complain about non-deterministic delete conditions", + AnalysisException.class, + "nondeterministic expressions are only allowed in", () -> { - sql("MERGE INTO %s t USING source s " + - "ON t.id == s.c1 " + - "WHEN MATCHED AND rand() > t.id THEN " + - " DELETE", tableName); + sql( + "MERGE INTO %s t USING source s " + + "ON t.id == s.c1 " + + "WHEN MATCHED AND rand() > t.id THEN " + + " DELETE", + tableName); }); - AssertHelpers.assertThrows("Should complain about non-deterministic insert conditions", - AnalysisException.class, "nondeterministic expressions are only allowed in", + AssertHelpers.assertThrows( + "Should complain about non-deterministic insert conditions", + AnalysisException.class, + "nondeterministic expressions are only allowed in", () -> { - sql("MERGE INTO %s t USING source s " + - "ON t.id == s.c1 " + - "WHEN NOT MATCHED AND rand() > c1 THEN " + - " INSERT (id, c) VALUES (1, null)", tableName); + sql( + "MERGE INTO %s t USING source s " + + "ON t.id == s.c1 " + + "WHEN NOT MATCHED AND rand() > c1 THEN " + + " INSERT (id, c) VALUES (1, null)", + tableName); }); } @@ -1327,40 +1622,56 @@ public void testMergeWithAggregateExpressions() { createAndInitTable("id INT, c STRUCT>"); createOrReplaceView("source", "{ \"c1\": -100, \"c2\": -200 }"); - AssertHelpers.assertThrows("Should complain about 
agg expressions in search conditions", - AnalysisException.class, "contains one or more unsupported", + AssertHelpers.assertThrows( + "Should complain about agg expressions in search conditions", + AnalysisException.class, + "contains one or more unsupported", () -> { - sql("MERGE INTO %s t USING source s " + - "ON t.id == s.c1 AND max(t.id) == 1 " + - "WHEN MATCHED THEN " + - " UPDATE SET t.c.n1 = -1", tableName); + sql( + "MERGE INTO %s t USING source s " + + "ON t.id == s.c1 AND max(t.id) == 1 " + + "WHEN MATCHED THEN " + + " UPDATE SET t.c.n1 = -1", + tableName); }); - AssertHelpers.assertThrows("Should complain about agg expressions in update conditions", - AnalysisException.class, "contains one or more unsupported", + AssertHelpers.assertThrows( + "Should complain about agg expressions in update conditions", + AnalysisException.class, + "contains one or more unsupported", () -> { - sql("MERGE INTO %s t USING source s " + - "ON t.id == s.c1 " + - "WHEN MATCHED AND sum(t.id) < 1 THEN " + - " UPDATE SET t.c.n1 = -1", tableName); + sql( + "MERGE INTO %s t USING source s " + + "ON t.id == s.c1 " + + "WHEN MATCHED AND sum(t.id) < 1 THEN " + + " UPDATE SET t.c.n1 = -1", + tableName); }); - AssertHelpers.assertThrows("Should complain about non-deterministic delete conditions", - AnalysisException.class, "contains one or more unsupported", + AssertHelpers.assertThrows( + "Should complain about non-deterministic delete conditions", + AnalysisException.class, + "contains one or more unsupported", () -> { - sql("MERGE INTO %s t USING source s " + - "ON t.id == s.c1 " + - "WHEN MATCHED AND sum(t.id) THEN " + - " DELETE", tableName); + sql( + "MERGE INTO %s t USING source s " + + "ON t.id == s.c1 " + + "WHEN MATCHED AND sum(t.id) THEN " + + " DELETE", + tableName); }); - AssertHelpers.assertThrows("Should complain about non-deterministic insert conditions", - AnalysisException.class, "contains one or more unsupported", + AssertHelpers.assertThrows( + "Should complain about non-deterministic insert conditions", + AnalysisException.class, + "contains one or more unsupported", () -> { - sql("MERGE INTO %s t USING source s " + - "ON t.id == s.c1 " + - "WHEN NOT MATCHED AND sum(c1) < 1 THEN " + - " INSERT (id, c) VALUES (1, null)", tableName); + sql( + "MERGE INTO %s t USING source s " + + "ON t.id == s.c1 " + + "WHEN NOT MATCHED AND sum(c1) < 1 THEN " + + " INSERT (id, c) VALUES (1, null)", + tableName); }); } @@ -1369,40 +1680,56 @@ public void testMergeWithSubqueriesInConditions() { createAndInitTable("id INT, c STRUCT>"); createOrReplaceView("source", "{ \"c1\": -100, \"c2\": -200 }"); - AssertHelpers.assertThrows("Should complain about subquery expressions", - AnalysisException.class, "Subqueries are not supported in conditions", + AssertHelpers.assertThrows( + "Should complain about subquery expressions", + AnalysisException.class, + "Subqueries are not supported in conditions", () -> { - sql("MERGE INTO %s t USING source s " + - "ON t.id == s.c1 AND t.id < (SELECT max(c2) FROM source) " + - "WHEN MATCHED THEN " + - " UPDATE SET t.c.n1 = s.c2", tableName); + sql( + "MERGE INTO %s t USING source s " + + "ON t.id == s.c1 AND t.id < (SELECT max(c2) FROM source) " + + "WHEN MATCHED THEN " + + " UPDATE SET t.c.n1 = s.c2", + tableName); }); - AssertHelpers.assertThrows("Should complain about subquery expressions", - AnalysisException.class, "Subqueries are not supported in conditions", + AssertHelpers.assertThrows( + "Should complain about subquery expressions", + AnalysisException.class, + "Subqueries 
are not supported in conditions", () -> { - sql("MERGE INTO %s t USING source s " + - "ON t.id == s.c1 " + - "WHEN MATCHED AND t.id < (SELECT max(c2) FROM source) THEN " + - " UPDATE SET t.c.n1 = s.c2", tableName); + sql( + "MERGE INTO %s t USING source s " + + "ON t.id == s.c1 " + + "WHEN MATCHED AND t.id < (SELECT max(c2) FROM source) THEN " + + " UPDATE SET t.c.n1 = s.c2", + tableName); }); - AssertHelpers.assertThrows("Should complain about subquery expressions", - AnalysisException.class, "Subqueries are not supported in conditions", + AssertHelpers.assertThrows( + "Should complain about subquery expressions", + AnalysisException.class, + "Subqueries are not supported in conditions", () -> { - sql("MERGE INTO %s t USING source s " + - "ON t.id == s.c1 " + - "WHEN MATCHED AND t.id NOT IN (SELECT c2 FROM source) THEN " + - " DELETE", tableName); + sql( + "MERGE INTO %s t USING source s " + + "ON t.id == s.c1 " + + "WHEN MATCHED AND t.id NOT IN (SELECT c2 FROM source) THEN " + + " DELETE", + tableName); }); - AssertHelpers.assertThrows("Should complain about subquery expressions", - AnalysisException.class, "Subqueries are not supported in conditions", + AssertHelpers.assertThrows( + "Should complain about subquery expressions", + AnalysisException.class, + "Subqueries are not supported in conditions", () -> { - sql("MERGE INTO %s t USING source s " + - "ON t.id == s.c1 " + - "WHEN NOT MATCHED AND s.c1 IN (SELECT c2 FROM source) THEN " + - " INSERT (id, c) VALUES (1, null)", tableName); + sql( + "MERGE INTO %s t USING source s " + + "ON t.id == s.c1 " + + "WHEN NOT MATCHED AND s.c1 IN (SELECT c2 FROM source) THEN " + + " INSERT (id, c) VALUES (1, null)", + tableName); }); } @@ -1411,13 +1738,17 @@ public void testMergeWithTargetColumnsInInsertCondtions() { createAndInitTable("id INT, c2 INT"); createOrReplaceView("source", "{ \"id\": 1, \"value\": 11 }"); - AssertHelpers.assertThrows("Should complain about the target column", - AnalysisException.class, "cannot resolve '`c2`'", + AssertHelpers.assertThrows( + "Should complain about the target column", + AnalysisException.class, + "cannot resolve '`c2`'", () -> { - sql("MERGE INTO %s t USING source s " + - "ON t.id == s.id " + - "WHEN NOT MATCHED AND c2 = 1 THEN " + - " INSERT (id, c2) VALUES (s.id, null)", tableName); + sql( + "MERGE INTO %s t USING source s " + + "ON t.id == s.id " + + "WHEN NOT MATCHED AND c2 = 1 THEN " + + " INSERT (id, c2) VALUES (s.id, null)", + tableName); }); } @@ -1426,19 +1757,22 @@ public void testMergeWithNonIcebergTargetTableNotSupported() { createOrReplaceView("target", "{ \"c1\": -100, \"c2\": -200 }"); createOrReplaceView("source", "{ \"c1\": -100, \"c2\": -200 }"); - AssertHelpers.assertThrows("Should complain non iceberg target table", - UnsupportedOperationException.class, "MERGE INTO TABLE is not supported temporarily.", + AssertHelpers.assertThrows( + "Should complain non iceberg target table", + UnsupportedOperationException.class, + "MERGE INTO TABLE is not supported temporarily.", () -> { - sql("MERGE INTO target t USING source s " + - "ON t.c1 == s.c1 " + - "WHEN MATCHED THEN " + - " UPDATE SET *"); + sql( + "MERGE INTO target t USING source s " + + "ON t.c1 == s.c1 " + + "WHEN MATCHED THEN " + + " UPDATE SET *"); }); } /** - * Tests a merge where both the source and target are evaluated to be partitioned by SingePartition at planning time - * but DynamicFileFilterExec will return an empty target. 
+ * Tests a merge where both the source and target are evaluated to be partitioned by + * SingePartition at planning time but DynamicFileFilterExec will return an empty target. */ @Test public void testMergeSinglePartitionPartitioning() { @@ -1448,19 +1782,14 @@ public void testMergeSinglePartitionPartitioning() { // Coalesce forces our source into a SinglePartition distribution spark.range(0, 5).coalesce(1).createOrReplaceTempView("source"); - sql("MERGE INTO %s t USING source s ON t.id = s.id " + - "WHEN MATCHED THEN UPDATE SET *" + - "WHEN NOT MATCHED THEN INSERT *", + sql( + "MERGE INTO %s t USING source s ON t.id = s.id " + + "WHEN MATCHED THEN UPDATE SET *" + + "WHEN NOT MATCHED THEN INSERT *", tableName); - ImmutableList expectedRows = ImmutableList.of( - row(-1), - row(0), - row(1), - row(2), - row(3), - row(4) - ); + ImmutableList expectedRows = + ImmutableList.of(row(-1), row(0), row(1), row(2), row(3), row(4)); List result = sql("SELECT * FROM %s ORDER BY id", tableName); assertEquals("Should correctly add the non-matching rows", expectedRows, result); @@ -1474,18 +1803,13 @@ public void testMergeEmptyTable() { // Coalesce forces our source into a SinglePartition distribution spark.range(0, 5).coalesce(1).createOrReplaceTempView("source"); - sql("MERGE INTO %s t USING source s ON t.id = s.id " + - "WHEN MATCHED THEN UPDATE SET *" + - "WHEN NOT MATCHED THEN INSERT *", + sql( + "MERGE INTO %s t USING source s ON t.id = s.id " + + "WHEN MATCHED THEN UPDATE SET *" + + "WHEN NOT MATCHED THEN INSERT *", tableName); - ImmutableList expectedRows = ImmutableList.of( - row(0), - row(1), - row(2), - row(3), - row(4) - ); + ImmutableList expectedRows = ImmutableList.of(row(0), row(1), row(2), row(3), row(4)); List result = sql("SELECT * FROM %s ORDER BY id", tableName); assertEquals("Should correctly add the non-matching rows", expectedRows, result); @@ -1497,16 +1821,20 @@ public void testFileFilterMetric() throws Exception { spark.sql(String.format("INSERT INTO %s VALUES (1, 'emp-id-one')", tableName)); spark.sql(String.format("INSERT INTO %s VALUES (6, 'emp-id-six')", tableName)); - createOrReplaceView("source", "id INT, dep STRING", - "{ \"id\": 6, \"dep\": \"emp-id-6\" }"); + createOrReplaceView("source", "id INT, dep STRING", "{ \"id\": 6, \"dep\": \"emp-id-6\" }"); Map expectedMetrics = Maps.newHashMap(); expectedMetrics.put("candidate files", "2"); expectedMetrics.put("matching files", "1"); - checkMetrics(() -> spark.sql(String.format( - "MERGE INTO %s AS t USING source AS s " + - "ON t.id == s.id " + - "WHEN MATCHED THEN UPDATE SET * ", tableName)), expectedMetrics); + checkMetrics( + () -> + spark.sql( + String.format( + "MERGE INTO %s AS t USING source AS s " + + "ON t.id == s.id " + + "WHEN MATCHED THEN UPDATE SET * ", + tableName)), + expectedMetrics); } } diff --git a/spark/v3.1/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestMigrateTableProcedure.java b/spark/v3.1/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestMigrateTableProcedure.java index d66e75add16f..f9c150a3b1dc 100644 --- a/spark/v3.1/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestMigrateTableProcedure.java +++ b/spark/v3.1/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestMigrateTableProcedure.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.spark.extensions; import java.io.IOException; @@ -35,12 +34,12 @@ public class TestMigrateTableProcedure extends SparkExtensionsTestBase { - public TestMigrateTableProcedure(String catalogName, String implementation, Map config) { + public TestMigrateTableProcedure( + String catalogName, String implementation, Map config) { super(catalogName, implementation, config); } - @Rule - public TemporaryFolder temp = new TemporaryFolder(); + @Rule public TemporaryFolder temp = new TemporaryFolder(); @After public void removeTables() { @@ -52,7 +51,9 @@ public void removeTables() { public void testMigrate() throws IOException { Assume.assumeTrue(catalogName.equals("spark_catalog")); String location = temp.newFolder().toString(); - sql("CREATE TABLE %s (id bigint NOT NULL, data string) USING parquet LOCATION '%s'", tableName, location); + sql( + "CREATE TABLE %s (id bigint NOT NULL, data string) USING parquet LOCATION '%s'", + tableName, location); sql("INSERT INTO TABLE %s VALUES (1, 'a')", tableName); Object result = scalarSql("CALL %s.system.migrate('%s')", catalogName, tableName); @@ -65,7 +66,8 @@ public void testMigrate() throws IOException { sql("INSERT INTO TABLE %s VALUES (1, 'a')", tableName); - assertEquals("Should have expected rows", + assertEquals( + "Should have expected rows", ImmutableList.of(row(1L, "a"), row(1L, "a")), sql("SELECT * FROM %s ORDER BY id", tableName)); @@ -76,10 +78,13 @@ public void testMigrate() throws IOException { public void testMigrateWithOptions() throws IOException { Assume.assumeTrue(catalogName.equals("spark_catalog")); String location = temp.newFolder().toString(); - sql("CREATE TABLE %s (id bigint NOT NULL, data string) USING parquet LOCATION '%s'", tableName, location); + sql( + "CREATE TABLE %s (id bigint NOT NULL, data string) USING parquet LOCATION '%s'", + tableName, location); sql("INSERT INTO TABLE %s VALUES (1, 'a')", tableName); - Object result = scalarSql("CALL %s.system.migrate('%s', map('foo', 'bar'))", catalogName, tableName); + Object result = + scalarSql("CALL %s.system.migrate('%s', map('foo', 'bar'))", catalogName, tableName); Assert.assertEquals("Should have added one file", 1L, result); @@ -93,7 +98,8 @@ public void testMigrateWithOptions() throws IOException { sql("INSERT INTO TABLE %s VALUES (1, 'a')", tableName); - assertEquals("Should have expected rows", + assertEquals( + "Should have expected rows", ImmutableList.of(row(1L, "a"), row(1L, "a")), sql("SELECT * FROM %s ORDER BY id", tableName)); @@ -105,10 +111,14 @@ public void testMigrateWithInvalidMetricsConfig() throws IOException { Assume.assumeTrue(catalogName.equals("spark_catalog")); String location = temp.newFolder().toString(); - sql("CREATE TABLE %s (id bigint NOT NULL, data string) USING parquet LOCATION '%s'", tableName, location); - - AssertHelpers.assertThrows("Should reject invalid metrics config", - ValidationException.class, "Invalid metrics config", + sql( + "CREATE TABLE %s (id bigint NOT NULL, data string) USING parquet LOCATION '%s'", + tableName, location); + + AssertHelpers.assertThrows( + "Should reject invalid metrics config", + ValidationException.class, + "Invalid metrics config", () -> { String props = "map('write.metadata.metrics.column.x', 'X')"; sql("CALL %s.system.migrate('%s', %s)", catalogName, tableName, props); @@ -120,13 +130,17 @@ public void testMigrateWithConflictingProps() throws IOException { Assume.assumeTrue(catalogName.equals("spark_catalog")); String location = temp.newFolder().toString(); - sql("CREATE 
TABLE %s (id bigint NOT NULL, data string) USING parquet LOCATION '%s'", tableName, location); + sql( + "CREATE TABLE %s (id bigint NOT NULL, data string) USING parquet LOCATION '%s'", + tableName, location); sql("INSERT INTO TABLE %s VALUES (1, 'a')", tableName); - Object result = scalarSql("CALL %s.system.migrate('%s', map('migrated', 'false'))", catalogName, tableName); + Object result = + scalarSql("CALL %s.system.migrate('%s', map('migrated', 'false'))", catalogName, tableName); Assert.assertEquals("Should have added one file", 1L, result); - assertEquals("Should have expected rows", + assertEquals( + "Should have expected rows", ImmutableList.of(row(1L, "a")), sql("SELECT * FROM %s", tableName)); @@ -136,16 +150,22 @@ public void testMigrateWithConflictingProps() throws IOException { @Test public void testInvalidMigrateCases() { - AssertHelpers.assertThrows("Should reject calls without all required args", - AnalysisException.class, "Missing required parameters", + AssertHelpers.assertThrows( + "Should reject calls without all required args", + AnalysisException.class, + "Missing required parameters", () -> sql("CALL %s.system.migrate()", catalogName)); - AssertHelpers.assertThrows("Should reject calls with invalid arg types", - AnalysisException.class, "Wrong arg type", + AssertHelpers.assertThrows( + "Should reject calls with invalid arg types", + AnalysisException.class, + "Wrong arg type", () -> sql("CALL %s.system.migrate(map('foo','bar'))", catalogName)); - AssertHelpers.assertThrows("Should reject calls with empty table identifier", - IllegalArgumentException.class, "Cannot handle an empty identifier", + AssertHelpers.assertThrows( + "Should reject calls with empty table identifier", + IllegalArgumentException.class, + "Cannot handle an empty identifier", () -> sql("CALL %s.system.migrate('')", catalogName)); } } diff --git a/spark/v3.1/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestPublishChangesProcedure.java b/spark/v3.1/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestPublishChangesProcedure.java index f8080818a1e3..2b74cd475fae 100644 --- a/spark/v3.1/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestPublishChangesProcedure.java +++ b/spark/v3.1/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestPublishChangesProcedure.java @@ -16,9 +16,10 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.spark.extensions; +import static org.apache.iceberg.TableProperties.WRITE_AUDIT_PUBLISH_ENABLED; + import java.util.List; import java.util.Map; import org.apache.iceberg.AssertHelpers; @@ -34,11 +35,10 @@ import org.junit.After; import org.junit.Test; -import static org.apache.iceberg.TableProperties.WRITE_AUDIT_PUBLISH_ENABLED; - public class TestPublishChangesProcedure extends SparkExtensionsTestBase { - public TestPublishChangesProcedure(String catalogName, String implementation, Map config) { + public TestPublishChangesProcedure( + String catalogName, String implementation, Map config) { super(catalogName, implementation, config); } @@ -57,26 +57,28 @@ public void testApplyWapChangesUsingPositionalArgs() { sql("INSERT INTO TABLE %s VALUES (1, 'a')", tableName); - assertEquals("Should not see rows from staged snapshot", + assertEquals( + "Should not see rows from staged snapshot", ImmutableList.of(), sql("SELECT * FROM %s", tableName)); Table table = validationCatalog.loadTable(tableIdent); Snapshot wapSnapshot = Iterables.getOnlyElement(table.snapshots()); - List output = sql( - "CALL %s.system.publish_changes('%s', '%s')", - catalogName, tableIdent, wapId); + List output = + sql("CALL %s.system.publish_changes('%s', '%s')", catalogName, tableIdent, wapId); table.refresh(); Snapshot currentSnapshot = table.currentSnapshot(); - assertEquals("Procedure output must match", + assertEquals( + "Procedure output must match", ImmutableList.of(row(wapSnapshot.snapshotId(), currentSnapshot.snapshotId())), output); - assertEquals("Apply of WAP changes must be successful", + assertEquals( + "Apply of WAP changes must be successful", ImmutableList.of(row(1L, "a")), sql("SELECT * FROM %s", tableName)); } @@ -91,26 +93,30 @@ public void testApplyWapChangesUsingNamedArgs() { sql("INSERT INTO TABLE %s VALUES (1, 'a')", tableName); - assertEquals("Should not see rows from staged snapshot", + assertEquals( + "Should not see rows from staged snapshot", ImmutableList.of(), sql("SELECT * FROM %s", tableName)); Table table = validationCatalog.loadTable(tableIdent); Snapshot wapSnapshot = Iterables.getOnlyElement(table.snapshots()); - List output = sql( - "CALL %s.system.publish_changes(wap_id => '%s', table => '%s')", - catalogName, wapId, tableIdent); + List output = + sql( + "CALL %s.system.publish_changes(wap_id => '%s', table => '%s')", + catalogName, wapId, tableIdent); table.refresh(); Snapshot currentSnapshot = table.currentSnapshot(); - assertEquals("Procedure output must match", + assertEquals( + "Procedure output must match", ImmutableList.of(row(wapSnapshot.snapshotId(), currentSnapshot.snapshotId())), output); - assertEquals("Apply of WAP changes must be successful", + assertEquals( + "Apply of WAP changes must be successful", ImmutableList.of(row(1L, "a")), sql("SELECT * FROM %s", tableName)); } @@ -132,14 +138,15 @@ public void testApplyWapChangesRefreshesRelationCache() { sql("INSERT INTO TABLE %s VALUES (1, 'a')", tableName); - assertEquals("Should not see rows from staged snapshot", + assertEquals( + "Should not see rows from staged snapshot", ImmutableList.of(), sql("SELECT * FROM %s", tableName)); - sql("CALL %s.system.publish_changes('%s', '%s')", - catalogName, tableIdent, wapId); + sql("CALL %s.system.publish_changes('%s', '%s')", catalogName, tableIdent, wapId); - assertEquals("Apply of WAP changes should be visible", + assertEquals( + "Apply of WAP changes should be visible", ImmutableList.of(row(1L, "a")), sql("SELECT * FROM tmp")); @@ -150,27 +157,37 
@@ public void testApplyWapChangesRefreshesRelationCache() { public void testApplyInvalidWapId() { sql("CREATE TABLE %s (id bigint NOT NULL, data string) USING iceberg", tableName); - AssertHelpers.assertThrows("Should reject invalid wap id", - ValidationException.class, "Cannot apply unknown WAP ID", + AssertHelpers.assertThrows( + "Should reject invalid wap id", + ValidationException.class, + "Cannot apply unknown WAP ID", () -> sql("CALL %s.system.publish_changes('%s', 'not_valid')", catalogName, tableIdent)); } @Test public void testInvalidApplyWapChangesCases() { - AssertHelpers.assertThrows("Should not allow mixed args", - AnalysisException.class, "Named and positional arguments cannot be mixed", + AssertHelpers.assertThrows( + "Should not allow mixed args", + AnalysisException.class, + "Named and positional arguments cannot be mixed", () -> sql("CALL %s.system.publish_changes('n', table => 't', 'not_valid')", catalogName)); - AssertHelpers.assertThrows("Should not resolve procedures in arbitrary namespaces", - NoSuchProcedureException.class, "not found", + AssertHelpers.assertThrows( + "Should not resolve procedures in arbitrary namespaces", + NoSuchProcedureException.class, + "not found", () -> sql("CALL %s.custom.publish_changes('n', 't', 'not_valid')", catalogName)); - AssertHelpers.assertThrows("Should reject calls without all required args", - AnalysisException.class, "Missing required parameters", + AssertHelpers.assertThrows( + "Should reject calls without all required args", + AnalysisException.class, + "Missing required parameters", () -> sql("CALL %s.system.publish_changes('t')", catalogName)); - AssertHelpers.assertThrows("Should reject calls with empty table identifier", - IllegalArgumentException.class, "Cannot handle an empty identifier", + AssertHelpers.assertThrows( + "Should reject calls with empty table identifier", + IllegalArgumentException.class, + "Cannot handle an empty identifier", () -> sql("CALL %s.system.publish_changes('', 'not_valid')", catalogName)); } } diff --git a/spark/v3.1/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestRemoveOrphanFilesProcedure.java b/spark/v3.1/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestRemoveOrphanFilesProcedure.java index 724e17e50a2c..fa43cf0e276c 100644 --- a/spark/v3.1/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestRemoveOrphanFilesProcedure.java +++ b/spark/v3.1/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestRemoveOrphanFilesProcedure.java @@ -16,9 +16,11 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.spark.extensions; +import static org.apache.iceberg.TableProperties.GC_ENABLED; +import static org.apache.iceberg.TableProperties.WRITE_AUDIT_PUBLISH_ENABLED; + import java.io.IOException; import java.sql.Timestamp; import java.time.Instant; @@ -36,15 +38,12 @@ import org.junit.Test; import org.junit.rules.TemporaryFolder; -import static org.apache.iceberg.TableProperties.GC_ENABLED; -import static org.apache.iceberg.TableProperties.WRITE_AUDIT_PUBLISH_ENABLED; - public class TestRemoveOrphanFilesProcedure extends SparkExtensionsTestBase { - @Rule - public TemporaryFolder temp = new TemporaryFolder(); + @Rule public TemporaryFolder temp = new TemporaryFolder(); - public TestRemoveOrphanFilesProcedure(String catalogName, String implementation, Map config) { + public TestRemoveOrphanFilesProcedure( + String catalogName, String implementation, Map config) { super(catalogName, implementation, config); } @@ -58,14 +57,11 @@ public void removeTable() { public void testRemoveOrphanFilesInEmptyTable() { sql("CREATE TABLE %s (id bigint NOT NULL, data string) USING iceberg", tableName); - List output = sql( - "CALL %s.system.remove_orphan_files('%s')", - catalogName, tableIdent); + List output = + sql("CALL %s.system.remove_orphan_files('%s')", catalogName, tableIdent); assertEquals("Should be no orphan files", ImmutableList.of(), output); - assertEquals("Should have no rows", - ImmutableList.of(), - sql("SELECT * FROM %s", tableName)); + assertEquals("Should have no rows", ImmutableList.of(), sql("SELECT * FROM %s", tableName)); } @Test @@ -75,7 +71,8 @@ public void testRemoveOrphanFilesInDataFolder() throws IOException { } else { // give a fresh location to Hive tables as Spark will not clean up the table location // correctly while dropping tables through spark_catalog - sql("CREATE TABLE %s (id bigint NOT NULL, data string) USING iceberg LOCATION '%s'", + sql( + "CREATE TABLE %s (id bigint NOT NULL, data string) USING iceberg LOCATION '%s'", tableName, temp.newFolder()); } @@ -97,31 +94,35 @@ public void testRemoveOrphanFilesInDataFolder() throws IOException { Timestamp currentTimestamp = Timestamp.from(Instant.ofEpochMilli(System.currentTimeMillis())); // check for orphans in the metadata folder - List output1 = sql( - "CALL %s.system.remove_orphan_files(" + - "table => '%s'," + - "older_than => TIMESTAMP '%s'," + - "location => '%s')", - catalogName, tableIdent, currentTimestamp, metadataLocation); + List output1 = + sql( + "CALL %s.system.remove_orphan_files(" + + "table => '%s'," + + "older_than => TIMESTAMP '%s'," + + "location => '%s')", + catalogName, tableIdent, currentTimestamp, metadataLocation); assertEquals("Should be no orphan files in the metadata folder", ImmutableList.of(), output1); // check for orphans in the table location - List output2 = sql( - "CALL %s.system.remove_orphan_files(" + - "table => '%s'," + - "older_than => TIMESTAMP '%s')", - catalogName, tableIdent, currentTimestamp); + List output2 = + sql( + "CALL %s.system.remove_orphan_files(" + + "table => '%s'," + + "older_than => TIMESTAMP '%s')", + catalogName, tableIdent, currentTimestamp); Assert.assertEquals("Should be orphan files in the data folder", 1, output2.size()); // the previous call should have deleted all orphan files - List output3 = sql( - "CALL %s.system.remove_orphan_files(" + - "table => '%s'," + - "older_than => TIMESTAMP '%s')", - catalogName, tableIdent, currentTimestamp); + List output3 = + sql( + "CALL %s.system.remove_orphan_files(" + + "table => '%s'," + + 
"older_than => TIMESTAMP '%s')", + catalogName, tableIdent, currentTimestamp); Assert.assertEquals("Should be no more orphan files in the data folder", 0, output3.size()); - assertEquals("Should have expected rows", + assertEquals( + "Should have expected rows", ImmutableList.of(row(1L, "a"), row(2L, "b")), sql("SELECT * FROM %s ORDER BY id", tableName)); } @@ -133,7 +134,8 @@ public void testRemoveOrphanFilesDryRun() throws IOException { } else { // give a fresh location to Hive tables as Spark will not clean up the table location // correctly while dropping tables through spark_catalog - sql("CREATE TABLE %s (id bigint NOT NULL, data string) USING iceberg LOCATION '%s'", + sql( + "CREATE TABLE %s (id bigint NOT NULL, data string) USING iceberg LOCATION '%s'", tableName, temp.newFolder()); } @@ -152,31 +154,35 @@ public void testRemoveOrphanFilesDryRun() throws IOException { Timestamp currentTimestamp = Timestamp.from(Instant.ofEpochMilli(System.currentTimeMillis())); // check for orphans without deleting - List output1 = sql( - "CALL %s.system.remove_orphan_files(" + - "table => '%s'," + - "older_than => TIMESTAMP '%s'," + - "dry_run => true)", - catalogName, tableIdent, currentTimestamp); + List output1 = + sql( + "CALL %s.system.remove_orphan_files(" + + "table => '%s'," + + "older_than => TIMESTAMP '%s'," + + "dry_run => true)", + catalogName, tableIdent, currentTimestamp); Assert.assertEquals("Should be one orphan files", 1, output1.size()); // actually delete orphans - List output2 = sql( - "CALL %s.system.remove_orphan_files(" + - "table => '%s'," + - "older_than => TIMESTAMP '%s')", - catalogName, tableIdent, currentTimestamp); + List output2 = + sql( + "CALL %s.system.remove_orphan_files(" + + "table => '%s'," + + "older_than => TIMESTAMP '%s')", + catalogName, tableIdent, currentTimestamp); Assert.assertEquals("Should be one orphan files", 1, output2.size()); // the previous call should have deleted all orphan files - List output3 = sql( - "CALL %s.system.remove_orphan_files(" + - "table => '%s'," + - "older_than => TIMESTAMP '%s')", - catalogName, tableIdent, currentTimestamp); + List output3 = + sql( + "CALL %s.system.remove_orphan_files(" + + "table => '%s'," + + "older_than => TIMESTAMP '%s')", + catalogName, tableIdent, currentTimestamp); Assert.assertEquals("Should be no more orphan files", 0, output3.size()); - assertEquals("Should have expected rows", + assertEquals( + "Should have expected rows", ImmutableList.of(row(1L, "a"), row(2L, "b")), sql("SELECT * FROM %s ORDER BY id", tableName)); } @@ -187,8 +193,10 @@ public void testRemoveOrphanFilesGCDisabled() { sql("ALTER TABLE %s SET TBLPROPERTIES ('%s' 'false')", tableName, GC_ENABLED); - AssertHelpers.assertThrows("Should reject call", - ValidationException.class, "Cannot remove orphan files: GC is disabled", + AssertHelpers.assertThrows( + "Should reject call", + ValidationException.class, + "Cannot remove orphan files: GC is disabled", () -> sql("CALL %s.system.remove_orphan_files('%s')", catalogName, tableIdent)); } @@ -201,35 +209,46 @@ public void testRemoveOrphanFilesWap() { sql("INSERT INTO TABLE %s VALUES (1, 'a')", tableName); - assertEquals("Should not see rows from staged snapshot", + assertEquals( + "Should not see rows from staged snapshot", ImmutableList.of(), sql("SELECT * FROM %s", tableName)); - List output = sql( - "CALL %s.system.remove_orphan_files('%s')", catalogName, tableIdent); + List output = + sql("CALL %s.system.remove_orphan_files('%s')", catalogName, tableIdent); assertEquals("Should be no 
orphan files", ImmutableList.of(), output); } @Test public void testInvalidRemoveOrphanFilesCases() { - AssertHelpers.assertThrows("Should not allow mixed args", - AnalysisException.class, "Named and positional arguments cannot be mixed", + AssertHelpers.assertThrows( + "Should not allow mixed args", + AnalysisException.class, + "Named and positional arguments cannot be mixed", () -> sql("CALL %s.system.remove_orphan_files('n', table => 't')", catalogName)); - AssertHelpers.assertThrows("Should not resolve procedures in arbitrary namespaces", - NoSuchProcedureException.class, "not found", + AssertHelpers.assertThrows( + "Should not resolve procedures in arbitrary namespaces", + NoSuchProcedureException.class, + "not found", () -> sql("CALL %s.custom.remove_orphan_files('n', 't')", catalogName)); - AssertHelpers.assertThrows("Should reject calls without all required args", - AnalysisException.class, "Missing required parameters", + AssertHelpers.assertThrows( + "Should reject calls without all required args", + AnalysisException.class, + "Missing required parameters", () -> sql("CALL %s.system.remove_orphan_files()", catalogName)); - AssertHelpers.assertThrows("Should reject calls with invalid arg types", - AnalysisException.class, "Wrong arg type", + AssertHelpers.assertThrows( + "Should reject calls with invalid arg types", + AnalysisException.class, + "Wrong arg type", () -> sql("CALL %s.system.remove_orphan_files('n', 2.2)", catalogName)); - AssertHelpers.assertThrows("Should reject calls with empty table identifier", - IllegalArgumentException.class, "Cannot handle an empty identifier", + AssertHelpers.assertThrows( + "Should reject calls with empty table identifier", + IllegalArgumentException.class, + "Cannot handle an empty identifier", () -> sql("CALL %s.system.remove_orphan_files('')", catalogName)); } @@ -240,7 +259,8 @@ public void testConcurrentRemoveOrphanFiles() throws IOException { } else { // give a fresh location to Hive tables as Spark will not clean up the table location // correctly while dropping tables through spark_catalog - sql("CREATE TABLE %s (id bigint NOT NULL, data string) USING iceberg LOCATION '%s'", + sql( + "CREATE TABLE %s (id bigint NOT NULL, data string) USING iceberg LOCATION '%s'", tableName, temp.newFolder()); } @@ -265,21 +285,23 @@ public void testConcurrentRemoveOrphanFiles() throws IOException { Timestamp currentTimestamp = Timestamp.from(Instant.ofEpochMilli(System.currentTimeMillis())); // check for orphans in the table location - List output = sql( - "CALL %s.system.remove_orphan_files(" + - "table => '%s'," + - "max_concurrent_deletes => %s," + - "older_than => TIMESTAMP '%s')", - catalogName, tableIdent, 4, currentTimestamp); + List output = + sql( + "CALL %s.system.remove_orphan_files(" + + "table => '%s'," + + "max_concurrent_deletes => %s," + + "older_than => TIMESTAMP '%s')", + catalogName, tableIdent, 4, currentTimestamp); Assert.assertEquals("Should be orphan files in the data folder", 4, output.size()); // the previous call should have deleted all orphan files - List output3 = sql( - "CALL %s.system.remove_orphan_files(" + - "table => '%s'," + - "max_concurrent_deletes => %s," + - "older_than => TIMESTAMP '%s')", - catalogName, tableIdent, 4, currentTimestamp); + List output3 = + sql( + "CALL %s.system.remove_orphan_files(" + + "table => '%s'," + + "max_concurrent_deletes => %s," + + "older_than => TIMESTAMP '%s')", + catalogName, tableIdent, 4, currentTimestamp); Assert.assertEquals("Should be no more orphan files in the data folder", 
0, output3.size()); assertEquals( @@ -292,15 +314,22 @@ public void testConcurrentRemoveOrphanFiles() throws IOException { public void testConcurrentRemoveOrphanFilesWithInvalidInput() { sql("CREATE TABLE %s (id bigint NOT NULL, data string) USING iceberg", tableName); - AssertHelpers.assertThrows("Should throw an error when max_concurrent_deletes = 0", - IllegalArgumentException.class, "max_concurrent_deletes should have value > 0", - () -> sql("CALL %s.system.remove_orphan_files(table => '%s', max_concurrent_deletes => %s)", - catalogName, tableIdent, 0)); - - AssertHelpers.assertThrows("Should throw an error when max_concurrent_deletes < 0 ", - IllegalArgumentException.class, "max_concurrent_deletes should have value > 0", - () -> sql( - "CALL %s.system.remove_orphan_files(table => '%s', max_concurrent_deletes => %s)", - catalogName, tableIdent, -1)); + AssertHelpers.assertThrows( + "Should throw an error when max_concurrent_deletes = 0", + IllegalArgumentException.class, + "max_concurrent_deletes should have value > 0", + () -> + sql( + "CALL %s.system.remove_orphan_files(table => '%s', max_concurrent_deletes => %s)", + catalogName, tableIdent, 0)); + + AssertHelpers.assertThrows( + "Should throw an error when max_concurrent_deletes < 0 ", + IllegalArgumentException.class, + "max_concurrent_deletes should have value > 0", + () -> + sql( + "CALL %s.system.remove_orphan_files(table => '%s', max_concurrent_deletes => %s)", + catalogName, tableIdent, -1)); } } diff --git a/spark/v3.1/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestRewriteDataFilesProcedure.java b/spark/v3.1/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestRewriteDataFilesProcedure.java index a6fa5e45eb7e..5e9ace36791f 100644 --- a/spark/v3.1/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestRewriteDataFilesProcedure.java +++ b/spark/v3.1/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestRewriteDataFilesProcedure.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.spark.extensions; import java.util.List; @@ -34,10 +33,10 @@ import org.junit.After; import org.junit.Test; - public class TestRewriteDataFilesProcedure extends SparkExtensionsTestBase { - public TestRewriteDataFilesProcedure(String catalogName, String implementation, Map config) { + public TestRewriteDataFilesProcedure( + String catalogName, String implementation, Map config) { super(catalogName, implementation, config); } @@ -49,11 +48,8 @@ public void removeTable() { @Test public void testRewriteDataFilesInEmptyTable() { createTable(); - List output = sql( - "CALL %s.system.rewrite_data_files('%s')", catalogName, tableIdent); - assertEquals("Procedure output must match", - ImmutableList.of(row(0, 0)), - output); + List output = sql("CALL %s.system.rewrite_data_files('%s')", catalogName, tableIdent); + assertEquals("Procedure output must match", ImmutableList.of(row(0, 0)), output); } @Test @@ -63,10 +59,11 @@ public void testRewriteDataFilesOnPartitionTable() { insertData(10); List expectedRecords = currentData(); - List output = sql( - "CALL %s.system.rewrite_data_files(table => '%s')", catalogName, tableIdent); + List output = + sql("CALL %s.system.rewrite_data_files(table => '%s')", catalogName, tableIdent); - assertEquals("Action should rewrite 10 data files and add 2 data files (one per partition) ", + assertEquals( + "Action should rewrite 10 data files and add 2 data files (one per partition) ", ImmutableList.of(row(10, 2)), output); @@ -81,10 +78,11 @@ public void testRewriteDataFilesOnNonPartitionTable() { insertData(10); List expectedRecords = currentData(); - List output = sql( - "CALL %s.system.rewrite_data_files(table => '%s')", catalogName, tableIdent); + List output = + sql("CALL %s.system.rewrite_data_files(table => '%s')", catalogName, tableIdent); - assertEquals("Action should rewrite 10 data files and add 1 data files", + assertEquals( + "Action should rewrite 10 data files and add 1 data files", ImmutableList.of(row(10, 1)), output); @@ -100,11 +98,13 @@ public void testRewriteDataFilesWithOptions() { List expectedRecords = currentData(); // set the min-input-files = 12, instead of default 5 to skip compacting the files. 
- List output = sql( - "CALL %s.system.rewrite_data_files(table => '%s', options => map('min-input-files','12'))", - catalogName, tableIdent); + List output = + sql( + "CALL %s.system.rewrite_data_files(table => '%s', options => map('min-input-files','12'))", + catalogName, tableIdent); - assertEquals("Action should rewrite 0 data files and add 0 data files", + assertEquals( + "Action should rewrite 0 data files and add 0 data files", ImmutableList.of(row(0, 0)), output); @@ -120,12 +120,14 @@ public void testRewriteDataFilesWithSortStrategy() { List expectedRecords = currentData(); // set sort_order = c1 DESC LAST - List output = sql( - "CALL %s.system.rewrite_data_files(table => '%s', " + - "strategy => 'sort', sort_order => 'c1 DESC NULLS LAST')", - catalogName, tableIdent); - - assertEquals("Action should rewrite 10 data files and add 1 data files", + List output = + sql( + "CALL %s.system.rewrite_data_files(table => '%s', " + + "strategy => 'sort', sort_order => 'c1 DESC NULLS LAST')", + catalogName, tableIdent); + + assertEquals( + "Action should rewrite 10 data files and add 1 data files", ImmutableList.of(row(10, 1)), output); @@ -141,11 +143,14 @@ public void testRewriteDataFilesWithFilter() { List expectedRecords = currentData(); // select only 5 files for compaction (files that may have c1 = 1) - List output = sql( - "CALL %s.system.rewrite_data_files(table => '%s'," + - " where => 'c1 = 1 and c2 is not null')", catalogName, tableIdent); - - assertEquals("Action should rewrite 5 data files (containing c1 = 1) and add 1 data files", + List output = + sql( + "CALL %s.system.rewrite_data_files(table => '%s'," + + " where => 'c1 = 1 and c2 is not null')", + catalogName, tableIdent); + + assertEquals( + "Action should rewrite 5 data files (containing c1 = 1) and add 1 data files", ImmutableList.of(row(5, 1)), output); @@ -161,12 +166,14 @@ public void testRewriteDataFilesWithFilterOnPartitionTable() { List expectedRecords = currentData(); // select only 5 files for compaction (files in the partition c2 = 'bar') - List output = sql( - "CALL %s.system.rewrite_data_files(table => '%s'," + - " where => 'c2 = \"bar\"')", catalogName, tableIdent); - - assertEquals("Action should rewrite 5 data files from single matching partition" + - "(containing c2 = bar) and add 1 data files", + List output = + sql( + "CALL %s.system.rewrite_data_files(table => '%s'," + " where => 'c2 = \"bar\"')", + catalogName, tableIdent); + + assertEquals( + "Action should rewrite 5 data files from single matching partition" + + "(containing c2 = bar) and add 1 data files", ImmutableList.of(row(5, 1)), output); @@ -182,12 +189,14 @@ public void testRewriteDataFilesWithInFilterOnPartitionTable() { List expectedRecords = currentData(); // select only 5 files for compaction (files in the partition c2 in ('bar')) - List output = sql( - "CALL %s.system.rewrite_data_files(table => '%s'," + - " where => 'c2 in (\"bar\")')", catalogName, tableIdent); - - assertEquals("Action should rewrite 5 data files from single matching partition" + - "(containing c2 = bar) and add 1 data files", + List output = + sql( + "CALL %s.system.rewrite_data_files(table => '%s'," + " where => 'c2 in (\"bar\")')", + catalogName, tableIdent); + + assertEquals( + "Action should rewrite 5 data files from single matching partition" + + "(containing c2 = bar) and add 1 data files", ImmutableList.of(row(5, 1)), output); @@ -205,43 +214,56 @@ public void testRewriteDataFilesWithAllPossibleFilters() { // So that parsing can be tested on a same dataset 
without actually compacting the files. // EqualTo - sql("CALL %s.system.rewrite_data_files(table => '%s'," + - " where => 'c1 = 3')", catalogName, tableIdent); + sql( + "CALL %s.system.rewrite_data_files(table => '%s'," + " where => 'c1 = 3')", + catalogName, tableIdent); // GreaterThan - sql("CALL %s.system.rewrite_data_files(table => '%s'," + - " where => 'c1 > 3')", catalogName, tableIdent); + sql( + "CALL %s.system.rewrite_data_files(table => '%s'," + " where => 'c1 > 3')", + catalogName, tableIdent); // GreaterThanOrEqual - sql("CALL %s.system.rewrite_data_files(table => '%s'," + - " where => 'c1 >= 3')", catalogName, tableIdent); + sql( + "CALL %s.system.rewrite_data_files(table => '%s'," + " where => 'c1 >= 3')", + catalogName, tableIdent); // LessThan - sql("CALL %s.system.rewrite_data_files(table => '%s'," + - " where => 'c1 < 0')", catalogName, tableIdent); + sql( + "CALL %s.system.rewrite_data_files(table => '%s'," + " where => 'c1 < 0')", + catalogName, tableIdent); // LessThanOrEqual - sql("CALL %s.system.rewrite_data_files(table => '%s'," + - " where => 'c1 <= 0')", catalogName, tableIdent); + sql( + "CALL %s.system.rewrite_data_files(table => '%s'," + " where => 'c1 <= 0')", + catalogName, tableIdent); // In - sql("CALL %s.system.rewrite_data_files(table => '%s'," + - " where => 'c1 in (3,4,5)')", catalogName, tableIdent); + sql( + "CALL %s.system.rewrite_data_files(table => '%s'," + " where => 'c1 in (3,4,5)')", + catalogName, tableIdent); // IsNull - sql("CALL %s.system.rewrite_data_files(table => '%s'," + - " where => 'c1 is null')", catalogName, tableIdent); + sql( + "CALL %s.system.rewrite_data_files(table => '%s'," + " where => 'c1 is null')", + catalogName, tableIdent); // IsNotNull - sql("CALL %s.system.rewrite_data_files(table => '%s'," + - " where => 'c3 is not null')", catalogName, tableIdent); + sql( + "CALL %s.system.rewrite_data_files(table => '%s'," + " where => 'c3 is not null')", + catalogName, tableIdent); // And - sql("CALL %s.system.rewrite_data_files(table => '%s'," + - " where => 'c1 = 3 and c2 = \"bar\"')", catalogName, tableIdent); + sql( + "CALL %s.system.rewrite_data_files(table => '%s'," + " where => 'c1 = 3 and c2 = \"bar\"')", + catalogName, tableIdent); // Or - sql("CALL %s.system.rewrite_data_files(table => '%s'," + - " where => 'c1 = 3 or c1 = 5')", catalogName, tableIdent); + sql( + "CALL %s.system.rewrite_data_files(table => '%s'," + " where => 'c1 = 3 or c1 = 5')", + catalogName, tableIdent); // Not - sql("CALL %s.system.rewrite_data_files(table => '%s'," + - " where => 'c1 not in (1,2)')", catalogName, tableIdent); + sql( + "CALL %s.system.rewrite_data_files(table => '%s'," + " where => 'c1 not in (1,2)')", + catalogName, tableIdent); // StringStartsWith - sql("CALL %s.system.rewrite_data_files(table => '%s'," + - " where => 'c2 like \"%s\"')", catalogName, tableIdent, "car%"); + sql( + "CALL %s.system.rewrite_data_files(table => '%s'," + " where => 'c2 like \"%s\"')", + catalogName, tableIdent, "car%"); - // TODO: Enable when org.apache.iceberg.spark.SparkFilters have implementations for StringEndsWith & StringContains + // TODO: Enable when org.apache.iceberg.spark.SparkFilters have implementations for + // StringEndsWith & StringContains // StringEndsWith // sql("CALL %s.system.rewrite_data_files(table => '%s'," + // " where => 'c2 like \"%s\"')", catalogName, tableIdent, "%car"); @@ -257,63 +279,102 @@ public void testRewriteDataFilesWithInvalidInputs() { insertData(2); // Test for invalid strategy - AssertHelpers.assertThrows("Should 
reject calls with unsupported strategy error message", - IllegalArgumentException.class, "unsupported strategy: temp. Only binpack,sort is supported", - () -> sql("CALL %s.system.rewrite_data_files(table => '%s', options => map('min-input-files','2'), " + - "strategy => 'temp')", catalogName, tableIdent)); + AssertHelpers.assertThrows( + "Should reject calls with unsupported strategy error message", + IllegalArgumentException.class, + "unsupported strategy: temp. Only binpack,sort is supported", + () -> + sql( + "CALL %s.system.rewrite_data_files(table => '%s', options => map('min-input-files','2'), " + + "strategy => 'temp')", + catalogName, tableIdent)); // Test for sort_order with binpack strategy - AssertHelpers.assertThrows("Should reject calls with error message", - IllegalArgumentException.class, "Cannot set strategy to sort, it has already been set", - () -> sql("CALL %s.system.rewrite_data_files(table => '%s', strategy => 'binpack', " + - "sort_order => 'c1 ASC NULLS FIRST')", catalogName, tableIdent)); + AssertHelpers.assertThrows( + "Should reject calls with error message", + IllegalArgumentException.class, + "Cannot set strategy to sort, it has already been set", + () -> + sql( + "CALL %s.system.rewrite_data_files(table => '%s', strategy => 'binpack', " + + "sort_order => 'c1 ASC NULLS FIRST')", + catalogName, tableIdent)); // Test for sort_order with invalid null order - AssertHelpers.assertThrows("Should reject calls with error message", - IllegalArgumentException.class, "Unable to parse sortOrder:", - () -> sql("CALL %s.system.rewrite_data_files(table => '%s', strategy => 'sort', " + - "sort_order => 'c1 ASC none')", catalogName, tableIdent)); + AssertHelpers.assertThrows( + "Should reject calls with error message", + IllegalArgumentException.class, + "Unable to parse sortOrder:", + () -> + sql( + "CALL %s.system.rewrite_data_files(table => '%s', strategy => 'sort', " + + "sort_order => 'c1 ASC none')", + catalogName, tableIdent)); // Test for sort_order with invalid sort direction - AssertHelpers.assertThrows("Should reject calls with error message", - IllegalArgumentException.class, "Unable to parse sortOrder:", - () -> sql("CALL %s.system.rewrite_data_files(table => '%s', strategy => 'sort', " + - "sort_order => 'c1 none NULLS FIRST')", catalogName, tableIdent)); + AssertHelpers.assertThrows( + "Should reject calls with error message", + IllegalArgumentException.class, + "Unable to parse sortOrder:", + () -> + sql( + "CALL %s.system.rewrite_data_files(table => '%s', strategy => 'sort', " + + "sort_order => 'c1 none NULLS FIRST')", + catalogName, tableIdent)); // Test for sort_order with invalid column name - AssertHelpers.assertThrows("Should reject calls with error message", - ValidationException.class, "Cannot find field 'col1' in struct:" + - " struct<1: c1: optional int, 2: c2: optional string, 3: c3: optional string>", - () -> sql("CALL %s.system.rewrite_data_files(table => '%s', strategy => 'sort', " + - "sort_order => 'col1 DESC NULLS FIRST')", catalogName, tableIdent)); + AssertHelpers.assertThrows( + "Should reject calls with error message", + ValidationException.class, + "Cannot find field 'col1' in struct:" + + " struct<1: c1: optional int, 2: c2: optional string, 3: c3: optional string>", + () -> + sql( + "CALL %s.system.rewrite_data_files(table => '%s', strategy => 'sort', " + + "sort_order => 'col1 DESC NULLS FIRST')", + catalogName, tableIdent)); // Test for sort_order with invalid filter column col1 - AssertHelpers.assertThrows("Should reject calls with 
error message", - IllegalArgumentException.class, "Cannot parse predicates in where option: col1 = 3", - () -> sql("CALL %s.system.rewrite_data_files(table => '%s', " + - "where => 'col1 = 3')", catalogName, tableIdent)); + AssertHelpers.assertThrows( + "Should reject calls with error message", + IllegalArgumentException.class, + "Cannot parse predicates in where option: col1 = 3", + () -> + sql( + "CALL %s.system.rewrite_data_files(table => '%s', " + "where => 'col1 = 3')", + catalogName, tableIdent)); } @Test public void testInvalidCasesForRewriteDataFiles() { - AssertHelpers.assertThrows("Should not allow mixed args", - AnalysisException.class, "Named and positional arguments cannot be mixed", + AssertHelpers.assertThrows( + "Should not allow mixed args", + AnalysisException.class, + "Named and positional arguments cannot be mixed", () -> sql("CALL %s.system.rewrite_data_files('n', table => 't')", catalogName)); - AssertHelpers.assertThrows("Should not resolve procedures in arbitrary namespaces", - NoSuchProcedureException.class, "not found", + AssertHelpers.assertThrows( + "Should not resolve procedures in arbitrary namespaces", + NoSuchProcedureException.class, + "not found", () -> sql("CALL %s.custom.rewrite_data_files('n', 't')", catalogName)); - AssertHelpers.assertThrows("Should reject calls without all required args", - AnalysisException.class, "Missing required parameters", + AssertHelpers.assertThrows( + "Should reject calls without all required args", + AnalysisException.class, + "Missing required parameters", () -> sql("CALL %s.system.rewrite_data_files()", catalogName)); - AssertHelpers.assertThrows("Should reject duplicate arg names name", - AnalysisException.class, "Duplicate procedure argument: table", + AssertHelpers.assertThrows( + "Should reject duplicate arg names name", + AnalysisException.class, + "Duplicate procedure argument: table", () -> sql("CALL %s.system.rewrite_data_files(table => 't', table => 't')", catalogName)); - AssertHelpers.assertThrows("Should reject calls with empty table identifier", - IllegalArgumentException.class, "Cannot handle an empty identifier", + AssertHelpers.assertThrows( + "Should reject calls with empty table identifier", + IllegalArgumentException.class, + "Cannot handle an empty identifier", () -> sql("CALL %s.system.rewrite_data_files('')", catalogName)); } @@ -322,7 +383,9 @@ private void createTable() { } private void createPartitionTable() { - sql("CREATE TABLE %s (c1 int, c2 string, c3 string) USING iceberg PARTITIONED BY (c2)", tableName); + sql( + "CREATE TABLE %s (c1 int, c2 string, c3 string) USING iceberg PARTITIONED BY (c2)", + tableName); } private void insertData(int filesCount) { @@ -330,12 +393,15 @@ private void insertData(int filesCount) { ThreeColumnRecord record2 = new ThreeColumnRecord(2, "bar", null); List records = Lists.newArrayList(); - IntStream.range(0, filesCount / 2).forEach(i -> { - records.add(record1); - records.add(record2); - }); - - Dataset df = spark.createDataFrame(records, ThreeColumnRecord.class).repartition(filesCount); + IntStream.range(0, filesCount / 2) + .forEach( + i -> { + records.add(record1); + records.add(record2); + }); + + Dataset df = + spark.createDataFrame(records, ThreeColumnRecord.class).repartition(filesCount); try { df.writeTo(tableName).append(); } catch (org.apache.spark.sql.catalyst.analysis.NoSuchTableException e) { @@ -344,6 +410,7 @@ private void insertData(int filesCount) { } private List currentData() { - return rowsToJava(spark.sql("SELECT * FROM " + tableName + " 
order by c1, c2, c3").collectAsList()); + return rowsToJava( + spark.sql("SELECT * FROM " + tableName + " order by c1, c2, c3").collectAsList()); } } diff --git a/spark/v3.1/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestRewriteManifestsProcedure.java b/spark/v3.1/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestRewriteManifestsProcedure.java index dcf0a2d91e3e..7c5ec1f5cf3f 100644 --- a/spark/v3.1/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestRewriteManifestsProcedure.java +++ b/spark/v3.1/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestRewriteManifestsProcedure.java @@ -16,9 +16,10 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.extensions; +import static org.apache.iceberg.TableProperties.SNAPSHOT_ID_INHERITANCE_ENABLED; + import java.util.List; import java.util.Map; import org.apache.iceberg.AssertHelpers; @@ -30,11 +31,10 @@ import org.junit.Assert; import org.junit.Test; -import static org.apache.iceberg.TableProperties.SNAPSHOT_ID_INHERITANCE_ENABLED; - public class TestRewriteManifestsProcedure extends SparkExtensionsTestBase { - public TestRewriteManifestsProcedure(String catalogName, String implementation, Map config) { + public TestRewriteManifestsProcedure( + String catalogName, String implementation, Map config) { super(catalogName, implementation, config); } @@ -46,40 +46,42 @@ public void removeTable() { @Test public void testRewriteManifestsInEmptyTable() { sql("CREATE TABLE %s (id bigint NOT NULL, data string) USING iceberg", tableName); - List output = sql( - "CALL %s.system.rewrite_manifests('%s')", catalogName, tableIdent); - assertEquals("Procedure output must match", - ImmutableList.of(row(0, 0)), - output); + List output = sql("CALL %s.system.rewrite_manifests('%s')", catalogName, tableIdent); + assertEquals("Procedure output must match", ImmutableList.of(row(0, 0)), output); } @Test public void testRewriteLargeManifests() { - sql("CREATE TABLE %s (id bigint NOT NULL, data string) USING iceberg PARTITIONED BY (data)", tableName); + sql( + "CREATE TABLE %s (id bigint NOT NULL, data string) USING iceberg PARTITIONED BY (data)", + tableName); sql("INSERT INTO TABLE %s VALUES (1, 'a'), (2, 'b'), (3, 'c'), (4, 'd')", tableName); Table table = validationCatalog.loadTable(tableIdent); - Assert.assertEquals("Must have 1 manifest", 1, table.currentSnapshot().allManifests(table.io()).size()); + Assert.assertEquals( + "Must have 1 manifest", 1, table.currentSnapshot().allManifests(table.io()).size()); sql("ALTER TABLE %s SET TBLPROPERTIES ('commit.manifest.target-size-bytes' '1')", tableName); - List output = sql( - "CALL %s.system.rewrite_manifests('%s')", catalogName, tableIdent); - assertEquals("Procedure output must match", - ImmutableList.of(row(1, 4)), - output); + List output = sql("CALL %s.system.rewrite_manifests('%s')", catalogName, tableIdent); + assertEquals("Procedure output must match", ImmutableList.of(row(1, 4)), output); table.refresh(); - Assert.assertEquals("Must have 4 manifests", 4, table.currentSnapshot().allManifests(table.io()).size()); + Assert.assertEquals( + "Must have 4 manifests", 4, table.currentSnapshot().allManifests(table.io()).size()); } @Test public void testRewriteSmallManifestsWithSnapshotIdInheritance() { - sql("CREATE TABLE %s (id bigint NOT NULL, data string) USING iceberg PARTITIONED BY (data)", tableName); + sql( + "CREATE TABLE %s (id bigint NOT NULL, data string) USING 
iceberg PARTITIONED BY (data)", + tableName); - sql("ALTER TABLE %s SET TBLPROPERTIES ('%s' '%s')", tableName, SNAPSHOT_ID_INHERITANCE_ENABLED, "true"); + sql( + "ALTER TABLE %s SET TBLPROPERTIES ('%s' '%s')", + tableName, SNAPSHOT_ID_INHERITANCE_ENABLED, "true"); sql("INSERT INTO TABLE %s VALUES (1, 'a')", tableName); sql("INSERT INTO TABLE %s VALUES (2, 'b')", tableName); @@ -88,87 +90,107 @@ public void testRewriteSmallManifestsWithSnapshotIdInheritance() { Table table = validationCatalog.loadTable(tableIdent); - Assert.assertEquals("Must have 4 manifest", 4, table.currentSnapshot().allManifests(table.io()).size()); + Assert.assertEquals( + "Must have 4 manifest", 4, table.currentSnapshot().allManifests(table.io()).size()); - List output = sql( - "CALL %s.system.rewrite_manifests(table => '%s')", catalogName, tableIdent); - assertEquals("Procedure output must match", - ImmutableList.of(row(4, 1)), - output); + List output = + sql("CALL %s.system.rewrite_manifests(table => '%s')", catalogName, tableIdent); + assertEquals("Procedure output must match", ImmutableList.of(row(4, 1)), output); table.refresh(); - Assert.assertEquals("Must have 1 manifests", 1, table.currentSnapshot().allManifests(table.io()).size()); + Assert.assertEquals( + "Must have 1 manifests", 1, table.currentSnapshot().allManifests(table.io()).size()); } @Test public void testRewriteSmallManifestsWithoutCaching() { - sql("CREATE TABLE %s (id bigint NOT NULL, data string) USING iceberg PARTITIONED BY (data)", tableName); + sql( + "CREATE TABLE %s (id bigint NOT NULL, data string) USING iceberg PARTITIONED BY (data)", + tableName); sql("INSERT INTO TABLE %s VALUES (1, 'a')", tableName); sql("INSERT INTO TABLE %s VALUES (2, 'b')", tableName); Table table = validationCatalog.loadTable(tableIdent); - Assert.assertEquals("Must have 2 manifest", 2, table.currentSnapshot().allManifests(table.io()).size()); + Assert.assertEquals( + "Must have 2 manifest", 2, table.currentSnapshot().allManifests(table.io()).size()); - List output = sql( - "CALL %s.system.rewrite_manifests(use_caching => false, table => '%s')", catalogName, tableIdent); - assertEquals("Procedure output must match", - ImmutableList.of(row(2, 1)), - output); + List output = + sql( + "CALL %s.system.rewrite_manifests(use_caching => false, table => '%s')", + catalogName, tableIdent); + assertEquals("Procedure output must match", ImmutableList.of(row(2, 1)), output); table.refresh(); - Assert.assertEquals("Must have 1 manifests", 1, table.currentSnapshot().allManifests(table.io()).size()); + Assert.assertEquals( + "Must have 1 manifests", 1, table.currentSnapshot().allManifests(table.io()).size()); } @Test public void testRewriteManifestsCaseInsensitiveArgs() { - sql("CREATE TABLE %s (id bigint NOT NULL, data string) USING iceberg PARTITIONED BY (data)", tableName); + sql( + "CREATE TABLE %s (id bigint NOT NULL, data string) USING iceberg PARTITIONED BY (data)", + tableName); sql("INSERT INTO TABLE %s VALUES (1, 'a')", tableName); sql("INSERT INTO TABLE %s VALUES (2, 'b')", tableName); Table table = validationCatalog.loadTable(tableIdent); - Assert.assertEquals("Must have 2 manifest", 2, table.currentSnapshot().allManifests(table.io()).size()); + Assert.assertEquals( + "Must have 2 manifest", 2, table.currentSnapshot().allManifests(table.io()).size()); - List output = sql( - "CALL %s.system.rewrite_manifests(usE_cAcHiNg => false, tAbLe => '%s')", catalogName, tableIdent); - assertEquals("Procedure output must match", - ImmutableList.of(row(2, 1)), - output); + List 
output = + sql( + "CALL %s.system.rewrite_manifests(usE_cAcHiNg => false, tAbLe => '%s')", + catalogName, tableIdent); + assertEquals("Procedure output must match", ImmutableList.of(row(2, 1)), output); table.refresh(); - Assert.assertEquals("Must have 1 manifests", 1, table.currentSnapshot().allManifests(table.io()).size()); + Assert.assertEquals( + "Must have 1 manifests", 1, table.currentSnapshot().allManifests(table.io()).size()); } @Test public void testInvalidRewriteManifestsCases() { - AssertHelpers.assertThrows("Should not allow mixed args", - AnalysisException.class, "Named and positional arguments cannot be mixed", + AssertHelpers.assertThrows( + "Should not allow mixed args", + AnalysisException.class, + "Named and positional arguments cannot be mixed", () -> sql("CALL %s.system.rewrite_manifests('n', table => 't')", catalogName)); - AssertHelpers.assertThrows("Should not resolve procedures in arbitrary namespaces", - NoSuchProcedureException.class, "not found", + AssertHelpers.assertThrows( + "Should not resolve procedures in arbitrary namespaces", + NoSuchProcedureException.class, + "not found", () -> sql("CALL %s.custom.rewrite_manifests('n', 't')", catalogName)); - AssertHelpers.assertThrows("Should reject calls without all required args", - AnalysisException.class, "Missing required parameters", + AssertHelpers.assertThrows( + "Should reject calls without all required args", + AnalysisException.class, + "Missing required parameters", () -> sql("CALL %s.system.rewrite_manifests()", catalogName)); - AssertHelpers.assertThrows("Should reject calls with invalid arg types", - AnalysisException.class, "Wrong arg type", + AssertHelpers.assertThrows( + "Should reject calls with invalid arg types", + AnalysisException.class, + "Wrong arg type", () -> sql("CALL %s.system.rewrite_manifests('n', 2.2)", catalogName)); - AssertHelpers.assertThrows("Should reject duplicate arg names name", - AnalysisException.class, "Duplicate procedure argument: table", + AssertHelpers.assertThrows( + "Should reject duplicate arg names name", + AnalysisException.class, + "Duplicate procedure argument: table", () -> sql("CALL %s.system.rewrite_manifests(table => 't', tAbLe => 't')", catalogName)); - AssertHelpers.assertThrows("Should reject calls with empty table identifier", - IllegalArgumentException.class, "Cannot handle an empty identifier", + AssertHelpers.assertThrows( + "Should reject calls with empty table identifier", + IllegalArgumentException.class, + "Cannot handle an empty identifier", () -> sql("CALL %s.system.rewrite_manifests('')", catalogName)); } } diff --git a/spark/v3.1/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestRollbackToSnapshotProcedure.java b/spark/v3.1/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestRollbackToSnapshotProcedure.java index d3e6bdcbc285..af94b456d02e 100644 --- a/spark/v3.1/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestRollbackToSnapshotProcedure.java +++ b/spark/v3.1/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestRollbackToSnapshotProcedure.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.spark.extensions; import java.util.List; @@ -36,7 +35,8 @@ public class TestRollbackToSnapshotProcedure extends SparkExtensionsTestBase { - public TestRollbackToSnapshotProcedure(String catalogName, String implementation, Map config) { + public TestRollbackToSnapshotProcedure( + String catalogName, String implementation, Map config) { super(catalogName, implementation, config); } @@ -55,7 +55,8 @@ public void testRollbackToSnapshotUsingPositionalArgs() { sql("INSERT INTO TABLE %s VALUES (1, 'a')", tableName); - assertEquals("Should have expected rows", + assertEquals( + "Should have expected rows", ImmutableList.of(row(1L, "a"), row(1L, "a")), sql("SELECT * FROM %s ORDER BY id", tableName)); @@ -63,15 +64,18 @@ public void testRollbackToSnapshotUsingPositionalArgs() { Snapshot secondSnapshot = table.currentSnapshot(); - List output = sql( - "CALL %s.system.rollback_to_snapshot('%s', %dL)", - catalogName, tableIdent, firstSnapshot.snapshotId()); + List output = + sql( + "CALL %s.system.rollback_to_snapshot('%s', %dL)", + catalogName, tableIdent, firstSnapshot.snapshotId()); - assertEquals("Procedure output must match", + assertEquals( + "Procedure output must match", ImmutableList.of(row(secondSnapshot.snapshotId(), firstSnapshot.snapshotId())), output); - assertEquals("Rollback must be successful", + assertEquals( + "Rollback must be successful", ImmutableList.of(row(1L, "a")), sql("SELECT * FROM %s ORDER BY id", tableName)); } @@ -86,7 +90,8 @@ public void testRollbackToSnapshotUsingNamedArgs() { sql("INSERT INTO TABLE %s VALUES (1, 'a')", tableName); - assertEquals("Should have expected rows", + assertEquals( + "Should have expected rows", ImmutableList.of(row(1L, "a"), row(1L, "a")), sql("SELECT * FROM %s ORDER BY id", tableName)); @@ -94,15 +99,18 @@ public void testRollbackToSnapshotUsingNamedArgs() { Snapshot secondSnapshot = table.currentSnapshot(); - List output = sql( - "CALL %s.system.rollback_to_snapshot(snapshot_id => %dL, table => '%s')", - catalogName, firstSnapshot.snapshotId(), tableIdent); + List output = + sql( + "CALL %s.system.rollback_to_snapshot(snapshot_id => %dL, table => '%s')", + catalogName, firstSnapshot.snapshotId(), tableIdent); - assertEquals("Procedure output must match", + assertEquals( + "Procedure output must match", ImmutableList.of(row(secondSnapshot.snapshotId(), firstSnapshot.snapshotId())), output); - assertEquals("Rollback must be successful", + assertEquals( + "Rollback must be successful", ImmutableList.of(row(1L, "a")), sql("SELECT * FROM %s ORDER BY id", tableName)); } @@ -126,21 +134,23 @@ public void testRollbackToSnapshotRefreshesRelationCache() { spark.sql("CACHE TABLE tmp"); - assertEquals("View should have expected rows", + assertEquals( + "View should have expected rows", ImmutableList.of(row(1L, "a"), row(1L, "a")), sql("SELECT * FROM tmp")); - List output = sql( - "CALL %s.system.rollback_to_snapshot(table => '%s', snapshot_id => %dL)", - catalogName, tableIdent, firstSnapshot.snapshotId()); + List output = + sql( + "CALL %s.system.rollback_to_snapshot(table => '%s', snapshot_id => %dL)", + catalogName, tableIdent, firstSnapshot.snapshotId()); - assertEquals("Procedure output must match", + assertEquals( + "Procedure output must match", ImmutableList.of(row(secondSnapshot.snapshotId(), firstSnapshot.snapshotId())), output); - assertEquals("View cache must be invalidated", - ImmutableList.of(row(1L, "a")), - sql("SELECT * FROM tmp")); + assertEquals( + "View cache must be invalidated", 
ImmutableList.of(row(1L, "a")), sql("SELECT * FROM tmp")); sql("UNCACHE TABLE tmp"); } @@ -155,7 +165,8 @@ public void testRollbackToSnapshotWithQuotedIdentifiers() { sql("INSERT INTO TABLE %s VALUES (1, 'a')", tableName); - assertEquals("Should have expected rows", + assertEquals( + "Should have expected rows", ImmutableList.of(row(1L, "a"), row(1L, "a")), sql("SELECT * FROM %s ORDER BY id", tableName)); @@ -171,15 +182,20 @@ public void testRollbackToSnapshotWithQuotedIdentifiers() { } String quotedNamespace = quotedNamespaceBuilder.toString(); - List output = sql( - "CALL %s.system.rollback_to_snapshot('%s', %d)", - catalogName, quotedNamespace + ".`" + tableIdent.name() + "`", firstSnapshot.snapshotId()); + List output = + sql( + "CALL %s.system.rollback_to_snapshot('%s', %d)", + catalogName, + quotedNamespace + ".`" + tableIdent.name() + "`", + firstSnapshot.snapshotId()); - assertEquals("Procedure output must match", + assertEquals( + "Procedure output must match", ImmutableList.of(row(secondSnapshot.snapshotId(), firstSnapshot.snapshotId())), output); - assertEquals("Rollback must be successful", + assertEquals( + "Rollback must be successful", ImmutableList.of(row(1L, "a")), sql("SELECT * FROM %s ORDER BY id", tableName)); } @@ -196,7 +212,8 @@ public void testRollbackToSnapshotWithoutExplicitCatalog() { sql("INSERT INTO TABLE %s VALUES (1, 'a')", tableName); - assertEquals("Should have expected rows", + assertEquals( + "Should have expected rows", ImmutableList.of(row(1L, "a"), row(1L, "a")), sql("SELECT * FROM %s ORDER BY id", tableName)); @@ -205,15 +222,16 @@ public void testRollbackToSnapshotWithoutExplicitCatalog() { Snapshot secondSnapshot = table.currentSnapshot(); // use camel case intentionally to test case sensitivity - List output = sql( - "CALL SyStEm.rOLlBaCk_to_SnApShOt('%s', %dL)", - tableIdent, firstSnapshot.snapshotId()); + List output = + sql("CALL SyStEm.rOLlBaCk_to_SnApShOt('%s', %dL)", tableIdent, firstSnapshot.snapshotId()); - assertEquals("Procedure output must match", + assertEquals( + "Procedure output must match", ImmutableList.of(row(secondSnapshot.snapshotId(), firstSnapshot.snapshotId())), output); - assertEquals("Rollback must be successful", + assertEquals( + "Rollback must be successful", ImmutableList.of(row(1L, "a")), sql("SELECT * FROM %s ORDER BY id", tableName)); } @@ -222,39 +240,58 @@ public void testRollbackToSnapshotWithoutExplicitCatalog() { public void testRollbackToInvalidSnapshot() { sql("CREATE TABLE %s (id bigint NOT NULL, data string) USING iceberg", tableName); - AssertHelpers.assertThrows("Should reject invalid snapshot id", - ValidationException.class, "Cannot roll back to unknown snapshot id", + AssertHelpers.assertThrows( + "Should reject invalid snapshot id", + ValidationException.class, + "Cannot roll back to unknown snapshot id", () -> sql("CALL %s.system.rollback_to_snapshot('%s', -1L)", catalogName, tableIdent)); } @Test public void testInvalidRollbackToSnapshotCases() { - AssertHelpers.assertThrows("Should not allow mixed args", - AnalysisException.class, "Named and positional arguments cannot be mixed", - () -> sql("CALL %s.system.rollback_to_snapshot(namespace => 'n1', table => 't', 1L)", catalogName)); - - AssertHelpers.assertThrows("Should not resolve procedures in arbitrary namespaces", - NoSuchProcedureException.class, "not found", + AssertHelpers.assertThrows( + "Should not allow mixed args", + AnalysisException.class, + "Named and positional arguments cannot be mixed", + () -> + sql( + "CALL 
%s.system.rollback_to_snapshot(namespace => 'n1', table => 't', 1L)", + catalogName)); + + AssertHelpers.assertThrows( + "Should not resolve procedures in arbitrary namespaces", + NoSuchProcedureException.class, + "not found", () -> sql("CALL %s.custom.rollback_to_snapshot('n', 't', 1L)", catalogName)); - AssertHelpers.assertThrows("Should reject calls without all required args", - AnalysisException.class, "Missing required parameters", + AssertHelpers.assertThrows( + "Should reject calls without all required args", + AnalysisException.class, + "Missing required parameters", () -> sql("CALL %s.system.rollback_to_snapshot('t')", catalogName)); - AssertHelpers.assertThrows("Should reject calls without all required args", - AnalysisException.class, "Missing required parameters", + AssertHelpers.assertThrows( + "Should reject calls without all required args", + AnalysisException.class, + "Missing required parameters", () -> sql("CALL %s.system.rollback_to_snapshot(1L)", catalogName)); - AssertHelpers.assertThrows("Should reject calls without all required args", - AnalysisException.class, "Missing required parameters", + AssertHelpers.assertThrows( + "Should reject calls without all required args", + AnalysisException.class, + "Missing required parameters", () -> sql("CALL %s.system.rollback_to_snapshot(table => 't')", catalogName)); - AssertHelpers.assertThrows("Should reject calls with invalid arg types", - AnalysisException.class, "Wrong arg type for snapshot_id: cannot cast", + AssertHelpers.assertThrows( + "Should reject calls with invalid arg types", + AnalysisException.class, + "Wrong arg type for snapshot_id: cannot cast", () -> sql("CALL %s.system.rollback_to_snapshot('t', 2.2)", catalogName)); - AssertHelpers.assertThrows("Should reject calls with empty table identifier", - IllegalArgumentException.class, "Cannot handle an empty identifier", + AssertHelpers.assertThrows( + "Should reject calls with empty table identifier", + IllegalArgumentException.class, + "Cannot handle an empty identifier", () -> sql("CALL %s.system.rollback_to_snapshot('', 1L)", catalogName)); } } diff --git a/spark/v3.1/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestRollbackToTimestampProcedure.java b/spark/v3.1/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestRollbackToTimestampProcedure.java index 52fc12c7d01e..6da3853bbe24 100644 --- a/spark/v3.1/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestRollbackToTimestampProcedure.java +++ b/spark/v3.1/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestRollbackToTimestampProcedure.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.spark.extensions; import java.time.LocalDateTime; @@ -36,7 +35,8 @@ public class TestRollbackToTimestampProcedure extends SparkExtensionsTestBase { - public TestRollbackToTimestampProcedure(String catalogName, String implementation, Map config) { + public TestRollbackToTimestampProcedure( + String catalogName, String implementation, Map config) { super(catalogName, implementation, config); } @@ -58,7 +58,8 @@ public void testRollbackToTimestampUsingPositionalArgs() { sql("INSERT INTO TABLE %s VALUES (1, 'a')", tableName); - assertEquals("Should have expected rows", + assertEquals( + "Should have expected rows", ImmutableList.of(row(1L, "a"), row(1L, "a")), sql("SELECT * FROM %s ORDER BY id", tableName)); @@ -66,15 +67,18 @@ public void testRollbackToTimestampUsingPositionalArgs() { Snapshot secondSnapshot = table.currentSnapshot(); - List output = sql( - "CALL %s.system.rollback_to_timestamp('%s',TIMESTAMP '%s')", - catalogName, tableIdent, firstSnapshotTimestamp); + List output = + sql( + "CALL %s.system.rollback_to_timestamp('%s',TIMESTAMP '%s')", + catalogName, tableIdent, firstSnapshotTimestamp); - assertEquals("Procedure output must match", + assertEquals( + "Procedure output must match", ImmutableList.of(row(secondSnapshot.snapshotId(), firstSnapshot.snapshotId())), output); - assertEquals("Rollback must be successful", + assertEquals( + "Rollback must be successful", ImmutableList.of(row(1L, "a")), sql("SELECT * FROM %s ORDER BY id", tableName)); } @@ -92,7 +96,8 @@ public void testRollbackToTimestampUsingNamedArgs() { sql("INSERT INTO TABLE %s VALUES (1, 'a')", tableName); - assertEquals("Should have expected rows", + assertEquals( + "Should have expected rows", ImmutableList.of(row(1L, "a"), row(1L, "a")), sql("SELECT * FROM %s ORDER BY id", tableName)); @@ -100,15 +105,18 @@ public void testRollbackToTimestampUsingNamedArgs() { Snapshot secondSnapshot = table.currentSnapshot(); - List output = sql( - "CALL %s.system.rollback_to_timestamp(timestamp => TIMESTAMP '%s', table => '%s')", - catalogName, firstSnapshotTimestamp, tableIdent); + List output = + sql( + "CALL %s.system.rollback_to_timestamp(timestamp => TIMESTAMP '%s', table => '%s')", + catalogName, firstSnapshotTimestamp, tableIdent); - assertEquals("Procedure output must match", + assertEquals( + "Procedure output must match", ImmutableList.of(row(secondSnapshot.snapshotId(), firstSnapshot.snapshotId())), output); - assertEquals("Rollback must be successful", + assertEquals( + "Rollback must be successful", ImmutableList.of(row(1L, "a")), sql("SELECT * FROM %s ORDER BY id", tableName)); } @@ -135,21 +143,23 @@ public void testRollbackToTimestampRefreshesRelationCache() { spark.sql("CACHE TABLE tmp"); - assertEquals("View should have expected rows", + assertEquals( + "View should have expected rows", ImmutableList.of(row(1L, "a"), row(1L, "a")), sql("SELECT * FROM tmp")); - List output = sql( - "CALL %s.system.rollback_to_timestamp(table => '%s', timestamp => TIMESTAMP '%s')", - catalogName, tableIdent, firstSnapshotTimestamp); + List output = + sql( + "CALL %s.system.rollback_to_timestamp(table => '%s', timestamp => TIMESTAMP '%s')", + catalogName, tableIdent, firstSnapshotTimestamp); - assertEquals("Procedure output must match", + assertEquals( + "Procedure output must match", ImmutableList.of(row(secondSnapshot.snapshotId(), firstSnapshot.snapshotId())), output); - assertEquals("View cache must be invalidated", - ImmutableList.of(row(1L, "a")), - sql("SELECT * FROM tmp")); + assertEquals( 
+ "View cache must be invalidated", ImmutableList.of(row(1L, "a")), sql("SELECT * FROM tmp")); sql("UNCACHE TABLE tmp"); } @@ -167,7 +177,8 @@ public void testRollbackToTimestampWithQuotedIdentifiers() { sql("INSERT INTO TABLE %s VALUES (1, 'a')", tableName); - assertEquals("Should have expected rows", + assertEquals( + "Should have expected rows", ImmutableList.of(row(1L, "a"), row(1L, "a")), sql("SELECT * FROM %s ORDER BY id", tableName)); @@ -183,15 +194,18 @@ public void testRollbackToTimestampWithQuotedIdentifiers() { } String quotedNamespace = quotedNamespaceBuilder.toString(); - List output = sql( - "CALL %s.system.rollback_to_timestamp('%s', TIMESTAMP '%s')", - catalogName, quotedNamespace + ".`" + tableIdent.name() + "`", firstSnapshotTimestamp); + List output = + sql( + "CALL %s.system.rollback_to_timestamp('%s', TIMESTAMP '%s')", + catalogName, quotedNamespace + ".`" + tableIdent.name() + "`", firstSnapshotTimestamp); - assertEquals("Procedure output must match", + assertEquals( + "Procedure output must match", ImmutableList.of(row(secondSnapshot.snapshotId(), firstSnapshot.snapshotId())), output); - assertEquals("Rollback must be successful", + assertEquals( + "Rollback must be successful", ImmutableList.of(row(1L, "a")), sql("SELECT * FROM %s ORDER BY id", tableName)); } @@ -211,7 +225,8 @@ public void testRollbackToTimestampWithoutExplicitCatalog() { sql("INSERT INTO TABLE %s VALUES (1, 'a')", tableName); - assertEquals("Should have expected rows", + assertEquals( + "Should have expected rows", ImmutableList.of(row(1L, "a"), row(1L, "a")), sql("SELECT * FROM %s ORDER BY id", tableName)); @@ -220,15 +235,18 @@ public void testRollbackToTimestampWithoutExplicitCatalog() { Snapshot secondSnapshot = table.currentSnapshot(); // use camel case intentionally to test case sensitivity - List output = sql( - "CALL SyStEm.rOLlBaCk_to_TiMeStaMp('%s', TIMESTAMP '%s')", - tableIdent, firstSnapshotTimestamp); + List output = + sql( + "CALL SyStEm.rOLlBaCk_to_TiMeStaMp('%s', TIMESTAMP '%s')", + tableIdent, firstSnapshotTimestamp); - assertEquals("Procedure output must match", + assertEquals( + "Procedure output must match", ImmutableList.of(row(secondSnapshot.snapshotId(), firstSnapshot.snapshotId())), output); - assertEquals("Rollback must be successful", + assertEquals( + "Rollback must be successful", ImmutableList.of(row(1L, "a")), sql("SELECT * FROM %s ORDER BY id", tableName)); } @@ -237,32 +255,50 @@ public void testRollbackToTimestampWithoutExplicitCatalog() { public void testInvalidRollbackToTimestampCases() { String timestamp = "TIMESTAMP '2007-12-03T10:15:30'"; - AssertHelpers.assertThrows("Should not allow mixed args", - AnalysisException.class, "Named and positional arguments cannot be mixed", - () -> sql("CALL %s.system.rollback_to_timestamp(namespace => 'n1', 't', %s)", catalogName, timestamp)); - - AssertHelpers.assertThrows("Should not resolve procedures in arbitrary namespaces", - NoSuchProcedureException.class, "not found", + AssertHelpers.assertThrows( + "Should not allow mixed args", + AnalysisException.class, + "Named and positional arguments cannot be mixed", + () -> + sql( + "CALL %s.system.rollback_to_timestamp(namespace => 'n1', 't', %s)", + catalogName, timestamp)); + + AssertHelpers.assertThrows( + "Should not resolve procedures in arbitrary namespaces", + NoSuchProcedureException.class, + "not found", () -> sql("CALL %s.custom.rollback_to_timestamp('n', 't', %s)", catalogName, timestamp)); - AssertHelpers.assertThrows("Should reject calls without all required 
args", - AnalysisException.class, "Missing required parameters", + AssertHelpers.assertThrows( + "Should reject calls without all required args", + AnalysisException.class, + "Missing required parameters", () -> sql("CALL %s.system.rollback_to_timestamp('t')", catalogName)); - AssertHelpers.assertThrows("Should reject calls without all required args", - AnalysisException.class, "Missing required parameters", + AssertHelpers.assertThrows( + "Should reject calls without all required args", + AnalysisException.class, + "Missing required parameters", () -> sql("CALL %s.system.rollback_to_timestamp(timestamp => %s)", catalogName, timestamp)); - AssertHelpers.assertThrows("Should reject calls without all required args", - AnalysisException.class, "Missing required parameters", + AssertHelpers.assertThrows( + "Should reject calls without all required args", + AnalysisException.class, + "Missing required parameters", () -> sql("CALL %s.system.rollback_to_timestamp(table => 't')", catalogName)); - AssertHelpers.assertThrows("Should reject calls with extra args", - AnalysisException.class, "Too many arguments", - () -> sql("CALL %s.system.rollback_to_timestamp('n', 't', %s, 1L)", catalogName, timestamp)); - - AssertHelpers.assertThrows("Should reject calls with invalid arg types", - AnalysisException.class, "Wrong arg type for timestamp: cannot cast", + AssertHelpers.assertThrows( + "Should reject calls with extra args", + AnalysisException.class, + "Too many arguments", + () -> + sql("CALL %s.system.rollback_to_timestamp('n', 't', %s, 1L)", catalogName, timestamp)); + + AssertHelpers.assertThrows( + "Should reject calls with invalid arg types", + AnalysisException.class, + "Wrong arg type for timestamp: cannot cast", () -> sql("CALL %s.system.rollback_to_timestamp('t', 2.2)", catalogName)); } } diff --git a/spark/v3.1/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestSetCurrentSnapshotProcedure.java b/spark/v3.1/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestSetCurrentSnapshotProcedure.java index 0ea8c4861e8c..8a8a974bbebe 100644 --- a/spark/v3.1/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestSetCurrentSnapshotProcedure.java +++ b/spark/v3.1/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestSetCurrentSnapshotProcedure.java @@ -16,9 +16,10 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.spark.extensions; +import static org.apache.iceberg.TableProperties.WRITE_AUDIT_PUBLISH_ENABLED; + import java.util.List; import java.util.Map; import org.apache.iceberg.AssertHelpers; @@ -34,11 +35,10 @@ import org.junit.Assume; import org.junit.Test; -import static org.apache.iceberg.TableProperties.WRITE_AUDIT_PUBLISH_ENABLED; - public class TestSetCurrentSnapshotProcedure extends SparkExtensionsTestBase { - public TestSetCurrentSnapshotProcedure(String catalogName, String implementation, Map config) { + public TestSetCurrentSnapshotProcedure( + String catalogName, String implementation, Map config) { super(catalogName, implementation, config); } @@ -57,7 +57,8 @@ public void testSetCurrentSnapshotUsingPositionalArgs() { sql("INSERT INTO TABLE %s VALUES (1, 'a')", tableName); - assertEquals("Should have expected rows", + assertEquals( + "Should have expected rows", ImmutableList.of(row(1L, "a"), row(1L, "a")), sql("SELECT * FROM %s ORDER BY id", tableName)); @@ -65,15 +66,18 @@ public void testSetCurrentSnapshotUsingPositionalArgs() { Snapshot secondSnapshot = table.currentSnapshot(); - List output = sql( - "CALL %s.system.set_current_snapshot('%s', %dL)", - catalogName, tableIdent, firstSnapshot.snapshotId()); + List output = + sql( + "CALL %s.system.set_current_snapshot('%s', %dL)", + catalogName, tableIdent, firstSnapshot.snapshotId()); - assertEquals("Procedure output must match", + assertEquals( + "Procedure output must match", ImmutableList.of(row(secondSnapshot.snapshotId(), firstSnapshot.snapshotId())), output); - assertEquals("Set must be successful", + assertEquals( + "Set must be successful", ImmutableList.of(row(1L, "a")), sql("SELECT * FROM %s ORDER BY id", tableName)); } @@ -88,7 +92,8 @@ public void testSetCurrentSnapshotUsingNamedArgs() { sql("INSERT INTO TABLE %s VALUES (1, 'a')", tableName); - assertEquals("Should have expected rows", + assertEquals( + "Should have expected rows", ImmutableList.of(row(1L, "a"), row(1L, "a")), sql("SELECT * FROM %s ORDER BY id", tableName)); @@ -96,15 +101,18 @@ public void testSetCurrentSnapshotUsingNamedArgs() { Snapshot secondSnapshot = table.currentSnapshot(); - List output = sql( - "CALL %s.system.set_current_snapshot(snapshot_id => %dL, table => '%s')", - catalogName, firstSnapshot.snapshotId(), tableIdent); + List output = + sql( + "CALL %s.system.set_current_snapshot(snapshot_id => %dL, table => '%s')", + catalogName, firstSnapshot.snapshotId(), tableIdent); - assertEquals("Procedure output must match", + assertEquals( + "Procedure output must match", ImmutableList.of(row(secondSnapshot.snapshotId(), firstSnapshot.snapshotId())), output); - assertEquals("Set must be successful", + assertEquals( + "Set must be successful", ImmutableList.of(row(1L, "a")), sql("SELECT * FROM %s ORDER BY id", tableName)); } @@ -118,22 +126,26 @@ public void testSetCurrentSnapshotWap() { sql("INSERT INTO TABLE %s VALUES (1, 'a')", tableName); - assertEquals("Should not see rows from staged snapshot", + assertEquals( + "Should not see rows from staged snapshot", ImmutableList.of(), sql("SELECT * FROM %s", tableName)); Table table = validationCatalog.loadTable(tableIdent); Snapshot wapSnapshot = Iterables.getOnlyElement(table.snapshots()); - List output = sql( - "CALL %s.system.set_current_snapshot(table => '%s', snapshot_id => %dL)", - catalogName, tableIdent, wapSnapshot.snapshotId()); + List output = + sql( + "CALL %s.system.set_current_snapshot(table => '%s', snapshot_id => %dL)", + catalogName, tableIdent, 
wapSnapshot.snapshotId()); - assertEquals("Procedure output must match", + assertEquals( + "Procedure output must match", ImmutableList.of(row(null, wapSnapshot.snapshotId())), output); - assertEquals("Current snapshot must be set correctly", + assertEquals( + "Current snapshot must be set correctly", ImmutableList.of(row(1L, "a")), sql("SELECT * FROM %s", tableName)); } @@ -150,7 +162,8 @@ public void tesSetCurrentSnapshotWithoutExplicitCatalog() { sql("INSERT INTO TABLE %s VALUES (1, 'a')", tableName); - assertEquals("Should have expected rows", + assertEquals( + "Should have expected rows", ImmutableList.of(row(1L, "a"), row(1L, "a")), sql("SELECT * FROM %s ORDER BY id", tableName)); @@ -159,15 +172,16 @@ public void tesSetCurrentSnapshotWithoutExplicitCatalog() { Snapshot secondSnapshot = table.currentSnapshot(); // use camel case intentionally to test case sensitivity - List output = sql( - "CALL SyStEm.sEt_cuRrEnT_sNaPsHot('%s', %dL)", - tableIdent, firstSnapshot.snapshotId()); + List output = + sql("CALL SyStEm.sEt_cuRrEnT_sNaPsHot('%s', %dL)", tableIdent, firstSnapshot.snapshotId()); - assertEquals("Procedure output must match", + assertEquals( + "Procedure output must match", ImmutableList.of(row(secondSnapshot.snapshotId(), firstSnapshot.snapshotId())), output); - assertEquals("Set must be successful", + assertEquals( + "Set must be successful", ImmutableList.of(row(1L, "a")), sql("SELECT * FROM %s ORDER BY id", tableName)); } @@ -179,43 +193,64 @@ public void testSetCurrentSnapshotToInvalidSnapshot() { Namespace namespace = tableIdent.namespace(); String tableName = tableIdent.name(); - AssertHelpers.assertThrows("Should reject invalid snapshot id", - ValidationException.class, "Cannot roll back to unknown snapshot id", + AssertHelpers.assertThrows( + "Should reject invalid snapshot id", + ValidationException.class, + "Cannot roll back to unknown snapshot id", () -> sql("CALL %s.system.set_current_snapshot('%s', -1L)", catalogName, tableIdent)); } @Test public void testInvalidRollbackToSnapshotCases() { - AssertHelpers.assertThrows("Should not allow mixed args", - AnalysisException.class, "Named and positional arguments cannot be mixed", - () -> sql("CALL %s.system.set_current_snapshot(namespace => 'n1', table => 't', 1L)", catalogName)); - - AssertHelpers.assertThrows("Should not resolve procedures in arbitrary namespaces", - NoSuchProcedureException.class, "not found", + AssertHelpers.assertThrows( + "Should not allow mixed args", + AnalysisException.class, + "Named and positional arguments cannot be mixed", + () -> + sql( + "CALL %s.system.set_current_snapshot(namespace => 'n1', table => 't', 1L)", + catalogName)); + + AssertHelpers.assertThrows( + "Should not resolve procedures in arbitrary namespaces", + NoSuchProcedureException.class, + "not found", () -> sql("CALL %s.custom.set_current_snapshot('n', 't', 1L)", catalogName)); - AssertHelpers.assertThrows("Should reject calls without all required args", - AnalysisException.class, "Missing required parameters", + AssertHelpers.assertThrows( + "Should reject calls without all required args", + AnalysisException.class, + "Missing required parameters", () -> sql("CALL %s.system.set_current_snapshot('t')", catalogName)); - AssertHelpers.assertThrows("Should reject calls without all required args", - AnalysisException.class, "Missing required parameters", + AssertHelpers.assertThrows( + "Should reject calls without all required args", + AnalysisException.class, + "Missing required parameters", () -> sql("CALL 
%s.system.set_current_snapshot(1L)", catalogName)); - AssertHelpers.assertThrows("Should reject calls without all required args", - AnalysisException.class, "Missing required parameters", + AssertHelpers.assertThrows( + "Should reject calls without all required args", + AnalysisException.class, + "Missing required parameters", () -> sql("CALL %s.system.set_current_snapshot(snapshot_id => 1L)", catalogName)); - AssertHelpers.assertThrows("Should reject calls without all required args", - AnalysisException.class, "Missing required parameters", + AssertHelpers.assertThrows( + "Should reject calls without all required args", + AnalysisException.class, + "Missing required parameters", () -> sql("CALL %s.system.set_current_snapshot(table => 't')", catalogName)); - AssertHelpers.assertThrows("Should reject calls with invalid arg types", - AnalysisException.class, "Wrong arg type for snapshot_id: cannot cast", + AssertHelpers.assertThrows( + "Should reject calls with invalid arg types", + AnalysisException.class, + "Wrong arg type for snapshot_id: cannot cast", () -> sql("CALL %s.system.set_current_snapshot('t', 2.2)", catalogName)); - AssertHelpers.assertThrows("Should reject calls with empty table identifier", - IllegalArgumentException.class, "Cannot handle an empty identifier", + AssertHelpers.assertThrows( + "Should reject calls with empty table identifier", + IllegalArgumentException.class, + "Cannot handle an empty identifier", () -> sql("CALL %s.system.set_current_snapshot('', 1L)", catalogName)); } } diff --git a/spark/v3.1/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestSetWriteDistributionAndOrdering.java b/spark/v3.1/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestSetWriteDistributionAndOrdering.java index 473278d25068..e7e52806792d 100644 --- a/spark/v3.1/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestSetWriteDistributionAndOrdering.java +++ b/spark/v3.1/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestSetWriteDistributionAndOrdering.java @@ -16,9 +16,10 @@ * specific language governing permissions and limitations * under the License. 
 */
-
 package org.apache.iceberg.spark.extensions;
 
+import static org.apache.iceberg.expressions.Expressions.bucket;
+
 import java.util.Map;
 import org.apache.iceberg.NullOrder;
 import org.apache.iceberg.SortOrder;
@@ -28,10 +29,9 @@
 import org.junit.Assert;
 import org.junit.Test;
 
-import static org.apache.iceberg.expressions.Expressions.bucket;
-
 public class TestSetWriteDistributionAndOrdering extends SparkExtensionsTestBase {
-  public TestSetWriteDistributionAndOrdering(String catalogName, String implementation, Map<String, String> config) {
+  public TestSetWriteDistributionAndOrdering(
+      String catalogName, String implementation, Map<String, String> config) {
     super(catalogName, implementation, config);
   }
 
@@ -42,7 +42,9 @@ public void removeTable() {
 
   @Test
   public void testSetWriteOrderByColumn() {
-    sql("CREATE TABLE %s (id bigint NOT NULL, category string, ts timestamp, data string) USING iceberg", tableName);
+    sql(
+        "CREATE TABLE %s (id bigint NOT NULL, category string, ts timestamp, data string) USING iceberg",
+        tableName);
 
     Table table = validationCatalog.loadTable(tableIdent);
     Assert.assertTrue("Table should start unsorted", table.sortOrder().isUnsorted());
@@ -53,17 +55,20 @@ public void testSetWriteOrderByColumn() {
     String distributionMode = table.properties().get(TableProperties.WRITE_DISTRIBUTION_MODE);
     Assert.assertEquals("Distribution mode must match", "range", distributionMode);
 
-    SortOrder expected = SortOrder.builderFor(table.schema())
-        .withOrderId(1)
-        .asc("category", NullOrder.NULLS_FIRST)
-        .asc("id", NullOrder.NULLS_FIRST)
-        .build();
+    SortOrder expected =
+        SortOrder.builderFor(table.schema())
+            .withOrderId(1)
+            .asc("category", NullOrder.NULLS_FIRST)
+            .asc("id", NullOrder.NULLS_FIRST)
+            .build();
     Assert.assertEquals("Should have expected order", expected, table.sortOrder());
   }
 
   @Test
   public void testSetWriteOrderByColumnWithDirection() {
-    sql("CREATE TABLE %s (id bigint NOT NULL, category string, ts timestamp, data string) USING iceberg", tableName);
+    sql(
+        "CREATE TABLE %s (id bigint NOT NULL, category string, ts timestamp, data string) USING iceberg",
+        tableName);
 
     Table table = validationCatalog.loadTable(tableIdent);
     Assert.assertTrue("Table should start unsorted", table.sortOrder().isUnsorted());
@@ -74,17 +79,20 @@ public void testSetWriteOrderByColumnWithDirection() {
     String distributionMode = table.properties().get(TableProperties.WRITE_DISTRIBUTION_MODE);
     Assert.assertEquals("Distribution mode must match", "range", distributionMode);
 
-    SortOrder expected = SortOrder.builderFor(table.schema())
-        .withOrderId(1)
-        .asc("category", NullOrder.NULLS_FIRST)
-        .desc("id", NullOrder.NULLS_LAST)
-        .build();
+    SortOrder expected =
+        SortOrder.builderFor(table.schema())
+            .withOrderId(1)
+            .asc("category", NullOrder.NULLS_FIRST)
+            .desc("id", NullOrder.NULLS_LAST)
+            .build();
     Assert.assertEquals("Should have expected order", expected, table.sortOrder());
   }
 
   @Test
   public void testSetWriteOrderByColumnWithDirectionAndNullOrder() {
-    sql("CREATE TABLE %s (id bigint NOT NULL, category string, ts timestamp, data string) USING iceberg", tableName);
+    sql(
+        "CREATE TABLE %s (id bigint NOT NULL, category string, ts timestamp, data string) USING iceberg",
+        tableName);
 
     Table table = validationCatalog.loadTable(tableIdent);
     Assert.assertTrue("Table should start unsorted", table.sortOrder().isUnsorted());
@@ -95,17 +103,20 @@ public void testSetWriteOrderByColumnWithDirectionAndNullOrder() {
     String distributionMode = table.properties().get(TableProperties.WRITE_DISTRIBUTION_MODE);
Assert.assertEquals("Distribution mode must match", "range", distributionMode); - SortOrder expected = SortOrder.builderFor(table.schema()) - .withOrderId(1) - .asc("category", NullOrder.NULLS_LAST) - .desc("id", NullOrder.NULLS_FIRST) - .build(); + SortOrder expected = + SortOrder.builderFor(table.schema()) + .withOrderId(1) + .asc("category", NullOrder.NULLS_LAST) + .desc("id", NullOrder.NULLS_FIRST) + .build(); Assert.assertEquals("Should have expected order", expected, table.sortOrder()); } @Test public void testSetWriteOrderByTransform() { - sql("CREATE TABLE %s (id bigint NOT NULL, category string, ts timestamp, data string) USING iceberg", tableName); + sql( + "CREATE TABLE %s (id bigint NOT NULL, category string, ts timestamp, data string) USING iceberg", + tableName); Table table = validationCatalog.loadTable(tableIdent); Assert.assertTrue("Table should start unsorted", table.sortOrder().isUnsorted()); @@ -116,18 +127,21 @@ public void testSetWriteOrderByTransform() { String distributionMode = table.properties().get(TableProperties.WRITE_DISTRIBUTION_MODE); Assert.assertEquals("Distribution mode must match", "range", distributionMode); - SortOrder expected = SortOrder.builderFor(table.schema()) - .withOrderId(1) - .desc("category") - .asc(bucket("id", 16)) - .asc("id") - .build(); + SortOrder expected = + SortOrder.builderFor(table.schema()) + .withOrderId(1) + .desc("category") + .asc(bucket("id", 16)) + .asc("id") + .build(); Assert.assertEquals("Should have expected order", expected, table.sortOrder()); } @Test public void testSetWriteUnordered() { - sql("CREATE TABLE %s (id bigint NOT NULL, category string, ts timestamp, data string) USING iceberg", tableName); + sql( + "CREATE TABLE %s (id bigint NOT NULL, category string, ts timestamp, data string) USING iceberg", + tableName); Table table = validationCatalog.loadTable(tableIdent); Assert.assertTrue("Table should start unsorted", table.sortOrder().isUnsorted()); @@ -152,7 +166,9 @@ public void testSetWriteUnordered() { @Test public void testSetWriteLocallyOrdered() { - sql("CREATE TABLE %s (id bigint NOT NULL, category string, ts timestamp, data string) USING iceberg", tableName); + sql( + "CREATE TABLE %s (id bigint NOT NULL, category string, ts timestamp, data string) USING iceberg", + tableName); Table table = validationCatalog.loadTable(tableIdent); Assert.assertTrue("Table should start unsorted", table.sortOrder().isUnsorted()); @@ -163,18 +179,21 @@ public void testSetWriteLocallyOrdered() { String distributionMode = table.properties().get(TableProperties.WRITE_DISTRIBUTION_MODE); Assert.assertEquals("Distribution mode must match", "none", distributionMode); - SortOrder expected = SortOrder.builderFor(table.schema()) - .withOrderId(1) - .desc("category") - .asc(bucket("id", 16)) - .asc("id") - .build(); + SortOrder expected = + SortOrder.builderFor(table.schema()) + .withOrderId(1) + .desc("category") + .asc(bucket("id", 16)) + .asc("id") + .build(); Assert.assertEquals("Sort order must match", expected, table.sortOrder()); } @Test public void testSetWriteDistributedByWithSort() { - sql("CREATE TABLE %s (id bigint NOT NULL, category string) USING iceberg PARTITIONED BY (category)", tableName); + sql( + "CREATE TABLE %s (id bigint NOT NULL, category string) USING iceberg PARTITIONED BY (category)", + tableName); Table table = validationCatalog.loadTable(tableIdent); Assert.assertTrue("Table should start unsorted", table.sortOrder().isUnsorted()); @@ -185,16 +204,15 @@ public void testSetWriteDistributedByWithSort() { 
String distributionMode = table.properties().get(TableProperties.WRITE_DISTRIBUTION_MODE); Assert.assertEquals("Distribution mode must match", "hash", distributionMode); - SortOrder expected = SortOrder.builderFor(table.schema()) - .withOrderId(1) - .asc("id") - .build(); + SortOrder expected = SortOrder.builderFor(table.schema()).withOrderId(1).asc("id").build(); Assert.assertEquals("Sort order must match", expected, table.sortOrder()); } @Test public void testSetWriteDistributedByWithLocalSort() { - sql("CREATE TABLE %s (id bigint NOT NULL, category string) USING iceberg PARTITIONED BY (category)", tableName); + sql( + "CREATE TABLE %s (id bigint NOT NULL, category string) USING iceberg PARTITIONED BY (category)", + tableName); Table table = validationCatalog.loadTable(tableIdent); Assert.assertTrue("Table should start unsorted", table.sortOrder().isUnsorted()); @@ -205,16 +223,15 @@ public void testSetWriteDistributedByWithLocalSort() { String distributionMode = table.properties().get(TableProperties.WRITE_DISTRIBUTION_MODE); Assert.assertEquals("Distribution mode must match", "hash", distributionMode); - SortOrder expected = SortOrder.builderFor(table.schema()) - .withOrderId(1) - .asc("id") - .build(); + SortOrder expected = SortOrder.builderFor(table.schema()).withOrderId(1).asc("id").build(); Assert.assertEquals("Sort order must match", expected, table.sortOrder()); } @Test public void testSetWriteDistributedByAndUnordered() { - sql("CREATE TABLE %s (id bigint NOT NULL, category string) USING iceberg PARTITIONED BY (category)", tableName); + sql( + "CREATE TABLE %s (id bigint NOT NULL, category string) USING iceberg PARTITIONED BY (category)", + tableName); Table table = validationCatalog.loadTable(tableIdent); Assert.assertTrue("Table should start unsorted", table.sortOrder().isUnsorted()); @@ -230,7 +247,9 @@ public void testSetWriteDistributedByAndUnordered() { @Test public void testSetWriteDistributedByOnly() { - sql("CREATE TABLE %s (id bigint NOT NULL, category string) USING iceberg PARTITIONED BY (category)", tableName); + sql( + "CREATE TABLE %s (id bigint NOT NULL, category string) USING iceberg PARTITIONED BY (category)", + tableName); Table table = validationCatalog.loadTable(tableIdent); Assert.assertTrue("Table should start unsorted", table.sortOrder().isUnsorted()); @@ -246,7 +265,9 @@ public void testSetWriteDistributedByOnly() { @Test public void testSetWriteDistributedAndUnorderedInverted() { - sql("CREATE TABLE %s (id bigint NOT NULL, category string) USING iceberg PARTITIONED BY (category)", tableName); + sql( + "CREATE TABLE %s (id bigint NOT NULL, category string) USING iceberg PARTITIONED BY (category)", + tableName); Table table = validationCatalog.loadTable(tableIdent); Assert.assertTrue("Table should start unsorted", table.sortOrder().isUnsorted()); @@ -262,7 +283,9 @@ public void testSetWriteDistributedAndUnorderedInverted() { @Test public void testSetWriteDistributedAndLocallyOrderedInverted() { - sql("CREATE TABLE %s (id bigint NOT NULL, category string) USING iceberg PARTITIONED BY (category)", tableName); + sql( + "CREATE TABLE %s (id bigint NOT NULL, category string) USING iceberg PARTITIONED BY (category)", + tableName); Table table = validationCatalog.loadTable(tableIdent); Assert.assertTrue("Table should start unsorted", table.sortOrder().isUnsorted()); @@ -273,10 +296,7 @@ public void testSetWriteDistributedAndLocallyOrderedInverted() { String distributionMode = table.properties().get(TableProperties.WRITE_DISTRIBUTION_MODE); 
Assert.assertEquals("Distribution mode must match", "hash", distributionMode); - SortOrder expected = SortOrder.builderFor(table.schema()) - .withOrderId(1) - .asc("id") - .build(); + SortOrder expected = SortOrder.builderFor(table.schema()).withOrderId(1).asc("id").build(); Assert.assertEquals("Sort order must match", expected, table.sortOrder()); } } diff --git a/spark/v3.1/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestSnapshotTableProcedure.java b/spark/v3.1/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestSnapshotTableProcedure.java index 66fa8e80c515..d8e918d8aadd 100644 --- a/spark/v3.1/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestSnapshotTableProcedure.java +++ b/spark/v3.1/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestSnapshotTableProcedure.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.extensions; import java.io.IOException; @@ -37,12 +36,12 @@ public class TestSnapshotTableProcedure extends SparkExtensionsTestBase { private static final String sourceName = "spark_catalog.default.source"; // Currently we can only Snapshot only out of the Spark Session Catalog - public TestSnapshotTableProcedure(String catalogName, String implementation, Map config) { + public TestSnapshotTableProcedure( + String catalogName, String implementation, Map config) { super(catalogName, implementation, config); } - @Rule - public TemporaryFolder temp = new TemporaryFolder(); + @Rule public TemporaryFolder temp = new TemporaryFolder(); @After public void removeTables() { @@ -53,9 +52,12 @@ public void removeTables() { @Test public void testSnapshot() throws IOException { String location = temp.newFolder().toString(); - sql("CREATE TABLE %s (id bigint NOT NULL, data string) USING parquet LOCATION '%s'", sourceName, location); + sql( + "CREATE TABLE %s (id bigint NOT NULL, data string) USING parquet LOCATION '%s'", + sourceName, location); sql("INSERT INTO TABLE %s VALUES (1, 'a')", sourceName); - Object result = scalarSql("CALL %s.system.snapshot('%s', '%s')", catalogName, sourceName, tableName); + Object result = + scalarSql("CALL %s.system.snapshot('%s', '%s')", catalogName, sourceName, tableName); Assert.assertEquals("Should have added one file", 1L, result); @@ -65,7 +67,8 @@ public void testSnapshot() throws IOException { sql("INSERT INTO TABLE %s VALUES (1, 'a')", tableName); - assertEquals("Should have expected rows", + assertEquals( + "Should have expected rows", ImmutableList.of(row(1L, "a"), row(1L, "a")), sql("SELECT * FROM %s ORDER BY id", tableName)); } @@ -73,11 +76,14 @@ public void testSnapshot() throws IOException { @Test public void testSnapshotWithProperties() throws IOException { String location = temp.newFolder().toString(); - sql("CREATE TABLE %s (id bigint NOT NULL, data string) USING parquet LOCATION '%s'", sourceName, location); + sql( + "CREATE TABLE %s (id bigint NOT NULL, data string) USING parquet LOCATION '%s'", + sourceName, location); sql("INSERT INTO TABLE %s VALUES (1, 'a')", sourceName); - Object result = scalarSql( - "CALL %s.system.snapshot(source_table => '%s', table => '%s', properties => map('foo','bar'))", - catalogName, sourceName, tableName); + Object result = + scalarSql( + "CALL %s.system.snapshot(source_table => '%s', table => '%s', properties => map('foo','bar'))", + catalogName, sourceName, tableName); Assert.assertEquals("Should have added one file", 1L, result); @@ 
-91,30 +97,39 @@ public void testSnapshotWithProperties() throws IOException { sql("INSERT INTO TABLE %s VALUES (1, 'a')", tableName); - assertEquals("Should have expected rows", + assertEquals( + "Should have expected rows", ImmutableList.of(row(1L, "a"), row(1L, "a")), sql("SELECT * FROM %s ORDER BY id", tableName)); } @Test public void testSnapshotWithAlternateLocation() throws IOException { - Assume.assumeTrue("No Snapshoting with Alternate locations with Hadoop Catalogs", !catalogName.contains("hadoop")); + Assume.assumeTrue( + "No Snapshoting with Alternate locations with Hadoop Catalogs", + !catalogName.contains("hadoop")); String location = temp.newFolder().toString(); String snapshotLocation = temp.newFolder().toString(); - sql("CREATE TABLE %s (id bigint NOT NULL, data string) USING parquet LOCATION '%s'", sourceName, location); + sql( + "CREATE TABLE %s (id bigint NOT NULL, data string) USING parquet LOCATION '%s'", + sourceName, location); sql("INSERT INTO TABLE %s VALUES (1, 'a')", sourceName); - Object[] result = sql( - "CALL %s.system.snapshot(source_table => '%s', table => '%s', location => '%s')", - catalogName, sourceName, tableName, snapshotLocation).get(0); + Object[] result = + sql( + "CALL %s.system.snapshot(source_table => '%s', table => '%s', location => '%s')", + catalogName, sourceName, tableName, snapshotLocation) + .get(0); Assert.assertEquals("Should have added one file", 1L, result[0]); String storageLocation = validationCatalog.loadTable(tableIdent).location(); - Assert.assertEquals("Snapshot should be made at specified location", snapshotLocation, storageLocation); + Assert.assertEquals( + "Snapshot should be made at specified location", snapshotLocation, storageLocation); sql("INSERT INTO TABLE %s VALUES (1, 'a')", tableName); - assertEquals("Should have expected rows", + assertEquals( + "Should have expected rows", ImmutableList.of(row(1L, "a"), row(1L, "a")), sql("SELECT * FROM %s ORDER BY id", tableName)); } @@ -122,19 +137,24 @@ public void testSnapshotWithAlternateLocation() throws IOException { @Test public void testDropTable() throws IOException { String location = temp.newFolder().toString(); - sql("CREATE TABLE %s (id bigint NOT NULL, data string) USING parquet LOCATION '%s'", sourceName, location); + sql( + "CREATE TABLE %s (id bigint NOT NULL, data string) USING parquet LOCATION '%s'", + sourceName, location); sql("INSERT INTO TABLE %s VALUES (1, 'a')", sourceName); - Object result = scalarSql("CALL %s.system.snapshot('%s', '%s')", catalogName, sourceName, tableName); + Object result = + scalarSql("CALL %s.system.snapshot('%s', '%s')", catalogName, sourceName, tableName); Assert.assertEquals("Should have added one file", 1L, result); - assertEquals("Should have expected rows", + assertEquals( + "Should have expected rows", ImmutableList.of(row(1L, "a")), sql("SELECT * FROM %s", tableName)); sql("DROP TABLE %s", tableName); - assertEquals("Source table should be intact", + assertEquals( + "Source table should be intact", ImmutableList.of(row(1L, "a")), sql("SELECT * FROM %s", sourceName)); } @@ -142,50 +162,70 @@ public void testDropTable() throws IOException { @Test public void testSnapshotWithConflictingProps() throws IOException { String location = temp.newFolder().toString(); - sql("CREATE TABLE %s (id bigint NOT NULL, data string) USING parquet LOCATION '%s'", sourceName, location); + sql( + "CREATE TABLE %s (id bigint NOT NULL, data string) USING parquet LOCATION '%s'", + sourceName, location); sql("INSERT INTO TABLE %s VALUES (1, 'a')", 
sourceName); - Object result = scalarSql( - "CALL %s.system.snapshot(" + - "source_table => '%s'," + - "table => '%s'," + - "properties => map('%s', 'true', 'snapshot', 'false'))", - catalogName, sourceName, tableName, TableProperties.GC_ENABLED); + Object result = + scalarSql( + "CALL %s.system.snapshot(" + + "source_table => '%s'," + + "table => '%s'," + + "properties => map('%s', 'true', 'snapshot', 'false'))", + catalogName, sourceName, tableName, TableProperties.GC_ENABLED); Assert.assertEquals("Should have added one file", 1L, result); - assertEquals("Should have expected rows", + assertEquals( + "Should have expected rows", ImmutableList.of(row(1L, "a")), sql("SELECT * FROM %s", tableName)); Table table = validationCatalog.loadTable(tableIdent); Map props = table.properties(); Assert.assertEquals("Should override user value", "true", props.get("snapshot")); - Assert.assertEquals("Should override user value", "false", props.get(TableProperties.GC_ENABLED)); + Assert.assertEquals( + "Should override user value", "false", props.get(TableProperties.GC_ENABLED)); } @Test public void testInvalidSnapshotsCases() throws IOException { String location = temp.newFolder().toString(); - sql("CREATE TABLE %s (id bigint NOT NULL, data string) USING parquet LOCATION '%s'", sourceName, location); - - AssertHelpers.assertThrows("Should reject calls without all required args", - AnalysisException.class, "Missing required parameters", + sql( + "CREATE TABLE %s (id bigint NOT NULL, data string) USING parquet LOCATION '%s'", + sourceName, location); + + AssertHelpers.assertThrows( + "Should reject calls without all required args", + AnalysisException.class, + "Missing required parameters", () -> sql("CALL %s.system.snapshot('foo')", catalogName)); - AssertHelpers.assertThrows("Should reject calls with invalid arg types", - AnalysisException.class, "Wrong arg type", + AssertHelpers.assertThrows( + "Should reject calls with invalid arg types", + AnalysisException.class, + "Wrong arg type", () -> sql("CALL %s.system.snapshot('n', 't', map('foo', 'bar'))", catalogName)); - AssertHelpers.assertThrows("Should reject calls with invalid map args", - AnalysisException.class, "cannot resolve 'map", - () -> sql("CALL %s.system.snapshot('%s', 'fable', 'loc', map(2, 1, 1))", catalogName, sourceName)); - - AssertHelpers.assertThrows("Should reject calls with empty table identifier", - IllegalArgumentException.class, "Cannot handle an empty identifier", + AssertHelpers.assertThrows( + "Should reject calls with invalid map args", + AnalysisException.class, + "cannot resolve 'map", + () -> + sql( + "CALL %s.system.snapshot('%s', 'fable', 'loc', map(2, 1, 1))", + catalogName, sourceName)); + + AssertHelpers.assertThrows( + "Should reject calls with empty table identifier", + IllegalArgumentException.class, + "Cannot handle an empty identifier", () -> sql("CALL %s.system.snapshot('', 'dest')", catalogName)); - AssertHelpers.assertThrows("Should reject calls with empty table identifier", - IllegalArgumentException.class, "Cannot handle an empty identifier", + AssertHelpers.assertThrows( + "Should reject calls with empty table identifier", + IllegalArgumentException.class, + "Cannot handle an empty identifier", () -> sql("CALL %s.system.snapshot('src', '')", catalogName)); } } diff --git a/spark/v3.1/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestUpdate.java b/spark/v3.1/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestUpdate.java index 1b2893f8718d..d1b09775c458 100644 --- 
a/spark/v3.1/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestUpdate.java +++ b/spark/v3.1/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestUpdate.java @@ -16,9 +16,13 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.extensions; +import static org.apache.iceberg.TableProperties.PARQUET_ROW_GROUP_SIZE_BYTES; +import static org.apache.iceberg.TableProperties.SPLIT_SIZE; +import static org.apache.iceberg.TableProperties.UPDATE_ISOLATION_LEVEL; +import static org.apache.spark.sql.functions.lit; + import java.util.Arrays; import java.util.List; import java.util.Map; @@ -53,15 +57,15 @@ import org.junit.Ignore; import org.junit.Test; -import static org.apache.iceberg.TableProperties.PARQUET_ROW_GROUP_SIZE_BYTES; -import static org.apache.iceberg.TableProperties.SPLIT_SIZE; -import static org.apache.iceberg.TableProperties.UPDATE_ISOLATION_LEVEL; -import static org.apache.spark.sql.functions.lit; - public abstract class TestUpdate extends SparkRowLevelOperationsTestBase { - public TestUpdate(String catalogName, String implementation, Map config, - String fileFormat, boolean vectorized, String distributionMode) { + public TestUpdate( + String catalogName, + String implementation, + Map config, + String fileFormat, + boolean vectorized, + String distributionMode) { super(catalogName, implementation, config, fileFormat, vectorized, distributionMode); } @@ -91,7 +95,8 @@ public void testExplain() { Table table = validationCatalog.loadTable(tableIdent); Assert.assertEquals("Should have 1 snapshot", 1, Iterables.size(table.snapshots())); - assertEquals("Should have expected rows", + assertEquals( + "Should have expected rows", ImmutableList.of(row(1, "hr"), row(2, "hardware"), row(null, "hr")), sql("SELECT * FROM %s ORDER BY id ASC NULLS LAST", tableName)); } @@ -106,7 +111,8 @@ public void testUpdateEmptyTable() { Table table = validationCatalog.loadTable(tableIdent); Assert.assertEquals("Should have 2 snapshots", 2, Iterables.size(table.snapshots())); - assertEquals("Should have expected rows", + assertEquals( + "Should have expected rows", ImmutableList.of(), sql("SELECT * FROM %s ORDER BY id", tableName)); } @@ -121,7 +127,8 @@ public void testUpdateWithAlias() { Table table = validationCatalog.loadTable(tableIdent); Assert.assertEquals("Should have 2 snapshots", 2, Iterables.size(table.snapshots())); - assertEquals("Should have expected rows", + assertEquals( + "Should have expected rows", ImmutableList.of(row(1, "invalid")), sql("SELECT * FROM %s", tableName)); } @@ -134,7 +141,8 @@ public void testUpdateAlignsAssignments() { sql("UPDATE %s SET `c2` = c2 - 2, c1 = `c1` - 1 WHERE id <=> 1", tableName); - assertEquals("Should have expected rows", + assertEquals( + "Should have expected rows", ImmutableList.of(row(1, 10, 109), row(2, 22, 222)), sql("SELECT * FROM %s ORDER BY id", tableName)); } @@ -148,7 +156,8 @@ public void testUpdateWithUnsupportedPartitionPredicate() { sql("UPDATE %s t SET `t`.`id` = -1 WHERE t.dep LIKE '%%r' ", tableName); - assertEquals("Should have expected rows", + assertEquals( + "Should have expected rows", ImmutableList.of(row(-1, "hr"), row(1, "software")), sql("SELECT * FROM %s ORDER BY id", tableName)); } @@ -158,12 +167,10 @@ public void testUpdateWithDynamicFileFiltering() { createAndInitTable("id INT, dep STRING"); sql("ALTER TABLE %s ADD PARTITION FIELD dep", tableName); - append(tableName, - "{ \"id\": 1, \"dep\": \"hr\" }\n" + - "{ \"id\": 3, 
\"dep\": \"hr\" }"); - append(tableName, - "{ \"id\": 1, \"dep\": \"hardware\" }\n" + - "{ \"id\": 2, \"dep\": \"hardware\" }"); + append(tableName, "{ \"id\": 1, \"dep\": \"hr\" }\n" + "{ \"id\": 3, \"dep\": \"hr\" }"); + append( + tableName, + "{ \"id\": 1, \"dep\": \"hardware\" }\n" + "{ \"id\": 2, \"dep\": \"hardware\" }"); sql("UPDATE %s SET id = cast('-1' AS INT) WHERE id = 2", tableName); @@ -173,7 +180,8 @@ public void testUpdateWithDynamicFileFiltering() { Snapshot currentSnapshot = table.currentSnapshot(); validateSnapshot(currentSnapshot, "overwrite", "1", "1", "1"); - assertEquals("Should have expected rows", + assertEquals( + "Should have expected rows", ImmutableList.of(row(-1, "hardware"), row(1, "hardware"), row(1, "hr"), row(3, "hr")), sql("SELECT * FROM %s ORDER BY id, dep", tableName)); } @@ -193,7 +201,8 @@ public void testUpdateNonExistingRecords() { Snapshot currentSnapshot = table.currentSnapshot(); validateSnapshot(currentSnapshot, "overwrite", "0", null, null); - assertEquals("Should have expected rows", + assertEquals( + "Should have expected rows", ImmutableList.of(row(1, "hr"), row(2, "hardware"), row(null, "hr")), sql("SELECT * FROM %s ORDER BY id ASC NULLS LAST", tableName)); } @@ -217,7 +226,8 @@ public void testUpdateWithoutCondition() { Snapshot currentSnapshot = table.currentSnapshot(); validateSnapshot(currentSnapshot, "overwrite", "2", "3", "2"); - assertEquals("Should have expected rows", + assertEquals( + "Should have expected rows", ImmutableList.of(row(-1, "hardware"), row(-1, "hr"), row(-1, "hr")), sql("SELECT * FROM %s ORDER BY dep ASC", tableName)); } @@ -226,26 +236,30 @@ public void testUpdateWithoutCondition() { public void testUpdateWithNullConditions() { createAndInitTable("id INT, dep STRING"); - append(tableName, - "{ \"id\": 0, \"dep\": null }\n" + - "{ \"id\": 1, \"dep\": \"hr\" }\n" + - "{ \"id\": 2, \"dep\": \"hardware\" }"); + append( + tableName, + "{ \"id\": 0, \"dep\": null }\n" + + "{ \"id\": 1, \"dep\": \"hr\" }\n" + + "{ \"id\": 2, \"dep\": \"hardware\" }"); // should not update any rows as null is never equal to null sql("UPDATE %s SET id = -1 WHERE dep = NULL", tableName); - assertEquals("Should have expected rows", + assertEquals( + "Should have expected rows", ImmutableList.of(row(0, null), row(1, "hr"), row(2, "hardware")), sql("SELECT * FROM %s ORDER BY id", tableName)); // should not update any rows the condition does not match any records sql("UPDATE %s SET id = -1 WHERE dep = 'software'", tableName); - assertEquals("Should have expected rows", + assertEquals( + "Should have expected rows", ImmutableList.of(row(0, null), row(1, "hr"), row(2, "hardware")), sql("SELECT * FROM %s ORDER BY id", tableName)); // should update one matching row with a null-safe condition sql("UPDATE %s SET dep = 'invalid', id = -1 WHERE dep <=> NULL", tableName); - assertEquals("Should have expected rows", + assertEquals( + "Should have expected rows", ImmutableList.of(row(-1, "invalid"), row(1, "hr"), row(2, "hardware")), sql("SELECT * FROM %s ORDER BY id", tableName)); } @@ -254,23 +268,27 @@ public void testUpdateWithNullConditions() { public void testUpdateWithInAndNotInConditions() { createAndInitTable("id INT, dep STRING"); - append(tableName, - "{ \"id\": 1, \"dep\": \"hr\" }\n" + - "{ \"id\": 2, \"dep\": \"hardware\" }\n" + - "{ \"id\": null, \"dep\": \"hr\" }"); + append( + tableName, + "{ \"id\": 1, \"dep\": \"hr\" }\n" + + "{ \"id\": 2, \"dep\": \"hardware\" }\n" + + "{ \"id\": null, \"dep\": \"hr\" }"); sql("UPDATE %s SET id = -1 
WHERE id IN (1, null)", tableName); - assertEquals("Should have expected rows", + assertEquals( + "Should have expected rows", ImmutableList.of(row(-1, "hr"), row(2, "hardware"), row(null, "hr")), sql("SELECT * FROM %s ORDER BY id ASC NULLS LAST", tableName)); sql("UPDATE %s SET id = 100 WHERE id NOT IN (null, 1)", tableName); - assertEquals("Should have expected rows", + assertEquals( + "Should have expected rows", ImmutableList.of(row(-1, "hr"), row(2, "hardware"), row(null, "hr")), sql("SELECT * FROM %s ORDER BY id ASC NULLS LAST", tableName)); sql("UPDATE %s SET id = 100 WHERE id NOT IN (1, 10)", tableName); - assertEquals("Should have expected rows", + assertEquals( + "Should have expected rows", ImmutableList.of(row(100, "hardware"), row(100, "hr"), row(null, "hr")), sql("SELECT * FROM %s ORDER BY id ASC NULLS LAST, dep", tableName)); } @@ -282,16 +300,20 @@ public void testUpdateWithMultipleRowGroupsParquet() throws NoSuchTableException createAndInitTable("id INT, dep STRING"); sql("ALTER TABLE %s ADD PARTITION FIELD dep", tableName); - sql("ALTER TABLE %s SET TBLPROPERTIES('%s' '%d')", tableName, PARQUET_ROW_GROUP_SIZE_BYTES, 100); + sql( + "ALTER TABLE %s SET TBLPROPERTIES('%s' '%d')", + tableName, PARQUET_ROW_GROUP_SIZE_BYTES, 100); sql("ALTER TABLE %s SET TBLPROPERTIES('%s' '%d')", tableName, SPLIT_SIZE, 100); List ids = Lists.newArrayListWithCapacity(200); for (int id = 1; id <= 200; id++) { ids.add(id); } - Dataset df = spark.createDataset(ids, Encoders.INT()) - .withColumnRenamed("value", "id") - .withColumn("dep", lit("hr")); + Dataset df = + spark + .createDataset(ids, Encoders.INT()) + .withColumnRenamed("value", "id") + .withColumn("dep", lit("hr")); df.coalesce(1).writeTo(tableName).append(); Assert.assertEquals(200, spark.table(tableName).count()); @@ -304,27 +326,33 @@ public void testUpdateWithMultipleRowGroupsParquet() throws NoSuchTableException @Test public void testUpdateNestedStructFields() { - createAndInitTable("id INT, s STRUCT,m:MAP>>", + createAndInitTable( + "id INT, s STRUCT,m:MAP>>", "{ \"id\": 1, \"s\": { \"c1\": 2, \"c2\": { \"a\": [1,2], \"m\": { \"a\": \"b\"} } } } }"); // update primitive, array, map columns inside a struct sql("UPDATE %s SET s.c1 = -1, s.c2.m = map('k', 'v'), s.c2.a = array(-1)", tableName); - assertEquals("Output should match", + assertEquals( + "Output should match", ImmutableList.of(row(1, row(-1, row(ImmutableList.of(-1), ImmutableMap.of("k", "v"))))), sql("SELECT * FROM %s", tableName)); // set primitive, array, map columns to NULL (proper casts should be in place) sql("UPDATE %s SET s.c1 = NULL, s.c2 = NULL WHERE id IN (1)", tableName); - assertEquals("Output should match", + assertEquals( + "Output should match", ImmutableList.of(row(1, row(null, null))), sql("SELECT * FROM %s", tableName)); // update all fields in a struct - sql("UPDATE %s SET s = named_struct('c1', 1, 'c2', named_struct('a', array(1), 'm', null))", tableName); + sql( + "UPDATE %s SET s = named_struct('c1', 1, 'c2', named_struct('a', array(1), 'm', null))", + tableName); - assertEquals("Output should match", + assertEquals( + "Output should match", ImmutableList.of(row(1, row(1, row(ImmutableList.of(1), null)))), sql("SELECT * FROM %s", tableName)); } @@ -334,29 +362,33 @@ public void testUpdateWithUserDefinedDistribution() { createAndInitTable("id INT, c2 INT, c3 INT"); sql("ALTER TABLE %s ADD PARTITION FIELD bucket(8, c3)", tableName); - append(tableName, - "{ \"id\": 1, \"c2\": 11, \"c3\": 1 }\n" + - "{ \"id\": 2, \"c2\": 22, \"c3\": 1 }\n" + - "{ \"id\": 
3, \"c2\": 33, \"c3\": 1 }"); + append( + tableName, + "{ \"id\": 1, \"c2\": 11, \"c3\": 1 }\n" + + "{ \"id\": 2, \"c2\": 22, \"c3\": 1 }\n" + + "{ \"id\": 3, \"c2\": 33, \"c3\": 1 }"); // request a global sort sql("ALTER TABLE %s WRITE ORDERED BY c2", tableName); sql("UPDATE %s SET c2 = -22 WHERE id NOT IN (1, 3)", tableName); - assertEquals("Should have expected rows", + assertEquals( + "Should have expected rows", ImmutableList.of(row(1, 11, 1), row(2, -22, 1), row(3, 33, 1)), sql("SELECT * FROM %s ORDER BY id ASC NULLS LAST", tableName)); // request a local sort sql("ALTER TABLE %s WRITE LOCALLY ORDERED BY id", tableName); sql("UPDATE %s SET c2 = -33 WHERE id = 3", tableName); - assertEquals("Should have expected rows", + assertEquals( + "Should have expected rows", ImmutableList.of(row(1, 11, 1), row(2, -22, 1), row(3, -33, 1)), sql("SELECT * FROM %s ORDER BY id ASC NULLS LAST", tableName)); // request a hash distribution + local sort sql("ALTER TABLE %s WRITE DISTRIBUTED BY PARTITION ORDERED BY id", tableName); sql("UPDATE %s SET c2 = -11 WHERE id = 1", tableName); - assertEquals("Should have expected rows", + assertEquals( + "Should have expected rows", ImmutableList.of(row(1, -11, 1), row(2, -22, 1), row(3, -33, 1)), sql("SELECT * FROM %s ORDER BY id ASC NULLS LAST", tableName)); } @@ -368,34 +400,41 @@ public synchronized void testUpdateWithSerializableIsolation() throws Interrupte createAndInitTable("id INT, dep STRING"); - sql("ALTER TABLE %s SET TBLPROPERTIES('%s' '%s')", tableName, UPDATE_ISOLATION_LEVEL, "serializable"); + sql( + "ALTER TABLE %s SET TBLPROPERTIES('%s' '%s')", + tableName, UPDATE_ISOLATION_LEVEL, "serializable"); - ExecutorService executorService = MoreExecutors.getExitingExecutorService( - (ThreadPoolExecutor) Executors.newFixedThreadPool(2)); + ExecutorService executorService = + MoreExecutors.getExitingExecutorService( + (ThreadPoolExecutor) Executors.newFixedThreadPool(2)); AtomicInteger barrier = new AtomicInteger(0); // update thread - Future updateFuture = executorService.submit(() -> { - for (int numOperations = 0; numOperations < Integer.MAX_VALUE; numOperations++) { - while (barrier.get() < numOperations * 2) { - sleep(10); - } - sql("UPDATE %s SET id = -1 WHERE id = 1", tableName); - barrier.incrementAndGet(); - } - }); + Future updateFuture = + executorService.submit( + () -> { + for (int numOperations = 0; numOperations < Integer.MAX_VALUE; numOperations++) { + while (barrier.get() < numOperations * 2) { + sleep(10); + } + sql("UPDATE %s SET id = -1 WHERE id = 1", tableName); + barrier.incrementAndGet(); + } + }); // append thread - Future appendFuture = executorService.submit(() -> { - for (int numOperations = 0; numOperations < Integer.MAX_VALUE; numOperations++) { - while (barrier.get() < numOperations * 2) { - sleep(10); - } - sql("INSERT INTO TABLE %s VALUES (1, 'hr')", tableName); - barrier.incrementAndGet(); - } - }); + Future appendFuture = + executorService.submit( + () -> { + for (int numOperations = 0; numOperations < Integer.MAX_VALUE; numOperations++) { + while (barrier.get() < numOperations * 2) { + sleep(10); + } + sql("INSERT INTO TABLE %s VALUES (1, 'hr')", tableName); + barrier.incrementAndGet(); + } + }); try { updateFuture.get(); @@ -406,7 +445,8 @@ public synchronized void testUpdateWithSerializableIsolation() throws Interrupte Throwable validationException = sparkException.getCause(); Assert.assertThat(validationException, CoreMatchers.instanceOf(ValidationException.class)); String errMsg = validationException.getMessage(); - 
Assert.assertThat(errMsg, CoreMatchers.containsString("Found conflicting files that can contain")); + Assert.assertThat( + errMsg, CoreMatchers.containsString("Found conflicting files that can contain")); } finally { appendFuture.cancel(true); } @@ -416,40 +456,48 @@ public synchronized void testUpdateWithSerializableIsolation() throws Interrupte } @Test - public synchronized void testUpdateWithSnapshotIsolation() throws InterruptedException, ExecutionException { + public synchronized void testUpdateWithSnapshotIsolation() + throws InterruptedException, ExecutionException { // cannot run tests with concurrency for Hadoop tables without atomic renames Assume.assumeFalse(catalogName.equalsIgnoreCase("testhadoop")); createAndInitTable("id INT, dep STRING"); - sql("ALTER TABLE %s SET TBLPROPERTIES('%s' '%s')", tableName, UPDATE_ISOLATION_LEVEL, "snapshot"); + sql( + "ALTER TABLE %s SET TBLPROPERTIES('%s' '%s')", + tableName, UPDATE_ISOLATION_LEVEL, "snapshot"); - ExecutorService executorService = MoreExecutors.getExitingExecutorService( - (ThreadPoolExecutor) Executors.newFixedThreadPool(2)); + ExecutorService executorService = + MoreExecutors.getExitingExecutorService( + (ThreadPoolExecutor) Executors.newFixedThreadPool(2)); AtomicInteger barrier = new AtomicInteger(0); // update thread - Future updateFuture = executorService.submit(() -> { - for (int numOperations = 0; numOperations < 20; numOperations++) { - while (barrier.get() < numOperations * 2) { - sleep(10); - } - sql("UPDATE %s SET id = -1 WHERE id = 1", tableName); - barrier.incrementAndGet(); - } - }); + Future updateFuture = + executorService.submit( + () -> { + for (int numOperations = 0; numOperations < 20; numOperations++) { + while (barrier.get() < numOperations * 2) { + sleep(10); + } + sql("UPDATE %s SET id = -1 WHERE id = 1", tableName); + barrier.incrementAndGet(); + } + }); // append thread - Future appendFuture = executorService.submit(() -> { - for (int numOperations = 0; numOperations < 20; numOperations++) { - while (barrier.get() < numOperations * 2) { - sleep(10); - } - sql("INSERT INTO TABLE %s VALUES (1, 'hr')", tableName); - barrier.incrementAndGet(); - } - }); + Future appendFuture = + executorService.submit( + () -> { + for (int numOperations = 0; numOperations < 20; numOperations++) { + while (barrier.get() < numOperations * 2) { + sleep(10); + } + sql("INSERT INTO TABLE %s VALUES (1, 'hr')", tableName); + barrier.incrementAndGet(); + } + }); try { updateFuture.get(); @@ -467,7 +515,8 @@ public void testUpdateWithInferredCasts() { sql("UPDATE %s SET s = -1 WHERE id = 1", tableName); - assertEquals("Should have expected rows", + assertEquals( + "Should have expected rows", ImmutableList.of(row(1, "-1")), sql("SELECT * FROM %s", tableName)); } @@ -478,7 +527,8 @@ public void testUpdateModifiesNullStruct() { sql("UPDATE %s SET s.n1 = -1 WHERE id = 1", tableName); - assertEquals("Should have expected rows", + assertEquals( + "Should have expected rows", ImmutableList.of(row(1, row(-1, null))), sql("SELECT * FROM %s", tableName)); } @@ -488,20 +538,19 @@ public void testUpdateRefreshesRelationCache() { createAndInitTable("id INT, dep STRING"); sql("ALTER TABLE %s ADD PARTITION FIELD dep", tableName); - append(tableName, - "{ \"id\": 1, \"dep\": \"hr\" }\n" + - "{ \"id\": 3, \"dep\": \"hr\" }"); + append(tableName, "{ \"id\": 1, \"dep\": \"hr\" }\n" + "{ \"id\": 3, \"dep\": \"hr\" }"); - append(tableName, - "{ \"id\": 1, \"dep\": \"hardware\" }\n" + - "{ \"id\": 2, \"dep\": \"hardware\" }"); + append( + tableName, + 
"{ \"id\": 1, \"dep\": \"hardware\" }\n" + "{ \"id\": 2, \"dep\": \"hardware\" }"); Dataset query = spark.sql("SELECT * FROM " + tableName + " WHERE id = 1"); query.createOrReplaceTempView("tmp"); spark.sql("CACHE TABLE tmp"); - assertEquals("View should have correct data", + assertEquals( + "View should have correct data", ImmutableList.of(row(1, "hardware"), row(1, "hr")), sql("SELECT * FROM tmp ORDER BY id, dep")); @@ -513,11 +562,13 @@ public void testUpdateRefreshesRelationCache() { Snapshot currentSnapshot = table.currentSnapshot(); validateSnapshot(currentSnapshot, "overwrite", "2", "2", "2"); - assertEquals("Should have expected rows", + assertEquals( + "Should have expected rows", ImmutableList.of(row(-1, "hardware"), row(-1, "hr"), row(2, "hardware"), row(3, "hr")), sql("SELECT * FROM %s ORDER BY id, dep", tableName)); - assertEquals("Should refresh the relation cache", + assertEquals( + "Should refresh the relation cache", ImmutableList.of(), sql("SELECT * FROM tmp ORDER BY id, dep")); @@ -528,36 +579,47 @@ public void testUpdateRefreshesRelationCache() { public void testUpdateWithInSubquery() { createAndInitTable("id INT, dep STRING"); - append(tableName, - "{ \"id\": 1, \"dep\": \"hr\" }\n" + - "{ \"id\": 2, \"dep\": \"hardware\" }\n" + - "{ \"id\": null, \"dep\": \"hr\" }"); + append( + tableName, + "{ \"id\": 1, \"dep\": \"hr\" }\n" + + "{ \"id\": 2, \"dep\": \"hardware\" }\n" + + "{ \"id\": null, \"dep\": \"hr\" }"); createOrReplaceView("updated_id", Arrays.asList(0, 1, null), Encoders.INT()); createOrReplaceView("updated_dep", Arrays.asList("software", "hr"), Encoders.STRING()); - sql("UPDATE %s SET id = -1 WHERE " + - "id IN (SELECT * FROM updated_id) AND " + - "dep IN (SELECT * from updated_dep)", tableName); - assertEquals("Should have expected rows", + sql( + "UPDATE %s SET id = -1 WHERE " + + "id IN (SELECT * FROM updated_id) AND " + + "dep IN (SELECT * from updated_dep)", + tableName); + assertEquals( + "Should have expected rows", ImmutableList.of(row(-1, "hr"), row(2, "hardware"), row(null, "hr")), sql("SELECT * FROM %s ORDER BY id ASC NULLS LAST", tableName)); - sql("UPDATE %s SET id = 5 WHERE id IS NULL OR id IN (SELECT value + 1 FROM updated_id)", tableName); - assertEquals("Should have expected rows", + sql( + "UPDATE %s SET id = 5 WHERE id IS NULL OR id IN (SELECT value + 1 FROM updated_id)", + tableName); + assertEquals( + "Should have expected rows", ImmutableList.of(row(-1, "hr"), row(5, "hardware"), row(5, "hr")), sql("SELECT * FROM %s ORDER BY id, dep", tableName)); - append(tableName, - "{ \"id\": null, \"dep\": \"hr\" }\n" + - "{ \"id\": 2, \"dep\": \"hr\" }"); - assertEquals("Should have expected rows", - ImmutableList.of(row(-1, "hr"), row(2, "hr"), row(5, "hardware"), row(5, "hr"), row(null, "hr")), + append(tableName, "{ \"id\": null, \"dep\": \"hr\" }\n" + "{ \"id\": 2, \"dep\": \"hr\" }"); + assertEquals( + "Should have expected rows", + ImmutableList.of( + row(-1, "hr"), row(2, "hr"), row(5, "hardware"), row(5, "hr"), row(null, "hr")), sql("SELECT * FROM %s ORDER BY id ASC NULLS LAST, dep", tableName)); - sql("UPDATE %s SET id = 10 WHERE id IN (SELECT value + 2 FROM updated_id) AND dep = 'hr'", tableName); - assertEquals("Should have expected rows", - ImmutableList.of(row(-1, "hr"), row(5, "hardware"), row(5, "hr"), row(10, "hr"), row(null, "hr")), + sql( + "UPDATE %s SET id = 10 WHERE id IN (SELECT value + 2 FROM updated_id) AND dep = 'hr'", + tableName); + assertEquals( + "Should have expected rows", + ImmutableList.of( + row(-1, "hr"), row(5, 
"hardware"), row(5, "hr"), row(10, "hr"), row(null, "hr")), sql("SELECT * FROM %s ORDER BY id ASC NULLS LAST, dep", tableName)); } @@ -568,12 +630,10 @@ public void testUpdateWithInSubqueryAndDynamicFileFiltering() { sql("ALTER TABLE %s WRITE DISTRIBUTED BY PARTITION", tableName); - append(tableName, - "{ \"id\": 1, \"dep\": \"hr\" }\n" + - "{ \"id\": 3, \"dep\": \"hr\" }"); - append(tableName, - "{ \"id\": 1, \"dep\": \"hardware\" }\n" + - "{ \"id\": 2, \"dep\": \"hardware\" }"); + append(tableName, "{ \"id\": 1, \"dep\": \"hr\" }\n" + "{ \"id\": 3, \"dep\": \"hr\" }"); + append( + tableName, + "{ \"id\": 1, \"dep\": \"hardware\" }\n" + "{ \"id\": 2, \"dep\": \"hardware\" }"); createOrReplaceView("updated_id", Arrays.asList(-1, 2), Encoders.INT()); @@ -585,7 +645,8 @@ public void testUpdateWithInSubqueryAndDynamicFileFiltering() { Snapshot currentSnapshot = table.currentSnapshot(); validateSnapshot(currentSnapshot, "overwrite", "1", "1", "1"); - assertEquals("Should have expected rows", + assertEquals( + "Should have expected rows", ImmutableList.of(row(-1, "hardware"), row(1, "hardware"), row(1, "hr"), row(3, "hr")), sql("SELECT * FROM %s ORDER BY id, dep", tableName)); } @@ -594,23 +655,26 @@ public void testUpdateWithInSubqueryAndDynamicFileFiltering() { public void testUpdateWithSelfSubquery() { createAndInitTable("id INT, dep STRING"); - append(tableName, - "{ \"id\": 1, \"dep\": \"hr\" }\n" + - "{ \"id\": 2, \"dep\": \"hr\" }"); + append(tableName, "{ \"id\": 1, \"dep\": \"hr\" }\n" + "{ \"id\": 2, \"dep\": \"hr\" }"); sql("UPDATE %s SET dep = 'x' WHERE id IN (SELECT id + 1 FROM %s)", tableName, tableName); - assertEquals("Should have expected rows", + assertEquals( + "Should have expected rows", ImmutableList.of(row(1, "hr"), row(2, "x")), sql("SELECT * FROM %s ORDER BY id", tableName)); - sql("UPDATE %s SET dep = 'y' WHERE " + - "id = (SELECT count(*) FROM (SELECT DISTINCT id FROM %s) AS t)", tableName, tableName); - assertEquals("Should have expected rows", + sql( + "UPDATE %s SET dep = 'y' WHERE " + + "id = (SELECT count(*) FROM (SELECT DISTINCT id FROM %s) AS t)", + tableName, tableName); + assertEquals( + "Should have expected rows", ImmutableList.of(row(1, "hr"), row(2, "y")), sql("SELECT * FROM %s ORDER BY id", tableName)); sql("UPDATE %s SET id = (SELECT id - 2 FROM %s WHERE id = 1)", tableName, tableName); - assertEquals("Should have expected rows", + assertEquals( + "Should have expected rows", ImmutableList.of(row(-1, "hr"), row(-1, "y")), sql("SELECT * FROM %s ORDER BY id, dep", tableName)); } @@ -619,16 +683,21 @@ public void testUpdateWithSelfSubquery() { public void testUpdateWithMultiColumnInSubquery() { createAndInitTable("id INT, dep STRING"); - append(tableName, - "{ \"id\": 1, \"dep\": \"hr\" }\n" + - "{ \"id\": 2, \"dep\": \"hardware\" }\n" + - "{ \"id\": null, \"dep\": \"hr\" }"); + append( + tableName, + "{ \"id\": 1, \"dep\": \"hr\" }\n" + + "{ \"id\": 2, \"dep\": \"hardware\" }\n" + + "{ \"id\": null, \"dep\": \"hr\" }"); - List deletedEmployees = Arrays.asList(new Employee(null, "hr"), new Employee(1, "hr")); + List deletedEmployees = + Arrays.asList(new Employee(null, "hr"), new Employee(1, "hr")); createOrReplaceView("deleted_employee", deletedEmployees, Encoders.bean(Employee.class)); - sql("UPDATE %s SET dep = 'x', id = -1 WHERE (id, dep) IN (SELECT id, dep FROM deleted_employee)", tableName); - assertEquals("Should have expected rows", + sql( + "UPDATE %s SET dep = 'x', id = -1 WHERE (id, dep) IN (SELECT id, dep FROM deleted_employee)", + tableName); 
+ assertEquals( + "Should have expected rows", ImmutableList.of(row(-1, "x"), row(2, "hardware"), row(null, "hr")), sql("SELECT * FROM %s ORDER BY id ASC NULLS LAST", tableName)); } @@ -637,27 +706,35 @@ public void testUpdateWithMultiColumnInSubquery() { public void testUpdateWithNotInSubquery() { createAndInitTable("id INT, dep STRING"); - append(tableName, - "{ \"id\": 1, \"dep\": \"hr\" }\n" + - "{ \"id\": 2, \"dep\": \"hardware\" }\n" + - "{ \"id\": null, \"dep\": \"hr\" }"); + append( + tableName, + "{ \"id\": 1, \"dep\": \"hr\" }\n" + + "{ \"id\": 2, \"dep\": \"hardware\" }\n" + + "{ \"id\": null, \"dep\": \"hr\" }"); createOrReplaceView("updated_id", Arrays.asList(-1, -2, null), Encoders.INT()); createOrReplaceView("updated_dep", Arrays.asList("software", "hr"), Encoders.STRING()); // the file filter subquery (nested loop lef-anti join) returns 0 records sql("UPDATE %s SET id = -1 WHERE id NOT IN (SELECT * FROM updated_id)", tableName); - assertEquals("Should have expected rows", + assertEquals( + "Should have expected rows", ImmutableList.of(row(1, "hr"), row(2, "hardware"), row(null, "hr")), sql("SELECT * FROM %s ORDER BY id ASC NULLS LAST", tableName)); - sql("UPDATE %s SET id = -1 WHERE id NOT IN (SELECT * FROM updated_id WHERE value IS NOT NULL)", tableName); - assertEquals("Should have expected rows", + sql( + "UPDATE %s SET id = -1 WHERE id NOT IN (SELECT * FROM updated_id WHERE value IS NOT NULL)", + tableName); + assertEquals( + "Should have expected rows", ImmutableList.of(row(-1, "hardware"), row(-1, "hr"), row(null, "hr")), sql("SELECT * FROM %s ORDER BY id ASC NULLS LAST, dep", tableName)); - sql("UPDATE %s SET id = 5 WHERE id NOT IN (SELECT * FROM updated_id) OR dep IN ('software', 'hr')", tableName); - assertEquals("Should have expected rows", + sql( + "UPDATE %s SET id = 5 WHERE id NOT IN (SELECT * FROM updated_id) OR dep IN ('software', 'hr')", + tableName); + assertEquals( + "Should have expected rows", ImmutableList.of(row(-1, "hardware"), row(5, "hr"), row(5, "hr")), sql("SELECT * FROM %s ORDER BY id ASC NULLS LAST, dep", tableName)); } @@ -668,8 +745,10 @@ public void testUpdateWithNotInSubqueryNotSupported() { createOrReplaceView("updated_id", Arrays.asList(-1, -2, null), Encoders.INT()); - AssertHelpers.assertThrows("Should complain about NOT IN subquery", - AnalysisException.class, "Null-aware predicate subqueries are not currently supported", + AssertHelpers.assertThrows( + "Should complain about NOT IN subquery", + AnalysisException.class, + "Null-aware predicate subqueries are not currently supported", () -> sql("UPDATE %s SET id = -1 WHERE id NOT IN (SELECT * FROM updated_id)", tableName)); } @@ -677,36 +756,49 @@ public void testUpdateWithNotInSubqueryNotSupported() { public void testUpdateWithExistSubquery() { createAndInitTable("id INT, dep STRING"); - append(tableName, - "{ \"id\": 1, \"dep\": \"hr\" }\n" + - "{ \"id\": 2, \"dep\": \"hardware\" }\n" + - "{ \"id\": null, \"dep\": \"hr\" }"); + append( + tableName, + "{ \"id\": 1, \"dep\": \"hr\" }\n" + + "{ \"id\": 2, \"dep\": \"hardware\" }\n" + + "{ \"id\": null, \"dep\": \"hr\" }"); createOrReplaceView("updated_id", Arrays.asList(-1, -2, null), Encoders.INT()); createOrReplaceView("updated_dep", Arrays.asList("hr", null), Encoders.STRING()); - sql("UPDATE %s t SET id = -1 WHERE EXISTS (SELECT 1 FROM updated_id u WHERE t.id = u.value)", tableName); - assertEquals("Should have expected rows", + sql( + "UPDATE %s t SET id = -1 WHERE EXISTS (SELECT 1 FROM updated_id u WHERE t.id = u.value)", + 
tableName); + assertEquals( + "Should have expected rows", ImmutableList.of(row(1, "hr"), row(2, "hardware"), row(null, "hr")), sql("SELECT * FROM %s ORDER BY id ASC NULLS LAST", tableName)); - sql("UPDATE %s t SET dep = 'x', id = -1 WHERE " + - "EXISTS (SELECT 1 FROM updated_id u WHERE t.id = u.value + 2)", tableName); - assertEquals("Should have expected rows", + sql( + "UPDATE %s t SET dep = 'x', id = -1 WHERE " + + "EXISTS (SELECT 1 FROM updated_id u WHERE t.id = u.value + 2)", + tableName); + assertEquals( + "Should have expected rows", ImmutableList.of(row(-1, "x"), row(2, "hardware"), row(null, "hr")), sql("SELECT * FROM %s ORDER BY id ASC NULLS LAST", tableName)); - sql("UPDATE %s t SET id = -2 WHERE " + - "EXISTS (SELECT 1 FROM updated_id u WHERE t.id = u.value) OR " + - "t.id IS NULL", tableName); - assertEquals("Should have expected rows", + sql( + "UPDATE %s t SET id = -2 WHERE " + + "EXISTS (SELECT 1 FROM updated_id u WHERE t.id = u.value) OR " + + "t.id IS NULL", + tableName); + assertEquals( + "Should have expected rows", ImmutableList.of(row(-2, "hr"), row(-2, "x"), row(2, "hardware")), sql("SELECT * FROM %s ORDER BY id, dep", tableName)); - sql("UPDATE %s t SET id = 1 WHERE " + - "EXISTS (SELECT 1 FROM updated_id ui WHERE t.id = ui.value) AND " + - "EXISTS (SELECT 1 FROM updated_dep ud WHERE t.dep = ud.value)", tableName); - assertEquals("Should have expected rows", + sql( + "UPDATE %s t SET id = 1 WHERE " + + "EXISTS (SELECT 1 FROM updated_id ui WHERE t.id = ui.value) AND " + + "EXISTS (SELECT 1 FROM updated_dep ud WHERE t.dep = ud.value)", + tableName); + assertEquals( + "Should have expected rows", ImmutableList.of(row(-2, "x"), row(1, "hr"), row(2, "hardware")), sql("SELECT * FROM %s ORDER BY id, dep", tableName)); } @@ -715,30 +807,40 @@ public void testUpdateWithExistSubquery() { public void testUpdateWithNotExistsSubquery() { createAndInitTable("id INT, dep STRING"); - append(tableName, - "{ \"id\": 1, \"dep\": \"hr\" }\n" + - "{ \"id\": 2, \"dep\": \"hardware\" }\n" + - "{ \"id\": null, \"dep\": \"hr\" }"); + append( + tableName, + "{ \"id\": 1, \"dep\": \"hr\" }\n" + + "{ \"id\": 2, \"dep\": \"hardware\" }\n" + + "{ \"id\": null, \"dep\": \"hr\" }"); createOrReplaceView("updated_id", Arrays.asList(-1, -2, null), Encoders.INT()); createOrReplaceView("updated_dep", Arrays.asList("hr", "software"), Encoders.STRING()); - sql("UPDATE %s t SET id = -1 WHERE NOT EXISTS (SELECT 1 FROM updated_id u WHERE t.id = u.value + 2)", tableName); - assertEquals("Should have expected rows", + sql( + "UPDATE %s t SET id = -1 WHERE NOT EXISTS (SELECT 1 FROM updated_id u WHERE t.id = u.value + 2)", + tableName); + assertEquals( + "Should have expected rows", ImmutableList.of(row(-1, "hardware"), row(-1, "hr"), row(1, "hr")), sql("SELECT * FROM %s ORDER BY id, dep", tableName)); - sql("UPDATE %s t SET id = 5 WHERE " + - "NOT EXISTS (SELECT 1 FROM updated_id u WHERE t.id = u.value) OR " + - "t.id = 1", tableName); - assertEquals("Should have expected rows", + sql( + "UPDATE %s t SET id = 5 WHERE " + + "NOT EXISTS (SELECT 1 FROM updated_id u WHERE t.id = u.value) OR " + + "t.id = 1", + tableName); + assertEquals( + "Should have expected rows", ImmutableList.of(row(-1, "hardware"), row(-1, "hr"), row(5, "hr")), sql("SELECT * FROM %s ORDER BY id, dep", tableName)); - sql("UPDATE %s t SET id = 10 WHERE " + - "NOT EXISTS (SELECT 1 FROM updated_id ui WHERE t.id = ui.value) AND " + - "EXISTS (SELECT 1 FROM updated_dep ud WHERE t.dep = ud.value)", tableName); - assertEquals("Should have expected 
rows", + sql( + "UPDATE %s t SET id = 10 WHERE " + + "NOT EXISTS (SELECT 1 FROM updated_id ui WHERE t.id = ui.value) AND " + + "EXISTS (SELECT 1 FROM updated_dep ud WHERE t.dep = ud.value)", + tableName); + assertEquals( + "Should have expected rows", ImmutableList.of(row(-1, "hardware"), row(-1, "hr"), row(10, "hr")), sql("SELECT * FROM %s ORDER BY id, dep", tableName)); } @@ -747,15 +849,17 @@ public void testUpdateWithNotExistsSubquery() { public void testUpdateWithScalarSubquery() { createAndInitTable("id INT, dep STRING"); - append(tableName, - "{ \"id\": 1, \"dep\": \"hr\" }\n" + - "{ \"id\": 2, \"dep\": \"hardware\" }\n" + - "{ \"id\": null, \"dep\": \"hr\" }"); + append( + tableName, + "{ \"id\": 1, \"dep\": \"hr\" }\n" + + "{ \"id\": 2, \"dep\": \"hardware\" }\n" + + "{ \"id\": null, \"dep\": \"hr\" }"); createOrReplaceView("updated_id", Arrays.asList(1, 100, null), Encoders.INT()); sql("UPDATE %s SET id = -1 WHERE id <= (SELECT min(value) FROM updated_id)", tableName); - assertEquals("Should have expected rows", + assertEquals( + "Should have expected rows", ImmutableList.of(row(-1, "hr"), row(2, "hardware"), row(null, "hr")), sql("SELECT * FROM %s ORDER BY id ASC NULLS LAST", tableName)); } @@ -765,25 +869,29 @@ public void testUpdateThatRequiresGroupingBeforeWrite() { createAndInitTable("id INT, dep STRING"); sql("ALTER TABLE %s ADD PARTITION FIELD dep", tableName); - append(tableName, - "{ \"id\": 0, \"dep\": \"hr\" }\n" + - "{ \"id\": 1, \"dep\": \"hr\" }\n" + - "{ \"id\": 2, \"dep\": \"hr\" }"); - - append(tableName, - "{ \"id\": 0, \"dep\": \"ops\" }\n" + - "{ \"id\": 1, \"dep\": \"ops\" }\n" + - "{ \"id\": 2, \"dep\": \"ops\" }"); - - append(tableName, - "{ \"id\": 0, \"dep\": \"hr\" }\n" + - "{ \"id\": 1, \"dep\": \"hr\" }\n" + - "{ \"id\": 2, \"dep\": \"hr\" }"); - - append(tableName, - "{ \"id\": 0, \"dep\": \"ops\" }\n" + - "{ \"id\": 1, \"dep\": \"ops\" }\n" + - "{ \"id\": 2, \"dep\": \"ops\" }"); + append( + tableName, + "{ \"id\": 0, \"dep\": \"hr\" }\n" + + "{ \"id\": 1, \"dep\": \"hr\" }\n" + + "{ \"id\": 2, \"dep\": \"hr\" }"); + + append( + tableName, + "{ \"id\": 0, \"dep\": \"ops\" }\n" + + "{ \"id\": 1, \"dep\": \"ops\" }\n" + + "{ \"id\": 2, \"dep\": \"ops\" }"); + + append( + tableName, + "{ \"id\": 0, \"dep\": \"hr\" }\n" + + "{ \"id\": 1, \"dep\": \"hr\" }\n" + + "{ \"id\": 2, \"dep\": \"hr\" }"); + + append( + tableName, + "{ \"id\": 0, \"dep\": \"ops\" }\n" + + "{ \"id\": 1, \"dep\": \"ops\" }\n" + + "{ \"id\": 2, \"dep\": \"ops\" }"); createOrReplaceView("updated_id", Arrays.asList(1, 100), Encoders.INT()); @@ -803,30 +911,38 @@ public void testUpdateThatRequiresGroupingBeforeWrite() { public void testUpdateWithVectorization() { createAndInitTable("id INT, dep STRING"); - append(tableName, - "{ \"id\": 0, \"dep\": \"hr\" }\n" + - "{ \"id\": 1, \"dep\": \"hr\" }\n" + - "{ \"id\": 2, \"dep\": \"hr\" }"); + append( + tableName, + "{ \"id\": 0, \"dep\": \"hr\" }\n" + + "{ \"id\": 1, \"dep\": \"hr\" }\n" + + "{ \"id\": 2, \"dep\": \"hr\" }"); - withSQLConf(ImmutableMap.of(SparkSQLProperties.VECTORIZATION_ENABLED, "true"), () -> { - sql("UPDATE %s t SET id = -1", tableName); + withSQLConf( + ImmutableMap.of(SparkSQLProperties.VECTORIZATION_ENABLED, "true"), + () -> { + sql("UPDATE %s t SET id = -1", tableName); - assertEquals("Should have expected rows", - ImmutableList.of(row(-1, "hr"), row(-1, "hr"), row(-1, "hr")), - sql("SELECT * FROM %s ORDER BY id, dep", tableName)); - }); + assertEquals( + "Should have expected rows", + ImmutableList.of(row(-1, "hr"), 
row(-1, "hr"), row(-1, "hr")), + sql("SELECT * FROM %s ORDER BY id, dep", tableName)); + }); } @Test public void testUpdateWithInvalidUpdates() { createAndInitTable("id INT, a ARRAY>, m MAP"); - AssertHelpers.assertThrows("Should complain about updating an array column", - AnalysisException.class, "Updating nested fields is only supported for structs", + AssertHelpers.assertThrows( + "Should complain about updating an array column", + AnalysisException.class, + "Updating nested fields is only supported for structs", () -> sql("UPDATE %s SET a.c1 = 1", tableName)); - AssertHelpers.assertThrows("Should complain about updating a map column", - AnalysisException.class, "Updating nested fields is only supported for structs", + AssertHelpers.assertThrows( + "Should complain about updating a map column", + AnalysisException.class, + "Updating nested fields is only supported for structs", () -> sql("UPDATE %s SET m.key = 'new_key'", tableName)); } @@ -834,48 +950,68 @@ public void testUpdateWithInvalidUpdates() { public void testUpdateWithConflictingAssignments() { createAndInitTable("id INT, c STRUCT>"); - AssertHelpers.assertThrows("Should complain about conflicting updates to a top-level column", - AnalysisException.class, "Updates are in conflict", + AssertHelpers.assertThrows( + "Should complain about conflicting updates to a top-level column", + AnalysisException.class, + "Updates are in conflict", () -> sql("UPDATE %s t SET t.id = 1, t.c.n1 = 2, t.id = 2", tableName)); - AssertHelpers.assertThrows("Should complain about conflicting updates to a nested column", - AnalysisException.class, "Updates are in conflict for these columns", + AssertHelpers.assertThrows( + "Should complain about conflicting updates to a nested column", + AnalysisException.class, + "Updates are in conflict for these columns", () -> sql("UPDATE %s t SET t.c.n1 = 1, t.id = 2, t.c.n1 = 2", tableName)); - AssertHelpers.assertThrows("Should complain about conflicting updates to a nested column", - AnalysisException.class, "Updates are in conflict", + AssertHelpers.assertThrows( + "Should complain about conflicting updates to a nested column", + AnalysisException.class, + "Updates are in conflict", () -> { - sql("UPDATE %s SET c.n1 = 1, c = named_struct('n1', 1, 'n2', named_struct('dn1', 1, 'dn2', 2))", tableName); + sql( + "UPDATE %s SET c.n1 = 1, c = named_struct('n1', 1, 'n2', named_struct('dn1', 1, 'dn2', 2))", + tableName); }); } @Test public void testUpdateWithInvalidAssignments() { - createAndInitTable("id INT NOT NULL, s STRUCT> NOT NULL"); - - for (String policy : new String[]{"ansi", "strict"}) { - withSQLConf(ImmutableMap.of("spark.sql.storeAssignmentPolicy", policy), () -> { - - AssertHelpers.assertThrows("Should complain about writing nulls to a top-level column", - AnalysisException.class, "Cannot write nullable values to non-null column", - () -> sql("UPDATE %s t SET t.id = NULL", tableName)); - - AssertHelpers.assertThrows("Should complain about writing nulls to a nested column", - AnalysisException.class, "Cannot write nullable values to non-null column", - () -> sql("UPDATE %s t SET t.s.n1 = NULL", tableName)); - - AssertHelpers.assertThrows("Should complain about writing missing fields in structs", - AnalysisException.class, "missing fields", - () -> sql("UPDATE %s t SET t.s = named_struct('n1', 1)", tableName)); - - AssertHelpers.assertThrows("Should complain about writing invalid data types", - AnalysisException.class, "Cannot safely cast", - () -> sql("UPDATE %s t SET t.s.n1 = 'str'", tableName)); - - 
AssertHelpers.assertThrows("Should complain about writing incompatible structs", - AnalysisException.class, "field name does not match", - () -> sql("UPDATE %s t SET t.s.n2 = named_struct('dn2', 1, 'dn1', 2)", tableName)); - }); + createAndInitTable( + "id INT NOT NULL, s STRUCT> NOT NULL"); + + for (String policy : new String[] {"ansi", "strict"}) { + withSQLConf( + ImmutableMap.of("spark.sql.storeAssignmentPolicy", policy), + () -> { + AssertHelpers.assertThrows( + "Should complain about writing nulls to a top-level column", + AnalysisException.class, + "Cannot write nullable values to non-null column", + () -> sql("UPDATE %s t SET t.id = NULL", tableName)); + + AssertHelpers.assertThrows( + "Should complain about writing nulls to a nested column", + AnalysisException.class, + "Cannot write nullable values to non-null column", + () -> sql("UPDATE %s t SET t.s.n1 = NULL", tableName)); + + AssertHelpers.assertThrows( + "Should complain about writing missing fields in structs", + AnalysisException.class, + "missing fields", + () -> sql("UPDATE %s t SET t.s = named_struct('n1', 1)", tableName)); + + AssertHelpers.assertThrows( + "Should complain about writing invalid data types", + AnalysisException.class, + "Cannot safely cast", + () -> sql("UPDATE %s t SET t.s.n1 = 'str'", tableName)); + + AssertHelpers.assertThrows( + "Should complain about writing incompatible structs", + AnalysisException.class, + "field name does not match", + () -> sql("UPDATE %s t SET t.s.n2 = named_struct('dn2', 1, 'dn1', 2)", tableName)); + }); } } @@ -883,8 +1019,10 @@ public void testUpdateWithInvalidAssignments() { public void testUpdateWithNonDeterministicCondition() { createAndInitTable("id INT, dep STRING"); - AssertHelpers.assertThrows("Should complain about non-deterministic expressions", - AnalysisException.class, "nondeterministic expressions are only allowed", + AssertHelpers.assertThrows( + "Should complain about non-deterministic expressions", + AnalysisException.class, + "nondeterministic expressions are only allowed", () -> sql("UPDATE %s SET id = -1 WHERE id = 1 AND rand() > 0.5", tableName)); } @@ -892,8 +1030,10 @@ public void testUpdateWithNonDeterministicCondition() { public void testUpdateOnNonIcebergTableNotSupported() { createOrReplaceView("testtable", "{ \"c1\": -100, \"c2\": -200 }"); - AssertHelpers.assertThrows("UPDATE is not supported for non iceberg table", - UnsupportedOperationException.class, "not supported temporarily", + AssertHelpers.assertThrows( + "UPDATE is not supported for non iceberg table", + UnsupportedOperationException.class, + "not supported temporarily", () -> sql("UPDATE %s SET c1 = -1 WHERE c2 = 1", "testtable")); } } diff --git a/spark/v3.1/spark/src/jmh/java/org/apache/iceberg/spark/SparkBenchmarkUtil.java b/spark/v3.1/spark/src/jmh/java/org/apache/iceberg/spark/SparkBenchmarkUtil.java index 17df7d2cf9d7..d6b0e9c94258 100644 --- a/spark/v3.1/spark/src/jmh/java/org/apache/iceberg/spark/SparkBenchmarkUtil.java +++ b/spark/v3.1/spark/src/jmh/java/org/apache/iceberg/spark/SparkBenchmarkUtil.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.spark; import java.util.List; @@ -32,13 +31,13 @@ public class SparkBenchmarkUtil { - private SparkBenchmarkUtil() { - } + private SparkBenchmarkUtil() {} public static UnsafeProjection projection(Schema expectedSchema, Schema actualSchema) { StructType struct = SparkSchemaUtil.convert(actualSchema); - List refs = JavaConverters.seqAsJavaListConverter(struct.toAttributes()).asJava(); + List refs = + JavaConverters.seqAsJavaListConverter(struct.toAttributes()).asJava(); List attrs = Lists.newArrayListWithExpectedSize(struct.fields().length); List exprs = Lists.newArrayListWithExpectedSize(struct.fields().length); diff --git a/spark/v3.1/spark/src/jmh/java/org/apache/iceberg/spark/data/parquet/SparkParquetReadersFlatDataBenchmark.java b/spark/v3.1/spark/src/jmh/java/org/apache/iceberg/spark/data/parquet/SparkParquetReadersFlatDataBenchmark.java index eb4da65f9256..fa69059c0b36 100644 --- a/spark/v3.1/spark/src/jmh/java/org/apache/iceberg/spark/data/parquet/SparkParquetReadersFlatDataBenchmark.java +++ b/spark/v3.1/spark/src/jmh/java/org/apache/iceberg/spark/data/parquet/SparkParquetReadersFlatDataBenchmark.java @@ -16,9 +16,11 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.data.parquet; +import static org.apache.iceberg.types.Types.NestedField.optional; +import static org.apache.iceberg.types.Types.NestedField.required; + import java.io.File; import java.io.IOException; import java.util.List; @@ -52,15 +54,11 @@ import org.openjdk.jmh.annotations.Warmup; import org.openjdk.jmh.infra.Blackhole; -import static org.apache.iceberg.types.Types.NestedField.optional; -import static org.apache.iceberg.types.Types.NestedField.required; - /** * A benchmark that evaluates the performance of reading Parquet data with a flat schema using * Iceberg and Spark Parquet readers. * - * To run this benchmark for spark-3.1: - * + *
    To run this benchmark for spark-3.1: * ./gradlew -DsparkVersions=3.1 :iceberg-spark:iceberg-spark-3.1_2.12:jmh * -PjmhIncludeRegex=SparkParquetReadersFlatDataBenchmark * -PjmhOutputPath=benchmark/spark-parquet-readers-flat-data-benchmark-result.txt @@ -73,22 +71,23 @@ @BenchmarkMode(Mode.SingleShotTime) public class SparkParquetReadersFlatDataBenchmark { - private static final DynMethods.UnboundMethod APPLY_PROJECTION = DynMethods.builder("apply") - .impl(UnsafeProjection.class, InternalRow.class) - .build(); - private static final Schema SCHEMA = new Schema( - required(1, "longCol", Types.LongType.get()), - required(2, "intCol", Types.IntegerType.get()), - required(3, "floatCol", Types.FloatType.get()), - optional(4, "doubleCol", Types.DoubleType.get()), - optional(5, "decimalCol", Types.DecimalType.of(20, 5)), - optional(6, "dateCol", Types.DateType.get()), - optional(7, "timestampCol", Types.TimestampType.withZone()), - optional(8, "stringCol", Types.StringType.get())); - private static final Schema PROJECTED_SCHEMA = new Schema( - required(1, "longCol", Types.LongType.get()), - optional(5, "decimalCol", Types.DecimalType.of(20, 5)), - optional(8, "stringCol", Types.StringType.get())); + private static final DynMethods.UnboundMethod APPLY_PROJECTION = + DynMethods.builder("apply").impl(UnsafeProjection.class, InternalRow.class).build(); + private static final Schema SCHEMA = + new Schema( + required(1, "longCol", Types.LongType.get()), + required(2, "intCol", Types.IntegerType.get()), + required(3, "floatCol", Types.FloatType.get()), + optional(4, "doubleCol", Types.DoubleType.get()), + optional(5, "decimalCol", Types.DecimalType.of(20, 5)), + optional(6, "dateCol", Types.DateType.get()), + optional(7, "timestampCol", Types.TimestampType.withZone()), + optional(8, "stringCol", Types.StringType.get())); + private static final Schema PROJECTED_SCHEMA = + new Schema( + required(1, "longCol", Types.LongType.get()), + optional(5, "decimalCol", Types.DecimalType.of(20, 5)), + optional(8, "stringCol", Types.StringType.get())); private static final int NUM_RECORDS = 10000000; private File dataFile; @@ -97,10 +96,8 @@ public void setupBenchmark() throws IOException { dataFile = File.createTempFile("parquet-flat-data-benchmark", ".parquet"); dataFile.delete(); List records = RandomData.generateList(SCHEMA, NUM_RECORDS, 0L); - try (FileAppender writer = Parquet.write(Files.localOutput(dataFile)) - .schema(SCHEMA) - .named("benchmark") - .build()) { + try (FileAppender writer = + Parquet.write(Files.localOutput(dataFile)).schema(SCHEMA).named("benchmark").build()) { writer.addAll(records); } } @@ -115,10 +112,11 @@ public void tearDownBenchmark() { @Benchmark @Threads(1) public void readUsingIcebergReader(Blackhole blackHole) throws IOException { - try (CloseableIterable rows = Parquet.read(Files.localInput(dataFile)) - .project(SCHEMA) - .createReaderFunc(type -> SparkParquetReaders.buildReader(SCHEMA, type)) - .build()) { + try (CloseableIterable rows = + Parquet.read(Files.localInput(dataFile)) + .project(SCHEMA) + .createReaderFunc(type -> SparkParquetReaders.buildReader(SCHEMA, type)) + .build()) { for (InternalRow row : rows) { blackHole.consume(row); @@ -129,14 +127,15 @@ public void readUsingIcebergReader(Blackhole blackHole) throws IOException { @Benchmark @Threads(1) public void readUsingIcebergReaderUnsafe(Blackhole blackhole) throws IOException { - try (CloseableIterable rows = Parquet.read(Files.localInput(dataFile)) - .project(SCHEMA) - .createReaderFunc(type -> 
SparkParquetReaders.buildReader(SCHEMA, type)) - .build()) { + try (CloseableIterable rows = + Parquet.read(Files.localInput(dataFile)) + .project(SCHEMA) + .createReaderFunc(type -> SparkParquetReaders.buildReader(SCHEMA, type)) + .build()) { - Iterable unsafeRows = Iterables.transform( - rows, - APPLY_PROJECTION.bind(SparkBenchmarkUtil.projection(SCHEMA, SCHEMA))::invoke); + Iterable unsafeRows = + Iterables.transform( + rows, APPLY_PROJECTION.bind(SparkBenchmarkUtil.projection(SCHEMA, SCHEMA))::invoke); for (InternalRow row : unsafeRows) { blackhole.consume(row); @@ -148,14 +147,15 @@ public void readUsingIcebergReaderUnsafe(Blackhole blackhole) throws IOException @Threads(1) public void readUsingSparkReader(Blackhole blackhole) throws IOException { StructType sparkSchema = SparkSchemaUtil.convert(SCHEMA); - try (CloseableIterable rows = Parquet.read(Files.localInput(dataFile)) - .project(SCHEMA) - .readSupport(new ParquetReadSupport()) - .set("org.apache.spark.sql.parquet.row.requested_schema", sparkSchema.json()) - .set("spark.sql.parquet.binaryAsString", "false") - .set("spark.sql.parquet.int96AsTimestamp", "false") - .callInit() - .build()) { + try (CloseableIterable rows = + Parquet.read(Files.localInput(dataFile)) + .project(SCHEMA) + .readSupport(new ParquetReadSupport()) + .set("org.apache.spark.sql.parquet.row.requested_schema", sparkSchema.json()) + .set("spark.sql.parquet.binaryAsString", "false") + .set("spark.sql.parquet.int96AsTimestamp", "false") + .callInit() + .build()) { for (InternalRow row : rows) { blackhole.consume(row); @@ -166,10 +166,11 @@ public void readUsingSparkReader(Blackhole blackhole) throws IOException { @Benchmark @Threads(1) public void readWithProjectionUsingIcebergReader(Blackhole blackhole) throws IOException { - try (CloseableIterable rows = Parquet.read(Files.localInput(dataFile)) - .project(PROJECTED_SCHEMA) - .createReaderFunc(type -> SparkParquetReaders.buildReader(PROJECTED_SCHEMA, type)) - .build()) { + try (CloseableIterable rows = + Parquet.read(Files.localInput(dataFile)) + .project(PROJECTED_SCHEMA) + .createReaderFunc(type -> SparkParquetReaders.buildReader(PROJECTED_SCHEMA, type)) + .build()) { for (InternalRow row : rows) { blackhole.consume(row); @@ -180,14 +181,18 @@ public void readWithProjectionUsingIcebergReader(Blackhole blackhole) throws IOE @Benchmark @Threads(1) public void readWithProjectionUsingIcebergReaderUnsafe(Blackhole blackhole) throws IOException { - try (CloseableIterable rows = Parquet.read(Files.localInput(dataFile)) - .project(PROJECTED_SCHEMA) - .createReaderFunc(type -> SparkParquetReaders.buildReader(PROJECTED_SCHEMA, type)) - .build()) { - - Iterable unsafeRows = Iterables.transform( - rows, - APPLY_PROJECTION.bind(SparkBenchmarkUtil.projection(PROJECTED_SCHEMA, PROJECTED_SCHEMA))::invoke); + try (CloseableIterable rows = + Parquet.read(Files.localInput(dataFile)) + .project(PROJECTED_SCHEMA) + .createReaderFunc(type -> SparkParquetReaders.buildReader(PROJECTED_SCHEMA, type)) + .build()) { + + Iterable unsafeRows = + Iterables.transform( + rows, + APPLY_PROJECTION.bind( + SparkBenchmarkUtil.projection(PROJECTED_SCHEMA, PROJECTED_SCHEMA)) + ::invoke); for (InternalRow row : unsafeRows) { blackhole.consume(row); @@ -199,14 +204,15 @@ public void readWithProjectionUsingIcebergReaderUnsafe(Blackhole blackhole) thro @Threads(1) public void readWithProjectionUsingSparkReader(Blackhole blackhole) throws IOException { StructType sparkSchema = SparkSchemaUtil.convert(PROJECTED_SCHEMA); - try (CloseableIterable rows = 
Parquet.read(Files.localInput(dataFile)) - .project(PROJECTED_SCHEMA) - .readSupport(new ParquetReadSupport()) - .set("org.apache.spark.sql.parquet.row.requested_schema", sparkSchema.json()) - .set("spark.sql.parquet.binaryAsString", "false") - .set("spark.sql.parquet.int96AsTimestamp", "false") - .callInit() - .build()) { + try (CloseableIterable rows = + Parquet.read(Files.localInput(dataFile)) + .project(PROJECTED_SCHEMA) + .readSupport(new ParquetReadSupport()) + .set("org.apache.spark.sql.parquet.row.requested_schema", sparkSchema.json()) + .set("spark.sql.parquet.binaryAsString", "false") + .set("spark.sql.parquet.int96AsTimestamp", "false") + .callInit() + .build()) { for (InternalRow row : rows) { blackhole.consume(row); diff --git a/spark/v3.1/spark/src/jmh/java/org/apache/iceberg/spark/data/parquet/SparkParquetReadersNestedDataBenchmark.java b/spark/v3.1/spark/src/jmh/java/org/apache/iceberg/spark/data/parquet/SparkParquetReadersNestedDataBenchmark.java index 5f98cb908e25..46ae345f7290 100644 --- a/spark/v3.1/spark/src/jmh/java/org/apache/iceberg/spark/data/parquet/SparkParquetReadersNestedDataBenchmark.java +++ b/spark/v3.1/spark/src/jmh/java/org/apache/iceberg/spark/data/parquet/SparkParquetReadersNestedDataBenchmark.java @@ -16,9 +16,11 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.data.parquet; +import static org.apache.iceberg.types.Types.NestedField.optional; +import static org.apache.iceberg.types.Types.NestedField.required; + import java.io.File; import java.io.IOException; import java.util.List; @@ -52,15 +54,11 @@ import org.openjdk.jmh.annotations.Warmup; import org.openjdk.jmh.infra.Blackhole; -import static org.apache.iceberg.types.Types.NestedField.optional; -import static org.apache.iceberg.types.Types.NestedField.required; - /** - * A benchmark that evaluates the performance of reading nested Parquet data using - * Iceberg and Spark Parquet readers. + * A benchmark that evaluates the performance of reading nested Parquet data using Iceberg and Spark + * Parquet readers. * - * To run this benchmark for spark-3.1: - * + *
    To run this benchmark for spark-3.1: * ./gradlew -DsparkVersions=3.1 :iceberg-spark:iceberg-spark-3.1_2.12:jmh * -PjmhIncludeRegex=SparkParquetReadersNestedDataBenchmark * -PjmhOutputPath=benchmark/spark-parquet-readers-nested-data-benchmark-result.txt @@ -73,22 +71,21 @@ @BenchmarkMode(Mode.SingleShotTime) public class SparkParquetReadersNestedDataBenchmark { - private static final DynMethods.UnboundMethod APPLY_PROJECTION = DynMethods.builder("apply") - .impl(UnsafeProjection.class, InternalRow.class) - .build(); - private static final Schema SCHEMA = new Schema( - required(0, "id", Types.LongType.get()), - optional(4, "nested", Types.StructType.of( - required(1, "col1", Types.StringType.get()), - required(2, "col2", Types.DoubleType.get()), - required(3, "col3", Types.LongType.get()) - )) - ); - private static final Schema PROJECTED_SCHEMA = new Schema( - optional(4, "nested", Types.StructType.of( - required(1, "col1", Types.StringType.get()) - )) - ); + private static final DynMethods.UnboundMethod APPLY_PROJECTION = + DynMethods.builder("apply").impl(UnsafeProjection.class, InternalRow.class).build(); + private static final Schema SCHEMA = + new Schema( + required(0, "id", Types.LongType.get()), + optional( + 4, + "nested", + Types.StructType.of( + required(1, "col1", Types.StringType.get()), + required(2, "col2", Types.DoubleType.get()), + required(3, "col3", Types.LongType.get())))); + private static final Schema PROJECTED_SCHEMA = + new Schema( + optional(4, "nested", Types.StructType.of(required(1, "col1", Types.StringType.get())))); private static final int NUM_RECORDS = 10000000; private File dataFile; @@ -97,10 +94,8 @@ public void setupBenchmark() throws IOException { dataFile = File.createTempFile("parquet-nested-data-benchmark", ".parquet"); dataFile.delete(); List records = RandomData.generateList(SCHEMA, NUM_RECORDS, 0L); - try (FileAppender writer = Parquet.write(Files.localOutput(dataFile)) - .schema(SCHEMA) - .named("benchmark") - .build()) { + try (FileAppender writer = + Parquet.write(Files.localOutput(dataFile)).schema(SCHEMA).named("benchmark").build()) { writer.addAll(records); } } @@ -115,10 +110,11 @@ public void tearDownBenchmark() { @Benchmark @Threads(1) public void readUsingIcebergReader(Blackhole blackhole) throws IOException { - try (CloseableIterable rows = Parquet.read(Files.localInput(dataFile)) - .project(SCHEMA) - .createReaderFunc(type -> SparkParquetReaders.buildReader(SCHEMA, type)) - .build()) { + try (CloseableIterable rows = + Parquet.read(Files.localInput(dataFile)) + .project(SCHEMA) + .createReaderFunc(type -> SparkParquetReaders.buildReader(SCHEMA, type)) + .build()) { for (InternalRow row : rows) { blackhole.consume(row); @@ -129,14 +125,15 @@ public void readUsingIcebergReader(Blackhole blackhole) throws IOException { @Benchmark @Threads(1) public void readUsingIcebergReaderUnsafe(Blackhole blackhole) throws IOException { - try (CloseableIterable rows = Parquet.read(Files.localInput(dataFile)) - .project(SCHEMA) - .createReaderFunc(type -> SparkParquetReaders.buildReader(SCHEMA, type)) - .build()) { + try (CloseableIterable rows = + Parquet.read(Files.localInput(dataFile)) + .project(SCHEMA) + .createReaderFunc(type -> SparkParquetReaders.buildReader(SCHEMA, type)) + .build()) { - Iterable unsafeRows = Iterables.transform( - rows, - APPLY_PROJECTION.bind(SparkBenchmarkUtil.projection(SCHEMA, SCHEMA))::invoke); + Iterable unsafeRows = + Iterables.transform( + rows, APPLY_PROJECTION.bind(SparkBenchmarkUtil.projection(SCHEMA, 
SCHEMA))::invoke); for (InternalRow row : unsafeRows) { blackhole.consume(row); @@ -148,14 +145,15 @@ public void readUsingIcebergReaderUnsafe(Blackhole blackhole) throws IOException @Threads(1) public void readUsingSparkReader(Blackhole blackhole) throws IOException { StructType sparkSchema = SparkSchemaUtil.convert(SCHEMA); - try (CloseableIterable rows = Parquet.read(Files.localInput(dataFile)) - .project(SCHEMA) - .readSupport(new ParquetReadSupport()) - .set("org.apache.spark.sql.parquet.row.requested_schema", sparkSchema.json()) - .set("spark.sql.parquet.binaryAsString", "false") - .set("spark.sql.parquet.int96AsTimestamp", "false") - .callInit() - .build()) { + try (CloseableIterable rows = + Parquet.read(Files.localInput(dataFile)) + .project(SCHEMA) + .readSupport(new ParquetReadSupport()) + .set("org.apache.spark.sql.parquet.row.requested_schema", sparkSchema.json()) + .set("spark.sql.parquet.binaryAsString", "false") + .set("spark.sql.parquet.int96AsTimestamp", "false") + .callInit() + .build()) { for (InternalRow row : rows) { blackhole.consume(row); @@ -166,10 +164,11 @@ public void readUsingSparkReader(Blackhole blackhole) throws IOException { @Benchmark @Threads(1) public void readWithProjectionUsingIcebergReader(Blackhole blackhole) throws IOException { - try (CloseableIterable rows = Parquet.read(Files.localInput(dataFile)) - .project(PROJECTED_SCHEMA) - .createReaderFunc(type -> SparkParquetReaders.buildReader(PROJECTED_SCHEMA, type)) - .build()) { + try (CloseableIterable rows = + Parquet.read(Files.localInput(dataFile)) + .project(PROJECTED_SCHEMA) + .createReaderFunc(type -> SparkParquetReaders.buildReader(PROJECTED_SCHEMA, type)) + .build()) { for (InternalRow row : rows) { blackhole.consume(row); @@ -180,14 +179,18 @@ public void readWithProjectionUsingIcebergReader(Blackhole blackhole) throws IOE @Benchmark @Threads(1) public void readWithProjectionUsingIcebergReaderUnsafe(Blackhole blackhole) throws IOException { - try (CloseableIterable rows = Parquet.read(Files.localInput(dataFile)) - .project(PROJECTED_SCHEMA) - .createReaderFunc(type -> SparkParquetReaders.buildReader(PROJECTED_SCHEMA, type)) - .build()) { - - Iterable unsafeRows = Iterables.transform( - rows, - APPLY_PROJECTION.bind(SparkBenchmarkUtil.projection(PROJECTED_SCHEMA, PROJECTED_SCHEMA))::invoke); + try (CloseableIterable rows = + Parquet.read(Files.localInput(dataFile)) + .project(PROJECTED_SCHEMA) + .createReaderFunc(type -> SparkParquetReaders.buildReader(PROJECTED_SCHEMA, type)) + .build()) { + + Iterable unsafeRows = + Iterables.transform( + rows, + APPLY_PROJECTION.bind( + SparkBenchmarkUtil.projection(PROJECTED_SCHEMA, PROJECTED_SCHEMA)) + ::invoke); for (InternalRow row : unsafeRows) { blackhole.consume(row); @@ -199,14 +202,15 @@ public void readWithProjectionUsingIcebergReaderUnsafe(Blackhole blackhole) thro @Threads(1) public void readWithProjectionUsingSparkReader(Blackhole blackhole) throws IOException { StructType sparkSchema = SparkSchemaUtil.convert(PROJECTED_SCHEMA); - try (CloseableIterable rows = Parquet.read(Files.localInput(dataFile)) - .project(PROJECTED_SCHEMA) - .readSupport(new ParquetReadSupport()) - .set("org.apache.spark.sql.parquet.row.requested_schema", sparkSchema.json()) - .set("spark.sql.parquet.binaryAsString", "false") - .set("spark.sql.parquet.int96AsTimestamp", "false") - .callInit() - .build()) { + try (CloseableIterable rows = + Parquet.read(Files.localInput(dataFile)) + .project(PROJECTED_SCHEMA) + .readSupport(new ParquetReadSupport()) + 
.set("org.apache.spark.sql.parquet.row.requested_schema", sparkSchema.json()) + .set("spark.sql.parquet.binaryAsString", "false") + .set("spark.sql.parquet.int96AsTimestamp", "false") + .callInit() + .build()) { for (InternalRow row : rows) { blackhole.consume(row); diff --git a/spark/v3.1/spark/src/jmh/java/org/apache/iceberg/spark/data/parquet/SparkParquetWritersFlatDataBenchmark.java b/spark/v3.1/spark/src/jmh/java/org/apache/iceberg/spark/data/parquet/SparkParquetWritersFlatDataBenchmark.java index ced246121d5e..e9cded3d2d97 100644 --- a/spark/v3.1/spark/src/jmh/java/org/apache/iceberg/spark/data/parquet/SparkParquetWritersFlatDataBenchmark.java +++ b/spark/v3.1/spark/src/jmh/java/org/apache/iceberg/spark/data/parquet/SparkParquetWritersFlatDataBenchmark.java @@ -16,9 +16,11 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.data.parquet; +import static org.apache.iceberg.types.Types.NestedField.optional; +import static org.apache.iceberg.types.Types.NestedField.required; + import java.io.File; import java.io.IOException; import org.apache.iceberg.Files; @@ -45,15 +47,11 @@ import org.openjdk.jmh.annotations.Threads; import org.openjdk.jmh.annotations.Warmup; -import static org.apache.iceberg.types.Types.NestedField.optional; -import static org.apache.iceberg.types.Types.NestedField.required; - /** * A benchmark that evaluates the performance of writing Parquet data with a flat schema using * Iceberg and Spark Parquet writers. * - * To run this benchmark for spark-3.1: - * + *
    To run this benchmark for spark-3.1: * ./gradlew -DsparkVersions=3.1 :iceberg-spark:iceberg-spark-3.1_2.12:jmh * -PjmhIncludeRegex=SparkParquetWritersFlatDataBenchmark * -PjmhOutputPath=benchmark/spark-parquet-writers-flat-data-benchmark-result.txt @@ -66,15 +64,16 @@ @BenchmarkMode(Mode.SingleShotTime) public class SparkParquetWritersFlatDataBenchmark { - private static final Schema SCHEMA = new Schema( - required(1, "longCol", Types.LongType.get()), - required(2, "intCol", Types.IntegerType.get()), - required(3, "floatCol", Types.FloatType.get()), - optional(4, "doubleCol", Types.DoubleType.get()), - optional(5, "decimalCol", Types.DecimalType.of(20, 5)), - optional(6, "dateCol", Types.DateType.get()), - optional(7, "timestampCol", Types.TimestampType.withZone()), - optional(8, "stringCol", Types.StringType.get())); + private static final Schema SCHEMA = + new Schema( + required(1, "longCol", Types.LongType.get()), + required(2, "intCol", Types.IntegerType.get()), + required(3, "floatCol", Types.FloatType.get()), + optional(4, "doubleCol", Types.DoubleType.get()), + optional(5, "decimalCol", Types.DecimalType.of(20, 5)), + optional(6, "dateCol", Types.DateType.get()), + optional(7, "timestampCol", Types.TimestampType.withZone()), + optional(8, "stringCol", Types.StringType.get())); private static final int NUM_RECORDS = 1000000; private Iterable rows; private File dataFile; @@ -96,10 +95,13 @@ public void tearDownBenchmark() { @Benchmark @Threads(1) public void writeUsingIcebergWriter() throws IOException { - try (FileAppender writer = Parquet.write(Files.localOutput(dataFile)) - .createWriterFunc(msgType -> SparkParquetWriters.buildWriter(SparkSchemaUtil.convert(SCHEMA), msgType)) - .schema(SCHEMA) - .build()) { + try (FileAppender writer = + Parquet.write(Files.localOutput(dataFile)) + .createWriterFunc( + msgType -> + SparkParquetWriters.buildWriter(SparkSchemaUtil.convert(SCHEMA), msgType)) + .schema(SCHEMA) + .build()) { writer.addAll(rows); } @@ -109,15 +111,16 @@ public void writeUsingIcebergWriter() throws IOException { @Threads(1) public void writeUsingSparkWriter() throws IOException { StructType sparkSchema = SparkSchemaUtil.convert(SCHEMA); - try (FileAppender writer = Parquet.write(Files.localOutput(dataFile)) - .writeSupport(new ParquetWriteSupport()) - .set("org.apache.spark.sql.parquet.row.attributes", sparkSchema.json()) - .set("spark.sql.parquet.writeLegacyFormat", "false") - .set("spark.sql.parquet.binaryAsString", "false") - .set("spark.sql.parquet.int96AsTimestamp", "false") - .set("spark.sql.parquet.outputTimestampType", "TIMESTAMP_MICROS") - .schema(SCHEMA) - .build()) { + try (FileAppender writer = + Parquet.write(Files.localOutput(dataFile)) + .writeSupport(new ParquetWriteSupport()) + .set("org.apache.spark.sql.parquet.row.attributes", sparkSchema.json()) + .set("spark.sql.parquet.writeLegacyFormat", "false") + .set("spark.sql.parquet.binaryAsString", "false") + .set("spark.sql.parquet.int96AsTimestamp", "false") + .set("spark.sql.parquet.outputTimestampType", "TIMESTAMP_MICROS") + .schema(SCHEMA) + .build()) { writer.addAll(rows); } diff --git a/spark/v3.1/spark/src/jmh/java/org/apache/iceberg/spark/data/parquet/SparkParquetWritersNestedDataBenchmark.java b/spark/v3.1/spark/src/jmh/java/org/apache/iceberg/spark/data/parquet/SparkParquetWritersNestedDataBenchmark.java index 34bcaf6a971f..6086de09af17 100644 --- a/spark/v3.1/spark/src/jmh/java/org/apache/iceberg/spark/data/parquet/SparkParquetWritersNestedDataBenchmark.java +++ 
b/spark/v3.1/spark/src/jmh/java/org/apache/iceberg/spark/data/parquet/SparkParquetWritersNestedDataBenchmark.java @@ -16,9 +16,11 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.data.parquet; +import static org.apache.iceberg.types.Types.NestedField.optional; +import static org.apache.iceberg.types.Types.NestedField.required; + import java.io.File; import java.io.IOException; import org.apache.iceberg.Files; @@ -45,15 +47,11 @@ import org.openjdk.jmh.annotations.Threads; import org.openjdk.jmh.annotations.Warmup; -import static org.apache.iceberg.types.Types.NestedField.optional; -import static org.apache.iceberg.types.Types.NestedField.required; - /** - * A benchmark that evaluates the performance of writing nested Parquet data using - * Iceberg and Spark Parquet writers. + * A benchmark that evaluates the performance of writing nested Parquet data using Iceberg and Spark + * Parquet writers. * - * To run this benchmark for spark-3.1: - * + *
    To run this benchmark for spark-3.1: * ./gradlew -DsparkVersions=3.1 :iceberg-spark:iceberg-spark-3.1_2.12:jmh * -PjmhIncludeRegex=SparkParquetWritersNestedDataBenchmark * -PjmhOutputPath=benchmark/spark-parquet-writers-nested-data-benchmark-result.txt @@ -66,14 +64,16 @@ @BenchmarkMode(Mode.SingleShotTime) public class SparkParquetWritersNestedDataBenchmark { - private static final Schema SCHEMA = new Schema( - required(0, "id", Types.LongType.get()), - optional(4, "nested", Types.StructType.of( - required(1, "col1", Types.StringType.get()), - required(2, "col2", Types.DoubleType.get()), - required(3, "col3", Types.LongType.get()) - )) - ); + private static final Schema SCHEMA = + new Schema( + required(0, "id", Types.LongType.get()), + optional( + 4, + "nested", + Types.StructType.of( + required(1, "col1", Types.StringType.get()), + required(2, "col2", Types.DoubleType.get()), + required(3, "col3", Types.LongType.get())))); private static final int NUM_RECORDS = 1000000; private Iterable rows; private File dataFile; @@ -95,10 +95,13 @@ public void tearDownBenchmark() { @Benchmark @Threads(1) public void writeUsingIcebergWriter() throws IOException { - try (FileAppender writer = Parquet.write(Files.localOutput(dataFile)) - .createWriterFunc(msgType -> SparkParquetWriters.buildWriter(SparkSchemaUtil.convert(SCHEMA), msgType)) - .schema(SCHEMA) - .build()) { + try (FileAppender writer = + Parquet.write(Files.localOutput(dataFile)) + .createWriterFunc( + msgType -> + SparkParquetWriters.buildWriter(SparkSchemaUtil.convert(SCHEMA), msgType)) + .schema(SCHEMA) + .build()) { writer.addAll(rows); } @@ -108,15 +111,16 @@ public void writeUsingIcebergWriter() throws IOException { @Threads(1) public void writeUsingSparkWriter() throws IOException { StructType sparkSchema = SparkSchemaUtil.convert(SCHEMA); - try (FileAppender writer = Parquet.write(Files.localOutput(dataFile)) - .writeSupport(new ParquetWriteSupport()) - .set("org.apache.spark.sql.parquet.row.attributes", sparkSchema.json()) - .set("spark.sql.parquet.writeLegacyFormat", "false") - .set("spark.sql.parquet.binaryAsString", "false") - .set("spark.sql.parquet.int96AsTimestamp", "false") - .set("spark.sql.parquet.outputTimestampType", "TIMESTAMP_MICROS") - .schema(SCHEMA) - .build()) { + try (FileAppender writer = + Parquet.write(Files.localOutput(dataFile)) + .writeSupport(new ParquetWriteSupport()) + .set("org.apache.spark.sql.parquet.row.attributes", sparkSchema.json()) + .set("spark.sql.parquet.writeLegacyFormat", "false") + .set("spark.sql.parquet.binaryAsString", "false") + .set("spark.sql.parquet.int96AsTimestamp", "false") + .set("spark.sql.parquet.outputTimestampType", "TIMESTAMP_MICROS") + .schema(SCHEMA) + .build()) { writer.addAll(rows); } diff --git a/spark/v3.1/spark/src/jmh/java/org/apache/iceberg/spark/source/Action.java b/spark/v3.1/spark/src/jmh/java/org/apache/iceberg/spark/source/Action.java index 1820a801b2fb..0dbf07285060 100644 --- a/spark/v3.1/spark/src/jmh/java/org/apache/iceberg/spark/source/Action.java +++ b/spark/v3.1/spark/src/jmh/java/org/apache/iceberg/spark/source/Action.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
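The two write benchmarks above are representative of every SparkParquetWriters* class touched by this patch: rows are generated once in @Setup, and each @Benchmark method streams them through a FileAppender built either from Iceberg's createWriterFunc or from Spark's ParquetWriteSupport. A self-contained sketch of the Iceberg-writer half, assuming the RandomData test helper is on the classpath; the class name, the two-column schema, and the temp-file handling are illustrative only:

import static org.apache.iceberg.types.Types.NestedField.optional;
import static org.apache.iceberg.types.Types.NestedField.required;

import java.io.File;
import java.io.IOException;
import org.apache.iceberg.Files;
import org.apache.iceberg.Schema;
import org.apache.iceberg.io.FileAppender;
import org.apache.iceberg.parquet.Parquet;
import org.apache.iceberg.spark.SparkSchemaUtil;
import org.apache.iceberg.spark.data.RandomData;
import org.apache.iceberg.spark.data.SparkParquetWriters;
import org.apache.iceberg.types.Types;
import org.apache.spark.sql.catalyst.InternalRow;
import org.openjdk.jmh.annotations.Benchmark;
import org.openjdk.jmh.annotations.BenchmarkMode;
import org.openjdk.jmh.annotations.Fork;
import org.openjdk.jmh.annotations.Level;
import org.openjdk.jmh.annotations.Mode;
import org.openjdk.jmh.annotations.Scope;
import org.openjdk.jmh.annotations.Setup;
import org.openjdk.jmh.annotations.State;
import org.openjdk.jmh.annotations.TearDown;
import org.openjdk.jmh.annotations.Threads;

@Fork(1)
@State(Scope.Benchmark)
@BenchmarkMode(Mode.SingleShotTime)
public class ParquetWriterSketchBenchmark {

  private static final Schema SCHEMA =
      new Schema(
          required(1, "longCol", Types.LongType.get()),
          optional(2, "stringCol", Types.StringType.get()));

  private Iterable<InternalRow> rows;
  private File dataFile;

  @Setup
  public void setupBenchmark() throws IOException {
    // generate the input once so only the write path is measured
    rows = RandomData.generateSpark(SCHEMA, 1_000_000, 0L);
    dataFile = File.createTempFile("parquet-writer-sketch", ".parquet");
    dataFile.delete();
  }

  @TearDown(Level.Invocation)
  public void cleanupFile() {
    // remove the output after every invocation so repeated runs do not collide
    dataFile.delete();
  }

  @Benchmark
  @Threads(1)
  public void writeUsingIcebergWriter() throws IOException {
    // createWriterFunc wires Iceberg's Spark row writer to the Parquet message type
    try (FileAppender<InternalRow> writer =
        Parquet.write(Files.localOutput(dataFile))
            .createWriterFunc(
                msgType -> SparkParquetWriters.buildWriter(SparkSchemaUtil.convert(SCHEMA), msgType))
            .schema(SCHEMA)
            .build()) {
      writer.addAll(rows);
    }
  }
}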
*/ - package org.apache.iceberg.spark.source; @FunctionalInterface diff --git a/spark/v3.1/spark/src/jmh/java/org/apache/iceberg/spark/source/IcebergSourceBenchmark.java b/spark/v3.1/spark/src/jmh/java/org/apache/iceberg/spark/source/IcebergSourceBenchmark.java index 0ceedfd0e20d..19bcdd672157 100644 --- a/spark/v3.1/spark/src/jmh/java/org/apache/iceberg/spark/source/IcebergSourceBenchmark.java +++ b/spark/v3.1/spark/src/jmh/java/org/apache/iceberg/spark/source/IcebergSourceBenchmark.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.source; import java.io.IOException; @@ -79,7 +78,8 @@ protected String newTableLocation() { protected String dataLocation() { Map properties = table.properties(); - return properties.getOrDefault(TableProperties.WRITE_DATA_LOCATION, String.format("%s/data", table.location())); + return properties.getOrDefault( + TableProperties.WRITE_DATA_LOCATION, String.format("%s/data", table.location())); } protected void cleanupFiles() throws IOException { @@ -92,12 +92,12 @@ protected void cleanupFiles() throws IOException { } protected void setupSpark(boolean enableDictionaryEncoding) { - SparkSession.Builder builder = SparkSession.builder() - .config("spark.ui.enabled", false); + SparkSession.Builder builder = SparkSession.builder().config("spark.ui.enabled", false); if (!enableDictionaryEncoding) { - builder.config("parquet.dictionary.page.size", "1") - .config("parquet.enable.dictionary", false) - .config(TableProperties.PARQUET_DICT_SIZE_BYTES, "1"); + builder + .config("parquet.dictionary.page.size", "1") + .config("parquet.enable.dictionary", false) + .config(TableProperties.PARQUET_DICT_SIZE_BYTES, "1"); } builder.master("local"); spark = builder.getOrCreate(); @@ -114,13 +114,14 @@ protected void tearDownSpark() { } protected void materialize(Dataset ds) { - ds.queryExecution().toRdd().toJavaRDD().foreach(record -> { }); + ds.queryExecution().toRdd().toJavaRDD().foreach(record -> {}); } protected void appendAsFile(Dataset ds) { // ensure the schema is precise (including nullability) StructType sparkSchema = SparkSchemaUtil.convert(table.schema()); - spark.createDataFrame(ds.rdd(), sparkSchema) + spark + .createDataFrame(ds.rdd(), sparkSchema) .coalesce(1) .write() .format("iceberg") @@ -132,42 +133,49 @@ protected void withSQLConf(Map conf, Action action) { SQLConf sqlConf = SQLConf.get(); Map currentConfValues = Maps.newHashMap(); - conf.keySet().forEach(confKey -> { - if (sqlConf.contains(confKey)) { - String currentConfValue = sqlConf.getConfString(confKey); - currentConfValues.put(confKey, currentConfValue); - } - }); - - conf.forEach((confKey, confValue) -> { - if (SQLConf.staticConfKeys().contains(confKey)) { - throw new RuntimeException("Cannot modify the value of a static config: " + confKey); - } - sqlConf.setConfString(confKey, confValue); - }); + conf.keySet() + .forEach( + confKey -> { + if (sqlConf.contains(confKey)) { + String currentConfValue = sqlConf.getConfString(confKey); + currentConfValues.put(confKey, currentConfValue); + } + }); + + conf.forEach( + (confKey, confValue) -> { + if (SQLConf.staticConfKeys().contains(confKey)) { + throw new RuntimeException("Cannot modify the value of a static config: " + confKey); + } + sqlConf.setConfString(confKey, confValue); + }); try { action.invoke(); } finally { - conf.forEach((confKey, confValue) -> { - if (currentConfValues.containsKey(confKey)) { - sqlConf.setConfString(confKey, currentConfValues.get(confKey)); 
- } else { - sqlConf.unsetConf(confKey); - } - }); + conf.forEach( + (confKey, confValue) -> { + if (currentConfValues.containsKey(confKey)) { + sqlConf.setConfString(confKey, currentConfValues.get(confKey)); + } else { + sqlConf.unsetConf(confKey); + } + }); } } protected void withTableProperties(Map props, Action action) { Map tableProps = table.properties(); Map currentPropValues = Maps.newHashMap(); - props.keySet().forEach(propKey -> { - if (tableProps.containsKey(propKey)) { - String currentPropValue = tableProps.get(propKey); - currentPropValues.put(propKey, currentPropValue); - } - }); + props + .keySet() + .forEach( + propKey -> { + if (tableProps.containsKey(propKey)) { + String currentPropValue = tableProps.get(propKey); + currentPropValues.put(propKey, currentPropValue); + } + }); UpdateProperties updateProperties = table.updateProperties(); props.forEach(updateProperties::set); @@ -177,13 +185,14 @@ protected void withTableProperties(Map props, Action action) { action.invoke(); } finally { UpdateProperties restoreProperties = table.updateProperties(); - props.forEach((propKey, propValue) -> { - if (currentPropValues.containsKey(propKey)) { - restoreProperties.set(propKey, currentPropValues.get(propKey)); - } else { - restoreProperties.remove(propKey); - } - }); + props.forEach( + (propKey, propValue) -> { + if (currentPropValues.containsKey(propKey)) { + restoreProperties.set(propKey, currentPropValues.get(propKey)); + } else { + restoreProperties.remove(propKey); + } + }); restoreProperties.commit(); } } diff --git a/spark/v3.1/spark/src/jmh/java/org/apache/iceberg/spark/source/IcebergSourceFlatDataBenchmark.java b/spark/v3.1/spark/src/jmh/java/org/apache/iceberg/spark/source/IcebergSourceFlatDataBenchmark.java index 9e206321a540..59e6230350d9 100644 --- a/spark/v3.1/spark/src/jmh/java/org/apache/iceberg/spark/source/IcebergSourceFlatDataBenchmark.java +++ b/spark/v3.1/spark/src/jmh/java/org/apache/iceberg/spark/source/IcebergSourceFlatDataBenchmark.java @@ -16,9 +16,11 @@ * specific language governing permissions and limitations * under the License. 
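Both helpers above, withSQLConf and withTableProperties, follow the same save-override-run-restore cycle so a benchmark can tweak Spark session or table settings without leaking them into later invocations. Pulling the SQLConf variant out into a standalone sketch (the enclosing class name and the inlined copy of the Action interface are only there to make it compile on its own):

import java.util.Map;
import org.apache.iceberg.relocated.com.google.common.collect.Maps;
import org.apache.spark.sql.internal.SQLConf;

public class SqlConfScope {

  @FunctionalInterface
  interface Action {
    void invoke();
  }

  public static void withSQLConf(Map<String, String> conf, Action action) {
    SQLConf sqlConf = SQLConf.get();

    // remember the current values of every key we are about to override
    Map<String, String> currentConfValues = Maps.newHashMap();
    conf.keySet()
        .forEach(
            confKey -> {
              if (sqlConf.contains(confKey)) {
                currentConfValues.put(confKey, sqlConf.getConfString(confKey));
              }
            });

    // apply the overrides; static configs cannot be changed on a live session
    conf.forEach(
        (confKey, confValue) -> {
          if (SQLConf.staticConfKeys().contains(confKey)) {
            throw new RuntimeException("Cannot modify the value of a static config: " + confKey);
          }
          sqlConf.setConfString(confKey, confValue);
        });

    try {
      action.invoke();
    } finally {
      // put back the previous values, unsetting keys that did not exist before
      conf.forEach(
          (confKey, confValue) -> {
            if (currentConfValues.containsKey(confKey)) {
              sqlConf.setConfString(confKey, currentConfValues.get(confKey));
            } else {
              sqlConf.unsetConf(confKey);
            }
          });
    }
  }
}

Restoring in a finally block is what makes consecutive benchmark methods independent: even if the timed action throws, the session is left exactly as it was found.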
*/ - package org.apache.iceberg.spark.source; +import static org.apache.iceberg.types.Types.NestedField.optional; +import static org.apache.iceberg.types.Types.NestedField.required; + import java.util.Map; import org.apache.hadoop.conf.Configuration; import org.apache.iceberg.PartitionSpec; @@ -29,9 +31,6 @@ import org.apache.iceberg.relocated.com.google.common.collect.Maps; import org.apache.iceberg.types.Types; -import static org.apache.iceberg.types.Types.NestedField.optional; -import static org.apache.iceberg.types.Types.NestedField.required; - public abstract class IcebergSourceFlatDataBenchmark extends IcebergSourceBenchmark { @Override @@ -41,15 +40,16 @@ protected Configuration initHadoopConf() { @Override protected final Table initTable() { - Schema schema = new Schema( - required(1, "longCol", Types.LongType.get()), - required(2, "intCol", Types.IntegerType.get()), - required(3, "floatCol", Types.FloatType.get()), - optional(4, "doubleCol", Types.DoubleType.get()), - optional(5, "decimalCol", Types.DecimalType.of(20, 5)), - optional(6, "dateCol", Types.DateType.get()), - optional(7, "timestampCol", Types.TimestampType.withZone()), - optional(8, "stringCol", Types.StringType.get())); + Schema schema = + new Schema( + required(1, "longCol", Types.LongType.get()), + required(2, "intCol", Types.IntegerType.get()), + required(3, "floatCol", Types.FloatType.get()), + optional(4, "doubleCol", Types.DoubleType.get()), + optional(5, "decimalCol", Types.DecimalType.of(20, 5)), + optional(6, "dateCol", Types.DateType.get()), + optional(7, "timestampCol", Types.TimestampType.withZone()), + optional(8, "stringCol", Types.StringType.get())); PartitionSpec partitionSpec = PartitionSpec.unpartitioned(); HadoopTables tables = new HadoopTables(hadoopConf()); Map properties = Maps.newHashMap(); diff --git a/spark/v3.1/spark/src/jmh/java/org/apache/iceberg/spark/source/IcebergSourceNestedDataBenchmark.java b/spark/v3.1/spark/src/jmh/java/org/apache/iceberg/spark/source/IcebergSourceNestedDataBenchmark.java index 5a0d9359ec6b..a1c61b9b4de0 100644 --- a/spark/v3.1/spark/src/jmh/java/org/apache/iceberg/spark/source/IcebergSourceNestedDataBenchmark.java +++ b/spark/v3.1/spark/src/jmh/java/org/apache/iceberg/spark/source/IcebergSourceNestedDataBenchmark.java @@ -16,9 +16,11 @@ * specific language governing permissions and limitations * under the License. 
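The initTable() overrides in these IcebergSource*Benchmark subclasses only vary in the schema they pass along; the table bootstrap itself is always the same HadoopTables create against a fresh location with an unpartitioned spec. A trimmed-down sketch, where the three-column schema and the tableLocation parameter are placeholders for the real benchmark values:

import static org.apache.iceberg.types.Types.NestedField.optional;
import static org.apache.iceberg.types.Types.NestedField.required;

import java.util.Map;
import org.apache.hadoop.conf.Configuration;
import org.apache.iceberg.PartitionSpec;
import org.apache.iceberg.Schema;
import org.apache.iceberg.Table;
import org.apache.iceberg.hadoop.HadoopTables;
import org.apache.iceberg.relocated.com.google.common.collect.Maps;
import org.apache.iceberg.types.Types;

public class BenchmarkTableBootstrap {

  public static Table createFlatTable(String tableLocation) {
    Schema schema =
        new Schema(
            required(1, "longCol", Types.LongType.get()),
            required(2, "intCol", Types.IntegerType.get()),
            optional(3, "stringCol", Types.StringType.get()));

    // unpartitioned table created directly against the local file system
    PartitionSpec partitionSpec = PartitionSpec.unpartitioned();
    HadoopTables tables = new HadoopTables(new Configuration());
    Map<String, String> properties = Maps.newHashMap();
    return tables.create(schema, partitionSpec, properties, tableLocation);
  }
}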
*/ - package org.apache.iceberg.spark.source; +import static org.apache.iceberg.types.Types.NestedField.optional; +import static org.apache.iceberg.types.Types.NestedField.required; + import java.util.Map; import org.apache.hadoop.conf.Configuration; import org.apache.iceberg.PartitionSpec; @@ -29,9 +31,6 @@ import org.apache.iceberg.relocated.com.google.common.collect.Maps; import org.apache.iceberg.types.Types; -import static org.apache.iceberg.types.Types.NestedField.optional; -import static org.apache.iceberg.types.Types.NestedField.required; - public abstract class IcebergSourceNestedDataBenchmark extends IcebergSourceBenchmark { @Override @@ -41,14 +40,16 @@ protected Configuration initHadoopConf() { @Override protected final Table initTable() { - Schema schema = new Schema( - required(0, "id", Types.LongType.get()), - optional(4, "nested", Types.StructType.of( - required(1, "col1", Types.StringType.get()), - required(2, "col2", Types.DoubleType.get()), - required(3, "col3", Types.LongType.get()) - )) - ); + Schema schema = + new Schema( + required(0, "id", Types.LongType.get()), + optional( + 4, + "nested", + Types.StructType.of( + required(1, "col1", Types.StringType.get()), + required(2, "col2", Types.DoubleType.get()), + required(3, "col3", Types.LongType.get())))); PartitionSpec partitionSpec = PartitionSpec.unpartitioned(); HadoopTables tables = new HadoopTables(hadoopConf()); Map properties = Maps.newHashMap(); diff --git a/spark/v3.1/spark/src/jmh/java/org/apache/iceberg/spark/source/IcebergSourceNestedListDataBenchmark.java b/spark/v3.1/spark/src/jmh/java/org/apache/iceberg/spark/source/IcebergSourceNestedListDataBenchmark.java index 369a1507b648..f68b587735dd 100644 --- a/spark/v3.1/spark/src/jmh/java/org/apache/iceberg/spark/source/IcebergSourceNestedListDataBenchmark.java +++ b/spark/v3.1/spark/src/jmh/java/org/apache/iceberg/spark/source/IcebergSourceNestedListDataBenchmark.java @@ -16,9 +16,11 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.spark.source; +import static org.apache.iceberg.types.Types.NestedField.optional; +import static org.apache.iceberg.types.Types.NestedField.required; + import java.util.Map; import org.apache.hadoop.conf.Configuration; import org.apache.iceberg.PartitionSpec; @@ -29,9 +31,6 @@ import org.apache.iceberg.relocated.com.google.common.collect.Maps; import org.apache.iceberg.types.Types; -import static org.apache.iceberg.types.Types.NestedField.optional; -import static org.apache.iceberg.types.Types.NestedField.required; - public abstract class IcebergSourceNestedListDataBenchmark extends IcebergSourceBenchmark { @Override @@ -41,12 +40,19 @@ protected Configuration initHadoopConf() { @Override protected final Table initTable() { - Schema schema = new Schema( - required(0, "id", Types.LongType.get()), - optional(1, "outerlist", Types.ListType.ofOptional(2, - Types.StructType.of(required(3, "innerlist", Types.ListType.ofRequired(4, Types.StringType.get()))) - )) - ); + Schema schema = + new Schema( + required(0, "id", Types.LongType.get()), + optional( + 1, + "outerlist", + Types.ListType.ofOptional( + 2, + Types.StructType.of( + required( + 3, + "innerlist", + Types.ListType.ofRequired(4, Types.StringType.get())))))); PartitionSpec partitionSpec = PartitionSpec.unpartitioned(); HadoopTables tables = new HadoopTables(hadoopConf()); Map properties = Maps.newHashMap(); diff --git a/spark/v3.1/spark/src/jmh/java/org/apache/iceberg/spark/source/WritersBenchmark.java b/spark/v3.1/spark/src/jmh/java/org/apache/iceberg/spark/source/WritersBenchmark.java index 06e00e3ebab7..eace9d3e44a7 100644 --- a/spark/v3.1/spark/src/jmh/java/org/apache/iceberg/spark/source/WritersBenchmark.java +++ b/spark/v3.1/spark/src/jmh/java/org/apache/iceberg/spark/source/WritersBenchmark.java @@ -16,9 +16,11 @@ * specific language governing permissions and limitations * under the License. 
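The nested-list schema above (an id plus an outer list of structs, each carrying an inner list of strings) is populated later in this patch by the ORC nested-list write benchmark, which builds matching rows with Spark SQL functions. The generation step, with spark and numRows standing in for the benchmark's session and row count, looks roughly like this:

import static org.apache.spark.sql.functions.array_repeat;
import static org.apache.spark.sql.functions.expr;
import static org.apache.spark.sql.functions.struct;

import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SparkSession;

public class NestedListBenchmarkData {

  public static Dataset<Row> benchmarkData(SparkSession spark, long numRows) {
    // each row: an id plus 10 identical structs, each holding a 1000-element inner string list
    return spark
        .range(numRows)
        .withColumn(
            "outerlist",
            array_repeat(struct(expr("array_repeat(CAST(id AS string), 1000) AS innerlist")), 10))
        .coalesce(1);
  }
}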
*/ - package org.apache.iceberg.spark.source; +import static org.apache.iceberg.types.Types.NestedField.optional; +import static org.apache.iceberg.types.Types.NestedField.required; + import java.io.IOException; import java.util.Comparator; import java.util.List; @@ -57,23 +59,20 @@ import org.openjdk.jmh.annotations.Threads; import org.openjdk.jmh.infra.Blackhole; -import static org.apache.iceberg.types.Types.NestedField.optional; -import static org.apache.iceberg.types.Types.NestedField.required; - public abstract class WritersBenchmark extends IcebergSourceBenchmark { private static final int NUM_ROWS = 2500000; private static final long TARGET_FILE_SIZE_IN_BYTES = 50L * 1024 * 1024; - private static final Schema SCHEMA = new Schema( - required(1, "longCol", Types.LongType.get()), - required(2, "intCol", Types.IntegerType.get()), - required(3, "floatCol", Types.FloatType.get()), - optional(4, "doubleCol", Types.DoubleType.get()), - optional(5, "decimalCol", Types.DecimalType.of(20, 5)), - optional(6, "timestampCol", Types.TimestampType.withZone()), - optional(7, "stringCol", Types.StringType.get()) - ); + private static final Schema SCHEMA = + new Schema( + required(1, "longCol", Types.LongType.get()), + required(2, "intCol", Types.IntegerType.get()), + required(3, "floatCol", Types.FloatType.get()), + optional(4, "doubleCol", Types.DoubleType.get()), + optional(5, "decimalCol", Types.DecimalType.of(20, 5)), + optional(6, "timestampCol", Types.TimestampType.withZone()), + optional(7, "stringCol", Types.StringType.get())); private Iterable rows; private Iterable positionDeleteRows; @@ -91,7 +90,8 @@ public void setupBenchmark() { data.sort(Comparator.comparingInt(row -> transform.apply(row.getInt(1)))); this.rows = data; - this.positionDeleteRows = RandomData.generateSpark(DeleteSchemaUtil.pathPosSchema(), NUM_ROWS, 0L); + this.positionDeleteRows = + RandomData.generateSpark(DeleteSchemaUtil.pathPosSchema(), NUM_ROWS, 0L); this.unpartitionedSpec = table().specs().get(0); Preconditions.checkArgument(unpartitionedSpec.isUnpartitioned()); @@ -117,9 +117,7 @@ protected final Table initTable() { Table table = tables.create(SCHEMA, spec, properties, newTableLocation()); // add a partitioned spec to the table - table.updateSpec() - .addField(Expressions.bucket("intCol", 32)) - .commit(); + table.updateSpec().addField(Expressions.bucket("intCol", 32)).commit(); return table; } @@ -130,13 +128,14 @@ public void writeUnpartitionedClusteredDataWriter(Blackhole blackhole) throws IO FileIO io = table().io(); OutputFileFactory fileFactory = newFileFactory(); - SparkFileWriterFactory writerFactory = SparkFileWriterFactory.builderFor(table()) - .dataFileFormat(fileFormat()) - .dataSchema(table().schema()) - .build(); + SparkFileWriterFactory writerFactory = + SparkFileWriterFactory.builderFor(table()) + .dataFileFormat(fileFormat()) + .dataSchema(table().schema()) + .build(); - ClusteredDataWriter writer = new ClusteredDataWriter<>( - writerFactory, fileFactory, io, TARGET_FILE_SIZE_IN_BYTES); + ClusteredDataWriter writer = + new ClusteredDataWriter<>(writerFactory, fileFactory, io, TARGET_FILE_SIZE_IN_BYTES); try (ClusteredDataWriter closeableWriter = writer) { for (InternalRow row : rows) { @@ -156,13 +155,14 @@ public void writeUnpartitionedLegacyDataWriter(Blackhole blackhole) throws IOExc Schema writeSchema = table().schema(); StructType sparkWriteType = SparkSchemaUtil.convert(writeSchema); - SparkAppenderFactory appenders = SparkAppenderFactory.builderFor(table(), writeSchema, sparkWriteType) - 
.spec(unpartitionedSpec) - .build(); + SparkAppenderFactory appenders = + SparkAppenderFactory.builderFor(table(), writeSchema, sparkWriteType) + .spec(unpartitionedSpec) + .build(); - TaskWriter writer = new UnpartitionedWriter<>( - unpartitionedSpec, fileFormat(), appenders, - fileFactory, io, TARGET_FILE_SIZE_IN_BYTES); + TaskWriter writer = + new UnpartitionedWriter<>( + unpartitionedSpec, fileFormat(), appenders, fileFactory, io, TARGET_FILE_SIZE_IN_BYTES); try (TaskWriter closableWriter = writer) { for (InternalRow row : rows) { @@ -179,13 +179,14 @@ public void writePartitionedClusteredDataWriter(Blackhole blackhole) throws IOEx FileIO io = table().io(); OutputFileFactory fileFactory = newFileFactory(); - SparkFileWriterFactory writerFactory = SparkFileWriterFactory.builderFor(table()) - .dataFileFormat(fileFormat()) - .dataSchema(table().schema()) - .build(); + SparkFileWriterFactory writerFactory = + SparkFileWriterFactory.builderFor(table()) + .dataFileFormat(fileFormat()) + .dataSchema(table().schema()) + .build(); - ClusteredDataWriter writer = new ClusteredDataWriter<>( - writerFactory, fileFactory, io, TARGET_FILE_SIZE_IN_BYTES); + ClusteredDataWriter writer = + new ClusteredDataWriter<>(writerFactory, fileFactory, io, TARGET_FILE_SIZE_IN_BYTES); PartitionKey partitionKey = new PartitionKey(partitionedSpec, table().schema()); StructType dataSparkType = SparkSchemaUtil.convert(table().schema()); @@ -210,14 +211,21 @@ public void writePartitionedLegacyDataWriter(Blackhole blackhole) throws IOExcep Schema writeSchema = table().schema(); StructType sparkWriteType = SparkSchemaUtil.convert(writeSchema); - SparkAppenderFactory appenders = SparkAppenderFactory.builderFor(table(), writeSchema, sparkWriteType) - .spec(partitionedSpec) - .build(); - - TaskWriter writer = new SparkPartitionedWriter( - partitionedSpec, fileFormat(), appenders, - fileFactory, io, TARGET_FILE_SIZE_IN_BYTES, - writeSchema, sparkWriteType); + SparkAppenderFactory appenders = + SparkAppenderFactory.builderFor(table(), writeSchema, sparkWriteType) + .spec(partitionedSpec) + .build(); + + TaskWriter writer = + new SparkPartitionedWriter( + partitionedSpec, + fileFormat(), + appenders, + fileFactory, + io, + TARGET_FILE_SIZE_IN_BYTES, + writeSchema, + sparkWriteType); try (TaskWriter closableWriter = writer) { for (InternalRow row : rows) { @@ -234,13 +242,14 @@ public void writePartitionedFanoutDataWriter(Blackhole blackhole) throws IOExcep FileIO io = table().io(); OutputFileFactory fileFactory = newFileFactory(); - SparkFileWriterFactory writerFactory = SparkFileWriterFactory.builderFor(table()) - .dataFileFormat(fileFormat()) - .dataSchema(table().schema()) - .build(); + SparkFileWriterFactory writerFactory = + SparkFileWriterFactory.builderFor(table()) + .dataFileFormat(fileFormat()) + .dataSchema(table().schema()) + .build(); - FanoutDataWriter writer = new FanoutDataWriter<>( - writerFactory, fileFactory, io, TARGET_FILE_SIZE_IN_BYTES); + FanoutDataWriter writer = + new FanoutDataWriter<>(writerFactory, fileFactory, io, TARGET_FILE_SIZE_IN_BYTES); PartitionKey partitionKey = new PartitionKey(partitionedSpec, table().schema()); StructType dataSparkType = SparkSchemaUtil.convert(table().schema()); @@ -265,14 +274,21 @@ public void writePartitionedLegacyFanoutDataWriter(Blackhole blackhole) throws I Schema writeSchema = table().schema(); StructType sparkWriteType = SparkSchemaUtil.convert(writeSchema); - SparkAppenderFactory appenders = SparkAppenderFactory.builderFor(table(), writeSchema, sparkWriteType) - 
.spec(partitionedSpec) - .build(); - - TaskWriter writer = new SparkPartitionedFanoutWriter( - partitionedSpec, fileFormat(), appenders, - fileFactory, io, TARGET_FILE_SIZE_IN_BYTES, - writeSchema, sparkWriteType); + SparkAppenderFactory appenders = + SparkAppenderFactory.builderFor(table(), writeSchema, sparkWriteType) + .spec(partitionedSpec) + .build(); + + TaskWriter writer = + new SparkPartitionedFanoutWriter( + partitionedSpec, + fileFormat(), + appenders, + fileFactory, + io, + TARGET_FILE_SIZE_IN_BYTES, + writeSchema, + sparkWriteType); try (TaskWriter closableWriter = writer) { for (InternalRow row : rows) { @@ -285,20 +301,23 @@ partitionedSpec, fileFormat(), appenders, @Benchmark @Threads(1) - public void writePartitionedClusteredEqualityDeleteWriter(Blackhole blackhole) throws IOException { + public void writePartitionedClusteredEqualityDeleteWriter(Blackhole blackhole) + throws IOException { FileIO io = table().io(); int equalityFieldId = table().schema().findField("longCol").fieldId(); OutputFileFactory fileFactory = newFileFactory(); - SparkFileWriterFactory writerFactory = SparkFileWriterFactory.builderFor(table()) - .dataFileFormat(fileFormat()) - .equalityDeleteRowSchema(table().schema()) - .equalityFieldIds(new int[]{equalityFieldId}) - .build(); + SparkFileWriterFactory writerFactory = + SparkFileWriterFactory.builderFor(table()) + .dataFileFormat(fileFormat()) + .equalityDeleteRowSchema(table().schema()) + .equalityFieldIds(new int[] {equalityFieldId}) + .build(); - ClusteredEqualityDeleteWriter writer = new ClusteredEqualityDeleteWriter<>( - writerFactory, fileFactory, io, TARGET_FILE_SIZE_IN_BYTES); + ClusteredEqualityDeleteWriter writer = + new ClusteredEqualityDeleteWriter<>( + writerFactory, fileFactory, io, TARGET_FILE_SIZE_IN_BYTES); PartitionKey partitionKey = new PartitionKey(partitionedSpec, table().schema()); StructType deleteSparkType = SparkSchemaUtil.convert(table().schema()); @@ -316,16 +335,17 @@ public void writePartitionedClusteredEqualityDeleteWriter(Blackhole blackhole) t @Benchmark @Threads(1) - public void writeUnpartitionedClusteredPositionDeleteWriter(Blackhole blackhole) throws IOException { + public void writeUnpartitionedClusteredPositionDeleteWriter(Blackhole blackhole) + throws IOException { FileIO io = table().io(); OutputFileFactory fileFactory = newFileFactory(); - SparkFileWriterFactory writerFactory = SparkFileWriterFactory.builderFor(table()) - .dataFileFormat(fileFormat()) - .build(); + SparkFileWriterFactory writerFactory = + SparkFileWriterFactory.builderFor(table()).dataFileFormat(fileFormat()).build(); - ClusteredPositionDeleteWriter writer = new ClusteredPositionDeleteWriter<>( - writerFactory, fileFactory, io, TARGET_FILE_SIZE_IN_BYTES); + ClusteredPositionDeleteWriter writer = + new ClusteredPositionDeleteWriter<>( + writerFactory, fileFactory, io, TARGET_FILE_SIZE_IN_BYTES); PositionDelete positionDelete = PositionDelete.create(); try (ClusteredPositionDeleteWriter closeableWriter = writer) { @@ -341,8 +361,6 @@ public void writeUnpartitionedClusteredPositionDeleteWriter(Blackhole blackhole) } private OutputFileFactory newFileFactory() { - return OutputFileFactory.builderFor(table(), 1, 1) - .format(fileFormat()) - .build(); + return OutputFileFactory.builderFor(table(), 1, 1).format(fileFormat()).build(); } } diff --git a/spark/v3.1/spark/src/jmh/java/org/apache/iceberg/spark/source/avro/AvroWritersBenchmark.java b/spark/v3.1/spark/src/jmh/java/org/apache/iceberg/spark/source/avro/AvroWritersBenchmark.java index 
c62ba5376741..ae2c15be304b 100644 --- a/spark/v3.1/spark/src/jmh/java/org/apache/iceberg/spark/source/avro/AvroWritersBenchmark.java +++ b/spark/v3.1/spark/src/jmh/java/org/apache/iceberg/spark/source/avro/AvroWritersBenchmark.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.source.avro; import org.apache.iceberg.FileFormat; @@ -25,8 +24,7 @@ /** * A benchmark that evaluates the performance of various Iceberg writers for Avro data. * - * To run this benchmark for spark-3.1: - * + *
    To run this benchmark for spark-3.1: * ./gradlew -DsparkVersions=3.1 :iceberg-spark:iceberg-spark-3.1_2.12:jmh * -PjmhIncludeRegex=AvroWritersBenchmark * -PjmhOutputPath=benchmark/avro-writers-benchmark-result.txt diff --git a/spark/v3.1/spark/src/jmh/java/org/apache/iceberg/spark/source/avro/IcebergSourceFlatAvroDataReadBenchmark.java b/spark/v3.1/spark/src/jmh/java/org/apache/iceberg/spark/source/avro/IcebergSourceFlatAvroDataReadBenchmark.java index 8439bc6b4fc9..19279e77cf89 100644 --- a/spark/v3.1/spark/src/jmh/java/org/apache/iceberg/spark/source/avro/IcebergSourceFlatAvroDataReadBenchmark.java +++ b/spark/v3.1/spark/src/jmh/java/org/apache/iceberg/spark/source/avro/IcebergSourceFlatAvroDataReadBenchmark.java @@ -16,9 +16,14 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.source.avro; +import static org.apache.iceberg.TableProperties.DEFAULT_FILE_FORMAT; +import static org.apache.iceberg.TableProperties.SPLIT_OPEN_FILE_COST; +import static org.apache.spark.sql.functions.current_date; +import static org.apache.spark.sql.functions.date_add; +import static org.apache.spark.sql.functions.expr; + import java.io.IOException; import java.util.Map; import org.apache.iceberg.relocated.com.google.common.collect.Maps; @@ -31,18 +36,11 @@ import org.openjdk.jmh.annotations.TearDown; import org.openjdk.jmh.annotations.Threads; -import static org.apache.iceberg.TableProperties.DEFAULT_FILE_FORMAT; -import static org.apache.iceberg.TableProperties.SPLIT_OPEN_FILE_COST; -import static org.apache.spark.sql.functions.current_date; -import static org.apache.spark.sql.functions.date_add; -import static org.apache.spark.sql.functions.expr; - /** - * A benchmark that evaluates the performance of reading Avro data with a flat schema - * using Iceberg and the built-in file source in Spark. + * A benchmark that evaluates the performance of reading Avro data with a flat schema using Iceberg + * and the built-in file source in Spark. * - * To run this benchmark for spark-3.1: - * + *
    To run this benchmark for spark-3.1: * ./gradlew -DsparkVersions=3.1 :iceberg-spark:iceberg-spark-3.1_2.12:jmh * -PjmhIncludeRegex=IcebergSourceFlatAvroDataReadBenchmark * -PjmhOutputPath=benchmark/iceberg-source-flat-avro-data-read-benchmark-result.txt @@ -70,11 +68,13 @@ public void tearDownBenchmark() throws IOException { public void readIceberg() { Map tableProperties = Maps.newHashMap(); tableProperties.put(SPLIT_OPEN_FILE_COST, Integer.toString(128 * 1024 * 1024)); - withTableProperties(tableProperties, () -> { - String tableLocation = table().location(); - Dataset df = spark().read().format("iceberg").load(tableLocation); - materialize(df); - }); + withTableProperties( + tableProperties, + () -> { + String tableLocation = table().location(); + Dataset df = spark().read().format("iceberg").load(tableLocation); + materialize(df); + }); } @Benchmark @@ -82,10 +82,12 @@ public void readIceberg() { public void readFileSource() { Map conf = Maps.newHashMap(); conf.put(SQLConf.FILES_OPEN_COST_IN_BYTES().key(), Integer.toString(128 * 1024 * 1024)); - withSQLConf(conf, () -> { - Dataset df = spark().read().format("avro").load(dataLocation()); - materialize(df); - }); + withSQLConf( + conf, + () -> { + Dataset df = spark().read().format("avro").load(dataLocation()); + materialize(df); + }); } @Benchmark @@ -93,11 +95,13 @@ public void readFileSource() { public void readWithProjectionIceberg() { Map tableProperties = Maps.newHashMap(); tableProperties.put(SPLIT_OPEN_FILE_COST, Integer.toString(128 * 1024 * 1024)); - withTableProperties(tableProperties, () -> { - String tableLocation = table().location(); - Dataset df = spark().read().format("iceberg").load(tableLocation).select("longCol"); - materialize(df); - }); + withTableProperties( + tableProperties, + () -> { + String tableLocation = table().location(); + Dataset df = spark().read().format("iceberg").load(tableLocation).select("longCol"); + materialize(df); + }); } @Benchmark @@ -105,28 +109,34 @@ public void readWithProjectionIceberg() { public void readWithProjectionFileSource() { Map conf = Maps.newHashMap(); conf.put(SQLConf.FILES_OPEN_COST_IN_BYTES().key(), Integer.toString(128 * 1024 * 1024)); - withSQLConf(conf, () -> { - Dataset df = spark().read().format("avro").load(dataLocation()).select("longCol"); - materialize(df); - }); + withSQLConf( + conf, + () -> { + Dataset df = spark().read().format("avro").load(dataLocation()).select("longCol"); + materialize(df); + }); } private void appendData() { Map tableProperties = Maps.newHashMap(); tableProperties.put(DEFAULT_FILE_FORMAT, "avro"); - withTableProperties(tableProperties, () -> { - for (int fileNum = 1; fileNum <= NUM_FILES; fileNum++) { - Dataset df = spark().range(NUM_ROWS) - .withColumnRenamed("id", "longCol") - .withColumn("intCol", expr("CAST(longCol AS INT)")) - .withColumn("floatCol", expr("CAST(longCol AS FLOAT)")) - .withColumn("doubleCol", expr("CAST(longCol AS DOUBLE)")) - .withColumn("decimalCol", expr("CAST(longCol AS DECIMAL(20, 5))")) - .withColumn("dateCol", date_add(current_date(), fileNum)) - .withColumn("timestampCol", expr("TO_TIMESTAMP(dateCol)")) - .withColumn("stringCol", expr("CAST(dateCol AS STRING)")); - appendAsFile(df); - } - }); + withTableProperties( + tableProperties, + () -> { + for (int fileNum = 1; fileNum <= NUM_FILES; fileNum++) { + Dataset df = + spark() + .range(NUM_ROWS) + .withColumnRenamed("id", "longCol") + .withColumn("intCol", expr("CAST(longCol AS INT)")) + .withColumn("floatCol", expr("CAST(longCol AS FLOAT)")) + 
.withColumn("doubleCol", expr("CAST(longCol AS DOUBLE)")) + .withColumn("decimalCol", expr("CAST(longCol AS DECIMAL(20, 5))")) + .withColumn("dateCol", date_add(current_date(), fileNum)) + .withColumn("timestampCol", expr("TO_TIMESTAMP(dateCol)")) + .withColumn("stringCol", expr("CAST(dateCol AS STRING)")); + appendAsFile(df); + } + }); } } diff --git a/spark/v3.1/spark/src/jmh/java/org/apache/iceberg/spark/source/avro/IcebergSourceNestedAvroDataReadBenchmark.java b/spark/v3.1/spark/src/jmh/java/org/apache/iceberg/spark/source/avro/IcebergSourceNestedAvroDataReadBenchmark.java index 04ee0b1ec3c6..9b23209192d5 100644 --- a/spark/v3.1/spark/src/jmh/java/org/apache/iceberg/spark/source/avro/IcebergSourceNestedAvroDataReadBenchmark.java +++ b/spark/v3.1/spark/src/jmh/java/org/apache/iceberg/spark/source/avro/IcebergSourceNestedAvroDataReadBenchmark.java @@ -16,9 +16,14 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.source.avro; +import static org.apache.iceberg.TableProperties.DEFAULT_FILE_FORMAT; +import static org.apache.iceberg.TableProperties.SPLIT_OPEN_FILE_COST; +import static org.apache.spark.sql.functions.expr; +import static org.apache.spark.sql.functions.lit; +import static org.apache.spark.sql.functions.struct; + import java.io.IOException; import java.util.Map; import org.apache.iceberg.relocated.com.google.common.collect.Maps; @@ -31,19 +36,11 @@ import org.openjdk.jmh.annotations.TearDown; import org.openjdk.jmh.annotations.Threads; -import static org.apache.iceberg.TableProperties.DEFAULT_FILE_FORMAT; -import static org.apache.iceberg.TableProperties.SPLIT_OPEN_FILE_COST; -import static org.apache.spark.sql.functions.expr; -import static org.apache.spark.sql.functions.lit; -import static org.apache.spark.sql.functions.struct; - - /** - * A benchmark that evaluates the performance of reading Avro data with a flat schema - * using Iceberg and the built-in file source in Spark. + * A benchmark that evaluates the performance of reading Avro data with a flat schema using Iceberg + * and the built-in file source in Spark. * - * To run this benchmark for spark-3.1: - * + *
    To run this benchmark for spark-3.1: * ./gradlew -DsparkVersions=3.1 :iceberg-spark:iceberg-spark-3.1_2.12:jmh * -PjmhIncludeRegex=IcebergSourceNestedAvroDataReadBenchmark * -PjmhOutputPath=benchmark/iceberg-source-nested-avro-data-read-benchmark-result.txt @@ -71,11 +68,13 @@ public void tearDownBenchmark() throws IOException { public void readIceberg() { Map tableProperties = Maps.newHashMap(); tableProperties.put(SPLIT_OPEN_FILE_COST, Integer.toString(128 * 1024 * 1024)); - withTableProperties(tableProperties, () -> { - String tableLocation = table().location(); - Dataset df = spark().read().format("iceberg").load(tableLocation); - materialize(df); - }); + withTableProperties( + tableProperties, + () -> { + String tableLocation = table().location(); + Dataset df = spark().read().format("iceberg").load(tableLocation); + materialize(df); + }); } @Benchmark @@ -83,10 +82,12 @@ public void readIceberg() { public void readFileSource() { Map conf = Maps.newHashMap(); conf.put(SQLConf.FILES_OPEN_COST_IN_BYTES().key(), Integer.toString(128 * 1024 * 1024)); - withSQLConf(conf, () -> { - Dataset df = spark().read().format("avro").load(dataLocation()); - materialize(df); - }); + withSQLConf( + conf, + () -> { + Dataset df = spark().read().format("avro").load(dataLocation()); + materialize(df); + }); } @Benchmark @@ -94,11 +95,14 @@ public void readFileSource() { public void readWithProjectionIceberg() { Map tableProperties = Maps.newHashMap(); tableProperties.put(SPLIT_OPEN_FILE_COST, Integer.toString(128 * 1024 * 1024)); - withTableProperties(tableProperties, () -> { - String tableLocation = table().location(); - Dataset df = spark().read().format("iceberg").load(tableLocation).select("nested.col3"); - materialize(df); - }); + withTableProperties( + tableProperties, + () -> { + String tableLocation = table().location(); + Dataset df = + spark().read().format("iceberg").load(tableLocation).select("nested.col3"); + materialize(df); + }); } @Benchmark @@ -106,27 +110,33 @@ public void readWithProjectionIceberg() { public void readWithProjectionFileSource() { Map conf = Maps.newHashMap(); conf.put(SQLConf.FILES_OPEN_COST_IN_BYTES().key(), Integer.toString(128 * 1024 * 1024)); - withSQLConf(conf, () -> { - Dataset df = spark().read().format("avro").load(dataLocation()).select("nested.col3"); - materialize(df); - }); + withSQLConf( + conf, + () -> { + Dataset df = + spark().read().format("avro").load(dataLocation()).select("nested.col3"); + materialize(df); + }); } private void appendData() { Map tableProperties = Maps.newHashMap(); tableProperties.put(DEFAULT_FILE_FORMAT, "avro"); - withTableProperties(tableProperties, () -> { - for (int fileNum = 1; fileNum <= NUM_FILES; fileNum++) { - Dataset df = spark().range(NUM_ROWS) - .withColumn( - "nested", - struct( - expr("CAST(id AS string) AS col1"), - expr("CAST(id AS double) AS col2"), - lit(fileNum).cast("long").as("col3") - )); - appendAsFile(df); - } - }); + withTableProperties( + tableProperties, + () -> { + for (int fileNum = 1; fileNum <= NUM_FILES; fileNum++) { + Dataset df = + spark() + .range(NUM_ROWS) + .withColumn( + "nested", + struct( + expr("CAST(id AS string) AS col1"), + expr("CAST(id AS double) AS col2"), + lit(fileNum).cast("long").as("col3"))); + appendAsFile(df); + } + }); } } diff --git a/spark/v3.1/spark/src/jmh/java/org/apache/iceberg/spark/source/orc/IcebergSourceFlatORCDataBenchmark.java b/spark/v3.1/spark/src/jmh/java/org/apache/iceberg/spark/source/orc/IcebergSourceFlatORCDataBenchmark.java index 329c9ffe7738..d0fdd8915780 
100644 --- a/spark/v3.1/spark/src/jmh/java/org/apache/iceberg/spark/source/orc/IcebergSourceFlatORCDataBenchmark.java +++ b/spark/v3.1/spark/src/jmh/java/org/apache/iceberg/spark/source/orc/IcebergSourceFlatORCDataBenchmark.java @@ -16,9 +16,11 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.source.orc; +import static org.apache.iceberg.types.Types.NestedField.optional; +import static org.apache.iceberg.types.Types.NestedField.required; + import java.util.Map; import org.apache.hadoop.conf.Configuration; import org.apache.iceberg.PartitionSpec; @@ -30,13 +32,10 @@ import org.apache.iceberg.spark.source.IcebergSourceBenchmark; import org.apache.iceberg.types.Types; -import static org.apache.iceberg.types.Types.NestedField.optional; -import static org.apache.iceberg.types.Types.NestedField.required; - - /** - * Same as {@link org.apache.iceberg.spark.source.IcebergSourceFlatDataBenchmark} but we disable the Timestamp with - * zone type for ORC performance tests as Spark native reader does not support ORC's TIMESTAMP_INSTANT type + * Same as {@link org.apache.iceberg.spark.source.IcebergSourceFlatDataBenchmark} but we disable the + * Timestamp with zone type for ORC performance tests as Spark native reader does not support ORC's + * TIMESTAMP_INSTANT type */ public abstract class IcebergSourceFlatORCDataBenchmark extends IcebergSourceBenchmark { @@ -47,17 +46,19 @@ protected Configuration initHadoopConf() { @Override protected final Table initTable() { - Schema schema = new Schema( - required(1, "longCol", Types.LongType.get()), - required(2, "intCol", Types.IntegerType.get()), - required(3, "floatCol", Types.FloatType.get()), - optional(4, "doubleCol", Types.DoubleType.get()), - optional(5, "decimalCol", Types.DecimalType.of(20, 5)), - optional(6, "dateCol", Types.DateType.get()), - // Disable timestamp column for ORC performance tests as Spark native reader does not support ORC's - // TIMESTAMP_INSTANT type - // optional(7, "timestampCol", Types.TimestampType.withZone()), - optional(8, "stringCol", Types.StringType.get())); + Schema schema = + new Schema( + required(1, "longCol", Types.LongType.get()), + required(2, "intCol", Types.IntegerType.get()), + required(3, "floatCol", Types.FloatType.get()), + optional(4, "doubleCol", Types.DoubleType.get()), + optional(5, "decimalCol", Types.DecimalType.of(20, 5)), + optional(6, "dateCol", Types.DateType.get()), + // Disable timestamp column for ORC performance tests as Spark native reader does not + // support ORC's + // TIMESTAMP_INSTANT type + // optional(7, "timestampCol", Types.TimestampType.withZone()), + optional(8, "stringCol", Types.StringType.get())); PartitionSpec partitionSpec = PartitionSpec.unpartitioned(); HadoopTables tables = new HadoopTables(hadoopConf()); Map properties = Maps.newHashMap(); diff --git a/spark/v3.1/spark/src/jmh/java/org/apache/iceberg/spark/source/orc/IcebergSourceFlatORCDataReadBenchmark.java b/spark/v3.1/spark/src/jmh/java/org/apache/iceberg/spark/source/orc/IcebergSourceFlatORCDataReadBenchmark.java index 1cf213a524d3..292fcc6ee9f1 100644 --- a/spark/v3.1/spark/src/jmh/java/org/apache/iceberg/spark/source/orc/IcebergSourceFlatORCDataReadBenchmark.java +++ b/spark/v3.1/spark/src/jmh/java/org/apache/iceberg/spark/source/orc/IcebergSourceFlatORCDataReadBenchmark.java @@ -16,9 +16,14 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.spark.source.orc; +import static org.apache.iceberg.TableProperties.DEFAULT_FILE_FORMAT; +import static org.apache.iceberg.TableProperties.SPLIT_OPEN_FILE_COST; +import static org.apache.spark.sql.functions.current_date; +import static org.apache.spark.sql.functions.date_add; +import static org.apache.spark.sql.functions.expr; + import java.io.IOException; import java.util.Map; import org.apache.iceberg.relocated.com.google.common.collect.Maps; @@ -31,18 +36,11 @@ import org.openjdk.jmh.annotations.TearDown; import org.openjdk.jmh.annotations.Threads; -import static org.apache.iceberg.TableProperties.DEFAULT_FILE_FORMAT; -import static org.apache.iceberg.TableProperties.SPLIT_OPEN_FILE_COST; -import static org.apache.spark.sql.functions.current_date; -import static org.apache.spark.sql.functions.date_add; -import static org.apache.spark.sql.functions.expr; - /** - * A benchmark that evaluates the performance of reading ORC data with a flat schema - * using Iceberg and the built-in file source in Spark. + * A benchmark that evaluates the performance of reading ORC data with a flat schema using Iceberg + * and the built-in file source in Spark. * - * To run this benchmark for spark-3.1: - * + *
    To run this benchmark for spark-3.1: * ./gradlew -DsparkVersions=3.1 :iceberg-spark:iceberg-spark-3.1_2.12:jmh * -PjmhIncludeRegex=IcebergSourceFlatORCDataReadBenchmark * -PjmhOutputPath=benchmark/iceberg-source-flat-orc-data-read-benchmark-result.txt @@ -70,11 +68,13 @@ public void tearDownBenchmark() throws IOException { public void readIcebergNonVectorized() { Map tableProperties = Maps.newHashMap(); tableProperties.put(SPLIT_OPEN_FILE_COST, Integer.toString(128 * 1024 * 1024)); - withTableProperties(tableProperties, () -> { - String tableLocation = table().location(); - Dataset df = spark().read().format("iceberg").load(tableLocation); - materialize(df); - }); + withTableProperties( + tableProperties, + () -> { + String tableLocation = table().location(); + Dataset df = spark().read().format("iceberg").load(tableLocation); + materialize(df); + }); } @Benchmark @@ -82,12 +82,18 @@ public void readIcebergNonVectorized() { public void readIcebergVectorized() { Map tableProperties = Maps.newHashMap(); tableProperties.put(SPLIT_OPEN_FILE_COST, Integer.toString(128 * 1024 * 1024)); - withTableProperties(tableProperties, () -> { - String tableLocation = table().location(); - Dataset df = spark().read().option(SparkReadOptions.VECTORIZATION_ENABLED, "true") - .format("iceberg").load(tableLocation); - materialize(df); - }); + withTableProperties( + tableProperties, + () -> { + String tableLocation = table().location(); + Dataset df = + spark() + .read() + .option(SparkReadOptions.VECTORIZATION_ENABLED, "true") + .format("iceberg") + .load(tableLocation); + materialize(df); + }); } @Benchmark @@ -96,10 +102,12 @@ public void readFileSourceVectorized() { Map conf = Maps.newHashMap(); conf.put(SQLConf.ORC_VECTORIZED_READER_ENABLED().key(), "true"); conf.put(SQLConf.FILES_OPEN_COST_IN_BYTES().key(), Integer.toString(128 * 1024 * 1024)); - withSQLConf(conf, () -> { - Dataset df = spark().read().orc(dataLocation()); - materialize(df); - }); + withSQLConf( + conf, + () -> { + Dataset df = spark().read().orc(dataLocation()); + materialize(df); + }); } @Benchmark @@ -108,10 +116,12 @@ public void readFileSourceNonVectorized() { Map conf = Maps.newHashMap(); conf.put(SQLConf.ORC_VECTORIZED_READER_ENABLED().key(), "false"); conf.put(SQLConf.FILES_OPEN_COST_IN_BYTES().key(), Integer.toString(128 * 1024 * 1024)); - withSQLConf(conf, () -> { - Dataset df = spark().read().orc(dataLocation()); - materialize(df); - }); + withSQLConf( + conf, + () -> { + Dataset df = spark().read().orc(dataLocation()); + materialize(df); + }); } @Benchmark @@ -119,11 +129,13 @@ public void readFileSourceNonVectorized() { public void readWithProjectionIcebergNonVectorized() { Map tableProperties = Maps.newHashMap(); tableProperties.put(SPLIT_OPEN_FILE_COST, Integer.toString(128 * 1024 * 1024)); - withTableProperties(tableProperties, () -> { - String tableLocation = table().location(); - Dataset df = spark().read().format("iceberg").load(tableLocation).select("longCol"); - materialize(df); - }); + withTableProperties( + tableProperties, + () -> { + String tableLocation = table().location(); + Dataset df = spark().read().format("iceberg").load(tableLocation).select("longCol"); + materialize(df); + }); } @Benchmark @@ -131,25 +143,33 @@ public void readWithProjectionIcebergNonVectorized() { public void readWithProjectionIcebergVectorized() { Map tableProperties = Maps.newHashMap(); tableProperties.put(SPLIT_OPEN_FILE_COST, Integer.toString(128 * 1024 * 1024)); - withTableProperties(tableProperties, () -> { - String tableLocation 
= table().location(); - Dataset df = spark().read().option(SparkReadOptions.VECTORIZATION_ENABLED, "true") - .format("iceberg").load(tableLocation).select("longCol"); - materialize(df); - }); + withTableProperties( + tableProperties, + () -> { + String tableLocation = table().location(); + Dataset df = + spark() + .read() + .option(SparkReadOptions.VECTORIZATION_ENABLED, "true") + .format("iceberg") + .load(tableLocation) + .select("longCol"); + materialize(df); + }); } - @Benchmark @Threads(1) public void readWithProjectionFileSourceVectorized() { Map conf = Maps.newHashMap(); conf.put(SQLConf.ORC_VECTORIZED_READER_ENABLED().key(), "true"); conf.put(SQLConf.FILES_OPEN_COST_IN_BYTES().key(), Integer.toString(128 * 1024 * 1024)); - withSQLConf(conf, () -> { - Dataset df = spark().read().orc(dataLocation()).select("longCol"); - materialize(df); - }); + withSQLConf( + conf, + () -> { + Dataset df = spark().read().orc(dataLocation()).select("longCol"); + materialize(df); + }); } @Benchmark @@ -158,27 +178,33 @@ public void readWithProjectionFileSourceNonVectorized() { Map conf = Maps.newHashMap(); conf.put(SQLConf.ORC_VECTORIZED_READER_ENABLED().key(), "false"); conf.put(SQLConf.FILES_OPEN_COST_IN_BYTES().key(), Integer.toString(128 * 1024 * 1024)); - withSQLConf(conf, () -> { - Dataset df = spark().read().orc(dataLocation()).select("longCol"); - materialize(df); - }); + withSQLConf( + conf, + () -> { + Dataset df = spark().read().orc(dataLocation()).select("longCol"); + materialize(df); + }); } private void appendData() { Map tableProperties = Maps.newHashMap(); tableProperties.put(DEFAULT_FILE_FORMAT, "orc"); - withTableProperties(tableProperties, () -> { - for (int fileNum = 1; fileNum <= NUM_FILES; fileNum++) { - Dataset df = spark().range(NUM_ROWS) - .withColumnRenamed("id", "longCol") - .withColumn("intCol", expr("CAST(longCol AS INT)")) - .withColumn("floatCol", expr("CAST(longCol AS FLOAT)")) - .withColumn("doubleCol", expr("CAST(longCol AS DOUBLE)")) - .withColumn("decimalCol", expr("CAST(longCol AS DECIMAL(20, 5))")) - .withColumn("dateCol", date_add(current_date(), fileNum)) - .withColumn("stringCol", expr("CAST(dateCol AS STRING)")); - appendAsFile(df); - } - }); + withTableProperties( + tableProperties, + () -> { + for (int fileNum = 1; fileNum <= NUM_FILES; fileNum++) { + Dataset df = + spark() + .range(NUM_ROWS) + .withColumnRenamed("id", "longCol") + .withColumn("intCol", expr("CAST(longCol AS INT)")) + .withColumn("floatCol", expr("CAST(longCol AS FLOAT)")) + .withColumn("doubleCol", expr("CAST(longCol AS DOUBLE)")) + .withColumn("decimalCol", expr("CAST(longCol AS DECIMAL(20, 5))")) + .withColumn("dateCol", date_add(current_date(), fileNum)) + .withColumn("stringCol", expr("CAST(dateCol AS STRING)")); + appendAsFile(df); + } + }); } } diff --git a/spark/v3.1/spark/src/jmh/java/org/apache/iceberg/spark/source/orc/IcebergSourceNestedListORCDataWriteBenchmark.java b/spark/v3.1/spark/src/jmh/java/org/apache/iceberg/spark/source/orc/IcebergSourceNestedListORCDataWriteBenchmark.java index b7723eefa503..dd4cd9961acc 100644 --- a/spark/v3.1/spark/src/jmh/java/org/apache/iceberg/spark/source/orc/IcebergSourceNestedListORCDataWriteBenchmark.java +++ b/spark/v3.1/spark/src/jmh/java/org/apache/iceberg/spark/source/orc/IcebergSourceNestedListORCDataWriteBenchmark.java @@ -16,9 +16,12 @@ * specific language governing permissions and limitations * under the License. 
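The vectorized and non-vectorized read variants above differ only in where the switch lives: Iceberg takes a per-read source option, while Spark's built-in ORC reader is toggled through SQLConf and wrapped in withSQLConf. A minimal sketch of the two setups, assuming an active SparkSession and a tableLocation/dataLocation pair like the ones the benchmark derives from its table:

import java.util.Map;
import org.apache.iceberg.relocated.com.google.common.collect.Maps;
import org.apache.iceberg.spark.SparkReadOptions;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SparkSession;
import org.apache.spark.sql.internal.SQLConf;

public class OrcReadPaths {

  // Iceberg's vectorized ORC path is enabled per read via a source option
  public static Dataset<Row> readIcebergVectorized(SparkSession spark, String tableLocation) {
    return spark
        .read()
        .option(SparkReadOptions.VECTORIZATION_ENABLED, "true")
        .format("iceberg")
        .load(tableLocation);
  }

  // Spark's native ORC reader is controlled through session SQL configs instead,
  // so the benchmark applies these keys inside withSQLConf before materializing the scan
  public static Map<String, String> fileSourceVectorizedConf() {
    Map<String, String> conf = Maps.newHashMap();
    conf.put(SQLConf.ORC_VECTORIZED_READER_ENABLED().key(), "true");
    conf.put(SQLConf.FILES_OPEN_COST_IN_BYTES().key(), Integer.toString(128 * 1024 * 1024));
    return conf;
  }
}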
*/ - package org.apache.iceberg.spark.source.orc; +import static org.apache.spark.sql.functions.array_repeat; +import static org.apache.spark.sql.functions.expr; +import static org.apache.spark.sql.functions.struct; + import java.io.IOException; import java.util.Map; import org.apache.iceberg.relocated.com.google.common.collect.Maps; @@ -32,22 +35,18 @@ import org.openjdk.jmh.annotations.TearDown; import org.openjdk.jmh.annotations.Threads; -import static org.apache.spark.sql.functions.array_repeat; -import static org.apache.spark.sql.functions.expr; -import static org.apache.spark.sql.functions.struct; - /** - * A benchmark that evaluates the performance of writing nested Parquet data using Iceberg - * and the built-in file source in Spark. + * A benchmark that evaluates the performance of writing nested Parquet data using Iceberg and the + * built-in file source in Spark. * - * To run this benchmark for spark-3.1: - * + *

    To run this benchmark for spark-3.1: * ./gradlew -DsparkVersions=3.1 :iceberg-spark:iceberg-spark-3.1_2.12:jmh * -PjmhIncludeRegex=IcebergSourceNestedListORCDataWriteBenchmark * -PjmhOutputPath=benchmark/iceberg-source-nested-list-orc-data-write-benchmark-result.txt * */ -public class IcebergSourceNestedListORCDataWriteBenchmark extends IcebergSourceNestedListDataBenchmark { +public class IcebergSourceNestedListORCDataWriteBenchmark + extends IcebergSourceNestedListDataBenchmark { @Setup public void setupBenchmark() { @@ -67,8 +66,12 @@ public void tearDownBenchmark() throws IOException { @Threads(1) public void writeIceberg() { String tableLocation = table().location(); - benchmarkData().write().format("iceberg").option("write-format", "orc") - .mode(SaveMode.Append).save(tableLocation); + benchmarkData() + .write() + .format("iceberg") + .option("write-format", "orc") + .mode(SaveMode.Append) + .save(tableLocation); } @Benchmark @@ -76,11 +79,17 @@ public void writeIceberg() { public void writeIcebergDictionaryOff() { Map tableProperties = Maps.newHashMap(); tableProperties.put("orc.dictionary.key.threshold", "0"); - withTableProperties(tableProperties, () -> { - String tableLocation = table().location(); - benchmarkData().write().format("iceberg").option("write-format", "orc") - .mode(SaveMode.Append).save(tableLocation); - }); + withTableProperties( + tableProperties, + () -> { + String tableLocation = table().location(); + benchmarkData() + .write() + .format("iceberg") + .option("write-format", "orc") + .mode(SaveMode.Append) + .save(tableLocation); + }); } @Benchmark @@ -90,10 +99,11 @@ public void writeFileSource() { } private Dataset benchmarkData() { - return spark().range(numRows) - .withColumn("outerlist", array_repeat(struct( - expr("array_repeat(CAST(id AS string), 1000) AS innerlist")), - 10)) + return spark() + .range(numRows) + .withColumn( + "outerlist", + array_repeat(struct(expr("array_repeat(CAST(id AS string), 1000) AS innerlist")), 10)) .coalesce(1); } } diff --git a/spark/v3.1/spark/src/jmh/java/org/apache/iceberg/spark/source/orc/IcebergSourceNestedORCDataReadBenchmark.java b/spark/v3.1/spark/src/jmh/java/org/apache/iceberg/spark/source/orc/IcebergSourceNestedORCDataReadBenchmark.java index c87824bc8704..cc6b837ff1e5 100644 --- a/spark/v3.1/spark/src/jmh/java/org/apache/iceberg/spark/source/orc/IcebergSourceNestedORCDataReadBenchmark.java +++ b/spark/v3.1/spark/src/jmh/java/org/apache/iceberg/spark/source/orc/IcebergSourceNestedORCDataReadBenchmark.java @@ -16,9 +16,14 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.spark.source.orc; +import static org.apache.iceberg.TableProperties.DEFAULT_FILE_FORMAT; +import static org.apache.iceberg.TableProperties.SPLIT_OPEN_FILE_COST; +import static org.apache.spark.sql.functions.expr; +import static org.apache.spark.sql.functions.lit; +import static org.apache.spark.sql.functions.struct; + import java.io.IOException; import java.util.Map; import org.apache.iceberg.relocated.com.google.common.collect.Maps; @@ -32,19 +37,11 @@ import org.openjdk.jmh.annotations.TearDown; import org.openjdk.jmh.annotations.Threads; -import static org.apache.iceberg.TableProperties.DEFAULT_FILE_FORMAT; -import static org.apache.iceberg.TableProperties.SPLIT_OPEN_FILE_COST; -import static org.apache.spark.sql.functions.expr; -import static org.apache.spark.sql.functions.lit; -import static org.apache.spark.sql.functions.struct; - - /** - * A benchmark that evaluates the performance of reading ORC data with a flat schema - * using Iceberg and the built-in file source in Spark. + * A benchmark that evaluates the performance of reading ORC data with a flat schema using Iceberg + * and the built-in file source in Spark. * - * To run this benchmark for spark-3.1: - * + *

    To run this benchmark for spark-3.1: * ./gradlew -DsparkVersions=3.1 :iceberg-spark:iceberg-spark-3.1_2.12:jmh * -PjmhIncludeRegex=IcebergSourceNestedORCDataReadBenchmark * -PjmhOutputPath=benchmark/iceberg-source-nested-orc-data-read-benchmark-result.txt @@ -72,11 +69,13 @@ public void tearDownBenchmark() throws IOException { public void readIcebergNonVectorized() { Map tableProperties = Maps.newHashMap(); tableProperties.put(SPLIT_OPEN_FILE_COST, Integer.toString(128 * 1024 * 1024)); - withTableProperties(tableProperties, () -> { - String tableLocation = table().location(); - Dataset df = spark().read().format("iceberg").load(tableLocation); - materialize(df); - }); + withTableProperties( + tableProperties, + () -> { + String tableLocation = table().location(); + Dataset df = spark().read().format("iceberg").load(tableLocation); + materialize(df); + }); } @Benchmark @@ -84,12 +83,18 @@ public void readIcebergNonVectorized() { public void readIcebergVectorized() { Map tableProperties = Maps.newHashMap(); tableProperties.put(SPLIT_OPEN_FILE_COST, Integer.toString(128 * 1024 * 1024)); - withTableProperties(tableProperties, () -> { - String tableLocation = table().location(); - Dataset df = spark().read().option(SparkReadOptions.VECTORIZATION_ENABLED, "true") - .format("iceberg").load(tableLocation); - materialize(df); - }); + withTableProperties( + tableProperties, + () -> { + String tableLocation = table().location(); + Dataset df = + spark() + .read() + .option(SparkReadOptions.VECTORIZATION_ENABLED, "true") + .format("iceberg") + .load(tableLocation); + materialize(df); + }); } @Benchmark @@ -98,10 +103,12 @@ public void readFileSourceNonVectorized() { Map conf = Maps.newHashMap(); conf.put(SQLConf.ORC_VECTORIZED_READER_ENABLED().key(), "false"); conf.put(SQLConf.FILES_OPEN_COST_IN_BYTES().key(), Integer.toString(128 * 1024 * 1024)); - withSQLConf(conf, () -> { - Dataset df = spark().read().orc(dataLocation()); - materialize(df); - }); + withSQLConf( + conf, + () -> { + Dataset df = spark().read().orc(dataLocation()); + materialize(df); + }); } @Benchmark @@ -109,11 +116,14 @@ public void readFileSourceNonVectorized() { public void readWithProjectionIcebergNonVectorized() { Map tableProperties = Maps.newHashMap(); tableProperties.put(SPLIT_OPEN_FILE_COST, Integer.toString(128 * 1024 * 1024)); - withTableProperties(tableProperties, () -> { - String tableLocation = table().location(); - Dataset df = spark().read().format("iceberg").load(tableLocation).selectExpr("nested.col3"); - materialize(df); - }); + withTableProperties( + tableProperties, + () -> { + String tableLocation = table().location(); + Dataset df = + spark().read().format("iceberg").load(tableLocation).selectExpr("nested.col3"); + materialize(df); + }); } @Benchmark @@ -121,12 +131,19 @@ public void readWithProjectionIcebergNonVectorized() { public void readWithProjectionIcebergVectorized() { Map tableProperties = Maps.newHashMap(); tableProperties.put(SPLIT_OPEN_FILE_COST, Integer.toString(128 * 1024 * 1024)); - withTableProperties(tableProperties, () -> { - String tableLocation = table().location(); - Dataset df = spark().read().option(SparkReadOptions.VECTORIZATION_ENABLED, "true") - .format("iceberg").load(tableLocation).selectExpr("nested.col3"); - materialize(df); - }); + withTableProperties( + tableProperties, + () -> { + String tableLocation = table().location(); + Dataset df = + spark() + .read() + .option(SparkReadOptions.VECTORIZATION_ENABLED, "true") + .format("iceberg") + .load(tableLocation) + 
.selectExpr("nested.col3"); + materialize(df); + }); } @Benchmark @@ -135,27 +152,32 @@ public void readWithProjectionFileSourceNonVectorized() { Map conf = Maps.newHashMap(); conf.put(SQLConf.ORC_VECTORIZED_READER_ENABLED().key(), "false"); conf.put(SQLConf.FILES_OPEN_COST_IN_BYTES().key(), Integer.toString(128 * 1024 * 1024)); - withSQLConf(conf, () -> { - Dataset df = spark().read().orc(dataLocation()).selectExpr("nested.col3"); - materialize(df); - }); + withSQLConf( + conf, + () -> { + Dataset df = spark().read().orc(dataLocation()).selectExpr("nested.col3"); + materialize(df); + }); } private void appendData() { Map tableProperties = Maps.newHashMap(); tableProperties.put(DEFAULT_FILE_FORMAT, "orc"); - withTableProperties(tableProperties, () -> { - for (int fileNum = 0; fileNum < NUM_FILES; fileNum++) { - Dataset df = spark().range(NUM_ROWS) - .withColumn( - "nested", - struct( - expr("CAST(id AS string) AS col1"), - expr("CAST(id AS double) AS col2"), - lit(fileNum).cast("long").as("col3") - )); - appendAsFile(df); - } - }); + withTableProperties( + tableProperties, + () -> { + for (int fileNum = 0; fileNum < NUM_FILES; fileNum++) { + Dataset df = + spark() + .range(NUM_ROWS) + .withColumn( + "nested", + struct( + expr("CAST(id AS string) AS col1"), + expr("CAST(id AS double) AS col2"), + lit(fileNum).cast("long").as("col3"))); + appendAsFile(df); + } + }); } } diff --git a/spark/v3.1/spark/src/jmh/java/org/apache/iceberg/spark/source/parquet/IcebergSourceFlatParquetDataFilterBenchmark.java b/spark/v3.1/spark/src/jmh/java/org/apache/iceberg/spark/source/parquet/IcebergSourceFlatParquetDataFilterBenchmark.java index 0406ea34ba62..c88536721d17 100644 --- a/spark/v3.1/spark/src/jmh/java/org/apache/iceberg/spark/source/parquet/IcebergSourceFlatParquetDataFilterBenchmark.java +++ b/spark/v3.1/spark/src/jmh/java/org/apache/iceberg/spark/source/parquet/IcebergSourceFlatParquetDataFilterBenchmark.java @@ -16,9 +16,13 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.source.parquet; +import static org.apache.iceberg.TableProperties.SPLIT_OPEN_FILE_COST; +import static org.apache.spark.sql.functions.current_date; +import static org.apache.spark.sql.functions.date_add; +import static org.apache.spark.sql.functions.expr; + import java.io.IOException; import java.util.Map; import org.apache.iceberg.relocated.com.google.common.collect.Maps; @@ -31,21 +35,15 @@ import org.openjdk.jmh.annotations.TearDown; import org.openjdk.jmh.annotations.Threads; -import static org.apache.iceberg.TableProperties.SPLIT_OPEN_FILE_COST; -import static org.apache.spark.sql.functions.current_date; -import static org.apache.spark.sql.functions.date_add; -import static org.apache.spark.sql.functions.expr; - /** * A benchmark that evaluates the file skipping capabilities in the Spark data source for Iceberg. * - * This class uses a dataset with a flat schema, where the records are clustered according to the + *

    This class uses a dataset with a flat schema, where the records are clustered according to the * column used in the filter predicate. * - * The performance is compared to the built-in file source in Spark. + *

    The performance is compared to the built-in file source in Spark. * - * To run this benchmark for spark-3.1: - * + *

    To run this benchmark for spark-3.1: * ./gradlew -DsparkVersions=3.1 :iceberg-spark:iceberg-spark-3.1_2.12:jmh * -PjmhIncludeRegex=IcebergSourceFlatParquetDataFilterBenchmark * -PjmhOutputPath=benchmark/iceberg-source-flat-parquet-data-filter-benchmark-result.txt @@ -74,11 +72,14 @@ public void tearDownBenchmark() throws IOException { public void readWithFilterIceberg() { Map tableProperties = Maps.newHashMap(); tableProperties.put(SPLIT_OPEN_FILE_COST, Integer.toString(128 * 1024 * 1024)); - withTableProperties(tableProperties, () -> { - String tableLocation = table().location(); - Dataset df = spark().read().format("iceberg").load(tableLocation).filter(FILTER_COND); - materialize(df); - }); + withTableProperties( + tableProperties, + () -> { + String tableLocation = table().location(); + Dataset df = + spark().read().format("iceberg").load(tableLocation).filter(FILTER_COND); + materialize(df); + }); } @Benchmark @@ -87,10 +88,12 @@ public void readWithFilterFileSourceVectorized() { Map conf = Maps.newHashMap(); conf.put(SQLConf.PARQUET_VECTORIZED_READER_ENABLED().key(), "true"); conf.put(SQLConf.FILES_OPEN_COST_IN_BYTES().key(), Integer.toString(128 * 1024 * 1024)); - withSQLConf(conf, () -> { - Dataset df = spark().read().parquet(dataLocation()).filter(FILTER_COND); - materialize(df); - }); + withSQLConf( + conf, + () -> { + Dataset df = spark().read().parquet(dataLocation()).filter(FILTER_COND); + materialize(df); + }); } @Benchmark @@ -99,23 +102,27 @@ public void readWithFilterFileSourceNonVectorized() { Map conf = Maps.newHashMap(); conf.put(SQLConf.PARQUET_VECTORIZED_READER_ENABLED().key(), "false"); conf.put(SQLConf.FILES_OPEN_COST_IN_BYTES().key(), Integer.toString(128 * 1024 * 1024)); - withSQLConf(conf, () -> { - Dataset df = spark().read().parquet(dataLocation()).filter(FILTER_COND); - materialize(df); - }); + withSQLConf( + conf, + () -> { + Dataset df = spark().read().parquet(dataLocation()).filter(FILTER_COND); + materialize(df); + }); } private void appendData() { for (int fileNum = 1; fileNum < NUM_FILES; fileNum++) { - Dataset df = spark().range(NUM_ROWS) - .withColumnRenamed("id", "longCol") - .withColumn("intCol", expr("CAST(longCol AS INT)")) - .withColumn("floatCol", expr("CAST(longCol AS FLOAT)")) - .withColumn("doubleCol", expr("CAST(longCol AS DOUBLE)")) - .withColumn("decimalCol", expr("CAST(longCol AS DECIMAL(20, 5))")) - .withColumn("dateCol", date_add(current_date(), fileNum)) - .withColumn("timestampCol", expr("TO_TIMESTAMP(dateCol)")) - .withColumn("stringCol", expr("CAST(dateCol AS STRING)")); + Dataset df = + spark() + .range(NUM_ROWS) + .withColumnRenamed("id", "longCol") + .withColumn("intCol", expr("CAST(longCol AS INT)")) + .withColumn("floatCol", expr("CAST(longCol AS FLOAT)")) + .withColumn("doubleCol", expr("CAST(longCol AS DOUBLE)")) + .withColumn("decimalCol", expr("CAST(longCol AS DECIMAL(20, 5))")) + .withColumn("dateCol", date_add(current_date(), fileNum)) + .withColumn("timestampCol", expr("TO_TIMESTAMP(dateCol)")) + .withColumn("stringCol", expr("CAST(dateCol AS STRING)")); appendAsFile(df); } } diff --git a/spark/v3.1/spark/src/jmh/java/org/apache/iceberg/spark/source/parquet/IcebergSourceFlatParquetDataReadBenchmark.java b/spark/v3.1/spark/src/jmh/java/org/apache/iceberg/spark/source/parquet/IcebergSourceFlatParquetDataReadBenchmark.java index 0e9c493e2f53..36c5f6952cd3 100644 --- a/spark/v3.1/spark/src/jmh/java/org/apache/iceberg/spark/source/parquet/IcebergSourceFlatParquetDataReadBenchmark.java +++ 
b/spark/v3.1/spark/src/jmh/java/org/apache/iceberg/spark/source/parquet/IcebergSourceFlatParquetDataReadBenchmark.java @@ -16,9 +16,13 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.source.parquet; +import static org.apache.iceberg.TableProperties.SPLIT_OPEN_FILE_COST; +import static org.apache.spark.sql.functions.current_date; +import static org.apache.spark.sql.functions.date_add; +import static org.apache.spark.sql.functions.expr; + import java.io.IOException; import java.util.Map; import org.apache.iceberg.relocated.com.google.common.collect.Maps; @@ -31,17 +35,11 @@ import org.openjdk.jmh.annotations.TearDown; import org.openjdk.jmh.annotations.Threads; -import static org.apache.iceberg.TableProperties.SPLIT_OPEN_FILE_COST; -import static org.apache.spark.sql.functions.current_date; -import static org.apache.spark.sql.functions.date_add; -import static org.apache.spark.sql.functions.expr; - /** - * A benchmark that evaluates the performance of reading Parquet data with a flat schema - * using Iceberg and the built-in file source in Spark. + * A benchmark that evaluates the performance of reading Parquet data with a flat schema using + * Iceberg and the built-in file source in Spark. * - * To run this benchmark for spark-3.1: - * + *

    To run this benchmark for spark-3.1: * ./gradlew -DsparkVersions=3.1 :iceberg-spark:iceberg-spark-3.1_2.12:jmh * -PjmhIncludeRegex=IcebergSourceFlatParquetDataReadBenchmark * -PjmhOutputPath=benchmark/iceberg-source-flat-parquet-data-read-benchmark-result.txt @@ -69,11 +67,13 @@ public void tearDownBenchmark() throws IOException { public void readIceberg() { Map tableProperties = Maps.newHashMap(); tableProperties.put(SPLIT_OPEN_FILE_COST, Integer.toString(128 * 1024 * 1024)); - withTableProperties(tableProperties, () -> { - String tableLocation = table().location(); - Dataset df = spark().read().format("iceberg").load(tableLocation); - materialize(df); - }); + withTableProperties( + tableProperties, + () -> { + String tableLocation = table().location(); + Dataset df = spark().read().format("iceberg").load(tableLocation); + materialize(df); + }); } @Benchmark @@ -82,10 +82,12 @@ public void readFileSourceVectorized() { Map conf = Maps.newHashMap(); conf.put(SQLConf.PARQUET_VECTORIZED_READER_ENABLED().key(), "true"); conf.put(SQLConf.FILES_OPEN_COST_IN_BYTES().key(), Integer.toString(128 * 1024 * 1024)); - withSQLConf(conf, () -> { - Dataset df = spark().read().parquet(dataLocation()); - materialize(df); - }); + withSQLConf( + conf, + () -> { + Dataset df = spark().read().parquet(dataLocation()); + materialize(df); + }); } @Benchmark @@ -94,10 +96,12 @@ public void readFileSourceNonVectorized() { Map conf = Maps.newHashMap(); conf.put(SQLConf.PARQUET_VECTORIZED_READER_ENABLED().key(), "false"); conf.put(SQLConf.FILES_OPEN_COST_IN_BYTES().key(), Integer.toString(128 * 1024 * 1024)); - withSQLConf(conf, () -> { - Dataset df = spark().read().parquet(dataLocation()); - materialize(df); - }); + withSQLConf( + conf, + () -> { + Dataset df = spark().read().parquet(dataLocation()); + materialize(df); + }); } @Benchmark @@ -105,11 +109,13 @@ public void readFileSourceNonVectorized() { public void readWithProjectionIceberg() { Map tableProperties = Maps.newHashMap(); tableProperties.put(SPLIT_OPEN_FILE_COST, Integer.toString(128 * 1024 * 1024)); - withTableProperties(tableProperties, () -> { - String tableLocation = table().location(); - Dataset df = spark().read().format("iceberg").load(tableLocation).select("longCol"); - materialize(df); - }); + withTableProperties( + tableProperties, + () -> { + String tableLocation = table().location(); + Dataset df = spark().read().format("iceberg").load(tableLocation).select("longCol"); + materialize(df); + }); } @Benchmark @@ -118,10 +124,12 @@ public void readWithProjectionFileSourceVectorized() { Map conf = Maps.newHashMap(); conf.put(SQLConf.PARQUET_VECTORIZED_READER_ENABLED().key(), "true"); conf.put(SQLConf.FILES_OPEN_COST_IN_BYTES().key(), Integer.toString(128 * 1024 * 1024)); - withSQLConf(conf, () -> { - Dataset df = spark().read().parquet(dataLocation()).select("longCol"); - materialize(df); - }); + withSQLConf( + conf, + () -> { + Dataset df = spark().read().parquet(dataLocation()).select("longCol"); + materialize(df); + }); } @Benchmark @@ -130,23 +138,27 @@ public void readWithProjectionFileSourceNonVectorized() { Map conf = Maps.newHashMap(); conf.put(SQLConf.PARQUET_VECTORIZED_READER_ENABLED().key(), "false"); conf.put(SQLConf.FILES_OPEN_COST_IN_BYTES().key(), Integer.toString(128 * 1024 * 1024)); - withSQLConf(conf, () -> { - Dataset df = spark().read().parquet(dataLocation()).select("longCol"); - materialize(df); - }); + withSQLConf( + conf, + () -> { + Dataset df = spark().read().parquet(dataLocation()).select("longCol"); + 
materialize(df); + }); } private void appendData() { for (int fileNum = 1; fileNum <= NUM_FILES; fileNum++) { - Dataset df = spark().range(NUM_ROWS) - .withColumnRenamed("id", "longCol") - .withColumn("intCol", expr("CAST(longCol AS INT)")) - .withColumn("floatCol", expr("CAST(longCol AS FLOAT)")) - .withColumn("doubleCol", expr("CAST(longCol AS DOUBLE)")) - .withColumn("decimalCol", expr("CAST(longCol AS DECIMAL(20, 5))")) - .withColumn("dateCol", date_add(current_date(), fileNum)) - .withColumn("timestampCol", expr("TO_TIMESTAMP(dateCol)")) - .withColumn("stringCol", expr("CAST(dateCol AS STRING)")); + Dataset df = + spark() + .range(NUM_ROWS) + .withColumnRenamed("id", "longCol") + .withColumn("intCol", expr("CAST(longCol AS INT)")) + .withColumn("floatCol", expr("CAST(longCol AS FLOAT)")) + .withColumn("doubleCol", expr("CAST(longCol AS DOUBLE)")) + .withColumn("decimalCol", expr("CAST(longCol AS DECIMAL(20, 5))")) + .withColumn("dateCol", date_add(current_date(), fileNum)) + .withColumn("timestampCol", expr("TO_TIMESTAMP(dateCol)")) + .withColumn("stringCol", expr("CAST(dateCol AS STRING)")); appendAsFile(df); } } diff --git a/spark/v3.1/spark/src/jmh/java/org/apache/iceberg/spark/source/parquet/IcebergSourceFlatParquetDataWriteBenchmark.java b/spark/v3.1/spark/src/jmh/java/org/apache/iceberg/spark/source/parquet/IcebergSourceFlatParquetDataWriteBenchmark.java index bbbd23208e1a..481be7a40bc6 100644 --- a/spark/v3.1/spark/src/jmh/java/org/apache/iceberg/spark/source/parquet/IcebergSourceFlatParquetDataWriteBenchmark.java +++ b/spark/v3.1/spark/src/jmh/java/org/apache/iceberg/spark/source/parquet/IcebergSourceFlatParquetDataWriteBenchmark.java @@ -16,9 +16,10 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.source.parquet; +import static org.apache.spark.sql.functions.expr; + import java.io.IOException; import java.util.Map; import org.apache.iceberg.relocated.com.google.common.collect.Maps; @@ -32,14 +33,11 @@ import org.openjdk.jmh.annotations.TearDown; import org.openjdk.jmh.annotations.Threads; -import static org.apache.spark.sql.functions.expr; - /** - * A benchmark that evaluates the performance of writing Parquet data with a flat schema - * using Iceberg and the built-in file source in Spark. + * A benchmark that evaluates the performance of writing Parquet data with a flat schema using + * Iceberg and the built-in file source in Spark. * - * To run this benchmark for spark-3.1: - * + *

    To run this benchmark for spark-3.1: * ./gradlew -DsparkVersions=3.1 :iceberg-spark:iceberg-spark-3.1_2.12:jmh * -PjmhIncludeRegex=IcebergSourceFlatParquetDataWriteBenchmark * -PjmhOutputPath=benchmark/iceberg-source-flat-parquet-data-write-benchmark-result.txt @@ -76,7 +74,8 @@ public void writeFileSource() { } private Dataset benchmarkData() { - return spark().range(NUM_ROWS) + return spark() + .range(NUM_ROWS) .withColumnRenamed("id", "longCol") .withColumn("intCol", expr("CAST(longCol AS INT)")) .withColumn("floatCol", expr("CAST(longCol AS FLOAT)")) diff --git a/spark/v3.1/spark/src/jmh/java/org/apache/iceberg/spark/source/parquet/IcebergSourceNestedListParquetDataWriteBenchmark.java b/spark/v3.1/spark/src/jmh/java/org/apache/iceberg/spark/source/parquet/IcebergSourceNestedListParquetDataWriteBenchmark.java index ccb73048fc3a..fe3faa0fcd8c 100644 --- a/spark/v3.1/spark/src/jmh/java/org/apache/iceberg/spark/source/parquet/IcebergSourceNestedListParquetDataWriteBenchmark.java +++ b/spark/v3.1/spark/src/jmh/java/org/apache/iceberg/spark/source/parquet/IcebergSourceNestedListParquetDataWriteBenchmark.java @@ -16,9 +16,12 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.source.parquet; +import static org.apache.spark.sql.functions.array_repeat; +import static org.apache.spark.sql.functions.expr; +import static org.apache.spark.sql.functions.struct; + import java.io.IOException; import java.util.Map; import org.apache.iceberg.relocated.com.google.common.collect.Maps; @@ -33,22 +36,18 @@ import org.openjdk.jmh.annotations.TearDown; import org.openjdk.jmh.annotations.Threads; -import static org.apache.spark.sql.functions.array_repeat; -import static org.apache.spark.sql.functions.expr; -import static org.apache.spark.sql.functions.struct; - /** - * A benchmark that evaluates the performance of writing nested Parquet data using Iceberg - * and the built-in file source in Spark. + * A benchmark that evaluates the performance of writing nested Parquet data using Iceberg and the + * built-in file source in Spark. * - * To run this benchmark for spark-3.1: - * + *

    To run this benchmark for spark-3.1: * ./gradlew -DsparkVersions=3.1 :iceberg-spark:iceberg-spark-3.1_2.12:jmh * -PjmhIncludeRegex=IcebergSourceNestedListParquetDataWriteBenchmark * -PjmhOutputPath=benchmark/iceberg-source-nested-list-parquet-data-write-benchmark-result.txt * */ -public class IcebergSourceNestedListParquetDataWriteBenchmark extends IcebergSourceNestedListDataBenchmark { +public class IcebergSourceNestedListParquetDataWriteBenchmark + extends IcebergSourceNestedListDataBenchmark { @Setup public void setupBenchmark() { @@ -80,10 +79,11 @@ public void writeFileSource() { } private Dataset benchmarkData() { - return spark().range(numRows) - .withColumn("outerlist", array_repeat(struct( - expr("array_repeat(CAST(id AS string), 1000) AS innerlist")), - 10)) + return spark() + .range(numRows) + .withColumn( + "outerlist", + array_repeat(struct(expr("array_repeat(CAST(id AS string), 1000) AS innerlist")), 10)) .coalesce(1); } } diff --git a/spark/v3.1/spark/src/jmh/java/org/apache/iceberg/spark/source/parquet/IcebergSourceNestedParquetDataFilterBenchmark.java b/spark/v3.1/spark/src/jmh/java/org/apache/iceberg/spark/source/parquet/IcebergSourceNestedParquetDataFilterBenchmark.java index d365f251d317..943f1b384189 100644 --- a/spark/v3.1/spark/src/jmh/java/org/apache/iceberg/spark/source/parquet/IcebergSourceNestedParquetDataFilterBenchmark.java +++ b/spark/v3.1/spark/src/jmh/java/org/apache/iceberg/spark/source/parquet/IcebergSourceNestedParquetDataFilterBenchmark.java @@ -16,9 +16,13 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.source.parquet; +import static org.apache.iceberg.TableProperties.SPLIT_OPEN_FILE_COST; +import static org.apache.spark.sql.functions.expr; +import static org.apache.spark.sql.functions.lit; +import static org.apache.spark.sql.functions.struct; + import java.io.IOException; import java.util.Map; import org.apache.iceberg.relocated.com.google.common.collect.Maps; @@ -31,27 +35,22 @@ import org.openjdk.jmh.annotations.TearDown; import org.openjdk.jmh.annotations.Threads; -import static org.apache.iceberg.TableProperties.SPLIT_OPEN_FILE_COST; -import static org.apache.spark.sql.functions.expr; -import static org.apache.spark.sql.functions.lit; -import static org.apache.spark.sql.functions.struct; - /** * A benchmark that evaluates the file skipping capabilities in the Spark data source for Iceberg. * - * This class uses a dataset with nested data, where the records are clustered according to the + *

    This class uses a dataset with nested data, where the records are clustered according to the * column used in the filter predicate. * - * The performance is compared to the built-in file source in Spark. + *

    The performance is compared to the built-in file source in Spark. * - * To run this benchmark for spark-3.1: - * + *

    To run this benchmark for spark-3.1: * ./gradlew -DsparkVersions=3.1 :iceberg-spark:iceberg-spark-3.1_2.12:jmh * -PjmhIncludeRegex=IcebergSourceNestedParquetDataFilterBenchmark * -PjmhOutputPath=benchmark/iceberg-source-nested-parquet-data-filter-benchmark-result.txt * */ -public class IcebergSourceNestedParquetDataFilterBenchmark extends IcebergSourceNestedDataBenchmark { +public class IcebergSourceNestedParquetDataFilterBenchmark + extends IcebergSourceNestedDataBenchmark { private static final String FILTER_COND = "nested.col3 == 0"; private static final int NUM_FILES = 500; @@ -74,11 +73,14 @@ public void tearDownBenchmark() throws IOException { public void readWithFilterIceberg() { Map tableProperties = Maps.newHashMap(); tableProperties.put(SPLIT_OPEN_FILE_COST, Integer.toString(128 * 1024 * 1024)); - withTableProperties(tableProperties, () -> { - String tableLocation = table().location(); - Dataset df = spark().read().format("iceberg").load(tableLocation).filter(FILTER_COND); - materialize(df); - }); + withTableProperties( + tableProperties, + () -> { + String tableLocation = table().location(); + Dataset df = + spark().read().format("iceberg").load(tableLocation).filter(FILTER_COND); + materialize(df); + }); } @Benchmark @@ -87,10 +89,12 @@ public void readWithFilterFileSourceVectorized() { Map conf = Maps.newHashMap(); conf.put(SQLConf.PARQUET_VECTORIZED_READER_ENABLED().key(), "true"); conf.put(SQLConf.FILES_OPEN_COST_IN_BYTES().key(), Integer.toString(128 * 1024 * 1024)); - withSQLConf(conf, () -> { - Dataset df = spark().read().parquet(dataLocation()).filter(FILTER_COND); - materialize(df); - }); + withSQLConf( + conf, + () -> { + Dataset df = spark().read().parquet(dataLocation()).filter(FILTER_COND); + materialize(df); + }); } @Benchmark @@ -99,22 +103,25 @@ public void readWithFilterFileSourceNonVectorized() { Map conf = Maps.newHashMap(); conf.put(SQLConf.PARQUET_VECTORIZED_READER_ENABLED().key(), "false"); conf.put(SQLConf.FILES_OPEN_COST_IN_BYTES().key(), Integer.toString(128 * 1024 * 1024)); - withSQLConf(conf, () -> { - Dataset df = spark().read().parquet(dataLocation()).filter(FILTER_COND); - materialize(df); - }); + withSQLConf( + conf, + () -> { + Dataset df = spark().read().parquet(dataLocation()).filter(FILTER_COND); + materialize(df); + }); } private void appendData() { for (int fileNum = 1; fileNum <= NUM_FILES; fileNum++) { - Dataset df = spark().range(NUM_ROWS) - .withColumn( - "nested", - struct( - expr("CAST(id AS string) AS col1"), - expr("CAST(id AS double) AS col2"), - lit(fileNum).cast("long").as("col3") - )); + Dataset df = + spark() + .range(NUM_ROWS) + .withColumn( + "nested", + struct( + expr("CAST(id AS string) AS col1"), + expr("CAST(id AS double) AS col2"), + lit(fileNum).cast("long").as("col3"))); appendAsFile(df); } } diff --git a/spark/v3.1/spark/src/jmh/java/org/apache/iceberg/spark/source/parquet/IcebergSourceNestedParquetDataReadBenchmark.java b/spark/v3.1/spark/src/jmh/java/org/apache/iceberg/spark/source/parquet/IcebergSourceNestedParquetDataReadBenchmark.java index 409d3301615c..7c36c31d514b 100644 --- a/spark/v3.1/spark/src/jmh/java/org/apache/iceberg/spark/source/parquet/IcebergSourceNestedParquetDataReadBenchmark.java +++ b/spark/v3.1/spark/src/jmh/java/org/apache/iceberg/spark/source/parquet/IcebergSourceNestedParquetDataReadBenchmark.java @@ -16,9 +16,13 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.spark.source.parquet; +import static org.apache.iceberg.TableProperties.SPLIT_OPEN_FILE_COST; +import static org.apache.spark.sql.functions.expr; +import static org.apache.spark.sql.functions.lit; +import static org.apache.spark.sql.functions.struct; + import java.io.IOException; import java.util.Map; import org.apache.iceberg.relocated.com.google.common.collect.Maps; @@ -31,17 +35,11 @@ import org.openjdk.jmh.annotations.TearDown; import org.openjdk.jmh.annotations.Threads; -import static org.apache.iceberg.TableProperties.SPLIT_OPEN_FILE_COST; -import static org.apache.spark.sql.functions.expr; -import static org.apache.spark.sql.functions.lit; -import static org.apache.spark.sql.functions.struct; - /** - * A benchmark that evaluates the performance of reading nested Parquet data using Iceberg - * and the built-in file source in Spark. + * A benchmark that evaluates the performance of reading nested Parquet data using Iceberg and the + * built-in file source in Spark. * - * To run this benchmark for spark-3.1: - * + *

    To run this benchmark for spark-3.1: * ./gradlew -DsparkVersions=3.1 :iceberg-spark:iceberg-spark-3.1_2.12:jmh * -PjmhIncludeRegex=IcebergSourceNestedParquetDataReadBenchmark * -PjmhOutputPath=benchmark/iceberg-source-nested-parquet-data-read-benchmark-result.txt @@ -69,11 +67,13 @@ public void tearDownBenchmark() throws IOException { public void readIceberg() { Map tableProperties = Maps.newHashMap(); tableProperties.put(SPLIT_OPEN_FILE_COST, Integer.toString(128 * 1024 * 1024)); - withTableProperties(tableProperties, () -> { - String tableLocation = table().location(); - Dataset df = spark().read().format("iceberg").load(tableLocation); - materialize(df); - }); + withTableProperties( + tableProperties, + () -> { + String tableLocation = table().location(); + Dataset df = spark().read().format("iceberg").load(tableLocation); + materialize(df); + }); } @Benchmark @@ -82,10 +82,12 @@ public void readFileSourceVectorized() { Map conf = Maps.newHashMap(); conf.put(SQLConf.PARQUET_VECTORIZED_READER_ENABLED().key(), "true"); conf.put(SQLConf.FILES_OPEN_COST_IN_BYTES().key(), Integer.toString(128 * 1024 * 1024)); - withSQLConf(conf, () -> { - Dataset df = spark().read().parquet(dataLocation()); - materialize(df); - }); + withSQLConf( + conf, + () -> { + Dataset df = spark().read().parquet(dataLocation()); + materialize(df); + }); } @Benchmark @@ -94,10 +96,12 @@ public void readFileSourceNonVectorized() { Map conf = Maps.newHashMap(); conf.put(SQLConf.PARQUET_VECTORIZED_READER_ENABLED().key(), "false"); conf.put(SQLConf.FILES_OPEN_COST_IN_BYTES().key(), Integer.toString(128 * 1024 * 1024)); - withSQLConf(conf, () -> { - Dataset df = spark().read().parquet(dataLocation()); - materialize(df); - }); + withSQLConf( + conf, + () -> { + Dataset df = spark().read().parquet(dataLocation()); + materialize(df); + }); } @Benchmark @@ -105,11 +109,14 @@ public void readFileSourceNonVectorized() { public void readWithProjectionIceberg() { Map tableProperties = Maps.newHashMap(); tableProperties.put(SPLIT_OPEN_FILE_COST, Integer.toString(128 * 1024 * 1024)); - withTableProperties(tableProperties, () -> { - String tableLocation = table().location(); - Dataset df = spark().read().format("iceberg").load(tableLocation).selectExpr("nested.col3"); - materialize(df); - }); + withTableProperties( + tableProperties, + () -> { + String tableLocation = table().location(); + Dataset df = + spark().read().format("iceberg").load(tableLocation).selectExpr("nested.col3"); + materialize(df); + }); } @Benchmark @@ -119,10 +126,12 @@ public void readWithProjectionFileSourceVectorized() { conf.put(SQLConf.PARQUET_VECTORIZED_READER_ENABLED().key(), "true"); conf.put(SQLConf.FILES_OPEN_COST_IN_BYTES().key(), Integer.toString(128 * 1024 * 1024)); conf.put(SQLConf.NESTED_SCHEMA_PRUNING_ENABLED().key(), "true"); - withSQLConf(conf, () -> { - Dataset df = spark().read().parquet(dataLocation()).selectExpr("nested.col3"); - materialize(df); - }); + withSQLConf( + conf, + () -> { + Dataset df = spark().read().parquet(dataLocation()).selectExpr("nested.col3"); + materialize(df); + }); } @Benchmark @@ -132,22 +141,25 @@ public void readWithProjectionFileSourceNonVectorized() { conf.put(SQLConf.PARQUET_VECTORIZED_READER_ENABLED().key(), "false"); conf.put(SQLConf.FILES_OPEN_COST_IN_BYTES().key(), Integer.toString(128 * 1024 * 1024)); conf.put(SQLConf.NESTED_SCHEMA_PRUNING_ENABLED().key(), "true"); - withSQLConf(conf, () -> { - Dataset df = spark().read().parquet(dataLocation()).selectExpr("nested.col3"); - materialize(df); - }); + 
withSQLConf( + conf, + () -> { + Dataset df = spark().read().parquet(dataLocation()).selectExpr("nested.col3"); + materialize(df); + }); } private void appendData() { for (int fileNum = 0; fileNum < NUM_FILES; fileNum++) { - Dataset df = spark().range(NUM_ROWS) - .withColumn( - "nested", - struct( - expr("CAST(id AS string) AS col1"), - expr("CAST(id AS double) AS col2"), - lit(fileNum).cast("long").as("col3") - )); + Dataset df = + spark() + .range(NUM_ROWS) + .withColumn( + "nested", + struct( + expr("CAST(id AS string) AS col1"), + expr("CAST(id AS double) AS col2"), + lit(fileNum).cast("long").as("col3"))); appendAsFile(df); } } diff --git a/spark/v3.1/spark/src/jmh/java/org/apache/iceberg/spark/source/parquet/IcebergSourceNestedParquetDataWriteBenchmark.java b/spark/v3.1/spark/src/jmh/java/org/apache/iceberg/spark/source/parquet/IcebergSourceNestedParquetDataWriteBenchmark.java index e5e184382e38..1f845db31e1c 100644 --- a/spark/v3.1/spark/src/jmh/java/org/apache/iceberg/spark/source/parquet/IcebergSourceNestedParquetDataWriteBenchmark.java +++ b/spark/v3.1/spark/src/jmh/java/org/apache/iceberg/spark/source/parquet/IcebergSourceNestedParquetDataWriteBenchmark.java @@ -16,9 +16,11 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.source.parquet; +import static org.apache.spark.sql.functions.expr; +import static org.apache.spark.sql.functions.struct; + import java.io.IOException; import java.util.Map; import org.apache.iceberg.relocated.com.google.common.collect.Maps; @@ -32,15 +34,11 @@ import org.openjdk.jmh.annotations.TearDown; import org.openjdk.jmh.annotations.Threads; -import static org.apache.spark.sql.functions.expr; -import static org.apache.spark.sql.functions.struct; - /** - * A benchmark that evaluates the performance of writing nested Parquet data using Iceberg - * and the built-in file source in Spark. + * A benchmark that evaluates the performance of writing nested Parquet data using Iceberg and the + * built-in file source in Spark. * - * To run this benchmark for spark-3.1: - * + *

    To run this benchmark for spark-3.1: * ./gradlew -DsparkVersions=3.1 :iceberg-spark:iceberg-spark-3.1_2.12:jmh * -PjmhIncludeRegex=IcebergSourceNestedParquetDataWriteBenchmark * -PjmhOutputPath=benchmark/iceberg-source-nested-parquet-data-write-benchmark-result.txt @@ -77,14 +75,14 @@ public void writeFileSource() { } private Dataset benchmarkData() { - return spark().range(NUM_ROWS) + return spark() + .range(NUM_ROWS) .withColumn( "nested", struct( expr("CAST(id AS string) AS col1"), expr("CAST(id AS double) AS col2"), - expr("id AS col3") - )) + expr("id AS col3"))) .coalesce(1); } } diff --git a/spark/v3.1/spark/src/jmh/java/org/apache/iceberg/spark/source/parquet/ParquetWritersBenchmark.java b/spark/v3.1/spark/src/jmh/java/org/apache/iceberg/spark/source/parquet/ParquetWritersBenchmark.java index 9730807b937d..b5091b62f4cb 100644 --- a/spark/v3.1/spark/src/jmh/java/org/apache/iceberg/spark/source/parquet/ParquetWritersBenchmark.java +++ b/spark/v3.1/spark/src/jmh/java/org/apache/iceberg/spark/source/parquet/ParquetWritersBenchmark.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.source.parquet; import org.apache.iceberg.FileFormat; @@ -25,8 +24,7 @@ /** * A benchmark that evaluates the performance of various Iceberg writers for Parquet data. * - * To run this benchmark for spark-3.1: - * + *

    To run this benchmark for spark-3.1: * ./gradlew -DsparkVersions=3.1 :iceberg-spark:iceberg-spark-3.1_2.12:jmh * -PjmhIncludeRegex=ParquetWritersBenchmark * -PjmhOutputPath=benchmark/parquet-writers-benchmark-result.txt diff --git a/spark/v3.1/spark/src/jmh/java/org/apache/iceberg/spark/source/parquet/vectorized/VectorizedReadDictionaryEncodedFlatParquetDataBenchmark.java b/spark/v3.1/spark/src/jmh/java/org/apache/iceberg/spark/source/parquet/vectorized/VectorizedReadDictionaryEncodedFlatParquetDataBenchmark.java index d565d62c1e07..b6387bbee2d7 100644 --- a/spark/v3.1/spark/src/jmh/java/org/apache/iceberg/spark/source/parquet/vectorized/VectorizedReadDictionaryEncodedFlatParquetDataBenchmark.java +++ b/spark/v3.1/spark/src/jmh/java/org/apache/iceberg/spark/source/parquet/vectorized/VectorizedReadDictionaryEncodedFlatParquetDataBenchmark.java @@ -16,9 +16,15 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.source.parquet.vectorized; +import static org.apache.spark.sql.functions.col; +import static org.apache.spark.sql.functions.date_add; +import static org.apache.spark.sql.functions.lit; +import static org.apache.spark.sql.functions.pmod; +import static org.apache.spark.sql.functions.to_date; +import static org.apache.spark.sql.functions.to_timestamp; + import java.math.BigDecimal; import java.math.BigInteger; import java.util.Map; @@ -32,32 +38,26 @@ import org.apache.spark.sql.types.DataTypes; import org.openjdk.jmh.annotations.Setup; -import static org.apache.spark.sql.functions.col; -import static org.apache.spark.sql.functions.date_add; -import static org.apache.spark.sql.functions.lit; -import static org.apache.spark.sql.functions.pmod; -import static org.apache.spark.sql.functions.to_date; -import static org.apache.spark.sql.functions.to_timestamp; - /** - * Benchmark to compare performance of reading Parquet dictionary encoded data with a flat schema using vectorized - * Iceberg read path and the built-in file source in Spark. - *

    - * To run this benchmark for spark-3.1: - * + * Benchmark to compare performance of reading Parquet dictionary encoded data with a flat schema + * using vectorized Iceberg read path and the built-in file source in Spark. + * + *

    To run this benchmark for spark-3.1: * ./gradlew -DsparkVersions=3.1 :iceberg-spark:iceberg-spark-3.1_2.12:jmh * -PjmhIncludeRegex=VectorizedReadDictionaryEncodedFlatParquetDataBenchmark * -PjmhOutputPath=benchmark/results.txt * */ -public class VectorizedReadDictionaryEncodedFlatParquetDataBenchmark extends VectorizedReadFlatParquetDataBenchmark { +public class VectorizedReadDictionaryEncodedFlatParquetDataBenchmark + extends VectorizedReadFlatParquetDataBenchmark { @Setup @Override public void setupBenchmark() { setupSpark(true); appendData(); - // Allow unsafe memory access to avoid the costly check arrow does to check if index is within bounds + // Allow unsafe memory access to avoid the costly check arrow does to check if index is within + // bounds System.setProperty("arrow.enable_unsafe_memory_access", "true"); // Disable expensive null check for every get(index) call. // Iceberg manages nullability checks itself instead of relying on arrow. @@ -83,9 +83,7 @@ void appendData() { df = withTimestampColumnDictEncoded(df); df = withStringColumnDictEncoded(df); df = df.drop("id"); - df.write().format("iceberg") - .mode(SaveMode.Append) - .save(table().location()); + df.write().format("iceberg").mode(SaveMode.Append).save(table().location()); } private static Column modColumn() { @@ -106,7 +104,6 @@ private static Dataset withIntColumnDictEncoded(Dataset df) { private static Dataset withFloatColumnDictEncoded(Dataset df) { return df.withColumn("floatCol", modColumn().cast(DataTypes.FloatType)); - } private static Dataset withDoubleColumnDictEncoded(Dataset df) { @@ -125,7 +122,8 @@ private static Dataset withDateColumnDictEncoded(Dataset df) { private static Dataset withTimestampColumnDictEncoded(Dataset df) { Column days = modColumn().cast(DataTypes.ShortType); - return df.withColumn("timestampCol", to_timestamp(date_add(to_date(lit("04/12/2019"), "MM/dd/yyyy"), days))); + return df.withColumn( + "timestampCol", to_timestamp(date_add(to_date(lit("04/12/2019"), "MM/dd/yyyy"), days))); } private static Dataset withStringColumnDictEncoded(Dataset df) { diff --git a/spark/v3.1/spark/src/jmh/java/org/apache/iceberg/spark/source/parquet/vectorized/VectorizedReadFlatParquetDataBenchmark.java b/spark/v3.1/spark/src/jmh/java/org/apache/iceberg/spark/source/parquet/vectorized/VectorizedReadFlatParquetDataBenchmark.java index b223758cbe31..d82ddbdc38e6 100644 --- a/spark/v3.1/spark/src/jmh/java/org/apache/iceberg/spark/source/parquet/vectorized/VectorizedReadFlatParquetDataBenchmark.java +++ b/spark/v3.1/spark/src/jmh/java/org/apache/iceberg/spark/source/parquet/vectorized/VectorizedReadFlatParquetDataBenchmark.java @@ -16,9 +16,17 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.spark.source.parquet.vectorized; +import static org.apache.iceberg.types.Types.NestedField.optional; +import static org.apache.spark.sql.functions.col; +import static org.apache.spark.sql.functions.current_date; +import static org.apache.spark.sql.functions.date_add; +import static org.apache.spark.sql.functions.expr; +import static org.apache.spark.sql.functions.lit; +import static org.apache.spark.sql.functions.pmod; +import static org.apache.spark.sql.functions.when; + import java.io.IOException; import java.util.Map; import org.apache.hadoop.conf.Configuration; @@ -38,21 +46,11 @@ import org.openjdk.jmh.annotations.TearDown; import org.openjdk.jmh.annotations.Threads; -import static org.apache.iceberg.types.Types.NestedField.optional; -import static org.apache.spark.sql.functions.col; -import static org.apache.spark.sql.functions.current_date; -import static org.apache.spark.sql.functions.date_add; -import static org.apache.spark.sql.functions.expr; -import static org.apache.spark.sql.functions.lit; -import static org.apache.spark.sql.functions.pmod; -import static org.apache.spark.sql.functions.when; - /** - * Benchmark to compare performance of reading Parquet data with a flat schema using vectorized Iceberg read path and - * the built-in file source in Spark. - *

    - * To run this benchmark for spark-3.1: - * + * Benchmark to compare performance of reading Parquet data with a flat schema using vectorized + * Iceberg read path and the built-in file source in Spark. + * + *

    To run this benchmark for spark-3.1: * ./gradlew -DsparkVersions=3.1 :iceberg-spark:iceberg-spark-3.1_2.12:jmh * -PjmhIncludeRegex=VectorizedReadFlatParquetDataBenchmark * -PjmhOutputPath=benchmark/results.txt @@ -67,7 +65,8 @@ public class VectorizedReadFlatParquetDataBenchmark extends IcebergSourceBenchma public void setupBenchmark() { setupSpark(); appendData(); - // Allow unsafe memory access to avoid the costly check arrow does to check if index is within bounds + // Allow unsafe memory access to avoid the costly check arrow does to check if index is within + // bounds System.setProperty("arrow.enable_unsafe_memory_access", "true"); // Disable expensive null check for every get(index) call. // Iceberg manages nullability checks itself instead of relying on arrow. @@ -87,15 +86,16 @@ protected Configuration initHadoopConf() { @Override protected Table initTable() { - Schema schema = new Schema( - optional(1, "longCol", Types.LongType.get()), - optional(2, "intCol", Types.IntegerType.get()), - optional(3, "floatCol", Types.FloatType.get()), - optional(4, "doubleCol", Types.DoubleType.get()), - optional(5, "decimalCol", Types.DecimalType.of(20, 5)), - optional(6, "dateCol", Types.DateType.get()), - optional(7, "timestampCol", Types.TimestampType.withZone()), - optional(8, "stringCol", Types.StringType.get())); + Schema schema = + new Schema( + optional(1, "longCol", Types.LongType.get()), + optional(2, "intCol", Types.IntegerType.get()), + optional(3, "floatCol", Types.FloatType.get()), + optional(4, "doubleCol", Types.DoubleType.get()), + optional(5, "decimalCol", Types.DecimalType.of(20, 5)), + optional(6, "dateCol", Types.DateType.get()), + optional(7, "timestampCol", Types.TimestampType.withZone()), + optional(8, "stringCol", Types.StringType.get())); PartitionSpec partitionSpec = PartitionSpec.unpartitioned(); HadoopTables tables = new HadoopTables(hadoopConf()); Map properties = parquetWriteProps(); @@ -111,19 +111,20 @@ Map parquetWriteProps() { void appendData() { for (int fileNum = 1; fileNum <= NUM_FILES; fileNum++) { - Dataset df = spark().range(NUM_ROWS_PER_FILE) - .withColumn( - "longCol", - when(pmod(col("id"), lit(10)).equalTo(lit(0)), lit(null)) - .otherwise(col("id"))) - .drop("id") - .withColumn("intCol", expr("CAST(longCol AS INT)")) - .withColumn("floatCol", expr("CAST(longCol AS FLOAT)")) - .withColumn("doubleCol", expr("CAST(longCol AS DOUBLE)")) - .withColumn("decimalCol", expr("CAST(longCol AS DECIMAL(20, 5))")) - .withColumn("dateCol", date_add(current_date(), fileNum)) - .withColumn("timestampCol", expr("TO_TIMESTAMP(dateCol)")) - .withColumn("stringCol", expr("CAST(longCol AS STRING)")); + Dataset df = + spark() + .range(NUM_ROWS_PER_FILE) + .withColumn( + "longCol", + when(pmod(col("id"), lit(10)).equalTo(lit(0)), lit(null)).otherwise(col("id"))) + .drop("id") + .withColumn("intCol", expr("CAST(longCol AS INT)")) + .withColumn("floatCol", expr("CAST(longCol AS FLOAT)")) + .withColumn("doubleCol", expr("CAST(longCol AS DOUBLE)")) + .withColumn("decimalCol", expr("CAST(longCol AS DECIMAL(20, 5))")) + .withColumn("dateCol", date_add(current_date(), fileNum)) + .withColumn("timestampCol", expr("TO_TIMESTAMP(dateCol)")) + .withColumn("stringCol", expr("CAST(longCol AS STRING)")); appendAsFile(df); } } @@ -131,161 +132,189 @@ void appendData() { @Benchmark @Threads(1) public void readIntegersIcebergVectorized5k() { - withTableProperties(tablePropsWithVectorizationEnabled(5000), () -> { - String tableLocation = table().location(); - Dataset df = 
spark().read().format("iceberg") - .load(tableLocation).select("intCol"); - materialize(df); - }); + withTableProperties( + tablePropsWithVectorizationEnabled(5000), + () -> { + String tableLocation = table().location(); + Dataset df = spark().read().format("iceberg").load(tableLocation).select("intCol"); + materialize(df); + }); } @Benchmark @Threads(1) public void readIntegersSparkVectorized5k() { - withSQLConf(sparkConfWithVectorizationEnabled(5000), () -> { - Dataset df = spark().read().parquet(dataLocation()).select("intCol"); - materialize(df); - }); + withSQLConf( + sparkConfWithVectorizationEnabled(5000), + () -> { + Dataset df = spark().read().parquet(dataLocation()).select("intCol"); + materialize(df); + }); } @Benchmark @Threads(1) public void readLongsIcebergVectorized5k() { - withTableProperties(tablePropsWithVectorizationEnabled(5000), () -> { - String tableLocation = table().location(); - Dataset df = spark().read().format("iceberg") - .load(tableLocation).select("longCol"); - materialize(df); - }); + withTableProperties( + tablePropsWithVectorizationEnabled(5000), + () -> { + String tableLocation = table().location(); + Dataset df = spark().read().format("iceberg").load(tableLocation).select("longCol"); + materialize(df); + }); } @Benchmark @Threads(1) public void readLongsSparkVectorized5k() { - withSQLConf(sparkConfWithVectorizationEnabled(5000), () -> { - Dataset df = spark().read().parquet(dataLocation()).select("longCol"); - materialize(df); - }); + withSQLConf( + sparkConfWithVectorizationEnabled(5000), + () -> { + Dataset df = spark().read().parquet(dataLocation()).select("longCol"); + materialize(df); + }); } @Benchmark @Threads(1) public void readFloatsIcebergVectorized5k() { - withTableProperties(tablePropsWithVectorizationEnabled(5000), () -> { - String tableLocation = table().location(); - Dataset df = spark().read().format("iceberg") - .load(tableLocation).select("floatCol"); - materialize(df); - }); + withTableProperties( + tablePropsWithVectorizationEnabled(5000), + () -> { + String tableLocation = table().location(); + Dataset df = spark().read().format("iceberg").load(tableLocation).select("floatCol"); + materialize(df); + }); } @Benchmark @Threads(1) public void readFloatsSparkVectorized5k() { - withSQLConf(sparkConfWithVectorizationEnabled(5000), () -> { - Dataset df = spark().read().parquet(dataLocation()).select("floatCol"); - materialize(df); - }); + withSQLConf( + sparkConfWithVectorizationEnabled(5000), + () -> { + Dataset df = spark().read().parquet(dataLocation()).select("floatCol"); + materialize(df); + }); } @Benchmark @Threads(1) public void readDoublesIcebergVectorized5k() { - withTableProperties(tablePropsWithVectorizationEnabled(5000), () -> { - String tableLocation = table().location(); - Dataset df = spark().read().format("iceberg") - .load(tableLocation).select("doubleCol"); - materialize(df); - }); + withTableProperties( + tablePropsWithVectorizationEnabled(5000), + () -> { + String tableLocation = table().location(); + Dataset df = + spark().read().format("iceberg").load(tableLocation).select("doubleCol"); + materialize(df); + }); } @Benchmark @Threads(1) public void readDoublesSparkVectorized5k() { - withSQLConf(sparkConfWithVectorizationEnabled(5000), () -> { - Dataset df = spark().read().parquet(dataLocation()).select("doubleCol"); - materialize(df); - }); + withSQLConf( + sparkConfWithVectorizationEnabled(5000), + () -> { + Dataset df = spark().read().parquet(dataLocation()).select("doubleCol"); + materialize(df); + }); } @Benchmark 
@Threads(1) public void readDecimalsIcebergVectorized5k() { - withTableProperties(tablePropsWithVectorizationEnabled(5000), () -> { - String tableLocation = table().location(); - Dataset df = spark().read().format("iceberg") - .load(tableLocation).select("decimalCol"); - materialize(df); - }); + withTableProperties( + tablePropsWithVectorizationEnabled(5000), + () -> { + String tableLocation = table().location(); + Dataset df = + spark().read().format("iceberg").load(tableLocation).select("decimalCol"); + materialize(df); + }); } @Benchmark @Threads(1) public void readDecimalsSparkVectorized5k() { - withSQLConf(sparkConfWithVectorizationEnabled(5000), () -> { - Dataset df = spark().read().parquet(dataLocation()).select("decimalCol"); - materialize(df); - }); + withSQLConf( + sparkConfWithVectorizationEnabled(5000), + () -> { + Dataset df = spark().read().parquet(dataLocation()).select("decimalCol"); + materialize(df); + }); } @Benchmark @Threads(1) public void readDatesIcebergVectorized5k() { - withTableProperties(tablePropsWithVectorizationEnabled(5000), () -> { - String tableLocation = table().location(); - Dataset df = spark().read().format("iceberg") - .load(tableLocation).select("dateCol"); - materialize(df); - }); + withTableProperties( + tablePropsWithVectorizationEnabled(5000), + () -> { + String tableLocation = table().location(); + Dataset df = spark().read().format("iceberg").load(tableLocation).select("dateCol"); + materialize(df); + }); } @Benchmark @Threads(1) public void readDatesSparkVectorized5k() { - withSQLConf(sparkConfWithVectorizationEnabled(5000), () -> { - Dataset df = spark().read().parquet(dataLocation()).select("dateCol"); - materialize(df); - }); + withSQLConf( + sparkConfWithVectorizationEnabled(5000), + () -> { + Dataset df = spark().read().parquet(dataLocation()).select("dateCol"); + materialize(df); + }); } @Benchmark @Threads(1) public void readTimestampsIcebergVectorized5k() { - withTableProperties(tablePropsWithVectorizationEnabled(5000), () -> { - String tableLocation = table().location(); - Dataset df = spark().read().format("iceberg") - .load(tableLocation).select("timestampCol"); - materialize(df); - }); + withTableProperties( + tablePropsWithVectorizationEnabled(5000), + () -> { + String tableLocation = table().location(); + Dataset df = + spark().read().format("iceberg").load(tableLocation).select("timestampCol"); + materialize(df); + }); } @Benchmark @Threads(1) public void readTimestampsSparkVectorized5k() { - withSQLConf(sparkConfWithVectorizationEnabled(5000), () -> { - Dataset df = spark().read().parquet(dataLocation()).select("timestampCol"); - materialize(df); - }); + withSQLConf( + sparkConfWithVectorizationEnabled(5000), + () -> { + Dataset df = spark().read().parquet(dataLocation()).select("timestampCol"); + materialize(df); + }); } @Benchmark @Threads(1) public void readStringsIcebergVectorized5k() { - withTableProperties(tablePropsWithVectorizationEnabled(5000), () -> { - String tableLocation = table().location(); - Dataset df = spark().read().format("iceberg") - .load(tableLocation).select("stringCol"); - materialize(df); - }); + withTableProperties( + tablePropsWithVectorizationEnabled(5000), + () -> { + String tableLocation = table().location(); + Dataset df = + spark().read().format("iceberg").load(tableLocation).select("stringCol"); + materialize(df); + }); } @Benchmark @Threads(1) public void readStringsSparkVectorized5k() { - withSQLConf(sparkConfWithVectorizationEnabled(5000), () -> { - Dataset df = 
spark().read().parquet(dataLocation()).select("stringCol"); - materialize(df); - }); + withSQLConf( + sparkConfWithVectorizationEnabled(5000), + () -> { + Dataset df = spark().read().parquet(dataLocation()).select("stringCol"); + materialize(df); + }); } private static Map tablePropsWithVectorizationEnabled(int batchSize) { diff --git a/spark/v3.1/spark/src/main/java/org/apache/iceberg/spark/BaseCatalog.java b/spark/v3.1/spark/src/main/java/org/apache/iceberg/spark/BaseCatalog.java index 2942a52a135e..d04f82d339f7 100644 --- a/spark/v3.1/spark/src/main/java/org/apache/iceberg/spark/BaseCatalog.java +++ b/spark/v3.1/spark/src/main/java/org/apache/iceberg/spark/BaseCatalog.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark; import org.apache.iceberg.spark.procedures.SparkProcedures; @@ -35,7 +34,8 @@ public Procedure loadProcedure(Identifier ident) throws NoSuchProcedureException String[] namespace = ident.namespace(); String name = ident.name(); - // namespace resolution is case insensitive until we have a way to configure case sensitivity in catalogs + // namespace resolution is case insensitive until we have a way to configure case sensitivity in + // catalogs if (namespace.length == 1 && namespace[0].equalsIgnoreCase("system")) { ProcedureBuilder builder = SparkProcedures.newBuilder(name); if (builder != null) { diff --git a/spark/v3.1/spark/src/main/java/org/apache/iceberg/spark/CommitMetadata.java b/spark/v3.1/spark/src/main/java/org/apache/iceberg/spark/CommitMetadata.java index 58137250003a..641b957d1176 100644 --- a/spark/v3.1/spark/src/main/java/org/apache/iceberg/spark/CommitMetadata.java +++ b/spark/v3.1/spark/src/main/java/org/apache/iceberg/spark/CommitMetadata.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.spark; import java.util.Map; @@ -24,20 +23,18 @@ import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap; import org.apache.iceberg.util.ExceptionUtil; -/** - * utility class to accept thread local commit properties - */ +/** utility class to accept thread local commit properties */ public class CommitMetadata { - private CommitMetadata() { - - } + private CommitMetadata() {} - private static final ThreadLocal> COMMIT_PROPERTIES = ThreadLocal.withInitial(ImmutableMap::of); + private static final ThreadLocal> COMMIT_PROPERTIES = + ThreadLocal.withInitial(ImmutableMap::of); /** - * running the code wrapped as a caller, and any snapshot committed within the callable object will be attached with - * the metadata defined in properties + * running the code wrapped as a caller, and any snapshot committed within the callable object + * will be attached with the metadata defined in properties + * * @param properties extra commit metadata to attach to the snapshot committed within callable * @param callable the code to be executed * @param exClass the expected type of exception which would be thrown from callable diff --git a/spark/v3.1/spark/src/main/java/org/apache/iceberg/spark/FileRewriteCoordinator.java b/spark/v3.1/spark/src/main/java/org/apache/iceberg/spark/FileRewriteCoordinator.java index acd5f64d7ed6..210e861a4c16 100644 --- a/spark/v3.1/spark/src/main/java/org/apache/iceberg/spark/FileRewriteCoordinator.java +++ b/spark/v3.1/spark/src/main/java/org/apache/iceberg/spark/FileRewriteCoordinator.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark; import java.util.Map; @@ -39,22 +38,26 @@ public class FileRewriteCoordinator { private final Map, Set> resultMap = Maps.newConcurrentMap(); - private FileRewriteCoordinator() { - } + private FileRewriteCoordinator() {} public static FileRewriteCoordinator get() { return INSTANCE; } /** - * Called to persist the output of a rewrite action for a specific group. Since the write is done via a - * Spark Datasource, we have to propagate the result through this side-effect call. + * Called to persist the output of a rewrite action for a specific group. Since the write is done + * via a Spark Datasource, we have to propagate the result through this side-effect call. 
+ * * @param table table where the rewrite is occurring * @param fileSetID the id used to identify the source set of files being rewritten * @param newDataFiles the new files which have been written */ public void stageRewrite(Table table, String fileSetID, Set newDataFiles) { - LOG.debug("Staging the output for {} - fileset {} with {} files", table.name(), fileSetID, newDataFiles.size()); + LOG.debug( + "Staging the output for {} - fileset {} with {} files", + table.name(), + fileSetID, + newDataFiles.size()); Pair id = toID(table, fileSetID); resultMap.put(id, newDataFiles); } @@ -62,9 +65,8 @@ public void stageRewrite(Table table, String fileSetID, Set newDataFil public Set fetchNewDataFiles(Table table, String fileSetID) { Pair id = toID(table, fileSetID); Set result = resultMap.get(id); - ValidationException.check(result != null, - "No results for rewrite of file set %s in table %s", - fileSetID, table); + ValidationException.check( + result != null, "No results for rewrite of file set %s in table %s", fileSetID, table); return result; } diff --git a/spark/v3.1/spark/src/main/java/org/apache/iceberg/spark/FileScanTaskSetManager.java b/spark/v3.1/spark/src/main/java/org/apache/iceberg/spark/FileScanTaskSetManager.java index 827b674ca16d..4b6da39905c1 100644 --- a/spark/v3.1/spark/src/main/java/org/apache/iceberg/spark/FileScanTaskSetManager.java +++ b/spark/v3.1/spark/src/main/java/org/apache/iceberg/spark/FileScanTaskSetManager.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark; import java.util.List; @@ -37,15 +36,15 @@ public class FileScanTaskSetManager { private final Map, List> tasksMap = Maps.newConcurrentMap(); - private FileScanTaskSetManager() { - } + private FileScanTaskSetManager() {} public static FileScanTaskSetManager get() { return INSTANCE; } public void stageTasks(Table table, String setID, List tasks) { - Preconditions.checkArgument(tasks != null && tasks.size() > 0, "Cannot stage null or empty tasks"); + Preconditions.checkArgument( + tasks != null && tasks.size() > 0, "Cannot stage null or empty tasks"); Pair id = toID(table, setID); tasksMap.put(id, tasks); } diff --git a/spark/v3.1/spark/src/main/java/org/apache/iceberg/spark/IcebergSpark.java b/spark/v3.1/spark/src/main/java/org/apache/iceberg/spark/IcebergSpark.java index 862626d0cd6d..87de0a98b934 100644 --- a/spark/v3.1/spark/src/main/java/org/apache/iceberg/spark/IcebergSpark.java +++ b/spark/v3.1/spark/src/main/java/org/apache/iceberg/spark/IcebergSpark.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.spark; import org.apache.iceberg.transforms.Transform; @@ -27,14 +26,18 @@ import org.apache.spark.sql.types.DataTypes; public class IcebergSpark { - private IcebergSpark() { - } + private IcebergSpark() {} - public static void registerBucketUDF(SparkSession session, String funcName, DataType sourceType, int numBuckets) { + public static void registerBucketUDF( + SparkSession session, String funcName, DataType sourceType, int numBuckets) { SparkTypeToType typeConverter = new SparkTypeToType(); Type sourceIcebergType = typeConverter.atomic(sourceType); Transform bucket = Transforms.bucket(sourceIcebergType, numBuckets); - session.udf().register(funcName, - value -> bucket.apply(SparkValueConverter.convert(sourceIcebergType, value)), DataTypes.IntegerType); + session + .udf() + .register( + funcName, + value -> bucket.apply(SparkValueConverter.convert(sourceIcebergType, value)), + DataTypes.IntegerType); } } diff --git a/spark/v3.1/spark/src/main/java/org/apache/iceberg/spark/JobGroupInfo.java b/spark/v3.1/spark/src/main/java/org/apache/iceberg/spark/JobGroupInfo.java index a35808fd8ce6..c0756d924e2f 100644 --- a/spark/v3.1/spark/src/main/java/org/apache/iceberg/spark/JobGroupInfo.java +++ b/spark/v3.1/spark/src/main/java/org/apache/iceberg/spark/JobGroupInfo.java @@ -16,13 +16,9 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark; -/** - * Captures information about the current job - * which is used for displaying on the UI - */ +/** Captures information about the current job which is used for displaying on the UI */ public class JobGroupInfo { private String groupId; private String description; diff --git a/spark/v3.1/spark/src/main/java/org/apache/iceberg/spark/JobGroupUtils.java b/spark/v3.1/spark/src/main/java/org/apache/iceberg/spark/JobGroupUtils.java index 155dce707701..dc8ba69d40a8 100644 --- a/spark/v3.1/spark/src/main/java/org/apache/iceberg/spark/JobGroupUtils.java +++ b/spark/v3.1/spark/src/main/java/org/apache/iceberg/spark/JobGroupUtils.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.spark; import org.apache.spark.SparkContext; @@ -26,10 +25,10 @@ public class JobGroupUtils { private static final String JOB_GROUP_ID = SparkContext$.MODULE$.SPARK_JOB_GROUP_ID(); private static final String JOB_GROUP_DESC = SparkContext$.MODULE$.SPARK_JOB_DESCRIPTION(); - private static final String JOB_INTERRUPT_ON_CANCEL = SparkContext$.MODULE$.SPARK_JOB_INTERRUPT_ON_CANCEL(); + private static final String JOB_INTERRUPT_ON_CANCEL = + SparkContext$.MODULE$.SPARK_JOB_INTERRUPT_ON_CANCEL(); - private JobGroupUtils() { - } + private JobGroupUtils() {} public static JobGroupInfo getJobGroupInfo(SparkContext sparkContext) { String groupId = sparkContext.getLocalProperty(JOB_GROUP_ID); @@ -41,6 +40,7 @@ public static JobGroupInfo getJobGroupInfo(SparkContext sparkContext) { public static void setJobGroupInfo(SparkContext sparkContext, JobGroupInfo info) { sparkContext.setLocalProperty(JOB_GROUP_ID, info.groupId()); sparkContext.setLocalProperty(JOB_GROUP_DESC, info.description()); - sparkContext.setLocalProperty(JOB_INTERRUPT_ON_CANCEL, String.valueOf(info.interruptOnCancel())); + sparkContext.setLocalProperty( + JOB_INTERRUPT_ON_CANCEL, String.valueOf(info.interruptOnCancel())); } } diff --git a/spark/v3.1/spark/src/main/java/org/apache/iceberg/spark/OrderField.java b/spark/v3.1/spark/src/main/java/org/apache/iceberg/spark/OrderField.java index 0f24ee38fceb..387f0f348c36 100644 --- a/spark/v3.1/spark/src/main/java/org/apache/iceberg/spark/OrderField.java +++ b/spark/v3.1/spark/src/main/java/org/apache/iceberg/spark/OrderField.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark; import org.apache.iceberg.NullOrder; @@ -27,40 +26,55 @@ import org.apache.spark.sql.connector.iceberg.expressions.SortOrder; class OrderField implements SortOrder { - static OrderField column(String fieldName, org.apache.iceberg.SortDirection direction, NullOrder nullOrder) { + static OrderField column( + String fieldName, org.apache.iceberg.SortDirection direction, NullOrder nullOrder) { return new OrderField(Expressions.column(fieldName), toSpark(direction), toSpark(nullOrder)); } - static OrderField bucket(String fieldName, int numBuckets, org.apache.iceberg.SortDirection direction, - NullOrder nullOrder) { - return new OrderField(Expressions.bucket(numBuckets, fieldName), toSpark(direction), toSpark(nullOrder)); + static OrderField bucket( + String fieldName, + int numBuckets, + org.apache.iceberg.SortDirection direction, + NullOrder nullOrder) { + return new OrderField( + Expressions.bucket(numBuckets, fieldName), toSpark(direction), toSpark(nullOrder)); } - static OrderField truncate(String fieldName, int width, org.apache.iceberg.SortDirection direction, - NullOrder nullOrder) { - return new OrderField(Expressions.apply( - "truncate", Expressions.column(fieldName), Expressions.literal(width)), - toSpark(direction), toSpark(nullOrder)); + static OrderField truncate( + String fieldName, + int width, + org.apache.iceberg.SortDirection direction, + NullOrder nullOrder) { + return new OrderField( + Expressions.apply("truncate", Expressions.column(fieldName), Expressions.literal(width)), + toSpark(direction), + toSpark(nullOrder)); } - static OrderField year(String fieldName, org.apache.iceberg.SortDirection direction, NullOrder nullOrder) { + static OrderField year( + String fieldName, org.apache.iceberg.SortDirection direction, NullOrder nullOrder) { return new 
OrderField(Expressions.years(fieldName), toSpark(direction), toSpark(nullOrder)); } - static OrderField month(String fieldName, org.apache.iceberg.SortDirection direction, NullOrder nullOrder) { + static OrderField month( + String fieldName, org.apache.iceberg.SortDirection direction, NullOrder nullOrder) { return new OrderField(Expressions.months(fieldName), toSpark(direction), toSpark(nullOrder)); } - static OrderField day(String fieldName, org.apache.iceberg.SortDirection direction, NullOrder nullOrder) { + static OrderField day( + String fieldName, org.apache.iceberg.SortDirection direction, NullOrder nullOrder) { return new OrderField(Expressions.days(fieldName), toSpark(direction), toSpark(nullOrder)); } - static OrderField hour(String fieldName, org.apache.iceberg.SortDirection direction, NullOrder nullOrder) { + static OrderField hour( + String fieldName, org.apache.iceberg.SortDirection direction, NullOrder nullOrder) { return new OrderField(Expressions.hours(fieldName), toSpark(direction), toSpark(nullOrder)); } private static SortDirection toSpark(org.apache.iceberg.SortDirection direction) { - return direction == org.apache.iceberg.SortDirection.ASC ? SortDirection.ASCENDING : SortDirection.DESCENDING; + return direction == org.apache.iceberg.SortDirection.ASC + ? SortDirection.ASCENDING + : SortDirection.DESCENDING; } private static NullOrdering toSpark(NullOrder nullOrder) { @@ -94,9 +108,10 @@ public NullOrdering nullOrdering() { @Override public String describe() { - return String.format("%s %s %s", - expr.describe(), - direction == SortDirection.ASCENDING ? "ASC" : "DESC", - nullOrder == NullOrdering.NULLS_FIRST ? "NULLS FIRST" : "NULLS LAST"); + return String.format( + "%s %s %s", + expr.describe(), + direction == SortDirection.ASCENDING ? "ASC" : "DESC", + nullOrder == NullOrdering.NULLS_FIRST ? "NULLS FIRST" : "NULLS LAST"); } } diff --git a/spark/v3.1/spark/src/main/java/org/apache/iceberg/spark/PathIdentifier.java b/spark/v3.1/spark/src/main/java/org/apache/iceberg/spark/PathIdentifier.java index 235097ea46cc..110af6b87de5 100644 --- a/spark/v3.1/spark/src/main/java/org/apache/iceberg/spark/PathIdentifier.java +++ b/spark/v3.1/spark/src/main/java/org/apache/iceberg/spark/PathIdentifier.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark; import java.util.List; @@ -36,9 +35,10 @@ public PathIdentifier(String location) { this.location = location; List pathParts = SPLIT.splitToList(location); name = Iterables.getLast(pathParts); - namespace = pathParts.size() > 1 ? - new String[]{JOIN.join(pathParts.subList(0, pathParts.size() - 1))} : - new String[0]; + namespace = + pathParts.size() > 1 + ? new String[] {JOIN.join(pathParts.subList(0, pathParts.size() - 1))} + : new String[0]; } @Override diff --git a/spark/v3.1/spark/src/main/java/org/apache/iceberg/spark/PruneColumnsWithReordering.java b/spark/v3.1/spark/src/main/java/org/apache/iceberg/spark/PruneColumnsWithReordering.java index 3bdf984ed219..3c111d3b44cb 100644 --- a/spark/v3.1/spark/src/main/java/org/apache/iceberg/spark/PruneColumnsWithReordering.java +++ b/spark/v3.1/spark/src/main/java/org/apache/iceberg/spark/PruneColumnsWithReordering.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.spark; import java.util.List; @@ -70,7 +69,8 @@ public Type schema(Schema schema, Supplier structResult) { @Override public Type struct(Types.StructType struct, Iterable fieldResults) { - Preconditions.checkNotNull(struct, "Cannot prune null struct. Pruning must start with a schema."); + Preconditions.checkNotNull( + struct, "Cannot prune null struct. Pruning must start with a schema."); Preconditions.checkArgument(current instanceof StructType, "Not a struct: %s", current); StructType requestedStruct = (StructType) current; @@ -92,13 +92,13 @@ public Type struct(Types.StructType struct, Iterable fieldResults) { } else if (field.isOptional()) { changed = true; - projectedFields.put(field.name(), - Types.NestedField.optional(field.fieldId(), field.name(), type)); + projectedFields.put( + field.name(), Types.NestedField.optional(field.fieldId(), field.name(), type)); } else { changed = true; - projectedFields.put(field.name(), - Types.NestedField.required(field.fieldId(), field.name(), type)); + projectedFields.put( + field.name(), Types.NestedField.required(field.fieldId(), field.name(), type)); } } @@ -145,8 +145,10 @@ public Type field(Types.NestedField field, Supplier fieldResult) { int fieldIndex = requestedStruct.fieldIndex(field.name()); StructField requestedField = requestedStruct.fields()[fieldIndex]; - Preconditions.checkArgument(requestedField.nullable() || field.isRequired(), - "Cannot project an optional field as non-null: %s", field.name()); + Preconditions.checkArgument( + requestedField.nullable() || field.isRequired(), + "Cannot project an optional field as non-null: %s", + field.name()); this.current = requestedField.dataType(); try { @@ -164,8 +166,10 @@ public Type list(Types.ListType list, Supplier elementResult) { Preconditions.checkArgument(current instanceof ArrayType, "Not an array: %s", current); ArrayType requestedArray = (ArrayType) current; - Preconditions.checkArgument(requestedArray.containsNull() || !list.isElementOptional(), - "Cannot project an array of optional elements as required elements: %s", requestedArray); + Preconditions.checkArgument( + requestedArray.containsNull() || !list.isElementOptional(), + "Cannot project an array of optional elements as required elements: %s", + requestedArray); this.current = requestedArray.elementType(); try { @@ -190,10 +194,14 @@ public Type map(Types.MapType map, Supplier keyResult, Supplier valu Preconditions.checkArgument(current instanceof MapType, "Not a map: %s", current); MapType requestedMap = (MapType) current; - Preconditions.checkArgument(requestedMap.valueContainsNull() || !map.isValueOptional(), - "Cannot project a map of optional values as required values: %s", map); - Preconditions.checkArgument(StringType.class.isInstance(requestedMap.keyType()), - "Invalid map key type (not string): %s", requestedMap.keyType()); + Preconditions.checkArgument( + requestedMap.valueContainsNull() || !map.isValueOptional(), + "Cannot project a map of optional values as required values: %s", + map); + Preconditions.checkArgument( + StringType.class.isInstance(requestedMap.keyType()), + "Invalid map key type (not string): %s", + requestedMap.keyType()); this.current = requestedMap.valueType(); try { @@ -215,23 +223,32 @@ public Type map(Types.MapType map, Supplier keyResult, Supplier valu @Override public Type primitive(Type.PrimitiveType primitive) { Class expectedType = TYPES.get(primitive.typeId()); - Preconditions.checkArgument(expectedType != null && expectedType.isInstance(current), - 
"Cannot project %s to incompatible type: %s", primitive, current); + Preconditions.checkArgument( + expectedType != null && expectedType.isInstance(current), + "Cannot project %s to incompatible type: %s", + primitive, + current); // additional checks based on type switch (primitive.typeId()) { case DECIMAL: Types.DecimalType decimal = (Types.DecimalType) primitive; DecimalType requestedDecimal = (DecimalType) current; - Preconditions.checkArgument(requestedDecimal.scale() == decimal.scale(), - "Cannot project decimal with incompatible scale: %s != %s", requestedDecimal.scale(), decimal.scale()); - Preconditions.checkArgument(requestedDecimal.precision() >= decimal.precision(), + Preconditions.checkArgument( + requestedDecimal.scale() == decimal.scale(), + "Cannot project decimal with incompatible scale: %s != %s", + requestedDecimal.scale(), + decimal.scale()); + Preconditions.checkArgument( + requestedDecimal.precision() >= decimal.precision(), "Cannot project decimal with incompatible precision: %s < %s", - requestedDecimal.precision(), decimal.precision()); + requestedDecimal.precision(), + decimal.precision()); break; case TIMESTAMP: Types.TimestampType timestamp = (Types.TimestampType) primitive; - Preconditions.checkArgument(timestamp.shouldAdjustToUTC(), + Preconditions.checkArgument( + timestamp.shouldAdjustToUTC(), "Cannot project timestamp (without time zone) as timestamptz (with time zone)"); break; default: @@ -240,19 +257,19 @@ public Type primitive(Type.PrimitiveType primitive) { return primitive; } - private static final ImmutableMap> TYPES = ImmutableMap - .>builder() - .put(TypeID.BOOLEAN, BooleanType.class) - .put(TypeID.INTEGER, IntegerType.class) - .put(TypeID.LONG, LongType.class) - .put(TypeID.FLOAT, FloatType.class) - .put(TypeID.DOUBLE, DoubleType.class) - .put(TypeID.DATE, DateType.class) - .put(TypeID.TIMESTAMP, TimestampType.class) - .put(TypeID.DECIMAL, DecimalType.class) - .put(TypeID.UUID, StringType.class) - .put(TypeID.STRING, StringType.class) - .put(TypeID.FIXED, BinaryType.class) - .put(TypeID.BINARY, BinaryType.class) - .build(); + private static final ImmutableMap> TYPES = + ImmutableMap.>builder() + .put(TypeID.BOOLEAN, BooleanType.class) + .put(TypeID.INTEGER, IntegerType.class) + .put(TypeID.LONG, LongType.class) + .put(TypeID.FLOAT, FloatType.class) + .put(TypeID.DOUBLE, DoubleType.class) + .put(TypeID.DATE, DateType.class) + .put(TypeID.TIMESTAMP, TimestampType.class) + .put(TypeID.DECIMAL, DecimalType.class) + .put(TypeID.UUID, StringType.class) + .put(TypeID.STRING, StringType.class) + .put(TypeID.FIXED, BinaryType.class) + .put(TypeID.BINARY, BinaryType.class) + .build(); } diff --git a/spark/v3.1/spark/src/main/java/org/apache/iceberg/spark/PruneColumnsWithoutReordering.java b/spark/v3.1/spark/src/main/java/org/apache/iceberg/spark/PruneColumnsWithoutReordering.java index c6984e2fe8cd..61a215b938c5 100644 --- a/spark/v3.1/spark/src/main/java/org/apache/iceberg/spark/PruneColumnsWithoutReordering.java +++ b/spark/v3.1/spark/src/main/java/org/apache/iceberg/spark/PruneColumnsWithoutReordering.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark; import java.util.List; @@ -68,7 +67,8 @@ public Type schema(Schema schema, Supplier structResult) { @Override public Type struct(Types.StructType struct, Iterable fieldResults) { - Preconditions.checkNotNull(struct, "Cannot prune null struct. 
Pruning must start with a schema."); + Preconditions.checkNotNull( + struct, "Cannot prune null struct. Pruning must start with a schema."); Preconditions.checkArgument(current instanceof StructType, "Not a struct: %s", current); List fields = struct.fields(); @@ -120,8 +120,10 @@ public Type field(Types.NestedField field, Supplier fieldResult) { int fieldIndex = requestedStruct.fieldIndex(field.name()); StructField requestedField = requestedStruct.fields()[fieldIndex]; - Preconditions.checkArgument(requestedField.nullable() || field.isRequired(), - "Cannot project an optional field as non-null: %s", field.name()); + Preconditions.checkArgument( + requestedField.nullable() || field.isRequired(), + "Cannot project an optional field as non-null: %s", + field.name()); this.current = requestedField.dataType(); try { @@ -139,8 +141,10 @@ public Type list(Types.ListType list, Supplier elementResult) { Preconditions.checkArgument(current instanceof ArrayType, "Not an array: %s", current); ArrayType requestedArray = (ArrayType) current; - Preconditions.checkArgument(requestedArray.containsNull() || !list.isElementOptional(), - "Cannot project an array of optional elements as required elements: %s", requestedArray); + Preconditions.checkArgument( + requestedArray.containsNull() || !list.isElementOptional(), + "Cannot project an array of optional elements as required elements: %s", + requestedArray); this.current = requestedArray.elementType(); try { @@ -165,8 +169,10 @@ public Type map(Types.MapType map, Supplier keyResult, Supplier valu Preconditions.checkArgument(current instanceof MapType, "Not a map: %s", current); MapType requestedMap = (MapType) current; - Preconditions.checkArgument(requestedMap.valueContainsNull() || !map.isValueOptional(), - "Cannot project a map of optional values as required values: %s", map); + Preconditions.checkArgument( + requestedMap.valueContainsNull() || !map.isValueOptional(), + "Cannot project a map of optional values as required values: %s", + map); this.current = requestedMap.valueType(); try { @@ -188,19 +194,27 @@ public Type map(Types.MapType map, Supplier keyResult, Supplier valu @Override public Type primitive(Type.PrimitiveType primitive) { Class expectedType = TYPES.get(primitive.typeId()); - Preconditions.checkArgument(expectedType != null && expectedType.isInstance(current), - "Cannot project %s to incompatible type: %s", primitive, current); + Preconditions.checkArgument( + expectedType != null && expectedType.isInstance(current), + "Cannot project %s to incompatible type: %s", + primitive, + current); // additional checks based on type switch (primitive.typeId()) { case DECIMAL: Types.DecimalType decimal = (Types.DecimalType) primitive; DecimalType requestedDecimal = (DecimalType) current; - Preconditions.checkArgument(requestedDecimal.scale() == decimal.scale(), - "Cannot project decimal with incompatible scale: %s != %s", requestedDecimal.scale(), decimal.scale()); - Preconditions.checkArgument(requestedDecimal.precision() >= decimal.precision(), + Preconditions.checkArgument( + requestedDecimal.scale() == decimal.scale(), + "Cannot project decimal with incompatible scale: %s != %s", + requestedDecimal.scale(), + decimal.scale()); + Preconditions.checkArgument( + requestedDecimal.precision() >= decimal.precision(), "Cannot project decimal with incompatible precision: %s < %s", - requestedDecimal.precision(), decimal.precision()); + requestedDecimal.precision(), + decimal.precision()); break; default: } @@ -208,19 +222,19 @@ public Type 
primitive(Type.PrimitiveType primitive) { return primitive; } - private static final ImmutableMap> TYPES = ImmutableMap - .>builder() - .put(TypeID.BOOLEAN, BooleanType.class) - .put(TypeID.INTEGER, IntegerType.class) - .put(TypeID.LONG, LongType.class) - .put(TypeID.FLOAT, FloatType.class) - .put(TypeID.DOUBLE, DoubleType.class) - .put(TypeID.DATE, DateType.class) - .put(TypeID.TIMESTAMP, TimestampType.class) - .put(TypeID.DECIMAL, DecimalType.class) - .put(TypeID.UUID, StringType.class) - .put(TypeID.STRING, StringType.class) - .put(TypeID.FIXED, BinaryType.class) - .put(TypeID.BINARY, BinaryType.class) - .build(); + private static final ImmutableMap> TYPES = + ImmutableMap.>builder() + .put(TypeID.BOOLEAN, BooleanType.class) + .put(TypeID.INTEGER, IntegerType.class) + .put(TypeID.LONG, LongType.class) + .put(TypeID.FLOAT, FloatType.class) + .put(TypeID.DOUBLE, DoubleType.class) + .put(TypeID.DATE, DateType.class) + .put(TypeID.TIMESTAMP, TimestampType.class) + .put(TypeID.DECIMAL, DecimalType.class) + .put(TypeID.UUID, StringType.class) + .put(TypeID.STRING, StringType.class) + .put(TypeID.FIXED, BinaryType.class) + .put(TypeID.BINARY, BinaryType.class) + .build(); } diff --git a/spark/v3.1/spark/src/main/java/org/apache/iceberg/spark/RollbackStagedTable.java b/spark/v3.1/spark/src/main/java/org/apache/iceberg/spark/RollbackStagedTable.java index a27d06e7a1d7..bc8a966488ee 100644 --- a/spark/v3.1/spark/src/main/java/org/apache/iceberg/spark/RollbackStagedTable.java +++ b/spark/v3.1/spark/src/main/java/org/apache/iceberg/spark/RollbackStagedTable.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark; import java.util.Map; @@ -41,22 +40,25 @@ /** * An implementation of StagedTable that mimics the behavior of Spark's non-atomic CTAS and RTAS. - *
- * A Spark catalog can implement StagingTableCatalog to support atomic operations by producing StagedTable. But if a
- * catalog implements StagingTableCatalog, Spark expects the catalog to be able to produce a StagedTable for any table
- * loaded by the catalog. This assumption doesn't always work, as in the case of {@link SparkSessionCatalog}, which
- * supports atomic operations can produce a StagedTable for Iceberg tables, but wraps the session catalog and cannot
- * necessarily produce a working StagedTable implementation for tables that it loads.
- * <p>
- * The work-around is this class, which implements the StagedTable interface but does not have atomic behavior. Instead,
- * the StagedTable interface is used to implement the behavior of the non-atomic SQL plans that will create a table,
- * write, and will drop the table to roll back.
- * <p>
- * This StagedTable implements SupportsRead, SupportsWrite, and SupportsDelete by passing the calls to the real table.
- * Implementing those interfaces is safe because Spark will not use them unless the table supports them and returns the
- * corresponding capabilities from {@link #capabilities()}.
+ *
+ * <p>A Spark catalog can implement StagingTableCatalog to support atomic operations by producing
+ * StagedTable. But if a catalog implements StagingTableCatalog, Spark expects the catalog to be
+ * able to produce a StagedTable for any table loaded by the catalog. This assumption doesn't always
+ * work, as in the case of {@link SparkSessionCatalog}, which supports atomic operations can produce
+ * a StagedTable for Iceberg tables, but wraps the session catalog and cannot necessarily produce a
+ * working StagedTable implementation for tables that it loads.
+ *
+ * <p>The work-around is this class, which implements the StagedTable interface but does not have
+ * atomic behavior. Instead, the StagedTable interface is used to implement the behavior of the
+ * non-atomic SQL plans that will create a table, write, and will drop the table to roll back.
+ *
+ * <p>
    This StagedTable implements SupportsRead, SupportsWrite, and SupportsDelete by passing the + * calls to the real table. Implementing those interfaces is safe because Spark will not use them + * unless the table supports them and returns the corresponding capabilities from {@link + * #capabilities()}. */ -public class RollbackStagedTable implements StagedTable, SupportsRead, SupportsWrite, SupportsDelete { +public class RollbackStagedTable + implements StagedTable, SupportsRead, SupportsWrite, SupportsDelete { private final TableCatalog catalog; private final Identifier ident; private final Table table; @@ -119,19 +121,22 @@ public WriteBuilder newWriteBuilder(LogicalWriteInfo info) { } private void call(Class requiredClass, Consumer task) { - callReturning(requiredClass, inst -> { - task.accept(inst); - return null; - }); + callReturning( + requiredClass, + inst -> { + task.accept(inst); + return null; + }); } private R callReturning(Class requiredClass, Function task) { if (requiredClass.isInstance(table)) { return task.apply(requiredClass.cast(table)); } else { - throw new UnsupportedOperationException(String.format( - "Table does not implement %s: %s (%s)", - requiredClass.getSimpleName(), table.name(), table.getClass().getName())); + throw new UnsupportedOperationException( + String.format( + "Table does not implement %s: %s (%s)", + requiredClass.getSimpleName(), table.name(), table.getClass().getName())); } } } diff --git a/spark/v3.1/spark/src/main/java/org/apache/iceberg/spark/SortOrderToSpark.java b/spark/v3.1/spark/src/main/java/org/apache/iceberg/spark/SortOrderToSpark.java index 4f67be73405a..41251eb56a86 100644 --- a/spark/v3.1/spark/src/main/java/org/apache/iceberg/spark/SortOrderToSpark.java +++ b/spark/v3.1/spark/src/main/java/org/apache/iceberg/spark/SortOrderToSpark.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark; import org.apache.iceberg.NullOrder; @@ -30,12 +29,14 @@ public OrderField field(String sourceName, int id, SortDirection direction, Null } @Override - public OrderField bucket(String sourceName, int id, int width, SortDirection direction, NullOrder nullOrder) { + public OrderField bucket( + String sourceName, int id, int width, SortDirection direction, NullOrder nullOrder) { return OrderField.bucket(sourceName, width, direction, nullOrder); } @Override - public OrderField truncate(String sourceName, int id, int width, SortDirection direction, NullOrder nullOrder) { + public OrderField truncate( + String sourceName, int id, int width, SortDirection direction, NullOrder nullOrder) { return OrderField.truncate(sourceName, width, direction, nullOrder); } @@ -59,4 +60,3 @@ public OrderField hour(String sourceName, int id, SortDirection direction, NullO return OrderField.hour(sourceName, direction, nullOrder); } } - diff --git a/spark/v3.1/spark/src/main/java/org/apache/iceberg/spark/Spark3Util.java b/spark/v3.1/spark/src/main/java/org/apache/iceberg/spark/Spark3Util.java index c2c3c6992963..a5a58922c933 100644 --- a/spark/v3.1/spark/src/main/java/org/apache/iceberg/spark/Spark3Util.java +++ b/spark/v3.1/spark/src/main/java/org/apache/iceberg/spark/Spark3Util.java @@ -16,9 +16,12 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.spark; +import static org.apache.iceberg.TableProperties.WRITE_DISTRIBUTION_MODE; +import static org.apache.iceberg.TableProperties.WRITE_DISTRIBUTION_MODE_NONE; +import static org.apache.iceberg.TableProperties.WRITE_DISTRIBUTION_MODE_RANGE; + import java.nio.ByteBuffer; import java.util.Arrays; import java.util.Collections; @@ -94,18 +97,13 @@ import scala.collection.JavaConverters; import scala.collection.Seq; -import static org.apache.iceberg.TableProperties.WRITE_DISTRIBUTION_MODE; -import static org.apache.iceberg.TableProperties.WRITE_DISTRIBUTION_MODE_NONE; -import static org.apache.iceberg.TableProperties.WRITE_DISTRIBUTION_MODE_RANGE; - public class Spark3Util { - private static final Set RESERVED_PROPERTIES = ImmutableSet.of( - TableCatalog.PROP_LOCATION, TableCatalog.PROP_PROVIDER); + private static final Set RESERVED_PROPERTIES = + ImmutableSet.of(TableCatalog.PROP_LOCATION, TableCatalog.PROP_PROVIDER); private static final Joiner DOT = Joiner.on("."); - private Spark3Util() { - } + private Spark3Util() {} public static Map rebuildCreateProperties(Map createProperties) { ImmutableMap.Builder tableProperties = ImmutableMap.builder(); @@ -134,7 +132,8 @@ public static Map rebuildCreateProperties(Map cr * @param changes a list of Spark table changes * @return the UpdateProperties operation configured with the changes */ - public static UpdateProperties applyPropertyChanges(UpdateProperties pendingUpdate, List changes) { + public static UpdateProperties applyPropertyChanges( + UpdateProperties pendingUpdate, List changes) { for (TableChange change : changes) { if (change instanceof TableChange.SetProperty) { TableChange.SetProperty set = (TableChange.SetProperty) change; @@ -159,7 +158,8 @@ public static UpdateProperties applyPropertyChanges(UpdateProperties pendingUpda * @param changes a list of Spark table changes * @return the UpdateSchema operation configured with the changes */ - public static UpdateSchema applySchemaChanges(UpdateSchema pendingUpdate, List changes) { + public static UpdateSchema applySchemaChanges( + UpdateSchema pendingUpdate, List changes) { for (TableChange change : changes) { if (change instanceof TableChange.AddColumn) { apply(pendingUpdate, (TableChange.AddColumn) change); @@ -167,8 +167,11 @@ public static UpdateSchema applySchemaChanges(UpdateSchema pendingUpdate, List transforms = PartitionSpecVisitor.visit(spec, - new PartitionSpecVisitor() { - @Override - public Transform identity(String sourceName, int sourceId) { - return Expressions.identity(sourceName); - } - - @Override - public Transform bucket(String sourceName, int sourceId, int numBuckets) { - return Expressions.bucket(numBuckets, sourceName); - } - - @Override - public Transform truncate(String sourceName, int sourceId, int width) { - return Expressions.apply("truncate", Expressions.column(sourceName), Expressions.literal(width)); - } - - @Override - public Transform year(String sourceName, int sourceId) { - return Expressions.years(sourceName); - } - - @Override - public Transform month(String sourceName, int sourceId) { - return Expressions.months(sourceName); - } - - @Override - public Transform day(String sourceName, int sourceId) { - return Expressions.days(sourceName); - } - - @Override - public Transform hour(String sourceName, int sourceId) { - return Expressions.hours(sourceName); - } - - @Override - public Transform alwaysNull(int fieldId, String sourceName, int sourceId) { - // do nothing for alwaysNull, it doesn't need to be converted to a 
transform - return null; - } - - @Override - public Transform unknown(int fieldId, String sourceName, int sourceId, String transform) { - return Expressions.apply(transform, Expressions.column(sourceName)); - } - }); + List transforms = + PartitionSpecVisitor.visit( + spec, + new PartitionSpecVisitor() { + @Override + public Transform identity(String sourceName, int sourceId) { + return Expressions.identity(sourceName); + } + + @Override + public Transform bucket(String sourceName, int sourceId, int numBuckets) { + return Expressions.bucket(numBuckets, sourceName); + } + + @Override + public Transform truncate(String sourceName, int sourceId, int width) { + return Expressions.apply( + "truncate", Expressions.column(sourceName), Expressions.literal(width)); + } + + @Override + public Transform year(String sourceName, int sourceId) { + return Expressions.years(sourceName); + } + + @Override + public Transform month(String sourceName, int sourceId) { + return Expressions.months(sourceName); + } + + @Override + public Transform day(String sourceName, int sourceId) { + return Expressions.days(sourceName); + } + + @Override + public Transform hour(String sourceName, int sourceId) { + return Expressions.hours(sourceName); + } + + @Override + public Transform alwaysNull(int fieldId, String sourceName, int sourceId) { + // do nothing for alwaysNull, it doesn't need to be converted to a transform + return null; + } + + @Override + public Transform unknown( + int fieldId, String sourceName, int sourceId, String transform) { + return Expressions.apply(transform, Expressions.column(sourceName)); + } + }); return transforms.stream().filter(Objects::nonNull).toArray(Transform[]::new); } @@ -326,7 +340,8 @@ public static Distribution buildRequiredDistribution(org.apache.iceberg.Table ta } } - public static SortOrder[] buildRequiredOrdering(Distribution distribution, org.apache.iceberg.Table table) { + public static SortOrder[] buildRequiredOrdering( + Distribution distribution, org.apache.iceberg.Table table) { if (distribution instanceof OrderedDistribution) { OrderedDistribution orderedDistribution = (OrderedDistribution) distribution; return orderedDistribution.ordering(); @@ -338,7 +353,8 @@ public static SortOrder[] buildRequiredOrdering(Distribution distribution, org.a public static DistributionMode distributionModeFor(org.apache.iceberg.Table table) { boolean isSortedTable = !table.sortOrder().isUnsorted(); - String defaultModeName = isSortedTable ? WRITE_DISTRIBUTION_MODE_RANGE : WRITE_DISTRIBUTION_MODE_NONE; + String defaultModeName = + isSortedTable ? 
WRITE_DISTRIBUTION_MODE_RANGE : WRITE_DISTRIBUTION_MODE_NONE; String modeName = table.properties().getOrDefault(WRITE_DISTRIBUTION_MODE, defaultModeName); return DistributionMode.fromName(modeName); } @@ -349,8 +365,10 @@ public static SortOrder[] convert(org.apache.iceberg.SortOrder sortOrder) { } public static Term toIcebergTerm(Transform transform) { - Preconditions.checkArgument(transform.references().length == 1, - "Cannot convert transform with more than one column reference: %s", transform); + Preconditions.checkArgument( + transform.references().length == 1, + "Cannot convert transform with more than one column reference: %s", + transform); String colName = DOT.join(transform.references()[0].fieldNames()); switch (transform.name().toLowerCase(Locale.ROOT)) { case "identity": @@ -388,8 +406,10 @@ public static PartitionSpec toPartitionSpec(Schema schema, Transform[] partition PartitionSpec.Builder builder = PartitionSpec.builderFor(schema); for (Transform transform : partitioning) { - Preconditions.checkArgument(transform.references().length == 1, - "Cannot convert transform with more than one column reference: %s", transform); + Preconditions.checkArgument( + transform.references().length == 1, + "Cannot convert transform with more than one column reference: %s", + transform); String colName = DOT.join(transform.references()[0].fieldNames()); switch (transform.name().toLowerCase(Locale.ROOT)) { case "identity": @@ -429,14 +449,16 @@ private static int findWidth(Transform transform) { if (expr instanceof Literal) { if (((Literal) expr).dataType() instanceof IntegerType) { Literal lit = (Literal) expr; - Preconditions.checkArgument(lit.value() > 0, - "Unsupported width for transform: %s", transform.describe()); + Preconditions.checkArgument( + lit.value() > 0, "Unsupported width for transform: %s", transform.describe()); return lit.value(); } else if (((Literal) expr).dataType() instanceof LongType) { Literal lit = (Literal) expr; - Preconditions.checkArgument(lit.value() > 0 && lit.value() < Integer.MAX_VALUE, - "Unsupported width for transform: %s", transform.describe()); + Preconditions.checkArgument( + lit.value() > 0 && lit.value() < Integer.MAX_VALUE, + "Unsupported width for transform: %s", + transform.describe()); if (lit.value() > Integer.MAX_VALUE) { throw new IllegalArgumentException(); } @@ -449,7 +471,8 @@ private static int findWidth(Transform transform) { } private static String leafName(String[] fieldNames) { - Preconditions.checkArgument(fieldNames.length > 0, "Invalid field name: at least one name is required"); + Preconditions.checkArgument( + fieldNames.length > 0, "Invalid field name: at least one name is required"); return fieldNames[fieldNames.length - 1]; } @@ -485,7 +508,8 @@ public static String describe(org.apache.iceberg.SortOrder order) { return Joiner.on(", ").join(SortOrderVisitor.visit(order, DescribeSortOrderVisitor.INSTANCE)); } - public static Long propertyAsLong(CaseInsensitiveStringMap options, String property, Long defaultValue) { + public static Long propertyAsLong( + CaseInsensitiveStringMap options, String property, Long defaultValue) { if (defaultValue != null) { return options.getLong(property, defaultValue); } @@ -498,7 +522,8 @@ public static Long propertyAsLong(CaseInsensitiveStringMap options, String prope return null; } - public static Integer propertyAsInt(CaseInsensitiveStringMap options, String property, Integer defaultValue) { + public static Integer propertyAsInt( + CaseInsensitiveStringMap options, String property, Integer 
defaultValue) { if (defaultValue != null) { return options.getInt(property, defaultValue); } @@ -511,7 +536,8 @@ public static Integer propertyAsInt(CaseInsensitiveStringMap options, String pro return null; } - public static Boolean propertyAsBoolean(CaseInsensitiveStringMap options, String property, Boolean defaultValue) { + public static Boolean propertyAsBoolean( + CaseInsensitiveStringMap options, String property, Boolean defaultValue) { if (defaultValue != null) { return options.getBoolean(property, defaultValue); } @@ -528,8 +554,7 @@ public static class DescribeSchemaVisitor extends TypeUtil.SchemaVisitor private static final Joiner COMMA = Joiner.on(','); private static final DescribeSchemaVisitor INSTANCE = new DescribeSchemaVisitor(); - private DescribeSchemaVisitor() { - } + private DescribeSchemaVisitor() {} @Override public String schema(Schema schema, String structResult) { @@ -589,11 +614,11 @@ public String primitive(Type.PrimitiveType primitive) { } } - private static class DescribeExpressionVisitor extends ExpressionVisitors.ExpressionVisitor { + private static class DescribeExpressionVisitor + extends ExpressionVisitors.ExpressionVisitor { private static final DescribeExpressionVisitor INSTANCE = new DescribeExpressionVisitor(); - private DescribeExpressionVisitor() { - } + private DescribeExpressionVisitor() {} @Override public String alwaysTrue() { @@ -662,7 +687,9 @@ public String predicate(UnboundPredicate pred) { } private static String sqlString(List> literals) { - return literals.stream().map(DescribeExpressionVisitor::sqlString).collect(Collectors.joining(", ")); + return literals.stream() + .map(DescribeExpressionVisitor::sqlString) + .collect(Collectors.joining(", ")); } private static String sqlString(org.apache.iceberg.expressions.Literal lit) { @@ -684,18 +711,21 @@ private static String sqlString(org.apache.iceberg.expressions.Literal lit) { * @param type the type of metadata table * @return a Dataset that will read the metadata table */ - private static Dataset loadMetadataTable(SparkSession spark, org.apache.iceberg.Table table, - MetadataTableType type) { - Table metadataTable = new SparkTable(MetadataTableUtils.createMetadataTableInstance(table, type), false); - return Dataset.ofRows(spark, DataSourceV2Relation.create(metadataTable, Some.empty(), Some.empty())); + private static Dataset loadMetadataTable( + SparkSession spark, org.apache.iceberg.Table table, MetadataTableType type) { + Table metadataTable = + new SparkTable(MetadataTableUtils.createMetadataTableInstance(table, type), false); + return Dataset.ofRows( + spark, DataSourceV2Relation.create(metadataTable, Some.empty(), Some.empty())); } /** - * Returns an Iceberg Table by its name from a Spark V2 Catalog. If cache is enabled in {@link SparkCatalog}, - * the {@link TableOperations} of the table may be stale, please refresh the table to get the latest one. + * Returns an Iceberg Table by its name from a Spark V2 Catalog. If cache is enabled in {@link + * SparkCatalog}, the {@link TableOperations} of the table may be stale, please refresh the table + * to get the latest one. 
* * @param spark SparkSession used for looking up catalog references and tables - * @param name The multipart identifier of the Iceberg table + * @param name The multipart identifier of the Iceberg table * @return an Iceberg table */ public static org.apache.iceberg.Table loadIcebergTable(SparkSession spark, String name) @@ -707,24 +737,28 @@ public static org.apache.iceberg.Table loadIcebergTable(SparkSession spark, Stri return toIcebergTable(sparkTable); } - public static CatalogAndIdentifier catalogAndIdentifier(SparkSession spark, String name) throws ParseException { - return catalogAndIdentifier(spark, name, spark.sessionState().catalogManager().currentCatalog()); + public static CatalogAndIdentifier catalogAndIdentifier(SparkSession spark, String name) + throws ParseException { + return catalogAndIdentifier( + spark, name, spark.sessionState().catalogManager().currentCatalog()); } - public static CatalogAndIdentifier catalogAndIdentifier(SparkSession spark, String name, - CatalogPlugin defaultCatalog) throws ParseException { + public static CatalogAndIdentifier catalogAndIdentifier( + SparkSession spark, String name, CatalogPlugin defaultCatalog) throws ParseException { ParserInterface parser = spark.sessionState().sqlParser(); Seq multiPartIdentifier = parser.parseMultipartIdentifier(name); List javaMultiPartIdentifier = JavaConverters.seqAsJavaList(multiPartIdentifier); return catalogAndIdentifier(spark, javaMultiPartIdentifier, defaultCatalog); } - public static CatalogAndIdentifier catalogAndIdentifier(String description, SparkSession spark, String name) { - return catalogAndIdentifier(description, spark, name, spark.sessionState().catalogManager().currentCatalog()); + public static CatalogAndIdentifier catalogAndIdentifier( + String description, SparkSession spark, String name) { + return catalogAndIdentifier( + description, spark, name, spark.sessionState().catalogManager().currentCatalog()); } - public static CatalogAndIdentifier catalogAndIdentifier(String description, SparkSession spark, - String name, CatalogPlugin defaultCatalog) { + public static CatalogAndIdentifier catalogAndIdentifier( + String description, SparkSession spark, String name, CatalogPlugin defaultCatalog) { try { return catalogAndIdentifier(spark, name, defaultCatalog); } catch (ParseException e) { @@ -732,20 +766,23 @@ public static CatalogAndIdentifier catalogAndIdentifier(String description, Spar } } - public static CatalogAndIdentifier catalogAndIdentifier(SparkSession spark, List nameParts) { - return catalogAndIdentifier(spark, nameParts, spark.sessionState().catalogManager().currentCatalog()); + public static CatalogAndIdentifier catalogAndIdentifier( + SparkSession spark, List nameParts) { + return catalogAndIdentifier( + spark, nameParts, spark.sessionState().catalogManager().currentCatalog()); } /** - * A modified version of Spark's LookupCatalog.CatalogAndIdentifier.unapply - * Attempts to find the catalog and identifier a multipart identifier represents + * A modified version of Spark's LookupCatalog.CatalogAndIdentifier.unapply Attempts to find the + * catalog and identifier a multipart identifier represents + * * @param spark Spark session to use for resolution * @param nameParts Multipart identifier representing a table * @param defaultCatalog Catalog to use if none is specified * @return The CatalogPlugin and Identifier for the table */ - public static CatalogAndIdentifier catalogAndIdentifier(SparkSession spark, List nameParts, - CatalogPlugin defaultCatalog) { + public static 
CatalogAndIdentifier catalogAndIdentifier( + SparkSession spark, List nameParts, CatalogPlugin defaultCatalog) { CatalogManager catalogManager = spark.sessionState().catalogManager(); String[] currentNamespace; @@ -755,18 +792,19 @@ public static CatalogAndIdentifier catalogAndIdentifier(SparkSession spark, List currentNamespace = defaultCatalog.defaultNamespace(); } - Pair catalogIdentifier = SparkUtil.catalogAndIdentifier(nameParts, - catalogName -> { - try { - return catalogManager.catalog(catalogName); - } catch (Exception e) { - return null; - } - }, - Identifier::of, - defaultCatalog, - currentNamespace - ); + Pair catalogIdentifier = + SparkUtil.catalogAndIdentifier( + nameParts, + catalogName -> { + try { + return catalogManager.catalog(catalogName); + } catch (Exception e) { + return null; + } + }, + Identifier::of, + defaultCatalog, + currentNamespace); return new CatalogAndIdentifier(catalogIdentifier); } @@ -775,18 +813,17 @@ private static TableCatalog asTableCatalog(CatalogPlugin catalog) { return (TableCatalog) catalog; } - throw new IllegalArgumentException(String.format( - "Cannot use catalog %s(%s): not a TableCatalog", catalog.name(), catalog.getClass().getName())); + throw new IllegalArgumentException( + String.format( + "Cannot use catalog %s(%s): not a TableCatalog", + catalog.name(), catalog.getClass().getName())); } - /** - * This mimics a class inside of Spark which is private inside of LookupCatalog. - */ + /** This mimics a class inside of Spark which is private inside of LookupCatalog. */ public static class CatalogAndIdentifier { private final CatalogPlugin catalog; private final Identifier identifier; - public CatalogAndIdentifier(CatalogPlugin catalog, Identifier identifier) { this.catalog = catalog; this.identifier = identifier; @@ -818,49 +855,53 @@ public static TableIdentifier identifierToTableIdentifier(Identifier identifier) * @param format format of the file * @return all table's partitions */ - public static List getPartitions(SparkSession spark, Path rootPath, String format) { + public static List getPartitions( + SparkSession spark, Path rootPath, String format) { FileStatusCache fileStatusCache = FileStatusCache.getOrCreate(spark); Map emptyMap = Collections.emptyMap(); - InMemoryFileIndex fileIndex = new InMemoryFileIndex( - spark, - JavaConverters - .collectionAsScalaIterableConverter(ImmutableList.of(rootPath)) - .asScala() - .toSeq(), - JavaConverters - .mapAsScalaMapConverter(emptyMap) - .asScala() - .toMap(Predef.conforms()), - Option.empty(), - fileStatusCache, - Option.empty(), - Option.empty()); + InMemoryFileIndex fileIndex = + new InMemoryFileIndex( + spark, + JavaConverters.collectionAsScalaIterableConverter(ImmutableList.of(rootPath)) + .asScala() + .toSeq(), + JavaConverters.mapAsScalaMapConverter(emptyMap).asScala().toMap(Predef.conforms()), + Option.empty(), + fileStatusCache, + Option.empty(), + Option.empty()); org.apache.spark.sql.execution.datasources.PartitionSpec spec = fileIndex.partitionSpec(); StructType schema = spec.partitionColumns(); - return JavaConverters - .seqAsJavaListConverter(spec.partitions()) - .asJava() - .stream() - .map(partition -> { - Map values = Maps.newHashMap(); - JavaConverters.asJavaIterableConverter(schema).asJava().forEach(field -> { - int fieldIndex = schema.fieldIndex(field.name()); - Object catalystValue = partition.values().get(fieldIndex, field.dataType()); - Object value = CatalystTypeConverters.convertToScala(catalystValue, field.dataType()); - values.put(field.name(), 
String.valueOf(value)); - }); - return new SparkPartition(values, partition.path().toString(), format); - }).collect(Collectors.toList()); - } - - public static org.apache.spark.sql.catalyst.TableIdentifier toV1TableIdentifier(Identifier identifier) { + return JavaConverters.seqAsJavaListConverter(spec.partitions()).asJava().stream() + .map( + partition -> { + Map values = Maps.newHashMap(); + JavaConverters.asJavaIterableConverter(schema) + .asJava() + .forEach( + field -> { + int fieldIndex = schema.fieldIndex(field.name()); + Object catalystValue = partition.values().get(fieldIndex, field.dataType()); + Object value = + CatalystTypeConverters.convertToScala(catalystValue, field.dataType()); + values.put(field.name(), String.valueOf(value)); + }); + return new SparkPartition(values, partition.path().toString(), format); + }) + .collect(Collectors.toList()); + } + + public static org.apache.spark.sql.catalyst.TableIdentifier toV1TableIdentifier( + Identifier identifier) { String[] namespace = identifier.namespace(); - Preconditions.checkArgument(namespace.length <= 1, - "Cannot convert %s to a Spark v1 identifier, namespace contains more than 1 part", identifier); + Preconditions.checkArgument( + namespace.length <= 1, + "Cannot convert %s to a Spark v1 identifier, namespace contains more than 1 part", + identifier); String table = identifier.name(); Option database = namespace.length == 1 ? Option.apply(namespace[0]) : Option.empty(); @@ -870,54 +911,80 @@ public static org.apache.spark.sql.catalyst.TableIdentifier toV1TableIdentifier( private static class DescribeSortOrderVisitor implements SortOrderVisitor { private static final DescribeSortOrderVisitor INSTANCE = new DescribeSortOrderVisitor(); - private DescribeSortOrderVisitor() { - } + private DescribeSortOrderVisitor() {} @Override - public String field(String sourceName, int sourceId, - org.apache.iceberg.SortDirection direction, NullOrder nullOrder) { + public String field( + String sourceName, + int sourceId, + org.apache.iceberg.SortDirection direction, + NullOrder nullOrder) { return String.format("%s %s %s", sourceName, direction, nullOrder); } @Override - public String bucket(String sourceName, int sourceId, int numBuckets, - org.apache.iceberg.SortDirection direction, NullOrder nullOrder) { + public String bucket( + String sourceName, + int sourceId, + int numBuckets, + org.apache.iceberg.SortDirection direction, + NullOrder nullOrder) { return String.format("bucket(%s, %s) %s %s", numBuckets, sourceName, direction, nullOrder); } @Override - public String truncate(String sourceName, int sourceId, int width, - org.apache.iceberg.SortDirection direction, NullOrder nullOrder) { + public String truncate( + String sourceName, + int sourceId, + int width, + org.apache.iceberg.SortDirection direction, + NullOrder nullOrder) { return String.format("truncate(%s, %s) %s %s", sourceName, width, direction, nullOrder); } @Override - public String year(String sourceName, int sourceId, - org.apache.iceberg.SortDirection direction, NullOrder nullOrder) { + public String year( + String sourceName, + int sourceId, + org.apache.iceberg.SortDirection direction, + NullOrder nullOrder) { return String.format("years(%s) %s %s", sourceName, direction, nullOrder); } @Override - public String month(String sourceName, int sourceId, - org.apache.iceberg.SortDirection direction, NullOrder nullOrder) { + public String month( + String sourceName, + int sourceId, + org.apache.iceberg.SortDirection direction, + NullOrder nullOrder) { return 
String.format("months(%s) %s %s", sourceName, direction, nullOrder); } @Override - public String day(String sourceName, int sourceId, - org.apache.iceberg.SortDirection direction, NullOrder nullOrder) { + public String day( + String sourceName, + int sourceId, + org.apache.iceberg.SortDirection direction, + NullOrder nullOrder) { return String.format("days(%s) %s %s", sourceName, direction, nullOrder); } @Override - public String hour(String sourceName, int sourceId, - org.apache.iceberg.SortDirection direction, NullOrder nullOrder) { + public String hour( + String sourceName, + int sourceId, + org.apache.iceberg.SortDirection direction, + NullOrder nullOrder) { return String.format("hours(%s) %s %s", sourceName, direction, nullOrder); } @Override - public String unknown(String sourceName, int sourceId, String transform, - org.apache.iceberg.SortDirection direction, NullOrder nullOrder) { + public String unknown( + String sourceName, + int sourceId, + String transform, + org.apache.iceberg.SortDirection direction, + NullOrder nullOrder) { return String.format("%s(%s) %s %s", transform, sourceName, direction, nullOrder); } } diff --git a/spark/v3.1/spark/src/main/java/org/apache/iceberg/spark/SparkCatalog.java b/spark/v3.1/spark/src/main/java/org/apache/iceberg/spark/SparkCatalog.java index 117e7e629431..de0ff91481e5 100644 --- a/spark/v3.1/spark/src/main/java/org/apache/iceberg/spark/SparkCatalog.java +++ b/spark/v3.1/spark/src/main/java/org/apache/iceberg/spark/SparkCatalog.java @@ -16,9 +16,11 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark; +import static org.apache.iceberg.TableProperties.GC_ENABLED; +import static org.apache.iceberg.TableProperties.GC_ENABLED_DEFAULT; + import java.util.Arrays; import java.util.List; import java.util.Map; @@ -74,26 +76,24 @@ import org.apache.spark.sql.types.StructType; import org.apache.spark.sql.util.CaseInsensitiveStringMap; -import static org.apache.iceberg.TableProperties.GC_ENABLED; -import static org.apache.iceberg.TableProperties.GC_ENABLED_DEFAULT; - /** * A Spark TableCatalog implementation that wraps an Iceberg {@link Catalog}. - *
- * This supports the following catalog configuration options:
+ *
+ * This supports the following catalog configuration options:
+ *
 *
- *   • type - catalog type, "hive" or "hadoop".
- *     To specify a non-hive or hadoop catalog, use the catalog-impl option.
- *   • uri - the Hive Metastore URI (Hive catalog only)
- *   • warehouse - the warehouse path (Hadoop catalog only)
- *   • catalog-impl - a custom {@link Catalog} implementation to use
- *   • default-namespace - a namespace to use as the default
- *   • cache-enabled - whether to enable catalog cache
- *   • cache.expiration-interval-ms - interval in millis before expiring tables from catalog cache.
- *     Refer to {@link CatalogProperties#CACHE_EXPIRATION_INTERVAL_MS} for further details and significant values.
+ *   • type - catalog type, "hive" or "hadoop". To specify a non-hive or hadoop
+ *       catalog, use the catalog-impl option.
+ *   • uri - the Hive Metastore URI (Hive catalog only)
+ *   • warehouse - the warehouse path (Hadoop catalog only)
+ *   • catalog-impl - a custom {@link Catalog} implementation to use
+ *   • default-namespace - a namespace to use as the default
+ *   • cache-enabled - whether to enable catalog cache
+ *   • cache.expiration-interval-ms - interval in millis before expiring tables from
+ *       catalog cache. Refer to {@link CatalogProperties#CACHE_EXPIRATION_INTERVAL_MS} for further
+ *       details and significant values.
 *
+ *
 *
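Editorial aside, not part of the patch: the catalog options listed above are normally supplied through Spark SQL configuration under the spark.sql.catalog.<name> prefix. Below is a minimal sketch assuming an illustrative catalog name "demo" and placeholder metastore values; none of these names come from the diff.

import org.apache.spark.sql.SparkSession;

// Sketch only: wires the documented options into a SparkSession via conf keys.
// "demo", the thrift URI, and the interval values are illustrative assumptions.
public class SparkCatalogConfigSketch {
  public static void main(String[] args) {
    SparkSession spark =
        SparkSession.builder()
            .appName("iceberg-spark-catalog-sketch")
            .config("spark.sql.catalog.demo", "org.apache.iceberg.spark.SparkCatalog")
            .config("spark.sql.catalog.demo.type", "hive")
            .config("spark.sql.catalog.demo.uri", "thrift://metastore:9083")
            .config("spark.sql.catalog.demo.default-namespace", "db")
            .config("spark.sql.catalog.demo.cache-enabled", "true")
            .config("spark.sql.catalog.demo.cache.expiration-interval-ms", "60000")
            .getOrCreate();

    // Tables are then addressable as demo.<namespace>.<table> in Spark SQL.
    spark.sql("SHOW NAMESPACES IN demo").show();
    spark.stop();
  }
}

With a configuration like this, cache-enabled and cache.expiration-interval-ms drive the CachingCatalog wrapping performed in initialize() later in this hunk.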
    */ public class SparkCatalog extends BaseCatalog { @@ -147,17 +147,18 @@ public SparkTable loadTable(Identifier ident) throws NoSuchTableException { } @Override - public SparkTable createTable(Identifier ident, StructType schema, - Transform[] transforms, - Map properties) throws TableAlreadyExistsException { + public SparkTable createTable( + Identifier ident, StructType schema, Transform[] transforms, Map properties) + throws TableAlreadyExistsException { Schema icebergSchema = SparkSchemaUtil.convert(schema, useTimestampsWithoutZone); try { Catalog.TableBuilder builder = newBuilder(ident, icebergSchema); - Table icebergTable = builder - .withPartitionSpec(Spark3Util.toPartitionSpec(icebergSchema, transforms)) - .withLocation(properties.get("location")) - .withProperties(Spark3Util.rebuildCreateProperties(properties)) - .create(); + Table icebergTable = + builder + .withPartitionSpec(Spark3Util.toPartitionSpec(icebergSchema, transforms)) + .withLocation(properties.get("location")) + .withProperties(Spark3Util.rebuildCreateProperties(properties)) + .create(); return new SparkTable(icebergTable, !cacheEnabled); } catch (AlreadyExistsException e) { throw new TableAlreadyExistsException(ident); @@ -165,15 +166,18 @@ public SparkTable createTable(Identifier ident, StructType schema, } @Override - public StagedTable stageCreate(Identifier ident, StructType schema, Transform[] transforms, - Map properties) throws TableAlreadyExistsException { + public StagedTable stageCreate( + Identifier ident, StructType schema, Transform[] transforms, Map properties) + throws TableAlreadyExistsException { Schema icebergSchema = SparkSchemaUtil.convert(schema, useTimestampsWithoutZone); try { Catalog.TableBuilder builder = newBuilder(ident, icebergSchema); - Transaction transaction = builder.withPartitionSpec(Spark3Util.toPartitionSpec(icebergSchema, transforms)) - .withLocation(properties.get("location")) - .withProperties(Spark3Util.rebuildCreateProperties(properties)) - .createTransaction(); + Transaction transaction = + builder + .withPartitionSpec(Spark3Util.toPartitionSpec(icebergSchema, transforms)) + .withLocation(properties.get("location")) + .withProperties(Spark3Util.rebuildCreateProperties(properties)) + .createTransaction(); return new StagedSparkTable(transaction); } catch (AlreadyExistsException e) { throw new TableAlreadyExistsException(ident); @@ -181,15 +185,18 @@ public StagedTable stageCreate(Identifier ident, StructType schema, Transform[] } @Override - public StagedTable stageReplace(Identifier ident, StructType schema, Transform[] transforms, - Map properties) throws NoSuchTableException { + public StagedTable stageReplace( + Identifier ident, StructType schema, Transform[] transforms, Map properties) + throws NoSuchTableException { Schema icebergSchema = SparkSchemaUtil.convert(schema, useTimestampsWithoutZone); try { Catalog.TableBuilder builder = newBuilder(ident, icebergSchema); - Transaction transaction = builder.withPartitionSpec(Spark3Util.toPartitionSpec(icebergSchema, transforms)) - .withLocation(properties.get("location")) - .withProperties(Spark3Util.rebuildCreateProperties(properties)) - .replaceTransaction(); + Transaction transaction = + builder + .withPartitionSpec(Spark3Util.toPartitionSpec(icebergSchema, transforms)) + .withLocation(properties.get("location")) + .withProperties(Spark3Util.rebuildCreateProperties(properties)) + .replaceTransaction(); return new StagedSparkTable(transaction); } catch (org.apache.iceberg.exceptions.NoSuchTableException e) { throw new 
NoSuchTableException(ident); @@ -197,19 +204,22 @@ public StagedTable stageReplace(Identifier ident, StructType schema, Transform[] } @Override - public StagedTable stageCreateOrReplace(Identifier ident, StructType schema, Transform[] transforms, - Map properties) { + public StagedTable stageCreateOrReplace( + Identifier ident, StructType schema, Transform[] transforms, Map properties) { Schema icebergSchema = SparkSchemaUtil.convert(schema, useTimestampsWithoutZone); Catalog.TableBuilder builder = newBuilder(ident, icebergSchema); - Transaction transaction = builder.withPartitionSpec(Spark3Util.toPartitionSpec(icebergSchema, transforms)) - .withLocation(properties.get("location")) - .withProperties(Spark3Util.rebuildCreateProperties(properties)) - .createOrReplaceTransaction(); + Transaction transaction = + builder + .withPartitionSpec(Spark3Util.toPartitionSpec(icebergSchema, transforms)) + .withLocation(properties.get("location")) + .withProperties(Spark3Util.rebuildCreateProperties(properties)) + .createOrReplaceTransaction(); return new StagedSparkTable(transaction); } @Override - public SparkTable alterTable(Identifier ident, TableChange... changes) throws NoSuchTableException { + public SparkTable alterTable(Identifier ident, TableChange... changes) + throws NoSuchTableException { SetProperty setLocation = null; SetProperty setSnapshotId = null; SetProperty pickSnapshotId = null; @@ -226,8 +236,9 @@ public SparkTable alterTable(Identifier ident, TableChange... changes) throws No } else if ("cherry-pick-snapshot-id".equalsIgnoreCase(set.property())) { pickSnapshotId = set; } else if ("sort-order".equalsIgnoreCase(set.property())) { - throw new UnsupportedOperationException("Cannot specify the 'sort-order' because it's a reserved table " + - "property. Please use the command 'ALTER TABLE ... WRITE ORDERED BY' to specify write sort-orders."); + throw new UnsupportedOperationException( + "Cannot specify the 'sort-order' because it's a reserved table " + + "property. Please use the command 'ALTER TABLE ... WRITE ORDERED BY' to specify write sort-orders."); } else { propertyChanges.add(set); } @@ -242,7 +253,8 @@ public SparkTable alterTable(Identifier ident, TableChange... changes) throws No try { Table table = load(ident).first(); - commitChanges(table, setLocation, setSnapshotId, pickSnapshotId, propertyChanges, schemaChanges); + commitChanges( + table, setLocation, setSnapshotId, pickSnapshotId, propertyChanges, schemaChanges); return new SparkTable(table, true /* refreshEagerly */); } catch (org.apache.iceberg.exceptions.NoSuchTableException e) { throw new NoSuchTableException(ident); @@ -261,20 +273,19 @@ public boolean purgeTable(Identifier ident) { ValidationException.check( PropertyUtil.propertyAsBoolean(table.properties(), GC_ENABLED, GC_ENABLED_DEFAULT), "Cannot purge table: GC is disabled (deleting files may corrupt other tables)"); - String metadataFileLocation = ((HasTableOperations) table).operations().current().metadataFileLocation(); + String metadataFileLocation = + ((HasTableOperations) table).operations().current().metadataFileLocation(); boolean dropped = dropTableWithoutPurging(ident); if (dropped) { - // We should check whether the metadata file exists. Because the HadoopCatalog/HadoopTables will drop the + // We should check whether the metadata file exists. Because the HadoopCatalog/HadoopTables + // will drop the // warehouse directly and ignore the `purge` argument. 
boolean metadataFileExists = table.io().newInputFile(metadataFileLocation).exists(); if (metadataFileExists) { - SparkActions.get() - .deleteReachableFiles(metadataFileLocation) - .io(table.io()) - .execute(); + SparkActions.get().deleteReachableFiles(metadataFileLocation).io(table.io()).execute(); } } @@ -293,7 +304,8 @@ private boolean dropTableWithoutPurging(Identifier ident) { } @Override - public void renameTable(Identifier from, Identifier to) throws NoSuchTableException, TableAlreadyExistsException { + public void renameTable(Identifier from, Identifier to) + throws NoSuchTableException, TableAlreadyExistsException { try { checkNotPathIdentifier(from, "renameTable"); checkNotPathIdentifier(to, "renameTable"); @@ -355,7 +367,8 @@ public String[][] listNamespaces(String[] namespace) throws NoSuchNamespaceExcep } @Override - public Map loadNamespaceMetadata(String[] namespace) throws NoSuchNamespaceException { + public Map loadNamespaceMetadata(String[] namespace) + throws NoSuchNamespaceException { if (asNamespaceCatalog != null) { try { return asNamespaceCatalog.loadNamespaceMetadata(Namespace.of(namespace)); @@ -368,10 +381,12 @@ public Map loadNamespaceMetadata(String[] namespace) throws NoSu } @Override - public void createNamespace(String[] namespace, Map metadata) throws NamespaceAlreadyExistsException { + public void createNamespace(String[] namespace, Map metadata) + throws NamespaceAlreadyExistsException { if (asNamespaceCatalog != null) { try { - if (asNamespaceCatalog instanceof HadoopCatalog && DEFAULT_NS_KEYS.equals(metadata.keySet())) { + if (asNamespaceCatalog instanceof HadoopCatalog + && DEFAULT_NS_KEYS.equals(metadata.keySet())) { // Hadoop catalog will reject metadata properties, but Spark automatically adds "owner". // If only the automatic properties are present, replace metadata with an empty map. asNamespaceCatalog.createNamespace(Namespace.of(namespace), ImmutableMap.of()); @@ -382,12 +397,14 @@ public void createNamespace(String[] namespace, Map metadata) th throw new NamespaceAlreadyExistsException(namespace); } } else { - throw new UnsupportedOperationException("Namespaces are not supported by catalog: " + catalogName); + throw new UnsupportedOperationException( + "Namespaces are not supported by catalog: " + catalogName); } } @Override - public void alterNamespace(String[] namespace, NamespaceChange... changes) throws NoSuchNamespaceException { + public void alterNamespace(String[] namespace, NamespaceChange... changes) + throws NoSuchNamespaceException { if (asNamespaceCatalog != null) { Map updates = Maps.newHashMap(); Set removals = Sets.newHashSet(); @@ -398,7 +415,8 @@ public void alterNamespace(String[] namespace, NamespaceChange... 
changes) throw } else if (change instanceof NamespaceChange.RemoveProperty) { removals.add(((NamespaceChange.RemoveProperty) change).property()); } else { - throw new UnsupportedOperationException("Cannot apply unknown namespace change: " + change); + throw new UnsupportedOperationException( + "Cannot apply unknown namespace change: " + change); } } @@ -434,12 +452,15 @@ public boolean dropNamespace(String[] namespace) throws NoSuchNamespaceException @Override public final void initialize(String name, CaseInsensitiveStringMap options) { - this.cacheEnabled = PropertyUtil.propertyAsBoolean(options, - CatalogProperties.CACHE_ENABLED, CatalogProperties.CACHE_ENABLED_DEFAULT); + this.cacheEnabled = + PropertyUtil.propertyAsBoolean( + options, CatalogProperties.CACHE_ENABLED, CatalogProperties.CACHE_ENABLED_DEFAULT); - long cacheExpirationIntervalMs = PropertyUtil.propertyAsLong(options, - CatalogProperties.CACHE_EXPIRATION_INTERVAL_MS, - CatalogProperties.CACHE_EXPIRATION_INTERVAL_MS_DEFAULT); + long cacheExpirationIntervalMs = + PropertyUtil.propertyAsLong( + options, + CatalogProperties.CACHE_EXPIRATION_INTERVAL_MS, + CatalogProperties.CACHE_EXPIRATION_INTERVAL_MS_DEFAULT); // An expiration interval of 0ms effectively disables caching. // Do not wrap with CachingCatalog. @@ -451,15 +472,17 @@ public final void initialize(String name, CaseInsensitiveStringMap options) { this.catalogName = name; SparkSession sparkSession = SparkSession.active(); - this.useTimestampsWithoutZone = SparkUtil.useTimestampWithoutZoneInNewTables(sparkSession.conf()); - this.tables = new HadoopTables(SparkUtil.hadoopConfCatalogOverrides(SparkSession.active(), name)); - this.icebergCatalog = cacheEnabled ? CachingCatalog.wrap(catalog, cacheExpirationIntervalMs) : catalog; + this.useTimestampsWithoutZone = + SparkUtil.useTimestampWithoutZoneInNewTables(sparkSession.conf()); + this.tables = + new HadoopTables(SparkUtil.hadoopConfCatalogOverrides(SparkSession.active(), name)); + this.icebergCatalog = + cacheEnabled ? 
CachingCatalog.wrap(catalog, cacheExpirationIntervalMs) : catalog; if (catalog instanceof SupportsNamespaces) { this.asNamespaceCatalog = (SupportsNamespaces) catalog; if (options.containsKey("default-namespace")) { - this.defaultNamespace = Splitter.on('.') - .splitToList(options.get("default-namespace")) - .toArray(new String[0]); + this.defaultNamespace = + Splitter.on('.').splitToList(options.get("default-namespace")).toArray(new String[0]); } } } @@ -469,12 +492,18 @@ public String name() { return catalogName; } - private static void commitChanges(Table table, SetProperty setLocation, SetProperty setSnapshotId, - SetProperty pickSnapshotId, List propertyChanges, - List schemaChanges) { - // don't allow setting the snapshot and picking a commit at the same time because order is ambiguous and choosing + private static void commitChanges( + Table table, + SetProperty setLocation, + SetProperty setSnapshotId, + SetProperty pickSnapshotId, + List propertyChanges, + List schemaChanges) { + // don't allow setting the snapshot and picking a commit at the same time because order is + // ambiguous and choosing // one order leads to different results - Preconditions.checkArgument(setSnapshotId == null || pickSnapshotId == null, + Preconditions.checkArgument( + setSnapshotId == null || pickSnapshotId == null, "Cannot set the current the current snapshot ID and cherry-pick snapshot changes"); if (setSnapshotId != null) { @@ -491,9 +520,7 @@ private static void commitChanges(Table table, SetProperty setLocation, SetPrope Transaction transaction = table.newTransaction(); if (setLocation != null) { - transaction.updateLocation() - .setLocation(setLocation.value()) - .commit(); + transaction.updateLocation().setLocation(setLocation.value()).commit(); } if (!propertyChanges.isEmpty()) { @@ -513,8 +540,9 @@ private static boolean isPathIdentifier(Identifier ident) { private static void checkNotPathIdentifier(Identifier identifier, String method) { if (identifier instanceof PathIdentifier) { - throw new IllegalArgumentException(String.format("Cannot pass path based identifier to %s method. %s is a path.", - method, identifier)); + throw new IllegalArgumentException( + String.format( + "Cannot pass path based identifier to %s method. %s is a path.", method, identifier)); } } @@ -531,7 +559,8 @@ private Pair load(Identifier ident) { throw e; } - // if the original load didn't work, the identifier may be extended and include a snapshot selector + // if the original load didn't work, the identifier may be extended and include a snapshot + // selector TableIdentifier namespaceAsIdent = buildIdentifier(namespaceToIdentifier(ident.namespace())); Table table; try { @@ -595,10 +624,13 @@ private Pair loadFromPathIdentifier(PathIdentifier ident) { } } - Preconditions.checkArgument(asOfTimestamp == null || snapshotId == null, - "Cannot specify both snapshot-id and as-of-timestamp: %s", ident.location()); + Preconditions.checkArgument( + asOfTimestamp == null || snapshotId == null, + "Cannot specify both snapshot-id and as-of-timestamp: %s", + ident.location()); - Table table = tables.load(parsed.first() + (metadataTableName != null ? "#" + metadataTableName : "")); + Table table = + tables.load(parsed.first() + (metadataTableName != null ? 
"#" + metadataTableName : "")); if (snapshotId != null) { return Pair.of(table, snapshotId); @@ -610,16 +642,16 @@ private Pair loadFromPathIdentifier(PathIdentifier ident) { } private Identifier namespaceToIdentifier(String[] namespace) { - Preconditions.checkArgument(namespace.length > 0, - "Cannot convert empty namespace to identifier"); + Preconditions.checkArgument( + namespace.length > 0, "Cannot convert empty namespace to identifier"); String[] ns = Arrays.copyOf(namespace, namespace.length - 1); String name = namespace[ns.length]; return Identifier.of(ns, name); } private Catalog.TableBuilder newBuilder(Identifier ident, Schema schema) { - return isPathIdentifier(ident) ? - tables.buildTable(((PathIdentifier) ident).location(), schema) : - icebergCatalog.buildTable(buildIdentifier(ident), schema); + return isPathIdentifier(ident) + ? tables.buildTable(((PathIdentifier) ident).location(), schema) + : icebergCatalog.buildTable(buildIdentifier(ident), schema); } } diff --git a/spark/v3.1/spark/src/main/java/org/apache/iceberg/spark/SparkConfParser.java b/spark/v3.1/spark/src/main/java/org/apache/iceberg/spark/SparkConfParser.java index 79051b12625e..33e5ca936800 100644 --- a/spark/v3.1/spark/src/main/java/org/apache/iceberg/spark/SparkConfParser.java +++ b/spark/v3.1/spark/src/main/java/org/apache/iceberg/spark/SparkConfParser.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark; import java.util.Locale; @@ -159,7 +158,8 @@ public ThisT tableProperty(String name) { protected T parse(Function conversion, T defaultValue) { if (optionName != null) { - // use lower case comparison as DataSourceOptions.asMap() in Spark 2 returns a lower case map + // use lower case comparison as DataSourceOptions.asMap() in Spark 2 returns a lower case + // map String optionValue = options.get(optionName.toLowerCase(Locale.ROOT)); if (optionValue != null) { return conversion.apply(optionValue); diff --git a/spark/v3.1/spark/src/main/java/org/apache/iceberg/spark/SparkDataFile.java b/spark/v3.1/spark/src/main/java/org/apache/iceberg/spark/SparkDataFile.java index a6390d39c575..87e831872472 100644 --- a/spark/v3.1/spark/src/main/java/org/apache/iceberg/spark/SparkDataFile.java +++ b/spark/v3.1/spark/src/main/java/org/apache/iceberg/spark/SparkDataFile.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark; import java.nio.ByteBuffer; @@ -62,10 +61,12 @@ public SparkDataFile(Types.StructType type, StructType sparkType) { this.wrappedPartition = new SparkStructLike(type.fieldType("partition").asStructType()); Map positions = Maps.newHashMap(); - type.fields().forEach(field -> { - String fieldName = field.name(); - positions.put(fieldName, fieldPosition(fieldName, sparkType)); - }); + type.fields() + .forEach( + field -> { + String fieldName = field.name(); + positions.put(fieldName, fieldPosition(fieldName, sparkType)); + }); filePathPosition = positions.get("file_path"); fileFormatPosition = positions.get("file_format"); @@ -139,23 +140,29 @@ public Map valueCounts() { @Override public Map nullValueCounts() { - return wrapped.isNullAt(nullValueCountsPosition) ? null : wrapped.getJavaMap(nullValueCountsPosition); + return wrapped.isNullAt(nullValueCountsPosition) + ? null + : wrapped.getJavaMap(nullValueCountsPosition); } @Override public Map nanValueCounts() { - return wrapped.isNullAt(nanValueCountsPosition) ? 
null : wrapped.getJavaMap(nanValueCountsPosition); + return wrapped.isNullAt(nanValueCountsPosition) + ? null + : wrapped.getJavaMap(nanValueCountsPosition); } @Override public Map lowerBounds() { - Map lowerBounds = wrapped.isNullAt(lowerBoundsPosition) ? null : wrapped.getJavaMap(lowerBoundsPosition); + Map lowerBounds = + wrapped.isNullAt(lowerBoundsPosition) ? null : wrapped.getJavaMap(lowerBoundsPosition); return convert(lowerBoundsType, lowerBounds); } @Override public Map upperBounds() { - Map upperBounds = wrapped.isNullAt(upperBoundsPosition) ? null : wrapped.getJavaMap(upperBoundsPosition); + Map upperBounds = + wrapped.isNullAt(upperBoundsPosition) ? null : wrapped.getJavaMap(upperBoundsPosition); return convert(upperBoundsType, upperBounds); } diff --git a/spark/v3.1/spark/src/main/java/org/apache/iceberg/spark/SparkExceptionUtil.java b/spark/v3.1/spark/src/main/java/org/apache/iceberg/spark/SparkExceptionUtil.java index 2eb53baa688e..5c6fe3e0ff96 100644 --- a/spark/v3.1/spark/src/main/java/org/apache/iceberg/spark/SparkExceptionUtil.java +++ b/spark/v3.1/spark/src/main/java/org/apache/iceberg/spark/SparkExceptionUtil.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark; import com.google.errorprone.annotations.FormatMethod; @@ -29,8 +28,7 @@ public class SparkExceptionUtil { - private SparkExceptionUtil() { - } + private SparkExceptionUtil() {} /** * Converts checked exceptions to unchecked exceptions. @@ -41,8 +39,8 @@ private SparkExceptionUtil() { * @return unchecked exception. */ @FormatMethod - public static RuntimeException toUncheckedException(final Throwable cause, final String message, - final Object... args) { + public static RuntimeException toUncheckedException( + final Throwable cause, final String message, final Object... args) { // Parameters are required to be final to help @FormatMethod do static analysis if (cause instanceof RuntimeException) { return (RuntimeException) cause; diff --git a/spark/v3.1/spark/src/main/java/org/apache/iceberg/spark/SparkFilters.java b/spark/v3.1/spark/src/main/java/org/apache/iceberg/spark/SparkFilters.java index 2e23d968bde0..8ba75e754f7f 100644 --- a/spark/v3.1/spark/src/main/java/org/apache/iceberg/spark/SparkFilters.java +++ b/spark/v3.1/spark/src/main/java/org/apache/iceberg/spark/SparkFilters.java @@ -16,9 +16,22 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.spark; +import static org.apache.iceberg.expressions.Expressions.and; +import static org.apache.iceberg.expressions.Expressions.equal; +import static org.apache.iceberg.expressions.Expressions.greaterThan; +import static org.apache.iceberg.expressions.Expressions.greaterThanOrEqual; +import static org.apache.iceberg.expressions.Expressions.in; +import static org.apache.iceberg.expressions.Expressions.isNaN; +import static org.apache.iceberg.expressions.Expressions.isNull; +import static org.apache.iceberg.expressions.Expressions.lessThan; +import static org.apache.iceberg.expressions.Expressions.lessThanOrEqual; +import static org.apache.iceberg.expressions.Expressions.not; +import static org.apache.iceberg.expressions.Expressions.notNull; +import static org.apache.iceberg.expressions.Expressions.or; +import static org.apache.iceberg.expressions.Expressions.startsWith; + import java.sql.Date; import java.sql.Timestamp; import java.time.Instant; @@ -51,48 +64,34 @@ import org.apache.spark.sql.sources.Or; import org.apache.spark.sql.sources.StringStartsWith; -import static org.apache.iceberg.expressions.Expressions.and; -import static org.apache.iceberg.expressions.Expressions.equal; -import static org.apache.iceberg.expressions.Expressions.greaterThan; -import static org.apache.iceberg.expressions.Expressions.greaterThanOrEqual; -import static org.apache.iceberg.expressions.Expressions.in; -import static org.apache.iceberg.expressions.Expressions.isNaN; -import static org.apache.iceberg.expressions.Expressions.isNull; -import static org.apache.iceberg.expressions.Expressions.lessThan; -import static org.apache.iceberg.expressions.Expressions.lessThanOrEqual; -import static org.apache.iceberg.expressions.Expressions.not; -import static org.apache.iceberg.expressions.Expressions.notNull; -import static org.apache.iceberg.expressions.Expressions.or; -import static org.apache.iceberg.expressions.Expressions.startsWith; - public class SparkFilters { - private SparkFilters() { - } - - private static final Map, Operation> FILTERS = ImmutableMap - ., Operation>builder() - .put(AlwaysTrue$.class, Operation.TRUE) - .put(AlwaysFalse$.class, Operation.FALSE) - .put(EqualTo.class, Operation.EQ) - .put(EqualNullSafe.class, Operation.EQ) - .put(GreaterThan.class, Operation.GT) - .put(GreaterThanOrEqual.class, Operation.GT_EQ) - .put(LessThan.class, Operation.LT) - .put(LessThanOrEqual.class, Operation.LT_EQ) - .put(In.class, Operation.IN) - .put(IsNull.class, Operation.IS_NULL) - .put(IsNotNull.class, Operation.NOT_NULL) - .put(And.class, Operation.AND) - .put(Or.class, Operation.OR) - .put(Not.class, Operation.NOT) - .put(StringStartsWith.class, Operation.STARTS_WITH) - .build(); + private SparkFilters() {} + + private static final Map, Operation> FILTERS = + ImmutableMap., Operation>builder() + .put(AlwaysTrue$.class, Operation.TRUE) + .put(AlwaysFalse$.class, Operation.FALSE) + .put(EqualTo.class, Operation.EQ) + .put(EqualNullSafe.class, Operation.EQ) + .put(GreaterThan.class, Operation.GT) + .put(GreaterThanOrEqual.class, Operation.GT_EQ) + .put(LessThan.class, Operation.LT) + .put(LessThanOrEqual.class, Operation.LT_EQ) + .put(In.class, Operation.IN) + .put(IsNull.class, Operation.IS_NULL) + .put(IsNotNull.class, Operation.NOT_NULL) + .put(And.class, Operation.AND) + .put(Or.class, Operation.OR) + .put(Not.class, Operation.NOT) + .put(StringStartsWith.class, Operation.STARTS_WITH) + .build(); public static Expression convert(Filter[] filters) { Expression expression = 
Expressions.alwaysTrue(); for (Filter filter : filters) { Expression converted = convert(filter); - Preconditions.checkArgument(converted != null, "Cannot convert filter to Iceberg: %s", filter); + Preconditions.checkArgument( + converted != null, "Cannot convert filter to Iceberg: %s", filter); expression = Expressions.and(expression, converted); } return expression; @@ -137,8 +136,8 @@ public static Expression convert(Filter filter) { if (filter instanceof EqualTo) { EqualTo eq = (EqualTo) filter; // comparison with null in normal equality is always null. this is probably a mistake. - Preconditions.checkNotNull(eq.value(), - "Expression is always false (eq is not null-safe): %s", filter); + Preconditions.checkNotNull( + eq.value(), "Expression is always false (eq is not null-safe): %s", filter); return handleEqual(eq.attribute(), eq.value()); } else { EqualNullSafe eq = (EqualNullSafe) filter; @@ -151,7 +150,8 @@ public static Expression convert(Filter filter) { case IN: In inFilter = (In) filter; - return in(inFilter.attribute(), + return in( + inFilter.attribute(), Stream.of(inFilter.values()) .filter(Objects::nonNull) .map(SparkFilters::convertLiteral) @@ -165,30 +165,33 @@ public static Expression convert(Filter filter) { } return null; - case AND: { - And andFilter = (And) filter; - Expression left = convert(andFilter.left()); - Expression right = convert(andFilter.right()); - if (left != null && right != null) { - return and(left, right); + case AND: + { + And andFilter = (And) filter; + Expression left = convert(andFilter.left()); + Expression right = convert(andFilter.right()); + if (left != null && right != null) { + return and(left, right); + } + return null; } - return null; - } - - case OR: { - Or orFilter = (Or) filter; - Expression left = convert(orFilter.left()); - Expression right = convert(orFilter.right()); - if (left != null && right != null) { - return or(left, right); + + case OR: + { + Or orFilter = (Or) filter; + Expression left = convert(orFilter.left()); + Expression right = convert(orFilter.right()); + if (left != null && right != null) { + return or(left, right); + } + return null; } - return null; - } - case STARTS_WITH: { - StringStartsWith stringStartsWith = (StringStartsWith) filter; - return startsWith(stringStartsWith.attribute(), stringStartsWith.value()); - } + case STARTS_WITH: + { + StringStartsWith stringStartsWith = (StringStartsWith) filter; + return startsWith(stringStartsWith.attribute(), stringStartsWith.value()); + } } } diff --git a/spark/v3.1/spark/src/main/java/org/apache/iceberg/spark/SparkFixupTimestampType.java b/spark/v3.1/spark/src/main/java/org/apache/iceberg/spark/SparkFixupTimestampType.java index d4dd53d34a97..b35213501aef 100644 --- a/spark/v3.1/spark/src/main/java/org/apache/iceberg/spark/SparkFixupTimestampType.java +++ b/spark/v3.1/spark/src/main/java/org/apache/iceberg/spark/SparkFixupTimestampType.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark; import org.apache.iceberg.Schema; @@ -27,9 +26,10 @@ /** * By default Spark type {@link org.apache.iceberg.types.Types.TimestampType} should be converted to - * {@link Types.TimestampType#withZone()} iceberg type. 
But we also can convert - * {@link org.apache.iceberg.types.Types.TimestampType} to {@link Types.TimestampType#withoutZone()} iceberg type - * by setting {@link SparkSQLProperties#USE_TIMESTAMP_WITHOUT_TIME_ZONE_IN_NEW_TABLES} to 'true' + * {@link Types.TimestampType#withZone()} iceberg type. But we also can convert {@link + * org.apache.iceberg.types.Types.TimestampType} to {@link Types.TimestampType#withoutZone()} + * iceberg type by setting {@link SparkSQLProperties#USE_TIMESTAMP_WITHOUT_TIME_ZONE_IN_NEW_TABLES} + * to 'true' */ class SparkFixupTimestampType extends FixupTypes { @@ -38,8 +38,8 @@ private SparkFixupTimestampType(Schema referenceSchema) { } static Schema fixup(Schema schema) { - return new Schema(TypeUtil.visit(schema, - new SparkFixupTimestampType(schema)).asStructType().fields()); + return new Schema( + TypeUtil.visit(schema, new SparkFixupTimestampType(schema)).asStructType().fields()); } @Override diff --git a/spark/v3.1/spark/src/main/java/org/apache/iceberg/spark/SparkFixupTypes.java b/spark/v3.1/spark/src/main/java/org/apache/iceberg/spark/SparkFixupTypes.java index 5508965af249..6c4ec39b20f1 100644 --- a/spark/v3.1/spark/src/main/java/org/apache/iceberg/spark/SparkFixupTypes.java +++ b/spark/v3.1/spark/src/main/java/org/apache/iceberg/spark/SparkFixupTypes.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark; import org.apache.iceberg.Schema; @@ -25,8 +24,8 @@ import org.apache.iceberg.types.TypeUtil; /** - * Some types, like binary and fixed, are converted to the same Spark type. Conversion back - * can produce only one, which may not be correct. + * Some types, like binary and fixed, are converted to the same Spark type. Conversion back can + * produce only one, which may not be correct. */ class SparkFixupTypes extends FixupTypes { @@ -35,8 +34,8 @@ private SparkFixupTypes(Schema referenceSchema) { } static Schema fixup(Schema schema, Schema referenceSchema) { - return new Schema(TypeUtil.visit(schema, - new SparkFixupTypes(referenceSchema)).asStructType().fields()); + return new Schema( + TypeUtil.visit(schema, new SparkFixupTypes(referenceSchema)).asStructType().fields()); } @Override diff --git a/spark/v3.1/spark/src/main/java/org/apache/iceberg/spark/SparkReadConf.java b/spark/v3.1/spark/src/main/java/org/apache/iceberg/spark/SparkReadConf.java index 462f44c4ae36..937c31e45960 100644 --- a/spark/v3.1/spark/src/main/java/org/apache/iceberg/spark/SparkReadConf.java +++ b/spark/v3.1/spark/src/main/java/org/apache/iceberg/spark/SparkReadConf.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark; import java.util.Map; @@ -31,18 +30,21 @@ /** * A class for common Iceberg configs for Spark reads. - *
- * If a config is set at multiple levels, the following order of precedence is used (top to bottom):
+ *
+ * If a config is set at multiple levels, the following order of precedence is used (top to
+ * bottom):
+ *
 *
- *   1. Read options
- *   2. Session configuration
- *   3. Table metadata
+ *   1. Read options
+ *   2. Session configuration
+ *   3. Table metadata
 *
- * The most specific value is set in read options and takes precedence over all other configs.
- * If no read option is provided, this class checks the session configuration for any overrides.
- * If no applicable value is found in the session configuration, this class uses the table metadata.
- *
- * Note this class is NOT meant to be serialized and sent to executors.
+ *
+ * The most specific value is set in read options and takes precedence over all other configs. If no
+ * read option is provided, this class checks the session configuration for any overrides. If no
+ * applicable value is found in the session configuration, this class uses the table metadata.
+ *
+ *
    Note this class is NOT meant to be serialized and sent to executors. */ public class SparkReadConf { @@ -64,55 +66,47 @@ public boolean localityEnabled() { if (file instanceof HadoopInputFile) { String scheme = ((HadoopInputFile) file).getFileSystem().getScheme(); boolean defaultValue = LOCALITY_WHITELIST_FS.contains(scheme); - return PropertyUtil.propertyAsBoolean( - readOptions, - SparkReadOptions.LOCALITY, - defaultValue); + return PropertyUtil.propertyAsBoolean(readOptions, SparkReadOptions.LOCALITY, defaultValue); } return false; } public Long snapshotId() { - return confParser.longConf() - .option(SparkReadOptions.SNAPSHOT_ID) - .parseOptional(); + return confParser.longConf().option(SparkReadOptions.SNAPSHOT_ID).parseOptional(); } public Long asOfTimestamp() { - return confParser.longConf() - .option(SparkReadOptions.AS_OF_TIMESTAMP) - .parseOptional(); + return confParser.longConf().option(SparkReadOptions.AS_OF_TIMESTAMP).parseOptional(); } public Long startSnapshotId() { - return confParser.longConf() - .option(SparkReadOptions.START_SNAPSHOT_ID) - .parseOptional(); + return confParser.longConf().option(SparkReadOptions.START_SNAPSHOT_ID).parseOptional(); } public Long endSnapshotId() { - return confParser.longConf() - .option(SparkReadOptions.END_SNAPSHOT_ID) - .parseOptional(); + return confParser.longConf().option(SparkReadOptions.END_SNAPSHOT_ID).parseOptional(); } public boolean streamingSkipDeleteSnapshots() { - return confParser.booleanConf() + return confParser + .booleanConf() .option(SparkReadOptions.STREAMING_SKIP_DELETE_SNAPSHOTS) .defaultValue(SparkReadOptions.STREAMING_SKIP_DELETE_SNAPSHOTS_DEFAULT) .parse(); } public boolean streamingSkipOverwriteSnapshots() { - return confParser.booleanConf() + return confParser + .booleanConf() .option(SparkReadOptions.STREAMING_SKIP_OVERWRITE_SNAPSHOTS) .defaultValue(SparkReadOptions.STREAMING_SKIP_OVERWRITE_SNAPSHOTS_DEFAULT) .parse(); } public boolean parquetVectorizationEnabled() { - return confParser.booleanConf() + return confParser + .booleanConf() .option(SparkReadOptions.VECTORIZATION_ENABLED) .sessionConf(SparkSQLProperties.VECTORIZATION_ENABLED) .tableProperty(TableProperties.PARQUET_VECTORIZATION_ENABLED) @@ -121,7 +115,8 @@ public boolean parquetVectorizationEnabled() { } public int parquetBatchSize() { - return confParser.intConf() + return confParser + .intConf() .option(SparkReadOptions.VECTORIZATION_BATCH_SIZE) .tableProperty(TableProperties.PARQUET_BATCH_SIZE) .defaultValue(TableProperties.PARQUET_BATCH_SIZE_DEFAULT) @@ -129,7 +124,8 @@ public int parquetBatchSize() { } public boolean orcVectorizationEnabled() { - return confParser.booleanConf() + return confParser + .booleanConf() .option(SparkReadOptions.VECTORIZATION_ENABLED) .sessionConf(SparkSQLProperties.VECTORIZATION_ENABLED) .tableProperty(TableProperties.ORC_VECTORIZATION_ENABLED) @@ -138,7 +134,8 @@ public boolean orcVectorizationEnabled() { } public int orcBatchSize() { - return confParser.intConf() + return confParser + .intConf() .option(SparkReadOptions.VECTORIZATION_BATCH_SIZE) .tableProperty(TableProperties.ORC_BATCH_SIZE) .defaultValue(TableProperties.ORC_BATCH_SIZE_DEFAULT) @@ -146,7 +143,8 @@ public int orcBatchSize() { } public long splitSize() { - return confParser.longConf() + return confParser + .longConf() .option(SparkReadOptions.SPLIT_SIZE) .tableProperty(TableProperties.SPLIT_SIZE) .defaultValue(TableProperties.SPLIT_SIZE_DEFAULT) @@ -154,7 +152,8 @@ public long splitSize() { } public int splitLookback() { - return 
confParser.intConf() + return confParser + .intConf() .option(SparkReadOptions.LOOKBACK) .tableProperty(TableProperties.SPLIT_LOOKBACK) .defaultValue(TableProperties.SPLIT_LOOKBACK_DEFAULT) @@ -162,7 +161,8 @@ public int splitLookback() { } public long splitOpenFileCost() { - return confParser.longConf() + return confParser + .longConf() .option(SparkReadOptions.FILE_OPEN_COST) .tableProperty(TableProperties.SPLIT_OPEN_FILE_COST) .defaultValue(TableProperties.SPLIT_OPEN_FILE_COST_DEFAULT) @@ -171,18 +171,20 @@ public long splitOpenFileCost() { /** * Enables reading a timestamp without time zone as a timestamp with time zone. - *
- * Generally, this is not safe as a timestamp without time zone is supposed to represent the wall-clock time,
- * i.e. no matter the reader/writer timezone 3PM should always be read as 3PM,
- * but a timestamp with time zone represents instant semantics, i.e. the timestamp
- * is adjusted so that the corresponding time in the reader timezone is displayed.
- *
- * When set to false (default), an exception must be thrown while reading a timestamp without time zone.
+ *
+ * Generally, this is not safe as a timestamp without time zone is supposed to represent the
+ * wall-clock time, i.e. no matter the reader/writer timezone 3PM should always be read as 3PM,
+ * but a timestamp with time zone represents instant semantics, i.e. the timestamp is adjusted so
+ * that the corresponding time in the reader timezone is displayed.
+ *
+ *
    When set to false (default), an exception must be thrown while reading a timestamp without + * time zone. * * @return boolean indicating if reading timestamps without timezone is allowed */ public boolean handleTimestampWithoutZone() { - return confParser.booleanConf() + return confParser + .booleanConf() .option(SparkReadOptions.HANDLE_TIMESTAMP_WITHOUT_TIMEZONE) .sessionConf(SparkSQLProperties.HANDLE_TIMESTAMP_WITHOUT_TIMEZONE) .defaultValue(SparkSQLProperties.HANDLE_TIMESTAMP_WITHOUT_TIMEZONE_DEFAULT) @@ -190,7 +192,8 @@ public boolean handleTimestampWithoutZone() { } public Long streamFromTimestamp() { - return confParser.longConf() + return confParser + .longConf() .option(SparkReadOptions.STREAM_FROM_TIMESTAMP) .defaultValue(Long.MIN_VALUE) .parse(); diff --git a/spark/v3.1/spark/src/main/java/org/apache/iceberg/spark/SparkReadOptions.java b/spark/v3.1/spark/src/main/java/org/apache/iceberg/spark/SparkReadOptions.java index edcc2300344a..d13e80d40004 100644 --- a/spark/v3.1/spark/src/main/java/org/apache/iceberg/spark/SparkReadOptions.java +++ b/spark/v3.1/spark/src/main/java/org/apache/iceberg/spark/SparkReadOptions.java @@ -16,16 +16,12 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark; -/** - * Spark DF read options - */ +/** Spark DF read options */ public class SparkReadOptions { - private SparkReadOptions() { - } + private SparkReadOptions() {} // Snapshot ID of the table snapshot to read public static final String SNAPSHOT_ID = "snapshot-id"; @@ -62,11 +58,13 @@ private SparkReadOptions() { public static final boolean STREAMING_SKIP_DELETE_SNAPSHOTS_DEFAULT = false; // skip snapshots of type overwrite while reading stream out of iceberg table - public static final String STREAMING_SKIP_OVERWRITE_SNAPSHOTS = "streaming-skip-overwrite-snapshots"; + public static final String STREAMING_SKIP_OVERWRITE_SNAPSHOTS = + "streaming-skip-overwrite-snapshots"; public static final boolean STREAMING_SKIP_OVERWRITE_SNAPSHOTS_DEFAULT = false; // Controls whether to allow reading timestamps without zone info - public static final String HANDLE_TIMESTAMP_WITHOUT_TIMEZONE = "handle-timestamp-without-timezone"; + public static final String HANDLE_TIMESTAMP_WITHOUT_TIMEZONE = + "handle-timestamp-without-timezone"; // Controls whether to report locality information to Spark while allocating input partitions public static final String LOCALITY = "locality"; diff --git a/spark/v3.1/spark/src/main/java/org/apache/iceberg/spark/SparkSQLProperties.java b/spark/v3.1/spark/src/main/java/org/apache/iceberg/spark/SparkSQLProperties.java index f2dcc13bece0..fa8bd719f391 100644 --- a/spark/v3.1/spark/src/main/java/org/apache/iceberg/spark/SparkSQLProperties.java +++ b/spark/v3.1/spark/src/main/java/org/apache/iceberg/spark/SparkSQLProperties.java @@ -16,19 +16,18 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.spark; public class SparkSQLProperties { - private SparkSQLProperties() { - } + private SparkSQLProperties() {} // Controls whether vectorized reads are enabled public static final String VECTORIZATION_ENABLED = "spark.sql.iceberg.vectorization.enabled"; // Controls whether reading/writing timestamps without timezones is allowed - public static final String HANDLE_TIMESTAMP_WITHOUT_TIMEZONE = "spark.sql.iceberg.handle-timestamp-without-timezone"; + public static final String HANDLE_TIMESTAMP_WITHOUT_TIMEZONE = + "spark.sql.iceberg.handle-timestamp-without-timezone"; public static final boolean HANDLE_TIMESTAMP_WITHOUT_TIMEZONE_DEFAULT = false; // Controls whether timestamp types for new tables should be stored with timezone info diff --git a/spark/v3.1/spark/src/main/java/org/apache/iceberg/spark/SparkSchemaUtil.java b/spark/v3.1/spark/src/main/java/org/apache/iceberg/spark/SparkSchemaUtil.java index 321050dceb74..653987e654aa 100644 --- a/spark/v3.1/spark/src/main/java/org/apache/iceberg/spark/SparkSchemaUtil.java +++ b/spark/v3.1/spark/src/main/java/org/apache/iceberg/spark/SparkSchemaUtil.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark; import java.util.Collection; @@ -40,17 +39,14 @@ import org.apache.spark.sql.types.DataType; import org.apache.spark.sql.types.StructType; -/** - * Helper methods for working with Spark/Hive metadata. - */ +/** Helper methods for working with Spark/Hive metadata. */ public class SparkSchemaUtil { - private SparkSchemaUtil() { - } + private SparkSchemaUtil() {} /** * Returns a {@link Schema} for the given table with fresh field ids. - *
- * This creates a Schema for an existing table by looking up the table's schema with Spark and
+ *
+ *
    This creates a Schema for an existing table by looking up the table's schema with Spark and * converting that schema. Spark/Hive partition columns are included in the schema. * * @param spark a Spark session @@ -65,8 +61,8 @@ public static Schema schemaForTable(SparkSession spark, String name) { /** * Returns a {@link PartitionSpec} for the given table. - *
- * This creates a partition spec for an existing table by looking up the table's schema and
+ *
+ *
    This creates a partition spec for an existing table by looking up the table's schema and * creating a spec with identity partitions for each partition column. * * @param spark a Spark session @@ -74,14 +70,15 @@ public static Schema schemaForTable(SparkSession spark, String name) { * @return a PartitionSpec for the table * @throws AnalysisException if thrown by the Spark catalog */ - public static PartitionSpec specForTable(SparkSession spark, String name) throws AnalysisException { + public static PartitionSpec specForTable(SparkSession spark, String name) + throws AnalysisException { List parts = Lists.newArrayList(Splitter.on('.').limit(2).split(name)); String db = parts.size() == 1 ? "default" : parts.get(0); String table = parts.get(parts.size() == 1 ? 0 : 1); - PartitionSpec spec = identitySpec( - schemaForTable(spark, name), - spark.catalog().listColumns(db, table).collectAsList()); + PartitionSpec spec = + identitySpec( + schemaForTable(spark, name), spark.catalog().listColumns(db, table).collectAsList()); return spec == null ? PartitionSpec.unpartitioned() : spec; } @@ -109,13 +106,14 @@ public static DataType convert(Type type) { /** * Convert a Spark {@link StructType struct} to a {@link Schema} with new field ids. - *
- * This conversion assigns fresh ids.
- *
- * Some data types are represented as the same Spark type. These are converted to a default type.
- *
- * To convert using a reference schema for field ids and ambiguous types, use
- * {@link #convert(Schema, StructType)}.
+ *
+ * This conversion assigns fresh ids.
+ *
+ * Some data types are represented as the same Spark type. These are converted to a default
+ * type.
+ *
+ *
    To convert using a reference schema for field ids and ambiguous types, use {@link + * #convert(Schema, StructType)}. * * @param sparkType a Spark StructType * @return the equivalent Schema @@ -127,16 +125,18 @@ public static Schema convert(StructType sparkType) { /** * Convert a Spark {@link StructType struct} to a {@link Schema} with new field ids. - *
- * This conversion assigns fresh ids.
- *
- * Some data types are represented as the same Spark type. These are converted to a default type.
- *
- * To convert using a reference schema for field ids and ambiguous types, use
- * {@link #convert(Schema, StructType)}.
+ *
+ * This conversion assigns fresh ids.
+ *
+ * Some data types are represented as the same Spark type. These are converted to a default
+ * type.
+ *
+ *
    To convert using a reference schema for field ids and ambiguous types, use {@link + * #convert(Schema, StructType)}. * * @param sparkType a Spark StructType - * @param useTimestampWithoutZone boolean flag indicates that timestamp should be stored without timezone + * @param useTimestampWithoutZone boolean flag indicates that timestamp should be stored without + * timezone * @return the equivalent Schema * @throws IllegalArgumentException if the type cannot be converted */ @@ -151,13 +151,14 @@ public static Schema convert(StructType sparkType, boolean useTimestampWithoutZo /** * Convert a Spark {@link DataType struct} to a {@link Type} with new field ids. - *
- * This conversion assigns fresh ids.
- *
- * Some data types are represented as the same Spark type. These are converted to a default type.
- *
- * To convert using a reference schema for field ids and ambiguous types, use
- * {@link #convert(Schema, StructType)}.
+ *
+ * This conversion assigns fresh ids.
+ *
+ * Some data types are represented as the same Spark type. These are converted to a default
+ * type.
+ *
+ *
    To convert using a reference schema for field ids and ambiguous types, use {@link + * #convert(Schema, StructType)}. * * @param sparkType a Spark DataType * @return the equivalent Type @@ -169,11 +170,11 @@ public static Type convert(DataType sparkType) { /** * Convert a Spark {@link StructType struct} to a {@link Schema} based on the given schema. - *
- * This conversion does not assign new ids; it uses ids from the base schema.
- *
- * Data types, field order, and nullability will match the spark type. This conversion may return
- * a schema that is not compatible with base schema.
+ *
+ * This conversion does not assign new ids; it uses ids from the base schema.
+ *
+ *
    Data types, field order, and nullability will match the spark type. This conversion may + * return a schema that is not compatible with base schema. * * @param baseSchema a Schema on which conversion is based * @param sparkType a Spark StructType @@ -182,7 +183,8 @@ public static Type convert(DataType sparkType) { */ public static Schema convert(Schema baseSchema, StructType sparkType) { // convert to a type with fresh ids - Types.StructType struct = SparkTypeVisitor.visit(sparkType, new SparkTypeToType(sparkType)).asStructType(); + Types.StructType struct = + SparkTypeVisitor.visit(sparkType, new SparkTypeToType(sparkType)).asStructType(); // reassign ids to match the base schema Schema schema = TypeUtil.reassignIds(new Schema(struct.fields()), baseSchema); // fix types that can't be represented in Spark (UUID and Fixed) @@ -191,8 +193,8 @@ public static Schema convert(Schema baseSchema, StructType sparkType) { /** * Prune columns from a {@link Schema} using a {@link StructType Spark type} projection. - *
- * This requires that the Spark type is a projection of the Schema. Nullability and types must
+ *
+ *
    This requires that the Spark type is a projection of the Schema. Nullability and types must * match. * * @param schema a Schema @@ -201,19 +203,20 @@ public static Schema convert(Schema baseSchema, StructType sparkType) { * @throws IllegalArgumentException if the Spark type does not match the Schema */ public static Schema prune(Schema schema, StructType requestedType) { - return new Schema(TypeUtil.visit(schema, new PruneColumnsWithoutReordering(requestedType, ImmutableSet.of())) - .asNestedType() - .asStructType() - .fields()); + return new Schema( + TypeUtil.visit(schema, new PruneColumnsWithoutReordering(requestedType, ImmutableSet.of())) + .asNestedType() + .asStructType() + .fields()); } /** * Prune columns from a {@link Schema} using a {@link StructType Spark type} projection. - *
- * This requires that the Spark type is a projection of the Schema. Nullability and types must
+ *
+ * This requires that the Spark type is a projection of the Schema. Nullability and types must
 * match.
- *
- * The filters list of {@link Expression} is used to ensure that columns referenced by filters
+ *
+ *
    The filters list of {@link Expression} is used to ensure that columns referenced by filters * are projected. * * @param schema a Schema @@ -224,19 +227,20 @@ public static Schema prune(Schema schema, StructType requestedType) { */ public static Schema prune(Schema schema, StructType requestedType, List filters) { Set filterRefs = Binder.boundReferences(schema.asStruct(), filters, true); - return new Schema(TypeUtil.visit(schema, new PruneColumnsWithoutReordering(requestedType, filterRefs)) - .asNestedType() - .asStructType() - .fields()); + return new Schema( + TypeUtil.visit(schema, new PruneColumnsWithoutReordering(requestedType, filterRefs)) + .asNestedType() + .asStructType() + .fields()); } /** * Prune columns from a {@link Schema} using a {@link StructType Spark type} projection. - *
- * This requires that the Spark type is a projection of the Schema. Nullability and types must
+ *
+ * This requires that the Spark type is a projection of the Schema. Nullability and types must
 * match.
- *
- * The filters list of {@link Expression} is used to ensure that columns referenced by filters
+ *
+ *
    The filters list of {@link Expression} is used to ensure that columns referenced by filters * are projected. * * @param schema a Schema @@ -245,14 +249,16 @@ public static Schema prune(Schema schema, StructType requestedType, List filterRefs = Binder.boundReferences(schema.asStruct(), Collections.singletonList(filter), caseSensitive); - return new Schema(TypeUtil.visit(schema, new PruneColumnsWithoutReordering(requestedType, filterRefs)) - .asNestedType() - .asStructType() - .fields()); + return new Schema( + TypeUtil.visit(schema, new PruneColumnsWithoutReordering(requestedType, filterRefs)) + .asNestedType() + .asStructType() + .fields()); } private static PartitionSpec identitySpec(Schema schema, Collection columns) { @@ -282,7 +288,7 @@ private static PartitionSpec identitySpec(Schema schema, List partitionN /** * Estimate approximate table size based on Spark schema and total records. * - * @param tableSchema Spark schema + * @param tableSchema Spark schema * @param totalRecords total records in the table * @return approximate size based on table schema */ diff --git a/spark/v3.1/spark/src/main/java/org/apache/iceberg/spark/SparkSessionCatalog.java b/spark/v3.1/spark/src/main/java/org/apache/iceberg/spark/SparkSessionCatalog.java index 8245ef1a0cd6..fb1978dc8140 100644 --- a/spark/v3.1/spark/src/main/java/org/apache/iceberg/spark/SparkSessionCatalog.java +++ b/spark/v3.1/spark/src/main/java/org/apache/iceberg/spark/SparkSessionCatalog.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark; import java.util.Map; @@ -44,8 +43,8 @@ * * @param CatalogPlugin class to avoid casting to TableCatalog and SupportsNamespaces. */ -public class SparkSessionCatalog - extends BaseCatalog implements CatalogExtension { +public class SparkSessionCatalog extends BaseCatalog + implements CatalogExtension { private static final String[] DEFAULT_NAMESPACE = new String[] {"default"}; private String catalogName = null; @@ -58,8 +57,9 @@ public class SparkSessionCatalog /** * Build a {@link SparkCatalog} to be used for Iceberg operations. - *
- * The default implementation creates a new SparkCatalog with the session catalog's name and options.
+ *
+ *
    The default implementation creates a new SparkCatalog with the session catalog's name and + * options. * * @param name catalog name * @param options catalog options @@ -87,17 +87,20 @@ public String[][] listNamespaces(String[] namespace) throws NoSuchNamespaceExcep } @Override - public Map loadNamespaceMetadata(String[] namespace) throws NoSuchNamespaceException { + public Map loadNamespaceMetadata(String[] namespace) + throws NoSuchNamespaceException { return getSessionCatalog().loadNamespaceMetadata(namespace); } @Override - public void createNamespace(String[] namespace, Map metadata) throws NamespaceAlreadyExistsException { + public void createNamespace(String[] namespace, Map metadata) + throws NamespaceAlreadyExistsException { getSessionCatalog().createNamespace(namespace, metadata); } @Override - public void alterNamespace(String[] namespace, NamespaceChange... changes) throws NoSuchNamespaceException { + public void alterNamespace(String[] namespace, NamespaceChange... changes) + throws NoSuchNamespaceException { getSessionCatalog().alterNamespace(namespace, changes); } @@ -130,8 +133,8 @@ public void invalidateTable(Identifier ident) { } @Override - public Table createTable(Identifier ident, StructType schema, Transform[] partitions, - Map properties) + public Table createTable( + Identifier ident, StructType schema, Transform[] partitions, Map properties) throws TableAlreadyExistsException, NoSuchNamespaceException { String provider = properties.get("provider"); if (useIceberg(provider)) { @@ -143,8 +146,8 @@ public Table createTable(Identifier ident, StructType schema, Transform[] partit } @Override - public StagedTable stageCreate(Identifier ident, StructType schema, Transform[] partitions, - Map properties) + public StagedTable stageCreate( + Identifier ident, StructType schema, Transform[] partitions, Map properties) throws TableAlreadyExistsException, NoSuchNamespaceException { String provider = properties.get("provider"); TableCatalog catalog; @@ -157,14 +160,15 @@ public StagedTable stageCreate(Identifier ident, StructType schema, Transform[] catalog = getSessionCatalog(); } - // create the table with the session catalog, then wrap it in a staged table that will delete to roll back + // create the table with the session catalog, then wrap it in a staged table that will delete to + // roll back Table table = catalog.createTable(ident, schema, partitions, properties); return new RollbackStagedTable(catalog, ident, table); } @Override - public StagedTable stageReplace(Identifier ident, StructType schema, Transform[] partitions, - Map properties) + public StagedTable stageReplace( + Identifier ident, StructType schema, Transform[] partitions, Map properties) throws NoSuchNamespaceException, NoSuchTableException { String provider = properties.get("provider"); TableCatalog catalog; @@ -183,7 +187,8 @@ public StagedTable stageReplace(Identifier ident, StructType schema, Transform[] } try { - // create the table with the session catalog, then wrap it in a staged table that will delete to roll back + // create the table with the session catalog, then wrap it in a staged table that will delete + // to roll back Table table = catalog.createTable(ident, schema, partitions, properties); return new RollbackStagedTable(catalog, ident, table); @@ -194,8 +199,9 @@ public StagedTable stageReplace(Identifier ident, StructType schema, Transform[] } @Override - public StagedTable stageCreateOrReplace(Identifier ident, StructType schema, Transform[] partitions, - Map properties) throws 
NoSuchNamespaceException { + public StagedTable stageCreateOrReplace( + Identifier ident, StructType schema, Transform[] partitions, Map properties) + throws NoSuchNamespaceException { String provider = properties.get("provider"); TableCatalog catalog; if (useIceberg(provider)) { @@ -211,7 +217,8 @@ public StagedTable stageCreateOrReplace(Identifier ident, StructType schema, Tra catalog.dropTable(ident); try { - // create the table with the session catalog, then wrap it in a staged table that will delete to roll back + // create the table with the session catalog, then wrap it in a staged table that will delete + // to roll back Table sessionCatalogTable = catalog.createTable(ident, schema, partitions, properties); return new RollbackStagedTable(catalog, ident, sessionCatalogTable); @@ -232,21 +239,25 @@ public Table alterTable(Identifier ident, TableChange... changes) throws NoSuchT @Override public boolean dropTable(Identifier ident) { - // no need to check table existence to determine which catalog to use. if a table doesn't exist then both are + // no need to check table existence to determine which catalog to use. if a table doesn't exist + // then both are // required to return false. return icebergCatalog.dropTable(ident) || getSessionCatalog().dropTable(ident); } @Override public boolean purgeTable(Identifier ident) { - // no need to check table existence to determine which catalog to use. if a table doesn't exist then both are + // no need to check table existence to determine which catalog to use. if a table doesn't exist + // then both are // required to return false. return icebergCatalog.purgeTable(ident) || getSessionCatalog().purgeTable(ident); } @Override - public void renameTable(Identifier from, Identifier to) throws NoSuchTableException, TableAlreadyExistsException { - // rename is not supported by HadoopCatalog. to avoid UnsupportedOperationException for session catalog tables, + public void renameTable(Identifier from, Identifier to) + throws NoSuchTableException, TableAlreadyExistsException { + // rename is not supported by HadoopCatalog. to avoid UnsupportedOperationException for session + // catalog tables, // check table existence first to ensure that the table belongs to the Iceberg catalog. if (icebergCatalog.tableExists(from)) { icebergCatalog.renameTable(from, to); @@ -271,7 +282,8 @@ public final void initialize(String name, CaseInsensitiveStringMap options) { @Override @SuppressWarnings("unchecked") public void setDelegateCatalog(CatalogPlugin sparkSessionCatalog) { - if (sparkSessionCatalog instanceof TableCatalog && sparkSessionCatalog instanceof SupportsNamespaces) { + if (sparkSessionCatalog instanceof TableCatalog + && sparkSessionCatalog instanceof SupportsNamespaces) { this.sessionCatalog = (T) sparkSessionCatalog; } else { throw new IllegalArgumentException("Invalid session catalog: " + sparkSessionCatalog); @@ -298,8 +310,10 @@ private boolean useIceberg(String provider) { } private T getSessionCatalog() { - Preconditions.checkNotNull(sessionCatalog, "Delegated SessionCatalog is missing. " + - "Please make sure your are replacing Spark's default catalog, named 'spark_catalog'."); + Preconditions.checkNotNull( + sessionCatalog, + "Delegated SessionCatalog is missing. 
" + + "Please make sure your are replacing Spark's default catalog, named 'spark_catalog'."); return sessionCatalog; } } diff --git a/spark/v3.1/spark/src/main/java/org/apache/iceberg/spark/SparkStructLike.java b/spark/v3.1/spark/src/main/java/org/apache/iceberg/spark/SparkStructLike.java index 30509e3381dc..77cfa0f34c63 100644 --- a/spark/v3.1/spark/src/main/java/org/apache/iceberg/spark/SparkStructLike.java +++ b/spark/v3.1/spark/src/main/java/org/apache/iceberg/spark/SparkStructLike.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark; import org.apache.iceberg.StructLike; diff --git a/spark/v3.1/spark/src/main/java/org/apache/iceberg/spark/SparkTableUtil.java b/spark/v3.1/spark/src/main/java/org/apache/iceberg/spark/SparkTableUtil.java index d3e2e8efe08b..c0c1255d200f 100644 --- a/spark/v3.1/spark/src/main/java/org/apache/iceberg/spark/SparkTableUtil.java +++ b/spark/v3.1/spark/src/main/java/org/apache/iceberg/spark/SparkTableUtil.java @@ -16,9 +16,10 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark; +import static org.apache.spark.sql.functions.col; + import java.io.IOException; import java.io.Serializable; import java.net.URI; @@ -94,8 +95,6 @@ import scala.collection.Seq; import scala.runtime.AbstractPartialFunction; -import static org.apache.spark.sql.functions.col; - /** * Java version of the original SparkTableUtil.scala * https://github.com/apache/iceberg/blob/apache-iceberg-0.8.0-incubating/spark/src/main/scala/org/apache/iceberg/spark/SparkTableUtil.scala @@ -109,20 +108,19 @@ public class SparkTableUtil { private static final PathFilter HIDDEN_PATH_FILTER = p -> !p.getName().startsWith("_") && !p.getName().startsWith("."); - private static final String duplicateFileMessage = "Cannot complete import because data files " + - "to be imported already exist within the target table: %s. " + - "This is disabled by default as Iceberg is not designed for mulitple references to the same file" + - " within the same table. If you are sure, you may set 'check_duplicate_files' to false to force the import."; - + private static final String duplicateFileMessage = + "Cannot complete import because data files " + + "to be imported already exist within the target table: %s. " + + "This is disabled by default as Iceberg is not designed for mulitple references to the same file" + + " within the same table. If you are sure, you may set 'check_duplicate_files' to false to force the import."; - private SparkTableUtil() { - } + private SparkTableUtil() {} /** * Returns a DataFrame with a row for each partition in the table. * - * The DataFrame has 3 columns, partition key (a=1/b=2), partition location, and format - * (avro or parquet). + *

    The DataFrame has 3 columns, partition key (a=1/b=2), partition location, and format (avro + * or parquet). * * @param spark a Spark session * @param table a table name and (optional) database @@ -130,7 +128,9 @@ private SparkTableUtil() { */ public static Dataset partitionDF(SparkSession spark, String table) { List partitions = getPartitions(spark, table); - return spark.createDataFrame(partitions, SparkPartition.class).toDF("partition", "uri", "format"); + return spark + .createDataFrame(partitions, SparkPartition.class) + .toDF("partition", "uri", "format"); } /** @@ -141,9 +141,12 @@ public static Dataset partitionDF(SparkSession spark, String table) { * @param expression The expression whose matching partitions are returned. * @return a DataFrame of the table partitions. */ - public static Dataset partitionDFByFilter(SparkSession spark, String table, String expression) { + public static Dataset partitionDFByFilter( + SparkSession spark, String table, String expression) { List partitions = getPartitionsByFilter(spark, table, expression); - return spark.createDataFrame(partitions, SparkPartition.class).toDF("partition", "uri", "format"); + return spark + .createDataFrame(partitions, SparkPartition.class) + .toDF("partition", "uri", "format"); } /** @@ -158,7 +161,8 @@ public static List getPartitions(SparkSession spark, String tabl TableIdentifier tableIdent = spark.sessionState().sqlParser().parseTableIdentifier(table); return getPartitions(spark, tableIdent, null); } catch (ParseException e) { - throw SparkExceptionUtil.toUncheckedException(e, "Unable to parse table identifier: %s", table); + throw SparkExceptionUtil.toUncheckedException( + e, "Unable to parse table identifier: %s", table); } } @@ -170,30 +174,33 @@ public static List getPartitions(SparkSession spark, String tabl * @param partitionFilter partition filter, or null if no filter * @return all table's partitions */ - public static List getPartitions(SparkSession spark, TableIdentifier tableIdent, - Map partitionFilter) { + public static List getPartitions( + SparkSession spark, TableIdentifier tableIdent, Map partitionFilter) { try { SessionCatalog catalog = spark.sessionState().catalog(); CatalogTable catalogTable = catalog.getTableMetadata(tableIdent); Option> scalaPartitionFilter; if (partitionFilter != null && !partitionFilter.isEmpty()) { - scalaPartitionFilter = Option.apply(JavaConverters.mapAsScalaMapConverter(partitionFilter).asScala() - .toMap(Predef.conforms())); + scalaPartitionFilter = + Option.apply( + JavaConverters.mapAsScalaMapConverter(partitionFilter) + .asScala() + .toMap(Predef.conforms())); } else { scalaPartitionFilter = Option.empty(); } - Seq partitions = catalog.listPartitions(tableIdent, scalaPartitionFilter); - return JavaConverters - .seqAsJavaListConverter(partitions) - .asJava() - .stream() + Seq partitions = + catalog.listPartitions(tableIdent, scalaPartitionFilter); + return JavaConverters.seqAsJavaListConverter(partitions).asJava().stream() .map(catalogPartition -> toSparkPartition(catalogPartition, catalogTable)) .collect(Collectors.toList()); } catch (NoSuchDatabaseException e) { - throw SparkExceptionUtil.toUncheckedException(e, "Unknown table: %s. Database not found in catalog.", tableIdent); + throw SparkExceptionUtil.toUncheckedException( + e, "Unknown table: %s. Database not found in catalog.", tableIdent); } catch (NoSuchTableException e) { - throw SparkExceptionUtil.toUncheckedException(e, "Unknown table: %s. 
Table not found in catalog.", tableIdent); + throw SparkExceptionUtil.toUncheckedException( + e, "Unknown table: %s. Table not found in catalog.", tableIdent); } } @@ -205,19 +212,22 @@ public static List getPartitions(SparkSession spark, TableIdenti * @param predicate a predicate on partition columns * @return matching table's partitions */ - public static List getPartitionsByFilter(SparkSession spark, String table, String predicate) { + public static List getPartitionsByFilter( + SparkSession spark, String table, String predicate) { TableIdentifier tableIdent; try { tableIdent = spark.sessionState().sqlParser().parseTableIdentifier(table); } catch (ParseException e) { - throw SparkExceptionUtil.toUncheckedException(e, "Unable to parse the table identifier: %s", table); + throw SparkExceptionUtil.toUncheckedException( + e, "Unable to parse the table identifier: %s", table); } Expression unresolvedPredicateExpr; try { unresolvedPredicateExpr = spark.sessionState().sqlParser().parseExpression(predicate); } catch (ParseException e) { - throw SparkExceptionUtil.toUncheckedException(e, "Unable to parse the predicate expression: %s", predicate); + throw SparkExceptionUtil.toUncheckedException( + e, "Unable to parse the predicate expression: %s", predicate); } Expression resolvedPredicateExpr = resolveAttrs(spark, table, unresolvedPredicateExpr); @@ -232,8 +242,8 @@ public static List getPartitionsByFilter(SparkSession spark, Str * @param predicateExpr a predicate expression on partition columns * @return matching table's partitions */ - public static List getPartitionsByFilter(SparkSession spark, TableIdentifier tableIdent, - Expression predicateExpr) { + public static List getPartitionsByFilter( + SparkSession spark, TableIdentifier tableIdent, Expression predicateExpr) { try { SessionCatalog catalog = spark.sessionState().catalog(); CatalogTable catalogTable = catalog.getTableMetadata(tableIdent); @@ -244,111 +254,131 @@ public static List getPartitionsByFilter(SparkSession spark, Tab } else { resolvedPredicateExpr = predicateExpr; } - Seq predicates = JavaConverters - .collectionAsScalaIterableConverter(ImmutableList.of(resolvedPredicateExpr)) - .asScala().toSeq(); + Seq predicates = + JavaConverters.collectionAsScalaIterableConverter(ImmutableList.of(resolvedPredicateExpr)) + .asScala() + .toSeq(); - Seq partitions = catalog.listPartitionsByFilter(tableIdent, predicates); + Seq partitions = + catalog.listPartitionsByFilter(tableIdent, predicates); - return JavaConverters - .seqAsJavaListConverter(partitions) - .asJava() - .stream() + return JavaConverters.seqAsJavaListConverter(partitions).asJava().stream() .map(catalogPartition -> toSparkPartition(catalogPartition, catalogTable)) .collect(Collectors.toList()); } catch (NoSuchDatabaseException e) { - throw SparkExceptionUtil.toUncheckedException(e, "Unknown table: %s. Database not found in catalog.", tableIdent); + throw SparkExceptionUtil.toUncheckedException( + e, "Unknown table: %s. Database not found in catalog.", tableIdent); } catch (NoSuchTableException e) { - throw SparkExceptionUtil.toUncheckedException(e, "Unknown table: %s. Table not found in catalog.", tableIdent); + throw SparkExceptionUtil.toUncheckedException( + e, "Unknown table: %s. Table not found in catalog.", tableIdent); } } /** * Returns the data files in a partition by listing the partition location. * - * For Parquet and ORC partitions, this will read metrics from the file footer. For Avro partitions, - * metrics are set to null. + *
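As a usage sketch of the partition helpers above, assuming a partitioned Hive table db.sample that is visible to the session catalog:

    // one row per partition; columns are "partition", "uri", "format"
    Dataset<Row> partitions = SparkTableUtil.partitionDF(spark, "db.sample");
    partitions.show(false);

    // the same view restricted by a partition predicate
    Dataset<Row> filtered =
        SparkTableUtil.partitionDFByFilter(spark, "db.sample", "dt >= '2021-01-01'");

The dt column and the date literal are illustrative only.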

    For Parquet and ORC partitions, this will read metrics from the file footer. For Avro + * partitions, metrics are set to null. * * @param partition a partition * @param conf a serializable Hadoop conf * @param metricsConfig a metrics conf * @return a List of DataFile - * @deprecated use {@link TableMigrationUtil#listPartition(Map, String, String, PartitionSpec, Configuration, - * MetricsConfig, NameMapping)} + * @deprecated use {@link TableMigrationUtil#listPartition(Map, String, String, PartitionSpec, + * Configuration, MetricsConfig, NameMapping)} */ @Deprecated - public static List listPartition(SparkPartition partition, PartitionSpec spec, - SerializableConfiguration conf, MetricsConfig metricsConfig) { + public static List listPartition( + SparkPartition partition, + PartitionSpec spec, + SerializableConfiguration conf, + MetricsConfig metricsConfig) { return listPartition(partition, spec, conf, metricsConfig, null); } /** * Returns the data files in a partition by listing the partition location. * - * For Parquet and ORC partitions, this will read metrics from the file footer. For Avro partitions, - * metrics are set to null. + *
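Because both overloads above are deprecated, new code is expected to call TableMigrationUtil directly with the signature referenced in the deprecation note; a rough sketch, assuming a SparkPartition obtained from getPartitions and an Iceberg Table named table:

    List<DataFile> files =
        TableMigrationUtil.listPartition(
            sparkPartition.getValues(),            // partition key -> value map
            sparkPartition.getUri(),               // partition location
            sparkPartition.getFormat(),            // "parquet", "orc" or "avro"
            table.spec(),
            spark.sessionState().newHadoopConf(),
            MetricsConfig.forTable(table),
            null);                                 // no name mapping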

    For Parquet and ORC partitions, this will read metrics from the file footer. For Avro + * partitions, metrics are set to null. * * @param partition a partition * @param conf a serializable Hadoop conf * @param metricsConfig a metrics conf * @param mapping a name mapping * @return a List of DataFile - * @deprecated use {@link TableMigrationUtil#listPartition(Map, String, String, PartitionSpec, Configuration, - * MetricsConfig, NameMapping)} + * @deprecated use {@link TableMigrationUtil#listPartition(Map, String, String, PartitionSpec, + * Configuration, MetricsConfig, NameMapping)} */ @Deprecated - public static List listPartition(SparkPartition partition, PartitionSpec spec, - SerializableConfiguration conf, MetricsConfig metricsConfig, - NameMapping mapping) { - return TableMigrationUtil.listPartition(partition.values, partition.uri, partition.format, spec, conf.get(), - metricsConfig, mapping); + public static List listPartition( + SparkPartition partition, + PartitionSpec spec, + SerializableConfiguration conf, + MetricsConfig metricsConfig, + NameMapping mapping) { + return TableMigrationUtil.listPartition( + partition.values, + partition.uri, + partition.format, + spec, + conf.get(), + metricsConfig, + mapping); } - - private static SparkPartition toSparkPartition(CatalogTablePartition partition, CatalogTable table) { + private static SparkPartition toSparkPartition( + CatalogTablePartition partition, CatalogTable table) { Option locationUri = partition.storage().locationUri(); Option serde = partition.storage().serde(); Preconditions.checkArgument(locationUri.nonEmpty(), "Partition URI should be defined"); - Preconditions.checkArgument(serde.nonEmpty() || table.provider().nonEmpty(), - "Partition format should be defined"); + Preconditions.checkArgument( + serde.nonEmpty() || table.provider().nonEmpty(), "Partition format should be defined"); String uri = Util.uriToString(locationUri.get()); String format = serde.nonEmpty() ? 
serde.get() : table.provider().get(); - Map partitionSpec = JavaConverters.mapAsJavaMapConverter(partition.spec()).asJava(); + Map partitionSpec = + JavaConverters.mapAsJavaMapConverter(partition.spec()).asJava(); return new SparkPartition(partitionSpec, uri, format); } private static Expression resolveAttrs(SparkSession spark, String table, Expression expr) { Function2 resolver = spark.sessionState().analyzer().resolver(); LogicalPlan plan = spark.table(table).queryExecution().analyzed(); - return expr.transform(new AbstractPartialFunction() { - @Override - public Expression apply(Expression attr) { - UnresolvedAttribute unresolvedAttribute = (UnresolvedAttribute) attr; - Option namedExpressionOption = plan.resolve(unresolvedAttribute.nameParts(), resolver); - if (namedExpressionOption.isDefined()) { - return (Expression) namedExpressionOption.get(); - } else { - throw new IllegalArgumentException( - String.format("Could not resolve %s using columns: %s", attr, plan.output())); - } - } - - @Override - public boolean isDefinedAt(Expression attr) { - return attr instanceof UnresolvedAttribute; - } - }); + return expr.transform( + new AbstractPartialFunction() { + @Override + public Expression apply(Expression attr) { + UnresolvedAttribute unresolvedAttribute = (UnresolvedAttribute) attr; + Option namedExpressionOption = + plan.resolve(unresolvedAttribute.nameParts(), resolver); + if (namedExpressionOption.isDefined()) { + return (Expression) namedExpressionOption.get(); + } else { + throw new IllegalArgumentException( + String.format("Could not resolve %s using columns: %s", attr, plan.output())); + } + } + + @Override + public boolean isDefinedAt(Expression attr) { + return attr instanceof UnresolvedAttribute; + } + }); } - private static Iterator buildManifest(SerializableConfiguration conf, PartitionSpec spec, - String basePath, Iterator> fileTuples) { + private static Iterator buildManifest( + SerializableConfiguration conf, + PartitionSpec spec, + String basePath, + Iterator> fileTuples) { if (fileTuples.hasNext()) { FileIO io = new HadoopFileIO(conf.get()); TaskContext ctx = TaskContext.get(); - String suffix = String.format("stage-%d-task-%d-manifest", ctx.stageId(), ctx.taskAttemptId()); + String suffix = + String.format("stage-%d-task-%d-manifest", ctx.stageId(), ctx.taskAttemptId()); Path location = new Path(basePath, suffix); String outputPath = FileFormat.AVRO.addExtension(location.toString()); OutputFile outputFile = io.newOutputFile(outputPath); @@ -357,7 +387,8 @@ private static Iterator buildManifest(SerializableConfiguration co try (ManifestWriter writerRef = writer) { fileTuples.forEachRemaining(fileTuple -> writerRef.add(fileTuple._2)); } catch (IOException e) { - throw SparkExceptionUtil.toUncheckedException(e, "Unable to close the manifest writer: %s", outputPath); + throw SparkExceptionUtil.toUncheckedException( + e, "Unable to close the manifest writer: %s", outputPath); } ManifestFile manifestFile = writer.toManifestFile(); @@ -370,42 +401,54 @@ private static Iterator buildManifest(SerializableConfiguration co /** * Import files from an existing Spark table to an Iceberg table. * - * The import uses the Spark session to get table metadata. It assumes no - * operation is going on the original and target table and thus is not - * thread-safe. + *

    The import uses the Spark session to get table metadata. It assumes no operation is going on + * the original and target table and thus is not thread-safe. * * @param spark a Spark session * @param sourceTableIdent an identifier of the source Spark table * @param targetTable an Iceberg table where to import the data * @param stagingDir a staging directory to store temporary manifest files - * @param partitionFilter only import partitions whose values match those in the map, can be partially defined + * @param partitionFilter only import partitions whose values match those in the map, can be + * partially defined * @param checkDuplicateFiles if true, throw exception if import results in a duplicate data file */ - public static void importSparkTable(SparkSession spark, TableIdentifier sourceTableIdent, Table targetTable, - String stagingDir, Map partitionFilter, - boolean checkDuplicateFiles) { + public static void importSparkTable( + SparkSession spark, + TableIdentifier sourceTableIdent, + Table targetTable, + String stagingDir, + Map partitionFilter, + boolean checkDuplicateFiles) { SessionCatalog catalog = spark.sessionState().catalog(); - String db = sourceTableIdent.database().nonEmpty() ? - sourceTableIdent.database().get() : - catalog.getCurrentDatabase(); - TableIdentifier sourceTableIdentWithDB = new TableIdentifier(sourceTableIdent.table(), Some.apply(db)); + String db = + sourceTableIdent.database().nonEmpty() + ? sourceTableIdent.database().get() + : catalog.getCurrentDatabase(); + TableIdentifier sourceTableIdentWithDB = + new TableIdentifier(sourceTableIdent.table(), Some.apply(db)); if (!catalog.tableExists(sourceTableIdentWithDB)) { - throw new org.apache.iceberg.exceptions.NoSuchTableException("Table %s does not exist", sourceTableIdentWithDB); + throw new org.apache.iceberg.exceptions.NoSuchTableException( + "Table %s does not exist", sourceTableIdentWithDB); } try { - PartitionSpec spec = SparkSchemaUtil.specForTable(spark, sourceTableIdentWithDB.unquotedString()); + PartitionSpec spec = + SparkSchemaUtil.specForTable(spark, sourceTableIdentWithDB.unquotedString()); if (Objects.equal(spec, PartitionSpec.unpartitioned())) { - importUnpartitionedSparkTable(spark, sourceTableIdentWithDB, targetTable, checkDuplicateFiles); + importUnpartitionedSparkTable( + spark, sourceTableIdentWithDB, targetTable, checkDuplicateFiles); } else { - List sourceTablePartitions = getPartitions(spark, sourceTableIdent, - partitionFilter); - Preconditions.checkArgument(!sourceTablePartitions.isEmpty(), - "Cannot find any partitions in table %s", sourceTableIdent); - importSparkPartitions(spark, sourceTablePartitions, targetTable, spec, stagingDir, checkDuplicateFiles); + List sourceTablePartitions = + getPartitions(spark, sourceTableIdent, partitionFilter); + Preconditions.checkArgument( + !sourceTablePartitions.isEmpty(), + "Cannot find any partitions in table %s", + sourceTableIdent); + importSparkPartitions( + spark, sourceTablePartitions, targetTable, spec, stagingDir, checkDuplicateFiles); } } catch (AnalysisException e) { throw SparkExceptionUtil.toUncheckedException( @@ -416,9 +459,8 @@ public static void importSparkTable(SparkSession spark, TableIdentifier sourceTa /** * Import files from an existing Spark table to an Iceberg table. * - * The import uses the Spark session to get table metadata. It assumes no - * operation is going on the original and target table and thus is not - * thread-safe. + *
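A usage sketch for the import above, assuming a source Hive table db.src, an existing Iceberg table icebergTable with a matching partition spec, and a staging directory for temporary manifests:

    TableIdentifier source = new TableIdentifier("src", Some.apply("db"));
    SparkTableUtil.importSparkTable(spark, source, icebergTable, "/tmp/iceberg-staging");

Since the import is not thread-safe, it should run while neither table is being written to.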

    The import uses the Spark session to get table metadata. It assumes no operation is going on + * the original and target table and thus is not thread-safe. * * @param spark a Spark session * @param sourceTableIdent an identifier of the source Spark table @@ -426,33 +468,49 @@ public static void importSparkTable(SparkSession spark, TableIdentifier sourceTa * @param stagingDir a staging directory to store temporary manifest files * @param checkDuplicateFiles if true, throw exception if import results in a duplicate data file */ - public static void importSparkTable(SparkSession spark, TableIdentifier sourceTableIdent, Table targetTable, - String stagingDir, boolean checkDuplicateFiles) { - importSparkTable(spark, sourceTableIdent, targetTable, stagingDir, Collections.emptyMap(), checkDuplicateFiles); + public static void importSparkTable( + SparkSession spark, + TableIdentifier sourceTableIdent, + Table targetTable, + String stagingDir, + boolean checkDuplicateFiles) { + importSparkTable( + spark, + sourceTableIdent, + targetTable, + stagingDir, + Collections.emptyMap(), + checkDuplicateFiles); } /** * Import files from an existing Spark table to an Iceberg table. * - * The import uses the Spark session to get table metadata. It assumes no - * operation is going on the original and target table and thus is not - * thread-safe. + *

    The import uses the Spark session to get table metadata. It assumes no operation is going on + * the original and target table and thus is not thread-safe. + * * @param spark a Spark session * @param sourceTableIdent an identifier of the source Spark table * @param targetTable an Iceberg table where to import the data * @param stagingDir a staging directory to store temporary manifest files */ - public static void importSparkTable(SparkSession spark, TableIdentifier sourceTableIdent, Table targetTable, - String stagingDir) { - importSparkTable(spark, sourceTableIdent, targetTable, stagingDir, Collections.emptyMap(), false); + public static void importSparkTable( + SparkSession spark, TableIdentifier sourceTableIdent, Table targetTable, String stagingDir) { + importSparkTable( + spark, sourceTableIdent, targetTable, stagingDir, Collections.emptyMap(), false); } - private static void importUnpartitionedSparkTable(SparkSession spark, TableIdentifier sourceTableIdent, - Table targetTable, boolean checkDuplicateFiles) { + private static void importUnpartitionedSparkTable( + SparkSession spark, + TableIdentifier sourceTableIdent, + Table targetTable, + boolean checkDuplicateFiles) { try { CatalogTable sourceTable = spark.sessionState().catalog().getTableMetadata(sourceTableIdent); Option format = - sourceTable.storage().serde().nonEmpty() ? sourceTable.storage().serde() : sourceTable.provider(); + sourceTable.storage().serde().nonEmpty() + ? sourceTable.storage().serde() + : sourceTable.provider(); Preconditions.checkArgument(format.nonEmpty(), "Could not determine table format"); Map partition = Collections.emptyMap(); @@ -460,20 +518,34 @@ private static void importUnpartitionedSparkTable(SparkSession spark, TableIdent Configuration conf = spark.sessionState().newHadoopConf(); MetricsConfig metricsConfig = MetricsConfig.forTable(targetTable); String nameMappingString = targetTable.properties().get(TableProperties.DEFAULT_NAME_MAPPING); - NameMapping nameMapping = nameMappingString != null ? NameMappingParser.fromJson(nameMappingString) : null; - - List files = TableMigrationUtil.listPartition( - partition, Util.uriToString(sourceTable.location()), format.get(), spec, conf, metricsConfig, nameMapping); + NameMapping nameMapping = + nameMappingString != null ? 
NameMappingParser.fromJson(nameMappingString) : null; + + List files = + TableMigrationUtil.listPartition( + partition, + Util.uriToString(sourceTable.location()), + format.get(), + spec, + conf, + metricsConfig, + nameMapping); if (checkDuplicateFiles) { - Dataset importedFiles = spark.createDataset( - Lists.transform(files, f -> f.path().toString()), Encoders.STRING()).toDF("file_path"); - Dataset existingFiles = loadMetadataTable(spark, targetTable, MetadataTableType.ENTRIES); - Column joinCond = existingFiles.col("data_file.file_path").equalTo(importedFiles.col("file_path")); - Dataset duplicates = importedFiles.join(existingFiles, joinCond) - .select("file_path").as(Encoders.STRING()); - Preconditions.checkState(duplicates.isEmpty(), - String.format(duplicateFileMessage, Joiner.on(",").join((String[]) duplicates.take(10)))); + Dataset importedFiles = + spark + .createDataset(Lists.transform(files, f -> f.path().toString()), Encoders.STRING()) + .toDF("file_path"); + Dataset existingFiles = + loadMetadataTable(spark, targetTable, MetadataTableType.ENTRIES); + Column joinCond = + existingFiles.col("data_file.file_path").equalTo(importedFiles.col("file_path")); + Dataset duplicates = + importedFiles.join(existingFiles, joinCond).select("file_path").as(Encoders.STRING()); + Preconditions.checkState( + duplicates.isEmpty(), + String.format( + duplicateFileMessage, Joiner.on(",").join((String[]) duplicates.take(10)))); } AppendFiles append = targetTable.newAppend(); @@ -498,57 +570,74 @@ private static void importUnpartitionedSparkTable(SparkSession spark, TableIdent * @param stagingDir a staging directory to store temporary manifest files * @param checkDuplicateFiles if true, throw exception if import results in a duplicate data file */ - public static void importSparkPartitions(SparkSession spark, List partitions, Table targetTable, - PartitionSpec spec, String stagingDir, boolean checkDuplicateFiles) { + public static void importSparkPartitions( + SparkSession spark, + List partitions, + Table targetTable, + PartitionSpec spec, + String stagingDir, + boolean checkDuplicateFiles) { Configuration conf = spark.sessionState().newHadoopConf(); SerializableConfiguration serializableConf = new SerializableConfiguration(conf); - int parallelism = Math.min(partitions.size(), spark.sessionState().conf().parallelPartitionDiscoveryParallelism()); + int parallelism = + Math.min( + partitions.size(), spark.sessionState().conf().parallelPartitionDiscoveryParallelism()); int numShufflePartitions = spark.sessionState().conf().numShufflePartitions(); MetricsConfig metricsConfig = MetricsConfig.fromProperties(targetTable.properties()); String nameMappingString = targetTable.properties().get(TableProperties.DEFAULT_NAME_MAPPING); - NameMapping nameMapping = nameMappingString != null ? NameMappingParser.fromJson(nameMappingString) : null; + NameMapping nameMapping = + nameMappingString != null ? 
NameMappingParser.fromJson(nameMappingString) : null; JavaSparkContext sparkContext = JavaSparkContext.fromSparkContext(spark.sparkContext()); JavaRDD partitionRDD = sparkContext.parallelize(partitions, parallelism); - Dataset partitionDS = spark.createDataset( - partitionRDD.rdd(), - Encoders.javaSerialization(SparkPartition.class)); + Dataset partitionDS = + spark.createDataset(partitionRDD.rdd(), Encoders.javaSerialization(SparkPartition.class)); - Dataset filesToImport = partitionDS - .flatMap((FlatMapFunction) sparkPartition -> - listPartition(sparkPartition, spec, serializableConf, metricsConfig, nameMapping).iterator(), + Dataset filesToImport = + partitionDS.flatMap( + (FlatMapFunction) + sparkPartition -> + listPartition( + sparkPartition, spec, serializableConf, metricsConfig, nameMapping) + .iterator(), Encoders.javaSerialization(DataFile.class)); if (checkDuplicateFiles) { - Dataset importedFiles = filesToImport - .map((MapFunction) f -> f.path().toString(), Encoders.STRING()) - .toDF("file_path"); + Dataset importedFiles = + filesToImport + .map((MapFunction) f -> f.path().toString(), Encoders.STRING()) + .toDF("file_path"); Dataset existingFiles = loadMetadataTable(spark, targetTable, MetadataTableType.ENTRIES); - Column joinCond = existingFiles.col("data_file.file_path").equalTo(importedFiles.col("file_path")); - Dataset duplicates = importedFiles.join(existingFiles, joinCond) - .select("file_path").as(Encoders.STRING()); - Preconditions.checkState(duplicates.isEmpty(), + Column joinCond = + existingFiles.col("data_file.file_path").equalTo(importedFiles.col("file_path")); + Dataset duplicates = + importedFiles.join(existingFiles, joinCond).select("file_path").as(Encoders.STRING()); + Preconditions.checkState( + duplicates.isEmpty(), String.format(duplicateFileMessage, Joiner.on(",").join((String[]) duplicates.take(10)))); } - List manifests = filesToImport - .repartition(numShufflePartitions) - .map((MapFunction>) file -> - Tuple2.apply(file.path().toString(), file), - Encoders.tuple(Encoders.STRING(), Encoders.javaSerialization(DataFile.class))) - .orderBy(col("_1")) - .mapPartitions( - (MapPartitionsFunction, ManifestFile>) fileTuple -> - buildManifest(serializableConf, spec, stagingDir, fileTuple), - Encoders.javaSerialization(ManifestFile.class)) - .collectAsList(); + List manifests = + filesToImport + .repartition(numShufflePartitions) + .map( + (MapFunction>) + file -> Tuple2.apply(file.path().toString(), file), + Encoders.tuple(Encoders.STRING(), Encoders.javaSerialization(DataFile.class))) + .orderBy(col("_1")) + .mapPartitions( + (MapPartitionsFunction, ManifestFile>) + fileTuple -> buildManifest(serializableConf, spec, stagingDir, fileTuple), + Encoders.javaSerialization(ManifestFile.class)) + .collectAsList(); try { - boolean snapshotIdInheritanceEnabled = PropertyUtil.propertyAsBoolean( - targetTable.properties(), - TableProperties.SNAPSHOT_ID_INHERITANCE_ENABLED, - TableProperties.SNAPSHOT_ID_INHERITANCE_ENABLED_DEFAULT); + boolean snapshotIdInheritanceEnabled = + PropertyUtil.propertyAsBoolean( + targetTable.properties(), + TableProperties.SNAPSHOT_ID_INHERITANCE_ENABLED, + TableProperties.SNAPSHOT_ID_INHERITANCE_ENABLED_DEFAULT); AppendFiles append = targetTable.newAppend(); manifests.forEach(append::appendManifest); @@ -573,13 +662,17 @@ public static void importSparkPartitions(SparkSession spark, List partitions, Table targetTable, - PartitionSpec spec, String stagingDir) { + public static void importSparkPartitions( + SparkSession spark, + List partitions, 
+ Table targetTable, + PartitionSpec spec, + String stagingDir) { importSparkPartitions(spark, partitions, targetTable, spec, stagingDir, false); } - public static List filterPartitions(List partitions, - Map partitionFilter) { + public static List filterPartitions( + List partitions, Map partitionFilter) { if (partitionFilter.isEmpty()) { return partitions; } else { @@ -597,17 +690,25 @@ private static void deleteManifests(FileIO io, List manifests) { } // Attempt to use Spark3 Catalog resolution if available on the path - private static final DynMethods.UnboundMethod LOAD_METADATA_TABLE = DynMethods.builder("loadMetadataTable") - .hiddenImpl("org.apache.iceberg.spark.Spark3Util", SparkSession.class, Table.class, MetadataTableType.class) - .orNoop() - .build(); - - public static Dataset loadCatalogMetadataTable(SparkSession spark, Table table, MetadataTableType type) { - Preconditions.checkArgument(!LOAD_METADATA_TABLE.isNoop(), "Cannot find Spark3Util class but Spark3 is in use"); + private static final DynMethods.UnboundMethod LOAD_METADATA_TABLE = + DynMethods.builder("loadMetadataTable") + .hiddenImpl( + "org.apache.iceberg.spark.Spark3Util", + SparkSession.class, + Table.class, + MetadataTableType.class) + .orNoop() + .build(); + + public static Dataset loadCatalogMetadataTable( + SparkSession spark, Table table, MetadataTableType type) { + Preconditions.checkArgument( + !LOAD_METADATA_TABLE.isNoop(), "Cannot find Spark3Util class but Spark3 is in use"); return LOAD_METADATA_TABLE.asStatic().invoke(spark, table, type); } - public static Dataset loadMetadataTable(SparkSession spark, Table table, MetadataTableType type) { + public static Dataset loadMetadataTable( + SparkSession spark, Table table, MetadataTableType type) { if (spark.version().startsWith("3")) { // construct the metadata table instance directly Dataset catalogMetadataTable = loadCatalogMetadataTable(spark, table, type); @@ -633,14 +734,12 @@ public static Dataset loadMetadataTable(SparkSession spark, Table table, Me // Try loading by name as a Hive table without Catalog return dataFrameReader.load(tableName.replaceFirst("hive\\.", "") + "." + type); } else { - throw new IllegalArgumentException(String.format( - "Cannot find the metadata table for %s of type %s", tableName, type)); + throw new IllegalArgumentException( + String.format("Cannot find the metadata table for %s of type %s", tableName, type)); } } - /** - * Class representing a table partition. - */ + /** Class representing a table partition. */ public static class SparkPartition implements Serializable { private final Map values; private final String uri; @@ -682,9 +781,9 @@ public boolean equals(Object o) { return false; } SparkPartition that = (SparkPartition) o; - return Objects.equal(values, that.values) && - Objects.equal(uri, that.uri) && - Objects.equal(format, that.format); + return Objects.equal(values, that.values) + && Objects.equal(uri, that.uri) + && Objects.equal(format, that.format); } @Override diff --git a/spark/v3.1/spark/src/main/java/org/apache/iceberg/spark/SparkTypeToType.java b/spark/v3.1/spark/src/main/java/org/apache/iceberg/spark/SparkTypeToType.java index f0b8b2a9762b..17499736fbeb 100644 --- a/spark/v3.1/spark/src/main/java/org/apache/iceberg/spark/SparkTypeToType.java +++ b/spark/v3.1/spark/src/main/java/org/apache/iceberg/spark/SparkTypeToType.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
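A sketch that combines the partition helpers with the partition-level import; the predicate, table names and staging path are illustrative:

    List<SparkPartition> selected =
        SparkTableUtil.getPartitionsByFilter(spark, "db.src", "dt = '2021-06-01'");
    SparkTableUtil.importSparkPartitions(
        spark, selected, icebergTable, icebergTable.spec(), "/tmp/iceberg-staging");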
*/ - package org.apache.iceberg.spark; import java.util.List; @@ -70,7 +69,7 @@ public Type struct(StructType struct, List types) { List newFields = Lists.newArrayListWithExpectedSize(fields.length); boolean isRoot = root == struct; for (int i = 0; i < fields.length; i += 1) { - StructField field = fields[i]; + StructField field = fields[i]; Type type = types.get(i); int id; @@ -122,10 +121,9 @@ public Type atomic(DataType atomic) { if (atomic instanceof BooleanType) { return Types.BooleanType.get(); - } else if ( - atomic instanceof IntegerType || - atomic instanceof ShortType || - atomic instanceof ByteType) { + } else if (atomic instanceof IntegerType + || atomic instanceof ShortType + || atomic instanceof ByteType) { return Types.IntegerType.get(); } else if (atomic instanceof LongType) { @@ -137,10 +135,9 @@ public Type atomic(DataType atomic) { } else if (atomic instanceof DoubleType) { return Types.DoubleType.get(); - } else if ( - atomic instanceof StringType || - atomic instanceof CharType || - atomic instanceof VarcharType) { + } else if (atomic instanceof StringType + || atomic instanceof CharType + || atomic instanceof VarcharType) { return Types.StringType.get(); } else if (atomic instanceof DateType) { @@ -151,13 +148,11 @@ public Type atomic(DataType atomic) { } else if (atomic instanceof DecimalType) { return Types.DecimalType.of( - ((DecimalType) atomic).precision(), - ((DecimalType) atomic).scale()); + ((DecimalType) atomic).precision(), ((DecimalType) atomic).scale()); } else if (atomic instanceof BinaryType) { return Types.BinaryType.get(); } - throw new UnsupportedOperationException( - "Not a supported type: " + atomic.catalogString()); + throw new UnsupportedOperationException("Not a supported type: " + atomic.catalogString()); } } diff --git a/spark/v3.1/spark/src/main/java/org/apache/iceberg/spark/SparkTypeVisitor.java b/spark/v3.1/spark/src/main/java/org/apache/iceberg/spark/SparkTypeVisitor.java index 83b31940711e..1ef694263fa4 100644 --- a/spark/v3.1/spark/src/main/java/org/apache/iceberg/spark/SparkTypeVisitor.java +++ b/spark/v3.1/spark/src/main/java/org/apache/iceberg/spark/SparkTypeVisitor.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.spark; import java.util.List; @@ -35,26 +34,22 @@ static T visit(DataType type, SparkTypeVisitor visitor) { List fieldResults = Lists.newArrayListWithExpectedSize(fields.length); for (StructField field : fields) { - fieldResults.add(visitor.field( - field, - visit(field.dataType(), visitor))); + fieldResults.add(visitor.field(field, visit(field.dataType(), visitor))); } return visitor.struct((StructType) type, fieldResults); } else if (type instanceof MapType) { - return visitor.map((MapType) type, + return visitor.map( + (MapType) type, visit(((MapType) type).keyType(), visitor), visit(((MapType) type).valueType(), visitor)); } else if (type instanceof ArrayType) { - return visitor.array( - (ArrayType) type, - visit(((ArrayType) type).elementType(), visitor)); + return visitor.array((ArrayType) type, visit(((ArrayType) type).elementType(), visitor)); } else if (type instanceof UserDefinedType) { - throw new UnsupportedOperationException( - "User-defined types are not supported"); + throw new UnsupportedOperationException("User-defined types are not supported"); } else { return visitor.atomic(type); diff --git a/spark/v3.1/spark/src/main/java/org/apache/iceberg/spark/SparkUtil.java b/spark/v3.1/spark/src/main/java/org/apache/iceberg/spark/SparkUtil.java index 06f74d4fda06..2cdec2b0629c 100644 --- a/spark/v3.1/spark/src/main/java/org/apache/iceberg/spark/SparkUtil.java +++ b/spark/v3.1/spark/src/main/java/org/apache/iceberg/spark/SparkUtil.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark; import java.util.List; @@ -42,26 +41,33 @@ public class SparkUtil { - public static final String TIMESTAMP_WITHOUT_TIMEZONE_ERROR = String.format("Cannot handle timestamp without" + - " timezone fields in Spark. Spark does not natively support this type but if you would like to handle all" + - " timestamps as timestamp with timezone set '%s' to true. This will not change the underlying values stored" + - " but will change their displayed values in Spark. For more information please see" + - " https://docs.databricks.com/spark/latest/dataframes-datasets/dates-timestamps.html#ansi-sql-and" + - "-spark-sql-timestamps", SparkSQLProperties.HANDLE_TIMESTAMP_WITHOUT_TIMEZONE); + public static final String TIMESTAMP_WITHOUT_TIMEZONE_ERROR = + String.format( + "Cannot handle timestamp without" + + " timezone fields in Spark. Spark does not natively support this type but if you would like to handle all" + + " timestamps as timestamp with timezone set '%s' to true. This will not change the underlying values stored" + + " but will change their displayed values in Spark. For more information please see" + + " https://docs.databricks.com/spark/latest/dataframes-datasets/dates-timestamps.html#ansi-sql-and" + + "-spark-sql-timestamps", + SparkSQLProperties.HANDLE_TIMESTAMP_WITHOUT_TIMEZONE); private static final String SPARK_CATALOG_CONF_PREFIX = "spark.sql.catalog"; - // Format string used as the prefix for spark configuration keys to override hadoop configuration values - // for Iceberg tables from a given catalog. These keys can be specified as `spark.sql.catalog.$catalogName.hadoop.*`, - // similar to using `spark.hadoop.*` to override hadoop configurations globally for a given spark session. 
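A sketch of the per-catalog override described above, with the catalog name, warehouse location and endpoint chosen for illustration:

    SparkSession spark =
        SparkSession.builder()
            .config("spark.sql.catalog.my_catalog", "org.apache.iceberg.spark.SparkCatalog")
            .config("spark.sql.catalog.my_catalog.type", "hadoop")
            .config("spark.sql.catalog.my_catalog.warehouse", "hdfs://namenode:8020/warehouse")
            // anything under spark.sql.catalog.my_catalog.hadoop.* lands in this catalog's Hadoop conf
            .config("spark.sql.catalog.my_catalog.hadoop.fs.default.name", "hdfs://namenode:8020")
            .getOrCreate();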
- private static final String SPARK_CATALOG_HADOOP_CONF_OVERRIDE_FMT_STR = SPARK_CATALOG_CONF_PREFIX + ".%s.hadoop."; + // Format string used as the prefix for spark configuration keys to override hadoop configuration + // values + // for Iceberg tables from a given catalog. These keys can be specified as + // `spark.sql.catalog.$catalogName.hadoop.*`, + // similar to using `spark.hadoop.*` to override hadoop configurations globally for a given spark + // session. + private static final String SPARK_CATALOG_HADOOP_CONF_OVERRIDE_FMT_STR = + SPARK_CATALOG_CONF_PREFIX + ".%s.hadoop."; - private SparkUtil() { - } + private SparkUtil() {} public static FileIO serializableFileIO(Table table) { if (table.io() instanceof HadoopConfigurable) { // we need to use Spark's SerializableConfiguration to avoid issues with Kryo serialization - ((HadoopConfigurable) table.io()).serializeConfWith(conf -> new SerializableConfiguration(conf)::value); + ((HadoopConfigurable) table.io()) + .serializeConfWith(conf -> new SerializableConfiguration(conf)::value); } return table.io(); @@ -75,11 +81,12 @@ public static FileIO serializableFileIO(Table table) { */ public static void validatePartitionTransforms(PartitionSpec spec) { if (spec.fields().stream().anyMatch(field -> field.transform() instanceof UnknownTransform)) { - String unsupported = spec.fields().stream() - .map(PartitionField::transform) - .filter(transform -> transform instanceof UnknownTransform) - .map(Transform::toString) - .collect(Collectors.joining(", ")); + String unsupported = + spec.fields().stream() + .map(PartitionField::transform) + .filter(transform -> transform instanceof UnknownTransform) + .map(Transform::toString) + .collect(Collectors.joining(", ")); throw new UnsupportedOperationException( String.format("Cannot write using unsupported transforms: %s", unsupported)); @@ -87,18 +94,20 @@ public static void validatePartitionTransforms(PartitionSpec spec) { } /** - * A modified version of Spark's LookupCatalog.CatalogAndIdentifier.unapply - * Attempts to find the catalog and identifier a multipart identifier represents + * A modified version of Spark's LookupCatalog.CatalogAndIdentifier.unapply Attempts to find the + * catalog and identifier a multipart identifier represents + * * @param nameParts Multipart identifier representing a table * @return The CatalogPlugin and Identifier for the table */ - public static Pair catalogAndIdentifier(List nameParts, - Function catalogProvider, - BiFunction identiferProvider, - C currentCatalog, - String[] currentNamespace) { - Preconditions.checkArgument(!nameParts.isEmpty(), - "Cannot determine catalog and identifier from empty name"); + public static Pair catalogAndIdentifier( + List nameParts, + Function catalogProvider, + BiFunction identiferProvider, + C currentCatalog, + String[] currentNamespace) { + Preconditions.checkArgument( + !nameParts.isEmpty(), "Cannot determine catalog and identifier from empty name"); int lastElementIndex = nameParts.size() - 1; String name = nameParts.get(lastElementIndex); @@ -110,7 +119,7 @@ public static Pair catalogAndIdentifier(List nameParts, C catalog = catalogProvider.apply(nameParts.get(0)); if (catalog == null) { // The first element was not a valid catalog, treat it like part of the namespace - String[] namespace = nameParts.subList(0, lastElementIndex).toArray(new String[0]); + String[] namespace = nameParts.subList(0, lastElementIndex).toArray(new String[0]); return Pair.of(currentCatalog, identiferProvider.apply(namespace, name)); } else { // Assume 
the first element is a valid catalog @@ -122,6 +131,7 @@ public static Pair catalogAndIdentifier(List nameParts, /** * Responsible for checking if the table schema has a timestamp without timezone column + * * @param schema table schema to check if it contains a timestamp without timezone column * @return boolean indicating if the schema passed in has a timestamp field without a timezone */ @@ -131,15 +141,17 @@ public static boolean hasTimestampWithoutZone(Schema schema) { /** * Checks whether timestamp types for new tables should be stored with timezone info. - *

    - * The default value is false and all timestamp fields are stored as {@link Types.TimestampType#withZone()}. - * If enabled, all timestamp fields in new tables will be stored as {@link Types.TimestampType#withoutZone()}. + * + *

    The default value is false and all timestamp fields are stored as {@link + * Types.TimestampType#withZone()}. If enabled, all timestamp fields in new tables will be stored + * as {@link Types.TimestampType#withoutZone()}. * * @param sessionConf a Spark runtime config * @return true if timestamp types for new tables should be stored with timezone info */ public static boolean useTimestampWithoutZoneInNewTables(RuntimeConfig sessionConf) { - String sessionConfValue = sessionConf.get(SparkSQLProperties.USE_TIMESTAMP_WITHOUT_TIME_ZONE_IN_NEW_TABLES, null); + String sessionConfValue = + sessionConf.get(SparkSQLProperties.USE_TIMESTAMP_WITHOUT_TIME_ZONE_IN_NEW_TABLES, null); if (sessionConfValue != null) { return Boolean.parseBoolean(sessionConfValue); } @@ -147,32 +159,40 @@ public static boolean useTimestampWithoutZoneInNewTables(RuntimeConfig sessionCo } /** - * Pulls any Catalog specific overrides for the Hadoop conf from the current SparkSession, which can be - * set via `spark.sql.catalog.$catalogName.hadoop.*` + * Pulls any Catalog specific overrides for the Hadoop conf from the current SparkSession, which + * can be set via `spark.sql.catalog.$catalogName.hadoop.*` * - * Mirrors the override of hadoop configurations for a given spark session using `spark.hadoop.*`. + *
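A sketch of enabling both behaviors for a session, using the SparkSQLProperties constants referenced above; the values are illustrative:

    // create new tables with timestamp columns stored without time zone
    spark.conf().set(SparkSQLProperties.USE_TIMESTAMP_WITHOUT_TIME_ZONE_IN_NEW_TABLES, "true");
    // read and write existing timestamp-without-time-zone columns instead of failing
    spark.conf().set(SparkSQLProperties.HANDLE_TIMESTAMP_WITHOUT_TIMEZONE, "true");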

    Mirrors the override of hadoop configurations for a given spark session using + * `spark.hadoop.*`. * - * The SparkCatalog allows for hadoop configurations to be overridden per catalog, by setting + *

    The SparkCatalog allows for hadoop configurations to be overridden per catalog, by setting * them on the SQLConf, where the following will add the property "fs.default.name" with value - * "hdfs://hanksnamenode:8020" to the catalog's hadoop configuration. - * SparkSession.builder() - * .config(s"spark.sql.catalog.$catalogName.hadoop.fs.default.name", "hdfs://hanksnamenode:8020") - * .getOrCreate() + * "hdfs://hanksnamenode:8020" to the catalog's hadoop configuration. SparkSession.builder() + * .config(s"spark.sql.catalog.$catalogName.hadoop.fs.default.name", "hdfs://hanksnamenode:8020") + * .getOrCreate() + * * @param spark The current Spark session * @param catalogName Name of the catalog to find overrides for. - * @return the Hadoop Configuration that should be used for this catalog, with catalog specific overrides applied. + * @return the Hadoop Configuration that should be used for this catalog, with catalog specific + * overrides applied. */ public static Configuration hadoopConfCatalogOverrides(SparkSession spark, String catalogName) { // Find keys for the catalog intended to be hadoop configurations final String hadoopConfCatalogPrefix = hadoopConfPrefixForCatalog(catalogName); final Configuration conf = spark.sessionState().newHadoopConf(); - spark.sqlContext().conf().settings().forEach((k, v) -> { - // These checks are copied from `spark.sessionState().newHadoopConfWithOptions()`, which we - // avoid using to not have to convert back and forth between scala / java map types. - if (v != null && k != null && k.startsWith(hadoopConfCatalogPrefix)) { - conf.set(k.substring(hadoopConfCatalogPrefix.length()), v); - } - }); + spark + .sqlContext() + .conf() + .settings() + .forEach( + (k, v) -> { + // These checks are copied from `spark.sessionState().newHadoopConfWithOptions()`, + // which we + // avoid using to not have to convert back and forth between scala / java map types. + if (v != null && k != null && k.startsWith(hadoopConfCatalogPrefix)) { + conf.set(k.substring(hadoopConfCatalogPrefix.length()), v); + } + }); return conf; } diff --git a/spark/v3.1/spark/src/main/java/org/apache/iceberg/spark/SparkValueConverter.java b/spark/v3.1/spark/src/main/java/org/apache/iceberg/spark/SparkValueConverter.java index 8a7920351c30..2f1c4d30b391 100644 --- a/spark/v3.1/spark/src/main/java/org/apache/iceberg/spark/SparkValueConverter.java +++ b/spark/v3.1/spark/src/main/java/org/apache/iceberg/spark/SparkValueConverter.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark; import java.nio.ByteBuffer; @@ -34,13 +33,10 @@ import org.apache.spark.sql.Row; import org.apache.spark.sql.catalyst.util.DateTimeUtils; -/** - * A utility class that converts Spark values to Iceberg's internal representation. - */ +/** A utility class that converts Spark values to Iceberg's internal representation. 
*/ public class SparkValueConverter { - private SparkValueConverter() { - } + private SparkValueConverter() {} public static Record convert(Schema schema, Row row) { return convert(schema.asStruct(), row); diff --git a/spark/v3.1/spark/src/main/java/org/apache/iceberg/spark/SparkWriteConf.java b/spark/v3.1/spark/src/main/java/org/apache/iceberg/spark/SparkWriteConf.java index 756f4197b736..08b3fbee7590 100644 --- a/spark/v3.1/spark/src/main/java/org/apache/iceberg/spark/SparkWriteConf.java +++ b/spark/v3.1/spark/src/main/java/org/apache/iceberg/spark/SparkWriteConf.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark; import java.util.Locale; @@ -31,18 +30,21 @@ /** * A class for common Iceberg configs for Spark writes. - *
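For example, converting a Spark Row into an Iceberg record using the table schema, a minimal sketch assuming an Iceberg Table named table and a Row named row:

    Record record = SparkValueConverter.convert(table.schema(), row);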

    - * If a config is set at multiple levels, the following order of precedence is used (top to bottom): + * + *

If a config is set at multiple levels, the following order of precedence is used (top to + * bottom): + *
 * <ol>
- *   <li>Write options</li>
- *   <li>Session configuration</li>
- *   <li>Table metadata</li>
+ *   <li>Write options
+ *   <li>Session configuration
+ *   <li>Table metadata
 * </ol>
+ *
    - * The most specific value is set in write options and takes precedence over all other configs. - * If no write option is provided, this class checks the session configuration for any overrides. - * If no applicable value is found in the session configuration, this class uses the table metadata. - *
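As an illustration of that precedence, a write option overrides the corresponding table property for a single write; the table name and formats are illustrative:

    // the table property write.format.default=parquet would normally apply,
    // but the write option below wins for this write only
    df.write()
        .format("iceberg")
        .option(SparkWriteOptions.WRITE_FORMAT, "orc")
        .mode("append")
        .save("db.sample");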

    - * Note this class is NOT meant to be serialized and sent to executors. + * + * The most specific value is set in write options and takes precedence over all other configs. If + * no write option is provided, this class checks the session configuration for any overrides. If no + * applicable value is found in the session configuration, this class uses the table metadata. + * + *

    Note this class is NOT meant to be serialized and sent to executors. */ public class SparkWriteConf { @@ -57,7 +59,8 @@ public SparkWriteConf(SparkSession spark, Table table, Map write } public boolean checkNullability() { - return confParser.booleanConf() + return confParser + .booleanConf() .option(SparkWriteOptions.CHECK_NULLABILITY) .sessionConf(SparkSQLProperties.CHECK_NULLABILITY) .defaultValue(SparkSQLProperties.CHECK_NULLABILITY_DEFAULT) @@ -65,7 +68,8 @@ public boolean checkNullability() { } public boolean checkOrdering() { - return confParser.booleanConf() + return confParser + .booleanConf() .option(SparkWriteOptions.CHECK_ORDERING) .sessionConf(SparkSQLProperties.CHECK_ORDERING) .defaultValue(SparkSQLProperties.CHECK_ORDERING_DEFAULT) @@ -74,18 +78,20 @@ public boolean checkOrdering() { /** * Enables writing a timestamp with time zone as a timestamp without time zone. - *

    - * Generally, this is not safe as a timestamp without time zone is supposed to represent the wall-clock time, - * i.e. no matter the reader/writer timezone 3PM should always be read as 3PM, - * but a timestamp with time zone represents instant semantics, i.e. the timestamp - * is adjusted so that the corresponding time in the reader timezone is displayed. - *

    - * When set to false (default), an exception must be thrown if the table contains a timestamp without time zone. + * + *

    Generally, this is not safe as a timestamp without time zone is supposed to represent the + * wall-clock time, i.e. no matter the reader/writer timezone 3PM should always be read as 3PM, + * but a timestamp with time zone represents instant semantics, i.e. the timestamp is adjusted so + * that the corresponding time in the reader timezone is displayed. + * + *

    When set to false (default), an exception must be thrown if the table contains a timestamp + * without time zone. * * @return boolean indicating if writing timestamps without timezone is allowed */ public boolean handleTimestampWithoutZone() { - return confParser.booleanConf() + return confParser + .booleanConf() .option(SparkWriteOptions.HANDLE_TIMESTAMP_WITHOUT_TIMEZONE) .sessionConf(SparkSQLProperties.HANDLE_TIMESTAMP_WITHOUT_TIMEZONE) .defaultValue(SparkSQLProperties.HANDLE_TIMESTAMP_WITHOUT_TIMEZONE_DEFAULT) @@ -102,16 +108,19 @@ public String wapId() { } public FileFormat dataFileFormat() { - String valueAsString = confParser.stringConf() - .option(SparkWriteOptions.WRITE_FORMAT) - .tableProperty(TableProperties.DEFAULT_FILE_FORMAT) - .defaultValue(TableProperties.DEFAULT_FILE_FORMAT_DEFAULT) - .parse(); + String valueAsString = + confParser + .stringConf() + .option(SparkWriteOptions.WRITE_FORMAT) + .tableProperty(TableProperties.DEFAULT_FILE_FORMAT) + .defaultValue(TableProperties.DEFAULT_FILE_FORMAT_DEFAULT) + .parse(); return FileFormat.valueOf(valueAsString.toUpperCase(Locale.ENGLISH)); } public long targetDataFileSize() { - return confParser.longConf() + return confParser + .longConf() .option(SparkWriteOptions.TARGET_FILE_SIZE_BYTES) .tableProperty(TableProperties.WRITE_TARGET_FILE_SIZE_BYTES) .defaultValue(TableProperties.WRITE_TARGET_FILE_SIZE_BYTES_DEFAULT) @@ -119,7 +128,8 @@ public long targetDataFileSize() { } public boolean fanoutWriterEnabled() { - return confParser.booleanConf() + return confParser + .booleanConf() .option(SparkWriteOptions.FANOUT_ENABLED) .tableProperty(TableProperties.SPARK_WRITE_PARTITIONED_FANOUT_ENABLED) .defaultValue(TableProperties.SPARK_WRITE_PARTITIONED_FANOUT_ENABLED_DEFAULT) @@ -129,11 +139,13 @@ public boolean fanoutWriterEnabled() { public Map extraSnapshotMetadata() { Map extraSnapshotMetadata = Maps.newHashMap(); - writeOptions.forEach((key, value) -> { - if (key.startsWith(SnapshotSummary.EXTRA_METADATA_PREFIX)) { - extraSnapshotMetadata.put(key.substring(SnapshotSummary.EXTRA_METADATA_PREFIX.length()), value); - } - }); + writeOptions.forEach( + (key, value) -> { + if (key.startsWith(SnapshotSummary.EXTRA_METADATA_PREFIX)) { + extraSnapshotMetadata.put( + key.substring(SnapshotSummary.EXTRA_METADATA_PREFIX.length()), value); + } + }); return extraSnapshotMetadata; } diff --git a/spark/v3.1/spark/src/main/java/org/apache/iceberg/spark/SparkWriteOptions.java b/spark/v3.1/spark/src/main/java/org/apache/iceberg/spark/SparkWriteOptions.java index 38574d364b20..0ba435ae7429 100644 --- a/spark/v3.1/spark/src/main/java/org/apache/iceberg/spark/SparkWriteOptions.java +++ b/spark/v3.1/spark/src/main/java/org/apache/iceberg/spark/SparkWriteOptions.java @@ -16,16 +16,12 @@ * specific language governing permissions and limitations * under the License. 
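A sketch of passing these per-write options through the DataFrame writer; the table name and values are illustrative:

    df.write()
        .format("iceberg")
        // allow writing into timestamp-without-time-zone columns for this write
        .option(SparkWriteOptions.HANDLE_TIMESTAMP_WITHOUT_TIMEZONE, "true")
        // options carrying the snapshot-property prefix are copied into the snapshot summary
        .option(SnapshotSummary.EXTRA_METADATA_PREFIX + "import-job", "nightly-load")
        .mode("append")
        .save("db.sample");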
*/ - package org.apache.iceberg.spark; -/** - * Spark DF write options - */ +/** Spark DF write options */ public class SparkWriteOptions { - private SparkWriteOptions() { - } + private SparkWriteOptions() {} // Fileformat for write operations(default: Table write.format.default ) public static final String WRITE_FORMAT = "write-format"; @@ -52,5 +48,6 @@ private SparkWriteOptions() { public static final String REWRITTEN_FILE_SCAN_TASK_SET_ID = "rewritten-file-scan-task-set-id"; // Controls whether to allow writing timestamps without zone info - public static final String HANDLE_TIMESTAMP_WITHOUT_TIMEZONE = "handle-timestamp-without-timezone"; + public static final String HANDLE_TIMESTAMP_WITHOUT_TIMEZONE = + "handle-timestamp-without-timezone"; } diff --git a/spark/v3.1/spark/src/main/java/org/apache/iceberg/spark/TypeToSparkType.java b/spark/v3.1/spark/src/main/java/org/apache/iceberg/spark/TypeToSparkType.java index 6a8be60eb078..1e4b0f2f4e3d 100644 --- a/spark/v3.1/spark/src/main/java/org/apache/iceberg/spark/TypeToSparkType.java +++ b/spark/v3.1/spark/src/main/java/org/apache/iceberg/spark/TypeToSparkType.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark; import java.util.List; @@ -43,8 +42,7 @@ import org.apache.spark.sql.types.TimestampType$; class TypeToSparkType extends TypeUtil.SchemaVisitor { - TypeToSparkType() { - } + TypeToSparkType() {} @Override public DataType schema(Schema schema, DataType structType) { @@ -59,8 +57,8 @@ public DataType struct(Types.StructType struct, List fieldResults) { for (int i = 0; i < fields.size(); i += 1) { Types.NestedField field = fields.get(i); DataType type = fieldResults.get(i); - StructField sparkField = StructField.apply( - field.name(), type, field.isOptional(), Metadata.empty()); + StructField sparkField = + StructField.apply(field.name(), type, field.isOptional(), Metadata.empty()); if (field.doc() != null) { sparkField = sparkField.withComment(field.doc()); } @@ -101,8 +99,7 @@ public DataType primitive(Type.PrimitiveType primitive) { case DATE: return DateType$.MODULE$; case TIME: - throw new UnsupportedOperationException( - "Spark does not support time fields"); + throw new UnsupportedOperationException("Spark does not support time fields"); case TIMESTAMP: return TimestampType$.MODULE$; case STRING: diff --git a/spark/v3.1/spark/src/main/java/org/apache/iceberg/spark/actions/BaseDeleteOrphanFilesSparkAction.java b/spark/v3.1/spark/src/main/java/org/apache/iceberg/spark/actions/BaseDeleteOrphanFilesSparkAction.java index e316dfb81c11..a79f075ef442 100644 --- a/spark/v3.1/spark/src/main/java/org/apache/iceberg/spark/actions/BaseDeleteOrphanFilesSparkAction.java +++ b/spark/v3.1/spark/src/main/java/org/apache/iceberg/spark/actions/BaseDeleteOrphanFilesSparkAction.java @@ -16,9 +16,11 @@ * specific language governing permissions and limitations * under the License. 
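TypeToSparkType, shown above, is the schema visitor that maps Iceberg types onto Spark SQL types (TIME has no Spark counterpart and is rejected). The class is package-private, so the sketch below assumes the public SparkSchemaUtil.convert(Schema) helper from the same package is the intended entry point; that helper is not part of this diff.

import org.apache.iceberg.Schema;
import org.apache.iceberg.spark.SparkSchemaUtil;
import org.apache.iceberg.types.Types;
import org.apache.spark.sql.types.StructType;

public class SchemaConversionExample {
  public static void main(String[] args) {
    Schema schema =
        new Schema(
            Types.NestedField.required(1, "id", Types.LongType.get()),
            Types.NestedField.optional(2, "ts", Types.TimestampType.withZone()),
            Types.NestedField.optional(3, "payload", Types.StringType.get()));

    // Assumed public helper; internally a TypeUtil.SchemaVisitor like TypeToSparkType
    // walks the schema and maps each Iceberg type to a Spark DataType.
    StructType sparkType = SparkSchemaUtil.convert(schema);
    System.out.println(sparkType.treeString());
  }
}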
*/ - package org.apache.iceberg.spark.actions; +import static org.apache.iceberg.TableProperties.GC_ENABLED; +import static org.apache.iceberg.TableProperties.GC_ENABLED_DEFAULT; + import java.io.File; import java.io.IOException; import java.util.Iterator; @@ -57,35 +59,37 @@ import org.slf4j.Logger; import org.slf4j.LoggerFactory; -import static org.apache.iceberg.TableProperties.GC_ENABLED; -import static org.apache.iceberg.TableProperties.GC_ENABLED_DEFAULT; - /** * An action that removes orphan metadata and data files by listing a given location and comparing * the actual files in that location with data and metadata files referenced by all valid snapshots. * The location must be accessible for listing via the Hadoop {@link FileSystem}. - *

- * By default, this action cleans up the table location returned by {@link Table#location()} and
- * removes unreachable files that are older than 3 days using {@link Table#io()}. The behavior can be modified
- * by passing a custom location to {@link #location} and a custom timestamp to {@link #olderThan(long)}.
- * For example, someone might point this action to the data folder to clean up only orphan data files.
- * In addition, there is a way to configure an alternative delete method via {@link #deleteWith(Consumer)}.
- *

- * Note: It is dangerous to call this action with a short retention interval as it might corrupt
- * the state of the table if another operation is writing at the same time.
+ *
+ *

    By default, this action cleans up the table location returned by {@link Table#location()} and + * removes unreachable files that are older than 3 days using {@link Table#io()}. The behavior can + * be modified by passing a custom location to {@link #location} and a custom timestamp to {@link + * #olderThan(long)}. For example, someone might point this action to the data folder to clean up + * only orphan data files. In addition, there is a way to configure an alternative delete method via + * {@link #deleteWith(Consumer)}. + * + *

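The javadoc above names the hooks this action exposes: location, olderThan(long), deleteWith(Consumer), and an executor for parallel deletes. A sketch of a typical call, assuming the SparkActions entry point from this module (not shown in this section):

import java.util.concurrent.Executors;
import java.util.concurrent.TimeUnit;
import org.apache.iceberg.Table;
import org.apache.iceberg.actions.DeleteOrphanFiles;
import org.apache.iceberg.spark.actions.SparkActions;
import org.apache.spark.sql.SparkSession;

public class RemoveOrphanFilesExample {
  public static void run(SparkSession spark, Table table) {
    DeleteOrphanFiles.Result result =
        SparkActions.get(spark)
            .deleteOrphanFiles(table)
            .location(table.location() + "/data") // optional: only scan the data folder
            .olderThan(System.currentTimeMillis() - TimeUnit.DAYS.toMillis(7)) // default is 3 days
            .executeDeleteWith(Executors.newFixedThreadPool(4)) // delete in parallel
            .execute();

    result.orphanFileLocations().forEach(f -> System.out.println("removed " + f));
  }
}

Keep the retention interval generous; as the note that follows warns, a short interval can remove files that a concurrent writer is still committing.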
    Note: It is dangerous to call this action with a short retention interval as it might + * corrupt the state of the table if another operation is writing at the same time. */ public class BaseDeleteOrphanFilesSparkAction - extends BaseSparkAction implements DeleteOrphanFiles { + extends BaseSparkAction + implements DeleteOrphanFiles { private static final Logger LOG = LoggerFactory.getLogger(BaseDeleteOrphanFilesSparkAction.class); - private static final UserDefinedFunction filenameUDF = functions.udf((String path) -> { - int lastIndex = path.lastIndexOf(File.separator); - if (lastIndex == -1) { - return path; - } else { - return path.substring(lastIndex + 1); - } - }, DataTypes.StringType); + private static final UserDefinedFunction filenameUDF = + functions.udf( + (String path) -> { + int lastIndex = path.lastIndexOf(File.separator); + if (lastIndex == -1) { + return path; + } else { + return path.substring(lastIndex + 1); + } + }, + DataTypes.StringType); private static final ExecutorService DEFAULT_DELETE_EXECUTOR_SERVICE = null; @@ -95,12 +99,13 @@ public class BaseDeleteOrphanFilesSparkAction private String location = null; private long olderThanTimestamp = System.currentTimeMillis() - TimeUnit.DAYS.toMillis(3); - private Consumer deleteFunc = new Consumer() { - @Override - public void accept(String file) { - table.io().deleteFile(file); - } - }; + private Consumer deleteFunc = + new Consumer() { + @Override + public void accept(String file) { + table.io().deleteFile(file); + } + }; private ExecutorService deleteExecutorService = DEFAULT_DELETE_EXECUTOR_SERVICE; @@ -108,7 +113,8 @@ public BaseDeleteOrphanFilesSparkAction(SparkSession spark, Table table) { super(spark); this.hadoopConf = new SerializableConfiguration(spark.sessionState().newHadoopConf()); - this.partitionDiscoveryParallelism = spark.sessionState().conf().parallelPartitionDiscoveryParallelism(); + this.partitionDiscoveryParallelism = + spark.sessionState().conf().parallelPartitionDiscoveryParallelism(); this.table = table; this.location = table.location(); @@ -158,7 +164,8 @@ private String jobDesc() { if (location != null) { options.add("location=" + location); } - return String.format("Removing orphan files (%s) from %s", Joiner.on(',').join(options), table.name()); + return String.format( + "Removing orphan files (%s) from %s", Joiner.on(',').join(options), table.name()); } private DeleteOrphanFiles.Result doExecute() { @@ -172,9 +179,8 @@ private DeleteOrphanFiles.Result doExecute() { Column nameEqual = actualFileName.equalTo(validFileName); Column actualContains = actualFileDF.col("file_path").contains(validFileDF.col("file_path")); Column joinCond = nameEqual.and(actualContains); - List orphanFiles = actualFileDF.join(validFileDF, joinCond, "leftanti") - .as(Encoders.STRING()) - .collectAsList(); + List orphanFiles = + actualFileDF.join(validFileDF, joinCond, "leftanti").as(Encoders.STRING()).collectAsList(); Tasks.foreach(orphanFiles) .noRetry() @@ -205,15 +211,23 @@ private Dataset buildActualFileDF() { JavaRDD subDirRDD = sparkContext().parallelize(subDirs, parallelism); Broadcast conf = sparkContext().broadcast(hadoopConf); - JavaRDD matchingLeafFileRDD = subDirRDD.mapPartitions(listDirsRecursively(conf, olderThanTimestamp)); + JavaRDD matchingLeafFileRDD = + subDirRDD.mapPartitions(listDirsRecursively(conf, olderThanTimestamp)); JavaRDD completeMatchingFileRDD = matchingFileRDD.union(matchingLeafFileRDD); - return spark().createDataset(completeMatchingFileRDD.rdd(), Encoders.STRING()).toDF("file_path"); + 
return spark() + .createDataset(completeMatchingFileRDD.rdd(), Encoders.STRING()) + .toDF("file_path"); } private static void listDirRecursively( - String dir, Predicate predicate, Configuration conf, int maxDepth, - int maxDirectSubDirs, List remainingSubDirs, List matchingFiles) { + String dir, + Predicate predicate, + Configuration conf, + int maxDepth, + int maxDirectSubDirs, + List remainingSubDirs, + List matchingFiles) { // stop listing whenever we reach the max depth if (maxDepth <= 0) { @@ -242,7 +256,14 @@ private static void listDirRecursively( } for (String subDir : subDirs) { - listDirRecursively(subDir, predicate, conf, maxDepth - 1, maxDirectSubDirs, remainingSubDirs, matchingFiles); + listDirRecursively( + subDir, + predicate, + conf, + maxDepth - 1, + maxDirectSubDirs, + remainingSubDirs, + matchingFiles); } } catch (IOException e) { throw new RuntimeIOException(e); @@ -250,8 +271,7 @@ private static void listDirRecursively( } private static FlatMapFunction, String> listDirsRecursively( - Broadcast conf, - long olderThanTimestamp) { + Broadcast conf, long olderThanTimestamp) { return dirs -> { List subDirs = Lists.newArrayList(); @@ -262,12 +282,15 @@ private static FlatMapFunction, String> listDirsRecursively( int maxDepth = 2000; int maxDirectSubDirs = Integer.MAX_VALUE; - dirs.forEachRemaining(dir -> { - listDirRecursively(dir, predicate, conf.value().value(), maxDepth, maxDirectSubDirs, subDirs, files); - }); + dirs.forEachRemaining( + dir -> { + listDirRecursively( + dir, predicate, conf.value().value(), maxDepth, maxDirectSubDirs, subDirs, files); + }); if (!subDirs.isEmpty()) { - throw new RuntimeException("Could not list subdirectories, reached maximum subdirectory depth: " + maxDepth); + throw new RuntimeException( + "Could not list subdirectories, reached maximum subdirectory depth: " + maxDepth); } return files.iterator(); diff --git a/spark/v3.1/spark/src/main/java/org/apache/iceberg/spark/actions/BaseDeleteReachableFilesSparkAction.java b/spark/v3.1/spark/src/main/java/org/apache/iceberg/spark/actions/BaseDeleteReachableFilesSparkAction.java index 6534617d2dec..1431ae5d78ec 100644 --- a/spark/v3.1/spark/src/main/java/org/apache/iceberg/spark/actions/BaseDeleteReachableFilesSparkAction.java +++ b/spark/v3.1/spark/src/main/java/org/apache/iceberg/spark/actions/BaseDeleteReachableFilesSparkAction.java @@ -16,9 +16,11 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.actions; +import static org.apache.iceberg.TableProperties.GC_ENABLED; +import static org.apache.iceberg.TableProperties.GC_ENABLED_DEFAULT; + import java.util.Iterator; import java.util.List; import java.util.concurrent.ExecutorService; @@ -47,17 +49,16 @@ import org.slf4j.Logger; import org.slf4j.LoggerFactory; -import static org.apache.iceberg.TableProperties.GC_ENABLED; -import static org.apache.iceberg.TableProperties.GC_ENABLED_DEFAULT; - /** - * An implementation of {@link DeleteReachableFiles} that uses metadata tables in Spark - * to determine which files should be deleted. + * An implementation of {@link DeleteReachableFiles} that uses metadata tables in Spark to determine + * which files should be deleted. 
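Unlike the orphan-files action, this one takes a metadata.json location rather than a live table and removes everything reachable from it. A hedged sketch, assuming SparkActions exposes deleteReachableFiles(String) and that HadoopFileIO is an acceptable FileIO to supply via the io() hook shown below:

import org.apache.iceberg.actions.DeleteReachableFiles;
import org.apache.iceberg.hadoop.HadoopFileIO;
import org.apache.iceberg.spark.actions.SparkActions;
import org.apache.spark.sql.SparkSession;

public class DeleteReachableFilesExample {
  public static void run(SparkSession spark, String metadataJsonLocation) {
    DeleteReachableFiles.Result result =
        SparkActions.get(spark)
            .deleteReachableFiles(metadataJsonLocation)
            .io(new HadoopFileIO(spark.sessionState().newHadoopConf())) // a FileIO is required
            .execute();

    System.out.println("Deleted data files: " + result.deletedDataFilesCount());
  }
}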
*/ @SuppressWarnings("UnnecessaryAnonymousClass") public class BaseDeleteReachableFilesSparkAction - extends BaseSparkAction implements DeleteReachableFiles { - private static final Logger LOG = LoggerFactory.getLogger(BaseDeleteReachableFilesSparkAction.class); + extends BaseSparkAction + implements DeleteReachableFiles { + private static final Logger LOG = + LoggerFactory.getLogger(BaseDeleteReachableFilesSparkAction.class); private static final String DATA_FILE = "Data File"; private static final String MANIFEST = "Manifest"; @@ -71,12 +72,13 @@ public class BaseDeleteReachableFilesSparkAction private final TableMetadata tableMetadata; - private final Consumer defaultDelete = new Consumer() { - @Override - public void accept(String file) { - io.deleteFile(file); - } - }; + private final Consumer defaultDelete = + new Consumer() { + @Override + public void accept(String file) { + io.deleteFile(file); + } + }; private Consumer removeFunc = defaultDelete; private ExecutorService removeExecutorService = DEFAULT_DELETE_EXECUTOR_SERVICE; @@ -105,7 +107,6 @@ public DeleteReachableFiles io(FileIO fileIO) { public DeleteReachableFiles deleteWith(Consumer deleteFunc) { this.removeFunc = deleteFunc; return this; - } @Override @@ -117,7 +118,8 @@ public DeleteReachableFiles executeDeleteWith(ExecutorService executorService) { @Override public Result execute() { Preconditions.checkArgument(io != null, "File IO cannot be null"); - String msg = String.format("Removing files reachable from %s", tableMetadata.metadataFileLocation()); + String msg = + String.format("Removing files reachable from %s", tableMetadata.metadataFileLocation()); JobGroupInfo info = newJobGroupInfo("REMOVE-FILES", msg); return withJobGroupInfo(info, this::doExecute); } @@ -165,40 +167,45 @@ private BaseDeleteReachableFilesActionResult deleteFiles(Iterator deleted) AtomicLong otherFilesCount = new AtomicLong(0L); Tasks.foreach(deleted) - .retry(3).stopRetryOn(NotFoundException.class).suppressFailureWhenFinished() + .retry(3) + .stopRetryOn(NotFoundException.class) + .suppressFailureWhenFinished() .executeWith(removeExecutorService) - .onFailure((fileInfo, exc) -> { - String file = fileInfo.getString(0); - String type = fileInfo.getString(1); - LOG.warn("Delete failed for {}: {}", type, file, exc); - }) - .run(fileInfo -> { - String file = fileInfo.getString(0); - String type = fileInfo.getString(1); - removeFunc.accept(file); - switch (type) { - case DATA_FILE: - dataFileCount.incrementAndGet(); - LOG.trace("Deleted Data File: {}", file); - break; - case MANIFEST: - manifestCount.incrementAndGet(); - LOG.debug("Deleted Manifest: {}", file); - break; - case MANIFEST_LIST: - manifestListCount.incrementAndGet(); - LOG.debug("Deleted Manifest List: {}", file); - break; - case OTHERS: - otherFilesCount.incrementAndGet(); - LOG.debug("Others: {}", file); - break; - } - }); - - long filesCount = dataFileCount.get() + manifestCount.get() + manifestListCount.get() + otherFilesCount.get(); + .onFailure( + (fileInfo, exc) -> { + String file = fileInfo.getString(0); + String type = fileInfo.getString(1); + LOG.warn("Delete failed for {}: {}", type, file, exc); + }) + .run( + fileInfo -> { + String file = fileInfo.getString(0); + String type = fileInfo.getString(1); + removeFunc.accept(file); + switch (type) { + case DATA_FILE: + dataFileCount.incrementAndGet(); + LOG.trace("Deleted Data File: {}", file); + break; + case MANIFEST: + manifestCount.incrementAndGet(); + LOG.debug("Deleted Manifest: {}", file); + break; + case MANIFEST_LIST: + 
manifestListCount.incrementAndGet(); + LOG.debug("Deleted Manifest List: {}", file); + break; + case OTHERS: + otherFilesCount.incrementAndGet(); + LOG.debug("Others: {}", file); + break; + } + }); + + long filesCount = + dataFileCount.get() + manifestCount.get() + manifestListCount.get() + otherFilesCount.get(); LOG.info("Total files removed: {}", filesCount); - return new BaseDeleteReachableFilesActionResult(dataFileCount.get(), manifestCount.get(), manifestListCount.get(), - otherFilesCount.get()); + return new BaseDeleteReachableFilesActionResult( + dataFileCount.get(), manifestCount.get(), manifestListCount.get(), otherFilesCount.get()); } } diff --git a/spark/v3.1/spark/src/main/java/org/apache/iceberg/spark/actions/BaseExpireSnapshotsSparkAction.java b/spark/v3.1/spark/src/main/java/org/apache/iceberg/spark/actions/BaseExpireSnapshotsSparkAction.java index 88589bca5cab..2e1f0c079eca 100644 --- a/spark/v3.1/spark/src/main/java/org/apache/iceberg/spark/actions/BaseExpireSnapshotsSparkAction.java +++ b/spark/v3.1/spark/src/main/java/org/apache/iceberg/spark/actions/BaseExpireSnapshotsSparkAction.java @@ -16,9 +16,11 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.actions; +import static org.apache.iceberg.TableProperties.GC_ENABLED; +import static org.apache.iceberg.TableProperties.GC_ENABLED_DEFAULT; + import java.util.Iterator; import java.util.List; import java.util.Set; @@ -48,22 +50,20 @@ import org.slf4j.Logger; import org.slf4j.LoggerFactory; -import static org.apache.iceberg.TableProperties.GC_ENABLED; -import static org.apache.iceberg.TableProperties.GC_ENABLED_DEFAULT; - /** - * An action that performs the same operation as {@link org.apache.iceberg.ExpireSnapshots} but uses Spark - * to determine the delta in files between the pre and post-expiration table metadata. All of the same - * restrictions of {@link org.apache.iceberg.ExpireSnapshots} also apply to this action. - *

- * This action first leverages {@link org.apache.iceberg.ExpireSnapshots} to expire snapshots and then
- * uses metadata tables to find files that can be safely deleted. This is done by anti-joining two Datasets
- * that contain all manifest and data files before and after the expiration. The snapshot expiration
- * will be fully committed before any deletes are issued.
- *

- * This operation performs a shuffle so the parallelism can be controlled through 'spark.sql.shuffle.partitions'.
- *

- * Deletes are still performed locally after retrieving the results from the Spark executors.
+ * An action that performs the same operation as {@link org.apache.iceberg.ExpireSnapshots} but uses
+ * Spark to determine the delta in files between the pre and post-expiration table metadata. All of
+ * the same restrictions of {@link org.apache.iceberg.ExpireSnapshots} also apply to this action.
+ *

    This action first leverages {@link org.apache.iceberg.ExpireSnapshots} to expire snapshots and + * then uses metadata tables to find files that can be safely deleted. This is done by anti-joining + * two Datasets that contain all manifest and data files before and after the expiration. The + * snapshot expiration will be fully committed before any deletes are issued. + * + *

    This operation performs a shuffle so the parallelism can be controlled through + * 'spark.sql.shuffle.partitions'. + * + *

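A sketch of driving the expiration described above, assuming the SparkActions entry point; since the before/after comparison is an anti-join, spark.sql.shuffle.partitions bounds its parallelism as the javadoc notes:

import java.util.concurrent.TimeUnit;
import org.apache.iceberg.Table;
import org.apache.iceberg.actions.ExpireSnapshots;
import org.apache.iceberg.spark.actions.SparkActions;
import org.apache.spark.sql.SparkSession;

public class ExpireSnapshotsExample {
  public static void run(SparkSession spark, Table table) {
    // The before/after anti-join shuffles, so this setting bounds its parallelism.
    spark.conf().set("spark.sql.shuffle.partitions", "200");

    ExpireSnapshots.Result result =
        SparkActions.get(spark)
            .expireSnapshots(table)
            .expireOlderThan(System.currentTimeMillis() - TimeUnit.DAYS.toMillis(7))
            .retainLast(10)
            .execute();

    System.out.println("Deleted data files: " + result.deletedDataFilesCount());
  }
}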
    Deletes are still performed locally after retrieving the results from the Spark executors. */ @SuppressWarnings("UnnecessaryAnonymousClass") public class BaseExpireSnapshotsSparkAction @@ -81,12 +81,13 @@ public class BaseExpireSnapshotsSparkAction private final Table table; private final TableOperations ops; - private final Consumer defaultDelete = new Consumer() { - @Override - public void accept(String file) { - ops.io().deleteFile(file); - } - }; + private final Consumer defaultDelete = + new Consumer() { + @Override + public void accept(String file) { + ops.io().deleteFile(file); + } + }; private final Set expiredSnapshotIds = Sets.newHashSet(); private Long expireOlderThanValue = null; @@ -130,8 +131,10 @@ public BaseExpireSnapshotsSparkAction expireOlderThan(long timestampMillis) { @Override public BaseExpireSnapshotsSparkAction retainLast(int numSnapshots) { - Preconditions.checkArgument(1 <= numSnapshots, - "Number of snapshots to retain must be at least 1, cannot be: %s", numSnapshots); + Preconditions.checkArgument( + 1 <= numSnapshots, + "Number of snapshots to retain must be at least 1, cannot be: %s", + numSnapshots); this.retainLastValue = numSnapshots; return this; } @@ -144,10 +147,11 @@ public BaseExpireSnapshotsSparkAction deleteWith(Consumer newDeleteFunc) /** * Expires snapshots and commits the changes to the table, returning a Dataset of files to delete. - *

- * This does not delete data files. To delete data files, run {@link #execute()}.
- *

- * This may be called before or after {@link #execute()} is called to return the expired file list.
+ *

    This does not delete data files. To delete data files, run {@link #execute()}. + * + *

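The paragraphs above describe expire(): it commits the snapshot expiration and returns the list of no-longer-referenced files without deleting anything. A sketch of calling it directly; the cast assumes the SparkActions entry point hands back this concrete class, which this section does not show:

import org.apache.iceberg.Table;
import org.apache.iceberg.actions.ExpireSnapshots;
import org.apache.iceberg.spark.actions.BaseExpireSnapshotsSparkAction;
import org.apache.iceberg.spark.actions.SparkActions;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SparkSession;

public class ExpireWithoutDeletingExample {
  public static Dataset<Row> expiredFiles(SparkSession spark, Table table, long cutoffMillis) {
    ExpireSnapshots action =
        SparkActions.get(spark).expireSnapshots(table).expireOlderThan(cutoffMillis);

    // expire() exists only on the Spark implementation; the cast assumes SparkActions
    // returns this concrete class. The expiration itself is still committed, only the
    // physical deletes are skipped.
    return ((BaseExpireSnapshotsSparkAction) action).expire();
  }
}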
    This may be called before or after {@link #execute()} is called to return the expired file + * list. * * @return a Dataset of files that are no longer referenced by the table */ @@ -157,7 +161,8 @@ public Dataset expire() { Dataset originalFiles = buildValidFileDF(ops.current()); // perform expiration - org.apache.iceberg.ExpireSnapshots expireSnapshots = table.expireSnapshots().cleanExpiredFiles(false); + org.apache.iceberg.ExpireSnapshots expireSnapshots = + table.expireSnapshots().cleanExpiredFiles(false); for (long id : expiredSnapshotIds) { expireSnapshots = expireSnapshots.expireSnapshotId(id); } @@ -202,13 +207,15 @@ private String jobDesc() { if (!expiredSnapshotIds.isEmpty()) { Long first = expiredSnapshotIds.stream().findFirst().get(); if (expiredSnapshotIds.size() > 1) { - options.add(String.format("snapshot_ids: %s (%s more...)", first, expiredSnapshotIds.size() - 1)); + options.add( + String.format("snapshot_ids: %s (%s more...)", first, expiredSnapshotIds.size() - 1)); } else { options.add(String.format("snapshot_id: %s", first)); } } - return String.format("Expiring snapshots (%s) in %s", Joiner.on(',').join(options), table.name()); + return String.format( + "Expiring snapshots (%s) in %s", Joiner.on(',').join(options), table.name()); } private ExpireSnapshots.Result doExecute() { @@ -243,34 +250,41 @@ private BaseExpireSnapshotsActionResult deleteFiles(Iterator expired) { AtomicLong manifestListCount = new AtomicLong(0L); Tasks.foreach(expired) - .retry(3).stopRetryOn(NotFoundException.class).suppressFailureWhenFinished() + .retry(3) + .stopRetryOn(NotFoundException.class) + .suppressFailureWhenFinished() .executeWith(deleteExecutorService) - .onFailure((fileInfo, exc) -> { - String file = fileInfo.getString(0); - String type = fileInfo.getString(1); - LOG.warn("Delete failed for {}: {}", type, file, exc); - }) - .run(fileInfo -> { - String file = fileInfo.getString(0); - String type = fileInfo.getString(1); - deleteFunc.accept(file); - switch (type) { - case DATA_FILE: - dataFileCount.incrementAndGet(); - LOG.trace("Deleted Data File: {}", file); - break; - case MANIFEST: - manifestCount.incrementAndGet(); - LOG.debug("Deleted Manifest: {}", file); - break; - case MANIFEST_LIST: - manifestListCount.incrementAndGet(); - LOG.debug("Deleted Manifest List: {}", file); - break; - } - }); - - LOG.info("Deleted {} total files", dataFileCount.get() + manifestCount.get() + manifestListCount.get()); - return new BaseExpireSnapshotsActionResult(dataFileCount.get(), manifestCount.get(), manifestListCount.get()); + .onFailure( + (fileInfo, exc) -> { + String file = fileInfo.getString(0); + String type = fileInfo.getString(1); + LOG.warn("Delete failed for {}: {}", type, file, exc); + }) + .run( + fileInfo -> { + String file = fileInfo.getString(0); + String type = fileInfo.getString(1); + deleteFunc.accept(file); + switch (type) { + case DATA_FILE: + dataFileCount.incrementAndGet(); + LOG.trace("Deleted Data File: {}", file); + break; + case MANIFEST: + manifestCount.incrementAndGet(); + LOG.debug("Deleted Manifest: {}", file); + break; + case MANIFEST_LIST: + manifestListCount.incrementAndGet(); + LOG.debug("Deleted Manifest List: {}", file); + break; + } + }); + + LOG.info( + "Deleted {} total files", + dataFileCount.get() + manifestCount.get() + manifestListCount.get()); + return new BaseExpireSnapshotsActionResult( + dataFileCount.get(), manifestCount.get(), manifestListCount.get()); } } diff --git 
a/spark/v3.1/spark/src/main/java/org/apache/iceberg/spark/actions/BaseMigrateTableSparkAction.java b/spark/v3.1/spark/src/main/java/org/apache/iceberg/spark/actions/BaseMigrateTableSparkAction.java index ef9f0d3e2583..856b67dbcd75 100644 --- a/spark/v3.1/spark/src/main/java/org/apache/iceberg/spark/actions/BaseMigrateTableSparkAction.java +++ b/spark/v3.1/spark/src/main/java/org/apache/iceberg/spark/actions/BaseMigrateTableSparkAction.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.actions; import java.util.Map; @@ -45,10 +44,9 @@ import scala.collection.JavaConverters; /** - * Takes a Spark table in the source catalog and attempts to transform it into an Iceberg - * table in the same location with the same identifier. Once complete the identifier which - * previously referred to a non-Iceberg table will refer to the newly migrated Iceberg - * table. + * Takes a Spark table in the source catalog and attempts to transform it into an Iceberg table in + * the same location with the same identifier. Once complete the identifier which previously + * referred to a non-Iceberg table will refer to the newly migrated Iceberg table. */ public class BaseMigrateTableSparkAction extends BaseTableCreationSparkAction @@ -61,7 +59,8 @@ public class BaseMigrateTableSparkAction private final Identifier destTableIdent; private final Identifier backupIdent; - public BaseMigrateTableSparkAction(SparkSession spark, CatalogPlugin sourceCatalog, Identifier sourceTableIdent) { + public BaseMigrateTableSparkAction( + SparkSession spark, CatalogPlugin sourceCatalog, Identifier sourceTableIdent) { super(spark, sourceCatalog, sourceTableIdent); this.destCatalog = checkDestinationCatalog(sourceCatalog); this.destTableIdent = sourceTableIdent; @@ -132,7 +131,8 @@ private MigrateTable.Result doExecute() { threw = false; } finally { if (threw) { - LOG.error("Failed to perform the migration, aborting table creation and restoring the original table"); + LOG.error( + "Failed to perform the migration, aborting table creation and restoring the original table"); restoreSourceTable(); @@ -147,8 +147,12 @@ private MigrateTable.Result doExecute() { } Snapshot snapshot = icebergTable.currentSnapshot(); - long migratedDataFilesCount = Long.parseLong(snapshot.summary().get(SnapshotSummary.TOTAL_DATA_FILES_PROP)); - LOG.info("Successfully loaded Iceberg metadata for {} files to {}", migratedDataFilesCount, destTableIdent()); + long migratedDataFilesCount = + Long.parseLong(snapshot.summary().get(SnapshotSummary.TOTAL_DATA_FILES_PROP)); + LOG.info( + "Successfully loaded Iceberg metadata for {} files to {}", + migratedDataFilesCount, + destTableIdent()); return new BaseMigrateTableActionResult(migratedDataFilesCount); } @@ -176,9 +180,11 @@ protected Map destTableProps() { @Override protected TableCatalog checkSourceCatalog(CatalogPlugin catalog) { // currently the import code relies on being able to look up the table in the session catalog - Preconditions.checkArgument(catalog instanceof SparkSessionCatalog, + Preconditions.checkArgument( + catalog instanceof SparkSessionCatalog, "Cannot migrate a table from a non-Iceberg Spark Session Catalog. 
Found %s of class %s as the source catalog.", - catalog.name(), catalog.getClass().getName()); + catalog.name(), + catalog.getClass().getName()); return (TableCatalog) catalog; } @@ -204,11 +210,15 @@ private void restoreSourceTable() { destCatalog().renameTable(backupIdent, sourceTableIdent()); } catch (org.apache.spark.sql.catalyst.analysis.NoSuchTableException e) { - LOG.error("Cannot restore the original table, the backup table {} cannot be found", backupIdent, e); + LOG.error( + "Cannot restore the original table, the backup table {} cannot be found", backupIdent, e); } catch (org.apache.spark.sql.catalyst.analysis.TableAlreadyExistsException e) { - LOG.error("Cannot restore the original table, a table with the original name exists. " + - "Use the backup table {} to restore the original table manually.", backupIdent, e); + LOG.error( + "Cannot restore the original table, a table with the original name exists. " + + "Use the backup table {} to restore the original table manually.", + backupIdent, + e); } } } diff --git a/spark/v3.1/spark/src/main/java/org/apache/iceberg/spark/actions/BaseRewriteDataFilesSpark3Action.java b/spark/v3.1/spark/src/main/java/org/apache/iceberg/spark/actions/BaseRewriteDataFilesSpark3Action.java index b1c08e607de8..a12ada501796 100644 --- a/spark/v3.1/spark/src/main/java/org/apache/iceberg/spark/actions/BaseRewriteDataFilesSpark3Action.java +++ b/spark/v3.1/spark/src/main/java/org/apache/iceberg/spark/actions/BaseRewriteDataFilesSpark3Action.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.actions; import org.apache.iceberg.Table; diff --git a/spark/v3.1/spark/src/main/java/org/apache/iceberg/spark/actions/BaseRewriteDataFilesSparkAction.java b/spark/v3.1/spark/src/main/java/org/apache/iceberg/spark/actions/BaseRewriteDataFilesSparkAction.java index 5c3c349cd835..6791ef433e60 100644 --- a/spark/v3.1/spark/src/main/java/org/apache/iceberg/spark/actions/BaseRewriteDataFilesSparkAction.java +++ b/spark/v3.1/spark/src/main/java/org/apache/iceberg/spark/actions/BaseRewriteDataFilesSparkAction.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.spark.actions; import java.io.IOException; @@ -73,18 +72,19 @@ import org.slf4j.LoggerFactory; abstract class BaseRewriteDataFilesSparkAction - extends BaseSnapshotUpdateSparkAction implements RewriteDataFiles { + extends BaseSnapshotUpdateSparkAction + implements RewriteDataFiles { private static final Logger LOG = LoggerFactory.getLogger(BaseRewriteDataFilesSparkAction.class); - private static final Set VALID_OPTIONS = ImmutableSet.of( - MAX_CONCURRENT_FILE_GROUP_REWRITES, - MAX_FILE_GROUP_SIZE_BYTES, - PARTIAL_PROGRESS_ENABLED, - PARTIAL_PROGRESS_MAX_COMMITS, - TARGET_FILE_SIZE_BYTES, - USE_STARTING_SEQUENCE_NUMBER, - REWRITE_JOB_ORDER - ); + private static final Set VALID_OPTIONS = + ImmutableSet.of( + MAX_CONCURRENT_FILE_GROUP_REWRITES, + MAX_FILE_GROUP_SIZE_BYTES, + PARTIAL_PROGRESS_ENABLED, + PARTIAL_PROGRESS_MAX_COMMITS, + TARGET_FILE_SIZE_BYTES, + USE_STARTING_SEQUENCE_NUMBER, + REWRITE_JOB_ORDER); private final Table table; @@ -105,36 +105,38 @@ protected Table table() { return table; } - /** - * The framework specific {@link BinPackStrategy} - */ + /** The framework specific {@link BinPackStrategy} */ protected abstract BinPackStrategy binPackStrategy(); - /** - * The framework specific {@link SortStrategy} - */ + /** The framework specific {@link SortStrategy} */ protected abstract SortStrategy sortStrategy(); @Override public RewriteDataFiles binPack() { - Preconditions.checkArgument(this.strategy == null, - "Cannot set strategy to binpack, it has already been set", this.strategy); + Preconditions.checkArgument( + this.strategy == null, + "Cannot set strategy to binpack, it has already been set", + this.strategy); this.strategy = binPackStrategy(); return this; } @Override public RewriteDataFiles sort(SortOrder sortOrder) { - Preconditions.checkArgument(this.strategy == null, - "Cannot set strategy to sort, it has already been set to %s", this.strategy); + Preconditions.checkArgument( + this.strategy == null, + "Cannot set strategy to sort, it has already been set to %s", + this.strategy); this.strategy = sortStrategy().sortOrder(sortOrder); return this; } @Override public RewriteDataFiles sort() { - Preconditions.checkArgument(this.strategy == null, - "Cannot set strategy to sort, it has already been set to %s", this.strategy); + Preconditions.checkArgument( + this.strategy == null, + "Cannot set strategy to sort, it has already been set to %s", + this.strategy); this.strategy = sortStrategy(); return this; } @@ -160,7 +162,8 @@ public RewriteDataFiles.Result execute() { validateAndInitOptions(); - Map>> fileGroupsByPartition = planFileGroups(startingSnapshotId); + Map>> fileGroupsByPartition = + planFileGroups(startingSnapshotId); RewriteExecutionContext ctx = new RewriteExecutionContext(fileGroupsByPartition); if (ctx.totalGroupCount() == 0) { @@ -179,43 +182,52 @@ public RewriteDataFiles.Result execute() { } Map>> planFileGroups(long startingSnapshotId) { - CloseableIterable fileScanTasks = table.newScan() - .useSnapshot(startingSnapshotId) - .filter(filter) - .ignoreResiduals() - .planFiles(); + CloseableIterable fileScanTasks = + table + .newScan() + .useSnapshot(startingSnapshotId) + .filter(filter) + .ignoreResiduals() + .planFiles(); try { StructType partitionType = table.spec().partitionType(); StructLikeMap> filesByPartition = StructLikeMap.create(partitionType); StructLike emptyStruct = GenericRecord.create(partitionType); - fileScanTasks.forEach(task -> { - // If a task uses an incompatible partition spec the data inside could 
contain values which - // belong to multiple partitions in the current spec. Treating all such files as un-partitioned and - // grouping them together helps to minimize new files made. - StructLike taskPartition = task.file().specId() == table.spec().specId() ? - task.file().partition() : emptyStruct; - - List files = filesByPartition.get(taskPartition); - if (files == null) { - files = Lists.newArrayList(); - } - - files.add(task); - filesByPartition.put(taskPartition, files); - }); - - StructLikeMap>> fileGroupsByPartition = StructLikeMap.create(partitionType); - - filesByPartition.forEach((partition, tasks) -> { - Iterable filtered = strategy.selectFilesToRewrite(tasks); - Iterable> groupedTasks = strategy.planFileGroups(filtered); - List> fileGroups = ImmutableList.copyOf(groupedTasks); - if (fileGroups.size() > 0) { - fileGroupsByPartition.put(partition, fileGroups); - } - }); + fileScanTasks.forEach( + task -> { + // If a task uses an incompatible partition spec the data inside could contain values + // which + // belong to multiple partitions in the current spec. Treating all such files as + // un-partitioned and + // grouping them together helps to minimize new files made. + StructLike taskPartition = + task.file().specId() == table.spec().specId() + ? task.file().partition() + : emptyStruct; + + List files = filesByPartition.get(taskPartition); + if (files == null) { + files = Lists.newArrayList(); + } + + files.add(task); + filesByPartition.put(taskPartition, files); + }); + + StructLikeMap>> fileGroupsByPartition = + StructLikeMap.create(partitionType); + + filesByPartition.forEach( + (partition, tasks) -> { + Iterable filtered = strategy.selectFilesToRewrite(tasks); + Iterable> groupedTasks = strategy.planFileGroups(filtered); + List> fileGroups = ImmutableList.copyOf(groupedTasks); + if (fileGroups.size() > 0) { + fileGroupsByPartition.put(partition, fileGroups); + } + }); return fileGroupsByPartition; } finally { @@ -230,9 +242,10 @@ Map>> planFileGroups(long startingSnapshotId @VisibleForTesting RewriteFileGroup rewriteFiles(RewriteExecutionContext ctx, RewriteFileGroup fileGroup) { String desc = jobDesc(fileGroup, ctx); - Set addedFiles = withJobGroupInfo( - newJobGroupInfo("REWRITE-DATA-FILES", desc), - () -> strategy.rewriteFiles(fileGroup.fileScans())); + Set addedFiles = + withJobGroupInfo( + newJobGroupInfo("REWRITE-DATA-FILES", desc), + () -> strategy.rewriteFiles(fileGroup.fileScans())); fileGroup.setOutputFiles(addedFiles); LOG.info("Rewrite Files Ready to be Committed - {}", desc); @@ -241,11 +254,10 @@ RewriteFileGroup rewriteFiles(RewriteExecutionContext ctx, RewriteFileGroup file private ExecutorService rewriteService() { return MoreExecutors.getExitingExecutorService( - (ThreadPoolExecutor) Executors.newFixedThreadPool( - maxConcurrentFileGroupRewrites, - new ThreadFactoryBuilder() - .setNameFormat("Rewrite-Service-%d") - .build())); + (ThreadPoolExecutor) + Executors.newFixedThreadPool( + maxConcurrentFileGroupRewrites, + new ThreadFactoryBuilder().setNameFormat("Rewrite-Service-%d").build())); } @VisibleForTesting @@ -253,31 +265,42 @@ RewriteDataFilesCommitManager commitManager(long startingSnapshotId) { return new RewriteDataFilesCommitManager(table, startingSnapshotId, useStartingSequenceNumber); } - private Result doExecute(RewriteExecutionContext ctx, Stream groupStream, - RewriteDataFilesCommitManager commitManager) { + private Result doExecute( + RewriteExecutionContext ctx, + Stream groupStream, + RewriteDataFilesCommitManager commitManager) { 
ExecutorService rewriteService = rewriteService(); ConcurrentLinkedQueue rewrittenGroups = Queues.newConcurrentLinkedQueue(); - Tasks.Builder rewriteTaskBuilder = Tasks.foreach(groupStream) - .executeWith(rewriteService) - .stopOnFailure() - .noRetry() - .onFailure((fileGroup, exception) -> { - LOG.warn("Failure during rewrite process for group {}", fileGroup.info(), exception); - }); + Tasks.Builder rewriteTaskBuilder = + Tasks.foreach(groupStream) + .executeWith(rewriteService) + .stopOnFailure() + .noRetry() + .onFailure( + (fileGroup, exception) -> { + LOG.warn( + "Failure during rewrite process for group {}", fileGroup.info(), exception); + }); try { - rewriteTaskBuilder.run(fileGroup -> { - rewrittenGroups.add(rewriteFiles(ctx, fileGroup)); - }); + rewriteTaskBuilder.run( + fileGroup -> { + rewrittenGroups.add(rewriteFiles(ctx, fileGroup)); + }); } catch (Exception e) { // At least one rewrite group failed, clean up all completed rewrites - LOG.error("Cannot complete rewrite, {} is not enabled and one of the file set groups failed to " + - "be rewritten. This error occurred during the writing of new files, not during the commit process. This " + - "indicates something is wrong that doesn't involve conflicts with other Iceberg operations. Enabling " + - "{} may help in this case but the root cause should be investigated. Cleaning up {} groups which finished " + - "being written.", PARTIAL_PROGRESS_ENABLED, PARTIAL_PROGRESS_ENABLED, rewrittenGroups.size(), e); + LOG.error( + "Cannot complete rewrite, {} is not enabled and one of the file set groups failed to " + + "be rewritten. This error occurred during the writing of new files, not during the commit process. This " + + "indicates something is wrong that doesn't involve conflicts with other Iceberg operations. Enabling " + + "{} may help in this case but the root cause should be investigated. Cleaning up {} groups which finished " + + "being written.", + PARTIAL_PROGRESS_ENABLED, + PARTIAL_PROGRESS_ENABLED, + rewrittenGroups.size(), + e); Tasks.foreach(rewrittenGroups) .suppressFailureWhenFinished() @@ -290,30 +313,33 @@ private Result doExecute(RewriteExecutionContext ctx, Stream g try { commitManager.commitOrClean(Sets.newHashSet(rewrittenGroups)); } catch (ValidationException | CommitFailedException e) { - String errorMessage = String.format( - "Cannot commit rewrite because of a ValidationException or CommitFailedException. This usually means that " + - "this rewrite has conflicted with another concurrent Iceberg operation. To reduce the likelihood of " + - "conflicts, set %s which will break up the rewrite into multiple smaller commits controlled by %s. " + - "Separate smaller rewrite commits can succeed independently while any commits that conflict with " + - "another Iceberg operation will be ignored. This mode will create additional snapshots in the table " + - "history, one for each commit.", - PARTIAL_PROGRESS_ENABLED, PARTIAL_PROGRESS_MAX_COMMITS); + String errorMessage = + String.format( + "Cannot commit rewrite because of a ValidationException or CommitFailedException. This usually means that " + + "this rewrite has conflicted with another concurrent Iceberg operation. To reduce the likelihood of " + + "conflicts, set %s which will break up the rewrite into multiple smaller commits controlled by %s. " + + "Separate smaller rewrite commits can succeed independently while any commits that conflict with " + + "another Iceberg operation will be ignored. 
This mode will create additional snapshots in the table " + + "history, one for each commit.", + PARTIAL_PROGRESS_ENABLED, PARTIAL_PROGRESS_MAX_COMMITS); throw new RuntimeException(errorMessage, e); } - List rewriteResults = rewrittenGroups.stream() - .map(RewriteFileGroup::asResult) - .collect(Collectors.toList()); + List rewriteResults = + rewrittenGroups.stream().map(RewriteFileGroup::asResult).collect(Collectors.toList()); return new BaseRewriteDataFilesResult(rewriteResults); } - private Result doExecuteWithPartialProgress(RewriteExecutionContext ctx, Stream groupStream, - RewriteDataFilesCommitManager commitManager) { + private Result doExecuteWithPartialProgress( + RewriteExecutionContext ctx, + Stream groupStream, + RewriteDataFilesCommitManager commitManager) { ExecutorService rewriteService = rewriteService(); // Start Commit Service int groupsPerCommit = IntMath.divide(ctx.totalGroupCount(), maxCommits, RoundingMode.CEILING); - RewriteDataFilesCommitManager.CommitService commitService = commitManager.service(groupsPerCommit); + RewriteDataFilesCommitManager.CommitService commitService = + commitManager.service(groupsPerCommit); commitService.start(); // Start rewrite tasks @@ -321,7 +347,9 @@ private Result doExecuteWithPartialProgress(RewriteExecutionContext ctx, Stream< .suppressFailureWhenFinished() .executeWith(rewriteService) .noRetry() - .onFailure((fileGroup, exception) -> LOG.error("Failure during rewrite group {}", fileGroup.info(), exception)) + .onFailure( + (fileGroup, exception) -> + LOG.error("Failure during rewrite group {}", fileGroup.info(), exception)) .run(fileGroup -> commitService.offer(rewriteFiles(ctx, fileGroup))); rewriteService.shutdown(); @@ -329,31 +357,40 @@ private Result doExecuteWithPartialProgress(RewriteExecutionContext ctx, Stream< commitService.close(); List commitResults = commitService.results(); if (commitResults.size() == 0) { - LOG.error("{} is true but no rewrite commits succeeded. Check the logs to determine why the individual " + - "commits failed. If this is persistent it may help to increase {} which will break the rewrite operation " + - "into smaller commits.", PARTIAL_PROGRESS_ENABLED, PARTIAL_PROGRESS_MAX_COMMITS); + LOG.error( + "{} is true but no rewrite commits succeeded. Check the logs to determine why the individual " + + "commits failed. 
If this is persistent it may help to increase {} which will break the rewrite operation " + + "into smaller commits.", + PARTIAL_PROGRESS_ENABLED, + PARTIAL_PROGRESS_MAX_COMMITS); } - List rewriteResults = commitResults.stream() - .map(RewriteFileGroup::asResult) - .collect(Collectors.toList()); + List rewriteResults = + commitResults.stream().map(RewriteFileGroup::asResult).collect(Collectors.toList()); return new BaseRewriteDataFilesResult(rewriteResults); } - Stream toGroupStream(RewriteExecutionContext ctx, + Stream toGroupStream( + RewriteExecutionContext ctx, Map>> fileGroupsByPartition) { - Stream rewriteFileGroupStream = fileGroupsByPartition.entrySet().stream() - .flatMap(e -> { - StructLike partition = e.getKey(); - List> fileGroups = e.getValue(); - return fileGroups.stream().map(tasks -> { - int globalIndex = ctx.currentGlobalIndex(); - int partitionIndex = ctx.currentPartitionIndex(partition); - FileGroupInfo info = new BaseRewriteDataFilesFileGroupInfo(globalIndex, partitionIndex, partition); - return new RewriteFileGroup(info, tasks); - }); - }); + Stream rewriteFileGroupStream = + fileGroupsByPartition.entrySet().stream() + .flatMap( + e -> { + StructLike partition = e.getKey(); + List> fileGroups = e.getValue(); + return fileGroups.stream() + .map( + tasks -> { + int globalIndex = ctx.currentGlobalIndex(); + int partitionIndex = ctx.currentPartitionIndex(partition); + FileGroupInfo info = + new BaseRewriteDataFilesFileGroupInfo( + globalIndex, partitionIndex, partition); + return new RewriteFileGroup(info, tasks); + }); + }); return rewriteFileGroupStream.sorted(rewriteGroupComparator()); } @@ -379,53 +416,70 @@ void validateAndInitOptions() { Set invalidKeys = Sets.newHashSet(options().keySet()); invalidKeys.removeAll(validOptions); - Preconditions.checkArgument(invalidKeys.isEmpty(), + Preconditions.checkArgument( + invalidKeys.isEmpty(), "Cannot use options %s, they are not supported by the action or the strategy %s", - invalidKeys, strategy.name()); + invalidKeys, + strategy.name()); strategy = strategy.options(options()); - maxConcurrentFileGroupRewrites = PropertyUtil.propertyAsInt(options(), - MAX_CONCURRENT_FILE_GROUP_REWRITES, - MAX_CONCURRENT_FILE_GROUP_REWRITES_DEFAULT); + maxConcurrentFileGroupRewrites = + PropertyUtil.propertyAsInt( + options(), + MAX_CONCURRENT_FILE_GROUP_REWRITES, + MAX_CONCURRENT_FILE_GROUP_REWRITES_DEFAULT); - maxCommits = PropertyUtil.propertyAsInt(options(), - PARTIAL_PROGRESS_MAX_COMMITS, - PARTIAL_PROGRESS_MAX_COMMITS_DEFAULT); + maxCommits = + PropertyUtil.propertyAsInt( + options(), PARTIAL_PROGRESS_MAX_COMMITS, PARTIAL_PROGRESS_MAX_COMMITS_DEFAULT); - partialProgressEnabled = PropertyUtil.propertyAsBoolean(options(), - PARTIAL_PROGRESS_ENABLED, - PARTIAL_PROGRESS_ENABLED_DEFAULT); + partialProgressEnabled = + PropertyUtil.propertyAsBoolean( + options(), PARTIAL_PROGRESS_ENABLED, PARTIAL_PROGRESS_ENABLED_DEFAULT); - useStartingSequenceNumber = PropertyUtil.propertyAsBoolean(options(), - USE_STARTING_SEQUENCE_NUMBER, - USE_STARTING_SEQUENCE_NUMBER_DEFAULT); + useStartingSequenceNumber = + PropertyUtil.propertyAsBoolean( + options(), USE_STARTING_SEQUENCE_NUMBER, USE_STARTING_SEQUENCE_NUMBER_DEFAULT); - rewriteJobOrder = RewriteJobOrder.fromName(PropertyUtil.propertyAsString(options(), - REWRITE_JOB_ORDER, - REWRITE_JOB_ORDER_DEFAULT)); + rewriteJobOrder = + RewriteJobOrder.fromName( + PropertyUtil.propertyAsString(options(), REWRITE_JOB_ORDER, REWRITE_JOB_ORDER_DEFAULT)); - Preconditions.checkArgument(maxConcurrentFileGroupRewrites >= 
1, + Preconditions.checkArgument( + maxConcurrentFileGroupRewrites >= 1, "Cannot set %s to %s, the value must be positive.", - MAX_CONCURRENT_FILE_GROUP_REWRITES, maxConcurrentFileGroupRewrites); + MAX_CONCURRENT_FILE_GROUP_REWRITES, + maxConcurrentFileGroupRewrites); - Preconditions.checkArgument(!partialProgressEnabled || maxCommits > 0, + Preconditions.checkArgument( + !partialProgressEnabled || maxCommits > 0, "Cannot set %s to %s, the value must be positive when %s is true", - PARTIAL_PROGRESS_MAX_COMMITS, maxCommits, PARTIAL_PROGRESS_ENABLED); + PARTIAL_PROGRESS_MAX_COMMITS, + maxCommits, + PARTIAL_PROGRESS_ENABLED); } private String jobDesc(RewriteFileGroup group, RewriteExecutionContext ctx) { StructLike partition = group.info().partition(); if (partition.size() > 0) { - return String.format("Rewriting %d files (%s, file group %d/%d, %s (%d/%d)) in %s", + return String.format( + "Rewriting %d files (%s, file group %d/%d, %s (%d/%d)) in %s", group.rewrittenFiles().size(), - strategy.name(), group.info().globalIndex(), - ctx.totalGroupCount(), partition, group.info().partitionIndex(), ctx.groupsInPartition(partition), + strategy.name(), + group.info().globalIndex(), + ctx.totalGroupCount(), + partition, + group.info().partitionIndex(), + ctx.groupsInPartition(partition), table.name()); } else { - return String.format("Rewriting %d files (%s, file group %d/%d) in %s", + return String.format( + "Rewriting %d files (%s, file group %d/%d) in %s", group.rewrittenFiles().size(), - strategy.name(), group.info().globalIndex(), ctx.totalGroupCount(), + strategy.name(), + group.info().globalIndex(), + ctx.totalGroupCount(), table.name()); } } @@ -438,11 +492,10 @@ static class RewriteExecutionContext { private final AtomicInteger groupIndex; RewriteExecutionContext(Map>> fileGroupsByPartition) { - this.numGroupsByPartition = fileGroupsByPartition.entrySet().stream() - .collect(Collectors.toMap(Map.Entry::getKey, e -> e.getValue().size())); - this.totalGroupCount = numGroupsByPartition.values().stream() - .reduce(Integer::sum) - .orElse(0); + this.numGroupsByPartition = + fileGroupsByPartition.entrySet().stream() + .collect(Collectors.toMap(Map.Entry::getKey, e -> e.getValue().size())); + this.totalGroupCount = numGroupsByPartition.values().stream().reduce(Integer::sum).orElse(0); this.partitionIndexMap = Maps.newConcurrentMap(); this.groupIndex = new AtomicInteger(1); } diff --git a/spark/v3.1/spark/src/main/java/org/apache/iceberg/spark/actions/BaseRewriteManifestsSparkAction.java b/spark/v3.1/spark/src/main/java/org/apache/iceberg/spark/actions/BaseRewriteManifestsSparkAction.java index b1769f428d14..b610302b0c75 100644 --- a/spark/v3.1/spark/src/main/java/org/apache/iceberg/spark/actions/BaseRewriteManifestsSparkAction.java +++ b/spark/v3.1/spark/src/main/java/org/apache/iceberg/spark/actions/BaseRewriteManifestsSparkAction.java @@ -16,9 +16,10 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.actions; +import static org.apache.iceberg.MetadataTableType.ENTRIES; + import java.io.IOException; import java.util.Collections; import java.util.List; @@ -68,15 +69,13 @@ import org.slf4j.Logger; import org.slf4j.LoggerFactory; -import static org.apache.iceberg.MetadataTableType.ENTRIES; - /** * An action that rewrites manifests in a distributed manner and co-locates metadata for partitions. - *
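The option keys validated in BaseRewriteDataFilesSparkAction's validateAndInitOptions() above are what a caller passes through the action API. A sketch of a bin-pack rewrite with partial progress, assuming the SparkActions entry point and the conventional string values of those constants (e.g. "partial-progress.enabled"), which this section references only by name; the filter column is illustrative:

import org.apache.iceberg.Table;
import org.apache.iceberg.actions.RewriteDataFiles;
import org.apache.iceberg.expressions.Expressions;
import org.apache.iceberg.spark.actions.SparkActions;
import org.apache.spark.sql.SparkSession;

public class RewriteDataFilesExample {
  public static void run(SparkSession spark, Table table) {
    RewriteDataFiles.Result result =
        SparkActions.get(spark)
            .rewriteDataFiles(table)
            .binPack()
            .filter(Expressions.equal("event_date", "2021-11-01")) // illustrative column
            // option keys below are assumed values of the constants checked in
            // validateAndInitOptions()
            .option("partial-progress.enabled", "true")
            .option("partial-progress.max-commits", "10")
            .option("max-concurrent-file-group-rewrites", "5")
            .option("target-file-size-bytes", "268435456")
            .execute();

    System.out.println("Rewritten data files: " + result.rewrittenDataFilesCount());
  }
}

With partial progress enabled, each group of file rewrites is committed in smaller batches, so a conflict with another Iceberg operation only discards the conflicting commit rather than the whole rewrite, as the error messages above explain.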

- * By default, this action rewrites all manifests for the current partition spec and writes the result
- * to the metadata folder. The behavior can be modified by passing a custom predicate to {@link #rewriteIf(Predicate)}
- * and a custom spec id to {@link #specId(int)}. In addition, there is a way to configure a custom location
- * for new manifests via {@link #stagingLocation}.
+ *
+ *

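A sketch of invoking the manifest rewrite described in this javadoc, assuming the SparkActions entry point; rewriteIf and stagingLocation are the hooks named above, and the size threshold and staging path are illustrative:

import org.apache.iceberg.Table;
import org.apache.iceberg.actions.RewriteManifests;
import org.apache.iceberg.spark.actions.SparkActions;
import org.apache.spark.sql.SparkSession;

public class RewriteManifestsExample {
  public static void run(SparkSession spark, Table table) {
    RewriteManifests.Result result =
        SparkActions.get(spark)
            .rewriteManifests(table)
            .rewriteIf(manifest -> manifest.length() < 8 * 1024 * 1024) // only small manifests
            .stagingLocation(table.location() + "/metadata/staging") // illustrative path
            .execute();

    result.rewrittenManifests().forEach(m -> System.out.println("rewrote " + m.path()));
  }
}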
    By default, this action rewrites all manifests for the current partition spec and writes the + * result to the metadata folder. The behavior can be modified by passing a custom predicate to + * {@link #rewriteIf(Predicate)} and a custom spec id to {@link #specId(int)}. In addition, there is + * a way to configure a custom location for new manifests via {@link #stagingLocation}. */ public class BaseRewriteManifestsSparkAction extends BaseSnapshotUpdateSparkAction @@ -102,10 +101,11 @@ public BaseRewriteManifestsSparkAction(SparkSession spark, Table table) { this.manifestEncoder = Encoders.javaSerialization(ManifestFile.class); this.table = table; this.spec = table.spec(); - this.targetManifestSizeBytes = PropertyUtil.propertyAsLong( - table.properties(), - TableProperties.MANIFEST_TARGET_SIZE_BYTES, - TableProperties.MANIFEST_TARGET_SIZE_BYTES_DEFAULT); + this.targetManifestSizeBytes = + PropertyUtil.propertyAsLong( + table.properties(), + TableProperties.MANIFEST_TARGET_SIZE_BYTES, + TableProperties.MANIFEST_TARGET_SIZE_BYTES_DEFAULT); this.fileIO = SparkUtil.serializableFileIO(table); // default the staging location to the metadata location @@ -143,7 +143,9 @@ public RewriteManifests stagingLocation(String newStagingLocation) { @Override public RewriteManifests.Result execute() { - String desc = String.format("Rewriting manifests (staging location=%s) of %s", stagingLocation, table.name()); + String desc = + String.format( + "Rewriting manifests (staging location=%s) of %s", stagingLocation, table.name()); JobGroupInfo info = newJobGroupInfo("REWRITE-MANIFESTS", desc); return withJobGroupInfo(info, this::doExecute); } @@ -158,10 +160,12 @@ private RewriteManifests.Result doExecute() { int numEntries = 0; for (ManifestFile manifest : matchingManifests) { - ValidationException.check(hasFileCounts(manifest), "No file counts in manifest: %s", manifest.path()); + ValidationException.check( + hasFileCounts(manifest), "No file counts in manifest: %s", manifest.path()); totalSizeBytes += manifest.length(); - numEntries += manifest.addedFilesCount() + manifest.existingFilesCount() + manifest.deletedFilesCount(); + numEntries += + manifest.addedFilesCount() + manifest.existingFilesCount() + manifest.deletedFilesCount(); } int targetNumManifests = targetNumManifests(totalSizeBytes); @@ -173,7 +177,9 @@ private RewriteManifests.Result doExecute() { if (spec.fields().size() < 1) { newManifests = writeManifestsForUnpartitionedTable(manifestEntryDF, targetNumManifests); } else { - newManifests = writeManifestsForPartitionedTable(manifestEntryDF, targetNumManifests, targetNumManifestEntries); + newManifests = + writeManifestsForPartitionedTable( + manifestEntryDF, targetNumManifests, targetNumManifestEntries); } replaceManifests(matchingManifests, newManifests); @@ -182,13 +188,16 @@ private RewriteManifests.Result doExecute() { } private Dataset buildManifestEntryDF(List manifests) { - Dataset manifestDF = spark() - .createDataset(Lists.transform(manifests, ManifestFile::path), Encoders.STRING()) - .toDF("manifest"); + Dataset manifestDF = + spark() + .createDataset(Lists.transform(manifests, ManifestFile::path), Encoders.STRING()) + .toDF("manifest"); - Dataset manifestEntryDF = loadMetadataTable(table, ENTRIES) - .filter("status < 2") // select only live entries - .selectExpr("input_file_name() as manifest", "snapshot_id", "sequence_number", "data_file"); + Dataset manifestEntryDF = + loadMetadataTable(table, ENTRIES) + .filter("status < 2") // select only live entries + .selectExpr( + 
"input_file_name() as manifest", "snapshot_id", "sequence_number", "data_file"); Column joinCond = manifestDF.col("manifest").equalTo(manifestEntryDF.col("manifest")); return manifestEntryDF @@ -196,7 +205,8 @@ private Dataset buildManifestEntryDF(List manifests) { .select("snapshot_id", "sequence_number", "data_file"); } - private List writeManifestsForUnpartitionedTable(Dataset manifestEntryDF, int numManifests) { + private List writeManifestsForUnpartitionedTable( + Dataset manifestEntryDF, int numManifests) { Broadcast io = sparkContext().broadcast(fileIO); StructType sparkType = (StructType) manifestEntryDF.schema().apply("data_file").dataType(); @@ -208,41 +218,44 @@ private List writeManifestsForUnpartitionedTable(Dataset mani .repartition(numManifests) .mapPartitions( toManifests(io, maxNumManifestEntries, stagingLocation, formatVersion, spec, sparkType), - manifestEncoder - ) + manifestEncoder) .collectAsList(); } private List writeManifestsForPartitionedTable( - Dataset manifestEntryDF, int numManifests, - int targetNumManifestEntries) { + Dataset manifestEntryDF, int numManifests, int targetNumManifestEntries) { Broadcast io = sparkContext().broadcast(fileIO); StructType sparkType = (StructType) manifestEntryDF.schema().apply("data_file").dataType(); - // we allow the actual size of manifests to be 10% higher if the estimation is not precise enough + // we allow the actual size of manifests to be 10% higher if the estimation is not precise + // enough long maxNumManifestEntries = (long) (1.1 * targetNumManifestEntries); - return withReusableDS(manifestEntryDF, df -> { - Column partitionColumn = df.col("data_file.partition"); - return df.repartitionByRange(numManifests, partitionColumn) - .sortWithinPartitions(partitionColumn) - .mapPartitions( - toManifests(io, maxNumManifestEntries, stagingLocation, formatVersion, spec, sparkType), - manifestEncoder - ) - .collectAsList(); - }); + return withReusableDS( + manifestEntryDF, + df -> { + Column partitionColumn = df.col("data_file.partition"); + return df.repartitionByRange(numManifests, partitionColumn) + .sortWithinPartitions(partitionColumn) + .mapPartitions( + toManifests( + io, maxNumManifestEntries, stagingLocation, formatVersion, spec, sparkType), + manifestEncoder) + .collectAsList(); + }); } private U withReusableDS(Dataset ds, Function, U> func) { Dataset reusableDS; - boolean useCaching = PropertyUtil.propertyAsBoolean(options(), USE_CACHING, USE_CACHING_DEFAULT); + boolean useCaching = + PropertyUtil.propertyAsBoolean(options(), USE_CACHING, USE_CACHING_DEFAULT); if (useCaching) { reusableDS = ds.cache(); } else { int parallelism = SQLConf.get().numShufflePartitions(); - reusableDS = ds.repartition(parallelism).map((MapFunction) value -> value, ds.exprEnc()); + reusableDS = + ds.repartition(parallelism).map((MapFunction) value -> value, ds.exprEnc()); } try { @@ -275,17 +288,19 @@ private int targetNumManifestEntries(int numEntries, int numManifests) { } private boolean hasFileCounts(ManifestFile manifest) { - return manifest.addedFilesCount() != null && - manifest.existingFilesCount() != null && - manifest.deletedFilesCount() != null; + return manifest.addedFilesCount() != null + && manifest.existingFilesCount() != null + && manifest.deletedFilesCount() != null; } - private void replaceManifests(Iterable deletedManifests, Iterable addedManifests) { + private void replaceManifests( + Iterable deletedManifests, Iterable addedManifests) { try { - boolean snapshotIdInheritanceEnabled = PropertyUtil.propertyAsBoolean( - 
table.properties(), - TableProperties.SNAPSHOT_ID_INHERITANCE_ENABLED, - TableProperties.SNAPSHOT_ID_INHERITANCE_ENABLED_DEFAULT); + boolean snapshotIdInheritanceEnabled = + PropertyUtil.propertyAsBoolean( + table.properties(), + TableProperties.SNAPSHOT_ID_INHERITANCE_ENABLED, + TableProperties.SNAPSHOT_ID_INHERITANCE_ENABLED_DEFAULT); org.apache.iceberg.RewriteManifests rewriteManifests = table.rewriteManifests(); deletedManifests.forEach(rewriteManifests::deleteManifest); @@ -315,12 +330,20 @@ private void deleteFiles(Iterable locations) { } private static ManifestFile writeManifest( - List rows, int startIndex, int endIndex, Broadcast io, - String location, int format, PartitionSpec spec, StructType sparkType) throws IOException { + List rows, + int startIndex, + int endIndex, + Broadcast io, + String location, + int format, + PartitionSpec spec, + StructType sparkType) + throws IOException { String manifestName = "optimized-m-" + UUID.randomUUID(); Path manifestPath = new Path(location, manifestName); - OutputFile outputFile = io.value().newOutputFile(FileFormat.AVRO.addExtension(manifestPath.toString())); + OutputFile outputFile = + io.value().newOutputFile(FileFormat.AVRO.addExtension(manifestPath.toString())); Types.StructType dataFileType = DataFile.getType(spec.partitionType()); SparkDataFile wrapper = new SparkDataFile(dataFileType, sparkType); @@ -343,8 +366,12 @@ private static ManifestFile writeManifest( } private static MapPartitionsFunction toManifests( - Broadcast io, long maxNumManifestEntries, String location, - int format, PartitionSpec spec, StructType sparkType) { + Broadcast io, + long maxNumManifestEntries, + String location, + int format, + PartitionSpec spec, + StructType sparkType) { return rows -> { List rowsAsList = Lists.newArrayList(rows); @@ -355,11 +382,15 @@ private static MapPartitionsFunction toManifests( List manifests = Lists.newArrayList(); if (rowsAsList.size() <= maxNumManifestEntries) { - manifests.add(writeManifest(rowsAsList, 0, rowsAsList.size(), io, location, format, spec, sparkType)); + manifests.add( + writeManifest(rowsAsList, 0, rowsAsList.size(), io, location, format, spec, sparkType)); } else { int midIndex = rowsAsList.size() / 2; - manifests.add(writeManifest(rowsAsList, 0, midIndex, io, location, format, spec, sparkType)); - manifests.add(writeManifest(rowsAsList, midIndex, rowsAsList.size(), io, location, format, spec, sparkType)); + manifests.add( + writeManifest(rowsAsList, 0, midIndex, io, location, format, spec, sparkType)); + manifests.add( + writeManifest( + rowsAsList, midIndex, rowsAsList.size(), io, location, format, spec, sparkType)); } return manifests.iterator(); diff --git a/spark/v3.1/spark/src/main/java/org/apache/iceberg/spark/actions/BaseSnapshotTableSparkAction.java b/spark/v3.1/spark/src/main/java/org/apache/iceberg/spark/actions/BaseSnapshotTableSparkAction.java index 1ccb448f1dcc..a170ca23a5ab 100644 --- a/spark/v3.1/spark/src/main/java/org/apache/iceberg/spark/actions/BaseSnapshotTableSparkAction.java +++ b/spark/v3.1/spark/src/main/java/org/apache/iceberg/spark/actions/BaseSnapshotTableSparkAction.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.actions; import java.util.Map; @@ -44,9 +43,8 @@ import scala.collection.JavaConverters; /** - * Creates a new Iceberg table based on a source Spark table. 
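BaseSnapshotTableSparkAction, whose javadoc begins above, and BaseMigrateTableSparkAction from earlier in this section differ mainly in whether the source table survives. A sketch of both calls, assuming the SparkActions entry point and illustrative identifiers; per the precondition later in this file, the snapshot's tableLocation must differ from the source table's location:

import org.apache.iceberg.actions.MigrateTable;
import org.apache.iceberg.actions.SnapshotTable;
import org.apache.iceberg.spark.actions.SparkActions;
import org.apache.spark.sql.SparkSession;

public class SnapshotAndMigrateExample {
  public static void run(SparkSession spark) {
    // Snapshot: the source table stays usable; the new Iceberg table gets its own
    // data/metadata location (identifiers and path below are illustrative).
    SnapshotTable.Result snapshot =
        SparkActions.get(spark)
            .snapshotTable("spark_catalog.db.events")
            .as("iceberg_catalog.db.events_snapshot")
            .tableLocation("s3://bucket/warehouse/db/events_snapshot")
            .execute();
    System.out.println("Snapshot imported data files: " + snapshot.importedDataFilesCount());

    // Migrate: the identifier is taken over by the new Iceberg table; the source must
    // live in the Spark session catalog, as the precondition in this file enforces.
    MigrateTable.Result migrated =
        SparkActions.get(spark).migrateTable("spark_catalog.db.logs").execute();
    System.out.println("Migrated data files: " + migrated.migratedDataFilesCount());
  }
}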
The new Iceberg table will - * have a different data and metadata directory allowing it to exist independently of the - * source table. + * Creates a new Iceberg table based on a source Spark table. The new Iceberg table will have a + * different data and metadata directory allowing it to exist independently of the source table. */ public class BaseSnapshotTableSparkAction extends BaseTableCreationSparkAction @@ -58,13 +56,18 @@ public class BaseSnapshotTableSparkAction private Identifier destTableIdent; private String destTableLocation = null; - BaseSnapshotTableSparkAction(SparkSession spark, CatalogPlugin sourceCatalog, Identifier sourceTableIdent) { + BaseSnapshotTableSparkAction( + SparkSession spark, CatalogPlugin sourceCatalog, Identifier sourceTableIdent) { super(spark, sourceCatalog, sourceTableIdent); } // used by the old constructor - public BaseSnapshotTableSparkAction(SparkSession spark, CatalogPlugin sourceCatalog, Identifier sourceTableIdent, - CatalogPlugin destCatalog, Identifier destTableIdent) { + public BaseSnapshotTableSparkAction( + SparkSession spark, + CatalogPlugin sourceCatalog, + Identifier sourceTableIdent, + CatalogPlugin destCatalog, + Identifier destTableIdent) { super(spark, sourceCatalog, sourceTableIdent); this.destCatalog = checkDestinationCatalog(destCatalog); this.destTableIdent = destTableIdent; @@ -89,7 +92,8 @@ protected Identifier destTableIdent() { public SnapshotTable as(String ident) { String ctx = "snapshot destination"; CatalogPlugin defaultCatalog = spark().sessionState().catalogManager().currentCatalog(); - CatalogAndIdentifier catalogAndIdent = Spark3Util.catalogAndIdentifier(ctx, spark(), ident, defaultCatalog); + CatalogAndIdentifier catalogAndIdent = + Spark3Util.catalogAndIdentifier(ctx, spark(), ident, defaultCatalog); this.destCatalog = checkDestinationCatalog(catalogAndIdent.catalog()); this.destTableIdent = catalogAndIdent.identifier(); return this; @@ -115,11 +119,13 @@ public SnapshotTable.Result execute() { } private SnapshotTable.Result doExecute() { - Preconditions.checkArgument(destCatalog() != null && destTableIdent() != null, - "The destination catalog and identifier cannot be null. " + - "Make sure to configure the action with a valid destination table identifier via the `as` method."); + Preconditions.checkArgument( + destCatalog() != null && destTableIdent() != null, + "The destination catalog and identifier cannot be null. 
" + + "Make sure to configure the action with a valid destination table identifier via the `as` method."); - LOG.info("Staging a new Iceberg table {} as a snapshot of {}", destTableIdent(), sourceTableIdent()); + LOG.info( + "Staging a new Iceberg table {} as a snapshot of {}", destTableIdent(), sourceTableIdent()); StagedSparkTable stagedTable = stageDestTable(); Table icebergTable = stagedTable.table(); @@ -151,8 +157,12 @@ private SnapshotTable.Result doExecute() { } Snapshot snapshot = icebergTable.currentSnapshot(); - long importedDataFilesCount = Long.parseLong(snapshot.summary().get(SnapshotSummary.TOTAL_DATA_FILES_PROP)); - LOG.info("Successfully loaded Iceberg metadata for {} files to {}", importedDataFilesCount, destTableIdent()); + long importedDataFilesCount = + Long.parseLong(snapshot.summary().get(SnapshotSummary.TOTAL_DATA_FILES_PROP)); + LOG.info( + "Successfully loaded Iceberg metadata for {} files to {}", + importedDataFilesCount, + destTableIdent()); return new BaseSnapshotTableActionResult(importedDataFilesCount); } @@ -190,22 +200,27 @@ protected Map destTableProps() { @Override protected TableCatalog checkSourceCatalog(CatalogPlugin catalog) { // currently the import code relies on being able to look up the table in the session catalog - Preconditions.checkArgument(catalog.name().equalsIgnoreCase("spark_catalog"), - "Cannot snapshot a table that isn't in the session catalog (i.e. spark_catalog). " + - "Found source catalog: %s.", catalog.name()); - - Preconditions.checkArgument(catalog instanceof TableCatalog, + Preconditions.checkArgument( + catalog.name().equalsIgnoreCase("spark_catalog"), + "Cannot snapshot a table that isn't in the session catalog (i.e. spark_catalog). " + + "Found source catalog: %s.", + catalog.name()); + + Preconditions.checkArgument( + catalog instanceof TableCatalog, "Cannot snapshot as catalog %s of class %s in not a table catalog", - catalog.name(), catalog.getClass().getName()); + catalog.name(), + catalog.getClass().getName()); return (TableCatalog) catalog; } @Override public SnapshotTable tableLocation(String location) { - Preconditions.checkArgument(!sourceTableLocation().equals(location), - "The snapshot table location cannot be same as the source table location. " + - "This would mix snapshot table files with original table files."); + Preconditions.checkArgument( + !sourceTableLocation().equals(location), + "The snapshot table location cannot be same as the source table location. " + + "This would mix snapshot table files with original table files."); this.destTableLocation = location; return this; } diff --git a/spark/v3.1/spark/src/main/java/org/apache/iceberg/spark/actions/BaseSnapshotUpdateSparkAction.java b/spark/v3.1/spark/src/main/java/org/apache/iceberg/spark/actions/BaseSnapshotUpdateSparkAction.java index 53fa06bbb5dc..f68fb4e97e78 100644 --- a/spark/v3.1/spark/src/main/java/org/apache/iceberg/spark/actions/BaseSnapshotUpdateSparkAction.java +++ b/spark/v3.1/spark/src/main/java/org/apache/iceberg/spark/actions/BaseSnapshotUpdateSparkAction.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.spark.actions; import java.util.Map; @@ -24,8 +23,8 @@ import org.apache.iceberg.relocated.com.google.common.collect.Maps; import org.apache.spark.sql.SparkSession; -abstract class BaseSnapshotUpdateSparkAction - extends BaseSparkAction implements SnapshotUpdate { +abstract class BaseSnapshotUpdateSparkAction extends BaseSparkAction + implements SnapshotUpdate { private final Map summary = Maps.newHashMap(); diff --git a/spark/v3.1/spark/src/main/java/org/apache/iceberg/spark/actions/BaseSparkAction.java b/spark/v3.1/spark/src/main/java/org/apache/iceberg/spark/actions/BaseSparkAction.java index 42c54679b669..c9d93ce9de5f 100644 --- a/spark/v3.1/spark/src/main/java/org/apache/iceberg/spark/actions/BaseSparkAction.java +++ b/spark/v3.1/spark/src/main/java/org/apache/iceberg/spark/actions/BaseSparkAction.java @@ -16,9 +16,10 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.actions; +import static org.apache.iceberg.MetadataTableType.ALL_MANIFESTS; + import java.util.Iterator; import java.util.List; import java.util.Map; @@ -49,8 +50,6 @@ import org.apache.spark.sql.Row; import org.apache.spark.sql.SparkSession; -import static org.apache.iceberg.MetadataTableType.ALL_MANIFESTS; - abstract class BaseSparkAction implements Action { private static final AtomicInteger JOB_COUNTER = new AtomicInteger(); @@ -115,11 +114,20 @@ protected Dataset buildValidDataFileDF(Table table) { JavaSparkContext context = JavaSparkContext.fromSparkContext(spark.sparkContext()); Broadcast ioBroadcast = context.broadcast(SparkUtil.serializableFileIO(table)); - Dataset allManifests = loadMetadataTable(table, ALL_MANIFESTS) - .selectExpr("path", "length", "partition_spec_id as partitionSpecId", "added_snapshot_id as addedSnapshotId") - .dropDuplicates("path") - .repartition(spark.sessionState().conf().numShufflePartitions()) // avoid adaptive execution combining tasks - .as(Encoders.bean(ManifestFileBean.class)); + Dataset allManifests = + loadMetadataTable(table, ALL_MANIFESTS) + .selectExpr( + "path", + "length", + "partition_spec_id as partitionSpecId", + "added_snapshot_id as addedSnapshotId") + .dropDuplicates("path") + .repartition( + spark + .sessionState() + .conf() + .numShufflePartitions()) // avoid adaptive execution combining tasks + .as(Encoders.bean(ManifestFileBean.class)); return allManifests.flatMap(new ReadManifest(ioBroadcast), Encoders.STRING()).toDF("file_path"); } diff --git a/spark/v3.1/spark/src/main/java/org/apache/iceberg/spark/actions/BaseSparkActions.java b/spark/v3.1/spark/src/main/java/org/apache/iceberg/spark/actions/BaseSparkActions.java index 58b57177cf73..bec51944f222 100644 --- a/spark/v3.1/spark/src/main/java/org/apache/iceberg/spark/actions/BaseSparkActions.java +++ b/spark/v3.1/spark/src/main/java/org/apache/iceberg/spark/actions/BaseSparkActions.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.spark.actions; import org.apache.iceberg.Table; diff --git a/spark/v3.1/spark/src/main/java/org/apache/iceberg/spark/actions/BaseTableCreationSparkAction.java b/spark/v3.1/spark/src/main/java/org/apache/iceberg/spark/actions/BaseTableCreationSparkAction.java index 6eadece65cb6..c6645fcfc8ed 100644 --- a/spark/v3.1/spark/src/main/java/org/apache/iceberg/spark/actions/BaseTableCreationSparkAction.java +++ b/spark/v3.1/spark/src/main/java/org/apache/iceberg/spark/actions/BaseTableCreationSparkAction.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.actions; import java.util.List; @@ -50,7 +49,8 @@ import org.apache.spark.sql.types.StructType; abstract class BaseTableCreationSparkAction extends BaseSparkAction { - private static final Set ALLOWED_SOURCES = ImmutableSet.of("parquet", "avro", "orc", "hive"); + private static final Set ALLOWED_SOURCES = + ImmutableSet.of("parquet", "avro", "orc", "hive"); protected static final String LOCATION = "location"; protected static final String ICEBERG_METADATA_FOLDER = "metadata"; protected static final List EXCLUDED_PROPERTIES = @@ -66,7 +66,8 @@ abstract class BaseTableCreationSparkAction extends BaseSparkAction

    additionalProperties = Maps.newHashMap(); - BaseTableCreationSparkAction(SparkSession spark, CatalogPlugin sourceCatalog, Identifier sourceTableIdent) { + BaseTableCreationSparkAction( + SparkSession spark, CatalogPlugin sourceCatalog, Identifier sourceTableIdent) { super(spark); this.sourceCatalog = checkSourceCatalog(sourceCatalog); @@ -78,12 +79,13 @@ abstract class BaseTableCreationSparkAction extends BaseSparkAction additionalProperties() { private void validateSourceTable() { String sourceTableProvider = sourceCatalogTable.provider().get().toLowerCase(Locale.ROOT); - Preconditions.checkArgument(ALLOWED_SOURCES.contains(sourceTableProvider), - "Cannot create an Iceberg table from source provider: '%s'", sourceTableProvider); - Preconditions.checkArgument(!sourceCatalogTable.storage().locationUri().isEmpty(), + Preconditions.checkArgument( + ALLOWED_SOURCES.contains(sourceTableProvider), + "Cannot create an Iceberg table from source provider: '%s'", + sourceTableProvider); + Preconditions.checkArgument( + !sourceCatalogTable.storage().locationUri().isEmpty(), "Cannot create an Iceberg table from a source without an explicit location"); } protected StagingTableCatalog checkDestinationCatalog(CatalogPlugin catalog) { - Preconditions.checkArgument(catalog instanceof SparkSessionCatalog || catalog instanceof SparkCatalog, - "Cannot create Iceberg table in non-Iceberg Catalog. " + - "Catalog '%s' was of class '%s' but '%s' or '%s' are required", - catalog.name(), catalog.getClass().getName(), SparkSessionCatalog.class.getName(), + Preconditions.checkArgument( + catalog instanceof SparkSessionCatalog || catalog instanceof SparkCatalog, + "Cannot create Iceberg table in non-Iceberg Catalog. " + + "Catalog '%s' was of class '%s' but '%s' or '%s' are required", + catalog.name(), + catalog.getClass().getName(), + SparkSessionCatalog.class.getName(), SparkCatalog.class.getName()); return (StagingTableCatalog) catalog; @@ -145,11 +153,14 @@ protected StagedSparkTable stageDestTable() { Map props = destTableProps(); StructType schema = sourceTable.schema(); Transform[] partitioning = sourceTable.partitioning(); - return (StagedSparkTable) destCatalog().stageCreate(destTableIdent(), schema, partitioning, props); + return (StagedSparkTable) + destCatalog().stageCreate(destTableIdent(), schema, partitioning, props); } catch (org.apache.spark.sql.catalyst.analysis.NoSuchNamespaceException e) { - throw new NoSuchNamespaceException("Cannot create table %s as the namespace does not exist", destTableIdent()); + throw new NoSuchNamespaceException( + "Cannot create table %s as the namespace does not exist", destTableIdent()); } catch (org.apache.spark.sql.catalyst.analysis.TableAlreadyExistsException e) { - throw new AlreadyExistsException("Cannot create table %s as it already exists", destTableIdent()); + throw new AlreadyExistsException( + "Cannot create table %s as it already exists", destTableIdent()); } } @@ -162,7 +173,10 @@ protected void ensureNameMappingPresent(Table table) { } protected String getMetadataLocation(Table table) { - return table.properties().getOrDefault(TableProperties.WRITE_METADATA_LOCATION, - table.location() + "/" + ICEBERG_METADATA_FOLDER); + return table + .properties() + .getOrDefault( + TableProperties.WRITE_METADATA_LOCATION, + table.location() + "/" + ICEBERG_METADATA_FOLDER); } } diff --git a/spark/v3.1/spark/src/main/java/org/apache/iceberg/spark/actions/ManifestFileBean.java b/spark/v3.1/spark/src/main/java/org/apache/iceberg/spark/actions/ManifestFileBean.java index 
0837fb7d39e4..3660b870c63f 100644 --- a/spark/v3.1/spark/src/main/java/org/apache/iceberg/spark/actions/ManifestFileBean.java +++ b/spark/v3.1/spark/src/main/java/org/apache/iceberg/spark/actions/ManifestFileBean.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.actions; import java.nio.ByteBuffer; diff --git a/spark/v3.1/spark/src/main/java/org/apache/iceberg/spark/actions/Spark3BinPackStrategy.java b/spark/v3.1/spark/src/main/java/org/apache/iceberg/spark/actions/Spark3BinPackStrategy.java index b2995ced3c1b..130dbe74388f 100644 --- a/spark/v3.1/spark/src/main/java/org/apache/iceberg/spark/actions/Spark3BinPackStrategy.java +++ b/spark/v3.1/spark/src/main/java/org/apache/iceberg/spark/actions/Spark3BinPackStrategy.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.actions; import java.util.List; @@ -61,14 +60,18 @@ public Set rewriteFiles(List filesToRewrite) { SparkSession cloneSession = spark.cloneSession(); cloneSession.conf().set(SQLConf.ADAPTIVE_EXECUTION_ENABLED().key(), false); - Dataset scanDF = cloneSession.read().format("iceberg") - .option(SparkReadOptions.FILE_SCAN_TASK_SET_ID, groupID) - .option(SparkReadOptions.SPLIT_SIZE, splitSize(inputFileSize(filesToRewrite))) - .option(SparkReadOptions.FILE_OPEN_COST, "0") - .load(table.name()); + Dataset scanDF = + cloneSession + .read() + .format("iceberg") + .option(SparkReadOptions.FILE_SCAN_TASK_SET_ID, groupID) + .option(SparkReadOptions.SPLIT_SIZE, splitSize(inputFileSize(filesToRewrite))) + .option(SparkReadOptions.FILE_OPEN_COST, "0") + .load(table.name()); // write the packed data into new files where each split becomes a new file - scanDF.write() + scanDF + .write() .format("iceberg") .option(SparkWriteOptions.REWRITTEN_FILE_SCAN_TASK_SET_ID, groupID) .option(SparkWriteOptions.TARGET_FILE_SIZE_BYTES, writeMaxFileSize()) diff --git a/spark/v3.1/spark/src/main/java/org/apache/iceberg/spark/actions/Spark3SortStrategy.java b/spark/v3.1/spark/src/main/java/org/apache/iceberg/spark/actions/Spark3SortStrategy.java index 2931796715c1..cfa1d66729c6 100644 --- a/spark/v3.1/spark/src/main/java/org/apache/iceberg/spark/actions/Spark3SortStrategy.java +++ b/spark/v3.1/spark/src/main/java/org/apache/iceberg/spark/actions/Spark3SortStrategy.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.actions; import java.util.List; @@ -51,14 +50,13 @@ public class Spark3SortStrategy extends SortStrategy { /** - * The number of shuffle partitions and consequently the number of output files - * created by the Spark Sort is based on the size of the input data files used - * in this rewrite operation. Due to compression, the disk file sizes may not - * accurately represent the size of files in the output. This parameter lets - * the user adjust the file size used for estimating actual output data size. A - * factor greater than 1.0 would generate more files than we would expect based - * on the on-disk file size. A value less than 1.0 would create fewer files than - * we would expect due to the on-disk size. + * The number of shuffle partitions and consequently the number of output files created by the + * Spark Sort is based on the size of the input data files used in this rewrite operation. 
Due to + * compression, the disk file sizes may not accurately represent the size of files in the output. + * This parameter lets the user adjust the file size used for estimating actual output data size. + * A factor greater than 1.0 would generate more files than we would expect based on the on-disk + * file size. A value less than 1.0 would create fewer files than we would expect due to the + * on-disk size. */ public static final String COMPRESSION_FACTOR = "compression-factor"; @@ -89,12 +87,12 @@ public Set validOptions() { @Override public RewriteStrategy options(Map options) { - sizeEstimateMultiple = PropertyUtil.propertyAsDouble(options, - COMPRESSION_FACTOR, - 1.0); + sizeEstimateMultiple = PropertyUtil.propertyAsDouble(options, COMPRESSION_FACTOR, 1.0); - Preconditions.checkArgument(sizeEstimateMultiple > 0, - "Invalid compression factor: %s (not positive)", sizeEstimateMultiple); + Preconditions.checkArgument( + sizeEstimateMultiple > 0, + "Invalid compression factor: %s (not positive)", + sizeEstimateMultiple); return super.options(options); } @@ -107,7 +105,9 @@ public Set rewriteFiles(List filesToRewrite) { SortOrder[] ordering; if (requiresRepartition) { // Build in the requirement for Partition Sorting into our sort order - ordering = Spark3Util.convert(SortOrderUtil.buildSortOrder(table.schema(), table.spec(), sortOrder())); + ordering = + Spark3Util.convert( + SortOrderUtil.buildSortOrder(table.schema(), table.spec(), sortOrder())); } else { ordering = Spark3Util.convert(sortOrder()); } @@ -122,23 +122,29 @@ public Set rewriteFiles(List filesToRewrite) { cloneSession.conf().set(SQLConf.ADAPTIVE_EXECUTION_ENABLED().key(), false); // Reset Shuffle Partitions for our sort - long numOutputFiles = numOutputFiles((long) (inputFileSize(filesToRewrite) * sizeEstimateMultiple)); + long numOutputFiles = + numOutputFiles((long) (inputFileSize(filesToRewrite) * sizeEstimateMultiple)); cloneSession.conf().set(SQLConf.SHUFFLE_PARTITIONS().key(), Math.max(1, numOutputFiles)); - Dataset scanDF = cloneSession.read().format("iceberg") - .option(SparkReadOptions.FILE_SCAN_TASK_SET_ID, groupID) - .load(table.name()); + Dataset scanDF = + cloneSession + .read() + .format("iceberg") + .option(SparkReadOptions.FILE_SCAN_TASK_SET_ID, groupID) + .load(table.name()); // write the packed data into new files where each split becomes a new file SQLConf sqlConf = cloneSession.sessionState().conf(); LogicalPlan sortPlan = sortPlan(distribution, ordering, scanDF.logicalPlan(), sqlConf); Dataset sortedDf = new Dataset<>(cloneSession, sortPlan, scanDF.encoder()); - sortedDf.write() + sortedDf + .write() .format("iceberg") .option(SparkWriteOptions.REWRITTEN_FILE_SCAN_TASK_SET_ID, groupID) .option(RewriteDataFiles.TARGET_FILE_SIZE_BYTES, writeMaxFileSize()) - .mode("append") // This will only write files without modifying the table, see SparkWrite.RewriteFiles + .mode("append") // This will only write files without modifying the table, see + // SparkWrite.RewriteFiles .save(table.name()); return rewriteCoordinator.fetchNewDataFiles(table, groupID); @@ -152,7 +158,8 @@ protected SparkSession spark() { return this.spark; } - protected LogicalPlan sortPlan(Distribution distribution, SortOrder[] ordering, LogicalPlan plan, SQLConf conf) { + protected LogicalPlan sortPlan( + Distribution distribution, SortOrder[] ordering, LogicalPlan plan, SQLConf conf) { return DistributionAndOrderingUtils$.MODULE$.prepareQuery(distribution, ordering, plan, conf); } } diff --git 
a/spark/v3.1/spark/src/main/java/org/apache/iceberg/spark/actions/SparkActions.java b/spark/v3.1/spark/src/main/java/org/apache/iceberg/spark/actions/SparkActions.java index 3230728261c9..91ec18a79fe6 100644 --- a/spark/v3.1/spark/src/main/java/org/apache/iceberg/spark/actions/SparkActions.java +++ b/spark/v3.1/spark/src/main/java/org/apache/iceberg/spark/actions/SparkActions.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.actions; import org.apache.iceberg.Table; @@ -31,9 +30,9 @@ /** * An implementation of {@link ActionsProvider} for Spark. - *
<p>
    - * This class is the primary API for interacting with actions in Spark that users should use - * to instantiate particular actions. + * + *
<p>
    This class is the primary API for interacting with actions in Spark that users should use to + * instantiate particular actions. */ public class SparkActions extends BaseSparkActions { @@ -53,16 +52,20 @@ public static SparkActions get() { public SnapshotTable snapshotTable(String tableIdent) { String ctx = "snapshot source"; CatalogPlugin defaultCatalog = spark().sessionState().catalogManager().currentCatalog(); - CatalogAndIdentifier catalogAndIdent = Spark3Util.catalogAndIdentifier(ctx, spark(), tableIdent, defaultCatalog); - return new BaseSnapshotTableSparkAction(spark(), catalogAndIdent.catalog(), catalogAndIdent.identifier()); + CatalogAndIdentifier catalogAndIdent = + Spark3Util.catalogAndIdentifier(ctx, spark(), tableIdent, defaultCatalog); + return new BaseSnapshotTableSparkAction( + spark(), catalogAndIdent.catalog(), catalogAndIdent.identifier()); } @Override public MigrateTable migrateTable(String tableIdent) { String ctx = "migrate target"; CatalogPlugin defaultCatalog = spark().sessionState().catalogManager().currentCatalog(); - CatalogAndIdentifier catalogAndIdent = Spark3Util.catalogAndIdentifier(ctx, spark(), tableIdent, defaultCatalog); - return new BaseMigrateTableSparkAction(spark(), catalogAndIdent.catalog(), catalogAndIdent.identifier()); + CatalogAndIdentifier catalogAndIdent = + Spark3Util.catalogAndIdentifier(ctx, spark(), tableIdent, defaultCatalog); + return new BaseMigrateTableSparkAction( + spark(), catalogAndIdent.catalog(), catalogAndIdent.identifier()); } @Override diff --git a/spark/v3.1/spark/src/main/java/org/apache/iceberg/spark/data/AvroWithSparkSchemaVisitor.java b/spark/v3.1/spark/src/main/java/org/apache/iceberg/spark/data/AvroWithSparkSchemaVisitor.java index 40ed05b4ce65..74454fc1e466 100644 --- a/spark/v3.1/spark/src/main/java/org/apache/iceberg/spark/data/AvroWithSparkSchemaVisitor.java +++ b/spark/v3.1/spark/src/main/java/org/apache/iceberg/spark/data/AvroWithSparkSchemaVisitor.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
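A minimal usage sketch of this entry point, with hypothetical table identifiers; the chained calls correspond to the snapshotTable, as, and execute methods touched in this diff, and get() is assumed to bind to the active SparkSession:

    // Snapshot an existing Spark table into a new, independent Iceberg table.
    SnapshotTable.Result result =
        SparkActions.get()                        // assumed to use the active SparkSession
            .snapshotTable("db.source_table")     // hypothetical source identifier
            .as("db.source_table_snapshot")       // hypothetical destination identifier
            .execute();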
*/ - package org.apache.iceberg.spark.data; import org.apache.iceberg.avro.AvroWithPartnerByStructureVisitor; @@ -30,7 +29,8 @@ import org.apache.spark.sql.types.StructField; import org.apache.spark.sql.types.StructType; -public abstract class AvroWithSparkSchemaVisitor extends AvroWithPartnerByStructureVisitor { +public abstract class AvroWithSparkSchemaVisitor + extends AvroWithPartnerByStructureVisitor { @Override protected boolean isStringType(DataType dataType) { @@ -44,7 +44,8 @@ protected boolean isMapType(DataType dataType) { @Override protected DataType arrayElementType(DataType arrayType) { - Preconditions.checkArgument(arrayType instanceof ArrayType, "Invalid array: %s is not an array", arrayType); + Preconditions.checkArgument( + arrayType instanceof ArrayType, "Invalid array: %s is not an array", arrayType); return ((ArrayType) arrayType).elementType(); } @@ -62,7 +63,8 @@ protected DataType mapValueType(DataType mapType) { @Override protected Pair fieldNameAndType(DataType structType, int pos) { - Preconditions.checkArgument(structType instanceof StructType, "Invalid struct: %s is not a struct", structType); + Preconditions.checkArgument( + structType instanceof StructType, "Invalid struct: %s is not a struct", structType); StructField field = ((StructType) structType).apply(pos); return Pair.of(field.name(), field.dataType()); } diff --git a/spark/v3.1/spark/src/main/java/org/apache/iceberg/spark/data/ParquetWithSparkSchemaVisitor.java b/spark/v3.1/spark/src/main/java/org/apache/iceberg/spark/data/ParquetWithSparkSchemaVisitor.java index 924cc3e2325a..d74a76f94e87 100644 --- a/spark/v3.1/spark/src/main/java/org/apache/iceberg/spark/data/ParquetWithSparkSchemaVisitor.java +++ b/spark/v3.1/spark/src/main/java/org/apache/iceberg/spark/data/ParquetWithSparkSchemaVisitor.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.spark.data; import java.util.Deque; @@ -48,9 +47,11 @@ public class ParquetWithSparkSchemaVisitor { public static T visit(DataType sType, Type type, ParquetWithSparkSchemaVisitor visitor) { Preconditions.checkArgument(sType != null, "Invalid DataType: null"); if (type instanceof MessageType) { - Preconditions.checkArgument(sType instanceof StructType, "Invalid struct: %s is not a struct", sType); + Preconditions.checkArgument( + sType instanceof StructType, "Invalid struct: %s is not a struct", sType); StructType struct = (StructType) sType; - return visitor.message(struct, (MessageType) type, visitFields(struct, type.asGroupType(), visitor)); + return visitor.message( + struct, (MessageType) type, visitFields(struct, type.asGroupType(), visitor)); } else if (type.isPrimitive()) { return visitor.primitive(sType, type.asPrimitiveType()); @@ -62,21 +63,30 @@ public static T visit(DataType sType, Type type, ParquetWithSparkSchemaVisit if (annotation != null) { switch (annotation) { case LIST: - Preconditions.checkArgument(!group.isRepetition(Repetition.REPEATED), - "Invalid list: top-level group is repeated: %s", group); - Preconditions.checkArgument(group.getFieldCount() == 1, - "Invalid list: does not contain single repeated field: %s", group); + Preconditions.checkArgument( + !group.isRepetition(Repetition.REPEATED), + "Invalid list: top-level group is repeated: %s", + group); + Preconditions.checkArgument( + group.getFieldCount() == 1, + "Invalid list: does not contain single repeated field: %s", + group); GroupType repeatedElement = group.getFields().get(0).asGroupType(); - Preconditions.checkArgument(repeatedElement.isRepetition(Repetition.REPEATED), + Preconditions.checkArgument( + repeatedElement.isRepetition(Repetition.REPEATED), "Invalid list: inner group is not repeated"); - Preconditions.checkArgument(repeatedElement.getFieldCount() <= 1, - "Invalid list: repeated group is not a single field: %s", group); + Preconditions.checkArgument( + repeatedElement.getFieldCount() <= 1, + "Invalid list: repeated group is not a single field: %s", + group); - Preconditions.checkArgument(sType instanceof ArrayType, "Invalid list: %s is not an array", sType); + Preconditions.checkArgument( + sType instanceof ArrayType, "Invalid list: %s is not an array", sType); ArrayType array = (ArrayType) sType; - StructField element = new StructField( - "element", array.elementType(), array.containsNull(), Metadata.empty()); + StructField element = + new StructField( + "element", array.elementType(), array.containsNull(), Metadata.empty()); visitor.fieldNames.push(repeatedElement.getName()); try { @@ -92,22 +102,30 @@ public static T visit(DataType sType, Type type, ParquetWithSparkSchemaVisit } case MAP: - Preconditions.checkArgument(!group.isRepetition(Repetition.REPEATED), - "Invalid map: top-level group is repeated: %s", group); - Preconditions.checkArgument(group.getFieldCount() == 1, - "Invalid map: does not contain single repeated field: %s", group); + Preconditions.checkArgument( + !group.isRepetition(Repetition.REPEATED), + "Invalid map: top-level group is repeated: %s", + group); + Preconditions.checkArgument( + group.getFieldCount() == 1, + "Invalid map: does not contain single repeated field: %s", + group); GroupType repeatedKeyValue = group.getType(0).asGroupType(); - Preconditions.checkArgument(repeatedKeyValue.isRepetition(Repetition.REPEATED), + Preconditions.checkArgument( + repeatedKeyValue.isRepetition(Repetition.REPEATED), "Invalid map: inner group is not 
repeated"); - Preconditions.checkArgument(repeatedKeyValue.getFieldCount() <= 2, + Preconditions.checkArgument( + repeatedKeyValue.getFieldCount() <= 2, "Invalid map: repeated group does not have 2 fields"); - Preconditions.checkArgument(sType instanceof MapType, "Invalid map: %s is not a map", sType); + Preconditions.checkArgument( + sType instanceof MapType, "Invalid map: %s is not a map", sType); MapType map = (MapType) sType; StructField keyField = new StructField("key", map.keyType(), false, Metadata.empty()); - StructField valueField = new StructField( - "value", map.valueType(), map.valueContainsNull(), Metadata.empty()); + StructField valueField = + new StructField( + "value", map.valueType(), map.valueContainsNull(), Metadata.empty()); visitor.fieldNames.push(repeatedKeyValue.getName()); try { @@ -144,13 +162,15 @@ public static T visit(DataType sType, Type type, ParquetWithSparkSchemaVisit } } - Preconditions.checkArgument(sType instanceof StructType, "Invalid struct: %s is not a struct", sType); + Preconditions.checkArgument( + sType instanceof StructType, "Invalid struct: %s is not a struct", sType); StructType struct = (StructType) sType; return visitor.struct(struct, group, visitFields(struct, group, visitor)); } } - private static T visitField(StructField sField, Type field, ParquetWithSparkSchemaVisitor visitor) { + private static T visitField( + StructField sField, Type field, ParquetWithSparkSchemaVisitor visitor) { visitor.fieldNames.push(field.getName()); try { return visit(sField.dataType(), field, visitor); @@ -159,17 +179,20 @@ private static T visitField(StructField sField, Type field, ParquetWithSpark } } - private static List visitFields(StructType struct, GroupType group, - ParquetWithSparkSchemaVisitor visitor) { + private static List visitFields( + StructType struct, GroupType group, ParquetWithSparkSchemaVisitor visitor) { StructField[] sFields = struct.fields(); - Preconditions.checkArgument(sFields.length == group.getFieldCount(), - "Structs do not match: %s and %s", struct, group); + Preconditions.checkArgument( + sFields.length == group.getFieldCount(), "Structs do not match: %s and %s", struct, group); List results = Lists.newArrayListWithExpectedSize(group.getFieldCount()); for (int i = 0; i < sFields.length; i += 1) { Type field = group.getFields().get(i); StructField sField = sFields[i]; - Preconditions.checkArgument(field.getName().equals(AvroSchemaUtil.makeCompatibleName(sField.name())), - "Structs do not match: field %s != %s", field.getName(), sField.name()); + Preconditions.checkArgument( + field.getName().equals(AvroSchemaUtil.makeCompatibleName(sField.name())), + "Structs do not match: field %s != %s", + field.getName(), + sField.name()); results.add(visitField(sField, field, visitor)); } diff --git a/spark/v3.1/spark/src/main/java/org/apache/iceberg/spark/data/SparkAvroReader.java b/spark/v3.1/spark/src/main/java/org/apache/iceberg/spark/data/SparkAvroReader.java index c693e2e2c057..4622d2928ac4 100644 --- a/spark/v3.1/spark/src/main/java/org/apache/iceberg/spark/data/SparkAvroReader.java +++ b/spark/v3.1/spark/src/main/java/org/apache/iceberg/spark/data/SparkAvroReader.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.spark.data; import java.io.IOException; @@ -38,7 +37,6 @@ import org.apache.iceberg.types.Types; import org.apache.spark.sql.catalyst.InternalRow; - public class SparkAvroReader implements DatumReader, SupportsRowPosition { private final Schema readSchema; @@ -50,10 +48,12 @@ public SparkAvroReader(org.apache.iceberg.Schema expectedSchema, Schema readSche } @SuppressWarnings("unchecked") - public SparkAvroReader(org.apache.iceberg.Schema expectedSchema, Schema readSchema, Map constants) { + public SparkAvroReader( + org.apache.iceberg.Schema expectedSchema, Schema readSchema, Map constants) { this.readSchema = readSchema; - this.reader = (ValueReader) AvroSchemaWithTypeVisitor - .visit(expectedSchema, readSchema, new ReadBuilder(constants)); + this.reader = + (ValueReader) + AvroSchemaWithTypeVisitor.visit(expectedSchema, readSchema, new ReadBuilder(constants)); } @Override @@ -81,8 +81,8 @@ private ReadBuilder(Map idToConstant) { } @Override - public ValueReader record(Types.StructType expected, Schema record, List names, - List> fields) { + public ValueReader record( + Types.StructType expected, Schema record, List names, List> fields) { return SparkValueReaders.struct(fields, expected, idToConstant); } @@ -92,13 +92,14 @@ public ValueReader union(Type expected, Schema union, List> op } @Override - public ValueReader array(Types.ListType expected, Schema array, ValueReader elementReader) { + public ValueReader array( + Types.ListType expected, Schema array, ValueReader elementReader) { return SparkValueReaders.array(elementReader); } @Override - public ValueReader map(Types.MapType expected, Schema map, - ValueReader keyReader, ValueReader valueReader) { + public ValueReader map( + Types.MapType expected, Schema map, ValueReader keyReader, ValueReader valueReader) { return SparkValueReaders.arrayMap(keyReader, valueReader); } diff --git a/spark/v3.1/spark/src/main/java/org/apache/iceberg/spark/data/SparkAvroWriter.java b/spark/v3.1/spark/src/main/java/org/apache/iceberg/spark/data/SparkAvroWriter.java index 7582125128a7..15465568c231 100644 --- a/spark/v3.1/spark/src/main/java/org/apache/iceberg/spark/data/SparkAvroWriter.java +++ b/spark/v3.1/spark/src/main/java/org/apache/iceberg/spark/data/SparkAvroWriter.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.spark.data; import java.io.IOException; @@ -50,8 +49,9 @@ public SparkAvroWriter(StructType dsSchema) { @Override @SuppressWarnings("unchecked") public void setSchema(Schema schema) { - this.writer = (ValueWriter) AvroWithSparkSchemaVisitor - .visit(dsSchema, schema, new WriteBuilder()); + this.writer = + (ValueWriter) + AvroWithSparkSchemaVisitor.visit(dsSchema, schema, new WriteBuilder()); } @Override @@ -66,17 +66,23 @@ public Stream metrics() { private static class WriteBuilder extends AvroWithSparkSchemaVisitor> { @Override - public ValueWriter record(DataType struct, Schema record, List names, List> fields) { - return SparkValueWriters.struct(fields, IntStream.range(0, names.size()) - .mapToObj(i -> fieldNameAndType(struct, i).second()).collect(Collectors.toList())); + public ValueWriter record( + DataType struct, Schema record, List names, List> fields) { + return SparkValueWriters.struct( + fields, + IntStream.range(0, names.size()) + .mapToObj(i -> fieldNameAndType(struct, i).second()) + .collect(Collectors.toList())); } @Override public ValueWriter union(DataType type, Schema union, List> options) { - Preconditions.checkArgument(options.contains(ValueWriters.nulls()), - "Cannot create writer for non-option union: %s", union); - Preconditions.checkArgument(options.size() == 2, - "Cannot create writer for non-option union: %s", union); + Preconditions.checkArgument( + options.contains(ValueWriters.nulls()), + "Cannot create writer for non-option union: %s", + union); + Preconditions.checkArgument( + options.size() == 2, "Cannot create writer for non-option union: %s", union); if (union.getTypes().get(0).getType() == Schema.Type.NULL) { return ValueWriters.option(0, options.get(1)); } else { @@ -91,12 +97,15 @@ public ValueWriter array(DataType sArray, Schema array, ValueWriter elemen @Override public ValueWriter map(DataType sMap, Schema map, ValueWriter valueReader) { - return SparkValueWriters.map(SparkValueWriters.strings(), mapKeyType(sMap), valueReader, mapValueType(sMap)); + return SparkValueWriters.map( + SparkValueWriters.strings(), mapKeyType(sMap), valueReader, mapValueType(sMap)); } @Override - public ValueWriter map(DataType sMap, Schema map, ValueWriter keyWriter, ValueWriter valueWriter) { - return SparkValueWriters.arrayMap(keyWriter, mapKeyType(sMap), valueWriter, mapValueType(sMap)); + public ValueWriter map( + DataType sMap, Schema map, ValueWriter keyWriter, ValueWriter valueWriter) { + return SparkValueWriters.arrayMap( + keyWriter, mapKeyType(sMap), valueWriter, mapValueType(sMap)); } @Override diff --git a/spark/v3.1/spark/src/main/java/org/apache/iceberg/spark/data/SparkOrcReader.java b/spark/v3.1/spark/src/main/java/org/apache/iceberg/spark/data/SparkOrcReader.java index 4ed6420a9aa4..78db137054bc 100644 --- a/spark/v3.1/spark/src/main/java/org/apache/iceberg/spark/data/SparkOrcReader.java +++ b/spark/v3.1/spark/src/main/java/org/apache/iceberg/spark/data/SparkOrcReader.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.data; import java.util.List; @@ -34,10 +33,9 @@ import org.apache.spark.sql.catalyst.InternalRow; /** - * Converts the OrcIterator, which returns ORC's VectorizedRowBatch to a - * set of Spark's UnsafeRows. + * Converts the OrcIterator, which returns ORC's VectorizedRowBatch to a set of Spark's UnsafeRows. * - * It minimizes allocations by reusing most of the objects in the implementation. + *
<p>
    It minimizes allocations by reusing most of the objects in the implementation. */ public class SparkOrcReader implements OrcRowReader { private final OrcValueReader reader; @@ -48,8 +46,12 @@ public SparkOrcReader(org.apache.iceberg.Schema expectedSchema, TypeDescription @SuppressWarnings("unchecked") public SparkOrcReader( - org.apache.iceberg.Schema expectedSchema, TypeDescription readOrcSchema, Map idToConstant) { - this.reader = OrcSchemaWithTypeVisitor.visit(expectedSchema, readOrcSchema, new ReadBuilder(idToConstant)); + org.apache.iceberg.Schema expectedSchema, + TypeDescription readOrcSchema, + Map idToConstant) { + this.reader = + OrcSchemaWithTypeVisitor.visit( + expectedSchema, readOrcSchema, new ReadBuilder(idToConstant)); } @Override @@ -71,18 +73,25 @@ private ReadBuilder(Map idToConstant) { @Override public OrcValueReader record( - Types.StructType expected, TypeDescription record, List names, List> fields) { + Types.StructType expected, + TypeDescription record, + List names, + List> fields) { return SparkOrcValueReaders.struct(fields, expected, idToConstant); } @Override - public OrcValueReader list(Types.ListType iList, TypeDescription array, OrcValueReader elementReader) { + public OrcValueReader list( + Types.ListType iList, TypeDescription array, OrcValueReader elementReader) { return SparkOrcValueReaders.array(elementReader); } @Override public OrcValueReader map( - Types.MapType iMap, TypeDescription map, OrcValueReader keyReader, OrcValueReader valueReader) { + Types.MapType iMap, + TypeDescription map, + OrcValueReader keyReader, + OrcValueReader valueReader) { return SparkOrcValueReaders.map(keyReader, valueReader); } diff --git a/spark/v3.1/spark/src/main/java/org/apache/iceberg/spark/data/SparkOrcValueReaders.java b/spark/v3.1/spark/src/main/java/org/apache/iceberg/spark/data/SparkOrcValueReaders.java index f35ab7a17c63..9e9b3e53bbcc 100644 --- a/spark/v3.1/spark/src/main/java/org/apache/iceberg/spark/data/SparkOrcValueReaders.java +++ b/spark/v3.1/spark/src/main/java/org/apache/iceberg/spark/data/SparkOrcValueReaders.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.spark.data; import java.math.BigDecimal; @@ -44,8 +43,7 @@ import org.apache.spark.unsafe.types.UTF8String; public class SparkOrcValueReaders { - private SparkOrcValueReaders() { - } + private SparkOrcValueReaders() {} public static OrcValueReader utf8String() { return StringReader.INSTANCE; @@ -125,8 +123,7 @@ public MapData nonNullRead(ColumnVector vector, int row) { } return new ArrayBasedMapData( - new GenericArrayData(keys.toArray()), - new GenericArrayData(values.toArray())); + new GenericArrayData(keys.toArray()), new GenericArrayData(values.toArray())); } @Override @@ -139,7 +136,8 @@ public void setBatchContext(long batchOffsetInFile) { static class StructReader extends OrcValueReaders.StructReader { private final int numFields; - protected StructReader(List> readers, Types.StructType struct, Map idToConstant) { + protected StructReader( + List> readers, Types.StructType struct, Map idToConstant) { super(readers, struct, idToConstant); this.numFields = struct.fields().size(); } @@ -162,21 +160,20 @@ protected void set(InternalRow struct, int pos, Object value) { private static class StringReader implements OrcValueReader { private static final StringReader INSTANCE = new StringReader(); - private StringReader() { - } + private StringReader() {} @Override public UTF8String nonNullRead(ColumnVector vector, int row) { BytesColumnVector bytesVector = (BytesColumnVector) vector; - return UTF8String.fromBytes(bytesVector.vector[row], bytesVector.start[row], bytesVector.length[row]); + return UTF8String.fromBytes( + bytesVector.vector[row], bytesVector.start[row], bytesVector.length[row]); } } private static class TimestampTzReader implements OrcValueReader { private static final TimestampTzReader INSTANCE = new TimestampTzReader(); - private TimestampTzReader() { - } + private TimestampTzReader() {} @Override public Long nonNullRead(ColumnVector vector, int row) { @@ -198,12 +195,20 @@ private static class Decimal18Reader implements OrcValueReader { public Decimal nonNullRead(ColumnVector vector, int row) { HiveDecimalWritable value = ((DecimalColumnVector) vector).vector[row]; - // The scale of decimal read from hive ORC file may be not equals to the expected scale. For data type - // decimal(10,3) and the value 10.100, the hive ORC writer will remove its trailing zero and store it - // as 101*10^(-1), its scale will adjust from 3 to 1. So here we could not assert that value.scale() == scale. - // we also need to convert the hive orc decimal to a decimal with expected precision and scale. - Preconditions.checkArgument(value.precision() <= precision, - "Cannot read value as decimal(%s,%s), too large: %s", precision, scale, value); + // The scale of decimal read from hive ORC file may be not equals to the expected scale. For + // data type + // decimal(10,3) and the value 10.100, the hive ORC writer will remove its trailing zero and + // store it + // as 101*10^(-1), its scale will adjust from 3 to 1. So here we could not assert that + // value.scale() == scale. + // we also need to convert the hive orc decimal to a decimal with expected precision and + // scale. 
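      // Illustrative walk-through of the adjustment described above, assuming Hive decimal semantics:
      //   expected Iceberg type decimal(10,3), written value 10.100;
      //   ORC hands back HiveDecimalWritable with unscaled value 101 and scale 1 (trailing zeros dropped),
      //   so value.scale() == 1 != 3, while value.precision() == 3 <= 10 still satisfies the check below;
      //   value.serialize64(3) re-expands the unscaled value to 10100, and
      //   new Decimal().set(10100L, 10, 3) restores 10.100 at the expected precision and scale.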
+ Preconditions.checkArgument( + value.precision() <= precision, + "Cannot read value as decimal(%s,%s), too large: %s", + precision, + scale, + value); return new Decimal().set(value.serialize64(scale), precision, scale); } @@ -220,11 +225,15 @@ private static class Decimal38Reader implements OrcValueReader { @Override public Decimal nonNullRead(ColumnVector vector, int row) { - BigDecimal value = ((DecimalColumnVector) vector).vector[row] - .getHiveDecimal().bigDecimalValue(); - - Preconditions.checkArgument(value.precision() <= precision, - "Cannot read value as decimal(%s,%s), too large: %s", precision, scale, value); + BigDecimal value = + ((DecimalColumnVector) vector).vector[row].getHiveDecimal().bigDecimalValue(); + + Preconditions.checkArgument( + value.precision() <= precision, + "Cannot read value as decimal(%s,%s), too large: %s", + precision, + scale, + value); return new Decimal().set(new scala.math.BigDecimal(value), precision, scale); } diff --git a/spark/v3.1/spark/src/main/java/org/apache/iceberg/spark/data/SparkOrcValueWriters.java b/spark/v3.1/spark/src/main/java/org/apache/iceberg/spark/data/SparkOrcValueWriters.java index abb12dffc050..780090f99109 100644 --- a/spark/v3.1/spark/src/main/java/org/apache/iceberg/spark/data/SparkOrcValueWriters.java +++ b/spark/v3.1/spark/src/main/java/org/apache/iceberg/spark/data/SparkOrcValueWriters.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.data; import java.util.List; @@ -37,8 +36,7 @@ import org.apache.spark.unsafe.types.UTF8String; class SparkOrcValueWriters { - private SparkOrcValueWriters() { - } + private SparkOrcValueWriters() {} static OrcValueWriter strings() { return StringWriter.INSTANCE; @@ -60,8 +58,8 @@ static OrcValueWriter list(OrcValueWriter element, List o return new ListWriter<>(element, orcType); } - static OrcValueWriter map(OrcValueWriter keyWriter, OrcValueWriter valueWriter, - List orcTypes) { + static OrcValueWriter map( + OrcValueWriter keyWriter, OrcValueWriter valueWriter, List orcTypes) { return new MapWriter<>(keyWriter, valueWriter, orcTypes); } @@ -73,7 +71,6 @@ public void nonNullWrite(int rowId, UTF8String data, ColumnVector output) { byte[] value = data.getBytes(); ((BytesColumnVector) output).setRef(rowId, value, 0, value.length); } - } private static class TimestampTzWriter implements OrcValueWriter { @@ -85,7 +82,6 @@ public void nonNullWrite(int rowId, Long micros, ColumnVector output) { cv.time[rowId] = Math.floorDiv(micros, 1_000); // millis cv.nanos[rowId] = (int) Math.floorMod(micros, 1_000_000) * 1_000; // nanos } - } private static class Decimal18Writer implements OrcValueWriter { @@ -97,20 +93,18 @@ private static class Decimal18Writer implements OrcValueWriter { @Override public void nonNullWrite(int rowId, Decimal decimal, ColumnVector output) { - ((DecimalColumnVector) output).vector[rowId].setFromLongAndScale( - decimal.toUnscaledLong(), scale); + ((DecimalColumnVector) output) + .vector[rowId].setFromLongAndScale(decimal.toUnscaledLong(), scale); } - } private static class Decimal38Writer implements OrcValueWriter { @Override public void nonNullWrite(int rowId, Decimal decimal, ColumnVector output) { - ((DecimalColumnVector) output).vector[rowId].set( - HiveDecimal.create(decimal.toJavaBigDecimal())); + ((DecimalColumnVector) output) + .vector[rowId].set(HiveDecimal.create(decimal.toJavaBigDecimal())); } - } private static class ListWriter implements OrcValueWriter { @@ -120,10 +114,12 @@ 
private static class ListWriter implements OrcValueWriter { @SuppressWarnings("unchecked") ListWriter(OrcValueWriter writer, List orcTypes) { if (orcTypes.size() != 1) { - throw new IllegalArgumentException("Expected one (and same) ORC type for list elements, got: " + orcTypes); + throw new IllegalArgumentException( + "Expected one (and same) ORC type for list elements, got: " + orcTypes); } this.writer = writer; - this.fieldGetter = (SparkOrcWriter.FieldGetter) SparkOrcWriter.createFieldGetter(orcTypes.get(0)); + this.fieldGetter = + (SparkOrcWriter.FieldGetter) SparkOrcWriter.createFieldGetter(orcTypes.get(0)); } @Override @@ -145,7 +141,6 @@ public void nonNullWrite(int rowId, ArrayData value, ColumnVector output) { public Stream> metrics() { return writer.metrics(); } - } private static class MapWriter implements OrcValueWriter { @@ -155,14 +150,20 @@ private static class MapWriter implements OrcValueWriter { private final SparkOrcWriter.FieldGetter valueFieldGetter; @SuppressWarnings("unchecked") - MapWriter(OrcValueWriter keyWriter, OrcValueWriter valueWriter, List orcTypes) { + MapWriter( + OrcValueWriter keyWriter, + OrcValueWriter valueWriter, + List orcTypes) { if (orcTypes.size() != 2) { - throw new IllegalArgumentException("Expected two ORC type descriptions for a map, got: " + orcTypes); + throw new IllegalArgumentException( + "Expected two ORC type descriptions for a map, got: " + orcTypes); } this.keyWriter = keyWriter; this.valueWriter = valueWriter; - this.keyFieldGetter = (SparkOrcWriter.FieldGetter) SparkOrcWriter.createFieldGetter(orcTypes.get(0)); - this.valueFieldGetter = (SparkOrcWriter.FieldGetter) SparkOrcWriter.createFieldGetter(orcTypes.get(1)); + this.keyFieldGetter = + (SparkOrcWriter.FieldGetter) SparkOrcWriter.createFieldGetter(orcTypes.get(0)); + this.valueFieldGetter = + (SparkOrcWriter.FieldGetter) SparkOrcWriter.createFieldGetter(orcTypes.get(1)); } @Override @@ -189,7 +190,6 @@ public void nonNullWrite(int rowId, MapData map, ColumnVector output) { public Stream> metrics() { return Stream.concat(keyWriter.metrics(), valueWriter.metrics()); } - } private static void growColumnVector(ColumnVector cv, int requestedSize) { diff --git a/spark/v3.1/spark/src/main/java/org/apache/iceberg/spark/data/SparkOrcWriter.java b/spark/v3.1/spark/src/main/java/org/apache/iceberg/spark/data/SparkOrcWriter.java index 9c7f3a6eb01d..6a8c7f1d3c88 100644 --- a/spark/v3.1/spark/src/main/java/org/apache/iceberg/spark/data/SparkOrcWriter.java +++ b/spark/v3.1/spark/src/main/java/org/apache/iceberg/spark/data/SparkOrcWriter.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.data; import java.io.Serializable; @@ -39,19 +38,18 @@ import org.apache.spark.sql.catalyst.InternalRow; import org.apache.spark.sql.catalyst.expressions.SpecializedGetters; -/** - * This class acts as an adaptor from an OrcFileAppender to a - * FileAppender<InternalRow>. - */ +/** This class acts as an adaptor from an OrcFileAppender to a FileAppender<InternalRow>. 
*/ public class SparkOrcWriter implements OrcRowWriter { private final InternalRowWriter writer; public SparkOrcWriter(Schema iSchema, TypeDescription orcSchema) { - Preconditions.checkArgument(orcSchema.getCategory() == TypeDescription.Category.STRUCT, + Preconditions.checkArgument( + orcSchema.getCategory() == TypeDescription.Category.STRUCT, "Top level must be a struct " + orcSchema); - writer = (InternalRowWriter) OrcSchemaWithTypeVisitor.visit(iSchema, orcSchema, new WriteBuilder()); + writer = + (InternalRowWriter) OrcSchemaWithTypeVisitor.visit(iSchema, orcSchema, new WriteBuilder()); } @Override @@ -71,24 +69,26 @@ public Stream> metrics() { } private static class WriteBuilder extends OrcSchemaWithTypeVisitor> { - private WriteBuilder() { - } + private WriteBuilder() {} @Override - public OrcValueWriter record(Types.StructType iStruct, TypeDescription record, - List names, List> fields) { + public OrcValueWriter record( + Types.StructType iStruct, + TypeDescription record, + List names, + List> fields) { return new InternalRowWriter(fields, record.getChildren()); } @Override - public OrcValueWriter list(Types.ListType iList, TypeDescription array, - OrcValueWriter element) { + public OrcValueWriter list( + Types.ListType iList, TypeDescription array, OrcValueWriter element) { return SparkOrcValueWriters.list(element, array.getChildren()); } @Override - public OrcValueWriter map(Types.MapType iMap, TypeDescription map, - OrcValueWriter key, OrcValueWriter value) { + public OrcValueWriter map( + Types.MapType iMap, TypeDescription map, OrcValueWriter key, OrcValueWriter value) { return SparkOrcValueWriters.map(key, value, map.getChildren()); } @@ -178,8 +178,9 @@ static FieldGetter createFieldGetter(TypeDescription fieldType) { // being changed behind our back. break; case DECIMAL: - fieldGetter = (row, ordinal) -> - row.getDecimal(ordinal, fieldType.getPrecision(), fieldType.getScale()); + fieldGetter = + (row, ordinal) -> + row.getDecimal(ordinal, fieldType.getPrecision(), fieldType.getScale()); break; case STRING: case CHAR: @@ -196,7 +197,8 @@ static FieldGetter createFieldGetter(TypeDescription fieldType) { fieldGetter = SpecializedGetters::getMap; break; default: - throw new IllegalArgumentException("Encountered an unsupported ORC type during a write from Spark."); + throw new IllegalArgumentException( + "Encountered an unsupported ORC type during a write from Spark."); } return (row, ordinal) -> { @@ -210,10 +212,12 @@ static FieldGetter createFieldGetter(TypeDescription fieldType) { interface FieldGetter extends Serializable { /** - * Returns a value from a complex Spark data holder such ArrayData, InternalRow, etc... - * Calls the appropriate getter for the expected data type. + * Returns a value from a complex Spark data holder such ArrayData, InternalRow, etc... Calls + * the appropriate getter for the expected data type. + * * @param row Spark's data representation - * @param ordinal index in the data structure (e.g. column index for InterRow, list index in ArrayData, etc..) + * @param ordinal index in the data structure (e.g. column index for InterRow, list index in + * ArrayData, etc..) 
* @return field value at ordinal */ @Nullable diff --git a/spark/v3.1/spark/src/main/java/org/apache/iceberg/spark/data/SparkParquetReaders.java b/spark/v3.1/spark/src/main/java/org/apache/iceberg/spark/data/SparkParquetReaders.java index 8abee4a575e1..8c4c3dce226a 100644 --- a/spark/v3.1/spark/src/main/java/org/apache/iceberg/spark/data/SparkParquetReaders.java +++ b/spark/v3.1/spark/src/main/java/org/apache/iceberg/spark/data/SparkParquetReaders.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.data; import java.math.BigDecimal; @@ -66,25 +65,25 @@ import org.apache.spark.unsafe.types.UTF8String; public class SparkParquetReaders { - private SparkParquetReaders() { - } + private SparkParquetReaders() {} - public static ParquetValueReader buildReader(Schema expectedSchema, - MessageType fileSchema) { + public static ParquetValueReader buildReader( + Schema expectedSchema, MessageType fileSchema) { return buildReader(expectedSchema, fileSchema, ImmutableMap.of()); } @SuppressWarnings("unchecked") - public static ParquetValueReader buildReader(Schema expectedSchema, - MessageType fileSchema, - Map idToConstant) { + public static ParquetValueReader buildReader( + Schema expectedSchema, MessageType fileSchema, Map idToConstant) { if (ParquetSchemaUtil.hasIds(fileSchema)) { return (ParquetValueReader) - TypeWithSchemaVisitor.visit(expectedSchema.asStruct(), fileSchema, - new ReadBuilder(fileSchema, idToConstant)); + TypeWithSchemaVisitor.visit( + expectedSchema.asStruct(), fileSchema, new ReadBuilder(fileSchema, idToConstant)); } else { return (ParquetValueReader) - TypeWithSchemaVisitor.visit(expectedSchema.asStruct(), fileSchema, + TypeWithSchemaVisitor.visit( + expectedSchema.asStruct(), + fileSchema, new FallbackReadBuilder(fileSchema, idToConstant)); } } @@ -95,18 +94,18 @@ private static class FallbackReadBuilder extends ReadBuilder { } @Override - public ParquetValueReader message(Types.StructType expected, MessageType message, - List> fieldReaders) { + public ParquetValueReader message( + Types.StructType expected, MessageType message, List> fieldReaders) { // the top level matches by ID, but the remaining IDs are missing return super.struct(expected, message, fieldReaders); } @Override - public ParquetValueReader struct(Types.StructType ignored, GroupType struct, - List> fieldReaders) { + public ParquetValueReader struct( + Types.StructType ignored, GroupType struct, List> fieldReaders) { // the expected struct is ignored because nested fields are never found when the - List> newFields = Lists.newArrayListWithExpectedSize( - fieldReaders.size()); + List> newFields = + Lists.newArrayListWithExpectedSize(fieldReaders.size()); List types = Lists.newArrayListWithExpectedSize(fieldReaders.size()); List fields = struct.getFields(); for (int i = 0; i < fields.size(); i += 1) { @@ -130,14 +129,14 @@ private static class ReadBuilder extends TypeWithSchemaVisitor message(Types.StructType expected, MessageType message, - List> fieldReaders) { + public ParquetValueReader message( + Types.StructType expected, MessageType message, List> fieldReaders) { return struct(expected, message.asGroupType(), fieldReaders); } @Override - public ParquetValueReader struct(Types.StructType expected, GroupType struct, - List> fieldReaders) { + public ParquetValueReader struct( + Types.StructType expected, GroupType struct, List> fieldReaders) { // match the expected struct's order Map> readersById = Maps.newHashMap(); Map 
typesById = Maps.newHashMap(); @@ -152,10 +151,10 @@ public ParquetValueReader struct(Types.StructType expected, GroupType struct, } } - List expectedFields = expected != null ? - expected.fields() : ImmutableList.of(); - List> reorderedFields = Lists.newArrayListWithExpectedSize( - expectedFields.size()); + List expectedFields = + expected != null ? expected.fields() : ImmutableList.of(); + List> reorderedFields = + Lists.newArrayListWithExpectedSize(expectedFields.size()); List types = Lists.newArrayListWithExpectedSize(expectedFields.size()); for (Types.NestedField field : expectedFields) { int id = field.fieldId(); @@ -185,8 +184,8 @@ public ParquetValueReader struct(Types.StructType expected, GroupType struct, } @Override - public ParquetValueReader list(Types.ListType expectedList, GroupType array, - ParquetValueReader elementReader) { + public ParquetValueReader list( + Types.ListType expectedList, GroupType array, ParquetValueReader elementReader) { GroupType repeated = array.getFields().get(0).asGroupType(); String[] repeatedPath = currentPath(); @@ -196,13 +195,16 @@ public ParquetValueReader list(Types.ListType expectedList, GroupType array, Type elementType = repeated.getType(0); int elementD = type.getMaxDefinitionLevel(path(elementType.getName())) - 1; - return new ArrayReader<>(repeatedD, repeatedR, ParquetValueReaders.option(elementType, elementD, elementReader)); + return new ArrayReader<>( + repeatedD, repeatedR, ParquetValueReaders.option(elementType, elementD, elementReader)); } @Override - public ParquetValueReader map(Types.MapType expectedMap, GroupType map, - ParquetValueReader keyReader, - ParquetValueReader valueReader) { + public ParquetValueReader map( + Types.MapType expectedMap, + GroupType map, + ParquetValueReader keyReader, + ParquetValueReader valueReader) { GroupType repeatedKeyValue = map.getFields().get(0).asGroupType(); String[] repeatedPath = currentPath(); @@ -214,14 +216,16 @@ public ParquetValueReader map(Types.MapType expectedMap, GroupType map, Type valueType = repeatedKeyValue.getType(1); int valueD = type.getMaxDefinitionLevel(path(valueType.getName())) - 1; - return new MapReader<>(repeatedD, repeatedR, + return new MapReader<>( + repeatedD, + repeatedR, ParquetValueReaders.option(keyType, keyD, keyReader), ParquetValueReaders.option(valueType, valueD, valueReader)); } @Override - public ParquetValueReader primitive(org.apache.iceberg.types.Type.PrimitiveType expected, - PrimitiveType primitive) { + public ParquetValueReader primitive( + org.apache.iceberg.types.Type.PrimitiveType expected, PrimitiveType primitive) { ColumnDescriptor desc = type.getColumnDescription(currentPath()); if (primitive.getOriginalType() != null) { @@ -377,12 +381,13 @@ public Long read(Long ignored) { @Override public long readLong() { - final ByteBuffer byteBuffer = column.nextBinary().toByteBuffer().order(ByteOrder.LITTLE_ENDIAN); + final ByteBuffer byteBuffer = + column.nextBinary().toByteBuffer().order(ByteOrder.LITTLE_ENDIAN); final long timeOfDayNanos = byteBuffer.getLong(); final int julianDay = byteBuffer.getInt(); - return TimeUnit.DAYS.toMicros(julianDay - UNIX_EPOCH_JULIAN) + - TimeUnit.NANOSECONDS.toMicros(timeOfDayNanos); + return TimeUnit.DAYS.toMicros(julianDay - UNIX_EPOCH_JULIAN) + + TimeUnit.NANOSECONDS.toMicros(timeOfDayNanos); } } @@ -456,15 +461,19 @@ protected ArrayData buildList(ReusableArrayData list) { } } - private static class MapReader extends RepeatedKeyValueReader { + private static class MapReader + extends RepeatedKeyValueReader { 
private int readPos = 0; private int writePos = 0; private final ReusableEntry entry = new ReusableEntry<>(); private final ReusableEntry nullEntry = new ReusableEntry<>(); - MapReader(int definitionLevel, int repetitionLevel, - ParquetValueReader keyReader, ParquetValueReader valueReader) { + MapReader( + int definitionLevel, + int repetitionLevel, + ParquetValueReader keyReader, + ParquetValueReader valueReader) { super(definitionLevel, repetitionLevel, keyReader, valueReader); } diff --git a/spark/v3.1/spark/src/main/java/org/apache/iceberg/spark/data/SparkParquetWriters.java b/spark/v3.1/spark/src/main/java/org/apache/iceberg/spark/data/SparkParquetWriters.java index 5e268d26ed9c..c7622678c74d 100644 --- a/spark/v3.1/spark/src/main/java/org/apache/iceberg/spark/data/SparkParquetWriters.java +++ b/spark/v3.1/spark/src/main/java/org/apache/iceberg/spark/data/SparkParquetWriters.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.data; import java.util.Iterator; @@ -54,12 +53,12 @@ import org.apache.spark.unsafe.types.UTF8String; public class SparkParquetWriters { - private SparkParquetWriters() { - } + private SparkParquetWriters() {} @SuppressWarnings("unchecked") public static ParquetValueWriter buildWriter(StructType dfSchema, MessageType type) { - return (ParquetValueWriter) ParquetWithSparkSchemaVisitor.visit(dfSchema, type, new WriteBuilder(type)); + return (ParquetValueWriter) + ParquetWithSparkSchemaVisitor.visit(dfSchema, type, new WriteBuilder(type)); } private static class WriteBuilder extends ParquetWithSparkSchemaVisitor> { @@ -70,14 +69,14 @@ private static class WriteBuilder extends ParquetWithSparkSchemaVisitor message(StructType sStruct, MessageType message, - List> fieldWriters) { + public ParquetValueWriter message( + StructType sStruct, MessageType message, List> fieldWriters) { return struct(sStruct, message.asGroupType(), fieldWriters); } @Override - public ParquetValueWriter struct(StructType sStruct, GroupType struct, - List> fieldWriters) { + public ParquetValueWriter struct( + StructType sStruct, GroupType struct, List> fieldWriters) { List fields = struct.getFields(); StructField[] sparkFields = sStruct.fields(); List> writers = Lists.newArrayListWithExpectedSize(fieldWriters.size()); @@ -91,31 +90,40 @@ public ParquetValueWriter struct(StructType sStruct, GroupType struct, } @Override - public ParquetValueWriter list(ArrayType sArray, GroupType array, ParquetValueWriter elementWriter) { + public ParquetValueWriter list( + ArrayType sArray, GroupType array, ParquetValueWriter elementWriter) { GroupType repeated = array.getFields().get(0).asGroupType(); String[] repeatedPath = currentPath(); int repeatedD = type.getMaxDefinitionLevel(repeatedPath); int repeatedR = type.getMaxRepetitionLevel(repeatedPath); - return new ArrayDataWriter<>(repeatedD, repeatedR, + return new ArrayDataWriter<>( + repeatedD, + repeatedR, newOption(repeated.getType(0), elementWriter), sArray.elementType()); } @Override - public ParquetValueWriter map(MapType sMap, GroupType map, - ParquetValueWriter keyWriter, ParquetValueWriter valueWriter) { + public ParquetValueWriter map( + MapType sMap, + GroupType map, + ParquetValueWriter keyWriter, + ParquetValueWriter valueWriter) { GroupType repeatedKeyValue = map.getFields().get(0).asGroupType(); String[] repeatedPath = currentPath(); int repeatedD = type.getMaxDefinitionLevel(repeatedPath); int repeatedR = type.getMaxRepetitionLevel(repeatedPath); - 
return new MapDataWriter<>(repeatedD, repeatedR, + return new MapDataWriter<>( + repeatedD, + repeatedR, newOption(repeatedKeyValue.getType(0), keyWriter), newOption(repeatedKeyValue.getType(1), valueWriter), - sMap.keyType(), sMap.valueType()); + sMap.keyType(), + sMap.valueType()); } private ParquetValueWriter newOption(Type fieldType, ParquetValueWriter writer) { @@ -197,18 +205,18 @@ private static PrimitiveWriter utf8Strings(ColumnDescriptor desc) { return new UTF8StringWriter(desc); } - private static PrimitiveWriter decimalAsInteger(ColumnDescriptor desc, - int precision, int scale) { + private static PrimitiveWriter decimalAsInteger( + ColumnDescriptor desc, int precision, int scale) { return new IntegerDecimalWriter(desc, precision, scale); } - private static PrimitiveWriter decimalAsLong(ColumnDescriptor desc, - int precision, int scale) { + private static PrimitiveWriter decimalAsLong( + ColumnDescriptor desc, int precision, int scale) { return new LongDecimalWriter(desc, precision, scale); } - private static PrimitiveWriter decimalAsFixed(ColumnDescriptor desc, - int precision, int scale) { + private static PrimitiveWriter decimalAsFixed( + ColumnDescriptor desc, int precision, int scale) { return new FixedDecimalWriter(desc, precision, scale); } @@ -239,10 +247,18 @@ private IntegerDecimalWriter(ColumnDescriptor desc, int precision, int scale) { @Override public void write(int repetitionLevel, Decimal decimal) { - Preconditions.checkArgument(decimal.scale() == scale, - "Cannot write value as decimal(%s,%s), wrong scale: %s", precision, scale, decimal); - Preconditions.checkArgument(decimal.precision() <= precision, - "Cannot write value as decimal(%s,%s), too large: %s", precision, scale, decimal); + Preconditions.checkArgument( + decimal.scale() == scale, + "Cannot write value as decimal(%s,%s), wrong scale: %s", + precision, + scale, + decimal); + Preconditions.checkArgument( + decimal.precision() <= precision, + "Cannot write value as decimal(%s,%s), too large: %s", + precision, + scale, + decimal); column.writeInteger(repetitionLevel, (int) decimal.toUnscaledLong()); } @@ -260,10 +276,18 @@ private LongDecimalWriter(ColumnDescriptor desc, int precision, int scale) { @Override public void write(int repetitionLevel, Decimal decimal) { - Preconditions.checkArgument(decimal.scale() == scale, - "Cannot write value as decimal(%s,%s), wrong scale: %s", precision, scale, decimal); - Preconditions.checkArgument(decimal.precision() <= precision, - "Cannot write value as decimal(%s,%s), too large: %s", precision, scale, decimal); + Preconditions.checkArgument( + decimal.scale() == scale, + "Cannot write value as decimal(%s,%s), wrong scale: %s", + precision, + scale, + decimal); + Preconditions.checkArgument( + decimal.precision() <= precision, + "Cannot write value as decimal(%s,%s), too large: %s", + precision, + scale, + decimal); column.writeLong(repetitionLevel, decimal.toUnscaledLong()); } @@ -278,12 +302,15 @@ private FixedDecimalWriter(ColumnDescriptor desc, int precision, int scale) { super(desc); this.precision = precision; this.scale = scale; - this.bytes = ThreadLocal.withInitial(() -> new byte[TypeUtil.decimalRequiredBytes(precision)]); + this.bytes = + ThreadLocal.withInitial(() -> new byte[TypeUtil.decimalRequiredBytes(precision)]); } @Override public void write(int repetitionLevel, Decimal decimal) { - byte[] binary = DecimalUtil.toReusedFixLengthBytes(precision, scale, decimal.toJavaBigDecimal(), bytes.get()); + byte[] binary = + DecimalUtil.toReusedFixLengthBytes( + 
precision, scale, decimal.toJavaBigDecimal(), bytes.get()); column.writeBinary(repetitionLevel, Binary.fromReusedByteArray(binary)); } } @@ -302,8 +329,11 @@ public void write(int repetitionLevel, byte[] bytes) { private static class ArrayDataWriter extends RepeatedWriter { private final DataType elementType; - private ArrayDataWriter(int definitionLevel, int repetitionLevel, - ParquetValueWriter writer, DataType elementType) { + private ArrayDataWriter( + int definitionLevel, + int repetitionLevel, + ParquetValueWriter writer, + DataType elementType) { super(definitionLevel, repetitionLevel, writer); this.elementType = elementType; } @@ -354,9 +384,13 @@ private static class MapDataWriter extends RepeatedKeyValueWriter keyWriter, ParquetValueWriter valueWriter, - DataType keyType, DataType valueType) { + private MapDataWriter( + int definitionLevel, + int repetitionLevel, + ParquetValueWriter keyWriter, + ParquetValueWriter valueWriter, + DataType keyType, + DataType valueType) { super(definitionLevel, repetitionLevel, keyWriter, valueWriter); this.keyType = keyType; this.valueType = valueType; diff --git a/spark/v3.1/spark/src/main/java/org/apache/iceberg/spark/data/SparkValueReaders.java b/spark/v3.1/spark/src/main/java/org/apache/iceberg/spark/data/SparkValueReaders.java index 0d3ce2b28d0b..11655c72d857 100644 --- a/spark/v3.1/spark/src/main/java/org/apache/iceberg/spark/data/SparkValueReaders.java +++ b/spark/v3.1/spark/src/main/java/org/apache/iceberg/spark/data/SparkValueReaders.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.data; import java.io.IOException; @@ -44,8 +43,7 @@ public class SparkValueReaders { - private SparkValueReaders() { - } + private SparkValueReaders() {} static ValueReader strings() { return StringReader.INSTANCE; @@ -67,8 +65,8 @@ static ValueReader array(ValueReader elementReader) { return new ArrayReader(elementReader); } - static ValueReader arrayMap(ValueReader keyReader, - ValueReader valueReader) { + static ValueReader arrayMap( + ValueReader keyReader, ValueReader valueReader) { return new ArrayMapReader(keyReader, valueReader); } @@ -76,16 +74,15 @@ static ValueReader map(ValueReader keyReader, ValueReader< return new MapReader(keyReader, valueReader); } - static ValueReader struct(List> readers, Types.StructType struct, - Map idToConstant) { + static ValueReader struct( + List> readers, Types.StructType struct, Map idToConstant) { return new StructReader(readers, struct, idToConstant); } private static class StringReader implements ValueReader { private static final StringReader INSTANCE = new StringReader(); - private StringReader() { - } + private StringReader() {} @Override public UTF8String read(Decoder decoder, Object reuse) throws IOException { @@ -97,10 +94,10 @@ public UTF8String read(Decoder decoder, Object reuse) throws IOException { Utf8 string = decoder.readString(utf8); return UTF8String.fromBytes(string.getBytes(), 0, string.getByteLength()); -// int length = decoder.readInt(); -// byte[] bytes = new byte[length]; -// decoder.readFixed(bytes, 0, length); -// return UTF8String.fromBytes(bytes); + // int length = decoder.readInt(); + // byte[] bytes = new byte[length]; + // decoder.readFixed(bytes, 0, length); + // return UTF8String.fromBytes(bytes); } } @@ -122,16 +119,17 @@ public UTF8String read(Decoder decoder, Object ignore) throws IOException { } private static class UUIDReader implements ValueReader { - private static final ThreadLocal 
BUFFER = ThreadLocal.withInitial(() -> { - ByteBuffer buffer = ByteBuffer.allocate(16); - buffer.order(ByteOrder.BIG_ENDIAN); - return buffer; - }); + private static final ThreadLocal BUFFER = + ThreadLocal.withInitial( + () -> { + ByteBuffer buffer = ByteBuffer.allocate(16); + buffer.order(ByteOrder.BIG_ENDIAN); + return buffer; + }); private static final UUIDReader INSTANCE = new UUIDReader(); - private UUIDReader() { - } + private UUIDReader() {} @Override @SuppressWarnings("ByteBufferBackingArray") @@ -258,14 +256,16 @@ public ArrayBasedMapData read(Decoder decoder, Object reuse) throws IOException static class StructReader extends ValueReaders.StructReader { private final int numFields; - protected StructReader(List> readers, Types.StructType struct, Map idToConstant) { + protected StructReader( + List> readers, Types.StructType struct, Map idToConstant) { super(readers, struct, idToConstant); this.numFields = readers.size(); } @Override protected InternalRow reuseOrCreate(Object reuse) { - if (reuse instanceof GenericInternalRow && ((GenericInternalRow) reuse).numFields() == numFields) { + if (reuse instanceof GenericInternalRow + && ((GenericInternalRow) reuse).numFields() == numFields) { return (InternalRow) reuse; } return new GenericInternalRow(numFields); diff --git a/spark/v3.1/spark/src/main/java/org/apache/iceberg/spark/data/SparkValueWriters.java b/spark/v3.1/spark/src/main/java/org/apache/iceberg/spark/data/SparkValueWriters.java index 24a69c1d7f11..5f2e2c054888 100644 --- a/spark/v3.1/spark/src/main/java/org/apache/iceberg/spark/data/SparkValueWriters.java +++ b/spark/v3.1/spark/src/main/java/org/apache/iceberg/spark/data/SparkValueWriters.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
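The UUIDReader above reuses one 16-byte, big-endian ByteBuffer per thread instead of allocating a buffer per value. A minimal sketch of that pattern, producing a java.util.UUID rather than Spark's UTF8String so it stays dependency-free:

import java.nio.ByteBuffer;
import java.nio.ByteOrder;
import java.util.UUID;

public class UuidBufferExample {
  // One reusable buffer per thread, as in UUIDReader/UUIDWriter above.
  private static final ThreadLocal<ByteBuffer> BUFFER =
      ThreadLocal.withInitial(
          () -> {
            ByteBuffer buffer = ByteBuffer.allocate(16);
            buffer.order(ByteOrder.BIG_ENDIAN);
            return buffer;
          });

  // Decode 16 raw bytes (e.g. an Avro fixed value) into a UUID.
  static UUID fromBytes(byte[] bytes) {
    ByteBuffer buffer = BUFFER.get();
    buffer.clear();
    buffer.put(bytes, 0, 16);
    buffer.flip();
    return new UUID(buffer.getLong(), buffer.getLong());
  }

  public static void main(String[] args) {
    UUID original = UUID.randomUUID();
    ByteBuffer encoded = ByteBuffer.allocate(16); // big-endian by default
    encoded.putLong(original.getMostSignificantBits());
    encoded.putLong(original.getLeastSignificantBits());
    System.out.println(fromBytes(encoded.array()).equals(original)); // true
  }
}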
*/ - package org.apache.iceberg.spark.data; import java.io.IOException; @@ -39,8 +38,7 @@ public class SparkValueWriters { - private SparkValueWriters() { - } + private SparkValueWriters() {} static ValueWriter strings() { return StringWriter.INSTANCE; @@ -75,8 +73,7 @@ static ValueWriter struct(List> writers, List { private static final StringWriter INSTANCE = new StringWriter(); - private StringWriter() { - } + private StringWriter() {} @Override public void write(UTF8String s, Encoder encoder) throws IOException { @@ -88,16 +85,17 @@ public void write(UTF8String s, Encoder encoder) throws IOException { } private static class UUIDWriter implements ValueWriter { - private static final ThreadLocal BUFFER = ThreadLocal.withInitial(() -> { - ByteBuffer buffer = ByteBuffer.allocate(16); - buffer.order(ByteOrder.BIG_ENDIAN); - return buffer; - }); + private static final ThreadLocal BUFFER = + ThreadLocal.withInitial( + () -> { + ByteBuffer buffer = ByteBuffer.allocate(16); + buffer.order(ByteOrder.BIG_ENDIAN); + return buffer; + }); private static final UUIDWriter INSTANCE = new UUIDWriter(); - private UUIDWriter() { - } + private UUIDWriter() {} @Override @SuppressWarnings("ByteBufferBackingArray") @@ -120,12 +118,14 @@ private static class DecimalWriter implements ValueWriter { private DecimalWriter(int precision, int scale) { this.precision = precision; this.scale = scale; - this.bytes = ThreadLocal.withInitial(() -> new byte[TypeUtil.decimalRequiredBytes(precision)]); + this.bytes = + ThreadLocal.withInitial(() -> new byte[TypeUtil.decimalRequiredBytes(precision)]); } @Override public void write(Decimal d, Encoder encoder) throws IOException { - encoder.writeFixed(DecimalUtil.toReusedFixLengthBytes(precision, scale, d.toJavaBigDecimal(), bytes.get())); + encoder.writeFixed( + DecimalUtil.toReusedFixLengthBytes(precision, scale, d.toJavaBigDecimal(), bytes.get())); } } @@ -158,8 +158,11 @@ private static class ArrayMapWriter implements ValueWriter { private final DataType keyType; private final DataType valueType; - private ArrayMapWriter(ValueWriter keyWriter, DataType keyType, - ValueWriter valueWriter, DataType valueType) { + private ArrayMapWriter( + ValueWriter keyWriter, + DataType keyType, + ValueWriter valueWriter, + DataType valueType) { this.keyWriter = keyWriter; this.keyType = keyType; this.valueWriter = valueWriter; @@ -189,8 +192,11 @@ private static class MapWriter implements ValueWriter { private final DataType keyType; private final DataType valueType; - private MapWriter(ValueWriter keyWriter, DataType keyType, - ValueWriter valueWriter, DataType valueType) { + private MapWriter( + ValueWriter keyWriter, + DataType keyType, + ValueWriter valueWriter, + DataType valueType) { this.keyWriter = keyWriter; this.keyType = keyType; this.valueWriter = valueWriter; diff --git a/spark/v3.1/spark/src/main/java/org/apache/iceberg/spark/data/vectorized/ArrowVectorAccessorFactory.java b/spark/v3.1/spark/src/main/java/org/apache/iceberg/spark/data/vectorized/ArrowVectorAccessorFactory.java index 505ace508352..e32ebcb02bbc 100644 --- a/spark/v3.1/spark/src/main/java/org/apache/iceberg/spark/data/vectorized/ArrowVectorAccessorFactory.java +++ b/spark/v3.1/spark/src/main/java/org/apache/iceberg/spark/data/vectorized/ArrowVectorAccessorFactory.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
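Both the Avro DecimalWriter above and the Parquet FixedDecimalWriter earlier in this diff encode a decimal's unscaled value into a reused fixed-length array via TypeUtil.decimalRequiredBytes and DecimalUtil.toReusedFixLengthBytes. The sketch below shows the underlying idea with plain JDK code (big-endian two's complement, sign-extended to a width derived from the precision); it is an assumption-level illustration, not the Iceberg utilities themselves.

import java.math.BigDecimal;
import java.util.Arrays;

public class FixedDecimalSketch {
  // Smallest byte count whose two's-complement range covers 10^precision - 1
  // (the role TypeUtil.decimalRequiredBytes plays in the writers above).
  static int requiredBytes(int precision) {
    return (int) Math.ceil((precision * Math.log(10) / Math.log(2) + 1) / 8);
  }

  // Sign-extend the minimal two's-complement form into a reused fixed-width buffer.
  static byte[] toFixedBytes(BigDecimal value, byte[] reuse) {
    byte[] unscaled = value.unscaledValue().toByteArray();
    byte pad = (byte) (value.signum() < 0 ? 0xFF : 0x00);
    int offset = reuse.length - unscaled.length;
    Arrays.fill(reuse, 0, offset, pad);
    System.arraycopy(unscaled, 0, reuse, offset, unscaled.length);
    return reuse;
  }

  public static void main(String[] args) {
    byte[] buffer = new byte[requiredBytes(9)]; // 4 bytes for decimal(9, _)
    System.out.println(Arrays.toString(toFixedBytes(new BigDecimal("-12345.67"), buffer)));
    // prints [-1, -19, 41, 121], i.e. -1234567 sign-extended to 4 bytes
  }
}

Reusing the thread-local buffer matters because these writers run once per row, so per-value allocation would dominate.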
*/ - package org.apache.iceberg.spark.data.vectorized; import java.math.BigDecimal; @@ -32,10 +31,12 @@ import org.apache.spark.unsafe.types.UTF8String; final class ArrowVectorAccessorFactory - extends GenericArrowVectorAccessorFactory { + extends GenericArrowVectorAccessorFactory< + Decimal, UTF8String, ColumnarArray, ArrowColumnVector> { ArrowVectorAccessorFactory() { - super(DecimalFactoryImpl::new, + super( + DecimalFactoryImpl::new, StringFactoryImpl::new, StructChildFactoryImpl::new, ArrayFactoryImpl::new); @@ -70,9 +71,7 @@ public UTF8String ofRow(VarCharVector vector, int rowId) { int end = vector.getEndOffset(rowId); return UTF8String.fromAddress( - null, - vector.getDataBuffer().memoryAddress() + start, - end - start); + null, vector.getDataBuffer().memoryAddress() + start, end - start); } @Override @@ -84,7 +83,9 @@ public UTF8String ofBytes(byte[] bytes) { public UTF8String ofByteBuffer(ByteBuffer byteBuffer) { if (byteBuffer.hasArray()) { return UTF8String.fromBytes( - byteBuffer.array(), byteBuffer.arrayOffset() + byteBuffer.position(), byteBuffer.remaining()); + byteBuffer.array(), + byteBuffer.arrayOffset() + byteBuffer.position(), + byteBuffer.remaining()); } byte[] bytes = new byte[byteBuffer.remaining()]; byteBuffer.get(bytes); @@ -92,7 +93,8 @@ public UTF8String ofByteBuffer(ByteBuffer byteBuffer) { } } - private static final class ArrayFactoryImpl implements ArrayFactory { + private static final class ArrayFactoryImpl + implements ArrayFactory { @Override public ArrowColumnVector ofChild(ValueVector childVector) { return new ArrowColumnVector(childVector); @@ -108,7 +110,8 @@ public ColumnarArray ofRow(ValueVector vector, ArrowColumnVector childData, int } } - private static final class StructChildFactoryImpl implements StructChildFactory { + private static final class StructChildFactoryImpl + implements StructChildFactory { @Override public Class getGenericClass() { return ArrowColumnVector.class; diff --git a/spark/v3.1/spark/src/main/java/org/apache/iceberg/spark/data/vectorized/ArrowVectorAccessors.java b/spark/v3.1/spark/src/main/java/org/apache/iceberg/spark/data/vectorized/ArrowVectorAccessors.java index f3b3377af2b4..810fef81b5bb 100644 --- a/spark/v3.1/spark/src/main/java/org/apache/iceberg/spark/data/vectorized/ArrowVectorAccessors.java +++ b/spark/v3.1/spark/src/main/java/org/apache/iceberg/spark/data/vectorized/ArrowVectorAccessors.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.data.vectorized; import org.apache.iceberg.arrow.vectorized.ArrowVectorAccessor; @@ -35,6 +34,5 @@ public class ArrowVectorAccessors { return factory.getVectorAccessor(holder); } - private ArrowVectorAccessors() { - } + private ArrowVectorAccessors() {} } diff --git a/spark/v3.1/spark/src/main/java/org/apache/iceberg/spark/data/vectorized/ColumnarBatchReader.java b/spark/v3.1/spark/src/main/java/org/apache/iceberg/spark/data/vectorized/ColumnarBatchReader.java index f71a6968099c..f761b2eb551b 100644 --- a/spark/v3.1/spark/src/main/java/org/apache/iceberg/spark/data/vectorized/ColumnarBatchReader.java +++ b/spark/v3.1/spark/src/main/java/org/apache/iceberg/spark/data/vectorized/ColumnarBatchReader.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
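StringFactoryImpl.ofByteBuffer above takes two paths: heap buffers expose their backing array (read in place, offset by arrayOffset() + position()), while direct buffers must be copied. A dependency-free sketch of the same branching, returning a plain String instead of Spark's UTF8String:

import java.nio.ByteBuffer;
import java.nio.charset.StandardCharsets;

public class ByteBufferStringExample {
  static String decode(ByteBuffer byteBuffer) {
    if (byteBuffer.hasArray()) {
      // Heap buffer: read straight from the backing array, no copy.
      return new String(
          byteBuffer.array(),
          byteBuffer.arrayOffset() + byteBuffer.position(),
          byteBuffer.remaining(),
          StandardCharsets.UTF_8);
    }
    // Direct buffer: no accessible array, so copy the remaining bytes (advances the position).
    byte[] bytes = new byte[byteBuffer.remaining()];
    byteBuffer.get(bytes);
    return new String(bytes, StandardCharsets.UTF_8);
  }

  public static void main(String[] args) {
    System.out.println(decode(ByteBuffer.wrap("heap".getBytes(StandardCharsets.UTF_8))));

    ByteBuffer direct = ByteBuffer.allocateDirect(6);
    direct.put("direct".getBytes(StandardCharsets.UTF_8));
    direct.flip();
    System.out.println(decode(direct));
  }
}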
*/ - package org.apache.iceberg.spark.data.vectorized; import java.util.List; @@ -28,9 +27,9 @@ import org.apache.spark.sql.vectorized.ColumnarBatch; /** - * {@link VectorizedReader} that returns Spark's {@link ColumnarBatch} to support Spark's vectorized read path. The - * {@link ColumnarBatch} returned is created by passing in the Arrow vectors populated via delegated read calls to - * {@linkplain VectorizedArrowReader VectorReader(s)}. + * {@link VectorizedReader} that returns Spark's {@link ColumnarBatch} to support Spark's vectorized + * read path. The {@link ColumnarBatch} returned is created by passing in the Arrow vectors + * populated via delegated read calls to {@linkplain VectorizedArrowReader VectorReader(s)}. */ public class ColumnarBatchReader extends BaseBatchReader { @@ -40,7 +39,8 @@ public ColumnarBatchReader(List> readers) { @Override public final ColumnarBatch read(ColumnarBatch reuse, int numRowsToRead) { - Preconditions.checkArgument(numRowsToRead > 0, "Invalid number of rows to read: %s", numRowsToRead); + Preconditions.checkArgument( + numRowsToRead > 0, "Invalid number of rows to read: %s", numRowsToRead); ColumnVector[] arrowColumnVectors = new ColumnVector[readers.length]; if (reuse == null) { @@ -52,10 +52,10 @@ public final ColumnarBatch read(ColumnarBatch reuse, int numRowsToRead) { int numRowsInVector = vectorHolders[i].numValues(); Preconditions.checkState( numRowsInVector == numRowsToRead, - "Number of rows in the vector %s didn't match expected %s ", numRowsInVector, + "Number of rows in the vector %s didn't match expected %s ", + numRowsInVector, numRowsToRead); - arrowColumnVectors[i] = - IcebergArrowColumnVector.forHolder(vectorHolders[i], numRowsInVector); + arrowColumnVectors[i] = IcebergArrowColumnVector.forHolder(vectorHolders[i], numRowsInVector); } ColumnarBatch batch = new ColumnarBatch(arrowColumnVectors); batch.setNumRows(numRowsToRead); diff --git a/spark/v3.1/spark/src/main/java/org/apache/iceberg/spark/data/vectorized/ConstantColumnVector.java b/spark/v3.1/spark/src/main/java/org/apache/iceberg/spark/data/vectorized/ConstantColumnVector.java index 3cdea65b2877..42683ffa901e 100644 --- a/spark/v3.1/spark/src/main/java/org/apache/iceberg/spark/data/vectorized/ConstantColumnVector.java +++ b/spark/v3.1/spark/src/main/java/org/apache/iceberg/spark/data/vectorized/ConstantColumnVector.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.data.vectorized; import org.apache.iceberg.spark.SparkSchemaUtil; @@ -39,8 +38,7 @@ class ConstantColumnVector extends ColumnVector { } @Override - public void close() { - } + public void close() {} @Override public boolean hasNull() { diff --git a/spark/v3.1/spark/src/main/java/org/apache/iceberg/spark/data/vectorized/IcebergArrowColumnVector.java b/spark/v3.1/spark/src/main/java/org/apache/iceberg/spark/data/vectorized/IcebergArrowColumnVector.java index 514eec84fe82..33c1a5284818 100644 --- a/spark/v3.1/spark/src/main/java/org/apache/iceberg/spark/data/vectorized/IcebergArrowColumnVector.java +++ b/spark/v3.1/spark/src/main/java/org/apache/iceberg/spark/data/vectorized/IcebergArrowColumnVector.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
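ColumnarBatchReader.read above only assembles a ColumnarBatch: each delegated reader fills one vector, the row counts are checked, and the batch records the count while the vectors hold the data. A small sketch of that assembly using Spark's on-heap vectors (assumes a Spark 3.1 runtime on the classpath; the column name and values are made up):

import org.apache.spark.sql.execution.vectorized.OnHeapColumnVector;
import org.apache.spark.sql.types.DataTypes;
import org.apache.spark.sql.types.StructType;
import org.apache.spark.sql.vectorized.ColumnarBatch;

public class ColumnarBatchSketch {
  public static void main(String[] args) {
    StructType schema = new StructType().add("id", DataTypes.IntegerType);
    int numRows = 3;

    // One writable vector per column; the batch itself stores no row data.
    OnHeapColumnVector[] vectors = OnHeapColumnVector.allocateColumns(numRows, schema);
    for (int row = 0; row < numRows; row++) {
      vectors[0].putInt(row, row * 10);
    }

    ColumnarBatch batch = new ColumnarBatch(vectors);
    batch.setNumRows(numRows);

    System.out.println(batch.numRows()); // 3
    System.out.println(batch.getRow(1).getInt(0)); // 10
  }
}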
*/ - package org.apache.iceberg.spark.data.vectorized; import org.apache.iceberg.arrow.vectorized.ArrowVectorAccessor; @@ -33,9 +32,10 @@ import org.apache.spark.unsafe.types.UTF8String; /** - * Implementation of Spark's {@link ColumnVector} interface. The code for this class is heavily inspired from Spark's - * {@link ArrowColumnVector} The main difference is in how nullability checks are made in this class by relying on - * {@link NullabilityHolder} instead of the validity vector in the Arrow vector. + * Implementation of Spark's {@link ColumnVector} interface. The code for this class is heavily + * inspired from Spark's {@link ArrowColumnVector} The main difference is in how nullability checks + * are made in this class by relying on {@link NullabilityHolder} instead of the validity vector in + * the Arrow vector. */ public class IcebergArrowColumnVector extends ColumnVector { @@ -146,12 +146,14 @@ public ArrowColumnVector getChild(int ordinal) { } static ColumnVector forHolder(VectorHolder holder, int numRows) { - return holder.isDummy() ? - new ConstantColumnVector(Types.IntegerType.get(), numRows, ((ConstantVectorHolder) holder).getConstant()) : - new IcebergArrowColumnVector(holder); + return holder.isDummy() + ? new ConstantColumnVector( + Types.IntegerType.get(), numRows, ((ConstantVectorHolder) holder).getConstant()) + : new IcebergArrowColumnVector(holder); } - public ArrowVectorAccessor vectorAccessor() { + public ArrowVectorAccessor + vectorAccessor() { return accessor; } } diff --git a/spark/v3.1/spark/src/main/java/org/apache/iceberg/spark/data/vectorized/RowPositionColumnVector.java b/spark/v3.1/spark/src/main/java/org/apache/iceberg/spark/data/vectorized/RowPositionColumnVector.java index 58db4eb55d04..a389cd8286e5 100644 --- a/spark/v3.1/spark/src/main/java/org/apache/iceberg/spark/data/vectorized/RowPositionColumnVector.java +++ b/spark/v3.1/spark/src/main/java/org/apache/iceberg/spark/data/vectorized/RowPositionColumnVector.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.data.vectorized; import org.apache.iceberg.spark.SparkSchemaUtil; @@ -37,8 +36,7 @@ public class RowPositionColumnVector extends ColumnVector { } @Override - public void close() { - } + public void close() {} @Override public boolean hasNull() { diff --git a/spark/v3.1/spark/src/main/java/org/apache/iceberg/spark/data/vectorized/VectorizedSparkOrcReaders.java b/spark/v3.1/spark/src/main/java/org/apache/iceberg/spark/data/vectorized/VectorizedSparkOrcReaders.java index 418c25993a7e..7c3b825a62e7 100644 --- a/spark/v3.1/spark/src/main/java/org/apache/iceberg/spark/data/vectorized/VectorizedSparkOrcReaders.java +++ b/spark/v3.1/spark/src/main/java/org/apache/iceberg/spark/data/vectorized/VectorizedSparkOrcReaders.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
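The accessors that IcebergArrowColumnVector delegates to ultimately read primitive values out of Arrow vectors by row id; the Iceberg vector answers nullability from its NullabilityHolder rather than the validity buffer consulted below. A minimal Arrow-only sketch (assumes arrow-memory and arrow-vector on the classpath):

import org.apache.arrow.memory.BufferAllocator;
import org.apache.arrow.memory.RootAllocator;
import org.apache.arrow.vector.IntVector;

public class ArrowReadSketch {
  public static void main(String[] args) {
    try (BufferAllocator allocator = new RootAllocator();
        IntVector vector = new IntVector("id", allocator)) {
      vector.allocateNew(3);
      vector.set(0, 7);
      vector.setNull(1);
      vector.set(2, 9);
      vector.setValueCount(3);

      // Null checks here go through the Arrow validity buffer; Iceberg's column
      // vector answers the same question from a NullabilityHolder instead.
      for (int row = 0; row < vector.getValueCount(); row++) {
        System.out.println(vector.isNull(row) ? "null" : vector.get(row));
      }
    }
  }
}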
*/ - package org.apache.iceberg.spark.data.vectorized; import java.util.List; @@ -47,23 +46,27 @@ public class VectorizedSparkOrcReaders { - private VectorizedSparkOrcReaders() { - } + private VectorizedSparkOrcReaders() {} - public static OrcBatchReader buildReader(Schema expectedSchema, TypeDescription fileSchema, - Map idToConstant) { - Converter converter = OrcSchemaWithTypeVisitor.visit(expectedSchema, fileSchema, new ReadBuilder(idToConstant)); + public static OrcBatchReader buildReader( + Schema expectedSchema, TypeDescription fileSchema, Map idToConstant) { + Converter converter = + OrcSchemaWithTypeVisitor.visit(expectedSchema, fileSchema, new ReadBuilder(idToConstant)); return new OrcBatchReader() { private long batchOffsetInFile; @Override public ColumnarBatch read(VectorizedRowBatch batch) { - BaseOrcColumnVector cv = (BaseOrcColumnVector) converter.convert(new StructColumnVector(batch.size, batch.cols), - batch.size, batchOffsetInFile); - ColumnarBatch columnarBatch = new ColumnarBatch(IntStream.range(0, expectedSchema.columns().size()) - .mapToObj(cv::getChild) - .toArray(ColumnVector[]::new)); + BaseOrcColumnVector cv = + (BaseOrcColumnVector) + converter.convert( + new StructColumnVector(batch.size, batch.cols), batch.size, batchOffsetInFile); + ColumnarBatch columnarBatch = + new ColumnarBatch( + IntStream.range(0, expectedSchema.columns().size()) + .mapToObj(cv::getChild) + .toArray(ColumnVector[]::new)); columnarBatch.setNumRows(batch.size); return columnarBatch; } @@ -76,8 +79,10 @@ public void setBatchContext(long batchOffsetInFile) { } private interface Converter { - ColumnVector convert(org.apache.orc.storage.ql.exec.vector.ColumnVector columnVector, int batchSize, - long batchOffsetInFile); + ColumnVector convert( + org.apache.orc.storage.ql.exec.vector.ColumnVector columnVector, + int batchSize, + long batchOffsetInFile); } private static class ReadBuilder extends OrcSchemaWithTypeVisitor { @@ -88,8 +93,11 @@ private ReadBuilder(Map idToConstant) { } @Override - public Converter record(Types.StructType iStruct, TypeDescription record, List names, - List fields) { + public Converter record( + Types.StructType iStruct, + TypeDescription record, + List names, + List fields) { return new StructConverter(iStruct, fields, idToConstant); } @@ -132,7 +140,8 @@ public Converter primitive(Type.PrimitiveType iPrimitive, TypeDescription primit primitiveValueReader = SparkOrcValueReaders.timestampTzs(); break; case DECIMAL: - primitiveValueReader = SparkOrcValueReaders.decimals(primitive.getPrecision(), primitive.getScale()); + primitiveValueReader = + SparkOrcValueReaders.decimals(primitive.getPrecision(), primitive.getScale()); break; case CHAR: case VARCHAR: @@ -146,7 +155,8 @@ public Converter primitive(Type.PrimitiveType iPrimitive, TypeDescription primit throw new IllegalArgumentException("Unhandled type " + primitive); } return (columnVector, batchSize, batchOffsetInFile) -> - new PrimitiveOrcColumnVector(iPrimitive, batchSize, columnVector, primitiveValueReader, batchOffsetInFile); + new PrimitiveOrcColumnVector( + iPrimitive, batchSize, columnVector, primitiveValueReader, batchOffsetInFile); } } @@ -155,15 +165,15 @@ private abstract static class BaseOrcColumnVector extends ColumnVector { private final int batchSize; private Integer numNulls; - BaseOrcColumnVector(Type type, int batchSize, org.apache.orc.storage.ql.exec.vector.ColumnVector vector) { + BaseOrcColumnVector( + Type type, int batchSize, org.apache.orc.storage.ql.exec.vector.ColumnVector vector) { 
super(SparkSchemaUtil.convert(type)); this.vector = vector; this.batchSize = batchSize; } @Override - public void close() { - } + public void close() {} @Override public boolean hasNull() { @@ -278,8 +288,12 @@ private static class PrimitiveOrcColumnVector extends BaseOrcColumnVector { private final OrcValueReader primitiveValueReader; private final long batchOffsetInFile; - PrimitiveOrcColumnVector(Type type, int batchSize, org.apache.orc.storage.ql.exec.vector.ColumnVector vector, - OrcValueReader primitiveValueReader, long batchOffsetInFile) { + PrimitiveOrcColumnVector( + Type type, + int batchSize, + org.apache.orc.storage.ql.exec.vector.ColumnVector vector, + OrcValueReader primitiveValueReader, + long batchOffsetInFile) { super(type, batchSize, vector); this.vector = vector; this.primitiveValueReader = primitiveValueReader; @@ -313,7 +327,8 @@ public double getDouble(int rowId) { @Override public Decimal getDecimal(int rowId, int precision, int scale) { - // TODO: Is it okay to assume that (precision,scale) parameters == (precision,scale) of the decimal type + // TODO: Is it okay to assume that (precision,scale) parameters == (precision,scale) of the + // decimal type // and return a Decimal with (precision,scale) of the decimal type? return (Decimal) primitiveValueReader.read(vector, rowId); } @@ -339,16 +354,20 @@ private ArrayConverter(Types.ListType listType, Converter elementConverter) { } @Override - public ColumnVector convert(org.apache.orc.storage.ql.exec.vector.ColumnVector vector, int batchSize, - long batchOffsetInFile) { + public ColumnVector convert( + org.apache.orc.storage.ql.exec.vector.ColumnVector vector, + int batchSize, + long batchOffsetInFile) { ListColumnVector listVector = (ListColumnVector) vector; - ColumnVector elementVector = elementConverter.convert(listVector.child, batchSize, batchOffsetInFile); + ColumnVector elementVector = + elementConverter.convert(listVector.child, batchSize, batchOffsetInFile); return new BaseOrcColumnVector(listType, batchSize, vector) { @Override public ColumnarArray getArray(int rowId) { int index = getRowIndex(rowId); - return new ColumnarArray(elementVector, (int) listVector.offsets[index], (int) listVector.lengths[index]); + return new ColumnarArray( + elementVector, (int) listVector.offsets[index], (int) listVector.lengths[index]); } }; } @@ -366,17 +385,23 @@ private MapConverter(Types.MapType mapType, Converter keyConverter, Converter va } @Override - public ColumnVector convert(org.apache.orc.storage.ql.exec.vector.ColumnVector vector, int batchSize, - long batchOffsetInFile) { + public ColumnVector convert( + org.apache.orc.storage.ql.exec.vector.ColumnVector vector, + int batchSize, + long batchOffsetInFile) { MapColumnVector mapVector = (MapColumnVector) vector; ColumnVector keyVector = keyConverter.convert(mapVector.keys, batchSize, batchOffsetInFile); - ColumnVector valueVector = valueConverter.convert(mapVector.values, batchSize, batchOffsetInFile); + ColumnVector valueVector = + valueConverter.convert(mapVector.values, batchSize, batchOffsetInFile); return new BaseOrcColumnVector(mapType, batchSize, vector) { @Override public ColumnarMap getMap(int rowId) { int index = getRowIndex(rowId); - return new ColumnarMap(keyVector, valueVector, (int) mapVector.offsets[index], + return new ColumnarMap( + keyVector, + valueVector, + (int) mapVector.offsets[index], (int) mapVector.lengths[index]); } }; @@ -388,30 +413,37 @@ private static class StructConverter implements Converter { private final List fieldConverters; 
private final Map idToConstant; - private StructConverter(Types.StructType structType, List fieldConverters, - Map idToConstant) { + private StructConverter( + Types.StructType structType, + List fieldConverters, + Map idToConstant) { this.structType = structType; this.fieldConverters = fieldConverters; this.idToConstant = idToConstant; } @Override - public ColumnVector convert(org.apache.orc.storage.ql.exec.vector.ColumnVector vector, int batchSize, - long batchOffsetInFile) { + public ColumnVector convert( + org.apache.orc.storage.ql.exec.vector.ColumnVector vector, + int batchSize, + long batchOffsetInFile) { StructColumnVector structVector = (StructColumnVector) vector; List fields = structType.fields(); List fieldVectors = Lists.newArrayListWithExpectedSize(fields.size()); for (int pos = 0, vectorIndex = 0; pos < fields.size(); pos += 1) { Types.NestedField field = fields.get(pos); if (idToConstant.containsKey(field.fieldId())) { - fieldVectors.add(new ConstantColumnVector(field.type(), batchSize, idToConstant.get(field.fieldId()))); + fieldVectors.add( + new ConstantColumnVector(field.type(), batchSize, idToConstant.get(field.fieldId()))); } else if (field.equals(MetadataColumns.ROW_POSITION)) { fieldVectors.add(new RowPositionColumnVector(batchOffsetInFile)); } else if (field.equals(MetadataColumns.IS_DELETED)) { fieldVectors.add(new ConstantColumnVector(field.type(), batchSize, false)); } else { - fieldVectors.add(fieldConverters.get(vectorIndex) - .convert(structVector.fields[vectorIndex], batchSize, batchOffsetInFile)); + fieldVectors.add( + fieldConverters + .get(vectorIndex) + .convert(structVector.fields[vectorIndex], batchSize, batchOffsetInFile)); vectorIndex++; } } diff --git a/spark/v3.1/spark/src/main/java/org/apache/iceberg/spark/data/vectorized/VectorizedSparkParquetReaders.java b/spark/v3.1/spark/src/main/java/org/apache/iceberg/spark/data/vectorized/VectorizedSparkParquetReaders.java index b2d582352d74..bbb63e077bc6 100644 --- a/spark/v3.1/spark/src/main/java/org/apache/iceberg/spark/data/vectorized/VectorizedSparkParquetReaders.java +++ b/spark/v3.1/spark/src/main/java/org/apache/iceberg/spark/data/vectorized/VectorizedSparkParquetReaders.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
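StructConverter.convert above walks the expected fields in order and keeps a separate vectorIndex because constant and metadata columns (identity partition values, row position, is-deleted) are synthesized rather than read, so they must not consume a physical ORC vector. A toy illustration of that bookkeeping (field names and values are made up):

import java.util.List;
import java.util.Map;

public class ConstantColumnBookkeeping {
  public static void main(String[] args) {
    List<String> expectedFields = List.of("id", "event_date", "data");
    Map<String, Object> idToConstant = Map.of("event_date", "2021-11-01"); // identity partition value
    List<String> physicalVectors = List.of("orcVector[0] (id)", "orcVector[1] (data)");

    int vectorIndex = 0;
    for (String field : expectedFields) {
      if (idToConstant.containsKey(field)) {
        System.out.println(field + " -> constant " + idToConstant.get(field));
      } else {
        // Only real columns advance the physical vector index.
        System.out.println(field + " -> " + physicalVectors.get(vectorIndex));
        vectorIndex++;
      }
    }
  }
}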
*/ - package org.apache.iceberg.spark.data.vectorized; import java.util.Map; @@ -28,13 +27,10 @@ public class VectorizedSparkParquetReaders { - private VectorizedSparkParquetReaders() { - } + private VectorizedSparkParquetReaders() {} public static ColumnarBatchReader buildReader( - Schema expectedSchema, - MessageType fileSchema, - boolean setArrowValidityVector) { + Schema expectedSchema, MessageType fileSchema, boolean setArrowValidityVector) { return buildReader(expectedSchema, fileSchema, setArrowValidityVector, Maps.newHashMap()); } @@ -44,9 +40,14 @@ public static ColumnarBatchReader buildReader( boolean setArrowValidityVector, Map idToConstant) { return (ColumnarBatchReader) - TypeWithSchemaVisitor.visit(expectedSchema.asStruct(), fileSchema, + TypeWithSchemaVisitor.visit( + expectedSchema.asStruct(), + fileSchema, new VectorizedReaderBuilder( - expectedSchema, fileSchema, setArrowValidityVector, - idToConstant, ColumnarBatchReader::new)); + expectedSchema, + fileSchema, + setArrowValidityVector, + idToConstant, + ColumnarBatchReader::new)); } } diff --git a/spark/v3.1/spark/src/main/java/org/apache/iceberg/spark/procedures/AddFilesProcedure.java b/spark/v3.1/spark/src/main/java/org/apache/iceberg/spark/procedures/AddFilesProcedure.java index 6cb74aebfa09..3b29c12b7197 100644 --- a/spark/v3.1/spark/src/main/java/org/apache/iceberg/spark/procedures/AddFilesProcedure.java +++ b/spark/v3.1/spark/src/main/java/org/apache/iceberg/spark/procedures/AddFilesProcedure.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.procedures; import java.util.Collections; @@ -56,16 +55,19 @@ class AddFilesProcedure extends BaseProcedure { private static final Joiner.MapJoiner MAP_JOINER = Joiner.on(",").withKeyValueSeparator("="); - private static final ProcedureParameter[] PARAMETERS = new ProcedureParameter[]{ - ProcedureParameter.required("table", DataTypes.StringType), - ProcedureParameter.required("source_table", DataTypes.StringType), - ProcedureParameter.optional("partition_filter", STRING_MAP), - ProcedureParameter.optional("check_duplicate_files", DataTypes.BooleanType) - }; - - private static final StructType OUTPUT_TYPE = new StructType(new StructField[]{ - new StructField("added_files_count", DataTypes.LongType, false, Metadata.empty()) - }); + private static final ProcedureParameter[] PARAMETERS = + new ProcedureParameter[] { + ProcedureParameter.required("table", DataTypes.StringType), + ProcedureParameter.required("source_table", DataTypes.StringType), + ProcedureParameter.optional("partition_filter", STRING_MAP), + ProcedureParameter.optional("check_duplicate_files", DataTypes.BooleanType) + }; + + private static final StructType OUTPUT_TYPE = + new StructType( + new StructField[] { + new StructField("added_files_count", DataTypes.LongType, false, Metadata.empty()) + }); private AddFilesProcedure(TableCatalog tableCatalog) { super(tableCatalog); @@ -95,15 +97,19 @@ public InternalRow[] call(InternalRow args) { Identifier tableIdent = toIdentifier(args.getString(0), PARAMETERS[0].name()); CatalogPlugin sessionCat = spark().sessionState().catalogManager().v2SessionCatalog(); - Identifier sourceIdent = toCatalogAndIdentifier(args.getString(1), PARAMETERS[1].name(), sessionCat).identifier(); + Identifier sourceIdent = + toCatalogAndIdentifier(args.getString(1), PARAMETERS[1].name(), sessionCat).identifier(); Map partitionFilter = Maps.newHashMap(); if (!args.isNullAt(2)) { - 
args.getMap(2).foreach(DataTypes.StringType, DataTypes.StringType, - (k, v) -> { - partitionFilter.put(k.toString(), v.toString()); - return BoxedUnit.UNIT; - }); + args.getMap(2) + .foreach( + DataTypes.StringType, + DataTypes.StringType, + (k, v) -> { + partitionFilter.put(k.toString(), v.toString()); + return BoxedUnit.UNIT; + }); } boolean checkDuplicateFiles; @@ -113,36 +119,42 @@ public InternalRow[] call(InternalRow args) { checkDuplicateFiles = args.getBoolean(3); } - long addedFilesCount = importToIceberg(tableIdent, sourceIdent, partitionFilter, checkDuplicateFiles); - return new InternalRow[]{newInternalRow(addedFilesCount)}; + long addedFilesCount = + importToIceberg(tableIdent, sourceIdent, partitionFilter, checkDuplicateFiles); + return new InternalRow[] {newInternalRow(addedFilesCount)}; } private boolean isFileIdentifier(Identifier ident) { String[] namespace = ident.namespace(); - return namespace.length == 1 && - (namespace[0].equalsIgnoreCase("orc") || - namespace[0].equalsIgnoreCase("parquet") || - namespace[0].equalsIgnoreCase("avro")); + return namespace.length == 1 + && (namespace[0].equalsIgnoreCase("orc") + || namespace[0].equalsIgnoreCase("parquet") + || namespace[0].equalsIgnoreCase("avro")); } - private long importToIceberg(Identifier destIdent, Identifier sourceIdent, Map partitionFilter, - boolean checkDuplicateFiles) { - return modifyIcebergTable(destIdent, table -> { - - validatePartitionSpec(table, partitionFilter); - ensureNameMappingPresent(table); - - if (isFileIdentifier(sourceIdent)) { - Path sourcePath = new Path(sourceIdent.name()); - String format = sourceIdent.namespace()[0]; - importFileTable(table, sourcePath, format, partitionFilter, checkDuplicateFiles); - } else { - importCatalogTable(table, sourceIdent, partitionFilter, checkDuplicateFiles); - } - - Snapshot snapshot = table.currentSnapshot(); - return Long.parseLong(snapshot.summary().getOrDefault(SnapshotSummary.ADDED_FILES_PROP, "0")); - }); + private long importToIceberg( + Identifier destIdent, + Identifier sourceIdent, + Map partitionFilter, + boolean checkDuplicateFiles) { + return modifyIcebergTable( + destIdent, + table -> { + validatePartitionSpec(table, partitionFilter); + ensureNameMappingPresent(table); + + if (isFileIdentifier(sourceIdent)) { + Path sourcePath = new Path(sourceIdent.name()); + String format = sourceIdent.namespace()[0]; + importFileTable(table, sourcePath, format, partitionFilter, checkDuplicateFiles); + } else { + importCatalogTable(table, sourceIdent, partitionFilter, checkDuplicateFiles); + } + + Snapshot snapshot = table.currentSnapshot(); + return Long.parseLong( + snapshot.summary().getOrDefault(SnapshotSummary.ADDED_FILES_PROP, "0")); + }); } private static void ensureNameMappingPresent(Table table) { @@ -150,49 +162,64 @@ private static void ensureNameMappingPresent(Table table) { // Forces Name based resolution instead of position based resolution NameMapping mapping = MappingUtil.create(table.schema()); String mappingJson = NameMappingParser.toJson(mapping); - table.updateProperties() - .set(TableProperties.DEFAULT_NAME_MAPPING, mappingJson) - .commit(); + table.updateProperties().set(TableProperties.DEFAULT_NAME_MAPPING, mappingJson).commit(); } } - private void importFileTable(Table table, Path tableLocation, String format, Map partitionFilter, - boolean checkDuplicateFiles) { + private void importFileTable( + Table table, + Path tableLocation, + String format, + Map partitionFilter, + boolean checkDuplicateFiles) { // List Partitions via Spark 
InMemory file search interface List partitions = Spark3Util.getPartitions(spark(), tableLocation, format); if (table.spec().isUnpartitioned()) { - Preconditions.checkArgument(partitions.isEmpty(), "Cannot add partitioned files to an unpartitioned table"); - Preconditions.checkArgument(partitionFilter.isEmpty(), "Cannot use a partition filter when importing" + - "to an unpartitioned table"); + Preconditions.checkArgument( + partitions.isEmpty(), "Cannot add partitioned files to an unpartitioned table"); + Preconditions.checkArgument( + partitionFilter.isEmpty(), + "Cannot use a partition filter when importing" + "to an unpartitioned table"); // Build a Global Partition for the source - SparkPartition partition = new SparkPartition(Collections.emptyMap(), tableLocation.toString(), format); + SparkPartition partition = + new SparkPartition(Collections.emptyMap(), tableLocation.toString(), format); importPartitions(table, ImmutableList.of(partition), checkDuplicateFiles); } else { - Preconditions.checkArgument(!partitions.isEmpty(), - "Cannot find any partitions in table %s", partitions); - List filteredPartitions = SparkTableUtil.filterPartitions(partitions, partitionFilter); - Preconditions.checkArgument(!filteredPartitions.isEmpty(), + Preconditions.checkArgument( + !partitions.isEmpty(), "Cannot find any partitions in table %s", partitions); + List filteredPartitions = + SparkTableUtil.filterPartitions(partitions, partitionFilter); + Preconditions.checkArgument( + !filteredPartitions.isEmpty(), "Cannot find any partitions which match the given filter. Partition filter is %s", MAP_JOINER.join(partitionFilter)); importPartitions(table, filteredPartitions, checkDuplicateFiles); } } - private void importCatalogTable(Table table, Identifier sourceIdent, Map partitionFilter, - boolean checkDuplicateFiles) { + private void importCatalogTable( + Table table, + Identifier sourceIdent, + Map partitionFilter, + boolean checkDuplicateFiles) { String stagingLocation = getMetadataLocation(table); TableIdentifier sourceTableIdentifier = Spark3Util.toV1TableIdentifier(sourceIdent); - SparkTableUtil.importSparkTable(spark(), sourceTableIdentifier, table, stagingLocation, partitionFilter, + SparkTableUtil.importSparkTable( + spark(), + sourceTableIdentifier, + table, + stagingLocation, + partitionFilter, checkDuplicateFiles); } - private void importPartitions(Table table, List partitions, - boolean checkDuplicateFiles) { + private void importPartitions( + Table table, List partitions, boolean checkDuplicateFiles) { String stagingLocation = getMetadataLocation(table); - SparkTableUtil.importSparkPartitions(spark(), partitions, table, table.spec(), stagingLocation, - checkDuplicateFiles); + SparkTableUtil.importSparkPartitions( + spark(), partitions, table, table.spec(), stagingLocation, checkDuplicateFiles); } private String getMetadataLocation(Table table) { @@ -207,38 +234,51 @@ public String description() { private void validatePartitionSpec(Table table, Map partitionFilter) { List partitionFields = table.spec().fields(); - Set partitionNames = table.spec().fields().stream().map(PartitionField::name).collect(Collectors.toSet()); + Set partitionNames = + table.spec().fields().stream().map(PartitionField::name).collect(Collectors.toSet()); boolean tablePartitioned = !partitionFields.isEmpty(); boolean partitionSpecPassed = !partitionFilter.isEmpty(); // Check for any non-identity partition columns - List nonIdentityFields = partitionFields.stream() - .filter(x -> !x.transform().isIdentity()) - 
.collect(Collectors.toList()); - Preconditions.checkArgument(nonIdentityFields.isEmpty(), - "Cannot add data files to target table %s because that table is partitioned and contains non-identity" + - "partition transforms which will not be compatible. Found non-identity fields %s", - table.name(), nonIdentityFields); + List nonIdentityFields = + partitionFields.stream() + .filter(x -> !x.transform().isIdentity()) + .collect(Collectors.toList()); + Preconditions.checkArgument( + nonIdentityFields.isEmpty(), + "Cannot add data files to target table %s because that table is partitioned and contains non-identity" + + "partition transforms which will not be compatible. Found non-identity fields %s", + table.name(), + nonIdentityFields); if (tablePartitioned && partitionSpecPassed) { // Check to see there are sufficient partition columns to satisfy the filter - Preconditions.checkArgument(partitionFields.size() >= partitionFilter.size(), - "Cannot add data files to target table %s because that table is partitioned, " + - "but the number of columns in the provided partition filter (%s) " + - "is greater than the number of partitioned columns in table (%s)", - table.name(), partitionFilter.size(), partitionFields.size()); + Preconditions.checkArgument( + partitionFields.size() >= partitionFilter.size(), + "Cannot add data files to target table %s because that table is partitioned, " + + "but the number of columns in the provided partition filter (%s) " + + "is greater than the number of partitioned columns in table (%s)", + table.name(), + partitionFilter.size(), + partitionFields.size()); // Check for any filters of non existent columns - List unMatchedFilters = partitionFilter.keySet().stream() - .filter(filterName -> !partitionNames.contains(filterName)) - .collect(Collectors.toList()); - Preconditions.checkArgument(unMatchedFilters.isEmpty(), - "Cannot add files to target table %s. %s is partitioned but the specified partition filter " + - "refers to columns that are not partitioned: '%s' . Valid partition columns %s", - table.name(), table.name(), unMatchedFilters, String.join(",", partitionNames)); + List unMatchedFilters = + partitionFilter.keySet().stream() + .filter(filterName -> !partitionNames.contains(filterName)) + .collect(Collectors.toList()); + Preconditions.checkArgument( + unMatchedFilters.isEmpty(), + "Cannot add files to target table %s. %s is partitioned but the specified partition filter " + + "refers to columns that are not partitioned: '%s' . Valid partition columns %s", + table.name(), + table.name(), + unMatchedFilters, + String.join(",", partitionNames)); } else { - Preconditions.checkArgument(!partitionSpecPassed, + Preconditions.checkArgument( + !partitionSpecPassed, "Cannot use partition filter with an unpartitioned table %s", table.name()); } diff --git a/spark/v3.1/spark/src/main/java/org/apache/iceberg/spark/procedures/AncestorsOfProcedure.java b/spark/v3.1/spark/src/main/java/org/apache/iceberg/spark/procedures/AncestorsOfProcedure.java index 1ae3715dddd6..e5233bf34753 100644 --- a/spark/v3.1/spark/src/main/java/org/apache/iceberg/spark/procedures/AncestorsOfProcedure.java +++ b/spark/v3.1/spark/src/main/java/org/apache/iceberg/spark/procedures/AncestorsOfProcedure.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
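The validation above leans on Guava's Preconditions.checkArgument with %s templates, which defers message formatting until a check actually fails. A minimal sketch of that pattern (assumes plain Guava on the classpath; Iceberg itself uses its relocated copy, and the message text here is illustrative rather than the procedure's exact wording):

import com.google.common.base.Preconditions;

public class PreconditionsSketch {
  static void validateFilterSize(String tableName, int filterColumns, int partitionColumns) {
    Preconditions.checkArgument(
        partitionColumns >= filterColumns,
        "Cannot add data files to target table %s: the partition filter has %s columns "
            + "but the table has only %s partition columns",
        tableName,
        filterColumns,
        partitionColumns);
  }

  public static void main(String[] args) {
    validateFilterSize("db.events", 1, 2); // passes silently
    try {
      validateFilterSize("db.events", 3, 2);
    } catch (IllegalArgumentException e) {
      System.out.println(e.getMessage());
    }
  }
}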
*/ - package org.apache.iceberg.spark.procedures; import java.util.List; @@ -35,15 +34,18 @@ public class AncestorsOfProcedure extends BaseProcedure { - private static final ProcedureParameter[] PARAMETERS = new ProcedureParameter[] { - ProcedureParameter.required("table", DataTypes.StringType), - ProcedureParameter.optional("snapshot_id", DataTypes.LongType), + private static final ProcedureParameter[] PARAMETERS = + new ProcedureParameter[] { + ProcedureParameter.required("table", DataTypes.StringType), + ProcedureParameter.optional("snapshot_id", DataTypes.LongType), }; - private static final StructType OUTPUT_TYPE = new StructType(new StructField[] { - new StructField("snapshot_id", DataTypes.LongType, true, Metadata.empty()), - new StructField("timestamp", DataTypes.LongType, true, Metadata.empty()) - }); + private static final StructType OUTPUT_TYPE = + new StructType( + new StructField[] { + new StructField("snapshot_id", DataTypes.LongType, true, Metadata.empty()), + new StructField("timestamp", DataTypes.LongType, true, Metadata.empty()) + }); private AncestorsOfProcedure(TableCatalog tableCatalog) { super(tableCatalog); @@ -77,11 +79,13 @@ public InternalRow[] call(InternalRow args) { Table icebergTable = sparkTable.table(); if (toSnapshotId == null) { - toSnapshotId = icebergTable.currentSnapshot() != null ? icebergTable.currentSnapshot().snapshotId() : -1; + toSnapshotId = + icebergTable.currentSnapshot() != null ? icebergTable.currentSnapshot().snapshotId() : -1; } - List snapshotIds = Lists.newArrayList( - SnapshotUtil.ancestorIdsBetween(toSnapshotId, null, icebergTable::snapshot)); + List snapshotIds = + Lists.newArrayList( + SnapshotUtil.ancestorIdsBetween(toSnapshotId, null, icebergTable::snapshot)); return toOutputRow(icebergTable, snapshotIds); } diff --git a/spark/v3.1/spark/src/main/java/org/apache/iceberg/spark/procedures/BaseProcedure.java b/spark/v3.1/spark/src/main/java/org/apache/iceberg/spark/procedures/BaseProcedure.java index bbe1fcd72cdc..90231ef6815b 100644 --- a/spark/v3.1/spark/src/main/java/org/apache/iceberg/spark/procedures/BaseProcedure.java +++ b/spark/v3.1/spark/src/main/java/org/apache/iceberg/spark/procedures/BaseProcedure.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.procedures; import java.util.function.Function; @@ -43,7 +42,8 @@ import scala.Option; abstract class BaseProcedure implements Procedure { - protected static final DataType STRING_MAP = DataTypes.createMapType(DataTypes.StringType, DataTypes.StringType); + protected static final DataType STRING_MAP = + DataTypes.createMapType(DataTypes.StringType, DataTypes.StringType); private final SparkSession spark; private final TableCatalog tableCatalog; @@ -78,7 +78,8 @@ protected T withIcebergTable(Identifier ident, Function T execute(Identifier ident, boolean refreshSparkCache, Function func) { + private T execute( + Identifier ident, boolean refreshSparkCache, Function func) { SparkTable sparkTable = loadSparkTable(ident); org.apache.iceberg.Table icebergTable = sparkTable.table(); @@ -92,38 +93,47 @@ private T execute(Identifier ident, boolean refreshSparkCache, Function - * Note: this procedure invalidates all cached Spark plans that reference the affected table. + * A procedure that applies changes in a given snapshot and creates a new snapshot which will be set + * as the current snapshot in a table. + * + *

    Note: this procedure invalidates all cached Spark plans that reference the affected + * table. * * @see org.apache.iceberg.ManageSnapshots#cherrypick(long) */ class CherrypickSnapshotProcedure extends BaseProcedure { - private static final ProcedureParameter[] PARAMETERS = new ProcedureParameter[]{ - ProcedureParameter.required("table", DataTypes.StringType), - ProcedureParameter.required("snapshot_id", DataTypes.LongType) - }; + private static final ProcedureParameter[] PARAMETERS = + new ProcedureParameter[] { + ProcedureParameter.required("table", DataTypes.StringType), + ProcedureParameter.required("snapshot_id", DataTypes.LongType) + }; - private static final StructType OUTPUT_TYPE = new StructType(new StructField[]{ - new StructField("source_snapshot_id", DataTypes.LongType, false, Metadata.empty()), - new StructField("current_snapshot_id", DataTypes.LongType, false, Metadata.empty()) - }); + private static final StructType OUTPUT_TYPE = + new StructType( + new StructField[] { + new StructField("source_snapshot_id", DataTypes.LongType, false, Metadata.empty()), + new StructField("current_snapshot_id", DataTypes.LongType, false, Metadata.empty()) + }); public static ProcedureBuilder builder() { return new BaseProcedure.Builder() { @@ -78,16 +81,16 @@ public InternalRow[] call(InternalRow args) { Identifier tableIdent = toIdentifier(args.getString(0), PARAMETERS[0].name()); long snapshotId = args.getLong(1); - return modifyIcebergTable(tableIdent, table -> { - table.manageSnapshots() - .cherrypick(snapshotId) - .commit(); + return modifyIcebergTable( + tableIdent, + table -> { + table.manageSnapshots().cherrypick(snapshotId).commit(); - Snapshot currentSnapshot = table.currentSnapshot(); + Snapshot currentSnapshot = table.currentSnapshot(); - InternalRow outputRow = newInternalRow(snapshotId, currentSnapshot.snapshotId()); - return new InternalRow[]{outputRow}; - }); + InternalRow outputRow = newInternalRow(snapshotId, currentSnapshot.snapshotId()); + return new InternalRow[] {outputRow}; + }); } @Override diff --git a/spark/v3.1/spark/src/main/java/org/apache/iceberg/spark/procedures/ExpireSnapshotsProcedure.java b/spark/v3.1/spark/src/main/java/org/apache/iceberg/spark/procedures/ExpireSnapshotsProcedure.java index 9c3a8e99c87a..ad8f6f567d88 100644 --- a/spark/v3.1/spark/src/main/java/org/apache/iceberg/spark/procedures/ExpireSnapshotsProcedure.java +++ b/spark/v3.1/spark/src/main/java/org/apache/iceberg/spark/procedures/ExpireSnapshotsProcedure.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
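CherrypickSnapshotProcedure above is a thin wrapper over the table API call it performs: manageSnapshots().cherrypick(snapshotId).commit(). A sketch of driving that API directly; the catalog setup, warehouse path, table name, and snapshot id are all hypothetical:

import org.apache.hadoop.conf.Configuration;
import org.apache.iceberg.Table;
import org.apache.iceberg.catalog.TableIdentifier;
import org.apache.iceberg.hadoop.HadoopCatalog;

public class CherrypickSketch {
  public static void main(String[] args) {
    // Hypothetical Hadoop catalog and table; any Iceberg catalog exposes the same Table API.
    HadoopCatalog catalog = new HadoopCatalog(new Configuration(), "file:///tmp/warehouse");
    Table table = catalog.loadTable(TableIdentifier.of("db", "events"));

    long stagedSnapshotId = 1234567890L; // e.g. a snapshot staged by a WAP write
    table.manageSnapshots().cherrypick(stagedSnapshotId).commit();

    System.out.println("current snapshot: " + table.currentSnapshot().snapshotId());
  }
}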
*/ - package org.apache.iceberg.spark.procedures; import java.util.concurrent.ExecutorService; @@ -47,19 +46,24 @@ */ public class ExpireSnapshotsProcedure extends BaseProcedure { - private static final ProcedureParameter[] PARAMETERS = new ProcedureParameter[] { - ProcedureParameter.required("table", DataTypes.StringType), - ProcedureParameter.optional("older_than", DataTypes.TimestampType), - ProcedureParameter.optional("retain_last", DataTypes.IntegerType), - ProcedureParameter.optional("max_concurrent_deletes", DataTypes.IntegerType), - ProcedureParameter.optional("stream_results", DataTypes.BooleanType) - }; - - private static final StructType OUTPUT_TYPE = new StructType(new StructField[]{ - new StructField("deleted_data_files_count", DataTypes.LongType, true, Metadata.empty()), - new StructField("deleted_manifest_files_count", DataTypes.LongType, true, Metadata.empty()), - new StructField("deleted_manifest_lists_count", DataTypes.LongType, true, Metadata.empty()) - }); + private static final ProcedureParameter[] PARAMETERS = + new ProcedureParameter[] { + ProcedureParameter.required("table", DataTypes.StringType), + ProcedureParameter.optional("older_than", DataTypes.TimestampType), + ProcedureParameter.optional("retain_last", DataTypes.IntegerType), + ProcedureParameter.optional("max_concurrent_deletes", DataTypes.IntegerType), + ProcedureParameter.optional("stream_results", DataTypes.BooleanType) + }; + + private static final StructType OUTPUT_TYPE = + new StructType( + new StructField[] { + new StructField("deleted_data_files_count", DataTypes.LongType, true, Metadata.empty()), + new StructField( + "deleted_manifest_files_count", DataTypes.LongType, true, Metadata.empty()), + new StructField( + "deleted_manifest_lists_count", DataTypes.LongType, true, Metadata.empty()) + }); public static ProcedureBuilder builder() { return new BaseProcedure.Builder() { @@ -92,41 +96,45 @@ public InternalRow[] call(InternalRow args) { Integer maxConcurrentDeletes = args.isNullAt(3) ? null : args.getInt(3); Boolean streamResult = args.isNullAt(4) ? 
null : args.getBoolean(4); - Preconditions.checkArgument(maxConcurrentDeletes == null || maxConcurrentDeletes > 0, + Preconditions.checkArgument( + maxConcurrentDeletes == null || maxConcurrentDeletes > 0, "max_concurrent_deletes should have value > 0, value: " + maxConcurrentDeletes); - return modifyIcebergTable(tableIdent, table -> { - ExpireSnapshots action = actions().expireSnapshots(table); + return modifyIcebergTable( + tableIdent, + table -> { + ExpireSnapshots action = actions().expireSnapshots(table); - if (olderThanMillis != null) { - action.expireOlderThan(olderThanMillis); - } + if (olderThanMillis != null) { + action.expireOlderThan(olderThanMillis); + } - if (retainLastNum != null) { - action.retainLast(retainLastNum); - } + if (retainLastNum != null) { + action.retainLast(retainLastNum); + } - if (maxConcurrentDeletes != null && maxConcurrentDeletes > 0) { - action.executeDeleteWith(expireService(maxConcurrentDeletes)); - } + if (maxConcurrentDeletes != null && maxConcurrentDeletes > 0) { + action.executeDeleteWith(expireService(maxConcurrentDeletes)); + } - if (streamResult != null) { - action.option(BaseExpireSnapshotsSparkAction.STREAM_RESULTS, Boolean.toString(streamResult)); - } + if (streamResult != null) { + action.option( + BaseExpireSnapshotsSparkAction.STREAM_RESULTS, Boolean.toString(streamResult)); + } - ExpireSnapshots.Result result = action.execute(); + ExpireSnapshots.Result result = action.execute(); - return toOutputRows(result); - }); + return toOutputRows(result); + }); } private InternalRow[] toOutputRows(ExpireSnapshots.Result result) { - InternalRow row = newInternalRow( - result.deletedDataFilesCount(), - result.deletedManifestsCount(), - result.deletedManifestListsCount() - ); - return new InternalRow[]{row}; + InternalRow row = + newInternalRow( + result.deletedDataFilesCount(), + result.deletedManifestsCount(), + result.deletedManifestListsCount()); + return new InternalRow[] {row}; } @Override @@ -136,10 +144,9 @@ public String description() { private ExecutorService expireService(int concurrentDeletes) { return MoreExecutors.getExitingExecutorService( - (ThreadPoolExecutor) Executors.newFixedThreadPool( - concurrentDeletes, - new ThreadFactoryBuilder() - .setNameFormat("expire-snapshots-%d") - .build())); + (ThreadPoolExecutor) + Executors.newFixedThreadPool( + concurrentDeletes, + new ThreadFactoryBuilder().setNameFormat("expire-snapshots-%d").build())); } } diff --git a/spark/v3.1/spark/src/main/java/org/apache/iceberg/spark/procedures/MigrateTableProcedure.java b/spark/v3.1/spark/src/main/java/org/apache/iceberg/spark/procedures/MigrateTableProcedure.java index 2f6841924f8c..a49dd7d526b0 100644 --- a/spark/v3.1/spark/src/main/java/org/apache/iceberg/spark/procedures/MigrateTableProcedure.java +++ b/spark/v3.1/spark/src/main/java/org/apache/iceberg/spark/procedures/MigrateTableProcedure.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
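The expire_snapshots procedure above accepts the optional arguments listed in PARAMETERS (older_than, retain_last, max_concurrent_deletes, stream_results). A hedged sketch using named arguments, with the same placeholder catalog and table as before:

// Expire snapshots older than the cutoff while retaining at least 10, using 4 delete threads.
spark.sql("CALL my_catalog.system.expire_snapshots("
    + "table => 'db.sample', older_than => TIMESTAMP '2021-06-30 00:00:00', "
    + "retain_last => 10, max_concurrent_deletes => 4)");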
*/ - package org.apache.iceberg.spark.procedures; import java.util.Map; @@ -35,14 +34,17 @@ import scala.runtime.BoxedUnit; class MigrateTableProcedure extends BaseProcedure { - private static final ProcedureParameter[] PARAMETERS = new ProcedureParameter[]{ - ProcedureParameter.required("table", DataTypes.StringType), - ProcedureParameter.optional("properties", STRING_MAP) - }; + private static final ProcedureParameter[] PARAMETERS = + new ProcedureParameter[] { + ProcedureParameter.required("table", DataTypes.StringType), + ProcedureParameter.optional("properties", STRING_MAP) + }; - private static final StructType OUTPUT_TYPE = new StructType(new StructField[]{ - new StructField("migrated_files_count", DataTypes.LongType, false, Metadata.empty()) - }); + private static final StructType OUTPUT_TYPE = + new StructType( + new StructField[] { + new StructField("migrated_files_count", DataTypes.LongType, false, Metadata.empty()) + }); private MigrateTableProcedure(TableCatalog tableCatalog) { super(tableCatalog); @@ -70,19 +72,24 @@ public StructType outputType() { @Override public InternalRow[] call(InternalRow args) { String tableName = args.getString(0); - Preconditions.checkArgument(tableName != null && !tableName.isEmpty(), + Preconditions.checkArgument( + tableName != null && !tableName.isEmpty(), "Cannot handle an empty identifier for argument table"); Map properties = Maps.newHashMap(); if (!args.isNullAt(1)) { - args.getMap(1).foreach(DataTypes.StringType, DataTypes.StringType, - (k, v) -> { - properties.put(k.toString(), v.toString()); - return BoxedUnit.UNIT; - }); + args.getMap(1) + .foreach( + DataTypes.StringType, + DataTypes.StringType, + (k, v) -> { + properties.put(k.toString(), v.toString()); + return BoxedUnit.UNIT; + }); } - MigrateTable.Result result = SparkActions.get().migrateTable(tableName).tableProperties(properties).execute(); + MigrateTable.Result result = + SparkActions.get().migrateTable(tableName).tableProperties(properties).execute(); return new InternalRow[] {newInternalRow(result.migratedDataFilesCount())}; } diff --git a/spark/v3.1/spark/src/main/java/org/apache/iceberg/spark/procedures/PublishChangesProcedure.java b/spark/v3.1/spark/src/main/java/org/apache/iceberg/spark/procedures/PublishChangesProcedure.java index a7d8b344a8db..a216e5abaf7f 100644 --- a/spark/v3.1/spark/src/main/java/org/apache/iceberg/spark/procedures/PublishChangesProcedure.java +++ b/spark/v3.1/spark/src/main/java/org/apache/iceberg/spark/procedures/PublishChangesProcedure.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.procedures; import java.util.Optional; @@ -35,24 +34,28 @@ import org.apache.spark.sql.types.StructType; /** - * A procedure that applies changes in a snapshot created within a Write-Audit-Publish workflow with a wap_id and - * creates a new snapshot which will be set as the current snapshot in a table. - *

    - * Note: this procedure invalidates all cached Spark plans that reference the affected table. + * A procedure that applies changes in a snapshot created within a Write-Audit-Publish workflow with + * a wap_id and creates a new snapshot which will be set as the current snapshot in a table. + * + *

    Note: this procedure invalidates all cached Spark plans that reference the affected + * table. * * @see org.apache.iceberg.ManageSnapshots#cherrypick(long) */ class PublishChangesProcedure extends BaseProcedure { - private static final ProcedureParameter[] PARAMETERS = new ProcedureParameter[]{ - ProcedureParameter.required("table", DataTypes.StringType), - ProcedureParameter.required("wap_id", DataTypes.StringType) - }; + private static final ProcedureParameter[] PARAMETERS = + new ProcedureParameter[] { + ProcedureParameter.required("table", DataTypes.StringType), + ProcedureParameter.required("wap_id", DataTypes.StringType) + }; - private static final StructType OUTPUT_TYPE = new StructType(new StructField[]{ - new StructField("source_snapshot_id", DataTypes.LongType, false, Metadata.empty()), - new StructField("current_snapshot_id", DataTypes.LongType, false, Metadata.empty()) - }); + private static final StructType OUTPUT_TYPE = + new StructType( + new StructField[] { + new StructField("source_snapshot_id", DataTypes.LongType, false, Metadata.empty()), + new StructField("current_snapshot_id", DataTypes.LongType, false, Metadata.empty()) + }); public static ProcedureBuilder builder() { return new Builder() { @@ -82,23 +85,27 @@ public InternalRow[] call(InternalRow args) { Identifier tableIdent = toIdentifier(args.getString(0), PARAMETERS[0].name()); String wapId = args.getString(1); - return modifyIcebergTable(tableIdent, table -> { - Optional wapSnapshot = Optional.ofNullable( - Iterables.find(table.snapshots(), snapshot -> wapId.equals(WapUtil.stagedWapId(snapshot)), null)); - if (!wapSnapshot.isPresent()) { - throw new ValidationException(String.format("Cannot apply unknown WAP ID '%s'", wapId)); - } + return modifyIcebergTable( + tableIdent, + table -> { + Optional wapSnapshot = + Optional.ofNullable( + Iterables.find( + table.snapshots(), + snapshot -> wapId.equals(WapUtil.stagedWapId(snapshot)), + null)); + if (!wapSnapshot.isPresent()) { + throw new ValidationException(String.format("Cannot apply unknown WAP ID '%s'", wapId)); + } - long wapSnapshotId = wapSnapshot.get().snapshotId(); - table.manageSnapshots() - .cherrypick(wapSnapshotId) - .commit(); + long wapSnapshotId = wapSnapshot.get().snapshotId(); + table.manageSnapshots().cherrypick(wapSnapshotId).commit(); - Snapshot currentSnapshot = table.currentSnapshot(); + Snapshot currentSnapshot = table.currentSnapshot(); - InternalRow outputRow = newInternalRow(wapSnapshotId, currentSnapshot.snapshotId()); - return new InternalRow[]{outputRow}; - }); + InternalRow outputRow = newInternalRow(wapSnapshotId, currentSnapshot.snapshotId()); + return new InternalRow[] {outputRow}; + }); } @Override diff --git a/spark/v3.1/spark/src/main/java/org/apache/iceberg/spark/procedures/RemoveOrphanFilesProcedure.java b/spark/v3.1/spark/src/main/java/org/apache/iceberg/spark/procedures/RemoveOrphanFilesProcedure.java index 1f6397ae4ddd..3fd29254ff3c 100644 --- a/spark/v3.1/spark/src/main/java/org/apache/iceberg/spark/procedures/RemoveOrphanFilesProcedure.java +++ b/spark/v3.1/spark/src/main/java/org/apache/iceberg/spark/procedures/RemoveOrphanFilesProcedure.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
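Both procedures diffed above (migrate and publish_changes) map to Spark SQL calls as well; a sketch with placeholder identifiers, noting that migrate is usually issued against the session catalog that owns the source table, while publish_changes takes the WAP id under which the snapshot was staged:

// Replace a Spark/Hive table with an Iceberg table over the same data files.
spark.sql("CALL spark_catalog.system.migrate('db.sample')");
// Publish the snapshot that was staged under the given write-audit-publish id.
spark.sql("CALL my_catalog.system.publish_changes('db.sample', 'wap-id-1')");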
*/ - package org.apache.iceberg.spark.procedures; import java.util.concurrent.ExecutorService; @@ -49,17 +48,20 @@ */ public class RemoveOrphanFilesProcedure extends BaseProcedure { - private static final ProcedureParameter[] PARAMETERS = new ProcedureParameter[]{ - ProcedureParameter.required("table", DataTypes.StringType), - ProcedureParameter.optional("older_than", DataTypes.TimestampType), - ProcedureParameter.optional("location", DataTypes.StringType), - ProcedureParameter.optional("dry_run", DataTypes.BooleanType), - ProcedureParameter.optional("max_concurrent_deletes", DataTypes.IntegerType) - }; - - private static final StructType OUTPUT_TYPE = new StructType(new StructField[]{ - new StructField("orphan_file_location", DataTypes.StringType, false, Metadata.empty()) - }); + private static final ProcedureParameter[] PARAMETERS = + new ProcedureParameter[] { + ProcedureParameter.required("table", DataTypes.StringType), + ProcedureParameter.optional("older_than", DataTypes.TimestampType), + ProcedureParameter.optional("location", DataTypes.StringType), + ProcedureParameter.optional("dry_run", DataTypes.BooleanType), + ProcedureParameter.optional("max_concurrent_deletes", DataTypes.IntegerType) + }; + + private static final StructType OUTPUT_TYPE = + new StructType( + new StructField[] { + new StructField("orphan_file_location", DataTypes.StringType, false, Metadata.empty()) + }); public static ProcedureBuilder builder() { return new BaseProcedure.Builder() { @@ -92,36 +94,39 @@ public InternalRow[] call(InternalRow args) { boolean dryRun = args.isNullAt(3) ? false : args.getBoolean(3); Integer maxConcurrentDeletes = args.isNullAt(4) ? null : args.getInt(4); - Preconditions.checkArgument(maxConcurrentDeletes == null || maxConcurrentDeletes > 0, - "max_concurrent_deletes should have value > 0, value: " + maxConcurrentDeletes); + Preconditions.checkArgument( + maxConcurrentDeletes == null || maxConcurrentDeletes > 0, + "max_concurrent_deletes should have value > 0, value: " + maxConcurrentDeletes); - return withIcebergTable(tableIdent, table -> { - DeleteOrphanFiles action = actions().deleteOrphanFiles(table); + return withIcebergTable( + tableIdent, + table -> { + DeleteOrphanFiles action = actions().deleteOrphanFiles(table); - if (olderThanMillis != null) { - boolean isTesting = Boolean.parseBoolean(spark().conf().get("spark.testing", "false")); - if (!isTesting) { - validateInterval(olderThanMillis); - } - action.olderThan(olderThanMillis); - } + if (olderThanMillis != null) { + boolean isTesting = Boolean.parseBoolean(spark().conf().get("spark.testing", "false")); + if (!isTesting) { + validateInterval(olderThanMillis); + } + action.olderThan(olderThanMillis); + } - if (location != null) { - action.location(location); - } + if (location != null) { + action.location(location); + } - if (dryRun) { - action.deleteWith(file -> { }); - } + if (dryRun) { + action.deleteWith(file -> {}); + } - if (maxConcurrentDeletes != null && maxConcurrentDeletes > 0) { - action.executeDeleteWith(removeService(maxConcurrentDeletes)); - } + if (maxConcurrentDeletes != null && maxConcurrentDeletes > 0) { + action.executeDeleteWith(removeService(maxConcurrentDeletes)); + } - DeleteOrphanFiles.Result result = action.execute(); + DeleteOrphanFiles.Result result = action.execute(); - return toOutputRows(result); - }); + return toOutputRows(result); + }); } private InternalRow[] toOutputRows(DeleteOrphanFiles.Result result) { @@ -143,21 +148,20 @@ private void validateInterval(long olderThanMillis) { long 
intervalMillis = System.currentTimeMillis() - olderThanMillis; if (intervalMillis < TimeUnit.DAYS.toMillis(1)) { throw new IllegalArgumentException( - "Cannot remove orphan files with an interval less than 24 hours. Executing this " + - "procedure with a short interval may corrupt the table if other operations are happening " + - "at the same time. If you are absolutely confident that no concurrent operations will be " + - "affected by removing orphan files with such a short interval, you can use the Action API " + - "to remove orphan files with an arbitrary interval."); + "Cannot remove orphan files with an interval less than 24 hours. Executing this " + + "procedure with a short interval may corrupt the table if other operations are happening " + + "at the same time. If you are absolutely confident that no concurrent operations will be " + + "affected by removing orphan files with such a short interval, you can use the Action API " + + "to remove orphan files with an arbitrary interval."); } } private ExecutorService removeService(int concurrentDeletes) { return MoreExecutors.getExitingExecutorService( - (ThreadPoolExecutor) Executors.newFixedThreadPool( - concurrentDeletes, - new ThreadFactoryBuilder() - .setNameFormat("remove-orphans-%d") - .build())); + (ThreadPoolExecutor) + Executors.newFixedThreadPool( + concurrentDeletes, + new ThreadFactoryBuilder().setNameFormat("remove-orphans-%d").build())); } @Override diff --git a/spark/v3.1/spark/src/main/java/org/apache/iceberg/spark/procedures/RewriteDataFilesProcedure.java b/spark/v3.1/spark/src/main/java/org/apache/iceberg/spark/procedures/RewriteDataFilesProcedure.java index b33eab6b5b3c..d703ae6d8177 100644 --- a/spark/v3.1/spark/src/main/java/org/apache/iceberg/spark/procedures/RewriteDataFilesProcedure.java +++ b/spark/v3.1/spark/src/main/java/org/apache/iceberg/spark/procedures/RewriteDataFilesProcedure.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
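A usage sketch for the remove_orphan_files procedure above; dry_run => true reports orphan files without deleting them, and the catalog and table names are placeholders:

// List (but do not delete) files under the table location that no table metadata references.
spark.sql("CALL my_catalog.system.remove_orphan_files(table => 'db.sample', dry_run => true)");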
*/ - package org.apache.iceberg.spark.procedures; import java.util.Map; @@ -48,19 +47,24 @@ */ class RewriteDataFilesProcedure extends BaseProcedure { - private static final ProcedureParameter[] PARAMETERS = new ProcedureParameter[]{ - ProcedureParameter.required("table", DataTypes.StringType), - ProcedureParameter.optional("strategy", DataTypes.StringType), - ProcedureParameter.optional("sort_order", DataTypes.StringType), - ProcedureParameter.optional("options", STRING_MAP), - ProcedureParameter.optional("where", DataTypes.StringType) - }; + private static final ProcedureParameter[] PARAMETERS = + new ProcedureParameter[] { + ProcedureParameter.required("table", DataTypes.StringType), + ProcedureParameter.optional("strategy", DataTypes.StringType), + ProcedureParameter.optional("sort_order", DataTypes.StringType), + ProcedureParameter.optional("options", STRING_MAP), + ProcedureParameter.optional("where", DataTypes.StringType) + }; // counts are not nullable since the action result is never null - private static final StructType OUTPUT_TYPE = new StructType(new StructField[]{ - new StructField("rewritten_data_files_count", DataTypes.IntegerType, false, Metadata.empty()), - new StructField("added_data_files_count", DataTypes.IntegerType, false, Metadata.empty()) - }); + private static final StructType OUTPUT_TYPE = + new StructType( + new StructField[] { + new StructField( + "rewritten_data_files_count", DataTypes.IntegerType, false, Metadata.empty()), + new StructField( + "added_data_files_count", DataTypes.IntegerType, false, Metadata.empty()) + }); public static ProcedureBuilder builder() { return new Builder() { @@ -89,36 +93,40 @@ public StructType outputType() { public InternalRow[] call(InternalRow args) { Identifier tableIdent = toIdentifier(args.getString(0), PARAMETERS[0].name()); - return modifyIcebergTable(tableIdent, table -> { - RewriteDataFiles action = actions().rewriteDataFiles(table); + return modifyIcebergTable( + tableIdent, + table -> { + RewriteDataFiles action = actions().rewriteDataFiles(table); - String strategy = args.isNullAt(1) ? null : args.getString(1); - String sortOrderString = args.isNullAt(2) ? null : args.getString(2); - SortOrder sortOrder = null; - if (sortOrderString != null) { - sortOrder = collectSortOrders(table, sortOrderString); - } - if (strategy != null || sortOrder != null) { - action = checkAndApplyStrategy(action, strategy, sortOrder); - } + String strategy = args.isNullAt(1) ? null : args.getString(1); + String sortOrderString = args.isNullAt(2) ? null : args.getString(2); + SortOrder sortOrder = null; + if (sortOrderString != null) { + sortOrder = collectSortOrders(table, sortOrderString); + } + if (strategy != null || sortOrder != null) { + action = checkAndApplyStrategy(action, strategy, sortOrder); + } - if (!args.isNullAt(3)) { - action = checkAndApplyOptions(args, action); - } + if (!args.isNullAt(3)) { + action = checkAndApplyOptions(args, action); + } - String where = args.isNullAt(4) ? null : args.getString(4); - action = checkAndApplyFilter(action, where, table.name()); + String where = args.isNullAt(4) ? 
null : args.getString(4); + action = checkAndApplyFilter(action, where, table.name()); - RewriteDataFiles.Result result = action.execute(); + RewriteDataFiles.Result result = action.execute(); - return toOutputRows(result); - }); + return toOutputRows(result); + }); } - private RewriteDataFiles checkAndApplyFilter(RewriteDataFiles action, String where, String tableName) { + private RewriteDataFiles checkAndApplyFilter( + RewriteDataFiles action, String where, String tableName) { if (where != null) { try { - Expression expression = SparkExpressionConverter.collectResolvedSparkExpression(spark(), tableName, where); + Expression expression = + SparkExpressionConverter.collectResolvedSparkExpression(spark(), tableName, where); return action.filter(SparkExpressionConverter.convertToIcebergExpression(expression)); } catch (AnalysisException e) { throw new IllegalArgumentException("Cannot parse predicates in where option: " + where); @@ -129,7 +137,10 @@ private RewriteDataFiles checkAndApplyFilter(RewriteDataFiles action, String whe private RewriteDataFiles checkAndApplyOptions(InternalRow args, RewriteDataFiles action) { Map options = Maps.newHashMap(); - args.getMap(3).foreach(DataTypes.StringType, DataTypes.StringType, + args.getMap(3) + .foreach( + DataTypes.StringType, + DataTypes.StringType, (k, v) -> { options.put(k.toString(), v.toString()); return BoxedUnit.UNIT; @@ -137,33 +148,38 @@ private RewriteDataFiles checkAndApplyOptions(InternalRow args, RewriteDataFiles return action.options(options); } - private RewriteDataFiles checkAndApplyStrategy(RewriteDataFiles action, String strategy, SortOrder sortOrder) { - // caller of this function ensures that between strategy and sortOrder, at least one of them is not null. + private RewriteDataFiles checkAndApplyStrategy( + RewriteDataFiles action, String strategy, SortOrder sortOrder) { + // caller of this function ensures that between strategy and sortOrder, at least one of them is + // not null. if (strategy == null || strategy.equalsIgnoreCase("sort")) { return action.sort(sortOrder); } if (strategy.equalsIgnoreCase("binpack")) { RewriteDataFiles rewriteDataFiles = action.binPack(); if (sortOrder != null) { - // calling below method to throw the error as user has set both binpack strategy and sort order + // calling below method to throw the error as user has set both binpack strategy and sort + // order return rewriteDataFiles.sort(sortOrder); } return rewriteDataFiles; } else { - throw new IllegalArgumentException("unsupported strategy: " + strategy + ". Only binpack,sort is supported"); + throw new IllegalArgumentException( + "unsupported strategy: " + strategy + ". Only binpack,sort is supported"); } } private SortOrder collectSortOrders(Table table, String sortOrderStr) { String prefix = "ALTER TABLE temp WRITE ORDERED BY "; try { - // Note: Reusing the existing Iceberg sql parser to avoid implementing the custom parser for sort orders. + // Note: Reusing the existing Iceberg sql parser to avoid implementing the custom parser for + // sort orders. // To reuse the existing parser, adding a prefix of "ALTER TABLE temp WRITE ORDERED BY" // along with input sort order and parsing it as a plan to collect the sortOrder. 
LogicalPlan logicalPlan = spark().sessionState().sqlParser().parsePlan(prefix + sortOrderStr); - return (new SortOrderParserUtil()).collectSortOrder( - table.schema(), - ((SetWriteDistributionAndOrdering) logicalPlan).sortOrder()); + return (new SortOrderParserUtil()) + .collectSortOrder( + table.schema(), ((SetWriteDistributionAndOrdering) logicalPlan).sortOrder()); } catch (AnalysisException ex) { throw new IllegalArgumentException("Unable to parse sortOrder: " + sortOrderStr); } @@ -173,7 +189,7 @@ private InternalRow[] toOutputRows(RewriteDataFiles.Result result) { int rewrittenDataFilesCount = result.rewrittenDataFilesCount(); int addedDataFilesCount = result.addedDataFilesCount(); InternalRow row = newInternalRow(rewrittenDataFilesCount, addedDataFilesCount); - return new InternalRow[]{row}; + return new InternalRow[] {row}; } @Override diff --git a/spark/v3.1/spark/src/main/java/org/apache/iceberg/spark/procedures/RewriteManifestsProcedure.java b/spark/v3.1/spark/src/main/java/org/apache/iceberg/spark/procedures/RewriteManifestsProcedure.java index eae7ce0fca97..abe287f5a746 100644 --- a/spark/v3.1/spark/src/main/java/org/apache/iceberg/spark/procedures/RewriteManifestsProcedure.java +++ b/spark/v3.1/spark/src/main/java/org/apache/iceberg/spark/procedures/RewriteManifestsProcedure.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.procedures; import org.apache.iceberg.Table; @@ -35,23 +34,28 @@ /** * A procedure that rewrites manifests in a table. - *

    - * Note: this procedure invalidates all cached Spark plans that reference the affected table. + * + *

    Note: this procedure invalidates all cached Spark plans that reference the affected + * table. * * @see SparkActions#rewriteManifests(Table) () */ class RewriteManifestsProcedure extends BaseProcedure { - private static final ProcedureParameter[] PARAMETERS = new ProcedureParameter[]{ - ProcedureParameter.required("table", DataTypes.StringType), - ProcedureParameter.optional("use_caching", DataTypes.BooleanType) - }; + private static final ProcedureParameter[] PARAMETERS = + new ProcedureParameter[] { + ProcedureParameter.required("table", DataTypes.StringType), + ProcedureParameter.optional("use_caching", DataTypes.BooleanType) + }; // counts are not nullable since the action result is never null - private static final StructType OUTPUT_TYPE = new StructType(new StructField[]{ - new StructField("rewritten_manifests_count", DataTypes.IntegerType, false, Metadata.empty()), - new StructField("added_manifests_count", DataTypes.IntegerType, false, Metadata.empty()) - }); + private static final StructType OUTPUT_TYPE = + new StructType( + new StructField[] { + new StructField( + "rewritten_manifests_count", DataTypes.IntegerType, false, Metadata.empty()), + new StructField("added_manifests_count", DataTypes.IntegerType, false, Metadata.empty()) + }); public static ProcedureBuilder builder() { return new BaseProcedure.Builder() { @@ -81,24 +85,26 @@ public InternalRow[] call(InternalRow args) { Identifier tableIdent = toIdentifier(args.getString(0), PARAMETERS[0].name()); Boolean useCaching = args.isNullAt(1) ? null : args.getBoolean(1); - return modifyIcebergTable(tableIdent, table -> { - RewriteManifests action = actions().rewriteManifests(table); + return modifyIcebergTable( + tableIdent, + table -> { + RewriteManifests action = actions().rewriteManifests(table); - if (useCaching != null) { - action.option("use-caching", useCaching.toString()); - } + if (useCaching != null) { + action.option("use-caching", useCaching.toString()); + } - RewriteManifests.Result result = action.execute(); + RewriteManifests.Result result = action.execute(); - return toOutputRows(result); - }); + return toOutputRows(result); + }); } private InternalRow[] toOutputRows(RewriteManifests.Result result) { int rewrittenManifestsCount = Iterables.size(result.rewrittenManifests()); int addedManifestsCount = Iterables.size(result.addedManifests()); InternalRow row = newInternalRow(rewrittenManifestsCount, addedManifestsCount); - return new InternalRow[]{row}; + return new InternalRow[] {row}; } @Override diff --git a/spark/v3.1/spark/src/main/java/org/apache/iceberg/spark/procedures/RollbackToSnapshotProcedure.java b/spark/v3.1/spark/src/main/java/org/apache/iceberg/spark/procedures/RollbackToSnapshotProcedure.java index 7cf5b0c77bb2..49cc1a5ceae3 100644 --- a/spark/v3.1/spark/src/main/java/org/apache/iceberg/spark/procedures/RollbackToSnapshotProcedure.java +++ b/spark/v3.1/spark/src/main/java/org/apache/iceberg/spark/procedures/RollbackToSnapshotProcedure.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.procedures; import org.apache.iceberg.Snapshot; @@ -32,22 +31,26 @@ /** * A procedure that rollbacks a table to a specific snapshot id. - *

    - * Note: this procedure invalidates all cached Spark plans that reference the affected table. + * + *

    Note: this procedure invalidates all cached Spark plans that reference the affected + * table. * * @see org.apache.iceberg.ManageSnapshots#rollbackTo(long) */ class RollbackToSnapshotProcedure extends BaseProcedure { - private static final ProcedureParameter[] PARAMETERS = new ProcedureParameter[]{ - ProcedureParameter.required("table", DataTypes.StringType), - ProcedureParameter.required("snapshot_id", DataTypes.LongType) - }; + private static final ProcedureParameter[] PARAMETERS = + new ProcedureParameter[] { + ProcedureParameter.required("table", DataTypes.StringType), + ProcedureParameter.required("snapshot_id", DataTypes.LongType) + }; - private static final StructType OUTPUT_TYPE = new StructType(new StructField[]{ - new StructField("previous_snapshot_id", DataTypes.LongType, false, Metadata.empty()), - new StructField("current_snapshot_id", DataTypes.LongType, false, Metadata.empty()) - }); + private static final StructType OUTPUT_TYPE = + new StructType( + new StructField[] { + new StructField("previous_snapshot_id", DataTypes.LongType, false, Metadata.empty()), + new StructField("current_snapshot_id", DataTypes.LongType, false, Metadata.empty()) + }); public static ProcedureBuilder builder() { return new BaseProcedure.Builder() { @@ -77,16 +80,16 @@ public InternalRow[] call(InternalRow args) { Identifier tableIdent = toIdentifier(args.getString(0), PARAMETERS[0].name()); long snapshotId = args.getLong(1); - return modifyIcebergTable(tableIdent, table -> { - Snapshot previousSnapshot = table.currentSnapshot(); + return modifyIcebergTable( + tableIdent, + table -> { + Snapshot previousSnapshot = table.currentSnapshot(); - table.manageSnapshots() - .rollbackTo(snapshotId) - .commit(); + table.manageSnapshots().rollbackTo(snapshotId).commit(); - InternalRow outputRow = newInternalRow(previousSnapshot.snapshotId(), snapshotId); - return new InternalRow[]{outputRow}; - }); + InternalRow outputRow = newInternalRow(previousSnapshot.snapshotId(), snapshotId); + return new InternalRow[] {outputRow}; + }); } @Override diff --git a/spark/v3.1/spark/src/main/java/org/apache/iceberg/spark/procedures/RollbackToTimestampProcedure.java b/spark/v3.1/spark/src/main/java/org/apache/iceberg/spark/procedures/RollbackToTimestampProcedure.java index 519a46c6dbb8..059725f0c152 100644 --- a/spark/v3.1/spark/src/main/java/org/apache/iceberg/spark/procedures/RollbackToTimestampProcedure.java +++ b/spark/v3.1/spark/src/main/java/org/apache/iceberg/spark/procedures/RollbackToTimestampProcedure.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.procedures; import org.apache.iceberg.Snapshot; @@ -33,22 +32,26 @@ /** * A procedure that rollbacks a table to a given point in time. - *

    - * Note: this procedure invalidates all cached Spark plans that reference the affected table. + * + *

    Note: this procedure invalidates all cached Spark plans that reference the affected + * table. * * @see org.apache.iceberg.ManageSnapshots#rollbackToTime(long) */ class RollbackToTimestampProcedure extends BaseProcedure { - private static final ProcedureParameter[] PARAMETERS = new ProcedureParameter[]{ - ProcedureParameter.required("table", DataTypes.StringType), - ProcedureParameter.required("timestamp", DataTypes.TimestampType) - }; + private static final ProcedureParameter[] PARAMETERS = + new ProcedureParameter[] { + ProcedureParameter.required("table", DataTypes.StringType), + ProcedureParameter.required("timestamp", DataTypes.TimestampType) + }; - private static final StructType OUTPUT_TYPE = new StructType(new StructField[]{ - new StructField("previous_snapshot_id", DataTypes.LongType, false, Metadata.empty()), - new StructField("current_snapshot_id", DataTypes.LongType, false, Metadata.empty()) - }); + private static final StructType OUTPUT_TYPE = + new StructType( + new StructField[] { + new StructField("previous_snapshot_id", DataTypes.LongType, false, Metadata.empty()), + new StructField("current_snapshot_id", DataTypes.LongType, false, Metadata.empty()) + }); public static ProcedureBuilder builder() { return new BaseProcedure.Builder() { @@ -79,18 +82,19 @@ public InternalRow[] call(InternalRow args) { // timestamps in Spark have microsecond precision so this conversion is lossy long timestampMillis = DateTimeUtil.microsToMillis(args.getLong(1)); - return modifyIcebergTable(tableIdent, table -> { - Snapshot previousSnapshot = table.currentSnapshot(); + return modifyIcebergTable( + tableIdent, + table -> { + Snapshot previousSnapshot = table.currentSnapshot(); - table.manageSnapshots() - .rollbackToTime(timestampMillis) - .commit(); + table.manageSnapshots().rollbackToTime(timestampMillis).commit(); - Snapshot currentSnapshot = table.currentSnapshot(); + Snapshot currentSnapshot = table.currentSnapshot(); - InternalRow outputRow = newInternalRow(previousSnapshot.snapshotId(), currentSnapshot.snapshotId()); - return new InternalRow[]{outputRow}; - }); + InternalRow outputRow = + newInternalRow(previousSnapshot.snapshotId(), currentSnapshot.snapshotId()); + return new InternalRow[] {outputRow}; + }); } @Override diff --git a/spark/v3.1/spark/src/main/java/org/apache/iceberg/spark/procedures/SetCurrentSnapshotProcedure.java b/spark/v3.1/spark/src/main/java/org/apache/iceberg/spark/procedures/SetCurrentSnapshotProcedure.java index 274ca19fc107..f8f8049c22b6 100644 --- a/spark/v3.1/spark/src/main/java/org/apache/iceberg/spark/procedures/SetCurrentSnapshotProcedure.java +++ b/spark/v3.1/spark/src/main/java/org/apache/iceberg/spark/procedures/SetCurrentSnapshotProcedure.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.procedures; import org.apache.iceberg.Snapshot; @@ -32,22 +31,26 @@ /** * A procedure that sets the current snapshot in a table. - *

    - * Note: this procedure invalidates all cached Spark plans that reference the affected table. + * + *

    Note: this procedure invalidates all cached Spark plans that reference the affected + * table. * * @see org.apache.iceberg.ManageSnapshots#setCurrentSnapshot(long) */ class SetCurrentSnapshotProcedure extends BaseProcedure { - private static final ProcedureParameter[] PARAMETERS = new ProcedureParameter[] { - ProcedureParameter.required("table", DataTypes.StringType), - ProcedureParameter.required("snapshot_id", DataTypes.LongType) - }; + private static final ProcedureParameter[] PARAMETERS = + new ProcedureParameter[] { + ProcedureParameter.required("table", DataTypes.StringType), + ProcedureParameter.required("snapshot_id", DataTypes.LongType) + }; - private static final StructType OUTPUT_TYPE = new StructType(new StructField[]{ - new StructField("previous_snapshot_id", DataTypes.LongType, true, Metadata.empty()), - new StructField("current_snapshot_id", DataTypes.LongType, false, Metadata.empty()) - }); + private static final StructType OUTPUT_TYPE = + new StructType( + new StructField[] { + new StructField("previous_snapshot_id", DataTypes.LongType, true, Metadata.empty()), + new StructField("current_snapshot_id", DataTypes.LongType, false, Metadata.empty()) + }); public static ProcedureBuilder builder() { return new BaseProcedure.Builder() { @@ -77,17 +80,17 @@ public InternalRow[] call(InternalRow args) { Identifier tableIdent = toIdentifier(args.getString(0), PARAMETERS[0].name()); long snapshotId = args.getLong(1); - return modifyIcebergTable(tableIdent, table -> { - Snapshot previousSnapshot = table.currentSnapshot(); - Long previousSnapshotId = previousSnapshot != null ? previousSnapshot.snapshotId() : null; + return modifyIcebergTable( + tableIdent, + table -> { + Snapshot previousSnapshot = table.currentSnapshot(); + Long previousSnapshotId = previousSnapshot != null ? previousSnapshot.snapshotId() : null; - table.manageSnapshots() - .setCurrentSnapshot(snapshotId) - .commit(); + table.manageSnapshots().setCurrentSnapshot(snapshotId).commit(); - InternalRow outputRow = newInternalRow(previousSnapshotId, snapshotId); - return new InternalRow[]{outputRow}; - }); + InternalRow outputRow = newInternalRow(previousSnapshotId, snapshotId); + return new InternalRow[] {outputRow}; + }); } @Override diff --git a/spark/v3.1/spark/src/main/java/org/apache/iceberg/spark/procedures/SnapshotTableProcedure.java b/spark/v3.1/spark/src/main/java/org/apache/iceberg/spark/procedures/SnapshotTableProcedure.java index 96e293d6b1da..7a015a51e8ed 100644 --- a/spark/v3.1/spark/src/main/java/org/apache/iceberg/spark/procedures/SnapshotTableProcedure.java +++ b/spark/v3.1/spark/src/main/java/org/apache/iceberg/spark/procedures/SnapshotTableProcedure.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
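The remaining maintenance procedures reformatted above (rewrite_data_files, rewrite_manifests, rollback_to_snapshot, rollback_to_timestamp, set_current_snapshot) follow the same CALL pattern; a combined sketch with placeholder catalog, table, snapshot id, and timestamp values:

// Compact small files with the binpack strategy.
spark.sql("CALL my_catalog.system.rewrite_data_files(table => 'db.sample', strategy => 'binpack')");
// Rewrite manifests to speed up scan planning.
spark.sql("CALL my_catalog.system.rewrite_manifests('db.sample')");
// Point the table back at an earlier snapshot, by id or by time.
spark.sql("CALL my_catalog.system.rollback_to_snapshot('db.sample', 12345)");
spark.sql("CALL my_catalog.system.rollback_to_timestamp('db.sample', TIMESTAMP '2021-06-30 00:00:00')");
// Set an arbitrary snapshot (not necessarily an ancestor of the current one) as current.
spark.sql("CALL my_catalog.system.set_current_snapshot('db.sample', 12345)");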
*/ - package org.apache.iceberg.spark.procedures; import java.util.Map; @@ -34,16 +33,19 @@ import scala.runtime.BoxedUnit; class SnapshotTableProcedure extends BaseProcedure { - private static final ProcedureParameter[] PARAMETERS = new ProcedureParameter[]{ - ProcedureParameter.required("source_table", DataTypes.StringType), - ProcedureParameter.required("table", DataTypes.StringType), - ProcedureParameter.optional("location", DataTypes.StringType), - ProcedureParameter.optional("properties", STRING_MAP) - }; + private static final ProcedureParameter[] PARAMETERS = + new ProcedureParameter[] { + ProcedureParameter.required("source_table", DataTypes.StringType), + ProcedureParameter.required("table", DataTypes.StringType), + ProcedureParameter.optional("location", DataTypes.StringType), + ProcedureParameter.optional("properties", STRING_MAP) + }; - private static final StructType OUTPUT_TYPE = new StructType(new StructField[]{ - new StructField("imported_files_count", DataTypes.LongType, false, Metadata.empty()) - }); + private static final StructType OUTPUT_TYPE = + new StructType( + new StructField[] { + new StructField("imported_files_count", DataTypes.LongType, false, Metadata.empty()) + }); private SnapshotTableProcedure(TableCatalog tableCatalog) { super(tableCatalog); @@ -71,23 +73,28 @@ public StructType outputType() { @Override public InternalRow[] call(InternalRow args) { String source = args.getString(0); - Preconditions.checkArgument(source != null && !source.isEmpty(), + Preconditions.checkArgument( + source != null && !source.isEmpty(), "Cannot handle an empty identifier for argument source_table"); String dest = args.getString(1); - Preconditions.checkArgument(dest != null && !dest.isEmpty(), - "Cannot handle an empty identifier for argument table"); + Preconditions.checkArgument( + dest != null && !dest.isEmpty(), "Cannot handle an empty identifier for argument table"); String snapshotLocation = args.isNullAt(2) ? null : args.getString(2); Map properties = Maps.newHashMap(); if (!args.isNullAt(3)) { - args.getMap(3).foreach(DataTypes.StringType, DataTypes.StringType, - (k, v) -> { - properties.put(k.toString(), v.toString()); - return BoxedUnit.UNIT; - }); + args.getMap(3) + .foreach( + DataTypes.StringType, + DataTypes.StringType, + (k, v) -> { + properties.put(k.toString(), v.toString()); + return BoxedUnit.UNIT; + }); } - Preconditions.checkArgument(!source.equals(dest), + Preconditions.checkArgument( + !source.equals(dest), "Cannot create a snapshot with the same name as the source of the snapshot."); SnapshotTable action = SparkActions.get().snapshotTable(source).as(dest); @@ -103,5 +110,4 @@ public InternalRow[] call(InternalRow args) { public String description() { return "SnapshotTableProcedure"; } - } diff --git a/spark/v3.1/spark/src/main/java/org/apache/iceberg/spark/procedures/SparkProcedures.java b/spark/v3.1/spark/src/main/java/org/apache/iceberg/spark/procedures/SparkProcedures.java index d481c19d59a1..47b46cade7c0 100644 --- a/spark/v3.1/spark/src/main/java/org/apache/iceberg/spark/procedures/SparkProcedures.java +++ b/spark/v3.1/spark/src/main/java/org/apache/iceberg/spark/procedures/SparkProcedures.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
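The snapshot procedure above creates a new Iceberg table that reads the source table's files without copying them; a sketch with placeholder names:

// Create an independent, testable snapshot table over db.sample's current data files.
spark.sql("CALL my_catalog.system.snapshot(source_table => 'db.sample', table => 'db.sample_snap')");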
*/ - package org.apache.iceberg.spark.procedures; import java.util.Locale; @@ -30,8 +29,7 @@ public class SparkProcedures { private static final Map> BUILDERS = initProcedureBuilders(); - private SparkProcedures() { - } + private SparkProcedures() {} public static ProcedureBuilder newBuilder(String name) { // procedure resolution is case insensitive to match the existing Spark behavior for functions @@ -59,6 +57,7 @@ private static Map> initProcedureBuilders() { public interface ProcedureBuilder { ProcedureBuilder withTableCatalog(TableCatalog tableCatalog); + Procedure build(); } } diff --git a/spark/v3.1/spark/src/main/java/org/apache/iceberg/spark/source/BaseDataReader.java b/spark/v3.1/spark/src/main/java/org/apache/iceberg/spark/source/BaseDataReader.java index b58745c7a00d..2cab8ee238e0 100644 --- a/spark/v3.1/spark/src/main/java/org/apache/iceberg/spark/source/BaseDataReader.java +++ b/spark/v3.1/spark/src/main/java/org/apache/iceberg/spark/source/BaseDataReader.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.source; import java.io.Closeable; @@ -76,10 +75,16 @@ abstract class BaseDataReader implements Closeable { this.tasks = task.files().iterator(); Map keyMetadata = Maps.newHashMap(); task.files().stream() - .flatMap(fileScanTask -> Stream.concat(Stream.of(fileScanTask.file()), fileScanTask.deletes().stream())) + .flatMap( + fileScanTask -> + Stream.concat(Stream.of(fileScanTask.file()), fileScanTask.deletes().stream())) .forEach(file -> keyMetadata.put(file.path().toString(), file.keyMetadata())); - Stream encrypted = keyMetadata.entrySet().stream() - .map(entry -> EncryptedFiles.encryptedInput(table.io().newInputFile(entry.getKey()), entry.getValue())); + Stream encrypted = + keyMetadata.entrySet().stream() + .map( + entry -> + EncryptedFiles.encryptedInput( + table.io().newInputFile(entry.getKey()), entry.getValue())); // decrypt with the batch call to avoid multiple RPCs to a key server, if possible Iterable decryptedFiles = table.encryption().decrypt(encrypted::iterator); @@ -188,7 +193,8 @@ protected static Object convertConstant(Type type, Object value) { for (int index = 0; index < fields.size(); index++) { NestedField field = fields.get(index); Type fieldType = field.type(); - values[index] = convertConstant(fieldType, struct.get(index, fieldType.typeId().javaClass())); + values[index] = + convertConstant(fieldType, struct.get(index, fieldType.typeId().javaClass())); } return new GenericInternalRow(values); diff --git a/spark/v3.1/spark/src/main/java/org/apache/iceberg/spark/source/BatchDataReader.java b/spark/v3.1/spark/src/main/java/org/apache/iceberg/spark/source/BatchDataReader.java index e4bd3ceba6ce..d620faa979f6 100644 --- a/spark/v3.1/spark/src/main/java/org/apache/iceberg/spark/source/BatchDataReader.java +++ b/spark/v3.1/spark/src/main/java/org/apache/iceberg/spark/source/BatchDataReader.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
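As the comment in SparkProcedures.newBuilder above notes, procedure names are resolved case-insensitively (the registry lowercases the lookup key), so a call like the following should resolve to the same expire_snapshots procedure; catalog and table names remain placeholders:

// Upper-case procedure names resolve to the same builder as lower-case ones.
spark.sql("CALL my_catalog.system.EXPIRE_SNAPSHOTS(table => 'db.sample', retain_last => 5)");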
*/ - package org.apache.iceberg.spark.source; import java.util.Map; @@ -50,7 +49,8 @@ class BatchDataReader extends BaseDataReader { private final boolean caseSensitive; private final int batchSize; - BatchDataReader(CombinedScanTask task, Table table, Schema expectedSchema, boolean caseSensitive, int size) { + BatchDataReader( + CombinedScanTask task, Table table, Schema expectedSchema, boolean caseSensitive, int size) { super(table, task); this.expectedSchema = expectedSchema; this.nameMapping = table.properties().get(TableProperties.DEFAULT_NAME_MAPPING); @@ -71,18 +71,26 @@ CloseableIterator open(FileScanTask task) { InputFile location = getInputFile(task); Preconditions.checkNotNull(location, "Could not find InputFile associated with FileScanTask"); if (task.file().format() == FileFormat.PARQUET) { - Parquet.ReadBuilder builder = Parquet.read(location) - .project(expectedSchema) - .split(task.start(), task.length()) - .createBatchedReaderFunc(fileSchema -> VectorizedSparkParquetReaders.buildReader(expectedSchema, - fileSchema, /* setArrowValidityVector */ NullCheckingForGet.NULL_CHECKING_ENABLED, idToConstant)) - .recordsPerBatch(batchSize) - .filter(task.residual()) - .caseSensitive(caseSensitive) - // Spark eagerly consumes the batches. So the underlying memory allocated could be reused - // without worrying about subsequent reads clobbering over each other. This improves - // read performance as every batch read doesn't have to pay the cost of allocating memory. - .reuseContainers(); + Parquet.ReadBuilder builder = + Parquet.read(location) + .project(expectedSchema) + .split(task.start(), task.length()) + .createBatchedReaderFunc( + fileSchema -> + VectorizedSparkParquetReaders.buildReader( + expectedSchema, + fileSchema, /* setArrowValidityVector */ + NullCheckingForGet.NULL_CHECKING_ENABLED, + idToConstant)) + .recordsPerBatch(batchSize) + .filter(task.residual()) + .caseSensitive(caseSensitive) + // Spark eagerly consumes the batches. So the underlying memory allocated could be + // reused + // without worrying about subsequent reads clobbering over each other. This improves + // read performance as every batch read doesn't have to pay the cost of allocating + // memory. 
+ .reuseContainers(); if (nameMapping != null) { builder.withNameMapping(NameMappingParser.fromJson(nameMapping)); @@ -92,16 +100,21 @@ CloseableIterator open(FileScanTask task) { } else if (task.file().format() == FileFormat.ORC) { Set constantFieldIds = idToConstant.keySet(); Set metadataFieldIds = MetadataColumns.metadataFieldIds(); - Sets.SetView constantAndMetadataFieldIds = Sets.union(constantFieldIds, metadataFieldIds); - Schema schemaWithoutConstantAndMetadataFields = TypeUtil.selectNot(expectedSchema, constantAndMetadataFieldIds); - ORC.ReadBuilder builder = ORC.read(location) - .project(schemaWithoutConstantAndMetadataFields) - .split(task.start(), task.length()) - .createBatchedReaderFunc(fileSchema -> VectorizedSparkOrcReaders.buildReader(expectedSchema, fileSchema, - idToConstant)) - .recordsPerBatch(batchSize) - .filter(task.residual()) - .caseSensitive(caseSensitive); + Sets.SetView constantAndMetadataFieldIds = + Sets.union(constantFieldIds, metadataFieldIds); + Schema schemaWithoutConstantAndMetadataFields = + TypeUtil.selectNot(expectedSchema, constantAndMetadataFieldIds); + ORC.ReadBuilder builder = + ORC.read(location) + .project(schemaWithoutConstantAndMetadataFields) + .split(task.start(), task.length()) + .createBatchedReaderFunc( + fileSchema -> + VectorizedSparkOrcReaders.buildReader( + expectedSchema, fileSchema, idToConstant)) + .recordsPerBatch(batchSize) + .filter(task.residual()) + .caseSensitive(caseSensitive); if (nameMapping != null) { builder.withNameMapping(NameMappingParser.fromJson(nameMapping)); diff --git a/spark/v3.1/spark/src/main/java/org/apache/iceberg/spark/source/EqualityDeleteRowReader.java b/spark/v3.1/spark/src/main/java/org/apache/iceberg/spark/source/EqualityDeleteRowReader.java index ce2226f4f75e..1c55e1b8ebe2 100644 --- a/spark/v3.1/spark/src/main/java/org/apache/iceberg/spark/source/EqualityDeleteRowReader.java +++ b/spark/v3.1/spark/src/main/java/org/apache/iceberg/spark/source/EqualityDeleteRowReader.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.source; import java.util.Map; @@ -32,7 +31,8 @@ public class EqualityDeleteRowReader extends RowDataReader { private final Schema expectedSchema; - public EqualityDeleteRowReader(CombinedScanTask task, Table table, Schema expectedSchema, boolean caseSensitive) { + public EqualityDeleteRowReader( + CombinedScanTask task, Table table, Schema expectedSchema, boolean caseSensitive) { super(task, table, table.schema(), caseSensitive); this.expectedSchema = expectedSchema; } diff --git a/spark/v3.1/spark/src/main/java/org/apache/iceberg/spark/source/IcebergSource.java b/spark/v3.1/spark/src/main/java/org/apache/iceberg/spark/source/IcebergSource.java index 7aa66b2223fc..a3c46a6694f4 100644 --- a/spark/v3.1/spark/src/main/java/org/apache/iceberg/spark/source/IcebergSource.java +++ b/spark/v3.1/spark/src/main/java/org/apache/iceberg/spark/source/IcebergSource.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.source; import java.util.Arrays; @@ -43,17 +42,17 @@ /** * The IcebergSource loads/writes tables with format "iceberg". It can load paths and tables. * - * How paths/tables are loaded when using spark.read().format("iceberg").path(table) + *

    How paths/tables are loaded when using spark.read().format("iceberg").path(table) * - * table = "file:/path/to/table" -> loads a HadoopTable at given path - * table = "tablename" -> loads currentCatalog.currentNamespace.tablename - * table = "catalog.tablename" -> load "tablename" from the specified catalog. - * table = "namespace.tablename" -> load "namespace.tablename" from current catalog - * table = "catalog.namespace.tablename" -> "namespace.tablename" from the specified catalog. - * table = "namespace1.namespace2.tablename" -> load "namespace1.namespace2.tablename" from current catalog + *

    table = "file:/path/to/table" -> loads a HadoopTable at given path table = "tablename" + * -> loads currentCatalog.currentNamespace.tablename table = "catalog.tablename" -> load + * "tablename" from the specified catalog. table = "namespace.tablename" -> load + * "namespace.tablename" from current catalog table = "catalog.namespace.tablename" -> + * "namespace.tablename" from the specified catalog. table = "namespace1.namespace2.tablename" -> + * load "namespace1.namespace2.tablename" from current catalog * - * The above list is in order of priority. For example: a matching catalog will take priority over any namespace - * resolution. + *

    The above list is in order of priority. For example: a matching catalog will take priority + * over any namespace resolution. */ public class IcebergSource implements DataSourceRegister, SupportsCatalogOptions { private static final String DEFAULT_CATALOG_NAME = "default_iceberg"; @@ -83,7 +82,8 @@ public boolean supportsExternalMetadata() { @Override public Table getTable(StructType schema, Transform[] partitioning, Map options) { - Spark3Util.CatalogAndIdentifier catalogIdentifier = catalogAndIdentifier(new CaseInsensitiveStringMap(options)); + Spark3Util.CatalogAndIdentifier catalogIdentifier = + catalogAndIdentifier(new CaseInsensitiveStringMap(options)); CatalogPlugin catalog = catalogIdentifier.catalog(); Identifier ident = catalogIdentifier.identifier(); @@ -92,12 +92,16 @@ public Table getTable(StructType schema, Transform[] partitioning, Map config = ImmutableMap.of( - "type", "hive", - "default-namespace", "default", - "cache-enabled", "false" // the source should not use a cache - ); + ImmutableMap config = + ImmutableMap.of( + "type", "hive", + "default-namespace", "default", + "cache-enabled", "false" // the source should not use a cache + ); String catalogName = "org.apache.iceberg.spark.SparkCatalog"; spark.conf().set(DEFAULT_CATALOG, catalogName); config.forEach((key, value) -> spark.conf().set(DEFAULT_CATALOG + "." + key, value)); diff --git a/spark/v3.1/spark/src/main/java/org/apache/iceberg/spark/source/InternalRowWrapper.java b/spark/v3.1/spark/src/main/java/org/apache/iceberg/spark/source/InternalRowWrapper.java index ef1eb08d873c..524266f6f83a 100644 --- a/spark/v3.1/spark/src/main/java/org/apache/iceberg/spark/source/InternalRowWrapper.java +++ b/spark/v3.1/spark/src/main/java/org/apache/iceberg/spark/source/InternalRowWrapper.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.source; import java.nio.ByteBuffer; @@ -32,8 +31,8 @@ import org.apache.spark.sql.types.StructType; /** - * Class to adapt a Spark {@code InternalRow} to Iceberg {@link StructLike} for uses like - * {@link org.apache.iceberg.PartitionKey#partition(StructLike)} + * Class to adapt a Spark {@code InternalRow} to Iceberg {@link StructLike} for uses like {@link + * org.apache.iceberg.PartitionKey#partition(StructLike)} */ class InternalRowWrapper implements StructLike { private final DataType[] types; @@ -42,12 +41,8 @@ class InternalRowWrapper implements StructLike { @SuppressWarnings("unchecked") InternalRowWrapper(StructType rowType) { - this.types = Stream.of(rowType.fields()) - .map(StructField::dataType) - .toArray(DataType[]::new); - this.getters = Stream.of(types) - .map(InternalRowWrapper::getter) - .toArray(BiFunction[]::new); + this.types = Stream.of(rowType.fields()).map(StructField::dataType).toArray(DataType[]::new); + this.getters = Stream.of(types).map(InternalRowWrapper::getter).toArray(BiFunction[]::new); } InternalRowWrapper wrap(InternalRow internalRow) { diff --git a/spark/v3.1/spark/src/main/java/org/apache/iceberg/spark/source/RowDataReader.java b/spark/v3.1/spark/src/main/java/org/apache/iceberg/spark/source/RowDataReader.java index 4f5962494feb..f206149da30e 100644 --- a/spark/v3.1/spark/src/main/java/org/apache/iceberg/spark/source/RowDataReader.java +++ b/spark/v3.1/spark/src/main/java/org/apache/iceberg/spark/source/RowDataReader.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
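A sketch of the resolution rules described in the IcebergSource Javadoc above, using the DataFrame reader; Dataset and Row are the usual org.apache.spark.sql types, spark is an existing SparkSession, and the paths, namespaces, and catalog names are placeholders:

// Path-based load: a HadoopTables location on a file system.
Dataset<Row> byPath = spark.read().format("iceberg").load("file:/path/to/table");
// Name-based load: resolved against the current catalog and namespace.
Dataset<Row> byName = spark.read().format("iceberg").load("db.events");
// An explicit catalog takes priority over namespace resolution.
Dataset<Row> byCatalog = spark.read().format("iceberg").load("my_catalog.db.events");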
*/ - package org.apache.iceberg.spark.source; import java.util.Map; @@ -81,7 +80,8 @@ protected Schema tableSchema() { return tableSchema; } - protected CloseableIterable open(FileScanTask task, Schema readSchema, Map idToConstant) { + protected CloseableIterable open( + FileScanTask task, Schema readSchema, Map idToConstant) { CloseableIterable iter; if (task.isDataTask()) { iter = newDataIterable(task.asDataTask(), readSchema); @@ -112,15 +112,14 @@ protected CloseableIterable open(FileScanTask task, Schema readSche } private CloseableIterable newAvroIterable( - InputFile location, - FileScanTask task, - Schema projection, - Map idToConstant) { - Avro.ReadBuilder builder = Avro.read(location) - .reuseContainers() - .project(projection) - .split(task.start(), task.length()) - .createReaderFunc(readSchema -> new SparkAvroReader(projection, readSchema, idToConstant)); + InputFile location, FileScanTask task, Schema projection, Map idToConstant) { + Avro.ReadBuilder builder = + Avro.read(location) + .reuseContainers() + .project(projection) + .split(task.start(), task.length()) + .createReaderFunc( + readSchema -> new SparkAvroReader(projection, readSchema, idToConstant)); if (nameMapping != null) { builder.withNameMapping(NameMappingParser.fromJson(nameMapping)); @@ -130,17 +129,16 @@ private CloseableIterable newAvroIterable( } private CloseableIterable newParquetIterable( - InputFile location, - FileScanTask task, - Schema readSchema, - Map idToConstant) { - Parquet.ReadBuilder builder = Parquet.read(location) - .reuseContainers() - .split(task.start(), task.length()) - .project(readSchema) - .createReaderFunc(fileSchema -> SparkParquetReaders.buildReader(readSchema, fileSchema, idToConstant)) - .filter(task.residual()) - .caseSensitive(caseSensitive); + InputFile location, FileScanTask task, Schema readSchema, Map idToConstant) { + Parquet.ReadBuilder builder = + Parquet.read(location) + .reuseContainers() + .split(task.start(), task.length()) + .project(readSchema) + .createReaderFunc( + fileSchema -> SparkParquetReaders.buildReader(readSchema, fileSchema, idToConstant)) + .filter(task.residual()) + .caseSensitive(caseSensitive); if (nameMapping != null) { builder.withNameMapping(NameMappingParser.fromJson(nameMapping)); @@ -150,19 +148,19 @@ private CloseableIterable newParquetIterable( } private CloseableIterable newOrcIterable( - InputFile location, - FileScanTask task, - Schema readSchema, - Map idToConstant) { - Schema readSchemaWithoutConstantAndMetadataFields = TypeUtil.selectNot(readSchema, - Sets.union(idToConstant.keySet(), MetadataColumns.metadataFieldIds())); - - ORC.ReadBuilder builder = ORC.read(location) - .project(readSchemaWithoutConstantAndMetadataFields) - .split(task.start(), task.length()) - .createReaderFunc(readOrcSchema -> new SparkOrcReader(readSchema, readOrcSchema, idToConstant)) - .filter(task.residual()) - .caseSensitive(caseSensitive); + InputFile location, FileScanTask task, Schema readSchema, Map idToConstant) { + Schema readSchemaWithoutConstantAndMetadataFields = + TypeUtil.selectNot( + readSchema, Sets.union(idToConstant.keySet(), MetadataColumns.metadataFieldIds())); + + ORC.ReadBuilder builder = + ORC.read(location) + .project(readSchemaWithoutConstantAndMetadataFields) + .split(task.start(), task.length()) + .createReaderFunc( + readOrcSchema -> new SparkOrcReader(readSchema, readOrcSchema, idToConstant)) + .filter(task.residual()) + .caseSensitive(caseSensitive); if (nameMapping != null) { 
builder.withNameMapping(NameMappingParser.fromJson(nameMapping)); @@ -173,8 +171,8 @@ private CloseableIterable newOrcIterable( private CloseableIterable newDataIterable(DataTask task, Schema readSchema) { StructInternalRow row = new StructInternalRow(readSchema.asStruct()); - CloseableIterable asSparkRows = CloseableIterable.transform( - task.asDataTask().rows(), row::setStruct); + CloseableIterable asSparkRows = + CloseableIterable.transform(task.asDataTask().rows(), row::setStruct); return asSparkRows; } diff --git a/spark/v3.1/spark/src/main/java/org/apache/iceberg/spark/source/RowDataRewriter.java b/spark/v3.1/spark/src/main/java/org/apache/iceberg/spark/source/RowDataRewriter.java index 63cc3a466c1a..aee0d4f0586b 100644 --- a/spark/v3.1/spark/src/main/java/org/apache/iceberg/spark/source/RowDataRewriter.java +++ b/spark/v3.1/spark/src/main/java/org/apache/iceberg/spark/source/RowDataRewriter.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.source; import java.io.Serializable; @@ -55,23 +54,25 @@ public class RowDataRewriter implements Serializable { private final FileFormat format; private final boolean caseSensitive; - public RowDataRewriter(Broadcast tableBroadcast, PartitionSpec spec, boolean caseSensitive) { + public RowDataRewriter( + Broadcast
    tableBroadcast, PartitionSpec spec, boolean caseSensitive) { this.tableBroadcast = tableBroadcast; this.spec = spec; this.caseSensitive = caseSensitive; Table table = tableBroadcast.value(); - String formatString = table.properties().getOrDefault( - TableProperties.DEFAULT_FILE_FORMAT, TableProperties.DEFAULT_FILE_FORMAT_DEFAULT); + String formatString = + table + .properties() + .getOrDefault( + TableProperties.DEFAULT_FILE_FORMAT, TableProperties.DEFAULT_FILE_FORMAT_DEFAULT); this.format = FileFormat.valueOf(formatString.toUpperCase(Locale.ENGLISH)); } public List rewriteDataForTasks(JavaRDD taskRDD) { JavaRDD> dataFilesRDD = taskRDD.map(this::rewriteDataForTask); - return dataFilesRDD.collect().stream() - .flatMap(Collection::stream) - .collect(Collectors.toList()); + return dataFilesRDD.collect().stream().flatMap(Collection::stream).collect(Collectors.toList()); } private List rewriteDataForTask(CombinedScanTask task) throws Exception { @@ -86,28 +87,44 @@ private List rewriteDataForTask(CombinedScanTask task) throws Exceptio RowDataReader dataReader = new RowDataReader(task, table, schema, caseSensitive); StructType structType = SparkSchemaUtil.convert(schema); - SparkAppenderFactory appenderFactory = SparkAppenderFactory.builderFor(table, schema, structType) - .spec(spec) - .build(); - OutputFileFactory fileFactory = OutputFileFactory.builderFor(table, partitionId, taskId) - .defaultSpec(spec) - .format(format) - .build(); + SparkAppenderFactory appenderFactory = + SparkAppenderFactory.builderFor(table, schema, structType).spec(spec).build(); + OutputFileFactory fileFactory = + OutputFileFactory.builderFor(table, partitionId, taskId) + .defaultSpec(spec) + .format(format) + .build(); TaskWriter writer; if (spec.isUnpartitioned()) { - writer = new UnpartitionedWriter<>(spec, format, appenderFactory, fileFactory, table.io(), - Long.MAX_VALUE); - } else if (PropertyUtil.propertyAsBoolean(properties, + writer = + new UnpartitionedWriter<>( + spec, format, appenderFactory, fileFactory, table.io(), Long.MAX_VALUE); + } else if (PropertyUtil.propertyAsBoolean( + properties, TableProperties.SPARK_WRITE_PARTITIONED_FANOUT_ENABLED, TableProperties.SPARK_WRITE_PARTITIONED_FANOUT_ENABLED_DEFAULT)) { - writer = new SparkPartitionedFanoutWriter( - spec, format, appenderFactory, fileFactory, table.io(), Long.MAX_VALUE, schema, - structType); + writer = + new SparkPartitionedFanoutWriter( + spec, + format, + appenderFactory, + fileFactory, + table.io(), + Long.MAX_VALUE, + schema, + structType); } else { - writer = new SparkPartitionedWriter( - spec, format, appenderFactory, fileFactory, table.io(), Long.MAX_VALUE, schema, - structType); + writer = + new SparkPartitionedWriter( + spec, + format, + appenderFactory, + fileFactory, + table.io(), + Long.MAX_VALUE, + schema, + structType); } try { @@ -127,14 +144,24 @@ private List rewriteDataForTask(CombinedScanTask task) throws Exceptio LOG.error("Aborting task", originalThrowable); context.markTaskFailed(originalThrowable); - LOG.error("Aborting commit for partition {} (task {}, attempt {}, stage {}.{})", - partitionId, taskId, context.attemptNumber(), context.stageId(), context.stageAttemptNumber()); + LOG.error( + "Aborting commit for partition {} (task {}, attempt {}, stage {}.{})", + partitionId, + taskId, + context.attemptNumber(), + context.stageId(), + context.stageAttemptNumber()); if (dataReader != null) { dataReader.close(); } writer.abort(); - LOG.error("Aborted commit for partition {} (task {}, attempt {}, stage {}.{})", - partitionId, 
taskId, context.taskAttemptId(), context.stageId(), context.stageAttemptNumber()); + LOG.error( + "Aborted commit for partition {} (task {}, attempt {}, stage {}.{})", + partitionId, + taskId, + context.taskAttemptId(), + context.stageId(), + context.stageAttemptNumber()); } catch (Throwable inner) { if (originalThrowable != inner) { diff --git a/spark/v3.1/spark/src/main/java/org/apache/iceberg/spark/source/SerializableTableWithSize.java b/spark/v3.1/spark/src/main/java/org/apache/iceberg/spark/source/SerializableTableWithSize.java index 6275c664410f..e3b81cea7cd1 100644 --- a/spark/v3.1/spark/src/main/java/org/apache/iceberg/spark/source/SerializableTableWithSize.java +++ b/spark/v3.1/spark/src/main/java/org/apache/iceberg/spark/source/SerializableTableWithSize.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.source; import org.apache.iceberg.BaseMetadataTable; @@ -25,9 +24,9 @@ import org.apache.spark.util.KnownSizeEstimation; /** - * This class provides a serializable table with a known size estimate. Spark calls - * its SizeEstimator class when broadcasting variables and this can be an expensive - * operation, so providing a known size estimate allows that operation to be skipped. + * This class provides a serializable table with a known size estimate. Spark calls its + * SizeEstimator class when broadcasting variables and this can be an expensive operation, so + * providing a known size estimate allows that operation to be skipped. */ public class SerializableTableWithSize extends SerializableTable implements KnownSizeEstimation { diff --git a/spark/v3.1/spark/src/main/java/org/apache/iceberg/spark/source/SparkAppenderFactory.java b/spark/v3.1/spark/src/main/java/org/apache/iceberg/spark/source/SparkAppenderFactory.java index 4becf666ed3e..6372edde0782 100644 --- a/spark/v3.1/spark/src/main/java/org/apache/iceberg/spark/source/SparkAppenderFactory.java +++ b/spark/v3.1/spark/src/main/java/org/apache/iceberg/spark/source/SparkAppenderFactory.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.spark.source; import java.io.IOException; @@ -61,8 +60,14 @@ class SparkAppenderFactory implements FileAppenderFactory { private StructType eqDeleteSparkType = null; private StructType posDeleteSparkType = null; - SparkAppenderFactory(Map properties, Schema writeSchema, StructType dsSchema, PartitionSpec spec, - int[] equalityFieldIds, Schema eqDeleteRowSchema, Schema posDeleteRowSchema) { + SparkAppenderFactory( + Map properties, + Schema writeSchema, + StructType dsSchema, + PartitionSpec spec, + int[] equalityFieldIds, + Schema eqDeleteRowSchema, + Schema posDeleteRowSchema) { this.properties = properties; this.writeSchema = writeSchema; this.dsSchema = dsSchema; @@ -85,7 +90,6 @@ static class Builder { private Schema eqDeleteRowSchema; private Schema posDeleteRowSchema; - Builder(Table table, Schema writeSchema, StructType dsSchema) { this.table = table; this.spec = table.spec(); @@ -118,16 +122,24 @@ SparkAppenderFactory build() { Preconditions.checkNotNull(writeSchema, "Write Schema must not be null"); Preconditions.checkNotNull(dsSchema, "DS Schema must not be null"); if (equalityFieldIds != null) { - Preconditions.checkNotNull(eqDeleteRowSchema, "Equality Field Ids and Equality Delete Row Schema" + - " must be set together"); + Preconditions.checkNotNull( + eqDeleteRowSchema, + "Equality Field Ids and Equality Delete Row Schema" + " must be set together"); } if (eqDeleteRowSchema != null) { - Preconditions.checkNotNull(equalityFieldIds, "Equality Field Ids and Equality Delete Row Schema" + - " must be set together"); + Preconditions.checkNotNull( + equalityFieldIds, + "Equality Field Ids and Equality Delete Row Schema" + " must be set together"); } - return new SparkAppenderFactory(table.properties(), writeSchema, dsSchema, spec, equalityFieldIds, - eqDeleteRowSchema, posDeleteRowSchema); + return new SparkAppenderFactory( + table.properties(), + writeSchema, + dsSchema, + spec, + equalityFieldIds, + eqDeleteRowSchema, + posDeleteRowSchema); } } @@ -141,7 +153,8 @@ private StructType lazyEqDeleteSparkType() { private StructType lazyPosDeleteSparkType() { if (posDeleteSparkType == null) { - Preconditions.checkNotNull(posDeleteRowSchema, "Position delete row schema shouldn't be null"); + Preconditions.checkNotNull( + posDeleteRowSchema, "Position delete row schema shouldn't be null"); this.posDeleteSparkType = SparkSchemaUtil.convert(posDeleteRowSchema); } return posDeleteSparkType; @@ -187,24 +200,33 @@ public FileAppender newAppender(OutputFile file, FileFormat fileFor } @Override - public DataWriter newDataWriter(EncryptedOutputFile file, FileFormat format, StructLike partition) { - return new DataWriter<>(newAppender(file.encryptingOutputFile(), format), format, - file.encryptingOutputFile().location(), spec, partition, file.keyMetadata()); + public DataWriter newDataWriter( + EncryptedOutputFile file, FileFormat format, StructLike partition) { + return new DataWriter<>( + newAppender(file.encryptingOutputFile(), format), + format, + file.encryptingOutputFile().location(), + spec, + partition, + file.keyMetadata()); } @Override - public EqualityDeleteWriter newEqDeleteWriter(EncryptedOutputFile file, FileFormat format, - StructLike partition) { - Preconditions.checkState(equalityFieldIds != null && equalityFieldIds.length > 0, + public EqualityDeleteWriter newEqDeleteWriter( + EncryptedOutputFile file, FileFormat format, StructLike partition) { + Preconditions.checkState( + equalityFieldIds != null && equalityFieldIds.length > 0, "Equality field 
ids shouldn't be null or empty when creating equality-delete writer"); - Preconditions.checkNotNull(eqDeleteRowSchema, + Preconditions.checkNotNull( + eqDeleteRowSchema, "Equality delete row schema shouldn't be null when creating equality-delete writer"); try { switch (format) { case PARQUET: return Parquet.writeDeletes(file.encryptingOutputFile()) - .createWriterFunc(msgType -> SparkParquetWriters.buildWriter(lazyEqDeleteSparkType(), msgType)) + .createWriterFunc( + msgType -> SparkParquetWriters.buildWriter(lazyEqDeleteSparkType(), msgType)) .overwrite() .rowSchema(eqDeleteRowSchema) .withSpec(spec) @@ -245,15 +267,16 @@ public EqualityDeleteWriter newEqDeleteWriter(EncryptedOutputFile f } @Override - public PositionDeleteWriter newPosDeleteWriter(EncryptedOutputFile file, FileFormat format, - StructLike partition) { + public PositionDeleteWriter newPosDeleteWriter( + EncryptedOutputFile file, FileFormat format, StructLike partition) { try { switch (format) { case PARQUET: StructType sparkPosDeleteSchema = SparkSchemaUtil.convert(DeleteSchemaUtil.posDeleteSchema(posDeleteRowSchema)); return Parquet.writeDeletes(file.encryptingOutputFile()) - .createWriterFunc(msgType -> SparkParquetWriters.buildWriter(sparkPosDeleteSchema, msgType)) + .createWriterFunc( + msgType -> SparkParquetWriters.buildWriter(sparkPosDeleteSchema, msgType)) .overwrite() .rowSchema(posDeleteRowSchema) .withSpec(spec) diff --git a/spark/v3.1/spark/src/main/java/org/apache/iceberg/spark/source/SparkBatchQueryScan.java b/spark/v3.1/spark/src/main/java/org/apache/iceberg/spark/source/SparkBatchQueryScan.java index 356271248c35..4fcab5517d44 100644 --- a/spark/v3.1/spark/src/main/java/org/apache/iceberg/spark/source/SparkBatchQueryScan.java +++ b/spark/v3.1/spark/src/main/java/org/apache/iceberg/spark/source/SparkBatchQueryScan.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.spark.source; import java.io.IOException; @@ -49,8 +48,14 @@ class SparkBatchQueryScan extends SparkBatchScan { private List tasks = null; // lazy cache of tasks - SparkBatchQueryScan(SparkSession spark, Table table, SparkReadConf readConf, boolean caseSensitive, - Schema expectedSchema, List filters, CaseInsensitiveStringMap options) { + SparkBatchQueryScan( + SparkSession spark, + Table table, + SparkReadConf readConf, + boolean caseSensitive, + Schema expectedSchema, + List filters, + CaseInsensitiveStringMap options) { super(spark, table, readConf, caseSensitive, expectedSchema, filters, options); @@ -67,26 +72,28 @@ class SparkBatchQueryScan extends SparkBatchScan { if (snapshotId != null || asOfTimestamp != null) { if (startSnapshotId != null || endSnapshotId != null) { throw new IllegalArgumentException( - "Cannot specify start-snapshot-id and end-snapshot-id to do incremental scan when either " + - SparkReadOptions.SNAPSHOT_ID + " or " + SparkReadOptions.AS_OF_TIMESTAMP + " is specified"); + "Cannot specify start-snapshot-id and end-snapshot-id to do incremental scan when either " + + SparkReadOptions.SNAPSHOT_ID + + " or " + + SparkReadOptions.AS_OF_TIMESTAMP + + " is specified"); } } else if (startSnapshotId == null && endSnapshotId != null) { - throw new IllegalArgumentException("Cannot only specify option end-snapshot-id to do incremental scan"); + throw new IllegalArgumentException( + "Cannot only specify option end-snapshot-id to do incremental scan"); } // look for split behavior overrides in options this.splitSize = Spark3Util.propertyAsLong(options, SparkReadOptions.SPLIT_SIZE, null); this.splitLookback = Spark3Util.propertyAsInt(options, SparkReadOptions.LOOKBACK, null); - this.splitOpenFileCost = Spark3Util.propertyAsLong(options, SparkReadOptions.FILE_OPEN_COST, null); + this.splitOpenFileCost = + Spark3Util.propertyAsLong(options, SparkReadOptions.FILE_OPEN_COST, null); } @Override protected List tasks() { if (tasks == null) { - TableScan scan = table() - .newScan() - .caseSensitive(caseSensitive()) - .project(expectedSchema()); + TableScan scan = table().newScan().caseSensitive(caseSensitive()).project(expectedSchema()); if (snapshotId != null) { scan = scan.useSnapshot(snapshotId); @@ -122,7 +129,7 @@ protected List tasks() { try (CloseableIterable tasksIterable = scan.planTasks()) { this.tasks = Lists.newArrayList(tasksIterable); - } catch (IOException e) { + } catch (IOException e) { throw new RuntimeIOException(e, "Failed to close table scan: %s", scan); } } @@ -141,19 +148,25 @@ public boolean equals(Object o) { } SparkBatchQueryScan that = (SparkBatchQueryScan) o; - return table().name().equals(that.table().name()) && - readSchema().equals(that.readSchema()) && // compare Spark schemas to ignore field ids - filterExpressions().toString().equals(that.filterExpressions().toString()) && - Objects.equals(snapshotId, that.snapshotId) && - Objects.equals(startSnapshotId, that.startSnapshotId) && - Objects.equals(endSnapshotId, that.endSnapshotId) && - Objects.equals(asOfTimestamp, that.asOfTimestamp); + return table().name().equals(that.table().name()) + && readSchema().equals(that.readSchema()) + && // compare Spark schemas to ignore field ids + filterExpressions().toString().equals(that.filterExpressions().toString()) + && Objects.equals(snapshotId, that.snapshotId) + && Objects.equals(startSnapshotId, that.startSnapshotId) + && Objects.equals(endSnapshotId, that.endSnapshotId) + && Objects.equals(asOfTimestamp, that.asOfTimestamp); } 
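For context on the option validation above, a minimal sketch of the read patterns it guards (the table name db.sample and the snapshot IDs are placeholders, not values from this PR):

import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SparkSession;

public class IcebergReadOptionsSketch {
  public static void main(String[] args) {
    SparkSession spark = SparkSession.builder().appName("iceberg-read-options").getOrCreate();

    // Time travel: pin the scan to one snapshot ("as-of-timestamp" works the same way).
    Dataset<Row> timeTravel =
        spark.read().format("iceberg").option("snapshot-id", "10963874102873").load("db.sample");

    // Incremental scan: "start-snapshot-id" is required; "end-snapshot-id" on its own,
    // or either incremental option combined with "snapshot-id"/"as-of-timestamp",
    // is rejected by the constructor checks in SparkBatchQueryScan.
    Dataset<Row> incremental =
        spark.read()
            .format("iceberg")
            .option("start-snapshot-id", "10963874102873")
            .option("end-snapshot-id", "63874143573109")
            .load("db.sample");

    timeTravel.show();
    incremental.show();
  }
}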
@Override public int hashCode() { return Objects.hash( - table().name(), readSchema(), filterExpressions().toString(), snapshotId, startSnapshotId, endSnapshotId, + table().name(), + readSchema(), + filterExpressions().toString(), + snapshotId, + startSnapshotId, + endSnapshotId, asOfTimestamp); } diff --git a/spark/v3.1/spark/src/main/java/org/apache/iceberg/spark/source/SparkBatchScan.java b/spark/v3.1/spark/src/main/java/org/apache/iceberg/spark/source/SparkBatchScan.java index ce5c6a219087..3b3d62d96226 100644 --- a/spark/v3.1/spark/src/main/java/org/apache/iceberg/spark/source/SparkBatchScan.java +++ b/spark/v3.1/spark/src/main/java/org/apache/iceberg/spark/source/SparkBatchScan.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.source; import java.io.Serializable; @@ -77,8 +76,14 @@ abstract class SparkBatchScan implements Scan, Batch, SupportsReportStatistics { // lazy variables private StructType readSchema = null; - SparkBatchScan(SparkSession spark, Table table, SparkReadConf readConf, boolean caseSensitive, - Schema expectedSchema, List filters, CaseInsensitiveStringMap options) { + SparkBatchScan( + SparkSession spark, + Table table, + SparkReadConf readConf, + boolean caseSensitive, + Schema expectedSchema, + List filters, + CaseInsensitiveStringMap options) { this.sparkContext = JavaSparkContext.fromSparkContext(spark.sparkContext()); this.table = table; this.readConf = readConf; @@ -122,8 +127,9 @@ public MicroBatchStream toMicroBatchStream(String checkpointLocation) { @Override public StructType readSchema() { if (readSchema == null) { - Preconditions.checkArgument(readTimestampWithoutZone || !SparkUtil.hasTimestampWithoutZone(expectedSchema), - SparkUtil.TIMESTAMP_WITHOUT_TIMEZONE_ERROR); + Preconditions.checkArgument( + readTimestampWithoutZone || !SparkUtil.hasTimestampWithoutZone(expectedSchema), + SparkUtil.TIMESTAMP_WITHOUT_TIMEZONE_ERROR); this.readSchema = SparkSchemaUtil.convert(expectedSchema); } return readSchema; @@ -134,7 +140,8 @@ public InputPartition[] planInputPartitions() { String expectedSchemaString = SchemaParser.toJson(expectedSchema); // broadcast the table metadata as input partitions will be sent to executors - Broadcast
<Table> tableBroadcast = sparkContext.broadcast(SerializableTableWithSize.copyOf(table)); + Broadcast<Table>
    tableBroadcast = + sparkContext.broadcast(SerializableTableWithSize.copyOf(table)); List scanTasks = tasks(); InputPartition[] readTasks = new InputPartition[scanTasks.size()]; @@ -142,9 +149,15 @@ public InputPartition[] planInputPartitions() { Tasks.range(readTasks.length) .stopOnFailure() .executeWith(localityPreferred ? ThreadPools.getWorkerPool() : null) - .run(index -> readTasks[index] = new ReadTask( - scanTasks.get(index), tableBroadcast, expectedSchemaString, - caseSensitive, localityPreferred)); + .run( + index -> + readTasks[index] = + new ReadTask( + scanTasks.get(index), + tableBroadcast, + expectedSchemaString, + caseSensitive, + localityPreferred)); return readTasks; } @@ -153,28 +166,38 @@ public InputPartition[] planInputPartitions() { public PartitionReaderFactory createReaderFactory() { boolean allParquetFileScanTasks = tasks().stream() - .allMatch(combinedScanTask -> !combinedScanTask.isDataTask() && combinedScanTask.files() - .stream() - .allMatch(fileScanTask -> fileScanTask.file().format().equals( - FileFormat.PARQUET))); + .allMatch( + combinedScanTask -> + !combinedScanTask.isDataTask() + && combinedScanTask.files().stream() + .allMatch( + fileScanTask -> + fileScanTask.file().format().equals(FileFormat.PARQUET))); boolean allOrcFileScanTasks = tasks().stream() - .allMatch(combinedScanTask -> !combinedScanTask.isDataTask() && combinedScanTask.files() - .stream() - .allMatch(fileScanTask -> fileScanTask.file().format().equals( - FileFormat.ORC))); + .allMatch( + combinedScanTask -> + !combinedScanTask.isDataTask() + && combinedScanTask.files().stream() + .allMatch( + fileScanTask -> + fileScanTask.file().format().equals(FileFormat.ORC))); boolean atLeastOneColumn = expectedSchema.columns().size() > 0; - boolean onlyPrimitives = expectedSchema.columns().stream().allMatch(c -> c.type().isPrimitiveType()); + boolean onlyPrimitives = + expectedSchema.columns().stream().allMatch(c -> c.type().isPrimitiveType()); boolean hasNoDeleteFiles = tasks().stream().noneMatch(TableScanUtil::hasDeletes); boolean batchReadsEnabled = batchReadsEnabled(allParquetFileScanTasks, allOrcFileScanTasks); - boolean readUsingBatch = batchReadsEnabled && hasNoDeleteFiles && (allOrcFileScanTasks || - (allParquetFileScanTasks && atLeastOneColumn && onlyPrimitives)); + boolean readUsingBatch = + batchReadsEnabled + && hasNoDeleteFiles + && (allOrcFileScanTasks + || (allParquetFileScanTasks && atLeastOneColumn && onlyPrimitives)); int batchSize = readUsingBatch ? 
batchSize(allParquetFileScanTasks, allOrcFileScanTasks) : 0; @@ -208,14 +231,16 @@ public Statistics estimateStatistics() { return new Stats(0L, 0L); } - // estimate stats using snapshot summary only for partitioned tables (metadata tables are unpartitioned) + // estimate stats using snapshot summary only for partitioned tables (metadata tables are + // unpartitioned) if (!table.spec().isUnpartitioned() && filterExpressions.isEmpty()) { LOG.debug("using table metadata to estimate table statistics"); - long totalRecords = PropertyUtil.propertyAsLong(table.currentSnapshot().summary(), - SnapshotSummary.TOTAL_RECORDS_PROP, Long.MAX_VALUE); - return new Stats( - SparkSchemaUtil.estimateSize(readSchema(), totalRecords), - totalRecords); + long totalRecords = + PropertyUtil.propertyAsLong( + table.currentSnapshot().summary(), + SnapshotSummary.TOTAL_RECORDS_PROP, + Long.MAX_VALUE); + return new Stats(SparkSchemaUtil.estimateSize(readSchema(), totalRecords), totalRecords); } long numRows = 0L; @@ -234,7 +259,8 @@ public Statistics estimateStatistics() { @Override public String description() { - String filters = filterExpressions.stream().map(Spark3Util::describe).collect(Collectors.joining(", ")); + String filters = + filterExpressions.stream().map(Spark3Util::describe).collect(Collectors.joining(", ")); return String.format("%s [filters=%s]", table, filters); } @@ -275,7 +301,8 @@ private static class RowReader extends RowDataReader implements PartitionReader< } } - private static class BatchReader extends BatchDataReader implements PartitionReader { + private static class BatchReader extends BatchDataReader + implements PartitionReader { BatchReader(ReadTask task, int batchSize) { super(task.task, task.table(), task.expectedSchema(), task.isCaseSensitive(), batchSize); } @@ -290,8 +317,12 @@ static class ReadTask implements InputPartition, Serializable { private transient Schema expectedSchema = null; private transient String[] preferredLocations = null; - ReadTask(CombinedScanTask task, Broadcast
    tableBroadcast, String expectedSchemaString, - boolean caseSensitive, boolean localityPreferred) { + ReadTask( + CombinedScanTask task, + Broadcast
    tableBroadcast, + String expectedSchemaString, + boolean caseSensitive, + boolean localityPreferred) { this.task = task; this.tableBroadcast = tableBroadcast; this.expectedSchemaString = expectedSchemaString; diff --git a/spark/v3.1/spark/src/main/java/org/apache/iceberg/spark/source/SparkFileWriterFactory.java b/spark/v3.1/spark/src/main/java/org/apache/iceberg/spark/source/SparkFileWriterFactory.java index beaa7c295024..a8c894bfc50c 100644 --- a/spark/v3.1/spark/src/main/java/org/apache/iceberg/spark/source/SparkFileWriterFactory.java +++ b/spark/v3.1/spark/src/main/java/org/apache/iceberg/spark/source/SparkFileWriterFactory.java @@ -16,9 +16,13 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.source; +import static org.apache.iceberg.MetadataColumns.DELETE_FILE_ROW_FIELD_NAME; +import static org.apache.iceberg.TableProperties.DEFAULT_FILE_FORMAT; +import static org.apache.iceberg.TableProperties.DEFAULT_FILE_FORMAT_DEFAULT; +import static org.apache.iceberg.TableProperties.DELETE_DEFAULT_FILE_FORMAT; + import java.util.Locale; import java.util.Map; import org.apache.iceberg.FileFormat; @@ -40,24 +44,35 @@ import org.apache.spark.sql.types.StructType; import org.apache.spark.unsafe.types.UTF8String; -import static org.apache.iceberg.MetadataColumns.DELETE_FILE_ROW_FIELD_NAME; -import static org.apache.iceberg.TableProperties.DEFAULT_FILE_FORMAT; -import static org.apache.iceberg.TableProperties.DEFAULT_FILE_FORMAT_DEFAULT; -import static org.apache.iceberg.TableProperties.DELETE_DEFAULT_FILE_FORMAT; - class SparkFileWriterFactory extends BaseFileWriterFactory { private StructType dataSparkType; private StructType equalityDeleteSparkType; private StructType positionDeleteSparkType; - SparkFileWriterFactory(Table table, FileFormat dataFileFormat, Schema dataSchema, StructType dataSparkType, - SortOrder dataSortOrder, FileFormat deleteFileFormat, - int[] equalityFieldIds, Schema equalityDeleteRowSchema, StructType equalityDeleteSparkType, - SortOrder equalityDeleteSortOrder, Schema positionDeleteRowSchema, - StructType positionDeleteSparkType) { - - super(table, dataFileFormat, dataSchema, dataSortOrder, deleteFileFormat, equalityFieldIds, - equalityDeleteRowSchema, equalityDeleteSortOrder, positionDeleteRowSchema); + SparkFileWriterFactory( + Table table, + FileFormat dataFileFormat, + Schema dataSchema, + StructType dataSparkType, + SortOrder dataSortOrder, + FileFormat deleteFileFormat, + int[] equalityFieldIds, + Schema equalityDeleteRowSchema, + StructType equalityDeleteSparkType, + SortOrder equalityDeleteSortOrder, + Schema positionDeleteRowSchema, + StructType positionDeleteSparkType) { + + super( + table, + dataFileFormat, + dataSchema, + dataSortOrder, + deleteFileFormat, + equalityFieldIds, + equalityDeleteRowSchema, + equalityDeleteSortOrder, + positionDeleteRowSchema); this.dataSparkType = dataSparkType; this.equalityDeleteSparkType = equalityDeleteSparkType; @@ -80,7 +95,8 @@ protected void configureEqualityDelete(Avro.DeleteWriteBuilder builder) { @Override protected void configurePositionDelete(Avro.DeleteWriteBuilder builder) { - boolean withRow = positionDeleteSparkType().getFieldIndex(DELETE_FILE_ROW_FIELD_NAME).isDefined(); + boolean withRow = + positionDeleteSparkType().getFieldIndex(DELETE_FILE_ROW_FIELD_NAME).isDefined(); if (withRow) { // SparkAvroWriter accepts just the Spark type of the row ignoring the path and pos StructField rowField = 
positionDeleteSparkType().apply(DELETE_FILE_ROW_FIELD_NAME); @@ -96,12 +112,14 @@ protected void configureDataWrite(Parquet.DataWriteBuilder builder) { @Override protected void configureEqualityDelete(Parquet.DeleteWriteBuilder builder) { - builder.createWriterFunc(msgType -> SparkParquetWriters.buildWriter(equalityDeleteSparkType(), msgType)); + builder.createWriterFunc( + msgType -> SparkParquetWriters.buildWriter(equalityDeleteSparkType(), msgType)); } @Override protected void configurePositionDelete(Parquet.DeleteWriteBuilder builder) { - builder.createWriterFunc(msgType -> SparkParquetWriters.buildWriter(positionDeleteSparkType(), msgType)); + builder.createWriterFunc( + msgType -> SparkParquetWriters.buildWriter(positionDeleteSparkType(), msgType)); builder.transformPaths(path -> UTF8String.fromString(path.toString())); } @@ -132,7 +150,8 @@ private StructType dataSparkType() { private StructType equalityDeleteSparkType() { if (equalityDeleteSparkType == null) { - Preconditions.checkNotNull(equalityDeleteRowSchema(), "Equality delete schema must not be null"); + Preconditions.checkNotNull( + equalityDeleteRowSchema(), "Equality delete schema must not be null"); this.equalityDeleteSparkType = SparkSchemaUtil.convert(equalityDeleteRowSchema()); } @@ -141,7 +160,8 @@ private StructType equalityDeleteSparkType() { private StructType positionDeleteSparkType() { if (positionDeleteSparkType == null) { - // wrap the optional row schema into the position delete schema that contains path and position + // wrap the optional row schema into the position delete schema that contains path and + // position Schema positionDeleteSchema = DeleteSchemaUtil.posDeleteSchema(positionDeleteRowSchema()); this.positionDeleteSparkType = SparkSchemaUtil.convert(positionDeleteSchema); } @@ -168,10 +188,12 @@ static class Builder { Map properties = table.properties(); - String dataFileFormatName = properties.getOrDefault(DEFAULT_FILE_FORMAT, DEFAULT_FILE_FORMAT_DEFAULT); + String dataFileFormatName = + properties.getOrDefault(DEFAULT_FILE_FORMAT, DEFAULT_FILE_FORMAT_DEFAULT); this.dataFileFormat = FileFormat.valueOf(dataFileFormatName.toUpperCase(Locale.ENGLISH)); - String deleteFileFormatName = properties.getOrDefault(DELETE_DEFAULT_FILE_FORMAT, dataFileFormatName); + String deleteFileFormatName = + properties.getOrDefault(DELETE_DEFAULT_FILE_FORMAT, dataFileFormatName); this.deleteFileFormat = FileFormat.valueOf(deleteFileFormatName.toUpperCase(Locale.ENGLISH)); } @@ -233,13 +255,23 @@ Builder positionDeleteSparkType(StructType newPositionDeleteSparkType) { SparkFileWriterFactory build() { boolean noEqualityDeleteConf = equalityFieldIds == null && equalityDeleteRowSchema == null; boolean fullEqualityDeleteConf = equalityFieldIds != null && equalityDeleteRowSchema != null; - Preconditions.checkArgument(noEqualityDeleteConf || fullEqualityDeleteConf, + Preconditions.checkArgument( + noEqualityDeleteConf || fullEqualityDeleteConf, "Equality field IDs and equality delete row schema must be set together"); return new SparkFileWriterFactory( - table, dataFileFormat, dataSchema, dataSparkType, dataSortOrder, deleteFileFormat, - equalityFieldIds, equalityDeleteRowSchema, equalityDeleteSparkType, equalityDeleteSortOrder, - positionDeleteRowSchema, positionDeleteSparkType); + table, + dataFileFormat, + dataSchema, + dataSparkType, + dataSortOrder, + deleteFileFormat, + equalityFieldIds, + equalityDeleteRowSchema, + equalityDeleteSparkType, + equalityDeleteSortOrder, + positionDeleteRowSchema, + positionDeleteSparkType); 
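As a side note on the format resolution in SparkFileWriterFactory.Builder above: data files use write.format.default, and delete files fall back to that value unless write.delete.format.default is set. A small sketch of configuring both through the table API (the table handle is assumed to be an already-loaded Iceberg table; the format values are illustrative):

import org.apache.iceberg.Table;
import org.apache.iceberg.TableProperties;

public class WriteFormatPropertiesSketch {
  // Sets the default data and delete file formats on an existing table.
  static void configureFormats(Table table) {
    table
        .updateProperties()
        .set(TableProperties.DEFAULT_FILE_FORMAT, "parquet") // write.format.default
        .set(TableProperties.DELETE_DEFAULT_FILE_FORMAT, "avro") // write.delete.format.default
        .commit();
  }
}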
} } } diff --git a/spark/v3.1/spark/src/main/java/org/apache/iceberg/spark/source/SparkFilesScan.java b/spark/v3.1/spark/src/main/java/org/apache/iceberg/spark/source/SparkFilesScan.java index bdb162fb553c..4eb36b67ea40 100644 --- a/spark/v3.1/spark/src/main/java/org/apache/iceberg/spark/source/SparkFilesScan.java +++ b/spark/v3.1/spark/src/main/java/org/apache/iceberg/spark/source/SparkFilesScan.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.source; import java.util.List; @@ -43,8 +42,12 @@ class SparkFilesScan extends SparkBatchScan { private List tasks = null; // lazy cache of tasks - SparkFilesScan(SparkSession spark, Table table, SparkReadConf readConf, - boolean caseSensitive, CaseInsensitiveStringMap options) { + SparkFilesScan( + SparkSession spark, + Table table, + SparkReadConf readConf, + boolean caseSensitive, + CaseInsensitiveStringMap options) { super(spark, table, readConf, caseSensitive, table.schema(), ImmutableList.of(), options); this.taskSetID = options.get(SparkReadOptions.FILE_SCAN_TASK_SET_ID); @@ -58,16 +61,18 @@ protected List tasks() { if (tasks == null) { FileScanTaskSetManager taskSetManager = FileScanTaskSetManager.get(); List files = taskSetManager.fetchTasks(table(), taskSetID); - ValidationException.check(files != null, + ValidationException.check( + files != null, "Task set manager has no tasks for table %s with id %s", - table(), taskSetID); + table(), + taskSetID); - CloseableIterable splitFiles = TableScanUtil.splitFiles( - CloseableIterable.withNoopClose(files), - splitSize); - CloseableIterable scanTasks = TableScanUtil.planTasks( - splitFiles, splitSize, - splitLookback, splitOpenFileCost); + CloseableIterable splitFiles = + TableScanUtil.splitFiles(CloseableIterable.withNoopClose(files), splitSize); + CloseableIterable scanTasks = + TableScanUtil.planTasks( + splitFiles, splitSize, + splitLookback, splitOpenFileCost); this.tasks = Lists.newArrayList(scanTasks); } @@ -85,11 +90,11 @@ public boolean equals(Object other) { } SparkFilesScan that = (SparkFilesScan) other; - return table().name().equals(that.table().name()) && - Objects.equals(taskSetID, that.taskSetID) && - Objects.equals(splitSize, that.splitSize) && - Objects.equals(splitLookback, that.splitLookback) && - Objects.equals(splitOpenFileCost, that.splitOpenFileCost); + return table().name().equals(that.table().name()) + && Objects.equals(taskSetID, that.taskSetID) + && Objects.equals(splitSize, that.splitSize) + && Objects.equals(splitLookback, that.splitLookback) + && Objects.equals(splitOpenFileCost, that.splitOpenFileCost); } @Override diff --git a/spark/v3.1/spark/src/main/java/org/apache/iceberg/spark/source/SparkFilesScanBuilder.java b/spark/v3.1/spark/src/main/java/org/apache/iceberg/spark/source/SparkFilesScanBuilder.java index fc0414b0682f..029585caf944 100644 --- a/spark/v3.1/spark/src/main/java/org/apache/iceberg/spark/source/SparkFilesScanBuilder.java +++ b/spark/v3.1/spark/src/main/java/org/apache/iceberg/spark/source/SparkFilesScanBuilder.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.spark.source; import org.apache.iceberg.Table; diff --git a/spark/v3.1/spark/src/main/java/org/apache/iceberg/spark/source/SparkMergeBuilder.java b/spark/v3.1/spark/src/main/java/org/apache/iceberg/spark/source/SparkMergeBuilder.java index ade6c2f3ddee..24cd831567d1 100644 --- a/spark/v3.1/spark/src/main/java/org/apache/iceberg/spark/source/SparkMergeBuilder.java +++ b/spark/v3.1/spark/src/main/java/org/apache/iceberg/spark/source/SparkMergeBuilder.java @@ -16,9 +16,15 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.source; +import static org.apache.iceberg.TableProperties.DELETE_ISOLATION_LEVEL; +import static org.apache.iceberg.TableProperties.DELETE_ISOLATION_LEVEL_DEFAULT; +import static org.apache.iceberg.TableProperties.MERGE_ISOLATION_LEVEL; +import static org.apache.iceberg.TableProperties.MERGE_ISOLATION_LEVEL_DEFAULT; +import static org.apache.iceberg.TableProperties.UPDATE_ISOLATION_LEVEL; +import static org.apache.iceberg.TableProperties.UPDATE_ISOLATION_LEVEL_DEFAULT; + import java.util.Locale; import java.util.Map; import org.apache.iceberg.IsolationLevel; @@ -31,13 +37,6 @@ import org.apache.spark.sql.connector.write.LogicalWriteInfo; import org.apache.spark.sql.connector.write.WriteBuilder; -import static org.apache.iceberg.TableProperties.DELETE_ISOLATION_LEVEL; -import static org.apache.iceberg.TableProperties.DELETE_ISOLATION_LEVEL_DEFAULT; -import static org.apache.iceberg.TableProperties.MERGE_ISOLATION_LEVEL; -import static org.apache.iceberg.TableProperties.MERGE_ISOLATION_LEVEL_DEFAULT; -import static org.apache.iceberg.TableProperties.UPDATE_ISOLATION_LEVEL; -import static org.apache.iceberg.TableProperties.UPDATE_ISOLATION_LEVEL_DEFAULT; - class SparkMergeBuilder implements MergeBuilder { private final SparkSession spark; @@ -60,11 +59,14 @@ class SparkMergeBuilder implements MergeBuilder { private IsolationLevel getIsolationLevel(Map props, String operation) { String isolationLevelAsString; if (operation.equalsIgnoreCase("delete")) { - isolationLevelAsString = props.getOrDefault(DELETE_ISOLATION_LEVEL, DELETE_ISOLATION_LEVEL_DEFAULT); + isolationLevelAsString = + props.getOrDefault(DELETE_ISOLATION_LEVEL, DELETE_ISOLATION_LEVEL_DEFAULT); } else if (operation.equalsIgnoreCase("update")) { - isolationLevelAsString = props.getOrDefault(UPDATE_ISOLATION_LEVEL, UPDATE_ISOLATION_LEVEL_DEFAULT); + isolationLevelAsString = + props.getOrDefault(UPDATE_ISOLATION_LEVEL, UPDATE_ISOLATION_LEVEL_DEFAULT); } else if (operation.equalsIgnoreCase("merge")) { - isolationLevelAsString = props.getOrDefault(MERGE_ISOLATION_LEVEL, MERGE_ISOLATION_LEVEL_DEFAULT); + isolationLevelAsString = + props.getOrDefault(MERGE_ISOLATION_LEVEL, MERGE_ISOLATION_LEVEL_DEFAULT); } else { throw new IllegalArgumentException("Unsupported operation: " + operation); } @@ -78,14 +80,15 @@ public ScanBuilder asScanBuilder() { private ScanBuilder scanBuilder() { if (lazyScanBuilder == null) { - SparkScanBuilder scanBuilder = new SparkScanBuilder(spark, table, writeInfo.options()) { - @Override - public Scan build() { - Scan scan = super.buildMergeScan(); - SparkMergeBuilder.this.configuredScan = scan; - return scan; - } - }; + SparkScanBuilder scanBuilder = + new SparkScanBuilder(spark, table, writeInfo.options()) { + @Override + public Scan build() { + Scan scan = super.buildMergeScan(); + SparkMergeBuilder.this.configuredScan = scan; + return scan; + } + }; // ignore residuals to ensure we read full files 
lazyScanBuilder = scanBuilder.ignoreResiduals(); } diff --git a/spark/v3.1/spark/src/main/java/org/apache/iceberg/spark/source/SparkMergeScan.java b/spark/v3.1/spark/src/main/java/org/apache/iceberg/spark/source/SparkMergeScan.java index 924481f377c1..8bc3f7d049cf 100644 --- a/spark/v3.1/spark/src/main/java/org/apache/iceberg/spark/source/SparkMergeScan.java +++ b/spark/v3.1/spark/src/main/java/org/apache/iceberg/spark/source/SparkMergeScan.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.source; import java.io.IOException; @@ -59,9 +58,15 @@ class SparkMergeScan extends SparkBatchScan implements SupportsFileFilter { private List tasks = null; // lazy cache of tasks private Set filteredLocations = null; - SparkMergeScan(SparkSession spark, Table table, SparkReadConf readConf, - boolean caseSensitive, boolean ignoreResiduals, - Schema expectedSchema, List filters, CaseInsensitiveStringMap options) { + SparkMergeScan( + SparkSession spark, + Table table, + SparkReadConf readConf, + boolean caseSensitive, + boolean ignoreResiduals, + Schema expectedSchema, + List filters, + CaseInsensitiveStringMap options) { super(spark, table, readConf, caseSensitive, expectedSchema, filters, options); @@ -72,7 +77,8 @@ class SparkMergeScan extends SparkBatchScan implements SupportsFileFilter { this.splitLookback = readConf.splitLookback(); this.splitOpenFileCost = readConf.splitOpenFileCost(); - Preconditions.checkArgument(!options.containsKey(SparkReadOptions.SNAPSHOT_ID), "Can't set snapshot-id in options"); + Preconditions.checkArgument( + !options.containsKey(SparkReadOptions.SNAPSHOT_ID), "Can't set snapshot-id in options"); Snapshot currentSnapshot = table.currentSnapshot(); this.snapshotId = currentSnapshot != null ? 
currentSnapshot.snapshotId() : null; @@ -98,20 +104,22 @@ public SupportsFileFilter.FileFilterMetric filterFiles(Set locations) { tasks = null; filteredLocations = locations; List originalFile = files(); - files = originalFile.stream() - .filter(file -> filteredLocations.contains(file.file().path().toString())) - .collect(Collectors.toList()); + files = + originalFile.stream() + .filter(file -> filteredLocations.contains(file.file().path().toString())) + .collect(Collectors.toList()); return new SupportsFileFilter.FileFilterMetric(originalFile.size(), files.size()); } // should be accessible to the write synchronized List files() { if (files == null) { - TableScan scan = table - .newScan() - .caseSensitive(caseSensitive()) - .useSnapshot(snapshotId) - .project(expectedSchema); + TableScan scan = + table + .newScan() + .caseSensitive(caseSensitive()) + .useSnapshot(snapshotId) + .project(expectedSchema); for (Expression filter : filterExpressions()) { scan = scan.filter(filter); @@ -134,12 +142,12 @@ synchronized List files() { @Override protected synchronized List tasks() { if (tasks == null) { - CloseableIterable splitFiles = TableScanUtil.splitFiles( - CloseableIterable.withNoopClose(files()), - splitSize); - CloseableIterable scanTasks = TableScanUtil.planTasks( - splitFiles, splitSize, - splitLookback, splitOpenFileCost); + CloseableIterable splitFiles = + TableScanUtil.splitFiles(CloseableIterable.withNoopClose(files()), splitSize); + CloseableIterable scanTasks = + TableScanUtil.planTasks( + splitFiles, splitSize, + splitLookback, splitOpenFileCost); tasks = Lists.newArrayList(scanTasks); } @@ -157,19 +165,24 @@ public boolean equals(Object o) { } SparkMergeScan that = (SparkMergeScan) o; - return table().name().equals(that.table().name()) && - readSchema().equals(that.readSchema()) && // compare Spark schemas to ignore field ids - filterExpressions().toString().equals(that.filterExpressions().toString()) && - ignoreResiduals == that.ignoreResiduals && - Objects.equals(snapshotId, that.snapshotId) && - Objects.equals(filteredLocations, that.filteredLocations); + return table().name().equals(that.table().name()) + && readSchema().equals(that.readSchema()) + && // compare Spark schemas to ignore field ids + filterExpressions().toString().equals(that.filterExpressions().toString()) + && ignoreResiduals == that.ignoreResiduals + && Objects.equals(snapshotId, that.snapshotId) + && Objects.equals(filteredLocations, that.filteredLocations); } @Override public int hashCode() { return Objects.hash( - table().name(), readSchema(), filterExpressions().toString(), - ignoreResiduals, snapshotId, filteredLocations); + table().name(), + readSchema(), + filterExpressions().toString(), + ignoreResiduals, + snapshotId, + filteredLocations); } @Override diff --git a/spark/v3.1/spark/src/main/java/org/apache/iceberg/spark/source/SparkMicroBatchStream.java b/spark/v3.1/spark/src/main/java/org/apache/iceberg/spark/source/SparkMicroBatchStream.java index 5c536cb59299..5346b5267c1d 100644 --- a/spark/v3.1/spark/src/main/java/org/apache/iceberg/spark/source/SparkMicroBatchStream.java +++ b/spark/v3.1/spark/src/main/java/org/apache/iceberg/spark/source/SparkMicroBatchStream.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.spark.source; import java.io.BufferedWriter; @@ -79,8 +78,13 @@ public class SparkMicroBatchStream implements MicroBatchStream { private final boolean skipOverwrite; private final Long fromTimestamp; - SparkMicroBatchStream(JavaSparkContext sparkContext, Table table, SparkReadConf readConf, boolean caseSensitive, - Schema expectedSchema, String checkpointLocation) { + SparkMicroBatchStream( + JavaSparkContext sparkContext, + Table table, + SparkReadConf readConf, + boolean caseSensitive, + Schema expectedSchema, + String checkpointLocation) { this.table = table; this.caseSensitive = caseSensitive; this.expectedSchema = SchemaParser.toJson(expectedSchema); @@ -91,7 +95,8 @@ public class SparkMicroBatchStream implements MicroBatchStream { this.splitOpenFileCost = readConf.splitOpenFileCost(); this.fromTimestamp = readConf.streamFromTimestamp(); - InitialOffsetStore initialOffsetStore = new InitialOffsetStore(table, checkpointLocation, fromTimestamp); + InitialOffsetStore initialOffsetStore = + new InitialOffsetStore(table, checkpointLocation, fromTimestamp); this.initialOffset = initialOffsetStore.initialOffset(); this.skipDelete = readConf.streamingSkipDeleteSnapshots(); @@ -111,14 +116,19 @@ public Offset latestOffset() { Snapshot latestSnapshot = table.currentSnapshot(); return new StreamingOffset( - latestSnapshot.snapshotId(), Iterables.size(latestSnapshot.addedDataFiles(table.io())), false); + latestSnapshot.snapshotId(), + Iterables.size(latestSnapshot.addedDataFiles(table.io())), + false); } @Override public InputPartition[] planInputPartitions(Offset start, Offset end) { - Preconditions.checkArgument(end instanceof StreamingOffset, "Invalid end offset: %s is not a StreamingOffset", end); Preconditions.checkArgument( - start instanceof StreamingOffset, "Invalid start offset: %s is not a StreamingOffset", start); + end instanceof StreamingOffset, "Invalid end offset: %s is not a StreamingOffset", end); + Preconditions.checkArgument( + start instanceof StreamingOffset, + "Invalid start offset: %s is not a StreamingOffset", + start); if (end.equals(StreamingOffset.START_OFFSET)) { return new InputPartition[0]; @@ -129,19 +139,25 @@ public InputPartition[] planInputPartitions(Offset start, Offset end) { List fileScanTasks = planFiles(startOffset, endOffset); - CloseableIterable splitTasks = TableScanUtil.splitFiles( - CloseableIterable.withNoopClose(fileScanTasks), - splitSize); - List combinedScanTasks = Lists.newArrayList( - TableScanUtil.planTasks(splitTasks, splitSize, splitLookback, splitOpenFileCost)); + CloseableIterable splitTasks = + TableScanUtil.splitFiles(CloseableIterable.withNoopClose(fileScanTasks), splitSize); + List combinedScanTasks = + Lists.newArrayList( + TableScanUtil.planTasks(splitTasks, splitSize, splitLookback, splitOpenFileCost)); InputPartition[] readTasks = new InputPartition[combinedScanTasks.size()]; Tasks.range(readTasks.length) .stopOnFailure() .executeWith(localityPreferred ? 
ThreadPools.getWorkerPool() : null) - .run(index -> readTasks[index] = new ReadTask( - combinedScanTasks.get(index), tableBroadcast, expectedSchema, - caseSensitive, localityPreferred)); + .run( + index -> + readTasks[index] = + new ReadTask( + combinedScanTasks.get(index), + tableBroadcast, + expectedSchema, + caseSensitive, + localityPreferred)); return readTasks; } @@ -162,17 +178,17 @@ public Offset deserializeOffset(String json) { } @Override - public void commit(Offset end) { - } + public void commit(Offset end) {} @Override - public void stop() { - } + public void stop() {} private List planFiles(StreamingOffset startOffset, StreamingOffset endOffset) { List fileScanTasks = Lists.newArrayList(); - StreamingOffset batchStartOffset = StreamingOffset.START_OFFSET.equals(startOffset) ? - determineStartingOffset(table, fromTimestamp) : startOffset; + StreamingOffset batchStartOffset = + StreamingOffset.START_OFFSET.equals(startOffset) + ? determineStartingOffset(table, fromTimestamp) + : startOffset; StreamingOffset currentOffset = null; @@ -189,10 +205,12 @@ private List planFiles(StreamingOffset startOffset, StreamingOffse continue; } - MicroBatch latestMicroBatch = MicroBatches.from(table.snapshot(currentOffset.snapshotId()), table.io()) - .caseSensitive(caseSensitive) - .specsById(table.specs()) - .generate(currentOffset.position(), Long.MAX_VALUE, currentOffset.shouldScanAllFiles()); + MicroBatch latestMicroBatch = + MicroBatches.from(table.snapshot(currentOffset.snapshotId()), table.io()) + .caseSensitive(caseSensitive) + .specsById(table.specs()) + .generate( + currentOffset.position(), Long.MAX_VALUE, currentOffset.shouldScanAllFiles()); fileScanTasks.addAll(latestMicroBatch.tasks()); } while (currentOffset.snapshotId() != endOffset.snapshotId()); @@ -208,19 +226,24 @@ private boolean shouldProcess(Snapshot snapshot) { case DataOperations.REPLACE: return false; case DataOperations.DELETE: - Preconditions.checkState(skipDelete, + Preconditions.checkState( + skipDelete, "Cannot process delete snapshot: %s, to ignore deletes, set %s=true", - snapshot.snapshotId(), SparkReadOptions.STREAMING_SKIP_DELETE_SNAPSHOTS); + snapshot.snapshotId(), + SparkReadOptions.STREAMING_SKIP_DELETE_SNAPSHOTS); return false; case DataOperations.OVERWRITE: - Preconditions.checkState(skipOverwrite, + Preconditions.checkState( + skipOverwrite, "Cannot process overwrite snapshot: %s, to ignore overwrites, set %s=true", - snapshot.snapshotId(), SparkReadOptions.STREAMING_SKIP_OVERWRITE_SNAPSHOTS); + snapshot.snapshotId(), + SparkReadOptions.STREAMING_SKIP_OVERWRITE_SNAPSHOTS); return false; default: - throw new IllegalStateException(String.format( - "Cannot process unknown snapshot operation: %s (snapshot id %s)", - op.toLowerCase(Locale.ROOT), snapshot.snapshotId())); + throw new IllegalStateException( + String.format( + "Cannot process unknown snapshot operation: %s (snapshot id %s)", + op.toLowerCase(Locale.ROOT), snapshot.snapshotId())); } } @@ -281,7 +304,8 @@ public StreamingOffset initialOffset() { private void writeOffset(StreamingOffset offset, OutputFile file) { try (OutputStream outputStream = file.create()) { - BufferedWriter writer = new BufferedWriter(new OutputStreamWriter(outputStream, StandardCharsets.UTF_8)); + BufferedWriter writer = + new BufferedWriter(new OutputStreamWriter(outputStream, StandardCharsets.UTF_8)); writer.write(offset.json()); writer.flush(); } catch (IOException ioException) { diff --git 
a/spark/v3.1/spark/src/main/java/org/apache/iceberg/spark/source/SparkPartitionedFanoutWriter.java b/spark/v3.1/spark/src/main/java/org/apache/iceberg/spark/source/SparkPartitionedFanoutWriter.java index d38ae2f40316..f17cd260f928 100644 --- a/spark/v3.1/spark/src/main/java/org/apache/iceberg/spark/source/SparkPartitionedFanoutWriter.java +++ b/spark/v3.1/spark/src/main/java/org/apache/iceberg/spark/source/SparkPartitionedFanoutWriter.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.source; import org.apache.iceberg.FileFormat; @@ -34,10 +33,15 @@ public class SparkPartitionedFanoutWriter extends PartitionedFanoutWriter appenderFactory, - OutputFileFactory fileFactory, FileIO io, long targetFileSize, - Schema schema, StructType sparkSchema) { + OutputFileFactory fileFactory, + FileIO io, + long targetFileSize, + Schema schema, + StructType sparkSchema) { super(spec, format, appenderFactory, fileFactory, io, targetFileSize); this.partitionKey = new PartitionKey(spec, schema); this.internalRowWrapper = new InternalRowWrapper(sparkSchema); diff --git a/spark/v3.1/spark/src/main/java/org/apache/iceberg/spark/source/SparkPartitionedWriter.java b/spark/v3.1/spark/src/main/java/org/apache/iceberg/spark/source/SparkPartitionedWriter.java index f81a09926d85..a86091644360 100644 --- a/spark/v3.1/spark/src/main/java/org/apache/iceberg/spark/source/SparkPartitionedWriter.java +++ b/spark/v3.1/spark/src/main/java/org/apache/iceberg/spark/source/SparkPartitionedWriter.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.source; import org.apache.iceberg.FileFormat; @@ -34,10 +33,15 @@ public class SparkPartitionedWriter extends PartitionedWriter { private final PartitionKey partitionKey; private final InternalRowWrapper internalRowWrapper; - public SparkPartitionedWriter(PartitionSpec spec, FileFormat format, - FileAppenderFactory appenderFactory, - OutputFileFactory fileFactory, FileIO io, long targetFileSize, - Schema schema, StructType sparkSchema) { + public SparkPartitionedWriter( + PartitionSpec spec, + FileFormat format, + FileAppenderFactory appenderFactory, + OutputFileFactory fileFactory, + FileIO io, + long targetFileSize, + Schema schema, + StructType sparkSchema) { super(spec, format, appenderFactory, fileFactory, io, targetFileSize); this.partitionKey = new PartitionKey(spec, schema); this.internalRowWrapper = new InternalRowWrapper(sparkSchema); diff --git a/spark/v3.1/spark/src/main/java/org/apache/iceberg/spark/source/SparkRewriteBuilder.java b/spark/v3.1/spark/src/main/java/org/apache/iceberg/spark/source/SparkRewriteBuilder.java index 58373f7e6a4b..be4bd3e2c7d1 100644 --- a/spark/v3.1/spark/src/main/java/org/apache/iceberg/spark/source/SparkRewriteBuilder.java +++ b/spark/v3.1/spark/src/main/java/org/apache/iceberg/spark/source/SparkRewriteBuilder.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.spark.source; import org.apache.iceberg.Schema; @@ -54,16 +53,19 @@ public SparkRewriteBuilder(SparkSession spark, Table table, LogicalWriteInfo inf @Override public BatchWrite buildForBatch() { - Preconditions.checkArgument(handleTimestampWithoutZone || !SparkUtil.hasTimestampWithoutZone(table.schema()), + Preconditions.checkArgument( + handleTimestampWithoutZone || !SparkUtil.hasTimestampWithoutZone(table.schema()), SparkUtil.TIMESTAMP_WITHOUT_TIMEZONE_ERROR); Schema writeSchema = SparkSchemaUtil.convert(table.schema(), dsSchema); - TypeUtil.validateWriteSchema(table.schema(), writeSchema, writeConf.checkNullability(), writeConf.checkOrdering()); + TypeUtil.validateWriteSchema( + table.schema(), writeSchema, writeConf.checkNullability(), writeConf.checkOrdering()); SparkUtil.validatePartitionTransforms(table.spec()); String appId = spark.sparkContext().applicationId(); - SparkWrite write = new SparkWrite(spark, table, writeConf, writeInfo, appId, writeSchema, dsSchema); + SparkWrite write = + new SparkWrite(spark, table, writeConf, writeInfo, appId, writeSchema, dsSchema); return write.asRewrite(fileSetID); } } diff --git a/spark/v3.1/spark/src/main/java/org/apache/iceberg/spark/source/SparkScanBuilder.java b/spark/v3.1/spark/src/main/java/org/apache/iceberg/spark/source/SparkScanBuilder.java index a9b82df80151..708d4378bc1b 100644 --- a/spark/v3.1/spark/src/main/java/org/apache/iceberg/spark/source/SparkScanBuilder.java +++ b/spark/v3.1/spark/src/main/java/org/apache/iceberg/spark/source/SparkScanBuilder.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.source; import java.util.Collections; @@ -48,8 +47,11 @@ import org.apache.spark.sql.types.StructType; import org.apache.spark.sql.util.CaseInsensitiveStringMap; -public class SparkScanBuilder implements ScanBuilder, SupportsPushDownFilters, SupportsPushDownRequiredColumns, - SupportsReportStatistics { +public class SparkScanBuilder + implements ScanBuilder, + SupportsPushDownFilters, + SupportsPushDownRequiredColumns, + SupportsReportStatistics { private static final Filter[] NO_FILTERS = new Filter[0]; @@ -66,7 +68,8 @@ public class SparkScanBuilder implements ScanBuilder, SupportsPushDownFilters, S private Filter[] pushedFilters = NO_FILTERS; private boolean ignoreResiduals = false; - SparkScanBuilder(SparkSession spark, Table table, Schema schema, CaseInsensitiveStringMap options) { + SparkScanBuilder( + SparkSession spark, Table table, Schema schema, CaseInsensitiveStringMap options) { this.spark = spark; this.table = table; this.schema = schema; @@ -129,12 +132,16 @@ public Filter[] pushedFilters() { @Override public void pruneColumns(StructType requestedSchema) { - this.requestedProjection = new StructType(Stream.of(requestedSchema.fields()) - .filter(field -> MetadataColumns.nonMetadataColumn(field.name())) - .toArray(StructField[]::new)); + this.requestedProjection = + new StructType( + Stream.of(requestedSchema.fields()) + .filter(field -> MetadataColumns.nonMetadataColumn(field.name())) + .toArray(StructField[]::new)); - // the projection should include all columns that will be returned, including those only used in filters - this.schema = SparkSchemaUtil.prune(schema, requestedProjection, filterExpression(), caseSensitive); + // the projection should include all columns that will be returned, including those only used in + // filters + this.schema = + SparkSchemaUtil.prune(schema, requestedProjection, 
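To illustrate the comment in pruneColumns() above: a query that projects one column but filters on another still needs the filter column in the pruned read schema so residuals can be evaluated. A minimal sketch, with placeholder table and column names:

import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SparkSession;

public class PruneColumnsSketch {
  public static void main(String[] args) {
    SparkSession spark = SparkSession.builder().appName("prune-columns").getOrCreate();

    // Only 'id' is projected by the query, but 'category' is referenced by the filter,
    // so SparkScanBuilder keeps 'category' in the pruned Iceberg schema.
    Dataset<Row> df =
        spark
            .read()
            .format("iceberg")
            .load("db.sample")
            .filter("category = 'news'")
            .select("id");

    df.explain();
  }
}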
filterExpression(), caseSensitive); Stream.of(requestedSchema.fields()) .map(StructField::name) @@ -150,10 +157,11 @@ public SparkScanBuilder ignoreResiduals() { private Schema schemaWithMetadataColumns() { // metadata columns - List fields = metaColumns.stream() - .distinct() - .map(name -> MetadataColumns.metadataColumn(table, name)) - .collect(Collectors.toList()); + List fields = + metaColumns.stream() + .distinct() + .map(name -> MetadataColumns.metadataColumn(table, name)) + .collect(Collectors.toList()); Schema meta = new Schema(fields); // schema or rows returned by readers @@ -163,13 +171,25 @@ private Schema schemaWithMetadataColumns() { @Override public Scan build() { return new SparkBatchQueryScan( - spark, table, readConf, caseSensitive, schemaWithMetadataColumns(), filterExpressions, options); + spark, + table, + readConf, + caseSensitive, + schemaWithMetadataColumns(), + filterExpressions, + options); } public Scan buildMergeScan() { return new SparkMergeScan( - spark, table, readConf, caseSensitive, ignoreResiduals, - schemaWithMetadataColumns(), filterExpressions, options); + spark, + table, + readConf, + caseSensitive, + ignoreResiduals, + schemaWithMetadataColumns(), + filterExpressions, + options); } @Override diff --git a/spark/v3.1/spark/src/main/java/org/apache/iceberg/spark/source/SparkTable.java b/spark/v3.1/spark/src/main/java/org/apache/iceberg/spark/source/SparkTable.java index 5494d7cffd5a..0f0c86fe89ee 100644 --- a/spark/v3.1/spark/src/main/java/org/apache/iceberg/spark/source/SparkTable.java +++ b/spark/v3.1/spark/src/main/java/org/apache/iceberg/spark/source/SparkTable.java @@ -16,9 +16,15 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.source; +import static org.apache.iceberg.TableProperties.DELETE_MODE; +import static org.apache.iceberg.TableProperties.DELETE_MODE_DEFAULT; +import static org.apache.iceberg.TableProperties.MERGE_MODE; +import static org.apache.iceberg.TableProperties.MERGE_MODE_DEFAULT; +import static org.apache.iceberg.TableProperties.UPDATE_MODE; +import static org.apache.iceberg.TableProperties.UPDATE_MODE_DEFAULT; + import java.util.Map; import java.util.Set; import org.apache.iceberg.Schema; @@ -56,27 +62,25 @@ import org.slf4j.Logger; import org.slf4j.LoggerFactory; -import static org.apache.iceberg.TableProperties.DELETE_MODE; -import static org.apache.iceberg.TableProperties.DELETE_MODE_DEFAULT; -import static org.apache.iceberg.TableProperties.MERGE_MODE; -import static org.apache.iceberg.TableProperties.MERGE_MODE_DEFAULT; -import static org.apache.iceberg.TableProperties.UPDATE_MODE; -import static org.apache.iceberg.TableProperties.UPDATE_MODE_DEFAULT; - -public class SparkTable implements org.apache.spark.sql.connector.catalog.Table, - SupportsRead, SupportsWrite, ExtendedSupportsDelete, SupportsMerge { +public class SparkTable + implements org.apache.spark.sql.connector.catalog.Table, + SupportsRead, + SupportsWrite, + ExtendedSupportsDelete, + SupportsMerge { private static final Logger LOG = LoggerFactory.getLogger(SparkTable.class); private static final Set RESERVED_PROPERTIES = ImmutableSet.of("provider", "format", "current-snapshot-id", "location", "sort-order"); - private static final Set CAPABILITIES = ImmutableSet.of( - TableCapability.BATCH_READ, - TableCapability.BATCH_WRITE, - TableCapability.MICRO_BATCH_READ, - TableCapability.STREAMING_WRITE, - TableCapability.OVERWRITE_BY_FILTER, - TableCapability.OVERWRITE_DYNAMIC); + private static final 
Set CAPABILITIES = + ImmutableSet.of( + TableCapability.BATCH_READ, + TableCapability.BATCH_WRITE, + TableCapability.MICRO_BATCH_READ, + TableCapability.STREAMING_WRITE, + TableCapability.OVERWRITE_BY_FILTER, + TableCapability.OVERWRITE_DYNAMIC); private final Table icebergTable; private final Long snapshotId; @@ -133,12 +137,17 @@ public Transform[] partitioning() { public Map properties() { ImmutableMap.Builder propsBuilder = ImmutableMap.builder(); - String fileFormat = icebergTable.properties() - .getOrDefault(TableProperties.DEFAULT_FILE_FORMAT, TableProperties.DEFAULT_FILE_FORMAT_DEFAULT); + String fileFormat = + icebergTable + .properties() + .getOrDefault( + TableProperties.DEFAULT_FILE_FORMAT, TableProperties.DEFAULT_FILE_FORMAT_DEFAULT); propsBuilder.put("format", "iceberg/" + fileFormat); propsBuilder.put("provider", "iceberg"); - String currentSnapshotId = icebergTable.currentSnapshot() != null ? - String.valueOf(icebergTable.currentSnapshot().snapshotId()) : "none"; + String currentSnapshotId = + icebergTable.currentSnapshot() != null + ? String.valueOf(icebergTable.currentSnapshot().snapshotId()) + : "none"; propsBuilder.put("current-snapshot-id", currentSnapshotId); propsBuilder.put("location", icebergTable.location()); @@ -176,8 +185,7 @@ public ScanBuilder newScanBuilder(CaseInsensitiveStringMap options) { @Override public WriteBuilder newWriteBuilder(LogicalWriteInfo info) { Preconditions.checkArgument( - snapshotId == null, - "Cannot write to table at a specific snapshot: %s", snapshotId); + snapshotId == null, "Cannot write to table at a specific snapshot: %s", snapshotId); if (info.options().containsKey(SparkWriteOptions.REWRITTEN_FILE_SCAN_TASK_SET_ID)) { // replace data files in the given file scan task set with new files @@ -190,7 +198,8 @@ public WriteBuilder newWriteBuilder(LogicalWriteInfo info) { @Override public MergeBuilder newMergeBuilder(String operation, LogicalWriteInfo info) { String mode = getRowLevelOperationMode(operation); - ValidationException.check(mode.equals("copy-on-write"), "Unsupported mode for %s: %s", operation, mode); + ValidationException.check( + mode.equals("copy-on-write"), "Unsupported mode for %s: %s", operation, mode); return new SparkMergeBuilder(sparkSession(), icebergTable, operation, info); } @@ -210,8 +219,7 @@ private String getRowLevelOperationMode(String operation) { @Override public boolean canDeleteWhere(Filter[] filters) { Preconditions.checkArgument( - snapshotId == null, - "Cannot delete from table at a specific snapshot: %s", snapshotId); + snapshotId == null, "Cannot delete from table at a specific snapshot: %s", snapshotId); if (table().specs().size() > 1) { // cannot guarantee a metadata delete will be successful if we have multiple specs @@ -223,7 +231,8 @@ public boolean canDeleteWhere(Filter[] filters) { for (Filter filter : filters) { // return false if the filter requires rewrite or if we cannot translate the filter - if (requiresRewrite(filter, schema, identitySourceIds) || SparkFilters.convert(filter) == null) { + if (requiresRewrite(filter, schema, identitySourceIds) + || SparkFilters.convert(filter) == null) { return false; } } @@ -235,11 +244,13 @@ private boolean requiresRewrite(Filter filter, Schema schema, Set ident // TODO: handle dots correctly via v2references // TODO: detect more cases that don't require rewrites Set filterRefs = Sets.newHashSet(filter.references()); - return filterRefs.stream().anyMatch(ref -> { - Types.NestedField field = schema.findField(ref); - ValidationException.check(field != 
null, "Cannot find field %s in schema", ref); - return !identitySourceIds.contains(field.fieldId()); - }); + return filterRefs.stream() + .anyMatch( + ref -> { + Types.NestedField field = schema.findField(ref); + ValidationException.check(field != null, "Cannot find field %s in schema", ref); + return !identitySourceIds.contains(field.fieldId()); + }); } @Override @@ -251,7 +262,8 @@ public void deleteWhere(Filter[] filters) { return; } - icebergTable.newDelete() + icebergTable + .newDelete() .set("spark.app.id", sparkSession().sparkContext().applicationId()) .deleteFromRowFilter(deleteExpr) .commit(); @@ -281,12 +293,15 @@ public int hashCode() { return icebergTable.name().hashCode(); } - private static CaseInsensitiveStringMap addSnapshotId(CaseInsensitiveStringMap options, Long snapshotId) { + private static CaseInsensitiveStringMap addSnapshotId( + CaseInsensitiveStringMap options, Long snapshotId) { if (snapshotId != null) { String snapshotIdFromOptions = options.get(SparkReadOptions.SNAPSHOT_ID); String value = snapshotId.toString(); - Preconditions.checkArgument(snapshotIdFromOptions == null || snapshotIdFromOptions.equals(value), - "Cannot override snapshot ID more than once: %s", snapshotIdFromOptions); + Preconditions.checkArgument( + snapshotIdFromOptions == null || snapshotIdFromOptions.equals(value), + "Cannot override snapshot ID more than once: %s", + snapshotIdFromOptions); Map scanOptions = Maps.newHashMap(); scanOptions.putAll(options.asCaseSensitiveMap()); diff --git a/spark/v3.1/spark/src/main/java/org/apache/iceberg/spark/source/SparkWrite.java b/spark/v3.1/spark/src/main/java/org/apache/iceberg/spark/source/SparkWrite.java index f771281848ff..3ba40bc88582 100644 --- a/spark/v3.1/spark/src/main/java/org/apache/iceberg/spark/source/SparkWrite.java +++ b/spark/v3.1/spark/src/main/java/org/apache/iceberg/spark/source/SparkWrite.java @@ -16,9 +16,19 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.spark.source; +import static org.apache.iceberg.IsolationLevel.SERIALIZABLE; +import static org.apache.iceberg.IsolationLevel.SNAPSHOT; +import static org.apache.iceberg.TableProperties.COMMIT_MAX_RETRY_WAIT_MS; +import static org.apache.iceberg.TableProperties.COMMIT_MAX_RETRY_WAIT_MS_DEFAULT; +import static org.apache.iceberg.TableProperties.COMMIT_MIN_RETRY_WAIT_MS; +import static org.apache.iceberg.TableProperties.COMMIT_MIN_RETRY_WAIT_MS_DEFAULT; +import static org.apache.iceberg.TableProperties.COMMIT_NUM_RETRIES; +import static org.apache.iceberg.TableProperties.COMMIT_NUM_RETRIES_DEFAULT; +import static org.apache.iceberg.TableProperties.COMMIT_TOTAL_RETRY_TIME_MS; +import static org.apache.iceberg.TableProperties.COMMIT_TOTAL_RETRY_TIME_MS_DEFAULT; + import java.io.IOException; import java.util.Arrays; import java.util.Collections; @@ -80,17 +90,6 @@ import org.slf4j.Logger; import org.slf4j.LoggerFactory; -import static org.apache.iceberg.IsolationLevel.SERIALIZABLE; -import static org.apache.iceberg.IsolationLevel.SNAPSHOT; -import static org.apache.iceberg.TableProperties.COMMIT_MAX_RETRY_WAIT_MS; -import static org.apache.iceberg.TableProperties.COMMIT_MAX_RETRY_WAIT_MS_DEFAULT; -import static org.apache.iceberg.TableProperties.COMMIT_MIN_RETRY_WAIT_MS; -import static org.apache.iceberg.TableProperties.COMMIT_MIN_RETRY_WAIT_MS_DEFAULT; -import static org.apache.iceberg.TableProperties.COMMIT_NUM_RETRIES; -import static org.apache.iceberg.TableProperties.COMMIT_NUM_RETRIES_DEFAULT; -import static org.apache.iceberg.TableProperties.COMMIT_TOTAL_RETRY_TIME_MS; -import static org.apache.iceberg.TableProperties.COMMIT_TOTAL_RETRY_TIME_MS_DEFAULT; - class SparkWrite { private static final Logger LOG = LoggerFactory.getLogger(SparkWrite.class); @@ -108,9 +107,14 @@ class SparkWrite { private boolean cleanupOnAbort = true; - SparkWrite(SparkSession spark, Table table, SparkWriteConf writeConf, - LogicalWriteInfo writeInfo, String applicationId, - Schema writeSchema, StructType dsSchema) { + SparkWrite( + SparkSession spark, + Table table, + SparkWriteConf writeConf, + LogicalWriteInfo writeInfo, + String applicationId, + Schema writeSchema, + StructType dsSchema) { this.sparkContext = JavaSparkContext.fromSparkContext(spark.sparkContext()); this.table = table; this.queryId = writeInfo.queryId(); @@ -153,15 +157,21 @@ StreamingWrite asStreamingOverwrite() { } private boolean isWapTable() { - return Boolean.parseBoolean(table.properties().getOrDefault( - TableProperties.WRITE_AUDIT_PUBLISH_ENABLED, TableProperties.WRITE_AUDIT_PUBLISH_ENABLED_DEFAULT)); + return Boolean.parseBoolean( + table + .properties() + .getOrDefault( + TableProperties.WRITE_AUDIT_PUBLISH_ENABLED, + TableProperties.WRITE_AUDIT_PUBLISH_ENABLED_DEFAULT)); } // the writer factory works for both batch and streaming private WriterFactory createWriterFactory() { // broadcast the table metadata as the writer factory will be sent to executors - Broadcast
    tableBroadcast = sparkContext.broadcast(SerializableTableWithSize.copyOf(table)); - return new WriterFactory(tableBroadcast, format, targetFileSize, writeSchema, dsSchema, partitionedFanoutEnabled); + Broadcast
    tableBroadcast = + sparkContext.broadcast(SerializableTableWithSize.copyOf(table)); + return new WriterFactory( + tableBroadcast, format, targetFileSize, writeSchema, dsSchema, partitionedFanoutEnabled); } private void commitOperation(SnapshotUpdate operation, String description) { @@ -202,24 +212,33 @@ private void abort(WriterCommitMessage[] messages) { Tasks.foreach(files(messages)) .retry(PropertyUtil.propertyAsInt(props, COMMIT_NUM_RETRIES, COMMIT_NUM_RETRIES_DEFAULT)) .exponentialBackoff( - PropertyUtil.propertyAsInt(props, COMMIT_MIN_RETRY_WAIT_MS, COMMIT_MIN_RETRY_WAIT_MS_DEFAULT), - PropertyUtil.propertyAsInt(props, COMMIT_MAX_RETRY_WAIT_MS, COMMIT_MAX_RETRY_WAIT_MS_DEFAULT), - PropertyUtil.propertyAsInt(props, COMMIT_TOTAL_RETRY_TIME_MS, COMMIT_TOTAL_RETRY_TIME_MS_DEFAULT), + PropertyUtil.propertyAsInt( + props, COMMIT_MIN_RETRY_WAIT_MS, COMMIT_MIN_RETRY_WAIT_MS_DEFAULT), + PropertyUtil.propertyAsInt( + props, COMMIT_MAX_RETRY_WAIT_MS, COMMIT_MAX_RETRY_WAIT_MS_DEFAULT), + PropertyUtil.propertyAsInt( + props, COMMIT_TOTAL_RETRY_TIME_MS, COMMIT_TOTAL_RETRY_TIME_MS_DEFAULT), 2.0 /* exponential */) .throwFailureWhenFinished() - .run(file -> { - table.io().deleteFile(file.path().toString()); - }); + .run( + file -> { + table.io().deleteFile(file.path().toString()); + }); } else { - LOG.warn("Skipping cleaning up of data files because Iceberg was unable to determine the final commit state"); + LOG.warn( + "Skipping cleaning up of data files because Iceberg was unable to determine the final commit state"); } } private Iterable files(WriterCommitMessage[] messages) { if (messages.length > 0) { - return Iterables.concat(Iterables.transform(Arrays.asList(messages), message -> message != null ? - ImmutableList.copyOf(((TaskCommit) message).files()) : - ImmutableList.of())); + return Iterables.concat( + Iterables.transform( + Arrays.asList(messages), + message -> + message != null + ? 
ImmutableList.copyOf(((TaskCommit) message).files()) + : ImmutableList.of())); } return ImmutableList.of(); } @@ -274,7 +293,9 @@ public void commit(WriterCommitMessage[] messages) { dynamicOverwrite.addFile(file); } - commitOperation(dynamicOverwrite, String.format("dynamic partition overwrite with %d new data files", numFiles)); + commitOperation( + dynamicOverwrite, + String.format("dynamic partition overwrite with %d new data files", numFiles)); } } @@ -296,7 +317,8 @@ public void commit(WriterCommitMessage[] messages) { overwriteFiles.addFile(file); } - String commitMsg = String.format("overwrite by filter %s with %d new data files", overwriteExpr, numFiles); + String commitMsg = + String.format("overwrite by filter %s with %d new data files", overwriteExpr, numFiles); commitOperation(overwriteFiles, commitMsg); } } @@ -352,9 +374,8 @@ public void commit(WriterCommitMessage[] messages) { } } - private void commitWithSerializableIsolation(OverwriteFiles overwriteFiles, - int numOverwrittenFiles, - int numAddedFiles) { + private void commitWithSerializableIsolation( + OverwriteFiles overwriteFiles, int numOverwrittenFiles, int numAddedFiles) { Long scanSnapshotId = scan.snapshotId(); if (scanSnapshotId != null) { overwriteFiles.validateFromSnapshot(scanSnapshotId); @@ -365,15 +386,15 @@ private void commitWithSerializableIsolation(OverwriteFiles overwriteFiles, overwriteFiles.validateNoConflictingData(); overwriteFiles.validateNoConflictingDeletes(); - String commitMsg = String.format( - "overwrite of %d data files with %d new data files, scanSnapshotId: %d, conflictDetectionFilter: %s", - numOverwrittenFiles, numAddedFiles, scanSnapshotId, conflictDetectionFilter); + String commitMsg = + String.format( + "overwrite of %d data files with %d new data files, scanSnapshotId: %d, conflictDetectionFilter: %s", + numOverwrittenFiles, numAddedFiles, scanSnapshotId, conflictDetectionFilter); commitOperation(overwriteFiles, commitMsg); } - private void commitWithSnapshotIsolation(OverwriteFiles overwriteFiles, - int numOverwrittenFiles, - int numAddedFiles) { + private void commitWithSnapshotIsolation( + OverwriteFiles overwriteFiles, int numOverwrittenFiles, int numAddedFiles) { Long scanSnapshotId = scan.snapshotId(); if (scanSnapshotId != null) { overwriteFiles.validateFromSnapshot(scanSnapshotId); @@ -383,9 +404,10 @@ private void commitWithSnapshotIsolation(OverwriteFiles overwriteFiles, overwriteFiles.conflictDetectionFilter(conflictDetectionFilter); overwriteFiles.validateNoConflictingDeletes(); - String commitMsg = String.format( - "overwrite of %d data files with %d new data files", - numOverwrittenFiles, numAddedFiles); + String commitMsg = + String.format( + "overwrite of %d data files with %d new data files", + numOverwrittenFiles, numAddedFiles); commitOperation(overwriteFiles, commitMsg); } } @@ -504,7 +526,10 @@ public void doCommit(long epochId, WriterCommitMessage[] messages) { overwriteFiles.addFile(file); numFiles++; } - commit(overwriteFiles, epochId, String.format("streaming complete overwrite with %d new data files", numFiles)); + commit( + overwriteFiles, + epochId, + String.format("streaming complete overwrite with %d new data files", numFiles)); } } @@ -546,8 +571,13 @@ private static class WriterFactory implements DataWriterFactory, StreamingDataWr private final StructType dsSchema; private final boolean partitionedFanoutEnabled; - protected WriterFactory(Broadcast
    tableBroadcast, FileFormat format, long targetFileSize, - Schema writeSchema, StructType dsSchema, boolean partitionedFanoutEnabled) { + protected WriterFactory( + Broadcast
    tableBroadcast, + FileFormat format, + long targetFileSize, + Schema writeSchema, + StructType dsSchema, + boolean partitionedFanoutEnabled) { this.tableBroadcast = tableBroadcast; this.format = format; this.targetFileSize = targetFileSize; @@ -567,21 +597,28 @@ public DataWriter createWriter(int partitionId, long taskId, long e PartitionSpec spec = table.spec(); FileIO io = table.io(); - OutputFileFactory fileFactory = OutputFileFactory.builderFor(table, partitionId, taskId) - .format(format) - .build(); - SparkFileWriterFactory writerFactory = SparkFileWriterFactory.builderFor(table) - .dataFileFormat(format) - .dataSchema(writeSchema) - .dataSparkType(dsSchema) - .build(); + OutputFileFactory fileFactory = + OutputFileFactory.builderFor(table, partitionId, taskId).format(format).build(); + SparkFileWriterFactory writerFactory = + SparkFileWriterFactory.builderFor(table) + .dataFileFormat(format) + .dataSchema(writeSchema) + .dataSparkType(dsSchema) + .build(); if (spec.isUnpartitioned()) { return new UnpartitionedDataWriter(writerFactory, fileFactory, io, spec, targetFileSize); } else { return new PartitionedDataWriter( - writerFactory, fileFactory, io, spec, writeSchema, dsSchema, targetFileSize, partitionedFanoutEnabled); + writerFactory, + fileFactory, + io, + spec, + writeSchema, + dsSchema, + targetFileSize, + partitionedFanoutEnabled); } } } @@ -597,9 +634,14 @@ private static class UnpartitionedDataWriter implements DataWriter private final FileWriter delegate; private final FileIO io; - private UnpartitionedDataWriter(SparkFileWriterFactory writerFactory, OutputFileFactory fileFactory, - FileIO io, PartitionSpec spec, long targetFileSize) { - this.delegate = new RollingDataWriter<>(writerFactory, fileFactory, io, targetFileSize, spec, null); + private UnpartitionedDataWriter( + SparkFileWriterFactory writerFactory, + OutputFileFactory fileFactory, + FileIO io, + PartitionSpec spec, + long targetFileSize) { + this.delegate = + new RollingDataWriter<>(writerFactory, fileFactory, io, targetFileSize, spec, null); this.io = io; } @@ -613,7 +655,7 @@ public WriterCommitMessage commit() throws IOException { close(); DataWriteResult result = delegate.result(); - TaskCommit taskCommit = new TaskCommit(result.dataFiles().toArray(new DataFile[0])); + TaskCommit taskCommit = new TaskCommit(result.dataFiles().toArray(new DataFile[0])); taskCommit.reportOutputMetrics(); return taskCommit; } @@ -639,9 +681,15 @@ private static class PartitionedDataWriter implements DataWriter { private final PartitionKey partitionKey; private final InternalRowWrapper internalRowWrapper; - private PartitionedDataWriter(SparkFileWriterFactory writerFactory, OutputFileFactory fileFactory, - FileIO io, PartitionSpec spec, Schema dataSchema, - StructType dataSparkType, long targetFileSize, boolean fanoutEnabled) { + private PartitionedDataWriter( + SparkFileWriterFactory writerFactory, + OutputFileFactory fileFactory, + FileIO io, + PartitionSpec spec, + Schema dataSchema, + StructType dataSparkType, + long targetFileSize, + boolean fanoutEnabled) { if (fanoutEnabled) { this.delegate = new FanoutDataWriter<>(writerFactory, fileFactory, io, targetFileSize); } else { @@ -664,7 +712,7 @@ public WriterCommitMessage commit() throws IOException { close(); DataWriteResult result = delegate.result(); - TaskCommit taskCommit = new TaskCommit(result.dataFiles().toArray(new DataFile[0])); + TaskCommit taskCommit = new TaskCommit(result.dataFiles().toArray(new DataFile[0])); taskCommit.reportOutputMetrics(); return 
taskCommit; } diff --git a/spark/v3.1/spark/src/main/java/org/apache/iceberg/spark/source/SparkWriteBuilder.java b/spark/v3.1/spark/src/main/java/org/apache/iceberg/spark/source/SparkWriteBuilder.java index 60360847c8c4..fbef4051717e 100644 --- a/spark/v3.1/spark/src/main/java/org/apache/iceberg/spark/source/SparkWriteBuilder.java +++ b/spark/v3.1/spark/src/main/java/org/apache/iceberg/spark/source/SparkWriteBuilder.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.source; import org.apache.iceberg.IsolationLevel; @@ -70,7 +69,8 @@ class SparkWriteBuilder implements WriteBuilder, SupportsDynamicOverwrite, Suppo public WriteBuilder overwriteFiles(Scan scan, IsolationLevel writeIsolationLevel) { Preconditions.checkArgument(scan instanceof SparkMergeScan, "%s is not SparkMergeScan", scan); Preconditions.checkState(!overwriteByFilter, "Cannot overwrite individual files and by filter"); - Preconditions.checkState(!overwriteDynamic, "Cannot overwrite individual files and dynamically"); + Preconditions.checkState( + !overwriteDynamic, "Cannot overwrite individual files and dynamically"); this.overwriteFiles = true; this.mergeScan = (SparkMergeScan) scan; this.isolationLevel = writeIsolationLevel; @@ -79,7 +79,8 @@ public WriteBuilder overwriteFiles(Scan scan, IsolationLevel writeIsolationLevel @Override public WriteBuilder overwriteDynamicPartitions() { - Preconditions.checkState(!overwriteByFilter, "Cannot overwrite dynamically and by filter: %s", overwriteExpr); + Preconditions.checkState( + !overwriteByFilter, "Cannot overwrite dynamically and by filter: %s", overwriteExpr); Preconditions.checkState(!overwriteFiles, "Cannot overwrite individual files and dynamically"); this.overwriteDynamic = true; return this; @@ -87,13 +88,15 @@ public WriteBuilder overwriteDynamicPartitions() { @Override public WriteBuilder overwrite(Filter[] filters) { - Preconditions.checkState(!overwriteFiles, "Cannot overwrite individual files and using filters"); + Preconditions.checkState( + !overwriteFiles, "Cannot overwrite individual files and using filters"); this.overwriteExpr = SparkFilters.convert(filters); if (overwriteExpr == Expressions.alwaysTrue() && "dynamic".equals(overwriteMode)) { // use the write option to override truncating the table. use dynamic overwrite instead. 
this.overwriteDynamic = true; } else { - Preconditions.checkState(!overwriteDynamic, "Cannot overwrite dynamically and by filter: %s", overwriteExpr); + Preconditions.checkState( + !overwriteDynamic, "Cannot overwrite dynamically and by filter: %s", overwriteExpr); this.overwriteByFilter = true; } return this; @@ -102,17 +105,20 @@ public WriteBuilder overwrite(Filter[] filters) { @Override public BatchWrite buildForBatch() { // Validate - Preconditions.checkArgument(handleTimestampWithoutZone || !SparkUtil.hasTimestampWithoutZone(table.schema()), + Preconditions.checkArgument( + handleTimestampWithoutZone || !SparkUtil.hasTimestampWithoutZone(table.schema()), SparkUtil.TIMESTAMP_WITHOUT_TIMEZONE_ERROR); Schema writeSchema = SparkSchemaUtil.convert(table.schema(), dsSchema); - TypeUtil.validateWriteSchema(table.schema(), writeSchema, writeConf.checkNullability(), writeConf.checkOrdering()); + TypeUtil.validateWriteSchema( + table.schema(), writeSchema, writeConf.checkNullability(), writeConf.checkOrdering()); SparkUtil.validatePartitionTransforms(table.spec()); // Get application id String appId = spark.sparkContext().applicationId(); - SparkWrite write = new SparkWrite(spark, table, writeConf, writeInfo, appId, writeSchema, dsSchema); + SparkWrite write = + new SparkWrite(spark, table, writeConf, writeInfo, appId, writeSchema, dsSchema); if (overwriteByFilter) { return write.asOverwriteByFilter(overwriteExpr); } else if (overwriteDynamic) { @@ -127,23 +133,28 @@ public BatchWrite buildForBatch() { @Override public StreamingWrite buildForStreaming() { // Validate - Preconditions.checkArgument(handleTimestampWithoutZone || !SparkUtil.hasTimestampWithoutZone(table.schema()), + Preconditions.checkArgument( + handleTimestampWithoutZone || !SparkUtil.hasTimestampWithoutZone(table.schema()), SparkUtil.TIMESTAMP_WITHOUT_TIMEZONE_ERROR); Schema writeSchema = SparkSchemaUtil.convert(table.schema(), dsSchema); - TypeUtil.validateWriteSchema(table.schema(), writeSchema, writeConf.checkNullability(), writeConf.checkOrdering()); + TypeUtil.validateWriteSchema( + table.schema(), writeSchema, writeConf.checkNullability(), writeConf.checkOrdering()); SparkUtil.validatePartitionTransforms(table.spec()); // Change to streaming write if it is just append - Preconditions.checkState(!overwriteDynamic, - "Unsupported streaming operation: dynamic partition overwrite"); - Preconditions.checkState(!overwriteByFilter || overwriteExpr == Expressions.alwaysTrue(), - "Unsupported streaming operation: overwrite by filter: %s", overwriteExpr); + Preconditions.checkState( + !overwriteDynamic, "Unsupported streaming operation: dynamic partition overwrite"); + Preconditions.checkState( + !overwriteByFilter || overwriteExpr == Expressions.alwaysTrue(), + "Unsupported streaming operation: overwrite by filter: %s", + overwriteExpr); // Get application id String appId = spark.sparkContext().applicationId(); - SparkWrite write = new SparkWrite(spark, table, writeConf, writeInfo, appId, writeSchema, dsSchema); + SparkWrite write = + new SparkWrite(spark, table, writeConf, writeInfo, appId, writeSchema, dsSchema); if (overwriteByFilter) { return write.asStreamingOverwrite(); } else { diff --git a/spark/v3.1/spark/src/main/java/org/apache/iceberg/spark/source/StagedSparkTable.java b/spark/v3.1/spark/src/main/java/org/apache/iceberg/spark/source/StagedSparkTable.java index 2e018cb09496..b92c02d2b536 100644 --- a/spark/v3.1/spark/src/main/java/org/apache/iceberg/spark/source/StagedSparkTable.java +++ 
b/spark/v3.1/spark/src/main/java/org/apache/iceberg/spark/source/StagedSparkTable.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.source; import org.apache.iceberg.Transaction; diff --git a/spark/v3.1/spark/src/main/java/org/apache/iceberg/spark/source/Stats.java b/spark/v3.1/spark/src/main/java/org/apache/iceberg/spark/source/Stats.java index 939b07a0af61..ddf6ca834d9b 100644 --- a/spark/v3.1/spark/src/main/java/org/apache/iceberg/spark/source/Stats.java +++ b/spark/v3.1/spark/src/main/java/org/apache/iceberg/spark/source/Stats.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.source; import java.util.OptionalLong; diff --git a/spark/v3.1/spark/src/main/java/org/apache/iceberg/spark/source/StreamingOffset.java b/spark/v3.1/spark/src/main/java/org/apache/iceberg/spark/source/StreamingOffset.java index 64277ecf3be5..f2088deb1ee3 100644 --- a/spark/v3.1/spark/src/main/java/org/apache/iceberg/spark/source/StreamingOffset.java +++ b/spark/v3.1/spark/src/main/java/org/apache/iceberg/spark/source/StreamingOffset.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.source; import com.fasterxml.jackson.core.JsonGenerator; @@ -47,10 +46,10 @@ class StreamingOffset extends Offset { * An implementation of Spark Structured Streaming Offset, to track the current processed files of * Iceberg table. * - * @param snapshotId The current processed snapshot id. - * @param position The position of last scanned file in snapshot. - * @param scanAllFiles whether to scan all files in a snapshot; for example, to read - * all data when starting a stream. + * @param snapshotId The current processed snapshot id. + * @param position The position of last scanned file in snapshot. + * @param scanAllFiles whether to scan all files in a snapshot; for example, to read all data when + * starting a stream. */ StreamingOffset(long snapshotId, long position, boolean scanAllFiles) { this.snapshotId = snapshotId; @@ -65,7 +64,8 @@ static StreamingOffset fromJson(String json) { JsonNode node = JsonUtil.mapper().readValue(json, JsonNode.class); return fromJsonNode(node); } catch (IOException e) { - throw new UncheckedIOException(String.format("Failed to parse StreamingOffset from JSON string %s", json), e); + throw new UncheckedIOException( + String.format("Failed to parse StreamingOffset from JSON string %s", json), e); } } @@ -118,9 +118,9 @@ boolean shouldScanAllFiles() { public boolean equals(Object obj) { if (obj instanceof StreamingOffset) { StreamingOffset offset = (StreamingOffset) obj; - return offset.snapshotId == snapshotId && - offset.position == position && - offset.scanAllFiles == scanAllFiles; + return offset.snapshotId == snapshotId + && offset.position == position + && offset.scanAllFiles == scanAllFiles; } else { return false; } @@ -133,17 +133,20 @@ public int hashCode() { @Override public String toString() { - return String.format("Streaming Offset[%d: position (%d) scan_all_files (%b)]", - snapshotId, position, scanAllFiles); + return String.format( + "Streaming Offset[%d: position (%d) scan_all_files (%b)]", + snapshotId, position, scanAllFiles); } private static StreamingOffset fromJsonNode(JsonNode node) { // The version of StreamingOffset. The offset was created with a version number // used to validate when deserializing from json string. 
int version = JsonUtil.getInt(VERSION, node); - Preconditions.checkArgument(version == CURR_VERSION, + Preconditions.checkArgument( + version == CURR_VERSION, "This version of Iceberg source only supports version %s. Version %s is not supported.", - CURR_VERSION, version); + CURR_VERSION, + version); long snapshotId = JsonUtil.getLong(SNAPSHOT_ID, node); int position = JsonUtil.getInt(POSITION, node); diff --git a/spark/v3.1/spark/src/main/java/org/apache/iceberg/spark/source/StructInternalRow.java b/spark/v3.1/spark/src/main/java/org/apache/iceberg/spark/source/StructInternalRow.java index a2288ef3edd7..3c7ebabeab3d 100644 --- a/spark/v3.1/spark/src/main/java/org/apache/iceberg/spark/source/StructInternalRow.java +++ b/spark/v3.1/spark/src/main/java/org/apache/iceberg/spark/source/StructInternalRow.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.source; import java.math.BigDecimal; @@ -128,7 +127,8 @@ public int getInt(int ordinal) { } else if (integer instanceof LocalDate) { return (int) ((LocalDate) integer).toEpochDay(); } else { - throw new IllegalStateException("Unknown type for int field. Type name: " + integer.getClass().getName()); + throw new IllegalStateException( + "Unknown type for int field. Type name: " + integer.getClass().getName()); } } @@ -143,7 +143,8 @@ public long getLong(int ordinal) { } else if (longVal instanceof LocalDate) { return ((LocalDate) longVal).toEpochDay(); } else { - throw new IllegalStateException("Unknown type for long field. Type name: " + longVal.getClass().getName()); + throw new IllegalStateException( + "Unknown type for long field. Type name: " + longVal.getClass().getName()); } } @@ -190,7 +191,8 @@ private byte[] getBinaryInternal(int ordinal) { } else if (bytes instanceof byte[]) { return (byte[]) bytes; } else { - throw new IllegalStateException("Unknown type for binary field. Type name: " + bytes.getClass().getName()); + throw new IllegalStateException( + "Unknown type for binary field. 
Type name: " + bytes.getClass().getName()); } } @@ -206,8 +208,7 @@ public InternalRow getStruct(int ordinal, int numFields) { private InternalRow getStructInternal(int ordinal, int numFields) { return new StructInternalRow( - type.fields().get(ordinal).type().asStructType(), - struct.get(ordinal, StructLike.class)); + type.fields().get(ordinal).type().asStructType(), struct.get(ordinal, StructLike.class)); } @Override @@ -227,7 +228,8 @@ public MapData getMap(int ordinal) { } private MapData getMapInternal(int ordinal) { - return mapToMapData(type.fields().get(ordinal).type().asMapType(), struct.get(ordinal, Map.class)); + return mapToMapData( + type.fields().get(ordinal).type().asMapType(), struct.get(ordinal, Map.class)); } @Override @@ -292,31 +294,52 @@ private ArrayData collectionToArrayData(Type elementType, Collection values) case DOUBLE: return fillArray(values, array -> (pos, value) -> array[pos] = value); case STRING: - return fillArray(values, array -> - (BiConsumer) (pos, seq) -> array[pos] = UTF8String.fromString(seq.toString())); + return fillArray( + values, + array -> + (BiConsumer) + (pos, seq) -> array[pos] = UTF8String.fromString(seq.toString())); case FIXED: case BINARY: - return fillArray(values, array -> - (BiConsumer) (pos, buf) -> array[pos] = ByteBuffers.toByteArray(buf)); + return fillArray( + values, + array -> + (BiConsumer) + (pos, buf) -> array[pos] = ByteBuffers.toByteArray(buf)); case DECIMAL: - return fillArray(values, array -> - (BiConsumer) (pos, dec) -> array[pos] = Decimal.apply(dec)); + return fillArray( + values, + array -> + (BiConsumer) (pos, dec) -> array[pos] = Decimal.apply(dec)); case STRUCT: - return fillArray(values, array -> (BiConsumer) (pos, tuple) -> - array[pos] = new StructInternalRow(elementType.asStructType(), tuple)); + return fillArray( + values, + array -> + (BiConsumer) + (pos, tuple) -> + array[pos] = new StructInternalRow(elementType.asStructType(), tuple)); case LIST: - return fillArray(values, array -> (BiConsumer>) (pos, list) -> - array[pos] = collectionToArrayData(elementType.asListType().elementType(), list)); + return fillArray( + values, + array -> + (BiConsumer>) + (pos, list) -> + array[pos] = + collectionToArrayData(elementType.asListType().elementType(), list)); case MAP: - return fillArray(values, array -> (BiConsumer>) (pos, map) -> - array[pos] = mapToMapData(elementType.asMapType(), map)); + return fillArray( + values, + array -> + (BiConsumer>) + (pos, map) -> array[pos] = mapToMapData(elementType.asMapType(), map)); default: throw new UnsupportedOperationException("Unsupported array element type: " + elementType); } } @SuppressWarnings("unchecked") - private GenericArrayData fillArray(Collection values, Function> makeSetter) { + private GenericArrayData fillArray( + Collection values, Function> makeSetter) { Object[] array = new Object[values.size()]; BiConsumer setter = makeSetter.apply(array); diff --git a/spark/v3.1/spark/src/main/java/org/apache/spark/sql/catalyst/analysis/NoSuchProcedureException.java b/spark/v3.1/spark/src/main/java/org/apache/spark/sql/catalyst/analysis/NoSuchProcedureException.java index 1c83396f430b..f5ba22a062b0 100644 --- a/spark/v3.1/spark/src/main/java/org/apache/spark/sql/catalyst/analysis/NoSuchProcedureException.java +++ b/spark/v3.1/spark/src/main/java/org/apache/spark/sql/catalyst/analysis/NoSuchProcedureException.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.spark.sql.catalyst.analysis; import org.apache.spark.sql.AnalysisException; @@ -25,6 +24,11 @@ public class NoSuchProcedureException extends AnalysisException { public NoSuchProcedureException(Identifier ident) { - super("Procedure " + ident + " not found", Option.empty(), Option.empty(), Option.empty(), Option.empty()); + super( + "Procedure " + ident + " not found", + Option.empty(), + Option.empty(), + Option.empty(), + Option.empty()); } } diff --git a/spark/v3.1/spark/src/main/java/org/apache/spark/sql/connector/iceberg/catalog/ExtendedSupportsDelete.java b/spark/v3.1/spark/src/main/java/org/apache/spark/sql/connector/iceberg/catalog/ExtendedSupportsDelete.java index e59bfe209f6b..a5f40f04c280 100644 --- a/spark/v3.1/spark/src/main/java/org/apache/spark/sql/connector/iceberg/catalog/ExtendedSupportsDelete.java +++ b/spark/v3.1/spark/src/main/java/org/apache/spark/sql/connector/iceberg/catalog/ExtendedSupportsDelete.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.spark.sql.connector.iceberg.catalog; import org.apache.spark.sql.connector.catalog.SupportsDelete; @@ -25,14 +24,15 @@ // this should be part of SupportsDelete when merged upstream public interface ExtendedSupportsDelete extends SupportsDelete { /** - * Checks if it is possible to delete data from a data source table that matches filter expressions. - *
- * Rows should be deleted from the data source iff all of the filter expressions match. That is, the - * expressions must be interpreted as a set of filters that are ANDed together. - *
- * Spark will call this method to check if the delete is possible without significant effort. - * Otherwise, Spark will try to rewrite the delete operation if the data source table - * supports row-level operations. + * Checks if it is possible to delete data from a data source table that matches filter + * expressions. + * + *
Rows should be deleted from the data source iff all of the filter expressions match. That + * is, the expressions must be interpreted as a set of filters that are ANDed together. + * + *
    Spark will call this method to check if the delete is possible without significant effort. + * Otherwise, Spark will try to rewrite the delete operation if the data source table supports + * row-level operations. * * @param filters filter expressions, used to select rows to delete when all expressions match * @return true if the delete operation can be performed diff --git a/spark/v3.1/spark/src/main/java/org/apache/spark/sql/connector/iceberg/catalog/Procedure.java b/spark/v3.1/spark/src/main/java/org/apache/spark/sql/connector/iceberg/catalog/Procedure.java index 8f7a70b9f9fc..11f215ba040a 100644 --- a/spark/v3.1/spark/src/main/java/org/apache/spark/sql/connector/iceberg/catalog/Procedure.java +++ b/spark/v3.1/spark/src/main/java/org/apache/spark/sql/connector/iceberg/catalog/Procedure.java @@ -16,44 +16,34 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.spark.sql.connector.iceberg.catalog; import org.apache.spark.sql.catalyst.InternalRow; import org.apache.spark.sql.types.StructType; -/** - * An interface representing a stored procedure available for execution. - */ +/** An interface representing a stored procedure available for execution. */ public interface Procedure { - /** - * Returns the input parameters of this procedure. - */ + /** Returns the input parameters of this procedure. */ ProcedureParameter[] parameters(); - /** - * Returns the type of rows produced by this procedure. - */ + /** Returns the type of rows produced by this procedure. */ StructType outputType(); /** * Executes this procedure. - *
- * Spark will align the provided arguments according to the input parameters - * defined in {@link #parameters()} either by position or by name before execution. - *
- * Implementations may provide a summary of execution by returning one or many rows - * as a result. The schema of output rows must match the defined output type - * in {@link #outputType()}. + * + *
Spark will align the provided arguments according to the input parameters defined in {@link + * #parameters()} either by position or by name before execution. + * + *
    Implementations may provide a summary of execution by returning one or many rows as a + * result. The schema of output rows must match the defined output type in {@link #outputType()}. * * @param args input arguments * @return the result of executing this procedure with the given arguments */ InternalRow[] call(InternalRow args); - /** - * Returns the description of this procedure. - */ + /** Returns the description of this procedure. */ default String description() { return this.getClass().toString(); } diff --git a/spark/v3.1/spark/src/main/java/org/apache/spark/sql/connector/iceberg/catalog/ProcedureCatalog.java b/spark/v3.1/spark/src/main/java/org/apache/spark/sql/connector/iceberg/catalog/ProcedureCatalog.java index 314bd659460e..2cee97ee5938 100644 --- a/spark/v3.1/spark/src/main/java/org/apache/spark/sql/connector/iceberg/catalog/ProcedureCatalog.java +++ b/spark/v3.1/spark/src/main/java/org/apache/spark/sql/connector/iceberg/catalog/ProcedureCatalog.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.spark.sql.connector.iceberg.catalog; import org.apache.spark.sql.catalyst.analysis.NoSuchProcedureException; @@ -25,9 +24,9 @@ /** * A catalog API for working with stored procedures. - *
- * Implementations should implement this interface if they expose stored procedures that - * can be called via CALL statements. + * + *
    Implementations should implement this interface if they expose stored procedures that can be + * called via CALL statements. */ public interface ProcedureCatalog extends CatalogPlugin { /** diff --git a/spark/v3.1/spark/src/main/java/org/apache/spark/sql/connector/iceberg/catalog/ProcedureParameter.java b/spark/v3.1/spark/src/main/java/org/apache/spark/sql/connector/iceberg/catalog/ProcedureParameter.java index b341dc1e3282..e1e84b2597f3 100644 --- a/spark/v3.1/spark/src/main/java/org/apache/spark/sql/connector/iceberg/catalog/ProcedureParameter.java +++ b/spark/v3.1/spark/src/main/java/org/apache/spark/sql/connector/iceberg/catalog/ProcedureParameter.java @@ -16,14 +16,11 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.spark.sql.connector.iceberg.catalog; import org.apache.spark.sql.types.DataType; -/** - * An input parameter of a {@link Procedure stored procedure}. - */ +/** An input parameter of a {@link Procedure stored procedure}. */ public interface ProcedureParameter { /** @@ -48,18 +45,12 @@ static ProcedureParameter optional(String name, DataType dataType) { return new ProcedureParameterImpl(name, dataType, false); } - /** - * Returns the name of this parameter. - */ + /** Returns the name of this parameter. */ String name(); - /** - * Returns the type of this parameter. - */ + /** Returns the type of this parameter. */ DataType dataType(); - /** - * Returns true if this parameter is required. - */ + /** Returns true if this parameter is required. */ boolean required(); } diff --git a/spark/v3.1/spark/src/main/java/org/apache/spark/sql/connector/iceberg/catalog/ProcedureParameterImpl.java b/spark/v3.1/spark/src/main/java/org/apache/spark/sql/connector/iceberg/catalog/ProcedureParameterImpl.java index cea1e80f4051..c59951e24330 100644 --- a/spark/v3.1/spark/src/main/java/org/apache/spark/sql/connector/iceberg/catalog/ProcedureParameterImpl.java +++ b/spark/v3.1/spark/src/main/java/org/apache/spark/sql/connector/iceberg/catalog/ProcedureParameterImpl.java @@ -16,15 +16,12 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.spark.sql.connector.iceberg.catalog; import java.util.Objects; import org.apache.spark.sql.types.DataType; -/** - * A {@link ProcedureParameter} implementation. - */ +/** A {@link ProcedureParameter} implementation. 
*/ class ProcedureParameterImpl implements ProcedureParameter { private final String name; private final DataType dataType; @@ -60,9 +57,9 @@ public boolean equals(Object other) { } ProcedureParameterImpl that = (ProcedureParameterImpl) other; - return required == that.required && - Objects.equals(name, that.name) && - Objects.equals(dataType, that.dataType); + return required == that.required + && Objects.equals(name, that.name) + && Objects.equals(dataType, that.dataType); } @Override @@ -72,6 +69,7 @@ public int hashCode() { @Override public String toString() { - return String.format("ProcedureParameter(name='%s', type=%s, required=%b)", name, dataType, required); + return String.format( + "ProcedureParameter(name='%s', type=%s, required=%b)", name, dataType, required); } } diff --git a/spark/v3.1/spark/src/main/java/org/apache/spark/sql/connector/iceberg/catalog/SupportsMerge.java b/spark/v3.1/spark/src/main/java/org/apache/spark/sql/connector/iceberg/catalog/SupportsMerge.java index d36fe926ce3d..49d9047f26ad 100644 --- a/spark/v3.1/spark/src/main/java/org/apache/spark/sql/connector/iceberg/catalog/SupportsMerge.java +++ b/spark/v3.1/spark/src/main/java/org/apache/spark/sql/connector/iceberg/catalog/SupportsMerge.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.spark.sql.connector.iceberg.catalog; import org.apache.spark.sql.connector.catalog.Table; @@ -25,14 +24,15 @@ /** * A mix-in interface for Table to indicate that it supports row-level operations. - *
- * This adds {@link #newMergeBuilder(String, LogicalWriteInfo)} that is used to create a scan and + * + *
    This adds {@link #newMergeBuilder(String, LogicalWriteInfo)} that is used to create a scan and * a write for a row-level operation. */ public interface SupportsMerge extends Table { /** - * Returns a {@link MergeBuilder} which can be used to create both a scan and a write for a row-level - * operation. Spark will call this method to configure each data source row-level operation. + * Returns a {@link MergeBuilder} which can be used to create both a scan and a write for a + * row-level operation. Spark will call this method to configure each data source row-level + * operation. * * @param info write info * @return a merge builder diff --git a/spark/v3.1/spark/src/main/java/org/apache/spark/sql/connector/iceberg/distributions/ClusteredDistribution.java b/spark/v3.1/spark/src/main/java/org/apache/spark/sql/connector/iceberg/distributions/ClusteredDistribution.java index 942ffac3d092..fc0b77dc6373 100644 --- a/spark/v3.1/spark/src/main/java/org/apache/spark/sql/connector/iceberg/distributions/ClusteredDistribution.java +++ b/spark/v3.1/spark/src/main/java/org/apache/spark/sql/connector/iceberg/distributions/ClusteredDistribution.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.spark.sql.connector.iceberg.distributions; import org.apache.spark.annotation.Experimental; @@ -30,8 +29,6 @@ */ @Experimental public interface ClusteredDistribution extends Distribution { - /** - * Returns clustering expressions. - */ + /** Returns clustering expressions. */ Expression[] clustering(); } diff --git a/spark/v3.1/spark/src/main/java/org/apache/spark/sql/connector/iceberg/distributions/Distribution.java b/spark/v3.1/spark/src/main/java/org/apache/spark/sql/connector/iceberg/distributions/Distribution.java index 0f3fdbf47c0a..dec9674aaa18 100644 --- a/spark/v3.1/spark/src/main/java/org/apache/spark/sql/connector/iceberg/distributions/Distribution.java +++ b/spark/v3.1/spark/src/main/java/org/apache/spark/sql/connector/iceberg/distributions/Distribution.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.spark.sql.connector.iceberg.distributions; import org.apache.spark.annotation.Experimental; @@ -27,6 +26,4 @@ * @since 3.2.0 */ @Experimental -public interface Distribution { -} - +public interface Distribution {} diff --git a/spark/v3.1/spark/src/main/java/org/apache/spark/sql/connector/iceberg/distributions/Distributions.java b/spark/v3.1/spark/src/main/java/org/apache/spark/sql/connector/iceberg/distributions/Distributions.java index 6e25fe09dc67..00860e11edc8 100644 --- a/spark/v3.1/spark/src/main/java/org/apache/spark/sql/connector/iceberg/distributions/Distributions.java +++ b/spark/v3.1/spark/src/main/java/org/apache/spark/sql/connector/iceberg/distributions/Distributions.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.spark.sql.connector.iceberg.distributions; import org.apache.spark.annotation.Experimental; @@ -33,12 +32,9 @@ */ @Experimental public class Distributions { - private Distributions() { - } + private Distributions() {} - /** - * Creates a distribution where no promises are made about co-location of data. - */ + /** Creates a distribution where no promises are made about co-location of data. 
*/ public static UnspecifiedDistribution unspecified() { return new UnspecifiedDistributionImpl(); } @@ -52,8 +48,8 @@ public static ClusteredDistribution clustered(Expression[] clustering) { } /** - * Creates a distribution where tuples have been ordered across partitions according - * to ordering expressions, but not necessarily within a given partition. + * Creates a distribution where tuples have been ordered across partitions according to ordering + * expressions, but not necessarily within a given partition. */ public static OrderedDistribution ordered(SortOrder[] ordering) { return new OrderedDistributionImpl(ordering); diff --git a/spark/v3.1/spark/src/main/java/org/apache/spark/sql/connector/iceberg/distributions/OrderedDistribution.java b/spark/v3.1/spark/src/main/java/org/apache/spark/sql/connector/iceberg/distributions/OrderedDistribution.java index 09a25f37b627..ba646ee46a45 100644 --- a/spark/v3.1/spark/src/main/java/org/apache/spark/sql/connector/iceberg/distributions/OrderedDistribution.java +++ b/spark/v3.1/spark/src/main/java/org/apache/spark/sql/connector/iceberg/distributions/OrderedDistribution.java @@ -16,22 +16,19 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.spark.sql.connector.iceberg.distributions; import org.apache.spark.annotation.Experimental; import org.apache.spark.sql.connector.iceberg.expressions.SortOrder; /** - * A distribution where tuples have been ordered across partitions according - * to ordering expressions, but not necessarily within a given partition. + * A distribution where tuples have been ordered across partitions according to ordering + * expressions, but not necessarily within a given partition. * * @since 3.2.0 */ @Experimental public interface OrderedDistribution extends Distribution { - /** - * Returns ordering expressions. - */ + /** Returns ordering expressions. */ SortOrder[] ordering(); } diff --git a/spark/v3.1/spark/src/main/java/org/apache/spark/sql/connector/iceberg/distributions/UnspecifiedDistribution.java b/spark/v3.1/spark/src/main/java/org/apache/spark/sql/connector/iceberg/distributions/UnspecifiedDistribution.java index 1bdcfc1ebbaf..0e88218c6c12 100644 --- a/spark/v3.1/spark/src/main/java/org/apache/spark/sql/connector/iceberg/distributions/UnspecifiedDistribution.java +++ b/spark/v3.1/spark/src/main/java/org/apache/spark/sql/connector/iceberg/distributions/UnspecifiedDistribution.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.spark.sql.connector.iceberg.distributions; import org.apache.spark.annotation.Experimental; diff --git a/spark/v3.1/spark/src/main/java/org/apache/spark/sql/connector/iceberg/distributions/impl/ClusterDistributionImpl.java b/spark/v3.1/spark/src/main/java/org/apache/spark/sql/connector/iceberg/distributions/impl/ClusterDistributionImpl.java index d05274668cce..3aadab1defa9 100644 --- a/spark/v3.1/spark/src/main/java/org/apache/spark/sql/connector/iceberg/distributions/impl/ClusterDistributionImpl.java +++ b/spark/v3.1/spark/src/main/java/org/apache/spark/sql/connector/iceberg/distributions/impl/ClusterDistributionImpl.java @@ -16,8 +16,6 @@ * specific language governing permissions and limitations * under the License. 
*/ - - package org.apache.spark.sql.connector.iceberg.distributions.impl; import org.apache.spark.sql.connector.expressions.Expression; diff --git a/spark/v3.1/spark/src/main/java/org/apache/spark/sql/connector/iceberg/distributions/impl/OrderedDistributionImpl.java b/spark/v3.1/spark/src/main/java/org/apache/spark/sql/connector/iceberg/distributions/impl/OrderedDistributionImpl.java index 773ace0692ba..6ae7afa49003 100644 --- a/spark/v3.1/spark/src/main/java/org/apache/spark/sql/connector/iceberg/distributions/impl/OrderedDistributionImpl.java +++ b/spark/v3.1/spark/src/main/java/org/apache/spark/sql/connector/iceberg/distributions/impl/OrderedDistributionImpl.java @@ -16,8 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - - package org.apache.spark.sql.connector.iceberg.distributions.impl; import org.apache.spark.sql.connector.iceberg.distributions.OrderedDistribution; diff --git a/spark/v3.1/spark/src/main/java/org/apache/spark/sql/connector/iceberg/distributions/impl/UnspecifiedDistributionImpl.java b/spark/v3.1/spark/src/main/java/org/apache/spark/sql/connector/iceberg/distributions/impl/UnspecifiedDistributionImpl.java index d69c912ff367..3944b7289c59 100644 --- a/spark/v3.1/spark/src/main/java/org/apache/spark/sql/connector/iceberg/distributions/impl/UnspecifiedDistributionImpl.java +++ b/spark/v3.1/spark/src/main/java/org/apache/spark/sql/connector/iceberg/distributions/impl/UnspecifiedDistributionImpl.java @@ -16,12 +16,8 @@ * specific language governing permissions and limitations * under the License. */ - - package org.apache.spark.sql.connector.iceberg.distributions.impl; import org.apache.spark.sql.connector.iceberg.distributions.UnspecifiedDistribution; -public class UnspecifiedDistributionImpl implements UnspecifiedDistribution { - -} +public class UnspecifiedDistributionImpl implements UnspecifiedDistribution {} diff --git a/spark/v3.1/spark/src/main/java/org/apache/spark/sql/connector/iceberg/expressions/NullOrdering.java b/spark/v3.1/spark/src/main/java/org/apache/spark/sql/connector/iceberg/expressions/NullOrdering.java index b7c6e1c5f414..712ea0b2bb91 100644 --- a/spark/v3.1/spark/src/main/java/org/apache/spark/sql/connector/iceberg/expressions/NullOrdering.java +++ b/spark/v3.1/spark/src/main/java/org/apache/spark/sql/connector/iceberg/expressions/NullOrdering.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.spark.sql.connector.iceberg.expressions; import org.apache.spark.annotation.Experimental; @@ -28,7 +27,8 @@ */ @Experimental public enum NullOrdering { - NULLS_FIRST, NULLS_LAST; + NULLS_FIRST, + NULLS_LAST; @Override public String toString() { diff --git a/spark/v3.1/spark/src/main/java/org/apache/spark/sql/connector/iceberg/expressions/SortDirection.java b/spark/v3.1/spark/src/main/java/org/apache/spark/sql/connector/iceberg/expressions/SortDirection.java index 211702548038..cb3d8cac93b1 100644 --- a/spark/v3.1/spark/src/main/java/org/apache/spark/sql/connector/iceberg/expressions/SortDirection.java +++ b/spark/v3.1/spark/src/main/java/org/apache/spark/sql/connector/iceberg/expressions/SortDirection.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.spark.sql.connector.iceberg.expressions; import org.apache.spark.annotation.Experimental; @@ -28,7 +27,8 @@ */ @Experimental public enum SortDirection { - ASCENDING, DESCENDING; + ASCENDING, + DESCENDING; @Override public String toString() { diff --git a/spark/v3.1/spark/src/main/java/org/apache/spark/sql/connector/iceberg/expressions/SortOrder.java b/spark/v3.1/spark/src/main/java/org/apache/spark/sql/connector/iceberg/expressions/SortOrder.java index d3345d4becec..d31fed59daa3 100644 --- a/spark/v3.1/spark/src/main/java/org/apache/spark/sql/connector/iceberg/expressions/SortOrder.java +++ b/spark/v3.1/spark/src/main/java/org/apache/spark/sql/connector/iceberg/expressions/SortOrder.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.spark.sql.connector.iceberg.expressions; import org.apache.spark.annotation.Experimental; @@ -29,18 +28,12 @@ */ @Experimental public interface SortOrder extends Expression { - /** - * Returns the sort expression. - */ + /** Returns the sort expression. */ Expression expression(); - /** - * Returns the sort direction. - */ + /** Returns the sort direction. */ SortDirection direction(); - /** - * Returns the null ordering. - */ + /** Returns the null ordering. */ NullOrdering nullOrdering(); } diff --git a/spark/v3.1/spark/src/main/java/org/apache/spark/sql/connector/iceberg/read/SupportsFileFilter.java b/spark/v3.1/spark/src/main/java/org/apache/spark/sql/connector/iceberg/read/SupportsFileFilter.java index f0b28ae7a4d4..0eb46f401a92 100644 --- a/spark/v3.1/spark/src/main/java/org/apache/spark/sql/connector/iceberg/read/SupportsFileFilter.java +++ b/spark/v3.1/spark/src/main/java/org/apache/spark/sql/connector/iceberg/read/SupportsFileFilter.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.spark.sql.connector.iceberg.read; import java.util.Set; diff --git a/spark/v3.1/spark/src/main/java/org/apache/spark/sql/connector/iceberg/write/MergeBuilder.java b/spark/v3.1/spark/src/main/java/org/apache/spark/sql/connector/iceberg/write/MergeBuilder.java index 8b1224b3628b..edb17057d832 100644 --- a/spark/v3.1/spark/src/main/java/org/apache/spark/sql/connector/iceberg/write/MergeBuilder.java +++ b/spark/v3.1/spark/src/main/java/org/apache/spark/sql/connector/iceberg/write/MergeBuilder.java @@ -16,15 +16,12 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.spark.sql.connector.iceberg.write; import org.apache.spark.sql.connector.read.ScanBuilder; import org.apache.spark.sql.connector.write.WriteBuilder; -/** - * An interface for building a scan and a write for a row-level operation. - */ +/** An interface for building a scan and a write for a row-level operation. */ public interface MergeBuilder { /** * Creates a scan builder for a row-level operation. diff --git a/spark/v3.1/spark/src/test/java/org/apache/iceberg/KryoHelpers.java b/spark/v3.1/spark/src/test/java/org/apache/iceberg/KryoHelpers.java index ee0f0a73959a..6d88aaa11813 100644 --- a/spark/v3.1/spark/src/test/java/org/apache/iceberg/KryoHelpers.java +++ b/spark/v3.1/spark/src/test/java/org/apache/iceberg/KryoHelpers.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg; import com.esotericsoftware.kryo.Kryo; @@ -32,8 +31,7 @@ public class KryoHelpers { - private KryoHelpers() { - } + private KryoHelpers() {} @SuppressWarnings("unchecked") public static T roundTripSerialize(T obj) throws IOException { @@ -45,7 +43,8 @@ public static T roundTripSerialize(T obj) throws IOException { kryo.writeClassAndObject(out, obj); } - try (Input in = new Input(new ObjectInputStream(new ByteArrayInputStream(bytes.toByteArray())))) { + try (Input in = + new Input(new ObjectInputStream(new ByteArrayInputStream(bytes.toByteArray())))) { return (T) kryo.readClassAndObject(in); } } diff --git a/spark/v3.1/spark/src/test/java/org/apache/iceberg/TaskCheckHelper.java b/spark/v3.1/spark/src/test/java/org/apache/iceberg/TaskCheckHelper.java index 99396647ee3e..235cf69ef449 100644 --- a/spark/v3.1/spark/src/test/java/org/apache/iceberg/TaskCheckHelper.java +++ b/spark/v3.1/spark/src/test/java/org/apache/iceberg/TaskCheckHelper.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg; import java.util.Comparator; @@ -25,15 +24,14 @@ import org.junit.Assert; public final class TaskCheckHelper { - private TaskCheckHelper() { - } + private TaskCheckHelper() {} public static void assertEquals(BaseCombinedScanTask expected, BaseCombinedScanTask actual) { List expectedTasks = getFileScanTasksInFilePathOrder(expected); List actualTasks = getFileScanTasksInFilePathOrder(actual); - Assert.assertEquals("The number of file scan tasks should match", - expectedTasks.size(), actualTasks.size()); + Assert.assertEquals( + "The number of file scan tasks should match", expectedTasks.size(), actualTasks.size()); for (int i = 0; i < expectedTasks.size(); i++) { FileScanTask expectedTask = expectedTasks.get(i); @@ -50,38 +48,56 @@ public static void assertEquals(FileScanTask expected, FileScanTask actual) { Assert.assertEquals("starting position doesn't match", expected.start(), actual.start()); - Assert.assertEquals("the number of bytes to scan doesn't match", expected.start(), actual.start()); + Assert.assertEquals( + "the number of bytes to scan doesn't match", expected.start(), actual.start()); // simplify comparison on residual expression via comparing toString - Assert.assertEquals("Residual expression doesn't match", - expected.residual().toString(), actual.residual().toString()); + Assert.assertEquals( + "Residual expression doesn't match", + expected.residual().toString(), + actual.residual().toString()); } public static void assertEquals(DataFile expected, DataFile actual) { - Assert.assertEquals("Should match the serialized record path", - expected.path(), actual.path()); - Assert.assertEquals("Should match the serialized record format", - expected.format(), actual.format()); - Assert.assertEquals("Should match the serialized record partition", - expected.partition().get(0, Object.class), actual.partition().get(0, Object.class)); - Assert.assertEquals("Should match the serialized record count", - expected.recordCount(), actual.recordCount()); - Assert.assertEquals("Should match the serialized record size", - expected.fileSizeInBytes(), actual.fileSizeInBytes()); - Assert.assertEquals("Should match the serialized record value counts", - expected.valueCounts(), actual.valueCounts()); - Assert.assertEquals("Should match the serialized record null value counts", - expected.nullValueCounts(), actual.nullValueCounts()); - Assert.assertEquals("Should match the serialized record lower bounds", 
- expected.lowerBounds(), actual.lowerBounds()); - Assert.assertEquals("Should match the serialized record upper bounds", - expected.upperBounds(), actual.upperBounds()); - Assert.assertEquals("Should match the serialized record key metadata", - expected.keyMetadata(), actual.keyMetadata()); - Assert.assertEquals("Should match the serialized record offsets", - expected.splitOffsets(), actual.splitOffsets()); - Assert.assertEquals("Should match the serialized record offsets", - expected.keyMetadata(), actual.keyMetadata()); + Assert.assertEquals("Should match the serialized record path", expected.path(), actual.path()); + Assert.assertEquals( + "Should match the serialized record format", expected.format(), actual.format()); + Assert.assertEquals( + "Should match the serialized record partition", + expected.partition().get(0, Object.class), + actual.partition().get(0, Object.class)); + Assert.assertEquals( + "Should match the serialized record count", expected.recordCount(), actual.recordCount()); + Assert.assertEquals( + "Should match the serialized record size", + expected.fileSizeInBytes(), + actual.fileSizeInBytes()); + Assert.assertEquals( + "Should match the serialized record value counts", + expected.valueCounts(), + actual.valueCounts()); + Assert.assertEquals( + "Should match the serialized record null value counts", + expected.nullValueCounts(), + actual.nullValueCounts()); + Assert.assertEquals( + "Should match the serialized record lower bounds", + expected.lowerBounds(), + actual.lowerBounds()); + Assert.assertEquals( + "Should match the serialized record upper bounds", + expected.upperBounds(), + actual.upperBounds()); + Assert.assertEquals( + "Should match the serialized record key metadata", + expected.keyMetadata(), + actual.keyMetadata()); + Assert.assertEquals( + "Should match the serialized record offsets", + expected.splitOffsets(), + actual.splitOffsets()); + Assert.assertEquals( + "Should match the serialized record offsets", expected.keyMetadata(), actual.keyMetadata()); } private static List getFileScanTasksInFilePathOrder(BaseCombinedScanTask task) { diff --git a/spark/v3.1/spark/src/test/java/org/apache/iceberg/TestDataFileSerialization.java b/spark/v3.1/spark/src/test/java/org/apache/iceberg/TestDataFileSerialization.java index 12fa8b2fc539..33b5316b72b7 100644 --- a/spark/v3.1/spark/src/test/java/org/apache/iceberg/TestDataFileSerialization.java +++ b/spark/v3.1/spark/src/test/java/org/apache/iceberg/TestDataFileSerialization.java @@ -16,9 +16,12 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg; +import static org.apache.iceberg.TaskCheckHelper.assertEquals; +import static org.apache.iceberg.types.Types.NestedField.optional; +import static org.apache.iceberg.types.Types.NestedField.required; + import com.esotericsoftware.kryo.Kryo; import com.esotericsoftware.kryo.io.Input; import com.esotericsoftware.kryo.io.Output; @@ -51,22 +54,17 @@ import org.junit.Test; import org.junit.rules.TemporaryFolder; -import static org.apache.iceberg.TaskCheckHelper.assertEquals; -import static org.apache.iceberg.types.Types.NestedField.optional; -import static org.apache.iceberg.types.Types.NestedField.required; - public class TestDataFileSerialization { - private static final Schema DATE_SCHEMA = new Schema( - required(1, "id", Types.LongType.get()), - optional(2, "data", Types.StringType.get()), - required(3, "date", Types.StringType.get()), - optional(4, "double", Types.DoubleType.get())); + private static final Schema DATE_SCHEMA = + new Schema( + required(1, "id", Types.LongType.get()), + optional(2, "data", Types.StringType.get()), + required(3, "date", Types.StringType.get()), + optional(4, "double", Types.DoubleType.get())); - private static final PartitionSpec PARTITION_SPEC = PartitionSpec - .builderFor(DATE_SCHEMA) - .identity("date") - .build(); + private static final PartitionSpec PARTITION_SPEC = + PartitionSpec.builderFor(DATE_SCHEMA).identity("date").build(); private static final Map VALUE_COUNTS = Maps.newHashMap(); private static final Map NULL_VALUE_COUNTS = Maps.newHashMap(); @@ -85,20 +83,26 @@ public class TestDataFileSerialization { UPPER_BOUNDS.put(1, longToBuffer(4L)); } - private static final DataFile DATA_FILE = DataFiles - .builder(PARTITION_SPEC) - .withPath("/path/to/data-1.parquet") - .withFileSizeInBytes(1234) - .withPartitionPath("date=2018-06-08") - .withMetrics(new Metrics( - 5L, null, VALUE_COUNTS, NULL_VALUE_COUNTS, NAN_VALUE_COUNTS, LOWER_BOUNDS, UPPER_BOUNDS)) - .withSplitOffsets(ImmutableList.of(4L)) - .withEncryptionKeyMetadata(ByteBuffer.allocate(4).putInt(34)) - .withSortOrder(SortOrder.unsorted()) - .build(); - - @Rule - public TemporaryFolder temp = new TemporaryFolder(); + private static final DataFile DATA_FILE = + DataFiles.builder(PARTITION_SPEC) + .withPath("/path/to/data-1.parquet") + .withFileSizeInBytes(1234) + .withPartitionPath("date=2018-06-08") + .withMetrics( + new Metrics( + 5L, + null, + VALUE_COUNTS, + NULL_VALUE_COUNTS, + NAN_VALUE_COUNTS, + LOWER_BOUNDS, + UPPER_BOUNDS)) + .withSplitOffsets(ImmutableList.of(4L)) + .withEncryptionKeyMetadata(ByteBuffer.allocate(4).putInt(34)) + .withSortOrder(SortOrder.unsorted()) + .build(); + + @Rule public TemporaryFolder temp = new TemporaryFolder(); @Test public void testDataFileKryoSerialization() throws Exception { @@ -128,7 +132,8 @@ public void testDataFileJavaSerialization() throws Exception { out.writeObject(DATA_FILE.copy()); } - try (ObjectInputStream in = new ObjectInputStream(new ByteArrayInputStream(bytes.toByteArray()))) { + try (ObjectInputStream in = + new ObjectInputStream(new ByteArrayInputStream(bytes.toByteArray()))) { for (int i = 0; i < 2; i += 1) { Object obj = in.readObject(); Assertions.assertThat(obj).as("Should be a DataFile").isInstanceOf(DataFile.class); @@ -140,13 +145,14 @@ public void testDataFileJavaSerialization() throws Exception { @Test public void testParquetWriterSplitOffsets() throws IOException { Iterable records = RandomData.generateSpark(DATE_SCHEMA, 1, 33L); - File parquetFile = new File( - temp.getRoot(), - 
FileFormat.PARQUET.addExtension(UUID.randomUUID().toString())); + File parquetFile = + new File(temp.getRoot(), FileFormat.PARQUET.addExtension(UUID.randomUUID().toString())); FileAppender writer = Parquet.write(Files.localOutput(parquetFile)) .schema(DATE_SCHEMA) - .createWriterFunc(msgType -> SparkParquetWriters.buildWriter(SparkSchemaUtil.convert(DATE_SCHEMA), msgType)) + .createWriterFunc( + msgType -> + SparkParquetWriters.buildWriter(SparkSchemaUtil.convert(DATE_SCHEMA), msgType)) .build(); try { writer.addAll(records); diff --git a/spark/v3.1/spark/src/test/java/org/apache/iceberg/TestFileIOSerialization.java b/spark/v3.1/spark/src/test/java/org/apache/iceberg/TestFileIOSerialization.java index 49a85cb68f17..c6f491ece5ad 100644 --- a/spark/v3.1/spark/src/test/java/org/apache/iceberg/TestFileIOSerialization.java +++ b/spark/v3.1/spark/src/test/java/org/apache/iceberg/TestFileIOSerialization.java @@ -16,9 +16,11 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg; +import static org.apache.iceberg.types.Types.NestedField.optional; +import static org.apache.iceberg.types.Types.NestedField.required; + import java.io.File; import java.io.IOException; import java.util.Map; @@ -36,36 +38,29 @@ import org.junit.Test; import org.junit.rules.TemporaryFolder; -import static org.apache.iceberg.types.Types.NestedField.optional; -import static org.apache.iceberg.types.Types.NestedField.required; - public class TestFileIOSerialization { private static final Configuration CONF = new Configuration(); private static final HadoopTables TABLES = new HadoopTables(CONF); - private static final Schema SCHEMA = new Schema( - required(1, "id", Types.LongType.get()), - optional(2, "data", Types.StringType.get()), - required(3, "date", Types.StringType.get()), - optional(4, "double", Types.DoubleType.get())); + private static final Schema SCHEMA = + new Schema( + required(1, "id", Types.LongType.get()), + optional(2, "data", Types.StringType.get()), + required(3, "date", Types.StringType.get()), + optional(4, "double", Types.DoubleType.get())); - private static final PartitionSpec SPEC = PartitionSpec - .builderFor(SCHEMA) - .identity("date") - .build(); + private static final PartitionSpec SPEC = + PartitionSpec.builderFor(SCHEMA).identity("date").build(); - private static final SortOrder SORT_ORDER = SortOrder.builderFor(SCHEMA) - .asc("id") - .build(); + private static final SortOrder SORT_ORDER = SortOrder.builderFor(SCHEMA).asc("id").build(); static { CONF.set("k1", "v1"); CONF.set("k2", "v2"); } - @Rule - public TemporaryFolder temp = new TemporaryFolder(); + @Rule public TemporaryFolder temp = new TemporaryFolder(); private Table table; @Before diff --git a/spark/v3.1/spark/src/test/java/org/apache/iceberg/TestManifestFileSerialization.java b/spark/v3.1/spark/src/test/java/org/apache/iceberg/TestManifestFileSerialization.java index 25004aa110e4..a20b2d9f05de 100644 --- a/spark/v3.1/spark/src/test/java/org/apache/iceberg/TestManifestFileSerialization.java +++ b/spark/v3.1/spark/src/test/java/org/apache/iceberg/TestManifestFileSerialization.java @@ -16,9 +16,11 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg; +import static org.apache.iceberg.types.Types.NestedField.optional; +import static org.apache.iceberg.types.Types.NestedField.required; + import com.esotericsoftware.kryo.Kryo; import com.esotericsoftware.kryo.io.Input; import com.esotericsoftware.kryo.io.Output; @@ -47,56 +49,57 @@ import org.junit.Test; import org.junit.rules.TemporaryFolder; -import static org.apache.iceberg.types.Types.NestedField.optional; -import static org.apache.iceberg.types.Types.NestedField.required; - public class TestManifestFileSerialization { - private static final Schema SCHEMA = new Schema( - required(1, "id", Types.LongType.get()), - optional(2, "data", Types.StringType.get()), - required(3, "date", Types.StringType.get()), - required(4, "double", Types.DoubleType.get())); - - private static final PartitionSpec SPEC = PartitionSpec - .builderFor(SCHEMA) - .identity("double") - .build(); - - private static final DataFile FILE_A = DataFiles.builder(SPEC) - .withPath("/path/to/data-1.parquet") - .withFileSizeInBytes(0) - .withPartition(TestHelpers.Row.of(1D)) - .withPartitionPath("double=1") - .withMetrics(new Metrics(5L, - null, // no column sizes - ImmutableMap.of(1, 5L, 2, 3L), // value count - ImmutableMap.of(1, 0L, 2, 2L), // null count - ImmutableMap.of(), // nan count - ImmutableMap.of(1, longToBuffer(0L)), // lower bounds - ImmutableMap.of(1, longToBuffer(4L)) // upper bounds - )) - .build(); - - private static final DataFile FILE_B = DataFiles.builder(SPEC) - .withPath("/path/to/data-2.parquet") - .withFileSizeInBytes(0) - .withPartition(TestHelpers.Row.of(Double.NaN)) - .withPartitionPath("double=NaN") - .withMetrics(new Metrics(1L, - null, // no column sizes - ImmutableMap.of(1, 1L, 4, 1L), // value count - ImmutableMap.of(1, 0L, 2, 0L), // null count - ImmutableMap.of(4, 1L), // nan count - ImmutableMap.of(1, longToBuffer(0L)), // lower bounds - ImmutableMap.of(1, longToBuffer(1L)) // upper bounds - )) - .build(); + private static final Schema SCHEMA = + new Schema( + required(1, "id", Types.LongType.get()), + optional(2, "data", Types.StringType.get()), + required(3, "date", Types.StringType.get()), + required(4, "double", Types.DoubleType.get())); + + private static final PartitionSpec SPEC = + PartitionSpec.builderFor(SCHEMA).identity("double").build(); + + private static final DataFile FILE_A = + DataFiles.builder(SPEC) + .withPath("/path/to/data-1.parquet") + .withFileSizeInBytes(0) + .withPartition(TestHelpers.Row.of(1D)) + .withPartitionPath("double=1") + .withMetrics( + new Metrics( + 5L, + null, // no column sizes + ImmutableMap.of(1, 5L, 2, 3L), // value count + ImmutableMap.of(1, 0L, 2, 2L), // null count + ImmutableMap.of(), // nan count + ImmutableMap.of(1, longToBuffer(0L)), // lower bounds + ImmutableMap.of(1, longToBuffer(4L)) // upper bounds + )) + .build(); + + private static final DataFile FILE_B = + DataFiles.builder(SPEC) + .withPath("/path/to/data-2.parquet") + .withFileSizeInBytes(0) + .withPartition(TestHelpers.Row.of(Double.NaN)) + .withPartitionPath("double=NaN") + .withMetrics( + new Metrics( + 1L, + null, // no column sizes + ImmutableMap.of(1, 1L, 4, 1L), // value count + ImmutableMap.of(1, 0L, 2, 0L), // null count + ImmutableMap.of(4, 1L), // nan count + ImmutableMap.of(1, longToBuffer(0L)), // lower bounds + ImmutableMap.of(1, longToBuffer(1L)) // upper bounds + )) + .build(); private static final FileIO FILE_IO = new HadoopFileIO(new Configuration()); - @Rule - public TemporaryFolder temp = new TemporaryFolder(); + @Rule public 
TemporaryFolder temp = new TemporaryFolder(); @Test public void testManifestFileKryoSerialization() throws IOException { @@ -134,7 +137,8 @@ public void testManifestFileJavaSerialization() throws Exception { out.writeObject(GenericManifestFile.copyOf(manifest).build()); } - try (ObjectInputStream in = new ObjectInputStream(new ByteArrayInputStream(bytes.toByteArray()))) { + try (ObjectInputStream in = + new ObjectInputStream(new ByteArrayInputStream(bytes.toByteArray()))) { for (int i = 0; i < 3; i += 1) { Object obj = in.readObject(); Assertions.assertThat(obj).as("Should be a ManifestFile").isInstanceOf(ManifestFile.class); @@ -148,27 +152,46 @@ private void checkManifestFile(ManifestFile expected, ManifestFile actual) { Assert.assertEquals("Length must match", expected.length(), actual.length()); Assert.assertEquals("Spec id must match", expected.partitionSpecId(), actual.partitionSpecId()); Assert.assertEquals("Snapshot id must match", expected.snapshotId(), actual.snapshotId()); - Assert.assertEquals("Added files flag must match", expected.hasAddedFiles(), actual.hasAddedFiles()); - Assert.assertEquals("Added files count must match", expected.addedFilesCount(), actual.addedFilesCount()); - Assert.assertEquals("Added rows count must match", expected.addedRowsCount(), actual.addedRowsCount()); - Assert.assertEquals("Existing files flag must match", expected.hasExistingFiles(), actual.hasExistingFiles()); - Assert.assertEquals("Existing files count must match", expected.existingFilesCount(), actual.existingFilesCount()); - Assert.assertEquals("Existing rows count must match", expected.existingRowsCount(), actual.existingRowsCount()); - Assert.assertEquals("Deleted files flag must match", expected.hasDeletedFiles(), actual.hasDeletedFiles()); - Assert.assertEquals("Deleted files count must match", expected.deletedFilesCount(), actual.deletedFilesCount()); - Assert.assertEquals("Deleted rows count must match", expected.deletedRowsCount(), actual.deletedRowsCount()); + Assert.assertEquals( + "Added files flag must match", expected.hasAddedFiles(), actual.hasAddedFiles()); + Assert.assertEquals( + "Added files count must match", expected.addedFilesCount(), actual.addedFilesCount()); + Assert.assertEquals( + "Added rows count must match", expected.addedRowsCount(), actual.addedRowsCount()); + Assert.assertEquals( + "Existing files flag must match", expected.hasExistingFiles(), actual.hasExistingFiles()); + Assert.assertEquals( + "Existing files count must match", + expected.existingFilesCount(), + actual.existingFilesCount()); + Assert.assertEquals( + "Existing rows count must match", expected.existingRowsCount(), actual.existingRowsCount()); + Assert.assertEquals( + "Deleted files flag must match", expected.hasDeletedFiles(), actual.hasDeletedFiles()); + Assert.assertEquals( + "Deleted files count must match", expected.deletedFilesCount(), actual.deletedFilesCount()); + Assert.assertEquals( + "Deleted rows count must match", expected.deletedRowsCount(), actual.deletedRowsCount()); PartitionFieldSummary expectedPartition = expected.partitions().get(0); PartitionFieldSummary actualPartition = actual.partitions().get(0); - Assert.assertEquals("Null flag in partition must match", - expectedPartition.containsNull(), actualPartition.containsNull()); - Assert.assertEquals("NaN flag in partition must match", - expectedPartition.containsNaN(), actualPartition.containsNaN()); - Assert.assertEquals("Lower bounds in partition must match", - expectedPartition.lowerBound(), actualPartition.lowerBound()); 
- Assert.assertEquals("Upper bounds in partition must match", - expectedPartition.upperBound(), actualPartition.upperBound()); + Assert.assertEquals( + "Null flag in partition must match", + expectedPartition.containsNull(), + actualPartition.containsNull()); + Assert.assertEquals( + "NaN flag in partition must match", + expectedPartition.containsNaN(), + actualPartition.containsNaN()); + Assert.assertEquals( + "Lower bounds in partition must match", + expectedPartition.lowerBound(), + actualPartition.lowerBound()); + Assert.assertEquals( + "Upper bounds in partition must match", + expectedPartition.upperBound(), + actualPartition.upperBound()); } private ManifestFile writeManifest(DataFile... files) throws IOException { diff --git a/spark/v3.1/spark/src/test/java/org/apache/iceberg/TestScanTaskSerialization.java b/spark/v3.1/spark/src/test/java/org/apache/iceberg/TestScanTaskSerialization.java index e234ee2617aa..4dd34f7a7611 100644 --- a/spark/v3.1/spark/src/test/java/org/apache/iceberg/TestScanTaskSerialization.java +++ b/spark/v3.1/spark/src/test/java/org/apache/iceberg/TestScanTaskSerialization.java @@ -16,9 +16,10 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg; +import static org.apache.iceberg.types.Types.NestedField.optional; + import com.esotericsoftware.kryo.Kryo; import com.esotericsoftware.kryo.io.Input; import com.esotericsoftware.kryo.io.Output; @@ -50,19 +51,16 @@ import org.junit.Test; import org.junit.rules.TemporaryFolder; -import static org.apache.iceberg.types.Types.NestedField.optional; - public class TestScanTaskSerialization extends SparkTestBase { private static final HadoopTables TABLES = new HadoopTables(new Configuration()); - private static final Schema SCHEMA = new Schema( - optional(1, "c1", Types.IntegerType.get()), - optional(2, "c2", Types.StringType.get()), - optional(3, "c3", Types.StringType.get()) - ); + private static final Schema SCHEMA = + new Schema( + optional(1, "c1", Types.IntegerType.get()), + optional(2, "c2", Types.StringType.get()), + optional(3, "c3", Types.StringType.get())); - @Rule - public TemporaryFolder temp = new TemporaryFolder(); + @Rule public TemporaryFolder temp = new TemporaryFolder(); private String tableLocation = null; @@ -86,7 +84,9 @@ public void testBaseCombinedScanTaskKryoSerialization() throws Exception { try (Input in = new Input(new FileInputStream(data))) { Object obj = kryo.readClassAndObject(in); - Assertions.assertThat(obj).as("Should be a BaseCombinedScanTask").isInstanceOf(BaseCombinedScanTask.class); + Assertions.assertThat(obj) + .as("Should be a BaseCombinedScanTask") + .isInstanceOf(BaseCombinedScanTask.class); TaskCheckHelper.assertEquals(scanTask, (BaseCombinedScanTask) obj); } } @@ -100,9 +100,12 @@ public void testBaseCombinedScanTaskJavaSerialization() throws Exception { out.writeObject(scanTask); } - try (ObjectInputStream in = new ObjectInputStream(new ByteArrayInputStream(bytes.toByteArray()))) { + try (ObjectInputStream in = + new ObjectInputStream(new ByteArrayInputStream(bytes.toByteArray()))) { Object obj = in.readObject(); - Assertions.assertThat(obj).as("Should be a BaseCombinedScanTask").isInstanceOf(BaseCombinedScanTask.class); + Assertions.assertThat(obj) + .as("Should be a BaseCombinedScanTask") + .isInstanceOf(BaseCombinedScanTask.class); TaskCheckHelper.assertEquals(scanTask, (BaseCombinedScanTask) obj); } } @@ -112,16 +115,15 @@ private BaseCombinedScanTask prepareBaseCombinedScanTaskForSerDeTest() { Map options = 
Maps.newHashMap(); Table table = TABLES.create(SCHEMA, spec, options, tableLocation); - List records1 = Lists.newArrayList( - new ThreeColumnRecord(1, null, "AAAA"), - new ThreeColumnRecord(1, "BBBBBBBBBB", "BBBB") - ); + List records1 = + Lists.newArrayList( + new ThreeColumnRecord(1, null, "AAAA"), new ThreeColumnRecord(1, "BBBBBBBBBB", "BBBB")); writeRecords(records1); - List records2 = Lists.newArrayList( - new ThreeColumnRecord(2, "CCCCCCCCCC", "CCCC"), - new ThreeColumnRecord(2, "DDDDDDDDDD", "DDDD") - ); + List records2 = + Lists.newArrayList( + new ThreeColumnRecord(2, "CCCCCCCCCC", "CCCC"), + new ThreeColumnRecord(2, "DDDDDDDDDD", "DDDD")); writeRecords(records2); table.refresh(); @@ -136,10 +138,6 @@ private void writeRecords(List records) { } private void writeDF(Dataset df) { - df.select("c1", "c2", "c3") - .write() - .format("iceberg") - .mode("append") - .save(tableLocation); + df.select("c1", "c2", "c3").write().format("iceberg").mode("append").save(tableLocation); } } diff --git a/spark/v3.1/spark/src/test/java/org/apache/iceberg/TestTableSerialization.java b/spark/v3.1/spark/src/test/java/org/apache/iceberg/TestTableSerialization.java index 8aa89b9f3199..30a167d575b1 100644 --- a/spark/v3.1/spark/src/test/java/org/apache/iceberg/TestTableSerialization.java +++ b/spark/v3.1/spark/src/test/java/org/apache/iceberg/TestTableSerialization.java @@ -16,9 +16,11 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg; +import static org.apache.iceberg.types.Types.NestedField.optional; +import static org.apache.iceberg.types.Types.NestedField.required; + import java.io.File; import java.io.IOException; import java.util.Map; @@ -32,30 +34,23 @@ import org.junit.Test; import org.junit.rules.TemporaryFolder; -import static org.apache.iceberg.types.Types.NestedField.optional; -import static org.apache.iceberg.types.Types.NestedField.required; - public class TestTableSerialization { private static final HadoopTables TABLES = new HadoopTables(); - private static final Schema SCHEMA = new Schema( - required(1, "id", Types.LongType.get()), - optional(2, "data", Types.StringType.get()), - required(3, "date", Types.StringType.get()), - optional(4, "double", Types.DoubleType.get())); + private static final Schema SCHEMA = + new Schema( + required(1, "id", Types.LongType.get()), + optional(2, "data", Types.StringType.get()), + required(3, "date", Types.StringType.get()), + optional(4, "double", Types.DoubleType.get())); - private static final PartitionSpec SPEC = PartitionSpec - .builderFor(SCHEMA) - .identity("date") - .build(); + private static final PartitionSpec SPEC = + PartitionSpec.builderFor(SCHEMA).identity("date").build(); - private static final SortOrder SORT_ORDER = SortOrder.builderFor(SCHEMA) - .asc("id") - .build(); + private static final SortOrder SORT_ORDER = SortOrder.builderFor(SCHEMA).asc("id").build(); - @Rule - public TemporaryFolder temp = new TemporaryFolder(); + @Rule public TemporaryFolder temp = new TemporaryFolder(); private Table table; @Before @@ -71,19 +66,20 @@ public void initTable() throws IOException { @Test public void testSerializableTableKryoSerialization() throws IOException { Table serializableTable = SerializableTableWithSize.copyOf(table); - TestHelpers.assertSerializedAndLoadedMetadata(table, KryoHelpers.roundTripSerialize(serializableTable)); + TestHelpers.assertSerializedAndLoadedMetadata( + table, KryoHelpers.roundTripSerialize(serializableTable)); } @Test public void 
testSerializableMetadataTableKryoSerialization() throws IOException { for (MetadataTableType type : MetadataTableType.values()) { TableOperations ops = ((HasTableOperations) table).operations(); - Table metadataTable = MetadataTableUtils.createMetadataTableInstance(ops, table.name(), "meta", type); + Table metadataTable = + MetadataTableUtils.createMetadataTableInstance(ops, table.name(), "meta", type); Table serializableMetadataTable = SerializableTableWithSize.copyOf(metadataTable); TestHelpers.assertSerializedAndLoadedMetadata( - metadataTable, - KryoHelpers.roundTripSerialize(serializableMetadataTable)); + metadataTable, KryoHelpers.roundTripSerialize(serializableMetadataTable)); } } @@ -91,13 +87,12 @@ public void testSerializableMetadataTableKryoSerialization() throws IOException public void testSerializableTransactionTableKryoSerialization() throws IOException { Transaction txn = table.newTransaction(); - txn.updateProperties() - .set("k1", "v1") - .commit(); + txn.updateProperties().set("k1", "v1").commit(); Table txnTable = txn.table(); Table serializableTxnTable = SerializableTableWithSize.copyOf(txnTable); - TestHelpers.assertSerializedMetadata(txnTable, KryoHelpers.roundTripSerialize(serializableTxnTable)); + TestHelpers.assertSerializedMetadata( + txnTable, KryoHelpers.roundTripSerialize(serializableTxnTable)); } } diff --git a/spark/v3.1/spark/src/test/java/org/apache/iceberg/spark/SparkCatalogConfig.java b/spark/v3.1/spark/src/test/java/org/apache/iceberg/spark/SparkCatalogConfig.java index 5d5dfebf9532..1006ed380ff9 100644 --- a/spark/v3.1/spark/src/test/java/org/apache/iceberg/spark/SparkCatalogConfig.java +++ b/spark/v3.1/spark/src/test/java/org/apache/iceberg/spark/SparkCatalogConfig.java @@ -16,26 +16,29 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark; import java.util.Map; import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap; public enum SparkCatalogConfig { - HIVE("testhive", SparkCatalog.class.getName(), ImmutableMap.of( - "type", "hive", - "default-namespace", "default" - )), - HADOOP("testhadoop", SparkCatalog.class.getName(), ImmutableMap.of( - "type", "hadoop" - )), - SPARK("spark_catalog", SparkSessionCatalog.class.getName(), ImmutableMap.of( - "type", "hive", - "default-namespace", "default", - "parquet-enabled", "true", - "cache-enabled", "false" // Spark will delete tables using v1, leaving the cache out of sync - )); + HIVE( + "testhive", + SparkCatalog.class.getName(), + ImmutableMap.of( + "type", "hive", + "default-namespace", "default")), + HADOOP("testhadoop", SparkCatalog.class.getName(), ImmutableMap.of("type", "hadoop")), + SPARK( + "spark_catalog", + SparkSessionCatalog.class.getName(), + ImmutableMap.of( + "type", "hive", + "default-namespace", "default", + "parquet-enabled", "true", + "cache-enabled", + "false" // Spark will delete tables using v1, leaving the cache out of sync + )); private final String catalogName; private final String implementation; diff --git a/spark/v3.1/spark/src/test/java/org/apache/iceberg/spark/SparkCatalogTestBase.java b/spark/v3.1/spark/src/test/java/org/apache/iceberg/spark/SparkCatalogTestBase.java index 01537a928144..9f69fce018be 100644 --- a/spark/v3.1/spark/src/test/java/org/apache/iceberg/spark/SparkCatalogTestBase.java +++ b/spark/v3.1/spark/src/test/java/org/apache/iceberg/spark/SparkCatalogTestBase.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.spark; import java.util.Map; @@ -31,31 +30,32 @@ public abstract class SparkCatalogTestBase extends SparkTestBaseWithCatalog { @Parameterized.Parameters(name = "catalogName = {0}, implementation = {1}, config = {2}") public static Object[][] parameters() { return new Object[][] { - { - SparkCatalogConfig.HIVE.catalogName(), - SparkCatalogConfig.HIVE.implementation(), - SparkCatalogConfig.HIVE.properties() - }, - { - SparkCatalogConfig.HADOOP.catalogName(), - SparkCatalogConfig.HADOOP.implementation(), - SparkCatalogConfig.HADOOP.properties() - }, - { - SparkCatalogConfig.SPARK.catalogName(), - SparkCatalogConfig.SPARK.implementation(), - SparkCatalogConfig.SPARK.properties() - }}; + { + SparkCatalogConfig.HIVE.catalogName(), + SparkCatalogConfig.HIVE.implementation(), + SparkCatalogConfig.HIVE.properties() + }, + { + SparkCatalogConfig.HADOOP.catalogName(), + SparkCatalogConfig.HADOOP.implementation(), + SparkCatalogConfig.HADOOP.properties() + }, + { + SparkCatalogConfig.SPARK.catalogName(), + SparkCatalogConfig.SPARK.implementation(), + SparkCatalogConfig.SPARK.properties() + } + }; } - @Rule - public TemporaryFolder temp = new TemporaryFolder(); + @Rule public TemporaryFolder temp = new TemporaryFolder(); public SparkCatalogTestBase(SparkCatalogConfig config) { super(config); } - public SparkCatalogTestBase(String catalogName, String implementation, Map config) { + public SparkCatalogTestBase( + String catalogName, String implementation, Map config) { super(catalogName, implementation, config); } } diff --git a/spark/v3.1/spark/src/test/java/org/apache/iceberg/spark/SparkTestBase.java b/spark/v3.1/spark/src/test/java/org/apache/iceberg/spark/SparkTestBase.java index 4dfdcf758181..9db0d6d410ee 100644 --- a/spark/v3.1/spark/src/test/java/org/apache/iceberg/spark/SparkTestBase.java +++ b/spark/v3.1/spark/src/test/java/org/apache/iceberg/spark/SparkTestBase.java @@ -16,9 +16,10 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark; +import static org.apache.hadoop.hive.conf.HiveConf.ConfVars.METASTOREURIS; + import java.util.List; import java.util.Map; import java.util.Set; @@ -46,8 +47,6 @@ import org.junit.BeforeClass; import scala.collection.JavaConverters; -import static org.apache.hadoop.hive.conf.HiveConf.ConfVars.METASTOREURIS; - public abstract class SparkTestBase { protected static final Object ANY = new Object(); @@ -63,15 +62,18 @@ public static void startMetastoreAndSpark() { metastore.start(); SparkTestBase.hiveConf = metastore.hiveConf(); - SparkTestBase.spark = SparkSession.builder() - .master("local[2]") - .config(SQLConf.PARTITION_OVERWRITE_MODE().key(), "dynamic") - .config("spark.hadoop." + METASTOREURIS.varname, hiveConf.get(METASTOREURIS.varname)) - .enableHiveSupport() - .getOrCreate(); + SparkTestBase.spark = + SparkSession.builder() + .master("local[2]") + .config(SQLConf.PARTITION_OVERWRITE_MODE().key(), "dynamic") + .config("spark.hadoop." 
+ METASTOREURIS.varname, hiveConf.get(METASTOREURIS.varname)) + .enableHiveSupport() + .getOrCreate(); - SparkTestBase.catalog = (HiveCatalog) - CatalogUtil.loadCatalog(HiveCatalog.class.getName(), "hive", ImmutableMap.of(), hiveConf); + SparkTestBase.catalog = + (HiveCatalog) + CatalogUtil.loadCatalog( + HiveCatalog.class.getName(), "hive", ImmutableMap.of(), hiveConf); try { catalog.createNamespace(Namespace.of("default")); @@ -112,22 +114,23 @@ protected List rowsToJava(List rows) { private Object[] toJava(Row row) { return IntStream.range(0, row.size()) - .mapToObj(pos -> { - if (row.isNullAt(pos)) { - return null; - } + .mapToObj( + pos -> { + if (row.isNullAt(pos)) { + return null; + } - Object value = row.get(pos); - if (value instanceof Row) { - return toJava((Row) value); - } else if (value instanceof scala.collection.Seq) { - return row.getList(pos); - } else if (value instanceof scala.collection.Map) { - return row.getJavaMap(pos); - } else { - return value; - } - }) + Object value = row.get(pos); + if (value instanceof Row) { + return toJava((Row) value); + } else if (value instanceof scala.collection.Seq) { + return row.getList(pos); + } else if (value instanceof scala.collection.Map) { + return row.getJavaMap(pos); + } else { + return value; + } + }) .toArray(Object[]::new); } @@ -143,8 +146,10 @@ protected Object[] row(Object... values) { return values; } - protected void assertEquals(String context, List expectedRows, List actualRows) { - Assert.assertEquals(context + ": number of results should match", expectedRows.size(), actualRows.size()); + protected void assertEquals( + String context, List expectedRows, List actualRows) { + Assert.assertEquals( + context + ": number of results should match", expectedRows.size(), actualRows.size()); for (int row = 0; row < expectedRows.size(); row += 1) { Object[] expected = expectedRows.get(row); Object[] actual = actualRows.get(row); @@ -178,59 +183,70 @@ protected void withSQLConf(Map conf, Action action) { SQLConf sqlConf = SQLConf.get(); Map currentConfValues = Maps.newHashMap(); - conf.keySet().forEach(confKey -> { - if (sqlConf.contains(confKey)) { - String currentConfValue = sqlConf.getConfString(confKey); - currentConfValues.put(confKey, currentConfValue); - } - }); + conf.keySet() + .forEach( + confKey -> { + if (sqlConf.contains(confKey)) { + String currentConfValue = sqlConf.getConfString(confKey); + currentConfValues.put(confKey, currentConfValue); + } + }); - conf.forEach((confKey, confValue) -> { - if (SQLConf.staticConfKeys().contains(confKey)) { - throw new RuntimeException("Cannot modify the value of a static config: " + confKey); - } - sqlConf.setConfString(confKey, confValue); - }); + conf.forEach( + (confKey, confValue) -> { + if (SQLConf.staticConfKeys().contains(confKey)) { + throw new RuntimeException("Cannot modify the value of a static config: " + confKey); + } + sqlConf.setConfString(confKey, confValue); + }); try { action.invoke(); } finally { - conf.forEach((confKey, confValue) -> { - if (currentConfValues.containsKey(confKey)) { - sqlConf.setConfString(confKey, currentConfValues.get(confKey)); - } else { - sqlConf.unsetConf(confKey); - } - }); + conf.forEach( + (confKey, confValue) -> { + if (currentConfValues.containsKey(confKey)) { + sqlConf.setConfString(confKey, currentConfValues.get(confKey)); + } else { + sqlConf.unsetConf(confKey); + } + }); } } private Map currentExecutionUIDataMap() throws TimeoutException { spark.sparkContext().listenerBus().waitUntilEmpty(10000); - return 
JavaConverters.seqAsJavaList(spark.sharedState().statusStore().executionsList()) - .stream().collect(Collectors.toMap(data -> data.executionId(), data -> data)); + return JavaConverters.seqAsJavaList(spark.sharedState().statusStore().executionsList()).stream() + .collect(Collectors.toMap(data -> data.executionId(), data -> data)); } - protected void checkMetrics(Callable sparkCallable, Map expectedMetrics) throws Exception { + protected void checkMetrics(Callable sparkCallable, Map expectedMetrics) + throws Exception { Set originalExecutionIds = currentExecutionUIDataMap().keySet(); sparkCallable.call(); Map currentExecutions = currentExecutionUIDataMap(); Set currentExecutionIds = currentExecutions.keySet(); currentExecutionIds.removeAll(originalExecutionIds); Assert.assertEquals(currentExecutionIds.size(), 1); - SQLExecutionUIData currentExecution = currentExecutions.get(currentExecutionIds.stream().findFirst().get()); + SQLExecutionUIData currentExecution = + currentExecutions.get(currentExecutionIds.stream().findFirst().get()); Map metricsIds = Maps.newHashMap(); - JavaConverters.seqAsJavaList(currentExecution.metrics()).stream().forEach(metricsDeclaration -> { - if (expectedMetrics.containsKey(metricsDeclaration.name())) { - metricsIds.put(metricsDeclaration.accumulatorId(), metricsDeclaration.name()); - } - }); - Assert.assertEquals("Expected metric name not match", - expectedMetrics.keySet(), Sets.newHashSet(metricsIds.values())); + JavaConverters.seqAsJavaList(currentExecution.metrics()).stream() + .forEach( + metricsDeclaration -> { + if (expectedMetrics.containsKey(metricsDeclaration.name())) { + metricsIds.put(metricsDeclaration.accumulatorId(), metricsDeclaration.name()); + } + }); + Assert.assertEquals( + "Expected metric name not match", + expectedMetrics.keySet(), + Sets.newHashSet(metricsIds.values())); Map currentMetrics = - JavaConverters.mapAsJavaMap(spark.sharedState().statusStore().executionMetrics(currentExecution.executionId())) + JavaConverters.mapAsJavaMap( + spark.sharedState().statusStore().executionMetrics(currentExecution.executionId())) .entrySet().stream() .filter(x -> metricsIds.containsKey(x.getKey())) .collect(Collectors.toMap(x -> metricsIds.get(x.getKey()), x -> x.getValue())); diff --git a/spark/v3.1/spark/src/test/java/org/apache/iceberg/spark/SparkTestBaseWithCatalog.java b/spark/v3.1/spark/src/test/java/org/apache/iceberg/spark/SparkTestBaseWithCatalog.java index 00dcd95ec709..857c61e068b0 100644 --- a/spark/v3.1/spark/src/test/java/org/apache/iceberg/spark/SparkTestBaseWithCatalog.java +++ b/spark/v3.1/spark/src/test/java/org/apache/iceberg/spark/SparkTestBaseWithCatalog.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark; import java.io.File; @@ -49,8 +48,7 @@ public static void dropWarehouse() { } } - @Rule - public TemporaryFolder temp = new TemporaryFolder(); + @Rule public TemporaryFolder temp = new TemporaryFolder(); protected final String catalogName; protected final Catalog validationCatalog; @@ -66,21 +64,25 @@ public SparkTestBaseWithCatalog(SparkCatalogConfig config) { this(config.catalogName(), config.implementation(), config.properties()); } - public SparkTestBaseWithCatalog(String catalogName, String implementation, Map config) { + public SparkTestBaseWithCatalog( + String catalogName, String implementation, Map config) { this.catalogName = catalogName; - this.validationCatalog = catalogName.equals("testhadoop") ? 
- new HadoopCatalog(spark.sessionState().newHadoopConf(), "file:" + warehouse) : - catalog; + this.validationCatalog = + catalogName.equals("testhadoop") + ? new HadoopCatalog(spark.sessionState().newHadoopConf(), "file:" + warehouse) + : catalog; this.validationNamespaceCatalog = (SupportsNamespaces) validationCatalog; spark.conf().set("spark.sql.catalog." + catalogName, implementation); - config.forEach((key, value) -> spark.conf().set("spark.sql.catalog." + catalogName + "." + key, value)); + config.forEach( + (key, value) -> spark.conf().set("spark.sql.catalog." + catalogName + "." + key, value)); if (config.get("type").equalsIgnoreCase("hadoop")) { spark.conf().set("spark.sql.catalog." + catalogName + ".warehouse", "file:" + warehouse); } - this.tableName = (catalogName.equals("spark_catalog") ? "" : catalogName + ".") + "default.table"; + this.tableName = + (catalogName.equals("spark_catalog") ? "" : catalogName + ".") + "default.table"; sql("CREATE NAMESPACE IF NOT EXISTS default"); } diff --git a/spark/v3.1/spark/src/test/java/org/apache/iceberg/spark/TestFileRewriteCoordinator.java b/spark/v3.1/spark/src/test/java/org/apache/iceberg/spark/TestFileRewriteCoordinator.java index 8aa5cd6faec1..2e6886d32df5 100644 --- a/spark/v3.1/spark/src/test/java/org/apache/iceberg/spark/TestFileRewriteCoordinator.java +++ b/spark/v3.1/spark/src/test/java/org/apache/iceberg/spark/TestFileRewriteCoordinator.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark; import java.io.IOException; @@ -44,7 +43,8 @@ public class TestFileRewriteCoordinator extends SparkCatalogTestBase { - public TestFileRewriteCoordinator(String catalogName, String implementation, Map config) { + public TestFileRewriteCoordinator( + String catalogName, String implementation, Map config) { super(catalogName, implementation, config); } @@ -66,7 +66,8 @@ public void testBinPackRewrite() throws NoSuchTableException, IOException { Table table = validationCatalog.loadTable(tableIdent); Assert.assertEquals("Should produce 4 snapshots", 4, Iterables.size(table.snapshots())); - Dataset fileDF = spark.read().format("iceberg").load(tableName(tableIdent.name() + ".files")); + Dataset fileDF = + spark.read().format("iceberg").load(tableName(tableIdent.name() + ".files")); List fileSizes = fileDF.select("file_size_in_bytes").as(Encoders.LONG()).collectAsList(); long avgFileSize = fileSizes.stream().mapToLong(i -> i).sum() / fileSizes.size(); @@ -77,22 +78,27 @@ public void testBinPackRewrite() throws NoSuchTableException, IOException { taskSetManager.stageTasks(table, fileSetID, Lists.newArrayList(fileScanTasks)); // read and pack original 4 files into 2 splits - Dataset scanDF = spark.read().format("iceberg") - .option(SparkReadOptions.FILE_SCAN_TASK_SET_ID, fileSetID) - .option(SparkReadOptions.SPLIT_SIZE, Long.toString(avgFileSize * 2)) - .option(SparkReadOptions.FILE_OPEN_COST, "0") - .load(tableName); + Dataset scanDF = + spark + .read() + .format("iceberg") + .option(SparkReadOptions.FILE_SCAN_TASK_SET_ID, fileSetID) + .option(SparkReadOptions.SPLIT_SIZE, Long.toString(avgFileSize * 2)) + .option(SparkReadOptions.FILE_OPEN_COST, "0") + .load(tableName); // write the packed data into new files where each split becomes a new file - scanDF.writeTo(tableName) + scanDF + .writeTo(tableName) .option(SparkWriteOptions.REWRITTEN_FILE_SCAN_TASK_SET_ID, fileSetID) .append(); // commit the rewrite FileRewriteCoordinator rewriteCoordinator = 
FileRewriteCoordinator.get(); - Set rewrittenFiles = taskSetManager.fetchTasks(table, fileSetID).stream() - .map(FileScanTask::file) - .collect(Collectors.toSet()); + Set rewrittenFiles = + taskSetManager.fetchTasks(table, fileSetID).stream() + .map(FileScanTask::file) + .collect(Collectors.toSet()); Set addedFiles = rewriteCoordinator.fetchNewDataFiles(table, fileSetID); table.newRewrite().rewriteFiles(rewrittenFiles, addedFiles).commit(); } @@ -127,34 +133,42 @@ public void testSortRewrite() throws NoSuchTableException, IOException { taskSetManager.stageTasks(table, fileSetID, Lists.newArrayList(fileScanTasks)); // read original 4 files as 4 splits - Dataset scanDF = spark.read().format("iceberg") - .option(SparkReadOptions.FILE_SCAN_TASK_SET_ID, fileSetID) - .option(SparkReadOptions.SPLIT_SIZE, "134217728") - .option(SparkReadOptions.FILE_OPEN_COST, "134217728") - .load(tableName); + Dataset scanDF = + spark + .read() + .format("iceberg") + .option(SparkReadOptions.FILE_SCAN_TASK_SET_ID, fileSetID) + .option(SparkReadOptions.SPLIT_SIZE, "134217728") + .option(SparkReadOptions.FILE_OPEN_COST, "134217728") + .load(tableName); // make sure we disable AQE and set the number of shuffle partitions as the target num files - ImmutableMap sqlConf = ImmutableMap.of( - "spark.sql.shuffle.partitions", "2", - "spark.sql.adaptive.enabled", "false" - ); - - withSQLConf(sqlConf, () -> { - try { - // write new files with sorted records - scanDF.sort("id").writeTo(tableName) - .option(SparkWriteOptions.REWRITTEN_FILE_SCAN_TASK_SET_ID, fileSetID) - .append(); - } catch (NoSuchTableException e) { - throw new RuntimeException("Could not replace files", e); - } - }); + ImmutableMap sqlConf = + ImmutableMap.of( + "spark.sql.shuffle.partitions", "2", + "spark.sql.adaptive.enabled", "false"); + + withSQLConf( + sqlConf, + () -> { + try { + // write new files with sorted records + scanDF + .sort("id") + .writeTo(tableName) + .option(SparkWriteOptions.REWRITTEN_FILE_SCAN_TASK_SET_ID, fileSetID) + .append(); + } catch (NoSuchTableException e) { + throw new RuntimeException("Could not replace files", e); + } + }); // commit the rewrite FileRewriteCoordinator rewriteCoordinator = FileRewriteCoordinator.get(); - Set rewrittenFiles = taskSetManager.fetchTasks(table, fileSetID).stream() - .map(FileScanTask::file) - .collect(Collectors.toSet()); + Set rewrittenFiles = + taskSetManager.fetchTasks(table, fileSetID).stream() + .map(FileScanTask::file) + .collect(Collectors.toSet()); Set addedFiles = rewriteCoordinator.fetchNewDataFiles(table, fileSetID); table.newRewrite().rewriteFiles(rewrittenFiles, addedFiles).commit(); } @@ -199,7 +213,8 @@ public void testCommitMultipleRewrites() throws NoSuchTableException, IOExceptio String secondFileSetID = UUID.randomUUID().toString(); - try (CloseableIterable tasks = table.newScan().appendsAfter(firstFileSetSnapshotId).planFiles()) { + try (CloseableIterable tasks = + table.newScan().appendsAfter(firstFileSetSnapshotId).planFiles()) { // stage 2 more files for compaction taskSetManager.stageTasks(table, secondFileSetID, Lists.newArrayList(tasks)); } @@ -208,26 +223,32 @@ public void testCommitMultipleRewrites() throws NoSuchTableException, IOExceptio for (String fileSetID : fileSetIDs) { // read and pack 2 files into 1 split - Dataset scanDF = spark.read().format("iceberg") - .option(SparkReadOptions.FILE_SCAN_TASK_SET_ID, fileSetID) - .option(SparkReadOptions.SPLIT_SIZE, Long.MAX_VALUE) - .load(tableName); + Dataset scanDF = + spark + .read() + .format("iceberg") + 
.option(SparkReadOptions.FILE_SCAN_TASK_SET_ID, fileSetID) + .option(SparkReadOptions.SPLIT_SIZE, Long.MAX_VALUE) + .load(tableName); // write the combined data as one file - scanDF.writeTo(tableName) + scanDF + .writeTo(tableName) .option(SparkWriteOptions.REWRITTEN_FILE_SCAN_TASK_SET_ID, fileSetID) .append(); } // commit both rewrites at the same time FileRewriteCoordinator rewriteCoordinator = FileRewriteCoordinator.get(); - Set rewrittenFiles = fileSetIDs.stream().flatMap(fileSetID -> - taskSetManager.fetchTasks(table, fileSetID).stream()) - .map(FileScanTask::file) - .collect(Collectors.toSet()); - Set addedFiles = fileSetIDs.stream() - .flatMap(fileSetID -> rewriteCoordinator.fetchNewDataFiles(table, fileSetID).stream()) - .collect(Collectors.toSet()); + Set rewrittenFiles = + fileSetIDs.stream() + .flatMap(fileSetID -> taskSetManager.fetchTasks(table, fileSetID).stream()) + .map(FileScanTask::file) + .collect(Collectors.toSet()); + Set addedFiles = + fileSetIDs.stream() + .flatMap(fileSetID -> rewriteCoordinator.fetchNewDataFiles(table, fileSetID).stream()) + .collect(Collectors.toSet()); table.newRewrite().rewriteFiles(rewrittenFiles, addedFiles).commit(); table.refresh(); diff --git a/spark/v3.1/spark/src/test/java/org/apache/iceberg/spark/TestSpark3Util.java b/spark/v3.1/spark/src/test/java/org/apache/iceberg/spark/TestSpark3Util.java index 8d57fe112033..e3191c67e263 100644 --- a/spark/v3.1/spark/src/test/java/org/apache/iceberg/spark/TestSpark3Util.java +++ b/spark/v3.1/spark/src/test/java/org/apache/iceberg/spark/TestSpark3Util.java @@ -16,9 +16,13 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark; +import static org.apache.iceberg.NullOrder.NULLS_FIRST; +import static org.apache.iceberg.NullOrder.NULLS_LAST; +import static org.apache.iceberg.types.Types.NestedField.optional; +import static org.apache.iceberg.types.Types.NestedField.required; + import org.apache.iceberg.Schema; import org.apache.iceberg.SortOrder; import org.apache.iceberg.SortOrderParser; @@ -27,54 +31,69 @@ import org.junit.Assert; import org.junit.Test; -import static org.apache.iceberg.NullOrder.NULLS_FIRST; -import static org.apache.iceberg.NullOrder.NULLS_LAST; -import static org.apache.iceberg.types.Types.NestedField.optional; -import static org.apache.iceberg.types.Types.NestedField.required; - public class TestSpark3Util extends SparkTestBase { @Test public void testDescribeSortOrder() { - Schema schema = new Schema( + Schema schema = + new Schema( required(1, "data", Types.StringType.get()), - required(2, "time", Types.TimestampType.withoutZone()) - ); + required(2, "time", Types.TimestampType.withoutZone())); - Assert.assertEquals("Sort order isn't correct.", "data DESC NULLS FIRST", + Assert.assertEquals( + "Sort order isn't correct.", + "data DESC NULLS FIRST", Spark3Util.describe(buildSortOrder("Identity", schema, 1))); - Assert.assertEquals("Sort order isn't correct.", "bucket(1, data) DESC NULLS FIRST", + Assert.assertEquals( + "Sort order isn't correct.", + "bucket(1, data) DESC NULLS FIRST", Spark3Util.describe(buildSortOrder("bucket[1]", schema, 1))); - Assert.assertEquals("Sort order isn't correct.", "truncate(data, 3) DESC NULLS FIRST", + Assert.assertEquals( + "Sort order isn't correct.", + "truncate(data, 3) DESC NULLS FIRST", Spark3Util.describe(buildSortOrder("truncate[3]", schema, 1))); - Assert.assertEquals("Sort order isn't correct.", "years(time) DESC NULLS FIRST", + Assert.assertEquals( + "Sort order isn't 
correct.", + "years(time) DESC NULLS FIRST", Spark3Util.describe(buildSortOrder("year", schema, 2))); - Assert.assertEquals("Sort order isn't correct.", "months(time) DESC NULLS FIRST", + Assert.assertEquals( + "Sort order isn't correct.", + "months(time) DESC NULLS FIRST", Spark3Util.describe(buildSortOrder("month", schema, 2))); - Assert.assertEquals("Sort order isn't correct.", "days(time) DESC NULLS FIRST", + Assert.assertEquals( + "Sort order isn't correct.", + "days(time) DESC NULLS FIRST", Spark3Util.describe(buildSortOrder("day", schema, 2))); - Assert.assertEquals("Sort order isn't correct.", "hours(time) DESC NULLS FIRST", + Assert.assertEquals( + "Sort order isn't correct.", + "hours(time) DESC NULLS FIRST", Spark3Util.describe(buildSortOrder("hour", schema, 2))); - Assert.assertEquals("Sort order isn't correct.", "unknown(data) DESC NULLS FIRST", + Assert.assertEquals( + "Sort order isn't correct.", + "unknown(data) DESC NULLS FIRST", Spark3Util.describe(buildSortOrder("unknown", schema, 1))); // multiple sort orders - SortOrder multiOrder = SortOrder.builderFor(schema) - .asc("time", NULLS_FIRST) - .asc("data", NULLS_LAST) - .build(); - Assert.assertEquals("Sort order isn't correct.", "time ASC NULLS FIRST, data ASC NULLS LAST", - Spark3Util.describe(multiOrder)); + SortOrder multiOrder = + SortOrder.builderFor(schema).asc("time", NULLS_FIRST).asc("data", NULLS_LAST).build(); + Assert.assertEquals( + "Sort order isn't correct.", + "time ASC NULLS FIRST, data ASC NULLS LAST", + Spark3Util.describe(multiOrder)); } @Test public void testDescribeSchema() { - Schema schema = new Schema( - required(1, "data", Types.ListType.ofRequired(2, Types.StringType.get())), - optional(3, "pairs", Types.MapType.ofOptional(4, 5, Types.StringType.get(), Types.LongType.get())), - required(6, "time", Types.TimestampType.withoutZone()) - ); + Schema schema = + new Schema( + required(1, "data", Types.ListType.ofRequired(2, Types.StringType.get())), + optional( + 3, + "pairs", + Types.MapType.ofOptional(4, 5, Types.StringType.get(), Types.LongType.get())), + required(6, "time", Types.TimestampType.withoutZone())); - Assert.assertEquals("Schema description isn't correct.", + Assert.assertEquals( + "Schema description isn't correct.", "struct not null,pairs: map,time: timestamp not null>", Spark3Util.describe(schema)); } @@ -93,15 +112,20 @@ public void testLoadIcebergTable() throws Exception { } private SortOrder buildSortOrder(String transform, Schema schema, int sourceId) { - String jsonString = "{\n" + - " \"order-id\" : 10,\n" + - " \"fields\" : [ {\n" + - " \"transform\" : \"" + transform + "\",\n" + - " \"source-id\" : " + sourceId + ",\n" + - " \"direction\" : \"desc\",\n" + - " \"null-order\" : \"nulls-first\"\n" + - " } ]\n" + - "}"; + String jsonString = + "{\n" + + " \"order-id\" : 10,\n" + + " \"fields\" : [ {\n" + + " \"transform\" : \"" + + transform + + "\",\n" + + " \"source-id\" : " + + sourceId + + ",\n" + + " \"direction\" : \"desc\",\n" + + " \"null-order\" : \"nulls-first\"\n" + + " } ]\n" + + "}"; return SortOrderParser.fromJson(schema, jsonString); } diff --git a/spark/v3.1/spark/src/test/java/org/apache/iceberg/spark/TestSparkCatalogOperations.java b/spark/v3.1/spark/src/test/java/org/apache/iceberg/spark/TestSparkCatalogOperations.java index 5c9f3c4cb189..0836271a7c22 100644 --- a/spark/v3.1/spark/src/test/java/org/apache/iceberg/spark/TestSparkCatalogOperations.java +++ b/spark/v3.1/spark/src/test/java/org/apache/iceberg/spark/TestSparkCatalogOperations.java @@ -16,7 +16,6 @@ 
* specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark; import java.util.Map; diff --git a/spark/v3.1/spark/src/test/java/org/apache/iceberg/spark/TestSparkFilters.java b/spark/v3.1/spark/src/test/java/org/apache/iceberg/spark/TestSparkFilters.java index abda21198360..b3f9df10b698 100644 --- a/spark/v3.1/spark/src/test/java/org/apache/iceberg/spark/TestSparkFilters.java +++ b/spark/v3.1/spark/src/test/java/org/apache/iceberg/spark/TestSparkFilters.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark; import java.sql.Date; @@ -42,10 +41,14 @@ public void testTimestampFilterConversion() { Expression timestampExpression = SparkFilters.convert(GreaterThan.apply("x", timestamp)); Expression rawExpression = Expressions.greaterThan("x", epochMicros); - Assert.assertEquals("Generated Timestamp expression should be correct", - rawExpression.toString(), timestampExpression.toString()); - Assert.assertEquals("Generated Instant expression should be correct", - rawExpression.toString(), instantExpression.toString()); + Assert.assertEquals( + "Generated Timestamp expression should be correct", + rawExpression.toString(), + timestampExpression.toString()); + Assert.assertEquals( + "Generated Instant expression should be correct", + rawExpression.toString(), + instantExpression.toString()); } @Test @@ -58,10 +61,14 @@ public void testDateFilterConversion() { Expression dateExpression = SparkFilters.convert(GreaterThan.apply("x", date)); Expression rawExpression = Expressions.greaterThan("x", epochDay); - Assert.assertEquals("Generated localdate expression should be correct", - rawExpression.toString(), localDateExpression.toString()); + Assert.assertEquals( + "Generated localdate expression should be correct", + rawExpression.toString(), + localDateExpression.toString()); - Assert.assertEquals("Generated date expression should be correct", - rawExpression.toString(), dateExpression.toString()); + Assert.assertEquals( + "Generated date expression should be correct", + rawExpression.toString(), + dateExpression.toString()); } } diff --git a/spark/v3.1/spark/src/test/java/org/apache/iceberg/spark/TestSparkSchemaUtil.java b/spark/v3.1/spark/src/test/java/org/apache/iceberg/spark/TestSparkSchemaUtil.java index 8bb32c969842..4e6331982d85 100644 --- a/spark/v3.1/spark/src/test/java/org/apache/iceberg/spark/TestSparkSchemaUtil.java +++ b/spark/v3.1/spark/src/test/java/org/apache/iceberg/spark/TestSparkSchemaUtil.java @@ -16,34 +16,33 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.spark; +import static org.apache.iceberg.types.Types.NestedField.optional; + import java.io.IOException; import org.apache.iceberg.Schema; import org.apache.iceberg.types.Types; import org.junit.Assert; import org.junit.Test; -import static org.apache.iceberg.types.Types.NestedField.optional; - public class TestSparkSchemaUtil { - private static final Schema TEST_SCHEMA = new Schema( - optional(1, "id", Types.IntegerType.get()), - optional(2, "data", Types.StringType.get()) - ); + private static final Schema TEST_SCHEMA = + new Schema( + optional(1, "id", Types.IntegerType.get()), optional(2, "data", Types.StringType.get())); @Test public void testEstiamteSizeMaxValue() throws IOException { - Assert.assertEquals("estimateSize returns Long max value", Long.MAX_VALUE, - SparkSchemaUtil.estimateSize( - null, - Long.MAX_VALUE)); + Assert.assertEquals( + "estimateSize returns Long max value", + Long.MAX_VALUE, + SparkSchemaUtil.estimateSize(null, Long.MAX_VALUE)); } @Test public void testEstiamteSizeWithOverflow() throws IOException { - long tableSize = SparkSchemaUtil.estimateSize(SparkSchemaUtil.convert(TEST_SCHEMA), Long.MAX_VALUE - 1); + long tableSize = + SparkSchemaUtil.estimateSize(SparkSchemaUtil.convert(TEST_SCHEMA), Long.MAX_VALUE - 1); Assert.assertEquals("estimateSize handles overflow", Long.MAX_VALUE, tableSize); } diff --git a/spark/v3.1/spark/src/test/java/org/apache/iceberg/spark/TestSparkTableUtil.java b/spark/v3.1/spark/src/test/java/org/apache/iceberg/spark/TestSparkTableUtil.java index 2b67fc922e0b..5c9baab4f89e 100644 --- a/spark/v3.1/spark/src/test/java/org/apache/iceberg/spark/TestSparkTableUtil.java +++ b/spark/v3.1/spark/src/test/java/org/apache/iceberg/spark/TestSparkTableUtil.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark; import java.io.IOException; diff --git a/spark/v3.1/spark/src/test/java/org/apache/iceberg/spark/TestSparkValueConverter.java b/spark/v3.1/spark/src/test/java/org/apache/iceberg/spark/TestSparkValueConverter.java index 57941b8c7940..7f00c7edd8a9 100644 --- a/spark/v3.1/spark/src/test/java/org/apache/iceberg/spark/TestSparkValueConverter.java +++ b/spark/v3.1/spark/src/test/java/org/apache/iceberg/spark/TestSparkValueConverter.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.spark; import org.apache.iceberg.Schema; @@ -31,51 +30,55 @@ public class TestSparkValueConverter { @Test public void testSparkNullMapConvert() { - Schema schema = new Schema( - Types.NestedField.required(0, "id", Types.LongType.get()), - Types.NestedField.optional(5, "locations", Types.MapType.ofOptional(6, 7, - Types.StringType.get(), - Types.StructType.of( - Types.NestedField.required(1, "lat", Types.FloatType.get()), - Types.NestedField.required(2, "long", Types.FloatType.get()) - ) - )) - ); + Schema schema = + new Schema( + Types.NestedField.required(0, "id", Types.LongType.get()), + Types.NestedField.optional( + 5, + "locations", + Types.MapType.ofOptional( + 6, + 7, + Types.StringType.get(), + Types.StructType.of( + Types.NestedField.required(1, "lat", Types.FloatType.get()), + Types.NestedField.required(2, "long", Types.FloatType.get()))))); assertCorrectNullConversion(schema); } @Test public void testSparkNullListConvert() { - Schema schema = new Schema( - Types.NestedField.required(0, "id", Types.LongType.get()), - Types.NestedField.optional(5, "locations", - Types.ListType.ofOptional(6, Types.StringType.get()) - ) - ); + Schema schema = + new Schema( + Types.NestedField.required(0, "id", Types.LongType.get()), + Types.NestedField.optional( + 5, "locations", Types.ListType.ofOptional(6, Types.StringType.get()))); assertCorrectNullConversion(schema); } @Test public void testSparkNullStructConvert() { - Schema schema = new Schema( - Types.NestedField.required(0, "id", Types.LongType.get()), - Types.NestedField.optional(5, "location", Types.StructType.of( - Types.NestedField.required(1, "lat", Types.FloatType.get()), - Types.NestedField.required(2, "long", Types.FloatType.get()) - )) - ); + Schema schema = + new Schema( + Types.NestedField.required(0, "id", Types.LongType.get()), + Types.NestedField.optional( + 5, + "location", + Types.StructType.of( + Types.NestedField.required(1, "lat", Types.FloatType.get()), + Types.NestedField.required(2, "long", Types.FloatType.get())))); assertCorrectNullConversion(schema); } @Test public void testSparkNullPrimitiveConvert() { - Schema schema = new Schema( - Types.NestedField.required(0, "id", Types.LongType.get()), - Types.NestedField.optional(5, "location", Types.StringType.get()) - ); + Schema schema = + new Schema( + Types.NestedField.required(0, "id", Types.LongType.get()), + Types.NestedField.optional(5, "location", Types.StringType.get())); assertCorrectNullConversion(schema); } @@ -83,7 +86,8 @@ private void assertCorrectNullConversion(Schema schema) { Row sparkRow = RowFactory.create(1, null); Record record = GenericRecord.create(schema); record.set(0, 1); - Assert.assertEquals("Round-trip conversion should produce original value", + Assert.assertEquals( + "Round-trip conversion should produce original value", record, SparkValueConverter.convert(schema, sparkRow)); } diff --git a/spark/v3.1/spark/src/test/java/org/apache/iceberg/spark/actions/TestCreateActions.java b/spark/v3.1/spark/src/test/java/org/apache/iceberg/spark/actions/TestCreateActions.java index 0a9ab58f5a56..544c730bfe89 100644 --- a/spark/v3.1/spark/src/test/java/org/apache/iceberg/spark/actions/TestCreateActions.java +++ b/spark/v3.1/spark/src/test/java/org/apache/iceberg/spark/actions/TestCreateActions.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.spark.actions; import java.io.File; @@ -69,45 +68,61 @@ import scala.collection.Seq; public class TestCreateActions extends SparkCatalogTestBase { - private static final String CREATE_PARTITIONED_PARQUET = "CREATE TABLE %s (id INT, data STRING) " + - "using parquet PARTITIONED BY (id) LOCATION '%s'"; - private static final String CREATE_PARQUET = "CREATE TABLE %s (id INT, data STRING) " + - "using parquet LOCATION '%s'"; - private static final String CREATE_HIVE_EXTERNAL_PARQUET = "CREATE EXTERNAL TABLE %s (data STRING) " + - "PARTITIONED BY (id INT) STORED AS parquet LOCATION '%s'"; - private static final String CREATE_HIVE_PARQUET = "CREATE TABLE %s (data STRING) " + - "PARTITIONED BY (id INT) STORED AS parquet"; + private static final String CREATE_PARTITIONED_PARQUET = + "CREATE TABLE %s (id INT, data STRING) " + "using parquet PARTITIONED BY (id) LOCATION '%s'"; + private static final String CREATE_PARQUET = + "CREATE TABLE %s (id INT, data STRING) " + "using parquet LOCATION '%s'"; + private static final String CREATE_HIVE_EXTERNAL_PARQUET = + "CREATE EXTERNAL TABLE %s (data STRING) " + + "PARTITIONED BY (id INT) STORED AS parquet LOCATION '%s'"; + private static final String CREATE_HIVE_PARQUET = + "CREATE TABLE %s (data STRING) " + "PARTITIONED BY (id INT) STORED AS parquet"; private static final String NAMESPACE = "default"; @Parameterized.Parameters(name = "Catalog Name {0} - Options {2}") public static Object[][] parameters() { return new Object[][] { - new Object[] {"spark_catalog", SparkSessionCatalog.class.getName(), ImmutableMap.of( + new Object[] { + "spark_catalog", + SparkSessionCatalog.class.getName(), + ImmutableMap.of( "type", "hive", "default-namespace", "default", "parquet-enabled", "true", - "cache-enabled", "false" // Spark will delete tables using v1, leaving the cache out of sync - )}, - new Object[] {"spark_catalog", SparkSessionCatalog.class.getName(), ImmutableMap.of( + "cache-enabled", + "false" // Spark will delete tables using v1, leaving the cache out of sync + ) + }, + new Object[] { + "spark_catalog", + SparkSessionCatalog.class.getName(), + ImmutableMap.of( "type", "hadoop", "default-namespace", "default", "parquet-enabled", "true", - "cache-enabled", "false" // Spark will delete tables using v1, leaving the cache out of sync - )}, - new Object[] { "testhive", SparkCatalog.class.getName(), ImmutableMap.of( + "cache-enabled", + "false" // Spark will delete tables using v1, leaving the cache out of sync + ) + }, + new Object[] { + "testhive", + SparkCatalog.class.getName(), + ImmutableMap.of( "type", "hive", - "default-namespace", "default" - )}, - new Object[] { "testhadoop", SparkCatalog.class.getName(), ImmutableMap.of( + "default-namespace", "default") + }, + new Object[] { + "testhadoop", + SparkCatalog.class.getName(), + ImmutableMap.of( "type", "hadoop", - "default-namespace", "default" - )} + "default-namespace", "default") + } }; } - @Rule - public TemporaryFolder temp = new TemporaryFolder(); + @Rule public TemporaryFolder temp = new TemporaryFolder(); private String baseTableName = "baseTable"; private File tableDir; @@ -115,10 +130,7 @@ public static Object[][] parameters() { private final String type; private final TableCatalog catalog; - public TestCreateActions( - String catalogName, - String implementation, - Map config) { + public TestCreateActions(String catalogName, String implementation, Map config) { super(catalogName, implementation, config); this.catalog = (TableCatalog) 
spark.sessionState().catalogManager().catalog(catalogName); this.type = config.get("type"); @@ -138,15 +150,15 @@ public void before() { spark.conf().set("spark.sql.parquet.writeLegacyFormat", false); spark.sql(String.format("DROP TABLE IF EXISTS %s", baseTableName)); - List expected = Lists.newArrayList( - new SimpleRecord(1, "a"), - new SimpleRecord(2, "b"), - new SimpleRecord(3, "c") - ); + List expected = + Lists.newArrayList( + new SimpleRecord(1, "a"), new SimpleRecord(2, "b"), new SimpleRecord(3, "c")); Dataset df = spark.createDataFrame(expected, SimpleRecord.class); - df.select("id", "data").orderBy("data").write() + df.select("id", "data") + .orderBy("data") + .write() .mode("append") .option("path", tableLocation) .saveAsTable(baseTableName); @@ -161,7 +173,8 @@ public void after() throws IOException { @Test public void testMigratePartitioned() throws Exception { Assume.assumeTrue("Cannot migrate to a hadoop based catalog", !type.equals("hadoop")); - Assume.assumeTrue("Can only migrate from Spark Session Catalog", catalog.name().equals("spark_catalog")); + Assume.assumeTrue( + "Can only migrate from Spark Session Catalog", catalog.name().equals("spark_catalog")); String source = sourceName("test_migrate_partitioned_table"); String dest = source; createSourceTable(CREATE_PARTITIONED_PARQUET, source); @@ -171,17 +184,20 @@ public void testMigratePartitioned() throws Exception { @Test public void testPartitionedTableWithUnRecoveredPartitions() throws Exception { Assume.assumeTrue("Cannot migrate to a hadoop based catalog", !type.equals("hadoop")); - Assume.assumeTrue("Can only migrate from Spark Session Catalog", catalog.name().equals("spark_catalog")); + Assume.assumeTrue( + "Can only migrate from Spark Session Catalog", catalog.name().equals("spark_catalog")); String source = sourceName("test_unrecovered_partitions"); String dest = source; File location = temp.newFolder(); sql(CREATE_PARTITIONED_PARQUET, source, location); // Data generation and partition addition - spark.range(5) + spark + .range(5) .selectExpr("id", "cast(id as STRING) as data") .write() - .partitionBy("id").mode(SaveMode.Overwrite) + .partitionBy("id") + .mode(SaveMode.Overwrite) .parquet(location.toURI().toString()); sql("ALTER TABLE %s ADD PARTITION(id=0)", source); @@ -191,7 +207,8 @@ public void testPartitionedTableWithUnRecoveredPartitions() throws Exception { @Test public void testPartitionedTableWithCustomPartitions() throws Exception { Assume.assumeTrue("Cannot migrate to a hadoop based catalog", !type.equals("hadoop")); - Assume.assumeTrue("Can only migrate from Spark Session Catalog", catalog.name().equals("spark_catalog")); + Assume.assumeTrue( + "Can only migrate from Spark Session Catalog", catalog.name().equals("spark_catalog")); String source = sourceName("test_custom_parts"); String dest = source; File tblLocation = temp.newFolder(); @@ -199,18 +216,23 @@ public void testPartitionedTableWithCustomPartitions() throws Exception { // Data generation and partition addition spark.sql(String.format(CREATE_PARTITIONED_PARQUET, source, tblLocation)); - spark.range(10) + spark + .range(10) .selectExpr("cast(id as STRING) as data") .write() - .mode(SaveMode.Overwrite).parquet(partitionDataLoc.toURI().toString()); - sql("ALTER TABLE %s ADD PARTITION(id=0) LOCATION '%s'", source, partitionDataLoc.toURI().toString()); + .mode(SaveMode.Overwrite) + .parquet(partitionDataLoc.toURI().toString()); + sql( + "ALTER TABLE %s ADD PARTITION(id=0) LOCATION '%s'", + source, partitionDataLoc.toURI().toString()); 
assertMigratedFileCount(SparkActions.get().migrateTable(source), source, dest); } @Test public void testAddColumnOnMigratedTableAtEnd() throws Exception { Assume.assumeTrue("Cannot migrate to a hadoop based catalog", !type.equals("hadoop")); - Assume.assumeTrue("Can only migrate from Spark Session Catalog", catalog.name().equals("spark_catalog")); + Assume.assumeTrue( + "Can only migrate from Spark Session Catalog", catalog.name().equals("spark_catalog")); String source = sourceName("test_add_column_migrated_table"); String dest = source; createSourceTable(CREATE_PARQUET, source); @@ -249,7 +271,8 @@ public void testAddColumnOnMigratedTableAtEnd() throws Exception { @Test public void testAddColumnOnMigratedTableAtMiddle() throws Exception { Assume.assumeTrue("Cannot migrate to a hadoop based catalog", !type.equals("hadoop")); - Assume.assumeTrue("Can only migrate from Spark Session Catalog", catalog.name().equals("spark_catalog")); + Assume.assumeTrue( + "Can only migrate from Spark Session Catalog", catalog.name().equals("spark_catalog")); String source = sourceName("test_add_column_migrated_table_middle"); String dest = source; createSourceTable(CREATE_PARQUET, source); @@ -263,7 +286,10 @@ public void testAddColumnOnMigratedTableAtMiddle() throws Exception { // test column addition on migrated table Schema beforeSchema = table.schema(); String newCol1 = "newCol"; - sparkTable.table().updateSchema().addColumn("newCol", Types.IntegerType.get()) + sparkTable + .table() + .updateSchema() + .addColumn("newCol", Types.IntegerType.get()) .moveAfter(newCol1, "id") .commit(); Schema afterSchema = table.schema(); @@ -279,16 +305,20 @@ public void testAddColumnOnMigratedTableAtMiddle() throws Exception { @Test public void removeColumnsAtEnd() throws Exception { Assume.assumeTrue("Cannot migrate to a hadoop based catalog", !type.equals("hadoop")); - Assume.assumeTrue("Can only migrate from Spark Session Catalog", catalog.name().equals("spark_catalog")); + Assume.assumeTrue( + "Can only migrate from Spark Session Catalog", catalog.name().equals("spark_catalog")); String source = sourceName("test_remove_column_migrated_table"); String dest = source; String colName1 = "newCol1"; String colName2 = "newCol2"; File location = temp.newFolder(); - spark.range(10).selectExpr("cast(id as INT)", "CAST(id as INT) " + colName1, "CAST(id as INT) " + colName2) + spark + .range(10) + .selectExpr("cast(id as INT)", "CAST(id as INT) " + colName1, "CAST(id as INT) " + colName2) .write() - .mode(SaveMode.Overwrite).saveAsTable(dest); + .mode(SaveMode.Overwrite) + .saveAsTable(dest); List expected1 = sql("select id, %s from %s order by id", colName1, source); List expected2 = sql("select id from %s order by id", source); @@ -322,13 +352,19 @@ public void removeColumnsAtEnd() throws Exception { @Test public void removeColumnFromMiddle() throws Exception { Assume.assumeTrue("Cannot migrate to a hadoop based catalog", !type.equals("hadoop")); - Assume.assumeTrue("Can only migrate from Spark Session Catalog", catalog.name().equals("spark_catalog")); + Assume.assumeTrue( + "Can only migrate from Spark Session Catalog", catalog.name().equals("spark_catalog")); String source = sourceName("test_remove_column_migrated_table_from_middle"); String dest = source; String dropColumnName = "col1"; - spark.range(10).selectExpr("cast(id as INT)", "CAST(id as INT) as " + - dropColumnName, "CAST(id as INT) as col2").write().mode(SaveMode.Overwrite).saveAsTable(dest); + spark + .range(10) + .selectExpr( + "cast(id as INT)", "CAST(id as 
INT) as " + dropColumnName, "CAST(id as INT) as col2") + .write() + .mode(SaveMode.Overwrite) + .saveAsTable(dest); List expected = sql("select id, col2 from %s order by id", source); // migrate table @@ -348,7 +384,8 @@ public void removeColumnFromMiddle() throws Exception { @Test public void testMigrateUnpartitioned() throws Exception { Assume.assumeTrue("Cannot migrate to a hadoop based catalog", !type.equals("hadoop")); - Assume.assumeTrue("Can only migrate from Spark Session Catalog", catalog.name().equals("spark_catalog")); + Assume.assumeTrue( + "Can only migrate from Spark Session Catalog", catalog.name().equals("spark_catalog")); String source = sourceName("test_migrate_unpartitioned_table"); String dest = source; createSourceTable(CREATE_PARQUET, source); @@ -357,40 +394,49 @@ public void testMigrateUnpartitioned() throws Exception { @Test public void testSnapshotPartitioned() throws Exception { - Assume.assumeTrue("Cannot snapshot with arbitrary location in a hadoop based catalog", + Assume.assumeTrue( + "Cannot snapshot with arbitrary location in a hadoop based catalog", !type.equals("hadoop")); File location = temp.newFolder(); String source = sourceName("test_snapshot_partitioned_table"); String dest = destName("iceberg_snapshot_partitioned"); createSourceTable(CREATE_PARTITIONED_PARQUET, source); assertSnapshotFileCount( - SparkActions.get().snapshotTable(source).as(dest).tableLocation(location.toString()), source, dest); + SparkActions.get().snapshotTable(source).as(dest).tableLocation(location.toString()), + source, + dest); assertIsolatedSnapshot(source, dest); } @Test public void testSnapshotUnpartitioned() throws Exception { - Assume.assumeTrue("Cannot snapshot with arbitrary location in a hadoop based catalog", + Assume.assumeTrue( + "Cannot snapshot with arbitrary location in a hadoop based catalog", !type.equals("hadoop")); File location = temp.newFolder(); String source = sourceName("test_snapshot_unpartitioned_table"); String dest = destName("iceberg_snapshot_unpartitioned"); createSourceTable(CREATE_PARQUET, source); assertSnapshotFileCount( - SparkActions.get().snapshotTable(source).as(dest).tableLocation(location.toString()), source, dest); + SparkActions.get().snapshotTable(source).as(dest).tableLocation(location.toString()), + source, + dest); assertIsolatedSnapshot(source, dest); } @Test public void testSnapshotHiveTable() throws Exception { - Assume.assumeTrue("Cannot snapshot with arbitrary location in a hadoop based catalog", + Assume.assumeTrue( + "Cannot snapshot with arbitrary location in a hadoop based catalog", !type.equals("hadoop")); File location = temp.newFolder(); String source = sourceName("snapshot_hive_table"); String dest = destName("iceberg_snapshot_hive_table"); createSourceTable(CREATE_HIVE_EXTERNAL_PARQUET, source); assertSnapshotFileCount( - SparkActions.get().snapshotTable(source).as(dest).tableLocation(location.toString()), source, dest); + SparkActions.get().snapshotTable(source).as(dest).tableLocation(location.toString()), + source, + dest); assertIsolatedSnapshot(source, dest); } @@ -411,7 +457,9 @@ public void testSnapshotManagedHiveTable() throws Exception { String dest = destName("iceberg_snapshot_managed_hive_table"); createSourceTable(CREATE_HIVE_PARQUET, source); assertSnapshotFileCount( - SparkActions.get().snapshotTable(source).as(dest).tableLocation(location.toString()), source, dest); + SparkActions.get().snapshotTable(source).as(dest).tableLocation(location.toString()), + source, + dest); assertIsolatedSnapshot(source, 
dest); } @@ -423,7 +471,9 @@ public void testMigrateManagedHiveTable() throws Exception { String dest = destName("iceberg_migrate_managed_hive_table"); createSourceTable(CREATE_HIVE_PARQUET, source); assertSnapshotFileCount( - SparkActions.get().snapshotTable(source).as(dest).tableLocation(location.toString()), source, dest); + SparkActions.get().snapshotTable(source).as(dest).tableLocation(location.toString()), + source, + dest); } @Test @@ -435,11 +485,15 @@ public void testProperties() throws Exception { props.put("note", "Jazz"); createSourceTable(CREATE_PARQUET, source); for (Map.Entry keyValue : props.entrySet()) { - spark.sql(String.format("ALTER TABLE %s SET TBLPROPERTIES (\"%s\" = \"%s\")", - source, keyValue.getKey(), keyValue.getValue())); + spark.sql( + String.format( + "ALTER TABLE %s SET TBLPROPERTIES (\"%s\" = \"%s\")", + source, keyValue.getKey(), keyValue.getValue())); } assertSnapshotFileCount( - SparkActions.get().snapshotTable(source).as(dest).tableProperty("dogs", "sundance"), source, dest); + SparkActions.get().snapshotTable(source).as(dest).tableProperty("dogs", "sundance"), + source, + dest); SparkTable table = loadTable(dest); Map expectedProps = Maps.newHashMap(); @@ -450,8 +504,10 @@ public void testProperties() throws Exception { Assert.assertTrue( "Created table missing property " + entry.getKey(), table.properties().containsKey(entry.getKey())); - Assert.assertEquals("Property value is not the expected value", - entry.getValue(), table.properties().get(entry.getKey())); + Assert.assertEquals( + "Property value is not the expected value", + entry.getValue(), + table.properties().get(entry.getKey())); } } @@ -469,14 +525,20 @@ public void testSparkTableReservedProperties() throws Exception { String[] keys = {"provider", "format", "current-snapshot-id", "location", "sort-order"}; for (String entry : keys) { - Assert.assertTrue("Created table missing reserved property " + entry, table.properties().containsKey(entry)); + Assert.assertTrue( + "Created table missing reserved property " + entry, + table.properties().containsKey(entry)); } Assert.assertEquals("Unexpected provider", "iceberg", table.properties().get("provider")); Assert.assertEquals("Unexpected format", "iceberg/parquet", table.properties().get("format")); - Assert.assertNotEquals("No current-snapshot-id found", "none", table.properties().get("current-snapshot-id")); - Assert.assertTrue("Location isn't correct", table.properties().get("location").endsWith(destTableName)); - Assert.assertEquals("Sort-order isn't correct", "id ASC NULLS FIRST, data DESC NULLS LAST", + Assert.assertNotEquals( + "No current-snapshot-id found", "none", table.properties().get("current-snapshot-id")); + Assert.assertTrue( + "Location isn't correct", table.properties().get("location").endsWith(destTableName)); + Assert.assertEquals( + "Sort-order isn't correct", + "id ASC NULLS FIRST, data DESC NULLS LAST", table.properties().get("sort-order")); } @@ -492,30 +554,37 @@ public void testSnapshotDefaultLocation() throws Exception { @Test public void schemaEvolutionTestWithSparkAPI() throws Exception { Assume.assumeTrue("Cannot migrate to a hadoop based catalog", !type.equals("hadoop")); - Assume.assumeTrue("Can only migrate from Spark Session Catalog", catalog.name().equals("spark_catalog")); + Assume.assumeTrue( + "Can only migrate from Spark Session Catalog", catalog.name().equals("spark_catalog")); File location = temp.newFolder(); String tblName = sourceName("schema_evolution_test"); // Data generation and partition addition - 
spark.range(0, 5) + spark + .range(0, 5) .selectExpr("CAST(id as INT) as col0", "CAST(id AS FLOAT) col2", "CAST(id AS LONG) col3") .write() .mode(SaveMode.Append) .parquet(location.toURI().toString()); - Dataset rowDataset = spark.range(6, 10) - .selectExpr("CAST(id as INT) as col0", "CAST(id AS STRING) col1", - "CAST(id AS FLOAT) col2", "CAST(id AS LONG) col3"); - rowDataset - .write() - .mode(SaveMode.Append) - .parquet(location.toURI().toString()); - spark.read() + Dataset rowDataset = + spark + .range(6, 10) + .selectExpr( + "CAST(id as INT) as col0", + "CAST(id AS STRING) col1", + "CAST(id AS FLOAT) col2", + "CAST(id AS LONG) col3"); + rowDataset.write().mode(SaveMode.Append).parquet(location.toURI().toString()); + spark + .read() .schema(rowDataset.schema()) - .parquet(location.toURI().toString()).write().saveAsTable(tblName); + .parquet(location.toURI().toString()) + .write() + .saveAsTable(tblName); List expectedBeforeAddColumn = sql("SELECT * FROM %s ORDER BY col0", tblName); - List expectedAfterAddColumn = sql("SELECT col0, null, col1, col2, col3 FROM %s ORDER BY col0", - tblName); + List expectedAfterAddColumn = + sql("SELECT col0, null, col1, col2, col3 FROM %s ORDER BY col0", tblName); // Migrate table SparkActions.get().migrateTable(tblName).execute(); @@ -526,7 +595,10 @@ public void schemaEvolutionTestWithSparkAPI() throws Exception { // Update schema and check output correctness SparkTable sparkTable = loadTable(tblName); - sparkTable.table().updateSchema().addColumn("newCol", Types.IntegerType.get()) + sparkTable + .table() + .updateSchema() + .addColumn("newCol", Types.IntegerType.get()) .moveAfter("newCol", "col0") .commit(); List afterMigarteAfterAddResults = sql("SELECT * FROM %s ORDER BY col0", tblName); @@ -536,23 +608,30 @@ public void schemaEvolutionTestWithSparkAPI() throws Exception { @Test public void schemaEvolutionTestWithSparkSQL() throws Exception { Assume.assumeTrue("Cannot migrate to a hadoop based catalog", !type.equals("hadoop")); - Assume.assumeTrue("Can only migrate from Spark Session Catalog", catalog.name().equals("spark_catalog")); + Assume.assumeTrue( + "Can only migrate from Spark Session Catalog", catalog.name().equals("spark_catalog")); String tblName = sourceName("schema_evolution_test_sql"); // Data generation and partition addition - spark.range(0, 5) + spark + .range(0, 5) .selectExpr("CAST(id as INT) col0", "CAST(id AS FLOAT) col1", "CAST(id AS STRING) col2") .write() .mode(SaveMode.Append) .saveAsTable(tblName); sql("ALTER TABLE %s ADD COLUMN col3 INT", tblName); - spark.range(6, 10) - .selectExpr("CAST(id AS INT) col0", "CAST(id AS FLOAT) col1", "CAST(id AS STRING) col2", "CAST(id AS INT) col3") + spark + .range(6, 10) + .selectExpr( + "CAST(id AS INT) col0", + "CAST(id AS FLOAT) col1", + "CAST(id AS STRING) col2", + "CAST(id AS INT) col3") .registerTempTable("tempdata"); sql("INSERT INTO TABLE %s SELECT * FROM tempdata", tblName); List expectedBeforeAddColumn = sql("SELECT * FROM %s ORDER BY col0", tblName); - List expectedAfterAddColumn = sql("SELECT col0, null, col1, col2, col3 FROM %s ORDER BY col0", - tblName); + List expectedAfterAddColumn = + sql("SELECT col0, null, col1, col2, col3 FROM %s ORDER BY col0", tblName); // Migrate table SparkActions.get().migrateTable(tblName).execute(); @@ -563,7 +642,10 @@ public void schemaEvolutionTestWithSparkSQL() throws Exception { // Update schema and check output correctness SparkTable sparkTable = loadTable(tblName); - sparkTable.table().updateSchema().addColumn("newCol", 
Types.IntegerType.get()) + sparkTable + .table() + .updateSchema() + .addColumn("newCol", Types.IntegerType.get()) .moveAfter("newCol", "col0") .commit(); List afterMigarteAfterAddResults = sql("SELECT * FROM %s ORDER BY col0", tblName); @@ -615,9 +697,9 @@ public void threeLevelList(boolean useLegacyMode) throws Exception { String tableName = sourceName(String.format("threeLevelList_%s", useLegacyMode)); File location = temp.newFolder(); - sql("CREATE TABLE %s (col1 ARRAY>)" + - " STORED AS parquet" + - " LOCATION '%s'", tableName, location); + sql( + "CREATE TABLE %s (col1 ARRAY>)" + " STORED AS parquet" + " LOCATION '%s'", + tableName, location); int testValue = 12345; sql("INSERT INTO %s VALUES (ARRAY(STRUCT(%s)))", tableName, testValue); @@ -635,11 +717,14 @@ public void threeLevelList(boolean useLegacyMode) throws Exception { private void threeLevelListWithNestedStruct(boolean useLegacyMode) throws Exception { spark.conf().set("spark.sql.parquet.writeLegacyFormat", useLegacyMode); - String tableName = sourceName(String.format("threeLevelListWithNestedStruct_%s", useLegacyMode)); + String tableName = + sourceName(String.format("threeLevelListWithNestedStruct_%s", useLegacyMode)); File location = temp.newFolder(); - sql("CREATE TABLE %s (col1 ARRAY>>)" + - " STORED AS parquet" + - " LOCATION '%s'", tableName, location); + sql( + "CREATE TABLE %s (col1 ARRAY>>)" + + " STORED AS parquet" + + " LOCATION '%s'", + tableName, location); int testValue = 12345; sql("INSERT INTO %s VALUES (ARRAY(STRUCT(STRUCT(%s))))", tableName, testValue); @@ -659,13 +744,16 @@ private void threeLevelLists(boolean useLegacyMode) throws Exception { String tableName = sourceName(String.format("threeLevelLists_%s", useLegacyMode)); File location = temp.newFolder(); - sql("CREATE TABLE %s (col1 ARRAY>, col3 ARRAY>)" + - " STORED AS parquet" + - " LOCATION '%s'", tableName, location); + sql( + "CREATE TABLE %s (col1 ARRAY>, col3 ARRAY>)" + + " STORED AS parquet" + + " LOCATION '%s'", + tableName, location); int testValue1 = 12345; int testValue2 = 987654; - sql("INSERT INTO %s VALUES (ARRAY(STRUCT(%s)), ARRAY(STRUCT(%s)))", + sql( + "INSERT INTO %s VALUES (ARRAY(STRUCT(%s)), ARRAY(STRUCT(%s)))", tableName, testValue1, testValue2); List expected = sql(String.format("SELECT * FROM %s", tableName)); @@ -683,13 +771,14 @@ private void structOfThreeLevelLists(boolean useLegacyMode) throws Exception { String tableName = sourceName(String.format("structOfThreeLevelLists_%s", useLegacyMode)); File location = temp.newFolder(); - sql("CREATE TABLE %s (col1 STRUCT>>)" + - " STORED AS parquet" + - " LOCATION '%s'", tableName, location); + sql( + "CREATE TABLE %s (col1 STRUCT>>)" + + " STORED AS parquet" + + " LOCATION '%s'", + tableName, location); int testValue1 = 12345; - sql("INSERT INTO %s VALUES (STRUCT(STRUCT(ARRAY(STRUCT(%s)))))", - tableName, testValue1); + sql("INSERT INTO %s VALUES (STRUCT(STRUCT(ARRAY(STRUCT(%s)))))", tableName, testValue1); List expected = sql(String.format("SELECT * FROM %s", tableName)); // migrate table @@ -701,16 +790,19 @@ private void structOfThreeLevelLists(boolean useLegacyMode) throws Exception { assertEquals("Output must match", expected, results); } - private SparkTable loadTable(String name) throws NoSuchTableException, ParseException { - return (SparkTable) catalog.loadTable(Spark3Util.catalogAndIdentifier(spark, name).identifier()); + return (SparkTable) + catalog.loadTable(Spark3Util.catalogAndIdentifier(spark, name).identifier()); } private CatalogTable loadSessionTable(String name) 
throws NoSuchTableException, NoSuchDatabaseException, ParseException { Identifier identifier = Spark3Util.catalogAndIdentifier(spark, name).identifier(); Some namespace = Some.apply(identifier.namespace()[0]); - return spark.sessionState().catalog().getTableMetadata(new TableIdentifier(identifier.name(), namespace)); + return spark + .sessionState() + .catalog() + .getTableMetadata(new TableIdentifier(identifier.name(), namespace)); } private void createSourceTable(String createStatement, String tableName) @@ -720,41 +812,57 @@ private void createSourceTable(String createStatement, String tableName) CatalogTable table = loadSessionTable(tableName); Seq partitionColumns = table.partitionColumnNames(); String format = table.provider().get(); - spark.table(baseTableName).write().mode(SaveMode.Append).format(format).partitionBy(partitionColumns) + spark + .table(baseTableName) + .write() + .mode(SaveMode.Append) + .format(format) + .partitionBy(partitionColumns) .saveAsTable(tableName); } - // Counts the number of files in the source table, makes sure the same files exist in the destination table + // Counts the number of files in the source table, makes sure the same files exist in the + // destination table private void assertMigratedFileCount(MigrateTable migrateAction, String source, String dest) throws NoSuchTableException, NoSuchDatabaseException, ParseException { long expectedFiles = expectedFilesCount(source); MigrateTable.Result migratedFiles = migrateAction.execute(); validateTables(source, dest); - Assert.assertEquals("Expected number of migrated files", - expectedFiles, migratedFiles.migratedDataFilesCount()); + Assert.assertEquals( + "Expected number of migrated files", expectedFiles, migratedFiles.migratedDataFilesCount()); } - // Counts the number of files in the source table, makes sure the same files exist in the destination table + // Counts the number of files in the source table, makes sure the same files exist in the + // destination table private void assertSnapshotFileCount(SnapshotTable snapshotTable, String source, String dest) throws NoSuchTableException, NoSuchDatabaseException, ParseException { long expectedFiles = expectedFilesCount(source); SnapshotTable.Result snapshotTableResult = snapshotTable.execute(); validateTables(source, dest); - Assert.assertEquals("Expected number of imported snapshot files", expectedFiles, + Assert.assertEquals( + "Expected number of imported snapshot files", + expectedFiles, snapshotTableResult.importedDataFilesCount()); } - private void validateTables(String source, String dest) throws NoSuchTableException, ParseException { + private void validateTables(String source, String dest) + throws NoSuchTableException, ParseException { List expected = spark.table(source).collectAsList(); SparkTable destTable = loadTable(dest); - Assert.assertEquals("Provider should be iceberg", "iceberg", + Assert.assertEquals( + "Provider should be iceberg", + "iceberg", destTable.properties().get(TableCatalog.PROP_PROVIDER)); List actual = spark.table(dest).collectAsList(); - Assert.assertTrue(String.format("Rows in migrated table did not match\nExpected :%s rows \nFound :%s", - expected, actual), expected.containsAll(actual) && actual.containsAll(expected)); + Assert.assertTrue( + String.format( + "Rows in migrated table did not match\nExpected :%s rows \nFound :%s", + expected, actual), + expected.containsAll(actual) && actual.containsAll(expected)); } - private long expectedFilesCount(String source) throws NoSuchDatabaseException, NoSuchTableException, 
ParseException { + private long expectedFilesCount(String source) + throws NoSuchDatabaseException, NoSuchTableException, ParseException { CatalogTable sourceTable = loadSessionTable(source); List uris; if (sourceTable.partitionColumnNames().size() == 0) { @@ -762,34 +870,42 @@ private long expectedFilesCount(String source) throws NoSuchDatabaseException, N uris.add(sourceTable.location()); } else { Seq catalogTablePartitionSeq = - spark.sessionState().catalog().listPartitions(sourceTable.identifier(), Option.apply(null)); - uris = JavaConverters.seqAsJavaList(catalogTablePartitionSeq) - .stream() - .map(CatalogTablePartition::location) - .collect(Collectors.toList()); + spark + .sessionState() + .catalog() + .listPartitions(sourceTable.identifier(), Option.apply(null)); + uris = + JavaConverters.seqAsJavaList(catalogTablePartitionSeq).stream() + .map(CatalogTablePartition::location) + .collect(Collectors.toList()); } return uris.stream() - .flatMap(uri -> - FileUtils.listFiles(Paths.get(uri).toFile(), - TrueFileFilter.INSTANCE, TrueFileFilter.INSTANCE).stream()) - .filter(file -> !file.toString().endsWith("crc") && !file.toString().contains("_SUCCESS")).count(); + .flatMap( + uri -> + FileUtils.listFiles( + Paths.get(uri).toFile(), TrueFileFilter.INSTANCE, TrueFileFilter.INSTANCE) + .stream()) + .filter(file -> !file.toString().endsWith("crc") && !file.toString().contains("_SUCCESS")) + .count(); } - // Insert records into the destination, makes sure those records exist and source table is unchanged + // Insert records into the destination, makes sure those records exist and source table is + // unchanged private void assertIsolatedSnapshot(String source, String dest) { List expected = spark.sql(String.format("SELECT * FROM %s", source)).collectAsList(); - List extraData = Lists.newArrayList( - new SimpleRecord(4, "d") - ); + List extraData = Lists.newArrayList(new SimpleRecord(4, "d")); Dataset df = spark.createDataFrame(extraData, SimpleRecord.class); df.write().format("iceberg").mode("append").saveAsTable(dest); List result = spark.sql(String.format("SELECT * FROM %s", source)).collectAsList(); - Assert.assertEquals("No additional rows should be added to the original table", expected.size(), - result.size()); + Assert.assertEquals( + "No additional rows should be added to the original table", expected.size(), result.size()); - List snapshot = spark.sql(String.format("SELECT * FROM %s WHERE id = 4 AND data = 'd'", dest)).collectAsList(); + List snapshot = + spark + .sql(String.format("SELECT * FROM %s WHERE id = 4 AND data = 'd'", dest)) + .collectAsList(); Assert.assertEquals("Added row not found in snapshot", 1, snapshot.size()); } diff --git a/spark/v3.1/spark/src/test/java/org/apache/iceberg/spark/actions/TestDeleteReachableFilesAction.java b/spark/v3.1/spark/src/test/java/org/apache/iceberg/spark/actions/TestDeleteReachableFilesAction.java index f4b296562c94..7124c51ddd3d 100644 --- a/spark/v3.1/spark/src/test/java/org/apache/iceberg/spark/actions/TestDeleteReachableFilesAction.java +++ b/spark/v3.1/spark/src/test/java/org/apache/iceberg/spark/actions/TestDeleteReachableFilesAction.java @@ -16,9 +16,10 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.spark.actions; +import static org.apache.iceberg.types.Types.NestedField.optional; + import java.io.File; import java.util.Set; import java.util.concurrent.ConcurrentHashMap; @@ -53,46 +54,47 @@ import org.junit.Test; import org.junit.rules.TemporaryFolder; -import static org.apache.iceberg.types.Types.NestedField.optional; - public class TestDeleteReachableFilesAction extends SparkTestBase { private static final HadoopTables TABLES = new HadoopTables(new Configuration()); - private static final Schema SCHEMA = new Schema( - optional(1, "c1", Types.IntegerType.get()), - optional(2, "c2", Types.StringType.get()), - optional(3, "c3", Types.StringType.get()) - ); + private static final Schema SCHEMA = + new Schema( + optional(1, "c1", Types.IntegerType.get()), + optional(2, "c2", Types.StringType.get()), + optional(3, "c3", Types.StringType.get())); private static final int SHUFFLE_PARTITIONS = 2; private static final PartitionSpec SPEC = PartitionSpec.builderFor(SCHEMA).identity("c1").build(); - static final DataFile FILE_A = DataFiles.builder(SPEC) - .withPath("/path/to/data-a.parquet") - .withFileSizeInBytes(10) - .withPartition(TestHelpers.Row.of(0)) - .withRecordCount(1) - .build(); - static final DataFile FILE_B = DataFiles.builder(SPEC) - .withPath("/path/to/data-b.parquet") - .withFileSizeInBytes(10) - .withPartition(TestHelpers.Row.of(1)) - .withRecordCount(1) - .build(); - static final DataFile FILE_C = DataFiles.builder(SPEC) - .withPath("/path/to/data-c.parquet") - .withFileSizeInBytes(10) - .withPartition(TestHelpers.Row.of(2)) - .withRecordCount(1) - .build(); - static final DataFile FILE_D = DataFiles.builder(SPEC) - .withPath("/path/to/data-d.parquet") - .withFileSizeInBytes(10) - .withPartition(TestHelpers.Row.of(3)) - .withRecordCount(1) - .build(); - - @Rule - public TemporaryFolder temp = new TemporaryFolder(); + static final DataFile FILE_A = + DataFiles.builder(SPEC) + .withPath("/path/to/data-a.parquet") + .withFileSizeInBytes(10) + .withPartition(TestHelpers.Row.of(0)) + .withRecordCount(1) + .build(); + static final DataFile FILE_B = + DataFiles.builder(SPEC) + .withPath("/path/to/data-b.parquet") + .withFileSizeInBytes(10) + .withPartition(TestHelpers.Row.of(1)) + .withRecordCount(1) + .build(); + static final DataFile FILE_C = + DataFiles.builder(SPEC) + .withPath("/path/to/data-c.parquet") + .withFileSizeInBytes(10) + .withPartition(TestHelpers.Row.of(2)) + .withRecordCount(1) + .build(); + static final DataFile FILE_D = + DataFiles.builder(SPEC) + .withPath("/path/to/data-d.parquet") + .withFileSizeInBytes(10) + .withPartition(TestHelpers.Row.of(3)) + .withRecordCount(1) + .build(); + + @Rule public TemporaryFolder temp = new TemporaryFolder(); private Table table; @@ -104,62 +106,76 @@ public void setupTableLocation() throws Exception { spark.conf().set("spark.sql.shuffle.partitions", SHUFFLE_PARTITIONS); } - private void checkRemoveFilesResults(long expectedDatafiles, long expectedManifestsDeleted, - long expectedManifestListsDeleted, long expectedOtherFilesDeleted, - DeleteReachableFiles.Result results) { - Assert.assertEquals("Incorrect number of manifest files deleted", - expectedManifestsDeleted, results.deletedManifestsCount()); - Assert.assertEquals("Incorrect number of datafiles deleted", - expectedDatafiles, results.deletedDataFilesCount()); - Assert.assertEquals("Incorrect number of manifest lists deleted", - expectedManifestListsDeleted, results.deletedManifestListsCount()); - Assert.assertEquals("Incorrect number of 
other lists deleted", - expectedOtherFilesDeleted, results.deletedOtherFilesCount()); + private void checkRemoveFilesResults( + long expectedDatafiles, + long expectedManifestsDeleted, + long expectedManifestListsDeleted, + long expectedOtherFilesDeleted, + DeleteReachableFiles.Result results) { + Assert.assertEquals( + "Incorrect number of manifest files deleted", + expectedManifestsDeleted, + results.deletedManifestsCount()); + Assert.assertEquals( + "Incorrect number of datafiles deleted", + expectedDatafiles, + results.deletedDataFilesCount()); + Assert.assertEquals( + "Incorrect number of manifest lists deleted", + expectedManifestListsDeleted, + results.deletedManifestListsCount()); + Assert.assertEquals( + "Incorrect number of other lists deleted", + expectedOtherFilesDeleted, + results.deletedOtherFilesCount()); } @Test public void dataFilesCleanupWithParallelTasks() { - table.newFastAppend() - .appendFile(FILE_A) - .commit(); + table.newFastAppend().appendFile(FILE_A).commit(); - table.newFastAppend() - .appendFile(FILE_B) - .commit(); + table.newFastAppend().appendFile(FILE_B).commit(); - table.newRewrite() - .rewriteFiles(ImmutableSet.of(FILE_B), ImmutableSet.of(FILE_D)) - .commit(); + table.newRewrite().rewriteFiles(ImmutableSet.of(FILE_B), ImmutableSet.of(FILE_D)).commit(); - table.newRewrite() - .rewriteFiles(ImmutableSet.of(FILE_A), ImmutableSet.of(FILE_C)) - .commit(); + table.newRewrite().rewriteFiles(ImmutableSet.of(FILE_A), ImmutableSet.of(FILE_C)).commit(); Set deletedFiles = ConcurrentHashMap.newKeySet(); Set deleteThreads = ConcurrentHashMap.newKeySet(); AtomicInteger deleteThreadsIndex = new AtomicInteger(0); - DeleteReachableFiles.Result result = sparkActions().deleteReachableFiles(metadataLocation(table)) - .io(table.io()) - .executeDeleteWith(Executors.newFixedThreadPool(4, runnable -> { - Thread thread = new Thread(runnable); - thread.setName("remove-files-" + deleteThreadsIndex.getAndIncrement()); - thread.setDaemon(true); // daemon threads will be terminated abruptly when the JVM exits - return thread; - })) - .deleteWith(s -> { - deleteThreads.add(Thread.currentThread().getName()); - deletedFiles.add(s); - }) - .execute(); - - // Verifies that the delete methods ran in the threads created by the provided ExecutorService ThreadFactory - Assert.assertEquals(deleteThreads, + DeleteReachableFiles.Result result = + sparkActions() + .deleteReachableFiles(metadataLocation(table)) + .io(table.io()) + .executeDeleteWith( + Executors.newFixedThreadPool( + 4, + runnable -> { + Thread thread = new Thread(runnable); + thread.setName("remove-files-" + deleteThreadsIndex.getAndIncrement()); + thread.setDaemon( + true); // daemon threads will be terminated abruptly when the JVM exits + return thread; + })) + .deleteWith( + s -> { + deleteThreads.add(Thread.currentThread().getName()); + deletedFiles.add(s); + }) + .execute(); + + // Verifies that the delete methods ran in the threads created by the provided ExecutorService + // ThreadFactory + Assert.assertEquals( + deleteThreads, Sets.newHashSet("remove-files-0", "remove-files-1", "remove-files-2", "remove-files-3")); - Lists.newArrayList(FILE_A, FILE_B, FILE_C, FILE_D).forEach(file -> - Assert.assertTrue("FILE_A should be deleted", deletedFiles.contains(FILE_A.path().toString())) - ); + Lists.newArrayList(FILE_A, FILE_B, FILE_C, FILE_D) + .forEach( + file -> + Assert.assertTrue( + "FILE_A should be deleted", deletedFiles.contains(FILE_A.path().toString()))); checkRemoveFilesResults(4L, 6L, 4L, 6, result); } @@ -167,64 
+183,43 @@ public void dataFilesCleanupWithParallelTasks() { public void testWithExpiringDanglingStageCommit() { table.location(); // `A` commit - table.newAppend() - .appendFile(FILE_A) - .commit(); + table.newAppend().appendFile(FILE_A).commit(); // `B` staged commit - table.newAppend() - .appendFile(FILE_B) - .stageOnly() - .commit(); + table.newAppend().appendFile(FILE_B).stageOnly().commit(); // `C` commit - table.newAppend() - .appendFile(FILE_C) - .commit(); + table.newAppend().appendFile(FILE_C).commit(); - DeleteReachableFiles.Result result = sparkActions().deleteReachableFiles(metadataLocation(table)) - .io(table.io()) - .execute(); + DeleteReachableFiles.Result result = + sparkActions().deleteReachableFiles(metadataLocation(table)).io(table.io()).execute(); checkRemoveFilesResults(3L, 3L, 3L, 5, result); } @Test public void testRemoveFileActionOnEmptyTable() { - DeleteReachableFiles.Result result = sparkActions().deleteReachableFiles(metadataLocation(table)) - .io(table.io()) - .execute(); + DeleteReachableFiles.Result result = + sparkActions().deleteReachableFiles(metadataLocation(table)).io(table.io()).execute(); checkRemoveFilesResults(0, 0, 0, 2, result); } @Test public void testRemoveFilesActionWithReducedVersionsTable() { - table.updateProperties() - .set(TableProperties.METADATA_PREVIOUS_VERSIONS_MAX, "2").commit(); - table.newAppend() - .appendFile(FILE_A) - .commit(); - - table.newAppend() - .appendFile(FILE_B) - .commit(); - - table.newAppend() - .appendFile(FILE_B) - .commit(); - - table.newAppend() - .appendFile(FILE_C) - .commit(); - - table.newAppend() - .appendFile(FILE_D) - .commit(); - - DeleteReachableFiles baseRemoveFilesSparkAction = sparkActions() - .deleteReachableFiles(metadataLocation(table)) - .io(table.io()); + table.updateProperties().set(TableProperties.METADATA_PREVIOUS_VERSIONS_MAX, "2").commit(); + table.newAppend().appendFile(FILE_A).commit(); + + table.newAppend().appendFile(FILE_B).commit(); + + table.newAppend().appendFile(FILE_B).commit(); + + table.newAppend().appendFile(FILE_C).commit(); + + table.newAppend().appendFile(FILE_D).commit(); + + DeleteReachableFiles baseRemoveFilesSparkAction = + sparkActions().deleteReachableFiles(metadataLocation(table)).io(table.io()); DeleteReachableFiles.Result result = baseRemoveFilesSparkAction.execute(); checkRemoveFilesResults(4, 5, 5, 8, result); @@ -232,57 +227,44 @@ public void testRemoveFilesActionWithReducedVersionsTable() { @Test public void testRemoveFilesAction() { - table.newAppend() - .appendFile(FILE_A) - .commit(); - - table.newAppend() - .appendFile(FILE_B) - .commit(); - - DeleteReachableFiles baseRemoveFilesSparkAction = sparkActions() - .deleteReachableFiles(metadataLocation(table)) - .io(table.io()); - checkRemoveFilesResults(2, 2, 2, 4, baseRemoveFilesSparkAction.execute()); + table.newAppend().appendFile(FILE_A).commit(); + + table.newAppend().appendFile(FILE_B).commit(); + + DeleteReachableFiles baseRemoveFilesSparkAction = + sparkActions().deleteReachableFiles(metadataLocation(table)).io(table.io()); + checkRemoveFilesResults(2, 2, 2, 4, baseRemoveFilesSparkAction.execute()); } @Test public void testRemoveFilesActionWithDefaultIO() { - table.newAppend() - .appendFile(FILE_A) - .commit(); + table.newAppend().appendFile(FILE_A).commit(); - table.newAppend() - .appendFile(FILE_B) - .commit(); + table.newAppend().appendFile(FILE_B).commit(); // IO not set explicitly on removeReachableFiles action // IO defaults to HadoopFileIO - DeleteReachableFiles baseRemoveFilesSparkAction = 
sparkActions() - .deleteReachableFiles(metadataLocation(table)); - checkRemoveFilesResults(2, 2, 2, 4, baseRemoveFilesSparkAction.execute()); + DeleteReachableFiles baseRemoveFilesSparkAction = + sparkActions().deleteReachableFiles(metadataLocation(table)); + checkRemoveFilesResults(2, 2, 2, 4, baseRemoveFilesSparkAction.execute()); } @Test public void testUseLocalIterator() { - table.newFastAppend() - .appendFile(FILE_A) - .commit(); + table.newFastAppend().appendFile(FILE_A).commit(); - table.newOverwrite() - .deleteFile(FILE_A) - .addFile(FILE_B) - .commit(); + table.newOverwrite().deleteFile(FILE_A).addFile(FILE_B).commit(); - table.newFastAppend() - .appendFile(FILE_C) - .commit(); + table.newFastAppend().appendFile(FILE_C).commit(); int jobsBefore = spark.sparkContext().dagScheduler().nextJobId().get(); - DeleteReachableFiles.Result results = sparkActions().deleteReachableFiles(metadataLocation(table)) - .io(table.io()) - .option("stream-results", "true").execute(); + DeleteReachableFiles.Result results = + sparkActions() + .deleteReachableFiles(metadataLocation(table)) + .io(table.io()) + .option("stream-results", "true") + .execute(); int jobsAfter = spark.sparkContext().dagScheduler().nextJobId().get(); int totalJobsRun = jobsAfter - jobsBefore; @@ -290,52 +272,52 @@ public void testUseLocalIterator() { checkRemoveFilesResults(3L, 4L, 3L, 5, results); Assert.assertEquals( - "Expected total jobs to be equal to total number of shuffle partitions", totalJobsRun, SHUFFLE_PARTITIONS); + "Expected total jobs to be equal to total number of shuffle partitions", + totalJobsRun, + SHUFFLE_PARTITIONS); } @Test public void testIgnoreMetadataFilesNotFound() { - table.updateProperties() - .set(TableProperties.METADATA_PREVIOUS_VERSIONS_MAX, "1").commit(); + table.updateProperties().set(TableProperties.METADATA_PREVIOUS_VERSIONS_MAX, "1").commit(); - table.newAppend() - .appendFile(FILE_A) - .commit(); + table.newAppend().appendFile(FILE_A).commit(); // There are three metadata json files at this point DeleteOrphanFiles.Result result = sparkActions().deleteOrphanFiles(table).olderThan(System.currentTimeMillis()).execute(); Assert.assertEquals("Should delete 1 file", 1, Iterables.size(result.orphanFileLocations())); - Assert.assertTrue("Should remove v1 file", + Assert.assertTrue( + "Should remove v1 file", StreamSupport.stream(result.orphanFileLocations().spliterator(), false) .anyMatch(file -> file.contains("v1.metadata.json"))); - DeleteReachableFiles baseRemoveFilesSparkAction = sparkActions() - .deleteReachableFiles(metadataLocation(table)) - .io(table.io()); + DeleteReachableFiles baseRemoveFilesSparkAction = + sparkActions().deleteReachableFiles(metadataLocation(table)).io(table.io()); DeleteReachableFiles.Result res = baseRemoveFilesSparkAction.execute(); - checkRemoveFilesResults(1, 1, 1, 4, res); + checkRemoveFilesResults(1, 1, 1, 4, res); } @Test public void testEmptyIOThrowsException() { - DeleteReachableFiles baseRemoveFilesSparkAction = sparkActions() - .deleteReachableFiles(metadataLocation(table)) - .io(null); - AssertHelpers.assertThrows("FileIO needs to be set to use RemoveFiles action", - IllegalArgumentException.class, "File IO cannot be null", + DeleteReachableFiles baseRemoveFilesSparkAction = + sparkActions().deleteReachableFiles(metadataLocation(table)).io(null); + AssertHelpers.assertThrows( + "FileIO needs to be set to use RemoveFiles action", + IllegalArgumentException.class, + "File IO cannot be null", baseRemoveFilesSparkAction::execute); } @Test public void 
testRemoveFilesActionWhenGarbageCollectionDisabled() { - table.updateProperties() - .set(TableProperties.GC_ENABLED, "false") - .commit(); + table.updateProperties().set(TableProperties.GC_ENABLED, "false").commit(); - AssertHelpers.assertThrows("Should complain about removing files when GC is disabled", - ValidationException.class, "Cannot remove files: GC is disabled (deleting files may corrupt other tables)", + AssertHelpers.assertThrows( + "Should complain about removing files when GC is disabled", + ValidationException.class, + "Cannot remove files: GC is disabled (deleting files may corrupt other tables)", () -> sparkActions().deleteReachableFiles(metadataLocation(table))); } diff --git a/spark/v3.1/spark/src/test/java/org/apache/iceberg/spark/actions/TestExpireSnapshotsAction.java b/spark/v3.1/spark/src/test/java/org/apache/iceberg/spark/actions/TestExpireSnapshotsAction.java index d411abdb8e26..faa2e2d1b80a 100644 --- a/spark/v3.1/spark/src/test/java/org/apache/iceberg/spark/actions/TestExpireSnapshotsAction.java +++ b/spark/v3.1/spark/src/test/java/org/apache/iceberg/spark/actions/TestExpireSnapshotsAction.java @@ -16,9 +16,10 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.actions; +import static org.apache.iceberg.types.Types.NestedField.optional; + import java.io.File; import java.io.IOException; import java.util.List; @@ -57,46 +58,47 @@ import org.junit.Test; import org.junit.rules.TemporaryFolder; -import static org.apache.iceberg.types.Types.NestedField.optional; - public class TestExpireSnapshotsAction extends SparkTestBase { private static final HadoopTables TABLES = new HadoopTables(new Configuration()); - private static final Schema SCHEMA = new Schema( - optional(1, "c1", Types.IntegerType.get()), - optional(2, "c2", Types.StringType.get()), - optional(3, "c3", Types.StringType.get()) - ); + private static final Schema SCHEMA = + new Schema( + optional(1, "c1", Types.IntegerType.get()), + optional(2, "c2", Types.StringType.get()), + optional(3, "c3", Types.StringType.get())); private static final int SHUFFLE_PARTITIONS = 2; private static final PartitionSpec SPEC = PartitionSpec.builderFor(SCHEMA).identity("c1").build(); - static final DataFile FILE_A = DataFiles.builder(SPEC) - .withPath("/path/to/data-a.parquet") - .withFileSizeInBytes(10) - .withPartitionPath("c1=0") // easy way to set partition data for now - .withRecordCount(1) - .build(); - static final DataFile FILE_B = DataFiles.builder(SPEC) - .withPath("/path/to/data-b.parquet") - .withFileSizeInBytes(10) - .withPartitionPath("c1=1") // easy way to set partition data for now - .withRecordCount(1) - .build(); - static final DataFile FILE_C = DataFiles.builder(SPEC) - .withPath("/path/to/data-c.parquet") - .withFileSizeInBytes(10) - .withPartitionPath("c1=2") // easy way to set partition data for now - .withRecordCount(1) - .build(); - static final DataFile FILE_D = DataFiles.builder(SPEC) - .withPath("/path/to/data-d.parquet") - .withFileSizeInBytes(10) - .withPartitionPath("c1=3") // easy way to set partition data for now - .withRecordCount(1) - .build(); - - @Rule - public TemporaryFolder temp = new TemporaryFolder(); + static final DataFile FILE_A = + DataFiles.builder(SPEC) + .withPath("/path/to/data-a.parquet") + .withFileSizeInBytes(10) + .withPartitionPath("c1=0") // easy way to set partition data for now + .withRecordCount(1) + .build(); + static final DataFile FILE_B = + DataFiles.builder(SPEC) + 
.withPath("/path/to/data-b.parquet") + .withFileSizeInBytes(10) + .withPartitionPath("c1=1") // easy way to set partition data for now + .withRecordCount(1) + .build(); + static final DataFile FILE_C = + DataFiles.builder(SPEC) + .withPath("/path/to/data-c.parquet") + .withFileSizeInBytes(10) + .withPartitionPath("c1=2") // easy way to set partition data for now + .withRecordCount(1) + .build(); + static final DataFile FILE_D = + DataFiles.builder(SPEC) + .withPath("/path/to/data-d.parquet") + .withFileSizeInBytes(10) + .withPartitionPath("c1=3") // easy way to set partition data for now + .withRecordCount(1) + .build(); + + @Rule public TemporaryFolder temp = new TemporaryFolder(); private File tableDir; private String tableLocation; @@ -122,37 +124,41 @@ private Long rightAfterSnapshot(long snapshotId) { return end; } - private void checkExpirationResults(long expectedDatafiles, long expectedManifestsDeleted, - long expectedManifestListsDeleted, ExpireSnapshots.Result results) { - - Assert.assertEquals("Incorrect number of manifest files deleted", - expectedManifestsDeleted, results.deletedManifestsCount()); - Assert.assertEquals("Incorrect number of datafiles deleted", - expectedDatafiles, results.deletedDataFilesCount()); - Assert.assertEquals("Incorrect number of manifest lists deleted", - expectedManifestListsDeleted, results.deletedManifestListsCount()); + private void checkExpirationResults( + long expectedDatafiles, + long expectedManifestsDeleted, + long expectedManifestListsDeleted, + ExpireSnapshots.Result results) { + + Assert.assertEquals( + "Incorrect number of manifest files deleted", + expectedManifestsDeleted, + results.deletedManifestsCount()); + Assert.assertEquals( + "Incorrect number of datafiles deleted", + expectedDatafiles, + results.deletedDataFilesCount()); + Assert.assertEquals( + "Incorrect number of manifest lists deleted", + expectedManifestListsDeleted, + results.deletedManifestListsCount()); } @Test public void testFilesCleaned() throws Exception { - table.newFastAppend() - .appendFile(FILE_A) - .commit(); + table.newFastAppend().appendFile(FILE_A).commit(); - table.newOverwrite() - .deleteFile(FILE_A) - .addFile(FILE_B) - .commit(); + table.newOverwrite().deleteFile(FILE_A).addFile(FILE_B).commit(); - table.newFastAppend() - .appendFile(FILE_C) - .commit(); + table.newFastAppend().appendFile(FILE_C).commit(); long end = rightAfterSnapshot(); - ExpireSnapshots.Result results = SparkActions.get().expireSnapshots(table).expireOlderThan(end).execute(); + ExpireSnapshots.Result results = + SparkActions.get().expireSnapshots(table).expireOlderThan(end).execute(); - Assert.assertEquals("Table does not have 1 snapshot after expiration", 1, Iterables.size(table.snapshots())); + Assert.assertEquals( + "Table does not have 1 snapshot after expiration", 1, Iterables.size(table.snapshots())); checkExpirationResults(1L, 1L, 2L, results); } @@ -160,21 +166,13 @@ public void testFilesCleaned() throws Exception { @Test public void dataFilesCleanupWithParallelTasks() throws IOException { - table.newFastAppend() - .appendFile(FILE_A) - .commit(); + table.newFastAppend().appendFile(FILE_A).commit(); - table.newFastAppend() - .appendFile(FILE_B) - .commit(); + table.newFastAppend().appendFile(FILE_B).commit(); - table.newRewrite() - .rewriteFiles(ImmutableSet.of(FILE_B), ImmutableSet.of(FILE_D)) - .commit(); + table.newRewrite().rewriteFiles(ImmutableSet.of(FILE_B), ImmutableSet.of(FILE_D)).commit(); - table.newRewrite() - .rewriteFiles(ImmutableSet.of(FILE_A), 
ImmutableSet.of(FILE_C)) - .commit(); + table.newRewrite().rewriteFiles(ImmutableSet.of(FILE_A), ImmutableSet.of(FILE_C)).commit(); long t4 = rightAfterSnapshot(); @@ -182,23 +180,33 @@ public void dataFilesCleanupWithParallelTasks() throws IOException { Set deleteThreads = ConcurrentHashMap.newKeySet(); AtomicInteger deleteThreadsIndex = new AtomicInteger(0); - ExpireSnapshots.Result result = SparkActions.get().expireSnapshots(table) - .executeDeleteWith(Executors.newFixedThreadPool(4, runnable -> { - Thread thread = new Thread(runnable); - thread.setName("remove-snapshot-" + deleteThreadsIndex.getAndIncrement()); - thread.setDaemon(true); // daemon threads will be terminated abruptly when the JVM exits - return thread; - })) - .expireOlderThan(t4) - .deleteWith(s -> { - deleteThreads.add(Thread.currentThread().getName()); - deletedFiles.add(s); - }) - .execute(); - - // Verifies that the delete methods ran in the threads created by the provided ExecutorService ThreadFactory - Assert.assertEquals(deleteThreads, - Sets.newHashSet("remove-snapshot-0", "remove-snapshot-1", "remove-snapshot-2", "remove-snapshot-3")); + ExpireSnapshots.Result result = + SparkActions.get() + .expireSnapshots(table) + .executeDeleteWith( + Executors.newFixedThreadPool( + 4, + runnable -> { + Thread thread = new Thread(runnable); + thread.setName("remove-snapshot-" + deleteThreadsIndex.getAndIncrement()); + thread.setDaemon( + true); // daemon threads will be terminated abruptly when the JVM exits + return thread; + })) + .expireOlderThan(t4) + .deleteWith( + s -> { + deleteThreads.add(Thread.currentThread().getName()); + deletedFiles.add(s); + }) + .execute(); + + // Verifies that the delete methods ran in the threads created by the provided ExecutorService + // ThreadFactory + Assert.assertEquals( + deleteThreads, + Sets.newHashSet( + "remove-snapshot-0", "remove-snapshot-1", "remove-snapshot-2", "remove-snapshot-3")); Assert.assertTrue("FILE_A should be deleted", deletedFiles.contains(FILE_A.path().toString())); Assert.assertTrue("FILE_B should be deleted", deletedFiles.contains(FILE_B.path().toString())); @@ -208,9 +216,7 @@ public void dataFilesCleanupWithParallelTasks() throws IOException { @Test public void testNoFilesDeletedWhenNoSnapshotsExpired() throws Exception { - table.newFastAppend() - .appendFile(FILE_A) - .commit(); + table.newFastAppend().appendFile(FILE_A).commit(); ExpireSnapshots.Result results = SparkActions.get().expireSnapshots(table).execute(); checkExpirationResults(0L, 0L, 0L, results); @@ -218,30 +224,24 @@ public void testNoFilesDeletedWhenNoSnapshotsExpired() throws Exception { @Test public void testCleanupRepeatedOverwrites() throws Exception { - table.newFastAppend() - .appendFile(FILE_A) - .commit(); + table.newFastAppend().appendFile(FILE_A).commit(); for (int i = 0; i < 10; i++) { - table.newOverwrite() - .deleteFile(FILE_A) - .addFile(FILE_B) - .commit(); - - table.newOverwrite() - .deleteFile(FILE_B) - .addFile(FILE_A) - .commit(); + table.newOverwrite().deleteFile(FILE_A).addFile(FILE_B).commit(); + + table.newOverwrite().deleteFile(FILE_B).addFile(FILE_A).commit(); } long end = rightAfterSnapshot(); - ExpireSnapshots.Result results = SparkActions.get().expireSnapshots(table).expireOlderThan(end).execute(); + ExpireSnapshots.Result results = + SparkActions.get().expireSnapshots(table).expireOlderThan(end).execute(); checkExpirationResults(1L, 39L, 20L, results); } @Test public void testRetainLastWithExpireOlderThan() { - table.newAppend() + table + .newAppend() 
.appendFile(FILE_A) // data_bucket=0 .commit(); long firstSnapshotId = table.currentSnapshot().snapshotId(); @@ -250,217 +250,256 @@ public void testRetainLastWithExpireOlderThan() { t1 = System.currentTimeMillis(); } - table.newAppend() + table + .newAppend() .appendFile(FILE_B) // data_bucket=1 .commit(); - table.newAppend() + table + .newAppend() .appendFile(FILE_C) // data_bucket=2 .commit(); long t3 = rightAfterSnapshot(); // Retain last 2 snapshots - SparkActions.get().expireSnapshots(table) - .expireOlderThan(t3) - .retainLast(2) - .execute(); + SparkActions.get().expireSnapshots(table).expireOlderThan(t3).retainLast(2).execute(); - Assert.assertEquals("Should have two snapshots.", 2, Lists.newArrayList(table.snapshots()).size()); - Assert.assertEquals("First snapshot should not present.", null, table.snapshot(firstSnapshotId)); + Assert.assertEquals( + "Should have two snapshots.", 2, Lists.newArrayList(table.snapshots()).size()); + Assert.assertEquals( + "First snapshot should not present.", null, table.snapshot(firstSnapshotId)); } @Test public void testExpireTwoSnapshotsById() throws Exception { - table.newAppend() + table + .newAppend() .appendFile(FILE_A) // data_bucket=0 .commit(); long firstSnapshotId = table.currentSnapshot().snapshotId(); - table.newAppend() + table + .newAppend() .appendFile(FILE_B) // data_bucket=1 .commit(); long secondSnapshotID = table.currentSnapshot().snapshotId(); - table.newAppend() + table + .newAppend() .appendFile(FILE_C) // data_bucket=2 .commit(); // Retain last 2 snapshots - ExpireSnapshots.Result result = SparkActions.get().expireSnapshots(table) - .expireSnapshotId(firstSnapshotId) - .expireSnapshotId(secondSnapshotID) - .execute(); - - Assert.assertEquals("Should have one snapshots.", 1, Lists.newArrayList(table.snapshots()).size()); - Assert.assertEquals("First snapshot should not present.", null, table.snapshot(firstSnapshotId)); - Assert.assertEquals("Second snapshot should not be present.", null, table.snapshot(secondSnapshotID)); + ExpireSnapshots.Result result = + SparkActions.get() + .expireSnapshots(table) + .expireSnapshotId(firstSnapshotId) + .expireSnapshotId(secondSnapshotID) + .execute(); + + Assert.assertEquals( + "Should have one snapshots.", 1, Lists.newArrayList(table.snapshots()).size()); + Assert.assertEquals( + "First snapshot should not present.", null, table.snapshot(firstSnapshotId)); + Assert.assertEquals( + "Second snapshot should not be present.", null, table.snapshot(secondSnapshotID)); checkExpirationResults(0L, 0L, 2L, result); } @Test public void testRetainLastWithExpireById() { - table.newAppend() + table + .newAppend() .appendFile(FILE_A) // data_bucket=0 .commit(); long firstSnapshotId = table.currentSnapshot().snapshotId(); - table.newAppend() + table + .newAppend() .appendFile(FILE_B) // data_bucket=1 .commit(); - table.newAppend() + table + .newAppend() .appendFile(FILE_C) // data_bucket=2 .commit(); // Retain last 3 snapshots, but explicitly remove the first snapshot - ExpireSnapshots.Result result = SparkActions.get().expireSnapshots(table) - .expireSnapshotId(firstSnapshotId) - .retainLast(3) - .execute(); - - Assert.assertEquals("Should have two snapshots.", 2, Lists.newArrayList(table.snapshots()).size()); - Assert.assertEquals("First snapshot should not present.", null, table.snapshot(firstSnapshotId)); + ExpireSnapshots.Result result = + SparkActions.get() + .expireSnapshots(table) + .expireSnapshotId(firstSnapshotId) + .retainLast(3) + .execute(); + + Assert.assertEquals( + "Should have two 
snapshots.", 2, Lists.newArrayList(table.snapshots()).size()); + Assert.assertEquals( + "First snapshot should not present.", null, table.snapshot(firstSnapshotId)); checkExpirationResults(0L, 0L, 1L, result); } @Test public void testRetainLastWithTooFewSnapshots() { - table.newAppend() + table + .newAppend() .appendFile(FILE_A) // data_bucket=0 .appendFile(FILE_B) // data_bucket=1 .commit(); long firstSnapshotId = table.currentSnapshot().snapshotId(); - table.newAppend() + table + .newAppend() .appendFile(FILE_C) // data_bucket=2 .commit(); long t2 = rightAfterSnapshot(); // Retain last 3 snapshots - ExpireSnapshots.Result result = SparkActions.get().expireSnapshots(table) - .expireOlderThan(t2) - .retainLast(3) - .execute(); - - Assert.assertEquals("Should have two snapshots", 2, Lists.newArrayList(table.snapshots()).size()); - Assert.assertEquals("First snapshot should still present", - firstSnapshotId, table.snapshot(firstSnapshotId).snapshotId()); + ExpireSnapshots.Result result = + SparkActions.get().expireSnapshots(table).expireOlderThan(t2).retainLast(3).execute(); + + Assert.assertEquals( + "Should have two snapshots", 2, Lists.newArrayList(table.snapshots()).size()); + Assert.assertEquals( + "First snapshot should still present", + firstSnapshotId, + table.snapshot(firstSnapshotId).snapshotId()); checkExpirationResults(0L, 0L, 0L, result); } @Test public void testRetainLastKeepsExpiringSnapshot() { - table.newAppend() + table + .newAppend() .appendFile(FILE_A) // data_bucket=0 .commit(); - table.newAppend() + table + .newAppend() .appendFile(FILE_B) // data_bucket=1 .commit(); Snapshot secondSnapshot = table.currentSnapshot(); - table.newAppend() + table + .newAppend() .appendFile(FILE_C) // data_bucket=2 .commit(); - table.newAppend() + table + .newAppend() .appendFile(FILE_D) // data_bucket=3 .commit(); // Retain last 2 snapshots and expire older than t3 - ExpireSnapshots.Result result = SparkActions.get().expireSnapshots(table) - .expireOlderThan(secondSnapshot.timestampMillis()) - .retainLast(2) - .execute(); - - Assert.assertEquals("Should have three snapshots.", 3, Lists.newArrayList(table.snapshots()).size()); - Assert.assertNotNull("Second snapshot should present.", table.snapshot(secondSnapshot.snapshotId())); + ExpireSnapshots.Result result = + SparkActions.get() + .expireSnapshots(table) + .expireOlderThan(secondSnapshot.timestampMillis()) + .retainLast(2) + .execute(); + + Assert.assertEquals( + "Should have three snapshots.", 3, Lists.newArrayList(table.snapshots()).size()); + Assert.assertNotNull( + "Second snapshot should present.", table.snapshot(secondSnapshot.snapshotId())); checkExpirationResults(0L, 0L, 1L, result); } @Test public void testExpireSnapshotsWithDisabledGarbageCollection() { - table.updateProperties() - .set(TableProperties.GC_ENABLED, "false") - .commit(); + table.updateProperties().set(TableProperties.GC_ENABLED, "false").commit(); - table.newAppend() - .appendFile(FILE_A) - .commit(); + table.newAppend().appendFile(FILE_A).commit(); - AssertHelpers.assertThrows("Should complain about expiring snapshots", - ValidationException.class, "Cannot expire snapshots: GC is disabled", + AssertHelpers.assertThrows( + "Should complain about expiring snapshots", + ValidationException.class, + "Cannot expire snapshots: GC is disabled", () -> SparkActions.get().expireSnapshots(table)); } @Test public void testExpireOlderThanMultipleCalls() { - table.newAppend() + table + .newAppend() .appendFile(FILE_A) // data_bucket=0 .commit(); - table.newAppend() + table 
+ .newAppend() .appendFile(FILE_B) // data_bucket=1 .commit(); Snapshot secondSnapshot = table.currentSnapshot(); - table.newAppend() + table + .newAppend() .appendFile(FILE_C) // data_bucket=2 .commit(); Snapshot thirdSnapshot = table.currentSnapshot(); // Retain last 2 snapshots and expire older than t3 - ExpireSnapshots.Result result = SparkActions.get().expireSnapshots(table) - .expireOlderThan(secondSnapshot.timestampMillis()) - .expireOlderThan(thirdSnapshot.timestampMillis()) - .execute(); - - Assert.assertEquals("Should have one snapshots.", 1, Lists.newArrayList(table.snapshots()).size()); - Assert.assertNull("Second snapshot should not present.", table.snapshot(secondSnapshot.snapshotId())); + ExpireSnapshots.Result result = + SparkActions.get() + .expireSnapshots(table) + .expireOlderThan(secondSnapshot.timestampMillis()) + .expireOlderThan(thirdSnapshot.timestampMillis()) + .execute(); + + Assert.assertEquals( + "Should have one snapshots.", 1, Lists.newArrayList(table.snapshots()).size()); + Assert.assertNull( + "Second snapshot should not present.", table.snapshot(secondSnapshot.snapshotId())); checkExpirationResults(0L, 0L, 2L, result); } @Test public void testRetainLastMultipleCalls() { - table.newAppend() + table + .newAppend() .appendFile(FILE_A) // data_bucket=0 .commit(); - table.newAppend() + table + .newAppend() .appendFile(FILE_B) // data_bucket=1 .commit(); Snapshot secondSnapshot = table.currentSnapshot(); - table.newAppend() + table + .newAppend() .appendFile(FILE_C) // data_bucket=2 .commit(); long t3 = rightAfterSnapshot(); // Retain last 2 snapshots and expire older than t3 - ExpireSnapshots.Result result = SparkActions.get().expireSnapshots(table) - .expireOlderThan(t3) - .retainLast(2) - .retainLast(1) - .execute(); - - Assert.assertEquals("Should have one snapshots.", 1, Lists.newArrayList(table.snapshots()).size()); - Assert.assertNull("Second snapshot should not present.", table.snapshot(secondSnapshot.snapshotId())); + ExpireSnapshots.Result result = + SparkActions.get() + .expireSnapshots(table) + .expireOlderThan(t3) + .retainLast(2) + .retainLast(1) + .execute(); + + Assert.assertEquals( + "Should have one snapshots.", 1, Lists.newArrayList(table.snapshots()).size()); + Assert.assertNull( + "Second snapshot should not present.", table.snapshot(secondSnapshot.snapshotId())); checkExpirationResults(0L, 0L, 2L, result); } @Test public void testRetainZeroSnapshots() { - AssertHelpers.assertThrows("Should fail retain 0 snapshots " + - "because number of snapshots to retain cannot be zero", + AssertHelpers.assertThrows( + "Should fail retain 0 snapshots " + "because number of snapshots to retain cannot be zero", IllegalArgumentException.class, "Number of snapshots to retain must be at least 1, cannot be: 0", () -> SparkActions.get().expireSnapshots(table).retainLast(0).execute()); @@ -468,28 +507,22 @@ public void testRetainZeroSnapshots() { @Test public void testScanExpiredManifestInValidSnapshotAppend() { - table.newAppend() - .appendFile(FILE_A) - .appendFile(FILE_B) - .commit(); + table.newAppend().appendFile(FILE_A).appendFile(FILE_B).commit(); - table.newOverwrite() - .addFile(FILE_C) - .deleteFile(FILE_A) - .commit(); + table.newOverwrite().addFile(FILE_C).deleteFile(FILE_A).commit(); - table.newAppend() - .appendFile(FILE_D) - .commit(); + table.newAppend().appendFile(FILE_D).commit(); long t3 = rightAfterSnapshot(); Set deletedFiles = Sets.newHashSet(); - ExpireSnapshots.Result result = SparkActions.get().expireSnapshots(table) - 
.expireOlderThan(t3) - .deleteWith(deletedFiles::add) - .execute(); + ExpireSnapshots.Result result = + SparkActions.get() + .expireSnapshots(table) + .expireOlderThan(t3) + .deleteWith(deletedFiles::add) + .execute(); Assert.assertTrue("FILE_A should be deleted", deletedFiles.contains(FILE_A.path().toString())); checkExpirationResults(1L, 1L, 2L, result); @@ -497,72 +530,61 @@ public void testScanExpiredManifestInValidSnapshotAppend() { @Test public void testScanExpiredManifestInValidSnapshotFastAppend() { - table.updateProperties() + table + .updateProperties() .set(TableProperties.MANIFEST_MERGE_ENABLED, "true") .set(TableProperties.MANIFEST_MIN_MERGE_COUNT, "1") .commit(); - table.newAppend() - .appendFile(FILE_A) - .appendFile(FILE_B) - .commit(); + table.newAppend().appendFile(FILE_A).appendFile(FILE_B).commit(); - table.newOverwrite() - .addFile(FILE_C) - .deleteFile(FILE_A) - .commit(); + table.newOverwrite().addFile(FILE_C).deleteFile(FILE_A).commit(); - table.newFastAppend() - .appendFile(FILE_D) - .commit(); + table.newFastAppend().appendFile(FILE_D).commit(); long t3 = rightAfterSnapshot(); Set deletedFiles = Sets.newHashSet(); - ExpireSnapshots.Result result = SparkActions.get().expireSnapshots(table) - .expireOlderThan(t3) - .deleteWith(deletedFiles::add) - .execute(); + ExpireSnapshots.Result result = + SparkActions.get() + .expireSnapshots(table) + .expireOlderThan(t3) + .deleteWith(deletedFiles::add) + .execute(); Assert.assertTrue("FILE_A should be deleted", deletedFiles.contains(FILE_A.path().toString())); checkExpirationResults(1L, 1L, 2L, result); } /** - * Test on table below, and expiring the staged commit `B` using `expireOlderThan` API. - * Table: A - C - * ` B (staged) + * Test on table below, and expiring the staged commit `B` using `expireOlderThan` API. Table: A - + * C ` B (staged) */ @Test public void testWithExpiringDanglingStageCommit() { // `A` commit - table.newAppend() - .appendFile(FILE_A) - .commit(); + table.newAppend().appendFile(FILE_A).commit(); // `B` staged commit - table.newAppend() - .appendFile(FILE_B) - .stageOnly() - .commit(); + table.newAppend().appendFile(FILE_B).stageOnly().commit(); TableMetadata base = ((BaseTable) table).operations().current(); Snapshot snapshotA = base.snapshots().get(0); Snapshot snapshotB = base.snapshots().get(1); // `C` commit - table.newAppend() - .appendFile(FILE_C) - .commit(); + table.newAppend().appendFile(FILE_C).commit(); Set deletedFiles = Sets.newHashSet(); // Expire all commits including dangling staged snapshot. - ExpireSnapshots.Result result = SparkActions.get().expireSnapshots(table) - .deleteWith(deletedFiles::add) - .expireOlderThan(snapshotB.timestampMillis() + 1) - .execute(); + ExpireSnapshots.Result result = + SparkActions.get() + .expireSnapshots(table) + .deleteWith(deletedFiles::add) + .expireOlderThan(snapshotB.timestampMillis() + 1) + .execute(); checkExpirationResults(1L, 1L, 2L, result); @@ -570,122 +592,107 @@ public void testWithExpiringDanglingStageCommit() { expectedDeletes.add(snapshotA.manifestListLocation()); // Files should be deleted of dangling staged snapshot - snapshotB.addedDataFiles(table.io()).forEach(i -> { - expectedDeletes.add(i.path().toString()); - }); + snapshotB + .addedDataFiles(table.io()) + .forEach( + i -> { + expectedDeletes.add(i.path().toString()); + }); // ManifestList should be deleted too expectedDeletes.add(snapshotB.manifestListLocation()); - snapshotB.dataManifests(table.io()).forEach(file -> { - // Only the manifest of B should be deleted. 
- if (file.snapshotId() == snapshotB.snapshotId()) { - expectedDeletes.add(file.path()); - } - }); - Assert.assertSame("Files deleted count should be expected", expectedDeletes.size(), deletedFiles.size()); + snapshotB + .dataManifests(table.io()) + .forEach( + file -> { + // Only the manifest of B should be deleted. + if (file.snapshotId() == snapshotB.snapshotId()) { + expectedDeletes.add(file.path()); + } + }); + Assert.assertSame( + "Files deleted count should be expected", expectedDeletes.size(), deletedFiles.size()); // Take the diff expectedDeletes.removeAll(deletedFiles); Assert.assertTrue("Exactly same files should be deleted", expectedDeletes.isEmpty()); } /** - * Expire cherry-pick the commit as shown below, when `B` is in table's current state - * Table: - * A - B - C <--current snapshot - * `- D (source=B) + * Expire cherry-pick the commit as shown below, when `B` is in table's current state Table: A - B + * - C <--current snapshot `- D (source=B) */ @Test public void testWithCherryPickTableSnapshot() { // `A` commit - table.newAppend() - .appendFile(FILE_A) - .commit(); + table.newAppend().appendFile(FILE_A).commit(); Snapshot snapshotA = table.currentSnapshot(); // `B` commit Set deletedAFiles = Sets.newHashSet(); - table.newOverwrite() - .addFile(FILE_B) - .deleteFile(FILE_A) - .deleteWith(deletedAFiles::add) - .commit(); + table.newOverwrite().addFile(FILE_B).deleteFile(FILE_A).deleteWith(deletedAFiles::add).commit(); Assert.assertTrue("No files should be physically deleted", deletedAFiles.isEmpty()); // pick the snapshot 'B` Snapshot snapshotB = table.currentSnapshot(); // `C` commit to let cherry-pick take effect, and avoid fast-forward of `B` with cherry-pick - table.newAppend() - .appendFile(FILE_C) - .commit(); + table.newAppend().appendFile(FILE_C).commit(); Snapshot snapshotC = table.currentSnapshot(); // Move the table back to `A` - table.manageSnapshots() - .setCurrentSnapshot(snapshotA.snapshotId()) - .commit(); + table.manageSnapshots().setCurrentSnapshot(snapshotA.snapshotId()).commit(); // Generate A -> `D (B)` - table.manageSnapshots() - .cherrypick(snapshotB.snapshotId()) - .commit(); + table.manageSnapshots().cherrypick(snapshotB.snapshotId()).commit(); Snapshot snapshotD = table.currentSnapshot(); // Move the table back to `C` - table.manageSnapshots() - .setCurrentSnapshot(snapshotC.snapshotId()) - .commit(); + table.manageSnapshots().setCurrentSnapshot(snapshotC.snapshotId()).commit(); List deletedFiles = Lists.newArrayList(); // Expire `C` - ExpireSnapshots.Result result = SparkActions.get().expireSnapshots(table) - .deleteWith(deletedFiles::add) - .expireOlderThan(snapshotC.timestampMillis() + 1) - .execute(); + ExpireSnapshots.Result result = + SparkActions.get() + .expireSnapshots(table) + .deleteWith(deletedFiles::add) + .expireOlderThan(snapshotC.timestampMillis() + 1) + .execute(); // Make sure no dataFiles are deleted for the B, C, D snapshot - Lists.newArrayList(snapshotB, snapshotC, snapshotD).forEach(i -> { - i.addedDataFiles(table.io()).forEach(item -> { - Assert.assertFalse(deletedFiles.contains(item.path().toString())); - }); - }); + Lists.newArrayList(snapshotB, snapshotC, snapshotD) + .forEach( + i -> { + i.addedDataFiles(table.io()) + .forEach( + item -> { + Assert.assertFalse(deletedFiles.contains(item.path().toString())); + }); + }); checkExpirationResults(1L, 2L, 2L, result); } /** - * Test on table below, and expiring `B` which is not in current table state. 
- * 1) Expire `B` - * 2) All commit - * Table: A - C - D (B) - * ` B (staged) + * Test on table below, and expiring `B` which is not in current table state. 1) Expire `B` 2) All + * commit Table: A - C - D (B) ` B (staged) */ @Test public void testWithExpiringStagedThenCherrypick() { // `A` commit - table.newAppend() - .appendFile(FILE_A) - .commit(); + table.newAppend().appendFile(FILE_A).commit(); // `B` commit - table.newAppend() - .appendFile(FILE_B) - .stageOnly() - .commit(); + table.newAppend().appendFile(FILE_B).stageOnly().commit(); // pick the snapshot that's staged but not committed TableMetadata base = ((BaseTable) table).operations().current(); Snapshot snapshotB = base.snapshots().get(1); // `C` commit to let cherry-pick take effect, and avoid fast-forward of `B` with cherry-pick - table.newAppend() - .appendFile(FILE_C) - .commit(); + table.newAppend().appendFile(FILE_C).commit(); // `D (B)` cherry-pick commit - table.manageSnapshots() - .cherrypick(snapshotB.snapshotId()) - .commit(); + table.manageSnapshots().cherrypick(snapshotB.snapshotId()).commit(); base = ((BaseTable) table).operations().current(); Snapshot snapshotD = base.snapshots().get(3); @@ -693,47 +700,55 @@ public void testWithExpiringStagedThenCherrypick() { List deletedFiles = Lists.newArrayList(); // Expire `B` commit. - ExpireSnapshots.Result firstResult = SparkActions.get().expireSnapshots(table) - .deleteWith(deletedFiles::add) - .expireSnapshotId(snapshotB.snapshotId()) - .execute(); + ExpireSnapshots.Result firstResult = + SparkActions.get() + .expireSnapshots(table) + .deleteWith(deletedFiles::add) + .expireSnapshotId(snapshotB.snapshotId()) + .execute(); // Make sure no dataFiles are deleted for the staged snapshot - Lists.newArrayList(snapshotB).forEach(i -> { - i.addedDataFiles(table.io()).forEach(item -> { - Assert.assertFalse(deletedFiles.contains(item.path().toString())); - }); - }); + Lists.newArrayList(snapshotB) + .forEach( + i -> { + i.addedDataFiles(table.io()) + .forEach( + item -> { + Assert.assertFalse(deletedFiles.contains(item.path().toString())); + }); + }); checkExpirationResults(0L, 1L, 1L, firstResult); // Expire all snapshots including cherry-pick - ExpireSnapshots.Result secondResult = SparkActions.get().expireSnapshots(table) - .deleteWith(deletedFiles::add) - .expireOlderThan(table.currentSnapshot().timestampMillis() + 1) - .execute(); + ExpireSnapshots.Result secondResult = + SparkActions.get() + .expireSnapshots(table) + .deleteWith(deletedFiles::add) + .expireOlderThan(table.currentSnapshot().timestampMillis() + 1) + .execute(); // Make sure no dataFiles are deleted for the staged and cherry-pick - Lists.newArrayList(snapshotB, snapshotD).forEach(i -> { - i.addedDataFiles(table.io()).forEach(item -> { - Assert.assertFalse(deletedFiles.contains(item.path().toString())); - }); - }); + Lists.newArrayList(snapshotB, snapshotD) + .forEach( + i -> { + i.addedDataFiles(table.io()) + .forEach( + item -> { + Assert.assertFalse(deletedFiles.contains(item.path().toString())); + }); + }); checkExpirationResults(0L, 0L, 2L, secondResult); } @Test public void testExpireOlderThan() { - table.newAppend() - .appendFile(FILE_A) - .commit(); + table.newAppend().appendFile(FILE_A).commit(); Snapshot firstSnapshot = table.currentSnapshot(); rightAfterSnapshot(); - table.newAppend() - .appendFile(FILE_B) - .commit(); + table.newAppend().appendFile(FILE_B).commit(); long snapshotId = table.currentSnapshot().snapshotId(); @@ -741,42 +756,46 @@ public void testExpireOlderThan() { Set deletedFiles = 
Sets.newHashSet(); - ExpireSnapshots.Result result = SparkActions.get().expireSnapshots(table) - .expireOlderThan(tAfterCommits) - .deleteWith(deletedFiles::add) - .execute(); - - Assert.assertEquals("Expire should not change current snapshot", snapshotId, table.currentSnapshot().snapshotId()); - Assert.assertNull("Expire should remove the oldest snapshot", table.snapshot(firstSnapshot.snapshotId())); - Assert.assertEquals("Should remove only the expired manifest list location", - Sets.newHashSet(firstSnapshot.manifestListLocation()), deletedFiles); + ExpireSnapshots.Result result = + SparkActions.get() + .expireSnapshots(table) + .expireOlderThan(tAfterCommits) + .deleteWith(deletedFiles::add) + .execute(); + + Assert.assertEquals( + "Expire should not change current snapshot", + snapshotId, + table.currentSnapshot().snapshotId()); + Assert.assertNull( + "Expire should remove the oldest snapshot", table.snapshot(firstSnapshot.snapshotId())); + Assert.assertEquals( + "Should remove only the expired manifest list location", + Sets.newHashSet(firstSnapshot.manifestListLocation()), + deletedFiles); checkExpirationResults(0, 0, 1, result); } @Test public void testExpireOlderThanWithDelete() { - table.newAppend() - .appendFile(FILE_A) - .commit(); + table.newAppend().appendFile(FILE_A).commit(); Snapshot firstSnapshot = table.currentSnapshot(); - Assert.assertEquals("Should create one manifest", - 1, firstSnapshot.allManifests(table.io()).size()); + Assert.assertEquals( + "Should create one manifest", 1, firstSnapshot.allManifests(table.io()).size()); rightAfterSnapshot(); - table.newDelete() - .deleteFile(FILE_A) - .commit(); + table.newDelete().deleteFile(FILE_A).commit(); Snapshot secondSnapshot = table.currentSnapshot(); - Assert.assertEquals("Should create replace manifest with a rewritten manifest", - 1, secondSnapshot.allManifests(table.io()).size()); + Assert.assertEquals( + "Should create replace manifest with a rewritten manifest", + 1, + secondSnapshot.allManifests(table.io()).size()); - table.newAppend() - .appendFile(FILE_B) - .commit(); + table.newAppend().appendFile(FILE_B).commit(); rightAfterSnapshot(); @@ -786,21 +805,36 @@ public void testExpireOlderThanWithDelete() { Set deletedFiles = Sets.newHashSet(); - ExpireSnapshots.Result result = SparkActions.get().expireSnapshots(table) - .expireOlderThan(tAfterCommits) - .deleteWith(deletedFiles::add) - .execute(); - - Assert.assertEquals("Expire should not change current snapshot", snapshotId, table.currentSnapshot().snapshotId()); - Assert.assertNull("Expire should remove the oldest snapshot", table.snapshot(firstSnapshot.snapshotId())); - Assert.assertNull("Expire should remove the second oldest snapshot", table.snapshot(secondSnapshot.snapshotId())); - - Assert.assertEquals("Should remove expired manifest lists and deleted data file", + ExpireSnapshots.Result result = + SparkActions.get() + .expireSnapshots(table) + .expireOlderThan(tAfterCommits) + .deleteWith(deletedFiles::add) + .execute(); + + Assert.assertEquals( + "Expire should not change current snapshot", + snapshotId, + table.currentSnapshot().snapshotId()); + Assert.assertNull( + "Expire should remove the oldest snapshot", table.snapshot(firstSnapshot.snapshotId())); + Assert.assertNull( + "Expire should remove the second oldest snapshot", + table.snapshot(secondSnapshot.snapshotId())); + + Assert.assertEquals( + "Should remove expired manifest lists and deleted data file", Sets.newHashSet( firstSnapshot.manifestListLocation(), // snapshot expired - 
firstSnapshot.allManifests(table.io()).get(0).path(), // manifest was rewritten for delete + firstSnapshot + .allManifests(table.io()) + .get(0) + .path(), // manifest was rewritten for delete secondSnapshot.manifestListLocation(), // snapshot expired - secondSnapshot.allManifests(table.io()).get(0).path(), // manifest contained only deletes, was dropped + secondSnapshot + .allManifests(table.io()) + .get(0) + .path(), // manifest contained only deletes, was dropped FILE_A.path()), // deleted deletedFiles); @@ -810,30 +844,29 @@ public void testExpireOlderThanWithDelete() { @Test public void testExpireOlderThanWithDeleteInMergedManifests() { // merge every commit - table.updateProperties() - .set(TableProperties.MANIFEST_MIN_MERGE_COUNT, "0") - .commit(); + table.updateProperties().set(TableProperties.MANIFEST_MIN_MERGE_COUNT, "0").commit(); - table.newAppend() - .appendFile(FILE_A) - .appendFile(FILE_B) - .commit(); + table.newAppend().appendFile(FILE_A).appendFile(FILE_B).commit(); Snapshot firstSnapshot = table.currentSnapshot(); - Assert.assertEquals("Should create one manifest", - 1, firstSnapshot.allManifests(table.io()).size()); + Assert.assertEquals( + "Should create one manifest", 1, firstSnapshot.allManifests(table.io()).size()); rightAfterSnapshot(); - table.newDelete() + table + .newDelete() .deleteFile(FILE_A) // FILE_B is still in the dataset .commit(); Snapshot secondSnapshot = table.currentSnapshot(); - Assert.assertEquals("Should replace manifest with a rewritten manifest", - 1, secondSnapshot.allManifests(table.io()).size()); + Assert.assertEquals( + "Should replace manifest with a rewritten manifest", + 1, + secondSnapshot.allManifests(table.io()).size()); - table.newFastAppend() // do not merge to keep the last snapshot's manifest valid + table + .newFastAppend() // do not merge to keep the last snapshot's manifest valid .appendFile(FILE_C) .commit(); @@ -845,19 +878,31 @@ public void testExpireOlderThanWithDeleteInMergedManifests() { Set deletedFiles = Sets.newHashSet(); - ExpireSnapshots.Result result = SparkActions.get().expireSnapshots(table) - .expireOlderThan(tAfterCommits) - .deleteWith(deletedFiles::add) - .execute(); - - Assert.assertEquals("Expire should not change current snapshot", snapshotId, table.currentSnapshot().snapshotId()); - Assert.assertNull("Expire should remove the oldest snapshot", table.snapshot(firstSnapshot.snapshotId())); - Assert.assertNull("Expire should remove the second oldest snapshot", table.snapshot(secondSnapshot.snapshotId())); - - Assert.assertEquals("Should remove expired manifest lists and deleted data file", + ExpireSnapshots.Result result = + SparkActions.get() + .expireSnapshots(table) + .expireOlderThan(tAfterCommits) + .deleteWith(deletedFiles::add) + .execute(); + + Assert.assertEquals( + "Expire should not change current snapshot", + snapshotId, + table.currentSnapshot().snapshotId()); + Assert.assertNull( + "Expire should remove the oldest snapshot", table.snapshot(firstSnapshot.snapshotId())); + Assert.assertNull( + "Expire should remove the second oldest snapshot", + table.snapshot(secondSnapshot.snapshotId())); + + Assert.assertEquals( + "Should remove expired manifest lists and deleted data file", Sets.newHashSet( firstSnapshot.manifestListLocation(), // snapshot expired - firstSnapshot.allManifests(table.io()).get(0).path(), // manifest was rewritten for delete + firstSnapshot + .allManifests(table.io()) + .get(0) + .path(), // manifest was rewritten for delete secondSnapshot.manifestListLocation(), // snapshot 
expired FILE_A.path()), // deleted deletedFiles); @@ -868,33 +913,26 @@ public void testExpireOlderThanWithDeleteInMergedManifests() { @Test public void testExpireOlderThanWithRollback() { // merge every commit - table.updateProperties() - .set(TableProperties.MANIFEST_MIN_MERGE_COUNT, "0") - .commit(); + table.updateProperties().set(TableProperties.MANIFEST_MIN_MERGE_COUNT, "0").commit(); - table.newAppend() - .appendFile(FILE_A) - .appendFile(FILE_B) - .commit(); + table.newAppend().appendFile(FILE_A).appendFile(FILE_B).commit(); Snapshot firstSnapshot = table.currentSnapshot(); - Assert.assertEquals("Should create one manifest", - 1, firstSnapshot.allManifests(table.io()).size()); + Assert.assertEquals( + "Should create one manifest", 1, firstSnapshot.allManifests(table.io()).size()); rightAfterSnapshot(); - table.newDelete() - .deleteFile(FILE_B) - .commit(); + table.newDelete().deleteFile(FILE_B).commit(); Snapshot secondSnapshot = table.currentSnapshot(); - Set secondSnapshotManifests = Sets.newHashSet(secondSnapshot.allManifests(table.io())); + Set secondSnapshotManifests = + Sets.newHashSet(secondSnapshot.allManifests(table.io())); secondSnapshotManifests.removeAll(firstSnapshot.allManifests(table.io())); - Assert.assertEquals("Should add one new manifest for append", 1, secondSnapshotManifests.size()); + Assert.assertEquals( + "Should add one new manifest for append", 1, secondSnapshotManifests.size()); - table.manageSnapshots() - .rollbackTo(firstSnapshot.snapshotId()) - .commit(); + table.manageSnapshots().rollbackTo(firstSnapshot.snapshotId()).commit(); long tAfterCommits = rightAfterSnapshot(secondSnapshot.snapshotId()); @@ -902,19 +940,29 @@ public void testExpireOlderThanWithRollback() { Set deletedFiles = Sets.newHashSet(); - ExpireSnapshots.Result result = SparkActions.get().expireSnapshots(table) - .expireOlderThan(tAfterCommits) - .deleteWith(deletedFiles::add) - .execute(); - - Assert.assertEquals("Expire should not change current snapshot", snapshotId, table.currentSnapshot().snapshotId()); - Assert.assertNotNull("Expire should keep the oldest snapshot, current", table.snapshot(firstSnapshot.snapshotId())); - Assert.assertNull("Expire should remove the orphaned snapshot", table.snapshot(secondSnapshot.snapshotId())); - - Assert.assertEquals("Should remove expired manifest lists and reverted appended data file", + ExpireSnapshots.Result result = + SparkActions.get() + .expireSnapshots(table) + .expireOlderThan(tAfterCommits) + .deleteWith(deletedFiles::add) + .execute(); + + Assert.assertEquals( + "Expire should not change current snapshot", + snapshotId, + table.currentSnapshot().snapshotId()); + Assert.assertNotNull( + "Expire should keep the oldest snapshot, current", + table.snapshot(firstSnapshot.snapshotId())); + Assert.assertNull( + "Expire should remove the orphaned snapshot", table.snapshot(secondSnapshot.snapshotId())); + + Assert.assertEquals( + "Should remove expired manifest lists and reverted appended data file", Sets.newHashSet( secondSnapshot.manifestListLocation(), // snapshot expired - Iterables.getOnlyElement(secondSnapshotManifests).path()), // manifest is no longer referenced + Iterables.getOnlyElement(secondSnapshotManifests) + .path()), // manifest is no longer referenced deletedFiles); checkExpirationResults(0, 1, 1, result); @@ -922,28 +970,24 @@ public void testExpireOlderThanWithRollback() { @Test public void testExpireOlderThanWithRollbackAndMergedManifests() { - table.newAppend() - .appendFile(FILE_A) - .commit(); + 
table.newAppend().appendFile(FILE_A).commit(); Snapshot firstSnapshot = table.currentSnapshot(); - Assert.assertEquals("Should create one manifest", - 1, firstSnapshot.allManifests(table.io()).size()); + Assert.assertEquals( + "Should create one manifest", 1, firstSnapshot.allManifests(table.io()).size()); rightAfterSnapshot(); - table.newAppend() - .appendFile(FILE_B) - .commit(); + table.newAppend().appendFile(FILE_B).commit(); Snapshot secondSnapshot = table.currentSnapshot(); - Set secondSnapshotManifests = Sets.newHashSet(secondSnapshot.allManifests(table.io())); + Set secondSnapshotManifests = + Sets.newHashSet(secondSnapshot.allManifests(table.io())); secondSnapshotManifests.removeAll(firstSnapshot.allManifests(table.io())); - Assert.assertEquals("Should add one new manifest for append", 1, secondSnapshotManifests.size()); + Assert.assertEquals( + "Should add one new manifest for append", 1, secondSnapshotManifests.size()); - table.manageSnapshots() - .rollbackTo(firstSnapshot.snapshotId()) - .commit(); + table.manageSnapshots().rollbackTo(firstSnapshot.snapshotId()).commit(); long tAfterCommits = rightAfterSnapshot(secondSnapshot.snapshotId()); @@ -951,19 +995,29 @@ public void testExpireOlderThanWithRollbackAndMergedManifests() { Set deletedFiles = Sets.newHashSet(); - ExpireSnapshots.Result result = SparkActions.get().expireSnapshots(table) - .expireOlderThan(tAfterCommits) - .deleteWith(deletedFiles::add) - .execute(); - - Assert.assertEquals("Expire should not change current snapshot", snapshotId, table.currentSnapshot().snapshotId()); - Assert.assertNotNull("Expire should keep the oldest snapshot, current", table.snapshot(firstSnapshot.snapshotId())); - Assert.assertNull("Expire should remove the orphaned snapshot", table.snapshot(secondSnapshot.snapshotId())); - - Assert.assertEquals("Should remove expired manifest lists and reverted appended data file", + ExpireSnapshots.Result result = + SparkActions.get() + .expireSnapshots(table) + .expireOlderThan(tAfterCommits) + .deleteWith(deletedFiles::add) + .execute(); + + Assert.assertEquals( + "Expire should not change current snapshot", + snapshotId, + table.currentSnapshot().snapshotId()); + Assert.assertNotNull( + "Expire should keep the oldest snapshot, current", + table.snapshot(firstSnapshot.snapshotId())); + Assert.assertNull( + "Expire should remove the orphaned snapshot", table.snapshot(secondSnapshot.snapshotId())); + + Assert.assertEquals( + "Should remove expired manifest lists and reverted appended data file", Sets.newHashSet( secondSnapshot.manifestListLocation(), // snapshot expired - Iterables.getOnlyElement(secondSnapshotManifests).path(), // manifest is no longer referenced + Iterables.getOnlyElement(secondSnapshotManifests) + .path(), // manifest is no longer referenced FILE_B.path()), // added, but rolled back deletedFiles); @@ -975,27 +1029,25 @@ public void testExpireOnEmptyTable() { Set deletedFiles = Sets.newHashSet(); // table has no data, testing ExpireSnapshots should not fail with no snapshot - ExpireSnapshots.Result result = SparkActions.get().expireSnapshots(table) - .expireOlderThan(System.currentTimeMillis()) - .deleteWith(deletedFiles::add) - .execute(); + ExpireSnapshots.Result result = + SparkActions.get() + .expireSnapshots(table) + .expireOlderThan(System.currentTimeMillis()) + .deleteWith(deletedFiles::add) + .execute(); checkExpirationResults(0, 0, 0, result); } @Test public void testExpireAction() { - table.newAppend() - .appendFile(FILE_A) - .commit(); + 
table.newAppend().appendFile(FILE_A).commit(); Snapshot firstSnapshot = table.currentSnapshot(); rightAfterSnapshot(); - table.newAppend() - .appendFile(FILE_B) - .commit(); + table.newAppend().appendFile(FILE_B).commit(); long snapshotId = table.currentSnapshot().snapshotId(); @@ -1003,58 +1055,68 @@ public void testExpireAction() { Set deletedFiles = Sets.newHashSet(); - BaseExpireSnapshotsSparkAction action = (BaseExpireSnapshotsSparkAction) SparkActions.get().expireSnapshots(table) - .expireOlderThan(tAfterCommits) - .deleteWith(deletedFiles::add); + BaseExpireSnapshotsSparkAction action = + (BaseExpireSnapshotsSparkAction) + SparkActions.get() + .expireSnapshots(table) + .expireOlderThan(tAfterCommits) + .deleteWith(deletedFiles::add); Dataset pendingDeletes = action.expire(); List pending = pendingDeletes.collectAsList(); - Assert.assertEquals("Should not change current snapshot", snapshotId, table.currentSnapshot().snapshotId()); - Assert.assertNull("Should remove the oldest snapshot", table.snapshot(firstSnapshot.snapshotId())); + Assert.assertEquals( + "Should not change current snapshot", snapshotId, table.currentSnapshot().snapshotId()); + Assert.assertNull( + "Should remove the oldest snapshot", table.snapshot(firstSnapshot.snapshotId())); Assert.assertEquals("Pending deletes should contain one row", 1, pending.size()); - Assert.assertEquals("Pending delete should be the expired manifest list location", - firstSnapshot.manifestListLocation(), pending.get(0).getString(0)); - Assert.assertEquals("Pending delete should be a manifest list", - "Manifest List", pending.get(0).getString(1)); + Assert.assertEquals( + "Pending delete should be the expired manifest list location", + firstSnapshot.manifestListLocation(), + pending.get(0).getString(0)); + Assert.assertEquals( + "Pending delete should be a manifest list", "Manifest List", pending.get(0).getString(1)); Assert.assertEquals("Should not delete any files", 0, deletedFiles.size()); - Assert.assertSame("Multiple calls to expire should return the same deleted files", - pendingDeletes, action.expire()); + Assert.assertSame( + "Multiple calls to expire should return the same deleted files", + pendingDeletes, + action.expire()); } @Test public void testUseLocalIterator() { - table.newFastAppend() - .appendFile(FILE_A) - .commit(); + table.newFastAppend().appendFile(FILE_A).commit(); - table.newOverwrite() - .deleteFile(FILE_A) - .addFile(FILE_B) - .commit(); + table.newOverwrite().deleteFile(FILE_A).addFile(FILE_B).commit(); - table.newFastAppend() - .appendFile(FILE_C) - .commit(); + table.newFastAppend().appendFile(FILE_C).commit(); long end = rightAfterSnapshot(); int jobsBeforeStreamResults = spark.sparkContext().dagScheduler().nextJobId().get(); - withSQLConf(ImmutableMap.of("spark.sql.adaptive.enabled", "false"), () -> { - ExpireSnapshots.Result results = SparkActions.get().expireSnapshots(table).expireOlderThan(end) - .option("stream-results", "true").execute(); - - int jobsAfterStreamResults = spark.sparkContext().dagScheduler().nextJobId().get(); - int jobsRunDuringStreamResults = jobsAfterStreamResults - jobsBeforeStreamResults; - - checkExpirationResults(1L, 1L, 2L, results); - - Assert.assertEquals("Expected total number of jobs with stream-results should match the expected number", - 5L, jobsRunDuringStreamResults); - }); + withSQLConf( + ImmutableMap.of("spark.sql.adaptive.enabled", "false"), + () -> { + ExpireSnapshots.Result results = + SparkActions.get() + .expireSnapshots(table) + .expireOlderThan(end) + 
.option("stream-results", "true") + .execute(); + + int jobsAfterStreamResults = spark.sparkContext().dagScheduler().nextJobId().get(); + int jobsRunDuringStreamResults = jobsAfterStreamResults - jobsBeforeStreamResults; + + checkExpirationResults(1L, 1L, 2L, results); + + Assert.assertEquals( + "Expected total number of jobs with stream-results should match the expected number", + 5L, + jobsRunDuringStreamResults); + }); } } diff --git a/spark/v3.1/spark/src/test/java/org/apache/iceberg/spark/actions/TestRemoveOrphanFilesAction.java b/spark/v3.1/spark/src/test/java/org/apache/iceberg/spark/actions/TestRemoveOrphanFilesAction.java index 17fb891a89c8..70f119c45aaa 100644 --- a/spark/v3.1/spark/src/test/java/org/apache/iceberg/spark/actions/TestRemoveOrphanFilesAction.java +++ b/spark/v3.1/spark/src/test/java/org/apache/iceberg/spark/actions/TestRemoveOrphanFilesAction.java @@ -16,9 +16,10 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.actions; +import static org.apache.iceberg.types.Types.NestedField.optional; + import java.io.File; import java.io.IOException; import java.util.Arrays; @@ -64,23 +65,18 @@ import org.junit.Test; import org.junit.rules.TemporaryFolder; -import static org.apache.iceberg.types.Types.NestedField.optional; - public abstract class TestRemoveOrphanFilesAction extends SparkTestBase { private static final HadoopTables TABLES = new HadoopTables(new Configuration()); - protected static final Schema SCHEMA = new Schema( - optional(1, "c1", Types.IntegerType.get()), - optional(2, "c2", Types.StringType.get()), - optional(3, "c3", Types.StringType.get()) - ); - protected static final PartitionSpec SPEC = PartitionSpec.builderFor(SCHEMA) - .truncate("c2", 2) - .identity("c3") - .build(); - - @Rule - public TemporaryFolder temp = new TemporaryFolder(); + protected static final Schema SCHEMA = + new Schema( + optional(1, "c1", Types.IntegerType.get()), + optional(2, "c2", Types.StringType.get()), + optional(3, "c3", Types.StringType.get())); + protected static final PartitionSpec SPEC = + PartitionSpec.builderFor(SCHEMA).truncate("c2", 2).identity("c3").build(); + + @Rule public TemporaryFolder temp = new TemporaryFolder(); private File tableDir = null; protected String tableLocation = null; @@ -92,41 +88,37 @@ public void setupTableLocation() throws Exception { @Test public void testDryRun() throws IOException, InterruptedException { - Table table = TABLES.create(SCHEMA, PartitionSpec.unpartitioned(), Maps.newHashMap(), tableLocation); + Table table = + TABLES.create(SCHEMA, PartitionSpec.unpartitioned(), Maps.newHashMap(), tableLocation); - List records = Lists.newArrayList( - new ThreeColumnRecord(1, "AAAAAAAAAA", "AAAA") - ); + List records = + Lists.newArrayList(new ThreeColumnRecord(1, "AAAAAAAAAA", "AAAA")); Dataset df = spark.createDataFrame(records, ThreeColumnRecord.class).coalesce(1); - df.select("c1", "c2", "c3") - .write() - .format("iceberg") - .mode("append") - .save(tableLocation); + df.select("c1", "c2", "c3").write().format("iceberg").mode("append").save(tableLocation); - df.select("c1", "c2", "c3") - .write() - .format("iceberg") - .mode("append") - .save(tableLocation); + df.select("c1", "c2", "c3").write().format("iceberg").mode("append").save(tableLocation); - List validFiles = spark.read().format("iceberg") - .load(tableLocation + "#files") - .select("file_path") - .as(Encoders.STRING()) - .collectAsList(); + List validFiles = + spark + .read() + .format("iceberg") + 
.load(tableLocation + "#files") + .select("file_path") + .as(Encoders.STRING()) + .collectAsList(); Assert.assertEquals("Should be 2 valid files", 2, validFiles.size()); df.write().mode("append").parquet(tableLocation + "/data"); Path dataPath = new Path(tableLocation + "/data"); FileSystem fs = dataPath.getFileSystem(spark.sessionState().newHadoopConf()); - List allFiles = Arrays.stream(fs.listStatus(dataPath, HiddenPathFilter.get())) - .filter(FileStatus::isFile) - .map(file -> file.getPath().toString()) - .collect(Collectors.toList()); + List allFiles = + Arrays.stream(fs.listStatus(dataPath, HiddenPathFilter.get())) + .filter(FileStatus::isFile) + .map(file -> file.getPath().toString()) + .collect(Collectors.toList()); Assert.assertEquals("Should be 3 files", 3, allFiles.size()); List invalidFiles = Lists.newArrayList(allFiles); @@ -138,32 +130,34 @@ public void testDryRun() throws IOException, InterruptedException { SparkActions actions = SparkActions.get(); - DeleteOrphanFiles.Result result1 = actions.deleteOrphanFiles(table) - .deleteWith(s -> { }) - .execute(); - Assert.assertTrue("Default olderThan interval should be safe", Iterables.isEmpty(result1.orphanFileLocations())); + DeleteOrphanFiles.Result result1 = + actions.deleteOrphanFiles(table).deleteWith(s -> {}).execute(); + Assert.assertTrue( + "Default olderThan interval should be safe", + Iterables.isEmpty(result1.orphanFileLocations())); - DeleteOrphanFiles.Result result2 = actions.deleteOrphanFiles(table) - .olderThan(System.currentTimeMillis()) - .deleteWith(s -> { }) - .execute(); + DeleteOrphanFiles.Result result2 = + actions + .deleteOrphanFiles(table) + .olderThan(System.currentTimeMillis()) + .deleteWith(s -> {}) + .execute(); Assert.assertEquals("Action should find 1 file", invalidFiles, result2.orphanFileLocations()); Assert.assertTrue("Invalid file should be present", fs.exists(new Path(invalidFiles.get(0)))); - DeleteOrphanFiles.Result result3 = actions.deleteOrphanFiles(table) - .olderThan(System.currentTimeMillis()) - .execute(); + DeleteOrphanFiles.Result result3 = + actions.deleteOrphanFiles(table).olderThan(System.currentTimeMillis()).execute(); Assert.assertEquals("Action should delete 1 file", invalidFiles, result3.orphanFileLocations()); - Assert.assertFalse("Invalid file should not be present", fs.exists(new Path(invalidFiles.get(0)))); + Assert.assertFalse( + "Invalid file should not be present", fs.exists(new Path(invalidFiles.get(0)))); List expectedRecords = Lists.newArrayList(); expectedRecords.addAll(records); expectedRecords.addAll(records); Dataset resultDF = spark.read().format("iceberg").load(tableLocation); - List actualRecords = resultDF - .as(Encoders.bean(ThreeColumnRecord.class)) - .collectAsList(); + List actualRecords = + resultDF.as(Encoders.bean(ThreeColumnRecord.class)).collectAsList(); Assert.assertEquals("Rows must match", expectedRecords, actualRecords); } @@ -171,36 +165,22 @@ public void testDryRun() throws IOException, InterruptedException { public void testAllValidFilesAreKept() throws IOException, InterruptedException { Table table = TABLES.create(SCHEMA, SPEC, Maps.newHashMap(), tableLocation); - List records1 = Lists.newArrayList( - new ThreeColumnRecord(1, "AAAAAAAAAA", "AAAA") - ); + List records1 = + Lists.newArrayList(new ThreeColumnRecord(1, "AAAAAAAAAA", "AAAA")); Dataset df1 = spark.createDataFrame(records1, ThreeColumnRecord.class).coalesce(1); // original append - df1.select("c1", "c2", "c3") - .write() - .format("iceberg") - .mode("append") - .save(tableLocation); + 
df1.select("c1", "c2", "c3").write().format("iceberg").mode("append").save(tableLocation); - List records2 = Lists.newArrayList( - new ThreeColumnRecord(2, "AAAAAAAAAA", "AAAA") - ); + List records2 = + Lists.newArrayList(new ThreeColumnRecord(2, "AAAAAAAAAA", "AAAA")); Dataset df2 = spark.createDataFrame(records2, ThreeColumnRecord.class).coalesce(1); // dynamic partition overwrite - df2.select("c1", "c2", "c3") - .write() - .format("iceberg") - .mode("overwrite") - .save(tableLocation); + df2.select("c1", "c2", "c3").write().format("iceberg").mode("overwrite").save(tableLocation); // second append - df2.select("c1", "c2", "c3") - .write() - .format("iceberg") - .mode("append") - .save(tableLocation); + df2.select("c1", "c2", "c3").write().format("iceberg").mode("append").save(tableLocation); List snapshots = Lists.newArrayList(table.snapshots()); @@ -223,9 +203,8 @@ public void testAllValidFilesAreKept() throws IOException, InterruptedException SparkActions actions = SparkActions.get(); - DeleteOrphanFiles.Result result = actions.deleteOrphanFiles(table) - .olderThan(System.currentTimeMillis()) - .execute(); + DeleteOrphanFiles.Result result = + actions.deleteOrphanFiles(table).olderThan(System.currentTimeMillis()).execute(); Assert.assertEquals("Should delete 4 files", 4, Iterables.size(result.orphanFileLocations())); @@ -249,36 +228,22 @@ public void testAllValidFilesAreKept() throws IOException, InterruptedException public void orphanedFileRemovedWithParallelTasks() throws InterruptedException, IOException { Table table = TABLES.create(SCHEMA, SPEC, Maps.newHashMap(), tableLocation); - List records1 = Lists.newArrayList( - new ThreeColumnRecord(1, "AAAAAAAAAA", "AAAA") - ); + List records1 = + Lists.newArrayList(new ThreeColumnRecord(1, "AAAAAAAAAA", "AAAA")); Dataset df1 = spark.createDataFrame(records1, ThreeColumnRecord.class).coalesce(1); // original append - df1.select("c1", "c2", "c3") - .write() - .format("iceberg") - .mode("append") - .save(tableLocation); + df1.select("c1", "c2", "c3").write().format("iceberg").mode("append").save(tableLocation); - List records2 = Lists.newArrayList( - new ThreeColumnRecord(2, "AAAAAAAAAA", "AAAA") - ); + List records2 = + Lists.newArrayList(new ThreeColumnRecord(2, "AAAAAAAAAA", "AAAA")); Dataset df2 = spark.createDataFrame(records2, ThreeColumnRecord.class).coalesce(1); // dynamic partition overwrite - df2.select("c1", "c2", "c3") - .write() - .format("iceberg") - .mode("overwrite") - .save(tableLocation); + df2.select("c1", "c2", "c3").write().format("iceberg").mode("overwrite").save(tableLocation); // second append - df2.select("c1", "c2", "c3") - .write() - .format("iceberg") - .mode("append") - .save(tableLocation); + df2.select("c1", "c2", "c3").write().format("iceberg").mode("append").save(tableLocation); df2.coalesce(1).write().mode("append").parquet(tableLocation + "/data"); df2.coalesce(1).write().mode("append").parquet(tableLocation + "/data/c2_trunc=AA"); @@ -292,25 +257,34 @@ public void orphanedFileRemovedWithParallelTasks() throws InterruptedException, Set deleteThreads = ConcurrentHashMap.newKeySet(); AtomicInteger deleteThreadsIndex = new AtomicInteger(0); - ExecutorService executorService = Executors.newFixedThreadPool(4, runnable -> { - Thread thread = new Thread(runnable); - thread.setName("remove-orphan-" + deleteThreadsIndex.getAndIncrement()); - thread.setDaemon(true); - return thread; - }); - - DeleteOrphanFiles.Result result = SparkActions.get().deleteOrphanFiles(table) + ExecutorService executorService = + 
Executors.newFixedThreadPool( + 4, + runnable -> { + Thread thread = new Thread(runnable); + thread.setName("remove-orphan-" + deleteThreadsIndex.getAndIncrement()); + thread.setDaemon(true); + return thread; + }); + + DeleteOrphanFiles.Result result = + SparkActions.get() + .deleteOrphanFiles(table) .executeDeleteWith(executorService) .olderThan(System.currentTimeMillis()) - .deleteWith(file -> { - deleteThreads.add(Thread.currentThread().getName()); - deletedFiles.add(file); - }) + .deleteWith( + file -> { + deleteThreads.add(Thread.currentThread().getName()); + deletedFiles.add(file); + }) .execute(); - // Verifies that the delete methods ran in the threads created by the provided ExecutorService ThreadFactory - Assert.assertEquals(deleteThreads, - Sets.newHashSet("remove-orphan-0", "remove-orphan-1", "remove-orphan-2", "remove-orphan-3")); + // Verifies that the delete methods ran in the threads created by the provided ExecutorService + // ThreadFactory + Assert.assertEquals( + deleteThreads, + Sets.newHashSet( + "remove-orphan-0", "remove-orphan-1", "remove-orphan-2", "remove-orphan-3")); Assert.assertEquals("Should delete 4 files", 4, deletedFiles.size()); } @@ -321,31 +295,21 @@ public void testWapFilesAreKept() throws InterruptedException { props.put(TableProperties.WRITE_AUDIT_PUBLISH_ENABLED, "true"); Table table = TABLES.create(SCHEMA, SPEC, props, tableLocation); - List records = Lists.newArrayList( - new ThreeColumnRecord(1, "AAAAAAAAAA", "AAAA") - ); + List records = + Lists.newArrayList(new ThreeColumnRecord(1, "AAAAAAAAAA", "AAAA")); Dataset df = spark.createDataFrame(records, ThreeColumnRecord.class); // normal write - df.select("c1", "c2", "c3") - .write() - .format("iceberg") - .mode("append") - .save(tableLocation); + df.select("c1", "c2", "c3").write().format("iceberg").mode("append").save(tableLocation); spark.conf().set("spark.wap.id", "1"); // wap write - df.select("c1", "c2", "c3") - .write() - .format("iceberg") - .mode("append") - .save(tableLocation); + df.select("c1", "c2", "c3").write().format("iceberg").mode("append").save(tableLocation); Dataset resultDF = spark.read().format("iceberg").load(tableLocation); - List actualRecords = resultDF - .as(Encoders.bean(ThreeColumnRecord.class)) - .collectAsList(); + List actualRecords = + resultDF.as(Encoders.bean(ThreeColumnRecord.class)).collectAsList(); Assert.assertEquals("Should not return data from the staged snapshot", records, actualRecords); // sleep for 1 second to unsure files will be old enough @@ -353,11 +317,11 @@ public void testWapFilesAreKept() throws InterruptedException { SparkActions actions = SparkActions.get(); - DeleteOrphanFiles.Result result = actions.deleteOrphanFiles(table) - .olderThan(System.currentTimeMillis()) - .execute(); + DeleteOrphanFiles.Result result = + actions.deleteOrphanFiles(table).olderThan(System.currentTimeMillis()).execute(); - Assert.assertTrue("Should not delete any files", Iterables.isEmpty(result.orphanFileLocations())); + Assert.assertTrue( + "Should not delete any files", Iterables.isEmpty(result.orphanFileLocations())); } @Test @@ -367,16 +331,11 @@ public void testMetadataFolderIsIntact() throws InterruptedException { props.put(TableProperties.WRITE_DATA_LOCATION, tableLocation); Table table = TABLES.create(SCHEMA, SPEC, props, tableLocation); - List records = Lists.newArrayList( - new ThreeColumnRecord(1, "AAAAAAAAAA", "AAAA") - ); + List records = + Lists.newArrayList(new ThreeColumnRecord(1, "AAAAAAAAAA", "AAAA")); Dataset df = spark.createDataFrame(records, 
ThreeColumnRecord.class).coalesce(1); - df.select("c1", "c2", "c3") - .write() - .format("iceberg") - .mode("append") - .save(tableLocation); + df.select("c1", "c2", "c3").write().format("iceberg").mode("append").save(tableLocation); df.write().mode("append").parquet(tableLocation + "/c2_trunc=AA/c3=AAAA"); @@ -385,16 +344,14 @@ public void testMetadataFolderIsIntact() throws InterruptedException { SparkActions actions = SparkActions.get(); - DeleteOrphanFiles.Result result = actions.deleteOrphanFiles(table) - .olderThan(System.currentTimeMillis()) - .execute(); + DeleteOrphanFiles.Result result = + actions.deleteOrphanFiles(table).olderThan(System.currentTimeMillis()).execute(); Assert.assertEquals("Should delete 1 file", 1, Iterables.size(result.orphanFileLocations())); Dataset resultDF = spark.read().format("iceberg").load(tableLocation); - List actualRecords = resultDF - .as(Encoders.bean(ThreeColumnRecord.class)) - .collectAsList(); + List actualRecords = + resultDF.as(Encoders.bean(ThreeColumnRecord.class)).collectAsList(); Assert.assertEquals("Rows must match", records, actualRecords); } @@ -402,16 +359,11 @@ public void testMetadataFolderIsIntact() throws InterruptedException { public void testOlderThanTimestamp() throws InterruptedException { Table table = TABLES.create(SCHEMA, SPEC, Maps.newHashMap(), tableLocation); - List records = Lists.newArrayList( - new ThreeColumnRecord(1, "AAAAAAAAAA", "AAAA") - ); + List records = + Lists.newArrayList(new ThreeColumnRecord(1, "AAAAAAAAAA", "AAAA")); Dataset df = spark.createDataFrame(records, ThreeColumnRecord.class).coalesce(1); - df.select("c1", "c2", "c3") - .write() - .format("iceberg") - .mode("append") - .save(tableLocation); + df.select("c1", "c2", "c3").write().format("iceberg").mode("append").save(tableLocation); df.write().mode("append").parquet(tableLocation + "/data/c2_trunc=AA/c3=AAAA"); df.write().mode("append").parquet(tableLocation + "/data/c2_trunc=AA/c3=AAAA"); @@ -426,11 +378,11 @@ public void testOlderThanTimestamp() throws InterruptedException { SparkActions actions = SparkActions.get(); - DeleteOrphanFiles.Result result = actions.deleteOrphanFiles(table) - .olderThan(timestamp) - .execute(); + DeleteOrphanFiles.Result result = + actions.deleteOrphanFiles(table).olderThan(timestamp).execute(); - Assert.assertEquals("Should delete only 2 files", 2, Iterables.size(result.orphanFileLocations())); + Assert.assertEquals( + "Should delete only 2 files", 2, Iterables.size(result.orphanFileLocations())); } @Test @@ -440,34 +392,26 @@ public void testRemoveUnreachableMetadataVersionFiles() throws InterruptedExcept props.put(TableProperties.METADATA_PREVIOUS_VERSIONS_MAX, "1"); Table table = TABLES.create(SCHEMA, SPEC, props, tableLocation); - List records = Lists.newArrayList( - new ThreeColumnRecord(1, "AAAAAAAAAA", "AAAA") - ); + List records = + Lists.newArrayList(new ThreeColumnRecord(1, "AAAAAAAAAA", "AAAA")); Dataset df = spark.createDataFrame(records, ThreeColumnRecord.class); - df.select("c1", "c2", "c3") - .write() - .format("iceberg") - .mode("append") - .save(tableLocation); + df.select("c1", "c2", "c3").write().format("iceberg").mode("append").save(tableLocation); - df.select("c1", "c2", "c3") - .write() - .format("iceberg") - .mode("append") - .save(tableLocation); + df.select("c1", "c2", "c3").write().format("iceberg").mode("append").save(tableLocation); // sleep for 1 second to unsure files will be old enough Thread.sleep(1000); SparkActions actions = SparkActions.get(); - DeleteOrphanFiles.Result result = 
actions.deleteOrphanFiles(table) - .olderThan(System.currentTimeMillis()) - .execute(); + DeleteOrphanFiles.Result result = + actions.deleteOrphanFiles(table).olderThan(System.currentTimeMillis()).execute(); Assert.assertEquals("Should delete 1 file", 1, Iterables.size(result.orphanFileLocations())); - Assert.assertTrue("Should remove v1 file", StreamSupport.stream(result.orphanFileLocations().spliterator(), false) + Assert.assertTrue( + "Should remove v1 file", + StreamSupport.stream(result.orphanFileLocations().spliterator(), false) .anyMatch(file -> file.contains("v1.metadata.json"))); List expectedRecords = Lists.newArrayList(); @@ -475,9 +419,8 @@ public void testRemoveUnreachableMetadataVersionFiles() throws InterruptedExcept expectedRecords.addAll(records); Dataset resultDF = spark.read().format("iceberg").load(tableLocation); - List actualRecords = resultDF - .as(Encoders.bean(ThreeColumnRecord.class)) - .collectAsList(); + List actualRecords = + resultDF.as(Encoders.bean(ThreeColumnRecord.class)).collectAsList(); Assert.assertEquals("Rows must match", expectedRecords, actualRecords); } @@ -492,27 +435,22 @@ public void testManyTopLevelPartitions() throws InterruptedException { Dataset df = spark.createDataFrame(records, ThreeColumnRecord.class); - df.select("c1", "c2", "c3") - .write() - .format("iceberg") - .mode("append") - .save(tableLocation); + df.select("c1", "c2", "c3").write().format("iceberg").mode("append").save(tableLocation); // sleep for 1 second to unsure files will be old enough Thread.sleep(1000); SparkActions actions = SparkActions.get(); - DeleteOrphanFiles.Result result = actions.deleteOrphanFiles(table) - .olderThan(System.currentTimeMillis()) - .execute(); + DeleteOrphanFiles.Result result = + actions.deleteOrphanFiles(table).olderThan(System.currentTimeMillis()).execute(); - Assert.assertTrue("Should not delete any files", Iterables.isEmpty(result.orphanFileLocations())); + Assert.assertTrue( + "Should not delete any files", Iterables.isEmpty(result.orphanFileLocations())); Dataset resultDF = spark.read().format("iceberg").load(tableLocation); - List actualRecords = resultDF - .as(Encoders.bean(ThreeColumnRecord.class)) - .collectAsList(); + List actualRecords = + resultDF.as(Encoders.bean(ThreeColumnRecord.class)).collectAsList(); Assert.assertEquals("Rows must match", records, actualRecords); } @@ -527,32 +465,29 @@ public void testManyLeafPartitions() throws InterruptedException { Dataset df = spark.createDataFrame(records, ThreeColumnRecord.class); - df.select("c1", "c2", "c3") - .write() - .format("iceberg") - .mode("append") - .save(tableLocation); + df.select("c1", "c2", "c3").write().format("iceberg").mode("append").save(tableLocation); // sleep for 1 second to unsure files will be old enough Thread.sleep(1000); SparkActions actions = SparkActions.get(); - DeleteOrphanFiles.Result result = actions.deleteOrphanFiles(table) - .olderThan(System.currentTimeMillis()) - .execute(); + DeleteOrphanFiles.Result result = + actions.deleteOrphanFiles(table).olderThan(System.currentTimeMillis()).execute(); - Assert.assertTrue("Should not delete any files", Iterables.isEmpty(result.orphanFileLocations())); + Assert.assertTrue( + "Should not delete any files", Iterables.isEmpty(result.orphanFileLocations())); Dataset resultDF = spark.read().format("iceberg").load(tableLocation); - List actualRecords = resultDF - .as(Encoders.bean(ThreeColumnRecord.class)) - .collectAsList(); + List actualRecords = + 
resultDF.as(Encoders.bean(ThreeColumnRecord.class)).collectAsList(); Assert.assertEquals("Rows must match", records, actualRecords); } private List snapshotFiles(long snapshotId) { - return spark.read().format("iceberg") + return spark + .read() + .format("iceberg") .option("snapshot-id", snapshotId) .load(tableLocation + "#files") .select("file_path") @@ -562,11 +497,12 @@ private List snapshotFiles(long snapshotId) { @Test public void testRemoveOrphanFilesWithRelativeFilePath() throws IOException, InterruptedException { - Table table = TABLES.create(SCHEMA, PartitionSpec.unpartitioned(), Maps.newHashMap(), tableDir.getAbsolutePath()); + Table table = + TABLES.create( + SCHEMA, PartitionSpec.unpartitioned(), Maps.newHashMap(), tableDir.getAbsolutePath()); - List records = Lists.newArrayList( - new ThreeColumnRecord(1, "AAAAAAAAAA", "AAAA") - ); + List records = + Lists.newArrayList(new ThreeColumnRecord(1, "AAAAAAAAAA", "AAAA")); Dataset df = spark.createDataFrame(records, ThreeColumnRecord.class).coalesce(1); @@ -576,11 +512,14 @@ public void testRemoveOrphanFilesWithRelativeFilePath() throws IOException, Inte .mode("append") .save(tableDir.getAbsolutePath()); - List validFiles = spark.read().format("iceberg") - .load(tableLocation + "#files") - .select("file_path") - .as(Encoders.STRING()) - .collectAsList(); + List validFiles = + spark + .read() + .format("iceberg") + .load(tableLocation + "#files") + .select("file_path") + .as(Encoders.STRING()) + .collectAsList(); Assert.assertEquals("Should be 1 valid files", 1, validFiles.size()); String validFile = validFiles.get(0); @@ -588,10 +527,11 @@ public void testRemoveOrphanFilesWithRelativeFilePath() throws IOException, Inte Path dataPath = new Path(tableLocation + "/data"); FileSystem fs = dataPath.getFileSystem(spark.sessionState().newHadoopConf()); - List allFiles = Arrays.stream(fs.listStatus(dataPath, HiddenPathFilter.get())) - .filter(FileStatus::isFile) - .map(file -> file.getPath().toString()) - .collect(Collectors.toList()); + List allFiles = + Arrays.stream(fs.listStatus(dataPath, HiddenPathFilter.get())) + .filter(FileStatus::isFile) + .map(file -> file.getPath().toString()) + .collect(Collectors.toList()); Assert.assertEquals("Should be 2 files", 2, allFiles.size()); List invalidFiles = Lists.newArrayList(allFiles); @@ -602,10 +542,12 @@ public void testRemoveOrphanFilesWithRelativeFilePath() throws IOException, Inte Thread.sleep(1000); SparkActions actions = SparkActions.get(); - DeleteOrphanFiles.Result result = actions.deleteOrphanFiles(table) - .olderThan(System.currentTimeMillis()) - .deleteWith(s -> { }) - .execute(); + DeleteOrphanFiles.Result result = + actions + .deleteOrphanFiles(table) + .olderThan(System.currentTimeMillis()) + .deleteWith(s -> {}) + .execute(); Assert.assertEquals("Action should find 1 file", invalidFiles, result.orphanFileLocations()); Assert.assertTrue("Invalid file should be present", fs.exists(new Path(invalidFiles.get(0)))); } @@ -618,18 +560,15 @@ public void testRemoveOrphanFilesWithHadoopCatalog() throws InterruptedException Namespace namespace = Namespace.of(namespaceName); TableIdentifier tableIdentifier = TableIdentifier.of(namespace, tableName); - Table table = catalog.createTable(tableIdentifier, SCHEMA, PartitionSpec.unpartitioned(), Maps.newHashMap()); + Table table = + catalog.createTable( + tableIdentifier, SCHEMA, PartitionSpec.unpartitioned(), Maps.newHashMap()); - List records = Lists.newArrayList( - new ThreeColumnRecord(1, "AAAAAAAAAA", "AAAA") - ); + List records = + 
Lists.newArrayList(new ThreeColumnRecord(1, "AAAAAAAAAA", "AAAA")); Dataset df = spark.createDataFrame(records, ThreeColumnRecord.class).coalesce(1); - df.select("c1", "c2", "c3") - .write() - .format("iceberg") - .mode("append") - .save(table.location()); + df.select("c1", "c2", "c3").write().format("iceberg").mode("append").save(table.location()); df.write().mode("append").parquet(table.location() + "/data"); @@ -638,28 +577,30 @@ public void testRemoveOrphanFilesWithHadoopCatalog() throws InterruptedException table.refresh(); - DeleteOrphanFiles.Result result = SparkActions.get() - .deleteOrphanFiles(table) - .olderThan(System.currentTimeMillis()) - .execute(); + DeleteOrphanFiles.Result result = + SparkActions.get().deleteOrphanFiles(table).olderThan(System.currentTimeMillis()).execute(); - Assert.assertEquals("Should delete only 1 files", 1, Iterables.size(result.orphanFileLocations())); + Assert.assertEquals( + "Should delete only 1 files", 1, Iterables.size(result.orphanFileLocations())); Dataset resultDF = spark.read().format("iceberg").load(table.location()); - List actualRecords = resultDF - .as(Encoders.bean(ThreeColumnRecord.class)) - .collectAsList(); + List actualRecords = + resultDF.as(Encoders.bean(ThreeColumnRecord.class)).collectAsList(); Assert.assertEquals("Rows must match", records, actualRecords); } @Test public void testHiveCatalogTable() throws IOException { - Table table = catalog.createTable(TableIdentifier.of("default", "hivetestorphan"), SCHEMA, SPEC, tableLocation, - Maps.newHashMap()); + Table table = + catalog.createTable( + TableIdentifier.of("default", "hivetestorphan"), + SCHEMA, + SPEC, + tableLocation, + Maps.newHashMap()); - List records = Lists.newArrayList( - new ThreeColumnRecord(1, "AAAAAAAAAA", "AAAA") - ); + List records = + Lists.newArrayList(new ThreeColumnRecord(1, "AAAAAAAAAA", "AAAA")); Dataset df = spark.createDataFrame(records, ThreeColumnRecord.class).coalesce(1); @@ -672,35 +613,35 @@ public void testHiveCatalogTable() throws IOException { String location = table.location().replaceFirst("file:", ""); new File(location + "/data/trashfile").createNewFile(); - DeleteOrphanFiles.Result result = SparkActions.get().deleteOrphanFiles(table) - .olderThan(System.currentTimeMillis() + 1000).execute(); - Assert.assertTrue("trash file should be removed", + DeleteOrphanFiles.Result result = + SparkActions.get() + .deleteOrphanFiles(table) + .olderThan(System.currentTimeMillis() + 1000) + .execute(); + Assert.assertTrue( + "trash file should be removed", StreamSupport.stream(result.orphanFileLocations().spliterator(), false) .anyMatch(file -> file.contains("file:" + location + "data/trashfile"))); } @Test public void testGarbageCollectionDisabled() { - Table table = TABLES.create(SCHEMA, PartitionSpec.unpartitioned(), Maps.newHashMap(), tableLocation); + Table table = + TABLES.create(SCHEMA, PartitionSpec.unpartitioned(), Maps.newHashMap(), tableLocation); - List records = Lists.newArrayList( - new ThreeColumnRecord(1, "AAAAAAAAAA", "AAAA") - ); + List records = + Lists.newArrayList(new ThreeColumnRecord(1, "AAAAAAAAAA", "AAAA")); Dataset df = spark.createDataFrame(records, ThreeColumnRecord.class).coalesce(1); - df.select("c1", "c2", "c3") - .write() - .format("iceberg") - .mode("append") - .save(tableLocation); + df.select("c1", "c2", "c3").write().format("iceberg").mode("append").save(tableLocation); - table.updateProperties() - .set(TableProperties.GC_ENABLED, "false") - .commit(); + table.updateProperties().set(TableProperties.GC_ENABLED, 
"false").commit(); - AssertHelpers.assertThrows("Should complain about removing orphan files", - ValidationException.class, "Cannot remove orphan files: GC is disabled", + AssertHelpers.assertThrows( + "Should complain about removing orphan files", + ValidationException.class, + "Cannot remove orphan files: GC is disabled", () -> SparkActions.get().deleteOrphanFiles(table).execute()); } } diff --git a/spark/v3.1/spark/src/test/java/org/apache/iceberg/spark/actions/TestRemoveOrphanFilesAction3.java b/spark/v3.1/spark/src/test/java/org/apache/iceberg/spark/actions/TestRemoveOrphanFilesAction3.java index 77eb23a6dffc..e3699eaeded1 100644 --- a/spark/v3.1/spark/src/test/java/org/apache/iceberg/spark/actions/TestRemoveOrphanFilesAction3.java +++ b/spark/v3.1/spark/src/test/java/org/apache/iceberg/spark/actions/TestRemoveOrphanFilesAction3.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.actions; import java.io.File; @@ -54,9 +53,13 @@ public void testSparkCatalogTable() throws Exception { String location = table.table().location().replaceFirst("file:", ""); new File(location + "/data/trashfile").createNewFile(); - DeleteOrphanFiles.Result results = SparkActions.get().deleteOrphanFiles(table.table()) - .olderThan(System.currentTimeMillis() + 1000).execute(); - Assert.assertTrue("trash file should be removed", + DeleteOrphanFiles.Result results = + SparkActions.get() + .deleteOrphanFiles(table.table()) + .olderThan(System.currentTimeMillis() + 1000) + .execute(); + Assert.assertTrue( + "trash file should be removed", StreamSupport.stream(results.orphanFileLocations().spliterator(), false) .anyMatch(file -> file.contains("file:" + location + "/data/trashfile"))); } @@ -80,9 +83,13 @@ public void testSparkCatalogNamedHadoopTable() throws Exception { String location = table.table().location().replaceFirst("file:", ""); new File(location + "/data/trashfile").createNewFile(); - DeleteOrphanFiles.Result results = SparkActions.get().deleteOrphanFiles(table.table()) - .olderThan(System.currentTimeMillis() + 1000).execute(); - Assert.assertTrue("trash file should be removed", + DeleteOrphanFiles.Result results = + SparkActions.get() + .deleteOrphanFiles(table.table()) + .olderThan(System.currentTimeMillis() + 1000) + .execute(); + Assert.assertTrue( + "trash file should be removed", StreamSupport.stream(results.orphanFileLocations().spliterator(), false) .anyMatch(file -> file.contains("file:" + location + "/data/trashfile"))); } @@ -106,19 +113,26 @@ public void testSparkCatalogNamedHiveTable() throws Exception { String location = table.table().location().replaceFirst("file:", ""); new File(location + "/data/trashfile").createNewFile(); - DeleteOrphanFiles.Result results = SparkActions.get().deleteOrphanFiles(table.table()) - .olderThan(System.currentTimeMillis() + 1000).execute(); - Assert.assertTrue("trash file should be removed", + DeleteOrphanFiles.Result results = + SparkActions.get() + .deleteOrphanFiles(table.table()) + .olderThan(System.currentTimeMillis() + 1000) + .execute(); + Assert.assertTrue( + "trash file should be removed", StreamSupport.stream(results.orphanFileLocations().spliterator(), false) .anyMatch(file -> file.contains("file:" + location + "/data/trashfile"))); } @Test public void testSparkSessionCatalogHadoopTable() throws Exception { - spark.conf().set("spark.sql.catalog.spark_catalog", "org.apache.iceberg.spark.SparkSessionCatalog"); + spark + .conf() + 
.set("spark.sql.catalog.spark_catalog", "org.apache.iceberg.spark.SparkSessionCatalog"); spark.conf().set("spark.sql.catalog.spark_catalog.type", "hadoop"); spark.conf().set("spark.sql.catalog.spark_catalog.warehouse", tableLocation); - SparkSessionCatalog cat = (SparkSessionCatalog) spark.sessionState().catalogManager().v2SessionCatalog(); + SparkSessionCatalog cat = + (SparkSessionCatalog) spark.sessionState().catalogManager().v2SessionCatalog(); String[] database = {"default"}; Identifier id = Identifier.of(database, "table"); @@ -132,18 +146,25 @@ public void testSparkSessionCatalogHadoopTable() throws Exception { String location = table.table().location().replaceFirst("file:", ""); new File(location + "/data/trashfile").createNewFile(); - DeleteOrphanFiles.Result results = SparkActions.get().deleteOrphanFiles(table.table()) - .olderThan(System.currentTimeMillis() + 1000).execute(); - Assert.assertTrue("trash file should be removed", + DeleteOrphanFiles.Result results = + SparkActions.get() + .deleteOrphanFiles(table.table()) + .olderThan(System.currentTimeMillis() + 1000) + .execute(); + Assert.assertTrue( + "trash file should be removed", StreamSupport.stream(results.orphanFileLocations().spliterator(), false) .anyMatch(file -> file.contains("file:" + location + "/data/trashfile"))); } @Test public void testSparkSessionCatalogHiveTable() throws Exception { - spark.conf().set("spark.sql.catalog.spark_catalog", "org.apache.iceberg.spark.SparkSessionCatalog"); + spark + .conf() + .set("spark.sql.catalog.spark_catalog", "org.apache.iceberg.spark.SparkSessionCatalog"); spark.conf().set("spark.sql.catalog.spark_catalog.type", "hive"); - SparkSessionCatalog cat = (SparkSessionCatalog) spark.sessionState().catalogManager().v2SessionCatalog(); + SparkSessionCatalog cat = + (SparkSessionCatalog) spark.sessionState().catalogManager().v2SessionCatalog(); String[] database = {"default"}; Identifier id = Identifier.of(database, "sessioncattest"); @@ -158,9 +179,13 @@ public void testSparkSessionCatalogHiveTable() throws Exception { String location = table.table().location().replaceFirst("file:", ""); new File(location + "/data/trashfile").createNewFile(); - DeleteOrphanFiles.Result results = SparkActions.get().deleteOrphanFiles(table.table()) - .olderThan(System.currentTimeMillis() + 1000).execute(); - Assert.assertTrue("trash file should be removed", + DeleteOrphanFiles.Result results = + SparkActions.get() + .deleteOrphanFiles(table.table()) + .olderThan(System.currentTimeMillis() + 1000) + .execute(); + Assert.assertTrue( + "trash file should be removed", StreamSupport.stream(results.orphanFileLocations().spliterator(), false) .anyMatch(file -> file.contains("file:" + location + "/data/trashfile"))); } @@ -171,5 +196,4 @@ public void resetSparkSessionCatalog() throws Exception { spark.conf().unset("spark.sql.catalog.spark_catalog.type"); spark.conf().unset("spark.sql.catalog.spark_catalog.warehouse"); } - } diff --git a/spark/v3.1/spark/src/test/java/org/apache/iceberg/spark/actions/TestRewriteDataFilesAction.java b/spark/v3.1/spark/src/test/java/org/apache/iceberg/spark/actions/TestRewriteDataFilesAction.java index b057b8e62ba7..3bc07093714f 100644 --- a/spark/v3.1/spark/src/test/java/org/apache/iceberg/spark/actions/TestRewriteDataFilesAction.java +++ b/spark/v3.1/spark/src/test/java/org/apache/iceberg/spark/actions/TestRewriteDataFilesAction.java @@ -16,9 +16,17 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.spark.actions; +import static org.apache.iceberg.types.Types.NestedField.optional; +import static org.mockito.ArgumentMatchers.any; +import static org.mockito.ArgumentMatchers.argThat; +import static org.mockito.Mockito.doAnswer; +import static org.mockito.Mockito.doCallRealMethod; +import static org.mockito.Mockito.doReturn; +import static org.mockito.Mockito.doThrow; +import static org.mockito.Mockito.spy; + import java.io.File; import java.io.IOException; import java.io.UncheckedIOException; @@ -95,28 +103,18 @@ import org.mockito.ArgumentMatcher; import org.mockito.Mockito; -import static org.apache.iceberg.types.Types.NestedField.optional; -import static org.mockito.ArgumentMatchers.any; -import static org.mockito.ArgumentMatchers.argThat; -import static org.mockito.Mockito.doAnswer; -import static org.mockito.Mockito.doCallRealMethod; -import static org.mockito.Mockito.doReturn; -import static org.mockito.Mockito.doThrow; -import static org.mockito.Mockito.spy; - public class TestRewriteDataFilesAction extends SparkTestBase { private static final int SCALE = 400000; private static final HadoopTables TABLES = new HadoopTables(new Configuration()); - private static final Schema SCHEMA = new Schema( - optional(1, "c1", Types.IntegerType.get()), - optional(2, "c2", Types.StringType.get()), - optional(3, "c3", Types.StringType.get()) - ); + private static final Schema SCHEMA = + new Schema( + optional(1, "c1", Types.IntegerType.get()), + optional(2, "c2", Types.StringType.get()), + optional(3, "c3", Types.StringType.get())); - @Rule - public TemporaryFolder temp = new TemporaryFolder(); + @Rule public TemporaryFolder temp = new TemporaryFolder(); private final FileRewriteCoordinator coordinator = FileRewriteCoordinator.get(); private final FileScanTaskSetManager manager = FileScanTaskSetManager.get(); @@ -185,10 +183,11 @@ public void testBinPackWithFilter() { shouldHaveFiles(table, 8); List expectedRecords = currentData(); - Result result = basicRewrite(table) - .filter(Expressions.equal("c1", 1)) - .filter(Expressions.startsWith("c2", "foo")) - .execute(); + Result result = + basicRewrite(table) + .filter(Expressions.equal("c1", 1)) + .filter(Expressions.startsWith("c2", "foo")) + .execute(); Assert.assertEquals("Action should rewrite 2 data files", 2, result.rewrittenDataFilesCount()); Assert.assertEquals("Action should add 1 data file", 1, result.addedDataFilesCount()); @@ -207,32 +206,33 @@ public void testBinPackWithDeletes() throws Exception { table.refresh(); CloseableIterable tasks = table.newScan().planFiles(); - List dataFiles = Lists.newArrayList(CloseableIterable.transform(tasks, FileScanTask::file)); + List dataFiles = + Lists.newArrayList(CloseableIterable.transform(tasks, FileScanTask::file)); int total = (int) dataFiles.stream().mapToLong(ContentFile::recordCount).sum(); RowDelta rowDelta = table.newRowDelta(); // add 1 delete file for data files 0, 1, 2 for (int i = 0; i < 3; i++) { - writePosDeletesToFile(table, dataFiles.get(i), 1) - .forEach(rowDelta::addDeletes); + writePosDeletesToFile(table, dataFiles.get(i), 1).forEach(rowDelta::addDeletes); } // add 2 delete files for data files 3, 4 for (int i = 3; i < 5; i++) { - writePosDeletesToFile(table, dataFiles.get(i), 2) - .forEach(rowDelta::addDeletes); + writePosDeletesToFile(table, dataFiles.get(i), 2).forEach(rowDelta::addDeletes); } rowDelta.commit(); table.refresh(); List expectedRecords = currentData(); - Result result = actions().rewriteDataFiles(table) - // do not include any 
file based on bin pack file size configs - .option(BinPackStrategy.MIN_FILE_SIZE_BYTES, "0") - .option(RewriteDataFiles.TARGET_FILE_SIZE_BYTES, Long.toString(Long.MAX_VALUE - 1)) - .option(BinPackStrategy.MAX_FILE_SIZE_BYTES, Long.toString(Long.MAX_VALUE)) - .option(BinPackStrategy.DELETE_FILE_THRESHOLD, "2") - .execute(); + Result result = + actions() + .rewriteDataFiles(table) + // do not include any file based on bin pack file size configs + .option(BinPackStrategy.MIN_FILE_SIZE_BYTES, "0") + .option(RewriteDataFiles.TARGET_FILE_SIZE_BYTES, Long.toString(Long.MAX_VALUE - 1)) + .option(BinPackStrategy.MAX_FILE_SIZE_BYTES, Long.toString(Long.MAX_VALUE)) + .option(BinPackStrategy.DELETE_FILE_THRESHOLD, "2") + .execute(); Assert.assertEquals("Action should rewrite 2 data files", 2, result.rewrittenDataFilesCount()); List actualRecords = currentData(); @@ -249,20 +249,22 @@ public void testBinPackWithDeleteAllData() { table.refresh(); CloseableIterable tasks = table.newScan().planFiles(); - List dataFiles = Lists.newArrayList(CloseableIterable.transform(tasks, FileScanTask::file)); + List dataFiles = + Lists.newArrayList(CloseableIterable.transform(tasks, FileScanTask::file)); int total = (int) dataFiles.stream().mapToLong(ContentFile::recordCount).sum(); RowDelta rowDelta = table.newRowDelta(); // remove all data - writePosDeletesToFile(table, dataFiles.get(0), total) - .forEach(rowDelta::addDeletes); + writePosDeletesToFile(table, dataFiles.get(0), total).forEach(rowDelta::addDeletes); rowDelta.commit(); table.refresh(); List expectedRecords = currentData(); - Result result = actions().rewriteDataFiles(table) - .option(BinPackStrategy.DELETE_FILE_THRESHOLD, "1") - .execute(); + Result result = + actions() + .rewriteDataFiles(table) + .option(BinPackStrategy.DELETE_FILE_THRESHOLD, "1") + .execute(); Assert.assertEquals("Action should rewrite 1 data files", 1, result.rewrittenDataFilesCount()); List actualRecords = currentData(); @@ -271,7 +273,8 @@ public void testBinPackWithDeleteAllData() { "Data manifest should not have existing data file", 0, (long) table.currentSnapshot().dataManifests(table.io()).get(0).existingFilesCount()); - Assert.assertEquals("Data manifest should have 1 delete data file", + Assert.assertEquals( + "Data manifest should have 1 delete data file", 1L, (long) table.currentSnapshot().dataManifests(table.io()).get(0).deletedFilesCount()); Assert.assertEquals( @@ -289,9 +292,8 @@ public void testBinPackWithStartingSequenceNumber() { table.refresh(); long oldSequenceNumber = table.currentSnapshot().sequenceNumber(); - Result result = basicRewrite(table) - .option(RewriteDataFiles.USE_STARTING_SEQUENCE_NUMBER, "true") - .execute(); + Result result = + basicRewrite(table).option(RewriteDataFiles.USE_STARTING_SEQUENCE_NUMBER, "true").execute(); Assert.assertEquals("Action should rewrite 8 data files", 8, result.rewrittenDataFilesCount()); Assert.assertEquals("Action should add 4 data file", 4, result.addedDataFilesCount()); @@ -300,13 +302,15 @@ public void testBinPackWithStartingSequenceNumber() { assertEquals("Rows must match", expectedRecords, actualRecords); table.refresh(); - Assert.assertTrue("Table sequence number should be incremented", + Assert.assertTrue( + "Table sequence number should be incremented", oldSequenceNumber < table.currentSnapshot().sequenceNumber()); Dataset rows = SparkTableUtil.loadMetadataTable(spark, table, MetadataTableType.ENTRIES); for (Row row : rows.collectAsList()) { if (row.getInt(0) == 1) { - Assert.assertEquals("Expect old sequence 
number for added entries", oldSequenceNumber, row.getLong(2)); + Assert.assertEquals( + "Expect old sequence number for added entries", oldSequenceNumber, row.getLong(2)); } } } @@ -320,9 +324,8 @@ public void testBinPackWithStartingSequenceNumberV1Compatibility() { long oldSequenceNumber = table.currentSnapshot().sequenceNumber(); Assert.assertEquals("Table sequence number should be 0", 0, oldSequenceNumber); - Result result = basicRewrite(table) - .option(RewriteDataFiles.USE_STARTING_SEQUENCE_NUMBER, "true") - .execute(); + Result result = + basicRewrite(table).option(RewriteDataFiles.USE_STARTING_SEQUENCE_NUMBER, "true").execute(); Assert.assertEquals("Action should rewrite 8 data files", 8, result.rewrittenDataFilesCount()); Assert.assertEquals("Action should add 4 data file", 4, result.addedDataFilesCount()); @@ -331,13 +334,15 @@ public void testBinPackWithStartingSequenceNumberV1Compatibility() { assertEquals("Rows must match", expectedRecords, actualRecords); table.refresh(); - Assert.assertEquals("Table sequence number should still be 0", - oldSequenceNumber, table.currentSnapshot().sequenceNumber()); + Assert.assertEquals( + "Table sequence number should still be 0", + oldSequenceNumber, + table.currentSnapshot().sequenceNumber()); Dataset rows = SparkTableUtil.loadMetadataTable(spark, table, MetadataTableType.ENTRIES); for (Row row : rows.collectAsList()) { - Assert.assertEquals("Expect sequence number 0 for all entries", - oldSequenceNumber, row.getLong(2)); + Assert.assertEquals( + "Expect sequence number 0 for all entries", oldSequenceNumber, row.getLong(2)); } } @@ -360,19 +365,15 @@ public void testRewriteLargeTableHasResiduals() { table.refresh(); - CloseableIterable tasks = table.newScan() - .ignoreResiduals() - .filter(Expressions.equal("c3", "0")) - .planFiles(); + CloseableIterable tasks = + table.newScan().ignoreResiduals().filter(Expressions.equal("c3", "0")).planFiles(); for (FileScanTask task : tasks) { Assert.assertEquals("Residuals must be ignored", Expressions.alwaysTrue(), task.residual()); } shouldHaveFiles(table, 2); - Result result = basicRewrite(table) - .filter(Expressions.equal("c3", "0")) - .execute(); + Result result = basicRewrite(table).filter(Expressions.equal("c3", "0")).execute(); Assert.assertEquals("Action should rewrite 2 data files", 2, result.rewrittenDataFilesCount()); Assert.assertEquals("Action should add 1 data file", 1, result.addedDataFilesCount()); @@ -389,10 +390,11 @@ public void testBinPackSplitLargeFile() { List expectedRecords = currentData(); long targetSize = testDataSize(table) / 2; - Result result = basicRewrite(table) - .option(RewriteDataFiles.TARGET_FILE_SIZE_BYTES, Long.toString(targetSize)) - .option(BinPackStrategy.MAX_FILE_SIZE_BYTES, Long.toString(targetSize * 2 - 2000)) - .execute(); + Result result = + basicRewrite(table) + .option(RewriteDataFiles.TARGET_FILE_SIZE_BYTES, Long.toString(targetSize)) + .option(BinPackStrategy.MAX_FILE_SIZE_BYTES, Long.toString(targetSize * 2 - 2000)) + .execute(); Assert.assertEquals("Action should delete 1 data files", 1, result.rewrittenDataFilesCount()); Assert.assertEquals("Action should add 2 data files", 2, result.addedDataFilesCount()); @@ -417,14 +419,16 @@ public void testBinPackCombineMixedFiles() { int targetSize = averageFileSize(table); - Result result = basicRewrite(table) - .option(RewriteDataFiles.TARGET_FILE_SIZE_BYTES, Integer.toString(targetSize + 1000)) - .option(BinPackStrategy.MAX_FILE_SIZE_BYTES, Integer.toString(targetSize + 80000)) - 
.option(BinPackStrategy.MIN_FILE_SIZE_BYTES, Integer.toString(targetSize - 1000)) - .execute(); + Result result = + basicRewrite(table) + .option(RewriteDataFiles.TARGET_FILE_SIZE_BYTES, Integer.toString(targetSize + 1000)) + .option(BinPackStrategy.MAX_FILE_SIZE_BYTES, Integer.toString(targetSize + 80000)) + .option(BinPackStrategy.MIN_FILE_SIZE_BYTES, Integer.toString(targetSize - 1000)) + .execute(); Assert.assertEquals("Action should delete 3 data files", 3, result.rewrittenDataFilesCount()); - // Should Split the big files into 3 pieces, one of which should be combined with the two smaller files + // Should Split the big files into 3 pieces, one of which should be combined with the two + // smaller files Assert.assertEquals("Action should add 3 data files", 3, result.addedDataFilesCount()); shouldHaveFiles(table, 3); @@ -442,11 +446,14 @@ public void testBinPackCombineMediumFiles() { int targetSize = ((int) testDataSize(table) / 3); // The test is to see if we can combine parts of files to make files of the correct size - Result result = basicRewrite(table) - .option(RewriteDataFiles.TARGET_FILE_SIZE_BYTES, Integer.toString(targetSize)) - .option(BinPackStrategy.MAX_FILE_SIZE_BYTES, Integer.toString((int) (targetSize * 1.8))) - .option(BinPackStrategy.MIN_FILE_SIZE_BYTES, Integer.toString(targetSize - 100)) // All files too small - .execute(); + Result result = + basicRewrite(table) + .option(RewriteDataFiles.TARGET_FILE_SIZE_BYTES, Integer.toString(targetSize)) + .option(BinPackStrategy.MAX_FILE_SIZE_BYTES, Integer.toString((int) (targetSize * 1.8))) + .option( + BinPackStrategy.MIN_FILE_SIZE_BYTES, + Integer.toString(targetSize - 100)) // All files too small + .execute(); Assert.assertEquals("Action should delete 4 data files", 4, result.rewrittenDataFilesCount()); Assert.assertEquals("Action should add 3 data files", 3, result.addedDataFilesCount()); @@ -468,7 +475,8 @@ public void testPartialProgressEnabled() { RewriteDataFiles.Result result = basicRewrite(table) .option(RewriteDataFiles.PARTIAL_PROGRESS_ENABLED, "true") - .option(RewriteDataFiles.MAX_FILE_GROUP_SIZE_BYTES, Integer.toString(fileSize * 2 + 1000)) + .option( + RewriteDataFiles.MAX_FILE_GROUP_SIZE_BYTES, Integer.toString(fileSize * 2 + 1000)) .option(RewriteDataFiles.PARTIAL_PROGRESS_MAX_COMMITS, "10") .execute(); @@ -493,7 +501,8 @@ public void testMultipleGroups() { // Perform a rewrite but only allow 2 files to be compacted at a time RewriteDataFiles.Result result = basicRewrite(table) - .option(RewriteDataFiles.MAX_FILE_GROUP_SIZE_BYTES, Integer.toString(fileSize * 2 + 1000)) + .option( + RewriteDataFiles.MAX_FILE_GROUP_SIZE_BYTES, Integer.toString(fileSize * 2 + 1000)) .option(BinPackStrategy.MIN_INPUT_FILES, "1") .execute(); @@ -518,7 +527,8 @@ public void testPartialProgressMaxCommits() { // Perform a rewrite but only allow 2 files to be compacted at a time RewriteDataFiles.Result result = basicRewrite(table) - .option(RewriteDataFiles.MAX_FILE_GROUP_SIZE_BYTES, Integer.toString(fileSize * 2 + 1000)) + .option( + RewriteDataFiles.MAX_FILE_GROUP_SIZE_BYTES, Integer.toString(fileSize * 2 + 1000)) .option(RewriteDataFiles.PARTIAL_PROGRESS_ENABLED, "true") .option(RewriteDataFiles.PARTIAL_PROGRESS_MAX_COMMITS, "3") .execute(); @@ -544,7 +554,9 @@ public void testSingleCommitWithRewriteFailure() { BaseRewriteDataFilesSparkAction realRewrite = (org.apache.iceberg.spark.actions.BaseRewriteDataFilesSparkAction) basicRewrite(table) - .option(RewriteDataFiles.MAX_FILE_GROUP_SIZE_BYTES, Integer.toString(fileSize * 2 + 
1000)); + .option( + RewriteDataFiles.MAX_FILE_GROUP_SIZE_BYTES, + Integer.toString(fileSize * 2 + 1000)); BaseRewriteDataFilesSparkAction spyRewrite = Mockito.spy(realRewrite); @@ -554,7 +566,9 @@ public void testSingleCommitWithRewriteFailure() { .when(spyRewrite) .rewriteFiles(any(), argThat(failGroup)); - AssertHelpers.assertThrows("Should fail entire rewrite if part fails", RuntimeException.class, + AssertHelpers.assertThrows( + "Should fail entire rewrite if part fails", + RuntimeException.class, () -> spyRewrite.execute()); table.refresh(); @@ -577,21 +591,21 @@ public void testSingleCommitWithCommitFailure() { BaseRewriteDataFilesSparkAction realRewrite = (org.apache.iceberg.spark.actions.BaseRewriteDataFilesSparkAction) basicRewrite(table) - .option(RewriteDataFiles.MAX_FILE_GROUP_SIZE_BYTES, Integer.toString(fileSize * 2 + 1000)); + .option( + RewriteDataFiles.MAX_FILE_GROUP_SIZE_BYTES, + Integer.toString(fileSize * 2 + 1000)); BaseRewriteDataFilesSparkAction spyRewrite = spy(realRewrite); RewriteDataFilesCommitManager util = spy(new RewriteDataFilesCommitManager(table)); // Fail to commit - doThrow(new RuntimeException("Commit Failure")) - .when(util) - .commitFileGroups(any()); + doThrow(new RuntimeException("Commit Failure")).when(util).commitFileGroups(any()); - doReturn(util) - .when(spyRewrite) - .commitManager(table.currentSnapshot().snapshotId()); + doReturn(util).when(spyRewrite).commitManager(table.currentSnapshot().snapshotId()); - AssertHelpers.assertThrows("Should fail entire rewrite if commit fails", RuntimeException.class, + AssertHelpers.assertThrows( + "Should fail entire rewrite if commit fails", + RuntimeException.class, () -> spyRewrite.execute()); table.refresh(); @@ -614,7 +628,9 @@ public void testParallelSingleCommitWithRewriteFailure() { BaseRewriteDataFilesSparkAction realRewrite = (org.apache.iceberg.spark.actions.BaseRewriteDataFilesSparkAction) basicRewrite(table) - .option(RewriteDataFiles.MAX_FILE_GROUP_SIZE_BYTES, Integer.toString(fileSize * 2 + 1000)) + .option( + RewriteDataFiles.MAX_FILE_GROUP_SIZE_BYTES, + Integer.toString(fileSize * 2 + 1000)) .option(RewriteDataFiles.MAX_CONCURRENT_FILE_GROUP_REWRITES, "3"); BaseRewriteDataFilesSparkAction spyRewrite = Mockito.spy(realRewrite); @@ -625,7 +641,9 @@ public void testParallelSingleCommitWithRewriteFailure() { .when(spyRewrite) .rewriteFiles(any(), argThat(failGroup)); - AssertHelpers.assertThrows("Should fail entire rewrite if part fails", RuntimeException.class, + AssertHelpers.assertThrows( + "Should fail entire rewrite if part fails", + RuntimeException.class, () -> spyRewrite.execute()); table.refresh(); @@ -648,7 +666,9 @@ public void testPartialProgressWithRewriteFailure() { BaseRewriteDataFilesSparkAction realRewrite = (org.apache.iceberg.spark.actions.BaseRewriteDataFilesSparkAction) basicRewrite(table) - .option(RewriteDataFiles.MAX_FILE_GROUP_SIZE_BYTES, Integer.toString(fileSize * 2 + 1000)) + .option( + RewriteDataFiles.MAX_FILE_GROUP_SIZE_BYTES, + Integer.toString(fileSize * 2 + 1000)) .option(RewriteDataFiles.PARTIAL_PROGRESS_ENABLED, "true") .option(RewriteDataFiles.PARTIAL_PROGRESS_MAX_COMMITS, "3"); @@ -686,7 +706,9 @@ public void testParallelPartialProgressWithRewriteFailure() { BaseRewriteDataFilesSparkAction realRewrite = (org.apache.iceberg.spark.actions.BaseRewriteDataFilesSparkAction) basicRewrite(table) - .option(RewriteDataFiles.MAX_FILE_GROUP_SIZE_BYTES, Integer.toString(fileSize * 2 + 1000)) + .option( + RewriteDataFiles.MAX_FILE_GROUP_SIZE_BYTES, + 
Integer.toString(fileSize * 2 + 1000)) .option(RewriteDataFiles.MAX_CONCURRENT_FILE_GROUP_REWRITES, "3") .option(RewriteDataFiles.PARTIAL_PROGRESS_ENABLED, "true") .option(RewriteDataFiles.PARTIAL_PROGRESS_MAX_COMMITS, "3"); @@ -725,7 +747,9 @@ public void testParallelPartialProgressWithCommitFailure() { BaseRewriteDataFilesSparkAction realRewrite = (org.apache.iceberg.spark.actions.BaseRewriteDataFilesSparkAction) basicRewrite(table) - .option(RewriteDataFiles.MAX_FILE_GROUP_SIZE_BYTES, Integer.toString(fileSize * 2 + 1000)) + .option( + RewriteDataFiles.MAX_FILE_GROUP_SIZE_BYTES, + Integer.toString(fileSize * 2 + 1000)) .option(RewriteDataFiles.MAX_CONCURRENT_FILE_GROUP_REWRITES, "3") .option(RewriteDataFiles.PARTIAL_PROGRESS_ENABLED, "true") .option(RewriteDataFiles.PARTIAL_PROGRESS_MAX_COMMITS, "3"); @@ -740,9 +764,7 @@ public void testParallelPartialProgressWithCommitFailure() { .when(util) .commitFileGroups(any()); - doReturn(util) - .when(spyRewrite) - .commitManager(table.currentSnapshot().snapshotId()); + doReturn(util).when(spyRewrite).commitManager(table.currentSnapshot().snapshotId()); RewriteDataFiles.Result result = spyRewrite.execute(); @@ -764,30 +786,32 @@ public void testParallelPartialProgressWithCommitFailure() { public void testInvalidOptions() { Table table = createTable(20); - AssertHelpers.assertThrows("No negative values for partial progress max commits", + AssertHelpers.assertThrows( + "No negative values for partial progress max commits", IllegalArgumentException.class, - () -> basicRewrite(table) - .option(RewriteDataFiles.PARTIAL_PROGRESS_ENABLED, "true") - .option(RewriteDataFiles.PARTIAL_PROGRESS_MAX_COMMITS, "-5") - .execute()); + () -> + basicRewrite(table) + .option(RewriteDataFiles.PARTIAL_PROGRESS_ENABLED, "true") + .option(RewriteDataFiles.PARTIAL_PROGRESS_MAX_COMMITS, "-5") + .execute()); - AssertHelpers.assertThrows("No negative values for max concurrent groups", + AssertHelpers.assertThrows( + "No negative values for max concurrent groups", IllegalArgumentException.class, - () -> basicRewrite(table) - .option(RewriteDataFiles.MAX_CONCURRENT_FILE_GROUP_REWRITES, "-5") - .execute()); + () -> + basicRewrite(table) + .option(RewriteDataFiles.MAX_CONCURRENT_FILE_GROUP_REWRITES, "-5") + .execute()); - AssertHelpers.assertThrows("No unknown options allowed", + AssertHelpers.assertThrows( + "No unknown options allowed", IllegalArgumentException.class, - () -> basicRewrite(table) - .option("foobarity", "-5") - .execute()); + () -> basicRewrite(table).option("foobarity", "-5").execute()); - AssertHelpers.assertThrows("Cannot set rewrite-job-order to foo", + AssertHelpers.assertThrows( + "Cannot set rewrite-job-order to foo", IllegalArgumentException.class, - () -> basicRewrite(table) - .option(RewriteDataFiles.REWRITE_JOB_ORDER, "foo") - .execute()); + () -> basicRewrite(table).option(RewriteDataFiles.REWRITE_JOB_ORDER, "foo").execute()); } @Test @@ -805,7 +829,8 @@ public void testSortMultipleGroups() { basicRewrite(table) .sort() .option(SortStrategy.REWRITE_ALL, "true") - .option(RewriteDataFiles.MAX_FILE_GROUP_SIZE_BYTES, Integer.toString(fileSize * 2 + 1000)) + .option( + RewriteDataFiles.MAX_FILE_GROUP_SIZE_BYTES, Integer.toString(fileSize * 2 + 1000)) .execute(); Assert.assertEquals("Should have 10 fileGroups", result.rewriteResults().size(), 10); @@ -833,7 +858,8 @@ public void testSimpleSort() { .sort() .option(SortStrategy.MIN_INPUT_FILES, "1") .option(SortStrategy.REWRITE_ALL, "true") - .option(RewriteDataFiles.TARGET_FILE_SIZE_BYTES, 
Integer.toString(averageFileSize(table))) + .option( + RewriteDataFiles.TARGET_FILE_SIZE_BYTES, Integer.toString(averageFileSize(table))) .execute(); Assert.assertEquals("Should have 1 fileGroups", result.rewriteResults().size(), 1); @@ -864,11 +890,14 @@ public void testSortAfterPartitionChange() { .sort() .option(SortStrategy.MIN_INPUT_FILES, "1") .option(SortStrategy.REWRITE_ALL, "true") - .option(RewriteDataFiles.TARGET_FILE_SIZE_BYTES, Integer.toString(averageFileSize(table))) + .option( + RewriteDataFiles.TARGET_FILE_SIZE_BYTES, Integer.toString(averageFileSize(table))) .execute(); - Assert.assertEquals("Should have 1 fileGroup because all files were not correctly partitioned", - result.rewriteResults().size(), 1); + Assert.assertEquals( + "Should have 1 fileGroup because all files were not correctly partitioned", + result.rewriteResults().size(), + 1); table.refresh(); @@ -893,7 +922,8 @@ public void testSortCustomSortOrder() { basicRewrite(table) .sort(SortOrder.builderFor(table.schema()).asc("c2").build()) .option(SortStrategy.REWRITE_ALL, "true") - .option(RewriteDataFiles.TARGET_FILE_SIZE_BYTES, Integer.toString(averageFileSize(table))) + .option( + RewriteDataFiles.TARGET_FILE_SIZE_BYTES, Integer.toString(averageFileSize(table))) .execute(); Assert.assertEquals("Should have 1 fileGroups", result.rewriteResults().size(), 1); @@ -928,7 +958,9 @@ public void testSortCustomSortOrderRequiresRepartition() { basicRewrite(table) .sort(SortOrder.builderFor(table.schema()).asc("c3").build()) .option(SortStrategy.REWRITE_ALL, "true") - .option(RewriteDataFiles.TARGET_FILE_SIZE_BYTES, Integer.toString(averageFileSize(table) / partitions)) + .option( + RewriteDataFiles.TARGET_FILE_SIZE_BYTES, + Integer.toString(averageFileSize(table) / partitions)) .execute(); Assert.assertEquals("Should have 1 fileGroups", result.rewriteResults().size(), 1); @@ -956,14 +988,19 @@ public void testAutoSortShuffleOutput() { RewriteDataFiles.Result result = basicRewrite(table) .sort(SortOrder.builderFor(table.schema()).asc("c2").build()) - .option(SortStrategy.MAX_FILE_SIZE_BYTES, Integer.toString((averageFileSize(table) / 2) + 2)) + .option( + SortStrategy.MAX_FILE_SIZE_BYTES, + Integer.toString((averageFileSize(table) / 2) + 2)) // Divide files in 2 - .option(RewriteDataFiles.TARGET_FILE_SIZE_BYTES, Integer.toString(averageFileSize(table) / 2)) + .option( + RewriteDataFiles.TARGET_FILE_SIZE_BYTES, + Integer.toString(averageFileSize(table) / 2)) .option(SortStrategy.MIN_INPUT_FILES, "1") .execute(); Assert.assertEquals("Should have 1 fileGroups", result.rewriteResults().size(), 1); - Assert.assertTrue("Should have written 40+ files", + Assert.assertTrue( + "Should have written 40+ files", Iterables.size(table.currentSnapshot().addedDataFiles(table.io())) >= 40); table.refresh(); @@ -988,17 +1025,20 @@ public void testCommitStateUnknownException() { BaseRewriteDataFilesSparkAction spyAction = spy(action); RewriteDataFilesCommitManager util = spy(new RewriteDataFilesCommitManager(table)); - doAnswer(invocationOnMock -> { - invocationOnMock.callRealMethod(); - throw new CommitStateUnknownException(new RuntimeException("Unknown State")); - }).when(util).commitFileGroups(any()); + doAnswer( + invocationOnMock -> { + invocationOnMock.callRealMethod(); + throw new CommitStateUnknownException(new RuntimeException("Unknown State")); + }) + .when(util) + .commitFileGroups(any()); - doReturn(util) - .when(spyAction) - .commitManager(table.currentSnapshot().snapshotId()); + 
doReturn(util).when(spyAction).commitManager(table.currentSnapshot().snapshotId()); - AssertHelpers.assertThrows("Should propagate CommitStateUnknown Exception", - CommitStateUnknownException.class, () -> spyAction.execute()); + AssertHelpers.assertThrows( + "Should propagate CommitStateUnknown Exception", + CommitStateUnknownException.class, + () -> spyAction.execute()); List postRewriteData = currentData(); assertEquals("We shouldn't have changed the data", originalData, postRewriteData); @@ -1010,14 +1050,23 @@ public void testCommitStateUnknownException() { public void testInvalidAPIUsage() { Table table = createTable(1); - AssertHelpers.assertThrows("Should be unable to set Strategy more than once", IllegalArgumentException.class, - "Cannot set strategy", () -> actions().rewriteDataFiles(table).binPack().sort()); + AssertHelpers.assertThrows( + "Should be unable to set Strategy more than once", + IllegalArgumentException.class, + "Cannot set strategy", + () -> actions().rewriteDataFiles(table).binPack().sort()); - AssertHelpers.assertThrows("Should be unable to set Strategy more than once", IllegalArgumentException.class, - "Cannot set strategy", () -> actions().rewriteDataFiles(table).sort().binPack()); + AssertHelpers.assertThrows( + "Should be unable to set Strategy more than once", + IllegalArgumentException.class, + "Cannot set strategy", + () -> actions().rewriteDataFiles(table).sort().binPack()); - AssertHelpers.assertThrows("Should be unable to set Strategy more than once", IllegalArgumentException.class, - "Cannot set strategy", () -> actions().rewriteDataFiles(table).sort(SortOrder.unsorted()).binPack()); + AssertHelpers.assertThrows( + "Should be unable to set Strategy more than once", + IllegalArgumentException.class, + "Cannot set strategy", + () -> actions().rewriteDataFiles(table).sort(SortOrder.unsorted()).binPack()); } @Test @@ -1030,21 +1079,23 @@ public void testRewriteJobOrderBytesAsc() { table.updateProperties().set(TableProperties.FORMAT_VERSION, "2").commit(); BaseRewriteDataFilesSparkAction basicRewrite = - (BaseRewriteDataFilesSparkAction) basicRewrite(table) - .binPack(); - List expected = toGroupStream(table, basicRewrite) - .mapToLong(RewriteFileGroup::sizeInBytes) - .boxed() - .collect(Collectors.toList()); + (BaseRewriteDataFilesSparkAction) basicRewrite(table).binPack(); + List expected = + toGroupStream(table, basicRewrite) + .mapToLong(RewriteFileGroup::sizeInBytes) + .boxed() + .collect(Collectors.toList()); BaseRewriteDataFilesSparkAction jobOrderRewrite = - (BaseRewriteDataFilesSparkAction) basicRewrite(table) - .option(RewriteDataFiles.REWRITE_JOB_ORDER, RewriteJobOrder.BYTES_ASC.orderName()) - .binPack(); - List actual = toGroupStream(table, jobOrderRewrite) - .mapToLong(RewriteFileGroup::sizeInBytes) - .boxed() - .collect(Collectors.toList()); + (BaseRewriteDataFilesSparkAction) + basicRewrite(table) + .option(RewriteDataFiles.REWRITE_JOB_ORDER, RewriteJobOrder.BYTES_ASC.orderName()) + .binPack(); + List actual = + toGroupStream(table, jobOrderRewrite) + .mapToLong(RewriteFileGroup::sizeInBytes) + .boxed() + .collect(Collectors.toList()); expected.sort(Comparator.naturalOrder()); Assert.assertEquals("Size in bytes order should be ascending", actual, expected); @@ -1062,21 +1113,23 @@ public void testRewriteJobOrderBytesDesc() { table.updateProperties().set(TableProperties.FORMAT_VERSION, "2").commit(); BaseRewriteDataFilesSparkAction basicRewrite = - (BaseRewriteDataFilesSparkAction) basicRewrite(table) - .binPack(); - List expected = 
toGroupStream(table, basicRewrite) - .mapToLong(RewriteFileGroup::sizeInBytes) - .boxed() - .collect(Collectors.toList()); + (BaseRewriteDataFilesSparkAction) basicRewrite(table).binPack(); + List expected = + toGroupStream(table, basicRewrite) + .mapToLong(RewriteFileGroup::sizeInBytes) + .boxed() + .collect(Collectors.toList()); BaseRewriteDataFilesSparkAction jobOrderRewrite = - (BaseRewriteDataFilesSparkAction) basicRewrite(table) - .option(RewriteDataFiles.REWRITE_JOB_ORDER, RewriteJobOrder.BYTES_DESC.orderName()) - .binPack(); - List actual = toGroupStream(table, jobOrderRewrite) - .mapToLong(RewriteFileGroup::sizeInBytes) - .boxed() - .collect(Collectors.toList()); + (BaseRewriteDataFilesSparkAction) + basicRewrite(table) + .option(RewriteDataFiles.REWRITE_JOB_ORDER, RewriteJobOrder.BYTES_DESC.orderName()) + .binPack(); + List actual = + toGroupStream(table, jobOrderRewrite) + .mapToLong(RewriteFileGroup::sizeInBytes) + .boxed() + .collect(Collectors.toList()); expected.sort(Comparator.reverseOrder()); Assert.assertEquals("Size in bytes order should be descending", actual, expected); @@ -1094,21 +1147,23 @@ public void testRewriteJobOrderFilesAsc() { table.updateProperties().set(TableProperties.FORMAT_VERSION, "2").commit(); BaseRewriteDataFilesSparkAction basicRewrite = - (BaseRewriteDataFilesSparkAction) basicRewrite(table) - .binPack(); - List expected = toGroupStream(table, basicRewrite) - .mapToLong(RewriteFileGroup::numFiles) - .boxed() - .collect(Collectors.toList()); + (BaseRewriteDataFilesSparkAction) basicRewrite(table).binPack(); + List expected = + toGroupStream(table, basicRewrite) + .mapToLong(RewriteFileGroup::numFiles) + .boxed() + .collect(Collectors.toList()); BaseRewriteDataFilesSparkAction jobOrderRewrite = - (BaseRewriteDataFilesSparkAction) basicRewrite(table) - .option(RewriteDataFiles.REWRITE_JOB_ORDER, RewriteJobOrder.FILES_ASC.orderName()) - .binPack(); - List actual = toGroupStream(table, jobOrderRewrite) - .mapToLong(RewriteFileGroup::numFiles) - .boxed() - .collect(Collectors.toList()); + (BaseRewriteDataFilesSparkAction) + basicRewrite(table) + .option(RewriteDataFiles.REWRITE_JOB_ORDER, RewriteJobOrder.FILES_ASC.orderName()) + .binPack(); + List actual = + toGroupStream(table, jobOrderRewrite) + .mapToLong(RewriteFileGroup::numFiles) + .boxed() + .collect(Collectors.toList()); expected.sort(Comparator.naturalOrder()); Assert.assertEquals("Number of files order should be ascending", actual, expected); @@ -1126,21 +1181,23 @@ public void testRewriteJobOrderFilesDesc() { table.updateProperties().set(TableProperties.FORMAT_VERSION, "2").commit(); BaseRewriteDataFilesSparkAction basicRewrite = - (BaseRewriteDataFilesSparkAction) basicRewrite(table) - .binPack(); - List expected = toGroupStream(table, basicRewrite) - .mapToLong(RewriteFileGroup::numFiles) - .boxed() - .collect(Collectors.toList()); + (BaseRewriteDataFilesSparkAction) basicRewrite(table).binPack(); + List expected = + toGroupStream(table, basicRewrite) + .mapToLong(RewriteFileGroup::numFiles) + .boxed() + .collect(Collectors.toList()); BaseRewriteDataFilesSparkAction jobOrderRewrite = - (BaseRewriteDataFilesSparkAction) basicRewrite(table) - .option(RewriteDataFiles.REWRITE_JOB_ORDER, RewriteJobOrder.FILES_DESC.orderName()) - .binPack(); - List actual = toGroupStream(table, jobOrderRewrite) - .mapToLong(RewriteFileGroup::numFiles) - .boxed() - .collect(Collectors.toList()); + (BaseRewriteDataFilesSparkAction) + basicRewrite(table) + .option(RewriteDataFiles.REWRITE_JOB_ORDER, 
RewriteJobOrder.FILES_DESC.orderName()) + .binPack(); + List actual = + toGroupStream(table, jobOrderRewrite) + .mapToLong(RewriteFileGroup::numFiles) + .boxed() + .collect(Collectors.toList()); expected.sort(Comparator.reverseOrder()); Assert.assertEquals("Number of files order should be descending", actual, expected); @@ -1148,8 +1205,8 @@ public void testRewriteJobOrderFilesDesc() { Assert.assertNotEquals("Number of files order should not be ascending", actual, expected); } - private Stream toGroupStream(Table table, - BaseRewriteDataFilesSparkAction rewrite) { + private Stream toGroupStream( + Table table, BaseRewriteDataFilesSparkAction rewrite) { rewrite.validateAndInitOptions(); Map>> fileGroupsByPartition = rewrite.planFileGroups(table.currentSnapshot().snapshotId()); @@ -1159,9 +1216,8 @@ private Stream toGroupStream(Table table, } protected List currentData() { - return rowsToJava(spark.read().format("iceberg").load(tableLocation) - .sort("c1", "c2", "c3") - .collectAsList()); + return rowsToJava( + spark.read().format("iceberg").load(tableLocation).sort("c1", "c2", "c3").collectAsList()); } protected long testDataSize(Table table) { @@ -1183,83 +1239,102 @@ protected void shouldHaveFiles(Table table, int numExpected) { protected void shouldHaveSnapshots(Table table, int expectedSnapshots) { table.refresh(); int actualSnapshots = Iterables.size(table.snapshots()); - Assert.assertEquals("Table did not have the expected number of snapshots", - expectedSnapshots, actualSnapshots); + Assert.assertEquals( + "Table did not have the expected number of snapshots", expectedSnapshots, actualSnapshots); } protected void shouldHaveNoOrphans(Table table) { - Assert.assertEquals("Should not have found any orphan files", ImmutableList.of(), - actions().deleteOrphanFiles(table) + Assert.assertEquals( + "Should not have found any orphan files", + ImmutableList.of(), + actions() + .deleteOrphanFiles(table) .olderThan(System.currentTimeMillis()) .execute() .orphanFileLocations()); } protected void shouldHaveACleanCache(Table table) { - Assert.assertEquals("Should not have any entries in cache", ImmutableSet.of(), - cacheContents(table)); + Assert.assertEquals( + "Should not have any entries in cache", ImmutableSet.of(), cacheContents(table)); } protected void shouldHaveLastCommitSorted(Table table, String column) { - List, Pair>> - overlappingFiles = checkForOverlappingFiles(table, column); + List, Pair>> overlappingFiles = checkForOverlappingFiles(table, column); Assert.assertEquals("Found overlapping files", Collections.emptyList(), overlappingFiles); } protected void shouldHaveLastCommitUnsorted(Table table, String column) { - List, Pair>> - overlappingFiles = checkForOverlappingFiles(table, column); + List, Pair>> overlappingFiles = checkForOverlappingFiles(table, column); Assert.assertNotEquals("Found no overlapping files", Collections.emptyList(), overlappingFiles); } private Pair boundsOf(DataFile file, NestedField field, Class javaClass) { int columnId = field.fieldId(); - return Pair.of(javaClass.cast(Conversions.fromByteBuffer(field.type(), file.lowerBounds().get(columnId))), + return Pair.of( + javaClass.cast(Conversions.fromByteBuffer(field.type(), file.lowerBounds().get(columnId))), javaClass.cast(Conversions.fromByteBuffer(field.type(), file.upperBounds().get(columnId)))); } - private List, Pair>> checkForOverlappingFiles(Table table, String column) { + private List, Pair>> checkForOverlappingFiles( + Table table, String column) { table.refresh(); NestedField field = 
table.schema().caseInsensitiveFindField(column); Class javaClass = (Class) field.type().typeId().javaClass(); Snapshot snapshot = table.currentSnapshot(); - Map> filesByPartition = Streams.stream(snapshot.addedDataFiles(table.io())) - .collect(Collectors.groupingBy(DataFile::partition)); + Map> filesByPartition = + Streams.stream(snapshot.addedDataFiles(table.io())) + .collect(Collectors.groupingBy(DataFile::partition)); Stream, Pair>> overlaps = - filesByPartition.entrySet().stream().flatMap(entry -> { - List datafiles = entry.getValue(); - Preconditions.checkArgument(datafiles.size() > 1, - "This test is checking for overlaps in a situation where no overlaps can actually occur because the " + - "partition %s does not contain multiple datafiles", entry.getKey()); - - List, Pair>> boundComparisons = Lists.cartesianProduct(datafiles, datafiles).stream() - .filter(tuple -> tuple.get(0) != tuple.get(1)) - .map(tuple -> Pair.of(boundsOf(tuple.get(0), field, javaClass), boundsOf(tuple.get(1), field, javaClass))) - .collect(Collectors.toList()); - - Comparator comparator = Comparators.forType(field.type().asPrimitiveType()); - - List, Pair>> overlappingFiles = boundComparisons.stream() - .filter(filePair -> { - Pair left = filePair.first(); - T lMin = left.first(); - T lMax = left.second(); - Pair right = filePair.second(); - T rMin = right.first(); - T rMax = right.second(); - boolean boundsDoNotOverlap = - // Min and Max of a range are greater than or equal to the max value of the other range - (comparator.compare(rMax, lMax) >= 0 && comparator.compare(rMin, lMax) >= 0) || - (comparator.compare(lMax, rMax) >= 0 && comparator.compare(lMin, rMax) >= 0); - - return !boundsDoNotOverlap; - }).collect(Collectors.toList()); - return overlappingFiles.stream(); - }); + filesByPartition.entrySet().stream() + .flatMap( + entry -> { + List datafiles = entry.getValue(); + Preconditions.checkArgument( + datafiles.size() > 1, + "This test is checking for overlaps in a situation where no overlaps can actually occur because the " + + "partition %s does not contain multiple datafiles", + entry.getKey()); + + List, Pair>> boundComparisons = + Lists.cartesianProduct(datafiles, datafiles).stream() + .filter(tuple -> tuple.get(0) != tuple.get(1)) + .map( + tuple -> + Pair.of( + boundsOf(tuple.get(0), field, javaClass), + boundsOf(tuple.get(1), field, javaClass))) + .collect(Collectors.toList()); + + Comparator comparator = Comparators.forType(field.type().asPrimitiveType()); + + List, Pair>> overlappingFiles = + boundComparisons.stream() + .filter( + filePair -> { + Pair left = filePair.first(); + T lMin = left.first(); + T lMax = left.second(); + Pair right = filePair.second(); + T rMin = right.first(); + T rMax = right.second(); + boolean boundsDoNotOverlap = + // Min and Max of a range are greater than or equal to the max + // value of the other range + (comparator.compare(rMax, lMax) >= 0 + && comparator.compare(rMin, lMax) >= 0) + || (comparator.compare(lMax, rMax) >= 0 + && comparator.compare(lMin, rMax) >= 0); + + return !boundsDoNotOverlap; + }) + .collect(Collectors.toList()); + return overlappingFiles.stream(); + }); return overlaps.collect(Collectors.toList()); } @@ -1268,13 +1343,17 @@ protected Table createTable() { PartitionSpec spec = PartitionSpec.unpartitioned(); Map options = Maps.newHashMap(); Table table = TABLES.create(SCHEMA, spec, options, tableLocation); - table.updateProperties().set(TableProperties.PARQUET_ROW_GROUP_SIZE_BYTES, Integer.toString(20 * 1024)).commit(); + table + 
.updateProperties() + .set(TableProperties.PARQUET_ROW_GROUP_SIZE_BYTES, Integer.toString(20 * 1024)) + .commit(); Assert.assertNull("Table must be empty", table.currentSnapshot()); return table; } /** * Create a table with a certain number of files, returns the size of a file + * * @param files number of files to create * @return the created table */ @@ -1284,12 +1363,9 @@ protected Table createTable(int files) { return table; } - protected Table createTablePartitioned(int partitions, int files, - int numRecords, Map options) { - PartitionSpec spec = PartitionSpec.builderFor(SCHEMA) - .identity("c1") - .truncate("c2", 2) - .build(); + protected Table createTablePartitioned( + int partitions, int files, int numRecords, Map options) { + PartitionSpec spec = PartitionSpec.builderFor(SCHEMA).identity("c1").truncate("c2", 2).build(); Table table = TABLES.create(SCHEMA, spec, options, tableLocation); Assert.assertNull("Table must be empty", table.currentSnapshot()); @@ -1303,7 +1379,11 @@ protected Table createTablePartitioned(int partitions, int files) { protected int averageFileSize(Table table) { table.refresh(); - return (int) Streams.stream(table.newScan().planFiles()).mapToLong(FileScanTask::length).average().getAsDouble(); + return (int) + Streams.stream(table.newScan().planFiles()) + .mapToLong(FileScanTask::length) + .average() + .getAsDouble(); } private void writeRecords(int files, int numRecords) { @@ -1314,20 +1394,21 @@ private void writeRecords(int files, int numRecords, int partitions) { List records = Lists.newArrayList(); int rowDimension = (int) Math.ceil(Math.sqrt(numRecords)); List> data = - IntStream.range(0, rowDimension).boxed().flatMap(x -> - IntStream.range(0, rowDimension).boxed().map(y -> Pair.of(x, y))) + IntStream.range(0, rowDimension) + .boxed() + .flatMap(x -> IntStream.range(0, rowDimension).boxed().map(y -> Pair.of(x, y))) .collect(Collectors.toList()); Collections.shuffle(data, new Random(42)); if (partitions > 0) { - data.forEach(i -> records.add(new ThreeColumnRecord( - i.first() % partitions, - "foo" + i.first(), - "bar" + i.second()))); + data.forEach( + i -> + records.add( + new ThreeColumnRecord( + i.first() % partitions, "foo" + i.first(), "bar" + i.second()))); } else { - data.forEach(i -> records.add(new ThreeColumnRecord( - i.first(), - "foo" + i.first(), - "bar" + i.second()))); + data.forEach( + i -> + records.add(new ThreeColumnRecord(i.first(), "foo" + i.first(), "bar" + i.second()))); } Dataset df = spark.createDataFrame(records, ThreeColumnRecord.class).repartition(files); writeDF(df); @@ -1342,24 +1423,31 @@ private void writeDF(Dataset df) { .save(tableLocation); } - private List writePosDeletesToFile(Table table, DataFile dataFile, int outputDeleteFiles) { - return writePosDeletes(table, dataFile.partition(), dataFile.path().toString(), outputDeleteFiles); + private List writePosDeletesToFile( + Table table, DataFile dataFile, int outputDeleteFiles) { + return writePosDeletes( + table, dataFile.partition(), dataFile.path().toString(), outputDeleteFiles); } - private List writePosDeletes(Table table, StructLike partition, String path, int outputDeleteFiles) { + private List writePosDeletes( + Table table, StructLike partition, String path, int outputDeleteFiles) { List results = Lists.newArrayList(); int rowPosition = 0; for (int file = 0; file < outputDeleteFiles; file++) { - OutputFile outputFile = table.io().newOutputFile( - table.locationProvider().newDataLocation(UUID.randomUUID().toString())); - EncryptedOutputFile encryptedOutputFile 
= EncryptedFiles.encryptedOutput( - outputFile, EncryptionKeyMetadata.EMPTY); - - GenericAppenderFactory appenderFactory = new GenericAppenderFactory( - table.schema(), table.spec(), null, null, null); - PositionDeleteWriter posDeleteWriter = appenderFactory - .set(TableProperties.DEFAULT_WRITE_METRICS_MODE, "full") - .newPosDeleteWriter(encryptedOutputFile, FileFormat.PARQUET, partition); + OutputFile outputFile = + table + .io() + .newOutputFile( + table.locationProvider().newDataLocation(UUID.randomUUID().toString())); + EncryptedOutputFile encryptedOutputFile = + EncryptedFiles.encryptedOutput(outputFile, EncryptionKeyMetadata.EMPTY); + + GenericAppenderFactory appenderFactory = + new GenericAppenderFactory(table.schema(), table.spec(), null, null, null); + PositionDeleteWriter posDeleteWriter = + appenderFactory + .set(TableProperties.DEFAULT_WRITE_METRICS_MODE, "full") + .newPosDeleteWriter(encryptedOutputFile, FileFormat.PARQUET, partition); posDeleteWriter.delete(path, rowPosition); try { diff --git a/spark/v3.1/spark/src/test/java/org/apache/iceberg/spark/actions/TestRewriteManifestsAction.java b/spark/v3.1/spark/src/test/java/org/apache/iceberg/spark/actions/TestRewriteManifestsAction.java index f30251e74001..4b50ea0c29f3 100644 --- a/spark/v3.1/spark/src/test/java/org/apache/iceberg/spark/actions/TestRewriteManifestsAction.java +++ b/spark/v3.1/spark/src/test/java/org/apache/iceberg/spark/actions/TestRewriteManifestsAction.java @@ -16,9 +16,13 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.actions; +import static org.apache.iceberg.types.Types.NestedField.optional; +import static org.mockito.Mockito.doAnswer; +import static org.mockito.Mockito.spy; +import static org.mockito.Mockito.when; + import java.io.File; import java.io.IOException; import java.util.List; @@ -53,28 +57,22 @@ import org.junit.runner.RunWith; import org.junit.runners.Parameterized; -import static org.apache.iceberg.types.Types.NestedField.optional; -import static org.mockito.Mockito.doAnswer; -import static org.mockito.Mockito.spy; -import static org.mockito.Mockito.when; - @RunWith(Parameterized.class) public class TestRewriteManifestsAction extends SparkTestBase { private static final HadoopTables TABLES = new HadoopTables(new Configuration()); - private static final Schema SCHEMA = new Schema( - optional(1, "c1", Types.IntegerType.get()), - optional(2, "c2", Types.StringType.get()), - optional(3, "c3", Types.StringType.get()) - ); + private static final Schema SCHEMA = + new Schema( + optional(1, "c1", Types.IntegerType.get()), + optional(2, "c2", Types.StringType.get()), + optional(3, "c3", Types.StringType.get())); @Parameterized.Parameters(name = "snapshotIdInheritanceEnabled = {0}") public static Object[] parameters() { - return new Object[] { "true", "false" }; + return new Object[] {"true", "false"}; } - @Rule - public TemporaryFolder temp = new TemporaryFolder(); + @Rule public TemporaryFolder temp = new TemporaryFolder(); private final String snapshotIdInheritanceEnabled; private String tableLocation = null; @@ -100,7 +98,8 @@ public void testRewriteManifestsEmptyTable() throws IOException { SparkActions actions = SparkActions.get(); - actions.rewriteManifests(table) + actions + .rewriteManifests(table) .rewriteIf(manifest -> true) .stagingLocation(temp.newFolder().toString()) .execute(); @@ -115,16 +114,15 @@ public void testRewriteSmallManifestsNonPartitionedTable() { 
options.put(TableProperties.SNAPSHOT_ID_INHERITANCE_ENABLED, snapshotIdInheritanceEnabled); Table table = TABLES.create(SCHEMA, spec, options, tableLocation); - List records1 = Lists.newArrayList( - new ThreeColumnRecord(1, null, "AAAA"), - new ThreeColumnRecord(1, "BBBBBBBBBB", "BBBB") - ); + List records1 = + Lists.newArrayList( + new ThreeColumnRecord(1, null, "AAAA"), new ThreeColumnRecord(1, "BBBBBBBBBB", "BBBB")); writeRecords(records1); - List records2 = Lists.newArrayList( - new ThreeColumnRecord(2, "CCCCCCCCCC", "CCCC"), - new ThreeColumnRecord(2, "DDDDDDDDDD", "DDDD") - ); + List records2 = + Lists.newArrayList( + new ThreeColumnRecord(2, "CCCCCCCCCC", "CCCC"), + new ThreeColumnRecord(2, "DDDDDDDDDD", "DDDD")); writeRecords(records2); table.refresh(); @@ -134,12 +132,13 @@ public void testRewriteSmallManifestsNonPartitionedTable() { SparkActions actions = SparkActions.get(); - RewriteManifests.Result result = actions.rewriteManifests(table) - .rewriteIf(manifest -> true) - .execute(); + RewriteManifests.Result result = + actions.rewriteManifests(table).rewriteIf(manifest -> true).execute(); - Assert.assertEquals("Action should rewrite 2 manifests", 2, Iterables.size(result.rewrittenManifests())); - Assert.assertEquals("Action should add 1 manifests", 1, Iterables.size(result.addedManifests())); + Assert.assertEquals( + "Action should rewrite 2 manifests", 2, Iterables.size(result.rewrittenManifests())); + Assert.assertEquals( + "Action should add 1 manifests", 1, Iterables.size(result.addedManifests())); table.refresh(); @@ -155,9 +154,8 @@ public void testRewriteSmallManifestsNonPartitionedTable() { expectedRecords.addAll(records2); Dataset resultDF = spark.read().format("iceberg").load(tableLocation); - List actualRecords = resultDF.sort("c1", "c2") - .as(Encoders.bean(ThreeColumnRecord.class)) - .collectAsList(); + List actualRecords = + resultDF.sort("c1", "c2").as(Encoders.bean(ThreeColumnRecord.class)).collectAsList(); Assert.assertEquals("Rows must match", expectedRecords, actualRecords); } @@ -169,16 +167,15 @@ public void testRewriteManifestsWithCommitStateUnknownException() { options.put(TableProperties.SNAPSHOT_ID_INHERITANCE_ENABLED, snapshotIdInheritanceEnabled); Table table = TABLES.create(SCHEMA, spec, options, tableLocation); - List records1 = Lists.newArrayList( - new ThreeColumnRecord(1, null, "AAAA"), - new ThreeColumnRecord(1, "BBBBBBBBBB", "BBBB") - ); + List records1 = + Lists.newArrayList( + new ThreeColumnRecord(1, null, "AAAA"), new ThreeColumnRecord(1, "BBBBBBBBBB", "BBBB")); writeRecords(records1); - List records2 = Lists.newArrayList( - new ThreeColumnRecord(2, "CCCCCCCCCC", "CCCC"), - new ThreeColumnRecord(2, "DDDDDDDDDD", "DDDD") - ); + List records2 = + Lists.newArrayList( + new ThreeColumnRecord(2, "CCCCCCCCCC", "CCCC"), + new ThreeColumnRecord(2, "DDDDDDDDDD", "DDDD")); writeRecords(records2); table.refresh(); @@ -191,15 +188,19 @@ public void testRewriteManifestsWithCommitStateUnknownException() { // create a spy which would throw a CommitStateUnknownException after successful commit. 
org.apache.iceberg.RewriteManifests newRewriteManifests = table.rewriteManifests(); org.apache.iceberg.RewriteManifests spyNewRewriteManifests = spy(newRewriteManifests); - doAnswer(invocation -> { - newRewriteManifests.commit(); - throw new CommitStateUnknownException(new RuntimeException("Datacenter on Fire")); - }).when(spyNewRewriteManifests).commit(); + doAnswer( + invocation -> { + newRewriteManifests.commit(); + throw new CommitStateUnknownException(new RuntimeException("Datacenter on Fire")); + }) + .when(spyNewRewriteManifests) + .commit(); Table spyTable = spy(table); when(spyTable.rewriteManifests()).thenReturn(spyNewRewriteManifests); - AssertHelpers.assertThrowsCause("Should throw a Commit State Unknown Exception", + AssertHelpers.assertThrowsCause( + "Should throw a Commit State Unknown Exception", RuntimeException.class, "Datacenter on Fire", () -> actions.rewriteManifests(spyTable).rewriteIf(manifest -> true).execute()); @@ -219,45 +220,40 @@ public void testRewriteManifestsWithCommitStateUnknownException() { expectedRecords.addAll(records2); Dataset resultDF = spark.read().format("iceberg").load(tableLocation); - List actualRecords = resultDF.sort("c1", "c2") - .as(Encoders.bean(ThreeColumnRecord.class)) - .collectAsList(); + List actualRecords = + resultDF.sort("c1", "c2").as(Encoders.bean(ThreeColumnRecord.class)).collectAsList(); Assert.assertEquals("Rows must match", expectedRecords, actualRecords); } @Test public void testRewriteSmallManifestsPartitionedTable() { - PartitionSpec spec = PartitionSpec.builderFor(SCHEMA) - .identity("c1") - .truncate("c2", 2) - .build(); + PartitionSpec spec = PartitionSpec.builderFor(SCHEMA).identity("c1").truncate("c2", 2).build(); Map options = Maps.newHashMap(); options.put(TableProperties.SNAPSHOT_ID_INHERITANCE_ENABLED, snapshotIdInheritanceEnabled); Table table = TABLES.create(SCHEMA, spec, options, tableLocation); - List records1 = Lists.newArrayList( - new ThreeColumnRecord(1, null, "AAAA"), - new ThreeColumnRecord(1, "BBBBBBBBBB", "BBBB") - ); + List records1 = + Lists.newArrayList( + new ThreeColumnRecord(1, null, "AAAA"), new ThreeColumnRecord(1, "BBBBBBBBBB", "BBBB")); writeRecords(records1); - List records2 = Lists.newArrayList( - new ThreeColumnRecord(2, "CCCCCCCCCC", "CCCC"), - new ThreeColumnRecord(2, "DDDDDDDDDD", "DDDD") - ); + List records2 = + Lists.newArrayList( + new ThreeColumnRecord(2, "CCCCCCCCCC", "CCCC"), + new ThreeColumnRecord(2, "DDDDDDDDDD", "DDDD")); writeRecords(records2); - List records3 = Lists.newArrayList( - new ThreeColumnRecord(3, "EEEEEEEEEE", "EEEE"), - new ThreeColumnRecord(3, "FFFFFFFFFF", "FFFF") - ); + List records3 = + Lists.newArrayList( + new ThreeColumnRecord(3, "EEEEEEEEEE", "EEEE"), + new ThreeColumnRecord(3, "FFFFFFFFFF", "FFFF")); writeRecords(records3); - List records4 = Lists.newArrayList( - new ThreeColumnRecord(4, "GGGGGGGGGG", "GGGG"), - new ThreeColumnRecord(4, "HHHHHHHHHG", "HHHH") - ); + List records4 = + Lists.newArrayList( + new ThreeColumnRecord(4, "GGGGGGGGGG", "GGGG"), + new ThreeColumnRecord(4, "HHHHHHHHHG", "HHHH")); writeRecords(records4); table.refresh(); @@ -271,16 +267,18 @@ public void testRewriteSmallManifestsPartitionedTable() { long manifestEntrySizeBytes = computeManifestEntrySizeBytes(manifests); long targetManifestSizeBytes = (long) (1.05 * 4 * manifestEntrySizeBytes); - table.updateProperties() + table + .updateProperties() .set(TableProperties.MANIFEST_TARGET_SIZE_BYTES, String.valueOf(targetManifestSizeBytes)) .commit(); - RewriteManifests.Result result = 
actions.rewriteManifests(table) - .rewriteIf(manifest -> true) - .execute(); + RewriteManifests.Result result = + actions.rewriteManifests(table).rewriteIf(manifest -> true).execute(); - Assert.assertEquals("Action should rewrite 4 manifests", 4, Iterables.size(result.rewrittenManifests())); - Assert.assertEquals("Action should add 2 manifests", 2, Iterables.size(result.addedManifests())); + Assert.assertEquals( + "Action should rewrite 4 manifests", 4, Iterables.size(result.rewrittenManifests())); + Assert.assertEquals( + "Action should add 2 manifests", 2, Iterables.size(result.addedManifests())); table.refresh(); @@ -302,32 +300,29 @@ public void testRewriteSmallManifestsPartitionedTable() { expectedRecords.addAll(records4); Dataset resultDF = spark.read().format("iceberg").load(tableLocation); - List actualRecords = resultDF.sort("c1", "c2") - .as(Encoders.bean(ThreeColumnRecord.class)) - .collectAsList(); + List actualRecords = + resultDF.sort("c1", "c2").as(Encoders.bean(ThreeColumnRecord.class)).collectAsList(); Assert.assertEquals("Rows must match", expectedRecords, actualRecords); } @Test public void testRewriteImportedManifests() throws IOException { - PartitionSpec spec = PartitionSpec.builderFor(SCHEMA) - .identity("c3") - .build(); + PartitionSpec spec = PartitionSpec.builderFor(SCHEMA).identity("c3").build(); Map options = Maps.newHashMap(); options.put(TableProperties.SNAPSHOT_ID_INHERITANCE_ENABLED, snapshotIdInheritanceEnabled); Table table = TABLES.create(SCHEMA, spec, options, tableLocation); - List records = Lists.newArrayList( - new ThreeColumnRecord(1, null, "AAAA"), - new ThreeColumnRecord(1, "BBBBBBBBBB", "BBBB") - ); + List records = + Lists.newArrayList( + new ThreeColumnRecord(1, null, "AAAA"), new ThreeColumnRecord(1, "BBBBBBBBBB", "BBBB")); File parquetTableDir = temp.newFolder("parquet_table"); String parquetTableLocation = parquetTableDir.toURI().toString(); try { Dataset inputDF = spark.createDataFrame(records, ThreeColumnRecord.class); - inputDF.select("c1", "c2", "c3") + inputDF + .select("c1", "c2", "c3") .write() .format("parquet") .mode("overwrite") @@ -336,20 +331,26 @@ public void testRewriteImportedManifests() throws IOException { .saveAsTable("parquet_table"); File stagingDir = temp.newFolder("staging-dir"); - SparkTableUtil.importSparkTable(spark, new TableIdentifier("parquet_table"), table, stagingDir.toString()); + SparkTableUtil.importSparkTable( + spark, new TableIdentifier("parquet_table"), table, stagingDir.toString()); Snapshot snapshot = table.currentSnapshot(); SparkActions actions = SparkActions.get(); - RewriteManifests.Result result = actions.rewriteManifests(table) - .rewriteIf(manifest -> true) - .stagingLocation(temp.newFolder().toString()) - .execute(); + RewriteManifests.Result result = + actions + .rewriteManifests(table) + .rewriteIf(manifest -> true) + .stagingLocation(temp.newFolder().toString()) + .execute(); - Assert.assertEquals("Action should rewrite all manifests", - snapshot.allManifests(table.io()), result.rewrittenManifests()); - Assert.assertEquals("Action should add 1 manifest", 1, Iterables.size(result.addedManifests())); + Assert.assertEquals( + "Action should rewrite all manifests", + snapshot.allManifests(table.io()), + result.rewrittenManifests()); + Assert.assertEquals( + "Action should add 1 manifest", 1, Iterables.size(result.addedManifests())); } finally { spark.sql("DROP TABLE parquet_table"); @@ -358,9 +359,7 @@ public void testRewriteImportedManifests() throws IOException { @Test public void 
testRewriteLargeManifestsPartitionedTable() throws IOException { - PartitionSpec spec = PartitionSpec.builderFor(SCHEMA) - .identity("c3") - .build(); + PartitionSpec spec = PartitionSpec.builderFor(SCHEMA).identity("c3").build(); Map options = Maps.newHashMap(); options.put(TableProperties.SNAPSHOT_ID_INHERITANCE_ENABLED, snapshotIdInheritanceEnabled); Table table = TABLES.create(SCHEMA, spec, options, tableLocation); @@ -380,19 +379,26 @@ public void testRewriteLargeManifestsPartitionedTable() throws IOException { Assert.assertEquals("Should have 1 manifests before rewrite", 1, manifests.size()); // set the target manifest size to a small value to force splitting records into multiple files - table.updateProperties() - .set(TableProperties.MANIFEST_TARGET_SIZE_BYTES, String.valueOf(manifests.get(0).length() / 2)) + table + .updateProperties() + .set( + TableProperties.MANIFEST_TARGET_SIZE_BYTES, + String.valueOf(manifests.get(0).length() / 2)) .commit(); SparkActions actions = SparkActions.get(); - RewriteManifests.Result result = actions.rewriteManifests(table) - .rewriteIf(manifest -> true) - .stagingLocation(temp.newFolder().toString()) - .execute(); + RewriteManifests.Result result = + actions + .rewriteManifests(table) + .rewriteIf(manifest -> true) + .stagingLocation(temp.newFolder().toString()) + .execute(); - Assert.assertEquals("Action should rewrite 1 manifest", 1, Iterables.size(result.rewrittenManifests())); - Assert.assertEquals("Action should add 2 manifests", 2, Iterables.size(result.addedManifests())); + Assert.assertEquals( + "Action should rewrite 1 manifest", 1, Iterables.size(result.rewrittenManifests())); + Assert.assertEquals( + "Action should add 2 manifests", 2, Iterables.size(result.addedManifests())); table.refresh(); @@ -400,33 +406,28 @@ public void testRewriteLargeManifestsPartitionedTable() throws IOException { Assert.assertEquals("Should have 2 manifests after rewrite", 2, newManifests.size()); Dataset resultDF = spark.read().format("iceberg").load(tableLocation); - List actualRecords = resultDF.sort("c1", "c2") - .as(Encoders.bean(ThreeColumnRecord.class)) - .collectAsList(); + List actualRecords = + resultDF.sort("c1", "c2").as(Encoders.bean(ThreeColumnRecord.class)).collectAsList(); Assert.assertEquals("Rows must match", records, actualRecords); } @Test public void testRewriteManifestsWithPredicate() throws IOException { - PartitionSpec spec = PartitionSpec.builderFor(SCHEMA) - .identity("c1") - .truncate("c2", 2) - .build(); + PartitionSpec spec = PartitionSpec.builderFor(SCHEMA).identity("c1").truncate("c2", 2).build(); Map options = Maps.newHashMap(); options.put(TableProperties.SNAPSHOT_ID_INHERITANCE_ENABLED, snapshotIdInheritanceEnabled); Table table = TABLES.create(SCHEMA, spec, options, tableLocation); - List records1 = Lists.newArrayList( - new ThreeColumnRecord(1, null, "AAAA"), - new ThreeColumnRecord(1, "BBBBBBBBBB", "BBBB") - ); + List records1 = + Lists.newArrayList( + new ThreeColumnRecord(1, null, "AAAA"), new ThreeColumnRecord(1, "BBBBBBBBBB", "BBBB")); writeRecords(records1); - List records2 = Lists.newArrayList( - new ThreeColumnRecord(2, "CCCCCCCCCC", "CCCC"), - new ThreeColumnRecord(2, "DDDDDDDDDD", "DDDD") - ); + List records2 = + Lists.newArrayList( + new ThreeColumnRecord(2, "CCCCCCCCCC", "CCCC"), + new ThreeColumnRecord(2, "DDDDDDDDDD", "DDDD")); writeRecords(records2); table.refresh(); @@ -437,14 +438,18 @@ public void testRewriteManifestsWithPredicate() throws IOException { SparkActions actions = SparkActions.get(); // rewrite 
only the first manifest without caching - RewriteManifests.Result result = actions.rewriteManifests(table) - .rewriteIf(manifest -> manifest.path().equals(manifests.get(0).path())) - .stagingLocation(temp.newFolder().toString()) - .option("use-caching", "false") - .execute(); - - Assert.assertEquals("Action should rewrite 1 manifest", 1, Iterables.size(result.rewrittenManifests())); - Assert.assertEquals("Action should add 1 manifests", 1, Iterables.size(result.addedManifests())); + RewriteManifests.Result result = + actions + .rewriteManifests(table) + .rewriteIf(manifest -> manifest.path().equals(manifests.get(0).path())) + .stagingLocation(temp.newFolder().toString()) + .option("use-caching", "false") + .execute(); + + Assert.assertEquals( + "Action should rewrite 1 manifest", 1, Iterables.size(result.rewrittenManifests())); + Assert.assertEquals( + "Action should add 1 manifests", 1, Iterables.size(result.addedManifests())); table.refresh(); @@ -452,16 +457,16 @@ public void testRewriteManifestsWithPredicate() throws IOException { Assert.assertEquals("Should have 2 manifests after rewrite", 2, newManifests.size()); Assert.assertFalse("First manifest must be rewritten", newManifests.contains(manifests.get(0))); - Assert.assertTrue("Second manifest must not be rewritten", newManifests.contains(manifests.get(1))); + Assert.assertTrue( + "Second manifest must not be rewritten", newManifests.contains(manifests.get(1))); List expectedRecords = Lists.newArrayList(); expectedRecords.addAll(records1); expectedRecords.addAll(records2); Dataset resultDF = spark.read().format("iceberg").load(tableLocation); - List actualRecords = resultDF.sort("c1", "c2") - .as(Encoders.bean(ThreeColumnRecord.class)) - .collectAsList(); + List actualRecords = + resultDF.sort("c1", "c2").as(Encoders.bean(ThreeColumnRecord.class)).collectAsList(); Assert.assertEquals("Rows must match", expectedRecords, actualRecords); } @@ -472,11 +477,7 @@ private void writeRecords(List records) { } private void writeDF(Dataset df) { - df.select("c1", "c2", "c3") - .write() - .format("iceberg") - .mode("append") - .save(tableLocation); + df.select("c1", "c2", "c3").write().format("iceberg").mode("append").save(tableLocation); } private long computeManifestEntrySizeBytes(List manifests) { @@ -485,7 +486,8 @@ private long computeManifestEntrySizeBytes(List manifests) { for (ManifestFile manifest : manifests) { totalSize += manifest.length(); - numEntries += manifest.addedFilesCount() + manifest.existingFilesCount() + manifest.deletedFilesCount(); + numEntries += + manifest.addedFilesCount() + manifest.existingFilesCount() + manifest.deletedFilesCount(); } return totalSize / numEntries; diff --git a/spark/v3.1/spark/src/test/java/org/apache/iceberg/spark/data/AvroDataTest.java b/spark/v3.1/spark/src/test/java/org/apache/iceberg/spark/data/AvroDataTest.java index ead159477094..2e99ca98ba16 100644 --- a/spark/v3.1/spark/src/test/java/org/apache/iceberg/spark/data/AvroDataTest.java +++ b/spark/v3.1/spark/src/test/java/org/apache/iceberg/spark/data/AvroDataTest.java @@ -16,9 +16,11 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.spark.data; +import static org.apache.iceberg.types.Types.NestedField.optional; +import static org.apache.iceberg.types.Types.NestedField.required; + import java.io.IOException; import java.util.Map; import java.util.concurrent.atomic.AtomicInteger; @@ -38,34 +40,31 @@ import org.junit.Test; import org.junit.rules.TemporaryFolder; -import static org.apache.iceberg.types.Types.NestedField.optional; -import static org.apache.iceberg.types.Types.NestedField.required; - public abstract class AvroDataTest { protected abstract void writeAndValidate(Schema schema) throws IOException; - protected static final StructType SUPPORTED_PRIMITIVES = StructType.of( - required(100, "id", LongType.get()), - optional(101, "data", Types.StringType.get()), - required(102, "b", Types.BooleanType.get()), - optional(103, "i", Types.IntegerType.get()), - required(104, "l", LongType.get()), - optional(105, "f", Types.FloatType.get()), - required(106, "d", Types.DoubleType.get()), - optional(107, "date", Types.DateType.get()), - required(108, "ts", Types.TimestampType.withZone()), - required(110, "s", Types.StringType.get()), - // required(111, "uuid", Types.UUIDType.get()), - required(112, "fixed", Types.FixedType.ofLength(7)), - optional(113, "bytes", Types.BinaryType.get()), - required(114, "dec_9_0", Types.DecimalType.of(9, 0)), - required(115, "dec_11_2", Types.DecimalType.of(11, 2)), - required(116, "dec_38_10", Types.DecimalType.of(38, 10)) // spark's maximum precision - ); - - @Rule - public TemporaryFolder temp = new TemporaryFolder(); + protected static final StructType SUPPORTED_PRIMITIVES = + StructType.of( + required(100, "id", LongType.get()), + optional(101, "data", Types.StringType.get()), + required(102, "b", Types.BooleanType.get()), + optional(103, "i", Types.IntegerType.get()), + required(104, "l", LongType.get()), + optional(105, "f", Types.FloatType.get()), + required(106, "d", Types.DoubleType.get()), + optional(107, "date", Types.DateType.get()), + required(108, "ts", Types.TimestampType.withZone()), + required(110, "s", Types.StringType.get()), + // required(111, "uuid", Types.UUIDType.get()), + required(112, "fixed", Types.FixedType.ofLength(7)), + optional(113, "bytes", Types.BinaryType.get()), + required(114, "dec_9_0", Types.DecimalType.of(9, 0)), + required(115, "dec_11_2", Types.DecimalType.of(11, 2)), + required(116, "dec_38_10", Types.DecimalType.of(38, 10)) // spark's maximum precision + ); + + @Rule public TemporaryFolder temp = new TemporaryFolder(); @Test public void testSimpleStruct() throws IOException { @@ -74,162 +73,208 @@ public void testSimpleStruct() throws IOException { @Test public void testStructWithRequiredFields() throws IOException { - writeAndValidate(TypeUtil.assignIncreasingFreshIds(new Schema( - Lists.transform(SUPPORTED_PRIMITIVES.fields(), Types.NestedField::asRequired)))); + writeAndValidate( + TypeUtil.assignIncreasingFreshIds( + new Schema( + Lists.transform(SUPPORTED_PRIMITIVES.fields(), Types.NestedField::asRequired)))); } @Test public void testStructWithOptionalFields() throws IOException { - writeAndValidate(TypeUtil.assignIncreasingFreshIds(new Schema( - Lists.transform(SUPPORTED_PRIMITIVES.fields(), Types.NestedField::asOptional)))); + writeAndValidate( + TypeUtil.assignIncreasingFreshIds( + new Schema( + Lists.transform(SUPPORTED_PRIMITIVES.fields(), Types.NestedField::asOptional)))); } @Test public void testNestedStruct() throws IOException { - writeAndValidate(TypeUtil.assignIncreasingFreshIds(new Schema(required(1, 
"struct", SUPPORTED_PRIMITIVES)))); + writeAndValidate( + TypeUtil.assignIncreasingFreshIds(new Schema(required(1, "struct", SUPPORTED_PRIMITIVES)))); } @Test public void testArray() throws IOException { - Schema schema = new Schema( - required(0, "id", LongType.get()), - optional(1, "data", ListType.ofOptional(2, Types.StringType.get()))); + Schema schema = + new Schema( + required(0, "id", LongType.get()), + optional(1, "data", ListType.ofOptional(2, Types.StringType.get()))); writeAndValidate(schema); } @Test public void testArrayOfStructs() throws IOException { - Schema schema = TypeUtil.assignIncreasingFreshIds(new Schema( - required(0, "id", LongType.get()), - optional(1, "data", ListType.ofOptional(2, SUPPORTED_PRIMITIVES)))); + Schema schema = + TypeUtil.assignIncreasingFreshIds( + new Schema( + required(0, "id", LongType.get()), + optional(1, "data", ListType.ofOptional(2, SUPPORTED_PRIMITIVES)))); writeAndValidate(schema); } @Test public void testMap() throws IOException { - Schema schema = new Schema( - required(0, "id", LongType.get()), - optional(1, "data", MapType.ofOptional(2, 3, - Types.StringType.get(), - Types.StringType.get()))); + Schema schema = + new Schema( + required(0, "id", LongType.get()), + optional( + 1, + "data", + MapType.ofOptional(2, 3, Types.StringType.get(), Types.StringType.get()))); writeAndValidate(schema); } @Test public void testNumericMapKey() throws IOException { - Schema schema = new Schema( - required(0, "id", LongType.get()), - optional(1, "data", MapType.ofOptional(2, 3, - Types.LongType.get(), - Types.StringType.get()))); + Schema schema = + new Schema( + required(0, "id", LongType.get()), + optional( + 1, "data", MapType.ofOptional(2, 3, Types.LongType.get(), Types.StringType.get()))); writeAndValidate(schema); } @Test public void testComplexMapKey() throws IOException { - Schema schema = new Schema( - required(0, "id", LongType.get()), - optional(1, "data", MapType.ofOptional(2, 3, - Types.StructType.of( - required(4, "i", Types.IntegerType.get()), - optional(5, "s", Types.StringType.get())), - Types.StringType.get()))); + Schema schema = + new Schema( + required(0, "id", LongType.get()), + optional( + 1, + "data", + MapType.ofOptional( + 2, + 3, + Types.StructType.of( + required(4, "i", Types.IntegerType.get()), + optional(5, "s", Types.StringType.get())), + Types.StringType.get()))); writeAndValidate(schema); } @Test public void testMapOfStructs() throws IOException { - Schema schema = TypeUtil.assignIncreasingFreshIds(new Schema( - required(0, "id", LongType.get()), - optional(1, "data", MapType.ofOptional(2, 3, - Types.StringType.get(), - SUPPORTED_PRIMITIVES)))); + Schema schema = + TypeUtil.assignIncreasingFreshIds( + new Schema( + required(0, "id", LongType.get()), + optional( + 1, + "data", + MapType.ofOptional(2, 3, Types.StringType.get(), SUPPORTED_PRIMITIVES)))); writeAndValidate(schema); } @Test public void testMixedTypes() throws IOException { - StructType structType = StructType.of( - required(0, "id", LongType.get()), - optional(1, "list_of_maps", - ListType.ofOptional(2, MapType.ofOptional(3, 4, - Types.StringType.get(), - SUPPORTED_PRIMITIVES))), - optional(5, "map_of_lists", - MapType.ofOptional(6, 7, - Types.StringType.get(), - ListType.ofOptional(8, SUPPORTED_PRIMITIVES))), - required(9, "list_of_lists", - ListType.ofOptional(10, ListType.ofOptional(11, SUPPORTED_PRIMITIVES))), - required(12, "map_of_maps", - MapType.ofOptional(13, 14, - Types.StringType.get(), - MapType.ofOptional(15, 16, + StructType structType = + 
StructType.of( + required(0, "id", LongType.get()), + optional( + 1, + "list_of_maps", + ListType.ofOptional( + 2, MapType.ofOptional(3, 4, Types.StringType.get(), SUPPORTED_PRIMITIVES))), + optional( + 5, + "map_of_lists", + MapType.ofOptional( + 6, 7, Types.StringType.get(), ListType.ofOptional(8, SUPPORTED_PRIMITIVES))), + required( + 9, + "list_of_lists", + ListType.ofOptional(10, ListType.ofOptional(11, SUPPORTED_PRIMITIVES))), + required( + 12, + "map_of_maps", + MapType.ofOptional( + 13, + 14, Types.StringType.get(), - SUPPORTED_PRIMITIVES))), - required(17, "list_of_struct_of_nested_types", ListType.ofOptional(19, StructType.of( - Types.NestedField.required(20, "m1", MapType.ofOptional(21, 22, - Types.StringType.get(), - SUPPORTED_PRIMITIVES)), - Types.NestedField.optional(23, "l1", ListType.ofRequired(24, SUPPORTED_PRIMITIVES)), - Types.NestedField.required(25, "l2", ListType.ofRequired(26, SUPPORTED_PRIMITIVES)), - Types.NestedField.optional(27, "m2", MapType.ofOptional(28, 29, - Types.StringType.get(), - SUPPORTED_PRIMITIVES)) - ))) - ); - - Schema schema = new Schema(TypeUtil.assignFreshIds(structType, new AtomicInteger(0)::incrementAndGet) - .asStructType().fields()); + MapType.ofOptional(15, 16, Types.StringType.get(), SUPPORTED_PRIMITIVES))), + required( + 17, + "list_of_struct_of_nested_types", + ListType.ofOptional( + 19, + StructType.of( + Types.NestedField.required( + 20, + "m1", + MapType.ofOptional( + 21, 22, Types.StringType.get(), SUPPORTED_PRIMITIVES)), + Types.NestedField.optional( + 23, "l1", ListType.ofRequired(24, SUPPORTED_PRIMITIVES)), + Types.NestedField.required( + 25, "l2", ListType.ofRequired(26, SUPPORTED_PRIMITIVES)), + Types.NestedField.optional( + 27, + "m2", + MapType.ofOptional( + 28, 29, Types.StringType.get(), SUPPORTED_PRIMITIVES)))))); + + Schema schema = + new Schema( + TypeUtil.assignFreshIds(structType, new AtomicInteger(0)::incrementAndGet) + .asStructType() + .fields()); writeAndValidate(schema); } @Test public void testTimestampWithoutZone() throws IOException { - withSQLConf(ImmutableMap.of(SparkSQLProperties.HANDLE_TIMESTAMP_WITHOUT_TIMEZONE, "true"), () -> { - Schema schema = TypeUtil.assignIncreasingFreshIds(new Schema( - required(0, "id", LongType.get()), - optional(1, "ts_without_zone", Types.TimestampType.withoutZone()))); - - writeAndValidate(schema); - }); + withSQLConf( + ImmutableMap.of(SparkSQLProperties.HANDLE_TIMESTAMP_WITHOUT_TIMEZONE, "true"), + () -> { + Schema schema = + TypeUtil.assignIncreasingFreshIds( + new Schema( + required(0, "id", LongType.get()), + optional(1, "ts_without_zone", Types.TimestampType.withoutZone()))); + + writeAndValidate(schema); + }); } protected void withSQLConf(Map conf, Action action) throws IOException { SQLConf sqlConf = SQLConf.get(); Map currentConfValues = Maps.newHashMap(); - conf.keySet().forEach(confKey -> { - if (sqlConf.contains(confKey)) { - String currentConfValue = sqlConf.getConfString(confKey); - currentConfValues.put(confKey, currentConfValue); - } - }); - - conf.forEach((confKey, confValue) -> { - if (SQLConf.staticConfKeys().contains(confKey)) { - throw new RuntimeException("Cannot modify the value of a static config: " + confKey); - } - sqlConf.setConfString(confKey, confValue); - }); + conf.keySet() + .forEach( + confKey -> { + if (sqlConf.contains(confKey)) { + String currentConfValue = sqlConf.getConfString(confKey); + currentConfValues.put(confKey, currentConfValue); + } + }); + + conf.forEach( + (confKey, confValue) -> { + if (SQLConf.staticConfKeys().contains(confKey)) 
{ + throw new RuntimeException("Cannot modify the value of a static config: " + confKey); + } + sqlConf.setConfString(confKey, confValue); + }); try { action.invoke(); } finally { - conf.forEach((confKey, confValue) -> { - if (currentConfValues.containsKey(confKey)) { - sqlConf.setConfString(confKey, currentConfValues.get(confKey)); - } else { - sqlConf.unsetConf(confKey); - } - }); + conf.forEach( + (confKey, confValue) -> { + if (currentConfValues.containsKey(confKey)) { + sqlConf.setConfString(confKey, currentConfValues.get(confKey)); + } else { + sqlConf.unsetConf(confKey); + } + }); } } diff --git a/spark/v3.1/spark/src/test/java/org/apache/iceberg/spark/data/GenericsHelpers.java b/spark/v3.1/spark/src/test/java/org/apache/iceberg/spark/data/GenericsHelpers.java index 46c95cef112d..a96e3b1f57f5 100644 --- a/spark/v3.1/spark/src/test/java/org/apache/iceberg/spark/data/GenericsHelpers.java +++ b/spark/v3.1/spark/src/test/java/org/apache/iceberg/spark/data/GenericsHelpers.java @@ -16,9 +16,12 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.data; +import static org.apache.iceberg.spark.SparkSchemaUtil.convert; +import static scala.collection.JavaConverters.mapAsJavaMapConverter; +import static scala.collection.JavaConverters.seqAsJavaListConverter; + import java.math.BigDecimal; import java.nio.ByteBuffer; import java.sql.Timestamp; @@ -48,13 +51,8 @@ import org.junit.Assert; import scala.collection.Seq; -import static org.apache.iceberg.spark.SparkSchemaUtil.convert; -import static scala.collection.JavaConverters.mapAsJavaMapConverter; -import static scala.collection.JavaConverters.seqAsJavaListConverter; - public class GenericsHelpers { - private GenericsHelpers() { - } + private GenericsHelpers() {} private static final OffsetDateTime EPOCH = Instant.ofEpochMilli(0L).atOffset(ZoneOffset.UTC); private static final LocalDate EPOCH_DAY = EPOCH.toLocalDate(); @@ -71,7 +69,8 @@ public static void assertEqualsSafe(Types.StructType struct, Record expected, Ro } } - private static void assertEqualsSafe(Types.ListType list, Collection expected, List actual) { + private static void assertEqualsSafe( + Types.ListType list, Collection expected, List actual) { Type elementType = list.elementType(); List expectedElements = Lists.newArrayList(expected); for (int i = 0; i < expectedElements.size(); i += 1) { @@ -82,11 +81,11 @@ private static void assertEqualsSafe(Types.ListType list, Collection expected } } - private static void assertEqualsSafe(Types.MapType map, - Map expected, Map actual) { + private static void assertEqualsSafe(Types.MapType map, Map expected, Map actual) { Type keyType = map.keyType(); Type valueType = map.valueType(); - Assert.assertEquals("Should have the same number of keys", expected.keySet().size(), actual.keySet().size()); + Assert.assertEquals( + "Should have the same number of keys", expected.keySet().size(), actual.keySet().size()); for (Object expectedKey : expected.keySet()) { Object matchingKey = null; @@ -120,22 +119,29 @@ private static void assertEqualsSafe(Type type, Object expected, Object actual) Assert.assertEquals("Primitive value should be equal to expected", expected, actual); break; case DATE: - Assertions.assertThat(expected).as("Should expect a LocalDate").isInstanceOf(LocalDate.class); + Assertions.assertThat(expected) + .as("Should expect a LocalDate") + .isInstanceOf(LocalDate.class); Assertions.assertThat(actual).as("Should be a Date").isInstanceOf(Date.class); - 
Assert.assertEquals("ISO-8601 date should be equal", expected.toString(), actual.toString()); + Assert.assertEquals( + "ISO-8601 date should be equal", expected.toString(), actual.toString()); break; case TIMESTAMP: Assertions.assertThat(actual).as("Should be a Timestamp").isInstanceOf(Timestamp.class); Timestamp ts = (Timestamp) actual; // milliseconds from nanos has already been added by getTime - OffsetDateTime actualTs = EPOCH.plusNanos( - (ts.getTime() * 1_000_000) + (ts.getNanos() % 1_000_000)); + OffsetDateTime actualTs = + EPOCH.plusNanos((ts.getTime() * 1_000_000) + (ts.getNanos() % 1_000_000)); Types.TimestampType timestampType = (Types.TimestampType) type; if (timestampType.shouldAdjustToUTC()) { - Assertions.assertThat(expected).as("Should expect an OffsetDateTime").isInstanceOf(OffsetDateTime.class); + Assertions.assertThat(expected) + .as("Should expect an OffsetDateTime") + .isInstanceOf(OffsetDateTime.class); Assert.assertEquals("Timestamp should be equal", expected, actualTs); } else { - Assertions.assertThat(expected).as("Should expect an LocalDateTime").isInstanceOf(LocalDateTime.class); + Assertions.assertThat(expected) + .as("Should expect an LocalDateTime") + .isInstanceOf(LocalDateTime.class); Assert.assertEquals("Timestamp should be equal", expected, actualTs.toLocalDateTime()); } break; @@ -146,23 +152,25 @@ private static void assertEqualsSafe(Type type, Object expected, Object actual) case UUID: Assertions.assertThat(expected).as("Should expect a UUID").isInstanceOf(UUID.class); Assertions.assertThat(actual).as("Should be a String").isInstanceOf(String.class); - Assert.assertEquals("UUID string representation should match", - expected.toString(), actual); + Assert.assertEquals("UUID string representation should match", expected.toString(), actual); break; case FIXED: Assertions.assertThat(expected).as("Should expect a byte[]").isInstanceOf(byte[].class); Assertions.assertThat(actual).as("Should be a byte[]").isInstanceOf(byte[].class); - Assert.assertArrayEquals("Bytes should match", - (byte[]) expected, (byte[]) actual); + Assert.assertArrayEquals("Bytes should match", (byte[]) expected, (byte[]) actual); break; case BINARY: - Assertions.assertThat(expected).as("Should expect a ByteBuffer").isInstanceOf(ByteBuffer.class); + Assertions.assertThat(expected) + .as("Should expect a ByteBuffer") + .isInstanceOf(ByteBuffer.class); Assertions.assertThat(actual).as("Should be a byte[]").isInstanceOf(byte[].class); - Assert.assertArrayEquals("Bytes should match", - ((ByteBuffer) expected).array(), (byte[]) actual); + Assert.assertArrayEquals( + "Bytes should match", ((ByteBuffer) expected).array(), (byte[]) actual); break; case DECIMAL: - Assertions.assertThat(expected).as("Should expect a BigDecimal").isInstanceOf(BigDecimal.class); + Assertions.assertThat(expected) + .as("Should expect a BigDecimal") + .isInstanceOf(BigDecimal.class); Assertions.assertThat(actual).as("Should be a BigDecimal").isInstanceOf(BigDecimal.class); Assert.assertEquals("BigDecimals should be equal", expected, actual); break; @@ -172,16 +180,20 @@ private static void assertEqualsSafe(Type type, Object expected, Object actual) assertEqualsSafe(type.asNestedType().asStructType(), (Record) expected, (Row) actual); break; case LIST: - Assertions.assertThat(expected).as("Should expect a Collection").isInstanceOf(Collection.class); + Assertions.assertThat(expected) + .as("Should expect a Collection") + .isInstanceOf(Collection.class); Assertions.assertThat(actual).as("Should be a 
Seq").isInstanceOf(Seq.class); List asList = seqAsJavaListConverter((Seq) actual).asJava(); assertEqualsSafe(type.asNestedType().asListType(), (Collection) expected, asList); break; case MAP: Assertions.assertThat(expected).as("Should expect a Collection").isInstanceOf(Map.class); - Assertions.assertThat(actual).as("Should be a Map").isInstanceOf(scala.collection.Map.class); - Map asMap = mapAsJavaMapConverter( - (scala.collection.Map) actual).asJava(); + Assertions.assertThat(actual) + .as("Should be a Map") + .isInstanceOf(scala.collection.Map.class); + Map asMap = + mapAsJavaMapConverter((scala.collection.Map) actual).asJava(); assertEqualsSafe(type.asNestedType().asMapType(), (Map) expected, asMap); break; case TIME: @@ -190,7 +202,8 @@ private static void assertEqualsSafe(Type type, Object expected, Object actual) } } - public static void assertEqualsUnsafe(Types.StructType struct, Record expected, InternalRow actual) { + public static void assertEqualsUnsafe( + Types.StructType struct, Record expected, InternalRow actual) { List fields = struct.fields(); for (int i = 0; i < fields.size(); i += 1) { Type fieldType = fields.get(i).type(); @@ -202,7 +215,8 @@ public static void assertEqualsUnsafe(Types.StructType struct, Record expected, } } - private static void assertEqualsUnsafe(Types.ListType list, Collection expected, ArrayData actual) { + private static void assertEqualsUnsafe( + Types.ListType list, Collection expected, ArrayData actual) { Type elementType = list.elementType(); List expectedElements = Lists.newArrayList(expected); for (int i = 0; i < expectedElements.size(); i += 1) { @@ -245,20 +259,29 @@ private static void assertEqualsUnsafe(Type type, Object expected, Object actual Assert.assertEquals("Primitive value should be equal to expected", expected, actual); break; case DATE: - Assertions.assertThat(expected).as("Should expect a LocalDate").isInstanceOf(LocalDate.class); + Assertions.assertThat(expected) + .as("Should expect a LocalDate") + .isInstanceOf(LocalDate.class); int expectedDays = (int) ChronoUnit.DAYS.between(EPOCH_DAY, (LocalDate) expected); Assert.assertEquals("Primitive value should be equal to expected", expectedDays, actual); break; case TIMESTAMP: Types.TimestampType timestampType = (Types.TimestampType) type; if (timestampType.shouldAdjustToUTC()) { - Assertions.assertThat(expected).as("Should expect an OffsetDateTime").isInstanceOf(OffsetDateTime.class); + Assertions.assertThat(expected) + .as("Should expect an OffsetDateTime") + .isInstanceOf(OffsetDateTime.class); long expectedMicros = ChronoUnit.MICROS.between(EPOCH, (OffsetDateTime) expected); - Assert.assertEquals("Primitive value should be equal to expected", expectedMicros, actual); + Assert.assertEquals( + "Primitive value should be equal to expected", expectedMicros, actual); } else { - Assertions.assertThat(expected).as("Should expect an LocalDateTime").isInstanceOf(LocalDateTime.class); - long expectedMicros = ChronoUnit.MICROS.between(EPOCH, ((LocalDateTime) expected).atZone(ZoneId.of("UTC"))); - Assert.assertEquals("Primitive value should be equal to expected", expectedMicros, actual); + Assertions.assertThat(expected) + .as("Should expect an LocalDateTime") + .isInstanceOf(LocalDateTime.class); + long expectedMicros = + ChronoUnit.MICROS.between(EPOCH, ((LocalDateTime) expected).atZone(ZoneId.of("UTC"))); + Assert.assertEquals( + "Primitive value should be equal to expected", expectedMicros, actual); } break; case STRING: @@ -268,8 +291,8 @@ private static void assertEqualsUnsafe(Type 
type, Object expected, Object actual case UUID: Assertions.assertThat(expected).as("Should expect a UUID").isInstanceOf(UUID.class); Assertions.assertThat(actual).as("Should be a UTF8String").isInstanceOf(UTF8String.class); - Assert.assertEquals("UUID string representation should match", - expected.toString(), actual.toString()); + Assert.assertEquals( + "UUID string representation should match", expected.toString(), actual.toString()); break; case FIXED: Assertions.assertThat(expected).as("Should expect a byte[]").isInstanceOf(byte[].class); @@ -277,30 +300,42 @@ private static void assertEqualsUnsafe(Type type, Object expected, Object actual Assert.assertArrayEquals("Bytes should match", (byte[]) expected, (byte[]) actual); break; case BINARY: - Assertions.assertThat(expected).as("Should expect a ByteBuffer").isInstanceOf(ByteBuffer.class); + Assertions.assertThat(expected) + .as("Should expect a ByteBuffer") + .isInstanceOf(ByteBuffer.class); Assertions.assertThat(actual).as("Should be a byte[]").isInstanceOf(byte[].class); - Assert.assertArrayEquals("Bytes should match", - ((ByteBuffer) expected).array(), (byte[]) actual); + Assert.assertArrayEquals( + "Bytes should match", ((ByteBuffer) expected).array(), (byte[]) actual); break; case DECIMAL: - Assertions.assertThat(expected).as("Should expect a BigDecimal").isInstanceOf(BigDecimal.class); + Assertions.assertThat(expected) + .as("Should expect a BigDecimal") + .isInstanceOf(BigDecimal.class); Assertions.assertThat(actual).as("Should be a Decimal").isInstanceOf(Decimal.class); - Assert.assertEquals("BigDecimals should be equal", - expected, ((Decimal) actual).toJavaBigDecimal()); + Assert.assertEquals( + "BigDecimals should be equal", expected, ((Decimal) actual).toJavaBigDecimal()); break; case STRUCT: Assertions.assertThat(expected).as("Should expect a Record").isInstanceOf(Record.class); - Assertions.assertThat(actual).as("Should be an InternalRow").isInstanceOf(InternalRow.class); - assertEqualsUnsafe(type.asNestedType().asStructType(), (Record) expected, (InternalRow) actual); + Assertions.assertThat(actual) + .as("Should be an InternalRow") + .isInstanceOf(InternalRow.class); + assertEqualsUnsafe( + type.asNestedType().asStructType(), (Record) expected, (InternalRow) actual); break; case LIST: - Assertions.assertThat(expected).as("Should expect a Collection").isInstanceOf(Collection.class); + Assertions.assertThat(expected) + .as("Should expect a Collection") + .isInstanceOf(Collection.class); Assertions.assertThat(actual).as("Should be an ArrayData").isInstanceOf(ArrayData.class); - assertEqualsUnsafe(type.asNestedType().asListType(), (Collection) expected, (ArrayData) actual); + assertEqualsUnsafe( + type.asNestedType().asListType(), (Collection) expected, (ArrayData) actual); break; case MAP: Assertions.assertThat(expected).as("Should expect a Map").isInstanceOf(Map.class); - Assertions.assertThat(actual).as("Should be an ArrayBasedMapData").isInstanceOf(MapData.class); + Assertions.assertThat(actual) + .as("Should be an ArrayBasedMapData") + .isInstanceOf(MapData.class); assertEqualsUnsafe(type.asNestedType().asMapType(), (Map) expected, (MapData) actual); break; case TIME: diff --git a/spark/v3.1/spark/src/test/java/org/apache/iceberg/spark/data/RandomData.java b/spark/v3.1/spark/src/test/java/org/apache/iceberg/spark/data/RandomData.java index d3bffb75eb5c..1c95df8ced12 100644 --- a/spark/v3.1/spark/src/test/java/org/apache/iceberg/spark/data/RandomData.java +++ 
b/spark/v3.1/spark/src/test/java/org/apache/iceberg/spark/data/RandomData.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.data; import java.math.BigDecimal; @@ -53,8 +52,7 @@ public class RandomData { // Default percentage of number of values that are null for optional fields public static final float DEFAULT_NULL_PERCENTAGE = 0.05f; - private RandomData() { - } + private RandomData() {} public static List generateList(Schema schema, int numRecords, long seed) { RandomDataGenerator generator = new RandomDataGenerator(schema, seed, DEFAULT_NULL_PERCENTAGE); @@ -67,63 +65,71 @@ public static List generateList(Schema schema, int numRecords, long seed } public static Iterable generateSpark(Schema schema, int numRecords, long seed) { - return () -> new Iterator() { - private SparkRandomDataGenerator generator = new SparkRandomDataGenerator(seed); - private int count = 0; - - @Override - public boolean hasNext() { - return count < numRecords; - } - - @Override - public InternalRow next() { - if (count >= numRecords) { - throw new NoSuchElementException(); - } - count += 1; - return (InternalRow) TypeUtil.visit(schema, generator); - } - }; + return () -> + new Iterator() { + private SparkRandomDataGenerator generator = new SparkRandomDataGenerator(seed); + private int count = 0; + + @Override + public boolean hasNext() { + return count < numRecords; + } + + @Override + public InternalRow next() { + if (count >= numRecords) { + throw new NoSuchElementException(); + } + count += 1; + return (InternalRow) TypeUtil.visit(schema, generator); + } + }; } public static Iterable generate(Schema schema, int numRecords, long seed) { - return newIterable(() -> new RandomDataGenerator(schema, seed, DEFAULT_NULL_PERCENTAGE), schema, numRecords); + return newIterable( + () -> new RandomDataGenerator(schema, seed, DEFAULT_NULL_PERCENTAGE), schema, numRecords); } - public static Iterable generate(Schema schema, int numRecords, long seed, float nullPercentage) { - return newIterable(() -> new RandomDataGenerator(schema, seed, nullPercentage), schema, numRecords); + public static Iterable generate( + Schema schema, int numRecords, long seed, float nullPercentage) { + return newIterable( + () -> new RandomDataGenerator(schema, seed, nullPercentage), schema, numRecords); } - public static Iterable generateFallbackData(Schema schema, int numRecords, long seed, long numDictRecords) { - return newIterable(() -> new FallbackDataGenerator(schema, seed, numDictRecords), schema, numRecords); + public static Iterable generateFallbackData( + Schema schema, int numRecords, long seed, long numDictRecords) { + return newIterable( + () -> new FallbackDataGenerator(schema, seed, numDictRecords), schema, numRecords); } public static Iterable generateDictionaryEncodableData( Schema schema, int numRecords, long seed, float nullPercentage) { - return newIterable(() -> new DictionaryEncodedDataGenerator(schema, seed, nullPercentage), schema, numRecords); + return newIterable( + () -> new DictionaryEncodedDataGenerator(schema, seed, nullPercentage), schema, numRecords); } - private static Iterable newIterable(Supplier newGenerator, - Schema schema, int numRecords) { - return () -> new Iterator() { - private int count = 0; - private RandomDataGenerator generator = newGenerator.get(); - - @Override - public boolean hasNext() { - return count < numRecords; - } - - @Override - public Record next() { - if (count >= numRecords) { - throw new 
NoSuchElementException(); - } - count += 1; - return (Record) TypeUtil.visit(schema, generator); - } - }; + private static Iterable newIterable( + Supplier newGenerator, Schema schema, int numRecords) { + return () -> + new Iterator() { + private int count = 0; + private RandomDataGenerator generator = newGenerator.get(); + + @Override + public boolean hasNext() { + return count < numRecords; + } + + @Override + public Record next() { + if (count >= numRecords) { + throw new NoSuchElementException(); + } + count += 1; + return (Record) TypeUtil.visit(schema, generator); + } + }; } private static class RandomDataGenerator extends TypeUtil.CustomOrderSchemaVisitor { @@ -218,8 +224,7 @@ public Object primitive(Type.PrimitiveType primitive) { // them here. switch (primitive.typeId()) { case FIXED: - return new GenericData.Fixed(typeToSchema.get(primitive), - (byte[]) result); + return new GenericData.Fixed(typeToSchema.get(primitive), (byte[]) result); case BINARY: return ByteBuffer.wrap((byte[]) result); case UUID: diff --git a/spark/v3.1/spark/src/test/java/org/apache/iceberg/spark/data/TestHelpers.java b/spark/v3.1/spark/src/test/java/org/apache/iceberg/spark/data/TestHelpers.java index 53d5e8763e6f..42f4c1a1ab42 100644 --- a/spark/v3.1/spark/src/test/java/org/apache/iceberg/spark/data/TestHelpers.java +++ b/spark/v3.1/spark/src/test/java/org/apache/iceberg/spark/data/TestHelpers.java @@ -16,9 +16,12 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.data; +import static org.apache.iceberg.spark.SparkSchemaUtil.convert; +import static scala.collection.JavaConverters.mapAsJavaMapConverter; +import static scala.collection.JavaConverters.seqAsJavaListConverter; + import java.math.BigDecimal; import java.nio.ByteBuffer; import java.sql.Timestamp; @@ -63,14 +66,9 @@ import org.junit.Assert; import scala.collection.Seq; -import static org.apache.iceberg.spark.SparkSchemaUtil.convert; -import static scala.collection.JavaConverters.mapAsJavaMapConverter; -import static scala.collection.JavaConverters.seqAsJavaListConverter; - public class TestHelpers { - private TestHelpers() { - } + private TestHelpers() {} public static void assertEqualsSafe(Types.StructType struct, Record rec, Row row) { List fields = struct.fields(); @@ -84,8 +82,11 @@ public static void assertEqualsSafe(Types.StructType struct, Record rec, Row row } } - public static void assertEqualsBatch(Types.StructType struct, Iterator expected, ColumnarBatch batch, - boolean checkArrowValidityVector) { + public static void assertEqualsBatch( + Types.StructType struct, + Iterator expected, + ColumnarBatch batch, + boolean checkArrowValidityVector) { for (int rowId = 0; rowId < batch.numRows(); rowId++) { List fields = struct.fields(); InternalRow row = batch.getRow(rowId); @@ -98,15 +99,16 @@ public static void assertEqualsBatch(Types.StructType struct, Iterator e if (checkArrowValidityVector) { ColumnVector columnVector = batch.column(i); - ValueVector arrowVector = ((IcebergArrowColumnVector) columnVector).vectorAccessor().getVector(); - Assert.assertFalse("Nullability doesn't match of " + columnVector.dataType(), + ValueVector arrowVector = + ((IcebergArrowColumnVector) columnVector).vectorAccessor().getVector(); + Assert.assertFalse( + "Nullability doesn't match of " + columnVector.dataType(), expectedValue == null ^ arrowVector.isNull(rowId)); } } } } - private static void assertEqualsSafe(Types.ListType list, Collection expected, List actual) { Type elementType = 
list.elementType(); List expectedElements = Lists.newArrayList(expected); @@ -118,8 +120,7 @@ private static void assertEqualsSafe(Types.ListType list, Collection expected } } - private static void assertEqualsSafe(Types.MapType map, - Map expected, Map actual) { + private static void assertEqualsSafe(Types.MapType map, Map expected, Map actual) { Type keyType = map.keyType(); Type valueType = map.valueType(); @@ -178,23 +179,28 @@ private static void assertEqualsSafe(Type type, Object expected, Object actual) case UUID: Assertions.assertThat(expected).as("Should expect a UUID").isInstanceOf(UUID.class); Assertions.assertThat(actual).as("Should be a String").isInstanceOf(String.class); - Assert.assertEquals("UUID string representation should match", - expected.toString(), actual); + Assert.assertEquals("UUID string representation should match", expected.toString(), actual); break; case FIXED: - Assertions.assertThat(expected).as("Should expect a Fixed").isInstanceOf(GenericData.Fixed.class); + Assertions.assertThat(expected) + .as("Should expect a Fixed") + .isInstanceOf(GenericData.Fixed.class); Assertions.assertThat(actual).as("Should be a byte[]").isInstanceOf(byte[].class); - Assert.assertArrayEquals("Bytes should match", - ((GenericData.Fixed) expected).bytes(), (byte[]) actual); + Assert.assertArrayEquals( + "Bytes should match", ((GenericData.Fixed) expected).bytes(), (byte[]) actual); break; case BINARY: - Assertions.assertThat(expected).as("Should expect a ByteBuffer").isInstanceOf(ByteBuffer.class); + Assertions.assertThat(expected) + .as("Should expect a ByteBuffer") + .isInstanceOf(ByteBuffer.class); Assertions.assertThat(actual).as("Should be a byte[]").isInstanceOf(byte[].class); - Assert.assertArrayEquals("Bytes should match", - ((ByteBuffer) expected).array(), (byte[]) actual); + Assert.assertArrayEquals( + "Bytes should match", ((ByteBuffer) expected).array(), (byte[]) actual); break; case DECIMAL: - Assertions.assertThat(expected).as("Should expect a BigDecimal").isInstanceOf(BigDecimal.class); + Assertions.assertThat(expected) + .as("Should expect a BigDecimal") + .isInstanceOf(BigDecimal.class); Assertions.assertThat(actual).as("Should be a BigDecimal").isInstanceOf(BigDecimal.class); Assert.assertEquals("BigDecimals should be equal", expected, actual); break; @@ -204,16 +210,20 @@ private static void assertEqualsSafe(Type type, Object expected, Object actual) assertEqualsSafe(type.asNestedType().asStructType(), (Record) expected, (Row) actual); break; case LIST: - Assertions.assertThat(expected).as("Should expect a Collection").isInstanceOf(Collection.class); + Assertions.assertThat(expected) + .as("Should expect a Collection") + .isInstanceOf(Collection.class); Assertions.assertThat(actual).as("Should be a Seq").isInstanceOf(Seq.class); List asList = seqAsJavaListConverter((Seq) actual).asJava(); assertEqualsSafe(type.asNestedType().asListType(), (Collection) expected, asList); break; case MAP: Assertions.assertThat(expected).as("Should expect a Collection").isInstanceOf(Map.class); - Assertions.assertThat(actual).as("Should be a Map").isInstanceOf(scala.collection.Map.class); - Map asMap = mapAsJavaMapConverter( - (scala.collection.Map) actual).asJava(); + Assertions.assertThat(actual) + .as("Should be a Map") + .isInstanceOf(scala.collection.Map.class); + Map asMap = + mapAsJavaMapConverter((scala.collection.Map) actual).asJava(); assertEqualsSafe(type.asNestedType().asMapType(), (Map) expected, asMap); break; case TIME: @@ -234,7 +244,8 @@ public static void 
assertEqualsUnsafe(Types.StructType struct, Record rec, Inter } } - private static void assertEqualsUnsafe(Types.ListType list, Collection expected, ArrayData actual) { + private static void assertEqualsUnsafe( + Types.ListType list, Collection expected, ArrayData actual) { Type elementType = list.elementType(); List expectedElements = Lists.newArrayList(expected); for (int i = 0; i < expectedElements.size(); i += 1) { @@ -280,8 +291,10 @@ private static void assertEqualsUnsafe(Type type, Object expected, Object actual case DOUBLE: Assertions.assertThat(actual).as("Should be a double").isInstanceOf(Double.class); if (expected instanceof Float) { - Assert.assertEquals("Values didn't match", Double.doubleToLongBits(((Number) expected).doubleValue()), - Double.doubleToLongBits((double) actual)); + Assert.assertEquals( + "Values didn't match", + Double.doubleToLongBits(((Number) expected).doubleValue()), + Double.doubleToLongBits((double) actual)); } else { Assert.assertEquals("Primitive value should be equal to expected", expected, actual); } @@ -300,40 +313,54 @@ private static void assertEqualsUnsafe(Type type, Object expected, Object actual case UUID: Assertions.assertThat(expected).as("Should expect a UUID").isInstanceOf(UUID.class); Assertions.assertThat(actual).as("Should be a UTF8String").isInstanceOf(UTF8String.class); - Assert.assertEquals("UUID string representation should match", - expected.toString(), actual.toString()); + Assert.assertEquals( + "UUID string representation should match", expected.toString(), actual.toString()); break; case FIXED: - Assertions.assertThat(expected).as("Should expect a Fixed").isInstanceOf(GenericData.Fixed.class); + Assertions.assertThat(expected) + .as("Should expect a Fixed") + .isInstanceOf(GenericData.Fixed.class); Assertions.assertThat(actual).as("Should be a byte[]").isInstanceOf(byte[].class); - Assert.assertArrayEquals("Bytes should match", - ((GenericData.Fixed) expected).bytes(), (byte[]) actual); + Assert.assertArrayEquals( + "Bytes should match", ((GenericData.Fixed) expected).bytes(), (byte[]) actual); break; case BINARY: - Assertions.assertThat(expected).as("Should expect a ByteBuffer").isInstanceOf(ByteBuffer.class); + Assertions.assertThat(expected) + .as("Should expect a ByteBuffer") + .isInstanceOf(ByteBuffer.class); Assertions.assertThat(actual).as("Should be a byte[]").isInstanceOf(byte[].class); - Assert.assertArrayEquals("Bytes should match", - ((ByteBuffer) expected).array(), (byte[]) actual); + Assert.assertArrayEquals( + "Bytes should match", ((ByteBuffer) expected).array(), (byte[]) actual); break; case DECIMAL: - Assertions.assertThat(expected).as("Should expect a BigDecimal").isInstanceOf(BigDecimal.class); + Assertions.assertThat(expected) + .as("Should expect a BigDecimal") + .isInstanceOf(BigDecimal.class); Assertions.assertThat(actual).as("Should be a Decimal").isInstanceOf(Decimal.class); - Assert.assertEquals("BigDecimals should be equal", - expected, ((Decimal) actual).toJavaBigDecimal()); + Assert.assertEquals( + "BigDecimals should be equal", expected, ((Decimal) actual).toJavaBigDecimal()); break; case STRUCT: Assertions.assertThat(expected).as("Should expect a Record").isInstanceOf(Record.class); - Assertions.assertThat(actual).as("Should be an InternalRow").isInstanceOf(InternalRow.class); - assertEqualsUnsafe(type.asNestedType().asStructType(), (Record) expected, (InternalRow) actual); + Assertions.assertThat(actual) + .as("Should be an InternalRow") + .isInstanceOf(InternalRow.class); + assertEqualsUnsafe( + 
type.asNestedType().asStructType(), (Record) expected, (InternalRow) actual); break; case LIST: - Assertions.assertThat(expected).as("Should expect a Collection").isInstanceOf(Collection.class); + Assertions.assertThat(expected) + .as("Should expect a Collection") + .isInstanceOf(Collection.class); Assertions.assertThat(actual).as("Should be an ArrayData").isInstanceOf(ArrayData.class); - assertEqualsUnsafe(type.asNestedType().asListType(), (Collection) expected, (ArrayData) actual); + assertEqualsUnsafe( + type.asNestedType().asListType(), (Collection) expected, (ArrayData) actual); break; case MAP: Assertions.assertThat(expected).as("Should expect a Map").isInstanceOf(Map.class); - Assertions.assertThat(actual).as("Should be an ArrayBasedMapData").isInstanceOf(MapData.class); + Assertions.assertThat(actual) + .as("Should be an ArrayBasedMapData") + .isInstanceOf(MapData.class); assertEqualsUnsafe(type.asNestedType().asMapType(), (Map) expected, (MapData) actual); break; case TIME: @@ -344,13 +371,14 @@ private static void assertEqualsUnsafe(Type type, Object expected, Object actual /** * Check that the given InternalRow is equivalent to the Row. + * * @param prefix context for error messages * @param type the type of the row * @param expected the expected value of the row * @param actual the actual value of the row */ - public static void assertEquals(String prefix, Types.StructType type, - InternalRow expected, Row actual) { + public static void assertEquals( + String prefix, Types.StructType type, InternalRow expected, Row actual) { if (expected == null || actual == null) { Assert.assertEquals(prefix, expected, actual); } else { @@ -368,30 +396,41 @@ public static void assertEquals(String prefix, Types.StructType type, case DECIMAL: case DATE: case TIMESTAMP: - Assert.assertEquals(prefix + "." + fieldName + " - " + childType, + Assert.assertEquals( + prefix + "." + fieldName + " - " + childType, getValue(expected, c, childType), getPrimitiveValue(actual, c, childType)); break; case UUID: case FIXED: case BINARY: - assertEqualBytes(prefix + "." + fieldName, + assertEqualBytes( + prefix + "." + fieldName, (byte[]) getValue(expected, c, childType), (byte[]) actual.get(c)); break; - case STRUCT: { - Types.StructType st = (Types.StructType) childType; - assertEquals(prefix + "." + fieldName, st, - expected.getStruct(c, st.fields().size()), actual.getStruct(c)); - break; - } + case STRUCT: + { + Types.StructType st = (Types.StructType) childType; + assertEquals( + prefix + "." + fieldName, + st, + expected.getStruct(c, st.fields().size()), + actual.getStruct(c)); + break; + } case LIST: - assertEqualsLists(prefix + "." + fieldName, childType.asListType(), + assertEqualsLists( + prefix + "." + fieldName, + childType.asListType(), expected.getArray(c), toList((Seq) actual.get(c))); break; case MAP: - assertEqualsMaps(prefix + "." + fieldName, childType.asMapType(), expected.getMap(c), + assertEqualsMaps( + prefix + "." 
+ fieldName, + childType.asMapType(), + expected.getMap(c), toJavaMap((scala.collection.Map) actual.getMap(c))); break; default: @@ -401,8 +440,8 @@ public static void assertEquals(String prefix, Types.StructType type, } } - private static void assertEqualsLists(String prefix, Types.ListType type, - ArrayData expected, List actual) { + private static void assertEqualsLists( + String prefix, Types.ListType type, ArrayData expected, List actual) { if (expected == null || actual == null) { Assert.assertEquals(prefix, expected, actual); } else { @@ -419,31 +458,42 @@ private static void assertEqualsLists(String prefix, Types.ListType type, case DECIMAL: case DATE: case TIMESTAMP: - Assert.assertEquals(prefix + ".elem " + e + " - " + childType, + Assert.assertEquals( + prefix + ".elem " + e + " - " + childType, getValue(expected, e, childType), actual.get(e)); break; case UUID: case FIXED: case BINARY: - assertEqualBytes(prefix + ".elem " + e, + assertEqualBytes( + prefix + ".elem " + e, (byte[]) getValue(expected, e, childType), (byte[]) actual.get(e)); break; - case STRUCT: { - Types.StructType st = (Types.StructType) childType; - assertEquals(prefix + ".elem " + e, st, - expected.getStruct(e, st.fields().size()), (Row) actual.get(e)); - break; - } + case STRUCT: + { + Types.StructType st = (Types.StructType) childType; + assertEquals( + prefix + ".elem " + e, + st, + expected.getStruct(e, st.fields().size()), + (Row) actual.get(e)); + break; + } case LIST: - assertEqualsLists(prefix + ".elem " + e, childType.asListType(), + assertEqualsLists( + prefix + ".elem " + e, + childType.asListType(), expected.getArray(e), toList((Seq) actual.get(e))); break; case MAP: - assertEqualsMaps(prefix + ".elem " + e, childType.asMapType(), - expected.getMap(e), toJavaMap((scala.collection.Map) actual.get(e))); + assertEqualsMaps( + prefix + ".elem " + e, + childType.asMapType(), + expected.getMap(e), + toJavaMap((scala.collection.Map) actual.get(e))); break; default: throw new IllegalArgumentException("Unhandled type " + childType); @@ -452,8 +502,8 @@ private static void assertEqualsLists(String prefix, Types.ListType type, } } - private static void assertEqualsMaps(String prefix, Types.MapType type, - MapData expected, Map actual) { + private static void assertEqualsMaps( + String prefix, Types.MapType type, MapData expected, Map actual) { if (expected == null || actual == null) { Assert.assertEquals(prefix, expected, actual); } else { @@ -466,7 +516,9 @@ private static void assertEqualsMaps(String prefix, Types.MapType type, Object expectedKey = getValue(expectedKeyArray, e, keyType); Object actualValue = actual.get(expectedKey); if (actualValue == null) { - Assert.assertEquals(prefix + ".key=" + expectedKey + " has null", true, + Assert.assertEquals( + prefix + ".key=" + expectedKey + " has null", + true, expected.valueArray().isNullAt(e)); } else { switch (valueType.typeId()) { @@ -479,32 +531,40 @@ private static void assertEqualsMaps(String prefix, Types.MapType type, case DECIMAL: case DATE: case TIMESTAMP: - Assert.assertEquals(prefix + ".key=" + expectedKey + " - " + valueType, + Assert.assertEquals( + prefix + ".key=" + expectedKey + " - " + valueType, getValue(expectedValueArray, e, valueType), actual.get(expectedKey)); break; case UUID: case FIXED: case BINARY: - assertEqualBytes(prefix + ".key=" + expectedKey, + assertEqualBytes( + prefix + ".key=" + expectedKey, (byte[]) getValue(expectedValueArray, e, valueType), (byte[]) actual.get(expectedKey)); break; - case STRUCT: { - Types.StructType 
st = (Types.StructType) valueType; - assertEquals(prefix + ".key=" + expectedKey, st, - expectedValueArray.getStruct(e, st.fields().size()), - (Row) actual.get(expectedKey)); - break; - } + case STRUCT: + { + Types.StructType st = (Types.StructType) valueType; + assertEquals( + prefix + ".key=" + expectedKey, + st, + expectedValueArray.getStruct(e, st.fields().size()), + (Row) actual.get(expectedKey)); + break; + } case LIST: - assertEqualsLists(prefix + ".key=" + expectedKey, + assertEqualsLists( + prefix + ".key=" + expectedKey, valueType.asListType(), expectedValueArray.getArray(e), toList((Seq) actual.get(expectedKey))); break; case MAP: - assertEqualsMaps(prefix + ".key=" + expectedKey, valueType.asMapType(), + assertEqualsMaps( + prefix + ".key=" + expectedKey, + valueType.asMapType(), expectedValueArray.getMap(e), toJavaMap((scala.collection.Map) actual.get(expectedKey))); break; @@ -516,8 +576,7 @@ private static void assertEqualsMaps(String prefix, Types.MapType type, } } - private static Object getValue(SpecializedGetters container, int ord, - Type type) { + private static Object getValue(SpecializedGetters container, int ord, Type type) { if (container.isNullAt(ord)) { return null; } @@ -542,10 +601,11 @@ private static Object getValue(SpecializedGetters container, int ord, return new DateWritable(container.getInt(ord)).get(); case TIMESTAMP: return DateTimeUtils.toJavaTimestamp(container.getLong(ord)); - case DECIMAL: { - Types.DecimalType dt = (Types.DecimalType) type; - return container.getDecimal(ord, dt.precision(), dt.scale()).toJavaBigDecimal(); - } + case DECIMAL: + { + Types.DecimalType dt = (Types.DecimalType) type; + return container.getDecimal(ord, dt.precision(), dt.scale()).toJavaBigDecimal(); + } case STRUCT: Types.StructType struct = type.asStructType(); InternalRow internalRow = container.getStruct(ord, struct.fields().size()); @@ -603,8 +663,7 @@ private static List toList(Seq val) { return val == null ? 
null : seqAsJavaListConverter(val).asJava(); } - private static void assertEqualBytes(String context, byte[] expected, - byte[] actual) { + private static void assertEqualBytes(String context, byte[] expected, byte[] actual) { if (expected == null || actual == null) { Assert.assertEquals(context, expected, actual); } else { @@ -622,23 +681,29 @@ private static void assertEquals(String context, DataType type, Object expected, } if (type instanceof StructType) { - Assertions.assertThat(expected).as("Expected should be an InternalRow: " + context) + Assertions.assertThat(expected) + .as("Expected should be an InternalRow: " + context) .isInstanceOf(InternalRow.class); - Assertions.assertThat(actual).as("Actual should be an InternalRow: " + context) + Assertions.assertThat(actual) + .as("Actual should be an InternalRow: " + context) .isInstanceOf(InternalRow.class); assertEquals(context, (StructType) type, (InternalRow) expected, (InternalRow) actual); } else if (type instanceof ArrayType) { - Assertions.assertThat(expected).as("Expected should be an ArrayData: " + context) + Assertions.assertThat(expected) + .as("Expected should be an ArrayData: " + context) .isInstanceOf(ArrayData.class); - Assertions.assertThat(actual).as("Actual should be an ArrayData: " + context) + Assertions.assertThat(actual) + .as("Actual should be an ArrayData: " + context) .isInstanceOf(ArrayData.class); assertEquals(context, (ArrayType) type, (ArrayData) expected, (ArrayData) actual); } else if (type instanceof MapType) { - Assertions.assertThat(expected).as("Expected should be a MapData: " + context) + Assertions.assertThat(expected) + .as("Expected should be a MapData: " + context) .isInstanceOf(MapData.class); - Assertions.assertThat(actual).as("Actual should be a MapData: " + context) + Assertions.assertThat(actual) + .as("Actual should be a MapData: " + context) .isInstanceOf(MapData.class); assertEquals(context, (MapType) type, (MapData) expected, (MapData) actual); @@ -649,32 +714,37 @@ private static void assertEquals(String context, DataType type, Object expected, } } - private static void assertEquals(String context, StructType struct, - InternalRow expected, InternalRow actual) { + private static void assertEquals( + String context, StructType struct, InternalRow expected, InternalRow actual) { Assert.assertEquals("Should have correct number of fields", struct.size(), actual.numFields()); for (int i = 0; i < actual.numFields(); i += 1) { StructField field = struct.fields()[i]; DataType type = field.dataType(); - assertEquals(context + "." + field.name(), type, + assertEquals( + context + "." + field.name(), + type, expected.isNullAt(i) ? null : expected.get(i, type), actual.isNullAt(i) ? null : actual.get(i, type)); } } - private static void assertEquals(String context, ArrayType array, ArrayData expected, ArrayData actual) { - Assert.assertEquals("Should have the same number of elements", - expected.numElements(), actual.numElements()); + private static void assertEquals( + String context, ArrayType array, ArrayData expected, ArrayData actual) { + Assert.assertEquals( + "Should have the same number of elements", expected.numElements(), actual.numElements()); DataType type = array.elementType(); for (int i = 0; i < actual.numElements(); i += 1) { - assertEquals(context + ".element", type, + assertEquals( + context + ".element", + type, expected.isNullAt(i) ? null : expected.get(i, type), actual.isNullAt(i) ? 
null : actual.get(i, type)); } } private static void assertEquals(String context, MapType map, MapData expected, MapData actual) { - Assert.assertEquals("Should have the same number of elements", - expected.numElements(), actual.numElements()); + Assert.assertEquals( + "Should have the same number of elements", expected.numElements(), actual.numElements()); DataType keyType = map.keyType(); ArrayData expectedKeys = expected.keyArray(); @@ -685,10 +755,14 @@ private static void assertEquals(String context, MapType map, MapData expected, ArrayData actualValues = actual.valueArray(); for (int i = 0; i < actual.numElements(); i += 1) { - assertEquals(context + ".key", keyType, + assertEquals( + context + ".key", + keyType, expectedKeys.isNullAt(i) ? null : expectedKeys.get(i, keyType), actualKeys.isNullAt(i) ? null : actualKeys.get(i, keyType)); - assertEquals(context + ".value", valueType, + assertEquals( + context + ".value", + valueType, expectedValues.isNullAt(i) ? null : expectedValues.get(i, valueType), actualValues.isNullAt(i) ? null : actualValues.get(i, valueType)); } diff --git a/spark/v3.1/spark/src/test/java/org/apache/iceberg/spark/data/TestOrcWrite.java b/spark/v3.1/spark/src/test/java/org/apache/iceberg/spark/data/TestOrcWrite.java index 7cf9b9c736c6..1e51a088390e 100644 --- a/spark/v3.1/spark/src/test/java/org/apache/iceberg/spark/data/TestOrcWrite.java +++ b/spark/v3.1/spark/src/test/java/org/apache/iceberg/spark/data/TestOrcWrite.java @@ -16,9 +16,10 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.data; +import static org.apache.iceberg.types.Types.NestedField.optional; + import java.io.File; import java.io.IOException; import org.apache.iceberg.Files; @@ -32,16 +33,12 @@ import org.junit.Test; import org.junit.rules.TemporaryFolder; -import static org.apache.iceberg.types.Types.NestedField.optional; - public class TestOrcWrite { - @Rule - public TemporaryFolder temp = new TemporaryFolder(); + @Rule public TemporaryFolder temp = new TemporaryFolder(); - private static final Schema SCHEMA = new Schema( - optional(1, "id", Types.IntegerType.get()), - optional(2, "data", Types.StringType.get()) - ); + private static final Schema SCHEMA = + new Schema( + optional(1, "id", Types.IntegerType.get()), optional(2, "data", Types.StringType.get())); @Test public void splitOffsets() throws IOException { @@ -49,10 +46,11 @@ public void splitOffsets() throws IOException { Assert.assertTrue("Delete should succeed", testFile.delete()); Iterable rows = RandomData.generateSpark(SCHEMA, 1, 0L); - FileAppender writer = ORC.write(Files.localOutput(testFile)) - .createWriterFunc(SparkOrcWriter::new) - .schema(SCHEMA) - .build(); + FileAppender writer = + ORC.write(Files.localOutput(testFile)) + .createWriterFunc(SparkOrcWriter::new) + .schema(SCHEMA) + .build(); writer.addAll(rows); writer.close(); diff --git a/spark/v3.1/spark/src/test/java/org/apache/iceberg/spark/data/TestParquetAvroReader.java b/spark/v3.1/spark/src/test/java/org/apache/iceberg/spark/data/TestParquetAvroReader.java index 464e3165583c..a4ffc2fea437 100644 --- a/spark/v3.1/spark/src/test/java/org/apache/iceberg/spark/data/TestParquetAvroReader.java +++ b/spark/v3.1/spark/src/test/java/org/apache/iceberg/spark/data/TestParquetAvroReader.java @@ -16,9 +16,11 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.spark.data; +import static org.apache.iceberg.types.Types.NestedField.optional; +import static org.apache.iceberg.types.Types.NestedField.required; + import java.io.File; import java.io.IOException; import java.util.Iterator; @@ -38,54 +40,68 @@ import org.junit.Test; import org.junit.rules.TemporaryFolder; -import static org.apache.iceberg.types.Types.NestedField.optional; -import static org.apache.iceberg.types.Types.NestedField.required; - public class TestParquetAvroReader { - @Rule - public TemporaryFolder temp = new TemporaryFolder(); - - private static final Schema COMPLEX_SCHEMA = new Schema( - required(1, "roots", Types.LongType.get()), - optional(3, "lime", Types.ListType.ofRequired(4, Types.DoubleType.get())), - required(5, "strict", Types.StructType.of( - required(9, "tangerine", Types.StringType.get()), - optional(6, "hopeful", Types.StructType.of( - required(7, "steel", Types.FloatType.get()), - required(8, "lantern", Types.DateType.get()) - )), - optional(10, "vehement", Types.LongType.get()) - )), - optional(11, "metamorphosis", Types.MapType.ofRequired(12, 13, - Types.StringType.get(), Types.TimestampType.withoutZone())), - required(14, "winter", Types.ListType.ofOptional(15, Types.StructType.of( - optional(16, "beet", Types.DoubleType.get()), - required(17, "stamp", Types.TimeType.get()), - optional(18, "wheeze", Types.StringType.get()) - ))), - optional(19, "renovate", Types.MapType.ofRequired(20, 21, - Types.StringType.get(), Types.StructType.of( - optional(22, "jumpy", Types.DoubleType.get()), - required(23, "koala", Types.TimeType.get()), - required(24, "couch rope", Types.IntegerType.get()) - ))), - optional(2, "slide", Types.StringType.get()) - ); + @Rule public TemporaryFolder temp = new TemporaryFolder(); + + private static final Schema COMPLEX_SCHEMA = + new Schema( + required(1, "roots", Types.LongType.get()), + optional(3, "lime", Types.ListType.ofRequired(4, Types.DoubleType.get())), + required( + 5, + "strict", + Types.StructType.of( + required(9, "tangerine", Types.StringType.get()), + optional( + 6, + "hopeful", + Types.StructType.of( + required(7, "steel", Types.FloatType.get()), + required(8, "lantern", Types.DateType.get()))), + optional(10, "vehement", Types.LongType.get()))), + optional( + 11, + "metamorphosis", + Types.MapType.ofRequired( + 12, 13, Types.StringType.get(), Types.TimestampType.withoutZone())), + required( + 14, + "winter", + Types.ListType.ofOptional( + 15, + Types.StructType.of( + optional(16, "beet", Types.DoubleType.get()), + required(17, "stamp", Types.TimeType.get()), + optional(18, "wheeze", Types.StringType.get())))), + optional( + 19, + "renovate", + Types.MapType.ofRequired( + 20, + 21, + Types.StringType.get(), + Types.StructType.of( + optional(22, "jumpy", Types.DoubleType.get()), + required(23, "koala", Types.TimeType.get()), + required(24, "couch rope", Types.IntegerType.get())))), + optional(2, "slide", Types.StringType.get())); @Ignore public void testStructSchema() throws IOException { - Schema structSchema = new Schema( - required(1, "circumvent", Types.LongType.get()), - optional(2, "antarctica", Types.StringType.get()), - optional(3, "fluent", Types.DoubleType.get()), - required(4, "quell", Types.StructType.of( - required(5, "operator", Types.BooleanType.get()), - optional(6, "fanta", Types.IntegerType.get()), - optional(7, "cable", Types.FloatType.get()) - )), - required(8, "chimney", Types.TimestampType.withZone()), - required(9, "wool", Types.DateType.get()) - ); + Schema 
structSchema = + new Schema( + required(1, "circumvent", Types.LongType.get()), + optional(2, "antarctica", Types.StringType.get()), + optional(3, "fluent", Types.DoubleType.get()), + required( + 4, + "quell", + Types.StructType.of( + required(5, "operator", Types.BooleanType.get()), + optional(6, "fanta", Types.IntegerType.get()), + optional(7, "cable", Types.FloatType.get()))), + required(8, "chimney", Types.TimestampType.withZone()), + required(9, "wool", Types.DateType.get())); File testFile = writeTestData(structSchema, 5_000_000, 1059); // RandomData uses the root record name "test", which must match for records to be equal @@ -100,11 +116,12 @@ public void testStructSchema() throws IOException { // clean up as much memory as possible to avoid a large GC during the timed run System.gc(); - try (CloseableIterable reader = Parquet.read(Files.localInput(testFile)) - .project(structSchema) - .createReaderFunc( - fileSchema -> ParquetAvroValueReaders.buildReader(structSchema, readSchema)) - .build()) { + try (CloseableIterable reader = + Parquet.read(Files.localInput(testFile)) + .project(structSchema) + .createReaderFunc( + fileSchema -> ParquetAvroValueReaders.buildReader(structSchema, readSchema)) + .build()) { long start = System.currentTimeMillis(); long val = 0; long count = 0; @@ -137,9 +154,8 @@ public void testWithOldReadPath() throws IOException { // clean up as much memory as possible to avoid a large GC during the timed run System.gc(); - try (CloseableIterable reader = Parquet.read(Files.localInput(testFile)) - .project(COMPLEX_SCHEMA) - .build()) { + try (CloseableIterable reader = + Parquet.read(Files.localInput(testFile)).project(COMPLEX_SCHEMA).build()) { long start = System.currentTimeMillis(); long val = 0; long count = 0; @@ -154,11 +170,12 @@ public void testWithOldReadPath() throws IOException { // clean up as much memory as possible to avoid a large GC during the timed run System.gc(); - try (CloseableIterable reader = Parquet.read(Files.localInput(testFile)) - .project(COMPLEX_SCHEMA) - .createReaderFunc( - fileSchema -> ParquetAvroValueReaders.buildReader(COMPLEX_SCHEMA, readSchema)) - .build()) { + try (CloseableIterable reader = + Parquet.read(Files.localInput(testFile)) + .project(COMPLEX_SCHEMA) + .createReaderFunc( + fileSchema -> ParquetAvroValueReaders.buildReader(COMPLEX_SCHEMA, readSchema)) + .build()) { long start = System.currentTimeMillis(); long val = 0; long count = 0; @@ -179,9 +196,8 @@ public void testCorrectness() throws IOException { File testFile = temp.newFile(); Assert.assertTrue("Delete should succeed", testFile.delete()); - try (FileAppender writer = Parquet.write(Files.localOutput(testFile)) - .schema(COMPLEX_SCHEMA) - .build()) { + try (FileAppender writer = + Parquet.write(Files.localOutput(testFile)).schema(COMPLEX_SCHEMA).build()) { writer.addAll(records); } @@ -189,12 +205,13 @@ public void testCorrectness() throws IOException { MessageType readSchema = ParquetSchemaUtil.convert(COMPLEX_SCHEMA, "test"); // verify that the new read path is correct - try (CloseableIterable reader = Parquet.read(Files.localInput(testFile)) - .project(COMPLEX_SCHEMA) - .createReaderFunc( - fileSchema -> ParquetAvroValueReaders.buildReader(COMPLEX_SCHEMA, readSchema)) - .reuseContainers() - .build()) { + try (CloseableIterable reader = + Parquet.read(Files.localInput(testFile)) + .project(COMPLEX_SCHEMA) + .createReaderFunc( + fileSchema -> ParquetAvroValueReaders.buildReader(COMPLEX_SCHEMA, readSchema)) + .reuseContainers() + .build()) { int recordNum = 
0; Iterator iter = records.iterator(); for (Record actual : reader) { @@ -209,9 +226,8 @@ private File writeTestData(Schema schema, int numRecords, int seed) throws IOExc File testFile = temp.newFile(); Assert.assertTrue("Delete should succeed", testFile.delete()); - try (FileAppender writer = Parquet.write(Files.localOutput(testFile)) - .schema(schema) - .build()) { + try (FileAppender writer = + Parquet.write(Files.localOutput(testFile)).schema(schema).build()) { writer.addAll(RandomData.generate(schema, numRecords, seed)); } diff --git a/spark/v3.1/spark/src/test/java/org/apache/iceberg/spark/data/TestParquetAvroWriter.java b/spark/v3.1/spark/src/test/java/org/apache/iceberg/spark/data/TestParquetAvroWriter.java index dcfc873a5a67..15c6268da478 100644 --- a/spark/v3.1/spark/src/test/java/org/apache/iceberg/spark/data/TestParquetAvroWriter.java +++ b/spark/v3.1/spark/src/test/java/org/apache/iceberg/spark/data/TestParquetAvroWriter.java @@ -16,9 +16,11 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.data; +import static org.apache.iceberg.types.Types.NestedField.optional; +import static org.apache.iceberg.types.Types.NestedField.required; + import java.io.File; import java.io.IOException; import java.util.Iterator; @@ -38,39 +40,51 @@ import org.junit.Test; import org.junit.rules.TemporaryFolder; -import static org.apache.iceberg.types.Types.NestedField.optional; -import static org.apache.iceberg.types.Types.NestedField.required; - public class TestParquetAvroWriter { - @Rule - public TemporaryFolder temp = new TemporaryFolder(); + @Rule public TemporaryFolder temp = new TemporaryFolder(); - private static final Schema COMPLEX_SCHEMA = new Schema( - required(1, "roots", Types.LongType.get()), - optional(3, "lime", Types.ListType.ofRequired(4, Types.DoubleType.get())), - required(5, "strict", Types.StructType.of( - required(9, "tangerine", Types.StringType.get()), - optional(6, "hopeful", Types.StructType.of( - required(7, "steel", Types.FloatType.get()), - required(8, "lantern", Types.DateType.get()) - )), - optional(10, "vehement", Types.LongType.get()) - )), - optional(11, "metamorphosis", Types.MapType.ofRequired(12, 13, - Types.StringType.get(), Types.TimestampType.withoutZone())), - required(14, "winter", Types.ListType.ofOptional(15, Types.StructType.of( - optional(16, "beet", Types.DoubleType.get()), - required(17, "stamp", Types.TimeType.get()), - optional(18, "wheeze", Types.StringType.get()) - ))), - optional(19, "renovate", Types.MapType.ofRequired(20, 21, - Types.StringType.get(), Types.StructType.of( - optional(22, "jumpy", Types.DoubleType.get()), - required(23, "koala", Types.TimeType.get()), - required(24, "couch rope", Types.IntegerType.get()) - ))), - optional(2, "slide", Types.StringType.get()) - ); + private static final Schema COMPLEX_SCHEMA = + new Schema( + required(1, "roots", Types.LongType.get()), + optional(3, "lime", Types.ListType.ofRequired(4, Types.DoubleType.get())), + required( + 5, + "strict", + Types.StructType.of( + required(9, "tangerine", Types.StringType.get()), + optional( + 6, + "hopeful", + Types.StructType.of( + required(7, "steel", Types.FloatType.get()), + required(8, "lantern", Types.DateType.get()))), + optional(10, "vehement", Types.LongType.get()))), + optional( + 11, + "metamorphosis", + Types.MapType.ofRequired( + 12, 13, Types.StringType.get(), Types.TimestampType.withoutZone())), + required( + 14, + "winter", + Types.ListType.ofOptional( + 15, + 
Types.StructType.of( + optional(16, "beet", Types.DoubleType.get()), + required(17, "stamp", Types.TimeType.get()), + optional(18, "wheeze", Types.StringType.get())))), + optional( + 19, + "renovate", + Types.MapType.ofRequired( + 20, + 21, + Types.StringType.get(), + Types.StructType.of( + optional(22, "jumpy", Types.DoubleType.get()), + required(23, "koala", Types.TimeType.get()), + required(24, "couch rope", Types.IntegerType.get())))), + optional(2, "slide", Types.StringType.get())); @Test public void testCorrectness() throws IOException { @@ -79,10 +93,11 @@ public void testCorrectness() throws IOException { File testFile = temp.newFile(); Assert.assertTrue("Delete should succeed", testFile.delete()); - try (FileAppender writer = Parquet.write(Files.localOutput(testFile)) - .schema(COMPLEX_SCHEMA) - .createWriterFunc(ParquetAvroWriter::buildWriter) - .build()) { + try (FileAppender writer = + Parquet.write(Files.localOutput(testFile)) + .schema(COMPLEX_SCHEMA) + .createWriterFunc(ParquetAvroWriter::buildWriter) + .build()) { writer.addAll(records); } @@ -90,11 +105,12 @@ public void testCorrectness() throws IOException { MessageType readSchema = ParquetSchemaUtil.convert(COMPLEX_SCHEMA, "test"); // verify that the new read path is correct - try (CloseableIterable reader = Parquet.read(Files.localInput(testFile)) - .project(COMPLEX_SCHEMA) - .createReaderFunc( - fileSchema -> ParquetAvroValueReaders.buildReader(COMPLEX_SCHEMA, readSchema)) - .build()) { + try (CloseableIterable reader = + Parquet.read(Files.localInput(testFile)) + .project(COMPLEX_SCHEMA) + .createReaderFunc( + fileSchema -> ParquetAvroValueReaders.buildReader(COMPLEX_SCHEMA, readSchema)) + .build()) { int recordNum = 0; Iterator iter = records.iterator(); for (Record actual : reader) { diff --git a/spark/v3.1/spark/src/test/java/org/apache/iceberg/spark/data/TestSparkAvroEnums.java b/spark/v3.1/spark/src/test/java/org/apache/iceberg/spark/data/TestSparkAvroEnums.java index 3517c32ffebb..6f05a9ed7c1f 100644 --- a/spark/v3.1/spark/src/test/java/org/apache/iceberg/spark/data/TestSparkAvroEnums.java +++ b/spark/v3.1/spark/src/test/java/org/apache/iceberg/spark/data/TestSparkAvroEnums.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.spark.data; import java.io.File; @@ -42,20 +41,20 @@ public class TestSparkAvroEnums { - @Rule - public TemporaryFolder temp = new TemporaryFolder(); + @Rule public TemporaryFolder temp = new TemporaryFolder(); @Test public void writeAndValidateEnums() throws IOException { - org.apache.avro.Schema avroSchema = SchemaBuilder.record("root") - .fields() - .name("enumCol") - .type() - .nullable() - .enumeration("testEnum") - .symbols("SYMB1", "SYMB2") - .enumDefault("SYMB2") - .endRecord(); + org.apache.avro.Schema avroSchema = + SchemaBuilder.record("root") + .fields() + .name("enumCol") + .type() + .nullable() + .enumeration("testEnum") + .symbols("SYMB1", "SYMB2") + .enumDefault("SYMB2") + .endRecord(); org.apache.avro.Schema enumSchema = avroSchema.getField("enumCol").schema().getTypes().get(0); Record enumRecord1 = new GenericData.Record(avroSchema); @@ -77,10 +76,11 @@ public void writeAndValidateEnums() throws IOException { Schema schema = new Schema(AvroSchemaUtil.convert(avroSchema).asStructType().fields()); List rows; - try (AvroIterable reader = Avro.read(Files.localInput(testFile)) - .createReaderFunc(SparkAvroReader::new) - .project(schema) - .build()) { + try (AvroIterable reader = + Avro.read(Files.localInput(testFile)) + .createReaderFunc(SparkAvroReader::new) + .project(schema) + .build()) { rows = Lists.newArrayList(reader); } @@ -88,7 +88,8 @@ public void writeAndValidateEnums() throws IOException { for (int i = 0; i < expected.size(); i += 1) { String expectedEnumString = expected.get(i).get("enumCol") == null ? null : expected.get(i).get("enumCol").toString(); - String sparkString = rows.get(i).getUTF8String(0) == null ? null : rows.get(i).getUTF8String(0).toString(); + String sparkString = + rows.get(i).getUTF8String(0) == null ? null : rows.get(i).getUTF8String(0).toString(); Assert.assertEquals(expectedEnumString, sparkString); } } diff --git a/spark/v3.1/spark/src/test/java/org/apache/iceberg/spark/data/TestSparkAvroReader.java b/spark/v3.1/spark/src/test/java/org/apache/iceberg/spark/data/TestSparkAvroReader.java index e4398df39cc8..6d1ef3db3657 100644 --- a/spark/v3.1/spark/src/test/java/org/apache/iceberg/spark/data/TestSparkAvroReader.java +++ b/spark/v3.1/spark/src/test/java/org/apache/iceberg/spark/data/TestSparkAvroReader.java @@ -16,9 +16,10 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.spark.data; +import static org.apache.iceberg.spark.data.TestHelpers.assertEqualsUnsafe; + import java.io.File; import java.io.IOException; import java.util.List; @@ -32,8 +33,6 @@ import org.apache.spark.sql.catalyst.InternalRow; import org.junit.Assert; -import static org.apache.iceberg.spark.data.TestHelpers.assertEqualsUnsafe; - public class TestSparkAvroReader extends AvroDataTest { @Override protected void writeAndValidate(Schema schema) throws IOException { @@ -42,20 +41,19 @@ protected void writeAndValidate(Schema schema) throws IOException { File testFile = temp.newFile(); Assert.assertTrue("Delete should succeed", testFile.delete()); - try (FileAppender writer = Avro.write(Files.localOutput(testFile)) - .schema(schema) - .named("test") - .build()) { + try (FileAppender writer = + Avro.write(Files.localOutput(testFile)).schema(schema).named("test").build()) { for (Record rec : expected) { writer.add(rec); } } List rows; - try (AvroIterable reader = Avro.read(Files.localInput(testFile)) - .createReaderFunc(SparkAvroReader::new) - .project(schema) - .build()) { + try (AvroIterable reader = + Avro.read(Files.localInput(testFile)) + .createReaderFunc(SparkAvroReader::new) + .project(schema) + .build()) { rows = Lists.newArrayList(reader); } diff --git a/spark/v3.1/spark/src/test/java/org/apache/iceberg/spark/data/TestSparkDateTimes.java b/spark/v3.1/spark/src/test/java/org/apache/iceberg/spark/data/TestSparkDateTimes.java index b67e57310b4c..b31ea8fd277d 100644 --- a/spark/v3.1/spark/src/test/java/org/apache/iceberg/spark/data/TestSparkDateTimes.java +++ b/spark/v3.1/spark/src/test/java/org/apache/iceberg/spark/data/TestSparkDateTimes.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.data; import java.time.ZoneId; @@ -69,7 +68,7 @@ public void checkSparkTimestamp(String timestampString, String sparkRepr) { ZoneId zoneId = DateTimeUtils.getZoneId("UTC"); TimestampFormatter formatter = TimestampFormatter.getFractionFormatter(zoneId); String sparkTimestamp = formatter.format(ts.value()); - Assert.assertEquals("Should be the same timestamp (" + ts.value() + ")", - sparkRepr, sparkTimestamp); + Assert.assertEquals( + "Should be the same timestamp (" + ts.value() + ")", sparkRepr, sparkTimestamp); } } diff --git a/spark/v3.1/spark/src/test/java/org/apache/iceberg/spark/data/TestSparkOrcReadMetadataColumns.java b/spark/v3.1/spark/src/test/java/org/apache/iceberg/spark/data/TestSparkOrcReadMetadataColumns.java index b8ee56370edf..3c9037adc393 100644 --- a/spark/v3.1/spark/src/test/java/org/apache/iceberg/spark/data/TestSparkOrcReadMetadataColumns.java +++ b/spark/v3.1/spark/src/test/java/org/apache/iceberg/spark/data/TestSparkOrcReadMetadataColumns.java @@ -16,9 +16,10 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.spark.data; +import static org.apache.iceberg.types.Types.NestedField.required; + import java.io.File; import java.io.IOException; import java.util.Iterator; @@ -57,21 +58,18 @@ import org.junit.runner.RunWith; import org.junit.runners.Parameterized; -import static org.apache.iceberg.types.Types.NestedField.required; - @RunWith(Parameterized.class) public class TestSparkOrcReadMetadataColumns { - private static final Schema DATA_SCHEMA = new Schema( - required(100, "id", Types.LongType.get()), - required(101, "data", Types.StringType.get()) - ); - - private static final Schema PROJECTION_SCHEMA = new Schema( - required(100, "id", Types.LongType.get()), - required(101, "data", Types.StringType.get()), - MetadataColumns.ROW_POSITION, - MetadataColumns.IS_DELETED - ); + private static final Schema DATA_SCHEMA = + new Schema( + required(100, "id", Types.LongType.get()), required(101, "data", Types.StringType.get())); + + private static final Schema PROJECTION_SCHEMA = + new Schema( + required(100, "id", Types.LongType.get()), + required(101, "data", Types.StringType.get()), + MetadataColumns.ROW_POSITION, + MetadataColumns.IS_DELETED); private static final int NUM_ROWS = 1000; private static final List DATA_ROWS; @@ -99,11 +97,10 @@ public class TestSparkOrcReadMetadataColumns { @Parameterized.Parameters(name = "vectorized = {0}") public static Object[] parameters() { - return new Object[] { false, true }; + return new Object[] {false, true}; } - @Rule - public TemporaryFolder temp = new TemporaryFolder(); + @Rule public TemporaryFolder temp = new TemporaryFolder(); private boolean vectorized; private File testFile; @@ -117,14 +114,15 @@ public void writeFile() throws IOException { testFile = temp.newFile(); Assert.assertTrue("Delete should succeed", testFile.delete()); - try (FileAppender writer = ORC.write(Files.localOutput(testFile)) - .createWriterFunc(SparkOrcWriter::new) - .schema(DATA_SCHEMA) - // write in such a way that the file contains 10 stripes each with 100 rows - .set("iceberg.orc.vectorbatch.size", "100") - .set(OrcConf.ROWS_BETWEEN_CHECKS.getAttribute(), "100") - .set(OrcConf.STRIPE_SIZE.getAttribute(), "1") - .build()) { + try (FileAppender writer = + ORC.write(Files.localOutput(testFile)) + .createWriterFunc(SparkOrcWriter::new) + .schema(DATA_SCHEMA) + // write in such a way that the file contains 10 stripes each with 100 rows + .set("iceberg.orc.vectorbatch.size", "100") + .set(OrcConf.ROWS_BETWEEN_CHECKS.getAttribute(), "100") + .set(OrcConf.STRIPE_SIZE.getAttribute(), "1") + .build()) { writer.addAll(DATA_ROWS); } } @@ -136,41 +134,54 @@ public void testReadRowNumbers() throws IOException { @Test public void testReadRowNumbersWithFilter() throws IOException { - readAndValidate(Expressions.greaterThanOrEqual("id", 500), null, null, EXPECTED_ROWS.subList(500, 1000)); + readAndValidate( + Expressions.greaterThanOrEqual("id", 500), null, null, EXPECTED_ROWS.subList(500, 1000)); } @Test public void testReadRowNumbersWithSplits() throws IOException { Reader reader; try { - OrcFile.ReaderOptions readerOptions = OrcFile.readerOptions(new Configuration()).useUTCTimestamp(true); - reader = OrcFile.createReader(new Path(testFile.toString()), readerOptions); + OrcFile.ReaderOptions readerOptions = + OrcFile.readerOptions(new Configuration()).useUTCTimestamp(true); + reader = OrcFile.createReader(new Path(testFile.toString()), readerOptions); } catch (IOException ioe) { throw new RuntimeIOException(ioe, "Failed to open file: %s", testFile); } - List 
splitOffsets = reader.getStripes().stream().map(StripeInformation::getOffset) - .collect(Collectors.toList()); - List splitLengths = reader.getStripes().stream().map(StripeInformation::getLength) - .collect(Collectors.toList()); + List splitOffsets = + reader.getStripes().stream().map(StripeInformation::getOffset).collect(Collectors.toList()); + List splitLengths = + reader.getStripes().stream().map(StripeInformation::getLength).collect(Collectors.toList()); for (int i = 0; i < 10; i++) { - readAndValidate(null, splitOffsets.get(i), splitLengths.get(i), EXPECTED_ROWS.subList(i * 100, (i + 1) * 100)); + readAndValidate( + null, + splitOffsets.get(i), + splitLengths.get(i), + EXPECTED_ROWS.subList(i * 100, (i + 1) * 100)); } } - private void readAndValidate(Expression filter, Long splitStart, Long splitLength, List expected) + private void readAndValidate( + Expression filter, Long splitStart, Long splitLength, List expected) throws IOException { - Schema projectionWithoutMetadataFields = TypeUtil.selectNot(PROJECTION_SCHEMA, MetadataColumns.metadataFieldIds()); + Schema projectionWithoutMetadataFields = + TypeUtil.selectNot(PROJECTION_SCHEMA, MetadataColumns.metadataFieldIds()); CloseableIterable reader = null; try { - ORC.ReadBuilder builder = ORC.read(Files.localInput(testFile)) - .project(projectionWithoutMetadataFields); + ORC.ReadBuilder builder = + ORC.read(Files.localInput(testFile)).project(projectionWithoutMetadataFields); if (vectorized) { - builder = builder.createBatchedReaderFunc(readOrcSchema -> - VectorizedSparkOrcReaders.buildReader(PROJECTION_SCHEMA, readOrcSchema, ImmutableMap.of())); + builder = + builder.createBatchedReaderFunc( + readOrcSchema -> + VectorizedSparkOrcReaders.buildReader( + PROJECTION_SCHEMA, readOrcSchema, ImmutableMap.of())); } else { - builder = builder.createReaderFunc(readOrcSchema -> new SparkOrcReader(PROJECTION_SCHEMA, readOrcSchema)); + builder = + builder.createReaderFunc( + readOrcSchema -> new SparkOrcReader(PROJECTION_SCHEMA, readOrcSchema)); } if (filter != null) { diff --git a/spark/v3.1/spark/src/test/java/org/apache/iceberg/spark/data/TestSparkOrcReader.java b/spark/v3.1/spark/src/test/java/org/apache/iceberg/spark/data/TestSparkOrcReader.java index 5042d1cc1338..b23fe729a187 100644 --- a/spark/v3.1/spark/src/test/java/org/apache/iceberg/spark/data/TestSparkOrcReader.java +++ b/spark/v3.1/spark/src/test/java/org/apache/iceberg/spark/data/TestSparkOrcReader.java @@ -16,9 +16,11 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.spark.data; +import static org.apache.iceberg.spark.data.TestHelpers.assertEquals; +import static org.apache.iceberg.types.Types.NestedField.required; + import java.io.File; import java.io.IOException; import java.util.Collections; @@ -38,45 +40,44 @@ import org.junit.Assert; import org.junit.Test; -import static org.apache.iceberg.spark.data.TestHelpers.assertEquals; -import static org.apache.iceberg.types.Types.NestedField.required; - public class TestSparkOrcReader extends AvroDataTest { @Override protected void writeAndValidate(Schema schema) throws IOException { - final Iterable expected = RandomData - .generateSpark(schema, 100, 0L); + final Iterable expected = RandomData.generateSpark(schema, 100, 0L); writeAndValidateRecords(schema, expected); } @Test public void writeAndValidateRepeatingRecords() throws IOException { - Schema structSchema = new Schema( - required(100, "id", Types.LongType.get()), - required(101, "data", Types.StringType.get()) - ); - List expectedRepeating = Collections.nCopies(100, - RandomData.generateSpark(structSchema, 1, 0L).iterator().next()); + Schema structSchema = + new Schema( + required(100, "id", Types.LongType.get()), + required(101, "data", Types.StringType.get())); + List expectedRepeating = + Collections.nCopies(100, RandomData.generateSpark(structSchema, 1, 0L).iterator().next()); writeAndValidateRecords(structSchema, expectedRepeating); } - private void writeAndValidateRecords(Schema schema, Iterable expected) throws IOException { + private void writeAndValidateRecords(Schema schema, Iterable expected) + throws IOException { final File testFile = temp.newFile(); Assert.assertTrue("Delete should succeed", testFile.delete()); - try (FileAppender writer = ORC.write(Files.localOutput(testFile)) - .createWriterFunc(SparkOrcWriter::new) - .schema(schema) - .build()) { + try (FileAppender writer = + ORC.write(Files.localOutput(testFile)) + .createWriterFunc(SparkOrcWriter::new) + .schema(schema) + .build()) { writer.addAll(expected); } - try (CloseableIterable reader = ORC.read(Files.localInput(testFile)) - .project(schema) - .createReaderFunc(readOrcSchema -> new SparkOrcReader(schema, readOrcSchema)) - .build()) { + try (CloseableIterable reader = + ORC.read(Files.localInput(testFile)) + .project(schema) + .createReaderFunc(readOrcSchema -> new SparkOrcReader(schema, readOrcSchema)) + .build()) { final Iterator actualRows = reader.iterator(); final Iterator expectedRows = expected.iterator(); while (expectedRows.hasNext()) { @@ -86,11 +87,13 @@ private void writeAndValidateRecords(Schema schema, Iterable expect Assert.assertFalse("Should not have extra rows", actualRows.hasNext()); } - try (CloseableIterable reader = ORC.read(Files.localInput(testFile)) - .project(schema) - .createBatchedReaderFunc(readOrcSchema -> - VectorizedSparkOrcReaders.buildReader(schema, readOrcSchema, ImmutableMap.of())) - .build()) { + try (CloseableIterable reader = + ORC.read(Files.localInput(testFile)) + .project(schema) + .createBatchedReaderFunc( + readOrcSchema -> + VectorizedSparkOrcReaders.buildReader(schema, readOrcSchema, ImmutableMap.of())) + .build()) { final Iterator actualRows = batchesToRows(reader.iterator()); final Iterator expectedRows = expected.iterator(); while (expectedRows.hasNext()) { diff --git a/spark/v3.1/spark/src/test/java/org/apache/iceberg/spark/data/TestSparkParquetReadMetadataColumns.java b/spark/v3.1/spark/src/test/java/org/apache/iceberg/spark/data/TestSparkParquetReadMetadataColumns.java index 
a8a6313dbfaa..929d08f2cdb6 100644 --- a/spark/v3.1/spark/src/test/java/org/apache/iceberg/spark/data/TestSparkParquetReadMetadataColumns.java +++ b/spark/v3.1/spark/src/test/java/org/apache/iceberg/spark/data/TestSparkParquetReadMetadataColumns.java @@ -16,9 +16,10 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.data; +import static org.apache.iceberg.types.Types.NestedField.required; + import java.io.File; import java.io.IOException; import java.util.Iterator; @@ -58,20 +59,17 @@ import org.junit.runner.RunWith; import org.junit.runners.Parameterized; -import static org.apache.iceberg.types.Types.NestedField.required; - @RunWith(Parameterized.class) public class TestSparkParquetReadMetadataColumns { - private static final Schema DATA_SCHEMA = new Schema( - required(100, "id", Types.LongType.get()), - required(101, "data", Types.StringType.get()) - ); + private static final Schema DATA_SCHEMA = + new Schema( + required(100, "id", Types.LongType.get()), required(101, "data", Types.StringType.get())); - private static final Schema PROJECTION_SCHEMA = new Schema( - required(100, "id", Types.LongType.get()), - required(101, "data", Types.StringType.get()), - MetadataColumns.ROW_POSITION - ); + private static final Schema PROJECTION_SCHEMA = + new Schema( + required(100, "id", Types.LongType.get()), + required(101, "data", Types.StringType.get()), + MetadataColumns.ROW_POSITION); private static final int NUM_ROWS = 1000; private static final List DATA_ROWS; @@ -107,16 +105,12 @@ public class TestSparkParquetReadMetadataColumns { } } - @Parameterized.Parameters(name = "vectorized = {0}") + @Parameterized.Parameters(name = "vectorized = {0}") public static Object[][] parameters() { - return new Object[][] { - new Object[] { false }, - new Object[] { true } - }; + return new Object[][] {new Object[] {false}, new Object[] {true}}; } - @Rule - public TemporaryFolder temp = new TemporaryFolder(); + @Rule public TemporaryFolder temp = new TemporaryFolder(); private final boolean vectorized; private File testFile; @@ -133,28 +127,32 @@ public void writeFile() throws IOException { testFile = temp.newFile(); Assert.assertTrue("Delete should succeed", testFile.delete()); - ParquetFileWriter parquetFileWriter = new ParquetFileWriter( - conf, - ParquetSchemaUtil.convert(DATA_SCHEMA, "testSchema"), - new Path(testFile.getAbsolutePath()) - ); + ParquetFileWriter parquetFileWriter = + new ParquetFileWriter( + conf, + ParquetSchemaUtil.convert(DATA_SCHEMA, "testSchema"), + new Path(testFile.getAbsolutePath())); parquetFileWriter.start(); for (int i = 0; i < NUM_ROW_GROUPS; i += 1) { File split = temp.newFile(); Assert.assertTrue("Delete should succeed", split.delete()); fileSplits.add(new Path(split.getAbsolutePath())); - try (FileAppender writer = Parquet.write(Files.localOutput(split)) - .createWriterFunc(msgType -> SparkParquetWriters.buildWriter(struct, msgType)) - .schema(DATA_SCHEMA) - .overwrite() - .build()) { + try (FileAppender writer = + Parquet.write(Files.localOutput(split)) + .createWriterFunc(msgType -> SparkParquetWriters.buildWriter(struct, msgType)) + .schema(DATA_SCHEMA) + .overwrite() + .build()) { writer.addAll(DATA_ROWS.subList(i * ROWS_PER_SPLIT, (i + 1) * ROWS_PER_SPLIT)); } - parquetFileWriter.appendFile(HadoopInputFile.fromPath(new Path(split.getAbsolutePath()), conf)); + parquetFileWriter.appendFile( + HadoopInputFile.fromPath(new Path(split.getAbsolutePath()), conf)); } - parquetFileWriter - 
.end(ParquetFileWriter.mergeMetadataFiles(fileSplits, conf).getFileMetaData().getKeyValueMetaData()); + parquetFileWriter.end( + ParquetFileWriter.mergeMetadataFiles(fileSplits, conf) + .getFileMetaData() + .getKeyValueMetaData()); } @Test @@ -167,7 +165,8 @@ public void testReadRowNumbersWithFilter() throws IOException { // current iceberg supports row group filter. for (int i = 1; i < 5; i += 1) { readAndValidate( - Expressions.and(Expressions.lessThan("id", NUM_ROWS / 2), + Expressions.and( + Expressions.lessThan("id", NUM_ROWS / 2), Expressions.greaterThanOrEqual("id", i * ROWS_PER_SPLIT)), null, null, @@ -177,28 +176,36 @@ public void testReadRowNumbersWithFilter() throws IOException { @Test public void testReadRowNumbersWithSplits() throws IOException { - ParquetFileReader fileReader = new ParquetFileReader( - HadoopInputFile.fromPath(new Path(testFile.getAbsolutePath()), new Configuration()), - ParquetReadOptions.builder().build()); + ParquetFileReader fileReader = + new ParquetFileReader( + HadoopInputFile.fromPath(new Path(testFile.getAbsolutePath()), new Configuration()), + ParquetReadOptions.builder().build()); List rowGroups = fileReader.getRowGroups(); for (int i = 0; i < NUM_ROW_GROUPS; i += 1) { - readAndValidate(null, + readAndValidate( + null, rowGroups.get(i).getColumns().get(0).getStartingPos(), rowGroups.get(i).getCompressedSize(), EXPECTED_ROWS.subList(i * ROWS_PER_SPLIT, (i + 1) * ROWS_PER_SPLIT)); } } - private void readAndValidate(Expression filter, Long splitStart, Long splitLength, List expected) + private void readAndValidate( + Expression filter, Long splitStart, Long splitLength, List expected) throws IOException { - Parquet.ReadBuilder builder = Parquet.read(Files.localInput(testFile)).project(PROJECTION_SCHEMA); + Parquet.ReadBuilder builder = + Parquet.read(Files.localInput(testFile)).project(PROJECTION_SCHEMA); if (vectorized) { - builder.createBatchedReaderFunc(fileSchema -> VectorizedSparkParquetReaders.buildReader(PROJECTION_SCHEMA, - fileSchema, NullCheckingForGet.NULL_CHECKING_ENABLED)); + builder.createBatchedReaderFunc( + fileSchema -> + VectorizedSparkParquetReaders.buildReader( + PROJECTION_SCHEMA, fileSchema, NullCheckingForGet.NULL_CHECKING_ENABLED)); builder.recordsPerBatch(RECORDS_PER_BATCH); } else { - builder = builder.createReaderFunc(msgType -> SparkParquetReaders.buildReader(PROJECTION_SCHEMA, msgType)); + builder = + builder.createReaderFunc( + msgType -> SparkParquetReaders.buildReader(PROJECTION_SCHEMA, msgType)); } if (filter != null) { @@ -209,7 +216,8 @@ private void readAndValidate(Expression filter, Long splitStart, Long splitLengt builder = builder.split(splitStart, splitLength); } - try (CloseableIterable reader = vectorized ? batchesToRows(builder.build()) : builder.build()) { + try (CloseableIterable reader = + vectorized ? batchesToRows(builder.build()) : builder.build()) { final Iterator actualRows = reader.iterator(); for (InternalRow internalRow : expected) { diff --git a/spark/v3.1/spark/src/test/java/org/apache/iceberg/spark/data/TestSparkParquetReader.java b/spark/v3.1/spark/src/test/java/org/apache/iceberg/spark/data/TestSparkParquetReader.java index 03d234c1eca5..d4b7443e2e20 100644 --- a/spark/v3.1/spark/src/test/java/org/apache/iceberg/spark/data/TestSparkParquetReader.java +++ b/spark/v3.1/spark/src/test/java/org/apache/iceberg/spark/data/TestSparkParquetReader.java @@ -16,9 +16,11 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.spark.data; +import static org.apache.iceberg.spark.data.TestHelpers.assertEqualsUnsafe; +import static org.apache.iceberg.types.Types.NestedField.required; + import java.io.File; import java.io.IOException; import java.util.Iterator; @@ -60,31 +62,31 @@ import org.junit.Assume; import org.junit.Test; -import static org.apache.iceberg.spark.data.TestHelpers.assertEqualsUnsafe; -import static org.apache.iceberg.types.Types.NestedField.required; - public class TestSparkParquetReader extends AvroDataTest { @Override protected void writeAndValidate(Schema schema) throws IOException { - Assume.assumeTrue("Parquet Avro cannot write non-string map keys", null == TypeUtil.find(schema, - type -> type.isMapType() && type.asMapType().keyType() != Types.StringType.get())); + Assume.assumeTrue( + "Parquet Avro cannot write non-string map keys", + null + == TypeUtil.find( + schema, + type -> type.isMapType() && type.asMapType().keyType() != Types.StringType.get())); List expected = RandomData.generateList(schema, 100, 0L); File testFile = temp.newFile(); Assert.assertTrue("Delete should succeed", testFile.delete()); - try (FileAppender writer = Parquet.write(Files.localOutput(testFile)) - .schema(schema) - .named("test") - .build()) { + try (FileAppender writer = + Parquet.write(Files.localOutput(testFile)).schema(schema).named("test").build()) { writer.addAll(expected); } - try (CloseableIterable reader = Parquet.read(Files.localInput(testFile)) - .project(schema) - .createReaderFunc(type -> SparkParquetReaders.buildReader(schema, type)) - .build()) { + try (CloseableIterable reader = + Parquet.read(Files.localInput(testFile)) + .project(schema) + .createReaderFunc(type -> SparkParquetReaders.buildReader(schema, type)) + .build()) { Iterator rows = reader.iterator(); for (int i = 0; i < expected.size(); i += 1) { Assert.assertTrue("Should have expected number of rows", rows.hasNext()); @@ -129,7 +131,8 @@ protected Table tableFromInputFile(InputFile inputFile, Schema schema) throws IO @Test public void testInt96TimestampProducedBySparkIsReadCorrectly() throws IOException { - String outputFilePath = String.format("%s/%s", temp.getRoot().getAbsolutePath(), "parquet_int96.parquet"); + String outputFilePath = + String.format("%s/%s", temp.getRoot().getAbsolutePath(), "parquet_int96.parquet"); HadoopOutputFile outputFile = HadoopOutputFile.fromPath( new org.apache.hadoop.fs.Path(outputFilePath), new Configuration()); @@ -137,7 +140,7 @@ public void testInt96TimestampProducedBySparkIsReadCorrectly() throws IOExceptio StructType sparkSchema = new StructType( new StructField[] { - new StructField("ts", DataTypes.TimestampType, true, Metadata.empty()) + new StructField("ts", DataTypes.TimestampType, true, Metadata.empty()) }); List rows = Lists.newArrayList(RandomData.generateSpark(schema, 10, 0L)); @@ -164,14 +167,14 @@ public void testInt96TimestampProducedBySparkIsReadCorrectly() throws IOExceptio Assert.assertEquals(rows.size(), tableRecords.size()); - for (int i = 0; i < tableRecords.size(); i++) { + for (int i = 0; i < tableRecords.size(); i++) { GenericsHelpers.assertEqualsUnsafe(schema.asStruct(), tableRecords.get(i), rows.get(i)); } } /** - * Native Spark ParquetWriter.Builder implementation so that we can write timestamps using Spark's native - * ParquetWriteSupport. + * Native Spark ParquetWriter.Builder implementation so that we can write timestamps using Spark's + * native ParquetWriteSupport. 
*/ private static class NativeSparkWriterBuilder extends ParquetWriter.Builder { diff --git a/spark/v3.1/spark/src/test/java/org/apache/iceberg/spark/data/TestSparkParquetWriter.java b/spark/v3.1/spark/src/test/java/org/apache/iceberg/spark/data/TestSparkParquetWriter.java index c75a87abc45c..261fb8838aa4 100644 --- a/spark/v3.1/spark/src/test/java/org/apache/iceberg/spark/data/TestSparkParquetWriter.java +++ b/spark/v3.1/spark/src/test/java/org/apache/iceberg/spark/data/TestSparkParquetWriter.java @@ -16,9 +16,11 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.data; +import static org.apache.iceberg.types.Types.NestedField.optional; +import static org.apache.iceberg.types.Types.NestedField.required; + import java.io.File; import java.io.IOException; import java.util.Iterator; @@ -35,39 +37,51 @@ import org.junit.Test; import org.junit.rules.TemporaryFolder; -import static org.apache.iceberg.types.Types.NestedField.optional; -import static org.apache.iceberg.types.Types.NestedField.required; - public class TestSparkParquetWriter { - @Rule - public TemporaryFolder temp = new TemporaryFolder(); + @Rule public TemporaryFolder temp = new TemporaryFolder(); - private static final Schema COMPLEX_SCHEMA = new Schema( - required(1, "roots", Types.LongType.get()), - optional(3, "lime", Types.ListType.ofRequired(4, Types.DoubleType.get())), - required(5, "strict", Types.StructType.of( - required(9, "tangerine", Types.StringType.get()), - optional(6, "hopeful", Types.StructType.of( - required(7, "steel", Types.FloatType.get()), - required(8, "lantern", Types.DateType.get()) - )), - optional(10, "vehement", Types.LongType.get()) - )), - optional(11, "metamorphosis", Types.MapType.ofRequired(12, 13, - Types.StringType.get(), Types.TimestampType.withZone())), - required(14, "winter", Types.ListType.ofOptional(15, Types.StructType.of( - optional(16, "beet", Types.DoubleType.get()), - required(17, "stamp", Types.FloatType.get()), - optional(18, "wheeze", Types.StringType.get()) - ))), - optional(19, "renovate", Types.MapType.ofRequired(20, 21, - Types.StringType.get(), Types.StructType.of( - optional(22, "jumpy", Types.DoubleType.get()), - required(23, "koala", Types.IntegerType.get()), - required(24, "couch rope", Types.IntegerType.get()) - ))), - optional(2, "slide", Types.StringType.get()) - ); + private static final Schema COMPLEX_SCHEMA = + new Schema( + required(1, "roots", Types.LongType.get()), + optional(3, "lime", Types.ListType.ofRequired(4, Types.DoubleType.get())), + required( + 5, + "strict", + Types.StructType.of( + required(9, "tangerine", Types.StringType.get()), + optional( + 6, + "hopeful", + Types.StructType.of( + required(7, "steel", Types.FloatType.get()), + required(8, "lantern", Types.DateType.get()))), + optional(10, "vehement", Types.LongType.get()))), + optional( + 11, + "metamorphosis", + Types.MapType.ofRequired( + 12, 13, Types.StringType.get(), Types.TimestampType.withZone())), + required( + 14, + "winter", + Types.ListType.ofOptional( + 15, + Types.StructType.of( + optional(16, "beet", Types.DoubleType.get()), + required(17, "stamp", Types.FloatType.get()), + optional(18, "wheeze", Types.StringType.get())))), + optional( + 19, + "renovate", + Types.MapType.ofRequired( + 20, + 21, + Types.StringType.get(), + Types.StructType.of( + optional(22, "jumpy", Types.DoubleType.get()), + required(23, "koala", Types.IntegerType.get()), + required(24, "couch rope", Types.IntegerType.get())))), + optional(2, 
"slide", Types.StringType.get())); @Test public void testCorrectness() throws IOException { @@ -77,17 +91,22 @@ public void testCorrectness() throws IOException { File testFile = temp.newFile(); Assert.assertTrue("Delete should succeed", testFile.delete()); - try (FileAppender writer = Parquet.write(Files.localOutput(testFile)) - .schema(COMPLEX_SCHEMA) - .createWriterFunc(msgType -> SparkParquetWriters.buildWriter(SparkSchemaUtil.convert(COMPLEX_SCHEMA), msgType)) - .build()) { + try (FileAppender writer = + Parquet.write(Files.localOutput(testFile)) + .schema(COMPLEX_SCHEMA) + .createWriterFunc( + msgType -> + SparkParquetWriters.buildWriter( + SparkSchemaUtil.convert(COMPLEX_SCHEMA), msgType)) + .build()) { writer.addAll(records); } - try (CloseableIterable reader = Parquet.read(Files.localInput(testFile)) - .project(COMPLEX_SCHEMA) - .createReaderFunc(type -> SparkParquetReaders.buildReader(COMPLEX_SCHEMA, type)) - .build()) { + try (CloseableIterable reader = + Parquet.read(Files.localInput(testFile)) + .project(COMPLEX_SCHEMA) + .createReaderFunc(type -> SparkParquetReaders.buildReader(COMPLEX_SCHEMA, type)) + .build()) { Iterator expected = records.iterator(); Iterator rows = reader.iterator(); for (int i = 0; i < numRows; i += 1) { diff --git a/spark/v3.1/spark/src/test/java/org/apache/iceberg/spark/data/TestSparkRecordOrcReaderWriter.java b/spark/v3.1/spark/src/test/java/org/apache/iceberg/spark/data/TestSparkRecordOrcReaderWriter.java index 1e7430d16df7..d10e7f5a19e3 100644 --- a/spark/v3.1/spark/src/test/java/org/apache/iceberg/spark/data/TestSparkRecordOrcReaderWriter.java +++ b/spark/v3.1/spark/src/test/java/org/apache/iceberg/spark/data/TestSparkRecordOrcReaderWriter.java @@ -16,9 +16,10 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.data; +import static org.apache.iceberg.types.Types.NestedField.required; + import java.io.File; import java.io.IOException; import java.math.BigDecimal; @@ -40,8 +41,6 @@ import org.junit.Assert; import org.junit.Test; -import static org.apache.iceberg.types.Types.NestedField.required; - public class TestSparkRecordOrcReaderWriter extends AvroDataTest { private static final int NUM_RECORDS = 200; @@ -50,19 +49,21 @@ private void writeAndValidate(Schema schema, List expectedRecords) throw Assert.assertTrue("Delete should succeed", originalFile.delete()); // Write few generic records into the original test file. - try (FileAppender writer = ORC.write(Files.localOutput(originalFile)) - .createWriterFunc(GenericOrcWriter::buildWriter) - .schema(schema) - .build()) { + try (FileAppender writer = + ORC.write(Files.localOutput(originalFile)) + .createWriterFunc(GenericOrcWriter::buildWriter) + .schema(schema) + .build()) { writer.addAll(expectedRecords); } // Read into spark InternalRow from the original test file. 
List internalRows = Lists.newArrayList(); - try (CloseableIterable reader = ORC.read(Files.localInput(originalFile)) - .project(schema) - .createReaderFunc(readOrcSchema -> new SparkOrcReader(schema, readOrcSchema)) - .build()) { + try (CloseableIterable reader = + ORC.read(Files.localInput(originalFile)) + .project(schema) + .createReaderFunc(readOrcSchema -> new SparkOrcReader(schema, readOrcSchema)) + .build()) { reader.forEach(internalRows::add); assertEqualsUnsafe(schema.asStruct(), expectedRecords, reader, expectedRecords.size()); } @@ -71,26 +72,29 @@ private void writeAndValidate(Schema schema, List expectedRecords) throw Assert.assertTrue("Delete should succeed", anotherFile.delete()); // Write those spark InternalRows into a new file again. - try (FileAppender writer = ORC.write(Files.localOutput(anotherFile)) - .createWriterFunc(SparkOrcWriter::new) - .schema(schema) - .build()) { + try (FileAppender writer = + ORC.write(Files.localOutput(anotherFile)) + .createWriterFunc(SparkOrcWriter::new) + .schema(schema) + .build()) { writer.addAll(internalRows); } // Check whether the InternalRows are expected records. - try (CloseableIterable reader = ORC.read(Files.localInput(anotherFile)) - .project(schema) - .createReaderFunc(readOrcSchema -> new SparkOrcReader(schema, readOrcSchema)) - .build()) { + try (CloseableIterable reader = + ORC.read(Files.localInput(anotherFile)) + .project(schema) + .createReaderFunc(readOrcSchema -> new SparkOrcReader(schema, readOrcSchema)) + .build()) { assertEqualsUnsafe(schema.asStruct(), expectedRecords, reader, expectedRecords.size()); } // Read into iceberg GenericRecord and check again. - try (CloseableIterable reader = ORC.read(Files.localInput(anotherFile)) - .createReaderFunc(typeDesc -> GenericOrcReader.buildReader(schema, typeDesc)) - .project(schema) - .build()) { + try (CloseableIterable reader = + ORC.read(Files.localInput(anotherFile)) + .createReaderFunc(typeDesc -> GenericOrcReader.buildReader(schema, typeDesc)) + .project(schema) + .build()) { assertRecordEquals(expectedRecords, reader, expectedRecords.size()); } } @@ -103,11 +107,11 @@ protected void writeAndValidate(Schema schema) throws IOException { @Test public void testDecimalWithTrailingZero() throws IOException { - Schema schema = new Schema( - required(1, "d1", Types.DecimalType.of(10, 2)), - required(2, "d2", Types.DecimalType.of(20, 5)), - required(3, "d3", Types.DecimalType.of(38, 20)) - ); + Schema schema = + new Schema( + required(1, "d1", Types.DecimalType.of(10, 2)), + required(2, "d2", Types.DecimalType.of(20, 5)), + required(3, "d3", Types.DecimalType.of(38, 20))); List expected = Lists.newArrayList(); @@ -121,7 +125,8 @@ public void testDecimalWithTrailingZero() throws IOException { writeAndValidate(schema, expected); } - private static void assertRecordEquals(Iterable expected, Iterable actual, int size) { + private static void assertRecordEquals( + Iterable expected, Iterable actual, int size) { Iterator expectedIter = expected.iterator(); Iterator actualIter = actual.iterator(); for (int i = 0; i < size; i += 1) { @@ -133,8 +138,8 @@ private static void assertRecordEquals(Iterable expected, Iterable expected, - Iterable actual, int size) { + private static void assertEqualsUnsafe( + Types.StructType struct, Iterable expected, Iterable actual, int size) { Iterator expectedIter = expected.iterator(); Iterator actualIter = actual.iterator(); for (int i = 0; i < size; i += 1) { diff --git 
a/spark/v3.1/spark/src/test/java/org/apache/iceberg/spark/data/parquet/vectorized/TestParquetDictionaryEncodedVectorizedReads.java b/spark/v3.1/spark/src/test/java/org/apache/iceberg/spark/data/parquet/vectorized/TestParquetDictionaryEncodedVectorizedReads.java index f292df0c3bf8..756f49a2aad6 100644 --- a/spark/v3.1/spark/src/test/java/org/apache/iceberg/spark/data/parquet/vectorized/TestParquetDictionaryEncodedVectorizedReads.java +++ b/spark/v3.1/spark/src/test/java/org/apache/iceberg/spark/data/parquet/vectorized/TestParquetDictionaryEncodedVectorizedReads.java @@ -16,9 +16,10 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.data.parquet.vectorized; +import static org.apache.iceberg.TableProperties.PARQUET_ROW_GROUP_SIZE_BYTES_DEFAULT; + import java.io.File; import java.io.IOException; import org.apache.avro.generic.GenericData; @@ -35,42 +36,42 @@ import org.junit.Ignore; import org.junit.Test; -import static org.apache.iceberg.TableProperties.PARQUET_ROW_GROUP_SIZE_BYTES_DEFAULT; - public class TestParquetDictionaryEncodedVectorizedReads extends TestParquetVectorizedReads { @Override - Iterable generateData(Schema schema, int numRecords, long seed, float nullPercentage, - Function transform) { - Iterable data = RandomData.generateDictionaryEncodableData(schema, numRecords, seed, nullPercentage); + Iterable generateData( + Schema schema, + int numRecords, + long seed, + float nullPercentage, + Function transform) { + Iterable data = + RandomData.generateDictionaryEncodableData(schema, numRecords, seed, nullPercentage); return transform == IDENTITY ? data : Iterables.transform(data, transform); } @Test @Override @Ignore // Ignored since this code path is already tested in TestParquetVectorizedReads - public void testVectorizedReadsWithNewContainers() throws IOException { - - } + public void testVectorizedReadsWithNewContainers() throws IOException {} @Test public void testMixedDictionaryNonDictionaryReads() throws IOException { Schema schema = new Schema(SUPPORTED_PRIMITIVES.fields()); File dictionaryEncodedFile = temp.newFile(); Assert.assertTrue("Delete should succeed", dictionaryEncodedFile.delete()); - Iterable dictionaryEncodableData = RandomData.generateDictionaryEncodableData( - schema, - 10000, - 0L, - RandomData.DEFAULT_NULL_PERCENTAGE); - try (FileAppender writer = getParquetWriter(schema, dictionaryEncodedFile)) { + Iterable dictionaryEncodableData = + RandomData.generateDictionaryEncodableData( + schema, 10000, 0L, RandomData.DEFAULT_NULL_PERCENTAGE); + try (FileAppender writer = + getParquetWriter(schema, dictionaryEncodedFile)) { writer.addAll(dictionaryEncodableData); } File plainEncodingFile = temp.newFile(); Assert.assertTrue("Delete should succeed", plainEncodingFile.delete()); - Iterable nonDictionaryData = RandomData.generate(schema, 10000, 0L, - RandomData.DEFAULT_NULL_PERCENTAGE); + Iterable nonDictionaryData = + RandomData.generate(schema, 10000, 0L, RandomData.DEFAULT_NULL_PERCENTAGE); try (FileAppender writer = getParquetWriter(schema, plainEncodingFile)) { writer.addAll(nonDictionaryData); } @@ -78,15 +79,19 @@ public void testMixedDictionaryNonDictionaryReads() throws IOException { int rowGroupSize = PARQUET_ROW_GROUP_SIZE_BYTES_DEFAULT; File mixedFile = temp.newFile(); Assert.assertTrue("Delete should succeed", mixedFile.delete()); - Parquet.concat(ImmutableList.of(dictionaryEncodedFile, plainEncodingFile, dictionaryEncodedFile), - mixedFile, rowGroupSize, schema, ImmutableMap.of()); + 
Parquet.concat( + ImmutableList.of(dictionaryEncodedFile, plainEncodingFile, dictionaryEncodedFile), + mixedFile, + rowGroupSize, + schema, + ImmutableMap.of()); assertRecordsMatch( - schema, - 30000, - FluentIterable.concat(dictionaryEncodableData, nonDictionaryData, dictionaryEncodableData), - mixedFile, - false, - true, - BATCH_SIZE); + schema, + 30000, + FluentIterable.concat(dictionaryEncodableData, nonDictionaryData, dictionaryEncodableData), + mixedFile, + false, + true, + BATCH_SIZE); } } diff --git a/spark/v3.1/spark/src/test/java/org/apache/iceberg/spark/data/parquet/vectorized/TestParquetDictionaryFallbackToPlainEncodingVectorizedReads.java b/spark/v3.1/spark/src/test/java/org/apache/iceberg/spark/data/parquet/vectorized/TestParquetDictionaryFallbackToPlainEncodingVectorizedReads.java index 5ceac3fdb76e..42ea34936b5f 100644 --- a/spark/v3.1/spark/src/test/java/org/apache/iceberg/spark/data/parquet/vectorized/TestParquetDictionaryFallbackToPlainEncodingVectorizedReads.java +++ b/spark/v3.1/spark/src/test/java/org/apache/iceberg/spark/data/parquet/vectorized/TestParquetDictionaryFallbackToPlainEncodingVectorizedReads.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.data.parquet.vectorized; import java.io.File; @@ -33,7 +32,8 @@ import org.junit.Ignore; import org.junit.Test; -public class TestParquetDictionaryFallbackToPlainEncodingVectorizedReads extends TestParquetVectorizedReads { +public class TestParquetDictionaryFallbackToPlainEncodingVectorizedReads + extends TestParquetVectorizedReads { private static final int NUM_ROWS = 1_000_000; @Override @@ -42,15 +42,20 @@ protected int getNumRows() { } @Override - Iterable generateData(Schema schema, int numRecords, long seed, float nullPercentage, - Function transform) { + Iterable generateData( + Schema schema, + int numRecords, + long seed, + float nullPercentage, + Function transform) { // TODO: take into account nullPercentage when generating fallback encoding data Iterable data = RandomData.generateFallbackData(schema, numRecords, seed, numRecords / 20); return transform == IDENTITY ? 
data : Iterables.transform(data, transform); } @Override - FileAppender getParquetWriter(Schema schema, File testFile) throws IOException { + FileAppender getParquetWriter(Schema schema, File testFile) + throws IOException { return Parquet.write(Files.localOutput(testFile)) .schema(schema) .named("test") @@ -61,14 +66,10 @@ FileAppender getParquetWriter(Schema schema, File testFile) @Test @Override @Ignore // Fallback encoding not triggered when data is mostly null - public void testMostlyNullsForOptionalFields() { - - } + public void testMostlyNullsForOptionalFields() {} @Test @Override @Ignore // Ignored since this code path is already tested in TestParquetVectorizedReads - public void testVectorizedReadsWithNewContainers() throws IOException { - - } + public void testVectorizedReadsWithNewContainers() throws IOException {} } diff --git a/spark/v3.1/spark/src/test/java/org/apache/iceberg/spark/data/parquet/vectorized/TestParquetVectorizedReads.java b/spark/v3.1/spark/src/test/java/org/apache/iceberg/spark/data/parquet/vectorized/TestParquetVectorizedReads.java index 48dcc94a5fce..8908a23fad8f 100644 --- a/spark/v3.1/spark/src/test/java/org/apache/iceberg/spark/data/parquet/vectorized/TestParquetVectorizedReads.java +++ b/spark/v3.1/spark/src/test/java/org/apache/iceberg/spark/data/parquet/vectorized/TestParquetVectorizedReads.java @@ -16,9 +16,11 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.data.parquet.vectorized; +import static org.apache.iceberg.types.Types.NestedField.optional; +import static org.apache.iceberg.types.Types.NestedField.required; + import java.io.File; import java.io.IOException; import java.util.Iterator; @@ -49,9 +51,6 @@ import org.junit.Ignore; import org.junit.Test; -import static org.apache.iceberg.types.Types.NestedField.optional; -import static org.apache.iceberg.types.Types.NestedField.required; - public class TestParquetVectorizedReads extends AvroDataTest { private static final int NUM_ROWS = 200_000; static final int BATCH_SIZE = 10_000; @@ -64,24 +63,44 @@ protected void writeAndValidate(Schema schema) throws IOException { } private void writeAndValidate( - Schema schema, int numRecords, long seed, float nullPercentage, - boolean setAndCheckArrowValidityVector, boolean reuseContainers) - throws IOException { - writeAndValidate(schema, numRecords, seed, nullPercentage, - setAndCheckArrowValidityVector, reuseContainers, BATCH_SIZE, IDENTITY); + Schema schema, + int numRecords, + long seed, + float nullPercentage, + boolean setAndCheckArrowValidityVector, + boolean reuseContainers) + throws IOException { + writeAndValidate( + schema, + numRecords, + seed, + nullPercentage, + setAndCheckArrowValidityVector, + reuseContainers, + BATCH_SIZE, + IDENTITY); } private void writeAndValidate( - Schema schema, int numRecords, long seed, float nullPercentage, - boolean setAndCheckArrowValidityVector, boolean reuseContainers, int batchSize, - Function transform) + Schema schema, + int numRecords, + long seed, + float nullPercentage, + boolean setAndCheckArrowValidityVector, + boolean reuseContainers, + int batchSize, + Function transform) throws IOException { // Write test data - Assume.assumeTrue("Parquet Avro cannot write non-string map keys", null == TypeUtil.find( - schema, - type -> type.isMapType() && type.asMapType().keyType() != Types.StringType.get())); + Assume.assumeTrue( + "Parquet Avro cannot write non-string map keys", + null + == TypeUtil.find( + schema, + type -> type.isMapType() 
&& type.asMapType().keyType() != Types.StringType.get())); - Iterable expected = generateData(schema, numRecords, seed, nullPercentage, transform); + Iterable expected = + generateData(schema, numRecords, seed, nullPercentage, transform); // write a test parquet file using iceberg writer File testFile = temp.newFile(); @@ -90,58 +109,74 @@ private void writeAndValidate( try (FileAppender writer = getParquetWriter(schema, testFile)) { writer.addAll(expected); } - assertRecordsMatch(schema, numRecords, expected, testFile, setAndCheckArrowValidityVector, - reuseContainers, batchSize); + assertRecordsMatch( + schema, + numRecords, + expected, + testFile, + setAndCheckArrowValidityVector, + reuseContainers, + batchSize); } protected int getNumRows() { return NUM_ROWS; } - Iterable generateData(Schema schema, int numRecords, long seed, float nullPercentage, - Function transform) { - Iterable data = RandomData.generate(schema, numRecords, seed, nullPercentage); + Iterable generateData( + Schema schema, + int numRecords, + long seed, + float nullPercentage, + Function transform) { + Iterable data = + RandomData.generate(schema, numRecords, seed, nullPercentage); return transform == IDENTITY ? data : Iterables.transform(data, transform); } - FileAppender getParquetWriter(Schema schema, File testFile) throws IOException { + FileAppender getParquetWriter(Schema schema, File testFile) + throws IOException { + return Parquet.write(Files.localOutput(testFile)).schema(schema).named("test").build(); + } + + FileAppender getParquetV2Writer(Schema schema, File testFile) + throws IOException { return Parquet.write(Files.localOutput(testFile)) .schema(schema) .named("test") + .writerVersion(ParquetProperties.WriterVersion.PARQUET_2_0) .build(); } - FileAppender getParquetV2Writer(Schema schema, File testFile) throws IOException { - return Parquet.write(Files.localOutput(testFile)) - .schema(schema) - .named("test") - .writerVersion(ParquetProperties.WriterVersion.PARQUET_2_0) - .build(); - } - void assertRecordsMatch( - Schema schema, int expectedSize, Iterable expected, File testFile, - boolean setAndCheckArrowValidityBuffer, boolean reuseContainers, int batchSize) + Schema schema, + int expectedSize, + Iterable expected, + File testFile, + boolean setAndCheckArrowValidityBuffer, + boolean reuseContainers, + int batchSize) throws IOException { - Parquet.ReadBuilder readBuilder = Parquet.read(Files.localInput(testFile)) - .project(schema) - .recordsPerBatch(batchSize) - .createBatchedReaderFunc(type -> VectorizedSparkParquetReaders.buildReader( - schema, - type, - setAndCheckArrowValidityBuffer)); + Parquet.ReadBuilder readBuilder = + Parquet.read(Files.localInput(testFile)) + .project(schema) + .recordsPerBatch(batchSize) + .createBatchedReaderFunc( + type -> + VectorizedSparkParquetReaders.buildReader( + schema, type, setAndCheckArrowValidityBuffer)); if (reuseContainers) { readBuilder.reuseContainers(); } - try (CloseableIterable batchReader = - readBuilder.build()) { + try (CloseableIterable batchReader = readBuilder.build()) { Iterator expectedIter = expected.iterator(); Iterator batches = batchReader.iterator(); int numRowsRead = 0; while (batches.hasNext()) { ColumnarBatch batch = batches.next(); numRowsRead += batch.numRows(); - TestHelpers.assertEqualsBatch(schema.asStruct(), expectedIter, batch, setAndCheckArrowValidityBuffer); + TestHelpers.assertEqualsBatch( + schema.asStruct(), expectedIter, batch, setAndCheckArrowValidityBuffer); } Assert.assertEquals(expectedSize, numRowsRead); } @@ -149,38 
+184,31 @@ void assertRecordsMatch( @Test @Ignore - public void testArray() { - } + public void testArray() {} @Test @Ignore - public void testArrayOfStructs() { - } + public void testArrayOfStructs() {} @Test @Ignore - public void testMap() { - } + public void testMap() {} @Test @Ignore - public void testNumericMapKey() { - } + public void testNumericMapKey() {} @Test @Ignore - public void testComplexMapKey() { - } + public void testComplexMapKey() {} @Test @Ignore - public void testMapOfStructs() { - } + public void testMapOfStructs() {} @Test @Ignore - public void testMixedTypes() { - } + public void testMixedTypes() {} @Test @Override @@ -189,13 +217,13 @@ public void testNestedStruct() { "Vectorized reads are not supported yet for struct fields", UnsupportedOperationException.class, "Vectorized reads are not supported yet for struct fields", - () -> VectorizedSparkParquetReaders.buildReader( - TypeUtil.assignIncreasingFreshIds(new Schema(required( - 1, - "struct", - SUPPORTED_PRIMITIVES))), - new MessageType("struct", new GroupType(Type.Repetition.OPTIONAL, "struct").withId(1)), - false)); + () -> + VectorizedSparkParquetReaders.buildReader( + TypeUtil.assignIncreasingFreshIds( + new Schema(required(1, "struct", SUPPORTED_PRIMITIVES))), + new MessageType( + "struct", new GroupType(Type.Repetition.OPTIONAL, "struct").withId(1)), + false)); } @Test @@ -211,27 +239,40 @@ public void testMostlyNullsForOptionalFields() throws IOException { @Test public void testSettingArrowValidityVector() throws IOException { - writeAndValidate(new Schema( - Lists.transform(SUPPORTED_PRIMITIVES.fields(), Types.NestedField::asOptional)), - getNumRows(), 0L, RandomData.DEFAULT_NULL_PERCENTAGE, true, true); + writeAndValidate( + new Schema(Lists.transform(SUPPORTED_PRIMITIVES.fields(), Types.NestedField::asOptional)), + getNumRows(), + 0L, + RandomData.DEFAULT_NULL_PERCENTAGE, + true, + true); } @Test public void testVectorizedReadsWithNewContainers() throws IOException { - writeAndValidate(TypeUtil.assignIncreasingFreshIds(new Schema(SUPPORTED_PRIMITIVES.fields())), - getNumRows(), 0L, RandomData.DEFAULT_NULL_PERCENTAGE, true, false); + writeAndValidate( + TypeUtil.assignIncreasingFreshIds(new Schema(SUPPORTED_PRIMITIVES.fields())), + getNumRows(), + 0L, + RandomData.DEFAULT_NULL_PERCENTAGE, + true, + false); } @Test public void testVectorizedReadsWithReallocatedArrowBuffers() throws IOException { // With a batch size of 2, 256 bytes are allocated in the VarCharVector. By adding strings of // length 512, the vector will need to be reallocated for storing the batch. 
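// --- Editor's sketch (not part of this PR): a hedged outline of the vectorized read path that
// assertRecordsMatch and the reallocation test above exercise. Rows come back as Spark
// ColumnarBatch instances; with a small recordsPerBatch and long string values the Arrow
// VarCharVector has to grow past its initial allocation, which is what the test provokes.
// Schema, file and batch size are assumed inputs; the builder calls mirror the diff.
import java.io.File;
import java.io.IOException;
import org.apache.iceberg.Files;
import org.apache.iceberg.Schema;
import org.apache.iceberg.io.CloseableIterable;
import org.apache.iceberg.parquet.Parquet;
import org.apache.iceberg.spark.data.vectorized.VectorizedSparkParquetReaders;
import org.apache.spark.sql.vectorized.ColumnarBatch;

class VectorizedReadSketch {
  static long countRows(Schema schema, File parquetFile, int batchSize) throws IOException {
    Parquet.ReadBuilder readBuilder =
        Parquet.read(Files.localInput(parquetFile))
            .project(schema)
            .recordsPerBatch(batchSize)
            .createBatchedReaderFunc(
                // false: do not set and check the Arrow validity vector
                type -> VectorizedSparkParquetReaders.buildReader(schema, type, false));
    // Reuse the same container objects across batches, as the tests do when reuseContainers=true.
    readBuilder.reuseContainers();

    long numRows = 0;
    try (CloseableIterable<ColumnarBatch> batches = readBuilder.build()) {
      for (ColumnarBatch batch : batches) {
        numRows += batch.numRows();
      }
    }
    return numRows;
  }
}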
- writeAndValidate(new Schema( + writeAndValidate( + new Schema( Lists.newArrayList( - SUPPORTED_PRIMITIVES.field("id"), - SUPPORTED_PRIMITIVES.field("data"))), - 10, 0L, RandomData.DEFAULT_NULL_PERCENTAGE, - true, true, 2, + SUPPORTED_PRIMITIVES.field("id"), SUPPORTED_PRIMITIVES.field("data"))), + 10, + 0L, + RandomData.DEFAULT_NULL_PERCENTAGE, + true, + true, + 2, record -> { if (record.get("data") != null) { record.put("data", Strings.padEnd((String) record.get("data"), 512, 'a')); @@ -244,65 +285,67 @@ record -> { @Test public void testReadsForTypePromotedColumns() throws Exception { - Schema writeSchema = new Schema( - required(100, "id", Types.LongType.get()), - optional(101, "int_data", Types.IntegerType.get()), - optional(102, "float_data", Types.FloatType.get()), - optional(103, "decimal_data", Types.DecimalType.of(10, 5)) - ); + Schema writeSchema = + new Schema( + required(100, "id", Types.LongType.get()), + optional(101, "int_data", Types.IntegerType.get()), + optional(102, "float_data", Types.FloatType.get()), + optional(103, "decimal_data", Types.DecimalType.of(10, 5))); File dataFile = temp.newFile(); Assert.assertTrue("Delete should succeed", dataFile.delete()); - Iterable data = generateData(writeSchema, 30000, 0L, - RandomData.DEFAULT_NULL_PERCENTAGE, IDENTITY); + Iterable data = + generateData(writeSchema, 30000, 0L, RandomData.DEFAULT_NULL_PERCENTAGE, IDENTITY); try (FileAppender writer = getParquetWriter(writeSchema, dataFile)) { writer.addAll(data); } - Schema readSchema = new Schema( - required(100, "id", Types.LongType.get()), - optional(101, "int_data", Types.LongType.get()), - optional(102, "float_data", Types.DoubleType.get()), - optional(103, "decimal_data", Types.DecimalType.of(25, 5)) - ); + Schema readSchema = + new Schema( + required(100, "id", Types.LongType.get()), + optional(101, "int_data", Types.LongType.get()), + optional(102, "float_data", Types.DoubleType.get()), + optional(103, "decimal_data", Types.DecimalType.of(25, 5))); - assertRecordsMatch(readSchema, 30000, data, dataFile, false, - true, BATCH_SIZE); + assertRecordsMatch(readSchema, 30000, data, dataFile, false, true, BATCH_SIZE); } @Test public void testSupportedReadsForParquetV2() throws Exception { // Only float and double column types are written using plain encoding with Parquet V2 - Schema schema = new Schema( + Schema schema = + new Schema( optional(102, "float_data", Types.FloatType.get()), optional(103, "double_data", Types.DoubleType.get())); File dataFile = temp.newFile(); Assert.assertTrue("Delete should succeed", dataFile.delete()); - Iterable data = generateData(schema, 30000, 0L, - RandomData.DEFAULT_NULL_PERCENTAGE, IDENTITY); + Iterable data = + generateData(schema, 30000, 0L, RandomData.DEFAULT_NULL_PERCENTAGE, IDENTITY); try (FileAppender writer = getParquetV2Writer(schema, dataFile)) { writer.addAll(data); } - assertRecordsMatch(schema, 30000, data, dataFile, false, - true, BATCH_SIZE); + assertRecordsMatch(schema, 30000, data, dataFile, false, true, BATCH_SIZE); } @Test public void testUnsupportedReadsForParquetV2() throws Exception { - // Longs, ints, string types etc use delta encoding and which are not supported for vectorized reads + // Longs, ints, string types etc use delta encoding and which are not supported for vectorized + // reads Schema schema = new Schema(SUPPORTED_PRIMITIVES.fields()); File dataFile = temp.newFile(); Assert.assertTrue("Delete should succeed", dataFile.delete()); - Iterable data = generateData(schema, 30000, 0L, - 
RandomData.DEFAULT_NULL_PERCENTAGE, IDENTITY); + Iterable data = + generateData(schema, 30000, 0L, RandomData.DEFAULT_NULL_PERCENTAGE, IDENTITY); try (FileAppender writer = getParquetV2Writer(schema, dataFile)) { writer.addAll(data); } - AssertHelpers.assertThrows("Vectorized reads not supported", - UnsupportedOperationException.class, "Cannot support vectorized reads for column", () -> { - assertRecordsMatch(schema, 30000, data, dataFile, false, - true, BATCH_SIZE); + AssertHelpers.assertThrows( + "Vectorized reads not supported", + UnsupportedOperationException.class, + "Cannot support vectorized reads for column", + () -> { + assertRecordsMatch(schema, 30000, data, dataFile, false, true, BATCH_SIZE); return null; }); } diff --git a/spark/v3.1/spark/src/test/java/org/apache/iceberg/spark/source/LogMessage.java b/spark/v3.1/spark/src/test/java/org/apache/iceberg/spark/source/LogMessage.java index 5e22daeb0841..53a35eec61ce 100644 --- a/spark/v3.1/spark/src/test/java/org/apache/iceberg/spark/source/LogMessage.java +++ b/spark/v3.1/spark/src/test/java/org/apache/iceberg/spark/source/LogMessage.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.source; import java.time.Instant; diff --git a/spark/v3.1/spark/src/test/java/org/apache/iceberg/spark/source/ManualSource.java b/spark/v3.1/spark/src/test/java/org/apache/iceberg/spark/source/ManualSource.java index 0f8c8b3b65c6..c9c1c29ea8fc 100644 --- a/spark/v3.1/spark/src/test/java/org/apache/iceberg/spark/source/ManualSource.java +++ b/spark/v3.1/spark/src/test/java/org/apache/iceberg/spark/source/ManualSource.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.source; import java.util.Map; @@ -35,7 +34,8 @@ public class ManualSource implements TableProvider, DataSourceRegister { private static final Map tableMap = Maps.newHashMap(); public static void setTable(String name, Table table) { - Preconditions.checkArgument(!tableMap.containsKey(name), "Cannot set " + name + ". It is already set"); + Preconditions.checkArgument( + !tableMap.containsKey(name), "Cannot set " + name + ". It is already set"); tableMap.put(name, table); } @@ -61,7 +61,8 @@ public Transform[] inferPartitioning(CaseInsensitiveStringMap options) { @Override public org.apache.spark.sql.connector.catalog.Table getTable( StructType schema, Transform[] partitioning, Map properties) { - Preconditions.checkArgument(properties.containsKey(TABLE_NAME), "Missing property " + TABLE_NAME); + Preconditions.checkArgument( + properties.containsKey(TABLE_NAME), "Missing property " + TABLE_NAME); String tableName = properties.get(TABLE_NAME); Preconditions.checkArgument(tableMap.containsKey(tableName), "Table missing " + tableName); return tableMap.get(tableName); diff --git a/spark/v3.1/spark/src/test/java/org/apache/iceberg/spark/source/SimpleRecord.java b/spark/v3.1/spark/src/test/java/org/apache/iceberg/spark/source/SimpleRecord.java index c8b7a31b3ba0..550e20b9338e 100644 --- a/spark/v3.1/spark/src/test/java/org/apache/iceberg/spark/source/SimpleRecord.java +++ b/spark/v3.1/spark/src/test/java/org/apache/iceberg/spark/source/SimpleRecord.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
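// --- Editor's sketch (not part of this PR): a hedged version of the Parquet V2 writer used by
// testSupportedReadsForParquetV2 / testUnsupportedReadsForParquetV2 above. With PARQUET_2_0,
// float and double columns keep PLAIN encoding and still read vectorized, while the other
// primitive types switch to delta encodings and make the vectorized reader throw
// UnsupportedOperationException. Schema and output file are assumed inputs.
import java.io.File;
import java.io.IOException;
import org.apache.avro.generic.GenericData;
import org.apache.iceberg.Files;
import org.apache.iceberg.Schema;
import org.apache.iceberg.io.FileAppender;
import org.apache.iceberg.parquet.Parquet;
import org.apache.parquet.column.ParquetProperties;

class ParquetV2WriterSketch {
  static FileAppender<GenericData.Record> newV2Writer(Schema schema, File outFile)
      throws IOException {
    return Parquet.write(Files.localOutput(outFile))
        .schema(schema)
        .named("test")
        // Ask parquet-mr for the V2 writer, which picks V2 encodings where it can.
        .writerVersion(ParquetProperties.WriterVersion.PARQUET_2_0)
        .build();
  }
}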
*/ - package org.apache.iceberg.spark.source; import org.apache.iceberg.relocated.com.google.common.base.Objects; @@ -25,8 +24,7 @@ public class SimpleRecord { private Integer id; private String data; - public SimpleRecord() { - } + public SimpleRecord() {} public SimpleRecord(Integer id, String data) { this.id = id; diff --git a/spark/v3.1/spark/src/test/java/org/apache/iceberg/spark/source/SparkTestTable.java b/spark/v3.1/spark/src/test/java/org/apache/iceberg/spark/source/SparkTestTable.java index afb1136f4fa5..ff0afd87782d 100644 --- a/spark/v3.1/spark/src/test/java/org/apache/iceberg/spark/source/SparkTestTable.java +++ b/spark/v3.1/spark/src/test/java/org/apache/iceberg/spark/source/SparkTestTable.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.source; import org.apache.iceberg.MetadataColumns; diff --git a/spark/v3.1/spark/src/test/java/org/apache/iceberg/spark/source/TestAvroScan.java b/spark/v3.1/spark/src/test/java/org/apache/iceberg/spark/source/TestAvroScan.java index 4d2e12229813..9491adde4605 100644 --- a/spark/v3.1/spark/src/test/java/org/apache/iceberg/spark/source/TestAvroScan.java +++ b/spark/v3.1/spark/src/test/java/org/apache/iceberg/spark/source/TestAvroScan.java @@ -16,9 +16,10 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.source; +import static org.apache.iceberg.Files.localOutput; + import java.io.File; import java.io.IOException; import java.util.List; @@ -46,13 +47,10 @@ import org.junit.Rule; import org.junit.rules.TemporaryFolder; -import static org.apache.iceberg.Files.localOutput; - public class TestAvroScan extends AvroDataTest { private static final Configuration CONF = new Configuration(); - @Rule - public TemporaryFolder temp = new TemporaryFolder(); + @Rule public TemporaryFolder temp = new TemporaryFolder(); private static SparkSession spark = null; @@ -75,8 +73,8 @@ protected void writeAndValidate(Schema schema) throws IOException { File dataFolder = new File(location, "data"); dataFolder.mkdirs(); - File avroFile = new File(dataFolder, - FileFormat.AVRO.addExtension(UUID.randomUUID().toString())); + File avroFile = + new File(dataFolder, FileFormat.AVRO.addExtension(UUID.randomUUID().toString())); HadoopTables tables = new HadoopTables(CONF); Table table = tables.create(schema, PartitionSpec.unpartitioned(), location.toString()); @@ -87,23 +85,21 @@ protected void writeAndValidate(Schema schema) throws IOException { List expected = RandomData.generateList(tableSchema, 100, 1L); - try (FileAppender writer = Avro.write(localOutput(avroFile)) - .schema(tableSchema) - .build()) { + try (FileAppender writer = + Avro.write(localOutput(avroFile)).schema(tableSchema).build()) { writer.addAll(expected); } - DataFile file = DataFiles.builder(PartitionSpec.unpartitioned()) - .withRecordCount(100) - .withFileSizeInBytes(avroFile.length()) - .withPath(avroFile.toString()) - .build(); + DataFile file = + DataFiles.builder(PartitionSpec.unpartitioned()) + .withRecordCount(100) + .withFileSizeInBytes(avroFile.length()) + .withPath(avroFile.toString()) + .build(); table.newAppend().appendFile(file).commit(); - Dataset df = spark.read() - .format("iceberg") - .load(location.toString()); + Dataset df = spark.read().format("iceberg").load(location.toString()); List rows = df.collectAsList(); Assert.assertEquals("Should contain 100 rows", 100, rows.size()); diff --git 
a/spark/v3.1/spark/src/test/java/org/apache/iceberg/spark/source/TestDataFrameWrites.java b/spark/v3.1/spark/src/test/java/org/apache/iceberg/spark/source/TestDataFrameWrites.java index b0a77b72b431..9f32769379c8 100644 --- a/spark/v3.1/spark/src/test/java/org/apache/iceberg/spark/source/TestDataFrameWrites.java +++ b/spark/v3.1/spark/src/test/java/org/apache/iceberg/spark/source/TestDataFrameWrites.java @@ -16,9 +16,12 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.source; +import static org.apache.iceberg.spark.SparkSchemaUtil.convert; +import static org.apache.iceberg.spark.data.TestHelpers.assertEqualsSafe; +import static org.apache.iceberg.spark.data.TestHelpers.assertEqualsUnsafe; + import java.io.File; import java.io.IOException; import java.net.URI; @@ -68,10 +71,6 @@ import org.junit.runner.RunWith; import org.junit.runners.Parameterized; -import static org.apache.iceberg.spark.SparkSchemaUtil.convert; -import static org.apache.iceberg.spark.data.TestHelpers.assertEqualsSafe; -import static org.apache.iceberg.spark.data.TestHelpers.assertEqualsUnsafe; - @RunWith(Parameterized.class) public class TestDataFrameWrites extends AvroDataTest { private static final Configuration CONF = new Configuration(); @@ -80,7 +79,7 @@ public class TestDataFrameWrites extends AvroDataTest { @Parameterized.Parameters(name = "format = {0}") public static Object[] parameters() { - return new Object[] { "parquet", "avro", "orc" }; + return new Object[] {"parquet", "avro", "orc"}; } public TestDataFrameWrites(String format) { @@ -92,32 +91,36 @@ public TestDataFrameWrites(String format) { private Map tableProperties; - private org.apache.spark.sql.types.StructType sparkSchema = new org.apache.spark.sql.types.StructType( - new org.apache.spark.sql.types.StructField[] { - new org.apache.spark.sql.types.StructField( - "optionalField", - org.apache.spark.sql.types.DataTypes.StringType, - true, - org.apache.spark.sql.types.Metadata.empty()), - new org.apache.spark.sql.types.StructField( - "requiredField", - org.apache.spark.sql.types.DataTypes.StringType, - false, - org.apache.spark.sql.types.Metadata.empty()) - }); - - private Schema icebergSchema = new Schema( - Types.NestedField.optional(1, "optionalField", Types.StringType.get()), - Types.NestedField.required(2, "requiredField", Types.StringType.get())); - - private List data0 = Arrays.asList( - "{\"optionalField\": \"a1\", \"requiredField\": \"bid_001\"}", - "{\"optionalField\": \"a2\", \"requiredField\": \"bid_002\"}"); - private List data1 = Arrays.asList( - "{\"optionalField\": \"d1\", \"requiredField\": \"bid_101\"}", - "{\"optionalField\": \"d2\", \"requiredField\": \"bid_102\"}", - "{\"optionalField\": \"d3\", \"requiredField\": \"bid_103\"}", - "{\"optionalField\": \"d4\", \"requiredField\": \"bid_104\"}"); + private org.apache.spark.sql.types.StructType sparkSchema = + new org.apache.spark.sql.types.StructType( + new org.apache.spark.sql.types.StructField[] { + new org.apache.spark.sql.types.StructField( + "optionalField", + org.apache.spark.sql.types.DataTypes.StringType, + true, + org.apache.spark.sql.types.Metadata.empty()), + new org.apache.spark.sql.types.StructField( + "requiredField", + org.apache.spark.sql.types.DataTypes.StringType, + false, + org.apache.spark.sql.types.Metadata.empty()) + }); + + private Schema icebergSchema = + new Schema( + Types.NestedField.optional(1, "optionalField", Types.StringType.get()), + Types.NestedField.required(2, "requiredField", 
Types.StringType.get())); + + private List data0 = + Arrays.asList( + "{\"optionalField\": \"a1\", \"requiredField\": \"bid_001\"}", + "{\"optionalField\": \"a2\", \"requiredField\": \"bid_002\"}"); + private List data1 = + Arrays.asList( + "{\"optionalField\": \"d1\", \"requiredField\": \"bid_101\"}", + "{\"optionalField\": \"d2\", \"requiredField\": \"bid_102\"}", + "{\"optionalField\": \"d3\", \"requiredField\": \"bid_103\"}", + "{\"optionalField\": \"d4\", \"requiredField\": \"bid_104\"}"); @BeforeClass public static void startSpark() { @@ -145,8 +148,10 @@ public void testWriteWithCustomDataLocation() throws IOException { File location = createTableFolder(); File tablePropertyDataLocation = temp.newFolder("test-table-property-data-dir"); Table table = createTable(new Schema(SUPPORTED_PRIMITIVES.fields()), location); - table.updateProperties().set( - TableProperties.WRITE_DATA_LOCATION, tablePropertyDataLocation.getAbsolutePath()).commit(); + table + .updateProperties() + .set(TableProperties.WRITE_DATA_LOCATION, tablePropertyDataLocation.getAbsolutePath()) + .commit(); writeAndValidateWithLocations(table, location, tablePropertyDataLocation); } @@ -162,7 +167,8 @@ private Table createTable(Schema schema, File location) { return tables.create(schema, PartitionSpec.unpartitioned(), location.toString()); } - private void writeAndValidateWithLocations(Table table, File location, File expectedDataDir) throws IOException { + private void writeAndValidateWithLocations(Table table, File location, File expectedDataDir) + throws IOException { Schema tableSchema = table.schema(); // use the table schema because ids are reassigned table.updateProperties().set(TableProperties.DEFAULT_FILE_FORMAT, format).commit(); @@ -179,47 +185,56 @@ private void writeAndValidateWithLocations(Table table, File location, File expe while (expectedIter.hasNext() && actualIter.hasNext()) { assertEqualsSafe(tableSchema.asStruct(), expectedIter.next(), actualIter.next()); } - Assert.assertEquals("Both iterators should be exhausted", expectedIter.hasNext(), actualIter.hasNext()); - - table.currentSnapshot().addedDataFiles(table.io()).forEach(dataFile -> - Assert.assertTrue( - String.format( - "File should have the parent directory %s, but has: %s.", - expectedDataDir.getAbsolutePath(), - dataFile.path()), - URI.create(dataFile.path().toString()).getPath().startsWith(expectedDataDir.getAbsolutePath()))); + Assert.assertEquals( + "Both iterators should be exhausted", expectedIter.hasNext(), actualIter.hasNext()); + + table + .currentSnapshot() + .addedDataFiles(table.io()) + .forEach( + dataFile -> + Assert.assertTrue( + String.format( + "File should have the parent directory %s, but has: %s.", + expectedDataDir.getAbsolutePath(), dataFile.path()), + URI.create(dataFile.path().toString()) + .getPath() + .startsWith(expectedDataDir.getAbsolutePath()))); } private List readTable(String location) { - Dataset result = spark.read() - .format("iceberg") - .load(location); + Dataset result = spark.read().format("iceberg").load(location); return result.collectAsList(); } - private void writeData(Iterable records, Schema schema, String location) throws IOException { + private void writeData(Iterable records, Schema schema, String location) + throws IOException { Dataset df = createDataset(records, schema); DataFrameWriter writer = df.write().format("iceberg").mode("append"); writer.save(location); } - private void writeDataWithFailOnPartition(Iterable records, Schema schema, String location) - throws IOException, SparkException { 
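// --- Editor's sketch (not part of this PR): a hedged summary of the write path checked by
// testWriteWithCustomDataLocation and writeAndValidateWithLocations above: setting
// WRITE_DATA_LOCATION redirects newly added data files to a separate directory, which the test
// verifies by asserting every added file path starts with that directory. Table, DataFrame and
// paths are assumed inputs.
import org.apache.iceberg.Table;
import org.apache.iceberg.TableProperties;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;

class CustomDataLocationSketch {
  static void appendWithCustomDataDir(
      Table table, Dataset<Row> df, String tableLocation, String dataDir) {
    // Route new data files to a custom directory while metadata stays under the table location.
    table.updateProperties().set(TableProperties.WRITE_DATA_LOCATION, dataDir).commit();

    // Append through the Iceberg source; the snapshot's added files should land under dataDir.
    df.write().format("iceberg").mode("append").save(tableLocation);
  }
}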
+ private void writeDataWithFailOnPartition( + Iterable records, Schema schema, String location) throws IOException, SparkException { final int numPartitions = 10; final int partitionToFail = new Random().nextInt(numPartitions); - MapPartitionsFunction failOnFirstPartitionFunc = (MapPartitionsFunction) input -> { - int partitionId = TaskContext.getPartitionId(); - - if (partitionId == partitionToFail) { - throw new SparkException(String.format("Intended exception in partition %d !", partitionId)); - } - return input; - }; - - Dataset df = createDataset(records, schema) - .repartition(numPartitions) - .mapPartitions(failOnFirstPartitionFunc, RowEncoder.apply(convert(schema))); + MapPartitionsFunction failOnFirstPartitionFunc = + (MapPartitionsFunction) + input -> { + int partitionId = TaskContext.getPartitionId(); + + if (partitionId == partitionToFail) { + throw new SparkException( + String.format("Intended exception in partition %d !", partitionId)); + } + return input; + }; + + Dataset df = + createDataset(records, schema) + .repartition(numPartitions) + .mapPartitions(failOnFirstPartitionFunc, RowEncoder.apply(convert(schema))); // This trick is needed because Spark 3 handles decimal overflow in RowEncoder which "changes" // nullability of the column to "true" regardless of original nullability. // Setting "check-nullability" option to "false" doesn't help as it fails at Spark analyzer. @@ -234,10 +249,8 @@ private Dataset createDataset(Iterable records, Schema schema) thro File testFile = temp.newFile(); Assert.assertTrue("Delete should succeed", testFile.delete()); - try (FileAppender writer = Avro.write(Files.localOutput(testFile)) - .schema(schema) - .named("test") - .build()) { + try (FileAppender writer = + Avro.write(Files.localOutput(testFile)).schema(schema).named("test").build()) { for (Record rec : records) { writer.add(rec); } @@ -245,10 +258,11 @@ private Dataset createDataset(Iterable records, Schema schema) thro // make sure the dataframe matches the records before moving on List rows = Lists.newArrayList(); - try (AvroIterable reader = Avro.read(Files.localInput(testFile)) - .createReaderFunc(SparkAvroReader::new) - .project(schema) - .build()) { + try (AvroIterable reader = + Avro.read(Files.localInput(testFile)) + .createReaderFunc(SparkAvroReader::new) + .project(schema) + .build()) { Iterator recordIter = records.iterator(); Iterator readIter = reader.iterator(); @@ -257,7 +271,8 @@ private Dataset createDataset(Iterable records, Schema schema) thro assertEqualsUnsafe(schema.asStruct(), recordIter.next(), row); rows.add(row); } - Assert.assertEquals("Both iterators should be exhausted", recordIter.hasNext(), readIter.hasNext()); + Assert.assertEquals( + "Both iterators should be exhausted", recordIter.hasNext(), readIter.hasNext()); } JavaRDD rdd = sc.parallelize(rows); @@ -266,7 +281,8 @@ private Dataset createDataset(Iterable records, Schema schema) thro @Test public void testNullableWithWriteOption() throws IOException { - Assume.assumeTrue("Spark 3.0 rejects writing nulls to a required column", spark.version().startsWith("2")); + Assume.assumeTrue( + "Spark 3.0 rejects writing nulls to a required column", spark.version().startsWith("2")); File location = new File(temp.newFolder("parquet"), "test"); String sourcePath = String.format("%s/nullable_poc/sourceFolder/", location.toString()); @@ -276,9 +292,11 @@ public void testNullableWithWriteOption() throws IOException { // read this and append to iceberg dataset spark - .read().schema(sparkSchema).json( - 
JavaSparkContext.fromSparkContext(spark.sparkContext()).parallelize(data1)) - .write().parquet(sourcePath); + .read() + .schema(sparkSchema) + .json(JavaSparkContext.fromSparkContext(spark.sparkContext()).parallelize(data1)) + .write() + .parquet(sourcePath); // this is our iceberg dataset to which we will append data new HadoopTables(spark.sessionState().newHadoopConf()) @@ -290,15 +308,24 @@ public void testNullableWithWriteOption() throws IOException { // this is the initial data inside the iceberg dataset spark - .read().schema(sparkSchema).json( - JavaSparkContext.fromSparkContext(spark.sparkContext()).parallelize(data0)) - .write().format("iceberg").mode(SaveMode.Append).save(targetPath); + .read() + .schema(sparkSchema) + .json(JavaSparkContext.fromSparkContext(spark.sparkContext()).parallelize(data0)) + .write() + .format("iceberg") + .mode(SaveMode.Append) + .save(targetPath); // read from parquet and append to iceberg w/ nullability check disabled spark - .read().schema(SparkSchemaUtil.convert(icebergSchema)).parquet(sourcePath) - .write().format("iceberg").option(SparkWriteOptions.CHECK_NULLABILITY, false) - .mode(SaveMode.Append).save(targetPath); + .read() + .schema(SparkSchemaUtil.convert(icebergSchema)) + .parquet(sourcePath) + .write() + .format("iceberg") + .option(SparkWriteOptions.CHECK_NULLABILITY, false) + .mode(SaveMode.Append) + .save(targetPath); // read all data List rows = spark.read().format("iceberg").load(targetPath).collectAsList(); @@ -307,7 +334,8 @@ public void testNullableWithWriteOption() throws IOException { @Test public void testNullableWithSparkSqlOption() throws IOException { - Assume.assumeTrue("Spark 3.0 rejects writing nulls to a required column", spark.version().startsWith("2")); + Assume.assumeTrue( + "Spark 3.0 rejects writing nulls to a required column", spark.version().startsWith("2")); File location = new File(temp.newFolder("parquet"), "test"); String sourcePath = String.format("%s/nullable_poc/sourceFolder/", location.toString()); @@ -317,15 +345,18 @@ public void testNullableWithSparkSqlOption() throws IOException { // read this and append to iceberg dataset spark - .read().schema(sparkSchema).json( - JavaSparkContext.fromSparkContext(spark.sparkContext()).parallelize(data1)) - .write().parquet(sourcePath); - - SparkSession newSparkSession = SparkSession.builder() - .master("local[2]") - .appName("NullableTest") - .config(SparkSQLProperties.CHECK_NULLABILITY, false) - .getOrCreate(); + .read() + .schema(sparkSchema) + .json(JavaSparkContext.fromSparkContext(spark.sparkContext()).parallelize(data1)) + .write() + .parquet(sourcePath); + + SparkSession newSparkSession = + SparkSession.builder() + .master("local[2]") + .appName("NullableTest") + .config(SparkSQLProperties.CHECK_NULLABILITY, false) + .getOrCreate(); // this is our iceberg dataset to which we will append data new HadoopTables(newSparkSession.sessionState().newHadoopConf()) @@ -337,19 +368,27 @@ public void testNullableWithSparkSqlOption() throws IOException { // this is the initial data inside the iceberg dataset newSparkSession - .read().schema(sparkSchema).json( - JavaSparkContext.fromSparkContext(spark.sparkContext()).parallelize(data0)) - .write().format("iceberg").mode(SaveMode.Append).save(targetPath); + .read() + .schema(sparkSchema) + .json(JavaSparkContext.fromSparkContext(spark.sparkContext()).parallelize(data0)) + .write() + .format("iceberg") + .mode(SaveMode.Append) + .save(targetPath); // read from parquet and append to iceberg newSparkSession - 
.read().schema(SparkSchemaUtil.convert(icebergSchema)).parquet(sourcePath) - .write().format("iceberg").mode(SaveMode.Append).save(targetPath); + .read() + .schema(SparkSchemaUtil.convert(icebergSchema)) + .parquet(sourcePath) + .write() + .format("iceberg") + .mode(SaveMode.Append) + .save(targetPath); // read all data List rows = newSparkSession.read().format("iceberg").load(targetPath).collectAsList(); Assert.assertEquals("Should contain 6 rows", 6, rows.size()); - } @Test diff --git a/spark/v3.1/spark/src/test/java/org/apache/iceberg/spark/source/TestDataSourceOptions.java b/spark/v3.1/spark/src/test/java/org/apache/iceberg/spark/source/TestDataSourceOptions.java index ffcb86052074..750474564078 100644 --- a/spark/v3.1/spark/src/test/java/org/apache/iceberg/spark/source/TestDataSourceOptions.java +++ b/spark/v3.1/spark/src/test/java/org/apache/iceberg/spark/source/TestDataSourceOptions.java @@ -16,9 +16,10 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.source; +import static org.apache.iceberg.types.Types.NestedField.optional; + import java.io.IOException; import java.math.RoundingMode; import java.util.List; @@ -58,19 +59,15 @@ import org.junit.Test; import org.junit.rules.TemporaryFolder; -import static org.apache.iceberg.types.Types.NestedField.optional; - public class TestDataSourceOptions { private static final Configuration CONF = new Configuration(); - private static final Schema SCHEMA = new Schema( - optional(1, "id", Types.IntegerType.get()), - optional(2, "data", Types.StringType.get()) - ); + private static final Schema SCHEMA = + new Schema( + optional(1, "id", Types.IntegerType.get()), optional(2, "data", Types.StringType.get())); private static SparkSession spark = null; - @Rule - public TemporaryFolder temp = new TemporaryFolder(); + @Rule public TemporaryFolder temp = new TemporaryFolder(); @BeforeClass public static void startSpark() { @@ -94,23 +91,23 @@ public void testWriteFormatOptionOverridesTableProperties() throws IOException { options.put(TableProperties.DEFAULT_FILE_FORMAT, "avro"); Table table = tables.create(SCHEMA, spec, options, tableLocation); - List expectedRecords = Lists.newArrayList( - new SimpleRecord(1, "a"), - new SimpleRecord(2, "b"), - new SimpleRecord(3, "c") - ); + List expectedRecords = + Lists.newArrayList( + new SimpleRecord(1, "a"), new SimpleRecord(2, "b"), new SimpleRecord(3, "c")); Dataset df = spark.createDataFrame(expectedRecords, SimpleRecord.class); - df.select("id", "data").write() + df.select("id", "data") + .write() .format("iceberg") .option(SparkWriteOptions.WRITE_FORMAT, "parquet") .mode(SaveMode.Append) .save(tableLocation); try (CloseableIterable tasks = table.newScan().planFiles()) { - tasks.forEach(task -> { - FileFormat fileFormat = FileFormat.fromFileName(task.file().path()); - Assert.assertEquals(FileFormat.PARQUET, fileFormat); - }); + tasks.forEach( + task -> { + FileFormat fileFormat = FileFormat.fromFileName(task.file().path()); + Assert.assertEquals(FileFormat.PARQUET, fileFormat); + }); } } @@ -124,22 +121,18 @@ public void testNoWriteFormatOption() throws IOException { options.put(TableProperties.DEFAULT_FILE_FORMAT, "avro"); Table table = tables.create(SCHEMA, spec, options, tableLocation); - List expectedRecords = Lists.newArrayList( - new SimpleRecord(1, "a"), - new SimpleRecord(2, "b"), - new SimpleRecord(3, "c") - ); + List expectedRecords = + Lists.newArrayList( + new SimpleRecord(1, "a"), new SimpleRecord(2, "b"), new SimpleRecord(3, 
"c")); Dataset df = spark.createDataFrame(expectedRecords, SimpleRecord.class); - df.select("id", "data").write() - .format("iceberg") - .mode("append") - .save(tableLocation); + df.select("id", "data").write().format("iceberg").mode("append").save(tableLocation); try (CloseableIterable tasks = table.newScan().planFiles()) { - tasks.forEach(task -> { - FileFormat fileFormat = FileFormat.fromFileName(task.file().path()); - Assert.assertEquals(FileFormat.AVRO, fileFormat); - }); + tasks.forEach( + task -> { + FileFormat fileFormat = FileFormat.fromFileName(task.file().path()); + Assert.assertEquals(FileFormat.AVRO, fileFormat); + }); } } @@ -159,24 +152,25 @@ public void testHadoopOptions() throws IOException { // to verify that 'hadoop.' data source options are propagated correctly sparkHadoopConf.set("fs.default.name", "hdfs://localhost:9000"); - List expectedRecords = Lists.newArrayList( - new SimpleRecord(1, "a"), - new SimpleRecord(2, "b") - ); + List expectedRecords = + Lists.newArrayList(new SimpleRecord(1, "a"), new SimpleRecord(2, "b")); Dataset originalDf = spark.createDataFrame(expectedRecords, SimpleRecord.class); - originalDf.select("id", "data").write() + originalDf + .select("id", "data") + .write() .format("iceberg") .mode("append") .option("hadoop.fs.default.name", "file:///") .save(tableLocation); - Dataset resultDf = spark.read() - .format("iceberg") - .option("hadoop.fs.default.name", "file:///") - .load(tableLocation); - List resultRecords = resultDf.orderBy("id") - .as(Encoders.bean(SimpleRecord.class)) - .collectAsList(); + Dataset resultDf = + spark + .read() + .format("iceberg") + .option("hadoop.fs.default.name", "file:///") + .load(tableLocation); + List resultRecords = + resultDf.orderBy("id").as(Encoders.bean(SimpleRecord.class)).collectAsList(); Assert.assertEquals("Records should match", expectedRecords, resultRecords); } finally { @@ -192,31 +186,35 @@ public void testSplitOptionsOverridesTableProperties() throws IOException { PartitionSpec spec = PartitionSpec.unpartitioned(); Map options = Maps.newHashMap(); options.put(TableProperties.SPLIT_SIZE, String.valueOf(128L * 1024 * 1024)); // 128Mb - options.put(TableProperties.DEFAULT_FILE_FORMAT, String.valueOf(FileFormat.AVRO)); // Arbitrarily splittable + options.put( + TableProperties.DEFAULT_FILE_FORMAT, + String.valueOf(FileFormat.AVRO)); // Arbitrarily splittable Table icebergTable = tables.create(SCHEMA, spec, options, tableLocation); - List expectedRecords = Lists.newArrayList( - new SimpleRecord(1, "a"), - new SimpleRecord(2, "b") - ); + List expectedRecords = + Lists.newArrayList(new SimpleRecord(1, "a"), new SimpleRecord(2, "b")); Dataset originalDf = spark.createDataFrame(expectedRecords, SimpleRecord.class); - originalDf.select("id", "data") + originalDf + .select("id", "data") .repartition(1) .write() .format("iceberg") .mode("append") .save(tableLocation); - List files = Lists.newArrayList(icebergTable.currentSnapshot().addedDataFiles(icebergTable.io())); + List files = + Lists.newArrayList(icebergTable.currentSnapshot().addedDataFiles(icebergTable.io())); Assert.assertEquals("Should have written 1 file", 1, files.size()); long fileSize = files.get(0).fileSizeInBytes(); long splitSize = LongMath.divide(fileSize, 2, RoundingMode.CEILING); - Dataset resultDf = spark.read() - .format("iceberg") - .option(SparkReadOptions.SPLIT_SIZE, String.valueOf(splitSize)) - .load(tableLocation); + Dataset resultDf = + spark + .read() + .format("iceberg") + .option(SparkReadOptions.SPLIT_SIZE, 
String.valueOf(splitSize)) + .load(tableLocation); Assert.assertEquals("Spark partitions should match", 2, resultDf.javaRDD().getNumPartitions()); } @@ -230,18 +228,16 @@ public void testIncrementalScanOptions() throws IOException { Map options = Maps.newHashMap(); Table table = tables.create(SCHEMA, spec, options, tableLocation); - List expectedRecords = Lists.newArrayList( - new SimpleRecord(1, "a"), - new SimpleRecord(2, "b"), - new SimpleRecord(3, "c"), - new SimpleRecord(4, "d") - ); + List expectedRecords = + Lists.newArrayList( + new SimpleRecord(1, "a"), + new SimpleRecord(2, "b"), + new SimpleRecord(3, "c"), + new SimpleRecord(4, "d")); for (SimpleRecord record : expectedRecords) { - Dataset originalDf = spark.createDataFrame(Lists.newArrayList(record), SimpleRecord.class); - originalDf.select("id", "data").write() - .format("iceberg") - .mode("append") - .save(tableLocation); + Dataset originalDf = + spark.createDataFrame(Lists.newArrayList(record), SimpleRecord.class); + originalDf.select("id", "data").write().format("iceberg").mode("append").save(tableLocation); } List snapshotIds = SnapshotUtil.currentAncestorIds(table); @@ -251,11 +247,13 @@ public void testIncrementalScanOptions() throws IOException { IllegalArgumentException.class, "Cannot specify start-snapshot-id and end-snapshot-id to do incremental scan", () -> { - spark.read() + spark + .read() .format("iceberg") .option("snapshot-id", snapshotIds.get(3).toString()) .option("start-snapshot-id", snapshotIds.get(3).toString()) - .load(tableLocation).explain(); + .load(tableLocation) + .explain(); }); // end-snapshot-id and as-of-timestamp are both configured. @@ -264,12 +262,15 @@ public void testIncrementalScanOptions() throws IOException { IllegalArgumentException.class, "Cannot specify start-snapshot-id and end-snapshot-id to do incremental scan", () -> { - spark.read() + spark + .read() .format("iceberg") - .option(SparkReadOptions.AS_OF_TIMESTAMP, + .option( + SparkReadOptions.AS_OF_TIMESTAMP, Long.toString(table.snapshot(snapshotIds.get(3)).timestampMillis())) .option("end-snapshot-id", snapshotIds.get(2).toString()) - .load(tableLocation).explain(); + .load(tableLocation) + .explain(); }); // only end-snapshot-id is configured. @@ -278,31 +279,37 @@ public void testIncrementalScanOptions() throws IOException { IllegalArgumentException.class, "Cannot only specify option end-snapshot-id to do incremental scan", () -> { - spark.read() + spark + .read() .format("iceberg") .option("end-snapshot-id", snapshotIds.get(2).toString()) - .load(tableLocation).explain(); + .load(tableLocation) + .explain(); }); // test (1st snapshot, current snapshot] incremental scan. - List result = spark.read() - .format("iceberg") - .option("start-snapshot-id", snapshotIds.get(3).toString()) - .load(tableLocation) - .orderBy("id") - .as(Encoders.bean(SimpleRecord.class)) - .collectAsList(); + List result = + spark + .read() + .format("iceberg") + .option("start-snapshot-id", snapshotIds.get(3).toString()) + .load(tableLocation) + .orderBy("id") + .as(Encoders.bean(SimpleRecord.class)) + .collectAsList(); Assert.assertEquals("Records should match", expectedRecords.subList(1, 4), result); // test (2nd snapshot, 3rd snapshot] incremental scan. 
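// --- Editor's sketch (not part of this PR): a hedged sketch of the incremental scan options that
// testIncrementalScanOptions above validates. The scan returns rows from the half-open snapshot
// range (start-snapshot-id, end-snapshot-id]; start-snapshot-id alone reads up to the current
// snapshot, and end-snapshot-id on its own is rejected. Session, table location and snapshot ids
// are assumed inputs.
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SparkSession;

class IncrementalScanSketch {
  static Dataset<Row> incrementalScan(
      SparkSession spark, String tableLocation, long startSnapshotId, long endSnapshotId) {
    return spark
        .read()
        .format("iceberg")
        .option("start-snapshot-id", String.valueOf(startSnapshotId))
        .option("end-snapshot-id", String.valueOf(endSnapshotId))
        .load(tableLocation);
  }
}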
- List result1 = spark.read() - .format("iceberg") - .option("start-snapshot-id", snapshotIds.get(2).toString()) - .option("end-snapshot-id", snapshotIds.get(1).toString()) - .load(tableLocation) - .orderBy("id") - .as(Encoders.bean(SimpleRecord.class)) - .collectAsList(); + List result1 = + spark + .read() + .format("iceberg") + .option("start-snapshot-id", snapshotIds.get(2).toString()) + .option("end-snapshot-id", snapshotIds.get(1).toString()) + .load(tableLocation) + .orderBy("id") + .as(Encoders.bean(SimpleRecord.class)) + .collectAsList(); Assert.assertEquals("Records should match", expectedRecords.subList(2, 3), result1); } @@ -315,41 +322,34 @@ public void testMetadataSplitSizeOptionOverrideTableProperties() throws IOExcept Map options = Maps.newHashMap(); Table table = tables.create(SCHEMA, spec, options, tableLocation); - List expectedRecords = Lists.newArrayList( - new SimpleRecord(1, "a"), - new SimpleRecord(2, "b") - ); + List expectedRecords = + Lists.newArrayList(new SimpleRecord(1, "a"), new SimpleRecord(2, "b")); Dataset originalDf = spark.createDataFrame(expectedRecords, SimpleRecord.class); // produce 1st manifest - originalDf.select("id", "data").write() - .format("iceberg") - .mode("append") - .save(tableLocation); + originalDf.select("id", "data").write().format("iceberg").mode("append").save(tableLocation); // produce 2nd manifest - originalDf.select("id", "data").write() - .format("iceberg") - .mode("append") - .save(tableLocation); + originalDf.select("id", "data").write().format("iceberg").mode("append").save(tableLocation); List manifests = table.currentSnapshot().allManifests(table.io()); Assert.assertEquals("Must be 2 manifests", 2, manifests.size()); // set the target metadata split size so each manifest ends up in a separate split - table.updateProperties() + table + .updateProperties() .set(TableProperties.METADATA_SPLIT_SIZE, String.valueOf(manifests.get(0).length())) .commit(); - Dataset entriesDf = spark.read() - .format("iceberg") - .load(tableLocation + "#entries"); + Dataset entriesDf = spark.read().format("iceberg").load(tableLocation + "#entries"); Assert.assertEquals("Num partitions must match", 2, entriesDf.javaRDD().getNumPartitions()); // override the table property using options - entriesDf = spark.read() - .format("iceberg") - .option(SparkReadOptions.SPLIT_SIZE, String.valueOf(128 * 1024 * 1024)) - .load(tableLocation + "#entries"); + entriesDf = + spark + .read() + .format("iceberg") + .option(SparkReadOptions.SPLIT_SIZE, String.valueOf(128 * 1024 * 1024)) + .load(tableLocation + "#entries"); Assert.assertEquals("Num partitions must match", 1, entriesDf.javaRDD().getNumPartitions()); } @@ -362,24 +362,26 @@ public void testDefaultMetadataSplitSize() throws IOException { Map options = Maps.newHashMap(); Table icebergTable = tables.create(SCHEMA, spec, options, tableLocation); - List expectedRecords = Lists.newArrayList( - new SimpleRecord(1, "a"), - new SimpleRecord(2, "b") - ); + List expectedRecords = + Lists.newArrayList(new SimpleRecord(1, "a"), new SimpleRecord(2, "b")); Dataset originalDf = spark.createDataFrame(expectedRecords, SimpleRecord.class); - originalDf.select("id", "data").write() - .format("iceberg") - .mode("append") - .save(tableLocation); + originalDf.select("id", "data").write().format("iceberg").mode("append").save(tableLocation); int splitSize = (int) TableProperties.METADATA_SPLIT_SIZE_DEFAULT; // 32MB split size - int expectedSplits = ((int) tables.load(tableLocation + "#entries") - 
.currentSnapshot().allManifests(icebergTable.io()).get(0).length() + splitSize - 1) / splitSize; + int expectedSplits = + ((int) + tables + .load(tableLocation + "#entries") + .currentSnapshot() + .allManifests(icebergTable.io()) + .get(0) + .length() + + splitSize + - 1) + / splitSize; - Dataset metadataDf = spark.read() - .format("iceberg") - .load(tableLocation + "#entries"); + Dataset metadataDf = spark.read().format("iceberg").load(tableLocation + "#entries"); int partitionNum = metadataDf.javaRDD().getNumPartitions(); Assert.assertEquals("Spark partitions should match", expectedSplits, partitionNum); @@ -391,17 +393,17 @@ public void testExtraSnapshotMetadata() throws IOException { HadoopTables tables = new HadoopTables(CONF); tables.create(SCHEMA, PartitionSpec.unpartitioned(), Maps.newHashMap(), tableLocation); - List expectedRecords = Lists.newArrayList( - new SimpleRecord(1, "a"), - new SimpleRecord(2, "b") - ); + List expectedRecords = + Lists.newArrayList(new SimpleRecord(1, "a"), new SimpleRecord(2, "b")); Dataset originalDf = spark.createDataFrame(expectedRecords, SimpleRecord.class); - originalDf.select("id", "data").write() - .format("iceberg") - .mode("append") - .option(SparkWriteOptions.SNAPSHOT_PROPERTY_PREFIX + ".extra-key", "someValue") - .option(SparkWriteOptions.SNAPSHOT_PROPERTY_PREFIX + ".another-key", "anotherValue") - .save(tableLocation); + originalDf + .select("id", "data") + .write() + .format("iceberg") + .mode("append") + .option(SparkWriteOptions.SNAPSHOT_PROPERTY_PREFIX + ".extra-key", "someValue") + .option(SparkWriteOptions.SNAPSHOT_PROPERTY_PREFIX + ".another-key", "anotherValue") + .save(tableLocation); Table table = tables.load(tableLocation); @@ -414,26 +416,27 @@ public void testExtraSnapshotMetadataWithSQL() throws InterruptedException, IOEx String tableLocation = temp.newFolder("iceberg-table").toString(); HadoopTables tables = new HadoopTables(CONF); - Table table = tables.create(SCHEMA, PartitionSpec.unpartitioned(), Maps.newHashMap(), tableLocation); + Table table = + tables.create(SCHEMA, PartitionSpec.unpartitioned(), Maps.newHashMap(), tableLocation); - List expectedRecords = Lists.newArrayList( - new SimpleRecord(1, "a"), - new SimpleRecord(2, "b") - ); + List expectedRecords = + Lists.newArrayList(new SimpleRecord(1, "a"), new SimpleRecord(2, "b")); Dataset originalDf = spark.createDataFrame(expectedRecords, SimpleRecord.class); - originalDf.select("id", "data").write() - .format("iceberg") - .mode("append") - .save(tableLocation); + originalDf.select("id", "data").write().format("iceberg").mode("append").save(tableLocation); spark.read().format("iceberg").load(tableLocation).createOrReplaceTempView("target"); - Thread writerThread = new Thread(() -> { - Map properties = Maps.newHashMap(); - properties.put("writer-thread", String.valueOf(Thread.currentThread().getName())); - CommitMetadata.withCommitProperties(properties, () -> { - spark.sql("INSERT INTO target VALUES (3, 'c'), (4, 'd')"); - return 0; - }, RuntimeException.class); - }); + Thread writerThread = + new Thread( + () -> { + Map properties = Maps.newHashMap(); + properties.put("writer-thread", String.valueOf(Thread.currentThread().getName())); + CommitMetadata.withCommitProperties( + properties, + () -> { + spark.sql("INSERT INTO target VALUES (3, 'c'), (4, 'd')"); + return 0; + }, + RuntimeException.class); + }); writerThread.setName("test-extra-commit-message-writer-thread"); writerThread.start(); writerThread.join(); diff --git 
a/spark/v3.1/spark/src/test/java/org/apache/iceberg/spark/source/TestFilteredScan.java b/spark/v3.1/spark/src/test/java/org/apache/iceberg/spark/source/TestFilteredScan.java
index d51fd3c4e8eb..b30bbf145f23 100644
--- a/spark/v3.1/spark/src/test/java/org/apache/iceberg/spark/source/TestFilteredScan.java
+++ b/spark/v3.1/spark/src/test/java/org/apache/iceberg/spark/source/TestFilteredScan.java
@@ -16,9 +16,13 @@
  * specific language governing permissions and limitations
  * under the License.
  */
-
 package org.apache.iceberg.spark.source;
 
+import static org.apache.iceberg.Files.localOutput;
+import static org.apache.spark.sql.catalyst.util.DateTimeUtils.fromJavaTimestamp;
+import static org.apache.spark.sql.functions.callUDF;
+import static org.apache.spark.sql.functions.column;
+
 import java.io.File;
 import java.io.IOException;
 import java.sql.Timestamp;
@@ -79,41 +83,31 @@ import org.junit.runner.RunWith;
 import org.junit.runners.Parameterized;
 
-import static org.apache.iceberg.Files.localOutput;
-import static org.apache.spark.sql.catalyst.util.DateTimeUtils.fromJavaTimestamp;
-import static org.apache.spark.sql.functions.callUDF;
-import static org.apache.spark.sql.functions.column;
-
 @RunWith(Parameterized.class)
 public class TestFilteredScan {
 
   private static final Configuration CONF = new Configuration();
   private static final HadoopTables TABLES = new HadoopTables(CONF);
 
-  private static final Schema SCHEMA = new Schema(
-      Types.NestedField.required(1, "id", Types.LongType.get()),
-      Types.NestedField.optional(2, "ts", Types.TimestampType.withZone()),
-      Types.NestedField.optional(3, "data", Types.StringType.get())
-  );
+  private static final Schema SCHEMA =
+      new Schema(
+          Types.NestedField.required(1, "id", Types.LongType.get()),
+          Types.NestedField.optional(2, "ts", Types.TimestampType.withZone()),
+          Types.NestedField.optional(3, "data", Types.StringType.get()));
 
-  private static final PartitionSpec BUCKET_BY_ID = PartitionSpec.builderFor(SCHEMA)
-      .bucket("id", 4)
-      .build();
+  private static final PartitionSpec BUCKET_BY_ID =
+      PartitionSpec.builderFor(SCHEMA).bucket("id", 4).build();
 
-  private static final PartitionSpec PARTITION_BY_DAY = PartitionSpec.builderFor(SCHEMA)
-      .day("ts")
-      .build();
+  private static final PartitionSpec PARTITION_BY_DAY =
+      PartitionSpec.builderFor(SCHEMA).day("ts").build();
 
-  private static final PartitionSpec PARTITION_BY_HOUR = PartitionSpec.builderFor(SCHEMA)
-      .hour("ts")
-      .build();
+  private static final PartitionSpec PARTITION_BY_HOUR =
+      PartitionSpec.builderFor(SCHEMA).hour("ts").build();
 
-  private static final PartitionSpec PARTITION_BY_DATA = PartitionSpec.builderFor(SCHEMA)
-      .identity("data")
-      .build();
+  private static final PartitionSpec PARTITION_BY_DATA =
+      PartitionSpec.builderFor(SCHEMA).identity("data").build();
 
-  private static final PartitionSpec PARTITION_BY_ID = PartitionSpec.builderFor(SCHEMA)
-      .identity("id")
-      .build();
+  private static final PartitionSpec PARTITION_BY_ID =
+      PartitionSpec.builderFor(SCHEMA).identity("id").build();
 
   private static SparkSession spark = null;
 
@@ -126,14 +120,20 @@ public static void startSpark() {
     spark.udf().register("bucket4", (UDF1) bucket4::apply, IntegerType$.MODULE$);
 
     Transform day = Transforms.day(Types.TimestampType.withZone());
-    spark.udf().register("ts_day",
-        (UDF1) timestamp -> day.apply((Long) fromJavaTimestamp(timestamp)),
-        IntegerType$.MODULE$);
+    spark
+        .udf()
+        .register(
+            "ts_day",
+            (UDF1) timestamp -> day.apply((Long) fromJavaTimestamp(timestamp)),
+            IntegerType$.MODULE$);
Transform hour = Transforms.hour(Types.TimestampType.withZone()); - spark.udf().register("ts_hour", - (UDF1) timestamp -> hour.apply((Long) fromJavaTimestamp(timestamp)), - IntegerType$.MODULE$); + spark + .udf() + .register( + "ts_hour", + (UDF1) timestamp -> hour.apply((Long) fromJavaTimestamp(timestamp)), + IntegerType$.MODULE$); spark.udf().register("data_ident", (UDF1) data -> data, StringType$.MODULE$); spark.udf().register("id_ident", (UDF1) id -> id, LongType$.MODULE$); @@ -146,8 +146,7 @@ public static void stopSpark() { currentSpark.stop(); } - @Rule - public TemporaryFolder temp = new TemporaryFolder(); + @Rule public TemporaryFolder temp = new TemporaryFolder(); private final String format; private final boolean vectorized; @@ -155,11 +154,11 @@ public static void stopSpark() { @Parameterized.Parameters(name = "format = {0}, vectorized = {1}") public static Object[][] parameters() { return new Object[][] { - { "parquet", false }, - { "parquet", true }, - { "avro", false }, - { "orc", false }, - { "orc", true } + {"parquet", false}, + {"parquet", true}, + {"avro", false}, + {"orc", false}, + {"orc", true} }; } @@ -188,26 +187,27 @@ public void writeUnpartitionedTable() throws IOException { this.records = testRecords(tableSchema); - try (FileAppender writer = new GenericAppenderFactory(tableSchema).newAppender( - localOutput(testFile), fileFormat)) { + try (FileAppender writer = + new GenericAppenderFactory(tableSchema).newAppender(localOutput(testFile), fileFormat)) { writer.addAll(records); } - DataFile file = DataFiles.builder(PartitionSpec.unpartitioned()) - .withRecordCount(records.size()) - .withFileSizeInBytes(testFile.length()) - .withPath(testFile.toString()) - .build(); + DataFile file = + DataFiles.builder(PartitionSpec.unpartitioned()) + .withRecordCount(records.size()) + .withFileSizeInBytes(testFile.length()) + .withPath(testFile.toString()) + .build(); table.newAppend().appendFile(file).commit(); } @Test public void testUnpartitionedIDFilters() { - CaseInsensitiveStringMap options = new CaseInsensitiveStringMap(ImmutableMap.of( - "path", unpartitioned.toString()) - ); - SparkScanBuilder builder = new SparkScanBuilder(spark, TABLES.load(options.get("path")), options); + CaseInsensitiveStringMap options = + new CaseInsensitiveStringMap(ImmutableMap.of("path", unpartitioned.toString())); + SparkScanBuilder builder = + new SparkScanBuilder(spark, TABLES.load(options.get("path")), options); for (int i = 0; i < 10; i += 1) { pushFilters(builder, EqualTo.apply("id", i)); @@ -217,16 +217,15 @@ public void testUnpartitionedIDFilters() { Assert.assertEquals("Should only create one task for a small file", 1, partitions.length); // validate row filtering - assertEqualsSafe(SCHEMA.asStruct(), expected(i), - read(unpartitioned.toString(), vectorized, "id = " + i)); + assertEqualsSafe( + SCHEMA.asStruct(), expected(i), read(unpartitioned.toString(), vectorized, "id = " + i)); } } @Test public void testUnpartitionedCaseInsensitiveIDFilters() { - CaseInsensitiveStringMap options = new CaseInsensitiveStringMap(ImmutableMap.of( - "path", unpartitioned.toString()) - ); + CaseInsensitiveStringMap options = + new CaseInsensitiveStringMap(ImmutableMap.of("path", unpartitioned.toString())); // set spark.sql.caseSensitive to false String caseSensitivityBeforeTest = TestFilteredScan.spark.conf().get("spark.sql.caseSensitive"); @@ -235,17 +234,22 @@ public void testUnpartitionedCaseInsensitiveIDFilters() { try { for (int i = 0; i < 10; i += 1) { - SparkScanBuilder builder = new 
SparkScanBuilder(spark, TABLES.load(options.get("path")), options) - .caseSensitive(false); + SparkScanBuilder builder = + new SparkScanBuilder(spark, TABLES.load(options.get("path")), options) + .caseSensitive(false); - pushFilters(builder, EqualTo.apply("ID", i)); // note lower(ID) == lower(id), so there must be a match + pushFilters( + builder, + EqualTo.apply("ID", i)); // note lower(ID) == lower(id), so there must be a match Batch scan = builder.build().toBatch(); InputPartition[] tasks = scan.planInputPartitions(); Assert.assertEquals("Should only create one task for a small file", 1, tasks.length); // validate row filtering - assertEqualsSafe(SCHEMA.asStruct(), expected(i), + assertEqualsSafe( + SCHEMA.asStruct(), + expected(i), read(unpartitioned.toString(), vectorized, "id = " + i)); } } finally { @@ -256,11 +260,11 @@ public void testUnpartitionedCaseInsensitiveIDFilters() { @Test public void testUnpartitionedTimestampFilter() { - CaseInsensitiveStringMap options = new CaseInsensitiveStringMap(ImmutableMap.of( - "path", unpartitioned.toString()) - ); + CaseInsensitiveStringMap options = + new CaseInsensitiveStringMap(ImmutableMap.of("path", unpartitioned.toString())); - SparkScanBuilder builder = new SparkScanBuilder(spark, TABLES.load(options.get("path")), options); + SparkScanBuilder builder = + new SparkScanBuilder(spark, TABLES.load(options.get("path")), options); pushFilters(builder, LessThan.apply("ts", "2017-12-22T00:00:00+00:00")); Batch scan = builder.build().toBatch(); @@ -268,21 +272,29 @@ public void testUnpartitionedTimestampFilter() { InputPartition[] tasks = scan.planInputPartitions(); Assert.assertEquals("Should only create one task for a small file", 1, tasks.length); - assertEqualsSafe(SCHEMA.asStruct(), expected(5, 6, 7, 8, 9), - read(unpartitioned.toString(), vectorized, "ts < cast('2017-12-22 00:00:00+00:00' as timestamp)")); + assertEqualsSafe( + SCHEMA.asStruct(), + expected(5, 6, 7, 8, 9), + read( + unpartitioned.toString(), + vectorized, + "ts < cast('2017-12-22 00:00:00+00:00' as timestamp)")); } @Test public void testBucketPartitionedIDFilters() { Table table = buildPartitionedTable("bucketed_by_id", BUCKET_BY_ID, "bucket4", "id"); - CaseInsensitiveStringMap options = new CaseInsensitiveStringMap(ImmutableMap.of("path", table.location())); + CaseInsensitiveStringMap options = + new CaseInsensitiveStringMap(ImmutableMap.of("path", table.location())); - Batch unfiltered = new SparkScanBuilder(spark, TABLES.load(options.get("path")), options).build().toBatch(); - Assert.assertEquals("Unfiltered table should created 4 read tasks", - 4, unfiltered.planInputPartitions().length); + Batch unfiltered = + new SparkScanBuilder(spark, TABLES.load(options.get("path")), options).build().toBatch(); + Assert.assertEquals( + "Unfiltered table should created 4 read tasks", 4, unfiltered.planInputPartitions().length); for (int i = 0; i < 10; i += 1) { - SparkScanBuilder builder = new SparkScanBuilder(spark, TABLES.load(options.get("path")), options); + SparkScanBuilder builder = + new SparkScanBuilder(spark, TABLES.load(options.get("path")), options); pushFilters(builder, EqualTo.apply("id", i)); Batch scan = builder.build().toBatch(); @@ -293,7 +305,8 @@ public void testBucketPartitionedIDFilters() { Assert.assertEquals("Should create one task for a single bucket", 1, tasks.length); // validate row filtering - assertEqualsSafe(SCHEMA.asStruct(), expected(i), read(table.location(), vectorized, "id = " + i)); + assertEqualsSafe( + SCHEMA.asStruct(), expected(i), 
read(table.location(), vectorized, "id = " + i)); } } @@ -301,14 +314,17 @@ public void testBucketPartitionedIDFilters() { @Test public void testDayPartitionedTimestampFilters() { Table table = buildPartitionedTable("partitioned_by_day", PARTITION_BY_DAY, "ts_day", "ts"); - CaseInsensitiveStringMap options = new CaseInsensitiveStringMap(ImmutableMap.of("path", table.location())); - Batch unfiltered = new SparkScanBuilder(spark, TABLES.load(options.get("path")), options).build().toBatch(); + CaseInsensitiveStringMap options = + new CaseInsensitiveStringMap(ImmutableMap.of("path", table.location())); + Batch unfiltered = + new SparkScanBuilder(spark, TABLES.load(options.get("path")), options).build().toBatch(); - Assert.assertEquals("Unfiltered table should created 2 read tasks", - 2, unfiltered.planInputPartitions().length); + Assert.assertEquals( + "Unfiltered table should created 2 read tasks", 2, unfiltered.planInputPartitions().length); { - SparkScanBuilder builder = new SparkScanBuilder(spark, TABLES.load(options.get("path")), options); + SparkScanBuilder builder = + new SparkScanBuilder(spark, TABLES.load(options.get("path")), options); pushFilters(builder, LessThan.apply("ts", "2017-12-22T00:00:00+00:00")); Batch scan = builder.build().toBatch(); @@ -316,24 +332,35 @@ public void testDayPartitionedTimestampFilters() { InputPartition[] tasks = scan.planInputPartitions(); Assert.assertEquals("Should create one task for 2017-12-21", 1, tasks.length); - assertEqualsSafe(SCHEMA.asStruct(), expected(5, 6, 7, 8, 9), - read(table.location(), vectorized, "ts < cast('2017-12-22 00:00:00+00:00' as timestamp)")); + assertEqualsSafe( + SCHEMA.asStruct(), + expected(5, 6, 7, 8, 9), + read( + table.location(), vectorized, "ts < cast('2017-12-22 00:00:00+00:00' as timestamp)")); } { - SparkScanBuilder builder = new SparkScanBuilder(spark, TABLES.load(options.get("path")), options); - - pushFilters(builder, And.apply( - GreaterThan.apply("ts", "2017-12-22T06:00:00+00:00"), - LessThan.apply("ts", "2017-12-22T08:00:00+00:00"))); + SparkScanBuilder builder = + new SparkScanBuilder(spark, TABLES.load(options.get("path")), options); + + pushFilters( + builder, + And.apply( + GreaterThan.apply("ts", "2017-12-22T06:00:00+00:00"), + LessThan.apply("ts", "2017-12-22T08:00:00+00:00"))); Batch scan = builder.build().toBatch(); InputPartition[] tasks = scan.planInputPartitions(); Assert.assertEquals("Should create one task for 2017-12-22", 1, tasks.length); - assertEqualsSafe(SCHEMA.asStruct(), expected(1, 2), read(table.location(), vectorized, - "ts > cast('2017-12-22 06:00:00+00:00' as timestamp) and " + - "ts < cast('2017-12-22 08:00:00+00:00' as timestamp)")); + assertEqualsSafe( + SCHEMA.asStruct(), + expected(1, 2), + read( + table.location(), + vectorized, + "ts > cast('2017-12-22 06:00:00+00:00' as timestamp) and " + + "ts < cast('2017-12-22 08:00:00+00:00' as timestamp)")); } } @@ -342,14 +369,17 @@ public void testDayPartitionedTimestampFilters() { public void testHourPartitionedTimestampFilters() { Table table = buildPartitionedTable("partitioned_by_hour", PARTITION_BY_HOUR, "ts_hour", "ts"); - CaseInsensitiveStringMap options = new CaseInsensitiveStringMap(ImmutableMap.of("path", table.location())); - Batch unfiltered = new SparkScanBuilder(spark, TABLES.load(options.get("path")), options).build().toBatch(); + CaseInsensitiveStringMap options = + new CaseInsensitiveStringMap(ImmutableMap.of("path", table.location())); + Batch unfiltered = + new SparkScanBuilder(spark, 
TABLES.load(options.get("path")), options).build().toBatch(); - Assert.assertEquals("Unfiltered table should created 9 read tasks", - 9, unfiltered.planInputPartitions().length); + Assert.assertEquals( + "Unfiltered table should created 9 read tasks", 9, unfiltered.planInputPartitions().length); { - SparkScanBuilder builder = new SparkScanBuilder(spark, TABLES.load(options.get("path")), options); + SparkScanBuilder builder = + new SparkScanBuilder(spark, TABLES.load(options.get("path")), options); pushFilters(builder, LessThan.apply("ts", "2017-12-22T00:00:00+00:00")); Batch scan = builder.build().toBatch(); @@ -357,24 +387,35 @@ public void testHourPartitionedTimestampFilters() { InputPartition[] tasks = scan.planInputPartitions(); Assert.assertEquals("Should create 4 tasks for 2017-12-21: 15, 17, 21, 22", 4, tasks.length); - assertEqualsSafe(SCHEMA.asStruct(), expected(8, 9, 7, 6, 5), - read(table.location(), vectorized, "ts < cast('2017-12-22 00:00:00+00:00' as timestamp)")); + assertEqualsSafe( + SCHEMA.asStruct(), + expected(8, 9, 7, 6, 5), + read( + table.location(), vectorized, "ts < cast('2017-12-22 00:00:00+00:00' as timestamp)")); } { - SparkScanBuilder builder = new SparkScanBuilder(spark, TABLES.load(options.get("path")), options); - - pushFilters(builder, And.apply( - GreaterThan.apply("ts", "2017-12-22T06:00:00+00:00"), - LessThan.apply("ts", "2017-12-22T08:00:00+00:00"))); + SparkScanBuilder builder = + new SparkScanBuilder(spark, TABLES.load(options.get("path")), options); + + pushFilters( + builder, + And.apply( + GreaterThan.apply("ts", "2017-12-22T06:00:00+00:00"), + LessThan.apply("ts", "2017-12-22T08:00:00+00:00"))); Batch scan = builder.build().toBatch(); InputPartition[] tasks = scan.planInputPartitions(); Assert.assertEquals("Should create 2 tasks for 2017-12-22: 6, 7", 2, tasks.length); - assertEqualsSafe(SCHEMA.asStruct(), expected(2, 1), read(table.location(), vectorized, - "ts > cast('2017-12-22 06:00:00+00:00' as timestamp) and " + - "ts < cast('2017-12-22 08:00:00+00:00' as timestamp)")); + assertEqualsSafe( + SCHEMA.asStruct(), + expected(2, 1), + read( + table.location(), + vectorized, + "ts > cast('2017-12-22 06:00:00+00:00' as timestamp) and " + + "ts < cast('2017-12-22 08:00:00+00:00' as timestamp)")); } } @@ -388,10 +429,15 @@ public void testFilterByNonProjectedColumn() { expected.add(projectFlat(actualProjection, rec)); } - assertEqualsSafe(actualProjection.asStruct(), expected, read( - unpartitioned.toString(), vectorized, - "ts < cast('2017-12-22 00:00:00+00:00' as timestamp)", - "id", "data")); + assertEqualsSafe( + actualProjection.asStruct(), + expected, + read( + unpartitioned.toString(), + vectorized, + "ts < cast('2017-12-22 00:00:00+00:00' as timestamp)", + "id", + "data")); } { @@ -403,20 +449,27 @@ public void testFilterByNonProjectedColumn() { expected.add(projectFlat(actualProjection, rec)); } - assertEqualsSafe(actualProjection.asStruct(), expected, read( - unpartitioned.toString(), vectorized, - "ts > cast('2017-12-22 06:00:00+00:00' as timestamp) and " + - "ts < cast('2017-12-22 08:00:00+00:00' as timestamp)", - "id")); + assertEqualsSafe( + actualProjection.asStruct(), + expected, + read( + unpartitioned.toString(), + vectorized, + "ts > cast('2017-12-22 06:00:00+00:00' as timestamp) and " + + "ts < cast('2017-12-22 08:00:00+00:00' as timestamp)", + "id")); } } @Test public void testPartitionedByDataStartsWithFilter() { - Table table = buildPartitionedTable("partitioned_by_data", PARTITION_BY_DATA, "data_ident", "data"); - 
CaseInsensitiveStringMap options = new CaseInsensitiveStringMap(ImmutableMap.of("path", table.location())); + Table table = + buildPartitionedTable("partitioned_by_data", PARTITION_BY_DATA, "data_ident", "data"); + CaseInsensitiveStringMap options = + new CaseInsensitiveStringMap(ImmutableMap.of("path", table.location())); - SparkScanBuilder builder = new SparkScanBuilder(spark, TABLES.load(options.get("path")), options); + SparkScanBuilder builder = + new SparkScanBuilder(spark, TABLES.load(options.get("path")), options); pushFilters(builder, new StringStartsWith("data", "junc")); Batch scan = builder.build().toBatch(); @@ -426,10 +479,13 @@ public void testPartitionedByDataStartsWithFilter() { @Test public void testPartitionedByDataNotStartsWithFilter() { - Table table = buildPartitionedTable("partitioned_by_data", PARTITION_BY_DATA, "data_ident", "data"); - CaseInsensitiveStringMap options = new CaseInsensitiveStringMap(ImmutableMap.of("path", table.location())); + Table table = + buildPartitionedTable("partitioned_by_data", PARTITION_BY_DATA, "data_ident", "data"); + CaseInsensitiveStringMap options = + new CaseInsensitiveStringMap(ImmutableMap.of("path", table.location())); - SparkScanBuilder builder = new SparkScanBuilder(spark, TABLES.load(options.get("path")), options); + SparkScanBuilder builder = + new SparkScanBuilder(spark, TABLES.load(options.get("path")), options); pushFilters(builder, new Not(new StringStartsWith("data", "junc"))); Batch scan = builder.build().toBatch(); @@ -441,11 +497,11 @@ public void testPartitionedByDataNotStartsWithFilter() { public void testPartitionedByIdStartsWith() { Table table = buildPartitionedTable("partitioned_by_id", PARTITION_BY_ID, "id_ident", "id"); - CaseInsensitiveStringMap options = new CaseInsensitiveStringMap(ImmutableMap.of( - "path", table.location()) - ); + CaseInsensitiveStringMap options = + new CaseInsensitiveStringMap(ImmutableMap.of("path", table.location())); - SparkScanBuilder builder = new SparkScanBuilder(spark, TABLES.load(options.get("path")), options); + SparkScanBuilder builder = + new SparkScanBuilder(spark, TABLES.load(options.get("path")), options); pushFilters(builder, new StringStartsWith("data", "junc")); Batch scan = builder.build().toBatch(); @@ -457,11 +513,11 @@ public void testPartitionedByIdStartsWith() { public void testPartitionedByIdNotStartsWith() { Table table = buildPartitionedTable("partitioned_by_id", PARTITION_BY_ID, "id_ident", "id"); - CaseInsensitiveStringMap options = new CaseInsensitiveStringMap(ImmutableMap.of( - "path", table.location()) - ); + CaseInsensitiveStringMap options = + new CaseInsensitiveStringMap(ImmutableMap.of("path", table.location())); - SparkScanBuilder builder = new SparkScanBuilder(spark, TABLES.load(options.get("path")), options); + SparkScanBuilder builder = + new SparkScanBuilder(spark, TABLES.load(options.get("path")), options); pushFilters(builder, new Not(new StringStartsWith("data", "junc"))); Batch scan = builder.build().toBatch(); @@ -471,15 +527,15 @@ public void testPartitionedByIdNotStartsWith() { @Test public void testUnpartitionedStartsWith() { - Dataset df = spark.read() - .format("iceberg") - .option(SparkReadOptions.VECTORIZATION_ENABLED, String.valueOf(vectorized)) - .load(unpartitioned.toString()); + Dataset df = + spark + .read() + .format("iceberg") + .option(SparkReadOptions.VECTORIZATION_ENABLED, String.valueOf(vectorized)) + .load(unpartitioned.toString()); - List matchedData = df.select("data") - .where("data LIKE 'jun%'") - .as(Encoders.STRING()) 
- .collectAsList(); + List matchedData = + df.select("data").where("data LIKE 'jun%'").as(Encoders.STRING()).collectAsList(); Assert.assertEquals(1, matchedData.size()); Assert.assertEquals("junction", matchedData.get(0)); @@ -487,20 +543,21 @@ public void testUnpartitionedStartsWith() { @Test public void testUnpartitionedNotStartsWith() { - Dataset df = spark.read() - .format("iceberg") - .option(SparkReadOptions.VECTORIZATION_ENABLED, String.valueOf(vectorized)) - .load(unpartitioned.toString()); - - List matchedData = df.select("data") - .where("data NOT LIKE 'jun%'") - .as(Encoders.STRING()) - .collectAsList(); - - List expected = testRecords(SCHEMA).stream() - .map(r -> r.getField("data").toString()) - .filter(d -> !d.startsWith("jun")) - .collect(Collectors.toList()); + Dataset df = + spark + .read() + .format("iceberg") + .option(SparkReadOptions.VECTORIZATION_ENABLED, String.valueOf(vectorized)) + .load(unpartitioned.toString()); + + List matchedData = + df.select("data").where("data NOT LIKE 'jun%'").as(Encoders.STRING()).collectAsList(); + + List expected = + testRecords(SCHEMA).stream() + .map(r -> r.getField("data").toString()) + .filter(d -> !d.startsWith("jun")) + .collect(Collectors.toList()); Assert.assertEquals(9, matchedData.size()); Assert.assertEquals(Sets.newHashSet(expected), Sets.newHashSet(matchedData)); @@ -516,8 +573,8 @@ private static Record projectFlat(Schema projection, Record record) { return result; } - public static void assertEqualsUnsafe(Types.StructType struct, - List expected, List actual) { + public static void assertEqualsUnsafe( + Types.StructType struct, List expected, List actual) { // TODO: match records by ID int numRecords = Math.min(expected.size(), actual.size()); for (int i = 0; i < numRecords; i += 1) { @@ -526,8 +583,8 @@ public static void assertEqualsUnsafe(Types.StructType struct, Assert.assertEquals("Number of results should match expected", expected.size(), actual.size()); } - public static void assertEqualsSafe(Types.StructType struct, - List expected, List actual) { + public static void assertEqualsSafe( + Types.StructType struct, List expected, List actual) { // TODO: match records by ID int numRecords = Math.min(expected.size(), actual.size()); for (int i = 0; i < numRecords; i += 1) { @@ -550,7 +607,8 @@ private void pushFilters(ScanBuilder scan, Filter... 
filters) { filterable.pushFilters(filters); } - private Table buildPartitionedTable(String desc, PartitionSpec spec, String udf, String partitionColumn) { + private Table buildPartitionedTable( + String desc, PartitionSpec spec, String udf, String partitionColumn) { File location = new File(parent, desc); Table table = TABLES.create(SCHEMA, spec, location.toString()); @@ -559,10 +617,12 @@ private Table buildPartitionedTable(String desc, PartitionSpec spec, String udf, table.updateProperties().set("read.split.target-size", "2048").commit(); // copy the unpartitioned table into the partitioned table to produce the partitioned data - Dataset allRows = spark.read() - .format("iceberg") - .option(SparkReadOptions.VECTORIZATION_ENABLED, String.valueOf(vectorized)) - .load(unpartitioned.toString()); + Dataset allRows = + spark + .read() + .format("iceberg") + .option(SparkReadOptions.VECTORIZATION_ENABLED, String.valueOf(vectorized)) + .load(unpartitioned.toString()); allRows .coalesce(1) // ensure only 1 file per partition is written @@ -590,19 +650,23 @@ private List testRecords(Schema schema) { record(schema, 6L, parse("2017-12-21T21:55:30.589712+00:00"), "element"), record(schema, 7L, parse("2017-12-21T17:31:14.532797+00:00"), "limited"), record(schema, 8L, parse("2017-12-21T15:21:51.237521+00:00"), "global"), - record(schema, 9L, parse("2017-12-21T15:02:15.230570+00:00"), "goldfish") - ); + record(schema, 9L, parse("2017-12-21T15:02:15.230570+00:00"), "goldfish")); } private static List read(String table, boolean vectorized, String expr) { return read(table, vectorized, expr, "*"); } - private static List read(String table, boolean vectorized, String expr, String select0, String... selectN) { - Dataset dataset = spark.read().format("iceberg") - .option(SparkReadOptions.VECTORIZATION_ENABLED, String.valueOf(vectorized)) - .load(table).filter(expr) - .select(select0, selectN); + private static List read( + String table, boolean vectorized, String expr, String select0, String... selectN) { + Dataset dataset = + spark + .read() + .format("iceberg") + .option(SparkReadOptions.VECTORIZATION_ENABLED, String.valueOf(vectorized)) + .load(table) + .filter(expr) + .select(select0, selectN); return dataset.collectAsList(); } diff --git a/spark/v3.1/spark/src/test/java/org/apache/iceberg/spark/source/TestForwardCompatibility.java b/spark/v3.1/spark/src/test/java/org/apache/iceberg/spark/source/TestForwardCompatibility.java index 20e1352de107..585cfc44a254 100644 --- a/spark/v3.1/spark/src/test/java/org/apache/iceberg/spark/source/TestForwardCompatibility.java +++ b/spark/v3.1/spark/src/test/java/org/apache/iceberg/spark/source/TestForwardCompatibility.java @@ -16,9 +16,11 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.spark.source; +import static org.apache.iceberg.Files.localInput; +import static org.apache.iceberg.Files.localOutput; + import java.io.File; import java.io.IOException; import java.util.List; @@ -63,25 +65,26 @@ import scala.Option; import scala.collection.JavaConversions; -import static org.apache.iceberg.Files.localInput; -import static org.apache.iceberg.Files.localOutput; - public class TestForwardCompatibility { private static final Configuration CONF = new Configuration(); - private static final Schema SCHEMA = new Schema( - Types.NestedField.optional(1, "id", Types.LongType.get()), - Types.NestedField.optional(2, "data", Types.StringType.get())); + private static final Schema SCHEMA = + new Schema( + Types.NestedField.optional(1, "id", Types.LongType.get()), + Types.NestedField.optional(2, "data", Types.StringType.get())); // create a spec for the schema that uses a "zero" transform that produces all 0s - private static final PartitionSpec UNKNOWN_SPEC = PartitionSpecParser.fromJson(SCHEMA, - "{ \"spec-id\": 0, \"fields\": [ { \"name\": \"id_zero\", \"transform\": \"zero\", \"source-id\": 1 } ] }"); + private static final PartitionSpec UNKNOWN_SPEC = + PartitionSpecParser.fromJson( + SCHEMA, + "{ \"spec-id\": 0, \"fields\": [ { \"name\": \"id_zero\", \"transform\": \"zero\", \"source-id\": 1 } ] }"); // create a fake spec to use to write table metadata - private static final PartitionSpec FAKE_SPEC = PartitionSpecParser.fromJson(SCHEMA, - "{ \"spec-id\": 0, \"fields\": [ { \"name\": \"id_zero\", \"transform\": \"identity\", \"source-id\": 1 } ] }"); + private static final PartitionSpec FAKE_SPEC = + PartitionSpecParser.fromJson( + SCHEMA, + "{ \"spec-id\": 0, \"fields\": [ { \"name\": \"id_zero\", \"transform\": \"identity\", \"source-id\": 1 } ] }"); - @Rule - public TemporaryFolder temp = new TemporaryFolder(); + @Rule public TemporaryFolder temp = new TemporaryFolder(); private static SparkSession spark = null; @@ -107,20 +110,22 @@ public void testSparkWriteFailsUnknownTransform() throws IOException { HadoopTables tables = new HadoopTables(CONF); tables.create(SCHEMA, UNKNOWN_SPEC, location.toString()); - List expected = Lists.newArrayList( - new SimpleRecord(1, "a"), - new SimpleRecord(2, "b"), - new SimpleRecord(3, "c") - ); + List expected = + Lists.newArrayList( + new SimpleRecord(1, "a"), new SimpleRecord(2, "b"), new SimpleRecord(3, "c")); Dataset df = spark.createDataFrame(expected, SimpleRecord.class); - AssertHelpers.assertThrows("Should reject write with unsupported transform", - UnsupportedOperationException.class, "Cannot write using unsupported transforms: zero", - () -> df.select("id", "data").write() - .format("iceberg") - .mode("append") - .save(location.toString())); + AssertHelpers.assertThrows( + "Should reject write with unsupported transform", + UnsupportedOperationException.class, + "Cannot write using unsupported transforms: zero", + () -> + df.select("id", "data") + .write() + .format("iceberg") + .mode("append") + .save(location.toString())); } @Test @@ -136,20 +141,24 @@ public void testSparkStreamingWriteFailsUnknownTransform() throws IOException, T tables.create(SCHEMA, UNKNOWN_SPEC, location.toString()); MemoryStream inputStream = newMemoryStream(1, spark.sqlContext(), Encoders.INT()); - StreamingQuery query = inputStream.toDF() - .selectExpr("value AS id", "CAST (value AS STRING) AS data") - .writeStream() - .outputMode("append") - .format("iceberg") - .option("checkpointLocation", checkpoint.toString()) - 
.option("path", location.toString()) - .start(); + StreamingQuery query = + inputStream + .toDF() + .selectExpr("value AS id", "CAST (value AS STRING) AS data") + .writeStream() + .outputMode("append") + .format("iceberg") + .option("checkpointLocation", checkpoint.toString()) + .option("path", location.toString()) + .start(); List batch1 = Lists.newArrayList(1, 2); send(batch1, inputStream); - AssertHelpers.assertThrows("Should reject streaming write with unsupported transform", - StreamingQueryException.class, "Cannot write using unsupported transforms: zero", + AssertHelpers.assertThrows( + "Should reject streaming write with unsupported transform", + StreamingQueryException.class, + "Cannot write using unsupported transforms: zero", query::processAllAvailable); } @@ -168,22 +177,22 @@ public void testSparkCanReadUnknownTransform() throws IOException { List expected = RandomData.generateList(table.schema(), 100, 1L); - File parquetFile = new File(dataFolder, - FileFormat.PARQUET.addExtension(UUID.randomUUID().toString())); - FileAppender writer = Parquet.write(localOutput(parquetFile)) - .schema(table.schema()) - .build(); + File parquetFile = + new File(dataFolder, FileFormat.PARQUET.addExtension(UUID.randomUUID().toString())); + FileAppender writer = + Parquet.write(localOutput(parquetFile)).schema(table.schema()).build(); try { writer.addAll(expected); } finally { writer.close(); } - DataFile file = DataFiles.builder(FAKE_SPEC) - .withInputFile(localInput(parquetFile)) - .withMetrics(writer.metrics()) - .withPartitionPath("id_zero=0") - .build(); + DataFile file = + DataFiles.builder(FAKE_SPEC) + .withInputFile(localInput(parquetFile)) + .withMetrics(writer.metrics()) + .withPartitionPath("id_zero=0") + .build(); OutputFile manifestFile = localOutput(FileFormat.AVRO.addExtension(temp.newFile().toString())); ManifestWriter manifestWriter = ManifestFiles.write(FAKE_SPEC, manifestFile); @@ -195,9 +204,7 @@ public void testSparkCanReadUnknownTransform() throws IOException { table.newFastAppend().appendManifest(manifestWriter.toManifestFile()).commit(); - Dataset df = spark.read() - .format("iceberg") - .load(location.toString()); + Dataset df = spark.read().format("iceberg").load(location.toString()); List rows = df.collectAsList(); Assert.assertEquals("Should contain 100 rows", 100, rows.size()); diff --git a/spark/v3.1/spark/src/test/java/org/apache/iceberg/spark/source/TestIcebergSource.java b/spark/v3.1/spark/src/test/java/org/apache/iceberg/spark/source/TestIcebergSource.java index 42f53d585601..a850275118db 100644 --- a/spark/v3.1/spark/src/test/java/org/apache/iceberg/spark/source/TestIcebergSource.java +++ b/spark/v3.1/spark/src/test/java/org/apache/iceberg/spark/source/TestIcebergSource.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
 */
-
 package org.apache.iceberg.spark.source;
 
 import org.apache.iceberg.catalog.TableIdentifier;
@@ -40,5 +39,4 @@ public Identifier extractIdentifier(CaseInsensitiveStringMap options) {
   public String extractCatalog(CaseInsensitiveStringMap options) {
     return SparkSession.active().sessionState().catalogManager().currentCatalog().name();
   }
-
 }
diff --git a/spark/v3.1/spark/src/test/java/org/apache/iceberg/spark/source/TestIcebergSourceHadoopTables.java b/spark/v3.1/spark/src/test/java/org/apache/iceberg/spark/source/TestIcebergSourceHadoopTables.java
index f1cfc7a72e17..b55ba0e2199a 100644
--- a/spark/v3.1/spark/src/test/java/org/apache/iceberg/spark/source/TestIcebergSourceHadoopTables.java
+++ b/spark/v3.1/spark/src/test/java/org/apache/iceberg/spark/source/TestIcebergSourceHadoopTables.java
@@ -16,7 +16,6 @@
  * specific language governing permissions and limitations
  * under the License.
  */
-
 package org.apache.iceberg.spark.source;
 
 import java.io.File;
diff --git a/spark/v3.1/spark/src/test/java/org/apache/iceberg/spark/source/TestIcebergSourceHiveTables.java b/spark/v3.1/spark/src/test/java/org/apache/iceberg/spark/source/TestIcebergSourceHiveTables.java
index 76923d43a3bc..f6df8d495b90 100644
--- a/spark/v3.1/spark/src/test/java/org/apache/iceberg/spark/source/TestIcebergSourceHiveTables.java
+++ b/spark/v3.1/spark/src/test/java/org/apache/iceberg/spark/source/TestIcebergSourceHiveTables.java
@@ -16,7 +16,6 @@
  * specific language governing permissions and limitations
  * under the License.
  */
-
 package org.apache.iceberg.spark.source;
 
 import java.io.IOException;
@@ -62,7 +61,8 @@ public Table createTable(TableIdentifier ident, Schema schema, PartitionSpec spe
 
   @Override
   public Table loadTable(TableIdentifier ident, String entriesSuffix) {
-    TableIdentifier identifier = TableIdentifier.of(ident.namespace().level(0), ident.name(), entriesSuffix);
+    TableIdentifier identifier =
+        TableIdentifier.of(ident.namespace().level(0), ident.name(), entriesSuffix);
     return TestIcebergSourceHiveTables.catalog.loadTable(identifier);
   }
 
diff --git a/spark/v3.1/spark/src/test/java/org/apache/iceberg/spark/source/TestIcebergSourceTablesBase.java b/spark/v3.1/spark/src/test/java/org/apache/iceberg/spark/source/TestIcebergSourceTablesBase.java
index 5e8776c4c68a..386e57b7877f 100644
--- a/spark/v3.1/spark/src/test/java/org/apache/iceberg/spark/source/TestIcebergSourceTablesBase.java
+++ b/spark/v3.1/spark/src/test/java/org/apache/iceberg/spark/source/TestIcebergSourceTablesBase.java
@@ -16,9 +16,13 @@
  * specific language governing permissions and limitations
  * under the License.
*/ - package org.apache.iceberg.spark.source; +import static org.apache.iceberg.ManifestContent.DATA; +import static org.apache.iceberg.ManifestContent.DELETES; +import static org.apache.iceberg.types.Types.NestedField.optional; +import static org.apache.iceberg.types.Types.NestedField.required; + import java.io.IOException; import java.io.UncheckedIOException; import java.util.Comparator; @@ -76,33 +80,26 @@ import org.junit.Test; import org.junit.rules.TemporaryFolder; -import static org.apache.iceberg.ManifestContent.DATA; -import static org.apache.iceberg.ManifestContent.DELETES; -import static org.apache.iceberg.types.Types.NestedField.optional; -import static org.apache.iceberg.types.Types.NestedField.required; - public abstract class TestIcebergSourceTablesBase extends SparkTestBase { - private static final Schema SCHEMA = new Schema( - optional(1, "id", Types.IntegerType.get()), - optional(2, "data", Types.StringType.get()) - ); + private static final Schema SCHEMA = + new Schema( + optional(1, "id", Types.IntegerType.get()), optional(2, "data", Types.StringType.get())); - private static final Schema SCHEMA2 = new Schema( - optional(1, "id", Types.IntegerType.get()), - optional(2, "data", Types.StringType.get()), - optional(3, "category", Types.StringType.get()) - ); + private static final Schema SCHEMA2 = + new Schema( + optional(1, "id", Types.IntegerType.get()), + optional(2, "data", Types.StringType.get()), + optional(3, "category", Types.StringType.get())); - private static final Schema SCHEMA3 = new Schema( - optional(1, "id", Types.IntegerType.get()), - optional(3, "category", Types.StringType.get()) - ); + private static final Schema SCHEMA3 = + new Schema( + optional(1, "id", Types.IntegerType.get()), + optional(3, "category", Types.StringType.get())); private static final PartitionSpec SPEC = PartitionSpec.builderFor(SCHEMA).identity("id").build(); - @Rule - public TemporaryFolder temp = new TemporaryFolder(); + @Rule public TemporaryFolder temp = new TemporaryFolder(); public abstract Table createTable(TableIdentifier ident, Schema schema, PartitionSpec spec); @@ -117,23 +114,21 @@ public synchronized void testTablesSupport() { TableIdentifier tableIdentifier = TableIdentifier.of("db", "table"); createTable(tableIdentifier, SCHEMA, PartitionSpec.unpartitioned()); - List expectedRecords = Lists.newArrayList( - new SimpleRecord(1, "1"), - new SimpleRecord(2, "2"), - new SimpleRecord(3, "3")); + List expectedRecords = + Lists.newArrayList( + new SimpleRecord(1, "1"), new SimpleRecord(2, "2"), new SimpleRecord(3, "3")); Dataset inputDf = spark.createDataFrame(expectedRecords, SimpleRecord.class); - inputDf.select("id", "data").write() + inputDf + .select("id", "data") + .write() .format("iceberg") .mode(SaveMode.Append) .save(loadLocation(tableIdentifier)); - Dataset resultDf = spark.read() - .format("iceberg") - .load(loadLocation(tableIdentifier)); - List actualRecords = resultDf.orderBy("id") - .as(Encoders.bean(SimpleRecord.class)) - .collectAsList(); + Dataset resultDf = spark.read().format("iceberg").load(loadLocation(tableIdentifier)); + List actualRecords = + resultDf.orderBy("id").as(Encoders.bean(SimpleRecord.class)).collectAsList(); Assert.assertEquals("Records should match", expectedRecords, actualRecords); } @@ -147,32 +142,39 @@ public void testEntriesTable() throws Exception { List records = Lists.newArrayList(new SimpleRecord(1, "1")); Dataset inputDf = spark.createDataFrame(records, SimpleRecord.class); - inputDf.select("id", "data").write() + inputDf + 
.select("id", "data") + .write() .format("iceberg") .mode("append") .save(loadLocation(tableIdentifier)); table.refresh(); - List actual = spark.read() - .format("iceberg") - .load(loadLocation(tableIdentifier, "entries")) - .collectAsList(); + List actual = + spark + .read() + .format("iceberg") + .load(loadLocation(tableIdentifier, "entries")) + .collectAsList(); Snapshot snapshot = table.currentSnapshot(); - Assert.assertEquals("Should only contain one manifest", 1, snapshot.allManifests(table.io()).size()); + Assert.assertEquals( + "Should only contain one manifest", 1, snapshot.allManifests(table.io()).size()); InputFile manifest = table.io().newInputFile(snapshot.allManifests(table.io()).get(0).path()); List expected = Lists.newArrayList(); - try (CloseableIterable rows = Avro.read(manifest).project(entriesTable.schema()).build()) { + try (CloseableIterable rows = + Avro.read(manifest).project(entriesTable.schema()).build()) { // each row must inherit snapshot_id and sequence_number - rows.forEach(row -> { - row.put(2, 0L); - GenericData.Record file = (GenericData.Record) row.get("data_file"); - asMetadataRecord(file); - expected.add(row); - }); + rows.forEach( + row -> { + row.put(2, 0L); + GenericData.Record file = (GenericData.Record) row.get("data_file"); + asMetadataRecord(file); + expected.add(row); + }); } Assert.assertEquals("Entries table should have one row", 1, expected.size()); @@ -188,18 +190,22 @@ public void testEntriesTablePartitionedPrune() throws Exception { List records = Lists.newArrayList(new SimpleRecord(1, "1")); Dataset inputDf = spark.createDataFrame(records, SimpleRecord.class); - inputDf.select("id", "data").write() + inputDf + .select("id", "data") + .write() .format("iceberg") .mode("append") .save(loadLocation(tableIdentifier)); table.refresh(); - List actual = spark.read() - .format("iceberg") - .load(loadLocation(tableIdentifier, "entries")) - .select("status") - .collectAsList(); + List actual = + spark + .read() + .format("iceberg") + .load(loadLocation(tableIdentifier, "entries")) + .select("status") + .collectAsList(); Assert.assertEquals("Results should contain only one status", 1, actual.size()); Assert.assertEquals("That status should be Added (1)", 1, actual.get(0).getInt(0)); @@ -213,7 +219,9 @@ public void testEntriesTableDataFilePrune() throws Exception { List records = Lists.newArrayList(new SimpleRecord(1, "1")); Dataset inputDf = spark.createDataFrame(records, SimpleRecord.class); - inputDf.select("id", "data").write() + inputDf + .select("id", "data") + .write() .format("iceberg") .mode("append") .save(loadLocation(tableIdentifier)); @@ -221,15 +229,19 @@ public void testEntriesTableDataFilePrune() throws Exception { table.refresh(); DataFile file = table.currentSnapshot().addedDataFiles(table.io()).iterator().next(); - List singleActual = rowsToJava(spark.read() - .format("iceberg") - .load(loadLocation(tableIdentifier, "entries")) - .select("data_file.file_path") - .collectAsList()); + List singleActual = + rowsToJava( + spark + .read() + .format("iceberg") + .load(loadLocation(tableIdentifier, "entries")) + .select("data_file.file_path") + .collectAsList()); List singleExpected = ImmutableList.of(row(file.path())); - assertEquals("Should prune a single element from a nested struct", singleExpected, singleActual); + assertEquals( + "Should prune a single element from a nested struct", singleExpected, singleActual); } @Test @@ -240,7 +252,9 @@ public void testEntriesTableDataFilePruneMulti() throws Exception { List records = 
Lists.newArrayList(new SimpleRecord(1, "1")); Dataset inputDf = spark.createDataFrame(records, SimpleRecord.class); - inputDf.select("id", "data").write() + inputDf + .select("id", "data") + .write() .format("iceberg") .mode("append") .save(loadLocation(tableIdentifier)); @@ -248,14 +262,22 @@ public void testEntriesTableDataFilePruneMulti() throws Exception { table.refresh(); DataFile file = table.currentSnapshot().addedDataFiles(table.io()).iterator().next(); - List multiActual = rowsToJava(spark.read() - .format("iceberg") - .load(loadLocation(tableIdentifier, "entries")) - .select("data_file.file_path", "data_file.value_counts", "data_file.record_count", "data_file.column_sizes") - .collectAsList()); - - List multiExpected = ImmutableList.of( - row(file.path(), file.valueCounts(), file.recordCount(), file.columnSizes())); + List multiActual = + rowsToJava( + spark + .read() + .format("iceberg") + .load(loadLocation(tableIdentifier, "entries")) + .select( + "data_file.file_path", + "data_file.value_counts", + "data_file.record_count", + "data_file.column_sizes") + .collectAsList()); + + List multiExpected = + ImmutableList.of( + row(file.path(), file.valueCounts(), file.recordCount(), file.columnSizes())); assertEquals("Should prune a single element from a nested struct", multiExpected, multiActual); } @@ -268,7 +290,9 @@ public void testFilesSelectMap() throws Exception { List records = Lists.newArrayList(new SimpleRecord(1, "1")); Dataset inputDf = spark.createDataFrame(records, SimpleRecord.class); - inputDf.select("id", "data").write() + inputDf + .select("id", "data") + .write() .format("iceberg") .mode("append") .save(loadLocation(tableIdentifier)); @@ -276,14 +300,18 @@ public void testFilesSelectMap() throws Exception { table.refresh(); DataFile file = table.currentSnapshot().addedDataFiles(table.io()).iterator().next(); - List multiActual = rowsToJava(spark.read() - .format("iceberg") - .load(loadLocation(tableIdentifier, "files")) - .select("file_path", "value_counts", "record_count", "column_sizes") - .collectAsList()); + List multiActual = + rowsToJava( + spark + .read() + .format("iceberg") + .load(loadLocation(tableIdentifier, "files")) + .select("file_path", "value_counts", "record_count", "column_sizes") + .collectAsList()); - List multiExpected = ImmutableList.of( - row(file.path(), file.valueCounts(), file.recordCount(), file.columnSizes())); + List multiExpected = + ImmutableList.of( + row(file.path(), file.valueCounts(), file.recordCount(), file.columnSizes())); assertEquals("Should prune a single element from a row", multiExpected, multiActual); } @@ -294,10 +322,13 @@ public void testAllEntriesTable() throws Exception { Table table = createTable(tableIdentifier, SCHEMA, PartitionSpec.unpartitioned()); Table entriesTable = loadTable(tableIdentifier, "all_entries"); - Dataset df1 = spark.createDataFrame(Lists.newArrayList(new SimpleRecord(1, "a")), SimpleRecord.class); - Dataset df2 = spark.createDataFrame(Lists.newArrayList(new SimpleRecord(1, "b")), SimpleRecord.class); + Dataset df1 = + spark.createDataFrame(Lists.newArrayList(new SimpleRecord(1, "a")), SimpleRecord.class); + Dataset df2 = + spark.createDataFrame(Lists.newArrayList(new SimpleRecord(1, "b")), SimpleRecord.class); - df1.select("id", "data").write() + df1.select("id", "data") + .write() .format("iceberg") .mode("append") .save(loadLocation(tableIdentifier)); @@ -306,7 +337,8 @@ public void testAllEntriesTable() throws Exception { table.newDelete().deleteFromRowFilter(Expressions.equal("id", 
1)).commit(); // add a second file - df2.select("id", "data").write() + df2.select("id", "data") + .write() .format("iceberg") .mode("append") .save(loadLocation(tableIdentifier)); @@ -314,24 +346,28 @@ public void testAllEntriesTable() throws Exception { // ensure table data isn't stale table.refresh(); - List actual = spark.read() - .format("iceberg") - .load(loadLocation(tableIdentifier, "all_entries")) - .orderBy("snapshot_id") - .collectAsList(); + List actual = + spark + .read() + .format("iceberg") + .load(loadLocation(tableIdentifier, "all_entries")) + .orderBy("snapshot_id") + .collectAsList(); List expected = Lists.newArrayList(); - for (ManifestFile manifest : Iterables.concat( - Iterables.transform(table.snapshots(), s -> s.allManifests(table.io())))) { + for (ManifestFile manifest : + Iterables.concat(Iterables.transform(table.snapshots(), s -> s.allManifests(table.io())))) { InputFile in = table.io().newInputFile(manifest.path()); - try (CloseableIterable rows = Avro.read(in).project(entriesTable.schema()).build()) { + try (CloseableIterable rows = + Avro.read(in).project(entriesTable.schema()).build()) { // each row must inherit snapshot_id and sequence_number - rows.forEach(row -> { - row.put(2, 0L); - GenericData.Record file = (GenericData.Record) row.get("data_file"); - asMetadataRecord(file); - expected.add(row); - }); + rows.forEach( + row -> { + row.put(2, 0L); + GenericData.Record file = (GenericData.Record) row.get("data_file"); + asMetadataRecord(file); + expected.add(row); + }); } } @@ -340,7 +376,8 @@ public void testAllEntriesTable() throws Exception { Assert.assertEquals("Entries table should have 3 rows", 3, expected.size()); Assert.assertEquals("Actual results should have 3 rows", 3, actual.size()); for (int i = 0; i < expected.size(); i += 1) { - TestHelpers.assertEqualsSafe(entriesTable.schema().asStruct(), expected.get(i), actual.get(i)); + TestHelpers.assertEqualsSafe( + entriesTable.schema().asStruct(), expected.get(i), actual.get(i)); } } @@ -352,7 +389,9 @@ public void testCountEntriesTable() { // init load List records = Lists.newArrayList(new SimpleRecord(1, "1")); Dataset inputDf = spark.createDataFrame(records, SimpleRecord.class); - inputDf.select("id", "data").write() + inputDf + .select("id", "data") + .write() .format("iceberg") .mode("append") .save(loadLocation(tableIdentifier)); @@ -360,12 +399,16 @@ public void testCountEntriesTable() { final int expectedEntryCount = 1; // count entries - Assert.assertEquals("Count should return " + expectedEntryCount, - expectedEntryCount, spark.read().format("iceberg").load(loadLocation(tableIdentifier, "entries")).count()); + Assert.assertEquals( + "Count should return " + expectedEntryCount, + expectedEntryCount, + spark.read().format("iceberg").load(loadLocation(tableIdentifier, "entries")).count()); // count all_entries - Assert.assertEquals("Count should return " + expectedEntryCount, - expectedEntryCount, spark.read().format("iceberg").load(loadLocation(tableIdentifier, "all_entries")).count()); + Assert.assertEquals( + "Count should return " + expectedEntryCount, + expectedEntryCount, + spark.read().format("iceberg").load(loadLocation(tableIdentifier, "all_entries")).count()); } @Test @@ -375,16 +418,20 @@ public void testFilesTable() throws Exception { Table entriesTable = loadTable(tableIdentifier, "entries"); Table filesTable = loadTable(tableIdentifier, "files"); - Dataset df1 = spark.createDataFrame(Lists.newArrayList(new SimpleRecord(1, "a")), SimpleRecord.class); - Dataset df2 = 
spark.createDataFrame(Lists.newArrayList(new SimpleRecord(2, "b")), SimpleRecord.class); + Dataset df1 = + spark.createDataFrame(Lists.newArrayList(new SimpleRecord(1, "a")), SimpleRecord.class); + Dataset df2 = + spark.createDataFrame(Lists.newArrayList(new SimpleRecord(2, "b")), SimpleRecord.class); - df1.select("id", "data").write() + df1.select("id", "data") + .write() .format("iceberg") .mode("append") .save(loadLocation(tableIdentifier)); // add a second file - df2.select("id", "data").write() + df2.select("id", "data") + .write() .format("iceberg") .mode("append") .save(loadLocation(tableIdentifier)); @@ -392,15 +439,14 @@ public void testFilesTable() throws Exception { // delete the first file to test that only live files are listed table.newDelete().deleteFromRowFilter(Expressions.equal("id", 1)).commit(); - List actual = spark.read() - .format("iceberg") - .load(loadLocation(tableIdentifier, "files")) - .collectAsList(); + List actual = + spark.read().format("iceberg").load(loadLocation(tableIdentifier, "files")).collectAsList(); List expected = Lists.newArrayList(); for (ManifestFile manifest : table.currentSnapshot().dataManifests(table.io())) { InputFile in = table.io().newInputFile(manifest.path()); - try (CloseableIterable rows = Avro.read(in).project(entriesTable.schema()).build()) { + try (CloseableIterable rows = + Avro.read(in).project(entriesTable.schema()).build()) { for (GenericData.Record record : rows) { if ((Integer) record.get("status") < 2 /* added or existing */) { GenericData.Record file = (GenericData.Record) record.get("data_file"); @@ -422,42 +468,42 @@ public void testFilesTableWithSnapshotIdInheritance() throws Exception { TableIdentifier tableIdentifier = TableIdentifier.of("db", "files_inheritance_test"); Table table = createTable(tableIdentifier, SCHEMA, SPEC); - table.updateProperties() - .set(TableProperties.SNAPSHOT_ID_INHERITANCE_ENABLED, "true") - .commit(); + table.updateProperties().set(TableProperties.SNAPSHOT_ID_INHERITANCE_ENABLED, "true").commit(); Table entriesTable = loadTable(tableIdentifier, "entries"); Table filesTable = loadTable(tableIdentifier, "files"); - spark.sql(String.format( - "CREATE TABLE parquet_table (data string, id int) " + - "USING parquet PARTITIONED BY (id) LOCATION '%s'", - temp.newFolder())); + spark.sql( + String.format( + "CREATE TABLE parquet_table (data string, id int) " + + "USING parquet PARTITIONED BY (id) LOCATION '%s'", + temp.newFolder())); - List records = Lists.newArrayList( - new SimpleRecord(1, "a"), - new SimpleRecord(2, "b") - ); + List records = + Lists.newArrayList(new SimpleRecord(1, "a"), new SimpleRecord(2, "b")); Dataset inputDF = spark.createDataFrame(records, SimpleRecord.class); - inputDF.select("data", "id").write() - .mode("overwrite") - .insertInto("parquet_table"); + inputDF.select("data", "id").write().mode("overwrite").insertInto("parquet_table"); try { String stagingLocation = table.location() + "/metadata"; - SparkTableUtil.importSparkTable(spark, + SparkTableUtil.importSparkTable( + spark, new org.apache.spark.sql.catalyst.TableIdentifier("parquet_table"), - table, stagingLocation); + table, + stagingLocation); - List actual = spark.read() - .format("iceberg") - .load(loadLocation(tableIdentifier, "files")) - .collectAsList(); + List actual = + spark + .read() + .format("iceberg") + .load(loadLocation(tableIdentifier, "files")) + .collectAsList(); List expected = Lists.newArrayList(); for (ManifestFile manifest : table.currentSnapshot().dataManifests(table.io())) { InputFile in = 
table.io().newInputFile(manifest.path()); - try (CloseableIterable rows = Avro.read(in).project(entriesTable.schema()).build()) { + try (CloseableIterable rows = + Avro.read(in).project(entriesTable.schema()).build()) { for (GenericData.Record record : rows) { GenericData.Record file = (GenericData.Record) record.get("data_file"); asMetadataRecord(file); @@ -473,7 +519,6 @@ public void testFilesTableWithSnapshotIdInheritance() throws Exception { } finally { spark.sql("DROP TABLE parquet_table"); } - } @Test @@ -484,35 +529,35 @@ public void testEntriesTableWithSnapshotIdInheritance() throws Exception { PartitionSpec spec = SPEC; Table table = createTable(tableIdentifier, SCHEMA, spec); - table.updateProperties() - .set(TableProperties.SNAPSHOT_ID_INHERITANCE_ENABLED, "true") - .commit(); + table.updateProperties().set(TableProperties.SNAPSHOT_ID_INHERITANCE_ENABLED, "true").commit(); - spark.sql(String.format( - "CREATE TABLE parquet_table (data string, id int) " + - "USING parquet PARTITIONED BY (id) LOCATION '%s'", - temp.newFolder())); + spark.sql( + String.format( + "CREATE TABLE parquet_table (data string, id int) " + + "USING parquet PARTITIONED BY (id) LOCATION '%s'", + temp.newFolder())); - List records = Lists.newArrayList( - new SimpleRecord(1, "a"), - new SimpleRecord(2, "b") - ); + List records = + Lists.newArrayList(new SimpleRecord(1, "a"), new SimpleRecord(2, "b")); Dataset inputDF = spark.createDataFrame(records, SimpleRecord.class); - inputDF.select("data", "id").write() - .mode("overwrite") - .insertInto("parquet_table"); + inputDF.select("data", "id").write().mode("overwrite").insertInto("parquet_table"); try { String stagingLocation = table.location() + "/metadata"; SparkTableUtil.importSparkTable( - spark, new org.apache.spark.sql.catalyst.TableIdentifier("parquet_table"), table, stagingLocation); + spark, + new org.apache.spark.sql.catalyst.TableIdentifier("parquet_table"), + table, + stagingLocation); - List actual = spark.read() - .format("iceberg") - .load(loadLocation(tableIdentifier, "entries")) - .select("sequence_number", "snapshot_id", "data_file") - .collectAsList(); + List actual = + spark + .read() + .format("iceberg") + .load(loadLocation(tableIdentifier, "entries")) + .select("sequence_number", "snapshot_id", "data_file") + .collectAsList(); table.refresh(); @@ -535,19 +580,24 @@ public void testFilesUnpartitionedTable() throws Exception { Table entriesTable = loadTable(tableIdentifier, "entries"); Table filesTable = loadTable(tableIdentifier, "files"); - Dataset df1 = spark.createDataFrame(Lists.newArrayList(new SimpleRecord(1, "a")), SimpleRecord.class); - Dataset df2 = spark.createDataFrame(Lists.newArrayList(new SimpleRecord(2, "b")), SimpleRecord.class); + Dataset df1 = + spark.createDataFrame(Lists.newArrayList(new SimpleRecord(1, "a")), SimpleRecord.class); + Dataset df2 = + spark.createDataFrame(Lists.newArrayList(new SimpleRecord(2, "b")), SimpleRecord.class); - df1.select("id", "data").write() + df1.select("id", "data") + .write() .format("iceberg") .mode("append") .save(loadLocation(tableIdentifier)); table.refresh(); - DataFile toDelete = Iterables.getOnlyElement(table.currentSnapshot().addedDataFiles(table.io())); + DataFile toDelete = + Iterables.getOnlyElement(table.currentSnapshot().addedDataFiles(table.io())); // add a second file - df2.select("id", "data").write() + df2.select("id", "data") + .write() .format("iceberg") .mode("append") .save(loadLocation(tableIdentifier)); @@ -555,15 +605,14 @@ public void testFilesUnpartitionedTable() 
throws Exception { // delete the first file to test that only live files are listed table.newDelete().deleteFile(toDelete).commit(); - List actual = spark.read() - .format("iceberg") - .load(loadLocation(tableIdentifier, "files")) - .collectAsList(); + List actual = + spark.read().format("iceberg").load(loadLocation(tableIdentifier, "files")).collectAsList(); List expected = Lists.newArrayList(); for (ManifestFile manifest : table.currentSnapshot().dataManifests(table.io())) { InputFile in = table.io().newInputFile(manifest.path()); - try (CloseableIterable rows = Avro.read(in).project(entriesTable.schema()).build()) { + try (CloseableIterable rows = + Avro.read(in).project(entriesTable.schema()).build()) { for (GenericData.Record record : rows) { if ((Integer) record.get("status") < 2 /* added or existing */) { GenericData.Record file = (GenericData.Record) record.get("data_file"); @@ -586,38 +635,49 @@ public void testAllMetadataTablesWithStagedCommits() throws Exception { table.updateProperties().set(TableProperties.WRITE_AUDIT_PUBLISH_ENABLED, "true").commit(); spark.conf().set("spark.wap.id", "1234567"); - Dataset df1 = spark.createDataFrame(Lists.newArrayList(new SimpleRecord(1, "a")), SimpleRecord.class); - Dataset df2 = spark.createDataFrame(Lists.newArrayList(new SimpleRecord(2, "b")), SimpleRecord.class); + Dataset df1 = + spark.createDataFrame(Lists.newArrayList(new SimpleRecord(1, "a")), SimpleRecord.class); + Dataset df2 = + spark.createDataFrame(Lists.newArrayList(new SimpleRecord(2, "b")), SimpleRecord.class); - df1.select("id", "data").write() + df1.select("id", "data") + .write() .format("iceberg") .mode("append") .save(loadLocation(tableIdentifier)); // add a second file - df2.select("id", "data").write() + df2.select("id", "data") + .write() .format("iceberg") .mode("append") .save(loadLocation(tableIdentifier)); - List actualAllData = spark.read() - .format("iceberg") - .load(loadLocation(tableIdentifier, "all_data_files")) - .collectAsList(); - - List actualAllManifests = spark.read() - .format("iceberg") - .load(loadLocation(tableIdentifier, "all_manifests")) - .collectAsList(); - - List actualAllEntries = spark.read() - .format("iceberg") - .load(loadLocation(tableIdentifier, "all_entries")) - .collectAsList(); - - Assert.assertTrue("Stage table should have some snapshots", table.snapshots().iterator().hasNext()); - Assert.assertEquals("Stage table should have null currentSnapshot", - null, table.currentSnapshot()); + List actualAllData = + spark + .read() + .format("iceberg") + .load(loadLocation(tableIdentifier, "all_data_files")) + .collectAsList(); + + List actualAllManifests = + spark + .read() + .format("iceberg") + .load(loadLocation(tableIdentifier, "all_manifests")) + .collectAsList(); + + List actualAllEntries = + spark + .read() + .format("iceberg") + .load(loadLocation(tableIdentifier, "all_entries")) + .collectAsList(); + + Assert.assertTrue( + "Stage table should have some snapshots", table.snapshots().iterator().hasNext()); + Assert.assertEquals( + "Stage table should have null currentSnapshot", null, table.currentSnapshot()); Assert.assertEquals("Actual results should have two rows", 2, actualAllData.size()); Assert.assertEquals("Actual results should have two rows", 2, actualAllManifests.size()); Assert.assertEquals("Actual results should have two rows", 2, actualAllEntries.size()); @@ -630,10 +690,13 @@ public void testAllDataFilesTable() throws Exception { Table entriesTable = loadTable(tableIdentifier, "entries"); Table filesTable = 
loadTable(tableIdentifier, "all_data_files"); - Dataset df1 = spark.createDataFrame(Lists.newArrayList(new SimpleRecord(1, "a")), SimpleRecord.class); - Dataset df2 = spark.createDataFrame(Lists.newArrayList(new SimpleRecord(2, "b")), SimpleRecord.class); + Dataset df1 = + spark.createDataFrame(Lists.newArrayList(new SimpleRecord(1, "a")), SimpleRecord.class); + Dataset df2 = + spark.createDataFrame(Lists.newArrayList(new SimpleRecord(2, "b")), SimpleRecord.class); - df1.select("id", "data").write() + df1.select("id", "data") + .write() .format("iceberg") .mode("append") .save(loadLocation(tableIdentifier)); @@ -642,7 +705,8 @@ public void testAllDataFilesTable() throws Exception { table.newDelete().deleteFromRowFilter(Expressions.equal("id", 1)).commit(); // add a second file - df2.select("id", "data").write() + df2.select("id", "data") + .write() .format("iceberg") .mode("append") .save(loadLocation(tableIdentifier)); @@ -650,19 +714,23 @@ public void testAllDataFilesTable() throws Exception { // ensure table data isn't stale table.refresh(); - List actual = spark.read() - .format("iceberg") - .load(loadLocation(tableIdentifier, "all_data_files")) - .orderBy("file_path") - .collectAsList(); + List actual = + spark + .read() + .format("iceberg") + .load(loadLocation(tableIdentifier, "all_data_files")) + .orderBy("file_path") + .collectAsList(); actual.sort(Comparator.comparing(o -> o.getString(1))); List expected = Lists.newArrayList(); - Iterable dataManifests = Iterables.concat(Iterables.transform(table.snapshots(), - snapshot -> snapshot.dataManifests(table.io()))); + Iterable dataManifests = + Iterables.concat( + Iterables.transform(table.snapshots(), snapshot -> snapshot.dataManifests(table.io()))); for (ManifestFile manifest : dataManifests) { InputFile in = table.io().newInputFile(manifest.path()); - try (CloseableIterable rows = Avro.read(in).project(entriesTable.schema()).build()) { + try (CloseableIterable rows = + Avro.read(in).project(entriesTable.schema()).build()) { for (GenericData.Record record : rows) { if ((Integer) record.get("status") < 2 /* added or existing */) { GenericData.Record file = (GenericData.Record) record.get("data_file"); @@ -691,7 +759,9 @@ public void testHistoryTable() { List records = Lists.newArrayList(new SimpleRecord(1, "1")); Dataset inputDf = spark.createDataFrame(records, SimpleRecord.class); - inputDf.select("id", "data").write() + inputDf + .select("id", "data") + .write() .format("iceberg") .mode("append") .save(loadLocation(tableIdentifier)); @@ -700,7 +770,9 @@ public void testHistoryTable() { long firstSnapshotTimestamp = table.currentSnapshot().timestampMillis(); long firstSnapshotId = table.currentSnapshot().snapshotId(); - inputDf.select("id", "data").write() + inputDf + .select("id", "data") + .write() .format("iceberg") .mode("append") .save(loadLocation(tableIdentifier)); @@ -713,7 +785,9 @@ public void testHistoryTable() { table.rollback().toSnapshotId(firstSnapshotId).commit(); long rollbackTimestamp = Iterables.getLast(table.history()).timestampMillis(); - inputDf.select("id", "data").write() + inputDf + .select("id", "data") + .write() .format("iceberg") .mode("append") .save(loadLocation(tableIdentifier)); @@ -722,34 +796,43 @@ public void testHistoryTable() { long thirdSnapshotTimestamp = table.currentSnapshot().timestampMillis(); long thirdSnapshotId = table.currentSnapshot().snapshotId(); - List actual = spark.read() - .format("iceberg") - .load(loadLocation(tableIdentifier, "history")) - .collectAsList(); - - 
GenericRecordBuilder builder = new GenericRecordBuilder(AvroSchemaUtil.convert(historyTable.schema(), "history")); - List expected = Lists.newArrayList( - builder.set("made_current_at", firstSnapshotTimestamp * 1000) - .set("snapshot_id", firstSnapshotId) - .set("parent_id", null) - .set("is_current_ancestor", true) - .build(), - builder.set("made_current_at", secondSnapshotTimestamp * 1000) - .set("snapshot_id", secondSnapshotId) - .set("parent_id", firstSnapshotId) - .set("is_current_ancestor", false) // commit rolled back, not an ancestor of the current table state - .build(), - builder.set("made_current_at", rollbackTimestamp * 1000) - .set("snapshot_id", firstSnapshotId) - .set("parent_id", null) - .set("is_current_ancestor", true) - .build(), - builder.set("made_current_at", thirdSnapshotTimestamp * 1000) - .set("snapshot_id", thirdSnapshotId) - .set("parent_id", firstSnapshotId) - .set("is_current_ancestor", true) - .build() - ); + List actual = + spark + .read() + .format("iceberg") + .load(loadLocation(tableIdentifier, "history")) + .collectAsList(); + + GenericRecordBuilder builder = + new GenericRecordBuilder(AvroSchemaUtil.convert(historyTable.schema(), "history")); + List expected = + Lists.newArrayList( + builder + .set("made_current_at", firstSnapshotTimestamp * 1000) + .set("snapshot_id", firstSnapshotId) + .set("parent_id", null) + .set("is_current_ancestor", true) + .build(), + builder + .set("made_current_at", secondSnapshotTimestamp * 1000) + .set("snapshot_id", secondSnapshotId) + .set("parent_id", firstSnapshotId) + .set( + "is_current_ancestor", + false) // commit rolled back, not an ancestor of the current table state + .build(), + builder + .set("made_current_at", rollbackTimestamp * 1000) + .set("snapshot_id", firstSnapshotId) + .set("parent_id", null) + .set("is_current_ancestor", true) + .build(), + builder + .set("made_current_at", thirdSnapshotTimestamp * 1000) + .set("snapshot_id", thirdSnapshotId) + .set("parent_id", firstSnapshotId) + .set("is_current_ancestor", true) + .build()); Assert.assertEquals("History table should have a row for each commit", 4, actual.size()); TestHelpers.assertEqualsSafe(historyTable.schema().asStruct(), expected.get(0), actual.get(0)); @@ -766,7 +849,9 @@ public void testSnapshotsTable() { List records = Lists.newArrayList(new SimpleRecord(1, "1")); Dataset inputDf = spark.createDataFrame(records, SimpleRecord.class); - inputDf.select("id", "data").write() + inputDf + .select("id", "data") + .write() .format("iceberg") .mode("append") .save(loadLocation(tableIdentifier)); @@ -785,40 +870,47 @@ public void testSnapshotsTable() { // rollback the table state to the first snapshot table.rollback().toSnapshotId(firstSnapshotId).commit(); - List actual = spark.read() - .format("iceberg") - .load(loadLocation(tableIdentifier, "snapshots")) - .collectAsList(); - - GenericRecordBuilder builder = new GenericRecordBuilder(AvroSchemaUtil.convert(snapTable.schema(), "snapshots")); - List expected = Lists.newArrayList( - builder.set("committed_at", firstSnapshotTimestamp * 1000) - .set("snapshot_id", firstSnapshotId) - .set("parent_id", null) - .set("operation", "append") - .set("manifest_list", firstManifestList) - .set("summary", ImmutableMap.of( - "added-records", "1", - "added-data-files", "1", - "changed-partition-count", "1", - "total-data-files", "1", - "total-records", "1" - )) - .build(), - builder.set("committed_at", secondSnapshotTimestamp * 1000) - .set("snapshot_id", secondSnapshotId) - .set("parent_id", firstSnapshotId) - 
.set("operation", "delete") - .set("manifest_list", secondManifestList) - .set("summary", ImmutableMap.of( - "deleted-records", "1", - "deleted-data-files", "1", - "changed-partition-count", "1", - "total-records", "0", - "total-data-files", "0" - )) - .build() - ); + List actual = + spark + .read() + .format("iceberg") + .load(loadLocation(tableIdentifier, "snapshots")) + .collectAsList(); + + GenericRecordBuilder builder = + new GenericRecordBuilder(AvroSchemaUtil.convert(snapTable.schema(), "snapshots")); + List expected = + Lists.newArrayList( + builder + .set("committed_at", firstSnapshotTimestamp * 1000) + .set("snapshot_id", firstSnapshotId) + .set("parent_id", null) + .set("operation", "append") + .set("manifest_list", firstManifestList) + .set( + "summary", + ImmutableMap.of( + "added-records", "1", + "added-data-files", "1", + "changed-partition-count", "1", + "total-data-files", "1", + "total-records", "1")) + .build(), + builder + .set("committed_at", secondSnapshotTimestamp * 1000) + .set("snapshot_id", secondSnapshotId) + .set("parent_id", firstSnapshotId) + .set("operation", "delete") + .set("manifest_list", secondManifestList) + .set( + "summary", + ImmutableMap.of( + "deleted-records", "1", + "deleted-data-files", "1", + "changed-partition-count", "1", + "total-records", "0", + "total-data-files", "0")) + .build()); Assert.assertEquals("Snapshots table should have a row for each snapshot", 2, actual.size()); TestHelpers.assertEqualsSafe(snapTable.schema().asStruct(), expected.get(0), actual.get(0)); @@ -833,7 +925,9 @@ public void testPrunedSnapshotsTable() { List records = Lists.newArrayList(new SimpleRecord(1, "1")); Dataset inputDf = spark.createDataFrame(records, SimpleRecord.class); - inputDf.select("id", "data").write() + inputDf + .select("id", "data") + .write() .format("iceberg") .mode("append") .save(loadLocation(tableIdentifier)); @@ -849,40 +943,47 @@ public void testPrunedSnapshotsTable() { // rollback the table state to the first snapshot table.rollback().toSnapshotId(firstSnapshotId).commit(); - Dataset actualDf = spark.read() - .format("iceberg") - .load(loadLocation(tableIdentifier, "snapshots")) - .select("operation", "committed_at", "summary", "parent_id"); + Dataset actualDf = + spark + .read() + .format("iceberg") + .load(loadLocation(tableIdentifier, "snapshots")) + .select("operation", "committed_at", "summary", "parent_id"); Schema projectedSchema = SparkSchemaUtil.convert(actualDf.schema()); List actual = actualDf.collectAsList(); - GenericRecordBuilder builder = new GenericRecordBuilder(AvroSchemaUtil.convert(projectedSchema, "snapshots")); - List expected = Lists.newArrayList( - builder.set("committed_at", firstSnapshotTimestamp * 1000) - .set("parent_id", null) - .set("operation", "append") - .set("summary", ImmutableMap.of( - "added-records", "1", - "added-data-files", "1", - "changed-partition-count", "1", - "total-data-files", "1", - "total-records", "1" - )) - .build(), - builder.set("committed_at", secondSnapshotTimestamp * 1000) - .set("parent_id", firstSnapshotId) - .set("operation", "delete") - .set("summary", ImmutableMap.of( - "deleted-records", "1", - "deleted-data-files", "1", - "changed-partition-count", "1", - "total-records", "0", - "total-data-files", "0" - )) - .build() - ); + GenericRecordBuilder builder = + new GenericRecordBuilder(AvroSchemaUtil.convert(projectedSchema, "snapshots")); + List expected = + Lists.newArrayList( + builder + .set("committed_at", firstSnapshotTimestamp * 1000) + .set("parent_id", null) + 
.set("operation", "append") + .set( + "summary", + ImmutableMap.of( + "added-records", "1", + "added-data-files", "1", + "changed-partition-count", "1", + "total-data-files", "1", + "total-records", "1")) + .build(), + builder + .set("committed_at", secondSnapshotTimestamp * 1000) + .set("parent_id", firstSnapshotId) + .set("operation", "delete") + .set( + "summary", + ImmutableMap.of( + "deleted-records", "1", + "deleted-data-files", "1", + "changed-partition-count", "1", + "total-records", "0", + "total-data-files", "0")) + .build()); Assert.assertEquals("Snapshots table should have a row for each snapshot", 2, actual.size()); TestHelpers.assertEqualsSafe(projectedSchema.asStruct(), expected.get(0), actual.get(0)); @@ -894,48 +995,73 @@ public void testManifestsTable() { TableIdentifier tableIdentifier = TableIdentifier.of("db", "manifests_test"); Table table = createTable(tableIdentifier, SCHEMA, SPEC); Table manifestTable = loadTable(tableIdentifier, "manifests"); - Dataset df1 = spark.createDataFrame( - Lists.newArrayList(new SimpleRecord(1, "a"), new SimpleRecord(null, "b")), SimpleRecord.class); + Dataset df1 = + spark.createDataFrame( + Lists.newArrayList(new SimpleRecord(1, "a"), new SimpleRecord(null, "b")), + SimpleRecord.class); - df1.select("id", "data").write() + df1.select("id", "data") + .write() .format("iceberg") .mode("append") .save(loadLocation(tableIdentifier)); - List actual = spark.read() - .format("iceberg") - .load(loadLocation(tableIdentifier, "manifests")) - .collectAsList(); + List actual = + spark + .read() + .format("iceberg") + .load(loadLocation(tableIdentifier, "manifests")) + .collectAsList(); table.refresh(); - GenericRecordBuilder builder = new GenericRecordBuilder(AvroSchemaUtil.convert( - manifestTable.schema(), "manifests")); - GenericRecordBuilder summaryBuilder = new GenericRecordBuilder(AvroSchemaUtil.convert( - manifestTable.schema().findType("partition_summaries.element").asStructType(), "partition_summary")); - List expected = Lists.transform(table.currentSnapshot().allManifests(table.io()), manifest -> - builder - .set("content", manifest.content().id()) - .set("path", manifest.path()) - .set("length", manifest.length()) - .set("partition_spec_id", manifest.partitionSpecId()) - .set("added_snapshot_id", manifest.snapshotId()) - .set("added_data_files_count", manifest.content() == DATA ? manifest.addedFilesCount() : 0) - .set("existing_data_files_count", manifest.content() == DATA ? manifest.existingFilesCount() : 0) - .set("deleted_data_files_count", manifest.content() == DATA ? manifest.deletedFilesCount() : 0) - .set("added_delete_files_count", manifest.content() == DELETES ? manifest.addedFilesCount() : 0) - .set("existing_delete_files_count", manifest.content() == DELETES ? manifest.existingFilesCount() : 0) - .set("deleted_delete_files_count", manifest.content() == DELETES ? 
manifest.deletedFilesCount() : 0) - .set("partition_summaries", Lists.transform(manifest.partitions(), partition -> - summaryBuilder - .set("contains_null", true) - .set("contains_nan", false) - .set("lower_bound", "1") - .set("upper_bound", "1") - .build() - )) - .build() - ); + GenericRecordBuilder builder = + new GenericRecordBuilder(AvroSchemaUtil.convert(manifestTable.schema(), "manifests")); + GenericRecordBuilder summaryBuilder = + new GenericRecordBuilder( + AvroSchemaUtil.convert( + manifestTable.schema().findType("partition_summaries.element").asStructType(), + "partition_summary")); + List expected = + Lists.transform( + table.currentSnapshot().allManifests(table.io()), + manifest -> + builder + .set("content", manifest.content().id()) + .set("path", manifest.path()) + .set("length", manifest.length()) + .set("partition_spec_id", manifest.partitionSpecId()) + .set("added_snapshot_id", manifest.snapshotId()) + .set( + "added_data_files_count", + manifest.content() == DATA ? manifest.addedFilesCount() : 0) + .set( + "existing_data_files_count", + manifest.content() == DATA ? manifest.existingFilesCount() : 0) + .set( + "deleted_data_files_count", + manifest.content() == DATA ? manifest.deletedFilesCount() : 0) + .set( + "added_delete_files_count", + manifest.content() == DELETES ? manifest.addedFilesCount() : 0) + .set( + "existing_delete_files_count", + manifest.content() == DELETES ? manifest.existingFilesCount() : 0) + .set( + "deleted_delete_files_count", + manifest.content() == DELETES ? manifest.deletedFilesCount() : 0) + .set( + "partition_summaries", + Lists.transform( + manifest.partitions(), + partition -> + summaryBuilder + .set("contains_null", true) + .set("contains_nan", false) + .set("lower_bound", "1") + .set("upper_bound", "1") + .build())) + .build()); Assert.assertEquals("Manifests table should have one manifest row", 1, actual.size()); TestHelpers.assertEqualsSafe(manifestTable.schema().asStruct(), expected.get(0), actual.get(0)); @@ -946,56 +1072,77 @@ public void testPruneManifestsTable() { TableIdentifier tableIdentifier = TableIdentifier.of("db", "manifests_test"); Table table = createTable(tableIdentifier, SCHEMA, SPEC); Table manifestTable = loadTable(tableIdentifier, "manifests"); - Dataset df1 = spark.createDataFrame( - Lists.newArrayList(new SimpleRecord(1, "a"), new SimpleRecord(null, "b")), SimpleRecord.class); + Dataset df1 = + spark.createDataFrame( + Lists.newArrayList(new SimpleRecord(1, "a"), new SimpleRecord(null, "b")), + SimpleRecord.class); - df1.select("id", "data").write() + df1.select("id", "data") + .write() .format("iceberg") .mode("append") .save(loadLocation(tableIdentifier)); if (!spark.version().startsWith("2")) { // Spark 2 isn't able to actually push down nested struct projections so this will not break - AssertHelpers.assertThrows("Can't prune struct inside list", SparkException.class, + AssertHelpers.assertThrows( + "Can't prune struct inside list", + SparkException.class, "Cannot project a partial list element struct", - () -> spark.read() - .format("iceberg") - .load(loadLocation(tableIdentifier, "manifests")) - .select("partition_spec_id", "path", "partition_summaries.contains_null") - .collectAsList()); + () -> + spark + .read() + .format("iceberg") + .load(loadLocation(tableIdentifier, "manifests")) + .select("partition_spec_id", "path", "partition_summaries.contains_null") + .collectAsList()); } - Dataset actualDf = spark.read() - .format("iceberg") - .load(loadLocation(tableIdentifier, "manifests")) - 
.select("partition_spec_id", "path", "partition_summaries"); + Dataset actualDf = + spark + .read() + .format("iceberg") + .load(loadLocation(tableIdentifier, "manifests")) + .select("partition_spec_id", "path", "partition_summaries"); Schema projectedSchema = SparkSchemaUtil.convert(actualDf.schema()); - List actual = spark.read() - .format("iceberg") - .load(loadLocation(tableIdentifier, "manifests")) - .select("partition_spec_id", "path", "partition_summaries") - .collectAsList(); + List actual = + spark + .read() + .format("iceberg") + .load(loadLocation(tableIdentifier, "manifests")) + .select("partition_spec_id", "path", "partition_summaries") + .collectAsList(); table.refresh(); - GenericRecordBuilder builder = new GenericRecordBuilder(AvroSchemaUtil.convert(projectedSchema.asStruct())); - GenericRecordBuilder summaryBuilder = new GenericRecordBuilder(AvroSchemaUtil.convert( - projectedSchema.findType("partition_summaries.element").asStructType(), "partition_summary")); - List expected = Lists.transform(table.currentSnapshot().allManifests(table.io()), manifest -> - builder.set("partition_spec_id", manifest.partitionSpecId()) - .set("path", manifest.path()) - .set("partition_summaries", Lists.transform(manifest.partitions(), partition -> - summaryBuilder - .set("contains_null", true) - .set("contains_nan", false) - .set("lower_bound", "1") - .set("upper_bound", "1") - .build() - )) - .build() - ); + GenericRecordBuilder builder = + new GenericRecordBuilder(AvroSchemaUtil.convert(projectedSchema.asStruct())); + GenericRecordBuilder summaryBuilder = + new GenericRecordBuilder( + AvroSchemaUtil.convert( + projectedSchema.findType("partition_summaries.element").asStructType(), + "partition_summary")); + List expected = + Lists.transform( + table.currentSnapshot().allManifests(table.io()), + manifest -> + builder + .set("partition_spec_id", manifest.partitionSpecId()) + .set("path", manifest.path()) + .set( + "partition_summaries", + Lists.transform( + manifest.partitions(), + partition -> + summaryBuilder + .set("contains_null", true) + .set("contains_nan", false) + .set("lower_bound", "1") + .set("upper_bound", "1") + .build())) + .build()); Assert.assertEquals("Manifests table should have one manifest row", 1, actual.size()); TestHelpers.assertEqualsSafe(projectedSchema.asStruct(), expected.get(0), actual.get(0)); @@ -1006,53 +1153,62 @@ public void testAllManifestsTable() { TableIdentifier tableIdentifier = TableIdentifier.of("db", "manifests_test"); Table table = createTable(tableIdentifier, SCHEMA, SPEC); Table manifestTable = loadTable(tableIdentifier, "all_manifests"); - Dataset df1 = spark.createDataFrame(Lists.newArrayList(new SimpleRecord(1, "a")), SimpleRecord.class); + Dataset df1 = + spark.createDataFrame(Lists.newArrayList(new SimpleRecord(1, "a")), SimpleRecord.class); - df1.select("id", "data").write() + df1.select("id", "data") + .write() .format("iceberg") .mode("append") .save(loadLocation(tableIdentifier)); - table.updateProperties() - .set(TableProperties.FORMAT_VERSION, "2") - .commit(); + table.updateProperties().set(TableProperties.FORMAT_VERSION, "2").commit(); - DataFile dataFile = Iterables.getFirst(table.currentSnapshot().addedDataFiles(table.io()), null); + DataFile dataFile = + Iterables.getFirst(table.currentSnapshot().addedDataFiles(table.io()), null); PartitionSpec dataFileSpec = table.specs().get(dataFile.specId()); StructLike dataFilePartition = dataFile.partition(); PositionDelete delete = PositionDelete.create(); delete.set(dataFile.path(), 0L, 
null); - DeleteFile deleteFile = writePositionDeletes(table, dataFileSpec, dataFilePartition, ImmutableList.of(delete)); + DeleteFile deleteFile = + writePositionDeletes(table, dataFileSpec, dataFilePartition, ImmutableList.of(delete)); - table.newRowDelta() - .addDeletes(deleteFile) - .commit(); + table.newRowDelta().addDeletes(deleteFile).commit(); table.newDelete().deleteFromRowFilter(Expressions.alwaysTrue()).commit(); Stream> snapshotIdToManifests = StreamSupport.stream(table.snapshots().spliterator(), false) - .flatMap(snapshot -> snapshot.allManifests(table.io()).stream().map( - manifest -> Pair.of(snapshot.snapshotId(), manifest))); - - List actual = spark.read() - .format("iceberg") - .load(loadLocation(tableIdentifier, "all_manifests")) - .orderBy("path") - .collectAsList(); + .flatMap( + snapshot -> + snapshot.allManifests(table.io()).stream() + .map(manifest -> Pair.of(snapshot.snapshotId(), manifest))); + + List actual = + spark + .read() + .format("iceberg") + .load(loadLocation(tableIdentifier, "all_manifests")) + .orderBy("path") + .collectAsList(); table.refresh(); - List expected = snapshotIdToManifests - .map(snapshotManifest -> manifestRecord(manifestTable, snapshotManifest.first(), snapshotManifest.second())) - .collect(Collectors.toList()); + List expected = + snapshotIdToManifests + .map( + snapshotManifest -> + manifestRecord( + manifestTable, snapshotManifest.first(), snapshotManifest.second())) + .collect(Collectors.toList()); expected.sort(Comparator.comparing(o -> o.get("path").toString())); Assert.assertEquals("Manifests table should have 5 manifest rows", 5, actual.size()); for (int i = 0; i < expected.size(); i += 1) { - TestHelpers.assertEqualsSafe(manifestTable.schema().asStruct(), expected.get(i), actual.get(i)); + TestHelpers.assertEqualsSafe( + manifestTable.schema().asStruct(), expected.get(i), actual.get(i)); } } @@ -1061,33 +1217,37 @@ public void testUnpartitionedPartitionsTable() { TableIdentifier tableIdentifier = TableIdentifier.of("db", "unpartitioned_partitions_test"); createTable(tableIdentifier, SCHEMA, PartitionSpec.unpartitioned()); - Dataset df = spark.createDataFrame(Lists.newArrayList(new SimpleRecord(1, "a")), SimpleRecord.class); + Dataset df = + spark.createDataFrame(Lists.newArrayList(new SimpleRecord(1, "a")), SimpleRecord.class); - df.select("id", "data").write() + df.select("id", "data") + .write() .format("iceberg") .mode("append") .save(loadLocation(tableIdentifier)); - Types.StructType expectedSchema = Types.StructType.of( - required(2, "record_count", Types.LongType.get()), - required(3, "file_count", Types.IntegerType.get())); + Types.StructType expectedSchema = + Types.StructType.of( + required(2, "record_count", Types.LongType.get()), + required(3, "file_count", Types.IntegerType.get())); Table partitionsTable = loadTable(tableIdentifier, "partitions"); - Assert.assertEquals("Schema should not have partition field", - expectedSchema, partitionsTable.schema().asStruct()); + Assert.assertEquals( + "Schema should not have partition field", + expectedSchema, + partitionsTable.schema().asStruct()); - GenericRecordBuilder builder = new GenericRecordBuilder(AvroSchemaUtil.convert( - partitionsTable.schema(), "partitions")); - GenericData.Record expectedRow = builder - .set("record_count", 1L) - .set("file_count", 1) - .build(); + GenericRecordBuilder builder = + new GenericRecordBuilder(AvroSchemaUtil.convert(partitionsTable.schema(), "partitions")); + GenericData.Record expectedRow = builder.set("record_count", 
1L).set("file_count", 1).build(); - List actual = spark.read() - .format("iceberg") - .load(loadLocation(tableIdentifier, "partitions")) - .collectAsList(); + List actual = + spark + .read() + .format("iceberg") + .load(loadLocation(tableIdentifier, "partitions")) + .collectAsList(); Assert.assertEquals("Unpartitioned partitions table should have one row", 1, actual.size()); TestHelpers.assertEqualsSafe(expectedSchema, expectedRow, actual.get(0)); @@ -1098,10 +1258,13 @@ public void testPartitionsTable() { TableIdentifier tableIdentifier = TableIdentifier.of("db", "partitions_test"); Table table = createTable(tableIdentifier, SCHEMA, SPEC); Table partitionsTable = loadTable(tableIdentifier, "partitions"); - Dataset df1 = spark.createDataFrame(Lists.newArrayList(new SimpleRecord(1, "a")), SimpleRecord.class); - Dataset df2 = spark.createDataFrame(Lists.newArrayList(new SimpleRecord(2, "b")), SimpleRecord.class); + Dataset df1 = + spark.createDataFrame(Lists.newArrayList(new SimpleRecord(1, "a")), SimpleRecord.class); + Dataset df2 = + spark.createDataFrame(Lists.newArrayList(new SimpleRecord(2, "b")), SimpleRecord.class); - df1.select("id", "data").write() + df1.select("id", "data") + .write() .format("iceberg") .mode("append") .save(loadLocation(tableIdentifier)); @@ -1110,69 +1273,86 @@ public void testPartitionsTable() { long firstCommitId = table.currentSnapshot().snapshotId(); // add a second file - df2.select("id", "data").write() + df2.select("id", "data") + .write() .format("iceberg") .mode("append") .save(loadLocation(tableIdentifier)); - List actual = spark.read() - .format("iceberg") - .load(loadLocation(tableIdentifier, "partitions")) - .orderBy("partition.id") - .collectAsList(); - - GenericRecordBuilder builder = new GenericRecordBuilder(AvroSchemaUtil.convert( - partitionsTable.schema(), "partitions")); - GenericRecordBuilder partitionBuilder = new GenericRecordBuilder(AvroSchemaUtil.convert( - partitionsTable.schema().findType("partition").asStructType(), "partition")); + List actual = + spark + .read() + .format("iceberg") + .load(loadLocation(tableIdentifier, "partitions")) + .orderBy("partition.id") + .collectAsList(); + + GenericRecordBuilder builder = + new GenericRecordBuilder(AvroSchemaUtil.convert(partitionsTable.schema(), "partitions")); + GenericRecordBuilder partitionBuilder = + new GenericRecordBuilder( + AvroSchemaUtil.convert( + partitionsTable.schema().findType("partition").asStructType(), "partition")); List expected = Lists.newArrayList(); - expected.add(builder - .set("partition", partitionBuilder.set("id", 1).build()) - .set("record_count", 1L) - .set("file_count", 1) - .set("spec_id", 0) - .build()); - expected.add(builder - .set("partition", partitionBuilder.set("id", 2).build()) - .set("record_count", 1L) - .set("file_count", 1) - .set("spec_id", 0) - .build()); + expected.add( + builder + .set("partition", partitionBuilder.set("id", 1).build()) + .set("record_count", 1L) + .set("file_count", 1) + .set("spec_id", 0) + .build()); + expected.add( + builder + .set("partition", partitionBuilder.set("id", 2).build()) + .set("record_count", 1L) + .set("file_count", 1) + .set("spec_id", 0) + .build()); Assert.assertEquals("Partitions table should have two rows", 2, expected.size()); Assert.assertEquals("Actual results should have two rows", 2, actual.size()); for (int i = 0; i < 2; i += 1) { - TestHelpers.assertEqualsSafe(partitionsTable.schema().asStruct(), expected.get(i), actual.get(i)); + TestHelpers.assertEqualsSafe( + partitionsTable.schema().asStruct(), 
expected.get(i), actual.get(i)); } // check time travel - List actualAfterFirstCommit = spark.read() - .format("iceberg") - .option(SparkReadOptions.SNAPSHOT_ID, String.valueOf(firstCommitId)) - .load(loadLocation(tableIdentifier, "partitions")) - .orderBy("partition.id") - .collectAsList(); + List actualAfterFirstCommit = + spark + .read() + .format("iceberg") + .option(SparkReadOptions.SNAPSHOT_ID, String.valueOf(firstCommitId)) + .load(loadLocation(tableIdentifier, "partitions")) + .orderBy("partition.id") + .collectAsList(); Assert.assertEquals("Actual results should have one row", 1, actualAfterFirstCommit.size()); - TestHelpers.assertEqualsSafe(partitionsTable.schema().asStruct(), expected.get(0), actualAfterFirstCommit.get(0)); + TestHelpers.assertEqualsSafe( + partitionsTable.schema().asStruct(), expected.get(0), actualAfterFirstCommit.get(0)); // check predicate push down - List filtered = spark.read() - .format("iceberg") - .load(loadLocation(tableIdentifier, "partitions")) - .filter("partition.id < 2") - .collectAsList(); + List filtered = + spark + .read() + .format("iceberg") + .load(loadLocation(tableIdentifier, "partitions")) + .filter("partition.id < 2") + .collectAsList(); Assert.assertEquals("Actual results should have one row", 1, filtered.size()); - TestHelpers.assertEqualsSafe(partitionsTable.schema().asStruct(), expected.get(0), filtered.get(0)); - - List nonFiltered = spark.read() - .format("iceberg") - .load(loadLocation(tableIdentifier, "partitions")) - .filter("partition.id < 2 or record_count=1") - .collectAsList(); + TestHelpers.assertEqualsSafe( + partitionsTable.schema().asStruct(), expected.get(0), filtered.get(0)); + + List nonFiltered = + spark + .read() + .format("iceberg") + .load(loadLocation(tableIdentifier, "partitions")) + .filter("partition.id < 2 or record_count=1") + .collectAsList(); Assert.assertEquals("Actual results should have one row", 2, nonFiltered.size()); for (int i = 0; i < 2; i += 1) { - TestHelpers.assertEqualsSafe(partitionsTable.schema().asStruct(), expected.get(i), actual.get(i)); + TestHelpers.assertEqualsSafe( + partitionsTable.schema().asStruct(), expected.get(i), actual.get(i)); } } @@ -1181,62 +1361,63 @@ public synchronized void testSnapshotReadAfterAddColumn() { TableIdentifier tableIdentifier = TableIdentifier.of("db", "table"); Table table = createTable(tableIdentifier, SCHEMA, PartitionSpec.unpartitioned()); - List originalRecords = Lists.newArrayList( - RowFactory.create(1, "x"), - RowFactory.create(2, "y"), - RowFactory.create(3, "z")); + List originalRecords = + Lists.newArrayList( + RowFactory.create(1, "x"), RowFactory.create(2, "y"), RowFactory.create(3, "z")); StructType originalSparkSchema = SparkSchemaUtil.convert(SCHEMA); Dataset inputDf = spark.createDataFrame(originalRecords, originalSparkSchema); - inputDf.select("id", "data").write() + inputDf + .select("id", "data") + .write() .format("iceberg") .mode(SaveMode.Append) .save(loadLocation(tableIdentifier)); table.refresh(); - Dataset resultDf = spark.read() - .format("iceberg") - .load(loadLocation(tableIdentifier)); - Assert.assertEquals("Records should match", originalRecords, - resultDf.orderBy("id").collectAsList()); + Dataset resultDf = spark.read().format("iceberg").load(loadLocation(tableIdentifier)); + Assert.assertEquals( + "Records should match", originalRecords, resultDf.orderBy("id").collectAsList()); Snapshot snapshotBeforeAddColumn = table.currentSnapshot(); table.updateSchema().addColumn("category", Types.StringType.get()).commit(); - List 
newRecords = Lists.newArrayList( - RowFactory.create(4, "xy", "B"), - RowFactory.create(5, "xyz", "C")); + List newRecords = + Lists.newArrayList(RowFactory.create(4, "xy", "B"), RowFactory.create(5, "xyz", "C")); StructType newSparkSchema = SparkSchemaUtil.convert(SCHEMA2); Dataset inputDf2 = spark.createDataFrame(newRecords, newSparkSchema); - inputDf2.select("id", "data", "category").write() + inputDf2 + .select("id", "data", "category") + .write() .format("iceberg") .mode(SaveMode.Append) .save(loadLocation(tableIdentifier)); table.refresh(); - List updatedRecords = Lists.newArrayList( - RowFactory.create(1, "x", null), - RowFactory.create(2, "y", null), - RowFactory.create(3, "z", null), - RowFactory.create(4, "xy", "B"), - RowFactory.create(5, "xyz", "C")); - - Dataset resultDf2 = spark.read() - .format("iceberg") - .load(loadLocation(tableIdentifier)); - Assert.assertEquals("Records should match", updatedRecords, - resultDf2.orderBy("id").collectAsList()); - - Dataset resultDf3 = spark.read() - .format("iceberg") - .option(SparkReadOptions.SNAPSHOT_ID, snapshotBeforeAddColumn.snapshotId()) - .load(loadLocation(tableIdentifier)); - Assert.assertEquals("Records should match", originalRecords, - resultDf3.orderBy("id").collectAsList()); + List updatedRecords = + Lists.newArrayList( + RowFactory.create(1, "x", null), + RowFactory.create(2, "y", null), + RowFactory.create(3, "z", null), + RowFactory.create(4, "xy", "B"), + RowFactory.create(5, "xyz", "C")); + + Dataset resultDf2 = spark.read().format("iceberg").load(loadLocation(tableIdentifier)); + Assert.assertEquals( + "Records should match", updatedRecords, resultDf2.orderBy("id").collectAsList()); + + Dataset resultDf3 = + spark + .read() + .format("iceberg") + .option(SparkReadOptions.SNAPSHOT_ID, snapshotBeforeAddColumn.snapshotId()) + .load(loadLocation(tableIdentifier)); + Assert.assertEquals( + "Records should match", originalRecords, resultDf3.orderBy("id").collectAsList()); Assert.assertEquals("Schemas should match", originalSparkSchema, resultDf3.schema()); } @@ -1245,72 +1426,76 @@ public synchronized void testSnapshotReadAfterDropColumn() { TableIdentifier tableIdentifier = TableIdentifier.of("db", "table"); Table table = createTable(tableIdentifier, SCHEMA2, PartitionSpec.unpartitioned()); - List originalRecords = Lists.newArrayList( - RowFactory.create(1, "x", "A"), - RowFactory.create(2, "y", "A"), - RowFactory.create(3, "z", "B")); + List originalRecords = + Lists.newArrayList( + RowFactory.create(1, "x", "A"), + RowFactory.create(2, "y", "A"), + RowFactory.create(3, "z", "B")); StructType originalSparkSchema = SparkSchemaUtil.convert(SCHEMA2); Dataset inputDf = spark.createDataFrame(originalRecords, originalSparkSchema); - inputDf.select("id", "data", "category").write() + inputDf + .select("id", "data", "category") + .write() .format("iceberg") .mode(SaveMode.Append) .save(loadLocation(tableIdentifier)); table.refresh(); - Dataset resultDf = spark.read() - .format("iceberg") - .load(loadLocation(tableIdentifier)); - Assert.assertEquals("Records should match", originalRecords, - resultDf.orderBy("id").collectAsList()); + Dataset resultDf = spark.read().format("iceberg").load(loadLocation(tableIdentifier)); + Assert.assertEquals( + "Records should match", originalRecords, resultDf.orderBy("id").collectAsList()); long tsBeforeDropColumn = waitUntilAfter(System.currentTimeMillis()); table.updateSchema().deleteColumn("data").commit(); long tsAfterDropColumn = waitUntilAfter(System.currentTimeMillis()); - List newRecords = 
Lists.newArrayList( - RowFactory.create(4, "B"), - RowFactory.create(5, "C")); + List newRecords = Lists.newArrayList(RowFactory.create(4, "B"), RowFactory.create(5, "C")); StructType newSparkSchema = SparkSchemaUtil.convert(SCHEMA3); Dataset inputDf2 = spark.createDataFrame(newRecords, newSparkSchema); - inputDf2.select("id", "category").write() + inputDf2 + .select("id", "category") + .write() .format("iceberg") .mode(SaveMode.Append) .save(loadLocation(tableIdentifier)); table.refresh(); - List updatedRecords = Lists.newArrayList( - RowFactory.create(1, "A"), - RowFactory.create(2, "A"), - RowFactory.create(3, "B"), - RowFactory.create(4, "B"), - RowFactory.create(5, "C")); - - Dataset resultDf2 = spark.read() - .format("iceberg") - .load(loadLocation(tableIdentifier)); - Assert.assertEquals("Records should match", updatedRecords, - resultDf2.orderBy("id").collectAsList()); - - Dataset resultDf3 = spark.read() - .format("iceberg") - .option(SparkReadOptions.AS_OF_TIMESTAMP, tsBeforeDropColumn) - .load(loadLocation(tableIdentifier)); - Assert.assertEquals("Records should match", originalRecords, - resultDf3.orderBy("id").collectAsList()); + List updatedRecords = + Lists.newArrayList( + RowFactory.create(1, "A"), + RowFactory.create(2, "A"), + RowFactory.create(3, "B"), + RowFactory.create(4, "B"), + RowFactory.create(5, "C")); + + Dataset resultDf2 = spark.read().format("iceberg").load(loadLocation(tableIdentifier)); + Assert.assertEquals( + "Records should match", updatedRecords, resultDf2.orderBy("id").collectAsList()); + + Dataset resultDf3 = + spark + .read() + .format("iceberg") + .option(SparkReadOptions.AS_OF_TIMESTAMP, tsBeforeDropColumn) + .load(loadLocation(tableIdentifier)); + Assert.assertEquals( + "Records should match", originalRecords, resultDf3.orderBy("id").collectAsList()); Assert.assertEquals("Schemas should match", originalSparkSchema, resultDf3.schema()); // At tsAfterDropColumn, there has been a schema change, but no new snapshot, // so the snapshot as of tsAfterDropColumn is the same as that as of tsBeforeDropColumn. 
- Dataset resultDf4 = spark.read() - .format("iceberg") - .option(SparkReadOptions.AS_OF_TIMESTAMP, tsAfterDropColumn) - .load(loadLocation(tableIdentifier)); - Assert.assertEquals("Records should match", originalRecords, - resultDf4.orderBy("id").collectAsList()); + Dataset resultDf4 = + spark + .read() + .format("iceberg") + .option(SparkReadOptions.AS_OF_TIMESTAMP, tsAfterDropColumn) + .load(loadLocation(tableIdentifier)); + Assert.assertEquals( + "Records should match", originalRecords, resultDf4.orderBy("id").collectAsList()); Assert.assertEquals("Schemas should match", originalSparkSchema, resultDf4.schema()); } @@ -1319,77 +1504,77 @@ public synchronized void testSnapshotReadAfterAddAndDropColumn() { TableIdentifier tableIdentifier = TableIdentifier.of("db", "table"); Table table = createTable(tableIdentifier, SCHEMA, PartitionSpec.unpartitioned()); - List originalRecords = Lists.newArrayList( - RowFactory.create(1, "x"), - RowFactory.create(2, "y"), - RowFactory.create(3, "z")); + List originalRecords = + Lists.newArrayList( + RowFactory.create(1, "x"), RowFactory.create(2, "y"), RowFactory.create(3, "z")); StructType originalSparkSchema = SparkSchemaUtil.convert(SCHEMA); Dataset inputDf = spark.createDataFrame(originalRecords, originalSparkSchema); - inputDf.select("id", "data").write() + inputDf + .select("id", "data") + .write() .format("iceberg") .mode(SaveMode.Append) .save(loadLocation(tableIdentifier)); table.refresh(); - Dataset resultDf = spark.read() - .format("iceberg") - .load(loadLocation(tableIdentifier)); - Assert.assertEquals("Records should match", originalRecords, - resultDf.orderBy("id").collectAsList()); + Dataset resultDf = spark.read().format("iceberg").load(loadLocation(tableIdentifier)); + Assert.assertEquals( + "Records should match", originalRecords, resultDf.orderBy("id").collectAsList()); Snapshot snapshotBeforeAddColumn = table.currentSnapshot(); table.updateSchema().addColumn("category", Types.StringType.get()).commit(); - List newRecords = Lists.newArrayList( - RowFactory.create(4, "xy", "B"), - RowFactory.create(5, "xyz", "C")); + List newRecords = + Lists.newArrayList(RowFactory.create(4, "xy", "B"), RowFactory.create(5, "xyz", "C")); StructType sparkSchemaAfterAddColumn = SparkSchemaUtil.convert(SCHEMA2); Dataset inputDf2 = spark.createDataFrame(newRecords, sparkSchemaAfterAddColumn); - inputDf2.select("id", "data", "category").write() + inputDf2 + .select("id", "data", "category") + .write() .format("iceberg") .mode(SaveMode.Append) .save(loadLocation(tableIdentifier)); table.refresh(); - List updatedRecords = Lists.newArrayList( - RowFactory.create(1, "x", null), - RowFactory.create(2, "y", null), - RowFactory.create(3, "z", null), - RowFactory.create(4, "xy", "B"), - RowFactory.create(5, "xyz", "C")); + List updatedRecords = + Lists.newArrayList( + RowFactory.create(1, "x", null), + RowFactory.create(2, "y", null), + RowFactory.create(3, "z", null), + RowFactory.create(4, "xy", "B"), + RowFactory.create(5, "xyz", "C")); - Dataset resultDf2 = spark.read() - .format("iceberg") - .load(loadLocation(tableIdentifier)); - Assert.assertEquals("Records should match", updatedRecords, - resultDf2.orderBy("id").collectAsList()); + Dataset resultDf2 = spark.read().format("iceberg").load(loadLocation(tableIdentifier)); + Assert.assertEquals( + "Records should match", updatedRecords, resultDf2.orderBy("id").collectAsList()); table.updateSchema().deleteColumn("data").commit(); - List recordsAfterDropColumn = Lists.newArrayList( - RowFactory.create(1, null), - 
RowFactory.create(2, null), - RowFactory.create(3, null), - RowFactory.create(4, "B"), - RowFactory.create(5, "C")); - - Dataset resultDf3 = spark.read() - .format("iceberg") - .load(loadLocation(tableIdentifier)); - Assert.assertEquals("Records should match", recordsAfterDropColumn, - resultDf3.orderBy("id").collectAsList()); - - Dataset resultDf4 = spark.read() - .format("iceberg") - .option(SparkReadOptions.SNAPSHOT_ID, snapshotBeforeAddColumn.snapshotId()) - .load(loadLocation(tableIdentifier)); - Assert.assertEquals("Records should match", originalRecords, - resultDf4.orderBy("id").collectAsList()); + List recordsAfterDropColumn = + Lists.newArrayList( + RowFactory.create(1, null), + RowFactory.create(2, null), + RowFactory.create(3, null), + RowFactory.create(4, "B"), + RowFactory.create(5, "C")); + + Dataset resultDf3 = spark.read().format("iceberg").load(loadLocation(tableIdentifier)); + Assert.assertEquals( + "Records should match", recordsAfterDropColumn, resultDf3.orderBy("id").collectAsList()); + + Dataset resultDf4 = + spark + .read() + .format("iceberg") + .option(SparkReadOptions.SNAPSHOT_ID, snapshotBeforeAddColumn.snapshotId()) + .load(loadLocation(tableIdentifier)); + Assert.assertEquals( + "Records should match", originalRecords, resultDf4.orderBy("id").collectAsList()); Assert.assertEquals("Schemas should match", originalSparkSchema, resultDf4.schema()); } @@ -1398,13 +1583,12 @@ public void testRemoveOrphanFilesActionSupport() throws InterruptedException { TableIdentifier tableIdentifier = TableIdentifier.of("db", "table"); Table table = createTable(tableIdentifier, SCHEMA, PartitionSpec.unpartitioned()); - List records = Lists.newArrayList( - new SimpleRecord(1, "1") - ); + List records = Lists.newArrayList(new SimpleRecord(1, "1")); Dataset df = spark.createDataFrame(records, SimpleRecord.class); - df.select("id", "data").write() + df.select("id", "data") + .write() .format("iceberg") .mode("append") .save(loadLocation(tableIdentifier)); @@ -1416,36 +1600,42 @@ public void testRemoveOrphanFilesActionSupport() throws InterruptedException { SparkActions actions = SparkActions.get(); - DeleteOrphanFiles.Result result1 = actions.deleteOrphanFiles(table) - .location(table.location() + "/metadata") - .olderThan(System.currentTimeMillis()) - .execute(); - Assert.assertTrue("Should not delete any metadata files", Iterables.isEmpty(result1.orphanFileLocations())); + DeleteOrphanFiles.Result result1 = + actions + .deleteOrphanFiles(table) + .location(table.location() + "/metadata") + .olderThan(System.currentTimeMillis()) + .execute(); + Assert.assertTrue( + "Should not delete any metadata files", Iterables.isEmpty(result1.orphanFileLocations())); - DeleteOrphanFiles.Result result2 = actions.deleteOrphanFiles(table) - .olderThan(System.currentTimeMillis()) - .execute(); - Assert.assertEquals("Should delete 1 data file", 1, Iterables.size(result2.orphanFileLocations())); + DeleteOrphanFiles.Result result2 = + actions.deleteOrphanFiles(table).olderThan(System.currentTimeMillis()).execute(); + Assert.assertEquals( + "Should delete 1 data file", 1, Iterables.size(result2.orphanFileLocations())); Dataset resultDF = spark.read().format("iceberg").load(loadLocation(tableIdentifier)); - List actualRecords = resultDF - .as(Encoders.bean(SimpleRecord.class)) - .collectAsList(); + List actualRecords = + resultDF.as(Encoders.bean(SimpleRecord.class)).collectAsList(); Assert.assertEquals("Rows must match", records, actualRecords); } - @Test public void testFilesTablePartitionId() throws 
Exception { TableIdentifier tableIdentifier = TableIdentifier.of("db", "files_test"); - Table table = createTable(tableIdentifier, SCHEMA, PartitionSpec.builderFor(SCHEMA).identity("id").build()); + Table table = + createTable( + tableIdentifier, SCHEMA, PartitionSpec.builderFor(SCHEMA).identity("id").build()); int spec0 = table.spec().specId(); - Dataset df1 = spark.createDataFrame(Lists.newArrayList(new SimpleRecord(1, "a")), SimpleRecord.class); - Dataset df2 = spark.createDataFrame(Lists.newArrayList(new SimpleRecord(2, "b")), SimpleRecord.class); + Dataset df1 = + spark.createDataFrame(Lists.newArrayList(new SimpleRecord(1, "a")), SimpleRecord.class); + Dataset df2 = + spark.createDataFrame(Lists.newArrayList(new SimpleRecord(2, "b")), SimpleRecord.class); - df1.select("id", "data").write() + df1.select("id", "data") + .write() .format("iceberg") .mode("append") .save(loadLocation(tableIdentifier)); @@ -1456,17 +1646,17 @@ public void testFilesTablePartitionId() throws Exception { int spec1 = table.spec().specId(); // add a second file - df2.select("id", "data").write() + df2.select("id", "data") + .write() .format("iceberg") .mode("append") .save(loadLocation(tableIdentifier)); - List actual = spark.read() - .format("iceberg") - .load(loadLocation(tableIdentifier, "files")) - .sort(DataFile.SPEC_ID.name()) - .collectAsList() - .stream().map(r -> (Integer) r.getAs(DataFile.SPEC_ID.name())).collect(Collectors.toList()); + List actual = + spark.read().format("iceberg").load(loadLocation(tableIdentifier, "files")) + .sort(DataFile.SPEC_ID.name()).collectAsList().stream() + .map(r -> (Integer) r.getAs(DataFile.SPEC_ID.name())) + .collect(Collectors.toList()); Assert.assertEquals("Should have two partition specs", ImmutableList.of(spec0, spec1), actual); } @@ -1476,22 +1666,26 @@ public void testAllManifestTableSnapshotFiltering() throws Exception { TableIdentifier tableIdentifier = TableIdentifier.of("db", "all_manifest_snapshot_filtering"); Table table = createTable(tableIdentifier, SCHEMA, SPEC); Table manifestTable = loadTable(tableIdentifier, "all_manifests"); - Dataset df = spark.createDataFrame(Lists.newArrayList(new SimpleRecord(1, "a")), SimpleRecord.class); + Dataset df = + spark.createDataFrame(Lists.newArrayList(new SimpleRecord(1, "a")), SimpleRecord.class); List> snapshotIdToManifests = Lists.newArrayList(); - df.select("id", "data").write() + df.select("id", "data") + .write() .format("iceberg") .mode("append") .save(loadLocation(tableIdentifier)); table.refresh(); Snapshot snapshot1 = table.currentSnapshot(); - snapshotIdToManifests.addAll(snapshot1.allManifests().stream() - .map(manifest -> Pair.of(snapshot1.snapshotId(), manifest)) - .collect(Collectors.toList())); + snapshotIdToManifests.addAll( + snapshot1.allManifests().stream() + .map(manifest -> Pair.of(snapshot1.snapshotId(), manifest)) + .collect(Collectors.toList())); - df.select("id", "data").write() + df.select("id", "data") + .write() .format("iceberg") .mode("append") .save(loadLocation(tableIdentifier)); @@ -1499,16 +1693,19 @@ public void testAllManifestTableSnapshotFiltering() throws Exception { table.refresh(); Snapshot snapshot2 = table.currentSnapshot(); Assert.assertEquals("Should have two manifests", 2, snapshot2.allManifests().size()); - snapshotIdToManifests.addAll(snapshot2.allManifests().stream() - .map(manifest -> Pair.of(snapshot2.snapshotId(), manifest)) - .collect(Collectors.toList())); + snapshotIdToManifests.addAll( + snapshot2.allManifests().stream() + .map(manifest -> 
Pair.of(snapshot2.snapshotId(), manifest)) + .collect(Collectors.toList())); // Add manifests that will not be selected - df.select("id", "data").write() + df.select("id", "data") + .write() .format("iceberg") .mode("append") .save(loadLocation(tableIdentifier)); - df.select("id", "data").write() + df.select("id", "data") + .write() .format("iceberg") .mode("append") .save(loadLocation(tableIdentifier)); @@ -1518,30 +1715,41 @@ public void testAllManifestTableSnapshotFiltering() throws Exception { snapshotIds.add(String.valueOf(snapshot2.snapshotId())); snapshotIds.toString(); - List actual = spark.read() - .format("iceberg") - .load(loadLocation(tableIdentifier, "all_manifests")) - .filter("reference_snapshot_id in " + snapshotIds) - .orderBy("path") - .collectAsList(); + List actual = + spark + .read() + .format("iceberg") + .load(loadLocation(tableIdentifier, "all_manifests")) + .filter("reference_snapshot_id in " + snapshotIds) + .orderBy("path") + .collectAsList(); table.refresh(); - List expected = snapshotIdToManifests.stream() - .map(snapshotManifest -> manifestRecord(manifestTable, snapshotManifest.first(), snapshotManifest.second())) - .collect(Collectors.toList()); + List expected = + snapshotIdToManifests.stream() + .map( + snapshotManifest -> + manifestRecord( + manifestTable, snapshotManifest.first(), snapshotManifest.second())) + .collect(Collectors.toList()); expected.sort(Comparator.comparing(o -> o.get("path").toString())); Assert.assertEquals("Manifests table should have 3 manifest rows", 3, actual.size()); for (int i = 0; i < expected.size(); i += 1) { - TestHelpers.assertEqualsSafe(manifestTable.schema().asStruct(), expected.get(i), actual.get(i)); + TestHelpers.assertEqualsSafe( + manifestTable.schema().asStruct(), expected.get(i), actual.get(i)); } } - private GenericData.Record manifestRecord(Table manifestTable, Long referenceSnapshotId, ManifestFile manifest) { - GenericRecordBuilder builder = new GenericRecordBuilder(AvroSchemaUtil.convert( - manifestTable.schema(), "manifests")); - GenericRecordBuilder summaryBuilder = new GenericRecordBuilder(AvroSchemaUtil.convert( - manifestTable.schema().findType("partition_summaries.element").asStructType(), "partition_summary")); + private GenericData.Record manifestRecord( + Table manifestTable, Long referenceSnapshotId, ManifestFile manifest) { + GenericRecordBuilder builder = + new GenericRecordBuilder(AvroSchemaUtil.convert(manifestTable.schema(), "manifests")); + GenericRecordBuilder summaryBuilder = + new GenericRecordBuilder( + AvroSchemaUtil.convert( + manifestTable.schema().findType("partition_summaries.element").asStructType(), + "partition_summary")); return builder .set("content", manifest.content().id()) .set("path", manifest.path()) @@ -1549,19 +1757,32 @@ private GenericData.Record manifestRecord(Table manifestTable, Long referenceSna .set("partition_spec_id", manifest.partitionSpecId()) .set("added_snapshot_id", manifest.snapshotId()) .set("added_data_files_count", manifest.content() == DATA ? manifest.addedFilesCount() : 0) - .set("existing_data_files_count", manifest.content() == DATA ? manifest.existingFilesCount() : 0) - .set("deleted_data_files_count", manifest.content() == DATA ? manifest.deletedFilesCount() : 0) - .set("added_delete_files_count", manifest.content() == DELETES ? manifest.addedFilesCount() : 0) - .set("existing_delete_files_count", manifest.content() == DELETES ? manifest.existingFilesCount() : 0) - .set("deleted_delete_files_count", manifest.content() == DELETES ? 
manifest.deletedFilesCount() : 0) - .set("partition_summaries", Lists.transform(manifest.partitions(), partition -> - summaryBuilder - .set("contains_null", false) - .set("contains_nan", false) - .set("lower_bound", "1") - .set("upper_bound", "1") - .build() - )) + .set( + "existing_data_files_count", + manifest.content() == DATA ? manifest.existingFilesCount() : 0) + .set( + "deleted_data_files_count", + manifest.content() == DATA ? manifest.deletedFilesCount() : 0) + .set( + "added_delete_files_count", + manifest.content() == DELETES ? manifest.addedFilesCount() : 0) + .set( + "existing_delete_files_count", + manifest.content() == DELETES ? manifest.existingFilesCount() : 0) + .set( + "deleted_delete_files_count", + manifest.content() == DELETES ? manifest.deletedFilesCount() : 0) + .set( + "partition_summaries", + Lists.transform( + manifest.partitions(), + partition -> + summaryBuilder + .set("contains_null", false) + .set("contains_nan", false) + .set("lower_bound", "1") + .set("upper_bound", "1") + .build())) .set("reference_snapshot_id", referenceSnapshotId) .build(); } @@ -1571,8 +1792,8 @@ private void asMetadataRecord(GenericData.Record file) { file.put(3, 0); // specId } - private PositionDeleteWriter newPositionDeleteWriter(Table table, PartitionSpec spec, - StructLike partition) { + private PositionDeleteWriter newPositionDeleteWriter( + Table table, PartitionSpec spec, StructLike partition) { OutputFileFactory fileFactory = OutputFileFactory.builderFor(table, 0, 0).build(); EncryptedOutputFile outputFile = fileFactory.newOutputFile(spec, partition); @@ -1580,9 +1801,13 @@ private PositionDeleteWriter newPositionDeleteWriter(Table table, P return fileWriterFactory.newPositionDeleteWriter(outputFile, spec, partition); } - private DeleteFile writePositionDeletes(Table table, PartitionSpec spec, StructLike partition, - Iterable> deletes) { - PositionDeleteWriter positionDeleteWriter = newPositionDeleteWriter(table, spec, partition); + private DeleteFile writePositionDeletes( + Table table, + PartitionSpec spec, + StructLike partition, + Iterable> deletes) { + PositionDeleteWriter positionDeleteWriter = + newPositionDeleteWriter(table, spec, partition); try (PositionDeleteWriter writer = positionDeleteWriter) { for (PositionDelete delete : deletes) { diff --git a/spark/v3.1/spark/src/test/java/org/apache/iceberg/spark/source/TestIcebergSpark.java b/spark/v3.1/spark/src/test/java/org/apache/iceberg/spark/source/TestIcebergSpark.java index c275daee5f7e..559668ee31a1 100644 --- a/spark/v3.1/spark/src/test/java/org/apache/iceberg/spark/source/TestIcebergSpark.java +++ b/spark/v3.1/spark/src/test/java/org/apache/iceberg/spark/source/TestIcebergSpark.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.spark.source; import java.math.BigDecimal; @@ -61,8 +60,8 @@ public void testRegisterIntegerBucketUDF() { IcebergSpark.registerBucketUDF(spark, "iceberg_bucket_int_16", DataTypes.IntegerType, 16); List results = spark.sql("SELECT iceberg_bucket_int_16(1)").collectAsList(); Assert.assertEquals(1, results.size()); - Assert.assertEquals((int) Transforms.bucket(Types.IntegerType.get(), 16).apply(1), - results.get(0).getInt(0)); + Assert.assertEquals( + (int) Transforms.bucket(Types.IntegerType.get(), 16).apply(1), results.get(0).getInt(0)); } @Test @@ -70,8 +69,8 @@ public void testRegisterShortBucketUDF() { IcebergSpark.registerBucketUDF(spark, "iceberg_bucket_short_16", DataTypes.ShortType, 16); List results = spark.sql("SELECT iceberg_bucket_short_16(1S)").collectAsList(); Assert.assertEquals(1, results.size()); - Assert.assertEquals((int) Transforms.bucket(Types.IntegerType.get(), 16).apply(1), - results.get(0).getInt(0)); + Assert.assertEquals( + (int) Transforms.bucket(Types.IntegerType.get(), 16).apply(1), results.get(0).getInt(0)); } @Test @@ -79,8 +78,8 @@ public void testRegisterByteBucketUDF() { IcebergSpark.registerBucketUDF(spark, "iceberg_bucket_byte_16", DataTypes.ByteType, 16); List results = spark.sql("SELECT iceberg_bucket_byte_16(1Y)").collectAsList(); Assert.assertEquals(1, results.size()); - Assert.assertEquals((int) Transforms.bucket(Types.IntegerType.get(), 16).apply(1), - results.get(0).getInt(0)); + Assert.assertEquals( + (int) Transforms.bucket(Types.IntegerType.get(), 16).apply(1), results.get(0).getInt(0)); } @Test @@ -88,8 +87,8 @@ public void testRegisterLongBucketUDF() { IcebergSpark.registerBucketUDF(spark, "iceberg_bucket_long_16", DataTypes.LongType, 16); List results = spark.sql("SELECT iceberg_bucket_long_16(1L)").collectAsList(); Assert.assertEquals(1, results.size()); - Assert.assertEquals((int) Transforms.bucket(Types.LongType.get(), 16).apply(1L), - results.get(0).getInt(0)); + Assert.assertEquals( + (int) Transforms.bucket(Types.LongType.get(), 16).apply(1L), results.get(0).getInt(0)); } @Test @@ -97,7 +96,8 @@ public void testRegisterStringBucketUDF() { IcebergSpark.registerBucketUDF(spark, "iceberg_bucket_string_16", DataTypes.StringType, 16); List results = spark.sql("SELECT iceberg_bucket_string_16('hello')").collectAsList(); Assert.assertEquals(1, results.size()); - Assert.assertEquals((int) Transforms.bucket(Types.StringType.get(), 16).apply("hello"), + Assert.assertEquals( + (int) Transforms.bucket(Types.StringType.get(), 16).apply("hello"), results.get(0).getInt(0)); } @@ -106,7 +106,8 @@ public void testRegisterCharBucketUDF() { IcebergSpark.registerBucketUDF(spark, "iceberg_bucket_char_16", new CharType(5), 16); List results = spark.sql("SELECT iceberg_bucket_char_16('hello')").collectAsList(); Assert.assertEquals(1, results.size()); - Assert.assertEquals((int) Transforms.bucket(Types.StringType.get(), 16).apply("hello"), + Assert.assertEquals( + (int) Transforms.bucket(Types.StringType.get(), 16).apply("hello"), results.get(0).getInt(0)); } @@ -115,73 +116,89 @@ public void testRegisterVarCharBucketUDF() { IcebergSpark.registerBucketUDF(spark, "iceberg_bucket_varchar_16", new VarcharType(5), 16); List results = spark.sql("SELECT iceberg_bucket_varchar_16('hello')").collectAsList(); Assert.assertEquals(1, results.size()); - Assert.assertEquals((int) Transforms.bucket(Types.StringType.get(), 16).apply("hello"), + Assert.assertEquals( + (int) Transforms.bucket(Types.StringType.get(), 16).apply("hello"), 
results.get(0).getInt(0)); } @Test public void testRegisterDateBucketUDF() { IcebergSpark.registerBucketUDF(spark, "iceberg_bucket_date_16", DataTypes.DateType, 16); - List results = spark.sql("SELECT iceberg_bucket_date_16(DATE '2021-06-30')").collectAsList(); + List results = + spark.sql("SELECT iceberg_bucket_date_16(DATE '2021-06-30')").collectAsList(); Assert.assertEquals(1, results.size()); - Assert.assertEquals((int) Transforms.bucket(Types.DateType.get(), 16) - .apply(DateTimeUtils.fromJavaDate(Date.valueOf("2021-06-30"))), + Assert.assertEquals( + (int) + Transforms.bucket(Types.DateType.get(), 16) + .apply(DateTimeUtils.fromJavaDate(Date.valueOf("2021-06-30"))), results.get(0).getInt(0)); } @Test public void testRegisterTimestampBucketUDF() { - IcebergSpark.registerBucketUDF(spark, "iceberg_bucket_timestamp_16", DataTypes.TimestampType, 16); + IcebergSpark.registerBucketUDF( + spark, "iceberg_bucket_timestamp_16", DataTypes.TimestampType, 16); List results = - spark.sql("SELECT iceberg_bucket_timestamp_16(TIMESTAMP '2021-06-30 00:00:00.000')").collectAsList(); + spark + .sql("SELECT iceberg_bucket_timestamp_16(TIMESTAMP '2021-06-30 00:00:00.000')") + .collectAsList(); Assert.assertEquals(1, results.size()); - Assert.assertEquals((int) Transforms.bucket(Types.TimestampType.withZone(), 16) - .apply(DateTimeUtils.fromJavaTimestamp(Timestamp.valueOf("2021-06-30 00:00:00.000"))), + Assert.assertEquals( + (int) + Transforms.bucket(Types.TimestampType.withZone(), 16) + .apply( + DateTimeUtils.fromJavaTimestamp(Timestamp.valueOf("2021-06-30 00:00:00.000"))), results.get(0).getInt(0)); } @Test public void testRegisterBinaryBucketUDF() { IcebergSpark.registerBucketUDF(spark, "iceberg_bucket_binary_16", DataTypes.BinaryType, 16); - List results = - spark.sql("SELECT iceberg_bucket_binary_16(X'0020001F')").collectAsList(); + List results = spark.sql("SELECT iceberg_bucket_binary_16(X'0020001F')").collectAsList(); Assert.assertEquals(1, results.size()); - Assert.assertEquals((int) Transforms.bucket(Types.BinaryType.get(), 16) - .apply(ByteBuffer.wrap(new byte[]{0x00, 0x20, 0x00, 0x1F})), + Assert.assertEquals( + (int) + Transforms.bucket(Types.BinaryType.get(), 16) + .apply(ByteBuffer.wrap(new byte[] {0x00, 0x20, 0x00, 0x1F})), results.get(0).getInt(0)); } @Test public void testRegisterDecimalBucketUDF() { IcebergSpark.registerBucketUDF(spark, "iceberg_bucket_decimal_16", new DecimalType(4, 2), 16); - List results = - spark.sql("SELECT iceberg_bucket_decimal_16(11.11)").collectAsList(); + List results = spark.sql("SELECT iceberg_bucket_decimal_16(11.11)").collectAsList(); Assert.assertEquals(1, results.size()); - Assert.assertEquals((int) Transforms.bucket(Types.DecimalType.of(4, 2), 16) - .apply(new BigDecimal("11.11")), + Assert.assertEquals( + (int) Transforms.bucket(Types.DecimalType.of(4, 2), 16).apply(new BigDecimal("11.11")), results.get(0).getInt(0)); } @Test public void testRegisterBooleanBucketUDF() { - Assertions.assertThatThrownBy(() -> - IcebergSpark.registerBucketUDF(spark, "iceberg_bucket_boolean_16", DataTypes.BooleanType, 16)) + Assertions.assertThatThrownBy( + () -> + IcebergSpark.registerBucketUDF( + spark, "iceberg_bucket_boolean_16", DataTypes.BooleanType, 16)) .isInstanceOf(IllegalArgumentException.class) .hasMessage("Cannot bucket by type: boolean"); } @Test public void testRegisterDoubleBucketUDF() { - Assertions.assertThatThrownBy(() -> - IcebergSpark.registerBucketUDF(spark, "iceberg_bucket_double_16", DataTypes.DoubleType, 16)) + Assertions.assertThatThrownBy( + () 
-> + IcebergSpark.registerBucketUDF( + spark, "iceberg_bucket_double_16", DataTypes.DoubleType, 16)) .isInstanceOf(IllegalArgumentException.class) .hasMessage("Cannot bucket by type: double"); } @Test public void testRegisterFloatBucketUDF() { - Assertions.assertThatThrownBy(() -> - IcebergSpark.registerBucketUDF(spark, "iceberg_bucket_float_16", DataTypes.FloatType, 16)) + Assertions.assertThatThrownBy( + () -> + IcebergSpark.registerBucketUDF( + spark, "iceberg_bucket_float_16", DataTypes.FloatType, 16)) .isInstanceOf(IllegalArgumentException.class) .hasMessage("Cannot bucket by type: float"); } diff --git a/spark/v3.1/spark/src/test/java/org/apache/iceberg/spark/source/TestIdentityPartitionData.java b/spark/v3.1/spark/src/test/java/org/apache/iceberg/spark/source/TestIdentityPartitionData.java index e07798301db8..7313c18cc09d 100644 --- a/spark/v3.1/spark/src/test/java/org/apache/iceberg/spark/source/TestIdentityPartitionData.java +++ b/spark/v3.1/spark/src/test/java/org/apache/iceberg/spark/source/TestIdentityPartitionData.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.source; import java.io.File; @@ -55,11 +54,11 @@ public class TestIdentityPartitionData extends SparkTestBase { @Parameterized.Parameters(name = "format = {0}, vectorized = {1}") public static Object[][] parameters() { return new Object[][] { - { "parquet", false }, - { "parquet", true }, - { "avro", false }, - { "orc", false }, - { "orc", true }, + {"parquet", false}, + {"parquet", true}, + {"avro", false}, + {"orc", false}, + {"orc", true}, }; } @@ -71,36 +70,37 @@ public TestIdentityPartitionData(String format, boolean vectorized) { this.vectorized = vectorized; } - private static final Schema LOG_SCHEMA = new Schema( - Types.NestedField.optional(1, "id", Types.IntegerType.get()), - Types.NestedField.optional(2, "date", Types.StringType.get()), - Types.NestedField.optional(3, "level", Types.StringType.get()), - Types.NestedField.optional(4, "message", Types.StringType.get()) - ); - - private static final List LOGS = ImmutableList.of( - LogMessage.debug("2020-02-02", "debug event 1"), - LogMessage.info("2020-02-02", "info event 1"), - LogMessage.debug("2020-02-02", "debug event 2"), - LogMessage.info("2020-02-03", "info event 2"), - LogMessage.debug("2020-02-03", "debug event 3"), - LogMessage.info("2020-02-03", "info event 3"), - LogMessage.error("2020-02-03", "error event 1"), - LogMessage.debug("2020-02-04", "debug event 4"), - LogMessage.warn("2020-02-04", "warn event 1"), - LogMessage.debug("2020-02-04", "debug event 5") - ); - - @Rule - public TemporaryFolder temp = new TemporaryFolder(); - - private PartitionSpec spec = PartitionSpec.builderFor(LOG_SCHEMA).identity("date").identity("level").build(); + private static final Schema LOG_SCHEMA = + new Schema( + Types.NestedField.optional(1, "id", Types.IntegerType.get()), + Types.NestedField.optional(2, "date", Types.StringType.get()), + Types.NestedField.optional(3, "level", Types.StringType.get()), + Types.NestedField.optional(4, "message", Types.StringType.get())); + + private static final List LOGS = + ImmutableList.of( + LogMessage.debug("2020-02-02", "debug event 1"), + LogMessage.info("2020-02-02", "info event 1"), + LogMessage.debug("2020-02-02", "debug event 2"), + LogMessage.info("2020-02-03", "info event 2"), + LogMessage.debug("2020-02-03", "debug event 3"), + LogMessage.info("2020-02-03", "info event 3"), + LogMessage.error("2020-02-03", "error event 1"), + 
LogMessage.debug("2020-02-04", "debug event 4"), + LogMessage.warn("2020-02-04", "warn event 1"), + LogMessage.debug("2020-02-04", "debug event 5")); + + @Rule public TemporaryFolder temp = new TemporaryFolder(); + + private PartitionSpec spec = + PartitionSpec.builderFor(LOG_SCHEMA).identity("date").identity("level").build(); private Table table = null; private Dataset logs = null; /** - * Use the Hive Based table to make Identity Partition Columns with no duplication of the data in the underlying - * parquet files. This makes sure that if the identity mapping fails, the test will also fail. + * Use the Hive Based table to make Identity Partition Columns with no duplication of the data in + * the underlying parquet files. This makes sure that if the identity mapping fails, the test will + * also fail. */ private void setupParquet() throws Exception { File location = temp.newFolder("logs"); @@ -109,15 +109,25 @@ private void setupParquet() throws Exception { Assert.assertTrue("Temp folder should exist", location.exists()); Map properties = ImmutableMap.of(TableProperties.DEFAULT_FILE_FORMAT, format); - this.logs = spark.createDataFrame(LOGS, LogMessage.class).select("id", "date", "level", "message"); + this.logs = + spark.createDataFrame(LOGS, LogMessage.class).select("id", "date", "level", "message"); spark.sql(String.format("DROP TABLE IF EXISTS %s", hiveTable)); - logs.orderBy("date", "level", "id").write().partitionBy("date", "level").format("parquet") - .option("path", hiveLocation.toString()).saveAsTable(hiveTable); - - this.table = TABLES.create(SparkSchemaUtil.schemaForTable(spark, hiveTable), - SparkSchemaUtil.specForTable(spark, hiveTable), properties, location.toString()); - - SparkTableUtil.importSparkTable(spark, new TableIdentifier(hiveTable), table, location.toString()); + logs.orderBy("date", "level", "id") + .write() + .partitionBy("date", "level") + .format("parquet") + .option("path", hiveLocation.toString()) + .saveAsTable(hiveTable); + + this.table = + TABLES.create( + SparkSchemaUtil.schemaForTable(spark, hiveTable), + SparkSchemaUtil.specForTable(spark, hiveTable), + properties, + location.toString()); + + SparkTableUtil.importSparkTable( + spark, new TableIdentifier(hiveTable), table, location.toString()); } @Before @@ -130,56 +140,70 @@ public void setupTable() throws Exception { Map properties = ImmutableMap.of(TableProperties.DEFAULT_FILE_FORMAT, format); this.table = TABLES.create(LOG_SCHEMA, spec, properties, location.toString()); - this.logs = spark.createDataFrame(LOGS, LogMessage.class).select("id", "date", "level", "message"); + this.logs = + spark.createDataFrame(LOGS, LogMessage.class).select("id", "date", "level", "message"); - logs.orderBy("date", "level", "id").write().format("iceberg").mode("append").save(location.toString()); + logs.orderBy("date", "level", "id") + .write() + .format("iceberg") + .mode("append") + .save(location.toString()); } } @Test public void testFullProjection() { List expected = logs.orderBy("id").collectAsList(); - List actual = spark.read().format("iceberg") - .option(SparkReadOptions.VECTORIZATION_ENABLED, String.valueOf(vectorized)) - .load(table.location()).orderBy("id") - .select("id", "date", "level", "message") - .collectAsList(); + List actual = + spark + .read() + .format("iceberg") + .option(SparkReadOptions.VECTORIZATION_ENABLED, String.valueOf(vectorized)) + .load(table.location()) + .orderBy("id") + .select("id", "date", "level", "message") + .collectAsList(); Assert.assertEquals("Rows should match", expected, 
actual); } @Test public void testProjections() { - String[][] cases = new String[][] { - // individual fields - new String[] { "date" }, - new String[] { "level" }, - new String[] { "message" }, - // field pairs - new String[] { "date", "message" }, - new String[] { "level", "message" }, - new String[] { "date", "level" }, - // out-of-order pairs - new String[] { "message", "date" }, - new String[] { "message", "level" }, - new String[] { "level", "date" }, - // full projection, different orderings - new String[] { "date", "level", "message" }, - new String[] { "level", "date", "message" }, - new String[] { "date", "message", "level" }, - new String[] { "level", "message", "date" }, - new String[] { "message", "date", "level" }, - new String[] { "message", "level", "date" } - }; + String[][] cases = + new String[][] { + // individual fields + new String[] {"date"}, + new String[] {"level"}, + new String[] {"message"}, + // field pairs + new String[] {"date", "message"}, + new String[] {"level", "message"}, + new String[] {"date", "level"}, + // out-of-order pairs + new String[] {"message", "date"}, + new String[] {"message", "level"}, + new String[] {"level", "date"}, + // full projection, different orderings + new String[] {"date", "level", "message"}, + new String[] {"level", "date", "message"}, + new String[] {"date", "message", "level"}, + new String[] {"level", "message", "date"}, + new String[] {"message", "date", "level"}, + new String[] {"message", "level", "date"} + }; for (String[] ordering : cases) { List expected = logs.select("id", ordering).orderBy("id").collectAsList(); - List actual = spark.read() - .format("iceberg") - .option(SparkReadOptions.VECTORIZATION_ENABLED, String.valueOf(vectorized)) - .load(table.location()) - .select("id", ordering).orderBy("id") - .collectAsList(); - Assert.assertEquals("Rows should match for ordering: " + Arrays.toString(ordering), expected, actual); + List actual = + spark + .read() + .format("iceberg") + .option(SparkReadOptions.VECTORIZATION_ENABLED, String.valueOf(vectorized)) + .load(table.location()) + .select("id", ordering) + .orderBy("id") + .collectAsList(); + Assert.assertEquals( + "Rows should match for ordering: " + Arrays.toString(ordering), expected, actual); } } } diff --git a/spark/v3.1/spark/src/test/java/org/apache/iceberg/spark/source/TestInternalRowWrapper.java b/spark/v3.1/spark/src/test/java/org/apache/iceberg/spark/source/TestInternalRowWrapper.java index 4ab01044046f..9e75145faff9 100644 --- a/spark/v3.1/spark/src/test/java/org/apache/iceberg/spark/source/TestInternalRowWrapper.java +++ b/spark/v3.1/spark/src/test/java/org/apache/iceberg/spark/source/TestInternalRowWrapper.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.spark.source; import java.util.Iterator; @@ -68,8 +67,10 @@ protected void generateAndValidate(Schema schema, AssertMethod assertMethod) { StructLike recordStructLike = recordWrapper.wrap(actual.next()); StructLike rowStructLike = rowWrapper.wrap(expected.next()); - assertMethod.assertEquals("Should have expected StructLike values", - actualWrapper.set(recordStructLike), expectedWrapper.set(rowStructLike)); + assertMethod.assertEquals( + "Should have expected StructLike values", + actualWrapper.set(recordStructLike), + expectedWrapper.set(rowStructLike)); } Assert.assertFalse("Shouldn't have more record", actual.hasNext()); diff --git a/spark/v3.1/spark/src/test/java/org/apache/iceberg/spark/source/TestMetadataTablesWithPartitionEvolution.java b/spark/v3.1/spark/src/test/java/org/apache/iceberg/spark/source/TestMetadataTablesWithPartitionEvolution.java index ea9818cae9d9..950c1c1b40ae 100644 --- a/spark/v3.1/spark/src/test/java/org/apache/iceberg/spark/source/TestMetadataTablesWithPartitionEvolution.java +++ b/spark/v3.1/spark/src/test/java/org/apache/iceberg/spark/source/TestMetadataTablesWithPartitionEvolution.java @@ -16,9 +16,18 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.source; +import static org.apache.iceberg.FileFormat.AVRO; +import static org.apache.iceberg.FileFormat.ORC; +import static org.apache.iceberg.FileFormat.PARQUET; +import static org.apache.iceberg.MetadataTableType.ALL_DATA_FILES; +import static org.apache.iceberg.MetadataTableType.ALL_ENTRIES; +import static org.apache.iceberg.MetadataTableType.ENTRIES; +import static org.apache.iceberg.MetadataTableType.FILES; +import static org.apache.iceberg.TableProperties.DEFAULT_FILE_FORMAT; +import static org.apache.iceberg.TableProperties.FORMAT_VERSION; + import java.util.Arrays; import java.util.List; import java.util.Map; @@ -52,48 +61,42 @@ import org.junit.runners.Parameterized; import org.junit.runners.Parameterized.Parameters; -import static org.apache.iceberg.FileFormat.AVRO; -import static org.apache.iceberg.FileFormat.ORC; -import static org.apache.iceberg.FileFormat.PARQUET; -import static org.apache.iceberg.MetadataTableType.ALL_DATA_FILES; -import static org.apache.iceberg.MetadataTableType.ALL_ENTRIES; -import static org.apache.iceberg.MetadataTableType.ENTRIES; -import static org.apache.iceberg.MetadataTableType.FILES; -import static org.apache.iceberg.TableProperties.DEFAULT_FILE_FORMAT; -import static org.apache.iceberg.TableProperties.FORMAT_VERSION; - @RunWith(Parameterized.class) public class TestMetadataTablesWithPartitionEvolution extends SparkCatalogTestBase { @Parameters(name = "catalog = {0}, impl = {1}, conf = {2}, fileFormat = {3}, formatVersion = {4}") public static Object[][] parameters() { return new Object[][] { - { "testhive", SparkCatalog.class.getName(), - ImmutableMap.of( - "type", "hive", - "default-namespace", "default" - ), - ORC, - formatVersion() - }, - { "testhadoop", SparkCatalog.class.getName(), - ImmutableMap.of( - "type", "hadoop" + { + "testhive", + SparkCatalog.class.getName(), + ImmutableMap.of( + "type", "hive", + "default-namespace", "default"), + ORC, + formatVersion() + }, + { + "testhadoop", + SparkCatalog.class.getName(), + ImmutableMap.of("type", "hadoop"), + PARQUET, + formatVersion() + }, + { + "spark_catalog", + SparkSessionCatalog.class.getName(), + ImmutableMap.of( + "type", "hive", + "default-namespace", "default", + "clients", "1", + "parquet-enabled", 
"false", + "cache-enabled", + "false" // Spark will delete tables using v1, leaving the cache out of sync ), - PARQUET, - formatVersion() - }, - { "spark_catalog", SparkSessionCatalog.class.getName(), - ImmutableMap.of( - "type", "hive", - "default-namespace", "default", - "clients", "1", - "parquet-enabled", "false", - "cache-enabled", "false" // Spark will delete tables using v1, leaving the cache out of sync - ), - AVRO, - formatVersion() - } + AVRO, + formatVersion() + } }; } @@ -106,8 +109,12 @@ private static int formatVersion() { private final FileFormat fileFormat; private final int formatVersion; - public TestMetadataTablesWithPartitionEvolution(String catalogName, String implementation, Map config, - FileFormat fileFormat, int formatVersion) { + public TestMetadataTablesWithPartitionEvolution( + String catalogName, + String implementation, + Map config, + FileFormat fileFormat, + int formatVersion) { super(catalogName, implementation, config); this.fileFormat = fileFormat; this.formatVersion = formatVersion; @@ -120,7 +127,9 @@ public void removeTable() { @Test public void testFilesMetadataTable() throws ParseException { - sql("CREATE TABLE %s (id bigint NOT NULL, category string, data string) USING iceberg", tableName); + sql( + "CREATE TABLE %s (id bigint NOT NULL, category string, data string) USING iceberg", + tableName); initTable(); sql("INSERT INTO TABLE %s VALUES (1, 'a1', 'b1')", tableName); @@ -128,28 +137,23 @@ public void testFilesMetadataTable() throws ParseException { // verify the metadata tables while the current spec is still unpartitioned for (MetadataTableType tableType : Arrays.asList(FILES, ALL_DATA_FILES)) { Dataset df = loadMetadataTable(tableType); - Assert.assertTrue("Partition must be skipped", df.schema().getFieldIndex("partition").isEmpty()); + Assert.assertTrue( + "Partition must be skipped", df.schema().getFieldIndex("partition").isEmpty()); } Table table = validationCatalog.loadTable(tableIdent); - table.updateSpec() - .addField("data") - .commit(); + table.updateSpec().addField("data").commit(); sql("REFRESH TABLE %s", tableName); sql("INSERT INTO TABLE %s VALUES (1, 'a1', 'b1')", tableName); // verify the metadata tables after adding the first partition column for (MetadataTableType tableType : Arrays.asList(FILES, ALL_DATA_FILES)) { assertPartitions( - ImmutableList.of(row(new Object[]{null}), row("b1")), - "STRUCT", - tableType); + ImmutableList.of(row(new Object[] {null}), row("b1")), "STRUCT", tableType); } - table.updateSpec() - .addField(Expressions.bucket("category", 8)) - .commit(); + table.updateSpec().addField(Expressions.bucket("category", 8)).commit(); sql("REFRESH TABLE %s", tableName); sql("INSERT INTO TABLE %s VALUES (1, 'a1', 'b1')", tableName); @@ -161,9 +165,7 @@ public void testFilesMetadataTable() throws ParseException { tableType); } - table.updateSpec() - .removeField("data") - .commit(); + table.updateSpec().removeField("data").commit(); sql("REFRESH TABLE %s", tableName); sql("INSERT INTO TABLE %s VALUES (1, 'a1', 'b1')", tableName); @@ -175,9 +177,7 @@ public void testFilesMetadataTable() throws ParseException { tableType); } - table.updateSpec() - .renameField("category_bucket_8", "category_bucket_8_another_name") - .commit(); + table.updateSpec().renameField("category_bucket_8", "category_bucket_8_another_name").commit(); sql("REFRESH TABLE %s", tableName); // verify the metadata tables after renaming the second partition column @@ -191,7 +191,9 @@ public void testFilesMetadataTable() throws ParseException { @Test public 
void testEntriesMetadataTable() throws ParseException { - sql("CREATE TABLE %s (id bigint NOT NULL, category string, data string) USING iceberg", tableName); + sql( + "CREATE TABLE %s (id bigint NOT NULL, category string, data string) USING iceberg", + tableName); initTable(); sql("INSERT INTO TABLE %s VALUES (1, 'a1', 'b1')", tableName); @@ -205,23 +207,17 @@ public void testEntriesMetadataTable() throws ParseException { Table table = validationCatalog.loadTable(tableIdent); - table.updateSpec() - .addField("data") - .commit(); + table.updateSpec().addField("data").commit(); sql("REFRESH TABLE %s", tableName); sql("INSERT INTO TABLE %s VALUES (1, 'a1', 'b1')", tableName); // verify the metadata tables after adding the first partition column for (MetadataTableType tableType : Arrays.asList(ENTRIES, ALL_ENTRIES)) { assertPartitions( - ImmutableList.of(row(new Object[]{null}), row("b1")), - "STRUCT", - tableType); + ImmutableList.of(row(new Object[] {null}), row("b1")), "STRUCT", tableType); } - table.updateSpec() - .addField(Expressions.bucket("category", 8)) - .commit(); + table.updateSpec().addField(Expressions.bucket("category", 8)).commit(); sql("REFRESH TABLE %s", tableName); sql("INSERT INTO TABLE %s VALUES (1, 'a1', 'b1')", tableName); @@ -233,9 +229,7 @@ public void testEntriesMetadataTable() throws ParseException { tableType); } - table.updateSpec() - .removeField("data") - .commit(); + table.updateSpec().removeField("data").commit(); sql("REFRESH TABLE %s", tableName); sql("INSERT INTO TABLE %s VALUES (1, 'a1', 'b1')", tableName); @@ -247,9 +241,7 @@ public void testEntriesMetadataTable() throws ParseException { tableType); } - table.updateSpec() - .renameField("category_bucket_8", "category_bucket_8_another_name") - .commit(); + table.updateSpec().renameField("category_bucket_8", "category_bucket_8_another_name").commit(); sql("REFRESH TABLE %s", tableName); // verify the metadata tables after renaming the second partition column @@ -263,15 +255,19 @@ public void testEntriesMetadataTable() throws ParseException { @Test public void testMetadataTablesWithUnknownTransforms() { - sql("CREATE TABLE %s (id bigint NOT NULL, category string, data string) USING iceberg", tableName); + sql( + "CREATE TABLE %s (id bigint NOT NULL, category string, data string) USING iceberg", + tableName); initTable(); sql("INSERT INTO TABLE %s VALUES (1, 'a1', 'b1')", tableName); Table table = validationCatalog.loadTable(tableIdent); - PartitionSpec unknownSpec = PartitionSpecParser.fromJson(table.schema(), - "{ \"spec-id\": 1, \"fields\": [ { \"name\": \"id_zero\", \"transform\": \"zero\", \"source-id\": 1 } ] }"); + PartitionSpec unknownSpec = + PartitionSpecParser.fromJson( + table.schema(), + "{ \"spec-id\": 1, \"fields\": [ { \"name\": \"id_zero\", \"transform\": \"zero\", \"source-id\": 1 } ] }"); // replace the table spec to include an unknown transform TableOperations ops = ((HasTableOperations) table).operations(); @@ -281,14 +277,17 @@ public void testMetadataTablesWithUnknownTransforms() { sql("REFRESH TABLE %s", tableName); for (MetadataTableType tableType : Arrays.asList(FILES, ALL_DATA_FILES, ENTRIES, ALL_ENTRIES)) { - AssertHelpers.assertThrows("Should complain about the partition type", - ValidationException.class, "Cannot build table partition type, unknown transforms", + AssertHelpers.assertThrows( + "Should complain about the partition type", + ValidationException.class, + "Cannot build table partition type, unknown transforms", () -> loadMetadataTable(tableType)); } } - private void 
assertPartitions(List expectedPartitions, String expectedTypeAsString, - MetadataTableType tableType) throws ParseException { + private void assertPartitions( + List expectedPartitions, String expectedTypeAsString, MetadataTableType tableType) + throws ParseException { Dataset df = loadMetadataTable(tableType); DataType expectedType = spark.sessionState().sqlParser().parseDataType(expectedTypeAsString); @@ -313,18 +312,18 @@ private void assertPartitions(List expectedPartitions, String expected switch (tableType) { case FILES: case ALL_DATA_FILES: - List actualFilesPartitions = df.orderBy("partition") - .select("partition.*") - .collectAsList(); - assertEquals("Partitions must match", expectedPartitions, rowsToJava(actualFilesPartitions)); + List actualFilesPartitions = + df.orderBy("partition").select("partition.*").collectAsList(); + assertEquals( + "Partitions must match", expectedPartitions, rowsToJava(actualFilesPartitions)); break; case ENTRIES: case ALL_ENTRIES: - List actualEntriesPartitions = df.orderBy("data_file.partition") - .select("data_file.partition.*") - .collectAsList(); - assertEquals("Partitions must match", expectedPartitions, rowsToJava(actualEntriesPartitions)); + List actualEntriesPartitions = + df.orderBy("data_file.partition").select("data_file.partition.*").collectAsList(); + assertEquals( + "Partitions must match", expectedPartitions, rowsToJava(actualEntriesPartitions)); break; default: @@ -337,7 +336,9 @@ private Dataset loadMetadataTable(MetadataTableType tableType) { } private void initTable() { - sql("ALTER TABLE %s SET TBLPROPERTIES('%s' '%s')", tableName, DEFAULT_FILE_FORMAT, fileFormat.name()); + sql( + "ALTER TABLE %s SET TBLPROPERTIES('%s' '%s')", + tableName, DEFAULT_FILE_FORMAT, fileFormat.name()); sql("ALTER TABLE %s SET TBLPROPERTIES('%s' '%d')", tableName, FORMAT_VERSION, formatVersion); } } diff --git a/spark/v3.1/spark/src/test/java/org/apache/iceberg/spark/source/TestParquetScan.java b/spark/v3.1/spark/src/test/java/org/apache/iceberg/spark/source/TestParquetScan.java index adfe8c7d3649..f585ed360f95 100644 --- a/spark/v3.1/spark/src/test/java/org/apache/iceberg/spark/source/TestParquetScan.java +++ b/spark/v3.1/spark/src/test/java/org/apache/iceberg/spark/source/TestParquetScan.java @@ -16,9 +16,10 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.spark.source; +import static org.apache.iceberg.Files.localOutput; + import java.io.File; import java.io.IOException; import java.util.List; @@ -52,8 +53,6 @@ import org.junit.runner.RunWith; import org.junit.runners.Parameterized; -import static org.apache.iceberg.Files.localOutput; - @RunWith(Parameterized.class) public class TestParquetScan extends AvroDataTest { private static final Configuration CONF = new Configuration(); @@ -72,12 +71,11 @@ public static void stopSpark() { currentSpark.stop(); } - @Rule - public TemporaryFolder temp = new TemporaryFolder(); + @Rule public TemporaryFolder temp = new TemporaryFolder(); @Parameterized.Parameters(name = "vectorized = {0}") public static Object[] parameters() { - return new Object[] { false, true }; + return new Object[] {false, true}; } private final boolean vectorized; @@ -88,18 +86,20 @@ public TestParquetScan(boolean vectorized) { @Override protected void writeAndValidate(Schema schema) throws IOException { - Assume.assumeTrue("Cannot handle non-string map keys in parquet-avro", - null == TypeUtil.find( - schema, - type -> type.isMapType() && type.asMapType().keyType() != Types.StringType.get())); + Assume.assumeTrue( + "Cannot handle non-string map keys in parquet-avro", + null + == TypeUtil.find( + schema, + type -> type.isMapType() && type.asMapType().keyType() != Types.StringType.get())); File parent = temp.newFolder("parquet"); File location = new File(parent, "test"); File dataFolder = new File(location, "data"); dataFolder.mkdirs(); - File parquetFile = new File(dataFolder, - FileFormat.PARQUET.addExtension(UUID.randomUUID().toString())); + File parquetFile = + new File(dataFolder, FileFormat.PARQUET.addExtension(UUID.randomUUID().toString())); HadoopTables tables = new HadoopTables(CONF); Table table = tables.create(schema, PartitionSpec.unpartitioned(), location.toString()); @@ -110,24 +110,25 @@ protected void writeAndValidate(Schema schema) throws IOException { List expected = RandomData.generateList(tableSchema, 100, 1L); - try (FileAppender writer = Parquet.write(localOutput(parquetFile)) - .schema(tableSchema) - .build()) { + try (FileAppender writer = + Parquet.write(localOutput(parquetFile)).schema(tableSchema).build()) { writer.addAll(expected); } - DataFile file = DataFiles.builder(PartitionSpec.unpartitioned()) - .withFileSizeInBytes(parquetFile.length()) - .withPath(parquetFile.toString()) - .withRecordCount(100) - .build(); + DataFile file = + DataFiles.builder(PartitionSpec.unpartitioned()) + .withFileSizeInBytes(parquetFile.length()) + .withPath(parquetFile.toString()) + .withRecordCount(100) + .build(); table.newAppend().appendFile(file).commit(); - table.updateProperties().set(TableProperties.PARQUET_VECTORIZATION_ENABLED, String.valueOf(vectorized)).commit(); + table + .updateProperties() + .set(TableProperties.PARQUET_VECTORIZATION_ENABLED, String.valueOf(vectorized)) + .commit(); - Dataset df = spark.read() - .format("iceberg") - .load(location.toString()); + Dataset df = spark.read().format("iceberg").load(location.toString()); List rows = df.collectAsList(); Assert.assertEquals("Should contain 100 rows", 100, rows.size()); diff --git a/spark/v3.1/spark/src/test/java/org/apache/iceberg/spark/source/TestPartitionPruning.java b/spark/v3.1/spark/src/test/java/org/apache/iceberg/spark/source/TestPartitionPruning.java index 24f7b69e1dc5..ffe21432f00c 100644 --- a/spark/v3.1/spark/src/test/java/org/apache/iceberg/spark/source/TestPartitionPruning.java +++ 
b/spark/v3.1/spark/src/test/java/org/apache/iceberg/spark/source/TestPartitionPruning.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.source; import java.io.File; @@ -78,11 +77,11 @@ public class TestPartitionPruning { @Parameterized.Parameters(name = "format = {0}, vectorized = {1}") public static Object[][] parameters() { return new Object[][] { - { "parquet", false }, - { "parquet", true }, - { "avro", false }, - { "orc", false }, - { "orc", true } + {"parquet", false}, + {"parquet", true}, + {"avro", false}, + {"orc", false}, + {"orc", true} }; } @@ -97,9 +96,12 @@ public TestPartitionPruning(String format, boolean vectorized) { private static SparkSession spark = null; private static JavaSparkContext sparkContext = null; - private static Transform bucketTransform = Transforms.bucket(Types.IntegerType.get(), 3); - private static Transform truncateTransform = Transforms.truncate(Types.StringType.get(), 5); - private static Transform hourTransform = Transforms.hour(Types.TimestampType.withoutZone()); + private static Transform bucketTransform = + Transforms.bucket(Types.IntegerType.get(), 3); + private static Transform truncateTransform = + Transforms.truncate(Types.StringType.get(), 5); + private static Transform hourTransform = + Transforms.hour(Types.TimestampType.withoutZone()); @BeforeClass public static void startSpark() { @@ -110,12 +112,21 @@ public static void startSpark() { CONF.set(optionKey, CountOpenLocalFileSystem.class.getName()); spark.conf().set(optionKey, CountOpenLocalFileSystem.class.getName()); spark.conf().set("spark.sql.session.timeZone", "UTC"); - spark.udf().register("bucket3", (Integer num) -> bucketTransform.apply(num), DataTypes.IntegerType); - spark.udf().register("truncate5", (String str) -> truncateTransform.apply(str), DataTypes.StringType); + spark + .udf() + .register("bucket3", (Integer num) -> bucketTransform.apply(num), DataTypes.IntegerType); + spark + .udf() + .register("truncate5", (String str) -> truncateTransform.apply(str), DataTypes.StringType); // NOTE: date transforms take the type long, not Timestamp - spark.udf().register("hour", (Timestamp ts) -> hourTransform.apply( - org.apache.spark.sql.catalyst.util.DateTimeUtils.fromJavaTimestamp(ts)), - DataTypes.IntegerType); + spark + .udf() + .register( + "hour", + (Timestamp ts) -> + hourTransform.apply( + org.apache.spark.sql.catalyst.util.DateTimeUtils.fromJavaTimestamp(ts)), + DataTypes.IntegerType); } @AfterClass @@ -125,70 +136,70 @@ public static void stopSpark() { currentSpark.stop(); } - private static final Schema LOG_SCHEMA = new Schema( - Types.NestedField.optional(1, "id", Types.IntegerType.get()), - Types.NestedField.optional(2, "date", Types.StringType.get()), - Types.NestedField.optional(3, "level", Types.StringType.get()), - Types.NestedField.optional(4, "message", Types.StringType.get()), - Types.NestedField.optional(5, "timestamp", Types.TimestampType.withZone()) - ); - - private static final List LOGS = ImmutableList.of( - LogMessage.debug("2020-02-02", "debug event 1", getInstant("2020-02-02T00:00:00")), - LogMessage.info("2020-02-02", "info event 1", getInstant("2020-02-02T01:00:00")), - LogMessage.debug("2020-02-02", "debug event 2", getInstant("2020-02-02T02:00:00")), - LogMessage.info("2020-02-03", "info event 2", getInstant("2020-02-03T00:00:00")), - LogMessage.debug("2020-02-03", "debug event 3", getInstant("2020-02-03T01:00:00")), - LogMessage.info("2020-02-03", "info event 3", 
getInstant("2020-02-03T02:00:00")), - LogMessage.error("2020-02-03", "error event 1", getInstant("2020-02-03T03:00:00")), - LogMessage.debug("2020-02-04", "debug event 4", getInstant("2020-02-04T01:00:00")), - LogMessage.warn("2020-02-04", "warn event 1", getInstant("2020-02-04T02:00:00")), - LogMessage.debug("2020-02-04", "debug event 5", getInstant("2020-02-04T03:00:00")) - ); + private static final Schema LOG_SCHEMA = + new Schema( + Types.NestedField.optional(1, "id", Types.IntegerType.get()), + Types.NestedField.optional(2, "date", Types.StringType.get()), + Types.NestedField.optional(3, "level", Types.StringType.get()), + Types.NestedField.optional(4, "message", Types.StringType.get()), + Types.NestedField.optional(5, "timestamp", Types.TimestampType.withZone())); + + private static final List LOGS = + ImmutableList.of( + LogMessage.debug("2020-02-02", "debug event 1", getInstant("2020-02-02T00:00:00")), + LogMessage.info("2020-02-02", "info event 1", getInstant("2020-02-02T01:00:00")), + LogMessage.debug("2020-02-02", "debug event 2", getInstant("2020-02-02T02:00:00")), + LogMessage.info("2020-02-03", "info event 2", getInstant("2020-02-03T00:00:00")), + LogMessage.debug("2020-02-03", "debug event 3", getInstant("2020-02-03T01:00:00")), + LogMessage.info("2020-02-03", "info event 3", getInstant("2020-02-03T02:00:00")), + LogMessage.error("2020-02-03", "error event 1", getInstant("2020-02-03T03:00:00")), + LogMessage.debug("2020-02-04", "debug event 4", getInstant("2020-02-04T01:00:00")), + LogMessage.warn("2020-02-04", "warn event 1", getInstant("2020-02-04T02:00:00")), + LogMessage.debug("2020-02-04", "debug event 5", getInstant("2020-02-04T03:00:00"))); private static Instant getInstant(String timestampWithoutZone) { - Long epochMicros = (Long) Literal.of(timestampWithoutZone).to(Types.TimestampType.withoutZone()).value(); + Long epochMicros = + (Long) Literal.of(timestampWithoutZone).to(Types.TimestampType.withoutZone()).value(); return Instant.ofEpochMilli(TimeUnit.MICROSECONDS.toMillis(epochMicros)); } - @Rule - public TemporaryFolder temp = new TemporaryFolder(); + @Rule public TemporaryFolder temp = new TemporaryFolder(); - private PartitionSpec spec = PartitionSpec.builderFor(LOG_SCHEMA) - .identity("date") - .identity("level") - .bucket("id", 3) - .truncate("message", 5) - .hour("timestamp") - .build(); + private PartitionSpec spec = + PartitionSpec.builderFor(LOG_SCHEMA) + .identity("date") + .identity("level") + .bucket("id", 3) + .truncate("message", 5) + .hour("timestamp") + .build(); @Test public void testPartitionPruningIdentityString() { String filterCond = "date >= '2020-02-03' AND level = 'DEBUG'"; - Predicate partCondition = (Row r) -> { - String date = r.getString(0); - String level = r.getString(1); - return date.compareTo("2020-02-03") >= 0 && level.equals("DEBUG"); - }; + Predicate partCondition = + (Row r) -> { + String date = r.getString(0); + String level = r.getString(1); + return date.compareTo("2020-02-03") >= 0 && level.equals("DEBUG"); + }; runTest(filterCond, partCondition); } @Test public void testPartitionPruningBucketingInteger() { - final int[] ids = new int[]{ - LOGS.get(3).getId(), - LOGS.get(7).getId() - }; - String condForIds = Arrays.stream(ids).mapToObj(String::valueOf) - .collect(Collectors.joining(",", "(", ")")); + final int[] ids = new int[] {LOGS.get(3).getId(), LOGS.get(7).getId()}; + String condForIds = + Arrays.stream(ids).mapToObj(String::valueOf).collect(Collectors.joining(",", "(", ")")); String filterCond = "id in " + condForIds; 
- Predicate partCondition = (Row r) -> { - int bucketId = r.getInt(2); - Set buckets = Arrays.stream(ids).map(bucketTransform::apply) - .boxed().collect(Collectors.toSet()); - return buckets.contains(bucketId); - }; + Predicate partCondition = + (Row r) -> { + int bucketId = r.getInt(2); + Set buckets = + Arrays.stream(ids).map(bucketTransform::apply).boxed().collect(Collectors.toSet()); + return buckets.contains(bucketId); + }; runTest(filterCond, partCondition); } @@ -196,10 +207,11 @@ public void testPartitionPruningBucketingInteger() { @Test public void testPartitionPruningTruncatedString() { String filterCond = "message like 'info event%'"; - Predicate partCondition = (Row r) -> { - String truncatedMessage = r.getString(3); - return truncatedMessage.equals("info "); - }; + Predicate partCondition = + (Row r) -> { + String truncatedMessage = r.getString(3); + return truncatedMessage.equals("info "); + }; runTest(filterCond, partCondition); } @@ -207,10 +219,11 @@ public void testPartitionPruningTruncatedString() { @Test public void testPartitionPruningTruncatedStringComparingValueShorterThanPartitionValue() { String filterCond = "message like 'inf%'"; - Predicate partCondition = (Row r) -> { - String truncatedMessage = r.getString(3); - return truncatedMessage.startsWith("inf"); - }; + Predicate partCondition = + (Row r) -> { + String truncatedMessage = r.getString(3); + return truncatedMessage.startsWith("inf"); + }; runTest(filterCond, partCondition); } @@ -219,17 +232,20 @@ public void testPartitionPruningTruncatedStringComparingValueShorterThanPartitio public void testPartitionPruningHourlyPartition() { String filterCond; if (spark.version().startsWith("2")) { - // Looks like from Spark 2 we need to compare timestamp with timestamp to push down the filter. + // Looks like from Spark 2 we need to compare timestamp with timestamp to push down the + // filter. 
filterCond = "timestamp >= to_timestamp('2020-02-03T01:00:00')"; } else { filterCond = "timestamp >= '2020-02-03T01:00:00'"; } - Predicate partCondition = (Row r) -> { - int hourValue = r.getInt(4); - Instant instant = getInstant("2020-02-03T01:00:00"); - Integer hourValueToFilter = hourTransform.apply(TimeUnit.MILLISECONDS.toMicros(instant.toEpochMilli())); - return hourValue >= hourValueToFilter; - }; + Predicate partCondition = + (Row r) -> { + int hourValue = r.getInt(4); + Instant instant = getInstant("2020-02-03T01:00:00"); + Integer hourValueToFilter = + hourTransform.apply(TimeUnit.MILLISECONDS.toMicros(instant.toEpochMilli())); + return hourValue >= hourValueToFilter; + }; runTest(filterCond, partCondition); } @@ -242,24 +258,26 @@ private void runTest(String filterCond, Predicate partCondition) { Dataset logs = createTestDataset(); saveTestDatasetToTable(logs, table); - List expected = logs - .select("id", "date", "level", "message", "timestamp") - .filter(filterCond) - .orderBy("id") - .collectAsList(); + List expected = + logs.select("id", "date", "level", "message", "timestamp") + .filter(filterCond) + .orderBy("id") + .collectAsList(); Assert.assertFalse("Expected rows should be not empty", expected.isEmpty()); // remove records which may be recorded during storing to table CountOpenLocalFileSystem.resetRecordsInPathPrefix(originTableLocation.getAbsolutePath()); - List actual = spark.read() - .format("iceberg") - .option(SparkReadOptions.VECTORIZATION_ENABLED, String.valueOf(vectorized)) - .load(table.location()) - .select("id", "date", "level", "message", "timestamp") - .filter(filterCond) - .orderBy("id") - .collectAsList(); + List actual = + spark + .read() + .format("iceberg") + .option(SparkReadOptions.VECTORIZATION_ENABLED, String.valueOf(vectorized)) + .load(table.location()) + .select("id", "date", "level", "message", "timestamp") + .filter(filterCond) + .orderBy("id") + .collectAsList(); Assert.assertFalse("Actual rows should not be empty", actual.isEmpty()); Assert.assertEquals("Rows should match", expected, actual); @@ -282,40 +300,59 @@ private Table createTable(File originTableLocation) { } private Dataset createTestDataset() { - List rows = LOGS.stream().map(logMessage -> { - Object[] underlying = new Object[] { - logMessage.getId(), - UTF8String.fromString(logMessage.getDate()), - UTF8String.fromString(logMessage.getLevel()), - UTF8String.fromString(logMessage.getMessage()), - // discard the nanoseconds part to simplify - TimeUnit.MILLISECONDS.toMicros(logMessage.getTimestamp().toEpochMilli()) - }; - return new GenericInternalRow(underlying); - }).collect(Collectors.toList()); + List rows = + LOGS.stream() + .map( + logMessage -> { + Object[] underlying = + new Object[] { + logMessage.getId(), + UTF8String.fromString(logMessage.getDate()), + UTF8String.fromString(logMessage.getLevel()), + UTF8String.fromString(logMessage.getMessage()), + // discard the nanoseconds part to simplify + TimeUnit.MILLISECONDS.toMicros(logMessage.getTimestamp().toEpochMilli()) + }; + return new GenericInternalRow(underlying); + }) + .collect(Collectors.toList()); JavaRDD rdd = sparkContext.parallelize(rows); - Dataset df = spark.internalCreateDataFrame(JavaRDD.toRDD(rdd), SparkSchemaUtil.convert(LOG_SCHEMA), false); - - return df - .selectExpr("id", "date", "level", "message", "timestamp") - .selectExpr("id", "date", "level", "message", "timestamp", "bucket3(id) AS bucket_id", - "truncate5(message) AS truncated_message", "hour(timestamp) AS ts_hour"); + Dataset df = + 
spark.internalCreateDataFrame( + JavaRDD.toRDD(rdd), SparkSchemaUtil.convert(LOG_SCHEMA), false); + + return df.selectExpr("id", "date", "level", "message", "timestamp") + .selectExpr( + "id", + "date", + "level", + "message", + "timestamp", + "bucket3(id) AS bucket_id", + "truncate5(message) AS truncated_message", + "hour(timestamp) AS ts_hour"); } private void saveTestDatasetToTable(Dataset logs, Table table) { logs.orderBy("date", "level", "bucket_id", "truncated_message", "ts_hour") .select("id", "date", "level", "message", "timestamp") - .write().format("iceberg").mode("append").save(table.location()); + .write() + .format("iceberg") + .mode("append") + .save(table.location()); } - private void assertAccessOnDataFiles(File originTableLocation, Table table, Predicate partCondition) { + private void assertAccessOnDataFiles( + File originTableLocation, Table table, Predicate partCondition) { // only use files in current table location to avoid side-effects on concurrent test runs - Set readFilesInQuery = CountOpenLocalFileSystem.pathToNumOpenCalled.keySet() - .stream().filter(path -> path.startsWith(originTableLocation.getAbsolutePath())) - .collect(Collectors.toSet()); + Set readFilesInQuery = + CountOpenLocalFileSystem.pathToNumOpenCalled.keySet().stream() + .filter(path -> path.startsWith(originTableLocation.getAbsolutePath())) + .collect(Collectors.toSet()); - List files = spark.read().format("iceberg").load(table.location() + "#files").collectAsList(); + List files = + spark.read().format("iceberg").load(table.location() + "#files").collectAsList(); Set filesToRead = extractFilePathsMatchingConditionOnPartition(files, partCondition); Set filesToNotRead = extractFilePathsNotIn(files, filesToRead); @@ -325,37 +362,51 @@ private void assertAccessOnDataFiles(File originTableLocation, Table table, Pred Assert.assertFalse("The query should prune some data files.", filesToNotRead.isEmpty()); - // We don't check "all" data files bound to the condition are being read, as data files can be pruned on + // We don't check "all" data files bound to the condition are being read, as data files can be + // pruned on // other conditions like lower/upper bound of columns. - Assert.assertFalse("Some of data files in partition range should be read. " + - "Read files in query: " + readFilesInQuery + " / data files in partition range: " + filesToRead, + Assert.assertFalse( + "Some of data files in partition range should be read. " + + "Read files in query: " + + readFilesInQuery + + " / data files in partition range: " + + filesToRead, Sets.intersection(filesToRead, readFilesInQuery).isEmpty()); // Data files which aren't bound to the condition shouldn't be read. - Assert.assertTrue("Data files outside of partition range should not be read. " + - "Read files in query: " + readFilesInQuery + " / data files outside of partition range: " + filesToNotRead, + Assert.assertTrue( + "Data files outside of partition range should not be read. 
" + + "Read files in query: " + + readFilesInQuery + + " / data files outside of partition range: " + + filesToNotRead, Sets.intersection(filesToNotRead, readFilesInQuery).isEmpty()); } - private Set extractFilePathsMatchingConditionOnPartition(List files, Predicate condition) { + private Set extractFilePathsMatchingConditionOnPartition( + List files, Predicate condition) { // idx 1: file_path, idx 3: partition return files.stream() - .filter(r -> { - Row partition = r.getStruct(4); - return condition.test(partition); - }).map(r -> CountOpenLocalFileSystem.stripScheme(r.getString(1))) + .filter( + r -> { + Row partition = r.getStruct(4); + return condition.test(partition); + }) + .map(r -> CountOpenLocalFileSystem.stripScheme(r.getString(1))) .collect(Collectors.toSet()); } private Set extractFilePathsNotIn(List files, Set filePaths) { - Set allFilePaths = files.stream().map(r -> CountOpenLocalFileSystem.stripScheme(r.getString(1))) - .collect(Collectors.toSet()); + Set allFilePaths = + files.stream() + .map(r -> CountOpenLocalFileSystem.stripScheme(r.getString(1))) + .collect(Collectors.toSet()); return Sets.newHashSet(Sets.symmetricDifference(allFilePaths, filePaths)); } public static class CountOpenLocalFileSystem extends RawLocalFileSystem { - public static String scheme = String.format("TestIdentityPartitionData%dfs", - new Random().nextInt()); + public static String scheme = + String.format("TestIdentityPartitionData%dfs", new Random().nextInt()); public static Map pathToNumOpenCalled = Maps.newConcurrentMap(); public static String convertPath(String absPath) { @@ -401,13 +452,15 @@ public String getScheme() { @Override public FSDataInputStream open(Path f, int bufferSize) throws IOException { String path = f.toUri().getPath(); - pathToNumOpenCalled.compute(path, (ignored, v) -> { - if (v == null) { - return 1L; - } else { - return v + 1; - } - }); + pathToNumOpenCalled.compute( + path, + (ignored, v) -> { + if (v == null) { + return 1L; + } else { + return v + 1; + } + }); return super.open(f, bufferSize); } } diff --git a/spark/v3.1/spark/src/test/java/org/apache/iceberg/spark/source/TestPartitionValues.java b/spark/v3.1/spark/src/test/java/org/apache/iceberg/spark/source/TestPartitionValues.java index f63181766852..fedac9aee3ac 100644 --- a/spark/v3.1/spark/src/test/java/org/apache/iceberg/spark/source/TestPartitionValues.java +++ b/spark/v3.1/spark/src/test/java/org/apache/iceberg/spark/source/TestPartitionValues.java @@ -16,9 +16,11 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.spark.source; +import static org.apache.iceberg.types.Types.NestedField.optional; +import static org.apache.iceberg.types.Types.NestedField.required; + import java.io.File; import java.util.List; import org.apache.avro.generic.GenericData; @@ -56,46 +58,43 @@ import org.junit.runner.RunWith; import org.junit.runners.Parameterized; -import static org.apache.iceberg.types.Types.NestedField.optional; -import static org.apache.iceberg.types.Types.NestedField.required; - @RunWith(Parameterized.class) public class TestPartitionValues { @Parameterized.Parameters(name = "format = {0}, vectorized = {1}") public static Object[][] parameters() { return new Object[][] { - { "parquet", false }, - { "parquet", true }, - { "avro", false }, - { "orc", false }, - { "orc", true } + {"parquet", false}, + {"parquet", true}, + {"avro", false}, + {"orc", false}, + {"orc", true} }; } - private static final Schema SUPPORTED_PRIMITIVES = new Schema( - required(100, "id", Types.LongType.get()), - required(101, "data", Types.StringType.get()), - required(102, "b", Types.BooleanType.get()), - required(103, "i", Types.IntegerType.get()), - required(104, "l", Types.LongType.get()), - required(105, "f", Types.FloatType.get()), - required(106, "d", Types.DoubleType.get()), - required(107, "date", Types.DateType.get()), - required(108, "ts", Types.TimestampType.withZone()), - required(110, "s", Types.StringType.get()), - required(113, "bytes", Types.BinaryType.get()), - required(114, "dec_9_0", Types.DecimalType.of(9, 0)), - required(115, "dec_11_2", Types.DecimalType.of(11, 2)), - required(116, "dec_38_10", Types.DecimalType.of(38, 10)) // spark's maximum precision - ); - - private static final Schema SIMPLE_SCHEMA = new Schema( - optional(1, "id", Types.IntegerType.get()), - optional(2, "data", Types.StringType.get())); - - private static final PartitionSpec SPEC = PartitionSpec.builderFor(SIMPLE_SCHEMA) - .identity("data") - .build(); + private static final Schema SUPPORTED_PRIMITIVES = + new Schema( + required(100, "id", Types.LongType.get()), + required(101, "data", Types.StringType.get()), + required(102, "b", Types.BooleanType.get()), + required(103, "i", Types.IntegerType.get()), + required(104, "l", Types.LongType.get()), + required(105, "f", Types.FloatType.get()), + required(106, "d", Types.DoubleType.get()), + required(107, "date", Types.DateType.get()), + required(108, "ts", Types.TimestampType.withZone()), + required(110, "s", Types.StringType.get()), + required(113, "bytes", Types.BinaryType.get()), + required(114, "dec_9_0", Types.DecimalType.of(9, 0)), + required(115, "dec_11_2", Types.DecimalType.of(11, 2)), + required(116, "dec_38_10", Types.DecimalType.of(38, 10)) // spark's maximum precision + ); + + private static final Schema SIMPLE_SCHEMA = + new Schema( + optional(1, "id", Types.IntegerType.get()), optional(2, "data", Types.StringType.get())); + + private static final PartitionSpec SPEC = + PartitionSpec.builderFor(SIMPLE_SCHEMA).identity("data").build(); private static SparkSession spark = null; @@ -111,8 +110,7 @@ public static void stopSpark() { currentSpark.stop(); } - @Rule - public TemporaryFolder temp = new TemporaryFolder(); + @Rule public TemporaryFolder temp = new TemporaryFolder(); private final String format; private final boolean vectorized; @@ -134,29 +132,30 @@ public void testNullPartitionValue() throws Exception { Table table = tables.create(SIMPLE_SCHEMA, SPEC, location.toString()); table.updateProperties().set(TableProperties.DEFAULT_FILE_FORMAT, 
format).commit(); - List expected = Lists.newArrayList( - new SimpleRecord(1, "a"), - new SimpleRecord(2, "b"), - new SimpleRecord(3, "c"), - new SimpleRecord(4, null) - ); + List expected = + Lists.newArrayList( + new SimpleRecord(1, "a"), + new SimpleRecord(2, "b"), + new SimpleRecord(3, "c"), + new SimpleRecord(4, null)); Dataset df = spark.createDataFrame(expected, SimpleRecord.class); - df.select("id", "data").write() + df.select("id", "data") + .write() .format("iceberg") .mode(SaveMode.Append) .save(location.toString()); - Dataset result = spark.read() - .format("iceberg") - .option(SparkReadOptions.VECTORIZATION_ENABLED, String.valueOf(vectorized)) - .load(location.toString()); + Dataset result = + spark + .read() + .format("iceberg") + .option(SparkReadOptions.VECTORIZATION_ENABLED, String.valueOf(vectorized)) + .load(location.toString()); - List actual = result - .orderBy("id") - .as(Encoders.bean(SimpleRecord.class)) - .collectAsList(); + List actual = + result.orderBy("id").as(Encoders.bean(SimpleRecord.class)).collectAsList(); Assert.assertEquals("Number of rows should match", expected.size(), actual.size()); Assert.assertEquals("Result rows should match", expected, actual); @@ -174,29 +173,28 @@ public void testReorderedColumns() throws Exception { Table table = tables.create(SIMPLE_SCHEMA, SPEC, location.toString()); table.updateProperties().set(TableProperties.DEFAULT_FILE_FORMAT, format).commit(); - List expected = Lists.newArrayList( - new SimpleRecord(1, "a"), - new SimpleRecord(2, "b"), - new SimpleRecord(3, "c") - ); + List expected = + Lists.newArrayList( + new SimpleRecord(1, "a"), new SimpleRecord(2, "b"), new SimpleRecord(3, "c")); Dataset df = spark.createDataFrame(expected, SimpleRecord.class); - df.select("data", "id").write() - .format("iceberg") - .mode(SaveMode.Append) - .option(SparkWriteOptions.CHECK_ORDERING, "false") - .save(location.toString()); + df.select("data", "id") + .write() + .format("iceberg") + .mode(SaveMode.Append) + .option(SparkWriteOptions.CHECK_ORDERING, "false") + .save(location.toString()); - Dataset result = spark.read() + Dataset result = + spark + .read() .format("iceberg") .option(SparkReadOptions.VECTORIZATION_ENABLED, String.valueOf(vectorized)) .load(location.toString()); - List actual = result - .orderBy("id") - .as(Encoders.bean(SimpleRecord.class)) - .collectAsList(); + List actual = + result.orderBy("id").as(Encoders.bean(SimpleRecord.class)).collectAsList(); Assert.assertEquals("Number of rows should match", expected.size(), actual.size()); Assert.assertEquals("Result rows should match", expected, actual); @@ -214,30 +212,29 @@ public void testReorderedColumnsNoNullability() throws Exception { Table table = tables.create(SIMPLE_SCHEMA, SPEC, location.toString()); table.updateProperties().set(TableProperties.DEFAULT_FILE_FORMAT, format).commit(); - List expected = Lists.newArrayList( - new SimpleRecord(1, "a"), - new SimpleRecord(2, "b"), - new SimpleRecord(3, "c") - ); + List expected = + Lists.newArrayList( + new SimpleRecord(1, "a"), new SimpleRecord(2, "b"), new SimpleRecord(3, "c")); Dataset df = spark.createDataFrame(expected, SimpleRecord.class); - df.select("data", "id").write() - .format("iceberg") - .mode(SaveMode.Append) - .option(SparkWriteOptions.CHECK_ORDERING, "false") - .option(SparkWriteOptions.CHECK_NULLABILITY, "false") - .save(location.toString()); + df.select("data", "id") + .write() + .format("iceberg") + .mode(SaveMode.Append) + .option(SparkWriteOptions.CHECK_ORDERING, "false") + 
.option(SparkWriteOptions.CHECK_NULLABILITY, "false") + .save(location.toString()); - Dataset result = spark.read() + Dataset result = + spark + .read() .format("iceberg") .option(SparkReadOptions.VECTORIZATION_ENABLED, String.valueOf(vectorized)) .load(location.toString()); - List actual = result - .orderBy("id") - .as(Encoders.bean(SimpleRecord.class)) - .collectAsList(); + List actual = + result.orderBy("id").as(Encoders.bean(SimpleRecord.class)).collectAsList(); Assert.assertEquals("Number of rows should match", expected.size(), actual.size()); Assert.assertEquals("Result rows should match", expected, actual); @@ -245,9 +242,10 @@ public void testReorderedColumnsNoNullability() throws Exception { @Test public void testPartitionValueTypes() throws Exception { - String[] columnNames = new String[] { - "b", "i", "l", "f", "d", "date", "ts", "s", "bytes", "dec_9_0", "dec_11_2", "dec_38_10" - }; + String[] columnNames = + new String[] { + "b", "i", "l", "f", "d", "date", "ts", "s", "bytes", "dec_9_0", "dec_11_2", "dec_38_10" + }; HadoopTables tables = new HadoopTables(spark.sessionState().newHadoopConf()); @@ -259,23 +257,27 @@ public void testPartitionValueTypes() throws Exception { List expected = RandomData.generateList(source.schema(), 2, 128735L); File avroData = temp.newFile("data.avro"); Assert.assertTrue(avroData.delete()); - try (FileAppender appender = Avro.write(Files.localOutput(avroData)) - .schema(source.schema()) - .build()) { + try (FileAppender appender = + Avro.write(Files.localOutput(avroData)).schema(source.schema()).build()) { appender.addAll(expected); } // add the Avro data file to the source table - source.newAppend() - .appendFile(DataFiles.builder(PartitionSpec.unpartitioned()) - .withRecordCount(10) - .withInputFile(Files.localInput(avroData)) - .build()) + source + .newAppend() + .appendFile( + DataFiles.builder(PartitionSpec.unpartitioned()) + .withRecordCount(10) + .withInputFile(Files.localInput(avroData)) + .build()) .commit(); - Dataset sourceDF = spark.read().format("iceberg") - .option(SparkReadOptions.VECTORIZATION_ENABLED, String.valueOf(vectorized)) - .load(sourceLocation); + Dataset sourceDF = + spark + .read() + .format("iceberg") + .option(SparkReadOptions.VECTORIZATION_ENABLED, String.valueOf(vectorized)) + .load(sourceLocation); for (String column : columnNames) { String desc = "partition_by_" + SUPPORTED_PRIMITIVES.findType(column).toString(); @@ -290,16 +292,15 @@ public void testPartitionValueTypes() throws Exception { Table table = tables.create(SUPPORTED_PRIMITIVES, spec, location.toString()); table.updateProperties().set(TableProperties.DEFAULT_FILE_FORMAT, format).commit(); - sourceDF.write() - .format("iceberg") - .mode(SaveMode.Append) - .save(location.toString()); + sourceDF.write().format("iceberg").mode(SaveMode.Append).save(location.toString()); - List actual = spark.read() - .format("iceberg") - .option(SparkReadOptions.VECTORIZATION_ENABLED, String.valueOf(vectorized)) - .load(location.toString()) - .collectAsList(); + List actual = + spark + .read() + .format("iceberg") + .option(SparkReadOptions.VECTORIZATION_ENABLED, String.valueOf(vectorized)) + .load(location.toString()) + .collectAsList(); Assert.assertEquals("Number of rows should match", expected.size(), actual.size()); @@ -312,9 +313,10 @@ public void testPartitionValueTypes() throws Exception { @Test public void testNestedPartitionValues() throws Exception { - String[] columnNames = new String[] { - "b", "i", "l", "f", "d", "date", "ts", "s", "bytes", "dec_9_0", "dec_11_2", 
"dec_38_10" - }; + String[] columnNames = + new String[] { + "b", "i", "l", "f", "d", "date", "ts", "s", "bytes", "dec_9_0", "dec_11_2", "dec_38_10" + }; HadoopTables tables = new HadoopTables(spark.sessionState().newHadoopConf()); Schema nestedSchema = new Schema(optional(1, "nested", SUPPORTED_PRIMITIVES.asStruct())); @@ -327,23 +329,27 @@ public void testNestedPartitionValues() throws Exception { List expected = RandomData.generateList(source.schema(), 2, 128735L); File avroData = temp.newFile("data.avro"); Assert.assertTrue(avroData.delete()); - try (FileAppender appender = Avro.write(Files.localOutput(avroData)) - .schema(source.schema()) - .build()) { + try (FileAppender appender = + Avro.write(Files.localOutput(avroData)).schema(source.schema()).build()) { appender.addAll(expected); } // add the Avro data file to the source table - source.newAppend() - .appendFile(DataFiles.builder(PartitionSpec.unpartitioned()) - .withRecordCount(10) - .withInputFile(Files.localInput(avroData)) - .build()) + source + .newAppend() + .appendFile( + DataFiles.builder(PartitionSpec.unpartitioned()) + .withRecordCount(10) + .withInputFile(Files.localInput(avroData)) + .build()) .commit(); - Dataset sourceDF = spark.read().format("iceberg") - .option(SparkReadOptions.VECTORIZATION_ENABLED, String.valueOf(vectorized)) - .load(sourceLocation); + Dataset sourceDF = + spark + .read() + .format("iceberg") + .option(SparkReadOptions.VECTORIZATION_ENABLED, String.valueOf(vectorized)) + .load(sourceLocation); for (String column : columnNames) { String desc = "partition_by_" + SUPPORTED_PRIMITIVES.findType(column).toString(); @@ -353,45 +359,46 @@ public void testNestedPartitionValues() throws Exception { File dataFolder = new File(location, "data"); Assert.assertTrue("mkdirs should succeed", dataFolder.mkdirs()); - PartitionSpec spec = PartitionSpec.builderFor(nestedSchema).identity("nested." + column).build(); + PartitionSpec spec = + PartitionSpec.builderFor(nestedSchema).identity("nested." + column).build(); Table table = tables.create(nestedSchema, spec, location.toString()); table.updateProperties().set(TableProperties.DEFAULT_FILE_FORMAT, format).commit(); - sourceDF.write() - .format("iceberg") - .mode(SaveMode.Append) - .save(location.toString()); + sourceDF.write().format("iceberg").mode(SaveMode.Append).save(location.toString()); - List actual = spark.read() - .format("iceberg") - .option(SparkReadOptions.VECTORIZATION_ENABLED, String.valueOf(vectorized)) - .load(location.toString()) - .collectAsList(); + List actual = + spark + .read() + .format("iceberg") + .option(SparkReadOptions.VECTORIZATION_ENABLED, String.valueOf(vectorized)) + .load(location.toString()) + .collectAsList(); Assert.assertEquals("Number of rows should match", expected.size(), actual.size()); for (int i = 0; i < expected.size(); i += 1) { - TestHelpers.assertEqualsSafe( - nestedSchema.asStruct(), expected.get(i), actual.get(i)); + TestHelpers.assertEqualsSafe(nestedSchema.asStruct(), expected.get(i), actual.get(i)); } } } /** * To verify if WrappedPositionAccessor is generated against a string field within a nested field, - * rather than a Position2Accessor. - * Or when building the partition path, a ClassCastException is thrown with the message like: - * Cannot cast org.apache.spark.unsafe.types.UTF8String to java.lang.CharSequence + * rather than a Position2Accessor. 
Or when building the partition path, a ClassCastException is + * thrown with the message like: Cannot cast org.apache.spark.unsafe.types.UTF8String to + * java.lang.CharSequence */ @Test public void testPartitionedByNestedString() throws Exception { // schema and partition spec - Schema nestedSchema = new Schema( - Types.NestedField.required(1, "struct", - Types.StructType.of(Types.NestedField.required(2, "string", Types.StringType.get())) - ) - ); + Schema nestedSchema = + new Schema( + Types.NestedField.required( + 1, + "struct", + Types.StructType.of( + Types.NestedField.required(2, "string", Types.StringType.get())))); PartitionSpec spec = PartitionSpec.builderFor(nestedSchema).identity("struct.string").build(); // create table @@ -401,14 +408,14 @@ public void testPartitionedByNestedString() throws Exception { // input data frame StructField[] structFields = { - new StructField("struct", - DataTypes.createStructType( - new StructField[] { - new StructField("string", DataTypes.StringType, false, Metadata.empty()) - } - ), - false, Metadata.empty() - ) + new StructField( + "struct", + DataTypes.createStructType( + new StructField[] { + new StructField("string", DataTypes.StringType, false, Metadata.empty()) + }), + false, + Metadata.empty()) }; List rows = Lists.newArrayList(); @@ -416,17 +423,16 @@ public void testPartitionedByNestedString() throws Exception { Dataset sourceDF = spark.createDataFrame(rows, new StructType(structFields)); // write into iceberg - sourceDF.write() - .format("iceberg") - .mode(SaveMode.Append) - .save(baseLocation); + sourceDF.write().format("iceberg").mode(SaveMode.Append).save(baseLocation); // verify - List actual = spark.read() - .format("iceberg") - .option(SparkReadOptions.VECTORIZATION_ENABLED, String.valueOf(vectorized)) - .load(baseLocation) - .collectAsList(); + List actual = + spark + .read() + .format("iceberg") + .option(SparkReadOptions.VECTORIZATION_ENABLED, String.valueOf(vectorized)) + .load(baseLocation) + .collectAsList(); Assert.assertEquals("Number of rows should match", rows.size(), actual.size()); } diff --git a/spark/v3.1/spark/src/test/java/org/apache/iceberg/spark/source/TestPathIdentifier.java b/spark/v3.1/spark/src/test/java/org/apache/iceberg/spark/source/TestPathIdentifier.java index ff4fe22a7a8a..f58451296cef 100644 --- a/spark/v3.1/spark/src/test/java/org/apache/iceberg/spark/source/TestPathIdentifier.java +++ b/spark/v3.1/spark/src/test/java/org/apache/iceberg/spark/source/TestPathIdentifier.java @@ -16,9 +16,10 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.spark.source; +import static org.apache.iceberg.types.Types.NestedField.required; + import java.io.File; import java.io.IOException; import org.apache.iceberg.BaseTable; @@ -42,16 +43,13 @@ import org.junit.Test; import org.junit.rules.TemporaryFolder; -import static org.apache.iceberg.types.Types.NestedField.required; - public class TestPathIdentifier extends SparkTestBase { - private static final Schema SCHEMA = new Schema( - required(1, "id", Types.LongType.get()), - required(2, "data", Types.StringType.get())); + private static final Schema SCHEMA = + new Schema( + required(1, "id", Types.LongType.get()), required(2, "data", Types.StringType.get())); - @Rule - public TemporaryFolder temp = new TemporaryFolder(); + @Rule public TemporaryFolder temp = new TemporaryFolder(); private File tableLocation; private PathIdentifier identifier; private SparkCatalog sparkCatalog; @@ -72,17 +70,16 @@ public void after() { @Test public void testPathIdentifier() throws TableAlreadyExistsException, NoSuchTableException { - SparkTable table = sparkCatalog.createTable(identifier, - SparkSchemaUtil.convert(SCHEMA), - new Transform[0], - ImmutableMap.of()); + SparkTable table = + sparkCatalog.createTable( + identifier, SparkSchemaUtil.convert(SCHEMA), new Transform[0], ImmutableMap.of()); Assert.assertEquals(table.table().location(), tableLocation.getAbsolutePath()); Assertions.assertThat(table.table()).isInstanceOf(BaseTable.class); - Assertions.assertThat(((BaseTable) table.table()).operations()).isInstanceOf(HadoopTableOperations.class); + Assertions.assertThat(((BaseTable) table.table()).operations()) + .isInstanceOf(HadoopTableOperations.class); Assert.assertEquals(sparkCatalog.loadTable(identifier), table); Assert.assertTrue(sparkCatalog.dropTable(identifier)); } } - diff --git a/spark/v3.1/spark/src/test/java/org/apache/iceberg/spark/source/TestReadProjection.java b/spark/v3.1/spark/src/test/java/org/apache/iceberg/spark/source/TestReadProjection.java index 8d65b64cab6d..cfc746f6e932 100644 --- a/spark/v3.1/spark/src/test/java/org/apache/iceberg/spark/source/TestReadProjection.java +++ b/spark/v3.1/spark/src/test/java/org/apache/iceberg/spark/source/TestReadProjection.java @@ -16,9 +16,10 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.spark.source; +import static org.apache.avro.Schema.Type.UNION; + import java.io.IOException; import java.util.List; import java.util.Map; @@ -37,8 +38,6 @@ import org.junit.Test; import org.junit.rules.TemporaryFolder; -import static org.apache.avro.Schema.Type.UNION; - public abstract class TestReadProjection { final String format; @@ -46,20 +45,17 @@ public abstract class TestReadProjection { this.format = format; } - protected abstract Record writeAndRead(String desc, - Schema writeSchema, - Schema readSchema, - Record record) throws IOException; + protected abstract Record writeAndRead( + String desc, Schema writeSchema, Schema readSchema, Record record) throws IOException; - @Rule - public TemporaryFolder temp = new TemporaryFolder(); + @Rule public TemporaryFolder temp = new TemporaryFolder(); @Test public void testFullProjection() throws Exception { - Schema schema = new Schema( - Types.NestedField.required(0, "id", Types.LongType.get()), - Types.NestedField.optional(1, "data", Types.StringType.get()) - ); + Schema schema = + new Schema( + Types.NestedField.required(0, "id", Types.LongType.get()), + Types.NestedField.optional(1, "data", Types.StringType.get())); Record record = GenericRecord.create(schema); record.setField("id", 34L); @@ -67,32 +63,33 @@ public void testFullProjection() throws Exception { Record projected = writeAndRead("full_projection", schema, schema, record); - Assert.assertEquals("Should contain the correct id value", 34L, (long) projected.getField("id")); + Assert.assertEquals( + "Should contain the correct id value", 34L, (long) projected.getField("id")); - int cmp = Comparators.charSequences() - .compare("test", (CharSequence) projected.getField("data")); + int cmp = + Comparators.charSequences().compare("test", (CharSequence) projected.getField("data")); Assert.assertEquals("Should contain the correct data value", 0, cmp); } @Test public void testReorderedFullProjection() throws Exception { -// Assume.assumeTrue( -// "Spark's Parquet read support does not support reordered columns", -// !format.equalsIgnoreCase("parquet")); + // Assume.assumeTrue( + // "Spark's Parquet read support does not support reordered columns", + // !format.equalsIgnoreCase("parquet")); - Schema schema = new Schema( - Types.NestedField.required(0, "id", Types.LongType.get()), - Types.NestedField.optional(1, "data", Types.StringType.get()) - ); + Schema schema = + new Schema( + Types.NestedField.required(0, "id", Types.LongType.get()), + Types.NestedField.optional(1, "data", Types.StringType.get())); Record record = GenericRecord.create(schema); record.setField("id", 34L); record.setField("data", "test"); - Schema reordered = new Schema( - Types.NestedField.optional(1, "data", Types.StringType.get()), - Types.NestedField.required(0, "id", Types.LongType.get()) - ); + Schema reordered = + new Schema( + Types.NestedField.optional(1, "data", Types.StringType.get()), + Types.NestedField.required(0, "id", Types.LongType.get())); Record projected = writeAndRead("reordered_full_projection", schema, reordered, record); @@ -102,24 +99,24 @@ public void testReorderedFullProjection() throws Exception { @Test public void testReorderedProjection() throws Exception { -// Assume.assumeTrue( -// "Spark's Parquet read support does not support reordered columns", -// !format.equalsIgnoreCase("parquet")); + // Assume.assumeTrue( + // "Spark's Parquet read support does not support reordered columns", + // !format.equalsIgnoreCase("parquet")); - Schema schema = new Schema( 
- Types.NestedField.required(0, "id", Types.LongType.get()), - Types.NestedField.optional(1, "data", Types.StringType.get()) - ); + Schema schema = + new Schema( + Types.NestedField.required(0, "id", Types.LongType.get()), + Types.NestedField.optional(1, "data", Types.StringType.get())); Record record = GenericRecord.create(schema); record.setField("id", 34L); record.setField("data", "test"); - Schema reordered = new Schema( - Types.NestedField.optional(2, "missing_1", Types.StringType.get()), - Types.NestedField.optional(1, "data", Types.StringType.get()), - Types.NestedField.optional(3, "missing_2", Types.LongType.get()) - ); + Schema reordered = + new Schema( + Types.NestedField.optional(2, "missing_1", Types.StringType.get()), + Types.NestedField.optional(1, "data", Types.StringType.get()), + Types.NestedField.optional(3, "missing_2", Types.LongType.get())); Record projected = writeAndRead("reordered_projection", schema, reordered, record); @@ -130,10 +127,10 @@ public void testReorderedProjection() throws Exception { @Test public void testEmptyProjection() throws Exception { - Schema schema = new Schema( - Types.NestedField.required(0, "id", Types.LongType.get()), - Types.NestedField.optional(1, "data", Types.StringType.get()) - ); + Schema schema = + new Schema( + Types.NestedField.required(0, "id", Types.LongType.get()), + Types.NestedField.optional(1, "data", Types.StringType.get())); Record record = GenericRecord.create(schema); record.setField("id", 34L); @@ -152,68 +149,68 @@ public void testEmptyProjection() throws Exception { @Test public void testBasicProjection() throws Exception { - Schema writeSchema = new Schema( - Types.NestedField.required(0, "id", Types.LongType.get()), - Types.NestedField.optional(1, "data", Types.StringType.get()) - ); + Schema writeSchema = + new Schema( + Types.NestedField.required(0, "id", Types.LongType.get()), + Types.NestedField.optional(1, "data", Types.StringType.get())); Record record = GenericRecord.create(writeSchema); record.setField("id", 34L); record.setField("data", "test"); - Schema idOnly = new Schema( - Types.NestedField.required(0, "id", Types.LongType.get()) - ); + Schema idOnly = new Schema(Types.NestedField.required(0, "id", Types.LongType.get())); Record projected = writeAndRead("basic_projection_id", writeSchema, idOnly, record); Assert.assertNull("Should not project data", projected.getField("data")); - Assert.assertEquals("Should contain the correct id value", 34L, (long) projected.getField("id")); + Assert.assertEquals( + "Should contain the correct id value", 34L, (long) projected.getField("id")); - Schema dataOnly = new Schema( - Types.NestedField.optional(1, "data", Types.StringType.get()) - ); + Schema dataOnly = new Schema(Types.NestedField.optional(1, "data", Types.StringType.get())); projected = writeAndRead("basic_projection_data", writeSchema, dataOnly, record); Assert.assertNull("Should not project id", projected.getField("id")); - int cmp = Comparators.charSequences() - .compare("test", (CharSequence) projected.getField("data")); + int cmp = + Comparators.charSequences().compare("test", (CharSequence) projected.getField("data")); Assert.assertEquals("Should contain the correct data value", 0, cmp); } @Test public void testRename() throws Exception { - Schema writeSchema = new Schema( - Types.NestedField.required(0, "id", Types.LongType.get()), - Types.NestedField.optional(1, "data", Types.StringType.get()) - ); + Schema writeSchema = + new Schema( + Types.NestedField.required(0, "id", Types.LongType.get()), + 
Types.NestedField.optional(1, "data", Types.StringType.get())); Record record = GenericRecord.create(writeSchema); record.setField("id", 34L); record.setField("data", "test"); - Schema readSchema = new Schema( - Types.NestedField.required(0, "id", Types.LongType.get()), - Types.NestedField.optional(1, "renamed", Types.StringType.get()) - ); + Schema readSchema = + new Schema( + Types.NestedField.required(0, "id", Types.LongType.get()), + Types.NestedField.optional(1, "renamed", Types.StringType.get())); Record projected = writeAndRead("project_and_rename", writeSchema, readSchema, record); - Assert.assertEquals("Should contain the correct id value", 34L, (long) projected.getField("id")); - int cmp = Comparators.charSequences() - .compare("test", (CharSequence) projected.getField("renamed")); + Assert.assertEquals( + "Should contain the correct id value", 34L, (long) projected.getField("id")); + int cmp = + Comparators.charSequences().compare("test", (CharSequence) projected.getField("renamed")); Assert.assertEquals("Should contain the correct data/renamed value", 0, cmp); } @Test public void testNestedStructProjection() throws Exception { - Schema writeSchema = new Schema( - Types.NestedField.required(0, "id", Types.LongType.get()), - Types.NestedField.optional(3, "location", Types.StructType.of( - Types.NestedField.required(1, "lat", Types.FloatType.get()), - Types.NestedField.required(2, "long", Types.FloatType.get()) - )) - ); + Schema writeSchema = + new Schema( + Types.NestedField.required(0, "id", Types.LongType.get()), + Types.NestedField.optional( + 3, + "location", + Types.StructType.of( + Types.NestedField.required(1, "lat", Types.FloatType.get()), + Types.NestedField.required(2, "long", Types.FloatType.get())))); Record record = GenericRecord.create(writeSchema); record.setField("id", 34L); @@ -222,61 +219,76 @@ public void testNestedStructProjection() throws Exception { location.setField("long", -1.539054f); record.setField("location", location); - Schema idOnly = new Schema( - Types.NestedField.required(0, "id", Types.LongType.get()) - ); + Schema idOnly = new Schema(Types.NestedField.required(0, "id", Types.LongType.get())); Record projected = writeAndRead("id_only", writeSchema, idOnly, record); Record projectedLocation = (Record) projected.getField("location"); - Assert.assertEquals("Should contain the correct id value", 34L, (long) projected.getField("id")); + Assert.assertEquals( + "Should contain the correct id value", 34L, (long) projected.getField("id")); Assert.assertNull("Should not project location", projectedLocation); - Schema latOnly = new Schema( - Types.NestedField.optional(3, "location", Types.StructType.of( - Types.NestedField.required(1, "lat", Types.FloatType.get()) - )) - ); + Schema latOnly = + new Schema( + Types.NestedField.optional( + 3, + "location", + Types.StructType.of(Types.NestedField.required(1, "lat", Types.FloatType.get())))); projected = writeAndRead("latitude_only", writeSchema, latOnly, record); projectedLocation = (Record) projected.getField("location"); Assert.assertNull("Should not project id", projected.getField("id")); Assert.assertNotNull("Should project location", projected.getField("location")); Assert.assertNull("Should not project longitude", projectedLocation.getField("long")); - Assert.assertEquals("Should project latitude", - 52.995143f, (float) projectedLocation.getField("lat"), 0.000001f); - - Schema longOnly = new Schema( - Types.NestedField.optional(3, "location", Types.StructType.of( - Types.NestedField.required(2, "long", 
Types.FloatType.get()) - )) - ); + Assert.assertEquals( + "Should project latitude", + 52.995143f, + (float) projectedLocation.getField("lat"), + 0.000001f); + + Schema longOnly = + new Schema( + Types.NestedField.optional( + 3, + "location", + Types.StructType.of(Types.NestedField.required(2, "long", Types.FloatType.get())))); projected = writeAndRead("longitude_only", writeSchema, longOnly, record); projectedLocation = (Record) projected.getField("location"); Assert.assertNull("Should not project id", projected.getField("id")); Assert.assertNotNull("Should project location", projected.getField("location")); Assert.assertNull("Should not project latitutde", projectedLocation.getField("lat")); - Assert.assertEquals("Should project longitude", - -1.539054f, (float) projectedLocation.getField("long"), 0.000001f); + Assert.assertEquals( + "Should project longitude", + -1.539054f, + (float) projectedLocation.getField("long"), + 0.000001f); Schema locationOnly = writeSchema.select("location"); projected = writeAndRead("location_only", writeSchema, locationOnly, record); projectedLocation = (Record) projected.getField("location"); Assert.assertNull("Should not project id", projected.getField("id")); Assert.assertNotNull("Should project location", projected.getField("location")); - Assert.assertEquals("Should project latitude", - 52.995143f, (float) projectedLocation.getField("lat"), 0.000001f); - Assert.assertEquals("Should project longitude", - -1.539054f, (float) projectedLocation.getField("long"), 0.000001f); + Assert.assertEquals( + "Should project latitude", + 52.995143f, + (float) projectedLocation.getField("lat"), + 0.000001f); + Assert.assertEquals( + "Should project longitude", + -1.539054f, + (float) projectedLocation.getField("long"), + 0.000001f); } @Test public void testMapProjection() throws IOException { - Schema writeSchema = new Schema( - Types.NestedField.required(0, "id", Types.LongType.get()), - Types.NestedField.optional(5, "properties", - Types.MapType.ofOptional(6, 7, Types.StringType.get(), Types.StringType.get())) - ); + Schema writeSchema = + new Schema( + Types.NestedField.required(0, "id", Types.LongType.get()), + Types.NestedField.optional( + 5, + "properties", + Types.MapType.ofOptional(6, 7, Types.StringType.get(), Types.StringType.get()))); Map properties = ImmutableMap.of("a", "A", "b", "B"); @@ -284,31 +296,36 @@ public void testMapProjection() throws IOException { record.setField("id", 34L); record.setField("properties", properties); - Schema idOnly = new Schema( - Types.NestedField.required(0, "id", Types.LongType.get()) - ); + Schema idOnly = new Schema(Types.NestedField.required(0, "id", Types.LongType.get())); Record projected = writeAndRead("id_only", writeSchema, idOnly, record); - Assert.assertEquals("Should contain the correct id value", 34L, (long) projected.getField("id")); + Assert.assertEquals( + "Should contain the correct id value", 34L, (long) projected.getField("id")); Assert.assertNull("Should not project properties map", projected.getField("properties")); Schema keyOnly = writeSchema.select("properties.key"); projected = writeAndRead("key_only", writeSchema, keyOnly, record); Assert.assertNull("Should not project id", projected.getField("id")); - Assert.assertEquals("Should project entire map", - properties, toStringMap((Map) projected.getField("properties"))); + Assert.assertEquals( + "Should project entire map", + properties, + toStringMap((Map) projected.getField("properties"))); Schema valueOnly = writeSchema.select("properties.value"); 
projected = writeAndRead("value_only", writeSchema, valueOnly, record); Assert.assertNull("Should not project id", projected.getField("id")); - Assert.assertEquals("Should project entire map", - properties, toStringMap((Map) projected.getField("properties"))); + Assert.assertEquals( + "Should project entire map", + properties, + toStringMap((Map) projected.getField("properties"))); Schema mapOnly = writeSchema.select("properties"); projected = writeAndRead("map_only", writeSchema, mapOnly, record); Assert.assertNull("Should not project id", projected.getField("id")); - Assert.assertEquals("Should project entire map", - properties, toStringMap((Map) projected.getField("properties"))); + Assert.assertEquals( + "Should project entire map", + properties, + toStringMap((Map) projected.getField("properties"))); } private Map toStringMap(Map map) { @@ -325,16 +342,19 @@ public void testMapProjection() throws IOException { @Test public void testMapOfStructsProjection() throws IOException { - Schema writeSchema = new Schema( - Types.NestedField.required(0, "id", Types.LongType.get()), - Types.NestedField.optional(5, "locations", Types.MapType.ofOptional(6, 7, - Types.StringType.get(), - Types.StructType.of( - Types.NestedField.required(1, "lat", Types.FloatType.get()), - Types.NestedField.required(2, "long", Types.FloatType.get()) - ) - )) - ); + Schema writeSchema = + new Schema( + Types.NestedField.required(0, "id", Types.LongType.get()), + Types.NestedField.optional( + 5, + "locations", + Types.MapType.ofOptional( + 6, + 7, + Types.StringType.get(), + Types.StructType.of( + Types.NestedField.required(1, "lat", Types.FloatType.get()), + Types.NestedField.required(2, "long", Types.FloatType.get()))))); Record record = GenericRecord.create(writeSchema); record.setField("id", 34L); @@ -346,91 +366,100 @@ public void testMapOfStructsProjection() throws IOException { l2.setField("long", -1.539054f); record.setField("locations", ImmutableMap.of("L1", l1, "L2", l2)); - Schema idOnly = new Schema( - Types.NestedField.required(0, "id", Types.LongType.get()) - ); + Schema idOnly = new Schema(Types.NestedField.required(0, "id", Types.LongType.get())); Record projected = writeAndRead("id_only", writeSchema, idOnly, record); - Assert.assertEquals("Should contain the correct id value", 34L, (long) projected.getField("id")); + Assert.assertEquals( + "Should contain the correct id value", 34L, (long) projected.getField("id")); Assert.assertNull("Should not project locations map", projected.getField("locations")); projected = writeAndRead("all_locations", writeSchema, writeSchema.select("locations"), record); Assert.assertNull("Should not project id", projected.getField("id")); - Assert.assertEquals("Should project locations map", - record.getField("locations"), toStringMap((Map) projected.getField("locations"))); + Assert.assertEquals( + "Should project locations map", + record.getField("locations"), + toStringMap((Map) projected.getField("locations"))); - projected = writeAndRead("lat_only", - writeSchema, writeSchema.select("locations.lat"), record); + projected = writeAndRead("lat_only", writeSchema, writeSchema.select("locations.lat"), record); Assert.assertNull("Should not project id", projected.getField("id")); Map locations = toStringMap((Map) projected.getField("locations")); Assert.assertNotNull("Should project locations map", locations); - Assert.assertEquals("Should contain L1 and L2", - Sets.newHashSet("L1", "L2"), locations.keySet()); + Assert.assertEquals( + "Should contain L1 and L2", 
Sets.newHashSet("L1", "L2"), locations.keySet()); Record projectedL1 = (Record) locations.get("L1"); Assert.assertNotNull("L1 should not be null", projectedL1); - Assert.assertEquals("L1 should contain lat", - 53.992811f, (float) projectedL1.getField("lat"), 0.000001); + Assert.assertEquals( + "L1 should contain lat", 53.992811f, (float) projectedL1.getField("lat"), 0.000001); Assert.assertNull("L1 should not contain long", projectedL1.getField("long")); Record projectedL2 = (Record) locations.get("L2"); Assert.assertNotNull("L2 should not be null", projectedL2); - Assert.assertEquals("L2 should contain lat", - 52.995143f, (float) projectedL2.getField("lat"), 0.000001); + Assert.assertEquals( + "L2 should contain lat", 52.995143f, (float) projectedL2.getField("lat"), 0.000001); Assert.assertNull("L2 should not contain long", projectedL2.getField("long")); - projected = writeAndRead("long_only", - writeSchema, writeSchema.select("locations.long"), record); + projected = + writeAndRead("long_only", writeSchema, writeSchema.select("locations.long"), record); Assert.assertNull("Should not project id", projected.getField("id")); locations = toStringMap((Map) projected.getField("locations")); Assert.assertNotNull("Should project locations map", locations); - Assert.assertEquals("Should contain L1 and L2", - Sets.newHashSet("L1", "L2"), locations.keySet()); + Assert.assertEquals( + "Should contain L1 and L2", Sets.newHashSet("L1", "L2"), locations.keySet()); projectedL1 = (Record) locations.get("L1"); Assert.assertNotNull("L1 should not be null", projectedL1); Assert.assertNull("L1 should not contain lat", projectedL1.getField("lat")); - Assert.assertEquals("L1 should contain long", - -1.542616f, (float) projectedL1.getField("long"), 0.000001); + Assert.assertEquals( + "L1 should contain long", -1.542616f, (float) projectedL1.getField("long"), 0.000001); projectedL2 = (Record) locations.get("L2"); Assert.assertNotNull("L2 should not be null", projectedL2); Assert.assertNull("L2 should not contain lat", projectedL2.getField("lat")); - Assert.assertEquals("L2 should contain long", - -1.539054f, (float) projectedL2.getField("long"), 0.000001); - - Schema latitiudeRenamed = new Schema( - Types.NestedField.optional(5, "locations", Types.MapType.ofOptional(6, 7, - Types.StringType.get(), - Types.StructType.of( - Types.NestedField.required(1, "latitude", Types.FloatType.get()) - ) - )) - ); + Assert.assertEquals( + "L2 should contain long", -1.539054f, (float) projectedL2.getField("long"), 0.000001); + + Schema latitiudeRenamed = + new Schema( + Types.NestedField.optional( + 5, + "locations", + Types.MapType.ofOptional( + 6, + 7, + Types.StringType.get(), + Types.StructType.of( + Types.NestedField.required(1, "latitude", Types.FloatType.get()))))); projected = writeAndRead("latitude_renamed", writeSchema, latitiudeRenamed, record); Assert.assertNull("Should not project id", projected.getField("id")); locations = toStringMap((Map) projected.getField("locations")); Assert.assertNotNull("Should project locations map", locations); - Assert.assertEquals("Should contain L1 and L2", - Sets.newHashSet("L1", "L2"), locations.keySet()); + Assert.assertEquals( + "Should contain L1 and L2", Sets.newHashSet("L1", "L2"), locations.keySet()); projectedL1 = (Record) locations.get("L1"); Assert.assertNotNull("L1 should not be null", projectedL1); - Assert.assertEquals("L1 should contain latitude", - 53.992811f, (float) projectedL1.getField("latitude"), 0.000001); + Assert.assertEquals( + "L1 should contain latitude", + 
53.992811f, + (float) projectedL1.getField("latitude"), + 0.000001); Assert.assertNull("L1 should not contain lat", projectedL1.getField("lat")); Assert.assertNull("L1 should not contain long", projectedL1.getField("long")); projectedL2 = (Record) locations.get("L2"); Assert.assertNotNull("L2 should not be null", projectedL2); - Assert.assertEquals("L2 should contain latitude", - 52.995143f, (float) projectedL2.getField("latitude"), 0.000001); + Assert.assertEquals( + "L2 should contain latitude", + 52.995143f, + (float) projectedL2.getField("latitude"), + 0.000001); Assert.assertNull("L2 should not contain lat", projectedL2.getField("lat")); Assert.assertNull("L2 should not contain long", projectedL2.getField("long")); } @Test public void testListProjection() throws IOException { - Schema writeSchema = new Schema( - Types.NestedField.required(0, "id", Types.LongType.get()), - Types.NestedField.optional(10, "values", - Types.ListType.ofOptional(11, Types.LongType.get())) - ); + Schema writeSchema = + new Schema( + Types.NestedField.required(0, "id", Types.LongType.get()), + Types.NestedField.optional( + 10, "values", Types.ListType.ofOptional(11, Types.LongType.get()))); List values = ImmutableList.of(56L, 57L, 58L); @@ -438,12 +467,11 @@ public void testListProjection() throws IOException { record.setField("id", 34L); record.setField("values", values); - Schema idOnly = new Schema( - Types.NestedField.required(0, "id", Types.LongType.get()) - ); + Schema idOnly = new Schema(Types.NestedField.required(0, "id", Types.LongType.get())); Record projected = writeAndRead("id_only", writeSchema, idOnly, record); - Assert.assertEquals("Should contain the correct id value", 34L, (long) projected.getField("id")); + Assert.assertEquals( + "Should contain the correct id value", 34L, (long) projected.getField("id")); Assert.assertNull("Should not project values list", projected.getField("values")); Schema elementOnly = writeSchema.select("values.element"); @@ -460,15 +488,17 @@ public void testListProjection() throws IOException { @Test @SuppressWarnings("unchecked") public void testListOfStructsProjection() throws IOException { - Schema writeSchema = new Schema( - Types.NestedField.required(0, "id", Types.LongType.get()), - Types.NestedField.optional(22, "points", - Types.ListType.ofOptional(21, Types.StructType.of( - Types.NestedField.required(19, "x", Types.IntegerType.get()), - Types.NestedField.optional(18, "y", Types.IntegerType.get()) - )) - ) - ); + Schema writeSchema = + new Schema( + Types.NestedField.required(0, "id", Types.LongType.get()), + Types.NestedField.optional( + 22, + "points", + Types.ListType.ofOptional( + 21, + Types.StructType.of( + Types.NestedField.required(19, "x", Types.IntegerType.get()), + Types.NestedField.optional(18, "y", Types.IntegerType.get()))))); Record record = GenericRecord.create(writeSchema); record.setField("id", 34L); @@ -480,18 +510,17 @@ public void testListOfStructsProjection() throws IOException { p2.setField("y", null); record.setField("points", ImmutableList.of(p1, p2)); - Schema idOnly = new Schema( - Types.NestedField.required(0, "id", Types.LongType.get()) - ); + Schema idOnly = new Schema(Types.NestedField.required(0, "id", Types.LongType.get())); Record projected = writeAndRead("id_only", writeSchema, idOnly, record); - Assert.assertEquals("Should contain the correct id value", 34L, (long) projected.getField("id")); + Assert.assertEquals( + "Should contain the correct id value", 34L, (long) projected.getField("id")); Assert.assertNull("Should not 
project points list", projected.getField("points")); projected = writeAndRead("all_points", writeSchema, writeSchema.select("points"), record); Assert.assertNull("Should not project id", projected.getField("id")); - Assert.assertEquals("Should project points list", - record.getField("points"), projected.getField("points")); + Assert.assertEquals( + "Should project points list", record.getField("points"), projected.getField("points")); projected = writeAndRead("x_only", writeSchema, writeSchema.select("points.x"), record); Assert.assertNull("Should not project id", projected.getField("id")); @@ -517,13 +546,15 @@ public void testListOfStructsProjection() throws IOException { Assert.assertNull("Should not project x", projectedP2.getField("x")); Assert.assertNull("Should project null y", projectedP2.getField("y")); - Schema yRenamed = new Schema( - Types.NestedField.optional(22, "points", - Types.ListType.ofOptional(21, Types.StructType.of( - Types.NestedField.optional(18, "z", Types.IntegerType.get()) - )) - ) - ); + Schema yRenamed = + new Schema( + Types.NestedField.optional( + 22, + "points", + Types.ListType.ofOptional( + 21, + Types.StructType.of( + Types.NestedField.optional(18, "z", Types.IntegerType.get()))))); projected = writeAndRead("y_renamed", writeSchema, yRenamed, record); Assert.assertNull("Should not project id", projected.getField("id")); @@ -539,15 +570,17 @@ public void testListOfStructsProjection() throws IOException { Assert.assertNull("Should not project y", projectedP2.getField("y")); Assert.assertNull("Should project null z", projectedP2.getField("z")); - Schema zAdded = new Schema( - Types.NestedField.optional(22, "points", - Types.ListType.ofOptional(21, Types.StructType.of( - Types.NestedField.required(19, "x", Types.IntegerType.get()), - Types.NestedField.optional(18, "y", Types.IntegerType.get()), - Types.NestedField.optional(20, "z", Types.IntegerType.get()) - )) - ) - ); + Schema zAdded = + new Schema( + Types.NestedField.optional( + 22, + "points", + Types.ListType.ofOptional( + 21, + Types.StructType.of( + Types.NestedField.required(19, "x", Types.IntegerType.get()), + Types.NestedField.optional(18, "y", Types.IntegerType.get()), + Types.NestedField.optional(20, "z", Types.IntegerType.get()))))); projected = writeAndRead("z_added", writeSchema, zAdded, record); Assert.assertNull("Should not project id", projected.getField("id")); @@ -565,10 +598,10 @@ public void testListOfStructsProjection() throws IOException { } private static org.apache.avro.Schema fromOption(org.apache.avro.Schema schema) { - Preconditions.checkArgument(schema.getType() == UNION, - "Expected union schema but was passed: %s", schema); - Preconditions.checkArgument(schema.getTypes().size() == 2, - "Expected optional schema, but was passed: %s", schema); + Preconditions.checkArgument( + schema.getType() == UNION, "Expected union schema but was passed: %s", schema); + Preconditions.checkArgument( + schema.getTypes().size() == 2, "Expected optional schema, but was passed: %s", schema); if (schema.getTypes().get(0).getType() == org.apache.avro.Schema.Type.NULL) { return schema.getTypes().get(1); } else { diff --git a/spark/v3.1/spark/src/test/java/org/apache/iceberg/spark/source/TestSnapshotSelection.java b/spark/v3.1/spark/src/test/java/org/apache/iceberg/spark/source/TestSnapshotSelection.java index 22756dd36717..9661cfe20b1c 100644 --- a/spark/v3.1/spark/src/test/java/org/apache/iceberg/spark/source/TestSnapshotSelection.java +++ 
b/spark/v3.1/spark/src/test/java/org/apache/iceberg/spark/source/TestSnapshotSelection.java @@ -16,9 +16,10 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.source; +import static org.apache.iceberg.types.Types.NestedField.optional; + import java.io.IOException; import java.util.List; import org.apache.hadoop.conf.Configuration; @@ -43,18 +44,14 @@ import org.junit.Test; import org.junit.rules.TemporaryFolder; -import static org.apache.iceberg.types.Types.NestedField.optional; - public class TestSnapshotSelection { private static final Configuration CONF = new Configuration(); - private static final Schema SCHEMA = new Schema( - optional(1, "id", Types.IntegerType.get()), - optional(2, "data", Types.StringType.get()) - ); + private static final Schema SCHEMA = + new Schema( + optional(1, "id", Types.IntegerType.get()), optional(2, "data", Types.StringType.get())); - @Rule - public TemporaryFolder temp = new TemporaryFolder(); + @Rule public TemporaryFolder temp = new TemporaryFolder(); private static SparkSession spark = null; @@ -79,48 +76,40 @@ public void testSnapshotSelectionById() throws IOException { Table table = tables.create(SCHEMA, spec, tableLocation); // produce the first snapshot - List firstBatchRecords = Lists.newArrayList( - new SimpleRecord(1, "a"), - new SimpleRecord(2, "b"), - new SimpleRecord(3, "c") - ); + List firstBatchRecords = + Lists.newArrayList( + new SimpleRecord(1, "a"), new SimpleRecord(2, "b"), new SimpleRecord(3, "c")); Dataset firstDf = spark.createDataFrame(firstBatchRecords, SimpleRecord.class); firstDf.select("id", "data").write().format("iceberg").mode("append").save(tableLocation); // produce the second snapshot - List secondBatchRecords = Lists.newArrayList( - new SimpleRecord(4, "d"), - new SimpleRecord(5, "e"), - new SimpleRecord(6, "f") - ); + List secondBatchRecords = + Lists.newArrayList( + new SimpleRecord(4, "d"), new SimpleRecord(5, "e"), new SimpleRecord(6, "f")); Dataset secondDf = spark.createDataFrame(secondBatchRecords, SimpleRecord.class); secondDf.select("id", "data").write().format("iceberg").mode("append").save(tableLocation); Assert.assertEquals("Expected 2 snapshots", 2, Iterables.size(table.snapshots())); // verify records in the current snapshot - Dataset currentSnapshotResult = spark.read() - .format("iceberg") - .load(tableLocation); - List currentSnapshotRecords = currentSnapshotResult.orderBy("id") - .as(Encoders.bean(SimpleRecord.class)) - .collectAsList(); + Dataset currentSnapshotResult = spark.read().format("iceberg").load(tableLocation); + List currentSnapshotRecords = + currentSnapshotResult.orderBy("id").as(Encoders.bean(SimpleRecord.class)).collectAsList(); List expectedRecords = Lists.newArrayList(); expectedRecords.addAll(firstBatchRecords); expectedRecords.addAll(secondBatchRecords); - Assert.assertEquals("Current snapshot rows should match", expectedRecords, currentSnapshotRecords); + Assert.assertEquals( + "Current snapshot rows should match", expectedRecords, currentSnapshotRecords); // verify records in the previous snapshot Snapshot currentSnapshot = table.currentSnapshot(); Long parentSnapshotId = currentSnapshot.parentId(); - Dataset previousSnapshotResult = spark.read() - .format("iceberg") - .option("snapshot-id", parentSnapshotId) - .load(tableLocation); - List previousSnapshotRecords = previousSnapshotResult.orderBy("id") - .as(Encoders.bean(SimpleRecord.class)) - .collectAsList(); - Assert.assertEquals("Previous snapshot rows should 
match", firstBatchRecords, previousSnapshotRecords); + Dataset previousSnapshotResult = + spark.read().format("iceberg").option("snapshot-id", parentSnapshotId).load(tableLocation); + List previousSnapshotRecords = + previousSnapshotResult.orderBy("id").as(Encoders.bean(SimpleRecord.class)).collectAsList(); + Assert.assertEquals( + "Previous snapshot rows should match", firstBatchRecords, previousSnapshotRecords); } @Test @@ -132,11 +121,9 @@ public void testSnapshotSelectionByTimestamp() throws IOException { Table table = tables.create(SCHEMA, spec, tableLocation); // produce the first snapshot - List firstBatchRecords = Lists.newArrayList( - new SimpleRecord(1, "a"), - new SimpleRecord(2, "b"), - new SimpleRecord(3, "c") - ); + List firstBatchRecords = + Lists.newArrayList( + new SimpleRecord(1, "a"), new SimpleRecord(2, "b"), new SimpleRecord(3, "c")); Dataset firstDf = spark.createDataFrame(firstBatchRecords, SimpleRecord.class); firstDf.select("id", "data").write().format("iceberg").mode("append").save(tableLocation); @@ -144,37 +131,35 @@ public void testSnapshotSelectionByTimestamp() throws IOException { long firstSnapshotTimestamp = System.currentTimeMillis(); // produce the second snapshot - List secondBatchRecords = Lists.newArrayList( - new SimpleRecord(4, "d"), - new SimpleRecord(5, "e"), - new SimpleRecord(6, "f") - ); + List secondBatchRecords = + Lists.newArrayList( + new SimpleRecord(4, "d"), new SimpleRecord(5, "e"), new SimpleRecord(6, "f")); Dataset secondDf = spark.createDataFrame(secondBatchRecords, SimpleRecord.class); secondDf.select("id", "data").write().format("iceberg").mode("append").save(tableLocation); Assert.assertEquals("Expected 2 snapshots", 2, Iterables.size(table.snapshots())); // verify records in the current snapshot - Dataset currentSnapshotResult = spark.read() - .format("iceberg") - .load(tableLocation); - List currentSnapshotRecords = currentSnapshotResult.orderBy("id") - .as(Encoders.bean(SimpleRecord.class)) - .collectAsList(); + Dataset currentSnapshotResult = spark.read().format("iceberg").load(tableLocation); + List currentSnapshotRecords = + currentSnapshotResult.orderBy("id").as(Encoders.bean(SimpleRecord.class)).collectAsList(); List expectedRecords = Lists.newArrayList(); expectedRecords.addAll(firstBatchRecords); expectedRecords.addAll(secondBatchRecords); - Assert.assertEquals("Current snapshot rows should match", expectedRecords, currentSnapshotRecords); + Assert.assertEquals( + "Current snapshot rows should match", expectedRecords, currentSnapshotRecords); // verify records in the previous snapshot - Dataset previousSnapshotResult = spark.read() - .format("iceberg") - .option(SparkReadOptions.AS_OF_TIMESTAMP, firstSnapshotTimestamp) - .load(tableLocation); - List previousSnapshotRecords = previousSnapshotResult.orderBy("id") - .as(Encoders.bean(SimpleRecord.class)) - .collectAsList(); - Assert.assertEquals("Previous snapshot rows should match", firstBatchRecords, previousSnapshotRecords); + Dataset previousSnapshotResult = + spark + .read() + .format("iceberg") + .option(SparkReadOptions.AS_OF_TIMESTAMP, firstSnapshotTimestamp) + .load(tableLocation); + List previousSnapshotRecords = + previousSnapshotResult.orderBy("id").as(Encoders.bean(SimpleRecord.class)).collectAsList(); + Assert.assertEquals( + "Previous snapshot rows should match", firstBatchRecords, previousSnapshotRecords); } @Test @@ -185,14 +170,11 @@ public void testSnapshotSelectionByInvalidSnapshotId() throws IOException { PartitionSpec spec = 
PartitionSpec.unpartitioned(); tables.create(SCHEMA, spec, tableLocation); - Dataset df = spark.read() - .format("iceberg") - .option("snapshot-id", -10) - .load(tableLocation); + Dataset df = spark.read().format("iceberg").option("snapshot-id", -10).load(tableLocation); Assertions.assertThatThrownBy(df::collectAsList) - .isInstanceOf(IllegalArgumentException.class) - .hasMessage("Cannot find snapshot with ID -10"); + .isInstanceOf(IllegalArgumentException.class) + .hasMessage("Cannot find snapshot with ID -10"); } @Test @@ -204,12 +186,15 @@ public void testSnapshotSelectionByInvalidTimestamp() throws IOException { PartitionSpec spec = PartitionSpec.unpartitioned(); tables.create(SCHEMA, spec, tableLocation); - Assertions.assertThatThrownBy(() -> spark.read() + Assertions.assertThatThrownBy( + () -> + spark + .read() .format("iceberg") .option(SparkReadOptions.AS_OF_TIMESTAMP, timestamp) .load(tableLocation)) - .isInstanceOf(IllegalArgumentException.class) - .hasMessageContaining("Cannot find a snapshot older than"); + .isInstanceOf(IllegalArgumentException.class) + .hasMessageContaining("Cannot find a snapshot older than"); } @Test @@ -220,24 +205,25 @@ public void testSnapshotSelectionBySnapshotIdAndTimestamp() throws IOException { PartitionSpec spec = PartitionSpec.unpartitioned(); Table table = tables.create(SCHEMA, spec, tableLocation); - List firstBatchRecords = Lists.newArrayList( - new SimpleRecord(1, "a"), - new SimpleRecord(2, "b"), - new SimpleRecord(3, "c") - ); + List firstBatchRecords = + Lists.newArrayList( + new SimpleRecord(1, "a"), new SimpleRecord(2, "b"), new SimpleRecord(3, "c")); Dataset firstDf = spark.createDataFrame(firstBatchRecords, SimpleRecord.class); firstDf.select("id", "data").write().format("iceberg").mode("append").save(tableLocation); long timestamp = System.currentTimeMillis(); long snapshotId = table.currentSnapshot().snapshotId(); - Assertions.assertThatThrownBy(() -> spark.read() + Assertions.assertThatThrownBy( + () -> + spark + .read() .format("iceberg") .option(SparkReadOptions.SNAPSHOT_ID, snapshotId) .option(SparkReadOptions.AS_OF_TIMESTAMP, timestamp) .load(tableLocation)) - .isInstanceOf(IllegalArgumentException.class) - .hasMessageContaining("Cannot specify both snapshot-id") - .hasMessageContaining("and as-of-timestamp"); + .isInstanceOf(IllegalArgumentException.class) + .hasMessageContaining("Cannot specify both snapshot-id") + .hasMessageContaining("and as-of-timestamp"); } } diff --git a/spark/v3.1/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkAppenderFactory.java b/spark/v3.1/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkAppenderFactory.java index bda525780d8b..3fb2a630fe81 100644 --- a/spark/v3.1/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkAppenderFactory.java +++ b/spark/v3.1/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkAppenderFactory.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.spark.source; import java.util.List; @@ -41,13 +40,13 @@ public TestSparkAppenderFactory(String fileFormat, boolean partitioned) { } @Override - protected FileAppenderFactory createAppenderFactory(List equalityFieldIds, - Schema eqDeleteSchema, - Schema posDeleteRowSchema) { + protected FileAppenderFactory createAppenderFactory( + List equalityFieldIds, Schema eqDeleteSchema, Schema posDeleteRowSchema) { return SparkAppenderFactory.builderFor(table, table.schema(), sparkType) .equalityFieldIds(ArrayUtil.toIntArray(equalityFieldIds)) .eqDeleteRowSchema(eqDeleteSchema) - .posDelRowSchema(posDeleteRowSchema).build(); + .posDelRowSchema(posDeleteRowSchema) + .build(); } @Override diff --git a/spark/v3.1/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkBaseDataReader.java b/spark/v3.1/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkBaseDataReader.java index 870be890da90..6c4239371476 100644 --- a/spark/v3.1/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkBaseDataReader.java +++ b/spark/v3.1/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkBaseDataReader.java @@ -16,9 +16,11 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.source; +import static org.apache.iceberg.FileFormat.PARQUET; +import static org.apache.iceberg.Files.localOutput; + import java.io.File; import java.io.IOException; import java.util.Iterator; @@ -48,13 +50,9 @@ import org.junit.Test; import org.junit.rules.TemporaryFolder; -import static org.apache.iceberg.FileFormat.PARQUET; -import static org.apache.iceberg.Files.localOutput; - public class TestSparkBaseDataReader { - @Rule - public TemporaryFolder temp = new TemporaryFolder(); + @Rule public TemporaryFolder temp = new TemporaryFolder(); private Table table; @@ -127,14 +125,12 @@ public void testClosureOnDataExhaustion() throws IOException { Assert.assertNotNull("Reader should return non-null value", reader.get()); } - Assert.assertEquals("Reader returned incorrect number of records", - totalTasks * recordPerTask, - countRecords - ); - tasks.forEach(t -> - Assert.assertTrue("All iterators should be closed after read exhausion", - reader.isIteratorClosed(t)) - ); + Assert.assertEquals( + "Reader returned incorrect number of records", totalTasks * recordPerTask, countRecords); + tasks.forEach( + t -> + Assert.assertTrue( + "All iterators should be closed after read exhausion", reader.isIteratorClosed(t))); } @Test @@ -150,13 +146,15 @@ public void testClosureDuringIteration() throws IOException { // Total of 2 elements Assert.assertTrue(reader.next()); - Assert.assertFalse("First iter should not be closed on its last element", - reader.isIteratorClosed(firstTask)); + Assert.assertFalse( + "First iter should not be closed on its last element", reader.isIteratorClosed(firstTask)); Assert.assertTrue(reader.next()); - Assert.assertTrue("First iter should be closed after moving to second iter", + Assert.assertTrue( + "First iter should be closed after moving to second iter", reader.isIteratorClosed(firstTask)); - Assert.assertFalse("Second iter should not be closed on its last element", + Assert.assertFalse( + "Second iter should not be closed on its last element", reader.isIteratorClosed(secondTask)); Assert.assertFalse(reader.next()); @@ -174,10 +172,10 @@ public void testClosureWithoutAnyRead() throws IOException { reader.close(); - tasks.forEach(t -> - Assert.assertFalse("Iterator should not be created eagerly for tasks", - 
reader.hasIterator(t)) - ); + tasks.forEach( + t -> + Assert.assertFalse( + "Iterator should not be created eagerly for tasks", reader.hasIterator(t))); } @Test @@ -198,12 +196,13 @@ public void testExplicitClosure() throws IOException { // Some tasks might have not been opened yet, so we don't have corresponding tracker for it. // But all that have been created must be closed. - tasks.forEach(t -> { - if (reader.hasIterator(t)) { - Assert.assertTrue("Iterator should be closed after read exhausion", - reader.isIteratorClosed(t)); - } - }); + tasks.forEach( + t -> { + if (reader.hasIterator(t)) { + Assert.assertTrue( + "Iterator should be closed after read exhausion", reader.isIteratorClosed(t)); + } + }); } @Test @@ -223,26 +222,26 @@ public void testIdempotentExplicitClosure() throws IOException { for (int closeAttempt = 0; closeAttempt < 5; closeAttempt++) { reader.close(); for (int i = 0; i < 5; i++) { - Assert.assertTrue("Iterator should be closed after read exhausion", + Assert.assertTrue( + "Iterator should be closed after read exhausion", reader.isIteratorClosed(tasks.get(i))); } for (int i = 5; i < 10; i++) { - Assert.assertFalse("Iterator should not be created eagerly for tasks", - reader.hasIterator(tasks.get(i))); + Assert.assertFalse( + "Iterator should not be created eagerly for tasks", reader.hasIterator(tasks.get(i))); } } } - private List createFileScanTasks(Integer totalTasks, Integer recordPerTask) throws IOException { + private List createFileScanTasks(Integer totalTasks, Integer recordPerTask) + throws IOException { String desc = "make_scan_tasks"; File parent = temp.newFolder(desc); File location = new File(parent, "test"); File dataFolder = new File(location, "data"); Assert.assertTrue("mkdirs should succeed", dataFolder.mkdirs()); - Schema schema = new Schema( - Types.NestedField.required(0, "id", Types.LongType.get()) - ); + Schema schema = new Schema(Types.NestedField.required(0, "id", Types.LongType.get())); try { this.table = TestTables.create(location, desc, schema, PartitionSpec.unpartitioned()); @@ -254,22 +253,21 @@ private List createFileScanTasks(Integer totalTasks, Integer recor AppendFiles appendFiles = table.newAppend(); for (int i = 0; i < totalTasks; i++) { File parquetFile = new File(dataFolder, PARQUET.addExtension(UUID.randomUUID().toString())); - try (FileAppender writer = Parquet.write(localOutput(parquetFile)) - .schema(tableSchema) - .build()) { + try (FileAppender writer = + Parquet.write(localOutput(parquetFile)).schema(tableSchema).build()) { writer.addAll(expected); } - DataFile file = DataFiles.builder(PartitionSpec.unpartitioned()) - .withFileSizeInBytes(parquetFile.length()) - .withPath(parquetFile.toString()) - .withRecordCount(recordPerTask) - .build(); + DataFile file = + DataFiles.builder(PartitionSpec.unpartitioned()) + .withFileSizeInBytes(parquetFile.length()) + .withPath(parquetFile.toString()) + .withRecordCount(recordPerTask) + .build(); appendFiles.appendFile(file); } appendFiles.commit(); - return StreamSupport - .stream(table.newScan().planFiles().spliterator(), false) + return StreamSupport.stream(table.newScan().planFiles().spliterator(), false) .collect(Collectors.toList()); } finally { TestTables.clearTables(); diff --git a/spark/v3.1/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkCatalog.java b/spark/v3.1/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkCatalog.java index 027c88cd4df6..f61545df79a0 100644 --- a/spark/v3.1/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkCatalog.java 
+++ b/spark/v3.1/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkCatalog.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.source; import org.apache.iceberg.spark.SparkSessionCatalog; @@ -26,7 +25,8 @@ import org.apache.spark.sql.connector.catalog.Table; import org.apache.spark.sql.connector.catalog.TableCatalog; -public class TestSparkCatalog extends SparkSessionCatalog { +public class TestSparkCatalog + extends SparkSessionCatalog { @Override public Table loadTable(Identifier ident) throws NoSuchTableException { diff --git a/spark/v3.1/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkCatalogCacheExpiration.java b/spark/v3.1/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkCatalogCacheExpiration.java index 96aeed65bfa7..3d668197fd51 100644 --- a/spark/v3.1/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkCatalogCacheExpiration.java +++ b/spark/v3.1/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkCatalogCacheExpiration.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.source; import java.util.Map; @@ -36,12 +35,16 @@ public class TestSparkCatalogCacheExpiration extends SparkTestBaseWithCatalog { private static final String sessionCatalogName = "spark_catalog"; private static final String sessionCatalogImpl = SparkSessionCatalog.class.getName(); - private static final Map sessionCatalogConfig = ImmutableMap.of( - "type", "hadoop", - "default-namespace", "default", - CatalogProperties.CACHE_ENABLED, "true", - CatalogProperties.CACHE_EXPIRATION_INTERVAL_MS, "3000" - ); + private static final Map sessionCatalogConfig = + ImmutableMap.of( + "type", + "hadoop", + "default-namespace", + "default", + CatalogProperties.CACHE_ENABLED, + "true", + CatalogProperties.CACHE_EXPIRATION_INTERVAL_MS, + "3000"); private static String asSqlConfCatalogKeyFor(String catalog, String configKey) { // configKey is empty when the catalog's class is being defined @@ -58,19 +61,29 @@ private static String asSqlConfCatalogKeyFor(String catalog, String configKey) { public static void beforeClass() { // Catalog - expiration_disabled: Catalog with caching on and expiration disabled. ImmutableMap.of( - "", "org.apache.iceberg.spark.SparkCatalog", - "type", "hive", - CatalogProperties.CACHE_ENABLED, "true", - CatalogProperties.CACHE_EXPIRATION_INTERVAL_MS, "-1" - ).forEach((k, v) -> spark.conf().set(asSqlConfCatalogKeyFor("expiration_disabled", k), v)); - - // Catalog - cache_disabled_implicitly: Catalog that does not cache, as the cache expiration interval is 0. + "", + "org.apache.iceberg.spark.SparkCatalog", + "type", + "hive", + CatalogProperties.CACHE_ENABLED, + "true", + CatalogProperties.CACHE_EXPIRATION_INTERVAL_MS, + "-1") + .forEach((k, v) -> spark.conf().set(asSqlConfCatalogKeyFor("expiration_disabled", k), v)); + + // Catalog - cache_disabled_implicitly: Catalog that does not cache, as the cache expiration + // interval is 0. 
ImmutableMap.of( - "", "org.apache.iceberg.spark.SparkCatalog", - "type", "hive", - CatalogProperties.CACHE_ENABLED, "true", - CatalogProperties.CACHE_EXPIRATION_INTERVAL_MS, "0" - ).forEach((k, v) -> spark.conf().set(asSqlConfCatalogKeyFor("cache_disabled_implicitly", k), v)); + "", + "org.apache.iceberg.spark.SparkCatalog", + "type", + "hive", + CatalogProperties.CACHE_ENABLED, + "true", + CatalogProperties.CACHE_EXPIRATION_INTERVAL_MS, + "0") + .forEach( + (k, v) -> spark.conf().set(asSqlConfCatalogKeyFor("cache_disabled_implicitly", k), v)); } public TestSparkCatalogCacheExpiration() { @@ -85,56 +98,55 @@ public void testSparkSessionCatalogWithExpirationEnabled() { .extracting("cacheEnabled") .isEqualTo(true); - Assertions - .assertThat(sparkCatalog) + Assertions.assertThat(sparkCatalog) .extracting("icebergCatalog") .extracting("icebergCatalog") - .isInstanceOfSatisfying(Catalog.class, icebergCatalog -> { - Assertions.assertThat(icebergCatalog) - .isExactlyInstanceOf(CachingCatalog.class) - .extracting("expirationIntervalMillis") - .isEqualTo(3000L); - }); + .isInstanceOfSatisfying( + Catalog.class, + icebergCatalog -> { + Assertions.assertThat(icebergCatalog) + .isExactlyInstanceOf(CachingCatalog.class) + .extracting("expirationIntervalMillis") + .isEqualTo(3000L); + }); } @Test public void testCacheEnabledAndExpirationDisabled() { SparkCatalog sparkCatalog = getSparkCatalog("expiration_disabled"); - Assertions.assertThat(sparkCatalog) - .extracting("cacheEnabled") - .isEqualTo(true); + Assertions.assertThat(sparkCatalog).extracting("cacheEnabled").isEqualTo(true); - Assertions - .assertThat(sparkCatalog) + Assertions.assertThat(sparkCatalog) .extracting("icebergCatalog") - .isInstanceOfSatisfying(CachingCatalog.class, icebergCatalog -> { - Assertions.assertThat(icebergCatalog) - .extracting("expirationIntervalMillis") - .isEqualTo(-1L); - }); + .isInstanceOfSatisfying( + CachingCatalog.class, + icebergCatalog -> { + Assertions.assertThat(icebergCatalog) + .extracting("expirationIntervalMillis") + .isEqualTo(-1L); + }); } @Test public void testCacheDisabledImplicitly() { SparkCatalog sparkCatalog = getSparkCatalog("cache_disabled_implicitly"); - Assertions.assertThat(sparkCatalog) - .extracting("cacheEnabled") - .isEqualTo(false); + Assertions.assertThat(sparkCatalog).extracting("cacheEnabled").isEqualTo(false); - Assertions - .assertThat(sparkCatalog) + Assertions.assertThat(sparkCatalog) .extracting("icebergCatalog") .isInstanceOfSatisfying( Catalog.class, - icebergCatalog -> Assertions.assertThat(icebergCatalog).isNotInstanceOf(CachingCatalog.class)); + icebergCatalog -> + Assertions.assertThat(icebergCatalog).isNotInstanceOf(CachingCatalog.class)); } private SparkSessionCatalog sparkSessionCatalog() { - TableCatalog catalog = (TableCatalog) spark.sessionState().catalogManager().catalog("spark_catalog"); + TableCatalog catalog = + (TableCatalog) spark.sessionState().catalogManager().catalog("spark_catalog"); return (SparkSessionCatalog) catalog; } - private SparkCatalog getSparkCatalog(String catalog) { + private SparkCatalog getSparkCatalog(String catalog) { return (SparkCatalog) spark.sessionState().catalogManager().catalog(catalog); } } diff --git a/spark/v3.1/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkCatalogHadoopOverrides.java b/spark/v3.1/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkCatalogHadoopOverrides.java index 267270308de5..607f1d45ba3a 100644 --- 
a/spark/v3.1/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkCatalogHadoopOverrides.java +++ b/spark/v3.1/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkCatalogHadoopOverrides.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.source; import java.util.Map; @@ -37,7 +36,6 @@ import org.junit.Test; import org.junit.runners.Parameterized; - public class TestSparkCatalogHadoopOverrides extends SparkCatalogTestBase { private static final String configToOverride = "fs.s3a.buffer.dir"; @@ -49,29 +47,38 @@ public class TestSparkCatalogHadoopOverrides extends SparkCatalogTestBase { @Parameterized.Parameters(name = "catalogName = {0}, implementation = {1}, config = {2}") public static Object[][] parameters() { return new Object[][] { - { "testhive", SparkCatalog.class.getName(), - ImmutableMap.of( - "type", "hive", - "default-namespace", "default", - hadoopPrefixedConfigToOverride, configOverrideValue - ) }, - { "testhadoop", SparkCatalog.class.getName(), - ImmutableMap.of( - "type", "hadoop", - hadoopPrefixedConfigToOverride, configOverrideValue - ) }, - { "spark_catalog", SparkSessionCatalog.class.getName(), - ImmutableMap.of( - "type", "hive", - "default-namespace", "default", - hadoopPrefixedConfigToOverride, configOverrideValue - ) } + { + "testhive", + SparkCatalog.class.getName(), + ImmutableMap.of( + "type", + "hive", + "default-namespace", + "default", + hadoopPrefixedConfigToOverride, + configOverrideValue) + }, + { + "testhadoop", + SparkCatalog.class.getName(), + ImmutableMap.of("type", "hadoop", hadoopPrefixedConfigToOverride, configOverrideValue) + }, + { + "spark_catalog", + SparkSessionCatalog.class.getName(), + ImmutableMap.of( + "type", + "hive", + "default-namespace", + "default", + hadoopPrefixedConfigToOverride, + configOverrideValue) + } }; } - public TestSparkCatalogHadoopOverrides(String catalogName, - String implementation, - Map config) { + public TestSparkCatalogHadoopOverrides( + String catalogName, String implementation, Map config) { super(catalogName, implementation, config); } @@ -92,7 +99,8 @@ public void testTableFromCatalogHasOverrides() throws Exception { String actualCatalogOverride = conf.get(configToOverride, "/whammies"); Assert.assertEquals( "Iceberg tables from spark should have the overridden hadoop configurations from the spark config", - configOverrideValue, actualCatalogOverride); + configOverrideValue, + actualCatalogOverride); } @Test @@ -102,16 +110,19 @@ public void ensureRoundTripSerializedTableRetainsHadoopConfig() throws Exception String actualCatalogOverride = originalConf.get(configToOverride, "/whammies"); Assert.assertEquals( "Iceberg tables from spark should have the overridden hadoop configurations from the spark config", - configOverrideValue, actualCatalogOverride); + configOverrideValue, + actualCatalogOverride); // Now convert to SerializableTable and ensure overridden property is still present. 
Table serializableTable = SerializableTableWithSize.copyOf(table); - Table kryoSerializedTable = KryoHelpers.roundTripSerialize(SerializableTableWithSize.copyOf(table)); + Table kryoSerializedTable = + KryoHelpers.roundTripSerialize(SerializableTableWithSize.copyOf(table)); Configuration configFromKryoSerde = ((Configurable) kryoSerializedTable.io()).getConf(); String kryoSerializedCatalogOverride = configFromKryoSerde.get(configToOverride, "/whammies"); Assert.assertEquals( "Tables serialized with Kryo serialization should retain overridden hadoop configuration properties", - configOverrideValue, kryoSerializedCatalogOverride); + configOverrideValue, + kryoSerializedCatalogOverride); // Do the same for Java based serde Table javaSerializedTable = TestHelpers.roundTripSerialize(serializableTable); @@ -119,14 +130,16 @@ public void ensureRoundTripSerializedTableRetainsHadoopConfig() throws Exception String javaSerializedCatalogOverride = configFromJavaSerde.get(configToOverride, "/whammies"); Assert.assertEquals( "Tables serialized with Java serialization should retain overridden hadoop configuration properties", - configOverrideValue, javaSerializedCatalogOverride); + configOverrideValue, + javaSerializedCatalogOverride); } @SuppressWarnings("ThrowSpecificity") private Table getIcebergTableFromSparkCatalog() throws Exception { Identifier identifier = Identifier.of(tableIdent.namespace().levels(), tableIdent.name()); - TableCatalog catalog = (TableCatalog) spark.sessionState().catalogManager().catalog(catalogName); - SparkTable sparkTable = (SparkTable) catalog.loadTable(identifier); + TableCatalog catalog = + (TableCatalog) spark.sessionState().catalogManager().catalog(catalogName); + SparkTable sparkTable = (SparkTable) catalog.loadTable(identifier); return sparkTable.table(); } } diff --git a/spark/v3.1/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkDataFile.java b/spark/v3.1/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkDataFile.java index cd1404766d46..b1f2082b5d9b 100644 --- a/spark/v3.1/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkDataFile.java +++ b/spark/v3.1/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkDataFile.java @@ -16,9 +16,11 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.spark.source; +import static org.apache.iceberg.types.Types.NestedField.optional; +import static org.apache.iceberg.types.Types.NestedField.required; + import java.io.File; import java.io.IOException; import java.util.Arrays; @@ -62,43 +64,42 @@ import org.junit.Test; import org.junit.rules.TemporaryFolder; -import static org.apache.iceberg.types.Types.NestedField.optional; -import static org.apache.iceberg.types.Types.NestedField.required; - public class TestSparkDataFile { private static final HadoopTables TABLES = new HadoopTables(new Configuration()); - private static final Schema SCHEMA = new Schema( - required(100, "id", Types.LongType.get()), - optional(101, "data", Types.StringType.get()), - required(102, "b", Types.BooleanType.get()), - optional(103, "i", Types.IntegerType.get()), - required(104, "l", Types.LongType.get()), - optional(105, "f", Types.FloatType.get()), - required(106, "d", Types.DoubleType.get()), - optional(107, "date", Types.DateType.get()), - required(108, "ts", Types.TimestampType.withZone()), - required(110, "s", Types.StringType.get()), - optional(113, "bytes", Types.BinaryType.get()), - required(114, "dec_9_0", Types.DecimalType.of(9, 0)), - required(115, "dec_11_2", Types.DecimalType.of(11, 2)), - required(116, "dec_38_10", Types.DecimalType.of(38, 10)) // maximum precision - ); - private static final PartitionSpec SPEC = PartitionSpec.builderFor(SCHEMA) - .identity("b") - .bucket("i", 2) - .identity("l") - .identity("f") - .identity("d") - .identity("date") - .hour("ts") - .identity("ts") - .truncate("s", 2) - .identity("bytes") - .bucket("dec_9_0", 2) - .bucket("dec_11_2", 2) - .bucket("dec_38_10", 2) - .build(); + private static final Schema SCHEMA = + new Schema( + required(100, "id", Types.LongType.get()), + optional(101, "data", Types.StringType.get()), + required(102, "b", Types.BooleanType.get()), + optional(103, "i", Types.IntegerType.get()), + required(104, "l", Types.LongType.get()), + optional(105, "f", Types.FloatType.get()), + required(106, "d", Types.DoubleType.get()), + optional(107, "date", Types.DateType.get()), + required(108, "ts", Types.TimestampType.withZone()), + required(110, "s", Types.StringType.get()), + optional(113, "bytes", Types.BinaryType.get()), + required(114, "dec_9_0", Types.DecimalType.of(9, 0)), + required(115, "dec_11_2", Types.DecimalType.of(11, 2)), + required(116, "dec_38_10", Types.DecimalType.of(38, 10)) // maximum precision + ); + private static final PartitionSpec SPEC = + PartitionSpec.builderFor(SCHEMA) + .identity("b") + .bucket("i", 2) + .identity("l") + .identity("f") + .identity("d") + .identity("date") + .hour("ts") + .identity("ts") + .truncate("s", 2) + .identity("bytes") + .bucket("dec_9_0", 2) + .bucket("dec_11_2", 2) + .bucket("dec_38_10", 2) + .build(); private static SparkSession spark; private static JavaSparkContext sparkContext = null; @@ -117,8 +118,7 @@ public static void stopSpark() { currentSpark.stop(); } - @Rule - public TemporaryFolder temp = new TemporaryFolder(); + @Rule public TemporaryFolder temp = new TemporaryFolder(); private String tableLocation = null; @Before @@ -129,7 +129,8 @@ public void setupTableLocation() throws Exception { @Test public void testValueConversion() throws IOException { - Table table = TABLES.create(SCHEMA, PartitionSpec.unpartitioned(), Maps.newHashMap(), tableLocation); + Table table = + TABLES.create(SCHEMA, PartitionSpec.unpartitioned(), Maps.newHashMap(), tableLocation); checkSparkDataFile(table); } @@ -150,7 +151,9 @@ 
public void testValueConversionWithEmptyStats() throws IOException { private void checkSparkDataFile(Table table) throws IOException { Iterable rows = RandomData.generateSpark(table.schema(), 200, 0); JavaRDD rdd = sparkContext.parallelize(Lists.newArrayList(rows)); - Dataset df = spark.internalCreateDataFrame(JavaRDD.toRDD(rdd), SparkSchemaUtil.convert(table.schema()), false); + Dataset df = + spark.internalCreateDataFrame( + JavaRDD.toRDD(rdd), SparkSchemaUtil.convert(table.schema()), false); df.write().format("iceberg").mode("append").save(tableLocation); @@ -170,16 +173,15 @@ private void checkSparkDataFile(Table table) throws IOException { Dataset dataFileDF = spark.read().format("iceberg").load(tableLocation + "#files"); // reorder columns to test arbitrary projections - List columns = Arrays.stream(dataFileDF.columns()) - .map(ColumnName::new) - .collect(Collectors.toList()); + List columns = + Arrays.stream(dataFileDF.columns()).map(ColumnName::new).collect(Collectors.toList()); Collections.shuffle(columns); - List sparkDataFiles = dataFileDF - .select(Iterables.toArray(columns, Column.class)) - .collectAsList(); + List sparkDataFiles = + dataFileDF.select(Iterables.toArray(columns, Column.class)).collectAsList(); - Assert.assertEquals("The number of files should match", dataFiles.size(), sparkDataFiles.size()); + Assert.assertEquals( + "The number of files should match", dataFiles.size(), sparkDataFiles.size()); Types.StructType dataFileType = DataFile.getType(table.spec().partitionType()); StructType sparkDataFileType = sparkDataFiles.get(0).schema(); @@ -195,9 +197,14 @@ private void checkDataFile(DataFile expected, DataFile actual) { Assert.assertEquals("Format must match", expected.format(), actual.format()); Assert.assertEquals("Record count must match", expected.recordCount(), actual.recordCount()); Assert.assertEquals("Size must match", expected.fileSizeInBytes(), actual.fileSizeInBytes()); - Assert.assertEquals("Record value counts must match", expected.valueCounts(), actual.valueCounts()); - Assert.assertEquals("Record null value counts must match", expected.nullValueCounts(), actual.nullValueCounts()); - Assert.assertEquals("Record nan value counts must match", expected.nanValueCounts(), actual.nanValueCounts()); + Assert.assertEquals( + "Record value counts must match", expected.valueCounts(), actual.valueCounts()); + Assert.assertEquals( + "Record null value counts must match", + expected.nullValueCounts(), + actual.nullValueCounts()); + Assert.assertEquals( + "Record nan value counts must match", expected.nanValueCounts(), actual.nanValueCounts()); Assert.assertEquals("Lower bounds must match", expected.lowerBounds(), actual.lowerBounds()); Assert.assertEquals("Upper bounds must match", expected.upperBounds(), actual.upperBounds()); Assert.assertEquals("Key metadata must match", expected.keyMetadata(), actual.keyMetadata()); @@ -210,7 +217,8 @@ private void checkDataFile(DataFile expected, DataFile actual) { private void checkStructLike(StructLike expected, StructLike actual) { Assert.assertEquals("Struct size should match", expected.size(), actual.size()); for (int i = 0; i < expected.size(); i++) { - Assert.assertEquals("Struct values must match", expected.get(i, Object.class), actual.get(i, Object.class)); + Assert.assertEquals( + "Struct values must match", expected.get(i, Object.class), actual.get(i, Object.class)); } } } diff --git a/spark/v3.1/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkDataWrite.java 
b/spark/v3.1/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkDataWrite.java index 5b158c518ae4..b2db853d4753 100644 --- a/spark/v3.1/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkDataWrite.java +++ b/spark/v3.1/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkDataWrite.java @@ -16,9 +16,14 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.source; +import static org.apache.iceberg.TableProperties.SPARK_WRITE_PARTITIONED_FANOUT_ENABLED; +import static org.apache.iceberg.types.Types.NestedField.optional; +import static org.mockito.Mockito.doAnswer; +import static org.mockito.Mockito.spy; +import static org.mockito.Mockito.when; + import java.io.File; import java.io.IOException; import java.util.List; @@ -56,28 +61,20 @@ import org.junit.runner.RunWith; import org.junit.runners.Parameterized; -import static org.apache.iceberg.TableProperties.SPARK_WRITE_PARTITIONED_FANOUT_ENABLED; -import static org.apache.iceberg.types.Types.NestedField.optional; -import static org.mockito.Mockito.doAnswer; -import static org.mockito.Mockito.spy; -import static org.mockito.Mockito.when; - @RunWith(Parameterized.class) public class TestSparkDataWrite { private static final Configuration CONF = new Configuration(); private final FileFormat format; private static SparkSession spark = null; - private static final Schema SCHEMA = new Schema( - optional(1, "id", Types.IntegerType.get()), - optional(2, "data", Types.StringType.get()) - ); + private static final Schema SCHEMA = + new Schema( + optional(1, "id", Types.IntegerType.get()), optional(2, "data", Types.StringType.get())); - @Rule - public TemporaryFolder temp = new TemporaryFolder(); + @Rule public TemporaryFolder temp = new TemporaryFolder(); @Parameterized.Parameters(name = "format = {0}") public static Object[] parameters() { - return new Object[] { "parquet", "avro", "orc" }; + return new Object[] {"parquet", "avro", "orc"}; } @BeforeClass @@ -110,15 +107,14 @@ public void testBasicWrite() throws IOException { PartitionSpec spec = PartitionSpec.builderFor(SCHEMA).identity("data").build(); Table table = tables.create(SCHEMA, spec, location.toString()); - List expected = Lists.newArrayList( - new SimpleRecord(1, "a"), - new SimpleRecord(2, "b"), - new SimpleRecord(3, "c") - ); + List expected = + Lists.newArrayList( + new SimpleRecord(1, "a"), new SimpleRecord(2, "b"), new SimpleRecord(3, "c")); Dataset df = spark.createDataFrame(expected, SimpleRecord.class); // TODO: incoming columns must be ordered according to the table's schema - df.select("id", "data").write() + df.select("id", "data") + .write() .format("iceberg") .option(SparkWriteOptions.WRITE_FORMAT, format.toString()) .mode(SaveMode.Append) @@ -126,11 +122,10 @@ public void testBasicWrite() throws IOException { table.refresh(); - Dataset result = spark.read() - .format("iceberg") - .load(location.toString()); + Dataset result = spark.read().format("iceberg").load(location.toString()); - List actual = result.orderBy("id").as(Encoders.bean(SimpleRecord.class)).collectAsList(); + List actual = + result.orderBy("id").as(Encoders.bean(SimpleRecord.class)).collectAsList(); Assert.assertEquals("Number of rows should match", expected.size(), actual.size()); Assert.assertEquals("Result rows should match", expected, actual); for (ManifestFile manifest : table.currentSnapshot().allManifests(table.io())) { @@ -161,30 +156,31 @@ public void testAppend() throws IOException { PartitionSpec spec = 
PartitionSpec.builderFor(SCHEMA).identity("data").build(); Table table = tables.create(SCHEMA, spec, location.toString()); - List records = Lists.newArrayList( - new SimpleRecord(1, "a"), - new SimpleRecord(2, "b"), - new SimpleRecord(3, "c") - ); - - List expected = Lists.newArrayList( - new SimpleRecord(1, "a"), - new SimpleRecord(2, "b"), - new SimpleRecord(3, "c"), - new SimpleRecord(4, "a"), - new SimpleRecord(5, "b"), - new SimpleRecord(6, "c") - ); + List records = + Lists.newArrayList( + new SimpleRecord(1, "a"), new SimpleRecord(2, "b"), new SimpleRecord(3, "c")); + + List expected = + Lists.newArrayList( + new SimpleRecord(1, "a"), + new SimpleRecord(2, "b"), + new SimpleRecord(3, "c"), + new SimpleRecord(4, "a"), + new SimpleRecord(5, "b"), + new SimpleRecord(6, "c")); Dataset df = spark.createDataFrame(records, SimpleRecord.class); - df.select("id", "data").write() + df.select("id", "data") + .write() .format("iceberg") .option(SparkWriteOptions.WRITE_FORMAT, format.toString()) .mode(SaveMode.Append) .save(location.toString()); - df.withColumn("id", df.col("id").plus(3)).select("id", "data").write() + df.withColumn("id", df.col("id").plus(3)) + .select("id", "data") + .write() .format("iceberg") .option(SparkWriteOptions.WRITE_FORMAT, format.toString()) .mode(SaveMode.Append) @@ -192,11 +188,10 @@ public void testAppend() throws IOException { table.refresh(); - Dataset result = spark.read() - .format("iceberg") - .load(location.toString()); + Dataset result = spark.read().format("iceberg").load(location.toString()); - List actual = result.orderBy("id").as(Encoders.bean(SimpleRecord.class)).collectAsList(); + List actual = + result.orderBy("id").as(Encoders.bean(SimpleRecord.class)).collectAsList(); Assert.assertEquals("Number of rows should match", expected.size(), actual.size()); Assert.assertEquals("Result rows should match", expected, actual); } @@ -210,23 +205,24 @@ public void testEmptyOverwrite() throws IOException { PartitionSpec spec = PartitionSpec.builderFor(SCHEMA).identity("id").build(); Table table = tables.create(SCHEMA, spec, location.toString()); - List records = Lists.newArrayList( - new SimpleRecord(1, "a"), - new SimpleRecord(2, "b"), - new SimpleRecord(3, "c") - ); + List records = + Lists.newArrayList( + new SimpleRecord(1, "a"), new SimpleRecord(2, "b"), new SimpleRecord(3, "c")); List expected = records; Dataset df = spark.createDataFrame(records, SimpleRecord.class); - df.select("id", "data").write() + df.select("id", "data") + .write() .format("iceberg") .option(SparkWriteOptions.WRITE_FORMAT, format.toString()) .mode(SaveMode.Append) .save(location.toString()); Dataset empty = spark.createDataFrame(ImmutableList.of(), SimpleRecord.class); - empty.select("id", "data").write() + empty + .select("id", "data") + .write() .format("iceberg") .option(SparkWriteOptions.WRITE_FORMAT, format.toString()) .mode(SaveMode.Overwrite) @@ -235,11 +231,10 @@ public void testEmptyOverwrite() throws IOException { table.refresh(); - Dataset result = spark.read() - .format("iceberg") - .load(location.toString()); + Dataset result = spark.read().format("iceberg").load(location.toString()); - List actual = result.orderBy("id").as(Encoders.bean(SimpleRecord.class)).collectAsList(); + List actual = + result.orderBy("id").as(Encoders.bean(SimpleRecord.class)).collectAsList(); Assert.assertEquals("Number of rows should match", expected.size(), actual.size()); Assert.assertEquals("Result rows should match", expected, actual); } @@ -253,30 +248,31 @@ public void testOverwrite() 
throws IOException { PartitionSpec spec = PartitionSpec.builderFor(SCHEMA).identity("id").build(); Table table = tables.create(SCHEMA, spec, location.toString()); - List records = Lists.newArrayList( - new SimpleRecord(1, "a"), - new SimpleRecord(2, "b"), - new SimpleRecord(3, "c") - ); + List records = + Lists.newArrayList( + new SimpleRecord(1, "a"), new SimpleRecord(2, "b"), new SimpleRecord(3, "c")); - List expected = Lists.newArrayList( - new SimpleRecord(1, "a"), - new SimpleRecord(2, "a"), - new SimpleRecord(3, "c"), - new SimpleRecord(4, "b"), - new SimpleRecord(6, "c") - ); + List expected = + Lists.newArrayList( + new SimpleRecord(1, "a"), + new SimpleRecord(2, "a"), + new SimpleRecord(3, "c"), + new SimpleRecord(4, "b"), + new SimpleRecord(6, "c")); Dataset df = spark.createDataFrame(records, SimpleRecord.class); - df.select("id", "data").write() + df.select("id", "data") + .write() .format("iceberg") .option(SparkWriteOptions.WRITE_FORMAT, format.toString()) .mode(SaveMode.Append) .save(location.toString()); // overwrite with 2*id to replace record 2, append 4 and 6 - df.withColumn("id", df.col("id").multiply(2)).select("id", "data").write() + df.withColumn("id", df.col("id").multiply(2)) + .select("id", "data") + .write() .format("iceberg") .option(SparkWriteOptions.WRITE_FORMAT, format.toString()) .mode(SaveMode.Overwrite) @@ -285,11 +281,10 @@ public void testOverwrite() throws IOException { table.refresh(); - Dataset result = spark.read() - .format("iceberg") - .load(location.toString()); + Dataset result = spark.read().format("iceberg").load(location.toString()); - List actual = result.orderBy("id").as(Encoders.bean(SimpleRecord.class)).collectAsList(); + List actual = + result.orderBy("id").as(Encoders.bean(SimpleRecord.class)).collectAsList(); Assert.assertEquals("Number of rows should match", expected.size(), actual.size()); Assert.assertEquals("Result rows should match", expected, actual); } @@ -303,22 +298,22 @@ public void testUnpartitionedOverwrite() throws IOException { PartitionSpec spec = PartitionSpec.unpartitioned(); Table table = tables.create(SCHEMA, spec, location.toString()); - List expected = Lists.newArrayList( - new SimpleRecord(1, "a"), - new SimpleRecord(2, "b"), - new SimpleRecord(3, "c") - ); + List expected = + Lists.newArrayList( + new SimpleRecord(1, "a"), new SimpleRecord(2, "b"), new SimpleRecord(3, "c")); Dataset df = spark.createDataFrame(expected, SimpleRecord.class); - df.select("id", "data").write() + df.select("id", "data") + .write() .format("iceberg") .option(SparkWriteOptions.WRITE_FORMAT, format.toString()) .mode(SaveMode.Append) .save(location.toString()); // overwrite with the same data; should not produce two copies - df.select("id", "data").write() + df.select("id", "data") + .write() .format("iceberg") .option(SparkWriteOptions.WRITE_FORMAT, format.toString()) .mode(SaveMode.Overwrite) @@ -326,11 +321,10 @@ public void testUnpartitionedOverwrite() throws IOException { table.refresh(); - Dataset result = spark.read() - .format("iceberg") - .load(location.toString()); + Dataset result = spark.read().format("iceberg").load(location.toString()); - List actual = result.orderBy("id").as(Encoders.bean(SimpleRecord.class)).collectAsList(); + List actual = + result.orderBy("id").as(Encoders.bean(SimpleRecord.class)).collectAsList(); Assert.assertEquals("Number of rows should match", expected.size(), actual.size()); Assert.assertEquals("Result rows should match", expected, actual); } @@ -344,7 +338,8 @@ public void 
testUnpartitionedCreateWithTargetFileSizeViaTableProperties() throws PartitionSpec spec = PartitionSpec.unpartitioned(); Table table = tables.create(SCHEMA, spec, location.toString()); - table.updateProperties() + table + .updateProperties() .set(TableProperties.WRITE_TARGET_FILE_SIZE_BYTES, "4") // ~4 bytes; low enough to trigger .commit(); @@ -355,7 +350,8 @@ public void testUnpartitionedCreateWithTargetFileSizeViaTableProperties() throws Dataset df = spark.createDataFrame(expected, SimpleRecord.class); - df.select("id", "data").write() + df.select("id", "data") + .write() .format("iceberg") .option(SparkWriteOptions.WRITE_FORMAT, format.toString()) .mode(SaveMode.Append) @@ -363,11 +359,10 @@ public void testUnpartitionedCreateWithTargetFileSizeViaTableProperties() throws table.refresh(); - Dataset result = spark.read() - .format("iceberg") - .load(location.toString()); + Dataset result = spark.read().format("iceberg").load(location.toString()); - List actual = result.orderBy("id").as(Encoders.bean(SimpleRecord.class)).collectAsList(); + List actual = + result.orderBy("id").as(Encoders.bean(SimpleRecord.class)).collectAsList(); Assert.assertEquals("Number of rows should match", expected.size(), actual.size()); Assert.assertEquals("Result rows should match", expected, actual); @@ -379,7 +374,8 @@ public void testUnpartitionedCreateWithTargetFileSizeViaTableProperties() throws } Assert.assertEquals("Should have 4 DataFiles", 4, files.size()); - Assert.assertTrue("All DataFiles contain 1000 rows", files.stream().allMatch(d -> d.recordCount() == 1000)); + Assert.assertTrue( + "All DataFiles contain 1000 rows", files.stream().allMatch(d -> d.recordCount() == 1000)); } @Test @@ -410,15 +406,14 @@ public void testWriteProjection() throws IOException { PartitionSpec spec = PartitionSpec.unpartitioned(); Table table = tables.create(SCHEMA, spec, location.toString()); - List expected = Lists.newArrayList( - new SimpleRecord(1, null), - new SimpleRecord(2, null), - new SimpleRecord(3, null) - ); + List expected = + Lists.newArrayList( + new SimpleRecord(1, null), new SimpleRecord(2, null), new SimpleRecord(3, null)); Dataset df = spark.createDataFrame(expected, SimpleRecord.class); - df.select("id").write() // select only id column + df.select("id") + .write() // select only id column .format("iceberg") .option(SparkWriteOptions.WRITE_FORMAT, format.toString()) .mode(SaveMode.Append) @@ -426,11 +421,10 @@ public void testWriteProjection() throws IOException { table.refresh(); - Dataset result = spark.read() - .format("iceberg") - .load(location.toString()); + Dataset result = spark.read().format("iceberg").load(location.toString()); - List actual = result.orderBy("id").as(Encoders.bean(SimpleRecord.class)).collectAsList(); + List actual = + result.orderBy("id").as(Encoders.bean(SimpleRecord.class)).collectAsList(); Assert.assertEquals("Number of rows should match", expected.size(), actual.size()); Assert.assertEquals("Result rows should match", expected, actual); } @@ -446,22 +440,23 @@ public void testWriteProjectionWithMiddle() throws IOException { HadoopTables tables = new HadoopTables(CONF); PartitionSpec spec = PartitionSpec.unpartitioned(); - Schema schema = new Schema( - optional(1, "c1", Types.IntegerType.get()), - optional(2, "c2", Types.StringType.get()), - optional(3, "c3", Types.StringType.get()) - ); + Schema schema = + new Schema( + optional(1, "c1", Types.IntegerType.get()), + optional(2, "c2", Types.StringType.get()), + optional(3, "c3", Types.StringType.get())); Table table = 
tables.create(schema, spec, location.toString()); - List expected = Lists.newArrayList( - new ThreeColumnRecord(1, null, "hello"), - new ThreeColumnRecord(2, null, "world"), - new ThreeColumnRecord(3, null, null) - ); + List expected = + Lists.newArrayList( + new ThreeColumnRecord(1, null, "hello"), + new ThreeColumnRecord(2, null, "world"), + new ThreeColumnRecord(3, null, null)); Dataset df = spark.createDataFrame(expected, ThreeColumnRecord.class); - df.select("c1", "c3").write() + df.select("c1", "c3") + .write() .format("iceberg") .option(SparkWriteOptions.WRITE_FORMAT, format.toString()) .mode(SaveMode.Append) @@ -469,11 +464,10 @@ public void testWriteProjectionWithMiddle() throws IOException { table.refresh(); - Dataset result = spark.read() - .format("iceberg") - .load(location.toString()); + Dataset result = spark.read().format("iceberg").load(location.toString()); - List actual = result.orderBy("c1").as(Encoders.bean(ThreeColumnRecord.class)).collectAsList(); + List actual = + result.orderBy("c1").as(Encoders.bean(ThreeColumnRecord.class)).collectAsList(); Assert.assertEquals("Number of rows should match", expected.size(), actual.size()); Assert.assertEquals("Result rows should match", expected, actual); } @@ -487,44 +481,39 @@ public void testViewsReturnRecentResults() throws IOException { PartitionSpec spec = PartitionSpec.builderFor(SCHEMA).identity("data").build(); tables.create(SCHEMA, spec, location.toString()); - List records = Lists.newArrayList( - new SimpleRecord(1, "a"), - new SimpleRecord(2, "b"), - new SimpleRecord(3, "c") - ); + List records = + Lists.newArrayList( + new SimpleRecord(1, "a"), new SimpleRecord(2, "b"), new SimpleRecord(3, "c")); Dataset df = spark.createDataFrame(records, SimpleRecord.class); - df.select("id", "data").write() + df.select("id", "data") + .write() .format("iceberg") .option(SparkWriteOptions.WRITE_FORMAT, format.toString()) .mode(SaveMode.Append) .save(location.toString()); - Dataset query = spark.read() - .format("iceberg") - .load(location.toString()) - .where("id = 1"); + Dataset query = spark.read().format("iceberg").load(location.toString()).where("id = 1"); query.createOrReplaceTempView("tmp"); - List actual1 = spark.table("tmp").as(Encoders.bean(SimpleRecord.class)).collectAsList(); - List expected1 = Lists.newArrayList( - new SimpleRecord(1, "a") - ); + List actual1 = + spark.table("tmp").as(Encoders.bean(SimpleRecord.class)).collectAsList(); + List expected1 = Lists.newArrayList(new SimpleRecord(1, "a")); Assert.assertEquals("Number of rows should match", expected1.size(), actual1.size()); Assert.assertEquals("Result rows should match", expected1, actual1); - df.select("id", "data").write() + df.select("id", "data") + .write() .format("iceberg") .option(SparkWriteOptions.WRITE_FORMAT, format.toString()) .mode(SaveMode.Append) .save(location.toString()); - List actual2 = spark.table("tmp").as(Encoders.bean(SimpleRecord.class)).collectAsList(); - List expected2 = Lists.newArrayList( - new SimpleRecord(1, "a"), - new SimpleRecord(1, "a") - ); + List actual2 = + spark.table("tmp").as(Encoders.bean(SimpleRecord.class)).collectAsList(); + List expected2 = + Lists.newArrayList(new SimpleRecord(1, "a"), new SimpleRecord(1, "a")); Assert.assertEquals("Number of rows should match", expected2.size(), actual2.size()); Assert.assertEquals("Result rows should match", expected2, actual2); } @@ -550,7 +539,9 @@ public void partitionedCreateWithTargetFileSizeViaOption(IcebergOptionsType opti switch (option) { case NONE: - df.select("id", 
"data").sort("data").write() + df.select("id", "data") + .sort("data") + .write() .format("iceberg") .option(SparkWriteOptions.WRITE_FORMAT, format.toString()) .mode(SaveMode.Append) @@ -559,7 +550,8 @@ public void partitionedCreateWithTargetFileSizeViaOption(IcebergOptionsType opti break; case TABLE: table.updateProperties().set(SPARK_WRITE_PARTITIONED_FANOUT_ENABLED, "true").commit(); - df.select("id", "data").write() + df.select("id", "data") + .write() .format("iceberg") .option(SparkWriteOptions.WRITE_FORMAT, format.toString()) .mode(SaveMode.Append) @@ -567,7 +559,8 @@ public void partitionedCreateWithTargetFileSizeViaOption(IcebergOptionsType opti .save(location.toString()); break; case JOB: - df.select("id", "data").write() + df.select("id", "data") + .write() .format("iceberg") .option(SparkWriteOptions.WRITE_FORMAT, format.toString()) .mode(SaveMode.Append) @@ -581,11 +574,10 @@ public void partitionedCreateWithTargetFileSizeViaOption(IcebergOptionsType opti table.refresh(); - Dataset result = spark.read() - .format("iceberg") - .load(location.toString()); + Dataset result = spark.read().format("iceberg").load(location.toString()); - List actual = result.orderBy("id").as(Encoders.bean(SimpleRecord.class)).collectAsList(); + List actual = + result.orderBy("id").as(Encoders.bean(SimpleRecord.class)).collectAsList(); Assert.assertEquals("Number of rows should match", expected.size(), actual.size()); Assert.assertEquals("Result rows should match", expected, actual); @@ -597,7 +589,8 @@ public void partitionedCreateWithTargetFileSizeViaOption(IcebergOptionsType opti } Assert.assertEquals("Should have 8 DataFiles", 8, files.size()); - Assert.assertTrue("All DataFiles contain 1000 rows", files.stream().allMatch(d -> d.recordCount() == 1000)); + Assert.assertTrue( + "All DataFiles contain 1000 rows", files.stream().allMatch(d -> d.recordCount() == 1000)); } @Test @@ -609,20 +602,21 @@ public void testCommitUnknownException() throws IOException { PartitionSpec spec = PartitionSpec.builderFor(SCHEMA).identity("data").build(); Table table = tables.create(SCHEMA, spec, location.toString()); - List records = Lists.newArrayList( - new SimpleRecord(1, "a"), - new SimpleRecord(2, "b"), - new SimpleRecord(3, "c") - ); + List records = + Lists.newArrayList( + new SimpleRecord(1, "a"), new SimpleRecord(2, "b"), new SimpleRecord(3, "c")); Dataset df = spark.createDataFrame(records, SimpleRecord.class); AppendFiles append = table.newFastAppend(); AppendFiles spyAppend = spy(append); - doAnswer(invocation -> { - append.commit(); - throw new CommitStateUnknownException(new RuntimeException("Datacenter on Fire")); - }).when(spyAppend).commit(); + doAnswer( + invocation -> { + append.commit(); + throw new CommitStateUnknownException(new RuntimeException("Datacenter on Fire")); + }) + .when(spyAppend) + .commit(); Table spyTable = spy(table); when(spyTable.newAppend()).thenReturn(spyAppend); @@ -632,20 +626,25 @@ public void testCommitUnknownException() throws IOException { ManualSource.setTable(manualTableName, sparkTable); // Although an exception is thrown here, write and commit have succeeded - AssertHelpers.assertThrowsWithCause("Should throw a Commit State Unknown Exception", + AssertHelpers.assertThrowsWithCause( + "Should throw a Commit State Unknown Exception", SparkException.class, "Writing job aborted", CommitStateUnknownException.class, "Datacenter on Fire", - () -> df.select("id", "data").sort("data").write() - .format("org.apache.iceberg.spark.source.ManualSource") - 
.option(ManualSource.TABLE_NAME, manualTableName) - .mode(SaveMode.Append) - .save(location.toString())); + () -> + df.select("id", "data") + .sort("data") + .write() + .format("org.apache.iceberg.spark.source.ManualSource") + .option(ManualSource.TABLE_NAME, manualTableName) + .mode(SaveMode.Append) + .save(location.toString())); // Since write and commit succeeded, the rows should be readable Dataset result = spark.read().format("iceberg").load(location.toString()); - List actual = result.orderBy("id").as(Encoders.bean(SimpleRecord.class)).collectAsList(); + List actual = + result.orderBy("id").as(Encoders.bean(SimpleRecord.class)).collectAsList(); Assert.assertEquals("Number of rows should match", records.size(), actual.size()); Assert.assertEquals("Result rows should match", records, actual); } diff --git a/spark/v3.1/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkFileWriterFactory.java b/spark/v3.1/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkFileWriterFactory.java index 702e8ab98990..4a3263e368c0 100644 --- a/spark/v3.1/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkFileWriterFactory.java +++ b/spark/v3.1/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkFileWriterFactory.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.source; import java.util.List; @@ -39,9 +38,11 @@ public TestSparkFileWriterFactory(FileFormat fileFormat, boolean partitioned) { } @Override - protected FileWriterFactory newWriterFactory(Schema dataSchema, List equalityFieldIds, - Schema equalityDeleteRowSchema, - Schema positionDeleteRowSchema) { + protected FileWriterFactory newWriterFactory( + Schema dataSchema, + List equalityFieldIds, + Schema equalityDeleteRowSchema, + Schema positionDeleteRowSchema) { return SparkFileWriterFactory.builderFor(table) .dataSchema(table.schema()) .dataFileFormat(format()) diff --git a/spark/v3.1/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkFilesScan.java b/spark/v3.1/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkFilesScan.java index 63195cfd3967..d0959d6866bc 100644 --- a/spark/v3.1/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkFilesScan.java +++ b/spark/v3.1/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkFilesScan.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.spark.source; import java.io.IOException; @@ -53,10 +52,8 @@ public void removeTables() { public void testTaskSetLoading() throws NoSuchTableException, IOException { sql("CREATE TABLE %s (id INT, data STRING) USING iceberg", tableName); - List records = ImmutableList.of( - new SimpleRecord(1, "a"), - new SimpleRecord(2, "b") - ); + List records = + ImmutableList.of(new SimpleRecord(1, "a"), new SimpleRecord(2, "b")); Dataset df = spark.createDataFrame(records, SimpleRecord.class); df.writeTo(tableName).append(); @@ -69,15 +66,19 @@ public void testTaskSetLoading() throws NoSuchTableException, IOException { taskSetManager.stageTasks(table, setID, ImmutableList.copyOf(fileScanTasks)); // load the staged file set - Dataset scanDF = spark.read().format("iceberg") - .option(SparkReadOptions.FILE_SCAN_TASK_SET_ID, setID) - .load(tableName); + Dataset scanDF = + spark + .read() + .format("iceberg") + .option(SparkReadOptions.FILE_SCAN_TASK_SET_ID, setID) + .load(tableName); // write the records back essentially duplicating data scanDF.writeTo(tableName).append(); } - assertEquals("Should have expected rows", + assertEquals( + "Should have expected rows", ImmutableList.of(row(1, "a"), row(1, "a"), row(2, "b"), row(2, "b")), sql("SELECT * FROM %s ORDER BY id", tableName)); } @@ -86,10 +87,8 @@ public void testTaskSetLoading() throws NoSuchTableException, IOException { public void testTaskSetPlanning() throws NoSuchTableException, IOException { sql("CREATE TABLE %s (id INT, data STRING) USING iceberg", tableName); - List records = ImmutableList.of( - new SimpleRecord(1, "a"), - new SimpleRecord(2, "b") - ); + List records = + ImmutableList.of(new SimpleRecord(1, "a"), new SimpleRecord(2, "b")); Dataset df = spark.createDataFrame(records, SimpleRecord.class); df.coalesce(1).writeTo(tableName).append(); df.coalesce(1).writeTo(tableName).append(); @@ -104,17 +103,23 @@ public void testTaskSetPlanning() throws NoSuchTableException, IOException { taskSetManager.stageTasks(table, setID, tasks); // load the staged file set and make sure each file is in a separate split - Dataset scanDF = spark.read().format("iceberg") - .option(SparkReadOptions.FILE_SCAN_TASK_SET_ID, setID) - .option(SparkReadOptions.SPLIT_SIZE, tasks.get(0).file().fileSizeInBytes()) - .load(tableName); + Dataset scanDF = + spark + .read() + .format("iceberg") + .option(SparkReadOptions.FILE_SCAN_TASK_SET_ID, setID) + .option(SparkReadOptions.SPLIT_SIZE, tasks.get(0).file().fileSizeInBytes()) + .load(tableName); Assert.assertEquals("Num partitions should match", 2, scanDF.javaRDD().getNumPartitions()); // load the staged file set and make sure we combine both files into a single split - scanDF = spark.read().format("iceberg") - .option(SparkReadOptions.FILE_SCAN_TASK_SET_ID, setID) - .option(SparkReadOptions.SPLIT_SIZE, Long.MAX_VALUE) - .load(tableName); + scanDF = + spark + .read() + .format("iceberg") + .option(SparkReadOptions.FILE_SCAN_TASK_SET_ID, setID) + .option(SparkReadOptions.SPLIT_SIZE, Long.MAX_VALUE) + .load(tableName); Assert.assertEquals("Num partitions should match", 1, scanDF.javaRDD().getNumPartitions()); } } diff --git a/spark/v3.1/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkMergingMetrics.java b/spark/v3.1/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkMergingMetrics.java index be74d1c5a33b..c3bb35ca7df8 100644 --- a/spark/v3.1/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkMergingMetrics.java +++ 
b/spark/v3.1/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkMergingMetrics.java
@@ -16,7 +16,6 @@
  * specific language governing permissions and limitations
  * under the License.
  */
-
 package org.apache.iceberg.spark.source;
 
 import java.io.IOException;
@@ -42,26 +41,32 @@ public TestSparkMergingMetrics(FileFormat fileFormat) {
   }
 
   @Override
   protected FileAppender<InternalRow> writeAndGetAppender(List<Record> records) throws IOException {
-    Table testTable = new BaseTable(null, "dummy") {
-      @Override
-      public Map<String, String> properties() {
-        return Collections.emptyMap();
-      }
-      @Override
-      public SortOrder sortOrder() {
-        return SortOrder.unsorted();
-      }
-      @Override
-      public PartitionSpec spec() {
-        return PartitionSpec.unpartitioned();
-      }
-    };
+    Table testTable =
+        new BaseTable(null, "dummy") {
+          @Override
+          public Map<String, String> properties() {
+            return Collections.emptyMap();
+          }
+
+          @Override
+          public SortOrder sortOrder() {
+            return SortOrder.unsorted();
+          }
+
+          @Override
+          public PartitionSpec spec() {
+            return PartitionSpec.unpartitioned();
+          }
+        };
     FileAppender<InternalRow> appender =
-        SparkAppenderFactory.builderFor(testTable, SCHEMA, SparkSchemaUtil.convert(SCHEMA)).build()
+        SparkAppenderFactory.builderFor(testTable, SCHEMA, SparkSchemaUtil.convert(SCHEMA))
+            .build()
             .newAppender(org.apache.iceberg.Files.localOutput(temp.newFile()), fileFormat);
     try (FileAppender<InternalRow> fileAppender = appender) {
-      records.stream().map(r -> new StructInternalRow(SCHEMA.asStruct()).setStruct(r)).forEach(fileAppender::add);
+      records.stream()
+          .map(r -> new StructInternalRow(SCHEMA.asStruct()).setStruct(r))
+          .forEach(fileAppender::add);
     }
     return appender;
   }
diff --git a/spark/v3.1/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkMetadataColumns.java b/spark/v3.1/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkMetadataColumns.java
index b29d281863cb..5ee042f55e66 100644
--- a/spark/v3.1/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkMetadataColumns.java
+++ b/spark/v3.1/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkMetadataColumns.java
@@ -16,9 +16,13 @@
  * specific language governing permissions and limitations
  * under the License.
*/ - package org.apache.iceberg.spark.source; +import static org.apache.iceberg.TableProperties.DEFAULT_FILE_FORMAT; +import static org.apache.iceberg.TableProperties.FORMAT_VERSION; +import static org.apache.iceberg.TableProperties.ORC_VECTORIZATION_ENABLED; +import static org.apache.iceberg.TableProperties.PARQUET_VECTORIZATION_ENABLED; + import java.io.IOException; import java.util.List; import org.apache.iceberg.AssertHelpers; @@ -48,41 +52,37 @@ import org.junit.runner.RunWith; import org.junit.runners.Parameterized; -import static org.apache.iceberg.TableProperties.DEFAULT_FILE_FORMAT; -import static org.apache.iceberg.TableProperties.FORMAT_VERSION; -import static org.apache.iceberg.TableProperties.ORC_VECTORIZATION_ENABLED; -import static org.apache.iceberg.TableProperties.PARQUET_VECTORIZATION_ENABLED; - @RunWith(Parameterized.class) public class TestSparkMetadataColumns extends SparkTestBase { private static final String TABLE_NAME = "test_table"; - private static final Schema SCHEMA = new Schema( - Types.NestedField.required(1, "id", Types.LongType.get()), - Types.NestedField.optional(2, "category", Types.StringType.get()), - Types.NestedField.optional(3, "data", Types.StringType.get()) - ); - private static final PartitionSpec UNKNOWN_SPEC = PartitionSpecParser.fromJson(SCHEMA, - "{ \"spec-id\": 1, \"fields\": [ { \"name\": \"id_zero\", \"transform\": \"zero\", \"source-id\": 1 } ] }"); + private static final Schema SCHEMA = + new Schema( + Types.NestedField.required(1, "id", Types.LongType.get()), + Types.NestedField.optional(2, "category", Types.StringType.get()), + Types.NestedField.optional(3, "data", Types.StringType.get())); + private static final PartitionSpec UNKNOWN_SPEC = + PartitionSpecParser.fromJson( + SCHEMA, + "{ \"spec-id\": 1, \"fields\": [ { \"name\": \"id_zero\", \"transform\": \"zero\", \"source-id\": 1 } ] }"); @Parameterized.Parameters(name = "fileFormat = {0}, vectorized = {1}, formatVersion = {2}") public static Object[][] parameters() { return new Object[][] { - { FileFormat.PARQUET, false, 1}, - { FileFormat.PARQUET, true, 1}, - { FileFormat.PARQUET, false, 2}, - { FileFormat.PARQUET, true, 2}, - { FileFormat.AVRO, false, 1}, - { FileFormat.AVRO, false, 2}, - { FileFormat.ORC, false, 1}, - { FileFormat.ORC, true, 1}, - { FileFormat.ORC, false, 2}, - { FileFormat.ORC, true, 2}, + {FileFormat.PARQUET, false, 1}, + {FileFormat.PARQUET, true, 1}, + {FileFormat.PARQUET, false, 2}, + {FileFormat.PARQUET, true, 2}, + {FileFormat.AVRO, false, 1}, + {FileFormat.AVRO, false, 2}, + {FileFormat.ORC, false, 1}, + {FileFormat.ORC, true, 1}, + {FileFormat.ORC, false, 2}, + {FileFormat.ORC, true, 2}, }; } - @Rule - public TemporaryFolder temp = new TemporaryFolder(); + @Rule public TemporaryFolder temp = new TemporaryFolder(); private final FileFormat fileFormat; private final boolean vectorized; @@ -98,13 +98,16 @@ public TestSparkMetadataColumns(FileFormat fileFormat, boolean vectorized, int f @BeforeClass public static void setupSpark() { - ImmutableMap config = ImmutableMap.of( - "type", "hive", - "default-namespace", "default", - "cache-enabled", "true" - ); - spark.conf().set("spark.sql.catalog.spark_catalog", "org.apache.iceberg.spark.source.TestSparkCatalog"); - config.forEach((key, value) -> spark.conf().set("spark.sql.catalog.spark_catalog." 
+ key, value)); + ImmutableMap config = + ImmutableMap.of( + "type", "hive", + "default-namespace", "default", + "cache-enabled", "true"); + spark + .conf() + .set("spark.sql.catalog.spark_catalog", "org.apache.iceberg.spark.source.TestSparkCatalog"); + config.forEach( + (key, value) -> spark.conf().set("spark.sql.catalog.spark_catalog." + key, value)); } @Before @@ -127,36 +130,32 @@ public void testSpecAndPartitionMetadataColumns() { sql("INSERT INTO TABLE %s VALUES (1, 'a1', 'b1')", TABLE_NAME); table.refresh(); - table.updateSpec() - .addField("data") - .commit(); + table.updateSpec().addField("data").commit(); sql("INSERT INTO TABLE %s VALUES (1, 'a1', 'b1')", TABLE_NAME); table.refresh(); - table.updateSpec() - .addField(Expressions.bucket("category", 8)) - .commit(); + table.updateSpec().addField(Expressions.bucket("category", 8)).commit(); sql("INSERT INTO TABLE %s VALUES (1, 'a1', 'b1')", TABLE_NAME); table.refresh(); - table.updateSpec() - .removeField("data") - .commit(); + table.updateSpec().removeField("data").commit(); sql("INSERT INTO TABLE %s VALUES (1, 'a1', 'b1')", TABLE_NAME); table.refresh(); - table.updateSpec() - .renameField("category_bucket_8", "category_bucket_8_another_name") - .commit(); - - List expected = ImmutableList.of( - row(0, row(null, null)), - row(1, row("b1", null)), - row(2, row("b1", 2)), - row(3, row(null, 2)) - ); - assertEquals("Rows must match", expected, - sql("SELECT _spec_id, _partition FROM `%s$_spec_id,_partition` ORDER BY _spec_id", TABLE_NAME)); + table.updateSpec().renameField("category_bucket_8", "category_bucket_8_another_name").commit(); + + List expected = + ImmutableList.of( + row(0, row(null, null)), + row(1, row("b1", null)), + row(2, row("b1", 2)), + row(3, row(null, 2))); + assertEquals( + "Rows must match", + expected, + sql( + "SELECT _spec_id, _partition FROM `%s$_spec_id,_partition` ORDER BY _spec_id", + TABLE_NAME)); } @Test @@ -166,13 +165,16 @@ public void testPartitionMetadataColumnWithUnknownTransforms() { TableMetadata base = ops.current(); ops.commit(base, base.updatePartitionSpec(UNKNOWN_SPEC)); - AssertHelpers.assertThrows("Should fail to query the partition metadata column", - ValidationException.class, "Cannot build table partition type, unknown transforms", + AssertHelpers.assertThrows( + "Should fail to query the partition metadata column", + ValidationException.class, + "Cannot build table partition type, unknown transforms", () -> sql("SELECT _partition FROM `%s$_partition`", TABLE_NAME)); } private void createAndInitTable() throws IOException { - this.table = TestTables.create(temp.newFolder(), TABLE_NAME, SCHEMA, PartitionSpec.unpartitioned()); + this.table = + TestTables.create(temp.newFolder(), TABLE_NAME, SCHEMA, PartitionSpec.unpartitioned()); UpdateProperties updateProperties = table.updateProperties(); updateProperties.set(FORMAT_VERSION, String.valueOf(formatVersion)); @@ -186,7 +188,8 @@ private void createAndInitTable() throws IOException { updateProperties.set(ORC_VECTORIZATION_ENABLED, String.valueOf(vectorized)); break; default: - Preconditions.checkState(!vectorized, "File format %s does not support vectorized reads", fileFormat); + Preconditions.checkState( + !vectorized, "File format %s does not support vectorized reads", fileFormat); } updateProperties.commit(); diff --git a/spark/v3.1/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkPartitioningWriters.java b/spark/v3.1/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkPartitioningWriters.java index 
4d07cfbe86ea..276d8c632fc0 100644
--- a/spark/v3.1/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkPartitioningWriters.java
+++ b/spark/v3.1/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkPartitioningWriters.java
@@ -16,7 +16,6 @@
  * specific language governing permissions and limitations
  * under the License.
  */
-
 package org.apache.iceberg.spark.source;
 
 import java.util.List;
@@ -39,9 +38,11 @@ public TestSparkPartitioningWriters(FileFormat fileFormat) {
   }
 
   @Override
-  protected FileWriterFactory<InternalRow> newWriterFactory(Schema dataSchema, List<Integer> equalityFieldIds,
-                                                            Schema equalityDeleteRowSchema,
-                                                            Schema positionDeleteRowSchema) {
+  protected FileWriterFactory<InternalRow> newWriterFactory(
+      Schema dataSchema,
+      List<Integer> equalityFieldIds,
+      Schema equalityDeleteRowSchema,
+      Schema positionDeleteRowSchema) {
     return SparkFileWriterFactory.builderFor(table)
         .dataSchema(table.schema())
         .dataFileFormat(format())
diff --git a/spark/v3.1/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkPositionDeltaWriters.java b/spark/v3.1/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkPositionDeltaWriters.java
index 480448e13a8f..245c392774f5 100644
--- a/spark/v3.1/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkPositionDeltaWriters.java
+++ b/spark/v3.1/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkPositionDeltaWriters.java
@@ -16,7 +16,6 @@
  * specific language governing permissions and limitations
  * under the License.
  */
-
 package org.apache.iceberg.spark.source;
 
 import java.util.List;
@@ -39,9 +38,11 @@ public TestSparkPositionDeltaWriters(FileFormat fileFormat) {
   }
 
   @Override
-  protected FileWriterFactory<InternalRow> newWriterFactory(Schema dataSchema, List<Integer> equalityFieldIds,
-                                                            Schema equalityDeleteRowSchema,
-                                                            Schema positionDeleteRowSchema) {
+  protected FileWriterFactory<InternalRow> newWriterFactory(
+      Schema dataSchema,
+      List<Integer> equalityFieldIds,
+      Schema equalityDeleteRowSchema,
+      Schema positionDeleteRowSchema) {
     return SparkFileWriterFactory.builderFor(table)
         .dataSchema(table.schema())
         .dataFileFormat(format())
diff --git a/spark/v3.1/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkReadProjection.java b/spark/v3.1/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkReadProjection.java
index f42b48d0e30d..7d6f0e76f78f 100644
--- a/spark/v3.1/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkReadProjection.java
+++ b/spark/v3.1/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkReadProjection.java
@@ -16,9 +16,12 @@
  * specific language governing permissions and limitations
  * under the License.
*/ - package org.apache.iceberg.spark.source; +import static org.apache.iceberg.Files.localOutput; +import static org.apache.iceberg.types.Types.NestedField.optional; +import static org.apache.iceberg.types.Types.NestedField.required; + import java.io.File; import java.io.IOException; import java.util.List; @@ -51,10 +54,6 @@ import org.junit.runner.RunWith; import org.junit.runners.Parameterized; -import static org.apache.iceberg.Files.localOutput; -import static org.apache.iceberg.types.Types.NestedField.optional; -import static org.apache.iceberg.types.Types.NestedField.required; - @RunWith(Parameterized.class) public class TestSparkReadProjection extends TestReadProjection { @@ -63,11 +62,11 @@ public class TestSparkReadProjection extends TestReadProjection { @Parameterized.Parameters(name = "format = {0}, vectorized = {1}") public static Object[][] parameters() { return new Object[][] { - { "parquet", false }, - { "parquet", true }, - { "avro", false }, - { "orc", false }, - { "orc", true } + {"parquet", false}, + {"parquet", true}, + {"avro", false}, + {"orc", false}, + {"orc", true} }; } @@ -83,14 +82,17 @@ public TestSparkReadProjection(String format, boolean vectorized) { @BeforeClass public static void startSpark() { TestSparkReadProjection.spark = SparkSession.builder().master("local[2]").getOrCreate(); - ImmutableMap config = ImmutableMap.of( - "type", "hive", - "default-namespace", "default", - "parquet-enabled", "true", - "cache-enabled", "false" - ); - spark.conf().set("spark.sql.catalog.spark_catalog", "org.apache.iceberg.spark.source.TestSparkCatalog"); - config.forEach((key, value) -> spark.conf().set("spark.sql.catalog.spark_catalog." + key, value)); + ImmutableMap config = + ImmutableMap.of( + "type", "hive", + "default-namespace", "default", + "parquet-enabled", "true", + "cache-enabled", "false"); + spark + .conf() + .set("spark.sql.catalog.spark_catalog", "org.apache.iceberg.spark.source.TestSparkCatalog"); + config.forEach( + (key, value) -> spark.conf().set("spark.sql.catalog.spark_catalog." + key, value)); } @AfterClass @@ -101,8 +103,8 @@ public static void stopSpark() { } @Override - protected Record writeAndRead(String desc, Schema writeSchema, Schema readSchema, - Record record) throws IOException { + protected Record writeAndRead(String desc, Schema writeSchema, Schema readSchema, Record record) + throws IOException { File parent = temp.newFolder(desc); File location = new File(parent, "test"); File dataFolder = new File(location, "data"); @@ -116,16 +118,17 @@ protected Record writeAndRead(String desc, Schema writeSchema, Schema readSchema // When tables are created, the column ids are reassigned. 
Schema tableSchema = table.schema(); - try (FileAppender writer = new GenericAppenderFactory(tableSchema).newAppender( - localOutput(testFile), format)) { + try (FileAppender writer = + new GenericAppenderFactory(tableSchema).newAppender(localOutput(testFile), format)) { writer.add(record); } - DataFile file = DataFiles.builder(PartitionSpec.unpartitioned()) - .withRecordCount(100) - .withFileSizeInBytes(testFile.length()) - .withPath(testFile.toString()) - .build(); + DataFile file = + DataFiles.builder(PartitionSpec.unpartitioned()) + .withRecordCount(100) + .withFileSizeInBytes(testFile.length()) + .withPath(testFile.toString()) + .build(); table.newAppend().appendFile(file).commit(); @@ -139,14 +142,16 @@ protected Record writeAndRead(String desc, Schema writeSchema, Schema readSchema Schema expectedSchema = reassignIds(readSchema, idMapping); // Set the schema to the expected schema directly to simulate the table schema evolving - TestTables.replaceMetadata(desc, - TestTables.readMetadata(desc).updateSchema(expectedSchema, 100)); + TestTables.replaceMetadata( + desc, TestTables.readMetadata(desc).updateSchema(expectedSchema, 100)); - Dataset df = spark.read() - .format("org.apache.iceberg.spark.source.TestIcebergSource") - .option("iceberg.table.name", desc) - .option(SparkReadOptions.VECTORIZATION_ENABLED, String.valueOf(vectorized)) - .load(); + Dataset df = + spark + .read() + .format("org.apache.iceberg.spark.source.TestIcebergSource") + .option("iceberg.table.name", desc) + .option(SparkReadOptions.VECTORIZATION_ENABLED, String.valueOf(vectorized)) + .load(); return SparkValueConverter.convert(readSchema, df.collectAsList().get(0)); @@ -157,87 +162,98 @@ protected Record writeAndRead(String desc, Schema writeSchema, Schema readSchema private List allIds(Schema schema) { List ids = Lists.newArrayList(); - TypeUtil.visit(schema, new TypeUtil.SchemaVisitor() { - @Override - public Void field(Types.NestedField field, Void fieldResult) { - ids.add(field.fieldId()); - return null; - } + TypeUtil.visit( + schema, + new TypeUtil.SchemaVisitor() { + @Override + public Void field(Types.NestedField field, Void fieldResult) { + ids.add(field.fieldId()); + return null; + } - @Override - public Void list(Types.ListType list, Void elementResult) { - ids.add(list.elementId()); - return null; - } + @Override + public Void list(Types.ListType list, Void elementResult) { + ids.add(list.elementId()); + return null; + } - @Override - public Void map(Types.MapType map, Void keyResult, Void valueResult) { - ids.add(map.keyId()); - ids.add(map.valueId()); - return null; - } - }); + @Override + public Void map(Types.MapType map, Void keyResult, Void valueResult) { + ids.add(map.keyId()); + ids.add(map.valueId()); + return null; + } + }); return ids; } private Schema reassignIds(Schema schema, Map idMapping) { - return new Schema(TypeUtil.visit(schema, new TypeUtil.SchemaVisitor() { - private int mapId(int id) { - if (idMapping.containsKey(id)) { - return idMapping.get(id); - } - return 1000 + id; // make sure the new IDs don't conflict with reassignment - } + return new Schema( + TypeUtil.visit( + schema, + new TypeUtil.SchemaVisitor() { + private int mapId(int id) { + if (idMapping.containsKey(id)) { + return idMapping.get(id); + } + return 1000 + id; // make sure the new IDs don't conflict with reassignment + } - @Override - public Type schema(Schema schema, Type structResult) { - return structResult; - } + @Override + public Type schema(Schema schema, Type structResult) { + return structResult; + } - 
@Override - public Type struct(Types.StructType struct, List fieldResults) { - List newFields = Lists.newArrayListWithExpectedSize(fieldResults.size()); - List fields = struct.fields(); - for (int i = 0; i < fields.size(); i += 1) { - Types.NestedField field = fields.get(i); - if (field.isOptional()) { - newFields.add(optional(mapId(field.fieldId()), field.name(), fieldResults.get(i))); - } else { - newFields.add(required(mapId(field.fieldId()), field.name(), fieldResults.get(i))); - } - } - return Types.StructType.of(newFields); - } + @Override + public Type struct(Types.StructType struct, List fieldResults) { + List newFields = + Lists.newArrayListWithExpectedSize(fieldResults.size()); + List fields = struct.fields(); + for (int i = 0; i < fields.size(); i += 1) { + Types.NestedField field = fields.get(i); + if (field.isOptional()) { + newFields.add( + optional(mapId(field.fieldId()), field.name(), fieldResults.get(i))); + } else { + newFields.add( + required(mapId(field.fieldId()), field.name(), fieldResults.get(i))); + } + } + return Types.StructType.of(newFields); + } - @Override - public Type field(Types.NestedField field, Type fieldResult) { - return fieldResult; - } + @Override + public Type field(Types.NestedField field, Type fieldResult) { + return fieldResult; + } - @Override - public Type list(Types.ListType list, Type elementResult) { - if (list.isElementOptional()) { - return Types.ListType.ofOptional(mapId(list.elementId()), elementResult); - } else { - return Types.ListType.ofRequired(mapId(list.elementId()), elementResult); - } - } + @Override + public Type list(Types.ListType list, Type elementResult) { + if (list.isElementOptional()) { + return Types.ListType.ofOptional(mapId(list.elementId()), elementResult); + } else { + return Types.ListType.ofRequired(mapId(list.elementId()), elementResult); + } + } - @Override - public Type map(Types.MapType map, Type keyResult, Type valueResult) { - if (map.isValueOptional()) { - return Types.MapType.ofOptional( - mapId(map.keyId()), mapId(map.valueId()), keyResult, valueResult); - } else { - return Types.MapType.ofRequired( - mapId(map.keyId()), mapId(map.valueId()), keyResult, valueResult); - } - } + @Override + public Type map(Types.MapType map, Type keyResult, Type valueResult) { + if (map.isValueOptional()) { + return Types.MapType.ofOptional( + mapId(map.keyId()), mapId(map.valueId()), keyResult, valueResult); + } else { + return Types.MapType.ofRequired( + mapId(map.keyId()), mapId(map.valueId()), keyResult, valueResult); + } + } - @Override - public Type primitive(Type.PrimitiveType primitive) { - return primitive; - } - }).asNestedType().asStructType().fields()); + @Override + public Type primitive(Type.PrimitiveType primitive) { + return primitive; + } + }) + .asNestedType() + .asStructType() + .fields()); } } diff --git a/spark/v3.1/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkReaderDeletes.java b/spark/v3.1/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkReaderDeletes.java index e543a408e8ce..462f34530725 100644 --- a/spark/v3.1/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkReaderDeletes.java +++ b/spark/v3.1/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkReaderDeletes.java @@ -16,9 +16,10 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.spark.source; +import static org.apache.hadoop.hive.conf.HiveConf.ConfVars.METASTOREURIS; + import java.io.IOException; import java.util.List; import org.apache.hadoop.hive.conf.HiveConf; @@ -60,8 +61,6 @@ import org.junit.BeforeClass; import org.junit.Test; -import static org.apache.hadoop.hive.conf.HiveConf.ConfVars.METASTOREURIS; - public class TestSparkReaderDeletes extends DeleteReadTests { private static TestHiveMetastore metastore = null; @@ -74,15 +73,18 @@ public static void startMetastoreAndSpark() { metastore.start(); HiveConf hiveConf = metastore.hiveConf(); - spark = SparkSession.builder() - .master("local[2]") - .config(SQLConf.PARTITION_OVERWRITE_MODE().key(), "dynamic") - .config("spark.hadoop." + METASTOREURIS.varname, hiveConf.get(METASTOREURIS.varname)) - .enableHiveSupport() - .getOrCreate(); + spark = + SparkSession.builder() + .master("local[2]") + .config(SQLConf.PARTITION_OVERWRITE_MODE().key(), "dynamic") + .config("spark.hadoop." + METASTOREURIS.varname, hiveConf.get(METASTOREURIS.varname)) + .enableHiveSupport() + .getOrCreate(); - catalog = (HiveCatalog) - CatalogUtil.loadCatalog(HiveCatalog.class.getName(), "hive", ImmutableMap.of(), hiveConf); + catalog = + (HiveCatalog) + CatalogUtil.loadCatalog( + HiveCatalog.class.getName(), "hive", ImmutableMap.of(), hiveConf); try { catalog.createNamespace(Namespace.of("default")); @@ -117,17 +119,21 @@ protected void dropTable(String name) { @Override public StructLikeSet rowSet(String name, Table table, String... columns) { - Dataset df = spark.read() - .format("iceberg") - .load(TableIdentifier.of("default", name).toString()) - .selectExpr(columns); + Dataset df = + spark + .read() + .format("iceberg") + .load(TableIdentifier.of("default", name).toString()) + .selectExpr(columns); Types.StructType projection = table.schema().select(columns).asStruct(); StructLikeSet set = StructLikeSet.create(projection); - df.collectAsList().forEach(row -> { - SparkStructLike rowWrapper = new SparkStructLike(projection); - set.add(rowWrapper.wrap(row)); - }); + df.collectAsList() + .forEach( + row -> { + SparkStructLike rowWrapper = new SparkStructLike(projection); + set.add(rowWrapper.wrap(row)); + }); return set; } @@ -137,31 +143,39 @@ public void testEqualityDeleteWithFilter() throws IOException { String tableName = table.name().substring(table.name().lastIndexOf(".") + 1); Schema deleteRowSchema = table.schema().select("data"); Record dataDelete = GenericRecord.create(deleteRowSchema); - List dataDeletes = Lists.newArrayList( - dataDelete.copy("data", "a"), // id = 29 - dataDelete.copy("data", "d"), // id = 89 - dataDelete.copy("data", "g") // id = 122 - ); - - DeleteFile eqDeletes = FileHelpers.writeDeleteFile( - table, Files.localOutput(temp.newFile()), TestHelpers.Row.of(0), dataDeletes, deleteRowSchema); - - table.newRowDelta() - .addDeletes(eqDeletes) - .commit(); + List dataDeletes = + Lists.newArrayList( + dataDelete.copy("data", "a"), // id = 29 + dataDelete.copy("data", "d"), // id = 89 + dataDelete.copy("data", "g") // id = 122 + ); + + DeleteFile eqDeletes = + FileHelpers.writeDeleteFile( + table, + Files.localOutput(temp.newFile()), + TestHelpers.Row.of(0), + dataDeletes, + deleteRowSchema); + + table.newRowDelta().addDeletes(eqDeletes).commit(); Types.StructType projection = table.schema().select("*").asStruct(); - Dataset df = spark.read() - .format("iceberg") - .load(TableIdentifier.of("default", tableName).toString()) - .filter("data = 'a'") // select a deleted row - 
.selectExpr("*"); + Dataset df = + spark + .read() + .format("iceberg") + .load(TableIdentifier.of("default", tableName).toString()) + .filter("data = 'a'") // select a deleted row + .selectExpr("*"); StructLikeSet actual = StructLikeSet.create(projection); - df.collectAsList().forEach(row -> { - SparkStructLike rowWrapper = new SparkStructLike(projection); - actual.add(rowWrapper.wrap(row)); - }); + df.collectAsList() + .forEach( + row -> { + SparkStructLike rowWrapper = new SparkStructLike(projection); + actual.add(rowWrapper.wrap(row)); + }); Assert.assertEquals("Table should contain no rows", 0, actual.size()); } @@ -170,44 +184,57 @@ public void testEqualityDeleteWithFilter() throws IOException { public void testReadEqualityDeleteRows() throws IOException { Schema deleteSchema1 = table.schema().select("data"); Record dataDelete = GenericRecord.create(deleteSchema1); - List dataDeletes = Lists.newArrayList( - dataDelete.copy("data", "a"), // id = 29 - dataDelete.copy("data", "d") // id = 89 - ); + List dataDeletes = + Lists.newArrayList( + dataDelete.copy("data", "a"), // id = 29 + dataDelete.copy("data", "d") // id = 89 + ); Schema deleteSchema2 = table.schema().select("id"); Record idDelete = GenericRecord.create(deleteSchema2); - List idDeletes = Lists.newArrayList( - idDelete.copy("id", 121), // id = 121 - idDelete.copy("id", 122) // id = 122 - ); - - DeleteFile eqDelete1 = FileHelpers.writeDeleteFile( - table, Files.localOutput(temp.newFile()), TestHelpers.Row.of(0), dataDeletes, deleteSchema1); - - DeleteFile eqDelete2 = FileHelpers.writeDeleteFile( - table, Files.localOutput(temp.newFile()), TestHelpers.Row.of(0), idDeletes, deleteSchema2); - - table.newRowDelta() - .addDeletes(eqDelete1) - .addDeletes(eqDelete2) - .commit(); + List idDeletes = + Lists.newArrayList( + idDelete.copy("id", 121), // id = 121 + idDelete.copy("id", 122) // id = 122 + ); + + DeleteFile eqDelete1 = + FileHelpers.writeDeleteFile( + table, + Files.localOutput(temp.newFile()), + TestHelpers.Row.of(0), + dataDeletes, + deleteSchema1); + + DeleteFile eqDelete2 = + FileHelpers.writeDeleteFile( + table, + Files.localOutput(temp.newFile()), + TestHelpers.Row.of(0), + idDeletes, + deleteSchema2); + + table.newRowDelta().addDeletes(eqDelete1).addDeletes(eqDelete2).commit(); StructLikeSet expectedRowSet = rowSetWithIds(29, 89, 121, 122); Types.StructType type = table.schema().asStruct(); StructLikeSet actualRowSet = StructLikeSet.create(type); - CloseableIterable tasks = TableScanUtil.planTasks( - table.newScan().planFiles(), - TableProperties.METADATA_SPLIT_SIZE_DEFAULT, - TableProperties.SPLIT_LOOKBACK_DEFAULT, - TableProperties.SPLIT_OPEN_FILE_COST_DEFAULT); + CloseableIterable tasks = + TableScanUtil.planTasks( + table.newScan().planFiles(), + TableProperties.METADATA_SPLIT_SIZE_DEFAULT, + TableProperties.SPLIT_LOOKBACK_DEFAULT, + TableProperties.SPLIT_OPEN_FILE_COST_DEFAULT); for (CombinedScanTask task : tasks) { - try (EqualityDeleteRowReader reader = new EqualityDeleteRowReader(task, table, table.schema(), false)) { + try (EqualityDeleteRowReader reader = + new EqualityDeleteRowReader(task, table, table.schema(), false)) { while (reader.next()) { - actualRowSet.add(new InternalRowWrapper(SparkSchemaUtil.convert(table.schema())).wrap(reader.get().copy())); + actualRowSet.add( + new InternalRowWrapper(SparkSchemaUtil.convert(table.schema())) + .wrap(reader.get().copy())); } } } diff --git a/spark/v3.1/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkRollingFileWriters.java 
b/spark/v3.1/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkRollingFileWriters.java index 9023195dcc6a..dcf9140a8885 100644 --- a/spark/v3.1/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkRollingFileWriters.java +++ b/spark/v3.1/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkRollingFileWriters.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.source; import java.util.List; @@ -36,9 +35,11 @@ public TestSparkRollingFileWriters(FileFormat fileFormat, boolean partitioned) { } @Override - protected FileWriterFactory newWriterFactory(Schema dataSchema, List equalityFieldIds, - Schema equalityDeleteRowSchema, - Schema positionDeleteRowSchema) { + protected FileWriterFactory newWriterFactory( + Schema dataSchema, + List equalityFieldIds, + Schema equalityDeleteRowSchema, + Schema positionDeleteRowSchema) { return SparkFileWriterFactory.builderFor(table) .dataSchema(table.schema()) .dataFileFormat(format()) diff --git a/spark/v3.1/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkTable.java b/spark/v3.1/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkTable.java index 1b4fb5f8ce58..616a196872de 100644 --- a/spark/v3.1/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkTable.java +++ b/spark/v3.1/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkTable.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.source; import java.util.Map; diff --git a/spark/v3.1/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkWriterMetrics.java b/spark/v3.1/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkWriterMetrics.java index 967f394faa74..06ecc20c2fc3 100644 --- a/spark/v3.1/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkWriterMetrics.java +++ b/spark/v3.1/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkWriterMetrics.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.source; import org.apache.iceberg.FileFormat; diff --git a/spark/v3.1/spark/src/test/java/org/apache/iceberg/spark/source/TestStreamingOffset.java b/spark/v3.1/spark/src/test/java/org/apache/iceberg/spark/source/TestStreamingOffset.java index 69302e9d24d7..17370aaa22f2 100644 --- a/spark/v3.1/spark/src/test/java/org/apache/iceberg/spark/source/TestStreamingOffset.java +++ b/spark/v3.1/spark/src/test/java/org/apache/iceberg/spark/source/TestStreamingOffset.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.spark.source; import com.fasterxml.jackson.databind.node.ObjectNode; @@ -29,14 +28,17 @@ public class TestStreamingOffset { @Test public void testJsonConversion() { - StreamingOffset[] expected = new StreamingOffset[]{ - new StreamingOffset(System.currentTimeMillis(), 1L, false), - new StreamingOffset(System.currentTimeMillis(), 2L, false), - new StreamingOffset(System.currentTimeMillis(), 3L, false), - new StreamingOffset(System.currentTimeMillis(), 4L, true) - }; - Assert.assertArrayEquals("StreamingOffsets should match", expected, - Arrays.stream(expected).map(elem -> StreamingOffset.fromJson(elem.json())).toArray()); + StreamingOffset[] expected = + new StreamingOffset[] { + new StreamingOffset(System.currentTimeMillis(), 1L, false), + new StreamingOffset(System.currentTimeMillis(), 2L, false), + new StreamingOffset(System.currentTimeMillis(), 3L, false), + new StreamingOffset(System.currentTimeMillis(), 4L, true) + }; + Assert.assertArrayEquals( + "StreamingOffsets should match", + expected, + Arrays.stream(expected).map(elem -> StreamingOffset.fromJson(elem.json())).toArray()); } @Test diff --git a/spark/v3.1/spark/src/test/java/org/apache/iceberg/spark/source/TestStructuredStreaming.java b/spark/v3.1/spark/src/test/java/org/apache/iceberg/spark/source/TestStructuredStreaming.java index 4225747dcab4..73e95d102875 100644 --- a/spark/v3.1/spark/src/test/java/org/apache/iceberg/spark/source/TestStructuredStreaming.java +++ b/spark/v3.1/spark/src/test/java/org/apache/iceberg/spark/source/TestStructuredStreaming.java @@ -16,9 +16,10 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.source; +import static org.apache.iceberg.types.Types.NestedField.optional; + import java.io.File; import java.util.List; import org.apache.hadoop.conf.Configuration; @@ -49,28 +50,24 @@ import scala.Option; import scala.collection.JavaConversions; -import static org.apache.iceberg.types.Types.NestedField.optional; - public class TestStructuredStreaming { private static final Configuration CONF = new Configuration(); - private static final Schema SCHEMA = new Schema( - optional(1, "id", Types.IntegerType.get()), - optional(2, "data", Types.StringType.get()) - ); + private static final Schema SCHEMA = + new Schema( + optional(1, "id", Types.IntegerType.get()), optional(2, "data", Types.StringType.get())); private static SparkSession spark = null; - @Rule - public TemporaryFolder temp = new TemporaryFolder(); - @Rule - public ExpectedException exceptionRule = ExpectedException.none(); + @Rule public TemporaryFolder temp = new TemporaryFolder(); + @Rule public ExpectedException exceptionRule = ExpectedException.none(); @BeforeClass public static void startSpark() { - TestStructuredStreaming.spark = SparkSession.builder() - .master("local[2]") - .config("spark.sql.shuffle.partitions", 4) - .getOrCreate(); + TestStructuredStreaming.spark = + SparkSession.builder() + .master("local[2]") + .config("spark.sql.shuffle.partitions", 4) + .getOrCreate(); } @AfterClass @@ -90,21 +87,23 @@ public void testStreamingWriteAppendMode() throws Exception { PartitionSpec spec = PartitionSpec.builderFor(SCHEMA).identity("data").build(); Table table = tables.create(SCHEMA, spec, location.toString()); - List expected = Lists.newArrayList( - new SimpleRecord(1, "1"), - new SimpleRecord(2, "2"), - new SimpleRecord(3, "3"), - new SimpleRecord(4, "4") - ); + List expected = + Lists.newArrayList( + new SimpleRecord(1, "1"), + new 
SimpleRecord(2, "2"), + new SimpleRecord(3, "3"), + new SimpleRecord(4, "4")); MemoryStream inputStream = newMemoryStream(1, spark.sqlContext(), Encoders.INT()); - DataStreamWriter streamWriter = inputStream.toDF() - .selectExpr("value AS id", "CAST (value AS STRING) AS data") - .writeStream() - .outputMode("append") - .format("iceberg") - .option("checkpointLocation", checkpoint.toString()) - .option("path", location.toString()); + DataStreamWriter streamWriter = + inputStream + .toDF() + .selectExpr("value AS id", "CAST (value AS STRING) AS data") + .writeStream() + .outputMode("append") + .format("iceberg") + .option("checkpointLocation", checkpoint.toString()) + .option("path", location.toString()); try { // start the original query with checkpointing @@ -126,10 +125,9 @@ public void testStreamingWriteAppendMode() throws Exception { restartedQuery.processAllAvailable(); // ensure the write was idempotent - Dataset result = spark.read() - .format("iceberg") - .load(location.toString()); - List actual = result.orderBy("id").as(Encoders.bean(SimpleRecord.class)).collectAsList(); + Dataset result = spark.read().format("iceberg").load(location.toString()); + List actual = + result.orderBy("id").as(Encoders.bean(SimpleRecord.class)).collectAsList(); Assert.assertEquals("Number of rows should match", expected.size(), actual.size()); Assert.assertEquals("Result rows should match", expected, actual); Assert.assertEquals("Number of snapshots should match", 2, Iterables.size(table.snapshots())); @@ -150,22 +148,22 @@ public void testStreamingWriteCompleteMode() throws Exception { PartitionSpec spec = PartitionSpec.builderFor(SCHEMA).identity("data").build(); Table table = tables.create(SCHEMA, spec, location.toString()); - List expected = Lists.newArrayList( - new SimpleRecord(2, "1"), - new SimpleRecord(3, "2"), - new SimpleRecord(1, "3") - ); + List expected = + Lists.newArrayList( + new SimpleRecord(2, "1"), new SimpleRecord(3, "2"), new SimpleRecord(1, "3")); MemoryStream inputStream = newMemoryStream(1, spark.sqlContext(), Encoders.INT()); - DataStreamWriter streamWriter = inputStream.toDF() - .groupBy("value") - .count() - .selectExpr("CAST(count AS INT) AS id", "CAST (value AS STRING) AS data") - .writeStream() - .outputMode("complete") - .format("iceberg") - .option("checkpointLocation", checkpoint.toString()) - .option("path", location.toString()); + DataStreamWriter streamWriter = + inputStream + .toDF() + .groupBy("value") + .count() + .selectExpr("CAST(count AS INT) AS id", "CAST (value AS STRING) AS data") + .writeStream() + .outputMode("complete") + .format("iceberg") + .option("checkpointLocation", checkpoint.toString()) + .option("path", location.toString()); try { // start the original query with checkpointing @@ -187,10 +185,9 @@ public void testStreamingWriteCompleteMode() throws Exception { restartedQuery.processAllAvailable(); // ensure the write was idempotent - Dataset result = spark.read() - .format("iceberg") - .load(location.toString()); - List actual = result.orderBy("data").as(Encoders.bean(SimpleRecord.class)).collectAsList(); + Dataset result = spark.read().format("iceberg").load(location.toString()); + List actual = + result.orderBy("data").as(Encoders.bean(SimpleRecord.class)).collectAsList(); Assert.assertEquals("Number of rows should match", expected.size(), actual.size()); Assert.assertEquals("Result rows should match", expected, actual); Assert.assertEquals("Number of snapshots should match", 2, Iterables.size(table.snapshots())); @@ -211,22 +208,22 @@ public 
void testStreamingWriteCompleteModeWithProjection() throws Exception { PartitionSpec spec = PartitionSpec.unpartitioned(); Table table = tables.create(SCHEMA, spec, location.toString()); - List expected = Lists.newArrayList( - new SimpleRecord(1, null), - new SimpleRecord(2, null), - new SimpleRecord(3, null) - ); + List expected = + Lists.newArrayList( + new SimpleRecord(1, null), new SimpleRecord(2, null), new SimpleRecord(3, null)); MemoryStream inputStream = newMemoryStream(1, spark.sqlContext(), Encoders.INT()); - DataStreamWriter streamWriter = inputStream.toDF() - .groupBy("value") - .count() - .selectExpr("CAST(count AS INT) AS id") // select only id column - .writeStream() - .outputMode("complete") - .format("iceberg") - .option("checkpointLocation", checkpoint.toString()) - .option("path", location.toString()); + DataStreamWriter streamWriter = + inputStream + .toDF() + .groupBy("value") + .count() + .selectExpr("CAST(count AS INT) AS id") // select only id column + .writeStream() + .outputMode("complete") + .format("iceberg") + .option("checkpointLocation", checkpoint.toString()) + .option("path", location.toString()); try { // start the original query with checkpointing @@ -248,10 +245,9 @@ public void testStreamingWriteCompleteModeWithProjection() throws Exception { restartedQuery.processAllAvailable(); // ensure the write was idempotent - Dataset result = spark.read() - .format("iceberg") - .load(location.toString()); - List actual = result.orderBy("id").as(Encoders.bean(SimpleRecord.class)).collectAsList(); + Dataset result = spark.read().format("iceberg").load(location.toString()); + List actual = + result.orderBy("id").as(Encoders.bean(SimpleRecord.class)).collectAsList(); Assert.assertEquals("Number of rows should match", expected.size(), actual.size()); Assert.assertEquals("Result rows should match", expected, actual); Assert.assertEquals("Number of snapshots should match", 2, Iterables.size(table.snapshots())); @@ -275,13 +271,15 @@ public void testStreamingWriteUpdateMode() throws Exception { tables.create(SCHEMA, spec, location.toString()); MemoryStream inputStream = newMemoryStream(1, spark.sqlContext(), Encoders.INT()); - DataStreamWriter streamWriter = inputStream.toDF() - .selectExpr("value AS id", "CAST (value AS STRING) AS data") - .writeStream() - .outputMode("update") - .format("iceberg") - .option("checkpointLocation", checkpoint.toString()) - .option("path", location.toString()); + DataStreamWriter streamWriter = + inputStream + .toDF() + .selectExpr("value AS id", "CAST (value AS STRING) AS data") + .writeStream() + .outputMode("update") + .format("iceberg") + .option("checkpointLocation", checkpoint.toString()) + .option("path", location.toString()); try { StreamingQuery query = streamWriter.start(); diff --git a/spark/v3.1/spark/src/test/java/org/apache/iceberg/spark/source/TestStructuredStreamingRead3.java b/spark/v3.1/spark/src/test/java/org/apache/iceberg/spark/source/TestStructuredStreamingRead3.java index e609412f8be0..23fdfb09cb83 100644 --- a/spark/v3.1/spark/src/test/java/org/apache/iceberg/spark/source/TestStructuredStreamingRead3.java +++ b/spark/v3.1/spark/src/test/java/org/apache/iceberg/spark/source/TestStructuredStreamingRead3.java @@ -16,9 +16,10 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.spark.source; +import static org.apache.iceberg.expressions.Expressions.ref; + import java.io.File; import java.util.Collections; import java.util.List; @@ -58,8 +59,6 @@ import org.junit.runner.RunWith; import org.junit.runners.Parameterized; -import static org.apache.iceberg.expressions.Expressions.ref; - @RunWith(Parameterized.class) public final class TestStructuredStreamingRead3 extends SparkCatalogTestBase { public TestStructuredStreamingRead3( @@ -70,59 +69,49 @@ public TestStructuredStreamingRead3( private Table table; /** - * test data to be used by multiple writes - * each write creates a snapshot and writes a list of records + * test data to be used by multiple writes each write creates a snapshot and writes a list of + * records */ - private static final List> TEST_DATA_MULTIPLE_SNAPSHOTS = Lists.newArrayList( - Lists.newArrayList( - new SimpleRecord(1, "one"), - new SimpleRecord(2, "two"), - new SimpleRecord(3, "three")), + private static final List> TEST_DATA_MULTIPLE_SNAPSHOTS = Lists.newArrayList( - new SimpleRecord(4, "four"), - new SimpleRecord(5, "five")), - Lists.newArrayList( - new SimpleRecord(6, "six"), - new SimpleRecord(7, "seven"))); + Lists.newArrayList( + new SimpleRecord(1, "one"), new SimpleRecord(2, "two"), new SimpleRecord(3, "three")), + Lists.newArrayList(new SimpleRecord(4, "four"), new SimpleRecord(5, "five")), + Lists.newArrayList(new SimpleRecord(6, "six"), new SimpleRecord(7, "seven"))); /** - * test data - to be used for multiple write batches - * each batch inturn will have multiple snapshots + * test data - to be used for multiple write batches each batch inturn will have multiple + * snapshots */ - private static final List>> TEST_DATA_MULTIPLE_WRITES_MULTIPLE_SNAPSHOTS = Lists.newArrayList( - Lists.newArrayList( - Lists.newArrayList( - new SimpleRecord(1, "one"), - new SimpleRecord(2, "two"), - new SimpleRecord(3, "three")), - Lists.newArrayList( - new SimpleRecord(4, "four"), - new SimpleRecord(5, "five"))), + private static final List>> TEST_DATA_MULTIPLE_WRITES_MULTIPLE_SNAPSHOTS = Lists.newArrayList( Lists.newArrayList( - new SimpleRecord(6, "six"), - new SimpleRecord(7, "seven")), + Lists.newArrayList( + new SimpleRecord(1, "one"), + new SimpleRecord(2, "two"), + new SimpleRecord(3, "three")), + Lists.newArrayList(new SimpleRecord(4, "four"), new SimpleRecord(5, "five"))), Lists.newArrayList( - new SimpleRecord(8, "eight"), - new SimpleRecord(9, "nine"))), - Lists.newArrayList( + Lists.newArrayList(new SimpleRecord(6, "six"), new SimpleRecord(7, "seven")), + Lists.newArrayList(new SimpleRecord(8, "eight"), new SimpleRecord(9, "nine"))), Lists.newArrayList( - new SimpleRecord(10, "ten"), - new SimpleRecord(11, "eleven"), - new SimpleRecord(12, "twelve")), - Lists.newArrayList( - new SimpleRecord(13, "thirteen"), - new SimpleRecord(14, "fourteen")), - Lists.newArrayList( - new SimpleRecord(15, "fifteen"), - new SimpleRecord(16, "sixteen")))); + Lists.newArrayList( + new SimpleRecord(10, "ten"), + new SimpleRecord(11, "eleven"), + new SimpleRecord(12, "twelve")), + Lists.newArrayList( + new SimpleRecord(13, "thirteen"), new SimpleRecord(14, "fourteen")), + Lists.newArrayList( + new SimpleRecord(15, "fifteen"), new SimpleRecord(16, "sixteen")))); @Before public void setupTable() { - sql("CREATE TABLE %s " + - "(id INT, data STRING) " + - "USING iceberg " + - "PARTITIONED BY (bucket(3, id))", tableName); + sql( + "CREATE TABLE %s " + + "(id INT, data STRING) " + + "USING iceberg " + + "PARTITIONED BY (bucket(3, 
id))", + tableName); this.table = validationCatalog.loadTable(tableIdent); } @@ -163,17 +152,19 @@ public void testReadStreamOnIcebergThenAddData() throws Exception { @Test public void testReadingStreamFromTimestamp() throws Exception { - List dataBeforeTimestamp = Lists.newArrayList( - new SimpleRecord(-2, "minustwo"), - new SimpleRecord(-1, "minusone"), - new SimpleRecord(0, "zero")); + List dataBeforeTimestamp = + Lists.newArrayList( + new SimpleRecord(-2, "minustwo"), + new SimpleRecord(-1, "minusone"), + new SimpleRecord(0, "zero")); appendData(dataBeforeTimestamp); table.refresh(); long streamStartTimestamp = table.currentSnapshot().timestampMillis() + 1; - StreamingQuery query = startStream(SparkReadOptions.STREAM_FROM_TIMESTAMP, Long.toString(streamStartTimestamp)); + StreamingQuery query = + startStream(SparkReadOptions.STREAM_FROM_TIMESTAMP, Long.toString(streamStartTimestamp)); List empty = rowsAvailable(query); Assertions.assertThat(empty.isEmpty()).isTrue(); @@ -190,21 +181,25 @@ public void testReadingStreamFromTimestamp() throws Exception { public void testReadingStreamFromFutureTimetsamp() throws Exception { long futureTimestamp = System.currentTimeMillis() + 10000; - StreamingQuery query = startStream(SparkReadOptions.STREAM_FROM_TIMESTAMP, Long.toString(futureTimestamp)); + StreamingQuery query = + startStream(SparkReadOptions.STREAM_FROM_TIMESTAMP, Long.toString(futureTimestamp)); List actual = rowsAvailable(query); Assertions.assertThat(actual.isEmpty()).isTrue(); - List data = Lists.newArrayList( - new SimpleRecord(-2, "minustwo"), - new SimpleRecord(-1, "minusone"), - new SimpleRecord(0, "zero")); + List data = + Lists.newArrayList( + new SimpleRecord(-2, "minustwo"), + new SimpleRecord(-1, "minusone"), + new SimpleRecord(0, "zero")); // Perform several inserts that should not show up because the fromTimestamp has not elapsed - IntStream.range(0, 3).forEach(x -> { - appendData(data); - Assertions.assertThat(rowsAvailable(query).isEmpty()).isTrue(); - }); + IntStream.range(0, 3) + .forEach( + x -> { + appendData(data); + Assertions.assertThat(rowsAvailable(query).isEmpty()).isTrue(); + }); waitUntilAfter(futureTimestamp); @@ -216,16 +211,16 @@ public void testReadingStreamFromFutureTimetsamp() throws Exception { @Test public void testReadingStreamFromTimestampFutureWithExistingSnapshots() throws Exception { - List dataBeforeTimestamp = Lists.newArrayList( - new SimpleRecord(1, "one"), - new SimpleRecord(2, "two"), - new SimpleRecord(3, "three")); + List dataBeforeTimestamp = + Lists.newArrayList( + new SimpleRecord(1, "one"), new SimpleRecord(2, "two"), new SimpleRecord(3, "three")); appendData(dataBeforeTimestamp); long streamStartTimestamp = System.currentTimeMillis() + 2000; // Start the stream with a future timestamp after the current snapshot - StreamingQuery query = startStream(SparkReadOptions.STREAM_FROM_TIMESTAMP, Long.toString(streamStartTimestamp)); + StreamingQuery query = + startStream(SparkReadOptions.STREAM_FROM_TIMESTAMP, Long.toString(streamStartTimestamp)); List actual = rowsAvailable(query); Assert.assertEquals(Collections.emptyList(), actual); @@ -233,7 +228,8 @@ public void testReadingStreamFromTimestampFutureWithExistingSnapshots() throws E waitUntilAfter(streamStartTimestamp); List> expected = TEST_DATA_MULTIPLE_SNAPSHOTS; appendDataAsMultipleSnapshots(expected); - Assertions.assertThat(rowsAvailable(query)).containsExactlyInAnyOrderElementsOf(Iterables.concat(expected)); + Assertions.assertThat(rowsAvailable(query)) + 
.containsExactlyInAnyOrderElementsOf(Iterables.concat(expected)); } @Test @@ -246,7 +242,8 @@ public void testReadingStreamFromTimestampOfExistingSnapshot() throws Exception long firstSnapshotTime = table.currentSnapshot().timestampMillis(); // Start stream giving the first Snapshot's time as the start point - StreamingQuery stream = startStream(SparkReadOptions.STREAM_FROM_TIMESTAMP, Long.toString(firstSnapshotTime)); + StreamingQuery stream = + startStream(SparkReadOptions.STREAM_FROM_TIMESTAMP, Long.toString(firstSnapshotTime)); // Append rest of expected data for (int i = 1; i < expected.size(); i++) { @@ -259,14 +256,11 @@ public void testReadingStreamFromTimestampOfExistingSnapshot() throws Exception @Test public void testReadingStreamWithExpiredSnapshotFromTimestamp() throws TimeoutException { - List firstSnapshotRecordList = Lists.newArrayList( - new SimpleRecord(1, "one")); + List firstSnapshotRecordList = Lists.newArrayList(new SimpleRecord(1, "one")); - List secondSnapshotRecordList = Lists.newArrayList( - new SimpleRecord(2, "two")); + List secondSnapshotRecordList = Lists.newArrayList(new SimpleRecord(2, "two")); - List thirdSnapshotRecordList = Lists.newArrayList( - new SimpleRecord(3, "three")); + List thirdSnapshotRecordList = Lists.newArrayList(new SimpleRecord(3, "three")); List expectedRecordList = Lists.newArrayList(); expectedRecordList.addAll(secondSnapshotRecordList); @@ -277,13 +271,14 @@ public void testReadingStreamWithExpiredSnapshotFromTimestamp() throws TimeoutEx long firstSnapshotid = table.currentSnapshot().snapshotId(); long firstSnapshotCommitTime = table.currentSnapshot().timestampMillis(); - appendData(secondSnapshotRecordList); appendData(thirdSnapshotRecordList); table.expireSnapshots().expireSnapshotId(firstSnapshotid).commit(); - StreamingQuery query = startStream(SparkReadOptions.STREAM_FROM_TIMESTAMP, String.valueOf(firstSnapshotCommitTime)); + StreamingQuery query = + startStream( + SparkReadOptions.STREAM_FROM_TIMESTAMP, String.valueOf(firstSnapshotCommitTime)); List actual = rowsAvailable(query); Assertions.assertThat(actual).containsExactlyInAnyOrderElementsOf(expectedRecordList); } @@ -294,21 +289,24 @@ public void testResumingStreamReadFromCheckpoint() throws Exception { File writerCheckpoint = new File(writerCheckpointFolder, "writer-checkpoint"); File output = temp.newFolder(); - DataStreamWriter querySource = spark.readStream() - .format("iceberg") - .load(tableName) - .writeStream() - .option("checkpointLocation", writerCheckpoint.toString()) - .format("parquet") - .queryName("checkpoint_test") - .option("path", output.getPath()); + DataStreamWriter querySource = + spark + .readStream() + .format("iceberg") + .load(tableName) + .writeStream() + .option("checkpointLocation", writerCheckpoint.toString()) + .format("parquet") + .queryName("checkpoint_test") + .option("path", output.getPath()); StreamingQuery startQuery = querySource.start(); startQuery.processAllAvailable(); startQuery.stop(); List expected = Lists.newArrayList(); - for (List> expectedCheckpoint : TEST_DATA_MULTIPLE_WRITES_MULTIPLE_SNAPSHOTS) { + for (List> expectedCheckpoint : + TEST_DATA_MULTIPLE_WRITES_MULTIPLE_SNAPSHOTS) { // New data was added while the stream was down appendDataAsMultipleSnapshots(expectedCheckpoint); expected.addAll(Lists.newArrayList(Iterables.concat(Iterables.concat(expectedCheckpoint)))); @@ -319,28 +317,23 @@ public void testResumingStreamReadFromCheckpoint() throws Exception { restartedQuery.stop(); // Read data added by the stream - List actual = 
spark.read() - .load(output.getPath()) - .as(Encoders.bean(SimpleRecord.class)) - .collectAsList(); + List actual = + spark.read().load(output.getPath()).as(Encoders.bean(SimpleRecord.class)).collectAsList(); Assertions.assertThat(actual).containsExactlyInAnyOrderElementsOf(Iterables.concat(expected)); } } @Test public void testParquetOrcAvroDataInOneTable() throws Exception { - List parquetFileRecords = Lists.newArrayList( - new SimpleRecord(1, "one"), - new SimpleRecord(2, "two"), - new SimpleRecord(3, "three")); + List parquetFileRecords = + Lists.newArrayList( + new SimpleRecord(1, "one"), new SimpleRecord(2, "two"), new SimpleRecord(3, "three")); - List orcFileRecords = Lists.newArrayList( - new SimpleRecord(4, "four"), - new SimpleRecord(5, "five")); + List orcFileRecords = + Lists.newArrayList(new SimpleRecord(4, "four"), new SimpleRecord(5, "five")); - List avroFileRecords = Lists.newArrayList( - new SimpleRecord(6, "six"), - new SimpleRecord(7, "seven")); + List avroFileRecords = + Lists.newArrayList(new SimpleRecord(6, "six"), new SimpleRecord(7, "seven")); appendData(parquetFileRecords); appendData(orcFileRecords, "orc"); @@ -348,7 +341,8 @@ public void testParquetOrcAvroDataInOneTable() throws Exception { StreamingQuery query = startStream(); Assertions.assertThat(rowsAvailable(query)) - .containsExactlyInAnyOrderElementsOf(Iterables.concat(parquetFileRecords, orcFileRecords, avroFileRecords)); + .containsExactlyInAnyOrderElementsOf( + Iterables.concat(parquetFileRecords, orcFileRecords, avroFileRecords)); } @Test @@ -371,18 +365,23 @@ public void testReadStreamWithSnapshotTypeOverwriteErrorsOut() throws Exception Schema deleteRowSchema = table.schema().select("data"); Record dataDelete = GenericRecord.create(deleteRowSchema); - List dataDeletes = Lists.newArrayList( - dataDelete.copy("data", "one") // id = 1 - ); - - DeleteFile eqDeletes = FileHelpers.writeDeleteFile( - table, Files.localOutput(temp.newFile()), TestHelpers.Row.of(0), dataDeletes, deleteRowSchema); - - table.newRowDelta() - .addDeletes(eqDeletes) - .commit(); - - // check pre-condition - that the above Delete file write - actually resulted in snapshot of type OVERWRITE + List dataDeletes = + Lists.newArrayList( + dataDelete.copy("data", "one") // id = 1 + ); + + DeleteFile eqDeletes = + FileHelpers.writeDeleteFile( + table, + Files.localOutput(temp.newFile()), + TestHelpers.Row.of(0), + dataDeletes, + deleteRowSchema); + + table.newRowDelta().addDeletes(eqDeletes).commit(); + + // check pre-condition - that the above Delete file write - actually resulted in snapshot of + // type OVERWRITE Assert.assertEquals(DataOperations.OVERWRITE, table.currentSnapshot().operation()); StreamingQuery query = startStream(); @@ -391,8 +390,7 @@ public void testReadStreamWithSnapshotTypeOverwriteErrorsOut() throws Exception "Streaming should fail with IllegalStateException, as the snapshot is not of type APPEND", IllegalStateException.class, "Cannot process overwrite snapshot", - () -> query.processAllAvailable() - ); + () -> query.processAllAvailable()); } @Test @@ -402,9 +400,7 @@ public void testReadStreamWithSnapshotTypeReplaceIgnoresReplace() throws Excepti appendDataAsMultipleSnapshots(expected); // this should create a snapshot with type Replace. 
- table.rewriteManifests() - .clusterBy(f -> 1) - .commit(); + table.rewriteManifests().clusterBy(f -> 1).commit(); // check pre-condition Assert.assertEquals(DataOperations.REPLACE, table.currentSnapshot().operation()); @@ -416,21 +412,17 @@ public void testReadStreamWithSnapshotTypeReplaceIgnoresReplace() throws Excepti @Test public void testReadStreamWithSnapshotTypeDeleteErrorsOut() throws Exception { - table.updateSpec() - .removeField("id_bucket") - .addField(ref("id")) - .commit(); + table.updateSpec().removeField("id_bucket").addField(ref("id")).commit(); // fill table with some data List> dataAcrossSnapshots = TEST_DATA_MULTIPLE_SNAPSHOTS; appendDataAsMultipleSnapshots(dataAcrossSnapshots); // this should create a snapshot with type delete. - table.newDelete() - .deleteFromRowFilter(Expressions.equal("id", 4)) - .commit(); + table.newDelete().deleteFromRowFilter(Expressions.equal("id", 4)).commit(); - // check pre-condition - that the above delete operation on table resulted in Snapshot of Type DELETE. + // check pre-condition - that the above delete operation on table resulted in Snapshot of Type + // DELETE. Assert.assertEquals(DataOperations.DELETE, table.currentSnapshot().operation()); StreamingQuery query = startStream(); @@ -439,27 +431,22 @@ public void testReadStreamWithSnapshotTypeDeleteErrorsOut() throws Exception { "Streaming should fail with IllegalStateException, as the snapshot is not of type APPEND", IllegalStateException.class, "Cannot process delete snapshot", - () -> query.processAllAvailable() - ); + () -> query.processAllAvailable()); } @Test public void testReadStreamWithSnapshotTypeDeleteAndSkipDeleteOption() throws Exception { - table.updateSpec() - .removeField("id_bucket") - .addField(ref("id")) - .commit(); + table.updateSpec().removeField("id_bucket").addField(ref("id")).commit(); // fill table with some data List> dataAcrossSnapshots = TEST_DATA_MULTIPLE_SNAPSHOTS; appendDataAsMultipleSnapshots(dataAcrossSnapshots); // this should create a snapshot with type delete. - table.newDelete() - .deleteFromRowFilter(Expressions.equal("id", 4)) - .commit(); + table.newDelete().deleteFromRowFilter(Expressions.equal("id", 4)).commit(); - // check pre-condition - that the above delete operation on table resulted in Snapshot of Type DELETE. + // check pre-condition - that the above delete operation on table resulted in Snapshot of Type + // DELETE. Assert.assertEquals(DataOperations.DELETE, table.currentSnapshot().operation()); StreamingQuery query = startStream(SparkReadOptions.STREAMING_SKIP_DELETE_SNAPSHOTS, "true"); @@ -469,21 +456,17 @@ public void testReadStreamWithSnapshotTypeDeleteAndSkipDeleteOption() throws Exc @Test public void testReadStreamWithSnapshotTypeDeleteAndSkipOverwriteOption() throws Exception { - table.updateSpec() - .removeField("id_bucket") - .addField(ref("id")) - .commit(); + table.updateSpec().removeField("id_bucket").addField(ref("id")).commit(); // fill table with some data List> dataAcrossSnapshots = TEST_DATA_MULTIPLE_SNAPSHOTS; appendDataAsMultipleSnapshots(dataAcrossSnapshots); // this should create a snapshot with type overwrite. - table.newOverwrite() - .overwriteByRowFilter(Expressions.greaterThan("id", 4)) - .commit(); + table.newOverwrite().overwriteByRowFilter(Expressions.greaterThan("id", 4)).commit(); - // check pre-condition - that the above delete operation on table resulted in Snapshot of Type OVERWRITE. + // check pre-condition - that the above delete operation on table resulted in Snapshot of Type + // OVERWRITE. 
Assert.assertEquals(DataOperations.OVERWRITE, table.currentSnapshot().operation()); StreamingQuery query = startStream(SparkReadOptions.STREAMING_SKIP_OVERWRITE_SNAPSHOTS, "true"); @@ -492,8 +475,8 @@ public void testReadStreamWithSnapshotTypeDeleteAndSkipOverwriteOption() throws } /** - * appends each list as a Snapshot on the iceberg table at the given location. - * accepts a list of lists - each list representing data per snapshot. + * appends each list as a Snapshot on the iceberg table at the given location. accepts a list of + * lists - each list representing data per snapshot. */ private void appendDataAsMultipleSnapshots(List> data) { for (List l : data) { @@ -507,7 +490,8 @@ private void appendData(List data) { private void appendData(List data, String format) { Dataset df = spark.createDataFrame(data, SimpleRecord.class); - df.select("id", "data").write() + df.select("id", "data") + .write() .format("iceberg") .option("write-format", format) .mode("append") @@ -517,7 +501,8 @@ private void appendData(List data, String format) { private static final String MEMORY_TABLE = "_stream_view_mem"; private StreamingQuery startStream(Map options) throws TimeoutException { - return spark.readStream() + return spark + .readStream() .options(options) .format("iceberg") .load(tableName) @@ -539,9 +524,9 @@ private StreamingQuery startStream(String key, String value) throws TimeoutExcep private List rowsAvailable(StreamingQuery query) { query.processAllAvailable(); - return spark.sql("select * from " + MEMORY_TABLE) + return spark + .sql("select * from " + MEMORY_TABLE) .as(Encoders.bean(SimpleRecord.class)) .collectAsList(); } - } diff --git a/spark/v3.1/spark/src/test/java/org/apache/iceberg/spark/source/TestTables.java b/spark/v3.1/spark/src/test/java/org/apache/iceberg/spark/source/TestTables.java index e61d6ffb9e5e..ef2f73c3803c 100644 --- a/spark/v3.1/spark/src/test/java/org/apache/iceberg/spark/source/TestTables.java +++ b/spark/v3.1/spark/src/test/java/org/apache/iceberg/spark/source/TestTables.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.source; import java.io.File; @@ -42,15 +41,15 @@ // TODO: Use the copy of this from core. 
class TestTables { - private TestTables() { - } + private TestTables() {} static TestTable create(File temp, String name, Schema schema, PartitionSpec spec) { TestTableOperations ops = new TestTableOperations(name); if (ops.current() != null) { throw new AlreadyExistsException("Table %s already exists at location: %s", name, temp); } - ops.commit(null, TableMetadata.newTableMetadata(schema, spec, temp.toString(), ImmutableMap.of())); + ops.commit( + null, TableMetadata.newTableMetadata(schema, spec, temp.toString(), ImmutableMap.of())); return new TestTable(ops, name); } @@ -166,8 +165,8 @@ public FileIO io() { @Override public LocationProvider locationProvider() { - Preconditions.checkNotNull(current, - "Current metadata should not be null when locatinProvider is called"); + Preconditions.checkNotNull( + current, "Current metadata should not be null when locatinProvider is called"); return LocationProviders.locationsFor(current.location(), current.properties()); } diff --git a/spark/v3.1/spark/src/test/java/org/apache/iceberg/spark/source/TestTimestampWithoutZone.java b/spark/v3.1/spark/src/test/java/org/apache/iceberg/spark/source/TestTimestampWithoutZone.java index 20509eef7471..f6cac9e9dd82 100644 --- a/spark/v3.1/spark/src/test/java/org/apache/iceberg/spark/source/TestTimestampWithoutZone.java +++ b/spark/v3.1/spark/src/test/java/org/apache/iceberg/spark/source/TestTimestampWithoutZone.java @@ -16,9 +16,10 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.source; +import static org.apache.iceberg.Files.localOutput; + import java.io.File; import java.io.IOException; import java.time.LocalDateTime; @@ -64,18 +65,16 @@ import org.junit.runner.RunWith; import org.junit.runners.Parameterized; -import static org.apache.iceberg.Files.localOutput; - @RunWith(Parameterized.class) public class TestTimestampWithoutZone extends SparkTestBase { private static final Configuration CONF = new Configuration(); private static final HadoopTables TABLES = new HadoopTables(CONF); - private static final Schema SCHEMA = new Schema( - Types.NestedField.required(1, "id", Types.LongType.get()), - Types.NestedField.optional(2, "ts", Types.TimestampType.withoutZone()), - Types.NestedField.optional(3, "data", Types.StringType.get()) - ); + private static final Schema SCHEMA = + new Schema( + Types.NestedField.required(1, "id", Types.LongType.get()), + Types.NestedField.optional(2, "ts", Types.TimestampType.withoutZone()), + Types.NestedField.optional(3, "data", Types.StringType.get())); private static SparkSession spark = null; @@ -91,8 +90,7 @@ public static void stopSpark() { currentSpark.stop(); } - @Rule - public TemporaryFolder temp = new TemporaryFolder(); + @Rule public TemporaryFolder temp = new TemporaryFolder(); private final String format; private final boolean vectorized; @@ -100,9 +98,9 @@ public static void stopSpark() { @Parameterized.Parameters(name = "format = {0}, vectorized = {1}") public static Object[][] parameters() { return new Object[][] { - { "parquet", false }, - { "parquet", true }, - { "avro", false } + {"parquet", false}, + {"parquet", true}, + {"avro", false} }; } @@ -132,16 +130,17 @@ public void writeUnpartitionedTable() throws IOException { // create records using the table's schema this.records = testRecords(tableSchema); - try (FileAppender writer = new GenericAppenderFactory(tableSchema).newAppender( - localOutput(testFile), fileFormat)) { + try (FileAppender writer = + new 
GenericAppenderFactory(tableSchema).newAppender(localOutput(testFile), fileFormat)) { writer.addAll(records); } - DataFile file = DataFiles.builder(PartitionSpec.unpartitioned()) - .withRecordCount(records.size()) - .withFileSizeInBytes(testFile.length()) - .withPath(testFile.toString()) - .build(); + DataFile file = + DataFiles.builder(PartitionSpec.unpartitioned()) + .withRecordCount(records.size()) + .withFileSizeInBytes(testFile.length()) + .withPath(testFile.toString()) + .build(); table.newAppend().appendFile(file).commit(); } @@ -154,69 +153,89 @@ public void testUnpartitionedTimestampWithoutZone() { @Test public void testUnpartitionedTimestampWithoutZoneProjection() { Schema projection = SCHEMA.select("id", "ts"); - assertEqualsSafe(projection.asStruct(), + assertEqualsSafe( + projection.asStruct(), records.stream().map(r -> projectFlat(projection, r)).collect(Collectors.toList()), read(unpartitioned.toString(), vectorized, "id", "ts")); } - @Rule - public ExpectedException exception = ExpectedException.none(); + @Rule public ExpectedException exception = ExpectedException.none(); @Test public void testUnpartitionedTimestampWithoutZoneError() { - AssertHelpers.assertThrows(String.format("Read operation performed on a timestamp without timezone field while " + - "'%s' set to false should throw exception", - SparkReadOptions.HANDLE_TIMESTAMP_WITHOUT_TIMEZONE), + AssertHelpers.assertThrows( + String.format( + "Read operation performed on a timestamp without timezone field while " + + "'%s' set to false should throw exception", + SparkReadOptions.HANDLE_TIMESTAMP_WITHOUT_TIMEZONE), IllegalArgumentException.class, SparkUtil.TIMESTAMP_WITHOUT_TIMEZONE_ERROR, - () -> spark.read().format("iceberg") - .option(SparkReadOptions.VECTORIZATION_ENABLED, String.valueOf(vectorized)) - .option(SparkReadOptions.HANDLE_TIMESTAMP_WITHOUT_TIMEZONE, "false") - .load(unpartitioned.toString()) - .collectAsList()); + () -> + spark + .read() + .format("iceberg") + .option(SparkReadOptions.VECTORIZATION_ENABLED, String.valueOf(vectorized)) + .option(SparkReadOptions.HANDLE_TIMESTAMP_WITHOUT_TIMEZONE, "false") + .load(unpartitioned.toString()) + .collectAsList()); } @Test public void testUnpartitionedTimestampWithoutZoneAppend() { - spark.read().format("iceberg") - .option(SparkReadOptions.HANDLE_TIMESTAMP_WITHOUT_TIMEZONE, "true") - .option(SparkReadOptions.VECTORIZATION_ENABLED, String.valueOf(vectorized)) - .load(unpartitioned.toString()) - .write() - .format("iceberg") - .option(SparkWriteOptions.HANDLE_TIMESTAMP_WITHOUT_TIMEZONE, "true") - .mode(SaveMode.Append) - .save(unpartitioned.toString()); - - assertEqualsSafe(SCHEMA.asStruct(), - Stream.concat(records.stream(), records.stream()).collect(Collectors.toList()), - read(unpartitioned.toString(), vectorized)); + spark + .read() + .format("iceberg") + .option(SparkReadOptions.HANDLE_TIMESTAMP_WITHOUT_TIMEZONE, "true") + .option(SparkReadOptions.VECTORIZATION_ENABLED, String.valueOf(vectorized)) + .load(unpartitioned.toString()) + .write() + .format("iceberg") + .option(SparkWriteOptions.HANDLE_TIMESTAMP_WITHOUT_TIMEZONE, "true") + .mode(SaveMode.Append) + .save(unpartitioned.toString()); + + assertEqualsSafe( + SCHEMA.asStruct(), + Stream.concat(records.stream(), records.stream()).collect(Collectors.toList()), + read(unpartitioned.toString(), vectorized)); } @Test public void testUnpartitionedTimestampWithoutZoneWriteError() { - String errorMessage = String.format("Write operation performed on a timestamp without timezone field while " + - "'%s' set to 
false should throw exception", + String errorMessage = + String.format( + "Write operation performed on a timestamp without timezone field while " + + "'%s' set to false should throw exception", SparkSQLProperties.HANDLE_TIMESTAMP_WITHOUT_TIMEZONE); - Runnable writeOperation = () -> spark.read().format("iceberg") - .option(SparkReadOptions.HANDLE_TIMESTAMP_WITHOUT_TIMEZONE, "true") - .option(SparkReadOptions.VECTORIZATION_ENABLED, String.valueOf(vectorized)) - .load(unpartitioned.toString()) - .write() - .format("iceberg") - .option(SparkWriteOptions.HANDLE_TIMESTAMP_WITHOUT_TIMEZONE, "false") - .mode(SaveMode.Append) - .save(unpartitioned.toString()); - - AssertHelpers.assertThrows(errorMessage, IllegalArgumentException.class, - SparkUtil.TIMESTAMP_WITHOUT_TIMEZONE_ERROR, writeOperation); - + Runnable writeOperation = + () -> + spark + .read() + .format("iceberg") + .option(SparkReadOptions.HANDLE_TIMESTAMP_WITHOUT_TIMEZONE, "true") + .option(SparkReadOptions.VECTORIZATION_ENABLED, String.valueOf(vectorized)) + .load(unpartitioned.toString()) + .write() + .format("iceberg") + .option(SparkWriteOptions.HANDLE_TIMESTAMP_WITHOUT_TIMEZONE, "false") + .mode(SaveMode.Append) + .save(unpartitioned.toString()); + + AssertHelpers.assertThrows( + errorMessage, + IllegalArgumentException.class, + SparkUtil.TIMESTAMP_WITHOUT_TIMEZONE_ERROR, + writeOperation); } @Test public void testUnpartitionedTimestampWithoutZoneSessionProperties() { - withSQLConf(ImmutableMap.of(SparkSQLProperties.HANDLE_TIMESTAMP_WITHOUT_TIMEZONE, "true"), () -> { - spark.read().format("iceberg") + withSQLConf( + ImmutableMap.of(SparkSQLProperties.HANDLE_TIMESTAMP_WITHOUT_TIMEZONE, "true"), + () -> { + spark + .read() + .format("iceberg") .option(SparkReadOptions.VECTORIZATION_ENABLED, String.valueOf(vectorized)) .load(unpartitioned.toString()) .write() @@ -224,10 +243,11 @@ public void testUnpartitionedTimestampWithoutZoneSessionProperties() { .mode(SaveMode.Append) .save(unpartitioned.toString()); - assertEqualsSafe(SCHEMA.asStruct(), + assertEqualsSafe( + SCHEMA.asStruct(), Stream.concat(records.stream(), records.stream()).collect(Collectors.toList()), read(unpartitioned.toString(), vectorized)); - }); + }); } private static Record projectFlat(Schema projection, Record record) { @@ -240,8 +260,8 @@ private static Record projectFlat(Schema projection, Record record) { return result; } - public static void assertEqualsSafe(Types.StructType struct, - List expected, List actual) { + public static void assertEqualsSafe( + Types.StructType struct, List expected, List actual) { Assert.assertEquals("Number of results should match expected", expected.size(), actual.size()); for (int i = 0; i < expected.size(); i += 1) { GenericsHelpers.assertEqualsSafe(struct, expected.get(i), actual.get(i)); @@ -259,20 +279,23 @@ private List testRecords(Schema schema) { record(schema, 6L, parseToLocal("2017-12-21T21:55:30.589712"), "element"), record(schema, 7L, parseToLocal("2017-12-21T17:31:14.532797"), "limited"), record(schema, 8L, parseToLocal("2017-12-21T15:21:51.237521"), "global"), - record(schema, 9L, parseToLocal("2017-12-21T15:02:15.230570"), "goldfish") - ); + record(schema, 9L, parseToLocal("2017-12-21T15:02:15.230570"), "goldfish")); } private static List read(String table, boolean vectorized) { return read(table, vectorized, "*"); } - private static List read(String table, boolean vectorized, String select0, String... 
selectN) { - Dataset dataset = spark.read().format("iceberg") - .option(SparkReadOptions.VECTORIZATION_ENABLED, String.valueOf(vectorized)) - .option(SparkReadOptions.HANDLE_TIMESTAMP_WITHOUT_TIMEZONE, "true") - .load(table) - .select(select0, selectN); + private static List read( + String table, boolean vectorized, String select0, String... selectN) { + Dataset dataset = + spark + .read() + .format("iceberg") + .option(SparkReadOptions.VECTORIZATION_ENABLED, String.valueOf(vectorized)) + .option(SparkReadOptions.HANDLE_TIMESTAMP_WITHOUT_TIMEZONE, "true") + .load(table) + .select(select0, selectN); return dataset.collectAsList(); } diff --git a/spark/v3.1/spark/src/test/java/org/apache/iceberg/spark/source/TestWriteMetricsConfig.java b/spark/v3.1/spark/src/test/java/org/apache/iceberg/spark/source/TestWriteMetricsConfig.java index 7ed71031f3f2..9bf00f1b1365 100644 --- a/spark/v3.1/spark/src/test/java/org/apache/iceberg/spark/source/TestWriteMetricsConfig.java +++ b/spark/v3.1/spark/src/test/java/org/apache/iceberg/spark/source/TestWriteMetricsConfig.java @@ -16,9 +16,12 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.source; +import static org.apache.iceberg.spark.SparkSchemaUtil.convert; +import static org.apache.iceberg.types.Types.NestedField.optional; +import static org.apache.iceberg.types.Types.NestedField.required; + import java.io.IOException; import java.nio.ByteBuffer; import java.util.List; @@ -53,28 +56,24 @@ import org.junit.Test; import org.junit.rules.TemporaryFolder; -import static org.apache.iceberg.spark.SparkSchemaUtil.convert; -import static org.apache.iceberg.types.Types.NestedField.optional; -import static org.apache.iceberg.types.Types.NestedField.required; - public class TestWriteMetricsConfig { private static final Configuration CONF = new Configuration(); - private static final Schema SIMPLE_SCHEMA = new Schema( - optional(1, "id", Types.IntegerType.get()), - optional(2, "data", Types.StringType.get()) - ); - private static final Schema COMPLEX_SCHEMA = new Schema( - required(1, "longCol", Types.IntegerType.get()), - optional(2, "strCol", Types.StringType.get()), - required(3, "record", Types.StructType.of( - required(4, "id", Types.IntegerType.get()), - required(5, "data", Types.StringType.get()) - )) - ); - - @Rule - public TemporaryFolder temp = new TemporaryFolder(); + private static final Schema SIMPLE_SCHEMA = + new Schema( + optional(1, "id", Types.IntegerType.get()), optional(2, "data", Types.StringType.get())); + private static final Schema COMPLEX_SCHEMA = + new Schema( + required(1, "longCol", Types.IntegerType.get()), + optional(2, "strCol", Types.StringType.get()), + required( + 3, + "record", + Types.StructType.of( + required(4, "id", Types.IntegerType.get()), + required(5, "data", Types.StringType.get())))); + + @Rule public TemporaryFolder temp = new TemporaryFolder(); private static SparkSession spark = null; private static JavaSparkContext sc = null; @@ -103,11 +102,9 @@ public void testFullMetricsCollectionForParquet() throws IOException { properties.put(TableProperties.DEFAULT_WRITE_METRICS_MODE, "full"); Table table = tables.create(SIMPLE_SCHEMA, spec, properties, tableLocation); - List expectedRecords = Lists.newArrayList( - new SimpleRecord(1, "a"), - new SimpleRecord(2, "b"), - new SimpleRecord(3, "c") - ); + List expectedRecords = + Lists.newArrayList( + new SimpleRecord(1, "a"), new SimpleRecord(2, "b"), new SimpleRecord(3, "c")); Dataset df = 
spark.createDataFrame(expectedRecords, SimpleRecord.class); df.select("id", "data") .coalesce(1) @@ -136,11 +133,9 @@ public void testCountMetricsCollectionForParquet() throws IOException { properties.put(TableProperties.DEFAULT_WRITE_METRICS_MODE, "counts"); Table table = tables.create(SIMPLE_SCHEMA, spec, properties, tableLocation); - List expectedRecords = Lists.newArrayList( - new SimpleRecord(1, "a"), - new SimpleRecord(2, "b"), - new SimpleRecord(3, "c") - ); + List expectedRecords = + Lists.newArrayList( + new SimpleRecord(1, "a"), new SimpleRecord(2, "b"), new SimpleRecord(3, "c")); Dataset df = spark.createDataFrame(expectedRecords, SimpleRecord.class); df.select("id", "data") .coalesce(1) @@ -169,11 +164,9 @@ public void testNoMetricsCollectionForParquet() throws IOException { properties.put(TableProperties.DEFAULT_WRITE_METRICS_MODE, "none"); Table table = tables.create(SIMPLE_SCHEMA, spec, properties, tableLocation); - List expectedRecords = Lists.newArrayList( - new SimpleRecord(1, "a"), - new SimpleRecord(2, "b"), - new SimpleRecord(3, "c") - ); + List expectedRecords = + Lists.newArrayList( + new SimpleRecord(1, "a"), new SimpleRecord(2, "b"), new SimpleRecord(3, "c")); Dataset df = spark.createDataFrame(expectedRecords, SimpleRecord.class); df.select("id", "data") .coalesce(1) @@ -203,11 +196,9 @@ public void testCustomMetricCollectionForParquet() throws IOException { properties.put("write.metadata.metrics.column.id", "full"); Table table = tables.create(SIMPLE_SCHEMA, spec, properties, tableLocation); - List expectedRecords = Lists.newArrayList( - new SimpleRecord(1, "a"), - new SimpleRecord(2, "b"), - new SimpleRecord(3, "c") - ); + List expectedRecords = + Lists.newArrayList( + new SimpleRecord(1, "a"), new SimpleRecord(2, "b"), new SimpleRecord(3, "c")); Dataset df = spark.createDataFrame(expectedRecords, SimpleRecord.class); df.select("id", "data") .coalesce(1) @@ -240,7 +231,8 @@ public void testBadCustomMetricCollectionForParquet() throws IOException { properties.put(TableProperties.DEFAULT_WRITE_METRICS_MODE, "counts"); properties.put("write.metadata.metrics.column.ids", "full"); - AssertHelpers.assertThrows("Creating a table with invalid metrics should fail", + AssertHelpers.assertThrows( + "Creating a table with invalid metrics should fail", ValidationException.class, null, () -> tables.create(SIMPLE_SCHEMA, spec, properties, tableLocation)); @@ -251,9 +243,7 @@ public void testCustomMetricCollectionForNestedParquet() throws IOException { String tableLocation = temp.newFolder("iceberg-table").toString(); HadoopTables tables = new HadoopTables(CONF); - PartitionSpec spec = PartitionSpec.builderFor(COMPLEX_SCHEMA) - .identity("strCol") - .build(); + PartitionSpec spec = PartitionSpec.builderFor(COMPLEX_SCHEMA).identity("strCol").build(); Map properties = Maps.newHashMap(); properties.put(TableProperties.DEFAULT_WRITE_METRICS_MODE, "none"); properties.put("write.metadata.metrics.column.longCol", "counts"); @@ -263,9 +253,11 @@ public void testCustomMetricCollectionForNestedParquet() throws IOException { Iterable rows = RandomData.generateSpark(COMPLEX_SCHEMA, 10, 0); JavaRDD rdd = sc.parallelize(Lists.newArrayList(rows)); - Dataset df = spark.internalCreateDataFrame(JavaRDD.toRDD(rdd), convert(COMPLEX_SCHEMA), false); + Dataset df = + spark.internalCreateDataFrame(JavaRDD.toRDD(rdd), convert(COMPLEX_SCHEMA), false); - df.coalesce(1).write() + df.coalesce(1) + .write() .format("iceberg") .option(SparkWriteOptions.WRITE_FORMAT, "parquet") .mode(SaveMode.Append) diff 
--git a/spark/v3.1/spark/src/test/java/org/apache/iceberg/spark/source/ThreeColumnRecord.java b/spark/v3.1/spark/src/test/java/org/apache/iceberg/spark/source/ThreeColumnRecord.java index 684dfbb255c7..554557df416c 100644 --- a/spark/v3.1/spark/src/test/java/org/apache/iceberg/spark/source/ThreeColumnRecord.java +++ b/spark/v3.1/spark/src/test/java/org/apache/iceberg/spark/source/ThreeColumnRecord.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.source; import java.util.Objects; @@ -26,8 +25,7 @@ public class ThreeColumnRecord { private String c2; private String c3; - public ThreeColumnRecord() { - } + public ThreeColumnRecord() {} public ThreeColumnRecord(Integer c1, String c2, String c3) { this.c1 = c1; @@ -68,9 +66,9 @@ public boolean equals(Object o) { return false; } ThreeColumnRecord that = (ThreeColumnRecord) o; - return Objects.equals(c1, that.c1) && - Objects.equals(c2, that.c2) && - Objects.equals(c3, that.c3); + return Objects.equals(c1, that.c1) + && Objects.equals(c2, that.c2) + && Objects.equals(c3, that.c3); } @Override @@ -80,10 +78,6 @@ public int hashCode() { @Override public String toString() { - return "ThreeColumnRecord{" + - "c1=" + c1 + - ", c2='" + c2 + '\'' + - ", c3='" + c3 + '\'' + - '}'; + return "ThreeColumnRecord{" + "c1=" + c1 + ", c2='" + c2 + '\'' + ", c3='" + c3 + '\'' + '}'; } } diff --git a/spark/v3.1/spark/src/test/java/org/apache/iceberg/spark/sql/TestAlterTable.java b/spark/v3.1/spark/src/test/java/org/apache/iceberg/spark/sql/TestAlterTable.java index 6172bd1fd0fe..e347cde7ba32 100644 --- a/spark/v3.1/spark/src/test/java/org/apache/iceberg/spark/sql/TestAlterTable.java +++ b/spark/v3.1/spark/src/test/java/org/apache/iceberg/spark/sql/TestAlterTable.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.spark.sql; import java.util.Map; @@ -36,7 +35,8 @@ import org.junit.Test; public class TestAlterTable extends SparkCatalogTestBase { - private final TableIdentifier renamedIdent = TableIdentifier.of(Namespace.of("default"), "table2"); + private final TableIdentifier renamedIdent = + TableIdentifier.of(Namespace.of("default"), "table2"); public TestAlterTable(String catalogName, String implementation, Map config) { super(catalogName, implementation, config); @@ -55,39 +55,53 @@ public void removeTable() { @Test public void testAddColumnNotNull() { - AssertHelpers.assertThrows("Should reject adding NOT NULL column", - SparkException.class, "Incompatible change: cannot add required column", + AssertHelpers.assertThrows( + "Should reject adding NOT NULL column", + SparkException.class, + "Incompatible change: cannot add required column", () -> sql("ALTER TABLE %s ADD COLUMN c3 INT NOT NULL", tableName)); } @Test public void testAddColumn() { - sql("ALTER TABLE %s ADD COLUMN point struct AFTER id", tableName); - - Types.StructType expectedSchema = Types.StructType.of( - NestedField.required(1, "id", Types.LongType.get()), - NestedField.optional(3, "point", Types.StructType.of( - NestedField.required(4, "x", Types.DoubleType.get()), - NestedField.required(5, "y", Types.DoubleType.get()) - )), - NestedField.optional(2, "data", Types.StringType.get())); - - Assert.assertEquals("Schema should match expected", - expectedSchema, validationCatalog.loadTable(tableIdent).schema().asStruct()); + sql( + "ALTER TABLE %s ADD COLUMN point struct AFTER id", + tableName); + + Types.StructType expectedSchema = + Types.StructType.of( + NestedField.required(1, "id", Types.LongType.get()), + NestedField.optional( + 3, + "point", + Types.StructType.of( + NestedField.required(4, "x", Types.DoubleType.get()), + NestedField.required(5, "y", Types.DoubleType.get()))), + NestedField.optional(2, "data", Types.StringType.get())); + + Assert.assertEquals( + "Schema should match expected", + expectedSchema, + validationCatalog.loadTable(tableIdent).schema().asStruct()); sql("ALTER TABLE %s ADD COLUMN point.z double COMMENT 'May be null' FIRST", tableName); - Types.StructType expectedSchema2 = Types.StructType.of( - NestedField.required(1, "id", Types.LongType.get()), - NestedField.optional(3, "point", Types.StructType.of( - NestedField.optional(6, "z", Types.DoubleType.get(), "May be null"), - NestedField.required(4, "x", Types.DoubleType.get()), - NestedField.required(5, "y", Types.DoubleType.get()) - )), - NestedField.optional(2, "data", Types.StringType.get())); - - Assert.assertEquals("Schema should match expected", - expectedSchema2, validationCatalog.loadTable(tableIdent).schema().asStruct()); + Types.StructType expectedSchema2 = + Types.StructType.of( + NestedField.required(1, "id", Types.LongType.get()), + NestedField.optional( + 3, + "point", + Types.StructType.of( + NestedField.optional(6, "z", Types.DoubleType.get(), "May be null"), + NestedField.required(4, "x", Types.DoubleType.get()), + NestedField.required(5, "y", Types.DoubleType.get()))), + NestedField.optional(2, "data", Types.StringType.get())); + + Assert.assertEquals( + "Schema should match expected", + expectedSchema2, + validationCatalog.loadTable(tableIdent).schema().asStruct()); } @Test @@ -95,19 +109,24 @@ public void testAddColumnWithArray() { sql("ALTER TABLE %s ADD COLUMN data2 array>", tableName); // use the implicit column name 'element' to access member of array and add column d to struct. 
sql("ALTER TABLE %s ADD COLUMN data2.element.d int", tableName); - Types.StructType expectedSchema = Types.StructType.of( - NestedField.required(1, "id", Types.LongType.get()), - NestedField.optional(2, "data", Types.StringType.get()), - NestedField.optional(3, "data2", Types.ListType.ofOptional( - 4, - Types.StructType.of( - NestedField.optional(5, "a", Types.IntegerType.get()), - NestedField.optional(6, "b", Types.IntegerType.get()), - NestedField.optional(7, "c", Types.IntegerType.get()), - NestedField.optional(8, "d", Types.IntegerType.get())) - ))); - Assert.assertEquals("Schema should match expected", - expectedSchema, validationCatalog.loadTable(tableIdent).schema().asStruct()); + Types.StructType expectedSchema = + Types.StructType.of( + NestedField.required(1, "id", Types.LongType.get()), + NestedField.optional(2, "data", Types.StringType.get()), + NestedField.optional( + 3, + "data2", + Types.ListType.ofOptional( + 4, + Types.StructType.of( + NestedField.optional(5, "a", Types.IntegerType.get()), + NestedField.optional(6, "b", Types.IntegerType.get()), + NestedField.optional(7, "c", Types.IntegerType.get()), + NestedField.optional(8, "d", Types.IntegerType.get()))))); + Assert.assertEquals( + "Schema should match expected", + expectedSchema, + validationCatalog.loadTable(tableIdent).schema().asStruct()); } @Test @@ -116,25 +135,31 @@ public void testAddColumnWithMap() { // use the implicit column name 'key' and 'value' to access member of map. // add column to value struct column sql("ALTER TABLE %s ADD COLUMN data2.value.c int", tableName); - Types.StructType expectedSchema = Types.StructType.of( - NestedField.required(1, "id", Types.LongType.get()), - NestedField.optional(2, "data", Types.StringType.get()), - NestedField.optional(3, "data2", Types.MapType.ofOptional( - 4, - 5, - Types.StructType.of( - NestedField.optional(6, "x", Types.IntegerType.get())), - Types.StructType.of( - NestedField.optional(7, "a", Types.IntegerType.get()), - NestedField.optional(8, "b", Types.IntegerType.get()), - NestedField.optional(9, "c", Types.IntegerType.get())) - ))); - Assert.assertEquals("Schema should match expected", - expectedSchema, validationCatalog.loadTable(tableIdent).schema().asStruct()); + Types.StructType expectedSchema = + Types.StructType.of( + NestedField.required(1, "id", Types.LongType.get()), + NestedField.optional(2, "data", Types.StringType.get()), + NestedField.optional( + 3, + "data2", + Types.MapType.ofOptional( + 4, + 5, + Types.StructType.of(NestedField.optional(6, "x", Types.IntegerType.get())), + Types.StructType.of( + NestedField.optional(7, "a", Types.IntegerType.get()), + NestedField.optional(8, "b", Types.IntegerType.get()), + NestedField.optional(9, "c", Types.IntegerType.get()))))); + Assert.assertEquals( + "Schema should match expected", + expectedSchema, + validationCatalog.loadTable(tableIdent).schema().asStruct()); // should not allow changing map key column - AssertHelpers.assertThrows("Should reject changing key of the map column", - SparkException.class, "Unsupported table change: Cannot add fields to map keys:", + AssertHelpers.assertThrows( + "Should reject changing key of the map column", + SparkException.class, + "Unsupported table change: Cannot add fields to map keys:", () -> sql("ALTER TABLE %s ADD COLUMN data2.key.y int", tableName)); } @@ -142,35 +167,43 @@ public void testAddColumnWithMap() { public void testDropColumn() { sql("ALTER TABLE %s DROP COLUMN data", tableName); - Types.StructType expectedSchema = Types.StructType.of( - 
NestedField.required(1, "id", Types.LongType.get())); + Types.StructType expectedSchema = + Types.StructType.of(NestedField.required(1, "id", Types.LongType.get())); - Assert.assertEquals("Schema should match expected", - expectedSchema, validationCatalog.loadTable(tableIdent).schema().asStruct()); + Assert.assertEquals( + "Schema should match expected", + expectedSchema, + validationCatalog.loadTable(tableIdent).schema().asStruct()); } @Test public void testRenameColumn() { sql("ALTER TABLE %s RENAME COLUMN id TO row_id", tableName); - Types.StructType expectedSchema = Types.StructType.of( - NestedField.required(1, "row_id", Types.LongType.get()), - NestedField.optional(2, "data", Types.StringType.get())); + Types.StructType expectedSchema = + Types.StructType.of( + NestedField.required(1, "row_id", Types.LongType.get()), + NestedField.optional(2, "data", Types.StringType.get())); - Assert.assertEquals("Schema should match expected", - expectedSchema, validationCatalog.loadTable(tableIdent).schema().asStruct()); + Assert.assertEquals( + "Schema should match expected", + expectedSchema, + validationCatalog.loadTable(tableIdent).schema().asStruct()); } @Test public void testAlterColumnComment() { sql("ALTER TABLE %s ALTER COLUMN id COMMENT 'Record id'", tableName); - Types.StructType expectedSchema = Types.StructType.of( - NestedField.required(1, "id", Types.LongType.get(), "Record id"), - NestedField.optional(2, "data", Types.StringType.get())); + Types.StructType expectedSchema = + Types.StructType.of( + NestedField.required(1, "id", Types.LongType.get(), "Record id"), + NestedField.optional(2, "data", Types.StringType.get())); - Assert.assertEquals("Schema should match expected", - expectedSchema, validationCatalog.loadTable(tableIdent).schema().asStruct()); + Assert.assertEquals( + "Schema should match expected", + expectedSchema, + validationCatalog.loadTable(tableIdent).schema().asStruct()); } @Test @@ -178,25 +211,31 @@ public void testAlterColumnType() { sql("ALTER TABLE %s ADD COLUMN count int", tableName); sql("ALTER TABLE %s ALTER COLUMN count TYPE bigint", tableName); - Types.StructType expectedSchema = Types.StructType.of( - NestedField.required(1, "id", Types.LongType.get()), - NestedField.optional(2, "data", Types.StringType.get()), - NestedField.optional(3, "count", Types.LongType.get())); + Types.StructType expectedSchema = + Types.StructType.of( + NestedField.required(1, "id", Types.LongType.get()), + NestedField.optional(2, "data", Types.StringType.get()), + NestedField.optional(3, "count", Types.LongType.get())); - Assert.assertEquals("Schema should match expected", - expectedSchema, validationCatalog.loadTable(tableIdent).schema().asStruct()); + Assert.assertEquals( + "Schema should match expected", + expectedSchema, + validationCatalog.loadTable(tableIdent).schema().asStruct()); } @Test public void testAlterColumnDropNotNull() { sql("ALTER TABLE %s ALTER COLUMN id DROP NOT NULL", tableName); - Types.StructType expectedSchema = Types.StructType.of( - NestedField.optional(1, "id", Types.LongType.get()), - NestedField.optional(2, "data", Types.StringType.get())); + Types.StructType expectedSchema = + Types.StructType.of( + NestedField.optional(1, "id", Types.LongType.get()), + NestedField.optional(2, "data", Types.StringType.get())); - Assert.assertEquals("Schema should match expected", - expectedSchema, validationCatalog.loadTable(tableIdent).schema().asStruct()); + Assert.assertEquals( + "Schema should match expected", + expectedSchema, + 
validationCatalog.loadTable(tableIdent).schema().asStruct()); } @Test @@ -204,15 +243,20 @@ public void testAlterColumnSetNotNull() { // no-op changes are allowed sql("ALTER TABLE %s ALTER COLUMN id SET NOT NULL", tableName); - Types.StructType expectedSchema = Types.StructType.of( - NestedField.required(1, "id", Types.LongType.get()), - NestedField.optional(2, "data", Types.StringType.get())); + Types.StructType expectedSchema = + Types.StructType.of( + NestedField.required(1, "id", Types.LongType.get()), + NestedField.optional(2, "data", Types.StringType.get())); - Assert.assertEquals("Schema should match expected", - expectedSchema, validationCatalog.loadTable(tableIdent).schema().asStruct()); + Assert.assertEquals( + "Schema should match expected", + expectedSchema, + validationCatalog.loadTable(tableIdent).schema().asStruct()); - AssertHelpers.assertThrows("Should reject adding NOT NULL constraint to an optional column", - AnalysisException.class, "Cannot change nullable column to non-nullable: data", + AssertHelpers.assertThrows( + "Should reject adding NOT NULL constraint to an optional column", + AnalysisException.class, + "Cannot change nullable column to non-nullable: data", () -> sql("ALTER TABLE %s ALTER COLUMN data SET NOT NULL", tableName)); } @@ -221,13 +265,16 @@ public void testAlterColumnPositionAfter() { sql("ALTER TABLE %s ADD COLUMN count int", tableName); sql("ALTER TABLE %s ALTER COLUMN count AFTER id", tableName); - Types.StructType expectedSchema = Types.StructType.of( - NestedField.required(1, "id", Types.LongType.get()), - NestedField.optional(3, "count", Types.IntegerType.get()), - NestedField.optional(2, "data", Types.StringType.get())); + Types.StructType expectedSchema = + Types.StructType.of( + NestedField.required(1, "id", Types.LongType.get()), + NestedField.optional(3, "count", Types.IntegerType.get()), + NestedField.optional(2, "data", Types.StringType.get())); - Assert.assertEquals("Schema should match expected", - expectedSchema, validationCatalog.loadTable(tableIdent).schema().asStruct()); + Assert.assertEquals( + "Schema should match expected", + expectedSchema, + validationCatalog.loadTable(tableIdent).schema().asStruct()); } @Test @@ -235,18 +282,22 @@ public void testAlterColumnPositionFirst() { sql("ALTER TABLE %s ADD COLUMN count int", tableName); sql("ALTER TABLE %s ALTER COLUMN count FIRST", tableName); - Types.StructType expectedSchema = Types.StructType.of( - NestedField.optional(3, "count", Types.IntegerType.get()), - NestedField.required(1, "id", Types.LongType.get()), - NestedField.optional(2, "data", Types.StringType.get())); + Types.StructType expectedSchema = + Types.StructType.of( + NestedField.optional(3, "count", Types.IntegerType.get()), + NestedField.required(1, "id", Types.LongType.get()), + NestedField.optional(2, "data", Types.StringType.get())); - Assert.assertEquals("Schema should match expected", - expectedSchema, validationCatalog.loadTable(tableIdent).schema().asStruct()); + Assert.assertEquals( + "Schema should match expected", + expectedSchema, + validationCatalog.loadTable(tableIdent).schema().asStruct()); } @Test public void testTableRename() { - Assume.assumeFalse("Hadoop catalog does not support rename", validationCatalog instanceof HadoopCatalog); + Assume.assumeFalse( + "Hadoop catalog does not support rename", validationCatalog instanceof HadoopCatalog); Assert.assertTrue("Initial name should exist", validationCatalog.tableExists(tableIdent)); Assert.assertFalse("New name should not exist", 
validationCatalog.tableExists(renamedIdent)); @@ -261,15 +312,19 @@ public void testTableRename() { public void testSetTableProperties() { sql("ALTER TABLE %s SET TBLPROPERTIES ('prop'='value')", tableName); - Assert.assertEquals("Should have the new table property", - "value", validationCatalog.loadTable(tableIdent).properties().get("prop")); + Assert.assertEquals( + "Should have the new table property", + "value", + validationCatalog.loadTable(tableIdent).properties().get("prop")); sql("ALTER TABLE %s UNSET TBLPROPERTIES ('prop')", tableName); - Assert.assertNull("Should not have the removed table property", + Assert.assertNull( + "Should not have the removed table property", validationCatalog.loadTable(tableIdent).properties().get("prop")); - AssertHelpers.assertThrows("Cannot specify the 'sort-order' because it's a reserved table property", + AssertHelpers.assertThrows( + "Cannot specify the 'sort-order' because it's a reserved table property", UnsupportedOperationException.class, () -> sql("ALTER TABLE %s SET TBLPROPERTIES ('sort-order'='value')", tableName)); } diff --git a/spark/v3.1/spark/src/test/java/org/apache/iceberg/spark/sql/TestCreateTable.java b/spark/v3.1/spark/src/test/java/org/apache/iceberg/spark/sql/TestCreateTable.java index 986098543d25..1411c83ddc65 100644 --- a/spark/v3.1/spark/src/test/java/org/apache/iceberg/spark/sql/TestCreateTable.java +++ b/spark/v3.1/spark/src/test/java/org/apache/iceberg/spark/sql/TestCreateTable.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.sql; import java.io.File; @@ -52,11 +51,15 @@ public void dropTestTable() { @Test public void testTransformIgnoreCase() { Assert.assertFalse("Table should not already exist", validationCatalog.tableExists(tableIdent)); - sql("CREATE TABLE IF NOT EXISTS %s (id BIGINT NOT NULL, ts timestamp) " + - "USING iceberg partitioned by (HOURS(ts))", tableName); + sql( + "CREATE TABLE IF NOT EXISTS %s (id BIGINT NOT NULL, ts timestamp) " + + "USING iceberg partitioned by (HOURS(ts))", + tableName); Assert.assertTrue("Table should already exist", validationCatalog.tableExists(tableIdent)); - sql("CREATE TABLE IF NOT EXISTS %s (id BIGINT NOT NULL, ts timestamp) " + - "USING iceberg partitioned by (hours(ts))", tableName); + sql( + "CREATE TABLE IF NOT EXISTS %s (id BIGINT NOT NULL, ts timestamp) " + + "USING iceberg partitioned by (hours(ts))", + tableName); Assert.assertTrue("Table should already exist", validationCatalog.tableExists(tableIdent)); } @@ -69,18 +72,22 @@ public void testCreateTable() { Table table = validationCatalog.loadTable(tableIdent); Assert.assertNotNull("Should load the new table", table); - StructType expectedSchema = StructType.of( - NestedField.required(1, "id", Types.LongType.get()), - NestedField.optional(2, "data", Types.StringType.get())); - Assert.assertEquals("Should have the expected schema", expectedSchema, table.schema().asStruct()); + StructType expectedSchema = + StructType.of( + NestedField.required(1, "id", Types.LongType.get()), + NestedField.optional(2, "data", Types.StringType.get())); + Assert.assertEquals( + "Should have the expected schema", expectedSchema, table.schema().asStruct()); Assert.assertEquals("Should not be partitioned", 0, table.spec().fields().size()); - Assert.assertNull("Should not have the default format set", + Assert.assertNull( + "Should not have the default format set", table.properties().get(TableProperties.DEFAULT_FILE_FORMAT)); } @Test public void 
testCreateTableInRootNamespace() { - Assume.assumeTrue("Hadoop has no default namespace configured", "testhadoop".equals(catalogName)); + Assume.assumeTrue( + "Hadoop has no default namespace configured", "testhadoop".equals(catalogName)); try { sql("CREATE TABLE %s.table (id bigint) USING iceberg", catalogName); @@ -102,47 +109,61 @@ public void testCreateTableUsingParquet() { Table table = validationCatalog.loadTable(tableIdent); Assert.assertNotNull("Should load the new table", table); - StructType expectedSchema = StructType.of( - NestedField.required(1, "id", Types.LongType.get()), - NestedField.optional(2, "data", Types.StringType.get())); - Assert.assertEquals("Should have the expected schema", expectedSchema, table.schema().asStruct()); + StructType expectedSchema = + StructType.of( + NestedField.required(1, "id", Types.LongType.get()), + NestedField.optional(2, "data", Types.StringType.get())); + Assert.assertEquals( + "Should have the expected schema", expectedSchema, table.schema().asStruct()); Assert.assertEquals("Should not be partitioned", 0, table.spec().fields().size()); - Assert.assertEquals("Should not have default format parquet", + Assert.assertEquals( + "Should not have default format parquet", "parquet", table.properties().get(TableProperties.DEFAULT_FILE_FORMAT)); - AssertHelpers.assertThrows("Should reject unsupported format names", - IllegalArgumentException.class, "Unsupported format in USING: crocodile", - () -> sql("CREATE TABLE %s.default.fail (id BIGINT NOT NULL, data STRING) USING crocodile", catalogName)); + AssertHelpers.assertThrows( + "Should reject unsupported format names", + IllegalArgumentException.class, + "Unsupported format in USING: crocodile", + () -> + sql( + "CREATE TABLE %s.default.fail (id BIGINT NOT NULL, data STRING) USING crocodile", + catalogName)); } @Test public void testCreateTablePartitionedBy() { Assert.assertFalse("Table should not already exist", validationCatalog.tableExists(tableIdent)); - sql("CREATE TABLE %s " + - "(id BIGINT NOT NULL, created_at TIMESTAMP, category STRING, data STRING) " + - "USING iceberg " + - "PARTITIONED BY (category, bucket(8, id), days(created_at))", tableName); + sql( + "CREATE TABLE %s " + + "(id BIGINT NOT NULL, created_at TIMESTAMP, category STRING, data STRING) " + + "USING iceberg " + + "PARTITIONED BY (category, bucket(8, id), days(created_at))", + tableName); Table table = validationCatalog.loadTable(tableIdent); Assert.assertNotNull("Should load the new table", table); - StructType expectedSchema = StructType.of( - NestedField.required(1, "id", Types.LongType.get()), - NestedField.optional(2, "created_at", Types.TimestampType.withZone()), - NestedField.optional(3, "category", Types.StringType.get()), - NestedField.optional(4, "data", Types.StringType.get())); - Assert.assertEquals("Should have the expected schema", expectedSchema, table.schema().asStruct()); - - PartitionSpec expectedSpec = PartitionSpec.builderFor(new Schema(expectedSchema.fields())) - .identity("category") - .bucket("id", 8) - .day("created_at") - .build(); + StructType expectedSchema = + StructType.of( + NestedField.required(1, "id", Types.LongType.get()), + NestedField.optional(2, "created_at", Types.TimestampType.withZone()), + NestedField.optional(3, "category", Types.StringType.get()), + NestedField.optional(4, "data", Types.StringType.get())); + Assert.assertEquals( + "Should have the expected schema", expectedSchema, table.schema().asStruct()); + + PartitionSpec expectedSpec = + PartitionSpec.builderFor(new 
Schema(expectedSchema.fields())) + .identity("category") + .bucket("id", 8) + .day("created_at") + .build(); Assert.assertEquals("Should be partitioned correctly", expectedSpec, table.spec()); - Assert.assertNull("Should not have the default format set", + Assert.assertNull( + "Should not have the default format set", table.properties().get(TableProperties.DEFAULT_FILE_FORMAT)); } @@ -150,20 +171,24 @@ public void testCreateTablePartitionedBy() { public void testCreateTableColumnComments() { Assert.assertFalse("Table should not already exist", validationCatalog.tableExists(tableIdent)); - sql("CREATE TABLE %s " + - "(id BIGINT NOT NULL COMMENT 'Unique identifier', data STRING COMMENT 'Data value') " + - "USING iceberg", + sql( + "CREATE TABLE %s " + + "(id BIGINT NOT NULL COMMENT 'Unique identifier', data STRING COMMENT 'Data value') " + + "USING iceberg", tableName); Table table = validationCatalog.loadTable(tableIdent); Assert.assertNotNull("Should load the new table", table); - StructType expectedSchema = StructType.of( - NestedField.required(1, "id", Types.LongType.get(), "Unique identifier"), - NestedField.optional(2, "data", Types.StringType.get(), "Data value")); - Assert.assertEquals("Should have the expected schema", expectedSchema, table.schema().asStruct()); + StructType expectedSchema = + StructType.of( + NestedField.required(1, "id", Types.LongType.get(), "Unique identifier"), + NestedField.optional(2, "data", Types.StringType.get(), "Data value")); + Assert.assertEquals( + "Should have the expected schema", expectedSchema, table.schema().asStruct()); Assert.assertEquals("Should not be partitioned", 0, table.spec().fields().size()); - Assert.assertNull("Should not have the default format set", + Assert.assertNull( + "Should not have the default format set", table.properties().get(TableProperties.DEFAULT_FILE_FORMAT)); } @@ -171,24 +196,30 @@ public void testCreateTableColumnComments() { public void testCreateTableComment() { Assert.assertFalse("Table should not already exist", validationCatalog.tableExists(tableIdent)); - sql("CREATE TABLE %s " + - "(id BIGINT NOT NULL, data STRING) " + - "USING iceberg " + - "COMMENT 'Table doc'", + sql( + "CREATE TABLE %s " + + "(id BIGINT NOT NULL, data STRING) " + + "USING iceberg " + + "COMMENT 'Table doc'", tableName); Table table = validationCatalog.loadTable(tableIdent); Assert.assertNotNull("Should load the new table", table); - StructType expectedSchema = StructType.of( - NestedField.required(1, "id", Types.LongType.get()), - NestedField.optional(2, "data", Types.StringType.get())); - Assert.assertEquals("Should have the expected schema", expectedSchema, table.schema().asStruct()); + StructType expectedSchema = + StructType.of( + NestedField.required(1, "id", Types.LongType.get()), + NestedField.optional(2, "data", Types.StringType.get())); + Assert.assertEquals( + "Should have the expected schema", expectedSchema, table.schema().asStruct()); Assert.assertEquals("Should not be partitioned", 0, table.spec().fields().size()); - Assert.assertNull("Should not have the default format set", + Assert.assertNull( + "Should not have the default format set", table.properties().get(TableProperties.DEFAULT_FILE_FORMAT)); - Assert.assertEquals("Should have the table comment set in properties", - "Table doc", table.properties().get(TableCatalog.PROP_COMMENT)); + Assert.assertEquals( + "Should have the table comment set in properties", + "Table doc", + table.properties().get(TableCatalog.PROP_COMMENT)); } @Test @@ -204,43 +235,49 @@ public void 
testCreateTableLocation() throws Exception { String location = "file:" + tableLocation.toString(); - sql("CREATE TABLE %s " + - "(id BIGINT NOT NULL, data STRING) " + - "USING iceberg " + - "LOCATION '%s'", + sql( + "CREATE TABLE %s " + + "(id BIGINT NOT NULL, data STRING) " + + "USING iceberg " + + "LOCATION '%s'", tableName, location); Table table = validationCatalog.loadTable(tableIdent); Assert.assertNotNull("Should load the new table", table); - StructType expectedSchema = StructType.of( - NestedField.required(1, "id", Types.LongType.get()), - NestedField.optional(2, "data", Types.StringType.get())); - Assert.assertEquals("Should have the expected schema", expectedSchema, table.schema().asStruct()); + StructType expectedSchema = + StructType.of( + NestedField.required(1, "id", Types.LongType.get()), + NestedField.optional(2, "data", Types.StringType.get())); + Assert.assertEquals( + "Should have the expected schema", expectedSchema, table.schema().asStruct()); Assert.assertEquals("Should not be partitioned", 0, table.spec().fields().size()); - Assert.assertNull("Should not have the default format set", + Assert.assertNull( + "Should not have the default format set", table.properties().get(TableProperties.DEFAULT_FILE_FORMAT)); - Assert.assertEquals("Should have a custom table location", - location, table.location()); + Assert.assertEquals("Should have a custom table location", location, table.location()); } @Test public void testCreateTableProperties() { Assert.assertFalse("Table should not already exist", validationCatalog.tableExists(tableIdent)); - sql("CREATE TABLE %s " + - "(id BIGINT NOT NULL, data STRING) " + - "USING iceberg " + - "TBLPROPERTIES (p1=2, p2='x')", + sql( + "CREATE TABLE %s " + + "(id BIGINT NOT NULL, data STRING) " + + "USING iceberg " + + "TBLPROPERTIES (p1=2, p2='x')", tableName); Table table = validationCatalog.loadTable(tableIdent); Assert.assertNotNull("Should load the new table", table); - StructType expectedSchema = StructType.of( - NestedField.required(1, "id", Types.LongType.get()), - NestedField.optional(2, "data", Types.StringType.get())); - Assert.assertEquals("Should have the expected schema", expectedSchema, table.schema().asStruct()); + StructType expectedSchema = + StructType.of( + NestedField.required(1, "id", Types.LongType.get()), + NestedField.optional(2, "data", Types.StringType.get())); + Assert.assertEquals( + "Should have the expected schema", expectedSchema, table.schema().asStruct()); Assert.assertEquals("Should not be partitioned", 0, table.spec().fields().size()); Assert.assertEquals("Should have property p1", "2", table.properties().get("p1")); Assert.assertEquals("Should have property p2", "x", table.properties().get("p2")); @@ -250,53 +287,56 @@ public void testCreateTableProperties() { public void testCreateTableWithFormatV2ThroughTableProperty() { Assert.assertFalse("Table should not already exist", validationCatalog.tableExists(tableIdent)); - sql("CREATE TABLE %s " + - "(id BIGINT NOT NULL, data STRING) " + - "USING iceberg " + - "TBLPROPERTIES ('format-version'='2')", + sql( + "CREATE TABLE %s " + + "(id BIGINT NOT NULL, data STRING) " + + "USING iceberg " + + "TBLPROPERTIES ('format-version'='2')", tableName); Table table = validationCatalog.loadTable(tableIdent); - Assert.assertEquals("should create table using format v2", - 2, ((BaseTable) table).operations().current().formatVersion()); + Assert.assertEquals( + "should create table using format v2", + 2, + ((BaseTable) table).operations().current().formatVersion()); } @Test 
public void testUpgradeTableWithFormatV2ThroughTableProperty() { Assert.assertFalse("Table should not already exist", validationCatalog.tableExists(tableIdent)); - sql("CREATE TABLE %s " + - "(id BIGINT NOT NULL, data STRING) " + - "USING iceberg " + - "TBLPROPERTIES ('format-version'='1')", + sql( + "CREATE TABLE %s " + + "(id BIGINT NOT NULL, data STRING) " + + "USING iceberg " + + "TBLPROPERTIES ('format-version'='1')", tableName); Table table = validationCatalog.loadTable(tableIdent); TableOperations ops = ((BaseTable) table).operations(); - Assert.assertEquals("should create table using format v1", - 1, ops.refresh().formatVersion()); + Assert.assertEquals("should create table using format v1", 1, ops.refresh().formatVersion()); sql("ALTER TABLE %s SET TBLPROPERTIES ('format-version'='2')", tableName); - Assert.assertEquals("should update table to use format v2", - 2, ops.refresh().formatVersion()); + Assert.assertEquals("should update table to use format v2", 2, ops.refresh().formatVersion()); } @Test public void testDowngradeTableToFormatV1ThroughTablePropertyFails() { Assert.assertFalse("Table should not already exist", validationCatalog.tableExists(tableIdent)); - sql("CREATE TABLE %s " + - "(id BIGINT NOT NULL, data STRING) " + - "USING iceberg " + - "TBLPROPERTIES ('format-version'='2')", + sql( + "CREATE TABLE %s " + + "(id BIGINT NOT NULL, data STRING) " + + "USING iceberg " + + "TBLPROPERTIES ('format-version'='2')", tableName); Table table = validationCatalog.loadTable(tableIdent); TableOperations ops = ((BaseTable) table).operations(); - Assert.assertEquals("should create table using format v2", - 2, ops.refresh().formatVersion()); + Assert.assertEquals("should create table using format v2", 2, ops.refresh().formatVersion()); - AssertHelpers.assertThrowsCause("should fail to downgrade to v1", + AssertHelpers.assertThrowsCause( + "should fail to downgrade to v1", IllegalArgumentException.class, "Cannot downgrade v2 table to v1", () -> sql("ALTER TABLE %s SET TBLPROPERTIES ('format-version'='1')", tableName)); diff --git a/spark/v3.1/spark/src/test/java/org/apache/iceberg/spark/sql/TestCreateTableAsSelect.java b/spark/v3.1/spark/src/test/java/org/apache/iceberg/spark/sql/TestCreateTableAsSelect.java index 4a70327e21a1..2581c0fd3c56 100644 --- a/spark/v3.1/spark/src/test/java/org/apache/iceberg/spark/sql/TestCreateTableAsSelect.java +++ b/spark/v3.1/spark/src/test/java/org/apache/iceberg/spark/sql/TestCreateTableAsSelect.java @@ -16,9 +16,12 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.spark.sql; +import static org.apache.spark.sql.functions.col; +import static org.apache.spark.sql.functions.lit; +import static org.apache.spark.sql.functions.when; + import java.util.Map; import org.apache.iceberg.PartitionSpec; import org.apache.iceberg.Schema; @@ -30,20 +33,19 @@ import org.junit.Assert; import org.junit.Test; -import static org.apache.spark.sql.functions.col; -import static org.apache.spark.sql.functions.lit; -import static org.apache.spark.sql.functions.when; - public class TestCreateTableAsSelect extends SparkCatalogTestBase { private final String sourceName; - public TestCreateTableAsSelect(String catalogName, String implementation, Map config) { + public TestCreateTableAsSelect( + String catalogName, String implementation, Map config) { super(catalogName, implementation, config); this.sourceName = tableName("source"); - sql("CREATE TABLE IF NOT EXISTS %s (id bigint NOT NULL, data string) " + - "USING iceberg PARTITIONED BY (truncate(id, 3))", sourceName); + sql( + "CREATE TABLE IF NOT EXISTS %s (id bigint NOT NULL, data string) " + + "USING iceberg PARTITIONED BY (truncate(id, 3))", + sourceName); sql("INSERT INTO %s VALUES (1, 'a'), (2, 'b'), (3, 'c')", sourceName); } @@ -56,153 +58,178 @@ public void removeTables() { public void testUnpartitionedCTAS() { sql("CREATE TABLE %s USING iceberg AS SELECT * FROM %s", tableName, sourceName); - Schema expectedSchema = new Schema( - Types.NestedField.optional(1, "id", Types.LongType.get()), - Types.NestedField.optional(2, "data", Types.StringType.get()) - ); + Schema expectedSchema = + new Schema( + Types.NestedField.optional(1, "id", Types.LongType.get()), + Types.NestedField.optional(2, "data", Types.StringType.get())); Table ctasTable = validationCatalog.loadTable(tableIdent); - Assert.assertEquals("Should have expected nullable schema", - expectedSchema.asStruct(), ctasTable.schema().asStruct()); - Assert.assertEquals("Should be an unpartitioned table", - 0, ctasTable.spec().fields().size()); - assertEquals("Should have rows matching the source table", + Assert.assertEquals( + "Should have expected nullable schema", + expectedSchema.asStruct(), + ctasTable.schema().asStruct()); + Assert.assertEquals("Should be an unpartitioned table", 0, ctasTable.spec().fields().size()); + assertEquals( + "Should have rows matching the source table", sql("SELECT * FROM %s ORDER BY id", sourceName), sql("SELECT * FROM %s ORDER BY id", tableName)); } @Test public void testPartitionedCTAS() { - sql("CREATE TABLE %s USING iceberg PARTITIONED BY (id) AS SELECT * FROM %s ORDER BY id", tableName, sourceName); + sql( + "CREATE TABLE %s USING iceberg PARTITIONED BY (id) AS SELECT * FROM %s ORDER BY id", + tableName, sourceName); - Schema expectedSchema = new Schema( - Types.NestedField.optional(1, "id", Types.LongType.get()), - Types.NestedField.optional(2, "data", Types.StringType.get()) - ); + Schema expectedSchema = + new Schema( + Types.NestedField.optional(1, "id", Types.LongType.get()), + Types.NestedField.optional(2, "data", Types.StringType.get())); - PartitionSpec expectedSpec = PartitionSpec.builderFor(expectedSchema) - .identity("id") - .build(); + PartitionSpec expectedSpec = PartitionSpec.builderFor(expectedSchema).identity("id").build(); Table ctasTable = validationCatalog.loadTable(tableIdent); - Assert.assertEquals("Should have expected nullable schema", - expectedSchema.asStruct(), ctasTable.schema().asStruct()); - Assert.assertEquals("Should be partitioned by id", - expectedSpec, 
ctasTable.spec()); - assertEquals("Should have rows matching the source table", + Assert.assertEquals( + "Should have expected nullable schema", + expectedSchema.asStruct(), + ctasTable.schema().asStruct()); + Assert.assertEquals("Should be partitioned by id", expectedSpec, ctasTable.spec()); + assertEquals( + "Should have rows matching the source table", sql("SELECT * FROM %s ORDER BY id", sourceName), sql("SELECT * FROM %s ORDER BY id", tableName)); } @Test public void testRTAS() { - sql("CREATE TABLE %s USING iceberg TBLPROPERTIES ('prop1'='val1', 'prop2'='val2')" + - "AS SELECT * FROM %s", tableName, sourceName); + sql( + "CREATE TABLE %s USING iceberg TBLPROPERTIES ('prop1'='val1', 'prop2'='val2')" + + "AS SELECT * FROM %s", + tableName, sourceName); - assertEquals("Should have rows matching the source table", + assertEquals( + "Should have rows matching the source table", sql("SELECT * FROM %s ORDER BY id", sourceName), sql("SELECT * FROM %s ORDER BY id", tableName)); - sql("REPLACE TABLE %s USING iceberg PARTITIONED BY (part) TBLPROPERTIES ('prop1'='newval1', 'prop3'='val3') AS " + - "SELECT id, data, CASE WHEN (id %% 2) = 0 THEN 'even' ELSE 'odd' END AS part " + - "FROM %s ORDER BY 3, 1", tableName, sourceName); + sql( + "REPLACE TABLE %s USING iceberg PARTITIONED BY (part) TBLPROPERTIES ('prop1'='newval1', 'prop3'='val3') AS " + + "SELECT id, data, CASE WHEN (id %% 2) = 0 THEN 'even' ELSE 'odd' END AS part " + + "FROM %s ORDER BY 3, 1", + tableName, sourceName); - Schema expectedSchema = new Schema( - Types.NestedField.optional(1, "id", Types.LongType.get()), - Types.NestedField.optional(2, "data", Types.StringType.get()), - Types.NestedField.optional(3, "part", Types.StringType.get()) - ); + Schema expectedSchema = + new Schema( + Types.NestedField.optional(1, "id", Types.LongType.get()), + Types.NestedField.optional(2, "data", Types.StringType.get()), + Types.NestedField.optional(3, "part", Types.StringType.get())); - PartitionSpec expectedSpec = PartitionSpec.builderFor(expectedSchema) - .identity("part") - .withSpecId(1) - .build(); + PartitionSpec expectedSpec = + PartitionSpec.builderFor(expectedSchema).identity("part").withSpecId(1).build(); Table rtasTable = validationCatalog.loadTable(tableIdent); // the replacement table has a different schema and partition spec than the original - Assert.assertEquals("Should have expected nullable schema", - expectedSchema.asStruct(), rtasTable.schema().asStruct()); - Assert.assertEquals("Should be partitioned by part", - expectedSpec, rtasTable.spec()); - - assertEquals("Should have rows matching the source table", - sql("SELECT id, data, CASE WHEN (id %% 2) = 0 THEN 'even' ELSE 'odd' END AS part " + - "FROM %s ORDER BY id", sourceName), + Assert.assertEquals( + "Should have expected nullable schema", + expectedSchema.asStruct(), + rtasTable.schema().asStruct()); + Assert.assertEquals("Should be partitioned by part", expectedSpec, rtasTable.spec()); + + assertEquals( + "Should have rows matching the source table", + sql( + "SELECT id, data, CASE WHEN (id %% 2) = 0 THEN 'even' ELSE 'odd' END AS part " + + "FROM %s ORDER BY id", + sourceName), sql("SELECT * FROM %s ORDER BY id", tableName)); - Assert.assertEquals("Table should have expected snapshots", - 2, Iterables.size(rtasTable.snapshots())); + Assert.assertEquals( + "Table should have expected snapshots", 2, Iterables.size(rtasTable.snapshots())); - Assert.assertEquals("Should have updated table property", - "newval1", rtasTable.properties().get("prop1")); - 
Assert.assertEquals("Should have preserved table property", - "val2", rtasTable.properties().get("prop2")); - Assert.assertEquals("Should have new table property", - "val3", rtasTable.properties().get("prop3")); + Assert.assertEquals( + "Should have updated table property", "newval1", rtasTable.properties().get("prop1")); + Assert.assertEquals( + "Should have preserved table property", "val2", rtasTable.properties().get("prop2")); + Assert.assertEquals( + "Should have new table property", "val3", rtasTable.properties().get("prop3")); } @Test public void testCreateRTAS() { - sql("CREATE OR REPLACE TABLE %s USING iceberg PARTITIONED BY (part) AS " + - "SELECT id, data, CASE WHEN (id %% 2) = 0 THEN 'even' ELSE 'odd' END AS part " + - "FROM %s ORDER BY 3, 1", tableName, sourceName); - - assertEquals("Should have rows matching the source table", - sql("SELECT id, data, CASE WHEN (id %% 2) = 0 THEN 'even' ELSE 'odd' END AS part " + - "FROM %s ORDER BY id", sourceName), + sql( + "CREATE OR REPLACE TABLE %s USING iceberg PARTITIONED BY (part) AS " + + "SELECT id, data, CASE WHEN (id %% 2) = 0 THEN 'even' ELSE 'odd' END AS part " + + "FROM %s ORDER BY 3, 1", + tableName, sourceName); + + assertEquals( + "Should have rows matching the source table", + sql( + "SELECT id, data, CASE WHEN (id %% 2) = 0 THEN 'even' ELSE 'odd' END AS part " + + "FROM %s ORDER BY id", + sourceName), sql("SELECT * FROM %s ORDER BY id", tableName)); - sql("CREATE OR REPLACE TABLE %s USING iceberg PARTITIONED BY (part) AS " + - "SELECT 2 * id as id, data, CASE WHEN ((2 * id) %% 2) = 0 THEN 'even' ELSE 'odd' END AS part " + - "FROM %s ORDER BY 3, 1", tableName, sourceName); + sql( + "CREATE OR REPLACE TABLE %s USING iceberg PARTITIONED BY (part) AS " + + "SELECT 2 * id as id, data, CASE WHEN ((2 * id) %% 2) = 0 THEN 'even' ELSE 'odd' END AS part " + + "FROM %s ORDER BY 3, 1", + tableName, sourceName); - Schema expectedSchema = new Schema( - Types.NestedField.optional(1, "id", Types.LongType.get()), - Types.NestedField.optional(2, "data", Types.StringType.get()), - Types.NestedField.optional(3, "part", Types.StringType.get()) - ); + Schema expectedSchema = + new Schema( + Types.NestedField.optional(1, "id", Types.LongType.get()), + Types.NestedField.optional(2, "data", Types.StringType.get()), + Types.NestedField.optional(3, "part", Types.StringType.get())); - PartitionSpec expectedSpec = PartitionSpec.builderFor(expectedSchema) - .identity("part") - .withSpecId(0) // the spec is identical and should be reused - .build(); + PartitionSpec expectedSpec = + PartitionSpec.builderFor(expectedSchema) + .identity("part") + .withSpecId(0) // the spec is identical and should be reused + .build(); Table rtasTable = validationCatalog.loadTable(tableIdent); // the replacement table has a different schema and partition spec than the original - Assert.assertEquals("Should have expected nullable schema", - expectedSchema.asStruct(), rtasTable.schema().asStruct()); - Assert.assertEquals("Should be partitioned by part", - expectedSpec, rtasTable.spec()); - - assertEquals("Should have rows matching the source table", - sql("SELECT 2 * id, data, CASE WHEN ((2 * id) %% 2) = 0 THEN 'even' ELSE 'odd' END AS part " + - "FROM %s ORDER BY id", sourceName), + Assert.assertEquals( + "Should have expected nullable schema", + expectedSchema.asStruct(), + rtasTable.schema().asStruct()); + Assert.assertEquals("Should be partitioned by part", expectedSpec, rtasTable.spec()); + + assertEquals( + "Should have rows matching the source table", + sql( + "SELECT 2 
* id, data, CASE WHEN ((2 * id) %% 2) = 0 THEN 'even' ELSE 'odd' END AS part " + + "FROM %s ORDER BY id", + sourceName), sql("SELECT * FROM %s ORDER BY id", tableName)); - Assert.assertEquals("Table should have expected snapshots", - 2, Iterables.size(rtasTable.snapshots())); + Assert.assertEquals( + "Table should have expected snapshots", 2, Iterables.size(rtasTable.snapshots())); } @Test public void testDataFrameV2Create() throws Exception { spark.table(sourceName).writeTo(tableName).using("iceberg").create(); - Schema expectedSchema = new Schema( - Types.NestedField.optional(1, "id", Types.LongType.get()), - Types.NestedField.optional(2, "data", Types.StringType.get()) - ); + Schema expectedSchema = + new Schema( + Types.NestedField.optional(1, "id", Types.LongType.get()), + Types.NestedField.optional(2, "data", Types.StringType.get())); Table ctasTable = validationCatalog.loadTable(tableIdent); - Assert.assertEquals("Should have expected nullable schema", - expectedSchema.asStruct(), ctasTable.schema().asStruct()); - Assert.assertEquals("Should be an unpartitioned table", - 0, ctasTable.spec().fields().size()); - assertEquals("Should have rows matching the source table", + Assert.assertEquals( + "Should have expected nullable schema", + expectedSchema.asStruct(), + ctasTable.schema().asStruct()); + Assert.assertEquals("Should be an unpartitioned table", 0, ctasTable.spec().fields().size()); + assertEquals( + "Should have rows matching the source table", sql("SELECT * FROM %s ORDER BY id", sourceName), sql("SELECT * FROM %s ORDER BY id", tableName)); } @@ -211,11 +238,13 @@ public void testDataFrameV2Create() throws Exception { public void testDataFrameV2Replace() throws Exception { spark.table(sourceName).writeTo(tableName).using("iceberg").create(); - assertEquals("Should have rows matching the source table", + assertEquals( + "Should have rows matching the source table", sql("SELECT * FROM %s ORDER BY id", sourceName), sql("SELECT * FROM %s ORDER BY id", tableName)); - spark.table(sourceName) + spark + .table(sourceName) .select( col("id"), col("data"), @@ -226,37 +255,40 @@ public void testDataFrameV2Replace() throws Exception { .using("iceberg") .replace(); - Schema expectedSchema = new Schema( - Types.NestedField.optional(1, "id", Types.LongType.get()), - Types.NestedField.optional(2, "data", Types.StringType.get()), - Types.NestedField.optional(3, "part", Types.StringType.get()) - ); + Schema expectedSchema = + new Schema( + Types.NestedField.optional(1, "id", Types.LongType.get()), + Types.NestedField.optional(2, "data", Types.StringType.get()), + Types.NestedField.optional(3, "part", Types.StringType.get())); - PartitionSpec expectedSpec = PartitionSpec.builderFor(expectedSchema) - .identity("part") - .withSpecId(1) - .build(); + PartitionSpec expectedSpec = + PartitionSpec.builderFor(expectedSchema).identity("part").withSpecId(1).build(); Table rtasTable = validationCatalog.loadTable(tableIdent); // the replacement table has a different schema and partition spec than the original - Assert.assertEquals("Should have expected nullable schema", - expectedSchema.asStruct(), rtasTable.schema().asStruct()); - Assert.assertEquals("Should be partitioned by part", - expectedSpec, rtasTable.spec()); - - assertEquals("Should have rows matching the source table", - sql("SELECT id, data, CASE WHEN (id %% 2) = 0 THEN 'even' ELSE 'odd' END AS part " + - "FROM %s ORDER BY id", sourceName), + Assert.assertEquals( + "Should have expected nullable schema", + expectedSchema.asStruct(), + 
rtasTable.schema().asStruct()); + Assert.assertEquals("Should be partitioned by part", expectedSpec, rtasTable.spec()); + + assertEquals( + "Should have rows matching the source table", + sql( + "SELECT id, data, CASE WHEN (id %% 2) = 0 THEN 'even' ELSE 'odd' END AS part " + + "FROM %s ORDER BY id", + sourceName), sql("SELECT * FROM %s ORDER BY id", tableName)); - Assert.assertEquals("Table should have expected snapshots", - 2, Iterables.size(rtasTable.snapshots())); + Assert.assertEquals( + "Table should have expected snapshots", 2, Iterables.size(rtasTable.snapshots())); } @Test public void testDataFrameV2CreateOrReplace() { - spark.table(sourceName) + spark + .table(sourceName) .select( col("id"), col("data"), @@ -267,12 +299,16 @@ public void testDataFrameV2CreateOrReplace() { .using("iceberg") .createOrReplace(); - assertEquals("Should have rows matching the source table", - sql("SELECT id, data, CASE WHEN (id %% 2) = 0 THEN 'even' ELSE 'odd' END AS part " + - "FROM %s ORDER BY id", sourceName), + assertEquals( + "Should have rows matching the source table", + sql( + "SELECT id, data, CASE WHEN (id %% 2) = 0 THEN 'even' ELSE 'odd' END AS part " + + "FROM %s ORDER BY id", + sourceName), sql("SELECT * FROM %s ORDER BY id", tableName)); - spark.table(sourceName) + spark + .table(sourceName) .select(col("id").multiply(lit(2)).as("id"), col("data")) .select( col("id"), @@ -284,80 +320,97 @@ public void testDataFrameV2CreateOrReplace() { .using("iceberg") .createOrReplace(); - Schema expectedSchema = new Schema( - Types.NestedField.optional(1, "id", Types.LongType.get()), - Types.NestedField.optional(2, "data", Types.StringType.get()), - Types.NestedField.optional(3, "part", Types.StringType.get()) - ); + Schema expectedSchema = + new Schema( + Types.NestedField.optional(1, "id", Types.LongType.get()), + Types.NestedField.optional(2, "data", Types.StringType.get()), + Types.NestedField.optional(3, "part", Types.StringType.get())); - PartitionSpec expectedSpec = PartitionSpec.builderFor(expectedSchema) - .identity("part") - .withSpecId(0) // the spec is identical and should be reused - .build(); + PartitionSpec expectedSpec = + PartitionSpec.builderFor(expectedSchema) + .identity("part") + .withSpecId(0) // the spec is identical and should be reused + .build(); Table rtasTable = validationCatalog.loadTable(tableIdent); // the replacement table has a different schema and partition spec than the original - Assert.assertEquals("Should have expected nullable schema", - expectedSchema.asStruct(), rtasTable.schema().asStruct()); - Assert.assertEquals("Should be partitioned by part", - expectedSpec, rtasTable.spec()); - - assertEquals("Should have rows matching the source table", - sql("SELECT 2 * id, data, CASE WHEN ((2 * id) %% 2) = 0 THEN 'even' ELSE 'odd' END AS part " + - "FROM %s ORDER BY id", sourceName), + Assert.assertEquals( + "Should have expected nullable schema", + expectedSchema.asStruct(), + rtasTable.schema().asStruct()); + Assert.assertEquals("Should be partitioned by part", expectedSpec, rtasTable.spec()); + + assertEquals( + "Should have rows matching the source table", + sql( + "SELECT 2 * id, data, CASE WHEN ((2 * id) %% 2) = 0 THEN 'even' ELSE 'odd' END AS part " + + "FROM %s ORDER BY id", + sourceName), sql("SELECT * FROM %s ORDER BY id", tableName)); - Assert.assertEquals("Table should have expected snapshots", - 2, Iterables.size(rtasTable.snapshots())); + Assert.assertEquals( + "Table should have expected snapshots", 2, Iterables.size(rtasTable.snapshots())); } @Test 
public void testCreateRTASWithPartitionSpecChanging() { - sql("CREATE OR REPLACE TABLE %s USING iceberg PARTITIONED BY (part) AS " + - "SELECT id, data, CASE WHEN (id %% 2) = 0 THEN 'even' ELSE 'odd' END AS part " + - "FROM %s ORDER BY 3, 1", tableName, sourceName); + sql( + "CREATE OR REPLACE TABLE %s USING iceberg PARTITIONED BY (part) AS " + + "SELECT id, data, CASE WHEN (id %% 2) = 0 THEN 'even' ELSE 'odd' END AS part " + + "FROM %s ORDER BY 3, 1", + tableName, sourceName); Table rtasTable = validationCatalog.loadTable(tableIdent); - assertEquals("Should have rows matching the source table", - sql("SELECT id, data, CASE WHEN (id %% 2) = 0 THEN 'even' ELSE 'odd' END AS part " + - "FROM %s ORDER BY id", sourceName), + assertEquals( + "Should have rows matching the source table", + sql( + "SELECT id, data, CASE WHEN (id %% 2) = 0 THEN 'even' ELSE 'odd' END AS part " + + "FROM %s ORDER BY id", + sourceName), sql("SELECT * FROM %s ORDER BY id", tableName)); // Change the partitioning of the table rtasTable.updateSpec().removeField("part").commit(); // Spec 1 - sql("CREATE OR REPLACE TABLE %s USING iceberg PARTITIONED BY (part, id) AS " + - "SELECT 2 * id as id, data, CASE WHEN ((2 * id) %% 2) = 0 THEN 'even' ELSE 'odd' END AS part " + - "FROM %s ORDER BY 3, 1", tableName, sourceName); + sql( + "CREATE OR REPLACE TABLE %s USING iceberg PARTITIONED BY (part, id) AS " + + "SELECT 2 * id as id, data, CASE WHEN ((2 * id) %% 2) = 0 THEN 'even' ELSE 'odd' END AS part " + + "FROM %s ORDER BY 3, 1", + tableName, sourceName); - Schema expectedSchema = new Schema( - Types.NestedField.optional(1, "id", Types.LongType.get()), - Types.NestedField.optional(2, "data", Types.StringType.get()), - Types.NestedField.optional(3, "part", Types.StringType.get()) - ); + Schema expectedSchema = + new Schema( + Types.NestedField.optional(1, "id", Types.LongType.get()), + Types.NestedField.optional(2, "data", Types.StringType.get()), + Types.NestedField.optional(3, "part", Types.StringType.get())); - PartitionSpec expectedSpec = PartitionSpec.builderFor(expectedSchema) - .alwaysNull("part", "part_1000") - .identity("part") - .identity("id") - .withSpecId(2) // The Spec is new - .build(); + PartitionSpec expectedSpec = + PartitionSpec.builderFor(expectedSchema) + .alwaysNull("part", "part_1000") + .identity("part") + .identity("id") + .withSpecId(2) // The Spec is new + .build(); - Assert.assertEquals("Should be partitioned by part and id", - expectedSpec, rtasTable.spec()); + Assert.assertEquals("Should be partitioned by part and id", expectedSpec, rtasTable.spec()); // the replacement table has a different schema and partition spec than the original - Assert.assertEquals("Should have expected nullable schema", - expectedSchema.asStruct(), rtasTable.schema().asStruct()); - - assertEquals("Should have rows matching the source table", - sql("SELECT 2 * id, data, CASE WHEN ((2 * id) %% 2) = 0 THEN 'even' ELSE 'odd' END AS part " + - "FROM %s ORDER BY id", sourceName), + Assert.assertEquals( + "Should have expected nullable schema", + expectedSchema.asStruct(), + rtasTable.schema().asStruct()); + + assertEquals( + "Should have rows matching the source table", + sql( + "SELECT 2 * id, data, CASE WHEN ((2 * id) %% 2) = 0 THEN 'even' ELSE 'odd' END AS part " + + "FROM %s ORDER BY id", + sourceName), sql("SELECT * FROM %s ORDER BY id", tableName)); - Assert.assertEquals("Table should have expected snapshots", - 2, Iterables.size(rtasTable.snapshots())); + Assert.assertEquals( + "Table should have expected snapshots", 2, 
Iterables.size(rtasTable.snapshots())); } } diff --git a/spark/v3.1/spark/src/test/java/org/apache/iceberg/spark/sql/TestDeleteFrom.java b/spark/v3.1/spark/src/test/java/org/apache/iceberg/spark/sql/TestDeleteFrom.java index 9b490ec03bff..68335e25b6c3 100644 --- a/spark/v3.1/spark/src/test/java/org/apache/iceberg/spark/sql/TestDeleteFrom.java +++ b/spark/v3.1/spark/src/test/java/org/apache/iceberg/spark/sql/TestDeleteFrom.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.sql; import java.util.List; @@ -54,18 +53,18 @@ public void testDeleteFromTableAtSnapshot() throws NoSuchTableException { sql("CREATE TABLE %s (id bigint, data string) USING iceberg", tableName); - List records = Lists.newArrayList( - new SimpleRecord(1, "a"), - new SimpleRecord(2, "b"), - new SimpleRecord(3, "c") - ); + List records = + Lists.newArrayList( + new SimpleRecord(1, "a"), new SimpleRecord(2, "b"), new SimpleRecord(3, "c")); Dataset df = spark.createDataFrame(records, SimpleRecord.class); df.coalesce(1).writeTo(tableName).append(); long snapshotId = validationCatalog.loadTable(tableIdent).currentSnapshot().snapshotId(); String prefix = "snapshot_id_"; - AssertHelpers.assertThrows("Should not be able to delete from a table at a specific snapshot", - IllegalArgumentException.class, "Cannot delete from table at a specific snapshot", + AssertHelpers.assertThrows( + "Should not be able to delete from a table at a specific snapshot", + IllegalArgumentException.class, + "Cannot delete from table at a specific snapshot", () -> sql("DELETE FROM %s.%s WHERE id < 4", tableName, prefix + snapshotId)); } @@ -74,7 +73,8 @@ public void testDeleteFromWhereFalse() { sql("CREATE TABLE %s (id bigint NOT NULL, data string) USING iceberg", tableName); sql("INSERT INTO TABLE %s VALUES (1, 'a'), (2, 'b'), (3, 'c')", tableName); - assertEquals("Should have expected rows", + assertEquals( + "Should have expected rows", ImmutableList.of(row(1L, "a"), row(2L, "b"), row(3L, "c")), sql("SELECT * FROM %s ORDER BY id", tableName)); @@ -85,6 +85,7 @@ public void testDeleteFromWhereFalse() { table.refresh(); - Assert.assertEquals("Delete should not produce a new snapshot", 1, Iterables.size(table.snapshots())); + Assert.assertEquals( + "Delete should not produce a new snapshot", 1, Iterables.size(table.snapshots())); } } diff --git a/spark/v3.1/spark/src/test/java/org/apache/iceberg/spark/sql/TestDropTable.java b/spark/v3.1/spark/src/test/java/org/apache/iceberg/spark/sql/TestDropTable.java index a0fbcaab3538..52a99854c5b4 100644 --- a/spark/v3.1/spark/src/test/java/org/apache/iceberg/spark/sql/TestDropTable.java +++ b/spark/v3.1/spark/src/test/java/org/apache/iceberg/spark/sql/TestDropTable.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
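The CREATE OR REPLACE AS SELECT tests above hinge on how Iceberg assigns partition spec IDs: dropping a field via updateSpec() moves the table to spec 1, and a later replace that re-partitions it produces spec 2 with the removed field kept as an always-null ("void") field. A minimal sketch of that sequence, assuming `table` is an already-loaded org.apache.iceberg.Table with the schema these tests use (the class and method names below are illustrative, not part of the diff):

    import org.apache.iceberg.PartitionSpec;
    import org.apache.iceberg.Schema;
    import org.apache.iceberg.Table;
    import org.apache.iceberg.types.Types;

    class SpecEvolutionSketch {
      static PartitionSpec expectedSpecAfterReplace(Table table) {
        // Drop the identity partition on "part": the table moves from spec 0 to spec 1.
        table.updateSpec().removeField("part").commit();

        // After CREATE OR REPLACE ... PARTITIONED BY (part, id), the test expects spec 2,
        // with the previously removed field retained as an always-null ("void") field.
        Schema schema =
            new Schema(
                Types.NestedField.optional(1, "id", Types.LongType.get()),
                Types.NestedField.optional(2, "data", Types.StringType.get()),
                Types.NestedField.optional(3, "part", Types.StringType.get()));
        return PartitionSpec.builderFor(schema)
            .alwaysNull("part", "part_1000")
            .identity("part")
            .identity("id")
            .withSpecId(2)
            .build();
      }
    }

The builder-produced spec is what the assertion compares against table.spec() once the second replace commits.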
*/ - package org.apache.iceberg.spark.sql; import java.io.IOException; @@ -66,11 +65,14 @@ public void testDropTableGCDisabled() throws IOException { } private void dropTableInternal() throws IOException { - assertEquals("Should have expected rows", - ImmutableList.of(row(1, "test")), sql("SELECT * FROM %s", tableName)); + assertEquals( + "Should have expected rows", + ImmutableList.of(row(1, "test")), + sql("SELECT * FROM %s", tableName)); List manifestAndFiles = manifestsAndFiles(); - Assert.assertEquals("There should be 2 files for manifests and files", 2, manifestAndFiles.size()); + Assert.assertEquals( + "There should be 2 files for manifests and files", 2, manifestAndFiles.size()); Assert.assertTrue("All files should be existed", checkFilesExist(manifestAndFiles, true)); sql("DROP TABLE %s", tableName); @@ -86,11 +88,14 @@ private void dropTableInternal() throws IOException { @Test public void testPurgeTable() throws IOException { - assertEquals("Should have expected rows", - ImmutableList.of(row(1, "test")), sql("SELECT * FROM %s", tableName)); + assertEquals( + "Should have expected rows", + ImmutableList.of(row(1, "test")), + sql("SELECT * FROM %s", tableName)); List manifestAndFiles = manifestsAndFiles(); - Assert.assertEquals("There should be 2 files for manifests and files", 2, manifestAndFiles.size()); + Assert.assertEquals( + "There should be 2 files for manifests and files", 2, manifestAndFiles.size()); Assert.assertTrue("All files should exist", checkFilesExist(manifestAndFiles, true)); sql("DROP TABLE %s PURGE", tableName); @@ -102,14 +107,19 @@ public void testPurgeTable() throws IOException { public void testPurgeTableGCDisabled() throws IOException { sql("ALTER TABLE %s SET TBLPROPERTIES (gc.enabled = false)", tableName); - assertEquals("Should have expected rows", - ImmutableList.of(row(1, "test")), sql("SELECT * FROM %s", tableName)); + assertEquals( + "Should have expected rows", + ImmutableList.of(row(1, "test")), + sql("SELECT * FROM %s", tableName)); List manifestAndFiles = manifestsAndFiles(); - Assert.assertEquals("There totally should have 2 files for manifests and files", 2, manifestAndFiles.size()); + Assert.assertEquals( + "There totally should have 2 files for manifests and files", 2, manifestAndFiles.size()); Assert.assertTrue("All files should be existed", checkFilesExist(manifestAndFiles, true)); - AssertHelpers.assertThrows("Purge table is not allowed when GC is disabled", ValidationException.class, + AssertHelpers.assertThrows( + "Purge table is not allowed when GC is disabled", + ValidationException.class, "Cannot purge table: GC is disabled (deleting files may corrupt other tables", () -> sql("DROP TABLE %s PURGE", tableName)); @@ -119,15 +129,21 @@ public void testPurgeTableGCDisabled() throws IOException { private List manifestsAndFiles() { List files = readMetadataTableToJavaList(MetadataTableType.FILES.name(), "file_path"); - List manifests = readMetadataTableToJavaList(MetadataTableType.MANIFESTS.name(), "path"); - return Streams.concat(files.stream(), manifests.stream()).map(row -> (String) row[0]).collect(Collectors.toList()); + List manifests = + readMetadataTableToJavaList(MetadataTableType.MANIFESTS.name(), "path"); + return Streams.concat(files.stream(), manifests.stream()) + .map(row -> (String) row[0]) + .collect(Collectors.toList()); } private List readMetadataTableToJavaList(String metadataTableType, String column) { - List rows = spark.read().format("iceberg") - .load(String.format("%s.%s", tableName, metadataTableType)) - 
.select(column) - .collectAsList(); + List rows = + spark + .read() + .format("iceberg") + .load(String.format("%s.%s", tableName, metadataTableType)) + .select(column) + .collectAsList(); if (rows.size() < 1) { return ImmutableList.of(); @@ -143,12 +159,14 @@ private boolean checkFilesExist(List files, boolean shouldExist) throws } FileSystem fs = new Path(files.get(0)).getFileSystem(hiveConf); - return files.stream().allMatch(file -> { - try { - return fs.exists(new Path(file)) ^ mask; - } catch (IOException e) { - throw new RuntimeException(e); - } - }); + return files.stream() + .allMatch( + file -> { + try { + return fs.exists(new Path(file)) ^ mask; + } catch (IOException e) { + throw new RuntimeException(e); + } + }); } } diff --git a/spark/v3.1/spark/src/test/java/org/apache/iceberg/spark/sql/TestNamespaceSQL.java b/spark/v3.1/spark/src/test/java/org/apache/iceberg/spark/sql/TestNamespaceSQL.java index d1eac312669a..317a95cd0140 100644 --- a/spark/v3.1/spark/src/test/java/org/apache/iceberg/spark/sql/TestNamespaceSQL.java +++ b/spark/v3.1/spark/src/test/java/org/apache/iceberg/spark/sql/TestNamespaceSQL.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.sql; import java.io.File; @@ -56,7 +55,8 @@ public void cleanNamespaces() { @Test public void testCreateNamespace() { - Assert.assertFalse("Namespace should not already exist", validationNamespaceCatalog.namespaceExists(NS)); + Assert.assertFalse( + "Namespace should not already exist", validationNamespaceCatalog.namespaceExists(NS)); sql("CREATE NAMESPACE %s", fullNamespace); @@ -76,7 +76,8 @@ public void testDefaultNamespace() { @Test public void testDropEmptyNamespace() { - Assert.assertFalse("Namespace should not already exist", validationNamespaceCatalog.namespaceExists(NS)); + Assert.assertFalse( + "Namespace should not already exist", validationNamespaceCatalog.namespaceExists(NS)); sql("CREATE NAMESPACE %s", fullNamespace); @@ -84,23 +85,28 @@ public void testDropEmptyNamespace() { sql("DROP NAMESPACE %s", fullNamespace); - Assert.assertFalse("Namespace should have been dropped", validationNamespaceCatalog.namespaceExists(NS)); + Assert.assertFalse( + "Namespace should have been dropped", validationNamespaceCatalog.namespaceExists(NS)); } @Test public void testDropNonEmptyNamespace() { Assume.assumeFalse("Session catalog has flaky behavior", "spark_catalog".equals(catalogName)); - Assert.assertFalse("Namespace should not already exist", validationNamespaceCatalog.namespaceExists(NS)); + Assert.assertFalse( + "Namespace should not already exist", validationNamespaceCatalog.namespaceExists(NS)); sql("CREATE NAMESPACE %s", fullNamespace); sql("CREATE TABLE %s.table (id bigint) USING iceberg", fullNamespace); Assert.assertTrue("Namespace should exist", validationNamespaceCatalog.namespaceExists(NS)); - Assert.assertTrue("Table should exist", validationCatalog.tableExists(TableIdentifier.of(NS, "table"))); + Assert.assertTrue( + "Table should exist", validationCatalog.tableExists(TableIdentifier.of(NS, "table"))); - AssertHelpers.assertThrows("Should fail if trying to delete a non-empty namespace", - SparkException.class, "non-empty namespace", + AssertHelpers.assertThrows( + "Should fail if trying to delete a non-empty namespace", + SparkException.class, + "non-empty namespace", () -> sql("DROP NAMESPACE %s", fullNamespace)); sql("DROP TABLE %s.table", fullNamespace); @@ -108,7 +114,8 @@ public void testDropNonEmptyNamespace() { @Test public 
void testListTables() { - Assert.assertFalse("Namespace should not already exist", validationNamespaceCatalog.namespaceExists(NS)); + Assert.assertFalse( + "Namespace should not already exist", validationNamespaceCatalog.namespaceExists(NS)); sql("CREATE NAMESPACE %s", fullNamespace); @@ -126,7 +133,8 @@ public void testListTables() { @Test public void testListNamespace() { - Assert.assertFalse("Namespace should not already exist", validationNamespaceCatalog.namespaceExists(NS)); + Assert.assertFalse( + "Namespace should not already exist", validationNamespaceCatalog.namespaceExists(NS)); sql("CREATE NAMESPACE %s", fullNamespace); @@ -136,17 +144,23 @@ public void testListNamespace() { if (isHadoopCatalog) { Assert.assertEquals("Should have 1 namespace", 1, namespaces.size()); - Set namespaceNames = namespaces.stream().map(arr -> arr[0].toString()).collect(Collectors.toSet()); + Set namespaceNames = + namespaces.stream().map(arr -> arr[0].toString()).collect(Collectors.toSet()); Assert.assertEquals("Should have only db namespace", ImmutableSet.of("db"), namespaceNames); } else { Assert.assertEquals("Should have 2 namespaces", 2, namespaces.size()); - Set namespaceNames = namespaces.stream().map(arr -> arr[0].toString()).collect(Collectors.toSet()); - Assert.assertEquals("Should have default and db namespaces", ImmutableSet.of("default", "db"), namespaceNames); + Set namespaceNames = + namespaces.stream().map(arr -> arr[0].toString()).collect(Collectors.toSet()); + Assert.assertEquals( + "Should have default and db namespaces", + ImmutableSet.of("default", "db"), + namespaceNames); } List nestedNamespaces = sql("SHOW NAMESPACES IN %s", fullNamespace); - Set nestedNames = nestedNamespaces.stream().map(arr -> arr[0].toString()).collect(Collectors.toSet()); + Set nestedNames = + nestedNamespaces.stream().map(arr -> arr[0].toString()).collect(Collectors.toSet()); Assert.assertEquals("Should not have nested namespaces", ImmutableSet.of(), nestedNames); } @@ -154,7 +168,8 @@ public void testListNamespace() { public void testCreateNamespaceWithMetadata() { Assume.assumeFalse("HadoopCatalog does not support namespace metadata", isHadoopCatalog); - Assert.assertFalse("Namespace should not already exist", validationNamespaceCatalog.namespaceExists(NS)); + Assert.assertFalse( + "Namespace should not already exist", validationNamespaceCatalog.namespaceExists(NS)); sql("CREATE NAMESPACE %s WITH PROPERTIES ('prop'='value')", fullNamespace); @@ -162,14 +177,16 @@ public void testCreateNamespaceWithMetadata() { Map nsMetadata = validationNamespaceCatalog.loadNamespaceMetadata(NS); - Assert.assertEquals("Namespace should have expected prop value", "value", nsMetadata.get("prop")); + Assert.assertEquals( + "Namespace should have expected prop value", "value", nsMetadata.get("prop")); } @Test public void testCreateNamespaceWithComment() { Assume.assumeFalse("HadoopCatalog does not support namespace metadata", isHadoopCatalog); - Assert.assertFalse("Namespace should not already exist", validationNamespaceCatalog.namespaceExists(NS)); + Assert.assertFalse( + "Namespace should not already exist", validationNamespaceCatalog.namespaceExists(NS)); sql("CREATE NAMESPACE %s COMMENT 'namespace doc'", fullNamespace); @@ -177,14 +194,16 @@ public void testCreateNamespaceWithComment() { Map nsMetadata = validationNamespaceCatalog.loadNamespaceMetadata(NS); - Assert.assertEquals("Namespace should have expected comment", "namespace doc", nsMetadata.get("comment")); + Assert.assertEquals( + "Namespace should have expected 
comment", "namespace doc", nsMetadata.get("comment")); } @Test public void testCreateNamespaceWithLocation() throws Exception { Assume.assumeFalse("HadoopCatalog does not support namespace locations", isHadoopCatalog); - Assert.assertFalse("Namespace should not already exist", validationNamespaceCatalog.namespaceExists(NS)); + Assert.assertFalse( + "Namespace should not already exist", validationNamespaceCatalog.namespaceExists(NS)); File location = temp.newFile(); Assert.assertTrue(location.delete()); @@ -195,27 +214,32 @@ public void testCreateNamespaceWithLocation() throws Exception { Map nsMetadata = validationNamespaceCatalog.loadNamespaceMetadata(NS); - Assert.assertEquals("Namespace should have expected location", - "file:" + location.getPath(), nsMetadata.get("location")); + Assert.assertEquals( + "Namespace should have expected location", + "file:" + location.getPath(), + nsMetadata.get("location")); } @Test public void testSetProperties() { Assume.assumeFalse("HadoopCatalog does not support namespace metadata", isHadoopCatalog); - Assert.assertFalse("Namespace should not already exist", validationNamespaceCatalog.namespaceExists(NS)); + Assert.assertFalse( + "Namespace should not already exist", validationNamespaceCatalog.namespaceExists(NS)); sql("CREATE NAMESPACE %s", fullNamespace); Assert.assertTrue("Namespace should exist", validationNamespaceCatalog.namespaceExists(NS)); Map defaultMetadata = validationNamespaceCatalog.loadNamespaceMetadata(NS); - Assert.assertFalse("Default metadata should not have custom property", defaultMetadata.containsKey("prop")); + Assert.assertFalse( + "Default metadata should not have custom property", defaultMetadata.containsKey("prop")); sql("ALTER NAMESPACE %s SET PROPERTIES ('prop'='value')", fullNamespace); Map nsMetadata = validationNamespaceCatalog.loadNamespaceMetadata(NS); - Assert.assertEquals("Namespace should have expected prop value", "value", nsMetadata.get("prop")); + Assert.assertEquals( + "Namespace should have expected prop value", "value", nsMetadata.get("prop")); } } diff --git a/spark/v3.1/spark/src/test/java/org/apache/iceberg/spark/sql/TestPartitionedWrites.java b/spark/v3.1/spark/src/test/java/org/apache/iceberg/spark/sql/TestPartitionedWrites.java index 9223797ada32..51c56ac79d4d 100644 --- a/spark/v3.1/spark/src/test/java/org/apache/iceberg/spark/sql/TestPartitionedWrites.java +++ b/spark/v3.1/spark/src/test/java/org/apache/iceberg/spark/sql/TestPartitionedWrites.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.spark.sql; import java.util.List; @@ -34,13 +33,16 @@ import org.junit.Test; public class TestPartitionedWrites extends SparkCatalogTestBase { - public TestPartitionedWrites(String catalogName, String implementation, Map config) { + public TestPartitionedWrites( + String catalogName, String implementation, Map config) { super(catalogName, implementation, config); } @Before public void createTables() { - sql("CREATE TABLE %s (id bigint, data string) USING iceberg PARTITIONED BY (truncate(id, 3))", tableName); + sql( + "CREATE TABLE %s (id bigint, data string) USING iceberg PARTITIONED BY (truncate(id, 3))", + tableName); sql("INSERT INTO %s VALUES (1, 'a'), (2, 'b'), (3, 'c')", tableName); } @@ -55,17 +57,14 @@ public void testInsertAppend() { sql("INSERT INTO %s VALUES (4, 'd'), (5, 'e')", tableName); - Assert.assertEquals("Should have 5 rows after insert", 5L, scalarSql("SELECT count(*) FROM %s", tableName)); + Assert.assertEquals( + "Should have 5 rows after insert", 5L, scalarSql("SELECT count(*) FROM %s", tableName)); - List expected = ImmutableList.of( - row(1L, "a"), - row(2L, "b"), - row(3L, "c"), - row(4L, "d"), - row(5L, "e") - ); + List expected = + ImmutableList.of(row(1L, "a"), row(2L, "b"), row(3L, "c"), row(4L, "d"), row(5L, "e")); - assertEquals("Row data should match expected", expected, sql("SELECT * FROM %s ORDER BY id", tableName)); + assertEquals( + "Row data should match expected", expected, sql("SELECT * FROM %s ORDER BY id", tableName)); } @Test @@ -75,88 +74,70 @@ public void testInsertOverwrite() { // 4 and 5 replace 3 in the partition (id - (id % 3)) = 3 sql("INSERT OVERWRITE %s VALUES (4, 'd'), (5, 'e')", tableName); - Assert.assertEquals("Should have 4 rows after overwrite", 4L, scalarSql("SELECT count(*) FROM %s", tableName)); + Assert.assertEquals( + "Should have 4 rows after overwrite", 4L, scalarSql("SELECT count(*) FROM %s", tableName)); - List expected = ImmutableList.of( - row(1L, "a"), - row(2L, "b"), - row(4L, "d"), - row(5L, "e") - ); + List expected = + ImmutableList.of(row(1L, "a"), row(2L, "b"), row(4L, "d"), row(5L, "e")); - assertEquals("Row data should match expected", expected, sql("SELECT * FROM %s ORDER BY id", tableName)); + assertEquals( + "Row data should match expected", expected, sql("SELECT * FROM %s ORDER BY id", tableName)); } @Test public void testDataFrameV2Append() throws NoSuchTableException { Assert.assertEquals("Should have 3 rows", 3L, scalarSql("SELECT count(*) FROM %s", tableName)); - List data = ImmutableList.of( - new SimpleRecord(4, "d"), - new SimpleRecord(5, "e") - ); + List data = ImmutableList.of(new SimpleRecord(4, "d"), new SimpleRecord(5, "e")); Dataset ds = spark.createDataFrame(data, SimpleRecord.class); ds.writeTo(tableName).append(); - Assert.assertEquals("Should have 5 rows after insert", 5L, scalarSql("SELECT count(*) FROM %s", tableName)); + Assert.assertEquals( + "Should have 5 rows after insert", 5L, scalarSql("SELECT count(*) FROM %s", tableName)); - List expected = ImmutableList.of( - row(1L, "a"), - row(2L, "b"), - row(3L, "c"), - row(4L, "d"), - row(5L, "e") - ); + List expected = + ImmutableList.of(row(1L, "a"), row(2L, "b"), row(3L, "c"), row(4L, "d"), row(5L, "e")); - assertEquals("Row data should match expected", expected, sql("SELECT * FROM %s ORDER BY id", tableName)); + assertEquals( + "Row data should match expected", expected, sql("SELECT * FROM %s ORDER BY id", tableName)); } @Test public void testDataFrameV2DynamicOverwrite() throws NoSuchTableException { 
Assert.assertEquals("Should have 3 rows", 3L, scalarSql("SELECT count(*) FROM %s", tableName)); - List data = ImmutableList.of( - new SimpleRecord(4, "d"), - new SimpleRecord(5, "e") - ); + List data = ImmutableList.of(new SimpleRecord(4, "d"), new SimpleRecord(5, "e")); Dataset ds = spark.createDataFrame(data, SimpleRecord.class); ds.writeTo(tableName).overwritePartitions(); - Assert.assertEquals("Should have 4 rows after overwrite", 4L, scalarSql("SELECT count(*) FROM %s", tableName)); + Assert.assertEquals( + "Should have 4 rows after overwrite", 4L, scalarSql("SELECT count(*) FROM %s", tableName)); - List expected = ImmutableList.of( - row(1L, "a"), - row(2L, "b"), - row(4L, "d"), - row(5L, "e") - ); + List expected = + ImmutableList.of(row(1L, "a"), row(2L, "b"), row(4L, "d"), row(5L, "e")); - assertEquals("Row data should match expected", expected, sql("SELECT * FROM %s ORDER BY id", tableName)); + assertEquals( + "Row data should match expected", expected, sql("SELECT * FROM %s ORDER BY id", tableName)); } @Test public void testDataFrameV2Overwrite() throws NoSuchTableException { Assert.assertEquals("Should have 3 rows", 3L, scalarSql("SELECT count(*) FROM %s", tableName)); - List data = ImmutableList.of( - new SimpleRecord(4, "d"), - new SimpleRecord(5, "e") - ); + List data = ImmutableList.of(new SimpleRecord(4, "d"), new SimpleRecord(5, "e")); Dataset ds = spark.createDataFrame(data, SimpleRecord.class); ds.writeTo(tableName).overwrite(functions.col("id").$less(3)); - Assert.assertEquals("Should have 3 rows after overwrite", 3L, scalarSql("SELECT count(*) FROM %s", tableName)); + Assert.assertEquals( + "Should have 3 rows after overwrite", 3L, scalarSql("SELECT count(*) FROM %s", tableName)); - List expected = ImmutableList.of( - row(3L, "c"), - row(4L, "d"), - row(5L, "e") - ); + List expected = ImmutableList.of(row(3L, "c"), row(4L, "d"), row(5L, "e")); - assertEquals("Row data should match expected", expected, sql("SELECT * FROM %s ORDER BY id", tableName)); + assertEquals( + "Row data should match expected", expected, sql("SELECT * FROM %s ORDER BY id", tableName)); } @Test @@ -166,13 +147,13 @@ public void testViewsReturnRecentResults() { Dataset query = spark.sql("SELECT * FROM " + tableName + " WHERE id = 1"); query.createOrReplaceTempView("tmp"); - assertEquals("View should have expected rows", - ImmutableList.of(row(1L, "a")), - sql("SELECT * FROM tmp")); + assertEquals( + "View should have expected rows", ImmutableList.of(row(1L, "a")), sql("SELECT * FROM tmp")); sql("INSERT INTO TABLE %s VALUES (1, 'a')", tableName); - assertEquals("View should have expected rows", + assertEquals( + "View should have expected rows", ImmutableList.of(row(1L, "a"), row(1L, "a")), sql("SELECT * FROM tmp")); } diff --git a/spark/v3.1/spark/src/test/java/org/apache/iceberg/spark/sql/TestRefreshTable.java b/spark/v3.1/spark/src/test/java/org/apache/iceberg/spark/sql/TestRefreshTable.java index a8bdea77e237..3eaca6329477 100644 --- a/spark/v3.1/spark/src/test/java/org/apache/iceberg/spark/sql/TestRefreshTable.java +++ b/spark/v3.1/spark/src/test/java/org/apache/iceberg/spark/sql/TestRefreshTable.java @@ -16,8 +16,6 @@ * specific language governing permissions and limitations * under the License. 
*/ - - package org.apache.iceberg.spark.sql; import java.util.List; @@ -49,7 +47,8 @@ public void removeTables() { @Test public void testRefreshCommand() { - // We are not allowed to change the session catalog after it has been initialized, so build a new one + // We are not allowed to change the session catalog after it has been initialized, so build a + // new one if (catalogName.equals("spark_catalog")) { spark.conf().set("spark.sql.catalog." + catalogName + ".cache-enabled", true); spark = spark.cloneSession(); @@ -59,7 +58,8 @@ public void testRefreshCommand() { List originalActual = sql("SELECT * FROM %s", tableName); assertEquals("Table should start as expected", originalExpected, originalActual); - // Modify table outside of spark, it should be cached so Spark should see the same value after mutation + // Modify table outside of spark, it should be cached so Spark should see the same value after + // mutation Table table = validationCatalog.loadTable(tableIdent); DataFile file = table.currentSnapshot().addedDataFiles(table.io()).iterator().next(); table.newDelete().deleteFile(file).commit(); diff --git a/spark/v3.1/spark/src/test/java/org/apache/iceberg/spark/sql/TestSelect.java b/spark/v3.1/spark/src/test/java/org/apache/iceberg/spark/sql/TestSelect.java index 38dfe0e5afda..f20ded4c7b2a 100644 --- a/spark/v3.1/spark/src/test/java/org/apache/iceberg/spark/sql/TestSelect.java +++ b/spark/v3.1/spark/src/test/java/org/apache/iceberg/spark/sql/TestSelect.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.sql; import java.util.List; @@ -45,10 +44,12 @@ public TestSelect(String catalogName, String implementation, Map super(catalogName, implementation, config); // register a scan event listener to validate pushdown - Listeners.register(event -> { - scanEventCount += 1; - lastScanEvent = event; - }, ScanEvent.class); + Listeners.register( + event -> { + scanEventCount += 1; + lastScanEvent = event; + }, + ScanEvent.class); } @Before @@ -67,8 +68,8 @@ public void removeTables() { @Test public void testSelect() { - List expected = ImmutableList.of( - row(1L, "a", 1.0F), row(2L, "b", 2.0F), row(3L, "c", Float.NaN)); + List expected = + ImmutableList.of(row(1L, "a", 1.0F), row(2L, "b", 2.0F), row(3L, "c", Float.NaN)); assertEquals("Should return all expected rows", expected, sql("SELECT * FROM %s", tableName)); } @@ -77,11 +78,14 @@ public void testSelect() { public void testSelectRewrite() { List expected = ImmutableList.of(row(3L, "c", Float.NaN)); - assertEquals("Should return all expected rows", expected, + assertEquals( + "Should return all expected rows", + expected, sql("SELECT * FROM %s where float = float('NaN')", tableName)); Assert.assertEquals("Should create only one scan", 1, scanEventCount); - Assert.assertEquals("Should push down expected filter", + Assert.assertEquals( + "Should push down expected filter", "(float IS NOT NULL AND is_nan(float))", Spark3Util.describe(lastScanEvent.filter())); } @@ -93,8 +97,10 @@ public void testProjection() { assertEquals("Should return all expected rows", expected, sql("SELECT id FROM %s", tableName)); Assert.assertEquals("Should create only one scan", 1, scanEventCount); - Assert.assertEquals("Should not push down a filter", Expressions.alwaysTrue(), lastScanEvent.filter()); - Assert.assertEquals("Should project only the id column", + Assert.assertEquals( + "Should not push down a filter", Expressions.alwaysTrue(), lastScanEvent.filter()); + 
Assert.assertEquals( + "Should project only the id column", validationCatalog.loadTable(tableIdent).schema().select("id").asStruct(), lastScanEvent.projection().asStruct()); } @@ -103,13 +109,18 @@ public void testProjection() { public void testExpressionPushdown() { List expected = ImmutableList.of(row("b")); - assertEquals("Should return all expected rows", expected, sql("SELECT data FROM %s WHERE id = 2", tableName)); + assertEquals( + "Should return all expected rows", + expected, + sql("SELECT data FROM %s WHERE id = 2", tableName)); Assert.assertEquals("Should create only one scan", 1, scanEventCount); - Assert.assertEquals("Should push down expected filter", + Assert.assertEquals( + "Should push down expected filter", "(id IS NOT NULL AND id = 2)", Spark3Util.describe(lastScanEvent.filter())); - Assert.assertEquals("Should project only id and data columns", + Assert.assertEquals( + "Should project only id and data columns", validationCatalog.loadTable(tableIdent).schema().select("id", "data").asStruct(), lastScanEvent.projection().asStruct()); } @@ -120,7 +131,8 @@ public void testMetadataTables() { "Spark session catalog does not support metadata tables", "spark_catalog".equals(catalogName)); - assertEquals("Snapshot metadata table", + assertEquals( + "Snapshot metadata table", ImmutableList.of(row(ANY, ANY, null, "append", ANY, ANY)), sql("SELECT * FROM %s.snapshots", tableName)); } @@ -144,10 +156,12 @@ public void testSnapshotInTableName() { assertEquals("Snapshot at specific ID, prefix " + prefix, expected, actual); // read the table using DataFrameReader option - Dataset df = spark.read() - .format("iceberg") - .option(SparkReadOptions.SNAPSHOT_ID, snapshotId) - .load(tableName); + Dataset df = + spark + .read() + .format("iceberg") + .option(SparkReadOptions.SNAPSHOT_ID, snapshotId) + .load(tableName); List fromDF = rowsToJava(df.collectAsList()); assertEquals("Snapshot at specific ID " + snapshotId, expected, fromDF); } @@ -172,10 +186,12 @@ public void testTimestampInTableName() { assertEquals("Snapshot at timestamp, prefix " + prefix, expected, actual); // read the table using DataFrameReader option - Dataset df = spark.read() - .format("iceberg") - .option(SparkReadOptions.AS_OF_TIMESTAMP, timestamp) - .load(tableName); + Dataset df = + spark + .read() + .format("iceberg") + .option(SparkReadOptions.AS_OF_TIMESTAMP, timestamp) + .load(tableName); List fromDF = rowsToJava(df.collectAsList()); assertEquals("Snapshot at timestamp " + timestamp, expected, fromDF); } @@ -185,22 +201,25 @@ public void testSpecifySnapshotAndTimestamp() { // get the snapshot ID of the last write long snapshotId = validationCatalog.loadTable(tableIdent).currentSnapshot().snapshotId(); // get a timestamp just after the last write - long timestamp = validationCatalog.loadTable(tableIdent).currentSnapshot().timestampMillis() + 2; + long timestamp = + validationCatalog.loadTable(tableIdent).currentSnapshot().timestampMillis() + 2; // create a second snapshot sql("INSERT INTO %s VALUES (4, 'd', 4.0), (5, 'e', 5.0)", tableName); - AssertHelpers.assertThrows("Should not be able to specify both snapshot id and timestamp", + AssertHelpers.assertThrows( + "Should not be able to specify both snapshot id and timestamp", IllegalArgumentException.class, - String.format("Cannot specify both snapshot-id (%s) and as-of-timestamp (%s)", - snapshotId, timestamp), + String.format( + "Cannot specify both snapshot-id (%s) and as-of-timestamp (%s)", snapshotId, timestamp), () -> { - spark.read() - .format("iceberg") - 
.option(SparkReadOptions.SNAPSHOT_ID, snapshotId) - .option(SparkReadOptions.AS_OF_TIMESTAMP, timestamp) - .load(tableName) - .collectAsList(); + spark + .read() + .format("iceberg") + .option(SparkReadOptions.SNAPSHOT_ID, snapshotId) + .option(SparkReadOptions.AS_OF_TIMESTAMP, timestamp) + .load(tableName) + .collectAsList(); }); } } diff --git a/spark/v3.1/spark/src/test/java/org/apache/iceberg/spark/sql/TestTimestampWithoutZone.java b/spark/v3.1/spark/src/test/java/org/apache/iceberg/spark/sql/TestTimestampWithoutZone.java index ddaac5256e10..51b8d255a99b 100644 --- a/spark/v3.1/spark/src/test/java/org/apache/iceberg/spark/sql/TestTimestampWithoutZone.java +++ b/spark/v3.1/spark/src/test/java/org/apache/iceberg/spark/sql/TestTimestampWithoutZone.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.sql; import java.sql.Timestamp; @@ -50,32 +49,35 @@ public class TestTimestampWithoutZone extends SparkCatalogTestBase { private static final String newTableName = "created_table"; private final Map config; - private static final Schema schema = new Schema( + private static final Schema schema = + new Schema( Types.NestedField.required(1, "id", Types.LongType.get()), Types.NestedField.required(2, "ts", Types.TimestampType.withoutZone()), - Types.NestedField.required(3, "tsz", Types.TimestampType.withZone()) - ); + Types.NestedField.required(3, "tsz", Types.TimestampType.withZone())); - private final List values = ImmutableList.of( + private final List values = + ImmutableList.of( row(1L, toTimestamp("2021-01-01T00:00:00.0"), toTimestamp("2021-02-01T00:00:00.0")), row(2L, toTimestamp("2021-01-01T00:00:00.0"), toTimestamp("2021-02-01T00:00:00.0")), - row(3L, toTimestamp("2021-01-01T00:00:00.0"), toTimestamp("2021-02-01T00:00:00.0")) - ); + row(3L, toTimestamp("2021-01-01T00:00:00.0"), toTimestamp("2021-02-01T00:00:00.0"))); @Parameterized.Parameters(name = "catalogName = {0}, implementation = {1}, config = {2}") public static Object[][] parameters() { - return new Object[][]{{"spark_catalog", - SparkSessionCatalog.class.getName(), - ImmutableMap.of( - "type", "hive", - "default-namespace", "default", - "parquet-enabled", "true", - "cache-enabled", "false" - )} + return new Object[][] { + { + "spark_catalog", + SparkSessionCatalog.class.getName(), + ImmutableMap.of( + "type", "hive", + "default-namespace", "default", + "parquet-enabled", "true", + "cache-enabled", "false") + } }; } - public TestTimestampWithoutZone(String catalogName, String implementation, Map config) { + public TestTimestampWithoutZone( + String catalogName, String implementation, Map config) { super(catalogName, implementation, config); this.config = config; } @@ -94,8 +96,10 @@ public void removeTables() { @Test public void testWriteTimestampWithoutZoneError() { AssertHelpers.assertThrows( - String.format("Write operation performed on a timestamp without timezone field while " + - "'%s' set to false should throw exception", SparkSQLProperties.HANDLE_TIMESTAMP_WITHOUT_TIMEZONE), + String.format( + "Write operation performed on a timestamp without timezone field while " + + "'%s' set to false should throw exception", + SparkSQLProperties.HANDLE_TIMESTAMP_WITHOUT_TIMEZONE), IllegalArgumentException.class, SparkUtil.TIMESTAMP_WITHOUT_TIMEZONE_ERROR, () -> sql("INSERT INTO %s VALUES %s", tableName, rowToSqlValues(values))); @@ -103,72 +107,98 @@ public void testWriteTimestampWithoutZoneError() { @Test public void testAppendTimestampWithoutZone() 
{ - withSQLConf(ImmutableMap.of(SparkSQLProperties.HANDLE_TIMESTAMP_WITHOUT_TIMEZONE, "true"), () -> { - sql("INSERT INTO %s VALUES %s", tableName, rowToSqlValues(values)); - - Assert.assertEquals("Should have " + values.size() + " row", - (long) values.size(), scalarSql("SELECT count(*) FROM %s", tableName)); - - assertEquals("Row data should match expected", - values, sql("SELECT * FROM %s ORDER BY id", tableName)); - }); + withSQLConf( + ImmutableMap.of(SparkSQLProperties.HANDLE_TIMESTAMP_WITHOUT_TIMEZONE, "true"), + () -> { + sql("INSERT INTO %s VALUES %s", tableName, rowToSqlValues(values)); + + Assert.assertEquals( + "Should have " + values.size() + " row", + (long) values.size(), + scalarSql("SELECT count(*) FROM %s", tableName)); + + assertEquals( + "Row data should match expected", + values, + sql("SELECT * FROM %s ORDER BY id", tableName)); + }); } @Test public void testCreateAsSelectWithTimestampWithoutZone() { - withSQLConf(ImmutableMap.of(SparkSQLProperties.HANDLE_TIMESTAMP_WITHOUT_TIMEZONE, "true"), () -> { - sql("INSERT INTO %s VALUES %s", tableName, rowToSqlValues(values)); + withSQLConf( + ImmutableMap.of(SparkSQLProperties.HANDLE_TIMESTAMP_WITHOUT_TIMEZONE, "true"), + () -> { + sql("INSERT INTO %s VALUES %s", tableName, rowToSqlValues(values)); - sql("CREATE TABLE %s USING iceberg AS SELECT * FROM %s", newTableName, tableName); + sql("CREATE TABLE %s USING iceberg AS SELECT * FROM %s", newTableName, tableName); - Assert.assertEquals("Should have " + values.size() + " row", (long) values.size(), + Assert.assertEquals( + "Should have " + values.size() + " row", + (long) values.size(), scalarSql("SELECT count(*) FROM %s", newTableName)); - assertEquals("Row data should match expected", + assertEquals( + "Row data should match expected", sql("SELECT * FROM %s ORDER BY id", tableName), sql("SELECT * FROM %s ORDER BY id", newTableName)); - }); + }); } @Test public void testCreateNewTableShouldHaveTimestampWithZoneIcebergType() { - withSQLConf(ImmutableMap.of(SparkSQLProperties.HANDLE_TIMESTAMP_WITHOUT_TIMEZONE, "true"), () -> { - sql("INSERT INTO %s VALUES %s", tableName, rowToSqlValues(values)); + withSQLConf( + ImmutableMap.of(SparkSQLProperties.HANDLE_TIMESTAMP_WITHOUT_TIMEZONE, "true"), + () -> { + sql("INSERT INTO %s VALUES %s", tableName, rowToSqlValues(values)); - sql("CREATE TABLE %s USING iceberg AS SELECT * FROM %s", newTableName, tableName); + sql("CREATE TABLE %s USING iceberg AS SELECT * FROM %s", newTableName, tableName); - Assert.assertEquals("Should have " + values.size() + " row", (long) values.size(), + Assert.assertEquals( + "Should have " + values.size() + " row", + (long) values.size(), scalarSql("SELECT count(*) FROM %s", newTableName)); - assertEquals("Data from created table should match data from base table", + assertEquals( + "Data from created table should match data from base table", sql("SELECT * FROM %s ORDER BY id", tableName), sql("SELECT * FROM %s ORDER BY id", newTableName)); - Table createdTable = validationCatalog.loadTable(TableIdentifier.of("default", newTableName)); - assertFieldsType(createdTable.schema(), Types.TimestampType.withZone(), "ts", "tsz"); - }); + Table createdTable = + validationCatalog.loadTable(TableIdentifier.of("default", newTableName)); + assertFieldsType(createdTable.schema(), Types.TimestampType.withZone(), "ts", "tsz"); + }); } @Test public void testCreateNewTableShouldHaveTimestampWithoutZoneIcebergType() { - withSQLConf(ImmutableMap.of( + withSQLConf( + ImmutableMap.of( 
SparkSQLProperties.HANDLE_TIMESTAMP_WITHOUT_TIMEZONE, "true", - SparkSQLProperties.USE_TIMESTAMP_WITHOUT_TIME_ZONE_IN_NEW_TABLES, "true"), () -> { - spark.sessionState().catalogManager().currentCatalog() + SparkSQLProperties.USE_TIMESTAMP_WITHOUT_TIME_ZONE_IN_NEW_TABLES, "true"), + () -> { + spark + .sessionState() + .catalogManager() + .currentCatalog() .initialize(catalog.name(), new CaseInsensitiveStringMap(config)); - sql("INSERT INTO %s VALUES %s", tableName, rowToSqlValues(values)); + sql("INSERT INTO %s VALUES %s", tableName, rowToSqlValues(values)); - sql("CREATE TABLE %s USING iceberg AS SELECT * FROM %s", newTableName, tableName); + sql("CREATE TABLE %s USING iceberg AS SELECT * FROM %s", newTableName, tableName); - Assert.assertEquals("Should have " + values.size() + " row", (long) values.size(), + Assert.assertEquals( + "Should have " + values.size() + " row", + (long) values.size(), scalarSql("SELECT count(*) FROM %s", newTableName)); - assertEquals("Row data should match expected", + assertEquals( + "Row data should match expected", sql("SELECT * FROM %s ORDER BY id", tableName), sql("SELECT * FROM %s ORDER BY id", newTableName)); - Table createdTable = validationCatalog.loadTable(TableIdentifier.of("default", newTableName)); - assertFieldsType(createdTable.schema(), Types.TimestampType.withoutZone(), "ts", "tsz"); - }); + Table createdTable = + validationCatalog.loadTable(TableIdentifier.of("default", newTableName)); + assertFieldsType(createdTable.schema(), Types.TimestampType.withoutZone(), "ts", "tsz"); + }); } private Timestamp toTimestamp(String value) { @@ -176,21 +206,33 @@ private Timestamp toTimestamp(String value) { } private String rowToSqlValues(List rows) { - List rowValues = rows.stream().map(row -> { - List columns = Arrays.stream(row).map(value -> { - if (value instanceof Long) { - return value.toString(); - } else if (value instanceof Timestamp) { - return String.format("timestamp '%s'", value); - } - throw new RuntimeException("Type is not supported"); - }).collect(Collectors.toList()); - return "(" + Joiner.on(",").join(columns) + ")"; - }).collect(Collectors.toList()); + List rowValues = + rows.stream() + .map( + row -> { + List columns = + Arrays.stream(row) + .map( + value -> { + if (value instanceof Long) { + return value.toString(); + } else if (value instanceof Timestamp) { + return String.format("timestamp '%s'", value); + } + throw new RuntimeException("Type is not supported"); + }) + .collect(Collectors.toList()); + return "(" + Joiner.on(",").join(columns) + ")"; + }) + .collect(Collectors.toList()); return Joiner.on(",").join(rowValues); } private void assertFieldsType(Schema actual, Type.PrimitiveType expected, String... fields) { - actual.select(fields).asStruct().fields().forEach(field -> Assert.assertEquals(expected, field.type())); + actual + .select(fields) + .asStruct() + .fields() + .forEach(field -> Assert.assertEquals(expected, field.type())); } } diff --git a/spark/v3.1/spark/src/test/java/org/apache/iceberg/spark/sql/TestUnpartitionedWrites.java b/spark/v3.1/spark/src/test/java/org/apache/iceberg/spark/sql/TestUnpartitionedWrites.java index 7a6ea0996e22..8db2fefabdfb 100644 --- a/spark/v3.1/spark/src/test/java/org/apache/iceberg/spark/sql/TestUnpartitionedWrites.java +++ b/spark/v3.1/spark/src/test/java/org/apache/iceberg/spark/sql/TestUnpartitionedWrites.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
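The withSQLConf(...) calls throughout TestTimestampWithoutZone above come from the Iceberg Spark test base class; conceptually they set a Spark SQL property for the duration of an action and then restore the previous value. A rough hand-rolled equivalent, assuming only SparkSession's RuntimeConfig API and that SparkSQLProperties.HANDLE_TIMESTAMP_WITHOUT_TIMEZONE resolves as imported in these tests (not the helper's actual implementation):

    import org.apache.iceberg.spark.SparkSQLProperties;
    import org.apache.spark.sql.SparkSession;

    class SqlConfSketch {
      static void withHandleTimestampWithoutZone(SparkSession spark, Runnable action) {
        String key = SparkSQLProperties.HANDLE_TIMESTAMP_WITHOUT_TIMEZONE;
        String previous = spark.conf().get(key, null);
        spark.conf().set(key, "true");
        try {
          action.run();
        } finally {
          if (previous == null) {
            spark.conf().unset(key); // the property was not set before
          } else {
            spark.conf().set(key, previous);
          }
        }
      }
    }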
*/ - package org.apache.iceberg.spark.sql; import java.util.List; @@ -36,7 +35,8 @@ import org.junit.Test; public class TestUnpartitionedWrites extends SparkCatalogTestBase { - public TestUnpartitionedWrites(String catalogName, String implementation, Map config) { + public TestUnpartitionedWrites( + String catalogName, String implementation, Map config) { super(catalogName, implementation, config); } @@ -57,17 +57,14 @@ public void testInsertAppend() { sql("INSERT INTO %s VALUES (4, 'd'), (5, 'e')", tableName); - Assert.assertEquals("Should have 5 rows after insert", 5L, scalarSql("SELECT count(*) FROM %s", tableName)); + Assert.assertEquals( + "Should have 5 rows after insert", 5L, scalarSql("SELECT count(*) FROM %s", tableName)); - List expected = ImmutableList.of( - row(1L, "a"), - row(2L, "b"), - row(3L, "c"), - row(4L, "d"), - row(5L, "e") - ); + List expected = + ImmutableList.of(row(1L, "a"), row(2L, "b"), row(3L, "c"), row(4L, "d"), row(5L, "e")); - assertEquals("Row data should match expected", expected, sql("SELECT * FROM %s ORDER BY id", tableName)); + assertEquals( + "Row data should match expected", expected, sql("SELECT * FROM %s ORDER BY id", tableName)); } @Test @@ -76,14 +73,13 @@ public void testInsertOverwrite() { sql("INSERT OVERWRITE %s VALUES (4, 'd'), (5, 'e')", tableName); - Assert.assertEquals("Should have 2 rows after overwrite", 2L, scalarSql("SELECT count(*) FROM %s", tableName)); + Assert.assertEquals( + "Should have 2 rows after overwrite", 2L, scalarSql("SELECT count(*) FROM %s", tableName)); - List expected = ImmutableList.of( - row(4L, "d"), - row(5L, "e") - ); + List expected = ImmutableList.of(row(4L, "d"), row(5L, "e")); - assertEquals("Row data should match expected", expected, sql("SELECT * FROM %s ORDER BY id", tableName)); + assertEquals( + "Row data should match expected", expected, sql("SELECT * FROM %s ORDER BY id", tableName)); } @Test @@ -94,8 +90,10 @@ public void testInsertAppendAtSnapshot() { long snapshotId = validationCatalog.loadTable(tableIdent).currentSnapshot().snapshotId(); String prefix = "snapshot_id_"; - AssertHelpers.assertThrows("Should not be able to insert into a table at a specific snapshot", - IllegalArgumentException.class, "Cannot write to table at a specific snapshot", + AssertHelpers.assertThrows( + "Should not be able to insert into a table at a specific snapshot", + IllegalArgumentException.class, + "Cannot write to table at a specific snapshot", () -> sql("INSERT INTO %s.%s VALUES (4, 'd'), (5, 'e')", tableName, prefix + snapshotId)); } @@ -107,77 +105,68 @@ public void testInsertOverwriteAtSnapshot() { long snapshotId = validationCatalog.loadTable(tableIdent).currentSnapshot().snapshotId(); String prefix = "snapshot_id_"; - AssertHelpers.assertThrows("Should not be able to insert into a table at a specific snapshot", - IllegalArgumentException.class, "Cannot write to table at a specific snapshot", - () -> sql("INSERT OVERWRITE %s.%s VALUES (4, 'd'), (5, 'e')", tableName, prefix + snapshotId)); + AssertHelpers.assertThrows( + "Should not be able to insert into a table at a specific snapshot", + IllegalArgumentException.class, + "Cannot write to table at a specific snapshot", + () -> + sql( + "INSERT OVERWRITE %s.%s VALUES (4, 'd'), (5, 'e')", + tableName, prefix + snapshotId)); } @Test public void testDataFrameV2Append() throws NoSuchTableException { Assert.assertEquals("Should have 3 rows", 3L, scalarSql("SELECT count(*) FROM %s", tableName)); - List data = ImmutableList.of( - new SimpleRecord(4, "d"), - new 
SimpleRecord(5, "e") - ); + List data = ImmutableList.of(new SimpleRecord(4, "d"), new SimpleRecord(5, "e")); Dataset ds = spark.createDataFrame(data, SimpleRecord.class); ds.writeTo(tableName).append(); - Assert.assertEquals("Should have 5 rows after insert", 5L, scalarSql("SELECT count(*) FROM %s", tableName)); + Assert.assertEquals( + "Should have 5 rows after insert", 5L, scalarSql("SELECT count(*) FROM %s", tableName)); - List expected = ImmutableList.of( - row(1L, "a"), - row(2L, "b"), - row(3L, "c"), - row(4L, "d"), - row(5L, "e") - ); + List expected = + ImmutableList.of(row(1L, "a"), row(2L, "b"), row(3L, "c"), row(4L, "d"), row(5L, "e")); - assertEquals("Row data should match expected", expected, sql("SELECT * FROM %s ORDER BY id", tableName)); + assertEquals( + "Row data should match expected", expected, sql("SELECT * FROM %s ORDER BY id", tableName)); } @Test public void testDataFrameV2DynamicOverwrite() throws NoSuchTableException { Assert.assertEquals("Should have 3 rows", 3L, scalarSql("SELECT count(*) FROM %s", tableName)); - List data = ImmutableList.of( - new SimpleRecord(4, "d"), - new SimpleRecord(5, "e") - ); + List data = ImmutableList.of(new SimpleRecord(4, "d"), new SimpleRecord(5, "e")); Dataset ds = spark.createDataFrame(data, SimpleRecord.class); ds.writeTo(tableName).overwritePartitions(); - Assert.assertEquals("Should have 2 rows after overwrite", 2L, scalarSql("SELECT count(*) FROM %s", tableName)); + Assert.assertEquals( + "Should have 2 rows after overwrite", 2L, scalarSql("SELECT count(*) FROM %s", tableName)); - List expected = ImmutableList.of( - row(4L, "d"), - row(5L, "e") - ); + List expected = ImmutableList.of(row(4L, "d"), row(5L, "e")); - assertEquals("Row data should match expected", expected, sql("SELECT * FROM %s ORDER BY id", tableName)); + assertEquals( + "Row data should match expected", expected, sql("SELECT * FROM %s ORDER BY id", tableName)); } @Test public void testDataFrameV2Overwrite() throws NoSuchTableException { Assert.assertEquals("Should have 3 rows", 3L, scalarSql("SELECT count(*) FROM %s", tableName)); - List data = ImmutableList.of( - new SimpleRecord(4, "d"), - new SimpleRecord(5, "e") - ); + List data = ImmutableList.of(new SimpleRecord(4, "d"), new SimpleRecord(5, "e")); Dataset ds = spark.createDataFrame(data, SimpleRecord.class); ds.writeTo(tableName).overwrite(functions.col("id").$less$eq(3)); - Assert.assertEquals("Should have 2 rows after overwrite", 2L, scalarSql("SELECT count(*) FROM %s", tableName)); + Assert.assertEquals( + "Should have 2 rows after overwrite", 2L, scalarSql("SELECT count(*) FROM %s", tableName)); - List expected = ImmutableList.of( - row(4L, "d"), - row(5L, "e") - ); + List expected = ImmutableList.of(row(4L, "d"), row(5L, "e")); - assertEquals("Row data should match expected", expected, sql("SELECT * FROM %s ORDER BY id", tableName)); + assertEquals( + "Row data should match expected", expected, sql("SELECT * FROM %s ORDER BY id", tableName)); } } diff --git a/spark/v3.2/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/Employee.java b/spark/v3.2/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/Employee.java index 51ac57855bbe..8918dfec6584 100644 --- a/spark/v3.2/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/Employee.java +++ b/spark/v3.2/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/Employee.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.spark.extensions; import java.util.Objects; @@ -25,8 +24,7 @@ public class Employee { private Integer id; private String dep; - public Employee() { - } + public Employee() {} public Employee(Integer id, String dep) { this.id = id; diff --git a/spark/v3.2/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/SparkExtensionsTestBase.java b/spark/v3.2/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/SparkExtensionsTestBase.java index 32b45b8200b0..fb9cc09567fa 100644 --- a/spark/v3.2/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/SparkExtensionsTestBase.java +++ b/spark/v3.2/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/SparkExtensionsTestBase.java @@ -16,9 +16,10 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.extensions; +import static org.apache.hadoop.hive.conf.HiveConf.ConfVars.METASTOREURIS; + import java.util.Map; import java.util.Random; import java.util.concurrent.ThreadLocalRandom; @@ -32,13 +33,12 @@ import org.apache.spark.sql.internal.SQLConf; import org.junit.BeforeClass; -import static org.apache.hadoop.hive.conf.HiveConf.ConfVars.METASTOREURIS; - public abstract class SparkExtensionsTestBase extends SparkCatalogTestBase { private static final Random RANDOM = ThreadLocalRandom.current(); - public SparkExtensionsTestBase(String catalogName, String implementation, Map config) { + public SparkExtensionsTestBase( + String catalogName, String implementation, Map config) { super(catalogName, implementation, config); } @@ -48,19 +48,23 @@ public static void startMetastoreAndSpark() { metastore.start(); SparkTestBase.hiveConf = metastore.hiveConf(); - SparkTestBase.spark = SparkSession.builder() - .master("local[2]") - .config("spark.testing", "true") - .config(SQLConf.PARTITION_OVERWRITE_MODE().key(), "dynamic") - .config("spark.sql.extensions", IcebergSparkSessionExtensions.class.getName()) - .config("spark.hadoop." + METASTOREURIS.varname, hiveConf.get(METASTOREURIS.varname)) - .config("spark.sql.shuffle.partitions", "4") - .config("spark.sql.hive.metastorePartitionPruningFallbackOnException", "true") - .config(SQLConf.ADAPTIVE_EXECUTION_ENABLED().key(), String.valueOf(RANDOM.nextBoolean())) - .enableHiveSupport() - .getOrCreate(); + SparkTestBase.spark = + SparkSession.builder() + .master("local[2]") + .config("spark.testing", "true") + .config(SQLConf.PARTITION_OVERWRITE_MODE().key(), "dynamic") + .config("spark.sql.extensions", IcebergSparkSessionExtensions.class.getName()) + .config("spark.hadoop." 
+ METASTOREURIS.varname, hiveConf.get(METASTOREURIS.varname)) + .config("spark.sql.shuffle.partitions", "4") + .config("spark.sql.hive.metastorePartitionPruningFallbackOnException", "true") + .config( + SQLConf.ADAPTIVE_EXECUTION_ENABLED().key(), String.valueOf(RANDOM.nextBoolean())) + .enableHiveSupport() + .getOrCreate(); - SparkTestBase.catalog = (HiveCatalog) - CatalogUtil.loadCatalog(HiveCatalog.class.getName(), "hive", ImmutableMap.of(), hiveConf); + SparkTestBase.catalog = + (HiveCatalog) + CatalogUtil.loadCatalog( + HiveCatalog.class.getName(), "hive", ImmutableMap.of(), hiveConf); } } diff --git a/spark/v3.2/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/SparkRowLevelOperationsTestBase.java b/spark/v3.2/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/SparkRowLevelOperationsTestBase.java index 278559a5601d..eafd968d01a7 100644 --- a/spark/v3.2/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/SparkRowLevelOperationsTestBase.java +++ b/spark/v3.2/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/SparkRowLevelOperationsTestBase.java @@ -16,9 +16,21 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.extensions; +import static org.apache.iceberg.DataOperations.DELETE; +import static org.apache.iceberg.DataOperations.OVERWRITE; +import static org.apache.iceberg.SnapshotSummary.ADDED_DELETE_FILES_PROP; +import static org.apache.iceberg.SnapshotSummary.ADDED_FILES_PROP; +import static org.apache.iceberg.SnapshotSummary.CHANGED_PARTITION_COUNT_PROP; +import static org.apache.iceberg.SnapshotSummary.DELETED_FILES_PROP; +import static org.apache.iceberg.TableProperties.DEFAULT_FILE_FORMAT; +import static org.apache.iceberg.TableProperties.PARQUET_VECTORIZATION_ENABLED; +import static org.apache.iceberg.TableProperties.WRITE_DISTRIBUTION_MODE; +import static org.apache.iceberg.TableProperties.WRITE_DISTRIBUTION_MODE_HASH; +import static org.apache.iceberg.TableProperties.WRITE_DISTRIBUTION_MODE_NONE; +import static org.apache.iceberg.TableProperties.WRITE_DISTRIBUTION_MODE_RANGE; + import java.util.Arrays; import java.util.List; import java.util.Map; @@ -40,19 +52,6 @@ import org.junit.runners.Parameterized; import org.junit.runners.Parameterized.Parameters; -import static org.apache.iceberg.DataOperations.DELETE; -import static org.apache.iceberg.DataOperations.OVERWRITE; -import static org.apache.iceberg.SnapshotSummary.ADDED_DELETE_FILES_PROP; -import static org.apache.iceberg.SnapshotSummary.ADDED_FILES_PROP; -import static org.apache.iceberg.SnapshotSummary.CHANGED_PARTITION_COUNT_PROP; -import static org.apache.iceberg.SnapshotSummary.DELETED_FILES_PROP; -import static org.apache.iceberg.TableProperties.DEFAULT_FILE_FORMAT; -import static org.apache.iceberg.TableProperties.PARQUET_VECTORIZATION_ENABLED; -import static org.apache.iceberg.TableProperties.WRITE_DISTRIBUTION_MODE; -import static org.apache.iceberg.TableProperties.WRITE_DISTRIBUTION_MODE_HASH; -import static org.apache.iceberg.TableProperties.WRITE_DISTRIBUTION_MODE_NONE; -import static org.apache.iceberg.TableProperties.WRITE_DISTRIBUTION_MODE_RANGE; - @RunWith(Parameterized.class) public abstract class SparkRowLevelOperationsTestBase extends SparkExtensionsTestBase { @@ -62,58 +61,68 @@ public abstract class SparkRowLevelOperationsTestBase extends SparkExtensionsTes protected final boolean vectorized; protected final String distributionMode; - public 
SparkRowLevelOperationsTestBase(String catalogName, String implementation, - Map config, String fileFormat, - boolean vectorized, - String distributionMode) { + public SparkRowLevelOperationsTestBase( + String catalogName, + String implementation, + Map config, + String fileFormat, + boolean vectorized, + String distributionMode) { super(catalogName, implementation, config); this.fileFormat = fileFormat; this.vectorized = vectorized; this.distributionMode = distributionMode; } - @Parameters(name = "catalogName = {0}, implementation = {1}, config = {2}," + - " format = {3}, vectorized = {4}, distributionMode = {5}") + @Parameters( + name = + "catalogName = {0}, implementation = {1}, config = {2}," + + " format = {3}, vectorized = {4}, distributionMode = {5}") public static Object[][] parameters() { return new Object[][] { - { "testhive", SparkCatalog.class.getName(), - ImmutableMap.of( - "type", "hive", - "default-namespace", "default" - ), - "orc", - true, - WRITE_DISTRIBUTION_MODE_NONE - }, - { "testhive", SparkCatalog.class.getName(), - ImmutableMap.of( - "type", "hive", - "default-namespace", "default" + { + "testhive", + SparkCatalog.class.getName(), + ImmutableMap.of( + "type", "hive", + "default-namespace", "default"), + "orc", + true, + WRITE_DISTRIBUTION_MODE_NONE + }, + { + "testhive", + SparkCatalog.class.getName(), + ImmutableMap.of( + "type", "hive", + "default-namespace", "default"), + "parquet", + true, + WRITE_DISTRIBUTION_MODE_NONE + }, + { + "testhadoop", + SparkCatalog.class.getName(), + ImmutableMap.of("type", "hadoop"), + "parquet", + RANDOM.nextBoolean(), + WRITE_DISTRIBUTION_MODE_HASH + }, + { + "spark_catalog", + SparkSessionCatalog.class.getName(), + ImmutableMap.of( + "type", "hive", + "default-namespace", "default", + "clients", "1", + "parquet-enabled", "false", + "cache-enabled", + "false" // Spark will delete tables using v1, leaving the cache out of sync ), - "parquet", - true, - WRITE_DISTRIBUTION_MODE_NONE - }, - { "testhadoop", SparkCatalog.class.getName(), - ImmutableMap.of( - "type", "hadoop" - ), - "parquet", - RANDOM.nextBoolean(), - WRITE_DISTRIBUTION_MODE_HASH - }, - { "spark_catalog", SparkSessionCatalog.class.getName(), - ImmutableMap.of( - "type", "hive", - "default-namespace", "default", - "clients", "1", - "parquet-enabled", "false", - "cache-enabled", "false" // Spark will delete tables using v1, leaving the cache out of sync - ), - "avro", - false, - WRITE_DISTRIBUTION_MODE_RANGE - } + "avro", + false, + WRITE_DISTRIBUTION_MODE_RANGE + } }; } @@ -121,11 +130,15 @@ public static Object[][] parameters() { protected void initTable() { sql("ALTER TABLE %s SET TBLPROPERTIES('%s' '%s')", tableName, DEFAULT_FILE_FORMAT, fileFormat); - sql("ALTER TABLE %s SET TBLPROPERTIES('%s' '%s')", tableName, WRITE_DISTRIBUTION_MODE, distributionMode); + sql( + "ALTER TABLE %s SET TBLPROPERTIES('%s' '%s')", + tableName, WRITE_DISTRIBUTION_MODE, distributionMode); switch (fileFormat) { case "parquet": - sql("ALTER TABLE %s SET TBLPROPERTIES('%s' '%b')", tableName, PARQUET_VECTORIZATION_ENABLED, vectorized); + sql( + "ALTER TABLE %s SET TBLPROPERTIES('%s' '%b')", + tableName, PARQUET_VECTORIZATION_ENABLED, vectorized); break; case "orc": Assert.assertTrue(vectorized); @@ -136,9 +149,10 @@ protected void initTable() { } Map props = extraTableProperties(); - props.forEach((prop, value) -> { - sql("ALTER TABLE %s SET TBLPROPERTIES('%s' '%s')", tableName, prop, value); - }); + props.forEach( + (prop, value) -> { + sql("ALTER TABLE %s SET TBLPROPERTIES('%s' '%s')", 
tableName, prop, value); + }); } protected void createAndInitTable(String schema) { @@ -186,9 +200,10 @@ protected void createOrReplaceView(String name, List data, Encoder enc } private Dataset toDS(String schema, String jsonData) { - List jsonRows = Arrays.stream(jsonData.split("\n")) - .filter(str -> str.trim().length() > 0) - .collect(Collectors.toList()); + List jsonRows = + Arrays.stream(jsonData.split("\n")) + .filter(str -> str.trim().length() > 0) + .collect(Collectors.toList()); Dataset jsonDS = spark.createDataset(jsonRows, Encoders.STRING()); if (schema != null) { @@ -198,22 +213,36 @@ private Dataset toDS(String schema, String jsonData) { } } - protected void validateDelete(Snapshot snapshot, String changedPartitionCount, String deletedDataFiles) { + protected void validateDelete( + Snapshot snapshot, String changedPartitionCount, String deletedDataFiles) { validateSnapshot(snapshot, DELETE, changedPartitionCount, deletedDataFiles, null, null); } - protected void validateCopyOnWrite(Snapshot snapshot, String changedPartitionCount, - String deletedDataFiles, String addedDataFiles) { - validateSnapshot(snapshot, OVERWRITE, changedPartitionCount, deletedDataFiles, null, addedDataFiles); + protected void validateCopyOnWrite( + Snapshot snapshot, + String changedPartitionCount, + String deletedDataFiles, + String addedDataFiles) { + validateSnapshot( + snapshot, OVERWRITE, changedPartitionCount, deletedDataFiles, null, addedDataFiles); } - protected void validateMergeOnRead(Snapshot snapshot, String changedPartitionCount, - String addedDeleteFiles, String addedDataFiles) { - validateSnapshot(snapshot, OVERWRITE, changedPartitionCount, null, addedDeleteFiles, addedDataFiles); + protected void validateMergeOnRead( + Snapshot snapshot, + String changedPartitionCount, + String addedDeleteFiles, + String addedDataFiles) { + validateSnapshot( + snapshot, OVERWRITE, changedPartitionCount, null, addedDeleteFiles, addedDataFiles); } - protected void validateSnapshot(Snapshot snapshot, String operation, String changedPartitionCount, - String deletedDataFiles, String addedDeleteFiles, String addedDataFiles) { + protected void validateSnapshot( + Snapshot snapshot, + String operation, + String changedPartitionCount, + String deletedDataFiles, + String addedDeleteFiles, + String addedDataFiles) { Assert.assertEquals("Operation must match", operation, snapshot.operation()); validateProperty(snapshot, CHANGED_PARTITION_COUNT_PROP, changedPartitionCount); validateProperty(snapshot, DELETED_FILES_PROP, deletedDataFiles); @@ -223,14 +252,20 @@ protected void validateSnapshot(Snapshot snapshot, String operation, String chan protected void validateProperty(Snapshot snapshot, String property, Set expectedValues) { String actual = snapshot.summary().get(property); - Assert.assertTrue("Snapshot property " + property + " has unexpected value, actual = " + - actual + ", expected one of : " + String.join(",", expectedValues), + Assert.assertTrue( + "Snapshot property " + + property + + " has unexpected value, actual = " + + actual + + ", expected one of : " + + String.join(",", expectedValues), expectedValues.contains(actual)); } protected void validateProperty(Snapshot snapshot, String property, String expectedValue) { String actual = snapshot.summary().get(property); - Assert.assertEquals("Snapshot property " + property + " has unexpected value.", expectedValue, actual); + Assert.assertEquals( + "Snapshot property " + property + " has unexpected value.", expectedValue, actual); } protected void sleep(long 
millis) { diff --git a/spark/v3.2/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestAddFilesProcedure.java b/spark/v3.2/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestAddFilesProcedure.java index e274ad857875..f689401653f7 100644 --- a/spark/v3.2/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestAddFilesProcedure.java +++ b/spark/v3.2/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestAddFilesProcedure.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.extensions; import java.io.File; @@ -57,12 +56,12 @@ public class TestAddFilesProcedure extends SparkExtensionsTestBase { private final String sourceTableName = "source_table"; private File fileTableDir; - public TestAddFilesProcedure(String catalogName, String implementation, Map config) { + public TestAddFilesProcedure( + String catalogName, String implementation, Map config) { super(catalogName, implementation, config); } - @Rule - public TemporaryFolder temp = new TemporaryFolder(); + @Rule public TemporaryFolder temp = new TemporaryFolder(); @Before public void setupTempDirs() { @@ -88,12 +87,15 @@ public void addDataUnpartitioned() { sql(createIceberg, tableName); - Object result = scalarSql("CALL %s.system.add_files('%s', '`parquet`.`%s`')", - catalogName, tableName, fileTableDir.getAbsolutePath()); + Object result = + scalarSql( + "CALL %s.system.add_files('%s', '`parquet`.`%s`')", + catalogName, tableName, fileTableDir.getAbsolutePath()); Assert.assertEquals(2L, result); - assertEquals("Iceberg table contains correct data", + assertEquals( + "Iceberg table contains correct data", sql("SELECT * FROM %s ORDER BY id", sourceTableName), sql("SELECT * FROM %s ORDER BY id", tableName)); } @@ -107,12 +109,15 @@ public void addDataUnpartitionedOrc() { sql(createIceberg, tableName); - Object result = scalarSql("CALL %s.system.add_files('%s', '`orc`.`%s`')", - catalogName, tableName, fileTableDir.getAbsolutePath()); + Object result = + scalarSql( + "CALL %s.system.add_files('%s', '`orc`.`%s`')", + catalogName, tableName, fileTableDir.getAbsolutePath()); Assert.assertEquals(2L, result); - assertEquals("Iceberg table contains correct data", + assertEquals( + "Iceberg table contains correct data", sql("SELECT * FROM %s ORDER BY id", sourceTableName), sql("SELECT * FROM %s ORDER BY id", tableName)); } @@ -125,10 +130,12 @@ public void addAvroFile() throws Exception { // Create an Avro file - Schema schema = SchemaBuilder.record("record").fields() - .requiredInt("id") - .requiredString("data") - .endRecord(); + Schema schema = + SchemaBuilder.record("record") + .fields() + .requiredInt("id") + .requiredString("data") + .endRecord(); GenericRecord record1 = new GenericData.Record(schema); record1.put("id", 1L); record1.put("data", "a"); @@ -144,30 +151,30 @@ public void addAvroFile() throws Exception { dataFileWriter.append(record2); dataFileWriter.close(); - String createIceberg = - "CREATE TABLE %s (id Long, data String) USING iceberg"; + String createIceberg = "CREATE TABLE %s (id Long, data String) USING iceberg"; sql(createIceberg, tableName); - Object result = scalarSql("CALL %s.system.add_files('%s', '`avro`.`%s`')", - catalogName, tableName, outputFile.getPath()); + Object result = + scalarSql( + "CALL %s.system.add_files('%s', '`avro`.`%s`')", + catalogName, tableName, outputFile.getPath()); Assert.assertEquals(1L, result); - List expected = Lists.newArrayList( - new 
Object[]{1L, "a"}, - new Object[]{2L, "b"} - ); + List expected = Lists.newArrayList(new Object[] {1L, "a"}, new Object[] {2L, "b"}); - assertEquals("Iceberg table contains correct data", + assertEquals( + "Iceberg table contains correct data", expected, sql("SELECT * FROM %s ORDER BY id", tableName)); - List actualRecordCount = sql("select %s from %s.files", - DataFile.RECORD_COUNT.name(), - tableName); + List actualRecordCount = + sql("select %s from %s.files", DataFile.RECORD_COUNT.name(), tableName); List expectedRecordCount = Lists.newArrayList(); - expectedRecordCount.add(new Object[]{2L}); - assertEquals("Iceberg file metadata should have correct metadata count", - expectedRecordCount, actualRecordCount); + expectedRecordCount.add(new Object[] {2L}); + assertEquals( + "Iceberg file metadata should have correct metadata count", + expectedRecordCount, + actualRecordCount); } // TODO Adding spark-avro doesn't work in tests @@ -180,12 +187,15 @@ public void addDataUnpartitionedAvro() { sql(createIceberg, tableName); - Object result = scalarSql("CALL %s.system.add_files('%s', '`avro`.`%s`')", - catalogName, tableName, fileTableDir.getAbsolutePath()); + Object result = + scalarSql( + "CALL %s.system.add_files('%s', '`avro`.`%s`')", + catalogName, tableName, fileTableDir.getAbsolutePath()); Assert.assertEquals(2L, result); - assertEquals("Iceberg table contains correct data", + assertEquals( + "Iceberg table contains correct data", sql("SELECT * FROM %s ORDER BY id", sourceTableName), sql("SELECT * FROM %s ORDER BY id", tableName)); } @@ -199,12 +209,13 @@ public void addDataUnpartitionedHive() { sql(createIceberg, tableName); - Object result = scalarSql("CALL %s.system.add_files('%s', '%s')", - catalogName, tableName, sourceTableName); + Object result = + scalarSql("CALL %s.system.add_files('%s', '%s')", catalogName, tableName, sourceTableName); Assert.assertEquals(2L, result); - assertEquals("Iceberg table contains correct data", + assertEquals( + "Iceberg table contains correct data", sql("SELECT * FROM %s ORDER BY id", sourceTableName), sql("SELECT * FROM %s ORDER BY id", tableName)); } @@ -218,12 +229,15 @@ public void addDataUnpartitionedExtraCol() { sql(createIceberg, tableName); - Object result = scalarSql("CALL %s.system.add_files('%s', '`parquet`.`%s`')", - catalogName, tableName, fileTableDir.getAbsolutePath()); + Object result = + scalarSql( + "CALL %s.system.add_files('%s', '`parquet`.`%s`')", + catalogName, tableName, fileTableDir.getAbsolutePath()); Assert.assertEquals(2L, result); - assertEquals("Iceberg table contains correct data", + assertEquals( + "Iceberg table contains correct data", sql("SELECT * FROM %s ORDER BY id", sourceTableName), sql("SELECT id, name, dept, subdept FROM %s ORDER BY id", tableName)); } @@ -232,17 +246,19 @@ public void addDataUnpartitionedExtraCol() { public void addDataUnpartitionedMissingCol() { createUnpartitionedFileTable("parquet"); - String createIceberg = - "CREATE TABLE %s (id Integer, name String, dept String) USING iceberg"; + String createIceberg = "CREATE TABLE %s (id Integer, name String, dept String) USING iceberg"; sql(createIceberg, tableName); - Object result = scalarSql("CALL %s.system.add_files('%s', '`parquet`.`%s`')", - catalogName, tableName, fileTableDir.getAbsolutePath()); + Object result = + scalarSql( + "CALL %s.system.add_files('%s', '`parquet`.`%s`')", + catalogName, tableName, fileTableDir.getAbsolutePath()); Assert.assertEquals(2L, result); - assertEquals("Iceberg table contains correct data", + assertEquals( + 
"Iceberg table contains correct data", sql("SELECT id, name, dept FROM %s ORDER BY id", sourceTableName), sql("SELECT * FROM %s ORDER BY id", tableName)); } @@ -256,12 +272,15 @@ public void addDataPartitionedMissingCol() { sql(createIceberg, tableName); - Object result = scalarSql("CALL %s.system.add_files('%s', '`parquet`.`%s`')", - catalogName, tableName, fileTableDir.getAbsolutePath()); + Object result = + scalarSql( + "CALL %s.system.add_files('%s', '`parquet`.`%s`')", + catalogName, tableName, fileTableDir.getAbsolutePath()); Assert.assertEquals(8L, result); - assertEquals("Iceberg table contains correct data", + assertEquals( + "Iceberg table contains correct data", sql("SELECT id, name, dept FROM %s ORDER BY id", sourceTableName), sql("SELECT * FROM %s ORDER BY id", tableName)); } @@ -275,17 +294,20 @@ public void addDataPartitioned() { sql(createIceberg, tableName); - Object result = scalarSql("CALL %s.system.add_files('%s', '`parquet`.`%s`')", - catalogName, tableName, fileTableDir.getAbsolutePath()); + Object result = + scalarSql( + "CALL %s.system.add_files('%s', '`parquet`.`%s`')", + catalogName, tableName, fileTableDir.getAbsolutePath()); Assert.assertEquals(8L, result); - assertEquals("Iceberg table contains correct data", + assertEquals( + "Iceberg table contains correct data", sql("SELECT id, name, dept, subdept FROM %s ORDER BY id", sourceTableName), sql("SELECT id, name, dept, subdept FROM %s ORDER BY id", tableName)); } - @Ignore // TODO Classpath issues prevent us from actually writing to a Spark ORC table + @Ignore // TODO Classpath issues prevent us from actually writing to a Spark ORC table public void addDataPartitionedOrc() { createPartitionedFileTable("orc"); @@ -294,12 +316,15 @@ public void addDataPartitionedOrc() { sql(createIceberg, tableName); - Object result = scalarSql("CALL %s.system.add_files('%s', '`parquet`.`%s`')", - catalogName, tableName, fileTableDir.getAbsolutePath()); + Object result = + scalarSql( + "CALL %s.system.add_files('%s', '`parquet`.`%s`')", + catalogName, tableName, fileTableDir.getAbsolutePath()); Assert.assertEquals(8L, result); - assertEquals("Iceberg table contains correct data", + assertEquals( + "Iceberg table contains correct data", sql("SELECT id, name, dept, subdept FROM %s ORDER BY id", sourceTableName), sql("SELECT id, name, dept, subdept FROM %s ORDER BY id", tableName)); } @@ -314,12 +339,15 @@ public void addDataPartitionedAvro() { sql(createIceberg, tableName); - Object result = scalarSql("CALL %s.system.add_files('%s', '`avro`.`%s`')", - catalogName, tableName, fileTableDir.getAbsolutePath()); + Object result = + scalarSql( + "CALL %s.system.add_files('%s', '`avro`.`%s`')", + catalogName, tableName, fileTableDir.getAbsolutePath()); Assert.assertEquals(8L, result); - assertEquals("Iceberg table contains correct data", + assertEquals( + "Iceberg table contains correct data", sql("SELECT id, name, dept, subdept FROM %s ORDER BY id", sourceTableName), sql("SELECT id, name, dept, subdept FROM %s ORDER BY id", tableName)); } @@ -333,12 +361,13 @@ public void addDataPartitionedHive() { sql(createIceberg, tableName); - Object result = scalarSql("CALL %s.system.add_files('%s', '%s')", - catalogName, tableName, sourceTableName); + Object result = + scalarSql("CALL %s.system.add_files('%s', '%s')", catalogName, tableName, sourceTableName); Assert.assertEquals(8L, result); - assertEquals("Iceberg table contains correct data", + assertEquals( + "Iceberg table contains correct data", sql("SELECT id, name, dept, subdept FROM %s ORDER 
BY id", sourceTableName), sql("SELECT id, name, dept, subdept FROM %s ORDER BY id", tableName)); } @@ -352,12 +381,15 @@ public void addPartitionToPartitioned() { sql(createIceberg, tableName); - Object result = scalarSql("CALL %s.system.add_files('%s', '`parquet`.`%s`', map('id', 1))", - catalogName, tableName, fileTableDir.getAbsolutePath()); + Object result = + scalarSql( + "CALL %s.system.add_files('%s', '`parquet`.`%s`', map('id', 1))", + catalogName, tableName, fileTableDir.getAbsolutePath()); Assert.assertEquals(2L, result); - assertEquals("Iceberg table contains correct data", + assertEquals( + "Iceberg table contains correct data", sql("SELECT id, name, dept, subdept FROM %s WHERE id = 1 ORDER BY id", sourceTableName), sql("SELECT id, name, dept, subdept FROM %s ORDER BY id", tableName)); } @@ -371,13 +403,18 @@ public void addDataPartitionedByDateToPartitioned() { sql(createIceberg, tableName); - Object result = scalarSql("CALL %s.system.add_files('%s', '`parquet`.`%s`', map('date', '2021-01-01'))", - catalogName, tableName, fileTableDir.getAbsolutePath()); + Object result = + scalarSql( + "CALL %s.system.add_files('%s', '`parquet`.`%s`', map('date', '2021-01-01'))", + catalogName, tableName, fileTableDir.getAbsolutePath()); Assert.assertEquals(2L, result); - assertEquals("Iceberg table contains correct data", - sql("SELECT id, name, dept, date FROM %s WHERE date = '2021-01-01' ORDER BY id", sourceTableName), + assertEquals( + "Iceberg table contains correct data", + sql( + "SELECT id, name, dept, date FROM %s WHERE date = '2021-01-01' ORDER BY id", + sourceTableName), sql("SELECT id, name, dept, date FROM %s ORDER BY id", tableName)); } @@ -386,17 +423,20 @@ public void addFilteredPartitionsToPartitioned() { createCompositePartitionedTable("parquet"); String createIceberg = - "CREATE TABLE %s (id Integer, name String, dept String, subdept String) USING iceberg " + - "PARTITIONED BY (id, dept)"; + "CREATE TABLE %s (id Integer, name String, dept String, subdept String) USING iceberg " + + "PARTITIONED BY (id, dept)"; sql(createIceberg, tableName); - Object result = scalarSql("CALL %s.system.add_files('%s', '`parquet`.`%s`', map('id', 1))", - catalogName, tableName, fileTableDir.getAbsolutePath()); + Object result = + scalarSql( + "CALL %s.system.add_files('%s', '`parquet`.`%s`', map('id', 1))", + catalogName, tableName, fileTableDir.getAbsolutePath()); Assert.assertEquals(2L, result); - assertEquals("Iceberg table contains correct data", + assertEquals( + "Iceberg table contains correct data", sql("SELECT id, name, dept, subdept FROM %s WHERE id = 1 ORDER BY id", sourceTableName), sql("SELECT id, name, dept, subdept FROM %s ORDER BY id", tableName)); } @@ -406,18 +446,23 @@ public void addFilteredPartitionsToPartitioned2() { createCompositePartitionedTable("parquet"); String createIceberg = - "CREATE TABLE %s (id Integer, name String, dept String, subdept String) USING iceberg " + - "PARTITIONED BY (id, dept)"; + "CREATE TABLE %s (id Integer, name String, dept String, subdept String) USING iceberg " + + "PARTITIONED BY (id, dept)"; sql(createIceberg, tableName); - Object result = scalarSql("CALL %s.system.add_files('%s', '`parquet`.`%s`', map('dept', 'hr'))", - catalogName, tableName, fileTableDir.getAbsolutePath()); + Object result = + scalarSql( + "CALL %s.system.add_files('%s', '`parquet`.`%s`', map('dept', 'hr'))", + catalogName, tableName, fileTableDir.getAbsolutePath()); Assert.assertEquals(6L, result); - assertEquals("Iceberg table contains correct data", - sql("SELECT id, 
name, dept, subdept FROM %s WHERE dept = 'hr' ORDER BY id", sourceTableName), + assertEquals( + "Iceberg table contains correct data", + sql( + "SELECT id, name, dept, subdept FROM %s WHERE dept = 'hr' ORDER BY id", + sourceTableName), sql("SELECT id, name, dept, subdept FROM %s ORDER BY id", tableName)); } @@ -426,17 +471,20 @@ public void addFilteredPartitionsToPartitionedWithNullValueFilteringOnId() { createCompositePartitionedTableWithNullValueInPartitionColumn("parquet"); String createIceberg = - "CREATE TABLE %s (id Integer, name String, dept String, subdept String) USING iceberg " + - "PARTITIONED BY (id, dept)"; + "CREATE TABLE %s (id Integer, name String, dept String, subdept String) USING iceberg " + + "PARTITIONED BY (id, dept)"; sql(createIceberg, tableName); - Object result = scalarSql("CALL %s.system.add_files('%s', '`parquet`.`%s`', map('id', 1))", - catalogName, tableName, fileTableDir.getAbsolutePath()); + Object result = + scalarSql( + "CALL %s.system.add_files('%s', '`parquet`.`%s`', map('id', 1))", + catalogName, tableName, fileTableDir.getAbsolutePath()); Assert.assertEquals(2L, result); - assertEquals("Iceberg table contains correct data", + assertEquals( + "Iceberg table contains correct data", sql("SELECT id, name, dept, subdept FROM %s WHERE id = 1 ORDER BY id", sourceTableName), sql("SELECT id, name, dept, subdept FROM %s ORDER BY id", tableName)); } @@ -446,18 +494,23 @@ public void addFilteredPartitionsToPartitionedWithNullValueFilteringOnDept() { createCompositePartitionedTableWithNullValueInPartitionColumn("parquet"); String createIceberg = - "CREATE TABLE %s (id Integer, name String, dept String, subdept String) USING iceberg " + - "PARTITIONED BY (id, dept)"; + "CREATE TABLE %s (id Integer, name String, dept String, subdept String) USING iceberg " + + "PARTITIONED BY (id, dept)"; sql(createIceberg, tableName); - Object result = scalarSql("CALL %s.system.add_files('%s', '`parquet`.`%s`', map('dept', 'hr'))", - catalogName, tableName, fileTableDir.getAbsolutePath()); + Object result = + scalarSql( + "CALL %s.system.add_files('%s', '`parquet`.`%s`', map('dept', 'hr'))", + catalogName, tableName, fileTableDir.getAbsolutePath()); Assert.assertEquals(6L, result); - assertEquals("Iceberg table contains correct data", - sql("SELECT id, name, dept, subdept FROM %s WHERE dept = 'hr' ORDER BY id", sourceTableName), + assertEquals( + "Iceberg table contains correct data", + sql( + "SELECT id, name, dept, subdept FROM %s WHERE dept = 'hr' ORDER BY id", + sourceTableName), sql("SELECT id, name, dept, subdept FROM %s ORDER BY id", tableName)); } @@ -466,13 +519,15 @@ public void addWeirdCaseHiveTable() { createWeirdCaseTable(); String createIceberg = - "CREATE TABLE %s (id Integer, `naMe` String, dept String, subdept String) USING iceberg " + - "PARTITIONED BY (`naMe`)"; + "CREATE TABLE %s (id Integer, `naMe` String, dept String, subdept String) USING iceberg " + + "PARTITIONED BY (`naMe`)"; sql(createIceberg, tableName); - Object result = scalarSql("CALL %s.system.add_files('%s', '%s', map('naMe', 'John Doe'))", - catalogName, tableName, sourceTableName); + Object result = + scalarSql( + "CALL %s.system.add_files('%s', '%s', map('naMe', 'John Doe'))", + catalogName, tableName, sourceTableName); Assert.assertEquals(2L, result); @@ -482,22 +537,30 @@ public void addWeirdCaseHiveTable() { Spark does not actually handle this pushdown correctly for hive based tables and it returns 0 records */ List expected = - sql("SELECT id, `naMe`, dept, subdept from %s ORDER BY id", 
sourceTableName) - .stream() + sql("SELECT id, `naMe`, dept, subdept from %s ORDER BY id", sourceTableName).stream() .filter(r -> r[1].equals("John Doe")) .collect(Collectors.toList()); // TODO when this assert breaks Spark fixed the pushdown issue - Assert.assertEquals("If this assert breaks it means that Spark has fixed the pushdown issue", 0, - sql("SELECT id, `naMe`, dept, subdept from %s WHERE `naMe` = 'John Doe' ORDER BY id", sourceTableName) + Assert.assertEquals( + "If this assert breaks it means that Spark has fixed the pushdown issue", + 0, + sql( + "SELECT id, `naMe`, dept, subdept from %s WHERE `naMe` = 'John Doe' ORDER BY id", + sourceTableName) .size()); // Pushdown works for iceberg - Assert.assertEquals("We should be able to pushdown mixed case partition keys", 2, - sql("SELECT id, `naMe`, dept, subdept FROM %s WHERE `naMe` = 'John Doe' ORDER BY id", tableName) + Assert.assertEquals( + "We should be able to pushdown mixed case partition keys", + 2, + sql( + "SELECT id, `naMe`, dept, subdept FROM %s WHERE `naMe` = 'John Doe' ORDER BY id", + tableName) .size()); - assertEquals("Iceberg table contains correct data", + assertEquals( + "Iceberg table contains correct data", expected, sql("SELECT id, `naMe`, dept, subdept FROM %s ORDER BY id", tableName)); } @@ -511,12 +574,15 @@ public void addPartitionToPartitionedHive() { sql(createIceberg, tableName); - Object result = scalarSql("CALL %s.system.add_files('%s', '%s', map('id', 1))", - catalogName, tableName, sourceTableName); + Object result = + scalarSql( + "CALL %s.system.add_files('%s', '%s', map('id', 1))", + catalogName, tableName, sourceTableName); Assert.assertEquals(2L, result); - assertEquals("Iceberg table contains correct data", + assertEquals( + "Iceberg table contains correct data", sql("SELECT id, name, dept, subdept FROM %s WHERE id = 1 ORDER BY id", sourceTableName), sql("SELECT id, name, dept, subdept FROM %s ORDER BY id", tableName)); } @@ -530,19 +596,23 @@ public void invalidDataImport() { sql(createIceberg, tableName); - AssertHelpers.assertThrows("Should forbid adding of partitioned data to unpartitioned table", + AssertHelpers.assertThrows( + "Should forbid adding of partitioned data to unpartitioned table", IllegalArgumentException.class, "Cannot use partition filter with an unpartitioned table", - () -> scalarSql("CALL %s.system.add_files('%s', '`parquet`.`%s`', map('id', 1))", - catalogName, tableName, fileTableDir.getAbsolutePath()) - ); + () -> + scalarSql( + "CALL %s.system.add_files('%s', '`parquet`.`%s`', map('id', 1))", + catalogName, tableName, fileTableDir.getAbsolutePath())); - AssertHelpers.assertThrows("Should forbid adding of partitioned data to unpartitioned table", + AssertHelpers.assertThrows( + "Should forbid adding of partitioned data to unpartitioned table", IllegalArgumentException.class, "Cannot add partitioned files to an unpartitioned table", - () -> scalarSql("CALL %s.system.add_files('%s', '`parquet`.`%s`')", - catalogName, tableName, fileTableDir.getAbsolutePath()) - ); + () -> + scalarSql( + "CALL %s.system.add_files('%s', '`parquet`.`%s`')", + catalogName, tableName, fileTableDir.getAbsolutePath())); } @Test @@ -554,20 +624,25 @@ public void invalidDataImportPartitioned() { sql(createIceberg, tableName); - AssertHelpers.assertThrows("Should forbid adding with a mismatching partition spec", + AssertHelpers.assertThrows( + "Should forbid adding with a mismatching partition spec", IllegalArgumentException.class, "is greater than the number of partitioned columns", - () -> 
scalarSql("CALL %s.system.add_files('%s', '`parquet`.`%s`', map('x', '1', 'y', '2'))", - catalogName, tableName, fileTableDir.getAbsolutePath())); + () -> + scalarSql( + "CALL %s.system.add_files('%s', '`parquet`.`%s`', map('x', '1', 'y', '2'))", + catalogName, tableName, fileTableDir.getAbsolutePath())); - AssertHelpers.assertThrows("Should forbid adding with partition spec with incorrect columns", + AssertHelpers.assertThrows( + "Should forbid adding with partition spec with incorrect columns", IllegalArgumentException.class, "specified partition filter refers to columns that are not partitioned", - () -> scalarSql("CALL %s.system.add_files('%s', '`parquet`.`%s`', map('dept', '2'))", - catalogName, tableName, fileTableDir.getAbsolutePath())); + () -> + scalarSql( + "CALL %s.system.add_files('%s', '`parquet`.`%s`', map('dept', '2'))", + catalogName, tableName, fileTableDir.getAbsolutePath())); } - @Test public void addTwice() { createPartitionedHiveTable(); @@ -577,24 +652,30 @@ public void addTwice() { sql(createIceberg, tableName); - Object result1 = scalarSql("CALL %s.system.add_files(" + - "table => '%s', " + - "source_table => '%s', " + - "partition_filter => map('id', 1))", - catalogName, tableName, sourceTableName); + Object result1 = + scalarSql( + "CALL %s.system.add_files(" + + "table => '%s', " + + "source_table => '%s', " + + "partition_filter => map('id', 1))", + catalogName, tableName, sourceTableName); Assert.assertEquals(2L, result1); - Object result2 = scalarSql("CALL %s.system.add_files(" + - "table => '%s', " + - "source_table => '%s', " + - "partition_filter => map('id', 2))", - catalogName, tableName, sourceTableName); + Object result2 = + scalarSql( + "CALL %s.system.add_files(" + + "table => '%s', " + + "source_table => '%s', " + + "partition_filter => map('id', 2))", + catalogName, tableName, sourceTableName); Assert.assertEquals(2L, result2); - assertEquals("Iceberg table contains correct data", + assertEquals( + "Iceberg table contains correct data", sql("SELECT id, name, dept, subdept FROM %s WHERE id = 1 ORDER BY id", sourceTableName), sql("SELECT id, name, dept, subdept FROM %s WHERE id = 1 ORDER BY id", tableName)); - assertEquals("Iceberg table contains correct data", + assertEquals( + "Iceberg table contains correct data", sql("SELECT id, name, dept, subdept FROM %s WHERE id = 2 ORDER BY id", sourceTableName), sql("SELECT id, name, dept, subdept FROM %s WHERE id = 2 ORDER BY id", tableName)); } @@ -608,21 +689,25 @@ public void duplicateDataPartitioned() { sql(createIceberg, tableName); - scalarSql("CALL %s.system.add_files(" + - "table => '%s', " + - "source_table => '%s', " + - "partition_filter => map('id', 1))", + scalarSql( + "CALL %s.system.add_files(" + + "table => '%s', " + + "source_table => '%s', " + + "partition_filter => map('id', 1))", catalogName, tableName, sourceTableName); - AssertHelpers.assertThrows("Should not allow adding duplicate files", + AssertHelpers.assertThrows( + "Should not allow adding duplicate files", IllegalStateException.class, - "Cannot complete import because data files to be imported already" + - " exist within the target table", - () -> scalarSql("CALL %s.system.add_files(" + - "table => '%s', " + - "source_table => '%s', " + - "partition_filter => map('id', 1))", - catalogName, tableName, sourceTableName)); + "Cannot complete import because data files to be imported already" + + " exist within the target table", + () -> + scalarSql( + "CALL %s.system.add_files(" + + "table => '%s', " + + "source_table => '%s', " + + 
"partition_filter => map('id', 1))", + catalogName, tableName, sourceTableName)); } @Test @@ -634,27 +719,33 @@ public void duplicateDataPartitionedAllowed() { sql(createIceberg, tableName); - Object result1 = scalarSql("CALL %s.system.add_files(" + - "table => '%s', " + - "source_table => '%s', " + - "partition_filter => map('id', 1))", - catalogName, tableName, sourceTableName); + Object result1 = + scalarSql( + "CALL %s.system.add_files(" + + "table => '%s', " + + "source_table => '%s', " + + "partition_filter => map('id', 1))", + catalogName, tableName, sourceTableName); Assert.assertEquals(2L, result1); - Object result2 = scalarSql("CALL %s.system.add_files(" + - "table => '%s', " + - "source_table => '%s', " + - "partition_filter => map('id', 1)," + - "check_duplicate_files => false)", - catalogName, tableName, sourceTableName); + Object result2 = + scalarSql( + "CALL %s.system.add_files(" + + "table => '%s', " + + "source_table => '%s', " + + "partition_filter => map('id', 1)," + + "check_duplicate_files => false)", + catalogName, tableName, sourceTableName); Assert.assertEquals(2L, result2); - - assertEquals("Iceberg table contains correct data", - sql("SELECT id, name, dept, subdept FROM %s WHERE id = 1 UNION ALL " + - "SELECT id, name, dept, subdept FROM %s WHERE id = 1", sourceTableName, sourceTableName), + assertEquals( + "Iceberg table contains correct data", + sql( + "SELECT id, name, dept, subdept FROM %s WHERE id = 1 UNION ALL " + + "SELECT id, name, dept, subdept FROM %s WHERE id = 1", + sourceTableName, sourceTableName), sql("SELECT id, name, dept, subdept FROM %s", tableName, tableName)); } @@ -667,15 +758,16 @@ public void duplicateDataUnpartitioned() { sql(createIceberg, tableName); - scalarSql("CALL %s.system.add_files('%s', '%s')", - catalogName, tableName, sourceTableName); + scalarSql("CALL %s.system.add_files('%s', '%s')", catalogName, tableName, sourceTableName); - AssertHelpers.assertThrows("Should not allow adding duplicate files", + AssertHelpers.assertThrows( + "Should not allow adding duplicate files", IllegalStateException.class, - "Cannot complete import because data files to be imported already" + - " exist within the target table", - () -> scalarSql("CALL %s.system.add_files('%s', '%s')", - catalogName, tableName, sourceTableName)); + "Cannot complete import because data files to be imported already" + + " exist within the target table", + () -> + scalarSql( + "CALL %s.system.add_files('%s', '%s')", catalogName, tableName, sourceTableName)); } @Test @@ -687,23 +779,25 @@ public void duplicateDataUnpartitionedAllowed() { sql(createIceberg, tableName); - Object result1 = scalarSql("CALL %s.system.add_files('%s', '%s')", - catalogName, tableName, sourceTableName); + Object result1 = + scalarSql("CALL %s.system.add_files('%s', '%s')", catalogName, tableName, sourceTableName); Assert.assertEquals(2L, result1); - Object result2 = scalarSql("CALL %s.system.add_files(" + - "table => '%s', " + - "source_table => '%s'," + - "check_duplicate_files => false)", - catalogName, tableName, sourceTableName); + Object result2 = + scalarSql( + "CALL %s.system.add_files(" + + "table => '%s', " + + "source_table => '%s'," + + "check_duplicate_files => false)", + catalogName, tableName, sourceTableName); Assert.assertEquals(2L, result2); - assertEquals("Iceberg table contains correct data", - sql("SELECT * FROM (SELECT * FROM %s UNION ALL " + - "SELECT * from %s) ORDER BY id", sourceTableName, sourceTableName), + assertEquals( + "Iceberg table contains correct data", + sql( + 
"SELECT * FROM (SELECT * FROM %s UNION ALL " + "SELECT * from %s) ORDER BY id", + sourceTableName, sourceTableName), sql("SELECT * FROM %s ORDER BY id", tableName)); - - } @Test @@ -714,21 +808,26 @@ public void testEmptyImportDoesNotThrow() { sql(createIceberg, tableName); // Empty path based import - Object pathResult = scalarSql("CALL %s.system.add_files('%s', '`parquet`.`%s`')", - catalogName, tableName, fileTableDir.getAbsolutePath()); + Object pathResult = + scalarSql( + "CALL %s.system.add_files('%s', '`parquet`.`%s`')", + catalogName, tableName, fileTableDir.getAbsolutePath()); Assert.assertEquals(0L, pathResult); - assertEquals("Iceberg table contains no added data when importing from an empty path", + assertEquals( + "Iceberg table contains no added data when importing from an empty path", emptyQueryResult, sql("SELECT * FROM %s ORDER BY id", tableName)); // Empty table based import - String createHive = "CREATE TABLE %s (id Integer, name String, dept String, subdept String) STORED AS parquet"; + String createHive = + "CREATE TABLE %s (id Integer, name String, dept String, subdept String) STORED AS parquet"; sql(createHive, sourceTableName); - Object tableResult = scalarSql("CALL %s.system.add_files('%s', '%s')", - catalogName, tableName, sourceTableName); + Object tableResult = + scalarSql("CALL %s.system.add_files('%s', '%s')", catalogName, tableName, sourceTableName); Assert.assertEquals(0L, tableResult); - assertEquals("Iceberg table contains no added data when importing from an empty table", + assertEquals( + "Iceberg table contains no added data when importing from an empty table", emptyQueryResult, sql("SELECT * FROM %s ORDER BY id", tableName)); } @@ -739,22 +838,26 @@ public void testPartitionedImportFromEmptyPartitionDoesNotThrow() { final int emptyPartitionId = 999; // Add an empty partition to the hive table - sql("ALTER TABLE %s ADD PARTITION (id = '%d') LOCATION '%d'", sourceTableName, - emptyPartitionId, emptyPartitionId); + sql( + "ALTER TABLE %s ADD PARTITION (id = '%d') LOCATION '%d'", + sourceTableName, emptyPartitionId, emptyPartitionId); String createIceberg = "CREATE TABLE %s (id Integer, name String, dept String, subdept String) USING iceberg PARTITIONED BY (id)"; sql(createIceberg, tableName); - Object tableResult = scalarSql("CALL %s.system.add_files(" + - "table => '%s', " + - "source_table => '%s', " + - "partition_filter => map('id', %d))", - catalogName, tableName, sourceTableName, emptyPartitionId); + Object tableResult = + scalarSql( + "CALL %s.system.add_files(" + + "table => '%s', " + + "source_table => '%s', " + + "partition_filter => map('id', %d))", + catalogName, tableName, sourceTableName, emptyPartitionId); Assert.assertEquals(0L, tableResult); - assertEquals("Iceberg table contains no added data when importing from an empty table", + assertEquals( + "Iceberg table contains no added data when importing from an empty table", emptyQueryResult, sql("SELECT * FROM %s ORDER BY id", tableName)); } @@ -762,26 +865,28 @@ public void testPartitionedImportFromEmptyPartitionDoesNotThrow() { private static final List emptyQueryResult = Lists.newArrayList(); private static final StructField[] struct = { - new StructField("id", DataTypes.IntegerType, true, Metadata.empty()), - new StructField("name", DataTypes.StringType, true, Metadata.empty()), - new StructField("dept", DataTypes.StringType, true, Metadata.empty()), - new StructField("subdept", DataTypes.StringType, true, Metadata.empty()) + new StructField("id", DataTypes.IntegerType, true, 
Metadata.empty()), + new StructField("name", DataTypes.StringType, true, Metadata.empty()), + new StructField("dept", DataTypes.StringType, true, Metadata.empty()), + new StructField("subdept", DataTypes.StringType, true, Metadata.empty()) }; private static final Dataset unpartitionedDF = - spark.createDataFrame( - ImmutableList.of( - RowFactory.create(1, "John Doe", "hr", "communications"), - RowFactory.create(2, "Jane Doe", "hr", "salary"), - RowFactory.create(3, "Matt Doe", "hr", "communications"), - RowFactory.create(4, "Will Doe", "facilities", "all")), - new StructType(struct)).repartition(1); + spark + .createDataFrame( + ImmutableList.of( + RowFactory.create(1, "John Doe", "hr", "communications"), + RowFactory.create(2, "Jane Doe", "hr", "salary"), + RowFactory.create(3, "Matt Doe", "hr", "communications"), + RowFactory.create(4, "Will Doe", "facilities", "all")), + new StructType(struct)) + .repartition(1); private static final Dataset singleNullRecordDF = - spark.createDataFrame( - ImmutableList.of( - RowFactory.create(null, null, null, null)), - new StructType(struct)).repartition(1); + spark + .createDataFrame( + ImmutableList.of(RowFactory.create(null, null, null, null)), new StructType(struct)) + .repartition(1); private static final Dataset partitionedDF = unpartitionedDF.select("name", "dept", "subdept", "id"); @@ -800,10 +905,10 @@ public void testPartitionedImportFromEmptyPartitionDoesNotThrow() { unpartitionedDF.col("name").as("naMe")); private static final StructField[] dateStruct = { - new StructField("id", DataTypes.IntegerType, true, Metadata.empty()), - new StructField("name", DataTypes.StringType, true, Metadata.empty()), - new StructField("dept", DataTypes.StringType, true, Metadata.empty()), - new StructField("ts", DataTypes.DateType, true, Metadata.empty()) + new StructField("id", DataTypes.IntegerType, true, Metadata.empty()), + new StructField("name", DataTypes.StringType, true, Metadata.empty()), + new StructField("dept", DataTypes.StringType, true, Metadata.empty()), + new StructField("ts", DataTypes.DateType, true, Metadata.empty()) }; private static java.sql.Date toDate(String value) { @@ -811,15 +916,17 @@ private static java.sql.Date toDate(String value) { } private static final Dataset dateDF = - spark.createDataFrame( - ImmutableList.of( - RowFactory.create(1, "John Doe", "hr", toDate("2021-01-01")), - RowFactory.create(2, "Jane Doe", "hr", toDate("2021-01-01")), - RowFactory.create(3, "Matt Doe", "hr", toDate("2021-01-02")), - RowFactory.create(4, "Will Doe", "facilities", toDate("2021-01-02"))), - new StructType(dateStruct)).repartition(2); - - private void createUnpartitionedFileTable(String format) { + spark + .createDataFrame( + ImmutableList.of( + RowFactory.create(1, "John Doe", "hr", toDate("2021-01-01")), + RowFactory.create(2, "Jane Doe", "hr", toDate("2021-01-01")), + RowFactory.create(3, "Matt Doe", "hr", toDate("2021-01-02")), + RowFactory.create(4, "Will Doe", "facilities", toDate("2021-01-02"))), + new StructType(dateStruct)) + .repartition(2); + + private void createUnpartitionedFileTable(String format) { String createParquet = "CREATE TABLE %s (id Integer, name String, dept String, subdept String) USING %s LOCATION '%s'"; @@ -828,10 +935,10 @@ private void createUnpartitionedFileTable(String format) { unpartitionedDF.write().insertInto(sourceTableName); } - private void createPartitionedFileTable(String format) { + private void createPartitionedFileTable(String format) { String createParquet = - "CREATE TABLE %s (id Integer, name 
String, dept String, subdept String) USING %s PARTITIONED BY (id) " + - "LOCATION '%s'"; + "CREATE TABLE %s (id Integer, name String, dept String, subdept String) USING %s PARTITIONED BY (id) " + + "LOCATION '%s'"; sql(createParquet, sourceTableName, format, fileTableDir.getAbsolutePath()); @@ -840,8 +947,9 @@ private void createPartitionedFileTable(String format) { } private void createCompositePartitionedTable(String format) { - String createParquet = "CREATE TABLE %s (id Integer, name String, dept String, subdept String) USING %s " + - "PARTITIONED BY (id, dept) LOCATION '%s'"; + String createParquet = + "CREATE TABLE %s (id Integer, name String, dept String, subdept String) USING %s " + + "PARTITIONED BY (id, dept) LOCATION '%s'"; sql(createParquet, sourceTableName, format, fileTableDir.getAbsolutePath()); compositePartitionedDF.write().insertInto(sourceTableName); @@ -849,11 +957,14 @@ private void createCompositePartitionedTable(String format) { } private void createCompositePartitionedTableWithNullValueInPartitionColumn(String format) { - String createParquet = "CREATE TABLE %s (id Integer, name String, dept String, subdept String) USING %s " + - "PARTITIONED BY (id, dept) LOCATION '%s'"; + String createParquet = + "CREATE TABLE %s (id Integer, name String, dept String, subdept String) USING %s " + + "PARTITIONED BY (id, dept) LOCATION '%s'"; sql(createParquet, sourceTableName, format, fileTableDir.getAbsolutePath()); - Dataset unionedDF = compositePartitionedDF.unionAll(compositePartitionedNullRecordDF) + Dataset unionedDF = + compositePartitionedDF + .unionAll(compositePartitionedNullRecordDF) .select("name", "subdept", "id", "dept") .repartition(1); @@ -863,18 +974,18 @@ private void createCompositePartitionedTableWithNullValueInPartitionColumn(Strin private void createWeirdCaseTable() { String createParquet = - "CREATE TABLE %s (id Integer, subdept String, dept String) " + - "PARTITIONED BY (`naMe` String) STORED AS parquet"; + "CREATE TABLE %s (id Integer, subdept String, dept String) " + + "PARTITIONED BY (`naMe` String) STORED AS parquet"; sql(createParquet, sourceTableName); weirdColumnNamesDF.write().insertInto(sourceTableName); weirdColumnNamesDF.write().insertInto(sourceTableName); - } private void createUnpartitionedHiveTable() { - String createHive = "CREATE TABLE %s (id Integer, name String, dept String, subdept String) STORED AS parquet"; + String createHive = + "CREATE TABLE %s (id Integer, name String, dept String, subdept String) STORED AS parquet"; sql(createHive, sourceTableName); @@ -883,8 +994,9 @@ private void createUnpartitionedHiveTable() { } private void createPartitionedHiveTable() { - String createHive = "CREATE TABLE %s (name String, dept String, subdept String) " + - "PARTITIONED BY (id Integer) STORED AS parquet"; + String createHive = + "CREATE TABLE %s (name String, dept String, subdept String) " + + "PARTITIONED BY (id Integer) STORED AS parquet"; sql(createHive, sourceTableName); @@ -892,9 +1004,10 @@ private void createPartitionedHiveTable() { partitionedDF.write().insertInto(sourceTableName); } - private void createDatePartitionedFileTable(String format) { - String createParquet = "CREATE TABLE %s (id Integer, name String, dept String, date Date) USING %s " + - "PARTITIONED BY (date) LOCATION '%s'"; + private void createDatePartitionedFileTable(String format) { + String createParquet = + "CREATE TABLE %s (id Integer, name String, dept String, date Date) USING %s " + + "PARTITIONED BY (date) LOCATION '%s'"; sql(createParquet, sourceTableName, 
format, fileTableDir.getAbsolutePath()); diff --git a/spark/v3.2/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestAlterTablePartitionFields.java b/spark/v3.2/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestAlterTablePartitionFields.java index 9d630508b6e4..8aee7c97752f 100644 --- a/spark/v3.2/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestAlterTablePartitionFields.java +++ b/spark/v3.2/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestAlterTablePartitionFields.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.extensions; import java.util.Map; @@ -31,7 +30,8 @@ import org.junit.Test; public class TestAlterTablePartitionFields extends SparkExtensionsTestBase { - public TestAlterTablePartitionFields(String catalogName, String implementation, Map config) { + public TestAlterTablePartitionFields( + String catalogName, String implementation, Map config) { super(catalogName, implementation, config); } @@ -42,7 +42,9 @@ public void removeTable() { @Test public void testAddIdentityPartition() { - sql("CREATE TABLE %s (id bigint NOT NULL, category string, ts timestamp, data string) USING iceberg", tableName); + sql( + "CREATE TABLE %s (id bigint NOT NULL, category string, ts timestamp, data string) USING iceberg", + tableName); Table table = validationCatalog.loadTable(tableIdent); Assert.assertTrue("Table should start unpartitioned", table.spec().isUnpartitioned()); @@ -51,17 +53,17 @@ public void testAddIdentityPartition() { table.refresh(); - PartitionSpec expected = PartitionSpec.builderFor(table.schema()) - .withSpecId(1) - .identity("category") - .build(); + PartitionSpec expected = + PartitionSpec.builderFor(table.schema()).withSpecId(1).identity("category").build(); Assert.assertEquals("Should have new spec field", expected, table.spec()); } @Test public void testAddBucketPartition() { - sql("CREATE TABLE %s (id bigint NOT NULL, category string, ts timestamp, data string) USING iceberg", tableName); + sql( + "CREATE TABLE %s (id bigint NOT NULL, category string, ts timestamp, data string) USING iceberg", + tableName); Table table = validationCatalog.loadTable(tableIdent); Assert.assertTrue("Table should start unpartitioned", table.spec().isUnpartitioned()); @@ -70,17 +72,20 @@ public void testAddBucketPartition() { table.refresh(); - PartitionSpec expected = PartitionSpec.builderFor(table.schema()) - .withSpecId(1) - .bucket("id", 16, "id_bucket_16") - .build(); + PartitionSpec expected = + PartitionSpec.builderFor(table.schema()) + .withSpecId(1) + .bucket("id", 16, "id_bucket_16") + .build(); Assert.assertEquals("Should have new spec field", expected, table.spec()); } @Test public void testAddTruncatePartition() { - sql("CREATE TABLE %s (id bigint NOT NULL, category string, ts timestamp, data string) USING iceberg", tableName); + sql( + "CREATE TABLE %s (id bigint NOT NULL, category string, ts timestamp, data string) USING iceberg", + tableName); Table table = validationCatalog.loadTable(tableIdent); Assert.assertTrue("Table should start unpartitioned", table.spec().isUnpartitioned()); @@ -89,17 +94,20 @@ public void testAddTruncatePartition() { table.refresh(); - PartitionSpec expected = PartitionSpec.builderFor(table.schema()) - .withSpecId(1) - .truncate("data", 4, "data_trunc_4") - .build(); + PartitionSpec expected = + PartitionSpec.builderFor(table.schema()) + .withSpecId(1) + .truncate("data", 4, 
"data_trunc_4") + .build(); Assert.assertEquals("Should have new spec field", expected, table.spec()); } @Test public void testAddYearsPartition() { - sql("CREATE TABLE %s (id bigint NOT NULL, category string, ts timestamp, data string) USING iceberg", tableName); + sql( + "CREATE TABLE %s (id bigint NOT NULL, category string, ts timestamp, data string) USING iceberg", + tableName); Table table = validationCatalog.loadTable(tableIdent); Assert.assertTrue("Table should start unpartitioned", table.spec().isUnpartitioned()); @@ -108,17 +116,17 @@ public void testAddYearsPartition() { table.refresh(); - PartitionSpec expected = PartitionSpec.builderFor(table.schema()) - .withSpecId(1) - .year("ts") - .build(); + PartitionSpec expected = + PartitionSpec.builderFor(table.schema()).withSpecId(1).year("ts").build(); Assert.assertEquals("Should have new spec field", expected, table.spec()); } @Test public void testAddMonthsPartition() { - sql("CREATE TABLE %s (id bigint NOT NULL, category string, ts timestamp, data string) USING iceberg", tableName); + sql( + "CREATE TABLE %s (id bigint NOT NULL, category string, ts timestamp, data string) USING iceberg", + tableName); Table table = validationCatalog.loadTable(tableIdent); Assert.assertTrue("Table should start unpartitioned", table.spec().isUnpartitioned()); @@ -127,17 +135,17 @@ public void testAddMonthsPartition() { table.refresh(); - PartitionSpec expected = PartitionSpec.builderFor(table.schema()) - .withSpecId(1) - .month("ts") - .build(); + PartitionSpec expected = + PartitionSpec.builderFor(table.schema()).withSpecId(1).month("ts").build(); Assert.assertEquals("Should have new spec field", expected, table.spec()); } @Test public void testAddDaysPartition() { - sql("CREATE TABLE %s (id bigint NOT NULL, category string, ts timestamp, data string) USING iceberg", tableName); + sql( + "CREATE TABLE %s (id bigint NOT NULL, category string, ts timestamp, data string) USING iceberg", + tableName); Table table = validationCatalog.loadTable(tableIdent); Assert.assertTrue("Table should start unpartitioned", table.spec().isUnpartitioned()); @@ -146,17 +154,17 @@ public void testAddDaysPartition() { table.refresh(); - PartitionSpec expected = PartitionSpec.builderFor(table.schema()) - .withSpecId(1) - .day("ts") - .build(); + PartitionSpec expected = + PartitionSpec.builderFor(table.schema()).withSpecId(1).day("ts").build(); Assert.assertEquals("Should have new spec field", expected, table.spec()); } @Test public void testAddHoursPartition() { - sql("CREATE TABLE %s (id bigint NOT NULL, category string, ts timestamp, data string) USING iceberg", tableName); + sql( + "CREATE TABLE %s (id bigint NOT NULL, category string, ts timestamp, data string) USING iceberg", + tableName); Table table = validationCatalog.loadTable(tableIdent); Assert.assertTrue("Table should start unpartitioned", table.spec().isUnpartitioned()); @@ -165,17 +173,17 @@ public void testAddHoursPartition() { table.refresh(); - PartitionSpec expected = PartitionSpec.builderFor(table.schema()) - .withSpecId(1) - .hour("ts") - .build(); + PartitionSpec expected = + PartitionSpec.builderFor(table.schema()).withSpecId(1).hour("ts").build(); Assert.assertEquals("Should have new spec field", expected, table.spec()); } @Test public void testAddNamedPartition() { - sql("CREATE TABLE %s (id bigint NOT NULL, category string, ts timestamp, data string) USING iceberg", tableName); + sql( + "CREATE TABLE %s (id bigint NOT NULL, category string, ts timestamp, data string) USING iceberg", + tableName); 
Table table = validationCatalog.loadTable(tableIdent); Assert.assertTrue("Table should start unpartitioned", table.spec().isUnpartitioned()); @@ -184,77 +192,83 @@ public void testAddNamedPartition() { table.refresh(); - PartitionSpec expected = PartitionSpec.builderFor(table.schema()) - .withSpecId(1) - .bucket("id", 16, "shard") - .build(); + PartitionSpec expected = + PartitionSpec.builderFor(table.schema()).withSpecId(1).bucket("id", 16, "shard").build(); Assert.assertEquals("Should have new spec field", expected, table.spec()); } @Test public void testDropIdentityPartition() { - sql("CREATE TABLE %s (id bigint NOT NULL, category string, data string) USING iceberg PARTITIONED BY (category)", + sql( + "CREATE TABLE %s (id bigint NOT NULL, category string, data string) USING iceberg PARTITIONED BY (category)", tableName); Table table = validationCatalog.loadTable(tableIdent); - Assert.assertEquals("Table should start with 1 partition field", 1, table.spec().fields().size()); + Assert.assertEquals( + "Table should start with 1 partition field", 1, table.spec().fields().size()); sql("ALTER TABLE %s DROP PARTITION FIELD category", tableName); table.refresh(); - PartitionSpec expected = PartitionSpec.builderFor(table.schema()) - .withSpecId(1) - .alwaysNull("category", "category") - .build(); + PartitionSpec expected = + PartitionSpec.builderFor(table.schema()) + .withSpecId(1) + .alwaysNull("category", "category") + .build(); Assert.assertEquals("Should have new spec field", expected, table.spec()); } @Test public void testDropDaysPartition() { - sql("CREATE TABLE %s (id bigint NOT NULL, ts timestamp, data string) USING iceberg PARTITIONED BY (days(ts))", + sql( + "CREATE TABLE %s (id bigint NOT NULL, ts timestamp, data string) USING iceberg PARTITIONED BY (days(ts))", tableName); Table table = validationCatalog.loadTable(tableIdent); - Assert.assertEquals("Table should start with 1 partition field", 1, table.spec().fields().size()); + Assert.assertEquals( + "Table should start with 1 partition field", 1, table.spec().fields().size()); sql("ALTER TABLE %s DROP PARTITION FIELD days(ts)", tableName); table.refresh(); - PartitionSpec expected = PartitionSpec.builderFor(table.schema()) - .withSpecId(1) - .alwaysNull("ts", "ts_day") - .build(); + PartitionSpec expected = + PartitionSpec.builderFor(table.schema()).withSpecId(1).alwaysNull("ts", "ts_day").build(); Assert.assertEquals("Should have new spec field", expected, table.spec()); } @Test public void testDropBucketPartition() { - sql("CREATE TABLE %s (id bigint NOT NULL, data string) USING iceberg PARTITIONED BY (bucket(16, id))", + sql( + "CREATE TABLE %s (id bigint NOT NULL, data string) USING iceberg PARTITIONED BY (bucket(16, id))", tableName); Table table = validationCatalog.loadTable(tableIdent); - Assert.assertEquals("Table should start with 1 partition field", 1, table.spec().fields().size()); + Assert.assertEquals( + "Table should start with 1 partition field", 1, table.spec().fields().size()); sql("ALTER TABLE %s DROP PARTITION FIELD bucket(16, id)", tableName); table.refresh(); - PartitionSpec expected = PartitionSpec.builderFor(table.schema()) - .withSpecId(1) - .alwaysNull("id", "id_bucket") - .build(); + PartitionSpec expected = + PartitionSpec.builderFor(table.schema()) + .withSpecId(1) + .alwaysNull("id", "id_bucket") + .build(); Assert.assertEquals("Should have new spec field", expected, table.spec()); } @Test public void testDropPartitionByName() { - sql("CREATE TABLE %s (id bigint NOT NULL, category string, ts timestamp, 
data string) USING iceberg", tableName); + sql( + "CREATE TABLE %s (id bigint NOT NULL, category string, ts timestamp, data string) USING iceberg", + tableName); Table table = validationCatalog.loadTable(tableIdent); Assert.assertTrue("Table should start unpartitioned", table.spec().isUnpartitioned()); @@ -270,114 +284,121 @@ public void testDropPartitionByName() { table.refresh(); - PartitionSpec expected = PartitionSpec.builderFor(table.schema()) - .withSpecId(2) - .alwaysNull("id", "shard") - .build(); + PartitionSpec expected = + PartitionSpec.builderFor(table.schema()).withSpecId(2).alwaysNull("id", "shard").build(); Assert.assertEquals("Should have new spec field", expected, table.spec()); } @Test public void testReplacePartition() { - sql("CREATE TABLE %s (id bigint NOT NULL, category string, ts timestamp, data string) USING iceberg", tableName); + sql( + "CREATE TABLE %s (id bigint NOT NULL, category string, ts timestamp, data string) USING iceberg", + tableName); Table table = validationCatalog.loadTable(tableIdent); Assert.assertTrue("Table should start unpartitioned", table.spec().isUnpartitioned()); sql("ALTER TABLE %s ADD PARTITION FIELD days(ts)", tableName); table.refresh(); - PartitionSpec expected = PartitionSpec.builderFor(table.schema()) - .withSpecId(1) - .day("ts") - .build(); + PartitionSpec expected = + PartitionSpec.builderFor(table.schema()).withSpecId(1).day("ts").build(); Assert.assertEquals("Should have new spec field", expected, table.spec()); sql("ALTER TABLE %s REPLACE PARTITION FIELD days(ts) WITH hours(ts)", tableName); table.refresh(); - expected = PartitionSpec.builderFor(table.schema()) - .withSpecId(2) - .alwaysNull("ts", "ts_day") - .hour("ts") - .build(); - Assert.assertEquals("Should changed from daily to hourly partitioned field", expected, table.spec()); + expected = + PartitionSpec.builderFor(table.schema()) + .withSpecId(2) + .alwaysNull("ts", "ts_day") + .hour("ts") + .build(); + Assert.assertEquals( + "Should changed from daily to hourly partitioned field", expected, table.spec()); } @Test public void testReplacePartitionAndRename() { - sql("CREATE TABLE %s (id bigint NOT NULL, category string, ts timestamp, data string) USING iceberg", tableName); + sql( + "CREATE TABLE %s (id bigint NOT NULL, category string, ts timestamp, data string) USING iceberg", + tableName); Table table = validationCatalog.loadTable(tableIdent); Assert.assertTrue("Table should start unpartitioned", table.spec().isUnpartitioned()); sql("ALTER TABLE %s ADD PARTITION FIELD days(ts)", tableName); table.refresh(); - PartitionSpec expected = PartitionSpec.builderFor(table.schema()) - .withSpecId(1) - .day("ts") - .build(); + PartitionSpec expected = + PartitionSpec.builderFor(table.schema()).withSpecId(1).day("ts").build(); Assert.assertEquals("Should have new spec field", expected, table.spec()); sql("ALTER TABLE %s REPLACE PARTITION FIELD days(ts) WITH hours(ts) AS hour_col", tableName); table.refresh(); - expected = PartitionSpec.builderFor(table.schema()) - .withSpecId(2) - .alwaysNull("ts", "ts_day") - .hour("ts", "hour_col") - .build(); - Assert.assertEquals("Should changed from daily to hourly partitioned field", expected, table.spec()); + expected = + PartitionSpec.builderFor(table.schema()) + .withSpecId(2) + .alwaysNull("ts", "ts_day") + .hour("ts", "hour_col") + .build(); + Assert.assertEquals( + "Should changed from daily to hourly partitioned field", expected, table.spec()); } @Test public void testReplaceNamedPartition() { - sql("CREATE TABLE %s (id bigint NOT 
NULL, category string, ts timestamp, data string) USING iceberg", tableName); + sql( + "CREATE TABLE %s (id bigint NOT NULL, category string, ts timestamp, data string) USING iceberg", + tableName); Table table = validationCatalog.loadTable(tableIdent); Assert.assertTrue("Table should start unpartitioned", table.spec().isUnpartitioned()); sql("ALTER TABLE %s ADD PARTITION FIELD days(ts) AS day_col", tableName); table.refresh(); - PartitionSpec expected = PartitionSpec.builderFor(table.schema()) - .withSpecId(1) - .day("ts", "day_col") - .build(); + PartitionSpec expected = + PartitionSpec.builderFor(table.schema()).withSpecId(1).day("ts", "day_col").build(); Assert.assertEquals("Should have new spec field", expected, table.spec()); sql("ALTER TABLE %s REPLACE PARTITION FIELD day_col WITH hours(ts)", tableName); table.refresh(); - expected = PartitionSpec.builderFor(table.schema()) - .withSpecId(2) - .alwaysNull("ts", "day_col") - .hour("ts") - .build(); - Assert.assertEquals("Should changed from daily to hourly partitioned field", expected, table.spec()); + expected = + PartitionSpec.builderFor(table.schema()) + .withSpecId(2) + .alwaysNull("ts", "day_col") + .hour("ts") + .build(); + Assert.assertEquals( + "Should changed from daily to hourly partitioned field", expected, table.spec()); } @Test public void testReplaceNamedPartitionAndRenameDifferently() { - sql("CREATE TABLE %s (id bigint NOT NULL, category string, ts timestamp, data string) USING iceberg", tableName); + sql( + "CREATE TABLE %s (id bigint NOT NULL, category string, ts timestamp, data string) USING iceberg", + tableName); Table table = validationCatalog.loadTable(tableIdent); Assert.assertTrue("Table should start unpartitioned", table.spec().isUnpartitioned()); sql("ALTER TABLE %s ADD PARTITION FIELD days(ts) AS day_col", tableName); table.refresh(); - PartitionSpec expected = PartitionSpec.builderFor(table.schema()) - .withSpecId(1) - .day("ts", "day_col") - .build(); + PartitionSpec expected = + PartitionSpec.builderFor(table.schema()).withSpecId(1).day("ts", "day_col").build(); Assert.assertEquals("Should have new spec field", expected, table.spec()); sql("ALTER TABLE %s REPLACE PARTITION FIELD day_col WITH hours(ts) AS hour_col", tableName); table.refresh(); - expected = PartitionSpec.builderFor(table.schema()) - .withSpecId(2) - .alwaysNull("ts", "day_col") - .hour("ts", "hour_col") - .build(); - Assert.assertEquals("Should changed from daily to hourly partitioned field", expected, table.spec()); + expected = + PartitionSpec.builderFor(table.schema()) + .withSpecId(2) + .alwaysNull("ts", "day_col") + .hour("ts", "hour_col") + .build(); + Assert.assertEquals( + "Should changed from daily to hourly partitioned field", expected, table.spec()); } @Test public void testSparkTableAddDropPartitions() throws Exception { sql("CREATE TABLE %s (id bigint NOT NULL, ts timestamp, data string) USING iceberg", tableName); - Assert.assertEquals("spark table partition should be empty", 0, sparkTable().partitioning().length); + Assert.assertEquals( + "spark table partition should be empty", 0, sparkTable().partitioning().length); sql("ALTER TABLE %s ADD PARTITION FIELD bucket(16, id) AS shard", tableName); assertPartitioningEquals(sparkTable(), 1, "bucket(16, id)"); @@ -396,13 +417,16 @@ public void testSparkTableAddDropPartitions() throws Exception { sql("ALTER TABLE %s DROP PARTITION FIELD shard", tableName); sql("DESCRIBE %s", tableName); - Assert.assertEquals("spark table partition should be empty", 0, 
sparkTable().partitioning().length); + Assert.assertEquals( + "spark table partition should be empty", 0, sparkTable().partitioning().length); } private void assertPartitioningEquals(SparkTable table, int len, String transform) { Assert.assertEquals("spark table partition should be " + len, len, table.partitioning().length); - Assert.assertEquals("latest spark table partition transform should match", - transform, table.partitioning()[len - 1].toString()); + Assert.assertEquals( + "latest spark table partition transform should match", + transform, + table.partitioning()[len - 1].toString()); } private SparkTable sparkTable() throws Exception { diff --git a/spark/v3.2/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestAlterTableSchema.java b/spark/v3.2/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestAlterTableSchema.java index ac12953d0a7e..c993c213dc5e 100644 --- a/spark/v3.2/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestAlterTableSchema.java +++ b/spark/v3.2/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestAlterTableSchema.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.extensions; import java.util.Map; @@ -28,7 +27,8 @@ import org.junit.Test; public class TestAlterTableSchema extends SparkExtensionsTestBase { - public TestAlterTableSchema(String catalogName, String implementation, Map config) { + public TestAlterTableSchema( + String catalogName, String implementation, Map config) { super(catalogName, implementation, config); } @@ -39,20 +39,25 @@ public void removeTable() { @Test public void testSetIdentifierFields() { - sql("CREATE TABLE %s (id bigint NOT NULL, " + - "location struct NOT NULL) USING iceberg", tableName); + sql( + "CREATE TABLE %s (id bigint NOT NULL, " + + "location struct NOT NULL) USING iceberg", + tableName); Table table = validationCatalog.loadTable(tableIdent); - Assert.assertTrue("Table should start without identifier", table.schema().identifierFieldIds().isEmpty()); + Assert.assertTrue( + "Table should start without identifier", table.schema().identifierFieldIds().isEmpty()); sql("ALTER TABLE %s SET IDENTIFIER FIELDS id", tableName); table.refresh(); - Assert.assertEquals("Should have new identifier field", + Assert.assertEquals( + "Should have new identifier field", Sets.newHashSet(table.schema().findField("id").fieldId()), table.schema().identifierFieldIds()); sql("ALTER TABLE %s SET IDENTIFIER FIELDS id, location.lon", tableName); table.refresh(); - Assert.assertEquals("Should have new identifier field", + Assert.assertEquals( + "Should have new identifier field", Sets.newHashSet( table.schema().findField("id").fieldId(), table.schema().findField("location.lon").fieldId()), @@ -60,7 +65,8 @@ public void testSetIdentifierFields() { sql("ALTER TABLE %s SET IDENTIFIER FIELDS location.lon", tableName); table.refresh(); - Assert.assertEquals("Should have new identifier field", + Assert.assertEquals( + "Should have new identifier field", Sets.newHashSet(table.schema().findField("location.lon").fieldId()), table.schema().identifierFieldIds()); } @@ -69,13 +75,16 @@ public void testSetIdentifierFields() { public void testSetInvalidIdentifierFields() { sql("CREATE TABLE %s (id bigint NOT NULL, id2 bigint) USING iceberg", tableName); Table table = validationCatalog.loadTable(tableIdent); - Assert.assertTrue("Table should start without identifier", 
table.schema().identifierFieldIds().isEmpty()); - AssertHelpers.assertThrows("should not allow setting unknown fields", + Assert.assertTrue( + "Table should start without identifier", table.schema().identifierFieldIds().isEmpty()); + AssertHelpers.assertThrows( + "should not allow setting unknown fields", IllegalArgumentException.class, "not found in current schema or added columns", () -> sql("ALTER TABLE %s SET IDENTIFIER FIELDS unknown", tableName)); - AssertHelpers.assertThrows("should not allow setting optional fields", + AssertHelpers.assertThrows( + "should not allow setting optional fields", IllegalArgumentException.class, "not a required field", () -> sql("ALTER TABLE %s SET IDENTIFIER FIELDS id2", tableName)); @@ -83,14 +92,18 @@ public void testSetInvalidIdentifierFields() { @Test public void testDropIdentifierFields() { - sql("CREATE TABLE %s (id bigint NOT NULL, " + - "location struct NOT NULL) USING iceberg", tableName); + sql( + "CREATE TABLE %s (id bigint NOT NULL, " + + "location struct NOT NULL) USING iceberg", + tableName); Table table = validationCatalog.loadTable(tableIdent); - Assert.assertTrue("Table should start without identifier", table.schema().identifierFieldIds().isEmpty()); + Assert.assertTrue( + "Table should start without identifier", table.schema().identifierFieldIds().isEmpty()); sql("ALTER TABLE %s SET IDENTIFIER FIELDS id, location.lon", tableName); table.refresh(); - Assert.assertEquals("Should have new identifier fields", + Assert.assertEquals( + "Should have new identifier fields", Sets.newHashSet( table.schema().findField("id").fieldId(), table.schema().findField("location.lon").fieldId()), @@ -98,13 +111,15 @@ public void testDropIdentifierFields() { sql("ALTER TABLE %s DROP IDENTIFIER FIELDS id", tableName); table.refresh(); - Assert.assertEquals("Should removed identifier field", + Assert.assertEquals( + "Should removed identifier field", Sets.newHashSet(table.schema().findField("location.lon").fieldId()), table.schema().identifierFieldIds()); sql("ALTER TABLE %s SET IDENTIFIER FIELDS id, location.lon", tableName); table.refresh(); - Assert.assertEquals("Should have new identifier fields", + Assert.assertEquals( + "Should have new identifier fields", Sets.newHashSet( table.schema().findField("id").fieldId(), table.schema().findField("location.lon").fieldId()), @@ -112,29 +127,34 @@ public void testDropIdentifierFields() { sql("ALTER TABLE %s DROP IDENTIFIER FIELDS id, location.lon", tableName); table.refresh(); - Assert.assertEquals("Should have no identifier field", - Sets.newHashSet(), - table.schema().identifierFieldIds()); + Assert.assertEquals( + "Should have no identifier field", Sets.newHashSet(), table.schema().identifierFieldIds()); } @Test public void testDropInvalidIdentifierFields() { - sql("CREATE TABLE %s (id bigint NOT NULL, data string NOT NULL, " + - "location struct NOT NULL) USING iceberg", tableName); + sql( + "CREATE TABLE %s (id bigint NOT NULL, data string NOT NULL, " + + "location struct NOT NULL) USING iceberg", + tableName); Table table = validationCatalog.loadTable(tableIdent); - Assert.assertTrue("Table should start without identifier", table.schema().identifierFieldIds().isEmpty()); - AssertHelpers.assertThrows("should not allow dropping unknown fields", + Assert.assertTrue( + "Table should start without identifier", table.schema().identifierFieldIds().isEmpty()); + AssertHelpers.assertThrows( + "should not allow dropping unknown fields", IllegalArgumentException.class, "field unknown not found", () -> sql("ALTER TABLE 
%s DROP IDENTIFIER FIELDS unknown", tableName)); sql("ALTER TABLE %s SET IDENTIFIER FIELDS id", tableName); - AssertHelpers.assertThrows("should not allow dropping a field that is not an identifier", + AssertHelpers.assertThrows( + "should not allow dropping a field that is not an identifier", IllegalArgumentException.class, "data is not an identifier field", () -> sql("ALTER TABLE %s DROP IDENTIFIER FIELDS data", tableName)); - AssertHelpers.assertThrows("should not allow dropping a nested field that is not an identifier", + AssertHelpers.assertThrows( + "should not allow dropping a nested field that is not an identifier", IllegalArgumentException.class, "location.lon is not an identifier field", () -> sql("ALTER TABLE %s DROP IDENTIFIER FIELDS location.lon", tableName)); diff --git a/spark/v3.2/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestAncestorsOfProcedure.java b/spark/v3.2/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestAncestorsOfProcedure.java index baf464d94ad0..d676101b1076 100644 --- a/spark/v3.2/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestAncestorsOfProcedure.java +++ b/spark/v3.2/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestAncestorsOfProcedure.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.extensions; import java.util.List; @@ -30,7 +29,8 @@ public class TestAncestorsOfProcedure extends SparkExtensionsTestBase { - public TestAncestorsOfProcedure(String catalogName, String implementation, Map config) { + public TestAncestorsOfProcedure( + String catalogName, String implementation, Map config) { super(catalogName, implementation, config); } @@ -51,14 +51,12 @@ public void testAncestorOfUsingEmptyArgs() { Long preSnapshotId = table.currentSnapshot().parentId(); Long preTimeStamp = table.snapshot(table.currentSnapshot().parentId()).timestampMillis(); - List output = sql("CALL %s.system.ancestors_of('%s')", - catalogName, tableIdent); + List output = sql("CALL %s.system.ancestors_of('%s')", catalogName, tableIdent); assertEquals( "Procedure output must match", ImmutableList.of( - row(currentSnapshotId, currentTimestamp), - row(preSnapshotId, preTimeStamp)), + row(currentSnapshotId, currentTimestamp), row(preSnapshotId, preTimeStamp)), output); } @@ -77,8 +75,7 @@ public void testAncestorOfUsingSnapshotId() { assertEquals( "Procedure output must match", ImmutableList.of( - row(currentSnapshotId, currentTimestamp), - row(preSnapshotId, preTimeStamp)), + row(currentSnapshotId, currentTimestamp), row(preSnapshotId, preTimeStamp)), sql("CALL %s.system.ancestors_of('%s', %dL)", catalogName, tableIdent, currentSnapshotId)); assertEquals( @@ -105,7 +102,8 @@ public void testAncestorOfWithRollBack() { Long thirdTimestamp = table.currentSnapshot().timestampMillis(); // roll back - sql("CALL %s.system.rollback_to_snapshot('%s', %dL)", + sql( + "CALL %s.system.rollback_to_snapshot('%s', %dL)", catalogName, tableIdent, secondSnapshotId); sql("INSERT INTO TABLE %s VALUES (4, 'd')", tableName); @@ -142,22 +140,29 @@ public void testAncestorOfUsingNamedArgs() { assertEquals( "Procedure output must match", ImmutableList.of(row(firstSnapshotId, firstTimestamp)), - sql("CALL %s.system.ancestors_of(snapshot_id => %dL, table => '%s')", + sql( + "CALL %s.system.ancestors_of(snapshot_id => %dL, table => '%s')", catalogName, firstSnapshotId, tableIdent)); } @Test public void testInvalidAncestorOfCases() { - 
AssertHelpers.assertThrows("Should reject calls without all required args", - AnalysisException.class, "Missing required parameters", + AssertHelpers.assertThrows( + "Should reject calls without all required args", + AnalysisException.class, + "Missing required parameters", () -> sql("CALL %s.system.ancestors_of()", catalogName)); - AssertHelpers.assertThrows("Should reject calls with empty table identifier", - IllegalArgumentException.class, "Cannot handle an empty identifier for argument table", + AssertHelpers.assertThrows( + "Should reject calls with empty table identifier", + IllegalArgumentException.class, + "Cannot handle an empty identifier for argument table", () -> sql("CALL %s.system.ancestors_of('')", catalogName)); - AssertHelpers.assertThrows("Should reject calls with invalid arg types", - AnalysisException.class, "Wrong arg type for snapshot_id: cannot cast", + AssertHelpers.assertThrows( + "Should reject calls with invalid arg types", + AnalysisException.class, + "Wrong arg type for snapshot_id: cannot cast", () -> sql("CALL %s.system.ancestors_of('%s', 1.1)", catalogName, tableIdent)); } } diff --git a/spark/v3.2/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestCallStatementParser.java b/spark/v3.2/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestCallStatementParser.java index a6815d21a7b3..9c2233ccb791 100644 --- a/spark/v3.2/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestCallStatementParser.java +++ b/spark/v3.2/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestCallStatementParser.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.extensions; import java.math.BigDecimal; @@ -49,19 +48,19 @@ public class TestCallStatementParser { - @Rule - public TemporaryFolder temp = new TemporaryFolder(); + @Rule public TemporaryFolder temp = new TemporaryFolder(); private static SparkSession spark = null; private static ParserInterface parser = null; @BeforeClass public static void startSpark() { - TestCallStatementParser.spark = SparkSession.builder() - .master("local[2]") - .config("spark.sql.extensions", IcebergSparkSessionExtensions.class.getName()) - .config("spark.extra.prop", "value") - .getOrCreate(); + TestCallStatementParser.spark = + SparkSession.builder() + .master("local[2]") + .config("spark.sql.extensions", IcebergSparkSessionExtensions.class.getName()) + .config("spark.extra.prop", "value") + .getOrCreate(); TestCallStatementParser.parser = spark.sessionState().sqlParser(); } @@ -75,8 +74,10 @@ public static void stopSpark() { @Test public void testCallWithPositionalArgs() throws ParseException { - CallStatement call = (CallStatement) parser.parsePlan("CALL c.n.func(1, '2', 3L, true, 1.0D, 9.0e1, 900e-1BD)"); - Assert.assertEquals(ImmutableList.of("c", "n", "func"), JavaConverters.seqAsJavaList(call.name())); + CallStatement call = + (CallStatement) parser.parsePlan("CALL c.n.func(1, '2', 3L, true, 1.0D, 9.0e1, 900e-1BD)"); + Assert.assertEquals( + ImmutableList.of("c", "n", "func"), JavaConverters.seqAsJavaList(call.name())); Assert.assertEquals(7, call.args().size()); @@ -91,8 +92,10 @@ public void testCallWithPositionalArgs() throws ParseException { @Test public void testCallWithNamedArgs() throws ParseException { - CallStatement call = (CallStatement) parser.parsePlan("CALL cat.system.func(c1 => 1, c2 => '2', c3 => true)"); - Assert.assertEquals(ImmutableList.of("cat", "system", "func"), 
JavaConverters.seqAsJavaList(call.name())); + CallStatement call = + (CallStatement) parser.parsePlan("CALL cat.system.func(c1 => 1, c2 => '2', c3 => true)"); + Assert.assertEquals( + ImmutableList.of("cat", "system", "func"), JavaConverters.seqAsJavaList(call.name())); Assert.assertEquals(3, call.args().size()); @@ -104,7 +107,8 @@ public void testCallWithNamedArgs() throws ParseException { @Test public void testCallWithMixedArgs() throws ParseException { CallStatement call = (CallStatement) parser.parsePlan("CALL cat.system.func(c1 => 1, '2')"); - Assert.assertEquals(ImmutableList.of("cat", "system", "func"), JavaConverters.seqAsJavaList(call.name())); + Assert.assertEquals( + ImmutableList.of("cat", "system", "func"), JavaConverters.seqAsJavaList(call.name())); Assert.assertEquals(2, call.args().size()); @@ -114,18 +118,24 @@ public void testCallWithMixedArgs() throws ParseException { @Test public void testCallWithTimestampArg() throws ParseException { - CallStatement call = (CallStatement) parser.parsePlan("CALL cat.system.func(TIMESTAMP '2017-02-03T10:37:30.00Z')"); - Assert.assertEquals(ImmutableList.of("cat", "system", "func"), JavaConverters.seqAsJavaList(call.name())); + CallStatement call = + (CallStatement) + parser.parsePlan("CALL cat.system.func(TIMESTAMP '2017-02-03T10:37:30.00Z')"); + Assert.assertEquals( + ImmutableList.of("cat", "system", "func"), JavaConverters.seqAsJavaList(call.name())); Assert.assertEquals(1, call.args().size()); - checkArg(call, 0, Timestamp.from(Instant.parse("2017-02-03T10:37:30.00Z")), DataTypes.TimestampType); + checkArg( + call, 0, Timestamp.from(Instant.parse("2017-02-03T10:37:30.00Z")), DataTypes.TimestampType); } @Test public void testCallWithVarSubstitution() throws ParseException { - CallStatement call = (CallStatement) parser.parsePlan("CALL cat.system.func('${spark.extra.prop}')"); - Assert.assertEquals(ImmutableList.of("cat", "system", "func"), JavaConverters.seqAsJavaList(call.name())); + CallStatement call = + (CallStatement) parser.parsePlan("CALL cat.system.func('${spark.extra.prop}')"); + Assert.assertEquals( + ImmutableList.of("cat", "system", "func"), JavaConverters.seqAsJavaList(call.name())); Assert.assertEquals(1, call.args().size()); @@ -134,29 +144,31 @@ public void testCallWithVarSubstitution() throws ParseException { @Test public void testCallParseError() { - AssertHelpers.assertThrows("Should fail with a sensible parse error", IcebergParseException.class, + AssertHelpers.assertThrows( + "Should fail with a sensible parse error", + IcebergParseException.class, "missing '(' at 'radish'", () -> parser.parsePlan("CALL cat.system radish kebab")); } @Test public void testCallStripsComments() throws ParseException { - List callStatementsWithComments = Lists.newArrayList( - "/* bracketed comment */ CALL cat.system.func('${spark.extra.prop}')", - "/**/ CALL cat.system.func('${spark.extra.prop}')", - "-- single line comment \n CALL cat.system.func('${spark.extra.prop}')", - "-- multiple \n-- single line \n-- comments \n CALL cat.system.func('${spark.extra.prop}')", - "/* select * from multiline_comment \n where x like '%sql%'; */ CALL cat.system.func('${spark.extra.prop}')", - "/* {\"app\": \"dbt\", \"dbt_version\": \"1.0.1\", \"profile_name\": \"profile1\", \"target_name\": \"dev\", " + - "\"node_id\": \"model.profile1.stg_users\"} \n*/ CALL cat.system.func('${spark.extra.prop}')", - "/* Some multi-line comment \n" + - "*/ CALL /* inline comment */ cat.system.func('${spark.extra.prop}') -- ending comment", - "CALL -- a line ending 
comment\n" + - "cat.system.func('${spark.extra.prop}')" - ); + List callStatementsWithComments = + Lists.newArrayList( + "/* bracketed comment */ CALL cat.system.func('${spark.extra.prop}')", + "/**/ CALL cat.system.func('${spark.extra.prop}')", + "-- single line comment \n CALL cat.system.func('${spark.extra.prop}')", + "-- multiple \n-- single line \n-- comments \n CALL cat.system.func('${spark.extra.prop}')", + "/* select * from multiline_comment \n where x like '%sql%'; */ CALL cat.system.func('${spark.extra.prop}')", + "/* {\"app\": \"dbt\", \"dbt_version\": \"1.0.1\", \"profile_name\": \"profile1\", \"target_name\": \"dev\", " + + "\"node_id\": \"model.profile1.stg_users\"} \n*/ CALL cat.system.func('${spark.extra.prop}')", + "/* Some multi-line comment \n" + + "*/ CALL /* inline comment */ cat.system.func('${spark.extra.prop}') -- ending comment", + "CALL -- a line ending comment\n" + "cat.system.func('${spark.extra.prop}')"); for (String sqlText : callStatementsWithComments) { CallStatement call = (CallStatement) parser.parsePlan(sqlText); - Assert.assertEquals(ImmutableList.of("cat", "system", "func"), JavaConverters.seqAsJavaList(call.name())); + Assert.assertEquals( + ImmutableList.of("cat", "system", "func"), JavaConverters.seqAsJavaList(call.name())); Assert.assertEquals(1, call.args().size()); @@ -164,12 +176,17 @@ public void testCallStripsComments() throws ParseException { } } - private void checkArg(CallStatement call, int index, Object expectedValue, DataType expectedType) { + private void checkArg( + CallStatement call, int index, Object expectedValue, DataType expectedType) { checkArg(call, index, null, expectedValue, expectedType); } - private void checkArg(CallStatement call, int index, String expectedName, - Object expectedValue, DataType expectedType) { + private void checkArg( + CallStatement call, + int index, + String expectedName, + Object expectedValue, + DataType expectedType) { if (expectedName != null) { NamedArgument arg = checkCast(call.args().apply(index), NamedArgument.class); @@ -190,7 +207,8 @@ private Literal toSparkLiteral(Object value, DataType dataType) { } private T checkCast(Object value, Class expectedClass) { - Assert.assertTrue("Expected instance of " + expectedClass.getName(), expectedClass.isInstance(value)); + Assert.assertTrue( + "Expected instance of " + expectedClass.getName(), expectedClass.isInstance(value)); return expectedClass.cast(value); } } diff --git a/spark/v3.2/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestCherrypickSnapshotProcedure.java b/spark/v3.2/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestCherrypickSnapshotProcedure.java index c69964693189..7309a176b922 100644 --- a/spark/v3.2/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestCherrypickSnapshotProcedure.java +++ b/spark/v3.2/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestCherrypickSnapshotProcedure.java @@ -16,9 +16,10 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.spark.extensions; +import static org.apache.iceberg.TableProperties.WRITE_AUDIT_PUBLISH_ENABLED; + import java.util.List; import java.util.Map; import org.apache.iceberg.AssertHelpers; @@ -34,11 +35,10 @@ import org.junit.After; import org.junit.Test; -import static org.apache.iceberg.TableProperties.WRITE_AUDIT_PUBLISH_ENABLED; - public class TestCherrypickSnapshotProcedure extends SparkExtensionsTestBase { - public TestCherrypickSnapshotProcedure(String catalogName, String implementation, Map config) { + public TestCherrypickSnapshotProcedure( + String catalogName, String implementation, Map config) { super(catalogName, implementation, config); } @@ -56,26 +56,30 @@ public void testCherrypickSnapshotUsingPositionalArgs() { sql("INSERT INTO TABLE %s VALUES (1, 'a')", tableName); - assertEquals("Should not see rows from staged snapshot", + assertEquals( + "Should not see rows from staged snapshot", ImmutableList.of(), sql("SELECT * FROM %s", tableName)); Table table = validationCatalog.loadTable(tableIdent); Snapshot wapSnapshot = Iterables.getOnlyElement(table.snapshots()); - List output = sql( - "CALL %s.system.cherrypick_snapshot('%s', %dL)", - catalogName, tableIdent, wapSnapshot.snapshotId()); + List output = + sql( + "CALL %s.system.cherrypick_snapshot('%s', %dL)", + catalogName, tableIdent, wapSnapshot.snapshotId()); table.refresh(); Snapshot currentSnapshot = table.currentSnapshot(); - assertEquals("Procedure output must match", + assertEquals( + "Procedure output must match", ImmutableList.of(row(wapSnapshot.snapshotId(), currentSnapshot.snapshotId())), output); - assertEquals("Cherrypick must be successful", + assertEquals( + "Cherrypick must be successful", ImmutableList.of(row(1L, "a")), sql("SELECT * FROM %s", tableName)); } @@ -89,26 +93,30 @@ public void testCherrypickSnapshotUsingNamedArgs() { sql("INSERT INTO TABLE %s VALUES (1, 'a')", tableName); - assertEquals("Should not see rows from staged snapshot", + assertEquals( + "Should not see rows from staged snapshot", ImmutableList.of(), sql("SELECT * FROM %s", tableName)); Table table = validationCatalog.loadTable(tableIdent); Snapshot wapSnapshot = Iterables.getOnlyElement(table.snapshots()); - List output = sql( - "CALL %s.system.cherrypick_snapshot(snapshot_id => %dL, table => '%s')", - catalogName, wapSnapshot.snapshotId(), tableIdent); + List output = + sql( + "CALL %s.system.cherrypick_snapshot(snapshot_id => %dL, table => '%s')", + catalogName, wapSnapshot.snapshotId(), tableIdent); table.refresh(); Snapshot currentSnapshot = table.currentSnapshot(); - assertEquals("Procedure output must match", + assertEquals( + "Procedure output must match", ImmutableList.of(row(wapSnapshot.snapshotId(), currentSnapshot.snapshotId())), output); - assertEquals("Cherrypick must be successful", + assertEquals( + "Cherrypick must be successful", ImmutableList.of(row(1L, "a")), sql("SELECT * FROM %s", tableName)); } @@ -129,17 +137,20 @@ public void testCherrypickSnapshotRefreshesRelationCache() { sql("INSERT INTO TABLE %s VALUES (1, 'a')", tableName); - assertEquals("Should not see rows from staged snapshot", + assertEquals( + "Should not see rows from staged snapshot", ImmutableList.of(), sql("SELECT * FROM %s", tableName)); Table table = validationCatalog.loadTable(tableIdent); Snapshot wapSnapshot = Iterables.getOnlyElement(table.snapshots()); - sql("CALL %s.system.cherrypick_snapshot('%s', %dL)", + sql( + "CALL %s.system.cherrypick_snapshot('%s', %dL)", catalogName, tableIdent, 
wapSnapshot.snapshotId()); - assertEquals("Cherrypick snapshot should be visible", + assertEquals( + "Cherrypick snapshot should be visible", ImmutableList.of(row(1L, "a")), sql("SELECT * FROM tmp")); @@ -150,31 +161,43 @@ public void testCherrypickSnapshotRefreshesRelationCache() { public void testCherrypickInvalidSnapshot() { sql("CREATE TABLE %s (id bigint NOT NULL, data string) USING iceberg", tableName); - AssertHelpers.assertThrows("Should reject invalid snapshot id", - ValidationException.class, "Cannot cherry-pick unknown snapshot ID", + AssertHelpers.assertThrows( + "Should reject invalid snapshot id", + ValidationException.class, + "Cannot cherry-pick unknown snapshot ID", () -> sql("CALL %s.system.cherrypick_snapshot('%s', -1L)", catalogName, tableIdent)); } @Test public void testInvalidCherrypickSnapshotCases() { - AssertHelpers.assertThrows("Should not allow mixed args", - AnalysisException.class, "Named and positional arguments cannot be mixed", + AssertHelpers.assertThrows( + "Should not allow mixed args", + AnalysisException.class, + "Named and positional arguments cannot be mixed", () -> sql("CALL %s.system.cherrypick_snapshot('n', table => 't', 1L)", catalogName)); - AssertHelpers.assertThrows("Should not resolve procedures in arbitrary namespaces", - NoSuchProcedureException.class, "not found", + AssertHelpers.assertThrows( + "Should not resolve procedures in arbitrary namespaces", + NoSuchProcedureException.class, + "not found", () -> sql("CALL %s.custom.cherrypick_snapshot('n', 't', 1L)", catalogName)); - AssertHelpers.assertThrows("Should reject calls without all required args", - AnalysisException.class, "Missing required parameters", + AssertHelpers.assertThrows( + "Should reject calls without all required args", + AnalysisException.class, + "Missing required parameters", () -> sql("CALL %s.system.cherrypick_snapshot('t')", catalogName)); - AssertHelpers.assertThrows("Should reject calls with empty table identifier", - IllegalArgumentException.class, "Cannot handle an empty identifier", + AssertHelpers.assertThrows( + "Should reject calls with empty table identifier", + IllegalArgumentException.class, + "Cannot handle an empty identifier", () -> sql("CALL %s.system.cherrypick_snapshot('', 1L)", catalogName)); - AssertHelpers.assertThrows("Should reject calls with invalid arg types", - AnalysisException.class, "Wrong arg type for snapshot_id: cannot cast", + AssertHelpers.assertThrows( + "Should reject calls with invalid arg types", + AnalysisException.class, + "Wrong arg type for snapshot_id: cannot cast", () -> sql("CALL %s.system.cherrypick_snapshot('t', 2.2)", catalogName)); } } diff --git a/spark/v3.2/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestConflictValidation.java b/spark/v3.2/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestConflictValidation.java index 4ce44818ab46..6fcbf1f903be 100644 --- a/spark/v3.2/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestConflictValidation.java +++ b/spark/v3.2/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestConflictValidation.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.spark.extensions; import java.util.List; @@ -38,17 +37,20 @@ public class TestConflictValidation extends SparkExtensionsTestBase { - public TestConflictValidation(String catalogName, String implementation, Map config) { + public TestConflictValidation( + String catalogName, String implementation, Map config) { super(catalogName, implementation, config); } @Before public void createTables() { - sql("CREATE TABLE %s (id int, data string) USING iceberg " + - "PARTITIONED BY (id)" + - "TBLPROPERTIES" + - "('format-version'='2'," + - "'write.delete.mode'='merge-on-read')", tableName); + sql( + "CREATE TABLE %s (id int, data string) USING iceberg " + + "PARTITIONED BY (id)" + + "TBLPROPERTIES" + + "('format-version'='2'," + + "'write.delete.mode'='merge-on-read')", + tableName); sql("INSERT INTO %s VALUES (1, 'a'), (2, 'b'), (3, 'c')", tableName); } @@ -67,12 +69,14 @@ public void testOverwriteFilterSerializableIsolation() throws Exception { // Validating from previous snapshot finds conflicts Dataset conflictingDf = spark.createDataFrame(records, SimpleRecord.class); - AssertHelpers.assertThrowsCause("Conflicting new data files should throw exception", + AssertHelpers.assertThrowsCause( + "Conflicting new data files should throw exception", ValidationException.class, "Found conflicting files that can contain records matching ref(name=\"id\") == 1:", () -> { try { - conflictingDf.writeTo(tableName) + conflictingDf + .writeTo(tableName) .option(SparkWriteOptions.VALIDATE_FROM_SNAPSHOT_ID, String.valueOf(snapshotId)) .option(SparkWriteOptions.ISOLATION_LEVEL, IsolationLevel.SERIALIZABLE.toString()) .overwrite(functions.col("id").equalTo(1)); @@ -84,7 +88,8 @@ public void testOverwriteFilterSerializableIsolation() throws Exception { // Validating from latest snapshot should succeed table.refresh(); long newSnapshotId = table.currentSnapshot().snapshotId(); - conflictingDf.writeTo(tableName) + conflictingDf + .writeTo(tableName) .option(SparkWriteOptions.VALIDATE_FROM_SNAPSHOT_ID, String.valueOf(newSnapshotId)) .option(SparkWriteOptions.ISOLATION_LEVEL, IsolationLevel.SERIALIZABLE.toString()) .overwrite(functions.col("id").equalTo(1)); @@ -92,9 +97,8 @@ public void testOverwriteFilterSerializableIsolation() throws Exception { @Test public void testOverwriteFilterSerializableIsolation2() throws Exception { - List records = Lists.newArrayList( - new SimpleRecord(1, "a"), - new SimpleRecord(1, "b")); + List records = + Lists.newArrayList(new SimpleRecord(1, "a"), new SimpleRecord(1, "b")); spark.createDataFrame(records, SimpleRecord.class).coalesce(1).writeTo(tableName).append(); Table table = validationCatalog.loadTable(tableIdent); @@ -107,12 +111,14 @@ public void testOverwriteFilterSerializableIsolation2() throws Exception { // Validating from previous snapshot finds conflicts List conflictingRecords = Lists.newArrayList(new SimpleRecord(1, "a")); Dataset conflictingDf = spark.createDataFrame(conflictingRecords, SimpleRecord.class); - AssertHelpers.assertThrowsCause("Conflicting new delete files should throw exception", + AssertHelpers.assertThrowsCause( + "Conflicting new delete files should throw exception", ValidationException.class, "Found new conflicting delete files that can apply to records matching ref(name=\"id\") == 1:", () -> { try { - conflictingDf.writeTo(tableName) + conflictingDf + .writeTo(tableName) .option(SparkWriteOptions.VALIDATE_FROM_SNAPSHOT_ID, String.valueOf(snapshotId)) .option(SparkWriteOptions.ISOLATION_LEVEL, 
IsolationLevel.SNAPSHOT.toString()) .overwrite(functions.col("id").equalTo(1)); @@ -124,7 +130,8 @@ public void testOverwriteFilterSerializableIsolation2() throws Exception { // Validating from latest snapshot should succeed table.refresh(); long newSnapshotId = table.currentSnapshot().snapshotId(); - conflictingDf.writeTo(tableName) + conflictingDf + .writeTo(tableName) .option(SparkWriteOptions.VALIDATE_FROM_SNAPSHOT_ID, String.valueOf(newSnapshotId)) .option(SparkWriteOptions.ISOLATION_LEVEL, IsolationLevel.SERIALIZABLE.toString()) .overwrite(functions.col("id").equalTo(1)); @@ -142,12 +149,14 @@ public void testOverwriteFilterSerializableIsolation3() throws Exception { // Validating from previous snapshot finds conflicts List conflictingRecords = Lists.newArrayList(new SimpleRecord(1, "a")); Dataset conflictingDf = spark.createDataFrame(conflictingRecords, SimpleRecord.class); - AssertHelpers.assertThrowsCause("Conflicting deleted data files should throw exception", + AssertHelpers.assertThrowsCause( + "Conflicting deleted data files should throw exception", ValidationException.class, "Found conflicting deleted files that can contain records matching ref(name=\"id\") == 1:", () -> { try { - conflictingDf.writeTo(tableName) + conflictingDf + .writeTo(tableName) .option(SparkWriteOptions.VALIDATE_FROM_SNAPSHOT_ID, String.valueOf(snapshotId)) .option(SparkWriteOptions.ISOLATION_LEVEL, IsolationLevel.SERIALIZABLE.toString()) .overwrite(functions.col("id").equalTo(1)); @@ -159,7 +168,8 @@ public void testOverwriteFilterSerializableIsolation3() throws Exception { // Validating from latest snapshot should succeed table.refresh(); long newSnapshotId = table.currentSnapshot().snapshotId(); - conflictingDf.writeTo(tableName) + conflictingDf + .writeTo(tableName) .option(SparkWriteOptions.VALIDATE_FROM_SNAPSHOT_ID, String.valueOf(newSnapshotId)) .option(SparkWriteOptions.ISOLATION_LEVEL, IsolationLevel.SERIALIZABLE.toString()) .overwrite(functions.col("id").equalTo(1)); @@ -174,12 +184,14 @@ public void testOverwriteFilterNoSnapshotIdValidation() throws Exception { // Validating from no snapshot id defaults to beginning snapshot id and finds conflicts Dataset conflictingDf = spark.createDataFrame(records, SimpleRecord.class); - AssertHelpers.assertThrowsCause("Conflicting new data files should throw exception", + AssertHelpers.assertThrowsCause( + "Conflicting new data files should throw exception", ValidationException.class, "Found conflicting files that can contain records matching ref(name=\"id\") == 1:", () -> { try { - conflictingDf.writeTo(tableName) + conflictingDf + .writeTo(tableName) .option(SparkWriteOptions.ISOLATION_LEVEL, IsolationLevel.SERIALIZABLE.toString()) .overwrite(functions.col("id").equalTo(1)); } catch (NoSuchTableException e) { @@ -190,7 +202,8 @@ public void testOverwriteFilterNoSnapshotIdValidation() throws Exception { // Validating from latest snapshot should succeed table.refresh(); long newSnapshotId = table.currentSnapshot().snapshotId(); - conflictingDf.writeTo(tableName) + conflictingDf + .writeTo(tableName) .option(SparkWriteOptions.VALIDATE_FROM_SNAPSHOT_ID, String.valueOf(newSnapshotId)) .option(SparkWriteOptions.ISOLATION_LEVEL, IsolationLevel.SERIALIZABLE.toString()) .overwrite(functions.col("id").equalTo(1)); @@ -198,9 +211,8 @@ public void testOverwriteFilterNoSnapshotIdValidation() throws Exception { @Test public void testOverwriteFilterSnapshotIsolation() throws Exception { - List records = Lists.newArrayList( - new SimpleRecord(1, "a"), - new 
SimpleRecord(1, "b")); + List records = + Lists.newArrayList(new SimpleRecord(1, "a"), new SimpleRecord(1, "b")); spark.createDataFrame(records, SimpleRecord.class).coalesce(1).writeTo(tableName).append(); Table table = validationCatalog.loadTable(tableIdent); @@ -213,12 +225,14 @@ public void testOverwriteFilterSnapshotIsolation() throws Exception { // Validating from previous snapshot finds conflicts List conflictingRecords = Lists.newArrayList(new SimpleRecord(1, "a")); Dataset conflictingDf = spark.createDataFrame(conflictingRecords, SimpleRecord.class); - AssertHelpers.assertThrowsCause("Conflicting new delete files should throw exception", + AssertHelpers.assertThrowsCause( + "Conflicting new delete files should throw exception", ValidationException.class, "Found new conflicting delete files that can apply to records matching ref(name=\"id\") == 1:", () -> { try { - conflictingDf.writeTo(tableName) + conflictingDf + .writeTo(tableName) .option(SparkWriteOptions.VALIDATE_FROM_SNAPSHOT_ID, String.valueOf(snapshotId)) .option(SparkWriteOptions.ISOLATION_LEVEL, IsolationLevel.SNAPSHOT.toString()) .overwrite(functions.col("id").equalTo(1)); @@ -230,7 +244,8 @@ public void testOverwriteFilterSnapshotIsolation() throws Exception { // Validating from latest snapshot should succeed table.refresh(); long newSnapshotId = table.currentSnapshot().snapshotId(); - conflictingDf.writeTo(tableName) + conflictingDf + .writeTo(tableName) .option(SparkWriteOptions.VALIDATE_FROM_SNAPSHOT_ID, String.valueOf(newSnapshotId)) .option(SparkWriteOptions.ISOLATION_LEVEL, IsolationLevel.SNAPSHOT.toString()) .overwrite(functions.col("id").equalTo(1)); @@ -241,13 +256,13 @@ public void testOverwriteFilterSnapshotIsolation2() throws Exception { Table table = validationCatalog.loadTable(tableIdent); long snapshotId = table.currentSnapshot().snapshotId(); - List records = Lists.newArrayList( - new SimpleRecord(1, "a")); + List records = Lists.newArrayList(new SimpleRecord(1, "a")); spark.createDataFrame(records, SimpleRecord.class).writeTo(tableName).append(); // Validation should not fail due to conflicting data file in snapshot isolation mode Dataset conflictingDf = spark.createDataFrame(records, SimpleRecord.class); - conflictingDf.writeTo(tableName) + conflictingDf + .writeTo(tableName) .option(SparkWriteOptions.VALIDATE_FROM_SNAPSHOT_ID, String.valueOf(snapshotId)) .option(SparkWriteOptions.ISOLATION_LEVEL, IsolationLevel.SNAPSHOT.toString()) .overwrite(functions.col("id").equalTo(1)); @@ -263,12 +278,14 @@ public void testOverwritePartitionSerializableIsolation() throws Exception { // Validating from previous snapshot finds conflicts Dataset conflictingDf = spark.createDataFrame(records, SimpleRecord.class); - AssertHelpers.assertThrowsCause("Conflicting deleted data files should throw exception", + AssertHelpers.assertThrowsCause( + "Conflicting deleted data files should throw exception", ValidationException.class, "Found conflicting files that can contain records matching partitions [id=1]", () -> { try { - conflictingDf.writeTo(tableName) + conflictingDf + .writeTo(tableName) .option(SparkWriteOptions.VALIDATE_FROM_SNAPSHOT_ID, String.valueOf(snapshotId)) .option(SparkWriteOptions.ISOLATION_LEVEL, IsolationLevel.SERIALIZABLE.toString()) .overwritePartitions(); @@ -280,7 +297,8 @@ public void testOverwritePartitionSerializableIsolation() throws Exception { // Validating from latest snapshot should succeed table.refresh(); long newSnapshotId = table.currentSnapshot().snapshotId(); - 
conflictingDf.writeTo(tableName) + conflictingDf + .writeTo(tableName) .option(SparkWriteOptions.VALIDATE_FROM_SNAPSHOT_ID, String.valueOf(newSnapshotId)) .option(SparkWriteOptions.ISOLATION_LEVEL, IsolationLevel.SERIALIZABLE.toString()) .overwritePartitions(); @@ -288,9 +306,8 @@ public void testOverwritePartitionSerializableIsolation() throws Exception { @Test public void testOverwritePartitionSnapshotIsolation() throws Exception { - List records = Lists.newArrayList( - new SimpleRecord(1, "a"), - new SimpleRecord(1, "b")); + List records = + Lists.newArrayList(new SimpleRecord(1, "a"), new SimpleRecord(1, "b")); spark.createDataFrame(records, SimpleRecord.class).coalesce(1).writeTo(tableName).append(); Table table = validationCatalog.loadTable(tableIdent); @@ -301,12 +318,14 @@ public void testOverwritePartitionSnapshotIsolation() throws Exception { // Validating from previous snapshot finds conflicts Dataset conflictingDf = spark.createDataFrame(records, SimpleRecord.class); - AssertHelpers.assertThrowsCause("Conflicting deleted data files should throw exception", + AssertHelpers.assertThrowsCause( + "Conflicting deleted data files should throw exception", ValidationException.class, "Found new conflicting delete files that can apply to records matching [id=1]", () -> { try { - conflictingDf.writeTo(tableName) + conflictingDf + .writeTo(tableName) .option(SparkWriteOptions.VALIDATE_FROM_SNAPSHOT_ID, String.valueOf(snapshotId)) .option(SparkWriteOptions.ISOLATION_LEVEL, IsolationLevel.SNAPSHOT.toString()) .overwritePartitions(); @@ -318,7 +337,8 @@ public void testOverwritePartitionSnapshotIsolation() throws Exception { // Validating from latest snapshot should succeed table.refresh(); long newSnapshotId = table.currentSnapshot().snapshotId(); - conflictingDf.writeTo(tableName) + conflictingDf + .writeTo(tableName) .option(SparkWriteOptions.VALIDATE_FROM_SNAPSHOT_ID, String.valueOf(newSnapshotId)) .option(SparkWriteOptions.ISOLATION_LEVEL, IsolationLevel.SNAPSHOT.toString()) .overwritePartitions(); @@ -333,20 +353,21 @@ public void testOverwritePartitionSnapshotIsolation2() throws Exception { sql("DELETE FROM %s WHERE id='1'", tableName); // Validating from previous snapshot finds conflicts - List records = Lists.newArrayList( - new SimpleRecord(1, "a")); + List records = Lists.newArrayList(new SimpleRecord(1, "a")); spark.createDataFrame(records, SimpleRecord.class).coalesce(1).writeTo(tableName).append(); Dataset conflictingDf = spark.createDataFrame(records, SimpleRecord.class); - AssertHelpers.assertThrowsCause("Conflicting deleted data files should throw exception", + AssertHelpers.assertThrowsCause( + "Conflicting deleted data files should throw exception", ValidationException.class, "Found conflicting deleted files that can apply to records matching [id=1]", () -> { try { - conflictingDf.writeTo(tableName) - .option(SparkWriteOptions.VALIDATE_FROM_SNAPSHOT_ID, String.valueOf(snapshotId)) - .option(SparkWriteOptions.ISOLATION_LEVEL, IsolationLevel.SNAPSHOT.toString()) - .overwritePartitions(); + conflictingDf + .writeTo(tableName) + .option(SparkWriteOptions.VALIDATE_FROM_SNAPSHOT_ID, String.valueOf(snapshotId)) + .option(SparkWriteOptions.ISOLATION_LEVEL, IsolationLevel.SNAPSHOT.toString()) + .overwritePartitions(); } catch (NoSuchTableException e) { throw new RuntimeException(e); } @@ -355,7 +376,8 @@ public void testOverwritePartitionSnapshotIsolation2() throws Exception { // Validating from latest snapshot should succeed table.refresh(); long newSnapshotId = 
table.currentSnapshot().snapshotId(); - conflictingDf.writeTo(tableName) + conflictingDf + .writeTo(tableName) .option(SparkWriteOptions.VALIDATE_FROM_SNAPSHOT_ID, String.valueOf(newSnapshotId)) .option(SparkWriteOptions.ISOLATION_LEVEL, IsolationLevel.SNAPSHOT.toString()) .overwritePartitions(); @@ -371,7 +393,8 @@ public void testOverwritePartitionSnapshotIsolation3() throws Exception { // Validation should not find conflicting data file in snapshot isolation mode Dataset conflictingDf = spark.createDataFrame(records, SimpleRecord.class); - conflictingDf.writeTo(tableName) + conflictingDf + .writeTo(tableName) .option(SparkWriteOptions.VALIDATE_FROM_SNAPSHOT_ID, String.valueOf(snapshotId)) .option(SparkWriteOptions.ISOLATION_LEVEL, IsolationLevel.SNAPSHOT.toString()) .overwritePartitions(); @@ -381,18 +404,19 @@ public void testOverwritePartitionSnapshotIsolation3() throws Exception { public void testOverwritePartitionNoSnapshotIdValidation() throws Exception { Table table = validationCatalog.loadTable(tableIdent); - List records = Lists.newArrayList( - new SimpleRecord(1, "a")); + List records = Lists.newArrayList(new SimpleRecord(1, "a")); spark.createDataFrame(records, SimpleRecord.class).writeTo(tableName).append(); // Validating from null snapshot is equivalent to validating from beginning Dataset conflictingDf = spark.createDataFrame(records, SimpleRecord.class); - AssertHelpers.assertThrowsCause("Conflicting deleted data files should throw exception", + AssertHelpers.assertThrowsCause( + "Conflicting deleted data files should throw exception", ValidationException.class, "Found conflicting files that can contain records matching partitions [id=1]", () -> { try { - conflictingDf.writeTo(tableName) + conflictingDf + .writeTo(tableName) .option(SparkWriteOptions.ISOLATION_LEVEL, IsolationLevel.SERIALIZABLE.toString()) .overwritePartitions(); } catch (NoSuchTableException e) { @@ -403,7 +427,8 @@ public void testOverwritePartitionNoSnapshotIdValidation() throws Exception { // Validating from latest snapshot should succeed table.refresh(); long snapshotId = table.currentSnapshot().snapshotId(); - conflictingDf.writeTo(tableName) + conflictingDf + .writeTo(tableName) .option(SparkWriteOptions.VALIDATE_FROM_SNAPSHOT_ID, String.valueOf(snapshotId)) .option(SparkWriteOptions.ISOLATION_LEVEL, IsolationLevel.SERIALIZABLE.toString()) .overwritePartitions(); diff --git a/spark/v3.2/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestCopyOnWriteDelete.java b/spark/v3.2/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestCopyOnWriteDelete.java index c9d15906251f..8a8a8c6ab722 100644 --- a/spark/v3.2/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestCopyOnWriteDelete.java +++ b/spark/v3.2/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestCopyOnWriteDelete.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.spark.extensions; import java.util.Map; @@ -25,8 +24,13 @@ public class TestCopyOnWriteDelete extends TestDelete { - public TestCopyOnWriteDelete(String catalogName, String implementation, Map config, - String fileFormat, Boolean vectorized, String distributionMode) { + public TestCopyOnWriteDelete( + String catalogName, + String implementation, + Map config, + String fileFormat, + Boolean vectorized, + String distributionMode) { super(catalogName, implementation, config, fileFormat, vectorized, distributionMode); } diff --git a/spark/v3.2/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestCopyOnWriteMerge.java b/spark/v3.2/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestCopyOnWriteMerge.java index 60aba632646f..27cbd1a9d5de 100644 --- a/spark/v3.2/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestCopyOnWriteMerge.java +++ b/spark/v3.2/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestCopyOnWriteMerge.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.extensions; import java.util.Map; @@ -25,8 +24,13 @@ public class TestCopyOnWriteMerge extends TestMerge { - public TestCopyOnWriteMerge(String catalogName, String implementation, Map config, - String fileFormat, boolean vectorized, String distributionMode) { + public TestCopyOnWriteMerge( + String catalogName, + String implementation, + Map config, + String fileFormat, + boolean vectorized, + String distributionMode) { super(catalogName, implementation, config, fileFormat, vectorized, distributionMode); } diff --git a/spark/v3.2/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestCopyOnWriteUpdate.java b/spark/v3.2/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestCopyOnWriteUpdate.java index cc73ecba9ddf..3fa3f74f6a39 100644 --- a/spark/v3.2/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestCopyOnWriteUpdate.java +++ b/spark/v3.2/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestCopyOnWriteUpdate.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.extensions; import java.util.Map; @@ -25,8 +24,13 @@ public class TestCopyOnWriteUpdate extends TestUpdate { - public TestCopyOnWriteUpdate(String catalogName, String implementation, Map config, - String fileFormat, boolean vectorized, String distributionMode) { + public TestCopyOnWriteUpdate( + String catalogName, + String implementation, + Map config, + String fileFormat, + boolean vectorized, + String distributionMode) { super(catalogName, implementation, config, fileFormat, vectorized, distributionMode); } diff --git a/spark/v3.2/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestDelete.java b/spark/v3.2/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestDelete.java index d492a30eb827..e91dfa9e90ba 100644 --- a/spark/v3.2/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestDelete.java +++ b/spark/v3.2/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestDelete.java @@ -16,9 +16,16 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.spark.extensions; +import static org.apache.iceberg.RowLevelOperationMode.COPY_ON_WRITE; +import static org.apache.iceberg.TableProperties.DELETE_ISOLATION_LEVEL; +import static org.apache.iceberg.TableProperties.DELETE_MODE; +import static org.apache.iceberg.TableProperties.DELETE_MODE_DEFAULT; +import static org.apache.iceberg.TableProperties.PARQUET_ROW_GROUP_SIZE_BYTES; +import static org.apache.iceberg.TableProperties.SPLIT_SIZE; +import static org.apache.spark.sql.functions.lit; + import java.util.Arrays; import java.util.List; import java.util.Map; @@ -57,18 +64,15 @@ import org.junit.BeforeClass; import org.junit.Test; -import static org.apache.iceberg.RowLevelOperationMode.COPY_ON_WRITE; -import static org.apache.iceberg.TableProperties.DELETE_ISOLATION_LEVEL; -import static org.apache.iceberg.TableProperties.DELETE_MODE; -import static org.apache.iceberg.TableProperties.DELETE_MODE_DEFAULT; -import static org.apache.iceberg.TableProperties.PARQUET_ROW_GROUP_SIZE_BYTES; -import static org.apache.iceberg.TableProperties.SPLIT_SIZE; -import static org.apache.spark.sql.functions.lit; - public abstract class TestDelete extends SparkRowLevelOperationsTestBase { - public TestDelete(String catalogName, String implementation, Map config, - String fileFormat, Boolean vectorized, String distributionMode) { + public TestDelete( + String catalogName, + String implementation, + Map config, + String fileFormat, + Boolean vectorized, + String distributionMode) { super(catalogName, implementation, config, fileFormat, vectorized, distributionMode); } @@ -101,9 +105,11 @@ public void testDeleteFileThenMetadataDelete() throws Exception { sql("DELETE FROM %s AS t WHERE t.id = 1", tableName); Set dataFilesAfter = TestHelpers.dataFiles(table); - Assert.assertTrue("Data file should have been removed", dataFilesBefore.size() > dataFilesAfter.size()); + Assert.assertTrue( + "Data file should have been removed", dataFilesBefore.size() > dataFilesAfter.size()); - assertEquals("Should have expected rows", + assertEquals( + "Should have expected rows", ImmutableList.of(row(2, "hardware")), sql("SELECT * FROM %s ORDER BY id", tableName)); } @@ -119,7 +125,8 @@ public void testDeleteWithFalseCondition() { Table table = validationCatalog.loadTable(tableIdent); Assert.assertEquals("Should have 2 snapshots", 2, Iterables.size(table.snapshots())); - assertEquals("Should have expected rows", + assertEquals( + "Should have expected rows", ImmutableList.of(row(1, "hr"), row(2, "hardware")), sql("SELECT * FROM %s ORDER BY id", tableName)); } @@ -134,7 +141,8 @@ public void testDeleteFromEmptyTable() { Table table = validationCatalog.loadTable(tableIdent); Assert.assertEquals("Should have 2 snapshots", 2, Iterables.size(table.snapshots())); - assertEquals("Should have expected rows", + assertEquals( + "Should have expected rows", ImmutableList.of(), sql("SELECT * FROM %s ORDER BY id", tableName)); } @@ -152,7 +160,8 @@ public void testExplain() { Table table = validationCatalog.loadTable(tableIdent); Assert.assertEquals("Should have 1 snapshot", 1, Iterables.size(table.snapshots())); - assertEquals("Should have expected rows", + assertEquals( + "Should have expected rows", ImmutableList.of(row(1, "hr"), row(2, "hardware"), row(null, "hr")), sql("SELECT * FROM %s ORDER BY id ASC NULLS LAST", tableName)); } @@ -165,7 +174,8 @@ public void testDeleteWithAlias() { sql("DELETE FROM %s AS t WHERE t.id IS NULL", tableName); - assertEquals("Should have expected rows", + assertEquals( + "Should 
have expected rows", ImmutableList.of(row(1, "hr"), row(2, "hardware")), sql("SELECT * FROM %s ORDER BY id", tableName)); } @@ -189,7 +199,8 @@ public void testDeleteWithDynamicFileFiltering() throws NoSuchTableException { validateMergeOnRead(currentSnapshot, "1", "1", null); } - assertEquals("Should have expected rows", + assertEquals( + "Should have expected rows", ImmutableList.of(row(1, "hardware"), row(1, "hr"), row(3, "hr")), sql("SELECT * FROM %s ORDER BY id, dep", tableName)); } @@ -217,7 +228,8 @@ public void testDeleteNonExistingRecords() { } } - assertEquals("Should have expected rows", + assertEquals( + "Should have expected rows", ImmutableList.of(row(1, "hr"), row(2, "hardware"), row(null, "hr")), sql("SELECT * FROM %s ORDER BY id ASC NULLS LAST", tableName)); } @@ -239,9 +251,8 @@ public void testDeleteWithoutCondition() { Snapshot currentSnapshot = table.currentSnapshot(); validateDelete(currentSnapshot, "2", "3"); - assertEquals("Should have expected rows", - ImmutableList.of(), - sql("SELECT * FROM %s", tableName)); + assertEquals( + "Should have expected rows", ImmutableList.of(), sql("SELECT * FROM %s", tableName)); } @Test @@ -261,7 +272,8 @@ public void testDeleteUsingMetadataWithComplexCondition() { Snapshot currentSnapshot = table.currentSnapshot(); validateDelete(currentSnapshot, "2", "2"); - assertEquals("Should have expected rows", + assertEquals( + "Should have expected rows", ImmutableList.of(row(1, "dep1")), sql("SELECT * FROM %s", tableName)); } @@ -288,7 +300,8 @@ public void testDeleteWithArbitraryPartitionPredicates() { validateMergeOnRead(currentSnapshot, "1", "1", null); } - assertEquals("Should have expected rows", + assertEquals( + "Should have expected rows", ImmutableList.of(row(1, "hr"), row(null, "hr")), sql("SELECT * FROM %s ORDER BY id ASC NULLS LAST", tableName)); } @@ -299,8 +312,10 @@ public void testDeleteWithNonDeterministicCondition() { sql("INSERT INTO TABLE %s VALUES (1, 'hr'), (2, 'hardware')", tableName); - AssertHelpers.assertThrows("Should complain about non-deterministic expressions", - AnalysisException.class, "nondeterministic expressions are only allowed", + AssertHelpers.assertThrows( + "Should complain about non-deterministic expressions", + AnalysisException.class, + "nondeterministic expressions are only allowed", () -> sql("DELETE FROM %s WHERE id = 1 AND rand() > 0.5", tableName)); } @@ -312,25 +327,29 @@ public void testDeleteWithFoldableConditions() { // should keep all rows and don't trigger execution sql("DELETE FROM %s WHERE false", tableName); - assertEquals("Should have expected rows", + assertEquals( + "Should have expected rows", ImmutableList.of(row(1, "hr"), row(2, "hardware")), sql("SELECT * FROM %s ORDER BY id", tableName)); // should keep all rows and don't trigger execution sql("DELETE FROM %s WHERE 50 <> 50", tableName); - assertEquals("Should have expected rows", + assertEquals( + "Should have expected rows", ImmutableList.of(row(1, "hr"), row(2, "hardware")), sql("SELECT * FROM %s ORDER BY id", tableName)); // should keep all rows and don't trigger execution sql("DELETE FROM %s WHERE 1 > null", tableName); - assertEquals("Should have expected rows", + assertEquals( + "Should have expected rows", ImmutableList.of(row(1, "hr"), row(2, "hardware")), sql("SELECT * FROM %s ORDER BY id", tableName)); // should remove all rows sql("DELETE FROM %s WHERE 21 = 21", tableName); - assertEquals("Should have expected rows", + assertEquals( + "Should have expected rows", ImmutableList.of(), sql("SELECT * FROM %s ORDER BY 
id", tableName)); @@ -342,24 +361,29 @@ public void testDeleteWithFoldableConditions() { public void testDeleteWithNullConditions() { createAndInitPartitionedTable(); - sql("INSERT INTO TABLE %s VALUES (0, null), (1, 'hr'), (2, 'hardware'), (null, 'hr')", tableName); + sql( + "INSERT INTO TABLE %s VALUES (0, null), (1, 'hr'), (2, 'hardware'), (null, 'hr')", + tableName); // should keep all rows as null is never equal to null sql("DELETE FROM %s WHERE dep = null", tableName); - assertEquals("Should have expected rows", + assertEquals( + "Should have expected rows", ImmutableList.of(row(0, null), row(1, "hr"), row(2, "hardware"), row(null, "hr")), sql("SELECT * FROM %s ORDER BY id ASC NULLS LAST", tableName)); // null = 'software' -> null // should delete using metadata operation only sql("DELETE FROM %s WHERE dep = 'software'", tableName); - assertEquals("Should have expected rows", + assertEquals( + "Should have expected rows", ImmutableList.of(row(0, null), row(1, "hr"), row(2, "hardware"), row(null, "hr")), sql("SELECT * FROM %s ORDER BY id ASC NULLS LAST", tableName)); // should delete using metadata operation only sql("DELETE FROM %s WHERE dep <=> NULL", tableName); - assertEquals("Should have expected rows", + assertEquals( + "Should have expected rows", ImmutableList.of(row(1, "hr"), row(2, "hardware"), row(null, "hr")), sql("SELECT * FROM %s ORDER BY id ASC NULLS LAST", tableName)); @@ -377,17 +401,20 @@ public void testDeleteWithInAndNotInConditions() { sql("INSERT INTO TABLE %s VALUES (1, 'hr'), (2, 'hardware'), (null, 'hr')", tableName); sql("DELETE FROM %s WHERE id IN (1, null)", tableName); - assertEquals("Should have expected rows", + assertEquals( + "Should have expected rows", ImmutableList.of(row(2, "hardware"), row(null, "hr")), sql("SELECT * FROM %s ORDER BY id ASC NULLS LAST", tableName)); sql("DELETE FROM %s WHERE id NOT IN (null, 1)", tableName); - assertEquals("Should have expected rows", + assertEquals( + "Should have expected rows", ImmutableList.of(row(2, "hardware"), row(null, "hr")), sql("SELECT * FROM %s ORDER BY id ASC NULLS LAST", tableName)); sql("DELETE FROM %s WHERE id NOT IN (1, 10)", tableName); - assertEquals("Should have expected rows", + assertEquals( + "Should have expected rows", ImmutableList.of(row(null, "hr")), sql("SELECT * FROM %s ORDER BY id ASC NULLS LAST", tableName)); } @@ -398,16 +425,20 @@ public void testDeleteWithMultipleRowGroupsParquet() throws NoSuchTableException createAndInitPartitionedTable(); - sql("ALTER TABLE %s SET TBLPROPERTIES('%s' '%d')", tableName, PARQUET_ROW_GROUP_SIZE_BYTES, 100); + sql( + "ALTER TABLE %s SET TBLPROPERTIES('%s' '%d')", + tableName, PARQUET_ROW_GROUP_SIZE_BYTES, 100); sql("ALTER TABLE %s SET TBLPROPERTIES('%s' '%d')", tableName, SPLIT_SIZE, 100); List ids = Lists.newArrayListWithCapacity(200); for (int id = 1; id <= 200; id++) { ids.add(id); } - Dataset df = spark.createDataset(ids, Encoders.INT()) - .withColumnRenamed("value", "id") - .withColumn("dep", lit("hr")); + Dataset df = + spark + .createDataset(ids, Encoders.INT()) + .withColumnRenamed("value", "id") + .withColumn("dep", lit("hr")); df.coalesce(1).writeTo(tableName).append(); Assert.assertEquals(200, spark.table(tableName).count()); @@ -426,14 +457,12 @@ public void testDeleteWithConditionOnNestedColumn() { sql("INSERT INTO TABLE %s VALUES (2, named_struct(\"c1\", 2, \"c2\", \"v2\"))", tableName); sql("DELETE FROM %s WHERE complex.c1 = id + 2", tableName); - assertEquals("Should have expected rows", - ImmutableList.of(row(2)), - sql("SELECT id 
FROM %s", tableName)); + assertEquals( + "Should have expected rows", ImmutableList.of(row(2)), sql("SELECT id FROM %s", tableName)); sql("DELETE FROM %s t WHERE t.complex.c1 = id", tableName); - assertEquals("Should have expected rows", - ImmutableList.of(), - sql("SELECT id FROM %s", tableName)); + assertEquals( + "Should have expected rows", ImmutableList.of(), sql("SELECT id FROM %s", tableName)); } @Test @@ -445,28 +474,35 @@ public void testDeleteWithInSubquery() throws NoSuchTableException { createOrReplaceView("deleted_id", Arrays.asList(0, 1, null), Encoders.INT()); createOrReplaceView("deleted_dep", Arrays.asList("software", "hr"), Encoders.STRING()); - sql("DELETE FROM %s WHERE id IN (SELECT * FROM deleted_id) AND dep IN (SELECT * from deleted_dep)", tableName); - assertEquals("Should have expected rows", + sql( + "DELETE FROM %s WHERE id IN (SELECT * FROM deleted_id) AND dep IN (SELECT * from deleted_dep)", + tableName); + assertEquals( + "Should have expected rows", ImmutableList.of(row(2, "hardware"), row(null, "hr")), sql("SELECT * FROM %s ORDER BY id ASC NULLS LAST", tableName)); append(new Employee(1, "hr"), new Employee(-1, "hr")); - assertEquals("Should have expected rows", + assertEquals( + "Should have expected rows", ImmutableList.of(row(-1, "hr"), row(1, "hr"), row(2, "hardware"), row(null, "hr")), sql("SELECT * FROM %s ORDER BY id ASC NULLS LAST", tableName)); sql("DELETE FROM %s WHERE id IS NULL OR id IN (SELECT value + 2 FROM deleted_id)", tableName); - assertEquals("Should have expected rows", + assertEquals( + "Should have expected rows", ImmutableList.of(row(-1, "hr"), row(1, "hr")), sql("SELECT * FROM %s ORDER BY id", tableName)); append(new Employee(null, "hr"), new Employee(2, "hr")); - assertEquals("Should have expected rows", + assertEquals( + "Should have expected rows", ImmutableList.of(row(-1, "hr"), row(1, "hr"), row(2, "hr"), row(null, "hr")), sql("SELECT * FROM %s ORDER BY id ASC NULLS LAST", tableName)); sql("DELETE FROM %s WHERE id IN (SELECT value + 2 FROM deleted_id) AND dep = 'hr'", tableName); - assertEquals("Should have expected rows", + assertEquals( + "Should have expected rows", ImmutableList.of(row(-1, "hr"), row(1, "hr"), row(null, "hr")), sql("SELECT * FROM %s ORDER BY id ASC NULLS LAST", tableName)); } @@ -477,11 +513,13 @@ public void testDeleteWithMultiColumnInSubquery() throws NoSuchTableException { append(new Employee(1, "hr"), new Employee(2, "hardware"), new Employee(null, "hr")); - List deletedEmployees = Arrays.asList(new Employee(null, "hr"), new Employee(1, "hr")); + List deletedEmployees = + Arrays.asList(new Employee(null, "hr"), new Employee(1, "hr")); createOrReplaceView("deleted_employee", deletedEmployees, Encoders.bean(Employee.class)); sql("DELETE FROM %s WHERE (id, dep) IN (SELECT id, dep FROM deleted_employee)", tableName); - assertEquals("Should have expected rows", + assertEquals( + "Should have expected rows", ImmutableList.of(row(2, "hardware"), row(null, "hr")), sql("SELECT * FROM %s ORDER BY id ASC NULLS LAST", tableName)); } @@ -497,36 +535,50 @@ public void testDeleteWithNotInSubquery() throws NoSuchTableException { // the file filter subquery (nested loop lef-anti join) returns 0 records sql("DELETE FROM %s WHERE id NOT IN (SELECT * FROM deleted_id)", tableName); - assertEquals("Should have expected rows", + assertEquals( + "Should have expected rows", ImmutableList.of(row(1, "hr"), row(2, "hardware"), row(null, "hr")), sql("SELECT * FROM %s ORDER BY id ASC NULLS LAST", tableName)); - sql("DELETE FROM %s 
WHERE id NOT IN (SELECT * FROM deleted_id WHERE value IS NOT NULL)", tableName); - assertEquals("Should have expected rows", + sql( + "DELETE FROM %s WHERE id NOT IN (SELECT * FROM deleted_id WHERE value IS NOT NULL)", + tableName); + assertEquals( + "Should have expected rows", ImmutableList.of(row(null, "hr")), sql("SELECT * FROM %s ORDER BY id ASC NULLS LAST", tableName)); sql("INSERT INTO TABLE %s VALUES (1, 'hr'), (2, 'hardware'), (null, 'hr')", tableName); - assertEquals("Should have expected rows", + assertEquals( + "Should have expected rows", ImmutableList.of(row(1, "hr"), row(2, "hardware"), row(null, "hr"), row(null, "hr")), sql("SELECT * FROM %s ORDER BY id ASC NULLS LAST", tableName)); - sql("DELETE FROM %s WHERE id NOT IN (SELECT * FROM deleted_id) OR dep IN ('software', 'hr')", tableName); - assertEquals("Should have expected rows", + sql( + "DELETE FROM %s WHERE id NOT IN (SELECT * FROM deleted_id) OR dep IN ('software', 'hr')", + tableName); + assertEquals( + "Should have expected rows", ImmutableList.of(row(2, "hardware")), sql("SELECT * FROM %s ORDER BY id ASC NULLS LAST", tableName)); - sql("DELETE FROM %s t WHERE " + - "id NOT IN (SELECT * FROM deleted_id WHERE value IS NOT NULL) AND " + - "EXISTS (SELECT 1 FROM FROM deleted_dep WHERE t.dep = deleted_dep.value)", tableName); - assertEquals("Should have expected rows", + sql( + "DELETE FROM %s t WHERE " + + "id NOT IN (SELECT * FROM deleted_id WHERE value IS NOT NULL) AND " + + "EXISTS (SELECT 1 FROM FROM deleted_dep WHERE t.dep = deleted_dep.value)", + tableName); + assertEquals( + "Should have expected rows", ImmutableList.of(row(2, "hardware")), sql("SELECT * FROM %s ORDER BY id ASC NULLS LAST", tableName)); - sql("DELETE FROM %s t WHERE " + - "id NOT IN (SELECT * FROM deleted_id WHERE value IS NOT NULL) OR " + - "EXISTS (SELECT 1 FROM FROM deleted_dep WHERE t.dep = deleted_dep.value)", tableName); - assertEquals("Should have expected rows", + sql( + "DELETE FROM %s t WHERE " + + "id NOT IN (SELECT * FROM deleted_id WHERE value IS NOT NULL) OR " + + "EXISTS (SELECT 1 FROM FROM deleted_dep WHERE t.dep = deleted_dep.value)", + tableName); + assertEquals( + "Should have expected rows", ImmutableList.of(), sql("SELECT * FROM %s ORDER BY id ASC NULLS LAST", tableName)); } @@ -535,8 +587,10 @@ public void testDeleteWithNotInSubquery() throws NoSuchTableException { public void testDeleteOnNonIcebergTableNotSupported() { createOrReplaceView("testtable", "{ \"c1\": -100, \"c2\": -200 }"); - AssertHelpers.assertThrows("Delete is supported only for Iceberg tables", - AnalysisException.class, "DELETE is only supported with v2 tables.", + AssertHelpers.assertThrows( + "Delete is supported only for Iceberg tables", + AnalysisException.class, + "DELETE is only supported with v2 tables.", () -> sql("DELETE FROM %s WHERE c1 = -100", "testtable")); } @@ -549,25 +603,37 @@ public void testDeleteWithExistSubquery() throws NoSuchTableException { createOrReplaceView("deleted_id", Arrays.asList(-1, -2, null), Encoders.INT()); createOrReplaceView("deleted_dep", Arrays.asList("software", "hr"), Encoders.STRING()); - sql("DELETE FROM %s t WHERE EXISTS (SELECT 1 FROM deleted_id d WHERE t.id = d.value)", tableName); - assertEquals("Should have expected rows", + sql( + "DELETE FROM %s t WHERE EXISTS (SELECT 1 FROM deleted_id d WHERE t.id = d.value)", + tableName); + assertEquals( + "Should have expected rows", ImmutableList.of(row(1, "hr"), row(2, "hardware"), row(null, "hr")), sql("SELECT * FROM %s ORDER BY id ASC NULLS LAST", tableName)); - 
sql("DELETE FROM %s t WHERE EXISTS (SELECT 1 FROM deleted_id d WHERE t.id = d.value + 2)", tableName); - assertEquals("Should have expected rows", + sql( + "DELETE FROM %s t WHERE EXISTS (SELECT 1 FROM deleted_id d WHERE t.id = d.value + 2)", + tableName); + assertEquals( + "Should have expected rows", ImmutableList.of(row(2, "hardware"), row(null, "hr")), sql("SELECT * FROM %s ORDER BY id ASC NULLS LAST", tableName)); - sql("DELETE FROM %s t WHERE EXISTS (SELECT 1 FROM deleted_id d WHERE t.id = d.value) OR t.id IS NULL", tableName); - assertEquals("Should have expected rows", + sql( + "DELETE FROM %s t WHERE EXISTS (SELECT 1 FROM deleted_id d WHERE t.id = d.value) OR t.id IS NULL", + tableName); + assertEquals( + "Should have expected rows", ImmutableList.of(row(2, "hardware")), sql("SELECT * FROM %s", tableName)); - sql("DELETE FROM %s t WHERE " + - "EXISTS (SELECT 1 FROM deleted_id di WHERE t.id = di.value) AND " + - "EXISTS (SELECT 1 FROM deleted_dep dd WHERE t.dep = dd.value)", tableName); - assertEquals("Should have expected rows", + sql( + "DELETE FROM %s t WHERE " + + "EXISTS (SELECT 1 FROM deleted_id di WHERE t.id = di.value) AND " + + "EXISTS (SELECT 1 FROM deleted_dep dd WHERE t.dep = dd.value)", + tableName); + assertEquals( + "Should have expected rows", ImmutableList.of(row(2, "hardware")), sql("SELECT * FROM %s", tableName)); } @@ -581,21 +647,28 @@ public void testDeleteWithNotExistsSubquery() throws NoSuchTableException { createOrReplaceView("deleted_id", Arrays.asList(-1, -2, null), Encoders.INT()); createOrReplaceView("deleted_dep", Arrays.asList("software", "hr"), Encoders.STRING()); - sql("DELETE FROM %s t WHERE " + - "NOT EXISTS (SELECT 1 FROM deleted_id di WHERE t.id = di.value + 2) AND " + - "NOT EXISTS (SELECT 1 FROM deleted_dep dd WHERE t.dep = dd.value)", tableName); - assertEquals("Should have expected rows", + sql( + "DELETE FROM %s t WHERE " + + "NOT EXISTS (SELECT 1 FROM deleted_id di WHERE t.id = di.value + 2) AND " + + "NOT EXISTS (SELECT 1 FROM deleted_dep dd WHERE t.dep = dd.value)", + tableName); + assertEquals( + "Should have expected rows", ImmutableList.of(row(1, "hr"), row(null, "hr")), sql("SELECT * FROM %s ORDER BY id ASC NULLS LAST", tableName)); - sql("DELETE FROM %s t WHERE NOT EXISTS (SELECT 1 FROM deleted_id d WHERE t.id = d.value + 2)", tableName); - assertEquals("Should have expected rows", + sql( + "DELETE FROM %s t WHERE NOT EXISTS (SELECT 1 FROM deleted_id d WHERE t.id = d.value + 2)", + tableName); + assertEquals( + "Should have expected rows", ImmutableList.of(row(1, "hr")), sql("SELECT * FROM %s ORDER BY id ASC NULLS LAST", tableName)); String subquery = "SELECT 1 FROM deleted_id d WHERE t.id = d.value + 2"; sql("DELETE FROM %s t WHERE NOT EXISTS (%s) OR t.id = 1", tableName, subquery); - assertEquals("Should have expected rows", + assertEquals( + "Should have expected rows", ImmutableList.of(), sql("SELECT * FROM %s ORDER BY id ASC NULLS LAST", tableName)); } @@ -609,12 +682,15 @@ public void testDeleteWithScalarSubquery() throws NoSuchTableException { createOrReplaceView("deleted_id", Arrays.asList(1, 100, null), Encoders.INT()); // TODO: Spark does not support AQE and DPP with aggregates at the moment - withSQLConf(ImmutableMap.of(SQLConf.ADAPTIVE_EXECUTION_ENABLED().key(), "false"), () -> { - sql("DELETE FROM %s t WHERE id <= (SELECT min(value) FROM deleted_id)", tableName); - assertEquals("Should have expected rows", - ImmutableList.of(row(2, "hardware"), row(null, "hr")), - sql("SELECT * FROM %s ORDER BY id ASC NULLS LAST", 
tableName)); - }); + withSQLConf( + ImmutableMap.of(SQLConf.ADAPTIVE_EXECUTION_ENABLED().key(), "false"), + () -> { + sql("DELETE FROM %s t WHERE id <= (SELECT min(value) FROM deleted_id)", tableName); + assertEquals( + "Should have expected rows", + ImmutableList.of(row(2, "hardware"), row(null, "hr")), + sql("SELECT * FROM %s ORDER BY id ASC NULLS LAST", tableName)); + }); } @Test @@ -647,52 +723,61 @@ public synchronized void testDeleteWithSerializableIsolation() throws Interrupte createAndInitUnpartitionedTable(); - sql("ALTER TABLE %s SET TBLPROPERTIES('%s' '%s')", tableName, DELETE_ISOLATION_LEVEL, "serializable"); + sql( + "ALTER TABLE %s SET TBLPROPERTIES('%s' '%s')", + tableName, DELETE_ISOLATION_LEVEL, "serializable"); // Pre-populate the table to force it to use the Spark Writers instead of Metadata-Only Delete // for more consistent exception stack List ids = ImmutableList.of(1, 2); - Dataset inputDF = spark.createDataset(ids, Encoders.INT()) - .withColumnRenamed("value", "id") - .withColumn("dep", lit("hr")); + Dataset inputDF = + spark + .createDataset(ids, Encoders.INT()) + .withColumnRenamed("value", "id") + .withColumn("dep", lit("hr")); try { inputDF.coalesce(1).writeTo(tableName).append(); } catch (NoSuchTableException e) { throw new RuntimeException(e); } - ExecutorService executorService = MoreExecutors.getExitingExecutorService( - (ThreadPoolExecutor) Executors.newFixedThreadPool(2)); + ExecutorService executorService = + MoreExecutors.getExitingExecutorService( + (ThreadPoolExecutor) Executors.newFixedThreadPool(2)); AtomicInteger barrier = new AtomicInteger(0); // delete thread - Future deleteFuture = executorService.submit(() -> { - for (int numOperations = 0; numOperations < Integer.MAX_VALUE; numOperations++) { - while (barrier.get() < numOperations * 2) { - sleep(10); - } - sql("DELETE FROM %s WHERE id = 1", tableName); - barrier.incrementAndGet(); - } - }); + Future deleteFuture = + executorService.submit( + () -> { + for (int numOperations = 0; numOperations < Integer.MAX_VALUE; numOperations++) { + while (barrier.get() < numOperations * 2) { + sleep(10); + } + sql("DELETE FROM %s WHERE id = 1", tableName); + barrier.incrementAndGet(); + } + }); // append thread - Future appendFuture = executorService.submit(() -> { - for (int numOperations = 0; numOperations < Integer.MAX_VALUE; numOperations++) { - while (barrier.get() < numOperations * 2) { - sleep(10); - } - - try { - inputDF.coalesce(1).writeTo(tableName).append(); - } catch (NoSuchTableException e) { - throw new RuntimeException(e); - } - - barrier.incrementAndGet(); - } - }); + Future appendFuture = + executorService.submit( + () -> { + for (int numOperations = 0; numOperations < Integer.MAX_VALUE; numOperations++) { + while (barrier.get() < numOperations * 2) { + sleep(10); + } + + try { + inputDF.coalesce(1).writeTo(tableName).append(); + } catch (NoSuchTableException e) { + throw new RuntimeException(e); + } + + barrier.incrementAndGet(); + } + }); try { deleteFuture.get(); @@ -703,7 +788,8 @@ public synchronized void testDeleteWithSerializableIsolation() throws Interrupte Throwable validationException = sparkException.getCause(); Assert.assertThat(validationException, CoreMatchers.instanceOf(ValidationException.class)); String errMsg = validationException.getMessage(); - Assert.assertThat(errMsg, CoreMatchers.containsString("Found conflicting files that can contain")); + Assert.assertThat( + errMsg, CoreMatchers.containsString("Found conflicting files that can contain")); } finally { 
appendFuture.cancel(true); } @@ -713,51 +799,61 @@ public synchronized void testDeleteWithSerializableIsolation() throws Interrupte } @Test - public synchronized void testDeleteWithSnapshotIsolation() throws InterruptedException, ExecutionException { + public synchronized void testDeleteWithSnapshotIsolation() + throws InterruptedException, ExecutionException { // cannot run tests with concurrency for Hadoop tables without atomic renames Assume.assumeFalse(catalogName.equalsIgnoreCase("testhadoop")); createAndInitUnpartitionedTable(); - sql("ALTER TABLE %s SET TBLPROPERTIES('%s' '%s')", tableName, DELETE_ISOLATION_LEVEL, "snapshot"); + sql( + "ALTER TABLE %s SET TBLPROPERTIES('%s' '%s')", + tableName, DELETE_ISOLATION_LEVEL, "snapshot"); - ExecutorService executorService = MoreExecutors.getExitingExecutorService( - (ThreadPoolExecutor) Executors.newFixedThreadPool(2)); + ExecutorService executorService = + MoreExecutors.getExitingExecutorService( + (ThreadPoolExecutor) Executors.newFixedThreadPool(2)); AtomicInteger barrier = new AtomicInteger(0); // delete thread - Future deleteFuture = executorService.submit(() -> { - for (int numOperations = 0; numOperations < 20; numOperations++) { - while (barrier.get() < numOperations * 2) { - sleep(10); - } - sql("DELETE FROM %s WHERE id = 1", tableName); - barrier.incrementAndGet(); - } - }); + Future deleteFuture = + executorService.submit( + () -> { + for (int numOperations = 0; numOperations < 20; numOperations++) { + while (barrier.get() < numOperations * 2) { + sleep(10); + } + sql("DELETE FROM %s WHERE id = 1", tableName); + barrier.incrementAndGet(); + } + }); // append thread - Future appendFuture = executorService.submit(() -> { - List ids = ImmutableList.of(1, 2); - Dataset inputDF = spark.createDataset(ids, Encoders.INT()) - .withColumnRenamed("value", "id") - .withColumn("dep", lit("hr")); - - for (int numOperations = 0; numOperations < 20; numOperations++) { - while (barrier.get() < numOperations * 2) { - sleep(10); - } - - try { - inputDF.coalesce(1).writeTo(tableName).append(); - } catch (NoSuchTableException e) { - throw new RuntimeException(e); - } - - barrier.incrementAndGet(); - } - }); + Future appendFuture = + executorService.submit( + () -> { + List ids = ImmutableList.of(1, 2); + Dataset inputDF = + spark + .createDataset(ids, Encoders.INT()) + .withColumnRenamed("value", "id") + .withColumn("dep", lit("hr")); + + for (int numOperations = 0; numOperations < 20; numOperations++) { + while (barrier.get() < numOperations * 2) { + sleep(10); + } + + try { + inputDF.coalesce(1).writeTo(tableName).append(); + } catch (NoSuchTableException e) { + throw new RuntimeException(e); + } + + barrier.incrementAndGet(); + } + }); try { deleteFuture.get(); @@ -781,7 +877,8 @@ public void testDeleteRefreshesRelationCache() throws NoSuchTableException { spark.sql("CACHE TABLE tmp"); - assertEquals("View should have correct data", + assertEquals( + "View should have correct data", ImmutableList.of(row(1, "hardware"), row(1, "hr")), sql("SELECT * FROM tmp ORDER BY id, dep")); @@ -796,11 +893,13 @@ public void testDeleteRefreshesRelationCache() throws NoSuchTableException { } else { validateMergeOnRead(currentSnapshot, "2", "2", null); } - assertEquals("Should have expected rows", + assertEquals( + "Should have expected rows", ImmutableList.of(row(2, "hardware"), row(3, "hr")), sql("SELECT * FROM %s ORDER BY id, dep", tableName)); - assertEquals("Should refresh the relation cache", + assertEquals( + "Should refresh the relation cache", 
ImmutableList.of(), sql("SELECT * FROM tmp ORDER BY id, dep")); @@ -816,8 +915,10 @@ public void testDeleteWithMultipleSpecs() { // write a file partitioned by dep sql("ALTER TABLE %s ADD PARTITION FIELD dep", tableName); - append(tableName, "{ \"id\": 2, \"dep\": \"hr\", \"category\": \"c1\" }\n" + - "{ \"id\": 3, \"dep\": \"hr\", \"category\": \"c1\" }"); + append( + tableName, + "{ \"id\": 2, \"dep\": \"hr\", \"category\": \"c1\" }\n" + + "{ \"id\": 3, \"dep\": \"hr\", \"category\": \"c1\" }"); // write a file partitioned by dep and category sql("ALTER TABLE %s ADD PARTITION FIELD category", tableName); @@ -834,14 +935,16 @@ public void testDeleteWithMultipleSpecs() { Snapshot currentSnapshot = table.currentSnapshot(); if (mode(table) == COPY_ON_WRITE) { - // copy-on-write is tested against v1 and such tables have different partition evolution behavior + // copy-on-write is tested against v1 and such tables have different partition evolution + // behavior // that's why the number of changed partitions is 4 for copy-on-write validateCopyOnWrite(currentSnapshot, "4", "4", "1"); } else { validateMergeOnRead(currentSnapshot, "3", "3", null); } - assertEquals("Should have expected rows", + assertEquals( + "Should have expected rows", ImmutableList.of(row(2, "hr", "c1")), sql("SELECT * FROM %s ORDER BY id", tableName)); } diff --git a/spark/v3.2/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestDuplicateSnapshotIDs.java b/spark/v3.2/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestDuplicateSnapshotIDs.java index 685113f35175..690351840ad9 100644 --- a/spark/v3.2/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestDuplicateSnapshotIDs.java +++ b/spark/v3.2/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestDuplicateSnapshotIDs.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.spark.extensions; import java.util.Map; @@ -30,7 +29,8 @@ public class TestDuplicateSnapshotIDs extends SparkExtensionsTestBase { - public TestDuplicateSnapshotIDs(String catalogName, String implementation, Map config) { + public TestDuplicateSnapshotIDs( + String catalogName, String implementation, Map config) { super(catalogName, implementation, config); } @@ -44,7 +44,8 @@ public void testSameSnapshotIDBackToBack() { sql("DROP TABLE IF EXISTS %s ", tableName); sql("CREATE TABLE %s (id bigint NOT NULL, data string) USING iceberg", tableName); - try (MockedStatic utilities = Mockito.mockStatic(SnapshotIdGeneratorUtil.class)) { + try (MockedStatic utilities = + Mockito.mockStatic(SnapshotIdGeneratorUtil.class)) { utilities.when(SnapshotIdGeneratorUtil::generateSnapshotID).thenReturn(42L, 42L, 43L); // use 42L as snapshot id for the insert sql("INSERT INTO TABLE %s SELECT 1, 'a' ", tableName); @@ -54,15 +55,15 @@ public void testSameSnapshotIDBackToBack() { // use regular snapshot id logic for the insert sql("INSERT INTO TABLE %s SELECT 3, 'c' ", tableName); - ImmutableList expectedRows = ImmutableList.of( - row(1L, "a"), - row(2L, "b"), - row(3L, "c") - ); - assertEquals("should have all the rows", expectedRows, sql("SELECT * from %s order by id", tableName)); + ImmutableList expectedRows = + ImmutableList.of(row(1L, "a"), row(2L, "b"), row(3L, "c")); + assertEquals( + "should have all the rows", expectedRows, sql("SELECT * from %s order by id", tableName)); Assert.assertEquals(sql("SELECT * from %s.snapshots", tableName).size(), 3); - Assert.assertEquals(sql("SELECT * from %s.snapshots where snapshot_id = 42L", tableName).size(), 1); - Assert.assertEquals(sql("SELECT * from %s.snapshots where snapshot_id = 43L", tableName).size(), 1); + Assert.assertEquals( + sql("SELECT * from %s.snapshots where snapshot_id = 42L", tableName).size(), 1); + Assert.assertEquals( + sql("SELECT * from %s.snapshots where snapshot_id = 43L", tableName).size(), 1); } @Test @@ -70,7 +71,8 @@ public void testSameSnapshotID() { sql("DROP TABLE IF EXISTS %s ", tableName); sql("CREATE TABLE %s (id bigint NOT NULL, data string) USING iceberg", tableName); - try (MockedStatic utilities = Mockito.mockStatic(SnapshotIdGeneratorUtil.class)) { + try (MockedStatic utilities = + Mockito.mockStatic(SnapshotIdGeneratorUtil.class)) { utilities.when(SnapshotIdGeneratorUtil::generateSnapshotID).thenReturn(42L); // use 42L as snapshot id for the insert sql("INSERT INTO TABLE %s SELECT 1, 'a' ", tableName); @@ -78,7 +80,8 @@ public void testSameSnapshotID() { // use regular snapshot id logic for the inserts sql("INSERT INTO TABLE %s SELECT 2, 'b' ", tableName); sql("INSERT INTO TABLE %s SELECT 3, 'c' ", tableName); - try (MockedStatic utilities = Mockito.mockStatic(SnapshotIdGeneratorUtil.class)) { + try (MockedStatic utilities = + Mockito.mockStatic(SnapshotIdGeneratorUtil.class)) { utilities.when(SnapshotIdGeneratorUtil::generateSnapshotID).thenReturn(42L, 43L); // use 42L as snapshot id for the insert and retry with 43L. 
sql("INSERT INTO TABLE %s SELECT 4, 'd' ", tableName); @@ -86,16 +89,14 @@ public void testSameSnapshotID() { // use regular snapshot id logic for the insert sql("INSERT INTO TABLE %s SELECT 5, 'e' ", tableName); - ImmutableList expectedRows = ImmutableList.of( - row(1L, "a"), - row(2L, "b"), - row(3L, "c"), - row(4L, "d"), - row(5L, "e") - ); - assertEquals("should have all the rows", expectedRows, sql("SELECT * from %s order by id", tableName)); + ImmutableList expectedRows = + ImmutableList.of(row(1L, "a"), row(2L, "b"), row(3L, "c"), row(4L, "d"), row(5L, "e")); + assertEquals( + "should have all the rows", expectedRows, sql("SELECT * from %s order by id", tableName)); Assert.assertEquals(sql("SELECT * from %s.snapshots", tableName).size(), 5); - Assert.assertEquals(sql("SELECT * from %s.snapshots where snapshot_id = 42L", tableName).size(), 1); - Assert.assertEquals(sql("SELECT * from %s.snapshots where snapshot_id = 43L", tableName).size(), 1); + Assert.assertEquals( + sql("SELECT * from %s.snapshots where snapshot_id = 42L", tableName).size(), 1); + Assert.assertEquals( + sql("SELECT * from %s.snapshots where snapshot_id = 43L", tableName).size(), 1); } } diff --git a/spark/v3.2/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestExpireSnapshotsProcedure.java b/spark/v3.2/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestExpireSnapshotsProcedure.java index a77dafe5af05..5cb4f17edcf4 100644 --- a/spark/v3.2/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestExpireSnapshotsProcedure.java +++ b/spark/v3.2/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestExpireSnapshotsProcedure.java @@ -16,9 +16,10 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.spark.extensions; +import static org.apache.iceberg.TableProperties.GC_ENABLED; + import java.io.IOException; import java.sql.Timestamp; import java.time.Instant; @@ -45,11 +46,10 @@ import org.junit.Assert; import org.junit.Test; -import static org.apache.iceberg.TableProperties.GC_ENABLED; - public class TestExpireSnapshotsProcedure extends SparkExtensionsTestBase { - public TestExpireSnapshotsProcedure(String catalogName, String implementation, Map config) { + public TestExpireSnapshotsProcedure( + String catalogName, String implementation, Map config) { super(catalogName, implementation, config); } @@ -62,9 +62,7 @@ public void removeTables() { public void testExpireSnapshotsInEmptyTable() { sql("CREATE TABLE %s (id bigint NOT NULL, data string) USING iceberg", tableName); - List output = sql( - "CALL %s.system.expire_snapshots('%s')", - catalogName, tableIdent); + List output = sql("CALL %s.system.expire_snapshots('%s')", catalogName, tableIdent); assertEquals("Should not delete any files", ImmutableList.of(row(0L, 0L, 0L, 0L, 0L)), output); } @@ -83,17 +81,17 @@ public void testExpireSnapshotsUsingPositionalArgs() { table.refresh(); Snapshot secondSnapshot = table.currentSnapshot(); - Timestamp secondSnapshotTimestamp = Timestamp.from(Instant.ofEpochMilli(secondSnapshot.timestampMillis())); + Timestamp secondSnapshotTimestamp = + Timestamp.from(Instant.ofEpochMilli(secondSnapshot.timestampMillis())); Assert.assertEquals("Should be 2 snapshots", 2, Iterables.size(table.snapshots())); // expire without retainLast param - List output1 = sql( - "CALL %s.system.expire_snapshots('%s', TIMESTAMP '%s')", - catalogName, tableIdent, secondSnapshotTimestamp); - assertEquals("Procedure output must match", - ImmutableList.of(row(0L, 0L, 0L, 0L, 1L)), - output1); + List output1 = + sql( + "CALL %s.system.expire_snapshots('%s', TIMESTAMP '%s')", + catalogName, tableIdent, secondSnapshotTimestamp); + assertEquals("Procedure output must match", ImmutableList.of(row(0L, 0L, 0L, 0L, 1L)), output1); table.refresh(); @@ -101,7 +99,8 @@ public void testExpireSnapshotsUsingPositionalArgs() { sql("INSERT OVERWRITE %s VALUES (3, 'c')", tableName); sql("INSERT INTO TABLE %s VALUES (4, 'd')", tableName); - assertEquals("Should have expected rows", + assertEquals( + "Should have expected rows", ImmutableList.of(row(3L, "c"), row(4L, "d")), sql("SELECT * FROM %s ORDER BY id", tableName)); @@ -114,12 +113,11 @@ public void testExpireSnapshotsUsingPositionalArgs() { Assert.assertEquals("Should be 3 snapshots", 3, Iterables.size(table.snapshots())); // expire with retainLast param - List output = sql( - "CALL %s.system.expire_snapshots('%s', TIMESTAMP '%s', 2)", - catalogName, tableIdent, currentTimestamp); - assertEquals("Procedure output must match", - ImmutableList.of(row(2L, 0L, 0L, 2L, 1L)), - output); + List output = + sql( + "CALL %s.system.expire_snapshots('%s', TIMESTAMP '%s', 2)", + catalogName, tableIdent, currentTimestamp); + assertEquals("Procedure output must match", ImmutableList.of(row(2L, 0L, 0L, 2L, 1L)), output); } @Test @@ -137,15 +135,14 @@ public void testExpireSnapshotUsingNamedArgs() { Timestamp currentTimestamp = Timestamp.from(Instant.ofEpochMilli(System.currentTimeMillis())); - List output = sql( - "CALL %s.system.expire_snapshots(" + - "older_than => TIMESTAMP '%s'," + - "table => '%s'," + - "retain_last => 1)", - catalogName, currentTimestamp, tableIdent); - assertEquals("Procedure output must match", - ImmutableList.of(row(0L, 0L, 0L, 0L, 1L)), - 
output); + List output = + sql( + "CALL %s.system.expire_snapshots(" + + "older_than => TIMESTAMP '%s'," + + "table => '%s'," + + "retain_last => 1)", + catalogName, currentTimestamp, tableIdent); + assertEquals("Procedure output must match", ImmutableList.of(row(0L, 0L, 0L, 0L, 1L)), output); } @Test @@ -154,31 +151,43 @@ public void testExpireSnapshotsGCDisabled() { sql("ALTER TABLE %s SET TBLPROPERTIES ('%s' 'false')", tableName, GC_ENABLED); - AssertHelpers.assertThrows("Should reject call", - ValidationException.class, "Cannot expire snapshots: GC is disabled", + AssertHelpers.assertThrows( + "Should reject call", + ValidationException.class, + "Cannot expire snapshots: GC is disabled", () -> sql("CALL %s.system.expire_snapshots('%s')", catalogName, tableIdent)); } @Test public void testInvalidExpireSnapshotsCases() { - AssertHelpers.assertThrows("Should not allow mixed args", - AnalysisException.class, "Named and positional arguments cannot be mixed", + AssertHelpers.assertThrows( + "Should not allow mixed args", + AnalysisException.class, + "Named and positional arguments cannot be mixed", () -> sql("CALL %s.system.expire_snapshots('n', table => 't')", catalogName)); - AssertHelpers.assertThrows("Should not resolve procedures in arbitrary namespaces", - NoSuchProcedureException.class, "not found", + AssertHelpers.assertThrows( + "Should not resolve procedures in arbitrary namespaces", + NoSuchProcedureException.class, + "not found", () -> sql("CALL %s.custom.expire_snapshots('n', 't')", catalogName)); - AssertHelpers.assertThrows("Should reject calls without all required args", - AnalysisException.class, "Missing required parameters", + AssertHelpers.assertThrows( + "Should reject calls without all required args", + AnalysisException.class, + "Missing required parameters", () -> sql("CALL %s.system.expire_snapshots()", catalogName)); - AssertHelpers.assertThrows("Should reject calls with invalid arg types", - AnalysisException.class, "Wrong arg type", + AssertHelpers.assertThrows( + "Should reject calls with invalid arg types", + AnalysisException.class, + "Wrong arg type", () -> sql("CALL %s.system.expire_snapshots('n', 2.2)", catalogName)); - AssertHelpers.assertThrows("Should reject calls with empty table identifier", - IllegalArgumentException.class, "Cannot handle an empty identifier", + AssertHelpers.assertThrows( + "Should reject calls with empty table identifier", + IllegalArgumentException.class, + "Cannot handle an empty identifier", () -> sql("CALL %s.system.expire_snapshots('')", catalogName)); } @@ -187,13 +196,24 @@ public void testResolvingTableInAnotherCatalog() throws IOException { String anotherCatalog = "another_" + catalogName; spark.conf().set("spark.sql.catalog." + anotherCatalog, SparkCatalog.class.getName()); spark.conf().set("spark.sql.catalog." + anotherCatalog + ".type", "hadoop"); - spark.conf().set("spark.sql.catalog." + anotherCatalog + ".warehouse", "file:" + temp.newFolder().toString()); - - sql("CREATE TABLE %s.%s (id bigint NOT NULL, data string) USING iceberg", anotherCatalog, tableIdent); - - AssertHelpers.assertThrows("Should reject calls for a table in another catalog", - IllegalArgumentException.class, "Cannot run procedure in catalog", - () -> sql("CALL %s.system.expire_snapshots('%s')", catalogName, anotherCatalog + "." + tableName)); + spark + .conf() + .set( + "spark.sql.catalog." 
+ anotherCatalog + ".warehouse", + "file:" + temp.newFolder().toString()); + + sql( + "CREATE TABLE %s.%s (id bigint NOT NULL, data string) USING iceberg", + anotherCatalog, tableIdent); + + AssertHelpers.assertThrows( + "Should reject calls for a table in another catalog", + IllegalArgumentException.class, + "Cannot run procedure in catalog", + () -> + sql( + "CALL %s.system.expire_snapshots('%s')", + catalogName, anotherCatalog + "." + tableName)); } @Test @@ -206,68 +226,89 @@ public void testConcurrentExpireSnapshots() { sql("INSERT INTO TABLE %s VALUES (4, 'd')", tableName); Timestamp currentTimestamp = Timestamp.from(Instant.ofEpochMilli(System.currentTimeMillis())); - List output = sql( - "CALL %s.system.expire_snapshots(" + - "older_than => TIMESTAMP '%s'," + - "table => '%s'," + - "max_concurrent_deletes => %s," + - "retain_last => 1)", - catalogName, currentTimestamp, tableIdent, 4); - assertEquals("Expiring snapshots concurrently should succeed", - ImmutableList.of(row(0L, 0L, 0L, 0L, 3L)), output); + List output = + sql( + "CALL %s.system.expire_snapshots(" + + "older_than => TIMESTAMP '%s'," + + "table => '%s'," + + "max_concurrent_deletes => %s," + + "retain_last => 1)", + catalogName, currentTimestamp, tableIdent, 4); + assertEquals( + "Expiring snapshots concurrently should succeed", + ImmutableList.of(row(0L, 0L, 0L, 0L, 3L)), + output); } @Test public void testConcurrentExpireSnapshotsWithInvalidInput() { sql("CREATE TABLE %s (id bigint NOT NULL, data string) USING iceberg", tableName); - AssertHelpers.assertThrows("Should throw an error when max_concurrent_deletes = 0", - IllegalArgumentException.class, "max_concurrent_deletes should have value > 0", - () -> sql("CALL %s.system.expire_snapshots(table => '%s', max_concurrent_deletes => %s)", - catalogName, tableIdent, 0)); - - AssertHelpers.assertThrows("Should throw an error when max_concurrent_deletes < 0 ", - IllegalArgumentException.class, "max_concurrent_deletes should have value > 0", - () -> sql( - "CALL %s.system.expire_snapshots(table => '%s', max_concurrent_deletes => %s)", - catalogName, tableIdent, -1)); - + AssertHelpers.assertThrows( + "Should throw an error when max_concurrent_deletes = 0", + IllegalArgumentException.class, + "max_concurrent_deletes should have value > 0", + () -> + sql( + "CALL %s.system.expire_snapshots(table => '%s', max_concurrent_deletes => %s)", + catalogName, tableIdent, 0)); + + AssertHelpers.assertThrows( + "Should throw an error when max_concurrent_deletes < 0 ", + IllegalArgumentException.class, + "max_concurrent_deletes should have value > 0", + () -> + sql( + "CALL %s.system.expire_snapshots(table => '%s', max_concurrent_deletes => %s)", + catalogName, tableIdent, -1)); } @Test public void testExpireDeleteFiles() throws Exception { - sql("CREATE TABLE %s (id bigint, data string) USING iceberg TBLPROPERTIES" + - "('format-version'='2', 'write.delete.mode'='merge-on-read')", tableName); - - List records = Lists.newArrayList( - new SimpleRecord(1, "a"), - new SimpleRecord(2, "b"), - new SimpleRecord(3, "c"), - new SimpleRecord(4, "d") - ); - spark.createDataset(records, Encoders.bean(SimpleRecord.class)).coalesce(1).writeTo(tableName).append(); + sql( + "CREATE TABLE %s (id bigint, data string) USING iceberg TBLPROPERTIES" + + "('format-version'='2', 'write.delete.mode'='merge-on-read')", + tableName); + + List records = + Lists.newArrayList( + new SimpleRecord(1, "a"), + new SimpleRecord(2, "b"), + new SimpleRecord(3, "c"), + new SimpleRecord(4, "d")); + spark + 
.createDataset(records, Encoders.bean(SimpleRecord.class)) + .coalesce(1) + .writeTo(tableName) + .append(); sql("DELETE FROM %s WHERE id=1", tableName); Table table = Spark3Util.loadIcebergTable(spark, tableName); - Assert.assertEquals("Should have 1 delete manifest", 1, TestHelpers.deleteManifests(table).size()); + Assert.assertEquals( + "Should have 1 delete manifest", 1, TestHelpers.deleteManifests(table).size()); Assert.assertEquals("Should have 1 delete file", 1, TestHelpers.deleteFiles(table).size()); Path deleteManifestPath = new Path(TestHelpers.deleteManifests(table).iterator().next().path()); - Path deleteFilePath = new Path(String.valueOf(TestHelpers.deleteFiles(table).iterator().next().path())); - - sql("CALL %s.system.rewrite_data_files(" + - "table => '%s'," + - "options => map(" + - "'delete-file-threshold','1'," + - "'use-starting-sequence-number', 'false'))", + Path deleteFilePath = + new Path(String.valueOf(TestHelpers.deleteFiles(table).iterator().next().path())); + + sql( + "CALL %s.system.rewrite_data_files(" + + "table => '%s'," + + "options => map(" + + "'delete-file-threshold','1'," + + "'use-starting-sequence-number', 'false'))", catalogName, tableIdent); table.refresh(); - sql("INSERT INTO TABLE %s VALUES (5, 'e')", tableName); // this txn moves the file to the DELETED state + sql( + "INSERT INTO TABLE %s VALUES (5, 'e')", + tableName); // this txn moves the file to the DELETED state sql("INSERT INTO TABLE %s VALUES (6, 'f')", tableName); // this txn removes the file reference table.refresh(); - Assert.assertEquals("Should have no delete manifests", 0, TestHelpers.deleteManifests(table).size()); + Assert.assertEquals( + "Should have no delete manifests", 0, TestHelpers.deleteManifests(table).size()); Assert.assertEquals("Should have no delete files", 0, TestHelpers.deleteFiles(table).size()); FileSystem localFs = FileSystem.getLocal(new Configuration()); @@ -275,14 +316,18 @@ public void testExpireDeleteFiles() throws Exception { Assert.assertTrue("Delete file should still exist", localFs.exists(deleteFilePath)); Timestamp currentTimestamp = Timestamp.from(Instant.ofEpochMilli(System.currentTimeMillis())); - List output = sql("CALL %s.system.expire_snapshots(" + - "older_than => TIMESTAMP '%s'," + - "table => '%s'," + - "retain_last => 1)", - catalogName, currentTimestamp, tableIdent); - - assertEquals("Should deleted 1 data and pos delete file and 4 manifests and lists (one for each txn)", - ImmutableList.of(row(1L, 1L, 0L, 4L, 4L)), output); + List output = + sql( + "CALL %s.system.expire_snapshots(" + + "older_than => TIMESTAMP '%s'," + + "table => '%s'," + + "retain_last => 1)", + catalogName, currentTimestamp, tableIdent); + + assertEquals( + "Should deleted 1 data and pos delete file and 4 manifests and lists (one for each txn)", + ImmutableList.of(row(1L, 1L, 0L, 4L, 4L)), + output); Assert.assertFalse("Delete manifest should be removed", localFs.exists(deleteManifestPath)); Assert.assertFalse("Delete file should be removed", localFs.exists(deleteFilePath)); } @@ -302,13 +347,14 @@ public void testExpireSnapshotWithStreamResultsEnabled() { Timestamp currentTimestamp = Timestamp.from(Instant.ofEpochMilli(System.currentTimeMillis())); - List output = sql( - "CALL %s.system.expire_snapshots(" + - "older_than => TIMESTAMP '%s'," + - "table => '%s'," + - "retain_last => 1, " + - "stream_results => true)", - catalogName, currentTimestamp, tableIdent); + List output = + sql( + "CALL %s.system.expire_snapshots(" + + "older_than => TIMESTAMP '%s'," + + "table => 
'%s'," + + "retain_last => 1, " + + "stream_results => true)", + catalogName, currentTimestamp, tableIdent); assertEquals("Procedure output must match", ImmutableList.of(row(0L, 0L, 0L, 0L, 1L)), output); } @@ -330,18 +376,15 @@ public void testExpireSnapshotsProcedureWorksWithSqlComments() { Timestamp currentTimestamp = Timestamp.from(Instant.ofEpochMilli(System.currentTimeMillis())); String callStatement = - "/* CALL statement is used to expire snapshots */\n" + - "-- And we have single line comments as well \n" + - "/* And comments that span *multiple* \n" + - " lines */ CALL /* this is the actual CALL */ %s.system.expire_snapshots(" + - " older_than => TIMESTAMP '%s'," + - " table => '%s'," + - " retain_last => 1)"; - List output = sql( - callStatement, catalogName, currentTimestamp, tableIdent); - assertEquals("Procedure output must match", - ImmutableList.of(row(0L, 0L, 0L, 0L, 1L)), - output); + "/* CALL statement is used to expire snapshots */\n" + + "-- And we have single line comments as well \n" + + "/* And comments that span *multiple* \n" + + " lines */ CALL /* this is the actual CALL */ %s.system.expire_snapshots(" + + " older_than => TIMESTAMP '%s'," + + " table => '%s'," + + " retain_last => 1)"; + List output = sql(callStatement, catalogName, currentTimestamp, tableIdent); + assertEquals("Procedure output must match", ImmutableList.of(row(0L, 0L, 0L, 0L, 1L)), output); table.refresh(); diff --git a/spark/v3.2/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestIcebergExpressions.java b/spark/v3.2/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestIcebergExpressions.java index ce88814ce937..8d2e10ea17eb 100644 --- a/spark/v3.2/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestIcebergExpressions.java +++ b/spark/v3.2/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestIcebergExpressions.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.spark.extensions; import java.math.BigDecimal; @@ -31,7 +30,8 @@ public class TestIcebergExpressions extends SparkExtensionsTestBase { - public TestIcebergExpressions(String catalogName, String implementation, Map config) { + public TestIcebergExpressions( + String catalogName, String implementation, Map config) { super(catalogName, implementation, config); } @@ -44,26 +44,30 @@ public void removeTables() { @Test public void testTruncateExpressions() { - sql("CREATE TABLE %s ( " + - " int_c INT, long_c LONG, dec_c DECIMAL(4, 2), str_c STRING, binary_c BINARY " + - ") USING iceberg", tableName); + sql( + "CREATE TABLE %s ( " + + " int_c INT, long_c LONG, dec_c DECIMAL(4, 2), str_c STRING, binary_c BINARY " + + ") USING iceberg", + tableName); - sql("CREATE TEMPORARY VIEW emp " + - "AS SELECT * FROM VALUES (101, 10001, 10.65, '101-Employee', CAST('1234' AS BINARY)) " + - "AS EMP(int_c, long_c, dec_c, str_c, binary_c)"); + sql( + "CREATE TEMPORARY VIEW emp " + + "AS SELECT * FROM VALUES (101, 10001, 10.65, '101-Employee', CAST('1234' AS BINARY)) " + + "AS EMP(int_c, long_c, dec_c, str_c, binary_c)"); sql("INSERT INTO %s SELECT * FROM emp", tableName); Dataset df = spark.sql("SELECT * FROM " + tableName); df.select( - new Column(new IcebergTruncateTransform(df.col("int_c").expr(), 2)).as("int_c"), - new Column(new IcebergTruncateTransform(df.col("long_c").expr(), 2)).as("long_c"), - new Column(new IcebergTruncateTransform(df.col("dec_c").expr(), 50)).as("dec_c"), - new Column(new IcebergTruncateTransform(df.col("str_c").expr(), 2)).as("str_c"), - new Column(new IcebergTruncateTransform(df.col("binary_c").expr(), 2)).as("binary_c") - ).createOrReplaceTempView("v"); + new Column(new IcebergTruncateTransform(df.col("int_c").expr(), 2)).as("int_c"), + new Column(new IcebergTruncateTransform(df.col("long_c").expr(), 2)).as("long_c"), + new Column(new IcebergTruncateTransform(df.col("dec_c").expr(), 50)).as("dec_c"), + new Column(new IcebergTruncateTransform(df.col("str_c").expr(), 2)).as("str_c"), + new Column(new IcebergTruncateTransform(df.col("binary_c").expr(), 2)).as("binary_c")) + .createOrReplaceTempView("v"); - assertEquals("Should have expected rows", + assertEquals( + "Should have expected rows", ImmutableList.of(row(100, 10000L, new BigDecimal("10.50"), "10", "12")), sql("SELECT int_c, long_c, dec_c, str_c, CAST(binary_c AS STRING) FROM v")); } diff --git a/spark/v3.2/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestMerge.java b/spark/v3.2/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestMerge.java index 52f7efceb74a..c485dbfe2f93 100644 --- a/spark/v3.2/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestMerge.java +++ b/spark/v3.2/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestMerge.java @@ -16,9 +16,14 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.spark.extensions; +import static org.apache.iceberg.TableProperties.MERGE_ISOLATION_LEVEL; +import static org.apache.iceberg.TableProperties.PARQUET_ROW_GROUP_SIZE_BYTES; +import static org.apache.iceberg.TableProperties.SPLIT_SIZE; +import static org.apache.iceberg.TableProperties.WRITE_DISTRIBUTION_MODE; +import static org.apache.spark.sql.functions.lit; + import java.util.Arrays; import java.util.Collections; import java.util.List; @@ -54,16 +59,15 @@ import org.junit.BeforeClass; import org.junit.Test; -import static org.apache.iceberg.TableProperties.MERGE_ISOLATION_LEVEL; -import static org.apache.iceberg.TableProperties.PARQUET_ROW_GROUP_SIZE_BYTES; -import static org.apache.iceberg.TableProperties.SPLIT_SIZE; -import static org.apache.iceberg.TableProperties.WRITE_DISTRIBUTION_MODE; -import static org.apache.spark.sql.functions.lit; - public abstract class TestMerge extends SparkRowLevelOperationsTestBase { - public TestMerge(String catalogName, String implementation, Map config, - String fileFormat, boolean vectorized, String distributionMode) { + public TestMerge( + String catalogName, + String implementation, + Map config, + String fileFormat, + boolean vectorized, + String distributionMode) { super(catalogName, implementation, config, fileFormat, vectorized, distributionMode); } @@ -96,238 +100,293 @@ public void testMergeWithStaticPredicatePushDown() { String dataFilesCount = snapshot.summary().get(SnapshotSummary.TOTAL_DATA_FILES_PROP); Assert.assertEquals("Must have 2 files before MERGE", "2", dataFilesCount); - createOrReplaceView("source", - "{ \"id\": 1, \"dep\": \"finance\" }\n" + - "{ \"id\": 2, \"dep\": \"hardware\" }"); + createOrReplaceView( + "source", "{ \"id\": 1, \"dep\": \"finance\" }\n" + "{ \"id\": 2, \"dep\": \"hardware\" }"); // remove the data file from the 'hr' partition to ensure it is not scanned - withUnavailableFiles(snapshot.addedDataFiles(table.io()), () -> { - // disable dynamic pruning and rely only on static predicate pushdown - withSQLConf(ImmutableMap.of(SQLConf.DYNAMIC_PARTITION_PRUNING_ENABLED().key(), "false"), () -> { - sql("MERGE INTO %s t USING source " + - "ON t.id == source.id AND t.dep IN ('software') AND source.id < 10 " + - "WHEN MATCHED AND source.id = 1 THEN " + - " UPDATE SET dep = source.dep " + - "WHEN NOT MATCHED THEN " + - " INSERT (dep, id) VALUES (source.dep, source.id)", tableName); - }); - }); - - ImmutableList expectedRows = ImmutableList.of( - row(1L, "finance"), // updated - row(1L, "hr"), // kept - row(2L, "hardware") // new - ); - assertEquals("Output should match", expectedRows, sql("SELECT * FROM %s ORDER BY id, dep", tableName)); + withUnavailableFiles( + snapshot.addedDataFiles(table.io()), + () -> { + // disable dynamic pruning and rely only on static predicate pushdown + withSQLConf( + ImmutableMap.of(SQLConf.DYNAMIC_PARTITION_PRUNING_ENABLED().key(), "false"), + () -> { + sql( + "MERGE INTO %s t USING source " + + "ON t.id == source.id AND t.dep IN ('software') AND source.id < 10 " + + "WHEN MATCHED AND source.id = 1 THEN " + + " UPDATE SET dep = source.dep " + + "WHEN NOT MATCHED THEN " + + " INSERT (dep, id) VALUES (source.dep, source.id)", + tableName); + }); + }); + + ImmutableList expectedRows = + ImmutableList.of( + row(1L, "finance"), // updated + row(1L, "hr"), // kept + row(2L, "hardware") // new + ); + assertEquals( + "Output should match", expectedRows, sql("SELECT * FROM %s ORDER BY id, dep", tableName)); } @Test public void 
testMergeIntoEmptyTargetInsertAllNonMatchingRows() { createAndInitTable("id INT, dep STRING"); - createOrReplaceView("source", "id INT, dep STRING", - "{ \"id\": 1, \"dep\": \"emp-id-1\" }\n" + - "{ \"id\": 2, \"dep\": \"emp-id-2\" }\n" + - "{ \"id\": 3, \"dep\": \"emp-id-3\" }"); - - sql("MERGE INTO %s AS t USING source AS s " + - "ON t.id == s.id " + - "WHEN NOT MATCHED THEN " + - " INSERT *", tableName); - - ImmutableList expectedRows = ImmutableList.of( - row(1, "emp-id-1"), // new - row(2, "emp-id-2"), // new - row(3, "emp-id-3") // new - ); - assertEquals("Should have expected rows", expectedRows, sql("SELECT * FROM %s ORDER BY id", tableName)); + createOrReplaceView( + "source", + "id INT, dep STRING", + "{ \"id\": 1, \"dep\": \"emp-id-1\" }\n" + + "{ \"id\": 2, \"dep\": \"emp-id-2\" }\n" + + "{ \"id\": 3, \"dep\": \"emp-id-3\" }"); + + sql( + "MERGE INTO %s AS t USING source AS s " + + "ON t.id == s.id " + + "WHEN NOT MATCHED THEN " + + " INSERT *", + tableName); + + ImmutableList expectedRows = + ImmutableList.of( + row(1, "emp-id-1"), // new + row(2, "emp-id-2"), // new + row(3, "emp-id-3") // new + ); + assertEquals( + "Should have expected rows", expectedRows, sql("SELECT * FROM %s ORDER BY id", tableName)); } @Test public void testMergeIntoEmptyTargetInsertOnlyMatchingRows() { createAndInitTable("id INT, dep STRING"); - createOrReplaceView("source", "id INT, dep STRING", - "{ \"id\": 1, \"dep\": \"emp-id-1\" }\n" + - "{ \"id\": 2, \"dep\": \"emp-id-2\" }\n" + - "{ \"id\": 3, \"dep\": \"emp-id-3\" }"); - - sql("MERGE INTO %s AS t USING source AS s " + - "ON t.id == s.id " + - "WHEN NOT MATCHED AND (s.id >=2) THEN " + - " INSERT *", tableName); - - ImmutableList expectedRows = ImmutableList.of( - row(2, "emp-id-2"), // new - row(3, "emp-id-3") // new - ); - assertEquals("Should have expected rows", expectedRows, sql("SELECT * FROM %s ORDER BY id", tableName)); + createOrReplaceView( + "source", + "id INT, dep STRING", + "{ \"id\": 1, \"dep\": \"emp-id-1\" }\n" + + "{ \"id\": 2, \"dep\": \"emp-id-2\" }\n" + + "{ \"id\": 3, \"dep\": \"emp-id-3\" }"); + + sql( + "MERGE INTO %s AS t USING source AS s " + + "ON t.id == s.id " + + "WHEN NOT MATCHED AND (s.id >=2) THEN " + + " INSERT *", + tableName); + + ImmutableList expectedRows = + ImmutableList.of( + row(2, "emp-id-2"), // new + row(3, "emp-id-3") // new + ); + assertEquals( + "Should have expected rows", expectedRows, sql("SELECT * FROM %s ORDER BY id", tableName)); } @Test public void testMergeWithOnlyUpdateClause() { - createAndInitTable("id INT, dep STRING", - "{ \"id\": 1, \"dep\": \"emp-id-one\" }\n" + - "{ \"id\": 6, \"dep\": \"emp-id-six\" }"); - - createOrReplaceView("source", "id INT, dep STRING", - "{ \"id\": 2, \"dep\": \"emp-id-2\" }\n" + - "{ \"id\": 1, \"dep\": \"emp-id-1\" }\n" + - "{ \"id\": 6, \"dep\": \"emp-id-6\" }"); - - sql("MERGE INTO %s AS t USING source AS s " + - "ON t.id == s.id " + - "WHEN MATCHED AND t.id = 1 THEN " + - " UPDATE SET *", tableName); - - ImmutableList expectedRows = ImmutableList.of( - row(1, "emp-id-1"), // updated - row(6, "emp-id-six") // kept - ); - assertEquals("Should have expected rows", expectedRows, sql("SELECT * FROM %s ORDER BY id", tableName)); + createAndInitTable( + "id INT, dep STRING", + "{ \"id\": 1, \"dep\": \"emp-id-one\" }\n" + "{ \"id\": 6, \"dep\": \"emp-id-six\" }"); + + createOrReplaceView( + "source", + "id INT, dep STRING", + "{ \"id\": 2, \"dep\": \"emp-id-2\" }\n" + + "{ \"id\": 1, \"dep\": \"emp-id-1\" }\n" + + "{ \"id\": 6, \"dep\": \"emp-id-6\" }"); + + 
sql( + "MERGE INTO %s AS t USING source AS s " + + "ON t.id == s.id " + + "WHEN MATCHED AND t.id = 1 THEN " + + " UPDATE SET *", + tableName); + + ImmutableList expectedRows = + ImmutableList.of( + row(1, "emp-id-1"), // updated + row(6, "emp-id-six") // kept + ); + assertEquals( + "Should have expected rows", expectedRows, sql("SELECT * FROM %s ORDER BY id", tableName)); } @Test public void testMergeWithOnlyDeleteClause() { - createAndInitTable("id INT, dep STRING", - "{ \"id\": 1, \"dep\": \"emp-id-one\" }\n" + - "{ \"id\": 6, \"dep\": \"emp-id-6\" }"); - - createOrReplaceView("source", "id INT, dep STRING", - "{ \"id\": 2, \"dep\": \"emp-id-2\" }\n" + - "{ \"id\": 1, \"dep\": \"emp-id-1\" }\n" + - "{ \"id\": 6, \"dep\": \"emp-id-6\" }"); - - sql("MERGE INTO %s AS t USING source AS s " + - "ON t.id == s.id " + - "WHEN MATCHED AND t.id = 6 THEN " + - " DELETE", tableName); - - ImmutableList expectedRows = ImmutableList.of( - row(1, "emp-id-one") // kept - ); - assertEquals("Should have expected rows", expectedRows, sql("SELECT * FROM %s ORDER BY id", tableName)); + createAndInitTable( + "id INT, dep STRING", + "{ \"id\": 1, \"dep\": \"emp-id-one\" }\n" + "{ \"id\": 6, \"dep\": \"emp-id-6\" }"); + + createOrReplaceView( + "source", + "id INT, dep STRING", + "{ \"id\": 2, \"dep\": \"emp-id-2\" }\n" + + "{ \"id\": 1, \"dep\": \"emp-id-1\" }\n" + + "{ \"id\": 6, \"dep\": \"emp-id-6\" }"); + + sql( + "MERGE INTO %s AS t USING source AS s " + + "ON t.id == s.id " + + "WHEN MATCHED AND t.id = 6 THEN " + + " DELETE", + tableName); + + ImmutableList expectedRows = + ImmutableList.of( + row(1, "emp-id-one") // kept + ); + assertEquals( + "Should have expected rows", expectedRows, sql("SELECT * FROM %s ORDER BY id", tableName)); } @Test public void testMergeWithAllCauses() { - createAndInitTable("id INT, dep STRING", - "{ \"id\": 1, \"dep\": \"emp-id-one\" }\n" + - "{ \"id\": 6, \"dep\": \"emp-id-6\" }"); - - createOrReplaceView("source", "id INT, dep STRING", - "{ \"id\": 2, \"dep\": \"emp-id-2\" }\n" + - "{ \"id\": 1, \"dep\": \"emp-id-1\" }\n" + - "{ \"id\": 6, \"dep\": \"emp-id-6\" }"); - - sql("MERGE INTO %s AS t USING source AS s " + - "ON t.id == s.id " + - "WHEN MATCHED AND t.id = 1 THEN " + - " UPDATE SET * " + - "WHEN MATCHED AND t.id = 6 THEN " + - " DELETE " + - "WHEN NOT MATCHED AND s.id = 2 THEN " + - " INSERT *", tableName); - - ImmutableList expectedRows = ImmutableList.of( - row(1, "emp-id-1"), // updated - row(2, "emp-id-2") // new - ); - assertEquals("Should have expected rows", expectedRows, sql("SELECT * FROM %s ORDER BY id", tableName)); + createAndInitTable( + "id INT, dep STRING", + "{ \"id\": 1, \"dep\": \"emp-id-one\" }\n" + "{ \"id\": 6, \"dep\": \"emp-id-6\" }"); + + createOrReplaceView( + "source", + "id INT, dep STRING", + "{ \"id\": 2, \"dep\": \"emp-id-2\" }\n" + + "{ \"id\": 1, \"dep\": \"emp-id-1\" }\n" + + "{ \"id\": 6, \"dep\": \"emp-id-6\" }"); + + sql( + "MERGE INTO %s AS t USING source AS s " + + "ON t.id == s.id " + + "WHEN MATCHED AND t.id = 1 THEN " + + " UPDATE SET * " + + "WHEN MATCHED AND t.id = 6 THEN " + + " DELETE " + + "WHEN NOT MATCHED AND s.id = 2 THEN " + + " INSERT *", + tableName); + + ImmutableList expectedRows = + ImmutableList.of( + row(1, "emp-id-1"), // updated + row(2, "emp-id-2") // new + ); + assertEquals( + "Should have expected rows", expectedRows, sql("SELECT * FROM %s ORDER BY id", tableName)); } @Test public void testMergeWithAllCausesWithExplicitColumnSpecification() { - createAndInitTable("id INT, dep STRING", - "{ \"id\": 1, 
\"dep\": \"emp-id-one\" }\n" + - "{ \"id\": 6, \"dep\": \"emp-id-6\" }"); - - createOrReplaceView("source", "id INT, dep STRING", - "{ \"id\": 2, \"dep\": \"emp-id-2\" }\n" + - "{ \"id\": 1, \"dep\": \"emp-id-1\" }\n" + - "{ \"id\": 6, \"dep\": \"emp-id-6\" }"); - - sql("MERGE INTO %s AS t USING source AS s " + - "ON t.id == s.id " + - "WHEN MATCHED AND t.id = 1 THEN " + - " UPDATE SET t.id = s.id, t.dep = s.dep " + - "WHEN MATCHED AND t.id = 6 THEN " + - " DELETE " + - "WHEN NOT MATCHED AND s.id = 2 THEN " + - " INSERT (t.id, t.dep) VALUES (s.id, s.dep)", tableName); - - ImmutableList expectedRows = ImmutableList.of( - row(1, "emp-id-1"), // updated - row(2, "emp-id-2") // new - ); - assertEquals("Should have expected rows", expectedRows, sql("SELECT * FROM %s ORDER BY id", tableName)); + createAndInitTable( + "id INT, dep STRING", + "{ \"id\": 1, \"dep\": \"emp-id-one\" }\n" + "{ \"id\": 6, \"dep\": \"emp-id-6\" }"); + + createOrReplaceView( + "source", + "id INT, dep STRING", + "{ \"id\": 2, \"dep\": \"emp-id-2\" }\n" + + "{ \"id\": 1, \"dep\": \"emp-id-1\" }\n" + + "{ \"id\": 6, \"dep\": \"emp-id-6\" }"); + + sql( + "MERGE INTO %s AS t USING source AS s " + + "ON t.id == s.id " + + "WHEN MATCHED AND t.id = 1 THEN " + + " UPDATE SET t.id = s.id, t.dep = s.dep " + + "WHEN MATCHED AND t.id = 6 THEN " + + " DELETE " + + "WHEN NOT MATCHED AND s.id = 2 THEN " + + " INSERT (t.id, t.dep) VALUES (s.id, s.dep)", + tableName); + + ImmutableList expectedRows = + ImmutableList.of( + row(1, "emp-id-1"), // updated + row(2, "emp-id-2") // new + ); + assertEquals( + "Should have expected rows", expectedRows, sql("SELECT * FROM %s ORDER BY id", tableName)); } @Test public void testMergeWithSourceCTE() { - createAndInitTable("id INT, dep STRING", - "{ \"id\": 2, \"dep\": \"emp-id-two\" }\n" + - "{ \"id\": 6, \"dep\": \"emp-id-6\" }"); - - createOrReplaceView("source", "id INT, dep STRING", - "{ \"id\": 2, \"dep\": \"emp-id-3\" }\n" + - "{ \"id\": 1, \"dep\": \"emp-id-2\" }\n" + - "{ \"id\": 5, \"dep\": \"emp-id-6\" }"); - - sql("WITH cte1 AS (SELECT id + 1 AS id, dep FROM source) " + - "MERGE INTO %s AS t USING cte1 AS s " + - "ON t.id == s.id " + - "WHEN MATCHED AND t.id = 2 THEN " + - " UPDATE SET * " + - "WHEN MATCHED AND t.id = 6 THEN " + - " DELETE " + - "WHEN NOT MATCHED AND s.id = 3 THEN " + - " INSERT *", tableName); - - ImmutableList expectedRows = ImmutableList.of( - row(2, "emp-id-2"), // updated - row(3, "emp-id-3") // new - ); - assertEquals("Should have expected rows", expectedRows, sql("SELECT * FROM %s ORDER BY id", tableName)); + createAndInitTable( + "id INT, dep STRING", + "{ \"id\": 2, \"dep\": \"emp-id-two\" }\n" + "{ \"id\": 6, \"dep\": \"emp-id-6\" }"); + + createOrReplaceView( + "source", + "id INT, dep STRING", + "{ \"id\": 2, \"dep\": \"emp-id-3\" }\n" + + "{ \"id\": 1, \"dep\": \"emp-id-2\" }\n" + + "{ \"id\": 5, \"dep\": \"emp-id-6\" }"); + + sql( + "WITH cte1 AS (SELECT id + 1 AS id, dep FROM source) " + + "MERGE INTO %s AS t USING cte1 AS s " + + "ON t.id == s.id " + + "WHEN MATCHED AND t.id = 2 THEN " + + " UPDATE SET * " + + "WHEN MATCHED AND t.id = 6 THEN " + + " DELETE " + + "WHEN NOT MATCHED AND s.id = 3 THEN " + + " INSERT *", + tableName); + + ImmutableList expectedRows = + ImmutableList.of( + row(2, "emp-id-2"), // updated + row(3, "emp-id-3") // new + ); + assertEquals( + "Should have expected rows", expectedRows, sql("SELECT * FROM %s ORDER BY id", tableName)); } @Test public void testMergeWithSourceFromSetOps() { - createAndInitTable("id INT, dep STRING", - "{ 
\"id\": 1, \"dep\": \"emp-id-one\" }\n" + - "{ \"id\": 6, \"dep\": \"emp-id-6\" }"); + createAndInitTable( + "id INT, dep STRING", + "{ \"id\": 1, \"dep\": \"emp-id-one\" }\n" + "{ \"id\": 6, \"dep\": \"emp-id-6\" }"); - createOrReplaceView("source", "id INT, dep STRING", - "{ \"id\": 2, \"dep\": \"emp-id-2\" }\n" + - "{ \"id\": 1, \"dep\": \"emp-id-1\" }\n" + - "{ \"id\": 6, \"dep\": \"emp-id-6\" }"); + createOrReplaceView( + "source", + "id INT, dep STRING", + "{ \"id\": 2, \"dep\": \"emp-id-2\" }\n" + + "{ \"id\": 1, \"dep\": \"emp-id-1\" }\n" + + "{ \"id\": 6, \"dep\": \"emp-id-6\" }"); String derivedSource = - "SELECT * FROM source WHERE id = 2 " + - "UNION ALL " + - "SELECT * FROM source WHERE id = 1 OR id = 6"; - - sql("MERGE INTO %s AS t USING (%s) AS s " + - "ON t.id == s.id " + - "WHEN MATCHED AND t.id = 1 THEN " + - " UPDATE SET * " + - "WHEN MATCHED AND t.id = 6 THEN " + - " DELETE " + - "WHEN NOT MATCHED AND s.id = 2 THEN " + - " INSERT *", tableName, derivedSource); - - ImmutableList expectedRows = ImmutableList.of( - row(1, "emp-id-1"), // updated - row(2, "emp-id-2") // new - ); - assertEquals("Should have expected rows", expectedRows, sql("SELECT * FROM %s ORDER BY id", tableName)); + "SELECT * FROM source WHERE id = 2 " + + "UNION ALL " + + "SELECT * FROM source WHERE id = 1 OR id = 6"; + + sql( + "MERGE INTO %s AS t USING (%s) AS s " + + "ON t.id == s.id " + + "WHEN MATCHED AND t.id = 1 THEN " + + " UPDATE SET * " + + "WHEN MATCHED AND t.id = 6 THEN " + + " DELETE " + + "WHEN NOT MATCHED AND s.id = 2 THEN " + + " INSERT *", + tableName, derivedSource); + + ImmutableList expectedRows = + ImmutableList.of( + row(1, "emp-id-1"), // updated + row(2, "emp-id-2") // new + ); + assertEquals( + "Should have expected rows", expectedRows, sql("SELECT * FROM %s ORDER BY id", tableName)); } @Test public void testMergeWithMultipleUpdatesForTargetRowSmallTargetLargeSource() { - createAndInitTable("id INT, dep STRING", - "{ \"id\": 1, \"dep\": \"emp-id-one\" }\n" + - "{ \"id\": 6, \"dep\": \"emp-id-6\" }"); + createAndInitTable( + "id INT, dep STRING", + "{ \"id\": 1, \"dep\": \"emp-id-one\" }\n" + "{ \"id\": 6, \"dep\": \"emp-id-6\" }"); List sourceIds = Lists.newArrayList(); for (int i = 0; i < 10_000; i++) { @@ -337,29 +396,35 @@ public void testMergeWithMultipleUpdatesForTargetRowSmallTargetLargeSource() { ds.union(ds).createOrReplaceTempView("source"); String errorMsg = "a single row from the target table with multiple rows of the source table"; - AssertHelpers.assertThrowsCause("Should complain about multiple matches", - SparkException.class, errorMsg, + AssertHelpers.assertThrowsCause( + "Should complain about multiple matches", + SparkException.class, + errorMsg, () -> { - sql("MERGE INTO %s AS t USING source AS s " + - "ON t.id == s.value " + - "WHEN MATCHED AND t.id = 1 THEN " + - " UPDATE SET id = 10 " + - "WHEN MATCHED AND t.id = 6 THEN " + - " DELETE " + - "WHEN NOT MATCHED AND s.value = 2 THEN " + - " INSERT (id, dep) VALUES (s.value, null)", tableName); + sql( + "MERGE INTO %s AS t USING source AS s " + + "ON t.id == s.value " + + "WHEN MATCHED AND t.id = 1 THEN " + + " UPDATE SET id = 10 " + + "WHEN MATCHED AND t.id = 6 THEN " + + " DELETE " + + "WHEN NOT MATCHED AND s.value = 2 THEN " + + " INSERT (id, dep) VALUES (s.value, null)", + tableName); }); - assertEquals("Target should be unchanged", + assertEquals( + "Target should be unchanged", ImmutableList.of(row(1, "emp-id-one"), row(6, "emp-id-6")), sql("SELECT * FROM %s ORDER BY id ASC NULLS LAST", tableName)); } 
@Test - public void testMergeWithMultipleUpdatesForTargetRowSmallTargetLargeSourceEnabledHashShuffleJoin() { - createAndInitTable("id INT, dep STRING", - "{ \"id\": 1, \"dep\": \"emp-id-one\" }\n" + - "{ \"id\": 6, \"dep\": \"emp-id-6\" }"); + public void + testMergeWithMultipleUpdatesForTargetRowSmallTargetLargeSourceEnabledHashShuffleJoin() { + createAndInitTable( + "id INT, dep STRING", + "{ \"id\": 1, \"dep\": \"emp-id-one\" }\n" + "{ \"id\": 6, \"dep\": \"emp-id-6\" }"); List sourceIds = Lists.newArrayList(); for (int i = 0; i < 10_000; i++) { @@ -368,23 +433,31 @@ public void testMergeWithMultipleUpdatesForTargetRowSmallTargetLargeSourceEnable Dataset ds = spark.createDataset(sourceIds, Encoders.INT()); ds.union(ds).createOrReplaceTempView("source"); - withSQLConf(ImmutableMap.of(SQLConf.PREFER_SORTMERGEJOIN().key(), "false"), () -> { - String errorMsg = "a single row from the target table with multiple rows of the source table"; - AssertHelpers.assertThrowsCause("Should complain about multiple matches", - SparkException.class, errorMsg, - () -> { - sql("MERGE INTO %s AS t USING source AS s " + - "ON t.id == s.value " + - "WHEN MATCHED AND t.id = 1 THEN " + - " UPDATE SET id = 10 " + - "WHEN MATCHED AND t.id = 6 THEN " + - " DELETE " + - "WHEN NOT MATCHED AND s.value = 2 THEN " + - " INSERT (id, dep) VALUES (s.value, null)", tableName); - }); - }); + withSQLConf( + ImmutableMap.of(SQLConf.PREFER_SORTMERGEJOIN().key(), "false"), + () -> { + String errorMsg = + "a single row from the target table with multiple rows of the source table"; + AssertHelpers.assertThrowsCause( + "Should complain about multiple matches", + SparkException.class, + errorMsg, + () -> { + sql( + "MERGE INTO %s AS t USING source AS s " + + "ON t.id == s.value " + + "WHEN MATCHED AND t.id = 1 THEN " + + " UPDATE SET id = 10 " + + "WHEN MATCHED AND t.id = 6 THEN " + + " DELETE " + + "WHEN NOT MATCHED AND s.value = 2 THEN " + + " INSERT (id, dep) VALUES (s.value, null)", + tableName); + }); + }); - assertEquals("Target should be unchanged", + assertEquals( + "Target should be unchanged", ImmutableList.of(row(1, "emp-id-one"), row(6, "emp-id-6")), sql("SELECT * FROM %s ORDER BY id ASC NULLS LAST", tableName)); } @@ -400,32 +473,40 @@ public void testMergeWithMultipleUpdatesForTargetRowSmallTargetLargeSourceNoEqua Dataset ds = spark.createDataset(sourceIds, Encoders.INT()); ds.union(ds).createOrReplaceTempView("source"); - withSQLConf(ImmutableMap.of(SQLConf.PREFER_SORTMERGEJOIN().key(), "false"), () -> { - String errorMsg = "a single row from the target table with multiple rows of the source table"; - AssertHelpers.assertThrowsCause("Should complain about multiple matches", - SparkException.class, errorMsg, - () -> { - sql("MERGE INTO %s AS t USING source AS s " + - "ON t.id > s.value " + - "WHEN MATCHED AND t.id = 1 THEN " + - " UPDATE SET id = 10 " + - "WHEN MATCHED AND t.id = 6 THEN " + - " DELETE " + - "WHEN NOT MATCHED AND s.value = 2 THEN " + - " INSERT (id, dep) VALUES (s.value, null)", tableName); - }); - }); + withSQLConf( + ImmutableMap.of(SQLConf.PREFER_SORTMERGEJOIN().key(), "false"), + () -> { + String errorMsg = + "a single row from the target table with multiple rows of the source table"; + AssertHelpers.assertThrowsCause( + "Should complain about multiple matches", + SparkException.class, + errorMsg, + () -> { + sql( + "MERGE INTO %s AS t USING source AS s " + + "ON t.id > s.value " + + "WHEN MATCHED AND t.id = 1 THEN " + + " UPDATE SET id = 10 " + + "WHEN MATCHED AND t.id = 6 THEN " + + " DELETE " + + 
"WHEN NOT MATCHED AND s.value = 2 THEN " + + " INSERT (id, dep) VALUES (s.value, null)", + tableName); + }); + }); - assertEquals("Target should be unchanged", + assertEquals( + "Target should be unchanged", ImmutableList.of(row(1, "emp-id-one")), sql("SELECT * FROM %s ORDER BY id ASC NULLS LAST", tableName)); } @Test public void testMergeWithMultipleUpdatesForTargetRowSmallTargetLargeSourceNoNotMatchedActions() { - createAndInitTable("id INT, dep STRING", - "{ \"id\": 1, \"dep\": \"emp-id-one\" }\n" + - "{ \"id\": 6, \"dep\": \"emp-id-6\" }"); + createAndInitTable( + "id INT, dep STRING", + "{ \"id\": 1, \"dep\": \"emp-id-one\" }\n" + "{ \"id\": 6, \"dep\": \"emp-id-6\" }"); List sourceIds = Lists.newArrayList(); for (int i = 0; i < 10_000; i++) { @@ -435,24 +516,30 @@ public void testMergeWithMultipleUpdatesForTargetRowSmallTargetLargeSourceNoNotM ds.union(ds).createOrReplaceTempView("source"); String errorMsg = "a single row from the target table with multiple rows of the source table"; - AssertHelpers.assertThrowsCause("Should complain about multiple matches", - SparkException.class, errorMsg, + AssertHelpers.assertThrowsCause( + "Should complain about multiple matches", + SparkException.class, + errorMsg, () -> { - sql("MERGE INTO %s AS t USING source AS s " + - "ON t.id == s.value " + - "WHEN MATCHED AND t.id = 1 THEN " + - " UPDATE SET id = 10 " + - "WHEN MATCHED AND t.id = 6 THEN " + - " DELETE", tableName); + sql( + "MERGE INTO %s AS t USING source AS s " + + "ON t.id == s.value " + + "WHEN MATCHED AND t.id = 1 THEN " + + " UPDATE SET id = 10 " + + "WHEN MATCHED AND t.id = 6 THEN " + + " DELETE", + tableName); }); - assertEquals("Target should be unchanged", + assertEquals( + "Target should be unchanged", ImmutableList.of(row(1, "emp-id-one"), row(6, "emp-id-6")), sql("SELECT * FROM %s ORDER BY id ASC NULLS LAST", tableName)); } @Test - public void testMergeWithMultipleUpdatesForTargetRowSmallTargetLargeSourceNoNotMatchedActionsNoEqualityCondition() { + public void + testMergeWithMultipleUpdatesForTargetRowSmallTargetLargeSourceNoNotMatchedActionsNoEqualityCondition() { createAndInitTable("id INT, dep STRING", "{ \"id\": 1, \"dep\": \"emp-id-one\" }"); List sourceIds = Lists.newArrayList(); @@ -463,103 +550,128 @@ public void testMergeWithMultipleUpdatesForTargetRowSmallTargetLargeSourceNoNotM ds.union(ds).createOrReplaceTempView("source"); String errorMsg = "a single row from the target table with multiple rows of the source table"; - AssertHelpers.assertThrowsCause("Should complain about multiple matches", - SparkException.class, errorMsg, + AssertHelpers.assertThrowsCause( + "Should complain about multiple matches", + SparkException.class, + errorMsg, () -> { - sql("MERGE INTO %s AS t USING source AS s " + - "ON t.id > s.value " + - "WHEN MATCHED AND t.id = 1 THEN " + - " UPDATE SET id = 10 " + - "WHEN MATCHED AND t.id = 6 THEN " + - " DELETE", tableName); + sql( + "MERGE INTO %s AS t USING source AS s " + + "ON t.id > s.value " + + "WHEN MATCHED AND t.id = 1 THEN " + + " UPDATE SET id = 10 " + + "WHEN MATCHED AND t.id = 6 THEN " + + " DELETE", + tableName); }); - assertEquals("Target should be unchanged", + assertEquals( + "Target should be unchanged", ImmutableList.of(row(1, "emp-id-one")), sql("SELECT * FROM %s ORDER BY id ASC NULLS LAST", tableName)); } @Test public void testMergeWithMultipleUpdatesForTargetRow() { - createAndInitTable("id INT, dep STRING", - "{ \"id\": 1, \"dep\": \"emp-id-one\" }\n" + - "{ \"id\": 6, \"dep\": \"emp-id-6\" }"); + createAndInitTable( + "id 
INT, dep STRING", + "{ \"id\": 1, \"dep\": \"emp-id-one\" }\n" + "{ \"id\": 6, \"dep\": \"emp-id-6\" }"); - createOrReplaceView("source", "id INT, dep STRING", - "{ \"id\": 1, \"dep\": \"emp-id-1\" }\n" + - "{ \"id\": 1, \"dep\": \"emp-id-1\" }\n" + - "{ \"id\": 2, \"dep\": \"emp-id-2\" }\n" + - "{ \"id\": 6, \"dep\": \"emp-id-6\" }"); + createOrReplaceView( + "source", + "id INT, dep STRING", + "{ \"id\": 1, \"dep\": \"emp-id-1\" }\n" + + "{ \"id\": 1, \"dep\": \"emp-id-1\" }\n" + + "{ \"id\": 2, \"dep\": \"emp-id-2\" }\n" + + "{ \"id\": 6, \"dep\": \"emp-id-6\" }"); String errorMsg = "a single row from the target table with multiple rows of the source table"; - AssertHelpers.assertThrowsCause("Should complain about multiple matches", - SparkException.class, errorMsg, + AssertHelpers.assertThrowsCause( + "Should complain about multiple matches", + SparkException.class, + errorMsg, () -> { - sql("MERGE INTO %s AS t USING source AS s " + - "ON t.id == s.id " + - "WHEN MATCHED AND t.id = 1 THEN " + - " UPDATE SET * " + - "WHEN MATCHED AND t.id = 6 THEN " + - " DELETE " + - "WHEN NOT MATCHED AND s.id = 2 THEN " + - " INSERT *", tableName); + sql( + "MERGE INTO %s AS t USING source AS s " + + "ON t.id == s.id " + + "WHEN MATCHED AND t.id = 1 THEN " + + " UPDATE SET * " + + "WHEN MATCHED AND t.id = 6 THEN " + + " DELETE " + + "WHEN NOT MATCHED AND s.id = 2 THEN " + + " INSERT *", + tableName); }); - assertEquals("Target should be unchanged", + assertEquals( + "Target should be unchanged", ImmutableList.of(row(1, "emp-id-one"), row(6, "emp-id-6")), sql("SELECT * FROM %s ORDER BY id ASC NULLS LAST", tableName)); } @Test public void testMergeWithUnconditionalDelete() { - createAndInitTable("id INT, dep STRING", - "{ \"id\": 1, \"dep\": \"emp-id-one\" }\n" + - "{ \"id\": 6, \"dep\": \"emp-id-6\" }"); - - createOrReplaceView("source", "id INT, dep STRING", - "{ \"id\": 1, \"dep\": \"emp-id-1\" }\n" + - "{ \"id\": 1, \"dep\": \"emp-id-1\" }\n" + - "{ \"id\": 2, \"dep\": \"emp-id-2\" }\n" + - "{ \"id\": 6, \"dep\": \"emp-id-6\" }"); - - sql("MERGE INTO %s AS t USING source AS s " + - "ON t.id == s.id " + - "WHEN MATCHED THEN " + - " DELETE " + - "WHEN NOT MATCHED AND s.id = 2 THEN " + - " INSERT *", tableName); - - ImmutableList expectedRows = ImmutableList.of( - row(2, "emp-id-2") // new - ); - assertEquals("Should have expected rows", expectedRows, sql("SELECT * FROM %s ORDER BY id", tableName)); + createAndInitTable( + "id INT, dep STRING", + "{ \"id\": 1, \"dep\": \"emp-id-one\" }\n" + "{ \"id\": 6, \"dep\": \"emp-id-6\" }"); + + createOrReplaceView( + "source", + "id INT, dep STRING", + "{ \"id\": 1, \"dep\": \"emp-id-1\" }\n" + + "{ \"id\": 1, \"dep\": \"emp-id-1\" }\n" + + "{ \"id\": 2, \"dep\": \"emp-id-2\" }\n" + + "{ \"id\": 6, \"dep\": \"emp-id-6\" }"); + + sql( + "MERGE INTO %s AS t USING source AS s " + + "ON t.id == s.id " + + "WHEN MATCHED THEN " + + " DELETE " + + "WHEN NOT MATCHED AND s.id = 2 THEN " + + " INSERT *", + tableName); + + ImmutableList expectedRows = + ImmutableList.of( + row(2, "emp-id-2") // new + ); + assertEquals( + "Should have expected rows", expectedRows, sql("SELECT * FROM %s ORDER BY id", tableName)); } @Test public void testMergeWithSingleConditionalDelete() { - createAndInitTable("id INT, dep STRING", - "{ \"id\": 1, \"dep\": \"emp-id-one\" }\n" + - "{ \"id\": 6, \"dep\": \"emp-id-6\" }"); + createAndInitTable( + "id INT, dep STRING", + "{ \"id\": 1, \"dep\": \"emp-id-one\" }\n" + "{ \"id\": 6, \"dep\": \"emp-id-6\" }"); - createOrReplaceView("source", "id INT, 
dep STRING", - "{ \"id\": 1, \"dep\": \"emp-id-1\" }\n" + - "{ \"id\": 1, \"dep\": \"emp-id-1\" }\n" + - "{ \"id\": 2, \"dep\": \"emp-id-2\" }\n" + - "{ \"id\": 6, \"dep\": \"emp-id-6\" }"); + createOrReplaceView( + "source", + "id INT, dep STRING", + "{ \"id\": 1, \"dep\": \"emp-id-1\" }\n" + + "{ \"id\": 1, \"dep\": \"emp-id-1\" }\n" + + "{ \"id\": 2, \"dep\": \"emp-id-2\" }\n" + + "{ \"id\": 6, \"dep\": \"emp-id-6\" }"); String errorMsg = "a single row from the target table with multiple rows of the source table"; - AssertHelpers.assertThrowsCause("Should complain about multiple matches", - SparkException.class, errorMsg, + AssertHelpers.assertThrowsCause( + "Should complain about multiple matches", + SparkException.class, + errorMsg, () -> { - sql("MERGE INTO %s AS t USING source AS s " + - "ON t.id == s.id " + - "WHEN MATCHED AND t.id = 1 THEN " + - " DELETE " + - "WHEN NOT MATCHED AND s.id = 2 THEN " + - " INSERT *", tableName); + sql( + "MERGE INTO %s AS t USING source AS s " + + "ON t.id == s.id " + + "WHEN MATCHED AND t.id = 1 THEN " + + " DELETE " + + "WHEN NOT MATCHED AND s.id = 2 THEN " + + " INSERT *", + tableName); }); - assertEquals("Target should be unchanged", + assertEquals( + "Target should be unchanged", ImmutableList.of(row(1, "emp-id-one"), row(6, "emp-id-6")), sql("SELECT * FROM %s ORDER BY id ASC NULLS LAST", tableName)); } @@ -569,31 +681,41 @@ public void testMergeWithIdentityTransform() { for (DistributionMode mode : DistributionMode.values()) { createAndInitTable("id INT, dep STRING"); sql("ALTER TABLE %s ADD PARTITION FIELD identity(dep)", tableName); - sql("ALTER TABLE %s SET TBLPROPERTIES('%s' '%s')", tableName, WRITE_DISTRIBUTION_MODE, mode.modeName()); - - append(tableName, - "{ \"id\": 1, \"dep\": \"emp-id-one\" }\n" + - "{ \"id\": 6, \"dep\": \"emp-id-6\" }"); - - createOrReplaceView("source", "id INT, dep STRING", - "{ \"id\": 2, \"dep\": \"emp-id-2\" }\n" + - "{ \"id\": 1, \"dep\": \"emp-id-1\" }\n" + - "{ \"id\": 6, \"dep\": \"emp-id-6\" }"); - - sql("MERGE INTO %s AS t USING source AS s " + - "ON t.id == s.id " + - "WHEN MATCHED AND t.id = 1 THEN " + - " UPDATE SET * " + - "WHEN MATCHED AND t.id = 6 THEN " + - " DELETE " + - "WHEN NOT MATCHED AND s.id = 2 THEN " + - " INSERT *", tableName); - - ImmutableList expectedRows = ImmutableList.of( - row(1, "emp-id-1"), // updated - row(2, "emp-id-2") // new - ); - assertEquals("Should have expected rows", expectedRows, sql("SELECT * FROM %s ORDER BY id", tableName)); + sql( + "ALTER TABLE %s SET TBLPROPERTIES('%s' '%s')", + tableName, WRITE_DISTRIBUTION_MODE, mode.modeName()); + + append( + tableName, + "{ \"id\": 1, \"dep\": \"emp-id-one\" }\n" + "{ \"id\": 6, \"dep\": \"emp-id-6\" }"); + + createOrReplaceView( + "source", + "id INT, dep STRING", + "{ \"id\": 2, \"dep\": \"emp-id-2\" }\n" + + "{ \"id\": 1, \"dep\": \"emp-id-1\" }\n" + + "{ \"id\": 6, \"dep\": \"emp-id-6\" }"); + + sql( + "MERGE INTO %s AS t USING source AS s " + + "ON t.id == s.id " + + "WHEN MATCHED AND t.id = 1 THEN " + + " UPDATE SET * " + + "WHEN MATCHED AND t.id = 6 THEN " + + " DELETE " + + "WHEN NOT MATCHED AND s.id = 2 THEN " + + " INSERT *", + tableName); + + ImmutableList expectedRows = + ImmutableList.of( + row(1, "emp-id-1"), // updated + row(2, "emp-id-2") // new + ); + assertEquals( + "Should have expected rows", + expectedRows, + sql("SELECT * FROM %s ORDER BY id", tableName)); removeTables(); } @@ -604,31 +726,41 @@ public void testMergeWithDaysTransform() { for (DistributionMode mode : DistributionMode.values()) { 
createAndInitTable("id INT, ts TIMESTAMP"); sql("ALTER TABLE %s ADD PARTITION FIELD days(ts)", tableName); - sql("ALTER TABLE %s SET TBLPROPERTIES('%s' '%s')", tableName, WRITE_DISTRIBUTION_MODE, mode.modeName()); - - append(tableName, "id INT, ts TIMESTAMP", - "{ \"id\": 1, \"ts\": \"2000-01-01 00:00:00\" }\n" + - "{ \"id\": 6, \"ts\": \"2000-01-06 00:00:00\" }"); - - createOrReplaceView("source", "id INT, ts TIMESTAMP", - "{ \"id\": 2, \"ts\": \"2001-01-02 00:00:00\" }\n" + - "{ \"id\": 1, \"ts\": \"2001-01-01 00:00:00\" }\n" + - "{ \"id\": 6, \"ts\": \"2001-01-06 00:00:00\" }"); - - sql("MERGE INTO %s AS t USING source AS s " + - "ON t.id == s.id " + - "WHEN MATCHED AND t.id = 1 THEN " + - " UPDATE SET * " + - "WHEN MATCHED AND t.id = 6 THEN " + - " DELETE " + - "WHEN NOT MATCHED AND s.id = 2 THEN " + - " INSERT *", tableName); - - ImmutableList expectedRows = ImmutableList.of( - row(1, "2001-01-01 00:00:00"), // updated - row(2, "2001-01-02 00:00:00") // new - ); - assertEquals("Should have expected rows", + sql( + "ALTER TABLE %s SET TBLPROPERTIES('%s' '%s')", + tableName, WRITE_DISTRIBUTION_MODE, mode.modeName()); + + append( + tableName, + "id INT, ts TIMESTAMP", + "{ \"id\": 1, \"ts\": \"2000-01-01 00:00:00\" }\n" + + "{ \"id\": 6, \"ts\": \"2000-01-06 00:00:00\" }"); + + createOrReplaceView( + "source", + "id INT, ts TIMESTAMP", + "{ \"id\": 2, \"ts\": \"2001-01-02 00:00:00\" }\n" + + "{ \"id\": 1, \"ts\": \"2001-01-01 00:00:00\" }\n" + + "{ \"id\": 6, \"ts\": \"2001-01-06 00:00:00\" }"); + + sql( + "MERGE INTO %s AS t USING source AS s " + + "ON t.id == s.id " + + "WHEN MATCHED AND t.id = 1 THEN " + + " UPDATE SET * " + + "WHEN MATCHED AND t.id = 6 THEN " + + " DELETE " + + "WHEN NOT MATCHED AND s.id = 2 THEN " + + " INSERT *", + tableName); + + ImmutableList expectedRows = + ImmutableList.of( + row(1, "2001-01-01 00:00:00"), // updated + row(2, "2001-01-02 00:00:00") // new + ); + assertEquals( + "Should have expected rows", expectedRows, sql("SELECT id, CAST(ts AS STRING) FROM %s ORDER BY id", tableName)); @@ -641,31 +773,41 @@ public void testMergeWithBucketTransform() { for (DistributionMode mode : DistributionMode.values()) { createAndInitTable("id INT, dep STRING"); sql("ALTER TABLE %s ADD PARTITION FIELD bucket(2, dep)", tableName); - sql("ALTER TABLE %s SET TBLPROPERTIES('%s' '%s')", tableName, WRITE_DISTRIBUTION_MODE, mode.modeName()); - - append(tableName, - "{ \"id\": 1, \"dep\": \"emp-id-one\" }\n" + - "{ \"id\": 6, \"dep\": \"emp-id-6\" }"); - - createOrReplaceView("source", "id INT, dep STRING", - "{ \"id\": 2, \"dep\": \"emp-id-2\" }\n" + - "{ \"id\": 1, \"dep\": \"emp-id-1\" }\n" + - "{ \"id\": 6, \"dep\": \"emp-id-6\" }"); - - sql("MERGE INTO %s AS t USING source AS s " + - "ON t.id == s.id " + - "WHEN MATCHED AND t.id = 1 THEN " + - " UPDATE SET * " + - "WHEN MATCHED AND t.id = 6 THEN " + - " DELETE " + - "WHEN NOT MATCHED AND s.id = 2 THEN " + - " INSERT *", tableName); - - ImmutableList expectedRows = ImmutableList.of( - row(1, "emp-id-1"), // updated - row(2, "emp-id-2") // new - ); - assertEquals("Should have expected rows", expectedRows, sql("SELECT * FROM %s ORDER BY id", tableName)); + sql( + "ALTER TABLE %s SET TBLPROPERTIES('%s' '%s')", + tableName, WRITE_DISTRIBUTION_MODE, mode.modeName()); + + append( + tableName, + "{ \"id\": 1, \"dep\": \"emp-id-one\" }\n" + "{ \"id\": 6, \"dep\": \"emp-id-6\" }"); + + createOrReplaceView( + "source", + "id INT, dep STRING", + "{ \"id\": 2, \"dep\": \"emp-id-2\" }\n" + + "{ \"id\": 1, \"dep\": \"emp-id-1\" }\n" + + 
"{ \"id\": 6, \"dep\": \"emp-id-6\" }"); + + sql( + "MERGE INTO %s AS t USING source AS s " + + "ON t.id == s.id " + + "WHEN MATCHED AND t.id = 1 THEN " + + " UPDATE SET * " + + "WHEN MATCHED AND t.id = 6 THEN " + + " DELETE " + + "WHEN NOT MATCHED AND s.id = 2 THEN " + + " INSERT *", + tableName); + + ImmutableList expectedRows = + ImmutableList.of( + row(1, "emp-id-1"), // updated + row(2, "emp-id-2") // new + ); + assertEquals( + "Should have expected rows", + expectedRows, + sql("SELECT * FROM %s ORDER BY id", tableName)); removeTables(); } @@ -676,31 +818,41 @@ public void testMergeWithTruncateTransform() { for (DistributionMode mode : DistributionMode.values()) { createAndInitTable("id INT, dep STRING"); sql("ALTER TABLE %s ADD PARTITION FIELD truncate(dep, 2)", tableName); - sql("ALTER TABLE %s SET TBLPROPERTIES('%s' '%s')", tableName, WRITE_DISTRIBUTION_MODE, mode.modeName()); - - append(tableName, - "{ \"id\": 1, \"dep\": \"emp-id-one\" }\n" + - "{ \"id\": 6, \"dep\": \"emp-id-6\" }"); - - createOrReplaceView("source", "id INT, dep STRING", - "{ \"id\": 2, \"dep\": \"emp-id-2\" }\n" + - "{ \"id\": 1, \"dep\": \"emp-id-1\" }\n" + - "{ \"id\": 6, \"dep\": \"emp-id-6\" }"); - - sql("MERGE INTO %s AS t USING source AS s " + - "ON t.id == s.id " + - "WHEN MATCHED AND t.id = 1 THEN " + - " UPDATE SET * " + - "WHEN MATCHED AND t.id = 6 THEN " + - " DELETE " + - "WHEN NOT MATCHED AND s.id = 2 THEN " + - " INSERT *", tableName); - - ImmutableList expectedRows = ImmutableList.of( - row(1, "emp-id-1"), // updated - row(2, "emp-id-2") // new - ); - assertEquals("Should have expected rows", expectedRows, sql("SELECT * FROM %s ORDER BY id", tableName)); + sql( + "ALTER TABLE %s SET TBLPROPERTIES('%s' '%s')", + tableName, WRITE_DISTRIBUTION_MODE, mode.modeName()); + + append( + tableName, + "{ \"id\": 1, \"dep\": \"emp-id-one\" }\n" + "{ \"id\": 6, \"dep\": \"emp-id-6\" }"); + + createOrReplaceView( + "source", + "id INT, dep STRING", + "{ \"id\": 2, \"dep\": \"emp-id-2\" }\n" + + "{ \"id\": 1, \"dep\": \"emp-id-1\" }\n" + + "{ \"id\": 6, \"dep\": \"emp-id-6\" }"); + + sql( + "MERGE INTO %s AS t USING source AS s " + + "ON t.id == s.id " + + "WHEN MATCHED AND t.id = 1 THEN " + + " UPDATE SET * " + + "WHEN MATCHED AND t.id = 6 THEN " + + " DELETE " + + "WHEN NOT MATCHED AND s.id = 2 THEN " + + " INSERT *", + tableName); + + ImmutableList expectedRows = + ImmutableList.of( + row(1, "emp-id-1"), // updated + row(2, "emp-id-2") // new + ); + assertEquals( + "Should have expected rows", + expectedRows, + sql("SELECT * FROM %s ORDER BY id", tableName)); removeTables(); } @@ -712,31 +864,41 @@ public void testMergeIntoPartitionedAndOrderedTable() { createAndInitTable("id INT, dep STRING"); sql("ALTER TABLE %s ADD PARTITION FIELD dep", tableName); sql("ALTER TABLE %s WRITE ORDERED BY (id)", tableName); - sql("ALTER TABLE %s SET TBLPROPERTIES('%s' '%s')", tableName, WRITE_DISTRIBUTION_MODE, mode.modeName()); - - append(tableName, - "{ \"id\": 1, \"dep\": \"emp-id-one\" }\n" + - "{ \"id\": 6, \"dep\": \"emp-id-6\" }"); - - createOrReplaceView("source", "id INT, dep STRING", - "{ \"id\": 2, \"dep\": \"emp-id-2\" }\n" + - "{ \"id\": 1, \"dep\": \"emp-id-1\" }\n" + - "{ \"id\": 6, \"dep\": \"emp-id-6\" }"); - - sql("MERGE INTO %s AS t USING source AS s " + - "ON t.id == s.id " + - "WHEN MATCHED AND t.id = 1 THEN " + - " UPDATE SET * " + - "WHEN MATCHED AND t.id = 6 THEN " + - " DELETE " + - "WHEN NOT MATCHED AND s.id = 2 THEN " + - " INSERT *", tableName); - - ImmutableList expectedRows = ImmutableList.of( - 
row(1, "emp-id-1"), // updated - row(2, "emp-id-2") // new - ); - assertEquals("Should have expected rows", expectedRows, sql("SELECT * FROM %s ORDER BY id", tableName)); + sql( + "ALTER TABLE %s SET TBLPROPERTIES('%s' '%s')", + tableName, WRITE_DISTRIBUTION_MODE, mode.modeName()); + + append( + tableName, + "{ \"id\": 1, \"dep\": \"emp-id-one\" }\n" + "{ \"id\": 6, \"dep\": \"emp-id-6\" }"); + + createOrReplaceView( + "source", + "id INT, dep STRING", + "{ \"id\": 2, \"dep\": \"emp-id-2\" }\n" + + "{ \"id\": 1, \"dep\": \"emp-id-1\" }\n" + + "{ \"id\": 6, \"dep\": \"emp-id-6\" }"); + + sql( + "MERGE INTO %s AS t USING source AS s " + + "ON t.id == s.id " + + "WHEN MATCHED AND t.id = 1 THEN " + + " UPDATE SET * " + + "WHEN MATCHED AND t.id = 6 THEN " + + " DELETE " + + "WHEN NOT MATCHED AND s.id = 2 THEN " + + " INSERT *", + tableName); + + ImmutableList expectedRows = + ImmutableList.of( + row(1, "emp-id-1"), // updated + row(2, "emp-id-2") // new + ); + assertEquals( + "Should have expected rows", + expectedRows, + sql("SELECT * FROM %s ORDER BY id", tableName)); removeTables(); } @@ -744,66 +906,75 @@ public void testMergeIntoPartitionedAndOrderedTable() { @Test public void testSelfMerge() { - createAndInitTable("id INT, v STRING", - "{ \"id\": 1, \"v\": \"v1\" }\n" + - "{ \"id\": 2, \"v\": \"v2\" }"); - - sql("MERGE INTO %s t USING %s s " + - "ON t.id == s.id " + - "WHEN MATCHED AND t.id = 1 THEN " + - " UPDATE SET v = 'x' " + - "WHEN NOT MATCHED THEN " + - " INSERT *", tableName, tableName); - - ImmutableList expectedRows = ImmutableList.of( - row(1, "x"), // updated - row(2, "v2") // kept - ); - assertEquals("Output should match", expectedRows, sql("SELECT * FROM %s ORDER BY id", tableName)); + createAndInitTable( + "id INT, v STRING", "{ \"id\": 1, \"v\": \"v1\" }\n" + "{ \"id\": 2, \"v\": \"v2\" }"); + + sql( + "MERGE INTO %s t USING %s s " + + "ON t.id == s.id " + + "WHEN MATCHED AND t.id = 1 THEN " + + " UPDATE SET v = 'x' " + + "WHEN NOT MATCHED THEN " + + " INSERT *", + tableName, tableName); + + ImmutableList expectedRows = + ImmutableList.of( + row(1, "x"), // updated + row(2, "v2") // kept + ); + assertEquals( + "Output should match", expectedRows, sql("SELECT * FROM %s ORDER BY id", tableName)); } @Test public void testSelfMergeWithCaching() { - createAndInitTable("id INT, v STRING", - "{ \"id\": 1, \"v\": \"v1\" }\n" + - "{ \"id\": 2, \"v\": \"v2\" }"); + createAndInitTable( + "id INT, v STRING", "{ \"id\": 1, \"v\": \"v1\" }\n" + "{ \"id\": 2, \"v\": \"v2\" }"); sql("CACHE TABLE %s", tableName); - sql("MERGE INTO %s t USING %s s " + - "ON t.id == s.id " + - "WHEN MATCHED AND t.id = 1 THEN " + - " UPDATE SET v = 'x' " + - "WHEN NOT MATCHED THEN " + - " INSERT *", tableName, tableName); - - ImmutableList expectedRows = ImmutableList.of( - row(1, "x"), // updated - row(2, "v2") // kept - ); - assertEquals("Output should match", expectedRows, sql("SELECT * FROM %s ORDER BY id", tableName)); + sql( + "MERGE INTO %s t USING %s s " + + "ON t.id == s.id " + + "WHEN MATCHED AND t.id = 1 THEN " + + " UPDATE SET v = 'x' " + + "WHEN NOT MATCHED THEN " + + " INSERT *", + tableName, tableName); + + ImmutableList expectedRows = + ImmutableList.of( + row(1, "x"), // updated + row(2, "v2") // kept + ); + assertEquals( + "Output should match", expectedRows, sql("SELECT * FROM %s ORDER BY id", tableName)); } @Test public void testMergeWithSourceAsSelfSubquery() { - createAndInitTable("id INT, v STRING", - "{ \"id\": 1, \"v\": \"v1\" }\n" + - "{ \"id\": 2, \"v\": \"v2\" }"); + 
createAndInitTable( + "id INT, v STRING", "{ \"id\": 1, \"v\": \"v1\" }\n" + "{ \"id\": 2, \"v\": \"v2\" }"); createOrReplaceView("source", Arrays.asList(1, null), Encoders.INT()); - sql("MERGE INTO %s t USING (SELECT id AS value FROM %s r JOIN source ON r.id = source.value) s " + - "ON t.id == s.value " + - "WHEN MATCHED AND t.id = 1 THEN " + - " UPDATE SET v = 'x' " + - "WHEN NOT MATCHED THEN " + - " INSERT (v, id) VALUES ('invalid', -1) ", tableName, tableName); - - ImmutableList expectedRows = ImmutableList.of( - row(1, "x"), // updated - row(2, "v2") // kept - ); - assertEquals("Output should match", expectedRows, sql("SELECT * FROM %s ORDER BY id", tableName)); + sql( + "MERGE INTO %s t USING (SELECT id AS value FROM %s r JOIN source ON r.id = source.value) s " + + "ON t.id == s.value " + + "WHEN MATCHED AND t.id = 1 THEN " + + " UPDATE SET v = 'x' " + + "WHEN NOT MATCHED THEN " + + " INSERT (v, id) VALUES ('invalid', -1) ", + tableName, tableName); + + ImmutableList expectedRows = + ImmutableList.of( + row(1, "x"), // updated + row(2, "v2") // kept + ); + assertEquals( + "Output should match", expectedRows, sql("SELECT * FROM %s ORDER BY id", tableName)); } @Test @@ -814,37 +985,46 @@ public synchronized void testMergeWithSerializableIsolation() throws Interrupted createAndInitTable("id INT, dep STRING"); createOrReplaceView("source", Collections.singletonList(1), Encoders.INT()); - sql("ALTER TABLE %s SET TBLPROPERTIES('%s' '%s')", tableName, MERGE_ISOLATION_LEVEL, "serializable"); + sql( + "ALTER TABLE %s SET TBLPROPERTIES('%s' '%s')", + tableName, MERGE_ISOLATION_LEVEL, "serializable"); - ExecutorService executorService = MoreExecutors.getExitingExecutorService( - (ThreadPoolExecutor) Executors.newFixedThreadPool(2)); + ExecutorService executorService = + MoreExecutors.getExitingExecutorService( + (ThreadPoolExecutor) Executors.newFixedThreadPool(2)); AtomicInteger barrier = new AtomicInteger(0); // merge thread - Future mergeFuture = executorService.submit(() -> { - for (int numOperations = 0; numOperations < Integer.MAX_VALUE; numOperations++) { - while (barrier.get() < numOperations * 2) { - sleep(10); - } - sql("MERGE INTO %s t USING source s " + - "ON t.id == s.value " + - "WHEN MATCHED THEN " + - " UPDATE SET dep = 'x'", tableName); - barrier.incrementAndGet(); - } - }); + Future mergeFuture = + executorService.submit( + () -> { + for (int numOperations = 0; numOperations < Integer.MAX_VALUE; numOperations++) { + while (barrier.get() < numOperations * 2) { + sleep(10); + } + sql( + "MERGE INTO %s t USING source s " + + "ON t.id == s.value " + + "WHEN MATCHED THEN " + + " UPDATE SET dep = 'x'", + tableName); + barrier.incrementAndGet(); + } + }); // append thread - Future appendFuture = executorService.submit(() -> { - for (int numOperations = 0; numOperations < Integer.MAX_VALUE; numOperations++) { - while (barrier.get() < numOperations * 2) { - sleep(10); - } - sql("INSERT INTO TABLE %s VALUES (1, 'hr')", tableName); - barrier.incrementAndGet(); - } - }); + Future appendFuture = + executorService.submit( + () -> { + for (int numOperations = 0; numOperations < Integer.MAX_VALUE; numOperations++) { + while (barrier.get() < numOperations * 2) { + sleep(10); + } + sql("INSERT INTO TABLE %s VALUES (1, 'hr')", tableName); + barrier.incrementAndGet(); + } + }); try { mergeFuture.get(); @@ -855,7 +1035,8 @@ public synchronized void testMergeWithSerializableIsolation() throws Interrupted Throwable validationException = sparkException.getCause(); 
Assert.assertThat(validationException, CoreMatchers.instanceOf(ValidationException.class)); String errMsg = validationException.getMessage(); - Assert.assertThat(errMsg, CoreMatchers.containsString("Found conflicting files that can contain")); + Assert.assertThat( + errMsg, CoreMatchers.containsString("Found conflicting files that can contain")); } finally { appendFuture.cancel(true); } @@ -865,44 +1046,54 @@ public synchronized void testMergeWithSerializableIsolation() throws Interrupted } @Test - public synchronized void testMergeWithSnapshotIsolation() throws InterruptedException, ExecutionException { + public synchronized void testMergeWithSnapshotIsolation() + throws InterruptedException, ExecutionException { // cannot run tests with concurrency for Hadoop tables without atomic renames Assume.assumeFalse(catalogName.equalsIgnoreCase("testhadoop")); createAndInitTable("id INT, dep STRING"); createOrReplaceView("source", Collections.singletonList(1), Encoders.INT()); - sql("ALTER TABLE %s SET TBLPROPERTIES('%s' '%s')", tableName, MERGE_ISOLATION_LEVEL, "snapshot"); + sql( + "ALTER TABLE %s SET TBLPROPERTIES('%s' '%s')", + tableName, MERGE_ISOLATION_LEVEL, "snapshot"); - ExecutorService executorService = MoreExecutors.getExitingExecutorService( - (ThreadPoolExecutor) Executors.newFixedThreadPool(2)); + ExecutorService executorService = + MoreExecutors.getExitingExecutorService( + (ThreadPoolExecutor) Executors.newFixedThreadPool(2)); AtomicInteger barrier = new AtomicInteger(0); // merge thread - Future mergeFuture = executorService.submit(() -> { - for (int numOperations = 0; numOperations < 20; numOperations++) { - while (barrier.get() < numOperations * 2) { - sleep(10); - } - sql("MERGE INTO %s t USING source s " + - "ON t.id == s.value " + - "WHEN MATCHED THEN " + - " UPDATE SET dep = 'x'", tableName); - barrier.incrementAndGet(); - } - }); + Future mergeFuture = + executorService.submit( + () -> { + for (int numOperations = 0; numOperations < 20; numOperations++) { + while (barrier.get() < numOperations * 2) { + sleep(10); + } + sql( + "MERGE INTO %s t USING source s " + + "ON t.id == s.value " + + "WHEN MATCHED THEN " + + " UPDATE SET dep = 'x'", + tableName); + barrier.incrementAndGet(); + } + }); // append thread - Future appendFuture = executorService.submit(() -> { - for (int numOperations = 0; numOperations < 20; numOperations++) { - while (barrier.get() < numOperations * 2) { - sleep(10); - } - sql("INSERT INTO TABLE %s VALUES (1, 'hr')", tableName); - barrier.incrementAndGet(); - } - }); + Future appendFuture = + executorService.submit( + () -> { + for (int numOperations = 0; numOperations < 20; numOperations++) { + while (barrier.get() < numOperations * 2) { + sleep(10); + } + sql("INSERT INTO TABLE %s VALUES (1, 'hr')", tableName); + barrier.incrementAndGet(); + } + }); try { mergeFuture.get(); @@ -916,175 +1107,195 @@ public synchronized void testMergeWithSnapshotIsolation() throws InterruptedExce @Test public void testMergeWithExtraColumnsInSource() { - createAndInitTable("id INT, v STRING", - "{ \"id\": 1, \"v\": \"v1\" }\n" + - "{ \"id\": 2, \"v\": \"v2\" }"); - createOrReplaceView("source", - "{ \"id\": 1, \"extra_col\": -1, \"v\": \"v1_1\" }\n" + - "{ \"id\": 3, \"extra_col\": -1, \"v\": \"v3\" }\n" + - "{ \"id\": 4, \"extra_col\": -1, \"v\": \"v4\" }"); - - sql("MERGE INTO %s t USING source " + - "ON t.id == source.id " + - "WHEN MATCHED THEN " + - " UPDATE SET v = source.v " + - "WHEN NOT MATCHED THEN " + - " INSERT (v, id) VALUES (source.v, source.id)", 
tableName); - - ImmutableList expectedRows = ImmutableList.of( - row(1, "v1_1"), // new - row(2, "v2"), // kept - row(3, "v3"), // new - row(4, "v4") // new - ); - assertEquals("Output should match", expectedRows, sql("SELECT * FROM %s ORDER BY id", tableName)); + createAndInitTable( + "id INT, v STRING", "{ \"id\": 1, \"v\": \"v1\" }\n" + "{ \"id\": 2, \"v\": \"v2\" }"); + createOrReplaceView( + "source", + "{ \"id\": 1, \"extra_col\": -1, \"v\": \"v1_1\" }\n" + + "{ \"id\": 3, \"extra_col\": -1, \"v\": \"v3\" }\n" + + "{ \"id\": 4, \"extra_col\": -1, \"v\": \"v4\" }"); + + sql( + "MERGE INTO %s t USING source " + + "ON t.id == source.id " + + "WHEN MATCHED THEN " + + " UPDATE SET v = source.v " + + "WHEN NOT MATCHED THEN " + + " INSERT (v, id) VALUES (source.v, source.id)", + tableName); + + ImmutableList expectedRows = + ImmutableList.of( + row(1, "v1_1"), // new + row(2, "v2"), // kept + row(3, "v3"), // new + row(4, "v4") // new + ); + assertEquals( + "Output should match", expectedRows, sql("SELECT * FROM %s ORDER BY id", tableName)); } @Test public void testMergeWithNullsInTargetAndSource() { - createAndInitTable("id INT, v STRING", - "{ \"id\": null, \"v\": \"v1\" }\n" + - "{ \"id\": 2, \"v\": \"v2\" }"); - - createOrReplaceView("source", - "{ \"id\": null, \"v\": \"v1_1\" }\n" + - "{ \"id\": 4, \"v\": \"v4\" }"); - - sql("MERGE INTO %s t USING source " + - "ON t.id == source.id " + - "WHEN MATCHED THEN " + - " UPDATE SET v = source.v " + - "WHEN NOT MATCHED THEN " + - " INSERT (v, id) VALUES (source.v, source.id)", tableName); - - ImmutableList expectedRows = ImmutableList.of( - row(null, "v1"), // kept - row(null, "v1_1"), // new - row(2, "v2"), // kept - row(4, "v4") // new - ); - assertEquals("Output should match", expectedRows, sql("SELECT * FROM %s ORDER BY v", tableName)); + createAndInitTable( + "id INT, v STRING", "{ \"id\": null, \"v\": \"v1\" }\n" + "{ \"id\": 2, \"v\": \"v2\" }"); + + createOrReplaceView( + "source", "{ \"id\": null, \"v\": \"v1_1\" }\n" + "{ \"id\": 4, \"v\": \"v4\" }"); + + sql( + "MERGE INTO %s t USING source " + + "ON t.id == source.id " + + "WHEN MATCHED THEN " + + " UPDATE SET v = source.v " + + "WHEN NOT MATCHED THEN " + + " INSERT (v, id) VALUES (source.v, source.id)", + tableName); + + ImmutableList expectedRows = + ImmutableList.of( + row(null, "v1"), // kept + row(null, "v1_1"), // new + row(2, "v2"), // kept + row(4, "v4") // new + ); + assertEquals( + "Output should match", expectedRows, sql("SELECT * FROM %s ORDER BY v", tableName)); } @Test public void testMergeWithNullSafeEquals() { - createAndInitTable("id INT, v STRING", - "{ \"id\": null, \"v\": \"v1\" }\n" + - "{ \"id\": 2, \"v\": \"v2\" }"); - - createOrReplaceView("source", - "{ \"id\": null, \"v\": \"v1_1\" }\n" + - "{ \"id\": 4, \"v\": \"v4\" }"); - - sql("MERGE INTO %s t USING source " + - "ON t.id <=> source.id " + - "WHEN MATCHED THEN " + - " UPDATE SET v = source.v " + - "WHEN NOT MATCHED THEN " + - " INSERT (v, id) VALUES (source.v, source.id)", tableName); - - ImmutableList expectedRows = ImmutableList.of( - row(null, "v1_1"), // updated - row(2, "v2"), // kept - row(4, "v4") // new - ); - assertEquals("Output should match", expectedRows, sql("SELECT * FROM %s ORDER BY v", tableName)); + createAndInitTable( + "id INT, v STRING", "{ \"id\": null, \"v\": \"v1\" }\n" + "{ \"id\": 2, \"v\": \"v2\" }"); + + createOrReplaceView( + "source", "{ \"id\": null, \"v\": \"v1_1\" }\n" + "{ \"id\": 4, \"v\": \"v4\" }"); + + sql( + "MERGE INTO %s t USING source " + + "ON t.id <=> 
source.id " + + "WHEN MATCHED THEN " + + " UPDATE SET v = source.v " + + "WHEN NOT MATCHED THEN " + + " INSERT (v, id) VALUES (source.v, source.id)", + tableName); + + ImmutableList expectedRows = + ImmutableList.of( + row(null, "v1_1"), // updated + row(2, "v2"), // kept + row(4, "v4") // new + ); + assertEquals( + "Output should match", expectedRows, sql("SELECT * FROM %s ORDER BY v", tableName)); } @Test public void testMergeWithNullCondition() { - createAndInitTable("id INT, v STRING", - "{ \"id\": null, \"v\": \"v1\" }\n" + - "{ \"id\": 2, \"v\": \"v2\" }"); - - createOrReplaceView("source", - "{ \"id\": null, \"v\": \"v1_1\" }\n" + - "{ \"id\": 2, \"v\": \"v2_2\" }"); - - sql("MERGE INTO %s t USING source " + - "ON t.id == source.id AND NULL " + - "WHEN MATCHED THEN " + - " UPDATE SET v = source.v " + - "WHEN NOT MATCHED THEN " + - " INSERT (v, id) VALUES (source.v, source.id)", tableName); - - ImmutableList expectedRows = ImmutableList.of( - row(null, "v1"), // kept - row(null, "v1_1"), // new - row(2, "v2"), // kept - row(2, "v2_2") // new - ); - assertEquals("Output should match", expectedRows, sql("SELECT * FROM %s ORDER BY v", tableName)); + createAndInitTable( + "id INT, v STRING", "{ \"id\": null, \"v\": \"v1\" }\n" + "{ \"id\": 2, \"v\": \"v2\" }"); + + createOrReplaceView( + "source", "{ \"id\": null, \"v\": \"v1_1\" }\n" + "{ \"id\": 2, \"v\": \"v2_2\" }"); + + sql( + "MERGE INTO %s t USING source " + + "ON t.id == source.id AND NULL " + + "WHEN MATCHED THEN " + + " UPDATE SET v = source.v " + + "WHEN NOT MATCHED THEN " + + " INSERT (v, id) VALUES (source.v, source.id)", + tableName); + + ImmutableList expectedRows = + ImmutableList.of( + row(null, "v1"), // kept + row(null, "v1_1"), // new + row(2, "v2"), // kept + row(2, "v2_2") // new + ); + assertEquals( + "Output should match", expectedRows, sql("SELECT * FROM %s ORDER BY v", tableName)); } @Test public void testMergeWithNullActionConditions() { - createAndInitTable("id INT, v STRING", - "{ \"id\": 1, \"v\": \"v1\" }\n" + - "{ \"id\": 2, \"v\": \"v2\" }"); + createAndInitTable( + "id INT, v STRING", "{ \"id\": 1, \"v\": \"v1\" }\n" + "{ \"id\": 2, \"v\": \"v2\" }"); - createOrReplaceView("source", - "{ \"id\": 1, \"v\": \"v1_1\" }\n" + - "{ \"id\": 2, \"v\": \"v2_2\" }\n" + - "{ \"id\": 3, \"v\": \"v3_3\" }"); + createOrReplaceView( + "source", + "{ \"id\": 1, \"v\": \"v1_1\" }\n" + + "{ \"id\": 2, \"v\": \"v2_2\" }\n" + + "{ \"id\": 3, \"v\": \"v3_3\" }"); // all conditions are NULL and will never match any rows - sql("MERGE INTO %s t USING source " + - "ON t.id == source.id " + - "WHEN MATCHED AND source.id = 1 AND NULL THEN " + - " UPDATE SET v = source.v " + - "WHEN MATCHED AND source.v = 'v1_1' AND NULL THEN " + - " DELETE " + - "WHEN NOT MATCHED AND source.id = 3 AND NULL THEN " + - " INSERT (v, id) VALUES (source.v, source.id)", tableName); - - ImmutableList expectedRows1 = ImmutableList.of( - row(1, "v1"), // kept - row(2, "v2") // kept - ); - assertEquals("Output should match", expectedRows1, sql("SELECT * FROM %s ORDER BY v", tableName)); + sql( + "MERGE INTO %s t USING source " + + "ON t.id == source.id " + + "WHEN MATCHED AND source.id = 1 AND NULL THEN " + + " UPDATE SET v = source.v " + + "WHEN MATCHED AND source.v = 'v1_1' AND NULL THEN " + + " DELETE " + + "WHEN NOT MATCHED AND source.id = 3 AND NULL THEN " + + " INSERT (v, id) VALUES (source.v, source.id)", + tableName); + + ImmutableList expectedRows1 = + ImmutableList.of( + row(1, "v1"), // kept + row(2, "v2") // kept + ); + assertEquals( + "Output 
should match", expectedRows1, sql("SELECT * FROM %s ORDER BY v", tableName)); // only the update and insert conditions are NULL - sql("MERGE INTO %s t USING source " + - "ON t.id == source.id " + - "WHEN MATCHED AND source.id = 1 AND NULL THEN " + - " UPDATE SET v = source.v " + - "WHEN MATCHED AND source.v = 'v1_1' THEN " + - " DELETE " + - "WHEN NOT MATCHED AND source.id = 3 AND NULL THEN " + - " INSERT (v, id) VALUES (source.v, source.id)", tableName); - - ImmutableList expectedRows2 = ImmutableList.of( - row(2, "v2") // kept - ); - assertEquals("Output should match", expectedRows2, sql("SELECT * FROM %s ORDER BY v", tableName)); + sql( + "MERGE INTO %s t USING source " + + "ON t.id == source.id " + + "WHEN MATCHED AND source.id = 1 AND NULL THEN " + + " UPDATE SET v = source.v " + + "WHEN MATCHED AND source.v = 'v1_1' THEN " + + " DELETE " + + "WHEN NOT MATCHED AND source.id = 3 AND NULL THEN " + + " INSERT (v, id) VALUES (source.v, source.id)", + tableName); + + ImmutableList expectedRows2 = + ImmutableList.of( + row(2, "v2") // kept + ); + assertEquals( + "Output should match", expectedRows2, sql("SELECT * FROM %s ORDER BY v", tableName)); } @Test public void testMergeWithMultipleMatchingActions() { - createAndInitTable("id INT, v STRING", - "{ \"id\": 1, \"v\": \"v1\" }\n" + - "{ \"id\": 2, \"v\": \"v2\" }"); + createAndInitTable( + "id INT, v STRING", "{ \"id\": 1, \"v\": \"v1\" }\n" + "{ \"id\": 2, \"v\": \"v2\" }"); - createOrReplaceView("source", - "{ \"id\": 1, \"v\": \"v1_1\" }\n" + - "{ \"id\": 2, \"v\": \"v2_2\" }"); + createOrReplaceView( + "source", "{ \"id\": 1, \"v\": \"v1_1\" }\n" + "{ \"id\": 2, \"v\": \"v2_2\" }"); // the order of match actions is important in this case - sql("MERGE INTO %s t USING source " + - "ON t.id == source.id " + - "WHEN MATCHED AND source.id = 1 THEN " + - " UPDATE SET v = source.v " + - "WHEN MATCHED AND source.v = 'v1_1' THEN " + - " DELETE " + - "WHEN NOT MATCHED THEN " + - " INSERT (v, id) VALUES (source.v, source.id)", tableName); - - ImmutableList expectedRows = ImmutableList.of( - row(1, "v1_1"), // updated (also matches the delete cond but update is first) - row(2, "v2") // kept (matches neither the update nor the delete cond) - ); - assertEquals("Output should match", expectedRows, sql("SELECT * FROM %s ORDER BY v", tableName)); + sql( + "MERGE INTO %s t USING source " + + "ON t.id == source.id " + + "WHEN MATCHED AND source.id = 1 THEN " + + " UPDATE SET v = source.v " + + "WHEN MATCHED AND source.v = 'v1_1' THEN " + + " DELETE " + + "WHEN NOT MATCHED THEN " + + " INSERT (v, id) VALUES (source.v, source.id)", + tableName); + + ImmutableList expectedRows = + ImmutableList.of( + row(1, "v1_1"), // updated (also matches the delete cond but update is first) + row(2, "v2") // kept (matches neither the update nor the delete cond) + ); + assertEquals( + "Output should match", expectedRows, sql("SELECT * FROM %s ORDER BY v", tableName)); } @Test @@ -1094,7 +1305,9 @@ public void testMergeWithMultipleRowGroupsParquet() throws NoSuchTableException createAndInitTable("id INT, dep STRING"); sql("ALTER TABLE %s ADD PARTITION FIELD dep", tableName); - sql("ALTER TABLE %s SET TBLPROPERTIES('%s' '%d')", tableName, PARQUET_ROW_GROUP_SIZE_BYTES, 100); + sql( + "ALTER TABLE %s SET TBLPROPERTIES('%s' '%d')", + tableName, PARQUET_ROW_GROUP_SIZE_BYTES, 100); sql("ALTER TABLE %s SET TBLPROPERTIES('%s' '%d')", tableName, SPLIT_SIZE, 100); createOrReplaceView("source", Collections.singletonList(1), Encoders.INT()); @@ -1103,85 +1316,103 @@ public void 
testMergeWithMultipleRowGroupsParquet() throws NoSuchTableException for (int id = 1; id <= 200; id++) { ids.add(id); } - Dataset df = spark.createDataset(ids, Encoders.INT()) - .withColumnRenamed("value", "id") - .withColumn("dep", lit("hr")); + Dataset df = + spark + .createDataset(ids, Encoders.INT()) + .withColumnRenamed("value", "id") + .withColumn("dep", lit("hr")); df.coalesce(1).writeTo(tableName).append(); Assert.assertEquals(200, spark.table(tableName).count()); // update a record from one of two row groups and copy over the second one - sql("MERGE INTO %s t USING source " + - "ON t.id == source.value " + - "WHEN MATCHED THEN " + - " UPDATE SET dep = 'x'", tableName); + sql( + "MERGE INTO %s t USING source " + + "ON t.id == source.value " + + "WHEN MATCHED THEN " + + " UPDATE SET dep = 'x'", + tableName); Assert.assertEquals(200, spark.table(tableName).count()); } @Test public void testMergeInsertOnly() { - createAndInitTable("id STRING, v STRING", - "{ \"id\": \"a\", \"v\": \"v1\" }\n" + - "{ \"id\": \"b\", \"v\": \"v2\" }"); - createOrReplaceView("source", - "{ \"id\": \"a\", \"v\": \"v1_1\" }\n" + - "{ \"id\": \"a\", \"v\": \"v1_2\" }\n" + - "{ \"id\": \"c\", \"v\": \"v3\" }\n" + - "{ \"id\": \"d\", \"v\": \"v4_1\" }\n" + - "{ \"id\": \"d\", \"v\": \"v4_2\" }"); - - sql("MERGE INTO %s t USING source " + - "ON t.id == source.id " + - "WHEN NOT MATCHED THEN " + - " INSERT *", tableName); - - ImmutableList expectedRows = ImmutableList.of( - row("a", "v1"), // kept - row("b", "v2"), // kept - row("c", "v3"), // new - row("d", "v4_1"), // new - row("d", "v4_2") // new - ); - assertEquals("Output should match", expectedRows, sql("SELECT * FROM %s ORDER BY id", tableName)); + createAndInitTable( + "id STRING, v STRING", + "{ \"id\": \"a\", \"v\": \"v1\" }\n" + "{ \"id\": \"b\", \"v\": \"v2\" }"); + createOrReplaceView( + "source", + "{ \"id\": \"a\", \"v\": \"v1_1\" }\n" + + "{ \"id\": \"a\", \"v\": \"v1_2\" }\n" + + "{ \"id\": \"c\", \"v\": \"v3\" }\n" + + "{ \"id\": \"d\", \"v\": \"v4_1\" }\n" + + "{ \"id\": \"d\", \"v\": \"v4_2\" }"); + + sql( + "MERGE INTO %s t USING source " + + "ON t.id == source.id " + + "WHEN NOT MATCHED THEN " + + " INSERT *", + tableName); + + ImmutableList expectedRows = + ImmutableList.of( + row("a", "v1"), // kept + row("b", "v2"), // kept + row("c", "v3"), // new + row("d", "v4_1"), // new + row("d", "v4_2") // new + ); + assertEquals( + "Output should match", expectedRows, sql("SELECT * FROM %s ORDER BY id", tableName)); } @Test public void testMergeInsertOnlyWithCondition() { createAndInitTable("id INTEGER, v INTEGER", "{ \"id\": 1, \"v\": 1 }"); - createOrReplaceView("source", - "{ \"id\": 1, \"v\": 11, \"is_new\": true }\n" + - "{ \"id\": 2, \"v\": 21, \"is_new\": true }\n" + - "{ \"id\": 2, \"v\": 22, \"is_new\": false }"); + createOrReplaceView( + "source", + "{ \"id\": 1, \"v\": 11, \"is_new\": true }\n" + + "{ \"id\": 2, \"v\": 21, \"is_new\": true }\n" + + "{ \"id\": 2, \"v\": 22, \"is_new\": false }"); // validate assignments are reordered to match the table attrs - sql("MERGE INTO %s t USING source s " + - "ON t.id == s.id " + - "WHEN NOT MATCHED AND is_new = TRUE THEN " + - " INSERT (v, id) VALUES (s.v + 100, s.id)", tableName); - - ImmutableList expectedRows = ImmutableList.of( - row(1, 1), // kept - row(2, 121) // new - ); - assertEquals("Output should match", expectedRows, sql("SELECT * FROM %s ORDER BY id", tableName)); + sql( + "MERGE INTO %s t USING source s " + + "ON t.id == s.id " + + "WHEN NOT MATCHED AND is_new = TRUE THEN " + + " 
INSERT (v, id) VALUES (s.v + 100, s.id)", + tableName); + + ImmutableList expectedRows = + ImmutableList.of( + row(1, 1), // kept + row(2, 121) // new + ); + assertEquals( + "Output should match", expectedRows, sql("SELECT * FROM %s ORDER BY id", tableName)); } @Test public void testMergeAlignsUpdateAndInsertActions() { createAndInitTable("id INT, a INT, b STRING", "{ \"id\": 1, \"a\": 2, \"b\": \"str\" }"); - createOrReplaceView("source", - "{ \"id\": 1, \"c1\": -2, \"c2\": \"new_str_1\" }\n" + - "{ \"id\": 2, \"c1\": -20, \"c2\": \"new_str_2\" }"); - - sql("MERGE INTO %s t USING source " + - "ON t.id == source.id " + - "WHEN MATCHED THEN " + - " UPDATE SET b = c2, a = c1, t.id = source.id " + - "WHEN NOT MATCHED THEN " + - " INSERT (b, a, id) VALUES (c2, c1, id)", tableName); - - assertEquals("Output should match", + createOrReplaceView( + "source", + "{ \"id\": 1, \"c1\": -2, \"c2\": \"new_str_1\" }\n" + + "{ \"id\": 2, \"c1\": -20, \"c2\": \"new_str_2\" }"); + + sql( + "MERGE INTO %s t USING source " + + "ON t.id == source.id " + + "WHEN MATCHED THEN " + + " UPDATE SET b = c2, a = c1, t.id = source.id " + + "WHEN NOT MATCHED THEN " + + " INSERT (b, a, id) VALUES (c2, c1, id)", + tableName); + + assertEquals( + "Output should match", ImmutableList.of(row(1, -2, "new_str_1"), row(2, -20, "new_str_2")), sql("SELECT * FROM %s ORDER BY id", tableName)); } @@ -1191,15 +1422,17 @@ public void testMergeMixedCaseAlignsUpdateAndInsertActions() { createAndInitTable("id INT, a INT, b STRING", "{ \"id\": 1, \"a\": 2, \"b\": \"str\" }"); createOrReplaceView( "source", - "{ \"id\": 1, \"c1\": -2, \"c2\": \"new_str_1\" }\n" + - "{ \"id\": 2, \"c1\": -20, \"c2\": \"new_str_2\" }"); - - sql("MERGE INTO %s t USING source " + - "ON t.iD == source.Id " + - "WHEN MATCHED THEN " + - " UPDATE SET B = c2, A = c1, t.Id = source.ID " + - "WHEN NOT MATCHED THEN " + - " INSERT (b, A, iD) VALUES (c2, c1, id)", tableName); + "{ \"id\": 1, \"c1\": -2, \"c2\": \"new_str_1\" }\n" + + "{ \"id\": 2, \"c1\": -20, \"c2\": \"new_str_2\" }"); + + sql( + "MERGE INTO %s t USING source " + + "ON t.iD == source.Id " + + "WHEN MATCHED THEN " + + " UPDATE SET B = c2, A = c1, t.Id = source.ID " + + "WHEN NOT MATCHED THEN " + + " INSERT (b, A, iD) VALUES (c2, c1, id)", + tableName); assertEquals( "Output should match", @@ -1218,37 +1451,47 @@ public void testMergeMixedCaseAlignsUpdateAndInsertActions() { @Test public void testMergeUpdatesNestedStructFields() { - createAndInitTable("id INT, s STRUCT,m:MAP>>", + createAndInitTable( + "id INT, s STRUCT,m:MAP>>", "{ \"id\": 1, \"s\": { \"c1\": 2, \"c2\": { \"a\": [1,2], \"m\": { \"a\": \"b\"} } } } }"); createOrReplaceView("source", "{ \"id\": 1, \"c1\": -2 }"); // update primitive, array, map columns inside a struct - sql("MERGE INTO %s t USING source " + - "ON t.id == source.id " + - "WHEN MATCHED THEN " + - " UPDATE SET t.s.c1 = source.c1, t.s.c2.a = array(-1, -2), t.s.c2.m = map('k', 'v')", tableName); + sql( + "MERGE INTO %s t USING source " + + "ON t.id == source.id " + + "WHEN MATCHED THEN " + + " UPDATE SET t.s.c1 = source.c1, t.s.c2.a = array(-1, -2), t.s.c2.m = map('k', 'v')", + tableName); - assertEquals("Output should match", + assertEquals( + "Output should match", ImmutableList.of(row(1, row(-2, row(ImmutableList.of(-1, -2), ImmutableMap.of("k", "v"))))), sql("SELECT * FROM %s ORDER BY id", tableName)); // set primitive, array, map columns to NULL (proper casts should be in place) - sql("MERGE INTO %s t USING source " + - "ON t.id == source.id " + - "WHEN MATCHED THEN " + 
- " UPDATE SET t.s.c1 = NULL, t.s.c2 = NULL", tableName); + sql( + "MERGE INTO %s t USING source " + + "ON t.id == source.id " + + "WHEN MATCHED THEN " + + " UPDATE SET t.s.c1 = NULL, t.s.c2 = NULL", + tableName); - assertEquals("Output should match", + assertEquals( + "Output should match", ImmutableList.of(row(1, row(null, null))), sql("SELECT * FROM %s ORDER BY id", tableName)); // update all fields in a struct - sql("MERGE INTO %s t USING source " + - "ON t.id == source.id " + - "WHEN MATCHED THEN " + - " UPDATE SET t.s = named_struct('c1', 100, 'c2', named_struct('a', array(1), 'm', map('x', 'y')))", tableName); + sql( + "MERGE INTO %s t USING source " + + "ON t.id == source.id " + + "WHEN MATCHED THEN " + + " UPDATE SET t.s = named_struct('c1', 100, 'c2', named_struct('a', array(1), 'm', map('x', 'y')))", + tableName); - assertEquals("Output should match", + assertEquals( + "Output should match", ImmutableList.of(row(1, row(100, row(ImmutableList.of(1), ImmutableMap.of("x", "y"))))), sql("SELECT * FROM %s ORDER BY id", tableName)); } @@ -1259,12 +1502,15 @@ public void testMergeWithInferredCasts() { createOrReplaceView("source", "{ \"id\": 1, \"c1\": -2}"); // -2 in source should be casted to "-2" in target - sql("MERGE INTO %s t USING source " + - "ON t.id == source.id " + - "WHEN MATCHED THEN " + - " UPDATE SET t.s = source.c1", tableName); + sql( + "MERGE INTO %s t USING source " + + "ON t.id == source.id " + + "WHEN MATCHED THEN " + + " UPDATE SET t.s = source.c1", + tableName); - assertEquals("Output should match", + assertEquals( + "Output should match", ImmutableList.of(row(1, "-2")), sql("SELECT * FROM %s ORDER BY id", tableName)); } @@ -1274,12 +1520,15 @@ public void testMergeModifiesNullStruct() { createAndInitTable("id INT, s STRUCT", "{ \"id\": 1, \"s\": null }"); createOrReplaceView("source", "{ \"id\": 1, \"n1\": -10 }"); - sql("MERGE INTO %s t USING source s " + - "ON t.id == s.id " + - "WHEN MATCHED THEN " + - " UPDATE SET t.s.n1 = s.n1", tableName); + sql( + "MERGE INTO %s t USING source s " + + "ON t.id == s.id " + + "WHEN MATCHED THEN " + + " UPDATE SET t.s.n1 = s.n1", + tableName); - assertEquals("Output should match", + assertEquals( + "Output should match", ImmutableList.of(row(1, row(-10, null))), sql("SELECT * FROM %s", tableName)); } @@ -1294,18 +1543,18 @@ public void testMergeRefreshesRelationCache() { spark.sql("CACHE TABLE tmp"); - assertEquals("View should have correct data", - ImmutableList.of(row("n1")), - sql("SELECT * FROM tmp")); + assertEquals( + "View should have correct data", ImmutableList.of(row("n1")), sql("SELECT * FROM tmp")); - sql("MERGE INTO %s t USING source s " + - "ON t.id == s.id " + - "WHEN MATCHED THEN " + - " UPDATE SET t.name = s.name", tableName); + sql( + "MERGE INTO %s t USING source s " + + "ON t.id == s.id " + + "WHEN MATCHED THEN " + + " UPDATE SET t.name = s.name", + tableName); - assertEquals("View should have correct data", - ImmutableList.of(row("n2")), - sql("SELECT * FROM tmp")); + assertEquals( + "View should have correct data", ImmutableList.of(row("n2")), sql("SELECT * FROM tmp")); spark.sql("UNCACHE TABLE tmp"); } @@ -1314,75 +1563,95 @@ public void testMergeRefreshesRelationCache() { public void testMergeWithMultipleNotMatchedActions() { createAndInitTable("id INT, dep STRING", "{ \"id\": 0, \"dep\": \"emp-id-0\" }"); - createOrReplaceView("source", "id INT, dep STRING", - "{ \"id\": 1, \"dep\": \"emp-id-1\" }\n" + - "{ \"id\": 2, \"dep\": \"emp-id-2\" }\n" + - "{ \"id\": 3, \"dep\": \"emp-id-3\" }"); - - 
sql("MERGE INTO %s AS t USING source AS s " + - "ON t.id == s.id " + - "WHEN NOT MATCHED AND s.id = 1 THEN " + - " INSERT (dep, id) VALUES (s.dep, -1)" + - "WHEN NOT MATCHED THEN " + - " INSERT *", tableName); - - ImmutableList expectedRows = ImmutableList.of( - row(-1, "emp-id-1"), // new - row(0, "emp-id-0"), // kept - row(2, "emp-id-2"), // new - row(3, "emp-id-3") // new - ); - assertEquals("Should have expected rows", expectedRows, sql("SELECT * FROM %s ORDER BY id", tableName)); + createOrReplaceView( + "source", + "id INT, dep STRING", + "{ \"id\": 1, \"dep\": \"emp-id-1\" }\n" + + "{ \"id\": 2, \"dep\": \"emp-id-2\" }\n" + + "{ \"id\": 3, \"dep\": \"emp-id-3\" }"); + + sql( + "MERGE INTO %s AS t USING source AS s " + + "ON t.id == s.id " + + "WHEN NOT MATCHED AND s.id = 1 THEN " + + " INSERT (dep, id) VALUES (s.dep, -1)" + + "WHEN NOT MATCHED THEN " + + " INSERT *", + tableName); + + ImmutableList expectedRows = + ImmutableList.of( + row(-1, "emp-id-1"), // new + row(0, "emp-id-0"), // kept + row(2, "emp-id-2"), // new + row(3, "emp-id-3") // new + ); + assertEquals( + "Should have expected rows", expectedRows, sql("SELECT * FROM %s ORDER BY id", tableName)); } @Test public void testMergeWithMultipleConditionalNotMatchedActions() { createAndInitTable("id INT, dep STRING", "{ \"id\": 0, \"dep\": \"emp-id-0\" }"); - createOrReplaceView("source", "id INT, dep STRING", - "{ \"id\": 1, \"dep\": \"emp-id-1\" }\n" + - "{ \"id\": 2, \"dep\": \"emp-id-2\" }\n" + - "{ \"id\": 3, \"dep\": \"emp-id-3\" }"); - - sql("MERGE INTO %s AS t USING source AS s " + - "ON t.id == s.id " + - "WHEN NOT MATCHED AND s.id = 1 THEN " + - " INSERT (dep, id) VALUES (s.dep, -1)" + - "WHEN NOT MATCHED AND s.id = 2 THEN " + - " INSERT *", tableName); - - ImmutableList expectedRows = ImmutableList.of( - row(-1, "emp-id-1"), // new - row(0, "emp-id-0"), // kept - row(2, "emp-id-2") // new - ); - assertEquals("Should have expected rows", expectedRows, sql("SELECT * FROM %s ORDER BY id", tableName)); + createOrReplaceView( + "source", + "id INT, dep STRING", + "{ \"id\": 1, \"dep\": \"emp-id-1\" }\n" + + "{ \"id\": 2, \"dep\": \"emp-id-2\" }\n" + + "{ \"id\": 3, \"dep\": \"emp-id-3\" }"); + + sql( + "MERGE INTO %s AS t USING source AS s " + + "ON t.id == s.id " + + "WHEN NOT MATCHED AND s.id = 1 THEN " + + " INSERT (dep, id) VALUES (s.dep, -1)" + + "WHEN NOT MATCHED AND s.id = 2 THEN " + + " INSERT *", + tableName); + + ImmutableList expectedRows = + ImmutableList.of( + row(-1, "emp-id-1"), // new + row(0, "emp-id-0"), // kept + row(2, "emp-id-2") // new + ); + assertEquals( + "Should have expected rows", expectedRows, sql("SELECT * FROM %s ORDER BY id", tableName)); } @Test public void testMergeResolvesColumnsByName() { - createAndInitTable("id INT, badge INT, dep STRING", - "{ \"id\": 1, \"badge\": 1000, \"dep\": \"emp-id-one\" }\n" + - "{ \"id\": 6, \"badge\": 6000, \"dep\": \"emp-id-6\" }"); - - createOrReplaceView("source", "badge INT, id INT, dep STRING", - "{ \"badge\": 1001, \"id\": 1, \"dep\": \"emp-id-1\" }\n" + - "{ \"badge\": 6006, \"id\": 6, \"dep\": \"emp-id-6\" }\n" + - "{ \"badge\": 7007, \"id\": 7, \"dep\": \"emp-id-7\" }"); - - sql("MERGE INTO %s AS t USING source AS s " + - "ON t.id == s.id " + - "WHEN MATCHED THEN " + - " UPDATE SET * " + - "WHEN NOT MATCHED THEN " + - " INSERT * ", tableName); - - ImmutableList expectedRows = ImmutableList.of( - row(1, 1001, "emp-id-1"), // updated - row(6, 6006, "emp-id-6"), // updated - row(7, 7007, "emp-id-7") // new - ); - assertEquals("Should have expected 
rows", expectedRows, + createAndInitTable( + "id INT, badge INT, dep STRING", + "{ \"id\": 1, \"badge\": 1000, \"dep\": \"emp-id-one\" }\n" + + "{ \"id\": 6, \"badge\": 6000, \"dep\": \"emp-id-6\" }"); + + createOrReplaceView( + "source", + "badge INT, id INT, dep STRING", + "{ \"badge\": 1001, \"id\": 1, \"dep\": \"emp-id-1\" }\n" + + "{ \"badge\": 6006, \"id\": 6, \"dep\": \"emp-id-6\" }\n" + + "{ \"badge\": 7007, \"id\": 7, \"dep\": \"emp-id-7\" }"); + + sql( + "MERGE INTO %s AS t USING source AS s " + + "ON t.id == s.id " + + "WHEN MATCHED THEN " + + " UPDATE SET * " + + "WHEN NOT MATCHED THEN " + + " INSERT * ", + tableName); + + ImmutableList expectedRows = + ImmutableList.of( + row(1, 1001, "emp-id-1"), // updated + row(6, 6006, "emp-id-6"), // updated + row(7, 7007, "emp-id-7") // new + ); + assertEquals( + "Should have expected rows", + expectedRows, sql("SELECT id, badge, dep FROM %s ORDER BY id", tableName)); } @@ -1392,24 +1661,30 @@ public void testMergeShouldResolveWhenThereAreNoUnresolvedExpressionsOrColumns() // or otherwise unresolved expressions exist in the query (testing SPARK-34962) createAndInitTable("id INT, dep STRING"); - createOrReplaceView("source", "id INT, dep STRING", - "{ \"id\": 1, \"dep\": \"emp-id-1\" }\n" + - "{ \"id\": 2, \"dep\": \"emp-id-2\" }\n" + - "{ \"id\": 3, \"dep\": \"emp-id-3\" }"); - - sql("MERGE INTO %s AS t USING source AS s " + - "ON 1 != 1 " + - "WHEN MATCHED THEN " + - " UPDATE SET * " + - "WHEN NOT MATCHED THEN " + - " INSERT *", tableName); - - ImmutableList expectedRows = ImmutableList.of( - row(1, "emp-id-1"), // new - row(2, "emp-id-2"), // new - row(3, "emp-id-3") // new - ); - assertEquals("Should have expected rows", expectedRows, sql("SELECT * FROM %s ORDER BY id", tableName)); + createOrReplaceView( + "source", + "id INT, dep STRING", + "{ \"id\": 1, \"dep\": \"emp-id-1\" }\n" + + "{ \"id\": 2, \"dep\": \"emp-id-2\" }\n" + + "{ \"id\": 3, \"dep\": \"emp-id-3\" }"); + + sql( + "MERGE INTO %s AS t USING source AS s " + + "ON 1 != 1 " + + "WHEN MATCHED THEN " + + " UPDATE SET * " + + "WHEN NOT MATCHED THEN " + + " INSERT *", + tableName); + + ImmutableList expectedRows = + ImmutableList.of( + row(1, "emp-id-1"), // new + row(2, "emp-id-2"), // new + row(3, "emp-id-3") // new + ); + assertEquals( + "Should have expected rows", expectedRows, sql("SELECT * FROM %s ORDER BY id", tableName)); } @Test @@ -1417,33 +1692,45 @@ public void testMergeWithNonExistingColumns() { createAndInitTable("id INT, c STRUCT>"); createOrReplaceView("source", "{ \"c1\": -100, \"c2\": -200 }"); - AssertHelpers.assertThrows("Should complain about the invalid top-level column", - AnalysisException.class, "cannot resolve t.invalid_col", + AssertHelpers.assertThrows( + "Should complain about the invalid top-level column", + AnalysisException.class, + "cannot resolve t.invalid_col", () -> { - sql("MERGE INTO %s t USING source s " + - "ON t.id == s.c1 " + - "WHEN MATCHED THEN " + - " UPDATE SET t.invalid_col = s.c2", tableName); + sql( + "MERGE INTO %s t USING source s " + + "ON t.id == s.c1 " + + "WHEN MATCHED THEN " + + " UPDATE SET t.invalid_col = s.c2", + tableName); }); - AssertHelpers.assertThrows("Should complain about the invalid nested column", - AnalysisException.class, "No such struct field invalid_col", + AssertHelpers.assertThrows( + "Should complain about the invalid nested column", + AnalysisException.class, + "No such struct field invalid_col", () -> { - sql("MERGE INTO %s t USING source s " + - "ON t.id == s.c1 " + - "WHEN MATCHED THEN " + - " 
UPDATE SET t.c.n2.invalid_col = s.c2", tableName); + sql( + "MERGE INTO %s t USING source s " + + "ON t.id == s.c1 " + + "WHEN MATCHED THEN " + + " UPDATE SET t.c.n2.invalid_col = s.c2", + tableName); }); - AssertHelpers.assertThrows("Should complain about the invalid top-level column", - AnalysisException.class, "cannot resolve invalid_col", + AssertHelpers.assertThrows( + "Should complain about the invalid top-level column", + AnalysisException.class, + "cannot resolve invalid_col", () -> { - sql("MERGE INTO %s t USING source s " + - "ON t.id == s.c1 " + - "WHEN MATCHED THEN " + - " UPDATE SET t.c.n2.dn1 = s.c2 " + - "WHEN NOT MATCHED THEN " + - " INSERT (id, invalid_col) VALUES (s.c1, null)", tableName); + sql( + "MERGE INTO %s t USING source s " + + "ON t.id == s.c1 " + + "WHEN MATCHED THEN " + + " UPDATE SET t.c.n2.dn1 = s.c2 " + + "WHEN NOT MATCHED THEN " + + " INSERT (id, invalid_col) VALUES (s.c1, null)", + tableName); }); } @@ -1452,35 +1739,47 @@ public void testMergeWithInvalidColumnsInInsert() { createAndInitTable("id INT, c STRUCT>"); createOrReplaceView("source", "{ \"c1\": -100, \"c2\": -200 }"); - AssertHelpers.assertThrows("Should complain about the nested column", - AnalysisException.class, "Nested fields are not supported inside INSERT clauses", + AssertHelpers.assertThrows( + "Should complain about the nested column", + AnalysisException.class, + "Nested fields are not supported inside INSERT clauses", () -> { - sql("MERGE INTO %s t USING source s " + - "ON t.id == s.c1 " + - "WHEN MATCHED THEN " + - " UPDATE SET t.c.n2.dn1 = s.c2 " + - "WHEN NOT MATCHED THEN " + - " INSERT (id, c.n2) VALUES (s.c1, null)", tableName); + sql( + "MERGE INTO %s t USING source s " + + "ON t.id == s.c1 " + + "WHEN MATCHED THEN " + + " UPDATE SET t.c.n2.dn1 = s.c2 " + + "WHEN NOT MATCHED THEN " + + " INSERT (id, c.n2) VALUES (s.c1, null)", + tableName); }); - AssertHelpers.assertThrows("Should complain about duplicate columns", - AnalysisException.class, "Duplicate column names inside INSERT clause", + AssertHelpers.assertThrows( + "Should complain about duplicate columns", + AnalysisException.class, + "Duplicate column names inside INSERT clause", () -> { - sql("MERGE INTO %s t USING source s " + - "ON t.id == s.c1 " + - "WHEN MATCHED THEN " + - " UPDATE SET t.c.n2.dn1 = s.c2 " + - "WHEN NOT MATCHED THEN " + - " INSERT (id, id) VALUES (s.c1, null)", tableName); + sql( + "MERGE INTO %s t USING source s " + + "ON t.id == s.c1 " + + "WHEN MATCHED THEN " + + " UPDATE SET t.c.n2.dn1 = s.c2 " + + "WHEN NOT MATCHED THEN " + + " INSERT (id, id) VALUES (s.c1, null)", + tableName); }); - AssertHelpers.assertThrows("Should complain about missing columns", - AnalysisException.class, "must provide values for all columns of the target table", + AssertHelpers.assertThrows( + "Should complain about missing columns", + AnalysisException.class, + "must provide values for all columns of the target table", () -> { - sql("MERGE INTO %s t USING source s " + - "ON t.id == s.c1 " + - "WHEN NOT MATCHED THEN " + - " INSERT (id) VALUES (s.c1)", tableName); + sql( + "MERGE INTO %s t USING source s " + + "ON t.id == s.c1 " + + "WHEN NOT MATCHED THEN " + + " INSERT (id) VALUES (s.c1)", + tableName); }); } @@ -1489,22 +1788,30 @@ public void testMergeWithInvalidUpdates() { createAndInitTable("id INT, a ARRAY>, m MAP"); createOrReplaceView("source", "{ \"c1\": -100, \"c2\": -200 }"); - AssertHelpers.assertThrows("Should complain about updating an array column", - AnalysisException.class, "Updating nested fields is only 
supported for structs", + AssertHelpers.assertThrows( + "Should complain about updating an array column", + AnalysisException.class, + "Updating nested fields is only supported for structs", () -> { - sql("MERGE INTO %s t USING source s " + - "ON t.id == s.c1 " + - "WHEN MATCHED THEN " + - " UPDATE SET t.a.c1 = s.c2", tableName); + sql( + "MERGE INTO %s t USING source s " + + "ON t.id == s.c1 " + + "WHEN MATCHED THEN " + + " UPDATE SET t.a.c1 = s.c2", + tableName); }); - AssertHelpers.assertThrows("Should complain about updating a map column", - AnalysisException.class, "Updating nested fields is only supported for structs", + AssertHelpers.assertThrows( + "Should complain about updating a map column", + AnalysisException.class, + "Updating nested fields is only supported for structs", () -> { - sql("MERGE INTO %s t USING source s " + - "ON t.id == s.c1 " + - "WHEN MATCHED THEN " + - " UPDATE SET t.m.key = 'new_key'", tableName); + sql( + "MERGE INTO %s t USING source s " + + "ON t.id == s.c1 " + + "WHEN MATCHED THEN " + + " UPDATE SET t.m.key = 'new_key'", + tableName); }); } @@ -1513,90 +1820,124 @@ public void testMergeWithConflictingUpdates() { createAndInitTable("id INT, c STRUCT>"); createOrReplaceView("source", "{ \"c1\": -100, \"c2\": -200 }"); - AssertHelpers.assertThrows("Should complain about conflicting updates to a top-level column", - AnalysisException.class, "Updates are in conflict", + AssertHelpers.assertThrows( + "Should complain about conflicting updates to a top-level column", + AnalysisException.class, + "Updates are in conflict", () -> { - sql("MERGE INTO %s t USING source s " + - "ON t.id == s.c1 " + - "WHEN MATCHED THEN " + - " UPDATE SET t.id = 1, t.c.n1 = 2, t.id = 2", tableName); + sql( + "MERGE INTO %s t USING source s " + + "ON t.id == s.c1 " + + "WHEN MATCHED THEN " + + " UPDATE SET t.id = 1, t.c.n1 = 2, t.id = 2", + tableName); }); - AssertHelpers.assertThrows("Should complain about conflicting updates to a nested column", - AnalysisException.class, "Updates are in conflict for these columns", + AssertHelpers.assertThrows( + "Should complain about conflicting updates to a nested column", + AnalysisException.class, + "Updates are in conflict for these columns", () -> { - sql("MERGE INTO %s t USING source s " + - "ON t.id == s.c1 " + - "WHEN MATCHED THEN " + - " UPDATE SET t.c.n1 = 1, t.id = 2, t.c.n1 = 2", tableName); + sql( + "MERGE INTO %s t USING source s " + + "ON t.id == s.c1 " + + "WHEN MATCHED THEN " + + " UPDATE SET t.c.n1 = 1, t.id = 2, t.c.n1 = 2", + tableName); }); - AssertHelpers.assertThrows("Should complain about conflicting updates to a nested column", - AnalysisException.class, "Updates are in conflict", + AssertHelpers.assertThrows( + "Should complain about conflicting updates to a nested column", + AnalysisException.class, + "Updates are in conflict", () -> { - sql("MERGE INTO %s t USING source s " + - "ON t.id == s.c1 " + - "WHEN MATCHED THEN " + - " UPDATE SET c.n1 = 1, c = named_struct('n1', 1, 'n2', named_struct('dn1', 1, 'dn2', 2))", tableName); + sql( + "MERGE INTO %s t USING source s " + + "ON t.id == s.c1 " + + "WHEN MATCHED THEN " + + " UPDATE SET c.n1 = 1, c = named_struct('n1', 1, 'n2', named_struct('dn1', 1, 'dn2', 2))", + tableName); }); } @Test public void testMergeWithInvalidAssignments() { - createAndInitTable("id INT NOT NULL, s STRUCT> NOT NULL"); + createAndInitTable( + "id INT NOT NULL, s STRUCT> NOT NULL"); createOrReplaceView( "source", "c1 INT, c2 STRUCT NOT NULL, c3 STRING NOT NULL, c4 STRUCT", "{ \"c1\": -100, \"c2\": 
{ \"n1\" : 1 }, \"c3\" : 'str', \"c4\": { \"dn2\": 1, \"dn2\": 2 } }"); - for (String policy : new String[]{"ansi", "strict"}) { - withSQLConf(ImmutableMap.of("spark.sql.storeAssignmentPolicy", policy), () -> { - - AssertHelpers.assertThrows("Should complain about writing nulls to a top-level column", - AnalysisException.class, "Cannot write nullable values to non-null column", - () -> { - sql("MERGE INTO %s t USING source s " + - "ON t.id == s.c1 " + - "WHEN MATCHED THEN " + - " UPDATE SET t.id = NULL", tableName); - }); - - AssertHelpers.assertThrows("Should complain about writing nulls to a nested column", - AnalysisException.class, "Cannot write nullable values to non-null column", - () -> { - sql("MERGE INTO %s t USING source s " + - "ON t.id == s.c1 " + - "WHEN MATCHED THEN " + - " UPDATE SET t.s.n1 = NULL", tableName); - }); - - AssertHelpers.assertThrows("Should complain about writing missing fields in structs", - AnalysisException.class, "missing fields", - () -> { - sql("MERGE INTO %s t USING source s " + - "ON t.id == s.c1 " + - "WHEN MATCHED THEN " + - " UPDATE SET t.s = s.c2", tableName); - }); - - AssertHelpers.assertThrows("Should complain about writing invalid data types", - AnalysisException.class, "Cannot safely cast", - () -> { - sql("MERGE INTO %s t USING source s " + - "ON t.id == s.c1 " + - "WHEN MATCHED THEN " + - " UPDATE SET t.s.n1 = s.c3", tableName); - }); - - AssertHelpers.assertThrows("Should complain about writing incompatible structs", - AnalysisException.class, "field name does not match", - () -> { - sql("MERGE INTO %s t USING source s " + - "ON t.id == s.c1 " + - "WHEN MATCHED THEN " + - " UPDATE SET t.s.n2 = s.c4", tableName); - }); - }); + for (String policy : new String[] {"ansi", "strict"}) { + withSQLConf( + ImmutableMap.of("spark.sql.storeAssignmentPolicy", policy), + () -> { + AssertHelpers.assertThrows( + "Should complain about writing nulls to a top-level column", + AnalysisException.class, + "Cannot write nullable values to non-null column", + () -> { + sql( + "MERGE INTO %s t USING source s " + + "ON t.id == s.c1 " + + "WHEN MATCHED THEN " + + " UPDATE SET t.id = NULL", + tableName); + }); + + AssertHelpers.assertThrows( + "Should complain about writing nulls to a nested column", + AnalysisException.class, + "Cannot write nullable values to non-null column", + () -> { + sql( + "MERGE INTO %s t USING source s " + + "ON t.id == s.c1 " + + "WHEN MATCHED THEN " + + " UPDATE SET t.s.n1 = NULL", + tableName); + }); + + AssertHelpers.assertThrows( + "Should complain about writing missing fields in structs", + AnalysisException.class, + "missing fields", + () -> { + sql( + "MERGE INTO %s t USING source s " + + "ON t.id == s.c1 " + + "WHEN MATCHED THEN " + + " UPDATE SET t.s = s.c2", + tableName); + }); + + AssertHelpers.assertThrows( + "Should complain about writing invalid data types", + AnalysisException.class, + "Cannot safely cast", + () -> { + sql( + "MERGE INTO %s t USING source s " + + "ON t.id == s.c1 " + + "WHEN MATCHED THEN " + + " UPDATE SET t.s.n1 = s.c3", + tableName); + }); + + AssertHelpers.assertThrows( + "Should complain about writing incompatible structs", + AnalysisException.class, + "field name does not match", + () -> { + sql( + "MERGE INTO %s t USING source s " + + "ON t.id == s.c1 " + + "WHEN MATCHED THEN " + + " UPDATE SET t.s.n2 = s.c4", + tableName); + }); + }); } } @@ -1605,40 +1946,56 @@ public void testMergeWithNonDeterministicConditions() { createAndInitTable("id INT, c STRUCT>"); createOrReplaceView("source", "{ \"c1\": 
-100, \"c2\": -200 }"); - AssertHelpers.assertThrows("Should complain about non-deterministic search conditions", - AnalysisException.class, "Non-deterministic functions are not supported", + AssertHelpers.assertThrows( + "Should complain about non-deterministic search conditions", + AnalysisException.class, + "Non-deterministic functions are not supported", () -> { - sql("MERGE INTO %s t USING source s " + - "ON t.id == s.c1 AND rand() > t.id " + - "WHEN MATCHED THEN " + - " UPDATE SET t.c.n1 = -1", tableName); + sql( + "MERGE INTO %s t USING source s " + + "ON t.id == s.c1 AND rand() > t.id " + + "WHEN MATCHED THEN " + + " UPDATE SET t.c.n1 = -1", + tableName); }); - AssertHelpers.assertThrows("Should complain about non-deterministic update conditions", - AnalysisException.class, "Non-deterministic functions are not supported", + AssertHelpers.assertThrows( + "Should complain about non-deterministic update conditions", + AnalysisException.class, + "Non-deterministic functions are not supported", () -> { - sql("MERGE INTO %s t USING source s " + - "ON t.id == s.c1 " + - "WHEN MATCHED AND rand() > t.id THEN " + - " UPDATE SET t.c.n1 = -1", tableName); + sql( + "MERGE INTO %s t USING source s " + + "ON t.id == s.c1 " + + "WHEN MATCHED AND rand() > t.id THEN " + + " UPDATE SET t.c.n1 = -1", + tableName); }); - AssertHelpers.assertThrows("Should complain about non-deterministic delete conditions", - AnalysisException.class, "Non-deterministic functions are not supported", + AssertHelpers.assertThrows( + "Should complain about non-deterministic delete conditions", + AnalysisException.class, + "Non-deterministic functions are not supported", () -> { - sql("MERGE INTO %s t USING source s " + - "ON t.id == s.c1 " + - "WHEN MATCHED AND rand() > t.id THEN " + - " DELETE", tableName); + sql( + "MERGE INTO %s t USING source s " + + "ON t.id == s.c1 " + + "WHEN MATCHED AND rand() > t.id THEN " + + " DELETE", + tableName); }); - AssertHelpers.assertThrows("Should complain about non-deterministic insert conditions", - AnalysisException.class, "Non-deterministic functions are not supported", + AssertHelpers.assertThrows( + "Should complain about non-deterministic insert conditions", + AnalysisException.class, + "Non-deterministic functions are not supported", () -> { - sql("MERGE INTO %s t USING source s " + - "ON t.id == s.c1 " + - "WHEN NOT MATCHED AND rand() > c1 THEN " + - " INSERT (id, c) VALUES (1, null)", tableName); + sql( + "MERGE INTO %s t USING source s " + + "ON t.id == s.c1 " + + "WHEN NOT MATCHED AND rand() > c1 THEN " + + " INSERT (id, c) VALUES (1, null)", + tableName); }); } @@ -1647,40 +2004,56 @@ public void testMergeWithAggregateExpressions() { createAndInitTable("id INT, c STRUCT>"); createOrReplaceView("source", "{ \"c1\": -100, \"c2\": -200 }"); - AssertHelpers.assertThrows("Should complain about agg expressions in search conditions", - AnalysisException.class, "Agg functions are not supported", + AssertHelpers.assertThrows( + "Should complain about agg expressions in search conditions", + AnalysisException.class, + "Agg functions are not supported", () -> { - sql("MERGE INTO %s t USING source s " + - "ON t.id == s.c1 AND max(t.id) == 1 " + - "WHEN MATCHED THEN " + - " UPDATE SET t.c.n1 = -1", tableName); + sql( + "MERGE INTO %s t USING source s " + + "ON t.id == s.c1 AND max(t.id) == 1 " + + "WHEN MATCHED THEN " + + " UPDATE SET t.c.n1 = -1", + tableName); }); - AssertHelpers.assertThrows("Should complain about agg expressions in update conditions", - AnalysisException.class, "Agg 
functions are not supported", + AssertHelpers.assertThrows( + "Should complain about agg expressions in update conditions", + AnalysisException.class, + "Agg functions are not supported", () -> { - sql("MERGE INTO %s t USING source s " + - "ON t.id == s.c1 " + - "WHEN MATCHED AND sum(t.id) < 1 THEN " + - " UPDATE SET t.c.n1 = -1", tableName); + sql( + "MERGE INTO %s t USING source s " + + "ON t.id == s.c1 " + + "WHEN MATCHED AND sum(t.id) < 1 THEN " + + " UPDATE SET t.c.n1 = -1", + tableName); }); - AssertHelpers.assertThrows("Should complain about agg expressions in delete conditions", - AnalysisException.class, "Agg functions are not supported", + AssertHelpers.assertThrows( + "Should complain about agg expressions in delete conditions", + AnalysisException.class, + "Agg functions are not supported", () -> { - sql("MERGE INTO %s t USING source s " + - "ON t.id == s.c1 " + - "WHEN MATCHED AND sum(t.id) THEN " + - " DELETE", tableName); + sql( + "MERGE INTO %s t USING source s " + + "ON t.id == s.c1 " + + "WHEN MATCHED AND sum(t.id) THEN " + + " DELETE", + tableName); }); - AssertHelpers.assertThrows("Should complain about agg expressions in insert conditions", - AnalysisException.class, "Agg functions are not supported", + AssertHelpers.assertThrows( + "Should complain about agg expressions in insert conditions", + AnalysisException.class, + "Agg functions are not supported", () -> { - sql("MERGE INTO %s t USING source s " + - "ON t.id == s.c1 " + - "WHEN NOT MATCHED AND sum(c1) < 1 THEN " + - " INSERT (id, c) VALUES (1, null)", tableName); + sql( + "MERGE INTO %s t USING source s " + + "ON t.id == s.c1 " + + "WHEN NOT MATCHED AND sum(c1) < 1 THEN " + + " INSERT (id, c) VALUES (1, null)", + tableName); }); } @@ -1689,40 +2062,56 @@ public void testMergeWithSubqueriesInConditions() { createAndInitTable("id INT, c STRUCT>"); createOrReplaceView("source", "{ \"c1\": -100, \"c2\": -200 }"); - AssertHelpers.assertThrows("Should complain about subquery expressions", - AnalysisException.class, "Subqueries are not supported in conditions", + AssertHelpers.assertThrows( + "Should complain about subquery expressions", + AnalysisException.class, + "Subqueries are not supported in conditions", () -> { - sql("MERGE INTO %s t USING source s " + - "ON t.id == s.c1 AND t.id < (SELECT max(c2) FROM source) " + - "WHEN MATCHED THEN " + - " UPDATE SET t.c.n1 = s.c2", tableName); + sql( + "MERGE INTO %s t USING source s " + + "ON t.id == s.c1 AND t.id < (SELECT max(c2) FROM source) " + + "WHEN MATCHED THEN " + + " UPDATE SET t.c.n1 = s.c2", + tableName); }); - AssertHelpers.assertThrows("Should complain about subquery expressions", - AnalysisException.class, "Subqueries are not supported in conditions", + AssertHelpers.assertThrows( + "Should complain about subquery expressions", + AnalysisException.class, + "Subqueries are not supported in conditions", () -> { - sql("MERGE INTO %s t USING source s " + - "ON t.id == s.c1 " + - "WHEN MATCHED AND t.id < (SELECT max(c2) FROM source) THEN " + - " UPDATE SET t.c.n1 = s.c2", tableName); + sql( + "MERGE INTO %s t USING source s " + + "ON t.id == s.c1 " + + "WHEN MATCHED AND t.id < (SELECT max(c2) FROM source) THEN " + + " UPDATE SET t.c.n1 = s.c2", + tableName); }); - AssertHelpers.assertThrows("Should complain about subquery expressions", - AnalysisException.class, "Subqueries are not supported in conditions", + AssertHelpers.assertThrows( + "Should complain about subquery expressions", + AnalysisException.class, + "Subqueries are not supported in conditions", () 
-> { - sql("MERGE INTO %s t USING source s " + - "ON t.id == s.c1 " + - "WHEN MATCHED AND t.id NOT IN (SELECT c2 FROM source) THEN " + - " DELETE", tableName); + sql( + "MERGE INTO %s t USING source s " + + "ON t.id == s.c1 " + + "WHEN MATCHED AND t.id NOT IN (SELECT c2 FROM source) THEN " + + " DELETE", + tableName); }); - AssertHelpers.assertThrows("Should complain about subquery expressions", - AnalysisException.class, "Subqueries are not supported in conditions", + AssertHelpers.assertThrows( + "Should complain about subquery expressions", + AnalysisException.class, + "Subqueries are not supported in conditions", () -> { - sql("MERGE INTO %s t USING source s " + - "ON t.id == s.c1 " + - "WHEN NOT MATCHED AND s.c1 IN (SELECT c2 FROM source) THEN " + - " INSERT (id, c) VALUES (1, null)", tableName); + sql( + "MERGE INTO %s t USING source s " + + "ON t.id == s.c1 " + + "WHEN NOT MATCHED AND s.c1 IN (SELECT c2 FROM source) THEN " + + " INSERT (id, c) VALUES (1, null)", + tableName); }); } @@ -1731,13 +2120,17 @@ public void testMergeWithTargetColumnsInInsertConditions() { createAndInitTable("id INT, c2 INT"); createOrReplaceView("source", "{ \"id\": 1, \"value\": 11 }"); - AssertHelpers.assertThrows("Should complain about the target column", - AnalysisException.class, "Cannot resolve [c2]", + AssertHelpers.assertThrows( + "Should complain about the target column", + AnalysisException.class, + "Cannot resolve [c2]", () -> { - sql("MERGE INTO %s t USING source s " + - "ON t.id == s.id " + - "WHEN NOT MATCHED AND c2 = 1 THEN " + - " INSERT (id, c2) VALUES (s.id, null)", tableName); + sql( + "MERGE INTO %s t USING source s " + + "ON t.id == s.id " + + "WHEN NOT MATCHED AND c2 = 1 THEN " + + " INSERT (id, c2) VALUES (s.id, null)", + tableName); }); } @@ -1746,19 +2139,22 @@ public void testMergeWithNonIcebergTargetTableNotSupported() { createOrReplaceView("target", "{ \"c1\": -100, \"c2\": -200 }"); createOrReplaceView("source", "{ \"c1\": -100, \"c2\": -200 }"); - AssertHelpers.assertThrows("Should complain non iceberg target table", - UnsupportedOperationException.class, "MERGE INTO TABLE is not supported temporarily.", + AssertHelpers.assertThrows( + "Should complain non iceberg target table", + UnsupportedOperationException.class, + "MERGE INTO TABLE is not supported temporarily.", () -> { - sql("MERGE INTO target t USING source s " + - "ON t.c1 == s.c1 " + - "WHEN MATCHED THEN " + - " UPDATE SET *"); + sql( + "MERGE INTO target t USING source s " + + "ON t.c1 == s.c1 " + + "WHEN MATCHED THEN " + + " UPDATE SET *"); }); } /** - * Tests a merge where both the source and target are evaluated to be partitioned by SingePartition at planning time - * but DynamicFileFilterExec will return an empty target. + * Tests a merge where both the source and target are evaluated to be partitioned by + * SingePartition at planning time but DynamicFileFilterExec will return an empty target. 
*/ @Test public void testMergeSinglePartitionPartitioning() { @@ -1768,19 +2164,14 @@ public void testMergeSinglePartitionPartitioning() { // Coalesce forces our source into a SinglePartition distribution spark.range(0, 5).coalesce(1).createOrReplaceTempView("source"); - sql("MERGE INTO %s t USING source s ON t.id = s.id " + - "WHEN MATCHED THEN UPDATE SET *" + - "WHEN NOT MATCHED THEN INSERT *", + sql( + "MERGE INTO %s t USING source s ON t.id = s.id " + + "WHEN MATCHED THEN UPDATE SET *" + + "WHEN NOT MATCHED THEN INSERT *", tableName); - ImmutableList expectedRows = ImmutableList.of( - row(-1), - row(0), - row(1), - row(2), - row(3), - row(4) - ); + ImmutableList expectedRows = + ImmutableList.of(row(-1), row(0), row(1), row(2), row(3), row(4)); List result = sql("SELECT * FROM %s ORDER BY id", tableName); assertEquals("Should correctly add the non-matching rows", expectedRows, result); @@ -1794,18 +2185,13 @@ public void testMergeEmptyTable() { // Coalesce forces our source into a SinglePartition distribution spark.range(0, 5).coalesce(1).createOrReplaceTempView("source"); - sql("MERGE INTO %s t USING source s ON t.id = s.id " + - "WHEN MATCHED THEN UPDATE SET *" + - "WHEN NOT MATCHED THEN INSERT *", + sql( + "MERGE INTO %s t USING source s ON t.id = s.id " + + "WHEN MATCHED THEN UPDATE SET *" + + "WHEN NOT MATCHED THEN INSERT *", tableName); - ImmutableList expectedRows = ImmutableList.of( - row(0), - row(1), - row(2), - row(3), - row(4) - ); + ImmutableList expectedRows = ImmutableList.of(row(0), row(1), row(2), row(3), row(4)); List result = sql("SELECT * FROM %s ORDER BY id", tableName); assertEquals("Should correctly add the non-matching rows", expectedRows, result); diff --git a/spark/v3.2/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestMergeOnReadDelete.java b/spark/v3.2/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestMergeOnReadDelete.java index d5454e036017..048040408fac 100644 --- a/spark/v3.2/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestMergeOnReadDelete.java +++ b/spark/v3.2/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestMergeOnReadDelete.java @@ -16,9 +16,12 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.spark.extensions; +import static org.mockito.Mockito.doAnswer; +import static org.mockito.Mockito.spy; +import static org.mockito.Mockito.when; + import java.util.Map; import org.apache.iceberg.AssertHelpers; import org.apache.iceberg.RowDelta; @@ -34,14 +37,15 @@ import org.junit.Test; import org.junit.runners.Parameterized; -import static org.mockito.Mockito.doAnswer; -import static org.mockito.Mockito.spy; -import static org.mockito.Mockito.when; - public class TestMergeOnReadDelete extends TestDelete { - public TestMergeOnReadDelete(String catalogName, String implementation, Map config, - String fileFormat, Boolean vectorized, String distributionMode) { + public TestMergeOnReadDelete( + String catalogName, + String implementation, + Map config, + String fileFormat, + Boolean vectorized, + String distributionMode) { super(catalogName, implementation, config, fileFormat, vectorized, distributionMode); } @@ -49,8 +53,7 @@ public TestMergeOnReadDelete(String catalogName, String implementation, Map extraTableProperties() { return ImmutableMap.of( TableProperties.FORMAT_VERSION, "2", - TableProperties.DELETE_MODE, "merge-on-read" - ); + TableProperties.DELETE_MODE, "merge-on-read"); } @Parameterized.AfterParam @@ -64,33 +67,42 @@ public void testCommitUnknownException() { // write unpartitioned files append(tableName, "{ \"id\": 1, \"dep\": \"hr\", \"category\": \"c1\"}"); - append(tableName, "{ \"id\": 2, \"dep\": \"hr\", \"category\": \"c1\" }\n" + - "{ \"id\": 3, \"dep\": \"hr\", \"category\": \"c1\" }"); + append( + tableName, + "{ \"id\": 2, \"dep\": \"hr\", \"category\": \"c1\" }\n" + + "{ \"id\": 3, \"dep\": \"hr\", \"category\": \"c1\" }"); Table table = validationCatalog.loadTable(tableIdent); RowDelta newRowDelta = table.newRowDelta(); RowDelta spyNewRowDelta = spy(newRowDelta); - doAnswer(invocation -> { - newRowDelta.commit(); - throw new CommitStateUnknownException(new RuntimeException("Datacenter on Fire")); - }).when(spyNewRowDelta).commit(); + doAnswer( + invocation -> { + newRowDelta.commit(); + throw new CommitStateUnknownException(new RuntimeException("Datacenter on Fire")); + }) + .when(spyNewRowDelta) + .commit(); Table spyTable = spy(table); when(spyTable.newRowDelta()).thenReturn(spyNewRowDelta); SparkTable sparkTable = new SparkTable(spyTable, false); - ImmutableMap config = ImmutableMap.of( - "type", "hive", - "default-namespace", "default" - ); - spark.conf().set("spark.sql.catalog.dummy_catalog", "org.apache.iceberg.spark.source.TestSparkCatalog"); - config.forEach((key, value) -> spark.conf().set("spark.sql.catalog.dummy_catalog." + key, value)); - Identifier ident = Identifier.of(new String[]{"default"}, "table"); + ImmutableMap config = + ImmutableMap.of( + "type", "hive", + "default-namespace", "default"); + spark + .conf() + .set("spark.sql.catalog.dummy_catalog", "org.apache.iceberg.spark.source.TestSparkCatalog"); + config.forEach( + (key, value) -> spark.conf().set("spark.sql.catalog.dummy_catalog." 
+ key, value)); + Identifier ident = Identifier.of(new String[] {"default"}, "table"); TestSparkCatalog.setTable(ident, sparkTable); // Although an exception is thrown here, write and commit have succeeded - AssertHelpers.assertThrowsWithCause("Should throw a Commit State Unknown Exception", + AssertHelpers.assertThrowsWithCause( + "Should throw a Commit State Unknown Exception", SparkException.class, "Writing job aborted", CommitStateUnknownException.class, @@ -98,7 +110,8 @@ public void testCommitUnknownException() { () -> sql("DELETE FROM %s WHERE id = 2", "dummy_catalog.default.table")); // Since write and commit succeeded, the rows should be readable - assertEquals("Should have expected rows", + assertEquals( + "Should have expected rows", ImmutableList.of(row(1, "hr", "c1"), row(3, "hr", "c1")), sql("SELECT * FROM %s ORDER BY id", "dummy_catalog.default.table")); } diff --git a/spark/v3.2/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestMergeOnReadMerge.java b/spark/v3.2/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestMergeOnReadMerge.java index d32f3464c3d0..95e77d441ffb 100644 --- a/spark/v3.2/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestMergeOnReadMerge.java +++ b/spark/v3.2/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestMergeOnReadMerge.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.extensions; import java.util.Map; @@ -25,8 +24,13 @@ public class TestMergeOnReadMerge extends TestMerge { - public TestMergeOnReadMerge(String catalogName, String implementation, Map config, - String fileFormat, boolean vectorized, String distributionMode) { + public TestMergeOnReadMerge( + String catalogName, + String implementation, + Map config, + String fileFormat, + boolean vectorized, + String distributionMode) { super(catalogName, implementation, config, fileFormat, vectorized, distributionMode); } @@ -34,7 +38,6 @@ public TestMergeOnReadMerge(String catalogName, String implementation, Map extraTableProperties() { return ImmutableMap.of( TableProperties.FORMAT_VERSION, "2", - TableProperties.MERGE_MODE, "merge-on-read" - ); + TableProperties.MERGE_MODE, "merge-on-read"); } } diff --git a/spark/v3.2/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestMergeOnReadUpdate.java b/spark/v3.2/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestMergeOnReadUpdate.java index 97c34f155894..9ac6d51b72b1 100644 --- a/spark/v3.2/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestMergeOnReadUpdate.java +++ b/spark/v3.2/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestMergeOnReadUpdate.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.spark.extensions; import java.util.Map; @@ -25,8 +24,13 @@ public class TestMergeOnReadUpdate extends TestUpdate { - public TestMergeOnReadUpdate(String catalogName, String implementation, Map config, - String fileFormat, boolean vectorized, String distributionMode) { + public TestMergeOnReadUpdate( + String catalogName, + String implementation, + Map config, + String fileFormat, + boolean vectorized, + String distributionMode) { super(catalogName, implementation, config, fileFormat, vectorized, distributionMode); } @@ -34,7 +38,6 @@ public TestMergeOnReadUpdate(String catalogName, String implementation, Map extraTableProperties() { return ImmutableMap.of( TableProperties.FORMAT_VERSION, "2", - TableProperties.UPDATE_MODE, "merge-on-read" - ); + TableProperties.UPDATE_MODE, "merge-on-read"); } } diff --git a/spark/v3.2/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestMetadataTables.java b/spark/v3.2/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestMetadataTables.java index c33cbfcce0fc..973cf4844a18 100644 --- a/spark/v3.2/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestMetadataTables.java +++ b/spark/v3.2/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestMetadataTables.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.extensions; import java.io.IOException; @@ -48,7 +47,6 @@ import org.junit.Assert; import org.junit.Test; - public class TestMetadataTables extends SparkExtensionsTestBase { public TestMetadataTables(String catalogName, String implementation, Map config) { @@ -62,16 +60,19 @@ public void removeTables() { @Test public void testUnpartitionedTable() throws Exception { - sql("CREATE TABLE %s (id bigint, data string) USING iceberg TBLPROPERTIES" + - "('format-version'='2', 'write.delete.mode'='merge-on-read')", tableName); - - List records = Lists.newArrayList( - new SimpleRecord(1, "a"), - new SimpleRecord(2, "b"), - new SimpleRecord(3, "c"), - new SimpleRecord(4, "d") - ); - spark.createDataset(records, Encoders.bean(SimpleRecord.class)) + sql( + "CREATE TABLE %s (id bigint, data string) USING iceberg TBLPROPERTIES" + + "('format-version'='2', 'write.delete.mode'='merge-on-read')", + tableName); + + List records = + Lists.newArrayList( + new SimpleRecord(1, "a"), + new SimpleRecord(2, "b"), + new SimpleRecord(3, "c"), + new SimpleRecord(4, "d")); + spark + .createDataset(records, Encoders.bean(SimpleRecord.class)) .coalesce(1) .writeTo(tableName) .append(); @@ -88,56 +89,66 @@ public void testUnpartitionedTable() throws Exception { Schema filesTableSchema = Spark3Util.loadIcebergTable(spark, tableName + ".files").schema(); // check delete files table - List actualDeleteFiles = spark.sql("SELECT * FROM " + tableName + ".delete_files").collectAsList(); - Assert.assertEquals("Metadata table should return one delete file", 1, actualDeleteFiles.size()); - - List expectedDeleteFiles = expectedEntries(table, FileContent.POSITION_DELETES, - entriesTableSchema, expectedDeleteManifests, null); + List actualDeleteFiles = + spark.sql("SELECT * FROM " + tableName + ".delete_files").collectAsList(); + Assert.assertEquals( + "Metadata table should return one delete file", 1, actualDeleteFiles.size()); + + List expectedDeleteFiles = + expectedEntries( + table, FileContent.POSITION_DELETES, entriesTableSchema, expectedDeleteManifests, null); Assert.assertEquals("Should be one delete file 
manifest entry", 1, expectedDeleteFiles.size()); - TestHelpers.assertEqualsSafe(filesTableSchema.asStruct(), expectedDeleteFiles.get(0), actualDeleteFiles.get(0)); + TestHelpers.assertEqualsSafe( + filesTableSchema.asStruct(), expectedDeleteFiles.get(0), actualDeleteFiles.get(0)); // check data files table - List actualDataFiles = spark.sql("SELECT * FROM " + tableName + ".data_files").collectAsList(); + List actualDataFiles = + spark.sql("SELECT * FROM " + tableName + ".data_files").collectAsList(); Assert.assertEquals("Metadata table should return one data file", 1, actualDataFiles.size()); - List expectedDataFiles = expectedEntries(table, FileContent.DATA, - entriesTableSchema, expectedDataManifests, null); + List expectedDataFiles = + expectedEntries(table, FileContent.DATA, entriesTableSchema, expectedDataManifests, null); Assert.assertEquals("Should be one data file manifest entry", 1, expectedDataFiles.size()); - TestHelpers.assertEqualsSafe(filesTableSchema.asStruct(), expectedDataFiles.get(0), actualDataFiles.get(0)); + TestHelpers.assertEqualsSafe( + filesTableSchema.asStruct(), expectedDataFiles.get(0), actualDataFiles.get(0)); // check all files table - List actualFiles = spark.sql("SELECT * FROM " + tableName + ".files ORDER BY content").collectAsList(); + List actualFiles = + spark.sql("SELECT * FROM " + tableName + ".files ORDER BY content").collectAsList(); Assert.assertEquals("Metadata table should return two files", 2, actualFiles.size()); - List expectedFiles = Stream.concat(expectedDataFiles.stream(), expectedDeleteFiles.stream()) - .collect(Collectors.toList()); + List expectedFiles = + Stream.concat(expectedDataFiles.stream(), expectedDeleteFiles.stream()) + .collect(Collectors.toList()); Assert.assertEquals("Should have two files manifest entries", 2, expectedFiles.size()); - TestHelpers.assertEqualsSafe(filesTableSchema.asStruct(), expectedFiles.get(0), actualFiles.get(0)); - TestHelpers.assertEqualsSafe(filesTableSchema.asStruct(), expectedFiles.get(1), actualFiles.get(1)); + TestHelpers.assertEqualsSafe( + filesTableSchema.asStruct(), expectedFiles.get(0), actualFiles.get(0)); + TestHelpers.assertEqualsSafe( + filesTableSchema.asStruct(), expectedFiles.get(1), actualFiles.get(1)); } @Test public void testPartitionedTable() throws Exception { - sql("CREATE TABLE %s (id bigint, data string) " + - "USING iceberg " + - "PARTITIONED BY (data) " + - "TBLPROPERTIES" + - "('format-version'='2', 'write.delete.mode'='merge-on-read')", tableName); - - List recordsA = Lists.newArrayList( - new SimpleRecord(1, "a"), - new SimpleRecord(2, "a") - ); - spark.createDataset(recordsA, Encoders.bean(SimpleRecord.class)) + sql( + "CREATE TABLE %s (id bigint, data string) " + + "USING iceberg " + + "PARTITIONED BY (data) " + + "TBLPROPERTIES" + + "('format-version'='2', 'write.delete.mode'='merge-on-read')", + tableName); + + List recordsA = + Lists.newArrayList(new SimpleRecord(1, "a"), new SimpleRecord(2, "a")); + spark + .createDataset(recordsA, Encoders.bean(SimpleRecord.class)) .coalesce(1) .writeTo(tableName) .append(); - List recordsB = Lists.newArrayList( - new SimpleRecord(1, "b"), - new SimpleRecord(2, "b") - ); - spark.createDataset(recordsB, Encoders.bean(SimpleRecord.class)) + List recordsB = + Lists.newArrayList(new SimpleRecord(1, "b"), new SimpleRecord(2, "b")); + spark + .createDataset(recordsB, Encoders.bean(SimpleRecord.class)) .coalesce(1) .writeTo(tableName) .append(); @@ -153,59 +164,84 @@ public void testPartitionedTable() throws Exception { 
Assert.assertEquals("Should have 2 data manifests", 2, expectedDataManifests.size()); Assert.assertEquals("Should have 2 delete manifests", 2, expectedDeleteManifests.size()); - Schema filesTableSchema = Spark3Util.loadIcebergTable(spark, tableName + ".delete_files").schema(); + Schema filesTableSchema = + Spark3Util.loadIcebergTable(spark, tableName + ".delete_files").schema(); // Check delete files table - List expectedDeleteFiles = expectedEntries(table, FileContent.POSITION_DELETES, - entriesTableSchema, expectedDeleteManifests, "a"); - Assert.assertEquals("Should have one delete file manifest entry", 1, expectedDeleteFiles.size()); - - List actualDeleteFiles = spark.sql("SELECT * FROM " + tableName + ".delete_files " + - "WHERE partition.data='a'").collectAsList(); - Assert.assertEquals("Metadata table should return one delete file", 1, actualDeleteFiles.size()); - TestHelpers.assertEqualsSafe(filesTableSchema.asStruct(), expectedDeleteFiles.get(0), actualDeleteFiles.get(0)); + List expectedDeleteFiles = + expectedEntries( + table, FileContent.POSITION_DELETES, entriesTableSchema, expectedDeleteManifests, "a"); + Assert.assertEquals( + "Should have one delete file manifest entry", 1, expectedDeleteFiles.size()); + + List actualDeleteFiles = + spark + .sql("SELECT * FROM " + tableName + ".delete_files " + "WHERE partition.data='a'") + .collectAsList(); + Assert.assertEquals( + "Metadata table should return one delete file", 1, actualDeleteFiles.size()); + TestHelpers.assertEqualsSafe( + filesTableSchema.asStruct(), expectedDeleteFiles.get(0), actualDeleteFiles.get(0)); // Check data files table - List expectedDataFiles = expectedEntries(table, FileContent.DATA, - entriesTableSchema, expectedDataManifests, "a"); + List expectedDataFiles = + expectedEntries(table, FileContent.DATA, entriesTableSchema, expectedDataManifests, "a"); Assert.assertEquals("Should have one data file manifest entry", 1, expectedDataFiles.size()); - List actualDataFiles = spark.sql("SELECT * FROM " + tableName + ".data_files " + - "WHERE partition.data='a'").collectAsList(); + List actualDataFiles = + spark + .sql("SELECT * FROM " + tableName + ".data_files " + "WHERE partition.data='a'") + .collectAsList(); Assert.assertEquals("Metadata table should return one data file", 1, actualDataFiles.size()); - TestHelpers.assertEqualsSafe(filesTableSchema.asStruct(), expectedDataFiles.get(0), actualDataFiles.get(0)); + TestHelpers.assertEqualsSafe( + filesTableSchema.asStruct(), expectedDataFiles.get(0), actualDataFiles.get(0)); List actualPartitionsWithProjection = spark.sql("SELECT file_count FROM " + tableName + ".partitions ").collectAsList(); - Assert.assertEquals("Metadata table should return two partitions record", 2, actualPartitionsWithProjection.size()); + Assert.assertEquals( + "Metadata table should return two partitions record", + 2, + actualPartitionsWithProjection.size()); for (int i = 0; i < 2; ++i) { Assert.assertEquals(1, actualPartitionsWithProjection.get(i).get(0)); } // Check files table - List expectedFiles = Stream.concat(expectedDataFiles.stream(), expectedDeleteFiles.stream()) - .collect(Collectors.toList()); + List expectedFiles = + Stream.concat(expectedDataFiles.stream(), expectedDeleteFiles.stream()) + .collect(Collectors.toList()); Assert.assertEquals("Should have two file manifest entries", 2, expectedFiles.size()); - List actualFiles = spark.sql("SELECT * FROM " + tableName + ".files " + - "WHERE partition.data='a' ORDER BY content").collectAsList(); + List actualFiles = + spark + .sql( + 
"SELECT * FROM " + + tableName + + ".files " + + "WHERE partition.data='a' ORDER BY content") + .collectAsList(); Assert.assertEquals("Metadata table should return two files", 2, actualFiles.size()); - TestHelpers.assertEqualsSafe(filesTableSchema.asStruct(), expectedFiles.get(0), actualFiles.get(0)); - TestHelpers.assertEqualsSafe(filesTableSchema.asStruct(), expectedFiles.get(1), actualFiles.get(1)); + TestHelpers.assertEqualsSafe( + filesTableSchema.asStruct(), expectedFiles.get(0), actualFiles.get(0)); + TestHelpers.assertEqualsSafe( + filesTableSchema.asStruct(), expectedFiles.get(1), actualFiles.get(1)); } @Test public void testAllFilesUnpartitioned() throws Exception { - sql("CREATE TABLE %s (id bigint, data string) USING iceberg TBLPROPERTIES" + - "('format-version'='2', 'write.delete.mode'='merge-on-read')", tableName); - - List records = Lists.newArrayList( - new SimpleRecord(1, "a"), - new SimpleRecord(2, "b"), - new SimpleRecord(3, "c"), - new SimpleRecord(4, "d") - ); - spark.createDataset(records, Encoders.bean(SimpleRecord.class)) + sql( + "CREATE TABLE %s (id bigint, data string) USING iceberg TBLPROPERTIES" + + "('format-version'='2', 'write.delete.mode'='merge-on-read')", + tableName); + + List records = + Lists.newArrayList( + new SimpleRecord(1, "a"), + new SimpleRecord(2, "b"), + new SimpleRecord(3, "c"), + new SimpleRecord(4, "d")); + spark + .createDataset(records, Encoders.bean(SimpleRecord.class)) .coalesce(1) .writeTo(tableName) .append(); @@ -224,28 +260,35 @@ public void testAllFilesUnpartitioned() throws Exception { Assert.assertEquals("Table should be cleared", 0, results.size()); Schema entriesTableSchema = Spark3Util.loadIcebergTable(spark, tableName + ".entries").schema(); - Schema filesTableSchema = Spark3Util.loadIcebergTable(spark, tableName + ".all_data_files").schema(); + Schema filesTableSchema = + Spark3Util.loadIcebergTable(spark, tableName + ".all_data_files").schema(); // Check all data files table - List actualDataFiles = spark.sql("SELECT * FROM " + tableName + ".all_data_files").collectAsList(); + List actualDataFiles = + spark.sql("SELECT * FROM " + tableName + ".all_data_files").collectAsList(); - List expectedDataFiles = expectedEntries(table, FileContent.DATA, - entriesTableSchema, expectedDataManifests, null); + List expectedDataFiles = + expectedEntries(table, FileContent.DATA, entriesTableSchema, expectedDataManifests, null); Assert.assertEquals("Should be one data file manifest entry", 1, expectedDataFiles.size()); Assert.assertEquals("Metadata table should return one data file", 1, actualDataFiles.size()); - TestHelpers.assertEqualsSafe(filesTableSchema.asStruct(), expectedDataFiles.get(0), actualDataFiles.get(0)); + TestHelpers.assertEqualsSafe( + filesTableSchema.asStruct(), expectedDataFiles.get(0), actualDataFiles.get(0)); // Check all delete files table - List actualDeleteFiles = spark.sql("SELECT * FROM " + tableName + ".all_delete_files").collectAsList(); - List expectedDeleteFiles = expectedEntries(table, FileContent.POSITION_DELETES, - entriesTableSchema, expectedDeleteManifests, null); + List actualDeleteFiles = + spark.sql("SELECT * FROM " + tableName + ".all_delete_files").collectAsList(); + List expectedDeleteFiles = + expectedEntries( + table, FileContent.POSITION_DELETES, entriesTableSchema, expectedDeleteManifests, null); Assert.assertEquals("Should be one delete file manifest entry", 1, expectedDeleteFiles.size()); - Assert.assertEquals("Metadata table should return one delete file", 1, actualDeleteFiles.size()); - 
TestHelpers.assertEqualsSafe(filesTableSchema.asStruct(), expectedDeleteFiles.get(0), actualDeleteFiles.get(0)); + Assert.assertEquals( + "Metadata table should return one delete file", 1, actualDeleteFiles.size()); + TestHelpers.assertEqualsSafe( + filesTableSchema.asStruct(), expectedDeleteFiles.get(0), actualDeleteFiles.get(0)); // Check all files table - List actualFiles = spark.sql("SELECT * FROM " + tableName + ".all_files ORDER BY content") - .collectAsList(); + List actualFiles = + spark.sql("SELECT * FROM " + tableName + ".all_files ORDER BY content").collectAsList(); List expectedFiles = ListUtils.union(expectedDataFiles, expectedDeleteFiles); expectedFiles.sort(Comparator.comparing(r -> ((Integer) r.get("content")))); Assert.assertEquals("Metadata table should return two files", 2, actualFiles.size()); @@ -255,26 +298,26 @@ public void testAllFilesUnpartitioned() throws Exception { @Test public void testAllFilesPartitioned() throws Exception { // Create table and insert data - sql("CREATE TABLE %s (id bigint, data string) " + - "USING iceberg " + - "PARTITIONED BY (data) " + - "TBLPROPERTIES" + - "('format-version'='2', 'write.delete.mode'='merge-on-read')", tableName); - - List recordsA = Lists.newArrayList( - new SimpleRecord(1, "a"), - new SimpleRecord(2, "a") - ); - spark.createDataset(recordsA, Encoders.bean(SimpleRecord.class)) + sql( + "CREATE TABLE %s (id bigint, data string) " + + "USING iceberg " + + "PARTITIONED BY (data) " + + "TBLPROPERTIES" + + "('format-version'='2', 'write.delete.mode'='merge-on-read')", + tableName); + + List recordsA = + Lists.newArrayList(new SimpleRecord(1, "a"), new SimpleRecord(2, "a")); + spark + .createDataset(recordsA, Encoders.bean(SimpleRecord.class)) .coalesce(1) .writeTo(tableName) .append(); - List recordsB = Lists.newArrayList( - new SimpleRecord(1, "b"), - new SimpleRecord(2, "b") - ); - spark.createDataset(recordsB, Encoders.bean(SimpleRecord.class)) + List recordsB = + Lists.newArrayList(new SimpleRecord(1, "b"), new SimpleRecord(2, "b")); + spark + .createDataset(recordsB, Encoders.bean(SimpleRecord.class)) .coalesce(1) .writeTo(tableName) .append(); @@ -293,30 +336,44 @@ public void testAllFilesPartitioned() throws Exception { Assert.assertEquals("Table should be cleared", 0, results.size()); Schema entriesTableSchema = Spark3Util.loadIcebergTable(spark, tableName + ".entries").schema(); - Schema filesTableSchema = Spark3Util.loadIcebergTable(spark, tableName + ".all_data_files").schema(); + Schema filesTableSchema = + Spark3Util.loadIcebergTable(spark, tableName + ".all_data_files").schema(); // Check all data files table - List actualDataFiles = spark.sql("SELECT * FROM " + tableName + ".all_data_files " + - "WHERE partition.data='a'").collectAsList(); - List expectedDataFiles = expectedEntries(table, FileContent.DATA, - entriesTableSchema, expectedDataManifests, "a"); + List actualDataFiles = + spark + .sql("SELECT * FROM " + tableName + ".all_data_files " + "WHERE partition.data='a'") + .collectAsList(); + List expectedDataFiles = + expectedEntries(table, FileContent.DATA, entriesTableSchema, expectedDataManifests, "a"); Assert.assertEquals("Should be one data file manifest entry", 1, expectedDataFiles.size()); Assert.assertEquals("Metadata table should return one data file", 1, actualDataFiles.size()); - TestHelpers.assertEqualsSafe(filesTableSchema.asStruct(), expectedDataFiles.get(0), actualDataFiles.get(0)); + TestHelpers.assertEqualsSafe( + filesTableSchema.asStruct(), expectedDataFiles.get(0), 
actualDataFiles.get(0)); // Check all delete files table - List actualDeleteFiles = spark.sql("SELECT * FROM " + tableName + ".all_delete_files " + - "WHERE partition.data='a'").collectAsList(); - List expectedDeleteFiles = expectedEntries(table, FileContent.POSITION_DELETES, - entriesTableSchema, expectedDeleteManifests, "a"); + List actualDeleteFiles = + spark + .sql("SELECT * FROM " + tableName + ".all_delete_files " + "WHERE partition.data='a'") + .collectAsList(); + List expectedDeleteFiles = + expectedEntries( + table, FileContent.POSITION_DELETES, entriesTableSchema, expectedDeleteManifests, "a"); Assert.assertEquals("Should be one data file manifest entry", 1, expectedDeleteFiles.size()); Assert.assertEquals("Metadata table should return one data file", 1, actualDeleteFiles.size()); - TestHelpers.assertEqualsSafe(filesTableSchema.asStruct(), expectedDeleteFiles.get(0), actualDeleteFiles.get(0)); + TestHelpers.assertEqualsSafe( + filesTableSchema.asStruct(), expectedDeleteFiles.get(0), actualDeleteFiles.get(0)); // Check all files table - List actualFiles = spark.sql("SELECT * FROM " + tableName + ".all_files WHERE partition.data='a' " + - "ORDER BY content").collectAsList(); + List actualFiles = + spark + .sql( + "SELECT * FROM " + + tableName + + ".all_files WHERE partition.data='a' " + + "ORDER BY content") + .collectAsList(); List expectedFiles = ListUtils.union(expectedDataFiles, expectedDeleteFiles); expectedFiles.sort(Comparator.comparing(r -> ((Integer) r.get("content")))); Assert.assertEquals("Metadata table should return two files", 2, actualFiles.size()); @@ -326,96 +383,103 @@ public void testAllFilesPartitioned() throws Exception { @Test public void testMetadataLogs() throws Exception { // Create table and insert data - sql("CREATE TABLE %s (id bigint, data string) " + - "USING iceberg " + - "PARTITIONED BY (data) " + - "TBLPROPERTIES " + - "('format-version'='2')", tableName); - - List recordsA = Lists.newArrayList( - new SimpleRecord(1, "a"), - new SimpleRecord(2, "a") - ); - spark.createDataset(recordsA, Encoders.bean(SimpleRecord.class)) - .writeTo(tableName) - .append(); - - List recordsB = Lists.newArrayList( - new SimpleRecord(1, "b"), - new SimpleRecord(2, "b") - ); - spark.createDataset(recordsB, Encoders.bean(SimpleRecord.class)) - .writeTo(tableName) - .append(); + sql( + "CREATE TABLE %s (id bigint, data string) " + + "USING iceberg " + + "PARTITIONED BY (data) " + + "TBLPROPERTIES " + + "('format-version'='2')", + tableName); + + List recordsA = + Lists.newArrayList(new SimpleRecord(1, "a"), new SimpleRecord(2, "a")); + spark.createDataset(recordsA, Encoders.bean(SimpleRecord.class)).writeTo(tableName).append(); + + List recordsB = + Lists.newArrayList(new SimpleRecord(1, "b"), new SimpleRecord(2, "b")); + spark.createDataset(recordsB, Encoders.bean(SimpleRecord.class)).writeTo(tableName).append(); Table table = Spark3Util.loadIcebergTable(spark, tableName); Long currentSnapshotId = table.currentSnapshot().snapshotId(); TableMetadata tableMetadata = ((HasTableOperations) table).operations().current(); Snapshot currentSnapshot = tableMetadata.currentSnapshot(); Snapshot parentSnapshot = table.snapshot(currentSnapshot.parentId()); - List metadataLogEntries = Lists.newArrayList(tableMetadata.previousFiles()); + List metadataLogEntries = + Lists.newArrayList(tableMetadata.previousFiles()); // Check metadataLog table List metadataLogs = sql("SELECT * FROM %s.metadata_logs", tableName); - assertEquals("MetadataLogsTable result should match the metadataLog 
entries", + assertEquals( + "MetadataLogsTable result should match the metadataLog entries", ImmutableList.of( row( metadataLogEntries.get(0).timestampMillis(), metadataLogEntries.get(0).file(), null, null, - null - ), + null), row( metadataLogEntries.get(1).timestampMillis(), metadataLogEntries.get(1).file(), parentSnapshot.snapshotId(), parentSnapshot.schemaId(), - parentSnapshot.sequenceNumber() - ), + parentSnapshot.sequenceNumber()), row( currentSnapshot.timestampMillis(), tableMetadata.metadataFileLocation(), currentSnapshot.snapshotId(), currentSnapshot.schemaId(), - currentSnapshot.sequenceNumber() - )), + currentSnapshot.sequenceNumber())), metadataLogs); // test filtering List metadataLogWithFilters = - sql("SELECT * FROM %s.metadata_logs WHERE latest_snapshot_id = %s", tableName, currentSnapshotId); + sql( + "SELECT * FROM %s.metadata_logs WHERE latest_snapshot_id = %s", + tableName, currentSnapshotId); Assert.assertEquals("metadataLog table should return 1 row", 1, metadataLogWithFilters.size()); - assertEquals("Result should match the latest snapshot entry", - ImmutableList.of(row( - tableMetadata.currentSnapshot().timestampMillis(), - tableMetadata.metadataFileLocation(), - tableMetadata.currentSnapshot().snapshotId(), - tableMetadata.currentSnapshot().schemaId(), - tableMetadata.currentSnapshot().sequenceNumber())), + assertEquals( + "Result should match the latest snapshot entry", + ImmutableList.of( + row( + tableMetadata.currentSnapshot().timestampMillis(), + tableMetadata.metadataFileLocation(), + tableMetadata.currentSnapshot().snapshotId(), + tableMetadata.currentSnapshot().schemaId(), + tableMetadata.currentSnapshot().sequenceNumber())), metadataLogWithFilters); // test projection List metadataFiles = - metadataLogEntries.stream().map(TableMetadata.MetadataLogEntry::file).collect(Collectors.toList()); + metadataLogEntries.stream() + .map(TableMetadata.MetadataLogEntry::file) + .collect(Collectors.toList()); metadataFiles.add(tableMetadata.metadataFileLocation()); List metadataLogWithProjection = sql("SELECT file FROM %s.metadata_logs", tableName); - Assert.assertEquals("metadataLog table should return 3 rows", 3, metadataLogWithProjection.size()); - assertEquals("metadataLog entry should be of same file", + Assert.assertEquals( + "metadataLog table should return 3 rows", 3, metadataLogWithProjection.size()); + assertEquals( + "metadataLog entry should be of same file", metadataFiles.stream().map(this::row).collect(Collectors.toList()), metadataLogWithProjection); } /** * Find matching manifest entries of an Iceberg table + * * @param table iceberg table * @param expectedContent file content to populate on entries * @param entriesTableSchema schema of Manifest entries * @param manifestsToExplore manifests to explore of the table * @param partValue partition value that manifest entries must match, or null to skip filtering */ - private List expectedEntries(Table table, FileContent expectedContent, Schema entriesTableSchema, - List manifestsToExplore, String partValue) throws IOException { + private List expectedEntries( + Table table, + FileContent expectedContent, + Schema entriesTableSchema, + List manifestsToExplore, + String partValue) + throws IOException { List expected = Lists.newArrayList(); for (ManifestFile manifest : manifestsToExplore) { InputFile in = table.io().newInputFile(manifest.path()); diff --git a/spark/v3.2/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestMigrateTableProcedure.java 
b/spark/v3.2/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestMigrateTableProcedure.java index d66e75add16f..f9c150a3b1dc 100644 --- a/spark/v3.2/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestMigrateTableProcedure.java +++ b/spark/v3.2/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestMigrateTableProcedure.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.extensions; import java.io.IOException; @@ -35,12 +34,12 @@ public class TestMigrateTableProcedure extends SparkExtensionsTestBase { - public TestMigrateTableProcedure(String catalogName, String implementation, Map config) { + public TestMigrateTableProcedure( + String catalogName, String implementation, Map config) { super(catalogName, implementation, config); } - @Rule - public TemporaryFolder temp = new TemporaryFolder(); + @Rule public TemporaryFolder temp = new TemporaryFolder(); @After public void removeTables() { @@ -52,7 +51,9 @@ public void removeTables() { public void testMigrate() throws IOException { Assume.assumeTrue(catalogName.equals("spark_catalog")); String location = temp.newFolder().toString(); - sql("CREATE TABLE %s (id bigint NOT NULL, data string) USING parquet LOCATION '%s'", tableName, location); + sql( + "CREATE TABLE %s (id bigint NOT NULL, data string) USING parquet LOCATION '%s'", + tableName, location); sql("INSERT INTO TABLE %s VALUES (1, 'a')", tableName); Object result = scalarSql("CALL %s.system.migrate('%s')", catalogName, tableName); @@ -65,7 +66,8 @@ public void testMigrate() throws IOException { sql("INSERT INTO TABLE %s VALUES (1, 'a')", tableName); - assertEquals("Should have expected rows", + assertEquals( + "Should have expected rows", ImmutableList.of(row(1L, "a"), row(1L, "a")), sql("SELECT * FROM %s ORDER BY id", tableName)); @@ -76,10 +78,13 @@ public void testMigrate() throws IOException { public void testMigrateWithOptions() throws IOException { Assume.assumeTrue(catalogName.equals("spark_catalog")); String location = temp.newFolder().toString(); - sql("CREATE TABLE %s (id bigint NOT NULL, data string) USING parquet LOCATION '%s'", tableName, location); + sql( + "CREATE TABLE %s (id bigint NOT NULL, data string) USING parquet LOCATION '%s'", + tableName, location); sql("INSERT INTO TABLE %s VALUES (1, 'a')", tableName); - Object result = scalarSql("CALL %s.system.migrate('%s', map('foo', 'bar'))", catalogName, tableName); + Object result = + scalarSql("CALL %s.system.migrate('%s', map('foo', 'bar'))", catalogName, tableName); Assert.assertEquals("Should have added one file", 1L, result); @@ -93,7 +98,8 @@ public void testMigrateWithOptions() throws IOException { sql("INSERT INTO TABLE %s VALUES (1, 'a')", tableName); - assertEquals("Should have expected rows", + assertEquals( + "Should have expected rows", ImmutableList.of(row(1L, "a"), row(1L, "a")), sql("SELECT * FROM %s ORDER BY id", tableName)); @@ -105,10 +111,14 @@ public void testMigrateWithInvalidMetricsConfig() throws IOException { Assume.assumeTrue(catalogName.equals("spark_catalog")); String location = temp.newFolder().toString(); - sql("CREATE TABLE %s (id bigint NOT NULL, data string) USING parquet LOCATION '%s'", tableName, location); - - AssertHelpers.assertThrows("Should reject invalid metrics config", - ValidationException.class, "Invalid metrics config", + sql( + "CREATE TABLE %s (id bigint NOT NULL, data string) USING parquet LOCATION '%s'", + tableName, location); + + 
AssertHelpers.assertThrows( + "Should reject invalid metrics config", + ValidationException.class, + "Invalid metrics config", () -> { String props = "map('write.metadata.metrics.column.x', 'X')"; sql("CALL %s.system.migrate('%s', %s)", catalogName, tableName, props); @@ -120,13 +130,17 @@ public void testMigrateWithConflictingProps() throws IOException { Assume.assumeTrue(catalogName.equals("spark_catalog")); String location = temp.newFolder().toString(); - sql("CREATE TABLE %s (id bigint NOT NULL, data string) USING parquet LOCATION '%s'", tableName, location); + sql( + "CREATE TABLE %s (id bigint NOT NULL, data string) USING parquet LOCATION '%s'", + tableName, location); sql("INSERT INTO TABLE %s VALUES (1, 'a')", tableName); - Object result = scalarSql("CALL %s.system.migrate('%s', map('migrated', 'false'))", catalogName, tableName); + Object result = + scalarSql("CALL %s.system.migrate('%s', map('migrated', 'false'))", catalogName, tableName); Assert.assertEquals("Should have added one file", 1L, result); - assertEquals("Should have expected rows", + assertEquals( + "Should have expected rows", ImmutableList.of(row(1L, "a")), sql("SELECT * FROM %s", tableName)); @@ -136,16 +150,22 @@ public void testMigrateWithConflictingProps() throws IOException { @Test public void testInvalidMigrateCases() { - AssertHelpers.assertThrows("Should reject calls without all required args", - AnalysisException.class, "Missing required parameters", + AssertHelpers.assertThrows( + "Should reject calls without all required args", + AnalysisException.class, + "Missing required parameters", () -> sql("CALL %s.system.migrate()", catalogName)); - AssertHelpers.assertThrows("Should reject calls with invalid arg types", - AnalysisException.class, "Wrong arg type", + AssertHelpers.assertThrows( + "Should reject calls with invalid arg types", + AnalysisException.class, + "Wrong arg type", () -> sql("CALL %s.system.migrate(map('foo','bar'))", catalogName)); - AssertHelpers.assertThrows("Should reject calls with empty table identifier", - IllegalArgumentException.class, "Cannot handle an empty identifier", + AssertHelpers.assertThrows( + "Should reject calls with empty table identifier", + IllegalArgumentException.class, + "Cannot handle an empty identifier", () -> sql("CALL %s.system.migrate('')", catalogName)); } } diff --git a/spark/v3.2/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestPublishChangesProcedure.java b/spark/v3.2/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestPublishChangesProcedure.java index f8080818a1e3..2b74cd475fae 100644 --- a/spark/v3.2/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestPublishChangesProcedure.java +++ b/spark/v3.2/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestPublishChangesProcedure.java @@ -16,9 +16,10 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.spark.extensions; +import static org.apache.iceberg.TableProperties.WRITE_AUDIT_PUBLISH_ENABLED; + import java.util.List; import java.util.Map; import org.apache.iceberg.AssertHelpers; @@ -34,11 +35,10 @@ import org.junit.After; import org.junit.Test; -import static org.apache.iceberg.TableProperties.WRITE_AUDIT_PUBLISH_ENABLED; - public class TestPublishChangesProcedure extends SparkExtensionsTestBase { - public TestPublishChangesProcedure(String catalogName, String implementation, Map config) { + public TestPublishChangesProcedure( + String catalogName, String implementation, Map config) { super(catalogName, implementation, config); } @@ -57,26 +57,28 @@ public void testApplyWapChangesUsingPositionalArgs() { sql("INSERT INTO TABLE %s VALUES (1, 'a')", tableName); - assertEquals("Should not see rows from staged snapshot", + assertEquals( + "Should not see rows from staged snapshot", ImmutableList.of(), sql("SELECT * FROM %s", tableName)); Table table = validationCatalog.loadTable(tableIdent); Snapshot wapSnapshot = Iterables.getOnlyElement(table.snapshots()); - List output = sql( - "CALL %s.system.publish_changes('%s', '%s')", - catalogName, tableIdent, wapId); + List output = + sql("CALL %s.system.publish_changes('%s', '%s')", catalogName, tableIdent, wapId); table.refresh(); Snapshot currentSnapshot = table.currentSnapshot(); - assertEquals("Procedure output must match", + assertEquals( + "Procedure output must match", ImmutableList.of(row(wapSnapshot.snapshotId(), currentSnapshot.snapshotId())), output); - assertEquals("Apply of WAP changes must be successful", + assertEquals( + "Apply of WAP changes must be successful", ImmutableList.of(row(1L, "a")), sql("SELECT * FROM %s", tableName)); } @@ -91,26 +93,30 @@ public void testApplyWapChangesUsingNamedArgs() { sql("INSERT INTO TABLE %s VALUES (1, 'a')", tableName); - assertEquals("Should not see rows from staged snapshot", + assertEquals( + "Should not see rows from staged snapshot", ImmutableList.of(), sql("SELECT * FROM %s", tableName)); Table table = validationCatalog.loadTable(tableIdent); Snapshot wapSnapshot = Iterables.getOnlyElement(table.snapshots()); - List output = sql( - "CALL %s.system.publish_changes(wap_id => '%s', table => '%s')", - catalogName, wapId, tableIdent); + List output = + sql( + "CALL %s.system.publish_changes(wap_id => '%s', table => '%s')", + catalogName, wapId, tableIdent); table.refresh(); Snapshot currentSnapshot = table.currentSnapshot(); - assertEquals("Procedure output must match", + assertEquals( + "Procedure output must match", ImmutableList.of(row(wapSnapshot.snapshotId(), currentSnapshot.snapshotId())), output); - assertEquals("Apply of WAP changes must be successful", + assertEquals( + "Apply of WAP changes must be successful", ImmutableList.of(row(1L, "a")), sql("SELECT * FROM %s", tableName)); } @@ -132,14 +138,15 @@ public void testApplyWapChangesRefreshesRelationCache() { sql("INSERT INTO TABLE %s VALUES (1, 'a')", tableName); - assertEquals("Should not see rows from staged snapshot", + assertEquals( + "Should not see rows from staged snapshot", ImmutableList.of(), sql("SELECT * FROM %s", tableName)); - sql("CALL %s.system.publish_changes('%s', '%s')", - catalogName, tableIdent, wapId); + sql("CALL %s.system.publish_changes('%s', '%s')", catalogName, tableIdent, wapId); - assertEquals("Apply of WAP changes should be visible", + assertEquals( + "Apply of WAP changes should be visible", ImmutableList.of(row(1L, "a")), sql("SELECT * FROM tmp")); @@ -150,27 +157,37 
@@ public void testApplyWapChangesRefreshesRelationCache() { public void testApplyInvalidWapId() { sql("CREATE TABLE %s (id bigint NOT NULL, data string) USING iceberg", tableName); - AssertHelpers.assertThrows("Should reject invalid wap id", - ValidationException.class, "Cannot apply unknown WAP ID", + AssertHelpers.assertThrows( + "Should reject invalid wap id", + ValidationException.class, + "Cannot apply unknown WAP ID", () -> sql("CALL %s.system.publish_changes('%s', 'not_valid')", catalogName, tableIdent)); } @Test public void testInvalidApplyWapChangesCases() { - AssertHelpers.assertThrows("Should not allow mixed args", - AnalysisException.class, "Named and positional arguments cannot be mixed", + AssertHelpers.assertThrows( + "Should not allow mixed args", + AnalysisException.class, + "Named and positional arguments cannot be mixed", () -> sql("CALL %s.system.publish_changes('n', table => 't', 'not_valid')", catalogName)); - AssertHelpers.assertThrows("Should not resolve procedures in arbitrary namespaces", - NoSuchProcedureException.class, "not found", + AssertHelpers.assertThrows( + "Should not resolve procedures in arbitrary namespaces", + NoSuchProcedureException.class, + "not found", () -> sql("CALL %s.custom.publish_changes('n', 't', 'not_valid')", catalogName)); - AssertHelpers.assertThrows("Should reject calls without all required args", - AnalysisException.class, "Missing required parameters", + AssertHelpers.assertThrows( + "Should reject calls without all required args", + AnalysisException.class, + "Missing required parameters", () -> sql("CALL %s.system.publish_changes('t')", catalogName)); - AssertHelpers.assertThrows("Should reject calls with empty table identifier", - IllegalArgumentException.class, "Cannot handle an empty identifier", + AssertHelpers.assertThrows( + "Should reject calls with empty table identifier", + IllegalArgumentException.class, + "Cannot handle an empty identifier", () -> sql("CALL %s.system.publish_changes('', 'not_valid')", catalogName)); } } diff --git a/spark/v3.2/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestRegisterTableProcedure.java b/spark/v3.2/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestRegisterTableProcedure.java index d1f1905e098c..b1a1e30ec8d9 100644 --- a/spark/v3.2/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestRegisterTableProcedure.java +++ b/spark/v3.2/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestRegisterTableProcedure.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.extensions; import java.util.List; @@ -41,15 +40,12 @@ public class TestRegisterTableProcedure extends SparkExtensionsTestBase { private final String targetName; public TestRegisterTableProcedure( - String catalogName, - String implementation, - Map config) { + String catalogName, String implementation, Map config) { super(catalogName, implementation, config); targetName = tableName("register_table"); } - @Rule - public TemporaryFolder temp = new TemporaryFolder(); + @Rule public TemporaryFolder temp = new TemporaryFolder(); @After public void dropTables() { @@ -59,13 +55,15 @@ public void dropTables() { @Test public void testRegisterTable() throws NoSuchTableException, ParseException { - Assume.assumeTrue("Register only implemented on Hive Catalogs", + Assume.assumeTrue( + "Register only implemented on Hive Catalogs", spark.conf().get("spark.sql.catalog." 
+ catalogName + ".type").equals("hive")); long numRows = 1000; sql("CREATE TABLE %s (id int, data string) using ICEBERG", tableName); - spark.range(0, numRows) + spark + .range(0, numRows) .withColumn("data", functions.col("id").cast(DataTypes.StringType)) .writeTo(tableName) .append(); @@ -73,17 +71,22 @@ public void testRegisterTable() throws NoSuchTableException, ParseException { Table table = Spark3Util.loadIcebergTable(spark, tableName); long originalFileCount = (long) scalarSql("SELECT COUNT(*) from %s.files", tableName); long currentSnapshotId = table.currentSnapshot().snapshotId(); - String metadataJson = ((HiveTableOperations) (((HasTableOperations) table).operations())).currentMetadataLocation(); + String metadataJson = + ((HiveTableOperations) (((HasTableOperations) table).operations())) + .currentMetadataLocation(); - List result = sql("CALL %s.system.register_table('%s', '%s')", catalogName, targetName, metadataJson); + List result = + sql("CALL %s.system.register_table('%s', '%s')", catalogName, targetName, metadataJson); Assert.assertEquals("Current Snapshot is not correct", currentSnapshotId, result.get(0)[0]); List original = sql("SELECT * FROM %s", tableName); List registered = sql("SELECT * FROM %s", targetName); assertEquals("Registered table rows should match original table rows", original, registered); - Assert.assertEquals("Should have the right row count in the procedure result", - numRows, result.get(0)[1]); - Assert.assertEquals("Should have the right datafile count in the procedure result", - originalFileCount, result.get(0)[2]); + Assert.assertEquals( + "Should have the right row count in the procedure result", numRows, result.get(0)[1]); + Assert.assertEquals( + "Should have the right datafile count in the procedure result", + originalFileCount, + result.get(0)[2]); } } diff --git a/spark/v3.2/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestRemoveOrphanFilesProcedure.java b/spark/v3.2/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestRemoveOrphanFilesProcedure.java index 70c09cb30077..7a2d5c56ec7c 100644 --- a/spark/v3.2/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestRemoveOrphanFilesProcedure.java +++ b/spark/v3.2/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestRemoveOrphanFilesProcedure.java @@ -16,9 +16,11 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.spark.extensions; +import static org.apache.iceberg.TableProperties.GC_ENABLED; +import static org.apache.iceberg.TableProperties.WRITE_AUDIT_PUBLISH_ENABLED; + import java.io.IOException; import java.net.URI; import java.sql.Timestamp; @@ -55,15 +57,12 @@ import org.junit.Test; import org.junit.rules.TemporaryFolder; -import static org.apache.iceberg.TableProperties.GC_ENABLED; -import static org.apache.iceberg.TableProperties.WRITE_AUDIT_PUBLISH_ENABLED; - public class TestRemoveOrphanFilesProcedure extends SparkExtensionsTestBase { - @Rule - public TemporaryFolder temp = new TemporaryFolder(); + @Rule public TemporaryFolder temp = new TemporaryFolder(); - public TestRemoveOrphanFilesProcedure(String catalogName, String implementation, Map config) { + public TestRemoveOrphanFilesProcedure( + String catalogName, String implementation, Map config) { super(catalogName, implementation, config); } @@ -77,14 +76,11 @@ public void removeTable() { public void testRemoveOrphanFilesInEmptyTable() { sql("CREATE TABLE %s (id bigint NOT NULL, data string) USING iceberg", tableName); - List output = sql( - "CALL %s.system.remove_orphan_files('%s')", - catalogName, tableIdent); + List output = + sql("CALL %s.system.remove_orphan_files('%s')", catalogName, tableIdent); assertEquals("Should be no orphan files", ImmutableList.of(), output); - assertEquals("Should have no rows", - ImmutableList.of(), - sql("SELECT * FROM %s", tableName)); + assertEquals("Should have no rows", ImmutableList.of(), sql("SELECT * FROM %s", tableName)); } @Test @@ -94,7 +90,8 @@ public void testRemoveOrphanFilesInDataFolder() throws IOException { } else { // give a fresh location to Hive tables as Spark will not clean up the table location // correctly while dropping tables through spark_catalog - sql("CREATE TABLE %s (id bigint NOT NULL, data string) USING iceberg LOCATION '%s'", + sql( + "CREATE TABLE %s (id bigint NOT NULL, data string) USING iceberg LOCATION '%s'", tableName, temp.newFolder()); } @@ -116,31 +113,35 @@ public void testRemoveOrphanFilesInDataFolder() throws IOException { Timestamp currentTimestamp = Timestamp.from(Instant.ofEpochMilli(System.currentTimeMillis())); // check for orphans in the metadata folder - List output1 = sql( - "CALL %s.system.remove_orphan_files(" + - "table => '%s'," + - "older_than => TIMESTAMP '%s'," + - "location => '%s')", - catalogName, tableIdent, currentTimestamp, metadataLocation); + List output1 = + sql( + "CALL %s.system.remove_orphan_files(" + + "table => '%s'," + + "older_than => TIMESTAMP '%s'," + + "location => '%s')", + catalogName, tableIdent, currentTimestamp, metadataLocation); assertEquals("Should be no orphan files in the metadata folder", ImmutableList.of(), output1); // check for orphans in the table location - List output2 = sql( - "CALL %s.system.remove_orphan_files(" + - "table => '%s'," + - "older_than => TIMESTAMP '%s')", - catalogName, tableIdent, currentTimestamp); + List output2 = + sql( + "CALL %s.system.remove_orphan_files(" + + "table => '%s'," + + "older_than => TIMESTAMP '%s')", + catalogName, tableIdent, currentTimestamp); Assert.assertEquals("Should be orphan files in the data folder", 1, output2.size()); // the previous call should have deleted all orphan files - List output3 = sql( - "CALL %s.system.remove_orphan_files(" + - "table => '%s'," + - "older_than => TIMESTAMP '%s')", - catalogName, tableIdent, currentTimestamp); + List output3 = + sql( + "CALL %s.system.remove_orphan_files(" + + "table => '%s'," + + 
"older_than => TIMESTAMP '%s')", + catalogName, tableIdent, currentTimestamp); Assert.assertEquals("Should be no more orphan files in the data folder", 0, output3.size()); - assertEquals("Should have expected rows", + assertEquals( + "Should have expected rows", ImmutableList.of(row(1L, "a"), row(2L, "b")), sql("SELECT * FROM %s ORDER BY id", tableName)); } @@ -152,7 +153,8 @@ public void testRemoveOrphanFilesDryRun() throws IOException { } else { // give a fresh location to Hive tables as Spark will not clean up the table location // correctly while dropping tables through spark_catalog - sql("CREATE TABLE %s (id bigint NOT NULL, data string) USING iceberg LOCATION '%s'", + sql( + "CREATE TABLE %s (id bigint NOT NULL, data string) USING iceberg LOCATION '%s'", tableName, temp.newFolder()); } @@ -171,31 +173,35 @@ public void testRemoveOrphanFilesDryRun() throws IOException { Timestamp currentTimestamp = Timestamp.from(Instant.ofEpochMilli(System.currentTimeMillis())); // check for orphans without deleting - List output1 = sql( - "CALL %s.system.remove_orphan_files(" + - "table => '%s'," + - "older_than => TIMESTAMP '%s'," + - "dry_run => true)", - catalogName, tableIdent, currentTimestamp); + List output1 = + sql( + "CALL %s.system.remove_orphan_files(" + + "table => '%s'," + + "older_than => TIMESTAMP '%s'," + + "dry_run => true)", + catalogName, tableIdent, currentTimestamp); Assert.assertEquals("Should be one orphan files", 1, output1.size()); // actually delete orphans - List output2 = sql( - "CALL %s.system.remove_orphan_files(" + - "table => '%s'," + - "older_than => TIMESTAMP '%s')", - catalogName, tableIdent, currentTimestamp); + List output2 = + sql( + "CALL %s.system.remove_orphan_files(" + + "table => '%s'," + + "older_than => TIMESTAMP '%s')", + catalogName, tableIdent, currentTimestamp); Assert.assertEquals("Should be one orphan files", 1, output2.size()); // the previous call should have deleted all orphan files - List output3 = sql( - "CALL %s.system.remove_orphan_files(" + - "table => '%s'," + - "older_than => TIMESTAMP '%s')", - catalogName, tableIdent, currentTimestamp); + List output3 = + sql( + "CALL %s.system.remove_orphan_files(" + + "table => '%s'," + + "older_than => TIMESTAMP '%s')", + catalogName, tableIdent, currentTimestamp); Assert.assertEquals("Should be no more orphan files", 0, output3.size()); - assertEquals("Should have expected rows", + assertEquals( + "Should have expected rows", ImmutableList.of(row(1L, "a"), row(2L, "b")), sql("SELECT * FROM %s ORDER BY id", tableName)); } @@ -206,8 +212,10 @@ public void testRemoveOrphanFilesGCDisabled() { sql("ALTER TABLE %s SET TBLPROPERTIES ('%s' 'false')", tableName, GC_ENABLED); - AssertHelpers.assertThrows("Should reject call", - ValidationException.class, "Cannot delete orphan files: GC is disabled", + AssertHelpers.assertThrows( + "Should reject call", + ValidationException.class, + "Cannot delete orphan files: GC is disabled", () -> sql("CALL %s.system.remove_orphan_files('%s')", catalogName, tableIdent)); // reset the property to enable the table purging in removeTable. 
@@ -223,35 +231,46 @@ public void testRemoveOrphanFilesWap() { sql("INSERT INTO TABLE %s VALUES (1, 'a')", tableName); - assertEquals("Should not see rows from staged snapshot", + assertEquals( + "Should not see rows from staged snapshot", ImmutableList.of(), sql("SELECT * FROM %s", tableName)); - List output = sql( - "CALL %s.system.remove_orphan_files('%s')", catalogName, tableIdent); + List output = + sql("CALL %s.system.remove_orphan_files('%s')", catalogName, tableIdent); assertEquals("Should be no orphan files", ImmutableList.of(), output); } @Test public void testInvalidRemoveOrphanFilesCases() { - AssertHelpers.assertThrows("Should not allow mixed args", - AnalysisException.class, "Named and positional arguments cannot be mixed", + AssertHelpers.assertThrows( + "Should not allow mixed args", + AnalysisException.class, + "Named and positional arguments cannot be mixed", () -> sql("CALL %s.system.remove_orphan_files('n', table => 't')", catalogName)); - AssertHelpers.assertThrows("Should not resolve procedures in arbitrary namespaces", - NoSuchProcedureException.class, "not found", + AssertHelpers.assertThrows( + "Should not resolve procedures in arbitrary namespaces", + NoSuchProcedureException.class, + "not found", () -> sql("CALL %s.custom.remove_orphan_files('n', 't')", catalogName)); - AssertHelpers.assertThrows("Should reject calls without all required args", - AnalysisException.class, "Missing required parameters", + AssertHelpers.assertThrows( + "Should reject calls without all required args", + AnalysisException.class, + "Missing required parameters", () -> sql("CALL %s.system.remove_orphan_files()", catalogName)); - AssertHelpers.assertThrows("Should reject calls with invalid arg types", - AnalysisException.class, "Wrong arg type", + AssertHelpers.assertThrows( + "Should reject calls with invalid arg types", + AnalysisException.class, + "Wrong arg type", () -> sql("CALL %s.system.remove_orphan_files('n', 2.2)", catalogName)); - AssertHelpers.assertThrows("Should reject calls with empty table identifier", - IllegalArgumentException.class, "Cannot handle an empty identifier", + AssertHelpers.assertThrows( + "Should reject calls with empty table identifier", + IllegalArgumentException.class, + "Cannot handle an empty identifier", () -> sql("CALL %s.system.remove_orphan_files('')", catalogName)); } @@ -262,7 +281,8 @@ public void testConcurrentRemoveOrphanFiles() throws IOException { } else { // give a fresh location to Hive tables as Spark will not clean up the table location // correctly while dropping tables through spark_catalog - sql("CREATE TABLE %s (id bigint NOT NULL, data string) USING iceberg LOCATION '%s'", + sql( + "CREATE TABLE %s (id bigint NOT NULL, data string) USING iceberg LOCATION '%s'", tableName, temp.newFolder()); } @@ -287,21 +307,23 @@ public void testConcurrentRemoveOrphanFiles() throws IOException { Timestamp currentTimestamp = Timestamp.from(Instant.ofEpochMilli(System.currentTimeMillis())); // check for orphans in the table location - List output = sql( - "CALL %s.system.remove_orphan_files(" + - "table => '%s'," + - "max_concurrent_deletes => %s," + - "older_than => TIMESTAMP '%s')", - catalogName, tableIdent, 4, currentTimestamp); + List output = + sql( + "CALL %s.system.remove_orphan_files(" + + "table => '%s'," + + "max_concurrent_deletes => %s," + + "older_than => TIMESTAMP '%s')", + catalogName, tableIdent, 4, currentTimestamp); Assert.assertEquals("Should be orphan files in the data folder", 4, output.size()); // the previous call should have 
deleted all orphan files - List output3 = sql( - "CALL %s.system.remove_orphan_files(" + - "table => '%s'," + - "max_concurrent_deletes => %s," + - "older_than => TIMESTAMP '%s')", - catalogName, tableIdent, 4, currentTimestamp); + List output3 = + sql( + "CALL %s.system.remove_orphan_files(" + + "table => '%s'," + + "max_concurrent_deletes => %s," + + "older_than => TIMESTAMP '%s')", + catalogName, tableIdent, 4, currentTimestamp); Assert.assertEquals("Should be no more orphan files in the data folder", 0, output3.size()); assertEquals( @@ -314,16 +336,23 @@ public void testConcurrentRemoveOrphanFiles() throws IOException { public void testConcurrentRemoveOrphanFilesWithInvalidInput() { sql("CREATE TABLE %s (id bigint NOT NULL, data string) USING iceberg", tableName); - AssertHelpers.assertThrows("Should throw an error when max_concurrent_deletes = 0", - IllegalArgumentException.class, "max_concurrent_deletes should have value > 0", - () -> sql("CALL %s.system.remove_orphan_files(table => '%s', max_concurrent_deletes => %s)", - catalogName, tableIdent, 0)); + AssertHelpers.assertThrows( + "Should throw an error when max_concurrent_deletes = 0", + IllegalArgumentException.class, + "max_concurrent_deletes should have value > 0", + () -> + sql( + "CALL %s.system.remove_orphan_files(table => '%s', max_concurrent_deletes => %s)", + catalogName, tableIdent, 0)); - AssertHelpers.assertThrows("Should throw an error when max_concurrent_deletes < 0 ", - IllegalArgumentException.class, "max_concurrent_deletes should have value > 0", - () -> sql( - "CALL %s.system.remove_orphan_files(table => '%s', max_concurrent_deletes => %s)", - catalogName, tableIdent, -1)); + AssertHelpers.assertThrows( + "Should throw an error when max_concurrent_deletes < 0 ", + IllegalArgumentException.class, + "max_concurrent_deletes should have value > 0", + () -> + sql( + "CALL %s.system.remove_orphan_files(table => '%s', max_concurrent_deletes => %s)", + catalogName, tableIdent, -1)); String tempViewName = "file_list_test"; spark.emptyDataFrame().createOrReplaceTempView(tempViewName); @@ -368,34 +397,43 @@ public void testConcurrentRemoveOrphanFilesWithInvalidInput() { @Test public void testRemoveOrphanFilesWithDeleteFiles() throws Exception { - sql("CREATE TABLE %s (id int, data string) USING iceberg TBLPROPERTIES" + - "('format-version'='2', 'write.delete.mode'='merge-on-read')", tableName); - - List records = Lists.newArrayList( - new SimpleRecord(1, "a"), - new SimpleRecord(2, "b"), - new SimpleRecord(3, "c"), - new SimpleRecord(4, "d") - ); - spark.createDataset(records, Encoders.bean(SimpleRecord.class)).coalesce(1).writeTo(tableName).append(); + sql( + "CREATE TABLE %s (id int, data string) USING iceberg TBLPROPERTIES" + + "('format-version'='2', 'write.delete.mode'='merge-on-read')", + tableName); + + List records = + Lists.newArrayList( + new SimpleRecord(1, "a"), + new SimpleRecord(2, "b"), + new SimpleRecord(3, "c"), + new SimpleRecord(4, "d")); + spark + .createDataset(records, Encoders.bean(SimpleRecord.class)) + .coalesce(1) + .writeTo(tableName) + .append(); sql("DELETE FROM %s WHERE id=1", tableName); Table table = Spark3Util.loadIcebergTable(spark, tableName); - Assert.assertEquals("Should have 1 delete manifest", 1, TestHelpers.deleteManifests(table).size()); + Assert.assertEquals( + "Should have 1 delete manifest", 1, TestHelpers.deleteManifests(table).size()); Assert.assertEquals("Should have 1 delete file", 1, TestHelpers.deleteFiles(table).size()); Path deleteManifestPath = new 
Path(TestHelpers.deleteManifests(table).iterator().next().path()); - Path deleteFilePath = new Path(String.valueOf(TestHelpers.deleteFiles(table).iterator().next().path())); + Path deleteFilePath = + new Path(String.valueOf(TestHelpers.deleteFiles(table).iterator().next().path())); // wait to ensure files are old enough waitUntilAfter(System.currentTimeMillis()); Timestamp currentTimestamp = Timestamp.from(Instant.ofEpochMilli(System.currentTimeMillis())); // delete orphans - List output = sql( - "CALL %s.system.remove_orphan_files(" + - "table => '%s'," + - "older_than => TIMESTAMP '%s')", - catalogName, tableIdent, currentTimestamp); + List output = + sql( + "CALL %s.system.remove_orphan_files(" + + "table => '%s'," + + "older_than => TIMESTAMP '%s')", + catalogName, tableIdent, currentTimestamp); Assert.assertEquals("Should be no orphan files", 0, output.size()); FileSystem localFs = FileSystem.getLocal(new Configuration()); @@ -404,19 +442,20 @@ public void testRemoveOrphanFilesWithDeleteFiles() throws Exception { records.remove(new SimpleRecord(1, "a")); Dataset resultDF = spark.read().format("iceberg").load(tableName); - List actualRecords = resultDF - .as(Encoders.bean(SimpleRecord.class)) - .collectAsList(); + List actualRecords = + resultDF.as(Encoders.bean(SimpleRecord.class)).collectAsList(); Assert.assertEquals("Rows must match", records, actualRecords); } @Test - public void testRemoveOrphanFilesProcedureWithPrefixMode() throws NoSuchTableException, ParseException, IOException { + public void testRemoveOrphanFilesProcedureWithPrefixMode() + throws NoSuchTableException, ParseException, IOException { if (catalogName.equals("testhadoop")) { sql("CREATE TABLE %s (id bigint NOT NULL, data string) USING iceberg", tableName); } else { - sql("CREATE TABLE %s (id bigint NOT NULL, data string) USING iceberg LOCATION '%s'", tableName, - temp.newFolder().toURI().toString()); + sql( + "CREATE TABLE %s (id bigint NOT NULL, data string) USING iceberg LOCATION '%s'", + tableName, temp.newFolder().toURI().toString()); } Table table = Spark3Util.loadIcebergTable(spark, tableName); String location = table.location(); @@ -425,35 +464,31 @@ public void testRemoveOrphanFilesProcedureWithPrefixMode() throws NoSuchTableExc URI uri = originalPath.toUri(); Path newParentPath = new Path("file1", uri.getAuthority(), uri.getPath()); - DataFile dataFile1 = DataFiles.builder(PartitionSpec.unpartitioned()) - .withPath(new Path(newParentPath, "path/to/data-a.parquet").toString()) - .withFileSizeInBytes(10) - .withRecordCount(1) - .build(); - DataFile dataFile2 = DataFiles.builder(PartitionSpec.unpartitioned()) - .withPath(new Path(newParentPath, "path/to/data-b.parquet").toString()) - .withFileSizeInBytes(10) - .withRecordCount(1) - .build(); - - table.newFastAppend() - .appendFile(dataFile1) - .appendFile(dataFile2) - .commit(); - + DataFile dataFile1 = + DataFiles.builder(PartitionSpec.unpartitioned()) + .withPath(new Path(newParentPath, "path/to/data-a.parquet").toString()) + .withFileSizeInBytes(10) + .withRecordCount(1) + .build(); + DataFile dataFile2 = + DataFiles.builder(PartitionSpec.unpartitioned()) + .withPath(new Path(newParentPath, "path/to/data-b.parquet").toString()) + .withFileSizeInBytes(10) + .withRecordCount(1) + .build(); + + table.newFastAppend().appendFile(dataFile1).appendFile(dataFile2).commit(); Timestamp lastModifiedTimestamp = new Timestamp(10000); - List allFiles = Lists.newArrayList( - new FilePathLastModifiedRecord( - new Path(originalPath, "path/to/data-a.parquet").toString(), - 
lastModifiedTimestamp), - new FilePathLastModifiedRecord( - new Path(originalPath, "path/to/data-b.parquet").toString(), - lastModifiedTimestamp), - new FilePathLastModifiedRecord( - ReachableFileUtil.versionHintLocation(table), - lastModifiedTimestamp)); + List allFiles = + Lists.newArrayList( + new FilePathLastModifiedRecord( + new Path(originalPath, "path/to/data-a.parquet").toString(), lastModifiedTimestamp), + new FilePathLastModifiedRecord( + new Path(originalPath, "path/to/data-b.parquet").toString(), lastModifiedTimestamp), + new FilePathLastModifiedRecord( + ReachableFileUtil.versionHintLocation(table), lastModifiedTimestamp)); for (String file : ReachableFileUtil.metadataFileLocations(table, true)) { allFiles.add(new FilePathLastModifiedRecord(file, lastModifiedTimestamp)); @@ -463,26 +498,32 @@ public void testRemoveOrphanFilesProcedureWithPrefixMode() throws NoSuchTableExc allFiles.add(new FilePathLastModifiedRecord(manifest.path(), lastModifiedTimestamp)); } - Dataset compareToFileList = spark.createDataFrame(allFiles, - FilePathLastModifiedRecord.class).withColumnRenamed("filePath", "file_path") + Dataset compareToFileList = + spark + .createDataFrame(allFiles, FilePathLastModifiedRecord.class) + .withColumnRenamed("filePath", "file_path") .withColumnRenamed("lastModified", "last_modified"); String fileListViewName = "files_view"; compareToFileList.createOrReplaceTempView(fileListViewName); - List orphanFiles = sql( - "CALL %s.system.remove_orphan_files(" + - "table => '%s'," + - "equal_schemes => map('file1', 'file')," + - "file_list_view => '%s')", + List orphanFiles = + sql( + "CALL %s.system.remove_orphan_files(" + + "table => '%s'," + + "equal_schemes => map('file1', 'file')," + + "file_list_view => '%s')", catalogName, tableIdent, fileListViewName); Assert.assertEquals(0, orphanFiles.size()); // Test with no equal schemes - AssertHelpers.assertThrows("Should complain about removing orphan files", - ValidationException.class, "Conflicting authorities/schemes: [(file1, file)]", - () -> sql( - "CALL %s.system.remove_orphan_files(" + - "table => '%s'," + - "file_list_view => '%s')", + AssertHelpers.assertThrows( + "Should complain about removing orphan files", + ValidationException.class, + "Conflicting authorities/schemes: [(file1, file)]", + () -> + sql( + "CALL %s.system.remove_orphan_files(" + + "table => '%s'," + + "file_list_view => '%s')", catalogName, tableIdent, fileListViewName)); // Drop table in afterEach has purge and fails due to invalid scheme "file1" used in this test diff --git a/spark/v3.2/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestRequiredDistributionAndOrdering.java b/spark/v3.2/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestRequiredDistributionAndOrdering.java index 1d3ccc766288..cfa7f6622aed 100644 --- a/spark/v3.2/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestRequiredDistributionAndOrdering.java +++ b/spark/v3.2/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestRequiredDistributionAndOrdering.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.spark.extensions; import java.math.BigDecimal; @@ -35,7 +34,8 @@ public class TestRequiredDistributionAndOrdering extends SparkExtensionsTestBase { - public TestRequiredDistributionAndOrdering(String catalogName, String implementation, Map config) { + public TestRequiredDistributionAndOrdering( + String catalogName, String implementation, Map config) { super(catalogName, implementation, config); } @@ -46,45 +46,50 @@ public void dropTestTable() { @Test public void testDefaultLocalSortWithBucketTransforms() throws NoSuchTableException { - sql("CREATE TABLE %s (c1 INT, c2 STRING, c3 STRING) " + - "USING iceberg " + - "PARTITIONED BY (bucket(2, c1))", tableName); - - List data = ImmutableList.of( - new ThreeColumnRecord(1, null, "A"), - new ThreeColumnRecord(2, "BBBBBBBBBB", "B"), - new ThreeColumnRecord(3, "BBBBBBBBBB", "A"), - new ThreeColumnRecord(4, "BBBBBBBBBB", "B"), - new ThreeColumnRecord(5, "BBBBBBBBBB", "A"), - new ThreeColumnRecord(6, "BBBBBBBBBB", "B"), - new ThreeColumnRecord(7, "BBBBBBBBBB", "A") - ); + sql( + "CREATE TABLE %s (c1 INT, c2 STRING, c3 STRING) " + + "USING iceberg " + + "PARTITIONED BY (bucket(2, c1))", + tableName); + + List data = + ImmutableList.of( + new ThreeColumnRecord(1, null, "A"), + new ThreeColumnRecord(2, "BBBBBBBBBB", "B"), + new ThreeColumnRecord(3, "BBBBBBBBBB", "A"), + new ThreeColumnRecord(4, "BBBBBBBBBB", "B"), + new ThreeColumnRecord(5, "BBBBBBBBBB", "A"), + new ThreeColumnRecord(6, "BBBBBBBBBB", "B"), + new ThreeColumnRecord(7, "BBBBBBBBBB", "A")); Dataset ds = spark.createDataFrame(data, ThreeColumnRecord.class); Dataset inputDF = ds.coalesce(1).sortWithinPartitions("c1"); // should insert a local sort by partition columns by default inputDF.writeTo(tableName).append(); - assertEquals("Row count must match", + assertEquals( + "Row count must match", ImmutableList.of(row(7L)), sql("SELECT count(*) FROM %s", tableName)); } @Test public void testPartitionColumnsArePrependedForRangeDistribution() throws NoSuchTableException { - sql("CREATE TABLE %s (c1 INT, c2 STRING, c3 STRING) " + - "USING iceberg " + - "PARTITIONED BY (bucket(2, c1))", tableName); - - List data = ImmutableList.of( - new ThreeColumnRecord(1, null, "A"), - new ThreeColumnRecord(2, "BBBBBBBBBB", "B"), - new ThreeColumnRecord(3, "BBBBBBBBBB", "A"), - new ThreeColumnRecord(4, "BBBBBBBBBB", "B"), - new ThreeColumnRecord(5, "BBBBBBBBBB", "A"), - new ThreeColumnRecord(6, "BBBBBBBBBB", "B"), - new ThreeColumnRecord(7, "BBBBBBBBBB", "A") - ); + sql( + "CREATE TABLE %s (c1 INT, c2 STRING, c3 STRING) " + + "USING iceberg " + + "PARTITIONED BY (bucket(2, c1))", + tableName); + + List data = + ImmutableList.of( + new ThreeColumnRecord(1, null, "A"), + new ThreeColumnRecord(2, "BBBBBBBBBB", "B"), + new ThreeColumnRecord(3, "BBBBBBBBBB", "A"), + new ThreeColumnRecord(4, "BBBBBBBBBB", "B"), + new ThreeColumnRecord(5, "BBBBBBBBBB", "A"), + new ThreeColumnRecord(6, "BBBBBBBBBB", "B"), + new ThreeColumnRecord(7, "BBBBBBBBBB", "A")); Dataset ds = spark.createDataFrame(data, ThreeColumnRecord.class); Dataset inputDF = ds.coalesce(1).sortWithinPartitions("c1"); @@ -93,26 +98,29 @@ public void testPartitionColumnsArePrependedForRangeDistribution() throws NoSuch inputDF.writeTo(tableName).append(); - assertEquals("Row count must match", + assertEquals( + "Row count must match", ImmutableList.of(row(7L)), sql("SELECT count(*) FROM %s", tableName)); } @Test public void testSortOrderIncludesPartitionColumns() throws NoSuchTableException { - sql("CREATE TABLE %s (c1 INT, c2 
STRING, c3 STRING) " + - "USING iceberg " + - "PARTITIONED BY (bucket(2, c1))", tableName); - - List data = ImmutableList.of( - new ThreeColumnRecord(1, null, "A"), - new ThreeColumnRecord(2, "BBBBBBBBBB", "B"), - new ThreeColumnRecord(3, "BBBBBBBBBB", "A"), - new ThreeColumnRecord(4, "BBBBBBBBBB", "B"), - new ThreeColumnRecord(5, "BBBBBBBBBB", "A"), - new ThreeColumnRecord(6, "BBBBBBBBBB", "B"), - new ThreeColumnRecord(7, "BBBBBBBBBB", "A") - ); + sql( + "CREATE TABLE %s (c1 INT, c2 STRING, c3 STRING) " + + "USING iceberg " + + "PARTITIONED BY (bucket(2, c1))", + tableName); + + List data = + ImmutableList.of( + new ThreeColumnRecord(1, null, "A"), + new ThreeColumnRecord(2, "BBBBBBBBBB", "B"), + new ThreeColumnRecord(3, "BBBBBBBBBB", "A"), + new ThreeColumnRecord(4, "BBBBBBBBBB", "B"), + new ThreeColumnRecord(5, "BBBBBBBBBB", "A"), + new ThreeColumnRecord(6, "BBBBBBBBBB", "B"), + new ThreeColumnRecord(7, "BBBBBBBBBB", "A")); Dataset ds = spark.createDataFrame(data, ThreeColumnRecord.class); Dataset inputDF = ds.coalesce(1).sortWithinPartitions("c1"); @@ -121,26 +129,29 @@ public void testSortOrderIncludesPartitionColumns() throws NoSuchTableException inputDF.writeTo(tableName).append(); - assertEquals("Row count must match", + assertEquals( + "Row count must match", ImmutableList.of(row(7L)), sql("SELECT count(*) FROM %s", tableName)); } @Test public void testHashDistributionOnBucketedColumn() throws NoSuchTableException { - sql("CREATE TABLE %s (c1 INT, c2 STRING, c3 STRING) " + - "USING iceberg " + - "PARTITIONED BY (bucket(2, c1))", tableName); - - List data = ImmutableList.of( - new ThreeColumnRecord(1, null, "A"), - new ThreeColumnRecord(2, "BBBBBBBBBB", "B"), - new ThreeColumnRecord(3, "BBBBBBBBBB", "A"), - new ThreeColumnRecord(4, "BBBBBBBBBB", "B"), - new ThreeColumnRecord(5, "BBBBBBBBBB", "A"), - new ThreeColumnRecord(6, "BBBBBBBBBB", "B"), - new ThreeColumnRecord(7, "BBBBBBBBBB", "A") - ); + sql( + "CREATE TABLE %s (c1 INT, c2 STRING, c3 STRING) " + + "USING iceberg " + + "PARTITIONED BY (bucket(2, c1))", + tableName); + + List data = + ImmutableList.of( + new ThreeColumnRecord(1, null, "A"), + new ThreeColumnRecord(2, "BBBBBBBBBB", "B"), + new ThreeColumnRecord(3, "BBBBBBBBBB", "A"), + new ThreeColumnRecord(4, "BBBBBBBBBB", "B"), + new ThreeColumnRecord(5, "BBBBBBBBBB", "A"), + new ThreeColumnRecord(6, "BBBBBBBBBB", "B"), + new ThreeColumnRecord(7, "BBBBBBBBBB", "A")); Dataset ds = spark.createDataFrame(data, ThreeColumnRecord.class); Dataset inputDF = ds.coalesce(1).sortWithinPartitions("c1"); @@ -149,35 +160,41 @@ public void testHashDistributionOnBucketedColumn() throws NoSuchTableException { inputDF.writeTo(tableName).append(); - assertEquals("Row count must match", + assertEquals( + "Row count must match", ImmutableList.of(row(7L)), sql("SELECT count(*) FROM %s", tableName)); } @Test public void testDisabledDistributionAndOrdering() { - sql("CREATE TABLE %s (c1 INT, c2 STRING, c3 STRING) " + - "USING iceberg " + - "PARTITIONED BY (bucket(2, c1))", tableName); - - List data = ImmutableList.of( - new ThreeColumnRecord(1, null, "A"), - new ThreeColumnRecord(2, "BBBBBBBBBB", "B"), - new ThreeColumnRecord(3, "BBBBBBBBBB", "A"), - new ThreeColumnRecord(4, "BBBBBBBBBB", "B"), - new ThreeColumnRecord(5, "BBBBBBBBBB", "A"), - new ThreeColumnRecord(6, "BBBBBBBBBB", "B"), - new ThreeColumnRecord(7, "BBBBBBBBBB", "A") - ); + sql( + "CREATE TABLE %s (c1 INT, c2 STRING, c3 STRING) " + + "USING iceberg " + + "PARTITIONED BY (bucket(2, c1))", + tableName); + + List data = + 
ImmutableList.of( + new ThreeColumnRecord(1, null, "A"), + new ThreeColumnRecord(2, "BBBBBBBBBB", "B"), + new ThreeColumnRecord(3, "BBBBBBBBBB", "A"), + new ThreeColumnRecord(4, "BBBBBBBBBB", "B"), + new ThreeColumnRecord(5, "BBBBBBBBBB", "A"), + new ThreeColumnRecord(6, "BBBBBBBBBB", "B"), + new ThreeColumnRecord(7, "BBBBBBBBBB", "A")); Dataset ds = spark.createDataFrame(data, ThreeColumnRecord.class); Dataset inputDF = ds.coalesce(1).sortWithinPartitions("c1"); // should fail if ordering is disabled - AssertHelpers.assertThrows("Should reject writes without ordering", - SparkException.class, "Writing job aborted", + AssertHelpers.assertThrows( + "Should reject writes without ordering", + SparkException.class, + "Writing job aborted", () -> { try { - inputDF.writeTo(tableName) + inputDF + .writeTo(tableName) .option(SparkWriteOptions.USE_TABLE_DISTRIBUTION_AND_ORDERING, "false") .append(); } catch (NoSuchTableException e) { @@ -188,92 +205,96 @@ public void testDisabledDistributionAndOrdering() { @Test public void testDefaultSortOnDecimalBucketedColumn() { - sql("CREATE TABLE %s (c1 INT, c2 DECIMAL(20, 2)) " + - "USING iceberg " + - "PARTITIONED BY (bucket(2, c2))", tableName); + sql( + "CREATE TABLE %s (c1 INT, c2 DECIMAL(20, 2)) " + + "USING iceberg " + + "PARTITIONED BY (bucket(2, c2))", + tableName); sql("INSERT INTO %s VALUES (1, 20.2), (2, 40.2), (3, 60.2)", tableName); - List expected = ImmutableList.of( - row(1, new BigDecimal("20.20")), - row(2, new BigDecimal("40.20")), - row(3, new BigDecimal("60.20")) - ); + List expected = + ImmutableList.of( + row(1, new BigDecimal("20.20")), + row(2, new BigDecimal("40.20")), + row(3, new BigDecimal("60.20"))); assertEquals("Rows must match", expected, sql("SELECT * FROM %s ORDER BY c1", tableName)); } @Test public void testDefaultSortOnStringBucketedColumn() { - sql("CREATE TABLE %s (c1 INT, c2 STRING) " + - "USING iceberg " + - "PARTITIONED BY (bucket(2, c2))", tableName); + sql( + "CREATE TABLE %s (c1 INT, c2 STRING) " + + "USING iceberg " + + "PARTITIONED BY (bucket(2, c2))", + tableName); sql("INSERT INTO %s VALUES (1, 'A'), (2, 'B')", tableName); - List expected = ImmutableList.of( - row(1, "A"), - row(2, "B") - ); + List expected = ImmutableList.of(row(1, "A"), row(2, "B")); assertEquals("Rows must match", expected, sql("SELECT * FROM %s ORDER BY c1", tableName)); } @Test public void testDefaultSortOnDecimalTruncatedColumn() { - sql("CREATE TABLE %s (c1 INT, c2 DECIMAL(20, 2)) " + - "USING iceberg " + - "PARTITIONED BY (truncate(2, c2))", tableName); + sql( + "CREATE TABLE %s (c1 INT, c2 DECIMAL(20, 2)) " + + "USING iceberg " + + "PARTITIONED BY (truncate(2, c2))", + tableName); sql("INSERT INTO %s VALUES (1, 20.2), (2, 40.2)", tableName); - List expected = ImmutableList.of( - row(1, new BigDecimal("20.20")), - row(2, new BigDecimal("40.20")) - ); + List expected = + ImmutableList.of(row(1, new BigDecimal("20.20")), row(2, new BigDecimal("40.20"))); assertEquals("Rows must match", expected, sql("SELECT * FROM %s ORDER BY c1", tableName)); } @Test public void testDefaultSortOnLongTruncatedColumn() { - sql("CREATE TABLE %s (c1 INT, c2 BIGINT) " + - "USING iceberg " + - "PARTITIONED BY (truncate(2, c2))", tableName); + sql( + "CREATE TABLE %s (c1 INT, c2 BIGINT) " + + "USING iceberg " + + "PARTITIONED BY (truncate(2, c2))", + tableName); sql("INSERT INTO %s VALUES (1, 22222222222222), (2, 444444444444)", tableName); - List expected = ImmutableList.of( - row(1, 22222222222222L), - row(2, 444444444444L) - ); + List expected = 
ImmutableList.of(row(1, 22222222222222L), row(2, 444444444444L)); assertEquals("Rows must match", expected, sql("SELECT * FROM %s ORDER BY c1", tableName)); } @Test public void testRangeDistributionWithQuotedColumnNames() throws NoSuchTableException { - sql("CREATE TABLE %s (`c.1` INT, c2 STRING, c3 STRING) " + - "USING iceberg " + - "PARTITIONED BY (bucket(2, `c.1`))", tableName); - - List data = ImmutableList.of( - new ThreeColumnRecord(1, null, "A"), - new ThreeColumnRecord(2, "BBBBBBBBBB", "B"), - new ThreeColumnRecord(3, "BBBBBBBBBB", "A"), - new ThreeColumnRecord(4, "BBBBBBBBBB", "B"), - new ThreeColumnRecord(5, "BBBBBBBBBB", "A"), - new ThreeColumnRecord(6, "BBBBBBBBBB", "B"), - new ThreeColumnRecord(7, "BBBBBBBBBB", "A") - ); + sql( + "CREATE TABLE %s (`c.1` INT, c2 STRING, c3 STRING) " + + "USING iceberg " + + "PARTITIONED BY (bucket(2, `c.1`))", + tableName); + + List data = + ImmutableList.of( + new ThreeColumnRecord(1, null, "A"), + new ThreeColumnRecord(2, "BBBBBBBBBB", "B"), + new ThreeColumnRecord(3, "BBBBBBBBBB", "A"), + new ThreeColumnRecord(4, "BBBBBBBBBB", "B"), + new ThreeColumnRecord(5, "BBBBBBBBBB", "A"), + new ThreeColumnRecord(6, "BBBBBBBBBB", "B"), + new ThreeColumnRecord(7, "BBBBBBBBBB", "A")); Dataset ds = spark.createDataFrame(data, ThreeColumnRecord.class); - Dataset inputDF = ds.selectExpr("c1 as `c.1`", "c2", "c3").coalesce(1).sortWithinPartitions("`c.1`"); + Dataset inputDF = + ds.selectExpr("c1 as `c.1`", "c2", "c3").coalesce(1).sortWithinPartitions("`c.1`"); sql("ALTER TABLE %s WRITE ORDERED BY `c.1`, c2", tableName); inputDF.writeTo(tableName).append(); - assertEquals("Row count must match", + assertEquals( + "Row count must match", ImmutableList.of(row(7L)), sql("SELECT count(*) FROM %s", tableName)); } diff --git a/spark/v3.2/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestRewriteDataFilesProcedure.java b/spark/v3.2/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestRewriteDataFilesProcedure.java index 9fdb0b1ed5e5..0a09b22f1a4d 100644 --- a/spark/v3.2/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestRewriteDataFilesProcedure.java +++ b/spark/v3.2/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestRewriteDataFilesProcedure.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.spark.extensions; import java.util.List; @@ -41,12 +40,12 @@ import org.junit.Assume; import org.junit.Test; - public class TestRewriteDataFilesProcedure extends SparkExtensionsTestBase { private static final String QUOTED_SPECIAL_CHARS_TABLE_NAME = "`table:with.special:chars`"; - public TestRewriteDataFilesProcedure(String catalogName, String implementation, Map config) { + public TestRewriteDataFilesProcedure( + String catalogName, String implementation, Map config) { super(catalogName, implementation, config); } @@ -58,20 +57,19 @@ public void removeTable() { @Test public void testZOrderSortExpression() { - List order = ExtendedParser.parseSortOrder(spark, "c1, zorder(c2, c3)"); + List order = + ExtendedParser.parseSortOrder(spark, "c1, zorder(c2, c3)"); Assert.assertEquals("Should parse 2 order fields", 2, order.size()); - Assert.assertEquals("First field should be a ref", "c1", ((NamedReference) order.get(0).term()).name()); + Assert.assertEquals( + "First field should be a ref", "c1", ((NamedReference) order.get(0).term()).name()); Assert.assertTrue("Second field should be zorder", order.get(1).term() instanceof Zorder); } @Test public void testRewriteDataFilesInEmptyTable() { createTable(); - List output = sql( - "CALL %s.system.rewrite_data_files('%s')", catalogName, tableIdent); - assertEquals("Procedure output must match", - ImmutableList.of(row(0, 0)), - output); + List output = sql("CALL %s.system.rewrite_data_files('%s')", catalogName, tableIdent); + assertEquals("Procedure output must match", ImmutableList.of(row(0, 0)), output); } @Test @@ -81,10 +79,11 @@ public void testRewriteDataFilesOnPartitionTable() { insertData(10); List expectedRecords = currentData(); - List output = sql( - "CALL %s.system.rewrite_data_files(table => '%s')", catalogName, tableIdent); + List output = + sql("CALL %s.system.rewrite_data_files(table => '%s')", catalogName, tableIdent); - assertEquals("Action should rewrite 10 data files and add 2 data files (one per partition) ", + assertEquals( + "Action should rewrite 10 data files and add 2 data files (one per partition) ", ImmutableList.of(row(10, 2)), output); @@ -99,10 +98,11 @@ public void testRewriteDataFilesOnNonPartitionTable() { insertData(10); List expectedRecords = currentData(); - List output = sql( - "CALL %s.system.rewrite_data_files(table => '%s')", catalogName, tableIdent); + List output = + sql("CALL %s.system.rewrite_data_files(table => '%s')", catalogName, tableIdent); - assertEquals("Action should rewrite 10 data files and add 1 data files", + assertEquals( + "Action should rewrite 10 data files and add 1 data files", ImmutableList.of(row(10, 1)), output); @@ -118,11 +118,13 @@ public void testRewriteDataFilesWithOptions() { List expectedRecords = currentData(); // set the min-input-files = 12, instead of default 5 to skip compacting the files. 
- List output = sql( - "CALL %s.system.rewrite_data_files(table => '%s', options => map('min-input-files','12'))", - catalogName, tableIdent); + List output = + sql( + "CALL %s.system.rewrite_data_files(table => '%s', options => map('min-input-files','12'))", + catalogName, tableIdent); - assertEquals("Action should rewrite 0 data files and add 0 data files", + assertEquals( + "Action should rewrite 0 data files and add 0 data files", ImmutableList.of(row(0, 0)), output); @@ -138,12 +140,14 @@ public void testRewriteDataFilesWithSortStrategy() { List expectedRecords = currentData(); // set sort_order = c1 DESC LAST - List output = sql( - "CALL %s.system.rewrite_data_files(table => '%s', " + - "strategy => 'sort', sort_order => 'c1 DESC NULLS LAST')", - catalogName, tableIdent); - - assertEquals("Action should rewrite 10 data files and add 1 data files", + List output = + sql( + "CALL %s.system.rewrite_data_files(table => '%s', " + + "strategy => 'sort', sort_order => 'c1 DESC NULLS LAST')", + catalogName, tableIdent); + + assertEquals( + "Action should rewrite 10 data files and add 1 data files", ImmutableList.of(row(10, 1)), output); @@ -158,29 +162,32 @@ public void testRewriteDataFilesWithZOrder() { insertData(10); // set z_order = c1,c2 - List output = sql( - "CALL %s.system.rewrite_data_files(table => '%s', " + - "strategy => 'sort', sort_order => 'zorder(c1,c2)')", - catalogName, tableIdent); - - assertEquals("Action should rewrite 10 data files and add 1 data files", + List output = + sql( + "CALL %s.system.rewrite_data_files(table => '%s', " + + "strategy => 'sort', sort_order => 'zorder(c1,c2)')", + catalogName, tableIdent); + + assertEquals( + "Action should rewrite 10 data files and add 1 data files", ImmutableList.of(row(10, 1)), output); // Due to Z_order, the data written will be in the below order. - // As there is only one small output file, we can validate the query ordering (as it will not change). - ImmutableList expectedRows = ImmutableList.of( - row(2, "bar", null), - row(2, "bar", null), - row(2, "bar", null), - row(2, "bar", null), - row(2, "bar", null), - row(1, "foo", null), - row(1, "foo", null), - row(1, "foo", null), - row(1, "foo", null), - row(1, "foo", null) - ); + // As there is only one small output file, we can validate the query ordering (as it will not + // change). 
+ ImmutableList expectedRows = + ImmutableList.of( + row(2, "bar", null), + row(2, "bar", null), + row(2, "bar", null), + row(2, "bar", null), + row(2, "bar", null), + row(1, "foo", null), + row(1, "foo", null), + row(1, "foo", null), + row(1, "foo", null), + row(1, "foo", null)); assertEquals("Should have expected rows", expectedRows, sql("SELECT * FROM %s", tableName)); } @@ -192,11 +199,14 @@ public void testRewriteDataFilesWithFilter() { List expectedRecords = currentData(); // select only 5 files for compaction (files that may have c1 = 1) - List output = sql( - "CALL %s.system.rewrite_data_files(table => '%s'," + - " where => 'c1 = 1 and c2 is not null')", catalogName, tableIdent); - - assertEquals("Action should rewrite 5 data files (containing c1 = 1) and add 1 data files", + List output = + sql( + "CALL %s.system.rewrite_data_files(table => '%s'," + + " where => 'c1 = 1 and c2 is not null')", + catalogName, tableIdent); + + assertEquals( + "Action should rewrite 5 data files (containing c1 = 1) and add 1 data files", ImmutableList.of(row(5, 1)), output); @@ -212,12 +222,14 @@ public void testRewriteDataFilesWithFilterOnPartitionTable() { List expectedRecords = currentData(); // select only 5 files for compaction (files in the partition c2 = 'bar') - List output = sql( - "CALL %s.system.rewrite_data_files(table => '%s'," + - " where => 'c2 = \"bar\"')", catalogName, tableIdent); - - assertEquals("Action should rewrite 5 data files from single matching partition" + - "(containing c2 = bar) and add 1 data files", + List output = + sql( + "CALL %s.system.rewrite_data_files(table => '%s'," + " where => 'c2 = \"bar\"')", + catalogName, tableIdent); + + assertEquals( + "Action should rewrite 5 data files from single matching partition" + + "(containing c2 = bar) and add 1 data files", ImmutableList.of(row(5, 1)), output); @@ -233,12 +245,14 @@ public void testRewriteDataFilesWithInFilterOnPartitionTable() { List expectedRecords = currentData(); // select only 5 files for compaction (files in the partition c2 in ('bar')) - List output = sql( - "CALL %s.system.rewrite_data_files(table => '%s'," + - " where => 'c2 in (\"bar\")')", catalogName, tableIdent); - - assertEquals("Action should rewrite 5 data files from single matching partition" + - "(containing c2 = bar) and add 1 data files", + List output = + sql( + "CALL %s.system.rewrite_data_files(table => '%s'," + " where => 'c2 in (\"bar\")')", + catalogName, tableIdent); + + assertEquals( + "Action should rewrite 5 data files from single matching partition" + + "(containing c2 = bar) and add 1 data files", ImmutableList.of(row(5, 1)), output); @@ -256,43 +270,56 @@ public void testRewriteDataFilesWithAllPossibleFilters() { // So that parsing can be tested on a same dataset without actually compacting the files. 
// EqualTo - sql("CALL %s.system.rewrite_data_files(table => '%s'," + - " where => 'c1 = 3')", catalogName, tableIdent); + sql( + "CALL %s.system.rewrite_data_files(table => '%s'," + " where => 'c1 = 3')", + catalogName, tableIdent); // GreaterThan - sql("CALL %s.system.rewrite_data_files(table => '%s'," + - " where => 'c1 > 3')", catalogName, tableIdent); + sql( + "CALL %s.system.rewrite_data_files(table => '%s'," + " where => 'c1 > 3')", + catalogName, tableIdent); // GreaterThanOrEqual - sql("CALL %s.system.rewrite_data_files(table => '%s'," + - " where => 'c1 >= 3')", catalogName, tableIdent); + sql( + "CALL %s.system.rewrite_data_files(table => '%s'," + " where => 'c1 >= 3')", + catalogName, tableIdent); // LessThan - sql("CALL %s.system.rewrite_data_files(table => '%s'," + - " where => 'c1 < 0')", catalogName, tableIdent); + sql( + "CALL %s.system.rewrite_data_files(table => '%s'," + " where => 'c1 < 0')", + catalogName, tableIdent); // LessThanOrEqual - sql("CALL %s.system.rewrite_data_files(table => '%s'," + - " where => 'c1 <= 0')", catalogName, tableIdent); + sql( + "CALL %s.system.rewrite_data_files(table => '%s'," + " where => 'c1 <= 0')", + catalogName, tableIdent); // In - sql("CALL %s.system.rewrite_data_files(table => '%s'," + - " where => 'c1 in (3,4,5)')", catalogName, tableIdent); + sql( + "CALL %s.system.rewrite_data_files(table => '%s'," + " where => 'c1 in (3,4,5)')", + catalogName, tableIdent); // IsNull - sql("CALL %s.system.rewrite_data_files(table => '%s'," + - " where => 'c1 is null')", catalogName, tableIdent); + sql( + "CALL %s.system.rewrite_data_files(table => '%s'," + " where => 'c1 is null')", + catalogName, tableIdent); // IsNotNull - sql("CALL %s.system.rewrite_data_files(table => '%s'," + - " where => 'c3 is not null')", catalogName, tableIdent); + sql( + "CALL %s.system.rewrite_data_files(table => '%s'," + " where => 'c3 is not null')", + catalogName, tableIdent); // And - sql("CALL %s.system.rewrite_data_files(table => '%s'," + - " where => 'c1 = 3 and c2 = \"bar\"')", catalogName, tableIdent); + sql( + "CALL %s.system.rewrite_data_files(table => '%s'," + " where => 'c1 = 3 and c2 = \"bar\"')", + catalogName, tableIdent); // Or - sql("CALL %s.system.rewrite_data_files(table => '%s'," + - " where => 'c1 = 3 or c1 = 5')", catalogName, tableIdent); + sql( + "CALL %s.system.rewrite_data_files(table => '%s'," + " where => 'c1 = 3 or c1 = 5')", + catalogName, tableIdent); // Not - sql("CALL %s.system.rewrite_data_files(table => '%s'," + - " where => 'c1 not in (1,2)')", catalogName, tableIdent); + sql( + "CALL %s.system.rewrite_data_files(table => '%s'," + " where => 'c1 not in (1,2)')", + catalogName, tableIdent); // StringStartsWith - sql("CALL %s.system.rewrite_data_files(table => '%s'," + - " where => 'c2 like \"%s\"')", catalogName, tableIdent, "car%"); + sql( + "CALL %s.system.rewrite_data_files(table => '%s'," + " where => 'c2 like \"%s\"')", + catalogName, tableIdent, "car%"); - // TODO: Enable when org.apache.iceberg.spark.SparkFilters have implementations for StringEndsWith & StringContains + // TODO: Enable when org.apache.iceberg.spark.SparkFilters have implementations for + // StringEndsWith & StringContains // StringEndsWith // sql("CALL %s.system.rewrite_data_files(table => '%s'," + // " where => 'c2 like \"%s\"')", catalogName, tableIdent, "%car"); @@ -308,77 +335,125 @@ public void testRewriteDataFilesWithInvalidInputs() { insertData(2); // Test for invalid strategy - AssertHelpers.assertThrows("Should reject calls with unsupported strategy 
error message", - IllegalArgumentException.class, "unsupported strategy: temp. Only binpack or sort is supported", - () -> sql("CALL %s.system.rewrite_data_files(table => '%s', options => map('min-input-files','2'), " + - "strategy => 'temp')", catalogName, tableIdent)); + AssertHelpers.assertThrows( + "Should reject calls with unsupported strategy error message", + IllegalArgumentException.class, + "unsupported strategy: temp. Only binpack or sort is supported", + () -> + sql( + "CALL %s.system.rewrite_data_files(table => '%s', options => map('min-input-files','2'), " + + "strategy => 'temp')", + catalogName, tableIdent)); // Test for sort_order with binpack strategy - AssertHelpers.assertThrows("Should reject calls with error message", - IllegalArgumentException.class, "Cannot set strategy to sort, it has already been set", - () -> sql("CALL %s.system.rewrite_data_files(table => '%s', strategy => 'binpack', " + - "sort_order => 'c1 ASC NULLS FIRST')", catalogName, tableIdent)); + AssertHelpers.assertThrows( + "Should reject calls with error message", + IllegalArgumentException.class, + "Cannot set strategy to sort, it has already been set", + () -> + sql( + "CALL %s.system.rewrite_data_files(table => '%s', strategy => 'binpack', " + + "sort_order => 'c1 ASC NULLS FIRST')", + catalogName, tableIdent)); // Test for sort_order with invalid null order - AssertHelpers.assertThrows("Should reject calls with error message", - IllegalArgumentException.class, "Unable to parse sortOrder:", - () -> sql("CALL %s.system.rewrite_data_files(table => '%s', strategy => 'sort', " + - "sort_order => 'c1 ASC none')", catalogName, tableIdent)); + AssertHelpers.assertThrows( + "Should reject calls with error message", + IllegalArgumentException.class, + "Unable to parse sortOrder:", + () -> + sql( + "CALL %s.system.rewrite_data_files(table => '%s', strategy => 'sort', " + + "sort_order => 'c1 ASC none')", + catalogName, tableIdent)); // Test for sort_order with invalid sort direction - AssertHelpers.assertThrows("Should reject calls with error message", - IllegalArgumentException.class, "Unable to parse sortOrder:", - () -> sql("CALL %s.system.rewrite_data_files(table => '%s', strategy => 'sort', " + - "sort_order => 'c1 none NULLS FIRST')", catalogName, tableIdent)); + AssertHelpers.assertThrows( + "Should reject calls with error message", + IllegalArgumentException.class, + "Unable to parse sortOrder:", + () -> + sql( + "CALL %s.system.rewrite_data_files(table => '%s', strategy => 'sort', " + + "sort_order => 'c1 none NULLS FIRST')", + catalogName, tableIdent)); // Test for sort_order with invalid column name - AssertHelpers.assertThrows("Should reject calls with error message", - ValidationException.class, "Cannot find field 'col1' in struct:" + - " struct<1: c1: optional int, 2: c2: optional string, 3: c3: optional string>", - () -> sql("CALL %s.system.rewrite_data_files(table => '%s', strategy => 'sort', " + - "sort_order => 'col1 DESC NULLS FIRST')", catalogName, tableIdent)); + AssertHelpers.assertThrows( + "Should reject calls with error message", + ValidationException.class, + "Cannot find field 'col1' in struct:" + + " struct<1: c1: optional int, 2: c2: optional string, 3: c3: optional string>", + () -> + sql( + "CALL %s.system.rewrite_data_files(table => '%s', strategy => 'sort', " + + "sort_order => 'col1 DESC NULLS FIRST')", + catalogName, tableIdent)); // Test for sort_order with invalid filter column col1 - AssertHelpers.assertThrows("Should reject calls with error message", - 
IllegalArgumentException.class, "Cannot parse predicates in where option: col1 = 3", - () -> sql("CALL %s.system.rewrite_data_files(table => '%s', " + - "where => 'col1 = 3')", catalogName, tableIdent)); + AssertHelpers.assertThrows( + "Should reject calls with error message", + IllegalArgumentException.class, + "Cannot parse predicates in where option: col1 = 3", + () -> + sql( + "CALL %s.system.rewrite_data_files(table => '%s', " + "where => 'col1 = 3')", + catalogName, tableIdent)); // Test for z_order with invalid column name - AssertHelpers.assertThrows("Should reject calls with error message", - IllegalArgumentException.class, "Cannot find column 'col1' in table schema: " + - "struct<1: c1: optional int, 2: c2: optional string, 3: c3: optional string>", - () -> sql("CALL %s.system.rewrite_data_files(table => '%s', strategy => 'sort', " + - "sort_order => 'zorder(col1)')", catalogName, tableIdent)); + AssertHelpers.assertThrows( + "Should reject calls with error message", + IllegalArgumentException.class, + "Cannot find column 'col1' in table schema: " + + "struct<1: c1: optional int, 2: c2: optional string, 3: c3: optional string>", + () -> + sql( + "CALL %s.system.rewrite_data_files(table => '%s', strategy => 'sort', " + + "sort_order => 'zorder(col1)')", + catalogName, tableIdent)); // Test for z_order with sort_order - AssertHelpers.assertThrows("Should reject calls with error message", - IllegalArgumentException.class, "Cannot mix identity sort columns and a Zorder sort expression:" + - " c1,zorder(c2,c3)", - () -> sql("CALL %s.system.rewrite_data_files(table => '%s', strategy => 'sort', " + - "sort_order => 'c1,zorder(c2,c3)')", catalogName, tableIdent)); + AssertHelpers.assertThrows( + "Should reject calls with error message", + IllegalArgumentException.class, + "Cannot mix identity sort columns and a Zorder sort expression:" + " c1,zorder(c2,c3)", + () -> + sql( + "CALL %s.system.rewrite_data_files(table => '%s', strategy => 'sort', " + + "sort_order => 'c1,zorder(c2,c3)')", + catalogName, tableIdent)); } @Test public void testInvalidCasesForRewriteDataFiles() { - AssertHelpers.assertThrows("Should not allow mixed args", - AnalysisException.class, "Named and positional arguments cannot be mixed", + AssertHelpers.assertThrows( + "Should not allow mixed args", + AnalysisException.class, + "Named and positional arguments cannot be mixed", () -> sql("CALL %s.system.rewrite_data_files('n', table => 't')", catalogName)); - AssertHelpers.assertThrows("Should not resolve procedures in arbitrary namespaces", - NoSuchProcedureException.class, "not found", + AssertHelpers.assertThrows( + "Should not resolve procedures in arbitrary namespaces", + NoSuchProcedureException.class, + "not found", () -> sql("CALL %s.custom.rewrite_data_files('n', 't')", catalogName)); - AssertHelpers.assertThrows("Should reject calls without all required args", - AnalysisException.class, "Missing required parameters", + AssertHelpers.assertThrows( + "Should reject calls without all required args", + AnalysisException.class, + "Missing required parameters", () -> sql("CALL %s.system.rewrite_data_files()", catalogName)); - AssertHelpers.assertThrows("Should reject duplicate arg names name", - AnalysisException.class, "Duplicate procedure argument: table", + AssertHelpers.assertThrows( + "Should reject duplicate arg names name", + AnalysisException.class, + "Duplicate procedure argument: table", () -> sql("CALL %s.system.rewrite_data_files(table => 't', table => 't')", catalogName)); - 
AssertHelpers.assertThrows("Should reject calls with empty table identifier", - IllegalArgumentException.class, "Cannot handle an empty identifier", + AssertHelpers.assertThrows( + "Should reject calls with empty table identifier", + IllegalArgumentException.class, + "Cannot handle an empty identifier", () -> sql("CALL %s.system.rewrite_data_files('')", catalogName)); } @@ -386,17 +461,21 @@ public void testInvalidCasesForRewriteDataFiles() { public void testBinPackTableWithSpecialChars() { Assume.assumeTrue(catalogName.equals(SparkCatalogConfig.HADOOP.catalogName())); - sql("CREATE TABLE %s (c1 int, c2 string, c3 string) USING iceberg", tableName(QUOTED_SPECIAL_CHARS_TABLE_NAME)); + sql( + "CREATE TABLE %s (c1 int, c2 string, c3 string) USING iceberg", + tableName(QUOTED_SPECIAL_CHARS_TABLE_NAME)); insertData(tableName(QUOTED_SPECIAL_CHARS_TABLE_NAME), 10); List expectedRecords = currentData(tableName(QUOTED_SPECIAL_CHARS_TABLE_NAME)); - List output = sql( - "CALL %s.system.rewrite_data_files(table => '%s', where => 'c2 is not null')", - catalogName, tableName(QUOTED_SPECIAL_CHARS_TABLE_NAME)); + List output = + sql( + "CALL %s.system.rewrite_data_files(table => '%s', where => 'c2 is not null')", + catalogName, tableName(QUOTED_SPECIAL_CHARS_TABLE_NAME)); - assertEquals("Action should rewrite 10 data files and add 1 data file", + assertEquals( + "Action should rewrite 10 data files and add 1 data file", ImmutableList.of(row(10, 1)), output); @@ -410,21 +489,25 @@ public void testBinPackTableWithSpecialChars() { public void testSortTableWithSpecialChars() { Assume.assumeTrue(catalogName.equals(SparkCatalogConfig.HADOOP.catalogName())); - sql("CREATE TABLE %s (c1 int, c2 string, c3 string) USING iceberg", tableName(QUOTED_SPECIAL_CHARS_TABLE_NAME)); + sql( + "CREATE TABLE %s (c1 int, c2 string, c3 string) USING iceberg", + tableName(QUOTED_SPECIAL_CHARS_TABLE_NAME)); insertData(tableName(QUOTED_SPECIAL_CHARS_TABLE_NAME), 10); List expectedRecords = currentData(tableName(QUOTED_SPECIAL_CHARS_TABLE_NAME)); - List output = sql( - "CALL %s.system.rewrite_data_files(" + - " table => '%s'," + - " strategy => 'sort'," + - " sort_order => 'c1'," + - " where => 'c2 is not null')", - catalogName, tableName(QUOTED_SPECIAL_CHARS_TABLE_NAME)); - - assertEquals("Action should rewrite 10 data files and add 1 data file", + List output = + sql( + "CALL %s.system.rewrite_data_files(" + + " table => '%s'," + + " strategy => 'sort'," + + " sort_order => 'c1'," + + " where => 'c2 is not null')", + catalogName, tableName(QUOTED_SPECIAL_CHARS_TABLE_NAME)); + + assertEquals( + "Action should rewrite 10 data files and add 1 data file", ImmutableList.of(row(10, 1)), output); @@ -438,21 +521,25 @@ public void testSortTableWithSpecialChars() { public void testZOrderTableWithSpecialChars() { Assume.assumeTrue(catalogName.equals(SparkCatalogConfig.HADOOP.catalogName())); - sql("CREATE TABLE %s (c1 int, c2 string, c3 string) USING iceberg", tableName(QUOTED_SPECIAL_CHARS_TABLE_NAME)); + sql( + "CREATE TABLE %s (c1 int, c2 string, c3 string) USING iceberg", + tableName(QUOTED_SPECIAL_CHARS_TABLE_NAME)); insertData(tableName(QUOTED_SPECIAL_CHARS_TABLE_NAME), 10); List expectedRecords = currentData(tableName(QUOTED_SPECIAL_CHARS_TABLE_NAME)); - List output = sql( - "CALL %s.system.rewrite_data_files(" + - " table => '%s'," + - " strategy => 'sort'," + - " sort_order => 'zorder(c1, c2)'," + - " where => 'c2 is not null')", - catalogName, tableName(QUOTED_SPECIAL_CHARS_TABLE_NAME)); - - assertEquals("Action should rewrite 10 
data files and add 1 data file", + List output = + sql( + "CALL %s.system.rewrite_data_files(" + + " table => '%s'," + + " strategy => 'sort'," + + " sort_order => 'zorder(c1, c2)'," + + " where => 'c2 is not null')", + catalogName, tableName(QUOTED_SPECIAL_CHARS_TABLE_NAME)); + + assertEquals( + "Action should rewrite 10 data files and add 1 data file", ImmutableList.of(row(10, 1)), output); @@ -467,7 +554,9 @@ private void createTable() { } private void createPartitionTable() { - sql("CREATE TABLE %s (c1 int, c2 string, c3 string) USING iceberg PARTITIONED BY (c2)", tableName); + sql( + "CREATE TABLE %s (c1 int, c2 string, c3 string) USING iceberg PARTITIONED BY (c2)", + tableName); } private void insertData(int filesCount) { @@ -479,12 +568,15 @@ private void insertData(String table, int filesCount) { ThreeColumnRecord record2 = new ThreeColumnRecord(2, "bar", null); List records = Lists.newArrayList(); - IntStream.range(0, filesCount / 2).forEach(i -> { - records.add(record1); - records.add(record2); - }); - - Dataset df = spark.createDataFrame(records, ThreeColumnRecord.class).repartition(filesCount); + IntStream.range(0, filesCount / 2) + .forEach( + i -> { + records.add(record1); + records.add(record2); + }); + + Dataset df = + spark.createDataFrame(records, ThreeColumnRecord.class).repartition(filesCount); try { df.writeTo(table).append(); } catch (org.apache.spark.sql.catalyst.analysis.NoSuchTableException e) { diff --git a/spark/v3.2/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestRewriteManifestsProcedure.java b/spark/v3.2/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestRewriteManifestsProcedure.java index dcf0a2d91e3e..7c5ec1f5cf3f 100644 --- a/spark/v3.2/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestRewriteManifestsProcedure.java +++ b/spark/v3.2/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestRewriteManifestsProcedure.java @@ -16,9 +16,10 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.spark.extensions; +import static org.apache.iceberg.TableProperties.SNAPSHOT_ID_INHERITANCE_ENABLED; + import java.util.List; import java.util.Map; import org.apache.iceberg.AssertHelpers; @@ -30,11 +31,10 @@ import org.junit.Assert; import org.junit.Test; -import static org.apache.iceberg.TableProperties.SNAPSHOT_ID_INHERITANCE_ENABLED; - public class TestRewriteManifestsProcedure extends SparkExtensionsTestBase { - public TestRewriteManifestsProcedure(String catalogName, String implementation, Map config) { + public TestRewriteManifestsProcedure( + String catalogName, String implementation, Map config) { super(catalogName, implementation, config); } @@ -46,40 +46,42 @@ public void removeTable() { @Test public void testRewriteManifestsInEmptyTable() { sql("CREATE TABLE %s (id bigint NOT NULL, data string) USING iceberg", tableName); - List output = sql( - "CALL %s.system.rewrite_manifests('%s')", catalogName, tableIdent); - assertEquals("Procedure output must match", - ImmutableList.of(row(0, 0)), - output); + List output = sql("CALL %s.system.rewrite_manifests('%s')", catalogName, tableIdent); + assertEquals("Procedure output must match", ImmutableList.of(row(0, 0)), output); } @Test public void testRewriteLargeManifests() { - sql("CREATE TABLE %s (id bigint NOT NULL, data string) USING iceberg PARTITIONED BY (data)", tableName); + sql( + "CREATE TABLE %s (id bigint NOT NULL, data string) USING iceberg PARTITIONED BY (data)", + tableName); sql("INSERT INTO TABLE %s VALUES (1, 'a'), (2, 'b'), (3, 'c'), (4, 'd')", tableName); Table table = validationCatalog.loadTable(tableIdent); - Assert.assertEquals("Must have 1 manifest", 1, table.currentSnapshot().allManifests(table.io()).size()); + Assert.assertEquals( + "Must have 1 manifest", 1, table.currentSnapshot().allManifests(table.io()).size()); sql("ALTER TABLE %s SET TBLPROPERTIES ('commit.manifest.target-size-bytes' '1')", tableName); - List output = sql( - "CALL %s.system.rewrite_manifests('%s')", catalogName, tableIdent); - assertEquals("Procedure output must match", - ImmutableList.of(row(1, 4)), - output); + List output = sql("CALL %s.system.rewrite_manifests('%s')", catalogName, tableIdent); + assertEquals("Procedure output must match", ImmutableList.of(row(1, 4)), output); table.refresh(); - Assert.assertEquals("Must have 4 manifests", 4, table.currentSnapshot().allManifests(table.io()).size()); + Assert.assertEquals( + "Must have 4 manifests", 4, table.currentSnapshot().allManifests(table.io()).size()); } @Test public void testRewriteSmallManifestsWithSnapshotIdInheritance() { - sql("CREATE TABLE %s (id bigint NOT NULL, data string) USING iceberg PARTITIONED BY (data)", tableName); + sql( + "CREATE TABLE %s (id bigint NOT NULL, data string) USING iceberg PARTITIONED BY (data)", + tableName); - sql("ALTER TABLE %s SET TBLPROPERTIES ('%s' '%s')", tableName, SNAPSHOT_ID_INHERITANCE_ENABLED, "true"); + sql( + "ALTER TABLE %s SET TBLPROPERTIES ('%s' '%s')", + tableName, SNAPSHOT_ID_INHERITANCE_ENABLED, "true"); sql("INSERT INTO TABLE %s VALUES (1, 'a')", tableName); sql("INSERT INTO TABLE %s VALUES (2, 'b')", tableName); @@ -88,87 +90,107 @@ public void testRewriteSmallManifestsWithSnapshotIdInheritance() { Table table = validationCatalog.loadTable(tableIdent); - Assert.assertEquals("Must have 4 manifest", 4, table.currentSnapshot().allManifests(table.io()).size()); + Assert.assertEquals( + "Must have 4 manifest", 4, table.currentSnapshot().allManifests(table.io()).size()); - List output = sql( - "CALL 
%s.system.rewrite_manifests(table => '%s')", catalogName, tableIdent); - assertEquals("Procedure output must match", - ImmutableList.of(row(4, 1)), - output); + List output = + sql("CALL %s.system.rewrite_manifests(table => '%s')", catalogName, tableIdent); + assertEquals("Procedure output must match", ImmutableList.of(row(4, 1)), output); table.refresh(); - Assert.assertEquals("Must have 1 manifests", 1, table.currentSnapshot().allManifests(table.io()).size()); + Assert.assertEquals( + "Must have 1 manifests", 1, table.currentSnapshot().allManifests(table.io()).size()); } @Test public void testRewriteSmallManifestsWithoutCaching() { - sql("CREATE TABLE %s (id bigint NOT NULL, data string) USING iceberg PARTITIONED BY (data)", tableName); + sql( + "CREATE TABLE %s (id bigint NOT NULL, data string) USING iceberg PARTITIONED BY (data)", + tableName); sql("INSERT INTO TABLE %s VALUES (1, 'a')", tableName); sql("INSERT INTO TABLE %s VALUES (2, 'b')", tableName); Table table = validationCatalog.loadTable(tableIdent); - Assert.assertEquals("Must have 2 manifest", 2, table.currentSnapshot().allManifests(table.io()).size()); + Assert.assertEquals( + "Must have 2 manifest", 2, table.currentSnapshot().allManifests(table.io()).size()); - List output = sql( - "CALL %s.system.rewrite_manifests(use_caching => false, table => '%s')", catalogName, tableIdent); - assertEquals("Procedure output must match", - ImmutableList.of(row(2, 1)), - output); + List output = + sql( + "CALL %s.system.rewrite_manifests(use_caching => false, table => '%s')", + catalogName, tableIdent); + assertEquals("Procedure output must match", ImmutableList.of(row(2, 1)), output); table.refresh(); - Assert.assertEquals("Must have 1 manifests", 1, table.currentSnapshot().allManifests(table.io()).size()); + Assert.assertEquals( + "Must have 1 manifests", 1, table.currentSnapshot().allManifests(table.io()).size()); } @Test public void testRewriteManifestsCaseInsensitiveArgs() { - sql("CREATE TABLE %s (id bigint NOT NULL, data string) USING iceberg PARTITIONED BY (data)", tableName); + sql( + "CREATE TABLE %s (id bigint NOT NULL, data string) USING iceberg PARTITIONED BY (data)", + tableName); sql("INSERT INTO TABLE %s VALUES (1, 'a')", tableName); sql("INSERT INTO TABLE %s VALUES (2, 'b')", tableName); Table table = validationCatalog.loadTable(tableIdent); - Assert.assertEquals("Must have 2 manifest", 2, table.currentSnapshot().allManifests(table.io()).size()); + Assert.assertEquals( + "Must have 2 manifest", 2, table.currentSnapshot().allManifests(table.io()).size()); - List output = sql( - "CALL %s.system.rewrite_manifests(usE_cAcHiNg => false, tAbLe => '%s')", catalogName, tableIdent); - assertEquals("Procedure output must match", - ImmutableList.of(row(2, 1)), - output); + List output = + sql( + "CALL %s.system.rewrite_manifests(usE_cAcHiNg => false, tAbLe => '%s')", + catalogName, tableIdent); + assertEquals("Procedure output must match", ImmutableList.of(row(2, 1)), output); table.refresh(); - Assert.assertEquals("Must have 1 manifests", 1, table.currentSnapshot().allManifests(table.io()).size()); + Assert.assertEquals( + "Must have 1 manifests", 1, table.currentSnapshot().allManifests(table.io()).size()); } @Test public void testInvalidRewriteManifestsCases() { - AssertHelpers.assertThrows("Should not allow mixed args", - AnalysisException.class, "Named and positional arguments cannot be mixed", + AssertHelpers.assertThrows( + "Should not allow mixed args", + AnalysisException.class, + "Named and positional arguments cannot be 
mixed", () -> sql("CALL %s.system.rewrite_manifests('n', table => 't')", catalogName)); - AssertHelpers.assertThrows("Should not resolve procedures in arbitrary namespaces", - NoSuchProcedureException.class, "not found", + AssertHelpers.assertThrows( + "Should not resolve procedures in arbitrary namespaces", + NoSuchProcedureException.class, + "not found", () -> sql("CALL %s.custom.rewrite_manifests('n', 't')", catalogName)); - AssertHelpers.assertThrows("Should reject calls without all required args", - AnalysisException.class, "Missing required parameters", + AssertHelpers.assertThrows( + "Should reject calls without all required args", + AnalysisException.class, + "Missing required parameters", () -> sql("CALL %s.system.rewrite_manifests()", catalogName)); - AssertHelpers.assertThrows("Should reject calls with invalid arg types", - AnalysisException.class, "Wrong arg type", + AssertHelpers.assertThrows( + "Should reject calls with invalid arg types", + AnalysisException.class, + "Wrong arg type", () -> sql("CALL %s.system.rewrite_manifests('n', 2.2)", catalogName)); - AssertHelpers.assertThrows("Should reject duplicate arg names name", - AnalysisException.class, "Duplicate procedure argument: table", + AssertHelpers.assertThrows( + "Should reject duplicate arg names name", + AnalysisException.class, + "Duplicate procedure argument: table", () -> sql("CALL %s.system.rewrite_manifests(table => 't', tAbLe => 't')", catalogName)); - AssertHelpers.assertThrows("Should reject calls with empty table identifier", - IllegalArgumentException.class, "Cannot handle an empty identifier", + AssertHelpers.assertThrows( + "Should reject calls with empty table identifier", + IllegalArgumentException.class, + "Cannot handle an empty identifier", () -> sql("CALL %s.system.rewrite_manifests('')", catalogName)); } } diff --git a/spark/v3.2/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestRollbackToSnapshotProcedure.java b/spark/v3.2/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestRollbackToSnapshotProcedure.java index d3e6bdcbc285..af94b456d02e 100644 --- a/spark/v3.2/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestRollbackToSnapshotProcedure.java +++ b/spark/v3.2/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestRollbackToSnapshotProcedure.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.spark.extensions; import java.util.List; @@ -36,7 +35,8 @@ public class TestRollbackToSnapshotProcedure extends SparkExtensionsTestBase { - public TestRollbackToSnapshotProcedure(String catalogName, String implementation, Map config) { + public TestRollbackToSnapshotProcedure( + String catalogName, String implementation, Map config) { super(catalogName, implementation, config); } @@ -55,7 +55,8 @@ public void testRollbackToSnapshotUsingPositionalArgs() { sql("INSERT INTO TABLE %s VALUES (1, 'a')", tableName); - assertEquals("Should have expected rows", + assertEquals( + "Should have expected rows", ImmutableList.of(row(1L, "a"), row(1L, "a")), sql("SELECT * FROM %s ORDER BY id", tableName)); @@ -63,15 +64,18 @@ public void testRollbackToSnapshotUsingPositionalArgs() { Snapshot secondSnapshot = table.currentSnapshot(); - List output = sql( - "CALL %s.system.rollback_to_snapshot('%s', %dL)", - catalogName, tableIdent, firstSnapshot.snapshotId()); + List output = + sql( + "CALL %s.system.rollback_to_snapshot('%s', %dL)", + catalogName, tableIdent, firstSnapshot.snapshotId()); - assertEquals("Procedure output must match", + assertEquals( + "Procedure output must match", ImmutableList.of(row(secondSnapshot.snapshotId(), firstSnapshot.snapshotId())), output); - assertEquals("Rollback must be successful", + assertEquals( + "Rollback must be successful", ImmutableList.of(row(1L, "a")), sql("SELECT * FROM %s ORDER BY id", tableName)); } @@ -86,7 +90,8 @@ public void testRollbackToSnapshotUsingNamedArgs() { sql("INSERT INTO TABLE %s VALUES (1, 'a')", tableName); - assertEquals("Should have expected rows", + assertEquals( + "Should have expected rows", ImmutableList.of(row(1L, "a"), row(1L, "a")), sql("SELECT * FROM %s ORDER BY id", tableName)); @@ -94,15 +99,18 @@ public void testRollbackToSnapshotUsingNamedArgs() { Snapshot secondSnapshot = table.currentSnapshot(); - List output = sql( - "CALL %s.system.rollback_to_snapshot(snapshot_id => %dL, table => '%s')", - catalogName, firstSnapshot.snapshotId(), tableIdent); + List output = + sql( + "CALL %s.system.rollback_to_snapshot(snapshot_id => %dL, table => '%s')", + catalogName, firstSnapshot.snapshotId(), tableIdent); - assertEquals("Procedure output must match", + assertEquals( + "Procedure output must match", ImmutableList.of(row(secondSnapshot.snapshotId(), firstSnapshot.snapshotId())), output); - assertEquals("Rollback must be successful", + assertEquals( + "Rollback must be successful", ImmutableList.of(row(1L, "a")), sql("SELECT * FROM %s ORDER BY id", tableName)); } @@ -126,21 +134,23 @@ public void testRollbackToSnapshotRefreshesRelationCache() { spark.sql("CACHE TABLE tmp"); - assertEquals("View should have expected rows", + assertEquals( + "View should have expected rows", ImmutableList.of(row(1L, "a"), row(1L, "a")), sql("SELECT * FROM tmp")); - List output = sql( - "CALL %s.system.rollback_to_snapshot(table => '%s', snapshot_id => %dL)", - catalogName, tableIdent, firstSnapshot.snapshotId()); + List output = + sql( + "CALL %s.system.rollback_to_snapshot(table => '%s', snapshot_id => %dL)", + catalogName, tableIdent, firstSnapshot.snapshotId()); - assertEquals("Procedure output must match", + assertEquals( + "Procedure output must match", ImmutableList.of(row(secondSnapshot.snapshotId(), firstSnapshot.snapshotId())), output); - assertEquals("View cache must be invalidated", - ImmutableList.of(row(1L, "a")), - sql("SELECT * FROM tmp")); + assertEquals( + "View cache must be invalidated", 
ImmutableList.of(row(1L, "a")), sql("SELECT * FROM tmp")); sql("UNCACHE TABLE tmp"); } @@ -155,7 +165,8 @@ public void testRollbackToSnapshotWithQuotedIdentifiers() { sql("INSERT INTO TABLE %s VALUES (1, 'a')", tableName); - assertEquals("Should have expected rows", + assertEquals( + "Should have expected rows", ImmutableList.of(row(1L, "a"), row(1L, "a")), sql("SELECT * FROM %s ORDER BY id", tableName)); @@ -171,15 +182,20 @@ public void testRollbackToSnapshotWithQuotedIdentifiers() { } String quotedNamespace = quotedNamespaceBuilder.toString(); - List output = sql( - "CALL %s.system.rollback_to_snapshot('%s', %d)", - catalogName, quotedNamespace + ".`" + tableIdent.name() + "`", firstSnapshot.snapshotId()); + List output = + sql( + "CALL %s.system.rollback_to_snapshot('%s', %d)", + catalogName, + quotedNamespace + ".`" + tableIdent.name() + "`", + firstSnapshot.snapshotId()); - assertEquals("Procedure output must match", + assertEquals( + "Procedure output must match", ImmutableList.of(row(secondSnapshot.snapshotId(), firstSnapshot.snapshotId())), output); - assertEquals("Rollback must be successful", + assertEquals( + "Rollback must be successful", ImmutableList.of(row(1L, "a")), sql("SELECT * FROM %s ORDER BY id", tableName)); } @@ -196,7 +212,8 @@ public void testRollbackToSnapshotWithoutExplicitCatalog() { sql("INSERT INTO TABLE %s VALUES (1, 'a')", tableName); - assertEquals("Should have expected rows", + assertEquals( + "Should have expected rows", ImmutableList.of(row(1L, "a"), row(1L, "a")), sql("SELECT * FROM %s ORDER BY id", tableName)); @@ -205,15 +222,16 @@ public void testRollbackToSnapshotWithoutExplicitCatalog() { Snapshot secondSnapshot = table.currentSnapshot(); // use camel case intentionally to test case sensitivity - List output = sql( - "CALL SyStEm.rOLlBaCk_to_SnApShOt('%s', %dL)", - tableIdent, firstSnapshot.snapshotId()); + List output = + sql("CALL SyStEm.rOLlBaCk_to_SnApShOt('%s', %dL)", tableIdent, firstSnapshot.snapshotId()); - assertEquals("Procedure output must match", + assertEquals( + "Procedure output must match", ImmutableList.of(row(secondSnapshot.snapshotId(), firstSnapshot.snapshotId())), output); - assertEquals("Rollback must be successful", + assertEquals( + "Rollback must be successful", ImmutableList.of(row(1L, "a")), sql("SELECT * FROM %s ORDER BY id", tableName)); } @@ -222,39 +240,58 @@ public void testRollbackToSnapshotWithoutExplicitCatalog() { public void testRollbackToInvalidSnapshot() { sql("CREATE TABLE %s (id bigint NOT NULL, data string) USING iceberg", tableName); - AssertHelpers.assertThrows("Should reject invalid snapshot id", - ValidationException.class, "Cannot roll back to unknown snapshot id", + AssertHelpers.assertThrows( + "Should reject invalid snapshot id", + ValidationException.class, + "Cannot roll back to unknown snapshot id", () -> sql("CALL %s.system.rollback_to_snapshot('%s', -1L)", catalogName, tableIdent)); } @Test public void testInvalidRollbackToSnapshotCases() { - AssertHelpers.assertThrows("Should not allow mixed args", - AnalysisException.class, "Named and positional arguments cannot be mixed", - () -> sql("CALL %s.system.rollback_to_snapshot(namespace => 'n1', table => 't', 1L)", catalogName)); - - AssertHelpers.assertThrows("Should not resolve procedures in arbitrary namespaces", - NoSuchProcedureException.class, "not found", + AssertHelpers.assertThrows( + "Should not allow mixed args", + AnalysisException.class, + "Named and positional arguments cannot be mixed", + () -> + sql( + "CALL 
%s.system.rollback_to_snapshot(namespace => 'n1', table => 't', 1L)", + catalogName)); + + AssertHelpers.assertThrows( + "Should not resolve procedures in arbitrary namespaces", + NoSuchProcedureException.class, + "not found", () -> sql("CALL %s.custom.rollback_to_snapshot('n', 't', 1L)", catalogName)); - AssertHelpers.assertThrows("Should reject calls without all required args", - AnalysisException.class, "Missing required parameters", + AssertHelpers.assertThrows( + "Should reject calls without all required args", + AnalysisException.class, + "Missing required parameters", () -> sql("CALL %s.system.rollback_to_snapshot('t')", catalogName)); - AssertHelpers.assertThrows("Should reject calls without all required args", - AnalysisException.class, "Missing required parameters", + AssertHelpers.assertThrows( + "Should reject calls without all required args", + AnalysisException.class, + "Missing required parameters", () -> sql("CALL %s.system.rollback_to_snapshot(1L)", catalogName)); - AssertHelpers.assertThrows("Should reject calls without all required args", - AnalysisException.class, "Missing required parameters", + AssertHelpers.assertThrows( + "Should reject calls without all required args", + AnalysisException.class, + "Missing required parameters", () -> sql("CALL %s.system.rollback_to_snapshot(table => 't')", catalogName)); - AssertHelpers.assertThrows("Should reject calls with invalid arg types", - AnalysisException.class, "Wrong arg type for snapshot_id: cannot cast", + AssertHelpers.assertThrows( + "Should reject calls with invalid arg types", + AnalysisException.class, + "Wrong arg type for snapshot_id: cannot cast", () -> sql("CALL %s.system.rollback_to_snapshot('t', 2.2)", catalogName)); - AssertHelpers.assertThrows("Should reject calls with empty table identifier", - IllegalArgumentException.class, "Cannot handle an empty identifier", + AssertHelpers.assertThrows( + "Should reject calls with empty table identifier", + IllegalArgumentException.class, + "Cannot handle an empty identifier", () -> sql("CALL %s.system.rollback_to_snapshot('', 1L)", catalogName)); } } diff --git a/spark/v3.2/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestRollbackToTimestampProcedure.java b/spark/v3.2/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestRollbackToTimestampProcedure.java index 52fc12c7d01e..6da3853bbe24 100644 --- a/spark/v3.2/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestRollbackToTimestampProcedure.java +++ b/spark/v3.2/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestRollbackToTimestampProcedure.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.spark.extensions; import java.time.LocalDateTime; @@ -36,7 +35,8 @@ public class TestRollbackToTimestampProcedure extends SparkExtensionsTestBase { - public TestRollbackToTimestampProcedure(String catalogName, String implementation, Map config) { + public TestRollbackToTimestampProcedure( + String catalogName, String implementation, Map config) { super(catalogName, implementation, config); } @@ -58,7 +58,8 @@ public void testRollbackToTimestampUsingPositionalArgs() { sql("INSERT INTO TABLE %s VALUES (1, 'a')", tableName); - assertEquals("Should have expected rows", + assertEquals( + "Should have expected rows", ImmutableList.of(row(1L, "a"), row(1L, "a")), sql("SELECT * FROM %s ORDER BY id", tableName)); @@ -66,15 +67,18 @@ public void testRollbackToTimestampUsingPositionalArgs() { Snapshot secondSnapshot = table.currentSnapshot(); - List output = sql( - "CALL %s.system.rollback_to_timestamp('%s',TIMESTAMP '%s')", - catalogName, tableIdent, firstSnapshotTimestamp); + List output = + sql( + "CALL %s.system.rollback_to_timestamp('%s',TIMESTAMP '%s')", + catalogName, tableIdent, firstSnapshotTimestamp); - assertEquals("Procedure output must match", + assertEquals( + "Procedure output must match", ImmutableList.of(row(secondSnapshot.snapshotId(), firstSnapshot.snapshotId())), output); - assertEquals("Rollback must be successful", + assertEquals( + "Rollback must be successful", ImmutableList.of(row(1L, "a")), sql("SELECT * FROM %s ORDER BY id", tableName)); } @@ -92,7 +96,8 @@ public void testRollbackToTimestampUsingNamedArgs() { sql("INSERT INTO TABLE %s VALUES (1, 'a')", tableName); - assertEquals("Should have expected rows", + assertEquals( + "Should have expected rows", ImmutableList.of(row(1L, "a"), row(1L, "a")), sql("SELECT * FROM %s ORDER BY id", tableName)); @@ -100,15 +105,18 @@ public void testRollbackToTimestampUsingNamedArgs() { Snapshot secondSnapshot = table.currentSnapshot(); - List output = sql( - "CALL %s.system.rollback_to_timestamp(timestamp => TIMESTAMP '%s', table => '%s')", - catalogName, firstSnapshotTimestamp, tableIdent); + List output = + sql( + "CALL %s.system.rollback_to_timestamp(timestamp => TIMESTAMP '%s', table => '%s')", + catalogName, firstSnapshotTimestamp, tableIdent); - assertEquals("Procedure output must match", + assertEquals( + "Procedure output must match", ImmutableList.of(row(secondSnapshot.snapshotId(), firstSnapshot.snapshotId())), output); - assertEquals("Rollback must be successful", + assertEquals( + "Rollback must be successful", ImmutableList.of(row(1L, "a")), sql("SELECT * FROM %s ORDER BY id", tableName)); } @@ -135,21 +143,23 @@ public void testRollbackToTimestampRefreshesRelationCache() { spark.sql("CACHE TABLE tmp"); - assertEquals("View should have expected rows", + assertEquals( + "View should have expected rows", ImmutableList.of(row(1L, "a"), row(1L, "a")), sql("SELECT * FROM tmp")); - List output = sql( - "CALL %s.system.rollback_to_timestamp(table => '%s', timestamp => TIMESTAMP '%s')", - catalogName, tableIdent, firstSnapshotTimestamp); + List output = + sql( + "CALL %s.system.rollback_to_timestamp(table => '%s', timestamp => TIMESTAMP '%s')", + catalogName, tableIdent, firstSnapshotTimestamp); - assertEquals("Procedure output must match", + assertEquals( + "Procedure output must match", ImmutableList.of(row(secondSnapshot.snapshotId(), firstSnapshot.snapshotId())), output); - assertEquals("View cache must be invalidated", - ImmutableList.of(row(1L, "a")), - sql("SELECT * FROM tmp")); + assertEquals( 
+ "View cache must be invalidated", ImmutableList.of(row(1L, "a")), sql("SELECT * FROM tmp")); sql("UNCACHE TABLE tmp"); } @@ -167,7 +177,8 @@ public void testRollbackToTimestampWithQuotedIdentifiers() { sql("INSERT INTO TABLE %s VALUES (1, 'a')", tableName); - assertEquals("Should have expected rows", + assertEquals( + "Should have expected rows", ImmutableList.of(row(1L, "a"), row(1L, "a")), sql("SELECT * FROM %s ORDER BY id", tableName)); @@ -183,15 +194,18 @@ public void testRollbackToTimestampWithQuotedIdentifiers() { } String quotedNamespace = quotedNamespaceBuilder.toString(); - List output = sql( - "CALL %s.system.rollback_to_timestamp('%s', TIMESTAMP '%s')", - catalogName, quotedNamespace + ".`" + tableIdent.name() + "`", firstSnapshotTimestamp); + List output = + sql( + "CALL %s.system.rollback_to_timestamp('%s', TIMESTAMP '%s')", + catalogName, quotedNamespace + ".`" + tableIdent.name() + "`", firstSnapshotTimestamp); - assertEquals("Procedure output must match", + assertEquals( + "Procedure output must match", ImmutableList.of(row(secondSnapshot.snapshotId(), firstSnapshot.snapshotId())), output); - assertEquals("Rollback must be successful", + assertEquals( + "Rollback must be successful", ImmutableList.of(row(1L, "a")), sql("SELECT * FROM %s ORDER BY id", tableName)); } @@ -211,7 +225,8 @@ public void testRollbackToTimestampWithoutExplicitCatalog() { sql("INSERT INTO TABLE %s VALUES (1, 'a')", tableName); - assertEquals("Should have expected rows", + assertEquals( + "Should have expected rows", ImmutableList.of(row(1L, "a"), row(1L, "a")), sql("SELECT * FROM %s ORDER BY id", tableName)); @@ -220,15 +235,18 @@ public void testRollbackToTimestampWithoutExplicitCatalog() { Snapshot secondSnapshot = table.currentSnapshot(); // use camel case intentionally to test case sensitivity - List output = sql( - "CALL SyStEm.rOLlBaCk_to_TiMeStaMp('%s', TIMESTAMP '%s')", - tableIdent, firstSnapshotTimestamp); + List output = + sql( + "CALL SyStEm.rOLlBaCk_to_TiMeStaMp('%s', TIMESTAMP '%s')", + tableIdent, firstSnapshotTimestamp); - assertEquals("Procedure output must match", + assertEquals( + "Procedure output must match", ImmutableList.of(row(secondSnapshot.snapshotId(), firstSnapshot.snapshotId())), output); - assertEquals("Rollback must be successful", + assertEquals( + "Rollback must be successful", ImmutableList.of(row(1L, "a")), sql("SELECT * FROM %s ORDER BY id", tableName)); } @@ -237,32 +255,50 @@ public void testRollbackToTimestampWithoutExplicitCatalog() { public void testInvalidRollbackToTimestampCases() { String timestamp = "TIMESTAMP '2007-12-03T10:15:30'"; - AssertHelpers.assertThrows("Should not allow mixed args", - AnalysisException.class, "Named and positional arguments cannot be mixed", - () -> sql("CALL %s.system.rollback_to_timestamp(namespace => 'n1', 't', %s)", catalogName, timestamp)); - - AssertHelpers.assertThrows("Should not resolve procedures in arbitrary namespaces", - NoSuchProcedureException.class, "not found", + AssertHelpers.assertThrows( + "Should not allow mixed args", + AnalysisException.class, + "Named and positional arguments cannot be mixed", + () -> + sql( + "CALL %s.system.rollback_to_timestamp(namespace => 'n1', 't', %s)", + catalogName, timestamp)); + + AssertHelpers.assertThrows( + "Should not resolve procedures in arbitrary namespaces", + NoSuchProcedureException.class, + "not found", () -> sql("CALL %s.custom.rollback_to_timestamp('n', 't', %s)", catalogName, timestamp)); - AssertHelpers.assertThrows("Should reject calls without all required 
args", - AnalysisException.class, "Missing required parameters", + AssertHelpers.assertThrows( + "Should reject calls without all required args", + AnalysisException.class, + "Missing required parameters", () -> sql("CALL %s.system.rollback_to_timestamp('t')", catalogName)); - AssertHelpers.assertThrows("Should reject calls without all required args", - AnalysisException.class, "Missing required parameters", + AssertHelpers.assertThrows( + "Should reject calls without all required args", + AnalysisException.class, + "Missing required parameters", () -> sql("CALL %s.system.rollback_to_timestamp(timestamp => %s)", catalogName, timestamp)); - AssertHelpers.assertThrows("Should reject calls without all required args", - AnalysisException.class, "Missing required parameters", + AssertHelpers.assertThrows( + "Should reject calls without all required args", + AnalysisException.class, + "Missing required parameters", () -> sql("CALL %s.system.rollback_to_timestamp(table => 't')", catalogName)); - AssertHelpers.assertThrows("Should reject calls with extra args", - AnalysisException.class, "Too many arguments", - () -> sql("CALL %s.system.rollback_to_timestamp('n', 't', %s, 1L)", catalogName, timestamp)); - - AssertHelpers.assertThrows("Should reject calls with invalid arg types", - AnalysisException.class, "Wrong arg type for timestamp: cannot cast", + AssertHelpers.assertThrows( + "Should reject calls with extra args", + AnalysisException.class, + "Too many arguments", + () -> + sql("CALL %s.system.rollback_to_timestamp('n', 't', %s, 1L)", catalogName, timestamp)); + + AssertHelpers.assertThrows( + "Should reject calls with invalid arg types", + AnalysisException.class, + "Wrong arg type for timestamp: cannot cast", () -> sql("CALL %s.system.rollback_to_timestamp('t', 2.2)", catalogName)); } } diff --git a/spark/v3.2/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestSetCurrentSnapshotProcedure.java b/spark/v3.2/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestSetCurrentSnapshotProcedure.java index 0ea8c4861e8c..8a8a974bbebe 100644 --- a/spark/v3.2/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestSetCurrentSnapshotProcedure.java +++ b/spark/v3.2/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestSetCurrentSnapshotProcedure.java @@ -16,9 +16,10 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.spark.extensions; +import static org.apache.iceberg.TableProperties.WRITE_AUDIT_PUBLISH_ENABLED; + import java.util.List; import java.util.Map; import org.apache.iceberg.AssertHelpers; @@ -34,11 +35,10 @@ import org.junit.Assume; import org.junit.Test; -import static org.apache.iceberg.TableProperties.WRITE_AUDIT_PUBLISH_ENABLED; - public class TestSetCurrentSnapshotProcedure extends SparkExtensionsTestBase { - public TestSetCurrentSnapshotProcedure(String catalogName, String implementation, Map config) { + public TestSetCurrentSnapshotProcedure( + String catalogName, String implementation, Map config) { super(catalogName, implementation, config); } @@ -57,7 +57,8 @@ public void testSetCurrentSnapshotUsingPositionalArgs() { sql("INSERT INTO TABLE %s VALUES (1, 'a')", tableName); - assertEquals("Should have expected rows", + assertEquals( + "Should have expected rows", ImmutableList.of(row(1L, "a"), row(1L, "a")), sql("SELECT * FROM %s ORDER BY id", tableName)); @@ -65,15 +66,18 @@ public void testSetCurrentSnapshotUsingPositionalArgs() { Snapshot secondSnapshot = table.currentSnapshot(); - List output = sql( - "CALL %s.system.set_current_snapshot('%s', %dL)", - catalogName, tableIdent, firstSnapshot.snapshotId()); + List output = + sql( + "CALL %s.system.set_current_snapshot('%s', %dL)", + catalogName, tableIdent, firstSnapshot.snapshotId()); - assertEquals("Procedure output must match", + assertEquals( + "Procedure output must match", ImmutableList.of(row(secondSnapshot.snapshotId(), firstSnapshot.snapshotId())), output); - assertEquals("Set must be successful", + assertEquals( + "Set must be successful", ImmutableList.of(row(1L, "a")), sql("SELECT * FROM %s ORDER BY id", tableName)); } @@ -88,7 +92,8 @@ public void testSetCurrentSnapshotUsingNamedArgs() { sql("INSERT INTO TABLE %s VALUES (1, 'a')", tableName); - assertEquals("Should have expected rows", + assertEquals( + "Should have expected rows", ImmutableList.of(row(1L, "a"), row(1L, "a")), sql("SELECT * FROM %s ORDER BY id", tableName)); @@ -96,15 +101,18 @@ public void testSetCurrentSnapshotUsingNamedArgs() { Snapshot secondSnapshot = table.currentSnapshot(); - List output = sql( - "CALL %s.system.set_current_snapshot(snapshot_id => %dL, table => '%s')", - catalogName, firstSnapshot.snapshotId(), tableIdent); + List output = + sql( + "CALL %s.system.set_current_snapshot(snapshot_id => %dL, table => '%s')", + catalogName, firstSnapshot.snapshotId(), tableIdent); - assertEquals("Procedure output must match", + assertEquals( + "Procedure output must match", ImmutableList.of(row(secondSnapshot.snapshotId(), firstSnapshot.snapshotId())), output); - assertEquals("Set must be successful", + assertEquals( + "Set must be successful", ImmutableList.of(row(1L, "a")), sql("SELECT * FROM %s ORDER BY id", tableName)); } @@ -118,22 +126,26 @@ public void testSetCurrentSnapshotWap() { sql("INSERT INTO TABLE %s VALUES (1, 'a')", tableName); - assertEquals("Should not see rows from staged snapshot", + assertEquals( + "Should not see rows from staged snapshot", ImmutableList.of(), sql("SELECT * FROM %s", tableName)); Table table = validationCatalog.loadTable(tableIdent); Snapshot wapSnapshot = Iterables.getOnlyElement(table.snapshots()); - List output = sql( - "CALL %s.system.set_current_snapshot(table => '%s', snapshot_id => %dL)", - catalogName, tableIdent, wapSnapshot.snapshotId()); + List output = + sql( + "CALL %s.system.set_current_snapshot(table => '%s', snapshot_id => %dL)", + catalogName, tableIdent, 
wapSnapshot.snapshotId()); - assertEquals("Procedure output must match", + assertEquals( + "Procedure output must match", ImmutableList.of(row(null, wapSnapshot.snapshotId())), output); - assertEquals("Current snapshot must be set correctly", + assertEquals( + "Current snapshot must be set correctly", ImmutableList.of(row(1L, "a")), sql("SELECT * FROM %s", tableName)); } @@ -150,7 +162,8 @@ public void tesSetCurrentSnapshotWithoutExplicitCatalog() { sql("INSERT INTO TABLE %s VALUES (1, 'a')", tableName); - assertEquals("Should have expected rows", + assertEquals( + "Should have expected rows", ImmutableList.of(row(1L, "a"), row(1L, "a")), sql("SELECT * FROM %s ORDER BY id", tableName)); @@ -159,15 +172,16 @@ public void tesSetCurrentSnapshotWithoutExplicitCatalog() { Snapshot secondSnapshot = table.currentSnapshot(); // use camel case intentionally to test case sensitivity - List output = sql( - "CALL SyStEm.sEt_cuRrEnT_sNaPsHot('%s', %dL)", - tableIdent, firstSnapshot.snapshotId()); + List output = + sql("CALL SyStEm.sEt_cuRrEnT_sNaPsHot('%s', %dL)", tableIdent, firstSnapshot.snapshotId()); - assertEquals("Procedure output must match", + assertEquals( + "Procedure output must match", ImmutableList.of(row(secondSnapshot.snapshotId(), firstSnapshot.snapshotId())), output); - assertEquals("Set must be successful", + assertEquals( + "Set must be successful", ImmutableList.of(row(1L, "a")), sql("SELECT * FROM %s ORDER BY id", tableName)); } @@ -179,43 +193,64 @@ public void testSetCurrentSnapshotToInvalidSnapshot() { Namespace namespace = tableIdent.namespace(); String tableName = tableIdent.name(); - AssertHelpers.assertThrows("Should reject invalid snapshot id", - ValidationException.class, "Cannot roll back to unknown snapshot id", + AssertHelpers.assertThrows( + "Should reject invalid snapshot id", + ValidationException.class, + "Cannot roll back to unknown snapshot id", () -> sql("CALL %s.system.set_current_snapshot('%s', -1L)", catalogName, tableIdent)); } @Test public void testInvalidRollbackToSnapshotCases() { - AssertHelpers.assertThrows("Should not allow mixed args", - AnalysisException.class, "Named and positional arguments cannot be mixed", - () -> sql("CALL %s.system.set_current_snapshot(namespace => 'n1', table => 't', 1L)", catalogName)); - - AssertHelpers.assertThrows("Should not resolve procedures in arbitrary namespaces", - NoSuchProcedureException.class, "not found", + AssertHelpers.assertThrows( + "Should not allow mixed args", + AnalysisException.class, + "Named and positional arguments cannot be mixed", + () -> + sql( + "CALL %s.system.set_current_snapshot(namespace => 'n1', table => 't', 1L)", + catalogName)); + + AssertHelpers.assertThrows( + "Should not resolve procedures in arbitrary namespaces", + NoSuchProcedureException.class, + "not found", () -> sql("CALL %s.custom.set_current_snapshot('n', 't', 1L)", catalogName)); - AssertHelpers.assertThrows("Should reject calls without all required args", - AnalysisException.class, "Missing required parameters", + AssertHelpers.assertThrows( + "Should reject calls without all required args", + AnalysisException.class, + "Missing required parameters", () -> sql("CALL %s.system.set_current_snapshot('t')", catalogName)); - AssertHelpers.assertThrows("Should reject calls without all required args", - AnalysisException.class, "Missing required parameters", + AssertHelpers.assertThrows( + "Should reject calls without all required args", + AnalysisException.class, + "Missing required parameters", () -> sql("CALL 
%s.system.set_current_snapshot(1L)", catalogName)); - AssertHelpers.assertThrows("Should reject calls without all required args", - AnalysisException.class, "Missing required parameters", + AssertHelpers.assertThrows( + "Should reject calls without all required args", + AnalysisException.class, + "Missing required parameters", () -> sql("CALL %s.system.set_current_snapshot(snapshot_id => 1L)", catalogName)); - AssertHelpers.assertThrows("Should reject calls without all required args", - AnalysisException.class, "Missing required parameters", + AssertHelpers.assertThrows( + "Should reject calls without all required args", + AnalysisException.class, + "Missing required parameters", () -> sql("CALL %s.system.set_current_snapshot(table => 't')", catalogName)); - AssertHelpers.assertThrows("Should reject calls with invalid arg types", - AnalysisException.class, "Wrong arg type for snapshot_id: cannot cast", + AssertHelpers.assertThrows( + "Should reject calls with invalid arg types", + AnalysisException.class, + "Wrong arg type for snapshot_id: cannot cast", () -> sql("CALL %s.system.set_current_snapshot('t', 2.2)", catalogName)); - AssertHelpers.assertThrows("Should reject calls with empty table identifier", - IllegalArgumentException.class, "Cannot handle an empty identifier", + AssertHelpers.assertThrows( + "Should reject calls with empty table identifier", + IllegalArgumentException.class, + "Cannot handle an empty identifier", () -> sql("CALL %s.system.set_current_snapshot('', 1L)", catalogName)); } } diff --git a/spark/v3.2/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestSetWriteDistributionAndOrdering.java b/spark/v3.2/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestSetWriteDistributionAndOrdering.java index 473278d25068..e7e52806792d 100644 --- a/spark/v3.2/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestSetWriteDistributionAndOrdering.java +++ b/spark/v3.2/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestSetWriteDistributionAndOrdering.java @@ -16,9 +16,10 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.spark.extensions; +import static org.apache.iceberg.expressions.Expressions.bucket; + import java.util.Map; import org.apache.iceberg.NullOrder; import org.apache.iceberg.SortOrder; @@ -28,10 +29,9 @@ import org.junit.Assert; import org.junit.Test; -import static org.apache.iceberg.expressions.Expressions.bucket; - public class TestSetWriteDistributionAndOrdering extends SparkExtensionsTestBase { - public TestSetWriteDistributionAndOrdering(String catalogName, String implementation, Map config) { + public TestSetWriteDistributionAndOrdering( + String catalogName, String implementation, Map config) { super(catalogName, implementation, config); } @@ -42,7 +42,9 @@ public void removeTable() { @Test public void testSetWriteOrderByColumn() { - sql("CREATE TABLE %s (id bigint NOT NULL, category string, ts timestamp, data string) USING iceberg", tableName); + sql( + "CREATE TABLE %s (id bigint NOT NULL, category string, ts timestamp, data string) USING iceberg", + tableName); Table table = validationCatalog.loadTable(tableIdent); Assert.assertTrue("Table should start unsorted", table.sortOrder().isUnsorted()); @@ -53,17 +55,20 @@ public void testSetWriteOrderByColumn() { String distributionMode = table.properties().get(TableProperties.WRITE_DISTRIBUTION_MODE); Assert.assertEquals("Distribution mode must match", "range", distributionMode); - SortOrder expected = SortOrder.builderFor(table.schema()) - .withOrderId(1) - .asc("category", NullOrder.NULLS_FIRST) - .asc("id", NullOrder.NULLS_FIRST) - .build(); + SortOrder expected = + SortOrder.builderFor(table.schema()) + .withOrderId(1) + .asc("category", NullOrder.NULLS_FIRST) + .asc("id", NullOrder.NULLS_FIRST) + .build(); Assert.assertEquals("Should have expected order", expected, table.sortOrder()); } @Test public void testSetWriteOrderByColumnWithDirection() { - sql("CREATE TABLE %s (id bigint NOT NULL, category string, ts timestamp, data string) USING iceberg", tableName); + sql( + "CREATE TABLE %s (id bigint NOT NULL, category string, ts timestamp, data string) USING iceberg", + tableName); Table table = validationCatalog.loadTable(tableIdent); Assert.assertTrue("Table should start unsorted", table.sortOrder().isUnsorted()); @@ -74,17 +79,20 @@ public void testSetWriteOrderByColumnWithDirection() { String distributionMode = table.properties().get(TableProperties.WRITE_DISTRIBUTION_MODE); Assert.assertEquals("Distribution mode must match", "range", distributionMode); - SortOrder expected = SortOrder.builderFor(table.schema()) - .withOrderId(1) - .asc("category", NullOrder.NULLS_FIRST) - .desc("id", NullOrder.NULLS_LAST) - .build(); + SortOrder expected = + SortOrder.builderFor(table.schema()) + .withOrderId(1) + .asc("category", NullOrder.NULLS_FIRST) + .desc("id", NullOrder.NULLS_LAST) + .build(); Assert.assertEquals("Should have expected order", expected, table.sortOrder()); } @Test public void testSetWriteOrderByColumnWithDirectionAndNullOrder() { - sql("CREATE TABLE %s (id bigint NOT NULL, category string, ts timestamp, data string) USING iceberg", tableName); + sql( + "CREATE TABLE %s (id bigint NOT NULL, category string, ts timestamp, data string) USING iceberg", + tableName); Table table = validationCatalog.loadTable(tableIdent); Assert.assertTrue("Table should start unsorted", table.sortOrder().isUnsorted()); @@ -95,17 +103,20 @@ public void testSetWriteOrderByColumnWithDirectionAndNullOrder() { String distributionMode = table.properties().get(TableProperties.WRITE_DISTRIBUTION_MODE); 
Assert.assertEquals("Distribution mode must match", "range", distributionMode); - SortOrder expected = SortOrder.builderFor(table.schema()) - .withOrderId(1) - .asc("category", NullOrder.NULLS_LAST) - .desc("id", NullOrder.NULLS_FIRST) - .build(); + SortOrder expected = + SortOrder.builderFor(table.schema()) + .withOrderId(1) + .asc("category", NullOrder.NULLS_LAST) + .desc("id", NullOrder.NULLS_FIRST) + .build(); Assert.assertEquals("Should have expected order", expected, table.sortOrder()); } @Test public void testSetWriteOrderByTransform() { - sql("CREATE TABLE %s (id bigint NOT NULL, category string, ts timestamp, data string) USING iceberg", tableName); + sql( + "CREATE TABLE %s (id bigint NOT NULL, category string, ts timestamp, data string) USING iceberg", + tableName); Table table = validationCatalog.loadTable(tableIdent); Assert.assertTrue("Table should start unsorted", table.sortOrder().isUnsorted()); @@ -116,18 +127,21 @@ public void testSetWriteOrderByTransform() { String distributionMode = table.properties().get(TableProperties.WRITE_DISTRIBUTION_MODE); Assert.assertEquals("Distribution mode must match", "range", distributionMode); - SortOrder expected = SortOrder.builderFor(table.schema()) - .withOrderId(1) - .desc("category") - .asc(bucket("id", 16)) - .asc("id") - .build(); + SortOrder expected = + SortOrder.builderFor(table.schema()) + .withOrderId(1) + .desc("category") + .asc(bucket("id", 16)) + .asc("id") + .build(); Assert.assertEquals("Should have expected order", expected, table.sortOrder()); } @Test public void testSetWriteUnordered() { - sql("CREATE TABLE %s (id bigint NOT NULL, category string, ts timestamp, data string) USING iceberg", tableName); + sql( + "CREATE TABLE %s (id bigint NOT NULL, category string, ts timestamp, data string) USING iceberg", + tableName); Table table = validationCatalog.loadTable(tableIdent); Assert.assertTrue("Table should start unsorted", table.sortOrder().isUnsorted()); @@ -152,7 +166,9 @@ public void testSetWriteUnordered() { @Test public void testSetWriteLocallyOrdered() { - sql("CREATE TABLE %s (id bigint NOT NULL, category string, ts timestamp, data string) USING iceberg", tableName); + sql( + "CREATE TABLE %s (id bigint NOT NULL, category string, ts timestamp, data string) USING iceberg", + tableName); Table table = validationCatalog.loadTable(tableIdent); Assert.assertTrue("Table should start unsorted", table.sortOrder().isUnsorted()); @@ -163,18 +179,21 @@ public void testSetWriteLocallyOrdered() { String distributionMode = table.properties().get(TableProperties.WRITE_DISTRIBUTION_MODE); Assert.assertEquals("Distribution mode must match", "none", distributionMode); - SortOrder expected = SortOrder.builderFor(table.schema()) - .withOrderId(1) - .desc("category") - .asc(bucket("id", 16)) - .asc("id") - .build(); + SortOrder expected = + SortOrder.builderFor(table.schema()) + .withOrderId(1) + .desc("category") + .asc(bucket("id", 16)) + .asc("id") + .build(); Assert.assertEquals("Sort order must match", expected, table.sortOrder()); } @Test public void testSetWriteDistributedByWithSort() { - sql("CREATE TABLE %s (id bigint NOT NULL, category string) USING iceberg PARTITIONED BY (category)", tableName); + sql( + "CREATE TABLE %s (id bigint NOT NULL, category string) USING iceberg PARTITIONED BY (category)", + tableName); Table table = validationCatalog.loadTable(tableIdent); Assert.assertTrue("Table should start unsorted", table.sortOrder().isUnsorted()); @@ -185,16 +204,15 @@ public void testSetWriteDistributedByWithSort() { 
String distributionMode = table.properties().get(TableProperties.WRITE_DISTRIBUTION_MODE); Assert.assertEquals("Distribution mode must match", "hash", distributionMode); - SortOrder expected = SortOrder.builderFor(table.schema()) - .withOrderId(1) - .asc("id") - .build(); + SortOrder expected = SortOrder.builderFor(table.schema()).withOrderId(1).asc("id").build(); Assert.assertEquals("Sort order must match", expected, table.sortOrder()); } @Test public void testSetWriteDistributedByWithLocalSort() { - sql("CREATE TABLE %s (id bigint NOT NULL, category string) USING iceberg PARTITIONED BY (category)", tableName); + sql( + "CREATE TABLE %s (id bigint NOT NULL, category string) USING iceberg PARTITIONED BY (category)", + tableName); Table table = validationCatalog.loadTable(tableIdent); Assert.assertTrue("Table should start unsorted", table.sortOrder().isUnsorted()); @@ -205,16 +223,15 @@ public void testSetWriteDistributedByWithLocalSort() { String distributionMode = table.properties().get(TableProperties.WRITE_DISTRIBUTION_MODE); Assert.assertEquals("Distribution mode must match", "hash", distributionMode); - SortOrder expected = SortOrder.builderFor(table.schema()) - .withOrderId(1) - .asc("id") - .build(); + SortOrder expected = SortOrder.builderFor(table.schema()).withOrderId(1).asc("id").build(); Assert.assertEquals("Sort order must match", expected, table.sortOrder()); } @Test public void testSetWriteDistributedByAndUnordered() { - sql("CREATE TABLE %s (id bigint NOT NULL, category string) USING iceberg PARTITIONED BY (category)", tableName); + sql( + "CREATE TABLE %s (id bigint NOT NULL, category string) USING iceberg PARTITIONED BY (category)", + tableName); Table table = validationCatalog.loadTable(tableIdent); Assert.assertTrue("Table should start unsorted", table.sortOrder().isUnsorted()); @@ -230,7 +247,9 @@ public void testSetWriteDistributedByAndUnordered() { @Test public void testSetWriteDistributedByOnly() { - sql("CREATE TABLE %s (id bigint NOT NULL, category string) USING iceberg PARTITIONED BY (category)", tableName); + sql( + "CREATE TABLE %s (id bigint NOT NULL, category string) USING iceberg PARTITIONED BY (category)", + tableName); Table table = validationCatalog.loadTable(tableIdent); Assert.assertTrue("Table should start unsorted", table.sortOrder().isUnsorted()); @@ -246,7 +265,9 @@ public void testSetWriteDistributedByOnly() { @Test public void testSetWriteDistributedAndUnorderedInverted() { - sql("CREATE TABLE %s (id bigint NOT NULL, category string) USING iceberg PARTITIONED BY (category)", tableName); + sql( + "CREATE TABLE %s (id bigint NOT NULL, category string) USING iceberg PARTITIONED BY (category)", + tableName); Table table = validationCatalog.loadTable(tableIdent); Assert.assertTrue("Table should start unsorted", table.sortOrder().isUnsorted()); @@ -262,7 +283,9 @@ public void testSetWriteDistributedAndUnorderedInverted() { @Test public void testSetWriteDistributedAndLocallyOrderedInverted() { - sql("CREATE TABLE %s (id bigint NOT NULL, category string) USING iceberg PARTITIONED BY (category)", tableName); + sql( + "CREATE TABLE %s (id bigint NOT NULL, category string) USING iceberg PARTITIONED BY (category)", + tableName); Table table = validationCatalog.loadTable(tableIdent); Assert.assertTrue("Table should start unsorted", table.sortOrder().isUnsorted()); @@ -273,10 +296,7 @@ public void testSetWriteDistributedAndLocallyOrderedInverted() { String distributionMode = table.properties().get(TableProperties.WRITE_DISTRIBUTION_MODE); 
Assert.assertEquals("Distribution mode must match", "hash", distributionMode); - SortOrder expected = SortOrder.builderFor(table.schema()) - .withOrderId(1) - .asc("id") - .build(); + SortOrder expected = SortOrder.builderFor(table.schema()).withOrderId(1).asc("id").build(); Assert.assertEquals("Sort order must match", expected, table.sortOrder()); } } diff --git a/spark/v3.2/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestSnapshotTableProcedure.java b/spark/v3.2/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestSnapshotTableProcedure.java index 66fa8e80c515..d8e918d8aadd 100644 --- a/spark/v3.2/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestSnapshotTableProcedure.java +++ b/spark/v3.2/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestSnapshotTableProcedure.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.extensions; import java.io.IOException; @@ -37,12 +36,12 @@ public class TestSnapshotTableProcedure extends SparkExtensionsTestBase { private static final String sourceName = "spark_catalog.default.source"; // Currently we can only Snapshot only out of the Spark Session Catalog - public TestSnapshotTableProcedure(String catalogName, String implementation, Map config) { + public TestSnapshotTableProcedure( + String catalogName, String implementation, Map config) { super(catalogName, implementation, config); } - @Rule - public TemporaryFolder temp = new TemporaryFolder(); + @Rule public TemporaryFolder temp = new TemporaryFolder(); @After public void removeTables() { @@ -53,9 +52,12 @@ public void removeTables() { @Test public void testSnapshot() throws IOException { String location = temp.newFolder().toString(); - sql("CREATE TABLE %s (id bigint NOT NULL, data string) USING parquet LOCATION '%s'", sourceName, location); + sql( + "CREATE TABLE %s (id bigint NOT NULL, data string) USING parquet LOCATION '%s'", + sourceName, location); sql("INSERT INTO TABLE %s VALUES (1, 'a')", sourceName); - Object result = scalarSql("CALL %s.system.snapshot('%s', '%s')", catalogName, sourceName, tableName); + Object result = + scalarSql("CALL %s.system.snapshot('%s', '%s')", catalogName, sourceName, tableName); Assert.assertEquals("Should have added one file", 1L, result); @@ -65,7 +67,8 @@ public void testSnapshot() throws IOException { sql("INSERT INTO TABLE %s VALUES (1, 'a')", tableName); - assertEquals("Should have expected rows", + assertEquals( + "Should have expected rows", ImmutableList.of(row(1L, "a"), row(1L, "a")), sql("SELECT * FROM %s ORDER BY id", tableName)); } @@ -73,11 +76,14 @@ public void testSnapshot() throws IOException { @Test public void testSnapshotWithProperties() throws IOException { String location = temp.newFolder().toString(); - sql("CREATE TABLE %s (id bigint NOT NULL, data string) USING parquet LOCATION '%s'", sourceName, location); + sql( + "CREATE TABLE %s (id bigint NOT NULL, data string) USING parquet LOCATION '%s'", + sourceName, location); sql("INSERT INTO TABLE %s VALUES (1, 'a')", sourceName); - Object result = scalarSql( - "CALL %s.system.snapshot(source_table => '%s', table => '%s', properties => map('foo','bar'))", - catalogName, sourceName, tableName); + Object result = + scalarSql( + "CALL %s.system.snapshot(source_table => '%s', table => '%s', properties => map('foo','bar'))", + catalogName, sourceName, tableName); Assert.assertEquals("Should have added one file", 1L, result); @@ 
-91,30 +97,39 @@ public void testSnapshotWithProperties() throws IOException { sql("INSERT INTO TABLE %s VALUES (1, 'a')", tableName); - assertEquals("Should have expected rows", + assertEquals( + "Should have expected rows", ImmutableList.of(row(1L, "a"), row(1L, "a")), sql("SELECT * FROM %s ORDER BY id", tableName)); } @Test public void testSnapshotWithAlternateLocation() throws IOException { - Assume.assumeTrue("No Snapshoting with Alternate locations with Hadoop Catalogs", !catalogName.contains("hadoop")); + Assume.assumeTrue( + "No Snapshoting with Alternate locations with Hadoop Catalogs", + !catalogName.contains("hadoop")); String location = temp.newFolder().toString(); String snapshotLocation = temp.newFolder().toString(); - sql("CREATE TABLE %s (id bigint NOT NULL, data string) USING parquet LOCATION '%s'", sourceName, location); + sql( + "CREATE TABLE %s (id bigint NOT NULL, data string) USING parquet LOCATION '%s'", + sourceName, location); sql("INSERT INTO TABLE %s VALUES (1, 'a')", sourceName); - Object[] result = sql( - "CALL %s.system.snapshot(source_table => '%s', table => '%s', location => '%s')", - catalogName, sourceName, tableName, snapshotLocation).get(0); + Object[] result = + sql( + "CALL %s.system.snapshot(source_table => '%s', table => '%s', location => '%s')", + catalogName, sourceName, tableName, snapshotLocation) + .get(0); Assert.assertEquals("Should have added one file", 1L, result[0]); String storageLocation = validationCatalog.loadTable(tableIdent).location(); - Assert.assertEquals("Snapshot should be made at specified location", snapshotLocation, storageLocation); + Assert.assertEquals( + "Snapshot should be made at specified location", snapshotLocation, storageLocation); sql("INSERT INTO TABLE %s VALUES (1, 'a')", tableName); - assertEquals("Should have expected rows", + assertEquals( + "Should have expected rows", ImmutableList.of(row(1L, "a"), row(1L, "a")), sql("SELECT * FROM %s ORDER BY id", tableName)); } @@ -122,19 +137,24 @@ public void testSnapshotWithAlternateLocation() throws IOException { @Test public void testDropTable() throws IOException { String location = temp.newFolder().toString(); - sql("CREATE TABLE %s (id bigint NOT NULL, data string) USING parquet LOCATION '%s'", sourceName, location); + sql( + "CREATE TABLE %s (id bigint NOT NULL, data string) USING parquet LOCATION '%s'", + sourceName, location); sql("INSERT INTO TABLE %s VALUES (1, 'a')", sourceName); - Object result = scalarSql("CALL %s.system.snapshot('%s', '%s')", catalogName, sourceName, tableName); + Object result = + scalarSql("CALL %s.system.snapshot('%s', '%s')", catalogName, sourceName, tableName); Assert.assertEquals("Should have added one file", 1L, result); - assertEquals("Should have expected rows", + assertEquals( + "Should have expected rows", ImmutableList.of(row(1L, "a")), sql("SELECT * FROM %s", tableName)); sql("DROP TABLE %s", tableName); - assertEquals("Source table should be intact", + assertEquals( + "Source table should be intact", ImmutableList.of(row(1L, "a")), sql("SELECT * FROM %s", sourceName)); } @@ -142,50 +162,70 @@ public void testDropTable() throws IOException { @Test public void testSnapshotWithConflictingProps() throws IOException { String location = temp.newFolder().toString(); - sql("CREATE TABLE %s (id bigint NOT NULL, data string) USING parquet LOCATION '%s'", sourceName, location); + sql( + "CREATE TABLE %s (id bigint NOT NULL, data string) USING parquet LOCATION '%s'", + sourceName, location); sql("INSERT INTO TABLE %s VALUES (1, 'a')", 
sourceName); - Object result = scalarSql( - "CALL %s.system.snapshot(" + - "source_table => '%s'," + - "table => '%s'," + - "properties => map('%s', 'true', 'snapshot', 'false'))", - catalogName, sourceName, tableName, TableProperties.GC_ENABLED); + Object result = + scalarSql( + "CALL %s.system.snapshot(" + + "source_table => '%s'," + + "table => '%s'," + + "properties => map('%s', 'true', 'snapshot', 'false'))", + catalogName, sourceName, tableName, TableProperties.GC_ENABLED); Assert.assertEquals("Should have added one file", 1L, result); - assertEquals("Should have expected rows", + assertEquals( + "Should have expected rows", ImmutableList.of(row(1L, "a")), sql("SELECT * FROM %s", tableName)); Table table = validationCatalog.loadTable(tableIdent); Map props = table.properties(); Assert.assertEquals("Should override user value", "true", props.get("snapshot")); - Assert.assertEquals("Should override user value", "false", props.get(TableProperties.GC_ENABLED)); + Assert.assertEquals( + "Should override user value", "false", props.get(TableProperties.GC_ENABLED)); } @Test public void testInvalidSnapshotsCases() throws IOException { String location = temp.newFolder().toString(); - sql("CREATE TABLE %s (id bigint NOT NULL, data string) USING parquet LOCATION '%s'", sourceName, location); - - AssertHelpers.assertThrows("Should reject calls without all required args", - AnalysisException.class, "Missing required parameters", + sql( + "CREATE TABLE %s (id bigint NOT NULL, data string) USING parquet LOCATION '%s'", + sourceName, location); + + AssertHelpers.assertThrows( + "Should reject calls without all required args", + AnalysisException.class, + "Missing required parameters", () -> sql("CALL %s.system.snapshot('foo')", catalogName)); - AssertHelpers.assertThrows("Should reject calls with invalid arg types", - AnalysisException.class, "Wrong arg type", + AssertHelpers.assertThrows( + "Should reject calls with invalid arg types", + AnalysisException.class, + "Wrong arg type", () -> sql("CALL %s.system.snapshot('n', 't', map('foo', 'bar'))", catalogName)); - AssertHelpers.assertThrows("Should reject calls with invalid map args", - AnalysisException.class, "cannot resolve 'map", - () -> sql("CALL %s.system.snapshot('%s', 'fable', 'loc', map(2, 1, 1))", catalogName, sourceName)); - - AssertHelpers.assertThrows("Should reject calls with empty table identifier", - IllegalArgumentException.class, "Cannot handle an empty identifier", + AssertHelpers.assertThrows( + "Should reject calls with invalid map args", + AnalysisException.class, + "cannot resolve 'map", + () -> + sql( + "CALL %s.system.snapshot('%s', 'fable', 'loc', map(2, 1, 1))", + catalogName, sourceName)); + + AssertHelpers.assertThrows( + "Should reject calls with empty table identifier", + IllegalArgumentException.class, + "Cannot handle an empty identifier", () -> sql("CALL %s.system.snapshot('', 'dest')", catalogName)); - AssertHelpers.assertThrows("Should reject calls with empty table identifier", - IllegalArgumentException.class, "Cannot handle an empty identifier", + AssertHelpers.assertThrows( + "Should reject calls with empty table identifier", + IllegalArgumentException.class, + "Cannot handle an empty identifier", () -> sql("CALL %s.system.snapshot('src', '')", catalogName)); } } diff --git a/spark/v3.2/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestUpdate.java b/spark/v3.2/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestUpdate.java index 7871e02c5b02..cdaf1c336012 100644 --- 
a/spark/v3.2/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestUpdate.java +++ b/spark/v3.2/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestUpdate.java @@ -16,9 +16,20 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.extensions; +import static org.apache.iceberg.DataOperations.OVERWRITE; +import static org.apache.iceberg.RowLevelOperationMode.COPY_ON_WRITE; +import static org.apache.iceberg.SnapshotSummary.ADDED_FILES_PROP; +import static org.apache.iceberg.SnapshotSummary.CHANGED_PARTITION_COUNT_PROP; +import static org.apache.iceberg.SnapshotSummary.DELETED_FILES_PROP; +import static org.apache.iceberg.TableProperties.PARQUET_ROW_GROUP_SIZE_BYTES; +import static org.apache.iceberg.TableProperties.SPLIT_SIZE; +import static org.apache.iceberg.TableProperties.UPDATE_ISOLATION_LEVEL; +import static org.apache.iceberg.TableProperties.UPDATE_MODE; +import static org.apache.iceberg.TableProperties.UPDATE_MODE_DEFAULT; +import static org.apache.spark.sql.functions.lit; + import java.util.Arrays; import java.util.List; import java.util.Map; @@ -57,22 +68,15 @@ import org.junit.BeforeClass; import org.junit.Test; -import static org.apache.iceberg.DataOperations.OVERWRITE; -import static org.apache.iceberg.RowLevelOperationMode.COPY_ON_WRITE; -import static org.apache.iceberg.SnapshotSummary.ADDED_FILES_PROP; -import static org.apache.iceberg.SnapshotSummary.CHANGED_PARTITION_COUNT_PROP; -import static org.apache.iceberg.SnapshotSummary.DELETED_FILES_PROP; -import static org.apache.iceberg.TableProperties.PARQUET_ROW_GROUP_SIZE_BYTES; -import static org.apache.iceberg.TableProperties.SPLIT_SIZE; -import static org.apache.iceberg.TableProperties.UPDATE_ISOLATION_LEVEL; -import static org.apache.iceberg.TableProperties.UPDATE_MODE; -import static org.apache.iceberg.TableProperties.UPDATE_MODE_DEFAULT; -import static org.apache.spark.sql.functions.lit; - public abstract class TestUpdate extends SparkRowLevelOperationsTestBase { - public TestUpdate(String catalogName, String implementation, Map config, - String fileFormat, boolean vectorized, String distributionMode) { + public TestUpdate( + String catalogName, + String implementation, + Map config, + String fileFormat, + boolean vectorized, + String distributionMode) { super(catalogName, implementation, config, fileFormat, vectorized, distributionMode); } @@ -102,7 +106,8 @@ public void testExplain() { Table table = validationCatalog.loadTable(tableIdent); Assert.assertEquals("Should have 1 snapshot", 1, Iterables.size(table.snapshots())); - assertEquals("Should have expected rows", + assertEquals( + "Should have expected rows", ImmutableList.of(row(1, "hr"), row(2, "hardware"), row(null, "hr")), sql("SELECT * FROM %s ORDER BY id ASC NULLS LAST", tableName)); } @@ -117,7 +122,8 @@ public void testUpdateEmptyTable() { Table table = validationCatalog.loadTable(tableIdent); Assert.assertEquals("Should have 2 snapshots", 2, Iterables.size(table.snapshots())); - assertEquals("Should have expected rows", + assertEquals( + "Should have expected rows", ImmutableList.of(), sql("SELECT * FROM %s ORDER BY id", tableName)); } @@ -132,7 +138,8 @@ public void testUpdateWithAlias() { Table table = validationCatalog.loadTable(tableIdent); Assert.assertEquals("Should have 2 snapshots", 2, Iterables.size(table.snapshots())); - assertEquals("Should have expected rows", + assertEquals( + "Should have expected rows", ImmutableList.of(row(1, "invalid")), 
sql("SELECT * FROM %s", tableName)); } @@ -145,7 +152,8 @@ public void testUpdateAlignsAssignments() { sql("UPDATE %s SET `c2` = c2 - 2, c1 = `c1` - 1 WHERE id <=> 1", tableName); - assertEquals("Should have expected rows", + assertEquals( + "Should have expected rows", ImmutableList.of(row(1, 10, 109), row(2, 22, 222)), sql("SELECT * FROM %s ORDER BY id", tableName)); } @@ -159,7 +167,8 @@ public void testUpdateWithUnsupportedPartitionPredicate() { sql("UPDATE %s t SET `t`.`id` = -1 WHERE t.dep LIKE '%%r' ", tableName); - assertEquals("Should have expected rows", + assertEquals( + "Should have expected rows", ImmutableList.of(row(-1, "hr"), row(1, "software")), sql("SELECT * FROM %s ORDER BY id", tableName)); } @@ -169,12 +178,10 @@ public void testUpdateWithDynamicFileFiltering() { createAndInitTable("id INT, dep STRING"); sql("ALTER TABLE %s ADD PARTITION FIELD dep", tableName); - append(tableName, - "{ \"id\": 1, \"dep\": \"hr\" }\n" + - "{ \"id\": 3, \"dep\": \"hr\" }"); - append(tableName, - "{ \"id\": 1, \"dep\": \"hardware\" }\n" + - "{ \"id\": 2, \"dep\": \"hardware\" }"); + append(tableName, "{ \"id\": 1, \"dep\": \"hr\" }\n" + "{ \"id\": 3, \"dep\": \"hr\" }"); + append( + tableName, + "{ \"id\": 1, \"dep\": \"hardware\" }\n" + "{ \"id\": 2, \"dep\": \"hardware\" }"); sql("UPDATE %s SET id = cast('-1' AS INT) WHERE id = 2", tableName); @@ -188,7 +195,8 @@ public void testUpdateWithDynamicFileFiltering() { validateMergeOnRead(currentSnapshot, "1", "1", "1"); } - assertEquals("Should have expected rows", + assertEquals( + "Should have expected rows", ImmutableList.of(row(-1, "hardware"), row(1, "hardware"), row(1, "hr"), row(3, "hr")), sql("SELECT * FROM %s ORDER BY id, dep", tableName)); } @@ -212,7 +220,8 @@ public void testUpdateNonExistingRecords() { validateMergeOnRead(currentSnapshot, "0", null, null); } - assertEquals("Should have expected rows", + assertEquals( + "Should have expected rows", ImmutableList.of(row(1, "hr"), row(2, "hardware"), row(null, "hr")), sql("SELECT * FROM %s ORDER BY id ASC NULLS LAST", tableName)); } @@ -229,10 +238,13 @@ public void testUpdateWithoutCondition() { sql("INSERT INTO TABLE %s VALUES (null, 'hr')", tableName); // set the num of shuffle partitions to 200 instead of default 4 to reduce the chance of hashing - // records for multiple source files to one writing task (needed for a predictable num of output files) - withSQLConf(ImmutableMap.of(SQLConf.SHUFFLE_PARTITIONS().key(), "200"), () -> { - sql("UPDATE %s SET id = -1", tableName); - }); + // records for multiple source files to one writing task (needed for a predictable num of output + // files) + withSQLConf( + ImmutableMap.of(SQLConf.SHUFFLE_PARTITIONS().key(), "200"), + () -> { + sql("UPDATE %s SET id = -1", tableName); + }); Table table = validationCatalog.loadTable(tableIdent); Assert.assertEquals("Should have 4 snapshots", 4, Iterables.size(table.snapshots())); @@ -249,7 +261,8 @@ public void testUpdateWithoutCondition() { validateMergeOnRead(currentSnapshot, "2", "2", "2"); } - assertEquals("Should have expected rows", + assertEquals( + "Should have expected rows", ImmutableList.of(row(-1, "hardware"), row(-1, "hr"), row(-1, "hr")), sql("SELECT * FROM %s ORDER BY dep ASC", tableName)); } @@ -258,26 +271,30 @@ public void testUpdateWithoutCondition() { public void testUpdateWithNullConditions() { createAndInitTable("id INT, dep STRING"); - append(tableName, - "{ \"id\": 0, \"dep\": null }\n" + - "{ \"id\": 1, \"dep\": \"hr\" }\n" + - "{ \"id\": 2, \"dep\": \"hardware\" }"); + 
append( + tableName, + "{ \"id\": 0, \"dep\": null }\n" + + "{ \"id\": 1, \"dep\": \"hr\" }\n" + + "{ \"id\": 2, \"dep\": \"hardware\" }"); // should not update any rows as null is never equal to null sql("UPDATE %s SET id = -1 WHERE dep = NULL", tableName); - assertEquals("Should have expected rows", + assertEquals( + "Should have expected rows", ImmutableList.of(row(0, null), row(1, "hr"), row(2, "hardware")), sql("SELECT * FROM %s ORDER BY id", tableName)); // should not update any rows the condition does not match any records sql("UPDATE %s SET id = -1 WHERE dep = 'software'", tableName); - assertEquals("Should have expected rows", + assertEquals( + "Should have expected rows", ImmutableList.of(row(0, null), row(1, "hr"), row(2, "hardware")), sql("SELECT * FROM %s ORDER BY id", tableName)); // should update one matching row with a null-safe condition sql("UPDATE %s SET dep = 'invalid', id = -1 WHERE dep <=> NULL", tableName); - assertEquals("Should have expected rows", + assertEquals( + "Should have expected rows", ImmutableList.of(row(-1, "invalid"), row(1, "hr"), row(2, "hardware")), sql("SELECT * FROM %s ORDER BY id", tableName)); } @@ -286,23 +303,27 @@ public void testUpdateWithNullConditions() { public void testUpdateWithInAndNotInConditions() { createAndInitTable("id INT, dep STRING"); - append(tableName, - "{ \"id\": 1, \"dep\": \"hr\" }\n" + - "{ \"id\": 2, \"dep\": \"hardware\" }\n" + - "{ \"id\": null, \"dep\": \"hr\" }"); + append( + tableName, + "{ \"id\": 1, \"dep\": \"hr\" }\n" + + "{ \"id\": 2, \"dep\": \"hardware\" }\n" + + "{ \"id\": null, \"dep\": \"hr\" }"); sql("UPDATE %s SET id = -1 WHERE id IN (1, null)", tableName); - assertEquals("Should have expected rows", + assertEquals( + "Should have expected rows", ImmutableList.of(row(-1, "hr"), row(2, "hardware"), row(null, "hr")), sql("SELECT * FROM %s ORDER BY id ASC NULLS LAST", tableName)); sql("UPDATE %s SET id = 100 WHERE id NOT IN (null, 1)", tableName); - assertEquals("Should have expected rows", + assertEquals( + "Should have expected rows", ImmutableList.of(row(-1, "hr"), row(2, "hardware"), row(null, "hr")), sql("SELECT * FROM %s ORDER BY id ASC NULLS LAST", tableName)); sql("UPDATE %s SET id = 100 WHERE id NOT IN (1, 10)", tableName); - assertEquals("Should have expected rows", + assertEquals( + "Should have expected rows", ImmutableList.of(row(100, "hardware"), row(100, "hr"), row(null, "hr")), sql("SELECT * FROM %s ORDER BY id ASC NULLS LAST, dep", tableName)); } @@ -314,16 +335,20 @@ public void testUpdateWithMultipleRowGroupsParquet() throws NoSuchTableException createAndInitTable("id INT, dep STRING"); sql("ALTER TABLE %s ADD PARTITION FIELD dep", tableName); - sql("ALTER TABLE %s SET TBLPROPERTIES('%s' '%d')", tableName, PARQUET_ROW_GROUP_SIZE_BYTES, 100); + sql( + "ALTER TABLE %s SET TBLPROPERTIES('%s' '%d')", + tableName, PARQUET_ROW_GROUP_SIZE_BYTES, 100); sql("ALTER TABLE %s SET TBLPROPERTIES('%s' '%d')", tableName, SPLIT_SIZE, 100); List ids = Lists.newArrayListWithCapacity(200); for (int id = 1; id <= 200; id++) { ids.add(id); } - Dataset df = spark.createDataset(ids, Encoders.INT()) - .withColumnRenamed("value", "id") - .withColumn("dep", lit("hr")); + Dataset df = + spark + .createDataset(ids, Encoders.INT()) + .withColumnRenamed("value", "id") + .withColumn("dep", lit("hr")); df.coalesce(1).writeTo(tableName).append(); Assert.assertEquals(200, spark.table(tableName).count()); @@ -336,27 +361,33 @@ public void testUpdateWithMultipleRowGroupsParquet() throws NoSuchTableException @Test public 
void testUpdateNestedStructFields() { - createAndInitTable("id INT, s STRUCT,m:MAP>>", + createAndInitTable( + "id INT, s STRUCT,m:MAP>>", "{ \"id\": 1, \"s\": { \"c1\": 2, \"c2\": { \"a\": [1,2], \"m\": { \"a\": \"b\"} } } } }"); // update primitive, array, map columns inside a struct sql("UPDATE %s SET s.c1 = -1, s.c2.m = map('k', 'v'), s.c2.a = array(-1)", tableName); - assertEquals("Output should match", + assertEquals( + "Output should match", ImmutableList.of(row(1, row(-1, row(ImmutableList.of(-1), ImmutableMap.of("k", "v"))))), sql("SELECT * FROM %s", tableName)); // set primitive, array, map columns to NULL (proper casts should be in place) sql("UPDATE %s SET s.c1 = NULL, s.c2 = NULL WHERE id IN (1)", tableName); - assertEquals("Output should match", + assertEquals( + "Output should match", ImmutableList.of(row(1, row(null, null))), sql("SELECT * FROM %s", tableName)); // update all fields in a struct - sql("UPDATE %s SET s = named_struct('c1', 1, 'c2', named_struct('a', array(1), 'm', null))", tableName); + sql( + "UPDATE %s SET s = named_struct('c1', 1, 'c2', named_struct('a', array(1), 'm', null))", + tableName); - assertEquals("Output should match", + assertEquals( + "Output should match", ImmutableList.of(row(1, row(1, row(ImmutableList.of(1), null)))), sql("SELECT * FROM %s", tableName)); } @@ -366,29 +397,33 @@ public void testUpdateWithUserDefinedDistribution() { createAndInitTable("id INT, c2 INT, c3 INT"); sql("ALTER TABLE %s ADD PARTITION FIELD bucket(8, c3)", tableName); - append(tableName, - "{ \"id\": 1, \"c2\": 11, \"c3\": 1 }\n" + - "{ \"id\": 2, \"c2\": 22, \"c3\": 1 }\n" + - "{ \"id\": 3, \"c2\": 33, \"c3\": 1 }"); + append( + tableName, + "{ \"id\": 1, \"c2\": 11, \"c3\": 1 }\n" + + "{ \"id\": 2, \"c2\": 22, \"c3\": 1 }\n" + + "{ \"id\": 3, \"c2\": 33, \"c3\": 1 }"); // request a global sort sql("ALTER TABLE %s WRITE ORDERED BY c2", tableName); sql("UPDATE %s SET c2 = -22 WHERE id NOT IN (1, 3)", tableName); - assertEquals("Should have expected rows", + assertEquals( + "Should have expected rows", ImmutableList.of(row(1, 11, 1), row(2, -22, 1), row(3, 33, 1)), sql("SELECT * FROM %s ORDER BY id ASC NULLS LAST", tableName)); // request a local sort sql("ALTER TABLE %s WRITE LOCALLY ORDERED BY id", tableName); sql("UPDATE %s SET c2 = -33 WHERE id = 3", tableName); - assertEquals("Should have expected rows", + assertEquals( + "Should have expected rows", ImmutableList.of(row(1, 11, 1), row(2, -22, 1), row(3, -33, 1)), sql("SELECT * FROM %s ORDER BY id ASC NULLS LAST", tableName)); // request a hash distribution + local sort sql("ALTER TABLE %s WRITE DISTRIBUTED BY PARTITION ORDERED BY id", tableName); sql("UPDATE %s SET c2 = -11 WHERE id = 1", tableName); - assertEquals("Should have expected rows", + assertEquals( + "Should have expected rows", ImmutableList.of(row(1, -11, 1), row(2, -22, 1), row(3, -33, 1)), sql("SELECT * FROM %s ORDER BY id ASC NULLS LAST", tableName)); } @@ -400,34 +435,41 @@ public synchronized void testUpdateWithSerializableIsolation() throws Interrupte createAndInitTable("id INT, dep STRING"); - sql("ALTER TABLE %s SET TBLPROPERTIES('%s' '%s')", tableName, UPDATE_ISOLATION_LEVEL, "serializable"); + sql( + "ALTER TABLE %s SET TBLPROPERTIES('%s' '%s')", + tableName, UPDATE_ISOLATION_LEVEL, "serializable"); - ExecutorService executorService = MoreExecutors.getExitingExecutorService( - (ThreadPoolExecutor) Executors.newFixedThreadPool(2)); + ExecutorService executorService = + MoreExecutors.getExitingExecutorService( + (ThreadPoolExecutor) 
Executors.newFixedThreadPool(2)); AtomicInteger barrier = new AtomicInteger(0); // update thread - Future updateFuture = executorService.submit(() -> { - for (int numOperations = 0; numOperations < Integer.MAX_VALUE; numOperations++) { - while (barrier.get() < numOperations * 2) { - sleep(10); - } - sql("UPDATE %s SET id = -1 WHERE id = 1", tableName); - barrier.incrementAndGet(); - } - }); + Future updateFuture = + executorService.submit( + () -> { + for (int numOperations = 0; numOperations < Integer.MAX_VALUE; numOperations++) { + while (barrier.get() < numOperations * 2) { + sleep(10); + } + sql("UPDATE %s SET id = -1 WHERE id = 1", tableName); + barrier.incrementAndGet(); + } + }); // append thread - Future appendFuture = executorService.submit(() -> { - for (int numOperations = 0; numOperations < Integer.MAX_VALUE; numOperations++) { - while (barrier.get() < numOperations * 2) { - sleep(10); - } - sql("INSERT INTO TABLE %s VALUES (1, 'hr')", tableName); - barrier.incrementAndGet(); - } - }); + Future appendFuture = + executorService.submit( + () -> { + for (int numOperations = 0; numOperations < Integer.MAX_VALUE; numOperations++) { + while (barrier.get() < numOperations * 2) { + sleep(10); + } + sql("INSERT INTO TABLE %s VALUES (1, 'hr')", tableName); + barrier.incrementAndGet(); + } + }); try { updateFuture.get(); @@ -438,7 +480,8 @@ public synchronized void testUpdateWithSerializableIsolation() throws Interrupte Throwable validationException = sparkException.getCause(); Assert.assertThat(validationException, CoreMatchers.instanceOf(ValidationException.class)); String errMsg = validationException.getMessage(); - Assert.assertThat(errMsg, CoreMatchers.containsString("Found conflicting files that can contain")); + Assert.assertThat( + errMsg, CoreMatchers.containsString("Found conflicting files that can contain")); } finally { appendFuture.cancel(true); } @@ -448,40 +491,48 @@ public synchronized void testUpdateWithSerializableIsolation() throws Interrupte } @Test - public synchronized void testUpdateWithSnapshotIsolation() throws InterruptedException, ExecutionException { + public synchronized void testUpdateWithSnapshotIsolation() + throws InterruptedException, ExecutionException { // cannot run tests with concurrency for Hadoop tables without atomic renames Assume.assumeFalse(catalogName.equalsIgnoreCase("testhadoop")); createAndInitTable("id INT, dep STRING"); - sql("ALTER TABLE %s SET TBLPROPERTIES('%s' '%s')", tableName, UPDATE_ISOLATION_LEVEL, "snapshot"); + sql( + "ALTER TABLE %s SET TBLPROPERTIES('%s' '%s')", + tableName, UPDATE_ISOLATION_LEVEL, "snapshot"); - ExecutorService executorService = MoreExecutors.getExitingExecutorService( - (ThreadPoolExecutor) Executors.newFixedThreadPool(2)); + ExecutorService executorService = + MoreExecutors.getExitingExecutorService( + (ThreadPoolExecutor) Executors.newFixedThreadPool(2)); AtomicInteger barrier = new AtomicInteger(0); // update thread - Future updateFuture = executorService.submit(() -> { - for (int numOperations = 0; numOperations < 20; numOperations++) { - while (barrier.get() < numOperations * 2) { - sleep(10); - } - sql("UPDATE %s SET id = -1 WHERE id = 1", tableName); - barrier.incrementAndGet(); - } - }); + Future updateFuture = + executorService.submit( + () -> { + for (int numOperations = 0; numOperations < 20; numOperations++) { + while (barrier.get() < numOperations * 2) { + sleep(10); + } + sql("UPDATE %s SET id = -1 WHERE id = 1", tableName); + barrier.incrementAndGet(); + } + }); // append thread - Future 
appendFuture = executorService.submit(() -> { - for (int numOperations = 0; numOperations < 20; numOperations++) { - while (barrier.get() < numOperations * 2) { - sleep(10); - } - sql("INSERT INTO TABLE %s VALUES (1, 'hr')", tableName); - barrier.incrementAndGet(); - } - }); + Future appendFuture = + executorService.submit( + () -> { + for (int numOperations = 0; numOperations < 20; numOperations++) { + while (barrier.get() < numOperations * 2) { + sleep(10); + } + sql("INSERT INTO TABLE %s VALUES (1, 'hr')", tableName); + barrier.incrementAndGet(); + } + }); try { updateFuture.get(); @@ -499,7 +550,8 @@ public void testUpdateWithInferredCasts() { sql("UPDATE %s SET s = -1 WHERE id = 1", tableName); - assertEquals("Should have expected rows", + assertEquals( + "Should have expected rows", ImmutableList.of(row(1, "-1")), sql("SELECT * FROM %s", tableName)); } @@ -510,7 +562,8 @@ public void testUpdateModifiesNullStruct() { sql("UPDATE %s SET s.n1 = -1 WHERE id = 1", tableName); - assertEquals("Should have expected rows", + assertEquals( + "Should have expected rows", ImmutableList.of(row(1, row(-1, null))), sql("SELECT * FROM %s", tableName)); } @@ -520,20 +573,19 @@ public void testUpdateRefreshesRelationCache() { createAndInitTable("id INT, dep STRING"); sql("ALTER TABLE %s ADD PARTITION FIELD dep", tableName); - append(tableName, - "{ \"id\": 1, \"dep\": \"hr\" }\n" + - "{ \"id\": 3, \"dep\": \"hr\" }"); + append(tableName, "{ \"id\": 1, \"dep\": \"hr\" }\n" + "{ \"id\": 3, \"dep\": \"hr\" }"); - append(tableName, - "{ \"id\": 1, \"dep\": \"hardware\" }\n" + - "{ \"id\": 2, \"dep\": \"hardware\" }"); + append( + tableName, + "{ \"id\": 1, \"dep\": \"hardware\" }\n" + "{ \"id\": 2, \"dep\": \"hardware\" }"); Dataset query = spark.sql("SELECT * FROM " + tableName + " WHERE id = 1"); query.createOrReplaceTempView("tmp"); spark.sql("CACHE TABLE tmp"); - assertEquals("View should have correct data", + assertEquals( + "View should have correct data", ImmutableList.of(row(1, "hardware"), row(1, "hr")), sql("SELECT * FROM tmp ORDER BY id, dep")); @@ -549,11 +601,13 @@ public void testUpdateRefreshesRelationCache() { validateMergeOnRead(currentSnapshot, "2", "2", "2"); } - assertEquals("Should have expected rows", + assertEquals( + "Should have expected rows", ImmutableList.of(row(-1, "hardware"), row(-1, "hr"), row(2, "hardware"), row(3, "hr")), sql("SELECT * FROM %s ORDER BY id, dep", tableName)); - assertEquals("Should refresh the relation cache", + assertEquals( + "Should refresh the relation cache", ImmutableList.of(), sql("SELECT * FROM tmp ORDER BY id, dep")); @@ -564,36 +618,47 @@ public void testUpdateRefreshesRelationCache() { public void testUpdateWithInSubquery() { createAndInitTable("id INT, dep STRING"); - append(tableName, - "{ \"id\": 1, \"dep\": \"hr\" }\n" + - "{ \"id\": 2, \"dep\": \"hardware\" }\n" + - "{ \"id\": null, \"dep\": \"hr\" }"); + append( + tableName, + "{ \"id\": 1, \"dep\": \"hr\" }\n" + + "{ \"id\": 2, \"dep\": \"hardware\" }\n" + + "{ \"id\": null, \"dep\": \"hr\" }"); createOrReplaceView("updated_id", Arrays.asList(0, 1, null), Encoders.INT()); createOrReplaceView("updated_dep", Arrays.asList("software", "hr"), Encoders.STRING()); - sql("UPDATE %s SET id = -1 WHERE " + - "id IN (SELECT * FROM updated_id) AND " + - "dep IN (SELECT * from updated_dep)", tableName); - assertEquals("Should have expected rows", + sql( + "UPDATE %s SET id = -1 WHERE " + + "id IN (SELECT * FROM updated_id) AND " + + "dep IN (SELECT * from updated_dep)", + tableName); + assertEquals( + 
"Should have expected rows", ImmutableList.of(row(-1, "hr"), row(2, "hardware"), row(null, "hr")), sql("SELECT * FROM %s ORDER BY id ASC NULLS LAST", tableName)); - sql("UPDATE %s SET id = 5 WHERE id IS NULL OR id IN (SELECT value + 1 FROM updated_id)", tableName); - assertEquals("Should have expected rows", + sql( + "UPDATE %s SET id = 5 WHERE id IS NULL OR id IN (SELECT value + 1 FROM updated_id)", + tableName); + assertEquals( + "Should have expected rows", ImmutableList.of(row(-1, "hr"), row(5, "hardware"), row(5, "hr")), sql("SELECT * FROM %s ORDER BY id, dep", tableName)); - append(tableName, - "{ \"id\": null, \"dep\": \"hr\" }\n" + - "{ \"id\": 2, \"dep\": \"hr\" }"); - assertEquals("Should have expected rows", - ImmutableList.of(row(-1, "hr"), row(2, "hr"), row(5, "hardware"), row(5, "hr"), row(null, "hr")), + append(tableName, "{ \"id\": null, \"dep\": \"hr\" }\n" + "{ \"id\": 2, \"dep\": \"hr\" }"); + assertEquals( + "Should have expected rows", + ImmutableList.of( + row(-1, "hr"), row(2, "hr"), row(5, "hardware"), row(5, "hr"), row(null, "hr")), sql("SELECT * FROM %s ORDER BY id ASC NULLS LAST, dep", tableName)); - sql("UPDATE %s SET id = 10 WHERE id IN (SELECT value + 2 FROM updated_id) AND dep = 'hr'", tableName); - assertEquals("Should have expected rows", - ImmutableList.of(row(-1, "hr"), row(5, "hardware"), row(5, "hr"), row(10, "hr"), row(null, "hr")), + sql( + "UPDATE %s SET id = 10 WHERE id IN (SELECT value + 2 FROM updated_id) AND dep = 'hr'", + tableName); + assertEquals( + "Should have expected rows", + ImmutableList.of( + row(-1, "hr"), row(5, "hardware"), row(5, "hr"), row(10, "hr"), row(null, "hr")), sql("SELECT * FROM %s ORDER BY id ASC NULLS LAST, dep", tableName)); } @@ -604,12 +669,10 @@ public void testUpdateWithInSubqueryAndDynamicFileFiltering() { sql("ALTER TABLE %s WRITE DISTRIBUTED BY PARTITION", tableName); - append(tableName, - "{ \"id\": 1, \"dep\": \"hr\" }\n" + - "{ \"id\": 3, \"dep\": \"hr\" }"); - append(tableName, - "{ \"id\": 1, \"dep\": \"hardware\" }\n" + - "{ \"id\": 2, \"dep\": \"hardware\" }"); + append(tableName, "{ \"id\": 1, \"dep\": \"hr\" }\n" + "{ \"id\": 3, \"dep\": \"hr\" }"); + append( + tableName, + "{ \"id\": 1, \"dep\": \"hardware\" }\n" + "{ \"id\": 2, \"dep\": \"hardware\" }"); createOrReplaceView("updated_id", Arrays.asList(-1, 2), Encoders.INT()); @@ -625,7 +688,8 @@ public void testUpdateWithInSubqueryAndDynamicFileFiltering() { validateMergeOnRead(currentSnapshot, "1", "1", "1"); } - assertEquals("Should have expected rows", + assertEquals( + "Should have expected rows", ImmutableList.of(row(-1, "hardware"), row(1, "hardware"), row(1, "hr"), row(3, "hr")), sql("SELECT * FROM %s ORDER BY id, dep", tableName)); } @@ -634,26 +698,31 @@ public void testUpdateWithInSubqueryAndDynamicFileFiltering() { public void testUpdateWithSelfSubquery() { createAndInitTable("id INT, dep STRING"); - append(tableName, - "{ \"id\": 1, \"dep\": \"hr\" }\n" + - "{ \"id\": 2, \"dep\": \"hr\" }"); + append(tableName, "{ \"id\": 1, \"dep\": \"hr\" }\n" + "{ \"id\": 2, \"dep\": \"hr\" }"); sql("UPDATE %s SET dep = 'x' WHERE id IN (SELECT id + 1 FROM %s)", tableName, tableName); - assertEquals("Should have expected rows", + assertEquals( + "Should have expected rows", ImmutableList.of(row(1, "hr"), row(2, "x")), sql("SELECT * FROM %s ORDER BY id", tableName)); // TODO: Spark does not support AQE and DPP with aggregates at the moment - withSQLConf(ImmutableMap.of(SQLConf.ADAPTIVE_EXECUTION_ENABLED().key(), "false"), () -> { - sql("UPDATE %s SET dep = 
'y' WHERE " + - "id = (SELECT count(*) FROM (SELECT DISTINCT id FROM %s) AS t)", tableName, tableName); - assertEquals("Should have expected rows", - ImmutableList.of(row(1, "hr"), row(2, "y")), - sql("SELECT * FROM %s ORDER BY id", tableName)); - }); + withSQLConf( + ImmutableMap.of(SQLConf.ADAPTIVE_EXECUTION_ENABLED().key(), "false"), + () -> { + sql( + "UPDATE %s SET dep = 'y' WHERE " + + "id = (SELECT count(*) FROM (SELECT DISTINCT id FROM %s) AS t)", + tableName, tableName); + assertEquals( + "Should have expected rows", + ImmutableList.of(row(1, "hr"), row(2, "y")), + sql("SELECT * FROM %s ORDER BY id", tableName)); + }); sql("UPDATE %s SET id = (SELECT id - 2 FROM %s WHERE id = 1)", tableName, tableName); - assertEquals("Should have expected rows", + assertEquals( + "Should have expected rows", ImmutableList.of(row(-1, "hr"), row(-1, "y")), sql("SELECT * FROM %s ORDER BY id, dep", tableName)); } @@ -662,16 +731,21 @@ public void testUpdateWithSelfSubquery() { public void testUpdateWithMultiColumnInSubquery() { createAndInitTable("id INT, dep STRING"); - append(tableName, - "{ \"id\": 1, \"dep\": \"hr\" }\n" + - "{ \"id\": 2, \"dep\": \"hardware\" }\n" + - "{ \"id\": null, \"dep\": \"hr\" }"); + append( + tableName, + "{ \"id\": 1, \"dep\": \"hr\" }\n" + + "{ \"id\": 2, \"dep\": \"hardware\" }\n" + + "{ \"id\": null, \"dep\": \"hr\" }"); - List deletedEmployees = Arrays.asList(new Employee(null, "hr"), new Employee(1, "hr")); + List deletedEmployees = + Arrays.asList(new Employee(null, "hr"), new Employee(1, "hr")); createOrReplaceView("deleted_employee", deletedEmployees, Encoders.bean(Employee.class)); - sql("UPDATE %s SET dep = 'x', id = -1 WHERE (id, dep) IN (SELECT id, dep FROM deleted_employee)", tableName); - assertEquals("Should have expected rows", + sql( + "UPDATE %s SET dep = 'x', id = -1 WHERE (id, dep) IN (SELECT id, dep FROM deleted_employee)", + tableName); + assertEquals( + "Should have expected rows", ImmutableList.of(row(-1, "x"), row(2, "hardware"), row(null, "hr")), sql("SELECT * FROM %s ORDER BY id ASC NULLS LAST", tableName)); } @@ -680,27 +754,35 @@ public void testUpdateWithMultiColumnInSubquery() { public void testUpdateWithNotInSubquery() { createAndInitTable("id INT, dep STRING"); - append(tableName, - "{ \"id\": 1, \"dep\": \"hr\" }\n" + - "{ \"id\": 2, \"dep\": \"hardware\" }\n" + - "{ \"id\": null, \"dep\": \"hr\" }"); + append( + tableName, + "{ \"id\": 1, \"dep\": \"hr\" }\n" + + "{ \"id\": 2, \"dep\": \"hardware\" }\n" + + "{ \"id\": null, \"dep\": \"hr\" }"); createOrReplaceView("updated_id", Arrays.asList(-1, -2, null), Encoders.INT()); createOrReplaceView("updated_dep", Arrays.asList("software", "hr"), Encoders.STRING()); // the file filter subquery (nested loop lef-anti join) returns 0 records sql("UPDATE %s SET id = -1 WHERE id NOT IN (SELECT * FROM updated_id)", tableName); - assertEquals("Should have expected rows", + assertEquals( + "Should have expected rows", ImmutableList.of(row(1, "hr"), row(2, "hardware"), row(null, "hr")), sql("SELECT * FROM %s ORDER BY id ASC NULLS LAST", tableName)); - sql("UPDATE %s SET id = -1 WHERE id NOT IN (SELECT * FROM updated_id WHERE value IS NOT NULL)", tableName); - assertEquals("Should have expected rows", + sql( + "UPDATE %s SET id = -1 WHERE id NOT IN (SELECT * FROM updated_id WHERE value IS NOT NULL)", + tableName); + assertEquals( + "Should have expected rows", ImmutableList.of(row(-1, "hardware"), row(-1, "hr"), row(null, "hr")), sql("SELECT * FROM %s ORDER BY id ASC NULLS LAST, dep", tableName)); - 
sql("UPDATE %s SET id = 5 WHERE id NOT IN (SELECT * FROM updated_id) OR dep IN ('software', 'hr')", tableName); - assertEquals("Should have expected rows", + sql( + "UPDATE %s SET id = 5 WHERE id NOT IN (SELECT * FROM updated_id) OR dep IN ('software', 'hr')", + tableName); + assertEquals( + "Should have expected rows", ImmutableList.of(row(-1, "hardware"), row(5, "hr"), row(5, "hr")), sql("SELECT * FROM %s ORDER BY id ASC NULLS LAST, dep", tableName)); } @@ -709,36 +791,49 @@ public void testUpdateWithNotInSubquery() { public void testUpdateWithExistSubquery() { createAndInitTable("id INT, dep STRING"); - append(tableName, - "{ \"id\": 1, \"dep\": \"hr\" }\n" + - "{ \"id\": 2, \"dep\": \"hardware\" }\n" + - "{ \"id\": null, \"dep\": \"hr\" }"); + append( + tableName, + "{ \"id\": 1, \"dep\": \"hr\" }\n" + + "{ \"id\": 2, \"dep\": \"hardware\" }\n" + + "{ \"id\": null, \"dep\": \"hr\" }"); createOrReplaceView("updated_id", Arrays.asList(-1, -2, null), Encoders.INT()); createOrReplaceView("updated_dep", Arrays.asList("hr", null), Encoders.STRING()); - sql("UPDATE %s t SET id = -1 WHERE EXISTS (SELECT 1 FROM updated_id u WHERE t.id = u.value)", tableName); - assertEquals("Should have expected rows", + sql( + "UPDATE %s t SET id = -1 WHERE EXISTS (SELECT 1 FROM updated_id u WHERE t.id = u.value)", + tableName); + assertEquals( + "Should have expected rows", ImmutableList.of(row(1, "hr"), row(2, "hardware"), row(null, "hr")), sql("SELECT * FROM %s ORDER BY id ASC NULLS LAST", tableName)); - sql("UPDATE %s t SET dep = 'x', id = -1 WHERE " + - "EXISTS (SELECT 1 FROM updated_id u WHERE t.id = u.value + 2)", tableName); - assertEquals("Should have expected rows", + sql( + "UPDATE %s t SET dep = 'x', id = -1 WHERE " + + "EXISTS (SELECT 1 FROM updated_id u WHERE t.id = u.value + 2)", + tableName); + assertEquals( + "Should have expected rows", ImmutableList.of(row(-1, "x"), row(2, "hardware"), row(null, "hr")), sql("SELECT * FROM %s ORDER BY id ASC NULLS LAST", tableName)); - sql("UPDATE %s t SET id = -2 WHERE " + - "EXISTS (SELECT 1 FROM updated_id u WHERE t.id = u.value) OR " + - "t.id IS NULL", tableName); - assertEquals("Should have expected rows", + sql( + "UPDATE %s t SET id = -2 WHERE " + + "EXISTS (SELECT 1 FROM updated_id u WHERE t.id = u.value) OR " + + "t.id IS NULL", + tableName); + assertEquals( + "Should have expected rows", ImmutableList.of(row(-2, "hr"), row(-2, "x"), row(2, "hardware")), sql("SELECT * FROM %s ORDER BY id, dep", tableName)); - sql("UPDATE %s t SET id = 1 WHERE " + - "EXISTS (SELECT 1 FROM updated_id ui WHERE t.id = ui.value) AND " + - "EXISTS (SELECT 1 FROM updated_dep ud WHERE t.dep = ud.value)", tableName); - assertEquals("Should have expected rows", + sql( + "UPDATE %s t SET id = 1 WHERE " + + "EXISTS (SELECT 1 FROM updated_id ui WHERE t.id = ui.value) AND " + + "EXISTS (SELECT 1 FROM updated_dep ud WHERE t.dep = ud.value)", + tableName); + assertEquals( + "Should have expected rows", ImmutableList.of(row(-2, "x"), row(1, "hr"), row(2, "hardware")), sql("SELECT * FROM %s ORDER BY id, dep", tableName)); } @@ -747,30 +842,40 @@ public void testUpdateWithExistSubquery() { public void testUpdateWithNotExistsSubquery() { createAndInitTable("id INT, dep STRING"); - append(tableName, - "{ \"id\": 1, \"dep\": \"hr\" }\n" + - "{ \"id\": 2, \"dep\": \"hardware\" }\n" + - "{ \"id\": null, \"dep\": \"hr\" }"); + append( + tableName, + "{ \"id\": 1, \"dep\": \"hr\" }\n" + + "{ \"id\": 2, \"dep\": \"hardware\" }\n" + + "{ \"id\": null, \"dep\": \"hr\" }"); 
createOrReplaceView("updated_id", Arrays.asList(-1, -2, null), Encoders.INT()); createOrReplaceView("updated_dep", Arrays.asList("hr", "software"), Encoders.STRING()); - sql("UPDATE %s t SET id = -1 WHERE NOT EXISTS (SELECT 1 FROM updated_id u WHERE t.id = u.value + 2)", tableName); - assertEquals("Should have expected rows", + sql( + "UPDATE %s t SET id = -1 WHERE NOT EXISTS (SELECT 1 FROM updated_id u WHERE t.id = u.value + 2)", + tableName); + assertEquals( + "Should have expected rows", ImmutableList.of(row(-1, "hardware"), row(-1, "hr"), row(1, "hr")), sql("SELECT * FROM %s ORDER BY id, dep", tableName)); - sql("UPDATE %s t SET id = 5 WHERE " + - "NOT EXISTS (SELECT 1 FROM updated_id u WHERE t.id = u.value) OR " + - "t.id = 1", tableName); - assertEquals("Should have expected rows", + sql( + "UPDATE %s t SET id = 5 WHERE " + + "NOT EXISTS (SELECT 1 FROM updated_id u WHERE t.id = u.value) OR " + + "t.id = 1", + tableName); + assertEquals( + "Should have expected rows", ImmutableList.of(row(-1, "hardware"), row(-1, "hr"), row(5, "hr")), sql("SELECT * FROM %s ORDER BY id, dep", tableName)); - sql("UPDATE %s t SET id = 10 WHERE " + - "NOT EXISTS (SELECT 1 FROM updated_id ui WHERE t.id = ui.value) AND " + - "EXISTS (SELECT 1 FROM updated_dep ud WHERE t.dep = ud.value)", tableName); - assertEquals("Should have expected rows", + sql( + "UPDATE %s t SET id = 10 WHERE " + + "NOT EXISTS (SELECT 1 FROM updated_id ui WHERE t.id = ui.value) AND " + + "EXISTS (SELECT 1 FROM updated_dep ud WHERE t.dep = ud.value)", + tableName); + assertEquals( + "Should have expected rows", ImmutableList.of(row(-1, "hardware"), row(-1, "hr"), row(10, "hr")), sql("SELECT * FROM %s ORDER BY id, dep", tableName)); } @@ -779,20 +884,24 @@ public void testUpdateWithNotExistsSubquery() { public void testUpdateWithScalarSubquery() { createAndInitTable("id INT, dep STRING"); - append(tableName, - "{ \"id\": 1, \"dep\": \"hr\" }\n" + - "{ \"id\": 2, \"dep\": \"hardware\" }\n" + - "{ \"id\": null, \"dep\": \"hr\" }"); + append( + tableName, + "{ \"id\": 1, \"dep\": \"hr\" }\n" + + "{ \"id\": 2, \"dep\": \"hardware\" }\n" + + "{ \"id\": null, \"dep\": \"hr\" }"); createOrReplaceView("updated_id", Arrays.asList(1, 100, null), Encoders.INT()); // TODO: Spark does not support AQE and DPP with aggregates at the moment - withSQLConf(ImmutableMap.of(SQLConf.ADAPTIVE_EXECUTION_ENABLED().key(), "false"), () -> { - sql("UPDATE %s SET id = -1 WHERE id <= (SELECT min(value) FROM updated_id)", tableName); - assertEquals("Should have expected rows", - ImmutableList.of(row(-1, "hr"), row(2, "hardware"), row(null, "hr")), - sql("SELECT * FROM %s ORDER BY id ASC NULLS LAST", tableName)); - }); + withSQLConf( + ImmutableMap.of(SQLConf.ADAPTIVE_EXECUTION_ENABLED().key(), "false"), + () -> { + sql("UPDATE %s SET id = -1 WHERE id <= (SELECT min(value) FROM updated_id)", tableName); + assertEquals( + "Should have expected rows", + ImmutableList.of(row(-1, "hr"), row(2, "hardware"), row(null, "hr")), + sql("SELECT * FROM %s ORDER BY id ASC NULLS LAST", tableName)); + }); } @Test @@ -800,25 +909,29 @@ public void testUpdateThatRequiresGroupingBeforeWrite() { createAndInitTable("id INT, dep STRING"); sql("ALTER TABLE %s ADD PARTITION FIELD dep", tableName); - append(tableName, - "{ \"id\": 0, \"dep\": \"hr\" }\n" + - "{ \"id\": 1, \"dep\": \"hr\" }\n" + - "{ \"id\": 2, \"dep\": \"hr\" }"); - - append(tableName, - "{ \"id\": 0, \"dep\": \"ops\" }\n" + - "{ \"id\": 1, \"dep\": \"ops\" }\n" + - "{ \"id\": 2, \"dep\": \"ops\" }"); - - append(tableName, 
- "{ \"id\": 0, \"dep\": \"hr\" }\n" + - "{ \"id\": 1, \"dep\": \"hr\" }\n" + - "{ \"id\": 2, \"dep\": \"hr\" }"); - - append(tableName, - "{ \"id\": 0, \"dep\": \"ops\" }\n" + - "{ \"id\": 1, \"dep\": \"ops\" }\n" + - "{ \"id\": 2, \"dep\": \"ops\" }"); + append( + tableName, + "{ \"id\": 0, \"dep\": \"hr\" }\n" + + "{ \"id\": 1, \"dep\": \"hr\" }\n" + + "{ \"id\": 2, \"dep\": \"hr\" }"); + + append( + tableName, + "{ \"id\": 0, \"dep\": \"ops\" }\n" + + "{ \"id\": 1, \"dep\": \"ops\" }\n" + + "{ \"id\": 2, \"dep\": \"ops\" }"); + + append( + tableName, + "{ \"id\": 0, \"dep\": \"hr\" }\n" + + "{ \"id\": 1, \"dep\": \"hr\" }\n" + + "{ \"id\": 2, \"dep\": \"hr\" }"); + + append( + tableName, + "{ \"id\": 0, \"dep\": \"ops\" }\n" + + "{ \"id\": 1, \"dep\": \"ops\" }\n" + + "{ \"id\": 2, \"dep\": \"ops\" }"); createOrReplaceView("updated_id", Arrays.asList(1, 100), Encoders.INT()); @@ -838,18 +951,22 @@ public void testUpdateThatRequiresGroupingBeforeWrite() { public void testUpdateWithVectorization() { createAndInitTable("id INT, dep STRING"); - append(tableName, - "{ \"id\": 0, \"dep\": \"hr\" }\n" + - "{ \"id\": 1, \"dep\": \"hr\" }\n" + - "{ \"id\": 2, \"dep\": \"hr\" }"); + append( + tableName, + "{ \"id\": 0, \"dep\": \"hr\" }\n" + + "{ \"id\": 1, \"dep\": \"hr\" }\n" + + "{ \"id\": 2, \"dep\": \"hr\" }"); - withSQLConf(ImmutableMap.of(SparkSQLProperties.VECTORIZATION_ENABLED, "true"), () -> { - sql("UPDATE %s t SET id = -1", tableName); + withSQLConf( + ImmutableMap.of(SparkSQLProperties.VECTORIZATION_ENABLED, "true"), + () -> { + sql("UPDATE %s t SET id = -1", tableName); - assertEquals("Should have expected rows", - ImmutableList.of(row(-1, "hr"), row(-1, "hr"), row(-1, "hr")), - sql("SELECT * FROM %s ORDER BY id, dep", tableName)); - }); + assertEquals( + "Should have expected rows", + ImmutableList.of(row(-1, "hr"), row(-1, "hr"), row(-1, "hr")), + sql("SELECT * FROM %s ORDER BY id, dep", tableName)); + }); } @Test @@ -864,22 +981,28 @@ public void testUpdateModifyPartitionSourceField() throws NoSuchTableException { ids.add(id); } - Dataset df1 = spark.createDataset(ids, Encoders.INT()) - .withColumnRenamed("value", "id") - .withColumn("dep", lit("hr")) - .withColumn("country", lit("usa")); + Dataset df1 = + spark + .createDataset(ids, Encoders.INT()) + .withColumnRenamed("value", "id") + .withColumn("dep", lit("hr")) + .withColumn("country", lit("usa")); df1.coalesce(1).writeTo(tableName).append(); - Dataset df2 = spark.createDataset(ids, Encoders.INT()) - .withColumnRenamed("value", "id") - .withColumn("dep", lit("software")) - .withColumn("country", lit("usa")); + Dataset df2 = + spark + .createDataset(ids, Encoders.INT()) + .withColumnRenamed("value", "id") + .withColumn("dep", lit("software")) + .withColumn("country", lit("usa")); df2.coalesce(1).writeTo(tableName).append(); - Dataset df3 = spark.createDataset(ids, Encoders.INT()) - .withColumnRenamed("value", "id") - .withColumn("dep", lit("hardware")) - .withColumn("country", lit("usa")); + Dataset df3 = + spark + .createDataset(ids, Encoders.INT()) + .withColumnRenamed("value", "id") + .withColumn("dep", lit("hardware")) + .withColumn("country", lit("usa")); df3.coalesce(1).writeTo(tableName).append(); sql("UPDATE %s SET id = -1 WHERE id IN (10, 11, 12, 13, 14, 15, 16, 17, 18, 19)", tableName); @@ -909,21 +1032,27 @@ public void testUpdateWithStaticPredicatePushdown() { table.io().deleteFile(dataFile.path().toString()); // disable dynamic pruning and rely only on static predicate pushdown - 
withSQLConf(ImmutableMap.of(SQLConf.DYNAMIC_PARTITION_PRUNING_ENABLED().key(), "false"), () -> { - sql("UPDATE %s SET id = -1 WHERE dep IN ('software') AND id == 1", tableName); - }); + withSQLConf( + ImmutableMap.of(SQLConf.DYNAMIC_PARTITION_PRUNING_ENABLED().key(), "false"), + () -> { + sql("UPDATE %s SET id = -1 WHERE dep IN ('software') AND id == 1", tableName); + }); } @Test public void testUpdateWithInvalidUpdates() { createAndInitTable("id INT, a ARRAY>, m MAP"); - AssertHelpers.assertThrows("Should complain about updating an array column", - AnalysisException.class, "Updating nested fields is only supported for structs", + AssertHelpers.assertThrows( + "Should complain about updating an array column", + AnalysisException.class, + "Updating nested fields is only supported for structs", () -> sql("UPDATE %s SET a.c1 = 1", tableName)); - AssertHelpers.assertThrows("Should complain about updating a map column", - AnalysisException.class, "Updating nested fields is only supported for structs", + AssertHelpers.assertThrows( + "Should complain about updating a map column", + AnalysisException.class, + "Updating nested fields is only supported for structs", () -> sql("UPDATE %s SET m.key = 'new_key'", tableName)); } @@ -931,48 +1060,68 @@ public void testUpdateWithInvalidUpdates() { public void testUpdateWithConflictingAssignments() { createAndInitTable("id INT, c STRUCT>"); - AssertHelpers.assertThrows("Should complain about conflicting updates to a top-level column", - AnalysisException.class, "Updates are in conflict", + AssertHelpers.assertThrows( + "Should complain about conflicting updates to a top-level column", + AnalysisException.class, + "Updates are in conflict", () -> sql("UPDATE %s t SET t.id = 1, t.c.n1 = 2, t.id = 2", tableName)); - AssertHelpers.assertThrows("Should complain about conflicting updates to a nested column", - AnalysisException.class, "Updates are in conflict for these columns", + AssertHelpers.assertThrows( + "Should complain about conflicting updates to a nested column", + AnalysisException.class, + "Updates are in conflict for these columns", () -> sql("UPDATE %s t SET t.c.n1 = 1, t.id = 2, t.c.n1 = 2", tableName)); - AssertHelpers.assertThrows("Should complain about conflicting updates to a nested column", - AnalysisException.class, "Updates are in conflict", + AssertHelpers.assertThrows( + "Should complain about conflicting updates to a nested column", + AnalysisException.class, + "Updates are in conflict", () -> { - sql("UPDATE %s SET c.n1 = 1, c = named_struct('n1', 1, 'n2', named_struct('dn1', 1, 'dn2', 2))", tableName); + sql( + "UPDATE %s SET c.n1 = 1, c = named_struct('n1', 1, 'n2', named_struct('dn1', 1, 'dn2', 2))", + tableName); }); } @Test public void testUpdateWithInvalidAssignments() { - createAndInitTable("id INT NOT NULL, s STRUCT> NOT NULL"); - - for (String policy : new String[]{"ansi", "strict"}) { - withSQLConf(ImmutableMap.of("spark.sql.storeAssignmentPolicy", policy), () -> { - - AssertHelpers.assertThrows("Should complain about writing nulls to a top-level column", - AnalysisException.class, "Cannot write nullable values to non-null column", - () -> sql("UPDATE %s t SET t.id = NULL", tableName)); - - AssertHelpers.assertThrows("Should complain about writing nulls to a nested column", - AnalysisException.class, "Cannot write nullable values to non-null column", - () -> sql("UPDATE %s t SET t.s.n1 = NULL", tableName)); - - AssertHelpers.assertThrows("Should complain about writing missing fields in structs", - AnalysisException.class, 
"missing fields", - () -> sql("UPDATE %s t SET t.s = named_struct('n1', 1)", tableName)); - - AssertHelpers.assertThrows("Should complain about writing invalid data types", - AnalysisException.class, "Cannot safely cast", - () -> sql("UPDATE %s t SET t.s.n1 = 'str'", tableName)); - - AssertHelpers.assertThrows("Should complain about writing incompatible structs", - AnalysisException.class, "field name does not match", - () -> sql("UPDATE %s t SET t.s.n2 = named_struct('dn2', 1, 'dn1', 2)", tableName)); - }); + createAndInitTable( + "id INT NOT NULL, s STRUCT> NOT NULL"); + + for (String policy : new String[] {"ansi", "strict"}) { + withSQLConf( + ImmutableMap.of("spark.sql.storeAssignmentPolicy", policy), + () -> { + AssertHelpers.assertThrows( + "Should complain about writing nulls to a top-level column", + AnalysisException.class, + "Cannot write nullable values to non-null column", + () -> sql("UPDATE %s t SET t.id = NULL", tableName)); + + AssertHelpers.assertThrows( + "Should complain about writing nulls to a nested column", + AnalysisException.class, + "Cannot write nullable values to non-null column", + () -> sql("UPDATE %s t SET t.s.n1 = NULL", tableName)); + + AssertHelpers.assertThrows( + "Should complain about writing missing fields in structs", + AnalysisException.class, + "missing fields", + () -> sql("UPDATE %s t SET t.s = named_struct('n1', 1)", tableName)); + + AssertHelpers.assertThrows( + "Should complain about writing invalid data types", + AnalysisException.class, + "Cannot safely cast", + () -> sql("UPDATE %s t SET t.s.n1 = 'str'", tableName)); + + AssertHelpers.assertThrows( + "Should complain about writing incompatible structs", + AnalysisException.class, + "field name does not match", + () -> sql("UPDATE %s t SET t.s.n2 = named_struct('dn2', 1, 'dn1', 2)", tableName)); + }); } } @@ -980,8 +1129,10 @@ public void testUpdateWithInvalidAssignments() { public void testUpdateWithNonDeterministicCondition() { createAndInitTable("id INT, dep STRING"); - AssertHelpers.assertThrows("Should complain about non-deterministic expressions", - AnalysisException.class, "nondeterministic expressions are only allowed", + AssertHelpers.assertThrows( + "Should complain about non-deterministic expressions", + AnalysisException.class, + "nondeterministic expressions are only allowed", () -> sql("UPDATE %s SET id = -1 WHERE id = 1 AND rand() > 0.5", tableName)); } @@ -989,8 +1140,10 @@ public void testUpdateWithNonDeterministicCondition() { public void testUpdateOnNonIcebergTableNotSupported() { createOrReplaceView("testtable", "{ \"c1\": -100, \"c2\": -200 }"); - AssertHelpers.assertThrows("UPDATE is not supported for non iceberg table", - UnsupportedOperationException.class, "not supported temporarily", + AssertHelpers.assertThrows( + "UPDATE is not supported for non iceberg table", + UnsupportedOperationException.class, + "not supported temporarily", () -> sql("UPDATE %s SET c1 = -1 WHERE c2 = 1", "testtable")); } diff --git a/spark/v3.2/spark/src/jmh/java/org/apache/iceberg/spark/SparkBenchmarkUtil.java b/spark/v3.2/spark/src/jmh/java/org/apache/iceberg/spark/SparkBenchmarkUtil.java index 17df7d2cf9d7..d6b0e9c94258 100644 --- a/spark/v3.2/spark/src/jmh/java/org/apache/iceberg/spark/SparkBenchmarkUtil.java +++ b/spark/v3.2/spark/src/jmh/java/org/apache/iceberg/spark/SparkBenchmarkUtil.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.spark; import java.util.List; @@ -32,13 +31,13 @@ public class SparkBenchmarkUtil { - private SparkBenchmarkUtil() { - } + private SparkBenchmarkUtil() {} public static UnsafeProjection projection(Schema expectedSchema, Schema actualSchema) { StructType struct = SparkSchemaUtil.convert(actualSchema); - List refs = JavaConverters.seqAsJavaListConverter(struct.toAttributes()).asJava(); + List refs = + JavaConverters.seqAsJavaListConverter(struct.toAttributes()).asJava(); List attrs = Lists.newArrayListWithExpectedSize(struct.fields().length); List exprs = Lists.newArrayListWithExpectedSize(struct.fields().length); diff --git a/spark/v3.2/spark/src/jmh/java/org/apache/iceberg/spark/action/IcebergSortCompactionBenchmark.java b/spark/v3.2/spark/src/jmh/java/org/apache/iceberg/spark/action/IcebergSortCompactionBenchmark.java index 8c205037f56e..eaef8e0bccaa 100644 --- a/spark/v3.2/spark/src/jmh/java/org/apache/iceberg/spark/action/IcebergSortCompactionBenchmark.java +++ b/spark/v3.2/spark/src/jmh/java/org/apache/iceberg/spark/action/IcebergSortCompactionBenchmark.java @@ -16,10 +16,15 @@ * specific language governing permissions and limitations * under the License. */ - - package org.apache.iceberg.spark.action; +import static org.apache.iceberg.types.Types.NestedField.optional; +import static org.apache.iceberg.types.Types.NestedField.required; +import static org.apache.spark.sql.functions.col; +import static org.apache.spark.sql.functions.current_date; +import static org.apache.spark.sql.functions.date_add; +import static org.apache.spark.sql.functions.expr; + import java.io.IOException; import java.util.Collections; import java.util.UUID; @@ -57,13 +62,6 @@ import org.openjdk.jmh.annotations.Threads; import org.openjdk.jmh.annotations.Timeout; -import static org.apache.iceberg.types.Types.NestedField.optional; -import static org.apache.iceberg.types.Types.NestedField.required; -import static org.apache.spark.sql.functions.col; -import static org.apache.spark.sql.functions.current_date; -import static org.apache.spark.sql.functions.date_add; -import static org.apache.spark.sql.functions.expr; - @Fork(1) @State(Scope.Benchmark) @Measurement(iterations = 10) @@ -108,10 +106,10 @@ public void sortInt() { SparkActions.get() .rewriteDataFiles(table()) .option(BinPackStrategy.REWRITE_ALL, "true") - .sort(SortOrder - .builderFor(table().schema()) - .sortBy("intCol", SortDirection.ASC, NullOrder.NULLS_FIRST) - .build()) + .sort( + SortOrder.builderFor(table().schema()) + .sortBy("intCol", SortDirection.ASC, NullOrder.NULLS_FIRST) + .build()) .execute(); } @@ -121,11 +119,11 @@ public void sortInt2() { SparkActions.get() .rewriteDataFiles(table()) .option(BinPackStrategy.REWRITE_ALL, "true") - .sort(SortOrder - .builderFor(table().schema()) - .sortBy("intCol", SortDirection.ASC, NullOrder.NULLS_FIRST) - .sortBy("intCol2", SortDirection.ASC, NullOrder.NULLS_FIRST) - .build()) + .sort( + SortOrder.builderFor(table().schema()) + .sortBy("intCol", SortDirection.ASC, NullOrder.NULLS_FIRST) + .sortBy("intCol2", SortDirection.ASC, NullOrder.NULLS_FIRST) + .build()) .execute(); } @@ -135,13 +133,13 @@ public void sortInt3() { SparkActions.get() .rewriteDataFiles(table()) .option(BinPackStrategy.REWRITE_ALL, "true") - .sort(SortOrder - .builderFor(table().schema()) - .sortBy("intCol", SortDirection.ASC, NullOrder.NULLS_FIRST) - .sortBy("intCol2", SortDirection.ASC, NullOrder.NULLS_FIRST) - .sortBy("intCol3", SortDirection.ASC, NullOrder.NULLS_FIRST) - .sortBy("intCol4", 
SortDirection.ASC, NullOrder.NULLS_FIRST) - .build()) + .sort( + SortOrder.builderFor(table().schema()) + .sortBy("intCol", SortDirection.ASC, NullOrder.NULLS_FIRST) + .sortBy("intCol2", SortDirection.ASC, NullOrder.NULLS_FIRST) + .sortBy("intCol3", SortDirection.ASC, NullOrder.NULLS_FIRST) + .sortBy("intCol4", SortDirection.ASC, NullOrder.NULLS_FIRST) + .build()) .execute(); } @@ -151,13 +149,13 @@ public void sortInt4() { SparkActions.get() .rewriteDataFiles(table()) .option(BinPackStrategy.REWRITE_ALL, "true") - .sort(SortOrder - .builderFor(table().schema()) - .sortBy("intCol", SortDirection.ASC, NullOrder.NULLS_FIRST) - .sortBy("intCol2", SortDirection.ASC, NullOrder.NULLS_FIRST) - .sortBy("intCol3", SortDirection.ASC, NullOrder.NULLS_FIRST) - .sortBy("intCol4", SortDirection.ASC, NullOrder.NULLS_FIRST) - .build()) + .sort( + SortOrder.builderFor(table().schema()) + .sortBy("intCol", SortDirection.ASC, NullOrder.NULLS_FIRST) + .sortBy("intCol2", SortDirection.ASC, NullOrder.NULLS_FIRST) + .sortBy("intCol3", SortDirection.ASC, NullOrder.NULLS_FIRST) + .sortBy("intCol4", SortDirection.ASC, NullOrder.NULLS_FIRST) + .build()) .execute(); } @@ -167,10 +165,10 @@ public void sortString() { SparkActions.get() .rewriteDataFiles(table()) .option(BinPackStrategy.REWRITE_ALL, "true") - .sort(SortOrder - .builderFor(table().schema()) - .sortBy("stringCol", SortDirection.ASC, NullOrder.NULLS_FIRST) - .build()) + .sort( + SortOrder.builderFor(table().schema()) + .sortBy("stringCol", SortDirection.ASC, NullOrder.NULLS_FIRST) + .build()) .execute(); } @@ -180,13 +178,13 @@ public void sortFourColumns() { SparkActions.get() .rewriteDataFiles(table()) .option(BinPackStrategy.REWRITE_ALL, "true") - .sort(SortOrder - .builderFor(table().schema()) - .sortBy("stringCol", SortDirection.ASC, NullOrder.NULLS_FIRST) - .sortBy("intCol", SortDirection.ASC, NullOrder.NULLS_FIRST) - .sortBy("dateCol", SortDirection.DESC, NullOrder.NULLS_FIRST) - .sortBy("doubleCol", SortDirection.DESC, NullOrder.NULLS_FIRST) - .build()) + .sort( + SortOrder.builderFor(table().schema()) + .sortBy("stringCol", SortDirection.ASC, NullOrder.NULLS_FIRST) + .sortBy("intCol", SortDirection.ASC, NullOrder.NULLS_FIRST) + .sortBy("dateCol", SortDirection.DESC, NullOrder.NULLS_FIRST) + .sortBy("doubleCol", SortDirection.DESC, NullOrder.NULLS_FIRST) + .build()) .execute(); } @@ -196,15 +194,15 @@ public void sortSixColumns() { SparkActions.get() .rewriteDataFiles(table()) .option(BinPackStrategy.REWRITE_ALL, "true") - .sort(SortOrder - .builderFor(table().schema()) - .sortBy("stringCol", SortDirection.ASC, NullOrder.NULLS_FIRST) - .sortBy("intCol", SortDirection.ASC, NullOrder.NULLS_FIRST) - .sortBy("dateCol", SortDirection.DESC, NullOrder.NULLS_FIRST) - .sortBy("timestampCol", SortDirection.DESC, NullOrder.NULLS_FIRST) - .sortBy("doubleCol", SortDirection.DESC, NullOrder.NULLS_FIRST) - .sortBy("longCol", SortDirection.DESC, NullOrder.NULLS_FIRST) - .build()) + .sort( + SortOrder.builderFor(table().schema()) + .sortBy("stringCol", SortDirection.ASC, NullOrder.NULLS_FIRST) + .sortBy("intCol", SortDirection.ASC, NullOrder.NULLS_FIRST) + .sortBy("dateCol", SortDirection.DESC, NullOrder.NULLS_FIRST) + .sortBy("timestampCol", SortDirection.DESC, NullOrder.NULLS_FIRST) + .sortBy("doubleCol", SortDirection.DESC, NullOrder.NULLS_FIRST) + .sortBy("longCol", SortDirection.DESC, NullOrder.NULLS_FIRST) + .build()) .execute(); } @@ -283,54 +281,76 @@ protected Configuration initHadoopConf() { } protected final void initTable() { - Schema schema = new 
Schema( - required(1, "longCol", Types.LongType.get()), - required(2, "intCol", Types.IntegerType.get()), - required(3, "intCol2", Types.IntegerType.get()), - required(4, "intCol3", Types.IntegerType.get()), - required(5, "intCol4", Types.IntegerType.get()), - required(6, "floatCol", Types.FloatType.get()), - optional(7, "doubleCol", Types.DoubleType.get()), - optional(8, "dateCol", Types.DateType.get()), - optional(9, "timestampCol", Types.TimestampType.withZone()), - optional(10, "stringCol", Types.StringType.get())); + Schema schema = + new Schema( + required(1, "longCol", Types.LongType.get()), + required(2, "intCol", Types.IntegerType.get()), + required(3, "intCol2", Types.IntegerType.get()), + required(4, "intCol3", Types.IntegerType.get()), + required(5, "intCol4", Types.IntegerType.get()), + required(6, "floatCol", Types.FloatType.get()), + optional(7, "doubleCol", Types.DoubleType.get()), + optional(8, "dateCol", Types.DateType.get()), + optional(9, "timestampCol", Types.TimestampType.withZone()), + optional(10, "stringCol", Types.StringType.get())); SparkSessionCatalog catalog; try { - catalog = (SparkSessionCatalog) - Spark3Util.catalogAndIdentifier(spark(), "spark_catalog").catalog(); + catalog = + (SparkSessionCatalog) Spark3Util.catalogAndIdentifier(spark(), "spark_catalog").catalog(); catalog.dropTable(IDENT); - catalog.createTable(IDENT, SparkSchemaUtil.convert(schema), new Transform[0], Collections.emptyMap()); + catalog.createTable( + IDENT, SparkSchemaUtil.convert(schema), new Transform[0], Collections.emptyMap()); } catch (Exception e) { throw new RuntimeException(e); } } private void appendData() { - Dataset df = spark().range(0, NUM_ROWS * NUM_FILES, 1, NUM_FILES) - .drop("id") - .withColumn("longCol", new RandomGeneratingUDF(UNIQUE_VALUES).randomLongUDF().apply()) - .withColumn( - "intCol", - new RandomGeneratingUDF(UNIQUE_VALUES).randomLongUDF().apply().cast(DataTypes.IntegerType)) - .withColumn( - "intCol2", - new RandomGeneratingUDF(UNIQUE_VALUES).randomLongUDF().apply().cast(DataTypes.IntegerType)) - .withColumn( - "intCol3", - new RandomGeneratingUDF(UNIQUE_VALUES).randomLongUDF().apply().cast(DataTypes.IntegerType)) - .withColumn( - "intCol4", - new RandomGeneratingUDF(UNIQUE_VALUES).randomLongUDF().apply().cast(DataTypes.IntegerType)) - .withColumn( - "floatCol", - new RandomGeneratingUDF(UNIQUE_VALUES).randomLongUDF().apply().cast(DataTypes.FloatType)) - .withColumn( - "doubleCol", - new RandomGeneratingUDF(UNIQUE_VALUES).randomLongUDF().apply().cast(DataTypes.DoubleType)) - .withColumn("dateCol", date_add(current_date(), col("intCol").mod(NUM_FILES))) - .withColumn("timestampCol", expr("TO_TIMESTAMP(dateCol)")) - .withColumn("stringCol", new RandomGeneratingUDF(UNIQUE_VALUES).randomString().apply()); + Dataset df = + spark() + .range(0, NUM_ROWS * NUM_FILES, 1, NUM_FILES) + .drop("id") + .withColumn("longCol", new RandomGeneratingUDF(UNIQUE_VALUES).randomLongUDF().apply()) + .withColumn( + "intCol", + new RandomGeneratingUDF(UNIQUE_VALUES) + .randomLongUDF() + .apply() + .cast(DataTypes.IntegerType)) + .withColumn( + "intCol2", + new RandomGeneratingUDF(UNIQUE_VALUES) + .randomLongUDF() + .apply() + .cast(DataTypes.IntegerType)) + .withColumn( + "intCol3", + new RandomGeneratingUDF(UNIQUE_VALUES) + .randomLongUDF() + .apply() + .cast(DataTypes.IntegerType)) + .withColumn( + "intCol4", + new RandomGeneratingUDF(UNIQUE_VALUES) + .randomLongUDF() + .apply() + .cast(DataTypes.IntegerType)) + .withColumn( + "floatCol", + new RandomGeneratingUDF(UNIQUE_VALUES) + 
.randomLongUDF() + .apply() + .cast(DataTypes.FloatType)) + .withColumn( + "doubleCol", + new RandomGeneratingUDF(UNIQUE_VALUES) + .randomLongUDF() + .apply() + .cast(DataTypes.DoubleType)) + .withColumn("dateCol", date_add(current_date(), col("intCol").mod(NUM_FILES))) + .withColumn("timestampCol", expr("TO_TIMESTAMP(dateCol)")) + .withColumn("stringCol", new RandomGeneratingUDF(UNIQUE_VALUES).randomString().apply()); writeData(df); } @@ -362,7 +382,8 @@ protected void cleanupFiles() throws IOException { protected void setupSpark() { SparkSession.Builder builder = SparkSession.builder() - .config("spark.sql.catalog.spark_catalog", "org.apache.iceberg.spark.SparkSessionCatalog") + .config( + "spark.sql.catalog.spark_catalog", "org.apache.iceberg.spark.SparkSessionCatalog") .config("spark.sql.catalog.spark_catalog.type", "hadoop") .config("spark.sql.catalog.spark_catalog.warehouse", getCatalogWarehouse()) .master("local[*]"); diff --git a/spark/v3.2/spark/src/jmh/java/org/apache/iceberg/spark/action/RandomGeneratingUDF.java b/spark/v3.2/spark/src/jmh/java/org/apache/iceberg/spark/action/RandomGeneratingUDF.java index cfbd9d4fb3f6..63d24f7da553 100644 --- a/spark/v3.2/spark/src/jmh/java/org/apache/iceberg/spark/action/RandomGeneratingUDF.java +++ b/spark/v3.2/spark/src/jmh/java/org/apache/iceberg/spark/action/RandomGeneratingUDF.java @@ -16,9 +16,10 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.action; +import static org.apache.spark.sql.functions.udf; + import java.io.Serializable; import java.util.Random; import org.apache.iceberg.types.Types; @@ -26,8 +27,6 @@ import org.apache.spark.sql.expressions.UserDefinedFunction; import org.apache.spark.sql.types.DataTypes; -import static org.apache.spark.sql.functions.udf; - class RandomGeneratingUDF implements Serializable { private final long uniqueValues; private Random rand = new Random(); @@ -37,11 +36,16 @@ class RandomGeneratingUDF implements Serializable { } UserDefinedFunction randomLongUDF() { - return udf(() -> rand.nextLong() % (uniqueValues / 2), DataTypes.LongType).asNondeterministic().asNonNullable(); + return udf(() -> rand.nextLong() % (uniqueValues / 2), DataTypes.LongType) + .asNondeterministic() + .asNonNullable(); } UserDefinedFunction randomString() { - return udf(() -> (String) RandomUtil.generatePrimitive(Types.StringType.get(), rand), DataTypes.StringType) - .asNondeterministic().asNonNullable(); + return udf( + () -> (String) RandomUtil.generatePrimitive(Types.StringType.get(), rand), + DataTypes.StringType) + .asNondeterministic() + .asNonNullable(); } } diff --git a/spark/v3.2/spark/src/jmh/java/org/apache/iceberg/spark/data/parquet/SparkParquetReadersFlatDataBenchmark.java b/spark/v3.2/spark/src/jmh/java/org/apache/iceberg/spark/data/parquet/SparkParquetReadersFlatDataBenchmark.java index 0394e358a52d..761ae6fa426d 100644 --- a/spark/v3.2/spark/src/jmh/java/org/apache/iceberg/spark/data/parquet/SparkParquetReadersFlatDataBenchmark.java +++ b/spark/v3.2/spark/src/jmh/java/org/apache/iceberg/spark/data/parquet/SparkParquetReadersFlatDataBenchmark.java @@ -16,9 +16,11 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.spark.data.parquet; +import static org.apache.iceberg.types.Types.NestedField.optional; +import static org.apache.iceberg.types.Types.NestedField.required; + import java.io.File; import java.io.IOException; import java.util.List; @@ -52,15 +54,11 @@ import org.openjdk.jmh.annotations.Warmup; import org.openjdk.jmh.infra.Blackhole; -import static org.apache.iceberg.types.Types.NestedField.optional; -import static org.apache.iceberg.types.Types.NestedField.required; - /** * A benchmark that evaluates the performance of reading Parquet data with a flat schema using * Iceberg and Spark Parquet readers. * - * To run this benchmark for spark-3.2: - * + *
    To run this benchmark for spark-3.2: * ./gradlew -DsparkVersions=3.2 :iceberg-spark:iceberg-spark-3.2_2.12:jmh * -PjmhIncludeRegex=SparkParquetReadersFlatDataBenchmark * -PjmhOutputPath=benchmark/spark-parquet-readers-flat-data-benchmark-result.txt @@ -73,22 +71,23 @@ @BenchmarkMode(Mode.SingleShotTime) public class SparkParquetReadersFlatDataBenchmark { - private static final DynMethods.UnboundMethod APPLY_PROJECTION = DynMethods.builder("apply") - .impl(UnsafeProjection.class, InternalRow.class) - .build(); - private static final Schema SCHEMA = new Schema( - required(1, "longCol", Types.LongType.get()), - required(2, "intCol", Types.IntegerType.get()), - required(3, "floatCol", Types.FloatType.get()), - optional(4, "doubleCol", Types.DoubleType.get()), - optional(5, "decimalCol", Types.DecimalType.of(20, 5)), - optional(6, "dateCol", Types.DateType.get()), - optional(7, "timestampCol", Types.TimestampType.withZone()), - optional(8, "stringCol", Types.StringType.get())); - private static final Schema PROJECTED_SCHEMA = new Schema( - required(1, "longCol", Types.LongType.get()), - optional(5, "decimalCol", Types.DecimalType.of(20, 5)), - optional(8, "stringCol", Types.StringType.get())); + private static final DynMethods.UnboundMethod APPLY_PROJECTION = + DynMethods.builder("apply").impl(UnsafeProjection.class, InternalRow.class).build(); + private static final Schema SCHEMA = + new Schema( + required(1, "longCol", Types.LongType.get()), + required(2, "intCol", Types.IntegerType.get()), + required(3, "floatCol", Types.FloatType.get()), + optional(4, "doubleCol", Types.DoubleType.get()), + optional(5, "decimalCol", Types.DecimalType.of(20, 5)), + optional(6, "dateCol", Types.DateType.get()), + optional(7, "timestampCol", Types.TimestampType.withZone()), + optional(8, "stringCol", Types.StringType.get())); + private static final Schema PROJECTED_SCHEMA = + new Schema( + required(1, "longCol", Types.LongType.get()), + optional(5, "decimalCol", Types.DecimalType.of(20, 5)), + optional(8, "stringCol", Types.StringType.get())); private static final int NUM_RECORDS = 10000000; private File dataFile; @@ -97,10 +96,8 @@ public void setupBenchmark() throws IOException { dataFile = File.createTempFile("parquet-flat-data-benchmark", ".parquet"); dataFile.delete(); List records = RandomData.generateList(SCHEMA, NUM_RECORDS, 0L); - try (FileAppender writer = Parquet.write(Files.localOutput(dataFile)) - .schema(SCHEMA) - .named("benchmark") - .build()) { + try (FileAppender writer = + Parquet.write(Files.localOutput(dataFile)).schema(SCHEMA).named("benchmark").build()) { writer.addAll(records); } } @@ -115,10 +112,11 @@ public void tearDownBenchmark() { @Benchmark @Threads(1) public void readUsingIcebergReader(Blackhole blackHole) throws IOException { - try (CloseableIterable rows = Parquet.read(Files.localInput(dataFile)) - .project(SCHEMA) - .createReaderFunc(type -> SparkParquetReaders.buildReader(SCHEMA, type)) - .build()) { + try (CloseableIterable rows = + Parquet.read(Files.localInput(dataFile)) + .project(SCHEMA) + .createReaderFunc(type -> SparkParquetReaders.buildReader(SCHEMA, type)) + .build()) { for (InternalRow row : rows) { blackHole.consume(row); @@ -129,14 +127,15 @@ public void readUsingIcebergReader(Blackhole blackHole) throws IOException { @Benchmark @Threads(1) public void readUsingIcebergReaderUnsafe(Blackhole blackhole) throws IOException { - try (CloseableIterable rows = Parquet.read(Files.localInput(dataFile)) - .project(SCHEMA) - .createReaderFunc(type -> 
SparkParquetReaders.buildReader(SCHEMA, type)) - .build()) { + try (CloseableIterable rows = + Parquet.read(Files.localInput(dataFile)) + .project(SCHEMA) + .createReaderFunc(type -> SparkParquetReaders.buildReader(SCHEMA, type)) + .build()) { - Iterable unsafeRows = Iterables.transform( - rows, - APPLY_PROJECTION.bind(SparkBenchmarkUtil.projection(SCHEMA, SCHEMA))::invoke); + Iterable unsafeRows = + Iterables.transform( + rows, APPLY_PROJECTION.bind(SparkBenchmarkUtil.projection(SCHEMA, SCHEMA))::invoke); for (InternalRow row : unsafeRows) { blackhole.consume(row); @@ -148,14 +147,15 @@ public void readUsingIcebergReaderUnsafe(Blackhole blackhole) throws IOException @Threads(1) public void readUsingSparkReader(Blackhole blackhole) throws IOException { StructType sparkSchema = SparkSchemaUtil.convert(SCHEMA); - try (CloseableIterable rows = Parquet.read(Files.localInput(dataFile)) - .project(SCHEMA) - .readSupport(new ParquetReadSupport()) - .set("org.apache.spark.sql.parquet.row.requested_schema", sparkSchema.json()) - .set("spark.sql.parquet.binaryAsString", "false") - .set("spark.sql.parquet.int96AsTimestamp", "false") - .callInit() - .build()) { + try (CloseableIterable rows = + Parquet.read(Files.localInput(dataFile)) + .project(SCHEMA) + .readSupport(new ParquetReadSupport()) + .set("org.apache.spark.sql.parquet.row.requested_schema", sparkSchema.json()) + .set("spark.sql.parquet.binaryAsString", "false") + .set("spark.sql.parquet.int96AsTimestamp", "false") + .callInit() + .build()) { for (InternalRow row : rows) { blackhole.consume(row); @@ -166,10 +166,11 @@ public void readUsingSparkReader(Blackhole blackhole) throws IOException { @Benchmark @Threads(1) public void readWithProjectionUsingIcebergReader(Blackhole blackhole) throws IOException { - try (CloseableIterable rows = Parquet.read(Files.localInput(dataFile)) - .project(PROJECTED_SCHEMA) - .createReaderFunc(type -> SparkParquetReaders.buildReader(PROJECTED_SCHEMA, type)) - .build()) { + try (CloseableIterable rows = + Parquet.read(Files.localInput(dataFile)) + .project(PROJECTED_SCHEMA) + .createReaderFunc(type -> SparkParquetReaders.buildReader(PROJECTED_SCHEMA, type)) + .build()) { for (InternalRow row : rows) { blackhole.consume(row); @@ -180,14 +181,18 @@ public void readWithProjectionUsingIcebergReader(Blackhole blackhole) throws IOE @Benchmark @Threads(1) public void readWithProjectionUsingIcebergReaderUnsafe(Blackhole blackhole) throws IOException { - try (CloseableIterable rows = Parquet.read(Files.localInput(dataFile)) - .project(PROJECTED_SCHEMA) - .createReaderFunc(type -> SparkParquetReaders.buildReader(PROJECTED_SCHEMA, type)) - .build()) { - - Iterable unsafeRows = Iterables.transform( - rows, - APPLY_PROJECTION.bind(SparkBenchmarkUtil.projection(PROJECTED_SCHEMA, PROJECTED_SCHEMA))::invoke); + try (CloseableIterable rows = + Parquet.read(Files.localInput(dataFile)) + .project(PROJECTED_SCHEMA) + .createReaderFunc(type -> SparkParquetReaders.buildReader(PROJECTED_SCHEMA, type)) + .build()) { + + Iterable unsafeRows = + Iterables.transform( + rows, + APPLY_PROJECTION.bind( + SparkBenchmarkUtil.projection(PROJECTED_SCHEMA, PROJECTED_SCHEMA)) + ::invoke); for (InternalRow row : unsafeRows) { blackhole.consume(row); @@ -199,14 +204,15 @@ public void readWithProjectionUsingIcebergReaderUnsafe(Blackhole blackhole) thro @Threads(1) public void readWithProjectionUsingSparkReader(Blackhole blackhole) throws IOException { StructType sparkSchema = SparkSchemaUtil.convert(PROJECTED_SCHEMA); - try (CloseableIterable rows = 
Parquet.read(Files.localInput(dataFile)) - .project(PROJECTED_SCHEMA) - .readSupport(new ParquetReadSupport()) - .set("org.apache.spark.sql.parquet.row.requested_schema", sparkSchema.json()) - .set("spark.sql.parquet.binaryAsString", "false") - .set("spark.sql.parquet.int96AsTimestamp", "false") - .callInit() - .build()) { + try (CloseableIterable rows = + Parquet.read(Files.localInput(dataFile)) + .project(PROJECTED_SCHEMA) + .readSupport(new ParquetReadSupport()) + .set("org.apache.spark.sql.parquet.row.requested_schema", sparkSchema.json()) + .set("spark.sql.parquet.binaryAsString", "false") + .set("spark.sql.parquet.int96AsTimestamp", "false") + .callInit() + .build()) { for (InternalRow row : rows) { blackhole.consume(row); diff --git a/spark/v3.2/spark/src/jmh/java/org/apache/iceberg/spark/data/parquet/SparkParquetReadersNestedDataBenchmark.java b/spark/v3.2/spark/src/jmh/java/org/apache/iceberg/spark/data/parquet/SparkParquetReadersNestedDataBenchmark.java index 0b0ebd0913a1..dc96730234e6 100644 --- a/spark/v3.2/spark/src/jmh/java/org/apache/iceberg/spark/data/parquet/SparkParquetReadersNestedDataBenchmark.java +++ b/spark/v3.2/spark/src/jmh/java/org/apache/iceberg/spark/data/parquet/SparkParquetReadersNestedDataBenchmark.java @@ -16,9 +16,11 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.data.parquet; +import static org.apache.iceberg.types.Types.NestedField.optional; +import static org.apache.iceberg.types.Types.NestedField.required; + import java.io.File; import java.io.IOException; import java.util.List; @@ -52,15 +54,11 @@ import org.openjdk.jmh.annotations.Warmup; import org.openjdk.jmh.infra.Blackhole; -import static org.apache.iceberg.types.Types.NestedField.optional; -import static org.apache.iceberg.types.Types.NestedField.required; - /** - * A benchmark that evaluates the performance of reading nested Parquet data using - * Iceberg and Spark Parquet readers. + * A benchmark that evaluates the performance of reading nested Parquet data using Iceberg and Spark + * Parquet readers. * - * To run this benchmark for spark-3.2: - * + *
    To run this benchmark for spark-3.2: * ./gradlew -DsparkVersions=3.2 :iceberg-spark:iceberg-spark-3.2_2.12:jmh * -PjmhIncludeRegex=SparkParquetReadersNestedDataBenchmark * -PjmhOutputPath=benchmark/spark-parquet-readers-nested-data-benchmark-result.txt @@ -73,22 +71,21 @@ @BenchmarkMode(Mode.SingleShotTime) public class SparkParquetReadersNestedDataBenchmark { - private static final DynMethods.UnboundMethod APPLY_PROJECTION = DynMethods.builder("apply") - .impl(UnsafeProjection.class, InternalRow.class) - .build(); - private static final Schema SCHEMA = new Schema( - required(0, "id", Types.LongType.get()), - optional(4, "nested", Types.StructType.of( - required(1, "col1", Types.StringType.get()), - required(2, "col2", Types.DoubleType.get()), - required(3, "col3", Types.LongType.get()) - )) - ); - private static final Schema PROJECTED_SCHEMA = new Schema( - optional(4, "nested", Types.StructType.of( - required(1, "col1", Types.StringType.get()) - )) - ); + private static final DynMethods.UnboundMethod APPLY_PROJECTION = + DynMethods.builder("apply").impl(UnsafeProjection.class, InternalRow.class).build(); + private static final Schema SCHEMA = + new Schema( + required(0, "id", Types.LongType.get()), + optional( + 4, + "nested", + Types.StructType.of( + required(1, "col1", Types.StringType.get()), + required(2, "col2", Types.DoubleType.get()), + required(3, "col3", Types.LongType.get())))); + private static final Schema PROJECTED_SCHEMA = + new Schema( + optional(4, "nested", Types.StructType.of(required(1, "col1", Types.StringType.get())))); private static final int NUM_RECORDS = 10000000; private File dataFile; @@ -97,10 +94,8 @@ public void setupBenchmark() throws IOException { dataFile = File.createTempFile("parquet-nested-data-benchmark", ".parquet"); dataFile.delete(); List records = RandomData.generateList(SCHEMA, NUM_RECORDS, 0L); - try (FileAppender writer = Parquet.write(Files.localOutput(dataFile)) - .schema(SCHEMA) - .named("benchmark") - .build()) { + try (FileAppender writer = + Parquet.write(Files.localOutput(dataFile)).schema(SCHEMA).named("benchmark").build()) { writer.addAll(records); } } @@ -115,10 +110,11 @@ public void tearDownBenchmark() { @Benchmark @Threads(1) public void readUsingIcebergReader(Blackhole blackhole) throws IOException { - try (CloseableIterable rows = Parquet.read(Files.localInput(dataFile)) - .project(SCHEMA) - .createReaderFunc(type -> SparkParquetReaders.buildReader(SCHEMA, type)) - .build()) { + try (CloseableIterable rows = + Parquet.read(Files.localInput(dataFile)) + .project(SCHEMA) + .createReaderFunc(type -> SparkParquetReaders.buildReader(SCHEMA, type)) + .build()) { for (InternalRow row : rows) { blackhole.consume(row); @@ -129,14 +125,15 @@ public void readUsingIcebergReader(Blackhole blackhole) throws IOException { @Benchmark @Threads(1) public void readUsingIcebergReaderUnsafe(Blackhole blackhole) throws IOException { - try (CloseableIterable rows = Parquet.read(Files.localInput(dataFile)) - .project(SCHEMA) - .createReaderFunc(type -> SparkParquetReaders.buildReader(SCHEMA, type)) - .build()) { + try (CloseableIterable rows = + Parquet.read(Files.localInput(dataFile)) + .project(SCHEMA) + .createReaderFunc(type -> SparkParquetReaders.buildReader(SCHEMA, type)) + .build()) { - Iterable unsafeRows = Iterables.transform( - rows, - APPLY_PROJECTION.bind(SparkBenchmarkUtil.projection(SCHEMA, SCHEMA))::invoke); + Iterable unsafeRows = + Iterables.transform( + rows, APPLY_PROJECTION.bind(SparkBenchmarkUtil.projection(SCHEMA, 
SCHEMA))::invoke); for (InternalRow row : unsafeRows) { blackhole.consume(row); @@ -148,14 +145,15 @@ public void readUsingIcebergReaderUnsafe(Blackhole blackhole) throws IOException @Threads(1) public void readUsingSparkReader(Blackhole blackhole) throws IOException { StructType sparkSchema = SparkSchemaUtil.convert(SCHEMA); - try (CloseableIterable rows = Parquet.read(Files.localInput(dataFile)) - .project(SCHEMA) - .readSupport(new ParquetReadSupport()) - .set("org.apache.spark.sql.parquet.row.requested_schema", sparkSchema.json()) - .set("spark.sql.parquet.binaryAsString", "false") - .set("spark.sql.parquet.int96AsTimestamp", "false") - .callInit() - .build()) { + try (CloseableIterable rows = + Parquet.read(Files.localInput(dataFile)) + .project(SCHEMA) + .readSupport(new ParquetReadSupport()) + .set("org.apache.spark.sql.parquet.row.requested_schema", sparkSchema.json()) + .set("spark.sql.parquet.binaryAsString", "false") + .set("spark.sql.parquet.int96AsTimestamp", "false") + .callInit() + .build()) { for (InternalRow row : rows) { blackhole.consume(row); @@ -166,10 +164,11 @@ public void readUsingSparkReader(Blackhole blackhole) throws IOException { @Benchmark @Threads(1) public void readWithProjectionUsingIcebergReader(Blackhole blackhole) throws IOException { - try (CloseableIterable rows = Parquet.read(Files.localInput(dataFile)) - .project(PROJECTED_SCHEMA) - .createReaderFunc(type -> SparkParquetReaders.buildReader(PROJECTED_SCHEMA, type)) - .build()) { + try (CloseableIterable rows = + Parquet.read(Files.localInput(dataFile)) + .project(PROJECTED_SCHEMA) + .createReaderFunc(type -> SparkParquetReaders.buildReader(PROJECTED_SCHEMA, type)) + .build()) { for (InternalRow row : rows) { blackhole.consume(row); @@ -180,14 +179,18 @@ public void readWithProjectionUsingIcebergReader(Blackhole blackhole) throws IOE @Benchmark @Threads(1) public void readWithProjectionUsingIcebergReaderUnsafe(Blackhole blackhole) throws IOException { - try (CloseableIterable rows = Parquet.read(Files.localInput(dataFile)) - .project(PROJECTED_SCHEMA) - .createReaderFunc(type -> SparkParquetReaders.buildReader(PROJECTED_SCHEMA, type)) - .build()) { - - Iterable unsafeRows = Iterables.transform( - rows, - APPLY_PROJECTION.bind(SparkBenchmarkUtil.projection(PROJECTED_SCHEMA, PROJECTED_SCHEMA))::invoke); + try (CloseableIterable rows = + Parquet.read(Files.localInput(dataFile)) + .project(PROJECTED_SCHEMA) + .createReaderFunc(type -> SparkParquetReaders.buildReader(PROJECTED_SCHEMA, type)) + .build()) { + + Iterable unsafeRows = + Iterables.transform( + rows, + APPLY_PROJECTION.bind( + SparkBenchmarkUtil.projection(PROJECTED_SCHEMA, PROJECTED_SCHEMA)) + ::invoke); for (InternalRow row : unsafeRows) { blackhole.consume(row); @@ -199,14 +202,15 @@ public void readWithProjectionUsingIcebergReaderUnsafe(Blackhole blackhole) thro @Threads(1) public void readWithProjectionUsingSparkReader(Blackhole blackhole) throws IOException { StructType sparkSchema = SparkSchemaUtil.convert(PROJECTED_SCHEMA); - try (CloseableIterable rows = Parquet.read(Files.localInput(dataFile)) - .project(PROJECTED_SCHEMA) - .readSupport(new ParquetReadSupport()) - .set("org.apache.spark.sql.parquet.row.requested_schema", sparkSchema.json()) - .set("spark.sql.parquet.binaryAsString", "false") - .set("spark.sql.parquet.int96AsTimestamp", "false") - .callInit() - .build()) { + try (CloseableIterable rows = + Parquet.read(Files.localInput(dataFile)) + .project(PROJECTED_SCHEMA) + .readSupport(new ParquetReadSupport()) + 
.set("org.apache.spark.sql.parquet.row.requested_schema", sparkSchema.json()) + .set("spark.sql.parquet.binaryAsString", "false") + .set("spark.sql.parquet.int96AsTimestamp", "false") + .callInit() + .build()) { for (InternalRow row : rows) { blackhole.consume(row); diff --git a/spark/v3.2/spark/src/jmh/java/org/apache/iceberg/spark/data/parquet/SparkParquetWritersFlatDataBenchmark.java b/spark/v3.2/spark/src/jmh/java/org/apache/iceberg/spark/data/parquet/SparkParquetWritersFlatDataBenchmark.java index 555907caaf75..59ee377cb32f 100644 --- a/spark/v3.2/spark/src/jmh/java/org/apache/iceberg/spark/data/parquet/SparkParquetWritersFlatDataBenchmark.java +++ b/spark/v3.2/spark/src/jmh/java/org/apache/iceberg/spark/data/parquet/SparkParquetWritersFlatDataBenchmark.java @@ -16,9 +16,11 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.data.parquet; +import static org.apache.iceberg.types.Types.NestedField.optional; +import static org.apache.iceberg.types.Types.NestedField.required; + import java.io.File; import java.io.IOException; import org.apache.iceberg.Files; @@ -45,15 +47,11 @@ import org.openjdk.jmh.annotations.Threads; import org.openjdk.jmh.annotations.Warmup; -import static org.apache.iceberg.types.Types.NestedField.optional; -import static org.apache.iceberg.types.Types.NestedField.required; - /** * A benchmark that evaluates the performance of writing Parquet data with a flat schema using * Iceberg and Spark Parquet writers. * - * To run this benchmark for spark-3.2: - * + *
    To run this benchmark for spark-3.2: * ./gradlew -DsparkVersions=3.2 :iceberg-spark:iceberg-spark-3.2_2.12:jmh * -PjmhIncludeRegex=SparkParquetWritersFlatDataBenchmark * -PjmhOutputPath=benchmark/spark-parquet-writers-flat-data-benchmark-result.txt @@ -66,15 +64,16 @@ @BenchmarkMode(Mode.SingleShotTime) public class SparkParquetWritersFlatDataBenchmark { - private static final Schema SCHEMA = new Schema( - required(1, "longCol", Types.LongType.get()), - required(2, "intCol", Types.IntegerType.get()), - required(3, "floatCol", Types.FloatType.get()), - optional(4, "doubleCol", Types.DoubleType.get()), - optional(5, "decimalCol", Types.DecimalType.of(20, 5)), - optional(6, "dateCol", Types.DateType.get()), - optional(7, "timestampCol", Types.TimestampType.withZone()), - optional(8, "stringCol", Types.StringType.get())); + private static final Schema SCHEMA = + new Schema( + required(1, "longCol", Types.LongType.get()), + required(2, "intCol", Types.IntegerType.get()), + required(3, "floatCol", Types.FloatType.get()), + optional(4, "doubleCol", Types.DoubleType.get()), + optional(5, "decimalCol", Types.DecimalType.of(20, 5)), + optional(6, "dateCol", Types.DateType.get()), + optional(7, "timestampCol", Types.TimestampType.withZone()), + optional(8, "stringCol", Types.StringType.get())); private static final int NUM_RECORDS = 1000000; private Iterable rows; private File dataFile; @@ -96,10 +95,13 @@ public void tearDownBenchmark() { @Benchmark @Threads(1) public void writeUsingIcebergWriter() throws IOException { - try (FileAppender writer = Parquet.write(Files.localOutput(dataFile)) - .createWriterFunc(msgType -> SparkParquetWriters.buildWriter(SparkSchemaUtil.convert(SCHEMA), msgType)) - .schema(SCHEMA) - .build()) { + try (FileAppender writer = + Parquet.write(Files.localOutput(dataFile)) + .createWriterFunc( + msgType -> + SparkParquetWriters.buildWriter(SparkSchemaUtil.convert(SCHEMA), msgType)) + .schema(SCHEMA) + .build()) { writer.addAll(rows); } @@ -109,15 +111,16 @@ public void writeUsingIcebergWriter() throws IOException { @Threads(1) public void writeUsingSparkWriter() throws IOException { StructType sparkSchema = SparkSchemaUtil.convert(SCHEMA); - try (FileAppender writer = Parquet.write(Files.localOutput(dataFile)) - .writeSupport(new ParquetWriteSupport()) - .set("org.apache.spark.sql.parquet.row.attributes", sparkSchema.json()) - .set("spark.sql.parquet.writeLegacyFormat", "false") - .set("spark.sql.parquet.binaryAsString", "false") - .set("spark.sql.parquet.int96AsTimestamp", "false") - .set("spark.sql.parquet.outputTimestampType", "TIMESTAMP_MICROS") - .schema(SCHEMA) - .build()) { + try (FileAppender writer = + Parquet.write(Files.localOutput(dataFile)) + .writeSupport(new ParquetWriteSupport()) + .set("org.apache.spark.sql.parquet.row.attributes", sparkSchema.json()) + .set("spark.sql.parquet.writeLegacyFormat", "false") + .set("spark.sql.parquet.binaryAsString", "false") + .set("spark.sql.parquet.int96AsTimestamp", "false") + .set("spark.sql.parquet.outputTimestampType", "TIMESTAMP_MICROS") + .schema(SCHEMA) + .build()) { writer.addAll(rows); } diff --git a/spark/v3.2/spark/src/jmh/java/org/apache/iceberg/spark/data/parquet/SparkParquetWritersNestedDataBenchmark.java b/spark/v3.2/spark/src/jmh/java/org/apache/iceberg/spark/data/parquet/SparkParquetWritersNestedDataBenchmark.java index 4718eb66dbf1..65d50c80ca2d 100644 --- a/spark/v3.2/spark/src/jmh/java/org/apache/iceberg/spark/data/parquet/SparkParquetWritersNestedDataBenchmark.java +++ 
b/spark/v3.2/spark/src/jmh/java/org/apache/iceberg/spark/data/parquet/SparkParquetWritersNestedDataBenchmark.java @@ -16,9 +16,11 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.data.parquet; +import static org.apache.iceberg.types.Types.NestedField.optional; +import static org.apache.iceberg.types.Types.NestedField.required; + import java.io.File; import java.io.IOException; import org.apache.iceberg.Files; @@ -45,15 +47,11 @@ import org.openjdk.jmh.annotations.Threads; import org.openjdk.jmh.annotations.Warmup; -import static org.apache.iceberg.types.Types.NestedField.optional; -import static org.apache.iceberg.types.Types.NestedField.required; - /** - * A benchmark that evaluates the performance of writing nested Parquet data using - * Iceberg and Spark Parquet writers. + * A benchmark that evaluates the performance of writing nested Parquet data using Iceberg and Spark + * Parquet writers. * - * To run this benchmark for spark-3.2: - * + *
    To run this benchmark for spark-3.2: * ./gradlew -DsparkVersions=3.2 :iceberg-spark:iceberg-spark-3.2_2.12:jmh * -PjmhIncludeRegex=SparkParquetWritersNestedDataBenchmark * -PjmhOutputPath=benchmark/spark-parquet-writers-nested-data-benchmark-result.txt @@ -66,14 +64,16 @@ @BenchmarkMode(Mode.SingleShotTime) public class SparkParquetWritersNestedDataBenchmark { - private static final Schema SCHEMA = new Schema( - required(0, "id", Types.LongType.get()), - optional(4, "nested", Types.StructType.of( - required(1, "col1", Types.StringType.get()), - required(2, "col2", Types.DoubleType.get()), - required(3, "col3", Types.LongType.get()) - )) - ); + private static final Schema SCHEMA = + new Schema( + required(0, "id", Types.LongType.get()), + optional( + 4, + "nested", + Types.StructType.of( + required(1, "col1", Types.StringType.get()), + required(2, "col2", Types.DoubleType.get()), + required(3, "col3", Types.LongType.get())))); private static final int NUM_RECORDS = 1000000; private Iterable rows; private File dataFile; @@ -95,10 +95,13 @@ public void tearDownBenchmark() { @Benchmark @Threads(1) public void writeUsingIcebergWriter() throws IOException { - try (FileAppender writer = Parquet.write(Files.localOutput(dataFile)) - .createWriterFunc(msgType -> SparkParquetWriters.buildWriter(SparkSchemaUtil.convert(SCHEMA), msgType)) - .schema(SCHEMA) - .build()) { + try (FileAppender writer = + Parquet.write(Files.localOutput(dataFile)) + .createWriterFunc( + msgType -> + SparkParquetWriters.buildWriter(SparkSchemaUtil.convert(SCHEMA), msgType)) + .schema(SCHEMA) + .build()) { writer.addAll(rows); } @@ -108,15 +111,16 @@ public void writeUsingIcebergWriter() throws IOException { @Threads(1) public void writeUsingSparkWriter() throws IOException { StructType sparkSchema = SparkSchemaUtil.convert(SCHEMA); - try (FileAppender writer = Parquet.write(Files.localOutput(dataFile)) - .writeSupport(new ParquetWriteSupport()) - .set("org.apache.spark.sql.parquet.row.attributes", sparkSchema.json()) - .set("spark.sql.parquet.writeLegacyFormat", "false") - .set("spark.sql.parquet.binaryAsString", "false") - .set("spark.sql.parquet.int96AsTimestamp", "false") - .set("spark.sql.parquet.outputTimestampType", "TIMESTAMP_MICROS") - .schema(SCHEMA) - .build()) { + try (FileAppender writer = + Parquet.write(Files.localOutput(dataFile)) + .writeSupport(new ParquetWriteSupport()) + .set("org.apache.spark.sql.parquet.row.attributes", sparkSchema.json()) + .set("spark.sql.parquet.writeLegacyFormat", "false") + .set("spark.sql.parquet.binaryAsString", "false") + .set("spark.sql.parquet.int96AsTimestamp", "false") + .set("spark.sql.parquet.outputTimestampType", "TIMESTAMP_MICROS") + .schema(SCHEMA) + .build()) { writer.addAll(rows); } diff --git a/spark/v3.2/spark/src/jmh/java/org/apache/iceberg/spark/source/Action.java b/spark/v3.2/spark/src/jmh/java/org/apache/iceberg/spark/source/Action.java index 1820a801b2fb..0dbf07285060 100644 --- a/spark/v3.2/spark/src/jmh/java/org/apache/iceberg/spark/source/Action.java +++ b/spark/v3.2/spark/src/jmh/java/org/apache/iceberg/spark/source/Action.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.spark.source; @FunctionalInterface diff --git a/spark/v3.2/spark/src/jmh/java/org/apache/iceberg/spark/source/IcebergSourceBenchmark.java b/spark/v3.2/spark/src/jmh/java/org/apache/iceberg/spark/source/IcebergSourceBenchmark.java index 60dde6e98a16..68c537e34a4a 100644 --- a/spark/v3.2/spark/src/jmh/java/org/apache/iceberg/spark/source/IcebergSourceBenchmark.java +++ b/spark/v3.2/spark/src/jmh/java/org/apache/iceberg/spark/source/IcebergSourceBenchmark.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.source; import java.io.IOException; @@ -81,7 +80,8 @@ protected String newTableLocation() { protected String dataLocation() { Map properties = table.properties(); - return properties.getOrDefault(TableProperties.WRITE_DATA_LOCATION, String.format("%s/data", table.location())); + return properties.getOrDefault( + TableProperties.WRITE_DATA_LOCATION, String.format("%s/data", table.location())); } protected void cleanupFiles() throws IOException { @@ -94,12 +94,12 @@ protected void cleanupFiles() throws IOException { } protected void setupSpark(boolean enableDictionaryEncoding) { - SparkSession.Builder builder = SparkSession.builder() - .config("spark.ui.enabled", false); + SparkSession.Builder builder = SparkSession.builder().config("spark.ui.enabled", false); if (!enableDictionaryEncoding) { - builder.config("parquet.dictionary.page.size", "1") - .config("parquet.enable.dictionary", false) - .config(TableProperties.PARQUET_DICT_SIZE_BYTES, "1"); + builder + .config("parquet.dictionary.page.size", "1") + .config("parquet.enable.dictionary", false) + .config(TableProperties.PARQUET_DICT_SIZE_BYTES, "1"); } builder.master("local"); spark = builder.getOrCreate(); @@ -116,7 +116,7 @@ protected void tearDownSpark() { } protected void materialize(Dataset ds) { - ds.queryExecution().toRdd().toJavaRDD().foreach(record -> { }); + ds.queryExecution().toRdd().toJavaRDD().foreach(record -> {}); } protected void materialize(Dataset ds, Blackhole blackhole) { @@ -126,7 +126,8 @@ protected void materialize(Dataset ds, Blackhole blackhole) { protected void appendAsFile(Dataset ds) { // ensure the schema is precise (including nullability) StructType sparkSchema = SparkSchemaUtil.convert(table.schema()); - spark.createDataFrame(ds.rdd(), sparkSchema) + spark + .createDataFrame(ds.rdd(), sparkSchema) .coalesce(1) .write() .format("iceberg") @@ -138,42 +139,49 @@ protected void withSQLConf(Map conf, Action action) { SQLConf sqlConf = SQLConf.get(); Map currentConfValues = Maps.newHashMap(); - conf.keySet().forEach(confKey -> { - if (sqlConf.contains(confKey)) { - String currentConfValue = sqlConf.getConfString(confKey); - currentConfValues.put(confKey, currentConfValue); - } - }); - - conf.forEach((confKey, confValue) -> { - if (SQLConf.isStaticConfigKey(confKey)) { - throw new RuntimeException("Cannot modify the value of a static config: " + confKey); - } - sqlConf.setConfString(confKey, confValue); - }); + conf.keySet() + .forEach( + confKey -> { + if (sqlConf.contains(confKey)) { + String currentConfValue = sqlConf.getConfString(confKey); + currentConfValues.put(confKey, currentConfValue); + } + }); + + conf.forEach( + (confKey, confValue) -> { + if (SQLConf.isStaticConfigKey(confKey)) { + throw new RuntimeException("Cannot modify the value of a static config: " + confKey); + } + sqlConf.setConfString(confKey, confValue); + }); try { action.invoke(); } finally { - conf.forEach((confKey, 
confValue) -> { - if (currentConfValues.containsKey(confKey)) { - sqlConf.setConfString(confKey, currentConfValues.get(confKey)); - } else { - sqlConf.unsetConf(confKey); - } - }); + conf.forEach( + (confKey, confValue) -> { + if (currentConfValues.containsKey(confKey)) { + sqlConf.setConfString(confKey, currentConfValues.get(confKey)); + } else { + sqlConf.unsetConf(confKey); + } + }); } } protected void withTableProperties(Map props, Action action) { Map tableProps = table.properties(); Map currentPropValues = Maps.newHashMap(); - props.keySet().forEach(propKey -> { - if (tableProps.containsKey(propKey)) { - String currentPropValue = tableProps.get(propKey); - currentPropValues.put(propKey, currentPropValue); - } - }); + props + .keySet() + .forEach( + propKey -> { + if (tableProps.containsKey(propKey)) { + String currentPropValue = tableProps.get(propKey); + currentPropValues.put(propKey, currentPropValue); + } + }); UpdateProperties updateProperties = table.updateProperties(); props.forEach(updateProperties::set); @@ -183,13 +191,14 @@ protected void withTableProperties(Map props, Action action) { action.invoke(); } finally { UpdateProperties restoreProperties = table.updateProperties(); - props.forEach((propKey, propValue) -> { - if (currentPropValues.containsKey(propKey)) { - restoreProperties.set(propKey, currentPropValues.get(propKey)); - } else { - restoreProperties.remove(propKey); - } - }); + props.forEach( + (propKey, propValue) -> { + if (currentPropValues.containsKey(propKey)) { + restoreProperties.set(propKey, currentPropValues.get(propKey)); + } else { + restoreProperties.remove(propKey); + } + }); restoreProperties.commit(); } } diff --git a/spark/v3.2/spark/src/jmh/java/org/apache/iceberg/spark/source/IcebergSourceDeleteBenchmark.java b/spark/v3.2/spark/src/jmh/java/org/apache/iceberg/spark/source/IcebergSourceDeleteBenchmark.java index 5db431eaa50c..e42707bf102b 100644 --- a/spark/v3.2/spark/src/jmh/java/org/apache/iceberg/spark/source/IcebergSourceDeleteBenchmark.java +++ b/spark/v3.2/spark/src/jmh/java/org/apache/iceberg/spark/source/IcebergSourceDeleteBenchmark.java @@ -16,9 +16,16 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.spark.source; +import static org.apache.iceberg.TableProperties.PARQUET_VECTORIZATION_ENABLED; +import static org.apache.iceberg.TableProperties.SPLIT_OPEN_FILE_COST; +import static org.apache.iceberg.types.Types.NestedField.optional; +import static org.apache.iceberg.types.Types.NestedField.required; +import static org.apache.spark.sql.functions.current_date; +import static org.apache.spark.sql.functions.date_add; +import static org.apache.spark.sql.functions.expr; + import java.io.IOException; import java.util.List; import java.util.Map; @@ -53,14 +60,6 @@ import org.slf4j.Logger; import org.slf4j.LoggerFactory; -import static org.apache.iceberg.TableProperties.PARQUET_VECTORIZATION_ENABLED; -import static org.apache.iceberg.TableProperties.SPLIT_OPEN_FILE_COST; -import static org.apache.iceberg.types.Types.NestedField.optional; -import static org.apache.iceberg.types.Types.NestedField.required; -import static org.apache.spark.sql.functions.current_date; -import static org.apache.spark.sql.functions.date_add; -import static org.apache.spark.sql.functions.expr; - public abstract class IcebergSourceDeleteBenchmark extends IcebergSourceBenchmark { private static final Logger LOG = LoggerFactory.getLogger(IcebergSourceDeleteBenchmark.class); private static final long TARGET_FILE_SIZE_IN_BYTES = 512L * 1024 * 1024; @@ -86,11 +85,13 @@ public void readIceberg(Blackhole blackhole) { Map tableProperties = Maps.newHashMap(); tableProperties.put(SPLIT_OPEN_FILE_COST, Integer.toString(128 * 1024 * 1024)); tableProperties.put(PARQUET_VECTORIZATION_ENABLED, "false"); - withTableProperties(tableProperties, () -> { - String tableLocation = table().location(); - Dataset df = spark().read().format("iceberg").load(tableLocation); - materialize(df, blackhole); - }); + withTableProperties( + tableProperties, + () -> { + String tableLocation = table().location(); + Dataset df = spark().read().format("iceberg").load(tableLocation); + materialize(df, blackhole); + }); } @Benchmark @@ -99,11 +100,14 @@ public void readIcebergWithIsDeletedColumn(Blackhole blackhole) { Map tableProperties = Maps.newHashMap(); tableProperties.put(SPLIT_OPEN_FILE_COST, Integer.toString(128 * 1024 * 1024)); tableProperties.put(PARQUET_VECTORIZATION_ENABLED, "false"); - withTableProperties(tableProperties, () -> { - String tableLocation = table().location(); - Dataset df = spark().read().format("iceberg").load(tableLocation).filter("_deleted = false"); - materialize(df, blackhole); - }); + withTableProperties( + tableProperties, + () -> { + String tableLocation = table().location(); + Dataset df = + spark().read().format("iceberg").load(tableLocation).filter("_deleted = false"); + materialize(df, blackhole); + }); } @Benchmark @@ -112,11 +116,14 @@ public void readDeletedRows(Blackhole blackhole) { Map tableProperties = Maps.newHashMap(); tableProperties.put(SPLIT_OPEN_FILE_COST, Integer.toString(128 * 1024 * 1024)); tableProperties.put(PARQUET_VECTORIZATION_ENABLED, "false"); - withTableProperties(tableProperties, () -> { - String tableLocation = table().location(); - Dataset df = spark().read().format("iceberg").load(tableLocation).filter("_deleted = true"); - materialize(df, blackhole); - }); + withTableProperties( + tableProperties, + () -> { + String tableLocation = table().location(); + Dataset df = + spark().read().format("iceberg").load(tableLocation).filter("_deleted = true"); + materialize(df, blackhole); + }); } @Benchmark @@ -125,11 +132,13 @@ public void readIcebergVectorized(Blackhole 
blackhole) { Map tableProperties = Maps.newHashMap(); tableProperties.put(SPLIT_OPEN_FILE_COST, Integer.toString(128 * 1024 * 1024)); tableProperties.put(PARQUET_VECTORIZATION_ENABLED, "true"); - withTableProperties(tableProperties, () -> { - String tableLocation = table().location(); - Dataset df = spark().read().format("iceberg").load(tableLocation); - materialize(df, blackhole); - }); + withTableProperties( + tableProperties, + () -> { + String tableLocation = table().location(); + Dataset df = spark().read().format("iceberg").load(tableLocation); + materialize(df, blackhole); + }); } @Benchmark @@ -138,11 +147,14 @@ public void readIcebergWithIsDeletedColumnVectorized(Blackhole blackhole) { Map tableProperties = Maps.newHashMap(); tableProperties.put(SPLIT_OPEN_FILE_COST, Integer.toString(128 * 1024 * 1024)); tableProperties.put(PARQUET_VECTORIZATION_ENABLED, "true"); - withTableProperties(tableProperties, () -> { - String tableLocation = table().location(); - Dataset df = spark().read().format("iceberg").load(tableLocation).filter("_deleted = false"); - materialize(df, blackhole); - }); + withTableProperties( + tableProperties, + () -> { + String tableLocation = table().location(); + Dataset df = + spark().read().format("iceberg").load(tableLocation).filter("_deleted = false"); + materialize(df, blackhole); + }); } @Benchmark @@ -151,37 +163,43 @@ public void readDeletedRowsVectorized(Blackhole blackhole) { Map tableProperties = Maps.newHashMap(); tableProperties.put(SPLIT_OPEN_FILE_COST, Integer.toString(128 * 1024 * 1024)); tableProperties.put(PARQUET_VECTORIZATION_ENABLED, "true"); - withTableProperties(tableProperties, () -> { - String tableLocation = table().location(); - Dataset df = spark().read().format("iceberg").load(tableLocation).filter("_deleted = true"); - materialize(df, blackhole); - }); + withTableProperties( + tableProperties, + () -> { + String tableLocation = table().location(); + Dataset df = + spark().read().format("iceberg").load(tableLocation).filter("_deleted = true"); + materialize(df, blackhole); + }); } protected abstract void appendData() throws IOException; protected void writeData(int fileNum) { - Dataset df = spark().range(NUM_ROWS) - .withColumnRenamed("id", "longCol") - .withColumn("intCol", expr("CAST(MOD(longCol, 2147483647) AS INT)")) - .withColumn("floatCol", expr("CAST(longCol AS FLOAT)")) - .withColumn("doubleCol", expr("CAST(longCol AS DOUBLE)")) - .withColumn("dateCol", date_add(current_date(), fileNum)) - .withColumn("timestampCol", expr("TO_TIMESTAMP(dateCol)")) - .withColumn("stringCol", expr("CAST(dateCol AS STRING)")); + Dataset df = + spark() + .range(NUM_ROWS) + .withColumnRenamed("id", "longCol") + .withColumn("intCol", expr("CAST(MOD(longCol, 2147483647) AS INT)")) + .withColumn("floatCol", expr("CAST(longCol AS FLOAT)")) + .withColumn("doubleCol", expr("CAST(longCol AS DOUBLE)")) + .withColumn("dateCol", date_add(current_date(), fileNum)) + .withColumn("timestampCol", expr("TO_TIMESTAMP(dateCol)")) + .withColumn("stringCol", expr("CAST(dateCol AS STRING)")); appendAsFile(df); } @Override protected Table initTable() { - Schema schema = new Schema( - required(1, "longCol", Types.LongType.get()), - required(2, "intCol", Types.IntegerType.get()), - required(3, "floatCol", Types.FloatType.get()), - optional(4, "doubleCol", Types.DoubleType.get()), - optional(6, "dateCol", Types.DateType.get()), - optional(7, "timestampCol", Types.TimestampType.withZone()), - optional(8, "stringCol", Types.StringType.get())); + Schema schema = + new 
Schema( + required(1, "longCol", Types.LongType.get()), + required(2, "intCol", Types.IntegerType.get()), + required(3, "floatCol", Types.FloatType.get()), + optional(4, "doubleCol", Types.DoubleType.get()), + optional(6, "dateCol", Types.DateType.get()), + optional(7, "timestampCol", Types.TimestampType.withZone()), + optional(8, "stringCol", Types.StringType.get())); PartitionSpec partitionSpec = PartitionSpec.unpartitioned(); HadoopTables tables = new HadoopTables(hadoopConf()); Map properties = Maps.newHashMap(); @@ -195,17 +213,19 @@ protected Configuration initHadoopConf() { return new Configuration(); } - protected void writePosDeletes(CharSequence path, long numRows, double percentage) throws IOException { + protected void writePosDeletes(CharSequence path, long numRows, double percentage) + throws IOException { writePosDeletes(path, numRows, percentage, 1); } - protected void writePosDeletes(CharSequence path, long numRows, double percentage, - int numDeleteFile) throws IOException { + protected void writePosDeletes( + CharSequence path, long numRows, double percentage, int numDeleteFile) throws IOException { writePosDeletesWithNoise(path, numRows, percentage, 0, numDeleteFile); } - protected void writePosDeletesWithNoise(CharSequence path, long numRows, double percentage, int numNoise, - int numDeleteFile) throws IOException { + protected void writePosDeletesWithNoise( + CharSequence path, long numRows, double percentage, int numNoise, int numDeleteFile) + throws IOException { Set deletedPos = Sets.newHashSet(); while (deletedPos.size() < numRows * percentage) { deletedPos.add(ThreadLocalRandom.current().nextLong(numRows)); @@ -219,14 +239,15 @@ protected void writePosDeletesWithNoise(CharSequence path, long numRows, double } } - protected void writePosDeletes(CharSequence path, List deletedPos, int numNoise) throws IOException { + protected void writePosDeletes(CharSequence path, List deletedPos, int numNoise) + throws IOException { OutputFileFactory fileFactory = newFileFactory(); - SparkFileWriterFactory writerFactory = SparkFileWriterFactory.builderFor(table()) - .dataFileFormat(fileFormat()) - .build(); + SparkFileWriterFactory writerFactory = + SparkFileWriterFactory.builderFor(table()).dataFileFormat(fileFormat()).build(); - ClusteredPositionDeleteWriter writer = new ClusteredPositionDeleteWriter<>( - writerFactory, fileFactory, table().io(), TARGET_FILE_SIZE_IN_BYTES); + ClusteredPositionDeleteWriter writer = + new ClusteredPositionDeleteWriter<>( + writerFactory, fileFactory, table().io(), TARGET_FILE_SIZE_IN_BYTES); PartitionSpec unpartitionedSpec = table().specs().get(0); @@ -274,15 +295,16 @@ private void writeEqDeletes(List rows) throws IOException { int equalityFieldId = table().schema().findField("longCol").fieldId(); OutputFileFactory fileFactory = newFileFactory(); - SparkFileWriterFactory writerFactory = SparkFileWriterFactory - .builderFor(table()) - .dataFileFormat(fileFormat()) - .equalityDeleteRowSchema(table().schema()) - .equalityFieldIds(new int[]{equalityFieldId}) - .build(); + SparkFileWriterFactory writerFactory = + SparkFileWriterFactory.builderFor(table()) + .dataFileFormat(fileFormat()) + .equalityDeleteRowSchema(table().schema()) + .equalityFieldIds(new int[] {equalityFieldId}) + .build(); - ClusteredEqualityDeleteWriter writer = new ClusteredEqualityDeleteWriter<>( - writerFactory, fileFactory, table().io(), TARGET_FILE_SIZE_IN_BYTES); + ClusteredEqualityDeleteWriter writer = + new ClusteredEqualityDeleteWriter<>( + writerFactory, fileFactory, 
table().io(), TARGET_FILE_SIZE_IN_BYTES); PartitionSpec unpartitionedSpec = table().specs().get(0); try (ClusteredEqualityDeleteWriter closeableWriter = writer) { @@ -298,14 +320,14 @@ private void writeEqDeletes(List rows) throws IOException { } private OutputFileFactory newFileFactory() { - return OutputFileFactory.builderFor(table(), 1, 1) - .format(fileFormat()) - .build(); + return OutputFileFactory.builderFor(table(), 1, 1).format(fileFormat()).build(); } private CharSequence noisePath(CharSequence path) { - // assume the data file name would be something like "00000-0-30da64e0-56b5-4743-a11b-3188a1695bf7-00001.parquet" - // so the dataFileSuffixLen is the UUID string length + length of "-00001.parquet", which is 36 + 14 = 60. It's OK + // assume the data file name would be something like + // "00000-0-30da64e0-56b5-4743-a11b-3188a1695bf7-00001.parquet" + // so the dataFileSuffixLen is the UUID string length + length of "-00001.parquet", which is 36 + // + 14 = 60. It's OK // to be not accurate here. int dataFileSuffixLen = 60; UUID uuid = UUID.randomUUID(); diff --git a/spark/v3.2/spark/src/jmh/java/org/apache/iceberg/spark/source/IcebergSourceFlatDataBenchmark.java b/spark/v3.2/spark/src/jmh/java/org/apache/iceberg/spark/source/IcebergSourceFlatDataBenchmark.java index 9e206321a540..59e6230350d9 100644 --- a/spark/v3.2/spark/src/jmh/java/org/apache/iceberg/spark/source/IcebergSourceFlatDataBenchmark.java +++ b/spark/v3.2/spark/src/jmh/java/org/apache/iceberg/spark/source/IcebergSourceFlatDataBenchmark.java @@ -16,9 +16,11 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.source; +import static org.apache.iceberg.types.Types.NestedField.optional; +import static org.apache.iceberg.types.Types.NestedField.required; + import java.util.Map; import org.apache.hadoop.conf.Configuration; import org.apache.iceberg.PartitionSpec; @@ -29,9 +31,6 @@ import org.apache.iceberg.relocated.com.google.common.collect.Maps; import org.apache.iceberg.types.Types; -import static org.apache.iceberg.types.Types.NestedField.optional; -import static org.apache.iceberg.types.Types.NestedField.required; - public abstract class IcebergSourceFlatDataBenchmark extends IcebergSourceBenchmark { @Override @@ -41,15 +40,16 @@ protected Configuration initHadoopConf() { @Override protected final Table initTable() { - Schema schema = new Schema( - required(1, "longCol", Types.LongType.get()), - required(2, "intCol", Types.IntegerType.get()), - required(3, "floatCol", Types.FloatType.get()), - optional(4, "doubleCol", Types.DoubleType.get()), - optional(5, "decimalCol", Types.DecimalType.of(20, 5)), - optional(6, "dateCol", Types.DateType.get()), - optional(7, "timestampCol", Types.TimestampType.withZone()), - optional(8, "stringCol", Types.StringType.get())); + Schema schema = + new Schema( + required(1, "longCol", Types.LongType.get()), + required(2, "intCol", Types.IntegerType.get()), + required(3, "floatCol", Types.FloatType.get()), + optional(4, "doubleCol", Types.DoubleType.get()), + optional(5, "decimalCol", Types.DecimalType.of(20, 5)), + optional(6, "dateCol", Types.DateType.get()), + optional(7, "timestampCol", Types.TimestampType.withZone()), + optional(8, "stringCol", Types.StringType.get())); PartitionSpec partitionSpec = PartitionSpec.unpartitioned(); HadoopTables tables = new HadoopTables(hadoopConf()); Map properties = Maps.newHashMap(); diff --git 
a/spark/v3.2/spark/src/jmh/java/org/apache/iceberg/spark/source/IcebergSourceNestedDataBenchmark.java b/spark/v3.2/spark/src/jmh/java/org/apache/iceberg/spark/source/IcebergSourceNestedDataBenchmark.java index 5a0d9359ec6b..a1c61b9b4de0 100644 --- a/spark/v3.2/spark/src/jmh/java/org/apache/iceberg/spark/source/IcebergSourceNestedDataBenchmark.java +++ b/spark/v3.2/spark/src/jmh/java/org/apache/iceberg/spark/source/IcebergSourceNestedDataBenchmark.java @@ -16,9 +16,11 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.source; +import static org.apache.iceberg.types.Types.NestedField.optional; +import static org.apache.iceberg.types.Types.NestedField.required; + import java.util.Map; import org.apache.hadoop.conf.Configuration; import org.apache.iceberg.PartitionSpec; @@ -29,9 +31,6 @@ import org.apache.iceberg.relocated.com.google.common.collect.Maps; import org.apache.iceberg.types.Types; -import static org.apache.iceberg.types.Types.NestedField.optional; -import static org.apache.iceberg.types.Types.NestedField.required; - public abstract class IcebergSourceNestedDataBenchmark extends IcebergSourceBenchmark { @Override @@ -41,14 +40,16 @@ protected Configuration initHadoopConf() { @Override protected final Table initTable() { - Schema schema = new Schema( - required(0, "id", Types.LongType.get()), - optional(4, "nested", Types.StructType.of( - required(1, "col1", Types.StringType.get()), - required(2, "col2", Types.DoubleType.get()), - required(3, "col3", Types.LongType.get()) - )) - ); + Schema schema = + new Schema( + required(0, "id", Types.LongType.get()), + optional( + 4, + "nested", + Types.StructType.of( + required(1, "col1", Types.StringType.get()), + required(2, "col2", Types.DoubleType.get()), + required(3, "col3", Types.LongType.get())))); PartitionSpec partitionSpec = PartitionSpec.unpartitioned(); HadoopTables tables = new HadoopTables(hadoopConf()); Map properties = Maps.newHashMap(); diff --git a/spark/v3.2/spark/src/jmh/java/org/apache/iceberg/spark/source/IcebergSourceNestedListDataBenchmark.java b/spark/v3.2/spark/src/jmh/java/org/apache/iceberg/spark/source/IcebergSourceNestedListDataBenchmark.java index 369a1507b648..f68b587735dd 100644 --- a/spark/v3.2/spark/src/jmh/java/org/apache/iceberg/spark/source/IcebergSourceNestedListDataBenchmark.java +++ b/spark/v3.2/spark/src/jmh/java/org/apache/iceberg/spark/source/IcebergSourceNestedListDataBenchmark.java @@ -16,9 +16,11 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.spark.source; +import static org.apache.iceberg.types.Types.NestedField.optional; +import static org.apache.iceberg.types.Types.NestedField.required; + import java.util.Map; import org.apache.hadoop.conf.Configuration; import org.apache.iceberg.PartitionSpec; @@ -29,9 +31,6 @@ import org.apache.iceberg.relocated.com.google.common.collect.Maps; import org.apache.iceberg.types.Types; -import static org.apache.iceberg.types.Types.NestedField.optional; -import static org.apache.iceberg.types.Types.NestedField.required; - public abstract class IcebergSourceNestedListDataBenchmark extends IcebergSourceBenchmark { @Override @@ -41,12 +40,19 @@ protected Configuration initHadoopConf() { @Override protected final Table initTable() { - Schema schema = new Schema( - required(0, "id", Types.LongType.get()), - optional(1, "outerlist", Types.ListType.ofOptional(2, - Types.StructType.of(required(3, "innerlist", Types.ListType.ofRequired(4, Types.StringType.get()))) - )) - ); + Schema schema = + new Schema( + required(0, "id", Types.LongType.get()), + optional( + 1, + "outerlist", + Types.ListType.ofOptional( + 2, + Types.StructType.of( + required( + 3, + "innerlist", + Types.ListType.ofRequired(4, Types.StringType.get())))))); PartitionSpec partitionSpec = PartitionSpec.unpartitioned(); HadoopTables tables = new HadoopTables(hadoopConf()); Map properties = Maps.newHashMap(); diff --git a/spark/v3.2/spark/src/jmh/java/org/apache/iceberg/spark/source/WritersBenchmark.java b/spark/v3.2/spark/src/jmh/java/org/apache/iceberg/spark/source/WritersBenchmark.java index acec471bdfd1..8d0b94262aee 100644 --- a/spark/v3.2/spark/src/jmh/java/org/apache/iceberg/spark/source/WritersBenchmark.java +++ b/spark/v3.2/spark/src/jmh/java/org/apache/iceberg/spark/source/WritersBenchmark.java @@ -16,9 +16,11 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.spark.source; +import static org.apache.iceberg.types.Types.NestedField.optional; +import static org.apache.iceberg.types.Types.NestedField.required; + import java.io.IOException; import java.util.Comparator; import java.util.List; @@ -57,23 +59,20 @@ import org.openjdk.jmh.annotations.Threads; import org.openjdk.jmh.infra.Blackhole; -import static org.apache.iceberg.types.Types.NestedField.optional; -import static org.apache.iceberg.types.Types.NestedField.required; - public abstract class WritersBenchmark extends IcebergSourceBenchmark { private static final int NUM_ROWS = 2500000; private static final long TARGET_FILE_SIZE_IN_BYTES = 50L * 1024 * 1024; - private static final Schema SCHEMA = new Schema( - required(1, "longCol", Types.LongType.get()), - required(2, "intCol", Types.IntegerType.get()), - required(3, "floatCol", Types.FloatType.get()), - optional(4, "doubleCol", Types.DoubleType.get()), - optional(5, "decimalCol", Types.DecimalType.of(20, 5)), - optional(6, "timestampCol", Types.TimestampType.withZone()), - optional(7, "stringCol", Types.StringType.get()) - ); + private static final Schema SCHEMA = + new Schema( + required(1, "longCol", Types.LongType.get()), + required(2, "intCol", Types.IntegerType.get()), + required(3, "floatCol", Types.FloatType.get()), + optional(4, "doubleCol", Types.DoubleType.get()), + optional(5, "decimalCol", Types.DecimalType.of(20, 5)), + optional(6, "timestampCol", Types.TimestampType.withZone()), + optional(7, "stringCol", Types.StringType.get())); private Iterable rows; private Iterable positionDeleteRows; @@ -92,7 +91,8 @@ public void setupBenchmark() { data.sort(Comparator.comparingInt(row -> transform.apply(row.getInt(1)))); this.rows = data; - this.positionDeleteRows = RandomData.generateSpark(DeleteSchemaUtil.pathPosSchema(), NUM_ROWS, 0L); + this.positionDeleteRows = + RandomData.generateSpark(DeleteSchemaUtil.pathPosSchema(), NUM_ROWS, 0L); this.unpartitionedSpec = table().specs().get(0); Preconditions.checkArgument(unpartitionedSpec.isUnpartitioned()); @@ -118,9 +118,7 @@ protected final Table initTable() { Table table = tables.create(SCHEMA, spec, properties, newTableLocation()); // add a partitioned spec to the table - table.updateSpec() - .addField(Expressions.bucket("intCol", 32)) - .commit(); + table.updateSpec().addField(Expressions.bucket("intCol", 32)).commit(); return table; } @@ -131,13 +129,14 @@ public void writeUnpartitionedClusteredDataWriter(Blackhole blackhole) throws IO FileIO io = table().io(); OutputFileFactory fileFactory = newFileFactory(); - SparkFileWriterFactory writerFactory = SparkFileWriterFactory.builderFor(table()) - .dataFileFormat(fileFormat()) - .dataSchema(table().schema()) - .build(); + SparkFileWriterFactory writerFactory = + SparkFileWriterFactory.builderFor(table()) + .dataFileFormat(fileFormat()) + .dataSchema(table().schema()) + .build(); - ClusteredDataWriter writer = new ClusteredDataWriter<>( - writerFactory, fileFactory, io, TARGET_FILE_SIZE_IN_BYTES); + ClusteredDataWriter writer = + new ClusteredDataWriter<>(writerFactory, fileFactory, io, TARGET_FILE_SIZE_IN_BYTES); try (ClusteredDataWriter closeableWriter = writer) { for (InternalRow row : rows) { @@ -157,13 +156,14 @@ public void writeUnpartitionedLegacyDataWriter(Blackhole blackhole) throws IOExc Schema writeSchema = table().schema(); StructType sparkWriteType = SparkSchemaUtil.convert(writeSchema); - SparkAppenderFactory appenders = SparkAppenderFactory.builderFor(table(), writeSchema, sparkWriteType) - 
.spec(unpartitionedSpec) - .build(); + SparkAppenderFactory appenders = + SparkAppenderFactory.builderFor(table(), writeSchema, sparkWriteType) + .spec(unpartitionedSpec) + .build(); - TaskWriter writer = new UnpartitionedWriter<>( - unpartitionedSpec, fileFormat(), appenders, - fileFactory, io, TARGET_FILE_SIZE_IN_BYTES); + TaskWriter writer = + new UnpartitionedWriter<>( + unpartitionedSpec, fileFormat(), appenders, fileFactory, io, TARGET_FILE_SIZE_IN_BYTES); try (TaskWriter closableWriter = writer) { for (InternalRow row : rows) { @@ -180,13 +180,14 @@ public void writePartitionedClusteredDataWriter(Blackhole blackhole) throws IOEx FileIO io = table().io(); OutputFileFactory fileFactory = newFileFactory(); - SparkFileWriterFactory writerFactory = SparkFileWriterFactory.builderFor(table()) - .dataFileFormat(fileFormat()) - .dataSchema(table().schema()) - .build(); + SparkFileWriterFactory writerFactory = + SparkFileWriterFactory.builderFor(table()) + .dataFileFormat(fileFormat()) + .dataSchema(table().schema()) + .build(); - ClusteredDataWriter writer = new ClusteredDataWriter<>( - writerFactory, fileFactory, io, TARGET_FILE_SIZE_IN_BYTES); + ClusteredDataWriter writer = + new ClusteredDataWriter<>(writerFactory, fileFactory, io, TARGET_FILE_SIZE_IN_BYTES); PartitionKey partitionKey = new PartitionKey(partitionedSpec, table().schema()); StructType dataSparkType = SparkSchemaUtil.convert(table().schema()); @@ -211,14 +212,21 @@ public void writePartitionedLegacyDataWriter(Blackhole blackhole) throws IOExcep Schema writeSchema = table().schema(); StructType sparkWriteType = SparkSchemaUtil.convert(writeSchema); - SparkAppenderFactory appenders = SparkAppenderFactory.builderFor(table(), writeSchema, sparkWriteType) - .spec(partitionedSpec) - .build(); - - TaskWriter writer = new SparkPartitionedWriter( - partitionedSpec, fileFormat(), appenders, - fileFactory, io, TARGET_FILE_SIZE_IN_BYTES, - writeSchema, sparkWriteType); + SparkAppenderFactory appenders = + SparkAppenderFactory.builderFor(table(), writeSchema, sparkWriteType) + .spec(partitionedSpec) + .build(); + + TaskWriter writer = + new SparkPartitionedWriter( + partitionedSpec, + fileFormat(), + appenders, + fileFactory, + io, + TARGET_FILE_SIZE_IN_BYTES, + writeSchema, + sparkWriteType); try (TaskWriter closableWriter = writer) { for (InternalRow row : rows) { @@ -235,13 +243,14 @@ public void writePartitionedFanoutDataWriter(Blackhole blackhole) throws IOExcep FileIO io = table().io(); OutputFileFactory fileFactory = newFileFactory(); - SparkFileWriterFactory writerFactory = SparkFileWriterFactory.builderFor(table()) - .dataFileFormat(fileFormat()) - .dataSchema(table().schema()) - .build(); + SparkFileWriterFactory writerFactory = + SparkFileWriterFactory.builderFor(table()) + .dataFileFormat(fileFormat()) + .dataSchema(table().schema()) + .build(); - FanoutDataWriter writer = new FanoutDataWriter<>( - writerFactory, fileFactory, io, TARGET_FILE_SIZE_IN_BYTES); + FanoutDataWriter writer = + new FanoutDataWriter<>(writerFactory, fileFactory, io, TARGET_FILE_SIZE_IN_BYTES); PartitionKey partitionKey = new PartitionKey(partitionedSpec, table().schema()); StructType dataSparkType = SparkSchemaUtil.convert(table().schema()); @@ -266,14 +275,21 @@ public void writePartitionedLegacyFanoutDataWriter(Blackhole blackhole) throws I Schema writeSchema = table().schema(); StructType sparkWriteType = SparkSchemaUtil.convert(writeSchema); - SparkAppenderFactory appenders = SparkAppenderFactory.builderFor(table(), writeSchema, sparkWriteType) - 
.spec(partitionedSpec) - .build(); - - TaskWriter writer = new SparkPartitionedFanoutWriter( - partitionedSpec, fileFormat(), appenders, - fileFactory, io, TARGET_FILE_SIZE_IN_BYTES, - writeSchema, sparkWriteType); + SparkAppenderFactory appenders = + SparkAppenderFactory.builderFor(table(), writeSchema, sparkWriteType) + .spec(partitionedSpec) + .build(); + + TaskWriter writer = + new SparkPartitionedFanoutWriter( + partitionedSpec, + fileFormat(), + appenders, + fileFactory, + io, + TARGET_FILE_SIZE_IN_BYTES, + writeSchema, + sparkWriteType); try (TaskWriter closableWriter = writer) { for (InternalRow row : rows) { @@ -286,20 +302,23 @@ partitionedSpec, fileFormat(), appenders, @Benchmark @Threads(1) - public void writePartitionedClusteredEqualityDeleteWriter(Blackhole blackhole) throws IOException { + public void writePartitionedClusteredEqualityDeleteWriter(Blackhole blackhole) + throws IOException { FileIO io = table().io(); int equalityFieldId = table().schema().findField("longCol").fieldId(); OutputFileFactory fileFactory = newFileFactory(); - SparkFileWriterFactory writerFactory = SparkFileWriterFactory.builderFor(table()) - .dataFileFormat(fileFormat()) - .equalityDeleteRowSchema(table().schema()) - .equalityFieldIds(new int[]{equalityFieldId}) - .build(); + SparkFileWriterFactory writerFactory = + SparkFileWriterFactory.builderFor(table()) + .dataFileFormat(fileFormat()) + .equalityDeleteRowSchema(table().schema()) + .equalityFieldIds(new int[] {equalityFieldId}) + .build(); - ClusteredEqualityDeleteWriter writer = new ClusteredEqualityDeleteWriter<>( - writerFactory, fileFactory, io, TARGET_FILE_SIZE_IN_BYTES); + ClusteredEqualityDeleteWriter writer = + new ClusteredEqualityDeleteWriter<>( + writerFactory, fileFactory, io, TARGET_FILE_SIZE_IN_BYTES); PartitionKey partitionKey = new PartitionKey(partitionedSpec, table().schema()); StructType deleteSparkType = SparkSchemaUtil.convert(table().schema()); @@ -317,16 +336,17 @@ public void writePartitionedClusteredEqualityDeleteWriter(Blackhole blackhole) t @Benchmark @Threads(1) - public void writeUnpartitionedClusteredPositionDeleteWriter(Blackhole blackhole) throws IOException { + public void writeUnpartitionedClusteredPositionDeleteWriter(Blackhole blackhole) + throws IOException { FileIO io = table().io(); OutputFileFactory fileFactory = newFileFactory(); - SparkFileWriterFactory writerFactory = SparkFileWriterFactory.builderFor(table()) - .dataFileFormat(fileFormat()) - .build(); + SparkFileWriterFactory writerFactory = + SparkFileWriterFactory.builderFor(table()).dataFileFormat(fileFormat()).build(); - ClusteredPositionDeleteWriter writer = new ClusteredPositionDeleteWriter<>( - writerFactory, fileFactory, io, TARGET_FILE_SIZE_IN_BYTES); + ClusteredPositionDeleteWriter writer = + new ClusteredPositionDeleteWriter<>( + writerFactory, fileFactory, io, TARGET_FILE_SIZE_IN_BYTES); PositionDelete positionDelete = PositionDelete.create(); try (ClusteredPositionDeleteWriter closeableWriter = writer) { @@ -342,8 +362,6 @@ public void writeUnpartitionedClusteredPositionDeleteWriter(Blackhole blackhole) } private OutputFileFactory newFileFactory() { - return OutputFileFactory.builderFor(table(), 1, 1) - .format(fileFormat()) - .build(); + return OutputFileFactory.builderFor(table(), 1, 1).format(fileFormat()).build(); } } diff --git a/spark/v3.2/spark/src/jmh/java/org/apache/iceberg/spark/source/avro/AvroWritersBenchmark.java b/spark/v3.2/spark/src/jmh/java/org/apache/iceberg/spark/source/avro/AvroWritersBenchmark.java index 
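The hunks above all apply the same two google-java-format rewraps: a long assignment whose right-hand side exceeds the line limit is broken after the equals sign with one chained call per line, and a lambda passed to a helper such as withTableProperties moves onto its own indented lines under the receiving call. A minimal sketch of those two conventions on plain JDK types; the class and variable names here are illustrative, not taken from the patch.

import java.util.List;

public class FormattingConventionSketch {
  public static void main(String[] args) {
    // Long assignments wrap after '=', with the chained calls indented one extra level.
    String joined =
        String.join(
            ", ",
            List.of("longCol", "intCol", "floatCol", "doubleCol"));

    // Lambda arguments land on their own lines, indented under the call that receives them,
    // mirroring the withTableProperties(tableProperties, () -> { ... }) rewrites above.
    Runnable task =
        () -> {
          System.out.println(joined);
        };
    task.run();
  }
}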
eb07dbd27875..9d06cfcf7beb 100644 --- a/spark/v3.2/spark/src/jmh/java/org/apache/iceberg/spark/source/avro/AvroWritersBenchmark.java +++ b/spark/v3.2/spark/src/jmh/java/org/apache/iceberg/spark/source/avro/AvroWritersBenchmark.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.source.avro; import org.apache.iceberg.FileFormat; @@ -24,9 +23,8 @@ /** * A benchmark that evaluates the performance of various Iceberg writers for Avro data. - *
<p> - * To run this benchmark for spark-3.2: - * <code> + * + * <p>
    To run this benchmark for spark-3.2: * ./gradlew -DsparkVersions=3.2 :iceberg-spark:iceberg-spark-3.2_2.12:jmh * -PjmhIncludeRegex=AvroWritersBenchmark * -PjmhOutputPath=benchmark/avro-writers-benchmark-result.txt diff --git a/spark/v3.2/spark/src/jmh/java/org/apache/iceberg/spark/source/avro/IcebergSourceFlatAvroDataReadBenchmark.java b/spark/v3.2/spark/src/jmh/java/org/apache/iceberg/spark/source/avro/IcebergSourceFlatAvroDataReadBenchmark.java index dd6eac28fa69..96f2e98e4c7c 100644 --- a/spark/v3.2/spark/src/jmh/java/org/apache/iceberg/spark/source/avro/IcebergSourceFlatAvroDataReadBenchmark.java +++ b/spark/v3.2/spark/src/jmh/java/org/apache/iceberg/spark/source/avro/IcebergSourceFlatAvroDataReadBenchmark.java @@ -16,9 +16,14 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.source.avro; +import static org.apache.iceberg.TableProperties.DEFAULT_FILE_FORMAT; +import static org.apache.iceberg.TableProperties.SPLIT_OPEN_FILE_COST; +import static org.apache.spark.sql.functions.current_date; +import static org.apache.spark.sql.functions.date_add; +import static org.apache.spark.sql.functions.expr; + import java.io.IOException; import java.util.Map; import org.apache.iceberg.relocated.com.google.common.collect.Maps; @@ -31,18 +36,11 @@ import org.openjdk.jmh.annotations.TearDown; import org.openjdk.jmh.annotations.Threads; -import static org.apache.iceberg.TableProperties.DEFAULT_FILE_FORMAT; -import static org.apache.iceberg.TableProperties.SPLIT_OPEN_FILE_COST; -import static org.apache.spark.sql.functions.current_date; -import static org.apache.spark.sql.functions.date_add; -import static org.apache.spark.sql.functions.expr; - /** - * A benchmark that evaluates the performance of reading Avro data with a flat schema - * using Iceberg and the built-in file source in Spark. - *
<p> - * To run this benchmark for spark-3.2: - * <code> + * A benchmark that evaluates the performance of reading Avro data with a flat schema using Iceberg + * and the built-in file source in Spark. + * + * <p>
    To run this benchmark for spark-3.2: * ./gradlew -DsparkVersions=3.2 :iceberg-spark:iceberg-spark-3.2_2.12:jmh * -PjmhIncludeRegex=IcebergSourceFlatAvroDataReadBenchmark * -PjmhOutputPath=benchmark/iceberg-source-flat-avro-data-read-benchmark-result.txt @@ -70,11 +68,13 @@ public void tearDownBenchmark() throws IOException { public void readIceberg() { Map tableProperties = Maps.newHashMap(); tableProperties.put(SPLIT_OPEN_FILE_COST, Integer.toString(128 * 1024 * 1024)); - withTableProperties(tableProperties, () -> { - String tableLocation = table().location(); - Dataset df = spark().read().format("iceberg").load(tableLocation); - materialize(df); - }); + withTableProperties( + tableProperties, + () -> { + String tableLocation = table().location(); + Dataset df = spark().read().format("iceberg").load(tableLocation); + materialize(df); + }); } @Benchmark @@ -82,10 +82,12 @@ public void readIceberg() { public void readFileSource() { Map conf = Maps.newHashMap(); conf.put(SQLConf.FILES_OPEN_COST_IN_BYTES().key(), Integer.toString(128 * 1024 * 1024)); - withSQLConf(conf, () -> { - Dataset df = spark().read().format("avro").load(dataLocation()); - materialize(df); - }); + withSQLConf( + conf, + () -> { + Dataset df = spark().read().format("avro").load(dataLocation()); + materialize(df); + }); } @Benchmark @@ -93,11 +95,13 @@ public void readFileSource() { public void readWithProjectionIceberg() { Map tableProperties = Maps.newHashMap(); tableProperties.put(SPLIT_OPEN_FILE_COST, Integer.toString(128 * 1024 * 1024)); - withTableProperties(tableProperties, () -> { - String tableLocation = table().location(); - Dataset df = spark().read().format("iceberg").load(tableLocation).select("longCol"); - materialize(df); - }); + withTableProperties( + tableProperties, + () -> { + String tableLocation = table().location(); + Dataset df = spark().read().format("iceberg").load(tableLocation).select("longCol"); + materialize(df); + }); } @Benchmark @@ -105,28 +109,34 @@ public void readWithProjectionIceberg() { public void readWithProjectionFileSource() { Map conf = Maps.newHashMap(); conf.put(SQLConf.FILES_OPEN_COST_IN_BYTES().key(), Integer.toString(128 * 1024 * 1024)); - withSQLConf(conf, () -> { - Dataset df = spark().read().format("avro").load(dataLocation()).select("longCol"); - materialize(df); - }); + withSQLConf( + conf, + () -> { + Dataset df = spark().read().format("avro").load(dataLocation()).select("longCol"); + materialize(df); + }); } private void appendData() { Map tableProperties = Maps.newHashMap(); tableProperties.put(DEFAULT_FILE_FORMAT, "avro"); - withTableProperties(tableProperties, () -> { - for (int fileNum = 1; fileNum <= NUM_FILES; fileNum++) { - Dataset df = spark().range(NUM_ROWS) - .withColumnRenamed("id", "longCol") - .withColumn("intCol", expr("CAST(longCol AS INT)")) - .withColumn("floatCol", expr("CAST(longCol AS FLOAT)")) - .withColumn("doubleCol", expr("CAST(longCol AS DOUBLE)")) - .withColumn("decimalCol", expr("CAST(longCol AS DECIMAL(20, 5))")) - .withColumn("dateCol", date_add(current_date(), fileNum)) - .withColumn("timestampCol", expr("TO_TIMESTAMP(dateCol)")) - .withColumn("stringCol", expr("CAST(dateCol AS STRING)")); - appendAsFile(df); - } - }); + withTableProperties( + tableProperties, + () -> { + for (int fileNum = 1; fileNum <= NUM_FILES; fileNum++) { + Dataset df = + spark() + .range(NUM_ROWS) + .withColumnRenamed("id", "longCol") + .withColumn("intCol", expr("CAST(longCol AS INT)")) + .withColumn("floatCol", expr("CAST(longCol AS FLOAT)")) + 
.withColumn("doubleCol", expr("CAST(longCol AS DOUBLE)")) + .withColumn("decimalCol", expr("CAST(longCol AS DECIMAL(20, 5))")) + .withColumn("dateCol", date_add(current_date(), fileNum)) + .withColumn("timestampCol", expr("TO_TIMESTAMP(dateCol)")) + .withColumn("stringCol", expr("CAST(dateCol AS STRING)")); + appendAsFile(df); + } + }); } } diff --git a/spark/v3.2/spark/src/jmh/java/org/apache/iceberg/spark/source/avro/IcebergSourceNestedAvroDataReadBenchmark.java b/spark/v3.2/spark/src/jmh/java/org/apache/iceberg/spark/source/avro/IcebergSourceNestedAvroDataReadBenchmark.java index ccb369e72123..5d11816a0a98 100644 --- a/spark/v3.2/spark/src/jmh/java/org/apache/iceberg/spark/source/avro/IcebergSourceNestedAvroDataReadBenchmark.java +++ b/spark/v3.2/spark/src/jmh/java/org/apache/iceberg/spark/source/avro/IcebergSourceNestedAvroDataReadBenchmark.java @@ -16,9 +16,14 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.source.avro; +import static org.apache.iceberg.TableProperties.DEFAULT_FILE_FORMAT; +import static org.apache.iceberg.TableProperties.SPLIT_OPEN_FILE_COST; +import static org.apache.spark.sql.functions.expr; +import static org.apache.spark.sql.functions.lit; +import static org.apache.spark.sql.functions.struct; + import java.io.IOException; import java.util.Map; import org.apache.iceberg.relocated.com.google.common.collect.Maps; @@ -31,19 +36,11 @@ import org.openjdk.jmh.annotations.TearDown; import org.openjdk.jmh.annotations.Threads; -import static org.apache.iceberg.TableProperties.DEFAULT_FILE_FORMAT; -import static org.apache.iceberg.TableProperties.SPLIT_OPEN_FILE_COST; -import static org.apache.spark.sql.functions.expr; -import static org.apache.spark.sql.functions.lit; -import static org.apache.spark.sql.functions.struct; - - /** - * A benchmark that evaluates the performance of reading Avro data with a flat schema - * using Iceberg and the built-in file source in Spark. - *
<p> - * To run this benchmark for spark-3.2: - * <code> + * A benchmark that evaluates the performance of reading Avro data with a flat schema using Iceberg + * and the built-in file source in Spark. + * + * <p>
    To run this benchmark for spark-3.2: * ./gradlew -DsparkVersions=3.2 :iceberg-spark:iceberg-spark-3.2_2.12:jmh * -PjmhIncludeRegex=IcebergSourceNestedAvroDataReadBenchmark * -PjmhOutputPath=benchmark/iceberg-source-nested-avro-data-read-benchmark-result.txt @@ -71,11 +68,13 @@ public void tearDownBenchmark() throws IOException { public void readIceberg() { Map tableProperties = Maps.newHashMap(); tableProperties.put(SPLIT_OPEN_FILE_COST, Integer.toString(128 * 1024 * 1024)); - withTableProperties(tableProperties, () -> { - String tableLocation = table().location(); - Dataset df = spark().read().format("iceberg").load(tableLocation); - materialize(df); - }); + withTableProperties( + tableProperties, + () -> { + String tableLocation = table().location(); + Dataset df = spark().read().format("iceberg").load(tableLocation); + materialize(df); + }); } @Benchmark @@ -83,10 +82,12 @@ public void readIceberg() { public void readFileSource() { Map conf = Maps.newHashMap(); conf.put(SQLConf.FILES_OPEN_COST_IN_BYTES().key(), Integer.toString(128 * 1024 * 1024)); - withSQLConf(conf, () -> { - Dataset df = spark().read().format("avro").load(dataLocation()); - materialize(df); - }); + withSQLConf( + conf, + () -> { + Dataset df = spark().read().format("avro").load(dataLocation()); + materialize(df); + }); } @Benchmark @@ -94,11 +95,14 @@ public void readFileSource() { public void readWithProjectionIceberg() { Map tableProperties = Maps.newHashMap(); tableProperties.put(SPLIT_OPEN_FILE_COST, Integer.toString(128 * 1024 * 1024)); - withTableProperties(tableProperties, () -> { - String tableLocation = table().location(); - Dataset df = spark().read().format("iceberg").load(tableLocation).select("nested.col3"); - materialize(df); - }); + withTableProperties( + tableProperties, + () -> { + String tableLocation = table().location(); + Dataset df = + spark().read().format("iceberg").load(tableLocation).select("nested.col3"); + materialize(df); + }); } @Benchmark @@ -106,27 +110,33 @@ public void readWithProjectionIceberg() { public void readWithProjectionFileSource() { Map conf = Maps.newHashMap(); conf.put(SQLConf.FILES_OPEN_COST_IN_BYTES().key(), Integer.toString(128 * 1024 * 1024)); - withSQLConf(conf, () -> { - Dataset df = spark().read().format("avro").load(dataLocation()).select("nested.col3"); - materialize(df); - }); + withSQLConf( + conf, + () -> { + Dataset df = + spark().read().format("avro").load(dataLocation()).select("nested.col3"); + materialize(df); + }); } private void appendData() { Map tableProperties = Maps.newHashMap(); tableProperties.put(DEFAULT_FILE_FORMAT, "avro"); - withTableProperties(tableProperties, () -> { - for (int fileNum = 1; fileNum <= NUM_FILES; fileNum++) { - Dataset df = spark().range(NUM_ROWS) - .withColumn( - "nested", - struct( - expr("CAST(id AS string) AS col1"), - expr("CAST(id AS double) AS col2"), - lit(fileNum).cast("long").as("col3") - )); - appendAsFile(df); - } - }); + withTableProperties( + tableProperties, + () -> { + for (int fileNum = 1; fileNum <= NUM_FILES; fileNum++) { + Dataset df = + spark() + .range(NUM_ROWS) + .withColumn( + "nested", + struct( + expr("CAST(id AS string) AS col1"), + expr("CAST(id AS double) AS col2"), + lit(fileNum).cast("long").as("col3"))); + appendAsFile(df); + } + }); } } diff --git a/spark/v3.2/spark/src/jmh/java/org/apache/iceberg/spark/source/orc/IcebergSourceFlatORCDataBenchmark.java b/spark/v3.2/spark/src/jmh/java/org/apache/iceberg/spark/source/orc/IcebergSourceFlatORCDataBenchmark.java index 329c9ffe7738..d0fdd8915780 
100644 --- a/spark/v3.2/spark/src/jmh/java/org/apache/iceberg/spark/source/orc/IcebergSourceFlatORCDataBenchmark.java +++ b/spark/v3.2/spark/src/jmh/java/org/apache/iceberg/spark/source/orc/IcebergSourceFlatORCDataBenchmark.java @@ -16,9 +16,11 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.source.orc; +import static org.apache.iceberg.types.Types.NestedField.optional; +import static org.apache.iceberg.types.Types.NestedField.required; + import java.util.Map; import org.apache.hadoop.conf.Configuration; import org.apache.iceberg.PartitionSpec; @@ -30,13 +32,10 @@ import org.apache.iceberg.spark.source.IcebergSourceBenchmark; import org.apache.iceberg.types.Types; -import static org.apache.iceberg.types.Types.NestedField.optional; -import static org.apache.iceberg.types.Types.NestedField.required; - - /** - * Same as {@link org.apache.iceberg.spark.source.IcebergSourceFlatDataBenchmark} but we disable the Timestamp with - * zone type for ORC performance tests as Spark native reader does not support ORC's TIMESTAMP_INSTANT type + * Same as {@link org.apache.iceberg.spark.source.IcebergSourceFlatDataBenchmark} but we disable the + * Timestamp with zone type for ORC performance tests as Spark native reader does not support ORC's + * TIMESTAMP_INSTANT type */ public abstract class IcebergSourceFlatORCDataBenchmark extends IcebergSourceBenchmark { @@ -47,17 +46,19 @@ protected Configuration initHadoopConf() { @Override protected final Table initTable() { - Schema schema = new Schema( - required(1, "longCol", Types.LongType.get()), - required(2, "intCol", Types.IntegerType.get()), - required(3, "floatCol", Types.FloatType.get()), - optional(4, "doubleCol", Types.DoubleType.get()), - optional(5, "decimalCol", Types.DecimalType.of(20, 5)), - optional(6, "dateCol", Types.DateType.get()), - // Disable timestamp column for ORC performance tests as Spark native reader does not support ORC's - // TIMESTAMP_INSTANT type - // optional(7, "timestampCol", Types.TimestampType.withZone()), - optional(8, "stringCol", Types.StringType.get())); + Schema schema = + new Schema( + required(1, "longCol", Types.LongType.get()), + required(2, "intCol", Types.IntegerType.get()), + required(3, "floatCol", Types.FloatType.get()), + optional(4, "doubleCol", Types.DoubleType.get()), + optional(5, "decimalCol", Types.DecimalType.of(20, 5)), + optional(6, "dateCol", Types.DateType.get()), + // Disable timestamp column for ORC performance tests as Spark native reader does not + // support ORC's + // TIMESTAMP_INSTANT type + // optional(7, "timestampCol", Types.TimestampType.withZone()), + optional(8, "stringCol", Types.StringType.get())); PartitionSpec partitionSpec = PartitionSpec.unpartitioned(); HadoopTables tables = new HadoopTables(hadoopConf()); Map properties = Maps.newHashMap(); diff --git a/spark/v3.2/spark/src/jmh/java/org/apache/iceberg/spark/source/orc/IcebergSourceFlatORCDataReadBenchmark.java b/spark/v3.2/spark/src/jmh/java/org/apache/iceberg/spark/source/orc/IcebergSourceFlatORCDataReadBenchmark.java index b54f04feb5c3..ac4dcdd61abb 100644 --- a/spark/v3.2/spark/src/jmh/java/org/apache/iceberg/spark/source/orc/IcebergSourceFlatORCDataReadBenchmark.java +++ b/spark/v3.2/spark/src/jmh/java/org/apache/iceberg/spark/source/orc/IcebergSourceFlatORCDataReadBenchmark.java @@ -16,9 +16,14 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.spark.source.orc; +import static org.apache.iceberg.TableProperties.DEFAULT_FILE_FORMAT; +import static org.apache.iceberg.TableProperties.SPLIT_OPEN_FILE_COST; +import static org.apache.spark.sql.functions.current_date; +import static org.apache.spark.sql.functions.date_add; +import static org.apache.spark.sql.functions.expr; + import java.io.IOException; import java.util.Map; import org.apache.iceberg.relocated.com.google.common.collect.Maps; @@ -31,18 +36,11 @@ import org.openjdk.jmh.annotations.TearDown; import org.openjdk.jmh.annotations.Threads; -import static org.apache.iceberg.TableProperties.DEFAULT_FILE_FORMAT; -import static org.apache.iceberg.TableProperties.SPLIT_OPEN_FILE_COST; -import static org.apache.spark.sql.functions.current_date; -import static org.apache.spark.sql.functions.date_add; -import static org.apache.spark.sql.functions.expr; - /** - * A benchmark that evaluates the performance of reading ORC data with a flat schema - * using Iceberg and the built-in file source in Spark. - *
<p> - * To run this benchmark for spark-3.2: - * <code> + * A benchmark that evaluates the performance of reading ORC data with a flat schema using Iceberg + * and the built-in file source in Spark. + * + * <p>
    To run this benchmark for spark-3.2: * ./gradlew -DsparkVersions=3.2 :iceberg-spark:iceberg-spark-3.2_2.12:jmh * -PjmhIncludeRegex=IcebergSourceFlatORCDataReadBenchmark * -PjmhOutputPath=benchmark/iceberg-source-flat-orc-data-read-benchmark-result.txt @@ -70,11 +68,13 @@ public void tearDownBenchmark() throws IOException { public void readIcebergNonVectorized() { Map tableProperties = Maps.newHashMap(); tableProperties.put(SPLIT_OPEN_FILE_COST, Integer.toString(128 * 1024 * 1024)); - withTableProperties(tableProperties, () -> { - String tableLocation = table().location(); - Dataset df = spark().read().format("iceberg").load(tableLocation); - materialize(df); - }); + withTableProperties( + tableProperties, + () -> { + String tableLocation = table().location(); + Dataset df = spark().read().format("iceberg").load(tableLocation); + materialize(df); + }); } @Benchmark @@ -82,12 +82,18 @@ public void readIcebergNonVectorized() { public void readIcebergVectorized() { Map tableProperties = Maps.newHashMap(); tableProperties.put(SPLIT_OPEN_FILE_COST, Integer.toString(128 * 1024 * 1024)); - withTableProperties(tableProperties, () -> { - String tableLocation = table().location(); - Dataset df = spark().read().option(SparkReadOptions.VECTORIZATION_ENABLED, "true") - .format("iceberg").load(tableLocation); - materialize(df); - }); + withTableProperties( + tableProperties, + () -> { + String tableLocation = table().location(); + Dataset df = + spark() + .read() + .option(SparkReadOptions.VECTORIZATION_ENABLED, "true") + .format("iceberg") + .load(tableLocation); + materialize(df); + }); } @Benchmark @@ -96,10 +102,12 @@ public void readFileSourceVectorized() { Map conf = Maps.newHashMap(); conf.put(SQLConf.ORC_VECTORIZED_READER_ENABLED().key(), "true"); conf.put(SQLConf.FILES_OPEN_COST_IN_BYTES().key(), Integer.toString(128 * 1024 * 1024)); - withSQLConf(conf, () -> { - Dataset df = spark().read().orc(dataLocation()); - materialize(df); - }); + withSQLConf( + conf, + () -> { + Dataset df = spark().read().orc(dataLocation()); + materialize(df); + }); } @Benchmark @@ -108,10 +116,12 @@ public void readFileSourceNonVectorized() { Map conf = Maps.newHashMap(); conf.put(SQLConf.ORC_VECTORIZED_READER_ENABLED().key(), "false"); conf.put(SQLConf.FILES_OPEN_COST_IN_BYTES().key(), Integer.toString(128 * 1024 * 1024)); - withSQLConf(conf, () -> { - Dataset df = spark().read().orc(dataLocation()); - materialize(df); - }); + withSQLConf( + conf, + () -> { + Dataset df = spark().read().orc(dataLocation()); + materialize(df); + }); } @Benchmark @@ -119,11 +129,13 @@ public void readFileSourceNonVectorized() { public void readWithProjectionIcebergNonVectorized() { Map tableProperties = Maps.newHashMap(); tableProperties.put(SPLIT_OPEN_FILE_COST, Integer.toString(128 * 1024 * 1024)); - withTableProperties(tableProperties, () -> { - String tableLocation = table().location(); - Dataset df = spark().read().format("iceberg").load(tableLocation).select("longCol"); - materialize(df); - }); + withTableProperties( + tableProperties, + () -> { + String tableLocation = table().location(); + Dataset df = spark().read().format("iceberg").load(tableLocation).select("longCol"); + materialize(df); + }); } @Benchmark @@ -131,25 +143,33 @@ public void readWithProjectionIcebergNonVectorized() { public void readWithProjectionIcebergVectorized() { Map tableProperties = Maps.newHashMap(); tableProperties.put(SPLIT_OPEN_FILE_COST, Integer.toString(128 * 1024 * 1024)); - withTableProperties(tableProperties, () -> { - String tableLocation 
= table().location(); - Dataset df = spark().read().option(SparkReadOptions.VECTORIZATION_ENABLED, "true") - .format("iceberg").load(tableLocation).select("longCol"); - materialize(df); - }); + withTableProperties( + tableProperties, + () -> { + String tableLocation = table().location(); + Dataset df = + spark() + .read() + .option(SparkReadOptions.VECTORIZATION_ENABLED, "true") + .format("iceberg") + .load(tableLocation) + .select("longCol"); + materialize(df); + }); } - @Benchmark @Threads(1) public void readWithProjectionFileSourceVectorized() { Map conf = Maps.newHashMap(); conf.put(SQLConf.ORC_VECTORIZED_READER_ENABLED().key(), "true"); conf.put(SQLConf.FILES_OPEN_COST_IN_BYTES().key(), Integer.toString(128 * 1024 * 1024)); - withSQLConf(conf, () -> { - Dataset df = spark().read().orc(dataLocation()).select("longCol"); - materialize(df); - }); + withSQLConf( + conf, + () -> { + Dataset df = spark().read().orc(dataLocation()).select("longCol"); + materialize(df); + }); } @Benchmark @@ -158,27 +178,33 @@ public void readWithProjectionFileSourceNonVectorized() { Map conf = Maps.newHashMap(); conf.put(SQLConf.ORC_VECTORIZED_READER_ENABLED().key(), "false"); conf.put(SQLConf.FILES_OPEN_COST_IN_BYTES().key(), Integer.toString(128 * 1024 * 1024)); - withSQLConf(conf, () -> { - Dataset df = spark().read().orc(dataLocation()).select("longCol"); - materialize(df); - }); + withSQLConf( + conf, + () -> { + Dataset df = spark().read().orc(dataLocation()).select("longCol"); + materialize(df); + }); } private void appendData() { Map tableProperties = Maps.newHashMap(); tableProperties.put(DEFAULT_FILE_FORMAT, "orc"); - withTableProperties(tableProperties, () -> { - for (int fileNum = 1; fileNum <= NUM_FILES; fileNum++) { - Dataset df = spark().range(NUM_ROWS) - .withColumnRenamed("id", "longCol") - .withColumn("intCol", expr("CAST(longCol AS INT)")) - .withColumn("floatCol", expr("CAST(longCol AS FLOAT)")) - .withColumn("doubleCol", expr("CAST(longCol AS DOUBLE)")) - .withColumn("decimalCol", expr("CAST(longCol AS DECIMAL(20, 5))")) - .withColumn("dateCol", date_add(current_date(), fileNum)) - .withColumn("stringCol", expr("CAST(dateCol AS STRING)")); - appendAsFile(df); - } - }); + withTableProperties( + tableProperties, + () -> { + for (int fileNum = 1; fileNum <= NUM_FILES; fileNum++) { + Dataset df = + spark() + .range(NUM_ROWS) + .withColumnRenamed("id", "longCol") + .withColumn("intCol", expr("CAST(longCol AS INT)")) + .withColumn("floatCol", expr("CAST(longCol AS FLOAT)")) + .withColumn("doubleCol", expr("CAST(longCol AS DOUBLE)")) + .withColumn("decimalCol", expr("CAST(longCol AS DECIMAL(20, 5))")) + .withColumn("dateCol", date_add(current_date(), fileNum)) + .withColumn("stringCol", expr("CAST(dateCol AS STRING)")); + appendAsFile(df); + } + }); } } diff --git a/spark/v3.2/spark/src/jmh/java/org/apache/iceberg/spark/source/orc/IcebergSourceNestedListORCDataWriteBenchmark.java b/spark/v3.2/spark/src/jmh/java/org/apache/iceberg/spark/source/orc/IcebergSourceNestedListORCDataWriteBenchmark.java index b73effa39904..bffb5a1da8eb 100644 --- a/spark/v3.2/spark/src/jmh/java/org/apache/iceberg/spark/source/orc/IcebergSourceNestedListORCDataWriteBenchmark.java +++ b/spark/v3.2/spark/src/jmh/java/org/apache/iceberg/spark/source/orc/IcebergSourceNestedListORCDataWriteBenchmark.java @@ -16,9 +16,12 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.spark.source.orc; +import static org.apache.spark.sql.functions.array_repeat; +import static org.apache.spark.sql.functions.expr; +import static org.apache.spark.sql.functions.struct; + import java.io.IOException; import java.util.Map; import org.apache.iceberg.relocated.com.google.common.collect.Maps; @@ -32,22 +35,18 @@ import org.openjdk.jmh.annotations.TearDown; import org.openjdk.jmh.annotations.Threads; -import static org.apache.spark.sql.functions.array_repeat; -import static org.apache.spark.sql.functions.expr; -import static org.apache.spark.sql.functions.struct; - /** - * A benchmark that evaluates the performance of writing nested Parquet data using Iceberg - * and the built-in file source in Spark. - *
<p> - * To run this benchmark for spark-3.2: - * <code> + * A benchmark that evaluates the performance of writing nested Parquet data using Iceberg and the + * built-in file source in Spark. + * + * <p>
    To run this benchmark for spark-3.2: * ./gradlew -DsparkVersions=3.2 :iceberg-spark:iceberg-spark-3.2_2.12:jmh * -PjmhIncludeRegex=IcebergSourceNestedListORCDataWriteBenchmark * -PjmhOutputPath=benchmark/iceberg-source-nested-list-orc-data-write-benchmark-result.txt * */ -public class IcebergSourceNestedListORCDataWriteBenchmark extends IcebergSourceNestedListDataBenchmark { +public class IcebergSourceNestedListORCDataWriteBenchmark + extends IcebergSourceNestedListDataBenchmark { @Setup public void setupBenchmark() { @@ -67,8 +66,12 @@ public void tearDownBenchmark() throws IOException { @Threads(1) public void writeIceberg() { String tableLocation = table().location(); - benchmarkData().write().format("iceberg").option("write-format", "orc") - .mode(SaveMode.Append).save(tableLocation); + benchmarkData() + .write() + .format("iceberg") + .option("write-format", "orc") + .mode(SaveMode.Append) + .save(tableLocation); } @Benchmark @@ -76,11 +79,17 @@ public void writeIceberg() { public void writeIcebergDictionaryOff() { Map tableProperties = Maps.newHashMap(); tableProperties.put("orc.dictionary.key.threshold", "0"); - withTableProperties(tableProperties, () -> { - String tableLocation = table().location(); - benchmarkData().write().format("iceberg").option("write-format", "orc") - .mode(SaveMode.Append).save(tableLocation); - }); + withTableProperties( + tableProperties, + () -> { + String tableLocation = table().location(); + benchmarkData() + .write() + .format("iceberg") + .option("write-format", "orc") + .mode(SaveMode.Append) + .save(tableLocation); + }); } @Benchmark @@ -90,10 +99,11 @@ public void writeFileSource() { } private Dataset benchmarkData() { - return spark().range(numRows) - .withColumn("outerlist", array_repeat(struct( - expr("array_repeat(CAST(id AS string), 1000) AS innerlist")), - 10)) + return spark() + .range(numRows) + .withColumn( + "outerlist", + array_repeat(struct(expr("array_repeat(CAST(id AS string), 1000) AS innerlist")), 10)) .coalesce(1); } } diff --git a/spark/v3.2/spark/src/jmh/java/org/apache/iceberg/spark/source/orc/IcebergSourceNestedORCDataReadBenchmark.java b/spark/v3.2/spark/src/jmh/java/org/apache/iceberg/spark/source/orc/IcebergSourceNestedORCDataReadBenchmark.java index f450080c6a56..e42ad9e3283d 100644 --- a/spark/v3.2/spark/src/jmh/java/org/apache/iceberg/spark/source/orc/IcebergSourceNestedORCDataReadBenchmark.java +++ b/spark/v3.2/spark/src/jmh/java/org/apache/iceberg/spark/source/orc/IcebergSourceNestedORCDataReadBenchmark.java @@ -16,9 +16,14 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.spark.source.orc; +import static org.apache.iceberg.TableProperties.DEFAULT_FILE_FORMAT; +import static org.apache.iceberg.TableProperties.SPLIT_OPEN_FILE_COST; +import static org.apache.spark.sql.functions.expr; +import static org.apache.spark.sql.functions.lit; +import static org.apache.spark.sql.functions.struct; + import java.io.IOException; import java.util.Map; import org.apache.iceberg.relocated.com.google.common.collect.Maps; @@ -32,19 +37,11 @@ import org.openjdk.jmh.annotations.TearDown; import org.openjdk.jmh.annotations.Threads; -import static org.apache.iceberg.TableProperties.DEFAULT_FILE_FORMAT; -import static org.apache.iceberg.TableProperties.SPLIT_OPEN_FILE_COST; -import static org.apache.spark.sql.functions.expr; -import static org.apache.spark.sql.functions.lit; -import static org.apache.spark.sql.functions.struct; - - /** - * A benchmark that evaluates the performance of reading ORC data with a flat schema - * using Iceberg and the built-in file source in Spark. - *
<p> - * To run this benchmark for spark-3.2: - * <code> + * A benchmark that evaluates the performance of reading ORC data with a flat schema using Iceberg + * and the built-in file source in Spark. + * + * <p>
    To run this benchmark for spark-3.2: * ./gradlew -DsparkVersions=3.2 :iceberg-spark:iceberg-spark-3.2_2.12:jmh * -PjmhIncludeRegex=IcebergSourceNestedORCDataReadBenchmark * -PjmhOutputPath=benchmark/iceberg-source-nested-orc-data-read-benchmark-result.txt @@ -72,11 +69,13 @@ public void tearDownBenchmark() throws IOException { public void readIcebergNonVectorized() { Map tableProperties = Maps.newHashMap(); tableProperties.put(SPLIT_OPEN_FILE_COST, Integer.toString(128 * 1024 * 1024)); - withTableProperties(tableProperties, () -> { - String tableLocation = table().location(); - Dataset df = spark().read().format("iceberg").load(tableLocation); - materialize(df); - }); + withTableProperties( + tableProperties, + () -> { + String tableLocation = table().location(); + Dataset df = spark().read().format("iceberg").load(tableLocation); + materialize(df); + }); } @Benchmark @@ -84,12 +83,18 @@ public void readIcebergNonVectorized() { public void readIcebergVectorized() { Map tableProperties = Maps.newHashMap(); tableProperties.put(SPLIT_OPEN_FILE_COST, Integer.toString(128 * 1024 * 1024)); - withTableProperties(tableProperties, () -> { - String tableLocation = table().location(); - Dataset df = spark().read().option(SparkReadOptions.VECTORIZATION_ENABLED, "true") - .format("iceberg").load(tableLocation); - materialize(df); - }); + withTableProperties( + tableProperties, + () -> { + String tableLocation = table().location(); + Dataset df = + spark() + .read() + .option(SparkReadOptions.VECTORIZATION_ENABLED, "true") + .format("iceberg") + .load(tableLocation); + materialize(df); + }); } @Benchmark @@ -98,10 +103,12 @@ public void readFileSourceNonVectorized() { Map conf = Maps.newHashMap(); conf.put(SQLConf.ORC_VECTORIZED_READER_ENABLED().key(), "false"); conf.put(SQLConf.FILES_OPEN_COST_IN_BYTES().key(), Integer.toString(128 * 1024 * 1024)); - withSQLConf(conf, () -> { - Dataset df = spark().read().orc(dataLocation()); - materialize(df); - }); + withSQLConf( + conf, + () -> { + Dataset df = spark().read().orc(dataLocation()); + materialize(df); + }); } @Benchmark @@ -109,11 +116,14 @@ public void readFileSourceNonVectorized() { public void readWithProjectionIcebergNonVectorized() { Map tableProperties = Maps.newHashMap(); tableProperties.put(SPLIT_OPEN_FILE_COST, Integer.toString(128 * 1024 * 1024)); - withTableProperties(tableProperties, () -> { - String tableLocation = table().location(); - Dataset df = spark().read().format("iceberg").load(tableLocation).selectExpr("nested.col3"); - materialize(df); - }); + withTableProperties( + tableProperties, + () -> { + String tableLocation = table().location(); + Dataset df = + spark().read().format("iceberg").load(tableLocation).selectExpr("nested.col3"); + materialize(df); + }); } @Benchmark @@ -121,12 +131,19 @@ public void readWithProjectionIcebergNonVectorized() { public void readWithProjectionIcebergVectorized() { Map tableProperties = Maps.newHashMap(); tableProperties.put(SPLIT_OPEN_FILE_COST, Integer.toString(128 * 1024 * 1024)); - withTableProperties(tableProperties, () -> { - String tableLocation = table().location(); - Dataset df = spark().read().option(SparkReadOptions.VECTORIZATION_ENABLED, "true") - .format("iceberg").load(tableLocation).selectExpr("nested.col3"); - materialize(df); - }); + withTableProperties( + tableProperties, + () -> { + String tableLocation = table().location(); + Dataset df = + spark() + .read() + .option(SparkReadOptions.VECTORIZATION_ENABLED, "true") + .format("iceberg") + .load(tableLocation) + 
.selectExpr("nested.col3"); + materialize(df); + }); } @Benchmark @@ -135,27 +152,32 @@ public void readWithProjectionFileSourceNonVectorized() { Map conf = Maps.newHashMap(); conf.put(SQLConf.ORC_VECTORIZED_READER_ENABLED().key(), "false"); conf.put(SQLConf.FILES_OPEN_COST_IN_BYTES().key(), Integer.toString(128 * 1024 * 1024)); - withSQLConf(conf, () -> { - Dataset df = spark().read().orc(dataLocation()).selectExpr("nested.col3"); - materialize(df); - }); + withSQLConf( + conf, + () -> { + Dataset df = spark().read().orc(dataLocation()).selectExpr("nested.col3"); + materialize(df); + }); } private void appendData() { Map tableProperties = Maps.newHashMap(); tableProperties.put(DEFAULT_FILE_FORMAT, "orc"); - withTableProperties(tableProperties, () -> { - for (int fileNum = 0; fileNum < NUM_FILES; fileNum++) { - Dataset df = spark().range(NUM_ROWS) - .withColumn( - "nested", - struct( - expr("CAST(id AS string) AS col1"), - expr("CAST(id AS double) AS col2"), - lit(fileNum).cast("long").as("col3") - )); - appendAsFile(df); - } - }); + withTableProperties( + tableProperties, + () -> { + for (int fileNum = 0; fileNum < NUM_FILES; fileNum++) { + Dataset df = + spark() + .range(NUM_ROWS) + .withColumn( + "nested", + struct( + expr("CAST(id AS string) AS col1"), + expr("CAST(id AS double) AS col2"), + lit(fileNum).cast("long").as("col3"))); + appendAsFile(df); + } + }); } } diff --git a/spark/v3.2/spark/src/jmh/java/org/apache/iceberg/spark/source/parquet/IcebergSourceFlatParquetDataFilterBenchmark.java b/spark/v3.2/spark/src/jmh/java/org/apache/iceberg/spark/source/parquet/IcebergSourceFlatParquetDataFilterBenchmark.java index 879120e25b7b..50fa51e36ff3 100644 --- a/spark/v3.2/spark/src/jmh/java/org/apache/iceberg/spark/source/parquet/IcebergSourceFlatParquetDataFilterBenchmark.java +++ b/spark/v3.2/spark/src/jmh/java/org/apache/iceberg/spark/source/parquet/IcebergSourceFlatParquetDataFilterBenchmark.java @@ -16,9 +16,13 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.source.parquet; +import static org.apache.iceberg.TableProperties.SPLIT_OPEN_FILE_COST; +import static org.apache.spark.sql.functions.current_date; +import static org.apache.spark.sql.functions.date_add; +import static org.apache.spark.sql.functions.expr; + import java.io.IOException; import java.util.Map; import org.apache.iceberg.relocated.com.google.common.collect.Maps; @@ -31,21 +35,15 @@ import org.openjdk.jmh.annotations.TearDown; import org.openjdk.jmh.annotations.Threads; -import static org.apache.iceberg.TableProperties.SPLIT_OPEN_FILE_COST; -import static org.apache.spark.sql.functions.current_date; -import static org.apache.spark.sql.functions.date_add; -import static org.apache.spark.sql.functions.expr; - /** * A benchmark that evaluates the file skipping capabilities in the Spark data source for Iceberg. * - * This class uses a dataset with a flat schema, where the records are clustered according to the + *
<p>
    This class uses a dataset with a flat schema, where the records are clustered according to the * column used in the filter predicate. * - * The performance is compared to the built-in file source in Spark. + *
<p>
    The performance is compared to the built-in file source in Spark. * - * To run this benchmark for spark-3.2: - * + *
<p>
    To run this benchmark for spark-3.2: * ./gradlew -DsparkVersions=3.2 :iceberg-spark:iceberg-spark-3.2_2.12:jmh * -PjmhIncludeRegex=IcebergSourceFlatParquetDataFilterBenchmark * -PjmhOutputPath=benchmark/iceberg-source-flat-parquet-data-filter-benchmark-result.txt @@ -74,11 +72,14 @@ public void tearDownBenchmark() throws IOException { public void readWithFilterIceberg() { Map tableProperties = Maps.newHashMap(); tableProperties.put(SPLIT_OPEN_FILE_COST, Integer.toString(128 * 1024 * 1024)); - withTableProperties(tableProperties, () -> { - String tableLocation = table().location(); - Dataset df = spark().read().format("iceberg").load(tableLocation).filter(FILTER_COND); - materialize(df); - }); + withTableProperties( + tableProperties, + () -> { + String tableLocation = table().location(); + Dataset df = + spark().read().format("iceberg").load(tableLocation).filter(FILTER_COND); + materialize(df); + }); } @Benchmark @@ -87,10 +88,12 @@ public void readWithFilterFileSourceVectorized() { Map conf = Maps.newHashMap(); conf.put(SQLConf.PARQUET_VECTORIZED_READER_ENABLED().key(), "true"); conf.put(SQLConf.FILES_OPEN_COST_IN_BYTES().key(), Integer.toString(128 * 1024 * 1024)); - withSQLConf(conf, () -> { - Dataset df = spark().read().parquet(dataLocation()).filter(FILTER_COND); - materialize(df); - }); + withSQLConf( + conf, + () -> { + Dataset df = spark().read().parquet(dataLocation()).filter(FILTER_COND); + materialize(df); + }); } @Benchmark @@ -99,23 +102,27 @@ public void readWithFilterFileSourceNonVectorized() { Map conf = Maps.newHashMap(); conf.put(SQLConf.PARQUET_VECTORIZED_READER_ENABLED().key(), "false"); conf.put(SQLConf.FILES_OPEN_COST_IN_BYTES().key(), Integer.toString(128 * 1024 * 1024)); - withSQLConf(conf, () -> { - Dataset df = spark().read().parquet(dataLocation()).filter(FILTER_COND); - materialize(df); - }); + withSQLConf( + conf, + () -> { + Dataset df = spark().read().parquet(dataLocation()).filter(FILTER_COND); + materialize(df); + }); } private void appendData() { for (int fileNum = 1; fileNum < NUM_FILES; fileNum++) { - Dataset df = spark().range(NUM_ROWS) - .withColumnRenamed("id", "longCol") - .withColumn("intCol", expr("CAST(longCol AS INT)")) - .withColumn("floatCol", expr("CAST(longCol AS FLOAT)")) - .withColumn("doubleCol", expr("CAST(longCol AS DOUBLE)")) - .withColumn("decimalCol", expr("CAST(longCol AS DECIMAL(20, 5))")) - .withColumn("dateCol", date_add(current_date(), fileNum)) - .withColumn("timestampCol", expr("TO_TIMESTAMP(dateCol)")) - .withColumn("stringCol", expr("CAST(dateCol AS STRING)")); + Dataset df = + spark() + .range(NUM_ROWS) + .withColumnRenamed("id", "longCol") + .withColumn("intCol", expr("CAST(longCol AS INT)")) + .withColumn("floatCol", expr("CAST(longCol AS FLOAT)")) + .withColumn("doubleCol", expr("CAST(longCol AS DOUBLE)")) + .withColumn("decimalCol", expr("CAST(longCol AS DECIMAL(20, 5))")) + .withColumn("dateCol", date_add(current_date(), fileNum)) + .withColumn("timestampCol", expr("TO_TIMESTAMP(dateCol)")) + .withColumn("stringCol", expr("CAST(dateCol AS STRING)")); appendAsFile(df); } } diff --git a/spark/v3.2/spark/src/jmh/java/org/apache/iceberg/spark/source/parquet/IcebergSourceFlatParquetDataReadBenchmark.java b/spark/v3.2/spark/src/jmh/java/org/apache/iceberg/spark/source/parquet/IcebergSourceFlatParquetDataReadBenchmark.java index 1b47841833cd..87063092c840 100644 --- a/spark/v3.2/spark/src/jmh/java/org/apache/iceberg/spark/source/parquet/IcebergSourceFlatParquetDataReadBenchmark.java +++ 
b/spark/v3.2/spark/src/jmh/java/org/apache/iceberg/spark/source/parquet/IcebergSourceFlatParquetDataReadBenchmark.java @@ -16,9 +16,13 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.source.parquet; +import static org.apache.iceberg.TableProperties.SPLIT_OPEN_FILE_COST; +import static org.apache.spark.sql.functions.current_date; +import static org.apache.spark.sql.functions.date_add; +import static org.apache.spark.sql.functions.expr; + import java.io.IOException; import java.util.Map; import org.apache.iceberg.relocated.com.google.common.collect.Maps; @@ -31,17 +35,11 @@ import org.openjdk.jmh.annotations.TearDown; import org.openjdk.jmh.annotations.Threads; -import static org.apache.iceberg.TableProperties.SPLIT_OPEN_FILE_COST; -import static org.apache.spark.sql.functions.current_date; -import static org.apache.spark.sql.functions.date_add; -import static org.apache.spark.sql.functions.expr; - /** - * A benchmark that evaluates the performance of reading Parquet data with a flat schema - * using Iceberg and the built-in file source in Spark. + * A benchmark that evaluates the performance of reading Parquet data with a flat schema using + * Iceberg and the built-in file source in Spark. * - * To run this benchmark for spark-3.2: - * + *
<p>
    To run this benchmark for spark-3.2: * ./gradlew -DsparkVersions=3.2 :iceberg-spark:iceberg-spark-3.2_2.12:jmh * -PjmhIncludeRegex=IcebergSourceFlatParquetDataReadBenchmark * -PjmhOutputPath=benchmark/iceberg-source-flat-parquet-data-read-benchmark-result.txt @@ -69,11 +67,13 @@ public void tearDownBenchmark() throws IOException { public void readIceberg() { Map tableProperties = Maps.newHashMap(); tableProperties.put(SPLIT_OPEN_FILE_COST, Integer.toString(128 * 1024 * 1024)); - withTableProperties(tableProperties, () -> { - String tableLocation = table().location(); - Dataset df = spark().read().format("iceberg").load(tableLocation); - materialize(df); - }); + withTableProperties( + tableProperties, + () -> { + String tableLocation = table().location(); + Dataset df = spark().read().format("iceberg").load(tableLocation); + materialize(df); + }); } @Benchmark @@ -82,10 +82,12 @@ public void readFileSourceVectorized() { Map conf = Maps.newHashMap(); conf.put(SQLConf.PARQUET_VECTORIZED_READER_ENABLED().key(), "true"); conf.put(SQLConf.FILES_OPEN_COST_IN_BYTES().key(), Integer.toString(128 * 1024 * 1024)); - withSQLConf(conf, () -> { - Dataset df = spark().read().parquet(dataLocation()); - materialize(df); - }); + withSQLConf( + conf, + () -> { + Dataset df = spark().read().parquet(dataLocation()); + materialize(df); + }); } @Benchmark @@ -94,10 +96,12 @@ public void readFileSourceNonVectorized() { Map conf = Maps.newHashMap(); conf.put(SQLConf.PARQUET_VECTORIZED_READER_ENABLED().key(), "false"); conf.put(SQLConf.FILES_OPEN_COST_IN_BYTES().key(), Integer.toString(128 * 1024 * 1024)); - withSQLConf(conf, () -> { - Dataset df = spark().read().parquet(dataLocation()); - materialize(df); - }); + withSQLConf( + conf, + () -> { + Dataset df = spark().read().parquet(dataLocation()); + materialize(df); + }); } @Benchmark @@ -105,11 +109,13 @@ public void readFileSourceNonVectorized() { public void readWithProjectionIceberg() { Map tableProperties = Maps.newHashMap(); tableProperties.put(SPLIT_OPEN_FILE_COST, Integer.toString(128 * 1024 * 1024)); - withTableProperties(tableProperties, () -> { - String tableLocation = table().location(); - Dataset df = spark().read().format("iceberg").load(tableLocation).select("longCol"); - materialize(df); - }); + withTableProperties( + tableProperties, + () -> { + String tableLocation = table().location(); + Dataset df = spark().read().format("iceberg").load(tableLocation).select("longCol"); + materialize(df); + }); } @Benchmark @@ -118,10 +124,12 @@ public void readWithProjectionFileSourceVectorized() { Map conf = Maps.newHashMap(); conf.put(SQLConf.PARQUET_VECTORIZED_READER_ENABLED().key(), "true"); conf.put(SQLConf.FILES_OPEN_COST_IN_BYTES().key(), Integer.toString(128 * 1024 * 1024)); - withSQLConf(conf, () -> { - Dataset df = spark().read().parquet(dataLocation()).select("longCol"); - materialize(df); - }); + withSQLConf( + conf, + () -> { + Dataset df = spark().read().parquet(dataLocation()).select("longCol"); + materialize(df); + }); } @Benchmark @@ -130,23 +138,27 @@ public void readWithProjectionFileSourceNonVectorized() { Map conf = Maps.newHashMap(); conf.put(SQLConf.PARQUET_VECTORIZED_READER_ENABLED().key(), "false"); conf.put(SQLConf.FILES_OPEN_COST_IN_BYTES().key(), Integer.toString(128 * 1024 * 1024)); - withSQLConf(conf, () -> { - Dataset df = spark().read().parquet(dataLocation()).select("longCol"); - materialize(df); - }); + withSQLConf( + conf, + () -> { + Dataset df = spark().read().parquet(dataLocation()).select("longCol"); + 
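// Note on this read pattern: SPLIT_OPEN_FILE_COST is raised to 128 MB so that split planning
// treats every data file as expensive to open, which effectively keeps each file in its own
// read task instead of packing small files together; select("longCol") restricts the scan to a
// single projected column; and materialize(df) on the next line forces the lazy Dataset to be
// fully consumed so JMH measures the read path itself rather than only query planning.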
materialize(df); + }); } private void appendData() { for (int fileNum = 1; fileNum <= NUM_FILES; fileNum++) { - Dataset df = spark().range(NUM_ROWS) - .withColumnRenamed("id", "longCol") - .withColumn("intCol", expr("CAST(longCol AS INT)")) - .withColumn("floatCol", expr("CAST(longCol AS FLOAT)")) - .withColumn("doubleCol", expr("CAST(longCol AS DOUBLE)")) - .withColumn("decimalCol", expr("CAST(longCol AS DECIMAL(20, 5))")) - .withColumn("dateCol", date_add(current_date(), fileNum)) - .withColumn("timestampCol", expr("TO_TIMESTAMP(dateCol)")) - .withColumn("stringCol", expr("CAST(dateCol AS STRING)")); + Dataset df = + spark() + .range(NUM_ROWS) + .withColumnRenamed("id", "longCol") + .withColumn("intCol", expr("CAST(longCol AS INT)")) + .withColumn("floatCol", expr("CAST(longCol AS FLOAT)")) + .withColumn("doubleCol", expr("CAST(longCol AS DOUBLE)")) + .withColumn("decimalCol", expr("CAST(longCol AS DECIMAL(20, 5))")) + .withColumn("dateCol", date_add(current_date(), fileNum)) + .withColumn("timestampCol", expr("TO_TIMESTAMP(dateCol)")) + .withColumn("stringCol", expr("CAST(dateCol AS STRING)")); appendAsFile(df); } } diff --git a/spark/v3.2/spark/src/jmh/java/org/apache/iceberg/spark/source/parquet/IcebergSourceFlatParquetDataWriteBenchmark.java b/spark/v3.2/spark/src/jmh/java/org/apache/iceberg/spark/source/parquet/IcebergSourceFlatParquetDataWriteBenchmark.java index dcd6c96650a9..2e81556de017 100644 --- a/spark/v3.2/spark/src/jmh/java/org/apache/iceberg/spark/source/parquet/IcebergSourceFlatParquetDataWriteBenchmark.java +++ b/spark/v3.2/spark/src/jmh/java/org/apache/iceberg/spark/source/parquet/IcebergSourceFlatParquetDataWriteBenchmark.java @@ -16,9 +16,10 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.source.parquet; +import static org.apache.spark.sql.functions.expr; + import java.io.IOException; import java.util.Map; import org.apache.iceberg.relocated.com.google.common.collect.Maps; @@ -32,14 +33,11 @@ import org.openjdk.jmh.annotations.TearDown; import org.openjdk.jmh.annotations.Threads; -import static org.apache.spark.sql.functions.expr; - /** - * A benchmark that evaluates the performance of writing Parquet data with a flat schema - * using Iceberg and the built-in file source in Spark. + * A benchmark that evaluates the performance of writing Parquet data with a flat schema using + * Iceberg and the built-in file source in Spark. * - * To run this benchmark for spark-3.2: - * + *
<p>
    To run this benchmark for spark-3.2: * ./gradlew -DsparkVersions=3.2 :iceberg-spark:iceberg-spark-3.2_2.12:jmh * -PjmhIncludeRegex=IcebergSourceFlatParquetDataWriteBenchmark * -PjmhOutputPath=benchmark/iceberg-source-flat-parquet-data-write-benchmark-result.txt @@ -76,7 +74,8 @@ public void writeFileSource() { } private Dataset benchmarkData() { - return spark().range(NUM_ROWS) + return spark() + .range(NUM_ROWS) .withColumnRenamed("id", "longCol") .withColumn("intCol", expr("CAST(longCol AS INT)")) .withColumn("floatCol", expr("CAST(longCol AS FLOAT)")) diff --git a/spark/v3.2/spark/src/jmh/java/org/apache/iceberg/spark/source/parquet/IcebergSourceNestedListParquetDataWriteBenchmark.java b/spark/v3.2/spark/src/jmh/java/org/apache/iceberg/spark/source/parquet/IcebergSourceNestedListParquetDataWriteBenchmark.java index fefb1a843a22..4dc6b789ffdb 100644 --- a/spark/v3.2/spark/src/jmh/java/org/apache/iceberg/spark/source/parquet/IcebergSourceNestedListParquetDataWriteBenchmark.java +++ b/spark/v3.2/spark/src/jmh/java/org/apache/iceberg/spark/source/parquet/IcebergSourceNestedListParquetDataWriteBenchmark.java @@ -16,9 +16,12 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.source.parquet; +import static org.apache.spark.sql.functions.array_repeat; +import static org.apache.spark.sql.functions.expr; +import static org.apache.spark.sql.functions.struct; + import java.io.IOException; import java.util.Map; import org.apache.iceberg.relocated.com.google.common.collect.Maps; @@ -33,22 +36,18 @@ import org.openjdk.jmh.annotations.TearDown; import org.openjdk.jmh.annotations.Threads; -import static org.apache.spark.sql.functions.array_repeat; -import static org.apache.spark.sql.functions.expr; -import static org.apache.spark.sql.functions.struct; - /** - * A benchmark that evaluates the performance of writing nested Parquet data using Iceberg - * and the built-in file source in Spark. + * A benchmark that evaluates the performance of writing nested Parquet data using Iceberg and the + * built-in file source in Spark. * - * To run this benchmark for spark-3.2: - * + *
<p>
    To run this benchmark for spark-3.2: * ./gradlew -DsparkVersions=3.2 :iceberg-spark:iceberg-spark-3.2_2.12:jmh * -PjmhIncludeRegex=IcebergSourceNestedListParquetDataWriteBenchmark * -PjmhOutputPath=benchmark/iceberg-source-nested-list-parquet-data-write-benchmark-result.txt * */ -public class IcebergSourceNestedListParquetDataWriteBenchmark extends IcebergSourceNestedListDataBenchmark { +public class IcebergSourceNestedListParquetDataWriteBenchmark + extends IcebergSourceNestedListDataBenchmark { @Setup public void setupBenchmark() { @@ -80,10 +79,11 @@ public void writeFileSource() { } private Dataset benchmarkData() { - return spark().range(numRows) - .withColumn("outerlist", array_repeat(struct( - expr("array_repeat(CAST(id AS string), 1000) AS innerlist")), - 10)) + return spark() + .range(numRows) + .withColumn( + "outerlist", + array_repeat(struct(expr("array_repeat(CAST(id AS string), 1000) AS innerlist")), 10)) .coalesce(1); } } diff --git a/spark/v3.2/spark/src/jmh/java/org/apache/iceberg/spark/source/parquet/IcebergSourceNestedParquetDataFilterBenchmark.java b/spark/v3.2/spark/src/jmh/java/org/apache/iceberg/spark/source/parquet/IcebergSourceNestedParquetDataFilterBenchmark.java index e492f544d222..9d8c2fe111bc 100644 --- a/spark/v3.2/spark/src/jmh/java/org/apache/iceberg/spark/source/parquet/IcebergSourceNestedParquetDataFilterBenchmark.java +++ b/spark/v3.2/spark/src/jmh/java/org/apache/iceberg/spark/source/parquet/IcebergSourceNestedParquetDataFilterBenchmark.java @@ -16,9 +16,13 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.source.parquet; +import static org.apache.iceberg.TableProperties.SPLIT_OPEN_FILE_COST; +import static org.apache.spark.sql.functions.expr; +import static org.apache.spark.sql.functions.lit; +import static org.apache.spark.sql.functions.struct; + import java.io.IOException; import java.util.Map; import org.apache.iceberg.relocated.com.google.common.collect.Maps; @@ -31,27 +35,22 @@ import org.openjdk.jmh.annotations.TearDown; import org.openjdk.jmh.annotations.Threads; -import static org.apache.iceberg.TableProperties.SPLIT_OPEN_FILE_COST; -import static org.apache.spark.sql.functions.expr; -import static org.apache.spark.sql.functions.lit; -import static org.apache.spark.sql.functions.struct; - /** * A benchmark that evaluates the file skipping capabilities in the Spark data source for Iceberg. * - * This class uses a dataset with nested data, where the records are clustered according to the + *
<p>
    This class uses a dataset with nested data, where the records are clustered according to the * column used in the filter predicate. * - * The performance is compared to the built-in file source in Spark. + *
<p>
    The performance is compared to the built-in file source in Spark. * - * To run this benchmark for spark-3.2: - * + *
<p>
    To run this benchmark for spark-3.2: * ./gradlew -DsparkVersions=3.2 :iceberg-spark:iceberg-spark-3.2_2.12:jmh * -PjmhIncludeRegex=IcebergSourceNestedParquetDataFilterBenchmark * -PjmhOutputPath=benchmark/iceberg-source-nested-parquet-data-filter-benchmark-result.txt * */ -public class IcebergSourceNestedParquetDataFilterBenchmark extends IcebergSourceNestedDataBenchmark { +public class IcebergSourceNestedParquetDataFilterBenchmark + extends IcebergSourceNestedDataBenchmark { private static final String FILTER_COND = "nested.col3 == 0"; private static final int NUM_FILES = 500; @@ -74,11 +73,14 @@ public void tearDownBenchmark() throws IOException { public void readWithFilterIceberg() { Map tableProperties = Maps.newHashMap(); tableProperties.put(SPLIT_OPEN_FILE_COST, Integer.toString(128 * 1024 * 1024)); - withTableProperties(tableProperties, () -> { - String tableLocation = table().location(); - Dataset df = spark().read().format("iceberg").load(tableLocation).filter(FILTER_COND); - materialize(df); - }); + withTableProperties( + tableProperties, + () -> { + String tableLocation = table().location(); + Dataset df = + spark().read().format("iceberg").load(tableLocation).filter(FILTER_COND); + materialize(df); + }); } @Benchmark @@ -87,10 +89,12 @@ public void readWithFilterFileSourceVectorized() { Map conf = Maps.newHashMap(); conf.put(SQLConf.PARQUET_VECTORIZED_READER_ENABLED().key(), "true"); conf.put(SQLConf.FILES_OPEN_COST_IN_BYTES().key(), Integer.toString(128 * 1024 * 1024)); - withSQLConf(conf, () -> { - Dataset df = spark().read().parquet(dataLocation()).filter(FILTER_COND); - materialize(df); - }); + withSQLConf( + conf, + () -> { + Dataset df = spark().read().parquet(dataLocation()).filter(FILTER_COND); + materialize(df); + }); } @Benchmark @@ -99,22 +103,25 @@ public void readWithFilterFileSourceNonVectorized() { Map conf = Maps.newHashMap(); conf.put(SQLConf.PARQUET_VECTORIZED_READER_ENABLED().key(), "false"); conf.put(SQLConf.FILES_OPEN_COST_IN_BYTES().key(), Integer.toString(128 * 1024 * 1024)); - withSQLConf(conf, () -> { - Dataset df = spark().read().parquet(dataLocation()).filter(FILTER_COND); - materialize(df); - }); + withSQLConf( + conf, + () -> { + Dataset df = spark().read().parquet(dataLocation()).filter(FILTER_COND); + materialize(df); + }); } private void appendData() { for (int fileNum = 1; fileNum <= NUM_FILES; fileNum++) { - Dataset df = spark().range(NUM_ROWS) - .withColumn( - "nested", - struct( - expr("CAST(id AS string) AS col1"), - expr("CAST(id AS double) AS col2"), - lit(fileNum).cast("long").as("col3") - )); + Dataset df = + spark() + .range(NUM_ROWS) + .withColumn( + "nested", + struct( + expr("CAST(id AS string) AS col1"), + expr("CAST(id AS double) AS col2"), + lit(fileNum).cast("long").as("col3"))); appendAsFile(df); } } diff --git a/spark/v3.2/spark/src/jmh/java/org/apache/iceberg/spark/source/parquet/IcebergSourceNestedParquetDataReadBenchmark.java b/spark/v3.2/spark/src/jmh/java/org/apache/iceberg/spark/source/parquet/IcebergSourceNestedParquetDataReadBenchmark.java index 64264749c0ab..6c3a983fae42 100644 --- a/spark/v3.2/spark/src/jmh/java/org/apache/iceberg/spark/source/parquet/IcebergSourceNestedParquetDataReadBenchmark.java +++ b/spark/v3.2/spark/src/jmh/java/org/apache/iceberg/spark/source/parquet/IcebergSourceNestedParquetDataReadBenchmark.java @@ -16,9 +16,13 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.spark.source.parquet; +import static org.apache.iceberg.TableProperties.SPLIT_OPEN_FILE_COST; +import static org.apache.spark.sql.functions.expr; +import static org.apache.spark.sql.functions.lit; +import static org.apache.spark.sql.functions.struct; + import java.io.IOException; import java.util.Map; import org.apache.iceberg.relocated.com.google.common.collect.Maps; @@ -31,17 +35,11 @@ import org.openjdk.jmh.annotations.TearDown; import org.openjdk.jmh.annotations.Threads; -import static org.apache.iceberg.TableProperties.SPLIT_OPEN_FILE_COST; -import static org.apache.spark.sql.functions.expr; -import static org.apache.spark.sql.functions.lit; -import static org.apache.spark.sql.functions.struct; - /** - * A benchmark that evaluates the performance of reading nested Parquet data using Iceberg - * and the built-in file source in Spark. + * A benchmark that evaluates the performance of reading nested Parquet data using Iceberg and the + * built-in file source in Spark. * - * To run this benchmark for spark-3.2: - * + *
<p>
    To run this benchmark for spark-3.2: * ./gradlew -DsparkVersions=3.2 :iceberg-spark:iceberg-spark-3.2_2.12:jmh * -PjmhIncludeRegex=IcebergSourceNestedParquetDataReadBenchmark * -PjmhOutputPath=benchmark/iceberg-source-nested-parquet-data-read-benchmark-result.txt @@ -69,11 +67,13 @@ public void tearDownBenchmark() throws IOException { public void readIceberg() { Map tableProperties = Maps.newHashMap(); tableProperties.put(SPLIT_OPEN_FILE_COST, Integer.toString(128 * 1024 * 1024)); - withTableProperties(tableProperties, () -> { - String tableLocation = table().location(); - Dataset df = spark().read().format("iceberg").load(tableLocation); - materialize(df); - }); + withTableProperties( + tableProperties, + () -> { + String tableLocation = table().location(); + Dataset df = spark().read().format("iceberg").load(tableLocation); + materialize(df); + }); } @Benchmark @@ -82,10 +82,12 @@ public void readFileSourceVectorized() { Map conf = Maps.newHashMap(); conf.put(SQLConf.PARQUET_VECTORIZED_READER_ENABLED().key(), "true"); conf.put(SQLConf.FILES_OPEN_COST_IN_BYTES().key(), Integer.toString(128 * 1024 * 1024)); - withSQLConf(conf, () -> { - Dataset df = spark().read().parquet(dataLocation()); - materialize(df); - }); + withSQLConf( + conf, + () -> { + Dataset df = spark().read().parquet(dataLocation()); + materialize(df); + }); } @Benchmark @@ -94,10 +96,12 @@ public void readFileSourceNonVectorized() { Map conf = Maps.newHashMap(); conf.put(SQLConf.PARQUET_VECTORIZED_READER_ENABLED().key(), "false"); conf.put(SQLConf.FILES_OPEN_COST_IN_BYTES().key(), Integer.toString(128 * 1024 * 1024)); - withSQLConf(conf, () -> { - Dataset df = spark().read().parquet(dataLocation()); - materialize(df); - }); + withSQLConf( + conf, + () -> { + Dataset df = spark().read().parquet(dataLocation()); + materialize(df); + }); } @Benchmark @@ -105,11 +109,14 @@ public void readFileSourceNonVectorized() { public void readWithProjectionIceberg() { Map tableProperties = Maps.newHashMap(); tableProperties.put(SPLIT_OPEN_FILE_COST, Integer.toString(128 * 1024 * 1024)); - withTableProperties(tableProperties, () -> { - String tableLocation = table().location(); - Dataset df = spark().read().format("iceberg").load(tableLocation).selectExpr("nested.col3"); - materialize(df); - }); + withTableProperties( + tableProperties, + () -> { + String tableLocation = table().location(); + Dataset df = + spark().read().format("iceberg").load(tableLocation).selectExpr("nested.col3"); + materialize(df); + }); } @Benchmark @@ -119,10 +126,12 @@ public void readWithProjectionFileSourceVectorized() { conf.put(SQLConf.PARQUET_VECTORIZED_READER_ENABLED().key(), "true"); conf.put(SQLConf.FILES_OPEN_COST_IN_BYTES().key(), Integer.toString(128 * 1024 * 1024)); conf.put(SQLConf.NESTED_SCHEMA_PRUNING_ENABLED().key(), "true"); - withSQLConf(conf, () -> { - Dataset df = spark().read().parquet(dataLocation()).selectExpr("nested.col3"); - materialize(df); - }); + withSQLConf( + conf, + () -> { + Dataset df = spark().read().parquet(dataLocation()).selectExpr("nested.col3"); + materialize(df); + }); } @Benchmark @@ -132,22 +141,25 @@ public void readWithProjectionFileSourceNonVectorized() { conf.put(SQLConf.PARQUET_VECTORIZED_READER_ENABLED().key(), "false"); conf.put(SQLConf.FILES_OPEN_COST_IN_BYTES().key(), Integer.toString(128 * 1024 * 1024)); conf.put(SQLConf.NESTED_SCHEMA_PRUNING_ENABLED().key(), "true"); - withSQLConf(conf, () -> { - Dataset df = spark().read().parquet(dataLocation()).selectExpr("nested.col3"); - materialize(df); - }); + 
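// For the Spark file-source baselines here: PARQUET_VECTORIZED_READER_ENABLED toggles Spark's
// built-in vectorized Parquet reader, FILES_OPEN_COST_IN_BYTES is raised to 128 MB so that
// roughly one file ends up per input partition (mirroring SPLIT_OPEN_FILE_COST on the Iceberg
// side), and NESTED_SCHEMA_PRUNING_ENABLED lets Spark prune unused nested fields so that only
// nested.col3 is read from the Parquet files.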
withSQLConf( + conf, + () -> { + Dataset df = spark().read().parquet(dataLocation()).selectExpr("nested.col3"); + materialize(df); + }); } private void appendData() { for (int fileNum = 0; fileNum < NUM_FILES; fileNum++) { - Dataset df = spark().range(NUM_ROWS) - .withColumn( - "nested", - struct( - expr("CAST(id AS string) AS col1"), - expr("CAST(id AS double) AS col2"), - lit(fileNum).cast("long").as("col3") - )); + Dataset df = + spark() + .range(NUM_ROWS) + .withColumn( + "nested", + struct( + expr("CAST(id AS string) AS col1"), + expr("CAST(id AS double) AS col2"), + lit(fileNum).cast("long").as("col3"))); appendAsFile(df); } } diff --git a/spark/v3.2/spark/src/jmh/java/org/apache/iceberg/spark/source/parquet/IcebergSourceNestedParquetDataWriteBenchmark.java b/spark/v3.2/spark/src/jmh/java/org/apache/iceberg/spark/source/parquet/IcebergSourceNestedParquetDataWriteBenchmark.java index 42324795dccd..ca347d4190dd 100644 --- a/spark/v3.2/spark/src/jmh/java/org/apache/iceberg/spark/source/parquet/IcebergSourceNestedParquetDataWriteBenchmark.java +++ b/spark/v3.2/spark/src/jmh/java/org/apache/iceberg/spark/source/parquet/IcebergSourceNestedParquetDataWriteBenchmark.java @@ -16,9 +16,11 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.source.parquet; +import static org.apache.spark.sql.functions.expr; +import static org.apache.spark.sql.functions.struct; + import java.io.IOException; import java.util.Map; import org.apache.iceberg.relocated.com.google.common.collect.Maps; @@ -32,15 +34,11 @@ import org.openjdk.jmh.annotations.TearDown; import org.openjdk.jmh.annotations.Threads; -import static org.apache.spark.sql.functions.expr; -import static org.apache.spark.sql.functions.struct; - /** - * A benchmark that evaluates the performance of writing nested Parquet data using Iceberg - * and the built-in file source in Spark. + * A benchmark that evaluates the performance of writing nested Parquet data using Iceberg and the + * built-in file source in Spark. * - * To run this benchmark for spark-3.2: - * + *
<p>
    To run this benchmark for spark-3.2: * ./gradlew -DsparkVersions=3.2 :iceberg-spark:iceberg-spark-3.2_2.12:jmh * -PjmhIncludeRegex=IcebergSourceNestedParquetDataWriteBenchmark * -PjmhOutputPath=benchmark/iceberg-source-nested-parquet-data-write-benchmark-result.txt @@ -77,14 +75,14 @@ public void writeFileSource() { } private Dataset benchmarkData() { - return spark().range(NUM_ROWS) + return spark() + .range(NUM_ROWS) .withColumn( "nested", struct( expr("CAST(id AS string) AS col1"), expr("CAST(id AS double) AS col2"), - expr("id AS col3") - )) + expr("id AS col3"))) .coalesce(1); } } diff --git a/spark/v3.2/spark/src/jmh/java/org/apache/iceberg/spark/source/parquet/IcebergSourceParquetEqDeleteBenchmark.java b/spark/v3.2/spark/src/jmh/java/org/apache/iceberg/spark/source/parquet/IcebergSourceParquetEqDeleteBenchmark.java index 677aef7eb174..83301ca9102b 100644 --- a/spark/v3.2/spark/src/jmh/java/org/apache/iceberg/spark/source/parquet/IcebergSourceParquetEqDeleteBenchmark.java +++ b/spark/v3.2/spark/src/jmh/java/org/apache/iceberg/spark/source/parquet/IcebergSourceParquetEqDeleteBenchmark.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.source.parquet; import java.io.IOException; @@ -25,12 +24,10 @@ import org.openjdk.jmh.annotations.Param; /** - * A benchmark that evaluates the non-vectorized read and vectorized read with equality delete in the Spark data source - * for Iceberg. - *
<p>
    - * This class uses a dataset with a flat schema. - * To run this benchmark for spark-3.2: - * + * A benchmark that evaluates the non-vectorized read and vectorized read with equality delete in + * the Spark data source for Iceberg. + * + *
<p>
    This class uses a dataset with a flat schema. To run this benchmark for spark-3.2: * ./gradlew -DsparkVersions=3.2 :iceberg-spark:iceberg-spark-3.2:jmh * -PjmhIncludeRegex=IcebergSourceParquetEqDeleteBenchmark * -PjmhOutputPath=benchmark/iceberg-source-parquet-eq-delete-benchmark-result.txt diff --git a/spark/v3.2/spark/src/jmh/java/org/apache/iceberg/spark/source/parquet/IcebergSourceParquetMultiDeleteFileBenchmark.java b/spark/v3.2/spark/src/jmh/java/org/apache/iceberg/spark/source/parquet/IcebergSourceParquetMultiDeleteFileBenchmark.java index 3b1b4211174c..d963689ac529 100644 --- a/spark/v3.2/spark/src/jmh/java/org/apache/iceberg/spark/source/parquet/IcebergSourceParquetMultiDeleteFileBenchmark.java +++ b/spark/v3.2/spark/src/jmh/java/org/apache/iceberg/spark/source/parquet/IcebergSourceParquetMultiDeleteFileBenchmark.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.source.parquet; import java.io.IOException; @@ -26,12 +25,10 @@ import org.openjdk.jmh.annotations.Param; /** - * A benchmark that evaluates the non-vectorized read and vectorized read with pos-delete in the Spark data source for - * Iceberg. - *
<p>
    - * This class uses a dataset with a flat schema. - * To run this benchmark for spark-3.2: - * + * A benchmark that evaluates the non-vectorized read and vectorized read with pos-delete in the + * Spark data source for Iceberg. + * + *
<p>
    This class uses a dataset with a flat schema. To run this benchmark for spark-3.2: * ./gradlew -DsparkVersions=3.2 :iceberg-spark:iceberg-spark-3.2:jmh \ * -PjmhIncludeRegex=IcebergSourceParquetMultiDeleteFileBenchmark \ * -PjmhOutputPath=benchmark/iceberg-source-parquet-multi-delete-file-benchmark-result.txt diff --git a/spark/v3.2/spark/src/jmh/java/org/apache/iceberg/spark/source/parquet/IcebergSourceParquetPosDeleteBenchmark.java b/spark/v3.2/spark/src/jmh/java/org/apache/iceberg/spark/source/parquet/IcebergSourceParquetPosDeleteBenchmark.java index 5e36bad11f7a..d1f2d6de3c46 100644 --- a/spark/v3.2/spark/src/jmh/java/org/apache/iceberg/spark/source/parquet/IcebergSourceParquetPosDeleteBenchmark.java +++ b/spark/v3.2/spark/src/jmh/java/org/apache/iceberg/spark/source/parquet/IcebergSourceParquetPosDeleteBenchmark.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.source.parquet; import java.io.IOException; @@ -26,12 +25,10 @@ import org.openjdk.jmh.annotations.Param; /** - * A benchmark that evaluates the non-vectorized read and vectorized read with pos-delete in the Spark data source for - * Iceberg. - *
<p>
    - * This class uses a dataset with a flat schema. - * To run this benchmark for spark-3.2: - * + * A benchmark that evaluates the non-vectorized read and vectorized read with pos-delete in the + * Spark data source for Iceberg. + * + *
<p>
    This class uses a dataset with a flat schema. To run this benchmark for spark-3.2: * ./gradlew -DsparkVersions=3.2 :iceberg-spark:iceberg-spark-3.2:jmh * -PjmhIncludeRegex=IcebergSourceParquetPosDeleteBenchmark * -PjmhOutputPath=benchmark/iceberg-source-parquet-pos-delete-benchmark-result.txt diff --git a/spark/v3.2/spark/src/jmh/java/org/apache/iceberg/spark/source/parquet/IcebergSourceParquetWithUnrelatedDeleteBenchmark.java b/spark/v3.2/spark/src/jmh/java/org/apache/iceberg/spark/source/parquet/IcebergSourceParquetWithUnrelatedDeleteBenchmark.java index 5105c340518d..af079ce27890 100644 --- a/spark/v3.2/spark/src/jmh/java/org/apache/iceberg/spark/source/parquet/IcebergSourceParquetWithUnrelatedDeleteBenchmark.java +++ b/spark/v3.2/spark/src/jmh/java/org/apache/iceberg/spark/source/parquet/IcebergSourceParquetWithUnrelatedDeleteBenchmark.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.source.parquet; import java.io.IOException; @@ -26,12 +25,10 @@ import org.openjdk.jmh.annotations.Param; /** - * A benchmark that evaluates the non-vectorized read and vectorized read with pos-delete in the Spark data source for - * Iceberg. - *
<p>
    - * This class uses a dataset with a flat schema. - * To run this benchmark for spark-3.2: - * + * A benchmark that evaluates the non-vectorized read and vectorized read with pos-delete in the + * Spark data source for Iceberg. + * + *
<p>
    This class uses a dataset with a flat schema. To run this benchmark for spark-3.2: * ./gradlew -DsparkVersions=3.2 :iceberg-spark:iceberg-spark-3.2:jmh * -PjmhIncludeRegex=IcebergSourceParquetWithUnrelatedDeleteBenchmark * -PjmhOutputPath=benchmark/iceberg-source-parquet-with-unrelated-delete-benchmark-result.txt @@ -39,6 +36,7 @@ */ public class IcebergSourceParquetWithUnrelatedDeleteBenchmark extends IcebergSourceDeleteBenchmark { private static final double PERCENT_DELETE_ROW = 0.05; + @Param({"0", "0.05", "0.25", "0.5"}) private double percentUnrelatedDeletes; @@ -49,8 +47,12 @@ protected void appendData() throws IOException { table().refresh(); for (DataFile file : table().currentSnapshot().addedDataFiles(table().io())) { - writePosDeletesWithNoise(file.path(), NUM_ROWS, PERCENT_DELETE_ROW, - (int) (percentUnrelatedDeletes / PERCENT_DELETE_ROW), 1); + writePosDeletesWithNoise( + file.path(), + NUM_ROWS, + PERCENT_DELETE_ROW, + (int) (percentUnrelatedDeletes / PERCENT_DELETE_ROW), + 1); } } } diff --git a/spark/v3.2/spark/src/jmh/java/org/apache/iceberg/spark/source/parquet/ParquetWritersBenchmark.java b/spark/v3.2/spark/src/jmh/java/org/apache/iceberg/spark/source/parquet/ParquetWritersBenchmark.java index 3ba7541317d1..a9a7dcd76572 100644 --- a/spark/v3.2/spark/src/jmh/java/org/apache/iceberg/spark/source/parquet/ParquetWritersBenchmark.java +++ b/spark/v3.2/spark/src/jmh/java/org/apache/iceberg/spark/source/parquet/ParquetWritersBenchmark.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.source.parquet; import org.apache.iceberg.FileFormat; @@ -25,8 +24,7 @@ /** * A benchmark that evaluates the performance of various Iceberg writers for Parquet data. * - * To run this benchmark for spark 3.2: - * + *
<p>
    To run this benchmark for spark 3.2: * ./gradlew -DsparkVersions=3.2 :iceberg-spark:iceberg-spark-3.2_2.12:jmh \ * -PjmhIncludeRegex=ParquetWritersBenchmark \ * -PjmhOutputPath=benchmark/parquet-writers-benchmark-result.txt diff --git a/spark/v3.2/spark/src/jmh/java/org/apache/iceberg/spark/source/parquet/vectorized/VectorizedReadDictionaryEncodedFlatParquetDataBenchmark.java b/spark/v3.2/spark/src/jmh/java/org/apache/iceberg/spark/source/parquet/vectorized/VectorizedReadDictionaryEncodedFlatParquetDataBenchmark.java index cfc55150262b..97ec026a960f 100644 --- a/spark/v3.2/spark/src/jmh/java/org/apache/iceberg/spark/source/parquet/vectorized/VectorizedReadDictionaryEncodedFlatParquetDataBenchmark.java +++ b/spark/v3.2/spark/src/jmh/java/org/apache/iceberg/spark/source/parquet/vectorized/VectorizedReadDictionaryEncodedFlatParquetDataBenchmark.java @@ -16,9 +16,15 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.source.parquet.vectorized; +import static org.apache.spark.sql.functions.col; +import static org.apache.spark.sql.functions.date_add; +import static org.apache.spark.sql.functions.lit; +import static org.apache.spark.sql.functions.pmod; +import static org.apache.spark.sql.functions.to_date; +import static org.apache.spark.sql.functions.to_timestamp; + import java.math.BigDecimal; import java.math.BigInteger; import java.util.Map; @@ -32,32 +38,26 @@ import org.apache.spark.sql.types.DataTypes; import org.openjdk.jmh.annotations.Setup; -import static org.apache.spark.sql.functions.col; -import static org.apache.spark.sql.functions.date_add; -import static org.apache.spark.sql.functions.lit; -import static org.apache.spark.sql.functions.pmod; -import static org.apache.spark.sql.functions.to_date; -import static org.apache.spark.sql.functions.to_timestamp; - /** - * Benchmark to compare performance of reading Parquet dictionary encoded data with a flat schema using vectorized - * Iceberg read path and the built-in file source in Spark. - *
<p>
    - * To run this benchmark for spark-3.2: - * + * Benchmark to compare performance of reading Parquet dictionary encoded data with a flat schema + * using vectorized Iceberg read path and the built-in file source in Spark. + * + *
<p>
    To run this benchmark for spark-3.2: * ./gradlew -DsparkVersions=3.2 :iceberg-spark:iceberg-spark-3.2_2.12:jmh * -PjmhIncludeRegex=VectorizedReadDictionaryEncodedFlatParquetDataBenchmark * -PjmhOutputPath=benchmark/results.txt * */ -public class VectorizedReadDictionaryEncodedFlatParquetDataBenchmark extends VectorizedReadFlatParquetDataBenchmark { +public class VectorizedReadDictionaryEncodedFlatParquetDataBenchmark + extends VectorizedReadFlatParquetDataBenchmark { @Setup @Override public void setupBenchmark() { setupSpark(true); appendData(); - // Allow unsafe memory access to avoid the costly check arrow does to check if index is within bounds + // Allow unsafe memory access to avoid the costly check arrow does to check if index is within + // bounds System.setProperty("arrow.enable_unsafe_memory_access", "true"); // Disable expensive null check for every get(index) call. // Iceberg manages nullability checks itself instead of relying on arrow. @@ -83,9 +83,7 @@ void appendData() { df = withTimestampColumnDictEncoded(df); df = withStringColumnDictEncoded(df); df = df.drop("id"); - df.write().format("iceberg") - .mode(SaveMode.Append) - .save(table().location()); + df.write().format("iceberg").mode(SaveMode.Append).save(table().location()); } private static Column modColumn() { @@ -106,7 +104,6 @@ private static Dataset withIntColumnDictEncoded(Dataset df) { private static Dataset withFloatColumnDictEncoded(Dataset df) { return df.withColumn("floatCol", modColumn().cast(DataTypes.FloatType)); - } private static Dataset withDoubleColumnDictEncoded(Dataset df) { @@ -125,7 +122,8 @@ private static Dataset withDateColumnDictEncoded(Dataset df) { private static Dataset withTimestampColumnDictEncoded(Dataset df) { Column days = modColumn().cast(DataTypes.ShortType); - return df.withColumn("timestampCol", to_timestamp(date_add(to_date(lit("04/12/2019"), "MM/dd/yyyy"), days))); + return df.withColumn( + "timestampCol", to_timestamp(date_add(to_date(lit("04/12/2019"), "MM/dd/yyyy"), days))); } private static Dataset withStringColumnDictEncoded(Dataset df) { diff --git a/spark/v3.2/spark/src/jmh/java/org/apache/iceberg/spark/source/parquet/vectorized/VectorizedReadFlatParquetDataBenchmark.java b/spark/v3.2/spark/src/jmh/java/org/apache/iceberg/spark/source/parquet/vectorized/VectorizedReadFlatParquetDataBenchmark.java index 8b1f918912b0..c0f24364412f 100644 --- a/spark/v3.2/spark/src/jmh/java/org/apache/iceberg/spark/source/parquet/vectorized/VectorizedReadFlatParquetDataBenchmark.java +++ b/spark/v3.2/spark/src/jmh/java/org/apache/iceberg/spark/source/parquet/vectorized/VectorizedReadFlatParquetDataBenchmark.java @@ -16,9 +16,17 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.spark.source.parquet.vectorized; +import static org.apache.iceberg.types.Types.NestedField.optional; +import static org.apache.spark.sql.functions.col; +import static org.apache.spark.sql.functions.current_date; +import static org.apache.spark.sql.functions.date_add; +import static org.apache.spark.sql.functions.expr; +import static org.apache.spark.sql.functions.lit; +import static org.apache.spark.sql.functions.pmod; +import static org.apache.spark.sql.functions.when; + import java.io.IOException; import java.util.Map; import org.apache.hadoop.conf.Configuration; @@ -38,21 +46,11 @@ import org.openjdk.jmh.annotations.TearDown; import org.openjdk.jmh.annotations.Threads; -import static org.apache.iceberg.types.Types.NestedField.optional; -import static org.apache.spark.sql.functions.col; -import static org.apache.spark.sql.functions.current_date; -import static org.apache.spark.sql.functions.date_add; -import static org.apache.spark.sql.functions.expr; -import static org.apache.spark.sql.functions.lit; -import static org.apache.spark.sql.functions.pmod; -import static org.apache.spark.sql.functions.when; - /** - * Benchmark to compare performance of reading Parquet data with a flat schema using vectorized Iceberg read path and - * the built-in file source in Spark. - *
<p>
    - * To run this benchmark for spark-3.2: - * + * Benchmark to compare performance of reading Parquet data with a flat schema using vectorized + * Iceberg read path and the built-in file source in Spark. + * + *
<p>
    To run this benchmark for spark-3.2: * ./gradlew -DsparkVersions=3.2 :iceberg-spark:iceberg-spark-3.2_2.12:jmh * -PjmhIncludeRegex=VectorizedReadFlatParquetDataBenchmark * -PjmhOutputPath=benchmark/results.txt @@ -67,7 +65,8 @@ public class VectorizedReadFlatParquetDataBenchmark extends IcebergSourceBenchma public void setupBenchmark() { setupSpark(); appendData(); - // Allow unsafe memory access to avoid the costly check arrow does to check if index is within bounds + // Allow unsafe memory access to avoid the costly check arrow does to check if index is within + // bounds System.setProperty("arrow.enable_unsafe_memory_access", "true"); // Disable expensive null check for every get(index) call. // Iceberg manages nullability checks itself instead of relying on arrow. @@ -87,15 +86,16 @@ protected Configuration initHadoopConf() { @Override protected Table initTable() { - Schema schema = new Schema( - optional(1, "longCol", Types.LongType.get()), - optional(2, "intCol", Types.IntegerType.get()), - optional(3, "floatCol", Types.FloatType.get()), - optional(4, "doubleCol", Types.DoubleType.get()), - optional(5, "decimalCol", Types.DecimalType.of(20, 5)), - optional(6, "dateCol", Types.DateType.get()), - optional(7, "timestampCol", Types.TimestampType.withZone()), - optional(8, "stringCol", Types.StringType.get())); + Schema schema = + new Schema( + optional(1, "longCol", Types.LongType.get()), + optional(2, "intCol", Types.IntegerType.get()), + optional(3, "floatCol", Types.FloatType.get()), + optional(4, "doubleCol", Types.DoubleType.get()), + optional(5, "decimalCol", Types.DecimalType.of(20, 5)), + optional(6, "dateCol", Types.DateType.get()), + optional(7, "timestampCol", Types.TimestampType.withZone()), + optional(8, "stringCol", Types.StringType.get())); PartitionSpec partitionSpec = PartitionSpec.unpartitioned(); HadoopTables tables = new HadoopTables(hadoopConf()); Map properties = parquetWriteProps(); @@ -111,19 +111,20 @@ Map parquetWriteProps() { void appendData() { for (int fileNum = 1; fileNum <= NUM_FILES; fileNum++) { - Dataset df = spark().range(NUM_ROWS_PER_FILE) - .withColumn( - "longCol", - when(pmod(col("id"), lit(10)).equalTo(lit(0)), lit(null)) - .otherwise(col("id"))) - .drop("id") - .withColumn("intCol", expr("CAST(longCol AS INT)")) - .withColumn("floatCol", expr("CAST(longCol AS FLOAT)")) - .withColumn("doubleCol", expr("CAST(longCol AS DOUBLE)")) - .withColumn("decimalCol", expr("CAST(longCol AS DECIMAL(20, 5))")) - .withColumn("dateCol", date_add(current_date(), fileNum)) - .withColumn("timestampCol", expr("TO_TIMESTAMP(dateCol)")) - .withColumn("stringCol", expr("CAST(longCol AS STRING)")); + Dataset df = + spark() + .range(NUM_ROWS_PER_FILE) + .withColumn( + "longCol", + when(pmod(col("id"), lit(10)).equalTo(lit(0)), lit(null)).otherwise(col("id"))) + .drop("id") + .withColumn("intCol", expr("CAST(longCol AS INT)")) + .withColumn("floatCol", expr("CAST(longCol AS FLOAT)")) + .withColumn("doubleCol", expr("CAST(longCol AS DOUBLE)")) + .withColumn("decimalCol", expr("CAST(longCol AS DECIMAL(20, 5))")) + .withColumn("dateCol", date_add(current_date(), fileNum)) + .withColumn("timestampCol", expr("TO_TIMESTAMP(dateCol)")) + .withColumn("stringCol", expr("CAST(longCol AS STRING)")); appendAsFile(df); } } @@ -131,161 +132,189 @@ void appendData() { @Benchmark @Threads(1) public void readIntegersIcebergVectorized5k() { - withTableProperties(tablePropsWithVectorizationEnabled(5000), () -> { - String tableLocation = table().location(); - Dataset df = 
spark().read().format("iceberg") - .load(tableLocation).select("intCol"); - materialize(df); - }); + withTableProperties( + tablePropsWithVectorizationEnabled(5000), + () -> { + String tableLocation = table().location(); + Dataset df = spark().read().format("iceberg").load(tableLocation).select("intCol"); + materialize(df); + }); } @Benchmark @Threads(1) public void readIntegersSparkVectorized5k() { - withSQLConf(sparkConfWithVectorizationEnabled(5000), () -> { - Dataset df = spark().read().parquet(dataLocation()).select("intCol"); - materialize(df); - }); + withSQLConf( + sparkConfWithVectorizationEnabled(5000), + () -> { + Dataset df = spark().read().parquet(dataLocation()).select("intCol"); + materialize(df); + }); } @Benchmark @Threads(1) public void readLongsIcebergVectorized5k() { - withTableProperties(tablePropsWithVectorizationEnabled(5000), () -> { - String tableLocation = table().location(); - Dataset df = spark().read().format("iceberg") - .load(tableLocation).select("longCol"); - materialize(df); - }); + withTableProperties( + tablePropsWithVectorizationEnabled(5000), + () -> { + String tableLocation = table().location(); + Dataset df = spark().read().format("iceberg").load(tableLocation).select("longCol"); + materialize(df); + }); } @Benchmark @Threads(1) public void readLongsSparkVectorized5k() { - withSQLConf(sparkConfWithVectorizationEnabled(5000), () -> { - Dataset df = spark().read().parquet(dataLocation()).select("longCol"); - materialize(df); - }); + withSQLConf( + sparkConfWithVectorizationEnabled(5000), + () -> { + Dataset df = spark().read().parquet(dataLocation()).select("longCol"); + materialize(df); + }); } @Benchmark @Threads(1) public void readFloatsIcebergVectorized5k() { - withTableProperties(tablePropsWithVectorizationEnabled(5000), () -> { - String tableLocation = table().location(); - Dataset df = spark().read().format("iceberg") - .load(tableLocation).select("floatCol"); - materialize(df); - }); + withTableProperties( + tablePropsWithVectorizationEnabled(5000), + () -> { + String tableLocation = table().location(); + Dataset df = spark().read().format("iceberg").load(tableLocation).select("floatCol"); + materialize(df); + }); } @Benchmark @Threads(1) public void readFloatsSparkVectorized5k() { - withSQLConf(sparkConfWithVectorizationEnabled(5000), () -> { - Dataset df = spark().read().parquet(dataLocation()).select("floatCol"); - materialize(df); - }); + withSQLConf( + sparkConfWithVectorizationEnabled(5000), + () -> { + Dataset df = spark().read().parquet(dataLocation()).select("floatCol"); + materialize(df); + }); } @Benchmark @Threads(1) public void readDoublesIcebergVectorized5k() { - withTableProperties(tablePropsWithVectorizationEnabled(5000), () -> { - String tableLocation = table().location(); - Dataset df = spark().read().format("iceberg") - .load(tableLocation).select("doubleCol"); - materialize(df); - }); + withTableProperties( + tablePropsWithVectorizationEnabled(5000), + () -> { + String tableLocation = table().location(); + Dataset df = + spark().read().format("iceberg").load(tableLocation).select("doubleCol"); + materialize(df); + }); } @Benchmark @Threads(1) public void readDoublesSparkVectorized5k() { - withSQLConf(sparkConfWithVectorizationEnabled(5000), () -> { - Dataset df = spark().read().parquet(dataLocation()).select("doubleCol"); - materialize(df); - }); + withSQLConf( + sparkConfWithVectorizationEnabled(5000), + () -> { + Dataset df = spark().read().parquet(dataLocation()).select("doubleCol"); + materialize(df); + }); } @Benchmark 
@Threads(1) public void readDecimalsIcebergVectorized5k() { - withTableProperties(tablePropsWithVectorizationEnabled(5000), () -> { - String tableLocation = table().location(); - Dataset df = spark().read().format("iceberg") - .load(tableLocation).select("decimalCol"); - materialize(df); - }); + withTableProperties( + tablePropsWithVectorizationEnabled(5000), + () -> { + String tableLocation = table().location(); + Dataset df = + spark().read().format("iceberg").load(tableLocation).select("decimalCol"); + materialize(df); + }); } @Benchmark @Threads(1) public void readDecimalsSparkVectorized5k() { - withSQLConf(sparkConfWithVectorizationEnabled(5000), () -> { - Dataset df = spark().read().parquet(dataLocation()).select("decimalCol"); - materialize(df); - }); + withSQLConf( + sparkConfWithVectorizationEnabled(5000), + () -> { + Dataset df = spark().read().parquet(dataLocation()).select("decimalCol"); + materialize(df); + }); } @Benchmark @Threads(1) public void readDatesIcebergVectorized5k() { - withTableProperties(tablePropsWithVectorizationEnabled(5000), () -> { - String tableLocation = table().location(); - Dataset df = spark().read().format("iceberg") - .load(tableLocation).select("dateCol"); - materialize(df); - }); + withTableProperties( + tablePropsWithVectorizationEnabled(5000), + () -> { + String tableLocation = table().location(); + Dataset df = spark().read().format("iceberg").load(tableLocation).select("dateCol"); + materialize(df); + }); } @Benchmark @Threads(1) public void readDatesSparkVectorized5k() { - withSQLConf(sparkConfWithVectorizationEnabled(5000), () -> { - Dataset df = spark().read().parquet(dataLocation()).select("dateCol"); - materialize(df); - }); + withSQLConf( + sparkConfWithVectorizationEnabled(5000), + () -> { + Dataset df = spark().read().parquet(dataLocation()).select("dateCol"); + materialize(df); + }); } @Benchmark @Threads(1) public void readTimestampsIcebergVectorized5k() { - withTableProperties(tablePropsWithVectorizationEnabled(5000), () -> { - String tableLocation = table().location(); - Dataset df = spark().read().format("iceberg") - .load(tableLocation).select("timestampCol"); - materialize(df); - }); + withTableProperties( + tablePropsWithVectorizationEnabled(5000), + () -> { + String tableLocation = table().location(); + Dataset df = + spark().read().format("iceberg").load(tableLocation).select("timestampCol"); + materialize(df); + }); } @Benchmark @Threads(1) public void readTimestampsSparkVectorized5k() { - withSQLConf(sparkConfWithVectorizationEnabled(5000), () -> { - Dataset df = spark().read().parquet(dataLocation()).select("timestampCol"); - materialize(df); - }); + withSQLConf( + sparkConfWithVectorizationEnabled(5000), + () -> { + Dataset df = spark().read().parquet(dataLocation()).select("timestampCol"); + materialize(df); + }); } @Benchmark @Threads(1) public void readStringsIcebergVectorized5k() { - withTableProperties(tablePropsWithVectorizationEnabled(5000), () -> { - String tableLocation = table().location(); - Dataset df = spark().read().format("iceberg") - .load(tableLocation).select("stringCol"); - materialize(df); - }); + withTableProperties( + tablePropsWithVectorizationEnabled(5000), + () -> { + String tableLocation = table().location(); + Dataset df = + spark().read().format("iceberg").load(tableLocation).select("stringCol"); + materialize(df); + }); } @Benchmark @Threads(1) public void readStringsSparkVectorized5k() { - withSQLConf(sparkConfWithVectorizationEnabled(5000), () -> { - Dataset df = 
spark().read().parquet(dataLocation()).select("stringCol"); - materialize(df); - }); + withSQLConf( + sparkConfWithVectorizationEnabled(5000), + () -> { + Dataset df = spark().read().parquet(dataLocation()).select("stringCol"); + materialize(df); + }); } private static Map tablePropsWithVectorizationEnabled(int batchSize) { diff --git a/spark/v3.2/spark/src/main/java/org/apache/iceberg/spark/BaseCatalog.java b/spark/v3.2/spark/src/main/java/org/apache/iceberg/spark/BaseCatalog.java index 43447e346648..f215aa033c5a 100644 --- a/spark/v3.2/spark/src/main/java/org/apache/iceberg/spark/BaseCatalog.java +++ b/spark/v3.2/spark/src/main/java/org/apache/iceberg/spark/BaseCatalog.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark; import org.apache.iceberg.spark.procedures.SparkProcedures; @@ -29,14 +28,16 @@ import org.apache.spark.sql.connector.iceberg.catalog.Procedure; import org.apache.spark.sql.connector.iceberg.catalog.ProcedureCatalog; -abstract class BaseCatalog implements StagingTableCatalog, ProcedureCatalog, SupportsNamespaces, HasIcebergCatalog { +abstract class BaseCatalog + implements StagingTableCatalog, ProcedureCatalog, SupportsNamespaces, HasIcebergCatalog { @Override public Procedure loadProcedure(Identifier ident) throws NoSuchProcedureException { String[] namespace = ident.namespace(); String name = ident.name(); - // namespace resolution is case insensitive until we have a way to configure case sensitivity in catalogs + // namespace resolution is case insensitive until we have a way to configure case sensitivity in + // catalogs if (namespace.length == 1 && namespace[0].equalsIgnoreCase("system")) { ProcedureBuilder builder = SparkProcedures.newBuilder(name); if (builder != null) { diff --git a/spark/v3.2/spark/src/main/java/org/apache/iceberg/spark/CommitMetadata.java b/spark/v3.2/spark/src/main/java/org/apache/iceberg/spark/CommitMetadata.java index 58137250003a..641b957d1176 100644 --- a/spark/v3.2/spark/src/main/java/org/apache/iceberg/spark/CommitMetadata.java +++ b/spark/v3.2/spark/src/main/java/org/apache/iceberg/spark/CommitMetadata.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.spark; import java.util.Map; @@ -24,20 +23,18 @@ import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap; import org.apache.iceberg.util.ExceptionUtil; -/** - * utility class to accept thread local commit properties - */ +/** utility class to accept thread local commit properties */ public class CommitMetadata { - private CommitMetadata() { - - } + private CommitMetadata() {} - private static final ThreadLocal> COMMIT_PROPERTIES = ThreadLocal.withInitial(ImmutableMap::of); + private static final ThreadLocal> COMMIT_PROPERTIES = + ThreadLocal.withInitial(ImmutableMap::of); /** - * running the code wrapped as a caller, and any snapshot committed within the callable object will be attached with - * the metadata defined in properties + * running the code wrapped as a caller, and any snapshot committed within the callable object + * will be attached with the metadata defined in properties + * * @param properties extra commit metadata to attach to the snapshot committed within callable * @param callable the code to be executed * @param exClass the expected type of exception which would be thrown from callable diff --git a/spark/v3.2/spark/src/main/java/org/apache/iceberg/spark/ExtendedParser.java b/spark/v3.2/spark/src/main/java/org/apache/iceberg/spark/ExtendedParser.java index 4c7cdf229411..19b3dd8f49be 100644 --- a/spark/v3.2/spark/src/main/java/org/apache/iceberg/spark/ExtendedParser.java +++ b/spark/v3.2/spark/src/main/java/org/apache/iceberg/spark/ExtendedParser.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark; import java.util.List; @@ -58,10 +57,12 @@ static List parseSortOrder(SparkSession spark, String orderString try { return parser.parseSortOrder(orderString); } catch (AnalysisException e) { - throw new IllegalArgumentException(String.format("Unable to parse sortOrder: %s", orderString), e); + throw new IllegalArgumentException( + String.format("Unable to parse sortOrder: %s", orderString), e); } } else { - throw new IllegalStateException("Cannot parse order: parser is not an Iceberg ExtendedParser"); + throw new IllegalStateException( + "Cannot parse order: parser is not an Iceberg ExtendedParser"); } } diff --git a/spark/v3.2/spark/src/main/java/org/apache/iceberg/spark/FileRewriteCoordinator.java b/spark/v3.2/spark/src/main/java/org/apache/iceberg/spark/FileRewriteCoordinator.java index acd5f64d7ed6..210e861a4c16 100644 --- a/spark/v3.2/spark/src/main/java/org/apache/iceberg/spark/FileRewriteCoordinator.java +++ b/spark/v3.2/spark/src/main/java/org/apache/iceberg/spark/FileRewriteCoordinator.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark; import java.util.Map; @@ -39,22 +38,26 @@ public class FileRewriteCoordinator { private final Map, Set> resultMap = Maps.newConcurrentMap(); - private FileRewriteCoordinator() { - } + private FileRewriteCoordinator() {} public static FileRewriteCoordinator get() { return INSTANCE; } /** - * Called to persist the output of a rewrite action for a specific group. Since the write is done via a - * Spark Datasource, we have to propagate the result through this side-effect call. + * Called to persist the output of a rewrite action for a specific group. Since the write is done + * via a Spark Datasource, we have to propagate the result through this side-effect call. 
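// A rough sketch of the handshake that this coordinator and FileScanTaskSetManager (further
// below) support, using only methods visible in this diff; the rewrite action that drives the
// Spark write is assumed rather than shown:
//
//   FileScanTaskSetManager.get().stageTasks(table, fileSetID, tasksToRewrite);
//   // ... run the Spark write that reads the staged task set and produces new files ...
//   Set<DataFile> rewritten = FileRewriteCoordinator.get().fetchNewDataFiles(table, fileSetID);
//   // ... the action then commits a rewrite replacing the old files with `rewritten` ...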
+ * * @param table table where the rewrite is occurring * @param fileSetID the id used to identify the source set of files being rewritten * @param newDataFiles the new files which have been written */ public void stageRewrite(Table table, String fileSetID, Set newDataFiles) { - LOG.debug("Staging the output for {} - fileset {} with {} files", table.name(), fileSetID, newDataFiles.size()); + LOG.debug( + "Staging the output for {} - fileset {} with {} files", + table.name(), + fileSetID, + newDataFiles.size()); Pair id = toID(table, fileSetID); resultMap.put(id, newDataFiles); } @@ -62,9 +65,8 @@ public void stageRewrite(Table table, String fileSetID, Set newDataFil public Set fetchNewDataFiles(Table table, String fileSetID) { Pair id = toID(table, fileSetID); Set result = resultMap.get(id); - ValidationException.check(result != null, - "No results for rewrite of file set %s in table %s", - fileSetID, table); + ValidationException.check( + result != null, "No results for rewrite of file set %s in table %s", fileSetID, table); return result; } diff --git a/spark/v3.2/spark/src/main/java/org/apache/iceberg/spark/FileScanTaskSetManager.java b/spark/v3.2/spark/src/main/java/org/apache/iceberg/spark/FileScanTaskSetManager.java index 827b674ca16d..4b6da39905c1 100644 --- a/spark/v3.2/spark/src/main/java/org/apache/iceberg/spark/FileScanTaskSetManager.java +++ b/spark/v3.2/spark/src/main/java/org/apache/iceberg/spark/FileScanTaskSetManager.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark; import java.util.List; @@ -37,15 +36,15 @@ public class FileScanTaskSetManager { private final Map, List> tasksMap = Maps.newConcurrentMap(); - private FileScanTaskSetManager() { - } + private FileScanTaskSetManager() {} public static FileScanTaskSetManager get() { return INSTANCE; } public void stageTasks(Table table, String setID, List tasks) { - Preconditions.checkArgument(tasks != null && tasks.size() > 0, "Cannot stage null or empty tasks"); + Preconditions.checkArgument( + tasks != null && tasks.size() > 0, "Cannot stage null or empty tasks"); Pair id = toID(table, setID); tasksMap.put(id, tasks); } diff --git a/spark/v3.2/spark/src/main/java/org/apache/iceberg/spark/IcebergSpark.java b/spark/v3.2/spark/src/main/java/org/apache/iceberg/spark/IcebergSpark.java index e8889bc1fd01..094364d229b3 100644 --- a/spark/v3.2/spark/src/main/java/org/apache/iceberg/spark/IcebergSpark.java +++ b/spark/v3.2/spark/src/main/java/org/apache/iceberg/spark/IcebergSpark.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.spark; import org.apache.iceberg.transforms.Transform; @@ -27,22 +26,31 @@ import org.apache.spark.sql.types.DataTypes; public class IcebergSpark { - private IcebergSpark() { - } + private IcebergSpark() {} - public static void registerBucketUDF(SparkSession session, String funcName, DataType sourceType, int numBuckets) { + public static void registerBucketUDF( + SparkSession session, String funcName, DataType sourceType, int numBuckets) { SparkTypeToType typeConverter = new SparkTypeToType(); Type sourceIcebergType = typeConverter.atomic(sourceType); Transform bucket = Transforms.bucket(sourceIcebergType, numBuckets); - session.udf().register(funcName, - value -> bucket.apply(SparkValueConverter.convert(sourceIcebergType, value)), DataTypes.IntegerType); + session + .udf() + .register( + funcName, + value -> bucket.apply(SparkValueConverter.convert(sourceIcebergType, value)), + DataTypes.IntegerType); } - public static void registerTruncateUDF(SparkSession session, String funcName, DataType sourceType, int width) { + public static void registerTruncateUDF( + SparkSession session, String funcName, DataType sourceType, int width) { SparkTypeToType typeConverter = new SparkTypeToType(); Type sourceIcebergType = typeConverter.atomic(sourceType); Transform truncate = Transforms.truncate(sourceIcebergType, width); - session.udf().register(funcName, - value -> truncate.apply(SparkValueConverter.convert(sourceIcebergType, value)), sourceType); + session + .udf() + .register( + funcName, + value -> truncate.apply(SparkValueConverter.convert(sourceIcebergType, value)), + sourceType); } } diff --git a/spark/v3.2/spark/src/main/java/org/apache/iceberg/spark/JobGroupInfo.java b/spark/v3.2/spark/src/main/java/org/apache/iceberg/spark/JobGroupInfo.java index a35808fd8ce6..c0756d924e2f 100644 --- a/spark/v3.2/spark/src/main/java/org/apache/iceberg/spark/JobGroupInfo.java +++ b/spark/v3.2/spark/src/main/java/org/apache/iceberg/spark/JobGroupInfo.java @@ -16,13 +16,9 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark; -/** - * Captures information about the current job - * which is used for displaying on the UI - */ +/** Captures information about the current job which is used for displaying on the UI */ public class JobGroupInfo { private String groupId; private String description; diff --git a/spark/v3.2/spark/src/main/java/org/apache/iceberg/spark/JobGroupUtils.java b/spark/v3.2/spark/src/main/java/org/apache/iceberg/spark/JobGroupUtils.java index 155dce707701..dc8ba69d40a8 100644 --- a/spark/v3.2/spark/src/main/java/org/apache/iceberg/spark/JobGroupUtils.java +++ b/spark/v3.2/spark/src/main/java/org/apache/iceberg/spark/JobGroupUtils.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
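registerBucketUDF and registerTruncateUDF above expose Iceberg's bucket and truncate transforms as Spark SQL functions, which is handy for checking which partition a value falls into. A small usage sketch, using a made-up function name and a literal value so it runs without any table:

import org.apache.iceberg.spark.IcebergSpark;
import org.apache.spark.sql.SparkSession;
import org.apache.spark.sql.types.DataTypes;

public class BucketUdfSketch {
  public static void main(String[] args) {
    SparkSession spark =
        SparkSession.builder().appName("bucket-udf-sketch").master("local[*]").getOrCreate();

    // Registers a UDF that applies Iceberg's 16-way bucket transform to BIGINT values,
    // matching a bucket(16, id) partition spec.
    IcebergSpark.registerBucketUDF(spark, "iceberg_bucket16", DataTypes.LongType, 16);

    // Shows the bucket ordinal that the value 42 would be written into.
    spark.sql("SELECT iceberg_bucket16(CAST(42 AS BIGINT)) AS bucket").show();

    spark.stop();
  }
}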
*/ - package org.apache.iceberg.spark; import org.apache.spark.SparkContext; @@ -26,10 +25,10 @@ public class JobGroupUtils { private static final String JOB_GROUP_ID = SparkContext$.MODULE$.SPARK_JOB_GROUP_ID(); private static final String JOB_GROUP_DESC = SparkContext$.MODULE$.SPARK_JOB_DESCRIPTION(); - private static final String JOB_INTERRUPT_ON_CANCEL = SparkContext$.MODULE$.SPARK_JOB_INTERRUPT_ON_CANCEL(); + private static final String JOB_INTERRUPT_ON_CANCEL = + SparkContext$.MODULE$.SPARK_JOB_INTERRUPT_ON_CANCEL(); - private JobGroupUtils() { - } + private JobGroupUtils() {} public static JobGroupInfo getJobGroupInfo(SparkContext sparkContext) { String groupId = sparkContext.getLocalProperty(JOB_GROUP_ID); @@ -41,6 +40,7 @@ public static JobGroupInfo getJobGroupInfo(SparkContext sparkContext) { public static void setJobGroupInfo(SparkContext sparkContext, JobGroupInfo info) { sparkContext.setLocalProperty(JOB_GROUP_ID, info.groupId()); sparkContext.setLocalProperty(JOB_GROUP_DESC, info.description()); - sparkContext.setLocalProperty(JOB_INTERRUPT_ON_CANCEL, String.valueOf(info.interruptOnCancel())); + sparkContext.setLocalProperty( + JOB_INTERRUPT_ON_CANCEL, String.valueOf(info.interruptOnCancel())); } } diff --git a/spark/v3.2/spark/src/main/java/org/apache/iceberg/spark/PathIdentifier.java b/spark/v3.2/spark/src/main/java/org/apache/iceberg/spark/PathIdentifier.java index 235097ea46cc..110af6b87de5 100644 --- a/spark/v3.2/spark/src/main/java/org/apache/iceberg/spark/PathIdentifier.java +++ b/spark/v3.2/spark/src/main/java/org/apache/iceberg/spark/PathIdentifier.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark; import java.util.List; @@ -36,9 +35,10 @@ public PathIdentifier(String location) { this.location = location; List pathParts = SPLIT.splitToList(location); name = Iterables.getLast(pathParts); - namespace = pathParts.size() > 1 ? - new String[]{JOIN.join(pathParts.subList(0, pathParts.size() - 1))} : - new String[0]; + namespace = + pathParts.size() > 1 + ? new String[] {JOIN.join(pathParts.subList(0, pathParts.size() - 1))} + : new String[0]; } @Override diff --git a/spark/v3.2/spark/src/main/java/org/apache/iceberg/spark/PruneColumnsWithReordering.java b/spark/v3.2/spark/src/main/java/org/apache/iceberg/spark/PruneColumnsWithReordering.java index 3bdf984ed219..3c111d3b44cb 100644 --- a/spark/v3.2/spark/src/main/java/org/apache/iceberg/spark/PruneColumnsWithReordering.java +++ b/spark/v3.2/spark/src/main/java/org/apache/iceberg/spark/PruneColumnsWithReordering.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark; import java.util.List; @@ -70,7 +69,8 @@ public Type schema(Schema schema, Supplier structResult) { @Override public Type struct(Types.StructType struct, Iterable fieldResults) { - Preconditions.checkNotNull(struct, "Cannot prune null struct. Pruning must start with a schema."); + Preconditions.checkNotNull( + struct, "Cannot prune null struct. 
Pruning must start with a schema."); Preconditions.checkArgument(current instanceof StructType, "Not a struct: %s", current); StructType requestedStruct = (StructType) current; @@ -92,13 +92,13 @@ public Type struct(Types.StructType struct, Iterable fieldResults) { } else if (field.isOptional()) { changed = true; - projectedFields.put(field.name(), - Types.NestedField.optional(field.fieldId(), field.name(), type)); + projectedFields.put( + field.name(), Types.NestedField.optional(field.fieldId(), field.name(), type)); } else { changed = true; - projectedFields.put(field.name(), - Types.NestedField.required(field.fieldId(), field.name(), type)); + projectedFields.put( + field.name(), Types.NestedField.required(field.fieldId(), field.name(), type)); } } @@ -145,8 +145,10 @@ public Type field(Types.NestedField field, Supplier fieldResult) { int fieldIndex = requestedStruct.fieldIndex(field.name()); StructField requestedField = requestedStruct.fields()[fieldIndex]; - Preconditions.checkArgument(requestedField.nullable() || field.isRequired(), - "Cannot project an optional field as non-null: %s", field.name()); + Preconditions.checkArgument( + requestedField.nullable() || field.isRequired(), + "Cannot project an optional field as non-null: %s", + field.name()); this.current = requestedField.dataType(); try { @@ -164,8 +166,10 @@ public Type list(Types.ListType list, Supplier elementResult) { Preconditions.checkArgument(current instanceof ArrayType, "Not an array: %s", current); ArrayType requestedArray = (ArrayType) current; - Preconditions.checkArgument(requestedArray.containsNull() || !list.isElementOptional(), - "Cannot project an array of optional elements as required elements: %s", requestedArray); + Preconditions.checkArgument( + requestedArray.containsNull() || !list.isElementOptional(), + "Cannot project an array of optional elements as required elements: %s", + requestedArray); this.current = requestedArray.elementType(); try { @@ -190,10 +194,14 @@ public Type map(Types.MapType map, Supplier keyResult, Supplier valu Preconditions.checkArgument(current instanceof MapType, "Not a map: %s", current); MapType requestedMap = (MapType) current; - Preconditions.checkArgument(requestedMap.valueContainsNull() || !map.isValueOptional(), - "Cannot project a map of optional values as required values: %s", map); - Preconditions.checkArgument(StringType.class.isInstance(requestedMap.keyType()), - "Invalid map key type (not string): %s", requestedMap.keyType()); + Preconditions.checkArgument( + requestedMap.valueContainsNull() || !map.isValueOptional(), + "Cannot project a map of optional values as required values: %s", + map); + Preconditions.checkArgument( + StringType.class.isInstance(requestedMap.keyType()), + "Invalid map key type (not string): %s", + requestedMap.keyType()); this.current = requestedMap.valueType(); try { @@ -215,23 +223,32 @@ public Type map(Types.MapType map, Supplier keyResult, Supplier valu @Override public Type primitive(Type.PrimitiveType primitive) { Class expectedType = TYPES.get(primitive.typeId()); - Preconditions.checkArgument(expectedType != null && expectedType.isInstance(current), - "Cannot project %s to incompatible type: %s", primitive, current); + Preconditions.checkArgument( + expectedType != null && expectedType.isInstance(current), + "Cannot project %s to incompatible type: %s", + primitive, + current); // additional checks based on type switch (primitive.typeId()) { case DECIMAL: Types.DecimalType decimal = (Types.DecimalType) primitive; DecimalType 
requestedDecimal = (DecimalType) current; - Preconditions.checkArgument(requestedDecimal.scale() == decimal.scale(), - "Cannot project decimal with incompatible scale: %s != %s", requestedDecimal.scale(), decimal.scale()); - Preconditions.checkArgument(requestedDecimal.precision() >= decimal.precision(), + Preconditions.checkArgument( + requestedDecimal.scale() == decimal.scale(), + "Cannot project decimal with incompatible scale: %s != %s", + requestedDecimal.scale(), + decimal.scale()); + Preconditions.checkArgument( + requestedDecimal.precision() >= decimal.precision(), "Cannot project decimal with incompatible precision: %s < %s", - requestedDecimal.precision(), decimal.precision()); + requestedDecimal.precision(), + decimal.precision()); break; case TIMESTAMP: Types.TimestampType timestamp = (Types.TimestampType) primitive; - Preconditions.checkArgument(timestamp.shouldAdjustToUTC(), + Preconditions.checkArgument( + timestamp.shouldAdjustToUTC(), "Cannot project timestamp (without time zone) as timestamptz (with time zone)"); break; default: @@ -240,19 +257,19 @@ public Type primitive(Type.PrimitiveType primitive) { return primitive; } - private static final ImmutableMap> TYPES = ImmutableMap - .>builder() - .put(TypeID.BOOLEAN, BooleanType.class) - .put(TypeID.INTEGER, IntegerType.class) - .put(TypeID.LONG, LongType.class) - .put(TypeID.FLOAT, FloatType.class) - .put(TypeID.DOUBLE, DoubleType.class) - .put(TypeID.DATE, DateType.class) - .put(TypeID.TIMESTAMP, TimestampType.class) - .put(TypeID.DECIMAL, DecimalType.class) - .put(TypeID.UUID, StringType.class) - .put(TypeID.STRING, StringType.class) - .put(TypeID.FIXED, BinaryType.class) - .put(TypeID.BINARY, BinaryType.class) - .build(); + private static final ImmutableMap> TYPES = + ImmutableMap.>builder() + .put(TypeID.BOOLEAN, BooleanType.class) + .put(TypeID.INTEGER, IntegerType.class) + .put(TypeID.LONG, LongType.class) + .put(TypeID.FLOAT, FloatType.class) + .put(TypeID.DOUBLE, DoubleType.class) + .put(TypeID.DATE, DateType.class) + .put(TypeID.TIMESTAMP, TimestampType.class) + .put(TypeID.DECIMAL, DecimalType.class) + .put(TypeID.UUID, StringType.class) + .put(TypeID.STRING, StringType.class) + .put(TypeID.FIXED, BinaryType.class) + .put(TypeID.BINARY, BinaryType.class) + .build(); } diff --git a/spark/v3.2/spark/src/main/java/org/apache/iceberg/spark/PruneColumnsWithoutReordering.java b/spark/v3.2/spark/src/main/java/org/apache/iceberg/spark/PruneColumnsWithoutReordering.java index c6984e2fe8cd..61a215b938c5 100644 --- a/spark/v3.2/spark/src/main/java/org/apache/iceberg/spark/PruneColumnsWithoutReordering.java +++ b/spark/v3.2/spark/src/main/java/org/apache/iceberg/spark/PruneColumnsWithoutReordering.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark; import java.util.List; @@ -68,7 +67,8 @@ public Type schema(Schema schema, Supplier structResult) { @Override public Type struct(Types.StructType struct, Iterable fieldResults) { - Preconditions.checkNotNull(struct, "Cannot prune null struct. Pruning must start with a schema."); + Preconditions.checkNotNull( + struct, "Cannot prune null struct. 
Pruning must start with a schema."); Preconditions.checkArgument(current instanceof StructType, "Not a struct: %s", current); List fields = struct.fields(); @@ -120,8 +120,10 @@ public Type field(Types.NestedField field, Supplier fieldResult) { int fieldIndex = requestedStruct.fieldIndex(field.name()); StructField requestedField = requestedStruct.fields()[fieldIndex]; - Preconditions.checkArgument(requestedField.nullable() || field.isRequired(), - "Cannot project an optional field as non-null: %s", field.name()); + Preconditions.checkArgument( + requestedField.nullable() || field.isRequired(), + "Cannot project an optional field as non-null: %s", + field.name()); this.current = requestedField.dataType(); try { @@ -139,8 +141,10 @@ public Type list(Types.ListType list, Supplier elementResult) { Preconditions.checkArgument(current instanceof ArrayType, "Not an array: %s", current); ArrayType requestedArray = (ArrayType) current; - Preconditions.checkArgument(requestedArray.containsNull() || !list.isElementOptional(), - "Cannot project an array of optional elements as required elements: %s", requestedArray); + Preconditions.checkArgument( + requestedArray.containsNull() || !list.isElementOptional(), + "Cannot project an array of optional elements as required elements: %s", + requestedArray); this.current = requestedArray.elementType(); try { @@ -165,8 +169,10 @@ public Type map(Types.MapType map, Supplier keyResult, Supplier valu Preconditions.checkArgument(current instanceof MapType, "Not a map: %s", current); MapType requestedMap = (MapType) current; - Preconditions.checkArgument(requestedMap.valueContainsNull() || !map.isValueOptional(), - "Cannot project a map of optional values as required values: %s", map); + Preconditions.checkArgument( + requestedMap.valueContainsNull() || !map.isValueOptional(), + "Cannot project a map of optional values as required values: %s", + map); this.current = requestedMap.valueType(); try { @@ -188,19 +194,27 @@ public Type map(Types.MapType map, Supplier keyResult, Supplier valu @Override public Type primitive(Type.PrimitiveType primitive) { Class expectedType = TYPES.get(primitive.typeId()); - Preconditions.checkArgument(expectedType != null && expectedType.isInstance(current), - "Cannot project %s to incompatible type: %s", primitive, current); + Preconditions.checkArgument( + expectedType != null && expectedType.isInstance(current), + "Cannot project %s to incompatible type: %s", + primitive, + current); // additional checks based on type switch (primitive.typeId()) { case DECIMAL: Types.DecimalType decimal = (Types.DecimalType) primitive; DecimalType requestedDecimal = (DecimalType) current; - Preconditions.checkArgument(requestedDecimal.scale() == decimal.scale(), - "Cannot project decimal with incompatible scale: %s != %s", requestedDecimal.scale(), decimal.scale()); - Preconditions.checkArgument(requestedDecimal.precision() >= decimal.precision(), + Preconditions.checkArgument( + requestedDecimal.scale() == decimal.scale(), + "Cannot project decimal with incompatible scale: %s != %s", + requestedDecimal.scale(), + decimal.scale()); + Preconditions.checkArgument( + requestedDecimal.precision() >= decimal.precision(), "Cannot project decimal with incompatible precision: %s < %s", - requestedDecimal.precision(), decimal.precision()); + requestedDecimal.precision(), + decimal.precision()); break; default: } @@ -208,19 +222,19 @@ public Type primitive(Type.PrimitiveType primitive) { return primitive; } - private static final ImmutableMap> TYPES = 
ImmutableMap - .>builder() - .put(TypeID.BOOLEAN, BooleanType.class) - .put(TypeID.INTEGER, IntegerType.class) - .put(TypeID.LONG, LongType.class) - .put(TypeID.FLOAT, FloatType.class) - .put(TypeID.DOUBLE, DoubleType.class) - .put(TypeID.DATE, DateType.class) - .put(TypeID.TIMESTAMP, TimestampType.class) - .put(TypeID.DECIMAL, DecimalType.class) - .put(TypeID.UUID, StringType.class) - .put(TypeID.STRING, StringType.class) - .put(TypeID.FIXED, BinaryType.class) - .put(TypeID.BINARY, BinaryType.class) - .build(); + private static final ImmutableMap> TYPES = + ImmutableMap.>builder() + .put(TypeID.BOOLEAN, BooleanType.class) + .put(TypeID.INTEGER, IntegerType.class) + .put(TypeID.LONG, LongType.class) + .put(TypeID.FLOAT, FloatType.class) + .put(TypeID.DOUBLE, DoubleType.class) + .put(TypeID.DATE, DateType.class) + .put(TypeID.TIMESTAMP, TimestampType.class) + .put(TypeID.DECIMAL, DecimalType.class) + .put(TypeID.UUID, StringType.class) + .put(TypeID.STRING, StringType.class) + .put(TypeID.FIXED, BinaryType.class) + .put(TypeID.BINARY, BinaryType.class) + .build(); } diff --git a/spark/v3.2/spark/src/main/java/org/apache/iceberg/spark/RollbackStagedTable.java b/spark/v3.2/spark/src/main/java/org/apache/iceberg/spark/RollbackStagedTable.java index a27d06e7a1d7..bc8a966488ee 100644 --- a/spark/v3.2/spark/src/main/java/org/apache/iceberg/spark/RollbackStagedTable.java +++ b/spark/v3.2/spark/src/main/java/org/apache/iceberg/spark/RollbackStagedTable.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark; import java.util.Map; @@ -41,22 +40,25 @@ /** * An implementation of StagedTable that mimics the behavior of Spark's non-atomic CTAS and RTAS. - *

- * A Spark catalog can implement StagingTableCatalog to support atomic operations by producing StagedTable. But if a - * catalog implements StagingTableCatalog, Spark expects the catalog to be able to produce a StagedTable for any table - * loaded by the catalog. This assumption doesn't always work, as in the case of {@link SparkSessionCatalog}, which - * supports atomic operations can produce a StagedTable for Iceberg tables, but wraps the session catalog and cannot - * necessarily produce a working StagedTable implementation for tables that it loads. - * <p>
- * The work-around is this class, which implements the StagedTable interface but does not have atomic behavior. Instead, - * the StagedTable interface is used to implement the behavior of the non-atomic SQL plans that will create a table, - * write, and will drop the table to roll back. - * <p>
- * This StagedTable implements SupportsRead, SupportsWrite, and SupportsDelete by passing the calls to the real table. - * Implementing those interfaces is safe because Spark will not use them unless the table supports them and returns the - * corresponding capabilities from {@link #capabilities()}. + *
+ * <p>A Spark catalog can implement StagingTableCatalog to support atomic operations by producing + * StagedTable. But if a catalog implements StagingTableCatalog, Spark expects the catalog to be + * able to produce a StagedTable for any table loaded by the catalog. This assumption doesn't always + * work, as in the case of {@link SparkSessionCatalog}, which supports atomic operations can produce + * a StagedTable for Iceberg tables, but wraps the session catalog and cannot necessarily produce a + * working StagedTable implementation for tables that it loads. + *
+ * <p>The work-around is this class, which implements the StagedTable interface but does not have + * atomic behavior. Instead, the StagedTable interface is used to implement the behavior of the + * non-atomic SQL plans that will create a table, write, and will drop the table to roll back. + *
+ * <p>
    This StagedTable implements SupportsRead, SupportsWrite, and SupportsDelete by passing the + * calls to the real table. Implementing those interfaces is safe because Spark will not use them + * unless the table supports them and returns the corresponding capabilities from {@link + * #capabilities()}. */ -public class RollbackStagedTable implements StagedTable, SupportsRead, SupportsWrite, SupportsDelete { +public class RollbackStagedTable + implements StagedTable, SupportsRead, SupportsWrite, SupportsDelete { private final TableCatalog catalog; private final Identifier ident; private final Table table; @@ -119,19 +121,22 @@ public WriteBuilder newWriteBuilder(LogicalWriteInfo info) { } private void call(Class requiredClass, Consumer task) { - callReturning(requiredClass, inst -> { - task.accept(inst); - return null; - }); + callReturning( + requiredClass, + inst -> { + task.accept(inst); + return null; + }); } private R callReturning(Class requiredClass, Function task) { if (requiredClass.isInstance(table)) { return task.apply(requiredClass.cast(table)); } else { - throw new UnsupportedOperationException(String.format( - "Table does not implement %s: %s (%s)", - requiredClass.getSimpleName(), table.name(), table.getClass().getName())); + throw new UnsupportedOperationException( + String.format( + "Table does not implement %s: %s (%s)", + requiredClass.getSimpleName(), table.name(), table.getClass().getName())); } } } diff --git a/spark/v3.2/spark/src/main/java/org/apache/iceberg/spark/SortOrderToSpark.java b/spark/v3.2/spark/src/main/java/org/apache/iceberg/spark/SortOrderToSpark.java index 5677f83a95ee..52d68db2e4f9 100644 --- a/spark/v3.2/spark/src/main/java/org/apache/iceberg/spark/SortOrderToSpark.java +++ b/spark/v3.2/spark/src/main/java/org/apache/iceberg/spark/SortOrderToSpark.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.spark; import java.util.Map; @@ -38,46 +37,57 @@ class SortOrderToSpark implements SortOrderVisitor { @Override public SortOrder field(String sourceName, int id, SortDirection direction, NullOrder nullOrder) { - return Expressions.sort(Expressions.column(quotedName(id)), toSpark(direction), toSpark(nullOrder)); + return Expressions.sort( + Expressions.column(quotedName(id)), toSpark(direction), toSpark(nullOrder)); } @Override - public SortOrder bucket(String sourceName, int id, int width, SortDirection direction, NullOrder nullOrder) { - return Expressions.sort(Expressions.bucket(width, quotedName(id)), toSpark(direction), toSpark(nullOrder)); + public SortOrder bucket( + String sourceName, int id, int width, SortDirection direction, NullOrder nullOrder) { + return Expressions.sort( + Expressions.bucket(width, quotedName(id)), toSpark(direction), toSpark(nullOrder)); } @Override - public SortOrder truncate(String sourceName, int id, int width, SortDirection direction, NullOrder nullOrder) { - return Expressions.sort(Expressions.apply( - "truncate", Expressions.column(quotedName(id)), Expressions.literal(width)), - toSpark(direction), toSpark(nullOrder)); + public SortOrder truncate( + String sourceName, int id, int width, SortDirection direction, NullOrder nullOrder) { + return Expressions.sort( + Expressions.apply( + "truncate", Expressions.column(quotedName(id)), Expressions.literal(width)), + toSpark(direction), + toSpark(nullOrder)); } @Override public SortOrder year(String sourceName, int id, SortDirection direction, NullOrder nullOrder) { - return Expressions.sort(Expressions.years(quotedName(id)), toSpark(direction), toSpark(nullOrder)); + return Expressions.sort( + Expressions.years(quotedName(id)), toSpark(direction), toSpark(nullOrder)); } @Override public SortOrder month(String sourceName, int id, SortDirection direction, NullOrder nullOrder) { - return Expressions.sort(Expressions.months(quotedName(id)), toSpark(direction), toSpark(nullOrder)); + return Expressions.sort( + Expressions.months(quotedName(id)), toSpark(direction), toSpark(nullOrder)); } @Override public SortOrder day(String sourceName, int id, SortDirection direction, NullOrder nullOrder) { - return Expressions.sort(Expressions.days(quotedName(id)), toSpark(direction), toSpark(nullOrder)); + return Expressions.sort( + Expressions.days(quotedName(id)), toSpark(direction), toSpark(nullOrder)); } @Override public SortOrder hour(String sourceName, int id, SortDirection direction, NullOrder nullOrder) { - return Expressions.sort(Expressions.hours(quotedName(id)), toSpark(direction), toSpark(nullOrder)); + return Expressions.sort( + Expressions.hours(quotedName(id)), toSpark(direction), toSpark(nullOrder)); } private String quotedName(int id) { return quotedNameById.get(id); } - private org.apache.spark.sql.connector.expressions.SortDirection toSpark(SortDirection direction) { + private org.apache.spark.sql.connector.expressions.SortDirection toSpark( + SortDirection direction) { if (direction == SortDirection.ASC) { return org.apache.spark.sql.connector.expressions.SortDirection.ASCENDING; } else { @@ -89,4 +99,3 @@ private NullOrdering toSpark(NullOrder nullOrder) { return nullOrder == NullOrder.NULLS_FIRST ? 
NullOrdering.NULLS_FIRST : NullOrdering.NULLS_LAST; } } - diff --git a/spark/v3.2/spark/src/main/java/org/apache/iceberg/spark/Spark3Util.java b/spark/v3.2/spark/src/main/java/org/apache/iceberg/spark/Spark3Util.java index d472987957dd..94a86edd38e0 100644 --- a/spark/v3.2/spark/src/main/java/org/apache/iceberg/spark/Spark3Util.java +++ b/spark/v3.2/spark/src/main/java/org/apache/iceberg/spark/Spark3Util.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark; import java.nio.ByteBuffer; @@ -93,14 +92,14 @@ public class Spark3Util { - private static final Set RESERVED_PROPERTIES = ImmutableSet.of( - TableCatalog.PROP_LOCATION, TableCatalog.PROP_PROVIDER); + private static final Set RESERVED_PROPERTIES = + ImmutableSet.of(TableCatalog.PROP_LOCATION, TableCatalog.PROP_PROVIDER); private static final Joiner DOT = Joiner.on("."); - private Spark3Util() { - } + private Spark3Util() {} - public static CaseInsensitiveStringMap setOption(String key, String value, CaseInsensitiveStringMap options) { + public static CaseInsensitiveStringMap setOption( + String key, String value, CaseInsensitiveStringMap options) { Map newOptions = Maps.newHashMap(); newOptions.putAll(options); newOptions.put(key, value); @@ -134,7 +133,8 @@ public static Map rebuildCreateProperties(Map cr * @param changes a list of Spark table changes * @return the UpdateProperties operation configured with the changes */ - public static UpdateProperties applyPropertyChanges(UpdateProperties pendingUpdate, List changes) { + public static UpdateProperties applyPropertyChanges( + UpdateProperties pendingUpdate, List changes) { for (TableChange change : changes) { if (change instanceof TableChange.SetProperty) { TableChange.SetProperty set = (TableChange.SetProperty) change; @@ -159,7 +159,8 @@ public static UpdateProperties applyPropertyChanges(UpdateProperties pendingUpda * @param changes a list of Spark table changes * @return the UpdateSchema operation configured with the changes */ - public static UpdateSchema applySchemaChanges(UpdateSchema pendingUpdate, List changes) { + public static UpdateSchema applySchemaChanges( + UpdateSchema pendingUpdate, List changes) { for (TableChange change : changes) { if (change instanceof TableChange.AddColumn) { apply(pendingUpdate, (TableChange.AddColumn) change); @@ -167,8 +168,11 @@ public static UpdateSchema applySchemaChanges(UpdateSchema pendingUpdate, List quotedNameById = SparkSchemaUtil.indexQuotedNameById(spec.schema()); - List transforms = PartitionSpecVisitor.visit(spec, - new PartitionSpecVisitor() { - @Override - public Transform identity(String sourceName, int sourceId) { - return Expressions.identity(quotedName(sourceId)); - } - - @Override - public Transform bucket(String sourceName, int sourceId, int numBuckets) { - return Expressions.bucket(numBuckets, quotedName(sourceId)); - } - - @Override - public Transform truncate(String sourceName, int sourceId, int width) { - return Expressions.apply("truncate", Expressions.column(quotedName(sourceId)), Expressions.literal(width)); - } - - @Override - public Transform year(String sourceName, int sourceId) { - return Expressions.years(quotedName(sourceId)); - } - - @Override - public Transform month(String sourceName, int sourceId) { - return Expressions.months(quotedName(sourceId)); - } - - @Override - public Transform day(String sourceName, int sourceId) { - return Expressions.days(quotedName(sourceId)); - } - - @Override - public Transform hour(String 
sourceName, int sourceId) { - return Expressions.hours(quotedName(sourceId)); - } - - @Override - public Transform alwaysNull(int fieldId, String sourceName, int sourceId) { - // do nothing for alwaysNull, it doesn't need to be converted to a transform - return null; - } - - @Override - public Transform unknown(int fieldId, String sourceName, int sourceId, String transform) { - return Expressions.apply(transform, Expressions.column(quotedName(sourceId))); - } - - private String quotedName(int id) { - return quotedNameById.get(id); - } - }); + List transforms = + PartitionSpecVisitor.visit( + spec, + new PartitionSpecVisitor() { + @Override + public Transform identity(String sourceName, int sourceId) { + return Expressions.identity(quotedName(sourceId)); + } + + @Override + public Transform bucket(String sourceName, int sourceId, int numBuckets) { + return Expressions.bucket(numBuckets, quotedName(sourceId)); + } + + @Override + public Transform truncate(String sourceName, int sourceId, int width) { + return Expressions.apply( + "truncate", + Expressions.column(quotedName(sourceId)), + Expressions.literal(width)); + } + + @Override + public Transform year(String sourceName, int sourceId) { + return Expressions.years(quotedName(sourceId)); + } + + @Override + public Transform month(String sourceName, int sourceId) { + return Expressions.months(quotedName(sourceId)); + } + + @Override + public Transform day(String sourceName, int sourceId) { + return Expressions.days(quotedName(sourceId)); + } + + @Override + public Transform hour(String sourceName, int sourceId) { + return Expressions.hours(quotedName(sourceId)); + } + + @Override + public Transform alwaysNull(int fieldId, String sourceName, int sourceId) { + // do nothing for alwaysNull, it doesn't need to be converted to a transform + return null; + } + + @Override + public Transform unknown( + int fieldId, String sourceName, int sourceId, String transform) { + return Expressions.apply(transform, Expressions.column(quotedName(sourceId))); + } + + private String quotedName(int id) { + return quotedNameById.get(id); + } + }); return transforms.stream().filter(Objects::nonNull).toArray(Transform[]::new); } @@ -315,8 +332,10 @@ public static NamedReference toNamedReference(String name) { public static Term toIcebergTerm(Expression expr) { if (expr instanceof Transform) { Transform transform = (Transform) expr; - Preconditions.checkArgument("zorder".equals(transform.name()) || transform.references().length == 1, - "Cannot convert transform with more than one column reference: %s", transform); + Preconditions.checkArgument( + "zorder".equals(transform.name()) || transform.references().length == 1, + "Cannot convert transform with more than one column reference: %s", + transform); String colName = DOT.join(transform.references()[0].fieldNames()); switch (transform.name().toLowerCase(Locale.ROOT)) { case "identity": @@ -336,10 +355,11 @@ public static Term toIcebergTerm(Expression expr) { case "truncate": return org.apache.iceberg.expressions.Expressions.truncate(colName, findWidth(transform)); case "zorder": - return new Zorder(Stream.of(transform.references()) - .map(ref -> DOT.join(ref.fieldNames())) - .map(org.apache.iceberg.expressions.Expressions::ref) - .collect(Collectors.toList())); + return new Zorder( + Stream.of(transform.references()) + .map(ref -> DOT.join(ref.fieldNames())) + .map(org.apache.iceberg.expressions.Expressions::ref) + .collect(Collectors.toList())); default: throw new UnsupportedOperationException("Transform is not 
supported: " + transform); } @@ -367,8 +387,10 @@ public static PartitionSpec toPartitionSpec(Schema schema, Transform[] partition PartitionSpec.Builder builder = PartitionSpec.builderFor(schema); for (Transform transform : partitioning) { - Preconditions.checkArgument(transform.references().length == 1, - "Cannot convert transform with more than one column reference: %s", transform); + Preconditions.checkArgument( + transform.references().length == 1, + "Cannot convert transform with more than one column reference: %s", + transform); String colName = DOT.join(transform.references()[0].fieldNames()); switch (transform.name().toLowerCase(Locale.ROOT)) { case "identity": @@ -408,14 +430,16 @@ private static int findWidth(Transform transform) { if (expr instanceof Literal) { if (((Literal) expr).dataType() instanceof IntegerType) { Literal lit = (Literal) expr; - Preconditions.checkArgument(lit.value() > 0, - "Unsupported width for transform: %s", transform.describe()); + Preconditions.checkArgument( + lit.value() > 0, "Unsupported width for transform: %s", transform.describe()); return lit.value(); } else if (((Literal) expr).dataType() instanceof LongType) { Literal lit = (Literal) expr; - Preconditions.checkArgument(lit.value() > 0 && lit.value() < Integer.MAX_VALUE, - "Unsupported width for transform: %s", transform.describe()); + Preconditions.checkArgument( + lit.value() > 0 && lit.value() < Integer.MAX_VALUE, + "Unsupported width for transform: %s", + transform.describe()); if (lit.value() > Integer.MAX_VALUE) { throw new IllegalArgumentException(); } @@ -428,7 +452,8 @@ private static int findWidth(Transform transform) { } private static String leafName(String[] fieldNames) { - Preconditions.checkArgument(fieldNames.length > 0, "Invalid field name: at least one name is required"); + Preconditions.checkArgument( + fieldNames.length > 0, "Invalid field name: at least one name is required"); return fieldNames[fieldNames.length - 1]; } @@ -473,8 +498,7 @@ public static class DescribeSchemaVisitor extends TypeUtil.SchemaVisitor private static final Joiner COMMA = Joiner.on(','); private static final DescribeSchemaVisitor INSTANCE = new DescribeSchemaVisitor(); - private DescribeSchemaVisitor() { - } + private DescribeSchemaVisitor() {} @Override public String schema(Schema schema, String structResult) { @@ -534,11 +558,11 @@ public String primitive(Type.PrimitiveType primitive) { } } - private static class DescribeExpressionVisitor extends ExpressionVisitors.ExpressionVisitor { + private static class DescribeExpressionVisitor + extends ExpressionVisitors.ExpressionVisitor { private static final DescribeExpressionVisitor INSTANCE = new DescribeExpressionVisitor(); - private DescribeExpressionVisitor() { - } + private DescribeExpressionVisitor() {} @Override public String alwaysTrue() { @@ -607,7 +631,9 @@ public String predicate(UnboundPredicate pred) { } private static String sqlString(List> literals) { - return literals.stream().map(DescribeExpressionVisitor::sqlString).collect(Collectors.joining(", ")); + return literals.stream() + .map(DescribeExpressionVisitor::sqlString) + .collect(Collectors.joining(", ")); } private static String sqlString(org.apache.iceberg.expressions.Literal lit) { @@ -623,11 +649,12 @@ private static String sqlString(org.apache.iceberg.expressions.Literal lit) { } /** - * Returns an Iceberg Table by its name from a Spark V2 Catalog. 
If cache is enabled in {@link SparkCatalog}, - * the {@link TableOperations} of the table may be stale, please refresh the table to get the latest one. + * Returns an Iceberg Table by its name from a Spark V2 Catalog. If cache is enabled in {@link + * SparkCatalog}, the {@link TableOperations} of the table may be stale, please refresh the table + * to get the latest one. * * @param spark SparkSession used for looking up catalog references and tables - * @param name The multipart identifier of the Iceberg table + * @param name The multipart identifier of the Iceberg table * @return an Iceberg table */ public static org.apache.iceberg.Table loadIcebergTable(SparkSession spark, String name) @@ -641,38 +668,44 @@ public static org.apache.iceberg.Table loadIcebergTable(SparkSession spark, Stri /** * Returns the underlying Iceberg Catalog object represented by a Spark Catalog + * * @param spark SparkSession used for looking up catalog reference * @param catalogName The name of the Spark Catalog being referenced * @return the Iceberg catalog class being wrapped by the Spark Catalog */ public static Catalog loadIcebergCatalog(SparkSession spark, String catalogName) { CatalogPlugin catalogPlugin = spark.sessionState().catalogManager().catalog(catalogName); - Preconditions.checkArgument(catalogPlugin instanceof HasIcebergCatalog, - String.format("Cannot load Iceberg catalog from catalog %s because it does not contain an Iceberg Catalog. " + - "Actual Class: %s", + Preconditions.checkArgument( + catalogPlugin instanceof HasIcebergCatalog, + String.format( + "Cannot load Iceberg catalog from catalog %s because it does not contain an Iceberg Catalog. " + + "Actual Class: %s", catalogName, catalogPlugin.getClass().getName())); return ((HasIcebergCatalog) catalogPlugin).icebergCatalog(); } - - public static CatalogAndIdentifier catalogAndIdentifier(SparkSession spark, String name) throws ParseException { - return catalogAndIdentifier(spark, name, spark.sessionState().catalogManager().currentCatalog()); + public static CatalogAndIdentifier catalogAndIdentifier(SparkSession spark, String name) + throws ParseException { + return catalogAndIdentifier( + spark, name, spark.sessionState().catalogManager().currentCatalog()); } - public static CatalogAndIdentifier catalogAndIdentifier(SparkSession spark, String name, - CatalogPlugin defaultCatalog) throws ParseException { + public static CatalogAndIdentifier catalogAndIdentifier( + SparkSession spark, String name, CatalogPlugin defaultCatalog) throws ParseException { ParserInterface parser = spark.sessionState().sqlParser(); Seq multiPartIdentifier = parser.parseMultipartIdentifier(name).toIndexedSeq(); List javaMultiPartIdentifier = JavaConverters.seqAsJavaList(multiPartIdentifier); return catalogAndIdentifier(spark, javaMultiPartIdentifier, defaultCatalog); } - public static CatalogAndIdentifier catalogAndIdentifier(String description, SparkSession spark, String name) { - return catalogAndIdentifier(description, spark, name, spark.sessionState().catalogManager().currentCatalog()); + public static CatalogAndIdentifier catalogAndIdentifier( + String description, SparkSession spark, String name) { + return catalogAndIdentifier( + description, spark, name, spark.sessionState().catalogManager().currentCatalog()); } - public static CatalogAndIdentifier catalogAndIdentifier(String description, SparkSession spark, - String name, CatalogPlugin defaultCatalog) { + public static CatalogAndIdentifier catalogAndIdentifier( + String description, SparkSession spark, String 
name, CatalogPlugin defaultCatalog) { try { return catalogAndIdentifier(spark, name, defaultCatalog); } catch (ParseException e) { @@ -680,20 +713,23 @@ public static CatalogAndIdentifier catalogAndIdentifier(String description, Spar } } - public static CatalogAndIdentifier catalogAndIdentifier(SparkSession spark, List nameParts) { - return catalogAndIdentifier(spark, nameParts, spark.sessionState().catalogManager().currentCatalog()); + public static CatalogAndIdentifier catalogAndIdentifier( + SparkSession spark, List nameParts) { + return catalogAndIdentifier( + spark, nameParts, spark.sessionState().catalogManager().currentCatalog()); } /** - * A modified version of Spark's LookupCatalog.CatalogAndIdentifier.unapply - * Attempts to find the catalog and identifier a multipart identifier represents + * A modified version of Spark's LookupCatalog.CatalogAndIdentifier.unapply Attempts to find the + * catalog and identifier a multipart identifier represents + * * @param spark Spark session to use for resolution * @param nameParts Multipart identifier representing a table * @param defaultCatalog Catalog to use if none is specified * @return The CatalogPlugin and Identifier for the table */ - public static CatalogAndIdentifier catalogAndIdentifier(SparkSession spark, List nameParts, - CatalogPlugin defaultCatalog) { + public static CatalogAndIdentifier catalogAndIdentifier( + SparkSession spark, List nameParts, CatalogPlugin defaultCatalog) { CatalogManager catalogManager = spark.sessionState().catalogManager(); String[] currentNamespace; @@ -703,18 +739,19 @@ public static CatalogAndIdentifier catalogAndIdentifier(SparkSession spark, List currentNamespace = defaultCatalog.defaultNamespace(); } - Pair catalogIdentifier = SparkUtil.catalogAndIdentifier(nameParts, - catalogName -> { - try { - return catalogManager.catalog(catalogName); - } catch (Exception e) { - return null; - } - }, - Identifier::of, - defaultCatalog, - currentNamespace - ); + Pair catalogIdentifier = + SparkUtil.catalogAndIdentifier( + nameParts, + catalogName -> { + try { + return catalogManager.catalog(catalogName); + } catch (Exception e) { + return null; + } + }, + Identifier::of, + defaultCatalog, + currentNamespace); return new CatalogAndIdentifier(catalogIdentifier); } @@ -723,18 +760,17 @@ private static TableCatalog asTableCatalog(CatalogPlugin catalog) { return (TableCatalog) catalog; } - throw new IllegalArgumentException(String.format( - "Cannot use catalog %s(%s): not a TableCatalog", catalog.name(), catalog.getClass().getName())); + throw new IllegalArgumentException( + String.format( + "Cannot use catalog %s(%s): not a TableCatalog", + catalog.name(), catalog.getClass().getName())); } - /** - * This mimics a class inside of Spark which is private inside of LookupCatalog. - */ + /** This mimics a class inside of Spark which is private inside of LookupCatalog. 
*/ public static class CatalogAndIdentifier { private final CatalogPlugin catalog; private final Identifier identifier; - public CatalogAndIdentifier(CatalogPlugin catalog, Identifier identifier) { this.catalog = catalog; this.identifier = identifier; @@ -767,8 +803,8 @@ public static String quotedFullIdentifier(String catalogName, Identifier identif .build(); return CatalogV2Implicits.MultipartIdentifierHelper( - JavaConverters.asScalaIteratorConverter(parts.iterator()).asScala().toSeq() - ).quoted(); + JavaConverters.asScalaIteratorConverter(parts.iterator()).asScala().toSeq()) + .quoted(); } /** @@ -780,21 +816,21 @@ public static String quotedFullIdentifier(String catalogName, Identifier identif * @param partitionFilter partitionFilter of the file * @return all table's partitions */ - public static List getPartitions(SparkSession spark, Path rootPath, String format, - Map partitionFilter) { + public static List getPartitions( + SparkSession spark, Path rootPath, String format, Map partitionFilter) { FileStatusCache fileStatusCache = FileStatusCache.getOrCreate(spark); - InMemoryFileIndex fileIndex = new InMemoryFileIndex( - spark, - JavaConverters - .collectionAsScalaIterableConverter(ImmutableList.of(rootPath)) - .asScala() - .toSeq(), + InMemoryFileIndex fileIndex = + new InMemoryFileIndex( + spark, + JavaConverters.collectionAsScalaIterableConverter(ImmutableList.of(rootPath)) + .asScala() + .toSeq(), scala.collection.immutable.Map$.MODULE$.empty(), - Option.empty(), - fileStatusCache, - Option.empty(), - Option.empty()); + Option.empty(), + fileStatusCache, + Option.empty(), + Option.empty()); org.apache.spark.sql.execution.datasources.PartitionSpec spec = fileIndex.partitionSpec(); StructType schema = spec.partitionColumns(); @@ -814,31 +850,38 @@ public static List getPartitions(SparkSession spark, Path rootPa Seq filteredPartitions = fileIndex.listFiles(scalaPartitionFilters, scalaDataFilters).toIndexedSeq(); - return JavaConverters - .seqAsJavaListConverter(filteredPartitions) - .asJava() - .stream() - .map(partition -> { - Map values = Maps.newHashMap(); - JavaConverters.asJavaIterableConverter(schema).asJava().forEach(field -> { - int fieldIndex = schema.fieldIndex(field.name()); - Object catalystValue = partition.values().get(fieldIndex, field.dataType()); - Object value = CatalystTypeConverters.convertToScala(catalystValue, field.dataType()); - values.put(field.name(), String.valueOf(value)); - }); - - FileStatus fileStatus = - JavaConverters.seqAsJavaListConverter(partition.files()).asJava().get(0); - - return new SparkPartition(values, fileStatus.getPath().getParent().toString(), format); - }).collect(Collectors.toList()); - } - - public static org.apache.spark.sql.catalyst.TableIdentifier toV1TableIdentifier(Identifier identifier) { + return JavaConverters.seqAsJavaListConverter(filteredPartitions).asJava().stream() + .map( + partition -> { + Map values = Maps.newHashMap(); + JavaConverters.asJavaIterableConverter(schema) + .asJava() + .forEach( + field -> { + int fieldIndex = schema.fieldIndex(field.name()); + Object catalystValue = partition.values().get(fieldIndex, field.dataType()); + Object value = + CatalystTypeConverters.convertToScala(catalystValue, field.dataType()); + values.put(field.name(), String.valueOf(value)); + }); + + FileStatus fileStatus = + JavaConverters.seqAsJavaListConverter(partition.files()).asJava().get(0); + + return new SparkPartition( + values, fileStatus.getPath().getParent().toString(), format); + }) + .collect(Collectors.toList()); + } + 
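The Spark3Util helpers reformatted above are also the simplest programmatic entry points into a configured catalog: catalogAndIdentifier resolves a multipart name against Spark's catalog manager, and loadIcebergTable returns the underlying Iceberg table. A brief sketch, assuming a catalog named demo with a db.events table (both placeholders); declaring throws Exception sidesteps the parse and not-found exceptions these methods can raise.

import org.apache.iceberg.Table;
import org.apache.iceberg.spark.Spark3Util;
import org.apache.spark.sql.SparkSession;

public class Spark3UtilSketch {
  static void inspect(SparkSession spark) throws Exception {
    // Resolves the catalog plugin and the identifier within it for a multipart name.
    Spark3Util.CatalogAndIdentifier catalogAndIdent =
        Spark3Util.catalogAndIdentifier(spark, "demo.db.events");
    System.out.println("catalog: " + catalogAndIdent.catalog().name());
    System.out.println("identifier: " + catalogAndIdent.identifier());

    // Loads the underlying Iceberg table; with caching enabled the metadata can be stale,
    // so refresh before inspecting the current snapshot.
    Table table = Spark3Util.loadIcebergTable(spark, "demo.db.events");
    table.refresh();
    System.out.println("current snapshot: " + table.currentSnapshot());
  }
}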
+ public static org.apache.spark.sql.catalyst.TableIdentifier toV1TableIdentifier( + Identifier identifier) { String[] namespace = identifier.namespace(); - Preconditions.checkArgument(namespace.length <= 1, - "Cannot convert %s to a Spark v1 identifier, namespace contains more than 1 part", identifier); + Preconditions.checkArgument( + namespace.length <= 1, + "Cannot convert %s to a Spark v1 identifier, namespace contains more than 1 part", + identifier); String table = identifier.name(); Option database = namespace.length == 1 ? Option.apply(namespace[0]) : Option.empty(); @@ -848,54 +891,80 @@ public static org.apache.spark.sql.catalyst.TableIdentifier toV1TableIdentifier( private static class DescribeSortOrderVisitor implements SortOrderVisitor { private static final DescribeSortOrderVisitor INSTANCE = new DescribeSortOrderVisitor(); - private DescribeSortOrderVisitor() { - } + private DescribeSortOrderVisitor() {} @Override - public String field(String sourceName, int sourceId, - org.apache.iceberg.SortDirection direction, NullOrder nullOrder) { + public String field( + String sourceName, + int sourceId, + org.apache.iceberg.SortDirection direction, + NullOrder nullOrder) { return String.format("%s %s %s", sourceName, direction, nullOrder); } @Override - public String bucket(String sourceName, int sourceId, int numBuckets, - org.apache.iceberg.SortDirection direction, NullOrder nullOrder) { + public String bucket( + String sourceName, + int sourceId, + int numBuckets, + org.apache.iceberg.SortDirection direction, + NullOrder nullOrder) { return String.format("bucket(%s, %s) %s %s", numBuckets, sourceName, direction, nullOrder); } @Override - public String truncate(String sourceName, int sourceId, int width, - org.apache.iceberg.SortDirection direction, NullOrder nullOrder) { + public String truncate( + String sourceName, + int sourceId, + int width, + org.apache.iceberg.SortDirection direction, + NullOrder nullOrder) { return String.format("truncate(%s, %s) %s %s", sourceName, width, direction, nullOrder); } @Override - public String year(String sourceName, int sourceId, - org.apache.iceberg.SortDirection direction, NullOrder nullOrder) { + public String year( + String sourceName, + int sourceId, + org.apache.iceberg.SortDirection direction, + NullOrder nullOrder) { return String.format("years(%s) %s %s", sourceName, direction, nullOrder); } @Override - public String month(String sourceName, int sourceId, - org.apache.iceberg.SortDirection direction, NullOrder nullOrder) { + public String month( + String sourceName, + int sourceId, + org.apache.iceberg.SortDirection direction, + NullOrder nullOrder) { return String.format("months(%s) %s %s", sourceName, direction, nullOrder); } @Override - public String day(String sourceName, int sourceId, - org.apache.iceberg.SortDirection direction, NullOrder nullOrder) { + public String day( + String sourceName, + int sourceId, + org.apache.iceberg.SortDirection direction, + NullOrder nullOrder) { return String.format("days(%s) %s %s", sourceName, direction, nullOrder); } @Override - public String hour(String sourceName, int sourceId, - org.apache.iceberg.SortDirection direction, NullOrder nullOrder) { + public String hour( + String sourceName, + int sourceId, + org.apache.iceberg.SortDirection direction, + NullOrder nullOrder) { return String.format("hours(%s) %s %s", sourceName, direction, nullOrder); } @Override - public String unknown(String sourceName, int sourceId, String transform, - org.apache.iceberg.SortDirection direction, NullOrder 
nullOrder) { + public String unknown( + String sourceName, + int sourceId, + String transform, + org.apache.iceberg.SortDirection direction, + NullOrder nullOrder) { return String.format("%s(%s) %s %s", transform, sourceName, direction, nullOrder); } } diff --git a/spark/v3.2/spark/src/main/java/org/apache/iceberg/spark/SparkCachedTableCatalog.java b/spark/v3.2/spark/src/main/java/org/apache/iceberg/spark/SparkCachedTableCatalog.java index ff4081764bed..41b3fc7aa99d 100644 --- a/spark/v3.2/spark/src/main/java/org/apache/iceberg/spark/SparkCachedTableCatalog.java +++ b/spark/v3.2/spark/src/main/java/org/apache/iceberg/spark/SparkCachedTableCatalog.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark; import java.util.List; @@ -39,9 +38,7 @@ import org.apache.spark.sql.types.StructType; import org.apache.spark.sql.util.CaseInsensitiveStringMap; -/** - * An internal table catalog that is capable of loading tables from a cache. - */ +/** An internal table catalog that is capable of loading tables from a cache. */ public class SparkCachedTableCatalog implements TableCatalog { private static final String CLASS_NAME = SparkCachedTableCatalog.class.getName(); @@ -70,8 +67,9 @@ public void invalidateTable(Identifier ident) { } @Override - public SparkTable createTable(Identifier ident, StructType schema, Transform[] partitions, - Map properties) throws TableAlreadyExistsException { + public SparkTable createTable( + Identifier ident, StructType schema, Transform[] partitions, Map properties) + throws TableAlreadyExistsException { throw new UnsupportedOperationException(CLASS_NAME + " does not support creating tables"); } @@ -106,7 +104,8 @@ public String name() { } private Pair load(Identifier ident) throws NoSuchTableException { - Preconditions.checkArgument(ident.namespace().length == 0, CLASS_NAME + " does not support namespaces"); + Preconditions.checkArgument( + ident.namespace().length == 0, CLASS_NAME + " does not support namespaces"); Pair> parsedIdent = parseIdent(ident); String key = parsedIdent.first(); @@ -127,8 +126,10 @@ private Pair load(Identifier ident) throws NoSuchTableException { } } - Preconditions.checkArgument(asOfTimestamp == null || snapshotId == null, - "Cannot specify both snapshot and timestamp for time travel: %s", ident); + Preconditions.checkArgument( + asOfTimestamp == null || snapshotId == null, + "Cannot specify both snapshot and timestamp for time travel: %s", + ident); Table table = TABLE_CACHE.get(key); diff --git a/spark/v3.2/spark/src/main/java/org/apache/iceberg/spark/SparkCatalog.java b/spark/v3.2/spark/src/main/java/org/apache/iceberg/spark/SparkCatalog.java index 95aecab0ff64..835a395f6e0e 100644 --- a/spark/v3.2/spark/src/main/java/org/apache/iceberg/spark/SparkCatalog.java +++ b/spark/v3.2/spark/src/main/java/org/apache/iceberg/spark/SparkCatalog.java @@ -16,9 +16,11 @@ * specific language governing permissions and limitations * under the License. 
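The SparkCachedTableCatalog precondition above (either a snapshot id or a timestamp, never both) is the same rule users hit when time traveling through the regular read path. A short sketch using the documented DataFrame read options; the table name is a placeholder, and snapshot-id / as-of-timestamp are the standard Iceberg read option keys.

import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SparkSession;

public class TimeTravelSketch {
  // Reads the table as of a specific snapshot; setting both snapshot-id and
  // as-of-timestamp on the same read is rejected, mirroring the check above.
  static Dataset<Row> readAtSnapshot(SparkSession spark, long snapshotId) {
    return spark
        .read()
        .format("iceberg")
        .option("snapshot-id", snapshotId)
        .load("demo.db.events");
  }
}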
*/ - package org.apache.iceberg.spark; +import static org.apache.iceberg.TableProperties.GC_ENABLED; +import static org.apache.iceberg.TableProperties.GC_ENABLED_DEFAULT; + import java.util.Arrays; import java.util.List; import java.util.Map; @@ -74,26 +76,24 @@ import org.apache.spark.sql.types.StructType; import org.apache.spark.sql.util.CaseInsensitiveStringMap; -import static org.apache.iceberg.TableProperties.GC_ENABLED; -import static org.apache.iceberg.TableProperties.GC_ENABLED_DEFAULT; - /** * A Spark TableCatalog implementation that wraps an Iceberg {@link Catalog}. - *

- * This supports the following catalog configuration options:
+ *
+ * <p>This supports the following catalog configuration options:
+ *
 * <ul>
- *   <li>type - catalog type, "hive" or "hadoop".
- * To specify a non-hive or hadoop catalog, use the catalog-impl option.
- *   <li>uri - the Hive Metastore URI (Hive catalog only)
- *   <li>warehouse - the warehouse path (Hadoop catalog only)
- *   <li>catalog-impl - a custom {@link Catalog} implementation to use
- *   <li>default-namespace - a namespace to use as the default
- *   <li>cache-enabled - whether to enable catalog cache
- *   <li>cache.expiration-interval-ms - interval in millis before expiring tables from catalog cache.
- * Refer to {@link CatalogProperties#CACHE_EXPIRATION_INTERVAL_MS} for further details and significant values.
+ *   <li>type - catalog type, "hive" or "hadoop". To specify a non-hive or hadoop
+ * catalog, use the catalog-impl option.
+ *   <li>uri - the Hive Metastore URI (Hive catalog only)
+ *   <li>warehouse - the warehouse path (Hadoop catalog only)
+ *   <li>catalog-impl - a custom {@link Catalog} implementation to use
+ *   <li>default-namespace - a namespace to use as the default
+ *   <li>cache-enabled - whether to enable catalog cache
+ *   <li>cache.expiration-interval-ms - interval in millis before expiring tables from
+ * catalog cache. Refer to {@link CatalogProperties#CACHE_EXPIRATION_INTERVAL_MS} for further
+ * details and significant values.
 * </ul>
+ *

    */ public class SparkCatalog extends BaseCatalog { @@ -147,17 +147,18 @@ public SparkTable loadTable(Identifier ident) throws NoSuchTableException { } @Override - public SparkTable createTable(Identifier ident, StructType schema, - Transform[] transforms, - Map properties) throws TableAlreadyExistsException { + public SparkTable createTable( + Identifier ident, StructType schema, Transform[] transforms, Map properties) + throws TableAlreadyExistsException { Schema icebergSchema = SparkSchemaUtil.convert(schema, useTimestampsWithoutZone); try { Catalog.TableBuilder builder = newBuilder(ident, icebergSchema); - Table icebergTable = builder - .withPartitionSpec(Spark3Util.toPartitionSpec(icebergSchema, transforms)) - .withLocation(properties.get("location")) - .withProperties(Spark3Util.rebuildCreateProperties(properties)) - .create(); + Table icebergTable = + builder + .withPartitionSpec(Spark3Util.toPartitionSpec(icebergSchema, transforms)) + .withLocation(properties.get("location")) + .withProperties(Spark3Util.rebuildCreateProperties(properties)) + .create(); return new SparkTable(icebergTable, !cacheEnabled); } catch (AlreadyExistsException e) { throw new TableAlreadyExistsException(ident); @@ -165,15 +166,18 @@ public SparkTable createTable(Identifier ident, StructType schema, } @Override - public StagedTable stageCreate(Identifier ident, StructType schema, Transform[] transforms, - Map properties) throws TableAlreadyExistsException { + public StagedTable stageCreate( + Identifier ident, StructType schema, Transform[] transforms, Map properties) + throws TableAlreadyExistsException { Schema icebergSchema = SparkSchemaUtil.convert(schema, useTimestampsWithoutZone); try { Catalog.TableBuilder builder = newBuilder(ident, icebergSchema); - Transaction transaction = builder.withPartitionSpec(Spark3Util.toPartitionSpec(icebergSchema, transforms)) - .withLocation(properties.get("location")) - .withProperties(Spark3Util.rebuildCreateProperties(properties)) - .createTransaction(); + Transaction transaction = + builder + .withPartitionSpec(Spark3Util.toPartitionSpec(icebergSchema, transforms)) + .withLocation(properties.get("location")) + .withProperties(Spark3Util.rebuildCreateProperties(properties)) + .createTransaction(); return new StagedSparkTable(transaction); } catch (AlreadyExistsException e) { throw new TableAlreadyExistsException(ident); @@ -181,15 +185,18 @@ public StagedTable stageCreate(Identifier ident, StructType schema, Transform[] } @Override - public StagedTable stageReplace(Identifier ident, StructType schema, Transform[] transforms, - Map properties) throws NoSuchTableException { + public StagedTable stageReplace( + Identifier ident, StructType schema, Transform[] transforms, Map properties) + throws NoSuchTableException { Schema icebergSchema = SparkSchemaUtil.convert(schema, useTimestampsWithoutZone); try { Catalog.TableBuilder builder = newBuilder(ident, icebergSchema); - Transaction transaction = builder.withPartitionSpec(Spark3Util.toPartitionSpec(icebergSchema, transforms)) - .withLocation(properties.get("location")) - .withProperties(Spark3Util.rebuildCreateProperties(properties)) - .replaceTransaction(); + Transaction transaction = + builder + .withPartitionSpec(Spark3Util.toPartitionSpec(icebergSchema, transforms)) + .withLocation(properties.get("location")) + .withProperties(Spark3Util.rebuildCreateProperties(properties)) + .replaceTransaction(); return new StagedSparkTable(transaction); } catch (org.apache.iceberg.exceptions.NoSuchTableException e) { throw new 
NoSuchTableException(ident); @@ -197,19 +204,22 @@ public StagedTable stageReplace(Identifier ident, StructType schema, Transform[] } @Override - public StagedTable stageCreateOrReplace(Identifier ident, StructType schema, Transform[] transforms, - Map properties) { + public StagedTable stageCreateOrReplace( + Identifier ident, StructType schema, Transform[] transforms, Map properties) { Schema icebergSchema = SparkSchemaUtil.convert(schema, useTimestampsWithoutZone); Catalog.TableBuilder builder = newBuilder(ident, icebergSchema); - Transaction transaction = builder.withPartitionSpec(Spark3Util.toPartitionSpec(icebergSchema, transforms)) - .withLocation(properties.get("location")) - .withProperties(Spark3Util.rebuildCreateProperties(properties)) - .createOrReplaceTransaction(); + Transaction transaction = + builder + .withPartitionSpec(Spark3Util.toPartitionSpec(icebergSchema, transforms)) + .withLocation(properties.get("location")) + .withProperties(Spark3Util.rebuildCreateProperties(properties)) + .createOrReplaceTransaction(); return new StagedSparkTable(transaction); } @Override - public SparkTable alterTable(Identifier ident, TableChange... changes) throws NoSuchTableException { + public SparkTable alterTable(Identifier ident, TableChange... changes) + throws NoSuchTableException { SetProperty setLocation = null; SetProperty setSnapshotId = null; SetProperty pickSnapshotId = null; @@ -226,8 +236,9 @@ public SparkTable alterTable(Identifier ident, TableChange... changes) throws No } else if ("cherry-pick-snapshot-id".equalsIgnoreCase(set.property())) { pickSnapshotId = set; } else if ("sort-order".equalsIgnoreCase(set.property())) { - throw new UnsupportedOperationException("Cannot specify the 'sort-order' because it's a reserved table " + - "property. Please use the command 'ALTER TABLE ... WRITE ORDERED BY' to specify write sort-orders."); + throw new UnsupportedOperationException( + "Cannot specify the 'sort-order' because it's a reserved table " + + "property. Please use the command 'ALTER TABLE ... WRITE ORDERED BY' to specify write sort-orders."); } else { propertyChanges.add(set); } @@ -242,7 +253,8 @@ public SparkTable alterTable(Identifier ident, TableChange... changes) throws No try { Table table = load(ident).first(); - commitChanges(table, setLocation, setSnapshotId, pickSnapshotId, propertyChanges, schemaChanges); + commitChanges( + table, setLocation, setSnapshotId, pickSnapshotId, propertyChanges, schemaChanges); return new SparkTable(table, true /* refreshEagerly */); } catch (org.apache.iceberg.exceptions.NoSuchTableException e) { throw new NoSuchTableException(ident); @@ -261,20 +273,19 @@ public boolean purgeTable(Identifier ident) { ValidationException.check( PropertyUtil.propertyAsBoolean(table.properties(), GC_ENABLED, GC_ENABLED_DEFAULT), "Cannot purge table: GC is disabled (deleting files may corrupt other tables)"); - String metadataFileLocation = ((HasTableOperations) table).operations().current().metadataFileLocation(); + String metadataFileLocation = + ((HasTableOperations) table).operations().current().metadataFileLocation(); boolean dropped = dropTableWithoutPurging(ident); if (dropped) { - // We should check whether the metadata file exists. Because the HadoopCatalog/HadoopTables will drop the + // We should check whether the metadata file exists. Because the HadoopCatalog/HadoopTables + // will drop the // warehouse directly and ignore the `purge` argument. 
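A minimal sketch of wiring up the catalog options listed in the SparkCatalog class Javadoc above when building a Spark session; the catalog name "demo", the metastore URI, and the property values are illustrative assumptions, not part of this change:

```java
import org.apache.spark.sql.SparkSession;

public class SparkCatalogConfigSketch {
  public static void main(String[] args) {
    SparkSession spark =
        SparkSession.builder()
            .appName("iceberg-catalog-config")
            // Register an Iceberg SparkCatalog under the (illustrative) name "demo"
            .config("spark.sql.catalog.demo", "org.apache.iceberg.spark.SparkCatalog")
            .config("spark.sql.catalog.demo.type", "hive") // or "hadoop"
            .config("spark.sql.catalog.demo.uri", "thrift://metastore:9083") // Hive catalog only
            .config("spark.sql.catalog.demo.default-namespace", "db")
            .config("spark.sql.catalog.demo.cache-enabled", "true")
            .config("spark.sql.catalog.demo.cache.expiration-interval-ms", "30000")
            .getOrCreate();

    spark.sql("SHOW NAMESPACES IN demo").show();
  }
}
```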
boolean metadataFileExists = table.io().newInputFile(metadataFileLocation).exists(); if (metadataFileExists) { - SparkActions.get() - .deleteReachableFiles(metadataFileLocation) - .io(table.io()) - .execute(); + SparkActions.get().deleteReachableFiles(metadataFileLocation).io(table.io()).execute(); } } @@ -293,7 +304,8 @@ private boolean dropTableWithoutPurging(Identifier ident) { } @Override - public void renameTable(Identifier from, Identifier to) throws NoSuchTableException, TableAlreadyExistsException { + public void renameTable(Identifier from, Identifier to) + throws NoSuchTableException, TableAlreadyExistsException { try { checkNotPathIdentifier(from, "renameTable"); checkNotPathIdentifier(to, "renameTable"); @@ -355,7 +367,8 @@ public String[][] listNamespaces(String[] namespace) throws NoSuchNamespaceExcep } @Override - public Map loadNamespaceMetadata(String[] namespace) throws NoSuchNamespaceException { + public Map loadNamespaceMetadata(String[] namespace) + throws NoSuchNamespaceException { if (asNamespaceCatalog != null) { try { return asNamespaceCatalog.loadNamespaceMetadata(Namespace.of(namespace)); @@ -368,10 +381,12 @@ public Map loadNamespaceMetadata(String[] namespace) throws NoSu } @Override - public void createNamespace(String[] namespace, Map metadata) throws NamespaceAlreadyExistsException { + public void createNamespace(String[] namespace, Map metadata) + throws NamespaceAlreadyExistsException { if (asNamespaceCatalog != null) { try { - if (asNamespaceCatalog instanceof HadoopCatalog && DEFAULT_NS_KEYS.equals(metadata.keySet())) { + if (asNamespaceCatalog instanceof HadoopCatalog + && DEFAULT_NS_KEYS.equals(metadata.keySet())) { // Hadoop catalog will reject metadata properties, but Spark automatically adds "owner". // If only the automatic properties are present, replace metadata with an empty map. asNamespaceCatalog.createNamespace(Namespace.of(namespace), ImmutableMap.of()); @@ -382,12 +397,14 @@ public void createNamespace(String[] namespace, Map metadata) th throw new NamespaceAlreadyExistsException(namespace); } } else { - throw new UnsupportedOperationException("Namespaces are not supported by catalog: " + catalogName); + throw new UnsupportedOperationException( + "Namespaces are not supported by catalog: " + catalogName); } } @Override - public void alterNamespace(String[] namespace, NamespaceChange... changes) throws NoSuchNamespaceException { + public void alterNamespace(String[] namespace, NamespaceChange... changes) + throws NoSuchNamespaceException { if (asNamespaceCatalog != null) { Map updates = Maps.newHashMap(); Set removals = Sets.newHashSet(); @@ -398,7 +415,8 @@ public void alterNamespace(String[] namespace, NamespaceChange... 
changes) throw } else if (change instanceof NamespaceChange.RemoveProperty) { removals.add(((NamespaceChange.RemoveProperty) change).property()); } else { - throw new UnsupportedOperationException("Cannot apply unknown namespace change: " + change); + throw new UnsupportedOperationException( + "Cannot apply unknown namespace change: " + change); } } @@ -434,12 +452,15 @@ public boolean dropNamespace(String[] namespace) throws NoSuchNamespaceException @Override public final void initialize(String name, CaseInsensitiveStringMap options) { - this.cacheEnabled = PropertyUtil.propertyAsBoolean(options, - CatalogProperties.CACHE_ENABLED, CatalogProperties.CACHE_ENABLED_DEFAULT); + this.cacheEnabled = + PropertyUtil.propertyAsBoolean( + options, CatalogProperties.CACHE_ENABLED, CatalogProperties.CACHE_ENABLED_DEFAULT); - long cacheExpirationIntervalMs = PropertyUtil.propertyAsLong(options, - CatalogProperties.CACHE_EXPIRATION_INTERVAL_MS, - CatalogProperties.CACHE_EXPIRATION_INTERVAL_MS_DEFAULT); + long cacheExpirationIntervalMs = + PropertyUtil.propertyAsLong( + options, + CatalogProperties.CACHE_EXPIRATION_INTERVAL_MS, + CatalogProperties.CACHE_EXPIRATION_INTERVAL_MS_DEFAULT); // An expiration interval of 0ms effectively disables caching. // Do not wrap with CachingCatalog. @@ -451,15 +472,17 @@ public final void initialize(String name, CaseInsensitiveStringMap options) { this.catalogName = name; SparkSession sparkSession = SparkSession.active(); - this.useTimestampsWithoutZone = SparkUtil.useTimestampWithoutZoneInNewTables(sparkSession.conf()); - this.tables = new HadoopTables(SparkUtil.hadoopConfCatalogOverrides(SparkSession.active(), name)); - this.icebergCatalog = cacheEnabled ? CachingCatalog.wrap(catalog, cacheExpirationIntervalMs) : catalog; + this.useTimestampsWithoutZone = + SparkUtil.useTimestampWithoutZoneInNewTables(sparkSession.conf()); + this.tables = + new HadoopTables(SparkUtil.hadoopConfCatalogOverrides(SparkSession.active(), name)); + this.icebergCatalog = + cacheEnabled ? 
CachingCatalog.wrap(catalog, cacheExpirationIntervalMs) : catalog; if (catalog instanceof SupportsNamespaces) { this.asNamespaceCatalog = (SupportsNamespaces) catalog; if (options.containsKey("default-namespace")) { - this.defaultNamespace = Splitter.on('.') - .splitToList(options.get("default-namespace")) - .toArray(new String[0]); + this.defaultNamespace = + Splitter.on('.').splitToList(options.get("default-namespace")).toArray(new String[0]); } } } @@ -469,12 +492,18 @@ public String name() { return catalogName; } - private static void commitChanges(Table table, SetProperty setLocation, SetProperty setSnapshotId, - SetProperty pickSnapshotId, List propertyChanges, - List schemaChanges) { - // don't allow setting the snapshot and picking a commit at the same time because order is ambiguous and choosing + private static void commitChanges( + Table table, + SetProperty setLocation, + SetProperty setSnapshotId, + SetProperty pickSnapshotId, + List propertyChanges, + List schemaChanges) { + // don't allow setting the snapshot and picking a commit at the same time because order is + // ambiguous and choosing // one order leads to different results - Preconditions.checkArgument(setSnapshotId == null || pickSnapshotId == null, + Preconditions.checkArgument( + setSnapshotId == null || pickSnapshotId == null, "Cannot set the current the current snapshot ID and cherry-pick snapshot changes"); if (setSnapshotId != null) { @@ -491,9 +520,7 @@ private static void commitChanges(Table table, SetProperty setLocation, SetPrope Transaction transaction = table.newTransaction(); if (setLocation != null) { - transaction.updateLocation() - .setLocation(setLocation.value()) - .commit(); + transaction.updateLocation().setLocation(setLocation.value()).commit(); } if (!propertyChanges.isEmpty()) { @@ -513,8 +540,9 @@ private static boolean isPathIdentifier(Identifier ident) { private static void checkNotPathIdentifier(Identifier identifier, String method) { if (identifier instanceof PathIdentifier) { - throw new IllegalArgumentException(String.format("Cannot pass path based identifier to %s method. %s is a path.", - method, identifier)); + throw new IllegalArgumentException( + String.format( + "Cannot pass path based identifier to %s method. %s is a path.", method, identifier)); } } @@ -531,7 +559,8 @@ private Pair load(Identifier ident) { throw e; } - // if the original load didn't work, the identifier may be extended and include a snapshot selector + // if the original load didn't work, the identifier may be extended and include a snapshot + // selector TableIdentifier namespaceAsIdent = buildIdentifier(namespaceToIdentifier(ident.namespace())); Table table; try { @@ -595,10 +624,13 @@ private Pair loadFromPathIdentifier(PathIdentifier ident) { } } - Preconditions.checkArgument(asOfTimestamp == null || snapshotId == null, - "Cannot specify both snapshot-id and as-of-timestamp: %s", ident.location()); + Preconditions.checkArgument( + asOfTimestamp == null || snapshotId == null, + "Cannot specify both snapshot-id and as-of-timestamp: %s", + ident.location()); - Table table = tables.load(parsed.first() + (metadataTableName != null ? "#" + metadataTableName : "")); + Table table = + tables.load(parsed.first() + (metadataTableName != null ? 
"#" + metadataTableName : "")); if (snapshotId != null) { return Pair.of(table, snapshotId); @@ -610,17 +642,17 @@ private Pair loadFromPathIdentifier(PathIdentifier ident) { } private Identifier namespaceToIdentifier(String[] namespace) { - Preconditions.checkArgument(namespace.length > 0, - "Cannot convert empty namespace to identifier"); + Preconditions.checkArgument( + namespace.length > 0, "Cannot convert empty namespace to identifier"); String[] ns = Arrays.copyOf(namespace, namespace.length - 1); String name = namespace[ns.length]; return Identifier.of(ns, name); } private Catalog.TableBuilder newBuilder(Identifier ident, Schema schema) { - return isPathIdentifier(ident) ? - tables.buildTable(((PathIdentifier) ident).location(), schema) : - icebergCatalog.buildTable(buildIdentifier(ident), schema); + return isPathIdentifier(ident) + ? tables.buildTable(((PathIdentifier) ident).location(), schema) + : icebergCatalog.buildTable(buildIdentifier(ident), schema); } @Override diff --git a/spark/v3.2/spark/src/main/java/org/apache/iceberg/spark/SparkConfParser.java b/spark/v3.2/spark/src/main/java/org/apache/iceberg/spark/SparkConfParser.java index f3d89467fcf7..8242e67da64b 100644 --- a/spark/v3.2/spark/src/main/java/org/apache/iceberg/spark/SparkConfParser.java +++ b/spark/v3.2/spark/src/main/java/org/apache/iceberg/spark/SparkConfParser.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark; import java.util.List; @@ -175,7 +174,8 @@ public ThisT tableProperty(String name) { protected T parse(Function conversion, T defaultValue) { if (!optionNames.isEmpty()) { for (String optionName : optionNames) { - // use lower case comparison as DataSourceOptions.asMap() in Spark 2 returns a lower case map + // use lower case comparison as DataSourceOptions.asMap() in Spark 2 returns a lower case + // map String optionValue = options.get(optionName.toLowerCase(Locale.ROOT)); if (optionValue != null) { return conversion.apply(optionValue); diff --git a/spark/v3.2/spark/src/main/java/org/apache/iceberg/spark/SparkDataFile.java b/spark/v3.2/spark/src/main/java/org/apache/iceberg/spark/SparkDataFile.java index a6390d39c575..87e831872472 100644 --- a/spark/v3.2/spark/src/main/java/org/apache/iceberg/spark/SparkDataFile.java +++ b/spark/v3.2/spark/src/main/java/org/apache/iceberg/spark/SparkDataFile.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark; import java.nio.ByteBuffer; @@ -62,10 +61,12 @@ public SparkDataFile(Types.StructType type, StructType sparkType) { this.wrappedPartition = new SparkStructLike(type.fieldType("partition").asStructType()); Map positions = Maps.newHashMap(); - type.fields().forEach(field -> { - String fieldName = field.name(); - positions.put(fieldName, fieldPosition(fieldName, sparkType)); - }); + type.fields() + .forEach( + field -> { + String fieldName = field.name(); + positions.put(fieldName, fieldPosition(fieldName, sparkType)); + }); filePathPosition = positions.get("file_path"); fileFormatPosition = positions.get("file_format"); @@ -139,23 +140,29 @@ public Map valueCounts() { @Override public Map nullValueCounts() { - return wrapped.isNullAt(nullValueCountsPosition) ? null : wrapped.getJavaMap(nullValueCountsPosition); + return wrapped.isNullAt(nullValueCountsPosition) + ? 
null + : wrapped.getJavaMap(nullValueCountsPosition); } @Override public Map nanValueCounts() { - return wrapped.isNullAt(nanValueCountsPosition) ? null : wrapped.getJavaMap(nanValueCountsPosition); + return wrapped.isNullAt(nanValueCountsPosition) + ? null + : wrapped.getJavaMap(nanValueCountsPosition); } @Override public Map lowerBounds() { - Map lowerBounds = wrapped.isNullAt(lowerBoundsPosition) ? null : wrapped.getJavaMap(lowerBoundsPosition); + Map lowerBounds = + wrapped.isNullAt(lowerBoundsPosition) ? null : wrapped.getJavaMap(lowerBoundsPosition); return convert(lowerBoundsType, lowerBounds); } @Override public Map upperBounds() { - Map upperBounds = wrapped.isNullAt(upperBoundsPosition) ? null : wrapped.getJavaMap(upperBoundsPosition); + Map upperBounds = + wrapped.isNullAt(upperBoundsPosition) ? null : wrapped.getJavaMap(upperBoundsPosition); return convert(upperBoundsType, upperBounds); } diff --git a/spark/v3.2/spark/src/main/java/org/apache/iceberg/spark/SparkDistributionAndOrderingUtil.java b/spark/v3.2/spark/src/main/java/org/apache/iceberg/spark/SparkDistributionAndOrderingUtil.java index 81935b59eebc..c82cbacbd42b 100644 --- a/spark/v3.2/spark/src/main/java/org/apache/iceberg/spark/SparkDistributionAndOrderingUtil.java +++ b/spark/v3.2/spark/src/main/java/org/apache/iceberg/spark/SparkDistributionAndOrderingUtil.java @@ -16,9 +16,11 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark; +import static org.apache.spark.sql.connector.iceberg.write.RowLevelOperation.Command.DELETE; +import static org.apache.spark.sql.connector.iceberg.write.RowLevelOperation.Command.UPDATE; + import java.util.List; import org.apache.iceberg.DistributionMode; import org.apache.iceberg.MetadataColumns; @@ -38,30 +40,33 @@ import org.apache.spark.sql.connector.expressions.SortOrder; import org.apache.spark.sql.connector.iceberg.write.RowLevelOperation.Command; -import static org.apache.spark.sql.connector.iceberg.write.RowLevelOperation.Command.DELETE; -import static org.apache.spark.sql.connector.iceberg.write.RowLevelOperation.Command.UPDATE; - public class SparkDistributionAndOrderingUtil { private static final NamedReference SPEC_ID = Expressions.column(MetadataColumns.SPEC_ID.name()); - private static final NamedReference PARTITION = Expressions.column(MetadataColumns.PARTITION_COLUMN_NAME); - private static final NamedReference FILE_PATH = Expressions.column(MetadataColumns.FILE_PATH.name()); - private static final NamedReference ROW_POSITION = Expressions.column(MetadataColumns.ROW_POSITION.name()); + private static final NamedReference PARTITION = + Expressions.column(MetadataColumns.PARTITION_COLUMN_NAME); + private static final NamedReference FILE_PATH = + Expressions.column(MetadataColumns.FILE_PATH.name()); + private static final NamedReference ROW_POSITION = + Expressions.column(MetadataColumns.ROW_POSITION.name()); private static final SortOrder SPEC_ID_ORDER = Expressions.sort(SPEC_ID, SortDirection.ASCENDING); - private static final SortOrder PARTITION_ORDER = Expressions.sort(PARTITION, SortDirection.ASCENDING); - private static final SortOrder FILE_PATH_ORDER = Expressions.sort(FILE_PATH, SortDirection.ASCENDING); - private static final SortOrder ROW_POSITION_ORDER = Expressions.sort(ROW_POSITION, SortDirection.ASCENDING); - - private static final SortOrder[] EXISTING_FILE_ORDERING = new SortOrder[]{FILE_PATH_ORDER, ROW_POSITION_ORDER}; - private static final SortOrder[] POSITION_DELETE_ORDERING = new 
SortOrder[]{ - SPEC_ID_ORDER, PARTITION_ORDER, FILE_PATH_ORDER, ROW_POSITION_ORDER - }; - - private SparkDistributionAndOrderingUtil() { - } - - public static Distribution buildRequiredDistribution(Table table, DistributionMode distributionMode) { + private static final SortOrder PARTITION_ORDER = + Expressions.sort(PARTITION, SortDirection.ASCENDING); + private static final SortOrder FILE_PATH_ORDER = + Expressions.sort(FILE_PATH, SortDirection.ASCENDING); + private static final SortOrder ROW_POSITION_ORDER = + Expressions.sort(ROW_POSITION, SortDirection.ASCENDING); + + private static final SortOrder[] EXISTING_FILE_ORDERING = + new SortOrder[] {FILE_PATH_ORDER, ROW_POSITION_ORDER}; + private static final SortOrder[] POSITION_DELETE_ORDERING = + new SortOrder[] {SPEC_ID_ORDER, PARTITION_ORDER, FILE_PATH_ORDER, ROW_POSITION_ORDER}; + + private SparkDistributionAndOrderingUtil() {} + + public static Distribution buildRequiredDistribution( + Table table, DistributionMode distributionMode) { switch (distributionMode) { case NONE: return Distributions.unspecified(); @@ -86,8 +91,8 @@ public static SortOrder[] buildRequiredOrdering(Table table, Distribution distri } } - public static Distribution buildCopyOnWriteDistribution(Table table, Command command, - DistributionMode distributionMode) { + public static Distribution buildCopyOnWriteDistribution( + Table table, Command command, DistributionMode distributionMode) { if (command == DELETE || command == UPDATE) { return buildCopyOnWriteDeleteUpdateDistribution(table, distributionMode); } else { @@ -95,13 +100,14 @@ public static Distribution buildCopyOnWriteDistribution(Table table, Command com } } - private static Distribution buildCopyOnWriteDeleteUpdateDistribution(Table table, DistributionMode distributionMode) { + private static Distribution buildCopyOnWriteDeleteUpdateDistribution( + Table table, DistributionMode distributionMode) { switch (distributionMode) { case NONE: return Distributions.unspecified(); case HASH: - Expression[] clustering = new Expression[]{FILE_PATH}; + Expression[] clustering = new Expression[] {FILE_PATH}; return Distributions.clustered(clustering); case RANGE: @@ -109,7 +115,8 @@ private static Distribution buildCopyOnWriteDeleteUpdateDistribution(Table table if (table.sortOrder().isSorted()) { return Distributions.ordered(tableOrdering); } else { - SortOrder[] ordering = ObjectArrays.concat(tableOrdering, EXISTING_FILE_ORDERING, SortOrder.class); + SortOrder[] ordering = + ObjectArrays.concat(tableOrdering, EXISTING_FILE_ORDERING, SortOrder.class); return Distributions.ordered(ordering); } @@ -118,7 +125,8 @@ private static Distribution buildCopyOnWriteDeleteUpdateDistribution(Table table } } - public static SortOrder[] buildCopyOnWriteOrdering(Table table, Command command, Distribution distribution) { + public static SortOrder[] buildCopyOnWriteOrdering( + Table table, Command command, Distribution distribution) { if (command == DELETE || command == UPDATE) { return buildCopyOnWriteDeleteUpdateOrdering(table, distribution); } else { @@ -126,7 +134,8 @@ public static SortOrder[] buildCopyOnWriteOrdering(Table table, Command command, } } - private static SortOrder[] buildCopyOnWriteDeleteUpdateOrdering(Table table, Distribution distribution) { + private static SortOrder[] buildCopyOnWriteDeleteUpdateOrdering( + Table table, Distribution distribution) { if (distribution instanceof UnspecifiedDistribution) { return buildTableOrdering(table); @@ -143,12 +152,13 @@ private static SortOrder[] 
buildCopyOnWriteDeleteUpdateOrdering(Table table, Dis return orderedDistribution.ordering(); } else { - throw new IllegalArgumentException("Unexpected distribution type: " + distribution.getClass().getName()); + throw new IllegalArgumentException( + "Unexpected distribution type: " + distribution.getClass().getName()); } } - public static Distribution buildPositionDeltaDistribution(Table table, Command command, - DistributionMode distributionMode) { + public static Distribution buildPositionDeltaDistribution( + Table table, Command command, DistributionMode distributionMode) { if (command == DELETE || command == UPDATE) { return buildPositionDeleteUpdateDistribution(distributionMode); } else { @@ -156,27 +166,30 @@ public static Distribution buildPositionDeltaDistribution(Table table, Command c } } - private static Distribution buildPositionMergeDistribution(Table table, DistributionMode distributionMode) { + private static Distribution buildPositionMergeDistribution( + Table table, DistributionMode distributionMode) { switch (distributionMode) { case NONE: return Distributions.unspecified(); case HASH: if (table.spec().isUnpartitioned()) { - Expression[] clustering = new Expression[]{SPEC_ID, PARTITION, FILE_PATH}; + Expression[] clustering = new Expression[] {SPEC_ID, PARTITION, FILE_PATH}; return Distributions.clustered(clustering); } else { Distribution dataDistribution = buildRequiredDistribution(table, distributionMode); Expression[] dataClustering = ((ClusteredDistribution) dataDistribution).clustering(); - Expression[] deleteClustering = new Expression[]{SPEC_ID, PARTITION}; - Expression[] clustering = ObjectArrays.concat(deleteClustering, dataClustering, Expression.class); + Expression[] deleteClustering = new Expression[] {SPEC_ID, PARTITION}; + Expression[] clustering = + ObjectArrays.concat(deleteClustering, dataClustering, Expression.class); return Distributions.clustered(clustering); } case RANGE: Distribution dataDistribution = buildRequiredDistribution(table, distributionMode); SortOrder[] dataOrdering = ((OrderedDistribution) dataDistribution).ordering(); - SortOrder[] deleteOrdering = new SortOrder[]{SPEC_ID_ORDER, PARTITION_ORDER, FILE_PATH_ORDER}; + SortOrder[] deleteOrdering = + new SortOrder[] {SPEC_ID_ORDER, PARTITION_ORDER, FILE_PATH_ORDER}; SortOrder[] ordering = ObjectArrays.concat(deleteOrdering, dataOrdering, SortOrder.class); return Distributions.ordered(ordering); @@ -185,17 +198,18 @@ private static Distribution buildPositionMergeDistribution(Table table, Distribu } } - private static Distribution buildPositionDeleteUpdateDistribution(DistributionMode distributionMode) { + private static Distribution buildPositionDeleteUpdateDistribution( + DistributionMode distributionMode) { switch (distributionMode) { case NONE: return Distributions.unspecified(); case HASH: - Expression[] clustering = new Expression[]{SPEC_ID, PARTITION}; + Expression[] clustering = new Expression[] {SPEC_ID, PARTITION}; return Distributions.clustered(clustering); case RANGE: - SortOrder[] ordering = new SortOrder[]{SPEC_ID_ORDER, PARTITION_ORDER, FILE_PATH_ORDER}; + SortOrder[] ordering = new SortOrder[] {SPEC_ID_ORDER, PARTITION_ORDER, FILE_PATH_ORDER}; return Distributions.ordered(ordering); default: @@ -214,7 +228,8 @@ public static SortOrder[] buildPositionDeltaOrdering(Table table, Command comman } public static SortOrder[] convert(org.apache.iceberg.SortOrder sortOrder) { - List converted = SortOrderVisitor.visit(sortOrder, new SortOrderToSpark(sortOrder.schema())); + List 
converted = + SortOrderVisitor.visit(sortOrder, new SortOrderToSpark(sortOrder.schema())); return converted.toArray(new SortOrder[0]); } diff --git a/spark/v3.2/spark/src/main/java/org/apache/iceberg/spark/SparkExceptionUtil.java b/spark/v3.2/spark/src/main/java/org/apache/iceberg/spark/SparkExceptionUtil.java index 2eb53baa688e..5c6fe3e0ff96 100644 --- a/spark/v3.2/spark/src/main/java/org/apache/iceberg/spark/SparkExceptionUtil.java +++ b/spark/v3.2/spark/src/main/java/org/apache/iceberg/spark/SparkExceptionUtil.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark; import com.google.errorprone.annotations.FormatMethod; @@ -29,8 +28,7 @@ public class SparkExceptionUtil { - private SparkExceptionUtil() { - } + private SparkExceptionUtil() {} /** * Converts checked exceptions to unchecked exceptions. @@ -41,8 +39,8 @@ private SparkExceptionUtil() { * @return unchecked exception. */ @FormatMethod - public static RuntimeException toUncheckedException(final Throwable cause, final String message, - final Object... args) { + public static RuntimeException toUncheckedException( + final Throwable cause, final String message, final Object... args) { // Parameters are required to be final to help @FormatMethod do static analysis if (cause instanceof RuntimeException) { return (RuntimeException) cause; diff --git a/spark/v3.2/spark/src/main/java/org/apache/iceberg/spark/SparkFilters.java b/spark/v3.2/spark/src/main/java/org/apache/iceberg/spark/SparkFilters.java index 50c108c0b01b..c8dd54954fd6 100644 --- a/spark/v3.2/spark/src/main/java/org/apache/iceberg/spark/SparkFilters.java +++ b/spark/v3.2/spark/src/main/java/org/apache/iceberg/spark/SparkFilters.java @@ -16,9 +16,23 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.spark; +import static org.apache.iceberg.expressions.Expressions.and; +import static org.apache.iceberg.expressions.Expressions.equal; +import static org.apache.iceberg.expressions.Expressions.greaterThan; +import static org.apache.iceberg.expressions.Expressions.greaterThanOrEqual; +import static org.apache.iceberg.expressions.Expressions.in; +import static org.apache.iceberg.expressions.Expressions.isNaN; +import static org.apache.iceberg.expressions.Expressions.isNull; +import static org.apache.iceberg.expressions.Expressions.lessThan; +import static org.apache.iceberg.expressions.Expressions.lessThanOrEqual; +import static org.apache.iceberg.expressions.Expressions.not; +import static org.apache.iceberg.expressions.Expressions.notIn; +import static org.apache.iceberg.expressions.Expressions.notNull; +import static org.apache.iceberg.expressions.Expressions.or; +import static org.apache.iceberg.expressions.Expressions.startsWith; + import java.sql.Date; import java.sql.Timestamp; import java.time.Instant; @@ -55,54 +69,39 @@ import org.apache.spark.sql.sources.Or; import org.apache.spark.sql.sources.StringStartsWith; -import static org.apache.iceberg.expressions.Expressions.and; -import static org.apache.iceberg.expressions.Expressions.equal; -import static org.apache.iceberg.expressions.Expressions.greaterThan; -import static org.apache.iceberg.expressions.Expressions.greaterThanOrEqual; -import static org.apache.iceberg.expressions.Expressions.in; -import static org.apache.iceberg.expressions.Expressions.isNaN; -import static org.apache.iceberg.expressions.Expressions.isNull; -import static org.apache.iceberg.expressions.Expressions.lessThan; -import static org.apache.iceberg.expressions.Expressions.lessThanOrEqual; -import static org.apache.iceberg.expressions.Expressions.not; -import static org.apache.iceberg.expressions.Expressions.notIn; -import static org.apache.iceberg.expressions.Expressions.notNull; -import static org.apache.iceberg.expressions.Expressions.or; -import static org.apache.iceberg.expressions.Expressions.startsWith; - public class SparkFilters { private static final Pattern BACKTICKS_PATTERN = Pattern.compile("([`])(.|$)"); - private SparkFilters() { - } + private SparkFilters() {} - private static final Map, Operation> FILTERS = ImmutableMap - ., Operation>builder() - .put(AlwaysTrue.class, Operation.TRUE) - .put(AlwaysTrue$.class, Operation.TRUE) - .put(AlwaysFalse$.class, Operation.FALSE) - .put(AlwaysFalse.class, Operation.FALSE) - .put(EqualTo.class, Operation.EQ) - .put(EqualNullSafe.class, Operation.EQ) - .put(GreaterThan.class, Operation.GT) - .put(GreaterThanOrEqual.class, Operation.GT_EQ) - .put(LessThan.class, Operation.LT) - .put(LessThanOrEqual.class, Operation.LT_EQ) - .put(In.class, Operation.IN) - .put(IsNull.class, Operation.IS_NULL) - .put(IsNotNull.class, Operation.NOT_NULL) - .put(And.class, Operation.AND) - .put(Or.class, Operation.OR) - .put(Not.class, Operation.NOT) - .put(StringStartsWith.class, Operation.STARTS_WITH) - .build(); + private static final Map, Operation> FILTERS = + ImmutableMap., Operation>builder() + .put(AlwaysTrue.class, Operation.TRUE) + .put(AlwaysTrue$.class, Operation.TRUE) + .put(AlwaysFalse$.class, Operation.FALSE) + .put(AlwaysFalse.class, Operation.FALSE) + .put(EqualTo.class, Operation.EQ) + .put(EqualNullSafe.class, Operation.EQ) + .put(GreaterThan.class, Operation.GT) + .put(GreaterThanOrEqual.class, Operation.GT_EQ) + .put(LessThan.class, Operation.LT) + 
.put(LessThanOrEqual.class, Operation.LT_EQ) + .put(In.class, Operation.IN) + .put(IsNull.class, Operation.IS_NULL) + .put(IsNotNull.class, Operation.NOT_NULL) + .put(And.class, Operation.AND) + .put(Or.class, Operation.OR) + .put(Not.class, Operation.NOT) + .put(StringStartsWith.class, Operation.STARTS_WITH) + .build(); public static Expression convert(Filter[] filters) { Expression expression = Expressions.alwaysTrue(); for (Filter filter : filters) { Expression converted = convert(filter); - Preconditions.checkArgument(converted != null, "Cannot convert filter to Iceberg: %s", filter); + Preconditions.checkArgument( + converted != null, "Cannot convert filter to Iceberg: %s", filter); expression = Expressions.and(expression, converted); } return expression; @@ -147,8 +146,8 @@ public static Expression convert(Filter filter) { if (filter instanceof EqualTo) { EqualTo eq = (EqualTo) filter; // comparison with null in normal equality is always null. this is probably a mistake. - Preconditions.checkNotNull(eq.value(), - "Expression is always false (eq is not null-safe): %s", filter); + Preconditions.checkNotNull( + eq.value(), "Expression is always false (eq is not null-safe): %s", filter); return handleEqual(unquote(eq.attribute()), eq.value()); } else { EqualNullSafe eq = (EqualNullSafe) filter; @@ -161,7 +160,8 @@ public static Expression convert(Filter filter) { case IN: In inFilter = (In) filter; - return in(unquote(inFilter.attribute()), + return in( + unquote(inFilter.attribute()), Stream.of(inFilter.values()) .filter(Objects::nonNull) .map(SparkFilters::convertLiteral) @@ -174,12 +174,15 @@ public static Expression convert(Filter filter) { if (childOp == Operation.IN) { // infer an extra notNull predicate for Spark NOT IN filters // as Iceberg expressions don't follow the 3-value SQL boolean logic - // col NOT IN (1, 2) in Spark is equivalent to notNull(col) && notIn(col, 1, 2) in Iceberg + // col NOT IN (1, 2) in Spark is equivalent to notNull(col) && notIn(col, 1, 2) in + // Iceberg In childInFilter = (In) childFilter; - Expression notIn = notIn(unquote(childInFilter.attribute()), - Stream.of(childInFilter.values()) - .map(SparkFilters::convertLiteral) - .collect(Collectors.toList())); + Expression notIn = + notIn( + unquote(childInFilter.attribute()), + Stream.of(childInFilter.values()) + .map(SparkFilters::convertLiteral) + .collect(Collectors.toList())); return and(notNull(childInFilter.attribute()), notIn); } else if (hasNoInFilter(childFilter)) { Expression child = convert(childFilter); @@ -189,30 +192,33 @@ public static Expression convert(Filter filter) { } return null; - case AND: { - And andFilter = (And) filter; - Expression left = convert(andFilter.left()); - Expression right = convert(andFilter.right()); - if (left != null && right != null) { - return and(left, right); + case AND: + { + And andFilter = (And) filter; + Expression left = convert(andFilter.left()); + Expression right = convert(andFilter.right()); + if (left != null && right != null) { + return and(left, right); + } + return null; } - return null; - } - case OR: { - Or orFilter = (Or) filter; - Expression left = convert(orFilter.left()); - Expression right = convert(orFilter.right()); - if (left != null && right != null) { - return or(left, right); + case OR: + { + Or orFilter = (Or) filter; + Expression left = convert(orFilter.left()); + Expression right = convert(orFilter.right()); + if (left != null && right != null) { + return or(left, right); + } + return null; } - return null; - } - case STARTS_WITH: 
{ - StringStartsWith stringStartsWith = (StringStartsWith) filter; - return startsWith(unquote(stringStartsWith.attribute()), stringStartsWith.value()); - } + case STARTS_WITH: + { + StringStartsWith stringStartsWith = (StringStartsWith) filter; + return startsWith(unquote(stringStartsWith.attribute()), stringStartsWith.value()); + } } } diff --git a/spark/v3.2/spark/src/main/java/org/apache/iceberg/spark/SparkFixupTimestampType.java b/spark/v3.2/spark/src/main/java/org/apache/iceberg/spark/SparkFixupTimestampType.java index d4dd53d34a97..b35213501aef 100644 --- a/spark/v3.2/spark/src/main/java/org/apache/iceberg/spark/SparkFixupTimestampType.java +++ b/spark/v3.2/spark/src/main/java/org/apache/iceberg/spark/SparkFixupTimestampType.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark; import org.apache.iceberg.Schema; @@ -27,9 +26,10 @@ /** * By default Spark type {@link org.apache.iceberg.types.Types.TimestampType} should be converted to - * {@link Types.TimestampType#withZone()} iceberg type. But we also can convert - * {@link org.apache.iceberg.types.Types.TimestampType} to {@link Types.TimestampType#withoutZone()} iceberg type - * by setting {@link SparkSQLProperties#USE_TIMESTAMP_WITHOUT_TIME_ZONE_IN_NEW_TABLES} to 'true' + * {@link Types.TimestampType#withZone()} iceberg type. But we also can convert {@link + * org.apache.iceberg.types.Types.TimestampType} to {@link Types.TimestampType#withoutZone()} + * iceberg type by setting {@link SparkSQLProperties#USE_TIMESTAMP_WITHOUT_TIME_ZONE_IN_NEW_TABLES} + * to 'true' */ class SparkFixupTimestampType extends FixupTypes { @@ -38,8 +38,8 @@ private SparkFixupTimestampType(Schema referenceSchema) { } static Schema fixup(Schema schema) { - return new Schema(TypeUtil.visit(schema, - new SparkFixupTimestampType(schema)).asStructType().fields()); + return new Schema( + TypeUtil.visit(schema, new SparkFixupTimestampType(schema)).asStructType().fields()); } @Override diff --git a/spark/v3.2/spark/src/main/java/org/apache/iceberg/spark/SparkFixupTypes.java b/spark/v3.2/spark/src/main/java/org/apache/iceberg/spark/SparkFixupTypes.java index 5508965af249..6c4ec39b20f1 100644 --- a/spark/v3.2/spark/src/main/java/org/apache/iceberg/spark/SparkFixupTypes.java +++ b/spark/v3.2/spark/src/main/java/org/apache/iceberg/spark/SparkFixupTypes.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark; import org.apache.iceberg.Schema; @@ -25,8 +24,8 @@ import org.apache.iceberg.types.TypeUtil; /** - * Some types, like binary and fixed, are converted to the same Spark type. Conversion back - * can produce only one, which may not be correct. + * Some types, like binary and fixed, are converted to the same Spark type. Conversion back can + * produce only one, which may not be correct. 
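The SparkFixupTimestampType Javadoc above mentions storing new-table timestamps without a zone by enabling SparkSQLProperties#USE_TIMESTAMP_WITHOUT_TIME_ZONE_IN_NEW_TABLES. A minimal sketch of toggling that session property (the catalog, table name, and schema in the SQL statement are illustrative):

```java
import org.apache.iceberg.spark.SparkSQLProperties;
import org.apache.spark.sql.SparkSession;

public class TimestampWithoutZoneSketch {
  public static void main(String[] args) {
    SparkSession spark = SparkSession.builder().appName("ts-without-zone-sketch").getOrCreate();

    // New Iceberg tables created after this point store Spark TimestampType columns
    // as timestamp without zone instead of timestamp with zone.
    spark.conf().set(SparkSQLProperties.USE_TIMESTAMP_WITHOUT_TIME_ZONE_IN_NEW_TABLES, "true");

    spark.sql("CREATE TABLE demo.db.events (id bigint, ts timestamp) USING iceberg");
  }
}
```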
*/ class SparkFixupTypes extends FixupTypes { @@ -35,8 +34,8 @@ private SparkFixupTypes(Schema referenceSchema) { } static Schema fixup(Schema schema, Schema referenceSchema) { - return new Schema(TypeUtil.visit(schema, - new SparkFixupTypes(referenceSchema)).asStructType().fields()); + return new Schema( + TypeUtil.visit(schema, new SparkFixupTypes(referenceSchema)).asStructType().fields()); } @Override diff --git a/spark/v3.2/spark/src/main/java/org/apache/iceberg/spark/SparkReadConf.java b/spark/v3.2/spark/src/main/java/org/apache/iceberg/spark/SparkReadConf.java index c7c01758c3ee..ef262e11f02b 100644 --- a/spark/v3.2/spark/src/main/java/org/apache/iceberg/spark/SparkReadConf.java +++ b/spark/v3.2/spark/src/main/java/org/apache/iceberg/spark/SparkReadConf.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark; import java.util.Map; @@ -31,18 +30,21 @@ /** * A class for common Iceberg configs for Spark reads. - *

<p>
- * If a config is set at multiple levels, the following order of precedence is used (top to bottom):
+ *
+ * <p>If a config is set at multiple levels, the following order of precedence is used (top to
+ * bottom):
+ *
  * <ol>
- *   <li>Read options
- *   <li>Session configuration
- *   <li>Table metadata
+ *   <li>Read options
+ *   <li>Session configuration
+ *   <li>Table metadata
  * </ol>
- * The most specific value is set in read options and takes precedence over all other configs.
- * If no read option is provided, this class checks the session configuration for any overrides.
- * If no applicable value is found in the session configuration, this class uses the table metadata.
- * <p>
- * Note this class is NOT meant to be serialized and sent to executors.
+ *
+ * The most specific value is set in read options and takes precedence over all other configs. If no
+ * read option is provided, this class checks the session configuration for any overrides. If no
+ * applicable value is found in the session configuration, this class uses the table metadata.
+ *
+ * <p>
    Note this class is NOT meant to be serialized and sent to executors. */ public class SparkReadConf { @@ -70,61 +72,51 @@ public boolean localityEnabled() { if (file instanceof HadoopInputFile) { String scheme = ((HadoopInputFile) file).getFileSystem().getScheme(); boolean defaultValue = LOCALITY_WHITELIST_FS.contains(scheme); - return PropertyUtil.propertyAsBoolean( - readOptions, - SparkReadOptions.LOCALITY, - defaultValue); + return PropertyUtil.propertyAsBoolean(readOptions, SparkReadOptions.LOCALITY, defaultValue); } return false; } public Long snapshotId() { - return confParser.longConf() - .option(SparkReadOptions.SNAPSHOT_ID) - .parseOptional(); + return confParser.longConf().option(SparkReadOptions.SNAPSHOT_ID).parseOptional(); } public Long asOfTimestamp() { - return confParser.longConf() - .option(SparkReadOptions.AS_OF_TIMESTAMP) - .parseOptional(); + return confParser.longConf().option(SparkReadOptions.AS_OF_TIMESTAMP).parseOptional(); } public Long startSnapshotId() { - return confParser.longConf() - .option(SparkReadOptions.START_SNAPSHOT_ID) - .parseOptional(); + return confParser.longConf().option(SparkReadOptions.START_SNAPSHOT_ID).parseOptional(); } public Long endSnapshotId() { - return confParser.longConf() - .option(SparkReadOptions.END_SNAPSHOT_ID) - .parseOptional(); + return confParser.longConf().option(SparkReadOptions.END_SNAPSHOT_ID).parseOptional(); } public String fileScanTaskSetId() { - return confParser.stringConf() - .option(SparkReadOptions.FILE_SCAN_TASK_SET_ID) - .parseOptional(); + return confParser.stringConf().option(SparkReadOptions.FILE_SCAN_TASK_SET_ID).parseOptional(); } public boolean streamingSkipDeleteSnapshots() { - return confParser.booleanConf() + return confParser + .booleanConf() .option(SparkReadOptions.STREAMING_SKIP_DELETE_SNAPSHOTS) .defaultValue(SparkReadOptions.STREAMING_SKIP_DELETE_SNAPSHOTS_DEFAULT) .parse(); } public boolean streamingSkipOverwriteSnapshots() { - return confParser.booleanConf() + return confParser + .booleanConf() .option(SparkReadOptions.STREAMING_SKIP_OVERWRITE_SNAPSHOTS) .defaultValue(SparkReadOptions.STREAMING_SKIP_OVERWRITE_SNAPSHOTS_DEFAULT) .parse(); } public boolean parquetVectorizationEnabled() { - return confParser.booleanConf() + return confParser + .booleanConf() .option(SparkReadOptions.VECTORIZATION_ENABLED) .sessionConf(SparkSQLProperties.VECTORIZATION_ENABLED) .tableProperty(TableProperties.PARQUET_VECTORIZATION_ENABLED) @@ -133,7 +125,8 @@ public boolean parquetVectorizationEnabled() { } public int parquetBatchSize() { - return confParser.intConf() + return confParser + .intConf() .option(SparkReadOptions.VECTORIZATION_BATCH_SIZE) .tableProperty(TableProperties.PARQUET_BATCH_SIZE) .defaultValue(TableProperties.PARQUET_BATCH_SIZE_DEFAULT) @@ -141,7 +134,8 @@ public int parquetBatchSize() { } public boolean orcVectorizationEnabled() { - return confParser.booleanConf() + return confParser + .booleanConf() .option(SparkReadOptions.VECTORIZATION_ENABLED) .sessionConf(SparkSQLProperties.VECTORIZATION_ENABLED) .tableProperty(TableProperties.ORC_VECTORIZATION_ENABLED) @@ -150,7 +144,8 @@ public boolean orcVectorizationEnabled() { } public int orcBatchSize() { - return confParser.intConf() + return confParser + .intConf() .option(SparkReadOptions.VECTORIZATION_BATCH_SIZE) .tableProperty(TableProperties.ORC_BATCH_SIZE) .defaultValue(TableProperties.ORC_BATCH_SIZE_DEFAULT) @@ -158,13 +153,12 @@ public int orcBatchSize() { } public Long splitSizeOption() { - return confParser.longConf() - 
.option(SparkReadOptions.SPLIT_SIZE) - .parseOptional(); + return confParser.longConf().option(SparkReadOptions.SPLIT_SIZE).parseOptional(); } public long splitSize() { - return confParser.longConf() + return confParser + .longConf() .option(SparkReadOptions.SPLIT_SIZE) .tableProperty(TableProperties.SPLIT_SIZE) .defaultValue(TableProperties.SPLIT_SIZE_DEFAULT) @@ -172,13 +166,12 @@ public long splitSize() { } public Integer splitLookbackOption() { - return confParser.intConf() - .option(SparkReadOptions.LOOKBACK) - .parseOptional(); + return confParser.intConf().option(SparkReadOptions.LOOKBACK).parseOptional(); } public int splitLookback() { - return confParser.intConf() + return confParser + .intConf() .option(SparkReadOptions.LOOKBACK) .tableProperty(TableProperties.SPLIT_LOOKBACK) .defaultValue(TableProperties.SPLIT_LOOKBACK_DEFAULT) @@ -186,13 +179,12 @@ public int splitLookback() { } public Long splitOpenFileCostOption() { - return confParser.longConf() - .option(SparkReadOptions.FILE_OPEN_COST) - .parseOptional(); + return confParser.longConf().option(SparkReadOptions.FILE_OPEN_COST).parseOptional(); } public long splitOpenFileCost() { - return confParser.longConf() + return confParser + .longConf() .option(SparkReadOptions.FILE_OPEN_COST) .tableProperty(TableProperties.SPLIT_OPEN_FILE_COST) .defaultValue(TableProperties.SPLIT_OPEN_FILE_COST_DEFAULT) @@ -201,18 +193,20 @@ public long splitOpenFileCost() { /** * Enables reading a timestamp without time zone as a timestamp with time zone. - *

<p>
- * Generally, this is not safe as a timestamp without time zone is supposed to represent the wall-clock time,
- * i.e. no matter the reader/writer timezone 3PM should always be read as 3PM,
- * but a timestamp with time zone represents instant semantics, i.e. the timestamp
- * is adjusted so that the corresponding time in the reader timezone is displayed.
- * <p>
- * When set to false (default), an exception must be thrown while reading a timestamp without time zone.
+ *
+ * <p>Generally, this is not safe as a timestamp without time zone is supposed to represent the
+ * wall-clock time, i.e. no matter the reader/writer timezone 3PM should always be read as 3PM,
+ * but a timestamp with time zone represents instant semantics, i.e. the timestamp is adjusted so
+ * that the corresponding time in the reader timezone is displayed.
+ *
+ * <p>
    When set to false (default), an exception must be thrown while reading a timestamp without + * time zone. * * @return boolean indicating if reading timestamps without timezone is allowed */ public boolean handleTimestampWithoutZone() { - return confParser.booleanConf() + return confParser + .booleanConf() .option(SparkReadOptions.HANDLE_TIMESTAMP_WITHOUT_TIMEZONE) .sessionConf(SparkSQLProperties.HANDLE_TIMESTAMP_WITHOUT_TIMEZONE) .defaultValue(SparkSQLProperties.HANDLE_TIMESTAMP_WITHOUT_TIMEZONE_DEFAULT) @@ -220,7 +214,8 @@ public boolean handleTimestampWithoutZone() { } public Long streamFromTimestamp() { - return confParser.longConf() + return confParser + .longConf() .option(SparkReadOptions.STREAM_FROM_TIMESTAMP) .defaultValue(Long.MIN_VALUE) .parse(); diff --git a/spark/v3.2/spark/src/main/java/org/apache/iceberg/spark/SparkReadOptions.java b/spark/v3.2/spark/src/main/java/org/apache/iceberg/spark/SparkReadOptions.java index edcc2300344a..d13e80d40004 100644 --- a/spark/v3.2/spark/src/main/java/org/apache/iceberg/spark/SparkReadOptions.java +++ b/spark/v3.2/spark/src/main/java/org/apache/iceberg/spark/SparkReadOptions.java @@ -16,16 +16,12 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark; -/** - * Spark DF read options - */ +/** Spark DF read options */ public class SparkReadOptions { - private SparkReadOptions() { - } + private SparkReadOptions() {} // Snapshot ID of the table snapshot to read public static final String SNAPSHOT_ID = "snapshot-id"; @@ -62,11 +58,13 @@ private SparkReadOptions() { public static final boolean STREAMING_SKIP_DELETE_SNAPSHOTS_DEFAULT = false; // skip snapshots of type overwrite while reading stream out of iceberg table - public static final String STREAMING_SKIP_OVERWRITE_SNAPSHOTS = "streaming-skip-overwrite-snapshots"; + public static final String STREAMING_SKIP_OVERWRITE_SNAPSHOTS = + "streaming-skip-overwrite-snapshots"; public static final boolean STREAMING_SKIP_OVERWRITE_SNAPSHOTS_DEFAULT = false; // Controls whether to allow reading timestamps without zone info - public static final String HANDLE_TIMESTAMP_WITHOUT_TIMEZONE = "handle-timestamp-without-timezone"; + public static final String HANDLE_TIMESTAMP_WITHOUT_TIMEZONE = + "handle-timestamp-without-timezone"; // Controls whether to report locality information to Spark while allocating input partitions public static final String LOCALITY = "locality"; diff --git a/spark/v3.2/spark/src/main/java/org/apache/iceberg/spark/SparkSQLProperties.java b/spark/v3.2/spark/src/main/java/org/apache/iceberg/spark/SparkSQLProperties.java index f2dcc13bece0..fa8bd719f391 100644 --- a/spark/v3.2/spark/src/main/java/org/apache/iceberg/spark/SparkSQLProperties.java +++ b/spark/v3.2/spark/src/main/java/org/apache/iceberg/spark/SparkSQLProperties.java @@ -16,19 +16,18 @@ * specific language governing permissions and limitations * under the License. 
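A minimal sketch of the precedence order described in the SparkReadConf Javadoc above, using the handle-timestamp-without-timezone keys that appear in this diff; the per-read option is the most specific level and overrides the session configuration, with table metadata consulted last (the table name is illustrative):

```java
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SparkSession;

public class ReadConfPrecedenceSketch {
  public static void main(String[] args) {
    SparkSession spark = SparkSession.builder().appName("read-conf-precedence").getOrCreate();

    // Session-level setting: the middle level of the precedence chain.
    spark.conf().set("spark.sql.iceberg.handle-timestamp-without-timezone", "false");

    // The per-read option is the most specific level and wins over the session conf;
    // table metadata is only consulted when neither of the above is set.
    Dataset<Row> df =
        spark.read()
            .format("iceberg")
            .option("handle-timestamp-without-timezone", "true")
            .load("db.events");

    df.show();
  }
}
```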
*/ - package org.apache.iceberg.spark; public class SparkSQLProperties { - private SparkSQLProperties() { - } + private SparkSQLProperties() {} // Controls whether vectorized reads are enabled public static final String VECTORIZATION_ENABLED = "spark.sql.iceberg.vectorization.enabled"; // Controls whether reading/writing timestamps without timezones is allowed - public static final String HANDLE_TIMESTAMP_WITHOUT_TIMEZONE = "spark.sql.iceberg.handle-timestamp-without-timezone"; + public static final String HANDLE_TIMESTAMP_WITHOUT_TIMEZONE = + "spark.sql.iceberg.handle-timestamp-without-timezone"; public static final boolean HANDLE_TIMESTAMP_WITHOUT_TIMEZONE_DEFAULT = false; // Controls whether timestamp types for new tables should be stored with timezone info diff --git a/spark/v3.2/spark/src/main/java/org/apache/iceberg/spark/SparkSchemaUtil.java b/spark/v3.2/spark/src/main/java/org/apache/iceberg/spark/SparkSchemaUtil.java index c242f8535206..822a5cc97ea2 100644 --- a/spark/v3.2/spark/src/main/java/org/apache/iceberg/spark/SparkSchemaUtil.java +++ b/spark/v3.2/spark/src/main/java/org/apache/iceberg/spark/SparkSchemaUtil.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark; import java.util.Collection; @@ -45,17 +44,14 @@ import org.apache.spark.sql.types.DataType; import org.apache.spark.sql.types.StructType; -/** - * Helper methods for working with Spark/Hive metadata. - */ +/** Helper methods for working with Spark/Hive metadata. */ public class SparkSchemaUtil { - private SparkSchemaUtil() { - } + private SparkSchemaUtil() {} /** * Returns a {@link Schema} for the given table with fresh field ids. - *

<p>
- * This creates a Schema for an existing table by looking up the table's schema with Spark and
+ *
+ * <p>
    This creates a Schema for an existing table by looking up the table's schema with Spark and * converting that schema. Spark/Hive partition columns are included in the schema. * * @param spark a Spark session @@ -70,8 +66,8 @@ public static Schema schemaForTable(SparkSession spark, String name) { /** * Returns a {@link PartitionSpec} for the given table. - *

<p>
- * This creates a partition spec for an existing table by looking up the table's schema and
+ *
+ * <p>
    This creates a partition spec for an existing table by looking up the table's schema and * creating a spec with identity partitions for each partition column. * * @param spark a Spark session @@ -79,14 +75,15 @@ public static Schema schemaForTable(SparkSession spark, String name) { * @return a PartitionSpec for the table * @throws AnalysisException if thrown by the Spark catalog */ - public static PartitionSpec specForTable(SparkSession spark, String name) throws AnalysisException { + public static PartitionSpec specForTable(SparkSession spark, String name) + throws AnalysisException { List parts = Lists.newArrayList(Splitter.on('.').limit(2).split(name)); String db = parts.size() == 1 ? "default" : parts.get(0); String table = parts.get(parts.size() == 1 ? 0 : 1); - PartitionSpec spec = identitySpec( - schemaForTable(spark, name), - spark.catalog().listColumns(db, table).collectAsList()); + PartitionSpec spec = + identitySpec( + schemaForTable(spark, name), spark.catalog().listColumns(db, table).collectAsList()); return spec == null ? PartitionSpec.unpartitioned() : spec; } @@ -114,13 +111,14 @@ public static DataType convert(Type type) { /** * Convert a Spark {@link StructType struct} to a {@link Schema} with new field ids. - *

<p>
- * This conversion assigns fresh ids.
- * <p>
- * Some data types are represented as the same Spark type. These are converted to a default type.
- * <p>
- * To convert using a reference schema for field ids and ambiguous types, use
- * {@link #convert(Schema, StructType)}.
+ *
+ * <p>This conversion assigns fresh ids.
+ *
+ * <p>Some data types are represented as the same Spark type. These are converted to a default
+ * type.
+ *
+ * <p>
    To convert using a reference schema for field ids and ambiguous types, use {@link + * #convert(Schema, StructType)}. * * @param sparkType a Spark StructType * @return the equivalent Schema @@ -132,16 +130,18 @@ public static Schema convert(StructType sparkType) { /** * Convert a Spark {@link StructType struct} to a {@link Schema} with new field ids. - *

    - * This conversion assigns fresh ids. - *

    - * Some data types are represented as the same Spark type. These are converted to a default type. - *

    - * To convert using a reference schema for field ids and ambiguous types, use - * {@link #convert(Schema, StructType)}. + * + *

    This conversion assigns fresh ids. + * + *

    Some data types are represented as the same Spark type. These are converted to a default + * type. + * + *

    To convert using a reference schema for field ids and ambiguous types, use {@link + * #convert(Schema, StructType)}. * * @param sparkType a Spark StructType - * @param useTimestampWithoutZone boolean flag indicates that timestamp should be stored without timezone + * @param useTimestampWithoutZone boolean flag indicates that timestamp should be stored without + * timezone * @return the equivalent Schema * @throws IllegalArgumentException if the type cannot be converted */ @@ -156,13 +156,14 @@ public static Schema convert(StructType sparkType, boolean useTimestampWithoutZo /** * Convert a Spark {@link DataType struct} to a {@link Type} with new field ids. - *

    - * This conversion assigns fresh ids. - *

    - * Some data types are represented as the same Spark type. These are converted to a default type. - *

    - * To convert using a reference schema for field ids and ambiguous types, use - * {@link #convert(Schema, StructType)}. + * + *

    This conversion assigns fresh ids. + * + *

    Some data types are represented as the same Spark type. These are converted to a default + * type. + * + *

    To convert using a reference schema for field ids and ambiguous types, use {@link + * #convert(Schema, StructType)}. * * @param sparkType a Spark DataType * @return the equivalent Type @@ -174,11 +175,11 @@ public static Type convert(DataType sparkType) { /** * Convert a Spark {@link StructType struct} to a {@link Schema} based on the given schema. - *

    - * This conversion does not assign new ids; it uses ids from the base schema. - *

    - * Data types, field order, and nullability will match the spark type. This conversion may return - * a schema that is not compatible with base schema. + * + *

    This conversion does not assign new ids; it uses ids from the base schema. + * + *

    Data types, field order, and nullability will match the spark type. This conversion may + * return a schema that is not compatible with base schema. * * @param baseSchema a Schema on which conversion is based * @param sparkType a Spark StructType @@ -187,7 +188,8 @@ public static Type convert(DataType sparkType) { */ public static Schema convert(Schema baseSchema, StructType sparkType) { // convert to a type with fresh ids - Types.StructType struct = SparkTypeVisitor.visit(sparkType, new SparkTypeToType(sparkType)).asStructType(); + Types.StructType struct = + SparkTypeVisitor.visit(sparkType, new SparkTypeToType(sparkType)).asStructType(); // reassign ids to match the base schema Schema schema = TypeUtil.reassignIds(new Schema(struct.fields()), baseSchema); // fix types that can't be represented in Spark (UUID and Fixed) @@ -196,11 +198,11 @@ public static Schema convert(Schema baseSchema, StructType sparkType) { /** * Convert a Spark {@link StructType struct} to a {@link Schema} based on the given schema. - *

    - * This conversion will assign new ids for fields that are not found in the base schema. - *

    - * Data types, field order, and nullability will match the spark type. This conversion may return - * a schema that is not compatible with base schema. + * + *

    This conversion will assign new ids for fields that are not found in the base schema. + * + *

    Data types, field order, and nullability will match the spark type. This conversion may + * return a schema that is not compatible with base schema. * * @param baseSchema a Schema on which conversion is based * @param sparkType a Spark StructType @@ -209,7 +211,8 @@ public static Schema convert(Schema baseSchema, StructType sparkType) { */ public static Schema convertWithFreshIds(Schema baseSchema, StructType sparkType) { // convert to a type with fresh ids - Types.StructType struct = SparkTypeVisitor.visit(sparkType, new SparkTypeToType(sparkType)).asStructType(); + Types.StructType struct = + SparkTypeVisitor.visit(sparkType, new SparkTypeToType(sparkType)).asStructType(); // reassign ids to match the base schema Schema schema = TypeUtil.reassignOrRefreshIds(new Schema(struct.fields()), baseSchema); // fix types that can't be represented in Spark (UUID and Fixed) @@ -218,8 +221,8 @@ public static Schema convertWithFreshIds(Schema baseSchema, StructType sparkType /** * Prune columns from a {@link Schema} using a {@link StructType Spark type} projection. - *

    - * This requires that the Spark type is a projection of the Schema. Nullability and types must + * + *

    This requires that the Spark type is a projection of the Schema. Nullability and types must * match. * * @param schema a Schema @@ -228,19 +231,20 @@ public static Schema convertWithFreshIds(Schema baseSchema, StructType sparkType * @throws IllegalArgumentException if the Spark type does not match the Schema */ public static Schema prune(Schema schema, StructType requestedType) { - return new Schema(TypeUtil.visit(schema, new PruneColumnsWithoutReordering(requestedType, ImmutableSet.of())) - .asNestedType() - .asStructType() - .fields()); + return new Schema( + TypeUtil.visit(schema, new PruneColumnsWithoutReordering(requestedType, ImmutableSet.of())) + .asNestedType() + .asStructType() + .fields()); } /** * Prune columns from a {@link Schema} using a {@link StructType Spark type} projection. - *

    - * This requires that the Spark type is a projection of the Schema. Nullability and types must + * + *

    This requires that the Spark type is a projection of the Schema. Nullability and types must * match. - *

    - * The filters list of {@link Expression} is used to ensure that columns referenced by filters + * + *

    The filters list of {@link Expression} is used to ensure that columns referenced by filters * are projected. * * @param schema a Schema @@ -251,19 +255,20 @@ public static Schema prune(Schema schema, StructType requestedType) { */ public static Schema prune(Schema schema, StructType requestedType, List filters) { Set filterRefs = Binder.boundReferences(schema.asStruct(), filters, true); - return new Schema(TypeUtil.visit(schema, new PruneColumnsWithoutReordering(requestedType, filterRefs)) - .asNestedType() - .asStructType() - .fields()); + return new Schema( + TypeUtil.visit(schema, new PruneColumnsWithoutReordering(requestedType, filterRefs)) + .asNestedType() + .asStructType() + .fields()); } /** * Prune columns from a {@link Schema} using a {@link StructType Spark type} projection. - *

    - * This requires that the Spark type is a projection of the Schema. Nullability and types must + * + *

    This requires that the Spark type is a projection of the Schema. Nullability and types must * match. - *

    - * The filters list of {@link Expression} is used to ensure that columns referenced by filters + * + *

    The filters list of {@link Expression} is used to ensure that columns referenced by filters * are projected. * * @param schema a Schema @@ -272,14 +277,16 @@ public static Schema prune(Schema schema, StructType requestedType, List filterRefs = Binder.boundReferences(schema.asStruct(), Collections.singletonList(filter), caseSensitive); - return new Schema(TypeUtil.visit(schema, new PruneColumnsWithoutReordering(requestedType, filterRefs)) - .asNestedType() - .asStructType() - .fields()); + return new Schema( + TypeUtil.visit(schema, new PruneColumnsWithoutReordering(requestedType, filterRefs)) + .asNestedType() + .asStructType() + .fields()); } private static PartitionSpec identitySpec(Schema schema, Collection columns) { @@ -309,7 +316,7 @@ private static PartitionSpec identitySpec(Schema schema, List partitionN /** * Estimate approximate table size based on Spark schema and total records. * - * @param tableSchema Spark schema + * @param tableSchema Spark schema * @param totalRecords total records in the table * @return approximate size based on table schema */ @@ -328,15 +335,18 @@ public static long estimateSize(StructType tableSchema, long totalRecords) { } public static void validateMetadataColumnReferences(Schema tableSchema, Schema readSchema) { - List conflictingColumnNames = readSchema.columns().stream() - .map(Types.NestedField::name) - .filter(name -> MetadataColumns.isMetadataColumn(name) && tableSchema.findField(name) != null) - .collect(Collectors.toList()); + List conflictingColumnNames = + readSchema.columns().stream() + .map(Types.NestedField::name) + .filter( + name -> + MetadataColumns.isMetadataColumn(name) && tableSchema.findField(name) != null) + .collect(Collectors.toList()); ValidationException.check( conflictingColumnNames.isEmpty(), - "Table column names conflict with names reserved for Iceberg metadata columns: %s.\n" + - "Please, use ALTER TABLE statements to rename the conflicting table columns.", + "Table column names conflict with names reserved for Iceberg metadata columns: %s.\n" + + "Please, use ALTER TABLE statements to rename the conflicting table columns.", conflictingColumnNames); } diff --git a/spark/v3.2/spark/src/main/java/org/apache/iceberg/spark/SparkSessionCatalog.java b/spark/v3.2/spark/src/main/java/org/apache/iceberg/spark/SparkSessionCatalog.java index 483b0d4c675a..68c72113111d 100644 --- a/spark/v3.2/spark/src/main/java/org/apache/iceberg/spark/SparkSessionCatalog.java +++ b/spark/v3.2/spark/src/main/java/org/apache/iceberg/spark/SparkSessionCatalog.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark; import java.util.Map; @@ -50,8 +49,8 @@ * * @param CatalogPlugin class to avoid casting to TableCatalog and SupportsNamespaces. */ -public class SparkSessionCatalog - extends BaseCatalog implements CatalogExtension { +public class SparkSessionCatalog extends BaseCatalog + implements CatalogExtension { private static final String[] DEFAULT_NAMESPACE = new String[] {"default"}; private String catalogName = null; @@ -64,8 +63,9 @@ public class SparkSessionCatalog /** * Build a {@link SparkCatalog} to be used for Iceberg operations. - *

    - * The default implementation creates a new SparkCatalog with the session catalog's name and options. + * + *

    The default implementation creates a new SparkCatalog with the session catalog's name and + * options. * * @param name catalog name * @param options catalog options @@ -93,17 +93,20 @@ public String[][] listNamespaces(String[] namespace) throws NoSuchNamespaceExcep } @Override - public Map loadNamespaceMetadata(String[] namespace) throws NoSuchNamespaceException { + public Map loadNamespaceMetadata(String[] namespace) + throws NoSuchNamespaceException { return getSessionCatalog().loadNamespaceMetadata(namespace); } @Override - public void createNamespace(String[] namespace, Map metadata) throws NamespaceAlreadyExistsException { + public void createNamespace(String[] namespace, Map metadata) + throws NamespaceAlreadyExistsException { getSessionCatalog().createNamespace(namespace, metadata); } @Override - public void alterNamespace(String[] namespace, NamespaceChange... changes) throws NoSuchNamespaceException { + public void alterNamespace(String[] namespace, NamespaceChange... changes) + throws NoSuchNamespaceException { getSessionCatalog().alterNamespace(namespace, changes); } @@ -136,8 +139,8 @@ public void invalidateTable(Identifier ident) { } @Override - public Table createTable(Identifier ident, StructType schema, Transform[] partitions, - Map properties) + public Table createTable( + Identifier ident, StructType schema, Transform[] partitions, Map properties) throws TableAlreadyExistsException, NoSuchNamespaceException { String provider = properties.get("provider"); if (useIceberg(provider)) { @@ -149,8 +152,8 @@ public Table createTable(Identifier ident, StructType schema, Transform[] partit } @Override - public StagedTable stageCreate(Identifier ident, StructType schema, Transform[] partitions, - Map properties) + public StagedTable stageCreate( + Identifier ident, StructType schema, Transform[] partitions, Map properties) throws TableAlreadyExistsException, NoSuchNamespaceException { String provider = properties.get("provider"); TableCatalog catalog; @@ -163,14 +166,15 @@ public StagedTable stageCreate(Identifier ident, StructType schema, Transform[] catalog = getSessionCatalog(); } - // create the table with the session catalog, then wrap it in a staged table that will delete to roll back + // create the table with the session catalog, then wrap it in a staged table that will delete to + // roll back Table table = catalog.createTable(ident, schema, partitions, properties); return new RollbackStagedTable(catalog, ident, table); } @Override - public StagedTable stageReplace(Identifier ident, StructType schema, Transform[] partitions, - Map properties) + public StagedTable stageReplace( + Identifier ident, StructType schema, Transform[] partitions, Map properties) throws NoSuchNamespaceException, NoSuchTableException { String provider = properties.get("provider"); TableCatalog catalog; @@ -189,7 +193,8 @@ public StagedTable stageReplace(Identifier ident, StructType schema, Transform[] } try { - // create the table with the session catalog, then wrap it in a staged table that will delete to roll back + // create the table with the session catalog, then wrap it in a staged table that will delete + // to roll back Table table = catalog.createTable(ident, schema, partitions, properties); return new RollbackStagedTable(catalog, ident, table); @@ -200,8 +205,9 @@ public StagedTable stageReplace(Identifier ident, StructType schema, Transform[] } @Override - public StagedTable stageCreateOrReplace(Identifier ident, StructType schema, Transform[] partitions, - Map properties) throws 
NoSuchNamespaceException { + public StagedTable stageCreateOrReplace( + Identifier ident, StructType schema, Transform[] partitions, Map properties) + throws NoSuchNamespaceException { String provider = properties.get("provider"); TableCatalog catalog; if (useIceberg(provider)) { @@ -217,7 +223,8 @@ public StagedTable stageCreateOrReplace(Identifier ident, StructType schema, Tra catalog.dropTable(ident); try { - // create the table with the session catalog, then wrap it in a staged table that will delete to roll back + // create the table with the session catalog, then wrap it in a staged table that will delete + // to roll back Table sessionCatalogTable = catalog.createTable(ident, schema, partitions, properties); return new RollbackStagedTable(catalog, ident, sessionCatalogTable); @@ -238,21 +245,25 @@ public Table alterTable(Identifier ident, TableChange... changes) throws NoSuchT @Override public boolean dropTable(Identifier ident) { - // no need to check table existence to determine which catalog to use. if a table doesn't exist then both are + // no need to check table existence to determine which catalog to use. if a table doesn't exist + // then both are // required to return false. return icebergCatalog.dropTable(ident) || getSessionCatalog().dropTable(ident); } @Override public boolean purgeTable(Identifier ident) { - // no need to check table existence to determine which catalog to use. if a table doesn't exist then both are + // no need to check table existence to determine which catalog to use. if a table doesn't exist + // then both are // required to return false. return icebergCatalog.purgeTable(ident) || getSessionCatalog().purgeTable(ident); } @Override - public void renameTable(Identifier from, Identifier to) throws NoSuchTableException, TableAlreadyExistsException { - // rename is not supported by HadoopCatalog. to avoid UnsupportedOperationException for session catalog tables, + public void renameTable(Identifier from, Identifier to) + throws NoSuchTableException, TableAlreadyExistsException { + // rename is not supported by HadoopCatalog. to avoid UnsupportedOperationException for session + // catalog tables, // check table existence first to ensure that the table belongs to the Iceberg catalog. if (icebergCatalog.tableExists(from)) { icebergCatalog.renameTable(from, to); @@ -289,15 +300,18 @@ private void validateHmsUri(String catalogHmsUri) { return; } - Preconditions.checkArgument(catalogHmsUri.equals(envHmsUri), + Preconditions.checkArgument( + catalogHmsUri.equals(envHmsUri), "Inconsistent Hive metastore URIs: %s (Spark session) != %s (spark_catalog)", - envHmsUri, catalogHmsUri); + envHmsUri, + catalogHmsUri); } @Override @SuppressWarnings("unchecked") public void setDelegateCatalog(CatalogPlugin sparkSessionCatalog) { - if (sparkSessionCatalog instanceof TableCatalog && sparkSessionCatalog instanceof SupportsNamespaces) { + if (sparkSessionCatalog instanceof TableCatalog + && sparkSessionCatalog instanceof SupportsNamespaces) { this.sessionCatalog = (T) sparkSessionCatalog; } else { throw new IllegalArgumentException("Invalid session catalog: " + sparkSessionCatalog); @@ -324,14 +338,17 @@ private boolean useIceberg(String provider) { } private T getSessionCatalog() { - Preconditions.checkNotNull(sessionCatalog, "Delegated SessionCatalog is missing. " + - "Please make sure your are replacing Spark's default catalog, named 'spark_catalog'."); + Preconditions.checkNotNull( + sessionCatalog, + "Delegated SessionCatalog is missing. 
" + + "Please make sure your are replacing Spark's default catalog, named 'spark_catalog'."); return sessionCatalog; } @Override public Catalog icebergCatalog() { - Preconditions.checkArgument(icebergCatalog instanceof HasIcebergCatalog, + Preconditions.checkArgument( + icebergCatalog instanceof HasIcebergCatalog, "Cannot return underlying Iceberg Catalog, wrapped catalog does not contain an Iceberg Catalog"); return ((HasIcebergCatalog) icebergCatalog).icebergCatalog(); } diff --git a/spark/v3.2/spark/src/main/java/org/apache/iceberg/spark/SparkStructLike.java b/spark/v3.2/spark/src/main/java/org/apache/iceberg/spark/SparkStructLike.java index 30509e3381dc..77cfa0f34c63 100644 --- a/spark/v3.2/spark/src/main/java/org/apache/iceberg/spark/SparkStructLike.java +++ b/spark/v3.2/spark/src/main/java/org/apache/iceberg/spark/SparkStructLike.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark; import org.apache.iceberg.StructLike; diff --git a/spark/v3.2/spark/src/main/java/org/apache/iceberg/spark/SparkTableCache.java b/spark/v3.2/spark/src/main/java/org/apache/iceberg/spark/SparkTableCache.java index ec587c529f41..6218423db491 100644 --- a/spark/v3.2/spark/src/main/java/org/apache/iceberg/spark/SparkTableCache.java +++ b/spark/v3.2/spark/src/main/java/org/apache/iceberg/spark/SparkTableCache.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark; import java.util.Map; diff --git a/spark/v3.2/spark/src/main/java/org/apache/iceberg/spark/SparkTableUtil.java b/spark/v3.2/spark/src/main/java/org/apache/iceberg/spark/SparkTableUtil.java index 2fedb4c74a70..3b7e063d9960 100644 --- a/spark/v3.2/spark/src/main/java/org/apache/iceberg/spark/SparkTableUtil.java +++ b/spark/v3.2/spark/src/main/java/org/apache/iceberg/spark/SparkTableUtil.java @@ -16,9 +16,10 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark; +import static org.apache.spark.sql.functions.col; + import java.io.IOException; import java.io.Serializable; import java.net.URI; @@ -96,28 +97,25 @@ import scala.collection.mutable.Builder; import scala.runtime.AbstractPartialFunction; -import static org.apache.spark.sql.functions.col; - /** * Java version of the original SparkTableUtil.scala * https://github.com/apache/iceberg/blob/apache-iceberg-0.8.0-incubating/spark/src/main/scala/org/apache/iceberg/spark/SparkTableUtil.scala */ public class SparkTableUtil { - private static final String DUPLICATE_FILE_MESSAGE = "Cannot complete import because data files " + - "to be imported already exist within the target table: %s. " + - "This is disabled by default as Iceberg is not designed for mulitple references to the same file" + - " within the same table. If you are sure, you may set 'check_duplicate_files' to false to force the import."; + private static final String DUPLICATE_FILE_MESSAGE = + "Cannot complete import because data files " + + "to be imported already exist within the target table: %s. " + + "This is disabled by default as Iceberg is not designed for mulitple references to the same file" + + " within the same table. If you are sure, you may set 'check_duplicate_files' to false to force the import."; - - private SparkTableUtil() { - } + private SparkTableUtil() {} /** * Returns a DataFrame with a row for each partition in the table. 
* - * The DataFrame has 3 columns, partition key (a=1/b=2), partition location, and format - * (avro or parquet). + *

    The DataFrame has 3 columns, partition key (a=1/b=2), partition location, and format (avro + * or parquet). * * @param spark a Spark session * @param table a table name and (optional) database @@ -125,7 +123,9 @@ private SparkTableUtil() { */ public static Dataset partitionDF(SparkSession spark, String table) { List partitions = getPartitions(spark, table); - return spark.createDataFrame(partitions, SparkPartition.class).toDF("partition", "uri", "format"); + return spark + .createDataFrame(partitions, SparkPartition.class) + .toDF("partition", "uri", "format"); } /** @@ -136,9 +136,12 @@ public static Dataset partitionDF(SparkSession spark, String table) { * @param expression The expression whose matching partitions are returned. * @return a DataFrame of the table partitions. */ - public static Dataset partitionDFByFilter(SparkSession spark, String table, String expression) { + public static Dataset partitionDFByFilter( + SparkSession spark, String table, String expression) { List partitions = getPartitionsByFilter(spark, table, expression); - return spark.createDataFrame(partitions, SparkPartition.class).toDF("partition", "uri", "format"); + return spark + .createDataFrame(partitions, SparkPartition.class) + .toDF("partition", "uri", "format"); } /** @@ -153,7 +156,8 @@ public static List getPartitions(SparkSession spark, String tabl TableIdentifier tableIdent = spark.sessionState().sqlParser().parseTableIdentifier(table); return getPartitions(spark, tableIdent, null); } catch (ParseException e) { - throw SparkExceptionUtil.toUncheckedException(e, "Unable to parse table identifier: %s", table); + throw SparkExceptionUtil.toUncheckedException( + e, "Unable to parse table identifier: %s", table); } } @@ -165,8 +169,8 @@ public static List getPartitions(SparkSession spark, String tabl * @param partitionFilter partition filter, or null if no filter * @return all table's partitions */ - public static List getPartitions(SparkSession spark, TableIdentifier tableIdent, - Map partitionFilter) { + public static List getPartitions( + SparkSession spark, TableIdentifier tableIdent, Map partitionFilter) { try { SessionCatalog catalog = spark.sessionState().catalog(); CatalogTable catalogTable = catalog.getTableMetadata(tableIdent); @@ -180,17 +184,17 @@ public static List getPartitions(SparkSession spark, TableIdenti } else { scalaPartitionFilter = Option.empty(); } - Seq partitions = catalog.listPartitions(tableIdent, scalaPartitionFilter).toIndexedSeq(); - return JavaConverters - .seqAsJavaListConverter(partitions) - .asJava() - .stream() + Seq partitions = + catalog.listPartitions(tableIdent, scalaPartitionFilter).toIndexedSeq(); + return JavaConverters.seqAsJavaListConverter(partitions).asJava().stream() .map(catalogPartition -> toSparkPartition(catalogPartition, catalogTable)) .collect(Collectors.toList()); } catch (NoSuchDatabaseException e) { - throw SparkExceptionUtil.toUncheckedException(e, "Unknown table: %s. Database not found in catalog.", tableIdent); + throw SparkExceptionUtil.toUncheckedException( + e, "Unknown table: %s. Database not found in catalog.", tableIdent); } catch (NoSuchTableException e) { - throw SparkExceptionUtil.toUncheckedException(e, "Unknown table: %s. Table not found in catalog.", tableIdent); + throw SparkExceptionUtil.toUncheckedException( + e, "Unknown table: %s. 
Table not found in catalog.", tableIdent); } } @@ -202,19 +206,22 @@ public static List getPartitions(SparkSession spark, TableIdenti * @param predicate a predicate on partition columns * @return matching table's partitions */ - public static List getPartitionsByFilter(SparkSession spark, String table, String predicate) { + public static List getPartitionsByFilter( + SparkSession spark, String table, String predicate) { TableIdentifier tableIdent; try { tableIdent = spark.sessionState().sqlParser().parseTableIdentifier(table); } catch (ParseException e) { - throw SparkExceptionUtil.toUncheckedException(e, "Unable to parse the table identifier: %s", table); + throw SparkExceptionUtil.toUncheckedException( + e, "Unable to parse the table identifier: %s", table); } Expression unresolvedPredicateExpr; try { unresolvedPredicateExpr = spark.sessionState().sqlParser().parseExpression(predicate); } catch (ParseException e) { - throw SparkExceptionUtil.toUncheckedException(e, "Unable to parse the predicate expression: %s", predicate); + throw SparkExceptionUtil.toUncheckedException( + e, "Unable to parse the predicate expression: %s", predicate); } Expression resolvedPredicateExpr = resolveAttrs(spark, table, unresolvedPredicateExpr); @@ -229,8 +236,8 @@ public static List getPartitionsByFilter(SparkSession spark, Str * @param predicateExpr a predicate expression on partition columns * @return matching table's partitions */ - public static List getPartitionsByFilter(SparkSession spark, TableIdentifier tableIdent, - Expression predicateExpr) { + public static List getPartitionsByFilter( + SparkSession spark, TableIdentifier tableIdent, Expression predicateExpr) { try { SessionCatalog catalog = spark.sessionState().catalog(); CatalogTable catalogTable = catalog.getTableMetadata(tableIdent); @@ -241,111 +248,131 @@ public static List getPartitionsByFilter(SparkSession spark, Tab } else { resolvedPredicateExpr = predicateExpr; } - Seq predicates = JavaConverters - .collectionAsScalaIterableConverter(ImmutableList.of(resolvedPredicateExpr)) - .asScala().toIndexedSeq(); + Seq predicates = + JavaConverters.collectionAsScalaIterableConverter(ImmutableList.of(resolvedPredicateExpr)) + .asScala() + .toIndexedSeq(); - Seq partitions = catalog.listPartitionsByFilter(tableIdent, predicates).toIndexedSeq(); + Seq partitions = + catalog.listPartitionsByFilter(tableIdent, predicates).toIndexedSeq(); - return JavaConverters - .seqAsJavaListConverter(partitions) - .asJava() - .stream() + return JavaConverters.seqAsJavaListConverter(partitions).asJava().stream() .map(catalogPartition -> toSparkPartition(catalogPartition, catalogTable)) .collect(Collectors.toList()); } catch (NoSuchDatabaseException e) { - throw SparkExceptionUtil.toUncheckedException(e, "Unknown table: %s. Database not found in catalog.", tableIdent); + throw SparkExceptionUtil.toUncheckedException( + e, "Unknown table: %s. Database not found in catalog.", tableIdent); } catch (NoSuchTableException e) { - throw SparkExceptionUtil.toUncheckedException(e, "Unknown table: %s. Table not found in catalog.", tableIdent); + throw SparkExceptionUtil.toUncheckedException( + e, "Unknown table: %s. Table not found in catalog.", tableIdent); } } /** * Returns the data files in a partition by listing the partition location. * - * For Parquet and ORC partitions, this will read metrics from the file footer. For Avro partitions, - * metrics are set to null. + *

    For Parquet and ORC partitions, this will read metrics from the file footer. For Avro + * partitions, metrics are set to null. * * @param partition a partition * @param conf a serializable Hadoop conf * @param metricsConfig a metrics conf * @return a List of DataFile - * @deprecated use {@link TableMigrationUtil#listPartition(Map, String, String, PartitionSpec, Configuration, - * MetricsConfig, NameMapping)} + * @deprecated use {@link TableMigrationUtil#listPartition(Map, String, String, PartitionSpec, + * Configuration, MetricsConfig, NameMapping)} */ @Deprecated - public static List listPartition(SparkPartition partition, PartitionSpec spec, - SerializableConfiguration conf, MetricsConfig metricsConfig) { + public static List listPartition( + SparkPartition partition, + PartitionSpec spec, + SerializableConfiguration conf, + MetricsConfig metricsConfig) { return listPartition(partition, spec, conf, metricsConfig, null); } /** * Returns the data files in a partition by listing the partition location. * - * For Parquet and ORC partitions, this will read metrics from the file footer. For Avro partitions, - * metrics are set to null. + *

    For Parquet and ORC partitions, this will read metrics from the file footer. For Avro + * partitions, metrics are set to null. * * @param partition a partition * @param conf a serializable Hadoop conf * @param metricsConfig a metrics conf * @param mapping a name mapping * @return a List of DataFile - * @deprecated use {@link TableMigrationUtil#listPartition(Map, String, String, PartitionSpec, Configuration, - * MetricsConfig, NameMapping)} + * @deprecated use {@link TableMigrationUtil#listPartition(Map, String, String, PartitionSpec, + * Configuration, MetricsConfig, NameMapping)} */ @Deprecated - public static List listPartition(SparkPartition partition, PartitionSpec spec, - SerializableConfiguration conf, MetricsConfig metricsConfig, - NameMapping mapping) { - return TableMigrationUtil.listPartition(partition.values, partition.uri, partition.format, spec, conf.get(), - metricsConfig, mapping); + public static List listPartition( + SparkPartition partition, + PartitionSpec spec, + SerializableConfiguration conf, + MetricsConfig metricsConfig, + NameMapping mapping) { + return TableMigrationUtil.listPartition( + partition.values, + partition.uri, + partition.format, + spec, + conf.get(), + metricsConfig, + mapping); } - - private static SparkPartition toSparkPartition(CatalogTablePartition partition, CatalogTable table) { + private static SparkPartition toSparkPartition( + CatalogTablePartition partition, CatalogTable table) { Option locationUri = partition.storage().locationUri(); Option serde = partition.storage().serde(); Preconditions.checkArgument(locationUri.nonEmpty(), "Partition URI should be defined"); - Preconditions.checkArgument(serde.nonEmpty() || table.provider().nonEmpty(), - "Partition format should be defined"); + Preconditions.checkArgument( + serde.nonEmpty() || table.provider().nonEmpty(), "Partition format should be defined"); String uri = Util.uriToString(locationUri.get()); String format = serde.nonEmpty() ? 
serde.get() : table.provider().get(); - Map partitionSpec = JavaConverters.mapAsJavaMapConverter(partition.spec()).asJava(); + Map partitionSpec = + JavaConverters.mapAsJavaMapConverter(partition.spec()).asJava(); return new SparkPartition(partitionSpec, uri, format); } private static Expression resolveAttrs(SparkSession spark, String table, Expression expr) { Function2 resolver = spark.sessionState().analyzer().resolver(); LogicalPlan plan = spark.table(table).queryExecution().analyzed(); - return expr.transform(new AbstractPartialFunction() { - @Override - public Expression apply(Expression attr) { - UnresolvedAttribute unresolvedAttribute = (UnresolvedAttribute) attr; - Option namedExpressionOption = plan.resolve(unresolvedAttribute.nameParts(), resolver); - if (namedExpressionOption.isDefined()) { - return (Expression) namedExpressionOption.get(); - } else { - throw new IllegalArgumentException( - String.format("Could not resolve %s using columns: %s", attr, plan.output())); - } - } - - @Override - public boolean isDefinedAt(Expression attr) { - return attr instanceof UnresolvedAttribute; - } - }); + return expr.transform( + new AbstractPartialFunction() { + @Override + public Expression apply(Expression attr) { + UnresolvedAttribute unresolvedAttribute = (UnresolvedAttribute) attr; + Option namedExpressionOption = + plan.resolve(unresolvedAttribute.nameParts(), resolver); + if (namedExpressionOption.isDefined()) { + return (Expression) namedExpressionOption.get(); + } else { + throw new IllegalArgumentException( + String.format("Could not resolve %s using columns: %s", attr, plan.output())); + } + } + + @Override + public boolean isDefinedAt(Expression attr) { + return attr instanceof UnresolvedAttribute; + } + }); } - private static Iterator buildManifest(SerializableConfiguration conf, PartitionSpec spec, - String basePath, Iterator> fileTuples) { + private static Iterator buildManifest( + SerializableConfiguration conf, + PartitionSpec spec, + String basePath, + Iterator> fileTuples) { if (fileTuples.hasNext()) { FileIO io = new HadoopFileIO(conf.get()); TaskContext ctx = TaskContext.get(); - String suffix = String.format("stage-%d-task-%d-manifest", ctx.stageId(), ctx.taskAttemptId()); + String suffix = + String.format("stage-%d-task-%d-manifest", ctx.stageId(), ctx.taskAttemptId()); Path location = new Path(basePath, suffix); String outputPath = FileFormat.AVRO.addExtension(location.toString()); OutputFile outputFile = io.newOutputFile(outputPath); @@ -354,7 +381,8 @@ private static Iterator buildManifest(SerializableConfiguration co try (ManifestWriter writerRef = writer) { fileTuples.forEachRemaining(fileTuple -> writerRef.add(fileTuple._2)); } catch (IOException e) { - throw SparkExceptionUtil.toUncheckedException(e, "Unable to close the manifest writer: %s", outputPath); + throw SparkExceptionUtil.toUncheckedException( + e, "Unable to close the manifest writer: %s", outputPath); } ManifestFile manifestFile = writer.toManifestFile(); @@ -367,42 +395,54 @@ private static Iterator buildManifest(SerializableConfiguration co /** * Import files from an existing Spark table to an Iceberg table. * - * The import uses the Spark session to get table metadata. It assumes no - * operation is going on the original and target table and thus is not - * thread-safe. + *

    The import uses the Spark session to get table metadata. It assumes no operation is going on + * the original and target table and thus is not thread-safe. * * @param spark a Spark session * @param sourceTableIdent an identifier of the source Spark table * @param targetTable an Iceberg table where to import the data * @param stagingDir a staging directory to store temporary manifest files - * @param partitionFilter only import partitions whose values match those in the map, can be partially defined + * @param partitionFilter only import partitions whose values match those in the map, can be + * partially defined * @param checkDuplicateFiles if true, throw exception if import results in a duplicate data file */ - public static void importSparkTable(SparkSession spark, TableIdentifier sourceTableIdent, Table targetTable, - String stagingDir, Map partitionFilter, - boolean checkDuplicateFiles) { + public static void importSparkTable( + SparkSession spark, + TableIdentifier sourceTableIdent, + Table targetTable, + String stagingDir, + Map partitionFilter, + boolean checkDuplicateFiles) { SessionCatalog catalog = spark.sessionState().catalog(); - String db = sourceTableIdent.database().nonEmpty() ? - sourceTableIdent.database().get() : - catalog.getCurrentDatabase(); - TableIdentifier sourceTableIdentWithDB = new TableIdentifier(sourceTableIdent.table(), Some.apply(db)); + String db = + sourceTableIdent.database().nonEmpty() + ? sourceTableIdent.database().get() + : catalog.getCurrentDatabase(); + TableIdentifier sourceTableIdentWithDB = + new TableIdentifier(sourceTableIdent.table(), Some.apply(db)); if (!catalog.tableExists(sourceTableIdentWithDB)) { - throw new org.apache.iceberg.exceptions.NoSuchTableException("Table %s does not exist", sourceTableIdentWithDB); + throw new org.apache.iceberg.exceptions.NoSuchTableException( + "Table %s does not exist", sourceTableIdentWithDB); } try { - PartitionSpec spec = SparkSchemaUtil.specForTable(spark, sourceTableIdentWithDB.unquotedString()); + PartitionSpec spec = + SparkSchemaUtil.specForTable(spark, sourceTableIdentWithDB.unquotedString()); if (Objects.equal(spec, PartitionSpec.unpartitioned())) { - importUnpartitionedSparkTable(spark, sourceTableIdentWithDB, targetTable, checkDuplicateFiles); + importUnpartitionedSparkTable( + spark, sourceTableIdentWithDB, targetTable, checkDuplicateFiles); } else { - List sourceTablePartitions = getPartitions(spark, sourceTableIdent, - partitionFilter); - Preconditions.checkArgument(!sourceTablePartitions.isEmpty(), - "Cannot find any partitions in table %s", sourceTableIdent); - importSparkPartitions(spark, sourceTablePartitions, targetTable, spec, stagingDir, checkDuplicateFiles); + List sourceTablePartitions = + getPartitions(spark, sourceTableIdent, partitionFilter); + Preconditions.checkArgument( + !sourceTablePartitions.isEmpty(), + "Cannot find any partitions in table %s", + sourceTableIdent); + importSparkPartitions( + spark, sourceTablePartitions, targetTable, spec, stagingDir, checkDuplicateFiles); } } catch (AnalysisException e) { throw SparkExceptionUtil.toUncheckedException( @@ -413,9 +453,8 @@ public static void importSparkTable(SparkSession spark, TableIdentifier sourceTa /** * Import files from an existing Spark table to an Iceberg table. * - * The import uses the Spark session to get table metadata. It assumes no - * operation is going on the original and target table and thus is not - * thread-safe. + *

    The import uses the Spark session to get table metadata. It assumes no operation is going on + * the original and target table and thus is not thread-safe. * * @param spark a Spark session * @param sourceTableIdent an identifier of the source Spark table @@ -423,33 +462,49 @@ public static void importSparkTable(SparkSession spark, TableIdentifier sourceTa * @param stagingDir a staging directory to store temporary manifest files * @param checkDuplicateFiles if true, throw exception if import results in a duplicate data file */ - public static void importSparkTable(SparkSession spark, TableIdentifier sourceTableIdent, Table targetTable, - String stagingDir, boolean checkDuplicateFiles) { - importSparkTable(spark, sourceTableIdent, targetTable, stagingDir, Collections.emptyMap(), checkDuplicateFiles); + public static void importSparkTable( + SparkSession spark, + TableIdentifier sourceTableIdent, + Table targetTable, + String stagingDir, + boolean checkDuplicateFiles) { + importSparkTable( + spark, + sourceTableIdent, + targetTable, + stagingDir, + Collections.emptyMap(), + checkDuplicateFiles); } /** * Import files from an existing Spark table to an Iceberg table. * - * The import uses the Spark session to get table metadata. It assumes no - * operation is going on the original and target table and thus is not - * thread-safe. + *

    The import uses the Spark session to get table metadata. It assumes no operation is going on + * the original and target table and thus is not thread-safe. + * * @param spark a Spark session * @param sourceTableIdent an identifier of the source Spark table * @param targetTable an Iceberg table where to import the data * @param stagingDir a staging directory to store temporary manifest files */ - public static void importSparkTable(SparkSession spark, TableIdentifier sourceTableIdent, Table targetTable, - String stagingDir) { - importSparkTable(spark, sourceTableIdent, targetTable, stagingDir, Collections.emptyMap(), false); + public static void importSparkTable( + SparkSession spark, TableIdentifier sourceTableIdent, Table targetTable, String stagingDir) { + importSparkTable( + spark, sourceTableIdent, targetTable, stagingDir, Collections.emptyMap(), false); } - private static void importUnpartitionedSparkTable(SparkSession spark, TableIdentifier sourceTableIdent, - Table targetTable, boolean checkDuplicateFiles) { + private static void importUnpartitionedSparkTable( + SparkSession spark, + TableIdentifier sourceTableIdent, + Table targetTable, + boolean checkDuplicateFiles) { try { CatalogTable sourceTable = spark.sessionState().catalog().getTableMetadata(sourceTableIdent); Option format = - sourceTable.storage().serde().nonEmpty() ? sourceTable.storage().serde() : sourceTable.provider(); + sourceTable.storage().serde().nonEmpty() + ? sourceTable.storage().serde() + : sourceTable.provider(); Preconditions.checkArgument(format.nonEmpty(), "Could not determine table format"); Map partition = Collections.emptyMap(); @@ -457,20 +512,34 @@ private static void importUnpartitionedSparkTable(SparkSession spark, TableIdent Configuration conf = spark.sessionState().newHadoopConf(); MetricsConfig metricsConfig = MetricsConfig.forTable(targetTable); String nameMappingString = targetTable.properties().get(TableProperties.DEFAULT_NAME_MAPPING); - NameMapping nameMapping = nameMappingString != null ? NameMappingParser.fromJson(nameMappingString) : null; - - List files = TableMigrationUtil.listPartition( - partition, Util.uriToString(sourceTable.location()), format.get(), spec, conf, metricsConfig, nameMapping); + NameMapping nameMapping = + nameMappingString != null ? 
NameMappingParser.fromJson(nameMappingString) : null; + + List files = + TableMigrationUtil.listPartition( + partition, + Util.uriToString(sourceTable.location()), + format.get(), + spec, + conf, + metricsConfig, + nameMapping); if (checkDuplicateFiles) { - Dataset importedFiles = spark.createDataset( - Lists.transform(files, f -> f.path().toString()), Encoders.STRING()).toDF("file_path"); - Dataset existingFiles = loadMetadataTable(spark, targetTable, MetadataTableType.ENTRIES); - Column joinCond = existingFiles.col("data_file.file_path").equalTo(importedFiles.col("file_path")); - Dataset duplicates = importedFiles.join(existingFiles, joinCond) - .select("file_path").as(Encoders.STRING()); - Preconditions.checkState(duplicates.isEmpty(), - String.format(DUPLICATE_FILE_MESSAGE, Joiner.on(",").join((String[]) duplicates.take(10)))); + Dataset importedFiles = + spark + .createDataset(Lists.transform(files, f -> f.path().toString()), Encoders.STRING()) + .toDF("file_path"); + Dataset existingFiles = + loadMetadataTable(spark, targetTable, MetadataTableType.ENTRIES); + Column joinCond = + existingFiles.col("data_file.file_path").equalTo(importedFiles.col("file_path")); + Dataset duplicates = + importedFiles.join(existingFiles, joinCond).select("file_path").as(Encoders.STRING()); + Preconditions.checkState( + duplicates.isEmpty(), + String.format( + DUPLICATE_FILE_MESSAGE, Joiner.on(",").join((String[]) duplicates.take(10)))); } AppendFiles append = targetTable.newAppend(); @@ -495,57 +564,75 @@ private static void importUnpartitionedSparkTable(SparkSession spark, TableIdent * @param stagingDir a staging directory to store temporary manifest files * @param checkDuplicateFiles if true, throw exception if import results in a duplicate data file */ - public static void importSparkPartitions(SparkSession spark, List partitions, Table targetTable, - PartitionSpec spec, String stagingDir, boolean checkDuplicateFiles) { + public static void importSparkPartitions( + SparkSession spark, + List partitions, + Table targetTable, + PartitionSpec spec, + String stagingDir, + boolean checkDuplicateFiles) { Configuration conf = spark.sessionState().newHadoopConf(); SerializableConfiguration serializableConf = new SerializableConfiguration(conf); - int parallelism = Math.min(partitions.size(), spark.sessionState().conf().parallelPartitionDiscoveryParallelism()); + int parallelism = + Math.min( + partitions.size(), spark.sessionState().conf().parallelPartitionDiscoveryParallelism()); int numShufflePartitions = spark.sessionState().conf().numShufflePartitions(); MetricsConfig metricsConfig = MetricsConfig.fromProperties(targetTable.properties()); String nameMappingString = targetTable.properties().get(TableProperties.DEFAULT_NAME_MAPPING); - NameMapping nameMapping = nameMappingString != null ? NameMappingParser.fromJson(nameMappingString) : null; + NameMapping nameMapping = + nameMappingString != null ? 
NameMappingParser.fromJson(nameMappingString) : null; JavaSparkContext sparkContext = JavaSparkContext.fromSparkContext(spark.sparkContext()); JavaRDD partitionRDD = sparkContext.parallelize(partitions, parallelism); - Dataset partitionDS = spark.createDataset( - partitionRDD.rdd(), - Encoders.javaSerialization(SparkPartition.class)); + Dataset partitionDS = + spark.createDataset(partitionRDD.rdd(), Encoders.javaSerialization(SparkPartition.class)); - Dataset filesToImport = partitionDS - .flatMap((FlatMapFunction) sparkPartition -> - listPartition(sparkPartition, spec, serializableConf, metricsConfig, nameMapping).iterator(), + Dataset filesToImport = + partitionDS.flatMap( + (FlatMapFunction) + sparkPartition -> + listPartition( + sparkPartition, spec, serializableConf, metricsConfig, nameMapping) + .iterator(), Encoders.javaSerialization(DataFile.class)); if (checkDuplicateFiles) { - Dataset importedFiles = filesToImport - .map((MapFunction) f -> f.path().toString(), Encoders.STRING()) - .toDF("file_path"); + Dataset importedFiles = + filesToImport + .map((MapFunction) f -> f.path().toString(), Encoders.STRING()) + .toDF("file_path"); Dataset existingFiles = loadMetadataTable(spark, targetTable, MetadataTableType.ENTRIES); - Column joinCond = existingFiles.col("data_file.file_path").equalTo(importedFiles.col("file_path")); - Dataset duplicates = importedFiles.join(existingFiles, joinCond) - .select("file_path").as(Encoders.STRING()); - Preconditions.checkState(duplicates.isEmpty(), - String.format(DUPLICATE_FILE_MESSAGE, Joiner.on(",").join((String[]) duplicates.take(10)))); + Column joinCond = + existingFiles.col("data_file.file_path").equalTo(importedFiles.col("file_path")); + Dataset duplicates = + importedFiles.join(existingFiles, joinCond).select("file_path").as(Encoders.STRING()); + Preconditions.checkState( + duplicates.isEmpty(), + String.format( + DUPLICATE_FILE_MESSAGE, Joiner.on(",").join((String[]) duplicates.take(10)))); } - List manifests = filesToImport - .repartition(numShufflePartitions) - .map((MapFunction>) file -> - Tuple2.apply(file.path().toString(), file), - Encoders.tuple(Encoders.STRING(), Encoders.javaSerialization(DataFile.class))) - .orderBy(col("_1")) - .mapPartitions( - (MapPartitionsFunction, ManifestFile>) fileTuple -> - buildManifest(serializableConf, spec, stagingDir, fileTuple), - Encoders.javaSerialization(ManifestFile.class)) - .collectAsList(); + List manifests = + filesToImport + .repartition(numShufflePartitions) + .map( + (MapFunction>) + file -> Tuple2.apply(file.path().toString(), file), + Encoders.tuple(Encoders.STRING(), Encoders.javaSerialization(DataFile.class))) + .orderBy(col("_1")) + .mapPartitions( + (MapPartitionsFunction, ManifestFile>) + fileTuple -> buildManifest(serializableConf, spec, stagingDir, fileTuple), + Encoders.javaSerialization(ManifestFile.class)) + .collectAsList(); try { - boolean snapshotIdInheritanceEnabled = PropertyUtil.propertyAsBoolean( - targetTable.properties(), - TableProperties.SNAPSHOT_ID_INHERITANCE_ENABLED, - TableProperties.SNAPSHOT_ID_INHERITANCE_ENABLED_DEFAULT); + boolean snapshotIdInheritanceEnabled = + PropertyUtil.propertyAsBoolean( + targetTable.properties(), + TableProperties.SNAPSHOT_ID_INHERITANCE_ENABLED, + TableProperties.SNAPSHOT_ID_INHERITANCE_ENABLED_DEFAULT); AppendFiles append = targetTable.newAppend(); manifests.forEach(append::appendManifest); @@ -570,13 +657,17 @@ public static void importSparkPartitions(SparkSession spark, List partitions, Table targetTable, - PartitionSpec spec, 
String stagingDir) { + public static void importSparkPartitions( + SparkSession spark, + List partitions, + Table targetTable, + PartitionSpec spec, + String stagingDir) { importSparkPartitions(spark, partitions, targetTable, spec, stagingDir, false); } - public static List filterPartitions(List partitions, - Map partitionFilter) { + public static List filterPartitions( + List partitions, Map partitionFilter) { if (partitionFilter.isEmpty()) { return partitions; } else { @@ -597,28 +688,30 @@ private static void deleteManifests(FileIO io, List manifests) { /** * Loads a metadata table. * - * @deprecated since 0.14.0, will be removed in 0.15.0; - * use {@link #loadMetadataTable(SparkSession, Table, MetadataTableType)}. + * @deprecated since 0.14.0, will be removed in 0.15.0; use {@link + * #loadMetadataTable(SparkSession, Table, MetadataTableType)}. */ @Deprecated - public static Dataset loadCatalogMetadataTable(SparkSession spark, Table table, MetadataTableType type) { + public static Dataset loadCatalogMetadataTable( + SparkSession spark, Table table, MetadataTableType type) { return loadMetadataTable(spark, table, type); } - public static Dataset loadMetadataTable(SparkSession spark, Table table, MetadataTableType type) { + public static Dataset loadMetadataTable( + SparkSession spark, Table table, MetadataTableType type) { return loadMetadataTable(spark, table, type, ImmutableMap.of()); } - public static Dataset loadMetadataTable(SparkSession spark, Table table, MetadataTableType type, - Map extraOptions) { - SparkTable metadataTable = new SparkTable(MetadataTableUtils.createMetadataTableInstance(table, type), false); + public static Dataset loadMetadataTable( + SparkSession spark, Table table, MetadataTableType type, Map extraOptions) { + SparkTable metadataTable = + new SparkTable(MetadataTableUtils.createMetadataTableInstance(table, type), false); CaseInsensitiveStringMap options = new CaseInsensitiveStringMap(extraOptions); - return Dataset.ofRows(spark, DataSourceV2Relation.create(metadataTable, Some.empty(), Some.empty(), options)); + return Dataset.ofRows( + spark, DataSourceV2Relation.create(metadataTable, Some.empty(), Some.empty(), options)); } - /** - * Class representing a table partition. - */ + /** Class representing a table partition. */ public static class SparkPartition implements Serializable { private final Map values; private final String uri; @@ -660,9 +753,9 @@ public boolean equals(Object o) { return false; } SparkPartition that = (SparkPartition) o; - return Objects.equal(values, that.values) && - Objects.equal(uri, that.uri) && - Objects.equal(format, that.format); + return Objects.equal(values, that.values) + && Objects.equal(uri, that.uri) + && Objects.equal(format, that.format); } @Override diff --git a/spark/v3.2/spark/src/main/java/org/apache/iceberg/spark/SparkTypeToType.java b/spark/v3.2/spark/src/main/java/org/apache/iceberg/spark/SparkTypeToType.java index f0b8b2a9762b..17499736fbeb 100644 --- a/spark/v3.2/spark/src/main/java/org/apache/iceberg/spark/SparkTypeToType.java +++ b/spark/v3.2/spark/src/main/java/org/apache/iceberg/spark/SparkTypeToType.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.spark; import java.util.List; @@ -70,7 +69,7 @@ public Type struct(StructType struct, List types) { List newFields = Lists.newArrayListWithExpectedSize(fields.length); boolean isRoot = root == struct; for (int i = 0; i < fields.length; i += 1) { - StructField field = fields[i]; + StructField field = fields[i]; Type type = types.get(i); int id; @@ -122,10 +121,9 @@ public Type atomic(DataType atomic) { if (atomic instanceof BooleanType) { return Types.BooleanType.get(); - } else if ( - atomic instanceof IntegerType || - atomic instanceof ShortType || - atomic instanceof ByteType) { + } else if (atomic instanceof IntegerType + || atomic instanceof ShortType + || atomic instanceof ByteType) { return Types.IntegerType.get(); } else if (atomic instanceof LongType) { @@ -137,10 +135,9 @@ public Type atomic(DataType atomic) { } else if (atomic instanceof DoubleType) { return Types.DoubleType.get(); - } else if ( - atomic instanceof StringType || - atomic instanceof CharType || - atomic instanceof VarcharType) { + } else if (atomic instanceof StringType + || atomic instanceof CharType + || atomic instanceof VarcharType) { return Types.StringType.get(); } else if (atomic instanceof DateType) { @@ -151,13 +148,11 @@ public Type atomic(DataType atomic) { } else if (atomic instanceof DecimalType) { return Types.DecimalType.of( - ((DecimalType) atomic).precision(), - ((DecimalType) atomic).scale()); + ((DecimalType) atomic).precision(), ((DecimalType) atomic).scale()); } else if (atomic instanceof BinaryType) { return Types.BinaryType.get(); } - throw new UnsupportedOperationException( - "Not a supported type: " + atomic.catalogString()); + throw new UnsupportedOperationException("Not a supported type: " + atomic.catalogString()); } } diff --git a/spark/v3.2/spark/src/main/java/org/apache/iceberg/spark/SparkTypeVisitor.java b/spark/v3.2/spark/src/main/java/org/apache/iceberg/spark/SparkTypeVisitor.java index 83b31940711e..1ef694263fa4 100644 --- a/spark/v3.2/spark/src/main/java/org/apache/iceberg/spark/SparkTypeVisitor.java +++ b/spark/v3.2/spark/src/main/java/org/apache/iceberg/spark/SparkTypeVisitor.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.spark; import java.util.List; @@ -35,26 +34,22 @@ static T visit(DataType type, SparkTypeVisitor visitor) { List fieldResults = Lists.newArrayListWithExpectedSize(fields.length); for (StructField field : fields) { - fieldResults.add(visitor.field( - field, - visit(field.dataType(), visitor))); + fieldResults.add(visitor.field(field, visit(field.dataType(), visitor))); } return visitor.struct((StructType) type, fieldResults); } else if (type instanceof MapType) { - return visitor.map((MapType) type, + return visitor.map( + (MapType) type, visit(((MapType) type).keyType(), visitor), visit(((MapType) type).valueType(), visitor)); } else if (type instanceof ArrayType) { - return visitor.array( - (ArrayType) type, - visit(((ArrayType) type).elementType(), visitor)); + return visitor.array((ArrayType) type, visit(((ArrayType) type).elementType(), visitor)); } else if (type instanceof UserDefinedType) { - throw new UnsupportedOperationException( - "User-defined types are not supported"); + throw new UnsupportedOperationException("User-defined types are not supported"); } else { return visitor.atomic(type); diff --git a/spark/v3.2/spark/src/main/java/org/apache/iceberg/spark/SparkUtil.java b/spark/v3.2/spark/src/main/java/org/apache/iceberg/spark/SparkUtil.java index 7754aa406123..950ed7bc87b8 100644 --- a/spark/v3.2/spark/src/main/java/org/apache/iceberg/spark/SparkUtil.java +++ b/spark/v3.2/spark/src/main/java/org/apache/iceberg/spark/SparkUtil.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark; import java.sql.Date; @@ -54,26 +53,33 @@ public class SparkUtil { - public static final String TIMESTAMP_WITHOUT_TIMEZONE_ERROR = String.format("Cannot handle timestamp without" + - " timezone fields in Spark. Spark does not natively support this type but if you would like to handle all" + - " timestamps as timestamp with timezone set '%s' to true. This will not change the underlying values stored" + - " but will change their displayed values in Spark. For more information please see" + - " https://docs.databricks.com/spark/latest/dataframes-datasets/dates-timestamps.html#ansi-sql-and" + - "-spark-sql-timestamps", SparkSQLProperties.HANDLE_TIMESTAMP_WITHOUT_TIMEZONE); + public static final String TIMESTAMP_WITHOUT_TIMEZONE_ERROR = + String.format( + "Cannot handle timestamp without" + + " timezone fields in Spark. Spark does not natively support this type but if you would like to handle all" + + " timestamps as timestamp with timezone set '%s' to true. This will not change the underlying values stored" + + " but will change their displayed values in Spark. For more information please see" + + " https://docs.databricks.com/spark/latest/dataframes-datasets/dates-timestamps.html#ansi-sql-and" + + "-spark-sql-timestamps", + SparkSQLProperties.HANDLE_TIMESTAMP_WITHOUT_TIMEZONE); private static final String SPARK_CATALOG_CONF_PREFIX = "spark.sql.catalog"; - // Format string used as the prefix for spark configuration keys to override hadoop configuration values - // for Iceberg tables from a given catalog. These keys can be specified as `spark.sql.catalog.$catalogName.hadoop.*`, - // similar to using `spark.hadoop.*` to override hadoop configurations globally for a given spark session. 
- private static final String SPARK_CATALOG_HADOOP_CONF_OVERRIDE_FMT_STR = SPARK_CATALOG_CONF_PREFIX + ".%s.hadoop."; + // Format string used as the prefix for spark configuration keys to override hadoop configuration + // values + // for Iceberg tables from a given catalog. These keys can be specified as + // `spark.sql.catalog.$catalogName.hadoop.*`, + // similar to using `spark.hadoop.*` to override hadoop configurations globally for a given spark + // session. + private static final String SPARK_CATALOG_HADOOP_CONF_OVERRIDE_FMT_STR = + SPARK_CATALOG_CONF_PREFIX + ".%s.hadoop."; - private SparkUtil() { - } + private SparkUtil() {} public static FileIO serializableFileIO(Table table) { if (table.io() instanceof HadoopConfigurable) { // we need to use Spark's SerializableConfiguration to avoid issues with Kryo serialization - ((HadoopConfigurable) table.io()).serializeConfWith(conf -> new SerializableConfiguration(conf)::value); + ((HadoopConfigurable) table.io()) + .serializeConfWith(conf -> new SerializableConfiguration(conf)::value); } return table.io(); @@ -87,11 +93,12 @@ public static FileIO serializableFileIO(Table table) { */ public static void validatePartitionTransforms(PartitionSpec spec) { if (spec.fields().stream().anyMatch(field -> field.transform() instanceof UnknownTransform)) { - String unsupported = spec.fields().stream() - .map(PartitionField::transform) - .filter(transform -> transform instanceof UnknownTransform) - .map(Transform::toString) - .collect(Collectors.joining(", ")); + String unsupported = + spec.fields().stream() + .map(PartitionField::transform) + .filter(transform -> transform instanceof UnknownTransform) + .map(Transform::toString) + .collect(Collectors.joining(", ")); throw new UnsupportedOperationException( String.format("Cannot write using unsupported transforms: %s", unsupported)); @@ -99,18 +106,20 @@ public static void validatePartitionTransforms(PartitionSpec spec) { } /** - * A modified version of Spark's LookupCatalog.CatalogAndIdentifier.unapply - * Attempts to find the catalog and identifier a multipart identifier represents + * A modified version of Spark's LookupCatalog.CatalogAndIdentifier.unapply Attempts to find the + * catalog and identifier a multipart identifier represents + * * @param nameParts Multipart identifier representing a table * @return The CatalogPlugin and Identifier for the table */ - public static Pair catalogAndIdentifier(List nameParts, - Function catalogProvider, - BiFunction identiferProvider, - C currentCatalog, - String[] currentNamespace) { - Preconditions.checkArgument(!nameParts.isEmpty(), - "Cannot determine catalog and identifier from empty name"); + public static Pair catalogAndIdentifier( + List nameParts, + Function catalogProvider, + BiFunction identiferProvider, + C currentCatalog, + String[] currentNamespace) { + Preconditions.checkArgument( + !nameParts.isEmpty(), "Cannot determine catalog and identifier from empty name"); int lastElementIndex = nameParts.size() - 1; String name = nameParts.get(lastElementIndex); @@ -122,7 +131,7 @@ public static Pair catalogAndIdentifier(List nameParts, C catalog = catalogProvider.apply(nameParts.get(0)); if (catalog == null) { // The first element was not a valid catalog, treat it like part of the namespace - String[] namespace = nameParts.subList(0, lastElementIndex).toArray(new String[0]); + String[] namespace = nameParts.subList(0, lastElementIndex).toArray(new String[0]); return Pair.of(currentCatalog, identiferProvider.apply(namespace, name)); } else { // Assume 
the first element is a valid catalog @@ -134,6 +143,7 @@ public static Pair catalogAndIdentifier(List nameParts, /** * Responsible for checking if the table schema has a timestamp without timezone column + * * @param schema table schema to check if it contains a timestamp without timezone column * @return boolean indicating if the schema passed in has a timestamp field without a timezone */ @@ -143,15 +153,17 @@ public static boolean hasTimestampWithoutZone(Schema schema) { /** * Checks whether timestamp types for new tables should be stored with timezone info. - *
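// Illustrative sketch (not part of this patch): exercising the catalogAndIdentifier helper above
// with plain String catalog/identifier types and toy providers; "demo", "db", "tbl",
// "spark_catalog" and the lambdas are assumptions for the example only.

import java.util.Arrays;
import org.apache.iceberg.spark.SparkUtil;
import org.apache.iceberg.util.Pair;

Pair<String, String> resolved =
    SparkUtil.catalogAndIdentifier(
        Arrays.asList("demo", "db", "tbl"),
        name -> "demo".equals(name) ? name : null,                     // catalogProvider
        (namespace, name) -> String.join(".", namespace) + "." + name, // identifierProvider
        "spark_catalog",                                               // current catalog
        new String[] {"default"});                                     // current namespace
// resolved holds ("demo", "db.tbl"); had "demo" not resolved to a catalog, the first element
// would have been folded into the namespace and the current catalog used instead.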
    - * The default value is false and all timestamp fields are stored as {@link Types.TimestampType#withZone()}. - * If enabled, all timestamp fields in new tables will be stored as {@link Types.TimestampType#withoutZone()}. + * + *
    The default value is false and all timestamp fields are stored as {@link + * Types.TimestampType#withZone()}. If enabled, all timestamp fields in new tables will be stored + * as {@link Types.TimestampType#withoutZone()}. * * @param sessionConf a Spark runtime config * @return true if timestamp types for new tables should be stored with timezone info */ public static boolean useTimestampWithoutZoneInNewTables(RuntimeConfig sessionConf) { - String sessionConfValue = sessionConf.get(SparkSQLProperties.USE_TIMESTAMP_WITHOUT_TIME_ZONE_IN_NEW_TABLES, null); + String sessionConfValue = + sessionConf.get(SparkSQLProperties.USE_TIMESTAMP_WITHOUT_TIME_ZONE_IN_NEW_TABLES, null); if (sessionConfValue != null) { return Boolean.parseBoolean(sessionConfValue); } @@ -159,32 +171,40 @@ public static boolean useTimestampWithoutZoneInNewTables(RuntimeConfig sessionCo } /** - * Pulls any Catalog specific overrides for the Hadoop conf from the current SparkSession, which can be - * set via `spark.sql.catalog.$catalogName.hadoop.*` + * Pulls any Catalog specific overrides for the Hadoop conf from the current SparkSession, which + * can be set via `spark.sql.catalog.$catalogName.hadoop.*` * - * Mirrors the override of hadoop configurations for a given spark session using `spark.hadoop.*`. + *
    Mirrors the override of hadoop configurations for a given spark session using + * `spark.hadoop.*`. * - * The SparkCatalog allows for hadoop configurations to be overridden per catalog, by setting + *
    The SparkCatalog allows for hadoop configurations to be overridden per catalog, by setting * them on the SQLConf, where the following will add the property "fs.default.name" with value - * "hdfs://hanksnamenode:8020" to the catalog's hadoop configuration. - * SparkSession.builder() - * .config(s"spark.sql.catalog.$catalogName.hadoop.fs.default.name", "hdfs://hanksnamenode:8020") - * .getOrCreate() + * "hdfs://hanksnamenode:8020" to the catalog's hadoop configuration. SparkSession.builder() + * .config(s"spark.sql.catalog.$catalogName.hadoop.fs.default.name", "hdfs://hanksnamenode:8020") + * .getOrCreate() + * * @param spark The current Spark session * @param catalogName Name of the catalog to find overrides for. - * @return the Hadoop Configuration that should be used for this catalog, with catalog specific overrides applied. + * @return the Hadoop Configuration that should be used for this catalog, with catalog specific + * overrides applied. */ public static Configuration hadoopConfCatalogOverrides(SparkSession spark, String catalogName) { // Find keys for the catalog intended to be hadoop configurations final String hadoopConfCatalogPrefix = hadoopConfPrefixForCatalog(catalogName); final Configuration conf = spark.sessionState().newHadoopConf(); - spark.sqlContext().conf().settings().forEach((k, v) -> { - // These checks are copied from `spark.sessionState().newHadoopConfWithOptions()`, which we - // avoid using to not have to convert back and forth between scala / java map types. - if (v != null && k != null && k.startsWith(hadoopConfCatalogPrefix)) { - conf.set(k.substring(hadoopConfCatalogPrefix.length()), v); - } - }); + spark + .sqlContext() + .conf() + .settings() + .forEach( + (k, v) -> { + // These checks are copied from `spark.sessionState().newHadoopConfWithOptions()`, + // which we + // avoid using to not have to convert back and forth between scala / java map types. + if (v != null && k != null && k.startsWith(hadoopConfCatalogPrefix)) { + conf.set(k.substring(hadoopConfCatalogPrefix.length()), v); + } + }); return conf; } @@ -196,12 +216,12 @@ private static String hadoopConfPrefixForCatalog(String catalogName) { * Get a List of Spark filter Expression. * * @param schema table schema - * @param filters filters in the format of a Map, where key is one of the table column name, - * and value is the specific value to be filtered on the column. + * @param filters filters in the format of a Map, where key is one of the table column name, and + * value is the specific value to be filtered on the column. * @return a List of filters in the format of Spark Expression. 
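// Illustrative sketch (not part of this patch): wiring up the two SparkUtil helpers documented
// above. The catalog name "demo", the fs.default.name value, the warehouse path and the
// schema/filter values are assumptions for the example only.

import java.util.List;
import java.util.Map;
import org.apache.hadoop.conf.Configuration;
import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap;
import org.apache.iceberg.spark.SparkUtil;
import org.apache.spark.sql.SparkSession;
import org.apache.spark.sql.catalyst.expressions.Expression;
import org.apache.spark.sql.types.DataTypes;
import org.apache.spark.sql.types.StructType;

SparkSession spark =
    SparkSession.builder()
        .master("local[2]")
        .config("spark.sql.catalog.demo", "org.apache.iceberg.spark.SparkCatalog")
        .config("spark.sql.catalog.demo.type", "hadoop")
        .config("spark.sql.catalog.demo.warehouse", "file:///tmp/warehouse")
        // per-catalog Hadoop override, picked up by hadoopConfCatalogOverrides below
        .config("spark.sql.catalog.demo.hadoop.fs.default.name", "hdfs://namenode:8020")
        .getOrCreate();

// starts from the session Hadoop conf and applies the spark.sql.catalog.demo.hadoop.* keys
Configuration conf = SparkUtil.hadoopConfCatalogOverrides(spark, "demo");

// string-valued partition filters are parsed into typed Spark EqualTo expressions
StructType schema = new StructType().add("id", DataTypes.LongType).add("dt", DataTypes.StringType);
Map<String, String> filters = ImmutableMap.of("id", "42", "dt", "2022-03-01");
List<Expression> exprs = SparkUtil.partitionMapToExpression(schema, filters);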
*/ - public static List partitionMapToExpression(StructType schema, - Map filters) { + public static List partitionMapToExpression( + StructType schema, Map filters) { List filterExpressions = Lists.newArrayList(); for (Map.Entry entry : filters.entrySet()) { try { @@ -210,38 +230,55 @@ public static List partitionMapToExpression(StructType schema, BoundReference ref = new BoundReference(index, dataType, true); switch (dataType.typeName()) { case "integer": - filterExpressions.add(new EqualTo(ref, - Literal.create(Integer.parseInt(entry.getValue()), DataTypes.IntegerType))); + filterExpressions.add( + new EqualTo( + ref, + Literal.create(Integer.parseInt(entry.getValue()), DataTypes.IntegerType))); break; case "string": - filterExpressions.add(new EqualTo(ref, Literal.create(entry.getValue(), DataTypes.StringType))); + filterExpressions.add( + new EqualTo(ref, Literal.create(entry.getValue(), DataTypes.StringType))); break; case "short": - filterExpressions.add(new EqualTo(ref, - Literal.create(Short.parseShort(entry.getValue()), DataTypes.ShortType))); + filterExpressions.add( + new EqualTo( + ref, Literal.create(Short.parseShort(entry.getValue()), DataTypes.ShortType))); break; case "long": - filterExpressions.add(new EqualTo(ref, - Literal.create(Long.parseLong(entry.getValue()), DataTypes.LongType))); + filterExpressions.add( + new EqualTo( + ref, Literal.create(Long.parseLong(entry.getValue()), DataTypes.LongType))); break; case "float": - filterExpressions.add(new EqualTo(ref, - Literal.create(Float.parseFloat(entry.getValue()), DataTypes.FloatType))); + filterExpressions.add( + new EqualTo( + ref, Literal.create(Float.parseFloat(entry.getValue()), DataTypes.FloatType))); break; case "double": - filterExpressions.add(new EqualTo(ref, - Literal.create(Double.parseDouble(entry.getValue()), DataTypes.DoubleType))); + filterExpressions.add( + new EqualTo( + ref, + Literal.create(Double.parseDouble(entry.getValue()), DataTypes.DoubleType))); break; case "date": - filterExpressions.add(new EqualTo(ref, - Literal.create(new Date(DateTime.parse(entry.getValue()).getMillis()), DataTypes.DateType))); + filterExpressions.add( + new EqualTo( + ref, + Literal.create( + new Date(DateTime.parse(entry.getValue()).getMillis()), + DataTypes.DateType))); break; case "timestamp": - filterExpressions.add(new EqualTo(ref, - Literal.create(new Timestamp(DateTime.parse(entry.getValue()).getMillis()), DataTypes.TimestampType))); + filterExpressions.add( + new EqualTo( + ref, + Literal.create( + new Timestamp(DateTime.parse(entry.getValue()).getMillis()), + DataTypes.TimestampType))); break; default: - throw new IllegalStateException("Unexpected data type in partition filters: " + dataType); + throw new IllegalStateException( + "Unexpected data type in partition filters: " + dataType); } } catch (IllegalArgumentException e) { // ignore if filter is not on table columns diff --git a/spark/v3.2/spark/src/main/java/org/apache/iceberg/spark/SparkValueConverter.java b/spark/v3.2/spark/src/main/java/org/apache/iceberg/spark/SparkValueConverter.java index b3e6b2f48887..5a5381099c76 100644 --- a/spark/v3.2/spark/src/main/java/org/apache/iceberg/spark/SparkValueConverter.java +++ b/spark/v3.2/spark/src/main/java/org/apache/iceberg/spark/SparkValueConverter.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.spark; import java.nio.ByteBuffer; @@ -34,13 +33,10 @@ import org.apache.spark.sql.Row; import org.apache.spark.sql.catalyst.util.DateTimeUtils; -/** - * A utility class that converts Spark values to Iceberg's internal representation. - */ +/** A utility class that converts Spark values to Iceberg's internal representation. */ public class SparkValueConverter { - private SparkValueConverter() { - } + private SparkValueConverter() {} public static Record convert(Schema schema, Row row) { return convert(schema.asStruct(), row); diff --git a/spark/v3.2/spark/src/main/java/org/apache/iceberg/spark/SparkWriteConf.java b/spark/v3.2/spark/src/main/java/org/apache/iceberg/spark/SparkWriteConf.java index 047a0b8169ed..e75830f820ff 100644 --- a/spark/v3.2/spark/src/main/java/org/apache/iceberg/spark/SparkWriteConf.java +++ b/spark/v3.2/spark/src/main/java/org/apache/iceberg/spark/SparkWriteConf.java @@ -16,9 +16,12 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark; +import static org.apache.iceberg.DistributionMode.HASH; +import static org.apache.iceberg.DistributionMode.NONE; +import static org.apache.iceberg.DistributionMode.RANGE; + import java.util.Locale; import java.util.Map; import org.apache.iceberg.DistributionMode; @@ -31,24 +34,23 @@ import org.apache.spark.sql.RuntimeConfig; import org.apache.spark.sql.SparkSession; -import static org.apache.iceberg.DistributionMode.HASH; -import static org.apache.iceberg.DistributionMode.NONE; -import static org.apache.iceberg.DistributionMode.RANGE; - /** * A class for common Iceberg configs for Spark writes. - *
- * If a config is set at multiple levels, the following order of precedence is used (top to bottom): + * + *
If a config is set at multiple levels, the following order of precedence is used (top to + * bottom): + * *
- * <ul>
- *   <li>Write options</li>
- *   <li>Session configuration</li>
- *   <li>Table metadata</li>
- * </ul>
+ * <ul>
+ *   <li>Write options
+ *   <li>Session configuration
+ *   <li>Table metadata
+ * </ul>
    - * The most specific value is set in write options and takes precedence over all other configs. - * If no write option is provided, this class checks the session configuration for any overrides. - * If no applicable value is found in the session configuration, this class uses the table metadata. - *
    - * Note this class is NOT meant to be serialized and sent to executors. + * + * The most specific value is set in write options and takes precedence over all other configs. If + * no write option is provided, this class checks the session configuration for any overrides. If no + * applicable value is found in the session configuration, this class uses the table metadata. + * + *
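// Illustrative sketch (not part of this patch) of the precedence described above, using the data
// file format as the example config. The table name "demo.db.events" is an assumption, and
// `spark` / `df` are assumed to be an existing SparkSession and Dataset<Row>.

// table metadata level: the fallback when nothing more specific is set
spark.sql("ALTER TABLE demo.db.events SET TBLPROPERTIES ('write.format.default'='parquet')");

// write option level: most specific, wins for this particular write
df.writeTo("demo.db.events").option("write-format", "orc").append();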
    Note this class is NOT meant to be serialized and sent to executors. */ public class SparkWriteConf { @@ -65,7 +67,8 @@ public SparkWriteConf(SparkSession spark, Table table, Map write } public boolean checkNullability() { - return confParser.booleanConf() + return confParser + .booleanConf() .option(SparkWriteOptions.CHECK_NULLABILITY) .sessionConf(SparkSQLProperties.CHECK_NULLABILITY) .defaultValue(SparkSQLProperties.CHECK_NULLABILITY_DEFAULT) @@ -73,7 +76,8 @@ public boolean checkNullability() { } public boolean checkOrdering() { - return confParser.booleanConf() + return confParser + .booleanConf() .option(SparkWriteOptions.CHECK_ORDERING) .sessionConf(SparkSQLProperties.CHECK_ORDERING) .defaultValue(SparkSQLProperties.CHECK_ORDERING_DEFAULT) @@ -82,18 +86,20 @@ public boolean checkOrdering() { /** * Enables writing a timestamp with time zone as a timestamp without time zone. - *
    - * Generally, this is not safe as a timestamp without time zone is supposed to represent the wall-clock time, - * i.e. no matter the reader/writer timezone 3PM should always be read as 3PM, - * but a timestamp with time zone represents instant semantics, i.e. the timestamp - * is adjusted so that the corresponding time in the reader timezone is displayed. - *
- * When set to false (default), an exception must be thrown if the table contains a timestamp without time zone. + * + *
    Generally, this is not safe as a timestamp without time zone is supposed to represent the + * wall-clock time, i.e. no matter the reader/writer timezone 3PM should always be read as 3PM, + * but a timestamp with time zone represents instant semantics, i.e. the timestamp is adjusted so + * that the corresponding time in the reader timezone is displayed. + * + *
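// Illustrative sketch (not part of this patch): the flag above can come from the session conf or
// from a per-write option; "demo.db.events" and `spark` / `df` are assumptions for the example.

import org.apache.iceberg.spark.SparkSQLProperties;
import org.apache.iceberg.spark.SparkWriteOptions;

spark.conf().set(SparkSQLProperties.HANDLE_TIMESTAMP_WITHOUT_TIMEZONE, "true");

df.writeTo("demo.db.events")
    .option(SparkWriteOptions.HANDLE_TIMESTAMP_WITHOUT_TIMEZONE, "true")
    .append();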
    When set to false (default), an exception must be thrown if the table contains a timestamp + * without time zone. * * @return boolean indicating if writing timestamps without timezone is allowed */ public boolean handleTimestampWithoutZone() { - return confParser.booleanConf() + return confParser + .booleanConf() .option(SparkWriteOptions.HANDLE_TIMESTAMP_WITHOUT_TIMEZONE) .sessionConf(SparkSQLProperties.HANDLE_TIMESTAMP_WITHOUT_TIMEZONE) .defaultValue(SparkSQLProperties.HANDLE_TIMESTAMP_WITHOUT_TIMEZONE_DEFAULT) @@ -106,7 +112,8 @@ public String overwriteMode() { } public boolean wapEnabled() { - return confParser.booleanConf() + return confParser + .booleanConf() .tableProperty(TableProperties.WRITE_AUDIT_PUBLISH_ENABLED) .defaultValue(TableProperties.WRITE_AUDIT_PUBLISH_ENABLED_DEFAULT) .parse(); @@ -117,7 +124,8 @@ public String wapId() { } public boolean mergeSchema() { - return confParser.booleanConf() + return confParser + .booleanConf() .option(SparkWriteOptions.MERGE_SCHEMA) .option(SparkWriteOptions.SPARK_MERGE_SCHEMA) .defaultValue(SparkWriteOptions.MERGE_SCHEMA_DEFAULT) @@ -125,16 +133,19 @@ public boolean mergeSchema() { } public FileFormat dataFileFormat() { - String valueAsString = confParser.stringConf() - .option(SparkWriteOptions.WRITE_FORMAT) - .tableProperty(TableProperties.DEFAULT_FILE_FORMAT) - .defaultValue(TableProperties.DEFAULT_FILE_FORMAT_DEFAULT) - .parse(); + String valueAsString = + confParser + .stringConf() + .option(SparkWriteOptions.WRITE_FORMAT) + .tableProperty(TableProperties.DEFAULT_FILE_FORMAT) + .defaultValue(TableProperties.DEFAULT_FILE_FORMAT_DEFAULT) + .parse(); return FileFormat.valueOf(valueAsString.toUpperCase(Locale.ENGLISH)); } public long targetDataFileSize() { - return confParser.longConf() + return confParser + .longConf() .option(SparkWriteOptions.TARGET_FILE_SIZE_BYTES) .tableProperty(TableProperties.WRITE_TARGET_FILE_SIZE_BYTES) .defaultValue(TableProperties.WRITE_TARGET_FILE_SIZE_BYTES_DEFAULT) @@ -142,7 +153,8 @@ public long targetDataFileSize() { } public boolean fanoutWriterEnabled() { - return confParser.booleanConf() + return confParser + .booleanConf() .option(SparkWriteOptions.FANOUT_ENABLED) .tableProperty(TableProperties.SPARK_WRITE_PARTITIONED_FANOUT_ENABLED) .defaultValue(TableProperties.SPARK_WRITE_PARTITIONED_FANOUT_ENABLED_DEFAULT) @@ -150,15 +162,20 @@ public boolean fanoutWriterEnabled() { } public FileFormat deleteFileFormat() { - String valueAsString = confParser.stringConf() - .option(SparkWriteOptions.DELETE_FORMAT) - .tableProperty(TableProperties.DELETE_DEFAULT_FILE_FORMAT) - .parseOptional(); - return valueAsString != null ? FileFormat.valueOf(valueAsString.toUpperCase(Locale.ENGLISH)) : dataFileFormat(); + String valueAsString = + confParser + .stringConf() + .option(SparkWriteOptions.DELETE_FORMAT) + .tableProperty(TableProperties.DELETE_DEFAULT_FILE_FORMAT) + .parseOptional(); + return valueAsString != null + ? 
FileFormat.valueOf(valueAsString.toUpperCase(Locale.ENGLISH)) + : dataFileFormat(); } public long targetDeleteFileSize() { - return confParser.longConf() + return confParser + .longConf() .option(SparkWriteOptions.TARGET_DELETE_FILE_SIZE_BYTES) .tableProperty(TableProperties.DELETE_TARGET_FILE_SIZE_BYTES) .defaultValue(TableProperties.DELETE_TARGET_FILE_SIZE_BYTES_DEFAULT) @@ -168,26 +185,31 @@ public long targetDeleteFileSize() { public Map extraSnapshotMetadata() { Map extraSnapshotMetadata = Maps.newHashMap(); - writeOptions.forEach((key, value) -> { - if (key.startsWith(SnapshotSummary.EXTRA_METADATA_PREFIX)) { - extraSnapshotMetadata.put(key.substring(SnapshotSummary.EXTRA_METADATA_PREFIX.length()), value); - } - }); + writeOptions.forEach( + (key, value) -> { + if (key.startsWith(SnapshotSummary.EXTRA_METADATA_PREFIX)) { + extraSnapshotMetadata.put( + key.substring(SnapshotSummary.EXTRA_METADATA_PREFIX.length()), value); + } + }); return extraSnapshotMetadata; } public String rewrittenFileSetId() { - return confParser.stringConf() + return confParser + .stringConf() .option(SparkWriteOptions.REWRITTEN_FILE_SCAN_TASK_SET_ID) .parseOptional(); } public DistributionMode distributionMode() { - String modeName = confParser.stringConf() - .option(SparkWriteOptions.DISTRIBUTION_MODE) - .tableProperty(TableProperties.WRITE_DISTRIBUTION_MODE) - .parseOptional(); + String modeName = + confParser + .stringConf() + .option(SparkWriteOptions.DISTRIBUTION_MODE) + .tableProperty(TableProperties.WRITE_DISTRIBUTION_MODE) + .parseOptional(); if (modeName != null) { DistributionMode mode = DistributionMode.fromName(modeName); @@ -208,28 +230,34 @@ private DistributionMode adjustWriteDistributionMode(DistributionMode mode) { } public DistributionMode deleteDistributionMode() { - String deleteModeName = confParser.stringConf() - .option(SparkWriteOptions.DISTRIBUTION_MODE) - .tableProperty(TableProperties.DELETE_DISTRIBUTION_MODE) - .defaultValue(TableProperties.WRITE_DISTRIBUTION_MODE_HASH) - .parse(); + String deleteModeName = + confParser + .stringConf() + .option(SparkWriteOptions.DISTRIBUTION_MODE) + .tableProperty(TableProperties.DELETE_DISTRIBUTION_MODE) + .defaultValue(TableProperties.WRITE_DISTRIBUTION_MODE_HASH) + .parse(); return DistributionMode.fromName(deleteModeName); } public DistributionMode updateDistributionMode() { - String updateModeName = confParser.stringConf() - .option(SparkWriteOptions.DISTRIBUTION_MODE) - .tableProperty(TableProperties.UPDATE_DISTRIBUTION_MODE) - .defaultValue(TableProperties.WRITE_DISTRIBUTION_MODE_HASH) - .parse(); + String updateModeName = + confParser + .stringConf() + .option(SparkWriteOptions.DISTRIBUTION_MODE) + .tableProperty(TableProperties.UPDATE_DISTRIBUTION_MODE) + .defaultValue(TableProperties.WRITE_DISTRIBUTION_MODE_HASH) + .parse(); return DistributionMode.fromName(updateModeName); } public DistributionMode copyOnWriteMergeDistributionMode() { - String mergeModeName = confParser.stringConf() - .option(SparkWriteOptions.DISTRIBUTION_MODE) - .tableProperty(TableProperties.MERGE_DISTRIBUTION_MODE) - .parseOptional(); + String mergeModeName = + confParser + .stringConf() + .option(SparkWriteOptions.DISTRIBUTION_MODE) + .tableProperty(TableProperties.MERGE_DISTRIBUTION_MODE) + .parseOptional(); if (mergeModeName != null) { DistributionMode mergeMode = DistributionMode.fromName(mergeModeName); @@ -240,30 +268,33 @@ public DistributionMode copyOnWriteMergeDistributionMode() { } public DistributionMode positionDeltaMergeDistributionMode() { - String 
mergeModeName = confParser.stringConf() - .option(SparkWriteOptions.DISTRIBUTION_MODE) - .tableProperty(TableProperties.MERGE_DISTRIBUTION_MODE) - .parseOptional(); + String mergeModeName = + confParser + .stringConf() + .option(SparkWriteOptions.DISTRIBUTION_MODE) + .tableProperty(TableProperties.MERGE_DISTRIBUTION_MODE) + .parseOptional(); return mergeModeName != null ? DistributionMode.fromName(mergeModeName) : distributionMode(); } public boolean useTableDistributionAndOrdering() { - return confParser.booleanConf() + return confParser + .booleanConf() .option(SparkWriteOptions.USE_TABLE_DISTRIBUTION_AND_ORDERING) .defaultValue(SparkWriteOptions.USE_TABLE_DISTRIBUTION_AND_ORDERING_DEFAULT) .parse(); } public Long validateFromSnapshotId() { - return confParser.longConf() + return confParser + .longConf() .option(SparkWriteOptions.VALIDATE_FROM_SNAPSHOT_ID) .parseOptional(); } public IsolationLevel isolationLevel() { - String isolationLevelName = confParser.stringConf() - .option(SparkWriteOptions.ISOLATION_LEVEL) - .parseOptional(); + String isolationLevelName = + confParser.stringConf().option(SparkWriteOptions.ISOLATION_LEVEL).parseOptional(); return isolationLevelName != null ? IsolationLevel.fromName(isolationLevelName) : null; } } diff --git a/spark/v3.2/spark/src/main/java/org/apache/iceberg/spark/SparkWriteOptions.java b/spark/v3.2/spark/src/main/java/org/apache/iceberg/spark/SparkWriteOptions.java index 72de545f1298..6f4649642c57 100644 --- a/spark/v3.2/spark/src/main/java/org/apache/iceberg/spark/SparkWriteOptions.java +++ b/spark/v3.2/spark/src/main/java/org/apache/iceberg/spark/SparkWriteOptions.java @@ -16,16 +16,12 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark; -/** - * Spark DF write options - */ +/** Spark DF write options */ public class SparkWriteOptions { - private SparkWriteOptions() { - } + private SparkWriteOptions() {} // Fileformat for write operations(default: Table write.format.default ) public static final String WRITE_FORMAT = "write-format"; @@ -58,15 +54,18 @@ private SparkWriteOptions() { public static final String REWRITTEN_FILE_SCAN_TASK_SET_ID = "rewritten-file-scan-task-set-id"; // Controls whether to allow writing timestamps without zone info - public static final String HANDLE_TIMESTAMP_WITHOUT_TIMEZONE = "handle-timestamp-without-timezone"; + public static final String HANDLE_TIMESTAMP_WITHOUT_TIMEZONE = + "handle-timestamp-without-timezone"; public static final String OVERWRITE_MODE = "overwrite-mode"; // Overrides the default distribution mode for a write operation public static final String DISTRIBUTION_MODE = "distribution-mode"; - // Controls whether to take into account the table distribution and sort order during a write operation - public static final String USE_TABLE_DISTRIBUTION_AND_ORDERING = "use-table-distribution-and-ordering"; + // Controls whether to take into account the table distribution and sort order during a write + // operation + public static final String USE_TABLE_DISTRIBUTION_AND_ORDERING = + "use-table-distribution-and-ordering"; public static final boolean USE_TABLE_DISTRIBUTION_AND_ORDERING_DEFAULT = true; public static final String MERGE_SCHEMA = "merge-schema"; diff --git a/spark/v3.2/spark/src/main/java/org/apache/iceberg/spark/TypeToSparkType.java b/spark/v3.2/spark/src/main/java/org/apache/iceberg/spark/TypeToSparkType.java index b12d3e5030b7..4d4ec6782c72 100644 --- a/spark/v3.2/spark/src/main/java/org/apache/iceberg/spark/TypeToSparkType.java 
+++ b/spark/v3.2/spark/src/main/java/org/apache/iceberg/spark/TypeToSparkType.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark; import java.util.List; @@ -45,8 +44,7 @@ import org.apache.spark.sql.types.TimestampType$; class TypeToSparkType extends TypeUtil.SchemaVisitor { - TypeToSparkType() { - } + TypeToSparkType() {} public static final String METADATA_COL_ATTR_KEY = "__metadata_col"; @@ -105,8 +103,7 @@ public DataType primitive(Type.PrimitiveType primitive) { case DATE: return DateType$.MODULE$; case TIME: - throw new UnsupportedOperationException( - "Spark does not support time fields"); + throw new UnsupportedOperationException("Spark does not support time fields"); case TIMESTAMP: return TimestampType$.MODULE$; case STRING: @@ -129,9 +126,7 @@ public DataType primitive(Type.PrimitiveType primitive) { private Metadata fieldMetadata(int fieldId) { if (MetadataColumns.metadataFieldIds().contains(fieldId)) { - return new MetadataBuilder() - .putBoolean(METADATA_COL_ATTR_KEY, true) - .build(); + return new MetadataBuilder().putBoolean(METADATA_COL_ATTR_KEY, true).build(); } return Metadata.empty(); diff --git a/spark/v3.2/spark/src/main/java/org/apache/iceberg/spark/actions/BaseDeleteOrphanFilesSparkAction.java b/spark/v3.2/spark/src/main/java/org/apache/iceberg/spark/actions/BaseDeleteOrphanFilesSparkAction.java index 7cbcf164d478..ec871e149406 100644 --- a/spark/v3.2/spark/src/main/java/org/apache/iceberg/spark/actions/BaseDeleteOrphanFilesSparkAction.java +++ b/spark/v3.2/spark/src/main/java/org/apache/iceberg/spark/actions/BaseDeleteOrphanFilesSparkAction.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.actions; import java.util.concurrent.ExecutorService; @@ -29,8 +28,8 @@ /** * An action to delete orphan files. * - * @deprecated since 0.14.0, will be removed in 1.0.0; - * use {@link SparkActions} and {@link DeleteOrphanFilesSparkAction} instead. + * @deprecated since 0.14.0, will be removed in 1.0.0; use {@link SparkActions} and {@link + * DeleteOrphanFilesSparkAction} instead. */ @Deprecated public class BaseDeleteOrphanFilesSparkAction extends DeleteOrphanFilesSparkAction { diff --git a/spark/v3.2/spark/src/main/java/org/apache/iceberg/spark/actions/BaseDeleteReachableFilesSparkAction.java b/spark/v3.2/spark/src/main/java/org/apache/iceberg/spark/actions/BaseDeleteReachableFilesSparkAction.java index 0c69e4e1e614..42bd2f644da6 100644 --- a/spark/v3.2/spark/src/main/java/org/apache/iceberg/spark/actions/BaseDeleteReachableFilesSparkAction.java +++ b/spark/v3.2/spark/src/main/java/org/apache/iceberg/spark/actions/BaseDeleteReachableFilesSparkAction.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.actions; import org.apache.spark.sql.SparkSession; @@ -24,8 +23,8 @@ /** * An action that deletes reachable files from a given metadata file. * - * @deprecated since 0.14.0, will be removed in 1.0.0; - * use {@link SparkActions} and {@link DeleteReachableFilesSparkAction} instead. + * @deprecated since 0.14.0, will be removed in 1.0.0; use {@link SparkActions} and {@link + * DeleteReachableFilesSparkAction} instead. 
*/ @Deprecated public class BaseDeleteReachableFilesSparkAction extends DeleteReachableFilesSparkAction { diff --git a/spark/v3.2/spark/src/main/java/org/apache/iceberg/spark/actions/BaseExpireSnapshotsSparkAction.java b/spark/v3.2/spark/src/main/java/org/apache/iceberg/spark/actions/BaseExpireSnapshotsSparkAction.java index cc70279f3970..6efa8385aa37 100644 --- a/spark/v3.2/spark/src/main/java/org/apache/iceberg/spark/actions/BaseExpireSnapshotsSparkAction.java +++ b/spark/v3.2/spark/src/main/java/org/apache/iceberg/spark/actions/BaseExpireSnapshotsSparkAction.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.actions; import java.util.concurrent.ExecutorService; @@ -27,8 +26,8 @@ /** * An action to expire snapshots. * - * @deprecated since 0.14.0, will be removed in 1.0.0; - * use {@link SparkActions} and {@link ExpireSnapshotsSparkAction} instead. + * @deprecated since 0.14.0, will be removed in 1.0.0; use {@link SparkActions} and {@link + * ExpireSnapshotsSparkAction} instead. */ @Deprecated public class BaseExpireSnapshotsSparkAction extends ExpireSnapshotsSparkAction { diff --git a/spark/v3.2/spark/src/main/java/org/apache/iceberg/spark/actions/BaseMigrateTableSparkAction.java b/spark/v3.2/spark/src/main/java/org/apache/iceberg/spark/actions/BaseMigrateTableSparkAction.java index 78a6fa20f529..de355f9b025c 100644 --- a/spark/v3.2/spark/src/main/java/org/apache/iceberg/spark/actions/BaseMigrateTableSparkAction.java +++ b/spark/v3.2/spark/src/main/java/org/apache/iceberg/spark/actions/BaseMigrateTableSparkAction.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.actions; import org.apache.spark.sql.SparkSession; @@ -26,12 +25,13 @@ /** * An action to migrate a table to Iceberg. * - * @deprecated since 0.14.0, will be removed in 1.0.0; - * use {@link SparkActions} and {@link MigrateTableSparkAction} instead. + * @deprecated since 0.14.0, will be removed in 1.0.0; use {@link SparkActions} and {@link + * MigrateTableSparkAction} instead. */ @Deprecated public class BaseMigrateTableSparkAction extends MigrateTableSparkAction { - public BaseMigrateTableSparkAction(SparkSession spark, CatalogPlugin sourceCatalog, Identifier sourceTableIdent) { + public BaseMigrateTableSparkAction( + SparkSession spark, CatalogPlugin sourceCatalog, Identifier sourceTableIdent) { super(spark, sourceCatalog, sourceTableIdent); } } diff --git a/spark/v3.2/spark/src/main/java/org/apache/iceberg/spark/actions/BaseRewriteDataFilesSparkAction.java b/spark/v3.2/spark/src/main/java/org/apache/iceberg/spark/actions/BaseRewriteDataFilesSparkAction.java index d23a367fbd91..a18910121ade 100644 --- a/spark/v3.2/spark/src/main/java/org/apache/iceberg/spark/actions/BaseRewriteDataFilesSparkAction.java +++ b/spark/v3.2/spark/src/main/java/org/apache/iceberg/spark/actions/BaseRewriteDataFilesSparkAction.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.actions; import org.apache.iceberg.Table; @@ -25,8 +24,8 @@ /** * An action to rewrite data files. * - * @deprecated since 0.14.0, will be removed in 1.0.0; - * use {@link SparkActions} and {@link RewriteDataFilesSparkAction} instead. + * @deprecated since 0.14.0, will be removed in 1.0.0; use {@link SparkActions} and {@link + * RewriteDataFilesSparkAction} instead. 
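// Illustrative sketch (not part of this patch): the deprecation notes above all point at the
// SparkActions entry point; `spark` and `table` are assumed to be an existing SparkSession and
// Iceberg Table, and the 7-day retention is an arbitrary example value.

import java.util.concurrent.TimeUnit;
import org.apache.iceberg.spark.actions.SparkActions;

SparkActions.get(spark)
    .expireSnapshots(table)
    .expireOlderThan(System.currentTimeMillis() - TimeUnit.DAYS.toMillis(7))
    .execute();

SparkActions.get(spark).rewriteDataFiles(table).binPack().execute();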
*/ @Deprecated public class BaseRewriteDataFilesSparkAction extends RewriteDataFilesSparkAction { diff --git a/spark/v3.2/spark/src/main/java/org/apache/iceberg/spark/actions/BaseRewriteManifestsSparkAction.java b/spark/v3.2/spark/src/main/java/org/apache/iceberg/spark/actions/BaseRewriteManifestsSparkAction.java index d570397fc509..15378507d2d9 100644 --- a/spark/v3.2/spark/src/main/java/org/apache/iceberg/spark/actions/BaseRewriteManifestsSparkAction.java +++ b/spark/v3.2/spark/src/main/java/org/apache/iceberg/spark/actions/BaseRewriteManifestsSparkAction.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.actions; import org.apache.iceberg.Table; @@ -25,8 +24,8 @@ /** * An action to rewrite manifests. * - * @deprecated since 0.14.0, will be removed in 1.0.0; - * use {@link SparkActions} and {@link RewriteManifestsSparkAction} instead. + * @deprecated since 0.14.0, will be removed in 1.0.0; use {@link SparkActions} and {@link + * RewriteManifestsSparkAction} instead. */ @Deprecated public class BaseRewriteManifestsSparkAction extends RewriteManifestsSparkAction { diff --git a/spark/v3.2/spark/src/main/java/org/apache/iceberg/spark/actions/BaseSnapshotTableSparkAction.java b/spark/v3.2/spark/src/main/java/org/apache/iceberg/spark/actions/BaseSnapshotTableSparkAction.java index 3305401e5308..a8c7fb3d9daf 100644 --- a/spark/v3.2/spark/src/main/java/org/apache/iceberg/spark/actions/BaseSnapshotTableSparkAction.java +++ b/spark/v3.2/spark/src/main/java/org/apache/iceberg/spark/actions/BaseSnapshotTableSparkAction.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.actions; import org.apache.spark.sql.SparkSession; @@ -26,12 +25,13 @@ /** * An action to snapshot a table as an Iceberg table. * - * @deprecated since 0.14.0, will be removed in 1.0.0; - * use {@link SparkActions} and {@link SnapshotTableSparkAction} instead. + * @deprecated since 0.14.0, will be removed in 1.0.0; use {@link SparkActions} and {@link + * SnapshotTableSparkAction} instead. */ @Deprecated public class BaseSnapshotTableSparkAction extends SnapshotTableSparkAction { - BaseSnapshotTableSparkAction(SparkSession spark, CatalogPlugin sourceCatalog, Identifier sourceTableIdent) { + BaseSnapshotTableSparkAction( + SparkSession spark, CatalogPlugin sourceCatalog, Identifier sourceTableIdent) { super(spark, sourceCatalog, sourceTableIdent); } } diff --git a/spark/v3.2/spark/src/main/java/org/apache/iceberg/spark/actions/BaseSnapshotUpdateSparkAction.java b/spark/v3.2/spark/src/main/java/org/apache/iceberg/spark/actions/BaseSnapshotUpdateSparkAction.java index 7ed17d75dd8a..77debe1e589d 100644 --- a/spark/v3.2/spark/src/main/java/org/apache/iceberg/spark/actions/BaseSnapshotUpdateSparkAction.java +++ b/spark/v3.2/spark/src/main/java/org/apache/iceberg/spark/actions/BaseSnapshotUpdateSparkAction.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
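// Illustrative sketch (not part of this patch) of the replacement entry points for the manifest
// rewrite and table snapshot actions deprecated above; the table identifiers are assumptions.

import org.apache.iceberg.spark.actions.SparkActions;

SparkActions.get(spark).rewriteManifests(table).execute();

SparkActions.get(spark).snapshotTable("spark_catalog.db.src").as("db.src_snapshot").execute();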
*/ - package org.apache.iceberg.spark.actions; import java.util.Map; diff --git a/spark/v3.2/spark/src/main/java/org/apache/iceberg/spark/actions/BaseSparkAction.java b/spark/v3.2/spark/src/main/java/org/apache/iceberg/spark/actions/BaseSparkAction.java index 8b9821f40c52..acfdeb326416 100644 --- a/spark/v3.2/spark/src/main/java/org/apache/iceberg/spark/actions/BaseSparkAction.java +++ b/spark/v3.2/spark/src/main/java/org/apache/iceberg/spark/actions/BaseSparkAction.java @@ -16,9 +16,12 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.actions; +import static org.apache.iceberg.MetadataTableType.ALL_MANIFESTS; +import static org.apache.spark.sql.functions.col; +import static org.apache.spark.sql.functions.lit; + import java.util.Iterator; import java.util.List; import java.util.Map; @@ -54,10 +57,6 @@ import org.apache.spark.sql.SparkSession; import scala.Tuple2; -import static org.apache.iceberg.MetadataTableType.ALL_MANIFESTS; -import static org.apache.spark.sql.functions.col; -import static org.apache.spark.sql.functions.lit; - abstract class BaseSparkAction { protected static final String CONTENT_FILE = "Content File"; @@ -127,21 +126,28 @@ protected Table newStaticTable(TableMetadata metadata, FileIO io) { // builds a DF of delete and data file path and type by reading all manifests protected Dataset buildValidContentFileWithTypeDF(Table table) { - Broadcast
    tableBroadcast = sparkContext.broadcast(SerializableTableWithSize.copyOf(table)); - - Dataset allManifests = loadMetadataTable(table, ALL_MANIFESTS) - .selectExpr( - "content", - "path", - "length", - "partition_spec_id as partitionSpecId", - "added_snapshot_id as addedSnapshotId") - .dropDuplicates("path") - .repartition(spark.sessionState().conf().numShufflePartitions()) // avoid adaptive execution combining tasks - .as(Encoders.bean(ManifestFileBean.class)); + Broadcast
    tableBroadcast = + sparkContext.broadcast(SerializableTableWithSize.copyOf(table)); + + Dataset allManifests = + loadMetadataTable(table, ALL_MANIFESTS) + .selectExpr( + "content", + "path", + "length", + "partition_spec_id as partitionSpecId", + "added_snapshot_id as addedSnapshotId") + .dropDuplicates("path") + .repartition( + spark + .sessionState() + .conf() + .numShufflePartitions()) // avoid adaptive execution combining tasks + .as(Encoders.bean(ManifestFileBean.class)); return allManifests - .flatMap(new ReadManifest(tableBroadcast), Encoders.tuple(Encoders.STRING(), Encoders.STRING())) + .flatMap( + new ReadManifest(tableBroadcast), Encoders.tuple(Encoders.STRING(), Encoders.STRING())) .toDF(FILE_PATH, FILE_TYPE); } @@ -160,16 +166,20 @@ protected Dataset buildManifestListDF(Table table) { } protected Dataset buildOtherMetadataFileDF(Table table) { - return buildOtherMetadataFileDF(table, false /* include all reachable previous metadata locations */); + return buildOtherMetadataFileDF( + table, false /* include all reachable previous metadata locations */); } protected Dataset buildAllReachableOtherMetadataFileDF(Table table) { - return buildOtherMetadataFileDF(table, true /* include all reachable previous metadata locations */); + return buildOtherMetadataFileDF( + table, true /* include all reachable previous metadata locations */); } - private Dataset buildOtherMetadataFileDF(Table table, boolean includePreviousMetadataLocations) { + private Dataset buildOtherMetadataFileDF( + Table table, boolean includePreviousMetadataLocations) { List otherMetadataFiles = Lists.newArrayList(); - otherMetadataFiles.addAll(ReachableFileUtil.metadataFileLocations(table, includePreviousMetadataLocations)); + otherMetadataFiles.addAll( + ReachableFileUtil.metadataFileLocations(table, includePreviousMetadataLocations)); otherMetadataFiles.add(ReachableFileUtil.versionHintLocation(table)); return spark.createDataset(otherMetadataFiles, Encoders.STRING()).toDF(FILE_PATH); } @@ -190,7 +200,8 @@ protected Dataset loadMetadataTable(Table table, MetadataTableType type) { return SparkTableUtil.loadMetadataTable(spark, table, type); } - private static class ReadManifest implements FlatMapFunction> { + private static class ReadManifest + implements FlatMapFunction> { private final Broadcast
<Table> table; ReadManifest(Broadcast<Table>
    table) { @@ -205,7 +216,8 @@ public Iterator> call(ManifestFileBean manifest) { public CloseableIterator> entries(ManifestFileBean manifest) { FileIO io = table.getValue().io(); Map specs = table.getValue().specs(); - ImmutableList projection = ImmutableList.of(DataFile.FILE_PATH.name(), DataFile.CONTENT.name()); + ImmutableList projection = + ImmutableList.of(DataFile.FILE_PATH.name(), DataFile.CONTENT.name()); switch (manifest.content()) { case DATA: @@ -217,7 +229,8 @@ public CloseableIterator> entries(ManifestFileBean manife ManifestFiles.readDeleteManifest(manifest, io, specs).select(projection).iterator(), ReadManifest::contentFileWithType); default: - throw new IllegalArgumentException("Unsupported manifest content type:" + manifest.content()); + throw new IllegalArgumentException( + "Unsupported manifest content type:" + manifest.content()); } } diff --git a/spark/v3.2/spark/src/main/java/org/apache/iceberg/spark/actions/BaseTableCreationSparkAction.java b/spark/v3.2/spark/src/main/java/org/apache/iceberg/spark/actions/BaseTableCreationSparkAction.java index e3ddd7abc910..9639b205ac59 100644 --- a/spark/v3.2/spark/src/main/java/org/apache/iceberg/spark/actions/BaseTableCreationSparkAction.java +++ b/spark/v3.2/spark/src/main/java/org/apache/iceberg/spark/actions/BaseTableCreationSparkAction.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.actions; import java.util.List; @@ -50,7 +49,8 @@ import org.apache.spark.sql.types.StructType; abstract class BaseTableCreationSparkAction extends BaseSparkAction { - private static final Set ALLOWED_SOURCES = ImmutableSet.of("parquet", "avro", "orc", "hive"); + private static final Set ALLOWED_SOURCES = + ImmutableSet.of("parquet", "avro", "orc", "hive"); protected static final String LOCATION = "location"; protected static final String ICEBERG_METADATA_FOLDER = "metadata"; protected static final List EXCLUDED_PROPERTIES = @@ -66,7 +66,8 @@ abstract class BaseTableCreationSparkAction extends BaseSparkAction additionalProperties = Maps.newHashMap(); - BaseTableCreationSparkAction(SparkSession spark, CatalogPlugin sourceCatalog, Identifier sourceTableIdent) { + BaseTableCreationSparkAction( + SparkSession spark, CatalogPlugin sourceCatalog, Identifier sourceTableIdent) { super(spark); this.sourceCatalog = checkSourceCatalog(sourceCatalog); @@ -78,12 +79,13 @@ abstract class BaseTableCreationSparkAction extends BaseSparkAction additionalProperties() { private void validateSourceTable() { String sourceTableProvider = sourceCatalogTable.provider().get().toLowerCase(Locale.ROOT); - Preconditions.checkArgument(ALLOWED_SOURCES.contains(sourceTableProvider), - "Cannot create an Iceberg table from source provider: '%s'", sourceTableProvider); - Preconditions.checkArgument(!sourceCatalogTable.storage().locationUri().isEmpty(), + Preconditions.checkArgument( + ALLOWED_SOURCES.contains(sourceTableProvider), + "Cannot create an Iceberg table from source provider: '%s'", + sourceTableProvider); + Preconditions.checkArgument( + !sourceCatalogTable.storage().locationUri().isEmpty(), "Cannot create an Iceberg table from a source without an explicit location"); } protected StagingTableCatalog checkDestinationCatalog(CatalogPlugin catalog) { - Preconditions.checkArgument(catalog instanceof SparkSessionCatalog || catalog instanceof SparkCatalog, - "Cannot create Iceberg table in non-Iceberg Catalog. 
" + - "Catalog '%s' was of class '%s' but '%s' or '%s' are required", - catalog.name(), catalog.getClass().getName(), SparkSessionCatalog.class.getName(), + Preconditions.checkArgument( + catalog instanceof SparkSessionCatalog || catalog instanceof SparkCatalog, + "Cannot create Iceberg table in non-Iceberg Catalog. " + + "Catalog '%s' was of class '%s' but '%s' or '%s' are required", + catalog.name(), + catalog.getClass().getName(), + SparkSessionCatalog.class.getName(), SparkCatalog.class.getName()); return (StagingTableCatalog) catalog; @@ -145,11 +153,14 @@ protected StagedSparkTable stageDestTable() { Map props = destTableProps(); StructType schema = sourceTable.schema(); Transform[] partitioning = sourceTable.partitioning(); - return (StagedSparkTable) destCatalog().stageCreate(destTableIdent(), schema, partitioning, props); + return (StagedSparkTable) + destCatalog().stageCreate(destTableIdent(), schema, partitioning, props); } catch (org.apache.spark.sql.catalyst.analysis.NoSuchNamespaceException e) { - throw new NoSuchNamespaceException("Cannot create table %s as the namespace does not exist", destTableIdent()); + throw new NoSuchNamespaceException( + "Cannot create table %s as the namespace does not exist", destTableIdent()); } catch (org.apache.spark.sql.catalyst.analysis.TableAlreadyExistsException e) { - throw new AlreadyExistsException("Cannot create table %s as it already exists", destTableIdent()); + throw new AlreadyExistsException( + "Cannot create table %s as it already exists", destTableIdent()); } } @@ -162,7 +173,10 @@ protected void ensureNameMappingPresent(Table table) { } protected String getMetadataLocation(Table table) { - return table.properties().getOrDefault(TableProperties.WRITE_METADATA_LOCATION, - table.location() + "/" + ICEBERG_METADATA_FOLDER); + return table + .properties() + .getOrDefault( + TableProperties.WRITE_METADATA_LOCATION, + table.location() + "/" + ICEBERG_METADATA_FOLDER); } } diff --git a/spark/v3.2/spark/src/main/java/org/apache/iceberg/spark/actions/DeleteOrphanFilesSparkAction.java b/spark/v3.2/spark/src/main/java/org/apache/iceberg/spark/actions/DeleteOrphanFilesSparkAction.java index 9c0cebc57aab..7df3eaf94a09 100644 --- a/spark/v3.2/spark/src/main/java/org/apache/iceberg/spark/actions/DeleteOrphanFilesSparkAction.java +++ b/spark/v3.2/spark/src/main/java/org/apache/iceberg/spark/actions/DeleteOrphanFilesSparkAction.java @@ -16,9 +16,11 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.actions; +import static org.apache.iceberg.TableProperties.GC_ENABLED; +import static org.apache.iceberg.TableProperties.GC_ENABLED_DEFAULT; + import java.io.IOException; import java.io.Serializable; import java.net.URI; @@ -76,30 +78,29 @@ import org.slf4j.LoggerFactory; import scala.Tuple2; -import static org.apache.iceberg.TableProperties.GC_ENABLED; -import static org.apache.iceberg.TableProperties.GC_ENABLED_DEFAULT; - /** - * An action that removes orphan metadata, data and delete files by listing a given location and comparing - * the actual files in that location with content and metadata files referenced by all valid snapshots. - * The location must be accessible for listing via the Hadoop {@link FileSystem}. - *
    - * By default, this action cleans up the table location returned by {@link Table#location()} and - * removes unreachable files that are older than 3 days using {@link Table#io()}. The behavior can be modified - * by passing a custom location to {@link #location} and a custom timestamp to {@link #olderThan(long)}. - * For example, someone might point this action to the data folder to clean up only orphan data files. - *
- * Configure an alternative delete method using {@link #deleteWith(Consumer)}. - *
    - * For full control of the set of files being evaluated, use the {@link #compareToFileList(Dataset)} argument. This - * skips the directory listing - any files in the dataset provided which are not found in table metadata will - * be deleted, using the same {@link Table#location()} and {@link #olderThan(long)} filtering as above. - *
    - * Note: It is dangerous to call this action with a short retention interval as it might corrupt - * the state of the table if another operation is writing at the same time. + * An action that removes orphan metadata, data and delete files by listing a given location and + * comparing the actual files in that location with content and metadata files referenced by all + * valid snapshots. The location must be accessible for listing via the Hadoop {@link FileSystem}. + * + *
    By default, this action cleans up the table location returned by {@link Table#location()} and + * removes unreachable files that are older than 3 days using {@link Table#io()}. The behavior can + * be modified by passing a custom location to {@link #location} and a custom timestamp to {@link + * #olderThan(long)}. For example, someone might point this action to the data folder to clean up + * only orphan data files. + * + *
Configure an alternative delete method using {@link #deleteWith(Consumer)}. + * + *
    For full control of the set of files being evaluated, use the {@link + * #compareToFileList(Dataset)} argument. This skips the directory listing - any files in the + * dataset provided which are not found in table metadata will be deleted, using the same {@link + * Table#location()} and {@link #olderThan(long)} filtering as above. + * + *
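// Illustrative sketch (not part of this patch): driving the orphan-file cleanup described above
// through SparkActions; `spark` and `table` are assumed, and the 3-day window mirrors the
// documented default rather than a recommendation.

import java.util.concurrent.TimeUnit;
import org.apache.iceberg.spark.actions.SparkActions;

SparkActions.get(spark)
    .deleteOrphanFiles(table)
    .olderThan(System.currentTimeMillis() - TimeUnit.DAYS.toMillis(3))
    .location(table.location() + "/data")
    .execute();
// equalSchemes(...), equalAuthorities(...) and prefixMismatchMode(...) on this action help when
// listed paths and metadata paths differ only in scheme or authority, as described below.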
    Note: It is dangerous to call this action with a short retention interval as it might + * corrupt the state of the table if another operation is writing at the same time. */ -public class DeleteOrphanFilesSparkAction - extends BaseSparkAction implements DeleteOrphanFiles { +public class DeleteOrphanFilesSparkAction extends BaseSparkAction + implements DeleteOrphanFiles { private static final Logger LOG = LoggerFactory.getLogger(DeleteOrphanFilesSparkAction.class); private static final Splitter COMMA = Splitter.on(","); @@ -108,12 +109,13 @@ public class DeleteOrphanFilesSparkAction private final SerializableConfiguration hadoopConf; private final int partitionDiscoveryParallelism; private final Table table; - private final Consumer defaultDelete = new Consumer() { - @Override - public void accept(String file) { - table.io().deleteFile(file); - } - }; + private final Consumer defaultDelete = + new Consumer() { + @Override + public void accept(String file) { + table.io().deleteFile(file); + } + }; private Map equalSchemes = flattenMap(EQUAL_SCHEMES_DEFAULT); private Map equalAuthorities = Collections.emptyMap(); @@ -128,7 +130,8 @@ public void accept(String file) { super(spark); this.hadoopConf = new SerializableConfiguration(spark.sessionState().newHadoopConf()); - this.partitionDiscoveryParallelism = spark.sessionState().conf().parallelPartitionDiscoveryParallelism(); + this.partitionDiscoveryParallelism = + spark.sessionState().conf().parallelPartitionDiscoveryParallelism(); this.table = table; this.location = table.location(); @@ -230,17 +233,20 @@ private String jobDesc() { if (location != null) { options.add("location=" + location); } - return String.format("Deleting orphan files (%s) from %s", Joiner.on(',').join(options), table.name()); + return String.format( + "Deleting orphan files (%s) from %s", Joiner.on(',').join(options), table.name()); } private DeleteOrphanFiles.Result doExecute() { Dataset validContentFileDF = buildValidContentFileDF(table); Dataset validMetadataFileDF = buildValidMetadataFileDF(table); Dataset validFileDF = validContentFileDF.union(validMetadataFileDF); - Dataset actualFileDF = compareToFileList == null ? buildActualFileDF() : filteredCompareToFileList(); + Dataset actualFileDF = + compareToFileList == null ? 
buildActualFileDF() : filteredCompareToFileList(); - List orphanFiles = findOrphanFiles(spark(), actualFileDF, validFileDF, - equalSchemes, equalAuthorities, prefixMismatchMode); + List orphanFiles = + findOrphanFiles( + spark(), actualFileDF, validFileDF, equalSchemes, equalAuthorities, prefixMismatchMode); Tasks.foreach(orphanFiles) .noRetry() @@ -260,7 +266,8 @@ private Dataset buildActualFileDF() { PathFilter pathFilter = PartitionAwareHiddenPathFilter.forSpecs(table.specs()); // list at most 3 levels and only dirs that have less than 10 direct sub dirs on the driver - listDirRecursively(location, predicate, hadoopConf.value(), 3, 10, subDirs, pathFilter, matchingFiles); + listDirRecursively( + location, predicate, hadoopConf.value(), 3, 10, subDirs, pathFilter, matchingFiles); JavaRDD matchingFileRDD = sparkContext().parallelize(matchingFiles, 1); @@ -272,17 +279,22 @@ private Dataset buildActualFileDF() { JavaRDD subDirRDD = sparkContext().parallelize(subDirs, parallelism); Broadcast conf = sparkContext().broadcast(hadoopConf); - JavaRDD matchingLeafFileRDD = subDirRDD.mapPartitions( - listDirsRecursively(conf, olderThanTimestamp, pathFilter) - ); + JavaRDD matchingLeafFileRDD = + subDirRDD.mapPartitions(listDirsRecursively(conf, olderThanTimestamp, pathFilter)); JavaRDD completeMatchingFileRDD = matchingFileRDD.union(matchingLeafFileRDD); return spark().createDataset(completeMatchingFileRDD.rdd(), Encoders.STRING()).toDF(FILE_PATH); } private static void listDirRecursively( - String dir, Predicate predicate, Configuration conf, int maxDepth, - int maxDirectSubDirs, List remainingSubDirs, PathFilter pathFilter, List matchingFiles) { + String dir, + Predicate predicate, + Configuration conf, + int maxDepth, + int maxDirectSubDirs, + List remainingSubDirs, + PathFilter pathFilter, + List matchingFiles) { // stop listing whenever we reach the max depth if (maxDepth <= 0) { @@ -312,7 +324,14 @@ private static void listDirRecursively( for (String subDir : subDirs) { listDirRecursively( - subDir, predicate, conf, maxDepth - 1, maxDirectSubDirs, remainingSubDirs, pathFilter, matchingFiles); + subDir, + predicate, + conf, + maxDepth - 1, + maxDirectSubDirs, + remainingSubDirs, + pathFilter, + matchingFiles); } } catch (IOException e) { throw new RuntimeIOException(e); @@ -320,9 +339,7 @@ private static void listDirRecursively( } private static FlatMapFunction, String> listDirsRecursively( - Broadcast conf, - long olderThanTimestamp, - PathFilter pathFilter) { + Broadcast conf, long olderThanTimestamp, PathFilter pathFilter) { return dirs -> { List subDirs = Lists.newArrayList(); @@ -333,13 +350,22 @@ private static FlatMapFunction, String> listDirsRecursively( int maxDepth = 2000; int maxDirectSubDirs = Integer.MAX_VALUE; - dirs.forEachRemaining(dir -> { - listDirRecursively( - dir, predicate, conf.value().value(), maxDepth, maxDirectSubDirs, subDirs, pathFilter, files); - }); + dirs.forEachRemaining( + dir -> { + listDirRecursively( + dir, + predicate, + conf.value().value(), + maxDepth, + maxDirectSubDirs, + subDirs, + pathFilter, + files); + }); if (!subDirs.isEmpty()) { - throw new RuntimeException("Could not list subdirectories, reached maximum subdirectory depth: " + maxDepth); + throw new RuntimeException( + "Could not list subdirectories, reached maximum subdirectory depth: " + maxDepth); } return files.iterator(); @@ -348,34 +374,41 @@ private static FlatMapFunction, String> listDirsRecursively( @VisibleForTesting static List findOrphanFiles( - SparkSession spark, Dataset 
actualFileDF, Dataset validFileDF, - Map equalSchemes, Map equalAuthorities, + SparkSession spark, + Dataset actualFileDF, + Dataset validFileDF, + Map equalSchemes, + Map equalAuthorities, PrefixMismatchMode prefixMismatchMode) { - Dataset actualFileMetadataDS = actualFileDF.mapPartitions( - toFileMetadata(equalSchemes, equalAuthorities), - Encoders.bean(FileMetadata.class)); - Dataset validFileMetadataDS = validFileDF.mapPartitions( - toFileMetadata(equalSchemes, equalAuthorities), - Encoders.bean(FileMetadata.class)); + Dataset actualFileMetadataDS = + actualFileDF.mapPartitions( + toFileMetadata(equalSchemes, equalAuthorities), Encoders.bean(FileMetadata.class)); + Dataset validFileMetadataDS = + validFileDF.mapPartitions( + toFileMetadata(equalSchemes, equalAuthorities), Encoders.bean(FileMetadata.class)); SetAccumulator> conflicts = new SetAccumulator<>(); spark.sparkContext().register(conflicts); Column joinCond = actualFileMetadataDS.col("path").equalTo(validFileMetadataDS.col("path")); - List orphanFiles = actualFileMetadataDS.joinWith(validFileMetadataDS, joinCond, "leftouter") - .mapPartitions(findOrphanFiles(prefixMismatchMode, conflicts), Encoders.STRING()) - .collectAsList(); + List orphanFiles = + actualFileMetadataDS + .joinWith(validFileMetadataDS, joinCond, "leftouter") + .mapPartitions(findOrphanFiles(prefixMismatchMode, conflicts), Encoders.STRING()) + .collectAsList(); if (prefixMismatchMode == PrefixMismatchMode.ERROR && !conflicts.value().isEmpty()) { - throw new ValidationException("Unable to determine whether certain files are orphan. " + - "Metadata references files that match listed/provided files except for authority/scheme. " + - "Please, inspect the conflicting authorities/schemes and provide which of them are equal " + - "by further configuring the action via equalSchemes() and equalAuthorities() methods. " + - "Set the prefix mismatch mode to 'NONE' to ignore remaining locations with conflicting " + - "authorities/schemes or to 'DELETE' iff you are ABSOLUTELY confident that remaining conflicting " + - "authorities/schemes are different. It will be impossible to recover deleted files. " + - "Conflicting authorities/schemes: %s.", conflicts.value()); + throw new ValidationException( + "Unable to determine whether certain files are orphan. " + + "Metadata references files that match listed/provided files except for authority/scheme. " + + "Please, inspect the conflicting authorities/schemes and provide which of them are equal " + + "by further configuring the action via equalSchemes() and equalAuthorities() methods. " + + "Set the prefix mismatch mode to 'NONE' to ignore remaining locations with conflicting " + + "authorities/schemes or to 'DELETE' iff you are ABSOLUTELY confident that remaining conflicting " + + "authorities/schemes are different. It will be impossible to recover deleted files. 
" + + "Conflicting authorities/schemes: %s.", + conflicts.value()); } return orphanFiles; @@ -395,54 +428,62 @@ private static Map flattenMap(Map map) { } private static MapPartitionsFunction, String> findOrphanFiles( - PrefixMismatchMode mode, - SetAccumulator> conflicts) { + PrefixMismatchMode mode, SetAccumulator> conflicts) { return rows -> { - Iterator transformed = Iterators.transform(rows, row -> { - FileMetadata actual = row._1; - FileMetadata valid = row._2; - - if (valid == null) { - return actual.location; - } - - boolean schemeMatch = Strings.isNullOrEmpty(valid.scheme) || - valid.scheme.equalsIgnoreCase(actual.scheme); - boolean authorityMatch = Strings.isNullOrEmpty(valid.authority) || - valid.authority.equalsIgnoreCase(actual.authority); - - if ((!schemeMatch || !authorityMatch) && mode == PrefixMismatchMode.DELETE) { - return actual.location; - } else { - if (!schemeMatch) { - conflicts.add(Pair.of(valid.scheme, actual.scheme)); - } - if (!authorityMatch) { - conflicts.add(Pair.of(valid.authority, actual.authority)); - } - } - - return null; - }); + Iterator transformed = + Iterators.transform( + rows, + row -> { + FileMetadata actual = row._1; + FileMetadata valid = row._2; + + if (valid == null) { + return actual.location; + } + + boolean schemeMatch = + Strings.isNullOrEmpty(valid.scheme) + || valid.scheme.equalsIgnoreCase(actual.scheme); + boolean authorityMatch = + Strings.isNullOrEmpty(valid.authority) + || valid.authority.equalsIgnoreCase(actual.authority); + + if ((!schemeMatch || !authorityMatch) && mode == PrefixMismatchMode.DELETE) { + return actual.location; + } else { + if (!schemeMatch) { + conflicts.add(Pair.of(valid.scheme, actual.scheme)); + } + if (!authorityMatch) { + conflicts.add(Pair.of(valid.authority, actual.authority)); + } + } + + return null; + }); return Iterators.filter(transformed, Objects::nonNull); }; } private static MapPartitionsFunction toFileMetadata( Map equalSchemesMap, Map equalAuthoritiesMap) { - return rows -> Iterators.transform(rows, row -> { - String location = row.getString(0); - URI uri = new Path(location).toUri(); - String scheme = equalSchemesMap.getOrDefault(uri.getScheme(), uri.getScheme()); - String authority = equalAuthoritiesMap.getOrDefault(uri.getAuthority(), uri.getAuthority()); - return new FileMetadata(scheme, authority, uri.getPath(), location); - }); + return rows -> + Iterators.transform( + rows, + row -> { + String location = row.getString(0); + URI uri = new Path(location).toUri(); + String scheme = equalSchemesMap.getOrDefault(uri.getScheme(), uri.getScheme()); + String authority = + equalAuthoritiesMap.getOrDefault(uri.getAuthority(), uri.getAuthority()); + return new FileMetadata(scheme, authority, uri.getPath(), location); + }); } /** - * A {@link PathFilter} that filters out hidden path, but does not filter out paths that would be marked - * as hidden by {@link HiddenPathFilter} due to a partition field that starts with one of the characters that - * indicate a hidden path. + * A {@link PathFilter} that filters out hidden path, but does not filter out paths that would be + * marked as hidden by {@link HiddenPathFilter} due to a partition field that starts with one of + * the characters that indicate a hidden path. 
*/ @VisibleForTesting static class PartitionAwareHiddenPathFilter implements PathFilter, Serializable { @@ -455,7 +496,8 @@ static class PartitionAwareHiddenPathFilter implements PathFilter, Serializable @Override public boolean accept(Path path) { - boolean isHiddenPartitionPath = hiddenPathPartitionNames.stream().anyMatch(path.getName()::startsWith); + boolean isHiddenPartitionPath = + hiddenPathPartitionNames.stream().anyMatch(path.getName()::startsWith); return isHiddenPartitionPath || HiddenPathFilter.get().accept(path); } @@ -464,14 +506,20 @@ static PathFilter forSpecs(Map specs) { return HiddenPathFilter.get(); } - Set partitionNames = specs.values().stream() - .map(PartitionSpec::fields) - .flatMap(List::stream) - .filter(partitionField -> partitionField.name().startsWith("_") || partitionField.name().startsWith(".")) - .map(partitionField -> partitionField.name() + "=") - .collect(Collectors.toSet()); - - return partitionNames.isEmpty() ? HiddenPathFilter.get() : new PartitionAwareHiddenPathFilter(partitionNames); + Set partitionNames = + specs.values().stream() + .map(PartitionSpec::fields) + .flatMap(List::stream) + .filter( + partitionField -> + partitionField.name().startsWith("_") + || partitionField.name().startsWith(".")) + .map(partitionField -> partitionField.name() + "=") + .collect(Collectors.toSet()); + + return partitionNames.isEmpty() + ? HiddenPathFilter.get() + : new PartitionAwareHiddenPathFilter(partitionNames); } } @@ -488,8 +536,7 @@ public FileMetadata(String scheme, String authority, String path, String locatio this.location = location; } - public FileMetadata() { - } + public FileMetadata() {} public void setScheme(String scheme) { this.scheme = scheme; diff --git a/spark/v3.2/spark/src/main/java/org/apache/iceberg/spark/actions/DeleteReachableFilesSparkAction.java b/spark/v3.2/spark/src/main/java/org/apache/iceberg/spark/actions/DeleteReachableFilesSparkAction.java index e840d90c2c3a..a9828d2c7894 100644 --- a/spark/v3.2/spark/src/main/java/org/apache/iceberg/spark/actions/DeleteReachableFilesSparkAction.java +++ b/spark/v3.2/spark/src/main/java/org/apache/iceberg/spark/actions/DeleteReachableFilesSparkAction.java @@ -16,9 +16,11 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.actions; +import static org.apache.iceberg.TableProperties.GC_ENABLED; +import static org.apache.iceberg.TableProperties.GC_ENABLED_DEFAULT; + import java.util.Iterator; import java.util.concurrent.ExecutorService; import java.util.concurrent.atomic.AtomicLong; @@ -42,12 +44,9 @@ import org.slf4j.Logger; import org.slf4j.LoggerFactory; -import static org.apache.iceberg.TableProperties.GC_ENABLED; -import static org.apache.iceberg.TableProperties.GC_ENABLED_DEFAULT; - /** - * An implementation of {@link DeleteReachableFiles} that uses metadata tables in Spark - * to determine which files should be deleted. + * An implementation of {@link DeleteReachableFiles} that uses metadata tables in Spark to determine + * which files should be deleted. 
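Reviewer note on the orphan-file hunks above: toFileMetadata() collapses schemes and authorities that the caller declared equal via equalSchemes()/equalAuthorities() before paths are compared, and findOrphanFiles() only records a conflict (or, with PrefixMismatchMode.DELETE, deletes) when they still differ. A minimal, self-contained sketch of that normalization idea, using java.net.URI instead of Hadoop's Path and a hypothetical normalize() helper that is not part of the patch:

import java.net.URI;
import java.util.Map;

public class PrefixNormalizationSketch {
  // Collapse schemes/authorities the caller declared equal, mirroring toFileMetadata() above.
  static String normalize(
      String location, Map<String, String> equalSchemes, Map<String, String> equalAuthorities) {
    URI uri = URI.create(location);
    String scheme = equalSchemes.getOrDefault(uri.getScheme(), uri.getScheme());
    String authority = equalAuthorities.getOrDefault(uri.getAuthority(), uri.getAuthority());
    return scheme + "://" + authority + uri.getPath();
  }

  public static void main(String[] args) {
    Map<String, String> equalSchemes = Map.of("s3a", "s3", "s3n", "s3");
    Map<String, String> equalAuthorities = Map.of();
    String listed = "s3a://bucket/warehouse/db/table/data/00000-0-deadbeef.parquet";
    String referenced = "s3://bucket/warehouse/db/table/data/00000-0-deadbeef.parquet";
    // Declaring s3a equal to s3 makes both locations normalize to the same value,
    // so the listed file is not reported as an orphan of the referenced one.
    System.out.println(
        normalize(listed, equalSchemes, equalAuthorities)
            .equals(normalize(referenced, equalSchemes, equalAuthorities))); // true
  }
}

With PrefixMismatchMode.ERROR, any scheme/authority pair that still differs after this normalization is collected into the accumulator and surfaced in the ValidationException message shown above.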
*/ @SuppressWarnings("UnnecessaryAnonymousClass") public class DeleteReachableFilesSparkAction @@ -59,12 +58,13 @@ public class DeleteReachableFilesSparkAction private static final Logger LOG = LoggerFactory.getLogger(DeleteReachableFilesSparkAction.class); private final String metadataFileLocation; - private final Consumer defaultDelete = new Consumer() { - @Override - public void accept(String file) { - io.deleteFile(file); - } - }; + private final Consumer defaultDelete = + new Consumer() { + @Override + public void accept(String file) { + io.deleteFile(file); + } + }; private Consumer deleteFunc = defaultDelete; private ExecutorService deleteExecutorService = null; @@ -115,7 +115,8 @@ private Result doExecute() { Dataset reachableFileDF = buildReachableFileDF(metadata).distinct(); - boolean streamResults = PropertyUtil.propertyAsBoolean(options(), STREAM_RESULTS, STREAM_RESULTS_DEFAULT); + boolean streamResults = + PropertyUtil.propertyAsBoolean(options(), STREAM_RESULTS, STREAM_RESULTS_DEFAULT); if (streamResults) { return deleteFiles(reachableFileDF.toLocalIterator()); } else { @@ -144,40 +145,45 @@ private BaseDeleteReachableFilesActionResult deleteFiles(Iterator deleted) AtomicLong otherFilesCount = new AtomicLong(0L); Tasks.foreach(deleted) - .retry(3).stopRetryOn(NotFoundException.class).suppressFailureWhenFinished() + .retry(3) + .stopRetryOn(NotFoundException.class) + .suppressFailureWhenFinished() .executeWith(deleteExecutorService) - .onFailure((fileInfo, exc) -> { - String file = fileInfo.getString(0); - String type = fileInfo.getString(1); - LOG.warn("Delete failed for {}: {}", type, file, exc); - }) - .run(fileInfo -> { - String file = fileInfo.getString(0); - String type = fileInfo.getString(1); - deleteFunc.accept(file); - switch (type) { - case CONTENT_FILE: - dataFileCount.incrementAndGet(); - LOG.trace("Deleted Content File: {}", file); - break; - case MANIFEST: - manifestCount.incrementAndGet(); - LOG.debug("Deleted Manifest: {}", file); - break; - case MANIFEST_LIST: - manifestListCount.incrementAndGet(); - LOG.debug("Deleted Manifest List: {}", file); - break; - case OTHERS: - otherFilesCount.incrementAndGet(); - LOG.debug("Others: {}", file); - break; - } - }); - - long filesCount = dataFileCount.get() + manifestCount.get() + manifestListCount.get() + otherFilesCount.get(); + .onFailure( + (fileInfo, exc) -> { + String file = fileInfo.getString(0); + String type = fileInfo.getString(1); + LOG.warn("Delete failed for {}: {}", type, file, exc); + }) + .run( + fileInfo -> { + String file = fileInfo.getString(0); + String type = fileInfo.getString(1); + deleteFunc.accept(file); + switch (type) { + case CONTENT_FILE: + dataFileCount.incrementAndGet(); + LOG.trace("Deleted Content File: {}", file); + break; + case MANIFEST: + manifestCount.incrementAndGet(); + LOG.debug("Deleted Manifest: {}", file); + break; + case MANIFEST_LIST: + manifestListCount.incrementAndGet(); + LOG.debug("Deleted Manifest List: {}", file); + break; + case OTHERS: + otherFilesCount.incrementAndGet(); + LOG.debug("Others: {}", file); + break; + } + }); + + long filesCount = + dataFileCount.get() + manifestCount.get() + manifestListCount.get() + otherFilesCount.get(); LOG.info("Total files removed: {}", filesCount); - return new BaseDeleteReachableFilesActionResult(dataFileCount.get(), manifestCount.get(), manifestListCount.get(), - otherFilesCount.get()); + return new BaseDeleteReachableFilesActionResult( + dataFileCount.get(), manifestCount.get(), manifestListCount.get(), 
otherFilesCount.get()); } } diff --git a/spark/v3.2/spark/src/main/java/org/apache/iceberg/spark/actions/ExpireSnapshotsSparkAction.java b/spark/v3.2/spark/src/main/java/org/apache/iceberg/spark/actions/ExpireSnapshotsSparkAction.java index f3fc7bdc1469..c9e5f7cca785 100644 --- a/spark/v3.2/spark/src/main/java/org/apache/iceberg/spark/actions/ExpireSnapshotsSparkAction.java +++ b/spark/v3.2/spark/src/main/java/org/apache/iceberg/spark/actions/ExpireSnapshotsSparkAction.java @@ -16,9 +16,11 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.actions; +import static org.apache.iceberg.TableProperties.GC_ENABLED; +import static org.apache.iceberg.TableProperties.GC_ENABLED_DEFAULT; + import java.util.Iterator; import java.util.List; import java.util.Set; @@ -47,26 +49,24 @@ import org.slf4j.Logger; import org.slf4j.LoggerFactory; -import static org.apache.iceberg.TableProperties.GC_ENABLED; -import static org.apache.iceberg.TableProperties.GC_ENABLED_DEFAULT; - /** - * An action that performs the same operation as {@link org.apache.iceberg.ExpireSnapshots} but uses Spark - * to determine the delta in files between the pre and post-expiration table metadata. All of the same - * restrictions of {@link org.apache.iceberg.ExpireSnapshots} also apply to this action. - *

    - * This action first leverages {@link org.apache.iceberg.ExpireSnapshots} to expire snapshots and then - * uses metadata tables to find files that can be safely deleted. This is done by anti-joining two Datasets - * that contain all manifest and content files before and after the expiration. The snapshot expiration - * will be fully committed before any deletes are issued. - *

    - * This operation performs a shuffle so the parallelism can be controlled through 'spark.sql.shuffle.partitions'. - *

    - * Deletes are still performed locally after retrieving the results from the Spark executors. + * An action that performs the same operation as {@link org.apache.iceberg.ExpireSnapshots} but uses + * Spark to determine the delta in files between the pre and post-expiration table metadata. All of + * the same restrictions of {@link org.apache.iceberg.ExpireSnapshots} also apply to this action. + * + *

    This action first leverages {@link org.apache.iceberg.ExpireSnapshots} to expire snapshots and + * then uses metadata tables to find files that can be safely deleted. This is done by anti-joining + * two Datasets that contain all manifest and content files before and after the expiration. The + * snapshot expiration will be fully committed before any deletes are issued. + * + *

    This operation performs a shuffle so the parallelism can be controlled through + * 'spark.sql.shuffle.partitions'. + * + *

    Deletes are still performed locally after retrieving the results from the Spark executors. */ @SuppressWarnings("UnnecessaryAnonymousClass") -public class ExpireSnapshotsSparkAction - extends BaseSparkAction implements ExpireSnapshots { +public class ExpireSnapshotsSparkAction extends BaseSparkAction + implements ExpireSnapshots { public static final String STREAM_RESULTS = "stream-results"; public static final boolean STREAM_RESULTS_DEFAULT = false; @@ -75,12 +75,13 @@ public class ExpireSnapshotsSparkAction private final Table table; private final TableOperations ops; - private final Consumer defaultDelete = new Consumer() { - @Override - public void accept(String file) { - ops.io().deleteFile(file); - } - }; + private final Consumer defaultDelete = + new Consumer() { + @Override + public void accept(String file) { + ops.io().deleteFile(file); + } + }; private final Set expiredSnapshotIds = Sets.newHashSet(); private Long expireOlderThanValue = null; @@ -124,8 +125,10 @@ public ExpireSnapshotsSparkAction expireOlderThan(long timestampMillis) { @Override public ExpireSnapshotsSparkAction retainLast(int numSnapshots) { - Preconditions.checkArgument(1 <= numSnapshots, - "Number of snapshots to retain must be at least 1, cannot be: %s", numSnapshots); + Preconditions.checkArgument( + 1 <= numSnapshots, + "Number of snapshots to retain must be at least 1, cannot be: %s", + numSnapshots); this.retainLastValue = numSnapshots; return this; } @@ -138,10 +141,11 @@ public ExpireSnapshotsSparkAction deleteWith(Consumer newDeleteFunc) { /** * Expires snapshots and commits the changes to the table, returning a Dataset of files to delete. - *

    - * This does not delete data files. To delete data files, run {@link #execute()}. - *

    - * This may be called before or after {@link #execute()} is called to return the expired file list. + * + *

    This does not delete data files. To delete data files, run {@link #execute()}. + * + *

    This may be called before or after {@link #execute()} is called to return the expired file + * list. * * @return a Dataset of files that are no longer referenced by the table */ @@ -151,7 +155,8 @@ public Dataset expire() { Dataset originalFiles = buildValidFileDF(ops.current()); // perform expiration - org.apache.iceberg.ExpireSnapshots expireSnapshots = table.expireSnapshots().cleanExpiredFiles(false); + org.apache.iceberg.ExpireSnapshots expireSnapshots = + table.expireSnapshots().cleanExpiredFiles(false); for (long id : expiredSnapshotIds) { expireSnapshots = expireSnapshots.expireSnapshotId(id); } @@ -196,17 +201,20 @@ private String jobDesc() { if (!expiredSnapshotIds.isEmpty()) { Long first = expiredSnapshotIds.stream().findFirst().get(); if (expiredSnapshotIds.size() > 1) { - options.add(String.format("snapshot_ids: %s (%s more...)", first, expiredSnapshotIds.size() - 1)); + options.add( + String.format("snapshot_ids: %s (%s more...)", first, expiredSnapshotIds.size() - 1)); } else { options.add(String.format("snapshot_id: %s", first)); } } - return String.format("Expiring snapshots (%s) in %s", Joiner.on(',').join(options), table.name()); + return String.format( + "Expiring snapshots (%s) in %s", Joiner.on(',').join(options), table.name()); } private ExpireSnapshots.Result doExecute() { - boolean streamResults = PropertyUtil.propertyAsBoolean(options(), STREAM_RESULTS, STREAM_RESULTS_DEFAULT); + boolean streamResults = + PropertyUtil.propertyAsBoolean(options(), STREAM_RESULTS, STREAM_RESULTS_DEFAULT); if (streamResults) { return deleteFiles(expire().toLocalIterator()); } else { @@ -235,42 +243,52 @@ private BaseExpireSnapshotsActionResult deleteFiles(Iterator expired) { AtomicLong manifestListCount = new AtomicLong(0L); Tasks.foreach(expired) - .retry(3).stopRetryOn(NotFoundException.class).suppressFailureWhenFinished() + .retry(3) + .stopRetryOn(NotFoundException.class) + .suppressFailureWhenFinished() .executeWith(deleteExecutorService) - .onFailure((fileInfo, exc) -> { - String file = fileInfo.getString(0); - String type = fileInfo.getString(1); - LOG.warn("Delete failed for {}: {}", type, file, exc); - }) - .run(fileInfo -> { - String file = fileInfo.getString(0); - String type = fileInfo.getString(1); - deleteFunc.accept(file); - - if (FileContent.DATA.name().equalsIgnoreCase(type)) { - dataFileCount.incrementAndGet(); - LOG.trace("Deleted Data File: {}", file); - } else if (FileContent.POSITION_DELETES.name().equalsIgnoreCase(type)) { - posDeleteFileCount.incrementAndGet(); - LOG.trace("Deleted Positional Delete File: {}", file); - } else if (FileContent.EQUALITY_DELETES.name().equalsIgnoreCase(type)) { - eqDeleteFileCount.incrementAndGet(); - LOG.trace("Deleted Equality Delete File: {}", file); - } else if (MANIFEST.equals(type)) { - manifestCount.incrementAndGet(); - LOG.debug("Deleted Manifest: {}", file); - } else if (MANIFEST_LIST.equalsIgnoreCase(type)) { - manifestListCount.incrementAndGet(); - LOG.debug("Deleted Manifest List: {}", file); - } else { - throw new ValidationException("Illegal file type: %s", type); - } - }); - - long contentFileCount = dataFileCount.get() + posDeleteFileCount.get() + eqDeleteFileCount.get(); - LOG.info("Deleted {} total files", contentFileCount + manifestCount.get() + manifestListCount.get()); - - return new BaseExpireSnapshotsActionResult(dataFileCount.get(), posDeleteFileCount.get(), - eqDeleteFileCount.get(), manifestCount.get(), manifestListCount.get()); + .onFailure( + (fileInfo, exc) -> { + String file = 
fileInfo.getString(0); + String type = fileInfo.getString(1); + LOG.warn("Delete failed for {}: {}", type, file, exc); + }) + .run( + fileInfo -> { + String file = fileInfo.getString(0); + String type = fileInfo.getString(1); + deleteFunc.accept(file); + + if (FileContent.DATA.name().equalsIgnoreCase(type)) { + dataFileCount.incrementAndGet(); + LOG.trace("Deleted Data File: {}", file); + } else if (FileContent.POSITION_DELETES.name().equalsIgnoreCase(type)) { + posDeleteFileCount.incrementAndGet(); + LOG.trace("Deleted Positional Delete File: {}", file); + } else if (FileContent.EQUALITY_DELETES.name().equalsIgnoreCase(type)) { + eqDeleteFileCount.incrementAndGet(); + LOG.trace("Deleted Equality Delete File: {}", file); + } else if (MANIFEST.equals(type)) { + manifestCount.incrementAndGet(); + LOG.debug("Deleted Manifest: {}", file); + } else if (MANIFEST_LIST.equalsIgnoreCase(type)) { + manifestListCount.incrementAndGet(); + LOG.debug("Deleted Manifest List: {}", file); + } else { + throw new ValidationException("Illegal file type: %s", type); + } + }); + + long contentFileCount = + dataFileCount.get() + posDeleteFileCount.get() + eqDeleteFileCount.get(); + LOG.info( + "Deleted {} total files", contentFileCount + manifestCount.get() + manifestListCount.get()); + + return new BaseExpireSnapshotsActionResult( + dataFileCount.get(), + posDeleteFileCount.get(), + eqDeleteFileCount.get(), + manifestCount.get(), + manifestListCount.get()); } } diff --git a/spark/v3.2/spark/src/main/java/org/apache/iceberg/spark/actions/ManifestFileBean.java b/spark/v3.2/spark/src/main/java/org/apache/iceberg/spark/actions/ManifestFileBean.java index 269130496dc9..1f82eabc6b6c 100644 --- a/spark/v3.2/spark/src/main/java/org/apache/iceberg/spark/actions/ManifestFileBean.java +++ b/spark/v3.2/spark/src/main/java/org/apache/iceberg/spark/actions/ManifestFileBean.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.actions; import java.nio.ByteBuffer; diff --git a/spark/v3.2/spark/src/main/java/org/apache/iceberg/spark/actions/MigrateTableSparkAction.java b/spark/v3.2/spark/src/main/java/org/apache/iceberg/spark/actions/MigrateTableSparkAction.java index 7146bffcbe73..e5716ea15320 100644 --- a/spark/v3.2/spark/src/main/java/org/apache/iceberg/spark/actions/MigrateTableSparkAction.java +++ b/spark/v3.2/spark/src/main/java/org/apache/iceberg/spark/actions/MigrateTableSparkAction.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.actions; import java.util.Map; @@ -45,13 +44,11 @@ import scala.collection.JavaConverters; /** - * Takes a Spark table in the source catalog and attempts to transform it into an Iceberg - * table in the same location with the same identifier. Once complete the identifier which - * previously referred to a non-Iceberg table will refer to the newly migrated Iceberg - * table. + * Takes a Spark table in the source catalog and attempts to transform it into an Iceberg table in + * the same location with the same identifier. Once complete the identifier which previously + * referred to a non-Iceberg table will refer to the newly migrated Iceberg table. 
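As a usage sketch of the migrate action reformatted above: the table identifier is a placeholder, an already-running SparkSession with an Iceberg-enabled session catalog is assumed, and the result accessor is assumed to be migratedDataFilesCount(), matching the count passed to BaseMigrateTableActionResult.

import org.apache.iceberg.actions.MigrateTable;
import org.apache.iceberg.spark.actions.SparkActions;

public class MigrateTableExample {
  public static void main(String[] args) {
    // "spark_catalog.db.sample" is a placeholder identifier for a non-Iceberg table
    // registered in the Spark session catalog.
    MigrateTable.Result result =
        SparkActions.get().migrateTable("spark_catalog.db.sample").execute();
    System.out.println("Migrated data files: " + result.migratedDataFilesCount());
  }
}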
*/ -public class MigrateTableSparkAction - extends BaseTableCreationSparkAction +public class MigrateTableSparkAction extends BaseTableCreationSparkAction implements MigrateTable { private static final Logger LOG = LoggerFactory.getLogger(MigrateTableSparkAction.class); @@ -61,7 +58,8 @@ public class MigrateTableSparkAction private final Identifier destTableIdent; private final Identifier backupIdent; - MigrateTableSparkAction(SparkSession spark, CatalogPlugin sourceCatalog, Identifier sourceTableIdent) { + MigrateTableSparkAction( + SparkSession spark, CatalogPlugin sourceCatalog, Identifier sourceTableIdent) { super(spark, sourceCatalog, sourceTableIdent); this.destCatalog = checkDestinationCatalog(sourceCatalog); this.destTableIdent = sourceTableIdent; @@ -132,7 +130,8 @@ private MigrateTable.Result doExecute() { threw = false; } finally { if (threw) { - LOG.error("Failed to perform the migration, aborting table creation and restoring the original table"); + LOG.error( + "Failed to perform the migration, aborting table creation and restoring the original table"); restoreSourceTable(); @@ -147,8 +146,12 @@ private MigrateTable.Result doExecute() { } Snapshot snapshot = icebergTable.currentSnapshot(); - long migratedDataFilesCount = Long.parseLong(snapshot.summary().get(SnapshotSummary.TOTAL_DATA_FILES_PROP)); - LOG.info("Successfully loaded Iceberg metadata for {} files to {}", migratedDataFilesCount, destTableIdent()); + long migratedDataFilesCount = + Long.parseLong(snapshot.summary().get(SnapshotSummary.TOTAL_DATA_FILES_PROP)); + LOG.info( + "Successfully loaded Iceberg metadata for {} files to {}", + migratedDataFilesCount, + destTableIdent()); return new BaseMigrateTableActionResult(migratedDataFilesCount); } @@ -176,9 +179,11 @@ protected Map destTableProps() { @Override protected TableCatalog checkSourceCatalog(CatalogPlugin catalog) { // currently the import code relies on being able to look up the table in the session catalog - Preconditions.checkArgument(catalog instanceof SparkSessionCatalog, + Preconditions.checkArgument( + catalog instanceof SparkSessionCatalog, "Cannot migrate a table from a non-Iceberg Spark Session Catalog. Found %s of class %s as the source catalog.", - catalog.name(), catalog.getClass().getName()); + catalog.name(), + catalog.getClass().getName()); return (TableCatalog) catalog; } @@ -204,11 +209,15 @@ private void restoreSourceTable() { destCatalog().renameTable(backupIdent, sourceTableIdent()); } catch (org.apache.spark.sql.catalyst.analysis.NoSuchTableException e) { - LOG.error("Cannot restore the original table, the backup table {} cannot be found", backupIdent, e); + LOG.error( + "Cannot restore the original table, the backup table {} cannot be found", backupIdent, e); } catch (org.apache.spark.sql.catalyst.analysis.TableAlreadyExistsException e) { - LOG.error("Cannot restore the original table, a table with the original name exists. " + - "Use the backup table {} to restore the original table manually.", backupIdent, e); + LOG.error( + "Cannot restore the original table, a table with the original name exists. 
" + + "Use the backup table {} to restore the original table manually.", + backupIdent, + e); } } } diff --git a/spark/v3.2/spark/src/main/java/org/apache/iceberg/spark/actions/RewriteDataFilesSparkAction.java b/spark/v3.2/spark/src/main/java/org/apache/iceberg/spark/actions/RewriteDataFilesSparkAction.java index bb2cbd83298f..f754fcb4c74a 100644 --- a/spark/v3.2/spark/src/main/java/org/apache/iceberg/spark/actions/RewriteDataFilesSparkAction.java +++ b/spark/v3.2/spark/src/main/java/org/apache/iceberg/spark/actions/RewriteDataFilesSparkAction.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.actions; import java.io.IOException; @@ -73,19 +72,18 @@ import org.slf4j.LoggerFactory; public class RewriteDataFilesSparkAction - extends BaseSnapshotUpdateSparkAction - implements RewriteDataFiles { + extends BaseSnapshotUpdateSparkAction implements RewriteDataFiles { private static final Logger LOG = LoggerFactory.getLogger(RewriteDataFilesSparkAction.class); - private static final Set VALID_OPTIONS = ImmutableSet.of( - MAX_CONCURRENT_FILE_GROUP_REWRITES, - MAX_FILE_GROUP_SIZE_BYTES, - PARTIAL_PROGRESS_ENABLED, - PARTIAL_PROGRESS_MAX_COMMITS, - TARGET_FILE_SIZE_BYTES, - USE_STARTING_SEQUENCE_NUMBER, - REWRITE_JOB_ORDER - ); + private static final Set VALID_OPTIONS = + ImmutableSet.of( + MAX_CONCURRENT_FILE_GROUP_REWRITES, + MAX_FILE_GROUP_SIZE_BYTES, + PARTIAL_PROGRESS_ENABLED, + PARTIAL_PROGRESS_MAX_COMMITS, + TARGET_FILE_SIZE_BYTES, + USE_STARTING_SEQUENCE_NUMBER, + REWRITE_JOB_ORDER); private final Table table; @@ -109,32 +107,40 @@ protected RewriteDataFilesSparkAction self() { @Override public RewriteDataFilesSparkAction binPack() { - Preconditions.checkArgument(this.strategy == null, - "Cannot set strategy to binpack, it has already been set", this.strategy); + Preconditions.checkArgument( + this.strategy == null, + "Cannot set strategy to binpack, it has already been set", + this.strategy); this.strategy = binPackStrategy(); return this; } @Override public RewriteDataFilesSparkAction sort(SortOrder sortOrder) { - Preconditions.checkArgument(this.strategy == null, - "Cannot set strategy to sort, it has already been set to %s", this.strategy); + Preconditions.checkArgument( + this.strategy == null, + "Cannot set strategy to sort, it has already been set to %s", + this.strategy); this.strategy = sortStrategy().sortOrder(sortOrder); return this; } @Override public RewriteDataFilesSparkAction sort() { - Preconditions.checkArgument(this.strategy == null, - "Cannot set strategy to sort, it has already been set to %s", this.strategy); + Preconditions.checkArgument( + this.strategy == null, + "Cannot set strategy to sort, it has already been set to %s", + this.strategy); this.strategy = sortStrategy(); return this; } @Override public RewriteDataFilesSparkAction zOrder(String... 
columnNames) { - Preconditions.checkArgument(this.strategy == null, - "Cannot set strategy to zorder, it has already been set to %s", this.strategy); + Preconditions.checkArgument( + this.strategy == null, + "Cannot set strategy to zorder, it has already been set to %s", + this.strategy); this.strategy = zOrderStrategy(columnNames); return this; } @@ -160,7 +166,8 @@ public RewriteDataFiles.Result execute() { validateAndInitOptions(); - Map>> fileGroupsByPartition = planFileGroups(startingSnapshotId); + Map>> fileGroupsByPartition = + planFileGroups(startingSnapshotId); RewriteExecutionContext ctx = new RewriteExecutionContext(fileGroupsByPartition); if (ctx.totalGroupCount() == 0) { @@ -179,43 +186,52 @@ public RewriteDataFiles.Result execute() { } Map>> planFileGroups(long startingSnapshotId) { - CloseableIterable fileScanTasks = table.newScan() - .useSnapshot(startingSnapshotId) - .filter(filter) - .ignoreResiduals() - .planFiles(); + CloseableIterable fileScanTasks = + table + .newScan() + .useSnapshot(startingSnapshotId) + .filter(filter) + .ignoreResiduals() + .planFiles(); try { StructType partitionType = table.spec().partitionType(); StructLikeMap> filesByPartition = StructLikeMap.create(partitionType); StructLike emptyStruct = GenericRecord.create(partitionType); - fileScanTasks.forEach(task -> { - // If a task uses an incompatible partition spec the data inside could contain values which - // belong to multiple partitions in the current spec. Treating all such files as un-partitioned and - // grouping them together helps to minimize new files made. - StructLike taskPartition = task.file().specId() == table.spec().specId() ? - task.file().partition() : emptyStruct; - - List files = filesByPartition.get(taskPartition); - if (files == null) { - files = Lists.newArrayList(); - } - - files.add(task); - filesByPartition.put(taskPartition, files); - }); - - StructLikeMap>> fileGroupsByPartition = StructLikeMap.create(partitionType); - - filesByPartition.forEach((partition, tasks) -> { - Iterable filtered = strategy.selectFilesToRewrite(tasks); - Iterable> groupedTasks = strategy.planFileGroups(filtered); - List> fileGroups = ImmutableList.copyOf(groupedTasks); - if (fileGroups.size() > 0) { - fileGroupsByPartition.put(partition, fileGroups); - } - }); + fileScanTasks.forEach( + task -> { + // If a task uses an incompatible partition spec the data inside could contain values + // which + // belong to multiple partitions in the current spec. Treating all such files as + // un-partitioned and + // grouping them together helps to minimize new files made. + StructLike taskPartition = + task.file().specId() == table.spec().specId() + ? 
task.file().partition() + : emptyStruct; + + List files = filesByPartition.get(taskPartition); + if (files == null) { + files = Lists.newArrayList(); + } + + files.add(task); + filesByPartition.put(taskPartition, files); + }); + + StructLikeMap>> fileGroupsByPartition = + StructLikeMap.create(partitionType); + + filesByPartition.forEach( + (partition, tasks) -> { + Iterable filtered = strategy.selectFilesToRewrite(tasks); + Iterable> groupedTasks = strategy.planFileGroups(filtered); + List> fileGroups = ImmutableList.copyOf(groupedTasks); + if (fileGroups.size() > 0) { + fileGroupsByPartition.put(partition, fileGroups); + } + }); return fileGroupsByPartition; } finally { @@ -230,9 +246,10 @@ Map>> planFileGroups(long startingSnapshotId @VisibleForTesting RewriteFileGroup rewriteFiles(RewriteExecutionContext ctx, RewriteFileGroup fileGroup) { String desc = jobDesc(fileGroup, ctx); - Set addedFiles = withJobGroupInfo( - newJobGroupInfo("REWRITE-DATA-FILES", desc), - () -> strategy.rewriteFiles(fileGroup.fileScans())); + Set addedFiles = + withJobGroupInfo( + newJobGroupInfo("REWRITE-DATA-FILES", desc), + () -> strategy.rewriteFiles(fileGroup.fileScans())); fileGroup.setOutputFiles(addedFiles); LOG.info("Rewrite Files Ready to be Committed - {}", desc); @@ -241,11 +258,10 @@ RewriteFileGroup rewriteFiles(RewriteExecutionContext ctx, RewriteFileGroup file private ExecutorService rewriteService() { return MoreExecutors.getExitingExecutorService( - (ThreadPoolExecutor) Executors.newFixedThreadPool( - maxConcurrentFileGroupRewrites, - new ThreadFactoryBuilder() - .setNameFormat("Rewrite-Service-%d") - .build())); + (ThreadPoolExecutor) + Executors.newFixedThreadPool( + maxConcurrentFileGroupRewrites, + new ThreadFactoryBuilder().setNameFormat("Rewrite-Service-%d").build())); } @VisibleForTesting @@ -253,31 +269,42 @@ RewriteDataFilesCommitManager commitManager(long startingSnapshotId) { return new RewriteDataFilesCommitManager(table, startingSnapshotId, useStartingSequenceNumber); } - private Result doExecute(RewriteExecutionContext ctx, Stream groupStream, - RewriteDataFilesCommitManager commitManager) { + private Result doExecute( + RewriteExecutionContext ctx, + Stream groupStream, + RewriteDataFilesCommitManager commitManager) { ExecutorService rewriteService = rewriteService(); ConcurrentLinkedQueue rewrittenGroups = Queues.newConcurrentLinkedQueue(); - Tasks.Builder rewriteTaskBuilder = Tasks.foreach(groupStream) - .executeWith(rewriteService) - .stopOnFailure() - .noRetry() - .onFailure((fileGroup, exception) -> { - LOG.warn("Failure during rewrite process for group {}", fileGroup.info(), exception); - }); + Tasks.Builder rewriteTaskBuilder = + Tasks.foreach(groupStream) + .executeWith(rewriteService) + .stopOnFailure() + .noRetry() + .onFailure( + (fileGroup, exception) -> { + LOG.warn( + "Failure during rewrite process for group {}", fileGroup.info(), exception); + }); try { - rewriteTaskBuilder.run(fileGroup -> { - rewrittenGroups.add(rewriteFiles(ctx, fileGroup)); - }); + rewriteTaskBuilder.run( + fileGroup -> { + rewrittenGroups.add(rewriteFiles(ctx, fileGroup)); + }); } catch (Exception e) { // At least one rewrite group failed, clean up all completed rewrites - LOG.error("Cannot complete rewrite, {} is not enabled and one of the file set groups failed to " + - "be rewritten. This error occurred during the writing of new files, not during the commit process. This " + - "indicates something is wrong that doesn't involve conflicts with other Iceberg operations. 
Enabling " + - "{} may help in this case but the root cause should be investigated. Cleaning up {} groups which finished " + - "being written.", PARTIAL_PROGRESS_ENABLED, PARTIAL_PROGRESS_ENABLED, rewrittenGroups.size(), e); + LOG.error( + "Cannot complete rewrite, {} is not enabled and one of the file set groups failed to " + + "be rewritten. This error occurred during the writing of new files, not during the commit process. This " + + "indicates something is wrong that doesn't involve conflicts with other Iceberg operations. Enabling " + + "{} may help in this case but the root cause should be investigated. Cleaning up {} groups which finished " + + "being written.", + PARTIAL_PROGRESS_ENABLED, + PARTIAL_PROGRESS_ENABLED, + rewrittenGroups.size(), + e); Tasks.foreach(rewrittenGroups) .suppressFailureWhenFinished() @@ -290,30 +317,33 @@ private Result doExecute(RewriteExecutionContext ctx, Stream g try { commitManager.commitOrClean(Sets.newHashSet(rewrittenGroups)); } catch (ValidationException | CommitFailedException e) { - String errorMessage = String.format( - "Cannot commit rewrite because of a ValidationException or CommitFailedException. This usually means that " + - "this rewrite has conflicted with another concurrent Iceberg operation. To reduce the likelihood of " + - "conflicts, set %s which will break up the rewrite into multiple smaller commits controlled by %s. " + - "Separate smaller rewrite commits can succeed independently while any commits that conflict with " + - "another Iceberg operation will be ignored. This mode will create additional snapshots in the table " + - "history, one for each commit.", - PARTIAL_PROGRESS_ENABLED, PARTIAL_PROGRESS_MAX_COMMITS); + String errorMessage = + String.format( + "Cannot commit rewrite because of a ValidationException or CommitFailedException. This usually means that " + + "this rewrite has conflicted with another concurrent Iceberg operation. To reduce the likelihood of " + + "conflicts, set %s which will break up the rewrite into multiple smaller commits controlled by %s. " + + "Separate smaller rewrite commits can succeed independently while any commits that conflict with " + + "another Iceberg operation will be ignored. 
This mode will create additional snapshots in the table " + + "history, one for each commit.", + PARTIAL_PROGRESS_ENABLED, PARTIAL_PROGRESS_MAX_COMMITS); throw new RuntimeException(errorMessage, e); } - List rewriteResults = rewrittenGroups.stream() - .map(RewriteFileGroup::asResult) - .collect(Collectors.toList()); + List rewriteResults = + rewrittenGroups.stream().map(RewriteFileGroup::asResult).collect(Collectors.toList()); return new BaseRewriteDataFilesResult(rewriteResults); } - private Result doExecuteWithPartialProgress(RewriteExecutionContext ctx, Stream groupStream, - RewriteDataFilesCommitManager commitManager) { + private Result doExecuteWithPartialProgress( + RewriteExecutionContext ctx, + Stream groupStream, + RewriteDataFilesCommitManager commitManager) { ExecutorService rewriteService = rewriteService(); // Start Commit Service int groupsPerCommit = IntMath.divide(ctx.totalGroupCount(), maxCommits, RoundingMode.CEILING); - RewriteDataFilesCommitManager.CommitService commitService = commitManager.service(groupsPerCommit); + RewriteDataFilesCommitManager.CommitService commitService = + commitManager.service(groupsPerCommit); commitService.start(); // Start rewrite tasks @@ -321,7 +351,9 @@ private Result doExecuteWithPartialProgress(RewriteExecutionContext ctx, Stream< .suppressFailureWhenFinished() .executeWith(rewriteService) .noRetry() - .onFailure((fileGroup, exception) -> LOG.error("Failure during rewrite group {}", fileGroup.info(), exception)) + .onFailure( + (fileGroup, exception) -> + LOG.error("Failure during rewrite group {}", fileGroup.info(), exception)) .run(fileGroup -> commitService.offer(rewriteFiles(ctx, fileGroup))); rewriteService.shutdown(); @@ -329,30 +361,39 @@ private Result doExecuteWithPartialProgress(RewriteExecutionContext ctx, Stream< commitService.close(); List commitResults = commitService.results(); if (commitResults.size() == 0) { - LOG.error("{} is true but no rewrite commits succeeded. Check the logs to determine why the individual " + - "commits failed. If this is persistent it may help to increase {} which will break the rewrite operation " + - "into smaller commits.", PARTIAL_PROGRESS_ENABLED, PARTIAL_PROGRESS_MAX_COMMITS); + LOG.error( + "{} is true but no rewrite commits succeeded. Check the logs to determine why the individual " + + "commits failed. 
If this is persistent it may help to increase {} which will break the rewrite operation " + + "into smaller commits.", + PARTIAL_PROGRESS_ENABLED, + PARTIAL_PROGRESS_MAX_COMMITS); } - List rewriteResults = commitResults.stream() - .map(RewriteFileGroup::asResult) - .collect(Collectors.toList()); + List rewriteResults = + commitResults.stream().map(RewriteFileGroup::asResult).collect(Collectors.toList()); return new BaseRewriteDataFilesResult(rewriteResults); } - Stream toGroupStream(RewriteExecutionContext ctx, + Stream toGroupStream( + RewriteExecutionContext ctx, Map>> fileGroupsByPartition) { - Stream rewriteFileGroupStream = fileGroupsByPartition.entrySet().stream() - .flatMap(e -> { - StructLike partition = e.getKey(); - List> fileGroups = e.getValue(); - return fileGroups.stream().map(tasks -> { - int globalIndex = ctx.currentGlobalIndex(); - int partitionIndex = ctx.currentPartitionIndex(partition); - FileGroupInfo info = new BaseRewriteDataFilesFileGroupInfo(globalIndex, partitionIndex, partition); - return new RewriteFileGroup(info, tasks); - }); - }); + Stream rewriteFileGroupStream = + fileGroupsByPartition.entrySet().stream() + .flatMap( + e -> { + StructLike partition = e.getKey(); + List> fileGroups = e.getValue(); + return fileGroups.stream() + .map( + tasks -> { + int globalIndex = ctx.currentGlobalIndex(); + int partitionIndex = ctx.currentPartitionIndex(partition); + FileGroupInfo info = + new BaseRewriteDataFilesFileGroupInfo( + globalIndex, partitionIndex, partition); + return new RewriteFileGroup(info, tasks); + }); + }); return rewriteFileGroupStream.sorted(rewriteGroupComparator()); } @@ -379,53 +420,70 @@ void validateAndInitOptions() { Set invalidKeys = Sets.newHashSet(options().keySet()); invalidKeys.removeAll(validOptions); - Preconditions.checkArgument(invalidKeys.isEmpty(), + Preconditions.checkArgument( + invalidKeys.isEmpty(), "Cannot use options %s, they are not supported by the action or the strategy %s", - invalidKeys, strategy.name()); + invalidKeys, + strategy.name()); strategy = strategy.options(options()); - maxConcurrentFileGroupRewrites = PropertyUtil.propertyAsInt(options(), - MAX_CONCURRENT_FILE_GROUP_REWRITES, - MAX_CONCURRENT_FILE_GROUP_REWRITES_DEFAULT); + maxConcurrentFileGroupRewrites = + PropertyUtil.propertyAsInt( + options(), + MAX_CONCURRENT_FILE_GROUP_REWRITES, + MAX_CONCURRENT_FILE_GROUP_REWRITES_DEFAULT); - maxCommits = PropertyUtil.propertyAsInt(options(), - PARTIAL_PROGRESS_MAX_COMMITS, - PARTIAL_PROGRESS_MAX_COMMITS_DEFAULT); + maxCommits = + PropertyUtil.propertyAsInt( + options(), PARTIAL_PROGRESS_MAX_COMMITS, PARTIAL_PROGRESS_MAX_COMMITS_DEFAULT); - partialProgressEnabled = PropertyUtil.propertyAsBoolean(options(), - PARTIAL_PROGRESS_ENABLED, - PARTIAL_PROGRESS_ENABLED_DEFAULT); + partialProgressEnabled = + PropertyUtil.propertyAsBoolean( + options(), PARTIAL_PROGRESS_ENABLED, PARTIAL_PROGRESS_ENABLED_DEFAULT); - useStartingSequenceNumber = PropertyUtil.propertyAsBoolean(options(), - USE_STARTING_SEQUENCE_NUMBER, - USE_STARTING_SEQUENCE_NUMBER_DEFAULT); + useStartingSequenceNumber = + PropertyUtil.propertyAsBoolean( + options(), USE_STARTING_SEQUENCE_NUMBER, USE_STARTING_SEQUENCE_NUMBER_DEFAULT); - rewriteJobOrder = RewriteJobOrder.fromName(PropertyUtil.propertyAsString(options(), - REWRITE_JOB_ORDER, - REWRITE_JOB_ORDER_DEFAULT)); + rewriteJobOrder = + RewriteJobOrder.fromName( + PropertyUtil.propertyAsString(options(), REWRITE_JOB_ORDER, REWRITE_JOB_ORDER_DEFAULT)); - Preconditions.checkArgument(maxConcurrentFileGroupRewrites >= 
1, + Preconditions.checkArgument( + maxConcurrentFileGroupRewrites >= 1, "Cannot set %s to %s, the value must be positive.", - MAX_CONCURRENT_FILE_GROUP_REWRITES, maxConcurrentFileGroupRewrites); + MAX_CONCURRENT_FILE_GROUP_REWRITES, + maxConcurrentFileGroupRewrites); - Preconditions.checkArgument(!partialProgressEnabled || maxCommits > 0, + Preconditions.checkArgument( + !partialProgressEnabled || maxCommits > 0, "Cannot set %s to %s, the value must be positive when %s is true", - PARTIAL_PROGRESS_MAX_COMMITS, maxCommits, PARTIAL_PROGRESS_ENABLED); + PARTIAL_PROGRESS_MAX_COMMITS, + maxCommits, + PARTIAL_PROGRESS_ENABLED); } private String jobDesc(RewriteFileGroup group, RewriteExecutionContext ctx) { StructLike partition = group.info().partition(); if (partition.size() > 0) { - return String.format("Rewriting %d files (%s, file group %d/%d, %s (%d/%d)) in %s", + return String.format( + "Rewriting %d files (%s, file group %d/%d, %s (%d/%d)) in %s", group.rewrittenFiles().size(), - strategy.name(), group.info().globalIndex(), - ctx.totalGroupCount(), partition, group.info().partitionIndex(), ctx.groupsInPartition(partition), + strategy.name(), + group.info().globalIndex(), + ctx.totalGroupCount(), + partition, + group.info().partitionIndex(), + ctx.groupsInPartition(partition), table.name()); } else { - return String.format("Rewriting %d files (%s, file group %d/%d) in %s", + return String.format( + "Rewriting %d files (%s, file group %d/%d) in %s", group.rewrittenFiles().size(), - strategy.name(), group.info().globalIndex(), ctx.totalGroupCount(), + strategy.name(), + group.info().globalIndex(), + ctx.totalGroupCount(), table.name()); } } @@ -450,11 +508,10 @@ static class RewriteExecutionContext { private final AtomicInteger groupIndex; RewriteExecutionContext(Map>> fileGroupsByPartition) { - this.numGroupsByPartition = fileGroupsByPartition.entrySet().stream() - .collect(Collectors.toMap(Map.Entry::getKey, e -> e.getValue().size())); - this.totalGroupCount = numGroupsByPartition.values().stream() - .reduce(Integer::sum) - .orElse(0); + this.numGroupsByPartition = + fileGroupsByPartition.entrySet().stream() + .collect(Collectors.toMap(Map.Entry::getKey, e -> e.getValue().size())); + this.totalGroupCount = numGroupsByPartition.values().stream().reduce(Integer::sum).orElse(0); this.partitionIndexMap = Maps.newConcurrentMap(); this.groupIndex = new AtomicInteger(1); } diff --git a/spark/v3.2/spark/src/main/java/org/apache/iceberg/spark/actions/RewriteManifestsSparkAction.java b/spark/v3.2/spark/src/main/java/org/apache/iceberg/spark/actions/RewriteManifestsSparkAction.java index 99e51a37aa30..1e0034eb3005 100644 --- a/spark/v3.2/spark/src/main/java/org/apache/iceberg/spark/actions/RewriteManifestsSparkAction.java +++ b/spark/v3.2/spark/src/main/java/org/apache/iceberg/spark/actions/RewriteManifestsSparkAction.java @@ -16,9 +16,10 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.actions; +import static org.apache.iceberg.MetadataTableType.ENTRIES; + import java.io.IOException; import java.util.Collections; import java.util.List; @@ -69,19 +70,16 @@ import org.slf4j.Logger; import org.slf4j.LoggerFactory; -import static org.apache.iceberg.MetadataTableType.ENTRIES; - /** * An action that rewrites manifests in a distributed manner and co-locates metadata for partitions. - *

    - * By default, this action rewrites all manifests for the current partition spec and writes the result - * to the metadata folder. The behavior can be modified by passing a custom predicate to {@link #rewriteIf(Predicate)} - * and a custom spec id to {@link #specId(int)}. In addition, there is a way to configure a custom location - * for new manifests via {@link #stagingLocation}. + * + *

    By default, this action rewrites all manifests for the current partition spec and writes the + * result to the metadata folder. The behavior can be modified by passing a custom predicate to + * {@link #rewriteIf(Predicate)} and a custom spec id to {@link #specId(int)}. In addition, there is + * a way to configure a custom location for new manifests via {@link #stagingLocation}. */ public class RewriteManifestsSparkAction - extends BaseSnapshotUpdateSparkAction - implements RewriteManifests { + extends BaseSnapshotUpdateSparkAction implements RewriteManifests { public static final String USE_CACHING = "use-caching"; public static final boolean USE_CACHING_DEFAULT = true; @@ -103,10 +101,11 @@ public class RewriteManifestsSparkAction this.manifestEncoder = Encoders.javaSerialization(ManifestFile.class); this.table = table; this.spec = table.spec(); - this.targetManifestSizeBytes = PropertyUtil.propertyAsLong( - table.properties(), - TableProperties.MANIFEST_TARGET_SIZE_BYTES, - TableProperties.MANIFEST_TARGET_SIZE_BYTES_DEFAULT); + this.targetManifestSizeBytes = + PropertyUtil.propertyAsLong( + table.properties(), + TableProperties.MANIFEST_TARGET_SIZE_BYTES, + TableProperties.MANIFEST_TARGET_SIZE_BYTES_DEFAULT); this.fileIO = SparkUtil.serializableFileIO(table); // default the staging location to the metadata location @@ -144,7 +143,9 @@ public RewriteManifestsSparkAction stagingLocation(String newStagingLocation) { @Override public RewriteManifests.Result execute() { - String desc = String.format("Rewriting manifests (staging location=%s) of %s", stagingLocation, table.name()); + String desc = + String.format( + "Rewriting manifests (staging location=%s) of %s", stagingLocation, table.name()); JobGroupInfo info = newJobGroupInfo("REWRITE-MANIFESTS", desc); return withJobGroupInfo(info, this::doExecute); } @@ -159,10 +160,12 @@ private RewriteManifests.Result doExecute() { int numEntries = 0; for (ManifestFile manifest : matchingManifests) { - ValidationException.check(hasFileCounts(manifest), "No file counts in manifest: %s", manifest.path()); + ValidationException.check( + hasFileCounts(manifest), "No file counts in manifest: %s", manifest.path()); totalSizeBytes += manifest.length(); - numEntries += manifest.addedFilesCount() + manifest.existingFilesCount() + manifest.deletedFilesCount(); + numEntries += + manifest.addedFilesCount() + manifest.existingFilesCount() + manifest.deletedFilesCount(); } int targetNumManifests = targetNumManifests(totalSizeBytes); @@ -174,7 +177,9 @@ private RewriteManifests.Result doExecute() { if (spec.fields().size() < 1) { newManifests = writeManifestsForUnpartitionedTable(manifestEntryDF, targetNumManifests); } else { - newManifests = writeManifestsForPartitionedTable(manifestEntryDF, targetNumManifests, targetNumManifestEntries); + newManifests = + writeManifestsForPartitionedTable( + manifestEntryDF, targetNumManifests, targetNumManifestEntries); } replaceManifests(matchingManifests, newManifests); @@ -183,13 +188,16 @@ private RewriteManifests.Result doExecute() { } private Dataset buildManifestEntryDF(List manifests) { - Dataset manifestDF = spark() - .createDataset(Lists.transform(manifests, ManifestFile::path), Encoders.STRING()) - .toDF("manifest"); + Dataset manifestDF = + spark() + .createDataset(Lists.transform(manifests, ManifestFile::path), Encoders.STRING()) + .toDF("manifest"); - Dataset manifestEntryDF = loadMetadataTable(table, ENTRIES) - .filter("status < 2") // select only live entries - .selectExpr("input_file_name() as manifest", 
"snapshot_id", "sequence_number", "data_file"); + Dataset manifestEntryDF = + loadMetadataTable(table, ENTRIES) + .filter("status < 2") // select only live entries + .selectExpr( + "input_file_name() as manifest", "snapshot_id", "sequence_number", "data_file"); Column joinCond = manifestDF.col("manifest").equalTo(manifestEntryDF.col("manifest")); return manifestEntryDF @@ -197,7 +205,8 @@ private Dataset buildManifestEntryDF(List manifests) { .select("snapshot_id", "sequence_number", "data_file"); } - private List writeManifestsForUnpartitionedTable(Dataset manifestEntryDF, int numManifests) { + private List writeManifestsForUnpartitionedTable( + Dataset manifestEntryDF, int numManifests) { Broadcast io = sparkContext().broadcast(fileIO); StructType sparkType = (StructType) manifestEntryDF.schema().apply("data_file").dataType(); @@ -209,41 +218,44 @@ private List writeManifestsForUnpartitionedTable(Dataset mani .repartition(numManifests) .mapPartitions( toManifests(io, maxNumManifestEntries, stagingLocation, formatVersion, spec, sparkType), - manifestEncoder - ) + manifestEncoder) .collectAsList(); } private List writeManifestsForPartitionedTable( - Dataset manifestEntryDF, int numManifests, - int targetNumManifestEntries) { + Dataset manifestEntryDF, int numManifests, int targetNumManifestEntries) { Broadcast io = sparkContext().broadcast(fileIO); StructType sparkType = (StructType) manifestEntryDF.schema().apply("data_file").dataType(); - // we allow the actual size of manifests to be 10% higher if the estimation is not precise enough + // we allow the actual size of manifests to be 10% higher if the estimation is not precise + // enough long maxNumManifestEntries = (long) (1.1 * targetNumManifestEntries); - return withReusableDS(manifestEntryDF, df -> { - Column partitionColumn = df.col("data_file.partition"); - return df.repartitionByRange(numManifests, partitionColumn) - .sortWithinPartitions(partitionColumn) - .mapPartitions( - toManifests(io, maxNumManifestEntries, stagingLocation, formatVersion, spec, sparkType), - manifestEncoder - ) - .collectAsList(); - }); + return withReusableDS( + manifestEntryDF, + df -> { + Column partitionColumn = df.col("data_file.partition"); + return df.repartitionByRange(numManifests, partitionColumn) + .sortWithinPartitions(partitionColumn) + .mapPartitions( + toManifests( + io, maxNumManifestEntries, stagingLocation, formatVersion, spec, sparkType), + manifestEncoder) + .collectAsList(); + }); } private U withReusableDS(Dataset ds, Function, U> func) { Dataset reusableDS; - boolean useCaching = PropertyUtil.propertyAsBoolean(options(), USE_CACHING, USE_CACHING_DEFAULT); + boolean useCaching = + PropertyUtil.propertyAsBoolean(options(), USE_CACHING, USE_CACHING_DEFAULT); if (useCaching) { reusableDS = ds.cache(); } else { int parallelism = SQLConf.get().numShufflePartitions(); - reusableDS = ds.repartition(parallelism).map((MapFunction) value -> value, ds.exprEnc()); + reusableDS = + ds.repartition(parallelism).map((MapFunction) value -> value, ds.exprEnc()); } try { @@ -276,17 +288,19 @@ private int targetNumManifestEntries(int numEntries, int numManifests) { } private boolean hasFileCounts(ManifestFile manifest) { - return manifest.addedFilesCount() != null && - manifest.existingFilesCount() != null && - manifest.deletedFilesCount() != null; + return manifest.addedFilesCount() != null + && manifest.existingFilesCount() != null + && manifest.deletedFilesCount() != null; } - private void replaceManifests(Iterable deletedManifests, Iterable 
addedManifests) { + private void replaceManifests( + Iterable deletedManifests, Iterable addedManifests) { try { - boolean snapshotIdInheritanceEnabled = PropertyUtil.propertyAsBoolean( - table.properties(), - TableProperties.SNAPSHOT_ID_INHERITANCE_ENABLED, - TableProperties.SNAPSHOT_ID_INHERITANCE_ENABLED_DEFAULT); + boolean snapshotIdInheritanceEnabled = + PropertyUtil.propertyAsBoolean( + table.properties(), + TableProperties.SNAPSHOT_ID_INHERITANCE_ENABLED, + TableProperties.SNAPSHOT_ID_INHERITANCE_ENABLED_DEFAULT); org.apache.iceberg.RewriteManifests rewriteManifests = table.rewriteManifests(); deletedManifests.forEach(rewriteManifests::deleteManifest); @@ -317,12 +331,20 @@ private void deleteFiles(Iterable locations) { } private static ManifestFile writeManifest( - List rows, int startIndex, int endIndex, Broadcast io, - String location, int format, PartitionSpec spec, StructType sparkType) throws IOException { + List rows, + int startIndex, + int endIndex, + Broadcast io, + String location, + int format, + PartitionSpec spec, + StructType sparkType) + throws IOException { String manifestName = "optimized-m-" + UUID.randomUUID(); Path manifestPath = new Path(location, manifestName); - OutputFile outputFile = io.value().newOutputFile(FileFormat.AVRO.addExtension(manifestPath.toString())); + OutputFile outputFile = + io.value().newOutputFile(FileFormat.AVRO.addExtension(manifestPath.toString())); Types.StructType dataFileType = DataFile.getType(spec.partitionType()); SparkDataFile wrapper = new SparkDataFile(dataFileType, sparkType); @@ -345,8 +367,12 @@ private static ManifestFile writeManifest( } private static MapPartitionsFunction toManifests( - Broadcast io, long maxNumManifestEntries, String location, - int format, PartitionSpec spec, StructType sparkType) { + Broadcast io, + long maxNumManifestEntries, + String location, + int format, + PartitionSpec spec, + StructType sparkType) { return rows -> { List rowsAsList = Lists.newArrayList(rows); @@ -357,11 +383,15 @@ private static MapPartitionsFunction toManifests( List manifests = Lists.newArrayList(); if (rowsAsList.size() <= maxNumManifestEntries) { - manifests.add(writeManifest(rowsAsList, 0, rowsAsList.size(), io, location, format, spec, sparkType)); + manifests.add( + writeManifest(rowsAsList, 0, rowsAsList.size(), io, location, format, spec, sparkType)); } else { int midIndex = rowsAsList.size() / 2; - manifests.add(writeManifest(rowsAsList, 0, midIndex, io, location, format, spec, sparkType)); - manifests.add(writeManifest(rowsAsList, midIndex, rowsAsList.size(), io, location, format, spec, sparkType)); + manifests.add( + writeManifest(rowsAsList, 0, midIndex, io, location, format, spec, sparkType)); + manifests.add( + writeManifest( + rowsAsList, midIndex, rowsAsList.size(), io, location, format, spec, sparkType)); } return manifests.iterator(); diff --git a/spark/v3.2/spark/src/main/java/org/apache/iceberg/spark/actions/SetAccumulator.java b/spark/v3.2/spark/src/main/java/org/apache/iceberg/spark/actions/SetAccumulator.java index f16936949ff6..745169fc1efd 100644 --- a/spark/v3.2/spark/src/main/java/org/apache/iceberg/spark/actions/SetAccumulator.java +++ b/spark/v3.2/spark/src/main/java/org/apache/iceberg/spark/actions/SetAccumulator.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
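A short usage sketch of the manifest rewrite action above. It assumes SparkActions exposes a rewriteManifests(Table) entry point analogous to snapshotTable()/migrateTable() shown further below; the size threshold and staging path are arbitrary placeholders, and rewriteIf()/stagingLocation() are the hooks named in the class Javadoc.

import org.apache.iceberg.Table;
import org.apache.iceberg.actions.RewriteManifests;
import org.apache.iceberg.spark.actions.SparkActions;

public class RewriteManifestsExample {
  // Rewrite only small manifests and stage the replacements in a custom location.
  static RewriteManifests.Result rewriteSmallManifests(Table table, String stagingLocation) {
    return SparkActions.get()
        .rewriteManifests(table)
        .rewriteIf(manifest -> manifest.length() < 8L * 1024 * 1024)
        .stagingLocation(stagingLocation)
        .execute();
  }
}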
*/ - package org.apache.iceberg.spark.actions; import java.util.Collections; diff --git a/spark/v3.2/spark/src/main/java/org/apache/iceberg/spark/actions/SnapshotTableSparkAction.java b/spark/v3.2/spark/src/main/java/org/apache/iceberg/spark/actions/SnapshotTableSparkAction.java index 526b46af1a88..289e408b8960 100644 --- a/spark/v3.2/spark/src/main/java/org/apache/iceberg/spark/actions/SnapshotTableSparkAction.java +++ b/spark/v3.2/spark/src/main/java/org/apache/iceberg/spark/actions/SnapshotTableSparkAction.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.actions; import java.util.Map; @@ -44,12 +43,10 @@ import scala.collection.JavaConverters; /** - * Creates a new Iceberg table based on a source Spark table. The new Iceberg table will - * have a different data and metadata directory allowing it to exist independently of the - * source table. + * Creates a new Iceberg table based on a source Spark table. The new Iceberg table will have a + * different data and metadata directory allowing it to exist independently of the source table. */ -public class SnapshotTableSparkAction - extends BaseTableCreationSparkAction +public class SnapshotTableSparkAction extends BaseTableCreationSparkAction implements SnapshotTable { private static final Logger LOG = LoggerFactory.getLogger(SnapshotTableSparkAction.class); @@ -58,7 +55,8 @@ public class SnapshotTableSparkAction private Identifier destTableIdent; private String destTableLocation = null; - SnapshotTableSparkAction(SparkSession spark, CatalogPlugin sourceCatalog, Identifier sourceTableIdent) { + SnapshotTableSparkAction( + SparkSession spark, CatalogPlugin sourceCatalog, Identifier sourceTableIdent) { super(spark, sourceCatalog, sourceTableIdent); } @@ -81,7 +79,8 @@ protected Identifier destTableIdent() { public SnapshotTableSparkAction as(String ident) { String ctx = "snapshot destination"; CatalogPlugin defaultCatalog = spark().sessionState().catalogManager().currentCatalog(); - CatalogAndIdentifier catalogAndIdent = Spark3Util.catalogAndIdentifier(ctx, spark(), ident, defaultCatalog); + CatalogAndIdentifier catalogAndIdent = + Spark3Util.catalogAndIdentifier(ctx, spark(), ident, defaultCatalog); this.destCatalog = checkDestinationCatalog(catalogAndIdent.catalog()); this.destTableIdent = catalogAndIdent.identifier(); return this; @@ -107,11 +106,13 @@ public SnapshotTable.Result execute() { } private SnapshotTable.Result doExecute() { - Preconditions.checkArgument(destCatalog() != null && destTableIdent() != null, - "The destination catalog and identifier cannot be null. " + - "Make sure to configure the action with a valid destination table identifier via the `as` method."); + Preconditions.checkArgument( + destCatalog() != null && destTableIdent() != null, + "The destination catalog and identifier cannot be null. 
" + + "Make sure to configure the action with a valid destination table identifier via the `as` method."); - LOG.info("Staging a new Iceberg table {} as a snapshot of {}", destTableIdent(), sourceTableIdent()); + LOG.info( + "Staging a new Iceberg table {} as a snapshot of {}", destTableIdent(), sourceTableIdent()); StagedSparkTable stagedTable = stageDestTable(); Table icebergTable = stagedTable.table(); @@ -143,8 +144,12 @@ private SnapshotTable.Result doExecute() { } Snapshot snapshot = icebergTable.currentSnapshot(); - long importedDataFilesCount = Long.parseLong(snapshot.summary().get(SnapshotSummary.TOTAL_DATA_FILES_PROP)); - LOG.info("Successfully loaded Iceberg metadata for {} files to {}", importedDataFilesCount, destTableIdent()); + long importedDataFilesCount = + Long.parseLong(snapshot.summary().get(SnapshotSummary.TOTAL_DATA_FILES_PROP)); + LOG.info( + "Successfully loaded Iceberg metadata for {} files to {}", + importedDataFilesCount, + destTableIdent()); return new BaseSnapshotTableActionResult(importedDataFilesCount); } @@ -182,22 +187,27 @@ protected Map destTableProps() { @Override protected TableCatalog checkSourceCatalog(CatalogPlugin catalog) { // currently the import code relies on being able to look up the table in the session catalog - Preconditions.checkArgument(catalog.name().equalsIgnoreCase("spark_catalog"), - "Cannot snapshot a table that isn't in the session catalog (i.e. spark_catalog). " + - "Found source catalog: %s.", catalog.name()); - - Preconditions.checkArgument(catalog instanceof TableCatalog, + Preconditions.checkArgument( + catalog.name().equalsIgnoreCase("spark_catalog"), + "Cannot snapshot a table that isn't in the session catalog (i.e. spark_catalog). " + + "Found source catalog: %s.", + catalog.name()); + + Preconditions.checkArgument( + catalog instanceof TableCatalog, "Cannot snapshot as catalog %s of class %s in not a table catalog", - catalog.name(), catalog.getClass().getName()); + catalog.name(), + catalog.getClass().getName()); return (TableCatalog) catalog; } @Override public SnapshotTableSparkAction tableLocation(String location) { - Preconditions.checkArgument(!sourceTableLocation().equals(location), - "The snapshot table location cannot be same as the source table location. " + - "This would mix snapshot table files with original table files."); + Preconditions.checkArgument( + !sourceTableLocation().equals(location), + "The snapshot table location cannot be same as the source table location. " + + "This would mix snapshot table files with original table files."); this.destTableLocation = location; return this; } diff --git a/spark/v3.2/spark/src/main/java/org/apache/iceberg/spark/actions/SparkActions.java b/spark/v3.2/spark/src/main/java/org/apache/iceberg/spark/actions/SparkActions.java index 7131f869f37e..5ef0458b1cd2 100644 --- a/spark/v3.2/spark/src/main/java/org/apache/iceberg/spark/actions/SparkActions.java +++ b/spark/v3.2/spark/src/main/java/org/apache/iceberg/spark/actions/SparkActions.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.actions; import org.apache.iceberg.Table; @@ -28,9 +27,9 @@ /** * An implementation of {@link ActionsProvider} for Spark. - *
<p>
    - * This class is the primary API for interacting with actions in Spark that users should use - * to instantiate particular actions. + * + *
<p>
    This class is the primary API for interacting with actions in Spark that users should use to + * instantiate particular actions. */ public class SparkActions implements ActionsProvider { @@ -52,16 +51,20 @@ public static SparkActions get() { public SnapshotTableSparkAction snapshotTable(String tableIdent) { String ctx = "snapshot source"; CatalogPlugin defaultCatalog = spark.sessionState().catalogManager().currentCatalog(); - CatalogAndIdentifier catalogAndIdent = Spark3Util.catalogAndIdentifier(ctx, spark, tableIdent, defaultCatalog); - return new BaseSnapshotTableSparkAction(spark, catalogAndIdent.catalog(), catalogAndIdent.identifier()); + CatalogAndIdentifier catalogAndIdent = + Spark3Util.catalogAndIdentifier(ctx, spark, tableIdent, defaultCatalog); + return new BaseSnapshotTableSparkAction( + spark, catalogAndIdent.catalog(), catalogAndIdent.identifier()); } @Override public MigrateTableSparkAction migrateTable(String tableIdent) { String ctx = "migrate target"; CatalogPlugin defaultCatalog = spark.sessionState().catalogManager().currentCatalog(); - CatalogAndIdentifier catalogAndIdent = Spark3Util.catalogAndIdentifier(ctx, spark, tableIdent, defaultCatalog); - return new BaseMigrateTableSparkAction(spark, catalogAndIdent.catalog(), catalogAndIdent.identifier()); + CatalogAndIdentifier catalogAndIdent = + Spark3Util.catalogAndIdentifier(ctx, spark, tableIdent, defaultCatalog); + return new BaseMigrateTableSparkAction( + spark, catalogAndIdent.catalog(), catalogAndIdent.identifier()); } @Override diff --git a/spark/v3.2/spark/src/main/java/org/apache/iceberg/spark/actions/SparkBinPackStrategy.java b/spark/v3.2/spark/src/main/java/org/apache/iceberg/spark/actions/SparkBinPackStrategy.java index d8c1cc3610bd..aaa63c014165 100644 --- a/spark/v3.2/spark/src/main/java/org/apache/iceberg/spark/actions/SparkBinPackStrategy.java +++ b/spark/v3.2/spark/src/main/java/org/apache/iceberg/spark/actions/SparkBinPackStrategy.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.actions; import java.util.List; @@ -65,21 +64,27 @@ public Set rewriteFiles(List filesToRewrite) { SparkSession cloneSession = spark.cloneSession(); cloneSession.conf().set(SQLConf.ADAPTIVE_EXECUTION_ENABLED().key(), false); - Dataset scanDF = cloneSession.read().format("iceberg") - .option(SparkReadOptions.FILE_SCAN_TASK_SET_ID, groupID) - .option(SparkReadOptions.SPLIT_SIZE, splitSize(inputFileSize(filesToRewrite))) - .option(SparkReadOptions.FILE_OPEN_COST, "0") - .load(groupID); + Dataset scanDF = + cloneSession + .read() + .format("iceberg") + .option(SparkReadOptions.FILE_SCAN_TASK_SET_ID, groupID) + .option(SparkReadOptions.SPLIT_SIZE, splitSize(inputFileSize(filesToRewrite))) + .option(SparkReadOptions.FILE_OPEN_COST, "0") + .load(groupID); // All files within a file group are written with the same spec, so check the first boolean requiresRepartition = !filesToRewrite.get(0).spec().equals(table.spec()); // Invoke a shuffle if the partition spec of the incoming partition does not match the table - String distributionMode = requiresRepartition ? DistributionMode.RANGE.modeName() : - DistributionMode.NONE.modeName(); + String distributionMode = + requiresRepartition + ? 
DistributionMode.RANGE.modeName() + : DistributionMode.NONE.modeName(); // write the packed data into new files where each split becomes a new file - scanDF.write() + scanDF + .write() .format("iceberg") .option(SparkWriteOptions.REWRITTEN_FILE_SCAN_TASK_SET_ID, groupID) .option(SparkWriteOptions.TARGET_FILE_SIZE_BYTES, writeMaxFileSize()) diff --git a/spark/v3.2/spark/src/main/java/org/apache/iceberg/spark/actions/SparkSortStrategy.java b/spark/v3.2/spark/src/main/java/org/apache/iceberg/spark/actions/SparkSortStrategy.java index 97f46d79382f..285a46fc5431 100644 --- a/spark/v3.2/spark/src/main/java/org/apache/iceberg/spark/actions/SparkSortStrategy.java +++ b/spark/v3.2/spark/src/main/java/org/apache/iceberg/spark/actions/SparkSortStrategy.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.actions; import java.util.List; @@ -51,14 +50,13 @@ public class SparkSortStrategy extends SortStrategy { /** - * The number of shuffle partitions and consequently the number of output files - * created by the Spark Sort is based on the size of the input data files used - * in this rewrite operation. Due to compression, the disk file sizes may not - * accurately represent the size of files in the output. This parameter lets - * the user adjust the file size used for estimating actual output data size. A - * factor greater than 1.0 would generate more files than we would expect based - * on the on-disk file size. A value less than 1.0 would create fewer files than - * we would expect due to the on-disk size. + * The number of shuffle partitions and consequently the number of output files created by the + * Spark Sort is based on the size of the input data files used in this rewrite operation. Due to + * compression, the disk file sizes may not accurately represent the size of files in the output. + * This parameter lets the user adjust the file size used for estimating actual output data size. + * A factor greater than 1.0 would generate more files than we would expect based on the on-disk + * file size. A value less than 1.0 would create fewer files than we would expect due to the + * on-disk size. 
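 * For instance (illustrative numbers, assuming a 512 MB target file size): 100 GB of input data
 * files with the default factor of 1.0 yields an estimate of about 200 output files, while a
 * compression-factor of 2.0 doubles the size estimate to roughly 200 GB and therefore plans
 * about 400 shuffle partitions and output files.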
*/ public static final String COMPRESSION_FACTOR = "compression-factor"; @@ -90,12 +88,12 @@ public Set validOptions() { @Override public RewriteStrategy options(Map options) { - sizeEstimateMultiple = PropertyUtil.propertyAsDouble(options, - COMPRESSION_FACTOR, - 1.0); + sizeEstimateMultiple = PropertyUtil.propertyAsDouble(options, COMPRESSION_FACTOR, 1.0); - Preconditions.checkArgument(sizeEstimateMultiple > 0, - "Invalid compression factor: %s (not positive)", sizeEstimateMultiple); + Preconditions.checkArgument( + sizeEstimateMultiple > 0, + "Invalid compression factor: %s (not positive)", + sizeEstimateMultiple); return super.options(options); } @@ -108,7 +106,9 @@ public Set rewriteFiles(List filesToRewrite) { SortOrder[] ordering; if (requiresRepartition) { // Build in the requirement for Partition Sorting into our sort order - ordering = SparkDistributionAndOrderingUtil.convert(SortOrderUtil.buildSortOrder(table, sortOrder())); + ordering = + SparkDistributionAndOrderingUtil.convert( + SortOrderUtil.buildSortOrder(table, sortOrder())); } else { ordering = SparkDistributionAndOrderingUtil.convert(sortOrder()); } @@ -124,24 +124,30 @@ public Set rewriteFiles(List filesToRewrite) { cloneSession.conf().set(SQLConf.ADAPTIVE_EXECUTION_ENABLED().key(), false); // Reset Shuffle Partitions for our sort - long numOutputFiles = numOutputFiles((long) (inputFileSize(filesToRewrite) * sizeEstimateMultiple)); + long numOutputFiles = + numOutputFiles((long) (inputFileSize(filesToRewrite) * sizeEstimateMultiple)); cloneSession.conf().set(SQLConf.SHUFFLE_PARTITIONS().key(), Math.max(1, numOutputFiles)); - Dataset scanDF = cloneSession.read().format("iceberg") - .option(SparkReadOptions.FILE_SCAN_TASK_SET_ID, groupID) - .load(groupID); + Dataset scanDF = + cloneSession + .read() + .format("iceberg") + .option(SparkReadOptions.FILE_SCAN_TASK_SET_ID, groupID) + .load(groupID); // write the packed data into new files where each split becomes a new file SQLConf sqlConf = cloneSession.sessionState().conf(); LogicalPlan sortPlan = sortPlan(distribution, ordering, scanDF.logicalPlan(), sqlConf); Dataset sortedDf = new Dataset<>(cloneSession, sortPlan, scanDF.encoder()); - sortedDf.write() + sortedDf + .write() .format("iceberg") .option(SparkWriteOptions.REWRITTEN_FILE_SCAN_TASK_SET_ID, groupID) .option(SparkWriteOptions.TARGET_FILE_SIZE_BYTES, writeMaxFileSize()) .option(SparkWriteOptions.USE_TABLE_DISTRIBUTION_AND_ORDERING, "false") - .mode("append") // This will only write files without modifying the table, see SparkWrite.RewriteFiles + .mode("append") // This will only write files without modifying the table, see + // SparkWrite.RewriteFiles .save(groupID); return rewriteCoordinator.fetchNewDataFiles(table, groupID); @@ -156,7 +162,8 @@ protected SparkSession spark() { return this.spark; } - protected LogicalPlan sortPlan(Distribution distribution, SortOrder[] ordering, LogicalPlan plan, SQLConf conf) { + protected LogicalPlan sortPlan( + Distribution distribution, SortOrder[] ordering, LogicalPlan plan, SQLConf conf) { return DistributionAndOrderingUtils$.MODULE$.prepareQuery(distribution, ordering, plan, conf); } diff --git a/spark/v3.2/spark/src/main/java/org/apache/iceberg/spark/actions/SparkZOrderStrategy.java b/spark/v3.2/spark/src/main/java/org/apache/iceberg/spark/actions/SparkZOrderStrategy.java index 8044039a592c..2b3397c9dbca 100644 --- a/spark/v3.2/spark/src/main/java/org/apache/iceberg/spark/actions/SparkZOrderStrategy.java +++ 
b/spark/v3.2/spark/src/main/java/org/apache/iceberg/spark/actions/SparkZOrderStrategy.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.actions; import java.util.Arrays; @@ -62,22 +61,28 @@ public class SparkZOrderStrategy extends SparkSortStrategy { private static final Logger LOG = LoggerFactory.getLogger(SparkZOrderStrategy.class); private static final String Z_COLUMN = "ICEZVALUE"; - private static final Schema Z_SCHEMA = new Schema(NestedField.required(0, Z_COLUMN, Types.BinaryType.get())); - private static final org.apache.iceberg.SortOrder Z_SORT_ORDER = org.apache.iceberg.SortOrder.builderFor(Z_SCHEMA) - .sortBy(Z_COLUMN, SortDirection.ASC, NullOrder.NULLS_LAST) - .build(); + private static final Schema Z_SCHEMA = + new Schema(NestedField.required(0, Z_COLUMN, Types.BinaryType.get())); + private static final org.apache.iceberg.SortOrder Z_SORT_ORDER = + org.apache.iceberg.SortOrder.builderFor(Z_SCHEMA) + .sortBy(Z_COLUMN, SortDirection.ASC, NullOrder.NULLS_LAST) + .build(); /** - * Controls the amount of bytes interleaved in the ZOrder Algorithm. Default is all bytes being interleaved. + * Controls the amount of bytes interleaved in the ZOrder Algorithm. Default is all bytes being + * interleaved. */ private static final String MAX_OUTPUT_SIZE_KEY = "max-output-size"; + private static final int DEFAULT_MAX_OUTPUT_SIZE = Integer.MAX_VALUE; /** - * Controls the number of bytes considered from an input column of a type with variable length (String, Binary). - * Default is to use the same size as primitives {@link ZOrderByteUtils#PRIMITIVE_BUFFER_SIZE} + * Controls the number of bytes considered from an input column of a type with variable length + * (String, Binary). 
Default is to use the same size as primitives {@link + * ZOrderByteUtils#PRIMITIVE_BUFFER_SIZE} */ private static final String VAR_LENGTH_CONTRIBUTION_KEY = "var-length-contribution"; + private static final int DEFAULT_VAR_LENGTH_CONTRIBUTION = ZOrderByteUtils.PRIMITIVE_BUFFER_SIZE; private final List zOrderColNames; @@ -98,16 +103,22 @@ public Set validOptions() { public RewriteStrategy options(Map options) { super.options(options); - varLengthContribution = PropertyUtil.propertyAsInt(options, VAR_LENGTH_CONTRIBUTION_KEY, - DEFAULT_VAR_LENGTH_CONTRIBUTION); - Preconditions.checkArgument(varLengthContribution > 0, + varLengthContribution = + PropertyUtil.propertyAsInt( + options, VAR_LENGTH_CONTRIBUTION_KEY, DEFAULT_VAR_LENGTH_CONTRIBUTION); + Preconditions.checkArgument( + varLengthContribution > 0, "Cannot use less than 1 byte for variable length types with zOrder, %s was set to %s", - VAR_LENGTH_CONTRIBUTION_KEY, varLengthContribution); + VAR_LENGTH_CONTRIBUTION_KEY, + varLengthContribution); - maxOutputSize = PropertyUtil.propertyAsInt(options, MAX_OUTPUT_SIZE_KEY, DEFAULT_MAX_OUTPUT_SIZE); - Preconditions.checkArgument(maxOutputSize > 0, + maxOutputSize = + PropertyUtil.propertyAsInt(options, MAX_OUTPUT_SIZE_KEY, DEFAULT_MAX_OUTPUT_SIZE); + Preconditions.checkArgument( + maxOutputSize > 0, "Cannot have the interleaved ZOrder value use less than 1 byte, %s was set to %s", - MAX_OUTPUT_SIZE_KEY, maxOutputSize); + MAX_OUTPUT_SIZE_KEY, + maxOutputSize); return this; } @@ -115,21 +126,25 @@ public RewriteStrategy options(Map options) { public SparkZOrderStrategy(Table table, SparkSession spark, List zOrderColNames) { super(table, spark); - Preconditions.checkArgument(zOrderColNames != null && !zOrderColNames.isEmpty(), + Preconditions.checkArgument( + zOrderColNames != null && !zOrderColNames.isEmpty(), "Cannot ZOrder when no columns are specified"); - Stream identityPartitionColumns = table.spec().fields().stream() - .filter(f -> f.transform().isIdentity()) - .map(PartitionField::name); - List partZOrderCols = identityPartitionColumns - .filter(zOrderColNames::contains) - .collect(Collectors.toList()); + Stream identityPartitionColumns = + table.spec().fields().stream() + .filter(f -> f.transform().isIdentity()) + .map(PartitionField::name); + List partZOrderCols = + identityPartitionColumns.filter(zOrderColNames::contains).collect(Collectors.toList()); if (!partZOrderCols.isEmpty()) { - LOG.warn("Cannot ZOrder on an Identity partition column as these values are constant within a partition " + - "and will be removed from the ZOrder expression: {}", partZOrderCols); + LOG.warn( + "Cannot ZOrder on an Identity partition column as these values are constant within a partition " + + "and will be removed from the ZOrder expression: {}", + partZOrderCols); zOrderColNames.removeAll(partZOrderCols); - Preconditions.checkArgument(!zOrderColNames.isEmpty(), + Preconditions.checkArgument( + !zOrderColNames.isEmpty(), "Cannot perform ZOrdering, all columns provided were identity partition columns and cannot be used."); } @@ -141,13 +156,16 @@ public SparkZOrderStrategy(Table table, SparkSession spark, List zOrderC private void validateColumnsExistence(Table table, SparkSession spark, List colNames) { boolean caseSensitive = Boolean.parseBoolean(spark.conf().get("spark.sql.caseSensitive")); Schema schema = table.schema(); - colNames.forEach(col -> { - NestedField nestedField = caseSensitive ? 
schema.findField(col) : schema.caseInsensitiveFindField(col); - if (nestedField == null) { - throw new IllegalArgumentException( - String.format("Cannot find column '%s' in table schema: %s", col, schema.asStruct())); - } - }); + colNames.forEach( + col -> { + NestedField nestedField = + caseSensitive ? schema.findField(col) : schema.caseInsensitiveFindField(col); + if (nestedField == null) { + throw new IllegalArgumentException( + String.format( + "Cannot find column '%s' in table schema: %s", col, schema.asStruct())); + } + }); } @Override @@ -163,14 +181,17 @@ protected void validateOptions() { @Override public Set rewriteFiles(List filesToRewrite) { - SparkZOrderUDF zOrderUDF = new SparkZOrderUDF(zOrderColNames.size(), varLengthContribution, maxOutputSize); + SparkZOrderUDF zOrderUDF = + new SparkZOrderUDF(zOrderColNames.size(), varLengthContribution, maxOutputSize); String groupID = UUID.randomUUID().toString(); boolean requiresRepartition = !filesToRewrite.get(0).spec().equals(table().spec()); SortOrder[] ordering; if (requiresRepartition) { - ordering = SparkDistributionAndOrderingUtil.convert(SortOrderUtil.buildSortOrder(table(), sortOrder())); + ordering = + SparkDistributionAndOrderingUtil.convert( + SortOrderUtil.buildSortOrder(table(), sortOrder())); } else { ordering = SparkDistributionAndOrderingUtil.convert(sortOrder()); } @@ -186,24 +207,31 @@ public Set rewriteFiles(List filesToRewrite) { cloneSession.conf().set(SQLConf.ADAPTIVE_EXECUTION_ENABLED().key(), false); // Reset Shuffle Partitions for our sort - long numOutputFiles = numOutputFiles((long) (inputFileSize(filesToRewrite) * sizeEstimateMultiple())); + long numOutputFiles = + numOutputFiles((long) (inputFileSize(filesToRewrite) * sizeEstimateMultiple())); cloneSession.conf().set(SQLConf.SHUFFLE_PARTITIONS().key(), Math.max(1, numOutputFiles)); - Dataset scanDF = cloneSession.read().format("iceberg") - .option(SparkReadOptions.FILE_SCAN_TASK_SET_ID, groupID) - .load(groupID); - - Column[] originalColumns = Arrays.stream(scanDF.schema().names()) - .map(n -> functions.col(n)) - .toArray(Column[]::new); - - List zOrderColumns = zOrderColNames.stream() - .map(scanDF.schema()::apply) - .collect(Collectors.toList()); - - Column zvalueArray = functions.array(zOrderColumns.stream().map(colStruct -> - zOrderUDF.sortedLexicographically(functions.col(colStruct.name()), colStruct.dataType()) - ).toArray(Column[]::new)); + Dataset scanDF = + cloneSession + .read() + .format("iceberg") + .option(SparkReadOptions.FILE_SCAN_TASK_SET_ID, groupID) + .load(groupID); + + Column[] originalColumns = + Arrays.stream(scanDF.schema().names()).map(n -> functions.col(n)).toArray(Column[]::new); + + List zOrderColumns = + zOrderColNames.stream().map(scanDF.schema()::apply).collect(Collectors.toList()); + + Column zvalueArray = + functions.array( + zOrderColumns.stream() + .map( + colStruct -> + zOrderUDF.sortedLexicographically( + functions.col(colStruct.name()), colStruct.dataType())) + .toArray(Column[]::new)); Dataset zvalueDF = scanDF.withColumn(Z_COLUMN, zOrderUDF.interleaveBytes(zvalueArray)); diff --git a/spark/v3.2/spark/src/main/java/org/apache/iceberg/spark/actions/SparkZOrderUDF.java b/spark/v3.2/spark/src/main/java/org/apache/iceberg/spark/actions/SparkZOrderUDF.java index eea3689211e2..db359fdd62fc 100644 --- a/spark/v3.2/spark/src/main/java/org/apache/iceberg/spark/actions/SparkZOrderUDF.java +++ b/spark/v3.2/spark/src/main/java/org/apache/iceberg/spark/actions/SparkZOrderUDF.java @@ -16,7 +16,6 @@ * specific language governing 
permissions and limitations * under the License. */ - package org.apache.iceberg.spark.actions; import java.io.IOException; @@ -49,10 +48,11 @@ class SparkZOrderUDF implements Serializable { private static final byte[] PRIMITIVE_EMPTY = new byte[ZOrderByteUtils.PRIMITIVE_BUFFER_SIZE]; /** - * Every Spark task runs iteratively on a rows in a single thread so ThreadLocal should protect from - * concurrent access to any of these structures. + * Every Spark task runs iteratively on a rows in a single thread so ThreadLocal should protect + * from concurrent access to any of these structures. */ private transient ThreadLocal outputBuffer; + private transient ThreadLocal inputHolder; private transient ThreadLocal inputBuffers; private transient ThreadLocal encoder; @@ -94,13 +94,19 @@ byte[] interleaveBits(Seq scalaBinary) { private UserDefinedFunction tinyToOrderedBytesUDF() { int position = inputCol; - UserDefinedFunction udf = functions.udf((Byte value) -> { - if (value == null) { - return PRIMITIVE_EMPTY; - } - return ZOrderByteUtils.tinyintToOrderedBytes(value, inputBuffer(position, ZOrderByteUtils.PRIMITIVE_BUFFER_SIZE)) - .array(); - }, DataTypes.BinaryType).withName("TINY_ORDERED_BYTES"); + UserDefinedFunction udf = + functions + .udf( + (Byte value) -> { + if (value == null) { + return PRIMITIVE_EMPTY; + } + return ZOrderByteUtils.tinyintToOrderedBytes( + value, inputBuffer(position, ZOrderByteUtils.PRIMITIVE_BUFFER_SIZE)) + .array(); + }, + DataTypes.BinaryType) + .withName("TINY_ORDERED_BYTES"); this.inputCol++; increaseOutputSize(ZOrderByteUtils.PRIMITIVE_BUFFER_SIZE); @@ -110,13 +116,19 @@ private UserDefinedFunction tinyToOrderedBytesUDF() { private UserDefinedFunction shortToOrderedBytesUDF() { int position = inputCol; - UserDefinedFunction udf = functions.udf((Short value) -> { - if (value == null) { - return PRIMITIVE_EMPTY; - } - return ZOrderByteUtils.shortToOrderedBytes(value, inputBuffer(position, ZOrderByteUtils.PRIMITIVE_BUFFER_SIZE)) - .array(); - }, DataTypes.BinaryType).withName("SHORT_ORDERED_BYTES"); + UserDefinedFunction udf = + functions + .udf( + (Short value) -> { + if (value == null) { + return PRIMITIVE_EMPTY; + } + return ZOrderByteUtils.shortToOrderedBytes( + value, inputBuffer(position, ZOrderByteUtils.PRIMITIVE_BUFFER_SIZE)) + .array(); + }, + DataTypes.BinaryType) + .withName("SHORT_ORDERED_BYTES"); this.inputCol++; increaseOutputSize(ZOrderByteUtils.PRIMITIVE_BUFFER_SIZE); @@ -126,13 +138,19 @@ private UserDefinedFunction shortToOrderedBytesUDF() { private UserDefinedFunction intToOrderedBytesUDF() { int position = inputCol; - UserDefinedFunction udf = functions.udf((Integer value) -> { - if (value == null) { - return PRIMITIVE_EMPTY; - } - return ZOrderByteUtils.intToOrderedBytes(value, inputBuffer(position, ZOrderByteUtils.PRIMITIVE_BUFFER_SIZE)) - .array(); - }, DataTypes.BinaryType).withName("INT_ORDERED_BYTES"); + UserDefinedFunction udf = + functions + .udf( + (Integer value) -> { + if (value == null) { + return PRIMITIVE_EMPTY; + } + return ZOrderByteUtils.intToOrderedBytes( + value, inputBuffer(position, ZOrderByteUtils.PRIMITIVE_BUFFER_SIZE)) + .array(); + }, + DataTypes.BinaryType) + .withName("INT_ORDERED_BYTES"); this.inputCol++; increaseOutputSize(ZOrderByteUtils.PRIMITIVE_BUFFER_SIZE); @@ -142,13 +160,19 @@ private UserDefinedFunction intToOrderedBytesUDF() { private UserDefinedFunction longToOrderedBytesUDF() { int position = inputCol; - UserDefinedFunction udf = functions.udf((Long value) -> { - if (value == null) { - return PRIMITIVE_EMPTY; 
- } - return ZOrderByteUtils.longToOrderedBytes(value, inputBuffer(position, ZOrderByteUtils.PRIMITIVE_BUFFER_SIZE)) - .array(); - }, DataTypes.BinaryType).withName("LONG_ORDERED_BYTES"); + UserDefinedFunction udf = + functions + .udf( + (Long value) -> { + if (value == null) { + return PRIMITIVE_EMPTY; + } + return ZOrderByteUtils.longToOrderedBytes( + value, inputBuffer(position, ZOrderByteUtils.PRIMITIVE_BUFFER_SIZE)) + .array(); + }, + DataTypes.BinaryType) + .withName("LONG_ORDERED_BYTES"); this.inputCol++; increaseOutputSize(ZOrderByteUtils.PRIMITIVE_BUFFER_SIZE); @@ -158,13 +182,19 @@ private UserDefinedFunction longToOrderedBytesUDF() { private UserDefinedFunction floatToOrderedBytesUDF() { int position = inputCol; - UserDefinedFunction udf = functions.udf((Float value) -> { - if (value == null) { - return PRIMITIVE_EMPTY; - } - return ZOrderByteUtils.floatToOrderedBytes(value, inputBuffer(position, ZOrderByteUtils.PRIMITIVE_BUFFER_SIZE)) - .array(); - }, DataTypes.BinaryType).withName("FLOAT_ORDERED_BYTES"); + UserDefinedFunction udf = + functions + .udf( + (Float value) -> { + if (value == null) { + return PRIMITIVE_EMPTY; + } + return ZOrderByteUtils.floatToOrderedBytes( + value, inputBuffer(position, ZOrderByteUtils.PRIMITIVE_BUFFER_SIZE)) + .array(); + }, + DataTypes.BinaryType) + .withName("FLOAT_ORDERED_BYTES"); this.inputCol++; increaseOutputSize(ZOrderByteUtils.PRIMITIVE_BUFFER_SIZE); @@ -174,13 +204,19 @@ private UserDefinedFunction floatToOrderedBytesUDF() { private UserDefinedFunction doubleToOrderedBytesUDF() { int position = inputCol; - UserDefinedFunction udf = functions.udf((Double value) -> { - if (value == null) { - return PRIMITIVE_EMPTY; - } - return ZOrderByteUtils.doubleToOrderedBytes(value, inputBuffer(position, ZOrderByteUtils.PRIMITIVE_BUFFER_SIZE)) - .array(); - }, DataTypes.BinaryType).withName("DOUBLE_ORDERED_BYTES"); + UserDefinedFunction udf = + functions + .udf( + (Double value) -> { + if (value == null) { + return PRIMITIVE_EMPTY; + } + return ZOrderByteUtils.doubleToOrderedBytes( + value, inputBuffer(position, ZOrderByteUtils.PRIMITIVE_BUFFER_SIZE)) + .array(); + }, + DataTypes.BinaryType) + .withName("DOUBLE_ORDERED_BYTES"); this.inputCol++; increaseOutputSize(ZOrderByteUtils.PRIMITIVE_BUFFER_SIZE); @@ -190,11 +226,16 @@ private UserDefinedFunction doubleToOrderedBytesUDF() { private UserDefinedFunction booleanToOrderedBytesUDF() { int position = inputCol; - UserDefinedFunction udf = functions.udf((Boolean value) -> { - ByteBuffer buffer = inputBuffer(position, ZOrderByteUtils.PRIMITIVE_BUFFER_SIZE); - buffer.put(0, (byte) (value ? -127 : 0)); - return buffer.array(); - }, DataTypes.BinaryType).withName("BOOLEAN-LEXICAL-BYTES"); + UserDefinedFunction udf = + functions + .udf( + (Boolean value) -> { + ByteBuffer buffer = inputBuffer(position, ZOrderByteUtils.PRIMITIVE_BUFFER_SIZE); + buffer.put(0, (byte) (value ? 
-127 : 0)); + return buffer.array(); + }, + DataTypes.BinaryType) + .withName("BOOLEAN-LEXICAL-BYTES"); this.inputCol++; increaseOutputSize(ZOrderByteUtils.PRIMITIVE_BUFFER_SIZE); @@ -203,13 +244,15 @@ private UserDefinedFunction booleanToOrderedBytesUDF() { private UserDefinedFunction stringToOrderedBytesUDF() { int position = inputCol; - UserDefinedFunction udf = functions.udf((String value) -> - ZOrderByteUtils.stringToOrderedBytes( - value, - varTypeSize, - inputBuffer(position, varTypeSize), - encoder.get()).array(), DataTypes.BinaryType) - .withName("STRING-LEXICAL-BYTES"); + UserDefinedFunction udf = + functions + .udf( + (String value) -> + ZOrderByteUtils.stringToOrderedBytes( + value, varTypeSize, inputBuffer(position, varTypeSize), encoder.get()) + .array(), + DataTypes.BinaryType) + .withName("STRING-LEXICAL-BYTES"); this.inputCol++; increaseOutputSize(varTypeSize); @@ -219,10 +262,15 @@ private UserDefinedFunction stringToOrderedBytesUDF() { private UserDefinedFunction bytesTruncateUDF() { int position = inputCol; - UserDefinedFunction udf = functions.udf((byte[] value) -> - ZOrderByteUtils.byteTruncateOrFill(value, varTypeSize, inputBuffer(position, varTypeSize)).array(), - DataTypes.BinaryType) - .withName("BYTE-TRUNCATE"); + UserDefinedFunction udf = + functions + .udf( + (byte[] value) -> + ZOrderByteUtils.byteTruncateOrFill( + value, varTypeSize, inputBuffer(position, varTypeSize)) + .array(), + DataTypes.BinaryType) + .withName("BYTE-TRUNCATE"); this.inputCol++; increaseOutputSize(varTypeSize); @@ -231,7 +279,8 @@ private UserDefinedFunction bytesTruncateUDF() { } private final UserDefinedFunction interleaveUDF = - functions.udf((Seq arrayBinary) -> interleaveBits(arrayBinary), DataTypes.BinaryType) + functions + .udf((Seq arrayBinary) -> interleaveBits(arrayBinary), DataTypes.BinaryType) .withName("INTERLEAVE_BYTES"); Column interleaveBytes(Column arrayBinary) { @@ -264,7 +313,9 @@ Column sortedLexicographically(Column column, DataType type) { return longToOrderedBytesUDF().apply(column.cast(DataTypes.LongType)); } else { throw new IllegalArgumentException( - String.format("Cannot use column %s of type %s in ZOrdering, the type is unsupported", column, type)); + String.format( + "Cannot use column %s of type %s in ZOrdering, the type is unsupported", + column, type)); } } diff --git a/spark/v3.2/spark/src/main/java/org/apache/iceberg/spark/data/AvroWithSparkSchemaVisitor.java b/spark/v3.2/spark/src/main/java/org/apache/iceberg/spark/data/AvroWithSparkSchemaVisitor.java index 40ed05b4ce65..74454fc1e466 100644 --- a/spark/v3.2/spark/src/main/java/org/apache/iceberg/spark/data/AvroWithSparkSchemaVisitor.java +++ b/spark/v3.2/spark/src/main/java/org/apache/iceberg/spark/data/AvroWithSparkSchemaVisitor.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.spark.data; import org.apache.iceberg.avro.AvroWithPartnerByStructureVisitor; @@ -30,7 +29,8 @@ import org.apache.spark.sql.types.StructField; import org.apache.spark.sql.types.StructType; -public abstract class AvroWithSparkSchemaVisitor extends AvroWithPartnerByStructureVisitor { +public abstract class AvroWithSparkSchemaVisitor + extends AvroWithPartnerByStructureVisitor { @Override protected boolean isStringType(DataType dataType) { @@ -44,7 +44,8 @@ protected boolean isMapType(DataType dataType) { @Override protected DataType arrayElementType(DataType arrayType) { - Preconditions.checkArgument(arrayType instanceof ArrayType, "Invalid array: %s is not an array", arrayType); + Preconditions.checkArgument( + arrayType instanceof ArrayType, "Invalid array: %s is not an array", arrayType); return ((ArrayType) arrayType).elementType(); } @@ -62,7 +63,8 @@ protected DataType mapValueType(DataType mapType) { @Override protected Pair fieldNameAndType(DataType structType, int pos) { - Preconditions.checkArgument(structType instanceof StructType, "Invalid struct: %s is not a struct", structType); + Preconditions.checkArgument( + structType instanceof StructType, "Invalid struct: %s is not a struct", structType); StructField field = ((StructType) structType).apply(pos); return Pair.of(field.name(), field.dataType()); } diff --git a/spark/v3.2/spark/src/main/java/org/apache/iceberg/spark/data/ParquetWithSparkSchemaVisitor.java b/spark/v3.2/spark/src/main/java/org/apache/iceberg/spark/data/ParquetWithSparkSchemaVisitor.java index 924cc3e2325a..d74a76f94e87 100644 --- a/spark/v3.2/spark/src/main/java/org/apache/iceberg/spark/data/ParquetWithSparkSchemaVisitor.java +++ b/spark/v3.2/spark/src/main/java/org/apache/iceberg/spark/data/ParquetWithSparkSchemaVisitor.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.spark.data; import java.util.Deque; @@ -48,9 +47,11 @@ public class ParquetWithSparkSchemaVisitor { public static T visit(DataType sType, Type type, ParquetWithSparkSchemaVisitor visitor) { Preconditions.checkArgument(sType != null, "Invalid DataType: null"); if (type instanceof MessageType) { - Preconditions.checkArgument(sType instanceof StructType, "Invalid struct: %s is not a struct", sType); + Preconditions.checkArgument( + sType instanceof StructType, "Invalid struct: %s is not a struct", sType); StructType struct = (StructType) sType; - return visitor.message(struct, (MessageType) type, visitFields(struct, type.asGroupType(), visitor)); + return visitor.message( + struct, (MessageType) type, visitFields(struct, type.asGroupType(), visitor)); } else if (type.isPrimitive()) { return visitor.primitive(sType, type.asPrimitiveType()); @@ -62,21 +63,30 @@ public static T visit(DataType sType, Type type, ParquetWithSparkSchemaVisit if (annotation != null) { switch (annotation) { case LIST: - Preconditions.checkArgument(!group.isRepetition(Repetition.REPEATED), - "Invalid list: top-level group is repeated: %s", group); - Preconditions.checkArgument(group.getFieldCount() == 1, - "Invalid list: does not contain single repeated field: %s", group); + Preconditions.checkArgument( + !group.isRepetition(Repetition.REPEATED), + "Invalid list: top-level group is repeated: %s", + group); + Preconditions.checkArgument( + group.getFieldCount() == 1, + "Invalid list: does not contain single repeated field: %s", + group); GroupType repeatedElement = group.getFields().get(0).asGroupType(); - Preconditions.checkArgument(repeatedElement.isRepetition(Repetition.REPEATED), + Preconditions.checkArgument( + repeatedElement.isRepetition(Repetition.REPEATED), "Invalid list: inner group is not repeated"); - Preconditions.checkArgument(repeatedElement.getFieldCount() <= 1, - "Invalid list: repeated group is not a single field: %s", group); + Preconditions.checkArgument( + repeatedElement.getFieldCount() <= 1, + "Invalid list: repeated group is not a single field: %s", + group); - Preconditions.checkArgument(sType instanceof ArrayType, "Invalid list: %s is not an array", sType); + Preconditions.checkArgument( + sType instanceof ArrayType, "Invalid list: %s is not an array", sType); ArrayType array = (ArrayType) sType; - StructField element = new StructField( - "element", array.elementType(), array.containsNull(), Metadata.empty()); + StructField element = + new StructField( + "element", array.elementType(), array.containsNull(), Metadata.empty()); visitor.fieldNames.push(repeatedElement.getName()); try { @@ -92,22 +102,30 @@ public static T visit(DataType sType, Type type, ParquetWithSparkSchemaVisit } case MAP: - Preconditions.checkArgument(!group.isRepetition(Repetition.REPEATED), - "Invalid map: top-level group is repeated: %s", group); - Preconditions.checkArgument(group.getFieldCount() == 1, - "Invalid map: does not contain single repeated field: %s", group); + Preconditions.checkArgument( + !group.isRepetition(Repetition.REPEATED), + "Invalid map: top-level group is repeated: %s", + group); + Preconditions.checkArgument( + group.getFieldCount() == 1, + "Invalid map: does not contain single repeated field: %s", + group); GroupType repeatedKeyValue = group.getType(0).asGroupType(); - Preconditions.checkArgument(repeatedKeyValue.isRepetition(Repetition.REPEATED), + Preconditions.checkArgument( + repeatedKeyValue.isRepetition(Repetition.REPEATED), "Invalid map: inner group is not 
repeated"); - Preconditions.checkArgument(repeatedKeyValue.getFieldCount() <= 2, + Preconditions.checkArgument( + repeatedKeyValue.getFieldCount() <= 2, "Invalid map: repeated group does not have 2 fields"); - Preconditions.checkArgument(sType instanceof MapType, "Invalid map: %s is not a map", sType); + Preconditions.checkArgument( + sType instanceof MapType, "Invalid map: %s is not a map", sType); MapType map = (MapType) sType; StructField keyField = new StructField("key", map.keyType(), false, Metadata.empty()); - StructField valueField = new StructField( - "value", map.valueType(), map.valueContainsNull(), Metadata.empty()); + StructField valueField = + new StructField( + "value", map.valueType(), map.valueContainsNull(), Metadata.empty()); visitor.fieldNames.push(repeatedKeyValue.getName()); try { @@ -144,13 +162,15 @@ public static T visit(DataType sType, Type type, ParquetWithSparkSchemaVisit } } - Preconditions.checkArgument(sType instanceof StructType, "Invalid struct: %s is not a struct", sType); + Preconditions.checkArgument( + sType instanceof StructType, "Invalid struct: %s is not a struct", sType); StructType struct = (StructType) sType; return visitor.struct(struct, group, visitFields(struct, group, visitor)); } } - private static T visitField(StructField sField, Type field, ParquetWithSparkSchemaVisitor visitor) { + private static T visitField( + StructField sField, Type field, ParquetWithSparkSchemaVisitor visitor) { visitor.fieldNames.push(field.getName()); try { return visit(sField.dataType(), field, visitor); @@ -159,17 +179,20 @@ private static T visitField(StructField sField, Type field, ParquetWithSpark } } - private static List visitFields(StructType struct, GroupType group, - ParquetWithSparkSchemaVisitor visitor) { + private static List visitFields( + StructType struct, GroupType group, ParquetWithSparkSchemaVisitor visitor) { StructField[] sFields = struct.fields(); - Preconditions.checkArgument(sFields.length == group.getFieldCount(), - "Structs do not match: %s and %s", struct, group); + Preconditions.checkArgument( + sFields.length == group.getFieldCount(), "Structs do not match: %s and %s", struct, group); List results = Lists.newArrayListWithExpectedSize(group.getFieldCount()); for (int i = 0; i < sFields.length; i += 1) { Type field = group.getFields().get(i); StructField sField = sFields[i]; - Preconditions.checkArgument(field.getName().equals(AvroSchemaUtil.makeCompatibleName(sField.name())), - "Structs do not match: field %s != %s", field.getName(), sField.name()); + Preconditions.checkArgument( + field.getName().equals(AvroSchemaUtil.makeCompatibleName(sField.name())), + "Structs do not match: field %s != %s", + field.getName(), + sField.name()); results.add(visitField(sField, field, visitor)); } diff --git a/spark/v3.2/spark/src/main/java/org/apache/iceberg/spark/data/SparkAvroReader.java b/spark/v3.2/spark/src/main/java/org/apache/iceberg/spark/data/SparkAvroReader.java index c693e2e2c057..4622d2928ac4 100644 --- a/spark/v3.2/spark/src/main/java/org/apache/iceberg/spark/data/SparkAvroReader.java +++ b/spark/v3.2/spark/src/main/java/org/apache/iceberg/spark/data/SparkAvroReader.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.spark.data; import java.io.IOException; @@ -38,7 +37,6 @@ import org.apache.iceberg.types.Types; import org.apache.spark.sql.catalyst.InternalRow; - public class SparkAvroReader implements DatumReader, SupportsRowPosition { private final Schema readSchema; @@ -50,10 +48,12 @@ public SparkAvroReader(org.apache.iceberg.Schema expectedSchema, Schema readSche } @SuppressWarnings("unchecked") - public SparkAvroReader(org.apache.iceberg.Schema expectedSchema, Schema readSchema, Map constants) { + public SparkAvroReader( + org.apache.iceberg.Schema expectedSchema, Schema readSchema, Map constants) { this.readSchema = readSchema; - this.reader = (ValueReader) AvroSchemaWithTypeVisitor - .visit(expectedSchema, readSchema, new ReadBuilder(constants)); + this.reader = + (ValueReader) + AvroSchemaWithTypeVisitor.visit(expectedSchema, readSchema, new ReadBuilder(constants)); } @Override @@ -81,8 +81,8 @@ private ReadBuilder(Map idToConstant) { } @Override - public ValueReader record(Types.StructType expected, Schema record, List names, - List> fields) { + public ValueReader record( + Types.StructType expected, Schema record, List names, List> fields) { return SparkValueReaders.struct(fields, expected, idToConstant); } @@ -92,13 +92,14 @@ public ValueReader union(Type expected, Schema union, List> op } @Override - public ValueReader array(Types.ListType expected, Schema array, ValueReader elementReader) { + public ValueReader array( + Types.ListType expected, Schema array, ValueReader elementReader) { return SparkValueReaders.array(elementReader); } @Override - public ValueReader map(Types.MapType expected, Schema map, - ValueReader keyReader, ValueReader valueReader) { + public ValueReader map( + Types.MapType expected, Schema map, ValueReader keyReader, ValueReader valueReader) { return SparkValueReaders.arrayMap(keyReader, valueReader); } diff --git a/spark/v3.2/spark/src/main/java/org/apache/iceberg/spark/data/SparkAvroWriter.java b/spark/v3.2/spark/src/main/java/org/apache/iceberg/spark/data/SparkAvroWriter.java index 7582125128a7..15465568c231 100644 --- a/spark/v3.2/spark/src/main/java/org/apache/iceberg/spark/data/SparkAvroWriter.java +++ b/spark/v3.2/spark/src/main/java/org/apache/iceberg/spark/data/SparkAvroWriter.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.spark.data; import java.io.IOException; @@ -50,8 +49,9 @@ public SparkAvroWriter(StructType dsSchema) { @Override @SuppressWarnings("unchecked") public void setSchema(Schema schema) { - this.writer = (ValueWriter) AvroWithSparkSchemaVisitor - .visit(dsSchema, schema, new WriteBuilder()); + this.writer = + (ValueWriter) + AvroWithSparkSchemaVisitor.visit(dsSchema, schema, new WriteBuilder()); } @Override @@ -66,17 +66,23 @@ public Stream metrics() { private static class WriteBuilder extends AvroWithSparkSchemaVisitor> { @Override - public ValueWriter record(DataType struct, Schema record, List names, List> fields) { - return SparkValueWriters.struct(fields, IntStream.range(0, names.size()) - .mapToObj(i -> fieldNameAndType(struct, i).second()).collect(Collectors.toList())); + public ValueWriter record( + DataType struct, Schema record, List names, List> fields) { + return SparkValueWriters.struct( + fields, + IntStream.range(0, names.size()) + .mapToObj(i -> fieldNameAndType(struct, i).second()) + .collect(Collectors.toList())); } @Override public ValueWriter union(DataType type, Schema union, List> options) { - Preconditions.checkArgument(options.contains(ValueWriters.nulls()), - "Cannot create writer for non-option union: %s", union); - Preconditions.checkArgument(options.size() == 2, - "Cannot create writer for non-option union: %s", union); + Preconditions.checkArgument( + options.contains(ValueWriters.nulls()), + "Cannot create writer for non-option union: %s", + union); + Preconditions.checkArgument( + options.size() == 2, "Cannot create writer for non-option union: %s", union); if (union.getTypes().get(0).getType() == Schema.Type.NULL) { return ValueWriters.option(0, options.get(1)); } else { @@ -91,12 +97,15 @@ public ValueWriter array(DataType sArray, Schema array, ValueWriter elemen @Override public ValueWriter map(DataType sMap, Schema map, ValueWriter valueReader) { - return SparkValueWriters.map(SparkValueWriters.strings(), mapKeyType(sMap), valueReader, mapValueType(sMap)); + return SparkValueWriters.map( + SparkValueWriters.strings(), mapKeyType(sMap), valueReader, mapValueType(sMap)); } @Override - public ValueWriter map(DataType sMap, Schema map, ValueWriter keyWriter, ValueWriter valueWriter) { - return SparkValueWriters.arrayMap(keyWriter, mapKeyType(sMap), valueWriter, mapValueType(sMap)); + public ValueWriter map( + DataType sMap, Schema map, ValueWriter keyWriter, ValueWriter valueWriter) { + return SparkValueWriters.arrayMap( + keyWriter, mapKeyType(sMap), valueWriter, mapValueType(sMap)); } @Override diff --git a/spark/v3.2/spark/src/main/java/org/apache/iceberg/spark/data/SparkOrcReader.java b/spark/v3.2/spark/src/main/java/org/apache/iceberg/spark/data/SparkOrcReader.java index 4ed6420a9aa4..78db137054bc 100644 --- a/spark/v3.2/spark/src/main/java/org/apache/iceberg/spark/data/SparkOrcReader.java +++ b/spark/v3.2/spark/src/main/java/org/apache/iceberg/spark/data/SparkOrcReader.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.data; import java.util.List; @@ -34,10 +33,9 @@ import org.apache.spark.sql.catalyst.InternalRow; /** - * Converts the OrcIterator, which returns ORC's VectorizedRowBatch to a - * set of Spark's UnsafeRows. + * Converts the OrcIterator, which returns ORC's VectorizedRowBatch to a set of Spark's UnsafeRows. * - * It minimizes allocations by reusing most of the objects in the implementation. + *
<p>
    It minimizes allocations by reusing most of the objects in the implementation. */ public class SparkOrcReader implements OrcRowReader { private final OrcValueReader reader; @@ -48,8 +46,12 @@ public SparkOrcReader(org.apache.iceberg.Schema expectedSchema, TypeDescription @SuppressWarnings("unchecked") public SparkOrcReader( - org.apache.iceberg.Schema expectedSchema, TypeDescription readOrcSchema, Map idToConstant) { - this.reader = OrcSchemaWithTypeVisitor.visit(expectedSchema, readOrcSchema, new ReadBuilder(idToConstant)); + org.apache.iceberg.Schema expectedSchema, + TypeDescription readOrcSchema, + Map idToConstant) { + this.reader = + OrcSchemaWithTypeVisitor.visit( + expectedSchema, readOrcSchema, new ReadBuilder(idToConstant)); } @Override @@ -71,18 +73,25 @@ private ReadBuilder(Map idToConstant) { @Override public OrcValueReader record( - Types.StructType expected, TypeDescription record, List names, List> fields) { + Types.StructType expected, + TypeDescription record, + List names, + List> fields) { return SparkOrcValueReaders.struct(fields, expected, idToConstant); } @Override - public OrcValueReader list(Types.ListType iList, TypeDescription array, OrcValueReader elementReader) { + public OrcValueReader list( + Types.ListType iList, TypeDescription array, OrcValueReader elementReader) { return SparkOrcValueReaders.array(elementReader); } @Override public OrcValueReader map( - Types.MapType iMap, TypeDescription map, OrcValueReader keyReader, OrcValueReader valueReader) { + Types.MapType iMap, + TypeDescription map, + OrcValueReader keyReader, + OrcValueReader valueReader) { return SparkOrcValueReaders.map(keyReader, valueReader); } diff --git a/spark/v3.2/spark/src/main/java/org/apache/iceberg/spark/data/SparkOrcValueReaders.java b/spark/v3.2/spark/src/main/java/org/apache/iceberg/spark/data/SparkOrcValueReaders.java index f35ab7a17c63..9e9b3e53bbcc 100644 --- a/spark/v3.2/spark/src/main/java/org/apache/iceberg/spark/data/SparkOrcValueReaders.java +++ b/spark/v3.2/spark/src/main/java/org/apache/iceberg/spark/data/SparkOrcValueReaders.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.spark.data; import java.math.BigDecimal; @@ -44,8 +43,7 @@ import org.apache.spark.unsafe.types.UTF8String; public class SparkOrcValueReaders { - private SparkOrcValueReaders() { - } + private SparkOrcValueReaders() {} public static OrcValueReader utf8String() { return StringReader.INSTANCE; @@ -125,8 +123,7 @@ public MapData nonNullRead(ColumnVector vector, int row) { } return new ArrayBasedMapData( - new GenericArrayData(keys.toArray()), - new GenericArrayData(values.toArray())); + new GenericArrayData(keys.toArray()), new GenericArrayData(values.toArray())); } @Override @@ -139,7 +136,8 @@ public void setBatchContext(long batchOffsetInFile) { static class StructReader extends OrcValueReaders.StructReader { private final int numFields; - protected StructReader(List> readers, Types.StructType struct, Map idToConstant) { + protected StructReader( + List> readers, Types.StructType struct, Map idToConstant) { super(readers, struct, idToConstant); this.numFields = struct.fields().size(); } @@ -162,21 +160,20 @@ protected void set(InternalRow struct, int pos, Object value) { private static class StringReader implements OrcValueReader { private static final StringReader INSTANCE = new StringReader(); - private StringReader() { - } + private StringReader() {} @Override public UTF8String nonNullRead(ColumnVector vector, int row) { BytesColumnVector bytesVector = (BytesColumnVector) vector; - return UTF8String.fromBytes(bytesVector.vector[row], bytesVector.start[row], bytesVector.length[row]); + return UTF8String.fromBytes( + bytesVector.vector[row], bytesVector.start[row], bytesVector.length[row]); } } private static class TimestampTzReader implements OrcValueReader { private static final TimestampTzReader INSTANCE = new TimestampTzReader(); - private TimestampTzReader() { - } + private TimestampTzReader() {} @Override public Long nonNullRead(ColumnVector vector, int row) { @@ -198,12 +195,20 @@ private static class Decimal18Reader implements OrcValueReader { public Decimal nonNullRead(ColumnVector vector, int row) { HiveDecimalWritable value = ((DecimalColumnVector) vector).vector[row]; - // The scale of decimal read from hive ORC file may be not equals to the expected scale. For data type - // decimal(10,3) and the value 10.100, the hive ORC writer will remove its trailing zero and store it - // as 101*10^(-1), its scale will adjust from 3 to 1. So here we could not assert that value.scale() == scale. - // we also need to convert the hive orc decimal to a decimal with expected precision and scale. - Preconditions.checkArgument(value.precision() <= precision, - "Cannot read value as decimal(%s,%s), too large: %s", precision, scale, value); + // The scale of decimal read from hive ORC file may be not equals to the expected scale. For + // data type + // decimal(10,3) and the value 10.100, the hive ORC writer will remove its trailing zero and + // store it + // as 101*10^(-1), its scale will adjust from 3 to 1. So here we could not assert that + // value.scale() == scale. + // we also need to convert the hive orc decimal to a decimal with expected precision and + // scale. 
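// For example, continuing the case above: the stored value 101 * 10^(-1) read against
// decimal(10,3) is rescaled by serialize64(3) to the unscaled long 10100, which the
// Decimal constructed below then exposes as 10.100 with the expected precision and scale.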
+ Preconditions.checkArgument( + value.precision() <= precision, + "Cannot read value as decimal(%s,%s), too large: %s", + precision, + scale, + value); return new Decimal().set(value.serialize64(scale), precision, scale); } @@ -220,11 +225,15 @@ private static class Decimal38Reader implements OrcValueReader { @Override public Decimal nonNullRead(ColumnVector vector, int row) { - BigDecimal value = ((DecimalColumnVector) vector).vector[row] - .getHiveDecimal().bigDecimalValue(); - - Preconditions.checkArgument(value.precision() <= precision, - "Cannot read value as decimal(%s,%s), too large: %s", precision, scale, value); + BigDecimal value = + ((DecimalColumnVector) vector).vector[row].getHiveDecimal().bigDecimalValue(); + + Preconditions.checkArgument( + value.precision() <= precision, + "Cannot read value as decimal(%s,%s), too large: %s", + precision, + scale, + value); return new Decimal().set(new scala.math.BigDecimal(value), precision, scale); } diff --git a/spark/v3.2/spark/src/main/java/org/apache/iceberg/spark/data/SparkOrcValueWriters.java b/spark/v3.2/spark/src/main/java/org/apache/iceberg/spark/data/SparkOrcValueWriters.java index abb12dffc050..780090f99109 100644 --- a/spark/v3.2/spark/src/main/java/org/apache/iceberg/spark/data/SparkOrcValueWriters.java +++ b/spark/v3.2/spark/src/main/java/org/apache/iceberg/spark/data/SparkOrcValueWriters.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.data; import java.util.List; @@ -37,8 +36,7 @@ import org.apache.spark.unsafe.types.UTF8String; class SparkOrcValueWriters { - private SparkOrcValueWriters() { - } + private SparkOrcValueWriters() {} static OrcValueWriter strings() { return StringWriter.INSTANCE; @@ -60,8 +58,8 @@ static OrcValueWriter list(OrcValueWriter element, List o return new ListWriter<>(element, orcType); } - static OrcValueWriter map(OrcValueWriter keyWriter, OrcValueWriter valueWriter, - List orcTypes) { + static OrcValueWriter map( + OrcValueWriter keyWriter, OrcValueWriter valueWriter, List orcTypes) { return new MapWriter<>(keyWriter, valueWriter, orcTypes); } @@ -73,7 +71,6 @@ public void nonNullWrite(int rowId, UTF8String data, ColumnVector output) { byte[] value = data.getBytes(); ((BytesColumnVector) output).setRef(rowId, value, 0, value.length); } - } private static class TimestampTzWriter implements OrcValueWriter { @@ -85,7 +82,6 @@ public void nonNullWrite(int rowId, Long micros, ColumnVector output) { cv.time[rowId] = Math.floorDiv(micros, 1_000); // millis cv.nanos[rowId] = (int) Math.floorMod(micros, 1_000_000) * 1_000; // nanos } - } private static class Decimal18Writer implements OrcValueWriter { @@ -97,20 +93,18 @@ private static class Decimal18Writer implements OrcValueWriter { @Override public void nonNullWrite(int rowId, Decimal decimal, ColumnVector output) { - ((DecimalColumnVector) output).vector[rowId].setFromLongAndScale( - decimal.toUnscaledLong(), scale); + ((DecimalColumnVector) output) + .vector[rowId].setFromLongAndScale(decimal.toUnscaledLong(), scale); } - } private static class Decimal38Writer implements OrcValueWriter { @Override public void nonNullWrite(int rowId, Decimal decimal, ColumnVector output) { - ((DecimalColumnVector) output).vector[rowId].set( - HiveDecimal.create(decimal.toJavaBigDecimal())); + ((DecimalColumnVector) output) + .vector[rowId].set(HiveDecimal.create(decimal.toJavaBigDecimal())); } - } private static class ListWriter implements OrcValueWriter { @@ -120,10 +114,12 @@ 
private static class ListWriter implements OrcValueWriter { @SuppressWarnings("unchecked") ListWriter(OrcValueWriter writer, List orcTypes) { if (orcTypes.size() != 1) { - throw new IllegalArgumentException("Expected one (and same) ORC type for list elements, got: " + orcTypes); + throw new IllegalArgumentException( + "Expected one (and same) ORC type for list elements, got: " + orcTypes); } this.writer = writer; - this.fieldGetter = (SparkOrcWriter.FieldGetter) SparkOrcWriter.createFieldGetter(orcTypes.get(0)); + this.fieldGetter = + (SparkOrcWriter.FieldGetter) SparkOrcWriter.createFieldGetter(orcTypes.get(0)); } @Override @@ -145,7 +141,6 @@ public void nonNullWrite(int rowId, ArrayData value, ColumnVector output) { public Stream> metrics() { return writer.metrics(); } - } private static class MapWriter implements OrcValueWriter { @@ -155,14 +150,20 @@ private static class MapWriter implements OrcValueWriter { private final SparkOrcWriter.FieldGetter valueFieldGetter; @SuppressWarnings("unchecked") - MapWriter(OrcValueWriter keyWriter, OrcValueWriter valueWriter, List orcTypes) { + MapWriter( + OrcValueWriter keyWriter, + OrcValueWriter valueWriter, + List orcTypes) { if (orcTypes.size() != 2) { - throw new IllegalArgumentException("Expected two ORC type descriptions for a map, got: " + orcTypes); + throw new IllegalArgumentException( + "Expected two ORC type descriptions for a map, got: " + orcTypes); } this.keyWriter = keyWriter; this.valueWriter = valueWriter; - this.keyFieldGetter = (SparkOrcWriter.FieldGetter) SparkOrcWriter.createFieldGetter(orcTypes.get(0)); - this.valueFieldGetter = (SparkOrcWriter.FieldGetter) SparkOrcWriter.createFieldGetter(orcTypes.get(1)); + this.keyFieldGetter = + (SparkOrcWriter.FieldGetter) SparkOrcWriter.createFieldGetter(orcTypes.get(0)); + this.valueFieldGetter = + (SparkOrcWriter.FieldGetter) SparkOrcWriter.createFieldGetter(orcTypes.get(1)); } @Override @@ -189,7 +190,6 @@ public void nonNullWrite(int rowId, MapData map, ColumnVector output) { public Stream> metrics() { return Stream.concat(keyWriter.metrics(), valueWriter.metrics()); } - } private static void growColumnVector(ColumnVector cv, int requestedSize) { diff --git a/spark/v3.2/spark/src/main/java/org/apache/iceberg/spark/data/SparkOrcWriter.java b/spark/v3.2/spark/src/main/java/org/apache/iceberg/spark/data/SparkOrcWriter.java index 9c7f3a6eb01d..6a8c7f1d3c88 100644 --- a/spark/v3.2/spark/src/main/java/org/apache/iceberg/spark/data/SparkOrcWriter.java +++ b/spark/v3.2/spark/src/main/java/org/apache/iceberg/spark/data/SparkOrcWriter.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.data; import java.io.Serializable; @@ -39,19 +38,18 @@ import org.apache.spark.sql.catalyst.InternalRow; import org.apache.spark.sql.catalyst.expressions.SpecializedGetters; -/** - * This class acts as an adaptor from an OrcFileAppender to a - * FileAppender<InternalRow>. - */ +/** This class acts as an adaptor from an OrcFileAppender to a FileAppender<InternalRow>. 
*/ public class SparkOrcWriter implements OrcRowWriter { private final InternalRowWriter writer; public SparkOrcWriter(Schema iSchema, TypeDescription orcSchema) { - Preconditions.checkArgument(orcSchema.getCategory() == TypeDescription.Category.STRUCT, + Preconditions.checkArgument( + orcSchema.getCategory() == TypeDescription.Category.STRUCT, "Top level must be a struct " + orcSchema); - writer = (InternalRowWriter) OrcSchemaWithTypeVisitor.visit(iSchema, orcSchema, new WriteBuilder()); + writer = + (InternalRowWriter) OrcSchemaWithTypeVisitor.visit(iSchema, orcSchema, new WriteBuilder()); } @Override @@ -71,24 +69,26 @@ public Stream> metrics() { } private static class WriteBuilder extends OrcSchemaWithTypeVisitor> { - private WriteBuilder() { - } + private WriteBuilder() {} @Override - public OrcValueWriter record(Types.StructType iStruct, TypeDescription record, - List names, List> fields) { + public OrcValueWriter record( + Types.StructType iStruct, + TypeDescription record, + List names, + List> fields) { return new InternalRowWriter(fields, record.getChildren()); } @Override - public OrcValueWriter list(Types.ListType iList, TypeDescription array, - OrcValueWriter element) { + public OrcValueWriter list( + Types.ListType iList, TypeDescription array, OrcValueWriter element) { return SparkOrcValueWriters.list(element, array.getChildren()); } @Override - public OrcValueWriter map(Types.MapType iMap, TypeDescription map, - OrcValueWriter key, OrcValueWriter value) { + public OrcValueWriter map( + Types.MapType iMap, TypeDescription map, OrcValueWriter key, OrcValueWriter value) { return SparkOrcValueWriters.map(key, value, map.getChildren()); } @@ -178,8 +178,9 @@ static FieldGetter createFieldGetter(TypeDescription fieldType) { // being changed behind our back. break; case DECIMAL: - fieldGetter = (row, ordinal) -> - row.getDecimal(ordinal, fieldType.getPrecision(), fieldType.getScale()); + fieldGetter = + (row, ordinal) -> + row.getDecimal(ordinal, fieldType.getPrecision(), fieldType.getScale()); break; case STRING: case CHAR: @@ -196,7 +197,8 @@ static FieldGetter createFieldGetter(TypeDescription fieldType) { fieldGetter = SpecializedGetters::getMap; break; default: - throw new IllegalArgumentException("Encountered an unsupported ORC type during a write from Spark."); + throw new IllegalArgumentException( + "Encountered an unsupported ORC type during a write from Spark."); } return (row, ordinal) -> { @@ -210,10 +212,12 @@ static FieldGetter createFieldGetter(TypeDescription fieldType) { interface FieldGetter extends Serializable { /** - * Returns a value from a complex Spark data holder such ArrayData, InternalRow, etc... - * Calls the appropriate getter for the expected data type. + * Returns a value from a complex Spark data holder such ArrayData, InternalRow, etc... Calls + * the appropriate getter for the expected data type. + * * @param row Spark's data representation - * @param ordinal index in the data structure (e.g. column index for InterRow, list index in ArrayData, etc..) + * @param ordinal index in the data structure (e.g. column index for InterRow, list index in + * ArrayData, etc..) 
* @return field value at ordinal */ @Nullable diff --git a/spark/v3.2/spark/src/main/java/org/apache/iceberg/spark/data/SparkParquetReaders.java b/spark/v3.2/spark/src/main/java/org/apache/iceberg/spark/data/SparkParquetReaders.java index 35627ace23b0..4b4964f05fa0 100644 --- a/spark/v3.2/spark/src/main/java/org/apache/iceberg/spark/data/SparkParquetReaders.java +++ b/spark/v3.2/spark/src/main/java/org/apache/iceberg/spark/data/SparkParquetReaders.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.data; import java.math.BigDecimal; @@ -66,25 +65,25 @@ import org.apache.spark.unsafe.types.UTF8String; public class SparkParquetReaders { - private SparkParquetReaders() { - } + private SparkParquetReaders() {} - public static ParquetValueReader buildReader(Schema expectedSchema, - MessageType fileSchema) { + public static ParquetValueReader buildReader( + Schema expectedSchema, MessageType fileSchema) { return buildReader(expectedSchema, fileSchema, ImmutableMap.of()); } @SuppressWarnings("unchecked") - public static ParquetValueReader buildReader(Schema expectedSchema, - MessageType fileSchema, - Map idToConstant) { + public static ParquetValueReader buildReader( + Schema expectedSchema, MessageType fileSchema, Map idToConstant) { if (ParquetSchemaUtil.hasIds(fileSchema)) { return (ParquetValueReader) - TypeWithSchemaVisitor.visit(expectedSchema.asStruct(), fileSchema, - new ReadBuilder(fileSchema, idToConstant)); + TypeWithSchemaVisitor.visit( + expectedSchema.asStruct(), fileSchema, new ReadBuilder(fileSchema, idToConstant)); } else { return (ParquetValueReader) - TypeWithSchemaVisitor.visit(expectedSchema.asStruct(), fileSchema, + TypeWithSchemaVisitor.visit( + expectedSchema.asStruct(), + fileSchema, new FallbackReadBuilder(fileSchema, idToConstant)); } } @@ -95,18 +94,18 @@ private static class FallbackReadBuilder extends ReadBuilder { } @Override - public ParquetValueReader message(Types.StructType expected, MessageType message, - List> fieldReaders) { + public ParquetValueReader message( + Types.StructType expected, MessageType message, List> fieldReaders) { // the top level matches by ID, but the remaining IDs are missing return super.struct(expected, message, fieldReaders); } @Override - public ParquetValueReader struct(Types.StructType ignored, GroupType struct, - List> fieldReaders) { + public ParquetValueReader struct( + Types.StructType ignored, GroupType struct, List> fieldReaders) { // the expected struct is ignored because nested fields are never found when the - List> newFields = Lists.newArrayListWithExpectedSize( - fieldReaders.size()); + List> newFields = + Lists.newArrayListWithExpectedSize(fieldReaders.size()); List types = Lists.newArrayListWithExpectedSize(fieldReaders.size()); List fields = struct.getFields(); for (int i = 0; i < fields.size(); i += 1) { @@ -130,14 +129,14 @@ private static class ReadBuilder extends TypeWithSchemaVisitor message(Types.StructType expected, MessageType message, - List> fieldReaders) { + public ParquetValueReader message( + Types.StructType expected, MessageType message, List> fieldReaders) { return struct(expected, message.asGroupType(), fieldReaders); } @Override - public ParquetValueReader struct(Types.StructType expected, GroupType struct, - List> fieldReaders) { + public ParquetValueReader struct( + Types.StructType expected, GroupType struct, List> fieldReaders) { // match the expected struct's order Map> readersById = Maps.newHashMap(); Map 
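The readLong override above decodes Parquet's INT96 timestamps: 8 little-endian bytes of nanos-of-day followed by 4 bytes of Julian day, converted to microseconds since the Unix epoch. A self-contained sketch of that arithmetic (the class name and standalone byte[] input are illustrative; 2440588 is the Julian day number of 1970-01-01):

import java.nio.ByteBuffer;
import java.nio.ByteOrder;
import java.util.concurrent.TimeUnit;

public class Int96TimestampSketch {
  private static final long UNIX_EPOCH_JULIAN = 2_440_588L; // Julian day of 1970-01-01

  // Converts a 12-byte INT96 value (nanos-of-day + Julian day, little endian) to epoch micros.
  static long toEpochMicros(byte[] int96) {
    ByteBuffer buffer = ByteBuffer.wrap(int96).order(ByteOrder.LITTLE_ENDIAN);
    long timeOfDayNanos = buffer.getLong();
    int julianDay = buffer.getInt();
    return TimeUnit.DAYS.toMicros(julianDay - UNIX_EPOCH_JULIAN)
        + TimeUnit.NANOSECONDS.toMicros(timeOfDayNanos);
  }
}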
typesById = Maps.newHashMap(); @@ -152,10 +151,10 @@ public ParquetValueReader struct(Types.StructType expected, GroupType struct, } } - List expectedFields = expected != null ? - expected.fields() : ImmutableList.of(); - List> reorderedFields = Lists.newArrayListWithExpectedSize( - expectedFields.size()); + List expectedFields = + expected != null ? expected.fields() : ImmutableList.of(); + List> reorderedFields = + Lists.newArrayListWithExpectedSize(expectedFields.size()); List types = Lists.newArrayListWithExpectedSize(expectedFields.size()); for (Types.NestedField field : expectedFields) { int id = field.fieldId(); @@ -185,8 +184,8 @@ public ParquetValueReader struct(Types.StructType expected, GroupType struct, } @Override - public ParquetValueReader list(Types.ListType expectedList, GroupType array, - ParquetValueReader elementReader) { + public ParquetValueReader list( + Types.ListType expectedList, GroupType array, ParquetValueReader elementReader) { String[] repeatedPath = currentPath(); int repeatedD = type.getMaxDefinitionLevel(repeatedPath) - 1; @@ -195,13 +194,16 @@ public ParquetValueReader list(Types.ListType expectedList, GroupType array, Type elementType = ParquetSchemaUtil.determineListElementType(array); int elementD = type.getMaxDefinitionLevel(path(elementType.getName())) - 1; - return new ArrayReader<>(repeatedD, repeatedR, ParquetValueReaders.option(elementType, elementD, elementReader)); + return new ArrayReader<>( + repeatedD, repeatedR, ParquetValueReaders.option(elementType, elementD, elementReader)); } @Override - public ParquetValueReader map(Types.MapType expectedMap, GroupType map, - ParquetValueReader keyReader, - ParquetValueReader valueReader) { + public ParquetValueReader map( + Types.MapType expectedMap, + GroupType map, + ParquetValueReader keyReader, + ParquetValueReader valueReader) { GroupType repeatedKeyValue = map.getFields().get(0).asGroupType(); String[] repeatedPath = currentPath(); @@ -213,14 +215,16 @@ public ParquetValueReader map(Types.MapType expectedMap, GroupType map, Type valueType = repeatedKeyValue.getType(1); int valueD = type.getMaxDefinitionLevel(path(valueType.getName())) - 1; - return new MapReader<>(repeatedD, repeatedR, + return new MapReader<>( + repeatedD, + repeatedR, ParquetValueReaders.option(keyType, keyD, keyReader), ParquetValueReaders.option(valueType, valueD, valueReader)); } @Override - public ParquetValueReader primitive(org.apache.iceberg.types.Type.PrimitiveType expected, - PrimitiveType primitive) { + public ParquetValueReader primitive( + org.apache.iceberg.types.Type.PrimitiveType expected, PrimitiveType primitive) { ColumnDescriptor desc = type.getColumnDescription(currentPath()); if (primitive.getOriginalType() != null) { @@ -376,12 +380,13 @@ public Long read(Long ignored) { @Override public long readLong() { - final ByteBuffer byteBuffer = column.nextBinary().toByteBuffer().order(ByteOrder.LITTLE_ENDIAN); + final ByteBuffer byteBuffer = + column.nextBinary().toByteBuffer().order(ByteOrder.LITTLE_ENDIAN); final long timeOfDayNanos = byteBuffer.getLong(); final int julianDay = byteBuffer.getInt(); - return TimeUnit.DAYS.toMicros(julianDay - UNIX_EPOCH_JULIAN) + - TimeUnit.NANOSECONDS.toMicros(timeOfDayNanos); + return TimeUnit.DAYS.toMicros(julianDay - UNIX_EPOCH_JULIAN) + + TimeUnit.NANOSECONDS.toMicros(timeOfDayNanos); } } @@ -455,15 +460,19 @@ protected ArrayData buildList(ReusableArrayData list) { } } - private static class MapReader extends RepeatedKeyValueReader { + private static class MapReader + 
extends RepeatedKeyValueReader { private int readPos = 0; private int writePos = 0; private final ReusableEntry entry = new ReusableEntry<>(); private final ReusableEntry nullEntry = new ReusableEntry<>(); - MapReader(int definitionLevel, int repetitionLevel, - ParquetValueReader keyReader, ParquetValueReader valueReader) { + MapReader( + int definitionLevel, + int repetitionLevel, + ParquetValueReader keyReader, + ParquetValueReader valueReader) { super(definitionLevel, repetitionLevel, keyReader, valueReader); } diff --git a/spark/v3.2/spark/src/main/java/org/apache/iceberg/spark/data/SparkParquetWriters.java b/spark/v3.2/spark/src/main/java/org/apache/iceberg/spark/data/SparkParquetWriters.java index 5e268d26ed9c..c7622678c74d 100644 --- a/spark/v3.2/spark/src/main/java/org/apache/iceberg/spark/data/SparkParquetWriters.java +++ b/spark/v3.2/spark/src/main/java/org/apache/iceberg/spark/data/SparkParquetWriters.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.data; import java.util.Iterator; @@ -54,12 +53,12 @@ import org.apache.spark.unsafe.types.UTF8String; public class SparkParquetWriters { - private SparkParquetWriters() { - } + private SparkParquetWriters() {} @SuppressWarnings("unchecked") public static ParquetValueWriter buildWriter(StructType dfSchema, MessageType type) { - return (ParquetValueWriter) ParquetWithSparkSchemaVisitor.visit(dfSchema, type, new WriteBuilder(type)); + return (ParquetValueWriter) + ParquetWithSparkSchemaVisitor.visit(dfSchema, type, new WriteBuilder(type)); } private static class WriteBuilder extends ParquetWithSparkSchemaVisitor> { @@ -70,14 +69,14 @@ private static class WriteBuilder extends ParquetWithSparkSchemaVisitor message(StructType sStruct, MessageType message, - List> fieldWriters) { + public ParquetValueWriter message( + StructType sStruct, MessageType message, List> fieldWriters) { return struct(sStruct, message.asGroupType(), fieldWriters); } @Override - public ParquetValueWriter struct(StructType sStruct, GroupType struct, - List> fieldWriters) { + public ParquetValueWriter struct( + StructType sStruct, GroupType struct, List> fieldWriters) { List fields = struct.getFields(); StructField[] sparkFields = sStruct.fields(); List> writers = Lists.newArrayListWithExpectedSize(fieldWriters.size()); @@ -91,31 +90,40 @@ public ParquetValueWriter struct(StructType sStruct, GroupType struct, } @Override - public ParquetValueWriter list(ArrayType sArray, GroupType array, ParquetValueWriter elementWriter) { + public ParquetValueWriter list( + ArrayType sArray, GroupType array, ParquetValueWriter elementWriter) { GroupType repeated = array.getFields().get(0).asGroupType(); String[] repeatedPath = currentPath(); int repeatedD = type.getMaxDefinitionLevel(repeatedPath); int repeatedR = type.getMaxRepetitionLevel(repeatedPath); - return new ArrayDataWriter<>(repeatedD, repeatedR, + return new ArrayDataWriter<>( + repeatedD, + repeatedR, newOption(repeated.getType(0), elementWriter), sArray.elementType()); } @Override - public ParquetValueWriter map(MapType sMap, GroupType map, - ParquetValueWriter keyWriter, ParquetValueWriter valueWriter) { + public ParquetValueWriter map( + MapType sMap, + GroupType map, + ParquetValueWriter keyWriter, + ParquetValueWriter valueWriter) { GroupType repeatedKeyValue = map.getFields().get(0).asGroupType(); String[] repeatedPath = currentPath(); int repeatedD = type.getMaxDefinitionLevel(repeatedPath); int repeatedR = 
type.getMaxRepetitionLevel(repeatedPath); - return new MapDataWriter<>(repeatedD, repeatedR, + return new MapDataWriter<>( + repeatedD, + repeatedR, newOption(repeatedKeyValue.getType(0), keyWriter), newOption(repeatedKeyValue.getType(1), valueWriter), - sMap.keyType(), sMap.valueType()); + sMap.keyType(), + sMap.valueType()); } private ParquetValueWriter newOption(Type fieldType, ParquetValueWriter writer) { @@ -197,18 +205,18 @@ private static PrimitiveWriter utf8Strings(ColumnDescriptor desc) { return new UTF8StringWriter(desc); } - private static PrimitiveWriter decimalAsInteger(ColumnDescriptor desc, - int precision, int scale) { + private static PrimitiveWriter decimalAsInteger( + ColumnDescriptor desc, int precision, int scale) { return new IntegerDecimalWriter(desc, precision, scale); } - private static PrimitiveWriter decimalAsLong(ColumnDescriptor desc, - int precision, int scale) { + private static PrimitiveWriter decimalAsLong( + ColumnDescriptor desc, int precision, int scale) { return new LongDecimalWriter(desc, precision, scale); } - private static PrimitiveWriter decimalAsFixed(ColumnDescriptor desc, - int precision, int scale) { + private static PrimitiveWriter decimalAsFixed( + ColumnDescriptor desc, int precision, int scale) { return new FixedDecimalWriter(desc, precision, scale); } @@ -239,10 +247,18 @@ private IntegerDecimalWriter(ColumnDescriptor desc, int precision, int scale) { @Override public void write(int repetitionLevel, Decimal decimal) { - Preconditions.checkArgument(decimal.scale() == scale, - "Cannot write value as decimal(%s,%s), wrong scale: %s", precision, scale, decimal); - Preconditions.checkArgument(decimal.precision() <= precision, - "Cannot write value as decimal(%s,%s), too large: %s", precision, scale, decimal); + Preconditions.checkArgument( + decimal.scale() == scale, + "Cannot write value as decimal(%s,%s), wrong scale: %s", + precision, + scale, + decimal); + Preconditions.checkArgument( + decimal.precision() <= precision, + "Cannot write value as decimal(%s,%s), too large: %s", + precision, + scale, + decimal); column.writeInteger(repetitionLevel, (int) decimal.toUnscaledLong()); } @@ -260,10 +276,18 @@ private LongDecimalWriter(ColumnDescriptor desc, int precision, int scale) { @Override public void write(int repetitionLevel, Decimal decimal) { - Preconditions.checkArgument(decimal.scale() == scale, - "Cannot write value as decimal(%s,%s), wrong scale: %s", precision, scale, decimal); - Preconditions.checkArgument(decimal.precision() <= precision, - "Cannot write value as decimal(%s,%s), too large: %s", precision, scale, decimal); + Preconditions.checkArgument( + decimal.scale() == scale, + "Cannot write value as decimal(%s,%s), wrong scale: %s", + precision, + scale, + decimal); + Preconditions.checkArgument( + decimal.precision() <= precision, + "Cannot write value as decimal(%s,%s), too large: %s", + precision, + scale, + decimal); column.writeLong(repetitionLevel, decimal.toUnscaledLong()); } @@ -278,12 +302,15 @@ private FixedDecimalWriter(ColumnDescriptor desc, int precision, int scale) { super(desc); this.precision = precision; this.scale = scale; - this.bytes = ThreadLocal.withInitial(() -> new byte[TypeUtil.decimalRequiredBytes(precision)]); + this.bytes = + ThreadLocal.withInitial(() -> new byte[TypeUtil.decimalRequiredBytes(precision)]); } @Override public void write(int repetitionLevel, Decimal decimal) { - byte[] binary = DecimalUtil.toReusedFixLengthBytes(precision, scale, decimal.toJavaBigDecimal(), bytes.get()); + byte[] 
binary = + DecimalUtil.toReusedFixLengthBytes( + precision, scale, decimal.toJavaBigDecimal(), bytes.get()); column.writeBinary(repetitionLevel, Binary.fromReusedByteArray(binary)); } } @@ -302,8 +329,11 @@ public void write(int repetitionLevel, byte[] bytes) { private static class ArrayDataWriter extends RepeatedWriter { private final DataType elementType; - private ArrayDataWriter(int definitionLevel, int repetitionLevel, - ParquetValueWriter writer, DataType elementType) { + private ArrayDataWriter( + int definitionLevel, + int repetitionLevel, + ParquetValueWriter writer, + DataType elementType) { super(definitionLevel, repetitionLevel, writer); this.elementType = elementType; } @@ -354,9 +384,13 @@ private static class MapDataWriter extends RepeatedKeyValueWriter keyWriter, ParquetValueWriter valueWriter, - DataType keyType, DataType valueType) { + private MapDataWriter( + int definitionLevel, + int repetitionLevel, + ParquetValueWriter keyWriter, + ParquetValueWriter valueWriter, + DataType keyType, + DataType valueType) { super(definitionLevel, repetitionLevel, keyWriter, valueWriter); this.keyType = keyType; this.valueType = valueType; diff --git a/spark/v3.2/spark/src/main/java/org/apache/iceberg/spark/data/SparkValueReaders.java b/spark/v3.2/spark/src/main/java/org/apache/iceberg/spark/data/SparkValueReaders.java index 0d3ce2b28d0b..11655c72d857 100644 --- a/spark/v3.2/spark/src/main/java/org/apache/iceberg/spark/data/SparkValueReaders.java +++ b/spark/v3.2/spark/src/main/java/org/apache/iceberg/spark/data/SparkValueReaders.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.data; import java.io.IOException; @@ -44,8 +43,7 @@ public class SparkValueReaders { - private SparkValueReaders() { - } + private SparkValueReaders() {} static ValueReader strings() { return StringReader.INSTANCE; @@ -67,8 +65,8 @@ static ValueReader array(ValueReader elementReader) { return new ArrayReader(elementReader); } - static ValueReader arrayMap(ValueReader keyReader, - ValueReader valueReader) { + static ValueReader arrayMap( + ValueReader keyReader, ValueReader valueReader) { return new ArrayMapReader(keyReader, valueReader); } @@ -76,16 +74,15 @@ static ValueReader map(ValueReader keyReader, ValueReader< return new MapReader(keyReader, valueReader); } - static ValueReader struct(List> readers, Types.StructType struct, - Map idToConstant) { + static ValueReader struct( + List> readers, Types.StructType struct, Map idToConstant) { return new StructReader(readers, struct, idToConstant); } private static class StringReader implements ValueReader { private static final StringReader INSTANCE = new StringReader(); - private StringReader() { - } + private StringReader() {} @Override public UTF8String read(Decoder decoder, Object reuse) throws IOException { @@ -97,10 +94,10 @@ public UTF8String read(Decoder decoder, Object reuse) throws IOException { Utf8 string = decoder.readString(utf8); return UTF8String.fromBytes(string.getBytes(), 0, string.getByteLength()); -// int length = decoder.readInt(); -// byte[] bytes = new byte[length]; -// decoder.readFixed(bytes, 0, length); -// return UTF8String.fromBytes(bytes); + // int length = decoder.readInt(); + // byte[] bytes = new byte[length]; + // decoder.readFixed(bytes, 0, length); + // return UTF8String.fromBytes(bytes); } } @@ -122,16 +119,17 @@ public UTF8String read(Decoder decoder, Object ignore) throws IOException { } private static class UUIDReader implements 
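The FixedDecimalWriter above serializes a Decimal into a reused fixed-length byte array sized by the type's precision. A rough, self-contained sketch of that representation (sign-extended, big-endian two's complement); the helper names and bit-count formula are an approximation for illustration, not the library's TypeUtil/DecimalUtil code.

import java.math.BigDecimal;
import java.util.Arrays;

public class FixedDecimalSketch {
  // Approximate minimum byte count that can hold any unscaled value of the given precision.
  static int requiredBytes(int precision) {
    double bits = Math.ceil(precision * (Math.log(10) / Math.log(2))) + 1; // value bits + sign bit
    return (int) Math.ceil(bits / 8);
  }

  // Writes the unscaled value into a fixed-size array, padding with the sign byte.
  static byte[] toFixedBytes(BigDecimal value, int precision) {
    byte[] unscaled = value.unscaledValue().toByteArray(); // minimal big-endian two's complement
    byte[] fixed = new byte[requiredBytes(precision)];
    Arrays.fill(fixed, value.signum() < 0 ? (byte) 0xFF : (byte) 0x00); // sign extension
    System.arraycopy(unscaled, 0, fixed, fixed.length - unscaled.length, unscaled.length);
    return fixed;
  }
}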
ValueReader { - private static final ThreadLocal BUFFER = ThreadLocal.withInitial(() -> { - ByteBuffer buffer = ByteBuffer.allocate(16); - buffer.order(ByteOrder.BIG_ENDIAN); - return buffer; - }); + private static final ThreadLocal BUFFER = + ThreadLocal.withInitial( + () -> { + ByteBuffer buffer = ByteBuffer.allocate(16); + buffer.order(ByteOrder.BIG_ENDIAN); + return buffer; + }); private static final UUIDReader INSTANCE = new UUIDReader(); - private UUIDReader() { - } + private UUIDReader() {} @Override @SuppressWarnings("ByteBufferBackingArray") @@ -258,14 +256,16 @@ public ArrayBasedMapData read(Decoder decoder, Object reuse) throws IOException static class StructReader extends ValueReaders.StructReader { private final int numFields; - protected StructReader(List> readers, Types.StructType struct, Map idToConstant) { + protected StructReader( + List> readers, Types.StructType struct, Map idToConstant) { super(readers, struct, idToConstant); this.numFields = readers.size(); } @Override protected InternalRow reuseOrCreate(Object reuse) { - if (reuse instanceof GenericInternalRow && ((GenericInternalRow) reuse).numFields() == numFields) { + if (reuse instanceof GenericInternalRow + && ((GenericInternalRow) reuse).numFields() == numFields) { return (InternalRow) reuse; } return new GenericInternalRow(numFields); diff --git a/spark/v3.2/spark/src/main/java/org/apache/iceberg/spark/data/SparkValueWriters.java b/spark/v3.2/spark/src/main/java/org/apache/iceberg/spark/data/SparkValueWriters.java index 24a69c1d7f11..5f2e2c054888 100644 --- a/spark/v3.2/spark/src/main/java/org/apache/iceberg/spark/data/SparkValueWriters.java +++ b/spark/v3.2/spark/src/main/java/org/apache/iceberg/spark/data/SparkValueWriters.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.spark.data; import java.io.IOException; @@ -39,8 +38,7 @@ public class SparkValueWriters { - private SparkValueWriters() { - } + private SparkValueWriters() {} static ValueWriter strings() { return StringWriter.INSTANCE; @@ -75,8 +73,7 @@ static ValueWriter struct(List> writers, List { private static final StringWriter INSTANCE = new StringWriter(); - private StringWriter() { - } + private StringWriter() {} @Override public void write(UTF8String s, Encoder encoder) throws IOException { @@ -88,16 +85,17 @@ public void write(UTF8String s, Encoder encoder) throws IOException { } private static class UUIDWriter implements ValueWriter { - private static final ThreadLocal BUFFER = ThreadLocal.withInitial(() -> { - ByteBuffer buffer = ByteBuffer.allocate(16); - buffer.order(ByteOrder.BIG_ENDIAN); - return buffer; - }); + private static final ThreadLocal BUFFER = + ThreadLocal.withInitial( + () -> { + ByteBuffer buffer = ByteBuffer.allocate(16); + buffer.order(ByteOrder.BIG_ENDIAN); + return buffer; + }); private static final UUIDWriter INSTANCE = new UUIDWriter(); - private UUIDWriter() { - } + private UUIDWriter() {} @Override @SuppressWarnings("ByteBufferBackingArray") @@ -120,12 +118,14 @@ private static class DecimalWriter implements ValueWriter { private DecimalWriter(int precision, int scale) { this.precision = precision; this.scale = scale; - this.bytes = ThreadLocal.withInitial(() -> new byte[TypeUtil.decimalRequiredBytes(precision)]); + this.bytes = + ThreadLocal.withInitial(() -> new byte[TypeUtil.decimalRequiredBytes(precision)]); } @Override public void write(Decimal d, Encoder encoder) throws IOException { - encoder.writeFixed(DecimalUtil.toReusedFixLengthBytes(precision, scale, d.toJavaBigDecimal(), bytes.get())); + encoder.writeFixed( + DecimalUtil.toReusedFixLengthBytes(precision, scale, d.toJavaBigDecimal(), bytes.get())); } } @@ -158,8 +158,11 @@ private static class ArrayMapWriter implements ValueWriter { private final DataType keyType; private final DataType valueType; - private ArrayMapWriter(ValueWriter keyWriter, DataType keyType, - ValueWriter valueWriter, DataType valueType) { + private ArrayMapWriter( + ValueWriter keyWriter, + DataType keyType, + ValueWriter valueWriter, + DataType valueType) { this.keyWriter = keyWriter; this.keyType = keyType; this.valueWriter = valueWriter; @@ -189,8 +192,11 @@ private static class MapWriter implements ValueWriter { private final DataType keyType; private final DataType valueType; - private MapWriter(ValueWriter keyWriter, DataType keyType, - ValueWriter valueWriter, DataType valueType) { + private MapWriter( + ValueWriter keyWriter, + DataType keyType, + ValueWriter valueWriter, + DataType valueType) { this.keyWriter = keyWriter; this.keyType = keyType; this.valueWriter = valueWriter; diff --git a/spark/v3.2/spark/src/main/java/org/apache/iceberg/spark/data/vectorized/ArrowVectorAccessorFactory.java b/spark/v3.2/spark/src/main/java/org/apache/iceberg/spark/data/vectorized/ArrowVectorAccessorFactory.java index 505ace508352..e32ebcb02bbc 100644 --- a/spark/v3.2/spark/src/main/java/org/apache/iceberg/spark/data/vectorized/ArrowVectorAccessorFactory.java +++ b/spark/v3.2/spark/src/main/java/org/apache/iceberg/spark/data/vectorized/ArrowVectorAccessorFactory.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
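The UUIDReader and UUIDWriter above move UUIDs through a reused 16-byte, big-endian buffer. A small illustrative sketch of that layout using only the JDK (class and method names are placeholders):

import java.nio.ByteBuffer;
import java.nio.ByteOrder;
import java.util.UUID;

public class UuidBytesSketch {
  // Most-significant long first, then least-significant long, both big endian.
  static byte[] toBytes(UUID uuid) {
    ByteBuffer buffer = ByteBuffer.allocate(16).order(ByteOrder.BIG_ENDIAN);
    buffer.putLong(uuid.getMostSignificantBits());
    buffer.putLong(uuid.getLeastSignificantBits());
    return buffer.array();
  }

  static UUID fromBytes(byte[] bytes) {
    ByteBuffer buffer = ByteBuffer.wrap(bytes).order(ByteOrder.BIG_ENDIAN);
    return new UUID(buffer.getLong(), buffer.getLong());
  }
}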
*/ - package org.apache.iceberg.spark.data.vectorized; import java.math.BigDecimal; @@ -32,10 +31,12 @@ import org.apache.spark.unsafe.types.UTF8String; final class ArrowVectorAccessorFactory - extends GenericArrowVectorAccessorFactory { + extends GenericArrowVectorAccessorFactory< + Decimal, UTF8String, ColumnarArray, ArrowColumnVector> { ArrowVectorAccessorFactory() { - super(DecimalFactoryImpl::new, + super( + DecimalFactoryImpl::new, StringFactoryImpl::new, StructChildFactoryImpl::new, ArrayFactoryImpl::new); @@ -70,9 +71,7 @@ public UTF8String ofRow(VarCharVector vector, int rowId) { int end = vector.getEndOffset(rowId); return UTF8String.fromAddress( - null, - vector.getDataBuffer().memoryAddress() + start, - end - start); + null, vector.getDataBuffer().memoryAddress() + start, end - start); } @Override @@ -84,7 +83,9 @@ public UTF8String ofBytes(byte[] bytes) { public UTF8String ofByteBuffer(ByteBuffer byteBuffer) { if (byteBuffer.hasArray()) { return UTF8String.fromBytes( - byteBuffer.array(), byteBuffer.arrayOffset() + byteBuffer.position(), byteBuffer.remaining()); + byteBuffer.array(), + byteBuffer.arrayOffset() + byteBuffer.position(), + byteBuffer.remaining()); } byte[] bytes = new byte[byteBuffer.remaining()]; byteBuffer.get(bytes); @@ -92,7 +93,8 @@ public UTF8String ofByteBuffer(ByteBuffer byteBuffer) { } } - private static final class ArrayFactoryImpl implements ArrayFactory { + private static final class ArrayFactoryImpl + implements ArrayFactory { @Override public ArrowColumnVector ofChild(ValueVector childVector) { return new ArrowColumnVector(childVector); @@ -108,7 +110,8 @@ public ColumnarArray ofRow(ValueVector vector, ArrowColumnVector childData, int } } - private static final class StructChildFactoryImpl implements StructChildFactory { + private static final class StructChildFactoryImpl + implements StructChildFactory { @Override public Class getGenericClass() { return ArrowColumnVector.class; diff --git a/spark/v3.2/spark/src/main/java/org/apache/iceberg/spark/data/vectorized/ArrowVectorAccessors.java b/spark/v3.2/spark/src/main/java/org/apache/iceberg/spark/data/vectorized/ArrowVectorAccessors.java index f3b3377af2b4..810fef81b5bb 100644 --- a/spark/v3.2/spark/src/main/java/org/apache/iceberg/spark/data/vectorized/ArrowVectorAccessors.java +++ b/spark/v3.2/spark/src/main/java/org/apache/iceberg/spark/data/vectorized/ArrowVectorAccessors.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.data.vectorized; import org.apache.iceberg.arrow.vectorized.ArrowVectorAccessor; @@ -35,6 +34,5 @@ public class ArrowVectorAccessors { return factory.getVectorAccessor(holder); } - private ArrowVectorAccessors() { - } + private ArrowVectorAccessors() {} } diff --git a/spark/v3.2/spark/src/main/java/org/apache/iceberg/spark/data/vectorized/ColumnVectorBuilder.java b/spark/v3.2/spark/src/main/java/org/apache/iceberg/spark/data/vectorized/ColumnVectorBuilder.java index c0459aae382b..8080a946c6f7 100644 --- a/spark/v3.2/spark/src/main/java/org/apache/iceberg/spark/data/vectorized/ColumnVectorBuilder.java +++ b/spark/v3.2/spark/src/main/java/org/apache/iceberg/spark/data/vectorized/ColumnVectorBuilder.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
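The ofByteBuffer accessor above avoids a copy when the buffer is heap-backed and only copies for direct buffers. A hedged usage sketch of the same two paths; the wrapper class and main method are illustrative only.

import java.nio.ByteBuffer;
import java.nio.charset.StandardCharsets;
import org.apache.spark.unsafe.types.UTF8String;

public class Utf8FromBufferSketch {
  static UTF8String of(ByteBuffer byteBuffer) {
    if (byteBuffer.hasArray()) {
      // Heap buffer: address the backing array at the current position, no copy.
      return UTF8String.fromBytes(
          byteBuffer.array(),
          byteBuffer.arrayOffset() + byteBuffer.position(),
          byteBuffer.remaining());
    }
    byte[] bytes = new byte[byteBuffer.remaining()]; // direct buffer: copy out first
    byteBuffer.get(bytes);
    return UTF8String.fromBytes(bytes);
  }

  public static void main(String[] args) {
    System.out.println(of(ByteBuffer.wrap("iceberg".getBytes(StandardCharsets.UTF_8))));
  }
}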
*/ - package org.apache.iceberg.spark.data.vectorized; import org.apache.iceberg.arrow.vectorized.VectorHolder; @@ -39,8 +38,8 @@ public ColumnVector build(VectorHolder holder, int numRows) { if (holder instanceof VectorHolder.DeletedVectorHolder) { return new DeletedColumnVector(Types.BooleanType.get(), isDeleted); } else if (holder instanceof ConstantVectorHolder) { - return new ConstantColumnVector(Types.IntegerType.get(), numRows, - ((ConstantVectorHolder) holder).getConstant()); + return new ConstantColumnVector( + Types.IntegerType.get(), numRows, ((ConstantVectorHolder) holder).getConstant()); } else { throw new IllegalStateException("Unknown dummy vector holder: " + holder); } diff --git a/spark/v3.2/spark/src/main/java/org/apache/iceberg/spark/data/vectorized/ColumnVectorWithFilter.java b/spark/v3.2/spark/src/main/java/org/apache/iceberg/spark/data/vectorized/ColumnVectorWithFilter.java index db4e41b04176..ab0d652321d3 100644 --- a/spark/v3.2/spark/src/main/java/org/apache/iceberg/spark/data/vectorized/ColumnVectorWithFilter.java +++ b/spark/v3.2/spark/src/main/java/org/apache/iceberg/spark/data/vectorized/ColumnVectorWithFilter.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.data.vectorized; import org.apache.iceberg.arrow.vectorized.VectorHolder; diff --git a/spark/v3.2/spark/src/main/java/org/apache/iceberg/spark/data/vectorized/ColumnarBatchReader.java b/spark/v3.2/spark/src/main/java/org/apache/iceberg/spark/data/vectorized/ColumnarBatchReader.java index 6dada0f84332..9686b63d1858 100644 --- a/spark/v3.2/spark/src/main/java/org/apache/iceberg/spark/data/vectorized/ColumnarBatchReader.java +++ b/spark/v3.2/spark/src/main/java/org/apache/iceberg/spark/data/vectorized/ColumnarBatchReader.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.data.vectorized; import java.util.Iterator; @@ -38,9 +37,9 @@ import org.apache.spark.sql.vectorized.ColumnarBatch; /** - * {@link VectorizedReader} that returns Spark's {@link ColumnarBatch} to support Spark's vectorized read path. The - * {@link ColumnarBatch} returned is created by passing in the Arrow vectors populated via delegated read calls to - * {@linkplain VectorizedArrowReader VectorReader(s)}. + * {@link VectorizedReader} that returns Spark's {@link ColumnarBatch} to support Spark's vectorized + * read path. The {@link ColumnarBatch} returned is created by passing in the Arrow vectors + * populated via delegated read calls to {@linkplain VectorizedArrowReader VectorReader(s)}. 
*/ public class ColumnarBatchReader extends BaseBatchReader { private final boolean hasIsDeletedColumn; @@ -49,12 +48,13 @@ public class ColumnarBatchReader extends BaseBatchReader { public ColumnarBatchReader(List> readers) { super(readers); - this.hasIsDeletedColumn = readers.stream().anyMatch(reader -> reader instanceof DeletedVectorReader); + this.hasIsDeletedColumn = + readers.stream().anyMatch(reader -> reader instanceof DeletedVectorReader); } @Override - public void setRowGroupInfo(PageReadStore pageStore, Map metaData, - long rowPosition) { + public void setRowGroupInfo( + PageReadStore pageStore, Map metaData, long rowPosition) { super.setRowGroupInfo(pageStore, metaData, rowPosition); this.rowStartPosInBatch = rowPosition; } @@ -76,13 +76,16 @@ public final ColumnarBatch read(ColumnarBatch reuse, int numRowsToRead) { private class ColumnBatchLoader { private final int numRowsToRead; - // the rowId mapping to skip deleted rows for all column vectors inside a batch, it is null when there is no deletes + // the rowId mapping to skip deleted rows for all column vectors inside a batch, it is null when + // there is no deletes private int[] rowIdMapping; - // the array to indicate if a row is deleted or not, it is null when there is no "_deleted" metadata column + // the array to indicate if a row is deleted or not, it is null when there is no "_deleted" + // metadata column private boolean[] isDeleted; ColumnBatchLoader(int numRowsToRead) { - Preconditions.checkArgument(numRowsToRead > 0, "Invalid number of rows to read: %s", numRowsToRead); + Preconditions.checkArgument( + numRowsToRead > 0, "Invalid number of rows to read: %s", numRowsToRead); this.numRowsToRead = numRowsToRead; if (hasIsDeletedColumn) { isDeleted = new boolean[numRowsToRead]; @@ -121,11 +124,14 @@ ColumnVector[] readDataToColumnVectors() { int numRowsInVector = vectorHolders[i].numValues(); Preconditions.checkState( numRowsInVector == numRowsToRead, - "Number of rows in the vector %s didn't match expected %s ", numRowsInVector, + "Number of rows in the vector %s didn't match expected %s ", + numRowsInVector, numRowsToRead); - arrowColumnVectors[i] = columnVectorBuilder.withDeletedRows(rowIdMapping, isDeleted) - .build(vectorHolders[i], numRowsInVector); + arrowColumnVectors[i] = + columnVectorBuilder + .withDeletedRows(rowIdMapping, isDeleted) + .build(vectorHolders[i], numRowsInVector); } return arrowColumnVectors; } @@ -154,12 +160,10 @@ Pair posDelRowIdMapping() { } /** - * Build a row id mapping inside a batch, which skips deleted rows. Here is an example of how we delete 2 rows in a - * batch with 8 rows in total. - * [0,1,2,3,4,5,6,7] -- Original status of the row id mapping array - * [F,F,F,F,F,F,F,F] -- Original status of the isDeleted array - * Position delete 2, 6 - * [0,1,3,4,5,7,-,-] -- After applying position deletes [Set Num records to 6] + * Build a row id mapping inside a batch, which skips deleted rows. Here is an example of how we + * delete 2 rows in a batch with 8 rows in total. [0,1,2,3,4,5,6,7] -- Original status of the + * row id mapping array [F,F,F,F,F,F,F,F] -- Original status of the isDeleted array Position + * delete 2, 6 [0,1,3,4,5,7,-,-] -- After applying position deletes [Set Num records to 6] * [F,F,T,F,F,F,T,F] -- After applying position deletes * * @param deletedRowPositions a set of deleted row positions @@ -203,14 +207,11 @@ int[] initEqDeleteRowIdMapping() { } /** - * Filter out the equality deleted rows. 
Here is an example, - * [0,1,2,3,4,5,6,7] -- Original status of the row id mapping array - * [F,F,F,F,F,F,F,F] -- Original status of the isDeleted array - * Position delete 2, 6 - * [0,1,3,4,5,7,-,-] -- After applying position deletes [Set Num records to 6] - * [F,F,T,F,F,F,T,F] -- After applying position deletes - * Equality delete 1 <= x <= 3 - * [0,4,5,7,-,-,-,-] -- After applying equality deletes [Set Num records to 4] + * Filter out the equality deleted rows. Here is an example, [0,1,2,3,4,5,6,7] -- Original + * status of the row id mapping array [F,F,F,F,F,F,F,F] -- Original status of the isDeleted + * array Position delete 2, 6 [0,1,3,4,5,7,-,-] -- After applying position deletes [Set Num + * records to 6] [F,F,T,F,F,F,T,F] -- After applying position deletes Equality delete 1 <= x <= + * 3 [0,4,5,7,-,-,-,-] -- After applying equality deletes [Set Num records to 4] * [F,T,T,T,F,F,T,F] -- After applying equality deletes * * @param columnarBatch the {@link ColumnarBatch} to apply the equality delete diff --git a/spark/v3.2/spark/src/main/java/org/apache/iceberg/spark/data/vectorized/ConstantColumnVector.java b/spark/v3.2/spark/src/main/java/org/apache/iceberg/spark/data/vectorized/ConstantColumnVector.java index 3cdea65b2877..42683ffa901e 100644 --- a/spark/v3.2/spark/src/main/java/org/apache/iceberg/spark/data/vectorized/ConstantColumnVector.java +++ b/spark/v3.2/spark/src/main/java/org/apache/iceberg/spark/data/vectorized/ConstantColumnVector.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.data.vectorized; import org.apache.iceberg.spark.SparkSchemaUtil; @@ -39,8 +38,7 @@ class ConstantColumnVector extends ColumnVector { } @Override - public void close() { - } + public void close() {} @Override public boolean hasNull() { diff --git a/spark/v3.2/spark/src/main/java/org/apache/iceberg/spark/data/vectorized/DeletedColumnVector.java b/spark/v3.2/spark/src/main/java/org/apache/iceberg/spark/data/vectorized/DeletedColumnVector.java index 8fc3d4527321..eec6ecb9ace4 100644 --- a/spark/v3.2/spark/src/main/java/org/apache/iceberg/spark/data/vectorized/DeletedColumnVector.java +++ b/spark/v3.2/spark/src/main/java/org/apache/iceberg/spark/data/vectorized/DeletedColumnVector.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.data.vectorized; import org.apache.iceberg.relocated.com.google.common.base.Preconditions; @@ -38,8 +37,7 @@ public DeletedColumnVector(Type type, boolean[] isDeleted) { } @Override - public void close() { - } + public void close() {} @Override public boolean hasNull() { diff --git a/spark/v3.2/spark/src/main/java/org/apache/iceberg/spark/data/vectorized/IcebergArrowColumnVector.java b/spark/v3.2/spark/src/main/java/org/apache/iceberg/spark/data/vectorized/IcebergArrowColumnVector.java index 1812282a34f6..38ec3a0e838c 100644 --- a/spark/v3.2/spark/src/main/java/org/apache/iceberg/spark/data/vectorized/IcebergArrowColumnVector.java +++ b/spark/v3.2/spark/src/main/java/org/apache/iceberg/spark/data/vectorized/IcebergArrowColumnVector.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.data.vectorized; import org.apache.iceberg.arrow.vectorized.ArrowVectorAccessor; @@ -31,9 +30,10 @@ import org.apache.spark.unsafe.types.UTF8String; /** - * Implementation of Spark's {@link ColumnVector} interface. 
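The comments above walk through how a batch-local row-id mapping compacts out deleted rows while an isDeleted array backs the "_deleted" metadata column. A minimal sketch of the position-delete pass under those assumptions; field and method names are illustrative, not the reader's actual members.

import java.util.Set;

public class RowIdMappingSketch {
  // Returns the number of live rows; rowIdMapping[0..live) holds the surviving row ids.
  static int applyPositionDeletes(
      Set<Long> deletedPositions,
      long batchStartPos,
      int numRows,
      int[] rowIdMapping,
      boolean[] isDeleted) {
    int liveRows = 0;
    for (int pos = 0; pos < numRows; pos += 1) {
      if (deletedPositions.contains(batchStartPos + pos)) {
        isDeleted[pos] = true; // e.g. positions 2 and 6 in the 8-row example
      } else {
        rowIdMapping[liveRows] = pos; // yields [0,1,3,4,5,7,-,-]
        liveRows += 1;
      }
    }
    return liveRows; // "Set Num records to 6"
  }
}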
The code for this class is heavily inspired from Spark's - * {@link ArrowColumnVector} The main difference is in how nullability checks are made in this class by relying on - * {@link NullabilityHolder} instead of the validity vector in the Arrow vector. + * Implementation of Spark's {@link ColumnVector} interface. The code for this class is heavily + * inspired from Spark's {@link ArrowColumnVector} The main difference is in how nullability checks + * are made in this class by relying on {@link NullabilityHolder} instead of the validity vector in + * the Arrow vector. */ public class IcebergArrowColumnVector extends ColumnVector { @@ -151,7 +151,8 @@ public ArrowColumnVector getChild(int ordinal) { return accessor.childColumn(ordinal); } - public ArrowVectorAccessor vectorAccessor() { + public ArrowVectorAccessor + vectorAccessor() { return accessor; } } diff --git a/spark/v3.2/spark/src/main/java/org/apache/iceberg/spark/data/vectorized/RowPositionColumnVector.java b/spark/v3.2/spark/src/main/java/org/apache/iceberg/spark/data/vectorized/RowPositionColumnVector.java index 58db4eb55d04..a389cd8286e5 100644 --- a/spark/v3.2/spark/src/main/java/org/apache/iceberg/spark/data/vectorized/RowPositionColumnVector.java +++ b/spark/v3.2/spark/src/main/java/org/apache/iceberg/spark/data/vectorized/RowPositionColumnVector.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.data.vectorized; import org.apache.iceberg.spark.SparkSchemaUtil; @@ -37,8 +36,7 @@ public class RowPositionColumnVector extends ColumnVector { } @Override - public void close() { - } + public void close() {} @Override public boolean hasNull() { diff --git a/spark/v3.2/spark/src/main/java/org/apache/iceberg/spark/data/vectorized/VectorizedSparkOrcReaders.java b/spark/v3.2/spark/src/main/java/org/apache/iceberg/spark/data/vectorized/VectorizedSparkOrcReaders.java index 418c25993a7e..7c3b825a62e7 100644 --- a/spark/v3.2/spark/src/main/java/org/apache/iceberg/spark/data/vectorized/VectorizedSparkOrcReaders.java +++ b/spark/v3.2/spark/src/main/java/org/apache/iceberg/spark/data/vectorized/VectorizedSparkOrcReaders.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.spark.data.vectorized; import java.util.List; @@ -47,23 +46,27 @@ public class VectorizedSparkOrcReaders { - private VectorizedSparkOrcReaders() { - } + private VectorizedSparkOrcReaders() {} - public static OrcBatchReader buildReader(Schema expectedSchema, TypeDescription fileSchema, - Map idToConstant) { - Converter converter = OrcSchemaWithTypeVisitor.visit(expectedSchema, fileSchema, new ReadBuilder(idToConstant)); + public static OrcBatchReader buildReader( + Schema expectedSchema, TypeDescription fileSchema, Map idToConstant) { + Converter converter = + OrcSchemaWithTypeVisitor.visit(expectedSchema, fileSchema, new ReadBuilder(idToConstant)); return new OrcBatchReader() { private long batchOffsetInFile; @Override public ColumnarBatch read(VectorizedRowBatch batch) { - BaseOrcColumnVector cv = (BaseOrcColumnVector) converter.convert(new StructColumnVector(batch.size, batch.cols), - batch.size, batchOffsetInFile); - ColumnarBatch columnarBatch = new ColumnarBatch(IntStream.range(0, expectedSchema.columns().size()) - .mapToObj(cv::getChild) - .toArray(ColumnVector[]::new)); + BaseOrcColumnVector cv = + (BaseOrcColumnVector) + converter.convert( + new StructColumnVector(batch.size, batch.cols), batch.size, batchOffsetInFile); + ColumnarBatch columnarBatch = + new ColumnarBatch( + IntStream.range(0, expectedSchema.columns().size()) + .mapToObj(cv::getChild) + .toArray(ColumnVector[]::new)); columnarBatch.setNumRows(batch.size); return columnarBatch; } @@ -76,8 +79,10 @@ public void setBatchContext(long batchOffsetInFile) { } private interface Converter { - ColumnVector convert(org.apache.orc.storage.ql.exec.vector.ColumnVector columnVector, int batchSize, - long batchOffsetInFile); + ColumnVector convert( + org.apache.orc.storage.ql.exec.vector.ColumnVector columnVector, + int batchSize, + long batchOffsetInFile); } private static class ReadBuilder extends OrcSchemaWithTypeVisitor { @@ -88,8 +93,11 @@ private ReadBuilder(Map idToConstant) { } @Override - public Converter record(Types.StructType iStruct, TypeDescription record, List names, - List fields) { + public Converter record( + Types.StructType iStruct, + TypeDescription record, + List names, + List fields) { return new StructConverter(iStruct, fields, idToConstant); } @@ -132,7 +140,8 @@ public Converter primitive(Type.PrimitiveType iPrimitive, TypeDescription primit primitiveValueReader = SparkOrcValueReaders.timestampTzs(); break; case DECIMAL: - primitiveValueReader = SparkOrcValueReaders.decimals(primitive.getPrecision(), primitive.getScale()); + primitiveValueReader = + SparkOrcValueReaders.decimals(primitive.getPrecision(), primitive.getScale()); break; case CHAR: case VARCHAR: @@ -146,7 +155,8 @@ public Converter primitive(Type.PrimitiveType iPrimitive, TypeDescription primit throw new IllegalArgumentException("Unhandled type " + primitive); } return (columnVector, batchSize, batchOffsetInFile) -> - new PrimitiveOrcColumnVector(iPrimitive, batchSize, columnVector, primitiveValueReader, batchOffsetInFile); + new PrimitiveOrcColumnVector( + iPrimitive, batchSize, columnVector, primitiveValueReader, batchOffsetInFile); } } @@ -155,15 +165,15 @@ private abstract static class BaseOrcColumnVector extends ColumnVector { private final int batchSize; private Integer numNulls; - BaseOrcColumnVector(Type type, int batchSize, org.apache.orc.storage.ql.exec.vector.ColumnVector vector) { + BaseOrcColumnVector( + Type type, int batchSize, org.apache.orc.storage.ql.exec.vector.ColumnVector vector) { 
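The read method above packs one converted Spark ColumnVector per projected column into a ColumnarBatch and sets the row count from the ORC batch size. A hedged sketch of just that wrapping step, using a plain on-heap vector instead of the adapter classes in this change:

import org.apache.spark.sql.execution.vectorized.OnHeapColumnVector;
import org.apache.spark.sql.types.DataTypes;
import org.apache.spark.sql.vectorized.ColumnVector;
import org.apache.spark.sql.vectorized.ColumnarBatch;

public class BatchWrapSketch {
  static ColumnarBatch wrap(int numRows) {
    OnHeapColumnVector ids = new OnHeapColumnVector(numRows, DataTypes.IntegerType);
    for (int row = 0; row < numRows; row += 1) {
      ids.putInt(row, row); // fill a single integer column
    }
    ColumnarBatch batch = new ColumnarBatch(new ColumnVector[] {ids});
    batch.setNumRows(numRows); // consumers rely on this, just like batch.size above
    return batch;
  }
}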
super(SparkSchemaUtil.convert(type)); this.vector = vector; this.batchSize = batchSize; } @Override - public void close() { - } + public void close() {} @Override public boolean hasNull() { @@ -278,8 +288,12 @@ private static class PrimitiveOrcColumnVector extends BaseOrcColumnVector { private final OrcValueReader primitiveValueReader; private final long batchOffsetInFile; - PrimitiveOrcColumnVector(Type type, int batchSize, org.apache.orc.storage.ql.exec.vector.ColumnVector vector, - OrcValueReader primitiveValueReader, long batchOffsetInFile) { + PrimitiveOrcColumnVector( + Type type, + int batchSize, + org.apache.orc.storage.ql.exec.vector.ColumnVector vector, + OrcValueReader primitiveValueReader, + long batchOffsetInFile) { super(type, batchSize, vector); this.vector = vector; this.primitiveValueReader = primitiveValueReader; @@ -313,7 +327,8 @@ public double getDouble(int rowId) { @Override public Decimal getDecimal(int rowId, int precision, int scale) { - // TODO: Is it okay to assume that (precision,scale) parameters == (precision,scale) of the decimal type + // TODO: Is it okay to assume that (precision,scale) parameters == (precision,scale) of the + // decimal type // and return a Decimal with (precision,scale) of the decimal type? return (Decimal) primitiveValueReader.read(vector, rowId); } @@ -339,16 +354,20 @@ private ArrayConverter(Types.ListType listType, Converter elementConverter) { } @Override - public ColumnVector convert(org.apache.orc.storage.ql.exec.vector.ColumnVector vector, int batchSize, - long batchOffsetInFile) { + public ColumnVector convert( + org.apache.orc.storage.ql.exec.vector.ColumnVector vector, + int batchSize, + long batchOffsetInFile) { ListColumnVector listVector = (ListColumnVector) vector; - ColumnVector elementVector = elementConverter.convert(listVector.child, batchSize, batchOffsetInFile); + ColumnVector elementVector = + elementConverter.convert(listVector.child, batchSize, batchOffsetInFile); return new BaseOrcColumnVector(listType, batchSize, vector) { @Override public ColumnarArray getArray(int rowId) { int index = getRowIndex(rowId); - return new ColumnarArray(elementVector, (int) listVector.offsets[index], (int) listVector.lengths[index]); + return new ColumnarArray( + elementVector, (int) listVector.offsets[index], (int) listVector.lengths[index]); } }; } @@ -366,17 +385,23 @@ private MapConverter(Types.MapType mapType, Converter keyConverter, Converter va } @Override - public ColumnVector convert(org.apache.orc.storage.ql.exec.vector.ColumnVector vector, int batchSize, - long batchOffsetInFile) { + public ColumnVector convert( + org.apache.orc.storage.ql.exec.vector.ColumnVector vector, + int batchSize, + long batchOffsetInFile) { MapColumnVector mapVector = (MapColumnVector) vector; ColumnVector keyVector = keyConverter.convert(mapVector.keys, batchSize, batchOffsetInFile); - ColumnVector valueVector = valueConverter.convert(mapVector.values, batchSize, batchOffsetInFile); + ColumnVector valueVector = + valueConverter.convert(mapVector.values, batchSize, batchOffsetInFile); return new BaseOrcColumnVector(mapType, batchSize, vector) { @Override public ColumnarMap getMap(int rowId) { int index = getRowIndex(rowId); - return new ColumnarMap(keyVector, valueVector, (int) mapVector.offsets[index], + return new ColumnarMap( + keyVector, + valueVector, + (int) mapVector.offsets[index], (int) mapVector.lengths[index]); } }; @@ -388,30 +413,37 @@ private static class StructConverter implements Converter { private final List fieldConverters; 
private final Map idToConstant; - private StructConverter(Types.StructType structType, List fieldConverters, - Map idToConstant) { + private StructConverter( + Types.StructType structType, + List fieldConverters, + Map idToConstant) { this.structType = structType; this.fieldConverters = fieldConverters; this.idToConstant = idToConstant; } @Override - public ColumnVector convert(org.apache.orc.storage.ql.exec.vector.ColumnVector vector, int batchSize, - long batchOffsetInFile) { + public ColumnVector convert( + org.apache.orc.storage.ql.exec.vector.ColumnVector vector, + int batchSize, + long batchOffsetInFile) { StructColumnVector structVector = (StructColumnVector) vector; List fields = structType.fields(); List fieldVectors = Lists.newArrayListWithExpectedSize(fields.size()); for (int pos = 0, vectorIndex = 0; pos < fields.size(); pos += 1) { Types.NestedField field = fields.get(pos); if (idToConstant.containsKey(field.fieldId())) { - fieldVectors.add(new ConstantColumnVector(field.type(), batchSize, idToConstant.get(field.fieldId()))); + fieldVectors.add( + new ConstantColumnVector(field.type(), batchSize, idToConstant.get(field.fieldId()))); } else if (field.equals(MetadataColumns.ROW_POSITION)) { fieldVectors.add(new RowPositionColumnVector(batchOffsetInFile)); } else if (field.equals(MetadataColumns.IS_DELETED)) { fieldVectors.add(new ConstantColumnVector(field.type(), batchSize, false)); } else { - fieldVectors.add(fieldConverters.get(vectorIndex) - .convert(structVector.fields[vectorIndex], batchSize, batchOffsetInFile)); + fieldVectors.add( + fieldConverters + .get(vectorIndex) + .convert(structVector.fields[vectorIndex], batchSize, batchOffsetInFile)); vectorIndex++; } } diff --git a/spark/v3.2/spark/src/main/java/org/apache/iceberg/spark/data/vectorized/VectorizedSparkParquetReaders.java b/spark/v3.2/spark/src/main/java/org/apache/iceberg/spark/data/vectorized/VectorizedSparkParquetReaders.java index 020b35f52844..bf85bdb7ed05 100644 --- a/spark/v3.2/spark/src/main/java/org/apache/iceberg/spark/data/vectorized/VectorizedSparkParquetReaders.java +++ b/spark/v3.2/spark/src/main/java/org/apache/iceberg/spark/data/vectorized/VectorizedSparkParquetReaders.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.spark.data.vectorized; import java.util.List; @@ -33,13 +32,10 @@ public class VectorizedSparkParquetReaders { - private VectorizedSparkParquetReaders() { - } + private VectorizedSparkParquetReaders() {} public static ColumnarBatchReader buildReader( - Schema expectedSchema, - MessageType fileSchema, - boolean setArrowValidityVector) { + Schema expectedSchema, MessageType fileSchema, boolean setArrowValidityVector) { return buildReader(expectedSchema, fileSchema, setArrowValidityVector, Maps.newHashMap()); } @@ -49,33 +45,46 @@ public static ColumnarBatchReader buildReader( boolean setArrowValidityVector, Map idToConstant) { return (ColumnarBatchReader) - TypeWithSchemaVisitor.visit(expectedSchema.asStruct(), fileSchema, + TypeWithSchemaVisitor.visit( + expectedSchema.asStruct(), + fileSchema, new VectorizedReaderBuilder( - expectedSchema, fileSchema, setArrowValidityVector, - idToConstant, ColumnarBatchReader::new)); + expectedSchema, + fileSchema, + setArrowValidityVector, + idToConstant, + ColumnarBatchReader::new)); } - public static ColumnarBatchReader buildReader(Schema expectedSchema, - MessageType fileSchema, - boolean setArrowValidityVector, - Map idToConstant, - DeleteFilter deleteFilter) { + public static ColumnarBatchReader buildReader( + Schema expectedSchema, + MessageType fileSchema, + boolean setArrowValidityVector, + Map idToConstant, + DeleteFilter deleteFilter) { return (ColumnarBatchReader) - TypeWithSchemaVisitor.visit(expectedSchema.asStruct(), fileSchema, + TypeWithSchemaVisitor.visit( + expectedSchema.asStruct(), + fileSchema, new ReaderBuilder( - expectedSchema, fileSchema, setArrowValidityVector, - idToConstant, ColumnarBatchReader::new, deleteFilter)); + expectedSchema, + fileSchema, + setArrowValidityVector, + idToConstant, + ColumnarBatchReader::new, + deleteFilter)); } private static class ReaderBuilder extends VectorizedReaderBuilder { private final DeleteFilter deleteFilter; - ReaderBuilder(Schema expectedSchema, - MessageType parquetSchema, - boolean setArrowValidityVector, - Map idToConstant, - Function>, VectorizedReader> readerFactory, - DeleteFilter deleteFilter) { + ReaderBuilder( + Schema expectedSchema, + MessageType parquetSchema, + boolean setArrowValidityVector, + Map idToConstant, + Function>, VectorizedReader> readerFactory, + DeleteFilter deleteFilter) { super(expectedSchema, parquetSchema, setArrowValidityVector, idToConstant, readerFactory); this.deleteFilter = deleteFilter; } diff --git a/spark/v3.2/spark/src/main/java/org/apache/iceberg/spark/procedures/AddFilesProcedure.java b/spark/v3.2/spark/src/main/java/org/apache/iceberg/spark/procedures/AddFilesProcedure.java index a902887f2965..ea0ca4c5a9b9 100644 --- a/spark/v3.2/spark/src/main/java/org/apache/iceberg/spark/procedures/AddFilesProcedure.java +++ b/spark/v3.2/spark/src/main/java/org/apache/iceberg/spark/procedures/AddFilesProcedure.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.spark.procedures; import java.util.Collections; @@ -56,16 +55,19 @@ class AddFilesProcedure extends BaseProcedure { private static final Joiner.MapJoiner MAP_JOINER = Joiner.on(",").withKeyValueSeparator("="); - private static final ProcedureParameter[] PARAMETERS = new ProcedureParameter[]{ - ProcedureParameter.required("table", DataTypes.StringType), - ProcedureParameter.required("source_table", DataTypes.StringType), - ProcedureParameter.optional("partition_filter", STRING_MAP), - ProcedureParameter.optional("check_duplicate_files", DataTypes.BooleanType) - }; - - private static final StructType OUTPUT_TYPE = new StructType(new StructField[]{ - new StructField("added_files_count", DataTypes.LongType, false, Metadata.empty()) - }); + private static final ProcedureParameter[] PARAMETERS = + new ProcedureParameter[] { + ProcedureParameter.required("table", DataTypes.StringType), + ProcedureParameter.required("source_table", DataTypes.StringType), + ProcedureParameter.optional("partition_filter", STRING_MAP), + ProcedureParameter.optional("check_duplicate_files", DataTypes.BooleanType) + }; + + private static final StructType OUTPUT_TYPE = + new StructType( + new StructField[] { + new StructField("added_files_count", DataTypes.LongType, false, Metadata.empty()) + }); private AddFilesProcedure(TableCatalog tableCatalog) { super(tableCatalog); @@ -95,15 +97,19 @@ public InternalRow[] call(InternalRow args) { Identifier tableIdent = toIdentifier(args.getString(0), PARAMETERS[0].name()); CatalogPlugin sessionCat = spark().sessionState().catalogManager().v2SessionCatalog(); - Identifier sourceIdent = toCatalogAndIdentifier(args.getString(1), PARAMETERS[1].name(), sessionCat).identifier(); + Identifier sourceIdent = + toCatalogAndIdentifier(args.getString(1), PARAMETERS[1].name(), sessionCat).identifier(); Map partitionFilter = Maps.newHashMap(); if (!args.isNullAt(2)) { - args.getMap(2).foreach(DataTypes.StringType, DataTypes.StringType, - (k, v) -> { - partitionFilter.put(k.toString(), v.toString()); - return BoxedUnit.UNIT; - }); + args.getMap(2) + .foreach( + DataTypes.StringType, + DataTypes.StringType, + (k, v) -> { + partitionFilter.put(k.toString(), v.toString()); + return BoxedUnit.UNIT; + }); } boolean checkDuplicateFiles; @@ -113,36 +119,42 @@ public InternalRow[] call(InternalRow args) { checkDuplicateFiles = args.getBoolean(3); } - long addedFilesCount = importToIceberg(tableIdent, sourceIdent, partitionFilter, checkDuplicateFiles); - return new InternalRow[]{newInternalRow(addedFilesCount)}; + long addedFilesCount = + importToIceberg(tableIdent, sourceIdent, partitionFilter, checkDuplicateFiles); + return new InternalRow[] {newInternalRow(addedFilesCount)}; } private boolean isFileIdentifier(Identifier ident) { String[] namespace = ident.namespace(); - return namespace.length == 1 && - (namespace[0].equalsIgnoreCase("orc") || - namespace[0].equalsIgnoreCase("parquet") || - namespace[0].equalsIgnoreCase("avro")); + return namespace.length == 1 + && (namespace[0].equalsIgnoreCase("orc") + || namespace[0].equalsIgnoreCase("parquet") + || namespace[0].equalsIgnoreCase("avro")); } - private long importToIceberg(Identifier destIdent, Identifier sourceIdent, Map partitionFilter, - boolean checkDuplicateFiles) { - return modifyIcebergTable(destIdent, table -> { - - validatePartitionSpec(table, partitionFilter); - ensureNameMappingPresent(table); - - if (isFileIdentifier(sourceIdent)) { - Path sourcePath = new Path(sourceIdent.name()); - String format = 
sourceIdent.namespace()[0]; - importFileTable(table, sourcePath, format, partitionFilter, checkDuplicateFiles); - } else { - importCatalogTable(table, sourceIdent, partitionFilter, checkDuplicateFiles); - } - - Snapshot snapshot = table.currentSnapshot(); - return Long.parseLong(snapshot.summary().getOrDefault(SnapshotSummary.ADDED_FILES_PROP, "0")); - }); + private long importToIceberg( + Identifier destIdent, + Identifier sourceIdent, + Map partitionFilter, + boolean checkDuplicateFiles) { + return modifyIcebergTable( + destIdent, + table -> { + validatePartitionSpec(table, partitionFilter); + ensureNameMappingPresent(table); + + if (isFileIdentifier(sourceIdent)) { + Path sourcePath = new Path(sourceIdent.name()); + String format = sourceIdent.namespace()[0]; + importFileTable(table, sourcePath, format, partitionFilter, checkDuplicateFiles); + } else { + importCatalogTable(table, sourceIdent, partitionFilter, checkDuplicateFiles); + } + + Snapshot snapshot = table.currentSnapshot(); + return Long.parseLong( + snapshot.summary().getOrDefault(SnapshotSummary.ADDED_FILES_PROP, "0")); + }); } private static void ensureNameMappingPresent(Table table) { @@ -150,46 +162,59 @@ private static void ensureNameMappingPresent(Table table) { // Forces Name based resolution instead of position based resolution NameMapping mapping = MappingUtil.create(table.schema()); String mappingJson = NameMappingParser.toJson(mapping); - table.updateProperties() - .set(TableProperties.DEFAULT_NAME_MAPPING, mappingJson) - .commit(); + table.updateProperties().set(TableProperties.DEFAULT_NAME_MAPPING, mappingJson).commit(); } } - private void importFileTable(Table table, Path tableLocation, String format, Map partitionFilter, - boolean checkDuplicateFiles) { + private void importFileTable( + Table table, + Path tableLocation, + String format, + Map partitionFilter, + boolean checkDuplicateFiles) { // List Partitions via Spark InMemory file search interface List partitions = Spark3Util.getPartitions(spark(), tableLocation, format, partitionFilter); if (table.spec().isUnpartitioned()) { - Preconditions.checkArgument(partitions.isEmpty(), "Cannot add partitioned files to an unpartitioned table"); - Preconditions.checkArgument(partitionFilter.isEmpty(), "Cannot use a partition filter when importing" + - "to an unpartitioned table"); + Preconditions.checkArgument( + partitions.isEmpty(), "Cannot add partitioned files to an unpartitioned table"); + Preconditions.checkArgument( + partitionFilter.isEmpty(), + "Cannot use a partition filter when importing" + "to an unpartitioned table"); // Build a Global Partition for the source - SparkPartition partition = new SparkPartition(Collections.emptyMap(), tableLocation.toString(), format); + SparkPartition partition = + new SparkPartition(Collections.emptyMap(), tableLocation.toString(), format); importPartitions(table, ImmutableList.of(partition), checkDuplicateFiles); } else { - Preconditions.checkArgument(!partitions.isEmpty(), - "Cannot find any matching partitions in table %s", partitions); + Preconditions.checkArgument( + !partitions.isEmpty(), "Cannot find any matching partitions in table %s", partitions); importPartitions(table, partitions, checkDuplicateFiles); } } - private void importCatalogTable(Table table, Identifier sourceIdent, Map partitionFilter, - boolean checkDuplicateFiles) { + private void importCatalogTable( + Table table, + Identifier sourceIdent, + Map partitionFilter, + boolean checkDuplicateFiles) { String stagingLocation = getMetadataLocation(table); 
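For context, the procedure defined above is invoked through Spark SQL's CALL syntax with the parameters declared in PARAMETERS. A hedged usage sketch; the catalog, table, and path names are placeholders.

import org.apache.spark.sql.SparkSession;

public class AddFilesCallSketch {
  public static void main(String[] args) {
    SparkSession spark = SparkSession.builder().appName("add-files-example").getOrCreate();
    // Adds existing parquet files under a path to an Iceberg table, checking for duplicates.
    spark.sql(
        "CALL my_catalog.system.add_files("
            + "table => 'db.target_tbl', "
            + "source_table => '`parquet`.`/data/source_path`', "
            + "check_duplicate_files => true)");
    spark.stop();
  }
}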
TableIdentifier sourceTableIdentifier = Spark3Util.toV1TableIdentifier(sourceIdent); - SparkTableUtil.importSparkTable(spark(), sourceTableIdentifier, table, stagingLocation, partitionFilter, + SparkTableUtil.importSparkTable( + spark(), + sourceTableIdentifier, + table, + stagingLocation, + partitionFilter, checkDuplicateFiles); } - private void importPartitions(Table table, List partitions, - boolean checkDuplicateFiles) { + private void importPartitions( + Table table, List partitions, boolean checkDuplicateFiles) { String stagingLocation = getMetadataLocation(table); - SparkTableUtil.importSparkPartitions(spark(), partitions, table, table.spec(), stagingLocation, - checkDuplicateFiles); + SparkTableUtil.importSparkPartitions( + spark(), partitions, table, table.spec(), stagingLocation, checkDuplicateFiles); } private String getMetadataLocation(Table table) { @@ -204,38 +229,51 @@ public String description() { private void validatePartitionSpec(Table table, Map partitionFilter) { List partitionFields = table.spec().fields(); - Set partitionNames = table.spec().fields().stream().map(PartitionField::name).collect(Collectors.toSet()); + Set partitionNames = + table.spec().fields().stream().map(PartitionField::name).collect(Collectors.toSet()); boolean tablePartitioned = !partitionFields.isEmpty(); boolean partitionSpecPassed = !partitionFilter.isEmpty(); // Check for any non-identity partition columns - List nonIdentityFields = partitionFields.stream() - .filter(x -> !x.transform().isIdentity()) - .collect(Collectors.toList()); - Preconditions.checkArgument(nonIdentityFields.isEmpty(), - "Cannot add data files to target table %s because that table is partitioned and contains non-identity" + - "partition transforms which will not be compatible. Found non-identity fields %s", - table.name(), nonIdentityFields); + List nonIdentityFields = + partitionFields.stream() + .filter(x -> !x.transform().isIdentity()) + .collect(Collectors.toList()); + Preconditions.checkArgument( + nonIdentityFields.isEmpty(), + "Cannot add data files to target table %s because that table is partitioned and contains non-identity" + + "partition transforms which will not be compatible. Found non-identity fields %s", + table.name(), + nonIdentityFields); if (tablePartitioned && partitionSpecPassed) { // Check to see there are sufficient partition columns to satisfy the filter - Preconditions.checkArgument(partitionFields.size() >= partitionFilter.size(), - "Cannot add data files to target table %s because that table is partitioned, " + - "but the number of columns in the provided partition filter (%s) " + - "is greater than the number of partitioned columns in table (%s)", - table.name(), partitionFilter.size(), partitionFields.size()); + Preconditions.checkArgument( + partitionFields.size() >= partitionFilter.size(), + "Cannot add data files to target table %s because that table is partitioned, " + + "but the number of columns in the provided partition filter (%s) " + + "is greater than the number of partitioned columns in table (%s)", + table.name(), + partitionFilter.size(), + partitionFields.size()); // Check for any filters of non existent columns - List unMatchedFilters = partitionFilter.keySet().stream() - .filter(filterName -> !partitionNames.contains(filterName)) - .collect(Collectors.toList()); - Preconditions.checkArgument(unMatchedFilters.isEmpty(), - "Cannot add files to target table %s. %s is partitioned but the specified partition filter " + - "refers to columns that are not partitioned: '%s' . 
Valid partition columns %s", - table.name(), table.name(), unMatchedFilters, String.join(",", partitionNames)); + List unMatchedFilters = + partitionFilter.keySet().stream() + .filter(filterName -> !partitionNames.contains(filterName)) + .collect(Collectors.toList()); + Preconditions.checkArgument( + unMatchedFilters.isEmpty(), + "Cannot add files to target table %s. %s is partitioned but the specified partition filter " + + "refers to columns that are not partitioned: '%s' . Valid partition columns %s", + table.name(), + table.name(), + unMatchedFilters, + String.join(",", partitionNames)); } else { - Preconditions.checkArgument(!partitionSpecPassed, + Preconditions.checkArgument( + !partitionSpecPassed, "Cannot use partition filter with an unpartitioned table %s", table.name()); } diff --git a/spark/v3.2/spark/src/main/java/org/apache/iceberg/spark/procedures/AncestorsOfProcedure.java b/spark/v3.2/spark/src/main/java/org/apache/iceberg/spark/procedures/AncestorsOfProcedure.java index bfbca05a5744..60d6247411b6 100644 --- a/spark/v3.2/spark/src/main/java/org/apache/iceberg/spark/procedures/AncestorsOfProcedure.java +++ b/spark/v3.2/spark/src/main/java/org/apache/iceberg/spark/procedures/AncestorsOfProcedure.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.procedures; import java.util.List; @@ -35,15 +34,18 @@ public class AncestorsOfProcedure extends BaseProcedure { - private static final ProcedureParameter[] PARAMETERS = new ProcedureParameter[] { - ProcedureParameter.required("table", DataTypes.StringType), - ProcedureParameter.optional("snapshot_id", DataTypes.LongType), + private static final ProcedureParameter[] PARAMETERS = + new ProcedureParameter[] { + ProcedureParameter.required("table", DataTypes.StringType), + ProcedureParameter.optional("snapshot_id", DataTypes.LongType), }; - private static final StructType OUTPUT_TYPE = new StructType(new StructField[] { - new StructField("snapshot_id", DataTypes.LongType, true, Metadata.empty()), - new StructField("timestamp", DataTypes.LongType, true, Metadata.empty()) - }); + private static final StructType OUTPUT_TYPE = + new StructType( + new StructField[] { + new StructField("snapshot_id", DataTypes.LongType, true, Metadata.empty()), + new StructField("timestamp", DataTypes.LongType, true, Metadata.empty()) + }); private AncestorsOfProcedure(TableCatalog tableCatalog) { super(tableCatalog); @@ -77,11 +79,13 @@ public InternalRow[] call(InternalRow args) { Table icebergTable = sparkTable.table(); if (toSnapshotId == null) { - toSnapshotId = icebergTable.currentSnapshot() != null ? icebergTable.currentSnapshot().snapshotId() : -1; + toSnapshotId = + icebergTable.currentSnapshot() != null ? 
icebergTable.currentSnapshot().snapshotId() : -1; } - List snapshotIds = Lists.newArrayList( - SnapshotUtil.ancestorIdsBetween(toSnapshotId, null, icebergTable::snapshot)); + List snapshotIds = + Lists.newArrayList( + SnapshotUtil.ancestorIdsBetween(toSnapshotId, null, icebergTable::snapshot)); return toOutputRow(icebergTable, snapshotIds); } diff --git a/spark/v3.2/spark/src/main/java/org/apache/iceberg/spark/procedures/BaseProcedure.java b/spark/v3.2/spark/src/main/java/org/apache/iceberg/spark/procedures/BaseProcedure.java index 3bb936e32dcb..86364dc262b2 100644 --- a/spark/v3.2/spark/src/main/java/org/apache/iceberg/spark/procedures/BaseProcedure.java +++ b/spark/v3.2/spark/src/main/java/org/apache/iceberg/spark/procedures/BaseProcedure.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.procedures; import java.util.concurrent.ExecutorService; @@ -48,7 +47,8 @@ import scala.Option; abstract class BaseProcedure implements Procedure { - protected static final DataType STRING_MAP = DataTypes.createMapType(DataTypes.StringType, DataTypes.StringType); + protected static final DataType STRING_MAP = + DataTypes.createMapType(DataTypes.StringType, DataTypes.StringType); private final SparkSession spark; private final TableCatalog tableCatalog; @@ -92,7 +92,8 @@ protected T withIcebergTable(Identifier ident, Function T execute(Identifier ident, boolean refreshSparkCache, Function func) { + private T execute( + Identifier ident, boolean refreshSparkCache, Function func) { SparkTable sparkTable = loadSparkTable(ident); org.apache.iceberg.Table icebergTable = sparkTable.table(); @@ -106,38 +107,47 @@ private T execute(Identifier ident, boolean refreshSparkCache, Function - * Note: this procedure invalidates all cached Spark plans that reference the affected table. + * A procedure that applies changes in a given snapshot and creates a new snapshot which will be set + * as the current snapshot in a table. + * + *

    Note: this procedure invalidates all cached Spark plans that reference the affected + * table. * * @see org.apache.iceberg.ManageSnapshots#cherrypick(long) */ class CherrypickSnapshotProcedure extends BaseProcedure { - private static final ProcedureParameter[] PARAMETERS = new ProcedureParameter[]{ - ProcedureParameter.required("table", DataTypes.StringType), - ProcedureParameter.required("snapshot_id", DataTypes.LongType) - }; + private static final ProcedureParameter[] PARAMETERS = + new ProcedureParameter[] { + ProcedureParameter.required("table", DataTypes.StringType), + ProcedureParameter.required("snapshot_id", DataTypes.LongType) + }; - private static final StructType OUTPUT_TYPE = new StructType(new StructField[]{ - new StructField("source_snapshot_id", DataTypes.LongType, false, Metadata.empty()), - new StructField("current_snapshot_id", DataTypes.LongType, false, Metadata.empty()) - }); + private static final StructType OUTPUT_TYPE = + new StructType( + new StructField[] { + new StructField("source_snapshot_id", DataTypes.LongType, false, Metadata.empty()), + new StructField("current_snapshot_id", DataTypes.LongType, false, Metadata.empty()) + }); public static ProcedureBuilder builder() { return new BaseProcedure.Builder() { @@ -78,16 +81,16 @@ public InternalRow[] call(InternalRow args) { Identifier tableIdent = toIdentifier(args.getString(0), PARAMETERS[0].name()); long snapshotId = args.getLong(1); - return modifyIcebergTable(tableIdent, table -> { - table.manageSnapshots() - .cherrypick(snapshotId) - .commit(); + return modifyIcebergTable( + tableIdent, + table -> { + table.manageSnapshots().cherrypick(snapshotId).commit(); - Snapshot currentSnapshot = table.currentSnapshot(); + Snapshot currentSnapshot = table.currentSnapshot(); - InternalRow outputRow = newInternalRow(snapshotId, currentSnapshot.snapshotId()); - return new InternalRow[]{outputRow}; - }); + InternalRow outputRow = newInternalRow(snapshotId, currentSnapshot.snapshotId()); + return new InternalRow[] {outputRow}; + }); } @Override diff --git a/spark/v3.2/spark/src/main/java/org/apache/iceberg/spark/procedures/ExpireSnapshotsProcedure.java b/spark/v3.2/spark/src/main/java/org/apache/iceberg/spark/procedures/ExpireSnapshotsProcedure.java index 272cacc4d438..69cab80a51bc 100644 --- a/spark/v3.2/spark/src/main/java/org/apache/iceberg/spark/procedures/ExpireSnapshotsProcedure.java +++ b/spark/v3.2/spark/src/main/java/org/apache/iceberg/spark/procedures/ExpireSnapshotsProcedure.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
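As a usage note for CherrypickSnapshotProcedure above: assuming it is registered under the documented name cherrypick_snapshot, a call looks roughly like the sketch below; the catalog and table names are placeholders.

import org.apache.spark.sql.SparkSession;

class CherrypickExample {
  // Applies the changes of a staged snapshot and makes the result the current snapshot.
  static void cherrypick(SparkSession spark, long stagedSnapshotId) {
    spark.sql(
        "CALL my_catalog.system.cherrypick_snapshot('db.target', " + stagedSnapshotId + ")");
  }
}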
*/ - package org.apache.iceberg.spark.procedures; import org.apache.iceberg.Table; @@ -42,21 +41,28 @@ */ public class ExpireSnapshotsProcedure extends BaseProcedure { - private static final ProcedureParameter[] PARAMETERS = new ProcedureParameter[] { - ProcedureParameter.required("table", DataTypes.StringType), - ProcedureParameter.optional("older_than", DataTypes.TimestampType), - ProcedureParameter.optional("retain_last", DataTypes.IntegerType), - ProcedureParameter.optional("max_concurrent_deletes", DataTypes.IntegerType), - ProcedureParameter.optional("stream_results", DataTypes.BooleanType) - }; - - private static final StructType OUTPUT_TYPE = new StructType(new StructField[]{ - new StructField("deleted_data_files_count", DataTypes.LongType, true, Metadata.empty()), - new StructField("deleted_position_delete_files_count", DataTypes.LongType, true, Metadata.empty()), - new StructField("deleted_equality_delete_files_count", DataTypes.LongType, true, Metadata.empty()), - new StructField("deleted_manifest_files_count", DataTypes.LongType, true, Metadata.empty()), - new StructField("deleted_manifest_lists_count", DataTypes.LongType, true, Metadata.empty()) - }); + private static final ProcedureParameter[] PARAMETERS = + new ProcedureParameter[] { + ProcedureParameter.required("table", DataTypes.StringType), + ProcedureParameter.optional("older_than", DataTypes.TimestampType), + ProcedureParameter.optional("retain_last", DataTypes.IntegerType), + ProcedureParameter.optional("max_concurrent_deletes", DataTypes.IntegerType), + ProcedureParameter.optional("stream_results", DataTypes.BooleanType) + }; + + private static final StructType OUTPUT_TYPE = + new StructType( + new StructField[] { + new StructField("deleted_data_files_count", DataTypes.LongType, true, Metadata.empty()), + new StructField( + "deleted_position_delete_files_count", DataTypes.LongType, true, Metadata.empty()), + new StructField( + "deleted_equality_delete_files_count", DataTypes.LongType, true, Metadata.empty()), + new StructField( + "deleted_manifest_files_count", DataTypes.LongType, true, Metadata.empty()), + new StructField( + "deleted_manifest_lists_count", DataTypes.LongType, true, Metadata.empty()) + }); public static ProcedureBuilder builder() { return new BaseProcedure.Builder() { @@ -89,43 +95,47 @@ public InternalRow[] call(InternalRow args) { Integer maxConcurrentDeletes = args.isNullAt(3) ? null : args.getInt(3); Boolean streamResult = args.isNullAt(4) ? 
null : args.getBoolean(4); - Preconditions.checkArgument(maxConcurrentDeletes == null || maxConcurrentDeletes > 0, + Preconditions.checkArgument( + maxConcurrentDeletes == null || maxConcurrentDeletes > 0, "max_concurrent_deletes should have value > 0, value: " + maxConcurrentDeletes); - return modifyIcebergTable(tableIdent, table -> { - ExpireSnapshots action = actions().expireSnapshots(table); + return modifyIcebergTable( + tableIdent, + table -> { + ExpireSnapshots action = actions().expireSnapshots(table); - if (olderThanMillis != null) { - action.expireOlderThan(olderThanMillis); - } + if (olderThanMillis != null) { + action.expireOlderThan(olderThanMillis); + } - if (retainLastNum != null) { - action.retainLast(retainLastNum); - } + if (retainLastNum != null) { + action.retainLast(retainLastNum); + } - if (maxConcurrentDeletes != null && maxConcurrentDeletes > 0) { - action.executeDeleteWith(executorService(maxConcurrentDeletes, "expire-snapshots")); - } + if (maxConcurrentDeletes != null && maxConcurrentDeletes > 0) { + action.executeDeleteWith(executorService(maxConcurrentDeletes, "expire-snapshots")); + } - if (streamResult != null) { - action.option(ExpireSnapshotsSparkAction.STREAM_RESULTS, Boolean.toString(streamResult)); - } + if (streamResult != null) { + action.option( + ExpireSnapshotsSparkAction.STREAM_RESULTS, Boolean.toString(streamResult)); + } - ExpireSnapshots.Result result = action.execute(); + ExpireSnapshots.Result result = action.execute(); - return toOutputRows(result); - }); + return toOutputRows(result); + }); } private InternalRow[] toOutputRows(ExpireSnapshots.Result result) { - InternalRow row = newInternalRow( - result.deletedDataFilesCount(), - result.deletedPositionDeleteFilesCount(), - result.deletedEqualityDeleteFilesCount(), - result.deletedManifestsCount(), - result.deletedManifestListsCount() - ); - return new InternalRow[]{row}; + InternalRow row = + newInternalRow( + result.deletedDataFilesCount(), + result.deletedPositionDeleteFilesCount(), + result.deletedEqualityDeleteFilesCount(), + result.deletedManifestsCount(), + result.deletedManifestListsCount()); + return new InternalRow[] {row}; } @Override diff --git a/spark/v3.2/spark/src/main/java/org/apache/iceberg/spark/procedures/MigrateTableProcedure.java b/spark/v3.2/spark/src/main/java/org/apache/iceberg/spark/procedures/MigrateTableProcedure.java index 2f6841924f8c..a49dd7d526b0 100644 --- a/spark/v3.2/spark/src/main/java/org/apache/iceberg/spark/procedures/MigrateTableProcedure.java +++ b/spark/v3.2/spark/src/main/java/org/apache/iceberg/spark/procedures/MigrateTableProcedure.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
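For ExpireSnapshotsProcedure above, which forwards its optional arguments to the expire-snapshots Spark action, a hedged usage sketch follows; all names are placeholders and the named-argument CALL form is assumed from the Iceberg procedure documentation.

import org.apache.spark.sql.SparkSession;

class ExpireSnapshotsExample {
  // Expires snapshots older than a timestamp while keeping at least the last five.
  static void expire(SparkSession spark) {
    spark.sql(
        "CALL my_catalog.system.expire_snapshots("
            + "table => 'db.target', "
            + "older_than => TIMESTAMP '2022-01-01 00:00:00', "
            + "retain_last => 5)");
  }
}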
*/ - package org.apache.iceberg.spark.procedures; import java.util.Map; @@ -35,14 +34,17 @@ import scala.runtime.BoxedUnit; class MigrateTableProcedure extends BaseProcedure { - private static final ProcedureParameter[] PARAMETERS = new ProcedureParameter[]{ - ProcedureParameter.required("table", DataTypes.StringType), - ProcedureParameter.optional("properties", STRING_MAP) - }; + private static final ProcedureParameter[] PARAMETERS = + new ProcedureParameter[] { + ProcedureParameter.required("table", DataTypes.StringType), + ProcedureParameter.optional("properties", STRING_MAP) + }; - private static final StructType OUTPUT_TYPE = new StructType(new StructField[]{ - new StructField("migrated_files_count", DataTypes.LongType, false, Metadata.empty()) - }); + private static final StructType OUTPUT_TYPE = + new StructType( + new StructField[] { + new StructField("migrated_files_count", DataTypes.LongType, false, Metadata.empty()) + }); private MigrateTableProcedure(TableCatalog tableCatalog) { super(tableCatalog); @@ -70,19 +72,24 @@ public StructType outputType() { @Override public InternalRow[] call(InternalRow args) { String tableName = args.getString(0); - Preconditions.checkArgument(tableName != null && !tableName.isEmpty(), + Preconditions.checkArgument( + tableName != null && !tableName.isEmpty(), "Cannot handle an empty identifier for argument table"); Map properties = Maps.newHashMap(); if (!args.isNullAt(1)) { - args.getMap(1).foreach(DataTypes.StringType, DataTypes.StringType, - (k, v) -> { - properties.put(k.toString(), v.toString()); - return BoxedUnit.UNIT; - }); + args.getMap(1) + .foreach( + DataTypes.StringType, + DataTypes.StringType, + (k, v) -> { + properties.put(k.toString(), v.toString()); + return BoxedUnit.UNIT; + }); } - MigrateTable.Result result = SparkActions.get().migrateTable(tableName).tableProperties(properties).execute(); + MigrateTable.Result result = + SparkActions.get().migrateTable(tableName).tableProperties(properties).execute(); return new InternalRow[] {newInternalRow(result.migratedDataFilesCount())}; } diff --git a/spark/v3.2/spark/src/main/java/org/apache/iceberg/spark/procedures/PublishChangesProcedure.java b/spark/v3.2/spark/src/main/java/org/apache/iceberg/spark/procedures/PublishChangesProcedure.java index e86d698e1646..eb6c762ed51e 100644 --- a/spark/v3.2/spark/src/main/java/org/apache/iceberg/spark/procedures/PublishChangesProcedure.java +++ b/spark/v3.2/spark/src/main/java/org/apache/iceberg/spark/procedures/PublishChangesProcedure.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.procedures; import java.util.Optional; @@ -35,24 +34,28 @@ import org.apache.spark.sql.types.StructType; /** - * A procedure that applies changes in a snapshot created within a Write-Audit-Publish workflow with a wap_id and - * creates a new snapshot which will be set as the current snapshot in a table. - *

    - * Note: this procedure invalidates all cached Spark plans that reference the affected table. + * A procedure that applies changes in a snapshot created within a Write-Audit-Publish workflow with + * a wap_id and creates a new snapshot which will be set as the current snapshot in a table. + * + *

    Note: this procedure invalidates all cached Spark plans that reference the affected + * table. * * @see org.apache.iceberg.ManageSnapshots#cherrypick(long) */ class PublishChangesProcedure extends BaseProcedure { - private static final ProcedureParameter[] PARAMETERS = new ProcedureParameter[]{ - ProcedureParameter.required("table", DataTypes.StringType), - ProcedureParameter.required("wap_id", DataTypes.StringType) - }; + private static final ProcedureParameter[] PARAMETERS = + new ProcedureParameter[] { + ProcedureParameter.required("table", DataTypes.StringType), + ProcedureParameter.required("wap_id", DataTypes.StringType) + }; - private static final StructType OUTPUT_TYPE = new StructType(new StructField[]{ - new StructField("source_snapshot_id", DataTypes.LongType, false, Metadata.empty()), - new StructField("current_snapshot_id", DataTypes.LongType, false, Metadata.empty()) - }); + private static final StructType OUTPUT_TYPE = + new StructType( + new StructField[] { + new StructField("source_snapshot_id", DataTypes.LongType, false, Metadata.empty()), + new StructField("current_snapshot_id", DataTypes.LongType, false, Metadata.empty()) + }); public static ProcedureBuilder builder() { return new Builder() { @@ -82,23 +85,27 @@ public InternalRow[] call(InternalRow args) { Identifier tableIdent = toIdentifier(args.getString(0), PARAMETERS[0].name()); String wapId = args.getString(1); - return modifyIcebergTable(tableIdent, table -> { - Optional wapSnapshot = Optional.ofNullable( - Iterables.find(table.snapshots(), snapshot -> wapId.equals(WapUtil.stagedWapId(snapshot)), null)); - if (!wapSnapshot.isPresent()) { - throw new ValidationException(String.format("Cannot apply unknown WAP ID '%s'", wapId)); - } + return modifyIcebergTable( + tableIdent, + table -> { + Optional wapSnapshot = + Optional.ofNullable( + Iterables.find( + table.snapshots(), + snapshot -> wapId.equals(WapUtil.stagedWapId(snapshot)), + null)); + if (!wapSnapshot.isPresent()) { + throw new ValidationException(String.format("Cannot apply unknown WAP ID '%s'", wapId)); + } - long wapSnapshotId = wapSnapshot.get().snapshotId(); - table.manageSnapshots() - .cherrypick(wapSnapshotId) - .commit(); + long wapSnapshotId = wapSnapshot.get().snapshotId(); + table.manageSnapshots().cherrypick(wapSnapshotId).commit(); - Snapshot currentSnapshot = table.currentSnapshot(); + Snapshot currentSnapshot = table.currentSnapshot(); - InternalRow outputRow = newInternalRow(wapSnapshotId, currentSnapshot.snapshotId()); - return new InternalRow[]{outputRow}; - }); + InternalRow outputRow = newInternalRow(wapSnapshotId, currentSnapshot.snapshotId()); + return new InternalRow[] {outputRow}; + }); } @Override diff --git a/spark/v3.2/spark/src/main/java/org/apache/iceberg/spark/procedures/RegisterTableProcedure.java b/spark/v3.2/spark/src/main/java/org/apache/iceberg/spark/procedures/RegisterTableProcedure.java index 85e374d4800f..857949e052c8 100644 --- a/spark/v3.2/spark/src/main/java/org/apache/iceberg/spark/procedures/RegisterTableProcedure.java +++ b/spark/v3.2/spark/src/main/java/org/apache/iceberg/spark/procedures/RegisterTableProcedure.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
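For PublishChangesProcedure above, a sketch of the Write-Audit-Publish step it completes, assuming the documented procedure name publish_changes; the catalog, table, and WAP id are placeholders.

import org.apache.spark.sql.SparkSession;

class PublishChangesExample {
  // Publishes a snapshot that was staged under the given WAP id
  // (e.g. written while spark.wap.id was set and write.wap.enabled=true on the table).
  static void publish(SparkSession spark, String wapId) {
    spark.sql("CALL my_catalog.system.publish_changes('db.target', '" + wapId + "')");
  }
}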
*/ - package org.apache.iceberg.spark.procedures; import org.apache.iceberg.Snapshot; @@ -37,16 +36,19 @@ import org.apache.spark.sql.types.StructType; class RegisterTableProcedure extends BaseProcedure { - private static final ProcedureParameter[] PARAMETERS = new ProcedureParameter[]{ - ProcedureParameter.required("table", DataTypes.StringType), - ProcedureParameter.required("metadata_file", DataTypes.StringType) - }; + private static final ProcedureParameter[] PARAMETERS = + new ProcedureParameter[] { + ProcedureParameter.required("table", DataTypes.StringType), + ProcedureParameter.required("metadata_file", DataTypes.StringType) + }; - private static final StructType OUTPUT_TYPE = new StructType(new StructField[]{ - new StructField("current_snapshot_id", DataTypes.LongType, true, Metadata.empty()), - new StructField("total_records_count", DataTypes.LongType, true, Metadata.empty()), - new StructField("total_data_files_count", DataTypes.LongType, true, Metadata.empty()) - }); + private static final StructType OUTPUT_TYPE = + new StructType( + new StructField[] { + new StructField("current_snapshot_id", DataTypes.LongType, true, Metadata.empty()), + new StructField("total_records_count", DataTypes.LongType, true, Metadata.empty()), + new StructField("total_data_files_count", DataTypes.LongType, true, Metadata.empty()) + }); private RegisterTableProcedure(TableCatalog tableCatalog) { super(tableCatalog); @@ -73,11 +75,14 @@ public StructType outputType() { @Override public InternalRow[] call(InternalRow args) { - TableIdentifier tableName = Spark3Util.identifierToTableIdentifier(toIdentifier(args.getString(0), "table")); + TableIdentifier tableName = + Spark3Util.identifierToTableIdentifier(toIdentifier(args.getString(0), "table")); String metadataFile = args.getString(1); - Preconditions.checkArgument(tableCatalog() instanceof HasIcebergCatalog, + Preconditions.checkArgument( + tableCatalog() instanceof HasIcebergCatalog, "Cannot use Register Table in a non-Iceberg catalog"); - Preconditions.checkArgument(metadataFile != null && !metadataFile.isEmpty(), + Preconditions.checkArgument( + metadataFile != null && !metadataFile.isEmpty(), "Cannot handle an empty argument metadata_file"); Catalog icebergCatalog = ((HasIcebergCatalog) tableCatalog()).icebergCatalog(); @@ -89,8 +94,10 @@ public InternalRow[] call(InternalRow args) { Snapshot currentSnapshot = table.currentSnapshot(); if (currentSnapshot != null) { currentSnapshotId = currentSnapshot.snapshotId(); - totalDataFiles = Long.parseLong(currentSnapshot.summary().get(SnapshotSummary.TOTAL_DATA_FILES_PROP)); - totalRecords = Long.parseLong(currentSnapshot.summary().get(SnapshotSummary.TOTAL_RECORDS_PROP)); + totalDataFiles = + Long.parseLong(currentSnapshot.summary().get(SnapshotSummary.TOTAL_DATA_FILES_PROP)); + totalRecords = + Long.parseLong(currentSnapshot.summary().get(SnapshotSummary.TOTAL_RECORDS_PROP)); } return new InternalRow[] {newInternalRow(currentSnapshotId, totalRecords, totalDataFiles)}; @@ -101,4 +108,3 @@ public String description() { return "RegisterTableProcedure"; } } - diff --git a/spark/v3.2/spark/src/main/java/org/apache/iceberg/spark/procedures/RemoveOrphanFilesProcedure.java b/spark/v3.2/spark/src/main/java/org/apache/iceberg/spark/procedures/RemoveOrphanFilesProcedure.java index 92b0ba30b5ba..24377c32d0ac 100644 --- a/spark/v3.2/spark/src/main/java/org/apache/iceberg/spark/procedures/RemoveOrphanFilesProcedure.java +++ 
b/spark/v3.2/spark/src/main/java/org/apache/iceberg/spark/procedures/RemoveOrphanFilesProcedure.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.procedures; import java.util.Map; @@ -49,21 +48,24 @@ */ public class RemoveOrphanFilesProcedure extends BaseProcedure { - private static final ProcedureParameter[] PARAMETERS = new ProcedureParameter[]{ - ProcedureParameter.required("table", DataTypes.StringType), - ProcedureParameter.optional("older_than", DataTypes.TimestampType), - ProcedureParameter.optional("location", DataTypes.StringType), - ProcedureParameter.optional("dry_run", DataTypes.BooleanType), - ProcedureParameter.optional("max_concurrent_deletes", DataTypes.IntegerType), - ProcedureParameter.optional("file_list_view", DataTypes.StringType), - ProcedureParameter.optional("equal_schemes", STRING_MAP), - ProcedureParameter.optional("equal_authorities", STRING_MAP), - ProcedureParameter.optional("prefix_mismatch_mode", DataTypes.StringType), - }; - - private static final StructType OUTPUT_TYPE = new StructType(new StructField[]{ - new StructField("orphan_file_location", DataTypes.StringType, false, Metadata.empty()) - }); + private static final ProcedureParameter[] PARAMETERS = + new ProcedureParameter[] { + ProcedureParameter.required("table", DataTypes.StringType), + ProcedureParameter.optional("older_than", DataTypes.TimestampType), + ProcedureParameter.optional("location", DataTypes.StringType), + ProcedureParameter.optional("dry_run", DataTypes.BooleanType), + ProcedureParameter.optional("max_concurrent_deletes", DataTypes.IntegerType), + ProcedureParameter.optional("file_list_view", DataTypes.StringType), + ProcedureParameter.optional("equal_schemes", STRING_MAP), + ProcedureParameter.optional("equal_authorities", STRING_MAP), + ProcedureParameter.optional("prefix_mismatch_mode", DataTypes.StringType), + }; + + private static final StructType OUTPUT_TYPE = + new StructType( + new StructField[] { + new StructField("orphan_file_location", DataTypes.StringType, false, Metadata.empty()) + }); public static ProcedureBuilder builder() { return new BaseProcedure.Builder() { @@ -98,68 +100,77 @@ public InternalRow[] call(InternalRow args) { Integer maxConcurrentDeletes = args.isNullAt(4) ? null : args.getInt(4); String fileListView = args.isNullAt(5) ? 
null : args.getString(5); - Preconditions.checkArgument(maxConcurrentDeletes == null || maxConcurrentDeletes > 0, - "max_concurrent_deletes should have value > 0, value: " + maxConcurrentDeletes); + Preconditions.checkArgument( + maxConcurrentDeletes == null || maxConcurrentDeletes > 0, + "max_concurrent_deletes should have value > 0, value: " + maxConcurrentDeletes); Map equalSchemes = Maps.newHashMap(); if (!args.isNullAt(6)) { - args.getMap(6).foreach(DataTypes.StringType, DataTypes.StringType, - (k, v) -> { - equalSchemes.put(k.toString(), v.toString()); - return BoxedUnit.UNIT; - }); + args.getMap(6) + .foreach( + DataTypes.StringType, + DataTypes.StringType, + (k, v) -> { + equalSchemes.put(k.toString(), v.toString()); + return BoxedUnit.UNIT; + }); } Map equalAuthorities = Maps.newHashMap(); if (!args.isNullAt(7)) { - args.getMap(7).foreach(DataTypes.StringType, DataTypes.StringType, - (k, v) -> { - equalSchemes.put(k.toString(), v.toString()); - return BoxedUnit.UNIT; - }); + args.getMap(7) + .foreach( + DataTypes.StringType, + DataTypes.StringType, + (k, v) -> { + equalSchemes.put(k.toString(), v.toString()); + return BoxedUnit.UNIT; + }); } - PrefixMismatchMode prefixMismatchMode = args.isNullAt(8) ? null : - PrefixMismatchMode.fromString(args.getString(8)); + PrefixMismatchMode prefixMismatchMode = + args.isNullAt(8) ? null : PrefixMismatchMode.fromString(args.getString(8)); - return withIcebergTable(tableIdent, table -> { - DeleteOrphanFilesSparkAction action = actions().deleteOrphanFiles(table); + return withIcebergTable( + tableIdent, + table -> { + DeleteOrphanFilesSparkAction action = actions().deleteOrphanFiles(table); - if (olderThanMillis != null) { - boolean isTesting = Boolean.parseBoolean(spark().conf().get("spark.testing", "false")); - if (!isTesting) { - validateInterval(olderThanMillis); - } - action.olderThan(olderThanMillis); - } + if (olderThanMillis != null) { + boolean isTesting = Boolean.parseBoolean(spark().conf().get("spark.testing", "false")); + if (!isTesting) { + validateInterval(olderThanMillis); + } + action.olderThan(olderThanMillis); + } - if (location != null) { - action.location(location); - } + if (location != null) { + action.location(location); + } - if (dryRun) { - action.deleteWith(file -> { }); - } + if (dryRun) { + action.deleteWith(file -> {}); + } - if (maxConcurrentDeletes != null && maxConcurrentDeletes > 0) { - action.executeDeleteWith(executorService(maxConcurrentDeletes, "remove-orphans")); - } + if (maxConcurrentDeletes != null && maxConcurrentDeletes > 0) { + action.executeDeleteWith(executorService(maxConcurrentDeletes, "remove-orphans")); + } - if (fileListView != null) { - action.compareToFileList(spark().table(fileListView)); - } + if (fileListView != null) { + action.compareToFileList(spark().table(fileListView)); + } - action.equalSchemes(equalSchemes); - action.equalAuthorities(equalAuthorities); + action.equalSchemes(equalSchemes); + action.equalAuthorities(equalAuthorities); - if (prefixMismatchMode != null) { - action.prefixMismatchMode(prefixMismatchMode); - } + if (prefixMismatchMode != null) { + action.prefixMismatchMode(prefixMismatchMode); + } - DeleteOrphanFiles.Result result = action.execute(); + DeleteOrphanFiles.Result result = action.execute(); - return toOutputRows(result); - }); + return toOutputRows(result); + }); } private InternalRow[] toOutputRows(DeleteOrphanFiles.Result result) { @@ -181,11 +192,11 @@ private void validateInterval(long olderThanMillis) { long intervalMillis = System.currentTimeMillis() - 
olderThanMillis; if (intervalMillis < TimeUnit.DAYS.toMillis(1)) { throw new IllegalArgumentException( - "Cannot remove orphan files with an interval less than 24 hours. Executing this " + - "procedure with a short interval may corrupt the table if other operations are happening " + - "at the same time. If you are absolutely confident that no concurrent operations will be " + - "affected by removing orphan files with such a short interval, you can use the Action API " + - "to remove orphan files with an arbitrary interval."); + "Cannot remove orphan files with an interval less than 24 hours. Executing this " + + "procedure with a short interval may corrupt the table if other operations are happening " + + "at the same time. If you are absolutely confident that no concurrent operations will be " + + "affected by removing orphan files with such a short interval, you can use the Action API " + + "to remove orphan files with an arbitrary interval."); } } diff --git a/spark/v3.2/spark/src/main/java/org/apache/iceberg/spark/procedures/RewriteDataFilesProcedure.java b/spark/v3.2/spark/src/main/java/org/apache/iceberg/spark/procedures/RewriteDataFilesProcedure.java index 5c71dbf55529..a0636a75b950 100644 --- a/spark/v3.2/spark/src/main/java/org/apache/iceberg/spark/procedures/RewriteDataFilesProcedure.java +++ b/spark/v3.2/spark/src/main/java/org/apache/iceberg/spark/procedures/RewriteDataFilesProcedure.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.procedures; import java.util.List; @@ -52,19 +51,24 @@ */ class RewriteDataFilesProcedure extends BaseProcedure { - private static final ProcedureParameter[] PARAMETERS = new ProcedureParameter[]{ - ProcedureParameter.required("table", DataTypes.StringType), - ProcedureParameter.optional("strategy", DataTypes.StringType), - ProcedureParameter.optional("sort_order", DataTypes.StringType), - ProcedureParameter.optional("options", STRING_MAP), - ProcedureParameter.optional("where", DataTypes.StringType) - }; + private static final ProcedureParameter[] PARAMETERS = + new ProcedureParameter[] { + ProcedureParameter.required("table", DataTypes.StringType), + ProcedureParameter.optional("strategy", DataTypes.StringType), + ProcedureParameter.optional("sort_order", DataTypes.StringType), + ProcedureParameter.optional("options", STRING_MAP), + ProcedureParameter.optional("where", DataTypes.StringType) + }; // counts are not nullable since the action result is never null - private static final StructType OUTPUT_TYPE = new StructType(new StructField[]{ - new StructField("rewritten_data_files_count", DataTypes.IntegerType, false, Metadata.empty()), - new StructField("added_data_files_count", DataTypes.IntegerType, false, Metadata.empty()) - }); + private static final StructType OUTPUT_TYPE = + new StructType( + new StructField[] { + new StructField( + "rewritten_data_files_count", DataTypes.IntegerType, false, Metadata.empty()), + new StructField( + "added_data_files_count", DataTypes.IntegerType, false, Metadata.empty()) + }); public static ProcedureBuilder builder() { return new Builder() { @@ -93,35 +97,40 @@ public StructType outputType() { public InternalRow[] call(InternalRow args) { Identifier tableIdent = toIdentifier(args.getString(0), PARAMETERS[0].name()); - return modifyIcebergTable(tableIdent, table -> { - String quotedFullIdentifier = Spark3Util.quotedFullIdentifier(tableCatalog().name(), tableIdent); - RewriteDataFiles action = 
actions().rewriteDataFiles(table); + return modifyIcebergTable( + tableIdent, + table -> { + String quotedFullIdentifier = + Spark3Util.quotedFullIdentifier(tableCatalog().name(), tableIdent); + RewriteDataFiles action = actions().rewriteDataFiles(table); - String strategy = args.isNullAt(1) ? null : args.getString(1); - String sortOrderString = args.isNullAt(2) ? null : args.getString(2); + String strategy = args.isNullAt(1) ? null : args.getString(1); + String sortOrderString = args.isNullAt(2) ? null : args.getString(2); - if (strategy != null || sortOrderString != null) { - action = checkAndApplyStrategy(action, strategy, sortOrderString, table.schema()); - } + if (strategy != null || sortOrderString != null) { + action = checkAndApplyStrategy(action, strategy, sortOrderString, table.schema()); + } - if (!args.isNullAt(3)) { - action = checkAndApplyOptions(args, action); - } + if (!args.isNullAt(3)) { + action = checkAndApplyOptions(args, action); + } - String where = args.isNullAt(4) ? null : args.getString(4); + String where = args.isNullAt(4) ? null : args.getString(4); - action = checkAndApplyFilter(action, where, quotedFullIdentifier); + action = checkAndApplyFilter(action, where, quotedFullIdentifier); - RewriteDataFiles.Result result = action.execute(); + RewriteDataFiles.Result result = action.execute(); - return toOutputRows(result); - }); + return toOutputRows(result); + }); } - private RewriteDataFiles checkAndApplyFilter(RewriteDataFiles action, String where, String tableName) { + private RewriteDataFiles checkAndApplyFilter( + RewriteDataFiles action, String where, String tableName) { if (where != null) { try { - Expression expression = SparkExpressionConverter.collectResolvedSparkExpression(spark(), tableName, where); + Expression expression = + SparkExpressionConverter.collectResolvedSparkExpression(spark(), tableName, where); return action.filter(SparkExpressionConverter.convertToIcebergExpression(expression)); } catch (AnalysisException e) { throw new IllegalArgumentException("Cannot parse predicates in where option: " + where); @@ -132,7 +141,10 @@ private RewriteDataFiles checkAndApplyFilter(RewriteDataFiles action, String whe private RewriteDataFiles checkAndApplyOptions(InternalRow args, RewriteDataFiles action) { Map options = Maps.newHashMap(); - args.getMap(3).foreach(DataTypes.StringType, DataTypes.StringType, + args.getMap(3) + .foreach( + DataTypes.StringType, + DataTypes.StringType, (k, v) -> { options.put(k.toString(), v.toString()); return BoxedUnit.UNIT; @@ -140,18 +152,20 @@ private RewriteDataFiles checkAndApplyOptions(InternalRow args, RewriteDataFiles return action.options(options); } - private RewriteDataFiles checkAndApplyStrategy(RewriteDataFiles action, String strategy, String sortOrderString, - Schema schema) { + private RewriteDataFiles checkAndApplyStrategy( + RewriteDataFiles action, String strategy, String sortOrderString, Schema schema) { List zOrderTerms = Lists.newArrayList(); List sortOrderFields = Lists.newArrayList(); if (sortOrderString != null) { - ExtendedParser.parseSortOrder(spark(), sortOrderString).forEach(field -> { - if (field.term() instanceof Zorder) { - zOrderTerms.add((Zorder) field.term()); - } else { - sortOrderFields.add(field); - } - }); + ExtendedParser.parseSortOrder(spark(), sortOrderString) + .forEach( + field -> { + if (field.term() instanceof Zorder) { + zOrderTerms.add((Zorder) field.term()); + } else { + sortOrderFields.add(field); + } + }); if (!zOrderTerms.isEmpty() && !sortOrderFields.isEmpty()) { // TODO: we 
need to allow this in future when SparkAction has handling for this. @@ -160,11 +174,14 @@ private RewriteDataFiles checkAndApplyStrategy(RewriteDataFiles action, String s } } - // caller of this function ensures that between strategy and sortOrder, at least one of them is not null. + // caller of this function ensures that between strategy and sortOrder, at least one of them is + // not null. if (strategy == null || strategy.equalsIgnoreCase("sort")) { if (!zOrderTerms.isEmpty()) { - String[] columnNames = zOrderTerms.stream().flatMap( - zOrder -> zOrder.refs().stream().map(NamedReference::name)).toArray(String[]::new); + String[] columnNames = + zOrderTerms.stream() + .flatMap(zOrder -> zOrder.refs().stream().map(NamedReference::name)) + .toArray(String[]::new); return action.zOrder(columnNames); } else { return action.sort(buildSortOrder(sortOrderFields, schema)); @@ -173,7 +190,8 @@ private RewriteDataFiles checkAndApplyStrategy(RewriteDataFiles action, String s if (strategy.equalsIgnoreCase("binpack")) { RewriteDataFiles rewriteDataFiles = action.binPack(); if (sortOrderString != null) { - // calling below method to throw the error as user has set both binpack strategy and sort order + // calling below method to throw the error as user has set both binpack strategy and sort + // order return rewriteDataFiles.sort(buildSortOrder(sortOrderFields, schema)); } return rewriteDataFiles; @@ -183,9 +201,11 @@ private RewriteDataFiles checkAndApplyStrategy(RewriteDataFiles action, String s } } - private SortOrder buildSortOrder(List rawOrderFields, Schema schema) { + private SortOrder buildSortOrder( + List rawOrderFields, Schema schema) { SortOrder.Builder builder = SortOrder.builderFor(schema); - rawOrderFields.forEach(rawField -> builder.sortBy(rawField.term(), rawField.direction(), rawField.nullOrder())); + rawOrderFields.forEach( + rawField -> builder.sortBy(rawField.term(), rawField.direction(), rawField.nullOrder())); return builder.build(); } @@ -193,7 +213,7 @@ private InternalRow[] toOutputRows(RewriteDataFiles.Result result) { int rewrittenDataFilesCount = result.rewrittenDataFilesCount(); int addedDataFilesCount = result.addedDataFilesCount(); InternalRow row = newInternalRow(rewrittenDataFilesCount, addedDataFilesCount); - return new InternalRow[]{row}; + return new InternalRow[] {row}; } @Override diff --git a/spark/v3.2/spark/src/main/java/org/apache/iceberg/spark/procedures/RewriteManifestsProcedure.java b/spark/v3.2/spark/src/main/java/org/apache/iceberg/spark/procedures/RewriteManifestsProcedure.java index 1cc76501c066..c8becc7e5a0f 100644 --- a/spark/v3.2/spark/src/main/java/org/apache/iceberg/spark/procedures/RewriteManifestsProcedure.java +++ b/spark/v3.2/spark/src/main/java/org/apache/iceberg/spark/procedures/RewriteManifestsProcedure.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.procedures; import org.apache.iceberg.Table; @@ -36,23 +35,28 @@ /** * A procedure that rewrites manifests in a table. - *

    - * Note: this procedure invalidates all cached Spark plans that reference the affected table. + * + *

    Note: this procedure invalidates all cached Spark plans that reference the affected + * table. * * @see SparkActions#rewriteManifests(Table) () */ class RewriteManifestsProcedure extends BaseProcedure { - private static final ProcedureParameter[] PARAMETERS = new ProcedureParameter[]{ - ProcedureParameter.required("table", DataTypes.StringType), - ProcedureParameter.optional("use_caching", DataTypes.BooleanType) - }; + private static final ProcedureParameter[] PARAMETERS = + new ProcedureParameter[] { + ProcedureParameter.required("table", DataTypes.StringType), + ProcedureParameter.optional("use_caching", DataTypes.BooleanType) + }; // counts are not nullable since the action result is never null - private static final StructType OUTPUT_TYPE = new StructType(new StructField[]{ - new StructField("rewritten_manifests_count", DataTypes.IntegerType, false, Metadata.empty()), - new StructField("added_manifests_count", DataTypes.IntegerType, false, Metadata.empty()) - }); + private static final StructType OUTPUT_TYPE = + new StructType( + new StructField[] { + new StructField( + "rewritten_manifests_count", DataTypes.IntegerType, false, Metadata.empty()), + new StructField("added_manifests_count", DataTypes.IntegerType, false, Metadata.empty()) + }); public static ProcedureBuilder builder() { return new BaseProcedure.Builder() { @@ -82,24 +86,26 @@ public InternalRow[] call(InternalRow args) { Identifier tableIdent = toIdentifier(args.getString(0), PARAMETERS[0].name()); Boolean useCaching = args.isNullAt(1) ? null : args.getBoolean(1); - return modifyIcebergTable(tableIdent, table -> { - RewriteManifestsSparkAction action = actions().rewriteManifests(table); + return modifyIcebergTable( + tableIdent, + table -> { + RewriteManifestsSparkAction action = actions().rewriteManifests(table); - if (useCaching != null) { - action.option(RewriteManifestsSparkAction.USE_CACHING, useCaching.toString()); - } + if (useCaching != null) { + action.option(RewriteManifestsSparkAction.USE_CACHING, useCaching.toString()); + } - RewriteManifests.Result result = action.execute(); + RewriteManifests.Result result = action.execute(); - return toOutputRows(result); - }); + return toOutputRows(result); + }); } private InternalRow[] toOutputRows(RewriteManifests.Result result) { int rewrittenManifestsCount = Iterables.size(result.rewrittenManifests()); int addedManifestsCount = Iterables.size(result.addedManifests()); InternalRow row = newInternalRow(rewrittenManifestsCount, addedManifestsCount); - return new InternalRow[]{row}; + return new InternalRow[] {row}; } @Override diff --git a/spark/v3.2/spark/src/main/java/org/apache/iceberg/spark/procedures/RollbackToSnapshotProcedure.java b/spark/v3.2/spark/src/main/java/org/apache/iceberg/spark/procedures/RollbackToSnapshotProcedure.java index 7cf5b0c77bb2..49cc1a5ceae3 100644 --- a/spark/v3.2/spark/src/main/java/org/apache/iceberg/spark/procedures/RollbackToSnapshotProcedure.java +++ b/spark/v3.2/spark/src/main/java/org/apache/iceberg/spark/procedures/RollbackToSnapshotProcedure.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.procedures; import org.apache.iceberg.Snapshot; @@ -32,22 +31,26 @@ /** * A procedure that rollbacks a table to a specific snapshot id. - *

    - * Note: this procedure invalidates all cached Spark plans that reference the affected table. + * + *

    Note: this procedure invalidates all cached Spark plans that reference the affected + * table. * * @see org.apache.iceberg.ManageSnapshots#rollbackTo(long) */ class RollbackToSnapshotProcedure extends BaseProcedure { - private static final ProcedureParameter[] PARAMETERS = new ProcedureParameter[]{ - ProcedureParameter.required("table", DataTypes.StringType), - ProcedureParameter.required("snapshot_id", DataTypes.LongType) - }; + private static final ProcedureParameter[] PARAMETERS = + new ProcedureParameter[] { + ProcedureParameter.required("table", DataTypes.StringType), + ProcedureParameter.required("snapshot_id", DataTypes.LongType) + }; - private static final StructType OUTPUT_TYPE = new StructType(new StructField[]{ - new StructField("previous_snapshot_id", DataTypes.LongType, false, Metadata.empty()), - new StructField("current_snapshot_id", DataTypes.LongType, false, Metadata.empty()) - }); + private static final StructType OUTPUT_TYPE = + new StructType( + new StructField[] { + new StructField("previous_snapshot_id", DataTypes.LongType, false, Metadata.empty()), + new StructField("current_snapshot_id", DataTypes.LongType, false, Metadata.empty()) + }); public static ProcedureBuilder builder() { return new BaseProcedure.Builder() { @@ -77,16 +80,16 @@ public InternalRow[] call(InternalRow args) { Identifier tableIdent = toIdentifier(args.getString(0), PARAMETERS[0].name()); long snapshotId = args.getLong(1); - return modifyIcebergTable(tableIdent, table -> { - Snapshot previousSnapshot = table.currentSnapshot(); + return modifyIcebergTable( + tableIdent, + table -> { + Snapshot previousSnapshot = table.currentSnapshot(); - table.manageSnapshots() - .rollbackTo(snapshotId) - .commit(); + table.manageSnapshots().rollbackTo(snapshotId).commit(); - InternalRow outputRow = newInternalRow(previousSnapshot.snapshotId(), snapshotId); - return new InternalRow[]{outputRow}; - }); + InternalRow outputRow = newInternalRow(previousSnapshot.snapshotId(), snapshotId); + return new InternalRow[] {outputRow}; + }); } @Override diff --git a/spark/v3.2/spark/src/main/java/org/apache/iceberg/spark/procedures/RollbackToTimestampProcedure.java b/spark/v3.2/spark/src/main/java/org/apache/iceberg/spark/procedures/RollbackToTimestampProcedure.java index 519a46c6dbb8..059725f0c152 100644 --- a/spark/v3.2/spark/src/main/java/org/apache/iceberg/spark/procedures/RollbackToTimestampProcedure.java +++ b/spark/v3.2/spark/src/main/java/org/apache/iceberg/spark/procedures/RollbackToTimestampProcedure.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.procedures; import org.apache.iceberg.Snapshot; @@ -33,22 +32,26 @@ /** * A procedure that rollbacks a table to a given point in time. - *

    - * Note: this procedure invalidates all cached Spark plans that reference the affected table. + * + *

    Note: this procedure invalidates all cached Spark plans that reference the affected + * table. * * @see org.apache.iceberg.ManageSnapshots#rollbackToTime(long) */ class RollbackToTimestampProcedure extends BaseProcedure { - private static final ProcedureParameter[] PARAMETERS = new ProcedureParameter[]{ - ProcedureParameter.required("table", DataTypes.StringType), - ProcedureParameter.required("timestamp", DataTypes.TimestampType) - }; + private static final ProcedureParameter[] PARAMETERS = + new ProcedureParameter[] { + ProcedureParameter.required("table", DataTypes.StringType), + ProcedureParameter.required("timestamp", DataTypes.TimestampType) + }; - private static final StructType OUTPUT_TYPE = new StructType(new StructField[]{ - new StructField("previous_snapshot_id", DataTypes.LongType, false, Metadata.empty()), - new StructField("current_snapshot_id", DataTypes.LongType, false, Metadata.empty()) - }); + private static final StructType OUTPUT_TYPE = + new StructType( + new StructField[] { + new StructField("previous_snapshot_id", DataTypes.LongType, false, Metadata.empty()), + new StructField("current_snapshot_id", DataTypes.LongType, false, Metadata.empty()) + }); public static ProcedureBuilder builder() { return new BaseProcedure.Builder() { @@ -79,18 +82,19 @@ public InternalRow[] call(InternalRow args) { // timestamps in Spark have microsecond precision so this conversion is lossy long timestampMillis = DateTimeUtil.microsToMillis(args.getLong(1)); - return modifyIcebergTable(tableIdent, table -> { - Snapshot previousSnapshot = table.currentSnapshot(); + return modifyIcebergTable( + tableIdent, + table -> { + Snapshot previousSnapshot = table.currentSnapshot(); - table.manageSnapshots() - .rollbackToTime(timestampMillis) - .commit(); + table.manageSnapshots().rollbackToTime(timestampMillis).commit(); - Snapshot currentSnapshot = table.currentSnapshot(); + Snapshot currentSnapshot = table.currentSnapshot(); - InternalRow outputRow = newInternalRow(previousSnapshot.snapshotId(), currentSnapshot.snapshotId()); - return new InternalRow[]{outputRow}; - }); + InternalRow outputRow = + newInternalRow(previousSnapshot.snapshotId(), currentSnapshot.snapshotId()); + return new InternalRow[] {outputRow}; + }); } @Override diff --git a/spark/v3.2/spark/src/main/java/org/apache/iceberg/spark/procedures/SetCurrentSnapshotProcedure.java b/spark/v3.2/spark/src/main/java/org/apache/iceberg/spark/procedures/SetCurrentSnapshotProcedure.java index 274ca19fc107..f8f8049c22b6 100644 --- a/spark/v3.2/spark/src/main/java/org/apache/iceberg/spark/procedures/SetCurrentSnapshotProcedure.java +++ b/spark/v3.2/spark/src/main/java/org/apache/iceberg/spark/procedures/SetCurrentSnapshotProcedure.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.procedures; import org.apache.iceberg.Snapshot; @@ -32,22 +31,26 @@ /** * A procedure that sets the current snapshot in a table. - *

    - * Note: this procedure invalidates all cached Spark plans that reference the affected table. + * + *

    Note: this procedure invalidates all cached Spark plans that reference the affected + * table. * * @see org.apache.iceberg.ManageSnapshots#setCurrentSnapshot(long) */ class SetCurrentSnapshotProcedure extends BaseProcedure { - private static final ProcedureParameter[] PARAMETERS = new ProcedureParameter[] { - ProcedureParameter.required("table", DataTypes.StringType), - ProcedureParameter.required("snapshot_id", DataTypes.LongType) - }; + private static final ProcedureParameter[] PARAMETERS = + new ProcedureParameter[] { + ProcedureParameter.required("table", DataTypes.StringType), + ProcedureParameter.required("snapshot_id", DataTypes.LongType) + }; - private static final StructType OUTPUT_TYPE = new StructType(new StructField[]{ - new StructField("previous_snapshot_id", DataTypes.LongType, true, Metadata.empty()), - new StructField("current_snapshot_id", DataTypes.LongType, false, Metadata.empty()) - }); + private static final StructType OUTPUT_TYPE = + new StructType( + new StructField[] { + new StructField("previous_snapshot_id", DataTypes.LongType, true, Metadata.empty()), + new StructField("current_snapshot_id", DataTypes.LongType, false, Metadata.empty()) + }); public static ProcedureBuilder builder() { return new BaseProcedure.Builder() { @@ -77,17 +80,17 @@ public InternalRow[] call(InternalRow args) { Identifier tableIdent = toIdentifier(args.getString(0), PARAMETERS[0].name()); long snapshotId = args.getLong(1); - return modifyIcebergTable(tableIdent, table -> { - Snapshot previousSnapshot = table.currentSnapshot(); - Long previousSnapshotId = previousSnapshot != null ? previousSnapshot.snapshotId() : null; + return modifyIcebergTable( + tableIdent, + table -> { + Snapshot previousSnapshot = table.currentSnapshot(); + Long previousSnapshotId = previousSnapshot != null ? previousSnapshot.snapshotId() : null; - table.manageSnapshots() - .setCurrentSnapshot(snapshotId) - .commit(); + table.manageSnapshots().setCurrentSnapshot(snapshotId).commit(); - InternalRow outputRow = newInternalRow(previousSnapshotId, snapshotId); - return new InternalRow[]{outputRow}; - }); + InternalRow outputRow = newInternalRow(previousSnapshotId, snapshotId); + return new InternalRow[] {outputRow}; + }); } @Override diff --git a/spark/v3.2/spark/src/main/java/org/apache/iceberg/spark/procedures/SnapshotTableProcedure.java b/spark/v3.2/spark/src/main/java/org/apache/iceberg/spark/procedures/SnapshotTableProcedure.java index 96e293d6b1da..7a015a51e8ed 100644 --- a/spark/v3.2/spark/src/main/java/org/apache/iceberg/spark/procedures/SnapshotTableProcedure.java +++ b/spark/v3.2/spark/src/main/java/org/apache/iceberg/spark/procedures/SnapshotTableProcedure.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
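The three snapshot-management procedures reformatted above (rollback to a snapshot id, rollback to a timestamp, set the current snapshot) are typically driven by positional CALL statements such as the sketch below; identifiers and values are placeholders and the procedure names are assumed from the Iceberg documentation.

import org.apache.spark.sql.SparkSession;

class SnapshotManagementExamples {
  static void rollbackToSnapshot(SparkSession spark, long snapshotId) {
    spark.sql("CALL my_catalog.system.rollback_to_snapshot('db.target', " + snapshotId + ")");
  }

  static void rollbackToTimestamp(SparkSession spark) {
    // Spark passes the timestamp with microsecond precision; the procedure truncates it to millis.
    spark.sql(
        "CALL my_catalog.system.rollback_to_timestamp("
            + "'db.target', TIMESTAMP '2022-01-01 00:00:00')");
  }

  static void setCurrentSnapshot(SparkSession spark, long snapshotId) {
    spark.sql("CALL my_catalog.system.set_current_snapshot('db.target', " + snapshotId + ")");
  }
}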
*/ - package org.apache.iceberg.spark.procedures; import java.util.Map; @@ -34,16 +33,19 @@ import scala.runtime.BoxedUnit; class SnapshotTableProcedure extends BaseProcedure { - private static final ProcedureParameter[] PARAMETERS = new ProcedureParameter[]{ - ProcedureParameter.required("source_table", DataTypes.StringType), - ProcedureParameter.required("table", DataTypes.StringType), - ProcedureParameter.optional("location", DataTypes.StringType), - ProcedureParameter.optional("properties", STRING_MAP) - }; + private static final ProcedureParameter[] PARAMETERS = + new ProcedureParameter[] { + ProcedureParameter.required("source_table", DataTypes.StringType), + ProcedureParameter.required("table", DataTypes.StringType), + ProcedureParameter.optional("location", DataTypes.StringType), + ProcedureParameter.optional("properties", STRING_MAP) + }; - private static final StructType OUTPUT_TYPE = new StructType(new StructField[]{ - new StructField("imported_files_count", DataTypes.LongType, false, Metadata.empty()) - }); + private static final StructType OUTPUT_TYPE = + new StructType( + new StructField[] { + new StructField("imported_files_count", DataTypes.LongType, false, Metadata.empty()) + }); private SnapshotTableProcedure(TableCatalog tableCatalog) { super(tableCatalog); @@ -71,23 +73,28 @@ public StructType outputType() { @Override public InternalRow[] call(InternalRow args) { String source = args.getString(0); - Preconditions.checkArgument(source != null && !source.isEmpty(), + Preconditions.checkArgument( + source != null && !source.isEmpty(), "Cannot handle an empty identifier for argument source_table"); String dest = args.getString(1); - Preconditions.checkArgument(dest != null && !dest.isEmpty(), - "Cannot handle an empty identifier for argument table"); + Preconditions.checkArgument( + dest != null && !dest.isEmpty(), "Cannot handle an empty identifier for argument table"); String snapshotLocation = args.isNullAt(2) ? null : args.getString(2); Map properties = Maps.newHashMap(); if (!args.isNullAt(3)) { - args.getMap(3).foreach(DataTypes.StringType, DataTypes.StringType, - (k, v) -> { - properties.put(k.toString(), v.toString()); - return BoxedUnit.UNIT; - }); + args.getMap(3) + .foreach( + DataTypes.StringType, + DataTypes.StringType, + (k, v) -> { + properties.put(k.toString(), v.toString()); + return BoxedUnit.UNIT; + }); } - Preconditions.checkArgument(!source.equals(dest), + Preconditions.checkArgument( + !source.equals(dest), "Cannot create a snapshot with the same name as the source of the snapshot."); SnapshotTable action = SparkActions.get().snapshotTable(source).as(dest); @@ -103,5 +110,4 @@ public InternalRow[] call(InternalRow args) { public String description() { return "SnapshotTableProcedure"; } - } diff --git a/spark/v3.2/spark/src/main/java/org/apache/iceberg/spark/procedures/SparkProcedures.java b/spark/v3.2/spark/src/main/java/org/apache/iceberg/spark/procedures/SparkProcedures.java index a7c036e1c6ec..6d59cb876b1e 100644 --- a/spark/v3.2/spark/src/main/java/org/apache/iceberg/spark/procedures/SparkProcedures.java +++ b/spark/v3.2/spark/src/main/java/org/apache/iceberg/spark/procedures/SparkProcedures.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
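For SnapshotTableProcedure above, which wraps SparkActions.get().snapshotTable(...), a usage sketch assuming the documented procedure name snapshot; all identifiers are placeholders.

import org.apache.spark.sql.SparkSession;

class SnapshotTableExample {
  // Creates an Iceberg table that snapshots an existing Spark/Hive table,
  // referencing its data files in place rather than rewriting them.
  static void snapshot(SparkSession spark) {
    spark.sql(
        "CALL my_catalog.system.snapshot("
            + "source_table => 'db.source', "
            + "table => 'db.source_iceberg')");
  }
}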
*/ - package org.apache.iceberg.spark.procedures; import java.util.Locale; @@ -30,8 +29,7 @@ public class SparkProcedures { private static final Map> BUILDERS = initProcedureBuilders(); - private SparkProcedures() { - } + private SparkProcedures() {} public static ProcedureBuilder newBuilder(String name) { // procedure resolution is case insensitive to match the existing Spark behavior for functions @@ -60,6 +58,7 @@ private static Map> initProcedureBuilders() { public interface ProcedureBuilder { ProcedureBuilder withTableCatalog(TableCatalog tableCatalog); + Procedure build(); } } diff --git a/spark/v3.2/spark/src/main/java/org/apache/iceberg/spark/source/BaseBatchReader.java b/spark/v3.2/spark/src/main/java/org/apache/iceberg/spark/source/BaseBatchReader.java index 1b0b69299b8e..45bf3cfcc86a 100644 --- a/spark/v3.2/spark/src/main/java/org/apache/iceberg/spark/source/BaseBatchReader.java +++ b/spark/v3.2/spark/src/main/java/org/apache/iceberg/spark/source/BaseBatchReader.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.source; import java.util.Map; @@ -42,16 +41,24 @@ abstract class BaseBatchReader extends BaseReader { private final int batchSize; - BaseBatchReader(Table table, ScanTaskGroup taskGroup, Schema expectedSchema, boolean caseSensitive, - int batchSize) { + BaseBatchReader( + Table table, + ScanTaskGroup taskGroup, + Schema expectedSchema, + boolean caseSensitive, + int batchSize) { super(table, taskGroup, expectedSchema, caseSensitive); this.batchSize = batchSize; } - protected CloseableIterable newBatchIterable(InputFile inputFile, FileFormat format, - long start, long length, Expression residual, - Map idToConstant, - SparkDeleteFilter deleteFilter) { + protected CloseableIterable newBatchIterable( + InputFile inputFile, + FileFormat format, + long start, + long length, + Expression residual, + Map idToConstant, + SparkDeleteFilter deleteFilter) { switch (format) { case PARQUET: return newParquetIterable(inputFile, start, length, residual, idToConstant, deleteFilter); @@ -60,24 +67,37 @@ protected CloseableIterable newBatchIterable(InputFile inputFile, return newOrcIterable(inputFile, start, length, residual, idToConstant); default: - throw new UnsupportedOperationException("Format: " + format + " not supported for batched reads"); + throw new UnsupportedOperationException( + "Format: " + format + " not supported for batched reads"); } } - private CloseableIterable newParquetIterable(InputFile inputFile, long start, long length, - Expression residual, Map idToConstant, - SparkDeleteFilter deleteFilter) { - // get required schema for filtering out equality-delete rows in case equality-delete uses columns are + private CloseableIterable newParquetIterable( + InputFile inputFile, + long start, + long length, + Expression residual, + Map idToConstant, + SparkDeleteFilter deleteFilter) { + // get required schema for filtering out equality-delete rows in case equality-delete uses + // columns are // not selected. - Schema requiredSchema = deleteFilter != null && deleteFilter.hasEqDeletes() ? - deleteFilter.requiredSchema() : expectedSchema(); + Schema requiredSchema = + deleteFilter != null && deleteFilter.hasEqDeletes() + ? 
deleteFilter.requiredSchema() + : expectedSchema(); return Parquet.read(inputFile) .project(requiredSchema) .split(start, length) - .createBatchedReaderFunc(fileSchema -> VectorizedSparkParquetReaders.buildReader(requiredSchema, - fileSchema, /* setArrowValidityVector */ NullCheckingForGet.NULL_CHECKING_ENABLED, idToConstant, - deleteFilter)) + .createBatchedReaderFunc( + fileSchema -> + VectorizedSparkParquetReaders.buildReader( + requiredSchema, + fileSchema, /* setArrowValidityVector */ + NullCheckingForGet.NULL_CHECKING_ENABLED, + idToConstant, + deleteFilter)) .recordsPerBatch(batchSize) .filter(residual) .caseSensitive(caseSensitive()) @@ -89,18 +109,25 @@ private CloseableIterable newParquetIterable(InputFile inputFile, .build(); } - private CloseableIterable newOrcIterable(InputFile inputFile, long start, long length, - Expression residual, Map idToConstant) { + private CloseableIterable newOrcIterable( + InputFile inputFile, + long start, + long length, + Expression residual, + Map idToConstant) { Set constantFieldIds = idToConstant.keySet(); Set metadataFieldIds = MetadataColumns.metadataFieldIds(); - Sets.SetView constantAndMetadataFieldIds = Sets.union(constantFieldIds, metadataFieldIds); - Schema schemaWithoutConstantAndMetadataFields = TypeUtil.selectNot(expectedSchema(), constantAndMetadataFieldIds); + Sets.SetView constantAndMetadataFieldIds = + Sets.union(constantFieldIds, metadataFieldIds); + Schema schemaWithoutConstantAndMetadataFields = + TypeUtil.selectNot(expectedSchema(), constantAndMetadataFieldIds); return ORC.read(inputFile) .project(schemaWithoutConstantAndMetadataFields) .split(start, length) - .createBatchedReaderFunc(fileSchema -> VectorizedSparkOrcReaders.buildReader(expectedSchema(), fileSchema, - idToConstant)) + .createBatchedReaderFunc( + fileSchema -> + VectorizedSparkOrcReaders.buildReader(expectedSchema(), fileSchema, idToConstant)) .recordsPerBatch(batchSize) .filter(residual) .caseSensitive(caseSensitive()) diff --git a/spark/v3.2/spark/src/main/java/org/apache/iceberg/spark/source/BaseReader.java b/spark/v3.2/spark/src/main/java/org/apache/iceberg/spark/source/BaseReader.java index 0210ceb1779f..95bbaaca7cbd 100644 --- a/spark/v3.2/spark/src/main/java/org/apache/iceberg/spark/source/BaseReader.java +++ b/spark/v3.2/spark/src/main/java/org/apache/iceberg/spark/source/BaseReader.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.source; import java.io.Closeable; @@ -84,7 +83,8 @@ abstract class BaseReader implements Closeable { private T current = null; private TaskT currentTask = null; - BaseReader(Table table, ScanTaskGroup taskGroup, Schema expectedSchema, boolean caseSensitive) { + BaseReader( + Table table, ScanTaskGroup taskGroup, Schema expectedSchema, boolean caseSensitive) { this.table = table; this.taskGroup = taskGroup; this.tasks = taskGroup.tasks().iterator(); @@ -92,7 +92,8 @@ abstract class BaseReader implements Closeable { this.expectedSchema = expectedSchema; this.caseSensitive = caseSensitive; String nameMappingString = table.properties().get(TableProperties.DEFAULT_NAME_MAPPING); - this.nameMapping = nameMappingString != null ? NameMappingParser.fromJson(nameMappingString) : null; + this.nameMapping = + nameMappingString != null ? 
NameMappingParser.fromJson(nameMappingString) : null; } protected abstract CloseableIterator open(TaskT task); @@ -132,9 +133,10 @@ public boolean next() throws IOException { } } catch (IOException | RuntimeException e) { if (currentTask != null && !currentTask.isDataTask()) { - String filePaths = referencedFiles(currentTask) - .map(file -> file.path().toString()) - .collect(Collectors.joining(", ")); + String filePaths = + referencedFiles(currentTask) + .map(file -> file.path().toString()) + .collect(Collectors.joining(", ")); LOG.error("Error reading file(s): {}", filePaths, e); } throw e; @@ -164,9 +166,8 @@ protected InputFile getInputFile(String location) { private Map inputFiles() { if (lazyInputFiles == null) { - Stream encryptedFiles = taskGroup.tasks().stream() - .flatMap(this::referencedFiles) - .map(this::toEncryptedInputFile); + Stream encryptedFiles = + taskGroup.tasks().stream().flatMap(this::referencedFiles).map(this::toEncryptedInputFile); // decrypt with the batch call to avoid multiple RPCs to a key server, if possible Iterable decryptedFiles = table.encryption().decrypt(encryptedFiles::iterator); @@ -230,7 +231,8 @@ protected static Object convertConstant(Type type, Object value) { for (int index = 0; index < fields.size(); index++) { NestedField field = fields.get(index); Type fieldType = field.type(); - values[index] = convertConstant(fieldType, struct.get(index, fieldType.typeId().javaClass())); + values[index] = + convertConstant(fieldType, struct.get(index, fieldType.typeId().javaClass())); } return new GenericInternalRow(values); diff --git a/spark/v3.2/spark/src/main/java/org/apache/iceberg/spark/source/BaseRowReader.java b/spark/v3.2/spark/src/main/java/org/apache/iceberg/spark/source/BaseRowReader.java index 3aeb65d7ce44..608f0df0075d 100644 --- a/spark/v3.2/spark/src/main/java/org/apache/iceberg/spark/source/BaseRowReader.java +++ b/spark/v3.2/spark/src/main/java/org/apache/iceberg/spark/source/BaseRowReader.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.spark.source; import java.util.Map; @@ -40,13 +39,19 @@ import org.apache.spark.sql.catalyst.InternalRow; abstract class BaseRowReader extends BaseReader { - BaseRowReader(Table table, ScanTaskGroup taskGroup, Schema expectedSchema, boolean caseSensitive) { + BaseRowReader( + Table table, ScanTaskGroup taskGroup, Schema expectedSchema, boolean caseSensitive) { super(table, taskGroup, expectedSchema, caseSensitive); } - protected CloseableIterable newIterable(InputFile file, FileFormat format, long start, long length, - Expression residual, Schema projection, - Map idToConstant) { + protected CloseableIterable newIterable( + InputFile file, + FileFormat format, + long start, + long length, + Expression residual, + Schema projection, + Map idToConstant) { switch (format) { case PARQUET: return newParquetIterable(file, start, length, residual, projection, idToConstant); @@ -62,8 +67,8 @@ protected CloseableIterable newIterable(InputFile file, FileFormat } } - private CloseableIterable newAvroIterable(InputFile file, long start, long length, Schema projection, - Map idToConstant) { + private CloseableIterable newAvroIterable( + InputFile file, long start, long length, Schema projection, Map idToConstant) { return Avro.read(file) .reuseContainers() .project(projection) @@ -73,29 +78,41 @@ private CloseableIterable newAvroIterable(InputFile file, long star .build(); } - private CloseableIterable newParquetIterable(InputFile file, long start, long length, - Expression residual, Schema readSchema, - Map idToConstant) { + private CloseableIterable newParquetIterable( + InputFile file, + long start, + long length, + Expression residual, + Schema readSchema, + Map idToConstant) { return Parquet.read(file) .reuseContainers() .split(start, length) .project(readSchema) - .createReaderFunc(fileSchema -> SparkParquetReaders.buildReader(readSchema, fileSchema, idToConstant)) + .createReaderFunc( + fileSchema -> SparkParquetReaders.buildReader(readSchema, fileSchema, idToConstant)) .filter(residual) .caseSensitive(caseSensitive()) .withNameMapping(nameMapping()) .build(); } - private CloseableIterable newOrcIterable(InputFile file, long start, long length, Expression residual, - Schema readSchema, Map idToConstant) { - Schema readSchemaWithoutConstantAndMetadataFields = TypeUtil.selectNot(readSchema, - Sets.union(idToConstant.keySet(), MetadataColumns.metadataFieldIds())); + private CloseableIterable newOrcIterable( + InputFile file, + long start, + long length, + Expression residual, + Schema readSchema, + Map idToConstant) { + Schema readSchemaWithoutConstantAndMetadataFields = + TypeUtil.selectNot( + readSchema, Sets.union(idToConstant.keySet(), MetadataColumns.metadataFieldIds())); return ORC.read(file) .project(readSchemaWithoutConstantAndMetadataFields) .split(start, length) - .createReaderFunc(readOrcSchema -> new SparkOrcReader(readSchema, readOrcSchema, idToConstant)) + .createReaderFunc( + readOrcSchema -> new SparkOrcReader(readSchema, readOrcSchema, idToConstant)) .filter(residual) .caseSensitive(caseSensitive()) .withNameMapping(nameMapping()) diff --git a/spark/v3.2/spark/src/main/java/org/apache/iceberg/spark/source/BatchDataReader.java b/spark/v3.2/spark/src/main/java/org/apache/iceberg/spark/source/BatchDataReader.java index f0b8dda2345f..13755f0abc79 100644 --- a/spark/v3.2/spark/src/main/java/org/apache/iceberg/spark/source/BatchDataReader.java +++ b/spark/v3.2/spark/src/main/java/org/apache/iceberg/spark/source/BatchDataReader.java @@ -16,7 +16,6 @@ * 
specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.source; import java.util.Map; @@ -33,8 +32,12 @@ import org.apache.spark.sql.vectorized.ColumnarBatch; class BatchDataReader extends BaseBatchReader { - BatchDataReader(ScanTaskGroup task, Table table, Schema expectedSchema, boolean caseSensitive, - int size) { + BatchDataReader( + ScanTaskGroup task, + Table table, + Schema expectedSchema, + boolean caseSensitive, + int size) { super(table, task, expectedSchema, caseSensitive, size); } @@ -55,9 +58,17 @@ protected CloseableIterator open(FileScanTask task) { InputFile inputFile = getInputFile(filePath); Preconditions.checkNotNull(inputFile, "Could not find InputFile associated with FileScanTask"); - SparkDeleteFilter deleteFilter = task.deletes().isEmpty() ? null : new SparkDeleteFilter(filePath, task.deletes()); + SparkDeleteFilter deleteFilter = + task.deletes().isEmpty() ? null : new SparkDeleteFilter(filePath, task.deletes()); - return newBatchIterable(inputFile, task.file().format(), task.start(), task.length(), task.residual(), - idToConstant, deleteFilter).iterator(); + return newBatchIterable( + inputFile, + task.file().format(), + task.start(), + task.length(), + task.residual(), + idToConstant, + deleteFilter) + .iterator(); } } diff --git a/spark/v3.2/spark/src/main/java/org/apache/iceberg/spark/source/EqualityDeleteRowReader.java b/spark/v3.2/spark/src/main/java/org/apache/iceberg/spark/source/EqualityDeleteRowReader.java index 9441e8c4a205..5d61747e3dec 100644 --- a/spark/v3.2/spark/src/main/java/org/apache/iceberg/spark/source/EqualityDeleteRowReader.java +++ b/spark/v3.2/spark/src/main/java/org/apache/iceberg/spark/source/EqualityDeleteRowReader.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.source; import java.util.Map; @@ -30,13 +29,15 @@ import org.apache.spark.sql.catalyst.InternalRow; public class EqualityDeleteRowReader extends RowDataReader { - public EqualityDeleteRowReader(CombinedScanTask task, Table table, Schema expectedSchema, boolean caseSensitive) { + public EqualityDeleteRowReader( + CombinedScanTask task, Table table, Schema expectedSchema, boolean caseSensitive) { super(task, table, expectedSchema, caseSensitive); } @Override protected CloseableIterator open(FileScanTask task) { - SparkDeleteFilter matches = new SparkDeleteFilter(task.file().path().toString(), task.deletes()); + SparkDeleteFilter matches = + new SparkDeleteFilter(task.file().path().toString(), task.deletes()); // schema or rows returned by readers Schema requiredSchema = matches.requiredSchema(); diff --git a/spark/v3.2/spark/src/main/java/org/apache/iceberg/spark/source/HasIcebergCatalog.java b/spark/v3.2/spark/src/main/java/org/apache/iceberg/spark/source/HasIcebergCatalog.java index e2579a0059b0..37e0c4dfcdb6 100644 --- a/spark/v3.2/spark/src/main/java/org/apache/iceberg/spark/source/HasIcebergCatalog.java +++ b/spark/v3.2/spark/src/main/java/org/apache/iceberg/spark/source/HasIcebergCatalog.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.spark.source; import org.apache.iceberg.catalog.Catalog; diff --git a/spark/v3.2/spark/src/main/java/org/apache/iceberg/spark/source/IcebergSource.java b/spark/v3.2/spark/src/main/java/org/apache/iceberg/spark/source/IcebergSource.java index d70d902c1f64..22d6f3e2a95c 100644 --- a/spark/v3.2/spark/src/main/java/org/apache/iceberg/spark/source/IcebergSource.java +++ b/spark/v3.2/spark/src/main/java/org/apache/iceberg/spark/source/IcebergSource.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.source; import java.util.Arrays; @@ -46,22 +45,23 @@ /** * The IcebergSource loads/writes tables with format "iceberg". It can load paths and tables. * - * How paths/tables are loaded when using spark.read().format("iceberg").path(table) + *
<p>
    How paths/tables are loaded when using spark.read().format("iceberg").path(table) * - * table = "file:/path/to/table" -> loads a HadoopTable at given path - * table = "tablename" -> loads currentCatalog.currentNamespace.tablename - * table = "catalog.tablename" -> load "tablename" from the specified catalog. - * table = "namespace.tablename" -> load "namespace.tablename" from current catalog - * table = "catalog.namespace.tablename" -> "namespace.tablename" from the specified catalog. - * table = "namespace1.namespace2.tablename" -> load "namespace1.namespace2.tablename" from current catalog + *
<p>
    table = "file:/path/to/table" -> loads a HadoopTable at given path table = "tablename" + * -> loads currentCatalog.currentNamespace.tablename table = "catalog.tablename" -> load + * "tablename" from the specified catalog. table = "namespace.tablename" -> load + * "namespace.tablename" from current catalog table = "catalog.namespace.tablename" -> + * "namespace.tablename" from the specified catalog. table = "namespace1.namespace2.tablename" -> + * load "namespace1.namespace2.tablename" from current catalog * - * The above list is in order of priority. For example: a matching catalog will take priority over any namespace - * resolution. + *
<p>
    The above list is in order of priority. For example: a matching catalog will take priority + * over any namespace resolution. */ public class IcebergSource implements DataSourceRegister, SupportsCatalogOptions { private static final String DEFAULT_CATALOG_NAME = "default_iceberg"; private static final String DEFAULT_CACHE_CATALOG_NAME = "default_cache_iceberg"; - private static final String DEFAULT_CACHE_CATALOG = "spark.sql.catalog." + DEFAULT_CACHE_CATALOG_NAME; + private static final String DEFAULT_CACHE_CATALOG = + "spark.sql.catalog." + DEFAULT_CACHE_CATALOG_NAME; private static final String DEFAULT_CATALOG = "spark.sql.catalog." + DEFAULT_CATALOG_NAME; private static final String AT_TIMESTAMP = "at_timestamp_"; private static final String SNAPSHOT_ID = "snapshot_id_"; @@ -91,7 +91,8 @@ public boolean supportsExternalMetadata() { @Override public Table getTable(StructType schema, Transform[] partitioning, Map options) { - Spark3Util.CatalogAndIdentifier catalogIdentifier = catalogAndIdentifier(new CaseInsensitiveStringMap(options)); + Spark3Util.CatalogAndIdentifier catalogIdentifier = + catalogAndIdentifier(new CaseInsensitiveStringMap(options)); CatalogPlugin catalog = catalogIdentifier.catalog(); Identifier ident = catalogIdentifier.identifier(); @@ -100,12 +101,16 @@ public Table getTable(StructType schema, Transform[] partitioning, Map config = ImmutableMap.of( - "type", "hive", - "default-namespace", "default", - "cache-enabled", "false" // the source should not use a cache - ); + ImmutableMap config = + ImmutableMap.of( + "type", "hive", + "default-namespace", "default", + "cache-enabled", "false" // the source should not use a cache + ); spark.conf().set(DEFAULT_CATALOG, SparkCatalog.class.getName()); config.forEach((key, value) -> spark.conf().set(DEFAULT_CATALOG + "." + key, value)); } diff --git a/spark/v3.2/spark/src/main/java/org/apache/iceberg/spark/source/InternalRowWrapper.java b/spark/v3.2/spark/src/main/java/org/apache/iceberg/spark/source/InternalRowWrapper.java index ef1eb08d873c..524266f6f83a 100644 --- a/spark/v3.2/spark/src/main/java/org/apache/iceberg/spark/source/InternalRowWrapper.java +++ b/spark/v3.2/spark/src/main/java/org/apache/iceberg/spark/source/InternalRowWrapper.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
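The resolution rules spelled out in the IcebergSource Javadoc above boil down to: an explicit file path loads a HadoopTable directly, and a matching catalog name takes priority over namespace resolution in the current catalog. A minimal sketch with placeholder names:

import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SparkSession;

public class IcebergSourceLoadExample {
  public static void main(String[] args) {
    SparkSession spark = SparkSession.builder().appName("iceberg-source").getOrCreate();

    // Loads a HadoopTable at the given location.
    Dataset<Row> byPath = spark.read().format("iceberg").load("file:/path/to/table");

    // "namespace.tablename" resolved against the current catalog ("db.events" is a placeholder).
    Dataset<Row> byName = spark.read().format("iceberg").load("db.events");

    byPath.printSchema();
    byName.printSchema();
  }
}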
*/ - package org.apache.iceberg.spark.source; import java.nio.ByteBuffer; @@ -32,8 +31,8 @@ import org.apache.spark.sql.types.StructType; /** - * Class to adapt a Spark {@code InternalRow} to Iceberg {@link StructLike} for uses like - * {@link org.apache.iceberg.PartitionKey#partition(StructLike)} + * Class to adapt a Spark {@code InternalRow} to Iceberg {@link StructLike} for uses like {@link + * org.apache.iceberg.PartitionKey#partition(StructLike)} */ class InternalRowWrapper implements StructLike { private final DataType[] types; @@ -42,12 +41,8 @@ class InternalRowWrapper implements StructLike { @SuppressWarnings("unchecked") InternalRowWrapper(StructType rowType) { - this.types = Stream.of(rowType.fields()) - .map(StructField::dataType) - .toArray(DataType[]::new); - this.getters = Stream.of(types) - .map(InternalRowWrapper::getter) - .toArray(BiFunction[]::new); + this.types = Stream.of(rowType.fields()).map(StructField::dataType).toArray(DataType[]::new); + this.getters = Stream.of(types).map(InternalRowWrapper::getter).toArray(BiFunction[]::new); } InternalRowWrapper wrap(InternalRow internalRow) { diff --git a/spark/v3.2/spark/src/main/java/org/apache/iceberg/spark/source/RowDataReader.java b/spark/v3.2/spark/src/main/java/org/apache/iceberg/spark/source/RowDataReader.java index 85903461970e..3778049cc71a 100644 --- a/spark/v3.2/spark/src/main/java/org/apache/iceberg/spark/source/RowDataReader.java +++ b/spark/v3.2/spark/src/main/java/org/apache/iceberg/spark/source/RowDataReader.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.source; import java.util.Map; @@ -35,7 +34,8 @@ import org.apache.spark.sql.catalyst.InternalRow; class RowDataReader extends BaseRowReader { - RowDataReader(ScanTaskGroup task, Table table, Schema expectedSchema, boolean caseSensitive) { + RowDataReader( + ScanTaskGroup task, Table table, Schema expectedSchema, boolean caseSensitive) { super(table, task, expectedSchema, caseSensitive); } @@ -59,13 +59,21 @@ protected CloseableIterator open(FileScanTask task) { return deleteFilter.filter(open(task, requiredSchema, idToConstant)).iterator(); } - protected CloseableIterable open(FileScanTask task, Schema readSchema, Map idToConstant) { + protected CloseableIterable open( + FileScanTask task, Schema readSchema, Map idToConstant) { if (task.isDataTask()) { return newDataIterable(task.asDataTask(), readSchema); } else { InputFile inputFile = getInputFile(task.file().path().toString()); - Preconditions.checkNotNull(inputFile, "Could not find InputFile associated with FileScanTask"); - return newIterable(inputFile, task.file().format(), task.start(), task.length(), task.residual(), readSchema, + Preconditions.checkNotNull( + inputFile, "Could not find InputFile associated with FileScanTask"); + return newIterable( + inputFile, + task.file().format(), + task.start(), + task.length(), + task.residual(), + readSchema, idToConstant); } } diff --git a/spark/v3.2/spark/src/main/java/org/apache/iceberg/spark/source/RowDataRewriter.java b/spark/v3.2/spark/src/main/java/org/apache/iceberg/spark/source/RowDataRewriter.java index 63cc3a466c1a..aee0d4f0586b 100644 --- a/spark/v3.2/spark/src/main/java/org/apache/iceberg/spark/source/RowDataRewriter.java +++ b/spark/v3.2/spark/src/main/java/org/apache/iceberg/spark/source/RowDataRewriter.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.spark.source; import java.io.Serializable; @@ -55,23 +54,25 @@ public class RowDataRewriter implements Serializable { private final FileFormat format; private final boolean caseSensitive; - public RowDataRewriter(Broadcast
<Table>
    tableBroadcast, PartitionSpec spec, boolean caseSensitive) { + public RowDataRewriter( + Broadcast
    tableBroadcast, PartitionSpec spec, boolean caseSensitive) { this.tableBroadcast = tableBroadcast; this.spec = spec; this.caseSensitive = caseSensitive; Table table = tableBroadcast.value(); - String formatString = table.properties().getOrDefault( - TableProperties.DEFAULT_FILE_FORMAT, TableProperties.DEFAULT_FILE_FORMAT_DEFAULT); + String formatString = + table + .properties() + .getOrDefault( + TableProperties.DEFAULT_FILE_FORMAT, TableProperties.DEFAULT_FILE_FORMAT_DEFAULT); this.format = FileFormat.valueOf(formatString.toUpperCase(Locale.ENGLISH)); } public List rewriteDataForTasks(JavaRDD taskRDD) { JavaRDD> dataFilesRDD = taskRDD.map(this::rewriteDataForTask); - return dataFilesRDD.collect().stream() - .flatMap(Collection::stream) - .collect(Collectors.toList()); + return dataFilesRDD.collect().stream().flatMap(Collection::stream).collect(Collectors.toList()); } private List rewriteDataForTask(CombinedScanTask task) throws Exception { @@ -86,28 +87,44 @@ private List rewriteDataForTask(CombinedScanTask task) throws Exceptio RowDataReader dataReader = new RowDataReader(task, table, schema, caseSensitive); StructType structType = SparkSchemaUtil.convert(schema); - SparkAppenderFactory appenderFactory = SparkAppenderFactory.builderFor(table, schema, structType) - .spec(spec) - .build(); - OutputFileFactory fileFactory = OutputFileFactory.builderFor(table, partitionId, taskId) - .defaultSpec(spec) - .format(format) - .build(); + SparkAppenderFactory appenderFactory = + SparkAppenderFactory.builderFor(table, schema, structType).spec(spec).build(); + OutputFileFactory fileFactory = + OutputFileFactory.builderFor(table, partitionId, taskId) + .defaultSpec(spec) + .format(format) + .build(); TaskWriter writer; if (spec.isUnpartitioned()) { - writer = new UnpartitionedWriter<>(spec, format, appenderFactory, fileFactory, table.io(), - Long.MAX_VALUE); - } else if (PropertyUtil.propertyAsBoolean(properties, + writer = + new UnpartitionedWriter<>( + spec, format, appenderFactory, fileFactory, table.io(), Long.MAX_VALUE); + } else if (PropertyUtil.propertyAsBoolean( + properties, TableProperties.SPARK_WRITE_PARTITIONED_FANOUT_ENABLED, TableProperties.SPARK_WRITE_PARTITIONED_FANOUT_ENABLED_DEFAULT)) { - writer = new SparkPartitionedFanoutWriter( - spec, format, appenderFactory, fileFactory, table.io(), Long.MAX_VALUE, schema, - structType); + writer = + new SparkPartitionedFanoutWriter( + spec, + format, + appenderFactory, + fileFactory, + table.io(), + Long.MAX_VALUE, + schema, + structType); } else { - writer = new SparkPartitionedWriter( - spec, format, appenderFactory, fileFactory, table.io(), Long.MAX_VALUE, schema, - structType); + writer = + new SparkPartitionedWriter( + spec, + format, + appenderFactory, + fileFactory, + table.io(), + Long.MAX_VALUE, + schema, + structType); } try { @@ -127,14 +144,24 @@ private List rewriteDataForTask(CombinedScanTask task) throws Exceptio LOG.error("Aborting task", originalThrowable); context.markTaskFailed(originalThrowable); - LOG.error("Aborting commit for partition {} (task {}, attempt {}, stage {}.{})", - partitionId, taskId, context.attemptNumber(), context.stageId(), context.stageAttemptNumber()); + LOG.error( + "Aborting commit for partition {} (task {}, attempt {}, stage {}.{})", + partitionId, + taskId, + context.attemptNumber(), + context.stageId(), + context.stageAttemptNumber()); if (dataReader != null) { dataReader.close(); } writer.abort(); - LOG.error("Aborted commit for partition {} (task {}, attempt {}, stage {}.{})", - partitionId, 
taskId, context.taskAttemptId(), context.stageId(), context.stageAttemptNumber()); + LOG.error( + "Aborted commit for partition {} (task {}, attempt {}, stage {}.{})", + partitionId, + taskId, + context.taskAttemptId(), + context.stageId(), + context.stageAttemptNumber()); } catch (Throwable inner) { if (originalThrowable != inner) { diff --git a/spark/v3.2/spark/src/main/java/org/apache/iceberg/spark/source/SerializableTableWithSize.java b/spark/v3.2/spark/src/main/java/org/apache/iceberg/spark/source/SerializableTableWithSize.java index 6275c664410f..e3b81cea7cd1 100644 --- a/spark/v3.2/spark/src/main/java/org/apache/iceberg/spark/source/SerializableTableWithSize.java +++ b/spark/v3.2/spark/src/main/java/org/apache/iceberg/spark/source/SerializableTableWithSize.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.source; import org.apache.iceberg.BaseMetadataTable; @@ -25,9 +24,9 @@ import org.apache.spark.util.KnownSizeEstimation; /** - * This class provides a serializable table with a known size estimate. Spark calls - * its SizeEstimator class when broadcasting variables and this can be an expensive - * operation, so providing a known size estimate allows that operation to be skipped. + * This class provides a serializable table with a known size estimate. Spark calls its + * SizeEstimator class when broadcasting variables and this can be an expensive operation, so + * providing a known size estimate allows that operation to be skipped. */ public class SerializableTableWithSize extends SerializableTable implements KnownSizeEstimation { diff --git a/spark/v3.2/spark/src/main/java/org/apache/iceberg/spark/source/SparkAppenderFactory.java b/spark/v3.2/spark/src/main/java/org/apache/iceberg/spark/source/SparkAppenderFactory.java index 4becf666ed3e..6372edde0782 100644 --- a/spark/v3.2/spark/src/main/java/org/apache/iceberg/spark/source/SparkAppenderFactory.java +++ b/spark/v3.2/spark/src/main/java/org/apache/iceberg/spark/source/SparkAppenderFactory.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
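The RowDataRewriter changes above keep the existing writer selection: SparkPartitionedFanoutWriter is used instead of SparkPartitionedWriter only when the fanout table property is enabled. A minimal sketch of turning that property on through the table API, reusing the TableProperties constant referenced in the diff (how the Table instance is obtained is outside the sketch):

import org.apache.iceberg.Table;
import org.apache.iceberg.TableProperties;

public class EnableFanoutWrites {
  // Enables fanout partitioned writes for subsequent rewrites of the given table.
  public static void enable(Table table) {
    table
        .updateProperties()
        .set(TableProperties.SPARK_WRITE_PARTITIONED_FANOUT_ENABLED, "true")
        .commit();
  }
}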
*/ - package org.apache.iceberg.spark.source; import java.io.IOException; @@ -61,8 +60,14 @@ class SparkAppenderFactory implements FileAppenderFactory { private StructType eqDeleteSparkType = null; private StructType posDeleteSparkType = null; - SparkAppenderFactory(Map properties, Schema writeSchema, StructType dsSchema, PartitionSpec spec, - int[] equalityFieldIds, Schema eqDeleteRowSchema, Schema posDeleteRowSchema) { + SparkAppenderFactory( + Map properties, + Schema writeSchema, + StructType dsSchema, + PartitionSpec spec, + int[] equalityFieldIds, + Schema eqDeleteRowSchema, + Schema posDeleteRowSchema) { this.properties = properties; this.writeSchema = writeSchema; this.dsSchema = dsSchema; @@ -85,7 +90,6 @@ static class Builder { private Schema eqDeleteRowSchema; private Schema posDeleteRowSchema; - Builder(Table table, Schema writeSchema, StructType dsSchema) { this.table = table; this.spec = table.spec(); @@ -118,16 +122,24 @@ SparkAppenderFactory build() { Preconditions.checkNotNull(writeSchema, "Write Schema must not be null"); Preconditions.checkNotNull(dsSchema, "DS Schema must not be null"); if (equalityFieldIds != null) { - Preconditions.checkNotNull(eqDeleteRowSchema, "Equality Field Ids and Equality Delete Row Schema" + - " must be set together"); + Preconditions.checkNotNull( + eqDeleteRowSchema, + "Equality Field Ids and Equality Delete Row Schema" + " must be set together"); } if (eqDeleteRowSchema != null) { - Preconditions.checkNotNull(equalityFieldIds, "Equality Field Ids and Equality Delete Row Schema" + - " must be set together"); + Preconditions.checkNotNull( + equalityFieldIds, + "Equality Field Ids and Equality Delete Row Schema" + " must be set together"); } - return new SparkAppenderFactory(table.properties(), writeSchema, dsSchema, spec, equalityFieldIds, - eqDeleteRowSchema, posDeleteRowSchema); + return new SparkAppenderFactory( + table.properties(), + writeSchema, + dsSchema, + spec, + equalityFieldIds, + eqDeleteRowSchema, + posDeleteRowSchema); } } @@ -141,7 +153,8 @@ private StructType lazyEqDeleteSparkType() { private StructType lazyPosDeleteSparkType() { if (posDeleteSparkType == null) { - Preconditions.checkNotNull(posDeleteRowSchema, "Position delete row schema shouldn't be null"); + Preconditions.checkNotNull( + posDeleteRowSchema, "Position delete row schema shouldn't be null"); this.posDeleteSparkType = SparkSchemaUtil.convert(posDeleteRowSchema); } return posDeleteSparkType; @@ -187,24 +200,33 @@ public FileAppender newAppender(OutputFile file, FileFormat fileFor } @Override - public DataWriter newDataWriter(EncryptedOutputFile file, FileFormat format, StructLike partition) { - return new DataWriter<>(newAppender(file.encryptingOutputFile(), format), format, - file.encryptingOutputFile().location(), spec, partition, file.keyMetadata()); + public DataWriter newDataWriter( + EncryptedOutputFile file, FileFormat format, StructLike partition) { + return new DataWriter<>( + newAppender(file.encryptingOutputFile(), format), + format, + file.encryptingOutputFile().location(), + spec, + partition, + file.keyMetadata()); } @Override - public EqualityDeleteWriter newEqDeleteWriter(EncryptedOutputFile file, FileFormat format, - StructLike partition) { - Preconditions.checkState(equalityFieldIds != null && equalityFieldIds.length > 0, + public EqualityDeleteWriter newEqDeleteWriter( + EncryptedOutputFile file, FileFormat format, StructLike partition) { + Preconditions.checkState( + equalityFieldIds != null && equalityFieldIds.length > 0, "Equality field 
ids shouldn't be null or empty when creating equality-delete writer"); - Preconditions.checkNotNull(eqDeleteRowSchema, + Preconditions.checkNotNull( + eqDeleteRowSchema, "Equality delete row schema shouldn't be null when creating equality-delete writer"); try { switch (format) { case PARQUET: return Parquet.writeDeletes(file.encryptingOutputFile()) - .createWriterFunc(msgType -> SparkParquetWriters.buildWriter(lazyEqDeleteSparkType(), msgType)) + .createWriterFunc( + msgType -> SparkParquetWriters.buildWriter(lazyEqDeleteSparkType(), msgType)) .overwrite() .rowSchema(eqDeleteRowSchema) .withSpec(spec) @@ -245,15 +267,16 @@ public EqualityDeleteWriter newEqDeleteWriter(EncryptedOutputFile f } @Override - public PositionDeleteWriter newPosDeleteWriter(EncryptedOutputFile file, FileFormat format, - StructLike partition) { + public PositionDeleteWriter newPosDeleteWriter( + EncryptedOutputFile file, FileFormat format, StructLike partition) { try { switch (format) { case PARQUET: StructType sparkPosDeleteSchema = SparkSchemaUtil.convert(DeleteSchemaUtil.posDeleteSchema(posDeleteRowSchema)); return Parquet.writeDeletes(file.encryptingOutputFile()) - .createWriterFunc(msgType -> SparkParquetWriters.buildWriter(sparkPosDeleteSchema, msgType)) + .createWriterFunc( + msgType -> SparkParquetWriters.buildWriter(sparkPosDeleteSchema, msgType)) .overwrite() .rowSchema(posDeleteRowSchema) .withSpec(spec) diff --git a/spark/v3.2/spark/src/main/java/org/apache/iceberg/spark/source/SparkBatch.java b/spark/v3.2/spark/src/main/java/org/apache/iceberg/spark/source/SparkBatch.java index 79d91205a236..6d8504794310 100644 --- a/spark/v3.2/spark/src/main/java/org/apache/iceberg/spark/source/SparkBatch.java +++ b/spark/v3.2/spark/src/main/java/org/apache/iceberg/spark/source/SparkBatch.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.source; import java.util.List; @@ -59,7 +58,8 @@ abstract class SparkBatch implements Batch { @Override public InputPartition[] planInputPartitions() { // broadcast the table metadata as input partitions will be sent to executors - Broadcast
    tableBroadcast = sparkContext.broadcast(SerializableTableWithSize.copyOf(table)); + Broadcast
    tableBroadcast = + sparkContext.broadcast(SerializableTableWithSize.copyOf(table)); String expectedSchemaString = SchemaParser.toJson(expectedSchema); InputPartition[] readTasks = new InputPartition[tasks().size()]; @@ -67,9 +67,15 @@ public InputPartition[] planInputPartitions() { Tasks.range(readTasks.length) .stopOnFailure() .executeWith(localityEnabled ? ThreadPools.getWorkerPool() : null) - .run(index -> readTasks[index] = new ReadTask( - tasks().get(index), tableBroadcast, expectedSchemaString, - caseSensitive, localityEnabled)); + .run( + index -> + readTasks[index] = + new ReadTask( + tasks().get(index), + tableBroadcast, + expectedSchemaString, + caseSensitive, + localityEnabled)); return readTasks; } @@ -96,25 +102,32 @@ private int batchSize() { } private boolean parquetOnly() { - return tasks().stream().allMatch(task -> !task.isDataTask() && onlyFileFormat(task, FileFormat.PARQUET)); + return tasks().stream() + .allMatch(task -> !task.isDataTask() && onlyFileFormat(task, FileFormat.PARQUET)); } private boolean parquetBatchReadsEnabled() { - return readConf.parquetVectorizationEnabled() && // vectorization enabled - expectedSchema.columns().size() > 0 && // at least one column is projected - expectedSchema.columns().stream().allMatch(c -> c.type().isPrimitiveType()); // only primitives + return readConf.parquetVectorizationEnabled() + && // vectorization enabled + expectedSchema.columns().size() > 0 + && // at least one column is projected + expectedSchema.columns().stream() + .allMatch(c -> c.type().isPrimitiveType()); // only primitives } private boolean orcOnly() { - return tasks().stream().allMatch(task -> !task.isDataTask() && onlyFileFormat(task, FileFormat.ORC)); + return tasks().stream() + .allMatch(task -> !task.isDataTask() && onlyFileFormat(task, FileFormat.ORC)); } private boolean orcBatchReadsEnabled() { - return readConf.orcVectorizationEnabled() && // vectorization enabled + return readConf.orcVectorizationEnabled() + && // vectorization enabled tasks().stream().noneMatch(TableScanUtil::hasDeletes); // no delete files } private boolean onlyFileFormat(CombinedScanTask task, FileFormat fileFormat) { - return task.files().stream().allMatch(fileScanTask -> fileScanTask.file().format().equals(fileFormat)); + return task.files().stream() + .allMatch(fileScanTask -> fileScanTask.file().format().equals(fileFormat)); } } diff --git a/spark/v3.2/spark/src/main/java/org/apache/iceberg/spark/source/SparkBatchQueryScan.java b/spark/v3.2/spark/src/main/java/org/apache/iceberg/spark/source/SparkBatchQueryScan.java index 651a411ebd7b..59dd8759968f 100644 --- a/spark/v3.2/spark/src/main/java/org/apache/iceberg/spark/source/SparkBatchQueryScan.java +++ b/spark/v3.2/spark/src/main/java/org/apache/iceberg/spark/source/SparkBatchQueryScan.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
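As the parquetBatchReadsEnabled() check above shows, vectorized Parquet reads require vectorization to be enabled and a non-empty, primitive-only projection. A minimal sketch of toggling it for a single read, assuming the standard Iceberg Spark read option name "vectorization-enabled" and placeholder table/column names:

import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SparkSession;

public class VectorizedReadExample {
  public static void main(String[] args) {
    SparkSession spark = SparkSession.builder().appName("vectorized-read").getOrCreate();

    Dataset<Row> df =
        spark
            .read()
            .format("iceberg")
            // Option name is assumed; table and columns are placeholders.
            .option("vectorization-enabled", "true")
            .load("db.events")
            // Only primitive columns qualify; nested types fall back to row-based reads.
            .select("id", "ts");

    df.show();
  }
}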
*/ - package org.apache.iceberg.spark.source; import java.io.IOException; @@ -74,8 +73,13 @@ class SparkBatchQueryScan extends SparkScan implements SupportsRuntimeFiltering private List files = null; // lazy cache of files private List tasks = null; // lazy cache of tasks - SparkBatchQueryScan(SparkSession spark, Table table, TableScan scan, SparkReadConf readConf, - Schema expectedSchema, List filters) { + SparkBatchQueryScan( + SparkSession spark, + Table table, + TableScan scan, + SparkReadConf readConf, + Schema expectedSchema, + List filters) { super(spark, table, readConf, expectedSchema, filters); @@ -124,12 +128,12 @@ private List files() { @Override protected List tasks() { if (tasks == null) { - CloseableIterable splitFiles = TableScanUtil.splitFiles( - CloseableIterable.withNoopClose(files()), - scan.targetSplitSize()); - CloseableIterable scanTasks = TableScanUtil.planTasks( - splitFiles, scan.targetSplitSize(), - scan.splitLookback(), scan.splitOpenFileCost()); + CloseableIterable splitFiles = + TableScanUtil.splitFiles( + CloseableIterable.withNoopClose(files()), scan.targetSplitSize()); + CloseableIterable scanTasks = + TableScanUtil.planTasks( + splitFiles, scan.targetSplitSize(), scan.splitLookback(), scan.splitOpenFileCost()); tasks = Lists.newArrayList(scanTasks); } @@ -168,21 +172,29 @@ public void filter(Filter[] filters) { for (Integer specId : specIds()) { PartitionSpec spec = table().specs().get(specId); - Expression inclusiveExpr = Projections.inclusive(spec, caseSensitive()).project(runtimeFilterExpr); + Expression inclusiveExpr = + Projections.inclusive(spec, caseSensitive()).project(runtimeFilterExpr); Evaluator inclusive = new Evaluator(spec.partitionType(), inclusiveExpr); evaluatorsBySpecId.put(specId, inclusive); } - LOG.info("Trying to filter {} files using runtime filter {}", files().size(), runtimeFilterExpr); + LOG.info( + "Trying to filter {} files using runtime filter {}", files().size(), runtimeFilterExpr); - List filteredFiles = files().stream() - .filter(file -> { - Evaluator evaluator = evaluatorsBySpecId.get(file.spec().specId()); - return evaluator.eval(file.file().partition()); - }) - .collect(Collectors.toList()); + List filteredFiles = + files().stream() + .filter( + file -> { + Evaluator evaluator = evaluatorsBySpecId.get(file.spec().specId()); + return evaluator.eval(file.file().partition()); + }) + .collect(Collectors.toList()); - LOG.info("{}/{} files matched runtime filter {}", filteredFiles.size(), files().size(), runtimeFilterExpr); + LOG.info( + "{}/{} files matched runtime filter {}", + filteredFiles.size(), + files().size(), + runtimeFilterExpr); // don't invalidate tasks if the runtime filter had no effect to avoid planning splits again if (filteredFiles.size() < files().size()) { @@ -249,27 +261,38 @@ public boolean equals(Object o) { } SparkBatchQueryScan that = (SparkBatchQueryScan) o; - return table().name().equals(that.table().name()) && - readSchema().equals(that.readSchema()) && // compare Spark schemas to ignore field ids - filterExpressions().toString().equals(that.filterExpressions().toString()) && - runtimeFilterExpressions.toString().equals(that.runtimeFilterExpressions.toString()) && - Objects.equals(snapshotId, that.snapshotId) && - Objects.equals(startSnapshotId, that.startSnapshotId) && - Objects.equals(endSnapshotId, that.endSnapshotId) && - Objects.equals(asOfTimestamp, that.asOfTimestamp); + return table().name().equals(that.table().name()) + && readSchema().equals(that.readSchema()) + && // compare Spark schemas 
to ignore field ids + filterExpressions().toString().equals(that.filterExpressions().toString()) + && runtimeFilterExpressions.toString().equals(that.runtimeFilterExpressions.toString()) + && Objects.equals(snapshotId, that.snapshotId) + && Objects.equals(startSnapshotId, that.startSnapshotId) + && Objects.equals(endSnapshotId, that.endSnapshotId) + && Objects.equals(asOfTimestamp, that.asOfTimestamp); } @Override public int hashCode() { return Objects.hash( - table().name(), readSchema(), filterExpressions().toString(), runtimeFilterExpressions.toString(), - snapshotId, startSnapshotId, endSnapshotId, asOfTimestamp); + table().name(), + readSchema(), + filterExpressions().toString(), + runtimeFilterExpressions.toString(), + snapshotId, + startSnapshotId, + endSnapshotId, + asOfTimestamp); } @Override public String toString() { return String.format( "IcebergScan(table=%s, type=%s, filters=%s, runtimeFilters=%s, caseSensitive=%s)", - table(), expectedSchema().asStruct(), filterExpressions(), runtimeFilterExpressions, caseSensitive()); + table(), + expectedSchema().asStruct(), + filterExpressions(), + runtimeFilterExpressions, + caseSensitive()); } } diff --git a/spark/v3.2/spark/src/main/java/org/apache/iceberg/spark/source/SparkCopyOnWriteOperation.java b/spark/v3.2/spark/src/main/java/org/apache/iceberg/spark/source/SparkCopyOnWriteOperation.java index 8184818ab39c..62c9557ce9f1 100644 --- a/spark/v3.2/spark/src/main/java/org/apache/iceberg/spark/source/SparkCopyOnWriteOperation.java +++ b/spark/v3.2/spark/src/main/java/org/apache/iceberg/spark/source/SparkCopyOnWriteOperation.java @@ -16,9 +16,11 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.source; +import static org.apache.spark.sql.connector.iceberg.write.RowLevelOperation.Command.DELETE; +import static org.apache.spark.sql.connector.iceberg.write.RowLevelOperation.Command.UPDATE; + import org.apache.iceberg.IsolationLevel; import org.apache.iceberg.MetadataColumns; import org.apache.iceberg.Table; @@ -34,9 +36,6 @@ import org.apache.spark.sql.connector.write.WriteBuilder; import org.apache.spark.sql.util.CaseInsensitiveStringMap; -import static org.apache.spark.sql.connector.iceberg.write.RowLevelOperation.Command.DELETE; -import static org.apache.spark.sql.connector.iceberg.write.RowLevelOperation.Command.UPDATE; - class SparkCopyOnWriteOperation implements RowLevelOperation { private final SparkSession spark; @@ -49,8 +48,8 @@ class SparkCopyOnWriteOperation implements RowLevelOperation { private Scan configuredScan; private WriteBuilder lazyWriteBuilder; - SparkCopyOnWriteOperation(SparkSession spark, Table table, RowLevelOperationInfo info, - IsolationLevel isolationLevel) { + SparkCopyOnWriteOperation( + SparkSession spark, Table table, RowLevelOperationInfo info, IsolationLevel isolationLevel) { this.spark = spark; this.table = table; this.command = info.command(); @@ -65,14 +64,15 @@ public Command command() { @Override public ScanBuilder newScanBuilder(CaseInsensitiveStringMap options) { if (lazyScanBuilder == null) { - lazyScanBuilder = new SparkScanBuilder(spark, table, options) { - @Override - public Scan build() { - Scan scan = super.buildCopyOnWriteScan(); - SparkCopyOnWriteOperation.this.configuredScan = scan; - return scan; - } - }; + lazyScanBuilder = + new SparkScanBuilder(spark, table, options) { + @Override + public Scan build() { + Scan scan = super.buildCopyOnWriteScan(); + SparkCopyOnWriteOperation.this.configuredScan = scan; + return 
scan; + } + }; } return lazyScanBuilder; @@ -95,9 +95,9 @@ public NamedReference[] requiredMetadataAttributes() { NamedReference pos = Expressions.column(MetadataColumns.ROW_POSITION.name()); if (command == DELETE || command == UPDATE) { - return new NamedReference[]{file, pos}; + return new NamedReference[] {file, pos}; } else { - return new NamedReference[]{file}; + return new NamedReference[] {file}; } } } diff --git a/spark/v3.2/spark/src/main/java/org/apache/iceberg/spark/source/SparkCopyOnWriteScan.java b/spark/v3.2/spark/src/main/java/org/apache/iceberg/spark/source/SparkCopyOnWriteScan.java index 49d90f5f2b13..4efd5180e27b 100644 --- a/spark/v3.2/spark/src/main/java/org/apache/iceberg/spark/source/SparkCopyOnWriteScan.java +++ b/spark/v3.2/spark/src/main/java/org/apache/iceberg/spark/source/SparkCopyOnWriteScan.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.source; import java.io.IOException; @@ -57,13 +56,23 @@ class SparkCopyOnWriteScan extends SparkScan implements SupportsRuntimeFiltering private List tasks = null; // lazy cache of tasks private Set filteredLocations = null; - SparkCopyOnWriteScan(SparkSession spark, Table table, SparkReadConf readConf, - Schema expectedSchema, List filters) { + SparkCopyOnWriteScan( + SparkSession spark, + Table table, + SparkReadConf readConf, + Schema expectedSchema, + List filters) { this(spark, table, null, null, readConf, expectedSchema, filters); } - SparkCopyOnWriteScan(SparkSession spark, Table table, TableScan scan, Snapshot snapshot, - SparkReadConf readConf, Schema expectedSchema, List filters) { + SparkCopyOnWriteScan( + SparkSession spark, + Table table, + TableScan scan, + Snapshot snapshot, + SparkReadConf readConf, + Schema expectedSchema, + List filters) { super(spark, table, readConf, expectedSchema, filters); @@ -88,14 +97,15 @@ public Statistics estimateStatistics() { public NamedReference[] filterAttributes() { NamedReference file = Expressions.column(MetadataColumns.FILE_PATH.name()); - return new NamedReference[]{file}; + return new NamedReference[] {file}; } @Override public void filter(Filter[] filters) { for (Filter filter : filters) { // Spark can only pass In filters at the moment - if (filter instanceof In && ((In) filter).attribute().equalsIgnoreCase(MetadataColumns.FILE_PATH.name())) { + if (filter instanceof In + && ((In) filter).attribute().equalsIgnoreCase(MetadataColumns.FILE_PATH.name())) { In in = (In) filter; Set fileLocations = Sets.newHashSet(); @@ -109,9 +119,10 @@ public void filter(Filter[] filters) { if (filteredLocations == null || fileLocations.size() < filteredLocations.size()) { this.tasks = null; this.filteredLocations = fileLocations; - this.files = files().stream() - .filter(file -> fileLocations.contains(file.file().path().toString())) - .collect(Collectors.toList()); + this.files = + files().stream() + .filter(file -> fileLocations.contains(file.file().path().toString())) + .collect(Collectors.toList()); } } } @@ -133,12 +144,12 @@ synchronized List files() { @Override protected synchronized List tasks() { if (tasks == null) { - CloseableIterable splitFiles = TableScanUtil.splitFiles( - CloseableIterable.withNoopClose(files()), - scan.targetSplitSize()); - CloseableIterable scanTasks = TableScanUtil.planTasks( - splitFiles, scan.targetSplitSize(), - scan.splitLookback(), scan.splitOpenFileCost()); + CloseableIterable splitFiles = + TableScanUtil.splitFiles( + CloseableIterable.withNoopClose(files()), 
scan.targetSplitSize()); + CloseableIterable scanTasks = + TableScanUtil.planTasks( + splitFiles, scan.targetSplitSize(), scan.splitLookback(), scan.splitOpenFileCost()); tasks = Lists.newArrayList(scanTasks); } @@ -156,18 +167,22 @@ public boolean equals(Object o) { } SparkCopyOnWriteScan that = (SparkCopyOnWriteScan) o; - return table().name().equals(that.table().name()) && - readSchema().equals(that.readSchema()) && // compare Spark schemas to ignore field ids - filterExpressions().toString().equals(that.filterExpressions().toString()) && - Objects.equals(snapshotId(), that.snapshotId()) && - Objects.equals(filteredLocations, that.filteredLocations); + return table().name().equals(that.table().name()) + && readSchema().equals(that.readSchema()) + && // compare Spark schemas to ignore field ids + filterExpressions().toString().equals(that.filterExpressions().toString()) + && Objects.equals(snapshotId(), that.snapshotId()) + && Objects.equals(filteredLocations, that.filteredLocations); } @Override public int hashCode() { return Objects.hash( - table().name(), readSchema(), filterExpressions().toString(), - snapshotId(), filteredLocations); + table().name(), + readSchema(), + filterExpressions().toString(), + snapshotId(), + filteredLocations); } @Override diff --git a/spark/v3.2/spark/src/main/java/org/apache/iceberg/spark/source/SparkFileWriterFactory.java b/spark/v3.2/spark/src/main/java/org/apache/iceberg/spark/source/SparkFileWriterFactory.java index beaa7c295024..a8c894bfc50c 100644 --- a/spark/v3.2/spark/src/main/java/org/apache/iceberg/spark/source/SparkFileWriterFactory.java +++ b/spark/v3.2/spark/src/main/java/org/apache/iceberg/spark/source/SparkFileWriterFactory.java @@ -16,9 +16,13 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.spark.source; +import static org.apache.iceberg.MetadataColumns.DELETE_FILE_ROW_FIELD_NAME; +import static org.apache.iceberg.TableProperties.DEFAULT_FILE_FORMAT; +import static org.apache.iceberg.TableProperties.DEFAULT_FILE_FORMAT_DEFAULT; +import static org.apache.iceberg.TableProperties.DELETE_DEFAULT_FILE_FORMAT; + import java.util.Locale; import java.util.Map; import org.apache.iceberg.FileFormat; @@ -40,24 +44,35 @@ import org.apache.spark.sql.types.StructType; import org.apache.spark.unsafe.types.UTF8String; -import static org.apache.iceberg.MetadataColumns.DELETE_FILE_ROW_FIELD_NAME; -import static org.apache.iceberg.TableProperties.DEFAULT_FILE_FORMAT; -import static org.apache.iceberg.TableProperties.DEFAULT_FILE_FORMAT_DEFAULT; -import static org.apache.iceberg.TableProperties.DELETE_DEFAULT_FILE_FORMAT; - class SparkFileWriterFactory extends BaseFileWriterFactory { private StructType dataSparkType; private StructType equalityDeleteSparkType; private StructType positionDeleteSparkType; - SparkFileWriterFactory(Table table, FileFormat dataFileFormat, Schema dataSchema, StructType dataSparkType, - SortOrder dataSortOrder, FileFormat deleteFileFormat, - int[] equalityFieldIds, Schema equalityDeleteRowSchema, StructType equalityDeleteSparkType, - SortOrder equalityDeleteSortOrder, Schema positionDeleteRowSchema, - StructType positionDeleteSparkType) { - - super(table, dataFileFormat, dataSchema, dataSortOrder, deleteFileFormat, equalityFieldIds, - equalityDeleteRowSchema, equalityDeleteSortOrder, positionDeleteRowSchema); + SparkFileWriterFactory( + Table table, + FileFormat dataFileFormat, + Schema dataSchema, + StructType dataSparkType, + SortOrder dataSortOrder, + FileFormat deleteFileFormat, + int[] equalityFieldIds, + Schema equalityDeleteRowSchema, + StructType equalityDeleteSparkType, + SortOrder equalityDeleteSortOrder, + Schema positionDeleteRowSchema, + StructType positionDeleteSparkType) { + + super( + table, + dataFileFormat, + dataSchema, + dataSortOrder, + deleteFileFormat, + equalityFieldIds, + equalityDeleteRowSchema, + equalityDeleteSortOrder, + positionDeleteRowSchema); this.dataSparkType = dataSparkType; this.equalityDeleteSparkType = equalityDeleteSparkType; @@ -80,7 +95,8 @@ protected void configureEqualityDelete(Avro.DeleteWriteBuilder builder) { @Override protected void configurePositionDelete(Avro.DeleteWriteBuilder builder) { - boolean withRow = positionDeleteSparkType().getFieldIndex(DELETE_FILE_ROW_FIELD_NAME).isDefined(); + boolean withRow = + positionDeleteSparkType().getFieldIndex(DELETE_FILE_ROW_FIELD_NAME).isDefined(); if (withRow) { // SparkAvroWriter accepts just the Spark type of the row ignoring the path and pos StructField rowField = positionDeleteSparkType().apply(DELETE_FILE_ROW_FIELD_NAME); @@ -96,12 +112,14 @@ protected void configureDataWrite(Parquet.DataWriteBuilder builder) { @Override protected void configureEqualityDelete(Parquet.DeleteWriteBuilder builder) { - builder.createWriterFunc(msgType -> SparkParquetWriters.buildWriter(equalityDeleteSparkType(), msgType)); + builder.createWriterFunc( + msgType -> SparkParquetWriters.buildWriter(equalityDeleteSparkType(), msgType)); } @Override protected void configurePositionDelete(Parquet.DeleteWriteBuilder builder) { - builder.createWriterFunc(msgType -> SparkParquetWriters.buildWriter(positionDeleteSparkType(), msgType)); + builder.createWriterFunc( + msgType -> SparkParquetWriters.buildWriter(positionDeleteSparkType(), msgType)); builder.transformPaths(path 
-> UTF8String.fromString(path.toString())); } @@ -132,7 +150,8 @@ private StructType dataSparkType() { private StructType equalityDeleteSparkType() { if (equalityDeleteSparkType == null) { - Preconditions.checkNotNull(equalityDeleteRowSchema(), "Equality delete schema must not be null"); + Preconditions.checkNotNull( + equalityDeleteRowSchema(), "Equality delete schema must not be null"); this.equalityDeleteSparkType = SparkSchemaUtil.convert(equalityDeleteRowSchema()); } @@ -141,7 +160,8 @@ private StructType equalityDeleteSparkType() { private StructType positionDeleteSparkType() { if (positionDeleteSparkType == null) { - // wrap the optional row schema into the position delete schema that contains path and position + // wrap the optional row schema into the position delete schema that contains path and + // position Schema positionDeleteSchema = DeleteSchemaUtil.posDeleteSchema(positionDeleteRowSchema()); this.positionDeleteSparkType = SparkSchemaUtil.convert(positionDeleteSchema); } @@ -168,10 +188,12 @@ static class Builder { Map properties = table.properties(); - String dataFileFormatName = properties.getOrDefault(DEFAULT_FILE_FORMAT, DEFAULT_FILE_FORMAT_DEFAULT); + String dataFileFormatName = + properties.getOrDefault(DEFAULT_FILE_FORMAT, DEFAULT_FILE_FORMAT_DEFAULT); this.dataFileFormat = FileFormat.valueOf(dataFileFormatName.toUpperCase(Locale.ENGLISH)); - String deleteFileFormatName = properties.getOrDefault(DELETE_DEFAULT_FILE_FORMAT, dataFileFormatName); + String deleteFileFormatName = + properties.getOrDefault(DELETE_DEFAULT_FILE_FORMAT, dataFileFormatName); this.deleteFileFormat = FileFormat.valueOf(deleteFileFormatName.toUpperCase(Locale.ENGLISH)); } @@ -233,13 +255,23 @@ Builder positionDeleteSparkType(StructType newPositionDeleteSparkType) { SparkFileWriterFactory build() { boolean noEqualityDeleteConf = equalityFieldIds == null && equalityDeleteRowSchema == null; boolean fullEqualityDeleteConf = equalityFieldIds != null && equalityDeleteRowSchema != null; - Preconditions.checkArgument(noEqualityDeleteConf || fullEqualityDeleteConf, + Preconditions.checkArgument( + noEqualityDeleteConf || fullEqualityDeleteConf, "Equality field IDs and equality delete row schema must be set together"); return new SparkFileWriterFactory( - table, dataFileFormat, dataSchema, dataSparkType, dataSortOrder, deleteFileFormat, - equalityFieldIds, equalityDeleteRowSchema, equalityDeleteSparkType, equalityDeleteSortOrder, - positionDeleteRowSchema, positionDeleteSparkType); + table, + dataFileFormat, + dataSchema, + dataSparkType, + dataSortOrder, + deleteFileFormat, + equalityFieldIds, + equalityDeleteRowSchema, + equalityDeleteSparkType, + equalityDeleteSortOrder, + positionDeleteRowSchema, + positionDeleteSparkType); } } } diff --git a/spark/v3.2/spark/src/main/java/org/apache/iceberg/spark/source/SparkFilesScan.java b/spark/v3.2/spark/src/main/java/org/apache/iceberg/spark/source/SparkFilesScan.java index 285bdaec851f..d40009c9f899 100644 --- a/spark/v3.2/spark/src/main/java/org/apache/iceberg/spark/source/SparkFilesScan.java +++ b/spark/v3.2/spark/src/main/java/org/apache/iceberg/spark/source/SparkFilesScan.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.spark.source; import java.util.List; @@ -55,16 +54,18 @@ protected List tasks() { if (tasks == null) { FileScanTaskSetManager taskSetManager = FileScanTaskSetManager.get(); List files = taskSetManager.fetchTasks(table(), taskSetID); - ValidationException.check(files != null, + ValidationException.check( + files != null, "Task set manager has no tasks for table %s with id %s", - table(), taskSetID); + table(), + taskSetID); - CloseableIterable splitFiles = TableScanUtil.splitFiles( - CloseableIterable.withNoopClose(files), - splitSize); - CloseableIterable scanTasks = TableScanUtil.planTasks( - splitFiles, splitSize, - splitLookback, splitOpenFileCost); + CloseableIterable splitFiles = + TableScanUtil.splitFiles(CloseableIterable.withNoopClose(files), splitSize); + CloseableIterable scanTasks = + TableScanUtil.planTasks( + splitFiles, splitSize, + splitLookback, splitOpenFileCost); this.tasks = Lists.newArrayList(scanTasks); } @@ -82,11 +83,11 @@ public boolean equals(Object other) { } SparkFilesScan that = (SparkFilesScan) other; - return table().name().equals(that.table().name()) && - Objects.equals(taskSetID, that.taskSetID) && - Objects.equals(splitSize, that.splitSize) && - Objects.equals(splitLookback, that.splitLookback) && - Objects.equals(splitOpenFileCost, that.splitOpenFileCost); + return table().name().equals(that.table().name()) + && Objects.equals(taskSetID, that.taskSetID) + && Objects.equals(splitSize, that.splitSize) + && Objects.equals(splitLookback, that.splitLookback) + && Objects.equals(splitOpenFileCost, that.splitOpenFileCost); } @Override diff --git a/spark/v3.2/spark/src/main/java/org/apache/iceberg/spark/source/SparkFilesScanBuilder.java b/spark/v3.2/spark/src/main/java/org/apache/iceberg/spark/source/SparkFilesScanBuilder.java index 6ae63b37d3b4..03ab3aa062d3 100644 --- a/spark/v3.2/spark/src/main/java/org/apache/iceberg/spark/source/SparkFilesScanBuilder.java +++ b/spark/v3.2/spark/src/main/java/org/apache/iceberg/spark/source/SparkFilesScanBuilder.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.source; import org.apache.iceberg.Table; diff --git a/spark/v3.2/spark/src/main/java/org/apache/iceberg/spark/source/SparkMetadataColumn.java b/spark/v3.2/spark/src/main/java/org/apache/iceberg/spark/source/SparkMetadataColumn.java index 638cb7275638..94f87c28741d 100644 --- a/spark/v3.2/spark/src/main/java/org/apache/iceberg/spark/source/SparkMetadataColumn.java +++ b/spark/v3.2/spark/src/main/java/org/apache/iceberg/spark/source/SparkMetadataColumn.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.source; import org.apache.spark.sql.connector.catalog.MetadataColumn; diff --git a/spark/v3.2/spark/src/main/java/org/apache/iceberg/spark/source/SparkMicroBatchStream.java b/spark/v3.2/spark/src/main/java/org/apache/iceberg/spark/source/SparkMicroBatchStream.java index ed96e10fe6ff..498f8198041f 100644 --- a/spark/v3.2/spark/src/main/java/org/apache/iceberg/spark/source/SparkMicroBatchStream.java +++ b/spark/v3.2/spark/src/main/java/org/apache/iceberg/spark/source/SparkMicroBatchStream.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.spark.source; import java.io.BufferedWriter; @@ -81,8 +80,12 @@ public class SparkMicroBatchStream implements MicroBatchStream { private final boolean skipOverwrite; private final Long fromTimestamp; - SparkMicroBatchStream(JavaSparkContext sparkContext, Table table, SparkReadConf readConf, - Schema expectedSchema, String checkpointLocation) { + SparkMicroBatchStream( + JavaSparkContext sparkContext, + Table table, + SparkReadConf readConf, + Schema expectedSchema, + String checkpointLocation) { this.table = table; this.caseSensitive = readConf.caseSensitive(); this.expectedSchema = SchemaParser.toJson(expectedSchema); @@ -93,7 +96,8 @@ public class SparkMicroBatchStream implements MicroBatchStream { this.splitOpenFileCost = readConf.splitOpenFileCost(); this.fromTimestamp = readConf.streamFromTimestamp(); - InitialOffsetStore initialOffsetStore = new InitialOffsetStore(table, checkpointLocation, fromTimestamp); + InitialOffsetStore initialOffsetStore = + new InitialOffsetStore(table, checkpointLocation, fromTimestamp); this.initialOffset = initialOffsetStore.initialOffset(); this.skipDelete = readConf.streamingSkipDeleteSnapshots(); @@ -112,19 +116,25 @@ public Offset latestOffset() { } Snapshot latestSnapshot = table.currentSnapshot(); - long addedFilesCount = PropertyUtil.propertyAsLong(latestSnapshot.summary(), SnapshotSummary.ADDED_FILES_PROP, -1); - // If snapshotSummary doesn't have SnapshotSummary.ADDED_FILES_PROP, iterate through addedFiles iterator to find + long addedFilesCount = + PropertyUtil.propertyAsLong(latestSnapshot.summary(), SnapshotSummary.ADDED_FILES_PROP, -1); + // If snapshotSummary doesn't have SnapshotSummary.ADDED_FILES_PROP, iterate through addedFiles + // iterator to find // addedFilesCount. - addedFilesCount = addedFilesCount == -1 ? Iterables.size(latestSnapshot.addedFiles()) : addedFilesCount; + addedFilesCount = + addedFilesCount == -1 ? Iterables.size(latestSnapshot.addedFiles()) : addedFilesCount; return new StreamingOffset(latestSnapshot.snapshotId(), addedFilesCount, false); } @Override public InputPartition[] planInputPartitions(Offset start, Offset end) { - Preconditions.checkArgument(end instanceof StreamingOffset, "Invalid end offset: %s is not a StreamingOffset", end); Preconditions.checkArgument( - start instanceof StreamingOffset, "Invalid start offset: %s is not a StreamingOffset", start); + end instanceof StreamingOffset, "Invalid end offset: %s is not a StreamingOffset", end); + Preconditions.checkArgument( + start instanceof StreamingOffset, + "Invalid start offset: %s is not a StreamingOffset", + start); if (end.equals(StreamingOffset.START_OFFSET)) { return new InputPartition[0]; @@ -135,19 +145,25 @@ public InputPartition[] planInputPartitions(Offset start, Offset end) { List fileScanTasks = planFiles(startOffset, endOffset); - CloseableIterable splitTasks = TableScanUtil.splitFiles( - CloseableIterable.withNoopClose(fileScanTasks), - splitSize); - List combinedScanTasks = Lists.newArrayList( - TableScanUtil.planTasks(splitTasks, splitSize, splitLookback, splitOpenFileCost)); + CloseableIterable splitTasks = + TableScanUtil.splitFiles(CloseableIterable.withNoopClose(fileScanTasks), splitSize); + List combinedScanTasks = + Lists.newArrayList( + TableScanUtil.planTasks(splitTasks, splitSize, splitLookback, splitOpenFileCost)); InputPartition[] readTasks = new InputPartition[combinedScanTasks.size()]; Tasks.range(readTasks.length) .stopOnFailure() .executeWith(localityPreferred ? 
ThreadPools.getWorkerPool() : null) - .run(index -> readTasks[index] = new ReadTask( - combinedScanTasks.get(index), tableBroadcast, expectedSchema, - caseSensitive, localityPreferred)); + .run( + index -> + readTasks[index] = + new ReadTask( + combinedScanTasks.get(index), + tableBroadcast, + expectedSchema, + caseSensitive, + localityPreferred)); return readTasks; } @@ -168,17 +184,17 @@ public Offset deserializeOffset(String json) { } @Override - public void commit(Offset end) { - } + public void commit(Offset end) {} @Override - public void stop() { - } + public void stop() {} private List planFiles(StreamingOffset startOffset, StreamingOffset endOffset) { List fileScanTasks = Lists.newArrayList(); - StreamingOffset batchStartOffset = StreamingOffset.START_OFFSET.equals(startOffset) ? - determineStartingOffset(table, fromTimestamp) : startOffset; + StreamingOffset batchStartOffset = + StreamingOffset.START_OFFSET.equals(startOffset) + ? determineStartingOffset(table, fromTimestamp) + : startOffset; StreamingOffset currentOffset = null; @@ -195,10 +211,12 @@ private List planFiles(StreamingOffset startOffset, StreamingOffse continue; } - MicroBatch latestMicroBatch = MicroBatches.from(table.snapshot(currentOffset.snapshotId()), table.io()) - .caseSensitive(caseSensitive) - .specsById(table.specs()) - .generate(currentOffset.position(), Long.MAX_VALUE, currentOffset.shouldScanAllFiles()); + MicroBatch latestMicroBatch = + MicroBatches.from(table.snapshot(currentOffset.snapshotId()), table.io()) + .caseSensitive(caseSensitive) + .specsById(table.specs()) + .generate( + currentOffset.position(), Long.MAX_VALUE, currentOffset.shouldScanAllFiles()); fileScanTasks.addAll(latestMicroBatch.tasks()); } while (currentOffset.snapshotId() != endOffset.snapshotId()); @@ -214,19 +232,24 @@ private boolean shouldProcess(Snapshot snapshot) { case DataOperations.REPLACE: return false; case DataOperations.DELETE: - Preconditions.checkState(skipDelete, + Preconditions.checkState( + skipDelete, "Cannot process delete snapshot: %s, to ignore deletes, set %s=true", - snapshot.snapshotId(), SparkReadOptions.STREAMING_SKIP_DELETE_SNAPSHOTS); + snapshot.snapshotId(), + SparkReadOptions.STREAMING_SKIP_DELETE_SNAPSHOTS); return false; case DataOperations.OVERWRITE: - Preconditions.checkState(skipOverwrite, + Preconditions.checkState( + skipOverwrite, "Cannot process overwrite snapshot: %s, to ignore overwrites, set %s=true", - snapshot.snapshotId(), SparkReadOptions.STREAMING_SKIP_OVERWRITE_SNAPSHOTS); + snapshot.snapshotId(), + SparkReadOptions.STREAMING_SKIP_OVERWRITE_SNAPSHOTS); return false; default: - throw new IllegalStateException(String.format( - "Cannot process unknown snapshot operation: %s (snapshot id %s)", - op.toLowerCase(Locale.ROOT), snapshot.snapshotId())); + throw new IllegalStateException( + String.format( + "Cannot process unknown snapshot operation: %s (snapshot id %s)", + op.toLowerCase(Locale.ROOT), snapshot.snapshotId())); } } @@ -287,7 +310,8 @@ public StreamingOffset initialOffset() { private void writeOffset(StreamingOffset offset, OutputFile file) { try (OutputStream outputStream = file.create()) { - BufferedWriter writer = new BufferedWriter(new OutputStreamWriter(outputStream, StandardCharsets.UTF_8)); + BufferedWriter writer = + new BufferedWriter(new OutputStreamWriter(outputStream, StandardCharsets.UTF_8)); writer.write(offset.json()); writer.flush(); } catch (IOException ioException) { diff --git 
a/spark/v3.2/spark/src/main/java/org/apache/iceberg/spark/source/SparkPartitionedFanoutWriter.java b/spark/v3.2/spark/src/main/java/org/apache/iceberg/spark/source/SparkPartitionedFanoutWriter.java index d38ae2f40316..f17cd260f928 100644 --- a/spark/v3.2/spark/src/main/java/org/apache/iceberg/spark/source/SparkPartitionedFanoutWriter.java +++ b/spark/v3.2/spark/src/main/java/org/apache/iceberg/spark/source/SparkPartitionedFanoutWriter.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.source; import org.apache.iceberg.FileFormat; @@ -34,10 +33,15 @@ public class SparkPartitionedFanoutWriter extends PartitionedFanoutWriter appenderFactory, - OutputFileFactory fileFactory, FileIO io, long targetFileSize, - Schema schema, StructType sparkSchema) { + OutputFileFactory fileFactory, + FileIO io, + long targetFileSize, + Schema schema, + StructType sparkSchema) { super(spec, format, appenderFactory, fileFactory, io, targetFileSize); this.partitionKey = new PartitionKey(spec, schema); this.internalRowWrapper = new InternalRowWrapper(sparkSchema); diff --git a/spark/v3.2/spark/src/main/java/org/apache/iceberg/spark/source/SparkPartitionedWriter.java b/spark/v3.2/spark/src/main/java/org/apache/iceberg/spark/source/SparkPartitionedWriter.java index f81a09926d85..a86091644360 100644 --- a/spark/v3.2/spark/src/main/java/org/apache/iceberg/spark/source/SparkPartitionedWriter.java +++ b/spark/v3.2/spark/src/main/java/org/apache/iceberg/spark/source/SparkPartitionedWriter.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.source; import org.apache.iceberg.FileFormat; @@ -34,10 +33,15 @@ public class SparkPartitionedWriter extends PartitionedWriter { private final PartitionKey partitionKey; private final InternalRowWrapper internalRowWrapper; - public SparkPartitionedWriter(PartitionSpec spec, FileFormat format, - FileAppenderFactory appenderFactory, - OutputFileFactory fileFactory, FileIO io, long targetFileSize, - Schema schema, StructType sparkSchema) { + public SparkPartitionedWriter( + PartitionSpec spec, + FileFormat format, + FileAppenderFactory appenderFactory, + OutputFileFactory fileFactory, + FileIO io, + long targetFileSize, + Schema schema, + StructType sparkSchema) { super(spec, format, appenderFactory, fileFactory, io, targetFileSize); this.partitionKey = new PartitionKey(spec, schema); this.internalRowWrapper = new InternalRowWrapper(sparkSchema); diff --git a/spark/v3.2/spark/src/main/java/org/apache/iceberg/spark/source/SparkPositionDeltaOperation.java b/spark/v3.2/spark/src/main/java/org/apache/iceberg/spark/source/SparkPositionDeltaOperation.java index b8c3cec020d3..bbe716484daa 100644 --- a/spark/v3.2/spark/src/main/java/org/apache/iceberg/spark/source/SparkPositionDeltaOperation.java +++ b/spark/v3.2/spark/src/main/java/org/apache/iceberg/spark/source/SparkPositionDeltaOperation.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.spark.source; import org.apache.iceberg.IsolationLevel; @@ -46,8 +45,8 @@ class SparkPositionDeltaOperation implements RowLevelOperation, SupportsDelta { private Scan configuredScan; private DeltaWriteBuilder lazyWriteBuilder; - SparkPositionDeltaOperation(SparkSession spark, Table table, RowLevelOperationInfo info, - IsolationLevel isolationLevel) { + SparkPositionDeltaOperation( + SparkSession spark, Table table, RowLevelOperationInfo info, IsolationLevel isolationLevel) { this.spark = spark; this.table = table; this.command = info.command(); @@ -62,14 +61,15 @@ public Command command() { @Override public ScanBuilder newScanBuilder(CaseInsensitiveStringMap options) { if (lazyScanBuilder == null) { - this.lazyScanBuilder = new SparkScanBuilder(spark, table, options) { - @Override - public Scan build() { - Scan scan = super.buildMergeOnReadScan(); - SparkPositionDeltaOperation.this.configuredScan = scan; - return scan; - } - }; + this.lazyScanBuilder = + new SparkScanBuilder(spark, table, options) { + @Override + public Scan build() { + Scan scan = super.buildMergeOnReadScan(); + SparkPositionDeltaOperation.this.configuredScan = scan; + return scan; + } + }; } return lazyScanBuilder; @@ -80,9 +80,9 @@ public DeltaWriteBuilder newWriteBuilder(ExtendedLogicalWriteInfo info) { if (lazyWriteBuilder == null) { // don't validate the scan is not null as if the condition evaluates to false, // the optimizer replaces the original scan relation with a local relation - lazyWriteBuilder = new SparkPositionDeltaWriteBuilder( - spark, table, command, configuredScan, - isolationLevel, info); + lazyWriteBuilder = + new SparkPositionDeltaWriteBuilder( + spark, table, command, configuredScan, isolationLevel, info); } return lazyWriteBuilder; @@ -92,13 +92,13 @@ public DeltaWriteBuilder newWriteBuilder(ExtendedLogicalWriteInfo info) { public NamedReference[] requiredMetadataAttributes() { NamedReference specId = Expressions.column(MetadataColumns.SPEC_ID.name()); NamedReference partition = Expressions.column(MetadataColumns.PARTITION_COLUMN_NAME); - return new NamedReference[]{specId, partition}; + return new NamedReference[] {specId, partition}; } @Override public NamedReference[] rowId() { NamedReference file = Expressions.column(MetadataColumns.FILE_PATH.name()); NamedReference pos = Expressions.column(MetadataColumns.ROW_POSITION.name()); - return new NamedReference[]{file, pos}; + return new NamedReference[] {file, pos}; } } diff --git a/spark/v3.2/spark/src/main/java/org/apache/iceberg/spark/source/SparkPositionDeltaWrite.java b/spark/v3.2/spark/src/main/java/org/apache/iceberg/spark/source/SparkPositionDeltaWrite.java index 919a90073929..f819cd31fd5a 100644 --- a/spark/v3.2/spark/src/main/java/org/apache/iceberg/spark/source/SparkPositionDeltaWrite.java +++ b/spark/v3.2/spark/src/main/java/org/apache/iceberg/spark/source/SparkPositionDeltaWrite.java @@ -16,9 +16,13 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.spark.source; +import static org.apache.iceberg.IsolationLevel.SERIALIZABLE; +import static org.apache.spark.sql.connector.iceberg.write.RowLevelOperation.Command.DELETE; +import static org.apache.spark.sql.connector.iceberg.write.RowLevelOperation.Command.MERGE; +import static org.apache.spark.sql.connector.iceberg.write.RowLevelOperation.Command.UPDATE; + import java.io.IOException; import java.io.Serializable; import java.util.Arrays; @@ -80,11 +84,6 @@ import org.slf4j.Logger; import org.slf4j.LoggerFactory; -import static org.apache.iceberg.IsolationLevel.SERIALIZABLE; -import static org.apache.spark.sql.connector.iceberg.write.RowLevelOperation.Command.DELETE; -import static org.apache.spark.sql.connector.iceberg.write.RowLevelOperation.Command.MERGE; -import static org.apache.spark.sql.connector.iceberg.write.RowLevelOperation.Command.UPDATE; - class SparkPositionDeltaWrite implements DeltaWrite, RequiresDistributionAndOrdering { private static final Logger LOG = LoggerFactory.getLogger(SparkPositionDeltaWrite.class); @@ -104,10 +103,17 @@ class SparkPositionDeltaWrite implements DeltaWrite, RequiresDistributionAndOrde private boolean cleanupOnAbort = true; - SparkPositionDeltaWrite(SparkSession spark, Table table, Command command, SparkBatchQueryScan scan, - IsolationLevel isolationLevel, SparkWriteConf writeConf, - ExtendedLogicalWriteInfo info, Schema dataSchema, - Distribution requiredDistribution, SortOrder[] requiredOrdering) { + SparkPositionDeltaWrite( + SparkSession spark, + Table table, + Command command, + SparkBatchQueryScan scan, + IsolationLevel isolationLevel, + SparkWriteConf writeConf, + ExtendedLogicalWriteInfo info, + Schema dataSchema, + Distribution requiredDistribution, + SortOrder[] requiredOrdering) { this.sparkContext = JavaSparkContext.fromSparkContext(spark.sparkContext()); this.table = table; this.command = command; @@ -150,7 +156,8 @@ private class PositionDeltaBatchWrite implements DeltaBatchWrite { @Override public DeltaWriterFactory createBatchWriterFactory(PhysicalWriteInfo info) { // broadcast the table metadata as the writer factory will be sent to executors - Broadcast
<Table> tableBroadcast = sparkContext.broadcast(SerializableTableWithSize.copyOf(table)); + Broadcast<Table>
    tableBroadcast = + sparkContext.broadcast(SerializableTableWithSize.copyOf(table)); return new PositionDeltaWriteFactory(tableBroadcast, command, context); } @@ -179,8 +186,10 @@ public void commit(WriterCommitMessage[] messages) { referencedDataFiles.addAll(Arrays.asList(taskCommit.referencedDataFiles())); } - // the scan may be null if the optimizer replaces it with an empty relation (e.g. the cond is false) - // no validation is needed in this case as the command does not depend on the scanned table state + // the scan may be null if the optimizer replaces it with an empty relation (e.g. the cond is + // false) + // no validation is needed in this case as the command does not depend on the scanned table + // state if (scan != null) { Expression conflictDetectionFilter = conflictDetectionFilter(scan); rowDelta.conflictDetectionFilter(conflictDetectionFilter); @@ -202,16 +211,22 @@ public void commit(WriterCommitMessage[] messages) { rowDelta.validateNoConflictingDataFiles(); } - String commitMsg = String.format( - "position delta with %d data files and %d delete files " + - "(scanSnapshotId: %d, conflictDetectionFilter: %s, isolationLevel: %s)", - addedDataFilesCount, addedDeleteFilesCount, scan.snapshotId(), conflictDetectionFilter, isolationLevel); + String commitMsg = + String.format( + "position delta with %d data files and %d delete files " + + "(scanSnapshotId: %d, conflictDetectionFilter: %s, isolationLevel: %s)", + addedDataFilesCount, + addedDeleteFilesCount, + scan.snapshotId(), + conflictDetectionFilter, + isolationLevel); commitOperation(rowDelta, commitMsg); } else { - String commitMsg = String.format( - "position delta with %d data files and %d delete files (no validation required)", - addedDataFilesCount, addedDeleteFilesCount); + String commitMsg = + String.format( + "position delta with %d data files and %d delete files (no validation required)", + addedDataFilesCount, addedDeleteFilesCount); commitOperation(rowDelta, commitMsg); } } @@ -317,29 +332,34 @@ private static class PositionDeltaWriteFactory implements DeltaWriterFactory { public DeltaWriter createWriter(int partitionId, long taskId) { Table table = tableBroadcast.value(); - OutputFileFactory dataFileFactory = OutputFileFactory.builderFor(table, partitionId, taskId) - .format(context.dataFileFormat()) - .build(); - OutputFileFactory deleteFileFactory = OutputFileFactory.builderFor(table, partitionId, taskId) - .format(context.deleteFileFormat()) - .build(); - - SparkFileWriterFactory writerFactory = SparkFileWriterFactory.builderFor(table) - .dataFileFormat(context.dataFileFormat()) - .dataSchema(context.dataSchema()) - .dataSparkType(context.dataSparkType()) - .deleteFileFormat(context.deleteFileFormat()) - .positionDeleteSparkType(context.deleteSparkType()) - .build(); + OutputFileFactory dataFileFactory = + OutputFileFactory.builderFor(table, partitionId, taskId) + .format(context.dataFileFormat()) + .build(); + OutputFileFactory deleteFileFactory = + OutputFileFactory.builderFor(table, partitionId, taskId) + .format(context.deleteFileFormat()) + .build(); + + SparkFileWriterFactory writerFactory = + SparkFileWriterFactory.builderFor(table) + .dataFileFormat(context.dataFileFormat()) + .dataSchema(context.dataSchema()) + .dataSparkType(context.dataSparkType()) + .deleteFileFormat(context.deleteFileFormat()) + .positionDeleteSparkType(context.deleteSparkType()) + .build(); if (command == DELETE) { return new DeleteOnlyDeltaWriter(table, writerFactory, deleteFileFactory, context); } else if 
(table.spec().isUnpartitioned()) { - return new UnpartitionedDeltaWriter(table, writerFactory, dataFileFactory, deleteFileFactory, context); + return new UnpartitionedDeltaWriter( + table, writerFactory, dataFileFactory, deleteFileFactory, context); } else { - return new PartitionedDeltaWriter(table, writerFactory, dataFileFactory, deleteFileFactory, context); + return new PartitionedDeltaWriter( + table, writerFactory, dataFileFactory, deleteFileFactory, context); } } } @@ -351,12 +371,13 @@ protected InternalRowWrapper initPartitionRowWrapper(Types.StructType partitionT return new InternalRowWrapper(sparkPartitionType); } - protected Map buildPartitionProjections(Types.StructType partitionType, - Map specs) { + protected Map buildPartitionProjections( + Types.StructType partitionType, Map specs) { Map partitionProjections = Maps.newHashMap(); - specs.forEach((specID, spec) -> - partitionProjections.put(specID, StructProjection.create(partitionType, spec.partitionType())) - ); + specs.forEach( + (specID, spec) -> + partitionProjections.put( + specID, StructProjection.create(partitionType, spec.partitionType()))); return partitionProjections; } } @@ -375,11 +396,15 @@ private static class DeleteOnlyDeltaWriter extends BaseDeltaWriter { private boolean closed = false; - DeleteOnlyDeltaWriter(Table table, SparkFileWriterFactory writerFactory, - OutputFileFactory deleteFileFactory, Context context) { + DeleteOnlyDeltaWriter( + Table table, + SparkFileWriterFactory writerFactory, + OutputFileFactory deleteFileFactory, + Context context) { - this.delegate = new ClusteredPositionDeleteWriter<>( - writerFactory, deleteFileFactory, table.io(), context.targetDeleteFileSize()); + this.delegate = + new ClusteredPositionDeleteWriter<>( + writerFactory, deleteFileFactory, table.io(), context.targetDeleteFileSize()); this.positionDelete = PositionDelete.create(); this.io = table.io(); this.specs = table.specs(); @@ -389,9 +414,11 @@ private static class DeleteOnlyDeltaWriter extends BaseDeltaWriter { this.partitionProjections = buildPartitionProjections(partitionType, specs); this.specIdOrdinal = context.metadataSparkType().fieldIndex(MetadataColumns.SPEC_ID.name()); - this.partitionOrdinal = context.metadataSparkType().fieldIndex(MetadataColumns.PARTITION_COLUMN_NAME); + this.partitionOrdinal = + context.metadataSparkType().fieldIndex(MetadataColumns.PARTITION_COLUMN_NAME); this.fileOrdinal = context.deleteSparkType().fieldIndex(MetadataColumns.FILE_PATH.name()); - this.positionOrdinal = context.deleteSparkType().fieldIndex(MetadataColumns.ROW_POSITION.name()); + this.positionOrdinal = + context.deleteSparkType().fieldIndex(MetadataColumns.ROW_POSITION.name()); } @Override @@ -411,12 +438,14 @@ public void delete(InternalRow metadata, InternalRow id) throws IOException { @Override public void update(InternalRow metadata, InternalRow id, InternalRow row) { - throw new UnsupportedOperationException(this.getClass().getName() + " does not implement update"); + throw new UnsupportedOperationException( + this.getClass().getName() + " does not implement update"); } @Override public void insert(InternalRow row) throws IOException { - throw new UnsupportedOperationException(this.getClass().getName() + " does not implement insert"); + throw new UnsupportedOperationException( + this.getClass().getName() + " does not implement insert"); } @Override @@ -458,13 +487,17 @@ private abstract static class DeleteAndDataDeltaWriter extends BaseDeltaWriter { private boolean closed = false; - DeleteAndDataDeltaWriter(Table 
table, SparkFileWriterFactory writerFactory, - OutputFileFactory dataFileFactory, OutputFileFactory deleteFileFactory, - Context context) { - this.delegate = new BasePositionDeltaWriter<>( - newInsertWriter(table, writerFactory, dataFileFactory, context), - newUpdateWriter(table, writerFactory, dataFileFactory, context), - newDeleteWriter(table, writerFactory, deleteFileFactory, context)); + DeleteAndDataDeltaWriter( + Table table, + SparkFileWriterFactory writerFactory, + OutputFileFactory dataFileFactory, + OutputFileFactory deleteFileFactory, + Context context) { + this.delegate = + new BasePositionDeltaWriter<>( + newInsertWriter(table, writerFactory, dataFileFactory, context), + newUpdateWriter(table, writerFactory, dataFileFactory, context), + newDeleteWriter(table, writerFactory, deleteFileFactory, context)); this.io = table.io(); this.specs = table.specs(); @@ -473,9 +506,11 @@ private abstract static class DeleteAndDataDeltaWriter extends BaseDeltaWriter { this.deletePartitionProjections = buildPartitionProjections(partitionType, specs); this.specIdOrdinal = context.metadataSparkType().fieldIndex(MetadataColumns.SPEC_ID.name()); - this.partitionOrdinal = context.metadataSparkType().fieldIndex(MetadataColumns.PARTITION_COLUMN_NAME); + this.partitionOrdinal = + context.metadataSparkType().fieldIndex(MetadataColumns.PARTITION_COLUMN_NAME); this.fileOrdinal = context.deleteSparkType().fieldIndex(MetadataColumns.FILE_PATH.name()); - this.positionOrdinal = context.deleteSparkType().fieldIndex(MetadataColumns.ROW_POSITION.name()); + this.positionOrdinal = + context.deleteSparkType().fieldIndex(MetadataColumns.ROW_POSITION.name()); } @Override @@ -517,10 +552,11 @@ public void close() throws IOException { } } - private PartitioningWriter newInsertWriter(Table table, - SparkFileWriterFactory writerFactory, - OutputFileFactory fileFactory, - Context context) { + private PartitioningWriter newInsertWriter( + Table table, + SparkFileWriterFactory writerFactory, + OutputFileFactory fileFactory, + Context context) { long targetFileSize = context.targetDataFileSize(); if (table.spec().isPartitioned() && context.fanoutWriterEnabled()) { @@ -530,10 +566,11 @@ private PartitioningWriter newInsertWriter(Table t } } - private PartitioningWriter newUpdateWriter(Table table, - SparkFileWriterFactory writerFactory, - OutputFileFactory fileFactory, - Context context) { + private PartitioningWriter newUpdateWriter( + Table table, + SparkFileWriterFactory writerFactory, + OutputFileFactory fileFactory, + Context context) { long targetFileSize = context.targetDataFileSize(); if (table.spec().isPartitioned()) { @@ -544,21 +581,26 @@ private PartitioningWriter newUpdateWriter(Table t } } - private ClusteredPositionDeleteWriter newDeleteWriter(Table table, - SparkFileWriterFactory writerFactory, - OutputFileFactory fileFactory, - Context context) { + private ClusteredPositionDeleteWriter newDeleteWriter( + Table table, + SparkFileWriterFactory writerFactory, + OutputFileFactory fileFactory, + Context context) { long targetFileSize = context.targetDeleteFileSize(); - return new ClusteredPositionDeleteWriter<>(writerFactory, fileFactory, table.io(), targetFileSize); + return new ClusteredPositionDeleteWriter<>( + writerFactory, fileFactory, table.io(), targetFileSize); } } private static class UnpartitionedDeltaWriter extends DeleteAndDataDeltaWriter { private final PartitionSpec dataSpec; - UnpartitionedDeltaWriter(Table table, SparkFileWriterFactory writerFactory, - OutputFileFactory dataFileFactory, 
OutputFileFactory deleteFileFactory, - Context context) { + UnpartitionedDeltaWriter( + Table table, + SparkFileWriterFactory writerFactory, + OutputFileFactory dataFileFactory, + OutputFileFactory deleteFileFactory, + Context context) { super(table, writerFactory, dataFileFactory, deleteFileFactory, context); this.dataSpec = table.spec(); } @@ -580,9 +622,12 @@ private static class PartitionedDeltaWriter extends DeleteAndDataDeltaWriter { private final PartitionKey dataPartitionKey; private final InternalRowWrapper internalRowDataWrapper; - PartitionedDeltaWriter(Table table, SparkFileWriterFactory writerFactory, - OutputFileFactory dataFileFactory, OutputFileFactory deleteFileFactory, - Context context) { + PartitionedDeltaWriter( + Table table, + SparkFileWriterFactory writerFactory, + OutputFileFactory dataFileFactory, + OutputFileFactory deleteFileFactory, + Context context) { super(table, writerFactory, dataFileFactory, deleteFileFactory, context); this.dataSpec = table.spec(); diff --git a/spark/v3.2/spark/src/main/java/org/apache/iceberg/spark/source/SparkPositionDeltaWriteBuilder.java b/spark/v3.2/spark/src/main/java/org/apache/iceberg/spark/source/SparkPositionDeltaWriteBuilder.java index 17e6ec34c42e..eca2ce221bc3 100644 --- a/spark/v3.2/spark/src/main/java/org/apache/iceberg/spark/source/SparkPositionDeltaWriteBuilder.java +++ b/spark/v3.2/spark/src/main/java/org/apache/iceberg/spark/source/SparkPositionDeltaWriteBuilder.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.source; import org.apache.iceberg.DistributionMode; @@ -43,10 +42,8 @@ class SparkPositionDeltaWriteBuilder implements DeltaWriteBuilder { - private static final Schema EXPECTED_ROW_ID_SCHEMA = new Schema( - MetadataColumns.FILE_PATH, - MetadataColumns.ROW_POSITION - ); + private static final Schema EXPECTED_ROW_ID_SCHEMA = + new Schema(MetadataColumns.FILE_PATH, MetadataColumns.ROW_POSITION); private final SparkSession spark; private final Table table; @@ -59,8 +56,13 @@ class SparkPositionDeltaWriteBuilder implements DeltaWriteBuilder { private final boolean checkNullability; private final boolean checkOrdering; - SparkPositionDeltaWriteBuilder(SparkSession spark, Table table, Command command, Scan scan, - IsolationLevel isolationLevel, ExtendedLogicalWriteInfo info) { + SparkPositionDeltaWriteBuilder( + SparkSession spark, + Table table, + Command command, + Scan scan, + IsolationLevel isolationLevel, + ExtendedLogicalWriteInfo info) { this.spark = spark; this.table = table; this.command = command; @@ -75,7 +77,8 @@ class SparkPositionDeltaWriteBuilder implements DeltaWriteBuilder { @Override public DeltaWrite build() { - Preconditions.checkArgument(handleTimestampWithoutZone || !SparkUtil.hasTimestampWithoutZone(table.schema()), + Preconditions.checkArgument( + handleTimestampWithoutZone || !SparkUtil.hasTimestampWithoutZone(table.schema()), SparkUtil.TIMESTAMP_WITHOUT_TIMEZONE_ERROR); Schema dataSchema = dataSchema(); @@ -84,23 +87,35 @@ public DeltaWrite build() { } Schema rowIdSchema = SparkSchemaUtil.convert(EXPECTED_ROW_ID_SCHEMA, info.rowIdSchema()); - TypeUtil.validateSchema("row ID", EXPECTED_ROW_ID_SCHEMA, rowIdSchema, checkNullability, checkOrdering); + TypeUtil.validateSchema( + "row ID", EXPECTED_ROW_ID_SCHEMA, rowIdSchema, checkNullability, checkOrdering); - NestedField partition = MetadataColumns.metadataColumn(table, MetadataColumns.PARTITION_COLUMN_NAME); + NestedField partition = + 
MetadataColumns.metadataColumn(table, MetadataColumns.PARTITION_COLUMN_NAME); Schema expectedMetadataSchema = new Schema(MetadataColumns.SPEC_ID, partition); Schema metadataSchema = SparkSchemaUtil.convert(expectedMetadataSchema, info.metadataSchema()); - TypeUtil.validateSchema("metadata", expectedMetadataSchema, metadataSchema, checkNullability, checkOrdering); + TypeUtil.validateSchema( + "metadata", expectedMetadataSchema, metadataSchema, checkNullability, checkOrdering); SparkUtil.validatePartitionTransforms(table.spec()); - Distribution distribution = SparkDistributionAndOrderingUtil.buildPositionDeltaDistribution( - table, command, distributionMode()); - SortOrder[] ordering = SparkDistributionAndOrderingUtil.buildPositionDeltaOrdering( - table, command); + Distribution distribution = + SparkDistributionAndOrderingUtil.buildPositionDeltaDistribution( + table, command, distributionMode()); + SortOrder[] ordering = + SparkDistributionAndOrderingUtil.buildPositionDeltaOrdering(table, command); return new SparkPositionDeltaWrite( - spark, table, command, scan, isolationLevel, writeConf, - info, dataSchema, distribution, ordering); + spark, + table, + command, + scan, + isolationLevel, + writeConf, + info, + dataSchema, + distribution, + ordering); } private Schema dataSchema() { diff --git a/spark/v3.2/spark/src/main/java/org/apache/iceberg/spark/source/SparkRowLevelOperationBuilder.java b/spark/v3.2/spark/src/main/java/org/apache/iceberg/spark/source/SparkRowLevelOperationBuilder.java index 453e06568fa6..748fe9bca809 100644 --- a/spark/v3.2/spark/src/main/java/org/apache/iceberg/spark/source/SparkRowLevelOperationBuilder.java +++ b/spark/v3.2/spark/src/main/java/org/apache/iceberg/spark/source/SparkRowLevelOperationBuilder.java @@ -16,19 +16,8 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.spark.source; -import java.util.Map; -import org.apache.iceberg.IsolationLevel; -import org.apache.iceberg.RowLevelOperationMode; -import org.apache.iceberg.Table; -import org.apache.spark.sql.SparkSession; -import org.apache.spark.sql.connector.iceberg.write.RowLevelOperation; -import org.apache.spark.sql.connector.iceberg.write.RowLevelOperation.Command; -import org.apache.spark.sql.connector.iceberg.write.RowLevelOperationBuilder; -import org.apache.spark.sql.connector.iceberg.write.RowLevelOperationInfo; - import static org.apache.iceberg.TableProperties.DELETE_ISOLATION_LEVEL; import static org.apache.iceberg.TableProperties.DELETE_ISOLATION_LEVEL_DEFAULT; import static org.apache.iceberg.TableProperties.DELETE_MODE; @@ -42,6 +31,16 @@ import static org.apache.iceberg.TableProperties.UPDATE_MODE; import static org.apache.iceberg.TableProperties.UPDATE_MODE_DEFAULT; +import java.util.Map; +import org.apache.iceberg.IsolationLevel; +import org.apache.iceberg.RowLevelOperationMode; +import org.apache.iceberg.Table; +import org.apache.spark.sql.SparkSession; +import org.apache.spark.sql.connector.iceberg.write.RowLevelOperation; +import org.apache.spark.sql.connector.iceberg.write.RowLevelOperation.Command; +import org.apache.spark.sql.connector.iceberg.write.RowLevelOperationBuilder; +import org.apache.spark.sql.connector.iceberg.write.RowLevelOperationInfo; + class SparkRowLevelOperationBuilder implements RowLevelOperationBuilder { private final SparkSession spark; diff --git a/spark/v3.2/spark/src/main/java/org/apache/iceberg/spark/source/SparkScan.java b/spark/v3.2/spark/src/main/java/org/apache/iceberg/spark/source/SparkScan.java index b9292541eaeb..f26daa55b2b3 100644 --- a/spark/v3.2/spark/src/main/java/org/apache/iceberg/spark/source/SparkScan.java +++ b/spark/v3.2/spark/src/main/java/org/apache/iceberg/spark/source/SparkScan.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.spark.source; import java.io.Serializable; @@ -69,8 +68,12 @@ abstract class SparkScan extends SparkBatch implements Scan, SupportsReportStati // lazy variables private StructType readSchema; - SparkScan(SparkSession spark, Table table, SparkReadConf readConf, - Schema expectedSchema, List filters) { + SparkScan( + SparkSession spark, + Table table, + SparkReadConf readConf, + Schema expectedSchema, + List filters) { super(spark, table, readConf, expectedSchema); SparkSchemaUtil.validateMetadataColumnReferences(table.schema(), expectedSchema); @@ -106,14 +109,16 @@ public Batch toBatch() { @Override public MicroBatchStream toMicroBatchStream(String checkpointLocation) { - return new SparkMicroBatchStream(sparkContext(), table, readConf, expectedSchema, checkpointLocation); + return new SparkMicroBatchStream( + sparkContext(), table, readConf, expectedSchema, checkpointLocation); } @Override public StructType readSchema() { if (readSchema == null) { - Preconditions.checkArgument(readTimestampWithoutZone || !SparkUtil.hasTimestampWithoutZone(expectedSchema), - SparkUtil.TIMESTAMP_WITHOUT_TIMEZONE_ERROR); + Preconditions.checkArgument( + readTimestampWithoutZone || !SparkUtil.hasTimestampWithoutZone(expectedSchema), + SparkUtil.TIMESTAMP_WITHOUT_TIMEZONE_ERROR); this.readSchema = SparkSchemaUtil.convert(expectedSchema); } return readSchema; @@ -130,14 +135,14 @@ protected Statistics estimateStatistics(Snapshot snapshot) { return new Stats(0L, 0L); } - // estimate stats using snapshot summary only for partitioned tables (metadata tables are unpartitioned) + // estimate stats using snapshot summary only for partitioned tables (metadata tables are + // unpartitioned) if (!table.spec().isUnpartitioned() && filterExpressions.isEmpty()) { LOG.debug("using table metadata to estimate table statistics"); - long totalRecords = PropertyUtil.propertyAsLong(snapshot.summary(), - SnapshotSummary.TOTAL_RECORDS_PROP, Long.MAX_VALUE); - return new Stats( - SparkSchemaUtil.estimateSize(readSchema(), totalRecords), - totalRecords); + long totalRecords = + PropertyUtil.propertyAsLong( + snapshot.summary(), SnapshotSummary.TOTAL_RECORDS_PROP, Long.MAX_VALUE); + return new Stats(SparkSchemaUtil.estimateSize(readSchema(), totalRecords), totalRecords); } long numRows = 0L; @@ -156,7 +161,8 @@ protected Statistics estimateStatistics(Snapshot snapshot) { @Override public String description() { - String filters = filterExpressions.stream().map(Spark3Util::describe).collect(Collectors.joining(", ")); + String filters = + filterExpressions.stream().map(Spark3Util::describe).collect(Collectors.joining(", ")); return String.format("%s [filters=%s]", table, filters); } @@ -197,7 +203,8 @@ private static class RowReader extends RowDataReader implements PartitionReader< } } - private static class BatchReader extends BatchDataReader implements PartitionReader { + private static class BatchReader extends BatchDataReader + implements PartitionReader { BatchReader(ReadTask task, int batchSize) { super(task.task, task.table(), task.expectedSchema(), task.isCaseSensitive(), batchSize); } @@ -212,8 +219,12 @@ static class ReadTask implements InputPartition, Serializable { private transient Schema expectedSchema = null; private transient String[] preferredLocations = null; - ReadTask(CombinedScanTask task, Broadcast
<Table> tableBroadcast, String expectedSchemaString, - boolean caseSensitive, boolean localityPreferred) { + ReadTask( + CombinedScanTask task, + Broadcast<Table>
    tableBroadcast, + String expectedSchemaString, + boolean caseSensitive, + boolean localityPreferred) { this.task = task; this.tableBroadcast = tableBroadcast; this.expectedSchemaString = expectedSchemaString; diff --git a/spark/v3.2/spark/src/main/java/org/apache/iceberg/spark/source/SparkScanBuilder.java b/spark/v3.2/spark/src/main/java/org/apache/iceberg/spark/source/SparkScanBuilder.java index 60d2c2150bb1..21c34ed6f628 100644 --- a/spark/v3.2/spark/src/main/java/org/apache/iceberg/spark/source/SparkScanBuilder.java +++ b/spark/v3.2/spark/src/main/java/org/apache/iceberg/spark/source/SparkScanBuilder.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.source; import java.util.List; @@ -55,8 +54,11 @@ import org.slf4j.Logger; import org.slf4j.LoggerFactory; -public class SparkScanBuilder implements ScanBuilder, SupportsPushDownFilters, SupportsPushDownRequiredColumns, - SupportsReportStatistics { +public class SparkScanBuilder + implements ScanBuilder, + SupportsPushDownFilters, + SupportsPushDownRequiredColumns, + SupportsReportStatistics { private static final Logger LOG = LoggerFactory.getLogger(SparkScanBuilder.class); private static final Filter[] NO_FILTERS = new Filter[0]; @@ -72,7 +74,8 @@ public class SparkScanBuilder implements ScanBuilder, SupportsPushDownFilters, S private List filterExpressions = null; private Filter[] pushedFilters = NO_FILTERS; - SparkScanBuilder(SparkSession spark, Table table, Schema schema, CaseInsensitiveStringMap options) { + SparkScanBuilder( + SparkSession spark, Table table, Schema schema, CaseInsensitiveStringMap options) { this.spark = spark; this.table = table; this.schema = schema; @@ -108,8 +111,10 @@ public Filter[] pushFilters(Filter[] filters) { expr = SparkFilters.convert(filter); } catch (IllegalArgumentException e) { // converting to Iceberg Expression failed, so this expression cannot be pushed down - LOG.info("Failed to convert filter to Iceberg expression, skipping push down for this expression: {}. {}", - filter, e.getMessage()); + LOG.info( + "Failed to convert filter to Iceberg expression, skipping push down for this expression: {}. {}", + filter, + e.getMessage()); } if (expr != null) { @@ -119,8 +124,10 @@ public Filter[] pushFilters(Filter[] filters) { pushed.add(filter); } catch (ValidationException e) { // binding to the table schema failed, so this expression cannot be pushed down - LOG.info("Failed to bind expression to table schema, skipping push down for this expression: {}. {}", - filter, e.getMessage()); + LOG.info( + "Failed to bind expression to table schema, skipping push down for this expression: {}. 
{}", + filter, + e.getMessage()); } } } @@ -140,12 +147,16 @@ public Filter[] pushedFilters() { @Override public void pruneColumns(StructType requestedSchema) { - StructType requestedProjection = new StructType(Stream.of(requestedSchema.fields()) - .filter(field -> MetadataColumns.nonMetadataColumn(field.name())) - .toArray(StructField[]::new)); + StructType requestedProjection = + new StructType( + Stream.of(requestedSchema.fields()) + .filter(field -> MetadataColumns.nonMetadataColumn(field.name())) + .toArray(StructField[]::new)); - // the projection should include all columns that will be returned, including those only used in filters - this.schema = SparkSchemaUtil.prune(schema, requestedProjection, filterExpression(), caseSensitive); + // the projection should include all columns that will be returned, including those only used in + // filters + this.schema = + SparkSchemaUtil.prune(schema, requestedProjection, filterExpression(), caseSensitive); Stream.of(requestedSchema.fields()) .map(StructField::name) @@ -156,10 +167,11 @@ public void pruneColumns(StructType requestedSchema) { private Schema schemaWithMetadataColumns() { // metadata columns - List fields = metaColumns.stream() - .distinct() - .map(name -> MetadataColumns.metadataColumn(table, name)) - .collect(Collectors.toList()); + List fields = + metaColumns.stream() + .distinct() + .map(name -> MetadataColumns.metadataColumn(table, name)) + .collect(Collectors.toList()); Schema meta = new Schema(fields); // schema or rows returned by readers @@ -171,30 +183,39 @@ public Scan build() { Long snapshotId = readConf.snapshotId(); Long asOfTimestamp = readConf.asOfTimestamp(); - Preconditions.checkArgument(snapshotId == null || asOfTimestamp == null, + Preconditions.checkArgument( + snapshotId == null || asOfTimestamp == null, "Cannot set both %s and %s to select which table snapshot to scan", - SparkReadOptions.SNAPSHOT_ID, SparkReadOptions.AS_OF_TIMESTAMP); + SparkReadOptions.SNAPSHOT_ID, + SparkReadOptions.AS_OF_TIMESTAMP); Long startSnapshotId = readConf.startSnapshotId(); Long endSnapshotId = readConf.endSnapshotId(); if (snapshotId != null || asOfTimestamp != null) { - Preconditions.checkArgument(startSnapshotId == null && endSnapshotId == null, + Preconditions.checkArgument( + startSnapshotId == null && endSnapshotId == null, "Cannot set %s and %s for incremental scans when either %s or %s is set", - SparkReadOptions.START_SNAPSHOT_ID, SparkReadOptions.END_SNAPSHOT_ID, - SparkReadOptions.SNAPSHOT_ID, SparkReadOptions.AS_OF_TIMESTAMP); + SparkReadOptions.START_SNAPSHOT_ID, + SparkReadOptions.END_SNAPSHOT_ID, + SparkReadOptions.SNAPSHOT_ID, + SparkReadOptions.AS_OF_TIMESTAMP); } - Preconditions.checkArgument(startSnapshotId != null || endSnapshotId == null, + Preconditions.checkArgument( + startSnapshotId != null || endSnapshotId == null, "Cannot set only %s for incremental scans. 
Please, set %s too.", - SparkReadOptions.END_SNAPSHOT_ID, SparkReadOptions.START_SNAPSHOT_ID); + SparkReadOptions.END_SNAPSHOT_ID, + SparkReadOptions.START_SNAPSHOT_ID); Schema expectedSchema = schemaWithMetadataColumns(); - TableScan scan = table.newScan() - .caseSensitive(caseSensitive) - .filter(filterExpression()) - .project(expectedSchema); + TableScan scan = + table + .newScan() + .caseSensitive(caseSensitive) + .filter(filterExpression()) + .project(expectedSchema); if (snapshotId != null) { scan = scan.useSnapshot(snapshotId); @@ -218,61 +239,71 @@ public Scan build() { } public Scan buildMergeOnReadScan() { - Preconditions.checkArgument(readConf.snapshotId() == null && readConf.asOfTimestamp() == null, + Preconditions.checkArgument( + readConf.snapshotId() == null && readConf.asOfTimestamp() == null, "Cannot set time travel options %s and %s for row-level command scans", - SparkReadOptions.SNAPSHOT_ID, SparkReadOptions.AS_OF_TIMESTAMP); + SparkReadOptions.SNAPSHOT_ID, + SparkReadOptions.AS_OF_TIMESTAMP); - Preconditions.checkArgument(readConf.startSnapshotId() == null && readConf.endSnapshotId() == null, + Preconditions.checkArgument( + readConf.startSnapshotId() == null && readConf.endSnapshotId() == null, "Cannot set incremental scan options %s and %s for row-level command scans", - SparkReadOptions.START_SNAPSHOT_ID, SparkReadOptions.END_SNAPSHOT_ID); + SparkReadOptions.START_SNAPSHOT_ID, + SparkReadOptions.END_SNAPSHOT_ID); Snapshot snapshot = table.currentSnapshot(); if (snapshot == null) { - return new SparkBatchQueryScan(spark, table, null, readConf, schemaWithMetadataColumns(), filterExpressions); + return new SparkBatchQueryScan( + spark, table, null, readConf, schemaWithMetadataColumns(), filterExpressions); } // remember the current snapshot ID for commit validation long snapshotId = snapshot.snapshotId(); - CaseInsensitiveStringMap adjustedOptions = Spark3Util.setOption( - SparkReadOptions.SNAPSHOT_ID, - Long.toString(snapshotId), - options); + CaseInsensitiveStringMap adjustedOptions = + Spark3Util.setOption(SparkReadOptions.SNAPSHOT_ID, Long.toString(snapshotId), options); SparkReadConf adjustedReadConf = new SparkReadConf(spark, table, adjustedOptions); Schema expectedSchema = schemaWithMetadataColumns(); - TableScan scan = table.newScan() - .useSnapshot(snapshotId) - .caseSensitive(caseSensitive) - .filter(filterExpression()) - .project(expectedSchema); + TableScan scan = + table + .newScan() + .useSnapshot(snapshotId) + .caseSensitive(caseSensitive) + .filter(filterExpression()) + .project(expectedSchema); scan = configureSplitPlanning(scan); - return new SparkBatchQueryScan(spark, table, scan, adjustedReadConf, expectedSchema, filterExpressions); + return new SparkBatchQueryScan( + spark, table, scan, adjustedReadConf, expectedSchema, filterExpressions); } public Scan buildCopyOnWriteScan() { Snapshot snapshot = table.currentSnapshot(); if (snapshot == null) { - return new SparkCopyOnWriteScan(spark, table, readConf, schemaWithMetadataColumns(), filterExpressions); + return new SparkCopyOnWriteScan( + spark, table, readConf, schemaWithMetadataColumns(), filterExpressions); } Schema expectedSchema = schemaWithMetadataColumns(); - TableScan scan = table.newScan() - .useSnapshot(snapshot.snapshotId()) - .ignoreResiduals() - .caseSensitive(caseSensitive) - .filter(filterExpression()) - .project(expectedSchema); + TableScan scan = + table + .newScan() + .useSnapshot(snapshot.snapshotId()) + .ignoreResiduals() + .caseSensitive(caseSensitive) + 
.filter(filterExpression()) + .project(expectedSchema); scan = configureSplitPlanning(scan); - return new SparkCopyOnWriteScan(spark, table, scan, snapshot, readConf, expectedSchema, filterExpressions); + return new SparkCopyOnWriteScan( + spark, table, scan, snapshot, readConf, expectedSchema, filterExpressions); } private TableScan configureSplitPlanning(TableScan scan) { @@ -285,12 +316,15 @@ private TableScan configureSplitPlanning(TableScan scan) { Integer splitLookback = readConf.splitLookbackOption(); if (splitLookback != null) { - configuredScan = configuredScan.option(TableProperties.SPLIT_LOOKBACK, String.valueOf(splitLookback)); + configuredScan = + configuredScan.option(TableProperties.SPLIT_LOOKBACK, String.valueOf(splitLookback)); } Long splitOpenFileCost = readConf.splitOpenFileCostOption(); if (splitOpenFileCost != null) { - configuredScan = configuredScan.option(TableProperties.SPLIT_OPEN_FILE_COST, String.valueOf(splitOpenFileCost)); + configuredScan = + configuredScan.option( + TableProperties.SPLIT_OPEN_FILE_COST, String.valueOf(splitOpenFileCost)); } return configuredScan; diff --git a/spark/v3.2/spark/src/main/java/org/apache/iceberg/spark/source/SparkTable.java b/spark/v3.2/spark/src/main/java/org/apache/iceberg/spark/source/SparkTable.java index a7f986491d4a..35da565af691 100644 --- a/spark/v3.2/spark/src/main/java/org/apache/iceberg/spark/source/SparkTable.java +++ b/spark/v3.2/spark/src/main/java/org/apache/iceberg/spark/source/SparkTable.java @@ -16,9 +16,11 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.source; +import static org.apache.iceberg.TableProperties.CURRENT_SNAPSHOT_ID; +import static org.apache.iceberg.TableProperties.FORMAT_VERSION; + import java.io.IOException; import java.util.Map; import java.util.Set; @@ -72,24 +74,33 @@ import org.slf4j.Logger; import org.slf4j.LoggerFactory; -import static org.apache.iceberg.TableProperties.CURRENT_SNAPSHOT_ID; -import static org.apache.iceberg.TableProperties.FORMAT_VERSION; - -public class SparkTable implements org.apache.spark.sql.connector.catalog.Table, - SupportsRead, SupportsWrite, SupportsDelete, SupportsRowLevelOperations, SupportsMetadataColumns { +public class SparkTable + implements org.apache.spark.sql.connector.catalog.Table, + SupportsRead, + SupportsWrite, + SupportsDelete, + SupportsRowLevelOperations, + SupportsMetadataColumns { private static final Logger LOG = LoggerFactory.getLogger(SparkTable.class); private static final Set RESERVED_PROPERTIES = - ImmutableSet.of("provider", "format", CURRENT_SNAPSHOT_ID, "location", FORMAT_VERSION, "sort-order", + ImmutableSet.of( + "provider", + "format", + CURRENT_SNAPSHOT_ID, + "location", + FORMAT_VERSION, + "sort-order", "identifier-fields"); - private static final Set CAPABILITIES = ImmutableSet.of( - TableCapability.BATCH_READ, - TableCapability.BATCH_WRITE, - TableCapability.MICRO_BATCH_READ, - TableCapability.STREAMING_WRITE, - TableCapability.OVERWRITE_BY_FILTER, - TableCapability.OVERWRITE_DYNAMIC); + private static final Set CAPABILITIES = + ImmutableSet.of( + TableCapability.BATCH_READ, + TableCapability.BATCH_WRITE, + TableCapability.MICRO_BATCH_READ, + TableCapability.STREAMING_WRITE, + TableCapability.OVERWRITE_BY_FILTER, + TableCapability.OVERWRITE_DYNAMIC); private static final Set CAPABILITIES_WITH_ACCEPT_ANY_SCHEMA = ImmutableSet.builder() .addAll(CAPABILITIES) @@ -112,8 +123,11 @@ public SparkTable(Table icebergTable, Long snapshotId, boolean refreshEagerly) { 
this.snapshotId = snapshotId; this.refreshEagerly = refreshEagerly; - boolean acceptAnySchema = PropertyUtil.propertyAsBoolean(icebergTable.properties(), - TableProperties.SPARK_WRITE_ACCEPT_ANY_SCHEMA, TableProperties.SPARK_WRITE_ACCEPT_ANY_SCHEMA_DEFAULT); + boolean acceptAnySchema = + PropertyUtil.propertyAsBoolean( + icebergTable.properties(), + TableProperties.SPARK_WRITE_ACCEPT_ANY_SCHEMA, + TableProperties.SPARK_WRITE_ACCEPT_ANY_SCHEMA_DEFAULT); this.capabilities = acceptAnySchema ? CAPABILITIES_WITH_ACCEPT_ANY_SCHEMA : CAPABILITIES; } @@ -156,12 +170,17 @@ public Transform[] partitioning() { public Map properties() { ImmutableMap.Builder propsBuilder = ImmutableMap.builder(); - String fileFormat = icebergTable.properties() - .getOrDefault(TableProperties.DEFAULT_FILE_FORMAT, TableProperties.DEFAULT_FILE_FORMAT_DEFAULT); + String fileFormat = + icebergTable + .properties() + .getOrDefault( + TableProperties.DEFAULT_FILE_FORMAT, TableProperties.DEFAULT_FILE_FORMAT_DEFAULT); propsBuilder.put("format", "iceberg/" + fileFormat); propsBuilder.put("provider", "iceberg"); - String currentSnapshotId = icebergTable.currentSnapshot() != null ? - String.valueOf(icebergTable.currentSnapshot().snapshotId()) : "none"; + String currentSnapshotId = + icebergTable.currentSnapshot() != null + ? String.valueOf(icebergTable.currentSnapshot().snapshotId()) + : "none"; propsBuilder.put(CURRENT_SNAPSHOT_ID, currentSnapshotId); propsBuilder.put("location", icebergTable.location()); @@ -195,11 +214,11 @@ public Set capabilities() { public MetadataColumn[] metadataColumns() { DataType sparkPartitionType = SparkSchemaUtil.convert(Partitioning.partitionType(table())); return new MetadataColumn[] { - new SparkMetadataColumn(MetadataColumns.SPEC_ID.name(), DataTypes.IntegerType, false), - new SparkMetadataColumn(MetadataColumns.PARTITION_COLUMN_NAME, sparkPartitionType, true), - new SparkMetadataColumn(MetadataColumns.FILE_PATH.name(), DataTypes.StringType, false), - new SparkMetadataColumn(MetadataColumns.ROW_POSITION.name(), DataTypes.LongType, false), - new SparkMetadataColumn(MetadataColumns.IS_DELETED.name(), DataTypes.BooleanType, false) + new SparkMetadataColumn(MetadataColumns.SPEC_ID.name(), DataTypes.IntegerType, false), + new SparkMetadataColumn(MetadataColumns.PARTITION_COLUMN_NAME, sparkPartitionType, true), + new SparkMetadataColumn(MetadataColumns.FILE_PATH.name(), DataTypes.StringType, false), + new SparkMetadataColumn(MetadataColumns.ROW_POSITION.name(), DataTypes.LongType, false), + new SparkMetadataColumn(MetadataColumns.IS_DELETED.name(), DataTypes.BooleanType, false) }; } @@ -221,8 +240,7 @@ public ScanBuilder newScanBuilder(CaseInsensitiveStringMap options) { @Override public WriteBuilder newWriteBuilder(LogicalWriteInfo info) { Preconditions.checkArgument( - snapshotId == null, - "Cannot write to table at a specific snapshot: %s", snapshotId); + snapshotId == null, "Cannot write to table at a specific snapshot: %s", snapshotId); return new SparkWriteBuilder(sparkSession(), icebergTable, info); } @@ -235,8 +253,7 @@ public RowLevelOperationBuilder newRowLevelOperationBuilder(RowLevelOperationInf @Override public boolean canDeleteWhere(Filter[] filters) { Preconditions.checkArgument( - snapshotId == null, - "Cannot delete from table at a specific snapshot: %s", snapshotId); + snapshotId == null, "Cannot delete from table at a specific snapshot: %s", snapshotId); Expression deleteExpr = Expressions.alwaysTrue(); @@ -254,25 +271,34 @@ public boolean canDeleteWhere(Filter[] filters) { // a 
metadata delete is possible iff matching files can be deleted entirely private boolean canDeleteUsingMetadata(Expression deleteExpr) { - boolean caseSensitive = Boolean.parseBoolean(sparkSession().conf().get("spark.sql.caseSensitive")); - TableScan scan = table().newScan() - .filter(deleteExpr) - .caseSensitive(caseSensitive) - .includeColumnStats() - .ignoreResiduals(); + boolean caseSensitive = + Boolean.parseBoolean(sparkSession().conf().get("spark.sql.caseSensitive")); + TableScan scan = + table() + .newScan() + .filter(deleteExpr) + .caseSensitive(caseSensitive) + .includeColumnStats() + .ignoreResiduals(); try (CloseableIterable tasks = scan.planFiles()) { Map evaluators = Maps.newHashMap(); - StrictMetricsEvaluator metricsEvaluator = new StrictMetricsEvaluator(table().schema(), deleteExpr); - - return Iterables.all(tasks, task -> { - DataFile file = task.file(); - PartitionSpec spec = task.spec(); - Evaluator evaluator = evaluators.computeIfAbsent( - spec.specId(), - specId -> new Evaluator(spec.partitionType(), Projections.strict(spec).project(deleteExpr))); - return evaluator.eval(file.partition()) || metricsEvaluator.eval(file); - }); + StrictMetricsEvaluator metricsEvaluator = + new StrictMetricsEvaluator(table().schema(), deleteExpr); + + return Iterables.all( + tasks, + task -> { + DataFile file = task.file(); + PartitionSpec spec = task.spec(); + Evaluator evaluator = + evaluators.computeIfAbsent( + spec.specId(), + specId -> + new Evaluator( + spec.partitionType(), Projections.strict(spec).project(deleteExpr))); + return evaluator.eval(file.partition()) || metricsEvaluator.eval(file); + }); } catch (IOException ioe) { LOG.warn("Failed to close task iterable", ioe); @@ -289,7 +315,8 @@ public void deleteWhere(Filter[] filters) { return; } - icebergTable.newDelete() + icebergTable + .newDelete() .set("spark.app.id", sparkSession().sparkContext().applicationId()) .deleteFromRowFilter(deleteExpr) .commit(); @@ -319,12 +346,15 @@ public int hashCode() { return icebergTable.name().hashCode(); } - private static CaseInsensitiveStringMap addSnapshotId(CaseInsensitiveStringMap options, Long snapshotId) { + private static CaseInsensitiveStringMap addSnapshotId( + CaseInsensitiveStringMap options, Long snapshotId) { if (snapshotId != null) { String snapshotIdFromOptions = options.get(SparkReadOptions.SNAPSHOT_ID); String value = snapshotId.toString(); - Preconditions.checkArgument(snapshotIdFromOptions == null || snapshotIdFromOptions.equals(value), - "Cannot override snapshot ID more than once: %s", snapshotIdFromOptions); + Preconditions.checkArgument( + snapshotIdFromOptions == null || snapshotIdFromOptions.equals(value), + "Cannot override snapshot ID more than once: %s", + snapshotIdFromOptions); Map scanOptions = Maps.newHashMap(); scanOptions.putAll(options.asCaseSensitiveMap()); diff --git a/spark/v3.2/spark/src/main/java/org/apache/iceberg/spark/source/SparkWrite.java b/spark/v3.2/spark/src/main/java/org/apache/iceberg/spark/source/SparkWrite.java index b6527ffa700f..52e43d3484a6 100644 --- a/spark/v3.2/spark/src/main/java/org/apache/iceberg/spark/source/SparkWrite.java +++ b/spark/v3.2/spark/src/main/java/org/apache/iceberg/spark/source/SparkWrite.java @@ -16,9 +16,19 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.spark.source; +import static org.apache.iceberg.IsolationLevel.SERIALIZABLE; +import static org.apache.iceberg.IsolationLevel.SNAPSHOT; +import static org.apache.iceberg.TableProperties.COMMIT_MAX_RETRY_WAIT_MS; +import static org.apache.iceberg.TableProperties.COMMIT_MAX_RETRY_WAIT_MS_DEFAULT; +import static org.apache.iceberg.TableProperties.COMMIT_MIN_RETRY_WAIT_MS; +import static org.apache.iceberg.TableProperties.COMMIT_MIN_RETRY_WAIT_MS_DEFAULT; +import static org.apache.iceberg.TableProperties.COMMIT_NUM_RETRIES; +import static org.apache.iceberg.TableProperties.COMMIT_NUM_RETRIES_DEFAULT; +import static org.apache.iceberg.TableProperties.COMMIT_TOTAL_RETRY_TIME_MS; +import static org.apache.iceberg.TableProperties.COMMIT_TOTAL_RETRY_TIME_MS_DEFAULT; + import java.io.IOException; import java.util.Arrays; import java.util.Collections; @@ -84,17 +94,6 @@ import org.slf4j.Logger; import org.slf4j.LoggerFactory; -import static org.apache.iceberg.IsolationLevel.SERIALIZABLE; -import static org.apache.iceberg.IsolationLevel.SNAPSHOT; -import static org.apache.iceberg.TableProperties.COMMIT_MAX_RETRY_WAIT_MS; -import static org.apache.iceberg.TableProperties.COMMIT_MAX_RETRY_WAIT_MS_DEFAULT; -import static org.apache.iceberg.TableProperties.COMMIT_MIN_RETRY_WAIT_MS; -import static org.apache.iceberg.TableProperties.COMMIT_MIN_RETRY_WAIT_MS_DEFAULT; -import static org.apache.iceberg.TableProperties.COMMIT_NUM_RETRIES; -import static org.apache.iceberg.TableProperties.COMMIT_NUM_RETRIES_DEFAULT; -import static org.apache.iceberg.TableProperties.COMMIT_TOTAL_RETRY_TIME_MS; -import static org.apache.iceberg.TableProperties.COMMIT_TOTAL_RETRY_TIME_MS_DEFAULT; - abstract class SparkWrite implements Write, RequiresDistributionAndOrdering { private static final Logger LOG = LoggerFactory.getLogger(SparkWrite.class); @@ -116,10 +115,16 @@ abstract class SparkWrite implements Write, RequiresDistributionAndOrdering { private boolean cleanupOnAbort = true; - SparkWrite(SparkSession spark, Table table, SparkWriteConf writeConf, - LogicalWriteInfo writeInfo, String applicationId, - Schema writeSchema, StructType dsSchema, - Distribution requiredDistribution, SortOrder[] requiredOrdering) { + SparkWrite( + SparkSession spark, + Table table, + SparkWriteConf writeConf, + LogicalWriteInfo writeInfo, + String applicationId, + Schema writeSchema, + StructType dsSchema, + Distribution requiredDistribution, + SortOrder[] requiredOrdering) { this.sparkContext = JavaSparkContext.fromSparkContext(spark.sparkContext()); this.table = table; this.writeConf = writeConf; @@ -178,8 +183,10 @@ StreamingWrite asStreamingOverwrite() { // the writer factory works for both batch and streaming private WriterFactory createWriterFactory() { // broadcast the table metadata as the writer factory will be sent to executors - Broadcast
<Table> tableBroadcast = sparkContext.broadcast(SerializableTableWithSize.copyOf(table)); - return new WriterFactory(tableBroadcast, format, targetFileSize, writeSchema, dsSchema, partitionedFanoutEnabled); + Broadcast<Table>
    tableBroadcast = + sparkContext.broadcast(SerializableTableWithSize.copyOf(table)); + return new WriterFactory( + tableBroadcast, format, targetFileSize, writeSchema, dsSchema, partitionedFanoutEnabled); } private void commitOperation(SnapshotUpdate operation, String description) { @@ -221,24 +228,33 @@ private void abort(WriterCommitMessage[] messages) { .executeWith(ThreadPools.getWorkerPool()) .retry(PropertyUtil.propertyAsInt(props, COMMIT_NUM_RETRIES, COMMIT_NUM_RETRIES_DEFAULT)) .exponentialBackoff( - PropertyUtil.propertyAsInt(props, COMMIT_MIN_RETRY_WAIT_MS, COMMIT_MIN_RETRY_WAIT_MS_DEFAULT), - PropertyUtil.propertyAsInt(props, COMMIT_MAX_RETRY_WAIT_MS, COMMIT_MAX_RETRY_WAIT_MS_DEFAULT), - PropertyUtil.propertyAsInt(props, COMMIT_TOTAL_RETRY_TIME_MS, COMMIT_TOTAL_RETRY_TIME_MS_DEFAULT), + PropertyUtil.propertyAsInt( + props, COMMIT_MIN_RETRY_WAIT_MS, COMMIT_MIN_RETRY_WAIT_MS_DEFAULT), + PropertyUtil.propertyAsInt( + props, COMMIT_MAX_RETRY_WAIT_MS, COMMIT_MAX_RETRY_WAIT_MS_DEFAULT), + PropertyUtil.propertyAsInt( + props, COMMIT_TOTAL_RETRY_TIME_MS, COMMIT_TOTAL_RETRY_TIME_MS_DEFAULT), 2.0 /* exponential */) .throwFailureWhenFinished() - .run(file -> { - table.io().deleteFile(file.path().toString()); - }); + .run( + file -> { + table.io().deleteFile(file.path().toString()); + }); } else { - LOG.warn("Skipping cleaning up of data files because Iceberg was unable to determine the final commit state"); + LOG.warn( + "Skipping cleaning up of data files because Iceberg was unable to determine the final commit state"); } } private Iterable files(WriterCommitMessage[] messages) { if (messages.length > 0) { - return Iterables.concat(Iterables.transform(Arrays.asList(messages), message -> message != null ? - ImmutableList.copyOf(((TaskCommit) message).files()) : - ImmutableList.of())); + return Iterables.concat( + Iterables.transform( + Arrays.asList(messages), + message -> + message != null + ? 
ImmutableList.copyOf(((TaskCommit) message).files()) + : ImmutableList.of())); } return ImmutableList.of(); } @@ -312,7 +328,9 @@ public void commit(WriterCommitMessage[] messages) { dynamicOverwrite.addFile(file); } - commitOperation(dynamicOverwrite, String.format("dynamic partition overwrite with %d new data files", numFiles)); + commitOperation( + dynamicOverwrite, + String.format("dynamic partition overwrite with %d new data files", numFiles)); } } @@ -349,7 +367,8 @@ public void commit(WriterCommitMessage[] messages) { overwriteFiles.validateNoConflictingDeletes(); } - String commitMsg = String.format("overwrite by filter %s with %d new data files", overwriteExpr, numFiles); + String commitMsg = + String.format("overwrite by filter %s with %d new data files", overwriteExpr, numFiles); commitOperation(overwriteFiles, commitMsg); } } @@ -405,9 +424,8 @@ public void commit(WriterCommitMessage[] messages) { } } - private void commitWithSerializableIsolation(OverwriteFiles overwriteFiles, - int numOverwrittenFiles, - int numAddedFiles) { + private void commitWithSerializableIsolation( + OverwriteFiles overwriteFiles, int numOverwrittenFiles, int numAddedFiles) { Long scanSnapshotId = scan.snapshotId(); if (scanSnapshotId != null) { overwriteFiles.validateFromSnapshot(scanSnapshotId); @@ -418,15 +436,15 @@ private void commitWithSerializableIsolation(OverwriteFiles overwriteFiles, overwriteFiles.validateNoConflictingData(); overwriteFiles.validateNoConflictingDeletes(); - String commitMsg = String.format( - "overwrite of %d data files with %d new data files, scanSnapshotId: %d, conflictDetectionFilter: %s", - numOverwrittenFiles, numAddedFiles, scanSnapshotId, conflictDetectionFilter); + String commitMsg = + String.format( + "overwrite of %d data files with %d new data files, scanSnapshotId: %d, conflictDetectionFilter: %s", + numOverwrittenFiles, numAddedFiles, scanSnapshotId, conflictDetectionFilter); commitOperation(overwriteFiles, commitMsg); } - private void commitWithSnapshotIsolation(OverwriteFiles overwriteFiles, - int numOverwrittenFiles, - int numAddedFiles) { + private void commitWithSnapshotIsolation( + OverwriteFiles overwriteFiles, int numOverwrittenFiles, int numAddedFiles) { Long scanSnapshotId = scan.snapshotId(); if (scanSnapshotId != null) { overwriteFiles.validateFromSnapshot(scanSnapshotId); @@ -436,9 +454,10 @@ private void commitWithSnapshotIsolation(OverwriteFiles overwriteFiles, overwriteFiles.conflictDetectionFilter(conflictDetectionFilter); overwriteFiles.validateNoConflictingDeletes(); - String commitMsg = String.format( - "overwrite of %d data files with %d new data files", - numOverwrittenFiles, numAddedFiles); + String commitMsg = + String.format( + "overwrite of %d data files with %d new data files", + numOverwrittenFiles, numAddedFiles); commitOperation(overwriteFiles, commitMsg); } } @@ -557,7 +576,10 @@ public void doCommit(long epochId, WriterCommitMessage[] messages) { overwriteFiles.addFile(file); numFiles++; } - commit(overwriteFiles, epochId, String.format("streaming complete overwrite with %d new data files", numFiles)); + commit( + overwriteFiles, + epochId, + String.format("streaming complete overwrite with %d new data files", numFiles)); } } @@ -599,8 +621,13 @@ private static class WriterFactory implements DataWriterFactory, StreamingDataWr private final StructType dsSchema; private final boolean partitionedFanoutEnabled; - protected WriterFactory(Broadcast
<Table> tableBroadcast, FileFormat format, long targetFileSize, - Schema writeSchema, StructType dsSchema, boolean partitionedFanoutEnabled) { + protected WriterFactory( + Broadcast<Table>
    tableBroadcast, + FileFormat format, + long targetFileSize, + Schema writeSchema, + StructType dsSchema, + boolean partitionedFanoutEnabled) { this.tableBroadcast = tableBroadcast; this.format = format; this.targetFileSize = targetFileSize; @@ -620,21 +647,28 @@ public DataWriter createWriter(int partitionId, long taskId, long e PartitionSpec spec = table.spec(); FileIO io = table.io(); - OutputFileFactory fileFactory = OutputFileFactory.builderFor(table, partitionId, taskId) - .format(format) - .build(); - SparkFileWriterFactory writerFactory = SparkFileWriterFactory.builderFor(table) - .dataFileFormat(format) - .dataSchema(writeSchema) - .dataSparkType(dsSchema) - .build(); + OutputFileFactory fileFactory = + OutputFileFactory.builderFor(table, partitionId, taskId).format(format).build(); + SparkFileWriterFactory writerFactory = + SparkFileWriterFactory.builderFor(table) + .dataFileFormat(format) + .dataSchema(writeSchema) + .dataSparkType(dsSchema) + .build(); if (spec.isUnpartitioned()) { return new UnpartitionedDataWriter(writerFactory, fileFactory, io, spec, targetFileSize); } else { return new PartitionedDataWriter( - writerFactory, fileFactory, io, spec, writeSchema, dsSchema, targetFileSize, partitionedFanoutEnabled); + writerFactory, + fileFactory, + io, + spec, + writeSchema, + dsSchema, + targetFileSize, + partitionedFanoutEnabled); } } } @@ -651,9 +685,14 @@ private static class UnpartitionedDataWriter implements DataWriter private final FileWriter delegate; private final FileIO io; - private UnpartitionedDataWriter(SparkFileWriterFactory writerFactory, OutputFileFactory fileFactory, - FileIO io, PartitionSpec spec, long targetFileSize) { - this.delegate = new RollingDataWriter<>(writerFactory, fileFactory, io, targetFileSize, spec, null); + private UnpartitionedDataWriter( + SparkFileWriterFactory writerFactory, + OutputFileFactory fileFactory, + FileIO io, + PartitionSpec spec, + long targetFileSize) { + this.delegate = + new RollingDataWriter<>(writerFactory, fileFactory, io, targetFileSize, spec, null); this.io = io; } @@ -667,7 +706,7 @@ public WriterCommitMessage commit() throws IOException { close(); DataWriteResult result = delegate.result(); - TaskCommit taskCommit = new TaskCommit(result.dataFiles().toArray(new DataFile[0])); + TaskCommit taskCommit = new TaskCommit(result.dataFiles().toArray(new DataFile[0])); taskCommit.reportOutputMetrics(); return taskCommit; } @@ -693,9 +732,15 @@ private static class PartitionedDataWriter implements DataWriter { private final PartitionKey partitionKey; private final InternalRowWrapper internalRowWrapper; - private PartitionedDataWriter(SparkFileWriterFactory writerFactory, OutputFileFactory fileFactory, - FileIO io, PartitionSpec spec, Schema dataSchema, - StructType dataSparkType, long targetFileSize, boolean fanoutEnabled) { + private PartitionedDataWriter( + SparkFileWriterFactory writerFactory, + OutputFileFactory fileFactory, + FileIO io, + PartitionSpec spec, + Schema dataSchema, + StructType dataSparkType, + long targetFileSize, + boolean fanoutEnabled) { if (fanoutEnabled) { this.delegate = new FanoutDataWriter<>(writerFactory, fileFactory, io, targetFileSize); } else { @@ -718,7 +763,7 @@ public WriterCommitMessage commit() throws IOException { close(); DataWriteResult result = delegate.result(); - TaskCommit taskCommit = new TaskCommit(result.dataFiles().toArray(new DataFile[0])); + TaskCommit taskCommit = new TaskCommit(result.dataFiles().toArray(new DataFile[0])); taskCommit.reportOutputMetrics(); return 
taskCommit; } diff --git a/spark/v3.2/spark/src/main/java/org/apache/iceberg/spark/source/SparkWriteBuilder.java b/spark/v3.2/spark/src/main/java/org/apache/iceberg/spark/source/SparkWriteBuilder.java index e3b9c3a81344..df07c0c37eb3 100644 --- a/spark/v3.2/spark/src/main/java/org/apache/iceberg/spark/source/SparkWriteBuilder.java +++ b/spark/v3.2/spark/src/main/java/org/apache/iceberg/spark/source/SparkWriteBuilder.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.source; import org.apache.iceberg.DistributionMode; @@ -87,10 +86,13 @@ class SparkWriteBuilder implements WriteBuilder, SupportsDynamicOverwrite, Suppo } public WriteBuilder overwriteFiles(Scan scan, Command command, IsolationLevel isolationLevel) { - Preconditions.checkArgument(scan instanceof SparkCopyOnWriteScan, "%s is not SparkCopyOnWriteScan", scan); + Preconditions.checkArgument( + scan instanceof SparkCopyOnWriteScan, "%s is not SparkCopyOnWriteScan", scan); Preconditions.checkState(!overwriteByFilter, "Cannot overwrite individual files and by filter"); - Preconditions.checkState(!overwriteDynamic, "Cannot overwrite individual files and dynamically"); - Preconditions.checkState(rewrittenFileSetId == null, "Cannot overwrite individual files and rewrite"); + Preconditions.checkState( + !overwriteDynamic, "Cannot overwrite individual files and dynamically"); + Preconditions.checkState( + rewrittenFileSetId == null, "Cannot overwrite individual files and rewrite"); this.overwriteFiles = true; this.copyOnWriteScan = (SparkCopyOnWriteScan) scan; @@ -101,9 +103,11 @@ public WriteBuilder overwriteFiles(Scan scan, Command command, IsolationLevel is @Override public WriteBuilder overwriteDynamicPartitions() { - Preconditions.checkState(!overwriteByFilter, "Cannot overwrite dynamically and by filter: %s", overwriteExpr); + Preconditions.checkState( + !overwriteByFilter, "Cannot overwrite dynamically and by filter: %s", overwriteExpr); Preconditions.checkState(!overwriteFiles, "Cannot overwrite individual files and dynamically"); - Preconditions.checkState(rewrittenFileSetId == null, "Cannot overwrite dynamically and rewrite"); + Preconditions.checkState( + rewrittenFileSetId == null, "Cannot overwrite dynamically and rewrite"); this.overwriteDynamic = true; return this; @@ -111,7 +115,8 @@ public WriteBuilder overwriteDynamicPartitions() { @Override public WriteBuilder overwrite(Filter[] filters) { - Preconditions.checkState(!overwriteFiles, "Cannot overwrite individual files and using filters"); + Preconditions.checkState( + !overwriteFiles, "Cannot overwrite individual files and using filters"); Preconditions.checkState(rewrittenFileSetId == null, "Cannot overwrite and rewrite"); this.overwriteExpr = SparkFilters.convert(filters); @@ -119,7 +124,8 @@ public WriteBuilder overwrite(Filter[] filters) { // use the write option to override truncating the table. use dynamic overwrite instead. 
this.overwriteDynamic = true; } else { - Preconditions.checkState(!overwriteDynamic, "Cannot overwrite dynamically and by filter: %s", overwriteExpr); + Preconditions.checkState( + !overwriteDynamic, "Cannot overwrite dynamically and by filter: %s", overwriteExpr); this.overwriteByFilter = true; } return this; @@ -128,7 +134,8 @@ public WriteBuilder overwrite(Filter[] filters) { @Override public Write build() { // Validate - Preconditions.checkArgument(handleTimestampWithoutZone || !SparkUtil.hasTimestampWithoutZone(table.schema()), + Preconditions.checkArgument( + handleTimestampWithoutZone || !SparkUtil.hasTimestampWithoutZone(table.schema()), SparkUtil.TIMESTAMP_WITHOUT_TIMEZONE_ERROR); Schema writeSchema = validateOrMergeWriteSchema(table, dsSchema, writeConf); @@ -145,7 +152,8 @@ public Write build() { distribution = buildRequiredDistribution(); ordering = buildRequiredOrdering(distribution); } else { - LOG.warn("Skipping distribution/ordering: extensions are disabled and spec contains unsupported transforms"); + LOG.warn( + "Skipping distribution/ordering: extensions are disabled and spec contains unsupported transforms"); distribution = Distributions.unspecified(); ordering = NO_ORDERING; } @@ -155,7 +163,8 @@ public Write build() { ordering = NO_ORDERING; } - return new SparkWrite(spark, table, writeConf, writeInfo, appId, writeSchema, dsSchema, distribution, ordering) { + return new SparkWrite( + spark, table, writeConf, writeInfo, appId, writeSchema, dsSchema, distribution, ordering) { @Override public BatchWrite toBatch() { @@ -174,12 +183,14 @@ public BatchWrite toBatch() { @Override public StreamingWrite toStreaming() { - Preconditions.checkState(!overwriteDynamic, - "Unsupported streaming operation: dynamic partition overwrite"); - Preconditions.checkState(!overwriteByFilter || overwriteExpr == Expressions.alwaysTrue(), - "Unsupported streaming operation: overwrite by filter: %s", overwriteExpr); - Preconditions.checkState(rewrittenFileSetId == null, - "Unsupported streaming operation: rewrite"); + Preconditions.checkState( + !overwriteDynamic, "Unsupported streaming operation: dynamic partition overwrite"); + Preconditions.checkState( + !overwriteByFilter || overwriteExpr == Expressions.alwaysTrue(), + "Unsupported streaming operation: overwrite by filter: %s", + overwriteExpr); + Preconditions.checkState( + rewrittenFileSetId == null, "Unsupported streaming operation: rewrite"); if (overwriteByFilter) { return asStreamingOverwrite(); @@ -193,7 +204,8 @@ public StreamingWrite toStreaming() { private Distribution buildRequiredDistribution() { if (overwriteFiles) { DistributionMode distributionMode = copyOnWriteDistributionMode(); - return SparkDistributionAndOrderingUtil.buildCopyOnWriteDistribution(table, copyOnWriteCommand, distributionMode); + return SparkDistributionAndOrderingUtil.buildCopyOnWriteDistribution( + table, copyOnWriteCommand, distributionMode); } else { DistributionMode distributionMode = writeConf.distributionMode(); return SparkDistributionAndOrderingUtil.buildRequiredDistribution(table, distributionMode); @@ -215,7 +227,8 @@ private DistributionMode copyOnWriteDistributionMode() { private SortOrder[] buildRequiredOrdering(Distribution requiredDistribution) { if (overwriteFiles) { - return SparkDistributionAndOrderingUtil.buildCopyOnWriteOrdering(table, copyOnWriteCommand, requiredDistribution); + return SparkDistributionAndOrderingUtil.buildCopyOnWriteOrdering( + table, copyOnWriteCommand, requiredDistribution); } else { return 
SparkDistributionAndOrderingUtil.buildRequiredOrdering(table, requiredDistribution); } @@ -225,7 +238,8 @@ private boolean allIdentityTransforms(PartitionSpec spec) { return spec.fields().stream().allMatch(field -> field.transform().isIdentity()); } - private static Schema validateOrMergeWriteSchema(Table table, StructType dsSchema, SparkWriteConf writeConf) { + private static Schema validateOrMergeWriteSchema( + Table table, StructType dsSchema, SparkWriteConf writeConf) { Schema writeSchema; if (writeConf.mergeSchema()) { // convert the dataset schema and assign fresh ids for new fields diff --git a/spark/v3.2/spark/src/main/java/org/apache/iceberg/spark/source/StagedSparkTable.java b/spark/v3.2/spark/src/main/java/org/apache/iceberg/spark/source/StagedSparkTable.java index 2e018cb09496..b92c02d2b536 100644 --- a/spark/v3.2/spark/src/main/java/org/apache/iceberg/spark/source/StagedSparkTable.java +++ b/spark/v3.2/spark/src/main/java/org/apache/iceberg/spark/source/StagedSparkTable.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.source; import org.apache.iceberg.Transaction; diff --git a/spark/v3.2/spark/src/main/java/org/apache/iceberg/spark/source/Stats.java b/spark/v3.2/spark/src/main/java/org/apache/iceberg/spark/source/Stats.java index 939b07a0af61..ddf6ca834d9b 100644 --- a/spark/v3.2/spark/src/main/java/org/apache/iceberg/spark/source/Stats.java +++ b/spark/v3.2/spark/src/main/java/org/apache/iceberg/spark/source/Stats.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.source; import java.util.OptionalLong; diff --git a/spark/v3.2/spark/src/main/java/org/apache/iceberg/spark/source/StreamingOffset.java b/spark/v3.2/spark/src/main/java/org/apache/iceberg/spark/source/StreamingOffset.java index 64277ecf3be5..f2088deb1ee3 100644 --- a/spark/v3.2/spark/src/main/java/org/apache/iceberg/spark/source/StreamingOffset.java +++ b/spark/v3.2/spark/src/main/java/org/apache/iceberg/spark/source/StreamingOffset.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.source; import com.fasterxml.jackson.core.JsonGenerator; @@ -47,10 +46,10 @@ class StreamingOffset extends Offset { * An implementation of Spark Structured Streaming Offset, to track the current processed files of * Iceberg table. * - * @param snapshotId The current processed snapshot id. - * @param position The position of last scanned file in snapshot. - * @param scanAllFiles whether to scan all files in a snapshot; for example, to read - * all data when starting a stream. + * @param snapshotId The current processed snapshot id. + * @param position The position of last scanned file in snapshot. + * @param scanAllFiles whether to scan all files in a snapshot; for example, to read all data when + * starting a stream. 
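The StreamingOffset constructor documented above, together with the equals() and toString() hunks later in this same file, can be exercised as in the following sketch. It is not part of the patch; the example class lives in the same package only because StreamingOffset is package-private, and the literal values are illustrative.

package org.apache.iceberg.spark.source;

// Editorial sketch, not part of this patch: exercising the StreamingOffset API shown in this file.
class StreamingOffsetExample {
  public static void main(String[] args) {
    StreamingOffset committed = new StreamingOffset(1L, 10, false);
    StreamingOffset latest = new StreamingOffset(1L, 10, false);

    // equals() compares snapshotId, position and scanAllFiles
    System.out.println(committed.equals(latest)); // true

    // toString() uses the "Streaming Offset[...]" format defined in this file
    System.out.println(committed); // Streaming Offset[1: position (10) scan_all_files (false)]
  }
}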
*/ StreamingOffset(long snapshotId, long position, boolean scanAllFiles) { this.snapshotId = snapshotId; @@ -65,7 +64,8 @@ static StreamingOffset fromJson(String json) { JsonNode node = JsonUtil.mapper().readValue(json, JsonNode.class); return fromJsonNode(node); } catch (IOException e) { - throw new UncheckedIOException(String.format("Failed to parse StreamingOffset from JSON string %s", json), e); + throw new UncheckedIOException( + String.format("Failed to parse StreamingOffset from JSON string %s", json), e); } } @@ -118,9 +118,9 @@ boolean shouldScanAllFiles() { public boolean equals(Object obj) { if (obj instanceof StreamingOffset) { StreamingOffset offset = (StreamingOffset) obj; - return offset.snapshotId == snapshotId && - offset.position == position && - offset.scanAllFiles == scanAllFiles; + return offset.snapshotId == snapshotId + && offset.position == position + && offset.scanAllFiles == scanAllFiles; } else { return false; } @@ -133,17 +133,20 @@ public int hashCode() { @Override public String toString() { - return String.format("Streaming Offset[%d: position (%d) scan_all_files (%b)]", - snapshotId, position, scanAllFiles); + return String.format( + "Streaming Offset[%d: position (%d) scan_all_files (%b)]", + snapshotId, position, scanAllFiles); } private static StreamingOffset fromJsonNode(JsonNode node) { // The version of StreamingOffset. The offset was created with a version number // used to validate when deserializing from json string. int version = JsonUtil.getInt(VERSION, node); - Preconditions.checkArgument(version == CURR_VERSION, + Preconditions.checkArgument( + version == CURR_VERSION, "This version of Iceberg source only supports version %s. Version %s is not supported.", - CURR_VERSION, version); + CURR_VERSION, + version); long snapshotId = JsonUtil.getLong(SNAPSHOT_ID, node); int position = JsonUtil.getInt(POSITION, node); diff --git a/spark/v3.2/spark/src/main/java/org/apache/iceberg/spark/source/StructInternalRow.java b/spark/v3.2/spark/src/main/java/org/apache/iceberg/spark/source/StructInternalRow.java index a2288ef3edd7..3c7ebabeab3d 100644 --- a/spark/v3.2/spark/src/main/java/org/apache/iceberg/spark/source/StructInternalRow.java +++ b/spark/v3.2/spark/src/main/java/org/apache/iceberg/spark/source/StructInternalRow.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.source; import java.math.BigDecimal; @@ -128,7 +127,8 @@ public int getInt(int ordinal) { } else if (integer instanceof LocalDate) { return (int) ((LocalDate) integer).toEpochDay(); } else { - throw new IllegalStateException("Unknown type for int field. Type name: " + integer.getClass().getName()); + throw new IllegalStateException( + "Unknown type for int field. Type name: " + integer.getClass().getName()); } } @@ -143,7 +143,8 @@ public long getLong(int ordinal) { } else if (longVal instanceof LocalDate) { return ((LocalDate) longVal).toEpochDay(); } else { - throw new IllegalStateException("Unknown type for long field. Type name: " + longVal.getClass().getName()); + throw new IllegalStateException( + "Unknown type for long field. Type name: " + longVal.getClass().getName()); } } @@ -190,7 +191,8 @@ private byte[] getBinaryInternal(int ordinal) { } else if (bytes instanceof byte[]) { return (byte[]) bytes; } else { - throw new IllegalStateException("Unknown type for binary field. Type name: " + bytes.getClass().getName()); + throw new IllegalStateException( + "Unknown type for binary field. 
Type name: " + bytes.getClass().getName()); } } @@ -206,8 +208,7 @@ public InternalRow getStruct(int ordinal, int numFields) { private InternalRow getStructInternal(int ordinal, int numFields) { return new StructInternalRow( - type.fields().get(ordinal).type().asStructType(), - struct.get(ordinal, StructLike.class)); + type.fields().get(ordinal).type().asStructType(), struct.get(ordinal, StructLike.class)); } @Override @@ -227,7 +228,8 @@ public MapData getMap(int ordinal) { } private MapData getMapInternal(int ordinal) { - return mapToMapData(type.fields().get(ordinal).type().asMapType(), struct.get(ordinal, Map.class)); + return mapToMapData( + type.fields().get(ordinal).type().asMapType(), struct.get(ordinal, Map.class)); } @Override @@ -292,31 +294,52 @@ private ArrayData collectionToArrayData(Type elementType, Collection values) case DOUBLE: return fillArray(values, array -> (pos, value) -> array[pos] = value); case STRING: - return fillArray(values, array -> - (BiConsumer) (pos, seq) -> array[pos] = UTF8String.fromString(seq.toString())); + return fillArray( + values, + array -> + (BiConsumer) + (pos, seq) -> array[pos] = UTF8String.fromString(seq.toString())); case FIXED: case BINARY: - return fillArray(values, array -> - (BiConsumer) (pos, buf) -> array[pos] = ByteBuffers.toByteArray(buf)); + return fillArray( + values, + array -> + (BiConsumer) + (pos, buf) -> array[pos] = ByteBuffers.toByteArray(buf)); case DECIMAL: - return fillArray(values, array -> - (BiConsumer) (pos, dec) -> array[pos] = Decimal.apply(dec)); + return fillArray( + values, + array -> + (BiConsumer) (pos, dec) -> array[pos] = Decimal.apply(dec)); case STRUCT: - return fillArray(values, array -> (BiConsumer) (pos, tuple) -> - array[pos] = new StructInternalRow(elementType.asStructType(), tuple)); + return fillArray( + values, + array -> + (BiConsumer) + (pos, tuple) -> + array[pos] = new StructInternalRow(elementType.asStructType(), tuple)); case LIST: - return fillArray(values, array -> (BiConsumer>) (pos, list) -> - array[pos] = collectionToArrayData(elementType.asListType().elementType(), list)); + return fillArray( + values, + array -> + (BiConsumer>) + (pos, list) -> + array[pos] = + collectionToArrayData(elementType.asListType().elementType(), list)); case MAP: - return fillArray(values, array -> (BiConsumer>) (pos, map) -> - array[pos] = mapToMapData(elementType.asMapType(), map)); + return fillArray( + values, + array -> + (BiConsumer>) + (pos, map) -> array[pos] = mapToMapData(elementType.asMapType(), map)); default: throw new UnsupportedOperationException("Unsupported array element type: " + elementType); } } @SuppressWarnings("unchecked") - private GenericArrayData fillArray(Collection values, Function> makeSetter) { + private GenericArrayData fillArray( + Collection values, Function> makeSetter) { Object[] array = new Object[values.size()]; BiConsumer setter = makeSetter.apply(array); diff --git a/spark/v3.2/spark/src/main/java/org/apache/spark/sql/catalyst/analysis/NoSuchProcedureException.java b/spark/v3.2/spark/src/main/java/org/apache/spark/sql/catalyst/analysis/NoSuchProcedureException.java index 25f6368cc8c3..18cc64a20601 100644 --- a/spark/v3.2/spark/src/main/java/org/apache/spark/sql/catalyst/analysis/NoSuchProcedureException.java +++ b/spark/v3.2/spark/src/main/java/org/apache/spark/sql/catalyst/analysis/NoSuchProcedureException.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.spark.sql.catalyst.analysis; import org.apache.spark.sql.AnalysisException; @@ -25,7 +24,13 @@ public class NoSuchProcedureException extends AnalysisException { public NoSuchProcedureException(Identifier ident) { - super("Procedure " + ident + " not found", Option.empty(), Option.empty(), Option.empty(), - Option.empty(), Option.empty(), new String[0]); + super( + "Procedure " + ident + " not found", + Option.empty(), + Option.empty(), + Option.empty(), + Option.empty(), + Option.empty(), + new String[0]); } } diff --git a/spark/v3.2/spark/src/main/java/org/apache/spark/sql/connector/iceberg/catalog/Procedure.java b/spark/v3.2/spark/src/main/java/org/apache/spark/sql/connector/iceberg/catalog/Procedure.java index 8f7a70b9f9fc..11f215ba040a 100644 --- a/spark/v3.2/spark/src/main/java/org/apache/spark/sql/connector/iceberg/catalog/Procedure.java +++ b/spark/v3.2/spark/src/main/java/org/apache/spark/sql/connector/iceberg/catalog/Procedure.java @@ -16,44 +16,34 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.spark.sql.connector.iceberg.catalog; import org.apache.spark.sql.catalyst.InternalRow; import org.apache.spark.sql.types.StructType; -/** - * An interface representing a stored procedure available for execution. - */ +/** An interface representing a stored procedure available for execution. */ public interface Procedure { - /** - * Returns the input parameters of this procedure. - */ + /** Returns the input parameters of this procedure. */ ProcedureParameter[] parameters(); - /** - * Returns the type of rows produced by this procedure. - */ + /** Returns the type of rows produced by this procedure. */ StructType outputType(); /** * Executes this procedure. - *
<p>
    - * Spark will align the provided arguments according to the input parameters - * defined in {@link #parameters()} either by position or by name before execution. - *
<p>
    - * Implementations may provide a summary of execution by returning one or many rows - * as a result. The schema of output rows must match the defined output type - * in {@link #outputType()}. + * + *
<p>
    Spark will align the provided arguments according to the input parameters defined in {@link + * #parameters()} either by position or by name before execution. + * + *
<p>
    Implementations may provide a summary of execution by returning one or many rows as a + * result. The schema of output rows must match the defined output type in {@link #outputType()}. * * @param args input arguments * @return the result of executing this procedure with the given arguments */ InternalRow[] call(InternalRow args); - /** - * Returns the description of this procedure. - */ + /** Returns the description of this procedure. */ default String description() { return this.getClass().toString(); } diff --git a/spark/v3.2/spark/src/main/java/org/apache/spark/sql/connector/iceberg/catalog/ProcedureCatalog.java b/spark/v3.2/spark/src/main/java/org/apache/spark/sql/connector/iceberg/catalog/ProcedureCatalog.java index 314bd659460e..2cee97ee5938 100644 --- a/spark/v3.2/spark/src/main/java/org/apache/spark/sql/connector/iceberg/catalog/ProcedureCatalog.java +++ b/spark/v3.2/spark/src/main/java/org/apache/spark/sql/connector/iceberg/catalog/ProcedureCatalog.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.spark.sql.connector.iceberg.catalog; import org.apache.spark.sql.catalyst.analysis.NoSuchProcedureException; @@ -25,9 +24,9 @@ /** * A catalog API for working with stored procedures. - *
<p>
    - * Implementations should implement this interface if they expose stored procedures that - * can be called via CALL statements. + * + *
<p>
    Implementations should implement this interface if they expose stored procedures that can be + * called via CALL statements. */ public interface ProcedureCatalog extends CatalogPlugin { /** diff --git a/spark/v3.2/spark/src/main/java/org/apache/spark/sql/connector/iceberg/catalog/ProcedureParameter.java b/spark/v3.2/spark/src/main/java/org/apache/spark/sql/connector/iceberg/catalog/ProcedureParameter.java index b341dc1e3282..e1e84b2597f3 100644 --- a/spark/v3.2/spark/src/main/java/org/apache/spark/sql/connector/iceberg/catalog/ProcedureParameter.java +++ b/spark/v3.2/spark/src/main/java/org/apache/spark/sql/connector/iceberg/catalog/ProcedureParameter.java @@ -16,14 +16,11 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.spark.sql.connector.iceberg.catalog; import org.apache.spark.sql.types.DataType; -/** - * An input parameter of a {@link Procedure stored procedure}. - */ +/** An input parameter of a {@link Procedure stored procedure}. */ public interface ProcedureParameter { /** @@ -48,18 +45,12 @@ static ProcedureParameter optional(String name, DataType dataType) { return new ProcedureParameterImpl(name, dataType, false); } - /** - * Returns the name of this parameter. - */ + /** Returns the name of this parameter. */ String name(); - /** - * Returns the type of this parameter. - */ + /** Returns the type of this parameter. */ DataType dataType(); - /** - * Returns true if this parameter is required. - */ + /** Returns true if this parameter is required. */ boolean required(); } diff --git a/spark/v3.2/spark/src/main/java/org/apache/spark/sql/connector/iceberg/catalog/ProcedureParameterImpl.java b/spark/v3.2/spark/src/main/java/org/apache/spark/sql/connector/iceberg/catalog/ProcedureParameterImpl.java index cea1e80f4051..c59951e24330 100644 --- a/spark/v3.2/spark/src/main/java/org/apache/spark/sql/connector/iceberg/catalog/ProcedureParameterImpl.java +++ b/spark/v3.2/spark/src/main/java/org/apache/spark/sql/connector/iceberg/catalog/ProcedureParameterImpl.java @@ -16,15 +16,12 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.spark.sql.connector.iceberg.catalog; import java.util.Objects; import org.apache.spark.sql.types.DataType; -/** - * A {@link ProcedureParameter} implementation. - */ +/** A {@link ProcedureParameter} implementation. 
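The Procedure and ProcedureParameter contracts reformatted above can be implemented along the lines of the sketch below. It is not part of the patch; the EchoProcedure class name and the "echoed" output column are illustrative assumptions, and only the optional() factory shown in this hunk is used.

import org.apache.spark.sql.catalyst.InternalRow;
import org.apache.spark.sql.catalyst.expressions.GenericInternalRow;
import org.apache.spark.sql.connector.iceberg.catalog.Procedure;
import org.apache.spark.sql.connector.iceberg.catalog.ProcedureParameter;
import org.apache.spark.sql.types.DataTypes;
import org.apache.spark.sql.types.Metadata;
import org.apache.spark.sql.types.StructField;
import org.apache.spark.sql.types.StructType;
import org.apache.spark.unsafe.types.UTF8String;

// Editorial sketch, not part of this patch: a trivial procedure that echoes its argument.
class EchoProcedure implements Procedure {

  @Override
  public ProcedureParameter[] parameters() {
    // a single optional string argument; Spark aligns call arguments to this list by position or name
    return new ProcedureParameter[] {
      ProcedureParameter.optional("message", DataTypes.StringType)
    };
  }

  @Override
  public StructType outputType() {
    // every row returned by call() must match this schema
    return new StructType(
        new StructField[] {
          new StructField("echoed", DataTypes.StringType, true, Metadata.empty())
        });
  }

  @Override
  public InternalRow[] call(InternalRow args) {
    UTF8String message = args.isNullAt(0) ? null : args.getUTF8String(0);
    return new InternalRow[] {new GenericInternalRow(new Object[] {message})};
  }
}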
*/ class ProcedureParameterImpl implements ProcedureParameter { private final String name; private final DataType dataType; @@ -60,9 +57,9 @@ public boolean equals(Object other) { } ProcedureParameterImpl that = (ProcedureParameterImpl) other; - return required == that.required && - Objects.equals(name, that.name) && - Objects.equals(dataType, that.dataType); + return required == that.required + && Objects.equals(name, that.name) + && Objects.equals(dataType, that.dataType); } @Override @@ -72,6 +69,7 @@ public int hashCode() { @Override public String toString() { - return String.format("ProcedureParameter(name='%s', type=%s, required=%b)", name, dataType, required); + return String.format( + "ProcedureParameter(name='%s', type=%s, required=%b)", name, dataType, required); } } diff --git a/spark/v3.2/spark/src/main/java/org/apache/spark/sql/connector/iceberg/catalog/SupportsRowLevelOperations.java b/spark/v3.2/spark/src/main/java/org/apache/spark/sql/connector/iceberg/catalog/SupportsRowLevelOperations.java index a228b5b9ca35..72ce05d5e5b3 100644 --- a/spark/v3.2/spark/src/main/java/org/apache/spark/sql/connector/iceberg/catalog/SupportsRowLevelOperations.java +++ b/spark/v3.2/spark/src/main/java/org/apache/spark/sql/connector/iceberg/catalog/SupportsRowLevelOperations.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.spark.sql.connector.iceberg.catalog; import org.apache.spark.sql.connector.catalog.Table; @@ -24,13 +23,13 @@ import org.apache.spark.sql.connector.iceberg.write.RowLevelOperationInfo; /** - * A mix-in interface for row-level operations support. Data sources can implement - * this interface to indicate they support rewriting data for DELETE, UPDATE, MERGE operations. + * A mix-in interface for row-level operations support. Data sources can implement this interface to + * indicate they support rewriting data for DELETE, UPDATE, MERGE operations. */ public interface SupportsRowLevelOperations extends Table { /** - * Returns a RowLevelOperationBuilder to build a RowLevelOperation. - * Spark will call this method while planning DELETE, UPDATE and MERGE operations. + * Returns a RowLevelOperationBuilder to build a RowLevelOperation. Spark will call this method + * while planning DELETE, UPDATE and MERGE operations. * * @param info the row-level operation info such command (e.g. DELETE) and options * @return the row-level operation builder diff --git a/spark/v3.2/spark/src/main/java/org/apache/spark/sql/connector/iceberg/write/DeltaBatchWrite.java b/spark/v3.2/spark/src/main/java/org/apache/spark/sql/connector/iceberg/write/DeltaBatchWrite.java index a1fbf0fb8a24..1bad054e3215 100644 --- a/spark/v3.2/spark/src/main/java/org/apache/spark/sql/connector/iceberg/write/DeltaBatchWrite.java +++ b/spark/v3.2/spark/src/main/java/org/apache/spark/sql/connector/iceberg/write/DeltaBatchWrite.java @@ -16,15 +16,12 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.spark.sql.connector.iceberg.write; import org.apache.spark.sql.connector.write.BatchWrite; import org.apache.spark.sql.connector.write.PhysicalWriteInfo; -/** - * An interface that defines how to write a delta of rows during batch processing. - */ +/** An interface that defines how to write a delta of rows during batch processing. 
*/ public interface DeltaBatchWrite extends BatchWrite { @Override DeltaWriterFactory createBatchWriterFactory(PhysicalWriteInfo info); diff --git a/spark/v3.2/spark/src/main/java/org/apache/spark/sql/connector/iceberg/write/DeltaWrite.java b/spark/v3.2/spark/src/main/java/org/apache/spark/sql/connector/iceberg/write/DeltaWrite.java index 7643e2d103ec..fb452c652e7d 100644 --- a/spark/v3.2/spark/src/main/java/org/apache/spark/sql/connector/iceberg/write/DeltaWrite.java +++ b/spark/v3.2/spark/src/main/java/org/apache/spark/sql/connector/iceberg/write/DeltaWrite.java @@ -16,16 +16,15 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.spark.sql.connector.iceberg.write; import org.apache.spark.sql.connector.write.Write; /** - * A logical representation of a data source write that handles a delta of rows. - * A delta of rows is a set of instructions that indicate which records need to be deleted, - * updated, or inserted. Data sources that support deltas allow Spark to discard unchanged rows - * and pass only the information about what rows have changed during a row-level operation. + * A logical representation of a data source write that handles a delta of rows. A delta of rows is + * a set of instructions that indicate which records need to be deleted, updated, or inserted. Data + * sources that support deltas allow Spark to discard unchanged rows and pass only the information + * about what rows have changed during a row-level operation. */ public interface DeltaWrite extends Write { @Override diff --git a/spark/v3.2/spark/src/main/java/org/apache/spark/sql/connector/iceberg/write/DeltaWriteBuilder.java b/spark/v3.2/spark/src/main/java/org/apache/spark/sql/connector/iceberg/write/DeltaWriteBuilder.java index 1af1c6680c89..56214a4adcbf 100644 --- a/spark/v3.2/spark/src/main/java/org/apache/spark/sql/connector/iceberg/write/DeltaWriteBuilder.java +++ b/spark/v3.2/spark/src/main/java/org/apache/spark/sql/connector/iceberg/write/DeltaWriteBuilder.java @@ -16,18 +16,13 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.spark.sql.connector.iceberg.write; import org.apache.spark.sql.connector.write.WriteBuilder; -/** - * An interface for building delta writes. - */ +/** An interface for building delta writes. */ public interface DeltaWriteBuilder extends WriteBuilder { - /** - * Returns a logical delta write. - */ + /** Returns a logical delta write. */ @Override default DeltaWrite build() { throw new UnsupportedOperationException("Not implemented: build"); diff --git a/spark/v3.2/spark/src/main/java/org/apache/spark/sql/connector/iceberg/write/DeltaWriter.java b/spark/v3.2/spark/src/main/java/org/apache/spark/sql/connector/iceberg/write/DeltaWriter.java index a17ee33b13d7..efeed371f940 100644 --- a/spark/v3.2/spark/src/main/java/org/apache/spark/sql/connector/iceberg/write/DeltaWriter.java +++ b/spark/v3.2/spark/src/main/java/org/apache/spark/sql/connector/iceberg/write/DeltaWriter.java @@ -16,15 +16,12 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.spark.sql.connector.iceberg.write; import java.io.IOException; import org.apache.spark.sql.connector.write.DataWriter; -/** - * A data writer responsible for writing a delta of rows. - */ +/** A data writer responsible for writing a delta of rows. */ public interface DeltaWriter extends DataWriter { /** * Passes information for a row that must be deleted. 
diff --git a/spark/v3.2/spark/src/main/java/org/apache/spark/sql/connector/iceberg/write/DeltaWriterFactory.java b/spark/v3.2/spark/src/main/java/org/apache/spark/sql/connector/iceberg/write/DeltaWriterFactory.java index 77af70958e41..f779474c35bd 100644 --- a/spark/v3.2/spark/src/main/java/org/apache/spark/sql/connector/iceberg/write/DeltaWriterFactory.java +++ b/spark/v3.2/spark/src/main/java/org/apache/spark/sql/connector/iceberg/write/DeltaWriterFactory.java @@ -16,15 +16,12 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.spark.sql.connector.iceberg.write; import org.apache.spark.sql.catalyst.InternalRow; import org.apache.spark.sql.connector.write.DataWriterFactory; -/** - * A factory for creating and initializing delta writers at the executor side. - */ +/** A factory for creating and initializing delta writers at the executor side. */ public interface DeltaWriterFactory extends DataWriterFactory { @Override DeltaWriter createWriter(int partitionId, long taskId); diff --git a/spark/v3.2/spark/src/main/java/org/apache/spark/sql/connector/iceberg/write/ExtendedLogicalWriteInfo.java b/spark/v3.2/spark/src/main/java/org/apache/spark/sql/connector/iceberg/write/ExtendedLogicalWriteInfo.java index 3c39a4f0f1b5..a13f56ecea0d 100644 --- a/spark/v3.2/spark/src/main/java/org/apache/spark/sql/connector/iceberg/write/ExtendedLogicalWriteInfo.java +++ b/spark/v3.2/spark/src/main/java/org/apache/spark/sql/connector/iceberg/write/ExtendedLogicalWriteInfo.java @@ -16,23 +16,16 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.spark.sql.connector.iceberg.write; import org.apache.spark.sql.connector.write.LogicalWriteInfo; import org.apache.spark.sql.types.StructType; -/** - * A class that holds logical write information not covered by LogicalWriteInfo in Spark. - */ +/** A class that holds logical write information not covered by LogicalWriteInfo in Spark. */ public interface ExtendedLogicalWriteInfo extends LogicalWriteInfo { - /** - * The schema of the input metadata from Spark to data source. - */ + /** The schema of the input metadata from Spark to data source. */ StructType metadataSchema(); - /** - * The schema of the ID columns from Spark to data source. - */ + /** The schema of the ID columns from Spark to data source. */ StructType rowIdSchema(); } diff --git a/spark/v3.2/spark/src/main/java/org/apache/spark/sql/connector/iceberg/write/RowLevelOperation.java b/spark/v3.2/spark/src/main/java/org/apache/spark/sql/connector/iceberg/write/RowLevelOperation.java index 1ab66ec5a435..ad896882efb6 100644 --- a/spark/v3.2/spark/src/main/java/org/apache/spark/sql/connector/iceberg/write/RowLevelOperation.java +++ b/spark/v3.2/spark/src/main/java/org/apache/spark/sql/connector/iceberg/write/RowLevelOperation.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.spark.sql.connector.iceberg.write; import org.apache.spark.sql.connector.expressions.NamedReference; @@ -30,58 +29,53 @@ */ public interface RowLevelOperation { - /** - * The SQL operation being performed. - */ + /** The SQL operation being performed. */ enum Command { - DELETE, UPDATE, MERGE + DELETE, + UPDATE, + MERGE } - /** - * Returns the description associated with this row-level operation. - */ + /** Returns the description associated with this row-level operation. 
*/ default String description() { return this.getClass().toString(); } - /** - * Returns the actual SQL operation being performed. - */ + /** Returns the actual SQL operation being performed. */ Command command(); /** * Returns a scan builder to configure a scan for this row-level operation. - *
<p>
    - * Sources fall into two categories: those that can handle a delta of rows and those that need + * + *
<p>
    Sources fall into two categories: those that can handle a delta of rows and those that need * to replace groups (e.g. partitions, files). Sources that handle deltas allow Spark to quickly - * discard unchanged rows and have no requirements for input scans. Sources that replace groups - * of rows can discard deleted rows but need to keep unchanged rows to be passed back into - * the source. This means that scans for such data sources must produce all rows in a group - * if any are returned. Some sources will avoid pushing filters into files (file granularity), - * while others will avoid pruning files within a partition (partition granularity). - *
<p>
    - * For example, if a source can only replace partitions, all rows from a partition must - * be returned by the scan, even if a filter can narrow the set of changes to a single file - * in the partition. Similarly, a source that can swap individual files must produce all rows - * of files where at least one record must be changed, not just the rows that must be changed. + * discard unchanged rows and have no requirements for input scans. Sources that replace groups of + * rows can discard deleted rows but need to keep unchanged rows to be passed back into the + * source. This means that scans for such data sources must produce all rows in a group if any are + * returned. Some sources will avoid pushing filters into files (file granularity), while others + * will avoid pruning files within a partition (partition granularity). + * + *
<p>
    For example, if a source can only replace partitions, all rows from a partition must be + * returned by the scan, even if a filter can narrow the set of changes to a single file in the + * partition. Similarly, a source that can swap individual files must produce all rows of files + * where at least one record must be changed, not just the rows that must be changed. */ ScanBuilder newScanBuilder(CaseInsensitiveStringMap options); /** * Returns a write builder to configure a write for this row-level operation. - *
<p>
    - * Note that Spark will first configure the scan and then the write, allowing data sources - * to pass information from the scan to the write. For example, the scan can report - * which condition was used to read the data that may be needed by the write under certain - * isolation levels. + * + *
<p>
    Note that Spark will first configure the scan and then the write, allowing data sources to + * pass information from the scan to the write. For example, the scan can report which condition + * was used to read the data that may be needed by the write under certain isolation levels. */ WriteBuilder newWriteBuilder(ExtendedLogicalWriteInfo info); /** * Returns metadata attributes that are required to perform this row-level operation. - *
<p>
    - * Data sources that can use this method to project metadata columns needed for writing - * the data back (e.g. metadata columns for grouping data). + * + *
<p>
    Data sources that can use this method to project metadata columns needed for writing the + * data back (e.g. metadata columns for grouping data). */ default NamedReference[] requiredMetadataAttributes() { return new NamedReference[0]; diff --git a/spark/v3.2/spark/src/main/java/org/apache/spark/sql/connector/iceberg/write/RowLevelOperationBuilder.java b/spark/v3.2/spark/src/main/java/org/apache/spark/sql/connector/iceberg/write/RowLevelOperationBuilder.java index 772c3ef52918..5035ccb5f526 100644 --- a/spark/v3.2/spark/src/main/java/org/apache/spark/sql/connector/iceberg/write/RowLevelOperationBuilder.java +++ b/spark/v3.2/spark/src/main/java/org/apache/spark/sql/connector/iceberg/write/RowLevelOperationBuilder.java @@ -16,15 +16,13 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.spark.sql.connector.iceberg.write; -/** - * An interface for building a row-level operation. - */ +/** An interface for building a row-level operation. */ public interface RowLevelOperationBuilder { /** - * Returns a row-level operation that controls how Spark rewrites data for DELETE, UPDATE, MERGE commands. + * Returns a row-level operation that controls how Spark rewrites data for DELETE, UPDATE, MERGE + * commands. */ RowLevelOperation build(); } diff --git a/spark/v3.2/spark/src/main/java/org/apache/spark/sql/connector/iceberg/write/RowLevelOperationInfo.java b/spark/v3.2/spark/src/main/java/org/apache/spark/sql/connector/iceberg/write/RowLevelOperationInfo.java index e1d3353958ee..70ba0fb595eb 100644 --- a/spark/v3.2/spark/src/main/java/org/apache/spark/sql/connector/iceberg/write/RowLevelOperationInfo.java +++ b/spark/v3.2/spark/src/main/java/org/apache/spark/sql/connector/iceberg/write/RowLevelOperationInfo.java @@ -16,23 +16,16 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.spark.sql.connector.iceberg.write; import org.apache.spark.sql.connector.iceberg.write.RowLevelOperation.Command; import org.apache.spark.sql.util.CaseInsensitiveStringMap; -/** - * An interface with logical information for a row-level operation such as DELETE or MERGE. - */ +/** An interface with logical information for a row-level operation such as DELETE or MERGE. */ public interface RowLevelOperationInfo { - /** - * Returns options that the user specified when performing the row-level operation. - */ + /** Returns options that the user specified when performing the row-level operation. */ CaseInsensitiveStringMap options(); - /** - * Returns the SQL command (e.g. DELETE, UPDATE, MERGE) for this row-level operation. - */ + /** Returns the SQL command (e.g. DELETE, UPDATE, MERGE) for this row-level operation. */ Command command(); } diff --git a/spark/v3.2/spark/src/main/java/org/apache/spark/sql/connector/iceberg/write/SupportsDelta.java b/spark/v3.2/spark/src/main/java/org/apache/spark/sql/connector/iceberg/write/SupportsDelta.java index 8a1a27f331e3..0b083af5d935 100644 --- a/spark/v3.2/spark/src/main/java/org/apache/spark/sql/connector/iceberg/write/SupportsDelta.java +++ b/spark/v3.2/spark/src/main/java/org/apache/spark/sql/connector/iceberg/write/SupportsDelta.java @@ -16,21 +16,18 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.spark.sql.connector.iceberg.write; import org.apache.spark.sql.connector.expressions.NamedReference; /** - * A mix-in interface for RowLevelOperation. 
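The RowLevelOperation contract described in the Javadoc above can be sketched as follows. This is not part of the patch: the class name and the "_partition" column are illustrative assumptions, and the scan and write builders, which depend on the data source, are left abstract.

import org.apache.spark.sql.connector.expressions.Expressions;
import org.apache.spark.sql.connector.expressions.NamedReference;
import org.apache.spark.sql.connector.iceberg.write.RowLevelOperation;

// Editorial sketch, not part of this patch: the declarative pieces of a row-level operation.
abstract class ExampleRowLevelOperation implements RowLevelOperation {

  @Override
  public Command command() {
    return Command.DELETE; // the SQL command whose data this operation rewrites
  }

  @Override
  public NamedReference[] requiredMetadataAttributes() {
    // project a grouping column so rewritten rows can be written back to the right group;
    // "_partition" is only an example name
    return new NamedReference[] {Expressions.column("_partition")};
  }
}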
Data sources can implement this interface - * to indicate they support handling deltas of rows. + * A mix-in interface for RowLevelOperation. Data sources can implement this interface to indicate + * they support handling deltas of rows. */ public interface SupportsDelta extends RowLevelOperation { @Override DeltaWriteBuilder newWriteBuilder(ExtendedLogicalWriteInfo info); - /** - * Returns the row ID column references that should be used for row equality. - */ + /** Returns the row ID column references that should be used for row equality. */ NamedReference[] rowId(); } diff --git a/spark/v3.2/spark/src/test/java/org/apache/iceberg/KryoHelpers.java b/spark/v3.2/spark/src/test/java/org/apache/iceberg/KryoHelpers.java index ee0f0a73959a..6d88aaa11813 100644 --- a/spark/v3.2/spark/src/test/java/org/apache/iceberg/KryoHelpers.java +++ b/spark/v3.2/spark/src/test/java/org/apache/iceberg/KryoHelpers.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg; import com.esotericsoftware.kryo.Kryo; @@ -32,8 +31,7 @@ public class KryoHelpers { - private KryoHelpers() { - } + private KryoHelpers() {} @SuppressWarnings("unchecked") public static T roundTripSerialize(T obj) throws IOException { @@ -45,7 +43,8 @@ public static T roundTripSerialize(T obj) throws IOException { kryo.writeClassAndObject(out, obj); } - try (Input in = new Input(new ObjectInputStream(new ByteArrayInputStream(bytes.toByteArray())))) { + try (Input in = + new Input(new ObjectInputStream(new ByteArrayInputStream(bytes.toByteArray())))) { return (T) kryo.readClassAndObject(in); } } diff --git a/spark/v3.2/spark/src/test/java/org/apache/iceberg/TaskCheckHelper.java b/spark/v3.2/spark/src/test/java/org/apache/iceberg/TaskCheckHelper.java index dd187c0b6bf0..c44bacf149b5 100644 --- a/spark/v3.2/spark/src/test/java/org/apache/iceberg/TaskCheckHelper.java +++ b/spark/v3.2/spark/src/test/java/org/apache/iceberg/TaskCheckHelper.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg; import java.util.Comparator; @@ -25,15 +24,15 @@ import org.junit.Assert; public final class TaskCheckHelper { - private TaskCheckHelper() { - } + private TaskCheckHelper() {} - public static void assertEquals(ScanTaskGroup expected, ScanTaskGroup actual) { + public static void assertEquals( + ScanTaskGroup expected, ScanTaskGroup actual) { List expectedTasks = getFileScanTasksInFilePathOrder(expected); List actualTasks = getFileScanTasksInFilePathOrder(actual); - Assert.assertEquals("The number of file scan tasks should match", - expectedTasks.size(), actualTasks.size()); + Assert.assertEquals( + "The number of file scan tasks should match", expectedTasks.size(), actualTasks.size()); for (int i = 0; i < expectedTasks.size(); i++) { FileScanTask expectedTask = expectedTasks.get(i); @@ -50,41 +49,60 @@ public static void assertEquals(FileScanTask expected, FileScanTask actual) { Assert.assertEquals("starting position doesn't match", expected.start(), actual.start()); - Assert.assertEquals("the number of bytes to scan doesn't match", expected.start(), actual.start()); + Assert.assertEquals( + "the number of bytes to scan doesn't match", expected.start(), actual.start()); // simplify comparison on residual expression via comparing toString - Assert.assertEquals("Residual expression doesn't match", - expected.residual().toString(), actual.residual().toString()); + Assert.assertEquals( + "Residual expression doesn't match", + expected.residual().toString(), + actual.residual().toString()); } public static void assertEquals(DataFile expected, DataFile actual) { - Assert.assertEquals("Should match the serialized record path", - expected.path(), actual.path()); - Assert.assertEquals("Should match the serialized record format", - expected.format(), actual.format()); - Assert.assertEquals("Should match the serialized record partition", - expected.partition().get(0, Object.class), actual.partition().get(0, Object.class)); - Assert.assertEquals("Should match the serialized record count", - expected.recordCount(), actual.recordCount()); - Assert.assertEquals("Should match the serialized record size", - expected.fileSizeInBytes(), actual.fileSizeInBytes()); - Assert.assertEquals("Should match the serialized record value counts", - expected.valueCounts(), actual.valueCounts()); - Assert.assertEquals("Should match the serialized record null value counts", - expected.nullValueCounts(), actual.nullValueCounts()); - Assert.assertEquals("Should match the serialized record lower bounds", - expected.lowerBounds(), actual.lowerBounds()); - Assert.assertEquals("Should match the serialized record upper bounds", - expected.upperBounds(), actual.upperBounds()); - Assert.assertEquals("Should match the serialized record key metadata", - expected.keyMetadata(), actual.keyMetadata()); - Assert.assertEquals("Should match the serialized record offsets", - expected.splitOffsets(), actual.splitOffsets()); - Assert.assertEquals("Should match the serialized record offsets", - expected.keyMetadata(), actual.keyMetadata()); + Assert.assertEquals("Should match the serialized record path", expected.path(), actual.path()); + Assert.assertEquals( + "Should match the serialized record format", expected.format(), actual.format()); + Assert.assertEquals( + "Should match the serialized record partition", + expected.partition().get(0, Object.class), + actual.partition().get(0, Object.class)); + Assert.assertEquals( + "Should match the serialized record count", expected.recordCount(), 
actual.recordCount()); + Assert.assertEquals( + "Should match the serialized record size", + expected.fileSizeInBytes(), + actual.fileSizeInBytes()); + Assert.assertEquals( + "Should match the serialized record value counts", + expected.valueCounts(), + actual.valueCounts()); + Assert.assertEquals( + "Should match the serialized record null value counts", + expected.nullValueCounts(), + actual.nullValueCounts()); + Assert.assertEquals( + "Should match the serialized record lower bounds", + expected.lowerBounds(), + actual.lowerBounds()); + Assert.assertEquals( + "Should match the serialized record upper bounds", + expected.upperBounds(), + actual.upperBounds()); + Assert.assertEquals( + "Should match the serialized record key metadata", + expected.keyMetadata(), + actual.keyMetadata()); + Assert.assertEquals( + "Should match the serialized record offsets", + expected.splitOffsets(), + actual.splitOffsets()); + Assert.assertEquals( + "Should match the serialized record offsets", expected.keyMetadata(), actual.keyMetadata()); } - private static List getFileScanTasksInFilePathOrder(ScanTaskGroup taskGroup) { + private static List getFileScanTasksInFilePathOrder( + ScanTaskGroup taskGroup) { return taskGroup.tasks().stream() // use file path + start position to differentiate the tasks .sorted(Comparator.comparing(o -> o.file().path().toString() + "##" + o.start())) diff --git a/spark/v3.2/spark/src/test/java/org/apache/iceberg/TestDataFileSerialization.java b/spark/v3.2/spark/src/test/java/org/apache/iceberg/TestDataFileSerialization.java index 12fa8b2fc539..33b5316b72b7 100644 --- a/spark/v3.2/spark/src/test/java/org/apache/iceberg/TestDataFileSerialization.java +++ b/spark/v3.2/spark/src/test/java/org/apache/iceberg/TestDataFileSerialization.java @@ -16,9 +16,12 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg; +import static org.apache.iceberg.TaskCheckHelper.assertEquals; +import static org.apache.iceberg.types.Types.NestedField.optional; +import static org.apache.iceberg.types.Types.NestedField.required; + import com.esotericsoftware.kryo.Kryo; import com.esotericsoftware.kryo.io.Input; import com.esotericsoftware.kryo.io.Output; @@ -51,22 +54,17 @@ import org.junit.Test; import org.junit.rules.TemporaryFolder; -import static org.apache.iceberg.TaskCheckHelper.assertEquals; -import static org.apache.iceberg.types.Types.NestedField.optional; -import static org.apache.iceberg.types.Types.NestedField.required; - public class TestDataFileSerialization { - private static final Schema DATE_SCHEMA = new Schema( - required(1, "id", Types.LongType.get()), - optional(2, "data", Types.StringType.get()), - required(3, "date", Types.StringType.get()), - optional(4, "double", Types.DoubleType.get())); + private static final Schema DATE_SCHEMA = + new Schema( + required(1, "id", Types.LongType.get()), + optional(2, "data", Types.StringType.get()), + required(3, "date", Types.StringType.get()), + optional(4, "double", Types.DoubleType.get())); - private static final PartitionSpec PARTITION_SPEC = PartitionSpec - .builderFor(DATE_SCHEMA) - .identity("date") - .build(); + private static final PartitionSpec PARTITION_SPEC = + PartitionSpec.builderFor(DATE_SCHEMA).identity("date").build(); private static final Map VALUE_COUNTS = Maps.newHashMap(); private static final Map NULL_VALUE_COUNTS = Maps.newHashMap(); @@ -85,20 +83,26 @@ public class TestDataFileSerialization { UPPER_BOUNDS.put(1, longToBuffer(4L)); } - private static final DataFile DATA_FILE = DataFiles - .builder(PARTITION_SPEC) - .withPath("/path/to/data-1.parquet") - .withFileSizeInBytes(1234) - .withPartitionPath("date=2018-06-08") - .withMetrics(new Metrics( - 5L, null, VALUE_COUNTS, NULL_VALUE_COUNTS, NAN_VALUE_COUNTS, LOWER_BOUNDS, UPPER_BOUNDS)) - .withSplitOffsets(ImmutableList.of(4L)) - .withEncryptionKeyMetadata(ByteBuffer.allocate(4).putInt(34)) - .withSortOrder(SortOrder.unsorted()) - .build(); - - @Rule - public TemporaryFolder temp = new TemporaryFolder(); + private static final DataFile DATA_FILE = + DataFiles.builder(PARTITION_SPEC) + .withPath("/path/to/data-1.parquet") + .withFileSizeInBytes(1234) + .withPartitionPath("date=2018-06-08") + .withMetrics( + new Metrics( + 5L, + null, + VALUE_COUNTS, + NULL_VALUE_COUNTS, + NAN_VALUE_COUNTS, + LOWER_BOUNDS, + UPPER_BOUNDS)) + .withSplitOffsets(ImmutableList.of(4L)) + .withEncryptionKeyMetadata(ByteBuffer.allocate(4).putInt(34)) + .withSortOrder(SortOrder.unsorted()) + .build(); + + @Rule public TemporaryFolder temp = new TemporaryFolder(); @Test public void testDataFileKryoSerialization() throws Exception { @@ -128,7 +132,8 @@ public void testDataFileJavaSerialization() throws Exception { out.writeObject(DATA_FILE.copy()); } - try (ObjectInputStream in = new ObjectInputStream(new ByteArrayInputStream(bytes.toByteArray()))) { + try (ObjectInputStream in = + new ObjectInputStream(new ByteArrayInputStream(bytes.toByteArray()))) { for (int i = 0; i < 2; i += 1) { Object obj = in.readObject(); Assertions.assertThat(obj).as("Should be a DataFile").isInstanceOf(DataFile.class); @@ -140,13 +145,14 @@ public void testDataFileJavaSerialization() throws Exception { @Test public void testParquetWriterSplitOffsets() throws IOException { Iterable records = RandomData.generateSpark(DATE_SCHEMA, 1, 33L); - File parquetFile = new File( - temp.getRoot(), - 
FileFormat.PARQUET.addExtension(UUID.randomUUID().toString())); + File parquetFile = + new File(temp.getRoot(), FileFormat.PARQUET.addExtension(UUID.randomUUID().toString())); FileAppender writer = Parquet.write(Files.localOutput(parquetFile)) .schema(DATE_SCHEMA) - .createWriterFunc(msgType -> SparkParquetWriters.buildWriter(SparkSchemaUtil.convert(DATE_SCHEMA), msgType)) + .createWriterFunc( + msgType -> + SparkParquetWriters.buildWriter(SparkSchemaUtil.convert(DATE_SCHEMA), msgType)) .build(); try { writer.addAll(records); diff --git a/spark/v3.2/spark/src/test/java/org/apache/iceberg/TestFileIOSerialization.java b/spark/v3.2/spark/src/test/java/org/apache/iceberg/TestFileIOSerialization.java index 03211a89b873..03b62457037e 100644 --- a/spark/v3.2/spark/src/test/java/org/apache/iceberg/TestFileIOSerialization.java +++ b/spark/v3.2/spark/src/test/java/org/apache/iceberg/TestFileIOSerialization.java @@ -16,9 +16,11 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg; +import static org.apache.iceberg.types.Types.NestedField.optional; +import static org.apache.iceberg.types.Types.NestedField.required; + import java.io.File; import java.io.IOException; import java.util.Map; @@ -38,36 +40,29 @@ import org.junit.Test; import org.junit.rules.TemporaryFolder; -import static org.apache.iceberg.types.Types.NestedField.optional; -import static org.apache.iceberg.types.Types.NestedField.required; - public class TestFileIOSerialization { private static final Configuration CONF = new Configuration(); private static final HadoopTables TABLES = new HadoopTables(CONF); - private static final Schema SCHEMA = new Schema( - required(1, "id", Types.LongType.get()), - optional(2, "data", Types.StringType.get()), - required(3, "date", Types.StringType.get()), - optional(4, "double", Types.DoubleType.get())); + private static final Schema SCHEMA = + new Schema( + required(1, "id", Types.LongType.get()), + optional(2, "data", Types.StringType.get()), + required(3, "date", Types.StringType.get()), + optional(4, "double", Types.DoubleType.get())); - private static final PartitionSpec SPEC = PartitionSpec - .builderFor(SCHEMA) - .identity("date") - .build(); + private static final PartitionSpec SPEC = + PartitionSpec.builderFor(SCHEMA).identity("date").build(); - private static final SortOrder SORT_ORDER = SortOrder.builderFor(SCHEMA) - .asc("id") - .build(); + private static final SortOrder SORT_ORDER = SortOrder.builderFor(SCHEMA).asc("id").build(); static { CONF.set("k1", "v1"); CONF.set("k2", "v2"); } - @Rule - public TemporaryFolder temp = new TemporaryFolder(); + @Rule public TemporaryFolder temp = new TemporaryFolder(); private Table table; @Before diff --git a/spark/v3.2/spark/src/test/java/org/apache/iceberg/TestHadoopMetricsContextSerialization.java b/spark/v3.2/spark/src/test/java/org/apache/iceberg/TestHadoopMetricsContextSerialization.java index cba5ac686d46..409104448dfb 100644 --- a/spark/v3.2/spark/src/test/java/org/apache/iceberg/TestHadoopMetricsContextSerialization.java +++ b/spark/v3.2/spark/src/test/java/org/apache/iceberg/TestHadoopMetricsContextSerialization.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
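The serialization fixtures above all build their table metadata the same way; as a hedged, self-contained distillation of that builder pattern (the class name and field ids here are illustrative only):

import static org.apache.iceberg.types.Types.NestedField.optional;
import static org.apache.iceberg.types.Types.NestedField.required;

import org.apache.iceberg.PartitionSpec;
import org.apache.iceberg.Schema;
import org.apache.iceberg.SortOrder;
import org.apache.iceberg.types.Types;

public class MetadataFixturesExample {
  // Columns carry explicit field ids; required/optional controls nullability.
  static final Schema SCHEMA =
      new Schema(
          required(1, "id", Types.LongType.get()),
          optional(2, "data", Types.StringType.get()),
          required(3, "date", Types.StringType.get()));

  // Partition by the raw value of the "date" column.
  static final PartitionSpec SPEC = PartitionSpec.builderFor(SCHEMA).identity("date").build();

  // Sort data files by "id" ascending.
  static final SortOrder SORT_ORDER = SortOrder.builderFor(SCHEMA).asc("id").build();
}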
*/ - package org.apache.iceberg; import java.io.IOException; @@ -42,7 +41,8 @@ public void testHadoopMetricsContextKryoSerialization() throws IOException { } @Test - public void testHadoopMetricsContextJavaSerialization() throws IOException, ClassNotFoundException { + public void testHadoopMetricsContextJavaSerialization() + throws IOException, ClassNotFoundException { MetricsContext metricsContext = new HadoopMetricsContext("s3"); metricsContext.initialize(Maps.newHashMap()); diff --git a/spark/v3.2/spark/src/test/java/org/apache/iceberg/TestManifestFileSerialization.java b/spark/v3.2/spark/src/test/java/org/apache/iceberg/TestManifestFileSerialization.java index 25004aa110e4..a20b2d9f05de 100644 --- a/spark/v3.2/spark/src/test/java/org/apache/iceberg/TestManifestFileSerialization.java +++ b/spark/v3.2/spark/src/test/java/org/apache/iceberg/TestManifestFileSerialization.java @@ -16,9 +16,11 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg; +import static org.apache.iceberg.types.Types.NestedField.optional; +import static org.apache.iceberg.types.Types.NestedField.required; + import com.esotericsoftware.kryo.Kryo; import com.esotericsoftware.kryo.io.Input; import com.esotericsoftware.kryo.io.Output; @@ -47,56 +49,57 @@ import org.junit.Test; import org.junit.rules.TemporaryFolder; -import static org.apache.iceberg.types.Types.NestedField.optional; -import static org.apache.iceberg.types.Types.NestedField.required; - public class TestManifestFileSerialization { - private static final Schema SCHEMA = new Schema( - required(1, "id", Types.LongType.get()), - optional(2, "data", Types.StringType.get()), - required(3, "date", Types.StringType.get()), - required(4, "double", Types.DoubleType.get())); - - private static final PartitionSpec SPEC = PartitionSpec - .builderFor(SCHEMA) - .identity("double") - .build(); - - private static final DataFile FILE_A = DataFiles.builder(SPEC) - .withPath("/path/to/data-1.parquet") - .withFileSizeInBytes(0) - .withPartition(TestHelpers.Row.of(1D)) - .withPartitionPath("double=1") - .withMetrics(new Metrics(5L, - null, // no column sizes - ImmutableMap.of(1, 5L, 2, 3L), // value count - ImmutableMap.of(1, 0L, 2, 2L), // null count - ImmutableMap.of(), // nan count - ImmutableMap.of(1, longToBuffer(0L)), // lower bounds - ImmutableMap.of(1, longToBuffer(4L)) // upper bounds - )) - .build(); - - private static final DataFile FILE_B = DataFiles.builder(SPEC) - .withPath("/path/to/data-2.parquet") - .withFileSizeInBytes(0) - .withPartition(TestHelpers.Row.of(Double.NaN)) - .withPartitionPath("double=NaN") - .withMetrics(new Metrics(1L, - null, // no column sizes - ImmutableMap.of(1, 1L, 4, 1L), // value count - ImmutableMap.of(1, 0L, 2, 0L), // null count - ImmutableMap.of(4, 1L), // nan count - ImmutableMap.of(1, longToBuffer(0L)), // lower bounds - ImmutableMap.of(1, longToBuffer(1L)) // upper bounds - )) - .build(); + private static final Schema SCHEMA = + new Schema( + required(1, "id", Types.LongType.get()), + optional(2, "data", Types.StringType.get()), + required(3, "date", Types.StringType.get()), + required(4, "double", Types.DoubleType.get())); + + private static final PartitionSpec SPEC = + PartitionSpec.builderFor(SCHEMA).identity("double").build(); + + private static final DataFile FILE_A = + DataFiles.builder(SPEC) + .withPath("/path/to/data-1.parquet") + .withFileSizeInBytes(0) + .withPartition(TestHelpers.Row.of(1D)) + .withPartitionPath("double=1") + .withMetrics( + new Metrics( + 5L, 
+ null, // no column sizes + ImmutableMap.of(1, 5L, 2, 3L), // value count + ImmutableMap.of(1, 0L, 2, 2L), // null count + ImmutableMap.of(), // nan count + ImmutableMap.of(1, longToBuffer(0L)), // lower bounds + ImmutableMap.of(1, longToBuffer(4L)) // upper bounds + )) + .build(); + + private static final DataFile FILE_B = + DataFiles.builder(SPEC) + .withPath("/path/to/data-2.parquet") + .withFileSizeInBytes(0) + .withPartition(TestHelpers.Row.of(Double.NaN)) + .withPartitionPath("double=NaN") + .withMetrics( + new Metrics( + 1L, + null, // no column sizes + ImmutableMap.of(1, 1L, 4, 1L), // value count + ImmutableMap.of(1, 0L, 2, 0L), // null count + ImmutableMap.of(4, 1L), // nan count + ImmutableMap.of(1, longToBuffer(0L)), // lower bounds + ImmutableMap.of(1, longToBuffer(1L)) // upper bounds + )) + .build(); private static final FileIO FILE_IO = new HadoopFileIO(new Configuration()); - @Rule - public TemporaryFolder temp = new TemporaryFolder(); + @Rule public TemporaryFolder temp = new TemporaryFolder(); @Test public void testManifestFileKryoSerialization() throws IOException { @@ -134,7 +137,8 @@ public void testManifestFileJavaSerialization() throws Exception { out.writeObject(GenericManifestFile.copyOf(manifest).build()); } - try (ObjectInputStream in = new ObjectInputStream(new ByteArrayInputStream(bytes.toByteArray()))) { + try (ObjectInputStream in = + new ObjectInputStream(new ByteArrayInputStream(bytes.toByteArray()))) { for (int i = 0; i < 3; i += 1) { Object obj = in.readObject(); Assertions.assertThat(obj).as("Should be a ManifestFile").isInstanceOf(ManifestFile.class); @@ -148,27 +152,46 @@ private void checkManifestFile(ManifestFile expected, ManifestFile actual) { Assert.assertEquals("Length must match", expected.length(), actual.length()); Assert.assertEquals("Spec id must match", expected.partitionSpecId(), actual.partitionSpecId()); Assert.assertEquals("Snapshot id must match", expected.snapshotId(), actual.snapshotId()); - Assert.assertEquals("Added files flag must match", expected.hasAddedFiles(), actual.hasAddedFiles()); - Assert.assertEquals("Added files count must match", expected.addedFilesCount(), actual.addedFilesCount()); - Assert.assertEquals("Added rows count must match", expected.addedRowsCount(), actual.addedRowsCount()); - Assert.assertEquals("Existing files flag must match", expected.hasExistingFiles(), actual.hasExistingFiles()); - Assert.assertEquals("Existing files count must match", expected.existingFilesCount(), actual.existingFilesCount()); - Assert.assertEquals("Existing rows count must match", expected.existingRowsCount(), actual.existingRowsCount()); - Assert.assertEquals("Deleted files flag must match", expected.hasDeletedFiles(), actual.hasDeletedFiles()); - Assert.assertEquals("Deleted files count must match", expected.deletedFilesCount(), actual.deletedFilesCount()); - Assert.assertEquals("Deleted rows count must match", expected.deletedRowsCount(), actual.deletedRowsCount()); + Assert.assertEquals( + "Added files flag must match", expected.hasAddedFiles(), actual.hasAddedFiles()); + Assert.assertEquals( + "Added files count must match", expected.addedFilesCount(), actual.addedFilesCount()); + Assert.assertEquals( + "Added rows count must match", expected.addedRowsCount(), actual.addedRowsCount()); + Assert.assertEquals( + "Existing files flag must match", expected.hasExistingFiles(), actual.hasExistingFiles()); + Assert.assertEquals( + "Existing files count must match", + expected.existingFilesCount(), + actual.existingFilesCount()); + 
Assert.assertEquals( + "Existing rows count must match", expected.existingRowsCount(), actual.existingRowsCount()); + Assert.assertEquals( + "Deleted files flag must match", expected.hasDeletedFiles(), actual.hasDeletedFiles()); + Assert.assertEquals( + "Deleted files count must match", expected.deletedFilesCount(), actual.deletedFilesCount()); + Assert.assertEquals( + "Deleted rows count must match", expected.deletedRowsCount(), actual.deletedRowsCount()); PartitionFieldSummary expectedPartition = expected.partitions().get(0); PartitionFieldSummary actualPartition = actual.partitions().get(0); - Assert.assertEquals("Null flag in partition must match", - expectedPartition.containsNull(), actualPartition.containsNull()); - Assert.assertEquals("NaN flag in partition must match", - expectedPartition.containsNaN(), actualPartition.containsNaN()); - Assert.assertEquals("Lower bounds in partition must match", - expectedPartition.lowerBound(), actualPartition.lowerBound()); - Assert.assertEquals("Upper bounds in partition must match", - expectedPartition.upperBound(), actualPartition.upperBound()); + Assert.assertEquals( + "Null flag in partition must match", + expectedPartition.containsNull(), + actualPartition.containsNull()); + Assert.assertEquals( + "NaN flag in partition must match", + expectedPartition.containsNaN(), + actualPartition.containsNaN()); + Assert.assertEquals( + "Lower bounds in partition must match", + expectedPartition.lowerBound(), + actualPartition.lowerBound()); + Assert.assertEquals( + "Upper bounds in partition must match", + expectedPartition.upperBound(), + actualPartition.upperBound()); } private ManifestFile writeManifest(DataFile... files) throws IOException { diff --git a/spark/v3.2/spark/src/test/java/org/apache/iceberg/TestScanTaskSerialization.java b/spark/v3.2/spark/src/test/java/org/apache/iceberg/TestScanTaskSerialization.java index 1fe736b51aad..5e5d657eab56 100644 --- a/spark/v3.2/spark/src/test/java/org/apache/iceberg/TestScanTaskSerialization.java +++ b/spark/v3.2/spark/src/test/java/org/apache/iceberg/TestScanTaskSerialization.java @@ -16,9 +16,10 @@ * specific language governing permissions and limitations * under the License. 
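The Java-serialization tests in this patch all repeat the same ObjectOutputStream/ObjectInputStream round trip; distilled into one hedged helper (the class and method names below are not part of the codebase):

import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.ObjectInputStream;
import java.io.ObjectOutputStream;

public class JavaSerializationHelper {

  @SuppressWarnings("unchecked")
  public static <T> T roundTripJavaSerialize(T obj) throws IOException, ClassNotFoundException {
    ByteArrayOutputStream bytes = new ByteArrayOutputStream();
    try (ObjectOutputStream out = new ObjectOutputStream(bytes)) {
      // Write the object graph using standard Java serialization.
      out.writeObject(obj);
    }

    try (ObjectInputStream in =
        new ObjectInputStream(new ByteArrayInputStream(bytes.toByteArray()))) {
      // Read it back; callers then assert equality field by field.
      return (T) in.readObject();
    }
  }
}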
*/ - package org.apache.iceberg; +import static org.apache.iceberg.types.Types.NestedField.optional; + import com.esotericsoftware.kryo.Kryo; import com.esotericsoftware.kryo.io.Input; import com.esotericsoftware.kryo.io.Output; @@ -52,19 +53,16 @@ import org.junit.Test; import org.junit.rules.TemporaryFolder; -import static org.apache.iceberg.types.Types.NestedField.optional; - public class TestScanTaskSerialization extends SparkTestBase { private static final HadoopTables TABLES = new HadoopTables(new Configuration()); - private static final Schema SCHEMA = new Schema( - optional(1, "c1", Types.IntegerType.get()), - optional(2, "c2", Types.StringType.get()), - optional(3, "c3", Types.StringType.get()) - ); + private static final Schema SCHEMA = + new Schema( + optional(1, "c1", Types.IntegerType.get()), + optional(2, "c2", Types.StringType.get()), + optional(3, "c3", Types.StringType.get())); - @Rule - public TemporaryFolder temp = new TemporaryFolder(); + @Rule public TemporaryFolder temp = new TemporaryFolder(); private String tableLocation = null; @@ -88,7 +86,9 @@ public void testBaseCombinedScanTaskKryoSerialization() throws Exception { try (Input in = new Input(new FileInputStream(data))) { Object obj = kryo.readClassAndObject(in); - Assertions.assertThat(obj).as("Should be a BaseCombinedScanTask").isInstanceOf(BaseCombinedScanTask.class); + Assertions.assertThat(obj) + .as("Should be a BaseCombinedScanTask") + .isInstanceOf(BaseCombinedScanTask.class); TaskCheckHelper.assertEquals(scanTask, (BaseCombinedScanTask) obj); } } @@ -102,9 +102,12 @@ public void testBaseCombinedScanTaskJavaSerialization() throws Exception { out.writeObject(scanTask); } - try (ObjectInputStream in = new ObjectInputStream(new ByteArrayInputStream(bytes.toByteArray()))) { + try (ObjectInputStream in = + new ObjectInputStream(new ByteArrayInputStream(bytes.toByteArray()))) { Object obj = in.readObject(); - Assertions.assertThat(obj).as("Should be a BaseCombinedScanTask").isInstanceOf(BaseCombinedScanTask.class); + Assertions.assertThat(obj) + .as("Should be a BaseCombinedScanTask") + .isInstanceOf(BaseCombinedScanTask.class); TaskCheckHelper.assertEquals(scanTask, (BaseCombinedScanTask) obj); } } @@ -126,7 +129,9 @@ public void testBaseScanTaskGroupKryoSerialization() throws Exception { try (Input in = new Input(Files.newInputStream(data.toPath()))) { Object obj = kryo.readClassAndObject(in); - Assertions.assertThat(obj).as("should be a BaseScanTaskGroup").isInstanceOf(BaseScanTaskGroup.class); + Assertions.assertThat(obj) + .as("should be a BaseScanTaskGroup") + .isInstanceOf(BaseScanTaskGroup.class); TaskCheckHelper.assertEquals(taskGroup, (BaseScanTaskGroup) obj); } } @@ -143,9 +148,12 @@ public void testBaseScanTaskGroupJavaSerialization() throws Exception { out.writeObject(taskGroup); } - try (ObjectInputStream in = new ObjectInputStream(new ByteArrayInputStream(bytes.toByteArray()))) { + try (ObjectInputStream in = + new ObjectInputStream(new ByteArrayInputStream(bytes.toByteArray()))) { Object obj = in.readObject(); - Assertions.assertThat(obj).as("should be a BaseScanTaskGroup").isInstanceOf(BaseScanTaskGroup.class); + Assertions.assertThat(obj) + .as("should be a BaseScanTaskGroup") + .isInstanceOf(BaseScanTaskGroup.class); TaskCheckHelper.assertEquals(taskGroup, (BaseScanTaskGroup) obj); } } @@ -167,16 +175,15 @@ private Table initTable() { Map options = Maps.newHashMap(); Table table = TABLES.create(SCHEMA, spec, options, tableLocation); - List records1 = Lists.newArrayList( - new 
ThreeColumnRecord(1, null, "AAAA"), - new ThreeColumnRecord(1, "BBBBBBBBBB", "BBBB") - ); + List records1 = + Lists.newArrayList( + new ThreeColumnRecord(1, null, "AAAA"), new ThreeColumnRecord(1, "BBBBBBBBBB", "BBBB")); writeRecords(records1); - List records2 = Lists.newArrayList( - new ThreeColumnRecord(2, "CCCCCCCCCC", "CCCC"), - new ThreeColumnRecord(2, "DDDDDDDDDD", "DDDD") - ); + List records2 = + Lists.newArrayList( + new ThreeColumnRecord(2, "CCCCCCCCCC", "CCCC"), + new ThreeColumnRecord(2, "DDDDDDDDDD", "DDDD")); writeRecords(records2); table.refresh(); @@ -190,10 +197,6 @@ private void writeRecords(List records) { } private void writeDF(Dataset df) { - df.select("c1", "c2", "c3") - .write() - .format("iceberg") - .mode("append") - .save(tableLocation); + df.select("c1", "c2", "c3").write().format("iceberg").mode("append").save(tableLocation); } } diff --git a/spark/v3.2/spark/src/test/java/org/apache/iceberg/TestTableSerialization.java b/spark/v3.2/spark/src/test/java/org/apache/iceberg/TestTableSerialization.java index 8aa89b9f3199..30a167d575b1 100644 --- a/spark/v3.2/spark/src/test/java/org/apache/iceberg/TestTableSerialization.java +++ b/spark/v3.2/spark/src/test/java/org/apache/iceberg/TestTableSerialization.java @@ -16,9 +16,11 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg; +import static org.apache.iceberg.types.Types.NestedField.optional; +import static org.apache.iceberg.types.Types.NestedField.required; + import java.io.File; import java.io.IOException; import java.util.Map; @@ -32,30 +34,23 @@ import org.junit.Test; import org.junit.rules.TemporaryFolder; -import static org.apache.iceberg.types.Types.NestedField.optional; -import static org.apache.iceberg.types.Types.NestedField.required; - public class TestTableSerialization { private static final HadoopTables TABLES = new HadoopTables(); - private static final Schema SCHEMA = new Schema( - required(1, "id", Types.LongType.get()), - optional(2, "data", Types.StringType.get()), - required(3, "date", Types.StringType.get()), - optional(4, "double", Types.DoubleType.get())); + private static final Schema SCHEMA = + new Schema( + required(1, "id", Types.LongType.get()), + optional(2, "data", Types.StringType.get()), + required(3, "date", Types.StringType.get()), + optional(4, "double", Types.DoubleType.get())); - private static final PartitionSpec SPEC = PartitionSpec - .builderFor(SCHEMA) - .identity("date") - .build(); + private static final PartitionSpec SPEC = + PartitionSpec.builderFor(SCHEMA).identity("date").build(); - private static final SortOrder SORT_ORDER = SortOrder.builderFor(SCHEMA) - .asc("id") - .build(); + private static final SortOrder SORT_ORDER = SortOrder.builderFor(SCHEMA).asc("id").build(); - @Rule - public TemporaryFolder temp = new TemporaryFolder(); + @Rule public TemporaryFolder temp = new TemporaryFolder(); private Table table; @Before @@ -71,19 +66,20 @@ public void initTable() throws IOException { @Test public void testSerializableTableKryoSerialization() throws IOException { Table serializableTable = SerializableTableWithSize.copyOf(table); - TestHelpers.assertSerializedAndLoadedMetadata(table, KryoHelpers.roundTripSerialize(serializableTable)); + TestHelpers.assertSerializedAndLoadedMetadata( + table, KryoHelpers.roundTripSerialize(serializableTable)); } @Test public void testSerializableMetadataTableKryoSerialization() throws IOException { for (MetadataTableType type : MetadataTableType.values()) { TableOperations ops = 
((HasTableOperations) table).operations(); - Table metadataTable = MetadataTableUtils.createMetadataTableInstance(ops, table.name(), "meta", type); + Table metadataTable = + MetadataTableUtils.createMetadataTableInstance(ops, table.name(), "meta", type); Table serializableMetadataTable = SerializableTableWithSize.copyOf(metadataTable); TestHelpers.assertSerializedAndLoadedMetadata( - metadataTable, - KryoHelpers.roundTripSerialize(serializableMetadataTable)); + metadataTable, KryoHelpers.roundTripSerialize(serializableMetadataTable)); } } @@ -91,13 +87,12 @@ public void testSerializableMetadataTableKryoSerialization() throws IOException public void testSerializableTransactionTableKryoSerialization() throws IOException { Transaction txn = table.newTransaction(); - txn.updateProperties() - .set("k1", "v1") - .commit(); + txn.updateProperties().set("k1", "v1").commit(); Table txnTable = txn.table(); Table serializableTxnTable = SerializableTableWithSize.copyOf(txnTable); - TestHelpers.assertSerializedMetadata(txnTable, KryoHelpers.roundTripSerialize(serializableTxnTable)); + TestHelpers.assertSerializedMetadata( + txnTable, KryoHelpers.roundTripSerialize(serializableTxnTable)); } } diff --git a/spark/v3.2/spark/src/test/java/org/apache/iceberg/spark/SparkCatalogConfig.java b/spark/v3.2/spark/src/test/java/org/apache/iceberg/spark/SparkCatalogConfig.java index 5d5dfebf9532..1006ed380ff9 100644 --- a/spark/v3.2/spark/src/test/java/org/apache/iceberg/spark/SparkCatalogConfig.java +++ b/spark/v3.2/spark/src/test/java/org/apache/iceberg/spark/SparkCatalogConfig.java @@ -16,26 +16,29 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark; import java.util.Map; import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap; public enum SparkCatalogConfig { - HIVE("testhive", SparkCatalog.class.getName(), ImmutableMap.of( - "type", "hive", - "default-namespace", "default" - )), - HADOOP("testhadoop", SparkCatalog.class.getName(), ImmutableMap.of( - "type", "hadoop" - )), - SPARK("spark_catalog", SparkSessionCatalog.class.getName(), ImmutableMap.of( - "type", "hive", - "default-namespace", "default", - "parquet-enabled", "true", - "cache-enabled", "false" // Spark will delete tables using v1, leaving the cache out of sync - )); + HIVE( + "testhive", + SparkCatalog.class.getName(), + ImmutableMap.of( + "type", "hive", + "default-namespace", "default")), + HADOOP("testhadoop", SparkCatalog.class.getName(), ImmutableMap.of("type", "hadoop")), + SPARK( + "spark_catalog", + SparkSessionCatalog.class.getName(), + ImmutableMap.of( + "type", "hive", + "default-namespace", "default", + "parquet-enabled", "true", + "cache-enabled", + "false" // Spark will delete tables using v1, leaving the cache out of sync + )); private final String catalogName; private final String implementation; diff --git a/spark/v3.2/spark/src/test/java/org/apache/iceberg/spark/SparkCatalogTestBase.java b/spark/v3.2/spark/src/test/java/org/apache/iceberg/spark/SparkCatalogTestBase.java index 774e81328b2b..89323c26100c 100644 --- a/spark/v3.2/spark/src/test/java/org/apache/iceberg/spark/SparkCatalogTestBase.java +++ b/spark/v3.2/spark/src/test/java/org/apache/iceberg/spark/SparkCatalogTestBase.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.spark; import java.util.Map; @@ -31,29 +30,33 @@ public abstract class SparkCatalogTestBase extends SparkTestBaseWithCatalog { // these parameters are broken out to avoid changes that need to modify lots of test suites @Parameterized.Parameters(name = "catalogName = {0}, implementation = {1}, config = {2}") public static Object[][] parameters() { - return new Object[][] {{ - SparkCatalogConfig.HIVE.catalogName(), - SparkCatalogConfig.HIVE.implementation(), - SparkCatalogConfig.HIVE.properties() - }, { - SparkCatalogConfig.HADOOP.catalogName(), - SparkCatalogConfig.HADOOP.implementation(), - SparkCatalogConfig.HADOOP.properties() - }, { - SparkCatalogConfig.SPARK.catalogName(), - SparkCatalogConfig.SPARK.implementation(), - SparkCatalogConfig.SPARK.properties() - }}; + return new Object[][] { + { + SparkCatalogConfig.HIVE.catalogName(), + SparkCatalogConfig.HIVE.implementation(), + SparkCatalogConfig.HIVE.properties() + }, + { + SparkCatalogConfig.HADOOP.catalogName(), + SparkCatalogConfig.HADOOP.implementation(), + SparkCatalogConfig.HADOOP.properties() + }, + { + SparkCatalogConfig.SPARK.catalogName(), + SparkCatalogConfig.SPARK.implementation(), + SparkCatalogConfig.SPARK.properties() + } + }; } - @Rule - public TemporaryFolder temp = new TemporaryFolder(); + @Rule public TemporaryFolder temp = new TemporaryFolder(); public SparkCatalogTestBase(SparkCatalogConfig config) { super(config); } - public SparkCatalogTestBase(String catalogName, String implementation, Map config) { + public SparkCatalogTestBase( + String catalogName, String implementation, Map config) { super(catalogName, implementation, config); } } diff --git a/spark/v3.2/spark/src/test/java/org/apache/iceberg/spark/SparkTestBase.java b/spark/v3.2/spark/src/test/java/org/apache/iceberg/spark/SparkTestBase.java index 8022e8696b63..a669540fe475 100644 --- a/spark/v3.2/spark/src/test/java/org/apache/iceberg/spark/SparkTestBase.java +++ b/spark/v3.2/spark/src/test/java/org/apache/iceberg/spark/SparkTestBase.java @@ -16,9 +16,10 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark; +import static org.apache.hadoop.hive.conf.HiveConf.ConfVars.METASTOREURIS; + import java.io.IOException; import java.io.UncheckedIOException; import java.net.URI; @@ -49,8 +50,6 @@ import org.junit.Assert; import org.junit.BeforeClass; -import static org.apache.hadoop.hive.conf.HiveConf.ConfVars.METASTOREURIS; - public abstract class SparkTestBase { protected static final Object ANY = new Object(); @@ -66,15 +65,18 @@ public static void startMetastoreAndSpark() { metastore.start(); SparkTestBase.hiveConf = metastore.hiveConf(); - SparkTestBase.spark = SparkSession.builder() - .master("local[2]") - .config(SQLConf.PARTITION_OVERWRITE_MODE().key(), "dynamic") - .config("spark.hadoop." + METASTOREURIS.varname, hiveConf.get(METASTOREURIS.varname)) - .enableHiveSupport() - .getOrCreate(); + SparkTestBase.spark = + SparkSession.builder() + .master("local[2]") + .config(SQLConf.PARTITION_OVERWRITE_MODE().key(), "dynamic") + .config("spark.hadoop." 
+ METASTOREURIS.varname, hiveConf.get(METASTOREURIS.varname)) + .enableHiveSupport() + .getOrCreate(); - SparkTestBase.catalog = (HiveCatalog) - CatalogUtil.loadCatalog(HiveCatalog.class.getName(), "hive", ImmutableMap.of(), hiveConf); + SparkTestBase.catalog = + (HiveCatalog) + CatalogUtil.loadCatalog( + HiveCatalog.class.getName(), "hive", ImmutableMap.of(), hiveConf); try { catalog.createNamespace(Namespace.of("default")); @@ -115,22 +117,23 @@ protected List rowsToJava(List rows) { private Object[] toJava(Row row) { return IntStream.range(0, row.size()) - .mapToObj(pos -> { - if (row.isNullAt(pos)) { - return null; - } - - Object value = row.get(pos); - if (value instanceof Row) { - return toJava((Row) value); - } else if (value instanceof scala.collection.Seq) { - return row.getList(pos); - } else if (value instanceof scala.collection.Map) { - return row.getJavaMap(pos); - } else { - return value; - } - }) + .mapToObj( + pos -> { + if (row.isNullAt(pos)) { + return null; + } + + Object value = row.get(pos); + if (value instanceof Row) { + return toJava((Row) value); + } else if (value instanceof scala.collection.Seq) { + return row.getList(pos); + } else if (value instanceof scala.collection.Map) { + return row.getJavaMap(pos); + } else { + return value; + } + }) .toArray(Object[]::new); } @@ -146,8 +149,10 @@ protected Object[] row(Object... values) { return values; } - protected void assertEquals(String context, List expectedRows, List actualRows) { - Assert.assertEquals(context + ": number of results should match", expectedRows.size(), actualRows.size()); + protected void assertEquals( + String context, List expectedRows, List actualRows) { + Assert.assertEquals( + context + ": number of results should match", expectedRows.size(), actualRows.size()); for (int row = 0; row < expectedRows.size(); row += 1) { Object[] expected = expectedRows.get(row); Object[] actual = actualRows.get(row); @@ -215,30 +220,34 @@ protected void withSQLConf(Map conf, Action action) { SQLConf sqlConf = SQLConf.get(); Map currentConfValues = Maps.newHashMap(); - conf.keySet().forEach(confKey -> { - if (sqlConf.contains(confKey)) { - String currentConfValue = sqlConf.getConfString(confKey); - currentConfValues.put(confKey, currentConfValue); - } - }); - - conf.forEach((confKey, confValue) -> { - if (SQLConf.isStaticConfigKey(confKey)) { - throw new RuntimeException("Cannot modify the value of a static config: " + confKey); - } - sqlConf.setConfString(confKey, confValue); - }); + conf.keySet() + .forEach( + confKey -> { + if (sqlConf.contains(confKey)) { + String currentConfValue = sqlConf.getConfString(confKey); + currentConfValues.put(confKey, currentConfValue); + } + }); + + conf.forEach( + (confKey, confValue) -> { + if (SQLConf.isStaticConfigKey(confKey)) { + throw new RuntimeException("Cannot modify the value of a static config: " + confKey); + } + sqlConf.setConfString(confKey, confValue); + }); try { action.invoke(); } finally { - conf.forEach((confKey, confValue) -> { - if (currentConfValues.containsKey(confKey)) { - sqlConf.setConfString(confKey, currentConfValues.get(confKey)); - } else { - sqlConf.unsetConf(confKey); - } - }); + conf.forEach( + (confKey, confValue) -> { + if (currentConfValues.containsKey(confKey)) { + sqlConf.setConfString(confKey, currentConfValues.get(confKey)); + } else { + sqlConf.unsetConf(confKey); + } + }); } } diff --git a/spark/v3.2/spark/src/test/java/org/apache/iceberg/spark/SparkTestBaseWithCatalog.java 
b/spark/v3.2/spark/src/test/java/org/apache/iceberg/spark/SparkTestBaseWithCatalog.java index fc358f76ae9c..e32aeea64d4d 100644 --- a/spark/v3.2/spark/src/test/java/org/apache/iceberg/spark/SparkTestBaseWithCatalog.java +++ b/spark/v3.2/spark/src/test/java/org/apache/iceberg/spark/SparkTestBaseWithCatalog.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark; import java.io.File; @@ -53,8 +52,7 @@ public static void dropWarehouse() throws IOException { } } - @Rule - public TemporaryFolder temp = new TemporaryFolder(); + @Rule public TemporaryFolder temp = new TemporaryFolder(); protected final String catalogName; protected final Catalog validationCatalog; @@ -70,21 +68,25 @@ public SparkTestBaseWithCatalog(SparkCatalogConfig config) { this(config.catalogName(), config.implementation(), config.properties()); } - public SparkTestBaseWithCatalog(String catalogName, String implementation, Map config) { + public SparkTestBaseWithCatalog( + String catalogName, String implementation, Map config) { this.catalogName = catalogName; - this.validationCatalog = catalogName.equals("testhadoop") ? - new HadoopCatalog(spark.sessionState().newHadoopConf(), "file:" + warehouse) : - catalog; + this.validationCatalog = + catalogName.equals("testhadoop") + ? new HadoopCatalog(spark.sessionState().newHadoopConf(), "file:" + warehouse) + : catalog; this.validationNamespaceCatalog = (SupportsNamespaces) validationCatalog; spark.conf().set("spark.sql.catalog." + catalogName, implementation); - config.forEach((key, value) -> spark.conf().set("spark.sql.catalog." + catalogName + "." + key, value)); + config.forEach( + (key, value) -> spark.conf().set("spark.sql.catalog." + catalogName + "." + key, value)); if (config.get("type").equalsIgnoreCase("hadoop")) { spark.conf().set("spark.sql.catalog." + catalogName + ".warehouse", "file:" + warehouse); } - this.tableName = (catalogName.equals("spark_catalog") ? "" : catalogName + ".") + "default.table"; + this.tableName = + (catalogName.equals("spark_catalog") ? "" : catalogName + ".") + "default.table"; sql("CREATE NAMESPACE IF NOT EXISTS default"); } diff --git a/spark/v3.2/spark/src/test/java/org/apache/iceberg/spark/TestFileRewriteCoordinator.java b/spark/v3.2/spark/src/test/java/org/apache/iceberg/spark/TestFileRewriteCoordinator.java index 8aa5cd6faec1..2e6886d32df5 100644 --- a/spark/v3.2/spark/src/test/java/org/apache/iceberg/spark/TestFileRewriteCoordinator.java +++ b/spark/v3.2/spark/src/test/java/org/apache/iceberg/spark/TestFileRewriteCoordinator.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
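The test bases above register Iceberg catalogs purely through Spark session configuration; a minimal sketch of that wiring for a Hadoop-type catalog (the catalog name "testhadoop" follows the tests, the warehouse path is a placeholder):

import org.apache.iceberg.spark.SparkCatalog;
import org.apache.spark.sql.SparkSession;

public class CatalogWiringExample {
  public static void registerHadoopCatalog(SparkSession spark, String warehouseDir) {
    // Register an Iceberg SparkCatalog under the name "testhadoop".
    spark.conf().set("spark.sql.catalog.testhadoop", SparkCatalog.class.getName());
    spark.conf().set("spark.sql.catalog.testhadoop.type", "hadoop");
    // A Hadoop catalog resolves tables relative to this warehouse location.
    spark.conf().set("spark.sql.catalog.testhadoop.warehouse", "file:" + warehouseDir);
  }
}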
*/ - package org.apache.iceberg.spark; import java.io.IOException; @@ -44,7 +43,8 @@ public class TestFileRewriteCoordinator extends SparkCatalogTestBase { - public TestFileRewriteCoordinator(String catalogName, String implementation, Map config) { + public TestFileRewriteCoordinator( + String catalogName, String implementation, Map config) { super(catalogName, implementation, config); } @@ -66,7 +66,8 @@ public void testBinPackRewrite() throws NoSuchTableException, IOException { Table table = validationCatalog.loadTable(tableIdent); Assert.assertEquals("Should produce 4 snapshots", 4, Iterables.size(table.snapshots())); - Dataset fileDF = spark.read().format("iceberg").load(tableName(tableIdent.name() + ".files")); + Dataset fileDF = + spark.read().format("iceberg").load(tableName(tableIdent.name() + ".files")); List fileSizes = fileDF.select("file_size_in_bytes").as(Encoders.LONG()).collectAsList(); long avgFileSize = fileSizes.stream().mapToLong(i -> i).sum() / fileSizes.size(); @@ -77,22 +78,27 @@ public void testBinPackRewrite() throws NoSuchTableException, IOException { taskSetManager.stageTasks(table, fileSetID, Lists.newArrayList(fileScanTasks)); // read and pack original 4 files into 2 splits - Dataset scanDF = spark.read().format("iceberg") - .option(SparkReadOptions.FILE_SCAN_TASK_SET_ID, fileSetID) - .option(SparkReadOptions.SPLIT_SIZE, Long.toString(avgFileSize * 2)) - .option(SparkReadOptions.FILE_OPEN_COST, "0") - .load(tableName); + Dataset scanDF = + spark + .read() + .format("iceberg") + .option(SparkReadOptions.FILE_SCAN_TASK_SET_ID, fileSetID) + .option(SparkReadOptions.SPLIT_SIZE, Long.toString(avgFileSize * 2)) + .option(SparkReadOptions.FILE_OPEN_COST, "0") + .load(tableName); // write the packed data into new files where each split becomes a new file - scanDF.writeTo(tableName) + scanDF + .writeTo(tableName) .option(SparkWriteOptions.REWRITTEN_FILE_SCAN_TASK_SET_ID, fileSetID) .append(); // commit the rewrite FileRewriteCoordinator rewriteCoordinator = FileRewriteCoordinator.get(); - Set rewrittenFiles = taskSetManager.fetchTasks(table, fileSetID).stream() - .map(FileScanTask::file) - .collect(Collectors.toSet()); + Set rewrittenFiles = + taskSetManager.fetchTasks(table, fileSetID).stream() + .map(FileScanTask::file) + .collect(Collectors.toSet()); Set addedFiles = rewriteCoordinator.fetchNewDataFiles(table, fileSetID); table.newRewrite().rewriteFiles(rewrittenFiles, addedFiles).commit(); } @@ -127,34 +133,42 @@ public void testSortRewrite() throws NoSuchTableException, IOException { taskSetManager.stageTasks(table, fileSetID, Lists.newArrayList(fileScanTasks)); // read original 4 files as 4 splits - Dataset scanDF = spark.read().format("iceberg") - .option(SparkReadOptions.FILE_SCAN_TASK_SET_ID, fileSetID) - .option(SparkReadOptions.SPLIT_SIZE, "134217728") - .option(SparkReadOptions.FILE_OPEN_COST, "134217728") - .load(tableName); + Dataset scanDF = + spark + .read() + .format("iceberg") + .option(SparkReadOptions.FILE_SCAN_TASK_SET_ID, fileSetID) + .option(SparkReadOptions.SPLIT_SIZE, "134217728") + .option(SparkReadOptions.FILE_OPEN_COST, "134217728") + .load(tableName); // make sure we disable AQE and set the number of shuffle partitions as the target num files - ImmutableMap sqlConf = ImmutableMap.of( - "spark.sql.shuffle.partitions", "2", - "spark.sql.adaptive.enabled", "false" - ); - - withSQLConf(sqlConf, () -> { - try { - // write new files with sorted records - scanDF.sort("id").writeTo(tableName) - 
.option(SparkWriteOptions.REWRITTEN_FILE_SCAN_TASK_SET_ID, fileSetID) - .append(); - } catch (NoSuchTableException e) { - throw new RuntimeException("Could not replace files", e); - } - }); + ImmutableMap sqlConf = + ImmutableMap.of( + "spark.sql.shuffle.partitions", "2", + "spark.sql.adaptive.enabled", "false"); + + withSQLConf( + sqlConf, + () -> { + try { + // write new files with sorted records + scanDF + .sort("id") + .writeTo(tableName) + .option(SparkWriteOptions.REWRITTEN_FILE_SCAN_TASK_SET_ID, fileSetID) + .append(); + } catch (NoSuchTableException e) { + throw new RuntimeException("Could not replace files", e); + } + }); // commit the rewrite FileRewriteCoordinator rewriteCoordinator = FileRewriteCoordinator.get(); - Set rewrittenFiles = taskSetManager.fetchTasks(table, fileSetID).stream() - .map(FileScanTask::file) - .collect(Collectors.toSet()); + Set rewrittenFiles = + taskSetManager.fetchTasks(table, fileSetID).stream() + .map(FileScanTask::file) + .collect(Collectors.toSet()); Set addedFiles = rewriteCoordinator.fetchNewDataFiles(table, fileSetID); table.newRewrite().rewriteFiles(rewrittenFiles, addedFiles).commit(); } @@ -199,7 +213,8 @@ public void testCommitMultipleRewrites() throws NoSuchTableException, IOExceptio String secondFileSetID = UUID.randomUUID().toString(); - try (CloseableIterable tasks = table.newScan().appendsAfter(firstFileSetSnapshotId).planFiles()) { + try (CloseableIterable tasks = + table.newScan().appendsAfter(firstFileSetSnapshotId).planFiles()) { // stage 2 more files for compaction taskSetManager.stageTasks(table, secondFileSetID, Lists.newArrayList(tasks)); } @@ -208,26 +223,32 @@ public void testCommitMultipleRewrites() throws NoSuchTableException, IOExceptio for (String fileSetID : fileSetIDs) { // read and pack 2 files into 1 split - Dataset scanDF = spark.read().format("iceberg") - .option(SparkReadOptions.FILE_SCAN_TASK_SET_ID, fileSetID) - .option(SparkReadOptions.SPLIT_SIZE, Long.MAX_VALUE) - .load(tableName); + Dataset scanDF = + spark + .read() + .format("iceberg") + .option(SparkReadOptions.FILE_SCAN_TASK_SET_ID, fileSetID) + .option(SparkReadOptions.SPLIT_SIZE, Long.MAX_VALUE) + .load(tableName); // write the combined data as one file - scanDF.writeTo(tableName) + scanDF + .writeTo(tableName) .option(SparkWriteOptions.REWRITTEN_FILE_SCAN_TASK_SET_ID, fileSetID) .append(); } // commit both rewrites at the same time FileRewriteCoordinator rewriteCoordinator = FileRewriteCoordinator.get(); - Set rewrittenFiles = fileSetIDs.stream().flatMap(fileSetID -> - taskSetManager.fetchTasks(table, fileSetID).stream()) - .map(FileScanTask::file) - .collect(Collectors.toSet()); - Set addedFiles = fileSetIDs.stream() - .flatMap(fileSetID -> rewriteCoordinator.fetchNewDataFiles(table, fileSetID).stream()) - .collect(Collectors.toSet()); + Set rewrittenFiles = + fileSetIDs.stream() + .flatMap(fileSetID -> taskSetManager.fetchTasks(table, fileSetID).stream()) + .map(FileScanTask::file) + .collect(Collectors.toSet()); + Set addedFiles = + fileSetIDs.stream() + .flatMap(fileSetID -> rewriteCoordinator.fetchNewDataFiles(table, fileSetID).stream()) + .collect(Collectors.toSet()); table.newRewrite().rewriteFiles(rewrittenFiles, addedFiles).commit(); table.refresh(); diff --git a/spark/v3.2/spark/src/test/java/org/apache/iceberg/spark/TestSpark3Util.java b/spark/v3.2/spark/src/test/java/org/apache/iceberg/spark/TestSpark3Util.java index be6130f63741..96dc2c29eb7f 100644 --- a/spark/v3.2/spark/src/test/java/org/apache/iceberg/spark/TestSpark3Util.java +++ 
b/spark/v3.2/spark/src/test/java/org/apache/iceberg/spark/TestSpark3Util.java @@ -16,9 +16,13 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark; +import static org.apache.iceberg.NullOrder.NULLS_FIRST; +import static org.apache.iceberg.NullOrder.NULLS_LAST; +import static org.apache.iceberg.types.Types.NestedField.optional; +import static org.apache.iceberg.types.Types.NestedField.required; + import org.apache.iceberg.CachingCatalog; import org.apache.iceberg.Schema; import org.apache.iceberg.SortOrder; @@ -29,54 +33,69 @@ import org.junit.Assert; import org.junit.Test; -import static org.apache.iceberg.NullOrder.NULLS_FIRST; -import static org.apache.iceberg.NullOrder.NULLS_LAST; -import static org.apache.iceberg.types.Types.NestedField.optional; -import static org.apache.iceberg.types.Types.NestedField.required; - public class TestSpark3Util extends SparkTestBase { @Test public void testDescribeSortOrder() { - Schema schema = new Schema( + Schema schema = + new Schema( required(1, "data", Types.StringType.get()), - required(2, "time", Types.TimestampType.withoutZone()) - ); + required(2, "time", Types.TimestampType.withoutZone())); - Assert.assertEquals("Sort order isn't correct.", "data DESC NULLS FIRST", + Assert.assertEquals( + "Sort order isn't correct.", + "data DESC NULLS FIRST", Spark3Util.describe(buildSortOrder("Identity", schema, 1))); - Assert.assertEquals("Sort order isn't correct.", "bucket(1, data) DESC NULLS FIRST", + Assert.assertEquals( + "Sort order isn't correct.", + "bucket(1, data) DESC NULLS FIRST", Spark3Util.describe(buildSortOrder("bucket[1]", schema, 1))); - Assert.assertEquals("Sort order isn't correct.", "truncate(data, 3) DESC NULLS FIRST", + Assert.assertEquals( + "Sort order isn't correct.", + "truncate(data, 3) DESC NULLS FIRST", Spark3Util.describe(buildSortOrder("truncate[3]", schema, 1))); - Assert.assertEquals("Sort order isn't correct.", "years(time) DESC NULLS FIRST", + Assert.assertEquals( + "Sort order isn't correct.", + "years(time) DESC NULLS FIRST", Spark3Util.describe(buildSortOrder("year", schema, 2))); - Assert.assertEquals("Sort order isn't correct.", "months(time) DESC NULLS FIRST", + Assert.assertEquals( + "Sort order isn't correct.", + "months(time) DESC NULLS FIRST", Spark3Util.describe(buildSortOrder("month", schema, 2))); - Assert.assertEquals("Sort order isn't correct.", "days(time) DESC NULLS FIRST", + Assert.assertEquals( + "Sort order isn't correct.", + "days(time) DESC NULLS FIRST", Spark3Util.describe(buildSortOrder("day", schema, 2))); - Assert.assertEquals("Sort order isn't correct.", "hours(time) DESC NULLS FIRST", + Assert.assertEquals( + "Sort order isn't correct.", + "hours(time) DESC NULLS FIRST", Spark3Util.describe(buildSortOrder("hour", schema, 2))); - Assert.assertEquals("Sort order isn't correct.", "unknown(data) DESC NULLS FIRST", + Assert.assertEquals( + "Sort order isn't correct.", + "unknown(data) DESC NULLS FIRST", Spark3Util.describe(buildSortOrder("unknown", schema, 1))); // multiple sort orders - SortOrder multiOrder = SortOrder.builderFor(schema) - .asc("time", NULLS_FIRST) - .asc("data", NULLS_LAST) - .build(); - Assert.assertEquals("Sort order isn't correct.", "time ASC NULLS FIRST, data ASC NULLS LAST", - Spark3Util.describe(multiOrder)); + SortOrder multiOrder = + SortOrder.builderFor(schema).asc("time", NULLS_FIRST).asc("data", NULLS_LAST).build(); + Assert.assertEquals( + "Sort order isn't correct.", + "time ASC NULLS FIRST, data ASC 
NULLS LAST", + Spark3Util.describe(multiOrder)); } @Test public void testDescribeSchema() { - Schema schema = new Schema( - required(1, "data", Types.ListType.ofRequired(2, Types.StringType.get())), - optional(3, "pairs", Types.MapType.ofOptional(4, 5, Types.StringType.get(), Types.LongType.get())), - required(6, "time", Types.TimestampType.withoutZone()) - ); + Schema schema = + new Schema( + required(1, "data", Types.ListType.ofRequired(2, Types.StringType.get())), + optional( + 3, + "pairs", + Types.MapType.ofOptional(4, 5, Types.StringType.get(), Types.LongType.get())), + required(6, "time", Types.TimestampType.withoutZone())); - Assert.assertEquals("Schema description isn't correct.", + Assert.assertEquals( + "Schema description isn't correct.", "struct not null,pairs: map,time: timestamp not null>", Spark3Util.describe(schema)); } @@ -99,19 +118,25 @@ public void testLoadIcebergCatalog() throws Exception { spark.conf().set("spark.sql.catalog.test_cat", SparkCatalog.class.getName()); spark.conf().set("spark.sql.catalog.test_cat.type", "hive"); Catalog catalog = Spark3Util.loadIcebergCatalog(spark, "test_cat"); - Assert.assertTrue("Should retrieve underlying catalog class", catalog instanceof CachingCatalog); + Assert.assertTrue( + "Should retrieve underlying catalog class", catalog instanceof CachingCatalog); } private SortOrder buildSortOrder(String transform, Schema schema, int sourceId) { - String jsonString = "{\n" + - " \"order-id\" : 10,\n" + - " \"fields\" : [ {\n" + - " \"transform\" : \"" + transform + "\",\n" + - " \"source-id\" : " + sourceId + ",\n" + - " \"direction\" : \"desc\",\n" + - " \"null-order\" : \"nulls-first\"\n" + - " } ]\n" + - "}"; + String jsonString = + "{\n" + + " \"order-id\" : 10,\n" + + " \"fields\" : [ {\n" + + " \"transform\" : \"" + + transform + + "\",\n" + + " \"source-id\" : " + + sourceId + + ",\n" + + " \"direction\" : \"desc\",\n" + + " \"null-order\" : \"nulls-first\"\n" + + " } ]\n" + + "}"; return SortOrderParser.fromJson(schema, jsonString); } diff --git a/spark/v3.2/spark/src/test/java/org/apache/iceberg/spark/TestSparkCachedTableCatalog.java b/spark/v3.2/spark/src/test/java/org/apache/iceberg/spark/TestSparkCachedTableCatalog.java index 817af0302966..23e8717fb8c3 100644 --- a/spark/v3.2/spark/src/test/java/org/apache/iceberg/spark/TestSparkCachedTableCatalog.java +++ b/spark/v3.2/spark/src/test/java/org/apache/iceberg/spark/TestSparkCachedTableCatalog.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.spark; import org.apache.iceberg.Snapshot; @@ -69,17 +68,24 @@ public void testTimeTravel() { try { TABLE_CACHE.add("key", table); - assertEquals("Should have expected rows in 3rd snapshot", + assertEquals( + "Should have expected rows in 3rd snapshot", ImmutableList.of(row(1, "hr"), row(2, "hr"), row(3, "hr")), sql("SELECT * FROM testcache.key ORDER BY id")); - assertEquals("Should have expected rows in 2nd snapshot", + assertEquals( + "Should have expected rows in 2nd snapshot", ImmutableList.of(row(1, "hr"), row(2, "hr")), - sql("SELECT * FROM testcache.`key#at_timestamp_%s` ORDER BY id", secondSnapshot.timestampMillis())); + sql( + "SELECT * FROM testcache.`key#at_timestamp_%s` ORDER BY id", + secondSnapshot.timestampMillis())); - assertEquals("Should have expected rows in 1st snapshot", + assertEquals( + "Should have expected rows in 1st snapshot", ImmutableList.of(row(1, "hr")), - sql("SELECT * FROM testcache.`key#snapshot_id_%d` ORDER BY id", firstSnapshot.snapshotId())); + sql( + "SELECT * FROM testcache.`key#snapshot_id_%d` ORDER BY id", + firstSnapshot.snapshotId())); } finally { TABLE_CACHE.remove("key"); diff --git a/spark/v3.2/spark/src/test/java/org/apache/iceberg/spark/TestSparkCatalogOperations.java b/spark/v3.2/spark/src/test/java/org/apache/iceberg/spark/TestSparkCatalogOperations.java index 5c9f3c4cb189..0836271a7c22 100644 --- a/spark/v3.2/spark/src/test/java/org/apache/iceberg/spark/TestSparkCatalogOperations.java +++ b/spark/v3.2/spark/src/test/java/org/apache/iceberg/spark/TestSparkCatalogOperations.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark; import java.util.Map; diff --git a/spark/v3.2/spark/src/test/java/org/apache/iceberg/spark/TestSparkDistributionAndOrderingUtil.java b/spark/v3.2/spark/src/test/java/org/apache/iceberg/spark/TestSparkDistributionAndOrderingUtil.java index 0d30abae2ddf..3dafd8f4edb8 100644 --- a/spark/v3.2/spark/src/test/java/org/apache/iceberg/spark/TestSparkDistributionAndOrderingUtil.java +++ b/spark/v3.2/spark/src/test/java/org/apache/iceberg/spark/TestSparkDistributionAndOrderingUtil.java @@ -16,9 +16,19 @@ * specific language governing permissions and limitations * under the License. 
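The time-travel test above selects older snapshots by encoding the selector into the table identifier; a hedged sketch of the two identifier forms it uses (this assumes the testcache catalog and the cached table "key" from the test are registered; the snapshot id and timestamp values are placeholders):

import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SparkSession;

public class CachedTableTimeTravelExample {
  public static void readOlderSnapshots(SparkSession spark, long snapshotId, long timestampMillis) {
    // Read the cached table as of a specific snapshot id.
    Dataset<Row> bySnapshot =
        spark.sql(
            String.format("SELECT * FROM testcache.`key#snapshot_id_%d` ORDER BY id", snapshotId));

    // Read the cached table as of a wall-clock timestamp in milliseconds.
    Dataset<Row> byTimestamp =
        spark.sql(
            String.format(
                "SELECT * FROM testcache.`key#at_timestamp_%s` ORDER BY id", timestampMillis));

    bySnapshot.show();
    byTimestamp.show();
  }
}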
*/ - package org.apache.iceberg.spark; +import static org.apache.iceberg.TableProperties.DELETE_DISTRIBUTION_MODE; +import static org.apache.iceberg.TableProperties.MERGE_DISTRIBUTION_MODE; +import static org.apache.iceberg.TableProperties.UPDATE_DISTRIBUTION_MODE; +import static org.apache.iceberg.TableProperties.WRITE_DISTRIBUTION_MODE; +import static org.apache.iceberg.TableProperties.WRITE_DISTRIBUTION_MODE_HASH; +import static org.apache.iceberg.TableProperties.WRITE_DISTRIBUTION_MODE_NONE; +import static org.apache.iceberg.TableProperties.WRITE_DISTRIBUTION_MODE_RANGE; +import static org.apache.spark.sql.connector.iceberg.write.RowLevelOperation.Command.DELETE; +import static org.apache.spark.sql.connector.iceberg.write.RowLevelOperation.Command.MERGE; +import static org.apache.spark.sql.connector.iceberg.write.RowLevelOperation.Command.UPDATE; + import org.apache.iceberg.DistributionMode; import org.apache.iceberg.MetadataColumns; import org.apache.iceberg.Table; @@ -34,44 +44,47 @@ import org.junit.Assert; import org.junit.Test; -import static org.apache.iceberg.TableProperties.DELETE_DISTRIBUTION_MODE; -import static org.apache.iceberg.TableProperties.MERGE_DISTRIBUTION_MODE; -import static org.apache.iceberg.TableProperties.UPDATE_DISTRIBUTION_MODE; -import static org.apache.iceberg.TableProperties.WRITE_DISTRIBUTION_MODE; -import static org.apache.iceberg.TableProperties.WRITE_DISTRIBUTION_MODE_HASH; -import static org.apache.iceberg.TableProperties.WRITE_DISTRIBUTION_MODE_NONE; -import static org.apache.iceberg.TableProperties.WRITE_DISTRIBUTION_MODE_RANGE; -import static org.apache.spark.sql.connector.iceberg.write.RowLevelOperation.Command.DELETE; -import static org.apache.spark.sql.connector.iceberg.write.RowLevelOperation.Command.MERGE; -import static org.apache.spark.sql.connector.iceberg.write.RowLevelOperation.Command.UPDATE; - public class TestSparkDistributionAndOrderingUtil extends SparkTestBaseWithCatalog { private static final Distribution UNSPECIFIED_DISTRIBUTION = Distributions.unspecified(); - private static final Distribution FILE_CLUSTERED_DISTRIBUTION = Distributions.clustered(new Expression[]{ - Expressions.column(MetadataColumns.FILE_PATH.name()) - }); - private static final Distribution SPEC_ID_PARTITION_CLUSTERED_DISTRIBUTION = Distributions.clustered(new Expression[]{ - Expressions.column(MetadataColumns.SPEC_ID.name()), - Expressions.column(MetadataColumns.PARTITION_COLUMN_NAME) - }); - - private static final SortOrder[] EMPTY_ORDERING = new SortOrder[]{}; - private static final SortOrder[] FILE_POSITION_ORDERING = new SortOrder[]{ - Expressions.sort(Expressions.column(MetadataColumns.FILE_PATH.name()), SortDirection.ASCENDING), - Expressions.sort(Expressions.column(MetadataColumns.ROW_POSITION.name()), SortDirection.ASCENDING) - }; - private static final SortOrder[] SPEC_ID_PARTITION_FILE_ORDERING = new SortOrder[]{ - Expressions.sort(Expressions.column(MetadataColumns.SPEC_ID.name()), SortDirection.ASCENDING), - Expressions.sort(Expressions.column(MetadataColumns.PARTITION_COLUMN_NAME), SortDirection.ASCENDING), - Expressions.sort(Expressions.column(MetadataColumns.FILE_PATH.name()), SortDirection.ASCENDING) - }; - private static final SortOrder[] SPEC_ID_PARTITION_FILE_POSITION_ORDERING = new SortOrder[]{ - Expressions.sort(Expressions.column(MetadataColumns.SPEC_ID.name()), SortDirection.ASCENDING), - Expressions.sort(Expressions.column(MetadataColumns.PARTITION_COLUMN_NAME), SortDirection.ASCENDING), - 
Expressions.sort(Expressions.column(MetadataColumns.FILE_PATH.name()), SortDirection.ASCENDING), - Expressions.sort(Expressions.column(MetadataColumns.ROW_POSITION.name()), SortDirection.ASCENDING) - }; + private static final Distribution FILE_CLUSTERED_DISTRIBUTION = + Distributions.clustered( + new Expression[] {Expressions.column(MetadataColumns.FILE_PATH.name())}); + private static final Distribution SPEC_ID_PARTITION_CLUSTERED_DISTRIBUTION = + Distributions.clustered( + new Expression[] { + Expressions.column(MetadataColumns.SPEC_ID.name()), + Expressions.column(MetadataColumns.PARTITION_COLUMN_NAME) + }); + + private static final SortOrder[] EMPTY_ORDERING = new SortOrder[] {}; + private static final SortOrder[] FILE_POSITION_ORDERING = + new SortOrder[] { + Expressions.sort( + Expressions.column(MetadataColumns.FILE_PATH.name()), SortDirection.ASCENDING), + Expressions.sort( + Expressions.column(MetadataColumns.ROW_POSITION.name()), SortDirection.ASCENDING) + }; + private static final SortOrder[] SPEC_ID_PARTITION_FILE_ORDERING = + new SortOrder[] { + Expressions.sort( + Expressions.column(MetadataColumns.SPEC_ID.name()), SortDirection.ASCENDING), + Expressions.sort( + Expressions.column(MetadataColumns.PARTITION_COLUMN_NAME), SortDirection.ASCENDING), + Expressions.sort( + Expressions.column(MetadataColumns.FILE_PATH.name()), SortDirection.ASCENDING) + }; + private static final SortOrder[] SPEC_ID_PARTITION_FILE_POSITION_ORDERING = + new SortOrder[] { + Expressions.sort( + Expressions.column(MetadataColumns.SPEC_ID.name()), SortDirection.ASCENDING), + Expressions.sort( + Expressions.column(MetadataColumns.PARTITION_COLUMN_NAME), SortDirection.ASCENDING), + Expressions.sort( + Expressions.column(MetadataColumns.FILE_PATH.name()), SortDirection.ASCENDING), + Expressions.sort( + Expressions.column(MetadataColumns.ROW_POSITION.name()), SortDirection.ASCENDING) + }; @After public void dropTable() { @@ -93,9 +106,7 @@ public void testHashWriteUnpartitionedUnsortedTable() { Table table = validationCatalog.loadTable(tableIdent); - table.updateProperties() - .set(WRITE_DISTRIBUTION_MODE, WRITE_DISTRIBUTION_MODE_HASH) - .commit(); + table.updateProperties().set(WRITE_DISTRIBUTION_MODE, WRITE_DISTRIBUTION_MODE_HASH).commit(); checkWriteDistributionAndOrdering(table, UNSPECIFIED_DISTRIBUTION, EMPTY_ORDERING); } @@ -106,9 +117,7 @@ public void testRangeWriteUnpartitionedUnsortedTable() { Table table = validationCatalog.loadTable(tableIdent); - table.updateProperties() - .set(WRITE_DISTRIBUTION_MODE, WRITE_DISTRIBUTION_MODE_RANGE) - .commit(); + table.updateProperties().set(WRITE_DISTRIBUTION_MODE, WRITE_DISTRIBUTION_MODE_RANGE).commit(); checkWriteDistributionAndOrdering(table, UNSPECIFIED_DISTRIBUTION, EMPTY_ORDERING); } @@ -119,15 +128,13 @@ public void testDefaultWriteUnpartitionedSortedTable() { Table table = validationCatalog.loadTable(tableIdent); - table.replaceSortOrder() - .asc("id") - .asc("data") - .commit(); + table.replaceSortOrder().asc("id").asc("data").commit(); - SortOrder[] expectedOrdering = new SortOrder[]{ - Expressions.sort(Expressions.column("id"), SortDirection.ASCENDING), - Expressions.sort(Expressions.column("data"), SortDirection.ASCENDING) - }; + SortOrder[] expectedOrdering = + new SortOrder[] { + Expressions.sort(Expressions.column("id"), SortDirection.ASCENDING), + Expressions.sort(Expressions.column("data"), SortDirection.ASCENDING) + }; Distribution expectedDistribution = Distributions.ordered(expectedOrdering); @@ -140,19 +147,15 @@ public void 
testHashWriteUnpartitionedSortedTable() { Table table = validationCatalog.loadTable(tableIdent); - table.updateProperties() - .set(WRITE_DISTRIBUTION_MODE, WRITE_DISTRIBUTION_MODE_HASH) - .commit(); + table.updateProperties().set(WRITE_DISTRIBUTION_MODE, WRITE_DISTRIBUTION_MODE_HASH).commit(); - table.replaceSortOrder() - .asc("id") - .asc("data") - .commit(); + table.replaceSortOrder().asc("id").asc("data").commit(); - SortOrder[] expectedOrdering = new SortOrder[]{ - Expressions.sort(Expressions.column("id"), SortDirection.ASCENDING), - Expressions.sort(Expressions.column("data"), SortDirection.ASCENDING) - }; + SortOrder[] expectedOrdering = + new SortOrder[] { + Expressions.sort(Expressions.column("id"), SortDirection.ASCENDING), + Expressions.sort(Expressions.column("data"), SortDirection.ASCENDING) + }; checkWriteDistributionAndOrdering(table, UNSPECIFIED_DISTRIBUTION, expectedOrdering); } @@ -163,19 +166,15 @@ public void testRangeWriteUnpartitionedSortedTable() { Table table = validationCatalog.loadTable(tableIdent); - table.updateProperties() - .set(WRITE_DISTRIBUTION_MODE, WRITE_DISTRIBUTION_MODE_RANGE) - .commit(); + table.updateProperties().set(WRITE_DISTRIBUTION_MODE, WRITE_DISTRIBUTION_MODE_RANGE).commit(); - table.replaceSortOrder() - .asc("id") - .asc("data") - .commit(); + table.replaceSortOrder().asc("id").asc("data").commit(); - SortOrder[] expectedOrdering = new SortOrder[]{ - Expressions.sort(Expressions.column("id"), SortDirection.ASCENDING), - Expressions.sort(Expressions.column("data"), SortDirection.ASCENDING) - }; + SortOrder[] expectedOrdering = + new SortOrder[] { + Expressions.sort(Expressions.column("id"), SortDirection.ASCENDING), + Expressions.sort(Expressions.column("data"), SortDirection.ASCENDING) + }; Distribution expectedDistribution = Distributions.ordered(expectedOrdering); @@ -184,62 +183,65 @@ public void testRangeWriteUnpartitionedSortedTable() { @Test public void testDefaultWritePartitionedUnsortedTable() { - sql("CREATE TABLE %s (id BIGINT, data STRING, date DATE, ts TIMESTAMP) " + - "USING iceberg " + - "PARTITIONED BY (date, days(ts))", tableName); + sql( + "CREATE TABLE %s (id BIGINT, data STRING, date DATE, ts TIMESTAMP) " + + "USING iceberg " + + "PARTITIONED BY (date, days(ts))", + tableName); Table table = validationCatalog.loadTable(tableIdent); - SortOrder[] expectedOrdering = new SortOrder[]{ - Expressions.sort(Expressions.column("date"), SortDirection.ASCENDING), - Expressions.sort(Expressions.days("ts"), SortDirection.ASCENDING) - }; + SortOrder[] expectedOrdering = + new SortOrder[] { + Expressions.sort(Expressions.column("date"), SortDirection.ASCENDING), + Expressions.sort(Expressions.days("ts"), SortDirection.ASCENDING) + }; checkWriteDistributionAndOrdering(table, UNSPECIFIED_DISTRIBUTION, expectedOrdering); } @Test public void testHashWritePartitionedUnsortedTable() { - sql("CREATE TABLE %s (id BIGINT, data STRING, date DATE, ts TIMESTAMP) " + - "USING iceberg " + - "PARTITIONED BY (date, days(ts))", tableName); + sql( + "CREATE TABLE %s (id BIGINT, data STRING, date DATE, ts TIMESTAMP) " + + "USING iceberg " + + "PARTITIONED BY (date, days(ts))", + tableName); Table table = validationCatalog.loadTable(tableIdent); - table.updateProperties() - .set(WRITE_DISTRIBUTION_MODE, WRITE_DISTRIBUTION_MODE_HASH) - .commit(); + table.updateProperties().set(WRITE_DISTRIBUTION_MODE, WRITE_DISTRIBUTION_MODE_HASH).commit(); - Expression[] expectedClustering = new Expression[]{ - Expressions.identity("date"), - Expressions.days("ts") - }; + 
Expression[] expectedClustering = + new Expression[] {Expressions.identity("date"), Expressions.days("ts")}; Distribution expectedDistribution = Distributions.clustered(expectedClustering); - SortOrder[] expectedOrdering = new SortOrder[]{ - Expressions.sort(Expressions.column("date"), SortDirection.ASCENDING), - Expressions.sort(Expressions.days("ts"), SortDirection.ASCENDING) - }; + SortOrder[] expectedOrdering = + new SortOrder[] { + Expressions.sort(Expressions.column("date"), SortDirection.ASCENDING), + Expressions.sort(Expressions.days("ts"), SortDirection.ASCENDING) + }; checkWriteDistributionAndOrdering(table, expectedDistribution, expectedOrdering); } @Test public void testRangeWritePartitionedUnsortedTable() { - sql("CREATE TABLE %s (id BIGINT, data STRING, date DATE, ts TIMESTAMP) " + - "USING iceberg " + - "PARTITIONED BY (date, days(ts))", tableName); + sql( + "CREATE TABLE %s (id BIGINT, data STRING, date DATE, ts TIMESTAMP) " + + "USING iceberg " + + "PARTITIONED BY (date, days(ts))", + tableName); Table table = validationCatalog.loadTable(tableIdent); - table.updateProperties() - .set(WRITE_DISTRIBUTION_MODE, WRITE_DISTRIBUTION_MODE_RANGE) - .commit(); + table.updateProperties().set(WRITE_DISTRIBUTION_MODE, WRITE_DISTRIBUTION_MODE_RANGE).commit(); - SortOrder[] expectedOrdering = new SortOrder[]{ - Expressions.sort(Expressions.column("date"), SortDirection.ASCENDING), - Expressions.sort(Expressions.days("ts"), SortDirection.ASCENDING) - }; + SortOrder[] expectedOrdering = + new SortOrder[] { + Expressions.sort(Expressions.column("date"), SortDirection.ASCENDING), + Expressions.sort(Expressions.days("ts"), SortDirection.ASCENDING) + }; Distribution expectedDistribution = Distributions.ordered(expectedOrdering); @@ -248,20 +250,21 @@ public void testRangeWritePartitionedUnsortedTable() { @Test public void testDefaultWritePartitionedSortedTable() { - sql("CREATE TABLE %s (id BIGINT, data STRING, date DATE, ts TIMESTAMP) " + - "USING iceberg " + - "PARTITIONED BY (date)", tableName); + sql( + "CREATE TABLE %s (id BIGINT, data STRING, date DATE, ts TIMESTAMP) " + + "USING iceberg " + + "PARTITIONED BY (date)", + tableName); Table table = validationCatalog.loadTable(tableIdent); - table.replaceSortOrder() - .desc("id") - .commit(); + table.replaceSortOrder().desc("id").commit(); - SortOrder[] expectedOrdering = new SortOrder[]{ - Expressions.sort(Expressions.column("date"), SortDirection.ASCENDING), - Expressions.sort(Expressions.column("id"), SortDirection.DESCENDING) - }; + SortOrder[] expectedOrdering = + new SortOrder[] { + Expressions.sort(Expressions.column("date"), SortDirection.ASCENDING), + Expressions.sort(Expressions.column("id"), SortDirection.DESCENDING) + }; Distribution expectedDistribution = Distributions.ordered(expectedOrdering); @@ -270,51 +273,49 @@ public void testDefaultWritePartitionedSortedTable() { @Test public void testHashWritePartitionedSortedTable() { - sql("CREATE TABLE %s (id BIGINT, data STRING, date DATE, ts TIMESTAMP) " + - "USING iceberg " + - "PARTITIONED BY (date, bucket(8, data))", tableName); + sql( + "CREATE TABLE %s (id BIGINT, data STRING, date DATE, ts TIMESTAMP) " + + "USING iceberg " + + "PARTITIONED BY (date, bucket(8, data))", + tableName); Table table = validationCatalog.loadTable(tableIdent); - table.updateProperties() - .set(WRITE_DISTRIBUTION_MODE, WRITE_DISTRIBUTION_MODE_HASH) - .commit(); + table.updateProperties().set(WRITE_DISTRIBUTION_MODE, WRITE_DISTRIBUTION_MODE_HASH).commit(); - table.replaceSortOrder() - .asc("id") - 
.commit(); + table.replaceSortOrder().asc("id").commit(); - Expression[] expectedClustering = new Expression[]{ - Expressions.identity("date"), - Expressions.bucket(8, "data") - }; + Expression[] expectedClustering = + new Expression[] {Expressions.identity("date"), Expressions.bucket(8, "data")}; Distribution expectedDistribution = Distributions.clustered(expectedClustering); - SortOrder[] expectedOrdering = new SortOrder[]{ - Expressions.sort(Expressions.column("date"), SortDirection.ASCENDING), - Expressions.sort(Expressions.bucket(8, "data"), SortDirection.ASCENDING), - Expressions.sort(Expressions.column("id"), SortDirection.ASCENDING) - }; + SortOrder[] expectedOrdering = + new SortOrder[] { + Expressions.sort(Expressions.column("date"), SortDirection.ASCENDING), + Expressions.sort(Expressions.bucket(8, "data"), SortDirection.ASCENDING), + Expressions.sort(Expressions.column("id"), SortDirection.ASCENDING) + }; checkWriteDistributionAndOrdering(table, expectedDistribution, expectedOrdering); } @Test public void testRangeWritePartitionedSortedTable() { - sql("CREATE TABLE %s (id BIGINT, data STRING, date DATE, ts TIMESTAMP) " + - "USING iceberg " + - "PARTITIONED BY (date)", tableName); + sql( + "CREATE TABLE %s (id BIGINT, data STRING, date DATE, ts TIMESTAMP) " + + "USING iceberg " + + "PARTITIONED BY (date)", + tableName); Table table = validationCatalog.loadTable(tableIdent); - table.replaceSortOrder() - .asc("id") - .commit(); + table.replaceSortOrder().asc("id").commit(); - SortOrder[] expectedOrdering = new SortOrder[]{ - Expressions.sort(Expressions.column("date"), SortDirection.ASCENDING), - Expressions.sort(Expressions.column("id"), SortDirection.ASCENDING) - }; + SortOrder[] expectedOrdering = + new SortOrder[] { + Expressions.sort(Expressions.column("date"), SortDirection.ASCENDING), + Expressions.sort(Expressions.column("id"), SortDirection.ASCENDING) + }; Distribution expectedDistribution = Distributions.ordered(expectedOrdering); @@ -359,7 +360,8 @@ public void testDefaultCopyOnWriteDeleteUnpartitionedUnsortedTable() { Table table = validationCatalog.loadTable(tableIdent); - checkCopyOnWriteDistributionAndOrdering(table, DELETE, FILE_CLUSTERED_DISTRIBUTION, FILE_POSITION_ORDERING); + checkCopyOnWriteDistributionAndOrdering( + table, DELETE, FILE_CLUSTERED_DISTRIBUTION, FILE_POSITION_ORDERING); } @Test @@ -368,11 +370,10 @@ public void testNoneCopyOnWriteDeleteUnpartitionedUnsortedTable() { Table table = validationCatalog.loadTable(tableIdent); - table.updateProperties() - .set(DELETE_DISTRIBUTION_MODE, WRITE_DISTRIBUTION_MODE_NONE) - .commit(); + table.updateProperties().set(DELETE_DISTRIBUTION_MODE, WRITE_DISTRIBUTION_MODE_NONE).commit(); - checkCopyOnWriteDistributionAndOrdering(table, DELETE, UNSPECIFIED_DISTRIBUTION, EMPTY_ORDERING); + checkCopyOnWriteDistributionAndOrdering( + table, DELETE, UNSPECIFIED_DISTRIBUTION, EMPTY_ORDERING); } @Test @@ -381,11 +382,10 @@ public void testHashCopyOnWriteDeleteUnpartitionedUnsortedTable() { Table table = validationCatalog.loadTable(tableIdent); - table.updateProperties() - .set(DELETE_DISTRIBUTION_MODE, WRITE_DISTRIBUTION_MODE_HASH) - .commit(); + table.updateProperties().set(DELETE_DISTRIBUTION_MODE, WRITE_DISTRIBUTION_MODE_HASH).commit(); - checkCopyOnWriteDistributionAndOrdering(table, DELETE, FILE_CLUSTERED_DISTRIBUTION, FILE_POSITION_ORDERING); + checkCopyOnWriteDistributionAndOrdering( + table, DELETE, FILE_CLUSTERED_DISTRIBUTION, FILE_POSITION_ORDERING); } @Test @@ -394,13 +394,12 @@ public void 
testRangeCopyOnWriteDeleteUnpartitionedUnsortedTable() { Table table = validationCatalog.loadTable(tableIdent); - table.updateProperties() - .set(DELETE_DISTRIBUTION_MODE, WRITE_DISTRIBUTION_MODE_RANGE) - .commit(); + table.updateProperties().set(DELETE_DISTRIBUTION_MODE, WRITE_DISTRIBUTION_MODE_RANGE).commit(); Distribution expectedDistribution = Distributions.ordered(FILE_POSITION_ORDERING); - checkCopyOnWriteDistributionAndOrdering(table, DELETE, expectedDistribution, FILE_POSITION_ORDERING); + checkCopyOnWriteDistributionAndOrdering( + table, DELETE, expectedDistribution, FILE_POSITION_ORDERING); } @Test @@ -409,17 +408,16 @@ public void testDefaultCopyOnWriteDeleteUnpartitionedSortedTable() { Table table = validationCatalog.loadTable(tableIdent); - table.replaceSortOrder() - .asc("id") - .asc("data") - .commit(); + table.replaceSortOrder().asc("id").asc("data").commit(); - SortOrder[] expectedOrdering = new SortOrder[]{ - Expressions.sort(Expressions.column("id"), SortDirection.ASCENDING), - Expressions.sort(Expressions.column("data"), SortDirection.ASCENDING) - }; + SortOrder[] expectedOrdering = + new SortOrder[] { + Expressions.sort(Expressions.column("id"), SortDirection.ASCENDING), + Expressions.sort(Expressions.column("data"), SortDirection.ASCENDING) + }; - checkCopyOnWriteDistributionAndOrdering(table, DELETE, FILE_CLUSTERED_DISTRIBUTION, expectedOrdering); + checkCopyOnWriteDistributionAndOrdering( + table, DELETE, FILE_CLUSTERED_DISTRIBUTION, expectedOrdering); } @Test @@ -428,21 +426,18 @@ public void testNoneCopyOnWriteDeleteUnpartitionedSortedTable() { Table table = validationCatalog.loadTable(tableIdent); - table.updateProperties() - .set(DELETE_DISTRIBUTION_MODE, WRITE_DISTRIBUTION_MODE_NONE) - .commit(); + table.updateProperties().set(DELETE_DISTRIBUTION_MODE, WRITE_DISTRIBUTION_MODE_NONE).commit(); - table.replaceSortOrder() - .asc("id") - .asc("data") - .commit(); + table.replaceSortOrder().asc("id").asc("data").commit(); - SortOrder[] expectedOrdering = new SortOrder[]{ - Expressions.sort(Expressions.column("id"), SortDirection.ASCENDING), - Expressions.sort(Expressions.column("data"), SortDirection.ASCENDING) - }; + SortOrder[] expectedOrdering = + new SortOrder[] { + Expressions.sort(Expressions.column("id"), SortDirection.ASCENDING), + Expressions.sort(Expressions.column("data"), SortDirection.ASCENDING) + }; - checkCopyOnWriteDistributionAndOrdering(table, DELETE, UNSPECIFIED_DISTRIBUTION, expectedOrdering); + checkCopyOnWriteDistributionAndOrdering( + table, DELETE, UNSPECIFIED_DISTRIBUTION, expectedOrdering); } @Test @@ -451,21 +446,18 @@ public void testHashCopyOnWriteDeleteUnpartitionedSortedTable() { Table table = validationCatalog.loadTable(tableIdent); - table.updateProperties() - .set(DELETE_DISTRIBUTION_MODE, WRITE_DISTRIBUTION_MODE_HASH) - .commit(); + table.updateProperties().set(DELETE_DISTRIBUTION_MODE, WRITE_DISTRIBUTION_MODE_HASH).commit(); - table.replaceSortOrder() - .asc("id") - .asc("data") - .commit(); + table.replaceSortOrder().asc("id").asc("data").commit(); - SortOrder[] expectedOrdering = new SortOrder[]{ - Expressions.sort(Expressions.column("id"), SortDirection.ASCENDING), - Expressions.sort(Expressions.column("data"), SortDirection.ASCENDING) - }; + SortOrder[] expectedOrdering = + new SortOrder[] { + Expressions.sort(Expressions.column("id"), SortDirection.ASCENDING), + Expressions.sort(Expressions.column("data"), SortDirection.ASCENDING) + }; - checkCopyOnWriteDistributionAndOrdering(table, DELETE, FILE_CLUSTERED_DISTRIBUTION, 
expectedOrdering); + checkCopyOnWriteDistributionAndOrdering( + table, DELETE, FILE_CLUSTERED_DISTRIBUTION, expectedOrdering); } @Test @@ -474,19 +466,15 @@ public void testRangeCopyOnWriteDeleteUnpartitionedSortedTable() { Table table = validationCatalog.loadTable(tableIdent); - table.updateProperties() - .set(DELETE_DISTRIBUTION_MODE, WRITE_DISTRIBUTION_MODE_RANGE) - .commit(); + table.updateProperties().set(DELETE_DISTRIBUTION_MODE, WRITE_DISTRIBUTION_MODE_RANGE).commit(); - table.replaceSortOrder() - .asc("id") - .asc("data") - .commit(); + table.replaceSortOrder().asc("id").asc("data").commit(); - SortOrder[] expectedOrdering = new SortOrder[]{ - Expressions.sort(Expressions.column("id"), SortDirection.ASCENDING), - Expressions.sort(Expressions.column("data"), SortDirection.ASCENDING) - }; + SortOrder[] expectedOrdering = + new SortOrder[] { + Expressions.sort(Expressions.column("id"), SortDirection.ASCENDING), + Expressions.sort(Expressions.column("data"), SortDirection.ASCENDING) + }; Distribution expectedDistribution = Distributions.ordered(expectedOrdering); @@ -495,82 +483,97 @@ public void testRangeCopyOnWriteDeleteUnpartitionedSortedTable() { @Test public void testDefaultCopyOnWriteDeletePartitionedUnsortedTable() { - sql("CREATE TABLE %s (id BIGINT, data STRING, date DATE, ts TIMESTAMP) " + - "USING iceberg " + - "PARTITIONED BY (date, days(ts))", tableName); + sql( + "CREATE TABLE %s (id BIGINT, data STRING, date DATE, ts TIMESTAMP) " + + "USING iceberg " + + "PARTITIONED BY (date, days(ts))", + tableName); Table table = validationCatalog.loadTable(tableIdent); - SortOrder[] expectedOrdering = new SortOrder[]{ - Expressions.sort(Expressions.column("date"), SortDirection.ASCENDING), - Expressions.sort(Expressions.days("ts"), SortDirection.ASCENDING), - Expressions.sort(Expressions.column(MetadataColumns.FILE_PATH.name()), SortDirection.ASCENDING), - Expressions.sort(Expressions.column(MetadataColumns.ROW_POSITION.name()), SortDirection.ASCENDING) - }; + SortOrder[] expectedOrdering = + new SortOrder[] { + Expressions.sort(Expressions.column("date"), SortDirection.ASCENDING), + Expressions.sort(Expressions.days("ts"), SortDirection.ASCENDING), + Expressions.sort( + Expressions.column(MetadataColumns.FILE_PATH.name()), SortDirection.ASCENDING), + Expressions.sort( + Expressions.column(MetadataColumns.ROW_POSITION.name()), SortDirection.ASCENDING) + }; - checkCopyOnWriteDistributionAndOrdering(table, DELETE, FILE_CLUSTERED_DISTRIBUTION, expectedOrdering); + checkCopyOnWriteDistributionAndOrdering( + table, DELETE, FILE_CLUSTERED_DISTRIBUTION, expectedOrdering); } @Test public void testNoneCopyOnWriteDeletePartitionedUnsortedTable() { - sql("CREATE TABLE %s (id BIGINT, data STRING, date DATE, ts TIMESTAMP) " + - "USING iceberg " + - "PARTITIONED BY (date, days(ts))", tableName); + sql( + "CREATE TABLE %s (id BIGINT, data STRING, date DATE, ts TIMESTAMP) " + + "USING iceberg " + + "PARTITIONED BY (date, days(ts))", + tableName); Table table = validationCatalog.loadTable(tableIdent); - table.updateProperties() - .set(DELETE_DISTRIBUTION_MODE, WRITE_DISTRIBUTION_MODE_NONE) - .commit(); + table.updateProperties().set(DELETE_DISTRIBUTION_MODE, WRITE_DISTRIBUTION_MODE_NONE).commit(); - SortOrder[] expectedOrdering = new SortOrder[]{ - Expressions.sort(Expressions.column("date"), SortDirection.ASCENDING), - Expressions.sort(Expressions.days("ts"), SortDirection.ASCENDING) - }; + SortOrder[] expectedOrdering = + new SortOrder[] { + Expressions.sort(Expressions.column("date"), 
SortDirection.ASCENDING), + Expressions.sort(Expressions.days("ts"), SortDirection.ASCENDING) + }; - checkCopyOnWriteDistributionAndOrdering(table, DELETE, UNSPECIFIED_DISTRIBUTION, expectedOrdering); + checkCopyOnWriteDistributionAndOrdering( + table, DELETE, UNSPECIFIED_DISTRIBUTION, expectedOrdering); } @Test public void testHashCopyOnWriteDeletePartitionedUnsortedTable() { - sql("CREATE TABLE %s (id BIGINT, data STRING, date DATE, ts TIMESTAMP) " + - "USING iceberg " + - "PARTITIONED BY (date, days(ts))", tableName); + sql( + "CREATE TABLE %s (id BIGINT, data STRING, date DATE, ts TIMESTAMP) " + + "USING iceberg " + + "PARTITIONED BY (date, days(ts))", + tableName); Table table = validationCatalog.loadTable(tableIdent); - table.updateProperties() - .set(DELETE_DISTRIBUTION_MODE, WRITE_DISTRIBUTION_MODE_HASH) - .commit(); + table.updateProperties().set(DELETE_DISTRIBUTION_MODE, WRITE_DISTRIBUTION_MODE_HASH).commit(); - SortOrder[] expectedOrdering = new SortOrder[]{ - Expressions.sort(Expressions.column("date"), SortDirection.ASCENDING), - Expressions.sort(Expressions.days("ts"), SortDirection.ASCENDING), - Expressions.sort(Expressions.column(MetadataColumns.FILE_PATH.name()), SortDirection.ASCENDING), - Expressions.sort(Expressions.column(MetadataColumns.ROW_POSITION.name()), SortDirection.ASCENDING) - }; + SortOrder[] expectedOrdering = + new SortOrder[] { + Expressions.sort(Expressions.column("date"), SortDirection.ASCENDING), + Expressions.sort(Expressions.days("ts"), SortDirection.ASCENDING), + Expressions.sort( + Expressions.column(MetadataColumns.FILE_PATH.name()), SortDirection.ASCENDING), + Expressions.sort( + Expressions.column(MetadataColumns.ROW_POSITION.name()), SortDirection.ASCENDING) + }; - checkCopyOnWriteDistributionAndOrdering(table, DELETE, FILE_CLUSTERED_DISTRIBUTION, expectedOrdering); + checkCopyOnWriteDistributionAndOrdering( + table, DELETE, FILE_CLUSTERED_DISTRIBUTION, expectedOrdering); } @Test public void testRangeCopyOnWriteDeletePartitionedUnsortedTable() { - sql("CREATE TABLE %s (id BIGINT, data STRING, date DATE, ts TIMESTAMP) " + - "USING iceberg " + - "PARTITIONED BY (date, days(ts))", tableName); + sql( + "CREATE TABLE %s (id BIGINT, data STRING, date DATE, ts TIMESTAMP) " + + "USING iceberg " + + "PARTITIONED BY (date, days(ts))", + tableName); Table table = validationCatalog.loadTable(tableIdent); - table.updateProperties() - .set(DELETE_DISTRIBUTION_MODE, WRITE_DISTRIBUTION_MODE_RANGE) - .commit(); + table.updateProperties().set(DELETE_DISTRIBUTION_MODE, WRITE_DISTRIBUTION_MODE_RANGE).commit(); - SortOrder[] expectedOrdering = new SortOrder[]{ - Expressions.sort(Expressions.column("date"), SortDirection.ASCENDING), - Expressions.sort(Expressions.days("ts"), SortDirection.ASCENDING), - Expressions.sort(Expressions.column(MetadataColumns.FILE_PATH.name()), SortDirection.ASCENDING), - Expressions.sort(Expressions.column(MetadataColumns.ROW_POSITION.name()), SortDirection.ASCENDING) - }; + SortOrder[] expectedOrdering = + new SortOrder[] { + Expressions.sort(Expressions.column("date"), SortDirection.ASCENDING), + Expressions.sort(Expressions.days("ts"), SortDirection.ASCENDING), + Expressions.sort( + Expressions.column(MetadataColumns.FILE_PATH.name()), SortDirection.ASCENDING), + Expressions.sort( + Expressions.column(MetadataColumns.ROW_POSITION.name()), SortDirection.ASCENDING) + }; Distribution expectedDistribution = Distributions.ordered(expectedOrdering); @@ -579,93 +582,94 @@ public void testRangeCopyOnWriteDeletePartitionedUnsortedTable() { 
@Test public void testDefaultCopyOnWriteDeletePartitionedSortedTable() { - sql("CREATE TABLE %s (id BIGINT, data STRING, date DATE, ts TIMESTAMP) " + - "USING iceberg " + - "PARTITIONED BY (date)", tableName); + sql( + "CREATE TABLE %s (id BIGINT, data STRING, date DATE, ts TIMESTAMP) " + + "USING iceberg " + + "PARTITIONED BY (date)", + tableName); Table table = validationCatalog.loadTable(tableIdent); - table.replaceSortOrder() - .desc("id") - .commit(); + table.replaceSortOrder().desc("id").commit(); - SortOrder[] expectedOrdering = new SortOrder[]{ - Expressions.sort(Expressions.column("date"), SortDirection.ASCENDING), - Expressions.sort(Expressions.column("id"), SortDirection.DESCENDING) - }; + SortOrder[] expectedOrdering = + new SortOrder[] { + Expressions.sort(Expressions.column("date"), SortDirection.ASCENDING), + Expressions.sort(Expressions.column("id"), SortDirection.DESCENDING) + }; - checkCopyOnWriteDistributionAndOrdering(table, DELETE, FILE_CLUSTERED_DISTRIBUTION, expectedOrdering); + checkCopyOnWriteDistributionAndOrdering( + table, DELETE, FILE_CLUSTERED_DISTRIBUTION, expectedOrdering); } @Test public void testNoneCopyOnWriteDeletePartitionedSortedTable() { - sql("CREATE TABLE %s (id BIGINT, data STRING, date DATE, ts TIMESTAMP) " + - "USING iceberg " + - "PARTITIONED BY (date)", tableName); + sql( + "CREATE TABLE %s (id BIGINT, data STRING, date DATE, ts TIMESTAMP) " + + "USING iceberg " + + "PARTITIONED BY (date)", + tableName); Table table = validationCatalog.loadTable(tableIdent); - table.updateProperties() - .set(DELETE_DISTRIBUTION_MODE, WRITE_DISTRIBUTION_MODE_NONE) - .commit(); + table.updateProperties().set(DELETE_DISTRIBUTION_MODE, WRITE_DISTRIBUTION_MODE_NONE).commit(); - table.replaceSortOrder() - .desc("id") - .commit(); + table.replaceSortOrder().desc("id").commit(); - SortOrder[] expectedOrdering = new SortOrder[]{ - Expressions.sort(Expressions.column("date"), SortDirection.ASCENDING), - Expressions.sort(Expressions.column("id"), SortDirection.DESCENDING) - }; + SortOrder[] expectedOrdering = + new SortOrder[] { + Expressions.sort(Expressions.column("date"), SortDirection.ASCENDING), + Expressions.sort(Expressions.column("id"), SortDirection.DESCENDING) + }; - checkCopyOnWriteDistributionAndOrdering(table, DELETE, UNSPECIFIED_DISTRIBUTION, expectedOrdering); + checkCopyOnWriteDistributionAndOrdering( + table, DELETE, UNSPECIFIED_DISTRIBUTION, expectedOrdering); } @Test public void testHashCopyOnWriteDeletePartitionedSortedTable() { - sql("CREATE TABLE %s (id BIGINT, data STRING, date DATE, ts TIMESTAMP) " + - "USING iceberg " + - "PARTITIONED BY (date, bucket(8, data))", tableName); + sql( + "CREATE TABLE %s (id BIGINT, data STRING, date DATE, ts TIMESTAMP) " + + "USING iceberg " + + "PARTITIONED BY (date, bucket(8, data))", + tableName); Table table = validationCatalog.loadTable(tableIdent); - table.updateProperties() - .set(DELETE_DISTRIBUTION_MODE, WRITE_DISTRIBUTION_MODE_HASH) - .commit(); + table.updateProperties().set(DELETE_DISTRIBUTION_MODE, WRITE_DISTRIBUTION_MODE_HASH).commit(); - table.replaceSortOrder() - .asc("id") - .commit(); + table.replaceSortOrder().asc("id").commit(); - SortOrder[] expectedOrdering = new SortOrder[]{ - Expressions.sort(Expressions.column("date"), SortDirection.ASCENDING), - Expressions.sort(Expressions.bucket(8, "data"), SortDirection.ASCENDING), - Expressions.sort(Expressions.column("id"), SortDirection.ASCENDING) - }; + SortOrder[] expectedOrdering = + new SortOrder[] { + Expressions.sort(Expressions.column("date"), 
SortDirection.ASCENDING), + Expressions.sort(Expressions.bucket(8, "data"), SortDirection.ASCENDING), + Expressions.sort(Expressions.column("id"), SortDirection.ASCENDING) + }; - checkCopyOnWriteDistributionAndOrdering(table, DELETE, FILE_CLUSTERED_DISTRIBUTION, expectedOrdering); + checkCopyOnWriteDistributionAndOrdering( + table, DELETE, FILE_CLUSTERED_DISTRIBUTION, expectedOrdering); } @Test public void testRangeCopyOnWriteDeletePartitionedSortedTable() { - sql("CREATE TABLE %s (id BIGINT, data STRING, date DATE, ts TIMESTAMP) " + - "USING iceberg " + - "PARTITIONED BY (date)", tableName); + sql( + "CREATE TABLE %s (id BIGINT, data STRING, date DATE, ts TIMESTAMP) " + + "USING iceberg " + + "PARTITIONED BY (date)", + tableName); Table table = validationCatalog.loadTable(tableIdent); - table.updateProperties() - .set(DELETE_DISTRIBUTION_MODE, WRITE_DISTRIBUTION_MODE_RANGE) - .commit(); + table.updateProperties().set(DELETE_DISTRIBUTION_MODE, WRITE_DISTRIBUTION_MODE_RANGE).commit(); - table.replaceSortOrder() - .asc("id") - .commit(); + table.replaceSortOrder().asc("id").commit(); - SortOrder[] expectedOrdering = new SortOrder[]{ - Expressions.sort(Expressions.column("date"), SortDirection.ASCENDING), - Expressions.sort(Expressions.column("id"), SortDirection.ASCENDING) - }; + SortOrder[] expectedOrdering = + new SortOrder[] { + Expressions.sort(Expressions.column("date"), SortDirection.ASCENDING), + Expressions.sort(Expressions.column("id"), SortDirection.ASCENDING) + }; Distribution expectedDistribution = Distributions.ordered(expectedOrdering); @@ -710,7 +714,8 @@ public void testDefaultCopyOnWriteUpdateUnpartitionedUnsortedTable() { Table table = validationCatalog.loadTable(tableIdent); - checkCopyOnWriteDistributionAndOrdering(table, UPDATE, FILE_CLUSTERED_DISTRIBUTION, FILE_POSITION_ORDERING); + checkCopyOnWriteDistributionAndOrdering( + table, UPDATE, FILE_CLUSTERED_DISTRIBUTION, FILE_POSITION_ORDERING); } @Test @@ -719,11 +724,10 @@ public void testNoneCopyOnWriteUpdateUnpartitionedUnsortedTable() { Table table = validationCatalog.loadTable(tableIdent); - table.updateProperties() - .set(UPDATE_DISTRIBUTION_MODE, WRITE_DISTRIBUTION_MODE_NONE) - .commit(); + table.updateProperties().set(UPDATE_DISTRIBUTION_MODE, WRITE_DISTRIBUTION_MODE_NONE).commit(); - checkCopyOnWriteDistributionAndOrdering(table, UPDATE, UNSPECIFIED_DISTRIBUTION, EMPTY_ORDERING); + checkCopyOnWriteDistributionAndOrdering( + table, UPDATE, UNSPECIFIED_DISTRIBUTION, EMPTY_ORDERING); } @Test @@ -732,11 +736,10 @@ public void testHashCopyOnWriteUpdateUnpartitionedUnsortedTable() { Table table = validationCatalog.loadTable(tableIdent); - table.updateProperties() - .set(UPDATE_DISTRIBUTION_MODE, WRITE_DISTRIBUTION_MODE_HASH) - .commit(); + table.updateProperties().set(UPDATE_DISTRIBUTION_MODE, WRITE_DISTRIBUTION_MODE_HASH).commit(); - checkCopyOnWriteDistributionAndOrdering(table, UPDATE, FILE_CLUSTERED_DISTRIBUTION, FILE_POSITION_ORDERING); + checkCopyOnWriteDistributionAndOrdering( + table, UPDATE, FILE_CLUSTERED_DISTRIBUTION, FILE_POSITION_ORDERING); } @Test @@ -745,13 +748,12 @@ public void testRangeCopyOnWriteUpdateUnpartitionedUnsortedTable() { Table table = validationCatalog.loadTable(tableIdent); - table.updateProperties() - .set(UPDATE_DISTRIBUTION_MODE, WRITE_DISTRIBUTION_MODE_RANGE) - .commit(); + table.updateProperties().set(UPDATE_DISTRIBUTION_MODE, WRITE_DISTRIBUTION_MODE_RANGE).commit(); Distribution expectedDistribution = Distributions.ordered(FILE_POSITION_ORDERING); - 
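// ---------------------------------------------------------------------------------------------
// Editor's note (illustrative sketch, not part of this patch): the copy-on-write DELETE/UPDATE
// tests in this class build their expectations with Spark's Distributions/Expressions API over
// Iceberg metadata columns, via constants declared far above at the top of the class. Assuming
// this class's existing imports (local names below are illustrative only), the "range" case
// asserted here -- range-distribute and locally order by _file, _pos -- spells out to:
SortOrder[] filePosOrdering =
    new SortOrder[] {
      Expressions.sort(
          Expressions.column(MetadataColumns.FILE_PATH.name()), SortDirection.ASCENDING),
      Expressions.sort(
          Expressions.column(MetadataColumns.ROW_POSITION.name()), SortDirection.ASCENDING)
    };
// Range mode distributes across tasks by the same keys used for the local sort.
Distribution rangeByFilePos = Distributions.ordered(filePosOrdering);
// Hash mode (and the default in these copy-on-write tests) instead clusters rows by _file only.
Distribution clusterByFile =
    Distributions.clustered(
        new Expression[] {Expressions.column(MetadataColumns.FILE_PATH.name())});
// ---------------------------------------------------------------------------------------------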
checkCopyOnWriteDistributionAndOrdering(table, UPDATE, expectedDistribution, FILE_POSITION_ORDERING); + checkCopyOnWriteDistributionAndOrdering( + table, UPDATE, expectedDistribution, FILE_POSITION_ORDERING); } @Test @@ -760,17 +762,16 @@ public void testDefaultCopyOnWriteUpdateUnpartitionedSortedTable() { Table table = validationCatalog.loadTable(tableIdent); - table.replaceSortOrder() - .asc("id") - .asc("data") - .commit(); + table.replaceSortOrder().asc("id").asc("data").commit(); - SortOrder[] expectedOrdering = new SortOrder[]{ - Expressions.sort(Expressions.column("id"), SortDirection.ASCENDING), - Expressions.sort(Expressions.column("data"), SortDirection.ASCENDING) - }; + SortOrder[] expectedOrdering = + new SortOrder[] { + Expressions.sort(Expressions.column("id"), SortDirection.ASCENDING), + Expressions.sort(Expressions.column("data"), SortDirection.ASCENDING) + }; - checkCopyOnWriteDistributionAndOrdering(table, UPDATE, FILE_CLUSTERED_DISTRIBUTION, expectedOrdering); + checkCopyOnWriteDistributionAndOrdering( + table, UPDATE, FILE_CLUSTERED_DISTRIBUTION, expectedOrdering); } @Test @@ -779,21 +780,18 @@ public void testNoneCopyOnWriteUpdateUnpartitionedSortedTable() { Table table = validationCatalog.loadTable(tableIdent); - table.updateProperties() - .set(UPDATE_DISTRIBUTION_MODE, WRITE_DISTRIBUTION_MODE_NONE) - .commit(); + table.updateProperties().set(UPDATE_DISTRIBUTION_MODE, WRITE_DISTRIBUTION_MODE_NONE).commit(); - table.replaceSortOrder() - .asc("id") - .asc("data") - .commit(); + table.replaceSortOrder().asc("id").asc("data").commit(); - SortOrder[] expectedOrdering = new SortOrder[]{ - Expressions.sort(Expressions.column("id"), SortDirection.ASCENDING), - Expressions.sort(Expressions.column("data"), SortDirection.ASCENDING) - }; + SortOrder[] expectedOrdering = + new SortOrder[] { + Expressions.sort(Expressions.column("id"), SortDirection.ASCENDING), + Expressions.sort(Expressions.column("data"), SortDirection.ASCENDING) + }; - checkCopyOnWriteDistributionAndOrdering(table, UPDATE, UNSPECIFIED_DISTRIBUTION, expectedOrdering); + checkCopyOnWriteDistributionAndOrdering( + table, UPDATE, UNSPECIFIED_DISTRIBUTION, expectedOrdering); } @Test @@ -802,21 +800,18 @@ public void testHashCopyOnWriteUpdateUnpartitionedSortedTable() { Table table = validationCatalog.loadTable(tableIdent); - table.updateProperties() - .set(UPDATE_DISTRIBUTION_MODE, WRITE_DISTRIBUTION_MODE_HASH) - .commit(); + table.updateProperties().set(UPDATE_DISTRIBUTION_MODE, WRITE_DISTRIBUTION_MODE_HASH).commit(); - table.replaceSortOrder() - .asc("id") - .asc("data") - .commit(); + table.replaceSortOrder().asc("id").asc("data").commit(); - SortOrder[] expectedOrdering = new SortOrder[]{ - Expressions.sort(Expressions.column("id"), SortDirection.ASCENDING), - Expressions.sort(Expressions.column("data"), SortDirection.ASCENDING) - }; + SortOrder[] expectedOrdering = + new SortOrder[] { + Expressions.sort(Expressions.column("id"), SortDirection.ASCENDING), + Expressions.sort(Expressions.column("data"), SortDirection.ASCENDING) + }; - checkCopyOnWriteDistributionAndOrdering(table, UPDATE, FILE_CLUSTERED_DISTRIBUTION, expectedOrdering); + checkCopyOnWriteDistributionAndOrdering( + table, UPDATE, FILE_CLUSTERED_DISTRIBUTION, expectedOrdering); } @Test @@ -825,19 +820,15 @@ public void testRangeCopyOnWriteUpdateUnpartitionedSortedTable() { Table table = validationCatalog.loadTable(tableIdent); - table.updateProperties() - .set(UPDATE_DISTRIBUTION_MODE, WRITE_DISTRIBUTION_MODE_RANGE) - .commit(); + 
table.updateProperties().set(UPDATE_DISTRIBUTION_MODE, WRITE_DISTRIBUTION_MODE_RANGE).commit(); - table.replaceSortOrder() - .asc("id") - .asc("data") - .commit(); + table.replaceSortOrder().asc("id").asc("data").commit(); - SortOrder[] expectedOrdering = new SortOrder[]{ - Expressions.sort(Expressions.column("id"), SortDirection.ASCENDING), - Expressions.sort(Expressions.column("data"), SortDirection.ASCENDING) - }; + SortOrder[] expectedOrdering = + new SortOrder[] { + Expressions.sort(Expressions.column("id"), SortDirection.ASCENDING), + Expressions.sort(Expressions.column("data"), SortDirection.ASCENDING) + }; Distribution expectedDistribution = Distributions.ordered(expectedOrdering); @@ -846,82 +837,97 @@ public void testRangeCopyOnWriteUpdateUnpartitionedSortedTable() { @Test public void testDefaultCopyOnWriteUpdatePartitionedUnsortedTable() { - sql("CREATE TABLE %s (id BIGINT, data STRING, date DATE, ts TIMESTAMP) " + - "USING iceberg " + - "PARTITIONED BY (date, days(ts))", tableName); + sql( + "CREATE TABLE %s (id BIGINT, data STRING, date DATE, ts TIMESTAMP) " + + "USING iceberg " + + "PARTITIONED BY (date, days(ts))", + tableName); Table table = validationCatalog.loadTable(tableIdent); - SortOrder[] expectedOrdering = new SortOrder[]{ - Expressions.sort(Expressions.column("date"), SortDirection.ASCENDING), - Expressions.sort(Expressions.days("ts"), SortDirection.ASCENDING), - Expressions.sort(Expressions.column(MetadataColumns.FILE_PATH.name()), SortDirection.ASCENDING), - Expressions.sort(Expressions.column(MetadataColumns.ROW_POSITION.name()), SortDirection.ASCENDING) - }; + SortOrder[] expectedOrdering = + new SortOrder[] { + Expressions.sort(Expressions.column("date"), SortDirection.ASCENDING), + Expressions.sort(Expressions.days("ts"), SortDirection.ASCENDING), + Expressions.sort( + Expressions.column(MetadataColumns.FILE_PATH.name()), SortDirection.ASCENDING), + Expressions.sort( + Expressions.column(MetadataColumns.ROW_POSITION.name()), SortDirection.ASCENDING) + }; - checkCopyOnWriteDistributionAndOrdering(table, UPDATE, FILE_CLUSTERED_DISTRIBUTION, expectedOrdering); + checkCopyOnWriteDistributionAndOrdering( + table, UPDATE, FILE_CLUSTERED_DISTRIBUTION, expectedOrdering); } @Test public void testNoneCopyOnWriteUpdatePartitionedUnsortedTable() { - sql("CREATE TABLE %s (id BIGINT, data STRING, date DATE, ts TIMESTAMP) " + - "USING iceberg " + - "PARTITIONED BY (date, days(ts))", tableName); + sql( + "CREATE TABLE %s (id BIGINT, data STRING, date DATE, ts TIMESTAMP) " + + "USING iceberg " + + "PARTITIONED BY (date, days(ts))", + tableName); Table table = validationCatalog.loadTable(tableIdent); - table.updateProperties() - .set(UPDATE_DISTRIBUTION_MODE, WRITE_DISTRIBUTION_MODE_NONE) - .commit(); + table.updateProperties().set(UPDATE_DISTRIBUTION_MODE, WRITE_DISTRIBUTION_MODE_NONE).commit(); - SortOrder[] expectedOrdering = new SortOrder[]{ - Expressions.sort(Expressions.column("date"), SortDirection.ASCENDING), - Expressions.sort(Expressions.days("ts"), SortDirection.ASCENDING) - }; + SortOrder[] expectedOrdering = + new SortOrder[] { + Expressions.sort(Expressions.column("date"), SortDirection.ASCENDING), + Expressions.sort(Expressions.days("ts"), SortDirection.ASCENDING) + }; - checkCopyOnWriteDistributionAndOrdering(table, UPDATE, UNSPECIFIED_DISTRIBUTION, expectedOrdering); + checkCopyOnWriteDistributionAndOrdering( + table, UPDATE, UNSPECIFIED_DISTRIBUTION, expectedOrdering); } @Test public void testHashCopyOnWriteUpdatePartitionedUnsortedTable() { - sql("CREATE 
TABLE %s (id BIGINT, data STRING, date DATE, ts TIMESTAMP) " + - "USING iceberg " + - "PARTITIONED BY (date, days(ts))", tableName); + sql( + "CREATE TABLE %s (id BIGINT, data STRING, date DATE, ts TIMESTAMP) " + + "USING iceberg " + + "PARTITIONED BY (date, days(ts))", + tableName); Table table = validationCatalog.loadTable(tableIdent); - table.updateProperties() - .set(UPDATE_DISTRIBUTION_MODE, WRITE_DISTRIBUTION_MODE_HASH) - .commit(); + table.updateProperties().set(UPDATE_DISTRIBUTION_MODE, WRITE_DISTRIBUTION_MODE_HASH).commit(); - SortOrder[] expectedOrdering = new SortOrder[]{ - Expressions.sort(Expressions.column("date"), SortDirection.ASCENDING), - Expressions.sort(Expressions.days("ts"), SortDirection.ASCENDING), - Expressions.sort(Expressions.column(MetadataColumns.FILE_PATH.name()), SortDirection.ASCENDING), - Expressions.sort(Expressions.column(MetadataColumns.ROW_POSITION.name()), SortDirection.ASCENDING) - }; + SortOrder[] expectedOrdering = + new SortOrder[] { + Expressions.sort(Expressions.column("date"), SortDirection.ASCENDING), + Expressions.sort(Expressions.days("ts"), SortDirection.ASCENDING), + Expressions.sort( + Expressions.column(MetadataColumns.FILE_PATH.name()), SortDirection.ASCENDING), + Expressions.sort( + Expressions.column(MetadataColumns.ROW_POSITION.name()), SortDirection.ASCENDING) + }; - checkCopyOnWriteDistributionAndOrdering(table, UPDATE, FILE_CLUSTERED_DISTRIBUTION, expectedOrdering); + checkCopyOnWriteDistributionAndOrdering( + table, UPDATE, FILE_CLUSTERED_DISTRIBUTION, expectedOrdering); } @Test public void testRangeCopyOnWriteUpdatePartitionedUnsortedTable() { - sql("CREATE TABLE %s (id BIGINT, data STRING, date DATE, ts TIMESTAMP) " + - "USING iceberg " + - "PARTITIONED BY (date, days(ts))", tableName); + sql( + "CREATE TABLE %s (id BIGINT, data STRING, date DATE, ts TIMESTAMP) " + + "USING iceberg " + + "PARTITIONED BY (date, days(ts))", + tableName); Table table = validationCatalog.loadTable(tableIdent); - table.updateProperties() - .set(UPDATE_DISTRIBUTION_MODE, WRITE_DISTRIBUTION_MODE_RANGE) - .commit(); + table.updateProperties().set(UPDATE_DISTRIBUTION_MODE, WRITE_DISTRIBUTION_MODE_RANGE).commit(); - SortOrder[] expectedOrdering = new SortOrder[]{ - Expressions.sort(Expressions.column("date"), SortDirection.ASCENDING), - Expressions.sort(Expressions.days("ts"), SortDirection.ASCENDING), - Expressions.sort(Expressions.column(MetadataColumns.FILE_PATH.name()), SortDirection.ASCENDING), - Expressions.sort(Expressions.column(MetadataColumns.ROW_POSITION.name()), SortDirection.ASCENDING) - }; + SortOrder[] expectedOrdering = + new SortOrder[] { + Expressions.sort(Expressions.column("date"), SortDirection.ASCENDING), + Expressions.sort(Expressions.days("ts"), SortDirection.ASCENDING), + Expressions.sort( + Expressions.column(MetadataColumns.FILE_PATH.name()), SortDirection.ASCENDING), + Expressions.sort( + Expressions.column(MetadataColumns.ROW_POSITION.name()), SortDirection.ASCENDING) + }; Distribution expectedDistribution = Distributions.ordered(expectedOrdering); @@ -930,93 +936,94 @@ public void testRangeCopyOnWriteUpdatePartitionedUnsortedTable() { @Test public void testDefaultCopyOnWriteUpdatePartitionedSortedTable() { - sql("CREATE TABLE %s (id BIGINT, data STRING, date DATE, ts TIMESTAMP) " + - "USING iceberg " + - "PARTITIONED BY (date)", tableName); + sql( + "CREATE TABLE %s (id BIGINT, data STRING, date DATE, ts TIMESTAMP) " + + "USING iceberg " + + "PARTITIONED BY (date)", + tableName); Table table = 
validationCatalog.loadTable(tableIdent); - table.replaceSortOrder() - .desc("id") - .commit(); + table.replaceSortOrder().desc("id").commit(); - SortOrder[] expectedOrdering = new SortOrder[]{ - Expressions.sort(Expressions.column("date"), SortDirection.ASCENDING), - Expressions.sort(Expressions.column("id"), SortDirection.DESCENDING) - }; + SortOrder[] expectedOrdering = + new SortOrder[] { + Expressions.sort(Expressions.column("date"), SortDirection.ASCENDING), + Expressions.sort(Expressions.column("id"), SortDirection.DESCENDING) + }; - checkCopyOnWriteDistributionAndOrdering(table, UPDATE, FILE_CLUSTERED_DISTRIBUTION, expectedOrdering); + checkCopyOnWriteDistributionAndOrdering( + table, UPDATE, FILE_CLUSTERED_DISTRIBUTION, expectedOrdering); } @Test public void testNoneCopyOnWriteUpdatePartitionedSortedTable() { - sql("CREATE TABLE %s (id BIGINT, data STRING, date DATE, ts TIMESTAMP) " + - "USING iceberg " + - "PARTITIONED BY (date)", tableName); + sql( + "CREATE TABLE %s (id BIGINT, data STRING, date DATE, ts TIMESTAMP) " + + "USING iceberg " + + "PARTITIONED BY (date)", + tableName); Table table = validationCatalog.loadTable(tableIdent); - table.updateProperties() - .set(UPDATE_DISTRIBUTION_MODE, WRITE_DISTRIBUTION_MODE_NONE) - .commit(); + table.updateProperties().set(UPDATE_DISTRIBUTION_MODE, WRITE_DISTRIBUTION_MODE_NONE).commit(); - table.replaceSortOrder() - .desc("id") - .commit(); + table.replaceSortOrder().desc("id").commit(); - SortOrder[] expectedOrdering = new SortOrder[]{ - Expressions.sort(Expressions.column("date"), SortDirection.ASCENDING), - Expressions.sort(Expressions.column("id"), SortDirection.DESCENDING) - }; + SortOrder[] expectedOrdering = + new SortOrder[] { + Expressions.sort(Expressions.column("date"), SortDirection.ASCENDING), + Expressions.sort(Expressions.column("id"), SortDirection.DESCENDING) + }; - checkCopyOnWriteDistributionAndOrdering(table, UPDATE, UNSPECIFIED_DISTRIBUTION, expectedOrdering); + checkCopyOnWriteDistributionAndOrdering( + table, UPDATE, UNSPECIFIED_DISTRIBUTION, expectedOrdering); } @Test public void testHashCopyOnWriteUpdatePartitionedSortedTable() { - sql("CREATE TABLE %s (id BIGINT, data STRING, date DATE, ts TIMESTAMP) " + - "USING iceberg " + - "PARTITIONED BY (date, bucket(8, data))", tableName); + sql( + "CREATE TABLE %s (id BIGINT, data STRING, date DATE, ts TIMESTAMP) " + + "USING iceberg " + + "PARTITIONED BY (date, bucket(8, data))", + tableName); Table table = validationCatalog.loadTable(tableIdent); - table.updateProperties() - .set(UPDATE_DISTRIBUTION_MODE, WRITE_DISTRIBUTION_MODE_HASH) - .commit(); + table.updateProperties().set(UPDATE_DISTRIBUTION_MODE, WRITE_DISTRIBUTION_MODE_HASH).commit(); - table.replaceSortOrder() - .asc("id") - .commit(); + table.replaceSortOrder().asc("id").commit(); - SortOrder[] expectedOrdering = new SortOrder[]{ - Expressions.sort(Expressions.column("date"), SortDirection.ASCENDING), - Expressions.sort(Expressions.bucket(8, "data"), SortDirection.ASCENDING), - Expressions.sort(Expressions.column("id"), SortDirection.ASCENDING) - }; + SortOrder[] expectedOrdering = + new SortOrder[] { + Expressions.sort(Expressions.column("date"), SortDirection.ASCENDING), + Expressions.sort(Expressions.bucket(8, "data"), SortDirection.ASCENDING), + Expressions.sort(Expressions.column("id"), SortDirection.ASCENDING) + }; - checkCopyOnWriteDistributionAndOrdering(table, UPDATE, FILE_CLUSTERED_DISTRIBUTION, expectedOrdering); + checkCopyOnWriteDistributionAndOrdering( + table, UPDATE, 
FILE_CLUSTERED_DISTRIBUTION, expectedOrdering); } @Test public void testRangeCopyOnWriteUpdatePartitionedSortedTable() { - sql("CREATE TABLE %s (id BIGINT, data STRING, date DATE, ts TIMESTAMP) " + - "USING iceberg " + - "PARTITIONED BY (date)", tableName); + sql( + "CREATE TABLE %s (id BIGINT, data STRING, date DATE, ts TIMESTAMP) " + + "USING iceberg " + + "PARTITIONED BY (date)", + tableName); Table table = validationCatalog.loadTable(tableIdent); - table.updateProperties() - .set(UPDATE_DISTRIBUTION_MODE, WRITE_DISTRIBUTION_MODE_RANGE) - .commit(); + table.updateProperties().set(UPDATE_DISTRIBUTION_MODE, WRITE_DISTRIBUTION_MODE_RANGE).commit(); - table.replaceSortOrder() - .asc("id") - .commit(); + table.replaceSortOrder().asc("id").commit(); - SortOrder[] expectedOrdering = new SortOrder[]{ - Expressions.sort(Expressions.column("date"), SortDirection.ASCENDING), - Expressions.sort(Expressions.column("id"), SortDirection.ASCENDING) - }; + SortOrder[] expectedOrdering = + new SortOrder[] { + Expressions.sort(Expressions.column("date"), SortDirection.ASCENDING), + Expressions.sort(Expressions.column("id"), SortDirection.ASCENDING) + }; Distribution expectedDistribution = Distributions.ordered(expectedOrdering); @@ -1061,9 +1068,7 @@ public void testNoneCopyOnWriteMergeUnpartitionedUnsortedTable() { Table table = validationCatalog.loadTable(tableIdent); - table.updateProperties() - .set(MERGE_DISTRIBUTION_MODE, WRITE_DISTRIBUTION_MODE_NONE) - .commit(); + table.updateProperties().set(MERGE_DISTRIBUTION_MODE, WRITE_DISTRIBUTION_MODE_NONE).commit(); checkCopyOnWriteDistributionAndOrdering(table, MERGE, UNSPECIFIED_DISTRIBUTION, EMPTY_ORDERING); } @@ -1074,9 +1079,7 @@ public void testHashCopyOnWriteMergeUnpartitionedUnsortedTable() { Table table = validationCatalog.loadTable(tableIdent); - table.updateProperties() - .set(MERGE_DISTRIBUTION_MODE, WRITE_DISTRIBUTION_MODE_HASH) - .commit(); + table.updateProperties().set(MERGE_DISTRIBUTION_MODE, WRITE_DISTRIBUTION_MODE_HASH).commit(); checkCopyOnWriteDistributionAndOrdering(table, MERGE, UNSPECIFIED_DISTRIBUTION, EMPTY_ORDERING); } @@ -1087,9 +1090,7 @@ public void testRangeCopyOnWriteMergeUnpartitionedUnsortedTable() { Table table = validationCatalog.loadTable(tableIdent); - table.updateProperties() - .set(MERGE_DISTRIBUTION_MODE, WRITE_DISTRIBUTION_MODE_RANGE) - .commit(); + table.updateProperties().set(MERGE_DISTRIBUTION_MODE, WRITE_DISTRIBUTION_MODE_RANGE).commit(); checkCopyOnWriteDistributionAndOrdering(table, MERGE, UNSPECIFIED_DISTRIBUTION, EMPTY_ORDERING); } @@ -1100,21 +1101,18 @@ public void testNoneCopyOnWriteMergeUnpartitionedSortedTable() { Table table = validationCatalog.loadTable(tableIdent); - table.updateProperties() - .set(MERGE_DISTRIBUTION_MODE, WRITE_DISTRIBUTION_MODE_NONE) - .commit(); + table.updateProperties().set(MERGE_DISTRIBUTION_MODE, WRITE_DISTRIBUTION_MODE_NONE).commit(); - table.replaceSortOrder() - .asc("id") - .asc("data") - .commit(); + table.replaceSortOrder().asc("id").asc("data").commit(); - SortOrder[] expectedOrdering = new SortOrder[]{ - Expressions.sort(Expressions.column("id"), SortDirection.ASCENDING), - Expressions.sort(Expressions.column("data"), SortDirection.ASCENDING) - }; + SortOrder[] expectedOrdering = + new SortOrder[] { + Expressions.sort(Expressions.column("id"), SortDirection.ASCENDING), + Expressions.sort(Expressions.column("data"), SortDirection.ASCENDING) + }; - checkCopyOnWriteDistributionAndOrdering(table, MERGE, UNSPECIFIED_DISTRIBUTION, expectedOrdering); + 
checkCopyOnWriteDistributionAndOrdering( + table, MERGE, UNSPECIFIED_DISTRIBUTION, expectedOrdering); } @Test @@ -1123,21 +1121,18 @@ public void testHashCopyOnWriteMergeUnpartitionedSortedTable() { Table table = validationCatalog.loadTable(tableIdent); - table.updateProperties() - .set(MERGE_DISTRIBUTION_MODE, WRITE_DISTRIBUTION_MODE_HASH) - .commit(); + table.updateProperties().set(MERGE_DISTRIBUTION_MODE, WRITE_DISTRIBUTION_MODE_HASH).commit(); - table.replaceSortOrder() - .asc("id") - .asc("data") - .commit(); + table.replaceSortOrder().asc("id").asc("data").commit(); - SortOrder[] expectedOrdering = new SortOrder[]{ - Expressions.sort(Expressions.column("id"), SortDirection.ASCENDING), - Expressions.sort(Expressions.column("data"), SortDirection.ASCENDING) - }; + SortOrder[] expectedOrdering = + new SortOrder[] { + Expressions.sort(Expressions.column("id"), SortDirection.ASCENDING), + Expressions.sort(Expressions.column("data"), SortDirection.ASCENDING) + }; - checkCopyOnWriteDistributionAndOrdering(table, MERGE, UNSPECIFIED_DISTRIBUTION, expectedOrdering); + checkCopyOnWriteDistributionAndOrdering( + table, MERGE, UNSPECIFIED_DISTRIBUTION, expectedOrdering); } @Test @@ -1146,19 +1141,15 @@ public void testRangeCopyOnWriteMergeUnpartitionedSortedTable() { Table table = validationCatalog.loadTable(tableIdent); - table.updateProperties() - .set(MERGE_DISTRIBUTION_MODE, WRITE_DISTRIBUTION_MODE_RANGE) - .commit(); + table.updateProperties().set(MERGE_DISTRIBUTION_MODE, WRITE_DISTRIBUTION_MODE_RANGE).commit(); - table.replaceSortOrder() - .asc("id") - .asc("data") - .commit(); + table.replaceSortOrder().asc("id").asc("data").commit(); - SortOrder[] expectedOrdering = new SortOrder[]{ - Expressions.sort(Expressions.column("id"), SortDirection.ASCENDING), - Expressions.sort(Expressions.column("data"), SortDirection.ASCENDING) - }; + SortOrder[] expectedOrdering = + new SortOrder[] { + Expressions.sort(Expressions.column("id"), SortDirection.ASCENDING), + Expressions.sort(Expressions.column("data"), SortDirection.ASCENDING) + }; Distribution expectedDistribution = Distributions.ordered(expectedOrdering); @@ -1167,66 +1158,68 @@ public void testRangeCopyOnWriteMergeUnpartitionedSortedTable() { @Test public void testNoneCopyOnWriteMergePartitionedUnsortedTable() { - sql("CREATE TABLE %s (id BIGINT, data STRING, date DATE, ts TIMESTAMP) " + - "USING iceberg " + - "PARTITIONED BY (date, days(ts))", tableName); + sql( + "CREATE TABLE %s (id BIGINT, data STRING, date DATE, ts TIMESTAMP) " + + "USING iceberg " + + "PARTITIONED BY (date, days(ts))", + tableName); Table table = validationCatalog.loadTable(tableIdent); - table.updateProperties() - .set(MERGE_DISTRIBUTION_MODE, WRITE_DISTRIBUTION_MODE_NONE) - .commit(); + table.updateProperties().set(MERGE_DISTRIBUTION_MODE, WRITE_DISTRIBUTION_MODE_NONE).commit(); - SortOrder[] expectedOrdering = new SortOrder[]{ - Expressions.sort(Expressions.column("date"), SortDirection.ASCENDING), - Expressions.sort(Expressions.days("ts"), SortDirection.ASCENDING) - }; + SortOrder[] expectedOrdering = + new SortOrder[] { + Expressions.sort(Expressions.column("date"), SortDirection.ASCENDING), + Expressions.sort(Expressions.days("ts"), SortDirection.ASCENDING) + }; - checkCopyOnWriteDistributionAndOrdering(table, MERGE, UNSPECIFIED_DISTRIBUTION, expectedOrdering); + checkCopyOnWriteDistributionAndOrdering( + table, MERGE, UNSPECIFIED_DISTRIBUTION, expectedOrdering); } @Test public void testHashCopyOnWriteMergePartitionedUnsortedTable() { - sql("CREATE TABLE %s (id 
BIGINT, data STRING, date DATE, ts TIMESTAMP) " + - "USING iceberg " + - "PARTITIONED BY (date, days(ts))", tableName); + sql( + "CREATE TABLE %s (id BIGINT, data STRING, date DATE, ts TIMESTAMP) " + + "USING iceberg " + + "PARTITIONED BY (date, days(ts))", + tableName); Table table = validationCatalog.loadTable(tableIdent); - table.updateProperties() - .set(MERGE_DISTRIBUTION_MODE, WRITE_DISTRIBUTION_MODE_HASH) - .commit(); + table.updateProperties().set(MERGE_DISTRIBUTION_MODE, WRITE_DISTRIBUTION_MODE_HASH).commit(); - Expression[] expectedClustering = new Expression[]{ - Expressions.identity("date"), - Expressions.days("ts") - }; + Expression[] expectedClustering = + new Expression[] {Expressions.identity("date"), Expressions.days("ts")}; Distribution expectedDistribution = Distributions.clustered(expectedClustering); - SortOrder[] expectedOrdering = new SortOrder[]{ - Expressions.sort(Expressions.column("date"), SortDirection.ASCENDING), - Expressions.sort(Expressions.days("ts"), SortDirection.ASCENDING) - }; + SortOrder[] expectedOrdering = + new SortOrder[] { + Expressions.sort(Expressions.column("date"), SortDirection.ASCENDING), + Expressions.sort(Expressions.days("ts"), SortDirection.ASCENDING) + }; checkCopyOnWriteDistributionAndOrdering(table, MERGE, expectedDistribution, expectedOrdering); } @Test public void testRangeCopyOnWriteMergePartitionedUnsortedTable() { - sql("CREATE TABLE %s (id BIGINT, data STRING, date DATE, ts TIMESTAMP) " + - "USING iceberg " + - "PARTITIONED BY (date, days(ts))", tableName); + sql( + "CREATE TABLE %s (id BIGINT, data STRING, date DATE, ts TIMESTAMP) " + + "USING iceberg " + + "PARTITIONED BY (date, days(ts))", + tableName); Table table = validationCatalog.loadTable(tableIdent); - table.updateProperties() - .set(MERGE_DISTRIBUTION_MODE, WRITE_DISTRIBUTION_MODE_RANGE) - .commit(); + table.updateProperties().set(MERGE_DISTRIBUTION_MODE, WRITE_DISTRIBUTION_MODE_RANGE).commit(); - SortOrder[] expectedOrdering = new SortOrder[]{ - Expressions.sort(Expressions.column("date"), SortDirection.ASCENDING), - Expressions.sort(Expressions.days("ts"), SortDirection.ASCENDING) - }; + SortOrder[] expectedOrdering = + new SortOrder[] { + Expressions.sort(Expressions.column("date"), SortDirection.ASCENDING), + Expressions.sort(Expressions.days("ts"), SortDirection.ASCENDING) + }; Distribution expectedDistribution = Distributions.ordered(expectedOrdering); @@ -1235,79 +1228,75 @@ public void testRangeCopyOnWriteMergePartitionedUnsortedTable() { @Test public void testNoneCopyOnWriteMergePartitionedSortedTable() { - sql("CREATE TABLE %s (id BIGINT, data STRING, date DATE, ts TIMESTAMP) " + - "USING iceberg " + - "PARTITIONED BY (date)", tableName); + sql( + "CREATE TABLE %s (id BIGINT, data STRING, date DATE, ts TIMESTAMP) " + + "USING iceberg " + + "PARTITIONED BY (date)", + tableName); Table table = validationCatalog.loadTable(tableIdent); - table.updateProperties() - .set(MERGE_DISTRIBUTION_MODE, WRITE_DISTRIBUTION_MODE_NONE) - .commit(); + table.updateProperties().set(MERGE_DISTRIBUTION_MODE, WRITE_DISTRIBUTION_MODE_NONE).commit(); - table.replaceSortOrder() - .desc("id") - .commit(); + table.replaceSortOrder().desc("id").commit(); - SortOrder[] expectedOrdering = new SortOrder[]{ - Expressions.sort(Expressions.column("date"), SortDirection.ASCENDING), - Expressions.sort(Expressions.column("id"), SortDirection.DESCENDING) - }; + SortOrder[] expectedOrdering = + new SortOrder[] { + Expressions.sort(Expressions.column("date"), SortDirection.ASCENDING), + 
Expressions.sort(Expressions.column("id"), SortDirection.DESCENDING) + }; - checkCopyOnWriteDistributionAndOrdering(table, MERGE, UNSPECIFIED_DISTRIBUTION, expectedOrdering); + checkCopyOnWriteDistributionAndOrdering( + table, MERGE, UNSPECIFIED_DISTRIBUTION, expectedOrdering); } @Test public void testHashCopyOnWriteMergePartitionedSortedTable() { - sql("CREATE TABLE %s (id BIGINT, data STRING, date DATE, ts TIMESTAMP) " + - "USING iceberg " + - "PARTITIONED BY (date, bucket(8, data))", tableName); + sql( + "CREATE TABLE %s (id BIGINT, data STRING, date DATE, ts TIMESTAMP) " + + "USING iceberg " + + "PARTITIONED BY (date, bucket(8, data))", + tableName); Table table = validationCatalog.loadTable(tableIdent); - table.updateProperties() - .set(MERGE_DISTRIBUTION_MODE, WRITE_DISTRIBUTION_MODE_HASH) - .commit(); + table.updateProperties().set(MERGE_DISTRIBUTION_MODE, WRITE_DISTRIBUTION_MODE_HASH).commit(); - table.replaceSortOrder() - .asc("id") - .commit(); + table.replaceSortOrder().asc("id").commit(); - Expression[] expectedClustering = new Expression[]{ - Expressions.identity("date"), - Expressions.bucket(8, "data") - }; + Expression[] expectedClustering = + new Expression[] {Expressions.identity("date"), Expressions.bucket(8, "data")}; Distribution expectedDistribution = Distributions.clustered(expectedClustering); - SortOrder[] expectedOrdering = new SortOrder[]{ - Expressions.sort(Expressions.column("date"), SortDirection.ASCENDING), - Expressions.sort(Expressions.bucket(8, "data"), SortDirection.ASCENDING), - Expressions.sort(Expressions.column("id"), SortDirection.ASCENDING) - }; + SortOrder[] expectedOrdering = + new SortOrder[] { + Expressions.sort(Expressions.column("date"), SortDirection.ASCENDING), + Expressions.sort(Expressions.bucket(8, "data"), SortDirection.ASCENDING), + Expressions.sort(Expressions.column("id"), SortDirection.ASCENDING) + }; checkCopyOnWriteDistributionAndOrdering(table, MERGE, expectedDistribution, expectedOrdering); } @Test public void testRangeCopyOnWriteMergePartitionedSortedTable() { - sql("CREATE TABLE %s (id BIGINT, data STRING, date DATE, ts TIMESTAMP) " + - "USING iceberg " + - "PARTITIONED BY (date)", tableName); + sql( + "CREATE TABLE %s (id BIGINT, data STRING, date DATE, ts TIMESTAMP) " + + "USING iceberg " + + "PARTITIONED BY (date)", + tableName); Table table = validationCatalog.loadTable(tableIdent); - table.updateProperties() - .set(MERGE_DISTRIBUTION_MODE, WRITE_DISTRIBUTION_MODE_RANGE) - .commit(); + table.updateProperties().set(MERGE_DISTRIBUTION_MODE, WRITE_DISTRIBUTION_MODE_RANGE).commit(); - table.replaceSortOrder() - .asc("id") - .commit(); + table.replaceSortOrder().asc("id").commit(); - SortOrder[] expectedOrdering = new SortOrder[]{ - Expressions.sort(Expressions.column("date"), SortDirection.ASCENDING), - Expressions.sort(Expressions.column("id"), SortDirection.ASCENDING) - }; + SortOrder[] expectedOrdering = + new SortOrder[] { + Expressions.sort(Expressions.column("date"), SortDirection.ASCENDING), + Expressions.sort(Expressions.column("id"), SortDirection.ASCENDING) + }; Distribution expectedDistribution = Distributions.ordered(expectedOrdering); @@ -1318,9 +1307,12 @@ public void testRangeCopyOnWriteMergePartitionedSortedTable() { // Distribution and ordering for merge-on-read DELETE operations with position deletes // =================================================================================== // - // delete mode is NOT SET -> CLUSTER BY _spec_id, _partition + LOCALLY ORDER BY _spec_id, _partition, _file, _pos - // 
delete mode is NONE -> unspecified distribution + LOCALLY ORDER BY _spec_id, _partition, _file, _pos - // delete mode is HASH -> CLUSTER BY _spec_id, _partition + LOCALLY ORDER BY _spec_id, _partition, _file, _pos + // delete mode is NOT SET -> CLUSTER BY _spec_id, _partition + LOCALLY ORDER BY _spec_id, + // _partition, _file, _pos + // delete mode is NONE -> unspecified distribution + LOCALLY ORDER BY _spec_id, _partition, _file, + // _pos + // delete mode is HASH -> CLUSTER BY _spec_id, _partition + LOCALLY ORDER BY _spec_id, _partition, + // _file, _pos // delete mode is RANGE -> RANGE DISTRIBUTE BY _spec_id, _partition, _file + // LOCALLY ORDER BY _spec_id, _partition, _file, _pos @@ -1331,7 +1323,10 @@ public void testDefaultPositionDeltaDeleteUnpartitionedTable() { Table table = validationCatalog.loadTable(tableIdent); checkPositionDeltaDistributionAndOrdering( - table, DELETE, SPEC_ID_PARTITION_CLUSTERED_DISTRIBUTION, SPEC_ID_PARTITION_FILE_POSITION_ORDERING); + table, + DELETE, + SPEC_ID_PARTITION_CLUSTERED_DISTRIBUTION, + SPEC_ID_PARTITION_FILE_POSITION_ORDERING); } @Test @@ -1340,9 +1335,7 @@ public void testNonePositionDeltaDeleteUnpartitionedTable() { Table table = validationCatalog.loadTable(tableIdent); - table.updateProperties() - .set(DELETE_DISTRIBUTION_MODE, WRITE_DISTRIBUTION_MODE_NONE) - .commit(); + table.updateProperties().set(DELETE_DISTRIBUTION_MODE, WRITE_DISTRIBUTION_MODE_NONE).commit(); checkPositionDeltaDistributionAndOrdering( table, DELETE, UNSPECIFIED_DISTRIBUTION, SPEC_ID_PARTITION_FILE_POSITION_ORDERING); @@ -1354,12 +1347,13 @@ public void testHashPositionDeltaDeleteUnpartitionedTable() { Table table = validationCatalog.loadTable(tableIdent); - table.updateProperties() - .set(DELETE_DISTRIBUTION_MODE, WRITE_DISTRIBUTION_MODE_HASH) - .commit(); + table.updateProperties().set(DELETE_DISTRIBUTION_MODE, WRITE_DISTRIBUTION_MODE_HASH).commit(); checkPositionDeltaDistributionAndOrdering( - table, DELETE, SPEC_ID_PARTITION_CLUSTERED_DISTRIBUTION, SPEC_ID_PARTITION_FILE_POSITION_ORDERING); + table, + DELETE, + SPEC_ID_PARTITION_CLUSTERED_DISTRIBUTION, + SPEC_ID_PARTITION_FILE_POSITION_ORDERING); } @Test @@ -1368,9 +1362,7 @@ public void testRangePositionDeltaDeleteUnpartitionedTable() { Table table = validationCatalog.loadTable(tableIdent); - table.updateProperties() - .set(DELETE_DISTRIBUTION_MODE, WRITE_DISTRIBUTION_MODE_RANGE) - .commit(); + table.updateProperties().set(DELETE_DISTRIBUTION_MODE, WRITE_DISTRIBUTION_MODE_RANGE).commit(); Distribution expectedDistribution = Distributions.ordered(SPEC_ID_PARTITION_FILE_ORDERING); @@ -1380,27 +1372,32 @@ public void testRangePositionDeltaDeleteUnpartitionedTable() { @Test public void testDefaultPositionDeltaDeletePartitionedTable() { - sql("CREATE TABLE %s (id BIGINT, data STRING, date DATE, ts TIMESTAMP) " + - "USING iceberg " + - "PARTITIONED BY (date, bucket(8, data))", tableName); + sql( + "CREATE TABLE %s (id BIGINT, data STRING, date DATE, ts TIMESTAMP) " + + "USING iceberg " + + "PARTITIONED BY (date, bucket(8, data))", + tableName); Table table = validationCatalog.loadTable(tableIdent); checkPositionDeltaDistributionAndOrdering( - table, DELETE, SPEC_ID_PARTITION_CLUSTERED_DISTRIBUTION, SPEC_ID_PARTITION_FILE_POSITION_ORDERING); + table, + DELETE, + SPEC_ID_PARTITION_CLUSTERED_DISTRIBUTION, + SPEC_ID_PARTITION_FILE_POSITION_ORDERING); } @Test public void testNonePositionDeltaDeletePartitionedTable() { - sql("CREATE TABLE %s (id BIGINT, data STRING, date DATE, ts TIMESTAMP) " + - "USING iceberg " + - 
"PARTITIONED BY (date, bucket(8, data))", tableName); + sql( + "CREATE TABLE %s (id BIGINT, data STRING, date DATE, ts TIMESTAMP) " + + "USING iceberg " + + "PARTITIONED BY (date, bucket(8, data))", + tableName); Table table = validationCatalog.loadTable(tableIdent); - table.updateProperties() - .set(DELETE_DISTRIBUTION_MODE, WRITE_DISTRIBUTION_MODE_NONE) - .commit(); + table.updateProperties().set(DELETE_DISTRIBUTION_MODE, WRITE_DISTRIBUTION_MODE_NONE).commit(); checkPositionDeltaDistributionAndOrdering( table, DELETE, UNSPECIFIED_DISTRIBUTION, SPEC_ID_PARTITION_FILE_POSITION_ORDERING); @@ -1408,31 +1405,34 @@ public void testNonePositionDeltaDeletePartitionedTable() { @Test public void testHashPositionDeltaDeletePartitionedTable() { - sql("CREATE TABLE %s (id BIGINT, data STRING, date DATE, ts TIMESTAMP) " + - "USING iceberg " + - "PARTITIONED BY (date, bucket(8, data))", tableName); + sql( + "CREATE TABLE %s (id BIGINT, data STRING, date DATE, ts TIMESTAMP) " + + "USING iceberg " + + "PARTITIONED BY (date, bucket(8, data))", + tableName); Table table = validationCatalog.loadTable(tableIdent); - table.updateProperties() - .set(DELETE_DISTRIBUTION_MODE, WRITE_DISTRIBUTION_MODE_HASH) - .commit(); + table.updateProperties().set(DELETE_DISTRIBUTION_MODE, WRITE_DISTRIBUTION_MODE_HASH).commit(); checkPositionDeltaDistributionAndOrdering( - table, DELETE, SPEC_ID_PARTITION_CLUSTERED_DISTRIBUTION, SPEC_ID_PARTITION_FILE_POSITION_ORDERING); + table, + DELETE, + SPEC_ID_PARTITION_CLUSTERED_DISTRIBUTION, + SPEC_ID_PARTITION_FILE_POSITION_ORDERING); } @Test public void testRangePositionDeltaDeletePartitionedTable() { - sql("CREATE TABLE %s (id BIGINT, data STRING, date DATE, ts TIMESTAMP) " + - "USING iceberg " + - "PARTITIONED BY (date, bucket(8, data))", tableName); + sql( + "CREATE TABLE %s (id BIGINT, data STRING, date DATE, ts TIMESTAMP) " + + "USING iceberg " + + "PARTITIONED BY (date, bucket(8, data))", + tableName); Table table = validationCatalog.loadTable(tableIdent); - table.updateProperties() - .set(DELETE_DISTRIBUTION_MODE, WRITE_DISTRIBUTION_MODE_RANGE) - .commit(); + table.updateProperties().set(DELETE_DISTRIBUTION_MODE, WRITE_DISTRIBUTION_MODE_RANGE).commit(); Distribution expectedDistribution = Distributions.ordered(SPEC_ID_PARTITION_FILE_ORDERING); @@ -1444,9 +1444,12 @@ public void testRangePositionDeltaDeletePartitionedTable() { // Distribution and ordering for merge-on-read UPDATE operations with position deletes // =================================================================================== // - // update mode is NOT SET -> CLUSTER BY _spec_id, _partition + LOCALLY ORDER BY _spec_id, _partition, _file, _pos - // update mode is NONE -> unspecified distribution + LOCALLY ORDER BY _spec_id, _partition, _file, _pos - // update mode is HASH -> CLUSTER BY _spec_id, _partition + LOCALLY ORDER BY _spec_id, _partition, _file, _pos + // update mode is NOT SET -> CLUSTER BY _spec_id, _partition + LOCALLY ORDER BY _spec_id, + // _partition, _file, _pos + // update mode is NONE -> unspecified distribution + LOCALLY ORDER BY _spec_id, _partition, _file, + // _pos + // update mode is HASH -> CLUSTER BY _spec_id, _partition + LOCALLY ORDER BY _spec_id, _partition, + // _file, _pos // update mode is RANGE -> RANGE DISTRIBUTE BY _spec_id, _partition, _file + // LOCALLY ORDER BY _spec_id, _partition, _file, _pos @@ -1457,7 +1460,10 @@ public void testDefaultPositionDeltaUpdateUnpartitionedTable() { Table table = validationCatalog.loadTable(tableIdent); 
checkPositionDeltaDistributionAndOrdering( - table, UPDATE, SPEC_ID_PARTITION_CLUSTERED_DISTRIBUTION, SPEC_ID_PARTITION_FILE_POSITION_ORDERING); + table, + UPDATE, + SPEC_ID_PARTITION_CLUSTERED_DISTRIBUTION, + SPEC_ID_PARTITION_FILE_POSITION_ORDERING); } @Test @@ -1466,9 +1472,7 @@ public void testNonePositionDeltaUpdateUnpartitionedTable() { Table table = validationCatalog.loadTable(tableIdent); - table.updateProperties() - .set(UPDATE_DISTRIBUTION_MODE, WRITE_DISTRIBUTION_MODE_NONE) - .commit(); + table.updateProperties().set(UPDATE_DISTRIBUTION_MODE, WRITE_DISTRIBUTION_MODE_NONE).commit(); checkPositionDeltaDistributionAndOrdering( table, UPDATE, UNSPECIFIED_DISTRIBUTION, SPEC_ID_PARTITION_FILE_POSITION_ORDERING); @@ -1480,12 +1484,13 @@ public void testHashPositionDeltaUpdateUnpartitionedTable() { Table table = validationCatalog.loadTable(tableIdent); - table.updateProperties() - .set(UPDATE_DISTRIBUTION_MODE, WRITE_DISTRIBUTION_MODE_HASH) - .commit(); + table.updateProperties().set(UPDATE_DISTRIBUTION_MODE, WRITE_DISTRIBUTION_MODE_HASH).commit(); checkPositionDeltaDistributionAndOrdering( - table, UPDATE, SPEC_ID_PARTITION_CLUSTERED_DISTRIBUTION, SPEC_ID_PARTITION_FILE_POSITION_ORDERING); + table, + UPDATE, + SPEC_ID_PARTITION_CLUSTERED_DISTRIBUTION, + SPEC_ID_PARTITION_FILE_POSITION_ORDERING); } @Test @@ -1494,9 +1499,7 @@ public void testRangePositionDeltaUpdateUnpartitionedTable() { Table table = validationCatalog.loadTable(tableIdent); - table.updateProperties() - .set(UPDATE_DISTRIBUTION_MODE, WRITE_DISTRIBUTION_MODE_RANGE) - .commit(); + table.updateProperties().set(UPDATE_DISTRIBUTION_MODE, WRITE_DISTRIBUTION_MODE_RANGE).commit(); Distribution expectedDistribution = Distributions.ordered(SPEC_ID_PARTITION_FILE_ORDERING); @@ -1506,27 +1509,32 @@ public void testRangePositionDeltaUpdateUnpartitionedTable() { @Test public void testDefaultPositionDeltaUpdatePartitionedTable() { - sql("CREATE TABLE %s (id BIGINT, data STRING, date DATE, ts TIMESTAMP) " + - "USING iceberg " + - "PARTITIONED BY (date, bucket(8, data))", tableName); + sql( + "CREATE TABLE %s (id BIGINT, data STRING, date DATE, ts TIMESTAMP) " + + "USING iceberg " + + "PARTITIONED BY (date, bucket(8, data))", + tableName); Table table = validationCatalog.loadTable(tableIdent); checkPositionDeltaDistributionAndOrdering( - table, UPDATE, SPEC_ID_PARTITION_CLUSTERED_DISTRIBUTION, SPEC_ID_PARTITION_FILE_POSITION_ORDERING); + table, + UPDATE, + SPEC_ID_PARTITION_CLUSTERED_DISTRIBUTION, + SPEC_ID_PARTITION_FILE_POSITION_ORDERING); } @Test public void testNonePositionDeltaUpdatePartitionedTable() { - sql("CREATE TABLE %s (id BIGINT, data STRING, date DATE, ts TIMESTAMP) " + - "USING iceberg " + - "PARTITIONED BY (date, bucket(8, data))", tableName); + sql( + "CREATE TABLE %s (id BIGINT, data STRING, date DATE, ts TIMESTAMP) " + + "USING iceberg " + + "PARTITIONED BY (date, bucket(8, data))", + tableName); Table table = validationCatalog.loadTable(tableIdent); - table.updateProperties() - .set(UPDATE_DISTRIBUTION_MODE, WRITE_DISTRIBUTION_MODE_NONE) - .commit(); + table.updateProperties().set(UPDATE_DISTRIBUTION_MODE, WRITE_DISTRIBUTION_MODE_NONE).commit(); checkPositionDeltaDistributionAndOrdering( table, UPDATE, UNSPECIFIED_DISTRIBUTION, SPEC_ID_PARTITION_FILE_POSITION_ORDERING); @@ -1534,31 +1542,34 @@ public void testNonePositionDeltaUpdatePartitionedTable() { @Test public void testHashPositionDeltaUpdatePartitionedTable() { - sql("CREATE TABLE %s (id BIGINT, data STRING, date DATE, ts TIMESTAMP) " + - "USING iceberg " + - 
"PARTITIONED BY (date, bucket(8, data))", tableName); + sql( + "CREATE TABLE %s (id BIGINT, data STRING, date DATE, ts TIMESTAMP) " + + "USING iceberg " + + "PARTITIONED BY (date, bucket(8, data))", + tableName); Table table = validationCatalog.loadTable(tableIdent); - table.updateProperties() - .set(UPDATE_DISTRIBUTION_MODE, WRITE_DISTRIBUTION_MODE_HASH) - .commit(); + table.updateProperties().set(UPDATE_DISTRIBUTION_MODE, WRITE_DISTRIBUTION_MODE_HASH).commit(); checkPositionDeltaDistributionAndOrdering( - table, UPDATE, SPEC_ID_PARTITION_CLUSTERED_DISTRIBUTION, SPEC_ID_PARTITION_FILE_POSITION_ORDERING); + table, + UPDATE, + SPEC_ID_PARTITION_CLUSTERED_DISTRIBUTION, + SPEC_ID_PARTITION_FILE_POSITION_ORDERING); } @Test public void testRangePositionDeltaUpdatePartitionedTable() { - sql("CREATE TABLE %s (id BIGINT, data STRING, date DATE, ts TIMESTAMP) " + - "USING iceberg " + - "PARTITIONED BY (date, bucket(8, data))", tableName); + sql( + "CREATE TABLE %s (id BIGINT, data STRING, date DATE, ts TIMESTAMP) " + + "USING iceberg " + + "PARTITIONED BY (date, bucket(8, data))", + tableName); Table table = validationCatalog.loadTable(tableIdent); - table.updateProperties() - .set(UPDATE_DISTRIBUTION_MODE, WRITE_DISTRIBUTION_MODE_RANGE) - .commit(); + table.updateProperties().set(UPDATE_DISTRIBUTION_MODE, WRITE_DISTRIBUTION_MODE_RANGE).commit(); Distribution expectedDistribution = Distributions.ordered(SPEC_ID_PARTITION_FILE_ORDERING); @@ -1575,7 +1586,8 @@ public void testRangePositionDeltaUpdatePartitionedTable() { // UNPARTITIONED UNORDERED // ------------------------------------------------------------------------- // merge mode is NOT SET -> use write mode - // merge mode is NONE -> unspecified distribution + LOCALLY ORDER BY _spec_id, _partition, _file, _pos + // merge mode is NONE -> unspecified distribution + LOCALLY ORDER BY _spec_id, _partition, _file, + // _pos // merge mode is HASH -> CLUSTER BY _spec_id, _partition, _file + // LOCALLY ORDER BY _spec_id, _partition, _file, _pos // merge mode is RANGE -> RANGE DISTRIBUTE BY _spec_id, _partition, _file + @@ -1617,9 +1629,7 @@ public void testNonePositionDeltaMergeUnpartitionedUnsortedTable() { Table table = validationCatalog.loadTable(tableIdent); - table.updateProperties() - .set(MERGE_DISTRIBUTION_MODE, WRITE_DISTRIBUTION_MODE_NONE) - .commit(); + table.updateProperties().set(MERGE_DISTRIBUTION_MODE, WRITE_DISTRIBUTION_MODE_NONE).commit(); checkPositionDeltaDistributionAndOrdering( table, MERGE, UNSPECIFIED_DISTRIBUTION, SPEC_ID_PARTITION_FILE_POSITION_ORDERING); @@ -1631,15 +1641,14 @@ public void testHashPositionDeltaMergeUnpartitionedUnsortedTable() { Table table = validationCatalog.loadTable(tableIdent); - table.updateProperties() - .set(MERGE_DISTRIBUTION_MODE, WRITE_DISTRIBUTION_MODE_HASH) - .commit(); + table.updateProperties().set(MERGE_DISTRIBUTION_MODE, WRITE_DISTRIBUTION_MODE_HASH).commit(); - Expression[] expectedClustering = new Expression[]{ - Expressions.column(MetadataColumns.SPEC_ID.name()), - Expressions.column(MetadataColumns.PARTITION_COLUMN_NAME), - Expressions.column(MetadataColumns.FILE_PATH.name()) - }; + Expression[] expectedClustering = + new Expression[] { + Expressions.column(MetadataColumns.SPEC_ID.name()), + Expressions.column(MetadataColumns.PARTITION_COLUMN_NAME), + Expressions.column(MetadataColumns.FILE_PATH.name()) + }; Distribution expectedDistribution = Distributions.clustered(expectedClustering); checkPositionDeltaDistributionAndOrdering( @@ -1652,15 +1661,17 @@ public void 
testRangePositionDeltaMergeUnpartitionedUnsortedTable() { Table table = validationCatalog.loadTable(tableIdent); - table.updateProperties() - .set(MERGE_DISTRIBUTION_MODE, WRITE_DISTRIBUTION_MODE_RANGE) - .commit(); + table.updateProperties().set(MERGE_DISTRIBUTION_MODE, WRITE_DISTRIBUTION_MODE_RANGE).commit(); - SortOrder[] expectedDistributionOrdering = new SortOrder[]{ - Expressions.sort(Expressions.column(MetadataColumns.SPEC_ID.name()), SortDirection.ASCENDING), - Expressions.sort(Expressions.column(MetadataColumns.PARTITION_COLUMN_NAME), SortDirection.ASCENDING), - Expressions.sort(Expressions.column(MetadataColumns.FILE_PATH.name()), SortDirection.ASCENDING) - }; + SortOrder[] expectedDistributionOrdering = + new SortOrder[] { + Expressions.sort( + Expressions.column(MetadataColumns.SPEC_ID.name()), SortDirection.ASCENDING), + Expressions.sort( + Expressions.column(MetadataColumns.PARTITION_COLUMN_NAME), SortDirection.ASCENDING), + Expressions.sort( + Expressions.column(MetadataColumns.FILE_PATH.name()), SortDirection.ASCENDING) + }; Distribution expectedDistribution = Distributions.ordered(expectedDistributionOrdering); checkPositionDeltaDistributionAndOrdering( @@ -1673,25 +1684,26 @@ public void testNonePositionDeltaMergeUnpartitionedSortedTable() { Table table = validationCatalog.loadTable(tableIdent); - table.updateProperties() - .set(MERGE_DISTRIBUTION_MODE, WRITE_DISTRIBUTION_MODE_NONE) - .commit(); + table.updateProperties().set(MERGE_DISTRIBUTION_MODE, WRITE_DISTRIBUTION_MODE_NONE).commit(); - table.replaceSortOrder() - .asc("id") - .asc("data") - .commit(); + table.replaceSortOrder().asc("id").asc("data").commit(); - SortOrder[] expectedOrdering = new SortOrder[]{ - Expressions.sort(Expressions.column(MetadataColumns.SPEC_ID.name()), SortDirection.ASCENDING), - Expressions.sort(Expressions.column(MetadataColumns.PARTITION_COLUMN_NAME), SortDirection.ASCENDING), - Expressions.sort(Expressions.column(MetadataColumns.FILE_PATH.name()), SortDirection.ASCENDING), - Expressions.sort(Expressions.column(MetadataColumns.ROW_POSITION.name()), SortDirection.ASCENDING), - Expressions.sort(Expressions.column("id"), SortDirection.ASCENDING), - Expressions.sort(Expressions.column("data"), SortDirection.ASCENDING) - }; + SortOrder[] expectedOrdering = + new SortOrder[] { + Expressions.sort( + Expressions.column(MetadataColumns.SPEC_ID.name()), SortDirection.ASCENDING), + Expressions.sort( + Expressions.column(MetadataColumns.PARTITION_COLUMN_NAME), SortDirection.ASCENDING), + Expressions.sort( + Expressions.column(MetadataColumns.FILE_PATH.name()), SortDirection.ASCENDING), + Expressions.sort( + Expressions.column(MetadataColumns.ROW_POSITION.name()), SortDirection.ASCENDING), + Expressions.sort(Expressions.column("id"), SortDirection.ASCENDING), + Expressions.sort(Expressions.column("data"), SortDirection.ASCENDING) + }; - checkPositionDeltaDistributionAndOrdering(table, MERGE, UNSPECIFIED_DISTRIBUTION, expectedOrdering); + checkPositionDeltaDistributionAndOrdering( + table, MERGE, UNSPECIFIED_DISTRIBUTION, expectedOrdering); } @Test @@ -1700,30 +1712,31 @@ public void testHashPositionDeltaMergeUnpartitionedSortedTable() { Table table = validationCatalog.loadTable(tableIdent); - table.updateProperties() - .set(MERGE_DISTRIBUTION_MODE, WRITE_DISTRIBUTION_MODE_HASH) - .commit(); + table.updateProperties().set(MERGE_DISTRIBUTION_MODE, WRITE_DISTRIBUTION_MODE_HASH).commit(); - table.replaceSortOrder() - .asc("id") - .asc("data") - .commit(); + 
table.replaceSortOrder().asc("id").asc("data").commit(); - Expression[] expectedClustering = new Expression[]{ - Expressions.column(MetadataColumns.SPEC_ID.name()), - Expressions.column(MetadataColumns.PARTITION_COLUMN_NAME), - Expressions.column(MetadataColumns.FILE_PATH.name()) - }; + Expression[] expectedClustering = + new Expression[] { + Expressions.column(MetadataColumns.SPEC_ID.name()), + Expressions.column(MetadataColumns.PARTITION_COLUMN_NAME), + Expressions.column(MetadataColumns.FILE_PATH.name()) + }; Distribution expectedDistribution = Distributions.clustered(expectedClustering); - SortOrder[] expectedOrdering = new SortOrder[]{ - Expressions.sort(Expressions.column(MetadataColumns.SPEC_ID.name()), SortDirection.ASCENDING), - Expressions.sort(Expressions.column(MetadataColumns.PARTITION_COLUMN_NAME), SortDirection.ASCENDING), - Expressions.sort(Expressions.column(MetadataColumns.FILE_PATH.name()), SortDirection.ASCENDING), - Expressions.sort(Expressions.column(MetadataColumns.ROW_POSITION.name()), SortDirection.ASCENDING), - Expressions.sort(Expressions.column("id"), SortDirection.ASCENDING), - Expressions.sort(Expressions.column("data"), SortDirection.ASCENDING) - }; + SortOrder[] expectedOrdering = + new SortOrder[] { + Expressions.sort( + Expressions.column(MetadataColumns.SPEC_ID.name()), SortDirection.ASCENDING), + Expressions.sort( + Expressions.column(MetadataColumns.PARTITION_COLUMN_NAME), SortDirection.ASCENDING), + Expressions.sort( + Expressions.column(MetadataColumns.FILE_PATH.name()), SortDirection.ASCENDING), + Expressions.sort( + Expressions.column(MetadataColumns.ROW_POSITION.name()), SortDirection.ASCENDING), + Expressions.sort(Expressions.column("id"), SortDirection.ASCENDING), + Expressions.sort(Expressions.column("data"), SortDirection.ASCENDING) + }; checkPositionDeltaDistributionAndOrdering(table, MERGE, expectedDistribution, expectedOrdering); } @@ -1734,249 +1747,295 @@ public void testRangePositionDeltaMergeUnpartitionedSortedTable() { Table table = validationCatalog.loadTable(tableIdent); - table.updateProperties() - .set(MERGE_DISTRIBUTION_MODE, WRITE_DISTRIBUTION_MODE_RANGE) - .commit(); + table.updateProperties().set(MERGE_DISTRIBUTION_MODE, WRITE_DISTRIBUTION_MODE_RANGE).commit(); - table.replaceSortOrder() - .asc("id") - .asc("data") - .commit(); + table.replaceSortOrder().asc("id").asc("data").commit(); - SortOrder[] expectedDistributionOrdering = new SortOrder[]{ - Expressions.sort(Expressions.column(MetadataColumns.SPEC_ID.name()), SortDirection.ASCENDING), - Expressions.sort(Expressions.column(MetadataColumns.PARTITION_COLUMN_NAME), SortDirection.ASCENDING), - Expressions.sort(Expressions.column(MetadataColumns.FILE_PATH.name()), SortDirection.ASCENDING), - Expressions.sort(Expressions.column("id"), SortDirection.ASCENDING), - Expressions.sort(Expressions.column("data"), SortDirection.ASCENDING) - }; + SortOrder[] expectedDistributionOrdering = + new SortOrder[] { + Expressions.sort( + Expressions.column(MetadataColumns.SPEC_ID.name()), SortDirection.ASCENDING), + Expressions.sort( + Expressions.column(MetadataColumns.PARTITION_COLUMN_NAME), SortDirection.ASCENDING), + Expressions.sort( + Expressions.column(MetadataColumns.FILE_PATH.name()), SortDirection.ASCENDING), + Expressions.sort(Expressions.column("id"), SortDirection.ASCENDING), + Expressions.sort(Expressions.column("data"), SortDirection.ASCENDING) + }; Distribution expectedDistribution = Distributions.ordered(expectedDistributionOrdering); - SortOrder[] expectedOrdering = new 
SortOrder[]{ - Expressions.sort(Expressions.column(MetadataColumns.SPEC_ID.name()), SortDirection.ASCENDING), - Expressions.sort(Expressions.column(MetadataColumns.PARTITION_COLUMN_NAME), SortDirection.ASCENDING), - Expressions.sort(Expressions.column(MetadataColumns.FILE_PATH.name()), SortDirection.ASCENDING), - Expressions.sort(Expressions.column(MetadataColumns.ROW_POSITION.name()), SortDirection.ASCENDING), - Expressions.sort(Expressions.column("id"), SortDirection.ASCENDING), - Expressions.sort(Expressions.column("data"), SortDirection.ASCENDING) - }; + SortOrder[] expectedOrdering = + new SortOrder[] { + Expressions.sort( + Expressions.column(MetadataColumns.SPEC_ID.name()), SortDirection.ASCENDING), + Expressions.sort( + Expressions.column(MetadataColumns.PARTITION_COLUMN_NAME), SortDirection.ASCENDING), + Expressions.sort( + Expressions.column(MetadataColumns.FILE_PATH.name()), SortDirection.ASCENDING), + Expressions.sort( + Expressions.column(MetadataColumns.ROW_POSITION.name()), SortDirection.ASCENDING), + Expressions.sort(Expressions.column("id"), SortDirection.ASCENDING), + Expressions.sort(Expressions.column("data"), SortDirection.ASCENDING) + }; checkPositionDeltaDistributionAndOrdering(table, MERGE, expectedDistribution, expectedOrdering); } @Test public void testNonePositionDeltaMergePartitionedUnsortedTable() { - sql("CREATE TABLE %s (id BIGINT, data STRING, date DATE, ts TIMESTAMP) " + - "USING iceberg " + - "PARTITIONED BY (date, days(ts))", tableName); - - Table table = validationCatalog.loadTable(tableIdent); + sql( + "CREATE TABLE %s (id BIGINT, data STRING, date DATE, ts TIMESTAMP) " + + "USING iceberg " + + "PARTITIONED BY (date, days(ts))", + tableName); + + Table table = validationCatalog.loadTable(tableIdent); + + table.updateProperties().set(MERGE_DISTRIBUTION_MODE, WRITE_DISTRIBUTION_MODE_NONE).commit(); + + SortOrder[] expectedOrdering = + new SortOrder[] { + Expressions.sort( + Expressions.column(MetadataColumns.SPEC_ID.name()), SortDirection.ASCENDING), + Expressions.sort( + Expressions.column(MetadataColumns.PARTITION_COLUMN_NAME), SortDirection.ASCENDING), + Expressions.sort( + Expressions.column(MetadataColumns.FILE_PATH.name()), SortDirection.ASCENDING), + Expressions.sort( + Expressions.column(MetadataColumns.ROW_POSITION.name()), SortDirection.ASCENDING), + Expressions.sort(Expressions.column("date"), SortDirection.ASCENDING), + Expressions.sort(Expressions.days("ts"), SortDirection.ASCENDING) + }; - table.updateProperties() - .set(MERGE_DISTRIBUTION_MODE, WRITE_DISTRIBUTION_MODE_NONE) - .commit(); - - SortOrder[] expectedOrdering = new SortOrder[]{ - Expressions.sort(Expressions.column(MetadataColumns.SPEC_ID.name()), SortDirection.ASCENDING), - Expressions.sort(Expressions.column(MetadataColumns.PARTITION_COLUMN_NAME), SortDirection.ASCENDING), - Expressions.sort(Expressions.column(MetadataColumns.FILE_PATH.name()), SortDirection.ASCENDING), - Expressions.sort(Expressions.column(MetadataColumns.ROW_POSITION.name()), SortDirection.ASCENDING), - Expressions.sort(Expressions.column("date"), SortDirection.ASCENDING), - Expressions.sort(Expressions.days("ts"), SortDirection.ASCENDING) - }; - - checkPositionDeltaDistributionAndOrdering(table, MERGE, UNSPECIFIED_DISTRIBUTION, expectedOrdering); + checkPositionDeltaDistributionAndOrdering( + table, MERGE, UNSPECIFIED_DISTRIBUTION, expectedOrdering); } @Test public void testHashPositionDeltaMergePartitionedUnsortedTable() { - sql("CREATE TABLE %s (id BIGINT, data STRING, date DATE, ts TIMESTAMP) " + - "USING 
iceberg " + - "PARTITIONED BY (date, days(ts))", tableName); + sql( + "CREATE TABLE %s (id BIGINT, data STRING, date DATE, ts TIMESTAMP) " + + "USING iceberg " + + "PARTITIONED BY (date, days(ts))", + tableName); Table table = validationCatalog.loadTable(tableIdent); - table.updateProperties() - .set(MERGE_DISTRIBUTION_MODE, WRITE_DISTRIBUTION_MODE_HASH) - .commit(); + table.updateProperties().set(MERGE_DISTRIBUTION_MODE, WRITE_DISTRIBUTION_MODE_HASH).commit(); - Expression[] expectedClustering = new Expression[]{ - Expressions.column(MetadataColumns.SPEC_ID.name()), - Expressions.column(MetadataColumns.PARTITION_COLUMN_NAME), - Expressions.identity("date"), - Expressions.days("ts") - }; + Expression[] expectedClustering = + new Expression[] { + Expressions.column(MetadataColumns.SPEC_ID.name()), + Expressions.column(MetadataColumns.PARTITION_COLUMN_NAME), + Expressions.identity("date"), + Expressions.days("ts") + }; Distribution expectedDistribution = Distributions.clustered(expectedClustering); - SortOrder[] expectedOrdering = new SortOrder[]{ - Expressions.sort(Expressions.column(MetadataColumns.SPEC_ID.name()), SortDirection.ASCENDING), - Expressions.sort(Expressions.column(MetadataColumns.PARTITION_COLUMN_NAME), SortDirection.ASCENDING), - Expressions.sort(Expressions.column(MetadataColumns.FILE_PATH.name()), SortDirection.ASCENDING), - Expressions.sort(Expressions.column(MetadataColumns.ROW_POSITION.name()), SortDirection.ASCENDING), - Expressions.sort(Expressions.column("date"), SortDirection.ASCENDING), - Expressions.sort(Expressions.days("ts"), SortDirection.ASCENDING) - }; + SortOrder[] expectedOrdering = + new SortOrder[] { + Expressions.sort( + Expressions.column(MetadataColumns.SPEC_ID.name()), SortDirection.ASCENDING), + Expressions.sort( + Expressions.column(MetadataColumns.PARTITION_COLUMN_NAME), SortDirection.ASCENDING), + Expressions.sort( + Expressions.column(MetadataColumns.FILE_PATH.name()), SortDirection.ASCENDING), + Expressions.sort( + Expressions.column(MetadataColumns.ROW_POSITION.name()), SortDirection.ASCENDING), + Expressions.sort(Expressions.column("date"), SortDirection.ASCENDING), + Expressions.sort(Expressions.days("ts"), SortDirection.ASCENDING) + }; checkPositionDeltaDistributionAndOrdering(table, MERGE, expectedDistribution, expectedOrdering); } @Test public void testRangePositionDeltaMergePartitionedUnsortedTable() { - sql("CREATE TABLE %s (id BIGINT, data STRING, date DATE, ts TIMESTAMP) " + - "USING iceberg " + - "PARTITIONED BY (date, days(ts))", tableName); - - Table table = validationCatalog.loadTable(tableIdent); - - table.updateProperties() - .set(MERGE_DISTRIBUTION_MODE, WRITE_DISTRIBUTION_MODE_RANGE) - .commit(); - - SortOrder[] expectedDistributionOrdering = new SortOrder[]{ - Expressions.sort(Expressions.column(MetadataColumns.SPEC_ID.name()), SortDirection.ASCENDING), - Expressions.sort(Expressions.column(MetadataColumns.PARTITION_COLUMN_NAME), SortDirection.ASCENDING), - Expressions.sort(Expressions.column(MetadataColumns.FILE_PATH.name()), SortDirection.ASCENDING), - Expressions.sort(Expressions.column("date"), SortDirection.ASCENDING), - Expressions.sort(Expressions.days("ts"), SortDirection.ASCENDING) - }; + sql( + "CREATE TABLE %s (id BIGINT, data STRING, date DATE, ts TIMESTAMP) " + + "USING iceberg " + + "PARTITIONED BY (date, days(ts))", + tableName); + + Table table = validationCatalog.loadTable(tableIdent); + + table.updateProperties().set(MERGE_DISTRIBUTION_MODE, WRITE_DISTRIBUTION_MODE_RANGE).commit(); + + SortOrder[] 
expectedDistributionOrdering = + new SortOrder[] { + Expressions.sort( + Expressions.column(MetadataColumns.SPEC_ID.name()), SortDirection.ASCENDING), + Expressions.sort( + Expressions.column(MetadataColumns.PARTITION_COLUMN_NAME), SortDirection.ASCENDING), + Expressions.sort( + Expressions.column(MetadataColumns.FILE_PATH.name()), SortDirection.ASCENDING), + Expressions.sort(Expressions.column("date"), SortDirection.ASCENDING), + Expressions.sort(Expressions.days("ts"), SortDirection.ASCENDING) + }; Distribution expectedDistribution = Distributions.ordered(expectedDistributionOrdering); - SortOrder[] expectedOrdering = new SortOrder[]{ - Expressions.sort(Expressions.column(MetadataColumns.SPEC_ID.name()), SortDirection.ASCENDING), - Expressions.sort(Expressions.column(MetadataColumns.PARTITION_COLUMN_NAME), SortDirection.ASCENDING), - Expressions.sort(Expressions.column(MetadataColumns.FILE_PATH.name()), SortDirection.ASCENDING), - Expressions.sort(Expressions.column(MetadataColumns.ROW_POSITION.name()), SortDirection.ASCENDING), - Expressions.sort(Expressions.column("date"), SortDirection.ASCENDING), - Expressions.sort(Expressions.days("ts"), SortDirection.ASCENDING) - }; + SortOrder[] expectedOrdering = + new SortOrder[] { + Expressions.sort( + Expressions.column(MetadataColumns.SPEC_ID.name()), SortDirection.ASCENDING), + Expressions.sort( + Expressions.column(MetadataColumns.PARTITION_COLUMN_NAME), SortDirection.ASCENDING), + Expressions.sort( + Expressions.column(MetadataColumns.FILE_PATH.name()), SortDirection.ASCENDING), + Expressions.sort( + Expressions.column(MetadataColumns.ROW_POSITION.name()), SortDirection.ASCENDING), + Expressions.sort(Expressions.column("date"), SortDirection.ASCENDING), + Expressions.sort(Expressions.days("ts"), SortDirection.ASCENDING) + }; checkPositionDeltaDistributionAndOrdering(table, MERGE, expectedDistribution, expectedOrdering); } @Test public void testNonePositionDeltaMergePartitionedSortedTable() { - sql("CREATE TABLE %s (id BIGINT, data STRING, date DATE, ts TIMESTAMP) " + - "USING iceberg " + - "PARTITIONED BY (date)", tableName); + sql( + "CREATE TABLE %s (id BIGINT, data STRING, date DATE, ts TIMESTAMP) " + + "USING iceberg " + + "PARTITIONED BY (date)", + tableName); Table table = validationCatalog.loadTable(tableIdent); - table.updateProperties() - .set(MERGE_DISTRIBUTION_MODE, WRITE_DISTRIBUTION_MODE_NONE) - .commit(); + table.updateProperties().set(MERGE_DISTRIBUTION_MODE, WRITE_DISTRIBUTION_MODE_NONE).commit(); - table.replaceSortOrder() - .desc("id") - .commit(); + table.replaceSortOrder().desc("id").commit(); - SortOrder[] expectedOrdering = new SortOrder[]{ - Expressions.sort(Expressions.column(MetadataColumns.SPEC_ID.name()), SortDirection.ASCENDING), - Expressions.sort(Expressions.column(MetadataColumns.PARTITION_COLUMN_NAME), SortDirection.ASCENDING), - Expressions.sort(Expressions.column(MetadataColumns.FILE_PATH.name()), SortDirection.ASCENDING), - Expressions.sort(Expressions.column(MetadataColumns.ROW_POSITION.name()), SortDirection.ASCENDING), - Expressions.sort(Expressions.column("date"), SortDirection.ASCENDING), - Expressions.sort(Expressions.column("id"), SortDirection.DESCENDING) - }; + SortOrder[] expectedOrdering = + new SortOrder[] { + Expressions.sort( + Expressions.column(MetadataColumns.SPEC_ID.name()), SortDirection.ASCENDING), + Expressions.sort( + Expressions.column(MetadataColumns.PARTITION_COLUMN_NAME), SortDirection.ASCENDING), + Expressions.sort( + Expressions.column(MetadataColumns.FILE_PATH.name()), 
SortDirection.ASCENDING), + Expressions.sort( + Expressions.column(MetadataColumns.ROW_POSITION.name()), SortDirection.ASCENDING), + Expressions.sort(Expressions.column("date"), SortDirection.ASCENDING), + Expressions.sort(Expressions.column("id"), SortDirection.DESCENDING) + }; - checkPositionDeltaDistributionAndOrdering(table, MERGE, UNSPECIFIED_DISTRIBUTION, expectedOrdering); + checkPositionDeltaDistributionAndOrdering( + table, MERGE, UNSPECIFIED_DISTRIBUTION, expectedOrdering); } @Test public void testHashPositionDeltaMergePartitionedSortedTable() { - sql("CREATE TABLE %s (id BIGINT, data STRING, date DATE, ts TIMESTAMP) " + - "USING iceberg " + - "PARTITIONED BY (date, bucket(8, data))", tableName); + sql( + "CREATE TABLE %s (id BIGINT, data STRING, date DATE, ts TIMESTAMP) " + + "USING iceberg " + + "PARTITIONED BY (date, bucket(8, data))", + tableName); Table table = validationCatalog.loadTable(tableIdent); - table.updateProperties() - .set(MERGE_DISTRIBUTION_MODE, WRITE_DISTRIBUTION_MODE_HASH) - .commit(); + table.updateProperties().set(MERGE_DISTRIBUTION_MODE, WRITE_DISTRIBUTION_MODE_HASH).commit(); - table.replaceSortOrder() - .asc("id") - .commit(); + table.replaceSortOrder().asc("id").commit(); - Expression[] expectedClustering = new Expression[]{ - Expressions.column(MetadataColumns.SPEC_ID.name()), - Expressions.column(MetadataColumns.PARTITION_COLUMN_NAME), - Expressions.identity("date"), - Expressions.bucket(8, "data") - }; + Expression[] expectedClustering = + new Expression[] { + Expressions.column(MetadataColumns.SPEC_ID.name()), + Expressions.column(MetadataColumns.PARTITION_COLUMN_NAME), + Expressions.identity("date"), + Expressions.bucket(8, "data") + }; Distribution expectedDistribution = Distributions.clustered(expectedClustering); - SortOrder[] expectedOrdering = new SortOrder[]{ - Expressions.sort(Expressions.column(MetadataColumns.SPEC_ID.name()), SortDirection.ASCENDING), - Expressions.sort(Expressions.column(MetadataColumns.PARTITION_COLUMN_NAME), SortDirection.ASCENDING), - Expressions.sort(Expressions.column(MetadataColumns.FILE_PATH.name()), SortDirection.ASCENDING), - Expressions.sort(Expressions.column(MetadataColumns.ROW_POSITION.name()), SortDirection.ASCENDING), - Expressions.sort(Expressions.column("date"), SortDirection.ASCENDING), - Expressions.sort(Expressions.bucket(8, "data"), SortDirection.ASCENDING), - Expressions.sort(Expressions.column("id"), SortDirection.ASCENDING) - }; + SortOrder[] expectedOrdering = + new SortOrder[] { + Expressions.sort( + Expressions.column(MetadataColumns.SPEC_ID.name()), SortDirection.ASCENDING), + Expressions.sort( + Expressions.column(MetadataColumns.PARTITION_COLUMN_NAME), SortDirection.ASCENDING), + Expressions.sort( + Expressions.column(MetadataColumns.FILE_PATH.name()), SortDirection.ASCENDING), + Expressions.sort( + Expressions.column(MetadataColumns.ROW_POSITION.name()), SortDirection.ASCENDING), + Expressions.sort(Expressions.column("date"), SortDirection.ASCENDING), + Expressions.sort(Expressions.bucket(8, "data"), SortDirection.ASCENDING), + Expressions.sort(Expressions.column("id"), SortDirection.ASCENDING) + }; checkPositionDeltaDistributionAndOrdering(table, MERGE, expectedDistribution, expectedOrdering); } @Test public void testRangePositionDeltaMergePartitionedSortedTable() { - sql("CREATE TABLE %s (id BIGINT, data STRING, date DATE, ts TIMESTAMP) " + - "USING iceberg " + - "PARTITIONED BY (date)", tableName); + sql( + "CREATE TABLE %s (id BIGINT, data STRING, date DATE, ts TIMESTAMP) " + + "USING 
iceberg " + + "PARTITIONED BY (date)", + tableName); Table table = validationCatalog.loadTable(tableIdent); - table.updateProperties() - .set(MERGE_DISTRIBUTION_MODE, WRITE_DISTRIBUTION_MODE_RANGE) - .commit(); + table.updateProperties().set(MERGE_DISTRIBUTION_MODE, WRITE_DISTRIBUTION_MODE_RANGE).commit(); - table.replaceSortOrder() - .asc("id") - .commit(); + table.replaceSortOrder().asc("id").commit(); - SortOrder[] expectedDistributionOrdering = new SortOrder[]{ - Expressions.sort(Expressions.column(MetadataColumns.SPEC_ID.name()), SortDirection.ASCENDING), - Expressions.sort(Expressions.column(MetadataColumns.PARTITION_COLUMN_NAME), SortDirection.ASCENDING), - Expressions.sort(Expressions.column(MetadataColumns.FILE_PATH.name()), SortDirection.ASCENDING), - Expressions.sort(Expressions.column("date"), SortDirection.ASCENDING), - Expressions.sort(Expressions.column("id"), SortDirection.ASCENDING) - }; + SortOrder[] expectedDistributionOrdering = + new SortOrder[] { + Expressions.sort( + Expressions.column(MetadataColumns.SPEC_ID.name()), SortDirection.ASCENDING), + Expressions.sort( + Expressions.column(MetadataColumns.PARTITION_COLUMN_NAME), SortDirection.ASCENDING), + Expressions.sort( + Expressions.column(MetadataColumns.FILE_PATH.name()), SortDirection.ASCENDING), + Expressions.sort(Expressions.column("date"), SortDirection.ASCENDING), + Expressions.sort(Expressions.column("id"), SortDirection.ASCENDING) + }; Distribution expectedDistribution = Distributions.ordered(expectedDistributionOrdering); - SortOrder[] expectedOrdering = new SortOrder[]{ - Expressions.sort(Expressions.column(MetadataColumns.SPEC_ID.name()), SortDirection.ASCENDING), - Expressions.sort(Expressions.column(MetadataColumns.PARTITION_COLUMN_NAME), SortDirection.ASCENDING), - Expressions.sort(Expressions.column(MetadataColumns.FILE_PATH.name()), SortDirection.ASCENDING), - Expressions.sort(Expressions.column(MetadataColumns.ROW_POSITION.name()), SortDirection.ASCENDING), - Expressions.sort(Expressions.column("date"), SortDirection.ASCENDING), - Expressions.sort(Expressions.column("id"), SortDirection.ASCENDING) - }; + SortOrder[] expectedOrdering = + new SortOrder[] { + Expressions.sort( + Expressions.column(MetadataColumns.SPEC_ID.name()), SortDirection.ASCENDING), + Expressions.sort( + Expressions.column(MetadataColumns.PARTITION_COLUMN_NAME), SortDirection.ASCENDING), + Expressions.sort( + Expressions.column(MetadataColumns.FILE_PATH.name()), SortDirection.ASCENDING), + Expressions.sort( + Expressions.column(MetadataColumns.ROW_POSITION.name()), SortDirection.ASCENDING), + Expressions.sort(Expressions.column("date"), SortDirection.ASCENDING), + Expressions.sort(Expressions.column("id"), SortDirection.ASCENDING) + }; checkPositionDeltaDistributionAndOrdering(table, MERGE, expectedDistribution, expectedOrdering); } - private void checkWriteDistributionAndOrdering(Table table, Distribution expectedDistribution, - SortOrder[] expectedOrdering) { + private void checkWriteDistributionAndOrdering( + Table table, Distribution expectedDistribution, SortOrder[] expectedOrdering) { SparkWriteConf writeConf = new SparkWriteConf(spark, table, ImmutableMap.of()); DistributionMode distributionMode = writeConf.distributionMode(); - Distribution distribution = SparkDistributionAndOrderingUtil.buildRequiredDistribution(table, distributionMode); + Distribution distribution = + SparkDistributionAndOrderingUtil.buildRequiredDistribution(table, distributionMode); Assert.assertEquals("Distribution must match", expectedDistribution, 
distribution); - SortOrder[] ordering = SparkDistributionAndOrderingUtil.buildRequiredOrdering(table, distribution); + SortOrder[] ordering = + SparkDistributionAndOrderingUtil.buildRequiredOrdering(table, distribution); Assert.assertArrayEquals("Ordering must match", expectedOrdering, ordering); } - private void checkCopyOnWriteDistributionAndOrdering(Table table, Command command, - Distribution expectedDistribution, - SortOrder[] expectedOrdering) { + private void checkCopyOnWriteDistributionAndOrdering( + Table table, + Command command, + Distribution expectedDistribution, + SortOrder[] expectedOrdering) { SparkWriteConf writeConf = new SparkWriteConf(spark, table, ImmutableMap.of()); DistributionMode mode = copyOnWriteDistributionMode(command, writeConf); - Distribution distribution = SparkDistributionAndOrderingUtil.buildCopyOnWriteDistribution(table, command, mode); + Distribution distribution = + SparkDistributionAndOrderingUtil.buildCopyOnWriteDistribution(table, command, mode); Assert.assertEquals("Distribution must match", expectedDistribution, distribution); - SortOrder[] ordering = SparkDistributionAndOrderingUtil.buildCopyOnWriteOrdering(table, command, distribution); + SortOrder[] ordering = + SparkDistributionAndOrderingUtil.buildCopyOnWriteOrdering(table, command, distribution); Assert.assertArrayEquals("Ordering must match", expectedOrdering, ordering); } @@ -1993,21 +2052,26 @@ private DistributionMode copyOnWriteDistributionMode(Command command, SparkWrite } } - private void checkPositionDeltaDistributionAndOrdering(Table table, Command command, - Distribution expectedDistribution, - SortOrder[] expectedOrdering) { + private void checkPositionDeltaDistributionAndOrdering( + Table table, + Command command, + Distribution expectedDistribution, + SortOrder[] expectedOrdering) { SparkWriteConf writeConf = new SparkWriteConf(spark, table, ImmutableMap.of()); DistributionMode mode = positionDeltaDistributionMode(command, writeConf); - Distribution distribution = SparkDistributionAndOrderingUtil.buildPositionDeltaDistribution(table, command, mode); + Distribution distribution = + SparkDistributionAndOrderingUtil.buildPositionDeltaDistribution(table, command, mode); Assert.assertEquals("Distribution must match", expectedDistribution, distribution); - SortOrder[] ordering = SparkDistributionAndOrderingUtil.buildPositionDeltaOrdering(table, command); + SortOrder[] ordering = + SparkDistributionAndOrderingUtil.buildPositionDeltaOrdering(table, command); Assert.assertArrayEquals("Ordering must match", expectedOrdering, ordering); } - private DistributionMode positionDeltaDistributionMode(Command command, SparkWriteConf writeConf) { + private DistributionMode positionDeltaDistributionMode( + Command command, SparkWriteConf writeConf) { switch (command) { case DELETE: return writeConf.deleteDistributionMode(); diff --git a/spark/v3.2/spark/src/test/java/org/apache/iceberg/spark/TestSparkFilters.java b/spark/v3.2/spark/src/test/java/org/apache/iceberg/spark/TestSparkFilters.java index ccd526a54618..2e56b6aa91b0 100644 --- a/spark/v3.2/spark/src/test/java/org/apache/iceberg/spark/TestSparkFilters.java +++ b/spark/v3.2/spark/src/test/java/org/apache/iceberg/spark/TestSparkFilters.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.spark; import java.sql.Date; @@ -53,52 +52,60 @@ public void testQuotedAttributes() { attrMap.put("`d`.b.`dd```", "d.b.dd`"); attrMap.put("a.`aa```.c", "a.aa`.c"); - attrMap.forEach((quoted, unquoted) -> { - IsNull isNull = IsNull.apply(quoted); - Expression expectedIsNull = Expressions.isNull(unquoted); - Expression actualIsNull = SparkFilters.convert(isNull); - Assert.assertEquals("IsNull must match", expectedIsNull.toString(), actualIsNull.toString()); - - IsNotNull isNotNull = IsNotNull.apply(quoted); - Expression expectedIsNotNull = Expressions.notNull(unquoted); - Expression actualIsNotNull = SparkFilters.convert(isNotNull); - Assert.assertEquals("IsNotNull must match", expectedIsNotNull.toString(), actualIsNotNull.toString()); - - LessThan lt = LessThan.apply(quoted, 1); - Expression expectedLt = Expressions.lessThan(unquoted, 1); - Expression actualLt = SparkFilters.convert(lt); - Assert.assertEquals("LessThan must match", expectedLt.toString(), actualLt.toString()); - - LessThanOrEqual ltEq = LessThanOrEqual.apply(quoted, 1); - Expression expectedLtEq = Expressions.lessThanOrEqual(unquoted, 1); - Expression actualLtEq = SparkFilters.convert(ltEq); - Assert.assertEquals("LessThanOrEqual must match", expectedLtEq.toString(), actualLtEq.toString()); - - GreaterThan gt = GreaterThan.apply(quoted, 1); - Expression expectedGt = Expressions.greaterThan(unquoted, 1); - Expression actualGt = SparkFilters.convert(gt); - Assert.assertEquals("GreaterThan must match", expectedGt.toString(), actualGt.toString()); - - GreaterThanOrEqual gtEq = GreaterThanOrEqual.apply(quoted, 1); - Expression expectedGtEq = Expressions.greaterThanOrEqual(unquoted, 1); - Expression actualGtEq = SparkFilters.convert(gtEq); - Assert.assertEquals("GreaterThanOrEqual must match", expectedGtEq.toString(), actualGtEq.toString()); - - EqualTo eq = EqualTo.apply(quoted, 1); - Expression expectedEq = Expressions.equal(unquoted, 1); - Expression actualEq = SparkFilters.convert(eq); - Assert.assertEquals("EqualTo must match", expectedEq.toString(), actualEq.toString()); - - EqualNullSafe eqNullSafe = EqualNullSafe.apply(quoted, 1); - Expression expectedEqNullSafe = Expressions.equal(unquoted, 1); - Expression actualEqNullSafe = SparkFilters.convert(eqNullSafe); - Assert.assertEquals("EqualNullSafe must match", expectedEqNullSafe.toString(), actualEqNullSafe.toString()); - - In in = In.apply(quoted, new Integer[]{1}); - Expression expectedIn = Expressions.in(unquoted, 1); - Expression actualIn = SparkFilters.convert(in); - Assert.assertEquals("In must match", expectedIn.toString(), actualIn.toString()); - }); + attrMap.forEach( + (quoted, unquoted) -> { + IsNull isNull = IsNull.apply(quoted); + Expression expectedIsNull = Expressions.isNull(unquoted); + Expression actualIsNull = SparkFilters.convert(isNull); + Assert.assertEquals( + "IsNull must match", expectedIsNull.toString(), actualIsNull.toString()); + + IsNotNull isNotNull = IsNotNull.apply(quoted); + Expression expectedIsNotNull = Expressions.notNull(unquoted); + Expression actualIsNotNull = SparkFilters.convert(isNotNull); + Assert.assertEquals( + "IsNotNull must match", expectedIsNotNull.toString(), actualIsNotNull.toString()); + + LessThan lt = LessThan.apply(quoted, 1); + Expression expectedLt = Expressions.lessThan(unquoted, 1); + Expression actualLt = SparkFilters.convert(lt); + Assert.assertEquals("LessThan must match", expectedLt.toString(), actualLt.toString()); + + LessThanOrEqual ltEq = LessThanOrEqual.apply(quoted, 1); + 
Expression expectedLtEq = Expressions.lessThanOrEqual(unquoted, 1); + Expression actualLtEq = SparkFilters.convert(ltEq); + Assert.assertEquals( + "LessThanOrEqual must match", expectedLtEq.toString(), actualLtEq.toString()); + + GreaterThan gt = GreaterThan.apply(quoted, 1); + Expression expectedGt = Expressions.greaterThan(unquoted, 1); + Expression actualGt = SparkFilters.convert(gt); + Assert.assertEquals("GreaterThan must match", expectedGt.toString(), actualGt.toString()); + + GreaterThanOrEqual gtEq = GreaterThanOrEqual.apply(quoted, 1); + Expression expectedGtEq = Expressions.greaterThanOrEqual(unquoted, 1); + Expression actualGtEq = SparkFilters.convert(gtEq); + Assert.assertEquals( + "GreaterThanOrEqual must match", expectedGtEq.toString(), actualGtEq.toString()); + + EqualTo eq = EqualTo.apply(quoted, 1); + Expression expectedEq = Expressions.equal(unquoted, 1); + Expression actualEq = SparkFilters.convert(eq); + Assert.assertEquals("EqualTo must match", expectedEq.toString(), actualEq.toString()); + + EqualNullSafe eqNullSafe = EqualNullSafe.apply(quoted, 1); + Expression expectedEqNullSafe = Expressions.equal(unquoted, 1); + Expression actualEqNullSafe = SparkFilters.convert(eqNullSafe); + Assert.assertEquals( + "EqualNullSafe must match", + expectedEqNullSafe.toString(), + actualEqNullSafe.toString()); + + In in = In.apply(quoted, new Integer[] {1}); + Expression expectedIn = Expressions.in(unquoted, 1); + Expression actualIn = SparkFilters.convert(in); + Assert.assertEquals("In must match", expectedIn.toString(), actualIn.toString()); + }); } @Test @@ -111,10 +118,14 @@ public void testTimestampFilterConversion() { Expression timestampExpression = SparkFilters.convert(GreaterThan.apply("x", timestamp)); Expression rawExpression = Expressions.greaterThan("x", epochMicros); - Assert.assertEquals("Generated Timestamp expression should be correct", - rawExpression.toString(), timestampExpression.toString()); - Assert.assertEquals("Generated Instant expression should be correct", - rawExpression.toString(), instantExpression.toString()); + Assert.assertEquals( + "Generated Timestamp expression should be correct", + rawExpression.toString(), + timestampExpression.toString()); + Assert.assertEquals( + "Generated Instant expression should be correct", + rawExpression.toString(), + instantExpression.toString()); } @Test @@ -127,25 +138,31 @@ public void testDateFilterConversion() { Expression dateExpression = SparkFilters.convert(GreaterThan.apply("x", date)); Expression rawExpression = Expressions.greaterThan("x", epochDay); - Assert.assertEquals("Generated localdate expression should be correct", - rawExpression.toString(), localDateExpression.toString()); + Assert.assertEquals( + "Generated localdate expression should be correct", + rawExpression.toString(), + localDateExpression.toString()); - Assert.assertEquals("Generated date expression should be correct", - rawExpression.toString(), dateExpression.toString()); + Assert.assertEquals( + "Generated date expression should be correct", + rawExpression.toString(), + dateExpression.toString()); } @Test public void testNestedInInsideNot() { - Not filter = Not.apply(And.apply(EqualTo.apply("col1", 1), In.apply("col2", new Integer[]{1, 2}))); + Not filter = + Not.apply(And.apply(EqualTo.apply("col1", 1), In.apply("col2", new Integer[] {1, 2}))); Expression converted = SparkFilters.convert(filter); Assert.assertNull("Expression should not be converted", converted); } @Test public void testNotIn() { - Not filter = 
Not.apply(In.apply("col", new Integer[]{1, 2})); + Not filter = Not.apply(In.apply("col", new Integer[] {1, 2})); Expression actual = SparkFilters.convert(filter); - Expression expected = Expressions.and(Expressions.notNull("col"), Expressions.notIn("col", 1, 2)); + Expression expected = + Expressions.and(Expressions.notNull("col"), Expressions.notIn("col", 1, 2)); Assert.assertEquals("Expressions should match", expected.toString(), actual.toString()); } } diff --git a/spark/v3.2/spark/src/test/java/org/apache/iceberg/spark/TestSparkSchemaUtil.java b/spark/v3.2/spark/src/test/java/org/apache/iceberg/spark/TestSparkSchemaUtil.java index 40c77cbecbad..4283d44fa831 100644 --- a/spark/v3.2/spark/src/test/java/org/apache/iceberg/spark/TestSparkSchemaUtil.java +++ b/spark/v3.2/spark/src/test/java/org/apache/iceberg/spark/TestSparkSchemaUtil.java @@ -16,9 +16,10 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark; +import static org.apache.iceberg.types.Types.NestedField.optional; + import java.io.IOException; import java.util.List; import org.apache.iceberg.MetadataColumns; @@ -29,32 +30,30 @@ import org.junit.Assert; import org.junit.Test; -import static org.apache.iceberg.types.Types.NestedField.optional; - public class TestSparkSchemaUtil { - private static final Schema TEST_SCHEMA = new Schema( - optional(1, "id", Types.IntegerType.get()), - optional(2, "data", Types.StringType.get()) - ); + private static final Schema TEST_SCHEMA = + new Schema( + optional(1, "id", Types.IntegerType.get()), optional(2, "data", Types.StringType.get())); - private static final Schema TEST_SCHEMA_WITH_METADATA_COLS = new Schema( - optional(1, "id", Types.IntegerType.get()), - optional(2, "data", Types.StringType.get()), - MetadataColumns.FILE_PATH, - MetadataColumns.ROW_POSITION - ); + private static final Schema TEST_SCHEMA_WITH_METADATA_COLS = + new Schema( + optional(1, "id", Types.IntegerType.get()), + optional(2, "data", Types.StringType.get()), + MetadataColumns.FILE_PATH, + MetadataColumns.ROW_POSITION); @Test public void testEstimateSizeMaxValue() throws IOException { - Assert.assertEquals("estimateSize returns Long max value", Long.MAX_VALUE, - SparkSchemaUtil.estimateSize( - null, - Long.MAX_VALUE)); + Assert.assertEquals( + "estimateSize returns Long max value", + Long.MAX_VALUE, + SparkSchemaUtil.estimateSize(null, Long.MAX_VALUE)); } @Test public void testEstimateSizeWithOverflow() throws IOException { - long tableSize = SparkSchemaUtil.estimateSize(SparkSchemaUtil.convert(TEST_SCHEMA), Long.MAX_VALUE - 1); + long tableSize = + SparkSchemaUtil.estimateSize(SparkSchemaUtil.convert(TEST_SCHEMA), Long.MAX_VALUE - 1); Assert.assertEquals("estimateSize handles overflow", Long.MAX_VALUE, tableSize); } @@ -67,14 +66,17 @@ public void testEstimateSize() throws IOException { @Test public void testSchemaConversionWithMetaDataColumnSchema() { StructType structType = SparkSchemaUtil.convert(TEST_SCHEMA_WITH_METADATA_COLS); - List attrRefs = scala.collection.JavaConverters.seqAsJavaList(structType.toAttributes()); + List attrRefs = + scala.collection.JavaConverters.seqAsJavaList(structType.toAttributes()); for (AttributeReference attrRef : attrRefs) { if (MetadataColumns.isMetadataColumn(attrRef.name())) { - Assert.assertTrue("metadata columns should have __metadata_col in attribute metadata", - attrRef.metadata().contains(TypeToSparkType.METADATA_COL_ATTR_KEY) && - attrRef.metadata().getBoolean(TypeToSparkType.METADATA_COL_ATTR_KEY)); + 
Assert.assertTrue( + "metadata columns should have __metadata_col in attribute metadata", + attrRef.metadata().contains(TypeToSparkType.METADATA_COL_ATTR_KEY) + && attrRef.metadata().getBoolean(TypeToSparkType.METADATA_COL_ATTR_KEY)); } else { - Assert.assertFalse("non metadata columns should not have __metadata_col in attribute metadata", + Assert.assertFalse( + "non metadata columns should not have __metadata_col in attribute metadata", attrRef.metadata().contains(TypeToSparkType.METADATA_COL_ATTR_KEY)); } } diff --git a/spark/v3.2/spark/src/test/java/org/apache/iceberg/spark/TestSparkSessionCatalog.java b/spark/v3.2/spark/src/test/java/org/apache/iceberg/spark/TestSparkSessionCatalog.java index 3ab2d6b23a6f..6737dc64ff7e 100644 --- a/spark/v3.2/spark/src/test/java/org/apache/iceberg/spark/TestSparkSessionCatalog.java +++ b/spark/v3.2/spark/src/test/java/org/apache/iceberg/spark/TestSparkSessionCatalog.java @@ -16,14 +16,13 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark; +import static org.apache.hadoop.hive.conf.HiveConf.ConfVars.METASTOREURIS; + import org.junit.Assert; import org.junit.Test; -import static org.apache.hadoop.hive.conf.HiveConf.ConfVars.METASTOREURIS; - public class TestSparkSessionCatalog extends SparkTestBase { @Test public void testValidateHmsUri() { @@ -31,36 +30,60 @@ public void testValidateHmsUri() { String catalogHmsUriKey = "spark.sql.catalog.spark_catalog.uri"; String hmsUri = hiveConf.get(METASTOREURIS.varname); - spark.conf().set("spark.sql.catalog.spark_catalog", "org.apache.iceberg.spark.SparkSessionCatalog"); + spark + .conf() + .set("spark.sql.catalog.spark_catalog", "org.apache.iceberg.spark.SparkSessionCatalog"); spark.conf().set("spark.sql.catalog.spark_catalog.type", "hive"); // HMS uris match spark.sessionState().catalogManager().reset(); spark.conf().set(envHmsUriKey, hmsUri); spark.conf().set(catalogHmsUriKey, hmsUri); - Assert.assertTrue(spark.sessionState().catalogManager().v2SessionCatalog().defaultNamespace()[0].equals("default")); + Assert.assertTrue( + spark + .sessionState() + .catalogManager() + .v2SessionCatalog() + .defaultNamespace()[0] + .equals("default")); // HMS uris doesn't match spark.sessionState().catalogManager().reset(); String catalogHmsUri = "RandomString"; spark.conf().set(envHmsUriKey, hmsUri); spark.conf().set(catalogHmsUriKey, catalogHmsUri); - IllegalArgumentException exception = Assert.assertThrows(IllegalArgumentException.class, - () -> spark.sessionState().catalogManager().v2SessionCatalog()); - String errorMessage = String.format("Inconsistent Hive metastore URIs: %s (Spark session) != %s (spark_catalog)", - hmsUri, catalogHmsUri); + IllegalArgumentException exception = + Assert.assertThrows( + IllegalArgumentException.class, + () -> spark.sessionState().catalogManager().v2SessionCatalog()); + String errorMessage = + String.format( + "Inconsistent Hive metastore URIs: %s (Spark session) != %s (spark_catalog)", + hmsUri, catalogHmsUri); Assert.assertEquals(errorMessage, exception.getMessage()); // no env HMS uri, only catalog HMS uri spark.sessionState().catalogManager().reset(); spark.conf().set(catalogHmsUriKey, hmsUri); spark.conf().unset(envHmsUriKey); - Assert.assertTrue(spark.sessionState().catalogManager().v2SessionCatalog().defaultNamespace()[0].equals("default")); + Assert.assertTrue( + spark + .sessionState() + .catalogManager() + .v2SessionCatalog() + .defaultNamespace()[0] + .equals("default")); // no catalog HMS uri, only env HMS 
uri spark.sessionState().catalogManager().reset(); spark.conf().set(envHmsUriKey, hmsUri); spark.conf().unset(catalogHmsUriKey); - Assert.assertTrue(spark.sessionState().catalogManager().v2SessionCatalog().defaultNamespace()[0].equals("default")); + Assert.assertTrue( + spark + .sessionState() + .catalogManager() + .v2SessionCatalog() + .defaultNamespace()[0] + .equals("default")); } } diff --git a/spark/v3.2/spark/src/test/java/org/apache/iceberg/spark/TestSparkTableUtil.java b/spark/v3.2/spark/src/test/java/org/apache/iceberg/spark/TestSparkTableUtil.java index 61af0185d8bd..1e51caadd0e9 100644 --- a/spark/v3.2/spark/src/test/java/org/apache/iceberg/spark/TestSparkTableUtil.java +++ b/spark/v3.2/spark/src/test/java/org/apache/iceberg/spark/TestSparkTableUtil.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark; import java.io.IOException; @@ -57,31 +56,47 @@ public void testSparkPartitionJavaSerialization() throws IOException, ClassNotFo @Test public void testMetricsConfigKryoSerialization() throws Exception { - Map metricsConfig = ImmutableMap.of( - TableProperties.DEFAULT_WRITE_METRICS_MODE, "counts", - TableProperties.METRICS_MODE_COLUMN_CONF_PREFIX + "col1", "full", - TableProperties.METRICS_MODE_COLUMN_CONF_PREFIX + "col2", "truncate(16)"); + Map metricsConfig = + ImmutableMap.of( + TableProperties.DEFAULT_WRITE_METRICS_MODE, + "counts", + TableProperties.METRICS_MODE_COLUMN_CONF_PREFIX + "col1", + "full", + TableProperties.METRICS_MODE_COLUMN_CONF_PREFIX + "col2", + "truncate(16)"); MetricsConfig config = MetricsConfig.fromProperties(metricsConfig); MetricsConfig deserialized = KryoHelpers.roundTripSerialize(config); - Assert.assertEquals(MetricsModes.Full.get().toString(), deserialized.columnMode("col1").toString()); - Assert.assertEquals(MetricsModes.Truncate.withLength(16).toString(), deserialized.columnMode("col2").toString()); - Assert.assertEquals(MetricsModes.Counts.get().toString(), deserialized.columnMode("col3").toString()); + Assert.assertEquals( + MetricsModes.Full.get().toString(), deserialized.columnMode("col1").toString()); + Assert.assertEquals( + MetricsModes.Truncate.withLength(16).toString(), + deserialized.columnMode("col2").toString()); + Assert.assertEquals( + MetricsModes.Counts.get().toString(), deserialized.columnMode("col3").toString()); } @Test public void testMetricsConfigJavaSerialization() throws Exception { - Map metricsConfig = ImmutableMap.of( - TableProperties.DEFAULT_WRITE_METRICS_MODE, "counts", - TableProperties.METRICS_MODE_COLUMN_CONF_PREFIX + "col1", "full", - TableProperties.METRICS_MODE_COLUMN_CONF_PREFIX + "col2", "truncate(16)"); + Map metricsConfig = + ImmutableMap.of( + TableProperties.DEFAULT_WRITE_METRICS_MODE, + "counts", + TableProperties.METRICS_MODE_COLUMN_CONF_PREFIX + "col1", + "full", + TableProperties.METRICS_MODE_COLUMN_CONF_PREFIX + "col2", + "truncate(16)"); MetricsConfig config = MetricsConfig.fromProperties(metricsConfig); MetricsConfig deserialized = TestHelpers.roundTripSerialize(config); - Assert.assertEquals(MetricsModes.Full.get().toString(), deserialized.columnMode("col1").toString()); - Assert.assertEquals(MetricsModes.Truncate.withLength(16).toString(), deserialized.columnMode("col2").toString()); - Assert.assertEquals(MetricsModes.Counts.get().toString(), deserialized.columnMode("col3").toString()); + Assert.assertEquals( + MetricsModes.Full.get().toString(), deserialized.columnMode("col1").toString()); + Assert.assertEquals( + 
MetricsModes.Truncate.withLength(16).toString(), + deserialized.columnMode("col2").toString()); + Assert.assertEquals( + MetricsModes.Counts.get().toString(), deserialized.columnMode("col3").toString()); } } diff --git a/spark/v3.2/spark/src/test/java/org/apache/iceberg/spark/TestSparkValueConverter.java b/spark/v3.2/spark/src/test/java/org/apache/iceberg/spark/TestSparkValueConverter.java index 57941b8c7940..7f00c7edd8a9 100644 --- a/spark/v3.2/spark/src/test/java/org/apache/iceberg/spark/TestSparkValueConverter.java +++ b/spark/v3.2/spark/src/test/java/org/apache/iceberg/spark/TestSparkValueConverter.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark; import org.apache.iceberg.Schema; @@ -31,51 +30,55 @@ public class TestSparkValueConverter { @Test public void testSparkNullMapConvert() { - Schema schema = new Schema( - Types.NestedField.required(0, "id", Types.LongType.get()), - Types.NestedField.optional(5, "locations", Types.MapType.ofOptional(6, 7, - Types.StringType.get(), - Types.StructType.of( - Types.NestedField.required(1, "lat", Types.FloatType.get()), - Types.NestedField.required(2, "long", Types.FloatType.get()) - ) - )) - ); + Schema schema = + new Schema( + Types.NestedField.required(0, "id", Types.LongType.get()), + Types.NestedField.optional( + 5, + "locations", + Types.MapType.ofOptional( + 6, + 7, + Types.StringType.get(), + Types.StructType.of( + Types.NestedField.required(1, "lat", Types.FloatType.get()), + Types.NestedField.required(2, "long", Types.FloatType.get()))))); assertCorrectNullConversion(schema); } @Test public void testSparkNullListConvert() { - Schema schema = new Schema( - Types.NestedField.required(0, "id", Types.LongType.get()), - Types.NestedField.optional(5, "locations", - Types.ListType.ofOptional(6, Types.StringType.get()) - ) - ); + Schema schema = + new Schema( + Types.NestedField.required(0, "id", Types.LongType.get()), + Types.NestedField.optional( + 5, "locations", Types.ListType.ofOptional(6, Types.StringType.get()))); assertCorrectNullConversion(schema); } @Test public void testSparkNullStructConvert() { - Schema schema = new Schema( - Types.NestedField.required(0, "id", Types.LongType.get()), - Types.NestedField.optional(5, "location", Types.StructType.of( - Types.NestedField.required(1, "lat", Types.FloatType.get()), - Types.NestedField.required(2, "long", Types.FloatType.get()) - )) - ); + Schema schema = + new Schema( + Types.NestedField.required(0, "id", Types.LongType.get()), + Types.NestedField.optional( + 5, + "location", + Types.StructType.of( + Types.NestedField.required(1, "lat", Types.FloatType.get()), + Types.NestedField.required(2, "long", Types.FloatType.get())))); assertCorrectNullConversion(schema); } @Test public void testSparkNullPrimitiveConvert() { - Schema schema = new Schema( - Types.NestedField.required(0, "id", Types.LongType.get()), - Types.NestedField.optional(5, "location", Types.StringType.get()) - ); + Schema schema = + new Schema( + Types.NestedField.required(0, "id", Types.LongType.get()), + Types.NestedField.optional(5, "location", Types.StringType.get())); assertCorrectNullConversion(schema); } @@ -83,7 +86,8 @@ private void assertCorrectNullConversion(Schema schema) { Row sparkRow = RowFactory.create(1, null); Record record = GenericRecord.create(schema); record.set(0, 1); - Assert.assertEquals("Round-trip conversion should produce original value", + Assert.assertEquals( + "Round-trip conversion should produce original 
value", record, SparkValueConverter.convert(schema, sparkRow)); } diff --git a/spark/v3.2/spark/src/test/java/org/apache/iceberg/spark/actions/TestCreateActions.java b/spark/v3.2/spark/src/test/java/org/apache/iceberg/spark/actions/TestCreateActions.java index c5b1bf31b42e..96950e8227f3 100644 --- a/spark/v3.2/spark/src/test/java/org/apache/iceberg/spark/actions/TestCreateActions.java +++ b/spark/v3.2/spark/src/test/java/org/apache/iceberg/spark/actions/TestCreateActions.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.actions; import java.io.File; @@ -82,45 +81,61 @@ import scala.collection.Seq; public class TestCreateActions extends SparkCatalogTestBase { - private static final String CREATE_PARTITIONED_PARQUET = "CREATE TABLE %s (id INT, data STRING) " + - "using parquet PARTITIONED BY (id) LOCATION '%s'"; - private static final String CREATE_PARQUET = "CREATE TABLE %s (id INT, data STRING) " + - "using parquet LOCATION '%s'"; - private static final String CREATE_HIVE_EXTERNAL_PARQUET = "CREATE EXTERNAL TABLE %s (data STRING) " + - "PARTITIONED BY (id INT) STORED AS parquet LOCATION '%s'"; - private static final String CREATE_HIVE_PARQUET = "CREATE TABLE %s (data STRING) " + - "PARTITIONED BY (id INT) STORED AS parquet"; + private static final String CREATE_PARTITIONED_PARQUET = + "CREATE TABLE %s (id INT, data STRING) " + "using parquet PARTITIONED BY (id) LOCATION '%s'"; + private static final String CREATE_PARQUET = + "CREATE TABLE %s (id INT, data STRING) " + "using parquet LOCATION '%s'"; + private static final String CREATE_HIVE_EXTERNAL_PARQUET = + "CREATE EXTERNAL TABLE %s (data STRING) " + + "PARTITIONED BY (id INT) STORED AS parquet LOCATION '%s'"; + private static final String CREATE_HIVE_PARQUET = + "CREATE TABLE %s (data STRING) " + "PARTITIONED BY (id INT) STORED AS parquet"; private static final String NAMESPACE = "default"; @Parameterized.Parameters(name = "Catalog Name {0} - Options {2}") public static Object[][] parameters() { return new Object[][] { - new Object[] {"spark_catalog", SparkSessionCatalog.class.getName(), ImmutableMap.of( + new Object[] { + "spark_catalog", + SparkSessionCatalog.class.getName(), + ImmutableMap.of( "type", "hive", "default-namespace", "default", "parquet-enabled", "true", - "cache-enabled", "false" // Spark will delete tables using v1, leaving the cache out of sync - )}, - new Object[] {"spark_catalog", SparkSessionCatalog.class.getName(), ImmutableMap.of( + "cache-enabled", + "false" // Spark will delete tables using v1, leaving the cache out of sync + ) + }, + new Object[] { + "spark_catalog", + SparkSessionCatalog.class.getName(), + ImmutableMap.of( "type", "hadoop", "default-namespace", "default", "parquet-enabled", "true", - "cache-enabled", "false" // Spark will delete tables using v1, leaving the cache out of sync - )}, - new Object[] { "testhive", SparkCatalog.class.getName(), ImmutableMap.of( + "cache-enabled", + "false" // Spark will delete tables using v1, leaving the cache out of sync + ) + }, + new Object[] { + "testhive", + SparkCatalog.class.getName(), + ImmutableMap.of( "type", "hive", - "default-namespace", "default" - )}, - new Object[] { "testhadoop", SparkCatalog.class.getName(), ImmutableMap.of( + "default-namespace", "default") + }, + new Object[] { + "testhadoop", + SparkCatalog.class.getName(), + ImmutableMap.of( "type", "hadoop", - "default-namespace", "default" - )} + "default-namespace", "default") + } }; } - @Rule - public 
TemporaryFolder temp = new TemporaryFolder(); + @Rule public TemporaryFolder temp = new TemporaryFolder(); private String baseTableName = "baseTable"; private File tableDir; @@ -128,10 +143,7 @@ public static Object[][] parameters() { private final String type; private final TableCatalog catalog; - public TestCreateActions( - String catalogName, - String implementation, - Map config) { + public TestCreateActions(String catalogName, String implementation, Map config) { super(catalogName, implementation, config); this.catalog = (TableCatalog) spark.sessionState().catalogManager().catalog(catalogName); this.type = config.get("type"); @@ -152,15 +164,15 @@ public void before() { spark.conf().set("spark.sql.parquet.writeLegacyFormat", false); spark.sql(String.format("DROP TABLE IF EXISTS %s", baseTableName)); - List expected = Lists.newArrayList( - new SimpleRecord(1, "a"), - new SimpleRecord(2, "b"), - new SimpleRecord(3, "c") - ); + List expected = + Lists.newArrayList( + new SimpleRecord(1, "a"), new SimpleRecord(2, "b"), new SimpleRecord(3, "c")); Dataset df = spark.createDataFrame(expected, SimpleRecord.class); - df.select("id", "data").orderBy("data").write() + df.select("id", "data") + .orderBy("data") + .write() .mode("append") .option("path", tableLocation) .saveAsTable(baseTableName); @@ -175,7 +187,8 @@ public void after() throws IOException { @Test public void testMigratePartitioned() throws Exception { Assume.assumeTrue("Cannot migrate to a hadoop based catalog", !type.equals("hadoop")); - Assume.assumeTrue("Can only migrate from Spark Session Catalog", catalog.name().equals("spark_catalog")); + Assume.assumeTrue( + "Can only migrate from Spark Session Catalog", catalog.name().equals("spark_catalog")); String source = sourceName("test_migrate_partitioned_table"); String dest = source; createSourceTable(CREATE_PARTITIONED_PARQUET, source); @@ -185,17 +198,20 @@ public void testMigratePartitioned() throws Exception { @Test public void testPartitionedTableWithUnRecoveredPartitions() throws Exception { Assume.assumeTrue("Cannot migrate to a hadoop based catalog", !type.equals("hadoop")); - Assume.assumeTrue("Can only migrate from Spark Session Catalog", catalog.name().equals("spark_catalog")); + Assume.assumeTrue( + "Can only migrate from Spark Session Catalog", catalog.name().equals("spark_catalog")); String source = sourceName("test_unrecovered_partitions"); String dest = source; File location = temp.newFolder(); sql(CREATE_PARTITIONED_PARQUET, source, location); // Data generation and partition addition - spark.range(5) + spark + .range(5) .selectExpr("id", "cast(id as STRING) as data") .write() - .partitionBy("id").mode(SaveMode.Overwrite) + .partitionBy("id") + .mode(SaveMode.Overwrite) .parquet(location.toURI().toString()); sql("ALTER TABLE %s ADD PARTITION(id=0)", source); @@ -205,7 +221,8 @@ public void testPartitionedTableWithUnRecoveredPartitions() throws Exception { @Test public void testPartitionedTableWithCustomPartitions() throws Exception { Assume.assumeTrue("Cannot migrate to a hadoop based catalog", !type.equals("hadoop")); - Assume.assumeTrue("Can only migrate from Spark Session Catalog", catalog.name().equals("spark_catalog")); + Assume.assumeTrue( + "Can only migrate from Spark Session Catalog", catalog.name().equals("spark_catalog")); String source = sourceName("test_custom_parts"); String dest = source; File tblLocation = temp.newFolder(); @@ -213,18 +230,23 @@ public void testPartitionedTableWithCustomPartitions() throws Exception { // Data generation and 
partition addition spark.sql(String.format(CREATE_PARTITIONED_PARQUET, source, tblLocation)); - spark.range(10) + spark + .range(10) .selectExpr("cast(id as STRING) as data") .write() - .mode(SaveMode.Overwrite).parquet(partitionDataLoc.toURI().toString()); - sql("ALTER TABLE %s ADD PARTITION(id=0) LOCATION '%s'", source, partitionDataLoc.toURI().toString()); + .mode(SaveMode.Overwrite) + .parquet(partitionDataLoc.toURI().toString()); + sql( + "ALTER TABLE %s ADD PARTITION(id=0) LOCATION '%s'", + source, partitionDataLoc.toURI().toString()); assertMigratedFileCount(SparkActions.get().migrateTable(source), source, dest); } @Test public void testAddColumnOnMigratedTableAtEnd() throws Exception { Assume.assumeTrue("Cannot migrate to a hadoop based catalog", !type.equals("hadoop")); - Assume.assumeTrue("Can only migrate from Spark Session Catalog", catalog.name().equals("spark_catalog")); + Assume.assumeTrue( + "Can only migrate from Spark Session Catalog", catalog.name().equals("spark_catalog")); String source = sourceName("test_add_column_migrated_table"); String dest = source; createSourceTable(CREATE_PARQUET, source); @@ -263,7 +285,8 @@ public void testAddColumnOnMigratedTableAtEnd() throws Exception { @Test public void testAddColumnOnMigratedTableAtMiddle() throws Exception { Assume.assumeTrue("Cannot migrate to a hadoop based catalog", !type.equals("hadoop")); - Assume.assumeTrue("Can only migrate from Spark Session Catalog", catalog.name().equals("spark_catalog")); + Assume.assumeTrue( + "Can only migrate from Spark Session Catalog", catalog.name().equals("spark_catalog")); String source = sourceName("test_add_column_migrated_table_middle"); String dest = source; createSourceTable(CREATE_PARQUET, source); @@ -277,7 +300,10 @@ public void testAddColumnOnMigratedTableAtMiddle() throws Exception { // test column addition on migrated table Schema beforeSchema = table.schema(); String newCol1 = "newCol"; - sparkTable.table().updateSchema().addColumn("newCol", Types.IntegerType.get()) + sparkTable + .table() + .updateSchema() + .addColumn("newCol", Types.IntegerType.get()) .moveAfter(newCol1, "id") .commit(); Schema afterSchema = table.schema(); @@ -293,16 +319,20 @@ public void testAddColumnOnMigratedTableAtMiddle() throws Exception { @Test public void removeColumnsAtEnd() throws Exception { Assume.assumeTrue("Cannot migrate to a hadoop based catalog", !type.equals("hadoop")); - Assume.assumeTrue("Can only migrate from Spark Session Catalog", catalog.name().equals("spark_catalog")); + Assume.assumeTrue( + "Can only migrate from Spark Session Catalog", catalog.name().equals("spark_catalog")); String source = sourceName("test_remove_column_migrated_table"); String dest = source; String colName1 = "newCol1"; String colName2 = "newCol2"; File location = temp.newFolder(); - spark.range(10).selectExpr("cast(id as INT)", "CAST(id as INT) " + colName1, "CAST(id as INT) " + colName2) + spark + .range(10) + .selectExpr("cast(id as INT)", "CAST(id as INT) " + colName1, "CAST(id as INT) " + colName2) .write() - .mode(SaveMode.Overwrite).saveAsTable(dest); + .mode(SaveMode.Overwrite) + .saveAsTable(dest); List expected1 = sql("select id, %s from %s order by id", colName1, source); List expected2 = sql("select id from %s order by id", source); @@ -336,13 +366,19 @@ public void removeColumnsAtEnd() throws Exception { @Test public void removeColumnFromMiddle() throws Exception { Assume.assumeTrue("Cannot migrate to a hadoop based catalog", !type.equals("hadoop")); - Assume.assumeTrue("Can only migrate from 
Spark Session Catalog", catalog.name().equals("spark_catalog")); + Assume.assumeTrue( + "Can only migrate from Spark Session Catalog", catalog.name().equals("spark_catalog")); String source = sourceName("test_remove_column_migrated_table_from_middle"); String dest = source; String dropColumnName = "col1"; - spark.range(10).selectExpr("cast(id as INT)", "CAST(id as INT) as " + - dropColumnName, "CAST(id as INT) as col2").write().mode(SaveMode.Overwrite).saveAsTable(dest); + spark + .range(10) + .selectExpr( + "cast(id as INT)", "CAST(id as INT) as " + dropColumnName, "CAST(id as INT) as col2") + .write() + .mode(SaveMode.Overwrite) + .saveAsTable(dest); List expected = sql("select id, col2 from %s order by id", source); // migrate table @@ -362,7 +398,8 @@ public void removeColumnFromMiddle() throws Exception { @Test public void testMigrateUnpartitioned() throws Exception { Assume.assumeTrue("Cannot migrate to a hadoop based catalog", !type.equals("hadoop")); - Assume.assumeTrue("Can only migrate from Spark Session Catalog", catalog.name().equals("spark_catalog")); + Assume.assumeTrue( + "Can only migrate from Spark Session Catalog", catalog.name().equals("spark_catalog")); String source = sourceName("test_migrate_unpartitioned_table"); String dest = source; createSourceTable(CREATE_PARQUET, source); @@ -371,40 +408,49 @@ public void testMigrateUnpartitioned() throws Exception { @Test public void testSnapshotPartitioned() throws Exception { - Assume.assumeTrue("Cannot snapshot with arbitrary location in a hadoop based catalog", + Assume.assumeTrue( + "Cannot snapshot with arbitrary location in a hadoop based catalog", !type.equals("hadoop")); File location = temp.newFolder(); String source = sourceName("test_snapshot_partitioned_table"); String dest = destName("iceberg_snapshot_partitioned"); createSourceTable(CREATE_PARTITIONED_PARQUET, source); assertSnapshotFileCount( - SparkActions.get().snapshotTable(source).as(dest).tableLocation(location.toString()), source, dest); + SparkActions.get().snapshotTable(source).as(dest).tableLocation(location.toString()), + source, + dest); assertIsolatedSnapshot(source, dest); } @Test public void testSnapshotUnpartitioned() throws Exception { - Assume.assumeTrue("Cannot snapshot with arbitrary location in a hadoop based catalog", + Assume.assumeTrue( + "Cannot snapshot with arbitrary location in a hadoop based catalog", !type.equals("hadoop")); File location = temp.newFolder(); String source = sourceName("test_snapshot_unpartitioned_table"); String dest = destName("iceberg_snapshot_unpartitioned"); createSourceTable(CREATE_PARQUET, source); assertSnapshotFileCount( - SparkActions.get().snapshotTable(source).as(dest).tableLocation(location.toString()), source, dest); + SparkActions.get().snapshotTable(source).as(dest).tableLocation(location.toString()), + source, + dest); assertIsolatedSnapshot(source, dest); } @Test public void testSnapshotHiveTable() throws Exception { - Assume.assumeTrue("Cannot snapshot with arbitrary location in a hadoop based catalog", + Assume.assumeTrue( + "Cannot snapshot with arbitrary location in a hadoop based catalog", !type.equals("hadoop")); File location = temp.newFolder(); String source = sourceName("snapshot_hive_table"); String dest = destName("iceberg_snapshot_hive_table"); createSourceTable(CREATE_HIVE_EXTERNAL_PARQUET, source); assertSnapshotFileCount( - SparkActions.get().snapshotTable(source).as(dest).tableLocation(location.toString()), source, dest); + 
SparkActions.get().snapshotTable(source).as(dest).tableLocation(location.toString()), + source, + dest); assertIsolatedSnapshot(source, dest); } @@ -425,7 +471,9 @@ public void testSnapshotManagedHiveTable() throws Exception { String dest = destName("iceberg_snapshot_managed_hive_table"); createSourceTable(CREATE_HIVE_PARQUET, source); assertSnapshotFileCount( - SparkActions.get().snapshotTable(source).as(dest).tableLocation(location.toString()), source, dest); + SparkActions.get().snapshotTable(source).as(dest).tableLocation(location.toString()), + source, + dest); assertIsolatedSnapshot(source, dest); } @@ -437,7 +485,9 @@ public void testMigrateManagedHiveTable() throws Exception { String dest = destName("iceberg_migrate_managed_hive_table"); createSourceTable(CREATE_HIVE_PARQUET, source); assertSnapshotFileCount( - SparkActions.get().snapshotTable(source).as(dest).tableLocation(location.toString()), source, dest); + SparkActions.get().snapshotTable(source).as(dest).tableLocation(location.toString()), + source, + dest); } @Test @@ -449,11 +499,15 @@ public void testProperties() throws Exception { props.put("note", "Jazz"); createSourceTable(CREATE_PARQUET, source); for (Map.Entry keyValue : props.entrySet()) { - spark.sql(String.format("ALTER TABLE %s SET TBLPROPERTIES (\"%s\" = \"%s\")", - source, keyValue.getKey(), keyValue.getValue())); + spark.sql( + String.format( + "ALTER TABLE %s SET TBLPROPERTIES (\"%s\" = \"%s\")", + source, keyValue.getKey(), keyValue.getValue())); } assertSnapshotFileCount( - SparkActions.get().snapshotTable(source).as(dest).tableProperty("dogs", "sundance"), source, dest); + SparkActions.get().snapshotTable(source).as(dest).tableProperty("dogs", "sundance"), + source, + dest); SparkTable table = loadTable(dest); Map expectedProps = Maps.newHashMap(); @@ -464,8 +518,10 @@ public void testProperties() throws Exception { Assert.assertTrue( "Created table missing property " + entry.getKey(), table.properties().containsKey(entry.getKey())); - Assert.assertEquals("Property value is not the expected value", - entry.getValue(), table.properties().get(entry.getKey())); + Assert.assertEquals( + "Property value is not the expected value", + entry.getValue(), + table.properties().get(entry.getKey())); } } @@ -483,24 +539,38 @@ public void testSparkTableReservedProperties() throws Exception { String[] keys = {"provider", "format", "current-snapshot-id", "location", "sort-order"}; for (String entry : keys) { - Assert.assertTrue("Created table missing reserved property " + entry, table.properties().containsKey(entry)); + Assert.assertTrue( + "Created table missing reserved property " + entry, + table.properties().containsKey(entry)); } Assert.assertEquals("Unexpected provider", "iceberg", table.properties().get("provider")); Assert.assertEquals("Unexpected format", "iceberg/parquet", table.properties().get("format")); - Assert.assertNotEquals("No current-snapshot-id found", "none", table.properties().get("current-snapshot-id")); - Assert.assertTrue("Location isn't correct", table.properties().get("location").endsWith(destTableName)); + Assert.assertNotEquals( + "No current-snapshot-id found", "none", table.properties().get("current-snapshot-id")); + Assert.assertTrue( + "Location isn't correct", table.properties().get("location").endsWith(destTableName)); Assert.assertEquals("Unexpected format-version", "1", table.properties().get("format-version")); table.table().updateProperties().set("format-version", "2").commit(); Assert.assertEquals("Unexpected format-version", "2", 
table.properties().get("format-version")); - Assert.assertEquals("Sort-order isn't correct", "id ASC NULLS FIRST, data DESC NULLS LAST", + Assert.assertEquals( + "Sort-order isn't correct", + "id ASC NULLS FIRST, data DESC NULLS LAST", table.properties().get("sort-order")); - Assert.assertNull("Identifier fields should be null", table.properties().get("identifier-fields")); - - table.table().updateSchema().allowIncompatibleChanges().requireColumn("id").setIdentifierFields("id").commit(); - Assert.assertEquals("Identifier fields aren't correct", "[id]", table.properties().get("identifier-fields")); + Assert.assertNull( + "Identifier fields should be null", table.properties().get("identifier-fields")); + + table + .table() + .updateSchema() + .allowIncompatibleChanges() + .requireColumn("id") + .setIdentifierFields("id") + .commit(); + Assert.assertEquals( + "Identifier fields aren't correct", "[id]", table.properties().get("identifier-fields")); } @Test @@ -515,30 +585,37 @@ public void testSnapshotDefaultLocation() throws Exception { @Test public void schemaEvolutionTestWithSparkAPI() throws Exception { Assume.assumeTrue("Cannot migrate to a hadoop based catalog", !type.equals("hadoop")); - Assume.assumeTrue("Can only migrate from Spark Session Catalog", catalog.name().equals("spark_catalog")); + Assume.assumeTrue( + "Can only migrate from Spark Session Catalog", catalog.name().equals("spark_catalog")); File location = temp.newFolder(); String tblName = sourceName("schema_evolution_test"); // Data generation and partition addition - spark.range(0, 5) + spark + .range(0, 5) .selectExpr("CAST(id as INT) as col0", "CAST(id AS FLOAT) col2", "CAST(id AS LONG) col3") .write() .mode(SaveMode.Append) .parquet(location.toURI().toString()); - Dataset rowDataset = spark.range(6, 10) - .selectExpr("CAST(id as INT) as col0", "CAST(id AS STRING) col1", - "CAST(id AS FLOAT) col2", "CAST(id AS LONG) col3"); - rowDataset - .write() - .mode(SaveMode.Append) - .parquet(location.toURI().toString()); - spark.read() + Dataset rowDataset = + spark + .range(6, 10) + .selectExpr( + "CAST(id as INT) as col0", + "CAST(id AS STRING) col1", + "CAST(id AS FLOAT) col2", + "CAST(id AS LONG) col3"); + rowDataset.write().mode(SaveMode.Append).parquet(location.toURI().toString()); + spark + .read() .schema(rowDataset.schema()) - .parquet(location.toURI().toString()).write().saveAsTable(tblName); + .parquet(location.toURI().toString()) + .write() + .saveAsTable(tblName); List expectedBeforeAddColumn = sql("SELECT * FROM %s ORDER BY col0", tblName); - List expectedAfterAddColumn = sql("SELECT col0, null, col1, col2, col3 FROM %s ORDER BY col0", - tblName); + List expectedAfterAddColumn = + sql("SELECT col0, null, col1, col2, col3 FROM %s ORDER BY col0", tblName); // Migrate table SparkActions.get().migrateTable(tblName).execute(); @@ -549,7 +626,10 @@ public void schemaEvolutionTestWithSparkAPI() throws Exception { // Update schema and check output correctness SparkTable sparkTable = loadTable(tblName); - sparkTable.table().updateSchema().addColumn("newCol", Types.IntegerType.get()) + sparkTable + .table() + .updateSchema() + .addColumn("newCol", Types.IntegerType.get()) .moveAfter("newCol", "col0") .commit(); List afterMigarteAfterAddResults = sql("SELECT * FROM %s ORDER BY col0", tblName); @@ -559,23 +639,30 @@ public void schemaEvolutionTestWithSparkAPI() throws Exception { @Test public void schemaEvolutionTestWithSparkSQL() throws Exception { Assume.assumeTrue("Cannot migrate to a hadoop based catalog", 
!type.equals("hadoop")); - Assume.assumeTrue("Can only migrate from Spark Session Catalog", catalog.name().equals("spark_catalog")); + Assume.assumeTrue( + "Can only migrate from Spark Session Catalog", catalog.name().equals("spark_catalog")); String tblName = sourceName("schema_evolution_test_sql"); // Data generation and partition addition - spark.range(0, 5) + spark + .range(0, 5) .selectExpr("CAST(id as INT) col0", "CAST(id AS FLOAT) col1", "CAST(id AS STRING) col2") .write() .mode(SaveMode.Append) .saveAsTable(tblName); sql("ALTER TABLE %s ADD COLUMN col3 INT", tblName); - spark.range(6, 10) - .selectExpr("CAST(id AS INT) col0", "CAST(id AS FLOAT) col1", "CAST(id AS STRING) col2", "CAST(id AS INT) col3") + spark + .range(6, 10) + .selectExpr( + "CAST(id AS INT) col0", + "CAST(id AS FLOAT) col1", + "CAST(id AS STRING) col2", + "CAST(id AS INT) col3") .registerTempTable("tempdata"); sql("INSERT INTO TABLE %s SELECT * FROM tempdata", tblName); List expectedBeforeAddColumn = sql("SELECT * FROM %s ORDER BY col0", tblName); - List expectedAfterAddColumn = sql("SELECT col0, null, col1, col2, col3 FROM %s ORDER BY col0", - tblName); + List expectedAfterAddColumn = + sql("SELECT col0, null, col1, col2, col3 FROM %s ORDER BY col0", tblName); // Migrate table SparkActions.get().migrateTable(tblName).execute(); @@ -586,7 +673,10 @@ public void schemaEvolutionTestWithSparkSQL() throws Exception { // Update schema and check output correctness SparkTable sparkTable = loadTable(tblName); - sparkTable.table().updateSchema().addColumn("newCol", Types.IntegerType.get()) + sparkTable + .table() + .updateSchema() + .addColumn("newCol", Types.IntegerType.get()) .moveAfter("newCol", "col0") .commit(); List afterMigarteAfterAddResults = sql("SELECT * FROM %s ORDER BY col0", tblName); @@ -642,52 +732,70 @@ public void testTwoLevelList() throws IOException { StructType sparkSchema = new StructType( - new StructField[]{ - new StructField( - "col1", new ArrayType( - new StructType( - new StructField[]{ - new StructField( - "col2", - DataTypes.IntegerType, - false, - Metadata.empty()) - }), false), true, Metadata.empty())}); - - // even though this list looks like three level list, it is actually a 2-level list where the items are + new StructField[] { + new StructField( + "col1", + new ArrayType( + new StructType( + new StructField[] { + new StructField("col2", DataTypes.IntegerType, false, Metadata.empty()) + }), + false), + true, + Metadata.empty()) + }); + + // even though this list looks like three level list, it is actually a 2-level list where the + // items are // structs with 1 field. 
String expectedParquetSchema = - "message spark_schema {\n" + - " optional group col1 (LIST) {\n" + - " repeated group array {\n" + - " required int32 col2;\n" + - " }\n" + - " }\n" + - "}\n"; + "message spark_schema {\n" + + " optional group col1 (LIST) {\n" + + " repeated group array {\n" + + " required int32 col2;\n" + + " }\n" + + " }\n" + + "}\n"; // generate parquet file with required schema List testData = Collections.singletonList("{\"col1\": [{\"col2\": 1}]}"); - spark.read().schema(sparkSchema).json( - JavaSparkContext.fromSparkContext(spark.sparkContext()).parallelize(testData)) - .coalesce(1).write().format("parquet").mode(SaveMode.Append).save(location.getPath()); - - File parquetFile = Arrays.stream(Preconditions.checkNotNull(location.listFiles(new FilenameFilter() { - @Override - public boolean accept(File dir, String name) { - return name.endsWith("parquet"); - } - }))).findAny().get(); + spark + .read() + .schema(sparkSchema) + .json(JavaSparkContext.fromSparkContext(spark.sparkContext()).parallelize(testData)) + .coalesce(1) + .write() + .format("parquet") + .mode(SaveMode.Append) + .save(location.getPath()); + + File parquetFile = + Arrays.stream( + Preconditions.checkNotNull( + location.listFiles( + new FilenameFilter() { + @Override + public boolean accept(File dir, String name) { + return name.endsWith("parquet"); + } + }))) + .findAny() + .get(); // verify generated parquet file has expected schema - ParquetFileReader pqReader = ParquetFileReader.open(HadoopInputFile.fromPath(new Path(parquetFile.getPath()), - spark.sessionState().newHadoopConf())); + ParquetFileReader pqReader = + ParquetFileReader.open( + HadoopInputFile.fromPath( + new Path(parquetFile.getPath()), spark.sessionState().newHadoopConf())); MessageType schema = pqReader.getFooter().getFileMetaData().getSchema(); Assert.assertEquals(MessageTypeParser.parseMessageType(expectedParquetSchema), schema); // create sql table on top of it - sql("CREATE EXTERNAL TABLE %s (col1 ARRAY>)" + - " STORED AS parquet" + - " LOCATION '%s'", tableName, location); + sql( + "CREATE EXTERNAL TABLE %s (col1 ARRAY>)" + + " STORED AS parquet" + + " LOCATION '%s'", + tableName, location); List expected = sql("select array(struct(1))"); // migrate table @@ -704,9 +812,9 @@ private void threeLevelList(boolean useLegacyMode) throws Exception { String tableName = sourceName(String.format("threeLevelList_%s", useLegacyMode)); File location = temp.newFolder(); - sql("CREATE TABLE %s (col1 ARRAY>)" + - " STORED AS parquet" + - " LOCATION '%s'", tableName, location); + sql( + "CREATE TABLE %s (col1 ARRAY>)" + " STORED AS parquet" + " LOCATION '%s'", + tableName, location); int testValue = 12345; sql("INSERT INTO %s VALUES (ARRAY(STRUCT(%s)))", tableName, testValue); @@ -724,11 +832,14 @@ private void threeLevelList(boolean useLegacyMode) throws Exception { private void threeLevelListWithNestedStruct(boolean useLegacyMode) throws Exception { spark.conf().set("spark.sql.parquet.writeLegacyFormat", useLegacyMode); - String tableName = sourceName(String.format("threeLevelListWithNestedStruct_%s", useLegacyMode)); + String tableName = + sourceName(String.format("threeLevelListWithNestedStruct_%s", useLegacyMode)); File location = temp.newFolder(); - sql("CREATE TABLE %s (col1 ARRAY>>)" + - " STORED AS parquet" + - " LOCATION '%s'", tableName, location); + sql( + "CREATE TABLE %s (col1 ARRAY>>)" + + " STORED AS parquet" + + " LOCATION '%s'", + tableName, location); int testValue = 12345; sql("INSERT INTO %s VALUES 
(ARRAY(STRUCT(STRUCT(%s))))", tableName, testValue); @@ -748,13 +859,16 @@ private void threeLevelLists(boolean useLegacyMode) throws Exception { String tableName = sourceName(String.format("threeLevelLists_%s", useLegacyMode)); File location = temp.newFolder(); - sql("CREATE TABLE %s (col1 ARRAY>, col3 ARRAY>)" + - " STORED AS parquet" + - " LOCATION '%s'", tableName, location); + sql( + "CREATE TABLE %s (col1 ARRAY>, col3 ARRAY>)" + + " STORED AS parquet" + + " LOCATION '%s'", + tableName, location); int testValue1 = 12345; int testValue2 = 987654; - sql("INSERT INTO %s VALUES (ARRAY(STRUCT(%s)), ARRAY(STRUCT(%s)))", + sql( + "INSERT INTO %s VALUES (ARRAY(STRUCT(%s)), ARRAY(STRUCT(%s)))", tableName, testValue1, testValue2); List expected = sql(String.format("SELECT * FROM %s", tableName)); @@ -772,13 +886,14 @@ private void structOfThreeLevelLists(boolean useLegacyMode) throws Exception { String tableName = sourceName(String.format("structOfThreeLevelLists_%s", useLegacyMode)); File location = temp.newFolder(); - sql("CREATE TABLE %s (col1 STRUCT>>)" + - " STORED AS parquet" + - " LOCATION '%s'", tableName, location); + sql( + "CREATE TABLE %s (col1 STRUCT>>)" + + " STORED AS parquet" + + " LOCATION '%s'", + tableName, location); int testValue1 = 12345; - sql("INSERT INTO %s VALUES (STRUCT(STRUCT(ARRAY(STRUCT(%s)))))", - tableName, testValue1); + sql("INSERT INTO %s VALUES (STRUCT(STRUCT(ARRAY(STRUCT(%s)))))", tableName, testValue1); List expected = sql(String.format("SELECT * FROM %s", tableName)); // migrate table @@ -790,16 +905,19 @@ private void structOfThreeLevelLists(boolean useLegacyMode) throws Exception { assertEquals("Output must match", expected, results); } - private SparkTable loadTable(String name) throws NoSuchTableException, ParseException { - return (SparkTable) catalog.loadTable(Spark3Util.catalogAndIdentifier(spark, name).identifier()); + return (SparkTable) + catalog.loadTable(Spark3Util.catalogAndIdentifier(spark, name).identifier()); } private CatalogTable loadSessionTable(String name) throws NoSuchTableException, NoSuchDatabaseException, ParseException { Identifier identifier = Spark3Util.catalogAndIdentifier(spark, name).identifier(); Some namespace = Some.apply(identifier.namespace()[0]); - return spark.sessionState().catalog().getTableMetadata(new TableIdentifier(identifier.name(), namespace)); + return spark + .sessionState() + .catalog() + .getTableMetadata(new TableIdentifier(identifier.name(), namespace)); } private void createSourceTable(String createStatement, String tableName) @@ -809,41 +927,57 @@ private void createSourceTable(String createStatement, String tableName) CatalogTable table = loadSessionTable(tableName); Seq partitionColumns = table.partitionColumnNames(); String format = table.provider().get(); - spark.table(baseTableName).write().mode(SaveMode.Append).format(format).partitionBy(partitionColumns.toSeq()) + spark + .table(baseTableName) + .write() + .mode(SaveMode.Append) + .format(format) + .partitionBy(partitionColumns.toSeq()) .saveAsTable(tableName); } - // Counts the number of files in the source table, makes sure the same files exist in the destination table + // Counts the number of files in the source table, makes sure the same files exist in the + // destination table private void assertMigratedFileCount(MigrateTable migrateAction, String source, String dest) throws NoSuchTableException, NoSuchDatabaseException, ParseException { long expectedFiles = expectedFilesCount(source); MigrateTable.Result migratedFiles = 
migrateAction.execute(); validateTables(source, dest); - Assert.assertEquals("Expected number of migrated files", - expectedFiles, migratedFiles.migratedDataFilesCount()); + Assert.assertEquals( + "Expected number of migrated files", expectedFiles, migratedFiles.migratedDataFilesCount()); } - // Counts the number of files in the source table, makes sure the same files exist in the destination table + // Counts the number of files in the source table, makes sure the same files exist in the + // destination table private void assertSnapshotFileCount(SnapshotTable snapshotTable, String source, String dest) throws NoSuchTableException, NoSuchDatabaseException, ParseException { long expectedFiles = expectedFilesCount(source); SnapshotTable.Result snapshotTableResult = snapshotTable.execute(); validateTables(source, dest); - Assert.assertEquals("Expected number of imported snapshot files", expectedFiles, + Assert.assertEquals( + "Expected number of imported snapshot files", + expectedFiles, snapshotTableResult.importedDataFilesCount()); } - private void validateTables(String source, String dest) throws NoSuchTableException, ParseException { + private void validateTables(String source, String dest) + throws NoSuchTableException, ParseException { List expected = spark.table(source).collectAsList(); SparkTable destTable = loadTable(dest); - Assert.assertEquals("Provider should be iceberg", "iceberg", + Assert.assertEquals( + "Provider should be iceberg", + "iceberg", destTable.properties().get(TableCatalog.PROP_PROVIDER)); List actual = spark.table(dest).collectAsList(); - Assert.assertTrue(String.format("Rows in migrated table did not match\nExpected :%s rows \nFound :%s", - expected, actual), expected.containsAll(actual) && actual.containsAll(expected)); + Assert.assertTrue( + String.format( + "Rows in migrated table did not match\nExpected :%s rows \nFound :%s", + expected, actual), + expected.containsAll(actual) && actual.containsAll(expected)); } - private long expectedFilesCount(String source) throws NoSuchDatabaseException, NoSuchTableException, ParseException { + private long expectedFilesCount(String source) + throws NoSuchDatabaseException, NoSuchTableException, ParseException { CatalogTable sourceTable = loadSessionTable(source); List uris; if (sourceTable.partitionColumnNames().size() == 0) { @@ -851,34 +985,42 @@ private long expectedFilesCount(String source) throws NoSuchDatabaseException, N uris.add(sourceTable.location()); } else { Seq catalogTablePartitionSeq = - spark.sessionState().catalog().listPartitions(sourceTable.identifier(), Option.apply(null)); - uris = JavaConverters.seqAsJavaList(catalogTablePartitionSeq) - .stream() - .map(CatalogTablePartition::location) - .collect(Collectors.toList()); + spark + .sessionState() + .catalog() + .listPartitions(sourceTable.identifier(), Option.apply(null)); + uris = + JavaConverters.seqAsJavaList(catalogTablePartitionSeq).stream() + .map(CatalogTablePartition::location) + .collect(Collectors.toList()); } return uris.stream() - .flatMap(uri -> - FileUtils.listFiles(Paths.get(uri).toFile(), - TrueFileFilter.INSTANCE, TrueFileFilter.INSTANCE).stream()) - .filter(file -> !file.toString().endsWith("crc") && !file.toString().contains("_SUCCESS")).count(); + .flatMap( + uri -> + FileUtils.listFiles( + Paths.get(uri).toFile(), TrueFileFilter.INSTANCE, TrueFileFilter.INSTANCE) + .stream()) + .filter(file -> !file.toString().endsWith("crc") && !file.toString().contains("_SUCCESS")) + .count(); } - // Insert records into the destination, makes 
sure those records exist and source table is unchanged + // Insert records into the destination, makes sure those records exist and source table is + // unchanged private void assertIsolatedSnapshot(String source, String dest) { List expected = spark.sql(String.format("SELECT * FROM %s", source)).collectAsList(); - List extraData = Lists.newArrayList( - new SimpleRecord(4, "d") - ); + List extraData = Lists.newArrayList(new SimpleRecord(4, "d")); Dataset df = spark.createDataFrame(extraData, SimpleRecord.class); df.write().format("iceberg").mode("append").saveAsTable(dest); List result = spark.sql(String.format("SELECT * FROM %s", source)).collectAsList(); - Assert.assertEquals("No additional rows should be added to the original table", expected.size(), - result.size()); + Assert.assertEquals( + "No additional rows should be added to the original table", expected.size(), result.size()); - List snapshot = spark.sql(String.format("SELECT * FROM %s WHERE id = 4 AND data = 'd'", dest)).collectAsList(); + List snapshot = + spark + .sql(String.format("SELECT * FROM %s WHERE id = 4 AND data = 'd'", dest)) + .collectAsList(); Assert.assertEquals("Added row not found in snapshot", 1, snapshot.size()); } diff --git a/spark/v3.2/spark/src/test/java/org/apache/iceberg/spark/actions/TestDeleteReachableFilesAction.java b/spark/v3.2/spark/src/test/java/org/apache/iceberg/spark/actions/TestDeleteReachableFilesAction.java index 9792d772b12d..9090da2fe69b 100644 --- a/spark/v3.2/spark/src/test/java/org/apache/iceberg/spark/actions/TestDeleteReachableFilesAction.java +++ b/spark/v3.2/spark/src/test/java/org/apache/iceberg/spark/actions/TestDeleteReachableFilesAction.java @@ -16,9 +16,10 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.spark.actions; +import static org.apache.iceberg.types.Types.NestedField.optional; + import java.io.File; import java.util.Set; import java.util.concurrent.ConcurrentHashMap; @@ -54,46 +55,47 @@ import org.junit.Test; import org.junit.rules.TemporaryFolder; -import static org.apache.iceberg.types.Types.NestedField.optional; - public class TestDeleteReachableFilesAction extends SparkTestBase { private static final HadoopTables TABLES = new HadoopTables(new Configuration()); - private static final Schema SCHEMA = new Schema( - optional(1, "c1", Types.IntegerType.get()), - optional(2, "c2", Types.StringType.get()), - optional(3, "c3", Types.StringType.get()) - ); + private static final Schema SCHEMA = + new Schema( + optional(1, "c1", Types.IntegerType.get()), + optional(2, "c2", Types.StringType.get()), + optional(3, "c3", Types.StringType.get())); private static final int SHUFFLE_PARTITIONS = 2; private static final PartitionSpec SPEC = PartitionSpec.builderFor(SCHEMA).identity("c1").build(); - static final DataFile FILE_A = DataFiles.builder(SPEC) - .withPath("/path/to/data-a.parquet") - .withFileSizeInBytes(10) - .withPartition(TestHelpers.Row.of(0)) - .withRecordCount(1) - .build(); - static final DataFile FILE_B = DataFiles.builder(SPEC) - .withPath("/path/to/data-b.parquet") - .withFileSizeInBytes(10) - .withPartition(TestHelpers.Row.of(1)) - .withRecordCount(1) - .build(); - static final DataFile FILE_C = DataFiles.builder(SPEC) - .withPath("/path/to/data-c.parquet") - .withFileSizeInBytes(10) - .withPartition(TestHelpers.Row.of(2)) - .withRecordCount(1) - .build(); - static final DataFile FILE_D = DataFiles.builder(SPEC) - .withPath("/path/to/data-d.parquet") - .withFileSizeInBytes(10) - .withPartition(TestHelpers.Row.of(3)) - .withRecordCount(1) - .build(); - - @Rule - public TemporaryFolder temp = new TemporaryFolder(); + static final DataFile FILE_A = + DataFiles.builder(SPEC) + .withPath("/path/to/data-a.parquet") + .withFileSizeInBytes(10) + .withPartition(TestHelpers.Row.of(0)) + .withRecordCount(1) + .build(); + static final DataFile FILE_B = + DataFiles.builder(SPEC) + .withPath("/path/to/data-b.parquet") + .withFileSizeInBytes(10) + .withPartition(TestHelpers.Row.of(1)) + .withRecordCount(1) + .build(); + static final DataFile FILE_C = + DataFiles.builder(SPEC) + .withPath("/path/to/data-c.parquet") + .withFileSizeInBytes(10) + .withPartition(TestHelpers.Row.of(2)) + .withRecordCount(1) + .build(); + static final DataFile FILE_D = + DataFiles.builder(SPEC) + .withPath("/path/to/data-d.parquet") + .withFileSizeInBytes(10) + .withPartition(TestHelpers.Row.of(3)) + .withRecordCount(1) + .build(); + + @Rule public TemporaryFolder temp = new TemporaryFolder(); private Table table; @@ -105,62 +107,76 @@ public void setupTableLocation() throws Exception { spark.conf().set("spark.sql.shuffle.partitions", SHUFFLE_PARTITIONS); } - private void checkRemoveFilesResults(long expectedDatafiles, long expectedManifestsDeleted, - long expectedManifestListsDeleted, long expectedOtherFilesDeleted, - DeleteReachableFiles.Result results) { - Assert.assertEquals("Incorrect number of manifest files deleted", - expectedManifestsDeleted, results.deletedManifestsCount()); - Assert.assertEquals("Incorrect number of datafiles deleted", - expectedDatafiles, results.deletedDataFilesCount()); - Assert.assertEquals("Incorrect number of manifest lists deleted", - expectedManifestListsDeleted, results.deletedManifestListsCount()); - Assert.assertEquals("Incorrect number of 
other lists deleted", - expectedOtherFilesDeleted, results.deletedOtherFilesCount()); + private void checkRemoveFilesResults( + long expectedDatafiles, + long expectedManifestsDeleted, + long expectedManifestListsDeleted, + long expectedOtherFilesDeleted, + DeleteReachableFiles.Result results) { + Assert.assertEquals( + "Incorrect number of manifest files deleted", + expectedManifestsDeleted, + results.deletedManifestsCount()); + Assert.assertEquals( + "Incorrect number of datafiles deleted", + expectedDatafiles, + results.deletedDataFilesCount()); + Assert.assertEquals( + "Incorrect number of manifest lists deleted", + expectedManifestListsDeleted, + results.deletedManifestListsCount()); + Assert.assertEquals( + "Incorrect number of other lists deleted", + expectedOtherFilesDeleted, + results.deletedOtherFilesCount()); } @Test public void dataFilesCleanupWithParallelTasks() { - table.newFastAppend() - .appendFile(FILE_A) - .commit(); + table.newFastAppend().appendFile(FILE_A).commit(); - table.newFastAppend() - .appendFile(FILE_B) - .commit(); + table.newFastAppend().appendFile(FILE_B).commit(); - table.newRewrite() - .rewriteFiles(ImmutableSet.of(FILE_B), ImmutableSet.of(FILE_D)) - .commit(); + table.newRewrite().rewriteFiles(ImmutableSet.of(FILE_B), ImmutableSet.of(FILE_D)).commit(); - table.newRewrite() - .rewriteFiles(ImmutableSet.of(FILE_A), ImmutableSet.of(FILE_C)) - .commit(); + table.newRewrite().rewriteFiles(ImmutableSet.of(FILE_A), ImmutableSet.of(FILE_C)).commit(); Set deletedFiles = ConcurrentHashMap.newKeySet(); Set deleteThreads = ConcurrentHashMap.newKeySet(); AtomicInteger deleteThreadsIndex = new AtomicInteger(0); - DeleteReachableFiles.Result result = sparkActions().deleteReachableFiles(metadataLocation(table)) - .io(table.io()) - .executeDeleteWith(Executors.newFixedThreadPool(4, runnable -> { - Thread thread = new Thread(runnable); - thread.setName("remove-files-" + deleteThreadsIndex.getAndIncrement()); - thread.setDaemon(true); // daemon threads will be terminated abruptly when the JVM exits - return thread; - })) - .deleteWith(s -> { - deleteThreads.add(Thread.currentThread().getName()); - deletedFiles.add(s); - }) - .execute(); - - // Verifies that the delete methods ran in the threads created by the provided ExecutorService ThreadFactory - Assert.assertEquals(deleteThreads, + DeleteReachableFiles.Result result = + sparkActions() + .deleteReachableFiles(metadataLocation(table)) + .io(table.io()) + .executeDeleteWith( + Executors.newFixedThreadPool( + 4, + runnable -> { + Thread thread = new Thread(runnable); + thread.setName("remove-files-" + deleteThreadsIndex.getAndIncrement()); + thread.setDaemon( + true); // daemon threads will be terminated abruptly when the JVM exits + return thread; + })) + .deleteWith( + s -> { + deleteThreads.add(Thread.currentThread().getName()); + deletedFiles.add(s); + }) + .execute(); + + // Verifies that the delete methods ran in the threads created by the provided ExecutorService + // ThreadFactory + Assert.assertEquals( + deleteThreads, Sets.newHashSet("remove-files-0", "remove-files-1", "remove-files-2", "remove-files-3")); - Lists.newArrayList(FILE_A, FILE_B, FILE_C, FILE_D).forEach(file -> - Assert.assertTrue("FILE_A should be deleted", deletedFiles.contains(FILE_A.path().toString())) - ); + Lists.newArrayList(FILE_A, FILE_B, FILE_C, FILE_D) + .forEach( + file -> + Assert.assertTrue( + "FILE_A should be deleted", deletedFiles.contains(FILE_A.path().toString()))); checkRemoveFilesResults(4L, 6L, 4L, 6, result); } @@ -168,64 
+184,43 @@ public void dataFilesCleanupWithParallelTasks() { public void testWithExpiringDanglingStageCommit() { table.location(); // `A` commit - table.newAppend() - .appendFile(FILE_A) - .commit(); + table.newAppend().appendFile(FILE_A).commit(); // `B` staged commit - table.newAppend() - .appendFile(FILE_B) - .stageOnly() - .commit(); + table.newAppend().appendFile(FILE_B).stageOnly().commit(); // `C` commit - table.newAppend() - .appendFile(FILE_C) - .commit(); + table.newAppend().appendFile(FILE_C).commit(); - DeleteReachableFiles.Result result = sparkActions().deleteReachableFiles(metadataLocation(table)) - .io(table.io()) - .execute(); + DeleteReachableFiles.Result result = + sparkActions().deleteReachableFiles(metadataLocation(table)).io(table.io()).execute(); checkRemoveFilesResults(3L, 3L, 3L, 5, result); } @Test public void testRemoveFileActionOnEmptyTable() { - DeleteReachableFiles.Result result = sparkActions().deleteReachableFiles(metadataLocation(table)) - .io(table.io()) - .execute(); + DeleteReachableFiles.Result result = + sparkActions().deleteReachableFiles(metadataLocation(table)).io(table.io()).execute(); checkRemoveFilesResults(0, 0, 0, 2, result); } @Test public void testRemoveFilesActionWithReducedVersionsTable() { - table.updateProperties() - .set(TableProperties.METADATA_PREVIOUS_VERSIONS_MAX, "2").commit(); - table.newAppend() - .appendFile(FILE_A) - .commit(); - - table.newAppend() - .appendFile(FILE_B) - .commit(); - - table.newAppend() - .appendFile(FILE_B) - .commit(); - - table.newAppend() - .appendFile(FILE_C) - .commit(); - - table.newAppend() - .appendFile(FILE_D) - .commit(); - - DeleteReachableFiles baseRemoveFilesSparkAction = sparkActions() - .deleteReachableFiles(metadataLocation(table)) - .io(table.io()); + table.updateProperties().set(TableProperties.METADATA_PREVIOUS_VERSIONS_MAX, "2").commit(); + table.newAppend().appendFile(FILE_A).commit(); + + table.newAppend().appendFile(FILE_B).commit(); + + table.newAppend().appendFile(FILE_B).commit(); + + table.newAppend().appendFile(FILE_C).commit(); + + table.newAppend().appendFile(FILE_D).commit(); + + DeleteReachableFiles baseRemoveFilesSparkAction = + sparkActions().deleteReachableFiles(metadataLocation(table)).io(table.io()); DeleteReachableFiles.Result result = baseRemoveFilesSparkAction.execute(); checkRemoveFilesResults(4, 5, 5, 8, result); @@ -233,113 +228,101 @@ public void testRemoveFilesActionWithReducedVersionsTable() { @Test public void testRemoveFilesAction() { - table.newAppend() - .appendFile(FILE_A) - .commit(); - - table.newAppend() - .appendFile(FILE_B) - .commit(); - - DeleteReachableFiles baseRemoveFilesSparkAction = sparkActions() - .deleteReachableFiles(metadataLocation(table)) - .io(table.io()); - checkRemoveFilesResults(2, 2, 2, 4, baseRemoveFilesSparkAction.execute()); + table.newAppend().appendFile(FILE_A).commit(); + + table.newAppend().appendFile(FILE_B).commit(); + + DeleteReachableFiles baseRemoveFilesSparkAction = + sparkActions().deleteReachableFiles(metadataLocation(table)).io(table.io()); + checkRemoveFilesResults(2, 2, 2, 4, baseRemoveFilesSparkAction.execute()); } @Test public void testRemoveFilesActionWithDefaultIO() { - table.newAppend() - .appendFile(FILE_A) - .commit(); + table.newAppend().appendFile(FILE_A).commit(); - table.newAppend() - .appendFile(FILE_B) - .commit(); + table.newAppend().appendFile(FILE_B).commit(); // IO not set explicitly on removeReachableFiles action // IO defaults to HadoopFileIO - DeleteReachableFiles baseRemoveFilesSparkAction = 
sparkActions() - .deleteReachableFiles(metadataLocation(table)); - checkRemoveFilesResults(2, 2, 2, 4, baseRemoveFilesSparkAction.execute()); + DeleteReachableFiles baseRemoveFilesSparkAction = + sparkActions().deleteReachableFiles(metadataLocation(table)); + checkRemoveFilesResults(2, 2, 2, 4, baseRemoveFilesSparkAction.execute()); } @Test public void testUseLocalIterator() { - table.newFastAppend() - .appendFile(FILE_A) - .commit(); + table.newFastAppend().appendFile(FILE_A).commit(); - table.newOverwrite() - .deleteFile(FILE_A) - .addFile(FILE_B) - .commit(); + table.newOverwrite().deleteFile(FILE_A).addFile(FILE_B).commit(); - table.newFastAppend() - .appendFile(FILE_C) - .commit(); + table.newFastAppend().appendFile(FILE_C).commit(); int jobsBefore = spark.sparkContext().dagScheduler().nextJobId().get(); - withSQLConf(ImmutableMap.of("spark.sql.adaptive.enabled", "false"), () -> { - DeleteReachableFiles.Result results = sparkActions().deleteReachableFiles(metadataLocation(table)) - .io(table.io()) - .option("stream-results", "true").execute(); - - int jobsAfter = spark.sparkContext().dagScheduler().nextJobId().get(); - int totalJobsRun = jobsAfter - jobsBefore; - - checkRemoveFilesResults(3L, 4L, 3L, 5, results); - - Assert.assertEquals( - "Expected total jobs to be equal to total number of shuffle partitions", - totalJobsRun, SHUFFLE_PARTITIONS); - }); + withSQLConf( + ImmutableMap.of("spark.sql.adaptive.enabled", "false"), + () -> { + DeleteReachableFiles.Result results = + sparkActions() + .deleteReachableFiles(metadataLocation(table)) + .io(table.io()) + .option("stream-results", "true") + .execute(); + + int jobsAfter = spark.sparkContext().dagScheduler().nextJobId().get(); + int totalJobsRun = jobsAfter - jobsBefore; + + checkRemoveFilesResults(3L, 4L, 3L, 5, results); + + Assert.assertEquals( + "Expected total jobs to be equal to total number of shuffle partitions", + totalJobsRun, + SHUFFLE_PARTITIONS); + }); } @Test public void testIgnoreMetadataFilesNotFound() { - table.updateProperties() - .set(TableProperties.METADATA_PREVIOUS_VERSIONS_MAX, "1").commit(); + table.updateProperties().set(TableProperties.METADATA_PREVIOUS_VERSIONS_MAX, "1").commit(); - table.newAppend() - .appendFile(FILE_A) - .commit(); + table.newAppend().appendFile(FILE_A).commit(); // There are three metadata json files at this point DeleteOrphanFiles.Result result = sparkActions().deleteOrphanFiles(table).olderThan(System.currentTimeMillis()).execute(); Assert.assertEquals("Should delete 1 file", 1, Iterables.size(result.orphanFileLocations())); - Assert.assertTrue("Should remove v1 file", + Assert.assertTrue( + "Should remove v1 file", StreamSupport.stream(result.orphanFileLocations().spliterator(), false) .anyMatch(file -> file.contains("v1.metadata.json"))); - DeleteReachableFiles baseRemoveFilesSparkAction = sparkActions() - .deleteReachableFiles(metadataLocation(table)) - .io(table.io()); + DeleteReachableFiles baseRemoveFilesSparkAction = + sparkActions().deleteReachableFiles(metadataLocation(table)).io(table.io()); DeleteReachableFiles.Result res = baseRemoveFilesSparkAction.execute(); - checkRemoveFilesResults(1, 1, 1, 4, res); + checkRemoveFilesResults(1, 1, 1, 4, res); } @Test public void testEmptyIOThrowsException() { - DeleteReachableFiles baseRemoveFilesSparkAction = sparkActions() - .deleteReachableFiles(metadataLocation(table)) - .io(null); - AssertHelpers.assertThrows("FileIO can't be null in DeleteReachableFiles action", - IllegalArgumentException.class, "File IO cannot be null", + 
DeleteReachableFiles baseRemoveFilesSparkAction = + sparkActions().deleteReachableFiles(metadataLocation(table)).io(null); + AssertHelpers.assertThrows( + "FileIO can't be null in DeleteReachableFiles action", + IllegalArgumentException.class, + "File IO cannot be null", baseRemoveFilesSparkAction::execute); } @Test public void testRemoveFilesActionWhenGarbageCollectionDisabled() { - table.updateProperties() - .set(TableProperties.GC_ENABLED, "false") - .commit(); + table.updateProperties().set(TableProperties.GC_ENABLED, "false").commit(); - AssertHelpers.assertThrows("Should complain about removing files when GC is disabled", - ValidationException.class, "Cannot delete files: GC is disabled (deleting files may corrupt other tables)", + AssertHelpers.assertThrows( + "Should complain about removing files when GC is disabled", + ValidationException.class, + "Cannot delete files: GC is disabled (deleting files may corrupt other tables)", () -> sparkActions().deleteReachableFiles(metadataLocation(table)).execute()); } diff --git a/spark/v3.2/spark/src/test/java/org/apache/iceberg/spark/actions/TestExpireSnapshotsAction.java b/spark/v3.2/spark/src/test/java/org/apache/iceberg/spark/actions/TestExpireSnapshotsAction.java index 232a85f9840f..6c6240a3b589 100644 --- a/spark/v3.2/spark/src/test/java/org/apache/iceberg/spark/actions/TestExpireSnapshotsAction.java +++ b/spark/v3.2/spark/src/test/java/org/apache/iceberg/spark/actions/TestExpireSnapshotsAction.java @@ -16,9 +16,10 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.actions; +import static org.apache.iceberg.types.Types.NestedField.optional; + import java.io.File; import java.io.IOException; import java.util.List; @@ -61,60 +62,63 @@ import org.junit.Test; import org.junit.rules.TemporaryFolder; -import static org.apache.iceberg.types.Types.NestedField.optional; - public class TestExpireSnapshotsAction extends SparkTestBase { private static final HadoopTables TABLES = new HadoopTables(new Configuration()); - private static final Schema SCHEMA = new Schema( - optional(1, "c1", Types.IntegerType.get()), - optional(2, "c2", Types.StringType.get()), - optional(3, "c3", Types.StringType.get()) - ); + private static final Schema SCHEMA = + new Schema( + optional(1, "c1", Types.IntegerType.get()), + optional(2, "c2", Types.StringType.get()), + optional(3, "c3", Types.StringType.get())); private static final int SHUFFLE_PARTITIONS = 2; private static final PartitionSpec SPEC = PartitionSpec.builderFor(SCHEMA).identity("c1").build(); - static final DataFile FILE_A = DataFiles.builder(SPEC) - .withPath("/path/to/data-a.parquet") - .withFileSizeInBytes(10) - .withPartitionPath("c1=0") // easy way to set partition data for now - .withRecordCount(1) - .build(); - static final DataFile FILE_B = DataFiles.builder(SPEC) - .withPath("/path/to/data-b.parquet") - .withFileSizeInBytes(10) - .withPartitionPath("c1=1") // easy way to set partition data for now - .withRecordCount(1) - .build(); - static final DataFile FILE_C = DataFiles.builder(SPEC) - .withPath("/path/to/data-c.parquet") - .withFileSizeInBytes(10) - .withPartitionPath("c1=2") // easy way to set partition data for now - .withRecordCount(1) - .build(); - static final DataFile FILE_D = DataFiles.builder(SPEC) - .withPath("/path/to/data-d.parquet") - .withFileSizeInBytes(10) - .withPartitionPath("c1=3") // easy way to set partition data for now - .withRecordCount(1) - .build(); - static final DeleteFile FILE_A_POS_DELETES = 
FileMetadata.deleteFileBuilder(SPEC) - .ofPositionDeletes() - .withPath("/path/to/data-a-pos-deletes.parquet") - .withFileSizeInBytes(10) - .withPartitionPath("c1=0") // easy way to set partition data for now - .withRecordCount(1) - .build(); - static final DeleteFile FILE_A_EQ_DELETES = FileMetadata.deleteFileBuilder(SPEC) - .ofEqualityDeletes() - .withPath("/path/to/data-a-eq-deletes.parquet") - .withFileSizeInBytes(10) - .withPartitionPath("c1=0") // easy way to set partition data for now - .withRecordCount(1) - .build(); - - @Rule - public TemporaryFolder temp = new TemporaryFolder(); + static final DataFile FILE_A = + DataFiles.builder(SPEC) + .withPath("/path/to/data-a.parquet") + .withFileSizeInBytes(10) + .withPartitionPath("c1=0") // easy way to set partition data for now + .withRecordCount(1) + .build(); + static final DataFile FILE_B = + DataFiles.builder(SPEC) + .withPath("/path/to/data-b.parquet") + .withFileSizeInBytes(10) + .withPartitionPath("c1=1") // easy way to set partition data for now + .withRecordCount(1) + .build(); + static final DataFile FILE_C = + DataFiles.builder(SPEC) + .withPath("/path/to/data-c.parquet") + .withFileSizeInBytes(10) + .withPartitionPath("c1=2") // easy way to set partition data for now + .withRecordCount(1) + .build(); + static final DataFile FILE_D = + DataFiles.builder(SPEC) + .withPath("/path/to/data-d.parquet") + .withFileSizeInBytes(10) + .withPartitionPath("c1=3") // easy way to set partition data for now + .withRecordCount(1) + .build(); + static final DeleteFile FILE_A_POS_DELETES = + FileMetadata.deleteFileBuilder(SPEC) + .ofPositionDeletes() + .withPath("/path/to/data-a-pos-deletes.parquet") + .withFileSizeInBytes(10) + .withPartitionPath("c1=0") // easy way to set partition data for now + .withRecordCount(1) + .build(); + static final DeleteFile FILE_A_EQ_DELETES = + FileMetadata.deleteFileBuilder(SPEC) + .ofEqualityDeletes() + .withPath("/path/to/data-a-eq-deletes.parquet") + .withFileSizeInBytes(10) + .withPartitionPath("c1=0") // easy way to set partition data for now + .withRecordCount(1) + .build(); + + @Rule public TemporaryFolder temp = new TemporaryFolder(); private File tableDir; private String tableLocation; @@ -140,42 +144,51 @@ private Long rightAfterSnapshot(long snapshotId) { return end; } - private void checkExpirationResults(long expectedDatafiles, long expectedPosDeleteFiles, long expectedEqDeleteFiles, - long expectedManifestsDeleted, long expectedManifestListsDeleted, - ExpireSnapshots.Result results) { - - Assert.assertEquals("Incorrect number of manifest files deleted", - expectedManifestsDeleted, results.deletedManifestsCount()); - Assert.assertEquals("Incorrect number of datafiles deleted", - expectedDatafiles, results.deletedDataFilesCount()); - Assert.assertEquals("Incorrect number of pos deletefiles deleted", - expectedPosDeleteFiles, results.deletedPositionDeleteFilesCount()); - Assert.assertEquals("Incorrect number of eq deletefiles deleted", - expectedEqDeleteFiles, results.deletedEqualityDeleteFilesCount()); - Assert.assertEquals("Incorrect number of manifest lists deleted", - expectedManifestListsDeleted, results.deletedManifestListsCount()); + private void checkExpirationResults( + long expectedDatafiles, + long expectedPosDeleteFiles, + long expectedEqDeleteFiles, + long expectedManifestsDeleted, + long expectedManifestListsDeleted, + ExpireSnapshots.Result results) { + + Assert.assertEquals( + "Incorrect number of manifest files deleted", + expectedManifestsDeleted, + 
results.deletedManifestsCount()); + Assert.assertEquals( + "Incorrect number of datafiles deleted", + expectedDatafiles, + results.deletedDataFilesCount()); + Assert.assertEquals( + "Incorrect number of pos deletefiles deleted", + expectedPosDeleteFiles, + results.deletedPositionDeleteFilesCount()); + Assert.assertEquals( + "Incorrect number of eq deletefiles deleted", + expectedEqDeleteFiles, + results.deletedEqualityDeleteFilesCount()); + Assert.assertEquals( + "Incorrect number of manifest lists deleted", + expectedManifestListsDeleted, + results.deletedManifestListsCount()); } @Test public void testFilesCleaned() throws Exception { - table.newFastAppend() - .appendFile(FILE_A) - .commit(); + table.newFastAppend().appendFile(FILE_A).commit(); - table.newOverwrite() - .deleteFile(FILE_A) - .addFile(FILE_B) - .commit(); + table.newOverwrite().deleteFile(FILE_A).addFile(FILE_B).commit(); - table.newFastAppend() - .appendFile(FILE_C) - .commit(); + table.newFastAppend().appendFile(FILE_C).commit(); long end = rightAfterSnapshot(); - ExpireSnapshots.Result results = SparkActions.get().expireSnapshots(table).expireOlderThan(end).execute(); + ExpireSnapshots.Result results = + SparkActions.get().expireSnapshots(table).expireOlderThan(end).execute(); - Assert.assertEquals("Table does not have 1 snapshot after expiration", 1, Iterables.size(table.snapshots())); + Assert.assertEquals( + "Table does not have 1 snapshot after expiration", 1, Iterables.size(table.snapshots())); checkExpirationResults(1L, 0L, 0L, 1L, 2L, results); } @@ -183,21 +196,13 @@ public void testFilesCleaned() throws Exception { @Test public void dataFilesCleanupWithParallelTasks() throws IOException { - table.newFastAppend() - .appendFile(FILE_A) - .commit(); + table.newFastAppend().appendFile(FILE_A).commit(); - table.newFastAppend() - .appendFile(FILE_B) - .commit(); + table.newFastAppend().appendFile(FILE_B).commit(); - table.newRewrite() - .rewriteFiles(ImmutableSet.of(FILE_B), ImmutableSet.of(FILE_D)) - .commit(); + table.newRewrite().rewriteFiles(ImmutableSet.of(FILE_B), ImmutableSet.of(FILE_D)).commit(); - table.newRewrite() - .rewriteFiles(ImmutableSet.of(FILE_A), ImmutableSet.of(FILE_C)) - .commit(); + table.newRewrite().rewriteFiles(ImmutableSet.of(FILE_A), ImmutableSet.of(FILE_C)).commit(); long t4 = rightAfterSnapshot(); @@ -205,23 +210,33 @@ public void dataFilesCleanupWithParallelTasks() throws IOException { Set deleteThreads = ConcurrentHashMap.newKeySet(); AtomicInteger deleteThreadsIndex = new AtomicInteger(0); - ExpireSnapshots.Result result = SparkActions.get().expireSnapshots(table) - .executeDeleteWith(Executors.newFixedThreadPool(4, runnable -> { - Thread thread = new Thread(runnable); - thread.setName("remove-snapshot-" + deleteThreadsIndex.getAndIncrement()); - thread.setDaemon(true); // daemon threads will be terminated abruptly when the JVM exits - return thread; - })) - .expireOlderThan(t4) - .deleteWith(s -> { - deleteThreads.add(Thread.currentThread().getName()); - deletedFiles.add(s); - }) - .execute(); - - // Verifies that the delete methods ran in the threads created by the provided ExecutorService ThreadFactory - Assert.assertEquals(deleteThreads, - Sets.newHashSet("remove-snapshot-0", "remove-snapshot-1", "remove-snapshot-2", "remove-snapshot-3")); + ExpireSnapshots.Result result = + SparkActions.get() + .expireSnapshots(table) + .executeDeleteWith( + Executors.newFixedThreadPool( + 4, + runnable -> { + Thread thread = new Thread(runnable); + thread.setName("remove-snapshot-" + 
deleteThreadsIndex.getAndIncrement()); + thread.setDaemon( + true); // daemon threads will be terminated abruptly when the JVM exits + return thread; + })) + .expireOlderThan(t4) + .deleteWith( + s -> { + deleteThreads.add(Thread.currentThread().getName()); + deletedFiles.add(s); + }) + .execute(); + + // Verifies that the delete methods ran in the threads created by the provided ExecutorService + // ThreadFactory + Assert.assertEquals( + deleteThreads, + Sets.newHashSet( + "remove-snapshot-0", "remove-snapshot-1", "remove-snapshot-2", "remove-snapshot-3")); Assert.assertTrue("FILE_A should be deleted", deletedFiles.contains(FILE_A.path().toString())); Assert.assertTrue("FILE_B should be deleted", deletedFiles.contains(FILE_B.path().toString())); @@ -231,9 +246,7 @@ public void dataFilesCleanupWithParallelTasks() throws IOException { @Test public void testNoFilesDeletedWhenNoSnapshotsExpired() throws Exception { - table.newFastAppend() - .appendFile(FILE_A) - .commit(); + table.newFastAppend().appendFile(FILE_A).commit(); ExpireSnapshots.Result results = SparkActions.get().expireSnapshots(table).execute(); checkExpirationResults(0L, 0L, 0L, 0L, 0L, results); @@ -241,30 +254,24 @@ public void testNoFilesDeletedWhenNoSnapshotsExpired() throws Exception { @Test public void testCleanupRepeatedOverwrites() throws Exception { - table.newFastAppend() - .appendFile(FILE_A) - .commit(); + table.newFastAppend().appendFile(FILE_A).commit(); for (int i = 0; i < 10; i++) { - table.newOverwrite() - .deleteFile(FILE_A) - .addFile(FILE_B) - .commit(); - - table.newOverwrite() - .deleteFile(FILE_B) - .addFile(FILE_A) - .commit(); + table.newOverwrite().deleteFile(FILE_A).addFile(FILE_B).commit(); + + table.newOverwrite().deleteFile(FILE_B).addFile(FILE_A).commit(); } long end = rightAfterSnapshot(); - ExpireSnapshots.Result results = SparkActions.get().expireSnapshots(table).expireOlderThan(end).execute(); + ExpireSnapshots.Result results = + SparkActions.get().expireSnapshots(table).expireOlderThan(end).execute(); checkExpirationResults(1L, 0L, 0L, 39L, 20L, results); } @Test public void testRetainLastWithExpireOlderThan() { - table.newAppend() + table + .newAppend() .appendFile(FILE_A) // data_bucket=0 .commit(); long firstSnapshotId = table.currentSnapshot().snapshotId(); @@ -273,217 +280,256 @@ public void testRetainLastWithExpireOlderThan() { t1 = System.currentTimeMillis(); } - table.newAppend() + table + .newAppend() .appendFile(FILE_B) // data_bucket=1 .commit(); - table.newAppend() + table + .newAppend() .appendFile(FILE_C) // data_bucket=2 .commit(); long t3 = rightAfterSnapshot(); // Retain last 2 snapshots - SparkActions.get().expireSnapshots(table) - .expireOlderThan(t3) - .retainLast(2) - .execute(); + SparkActions.get().expireSnapshots(table).expireOlderThan(t3).retainLast(2).execute(); - Assert.assertEquals("Should have two snapshots.", 2, Lists.newArrayList(table.snapshots()).size()); - Assert.assertEquals("First snapshot should not present.", null, table.snapshot(firstSnapshotId)); + Assert.assertEquals( + "Should have two snapshots.", 2, Lists.newArrayList(table.snapshots()).size()); + Assert.assertEquals( + "First snapshot should not present.", null, table.snapshot(firstSnapshotId)); } @Test public void testExpireTwoSnapshotsById() throws Exception { - table.newAppend() + table + .newAppend() .appendFile(FILE_A) // data_bucket=0 .commit(); long firstSnapshotId = table.currentSnapshot().snapshotId(); - table.newAppend() + table + .newAppend() .appendFile(FILE_B) // data_bucket=1 
.commit(); long secondSnapshotID = table.currentSnapshot().snapshotId(); - table.newAppend() + table + .newAppend() .appendFile(FILE_C) // data_bucket=2 .commit(); // Retain last 2 snapshots - ExpireSnapshots.Result result = SparkActions.get().expireSnapshots(table) - .expireSnapshotId(firstSnapshotId) - .expireSnapshotId(secondSnapshotID) - .execute(); - - Assert.assertEquals("Should have one snapshots.", 1, Lists.newArrayList(table.snapshots()).size()); - Assert.assertEquals("First snapshot should not present.", null, table.snapshot(firstSnapshotId)); - Assert.assertEquals("Second snapshot should not be present.", null, table.snapshot(secondSnapshotID)); + ExpireSnapshots.Result result = + SparkActions.get() + .expireSnapshots(table) + .expireSnapshotId(firstSnapshotId) + .expireSnapshotId(secondSnapshotID) + .execute(); + + Assert.assertEquals( + "Should have one snapshots.", 1, Lists.newArrayList(table.snapshots()).size()); + Assert.assertEquals( + "First snapshot should not present.", null, table.snapshot(firstSnapshotId)); + Assert.assertEquals( + "Second snapshot should not be present.", null, table.snapshot(secondSnapshotID)); checkExpirationResults(0L, 0L, 0L, 0L, 2L, result); } @Test public void testRetainLastWithExpireById() { - table.newAppend() + table + .newAppend() .appendFile(FILE_A) // data_bucket=0 .commit(); long firstSnapshotId = table.currentSnapshot().snapshotId(); - table.newAppend() + table + .newAppend() .appendFile(FILE_B) // data_bucket=1 .commit(); - table.newAppend() + table + .newAppend() .appendFile(FILE_C) // data_bucket=2 .commit(); // Retain last 3 snapshots, but explicitly remove the first snapshot - ExpireSnapshots.Result result = SparkActions.get().expireSnapshots(table) - .expireSnapshotId(firstSnapshotId) - .retainLast(3) - .execute(); - - Assert.assertEquals("Should have two snapshots.", 2, Lists.newArrayList(table.snapshots()).size()); - Assert.assertEquals("First snapshot should not present.", null, table.snapshot(firstSnapshotId)); + ExpireSnapshots.Result result = + SparkActions.get() + .expireSnapshots(table) + .expireSnapshotId(firstSnapshotId) + .retainLast(3) + .execute(); + + Assert.assertEquals( + "Should have two snapshots.", 2, Lists.newArrayList(table.snapshots()).size()); + Assert.assertEquals( + "First snapshot should not present.", null, table.snapshot(firstSnapshotId)); checkExpirationResults(0L, 0L, 0L, 0L, 1L, result); } @Test public void testRetainLastWithTooFewSnapshots() { - table.newAppend() + table + .newAppend() .appendFile(FILE_A) // data_bucket=0 .appendFile(FILE_B) // data_bucket=1 .commit(); long firstSnapshotId = table.currentSnapshot().snapshotId(); - table.newAppend() + table + .newAppend() .appendFile(FILE_C) // data_bucket=2 .commit(); long t2 = rightAfterSnapshot(); // Retain last 3 snapshots - ExpireSnapshots.Result result = SparkActions.get().expireSnapshots(table) - .expireOlderThan(t2) - .retainLast(3) - .execute(); - - Assert.assertEquals("Should have two snapshots", 2, Lists.newArrayList(table.snapshots()).size()); - Assert.assertEquals("First snapshot should still present", - firstSnapshotId, table.snapshot(firstSnapshotId).snapshotId()); + ExpireSnapshots.Result result = + SparkActions.get().expireSnapshots(table).expireOlderThan(t2).retainLast(3).execute(); + + Assert.assertEquals( + "Should have two snapshots", 2, Lists.newArrayList(table.snapshots()).size()); + Assert.assertEquals( + "First snapshot should still present", + firstSnapshotId, + table.snapshot(firstSnapshotId).snapshotId()); 
checkExpirationResults(0L, 0L, 0L, 0L, 0L, result); } @Test public void testRetainLastKeepsExpiringSnapshot() { - table.newAppend() + table + .newAppend() .appendFile(FILE_A) // data_bucket=0 .commit(); - table.newAppend() + table + .newAppend() .appendFile(FILE_B) // data_bucket=1 .commit(); Snapshot secondSnapshot = table.currentSnapshot(); - table.newAppend() + table + .newAppend() .appendFile(FILE_C) // data_bucket=2 .commit(); - table.newAppend() + table + .newAppend() .appendFile(FILE_D) // data_bucket=3 .commit(); // Retain last 2 snapshots and expire older than t3 - ExpireSnapshots.Result result = SparkActions.get().expireSnapshots(table) - .expireOlderThan(secondSnapshot.timestampMillis()) - .retainLast(2) - .execute(); - - Assert.assertEquals("Should have three snapshots.", 3, Lists.newArrayList(table.snapshots()).size()); - Assert.assertNotNull("Second snapshot should present.", table.snapshot(secondSnapshot.snapshotId())); + ExpireSnapshots.Result result = + SparkActions.get() + .expireSnapshots(table) + .expireOlderThan(secondSnapshot.timestampMillis()) + .retainLast(2) + .execute(); + + Assert.assertEquals( + "Should have three snapshots.", 3, Lists.newArrayList(table.snapshots()).size()); + Assert.assertNotNull( + "Second snapshot should present.", table.snapshot(secondSnapshot.snapshotId())); checkExpirationResults(0L, 0L, 0L, 0L, 1L, result); } @Test public void testExpireSnapshotsWithDisabledGarbageCollection() { - table.updateProperties() - .set(TableProperties.GC_ENABLED, "false") - .commit(); + table.updateProperties().set(TableProperties.GC_ENABLED, "false").commit(); - table.newAppend() - .appendFile(FILE_A) - .commit(); + table.newAppend().appendFile(FILE_A).commit(); - AssertHelpers.assertThrows("Should complain about expiring snapshots", - ValidationException.class, "Cannot expire snapshots: GC is disabled", + AssertHelpers.assertThrows( + "Should complain about expiring snapshots", + ValidationException.class, + "Cannot expire snapshots: GC is disabled", () -> SparkActions.get().expireSnapshots(table)); } @Test public void testExpireOlderThanMultipleCalls() { - table.newAppend() + table + .newAppend() .appendFile(FILE_A) // data_bucket=0 .commit(); - table.newAppend() + table + .newAppend() .appendFile(FILE_B) // data_bucket=1 .commit(); Snapshot secondSnapshot = table.currentSnapshot(); - table.newAppend() + table + .newAppend() .appendFile(FILE_C) // data_bucket=2 .commit(); Snapshot thirdSnapshot = table.currentSnapshot(); // Retain last 2 snapshots and expire older than t3 - ExpireSnapshots.Result result = SparkActions.get().expireSnapshots(table) - .expireOlderThan(secondSnapshot.timestampMillis()) - .expireOlderThan(thirdSnapshot.timestampMillis()) - .execute(); - - Assert.assertEquals("Should have one snapshots.", 1, Lists.newArrayList(table.snapshots()).size()); - Assert.assertNull("Second snapshot should not present.", table.snapshot(secondSnapshot.snapshotId())); + ExpireSnapshots.Result result = + SparkActions.get() + .expireSnapshots(table) + .expireOlderThan(secondSnapshot.timestampMillis()) + .expireOlderThan(thirdSnapshot.timestampMillis()) + .execute(); + + Assert.assertEquals( + "Should have one snapshots.", 1, Lists.newArrayList(table.snapshots()).size()); + Assert.assertNull( + "Second snapshot should not present.", table.snapshot(secondSnapshot.snapshotId())); checkExpirationResults(0L, 0L, 0L, 0L, 2L, result); } @Test public void testRetainLastMultipleCalls() { - table.newAppend() + table + .newAppend() .appendFile(FILE_A) // data_bucket=0 
.commit(); - table.newAppend() + table + .newAppend() .appendFile(FILE_B) // data_bucket=1 .commit(); Snapshot secondSnapshot = table.currentSnapshot(); - table.newAppend() + table + .newAppend() .appendFile(FILE_C) // data_bucket=2 .commit(); long t3 = rightAfterSnapshot(); // Retain last 2 snapshots and expire older than t3 - ExpireSnapshots.Result result = SparkActions.get().expireSnapshots(table) - .expireOlderThan(t3) - .retainLast(2) - .retainLast(1) - .execute(); - - Assert.assertEquals("Should have one snapshots.", 1, Lists.newArrayList(table.snapshots()).size()); - Assert.assertNull("Second snapshot should not present.", table.snapshot(secondSnapshot.snapshotId())); + ExpireSnapshots.Result result = + SparkActions.get() + .expireSnapshots(table) + .expireOlderThan(t3) + .retainLast(2) + .retainLast(1) + .execute(); + + Assert.assertEquals( + "Should have one snapshots.", 1, Lists.newArrayList(table.snapshots()).size()); + Assert.assertNull( + "Second snapshot should not present.", table.snapshot(secondSnapshot.snapshotId())); checkExpirationResults(0L, 0L, 0L, 0L, 2L, result); } @Test public void testRetainZeroSnapshots() { - AssertHelpers.assertThrows("Should fail retain 0 snapshots " + - "because number of snapshots to retain cannot be zero", + AssertHelpers.assertThrows( + "Should fail retain 0 snapshots " + "because number of snapshots to retain cannot be zero", IllegalArgumentException.class, "Number of snapshots to retain must be at least 1, cannot be: 0", () -> SparkActions.get().expireSnapshots(table).retainLast(0).execute()); @@ -491,28 +537,22 @@ public void testRetainZeroSnapshots() { @Test public void testScanExpiredManifestInValidSnapshotAppend() { - table.newAppend() - .appendFile(FILE_A) - .appendFile(FILE_B) - .commit(); + table.newAppend().appendFile(FILE_A).appendFile(FILE_B).commit(); - table.newOverwrite() - .addFile(FILE_C) - .deleteFile(FILE_A) - .commit(); + table.newOverwrite().addFile(FILE_C).deleteFile(FILE_A).commit(); - table.newAppend() - .appendFile(FILE_D) - .commit(); + table.newAppend().appendFile(FILE_D).commit(); long t3 = rightAfterSnapshot(); Set deletedFiles = Sets.newHashSet(); - ExpireSnapshots.Result result = SparkActions.get().expireSnapshots(table) - .expireOlderThan(t3) - .deleteWith(deletedFiles::add) - .execute(); + ExpireSnapshots.Result result = + SparkActions.get() + .expireSnapshots(table) + .expireOlderThan(t3) + .deleteWith(deletedFiles::add) + .execute(); Assert.assertTrue("FILE_A should be deleted", deletedFiles.contains(FILE_A.path().toString())); checkExpirationResults(1L, 0L, 0L, 1L, 2L, result); @@ -520,72 +560,61 @@ public void testScanExpiredManifestInValidSnapshotAppend() { @Test public void testScanExpiredManifestInValidSnapshotFastAppend() { - table.updateProperties() + table + .updateProperties() .set(TableProperties.MANIFEST_MERGE_ENABLED, "true") .set(TableProperties.MANIFEST_MIN_MERGE_COUNT, "1") .commit(); - table.newAppend() - .appendFile(FILE_A) - .appendFile(FILE_B) - .commit(); + table.newAppend().appendFile(FILE_A).appendFile(FILE_B).commit(); - table.newOverwrite() - .addFile(FILE_C) - .deleteFile(FILE_A) - .commit(); + table.newOverwrite().addFile(FILE_C).deleteFile(FILE_A).commit(); - table.newFastAppend() - .appendFile(FILE_D) - .commit(); + table.newFastAppend().appendFile(FILE_D).commit(); long t3 = rightAfterSnapshot(); Set deletedFiles = Sets.newHashSet(); - ExpireSnapshots.Result result = SparkActions.get().expireSnapshots(table) - .expireOlderThan(t3) - .deleteWith(deletedFiles::add) - .execute(); + 
ExpireSnapshots.Result result = + SparkActions.get() + .expireSnapshots(table) + .expireOlderThan(t3) + .deleteWith(deletedFiles::add) + .execute(); Assert.assertTrue("FILE_A should be deleted", deletedFiles.contains(FILE_A.path().toString())); checkExpirationResults(1L, 0L, 0L, 1L, 2L, result); } /** - * Test on table below, and expiring the staged commit `B` using `expireOlderThan` API. - * Table: A - C - * ` B (staged) + * Test on table below, and expiring the staged commit `B` using `expireOlderThan` API. Table: A - + * C ` B (staged) */ @Test public void testWithExpiringDanglingStageCommit() { // `A` commit - table.newAppend() - .appendFile(FILE_A) - .commit(); + table.newAppend().appendFile(FILE_A).commit(); // `B` staged commit - table.newAppend() - .appendFile(FILE_B) - .stageOnly() - .commit(); + table.newAppend().appendFile(FILE_B).stageOnly().commit(); TableMetadata base = ((BaseTable) table).operations().current(); Snapshot snapshotA = base.snapshots().get(0); Snapshot snapshotB = base.snapshots().get(1); // `C` commit - table.newAppend() - .appendFile(FILE_C) - .commit(); + table.newAppend().appendFile(FILE_C).commit(); Set deletedFiles = Sets.newHashSet(); // Expire all commits including dangling staged snapshot. - ExpireSnapshots.Result result = SparkActions.get().expireSnapshots(table) - .deleteWith(deletedFiles::add) - .expireOlderThan(snapshotB.timestampMillis() + 1) - .execute(); + ExpireSnapshots.Result result = + SparkActions.get() + .expireSnapshots(table) + .deleteWith(deletedFiles::add) + .expireOlderThan(snapshotB.timestampMillis() + 1) + .execute(); checkExpirationResults(1L, 0L, 0L, 1L, 2L, result); @@ -593,122 +622,107 @@ public void testWithExpiringDanglingStageCommit() { expectedDeletes.add(snapshotA.manifestListLocation()); // Files should be deleted of dangling staged snapshot - snapshotB.addedDataFiles(table.io()).forEach(i -> { - expectedDeletes.add(i.path().toString()); - }); + snapshotB + .addedDataFiles(table.io()) + .forEach( + i -> { + expectedDeletes.add(i.path().toString()); + }); // ManifestList should be deleted too expectedDeletes.add(snapshotB.manifestListLocation()); - snapshotB.dataManifests(table.io()).forEach(file -> { - // Only the manifest of B should be deleted. - if (file.snapshotId() == snapshotB.snapshotId()) { - expectedDeletes.add(file.path()); - } - }); - Assert.assertSame("Files deleted count should be expected", expectedDeletes.size(), deletedFiles.size()); + snapshotB + .dataManifests(table.io()) + .forEach( + file -> { + // Only the manifest of B should be deleted. 
+ if (file.snapshotId() == snapshotB.snapshotId()) { + expectedDeletes.add(file.path()); + } + }); + Assert.assertSame( + "Files deleted count should be expected", expectedDeletes.size(), deletedFiles.size()); // Take the diff expectedDeletes.removeAll(deletedFiles); Assert.assertTrue("Exactly same files should be deleted", expectedDeletes.isEmpty()); } /** - * Expire cherry-pick the commit as shown below, when `B` is in table's current state - * Table: - * A - B - C <--current snapshot - * `- D (source=B) + * Expire cherry-pick the commit as shown below, when `B` is in table's current state Table: A - B + * - C <--current snapshot `- D (source=B) */ @Test public void testWithCherryPickTableSnapshot() { // `A` commit - table.newAppend() - .appendFile(FILE_A) - .commit(); + table.newAppend().appendFile(FILE_A).commit(); Snapshot snapshotA = table.currentSnapshot(); // `B` commit Set deletedAFiles = Sets.newHashSet(); - table.newOverwrite() - .addFile(FILE_B) - .deleteFile(FILE_A) - .deleteWith(deletedAFiles::add) - .commit(); + table.newOverwrite().addFile(FILE_B).deleteFile(FILE_A).deleteWith(deletedAFiles::add).commit(); Assert.assertTrue("No files should be physically deleted", deletedAFiles.isEmpty()); // pick the snapshot 'B` Snapshot snapshotB = table.currentSnapshot(); // `C` commit to let cherry-pick take effect, and avoid fast-forward of `B` with cherry-pick - table.newAppend() - .appendFile(FILE_C) - .commit(); + table.newAppend().appendFile(FILE_C).commit(); Snapshot snapshotC = table.currentSnapshot(); // Move the table back to `A` - table.manageSnapshots() - .setCurrentSnapshot(snapshotA.snapshotId()) - .commit(); + table.manageSnapshots().setCurrentSnapshot(snapshotA.snapshotId()).commit(); // Generate A -> `D (B)` - table.manageSnapshots() - .cherrypick(snapshotB.snapshotId()) - .commit(); + table.manageSnapshots().cherrypick(snapshotB.snapshotId()).commit(); Snapshot snapshotD = table.currentSnapshot(); // Move the table back to `C` - table.manageSnapshots() - .setCurrentSnapshot(snapshotC.snapshotId()) - .commit(); + table.manageSnapshots().setCurrentSnapshot(snapshotC.snapshotId()).commit(); List deletedFiles = Lists.newArrayList(); // Expire `C` - ExpireSnapshots.Result result = SparkActions.get().expireSnapshots(table) - .deleteWith(deletedFiles::add) - .expireOlderThan(snapshotC.timestampMillis() + 1) - .execute(); + ExpireSnapshots.Result result = + SparkActions.get() + .expireSnapshots(table) + .deleteWith(deletedFiles::add) + .expireOlderThan(snapshotC.timestampMillis() + 1) + .execute(); // Make sure no dataFiles are deleted for the B, C, D snapshot - Lists.newArrayList(snapshotB, snapshotC, snapshotD).forEach(i -> { - i.addedDataFiles(table.io()).forEach(item -> { - Assert.assertFalse(deletedFiles.contains(item.path().toString())); - }); - }); + Lists.newArrayList(snapshotB, snapshotC, snapshotD) + .forEach( + i -> { + i.addedDataFiles(table.io()) + .forEach( + item -> { + Assert.assertFalse(deletedFiles.contains(item.path().toString())); + }); + }); checkExpirationResults(1L, 0L, 0L, 2L, 2L, result); } /** - * Test on table below, and expiring `B` which is not in current table state. - * 1) Expire `B` - * 2) All commit - * Table: A - C - D (B) - * ` B (staged) + * Test on table below, and expiring `B` which is not in current table state. 
1) Expire `B` 2) All + * commit Table: A - C - D (B) ` B (staged) */ @Test public void testWithExpiringStagedThenCherrypick() { // `A` commit - table.newAppend() - .appendFile(FILE_A) - .commit(); + table.newAppend().appendFile(FILE_A).commit(); // `B` commit - table.newAppend() - .appendFile(FILE_B) - .stageOnly() - .commit(); + table.newAppend().appendFile(FILE_B).stageOnly().commit(); // pick the snapshot that's staged but not committed TableMetadata base = ((BaseTable) table).operations().current(); Snapshot snapshotB = base.snapshots().get(1); // `C` commit to let cherry-pick take effect, and avoid fast-forward of `B` with cherry-pick - table.newAppend() - .appendFile(FILE_C) - .commit(); + table.newAppend().appendFile(FILE_C).commit(); // `D (B)` cherry-pick commit - table.manageSnapshots() - .cherrypick(snapshotB.snapshotId()) - .commit(); + table.manageSnapshots().cherrypick(snapshotB.snapshotId()).commit(); base = ((BaseTable) table).operations().current(); Snapshot snapshotD = base.snapshots().get(3); @@ -716,47 +730,55 @@ public void testWithExpiringStagedThenCherrypick() { List deletedFiles = Lists.newArrayList(); // Expire `B` commit. - ExpireSnapshots.Result firstResult = SparkActions.get().expireSnapshots(table) - .deleteWith(deletedFiles::add) - .expireSnapshotId(snapshotB.snapshotId()) - .execute(); + ExpireSnapshots.Result firstResult = + SparkActions.get() + .expireSnapshots(table) + .deleteWith(deletedFiles::add) + .expireSnapshotId(snapshotB.snapshotId()) + .execute(); // Make sure no dataFiles are deleted for the staged snapshot - Lists.newArrayList(snapshotB).forEach(i -> { - i.addedDataFiles(table.io()).forEach(item -> { - Assert.assertFalse(deletedFiles.contains(item.path().toString())); - }); - }); + Lists.newArrayList(snapshotB) + .forEach( + i -> { + i.addedDataFiles(table.io()) + .forEach( + item -> { + Assert.assertFalse(deletedFiles.contains(item.path().toString())); + }); + }); checkExpirationResults(0L, 0L, 0L, 1L, 1L, firstResult); // Expire all snapshots including cherry-pick - ExpireSnapshots.Result secondResult = SparkActions.get().expireSnapshots(table) - .deleteWith(deletedFiles::add) - .expireOlderThan(table.currentSnapshot().timestampMillis() + 1) - .execute(); + ExpireSnapshots.Result secondResult = + SparkActions.get() + .expireSnapshots(table) + .deleteWith(deletedFiles::add) + .expireOlderThan(table.currentSnapshot().timestampMillis() + 1) + .execute(); // Make sure no dataFiles are deleted for the staged and cherry-pick - Lists.newArrayList(snapshotB, snapshotD).forEach(i -> { - i.addedDataFiles(table.io()).forEach(item -> { - Assert.assertFalse(deletedFiles.contains(item.path().toString())); - }); - }); - checkExpirationResults(0L, 0L, 0L, 0L, 2L, secondResult); + Lists.newArrayList(snapshotB, snapshotD) + .forEach( + i -> { + i.addedDataFiles(table.io()) + .forEach( + item -> { + Assert.assertFalse(deletedFiles.contains(item.path().toString())); + }); + }); + checkExpirationResults(0L, 0L, 0L, 0L, 2L, secondResult); } @Test public void testExpireOlderThan() { - table.newAppend() - .appendFile(FILE_A) - .commit(); + table.newAppend().appendFile(FILE_A).commit(); Snapshot firstSnapshot = table.currentSnapshot(); rightAfterSnapshot(); - table.newAppend() - .appendFile(FILE_B) - .commit(); + table.newAppend().appendFile(FILE_B).commit(); long snapshotId = table.currentSnapshot().snapshotId(); @@ -764,42 +786,46 @@ public void testExpireOlderThan() { Set deletedFiles = Sets.newHashSet(); - ExpireSnapshots.Result result = 
SparkActions.get().expireSnapshots(table) - .expireOlderThan(tAfterCommits) - .deleteWith(deletedFiles::add) - .execute(); - - Assert.assertEquals("Expire should not change current snapshot", snapshotId, table.currentSnapshot().snapshotId()); - Assert.assertNull("Expire should remove the oldest snapshot", table.snapshot(firstSnapshot.snapshotId())); - Assert.assertEquals("Should remove only the expired manifest list location", - Sets.newHashSet(firstSnapshot.manifestListLocation()), deletedFiles); + ExpireSnapshots.Result result = + SparkActions.get() + .expireSnapshots(table) + .expireOlderThan(tAfterCommits) + .deleteWith(deletedFiles::add) + .execute(); + + Assert.assertEquals( + "Expire should not change current snapshot", + snapshotId, + table.currentSnapshot().snapshotId()); + Assert.assertNull( + "Expire should remove the oldest snapshot", table.snapshot(firstSnapshot.snapshotId())); + Assert.assertEquals( + "Should remove only the expired manifest list location", + Sets.newHashSet(firstSnapshot.manifestListLocation()), + deletedFiles); - checkExpirationResults(0, 0, 0, 0, 1, result); + checkExpirationResults(0, 0, 0, 0, 1, result); } @Test public void testExpireOlderThanWithDelete() { - table.newAppend() - .appendFile(FILE_A) - .commit(); + table.newAppend().appendFile(FILE_A).commit(); Snapshot firstSnapshot = table.currentSnapshot(); - Assert.assertEquals("Should create one manifest", - 1, firstSnapshot.allManifests(table.io()).size()); + Assert.assertEquals( + "Should create one manifest", 1, firstSnapshot.allManifests(table.io()).size()); rightAfterSnapshot(); - table.newDelete() - .deleteFile(FILE_A) - .commit(); + table.newDelete().deleteFile(FILE_A).commit(); Snapshot secondSnapshot = table.currentSnapshot(); - Assert.assertEquals("Should create replace manifest with a rewritten manifest", - 1, secondSnapshot.allManifests(table.io()).size()); + Assert.assertEquals( + "Should create replace manifest with a rewritten manifest", + 1, + secondSnapshot.allManifests(table.io()).size()); - table.newAppend() - .appendFile(FILE_B) - .commit(); + table.newAppend().appendFile(FILE_B).commit(); rightAfterSnapshot(); @@ -809,21 +835,36 @@ public void testExpireOlderThanWithDelete() { Set deletedFiles = Sets.newHashSet(); - ExpireSnapshots.Result result = SparkActions.get().expireSnapshots(table) - .expireOlderThan(tAfterCommits) - .deleteWith(deletedFiles::add) - .execute(); - - Assert.assertEquals("Expire should not change current snapshot", snapshotId, table.currentSnapshot().snapshotId()); - Assert.assertNull("Expire should remove the oldest snapshot", table.snapshot(firstSnapshot.snapshotId())); - Assert.assertNull("Expire should remove the second oldest snapshot", table.snapshot(secondSnapshot.snapshotId())); - - Assert.assertEquals("Should remove expired manifest lists and deleted data file", + ExpireSnapshots.Result result = + SparkActions.get() + .expireSnapshots(table) + .expireOlderThan(tAfterCommits) + .deleteWith(deletedFiles::add) + .execute(); + + Assert.assertEquals( + "Expire should not change current snapshot", + snapshotId, + table.currentSnapshot().snapshotId()); + Assert.assertNull( + "Expire should remove the oldest snapshot", table.snapshot(firstSnapshot.snapshotId())); + Assert.assertNull( + "Expire should remove the second oldest snapshot", + table.snapshot(secondSnapshot.snapshotId())); + + Assert.assertEquals( + "Should remove expired manifest lists and deleted data file", Sets.newHashSet( firstSnapshot.manifestListLocation(), // snapshot expired - 
firstSnapshot.allManifests(table.io()).get(0).path(), // manifest was rewritten for delete + firstSnapshot + .allManifests(table.io()) + .get(0) + .path(), // manifest was rewritten for delete secondSnapshot.manifestListLocation(), // snapshot expired - secondSnapshot.allManifests(table.io()).get(0).path(), // manifest contained only deletes, was dropped + secondSnapshot + .allManifests(table.io()) + .get(0) + .path(), // manifest contained only deletes, was dropped FILE_A.path()), // deleted deletedFiles); @@ -833,30 +874,29 @@ public void testExpireOlderThanWithDelete() { @Test public void testExpireOlderThanWithDeleteInMergedManifests() { // merge every commit - table.updateProperties() - .set(TableProperties.MANIFEST_MIN_MERGE_COUNT, "0") - .commit(); + table.updateProperties().set(TableProperties.MANIFEST_MIN_MERGE_COUNT, "0").commit(); - table.newAppend() - .appendFile(FILE_A) - .appendFile(FILE_B) - .commit(); + table.newAppend().appendFile(FILE_A).appendFile(FILE_B).commit(); Snapshot firstSnapshot = table.currentSnapshot(); - Assert.assertEquals("Should create one manifest", - 1, firstSnapshot.allManifests(table.io()).size()); + Assert.assertEquals( + "Should create one manifest", 1, firstSnapshot.allManifests(table.io()).size()); rightAfterSnapshot(); - table.newDelete() + table + .newDelete() .deleteFile(FILE_A) // FILE_B is still in the dataset .commit(); Snapshot secondSnapshot = table.currentSnapshot(); - Assert.assertEquals("Should replace manifest with a rewritten manifest", - 1, secondSnapshot.allManifests(table.io()).size()); + Assert.assertEquals( + "Should replace manifest with a rewritten manifest", + 1, + secondSnapshot.allManifests(table.io()).size()); - table.newFastAppend() // do not merge to keep the last snapshot's manifest valid + table + .newFastAppend() // do not merge to keep the last snapshot's manifest valid .appendFile(FILE_C) .commit(); @@ -868,19 +908,31 @@ public void testExpireOlderThanWithDeleteInMergedManifests() { Set deletedFiles = Sets.newHashSet(); - ExpireSnapshots.Result result = SparkActions.get().expireSnapshots(table) - .expireOlderThan(tAfterCommits) - .deleteWith(deletedFiles::add) - .execute(); - - Assert.assertEquals("Expire should not change current snapshot", snapshotId, table.currentSnapshot().snapshotId()); - Assert.assertNull("Expire should remove the oldest snapshot", table.snapshot(firstSnapshot.snapshotId())); - Assert.assertNull("Expire should remove the second oldest snapshot", table.snapshot(secondSnapshot.snapshotId())); - - Assert.assertEquals("Should remove expired manifest lists and deleted data file", + ExpireSnapshots.Result result = + SparkActions.get() + .expireSnapshots(table) + .expireOlderThan(tAfterCommits) + .deleteWith(deletedFiles::add) + .execute(); + + Assert.assertEquals( + "Expire should not change current snapshot", + snapshotId, + table.currentSnapshot().snapshotId()); + Assert.assertNull( + "Expire should remove the oldest snapshot", table.snapshot(firstSnapshot.snapshotId())); + Assert.assertNull( + "Expire should remove the second oldest snapshot", + table.snapshot(secondSnapshot.snapshotId())); + + Assert.assertEquals( + "Should remove expired manifest lists and deleted data file", Sets.newHashSet( firstSnapshot.manifestListLocation(), // snapshot expired - firstSnapshot.allManifests(table.io()).get(0).path(), // manifest was rewritten for delete + firstSnapshot + .allManifests(table.io()) + .get(0) + .path(), // manifest was rewritten for delete secondSnapshot.manifestListLocation(), // snapshot 
expired FILE_A.path()), // deleted deletedFiles); @@ -891,33 +943,26 @@ public void testExpireOlderThanWithDeleteInMergedManifests() { @Test public void testExpireOlderThanWithRollback() { // merge every commit - table.updateProperties() - .set(TableProperties.MANIFEST_MIN_MERGE_COUNT, "0") - .commit(); + table.updateProperties().set(TableProperties.MANIFEST_MIN_MERGE_COUNT, "0").commit(); - table.newAppend() - .appendFile(FILE_A) - .appendFile(FILE_B) - .commit(); + table.newAppend().appendFile(FILE_A).appendFile(FILE_B).commit(); Snapshot firstSnapshot = table.currentSnapshot(); - Assert.assertEquals("Should create one manifest", - 1, firstSnapshot.allManifests(table.io()).size()); + Assert.assertEquals( + "Should create one manifest", 1, firstSnapshot.allManifests(table.io()).size()); rightAfterSnapshot(); - table.newDelete() - .deleteFile(FILE_B) - .commit(); + table.newDelete().deleteFile(FILE_B).commit(); Snapshot secondSnapshot = table.currentSnapshot(); - Set secondSnapshotManifests = Sets.newHashSet(secondSnapshot.allManifests(table.io())); + Set secondSnapshotManifests = + Sets.newHashSet(secondSnapshot.allManifests(table.io())); secondSnapshotManifests.removeAll(firstSnapshot.allManifests(table.io())); - Assert.assertEquals("Should add one new manifest for append", 1, secondSnapshotManifests.size()); + Assert.assertEquals( + "Should add one new manifest for append", 1, secondSnapshotManifests.size()); - table.manageSnapshots() - .rollbackTo(firstSnapshot.snapshotId()) - .commit(); + table.manageSnapshots().rollbackTo(firstSnapshot.snapshotId()).commit(); long tAfterCommits = rightAfterSnapshot(secondSnapshot.snapshotId()); @@ -925,19 +970,29 @@ public void testExpireOlderThanWithRollback() { Set deletedFiles = Sets.newHashSet(); - ExpireSnapshots.Result result = SparkActions.get().expireSnapshots(table) - .expireOlderThan(tAfterCommits) - .deleteWith(deletedFiles::add) - .execute(); - - Assert.assertEquals("Expire should not change current snapshot", snapshotId, table.currentSnapshot().snapshotId()); - Assert.assertNotNull("Expire should keep the oldest snapshot, current", table.snapshot(firstSnapshot.snapshotId())); - Assert.assertNull("Expire should remove the orphaned snapshot", table.snapshot(secondSnapshot.snapshotId())); - - Assert.assertEquals("Should remove expired manifest lists and reverted appended data file", + ExpireSnapshots.Result result = + SparkActions.get() + .expireSnapshots(table) + .expireOlderThan(tAfterCommits) + .deleteWith(deletedFiles::add) + .execute(); + + Assert.assertEquals( + "Expire should not change current snapshot", + snapshotId, + table.currentSnapshot().snapshotId()); + Assert.assertNotNull( + "Expire should keep the oldest snapshot, current", + table.snapshot(firstSnapshot.snapshotId())); + Assert.assertNull( + "Expire should remove the orphaned snapshot", table.snapshot(secondSnapshot.snapshotId())); + + Assert.assertEquals( + "Should remove expired manifest lists and reverted appended data file", Sets.newHashSet( secondSnapshot.manifestListLocation(), // snapshot expired - Iterables.getOnlyElement(secondSnapshotManifests).path()), // manifest is no longer referenced + Iterables.getOnlyElement(secondSnapshotManifests) + .path()), // manifest is no longer referenced deletedFiles); checkExpirationResults(0, 0, 0, 1, 1, result); @@ -945,28 +1000,24 @@ public void testExpireOlderThanWithRollback() { @Test public void testExpireOlderThanWithRollbackAndMergedManifests() { - table.newAppend() - .appendFile(FILE_A) - .commit(); + 
table.newAppend().appendFile(FILE_A).commit(); Snapshot firstSnapshot = table.currentSnapshot(); - Assert.assertEquals("Should create one manifest", - 1, firstSnapshot.allManifests(table.io()).size()); + Assert.assertEquals( + "Should create one manifest", 1, firstSnapshot.allManifests(table.io()).size()); rightAfterSnapshot(); - table.newAppend() - .appendFile(FILE_B) - .commit(); + table.newAppend().appendFile(FILE_B).commit(); Snapshot secondSnapshot = table.currentSnapshot(); - Set secondSnapshotManifests = Sets.newHashSet(secondSnapshot.allManifests(table.io())); + Set secondSnapshotManifests = + Sets.newHashSet(secondSnapshot.allManifests(table.io())); secondSnapshotManifests.removeAll(firstSnapshot.allManifests(table.io())); - Assert.assertEquals("Should add one new manifest for append", 1, secondSnapshotManifests.size()); + Assert.assertEquals( + "Should add one new manifest for append", 1, secondSnapshotManifests.size()); - table.manageSnapshots() - .rollbackTo(firstSnapshot.snapshotId()) - .commit(); + table.manageSnapshots().rollbackTo(firstSnapshot.snapshotId()).commit(); long tAfterCommits = rightAfterSnapshot(secondSnapshot.snapshotId()); @@ -974,19 +1025,29 @@ public void testExpireOlderThanWithRollbackAndMergedManifests() { Set deletedFiles = Sets.newHashSet(); - ExpireSnapshots.Result result = SparkActions.get().expireSnapshots(table) - .expireOlderThan(tAfterCommits) - .deleteWith(deletedFiles::add) - .execute(); - - Assert.assertEquals("Expire should not change current snapshot", snapshotId, table.currentSnapshot().snapshotId()); - Assert.assertNotNull("Expire should keep the oldest snapshot, current", table.snapshot(firstSnapshot.snapshotId())); - Assert.assertNull("Expire should remove the orphaned snapshot", table.snapshot(secondSnapshot.snapshotId())); - - Assert.assertEquals("Should remove expired manifest lists and reverted appended data file", + ExpireSnapshots.Result result = + SparkActions.get() + .expireSnapshots(table) + .expireOlderThan(tAfterCommits) + .deleteWith(deletedFiles::add) + .execute(); + + Assert.assertEquals( + "Expire should not change current snapshot", + snapshotId, + table.currentSnapshot().snapshotId()); + Assert.assertNotNull( + "Expire should keep the oldest snapshot, current", + table.snapshot(firstSnapshot.snapshotId())); + Assert.assertNull( + "Expire should remove the orphaned snapshot", table.snapshot(secondSnapshot.snapshotId())); + + Assert.assertEquals( + "Should remove expired manifest lists and reverted appended data file", Sets.newHashSet( secondSnapshot.manifestListLocation(), // snapshot expired - Iterables.getOnlyElement(secondSnapshotManifests).path(), // manifest is no longer referenced + Iterables.getOnlyElement(secondSnapshotManifests) + .path(), // manifest is no longer referenced FILE_B.path()), // added, but rolled back deletedFiles); @@ -995,68 +1056,65 @@ public void testExpireOlderThanWithRollbackAndMergedManifests() { @Test public void testExpireOlderThanWithDeleteFile() { - table.updateProperties() + table + .updateProperties() .set(TableProperties.FORMAT_VERSION, "2") .set(TableProperties.MANIFEST_MERGE_ENABLED, "false") .commit(); // Add Data File - table.newAppend() - .appendFile(FILE_A) - .commit(); + table.newAppend().appendFile(FILE_A).commit(); Snapshot firstSnapshot = table.currentSnapshot(); // Add POS Delete - table.newRowDelta() - .addDeletes(FILE_A_POS_DELETES) - .commit(); + table.newRowDelta().addDeletes(FILE_A_POS_DELETES).commit(); Snapshot secondSnapshot = table.currentSnapshot(); // Add EQ Delete 
- table.newRowDelta() - .addDeletes(FILE_A_EQ_DELETES) - .commit(); + table.newRowDelta().addDeletes(FILE_A_EQ_DELETES).commit(); Snapshot thirdSnapshot = table.currentSnapshot(); // Move files to DELETED - table.newDelete() - .deleteFromRowFilter(Expressions.alwaysTrue()) - .commit(); + table.newDelete().deleteFromRowFilter(Expressions.alwaysTrue()).commit(); Snapshot fourthSnapshot = table.currentSnapshot(); long afterAllDeleted = rightAfterSnapshot(); - table.newAppend() - .appendFile(FILE_B) - .commit(); + table.newAppend().appendFile(FILE_B).commit(); Set deletedFiles = Sets.newHashSet(); - ExpireSnapshots.Result result = SparkActions.get().expireSnapshots(table) - .expireOlderThan(afterAllDeleted) - .deleteWith(deletedFiles::add) - .execute(); + ExpireSnapshots.Result result = + SparkActions.get() + .expireSnapshots(table) + .expireOlderThan(afterAllDeleted) + .deleteWith(deletedFiles::add) + .execute(); - Set expectedDeletes = Sets.newHashSet( - firstSnapshot.manifestListLocation(), - secondSnapshot.manifestListLocation(), - thirdSnapshot.manifestListLocation(), - fourthSnapshot.manifestListLocation(), - FILE_A.path().toString(), - FILE_A_POS_DELETES.path().toString(), - FILE_A_EQ_DELETES.path().toString()); + Set expectedDeletes = + Sets.newHashSet( + firstSnapshot.manifestListLocation(), + secondSnapshot.manifestListLocation(), + thirdSnapshot.manifestListLocation(), + fourthSnapshot.manifestListLocation(), + FILE_A.path().toString(), + FILE_A_POS_DELETES.path().toString(), + FILE_A_EQ_DELETES.path().toString()); expectedDeletes.addAll( thirdSnapshot.allManifests(table.io()).stream() .map(ManifestFile::path) - .map(CharSequence::toString).collect(Collectors.toSet())); + .map(CharSequence::toString) + .collect(Collectors.toSet())); // Delete operation (fourth snapshot) generates new manifest files expectedDeletes.addAll( fourthSnapshot.allManifests(table.io()).stream() .map(ManifestFile::path) - .map(CharSequence::toString).collect(Collectors.toSet())); + .map(CharSequence::toString) + .collect(Collectors.toSet())); - Assert.assertEquals("Should remove expired manifest lists and deleted data file", + Assert.assertEquals( + "Should remove expired manifest lists and deleted data file", expectedDeletes, deletedFiles); @@ -1068,27 +1126,25 @@ public void testExpireOnEmptyTable() { Set deletedFiles = Sets.newHashSet(); // table has no data, testing ExpireSnapshots should not fail with no snapshot - ExpireSnapshots.Result result = SparkActions.get().expireSnapshots(table) - .expireOlderThan(System.currentTimeMillis()) - .deleteWith(deletedFiles::add) - .execute(); + ExpireSnapshots.Result result = + SparkActions.get() + .expireSnapshots(table) + .expireOlderThan(System.currentTimeMillis()) + .deleteWith(deletedFiles::add) + .execute(); checkExpirationResults(0, 0, 0, 0, 0, result); } @Test public void testExpireAction() { - table.newAppend() - .appendFile(FILE_A) - .commit(); + table.newAppend().appendFile(FILE_A).commit(); Snapshot firstSnapshot = table.currentSnapshot(); rightAfterSnapshot(); - table.newAppend() - .appendFile(FILE_B) - .commit(); + table.newAppend().appendFile(FILE_B).commit(); long snapshotId = table.currentSnapshot().snapshotId(); @@ -1096,58 +1152,67 @@ public void testExpireAction() { Set deletedFiles = Sets.newHashSet(); - ExpireSnapshotsSparkAction action = SparkActions.get().expireSnapshots(table) - .expireOlderThan(tAfterCommits) - .deleteWith(deletedFiles::add); + ExpireSnapshotsSparkAction action = + SparkActions.get() + .expireSnapshots(table) + 
.expireOlderThan(tAfterCommits) + .deleteWith(deletedFiles::add); Dataset pendingDeletes = action.expire(); List pending = pendingDeletes.collectAsList(); - Assert.assertEquals("Should not change current snapshot", snapshotId, table.currentSnapshot().snapshotId()); - Assert.assertNull("Should remove the oldest snapshot", table.snapshot(firstSnapshot.snapshotId())); + Assert.assertEquals( + "Should not change current snapshot", snapshotId, table.currentSnapshot().snapshotId()); + Assert.assertNull( + "Should remove the oldest snapshot", table.snapshot(firstSnapshot.snapshotId())); Assert.assertEquals("Pending deletes should contain one row", 1, pending.size()); - Assert.assertEquals("Pending delete should be the expired manifest list location", - firstSnapshot.manifestListLocation(), pending.get(0).getString(0)); - Assert.assertEquals("Pending delete should be a manifest list", - "Manifest List", pending.get(0).getString(1)); + Assert.assertEquals( + "Pending delete should be the expired manifest list location", + firstSnapshot.manifestListLocation(), + pending.get(0).getString(0)); + Assert.assertEquals( + "Pending delete should be a manifest list", "Manifest List", pending.get(0).getString(1)); Assert.assertEquals("Should not delete any files", 0, deletedFiles.size()); - Assert.assertSame("Multiple calls to expire should return the same deleted files", - pendingDeletes, action.expire()); + Assert.assertSame( + "Multiple calls to expire should return the same deleted files", + pendingDeletes, + action.expire()); } @Test public void testUseLocalIterator() { - table.newFastAppend() - .appendFile(FILE_A) - .commit(); + table.newFastAppend().appendFile(FILE_A).commit(); - table.newOverwrite() - .deleteFile(FILE_A) - .addFile(FILE_B) - .commit(); + table.newOverwrite().deleteFile(FILE_A).addFile(FILE_B).commit(); - table.newFastAppend() - .appendFile(FILE_C) - .commit(); + table.newFastAppend().appendFile(FILE_C).commit(); long end = rightAfterSnapshot(); int jobsBeforeStreamResults = spark.sparkContext().dagScheduler().nextJobId().get(); - withSQLConf(ImmutableMap.of("spark.sql.adaptive.enabled", "false"), () -> { - ExpireSnapshots.Result results = SparkActions.get().expireSnapshots(table).expireOlderThan(end) - .option("stream-results", "true").execute(); - - int jobsAfterStreamResults = spark.sparkContext().dagScheduler().nextJobId().get(); - int jobsRunDuringStreamResults = jobsAfterStreamResults - jobsBeforeStreamResults; - - checkExpirationResults(1L, 0L, 0L, 1L, 2L, results); - - Assert.assertEquals("Expected total number of jobs with stream-results should match the expected number", - 4L, jobsRunDuringStreamResults); - }); + withSQLConf( + ImmutableMap.of("spark.sql.adaptive.enabled", "false"), + () -> { + ExpireSnapshots.Result results = + SparkActions.get() + .expireSnapshots(table) + .expireOlderThan(end) + .option("stream-results", "true") + .execute(); + + int jobsAfterStreamResults = spark.sparkContext().dagScheduler().nextJobId().get(); + int jobsRunDuringStreamResults = jobsAfterStreamResults - jobsBeforeStreamResults; + + checkExpirationResults(1L, 0L, 0L, 1L, 2L, results); + + Assert.assertEquals( + "Expected total number of jobs with stream-results should match the expected number", + 4L, + jobsRunDuringStreamResults); + }); } } diff --git a/spark/v3.2/spark/src/test/java/org/apache/iceberg/spark/actions/TestRemoveOrphanFilesAction.java b/spark/v3.2/spark/src/test/java/org/apache/iceberg/spark/actions/TestRemoveOrphanFilesAction.java index e7240edda669..20f995731e20 100644 
--- a/spark/v3.2/spark/src/test/java/org/apache/iceberg/spark/actions/TestRemoveOrphanFilesAction.java +++ b/spark/v3.2/spark/src/test/java/org/apache/iceberg/spark/actions/TestRemoveOrphanFilesAction.java @@ -16,9 +16,10 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.actions; +import static org.apache.iceberg.types.Types.NestedField.optional; + import java.io.File; import java.io.IOException; import java.sql.Timestamp; @@ -70,23 +71,18 @@ import org.junit.Test; import org.junit.rules.TemporaryFolder; -import static org.apache.iceberg.types.Types.NestedField.optional; - public abstract class TestRemoveOrphanFilesAction extends SparkTestBase { private static final HadoopTables TABLES = new HadoopTables(new Configuration()); - protected static final Schema SCHEMA = new Schema( - optional(1, "c1", Types.IntegerType.get()), - optional(2, "c2", Types.StringType.get()), - optional(3, "c3", Types.StringType.get()) - ); - protected static final PartitionSpec SPEC = PartitionSpec.builderFor(SCHEMA) - .truncate("c2", 2) - .identity("c3") - .build(); - - @Rule - public TemporaryFolder temp = new TemporaryFolder(); + protected static final Schema SCHEMA = + new Schema( + optional(1, "c1", Types.IntegerType.get()), + optional(2, "c2", Types.StringType.get()), + optional(3, "c3", Types.StringType.get())); + protected static final PartitionSpec SPEC = + PartitionSpec.builderFor(SCHEMA).truncate("c2", 2).identity("c3").build(); + + @Rule public TemporaryFolder temp = new TemporaryFolder(); private File tableDir = null; protected String tableLocation = null; @@ -98,41 +94,37 @@ public void setupTableLocation() throws Exception { @Test public void testDryRun() throws IOException, InterruptedException { - Table table = TABLES.create(SCHEMA, PartitionSpec.unpartitioned(), Maps.newHashMap(), tableLocation); + Table table = + TABLES.create(SCHEMA, PartitionSpec.unpartitioned(), Maps.newHashMap(), tableLocation); - List records = Lists.newArrayList( - new ThreeColumnRecord(1, "AAAAAAAAAA", "AAAA") - ); + List records = + Lists.newArrayList(new ThreeColumnRecord(1, "AAAAAAAAAA", "AAAA")); Dataset df = spark.createDataFrame(records, ThreeColumnRecord.class).coalesce(1); - df.select("c1", "c2", "c3") - .write() - .format("iceberg") - .mode("append") - .save(tableLocation); + df.select("c1", "c2", "c3").write().format("iceberg").mode("append").save(tableLocation); - df.select("c1", "c2", "c3") - .write() - .format("iceberg") - .mode("append") - .save(tableLocation); + df.select("c1", "c2", "c3").write().format("iceberg").mode("append").save(tableLocation); - List validFiles = spark.read().format("iceberg") - .load(tableLocation + "#files") - .select("file_path") - .as(Encoders.STRING()) - .collectAsList(); + List validFiles = + spark + .read() + .format("iceberg") + .load(tableLocation + "#files") + .select("file_path") + .as(Encoders.STRING()) + .collectAsList(); Assert.assertEquals("Should be 2 valid files", 2, validFiles.size()); df.write().mode("append").parquet(tableLocation + "/data"); Path dataPath = new Path(tableLocation + "/data"); FileSystem fs = dataPath.getFileSystem(spark.sessionState().newHadoopConf()); - List allFiles = Arrays.stream(fs.listStatus(dataPath, HiddenPathFilter.get())) - .filter(FileStatus::isFile) - .map(file -> file.getPath().toString()) - .collect(Collectors.toList()); + List allFiles = + Arrays.stream(fs.listStatus(dataPath, HiddenPathFilter.get())) + .filter(FileStatus::isFile) + .map(file -> 
file.getPath().toString()) + .collect(Collectors.toList()); Assert.assertEquals("Should be 3 files", 3, allFiles.size()); List invalidFiles = Lists.newArrayList(allFiles); @@ -143,32 +135,34 @@ public void testDryRun() throws IOException, InterruptedException { SparkActions actions = SparkActions.get(); - DeleteOrphanFiles.Result result1 = actions.deleteOrphanFiles(table) - .deleteWith(s -> { }) - .execute(); - Assert.assertTrue("Default olderThan interval should be safe", Iterables.isEmpty(result1.orphanFileLocations())); + DeleteOrphanFiles.Result result1 = + actions.deleteOrphanFiles(table).deleteWith(s -> {}).execute(); + Assert.assertTrue( + "Default olderThan interval should be safe", + Iterables.isEmpty(result1.orphanFileLocations())); - DeleteOrphanFiles.Result result2 = actions.deleteOrphanFiles(table) - .olderThan(System.currentTimeMillis()) - .deleteWith(s -> { }) - .execute(); + DeleteOrphanFiles.Result result2 = + actions + .deleteOrphanFiles(table) + .olderThan(System.currentTimeMillis()) + .deleteWith(s -> {}) + .execute(); Assert.assertEquals("Action should find 1 file", invalidFiles, result2.orphanFileLocations()); Assert.assertTrue("Invalid file should be present", fs.exists(new Path(invalidFiles.get(0)))); - DeleteOrphanFiles.Result result3 = actions.deleteOrphanFiles(table) - .olderThan(System.currentTimeMillis()) - .execute(); + DeleteOrphanFiles.Result result3 = + actions.deleteOrphanFiles(table).olderThan(System.currentTimeMillis()).execute(); Assert.assertEquals("Action should delete 1 file", invalidFiles, result3.orphanFileLocations()); - Assert.assertFalse("Invalid file should not be present", fs.exists(new Path(invalidFiles.get(0)))); + Assert.assertFalse( + "Invalid file should not be present", fs.exists(new Path(invalidFiles.get(0)))); List expectedRecords = Lists.newArrayList(); expectedRecords.addAll(records); expectedRecords.addAll(records); Dataset resultDF = spark.read().format("iceberg").load(tableLocation); - List actualRecords = resultDF - .as(Encoders.bean(ThreeColumnRecord.class)) - .collectAsList(); + List actualRecords = + resultDF.as(Encoders.bean(ThreeColumnRecord.class)).collectAsList(); Assert.assertEquals("Rows must match", expectedRecords, actualRecords); } @@ -176,36 +170,22 @@ public void testDryRun() throws IOException, InterruptedException { public void testAllValidFilesAreKept() throws IOException, InterruptedException { Table table = TABLES.create(SCHEMA, SPEC, Maps.newHashMap(), tableLocation); - List records1 = Lists.newArrayList( - new ThreeColumnRecord(1, "AAAAAAAAAA", "AAAA") - ); + List records1 = + Lists.newArrayList(new ThreeColumnRecord(1, "AAAAAAAAAA", "AAAA")); Dataset df1 = spark.createDataFrame(records1, ThreeColumnRecord.class).coalesce(1); // original append - df1.select("c1", "c2", "c3") - .write() - .format("iceberg") - .mode("append") - .save(tableLocation); + df1.select("c1", "c2", "c3").write().format("iceberg").mode("append").save(tableLocation); - List records2 = Lists.newArrayList( - new ThreeColumnRecord(2, "AAAAAAAAAA", "AAAA") - ); + List records2 = + Lists.newArrayList(new ThreeColumnRecord(2, "AAAAAAAAAA", "AAAA")); Dataset df2 = spark.createDataFrame(records2, ThreeColumnRecord.class).coalesce(1); // dynamic partition overwrite - df2.select("c1", "c2", "c3") - .write() - .format("iceberg") - .mode("overwrite") - .save(tableLocation); + df2.select("c1", "c2", "c3").write().format("iceberg").mode("overwrite").save(tableLocation); // second append - df2.select("c1", "c2", "c3") - .write() - .format("iceberg") - 
.mode("append") - .save(tableLocation); + df2.select("c1", "c2", "c3").write().format("iceberg").mode("append").save(tableLocation); List snapshots = Lists.newArrayList(table.snapshots()); @@ -227,9 +207,8 @@ public void testAllValidFilesAreKept() throws IOException, InterruptedException SparkActions actions = SparkActions.get(); - DeleteOrphanFiles.Result result = actions.deleteOrphanFiles(table) - .olderThan(System.currentTimeMillis()) - .execute(); + DeleteOrphanFiles.Result result = + actions.deleteOrphanFiles(table).olderThan(System.currentTimeMillis()).execute(); Assert.assertEquals("Should delete 4 files", 4, Iterables.size(result.orphanFileLocations())); @@ -253,36 +232,22 @@ public void testAllValidFilesAreKept() throws IOException, InterruptedException public void orphanedFileRemovedWithParallelTasks() throws InterruptedException, IOException { Table table = TABLES.create(SCHEMA, SPEC, Maps.newHashMap(), tableLocation); - List records1 = Lists.newArrayList( - new ThreeColumnRecord(1, "AAAAAAAAAA", "AAAA") - ); + List records1 = + Lists.newArrayList(new ThreeColumnRecord(1, "AAAAAAAAAA", "AAAA")); Dataset df1 = spark.createDataFrame(records1, ThreeColumnRecord.class).coalesce(1); // original append - df1.select("c1", "c2", "c3") - .write() - .format("iceberg") - .mode("append") - .save(tableLocation); + df1.select("c1", "c2", "c3").write().format("iceberg").mode("append").save(tableLocation); - List records2 = Lists.newArrayList( - new ThreeColumnRecord(2, "AAAAAAAAAA", "AAAA") - ); + List records2 = + Lists.newArrayList(new ThreeColumnRecord(2, "AAAAAAAAAA", "AAAA")); Dataset df2 = spark.createDataFrame(records2, ThreeColumnRecord.class).coalesce(1); // dynamic partition overwrite - df2.select("c1", "c2", "c3") - .write() - .format("iceberg") - .mode("overwrite") - .save(tableLocation); + df2.select("c1", "c2", "c3").write().format("iceberg").mode("overwrite").save(tableLocation); // second append - df2.select("c1", "c2", "c3") - .write() - .format("iceberg") - .mode("append") - .save(tableLocation); + df2.select("c1", "c2", "c3").write().format("iceberg").mode("append").save(tableLocation); df2.coalesce(1).write().mode("append").parquet(tableLocation + "/data"); df2.coalesce(1).write().mode("append").parquet(tableLocation + "/data/c2_trunc=AA"); @@ -295,25 +260,34 @@ public void orphanedFileRemovedWithParallelTasks() throws InterruptedException, Set deleteThreads = ConcurrentHashMap.newKeySet(); AtomicInteger deleteThreadsIndex = new AtomicInteger(0); - ExecutorService executorService = Executors.newFixedThreadPool(4, runnable -> { - Thread thread = new Thread(runnable); - thread.setName("remove-orphan-" + deleteThreadsIndex.getAndIncrement()); - thread.setDaemon(true); - return thread; - }); - - DeleteOrphanFiles.Result result = SparkActions.get().deleteOrphanFiles(table) + ExecutorService executorService = + Executors.newFixedThreadPool( + 4, + runnable -> { + Thread thread = new Thread(runnable); + thread.setName("remove-orphan-" + deleteThreadsIndex.getAndIncrement()); + thread.setDaemon(true); + return thread; + }); + + DeleteOrphanFiles.Result result = + SparkActions.get() + .deleteOrphanFiles(table) .executeDeleteWith(executorService) - .olderThan(System.currentTimeMillis() + 5000) // Ensure all orphan files are selected - .deleteWith(file -> { - deleteThreads.add(Thread.currentThread().getName()); - deletedFiles.add(file); - }) + .olderThan(System.currentTimeMillis() + 5000) // Ensure all orphan files are selected + .deleteWith( + file -> { + 
deleteThreads.add(Thread.currentThread().getName()); + deletedFiles.add(file); + }) .execute(); - // Verifies that the delete methods ran in the threads created by the provided ExecutorService ThreadFactory - Assert.assertEquals(deleteThreads, - Sets.newHashSet("remove-orphan-0", "remove-orphan-1", "remove-orphan-2", "remove-orphan-3")); + // Verifies that the delete methods ran in the threads created by the provided ExecutorService + // ThreadFactory + Assert.assertEquals( + deleteThreads, + Sets.newHashSet( + "remove-orphan-0", "remove-orphan-1", "remove-orphan-2", "remove-orphan-3")); Assert.assertEquals("Should delete 4 files", 4, deletedFiles.size()); } @@ -324,42 +298,32 @@ public void testWapFilesAreKept() throws InterruptedException { props.put(TableProperties.WRITE_AUDIT_PUBLISH_ENABLED, "true"); Table table = TABLES.create(SCHEMA, SPEC, props, tableLocation); - List records = Lists.newArrayList( - new ThreeColumnRecord(1, "AAAAAAAAAA", "AAAA") - ); + List records = + Lists.newArrayList(new ThreeColumnRecord(1, "AAAAAAAAAA", "AAAA")); Dataset df = spark.createDataFrame(records, ThreeColumnRecord.class); // normal write - df.select("c1", "c2", "c3") - .write() - .format("iceberg") - .mode("append") - .save(tableLocation); + df.select("c1", "c2", "c3").write().format("iceberg").mode("append").save(tableLocation); spark.conf().set("spark.wap.id", "1"); // wap write - df.select("c1", "c2", "c3") - .write() - .format("iceberg") - .mode("append") - .save(tableLocation); + df.select("c1", "c2", "c3").write().format("iceberg").mode("append").save(tableLocation); Dataset resultDF = spark.read().format("iceberg").load(tableLocation); - List actualRecords = resultDF - .as(Encoders.bean(ThreeColumnRecord.class)) - .collectAsList(); + List actualRecords = + resultDF.as(Encoders.bean(ThreeColumnRecord.class)).collectAsList(); Assert.assertEquals("Should not return data from the staged snapshot", records, actualRecords); waitUntilAfter(System.currentTimeMillis()); SparkActions actions = SparkActions.get(); - DeleteOrphanFiles.Result result = actions.deleteOrphanFiles(table) - .olderThan(System.currentTimeMillis()) - .execute(); + DeleteOrphanFiles.Result result = + actions.deleteOrphanFiles(table).olderThan(System.currentTimeMillis()).execute(); - Assert.assertTrue("Should not delete any files", Iterables.isEmpty(result.orphanFileLocations())); + Assert.assertTrue( + "Should not delete any files", Iterables.isEmpty(result.orphanFileLocations())); } @Test @@ -369,16 +333,11 @@ public void testMetadataFolderIsIntact() throws InterruptedException { props.put(TableProperties.WRITE_DATA_LOCATION, tableLocation); Table table = TABLES.create(SCHEMA, SPEC, props, tableLocation); - List records = Lists.newArrayList( - new ThreeColumnRecord(1, "AAAAAAAAAA", "AAAA") - ); + List records = + Lists.newArrayList(new ThreeColumnRecord(1, "AAAAAAAAAA", "AAAA")); Dataset df = spark.createDataFrame(records, ThreeColumnRecord.class).coalesce(1); - df.select("c1", "c2", "c3") - .write() - .format("iceberg") - .mode("append") - .save(tableLocation); + df.select("c1", "c2", "c3").write().format("iceberg").mode("append").save(tableLocation); df.write().mode("append").parquet(tableLocation + "/c2_trunc=AA/c3=AAAA"); @@ -386,16 +345,14 @@ public void testMetadataFolderIsIntact() throws InterruptedException { SparkActions actions = SparkActions.get(); - DeleteOrphanFiles.Result result = actions.deleteOrphanFiles(table) - .olderThan(System.currentTimeMillis()) - .execute(); + DeleteOrphanFiles.Result result = + 
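The write-audit-publish test above stages an append under a wap.id and then checks that the staged snapshot's files survive orphan cleanup even though readers do not see the staged data yet. A sketch of the two knobs involved, assuming an existing table, SparkSession, and source DataFrame (column names mirror the test, everything else is illustrative):

import org.apache.iceberg.Table;
import org.apache.iceberg.TableProperties;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SparkSession;

class WapStagedWrite {
  static void stageAppend(SparkSession spark, Table table, Dataset<Row> df, String tableLocation) {
    // Enable write-audit-publish so writes tagged with a wap.id are staged rather than published.
    table.updateProperties().set(TableProperties.WRITE_AUDIT_PUBLISH_ENABLED, "true").commit();

    // While spark.wap.id is set, appends create staged snapshots that normal reads skip.
    spark.conf().set("spark.wap.id", "1");
    df.select("c1", "c2", "c3").write().format("iceberg").mode("append").save(tableLocation);

    // The staged snapshot still references its data files, so deleteOrphanFiles must keep them.
  }
}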
actions.deleteOrphanFiles(table).olderThan(System.currentTimeMillis()).execute(); Assert.assertEquals("Should delete 1 file", 1, Iterables.size(result.orphanFileLocations())); Dataset resultDF = spark.read().format("iceberg").load(tableLocation); - List actualRecords = resultDF - .as(Encoders.bean(ThreeColumnRecord.class)) - .collectAsList(); + List actualRecords = + resultDF.as(Encoders.bean(ThreeColumnRecord.class)).collectAsList(); Assert.assertEquals("Rows must match", records, actualRecords); } @@ -403,16 +360,11 @@ public void testMetadataFolderIsIntact() throws InterruptedException { public void testOlderThanTimestamp() throws InterruptedException { Table table = TABLES.create(SCHEMA, SPEC, Maps.newHashMap(), tableLocation); - List records = Lists.newArrayList( - new ThreeColumnRecord(1, "AAAAAAAAAA", "AAAA") - ); + List records = + Lists.newArrayList(new ThreeColumnRecord(1, "AAAAAAAAAA", "AAAA")); Dataset df = spark.createDataFrame(records, ThreeColumnRecord.class).coalesce(1); - df.select("c1", "c2", "c3") - .write() - .format("iceberg") - .mode("append") - .save(tableLocation); + df.select("c1", "c2", "c3").write().format("iceberg").mode("append").save(tableLocation); df.write().mode("append").parquet(tableLocation + "/data/c2_trunc=AA/c3=AAAA"); df.write().mode("append").parquet(tableLocation + "/data/c2_trunc=AA/c3=AAAA"); @@ -427,11 +379,11 @@ public void testOlderThanTimestamp() throws InterruptedException { SparkActions actions = SparkActions.get(); - DeleteOrphanFiles.Result result = actions.deleteOrphanFiles(table) - .olderThan(timestamp) - .execute(); + DeleteOrphanFiles.Result result = + actions.deleteOrphanFiles(table).olderThan(timestamp).execute(); - Assert.assertEquals("Should delete only 2 files", 2, Iterables.size(result.orphanFileLocations())); + Assert.assertEquals( + "Should delete only 2 files", 2, Iterables.size(result.orphanFileLocations())); } @Test @@ -441,33 +393,25 @@ public void testRemoveUnreachableMetadataVersionFiles() throws InterruptedExcept props.put(TableProperties.METADATA_PREVIOUS_VERSIONS_MAX, "1"); Table table = TABLES.create(SCHEMA, SPEC, props, tableLocation); - List records = Lists.newArrayList( - new ThreeColumnRecord(1, "AAAAAAAAAA", "AAAA") - ); + List records = + Lists.newArrayList(new ThreeColumnRecord(1, "AAAAAAAAAA", "AAAA")); Dataset df = spark.createDataFrame(records, ThreeColumnRecord.class); - df.select("c1", "c2", "c3") - .write() - .format("iceberg") - .mode("append") - .save(tableLocation); + df.select("c1", "c2", "c3").write().format("iceberg").mode("append").save(tableLocation); - df.select("c1", "c2", "c3") - .write() - .format("iceberg") - .mode("append") - .save(tableLocation); + df.select("c1", "c2", "c3").write().format("iceberg").mode("append").save(tableLocation); waitUntilAfter(System.currentTimeMillis()); SparkActions actions = SparkActions.get(); - DeleteOrphanFiles.Result result = actions.deleteOrphanFiles(table) - .olderThan(System.currentTimeMillis()) - .execute(); + DeleteOrphanFiles.Result result = + actions.deleteOrphanFiles(table).olderThan(System.currentTimeMillis()).execute(); Assert.assertEquals("Should delete 1 file", 1, Iterables.size(result.orphanFileLocations())); - Assert.assertTrue("Should remove v1 file", StreamSupport.stream(result.orphanFileLocations().spliterator(), false) + Assert.assertTrue( + "Should remove v1 file", + StreamSupport.stream(result.orphanFileLocations().spliterator(), false) .anyMatch(file -> file.contains("v1.metadata.json"))); List expectedRecords = Lists.newArrayList(); @@ 
-475,9 +419,8 @@ public void testRemoveUnreachableMetadataVersionFiles() throws InterruptedExcept expectedRecords.addAll(records); Dataset resultDF = spark.read().format("iceberg").load(tableLocation); - List actualRecords = resultDF - .as(Encoders.bean(ThreeColumnRecord.class)) - .collectAsList(); + List actualRecords = + resultDF.as(Encoders.bean(ThreeColumnRecord.class)).collectAsList(); Assert.assertEquals("Rows must match", expectedRecords, actualRecords); } @@ -492,26 +435,21 @@ public void testManyTopLevelPartitions() throws InterruptedException { Dataset df = spark.createDataFrame(records, ThreeColumnRecord.class); - df.select("c1", "c2", "c3") - .write() - .format("iceberg") - .mode("append") - .save(tableLocation); + df.select("c1", "c2", "c3").write().format("iceberg").mode("append").save(tableLocation); waitUntilAfter(System.currentTimeMillis()); SparkActions actions = SparkActions.get(); - DeleteOrphanFiles.Result result = actions.deleteOrphanFiles(table) - .olderThan(System.currentTimeMillis()) - .execute(); + DeleteOrphanFiles.Result result = + actions.deleteOrphanFiles(table).olderThan(System.currentTimeMillis()).execute(); - Assert.assertTrue("Should not delete any files", Iterables.isEmpty(result.orphanFileLocations())); + Assert.assertTrue( + "Should not delete any files", Iterables.isEmpty(result.orphanFileLocations())); Dataset resultDF = spark.read().format("iceberg").load(tableLocation); - List actualRecords = resultDF - .as(Encoders.bean(ThreeColumnRecord.class)) - .collectAsList(); + List actualRecords = + resultDF.as(Encoders.bean(ThreeColumnRecord.class)).collectAsList(); Assert.assertEquals("Rows must match", records, actualRecords); } @@ -526,56 +464,43 @@ public void testManyLeafPartitions() throws InterruptedException { Dataset df = spark.createDataFrame(records, ThreeColumnRecord.class); - df.select("c1", "c2", "c3") - .write() - .format("iceberg") - .mode("append") - .save(tableLocation); + df.select("c1", "c2", "c3").write().format("iceberg").mode("append").save(tableLocation); waitUntilAfter(System.currentTimeMillis()); SparkActions actions = SparkActions.get(); - DeleteOrphanFiles.Result result = actions.deleteOrphanFiles(table) - .olderThan(System.currentTimeMillis()) - .execute(); + DeleteOrphanFiles.Result result = + actions.deleteOrphanFiles(table).olderThan(System.currentTimeMillis()).execute(); - Assert.assertTrue("Should not delete any files", Iterables.isEmpty(result.orphanFileLocations())); + Assert.assertTrue( + "Should not delete any files", Iterables.isEmpty(result.orphanFileLocations())); Dataset resultDF = spark.read().format("iceberg").load(tableLocation); - List actualRecords = resultDF - .as(Encoders.bean(ThreeColumnRecord.class)) - .collectAsList(); + List actualRecords = + resultDF.as(Encoders.bean(ThreeColumnRecord.class)).collectAsList(); Assert.assertEquals("Rows must match", records, actualRecords); } @Test public void testHiddenPartitionPaths() throws InterruptedException { - Schema schema = new Schema( - optional(1, "c1", Types.IntegerType.get()), - optional(2, "_c2", Types.StringType.get()), - optional(3, "c3", Types.StringType.get()) - ); - PartitionSpec spec = PartitionSpec.builderFor(schema) - .truncate("_c2", 2) - .identity("c3") - .build(); + Schema schema = + new Schema( + optional(1, "c1", Types.IntegerType.get()), + optional(2, "_c2", Types.StringType.get()), + optional(3, "c3", Types.StringType.get())); + PartitionSpec spec = PartitionSpec.builderFor(schema).truncate("_c2", 2).identity("c3").build(); Table table = 
TABLES.create(schema, spec, Maps.newHashMap(), tableLocation); - StructType structType = new StructType() - .add("c1", DataTypes.IntegerType) - .add("_c2", DataTypes.StringType) - .add("c3", DataTypes.StringType); - List records = Lists.newArrayList( - RowFactory.create(1, "AAAAAAAAAA", "AAAA") - ); + StructType structType = + new StructType() + .add("c1", DataTypes.IntegerType) + .add("_c2", DataTypes.StringType) + .add("c3", DataTypes.StringType); + List records = Lists.newArrayList(RowFactory.create(1, "AAAAAAAAAA", "AAAA")); Dataset df = spark.createDataFrame(records, structType).coalesce(1); - df.select("c1", "_c2", "c3") - .write() - .format("iceberg") - .mode("append") - .save(tableLocation); + df.select("c1", "_c2", "c3").write().format("iceberg").mode("append").save(tableLocation); df.write().mode("append").parquet(tableLocation + "/data/_c2_trunc=AA/c3=AAAA"); df.write().mode("append").parquet(tableLocation + "/data/_c2_trunc=AA/c3=AAAA"); @@ -584,45 +509,35 @@ public void testHiddenPartitionPaths() throws InterruptedException { SparkActions actions = SparkActions.get(); - DeleteOrphanFiles.Result result = actions.deleteOrphanFiles(table) - .olderThan(System.currentTimeMillis()) - .execute(); + DeleteOrphanFiles.Result result = + actions.deleteOrphanFiles(table).olderThan(System.currentTimeMillis()).execute(); Assert.assertEquals("Should delete 2 files", 2, Iterables.size(result.orphanFileLocations())); } @Test public void testHiddenPartitionPathsWithPartitionEvolution() throws InterruptedException { - Schema schema = new Schema( - optional(1, "_c1", Types.IntegerType.get()), - optional(2, "_c2", Types.StringType.get()), - optional(3, "c3", Types.StringType.get()) - ); - PartitionSpec spec = PartitionSpec.builderFor(schema) - .truncate("_c2", 2) - .build(); + Schema schema = + new Schema( + optional(1, "_c1", Types.IntegerType.get()), + optional(2, "_c2", Types.StringType.get()), + optional(3, "c3", Types.StringType.get())); + PartitionSpec spec = PartitionSpec.builderFor(schema).truncate("_c2", 2).build(); Table table = TABLES.create(schema, spec, Maps.newHashMap(), tableLocation); - StructType structType = new StructType() - .add("_c1", DataTypes.IntegerType) - .add("_c2", DataTypes.StringType) - .add("c3", DataTypes.StringType); - List records = Lists.newArrayList( - RowFactory.create(1, "AAAAAAAAAA", "AAAA") - ); + StructType structType = + new StructType() + .add("_c1", DataTypes.IntegerType) + .add("_c2", DataTypes.StringType) + .add("c3", DataTypes.StringType); + List records = Lists.newArrayList(RowFactory.create(1, "AAAAAAAAAA", "AAAA")); Dataset df = spark.createDataFrame(records, structType).coalesce(1); - df.select("_c1", "_c2", "c3") - .write() - .format("iceberg") - .mode("append") - .save(tableLocation); + df.select("_c1", "_c2", "c3").write().format("iceberg").mode("append").save(tableLocation); df.write().mode("append").parquet(tableLocation + "/data/_c2_trunc=AA"); - table.updateSpec() - .addField("_c1") - .commit(); + table.updateSpec().addField("_c1").commit(); df.write().mode("append").parquet(tableLocation + "/data/_c2_trunc=AA/_c1=1"); @@ -630,40 +545,32 @@ public void testHiddenPartitionPathsWithPartitionEvolution() throws InterruptedE SparkActions actions = SparkActions.get(); - DeleteOrphanFiles.Result result = actions.deleteOrphanFiles(table) - .olderThan(System.currentTimeMillis()) - .execute(); + DeleteOrphanFiles.Result result = + actions.deleteOrphanFiles(table).olderThan(System.currentTimeMillis()).execute(); Assert.assertEquals("Should delete 2 
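The hidden-partition tests above build their specs from transformed columns and then evolve the spec mid-test; the resulting directory names (for example _c2_trunc=AA and _c1=1) are what the action must recognize as table-owned paths. A compact sketch of those two steps (field names mirror the tests, the class name is illustrative):

import static org.apache.iceberg.types.Types.NestedField.optional;

import org.apache.iceberg.PartitionSpec;
import org.apache.iceberg.Schema;
import org.apache.iceberg.Table;
import org.apache.iceberg.types.Types;

class HiddenPartitioningSketch {
  static final Schema SCHEMA =
      new Schema(
          optional(1, "_c1", Types.IntegerType.get()),
          optional(2, "_c2", Types.StringType.get()),
          optional(3, "c3", Types.StringType.get()));

  // Partition by the first two characters of _c2; data lands under .../data/_c2_trunc=AA/...
  static final PartitionSpec SPEC = PartitionSpec.builderFor(SCHEMA).truncate("_c2", 2).build();

  // Partition evolution: new files are written under .../_c2_trunc=AA/_c1=1/... afterwards.
  static void evolve(Table table) {
    table.updateSpec().addField("_c1").commit();
  }
}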
files", 2, Iterables.size(result.orphanFileLocations())); } @Test - public void testHiddenPathsStartingWithPartitionNamesAreIgnored() throws InterruptedException, IOException { - Schema schema = new Schema( - optional(1, "c1", Types.IntegerType.get()), - optional(2, "_c2", Types.StringType.get()), - optional(3, "c3", Types.StringType.get()) - ); - PartitionSpec spec = PartitionSpec.builderFor(schema) - .truncate("_c2", 2) - .identity("c3") - .build(); + public void testHiddenPathsStartingWithPartitionNamesAreIgnored() + throws InterruptedException, IOException { + Schema schema = + new Schema( + optional(1, "c1", Types.IntegerType.get()), + optional(2, "_c2", Types.StringType.get()), + optional(3, "c3", Types.StringType.get())); + PartitionSpec spec = PartitionSpec.builderFor(schema).truncate("_c2", 2).identity("c3").build(); Table table = TABLES.create(schema, spec, Maps.newHashMap(), tableLocation); - StructType structType = new StructType() - .add("c1", DataTypes.IntegerType) - .add("_c2", DataTypes.StringType) - .add("c3", DataTypes.StringType); - List records = Lists.newArrayList( - RowFactory.create(1, "AAAAAAAAAA", "AAAA") - ); + StructType structType = + new StructType() + .add("c1", DataTypes.IntegerType) + .add("_c2", DataTypes.StringType) + .add("c3", DataTypes.StringType); + List records = Lists.newArrayList(RowFactory.create(1, "AAAAAAAAAA", "AAAA")); Dataset df = spark.createDataFrame(records, structType).coalesce(1); - df.select("c1", "_c2", "c3") - .write() - .format("iceberg") - .mode("append") - .save(tableLocation); + df.select("c1", "_c2", "c3").write().format("iceberg").mode("append").save(tableLocation); Path dataPath = new Path(tableLocation + "/data"); FileSystem fs = dataPath.getFileSystem(spark.sessionState().newHadoopConf()); @@ -674,16 +581,17 @@ public void testHiddenPathsStartingWithPartitionNamesAreIgnored() throws Interru SparkActions actions = SparkActions.get(); - DeleteOrphanFiles.Result result = actions.deleteOrphanFiles(table) - .olderThan(System.currentTimeMillis()) - .execute(); + DeleteOrphanFiles.Result result = + actions.deleteOrphanFiles(table).olderThan(System.currentTimeMillis()).execute(); Assert.assertEquals("Should delete 0 files", 0, Iterables.size(result.orphanFileLocations())); Assert.assertTrue(fs.exists(pathToFileInHiddenFolder)); } private List snapshotFiles(long snapshotId) { - return spark.read().format("iceberg") + return spark + .read() + .format("iceberg") .option("snapshot-id", snapshotId) .load(tableLocation + "#files") .select("file_path") @@ -693,11 +601,12 @@ private List snapshotFiles(long snapshotId) { @Test public void testRemoveOrphanFilesWithRelativeFilePath() throws IOException, InterruptedException { - Table table = TABLES.create(SCHEMA, PartitionSpec.unpartitioned(), Maps.newHashMap(), tableDir.getAbsolutePath()); + Table table = + TABLES.create( + SCHEMA, PartitionSpec.unpartitioned(), Maps.newHashMap(), tableDir.getAbsolutePath()); - List records = Lists.newArrayList( - new ThreeColumnRecord(1, "AAAAAAAAAA", "AAAA") - ); + List records = + Lists.newArrayList(new ThreeColumnRecord(1, "AAAAAAAAAA", "AAAA")); Dataset df = spark.createDataFrame(records, ThreeColumnRecord.class).coalesce(1); @@ -707,11 +616,14 @@ public void testRemoveOrphanFilesWithRelativeFilePath() throws IOException, Inte .mode("append") .save(tableDir.getAbsolutePath()); - List validFiles = spark.read().format("iceberg") - .load(tableLocation + "#files") - .select("file_path") - .as(Encoders.STRING()) - .collectAsList(); + List validFiles = + spark + 
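Both the snapshotFiles helper and the valid-file lookups above read the files metadata table through the "#files" suffix on the table location. A standalone sketch of that read (method and parameter names are illustrative):

import java.util.List;
import org.apache.spark.sql.Encoders;
import org.apache.spark.sql.SparkSession;

class ValidFilePaths {
  // Returns the file_path column of the table's files metadata table, pinned to one snapshot.
  static List<String> snapshotFilePaths(SparkSession spark, String tableLocation, long snapshotId) {
    return spark
        .read()
        .format("iceberg")
        .option("snapshot-id", snapshotId) // read the metadata table as of this snapshot
        .load(tableLocation + "#files") // "#files" selects the files metadata table
        .select("file_path")
        .as(Encoders.STRING())
        .collectAsList();
  }
}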
.read() + .format("iceberg") + .load(tableLocation + "#files") + .select("file_path") + .as(Encoders.STRING()) + .collectAsList(); Assert.assertEquals("Should be 1 valid files", 1, validFiles.size()); String validFile = validFiles.get(0); @@ -719,10 +631,11 @@ public void testRemoveOrphanFilesWithRelativeFilePath() throws IOException, Inte Path dataPath = new Path(tableLocation + "/data"); FileSystem fs = dataPath.getFileSystem(spark.sessionState().newHadoopConf()); - List allFiles = Arrays.stream(fs.listStatus(dataPath, HiddenPathFilter.get())) - .filter(FileStatus::isFile) - .map(file -> file.getPath().toString()) - .collect(Collectors.toList()); + List allFiles = + Arrays.stream(fs.listStatus(dataPath, HiddenPathFilter.get())) + .filter(FileStatus::isFile) + .map(file -> file.getPath().toString()) + .collect(Collectors.toList()); Assert.assertEquals("Should be 2 files", 2, allFiles.size()); List invalidFiles = Lists.newArrayList(allFiles); @@ -732,10 +645,12 @@ public void testRemoveOrphanFilesWithRelativeFilePath() throws IOException, Inte waitUntilAfter(System.currentTimeMillis()); SparkActions actions = SparkActions.get(); - DeleteOrphanFiles.Result result = actions.deleteOrphanFiles(table) - .olderThan(System.currentTimeMillis()) - .deleteWith(s -> { }) - .execute(); + DeleteOrphanFiles.Result result = + actions + .deleteOrphanFiles(table) + .olderThan(System.currentTimeMillis()) + .deleteWith(s -> {}) + .execute(); Assert.assertEquals("Action should find 1 file", invalidFiles, result.orphanFileLocations()); Assert.assertTrue("Invalid file should be present", fs.exists(new Path(invalidFiles.get(0)))); } @@ -748,18 +663,15 @@ public void testRemoveOrphanFilesWithHadoopCatalog() throws InterruptedException Namespace namespace = Namespace.of(namespaceName); TableIdentifier tableIdentifier = TableIdentifier.of(namespace, tableName); - Table table = catalog.createTable(tableIdentifier, SCHEMA, PartitionSpec.unpartitioned(), Maps.newHashMap()); + Table table = + catalog.createTable( + tableIdentifier, SCHEMA, PartitionSpec.unpartitioned(), Maps.newHashMap()); - List records = Lists.newArrayList( - new ThreeColumnRecord(1, "AAAAAAAAAA", "AAAA") - ); + List records = + Lists.newArrayList(new ThreeColumnRecord(1, "AAAAAAAAAA", "AAAA")); Dataset df = spark.createDataFrame(records, ThreeColumnRecord.class).coalesce(1); - df.select("c1", "c2", "c3") - .write() - .format("iceberg") - .mode("append") - .save(table.location()); + df.select("c1", "c2", "c3").write().format("iceberg").mode("append").save(table.location()); df.write().mode("append").parquet(table.location() + "/data"); @@ -767,28 +679,30 @@ public void testRemoveOrphanFilesWithHadoopCatalog() throws InterruptedException table.refresh(); - DeleteOrphanFiles.Result result = SparkActions.get() - .deleteOrphanFiles(table) - .olderThan(System.currentTimeMillis()) - .execute(); + DeleteOrphanFiles.Result result = + SparkActions.get().deleteOrphanFiles(table).olderThan(System.currentTimeMillis()).execute(); - Assert.assertEquals("Should delete only 1 files", 1, Iterables.size(result.orphanFileLocations())); + Assert.assertEquals( + "Should delete only 1 files", 1, Iterables.size(result.orphanFileLocations())); Dataset resultDF = spark.read().format("iceberg").load(table.location()); - List actualRecords = resultDF - .as(Encoders.bean(ThreeColumnRecord.class)) - .collectAsList(); + List actualRecords = + resultDF.as(Encoders.bean(ThreeColumnRecord.class)).collectAsList(); Assert.assertEquals("Rows must match", records, actualRecords); } 
@Test public void testHiveCatalogTable() throws IOException { - Table table = catalog.createTable(TableIdentifier.of("default", "hivetestorphan"), SCHEMA, SPEC, tableLocation, - Maps.newHashMap()); + Table table = + catalog.createTable( + TableIdentifier.of("default", "hivetestorphan"), + SCHEMA, + SPEC, + tableLocation, + Maps.newHashMap()); - List records = Lists.newArrayList( - new ThreeColumnRecord(1, "AAAAAAAAAA", "AAAA") - ); + List records = + Lists.newArrayList(new ThreeColumnRecord(1, "AAAAAAAAAA", "AAAA")); Dataset df = spark.createDataFrame(records, ThreeColumnRecord.class).coalesce(1); @@ -801,35 +715,35 @@ public void testHiveCatalogTable() throws IOException { String location = table.location().replaceFirst("file:", ""); new File(location + "/data/trashfile").createNewFile(); - DeleteOrphanFiles.Result result = SparkActions.get().deleteOrphanFiles(table) - .olderThan(System.currentTimeMillis() + 1000).execute(); - Assert.assertTrue("trash file should be removed", + DeleteOrphanFiles.Result result = + SparkActions.get() + .deleteOrphanFiles(table) + .olderThan(System.currentTimeMillis() + 1000) + .execute(); + Assert.assertTrue( + "trash file should be removed", StreamSupport.stream(result.orphanFileLocations().spliterator(), false) .anyMatch(file -> file.contains("file:" + location + "data/trashfile"))); } @Test public void testGarbageCollectionDisabled() { - Table table = TABLES.create(SCHEMA, PartitionSpec.unpartitioned(), Maps.newHashMap(), tableLocation); + Table table = + TABLES.create(SCHEMA, PartitionSpec.unpartitioned(), Maps.newHashMap(), tableLocation); - List records = Lists.newArrayList( - new ThreeColumnRecord(1, "AAAAAAAAAA", "AAAA") - ); + List records = + Lists.newArrayList(new ThreeColumnRecord(1, "AAAAAAAAAA", "AAAA")); Dataset df = spark.createDataFrame(records, ThreeColumnRecord.class).coalesce(1); - df.select("c1", "c2", "c3") - .write() - .format("iceberg") - .mode("append") - .save(tableLocation); + df.select("c1", "c2", "c3").write().format("iceberg").mode("append").save(tableLocation); - table.updateProperties() - .set(TableProperties.GC_ENABLED, "false") - .commit(); + table.updateProperties().set(TableProperties.GC_ENABLED, "false").commit(); - AssertHelpers.assertThrows("Should complain about removing orphan files", - ValidationException.class, "Cannot delete orphan files: GC is disabled", + AssertHelpers.assertThrows( + "Should complain about removing orphan files", + ValidationException.class, + "Cannot delete orphan files: GC is disabled", () -> SparkActions.get().deleteOrphanFiles(table).execute()); } @@ -893,19 +807,21 @@ public void testCompareToFileList() throws IOException, InterruptedException { .withColumnRenamed("lastModified", "last_modified"); DeleteOrphanFiles.Result result1 = - actions.deleteOrphanFiles(table) + actions + .deleteOrphanFiles(table) .compareToFileList(compareToFileList) - .deleteWith(s -> { }) + .deleteWith(s -> {}) .execute(); Assert.assertTrue( "Default olderThan interval should be safe", Iterables.isEmpty(result1.orphanFileLocations())); DeleteOrphanFiles.Result result2 = - actions.deleteOrphanFiles(table) + actions + .deleteOrphanFiles(table) .compareToFileList(compareToFileList) .olderThan(System.currentTimeMillis()) - .deleteWith(s -> { }) + .deleteWith(s -> {}) .execute(); Assert.assertEquals( "Action should find 1 file", invalidFilePaths, result2.orphanFileLocations()); @@ -913,7 +829,8 @@ public void testCompareToFileList() throws IOException, InterruptedException { "Invalid file should be present", 
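The gc.enabled guard exercised in testGarbageCollectionDisabled is worth isolating: once the property is set to false, deleteOrphanFiles refuses to run at all. A sketch of that guard, assuming an existing table (the class and method names are illustrative):

import org.apache.iceberg.Table;
import org.apache.iceberg.TableProperties;
import org.apache.iceberg.exceptions.ValidationException;
import org.apache.iceberg.spark.actions.SparkActions;

class GcDisabledGuard {
  static void demonstrate(Table table) {
    // Turning off garbage collection marks the table as unsafe for physical file deletes.
    table.updateProperties().set(TableProperties.GC_ENABLED, "false").commit();

    try {
      SparkActions.get().deleteOrphanFiles(table).execute();
    } catch (ValidationException e) {
      // Expected: "Cannot delete orphan files: GC is disabled ..."
    }
  }
}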
fs.exists(new Path(invalidFilePaths.get(0)))); DeleteOrphanFiles.Result result3 = - actions.deleteOrphanFiles(table) + actions + .deleteOrphanFiles(table) .compareToFileList(compareToFileList) .olderThan(System.currentTimeMillis()) .execute(); @@ -941,9 +858,10 @@ public void testCompareToFileList() throws IOException, InterruptedException { .withColumnRenamed("lastModified", "last_modified"); DeleteOrphanFiles.Result result4 = - actions.deleteOrphanFiles(table) + actions + .deleteOrphanFiles(table) .compareToFileList(compareToFileListWithOutsideLocation) - .deleteWith(s -> { }) + .deleteWith(s -> {}) .execute(); Assert.assertEquals( "Action should find nothing", Lists.newArrayList(), result4.orphanFileLocations()); @@ -982,24 +900,28 @@ public void testPathsWithActualFileHavingNoAuthority() { public void testPathsWithEqualSchemes() { List validFiles = Lists.newArrayList("scheme1://bucket1/dir1/dir2/file1"); List actualFiles = Lists.newArrayList("scheme2://bucket1/dir1/dir2/file1"); - AssertHelpers.assertThrows("Test remove orphan files with equal schemes", + AssertHelpers.assertThrows( + "Test remove orphan files with equal schemes", ValidationException.class, "Conflicting authorities/schemes: [(scheme1, scheme2)]", - () -> executeTest(validFiles, - actualFiles, - Lists.newArrayList(), - ImmutableMap.of(), + () -> + executeTest( + validFiles, + actualFiles, + Lists.newArrayList(), + ImmutableMap.of(), ImmutableMap.of(), - DeleteOrphanFiles.PrefixMismatchMode.ERROR)); + DeleteOrphanFiles.PrefixMismatchMode.ERROR)); Map equalSchemes = Maps.newHashMap(); equalSchemes.put("scheme1", "scheme"); equalSchemes.put("scheme2", "scheme"); - executeTest(validFiles, + executeTest( + validFiles, actualFiles, Lists.newArrayList(), equalSchemes, - ImmutableMap.of(), + ImmutableMap.of(), DeleteOrphanFiles.PrefixMismatchMode.ERROR); } @@ -1007,23 +929,27 @@ public void testPathsWithEqualSchemes() { public void testPathsWithEqualAuthorities() { List validFiles = Lists.newArrayList("hdfs://servicename1/dir1/dir2/file1"); List actualFiles = Lists.newArrayList("hdfs://servicename2/dir1/dir2/file1"); - AssertHelpers.assertThrows("Test remove orphan files with equal authorities", + AssertHelpers.assertThrows( + "Test remove orphan files with equal authorities", ValidationException.class, "Conflicting authorities/schemes: [(servicename1, servicename2)]", - () -> executeTest(validFiles, - actualFiles, - Lists.newArrayList(), + () -> + executeTest( + validFiles, + actualFiles, + Lists.newArrayList(), ImmutableMap.of(), ImmutableMap.of(), - DeleteOrphanFiles.PrefixMismatchMode.ERROR)); + DeleteOrphanFiles.PrefixMismatchMode.ERROR)); Map equalAuthorities = Maps.newHashMap(); equalAuthorities.put("servicename1", "servicename"); equalAuthorities.put("servicename2", "servicename"); - executeTest(validFiles, + executeTest( + validFiles, actualFiles, Lists.newArrayList(), - ImmutableMap.of(), + ImmutableMap.of(), equalAuthorities, DeleteOrphanFiles.PrefixMismatchMode.ERROR); } @@ -1033,37 +959,39 @@ public void testRemoveOrphanFileActionWithDeleteMode() { List validFiles = Lists.newArrayList("hdfs://servicename1/dir1/dir2/file1"); List actualFiles = Lists.newArrayList("hdfs://servicename2/dir1/dir2/file1"); - executeTest(validFiles, + executeTest( + validFiles, actualFiles, Lists.newArrayList("hdfs://servicename2/dir1/dir2/file1"), - ImmutableMap.of(), - ImmutableMap.of(), + ImmutableMap.of(), + ImmutableMap.of(), DeleteOrphanFiles.PrefixMismatchMode.DELETE); } - private void executeTest(List validFiles, - List 
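compareToFileList, used in the hunks above, replaces the filesystem listing with a caller-provided DataFrame; judging from these tests it needs file_path and last_modified columns. A sketch under that assumption (the timestamp column type and the sample row are guesses for illustration, not confirmed by this patch):

import java.sql.Timestamp;
import java.util.Collections;
import org.apache.iceberg.Table;
import org.apache.iceberg.actions.DeleteOrphanFiles;
import org.apache.iceberg.spark.actions.SparkActions;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.RowFactory;
import org.apache.spark.sql.SparkSession;
import org.apache.spark.sql.types.DataTypes;
import org.apache.spark.sql.types.StructType;

class CompareToFileListSketch {
  // Runs orphan detection against a supplied listing instead of scanning storage, as a dry run.
  static DeleteOrphanFiles.Result run(SparkSession spark, Table table, String candidatePath) {
    StructType schema =
        new StructType()
            .add("file_path", DataTypes.StringType)
            .add("last_modified", DataTypes.TimestampType);
    Dataset<Row> listing =
        spark.createDataFrame(
            Collections.singletonList(
                RowFactory.create(candidatePath, new Timestamp(System.currentTimeMillis()))),
            schema);
    return SparkActions.get()
        .deleteOrphanFiles(table)
        .compareToFileList(listing)
        .deleteWith(s -> {}) // no-op delete: only report what would be orphaned
        .execute();
  }
}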
actualFiles, - List expectedOrphanFiles) { - executeTest(validFiles, actualFiles, expectedOrphanFiles, ImmutableMap.of(), ImmutableMap.of(), + private void executeTest( + List validFiles, List actualFiles, List expectedOrphanFiles) { + executeTest( + validFiles, + actualFiles, + expectedOrphanFiles, + ImmutableMap.of(), + ImmutableMap.of(), DeleteOrphanFiles.PrefixMismatchMode.IGNORE); } - private void executeTest(List validFiles, - List actualFiles, - List expectedOrphanFiles, - Map equalSchemes, - Map equalAuthorities, - DeleteOrphanFiles.PrefixMismatchMode mode) { + private void executeTest( + List validFiles, + List actualFiles, + List expectedOrphanFiles, + Map equalSchemes, + Map equalAuthorities, + DeleteOrphanFiles.PrefixMismatchMode mode) { Dataset validFilesDF = spark.createDataset(validFiles, Encoders.STRING()).toDF(); Dataset actualFilesDF = spark.createDataset(actualFiles, Encoders.STRING()).toDF(); - List orphanFiles = DeleteOrphanFilesSparkAction.findOrphanFiles( - spark, - actualFilesDF, - validFilesDF, - equalSchemes, - equalAuthorities, - mode); + List orphanFiles = + DeleteOrphanFilesSparkAction.findOrphanFiles( + spark, actualFilesDF, validFilesDF, equalSchemes, equalAuthorities, mode); Assert.assertEquals(expectedOrphanFiles, orphanFiles); } } diff --git a/spark/v3.2/spark/src/test/java/org/apache/iceberg/spark/actions/TestRemoveOrphanFilesAction3.java b/spark/v3.2/spark/src/test/java/org/apache/iceberg/spark/actions/TestRemoveOrphanFilesAction3.java index 77eb23a6dffc..e3699eaeded1 100644 --- a/spark/v3.2/spark/src/test/java/org/apache/iceberg/spark/actions/TestRemoveOrphanFilesAction3.java +++ b/spark/v3.2/spark/src/test/java/org/apache/iceberg/spark/actions/TestRemoveOrphanFilesAction3.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.spark.actions; import java.io.File; @@ -54,9 +53,13 @@ public void testSparkCatalogTable() throws Exception { String location = table.table().location().replaceFirst("file:", ""); new File(location + "/data/trashfile").createNewFile(); - DeleteOrphanFiles.Result results = SparkActions.get().deleteOrphanFiles(table.table()) - .olderThan(System.currentTimeMillis() + 1000).execute(); - Assert.assertTrue("trash file should be removed", + DeleteOrphanFiles.Result results = + SparkActions.get() + .deleteOrphanFiles(table.table()) + .olderThan(System.currentTimeMillis() + 1000) + .execute(); + Assert.assertTrue( + "trash file should be removed", StreamSupport.stream(results.orphanFileLocations().spliterator(), false) .anyMatch(file -> file.contains("file:" + location + "/data/trashfile"))); } @@ -80,9 +83,13 @@ public void testSparkCatalogNamedHadoopTable() throws Exception { String location = table.table().location().replaceFirst("file:", ""); new File(location + "/data/trashfile").createNewFile(); - DeleteOrphanFiles.Result results = SparkActions.get().deleteOrphanFiles(table.table()) - .olderThan(System.currentTimeMillis() + 1000).execute(); - Assert.assertTrue("trash file should be removed", + DeleteOrphanFiles.Result results = + SparkActions.get() + .deleteOrphanFiles(table.table()) + .olderThan(System.currentTimeMillis() + 1000) + .execute(); + Assert.assertTrue( + "trash file should be removed", StreamSupport.stream(results.orphanFileLocations().spliterator(), false) .anyMatch(file -> file.contains("file:" + location + "/data/trashfile"))); } @@ -106,19 +113,26 @@ public void testSparkCatalogNamedHiveTable() throws Exception { String location = table.table().location().replaceFirst("file:", ""); new File(location + "/data/trashfile").createNewFile(); - DeleteOrphanFiles.Result results = SparkActions.get().deleteOrphanFiles(table.table()) - .olderThan(System.currentTimeMillis() + 1000).execute(); - Assert.assertTrue("trash file should be removed", + DeleteOrphanFiles.Result results = + SparkActions.get() + .deleteOrphanFiles(table.table()) + .olderThan(System.currentTimeMillis() + 1000) + .execute(); + Assert.assertTrue( + "trash file should be removed", StreamSupport.stream(results.orphanFileLocations().spliterator(), false) .anyMatch(file -> file.contains("file:" + location + "/data/trashfile"))); } @Test public void testSparkSessionCatalogHadoopTable() throws Exception { - spark.conf().set("spark.sql.catalog.spark_catalog", "org.apache.iceberg.spark.SparkSessionCatalog"); + spark + .conf() + .set("spark.sql.catalog.spark_catalog", "org.apache.iceberg.spark.SparkSessionCatalog"); spark.conf().set("spark.sql.catalog.spark_catalog.type", "hadoop"); spark.conf().set("spark.sql.catalog.spark_catalog.warehouse", tableLocation); - SparkSessionCatalog cat = (SparkSessionCatalog) spark.sessionState().catalogManager().v2SessionCatalog(); + SparkSessionCatalog cat = + (SparkSessionCatalog) spark.sessionState().catalogManager().v2SessionCatalog(); String[] database = {"default"}; Identifier id = Identifier.of(database, "table"); @@ -132,18 +146,25 @@ public void testSparkSessionCatalogHadoopTable() throws Exception { String location = table.table().location().replaceFirst("file:", ""); new File(location + "/data/trashfile").createNewFile(); - DeleteOrphanFiles.Result results = SparkActions.get().deleteOrphanFiles(table.table()) - .olderThan(System.currentTimeMillis() + 1000).execute(); - Assert.assertTrue("trash file should be removed", + DeleteOrphanFiles.Result 
results = + SparkActions.get() + .deleteOrphanFiles(table.table()) + .olderThan(System.currentTimeMillis() + 1000) + .execute(); + Assert.assertTrue( + "trash file should be removed", StreamSupport.stream(results.orphanFileLocations().spliterator(), false) .anyMatch(file -> file.contains("file:" + location + "/data/trashfile"))); } @Test public void testSparkSessionCatalogHiveTable() throws Exception { - spark.conf().set("spark.sql.catalog.spark_catalog", "org.apache.iceberg.spark.SparkSessionCatalog"); + spark + .conf() + .set("spark.sql.catalog.spark_catalog", "org.apache.iceberg.spark.SparkSessionCatalog"); spark.conf().set("spark.sql.catalog.spark_catalog.type", "hive"); - SparkSessionCatalog cat = (SparkSessionCatalog) spark.sessionState().catalogManager().v2SessionCatalog(); + SparkSessionCatalog cat = + (SparkSessionCatalog) spark.sessionState().catalogManager().v2SessionCatalog(); String[] database = {"default"}; Identifier id = Identifier.of(database, "sessioncattest"); @@ -158,9 +179,13 @@ public void testSparkSessionCatalogHiveTable() throws Exception { String location = table.table().location().replaceFirst("file:", ""); new File(location + "/data/trashfile").createNewFile(); - DeleteOrphanFiles.Result results = SparkActions.get().deleteOrphanFiles(table.table()) - .olderThan(System.currentTimeMillis() + 1000).execute(); - Assert.assertTrue("trash file should be removed", + DeleteOrphanFiles.Result results = + SparkActions.get() + .deleteOrphanFiles(table.table()) + .olderThan(System.currentTimeMillis() + 1000) + .execute(); + Assert.assertTrue( + "trash file should be removed", StreamSupport.stream(results.orphanFileLocations().spliterator(), false) .anyMatch(file -> file.contains("file:" + location + "/data/trashfile"))); } @@ -171,5 +196,4 @@ public void resetSparkSessionCatalog() throws Exception { spark.conf().unset("spark.sql.catalog.spark_catalog.type"); spark.conf().unset("spark.sql.catalog.spark_catalog.warehouse"); } - } diff --git a/spark/v3.2/spark/src/test/java/org/apache/iceberg/spark/actions/TestRewriteDataFilesAction.java b/spark/v3.2/spark/src/test/java/org/apache/iceberg/spark/actions/TestRewriteDataFilesAction.java index db08619597d5..5484114c7978 100644 --- a/spark/v3.2/spark/src/test/java/org/apache/iceberg/spark/actions/TestRewriteDataFilesAction.java +++ b/spark/v3.2/spark/src/test/java/org/apache/iceberg/spark/actions/TestRewriteDataFilesAction.java @@ -16,9 +16,21 @@ * specific language governing permissions and limitations * under the License. 
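The session-catalog tests above switch spark_catalog over to Iceberg's SparkSessionCatalog through runtime conf and unset it again in resetSparkSessionCatalog; outside of tests the same keys are typically supplied when the session is built. A sketch of that configuration (the master and warehouse values are placeholders):

import org.apache.spark.sql.SparkSession;

class SessionCatalogConfig {
  static SparkSession build(String warehouseLocation) {
    return SparkSession.builder()
        .master("local[2]")
        // Wrap the built-in session catalog so Iceberg and non-Iceberg tables
        // both resolve through spark_catalog.
        .config("spark.sql.catalog.spark_catalog", "org.apache.iceberg.spark.SparkSessionCatalog")
        .config("spark.sql.catalog.spark_catalog.type", "hadoop") // or "hive"
        .config("spark.sql.catalog.spark_catalog.warehouse", warehouseLocation)
        .getOrCreate();
  }
}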
*/ - package org.apache.iceberg.spark.actions; +import static org.apache.iceberg.types.Types.NestedField.optional; +import static org.apache.iceberg.types.Types.NestedField.required; +import static org.apache.spark.sql.functions.current_date; +import static org.apache.spark.sql.functions.date_add; +import static org.apache.spark.sql.functions.expr; +import static org.mockito.ArgumentMatchers.any; +import static org.mockito.ArgumentMatchers.argThat; +import static org.mockito.Mockito.doAnswer; +import static org.mockito.Mockito.doCallRealMethod; +import static org.mockito.Mockito.doReturn; +import static org.mockito.Mockito.doThrow; +import static org.mockito.Mockito.spy; + import java.io.File; import java.io.IOException; import java.io.UncheckedIOException; @@ -96,32 +108,18 @@ import org.mockito.ArgumentMatcher; import org.mockito.Mockito; -import static org.apache.iceberg.types.Types.NestedField.optional; -import static org.apache.iceberg.types.Types.NestedField.required; -import static org.apache.spark.sql.functions.current_date; -import static org.apache.spark.sql.functions.date_add; -import static org.apache.spark.sql.functions.expr; -import static org.mockito.ArgumentMatchers.any; -import static org.mockito.ArgumentMatchers.argThat; -import static org.mockito.Mockito.doAnswer; -import static org.mockito.Mockito.doCallRealMethod; -import static org.mockito.Mockito.doReturn; -import static org.mockito.Mockito.doThrow; -import static org.mockito.Mockito.spy; - public class TestRewriteDataFilesAction extends SparkTestBase { private static final int SCALE = 400000; private static final HadoopTables TABLES = new HadoopTables(new Configuration()); - private static final Schema SCHEMA = new Schema( - optional(1, "c1", Types.IntegerType.get()), - optional(2, "c2", Types.StringType.get()), - optional(3, "c3", Types.StringType.get()) - ); + private static final Schema SCHEMA = + new Schema( + optional(1, "c1", Types.IntegerType.get()), + optional(2, "c2", Types.StringType.get()), + optional(3, "c3", Types.StringType.get())); - @Rule - public TemporaryFolder temp = new TemporaryFolder(); + @Rule public TemporaryFolder temp = new TemporaryFolder(); private final FileRewriteCoordinator coordinator = FileRewriteCoordinator.get(); private final FileScanTaskSetManager manager = FileScanTaskSetManager.get(); @@ -190,10 +188,11 @@ public void testBinPackWithFilter() { shouldHaveFiles(table, 8); List expectedRecords = currentData(); - Result result = basicRewrite(table) - .filter(Expressions.equal("c1", 1)) - .filter(Expressions.startsWith("c2", "foo")) - .execute(); + Result result = + basicRewrite(table) + .filter(Expressions.equal("c1", 1)) + .filter(Expressions.startsWith("c2", "foo")) + .execute(); Assert.assertEquals("Action should rewrite 2 data files", 2, result.rewrittenDataFilesCount()); Assert.assertEquals("Action should add 1 data file", 1, result.addedDataFilesCount()); @@ -217,12 +216,17 @@ public void testBinPackAfterPartitionChange() { RewriteDataFiles.Result result = basicRewrite(table) .option(SortStrategy.MIN_INPUT_FILES, "1") - .option(SortStrategy.MIN_FILE_SIZE_BYTES, Integer.toString(averageFileSize(table) + 1000)) - .option(RewriteDataFiles.TARGET_FILE_SIZE_BYTES, Integer.toString(averageFileSize(table) + 1001)) + .option( + SortStrategy.MIN_FILE_SIZE_BYTES, Integer.toString(averageFileSize(table) + 1000)) + .option( + RewriteDataFiles.TARGET_FILE_SIZE_BYTES, + Integer.toString(averageFileSize(table) + 1001)) .execute(); - Assert.assertEquals("Should have 1 fileGroup because all 
files were not correctly partitioned", - 1, result.rewriteResults().size()); + Assert.assertEquals( + "Should have 1 fileGroup because all files were not correctly partitioned", + 1, + result.rewriteResults().size()); List postRewriteData = currentData(); assertEquals("We shouldn't have changed the data", originalData, postRewriteData); @@ -240,32 +244,33 @@ public void testBinPackWithDeletes() throws Exception { table.refresh(); CloseableIterable tasks = table.newScan().planFiles(); - List dataFiles = Lists.newArrayList(CloseableIterable.transform(tasks, FileScanTask::file)); + List dataFiles = + Lists.newArrayList(CloseableIterable.transform(tasks, FileScanTask::file)); int total = (int) dataFiles.stream().mapToLong(ContentFile::recordCount).sum(); RowDelta rowDelta = table.newRowDelta(); // add 1 delete file for data files 0, 1, 2 for (int i = 0; i < 3; i++) { - writePosDeletesToFile(table, dataFiles.get(i), 1) - .forEach(rowDelta::addDeletes); + writePosDeletesToFile(table, dataFiles.get(i), 1).forEach(rowDelta::addDeletes); } // add 2 delete files for data files 3, 4 for (int i = 3; i < 5; i++) { - writePosDeletesToFile(table, dataFiles.get(i), 2) - .forEach(rowDelta::addDeletes); + writePosDeletesToFile(table, dataFiles.get(i), 2).forEach(rowDelta::addDeletes); } rowDelta.commit(); table.refresh(); List expectedRecords = currentData(); - Result result = actions().rewriteDataFiles(table) - // do not include any file based on bin pack file size configs - .option(BinPackStrategy.MIN_FILE_SIZE_BYTES, "0") - .option(RewriteDataFiles.TARGET_FILE_SIZE_BYTES, Long.toString(Long.MAX_VALUE - 1)) - .option(BinPackStrategy.MAX_FILE_SIZE_BYTES, Long.toString(Long.MAX_VALUE)) - .option(BinPackStrategy.DELETE_FILE_THRESHOLD, "2") - .execute(); + Result result = + actions() + .rewriteDataFiles(table) + // do not include any file based on bin pack file size configs + .option(BinPackStrategy.MIN_FILE_SIZE_BYTES, "0") + .option(RewriteDataFiles.TARGET_FILE_SIZE_BYTES, Long.toString(Long.MAX_VALUE - 1)) + .option(BinPackStrategy.MAX_FILE_SIZE_BYTES, Long.toString(Long.MAX_VALUE)) + .option(BinPackStrategy.DELETE_FILE_THRESHOLD, "2") + .execute(); Assert.assertEquals("Action should rewrite 2 data files", 2, result.rewrittenDataFilesCount()); List actualRecords = currentData(); @@ -282,20 +287,22 @@ public void testBinPackWithDeleteAllData() { table.refresh(); CloseableIterable tasks = table.newScan().planFiles(); - List dataFiles = Lists.newArrayList(CloseableIterable.transform(tasks, FileScanTask::file)); + List dataFiles = + Lists.newArrayList(CloseableIterable.transform(tasks, FileScanTask::file)); int total = (int) dataFiles.stream().mapToLong(ContentFile::recordCount).sum(); RowDelta rowDelta = table.newRowDelta(); // remove all data - writePosDeletesToFile(table, dataFiles.get(0), total) - .forEach(rowDelta::addDeletes); + writePosDeletesToFile(table, dataFiles.get(0), total).forEach(rowDelta::addDeletes); rowDelta.commit(); table.refresh(); List expectedRecords = currentData(); - Result result = actions().rewriteDataFiles(table) - .option(BinPackStrategy.DELETE_FILE_THRESHOLD, "1") - .execute(); + Result result = + actions() + .rewriteDataFiles(table) + .option(BinPackStrategy.DELETE_FILE_THRESHOLD, "1") + .execute(); Assert.assertEquals("Action should rewrite 1 data files", 1, result.rewrittenDataFilesCount()); List actualRecords = currentData(); @@ -304,7 +311,8 @@ public void testBinPackWithDeleteAllData() { "Data manifest should not have existing data file", 0, (long) 
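testBinPackWithDeletes above selects rewrite candidates by delete-file count rather than by size: the size thresholds are pushed out of the way so only DELETE_FILE_THRESHOLD matters. A trimmed sketch of that option combination, assuming a table that already has positional deletes committed and the standard Iceberg package layout for the imports:

import org.apache.iceberg.Table;
import org.apache.iceberg.actions.BinPackStrategy;
import org.apache.iceberg.actions.RewriteDataFiles;
import org.apache.iceberg.spark.actions.SparkActions;

class CompactFilesWithDeletes {
  static RewriteDataFiles.Result run(Table table) {
    return SparkActions.get()
        .rewriteDataFiles(table)
        // Effectively disable size-based selection ...
        .option(BinPackStrategy.MIN_FILE_SIZE_BYTES, "0")
        .option(RewriteDataFiles.TARGET_FILE_SIZE_BYTES, Long.toString(Long.MAX_VALUE - 1))
        .option(BinPackStrategy.MAX_FILE_SIZE_BYTES, Long.toString(Long.MAX_VALUE))
        // ... so only files carrying at least two delete files are rewritten.
        .option(BinPackStrategy.DELETE_FILE_THRESHOLD, "2")
        .execute();
  }
}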
table.currentSnapshot().dataManifests(table.io()).get(0).existingFilesCount()); - Assert.assertEquals("Data manifest should have 1 delete data file", + Assert.assertEquals( + "Data manifest should have 1 delete data file", 1L, (long) table.currentSnapshot().dataManifests(table.io()).get(0).deletedFilesCount()); Assert.assertEquals( @@ -322,9 +330,8 @@ public void testBinPackWithStartingSequenceNumber() { table.refresh(); long oldSequenceNumber = table.currentSnapshot().sequenceNumber(); - Result result = basicRewrite(table) - .option(RewriteDataFiles.USE_STARTING_SEQUENCE_NUMBER, "true") - .execute(); + Result result = + basicRewrite(table).option(RewriteDataFiles.USE_STARTING_SEQUENCE_NUMBER, "true").execute(); Assert.assertEquals("Action should rewrite 8 data files", 8, result.rewrittenDataFilesCount()); Assert.assertEquals("Action should add 4 data file", 4, result.addedDataFilesCount()); @@ -333,13 +340,15 @@ public void testBinPackWithStartingSequenceNumber() { assertEquals("Rows must match", expectedRecords, actualRecords); table.refresh(); - Assert.assertTrue("Table sequence number should be incremented", + Assert.assertTrue( + "Table sequence number should be incremented", oldSequenceNumber < table.currentSnapshot().sequenceNumber()); Dataset rows = SparkTableUtil.loadMetadataTable(spark, table, MetadataTableType.ENTRIES); for (Row row : rows.collectAsList()) { if (row.getInt(0) == 1) { - Assert.assertEquals("Expect old sequence number for added entries", oldSequenceNumber, row.getLong(2)); + Assert.assertEquals( + "Expect old sequence number for added entries", oldSequenceNumber, row.getLong(2)); } } } @@ -353,9 +362,8 @@ public void testBinPackWithStartingSequenceNumberV1Compatibility() { long oldSequenceNumber = table.currentSnapshot().sequenceNumber(); Assert.assertEquals("Table sequence number should be 0", 0, oldSequenceNumber); - Result result = basicRewrite(table) - .option(RewriteDataFiles.USE_STARTING_SEQUENCE_NUMBER, "true") - .execute(); + Result result = + basicRewrite(table).option(RewriteDataFiles.USE_STARTING_SEQUENCE_NUMBER, "true").execute(); Assert.assertEquals("Action should rewrite 8 data files", 8, result.rewrittenDataFilesCount()); Assert.assertEquals("Action should add 4 data file", 4, result.addedDataFilesCount()); @@ -364,13 +372,15 @@ public void testBinPackWithStartingSequenceNumberV1Compatibility() { assertEquals("Rows must match", expectedRecords, actualRecords); table.refresh(); - Assert.assertEquals("Table sequence number should still be 0", - oldSequenceNumber, table.currentSnapshot().sequenceNumber()); + Assert.assertEquals( + "Table sequence number should still be 0", + oldSequenceNumber, + table.currentSnapshot().sequenceNumber()); Dataset rows = SparkTableUtil.loadMetadataTable(spark, table, MetadataTableType.ENTRIES); for (Row row : rows.collectAsList()) { - Assert.assertEquals("Expect sequence number 0 for all entries", - oldSequenceNumber, row.getLong(2)); + Assert.assertEquals( + "Expect sequence number 0 for all entries", oldSequenceNumber, row.getLong(2)); } } @@ -393,19 +403,15 @@ public void testRewriteLargeTableHasResiduals() { table.refresh(); - CloseableIterable tasks = table.newScan() - .ignoreResiduals() - .filter(Expressions.equal("c3", "0")) - .planFiles(); + CloseableIterable tasks = + table.newScan().ignoreResiduals().filter(Expressions.equal("c3", "0")).planFiles(); for (FileScanTask task : tasks) { Assert.assertEquals("Residuals must be ignored", Expressions.alwaysTrue(), task.residual()); } shouldHaveFiles(table, 2); - Result 
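The starting-sequence-number tests above verify that, with the option enabled, entries added by the rewrite keep the sequence number that was current when compaction began, checked through the entries metadata table. A sketch of the option plus that check (the printout stands in for the test's assertions; class and method names are illustrative):

import org.apache.iceberg.MetadataTableType;
import org.apache.iceberg.Table;
import org.apache.iceberg.actions.RewriteDataFiles;
import org.apache.iceberg.spark.SparkTableUtil;
import org.apache.iceberg.spark.actions.SparkActions;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SparkSession;

class CompactionSequenceNumbers {
  static void run(SparkSession spark, Table table) {
    long startingSequenceNumber = table.currentSnapshot().sequenceNumber();

    SparkActions.get()
        .rewriteDataFiles(table)
        .option(RewriteDataFiles.USE_STARTING_SEQUENCE_NUMBER, "true")
        .execute();

    Dataset<Row> entries = SparkTableUtil.loadMetadataTable(spark, table, MetadataTableType.ENTRIES);
    for (Row row : entries.collectAsList()) {
      if (row.getInt(0) == 1) { // status 1 marks ADDED entries, as read in the test above
        System.out.println(
            "added entry sequence number " + row.getLong(2) + " vs " + startingSequenceNumber);
      }
    }
  }
}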
result = basicRewrite(table) - .filter(Expressions.equal("c3", "0")) - .execute(); + Result result = basicRewrite(table).filter(Expressions.equal("c3", "0")).execute(); Assert.assertEquals("Action should rewrite 2 data files", 2, result.rewrittenDataFilesCount()); Assert.assertEquals("Action should add 1 data file", 1, result.addedDataFilesCount()); @@ -422,10 +428,11 @@ public void testBinPackSplitLargeFile() { List expectedRecords = currentData(); long targetSize = testDataSize(table) / 2; - Result result = basicRewrite(table) - .option(RewriteDataFiles.TARGET_FILE_SIZE_BYTES, Long.toString(targetSize)) - .option(BinPackStrategy.MAX_FILE_SIZE_BYTES, Long.toString(targetSize * 2 - 2000)) - .execute(); + Result result = + basicRewrite(table) + .option(RewriteDataFiles.TARGET_FILE_SIZE_BYTES, Long.toString(targetSize)) + .option(BinPackStrategy.MAX_FILE_SIZE_BYTES, Long.toString(targetSize * 2 - 2000)) + .execute(); Assert.assertEquals("Action should delete 1 data files", 1, result.rewrittenDataFilesCount()); Assert.assertEquals("Action should add 2 data files", 2, result.addedDataFilesCount()); @@ -450,14 +457,16 @@ public void testBinPackCombineMixedFiles() { int targetSize = averageFileSize(table); - Result result = basicRewrite(table) - .option(RewriteDataFiles.TARGET_FILE_SIZE_BYTES, Integer.toString(targetSize + 1000)) - .option(BinPackStrategy.MAX_FILE_SIZE_BYTES, Integer.toString(targetSize + 80000)) - .option(BinPackStrategy.MIN_FILE_SIZE_BYTES, Integer.toString(targetSize - 1000)) - .execute(); + Result result = + basicRewrite(table) + .option(RewriteDataFiles.TARGET_FILE_SIZE_BYTES, Integer.toString(targetSize + 1000)) + .option(BinPackStrategy.MAX_FILE_SIZE_BYTES, Integer.toString(targetSize + 80000)) + .option(BinPackStrategy.MIN_FILE_SIZE_BYTES, Integer.toString(targetSize - 1000)) + .execute(); Assert.assertEquals("Action should delete 3 data files", 3, result.rewrittenDataFilesCount()); - // Should Split the big files into 3 pieces, one of which should be combined with the two smaller files + // Should Split the big files into 3 pieces, one of which should be combined with the two + // smaller files Assert.assertEquals("Action should add 3 data files", 3, result.addedDataFilesCount()); shouldHaveFiles(table, 3); @@ -475,11 +484,14 @@ public void testBinPackCombineMediumFiles() { int targetSize = ((int) testDataSize(table) / 3); // The test is to see if we can combine parts of files to make files of the correct size - Result result = basicRewrite(table) - .option(RewriteDataFiles.TARGET_FILE_SIZE_BYTES, Integer.toString(targetSize)) - .option(BinPackStrategy.MAX_FILE_SIZE_BYTES, Integer.toString((int) (targetSize * 1.8))) - .option(BinPackStrategy.MIN_FILE_SIZE_BYTES, Integer.toString(targetSize - 100)) // All files too small - .execute(); + Result result = + basicRewrite(table) + .option(RewriteDataFiles.TARGET_FILE_SIZE_BYTES, Integer.toString(targetSize)) + .option(BinPackStrategy.MAX_FILE_SIZE_BYTES, Integer.toString((int) (targetSize * 1.8))) + .option( + BinPackStrategy.MIN_FILE_SIZE_BYTES, + Integer.toString(targetSize - 100)) // All files too small + .execute(); Assert.assertEquals("Action should delete 4 data files", 4, result.rewrittenDataFilesCount()); Assert.assertEquals("Action should add 3 data files", 3, result.addedDataFilesCount()); @@ -501,7 +513,8 @@ public void testPartialProgressEnabled() { RewriteDataFiles.Result result = basicRewrite(table) .option(RewriteDataFiles.PARTIAL_PROGRESS_ENABLED, "true") - .option(RewriteDataFiles.MAX_FILE_GROUP_SIZE_BYTES, 
Integer.toString(fileSize * 2 + 1000)) + .option( + RewriteDataFiles.MAX_FILE_GROUP_SIZE_BYTES, Integer.toString(fileSize * 2 + 1000)) .option(RewriteDataFiles.PARTIAL_PROGRESS_MAX_COMMITS, "10") .execute(); @@ -526,7 +539,8 @@ public void testMultipleGroups() { // Perform a rewrite but only allow 2 files to be compacted at a time RewriteDataFiles.Result result = basicRewrite(table) - .option(RewriteDataFiles.MAX_FILE_GROUP_SIZE_BYTES, Integer.toString(fileSize * 2 + 1000)) + .option( + RewriteDataFiles.MAX_FILE_GROUP_SIZE_BYTES, Integer.toString(fileSize * 2 + 1000)) .option(BinPackStrategy.MIN_INPUT_FILES, "1") .execute(); @@ -551,7 +565,8 @@ public void testPartialProgressMaxCommits() { // Perform a rewrite but only allow 2 files to be compacted at a time RewriteDataFiles.Result result = basicRewrite(table) - .option(RewriteDataFiles.MAX_FILE_GROUP_SIZE_BYTES, Integer.toString(fileSize * 2 + 1000)) + .option( + RewriteDataFiles.MAX_FILE_GROUP_SIZE_BYTES, Integer.toString(fileSize * 2 + 1000)) .option(RewriteDataFiles.PARTIAL_PROGRESS_ENABLED, "true") .option(RewriteDataFiles.PARTIAL_PROGRESS_MAX_COMMITS, "3") .execute(); @@ -577,7 +592,9 @@ public void testSingleCommitWithRewriteFailure() { BaseRewriteDataFilesSparkAction realRewrite = (org.apache.iceberg.spark.actions.BaseRewriteDataFilesSparkAction) basicRewrite(table) - .option(RewriteDataFiles.MAX_FILE_GROUP_SIZE_BYTES, Integer.toString(fileSize * 2 + 1000)); + .option( + RewriteDataFiles.MAX_FILE_GROUP_SIZE_BYTES, + Integer.toString(fileSize * 2 + 1000)); BaseRewriteDataFilesSparkAction spyRewrite = Mockito.spy(realRewrite); @@ -587,7 +604,9 @@ public void testSingleCommitWithRewriteFailure() { .when(spyRewrite) .rewriteFiles(any(), argThat(failGroup)); - AssertHelpers.assertThrows("Should fail entire rewrite if part fails", RuntimeException.class, + AssertHelpers.assertThrows( + "Should fail entire rewrite if part fails", + RuntimeException.class, () -> spyRewrite.execute()); table.refresh(); @@ -610,21 +629,21 @@ public void testSingleCommitWithCommitFailure() { BaseRewriteDataFilesSparkAction realRewrite = (org.apache.iceberg.spark.actions.BaseRewriteDataFilesSparkAction) basicRewrite(table) - .option(RewriteDataFiles.MAX_FILE_GROUP_SIZE_BYTES, Integer.toString(fileSize * 2 + 1000)); + .option( + RewriteDataFiles.MAX_FILE_GROUP_SIZE_BYTES, + Integer.toString(fileSize * 2 + 1000)); BaseRewriteDataFilesSparkAction spyRewrite = spy(realRewrite); RewriteDataFilesCommitManager util = spy(new RewriteDataFilesCommitManager(table)); // Fail to commit - doThrow(new RuntimeException("Commit Failure")) - .when(util) - .commitFileGroups(any()); + doThrow(new RuntimeException("Commit Failure")).when(util).commitFileGroups(any()); - doReturn(util) - .when(spyRewrite) - .commitManager(table.currentSnapshot().snapshotId()); + doReturn(util).when(spyRewrite).commitManager(table.currentSnapshot().snapshotId()); - AssertHelpers.assertThrows("Should fail entire rewrite if commit fails", RuntimeException.class, + AssertHelpers.assertThrows( + "Should fail entire rewrite if commit fails", + RuntimeException.class, () -> spyRewrite.execute()); table.refresh(); @@ -647,7 +666,9 @@ public void testParallelSingleCommitWithRewriteFailure() { BaseRewriteDataFilesSparkAction realRewrite = (org.apache.iceberg.spark.actions.BaseRewriteDataFilesSparkAction) basicRewrite(table) - .option(RewriteDataFiles.MAX_FILE_GROUP_SIZE_BYTES, Integer.toString(fileSize * 2 + 1000)) + .option( + RewriteDataFiles.MAX_FILE_GROUP_SIZE_BYTES, + Integer.toString(fileSize * 2 + 
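Several of the multi-group and failure-injection tests above lean on the same pair of knobs: capping MAX_FILE_GROUP_SIZE_BYTES so the rewrite splits into many independent file groups, and enabling partial progress so groups that succeed are committed even if others fail. A sketch of that configuration (the concrete values are illustrative):

import org.apache.iceberg.Table;
import org.apache.iceberg.actions.RewriteDataFiles;
import org.apache.iceberg.spark.actions.SparkActions;

class PartialProgressCompaction {
  static RewriteDataFiles.Result run(Table table, long maxGroupSizeBytes) {
    return SparkActions.get()
        .rewriteDataFiles(table)
        // A small group size produces many independent file groups.
        .option(RewriteDataFiles.MAX_FILE_GROUP_SIZE_BYTES, Long.toString(maxGroupSizeBytes))
        // Rewrite up to three groups at a time and commit them in batches as they finish.
        .option(RewriteDataFiles.MAX_CONCURRENT_FILE_GROUP_REWRITES, "3")
        .option(RewriteDataFiles.PARTIAL_PROGRESS_ENABLED, "true")
        .option(RewriteDataFiles.PARTIAL_PROGRESS_MAX_COMMITS, "3")
        .execute();
  }
}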
1000)) .option(RewriteDataFiles.MAX_CONCURRENT_FILE_GROUP_REWRITES, "3"); BaseRewriteDataFilesSparkAction spyRewrite = Mockito.spy(realRewrite); @@ -658,7 +679,9 @@ public void testParallelSingleCommitWithRewriteFailure() { .when(spyRewrite) .rewriteFiles(any(), argThat(failGroup)); - AssertHelpers.assertThrows("Should fail entire rewrite if part fails", RuntimeException.class, + AssertHelpers.assertThrows( + "Should fail entire rewrite if part fails", + RuntimeException.class, () -> spyRewrite.execute()); table.refresh(); @@ -681,7 +704,9 @@ public void testPartialProgressWithRewriteFailure() { BaseRewriteDataFilesSparkAction realRewrite = (org.apache.iceberg.spark.actions.BaseRewriteDataFilesSparkAction) basicRewrite(table) - .option(RewriteDataFiles.MAX_FILE_GROUP_SIZE_BYTES, Integer.toString(fileSize * 2 + 1000)) + .option( + RewriteDataFiles.MAX_FILE_GROUP_SIZE_BYTES, + Integer.toString(fileSize * 2 + 1000)) .option(RewriteDataFiles.PARTIAL_PROGRESS_ENABLED, "true") .option(RewriteDataFiles.PARTIAL_PROGRESS_MAX_COMMITS, "3"); @@ -719,7 +744,9 @@ public void testParallelPartialProgressWithRewriteFailure() { BaseRewriteDataFilesSparkAction realRewrite = (org.apache.iceberg.spark.actions.BaseRewriteDataFilesSparkAction) basicRewrite(table) - .option(RewriteDataFiles.MAX_FILE_GROUP_SIZE_BYTES, Integer.toString(fileSize * 2 + 1000)) + .option( + RewriteDataFiles.MAX_FILE_GROUP_SIZE_BYTES, + Integer.toString(fileSize * 2 + 1000)) .option(RewriteDataFiles.MAX_CONCURRENT_FILE_GROUP_REWRITES, "3") .option(RewriteDataFiles.PARTIAL_PROGRESS_ENABLED, "true") .option(RewriteDataFiles.PARTIAL_PROGRESS_MAX_COMMITS, "3"); @@ -758,7 +785,9 @@ public void testParallelPartialProgressWithCommitFailure() { BaseRewriteDataFilesSparkAction realRewrite = (org.apache.iceberg.spark.actions.BaseRewriteDataFilesSparkAction) basicRewrite(table) - .option(RewriteDataFiles.MAX_FILE_GROUP_SIZE_BYTES, Integer.toString(fileSize * 2 + 1000)) + .option( + RewriteDataFiles.MAX_FILE_GROUP_SIZE_BYTES, + Integer.toString(fileSize * 2 + 1000)) .option(RewriteDataFiles.MAX_CONCURRENT_FILE_GROUP_REWRITES, "3") .option(RewriteDataFiles.PARTIAL_PROGRESS_ENABLED, "true") .option(RewriteDataFiles.PARTIAL_PROGRESS_MAX_COMMITS, "3"); @@ -773,9 +802,7 @@ public void testParallelPartialProgressWithCommitFailure() { .when(util) .commitFileGroups(any()); - doReturn(util) - .when(spyRewrite) - .commitManager(table.currentSnapshot().snapshotId()); + doReturn(util).when(spyRewrite).commitManager(table.currentSnapshot().snapshotId()); RewriteDataFiles.Result result = spyRewrite.execute(); @@ -797,30 +824,32 @@ public void testParallelPartialProgressWithCommitFailure() { public void testInvalidOptions() { Table table = createTable(20); - AssertHelpers.assertThrows("No negative values for partial progress max commits", + AssertHelpers.assertThrows( + "No negative values for partial progress max commits", IllegalArgumentException.class, - () -> basicRewrite(table) - .option(RewriteDataFiles.PARTIAL_PROGRESS_ENABLED, "true") - .option(RewriteDataFiles.PARTIAL_PROGRESS_MAX_COMMITS, "-5") - .execute()); + () -> + basicRewrite(table) + .option(RewriteDataFiles.PARTIAL_PROGRESS_ENABLED, "true") + .option(RewriteDataFiles.PARTIAL_PROGRESS_MAX_COMMITS, "-5") + .execute()); - AssertHelpers.assertThrows("No negative values for max concurrent groups", + AssertHelpers.assertThrows( + "No negative values for max concurrent groups", IllegalArgumentException.class, - () -> basicRewrite(table) - .option(RewriteDataFiles.MAX_CONCURRENT_FILE_GROUP_REWRITES, 
"-5") - .execute()); + () -> + basicRewrite(table) + .option(RewriteDataFiles.MAX_CONCURRENT_FILE_GROUP_REWRITES, "-5") + .execute()); - AssertHelpers.assertThrows("No unknown options allowed", + AssertHelpers.assertThrows( + "No unknown options allowed", IllegalArgumentException.class, - () -> basicRewrite(table) - .option("foobarity", "-5") - .execute()); + () -> basicRewrite(table).option("foobarity", "-5").execute()); - AssertHelpers.assertThrows("Cannot set rewrite-job-order to foo", + AssertHelpers.assertThrows( + "Cannot set rewrite-job-order to foo", IllegalArgumentException.class, - () -> basicRewrite(table) - .option(RewriteDataFiles.REWRITE_JOB_ORDER, "foo") - .execute()); + () -> basicRewrite(table).option(RewriteDataFiles.REWRITE_JOB_ORDER, "foo").execute()); } @Test @@ -838,7 +867,8 @@ public void testSortMultipleGroups() { basicRewrite(table) .sort() .option(SortStrategy.REWRITE_ALL, "true") - .option(RewriteDataFiles.MAX_FILE_GROUP_SIZE_BYTES, Integer.toString(fileSize * 2 + 1000)) + .option( + RewriteDataFiles.MAX_FILE_GROUP_SIZE_BYTES, Integer.toString(fileSize * 2 + 1000)) .execute(); Assert.assertEquals("Should have 10 fileGroups", result.rewriteResults().size(), 10); @@ -866,7 +896,8 @@ public void testSimpleSort() { .sort() .option(SortStrategy.MIN_INPUT_FILES, "1") .option(SortStrategy.REWRITE_ALL, "true") - .option(RewriteDataFiles.TARGET_FILE_SIZE_BYTES, Integer.toString(averageFileSize(table))) + .option( + RewriteDataFiles.TARGET_FILE_SIZE_BYTES, Integer.toString(averageFileSize(table))) .execute(); Assert.assertEquals("Should have 1 fileGroups", result.rewriteResults().size(), 1); @@ -897,11 +928,14 @@ public void testSortAfterPartitionChange() { .sort() .option(SortStrategy.MIN_INPUT_FILES, "1") .option(SortStrategy.REWRITE_ALL, "true") - .option(RewriteDataFiles.TARGET_FILE_SIZE_BYTES, Integer.toString(averageFileSize(table))) + .option( + RewriteDataFiles.TARGET_FILE_SIZE_BYTES, Integer.toString(averageFileSize(table))) .execute(); - Assert.assertEquals("Should have 1 fileGroup because all files were not correctly partitioned", - result.rewriteResults().size(), 1); + Assert.assertEquals( + "Should have 1 fileGroup because all files were not correctly partitioned", + result.rewriteResults().size(), + 1); table.refresh(); @@ -926,7 +960,8 @@ public void testSortCustomSortOrder() { basicRewrite(table) .sort(SortOrder.builderFor(table.schema()).asc("c2").build()) .option(SortStrategy.REWRITE_ALL, "true") - .option(RewriteDataFiles.TARGET_FILE_SIZE_BYTES, Integer.toString(averageFileSize(table))) + .option( + RewriteDataFiles.TARGET_FILE_SIZE_BYTES, Integer.toString(averageFileSize(table))) .execute(); Assert.assertEquals("Should have 1 fileGroups", result.rewriteResults().size(), 1); @@ -961,7 +996,9 @@ public void testSortCustomSortOrderRequiresRepartition() { basicRewrite(table) .sort(SortOrder.builderFor(table.schema()).asc("c3").build()) .option(SortStrategy.REWRITE_ALL, "true") - .option(RewriteDataFiles.TARGET_FILE_SIZE_BYTES, Integer.toString(averageFileSize(table) / partitions)) + .option( + RewriteDataFiles.TARGET_FILE_SIZE_BYTES, + Integer.toString(averageFileSize(table) / partitions)) .execute(); Assert.assertEquals("Should have 1 fileGroups", result.rewriteResults().size(), 1); @@ -989,14 +1026,19 @@ public void testAutoSortShuffleOutput() { RewriteDataFiles.Result result = basicRewrite(table) .sort(SortOrder.builderFor(table.schema()).asc("c2").build()) - .option(SortStrategy.MAX_FILE_SIZE_BYTES, Integer.toString((averageFileSize(table) / 2) + 2)) + 
.option( + SortStrategy.MAX_FILE_SIZE_BYTES, + Integer.toString((averageFileSize(table) / 2) + 2)) // Divide files in 2 - .option(RewriteDataFiles.TARGET_FILE_SIZE_BYTES, Integer.toString(averageFileSize(table) / 2)) + .option( + RewriteDataFiles.TARGET_FILE_SIZE_BYTES, + Integer.toString(averageFileSize(table) / 2)) .option(SortStrategy.MIN_INPUT_FILES, "1") .execute(); Assert.assertEquals("Should have 1 fileGroups", result.rewriteResults().size(), 1); - Assert.assertTrue("Should have written 40+ files", + Assert.assertTrue( + "Should have written 40+ files", Iterables.size(table.currentSnapshot().addedDataFiles(table.io())) >= 40); table.refresh(); @@ -1021,17 +1063,20 @@ public void testCommitStateUnknownException() { BaseRewriteDataFilesSparkAction spyAction = spy(action); RewriteDataFilesCommitManager util = spy(new RewriteDataFilesCommitManager(table)); - doAnswer(invocationOnMock -> { - invocationOnMock.callRealMethod(); - throw new CommitStateUnknownException(new RuntimeException("Unknown State")); - }).when(util).commitFileGroups(any()); + doAnswer( + invocationOnMock -> { + invocationOnMock.callRealMethod(); + throw new CommitStateUnknownException(new RuntimeException("Unknown State")); + }) + .when(util) + .commitFileGroups(any()); - doReturn(util) - .when(spyAction) - .commitManager(table.currentSnapshot().snapshotId()); + doReturn(util).when(spyAction).commitManager(table.currentSnapshot().snapshotId()); - AssertHelpers.assertThrows("Should propagate CommitStateUnknown Exception", - CommitStateUnknownException.class, () -> spyAction.execute()); + AssertHelpers.assertThrows( + "Should propagate CommitStateUnknown Exception", + CommitStateUnknownException.class, + () -> spyAction.execute()); List postRewriteData = currentData(); assertEquals("We shouldn't have changed the data", originalData, postRewriteData); @@ -1049,7 +1094,8 @@ public void testZOrderSort() { List originalData = currentData(); double originalFilesC2 = percentFilesRequired(table, "c2", "foo23"); double originalFilesC3 = percentFilesRequired(table, "c3", "bar21"); - double originalFilesC2C3 = percentFilesRequired(table, new String[]{"c2", "c3"}, new String[]{"foo23", "bar23"}); + double originalFilesC2C3 = + percentFilesRequired(table, new String[] {"c2", "c3"}, new String[] {"foo23", "bar23"}); Assert.assertTrue("Should require all files to scan c2", originalFilesC2 > 0.99); Assert.assertTrue("Should require all files to scan c3", originalFilesC3 > 0.99); @@ -1057,9 +1103,13 @@ public void testZOrderSort() { RewriteDataFiles.Result result = basicRewrite(table) .zOrder("c2", "c3") - .option(SortStrategy.MAX_FILE_SIZE_BYTES, Integer.toString((averageFileSize(table) / 2) + 2)) + .option( + SortStrategy.MAX_FILE_SIZE_BYTES, + Integer.toString((averageFileSize(table) / 2) + 2)) // Divide files in 2 - .option(RewriteDataFiles.TARGET_FILE_SIZE_BYTES, Integer.toString(averageFileSize(table) / 2)) + .option( + RewriteDataFiles.TARGET_FILE_SIZE_BYTES, + Integer.toString(averageFileSize(table) / 2)) .option(SortStrategy.MIN_INPUT_FILES, "1") .execute(); @@ -1077,13 +1127,17 @@ public void testZOrderSort() { double filesScannedC2 = percentFilesRequired(table, "c2", "foo23"); double filesScannedC3 = percentFilesRequired(table, "c3", "bar21"); - double filesScannedC2C3 = percentFilesRequired(table, new String[]{"c2", "c3"}, new String[]{"foo23", "bar23"}); + double filesScannedC2C3 = + percentFilesRequired(table, new String[] {"c2", "c3"}, new String[] {"foo23", "bar23"}); - Assert.assertTrue("Should have reduced the number 
of files required for c2", + Assert.assertTrue( + "Should have reduced the number of files required for c2", filesScannedC2 < originalFilesC2); - Assert.assertTrue("Should have reduced the number of files required for c3", + Assert.assertTrue( + "Should have reduced the number of files required for c3", filesScannedC3 < originalFilesC3); - Assert.assertTrue("Should have reduced the number of files required for a c2,c3 predicate", + Assert.assertTrue( + "Should have reduced the number of files required for a c2,c3 predicate", filesScannedC2C3 < originalFilesC2C3); } @@ -1092,13 +1146,22 @@ public void testZOrderAllTypesSort() { Table table = createTypeTestTable(); shouldHaveFiles(table, 10); - List originalRaw = spark.read().format("iceberg").load(tableLocation).sort("longCol").collectAsList(); + List originalRaw = + spark.read().format("iceberg").load(tableLocation).sort("longCol").collectAsList(); List originalData = rowsToJava(originalRaw); // TODO add in UUID when it is supported in Spark RewriteDataFiles.Result result = basicRewrite(table) - .zOrder("longCol", "intCol", "floatCol", "doubleCol", "dateCol", "timestampCol", "stringCol", "binaryCol", + .zOrder( + "longCol", + "intCol", + "floatCol", + "doubleCol", + "dateCol", + "timestampCol", + "stringCol", + "binaryCol", "booleanCol") .option(SortStrategy.MIN_INPUT_FILES, "1") .option(SortStrategy.REWRITE_ALL, "true") @@ -1110,7 +1173,8 @@ public void testZOrderAllTypesSort() { table.refresh(); - List postRaw = spark.read().format("iceberg").load(tableLocation).sort("longCol").collectAsList(); + List postRaw = + spark.read().format("iceberg").load(tableLocation).sort("longCol").collectAsList(); List postRewriteData = rowsToJava(postRaw); assertEquals("We shouldn't have changed the data", originalData, postRewriteData); @@ -1122,18 +1186,23 @@ public void testZOrderAllTypesSort() { public void testInvalidAPIUsage() { Table table = createTable(1); - AssertHelpers.assertThrows("Should be unable to set Strategy more than once", IllegalArgumentException.class, - "Cannot set strategy", () -> actions().rewriteDataFiles(table).binPack().sort()); + AssertHelpers.assertThrows( + "Should be unable to set Strategy more than once", + IllegalArgumentException.class, + "Cannot set strategy", + () -> actions().rewriteDataFiles(table).binPack().sort()); - AssertHelpers.assertThrows("Should be unable to set Strategy more than once", IllegalArgumentException.class, - "Cannot set strategy", () -> actions().rewriteDataFiles(table).sort().binPack()); + AssertHelpers.assertThrows( + "Should be unable to set Strategy more than once", + IllegalArgumentException.class, + "Cannot set strategy", + () -> actions().rewriteDataFiles(table).sort().binPack()); AssertHelpers.assertThrows( "Should be unable to set Strategy more than once", IllegalArgumentException.class, "Cannot set strategy", - () -> - actions().rewriteDataFiles(table).sort(SortOrder.unsorted()).binPack()); + () -> actions().rewriteDataFiles(table).sort(SortOrder.unsorted()).binPack()); } @Test @@ -1146,21 +1215,23 @@ public void testRewriteJobOrderBytesAsc() { table.updateProperties().set(TableProperties.FORMAT_VERSION, "2").commit(); BaseRewriteDataFilesSparkAction basicRewrite = - (BaseRewriteDataFilesSparkAction) basicRewrite(table) - .binPack(); - List expected = toGroupStream(table, basicRewrite) - .mapToLong(RewriteFileGroup::sizeInBytes) - .boxed() - .collect(Collectors.toList()); + (BaseRewriteDataFilesSparkAction) basicRewrite(table).binPack(); + List expected = + toGroupStream(table, 
basicRewrite) + .mapToLong(RewriteFileGroup::sizeInBytes) + .boxed() + .collect(Collectors.toList()); BaseRewriteDataFilesSparkAction jobOrderRewrite = - (BaseRewriteDataFilesSparkAction) basicRewrite(table) - .option(RewriteDataFiles.REWRITE_JOB_ORDER, RewriteJobOrder.BYTES_ASC.orderName()) - .binPack(); - List actual = toGroupStream(table, jobOrderRewrite) - .mapToLong(RewriteFileGroup::sizeInBytes) - .boxed() - .collect(Collectors.toList()); + (BaseRewriteDataFilesSparkAction) + basicRewrite(table) + .option(RewriteDataFiles.REWRITE_JOB_ORDER, RewriteJobOrder.BYTES_ASC.orderName()) + .binPack(); + List actual = + toGroupStream(table, jobOrderRewrite) + .mapToLong(RewriteFileGroup::sizeInBytes) + .boxed() + .collect(Collectors.toList()); expected.sort(Comparator.naturalOrder()); Assert.assertEquals("Size in bytes order should be ascending", actual, expected); @@ -1178,21 +1249,23 @@ public void testRewriteJobOrderBytesDesc() { table.updateProperties().set(TableProperties.FORMAT_VERSION, "2").commit(); BaseRewriteDataFilesSparkAction basicRewrite = - (BaseRewriteDataFilesSparkAction) basicRewrite(table) - .binPack(); - List expected = toGroupStream(table, basicRewrite) - .mapToLong(RewriteFileGroup::sizeInBytes) - .boxed() - .collect(Collectors.toList()); + (BaseRewriteDataFilesSparkAction) basicRewrite(table).binPack(); + List expected = + toGroupStream(table, basicRewrite) + .mapToLong(RewriteFileGroup::sizeInBytes) + .boxed() + .collect(Collectors.toList()); BaseRewriteDataFilesSparkAction jobOrderRewrite = - (BaseRewriteDataFilesSparkAction) basicRewrite(table) - .option(RewriteDataFiles.REWRITE_JOB_ORDER, RewriteJobOrder.BYTES_DESC.orderName()) - .binPack(); - List actual = toGroupStream(table, jobOrderRewrite) - .mapToLong(RewriteFileGroup::sizeInBytes) - .boxed() - .collect(Collectors.toList()); + (BaseRewriteDataFilesSparkAction) + basicRewrite(table) + .option(RewriteDataFiles.REWRITE_JOB_ORDER, RewriteJobOrder.BYTES_DESC.orderName()) + .binPack(); + List actual = + toGroupStream(table, jobOrderRewrite) + .mapToLong(RewriteFileGroup::sizeInBytes) + .boxed() + .collect(Collectors.toList()); expected.sort(Comparator.reverseOrder()); Assert.assertEquals("Size in bytes order should be descending", actual, expected); @@ -1210,21 +1283,23 @@ public void testRewriteJobOrderFilesAsc() { table.updateProperties().set(TableProperties.FORMAT_VERSION, "2").commit(); BaseRewriteDataFilesSparkAction basicRewrite = - (BaseRewriteDataFilesSparkAction) basicRewrite(table) - .binPack(); - List expected = toGroupStream(table, basicRewrite) - .mapToLong(RewriteFileGroup::numFiles) - .boxed() - .collect(Collectors.toList()); + (BaseRewriteDataFilesSparkAction) basicRewrite(table).binPack(); + List expected = + toGroupStream(table, basicRewrite) + .mapToLong(RewriteFileGroup::numFiles) + .boxed() + .collect(Collectors.toList()); BaseRewriteDataFilesSparkAction jobOrderRewrite = - (BaseRewriteDataFilesSparkAction) basicRewrite(table) - .option(RewriteDataFiles.REWRITE_JOB_ORDER, RewriteJobOrder.FILES_ASC.orderName()) - .binPack(); - List actual = toGroupStream(table, jobOrderRewrite) - .mapToLong(RewriteFileGroup::numFiles) - .boxed() - .collect(Collectors.toList()); + (BaseRewriteDataFilesSparkAction) + basicRewrite(table) + .option(RewriteDataFiles.REWRITE_JOB_ORDER, RewriteJobOrder.FILES_ASC.orderName()) + .binPack(); + List actual = + toGroupStream(table, jobOrderRewrite) + .mapToLong(RewriteFileGroup::numFiles) + .boxed() + .collect(Collectors.toList()); expected.sort(Comparator.naturalOrder()); 
Assert.assertEquals("Number of files order should be ascending", actual, expected); @@ -1242,21 +1317,23 @@ public void testRewriteJobOrderFilesDesc() { table.updateProperties().set(TableProperties.FORMAT_VERSION, "2").commit(); BaseRewriteDataFilesSparkAction basicRewrite = - (BaseRewriteDataFilesSparkAction) basicRewrite(table) - .binPack(); - List expected = toGroupStream(table, basicRewrite) - .mapToLong(RewriteFileGroup::numFiles) - .boxed() - .collect(Collectors.toList()); + (BaseRewriteDataFilesSparkAction) basicRewrite(table).binPack(); + List expected = + toGroupStream(table, basicRewrite) + .mapToLong(RewriteFileGroup::numFiles) + .boxed() + .collect(Collectors.toList()); BaseRewriteDataFilesSparkAction jobOrderRewrite = - (BaseRewriteDataFilesSparkAction) basicRewrite(table) - .option(RewriteDataFiles.REWRITE_JOB_ORDER, RewriteJobOrder.FILES_DESC.orderName()) - .binPack(); - List actual = toGroupStream(table, jobOrderRewrite) - .mapToLong(RewriteFileGroup::numFiles) - .boxed() - .collect(Collectors.toList()); + (BaseRewriteDataFilesSparkAction) + basicRewrite(table) + .option(RewriteDataFiles.REWRITE_JOB_ORDER, RewriteJobOrder.FILES_DESC.orderName()) + .binPack(); + List actual = + toGroupStream(table, jobOrderRewrite) + .mapToLong(RewriteFileGroup::numFiles) + .boxed() + .collect(Collectors.toList()); expected.sort(Comparator.reverseOrder()); Assert.assertEquals("Number of files order should be descending", actual, expected); @@ -1264,8 +1341,8 @@ public void testRewriteJobOrderFilesDesc() { Assert.assertNotEquals("Number of files order should not be ascending", actual, expected); } - private Stream toGroupStream(Table table, - BaseRewriteDataFilesSparkAction rewrite) { + private Stream toGroupStream( + Table table, BaseRewriteDataFilesSparkAction rewrite) { rewrite.validateAndInitOptions(); Map>> fileGroupsByPartition = rewrite.planFileGroups(table.currentSnapshot().snapshotId()); @@ -1275,9 +1352,8 @@ private Stream toGroupStream(Table table, } protected List currentData() { - return rowsToJava(spark.read().format("iceberg").load(tableLocation) - .sort("c1", "c2", "c3") - .collectAsList()); + return rowsToJava( + spark.read().format("iceberg").load(tableLocation).sort("c1", "c2", "c3").collectAsList()); } protected long testDataSize(Table table) { @@ -1299,84 +1375,102 @@ protected void shouldHaveFiles(Table table, int numExpected) { protected void shouldHaveSnapshots(Table table, int expectedSnapshots) { table.refresh(); int actualSnapshots = Iterables.size(table.snapshots()); - Assert.assertEquals("Table did not have the expected number of snapshots", - expectedSnapshots, actualSnapshots); + Assert.assertEquals( + "Table did not have the expected number of snapshots", expectedSnapshots, actualSnapshots); } protected void shouldHaveNoOrphans(Table table) { - Assert.assertEquals("Should not have found any orphan files", ImmutableList.of(), - actions().deleteOrphanFiles(table) + Assert.assertEquals( + "Should not have found any orphan files", + ImmutableList.of(), + actions() + .deleteOrphanFiles(table) .olderThan(System.currentTimeMillis()) .execute() .orphanFileLocations()); } protected void shouldHaveACleanCache(Table table) { - Assert.assertEquals("Should not have any entries in cache", ImmutableSet.of(), - cacheContents(table)); + Assert.assertEquals( + "Should not have any entries in cache", ImmutableSet.of(), cacheContents(table)); } protected void shouldHaveLastCommitSorted(Table table, String column) { - List, Pair>> - overlappingFiles = 
checkForOverlappingFiles(table, column); + List, Pair>> overlappingFiles = checkForOverlappingFiles(table, column); Assert.assertEquals("Found overlapping files", Collections.emptyList(), overlappingFiles); } protected void shouldHaveLastCommitUnsorted(Table table, String column) { - List, Pair>> - overlappingFiles = checkForOverlappingFiles(table, column); + List, Pair>> overlappingFiles = checkForOverlappingFiles(table, column); Assert.assertNotEquals("Found no overlapping files", Collections.emptyList(), overlappingFiles); } private Pair boundsOf(DataFile file, NestedField field, Class javaClass) { int columnId = field.fieldId(); - return Pair.of(javaClass.cast(Conversions.fromByteBuffer(field.type(), file.lowerBounds().get(columnId))), + return Pair.of( + javaClass.cast(Conversions.fromByteBuffer(field.type(), file.lowerBounds().get(columnId))), javaClass.cast(Conversions.fromByteBuffer(field.type(), file.upperBounds().get(columnId)))); } - - private List, Pair>> checkForOverlappingFiles(Table table, String column) { + private List, Pair>> checkForOverlappingFiles( + Table table, String column) { table.refresh(); NestedField field = table.schema().caseInsensitiveFindField(column); Class javaClass = (Class) field.type().typeId().javaClass(); Snapshot snapshot = table.currentSnapshot(); - Map> filesByPartition = Streams.stream(snapshot.addedDataFiles(table.io())) - .collect(Collectors.groupingBy(DataFile::partition)); + Map> filesByPartition = + Streams.stream(snapshot.addedDataFiles(table.io())) + .collect(Collectors.groupingBy(DataFile::partition)); Stream, Pair>> overlaps = - filesByPartition.entrySet().stream().flatMap(entry -> { - List datafiles = entry.getValue(); - Preconditions.checkArgument(datafiles.size() > 1, - "This test is checking for overlaps in a situation where no overlaps can actually occur because the " + - "partition %s does not contain multiple datafiles", entry.getKey()); - - List, Pair>> boundComparisons = Lists.cartesianProduct(datafiles, datafiles).stream() - .filter(tuple -> tuple.get(0) != tuple.get(1)) - .map(tuple -> Pair.of(boundsOf(tuple.get(0), field, javaClass), boundsOf(tuple.get(1), field, javaClass))) - .collect(Collectors.toList()); - - Comparator comparator = Comparators.forType(field.type().asPrimitiveType()); - - List, Pair>> overlappingFiles = boundComparisons.stream() - .filter(filePair -> { - Pair left = filePair.first(); - T lMin = left.first(); - T lMax = left.second(); - Pair right = filePair.second(); - T rMin = right.first(); - T rMax = right.second(); - boolean boundsDoNotOverlap = - // Min and Max of a range are greater than or equal to the max value of the other range - (comparator.compare(rMax, lMax) >= 0 && comparator.compare(rMin, lMax) >= 0) || - (comparator.compare(lMax, rMax) >= 0 && comparator.compare(lMin, rMax) >= 0); - - return !boundsDoNotOverlap; - }).collect(Collectors.toList()); - return overlappingFiles.stream(); - }); + filesByPartition.entrySet().stream() + .flatMap( + entry -> { + List datafiles = entry.getValue(); + Preconditions.checkArgument( + datafiles.size() > 1, + "This test is checking for overlaps in a situation where no overlaps can actually occur because the " + + "partition %s does not contain multiple datafiles", + entry.getKey()); + + List, Pair>> boundComparisons = + Lists.cartesianProduct(datafiles, datafiles).stream() + .filter(tuple -> tuple.get(0) != tuple.get(1)) + .map( + tuple -> + Pair.of( + boundsOf(tuple.get(0), field, javaClass), + boundsOf(tuple.get(1), field, javaClass))) + 
.collect(Collectors.toList()); + + Comparator comparator = Comparators.forType(field.type().asPrimitiveType()); + + List, Pair>> overlappingFiles = + boundComparisons.stream() + .filter( + filePair -> { + Pair left = filePair.first(); + T lMin = left.first(); + T lMax = left.second(); + Pair right = filePair.second(); + T rMin = right.first(); + T rMax = right.second(); + boolean boundsDoNotOverlap = + // Min and Max of a range are greater than or equal to the max + // value of the other range + (comparator.compare(rMax, lMax) >= 0 + && comparator.compare(rMin, lMax) >= 0) + || (comparator.compare(lMax, rMax) >= 0 + && comparator.compare(lMin, rMax) >= 0); + + return !boundsDoNotOverlap; + }) + .collect(Collectors.toList()); + return overlappingFiles.stream(); + }); return overlaps.collect(Collectors.toList()); } @@ -1385,13 +1479,17 @@ protected Table createTable() { PartitionSpec spec = PartitionSpec.unpartitioned(); Map options = Maps.newHashMap(); Table table = TABLES.create(SCHEMA, spec, options, tableLocation); - table.updateProperties().set(TableProperties.PARQUET_ROW_GROUP_SIZE_BYTES, Integer.toString(20 * 1024)).commit(); + table + .updateProperties() + .set(TableProperties.PARQUET_ROW_GROUP_SIZE_BYTES, Integer.toString(20 * 1024)) + .commit(); Assert.assertNull("Table must be empty", table.currentSnapshot()); return table; } /** * Create a table with a certain number of files, returns the size of a file + * * @param files number of files to create * @return the created table */ @@ -1401,12 +1499,9 @@ protected Table createTable(int files) { return table; } - protected Table createTablePartitioned(int partitions, int files, - int numRecords, Map options) { - PartitionSpec spec = PartitionSpec.builderFor(SCHEMA) - .identity("c1") - .truncate("c2", 2) - .build(); + protected Table createTablePartitioned( + int partitions, int files, int numRecords, Map options) { + PartitionSpec spec = PartitionSpec.builderFor(SCHEMA).identity("c1").truncate("c2", 2).build(); Table table = TABLES.create(SCHEMA, spec, options, tableLocation); Assert.assertNull("Table must be empty", table.currentSnapshot()); @@ -1419,21 +1514,23 @@ protected Table createTablePartitioned(int partitions, int files) { } private Table createTypeTestTable() { - Schema schema = new Schema( - required(1, "longCol", Types.LongType.get()), - required(2, "intCol", Types.IntegerType.get()), - required(3, "floatCol", Types.FloatType.get()), - optional(4, "doubleCol", Types.DoubleType.get()), - optional(5, "dateCol", Types.DateType.get()), - optional(6, "timestampCol", Types.TimestampType.withZone()), - optional(7, "stringCol", Types.StringType.get()), - optional(8, "booleanCol", Types.BooleanType.get()), - optional(9, "binaryCol", Types.BinaryType.get())); + Schema schema = + new Schema( + required(1, "longCol", Types.LongType.get()), + required(2, "intCol", Types.IntegerType.get()), + required(3, "floatCol", Types.FloatType.get()), + optional(4, "doubleCol", Types.DoubleType.get()), + optional(5, "dateCol", Types.DateType.get()), + optional(6, "timestampCol", Types.TimestampType.withZone()), + optional(7, "stringCol", Types.StringType.get()), + optional(8, "booleanCol", Types.BooleanType.get()), + optional(9, "binaryCol", Types.BinaryType.get())); Map options = Maps.newHashMap(); Table table = TABLES.create(schema, PartitionSpec.unpartitioned(), options, tableLocation); - spark.range(0, 10, 1, 10) + spark + .range(0, 10, 1, 10) .withColumnRenamed("id", "longCol") .withColumn("intCol", expr("CAST(longCol AS INT)")) 
.withColumn("floatCol", expr("CAST(longCol AS FLOAT)")) @@ -1453,7 +1550,11 @@ private Table createTypeTestTable() { protected int averageFileSize(Table table) { table.refresh(); - return (int) Streams.stream(table.newScan().planFiles()).mapToLong(FileScanTask::length).average().getAsDouble(); + return (int) + Streams.stream(table.newScan().planFiles()) + .mapToLong(FileScanTask::length) + .average() + .getAsDouble(); } private void writeRecords(int files, int numRecords) { @@ -1464,20 +1565,21 @@ private void writeRecords(int files, int numRecords, int partitions) { List records = Lists.newArrayList(); int rowDimension = (int) Math.ceil(Math.sqrt(numRecords)); List> data = - IntStream.range(0, rowDimension).boxed().flatMap(x -> - IntStream.range(0, rowDimension).boxed().map(y -> Pair.of(x, y))) + IntStream.range(0, rowDimension) + .boxed() + .flatMap(x -> IntStream.range(0, rowDimension).boxed().map(y -> Pair.of(x, y))) .collect(Collectors.toList()); Collections.shuffle(data, new Random(42)); if (partitions > 0) { - data.forEach(i -> records.add(new ThreeColumnRecord( - i.first() % partitions, - "foo" + i.first(), - "bar" + i.second()))); + data.forEach( + i -> + records.add( + new ThreeColumnRecord( + i.first() % partitions, "foo" + i.first(), "bar" + i.second()))); } else { - data.forEach(i -> records.add(new ThreeColumnRecord( - i.first(), - "foo" + i.first(), - "bar" + i.second()))); + data.forEach( + i -> + records.add(new ThreeColumnRecord(i.first(), "foo" + i.first(), "bar" + i.second()))); } Dataset df = spark.createDataFrame(records, ThreeColumnRecord.class).repartition(files); writeDF(df); @@ -1492,24 +1594,31 @@ private void writeDF(Dataset df) { .save(tableLocation); } - private List writePosDeletesToFile(Table table, DataFile dataFile, int outputDeleteFiles) { - return writePosDeletes(table, dataFile.partition(), dataFile.path().toString(), outputDeleteFiles); + private List writePosDeletesToFile( + Table table, DataFile dataFile, int outputDeleteFiles) { + return writePosDeletes( + table, dataFile.partition(), dataFile.path().toString(), outputDeleteFiles); } - private List writePosDeletes(Table table, StructLike partition, String path, int outputDeleteFiles) { + private List writePosDeletes( + Table table, StructLike partition, String path, int outputDeleteFiles) { List results = Lists.newArrayList(); int rowPosition = 0; for (int file = 0; file < outputDeleteFiles; file++) { - OutputFile outputFile = table.io().newOutputFile( - table.locationProvider().newDataLocation(UUID.randomUUID().toString())); - EncryptedOutputFile encryptedOutputFile = EncryptedFiles.encryptedOutput( - outputFile, EncryptionKeyMetadata.EMPTY); - - GenericAppenderFactory appenderFactory = new GenericAppenderFactory( - table.schema(), table.spec(), null, null, null); - PositionDeleteWriter posDeleteWriter = appenderFactory - .set(TableProperties.DEFAULT_WRITE_METRICS_MODE, "full") - .newPosDeleteWriter(encryptedOutputFile, FileFormat.PARQUET, partition); + OutputFile outputFile = + table + .io() + .newOutputFile( + table.locationProvider().newDataLocation(UUID.randomUUID().toString())); + EncryptedOutputFile encryptedOutputFile = + EncryptedFiles.encryptedOutput(outputFile, EncryptionKeyMetadata.EMPTY); + + GenericAppenderFactory appenderFactory = + new GenericAppenderFactory(table.schema(), table.spec(), null, null, null); + PositionDeleteWriter posDeleteWriter = + appenderFactory + .set(TableProperties.DEFAULT_WRITE_METRICS_MODE, "full") + .newPosDeleteWriter(encryptedOutputFile, FileFormat.PARQUET, 
partition); posDeleteWriter.delete(path, rowPosition); try { @@ -1537,7 +1646,7 @@ private Set cacheContents(Table table) { } private double percentFilesRequired(Table table, String col, String value) { - return percentFilesRequired(table, new String[]{col}, new String[]{value}); + return percentFilesRequired(table, new String[] {col}, new String[] {value}); } private double percentFilesRequired(Table table, String[] cols, String[] values) { diff --git a/spark/v3.2/spark/src/test/java/org/apache/iceberg/spark/actions/TestRewriteManifestsAction.java b/spark/v3.2/spark/src/test/java/org/apache/iceberg/spark/actions/TestRewriteManifestsAction.java index f30251e74001..4b50ea0c29f3 100644 --- a/spark/v3.2/spark/src/test/java/org/apache/iceberg/spark/actions/TestRewriteManifestsAction.java +++ b/spark/v3.2/spark/src/test/java/org/apache/iceberg/spark/actions/TestRewriteManifestsAction.java @@ -16,9 +16,13 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.actions; +import static org.apache.iceberg.types.Types.NestedField.optional; +import static org.mockito.Mockito.doAnswer; +import static org.mockito.Mockito.spy; +import static org.mockito.Mockito.when; + import java.io.File; import java.io.IOException; import java.util.List; @@ -53,28 +57,22 @@ import org.junit.runner.RunWith; import org.junit.runners.Parameterized; -import static org.apache.iceberg.types.Types.NestedField.optional; -import static org.mockito.Mockito.doAnswer; -import static org.mockito.Mockito.spy; -import static org.mockito.Mockito.when; - @RunWith(Parameterized.class) public class TestRewriteManifestsAction extends SparkTestBase { private static final HadoopTables TABLES = new HadoopTables(new Configuration()); - private static final Schema SCHEMA = new Schema( - optional(1, "c1", Types.IntegerType.get()), - optional(2, "c2", Types.StringType.get()), - optional(3, "c3", Types.StringType.get()) - ); + private static final Schema SCHEMA = + new Schema( + optional(1, "c1", Types.IntegerType.get()), + optional(2, "c2", Types.StringType.get()), + optional(3, "c3", Types.StringType.get())); @Parameterized.Parameters(name = "snapshotIdInheritanceEnabled = {0}") public static Object[] parameters() { - return new Object[] { "true", "false" }; + return new Object[] {"true", "false"}; } - @Rule - public TemporaryFolder temp = new TemporaryFolder(); + @Rule public TemporaryFolder temp = new TemporaryFolder(); private final String snapshotIdInheritanceEnabled; private String tableLocation = null; @@ -100,7 +98,8 @@ public void testRewriteManifestsEmptyTable() throws IOException { SparkActions actions = SparkActions.get(); - actions.rewriteManifests(table) + actions + .rewriteManifests(table) .rewriteIf(manifest -> true) .stagingLocation(temp.newFolder().toString()) .execute(); @@ -115,16 +114,15 @@ public void testRewriteSmallManifestsNonPartitionedTable() { options.put(TableProperties.SNAPSHOT_ID_INHERITANCE_ENABLED, snapshotIdInheritanceEnabled); Table table = TABLES.create(SCHEMA, spec, options, tableLocation); - List records1 = Lists.newArrayList( - new ThreeColumnRecord(1, null, "AAAA"), - new ThreeColumnRecord(1, "BBBBBBBBBB", "BBBB") - ); + List records1 = + Lists.newArrayList( + new ThreeColumnRecord(1, null, "AAAA"), new ThreeColumnRecord(1, "BBBBBBBBBB", "BBBB")); writeRecords(records1); - List records2 = Lists.newArrayList( - new ThreeColumnRecord(2, "CCCCCCCCCC", "CCCC"), - new ThreeColumnRecord(2, "DDDDDDDDDD", "DDDD") - ); + List records2 = + 
Lists.newArrayList( + new ThreeColumnRecord(2, "CCCCCCCCCC", "CCCC"), + new ThreeColumnRecord(2, "DDDDDDDDDD", "DDDD")); writeRecords(records2); table.refresh(); @@ -134,12 +132,13 @@ public void testRewriteSmallManifestsNonPartitionedTable() { SparkActions actions = SparkActions.get(); - RewriteManifests.Result result = actions.rewriteManifests(table) - .rewriteIf(manifest -> true) - .execute(); + RewriteManifests.Result result = + actions.rewriteManifests(table).rewriteIf(manifest -> true).execute(); - Assert.assertEquals("Action should rewrite 2 manifests", 2, Iterables.size(result.rewrittenManifests())); - Assert.assertEquals("Action should add 1 manifests", 1, Iterables.size(result.addedManifests())); + Assert.assertEquals( + "Action should rewrite 2 manifests", 2, Iterables.size(result.rewrittenManifests())); + Assert.assertEquals( + "Action should add 1 manifests", 1, Iterables.size(result.addedManifests())); table.refresh(); @@ -155,9 +154,8 @@ public void testRewriteSmallManifestsNonPartitionedTable() { expectedRecords.addAll(records2); Dataset resultDF = spark.read().format("iceberg").load(tableLocation); - List actualRecords = resultDF.sort("c1", "c2") - .as(Encoders.bean(ThreeColumnRecord.class)) - .collectAsList(); + List actualRecords = + resultDF.sort("c1", "c2").as(Encoders.bean(ThreeColumnRecord.class)).collectAsList(); Assert.assertEquals("Rows must match", expectedRecords, actualRecords); } @@ -169,16 +167,15 @@ public void testRewriteManifestsWithCommitStateUnknownException() { options.put(TableProperties.SNAPSHOT_ID_INHERITANCE_ENABLED, snapshotIdInheritanceEnabled); Table table = TABLES.create(SCHEMA, spec, options, tableLocation); - List records1 = Lists.newArrayList( - new ThreeColumnRecord(1, null, "AAAA"), - new ThreeColumnRecord(1, "BBBBBBBBBB", "BBBB") - ); + List records1 = + Lists.newArrayList( + new ThreeColumnRecord(1, null, "AAAA"), new ThreeColumnRecord(1, "BBBBBBBBBB", "BBBB")); writeRecords(records1); - List records2 = Lists.newArrayList( - new ThreeColumnRecord(2, "CCCCCCCCCC", "CCCC"), - new ThreeColumnRecord(2, "DDDDDDDDDD", "DDDD") - ); + List records2 = + Lists.newArrayList( + new ThreeColumnRecord(2, "CCCCCCCCCC", "CCCC"), + new ThreeColumnRecord(2, "DDDDDDDDDD", "DDDD")); writeRecords(records2); table.refresh(); @@ -191,15 +188,19 @@ public void testRewriteManifestsWithCommitStateUnknownException() { // create a spy which would throw a CommitStateUnknownException after successful commit. 
org.apache.iceberg.RewriteManifests newRewriteManifests = table.rewriteManifests(); org.apache.iceberg.RewriteManifests spyNewRewriteManifests = spy(newRewriteManifests); - doAnswer(invocation -> { - newRewriteManifests.commit(); - throw new CommitStateUnknownException(new RuntimeException("Datacenter on Fire")); - }).when(spyNewRewriteManifests).commit(); + doAnswer( + invocation -> { + newRewriteManifests.commit(); + throw new CommitStateUnknownException(new RuntimeException("Datacenter on Fire")); + }) + .when(spyNewRewriteManifests) + .commit(); Table spyTable = spy(table); when(spyTable.rewriteManifests()).thenReturn(spyNewRewriteManifests); - AssertHelpers.assertThrowsCause("Should throw a Commit State Unknown Exception", + AssertHelpers.assertThrowsCause( + "Should throw a Commit State Unknown Exception", RuntimeException.class, "Datacenter on Fire", () -> actions.rewriteManifests(spyTable).rewriteIf(manifest -> true).execute()); @@ -219,45 +220,40 @@ public void testRewriteManifestsWithCommitStateUnknownException() { expectedRecords.addAll(records2); Dataset resultDF = spark.read().format("iceberg").load(tableLocation); - List actualRecords = resultDF.sort("c1", "c2") - .as(Encoders.bean(ThreeColumnRecord.class)) - .collectAsList(); + List actualRecords = + resultDF.sort("c1", "c2").as(Encoders.bean(ThreeColumnRecord.class)).collectAsList(); Assert.assertEquals("Rows must match", expectedRecords, actualRecords); } @Test public void testRewriteSmallManifestsPartitionedTable() { - PartitionSpec spec = PartitionSpec.builderFor(SCHEMA) - .identity("c1") - .truncate("c2", 2) - .build(); + PartitionSpec spec = PartitionSpec.builderFor(SCHEMA).identity("c1").truncate("c2", 2).build(); Map options = Maps.newHashMap(); options.put(TableProperties.SNAPSHOT_ID_INHERITANCE_ENABLED, snapshotIdInheritanceEnabled); Table table = TABLES.create(SCHEMA, spec, options, tableLocation); - List records1 = Lists.newArrayList( - new ThreeColumnRecord(1, null, "AAAA"), - new ThreeColumnRecord(1, "BBBBBBBBBB", "BBBB") - ); + List records1 = + Lists.newArrayList( + new ThreeColumnRecord(1, null, "AAAA"), new ThreeColumnRecord(1, "BBBBBBBBBB", "BBBB")); writeRecords(records1); - List records2 = Lists.newArrayList( - new ThreeColumnRecord(2, "CCCCCCCCCC", "CCCC"), - new ThreeColumnRecord(2, "DDDDDDDDDD", "DDDD") - ); + List records2 = + Lists.newArrayList( + new ThreeColumnRecord(2, "CCCCCCCCCC", "CCCC"), + new ThreeColumnRecord(2, "DDDDDDDDDD", "DDDD")); writeRecords(records2); - List records3 = Lists.newArrayList( - new ThreeColumnRecord(3, "EEEEEEEEEE", "EEEE"), - new ThreeColumnRecord(3, "FFFFFFFFFF", "FFFF") - ); + List records3 = + Lists.newArrayList( + new ThreeColumnRecord(3, "EEEEEEEEEE", "EEEE"), + new ThreeColumnRecord(3, "FFFFFFFFFF", "FFFF")); writeRecords(records3); - List records4 = Lists.newArrayList( - new ThreeColumnRecord(4, "GGGGGGGGGG", "GGGG"), - new ThreeColumnRecord(4, "HHHHHHHHHG", "HHHH") - ); + List records4 = + Lists.newArrayList( + new ThreeColumnRecord(4, "GGGGGGGGGG", "GGGG"), + new ThreeColumnRecord(4, "HHHHHHHHHG", "HHHH")); writeRecords(records4); table.refresh(); @@ -271,16 +267,18 @@ public void testRewriteSmallManifestsPartitionedTable() { long manifestEntrySizeBytes = computeManifestEntrySizeBytes(manifests); long targetManifestSizeBytes = (long) (1.05 * 4 * manifestEntrySizeBytes); - table.updateProperties() + table + .updateProperties() .set(TableProperties.MANIFEST_TARGET_SIZE_BYTES, String.valueOf(targetManifestSizeBytes)) .commit(); - RewriteManifests.Result result = 
actions.rewriteManifests(table) - .rewriteIf(manifest -> true) - .execute(); + RewriteManifests.Result result = + actions.rewriteManifests(table).rewriteIf(manifest -> true).execute(); - Assert.assertEquals("Action should rewrite 4 manifests", 4, Iterables.size(result.rewrittenManifests())); - Assert.assertEquals("Action should add 2 manifests", 2, Iterables.size(result.addedManifests())); + Assert.assertEquals( + "Action should rewrite 4 manifests", 4, Iterables.size(result.rewrittenManifests())); + Assert.assertEquals( + "Action should add 2 manifests", 2, Iterables.size(result.addedManifests())); table.refresh(); @@ -302,32 +300,29 @@ public void testRewriteSmallManifestsPartitionedTable() { expectedRecords.addAll(records4); Dataset resultDF = spark.read().format("iceberg").load(tableLocation); - List actualRecords = resultDF.sort("c1", "c2") - .as(Encoders.bean(ThreeColumnRecord.class)) - .collectAsList(); + List actualRecords = + resultDF.sort("c1", "c2").as(Encoders.bean(ThreeColumnRecord.class)).collectAsList(); Assert.assertEquals("Rows must match", expectedRecords, actualRecords); } @Test public void testRewriteImportedManifests() throws IOException { - PartitionSpec spec = PartitionSpec.builderFor(SCHEMA) - .identity("c3") - .build(); + PartitionSpec spec = PartitionSpec.builderFor(SCHEMA).identity("c3").build(); Map options = Maps.newHashMap(); options.put(TableProperties.SNAPSHOT_ID_INHERITANCE_ENABLED, snapshotIdInheritanceEnabled); Table table = TABLES.create(SCHEMA, spec, options, tableLocation); - List records = Lists.newArrayList( - new ThreeColumnRecord(1, null, "AAAA"), - new ThreeColumnRecord(1, "BBBBBBBBBB", "BBBB") - ); + List records = + Lists.newArrayList( + new ThreeColumnRecord(1, null, "AAAA"), new ThreeColumnRecord(1, "BBBBBBBBBB", "BBBB")); File parquetTableDir = temp.newFolder("parquet_table"); String parquetTableLocation = parquetTableDir.toURI().toString(); try { Dataset inputDF = spark.createDataFrame(records, ThreeColumnRecord.class); - inputDF.select("c1", "c2", "c3") + inputDF + .select("c1", "c2", "c3") .write() .format("parquet") .mode("overwrite") @@ -336,20 +331,26 @@ public void testRewriteImportedManifests() throws IOException { .saveAsTable("parquet_table"); File stagingDir = temp.newFolder("staging-dir"); - SparkTableUtil.importSparkTable(spark, new TableIdentifier("parquet_table"), table, stagingDir.toString()); + SparkTableUtil.importSparkTable( + spark, new TableIdentifier("parquet_table"), table, stagingDir.toString()); Snapshot snapshot = table.currentSnapshot(); SparkActions actions = SparkActions.get(); - RewriteManifests.Result result = actions.rewriteManifests(table) - .rewriteIf(manifest -> true) - .stagingLocation(temp.newFolder().toString()) - .execute(); + RewriteManifests.Result result = + actions + .rewriteManifests(table) + .rewriteIf(manifest -> true) + .stagingLocation(temp.newFolder().toString()) + .execute(); - Assert.assertEquals("Action should rewrite all manifests", - snapshot.allManifests(table.io()), result.rewrittenManifests()); - Assert.assertEquals("Action should add 1 manifest", 1, Iterables.size(result.addedManifests())); + Assert.assertEquals( + "Action should rewrite all manifests", + snapshot.allManifests(table.io()), + result.rewrittenManifests()); + Assert.assertEquals( + "Action should add 1 manifest", 1, Iterables.size(result.addedManifests())); } finally { spark.sql("DROP TABLE parquet_table"); @@ -358,9 +359,7 @@ public void testRewriteImportedManifests() throws IOException { @Test public void 
testRewriteLargeManifestsPartitionedTable() throws IOException { - PartitionSpec spec = PartitionSpec.builderFor(SCHEMA) - .identity("c3") - .build(); + PartitionSpec spec = PartitionSpec.builderFor(SCHEMA).identity("c3").build(); Map options = Maps.newHashMap(); options.put(TableProperties.SNAPSHOT_ID_INHERITANCE_ENABLED, snapshotIdInheritanceEnabled); Table table = TABLES.create(SCHEMA, spec, options, tableLocation); @@ -380,19 +379,26 @@ public void testRewriteLargeManifestsPartitionedTable() throws IOException { Assert.assertEquals("Should have 1 manifests before rewrite", 1, manifests.size()); // set the target manifest size to a small value to force splitting records into multiple files - table.updateProperties() - .set(TableProperties.MANIFEST_TARGET_SIZE_BYTES, String.valueOf(manifests.get(0).length() / 2)) + table + .updateProperties() + .set( + TableProperties.MANIFEST_TARGET_SIZE_BYTES, + String.valueOf(manifests.get(0).length() / 2)) .commit(); SparkActions actions = SparkActions.get(); - RewriteManifests.Result result = actions.rewriteManifests(table) - .rewriteIf(manifest -> true) - .stagingLocation(temp.newFolder().toString()) - .execute(); + RewriteManifests.Result result = + actions + .rewriteManifests(table) + .rewriteIf(manifest -> true) + .stagingLocation(temp.newFolder().toString()) + .execute(); - Assert.assertEquals("Action should rewrite 1 manifest", 1, Iterables.size(result.rewrittenManifests())); - Assert.assertEquals("Action should add 2 manifests", 2, Iterables.size(result.addedManifests())); + Assert.assertEquals( + "Action should rewrite 1 manifest", 1, Iterables.size(result.rewrittenManifests())); + Assert.assertEquals( + "Action should add 2 manifests", 2, Iterables.size(result.addedManifests())); table.refresh(); @@ -400,33 +406,28 @@ public void testRewriteLargeManifestsPartitionedTable() throws IOException { Assert.assertEquals("Should have 2 manifests after rewrite", 2, newManifests.size()); Dataset resultDF = spark.read().format("iceberg").load(tableLocation); - List actualRecords = resultDF.sort("c1", "c2") - .as(Encoders.bean(ThreeColumnRecord.class)) - .collectAsList(); + List actualRecords = + resultDF.sort("c1", "c2").as(Encoders.bean(ThreeColumnRecord.class)).collectAsList(); Assert.assertEquals("Rows must match", records, actualRecords); } @Test public void testRewriteManifestsWithPredicate() throws IOException { - PartitionSpec spec = PartitionSpec.builderFor(SCHEMA) - .identity("c1") - .truncate("c2", 2) - .build(); + PartitionSpec spec = PartitionSpec.builderFor(SCHEMA).identity("c1").truncate("c2", 2).build(); Map options = Maps.newHashMap(); options.put(TableProperties.SNAPSHOT_ID_INHERITANCE_ENABLED, snapshotIdInheritanceEnabled); Table table = TABLES.create(SCHEMA, spec, options, tableLocation); - List records1 = Lists.newArrayList( - new ThreeColumnRecord(1, null, "AAAA"), - new ThreeColumnRecord(1, "BBBBBBBBBB", "BBBB") - ); + List records1 = + Lists.newArrayList( + new ThreeColumnRecord(1, null, "AAAA"), new ThreeColumnRecord(1, "BBBBBBBBBB", "BBBB")); writeRecords(records1); - List records2 = Lists.newArrayList( - new ThreeColumnRecord(2, "CCCCCCCCCC", "CCCC"), - new ThreeColumnRecord(2, "DDDDDDDDDD", "DDDD") - ); + List records2 = + Lists.newArrayList( + new ThreeColumnRecord(2, "CCCCCCCCCC", "CCCC"), + new ThreeColumnRecord(2, "DDDDDDDDDD", "DDDD")); writeRecords(records2); table.refresh(); @@ -437,14 +438,18 @@ public void testRewriteManifestsWithPredicate() throws IOException { SparkActions actions = SparkActions.get(); // rewrite 
only the first manifest without caching - RewriteManifests.Result result = actions.rewriteManifests(table) - .rewriteIf(manifest -> manifest.path().equals(manifests.get(0).path())) - .stagingLocation(temp.newFolder().toString()) - .option("use-caching", "false") - .execute(); - - Assert.assertEquals("Action should rewrite 1 manifest", 1, Iterables.size(result.rewrittenManifests())); - Assert.assertEquals("Action should add 1 manifests", 1, Iterables.size(result.addedManifests())); + RewriteManifests.Result result = + actions + .rewriteManifests(table) + .rewriteIf(manifest -> manifest.path().equals(manifests.get(0).path())) + .stagingLocation(temp.newFolder().toString()) + .option("use-caching", "false") + .execute(); + + Assert.assertEquals( + "Action should rewrite 1 manifest", 1, Iterables.size(result.rewrittenManifests())); + Assert.assertEquals( + "Action should add 1 manifests", 1, Iterables.size(result.addedManifests())); table.refresh(); @@ -452,16 +457,16 @@ public void testRewriteManifestsWithPredicate() throws IOException { Assert.assertEquals("Should have 2 manifests after rewrite", 2, newManifests.size()); Assert.assertFalse("First manifest must be rewritten", newManifests.contains(manifests.get(0))); - Assert.assertTrue("Second manifest must not be rewritten", newManifests.contains(manifests.get(1))); + Assert.assertTrue( + "Second manifest must not be rewritten", newManifests.contains(manifests.get(1))); List expectedRecords = Lists.newArrayList(); expectedRecords.addAll(records1); expectedRecords.addAll(records2); Dataset resultDF = spark.read().format("iceberg").load(tableLocation); - List actualRecords = resultDF.sort("c1", "c2") - .as(Encoders.bean(ThreeColumnRecord.class)) - .collectAsList(); + List actualRecords = + resultDF.sort("c1", "c2").as(Encoders.bean(ThreeColumnRecord.class)).collectAsList(); Assert.assertEquals("Rows must match", expectedRecords, actualRecords); } @@ -472,11 +477,7 @@ private void writeRecords(List records) { } private void writeDF(Dataset df) { - df.select("c1", "c2", "c3") - .write() - .format("iceberg") - .mode("append") - .save(tableLocation); + df.select("c1", "c2", "c3").write().format("iceberg").mode("append").save(tableLocation); } private long computeManifestEntrySizeBytes(List manifests) { @@ -485,7 +486,8 @@ private long computeManifestEntrySizeBytes(List manifests) { for (ManifestFile manifest : manifests) { totalSize += manifest.length(); - numEntries += manifest.addedFilesCount() + manifest.existingFilesCount() + manifest.deletedFilesCount(); + numEntries += + manifest.addedFilesCount() + manifest.existingFilesCount() + manifest.deletedFilesCount(); } return totalSize / numEntries; diff --git a/spark/v3.2/spark/src/test/java/org/apache/iceberg/spark/data/AvroDataTest.java b/spark/v3.2/spark/src/test/java/org/apache/iceberg/spark/data/AvroDataTest.java index 5f902f6e6828..2b091c7ed6c1 100644 --- a/spark/v3.2/spark/src/test/java/org/apache/iceberg/spark/data/AvroDataTest.java +++ b/spark/v3.2/spark/src/test/java/org/apache/iceberg/spark/data/AvroDataTest.java @@ -16,9 +16,11 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.spark.data; +import static org.apache.iceberg.types.Types.NestedField.optional; +import static org.apache.iceberg.types.Types.NestedField.required; + import java.io.IOException; import java.util.Map; import java.util.concurrent.atomic.AtomicInteger; @@ -38,34 +40,31 @@ import org.junit.Test; import org.junit.rules.TemporaryFolder; -import static org.apache.iceberg.types.Types.NestedField.optional; -import static org.apache.iceberg.types.Types.NestedField.required; - public abstract class AvroDataTest { protected abstract void writeAndValidate(Schema schema) throws IOException; - protected static final StructType SUPPORTED_PRIMITIVES = StructType.of( - required(100, "id", LongType.get()), - optional(101, "data", Types.StringType.get()), - required(102, "b", Types.BooleanType.get()), - optional(103, "i", Types.IntegerType.get()), - required(104, "l", LongType.get()), - optional(105, "f", Types.FloatType.get()), - required(106, "d", Types.DoubleType.get()), - optional(107, "date", Types.DateType.get()), - required(108, "ts", Types.TimestampType.withZone()), - required(110, "s", Types.StringType.get()), - // required(111, "uuid", Types.UUIDType.get()), - required(112, "fixed", Types.FixedType.ofLength(7)), - optional(113, "bytes", Types.BinaryType.get()), - required(114, "dec_9_0", Types.DecimalType.of(9, 0)), - required(115, "dec_11_2", Types.DecimalType.of(11, 2)), - required(116, "dec_38_10", Types.DecimalType.of(38, 10)) // spark's maximum precision - ); - - @Rule - public TemporaryFolder temp = new TemporaryFolder(); + protected static final StructType SUPPORTED_PRIMITIVES = + StructType.of( + required(100, "id", LongType.get()), + optional(101, "data", Types.StringType.get()), + required(102, "b", Types.BooleanType.get()), + optional(103, "i", Types.IntegerType.get()), + required(104, "l", LongType.get()), + optional(105, "f", Types.FloatType.get()), + required(106, "d", Types.DoubleType.get()), + optional(107, "date", Types.DateType.get()), + required(108, "ts", Types.TimestampType.withZone()), + required(110, "s", Types.StringType.get()), + // required(111, "uuid", Types.UUIDType.get()), + required(112, "fixed", Types.FixedType.ofLength(7)), + optional(113, "bytes", Types.BinaryType.get()), + required(114, "dec_9_0", Types.DecimalType.of(9, 0)), + required(115, "dec_11_2", Types.DecimalType.of(11, 2)), + required(116, "dec_38_10", Types.DecimalType.of(38, 10)) // spark's maximum precision + ); + + @Rule public TemporaryFolder temp = new TemporaryFolder(); @Test public void testSimpleStruct() throws IOException { @@ -74,162 +73,208 @@ public void testSimpleStruct() throws IOException { @Test public void testStructWithRequiredFields() throws IOException { - writeAndValidate(TypeUtil.assignIncreasingFreshIds(new Schema( - Lists.transform(SUPPORTED_PRIMITIVES.fields(), Types.NestedField::asRequired)))); + writeAndValidate( + TypeUtil.assignIncreasingFreshIds( + new Schema( + Lists.transform(SUPPORTED_PRIMITIVES.fields(), Types.NestedField::asRequired)))); } @Test public void testStructWithOptionalFields() throws IOException { - writeAndValidate(TypeUtil.assignIncreasingFreshIds(new Schema( - Lists.transform(SUPPORTED_PRIMITIVES.fields(), Types.NestedField::asOptional)))); + writeAndValidate( + TypeUtil.assignIncreasingFreshIds( + new Schema( + Lists.transform(SUPPORTED_PRIMITIVES.fields(), Types.NestedField::asOptional)))); } @Test public void testNestedStruct() throws IOException { - writeAndValidate(TypeUtil.assignIncreasingFreshIds(new Schema(required(1, 
"struct", SUPPORTED_PRIMITIVES)))); + writeAndValidate( + TypeUtil.assignIncreasingFreshIds(new Schema(required(1, "struct", SUPPORTED_PRIMITIVES)))); } @Test public void testArray() throws IOException { - Schema schema = new Schema( - required(0, "id", LongType.get()), - optional(1, "data", ListType.ofOptional(2, Types.StringType.get()))); + Schema schema = + new Schema( + required(0, "id", LongType.get()), + optional(1, "data", ListType.ofOptional(2, Types.StringType.get()))); writeAndValidate(schema); } @Test public void testArrayOfStructs() throws IOException { - Schema schema = TypeUtil.assignIncreasingFreshIds(new Schema( - required(0, "id", LongType.get()), - optional(1, "data", ListType.ofOptional(2, SUPPORTED_PRIMITIVES)))); + Schema schema = + TypeUtil.assignIncreasingFreshIds( + new Schema( + required(0, "id", LongType.get()), + optional(1, "data", ListType.ofOptional(2, SUPPORTED_PRIMITIVES)))); writeAndValidate(schema); } @Test public void testMap() throws IOException { - Schema schema = new Schema( - required(0, "id", LongType.get()), - optional(1, "data", MapType.ofOptional(2, 3, - Types.StringType.get(), - Types.StringType.get()))); + Schema schema = + new Schema( + required(0, "id", LongType.get()), + optional( + 1, + "data", + MapType.ofOptional(2, 3, Types.StringType.get(), Types.StringType.get()))); writeAndValidate(schema); } @Test public void testNumericMapKey() throws IOException { - Schema schema = new Schema( - required(0, "id", LongType.get()), - optional(1, "data", MapType.ofOptional(2, 3, - Types.LongType.get(), - Types.StringType.get()))); + Schema schema = + new Schema( + required(0, "id", LongType.get()), + optional( + 1, "data", MapType.ofOptional(2, 3, Types.LongType.get(), Types.StringType.get()))); writeAndValidate(schema); } @Test public void testComplexMapKey() throws IOException { - Schema schema = new Schema( - required(0, "id", LongType.get()), - optional(1, "data", MapType.ofOptional(2, 3, - Types.StructType.of( - required(4, "i", Types.IntegerType.get()), - optional(5, "s", Types.StringType.get())), - Types.StringType.get()))); + Schema schema = + new Schema( + required(0, "id", LongType.get()), + optional( + 1, + "data", + MapType.ofOptional( + 2, + 3, + Types.StructType.of( + required(4, "i", Types.IntegerType.get()), + optional(5, "s", Types.StringType.get())), + Types.StringType.get()))); writeAndValidate(schema); } @Test public void testMapOfStructs() throws IOException { - Schema schema = TypeUtil.assignIncreasingFreshIds(new Schema( - required(0, "id", LongType.get()), - optional(1, "data", MapType.ofOptional(2, 3, - Types.StringType.get(), - SUPPORTED_PRIMITIVES)))); + Schema schema = + TypeUtil.assignIncreasingFreshIds( + new Schema( + required(0, "id", LongType.get()), + optional( + 1, + "data", + MapType.ofOptional(2, 3, Types.StringType.get(), SUPPORTED_PRIMITIVES)))); writeAndValidate(schema); } @Test public void testMixedTypes() throws IOException { - StructType structType = StructType.of( - required(0, "id", LongType.get()), - optional(1, "list_of_maps", - ListType.ofOptional(2, MapType.ofOptional(3, 4, - Types.StringType.get(), - SUPPORTED_PRIMITIVES))), - optional(5, "map_of_lists", - MapType.ofOptional(6, 7, - Types.StringType.get(), - ListType.ofOptional(8, SUPPORTED_PRIMITIVES))), - required(9, "list_of_lists", - ListType.ofOptional(10, ListType.ofOptional(11, SUPPORTED_PRIMITIVES))), - required(12, "map_of_maps", - MapType.ofOptional(13, 14, - Types.StringType.get(), - MapType.ofOptional(15, 16, + StructType structType = + 
StructType.of( + required(0, "id", LongType.get()), + optional( + 1, + "list_of_maps", + ListType.ofOptional( + 2, MapType.ofOptional(3, 4, Types.StringType.get(), SUPPORTED_PRIMITIVES))), + optional( + 5, + "map_of_lists", + MapType.ofOptional( + 6, 7, Types.StringType.get(), ListType.ofOptional(8, SUPPORTED_PRIMITIVES))), + required( + 9, + "list_of_lists", + ListType.ofOptional(10, ListType.ofOptional(11, SUPPORTED_PRIMITIVES))), + required( + 12, + "map_of_maps", + MapType.ofOptional( + 13, + 14, Types.StringType.get(), - SUPPORTED_PRIMITIVES))), - required(17, "list_of_struct_of_nested_types", ListType.ofOptional(19, StructType.of( - Types.NestedField.required(20, "m1", MapType.ofOptional(21, 22, - Types.StringType.get(), - SUPPORTED_PRIMITIVES)), - Types.NestedField.optional(23, "l1", ListType.ofRequired(24, SUPPORTED_PRIMITIVES)), - Types.NestedField.required(25, "l2", ListType.ofRequired(26, SUPPORTED_PRIMITIVES)), - Types.NestedField.optional(27, "m2", MapType.ofOptional(28, 29, - Types.StringType.get(), - SUPPORTED_PRIMITIVES)) - ))) - ); - - Schema schema = new Schema(TypeUtil.assignFreshIds(structType, new AtomicInteger(0)::incrementAndGet) - .asStructType().fields()); + MapType.ofOptional(15, 16, Types.StringType.get(), SUPPORTED_PRIMITIVES))), + required( + 17, + "list_of_struct_of_nested_types", + ListType.ofOptional( + 19, + StructType.of( + Types.NestedField.required( + 20, + "m1", + MapType.ofOptional( + 21, 22, Types.StringType.get(), SUPPORTED_PRIMITIVES)), + Types.NestedField.optional( + 23, "l1", ListType.ofRequired(24, SUPPORTED_PRIMITIVES)), + Types.NestedField.required( + 25, "l2", ListType.ofRequired(26, SUPPORTED_PRIMITIVES)), + Types.NestedField.optional( + 27, + "m2", + MapType.ofOptional( + 28, 29, Types.StringType.get(), SUPPORTED_PRIMITIVES)))))); + + Schema schema = + new Schema( + TypeUtil.assignFreshIds(structType, new AtomicInteger(0)::incrementAndGet) + .asStructType() + .fields()); writeAndValidate(schema); } @Test public void testTimestampWithoutZone() throws IOException { - withSQLConf(ImmutableMap.of(SparkSQLProperties.HANDLE_TIMESTAMP_WITHOUT_TIMEZONE, "true"), () -> { - Schema schema = TypeUtil.assignIncreasingFreshIds(new Schema( - required(0, "id", LongType.get()), - optional(1, "ts_without_zone", Types.TimestampType.withoutZone()))); - - writeAndValidate(schema); - }); + withSQLConf( + ImmutableMap.of(SparkSQLProperties.HANDLE_TIMESTAMP_WITHOUT_TIMEZONE, "true"), + () -> { + Schema schema = + TypeUtil.assignIncreasingFreshIds( + new Schema( + required(0, "id", LongType.get()), + optional(1, "ts_without_zone", Types.TimestampType.withoutZone()))); + + writeAndValidate(schema); + }); } protected void withSQLConf(Map conf, Action action) throws IOException { SQLConf sqlConf = SQLConf.get(); Map currentConfValues = Maps.newHashMap(); - conf.keySet().forEach(confKey -> { - if (sqlConf.contains(confKey)) { - String currentConfValue = sqlConf.getConfString(confKey); - currentConfValues.put(confKey, currentConfValue); - } - }); - - conf.forEach((confKey, confValue) -> { - if (SQLConf.isStaticConfigKey(confKey)) { - throw new RuntimeException("Cannot modify the value of a static config: " + confKey); - } - sqlConf.setConfString(confKey, confValue); - }); + conf.keySet() + .forEach( + confKey -> { + if (sqlConf.contains(confKey)) { + String currentConfValue = sqlConf.getConfString(confKey); + currentConfValues.put(confKey, currentConfValue); + } + }); + + conf.forEach( + (confKey, confValue) -> { + if (SQLConf.isStaticConfigKey(confKey)) { + throw new 
RuntimeException("Cannot modify the value of a static config: " + confKey); + } + sqlConf.setConfString(confKey, confValue); + }); try { action.invoke(); } finally { - conf.forEach((confKey, confValue) -> { - if (currentConfValues.containsKey(confKey)) { - sqlConf.setConfString(confKey, currentConfValues.get(confKey)); - } else { - sqlConf.unsetConf(confKey); - } - }); + conf.forEach( + (confKey, confValue) -> { + if (currentConfValues.containsKey(confKey)) { + sqlConf.setConfString(confKey, currentConfValues.get(confKey)); + } else { + sqlConf.unsetConf(confKey); + } + }); } } diff --git a/spark/v3.2/spark/src/test/java/org/apache/iceberg/spark/data/GenericsHelpers.java b/spark/v3.2/spark/src/test/java/org/apache/iceberg/spark/data/GenericsHelpers.java index 46c95cef112d..a96e3b1f57f5 100644 --- a/spark/v3.2/spark/src/test/java/org/apache/iceberg/spark/data/GenericsHelpers.java +++ b/spark/v3.2/spark/src/test/java/org/apache/iceberg/spark/data/GenericsHelpers.java @@ -16,9 +16,12 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.data; +import static org.apache.iceberg.spark.SparkSchemaUtil.convert; +import static scala.collection.JavaConverters.mapAsJavaMapConverter; +import static scala.collection.JavaConverters.seqAsJavaListConverter; + import java.math.BigDecimal; import java.nio.ByteBuffer; import java.sql.Timestamp; @@ -48,13 +51,8 @@ import org.junit.Assert; import scala.collection.Seq; -import static org.apache.iceberg.spark.SparkSchemaUtil.convert; -import static scala.collection.JavaConverters.mapAsJavaMapConverter; -import static scala.collection.JavaConverters.seqAsJavaListConverter; - public class GenericsHelpers { - private GenericsHelpers() { - } + private GenericsHelpers() {} private static final OffsetDateTime EPOCH = Instant.ofEpochMilli(0L).atOffset(ZoneOffset.UTC); private static final LocalDate EPOCH_DAY = EPOCH.toLocalDate(); @@ -71,7 +69,8 @@ public static void assertEqualsSafe(Types.StructType struct, Record expected, Ro } } - private static void assertEqualsSafe(Types.ListType list, Collection expected, List actual) { + private static void assertEqualsSafe( + Types.ListType list, Collection expected, List actual) { Type elementType = list.elementType(); List expectedElements = Lists.newArrayList(expected); for (int i = 0; i < expectedElements.size(); i += 1) { @@ -82,11 +81,11 @@ private static void assertEqualsSafe(Types.ListType list, Collection expected } } - private static void assertEqualsSafe(Types.MapType map, - Map expected, Map actual) { + private static void assertEqualsSafe(Types.MapType map, Map expected, Map actual) { Type keyType = map.keyType(); Type valueType = map.valueType(); - Assert.assertEquals("Should have the same number of keys", expected.keySet().size(), actual.keySet().size()); + Assert.assertEquals( + "Should have the same number of keys", expected.keySet().size(), actual.keySet().size()); for (Object expectedKey : expected.keySet()) { Object matchingKey = null; @@ -120,22 +119,29 @@ private static void assertEqualsSafe(Type type, Object expected, Object actual) Assert.assertEquals("Primitive value should be equal to expected", expected, actual); break; case DATE: - Assertions.assertThat(expected).as("Should expect a LocalDate").isInstanceOf(LocalDate.class); + Assertions.assertThat(expected) + .as("Should expect a LocalDate") + .isInstanceOf(LocalDate.class); Assertions.assertThat(actual).as("Should be a Date").isInstanceOf(Date.class); - Assert.assertEquals("ISO-8601 
date should be equal", expected.toString(), actual.toString()); + Assert.assertEquals( + "ISO-8601 date should be equal", expected.toString(), actual.toString()); break; case TIMESTAMP: Assertions.assertThat(actual).as("Should be a Timestamp").isInstanceOf(Timestamp.class); Timestamp ts = (Timestamp) actual; // milliseconds from nanos has already been added by getTime - OffsetDateTime actualTs = EPOCH.plusNanos( - (ts.getTime() * 1_000_000) + (ts.getNanos() % 1_000_000)); + OffsetDateTime actualTs = + EPOCH.plusNanos((ts.getTime() * 1_000_000) + (ts.getNanos() % 1_000_000)); Types.TimestampType timestampType = (Types.TimestampType) type; if (timestampType.shouldAdjustToUTC()) { - Assertions.assertThat(expected).as("Should expect an OffsetDateTime").isInstanceOf(OffsetDateTime.class); + Assertions.assertThat(expected) + .as("Should expect an OffsetDateTime") + .isInstanceOf(OffsetDateTime.class); Assert.assertEquals("Timestamp should be equal", expected, actualTs); } else { - Assertions.assertThat(expected).as("Should expect an LocalDateTime").isInstanceOf(LocalDateTime.class); + Assertions.assertThat(expected) + .as("Should expect an LocalDateTime") + .isInstanceOf(LocalDateTime.class); Assert.assertEquals("Timestamp should be equal", expected, actualTs.toLocalDateTime()); } break; @@ -146,23 +152,25 @@ private static void assertEqualsSafe(Type type, Object expected, Object actual) case UUID: Assertions.assertThat(expected).as("Should expect a UUID").isInstanceOf(UUID.class); Assertions.assertThat(actual).as("Should be a String").isInstanceOf(String.class); - Assert.assertEquals("UUID string representation should match", - expected.toString(), actual); + Assert.assertEquals("UUID string representation should match", expected.toString(), actual); break; case FIXED: Assertions.assertThat(expected).as("Should expect a byte[]").isInstanceOf(byte[].class); Assertions.assertThat(actual).as("Should be a byte[]").isInstanceOf(byte[].class); - Assert.assertArrayEquals("Bytes should match", - (byte[]) expected, (byte[]) actual); + Assert.assertArrayEquals("Bytes should match", (byte[]) expected, (byte[]) actual); break; case BINARY: - Assertions.assertThat(expected).as("Should expect a ByteBuffer").isInstanceOf(ByteBuffer.class); + Assertions.assertThat(expected) + .as("Should expect a ByteBuffer") + .isInstanceOf(ByteBuffer.class); Assertions.assertThat(actual).as("Should be a byte[]").isInstanceOf(byte[].class); - Assert.assertArrayEquals("Bytes should match", - ((ByteBuffer) expected).array(), (byte[]) actual); + Assert.assertArrayEquals( + "Bytes should match", ((ByteBuffer) expected).array(), (byte[]) actual); break; case DECIMAL: - Assertions.assertThat(expected).as("Should expect a BigDecimal").isInstanceOf(BigDecimal.class); + Assertions.assertThat(expected) + .as("Should expect a BigDecimal") + .isInstanceOf(BigDecimal.class); Assertions.assertThat(actual).as("Should be a BigDecimal").isInstanceOf(BigDecimal.class); Assert.assertEquals("BigDecimals should be equal", expected, actual); break; @@ -172,16 +180,20 @@ private static void assertEqualsSafe(Type type, Object expected, Object actual) assertEqualsSafe(type.asNestedType().asStructType(), (Record) expected, (Row) actual); break; case LIST: - Assertions.assertThat(expected).as("Should expect a Collection").isInstanceOf(Collection.class); + Assertions.assertThat(expected) + .as("Should expect a Collection") + .isInstanceOf(Collection.class); Assertions.assertThat(actual).as("Should be a Seq").isInstanceOf(Seq.class); List asList = 
seqAsJavaListConverter((Seq) actual).asJava(); assertEqualsSafe(type.asNestedType().asListType(), (Collection) expected, asList); break; case MAP: Assertions.assertThat(expected).as("Should expect a Collection").isInstanceOf(Map.class); - Assertions.assertThat(actual).as("Should be a Map").isInstanceOf(scala.collection.Map.class); - Map asMap = mapAsJavaMapConverter( - (scala.collection.Map) actual).asJava(); + Assertions.assertThat(actual) + .as("Should be a Map") + .isInstanceOf(scala.collection.Map.class); + Map asMap = + mapAsJavaMapConverter((scala.collection.Map) actual).asJava(); assertEqualsSafe(type.asNestedType().asMapType(), (Map) expected, asMap); break; case TIME: @@ -190,7 +202,8 @@ private static void assertEqualsSafe(Type type, Object expected, Object actual) } } - public static void assertEqualsUnsafe(Types.StructType struct, Record expected, InternalRow actual) { + public static void assertEqualsUnsafe( + Types.StructType struct, Record expected, InternalRow actual) { List fields = struct.fields(); for (int i = 0; i < fields.size(); i += 1) { Type fieldType = fields.get(i).type(); @@ -202,7 +215,8 @@ public static void assertEqualsUnsafe(Types.StructType struct, Record expected, } } - private static void assertEqualsUnsafe(Types.ListType list, Collection expected, ArrayData actual) { + private static void assertEqualsUnsafe( + Types.ListType list, Collection expected, ArrayData actual) { Type elementType = list.elementType(); List expectedElements = Lists.newArrayList(expected); for (int i = 0; i < expectedElements.size(); i += 1) { @@ -245,20 +259,29 @@ private static void assertEqualsUnsafe(Type type, Object expected, Object actual Assert.assertEquals("Primitive value should be equal to expected", expected, actual); break; case DATE: - Assertions.assertThat(expected).as("Should expect a LocalDate").isInstanceOf(LocalDate.class); + Assertions.assertThat(expected) + .as("Should expect a LocalDate") + .isInstanceOf(LocalDate.class); int expectedDays = (int) ChronoUnit.DAYS.between(EPOCH_DAY, (LocalDate) expected); Assert.assertEquals("Primitive value should be equal to expected", expectedDays, actual); break; case TIMESTAMP: Types.TimestampType timestampType = (Types.TimestampType) type; if (timestampType.shouldAdjustToUTC()) { - Assertions.assertThat(expected).as("Should expect an OffsetDateTime").isInstanceOf(OffsetDateTime.class); + Assertions.assertThat(expected) + .as("Should expect an OffsetDateTime") + .isInstanceOf(OffsetDateTime.class); long expectedMicros = ChronoUnit.MICROS.between(EPOCH, (OffsetDateTime) expected); - Assert.assertEquals("Primitive value should be equal to expected", expectedMicros, actual); + Assert.assertEquals( + "Primitive value should be equal to expected", expectedMicros, actual); } else { - Assertions.assertThat(expected).as("Should expect an LocalDateTime").isInstanceOf(LocalDateTime.class); - long expectedMicros = ChronoUnit.MICROS.between(EPOCH, ((LocalDateTime) expected).atZone(ZoneId.of("UTC"))); - Assert.assertEquals("Primitive value should be equal to expected", expectedMicros, actual); + Assertions.assertThat(expected) + .as("Should expect an LocalDateTime") + .isInstanceOf(LocalDateTime.class); + long expectedMicros = + ChronoUnit.MICROS.between(EPOCH, ((LocalDateTime) expected).atZone(ZoneId.of("UTC"))); + Assert.assertEquals( + "Primitive value should be equal to expected", expectedMicros, actual); } break; case STRING: @@ -268,8 +291,8 @@ private static void assertEqualsUnsafe(Type type, Object expected, Object actual case 
UUID: Assertions.assertThat(expected).as("Should expect a UUID").isInstanceOf(UUID.class); Assertions.assertThat(actual).as("Should be a UTF8String").isInstanceOf(UTF8String.class); - Assert.assertEquals("UUID string representation should match", - expected.toString(), actual.toString()); + Assert.assertEquals( + "UUID string representation should match", expected.toString(), actual.toString()); break; case FIXED: Assertions.assertThat(expected).as("Should expect a byte[]").isInstanceOf(byte[].class); @@ -277,30 +300,42 @@ private static void assertEqualsUnsafe(Type type, Object expected, Object actual Assert.assertArrayEquals("Bytes should match", (byte[]) expected, (byte[]) actual); break; case BINARY: - Assertions.assertThat(expected).as("Should expect a ByteBuffer").isInstanceOf(ByteBuffer.class); + Assertions.assertThat(expected) + .as("Should expect a ByteBuffer") + .isInstanceOf(ByteBuffer.class); Assertions.assertThat(actual).as("Should be a byte[]").isInstanceOf(byte[].class); - Assert.assertArrayEquals("Bytes should match", - ((ByteBuffer) expected).array(), (byte[]) actual); + Assert.assertArrayEquals( + "Bytes should match", ((ByteBuffer) expected).array(), (byte[]) actual); break; case DECIMAL: - Assertions.assertThat(expected).as("Should expect a BigDecimal").isInstanceOf(BigDecimal.class); + Assertions.assertThat(expected) + .as("Should expect a BigDecimal") + .isInstanceOf(BigDecimal.class); Assertions.assertThat(actual).as("Should be a Decimal").isInstanceOf(Decimal.class); - Assert.assertEquals("BigDecimals should be equal", - expected, ((Decimal) actual).toJavaBigDecimal()); + Assert.assertEquals( + "BigDecimals should be equal", expected, ((Decimal) actual).toJavaBigDecimal()); break; case STRUCT: Assertions.assertThat(expected).as("Should expect a Record").isInstanceOf(Record.class); - Assertions.assertThat(actual).as("Should be an InternalRow").isInstanceOf(InternalRow.class); - assertEqualsUnsafe(type.asNestedType().asStructType(), (Record) expected, (InternalRow) actual); + Assertions.assertThat(actual) + .as("Should be an InternalRow") + .isInstanceOf(InternalRow.class); + assertEqualsUnsafe( + type.asNestedType().asStructType(), (Record) expected, (InternalRow) actual); break; case LIST: - Assertions.assertThat(expected).as("Should expect a Collection").isInstanceOf(Collection.class); + Assertions.assertThat(expected) + .as("Should expect a Collection") + .isInstanceOf(Collection.class); Assertions.assertThat(actual).as("Should be an ArrayData").isInstanceOf(ArrayData.class); - assertEqualsUnsafe(type.asNestedType().asListType(), (Collection) expected, (ArrayData) actual); + assertEqualsUnsafe( + type.asNestedType().asListType(), (Collection) expected, (ArrayData) actual); break; case MAP: Assertions.assertThat(expected).as("Should expect a Map").isInstanceOf(Map.class); - Assertions.assertThat(actual).as("Should be an ArrayBasedMapData").isInstanceOf(MapData.class); + Assertions.assertThat(actual) + .as("Should be an ArrayBasedMapData") + .isInstanceOf(MapData.class); assertEqualsUnsafe(type.asNestedType().asMapType(), (Map) expected, (MapData) actual); break; case TIME: diff --git a/spark/v3.2/spark/src/test/java/org/apache/iceberg/spark/data/RandomData.java b/spark/v3.2/spark/src/test/java/org/apache/iceberg/spark/data/RandomData.java index d3bffb75eb5c..1c95df8ced12 100644 --- a/spark/v3.2/spark/src/test/java/org/apache/iceberg/spark/data/RandomData.java +++ b/spark/v3.2/spark/src/test/java/org/apache/iceberg/spark/data/RandomData.java @@ -16,7 +16,6 @@ * 
specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.data; import java.math.BigDecimal; @@ -53,8 +52,7 @@ public class RandomData { // Default percentage of number of values that are null for optional fields public static final float DEFAULT_NULL_PERCENTAGE = 0.05f; - private RandomData() { - } + private RandomData() {} public static List generateList(Schema schema, int numRecords, long seed) { RandomDataGenerator generator = new RandomDataGenerator(schema, seed, DEFAULT_NULL_PERCENTAGE); @@ -67,63 +65,71 @@ public static List generateList(Schema schema, int numRecords, long seed } public static Iterable generateSpark(Schema schema, int numRecords, long seed) { - return () -> new Iterator() { - private SparkRandomDataGenerator generator = new SparkRandomDataGenerator(seed); - private int count = 0; - - @Override - public boolean hasNext() { - return count < numRecords; - } - - @Override - public InternalRow next() { - if (count >= numRecords) { - throw new NoSuchElementException(); - } - count += 1; - return (InternalRow) TypeUtil.visit(schema, generator); - } - }; + return () -> + new Iterator() { + private SparkRandomDataGenerator generator = new SparkRandomDataGenerator(seed); + private int count = 0; + + @Override + public boolean hasNext() { + return count < numRecords; + } + + @Override + public InternalRow next() { + if (count >= numRecords) { + throw new NoSuchElementException(); + } + count += 1; + return (InternalRow) TypeUtil.visit(schema, generator); + } + }; } public static Iterable generate(Schema schema, int numRecords, long seed) { - return newIterable(() -> new RandomDataGenerator(schema, seed, DEFAULT_NULL_PERCENTAGE), schema, numRecords); + return newIterable( + () -> new RandomDataGenerator(schema, seed, DEFAULT_NULL_PERCENTAGE), schema, numRecords); } - public static Iterable generate(Schema schema, int numRecords, long seed, float nullPercentage) { - return newIterable(() -> new RandomDataGenerator(schema, seed, nullPercentage), schema, numRecords); + public static Iterable generate( + Schema schema, int numRecords, long seed, float nullPercentage) { + return newIterable( + () -> new RandomDataGenerator(schema, seed, nullPercentage), schema, numRecords); } - public static Iterable generateFallbackData(Schema schema, int numRecords, long seed, long numDictRecords) { - return newIterable(() -> new FallbackDataGenerator(schema, seed, numDictRecords), schema, numRecords); + public static Iterable generateFallbackData( + Schema schema, int numRecords, long seed, long numDictRecords) { + return newIterable( + () -> new FallbackDataGenerator(schema, seed, numDictRecords), schema, numRecords); } public static Iterable generateDictionaryEncodableData( Schema schema, int numRecords, long seed, float nullPercentage) { - return newIterable(() -> new DictionaryEncodedDataGenerator(schema, seed, nullPercentage), schema, numRecords); + return newIterable( + () -> new DictionaryEncodedDataGenerator(schema, seed, nullPercentage), schema, numRecords); } - private static Iterable newIterable(Supplier newGenerator, - Schema schema, int numRecords) { - return () -> new Iterator() { - private int count = 0; - private RandomDataGenerator generator = newGenerator.get(); - - @Override - public boolean hasNext() { - return count < numRecords; - } - - @Override - public Record next() { - if (count >= numRecords) { - throw new NoSuchElementException(); - } - count += 1; - return (Record) TypeUtil.visit(schema, generator); - } - }; + 
private static Iterable newIterable( + Supplier newGenerator, Schema schema, int numRecords) { + return () -> + new Iterator() { + private int count = 0; + private RandomDataGenerator generator = newGenerator.get(); + + @Override + public boolean hasNext() { + return count < numRecords; + } + + @Override + public Record next() { + if (count >= numRecords) { + throw new NoSuchElementException(); + } + count += 1; + return (Record) TypeUtil.visit(schema, generator); + } + }; } private static class RandomDataGenerator extends TypeUtil.CustomOrderSchemaVisitor { @@ -218,8 +224,7 @@ public Object primitive(Type.PrimitiveType primitive) { // them here. switch (primitive.typeId()) { case FIXED: - return new GenericData.Fixed(typeToSchema.get(primitive), - (byte[]) result); + return new GenericData.Fixed(typeToSchema.get(primitive), (byte[]) result); case BINARY: return ByteBuffer.wrap((byte[]) result); case UUID: diff --git a/spark/v3.2/spark/src/test/java/org/apache/iceberg/spark/data/TestHelpers.java b/spark/v3.2/spark/src/test/java/org/apache/iceberg/spark/data/TestHelpers.java index 4aed78d1e155..69b14eead4d5 100644 --- a/spark/v3.2/spark/src/test/java/org/apache/iceberg/spark/data/TestHelpers.java +++ b/spark/v3.2/spark/src/test/java/org/apache/iceberg/spark/data/TestHelpers.java @@ -16,9 +16,12 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.data; +import static org.apache.iceberg.spark.SparkSchemaUtil.convert; +import static scala.collection.JavaConverters.mapAsJavaMapConverter; +import static scala.collection.JavaConverters.seqAsJavaListConverter; + import java.math.BigDecimal; import java.nio.ByteBuffer; import java.sql.Timestamp; @@ -71,17 +74,13 @@ import org.junit.Assert; import scala.collection.Seq; -import static org.apache.iceberg.spark.SparkSchemaUtil.convert; -import static scala.collection.JavaConverters.mapAsJavaMapConverter; -import static scala.collection.JavaConverters.seqAsJavaListConverter; - public class TestHelpers { - private TestHelpers() { - } + private TestHelpers() {} public static void assertEqualsSafe(Types.StructType struct, List recs, List rows) { - Streams.forEachPair(recs.stream(), rows.stream(), (rec, row) -> assertEqualsSafe(struct, rec, row)); + Streams.forEachPair( + recs.stream(), rows.stream(), (rec, row) -> assertEqualsSafe(struct, rec, row)); } public static void assertEqualsSafe(Types.StructType struct, Record rec, Row row) { @@ -96,8 +95,11 @@ public static void assertEqualsSafe(Types.StructType struct, Record rec, Row row } } - public static void assertEqualsBatch(Types.StructType struct, Iterator expected, ColumnarBatch batch, - boolean checkArrowValidityVector) { + public static void assertEqualsBatch( + Types.StructType struct, + Iterator expected, + ColumnarBatch batch, + boolean checkArrowValidityVector) { for (int rowId = 0; rowId < batch.numRows(); rowId++) { List fields = struct.fields(); InternalRow row = batch.getRow(rowId); @@ -110,15 +112,16 @@ public static void assertEqualsBatch(Types.StructType struct, Iterator e if (checkArrowValidityVector) { ColumnVector columnVector = batch.column(i); - ValueVector arrowVector = ((IcebergArrowColumnVector) columnVector).vectorAccessor().getVector(); - Assert.assertFalse("Nullability doesn't match of " + columnVector.dataType(), + ValueVector arrowVector = + ((IcebergArrowColumnVector) columnVector).vectorAccessor().getVector(); + Assert.assertFalse( + "Nullability doesn't match of " + columnVector.dataType(), expectedValue == 
null ^ arrowVector.isNull(rowId)); } } } } - private static void assertEqualsSafe(Types.ListType list, Collection expected, List actual) { Type elementType = list.elementType(); List expectedElements = Lists.newArrayList(expected); @@ -130,8 +133,7 @@ private static void assertEqualsSafe(Types.ListType list, Collection expected } } - private static void assertEqualsSafe(Types.MapType map, - Map expected, Map actual) { + private static void assertEqualsSafe(Types.MapType map, Map expected, Map actual) { Type keyType = map.keyType(); Type valueType = map.valueType(); @@ -190,23 +192,28 @@ private static void assertEqualsSafe(Type type, Object expected, Object actual) case UUID: Assertions.assertThat(expected).as("Should expect a UUID").isInstanceOf(UUID.class); Assertions.assertThat(actual).as("Should be a String").isInstanceOf(String.class); - Assert.assertEquals("UUID string representation should match", - expected.toString(), actual); + Assert.assertEquals("UUID string representation should match", expected.toString(), actual); break; case FIXED: - Assertions.assertThat(expected).as("Should expect a Fixed").isInstanceOf(GenericData.Fixed.class); + Assertions.assertThat(expected) + .as("Should expect a Fixed") + .isInstanceOf(GenericData.Fixed.class); Assertions.assertThat(actual).as("Should be a byte[]").isInstanceOf(byte[].class); - Assert.assertArrayEquals("Bytes should match", - ((GenericData.Fixed) expected).bytes(), (byte[]) actual); + Assert.assertArrayEquals( + "Bytes should match", ((GenericData.Fixed) expected).bytes(), (byte[]) actual); break; case BINARY: - Assertions.assertThat(expected).as("Should expect a ByteBuffer").isInstanceOf(ByteBuffer.class); + Assertions.assertThat(expected) + .as("Should expect a ByteBuffer") + .isInstanceOf(ByteBuffer.class); Assertions.assertThat(actual).as("Should be a byte[]").isInstanceOf(byte[].class); - Assert.assertArrayEquals("Bytes should match", - ((ByteBuffer) expected).array(), (byte[]) actual); + Assert.assertArrayEquals( + "Bytes should match", ((ByteBuffer) expected).array(), (byte[]) actual); break; case DECIMAL: - Assertions.assertThat(expected).as("Should expect a BigDecimal").isInstanceOf(BigDecimal.class); + Assertions.assertThat(expected) + .as("Should expect a BigDecimal") + .isInstanceOf(BigDecimal.class); Assertions.assertThat(actual).as("Should be a BigDecimal").isInstanceOf(BigDecimal.class); Assert.assertEquals("BigDecimals should be equal", expected, actual); break; @@ -216,16 +223,20 @@ private static void assertEqualsSafe(Type type, Object expected, Object actual) assertEqualsSafe(type.asNestedType().asStructType(), (Record) expected, (Row) actual); break; case LIST: - Assertions.assertThat(expected).as("Should expect a Collection").isInstanceOf(Collection.class); + Assertions.assertThat(expected) + .as("Should expect a Collection") + .isInstanceOf(Collection.class); Assertions.assertThat(actual).as("Should be a Seq").isInstanceOf(Seq.class); List asList = seqAsJavaListConverter((Seq) actual).asJava(); assertEqualsSafe(type.asNestedType().asListType(), (Collection) expected, asList); break; case MAP: Assertions.assertThat(expected).as("Should expect a Collection").isInstanceOf(Map.class); - Assertions.assertThat(actual).as("Should be a Map").isInstanceOf(scala.collection.Map.class); - Map asMap = mapAsJavaMapConverter( - (scala.collection.Map) actual).asJava(); + Assertions.assertThat(actual) + .as("Should be a Map") + .isInstanceOf(scala.collection.Map.class); + Map asMap = + 
mapAsJavaMapConverter((scala.collection.Map) actual).asJava(); assertEqualsSafe(type.asNestedType().asMapType(), (Map) expected, asMap); break; case TIME: @@ -246,7 +257,8 @@ public static void assertEqualsUnsafe(Types.StructType struct, Record rec, Inter } } - private static void assertEqualsUnsafe(Types.ListType list, Collection expected, ArrayData actual) { + private static void assertEqualsUnsafe( + Types.ListType list, Collection expected, ArrayData actual) { Type elementType = list.elementType(); List expectedElements = Lists.newArrayList(expected); for (int i = 0; i < expectedElements.size(); i += 1) { @@ -292,8 +304,10 @@ private static void assertEqualsUnsafe(Type type, Object expected, Object actual case DOUBLE: Assertions.assertThat(actual).as("Should be a double").isInstanceOf(Double.class); if (expected instanceof Float) { - Assert.assertEquals("Values didn't match", Double.doubleToLongBits(((Number) expected).doubleValue()), - Double.doubleToLongBits((double) actual)); + Assert.assertEquals( + "Values didn't match", + Double.doubleToLongBits(((Number) expected).doubleValue()), + Double.doubleToLongBits((double) actual)); } else { Assert.assertEquals("Primitive value should be equal to expected", expected, actual); } @@ -312,40 +326,54 @@ private static void assertEqualsUnsafe(Type type, Object expected, Object actual case UUID: Assertions.assertThat(expected).as("Should expect a UUID").isInstanceOf(UUID.class); Assertions.assertThat(actual).as("Should be a UTF8String").isInstanceOf(UTF8String.class); - Assert.assertEquals("UUID string representation should match", - expected.toString(), actual.toString()); + Assert.assertEquals( + "UUID string representation should match", expected.toString(), actual.toString()); break; case FIXED: - Assertions.assertThat(expected).as("Should expect a Fixed").isInstanceOf(GenericData.Fixed.class); + Assertions.assertThat(expected) + .as("Should expect a Fixed") + .isInstanceOf(GenericData.Fixed.class); Assertions.assertThat(actual).as("Should be a byte[]").isInstanceOf(byte[].class); - Assert.assertArrayEquals("Bytes should match", - ((GenericData.Fixed) expected).bytes(), (byte[]) actual); + Assert.assertArrayEquals( + "Bytes should match", ((GenericData.Fixed) expected).bytes(), (byte[]) actual); break; case BINARY: - Assertions.assertThat(expected).as("Should expect a ByteBuffer").isInstanceOf(ByteBuffer.class); + Assertions.assertThat(expected) + .as("Should expect a ByteBuffer") + .isInstanceOf(ByteBuffer.class); Assertions.assertThat(actual).as("Should be a byte[]").isInstanceOf(byte[].class); - Assert.assertArrayEquals("Bytes should match", - ((ByteBuffer) expected).array(), (byte[]) actual); + Assert.assertArrayEquals( + "Bytes should match", ((ByteBuffer) expected).array(), (byte[]) actual); break; case DECIMAL: - Assertions.assertThat(expected).as("Should expect a BigDecimal").isInstanceOf(BigDecimal.class); + Assertions.assertThat(expected) + .as("Should expect a BigDecimal") + .isInstanceOf(BigDecimal.class); Assertions.assertThat(actual).as("Should be a Decimal").isInstanceOf(Decimal.class); - Assert.assertEquals("BigDecimals should be equal", - expected, ((Decimal) actual).toJavaBigDecimal()); + Assert.assertEquals( + "BigDecimals should be equal", expected, ((Decimal) actual).toJavaBigDecimal()); break; case STRUCT: Assertions.assertThat(expected).as("Should expect a Record").isInstanceOf(Record.class); - Assertions.assertThat(actual).as("Should be an InternalRow").isInstanceOf(InternalRow.class); - 
assertEqualsUnsafe(type.asNestedType().asStructType(), (Record) expected, (InternalRow) actual); + Assertions.assertThat(actual) + .as("Should be an InternalRow") + .isInstanceOf(InternalRow.class); + assertEqualsUnsafe( + type.asNestedType().asStructType(), (Record) expected, (InternalRow) actual); break; case LIST: - Assertions.assertThat(expected).as("Should expect a Collection").isInstanceOf(Collection.class); + Assertions.assertThat(expected) + .as("Should expect a Collection") + .isInstanceOf(Collection.class); Assertions.assertThat(actual).as("Should be an ArrayData").isInstanceOf(ArrayData.class); - assertEqualsUnsafe(type.asNestedType().asListType(), (Collection) expected, (ArrayData) actual); + assertEqualsUnsafe( + type.asNestedType().asListType(), (Collection) expected, (ArrayData) actual); break; case MAP: Assertions.assertThat(expected).as("Should expect a Map").isInstanceOf(Map.class); - Assertions.assertThat(actual).as("Should be an ArrayBasedMapData").isInstanceOf(MapData.class); + Assertions.assertThat(actual) + .as("Should be an ArrayBasedMapData") + .isInstanceOf(MapData.class); assertEqualsUnsafe(type.asNestedType().asMapType(), (Map) expected, (MapData) actual); break; case TIME: @@ -356,13 +384,14 @@ private static void assertEqualsUnsafe(Type type, Object expected, Object actual /** * Check that the given InternalRow is equivalent to the Row. + * * @param prefix context for error messages * @param type the type of the row * @param expected the expected value of the row * @param actual the actual value of the row */ - public static void assertEquals(String prefix, Types.StructType type, - InternalRow expected, Row actual) { + public static void assertEquals( + String prefix, Types.StructType type, InternalRow expected, Row actual) { if (expected == null || actual == null) { Assert.assertEquals(prefix, expected, actual); } else { @@ -380,30 +409,41 @@ public static void assertEquals(String prefix, Types.StructType type, case DECIMAL: case DATE: case TIMESTAMP: - Assert.assertEquals(prefix + "." + fieldName + " - " + childType, + Assert.assertEquals( + prefix + "." + fieldName + " - " + childType, getValue(expected, c, childType), getPrimitiveValue(actual, c, childType)); break; case UUID: case FIXED: case BINARY: - assertEqualBytes(prefix + "." + fieldName, + assertEqualBytes( + prefix + "." + fieldName, (byte[]) getValue(expected, c, childType), (byte[]) actual.get(c)); break; - case STRUCT: { - Types.StructType st = (Types.StructType) childType; - assertEquals(prefix + "." + fieldName, st, - expected.getStruct(c, st.fields().size()), actual.getStruct(c)); - break; - } + case STRUCT: + { + Types.StructType st = (Types.StructType) childType; + assertEquals( + prefix + "." + fieldName, + st, + expected.getStruct(c, st.fields().size()), + actual.getStruct(c)); + break; + } case LIST: - assertEqualsLists(prefix + "." + fieldName, childType.asListType(), + assertEqualsLists( + prefix + "." + fieldName, + childType.asListType(), expected.getArray(c), toList((Seq) actual.get(c))); break; case MAP: - assertEqualsMaps(prefix + "." + fieldName, childType.asMapType(), expected.getMap(c), + assertEqualsMaps( + prefix + "." 
+ fieldName, + childType.asMapType(), + expected.getMap(c), toJavaMap((scala.collection.Map) actual.getMap(c))); break; default: @@ -413,8 +453,8 @@ public static void assertEquals(String prefix, Types.StructType type, } } - private static void assertEqualsLists(String prefix, Types.ListType type, - ArrayData expected, List actual) { + private static void assertEqualsLists( + String prefix, Types.ListType type, ArrayData expected, List actual) { if (expected == null || actual == null) { Assert.assertEquals(prefix, expected, actual); } else { @@ -431,31 +471,42 @@ private static void assertEqualsLists(String prefix, Types.ListType type, case DECIMAL: case DATE: case TIMESTAMP: - Assert.assertEquals(prefix + ".elem " + e + " - " + childType, + Assert.assertEquals( + prefix + ".elem " + e + " - " + childType, getValue(expected, e, childType), actual.get(e)); break; case UUID: case FIXED: case BINARY: - assertEqualBytes(prefix + ".elem " + e, + assertEqualBytes( + prefix + ".elem " + e, (byte[]) getValue(expected, e, childType), (byte[]) actual.get(e)); break; - case STRUCT: { - Types.StructType st = (Types.StructType) childType; - assertEquals(prefix + ".elem " + e, st, - expected.getStruct(e, st.fields().size()), (Row) actual.get(e)); - break; - } + case STRUCT: + { + Types.StructType st = (Types.StructType) childType; + assertEquals( + prefix + ".elem " + e, + st, + expected.getStruct(e, st.fields().size()), + (Row) actual.get(e)); + break; + } case LIST: - assertEqualsLists(prefix + ".elem " + e, childType.asListType(), + assertEqualsLists( + prefix + ".elem " + e, + childType.asListType(), expected.getArray(e), toList((Seq) actual.get(e))); break; case MAP: - assertEqualsMaps(prefix + ".elem " + e, childType.asMapType(), - expected.getMap(e), toJavaMap((scala.collection.Map) actual.get(e))); + assertEqualsMaps( + prefix + ".elem " + e, + childType.asMapType(), + expected.getMap(e), + toJavaMap((scala.collection.Map) actual.get(e))); break; default: throw new IllegalArgumentException("Unhandled type " + childType); @@ -464,8 +515,8 @@ private static void assertEqualsLists(String prefix, Types.ListType type, } } - private static void assertEqualsMaps(String prefix, Types.MapType type, - MapData expected, Map actual) { + private static void assertEqualsMaps( + String prefix, Types.MapType type, MapData expected, Map actual) { if (expected == null || actual == null) { Assert.assertEquals(prefix, expected, actual); } else { @@ -478,7 +529,9 @@ private static void assertEqualsMaps(String prefix, Types.MapType type, Object expectedKey = getValue(expectedKeyArray, e, keyType); Object actualValue = actual.get(expectedKey); if (actualValue == null) { - Assert.assertEquals(prefix + ".key=" + expectedKey + " has null", true, + Assert.assertEquals( + prefix + ".key=" + expectedKey + " has null", + true, expected.valueArray().isNullAt(e)); } else { switch (valueType.typeId()) { @@ -491,32 +544,40 @@ private static void assertEqualsMaps(String prefix, Types.MapType type, case DECIMAL: case DATE: case TIMESTAMP: - Assert.assertEquals(prefix + ".key=" + expectedKey + " - " + valueType, + Assert.assertEquals( + prefix + ".key=" + expectedKey + " - " + valueType, getValue(expectedValueArray, e, valueType), actual.get(expectedKey)); break; case UUID: case FIXED: case BINARY: - assertEqualBytes(prefix + ".key=" + expectedKey, + assertEqualBytes( + prefix + ".key=" + expectedKey, (byte[]) getValue(expectedValueArray, e, valueType), (byte[]) actual.get(expectedKey)); break; - case STRUCT: { - Types.StructType 
st = (Types.StructType) valueType; - assertEquals(prefix + ".key=" + expectedKey, st, - expectedValueArray.getStruct(e, st.fields().size()), - (Row) actual.get(expectedKey)); - break; - } + case STRUCT: + { + Types.StructType st = (Types.StructType) valueType; + assertEquals( + prefix + ".key=" + expectedKey, + st, + expectedValueArray.getStruct(e, st.fields().size()), + (Row) actual.get(expectedKey)); + break; + } case LIST: - assertEqualsLists(prefix + ".key=" + expectedKey, + assertEqualsLists( + prefix + ".key=" + expectedKey, valueType.asListType(), expectedValueArray.getArray(e), toList((Seq) actual.get(expectedKey))); break; case MAP: - assertEqualsMaps(prefix + ".key=" + expectedKey, valueType.asMapType(), + assertEqualsMaps( + prefix + ".key=" + expectedKey, + valueType.asMapType(), expectedValueArray.getMap(e), toJavaMap((scala.collection.Map) actual.get(expectedKey))); break; @@ -528,8 +589,7 @@ private static void assertEqualsMaps(String prefix, Types.MapType type, } } - private static Object getValue(SpecializedGetters container, int ord, - Type type) { + private static Object getValue(SpecializedGetters container, int ord, Type type) { if (container.isNullAt(ord)) { return null; } @@ -554,10 +614,11 @@ private static Object getValue(SpecializedGetters container, int ord, return new DateWritable(container.getInt(ord)).get(); case TIMESTAMP: return DateTimeUtils.toJavaTimestamp(container.getLong(ord)); - case DECIMAL: { - Types.DecimalType dt = (Types.DecimalType) type; - return container.getDecimal(ord, dt.precision(), dt.scale()).toJavaBigDecimal(); - } + case DECIMAL: + { + Types.DecimalType dt = (Types.DecimalType) type; + return container.getDecimal(ord, dt.precision(), dt.scale()).toJavaBigDecimal(); + } case STRUCT: Types.StructType struct = type.asStructType(); InternalRow internalRow = container.getStruct(ord, struct.fields().size()); @@ -615,8 +676,7 @@ private static List toList(Seq val) { return val == null ? 
null : seqAsJavaListConverter(val).asJava(); } - private static void assertEqualBytes(String context, byte[] expected, - byte[] actual) { + private static void assertEqualBytes(String context, byte[] expected, byte[] actual) { if (expected == null || actual == null) { Assert.assertEquals(context, expected, actual); } else { @@ -634,23 +694,29 @@ private static void assertEquals(String context, DataType type, Object expected, } if (type instanceof StructType) { - Assertions.assertThat(expected).as("Expected should be an InternalRow: " + context) + Assertions.assertThat(expected) + .as("Expected should be an InternalRow: " + context) .isInstanceOf(InternalRow.class); - Assertions.assertThat(actual).as("Actual should be an InternalRow: " + context) + Assertions.assertThat(actual) + .as("Actual should be an InternalRow: " + context) .isInstanceOf(InternalRow.class); assertEquals(context, (StructType) type, (InternalRow) expected, (InternalRow) actual); } else if (type instanceof ArrayType) { - Assertions.assertThat(expected).as("Expected should be an ArrayData: " + context) + Assertions.assertThat(expected) + .as("Expected should be an ArrayData: " + context) .isInstanceOf(ArrayData.class); - Assertions.assertThat(actual).as("Actual should be an ArrayData: " + context) + Assertions.assertThat(actual) + .as("Actual should be an ArrayData: " + context) .isInstanceOf(ArrayData.class); assertEquals(context, (ArrayType) type, (ArrayData) expected, (ArrayData) actual); } else if (type instanceof MapType) { - Assertions.assertThat(expected).as("Expected should be a MapData: " + context) + Assertions.assertThat(expected) + .as("Expected should be a MapData: " + context) .isInstanceOf(MapData.class); - Assertions.assertThat(actual).as("Actual should be a MapData: " + context) + Assertions.assertThat(actual) + .as("Actual should be a MapData: " + context) .isInstanceOf(MapData.class); assertEquals(context, (MapType) type, (MapData) expected, (MapData) actual); @@ -661,32 +727,37 @@ private static void assertEquals(String context, DataType type, Object expected, } } - private static void assertEquals(String context, StructType struct, - InternalRow expected, InternalRow actual) { + private static void assertEquals( + String context, StructType struct, InternalRow expected, InternalRow actual) { Assert.assertEquals("Should have correct number of fields", struct.size(), actual.numFields()); for (int i = 0; i < actual.numFields(); i += 1) { StructField field = struct.fields()[i]; DataType type = field.dataType(); - assertEquals(context + "." + field.name(), type, + assertEquals( + context + "." + field.name(), + type, expected.isNullAt(i) ? null : expected.get(i, type), actual.isNullAt(i) ? null : actual.get(i, type)); } } - private static void assertEquals(String context, ArrayType array, ArrayData expected, ArrayData actual) { - Assert.assertEquals("Should have the same number of elements", - expected.numElements(), actual.numElements()); + private static void assertEquals( + String context, ArrayType array, ArrayData expected, ArrayData actual) { + Assert.assertEquals( + "Should have the same number of elements", expected.numElements(), actual.numElements()); DataType type = array.elementType(); for (int i = 0; i < actual.numElements(); i += 1) { - assertEquals(context + ".element", type, + assertEquals( + context + ".element", + type, expected.isNullAt(i) ? null : expected.get(i, type), actual.isNullAt(i) ? 
null : actual.get(i, type)); } } private static void assertEquals(String context, MapType map, MapData expected, MapData actual) { - Assert.assertEquals("Should have the same number of elements", - expected.numElements(), actual.numElements()); + Assert.assertEquals( + "Should have the same number of elements", expected.numElements(), actual.numElements()); DataType keyType = map.keyType(); ArrayData expectedKeys = expected.keyArray(); @@ -697,10 +768,14 @@ private static void assertEquals(String context, MapType map, MapData expected, ArrayData actualValues = actual.valueArray(); for (int i = 0; i < actual.numElements(); i += 1) { - assertEquals(context + ".key", keyType, + assertEquals( + context + ".key", + keyType, expectedKeys.isNullAt(i) ? null : expectedKeys.get(i, keyType), actualKeys.isNullAt(i) ? null : actualKeys.get(i, keyType)); - assertEquals(context + ".value", valueType, + assertEquals( + context + ".value", + valueType, expectedValues.isNullAt(i) ? null : expectedValues.get(i, valueType), actualValues.isNullAt(i) ? null : actualValues.get(i, valueType)); } diff --git a/spark/v3.2/spark/src/test/java/org/apache/iceberg/spark/data/TestOrcWrite.java b/spark/v3.2/spark/src/test/java/org/apache/iceberg/spark/data/TestOrcWrite.java index 7cf9b9c736c6..1e51a088390e 100644 --- a/spark/v3.2/spark/src/test/java/org/apache/iceberg/spark/data/TestOrcWrite.java +++ b/spark/v3.2/spark/src/test/java/org/apache/iceberg/spark/data/TestOrcWrite.java @@ -16,9 +16,10 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.data; +import static org.apache.iceberg.types.Types.NestedField.optional; + import java.io.File; import java.io.IOException; import org.apache.iceberg.Files; @@ -32,16 +33,12 @@ import org.junit.Test; import org.junit.rules.TemporaryFolder; -import static org.apache.iceberg.types.Types.NestedField.optional; - public class TestOrcWrite { - @Rule - public TemporaryFolder temp = new TemporaryFolder(); + @Rule public TemporaryFolder temp = new TemporaryFolder(); - private static final Schema SCHEMA = new Schema( - optional(1, "id", Types.IntegerType.get()), - optional(2, "data", Types.StringType.get()) - ); + private static final Schema SCHEMA = + new Schema( + optional(1, "id", Types.IntegerType.get()), optional(2, "data", Types.StringType.get())); @Test public void splitOffsets() throws IOException { @@ -49,10 +46,11 @@ public void splitOffsets() throws IOException { Assert.assertTrue("Delete should succeed", testFile.delete()); Iterable rows = RandomData.generateSpark(SCHEMA, 1, 0L); - FileAppender writer = ORC.write(Files.localOutput(testFile)) - .createWriterFunc(SparkOrcWriter::new) - .schema(SCHEMA) - .build(); + FileAppender writer = + ORC.write(Files.localOutput(testFile)) + .createWriterFunc(SparkOrcWriter::new) + .schema(SCHEMA) + .build(); writer.addAll(rows); writer.close(); diff --git a/spark/v3.2/spark/src/test/java/org/apache/iceberg/spark/data/TestParquetAvroReader.java b/spark/v3.2/spark/src/test/java/org/apache/iceberg/spark/data/TestParquetAvroReader.java index 464e3165583c..a4ffc2fea437 100644 --- a/spark/v3.2/spark/src/test/java/org/apache/iceberg/spark/data/TestParquetAvroReader.java +++ b/spark/v3.2/spark/src/test/java/org/apache/iceberg/spark/data/TestParquetAvroReader.java @@ -16,9 +16,11 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.spark.data; +import static org.apache.iceberg.types.Types.NestedField.optional; +import static org.apache.iceberg.types.Types.NestedField.required; + import java.io.File; import java.io.IOException; import java.util.Iterator; @@ -38,54 +40,68 @@ import org.junit.Test; import org.junit.rules.TemporaryFolder; -import static org.apache.iceberg.types.Types.NestedField.optional; -import static org.apache.iceberg.types.Types.NestedField.required; - public class TestParquetAvroReader { - @Rule - public TemporaryFolder temp = new TemporaryFolder(); - - private static final Schema COMPLEX_SCHEMA = new Schema( - required(1, "roots", Types.LongType.get()), - optional(3, "lime", Types.ListType.ofRequired(4, Types.DoubleType.get())), - required(5, "strict", Types.StructType.of( - required(9, "tangerine", Types.StringType.get()), - optional(6, "hopeful", Types.StructType.of( - required(7, "steel", Types.FloatType.get()), - required(8, "lantern", Types.DateType.get()) - )), - optional(10, "vehement", Types.LongType.get()) - )), - optional(11, "metamorphosis", Types.MapType.ofRequired(12, 13, - Types.StringType.get(), Types.TimestampType.withoutZone())), - required(14, "winter", Types.ListType.ofOptional(15, Types.StructType.of( - optional(16, "beet", Types.DoubleType.get()), - required(17, "stamp", Types.TimeType.get()), - optional(18, "wheeze", Types.StringType.get()) - ))), - optional(19, "renovate", Types.MapType.ofRequired(20, 21, - Types.StringType.get(), Types.StructType.of( - optional(22, "jumpy", Types.DoubleType.get()), - required(23, "koala", Types.TimeType.get()), - required(24, "couch rope", Types.IntegerType.get()) - ))), - optional(2, "slide", Types.StringType.get()) - ); + @Rule public TemporaryFolder temp = new TemporaryFolder(); + + private static final Schema COMPLEX_SCHEMA = + new Schema( + required(1, "roots", Types.LongType.get()), + optional(3, "lime", Types.ListType.ofRequired(4, Types.DoubleType.get())), + required( + 5, + "strict", + Types.StructType.of( + required(9, "tangerine", Types.StringType.get()), + optional( + 6, + "hopeful", + Types.StructType.of( + required(7, "steel", Types.FloatType.get()), + required(8, "lantern", Types.DateType.get()))), + optional(10, "vehement", Types.LongType.get()))), + optional( + 11, + "metamorphosis", + Types.MapType.ofRequired( + 12, 13, Types.StringType.get(), Types.TimestampType.withoutZone())), + required( + 14, + "winter", + Types.ListType.ofOptional( + 15, + Types.StructType.of( + optional(16, "beet", Types.DoubleType.get()), + required(17, "stamp", Types.TimeType.get()), + optional(18, "wheeze", Types.StringType.get())))), + optional( + 19, + "renovate", + Types.MapType.ofRequired( + 20, + 21, + Types.StringType.get(), + Types.StructType.of( + optional(22, "jumpy", Types.DoubleType.get()), + required(23, "koala", Types.TimeType.get()), + required(24, "couch rope", Types.IntegerType.get())))), + optional(2, "slide", Types.StringType.get())); @Ignore public void testStructSchema() throws IOException { - Schema structSchema = new Schema( - required(1, "circumvent", Types.LongType.get()), - optional(2, "antarctica", Types.StringType.get()), - optional(3, "fluent", Types.DoubleType.get()), - required(4, "quell", Types.StructType.of( - required(5, "operator", Types.BooleanType.get()), - optional(6, "fanta", Types.IntegerType.get()), - optional(7, "cable", Types.FloatType.get()) - )), - required(8, "chimney", Types.TimestampType.withZone()), - required(9, "wool", Types.DateType.get()) - ); + Schema 
structSchema = + new Schema( + required(1, "circumvent", Types.LongType.get()), + optional(2, "antarctica", Types.StringType.get()), + optional(3, "fluent", Types.DoubleType.get()), + required( + 4, + "quell", + Types.StructType.of( + required(5, "operator", Types.BooleanType.get()), + optional(6, "fanta", Types.IntegerType.get()), + optional(7, "cable", Types.FloatType.get()))), + required(8, "chimney", Types.TimestampType.withZone()), + required(9, "wool", Types.DateType.get())); File testFile = writeTestData(structSchema, 5_000_000, 1059); // RandomData uses the root record name "test", which must match for records to be equal @@ -100,11 +116,12 @@ public void testStructSchema() throws IOException { // clean up as much memory as possible to avoid a large GC during the timed run System.gc(); - try (CloseableIterable reader = Parquet.read(Files.localInput(testFile)) - .project(structSchema) - .createReaderFunc( - fileSchema -> ParquetAvroValueReaders.buildReader(structSchema, readSchema)) - .build()) { + try (CloseableIterable reader = + Parquet.read(Files.localInput(testFile)) + .project(structSchema) + .createReaderFunc( + fileSchema -> ParquetAvroValueReaders.buildReader(structSchema, readSchema)) + .build()) { long start = System.currentTimeMillis(); long val = 0; long count = 0; @@ -137,9 +154,8 @@ public void testWithOldReadPath() throws IOException { // clean up as much memory as possible to avoid a large GC during the timed run System.gc(); - try (CloseableIterable reader = Parquet.read(Files.localInput(testFile)) - .project(COMPLEX_SCHEMA) - .build()) { + try (CloseableIterable reader = + Parquet.read(Files.localInput(testFile)).project(COMPLEX_SCHEMA).build()) { long start = System.currentTimeMillis(); long val = 0; long count = 0; @@ -154,11 +170,12 @@ public void testWithOldReadPath() throws IOException { // clean up as much memory as possible to avoid a large GC during the timed run System.gc(); - try (CloseableIterable reader = Parquet.read(Files.localInput(testFile)) - .project(COMPLEX_SCHEMA) - .createReaderFunc( - fileSchema -> ParquetAvroValueReaders.buildReader(COMPLEX_SCHEMA, readSchema)) - .build()) { + try (CloseableIterable reader = + Parquet.read(Files.localInput(testFile)) + .project(COMPLEX_SCHEMA) + .createReaderFunc( + fileSchema -> ParquetAvroValueReaders.buildReader(COMPLEX_SCHEMA, readSchema)) + .build()) { long start = System.currentTimeMillis(); long val = 0; long count = 0; @@ -179,9 +196,8 @@ public void testCorrectness() throws IOException { File testFile = temp.newFile(); Assert.assertTrue("Delete should succeed", testFile.delete()); - try (FileAppender writer = Parquet.write(Files.localOutput(testFile)) - .schema(COMPLEX_SCHEMA) - .build()) { + try (FileAppender writer = + Parquet.write(Files.localOutput(testFile)).schema(COMPLEX_SCHEMA).build()) { writer.addAll(records); } @@ -189,12 +205,13 @@ public void testCorrectness() throws IOException { MessageType readSchema = ParquetSchemaUtil.convert(COMPLEX_SCHEMA, "test"); // verify that the new read path is correct - try (CloseableIterable reader = Parquet.read(Files.localInput(testFile)) - .project(COMPLEX_SCHEMA) - .createReaderFunc( - fileSchema -> ParquetAvroValueReaders.buildReader(COMPLEX_SCHEMA, readSchema)) - .reuseContainers() - .build()) { + try (CloseableIterable reader = + Parquet.read(Files.localInput(testFile)) + .project(COMPLEX_SCHEMA) + .createReaderFunc( + fileSchema -> ParquetAvroValueReaders.buildReader(COMPLEX_SCHEMA, readSchema)) + .reuseContainers() + .build()) { int recordNum = 
0; Iterator iter = records.iterator(); for (Record actual : reader) { @@ -209,9 +226,8 @@ private File writeTestData(Schema schema, int numRecords, int seed) throws IOExc File testFile = temp.newFile(); Assert.assertTrue("Delete should succeed", testFile.delete()); - try (FileAppender writer = Parquet.write(Files.localOutput(testFile)) - .schema(schema) - .build()) { + try (FileAppender writer = + Parquet.write(Files.localOutput(testFile)).schema(schema).build()) { writer.addAll(RandomData.generate(schema, numRecords, seed)); } diff --git a/spark/v3.2/spark/src/test/java/org/apache/iceberg/spark/data/TestParquetAvroWriter.java b/spark/v3.2/spark/src/test/java/org/apache/iceberg/spark/data/TestParquetAvroWriter.java index dcfc873a5a67..15c6268da478 100644 --- a/spark/v3.2/spark/src/test/java/org/apache/iceberg/spark/data/TestParquetAvroWriter.java +++ b/spark/v3.2/spark/src/test/java/org/apache/iceberg/spark/data/TestParquetAvroWriter.java @@ -16,9 +16,11 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.data; +import static org.apache.iceberg.types.Types.NestedField.optional; +import static org.apache.iceberg.types.Types.NestedField.required; + import java.io.File; import java.io.IOException; import java.util.Iterator; @@ -38,39 +40,51 @@ import org.junit.Test; import org.junit.rules.TemporaryFolder; -import static org.apache.iceberg.types.Types.NestedField.optional; -import static org.apache.iceberg.types.Types.NestedField.required; - public class TestParquetAvroWriter { - @Rule - public TemporaryFolder temp = new TemporaryFolder(); + @Rule public TemporaryFolder temp = new TemporaryFolder(); - private static final Schema COMPLEX_SCHEMA = new Schema( - required(1, "roots", Types.LongType.get()), - optional(3, "lime", Types.ListType.ofRequired(4, Types.DoubleType.get())), - required(5, "strict", Types.StructType.of( - required(9, "tangerine", Types.StringType.get()), - optional(6, "hopeful", Types.StructType.of( - required(7, "steel", Types.FloatType.get()), - required(8, "lantern", Types.DateType.get()) - )), - optional(10, "vehement", Types.LongType.get()) - )), - optional(11, "metamorphosis", Types.MapType.ofRequired(12, 13, - Types.StringType.get(), Types.TimestampType.withoutZone())), - required(14, "winter", Types.ListType.ofOptional(15, Types.StructType.of( - optional(16, "beet", Types.DoubleType.get()), - required(17, "stamp", Types.TimeType.get()), - optional(18, "wheeze", Types.StringType.get()) - ))), - optional(19, "renovate", Types.MapType.ofRequired(20, 21, - Types.StringType.get(), Types.StructType.of( - optional(22, "jumpy", Types.DoubleType.get()), - required(23, "koala", Types.TimeType.get()), - required(24, "couch rope", Types.IntegerType.get()) - ))), - optional(2, "slide", Types.StringType.get()) - ); + private static final Schema COMPLEX_SCHEMA = + new Schema( + required(1, "roots", Types.LongType.get()), + optional(3, "lime", Types.ListType.ofRequired(4, Types.DoubleType.get())), + required( + 5, + "strict", + Types.StructType.of( + required(9, "tangerine", Types.StringType.get()), + optional( + 6, + "hopeful", + Types.StructType.of( + required(7, "steel", Types.FloatType.get()), + required(8, "lantern", Types.DateType.get()))), + optional(10, "vehement", Types.LongType.get()))), + optional( + 11, + "metamorphosis", + Types.MapType.ofRequired( + 12, 13, Types.StringType.get(), Types.TimestampType.withoutZone())), + required( + 14, + "winter", + Types.ListType.ofOptional( + 15, + 
Types.StructType.of( + optional(16, "beet", Types.DoubleType.get()), + required(17, "stamp", Types.TimeType.get()), + optional(18, "wheeze", Types.StringType.get())))), + optional( + 19, + "renovate", + Types.MapType.ofRequired( + 20, + 21, + Types.StringType.get(), + Types.StructType.of( + optional(22, "jumpy", Types.DoubleType.get()), + required(23, "koala", Types.TimeType.get()), + required(24, "couch rope", Types.IntegerType.get())))), + optional(2, "slide", Types.StringType.get())); @Test public void testCorrectness() throws IOException { @@ -79,10 +93,11 @@ public void testCorrectness() throws IOException { File testFile = temp.newFile(); Assert.assertTrue("Delete should succeed", testFile.delete()); - try (FileAppender writer = Parquet.write(Files.localOutput(testFile)) - .schema(COMPLEX_SCHEMA) - .createWriterFunc(ParquetAvroWriter::buildWriter) - .build()) { + try (FileAppender writer = + Parquet.write(Files.localOutput(testFile)) + .schema(COMPLEX_SCHEMA) + .createWriterFunc(ParquetAvroWriter::buildWriter) + .build()) { writer.addAll(records); } @@ -90,11 +105,12 @@ public void testCorrectness() throws IOException { MessageType readSchema = ParquetSchemaUtil.convert(COMPLEX_SCHEMA, "test"); // verify that the new read path is correct - try (CloseableIterable reader = Parquet.read(Files.localInput(testFile)) - .project(COMPLEX_SCHEMA) - .createReaderFunc( - fileSchema -> ParquetAvroValueReaders.buildReader(COMPLEX_SCHEMA, readSchema)) - .build()) { + try (CloseableIterable reader = + Parquet.read(Files.localInput(testFile)) + .project(COMPLEX_SCHEMA) + .createReaderFunc( + fileSchema -> ParquetAvroValueReaders.buildReader(COMPLEX_SCHEMA, readSchema)) + .build()) { int recordNum = 0; Iterator iter = records.iterator(); for (Record actual : reader) { diff --git a/spark/v3.2/spark/src/test/java/org/apache/iceberg/spark/data/TestSparkAvroEnums.java b/spark/v3.2/spark/src/test/java/org/apache/iceberg/spark/data/TestSparkAvroEnums.java index 3517c32ffebb..6f05a9ed7c1f 100644 --- a/spark/v3.2/spark/src/test/java/org/apache/iceberg/spark/data/TestSparkAvroEnums.java +++ b/spark/v3.2/spark/src/test/java/org/apache/iceberg/spark/data/TestSparkAvroEnums.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.spark.data; import java.io.File; @@ -42,20 +41,20 @@ public class TestSparkAvroEnums { - @Rule - public TemporaryFolder temp = new TemporaryFolder(); + @Rule public TemporaryFolder temp = new TemporaryFolder(); @Test public void writeAndValidateEnums() throws IOException { - org.apache.avro.Schema avroSchema = SchemaBuilder.record("root") - .fields() - .name("enumCol") - .type() - .nullable() - .enumeration("testEnum") - .symbols("SYMB1", "SYMB2") - .enumDefault("SYMB2") - .endRecord(); + org.apache.avro.Schema avroSchema = + SchemaBuilder.record("root") + .fields() + .name("enumCol") + .type() + .nullable() + .enumeration("testEnum") + .symbols("SYMB1", "SYMB2") + .enumDefault("SYMB2") + .endRecord(); org.apache.avro.Schema enumSchema = avroSchema.getField("enumCol").schema().getTypes().get(0); Record enumRecord1 = new GenericData.Record(avroSchema); @@ -77,10 +76,11 @@ public void writeAndValidateEnums() throws IOException { Schema schema = new Schema(AvroSchemaUtil.convert(avroSchema).asStructType().fields()); List rows; - try (AvroIterable reader = Avro.read(Files.localInput(testFile)) - .createReaderFunc(SparkAvroReader::new) - .project(schema) - .build()) { + try (AvroIterable reader = + Avro.read(Files.localInput(testFile)) + .createReaderFunc(SparkAvroReader::new) + .project(schema) + .build()) { rows = Lists.newArrayList(reader); } @@ -88,7 +88,8 @@ public void writeAndValidateEnums() throws IOException { for (int i = 0; i < expected.size(); i += 1) { String expectedEnumString = expected.get(i).get("enumCol") == null ? null : expected.get(i).get("enumCol").toString(); - String sparkString = rows.get(i).getUTF8String(0) == null ? null : rows.get(i).getUTF8String(0).toString(); + String sparkString = + rows.get(i).getUTF8String(0) == null ? null : rows.get(i).getUTF8String(0).toString(); Assert.assertEquals(expectedEnumString, sparkString); } } diff --git a/spark/v3.2/spark/src/test/java/org/apache/iceberg/spark/data/TestSparkAvroReader.java b/spark/v3.2/spark/src/test/java/org/apache/iceberg/spark/data/TestSparkAvroReader.java index e4398df39cc8..6d1ef3db3657 100644 --- a/spark/v3.2/spark/src/test/java/org/apache/iceberg/spark/data/TestSparkAvroReader.java +++ b/spark/v3.2/spark/src/test/java/org/apache/iceberg/spark/data/TestSparkAvroReader.java @@ -16,9 +16,10 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.spark.data; +import static org.apache.iceberg.spark.data.TestHelpers.assertEqualsUnsafe; + import java.io.File; import java.io.IOException; import java.util.List; @@ -32,8 +33,6 @@ import org.apache.spark.sql.catalyst.InternalRow; import org.junit.Assert; -import static org.apache.iceberg.spark.data.TestHelpers.assertEqualsUnsafe; - public class TestSparkAvroReader extends AvroDataTest { @Override protected void writeAndValidate(Schema schema) throws IOException { @@ -42,20 +41,19 @@ protected void writeAndValidate(Schema schema) throws IOException { File testFile = temp.newFile(); Assert.assertTrue("Delete should succeed", testFile.delete()); - try (FileAppender writer = Avro.write(Files.localOutput(testFile)) - .schema(schema) - .named("test") - .build()) { + try (FileAppender writer = + Avro.write(Files.localOutput(testFile)).schema(schema).named("test").build()) { for (Record rec : expected) { writer.add(rec); } } List rows; - try (AvroIterable reader = Avro.read(Files.localInput(testFile)) - .createReaderFunc(SparkAvroReader::new) - .project(schema) - .build()) { + try (AvroIterable reader = + Avro.read(Files.localInput(testFile)) + .createReaderFunc(SparkAvroReader::new) + .project(schema) + .build()) { rows = Lists.newArrayList(reader); } diff --git a/spark/v3.2/spark/src/test/java/org/apache/iceberg/spark/data/TestSparkDateTimes.java b/spark/v3.2/spark/src/test/java/org/apache/iceberg/spark/data/TestSparkDateTimes.java index b67e57310b4c..b31ea8fd277d 100644 --- a/spark/v3.2/spark/src/test/java/org/apache/iceberg/spark/data/TestSparkDateTimes.java +++ b/spark/v3.2/spark/src/test/java/org/apache/iceberg/spark/data/TestSparkDateTimes.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.data; import java.time.ZoneId; @@ -69,7 +68,7 @@ public void checkSparkTimestamp(String timestampString, String sparkRepr) { ZoneId zoneId = DateTimeUtils.getZoneId("UTC"); TimestampFormatter formatter = TimestampFormatter.getFractionFormatter(zoneId); String sparkTimestamp = formatter.format(ts.value()); - Assert.assertEquals("Should be the same timestamp (" + ts.value() + ")", - sparkRepr, sparkTimestamp); + Assert.assertEquals( + "Should be the same timestamp (" + ts.value() + ")", sparkRepr, sparkTimestamp); } } diff --git a/spark/v3.2/spark/src/test/java/org/apache/iceberg/spark/data/TestSparkOrcReadMetadataColumns.java b/spark/v3.2/spark/src/test/java/org/apache/iceberg/spark/data/TestSparkOrcReadMetadataColumns.java index b8ee56370edf..3c9037adc393 100644 --- a/spark/v3.2/spark/src/test/java/org/apache/iceberg/spark/data/TestSparkOrcReadMetadataColumns.java +++ b/spark/v3.2/spark/src/test/java/org/apache/iceberg/spark/data/TestSparkOrcReadMetadataColumns.java @@ -16,9 +16,10 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.spark.data; +import static org.apache.iceberg.types.Types.NestedField.required; + import java.io.File; import java.io.IOException; import java.util.Iterator; @@ -57,21 +58,18 @@ import org.junit.runner.RunWith; import org.junit.runners.Parameterized; -import static org.apache.iceberg.types.Types.NestedField.required; - @RunWith(Parameterized.class) public class TestSparkOrcReadMetadataColumns { - private static final Schema DATA_SCHEMA = new Schema( - required(100, "id", Types.LongType.get()), - required(101, "data", Types.StringType.get()) - ); - - private static final Schema PROJECTION_SCHEMA = new Schema( - required(100, "id", Types.LongType.get()), - required(101, "data", Types.StringType.get()), - MetadataColumns.ROW_POSITION, - MetadataColumns.IS_DELETED - ); + private static final Schema DATA_SCHEMA = + new Schema( + required(100, "id", Types.LongType.get()), required(101, "data", Types.StringType.get())); + + private static final Schema PROJECTION_SCHEMA = + new Schema( + required(100, "id", Types.LongType.get()), + required(101, "data", Types.StringType.get()), + MetadataColumns.ROW_POSITION, + MetadataColumns.IS_DELETED); private static final int NUM_ROWS = 1000; private static final List DATA_ROWS; @@ -99,11 +97,10 @@ public class TestSparkOrcReadMetadataColumns { @Parameterized.Parameters(name = "vectorized = {0}") public static Object[] parameters() { - return new Object[] { false, true }; + return new Object[] {false, true}; } - @Rule - public TemporaryFolder temp = new TemporaryFolder(); + @Rule public TemporaryFolder temp = new TemporaryFolder(); private boolean vectorized; private File testFile; @@ -117,14 +114,15 @@ public void writeFile() throws IOException { testFile = temp.newFile(); Assert.assertTrue("Delete should succeed", testFile.delete()); - try (FileAppender writer = ORC.write(Files.localOutput(testFile)) - .createWriterFunc(SparkOrcWriter::new) - .schema(DATA_SCHEMA) - // write in such a way that the file contains 10 stripes each with 100 rows - .set("iceberg.orc.vectorbatch.size", "100") - .set(OrcConf.ROWS_BETWEEN_CHECKS.getAttribute(), "100") - .set(OrcConf.STRIPE_SIZE.getAttribute(), "1") - .build()) { + try (FileAppender writer = + ORC.write(Files.localOutput(testFile)) + .createWriterFunc(SparkOrcWriter::new) + .schema(DATA_SCHEMA) + // write in such a way that the file contains 10 stripes each with 100 rows + .set("iceberg.orc.vectorbatch.size", "100") + .set(OrcConf.ROWS_BETWEEN_CHECKS.getAttribute(), "100") + .set(OrcConf.STRIPE_SIZE.getAttribute(), "1") + .build()) { writer.addAll(DATA_ROWS); } } @@ -136,41 +134,54 @@ public void testReadRowNumbers() throws IOException { @Test public void testReadRowNumbersWithFilter() throws IOException { - readAndValidate(Expressions.greaterThanOrEqual("id", 500), null, null, EXPECTED_ROWS.subList(500, 1000)); + readAndValidate( + Expressions.greaterThanOrEqual("id", 500), null, null, EXPECTED_ROWS.subList(500, 1000)); } @Test public void testReadRowNumbersWithSplits() throws IOException { Reader reader; try { - OrcFile.ReaderOptions readerOptions = OrcFile.readerOptions(new Configuration()).useUTCTimestamp(true); - reader = OrcFile.createReader(new Path(testFile.toString()), readerOptions); + OrcFile.ReaderOptions readerOptions = + OrcFile.readerOptions(new Configuration()).useUTCTimestamp(true); + reader = OrcFile.createReader(new Path(testFile.toString()), readerOptions); } catch (IOException ioe) { throw new RuntimeIOException(ioe, "Failed to open file: %s", testFile); } - List 
splitOffsets = reader.getStripes().stream().map(StripeInformation::getOffset) - .collect(Collectors.toList()); - List splitLengths = reader.getStripes().stream().map(StripeInformation::getLength) - .collect(Collectors.toList()); + List splitOffsets = + reader.getStripes().stream().map(StripeInformation::getOffset).collect(Collectors.toList()); + List splitLengths = + reader.getStripes().stream().map(StripeInformation::getLength).collect(Collectors.toList()); for (int i = 0; i < 10; i++) { - readAndValidate(null, splitOffsets.get(i), splitLengths.get(i), EXPECTED_ROWS.subList(i * 100, (i + 1) * 100)); + readAndValidate( + null, + splitOffsets.get(i), + splitLengths.get(i), + EXPECTED_ROWS.subList(i * 100, (i + 1) * 100)); } } - private void readAndValidate(Expression filter, Long splitStart, Long splitLength, List expected) + private void readAndValidate( + Expression filter, Long splitStart, Long splitLength, List expected) throws IOException { - Schema projectionWithoutMetadataFields = TypeUtil.selectNot(PROJECTION_SCHEMA, MetadataColumns.metadataFieldIds()); + Schema projectionWithoutMetadataFields = + TypeUtil.selectNot(PROJECTION_SCHEMA, MetadataColumns.metadataFieldIds()); CloseableIterable reader = null; try { - ORC.ReadBuilder builder = ORC.read(Files.localInput(testFile)) - .project(projectionWithoutMetadataFields); + ORC.ReadBuilder builder = + ORC.read(Files.localInput(testFile)).project(projectionWithoutMetadataFields); if (vectorized) { - builder = builder.createBatchedReaderFunc(readOrcSchema -> - VectorizedSparkOrcReaders.buildReader(PROJECTION_SCHEMA, readOrcSchema, ImmutableMap.of())); + builder = + builder.createBatchedReaderFunc( + readOrcSchema -> + VectorizedSparkOrcReaders.buildReader( + PROJECTION_SCHEMA, readOrcSchema, ImmutableMap.of())); } else { - builder = builder.createReaderFunc(readOrcSchema -> new SparkOrcReader(PROJECTION_SCHEMA, readOrcSchema)); + builder = + builder.createReaderFunc( + readOrcSchema -> new SparkOrcReader(PROJECTION_SCHEMA, readOrcSchema)); } if (filter != null) { diff --git a/spark/v3.2/spark/src/test/java/org/apache/iceberg/spark/data/TestSparkOrcReader.java b/spark/v3.2/spark/src/test/java/org/apache/iceberg/spark/data/TestSparkOrcReader.java index 5042d1cc1338..b23fe729a187 100644 --- a/spark/v3.2/spark/src/test/java/org/apache/iceberg/spark/data/TestSparkOrcReader.java +++ b/spark/v3.2/spark/src/test/java/org/apache/iceberg/spark/data/TestSparkOrcReader.java @@ -16,9 +16,11 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.spark.data; +import static org.apache.iceberg.spark.data.TestHelpers.assertEquals; +import static org.apache.iceberg.types.Types.NestedField.required; + import java.io.File; import java.io.IOException; import java.util.Collections; @@ -38,45 +40,44 @@ import org.junit.Assert; import org.junit.Test; -import static org.apache.iceberg.spark.data.TestHelpers.assertEquals; -import static org.apache.iceberg.types.Types.NestedField.required; - public class TestSparkOrcReader extends AvroDataTest { @Override protected void writeAndValidate(Schema schema) throws IOException { - final Iterable expected = RandomData - .generateSpark(schema, 100, 0L); + final Iterable expected = RandomData.generateSpark(schema, 100, 0L); writeAndValidateRecords(schema, expected); } @Test public void writeAndValidateRepeatingRecords() throws IOException { - Schema structSchema = new Schema( - required(100, "id", Types.LongType.get()), - required(101, "data", Types.StringType.get()) - ); - List expectedRepeating = Collections.nCopies(100, - RandomData.generateSpark(structSchema, 1, 0L).iterator().next()); + Schema structSchema = + new Schema( + required(100, "id", Types.LongType.get()), + required(101, "data", Types.StringType.get())); + List expectedRepeating = + Collections.nCopies(100, RandomData.generateSpark(structSchema, 1, 0L).iterator().next()); writeAndValidateRecords(structSchema, expectedRepeating); } - private void writeAndValidateRecords(Schema schema, Iterable expected) throws IOException { + private void writeAndValidateRecords(Schema schema, Iterable expected) + throws IOException { final File testFile = temp.newFile(); Assert.assertTrue("Delete should succeed", testFile.delete()); - try (FileAppender writer = ORC.write(Files.localOutput(testFile)) - .createWriterFunc(SparkOrcWriter::new) - .schema(schema) - .build()) { + try (FileAppender writer = + ORC.write(Files.localOutput(testFile)) + .createWriterFunc(SparkOrcWriter::new) + .schema(schema) + .build()) { writer.addAll(expected); } - try (CloseableIterable reader = ORC.read(Files.localInput(testFile)) - .project(schema) - .createReaderFunc(readOrcSchema -> new SparkOrcReader(schema, readOrcSchema)) - .build()) { + try (CloseableIterable reader = + ORC.read(Files.localInput(testFile)) + .project(schema) + .createReaderFunc(readOrcSchema -> new SparkOrcReader(schema, readOrcSchema)) + .build()) { final Iterator actualRows = reader.iterator(); final Iterator expectedRows = expected.iterator(); while (expectedRows.hasNext()) { @@ -86,11 +87,13 @@ private void writeAndValidateRecords(Schema schema, Iterable expect Assert.assertFalse("Should not have extra rows", actualRows.hasNext()); } - try (CloseableIterable reader = ORC.read(Files.localInput(testFile)) - .project(schema) - .createBatchedReaderFunc(readOrcSchema -> - VectorizedSparkOrcReaders.buildReader(schema, readOrcSchema, ImmutableMap.of())) - .build()) { + try (CloseableIterable reader = + ORC.read(Files.localInput(testFile)) + .project(schema) + .createBatchedReaderFunc( + readOrcSchema -> + VectorizedSparkOrcReaders.buildReader(schema, readOrcSchema, ImmutableMap.of())) + .build()) { final Iterator actualRows = batchesToRows(reader.iterator()); final Iterator expectedRows = expected.iterator(); while (expectedRows.hasNext()) { diff --git a/spark/v3.2/spark/src/test/java/org/apache/iceberg/spark/data/TestSparkParquetReadMetadataColumns.java b/spark/v3.2/spark/src/test/java/org/apache/iceberg/spark/data/TestSparkParquetReadMetadataColumns.java index 
f075e71742ea..23d69c467218 100644 --- a/spark/v3.2/spark/src/test/java/org/apache/iceberg/spark/data/TestSparkParquetReadMetadataColumns.java +++ b/spark/v3.2/spark/src/test/java/org/apache/iceberg/spark/data/TestSparkParquetReadMetadataColumns.java @@ -16,9 +16,12 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.data; +import static org.apache.iceberg.types.Types.NestedField.required; +import static org.mockito.Mockito.mock; +import static org.mockito.Mockito.when; + import java.io.File; import java.io.IOException; import java.util.Iterator; @@ -64,23 +67,18 @@ import org.junit.runner.RunWith; import org.junit.runners.Parameterized; -import static org.apache.iceberg.types.Types.NestedField.required; -import static org.mockito.Mockito.mock; -import static org.mockito.Mockito.when; - @RunWith(Parameterized.class) public class TestSparkParquetReadMetadataColumns { - private static final Schema DATA_SCHEMA = new Schema( - required(100, "id", Types.LongType.get()), - required(101, "data", Types.StringType.get()) - ); - - private static final Schema PROJECTION_SCHEMA = new Schema( - required(100, "id", Types.LongType.get()), - required(101, "data", Types.StringType.get()), - MetadataColumns.ROW_POSITION, - MetadataColumns.IS_DELETED - ); + private static final Schema DATA_SCHEMA = + new Schema( + required(100, "id", Types.LongType.get()), required(101, "data", Types.StringType.get())); + + private static final Schema PROJECTION_SCHEMA = + new Schema( + required(100, "id", Types.LongType.get()), + required(101, "data", Types.StringType.get()), + MetadataColumns.ROW_POSITION, + MetadataColumns.IS_DELETED); private static final int NUM_ROWS = 1000; private static final List DATA_ROWS; @@ -117,16 +115,12 @@ public class TestSparkParquetReadMetadataColumns { } } - @Parameterized.Parameters(name = "vectorized = {0}") + @Parameterized.Parameters(name = "vectorized = {0}") public static Object[][] parameters() { - return new Object[][] { - new Object[] { false }, - new Object[] { true } - }; + return new Object[][] {new Object[] {false}, new Object[] {true}}; } - @Rule - public TemporaryFolder temp = new TemporaryFolder(); + @Rule public TemporaryFolder temp = new TemporaryFolder(); private final boolean vectorized; private File testFile; @@ -143,28 +137,32 @@ public void writeFile() throws IOException { testFile = temp.newFile(); Assert.assertTrue("Delete should succeed", testFile.delete()); - ParquetFileWriter parquetFileWriter = new ParquetFileWriter( - conf, - ParquetSchemaUtil.convert(DATA_SCHEMA, "testSchema"), - new Path(testFile.getAbsolutePath()) - ); + ParquetFileWriter parquetFileWriter = + new ParquetFileWriter( + conf, + ParquetSchemaUtil.convert(DATA_SCHEMA, "testSchema"), + new Path(testFile.getAbsolutePath())); parquetFileWriter.start(); for (int i = 0; i < NUM_ROW_GROUPS; i += 1) { File split = temp.newFile(); Assert.assertTrue("Delete should succeed", split.delete()); fileSplits.add(new Path(split.getAbsolutePath())); - try (FileAppender writer = Parquet.write(Files.localOutput(split)) - .createWriterFunc(msgType -> SparkParquetWriters.buildWriter(struct, msgType)) - .schema(DATA_SCHEMA) - .overwrite() - .build()) { + try (FileAppender writer = + Parquet.write(Files.localOutput(split)) + .createWriterFunc(msgType -> SparkParquetWriters.buildWriter(struct, msgType)) + .schema(DATA_SCHEMA) + .overwrite() + .build()) { writer.addAll(DATA_ROWS.subList(i * ROWS_PER_SPLIT, (i + 1) * ROWS_PER_SPLIT)); } - 
parquetFileWriter.appendFile(HadoopInputFile.fromPath(new Path(split.getAbsolutePath()), conf)); + parquetFileWriter.appendFile( + HadoopInputFile.fromPath(new Path(split.getAbsolutePath()), conf)); } - parquetFileWriter - .end(ParquetFileWriter.mergeMetadataFiles(fileSplits, conf).getFileMetaData().getKeyValueMetaData()); + parquetFileWriter.end( + ParquetFileWriter.mergeMetadataFiles(fileSplits, conf) + .getFileMetaData() + .getKeyValueMetaData()); } @Test @@ -178,12 +176,14 @@ public void testReadRowNumbersWithDelete() throws IOException { List expectedRowsAfterDelete = Lists.newArrayList(); EXPECTED_ROWS.forEach(row -> expectedRowsAfterDelete.add(row.copy())); - // remove row at position 98, 99, 100, 101, 102, this crosses two row groups [0, 100) and [100, 200) + // remove row at position 98, 99, 100, 101, 102, this crosses two row groups [0, 100) and [100, + // 200) for (int i = 98; i <= 102; i++) { expectedRowsAfterDelete.get(i).update(3, true); } - Parquet.ReadBuilder builder = Parquet.read(Files.localInput(testFile)).project(PROJECTION_SCHEMA); + Parquet.ReadBuilder builder = + Parquet.read(Files.localInput(testFile)).project(PROJECTION_SCHEMA); DeleteFilter deleteFilter = mock(DeleteFilter.class); when(deleteFilter.hasPosDeletes()).thenReturn(true); @@ -191,8 +191,14 @@ public void testReadRowNumbersWithDelete() throws IOException { deletedRowPos.delete(98, 103); when(deleteFilter.deletedRowPositions()).thenReturn(deletedRowPos); - builder.createBatchedReaderFunc(fileSchema -> VectorizedSparkParquetReaders.buildReader(PROJECTION_SCHEMA, - fileSchema, NullCheckingForGet.NULL_CHECKING_ENABLED, Maps.newHashMap(), deleteFilter)); + builder.createBatchedReaderFunc( + fileSchema -> + VectorizedSparkParquetReaders.buildReader( + PROJECTION_SCHEMA, + fileSchema, + NullCheckingForGet.NULL_CHECKING_ENABLED, + Maps.newHashMap(), + deleteFilter)); builder.recordsPerBatch(RECORDS_PER_BATCH); validate(expectedRowsAfterDelete, builder); @@ -233,7 +239,8 @@ public void testReadRowNumbersWithFilter() throws IOException { // current iceberg supports row group filter. 
for (int i = 1; i < 5; i += 1) { readAndValidate( - Expressions.and(Expressions.lessThan("id", NUM_ROWS / 2), + Expressions.and( + Expressions.lessThan("id", NUM_ROWS / 2), Expressions.greaterThanOrEqual("id", i * ROWS_PER_SPLIT)), null, null, @@ -243,28 +250,36 @@ public void testReadRowNumbersWithFilter() throws IOException { @Test public void testReadRowNumbersWithSplits() throws IOException { - ParquetFileReader fileReader = new ParquetFileReader( - HadoopInputFile.fromPath(new Path(testFile.getAbsolutePath()), new Configuration()), - ParquetReadOptions.builder().build()); + ParquetFileReader fileReader = + new ParquetFileReader( + HadoopInputFile.fromPath(new Path(testFile.getAbsolutePath()), new Configuration()), + ParquetReadOptions.builder().build()); List rowGroups = fileReader.getRowGroups(); for (int i = 0; i < NUM_ROW_GROUPS; i += 1) { - readAndValidate(null, + readAndValidate( + null, rowGroups.get(i).getColumns().get(0).getStartingPos(), rowGroups.get(i).getCompressedSize(), EXPECTED_ROWS.subList(i * ROWS_PER_SPLIT, (i + 1) * ROWS_PER_SPLIT)); } } - private void readAndValidate(Expression filter, Long splitStart, Long splitLength, List expected) + private void readAndValidate( + Expression filter, Long splitStart, Long splitLength, List expected) throws IOException { - Parquet.ReadBuilder builder = Parquet.read(Files.localInput(testFile)).project(PROJECTION_SCHEMA); + Parquet.ReadBuilder builder = + Parquet.read(Files.localInput(testFile)).project(PROJECTION_SCHEMA); if (vectorized) { - builder.createBatchedReaderFunc(fileSchema -> VectorizedSparkParquetReaders.buildReader(PROJECTION_SCHEMA, - fileSchema, NullCheckingForGet.NULL_CHECKING_ENABLED)); + builder.createBatchedReaderFunc( + fileSchema -> + VectorizedSparkParquetReaders.buildReader( + PROJECTION_SCHEMA, fileSchema, NullCheckingForGet.NULL_CHECKING_ENABLED)); builder.recordsPerBatch(RECORDS_PER_BATCH); } else { - builder = builder.createReaderFunc(msgType -> SparkParquetReaders.buildReader(PROJECTION_SCHEMA, msgType)); + builder = + builder.createReaderFunc( + msgType -> SparkParquetReaders.buildReader(PROJECTION_SCHEMA, msgType)); } if (filter != null) { @@ -278,8 +293,10 @@ private void readAndValidate(Expression filter, Long splitStart, Long splitLengt validate(expected, builder); } - private void validate(List expected, Parquet.ReadBuilder builder) throws IOException { - try (CloseableIterable reader = vectorized ? batchesToRows(builder.build()) : builder.build()) { + private void validate(List expected, Parquet.ReadBuilder builder) + throws IOException { + try (CloseableIterable reader = + vectorized ? batchesToRows(builder.build()) : builder.build()) { final Iterator actualRows = reader.iterator(); for (InternalRow internalRow : expected) { diff --git a/spark/v3.2/spark/src/test/java/org/apache/iceberg/spark/data/TestSparkParquetReader.java b/spark/v3.2/spark/src/test/java/org/apache/iceberg/spark/data/TestSparkParquetReader.java index 03d234c1eca5..d4b7443e2e20 100644 --- a/spark/v3.2/spark/src/test/java/org/apache/iceberg/spark/data/TestSparkParquetReader.java +++ b/spark/v3.2/spark/src/test/java/org/apache/iceberg/spark/data/TestSparkParquetReader.java @@ -16,9 +16,11 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.spark.data; +import static org.apache.iceberg.spark.data.TestHelpers.assertEqualsUnsafe; +import static org.apache.iceberg.types.Types.NestedField.required; + import java.io.File; import java.io.IOException; import java.util.Iterator; @@ -60,31 +62,31 @@ import org.junit.Assume; import org.junit.Test; -import static org.apache.iceberg.spark.data.TestHelpers.assertEqualsUnsafe; -import static org.apache.iceberg.types.Types.NestedField.required; - public class TestSparkParquetReader extends AvroDataTest { @Override protected void writeAndValidate(Schema schema) throws IOException { - Assume.assumeTrue("Parquet Avro cannot write non-string map keys", null == TypeUtil.find(schema, - type -> type.isMapType() && type.asMapType().keyType() != Types.StringType.get())); + Assume.assumeTrue( + "Parquet Avro cannot write non-string map keys", + null + == TypeUtil.find( + schema, + type -> type.isMapType() && type.asMapType().keyType() != Types.StringType.get())); List expected = RandomData.generateList(schema, 100, 0L); File testFile = temp.newFile(); Assert.assertTrue("Delete should succeed", testFile.delete()); - try (FileAppender writer = Parquet.write(Files.localOutput(testFile)) - .schema(schema) - .named("test") - .build()) { + try (FileAppender writer = + Parquet.write(Files.localOutput(testFile)).schema(schema).named("test").build()) { writer.addAll(expected); } - try (CloseableIterable reader = Parquet.read(Files.localInput(testFile)) - .project(schema) - .createReaderFunc(type -> SparkParquetReaders.buildReader(schema, type)) - .build()) { + try (CloseableIterable reader = + Parquet.read(Files.localInput(testFile)) + .project(schema) + .createReaderFunc(type -> SparkParquetReaders.buildReader(schema, type)) + .build()) { Iterator rows = reader.iterator(); for (int i = 0; i < expected.size(); i += 1) { Assert.assertTrue("Should have expected number of rows", rows.hasNext()); @@ -129,7 +131,8 @@ protected Table tableFromInputFile(InputFile inputFile, Schema schema) throws IO @Test public void testInt96TimestampProducedBySparkIsReadCorrectly() throws IOException { - String outputFilePath = String.format("%s/%s", temp.getRoot().getAbsolutePath(), "parquet_int96.parquet"); + String outputFilePath = + String.format("%s/%s", temp.getRoot().getAbsolutePath(), "parquet_int96.parquet"); HadoopOutputFile outputFile = HadoopOutputFile.fromPath( new org.apache.hadoop.fs.Path(outputFilePath), new Configuration()); @@ -137,7 +140,7 @@ public void testInt96TimestampProducedBySparkIsReadCorrectly() throws IOExceptio StructType sparkSchema = new StructType( new StructField[] { - new StructField("ts", DataTypes.TimestampType, true, Metadata.empty()) + new StructField("ts", DataTypes.TimestampType, true, Metadata.empty()) }); List rows = Lists.newArrayList(RandomData.generateSpark(schema, 10, 0L)); @@ -164,14 +167,14 @@ public void testInt96TimestampProducedBySparkIsReadCorrectly() throws IOExceptio Assert.assertEquals(rows.size(), tableRecords.size()); - for (int i = 0; i < tableRecords.size(); i++) { + for (int i = 0; i < tableRecords.size(); i++) { GenericsHelpers.assertEqualsUnsafe(schema.asStruct(), tableRecords.get(i), rows.get(i)); } } /** - * Native Spark ParquetWriter.Builder implementation so that we can write timestamps using Spark's native - * ParquetWriteSupport. + * Native Spark ParquetWriter.Builder implementation so that we can write timestamps using Spark's + * native ParquetWriteSupport. 
*/ private static class NativeSparkWriterBuilder extends ParquetWriter.Builder { diff --git a/spark/v3.2/spark/src/test/java/org/apache/iceberg/spark/data/TestSparkParquetWriter.java b/spark/v3.2/spark/src/test/java/org/apache/iceberg/spark/data/TestSparkParquetWriter.java index c75a87abc45c..261fb8838aa4 100644 --- a/spark/v3.2/spark/src/test/java/org/apache/iceberg/spark/data/TestSparkParquetWriter.java +++ b/spark/v3.2/spark/src/test/java/org/apache/iceberg/spark/data/TestSparkParquetWriter.java @@ -16,9 +16,11 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.data; +import static org.apache.iceberg.types.Types.NestedField.optional; +import static org.apache.iceberg.types.Types.NestedField.required; + import java.io.File; import java.io.IOException; import java.util.Iterator; @@ -35,39 +37,51 @@ import org.junit.Test; import org.junit.rules.TemporaryFolder; -import static org.apache.iceberg.types.Types.NestedField.optional; -import static org.apache.iceberg.types.Types.NestedField.required; - public class TestSparkParquetWriter { - @Rule - public TemporaryFolder temp = new TemporaryFolder(); + @Rule public TemporaryFolder temp = new TemporaryFolder(); - private static final Schema COMPLEX_SCHEMA = new Schema( - required(1, "roots", Types.LongType.get()), - optional(3, "lime", Types.ListType.ofRequired(4, Types.DoubleType.get())), - required(5, "strict", Types.StructType.of( - required(9, "tangerine", Types.StringType.get()), - optional(6, "hopeful", Types.StructType.of( - required(7, "steel", Types.FloatType.get()), - required(8, "lantern", Types.DateType.get()) - )), - optional(10, "vehement", Types.LongType.get()) - )), - optional(11, "metamorphosis", Types.MapType.ofRequired(12, 13, - Types.StringType.get(), Types.TimestampType.withZone())), - required(14, "winter", Types.ListType.ofOptional(15, Types.StructType.of( - optional(16, "beet", Types.DoubleType.get()), - required(17, "stamp", Types.FloatType.get()), - optional(18, "wheeze", Types.StringType.get()) - ))), - optional(19, "renovate", Types.MapType.ofRequired(20, 21, - Types.StringType.get(), Types.StructType.of( - optional(22, "jumpy", Types.DoubleType.get()), - required(23, "koala", Types.IntegerType.get()), - required(24, "couch rope", Types.IntegerType.get()) - ))), - optional(2, "slide", Types.StringType.get()) - ); + private static final Schema COMPLEX_SCHEMA = + new Schema( + required(1, "roots", Types.LongType.get()), + optional(3, "lime", Types.ListType.ofRequired(4, Types.DoubleType.get())), + required( + 5, + "strict", + Types.StructType.of( + required(9, "tangerine", Types.StringType.get()), + optional( + 6, + "hopeful", + Types.StructType.of( + required(7, "steel", Types.FloatType.get()), + required(8, "lantern", Types.DateType.get()))), + optional(10, "vehement", Types.LongType.get()))), + optional( + 11, + "metamorphosis", + Types.MapType.ofRequired( + 12, 13, Types.StringType.get(), Types.TimestampType.withZone())), + required( + 14, + "winter", + Types.ListType.ofOptional( + 15, + Types.StructType.of( + optional(16, "beet", Types.DoubleType.get()), + required(17, "stamp", Types.FloatType.get()), + optional(18, "wheeze", Types.StringType.get())))), + optional( + 19, + "renovate", + Types.MapType.ofRequired( + 20, + 21, + Types.StringType.get(), + Types.StructType.of( + optional(22, "jumpy", Types.DoubleType.get()), + required(23, "koala", Types.IntegerType.get()), + required(24, "couch rope", Types.IntegerType.get())))), + optional(2, 
"slide", Types.StringType.get())); @Test public void testCorrectness() throws IOException { @@ -77,17 +91,22 @@ public void testCorrectness() throws IOException { File testFile = temp.newFile(); Assert.assertTrue("Delete should succeed", testFile.delete()); - try (FileAppender writer = Parquet.write(Files.localOutput(testFile)) - .schema(COMPLEX_SCHEMA) - .createWriterFunc(msgType -> SparkParquetWriters.buildWriter(SparkSchemaUtil.convert(COMPLEX_SCHEMA), msgType)) - .build()) { + try (FileAppender writer = + Parquet.write(Files.localOutput(testFile)) + .schema(COMPLEX_SCHEMA) + .createWriterFunc( + msgType -> + SparkParquetWriters.buildWriter( + SparkSchemaUtil.convert(COMPLEX_SCHEMA), msgType)) + .build()) { writer.addAll(records); } - try (CloseableIterable reader = Parquet.read(Files.localInput(testFile)) - .project(COMPLEX_SCHEMA) - .createReaderFunc(type -> SparkParquetReaders.buildReader(COMPLEX_SCHEMA, type)) - .build()) { + try (CloseableIterable reader = + Parquet.read(Files.localInput(testFile)) + .project(COMPLEX_SCHEMA) + .createReaderFunc(type -> SparkParquetReaders.buildReader(COMPLEX_SCHEMA, type)) + .build()) { Iterator expected = records.iterator(); Iterator rows = reader.iterator(); for (int i = 0; i < numRows; i += 1) { diff --git a/spark/v3.2/spark/src/test/java/org/apache/iceberg/spark/data/TestSparkRecordOrcReaderWriter.java b/spark/v3.2/spark/src/test/java/org/apache/iceberg/spark/data/TestSparkRecordOrcReaderWriter.java index 1e7430d16df7..d10e7f5a19e3 100644 --- a/spark/v3.2/spark/src/test/java/org/apache/iceberg/spark/data/TestSparkRecordOrcReaderWriter.java +++ b/spark/v3.2/spark/src/test/java/org/apache/iceberg/spark/data/TestSparkRecordOrcReaderWriter.java @@ -16,9 +16,10 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.data; +import static org.apache.iceberg.types.Types.NestedField.required; + import java.io.File; import java.io.IOException; import java.math.BigDecimal; @@ -40,8 +41,6 @@ import org.junit.Assert; import org.junit.Test; -import static org.apache.iceberg.types.Types.NestedField.required; - public class TestSparkRecordOrcReaderWriter extends AvroDataTest { private static final int NUM_RECORDS = 200; @@ -50,19 +49,21 @@ private void writeAndValidate(Schema schema, List expectedRecords) throw Assert.assertTrue("Delete should succeed", originalFile.delete()); // Write few generic records into the original test file. - try (FileAppender writer = ORC.write(Files.localOutput(originalFile)) - .createWriterFunc(GenericOrcWriter::buildWriter) - .schema(schema) - .build()) { + try (FileAppender writer = + ORC.write(Files.localOutput(originalFile)) + .createWriterFunc(GenericOrcWriter::buildWriter) + .schema(schema) + .build()) { writer.addAll(expectedRecords); } // Read into spark InternalRow from the original test file. 
List internalRows = Lists.newArrayList(); - try (CloseableIterable reader = ORC.read(Files.localInput(originalFile)) - .project(schema) - .createReaderFunc(readOrcSchema -> new SparkOrcReader(schema, readOrcSchema)) - .build()) { + try (CloseableIterable reader = + ORC.read(Files.localInput(originalFile)) + .project(schema) + .createReaderFunc(readOrcSchema -> new SparkOrcReader(schema, readOrcSchema)) + .build()) { reader.forEach(internalRows::add); assertEqualsUnsafe(schema.asStruct(), expectedRecords, reader, expectedRecords.size()); } @@ -71,26 +72,29 @@ private void writeAndValidate(Schema schema, List expectedRecords) throw Assert.assertTrue("Delete should succeed", anotherFile.delete()); // Write those spark InternalRows into a new file again. - try (FileAppender writer = ORC.write(Files.localOutput(anotherFile)) - .createWriterFunc(SparkOrcWriter::new) - .schema(schema) - .build()) { + try (FileAppender writer = + ORC.write(Files.localOutput(anotherFile)) + .createWriterFunc(SparkOrcWriter::new) + .schema(schema) + .build()) { writer.addAll(internalRows); } // Check whether the InternalRows are expected records. - try (CloseableIterable reader = ORC.read(Files.localInput(anotherFile)) - .project(schema) - .createReaderFunc(readOrcSchema -> new SparkOrcReader(schema, readOrcSchema)) - .build()) { + try (CloseableIterable reader = + ORC.read(Files.localInput(anotherFile)) + .project(schema) + .createReaderFunc(readOrcSchema -> new SparkOrcReader(schema, readOrcSchema)) + .build()) { assertEqualsUnsafe(schema.asStruct(), expectedRecords, reader, expectedRecords.size()); } // Read into iceberg GenericRecord and check again. - try (CloseableIterable reader = ORC.read(Files.localInput(anotherFile)) - .createReaderFunc(typeDesc -> GenericOrcReader.buildReader(schema, typeDesc)) - .project(schema) - .build()) { + try (CloseableIterable reader = + ORC.read(Files.localInput(anotherFile)) + .createReaderFunc(typeDesc -> GenericOrcReader.buildReader(schema, typeDesc)) + .project(schema) + .build()) { assertRecordEquals(expectedRecords, reader, expectedRecords.size()); } } @@ -103,11 +107,11 @@ protected void writeAndValidate(Schema schema) throws IOException { @Test public void testDecimalWithTrailingZero() throws IOException { - Schema schema = new Schema( - required(1, "d1", Types.DecimalType.of(10, 2)), - required(2, "d2", Types.DecimalType.of(20, 5)), - required(3, "d3", Types.DecimalType.of(38, 20)) - ); + Schema schema = + new Schema( + required(1, "d1", Types.DecimalType.of(10, 2)), + required(2, "d2", Types.DecimalType.of(20, 5)), + required(3, "d3", Types.DecimalType.of(38, 20))); List expected = Lists.newArrayList(); @@ -121,7 +125,8 @@ public void testDecimalWithTrailingZero() throws IOException { writeAndValidate(schema, expected); } - private static void assertRecordEquals(Iterable expected, Iterable actual, int size) { + private static void assertRecordEquals( + Iterable expected, Iterable actual, int size) { Iterator expectedIter = expected.iterator(); Iterator actualIter = actual.iterator(); for (int i = 0; i < size; i += 1) { @@ -133,8 +138,8 @@ private static void assertRecordEquals(Iterable expected, Iterable expected, - Iterable actual, int size) { + private static void assertEqualsUnsafe( + Types.StructType struct, Iterable expected, Iterable actual, int size) { Iterator expectedIter = expected.iterator(); Iterator actualIter = actual.iterator(); for (int i = 0; i < size; i += 1) { diff --git 
a/spark/v3.2/spark/src/test/java/org/apache/iceberg/spark/data/parquet/vectorized/TestParquetDictionaryEncodedVectorizedReads.java b/spark/v3.2/spark/src/test/java/org/apache/iceberg/spark/data/parquet/vectorized/TestParquetDictionaryEncodedVectorizedReads.java index f292df0c3bf8..756f49a2aad6 100644 --- a/spark/v3.2/spark/src/test/java/org/apache/iceberg/spark/data/parquet/vectorized/TestParquetDictionaryEncodedVectorizedReads.java +++ b/spark/v3.2/spark/src/test/java/org/apache/iceberg/spark/data/parquet/vectorized/TestParquetDictionaryEncodedVectorizedReads.java @@ -16,9 +16,10 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.data.parquet.vectorized; +import static org.apache.iceberg.TableProperties.PARQUET_ROW_GROUP_SIZE_BYTES_DEFAULT; + import java.io.File; import java.io.IOException; import org.apache.avro.generic.GenericData; @@ -35,42 +36,42 @@ import org.junit.Ignore; import org.junit.Test; -import static org.apache.iceberg.TableProperties.PARQUET_ROW_GROUP_SIZE_BYTES_DEFAULT; - public class TestParquetDictionaryEncodedVectorizedReads extends TestParquetVectorizedReads { @Override - Iterable generateData(Schema schema, int numRecords, long seed, float nullPercentage, - Function transform) { - Iterable data = RandomData.generateDictionaryEncodableData(schema, numRecords, seed, nullPercentage); + Iterable generateData( + Schema schema, + int numRecords, + long seed, + float nullPercentage, + Function transform) { + Iterable data = + RandomData.generateDictionaryEncodableData(schema, numRecords, seed, nullPercentage); return transform == IDENTITY ? data : Iterables.transform(data, transform); } @Test @Override @Ignore // Ignored since this code path is already tested in TestParquetVectorizedReads - public void testVectorizedReadsWithNewContainers() throws IOException { - - } + public void testVectorizedReadsWithNewContainers() throws IOException {} @Test public void testMixedDictionaryNonDictionaryReads() throws IOException { Schema schema = new Schema(SUPPORTED_PRIMITIVES.fields()); File dictionaryEncodedFile = temp.newFile(); Assert.assertTrue("Delete should succeed", dictionaryEncodedFile.delete()); - Iterable dictionaryEncodableData = RandomData.generateDictionaryEncodableData( - schema, - 10000, - 0L, - RandomData.DEFAULT_NULL_PERCENTAGE); - try (FileAppender writer = getParquetWriter(schema, dictionaryEncodedFile)) { + Iterable dictionaryEncodableData = + RandomData.generateDictionaryEncodableData( + schema, 10000, 0L, RandomData.DEFAULT_NULL_PERCENTAGE); + try (FileAppender writer = + getParquetWriter(schema, dictionaryEncodedFile)) { writer.addAll(dictionaryEncodableData); } File plainEncodingFile = temp.newFile(); Assert.assertTrue("Delete should succeed", plainEncodingFile.delete()); - Iterable nonDictionaryData = RandomData.generate(schema, 10000, 0L, - RandomData.DEFAULT_NULL_PERCENTAGE); + Iterable nonDictionaryData = + RandomData.generate(schema, 10000, 0L, RandomData.DEFAULT_NULL_PERCENTAGE); try (FileAppender writer = getParquetWriter(schema, plainEncodingFile)) { writer.addAll(nonDictionaryData); } @@ -78,15 +79,19 @@ public void testMixedDictionaryNonDictionaryReads() throws IOException { int rowGroupSize = PARQUET_ROW_GROUP_SIZE_BYTES_DEFAULT; File mixedFile = temp.newFile(); Assert.assertTrue("Delete should succeed", mixedFile.delete()); - Parquet.concat(ImmutableList.of(dictionaryEncodedFile, plainEncodingFile, dictionaryEncodedFile), - mixedFile, rowGroupSize, schema, ImmutableMap.of()); + 
Parquet.concat( + ImmutableList.of(dictionaryEncodedFile, plainEncodingFile, dictionaryEncodedFile), + mixedFile, + rowGroupSize, + schema, + ImmutableMap.of()); assertRecordsMatch( - schema, - 30000, - FluentIterable.concat(dictionaryEncodableData, nonDictionaryData, dictionaryEncodableData), - mixedFile, - false, - true, - BATCH_SIZE); + schema, + 30000, + FluentIterable.concat(dictionaryEncodableData, nonDictionaryData, dictionaryEncodableData), + mixedFile, + false, + true, + BATCH_SIZE); } } diff --git a/spark/v3.2/spark/src/test/java/org/apache/iceberg/spark/data/parquet/vectorized/TestParquetDictionaryFallbackToPlainEncodingVectorizedReads.java b/spark/v3.2/spark/src/test/java/org/apache/iceberg/spark/data/parquet/vectorized/TestParquetDictionaryFallbackToPlainEncodingVectorizedReads.java index 5ceac3fdb76e..42ea34936b5f 100644 --- a/spark/v3.2/spark/src/test/java/org/apache/iceberg/spark/data/parquet/vectorized/TestParquetDictionaryFallbackToPlainEncodingVectorizedReads.java +++ b/spark/v3.2/spark/src/test/java/org/apache/iceberg/spark/data/parquet/vectorized/TestParquetDictionaryFallbackToPlainEncodingVectorizedReads.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.data.parquet.vectorized; import java.io.File; @@ -33,7 +32,8 @@ import org.junit.Ignore; import org.junit.Test; -public class TestParquetDictionaryFallbackToPlainEncodingVectorizedReads extends TestParquetVectorizedReads { +public class TestParquetDictionaryFallbackToPlainEncodingVectorizedReads + extends TestParquetVectorizedReads { private static final int NUM_ROWS = 1_000_000; @Override @@ -42,15 +42,20 @@ protected int getNumRows() { } @Override - Iterable generateData(Schema schema, int numRecords, long seed, float nullPercentage, - Function transform) { + Iterable generateData( + Schema schema, + int numRecords, + long seed, + float nullPercentage, + Function transform) { // TODO: take into account nullPercentage when generating fallback encoding data Iterable data = RandomData.generateFallbackData(schema, numRecords, seed, numRecords / 20); return transform == IDENTITY ? 
data : Iterables.transform(data, transform); } @Override - FileAppender getParquetWriter(Schema schema, File testFile) throws IOException { + FileAppender getParquetWriter(Schema schema, File testFile) + throws IOException { return Parquet.write(Files.localOutput(testFile)) .schema(schema) .named("test") @@ -61,14 +66,10 @@ FileAppender getParquetWriter(Schema schema, File testFile) @Test @Override @Ignore // Fallback encoding not triggered when data is mostly null - public void testMostlyNullsForOptionalFields() { - - } + public void testMostlyNullsForOptionalFields() {} @Test @Override @Ignore // Ignored since this code path is already tested in TestParquetVectorizedReads - public void testVectorizedReadsWithNewContainers() throws IOException { - - } + public void testVectorizedReadsWithNewContainers() throws IOException {} } diff --git a/spark/v3.2/spark/src/test/java/org/apache/iceberg/spark/data/parquet/vectorized/TestParquetVectorizedReads.java b/spark/v3.2/spark/src/test/java/org/apache/iceberg/spark/data/parquet/vectorized/TestParquetVectorizedReads.java index 19ffd023a061..022ddcf237b2 100644 --- a/spark/v3.2/spark/src/test/java/org/apache/iceberg/spark/data/parquet/vectorized/TestParquetVectorizedReads.java +++ b/spark/v3.2/spark/src/test/java/org/apache/iceberg/spark/data/parquet/vectorized/TestParquetVectorizedReads.java @@ -16,9 +16,11 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.data.parquet.vectorized; +import static org.apache.iceberg.types.Types.NestedField.optional; +import static org.apache.iceberg.types.Types.NestedField.required; + import java.io.File; import java.io.IOException; import java.util.Iterator; @@ -49,9 +51,6 @@ import org.junit.Ignore; import org.junit.Test; -import static org.apache.iceberg.types.Types.NestedField.optional; -import static org.apache.iceberg.types.Types.NestedField.required; - public class TestParquetVectorizedReads extends AvroDataTest { private static final int NUM_ROWS = 200_000; static final int BATCH_SIZE = 10_000; @@ -64,24 +63,44 @@ protected void writeAndValidate(Schema schema) throws IOException { } private void writeAndValidate( - Schema schema, int numRecords, long seed, float nullPercentage, - boolean setAndCheckArrowValidityVector, boolean reuseContainers) - throws IOException { - writeAndValidate(schema, numRecords, seed, nullPercentage, - setAndCheckArrowValidityVector, reuseContainers, BATCH_SIZE, IDENTITY); + Schema schema, + int numRecords, + long seed, + float nullPercentage, + boolean setAndCheckArrowValidityVector, + boolean reuseContainers) + throws IOException { + writeAndValidate( + schema, + numRecords, + seed, + nullPercentage, + setAndCheckArrowValidityVector, + reuseContainers, + BATCH_SIZE, + IDENTITY); } private void writeAndValidate( - Schema schema, int numRecords, long seed, float nullPercentage, - boolean setAndCheckArrowValidityVector, boolean reuseContainers, int batchSize, - Function transform) + Schema schema, + int numRecords, + long seed, + float nullPercentage, + boolean setAndCheckArrowValidityVector, + boolean reuseContainers, + int batchSize, + Function transform) throws IOException { // Write test data - Assume.assumeTrue("Parquet Avro cannot write non-string map keys", null == TypeUtil.find( - schema, - type -> type.isMapType() && type.asMapType().keyType() != Types.StringType.get())); + Assume.assumeTrue( + "Parquet Avro cannot write non-string map keys", + null + == TypeUtil.find( + schema, + type -> type.isMapType() 
&& type.asMapType().keyType() != Types.StringType.get())); - Iterable expected = generateData(schema, numRecords, seed, nullPercentage, transform); + Iterable expected = + generateData(schema, numRecords, seed, nullPercentage, transform); // write a test parquet file using iceberg writer File testFile = temp.newFile(); @@ -90,58 +109,74 @@ private void writeAndValidate( try (FileAppender writer = getParquetWriter(schema, testFile)) { writer.addAll(expected); } - assertRecordsMatch(schema, numRecords, expected, testFile, setAndCheckArrowValidityVector, - reuseContainers, batchSize); + assertRecordsMatch( + schema, + numRecords, + expected, + testFile, + setAndCheckArrowValidityVector, + reuseContainers, + batchSize); } protected int getNumRows() { return NUM_ROWS; } - Iterable generateData(Schema schema, int numRecords, long seed, float nullPercentage, - Function transform) { - Iterable data = RandomData.generate(schema, numRecords, seed, nullPercentage); + Iterable generateData( + Schema schema, + int numRecords, + long seed, + float nullPercentage, + Function transform) { + Iterable data = + RandomData.generate(schema, numRecords, seed, nullPercentage); return transform == IDENTITY ? data : Iterables.transform(data, transform); } - FileAppender getParquetWriter(Schema schema, File testFile) throws IOException { + FileAppender getParquetWriter(Schema schema, File testFile) + throws IOException { + return Parquet.write(Files.localOutput(testFile)).schema(schema).named("test").build(); + } + + FileAppender getParquetV2Writer(Schema schema, File testFile) + throws IOException { return Parquet.write(Files.localOutput(testFile)) .schema(schema) .named("test") + .writerVersion(ParquetProperties.WriterVersion.PARQUET_2_0) .build(); } - FileAppender getParquetV2Writer(Schema schema, File testFile) throws IOException { - return Parquet.write(Files.localOutput(testFile)) - .schema(schema) - .named("test") - .writerVersion(ParquetProperties.WriterVersion.PARQUET_2_0) - .build(); - } - void assertRecordsMatch( - Schema schema, int expectedSize, Iterable expected, File testFile, - boolean setAndCheckArrowValidityBuffer, boolean reuseContainers, int batchSize) + Schema schema, + int expectedSize, + Iterable expected, + File testFile, + boolean setAndCheckArrowValidityBuffer, + boolean reuseContainers, + int batchSize) throws IOException { - Parquet.ReadBuilder readBuilder = Parquet.read(Files.localInput(testFile)) - .project(schema) - .recordsPerBatch(batchSize) - .createBatchedReaderFunc(type -> VectorizedSparkParquetReaders.buildReader( - schema, - type, - setAndCheckArrowValidityBuffer)); + Parquet.ReadBuilder readBuilder = + Parquet.read(Files.localInput(testFile)) + .project(schema) + .recordsPerBatch(batchSize) + .createBatchedReaderFunc( + type -> + VectorizedSparkParquetReaders.buildReader( + schema, type, setAndCheckArrowValidityBuffer)); if (reuseContainers) { readBuilder.reuseContainers(); } - try (CloseableIterable batchReader = - readBuilder.build()) { + try (CloseableIterable batchReader = readBuilder.build()) { Iterator expectedIter = expected.iterator(); Iterator batches = batchReader.iterator(); int numRowsRead = 0; while (batches.hasNext()) { ColumnarBatch batch = batches.next(); numRowsRead += batch.numRows(); - TestHelpers.assertEqualsBatch(schema.asStruct(), expectedIter, batch, setAndCheckArrowValidityBuffer); + TestHelpers.assertEqualsBatch( + schema.asStruct(), expectedIter, batch, setAndCheckArrowValidityBuffer); } Assert.assertEquals(expectedSize, numRowsRead); } @@ -150,44 
+185,37 @@ void assertRecordsMatch( @Override @Test @Ignore - public void testArray() { - } + public void testArray() {} @Override @Test @Ignore - public void testArrayOfStructs() { - } + public void testArrayOfStructs() {} @Override @Test @Ignore - public void testMap() { - } + public void testMap() {} @Override @Test @Ignore - public void testNumericMapKey() { - } + public void testNumericMapKey() {} @Override @Test @Ignore - public void testComplexMapKey() { - } + public void testComplexMapKey() {} @Override @Test @Ignore - public void testMapOfStructs() { - } + public void testMapOfStructs() {} @Override @Test @Ignore - public void testMixedTypes() { - } + public void testMixedTypes() {} @Test @Override @@ -196,13 +224,13 @@ public void testNestedStruct() { "Vectorized reads are not supported yet for struct fields", UnsupportedOperationException.class, "Vectorized reads are not supported yet for struct fields", - () -> VectorizedSparkParquetReaders.buildReader( - TypeUtil.assignIncreasingFreshIds(new Schema(required( - 1, - "struct", - SUPPORTED_PRIMITIVES))), - new MessageType("struct", new GroupType(Type.Repetition.OPTIONAL, "struct").withId(1)), - false)); + () -> + VectorizedSparkParquetReaders.buildReader( + TypeUtil.assignIncreasingFreshIds( + new Schema(required(1, "struct", SUPPORTED_PRIMITIVES))), + new MessageType( + "struct", new GroupType(Type.Repetition.OPTIONAL, "struct").withId(1)), + false)); } @Test @@ -218,27 +246,40 @@ public void testMostlyNullsForOptionalFields() throws IOException { @Test public void testSettingArrowValidityVector() throws IOException { - writeAndValidate(new Schema( - Lists.transform(SUPPORTED_PRIMITIVES.fields(), Types.NestedField::asOptional)), - getNumRows(), 0L, RandomData.DEFAULT_NULL_PERCENTAGE, true, true); + writeAndValidate( + new Schema(Lists.transform(SUPPORTED_PRIMITIVES.fields(), Types.NestedField::asOptional)), + getNumRows(), + 0L, + RandomData.DEFAULT_NULL_PERCENTAGE, + true, + true); } @Test public void testVectorizedReadsWithNewContainers() throws IOException { - writeAndValidate(TypeUtil.assignIncreasingFreshIds(new Schema(SUPPORTED_PRIMITIVES.fields())), - getNumRows(), 0L, RandomData.DEFAULT_NULL_PERCENTAGE, true, false); + writeAndValidate( + TypeUtil.assignIncreasingFreshIds(new Schema(SUPPORTED_PRIMITIVES.fields())), + getNumRows(), + 0L, + RandomData.DEFAULT_NULL_PERCENTAGE, + true, + false); } @Test public void testVectorizedReadsWithReallocatedArrowBuffers() throws IOException { // With a batch size of 2, 256 bytes are allocated in the VarCharVector. By adding strings of // length 512, the vector will need to be reallocated for storing the batch. 
- writeAndValidate(new Schema( + writeAndValidate( + new Schema( Lists.newArrayList( - SUPPORTED_PRIMITIVES.field("id"), - SUPPORTED_PRIMITIVES.field("data"))), - 10, 0L, RandomData.DEFAULT_NULL_PERCENTAGE, - true, true, 2, + SUPPORTED_PRIMITIVES.field("id"), SUPPORTED_PRIMITIVES.field("data"))), + 10, + 0L, + RandomData.DEFAULT_NULL_PERCENTAGE, + true, + true, + 2, record -> { if (record.get("data") != null) { record.put("data", Strings.padEnd((String) record.get("data"), 512, 'a')); @@ -251,65 +292,67 @@ record -> { @Test public void testReadsForTypePromotedColumns() throws Exception { - Schema writeSchema = new Schema( - required(100, "id", Types.LongType.get()), - optional(101, "int_data", Types.IntegerType.get()), - optional(102, "float_data", Types.FloatType.get()), - optional(103, "decimal_data", Types.DecimalType.of(10, 5)) - ); + Schema writeSchema = + new Schema( + required(100, "id", Types.LongType.get()), + optional(101, "int_data", Types.IntegerType.get()), + optional(102, "float_data", Types.FloatType.get()), + optional(103, "decimal_data", Types.DecimalType.of(10, 5))); File dataFile = temp.newFile(); Assert.assertTrue("Delete should succeed", dataFile.delete()); - Iterable data = generateData(writeSchema, 30000, 0L, - RandomData.DEFAULT_NULL_PERCENTAGE, IDENTITY); + Iterable data = + generateData(writeSchema, 30000, 0L, RandomData.DEFAULT_NULL_PERCENTAGE, IDENTITY); try (FileAppender writer = getParquetWriter(writeSchema, dataFile)) { writer.addAll(data); } - Schema readSchema = new Schema( - required(100, "id", Types.LongType.get()), - optional(101, "int_data", Types.LongType.get()), - optional(102, "float_data", Types.DoubleType.get()), - optional(103, "decimal_data", Types.DecimalType.of(25, 5)) - ); + Schema readSchema = + new Schema( + required(100, "id", Types.LongType.get()), + optional(101, "int_data", Types.LongType.get()), + optional(102, "float_data", Types.DoubleType.get()), + optional(103, "decimal_data", Types.DecimalType.of(25, 5))); - assertRecordsMatch(readSchema, 30000, data, dataFile, false, - true, BATCH_SIZE); + assertRecordsMatch(readSchema, 30000, data, dataFile, false, true, BATCH_SIZE); } @Test public void testSupportedReadsForParquetV2() throws Exception { // Only float and double column types are written using plain encoding with Parquet V2 - Schema schema = new Schema( + Schema schema = + new Schema( optional(102, "float_data", Types.FloatType.get()), optional(103, "double_data", Types.DoubleType.get())); File dataFile = temp.newFile(); Assert.assertTrue("Delete should succeed", dataFile.delete()); - Iterable data = generateData(schema, 30000, 0L, - RandomData.DEFAULT_NULL_PERCENTAGE, IDENTITY); + Iterable data = + generateData(schema, 30000, 0L, RandomData.DEFAULT_NULL_PERCENTAGE, IDENTITY); try (FileAppender writer = getParquetV2Writer(schema, dataFile)) { writer.addAll(data); } - assertRecordsMatch(schema, 30000, data, dataFile, false, - true, BATCH_SIZE); + assertRecordsMatch(schema, 30000, data, dataFile, false, true, BATCH_SIZE); } @Test public void testUnsupportedReadsForParquetV2() throws Exception { - // Longs, ints, string types etc use delta encoding and which are not supported for vectorized reads + // Longs, ints, string types etc use delta encoding and which are not supported for vectorized + // reads Schema schema = new Schema(SUPPORTED_PRIMITIVES.fields()); File dataFile = temp.newFile(); Assert.assertTrue("Delete should succeed", dataFile.delete()); - Iterable data = generateData(schema, 30000, 0L, - 
RandomData.DEFAULT_NULL_PERCENTAGE, IDENTITY); + Iterable data = + generateData(schema, 30000, 0L, RandomData.DEFAULT_NULL_PERCENTAGE, IDENTITY); try (FileAppender writer = getParquetV2Writer(schema, dataFile)) { writer.addAll(data); } - AssertHelpers.assertThrows("Vectorized reads not supported", - UnsupportedOperationException.class, "Cannot support vectorized reads for column", () -> { - assertRecordsMatch(schema, 30000, data, dataFile, false, - true, BATCH_SIZE); + AssertHelpers.assertThrows( + "Vectorized reads not supported", + UnsupportedOperationException.class, + "Cannot support vectorized reads for column", + () -> { + assertRecordsMatch(schema, 30000, data, dataFile, false, true, BATCH_SIZE); return null; }); } diff --git a/spark/v3.2/spark/src/test/java/org/apache/iceberg/spark/source/FilePathLastModifiedRecord.java b/spark/v3.2/spark/src/test/java/org/apache/iceberg/spark/source/FilePathLastModifiedRecord.java index 275e3a520db5..c62c1de6ba33 100644 --- a/spark/v3.2/spark/src/test/java/org/apache/iceberg/spark/source/FilePathLastModifiedRecord.java +++ b/spark/v3.2/spark/src/test/java/org/apache/iceberg/spark/source/FilePathLastModifiedRecord.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.source; import java.sql.Timestamp; @@ -26,8 +25,7 @@ public class FilePathLastModifiedRecord { private String filePath; private Timestamp lastModified; - public FilePathLastModifiedRecord() { - } + public FilePathLastModifiedRecord() {} public FilePathLastModifiedRecord(String filePath, Timestamp lastModified) { this.filePath = filePath; @@ -59,8 +57,8 @@ public boolean equals(Object o) { return false; } FilePathLastModifiedRecord that = (FilePathLastModifiedRecord) o; - return Objects.equals(filePath, that.filePath) && - Objects.equals(lastModified, that.lastModified); + return Objects.equals(filePath, that.filePath) + && Objects.equals(lastModified, that.lastModified); } @Override @@ -70,9 +68,13 @@ public int hashCode() { @Override public String toString() { - return "FilePathLastModifiedRecord{" + - "filePath='" + filePath + '\'' + - ", lastModified='" + lastModified + '\'' + - '}'; + return "FilePathLastModifiedRecord{" + + "filePath='" + + filePath + + '\'' + + ", lastModified='" + + lastModified + + '\'' + + '}'; } } diff --git a/spark/v3.2/spark/src/test/java/org/apache/iceberg/spark/source/LogMessage.java b/spark/v3.2/spark/src/test/java/org/apache/iceberg/spark/source/LogMessage.java index 5e22daeb0841..53a35eec61ce 100644 --- a/spark/v3.2/spark/src/test/java/org/apache/iceberg/spark/source/LogMessage.java +++ b/spark/v3.2/spark/src/test/java/org/apache/iceberg/spark/source/LogMessage.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.source; import java.time.Instant; diff --git a/spark/v3.2/spark/src/test/java/org/apache/iceberg/spark/source/ManualSource.java b/spark/v3.2/spark/src/test/java/org/apache/iceberg/spark/source/ManualSource.java index 0f8c8b3b65c6..c9c1c29ea8fc 100644 --- a/spark/v3.2/spark/src/test/java/org/apache/iceberg/spark/source/ManualSource.java +++ b/spark/v3.2/spark/src/test/java/org/apache/iceberg/spark/source/ManualSource.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.spark.source; import java.util.Map; @@ -35,7 +34,8 @@ public class ManualSource implements TableProvider, DataSourceRegister { private static final Map tableMap = Maps.newHashMap(); public static void setTable(String name, Table table) { - Preconditions.checkArgument(!tableMap.containsKey(name), "Cannot set " + name + ". It is already set"); + Preconditions.checkArgument( + !tableMap.containsKey(name), "Cannot set " + name + ". It is already set"); tableMap.put(name, table); } @@ -61,7 +61,8 @@ public Transform[] inferPartitioning(CaseInsensitiveStringMap options) { @Override public org.apache.spark.sql.connector.catalog.Table getTable( StructType schema, Transform[] partitioning, Map properties) { - Preconditions.checkArgument(properties.containsKey(TABLE_NAME), "Missing property " + TABLE_NAME); + Preconditions.checkArgument( + properties.containsKey(TABLE_NAME), "Missing property " + TABLE_NAME); String tableName = properties.get(TABLE_NAME); Preconditions.checkArgument(tableMap.containsKey(tableName), "Table missing " + tableName); return tableMap.get(tableName); diff --git a/spark/v3.2/spark/src/test/java/org/apache/iceberg/spark/source/SimpleRecord.java b/spark/v3.2/spark/src/test/java/org/apache/iceberg/spark/source/SimpleRecord.java index c8b7a31b3ba0..550e20b9338e 100644 --- a/spark/v3.2/spark/src/test/java/org/apache/iceberg/spark/source/SimpleRecord.java +++ b/spark/v3.2/spark/src/test/java/org/apache/iceberg/spark/source/SimpleRecord.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.source; import org.apache.iceberg.relocated.com.google.common.base.Objects; @@ -25,8 +24,7 @@ public class SimpleRecord { private Integer id; private String data; - public SimpleRecord() { - } + public SimpleRecord() {} public SimpleRecord(Integer id, String data) { this.id = id; diff --git a/spark/v3.2/spark/src/test/java/org/apache/iceberg/spark/source/TestAvroScan.java b/spark/v3.2/spark/src/test/java/org/apache/iceberg/spark/source/TestAvroScan.java index 4d2e12229813..9491adde4605 100644 --- a/spark/v3.2/spark/src/test/java/org/apache/iceberg/spark/source/TestAvroScan.java +++ b/spark/v3.2/spark/src/test/java/org/apache/iceberg/spark/source/TestAvroScan.java @@ -16,9 +16,10 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.spark.source; +import static org.apache.iceberg.Files.localOutput; + import java.io.File; import java.io.IOException; import java.util.List; @@ -46,13 +47,10 @@ import org.junit.Rule; import org.junit.rules.TemporaryFolder; -import static org.apache.iceberg.Files.localOutput; - public class TestAvroScan extends AvroDataTest { private static final Configuration CONF = new Configuration(); - @Rule - public TemporaryFolder temp = new TemporaryFolder(); + @Rule public TemporaryFolder temp = new TemporaryFolder(); private static SparkSession spark = null; @@ -75,8 +73,8 @@ protected void writeAndValidate(Schema schema) throws IOException { File dataFolder = new File(location, "data"); dataFolder.mkdirs(); - File avroFile = new File(dataFolder, - FileFormat.AVRO.addExtension(UUID.randomUUID().toString())); + File avroFile = + new File(dataFolder, FileFormat.AVRO.addExtension(UUID.randomUUID().toString())); HadoopTables tables = new HadoopTables(CONF); Table table = tables.create(schema, PartitionSpec.unpartitioned(), location.toString()); @@ -87,23 +85,21 @@ protected void writeAndValidate(Schema schema) throws IOException { List expected = RandomData.generateList(tableSchema, 100, 1L); - try (FileAppender writer = Avro.write(localOutput(avroFile)) - .schema(tableSchema) - .build()) { + try (FileAppender writer = + Avro.write(localOutput(avroFile)).schema(tableSchema).build()) { writer.addAll(expected); } - DataFile file = DataFiles.builder(PartitionSpec.unpartitioned()) - .withRecordCount(100) - .withFileSizeInBytes(avroFile.length()) - .withPath(avroFile.toString()) - .build(); + DataFile file = + DataFiles.builder(PartitionSpec.unpartitioned()) + .withRecordCount(100) + .withFileSizeInBytes(avroFile.length()) + .withPath(avroFile.toString()) + .build(); table.newAppend().appendFile(file).commit(); - Dataset df = spark.read() - .format("iceberg") - .load(location.toString()); + Dataset df = spark.read().format("iceberg").load(location.toString()); List rows = df.collectAsList(); Assert.assertEquals("Should contain 100 rows", 100, rows.size()); diff --git a/spark/v3.2/spark/src/test/java/org/apache/iceberg/spark/source/TestBaseReader.java b/spark/v3.2/spark/src/test/java/org/apache/iceberg/spark/source/TestBaseReader.java index 7f15cb28fa6b..cbcee867803f 100644 --- a/spark/v3.2/spark/src/test/java/org/apache/iceberg/spark/source/TestBaseReader.java +++ b/spark/v3.2/spark/src/test/java/org/apache/iceberg/spark/source/TestBaseReader.java @@ -16,9 +16,11 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.spark.source; +import static org.apache.iceberg.FileFormat.PARQUET; +import static org.apache.iceberg.Files.localOutput; + import java.io.File; import java.io.IOException; import java.util.Iterator; @@ -50,13 +52,9 @@ import org.junit.Test; import org.junit.rules.TemporaryFolder; -import static org.apache.iceberg.FileFormat.PARQUET; -import static org.apache.iceberg.Files.localOutput; - public class TestBaseReader { - @Rule - public TemporaryFolder temp = new TemporaryFolder(); + @Rule public TemporaryFolder temp = new TemporaryFolder(); private Table table; @@ -134,14 +132,12 @@ public void testClosureOnDataExhaustion() throws IOException { Assert.assertNotNull("Reader should return non-null value", reader.get()); } - Assert.assertEquals("Reader returned incorrect number of records", - totalTasks * recordPerTask, - countRecords - ); - tasks.forEach(t -> - Assert.assertTrue("All iterators should be closed after read exhausion", - reader.isIteratorClosed(t)) - ); + Assert.assertEquals( + "Reader returned incorrect number of records", totalTasks * recordPerTask, countRecords); + tasks.forEach( + t -> + Assert.assertTrue( + "All iterators should be closed after read exhausion", reader.isIteratorClosed(t))); } @Test @@ -157,13 +153,15 @@ public void testClosureDuringIteration() throws IOException { // Total of 2 elements Assert.assertTrue(reader.next()); - Assert.assertFalse("First iter should not be closed on its last element", - reader.isIteratorClosed(firstTask)); + Assert.assertFalse( + "First iter should not be closed on its last element", reader.isIteratorClosed(firstTask)); Assert.assertTrue(reader.next()); - Assert.assertTrue("First iter should be closed after moving to second iter", + Assert.assertTrue( + "First iter should be closed after moving to second iter", reader.isIteratorClosed(firstTask)); - Assert.assertFalse("Second iter should not be closed on its last element", + Assert.assertFalse( + "Second iter should not be closed on its last element", reader.isIteratorClosed(secondTask)); Assert.assertFalse(reader.next()); @@ -181,10 +179,10 @@ public void testClosureWithoutAnyRead() throws IOException { reader.close(); - tasks.forEach(t -> - Assert.assertFalse("Iterator should not be created eagerly for tasks", - reader.hasIterator(t)) - ); + tasks.forEach( + t -> + Assert.assertFalse( + "Iterator should not be created eagerly for tasks", reader.hasIterator(t))); } @Test @@ -205,12 +203,13 @@ public void testExplicitClosure() throws IOException { // Some tasks might have not been opened yet, so we don't have corresponding tracker for it. // But all that have been created must be closed. 
- tasks.forEach(t -> { - if (reader.hasIterator(t)) { - Assert.assertTrue("Iterator should be closed after read exhausion", - reader.isIteratorClosed(t)); - } - }); + tasks.forEach( + t -> { + if (reader.hasIterator(t)) { + Assert.assertTrue( + "Iterator should be closed after read exhausion", reader.isIteratorClosed(t)); + } + }); } @Test @@ -230,26 +229,26 @@ public void testIdempotentExplicitClosure() throws IOException { for (int closeAttempt = 0; closeAttempt < 5; closeAttempt++) { reader.close(); for (int i = 0; i < 5; i++) { - Assert.assertTrue("Iterator should be closed after read exhausion", + Assert.assertTrue( + "Iterator should be closed after read exhausion", reader.isIteratorClosed(tasks.get(i))); } for (int i = 5; i < 10; i++) { - Assert.assertFalse("Iterator should not be created eagerly for tasks", - reader.hasIterator(tasks.get(i))); + Assert.assertFalse( + "Iterator should not be created eagerly for tasks", reader.hasIterator(tasks.get(i))); } } } - private List createFileScanTasks(Integer totalTasks, Integer recordPerTask) throws IOException { + private List createFileScanTasks(Integer totalTasks, Integer recordPerTask) + throws IOException { String desc = "make_scan_tasks"; File parent = temp.newFolder(desc); File location = new File(parent, "test"); File dataFolder = new File(location, "data"); Assert.assertTrue("mkdirs should succeed", dataFolder.mkdirs()); - Schema schema = new Schema( - Types.NestedField.required(0, "id", Types.LongType.get()) - ); + Schema schema = new Schema(Types.NestedField.required(0, "id", Types.LongType.get())); try { this.table = TestTables.create(location, desc, schema, PartitionSpec.unpartitioned()); @@ -261,22 +260,21 @@ private List createFileScanTasks(Integer totalTasks, Integer recor AppendFiles appendFiles = table.newAppend(); for (int i = 0; i < totalTasks; i++) { File parquetFile = new File(dataFolder, PARQUET.addExtension(UUID.randomUUID().toString())); - try (FileAppender writer = Parquet.write(localOutput(parquetFile)) - .schema(tableSchema) - .build()) { + try (FileAppender writer = + Parquet.write(localOutput(parquetFile)).schema(tableSchema).build()) { writer.addAll(expected); } - DataFile file = DataFiles.builder(PartitionSpec.unpartitioned()) - .withFileSizeInBytes(parquetFile.length()) - .withPath(parquetFile.toString()) - .withRecordCount(recordPerTask) - .build(); + DataFile file = + DataFiles.builder(PartitionSpec.unpartitioned()) + .withFileSizeInBytes(parquetFile.length()) + .withPath(parquetFile.toString()) + .withRecordCount(recordPerTask) + .build(); appendFiles.appendFile(file); } appendFiles.commit(); - return StreamSupport - .stream(table.newScan().planFiles().spliterator(), false) + return StreamSupport.stream(table.newScan().planFiles().spliterator(), false) .collect(Collectors.toList()); } finally { TestTables.clearTables(); diff --git a/spark/v3.2/spark/src/test/java/org/apache/iceberg/spark/source/TestDataFrameWriterV2.java b/spark/v3.2/spark/src/test/java/org/apache/iceberg/spark/source/TestDataFrameWriterV2.java index fdee5911994e..6b4e77d1de3f 100644 --- a/spark/v3.2/spark/src/test/java/org/apache/iceberg/spark/source/TestDataFrameWriterV2.java +++ b/spark/v3.2/spark/src/test/java/org/apache/iceberg/spark/source/TestDataFrameWriterV2.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.spark.source; import org.apache.iceberg.AssertHelpers; @@ -44,27 +43,35 @@ public void removeTables() { @Test public void testMergeSchemaFailsWithoutWriterOption() throws Exception { - sql("ALTER TABLE %s SET TBLPROPERTIES ('%s'='true')", tableName, TableProperties.SPARK_WRITE_ACCEPT_ANY_SCHEMA); + sql( + "ALTER TABLE %s SET TBLPROPERTIES ('%s'='true')", + tableName, TableProperties.SPARK_WRITE_ACCEPT_ANY_SCHEMA); - Dataset twoColDF = jsonToDF( - "id bigint, data string", - "{ \"id\": 1, \"data\": \"a\" }", - "{ \"id\": 2, \"data\": \"b\" }"); + Dataset twoColDF = + jsonToDF( + "id bigint, data string", + "{ \"id\": 1, \"data\": \"a\" }", + "{ \"id\": 2, \"data\": \"b\" }"); twoColDF.writeTo(tableName).append(); - assertEquals("Should have initial 2-column rows", + assertEquals( + "Should have initial 2-column rows", ImmutableList.of(row(1L, "a"), row(2L, "b")), sql("select * from %s order by id", tableName)); - Dataset threeColDF = jsonToDF( - "id bigint, data string, new_col float", - "{ \"id\": 3, \"data\": \"c\", \"new_col\": 12.06 }", - "{ \"id\": 4, \"data\": \"d\", \"new_col\": 14.41 }"); - - // this has a different error message than the case without accept-any-schema because it uses Iceberg checks - AssertHelpers.assertThrows("Should fail when merge-schema is not enabled on the writer", - IllegalArgumentException.class, "Field new_col not found in source schema", + Dataset threeColDF = + jsonToDF( + "id bigint, data string, new_col float", + "{ \"id\": 3, \"data\": \"c\", \"new_col\": 12.06 }", + "{ \"id\": 4, \"data\": \"d\", \"new_col\": 14.41 }"); + + // this has a different error message than the case without accept-any-schema because it uses + // Iceberg checks + AssertHelpers.assertThrows( + "Should fail when merge-schema is not enabled on the writer", + IllegalArgumentException.class, + "Field new_col not found in source schema", () -> { try { threeColDF.writeTo(tableName).append(); @@ -77,24 +84,29 @@ public void testMergeSchemaFailsWithoutWriterOption() throws Exception { @Test public void testMergeSchemaWithoutAcceptAnySchema() throws Exception { - Dataset twoColDF = jsonToDF( - "id bigint, data string", - "{ \"id\": 1, \"data\": \"a\" }", - "{ \"id\": 2, \"data\": \"b\" }"); + Dataset twoColDF = + jsonToDF( + "id bigint, data string", + "{ \"id\": 1, \"data\": \"a\" }", + "{ \"id\": 2, \"data\": \"b\" }"); twoColDF.writeTo(tableName).append(); - assertEquals("Should have initial 2-column rows", + assertEquals( + "Should have initial 2-column rows", ImmutableList.of(row(1L, "a"), row(2L, "b")), sql("select * from %s order by id", tableName)); - Dataset threeColDF = jsonToDF( - "id bigint, data string, new_col float", - "{ \"id\": 3, \"data\": \"c\", \"new_col\": 12.06 }", - "{ \"id\": 4, \"data\": \"d\", \"new_col\": 14.41 }"); + Dataset threeColDF = + jsonToDF( + "id bigint, data string, new_col float", + "{ \"id\": 3, \"data\": \"c\", \"new_col\": 12.06 }", + "{ \"id\": 4, \"data\": \"d\", \"new_col\": 14.41 }"); - AssertHelpers.assertThrows("Should fail when accept-any-schema is not enabled on the table", - AnalysisException.class, "too many data columns", + AssertHelpers.assertThrows( + "Should fail when accept-any-schema is not enabled on the table", + AnalysisException.class, + "too many data columns", () -> { try { threeColDF.writeTo(tableName).option("merge-schema", "true").append(); @@ -107,55 +119,69 @@ public void testMergeSchemaWithoutAcceptAnySchema() throws Exception { @Test public void testMergeSchemaSparkProperty() throws 
Exception { - sql("ALTER TABLE %s SET TBLPROPERTIES ('%s'='true')", tableName, TableProperties.SPARK_WRITE_ACCEPT_ANY_SCHEMA); + sql( + "ALTER TABLE %s SET TBLPROPERTIES ('%s'='true')", + tableName, TableProperties.SPARK_WRITE_ACCEPT_ANY_SCHEMA); - Dataset twoColDF = jsonToDF( - "id bigint, data string", - "{ \"id\": 1, \"data\": \"a\" }", - "{ \"id\": 2, \"data\": \"b\" }"); + Dataset twoColDF = + jsonToDF( + "id bigint, data string", + "{ \"id\": 1, \"data\": \"a\" }", + "{ \"id\": 2, \"data\": \"b\" }"); twoColDF.writeTo(tableName).append(); - assertEquals("Should have initial 2-column rows", + assertEquals( + "Should have initial 2-column rows", ImmutableList.of(row(1L, "a"), row(2L, "b")), sql("select * from %s order by id", tableName)); - Dataset threeColDF = jsonToDF( - "id bigint, data string, new_col float", - "{ \"id\": 3, \"data\": \"c\", \"new_col\": 12.06 }", - "{ \"id\": 4, \"data\": \"d\", \"new_col\": 14.41 }"); + Dataset threeColDF = + jsonToDF( + "id bigint, data string, new_col float", + "{ \"id\": 3, \"data\": \"c\", \"new_col\": 12.06 }", + "{ \"id\": 4, \"data\": \"d\", \"new_col\": 14.41 }"); threeColDF.writeTo(tableName).option("mergeSchema", "true").append(); - assertEquals("Should have 3-column rows", - ImmutableList.of(row(1L, "a", null), row(2L, "b", null), row(3L, "c", 12.06F), row(4L, "d", 14.41F)), + assertEquals( + "Should have 3-column rows", + ImmutableList.of( + row(1L, "a", null), row(2L, "b", null), row(3L, "c", 12.06F), row(4L, "d", 14.41F)), sql("select * from %s order by id", tableName)); } @Test public void testMergeSchemaIcebergProperty() throws Exception { - sql("ALTER TABLE %s SET TBLPROPERTIES ('%s'='true')", tableName, TableProperties.SPARK_WRITE_ACCEPT_ANY_SCHEMA); + sql( + "ALTER TABLE %s SET TBLPROPERTIES ('%s'='true')", + tableName, TableProperties.SPARK_WRITE_ACCEPT_ANY_SCHEMA); - Dataset twoColDF = jsonToDF( - "id bigint, data string", - "{ \"id\": 1, \"data\": \"a\" }", - "{ \"id\": 2, \"data\": \"b\" }"); + Dataset twoColDF = + jsonToDF( + "id bigint, data string", + "{ \"id\": 1, \"data\": \"a\" }", + "{ \"id\": 2, \"data\": \"b\" }"); twoColDF.writeTo(tableName).append(); - assertEquals("Should have initial 2-column rows", + assertEquals( + "Should have initial 2-column rows", ImmutableList.of(row(1L, "a"), row(2L, "b")), sql("select * from %s order by id", tableName)); - Dataset threeColDF = jsonToDF( - "id bigint, data string, new_col float", - "{ \"id\": 3, \"data\": \"c\", \"new_col\": 12.06 }", - "{ \"id\": 4, \"data\": \"d\", \"new_col\": 14.41 }"); + Dataset threeColDF = + jsonToDF( + "id bigint, data string, new_col float", + "{ \"id\": 3, \"data\": \"c\", \"new_col\": 12.06 }", + "{ \"id\": 4, \"data\": \"d\", \"new_col\": 14.41 }"); threeColDF.writeTo(tableName).option("merge-schema", "true").append(); - assertEquals("Should have 3-column rows", - ImmutableList.of(row(1L, "a", null), row(2L, "b", null), row(3L, "c", 12.06F), row(4L, "d", 14.41F)), + assertEquals( + "Should have 3-column rows", + ImmutableList.of( + row(1L, "a", null), row(2L, "b", null), row(3L, "c", 12.06F), row(4L, "d", 14.41F)), sql("select * from %s order by id", tableName)); } } diff --git a/spark/v3.2/spark/src/test/java/org/apache/iceberg/spark/source/TestDataFrameWrites.java b/spark/v3.2/spark/src/test/java/org/apache/iceberg/spark/source/TestDataFrameWrites.java index b0a77b72b431..9f32769379c8 100644 --- a/spark/v3.2/spark/src/test/java/org/apache/iceberg/spark/source/TestDataFrameWrites.java +++ 
b/spark/v3.2/spark/src/test/java/org/apache/iceberg/spark/source/TestDataFrameWrites.java @@ -16,9 +16,12 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.source; +import static org.apache.iceberg.spark.SparkSchemaUtil.convert; +import static org.apache.iceberg.spark.data.TestHelpers.assertEqualsSafe; +import static org.apache.iceberg.spark.data.TestHelpers.assertEqualsUnsafe; + import java.io.File; import java.io.IOException; import java.net.URI; @@ -68,10 +71,6 @@ import org.junit.runner.RunWith; import org.junit.runners.Parameterized; -import static org.apache.iceberg.spark.SparkSchemaUtil.convert; -import static org.apache.iceberg.spark.data.TestHelpers.assertEqualsSafe; -import static org.apache.iceberg.spark.data.TestHelpers.assertEqualsUnsafe; - @RunWith(Parameterized.class) public class TestDataFrameWrites extends AvroDataTest { private static final Configuration CONF = new Configuration(); @@ -80,7 +79,7 @@ public class TestDataFrameWrites extends AvroDataTest { @Parameterized.Parameters(name = "format = {0}") public static Object[] parameters() { - return new Object[] { "parquet", "avro", "orc" }; + return new Object[] {"parquet", "avro", "orc"}; } public TestDataFrameWrites(String format) { @@ -92,32 +91,36 @@ public TestDataFrameWrites(String format) { private Map tableProperties; - private org.apache.spark.sql.types.StructType sparkSchema = new org.apache.spark.sql.types.StructType( - new org.apache.spark.sql.types.StructField[] { - new org.apache.spark.sql.types.StructField( - "optionalField", - org.apache.spark.sql.types.DataTypes.StringType, - true, - org.apache.spark.sql.types.Metadata.empty()), - new org.apache.spark.sql.types.StructField( - "requiredField", - org.apache.spark.sql.types.DataTypes.StringType, - false, - org.apache.spark.sql.types.Metadata.empty()) - }); - - private Schema icebergSchema = new Schema( - Types.NestedField.optional(1, "optionalField", Types.StringType.get()), - Types.NestedField.required(2, "requiredField", Types.StringType.get())); - - private List data0 = Arrays.asList( - "{\"optionalField\": \"a1\", \"requiredField\": \"bid_001\"}", - "{\"optionalField\": \"a2\", \"requiredField\": \"bid_002\"}"); - private List data1 = Arrays.asList( - "{\"optionalField\": \"d1\", \"requiredField\": \"bid_101\"}", - "{\"optionalField\": \"d2\", \"requiredField\": \"bid_102\"}", - "{\"optionalField\": \"d3\", \"requiredField\": \"bid_103\"}", - "{\"optionalField\": \"d4\", \"requiredField\": \"bid_104\"}"); + private org.apache.spark.sql.types.StructType sparkSchema = + new org.apache.spark.sql.types.StructType( + new org.apache.spark.sql.types.StructField[] { + new org.apache.spark.sql.types.StructField( + "optionalField", + org.apache.spark.sql.types.DataTypes.StringType, + true, + org.apache.spark.sql.types.Metadata.empty()), + new org.apache.spark.sql.types.StructField( + "requiredField", + org.apache.spark.sql.types.DataTypes.StringType, + false, + org.apache.spark.sql.types.Metadata.empty()) + }); + + private Schema icebergSchema = + new Schema( + Types.NestedField.optional(1, "optionalField", Types.StringType.get()), + Types.NestedField.required(2, "requiredField", Types.StringType.get())); + + private List data0 = + Arrays.asList( + "{\"optionalField\": \"a1\", \"requiredField\": \"bid_001\"}", + "{\"optionalField\": \"a2\", \"requiredField\": \"bid_002\"}"); + private List data1 = + Arrays.asList( + "{\"optionalField\": \"d1\", \"requiredField\": \"bid_101\"}", + 
"{\"optionalField\": \"d2\", \"requiredField\": \"bid_102\"}", + "{\"optionalField\": \"d3\", \"requiredField\": \"bid_103\"}", + "{\"optionalField\": \"d4\", \"requiredField\": \"bid_104\"}"); @BeforeClass public static void startSpark() { @@ -145,8 +148,10 @@ public void testWriteWithCustomDataLocation() throws IOException { File location = createTableFolder(); File tablePropertyDataLocation = temp.newFolder("test-table-property-data-dir"); Table table = createTable(new Schema(SUPPORTED_PRIMITIVES.fields()), location); - table.updateProperties().set( - TableProperties.WRITE_DATA_LOCATION, tablePropertyDataLocation.getAbsolutePath()).commit(); + table + .updateProperties() + .set(TableProperties.WRITE_DATA_LOCATION, tablePropertyDataLocation.getAbsolutePath()) + .commit(); writeAndValidateWithLocations(table, location, tablePropertyDataLocation); } @@ -162,7 +167,8 @@ private Table createTable(Schema schema, File location) { return tables.create(schema, PartitionSpec.unpartitioned(), location.toString()); } - private void writeAndValidateWithLocations(Table table, File location, File expectedDataDir) throws IOException { + private void writeAndValidateWithLocations(Table table, File location, File expectedDataDir) + throws IOException { Schema tableSchema = table.schema(); // use the table schema because ids are reassigned table.updateProperties().set(TableProperties.DEFAULT_FILE_FORMAT, format).commit(); @@ -179,47 +185,56 @@ private void writeAndValidateWithLocations(Table table, File location, File expe while (expectedIter.hasNext() && actualIter.hasNext()) { assertEqualsSafe(tableSchema.asStruct(), expectedIter.next(), actualIter.next()); } - Assert.assertEquals("Both iterators should be exhausted", expectedIter.hasNext(), actualIter.hasNext()); - - table.currentSnapshot().addedDataFiles(table.io()).forEach(dataFile -> - Assert.assertTrue( - String.format( - "File should have the parent directory %s, but has: %s.", - expectedDataDir.getAbsolutePath(), - dataFile.path()), - URI.create(dataFile.path().toString()).getPath().startsWith(expectedDataDir.getAbsolutePath()))); + Assert.assertEquals( + "Both iterators should be exhausted", expectedIter.hasNext(), actualIter.hasNext()); + + table + .currentSnapshot() + .addedDataFiles(table.io()) + .forEach( + dataFile -> + Assert.assertTrue( + String.format( + "File should have the parent directory %s, but has: %s.", + expectedDataDir.getAbsolutePath(), dataFile.path()), + URI.create(dataFile.path().toString()) + .getPath() + .startsWith(expectedDataDir.getAbsolutePath()))); } private List readTable(String location) { - Dataset result = spark.read() - .format("iceberg") - .load(location); + Dataset result = spark.read().format("iceberg").load(location); return result.collectAsList(); } - private void writeData(Iterable records, Schema schema, String location) throws IOException { + private void writeData(Iterable records, Schema schema, String location) + throws IOException { Dataset df = createDataset(records, schema); DataFrameWriter writer = df.write().format("iceberg").mode("append"); writer.save(location); } - private void writeDataWithFailOnPartition(Iterable records, Schema schema, String location) - throws IOException, SparkException { + private void writeDataWithFailOnPartition( + Iterable records, Schema schema, String location) throws IOException, SparkException { final int numPartitions = 10; final int partitionToFail = new Random().nextInt(numPartitions); - MapPartitionsFunction failOnFirstPartitionFunc = (MapPartitionsFunction) 
input -> { - int partitionId = TaskContext.getPartitionId(); - - if (partitionId == partitionToFail) { - throw new SparkException(String.format("Intended exception in partition %d !", partitionId)); - } - return input; - }; - - Dataset df = createDataset(records, schema) - .repartition(numPartitions) - .mapPartitions(failOnFirstPartitionFunc, RowEncoder.apply(convert(schema))); + MapPartitionsFunction failOnFirstPartitionFunc = + (MapPartitionsFunction) + input -> { + int partitionId = TaskContext.getPartitionId(); + + if (partitionId == partitionToFail) { + throw new SparkException( + String.format("Intended exception in partition %d !", partitionId)); + } + return input; + }; + + Dataset df = + createDataset(records, schema) + .repartition(numPartitions) + .mapPartitions(failOnFirstPartitionFunc, RowEncoder.apply(convert(schema))); // This trick is needed because Spark 3 handles decimal overflow in RowEncoder which "changes" // nullability of the column to "true" regardless of original nullability. // Setting "check-nullability" option to "false" doesn't help as it fails at Spark analyzer. @@ -234,10 +249,8 @@ private Dataset createDataset(Iterable records, Schema schema) thro File testFile = temp.newFile(); Assert.assertTrue("Delete should succeed", testFile.delete()); - try (FileAppender writer = Avro.write(Files.localOutput(testFile)) - .schema(schema) - .named("test") - .build()) { + try (FileAppender writer = + Avro.write(Files.localOutput(testFile)).schema(schema).named("test").build()) { for (Record rec : records) { writer.add(rec); } @@ -245,10 +258,11 @@ private Dataset createDataset(Iterable records, Schema schema) thro // make sure the dataframe matches the records before moving on List rows = Lists.newArrayList(); - try (AvroIterable reader = Avro.read(Files.localInput(testFile)) - .createReaderFunc(SparkAvroReader::new) - .project(schema) - .build()) { + try (AvroIterable reader = + Avro.read(Files.localInput(testFile)) + .createReaderFunc(SparkAvroReader::new) + .project(schema) + .build()) { Iterator recordIter = records.iterator(); Iterator readIter = reader.iterator(); @@ -257,7 +271,8 @@ private Dataset createDataset(Iterable records, Schema schema) thro assertEqualsUnsafe(schema.asStruct(), recordIter.next(), row); rows.add(row); } - Assert.assertEquals("Both iterators should be exhausted", recordIter.hasNext(), readIter.hasNext()); + Assert.assertEquals( + "Both iterators should be exhausted", recordIter.hasNext(), readIter.hasNext()); } JavaRDD rdd = sc.parallelize(rows); @@ -266,7 +281,8 @@ private Dataset createDataset(Iterable records, Schema schema) thro @Test public void testNullableWithWriteOption() throws IOException { - Assume.assumeTrue("Spark 3.0 rejects writing nulls to a required column", spark.version().startsWith("2")); + Assume.assumeTrue( + "Spark 3.0 rejects writing nulls to a required column", spark.version().startsWith("2")); File location = new File(temp.newFolder("parquet"), "test"); String sourcePath = String.format("%s/nullable_poc/sourceFolder/", location.toString()); @@ -276,9 +292,11 @@ public void testNullableWithWriteOption() throws IOException { // read this and append to iceberg dataset spark - .read().schema(sparkSchema).json( - JavaSparkContext.fromSparkContext(spark.sparkContext()).parallelize(data1)) - .write().parquet(sourcePath); + .read() + .schema(sparkSchema) + .json(JavaSparkContext.fromSparkContext(spark.sparkContext()).parallelize(data1)) + .write() + .parquet(sourcePath); // this is our iceberg dataset to which we will 
append data new HadoopTables(spark.sessionState().newHadoopConf()) @@ -290,15 +308,24 @@ public void testNullableWithWriteOption() throws IOException { // this is the initial data inside the iceberg dataset spark - .read().schema(sparkSchema).json( - JavaSparkContext.fromSparkContext(spark.sparkContext()).parallelize(data0)) - .write().format("iceberg").mode(SaveMode.Append).save(targetPath); + .read() + .schema(sparkSchema) + .json(JavaSparkContext.fromSparkContext(spark.sparkContext()).parallelize(data0)) + .write() + .format("iceberg") + .mode(SaveMode.Append) + .save(targetPath); // read from parquet and append to iceberg w/ nullability check disabled spark - .read().schema(SparkSchemaUtil.convert(icebergSchema)).parquet(sourcePath) - .write().format("iceberg").option(SparkWriteOptions.CHECK_NULLABILITY, false) - .mode(SaveMode.Append).save(targetPath); + .read() + .schema(SparkSchemaUtil.convert(icebergSchema)) + .parquet(sourcePath) + .write() + .format("iceberg") + .option(SparkWriteOptions.CHECK_NULLABILITY, false) + .mode(SaveMode.Append) + .save(targetPath); // read all data List rows = spark.read().format("iceberg").load(targetPath).collectAsList(); @@ -307,7 +334,8 @@ public void testNullableWithWriteOption() throws IOException { @Test public void testNullableWithSparkSqlOption() throws IOException { - Assume.assumeTrue("Spark 3.0 rejects writing nulls to a required column", spark.version().startsWith("2")); + Assume.assumeTrue( + "Spark 3.0 rejects writing nulls to a required column", spark.version().startsWith("2")); File location = new File(temp.newFolder("parquet"), "test"); String sourcePath = String.format("%s/nullable_poc/sourceFolder/", location.toString()); @@ -317,15 +345,18 @@ public void testNullableWithSparkSqlOption() throws IOException { // read this and append to iceberg dataset spark - .read().schema(sparkSchema).json( - JavaSparkContext.fromSparkContext(spark.sparkContext()).parallelize(data1)) - .write().parquet(sourcePath); - - SparkSession newSparkSession = SparkSession.builder() - .master("local[2]") - .appName("NullableTest") - .config(SparkSQLProperties.CHECK_NULLABILITY, false) - .getOrCreate(); + .read() + .schema(sparkSchema) + .json(JavaSparkContext.fromSparkContext(spark.sparkContext()).parallelize(data1)) + .write() + .parquet(sourcePath); + + SparkSession newSparkSession = + SparkSession.builder() + .master("local[2]") + .appName("NullableTest") + .config(SparkSQLProperties.CHECK_NULLABILITY, false) + .getOrCreate(); // this is our iceberg dataset to which we will append data new HadoopTables(newSparkSession.sessionState().newHadoopConf()) @@ -337,19 +368,27 @@ public void testNullableWithSparkSqlOption() throws IOException { // this is the initial data inside the iceberg dataset newSparkSession - .read().schema(sparkSchema).json( - JavaSparkContext.fromSparkContext(spark.sparkContext()).parallelize(data0)) - .write().format("iceberg").mode(SaveMode.Append).save(targetPath); + .read() + .schema(sparkSchema) + .json(JavaSparkContext.fromSparkContext(spark.sparkContext()).parallelize(data0)) + .write() + .format("iceberg") + .mode(SaveMode.Append) + .save(targetPath); // read from parquet and append to iceberg newSparkSession - .read().schema(SparkSchemaUtil.convert(icebergSchema)).parquet(sourcePath) - .write().format("iceberg").mode(SaveMode.Append).save(targetPath); + .read() + .schema(SparkSchemaUtil.convert(icebergSchema)) + .parquet(sourcePath) + .write() + .format("iceberg") + .mode(SaveMode.Append) + .save(targetPath); // read all data List 
rows = newSparkSession.read().format("iceberg").load(targetPath).collectAsList(); Assert.assertEquals("Should contain 6 rows", 6, rows.size()); - } @Test diff --git a/spark/v3.2/spark/src/test/java/org/apache/iceberg/spark/source/TestDataSourceOptions.java b/spark/v3.2/spark/src/test/java/org/apache/iceberg/spark/source/TestDataSourceOptions.java index 0c56cb328648..60dd716c631e 100644 --- a/spark/v3.2/spark/src/test/java/org/apache/iceberg/spark/source/TestDataSourceOptions.java +++ b/spark/v3.2/spark/src/test/java/org/apache/iceberg/spark/source/TestDataSourceOptions.java @@ -16,9 +16,10 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.source; +import static org.apache.iceberg.types.Types.NestedField.optional; + import java.io.IOException; import java.math.RoundingMode; import java.util.List; @@ -58,19 +59,15 @@ import org.junit.Test; import org.junit.rules.TemporaryFolder; -import static org.apache.iceberg.types.Types.NestedField.optional; - public class TestDataSourceOptions { private static final Configuration CONF = new Configuration(); - private static final Schema SCHEMA = new Schema( - optional(1, "id", Types.IntegerType.get()), - optional(2, "data", Types.StringType.get()) - ); + private static final Schema SCHEMA = + new Schema( + optional(1, "id", Types.IntegerType.get()), optional(2, "data", Types.StringType.get())); private static SparkSession spark = null; - @Rule - public TemporaryFolder temp = new TemporaryFolder(); + @Rule public TemporaryFolder temp = new TemporaryFolder(); @BeforeClass public static void startSpark() { @@ -94,23 +91,23 @@ public void testWriteFormatOptionOverridesTableProperties() throws IOException { options.put(TableProperties.DEFAULT_FILE_FORMAT, "avro"); Table table = tables.create(SCHEMA, spec, options, tableLocation); - List expectedRecords = Lists.newArrayList( - new SimpleRecord(1, "a"), - new SimpleRecord(2, "b"), - new SimpleRecord(3, "c") - ); + List expectedRecords = + Lists.newArrayList( + new SimpleRecord(1, "a"), new SimpleRecord(2, "b"), new SimpleRecord(3, "c")); Dataset df = spark.createDataFrame(expectedRecords, SimpleRecord.class); - df.select("id", "data").write() + df.select("id", "data") + .write() .format("iceberg") .option(SparkWriteOptions.WRITE_FORMAT, "parquet") .mode(SaveMode.Append) .save(tableLocation); try (CloseableIterable tasks = table.newScan().planFiles()) { - tasks.forEach(task -> { - FileFormat fileFormat = FileFormat.fromFileName(task.file().path()); - Assert.assertEquals(FileFormat.PARQUET, fileFormat); - }); + tasks.forEach( + task -> { + FileFormat fileFormat = FileFormat.fromFileName(task.file().path()); + Assert.assertEquals(FileFormat.PARQUET, fileFormat); + }); } } @@ -124,22 +121,18 @@ public void testNoWriteFormatOption() throws IOException { options.put(TableProperties.DEFAULT_FILE_FORMAT, "avro"); Table table = tables.create(SCHEMA, spec, options, tableLocation); - List expectedRecords = Lists.newArrayList( - new SimpleRecord(1, "a"), - new SimpleRecord(2, "b"), - new SimpleRecord(3, "c") - ); + List expectedRecords = + Lists.newArrayList( + new SimpleRecord(1, "a"), new SimpleRecord(2, "b"), new SimpleRecord(3, "c")); Dataset df = spark.createDataFrame(expectedRecords, SimpleRecord.class); - df.select("id", "data").write() - .format("iceberg") - .mode("append") - .save(tableLocation); + df.select("id", "data").write().format("iceberg").mode("append").save(tableLocation); try (CloseableIterable tasks = table.newScan().planFiles()) { 
- tasks.forEach(task -> { - FileFormat fileFormat = FileFormat.fromFileName(task.file().path()); - Assert.assertEquals(FileFormat.AVRO, fileFormat); - }); + tasks.forEach( + task -> { + FileFormat fileFormat = FileFormat.fromFileName(task.file().path()); + Assert.assertEquals(FileFormat.AVRO, fileFormat); + }); } } @@ -159,24 +152,25 @@ public void testHadoopOptions() throws IOException { // to verify that 'hadoop.' data source options are propagated correctly sparkHadoopConf.set("fs.default.name", "hdfs://localhost:9000"); - List expectedRecords = Lists.newArrayList( - new SimpleRecord(1, "a"), - new SimpleRecord(2, "b") - ); + List expectedRecords = + Lists.newArrayList(new SimpleRecord(1, "a"), new SimpleRecord(2, "b")); Dataset originalDf = spark.createDataFrame(expectedRecords, SimpleRecord.class); - originalDf.select("id", "data").write() + originalDf + .select("id", "data") + .write() .format("iceberg") .mode("append") .option("hadoop.fs.default.name", "file:///") .save(tableLocation); - Dataset resultDf = spark.read() - .format("iceberg") - .option("hadoop.fs.default.name", "file:///") - .load(tableLocation); - List resultRecords = resultDf.orderBy("id") - .as(Encoders.bean(SimpleRecord.class)) - .collectAsList(); + Dataset resultDf = + spark + .read() + .format("iceberg") + .option("hadoop.fs.default.name", "file:///") + .load(tableLocation); + List resultRecords = + resultDf.orderBy("id").as(Encoders.bean(SimpleRecord.class)).collectAsList(); Assert.assertEquals("Records should match", expectedRecords, resultRecords); } finally { @@ -192,31 +186,35 @@ public void testSplitOptionsOverridesTableProperties() throws IOException { PartitionSpec spec = PartitionSpec.unpartitioned(); Map options = Maps.newHashMap(); options.put(TableProperties.SPLIT_SIZE, String.valueOf(128L * 1024 * 1024)); // 128Mb - options.put(TableProperties.DEFAULT_FILE_FORMAT, String.valueOf(FileFormat.AVRO)); // Arbitrarily splittable + options.put( + TableProperties.DEFAULT_FILE_FORMAT, + String.valueOf(FileFormat.AVRO)); // Arbitrarily splittable Table icebergTable = tables.create(SCHEMA, spec, options, tableLocation); - List expectedRecords = Lists.newArrayList( - new SimpleRecord(1, "a"), - new SimpleRecord(2, "b") - ); + List expectedRecords = + Lists.newArrayList(new SimpleRecord(1, "a"), new SimpleRecord(2, "b")); Dataset originalDf = spark.createDataFrame(expectedRecords, SimpleRecord.class); - originalDf.select("id", "data") + originalDf + .select("id", "data") .repartition(1) .write() .format("iceberg") .mode("append") .save(tableLocation); - List files = Lists.newArrayList(icebergTable.currentSnapshot().addedDataFiles(icebergTable.io())); + List files = + Lists.newArrayList(icebergTable.currentSnapshot().addedDataFiles(icebergTable.io())); Assert.assertEquals("Should have written 1 file", 1, files.size()); long fileSize = files.get(0).fileSizeInBytes(); long splitSize = LongMath.divide(fileSize, 2, RoundingMode.CEILING); - Dataset resultDf = spark.read() - .format("iceberg") - .option(SparkReadOptions.SPLIT_SIZE, String.valueOf(splitSize)) - .load(tableLocation); + Dataset resultDf = + spark + .read() + .format("iceberg") + .option(SparkReadOptions.SPLIT_SIZE, String.valueOf(splitSize)) + .load(tableLocation); Assert.assertEquals("Spark partitions should match", 2, resultDf.javaRDD().getNumPartitions()); } @@ -230,18 +228,16 @@ public void testIncrementalScanOptions() throws IOException { Map options = Maps.newHashMap(); Table table = tables.create(SCHEMA, spec, options, tableLocation); - List 
expectedRecords = Lists.newArrayList( - new SimpleRecord(1, "a"), - new SimpleRecord(2, "b"), - new SimpleRecord(3, "c"), - new SimpleRecord(4, "d") - ); + List expectedRecords = + Lists.newArrayList( + new SimpleRecord(1, "a"), + new SimpleRecord(2, "b"), + new SimpleRecord(3, "c"), + new SimpleRecord(4, "d")); for (SimpleRecord record : expectedRecords) { - Dataset originalDf = spark.createDataFrame(Lists.newArrayList(record), SimpleRecord.class); - originalDf.select("id", "data").write() - .format("iceberg") - .mode("append") - .save(tableLocation); + Dataset originalDf = + spark.createDataFrame(Lists.newArrayList(record), SimpleRecord.class); + originalDf.select("id", "data").write().format("iceberg").mode("append").save(tableLocation); } List snapshotIds = SnapshotUtil.currentAncestorIds(table); @@ -251,11 +247,13 @@ public void testIncrementalScanOptions() throws IOException { IllegalArgumentException.class, "Cannot set start-snapshot-id and end-snapshot-id for incremental scans", () -> { - spark.read() + spark + .read() .format("iceberg") .option("snapshot-id", snapshotIds.get(3).toString()) .option("start-snapshot-id", snapshotIds.get(3).toString()) - .load(tableLocation).explain(); + .load(tableLocation) + .explain(); }); // end-snapshot-id and as-of-timestamp are both configured. @@ -264,12 +262,15 @@ public void testIncrementalScanOptions() throws IOException { IllegalArgumentException.class, "Cannot set start-snapshot-id and end-snapshot-id for incremental scans", () -> { - spark.read() + spark + .read() .format("iceberg") - .option(SparkReadOptions.AS_OF_TIMESTAMP, + .option( + SparkReadOptions.AS_OF_TIMESTAMP, Long.toString(table.snapshot(snapshotIds.get(3)).timestampMillis())) .option("end-snapshot-id", snapshotIds.get(2).toString()) - .load(tableLocation).explain(); + .load(tableLocation) + .explain(); }); // only end-snapshot-id is configured. @@ -278,31 +279,37 @@ public void testIncrementalScanOptions() throws IOException { IllegalArgumentException.class, "Cannot set only end-snapshot-id for incremental scans", () -> { - spark.read() + spark + .read() .format("iceberg") .option("end-snapshot-id", snapshotIds.get(2).toString()) - .load(tableLocation).explain(); + .load(tableLocation) + .explain(); }); // test (1st snapshot, current snapshot] incremental scan. - List result = spark.read() - .format("iceberg") - .option("start-snapshot-id", snapshotIds.get(3).toString()) - .load(tableLocation) - .orderBy("id") - .as(Encoders.bean(SimpleRecord.class)) - .collectAsList(); + List result = + spark + .read() + .format("iceberg") + .option("start-snapshot-id", snapshotIds.get(3).toString()) + .load(tableLocation) + .orderBy("id") + .as(Encoders.bean(SimpleRecord.class)) + .collectAsList(); Assert.assertEquals("Records should match", expectedRecords.subList(1, 4), result); // test (2nd snapshot, 3rd snapshot] incremental scan. 
- List result1 = spark.read() - .format("iceberg") - .option("start-snapshot-id", snapshotIds.get(2).toString()) - .option("end-snapshot-id", snapshotIds.get(1).toString()) - .load(tableLocation) - .orderBy("id") - .as(Encoders.bean(SimpleRecord.class)) - .collectAsList(); + List result1 = + spark + .read() + .format("iceberg") + .option("start-snapshot-id", snapshotIds.get(2).toString()) + .option("end-snapshot-id", snapshotIds.get(1).toString()) + .load(tableLocation) + .orderBy("id") + .as(Encoders.bean(SimpleRecord.class)) + .collectAsList(); Assert.assertEquals("Records should match", expectedRecords.subList(2, 3), result1); } @@ -315,41 +322,34 @@ public void testMetadataSplitSizeOptionOverrideTableProperties() throws IOExcept Map options = Maps.newHashMap(); Table table = tables.create(SCHEMA, spec, options, tableLocation); - List expectedRecords = Lists.newArrayList( - new SimpleRecord(1, "a"), - new SimpleRecord(2, "b") - ); + List expectedRecords = + Lists.newArrayList(new SimpleRecord(1, "a"), new SimpleRecord(2, "b")); Dataset originalDf = spark.createDataFrame(expectedRecords, SimpleRecord.class); // produce 1st manifest - originalDf.select("id", "data").write() - .format("iceberg") - .mode("append") - .save(tableLocation); + originalDf.select("id", "data").write().format("iceberg").mode("append").save(tableLocation); // produce 2nd manifest - originalDf.select("id", "data").write() - .format("iceberg") - .mode("append") - .save(tableLocation); + originalDf.select("id", "data").write().format("iceberg").mode("append").save(tableLocation); List manifests = table.currentSnapshot().allManifests(table.io()); Assert.assertEquals("Must be 2 manifests", 2, manifests.size()); // set the target metadata split size so each manifest ends up in a separate split - table.updateProperties() + table + .updateProperties() .set(TableProperties.METADATA_SPLIT_SIZE, String.valueOf(manifests.get(0).length())) .commit(); - Dataset entriesDf = spark.read() - .format("iceberg") - .load(tableLocation + "#entries"); + Dataset entriesDf = spark.read().format("iceberg").load(tableLocation + "#entries"); Assert.assertEquals("Num partitions must match", 2, entriesDf.javaRDD().getNumPartitions()); // override the table property using options - entriesDf = spark.read() - .format("iceberg") - .option(SparkReadOptions.SPLIT_SIZE, String.valueOf(128 * 1024 * 1024)) - .load(tableLocation + "#entries"); + entriesDf = + spark + .read() + .format("iceberg") + .option(SparkReadOptions.SPLIT_SIZE, String.valueOf(128 * 1024 * 1024)) + .load(tableLocation + "#entries"); Assert.assertEquals("Num partitions must match", 1, entriesDf.javaRDD().getNumPartitions()); } @@ -362,24 +362,26 @@ public void testDefaultMetadataSplitSize() throws IOException { Map options = Maps.newHashMap(); Table icebergTable = tables.create(SCHEMA, spec, options, tableLocation); - List expectedRecords = Lists.newArrayList( - new SimpleRecord(1, "a"), - new SimpleRecord(2, "b") - ); + List expectedRecords = + Lists.newArrayList(new SimpleRecord(1, "a"), new SimpleRecord(2, "b")); Dataset originalDf = spark.createDataFrame(expectedRecords, SimpleRecord.class); - originalDf.select("id", "data").write() - .format("iceberg") - .mode("append") - .save(tableLocation); + originalDf.select("id", "data").write().format("iceberg").mode("append").save(tableLocation); int splitSize = (int) TableProperties.METADATA_SPLIT_SIZE_DEFAULT; // 32MB split size - int expectedSplits = ((int) tables.load(tableLocation + "#entries") - 
.currentSnapshot().allManifests(icebergTable.io()).get(0).length() + splitSize - 1) / splitSize; + int expectedSplits = + ((int) + tables + .load(tableLocation + "#entries") + .currentSnapshot() + .allManifests(icebergTable.io()) + .get(0) + .length() + + splitSize + - 1) + / splitSize; - Dataset metadataDf = spark.read() - .format("iceberg") - .load(tableLocation + "#entries"); + Dataset metadataDf = spark.read().format("iceberg").load(tableLocation + "#entries"); int partitionNum = metadataDf.javaRDD().getNumPartitions(); Assert.assertEquals("Spark partitions should match", expectedSplits, partitionNum); @@ -391,17 +393,17 @@ public void testExtraSnapshotMetadata() throws IOException { HadoopTables tables = new HadoopTables(CONF); tables.create(SCHEMA, PartitionSpec.unpartitioned(), Maps.newHashMap(), tableLocation); - List expectedRecords = Lists.newArrayList( - new SimpleRecord(1, "a"), - new SimpleRecord(2, "b") - ); + List expectedRecords = + Lists.newArrayList(new SimpleRecord(1, "a"), new SimpleRecord(2, "b")); Dataset originalDf = spark.createDataFrame(expectedRecords, SimpleRecord.class); - originalDf.select("id", "data").write() - .format("iceberg") - .mode("append") - .option(SparkWriteOptions.SNAPSHOT_PROPERTY_PREFIX + ".extra-key", "someValue") - .option(SparkWriteOptions.SNAPSHOT_PROPERTY_PREFIX + ".another-key", "anotherValue") - .save(tableLocation); + originalDf + .select("id", "data") + .write() + .format("iceberg") + .mode("append") + .option(SparkWriteOptions.SNAPSHOT_PROPERTY_PREFIX + ".extra-key", "someValue") + .option(SparkWriteOptions.SNAPSHOT_PROPERTY_PREFIX + ".another-key", "anotherValue") + .save(tableLocation); Table table = tables.load(tableLocation); @@ -414,26 +416,27 @@ public void testExtraSnapshotMetadataWithSQL() throws InterruptedException, IOEx String tableLocation = temp.newFolder("iceberg-table").toString(); HadoopTables tables = new HadoopTables(CONF); - Table table = tables.create(SCHEMA, PartitionSpec.unpartitioned(), Maps.newHashMap(), tableLocation); + Table table = + tables.create(SCHEMA, PartitionSpec.unpartitioned(), Maps.newHashMap(), tableLocation); - List expectedRecords = Lists.newArrayList( - new SimpleRecord(1, "a"), - new SimpleRecord(2, "b") - ); + List expectedRecords = + Lists.newArrayList(new SimpleRecord(1, "a"), new SimpleRecord(2, "b")); Dataset originalDf = spark.createDataFrame(expectedRecords, SimpleRecord.class); - originalDf.select("id", "data").write() - .format("iceberg") - .mode("append") - .save(tableLocation); + originalDf.select("id", "data").write().format("iceberg").mode("append").save(tableLocation); spark.read().format("iceberg").load(tableLocation).createOrReplaceTempView("target"); - Thread writerThread = new Thread(() -> { - Map properties = Maps.newHashMap(); - properties.put("writer-thread", String.valueOf(Thread.currentThread().getName())); - CommitMetadata.withCommitProperties(properties, () -> { - spark.sql("INSERT INTO target VALUES (3, 'c'), (4, 'd')"); - return 0; - }, RuntimeException.class); - }); + Thread writerThread = + new Thread( + () -> { + Map properties = Maps.newHashMap(); + properties.put("writer-thread", String.valueOf(Thread.currentThread().getName())); + CommitMetadata.withCommitProperties( + properties, + () -> { + spark.sql("INSERT INTO target VALUES (3, 'c'), (4, 'd')"); + return 0; + }, + RuntimeException.class); + }); writerThread.setName("test-extra-commit-message-writer-thread"); writerThread.start(); writerThread.join(); diff --git 
a/spark/v3.2/spark/src/test/java/org/apache/iceberg/spark/source/TestFilteredScan.java b/spark/v3.2/spark/src/test/java/org/apache/iceberg/spark/source/TestFilteredScan.java index d51fd3c4e8eb..b30bbf145f23 100644 --- a/spark/v3.2/spark/src/test/java/org/apache/iceberg/spark/source/TestFilteredScan.java +++ b/spark/v3.2/spark/src/test/java/org/apache/iceberg/spark/source/TestFilteredScan.java @@ -16,9 +16,13 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.source; +import static org.apache.iceberg.Files.localOutput; +import static org.apache.spark.sql.catalyst.util.DateTimeUtils.fromJavaTimestamp; +import static org.apache.spark.sql.functions.callUDF; +import static org.apache.spark.sql.functions.column; + import java.io.File; import java.io.IOException; import java.sql.Timestamp; @@ -79,41 +83,31 @@ import org.junit.runner.RunWith; import org.junit.runners.Parameterized; -import static org.apache.iceberg.Files.localOutput; -import static org.apache.spark.sql.catalyst.util.DateTimeUtils.fromJavaTimestamp; -import static org.apache.spark.sql.functions.callUDF; -import static org.apache.spark.sql.functions.column; - @RunWith(Parameterized.class) public class TestFilteredScan { private static final Configuration CONF = new Configuration(); private static final HadoopTables TABLES = new HadoopTables(CONF); - private static final Schema SCHEMA = new Schema( - Types.NestedField.required(1, "id", Types.LongType.get()), - Types.NestedField.optional(2, "ts", Types.TimestampType.withZone()), - Types.NestedField.optional(3, "data", Types.StringType.get()) - ); + private static final Schema SCHEMA = + new Schema( + Types.NestedField.required(1, "id", Types.LongType.get()), + Types.NestedField.optional(2, "ts", Types.TimestampType.withZone()), + Types.NestedField.optional(3, "data", Types.StringType.get())); - private static final PartitionSpec BUCKET_BY_ID = PartitionSpec.builderFor(SCHEMA) - .bucket("id", 4) - .build(); + private static final PartitionSpec BUCKET_BY_ID = + PartitionSpec.builderFor(SCHEMA).bucket("id", 4).build(); - private static final PartitionSpec PARTITION_BY_DAY = PartitionSpec.builderFor(SCHEMA) - .day("ts") - .build(); + private static final PartitionSpec PARTITION_BY_DAY = + PartitionSpec.builderFor(SCHEMA).day("ts").build(); - private static final PartitionSpec PARTITION_BY_HOUR = PartitionSpec.builderFor(SCHEMA) - .hour("ts") - .build(); + private static final PartitionSpec PARTITION_BY_HOUR = + PartitionSpec.builderFor(SCHEMA).hour("ts").build(); - private static final PartitionSpec PARTITION_BY_DATA = PartitionSpec.builderFor(SCHEMA) - .identity("data") - .build(); + private static final PartitionSpec PARTITION_BY_DATA = + PartitionSpec.builderFor(SCHEMA).identity("data").build(); - private static final PartitionSpec PARTITION_BY_ID = PartitionSpec.builderFor(SCHEMA) - .identity("id") - .build(); + private static final PartitionSpec PARTITION_BY_ID = + PartitionSpec.builderFor(SCHEMA).identity("id").build(); private static SparkSession spark = null; @@ -126,14 +120,20 @@ public static void startSpark() { spark.udf().register("bucket4", (UDF1) bucket4::apply, IntegerType$.MODULE$); Transform day = Transforms.day(Types.TimestampType.withZone()); - spark.udf().register("ts_day", - (UDF1) timestamp -> day.apply((Long) fromJavaTimestamp(timestamp)), - IntegerType$.MODULE$); + spark + .udf() + .register( + "ts_day", + (UDF1) timestamp -> day.apply((Long) fromJavaTimestamp(timestamp)), + IntegerType$.MODULE$); 
Transform hour = Transforms.hour(Types.TimestampType.withZone()); - spark.udf().register("ts_hour", - (UDF1) timestamp -> hour.apply((Long) fromJavaTimestamp(timestamp)), - IntegerType$.MODULE$); + spark + .udf() + .register( + "ts_hour", + (UDF1) timestamp -> hour.apply((Long) fromJavaTimestamp(timestamp)), + IntegerType$.MODULE$); spark.udf().register("data_ident", (UDF1) data -> data, StringType$.MODULE$); spark.udf().register("id_ident", (UDF1) id -> id, LongType$.MODULE$); @@ -146,8 +146,7 @@ public static void stopSpark() { currentSpark.stop(); } - @Rule - public TemporaryFolder temp = new TemporaryFolder(); + @Rule public TemporaryFolder temp = new TemporaryFolder(); private final String format; private final boolean vectorized; @@ -155,11 +154,11 @@ public static void stopSpark() { @Parameterized.Parameters(name = "format = {0}, vectorized = {1}") public static Object[][] parameters() { return new Object[][] { - { "parquet", false }, - { "parquet", true }, - { "avro", false }, - { "orc", false }, - { "orc", true } + {"parquet", false}, + {"parquet", true}, + {"avro", false}, + {"orc", false}, + {"orc", true} }; } @@ -188,26 +187,27 @@ public void writeUnpartitionedTable() throws IOException { this.records = testRecords(tableSchema); - try (FileAppender writer = new GenericAppenderFactory(tableSchema).newAppender( - localOutput(testFile), fileFormat)) { + try (FileAppender writer = + new GenericAppenderFactory(tableSchema).newAppender(localOutput(testFile), fileFormat)) { writer.addAll(records); } - DataFile file = DataFiles.builder(PartitionSpec.unpartitioned()) - .withRecordCount(records.size()) - .withFileSizeInBytes(testFile.length()) - .withPath(testFile.toString()) - .build(); + DataFile file = + DataFiles.builder(PartitionSpec.unpartitioned()) + .withRecordCount(records.size()) + .withFileSizeInBytes(testFile.length()) + .withPath(testFile.toString()) + .build(); table.newAppend().appendFile(file).commit(); } @Test public void testUnpartitionedIDFilters() { - CaseInsensitiveStringMap options = new CaseInsensitiveStringMap(ImmutableMap.of( - "path", unpartitioned.toString()) - ); - SparkScanBuilder builder = new SparkScanBuilder(spark, TABLES.load(options.get("path")), options); + CaseInsensitiveStringMap options = + new CaseInsensitiveStringMap(ImmutableMap.of("path", unpartitioned.toString())); + SparkScanBuilder builder = + new SparkScanBuilder(spark, TABLES.load(options.get("path")), options); for (int i = 0; i < 10; i += 1) { pushFilters(builder, EqualTo.apply("id", i)); @@ -217,16 +217,15 @@ public void testUnpartitionedIDFilters() { Assert.assertEquals("Should only create one task for a small file", 1, partitions.length); // validate row filtering - assertEqualsSafe(SCHEMA.asStruct(), expected(i), - read(unpartitioned.toString(), vectorized, "id = " + i)); + assertEqualsSafe( + SCHEMA.asStruct(), expected(i), read(unpartitioned.toString(), vectorized, "id = " + i)); } } @Test public void testUnpartitionedCaseInsensitiveIDFilters() { - CaseInsensitiveStringMap options = new CaseInsensitiveStringMap(ImmutableMap.of( - "path", unpartitioned.toString()) - ); + CaseInsensitiveStringMap options = + new CaseInsensitiveStringMap(ImmutableMap.of("path", unpartitioned.toString())); // set spark.sql.caseSensitive to false String caseSensitivityBeforeTest = TestFilteredScan.spark.conf().get("spark.sql.caseSensitive"); @@ -235,17 +234,22 @@ public void testUnpartitionedCaseInsensitiveIDFilters() { try { for (int i = 0; i < 10; i += 1) { - SparkScanBuilder builder = new 
SparkScanBuilder(spark, TABLES.load(options.get("path")), options) - .caseSensitive(false); + SparkScanBuilder builder = + new SparkScanBuilder(spark, TABLES.load(options.get("path")), options) + .caseSensitive(false); - pushFilters(builder, EqualTo.apply("ID", i)); // note lower(ID) == lower(id), so there must be a match + pushFilters( + builder, + EqualTo.apply("ID", i)); // note lower(ID) == lower(id), so there must be a match Batch scan = builder.build().toBatch(); InputPartition[] tasks = scan.planInputPartitions(); Assert.assertEquals("Should only create one task for a small file", 1, tasks.length); // validate row filtering - assertEqualsSafe(SCHEMA.asStruct(), expected(i), + assertEqualsSafe( + SCHEMA.asStruct(), + expected(i), read(unpartitioned.toString(), vectorized, "id = " + i)); } } finally { @@ -256,11 +260,11 @@ public void testUnpartitionedCaseInsensitiveIDFilters() { @Test public void testUnpartitionedTimestampFilter() { - CaseInsensitiveStringMap options = new CaseInsensitiveStringMap(ImmutableMap.of( - "path", unpartitioned.toString()) - ); + CaseInsensitiveStringMap options = + new CaseInsensitiveStringMap(ImmutableMap.of("path", unpartitioned.toString())); - SparkScanBuilder builder = new SparkScanBuilder(spark, TABLES.load(options.get("path")), options); + SparkScanBuilder builder = + new SparkScanBuilder(spark, TABLES.load(options.get("path")), options); pushFilters(builder, LessThan.apply("ts", "2017-12-22T00:00:00+00:00")); Batch scan = builder.build().toBatch(); @@ -268,21 +272,29 @@ public void testUnpartitionedTimestampFilter() { InputPartition[] tasks = scan.planInputPartitions(); Assert.assertEquals("Should only create one task for a small file", 1, tasks.length); - assertEqualsSafe(SCHEMA.asStruct(), expected(5, 6, 7, 8, 9), - read(unpartitioned.toString(), vectorized, "ts < cast('2017-12-22 00:00:00+00:00' as timestamp)")); + assertEqualsSafe( + SCHEMA.asStruct(), + expected(5, 6, 7, 8, 9), + read( + unpartitioned.toString(), + vectorized, + "ts < cast('2017-12-22 00:00:00+00:00' as timestamp)")); } @Test public void testBucketPartitionedIDFilters() { Table table = buildPartitionedTable("bucketed_by_id", BUCKET_BY_ID, "bucket4", "id"); - CaseInsensitiveStringMap options = new CaseInsensitiveStringMap(ImmutableMap.of("path", table.location())); + CaseInsensitiveStringMap options = + new CaseInsensitiveStringMap(ImmutableMap.of("path", table.location())); - Batch unfiltered = new SparkScanBuilder(spark, TABLES.load(options.get("path")), options).build().toBatch(); - Assert.assertEquals("Unfiltered table should created 4 read tasks", - 4, unfiltered.planInputPartitions().length); + Batch unfiltered = + new SparkScanBuilder(spark, TABLES.load(options.get("path")), options).build().toBatch(); + Assert.assertEquals( + "Unfiltered table should created 4 read tasks", 4, unfiltered.planInputPartitions().length); for (int i = 0; i < 10; i += 1) { - SparkScanBuilder builder = new SparkScanBuilder(spark, TABLES.load(options.get("path")), options); + SparkScanBuilder builder = + new SparkScanBuilder(spark, TABLES.load(options.get("path")), options); pushFilters(builder, EqualTo.apply("id", i)); Batch scan = builder.build().toBatch(); @@ -293,7 +305,8 @@ public void testBucketPartitionedIDFilters() { Assert.assertEquals("Should create one task for a single bucket", 1, tasks.length); // validate row filtering - assertEqualsSafe(SCHEMA.asStruct(), expected(i), read(table.location(), vectorized, "id = " + i)); + assertEqualsSafe( + SCHEMA.asStruct(), expected(i), 
read(table.location(), vectorized, "id = " + i)); } } @@ -301,14 +314,17 @@ public void testBucketPartitionedIDFilters() { @Test public void testDayPartitionedTimestampFilters() { Table table = buildPartitionedTable("partitioned_by_day", PARTITION_BY_DAY, "ts_day", "ts"); - CaseInsensitiveStringMap options = new CaseInsensitiveStringMap(ImmutableMap.of("path", table.location())); - Batch unfiltered = new SparkScanBuilder(spark, TABLES.load(options.get("path")), options).build().toBatch(); + CaseInsensitiveStringMap options = + new CaseInsensitiveStringMap(ImmutableMap.of("path", table.location())); + Batch unfiltered = + new SparkScanBuilder(spark, TABLES.load(options.get("path")), options).build().toBatch(); - Assert.assertEquals("Unfiltered table should created 2 read tasks", - 2, unfiltered.planInputPartitions().length); + Assert.assertEquals( + "Unfiltered table should created 2 read tasks", 2, unfiltered.planInputPartitions().length); { - SparkScanBuilder builder = new SparkScanBuilder(spark, TABLES.load(options.get("path")), options); + SparkScanBuilder builder = + new SparkScanBuilder(spark, TABLES.load(options.get("path")), options); pushFilters(builder, LessThan.apply("ts", "2017-12-22T00:00:00+00:00")); Batch scan = builder.build().toBatch(); @@ -316,24 +332,35 @@ public void testDayPartitionedTimestampFilters() { InputPartition[] tasks = scan.planInputPartitions(); Assert.assertEquals("Should create one task for 2017-12-21", 1, tasks.length); - assertEqualsSafe(SCHEMA.asStruct(), expected(5, 6, 7, 8, 9), - read(table.location(), vectorized, "ts < cast('2017-12-22 00:00:00+00:00' as timestamp)")); + assertEqualsSafe( + SCHEMA.asStruct(), + expected(5, 6, 7, 8, 9), + read( + table.location(), vectorized, "ts < cast('2017-12-22 00:00:00+00:00' as timestamp)")); } { - SparkScanBuilder builder = new SparkScanBuilder(spark, TABLES.load(options.get("path")), options); - - pushFilters(builder, And.apply( - GreaterThan.apply("ts", "2017-12-22T06:00:00+00:00"), - LessThan.apply("ts", "2017-12-22T08:00:00+00:00"))); + SparkScanBuilder builder = + new SparkScanBuilder(spark, TABLES.load(options.get("path")), options); + + pushFilters( + builder, + And.apply( + GreaterThan.apply("ts", "2017-12-22T06:00:00+00:00"), + LessThan.apply("ts", "2017-12-22T08:00:00+00:00"))); Batch scan = builder.build().toBatch(); InputPartition[] tasks = scan.planInputPartitions(); Assert.assertEquals("Should create one task for 2017-12-22", 1, tasks.length); - assertEqualsSafe(SCHEMA.asStruct(), expected(1, 2), read(table.location(), vectorized, - "ts > cast('2017-12-22 06:00:00+00:00' as timestamp) and " + - "ts < cast('2017-12-22 08:00:00+00:00' as timestamp)")); + assertEqualsSafe( + SCHEMA.asStruct(), + expected(1, 2), + read( + table.location(), + vectorized, + "ts > cast('2017-12-22 06:00:00+00:00' as timestamp) and " + + "ts < cast('2017-12-22 08:00:00+00:00' as timestamp)")); } } @@ -342,14 +369,17 @@ public void testDayPartitionedTimestampFilters() { public void testHourPartitionedTimestampFilters() { Table table = buildPartitionedTable("partitioned_by_hour", PARTITION_BY_HOUR, "ts_hour", "ts"); - CaseInsensitiveStringMap options = new CaseInsensitiveStringMap(ImmutableMap.of("path", table.location())); - Batch unfiltered = new SparkScanBuilder(spark, TABLES.load(options.get("path")), options).build().toBatch(); + CaseInsensitiveStringMap options = + new CaseInsensitiveStringMap(ImmutableMap.of("path", table.location())); + Batch unfiltered = + new SparkScanBuilder(spark, 
TABLES.load(options.get("path")), options).build().toBatch(); - Assert.assertEquals("Unfiltered table should created 9 read tasks", - 9, unfiltered.planInputPartitions().length); + Assert.assertEquals( + "Unfiltered table should created 9 read tasks", 9, unfiltered.planInputPartitions().length); { - SparkScanBuilder builder = new SparkScanBuilder(spark, TABLES.load(options.get("path")), options); + SparkScanBuilder builder = + new SparkScanBuilder(spark, TABLES.load(options.get("path")), options); pushFilters(builder, LessThan.apply("ts", "2017-12-22T00:00:00+00:00")); Batch scan = builder.build().toBatch(); @@ -357,24 +387,35 @@ public void testHourPartitionedTimestampFilters() { InputPartition[] tasks = scan.planInputPartitions(); Assert.assertEquals("Should create 4 tasks for 2017-12-21: 15, 17, 21, 22", 4, tasks.length); - assertEqualsSafe(SCHEMA.asStruct(), expected(8, 9, 7, 6, 5), - read(table.location(), vectorized, "ts < cast('2017-12-22 00:00:00+00:00' as timestamp)")); + assertEqualsSafe( + SCHEMA.asStruct(), + expected(8, 9, 7, 6, 5), + read( + table.location(), vectorized, "ts < cast('2017-12-22 00:00:00+00:00' as timestamp)")); } { - SparkScanBuilder builder = new SparkScanBuilder(spark, TABLES.load(options.get("path")), options); - - pushFilters(builder, And.apply( - GreaterThan.apply("ts", "2017-12-22T06:00:00+00:00"), - LessThan.apply("ts", "2017-12-22T08:00:00+00:00"))); + SparkScanBuilder builder = + new SparkScanBuilder(spark, TABLES.load(options.get("path")), options); + + pushFilters( + builder, + And.apply( + GreaterThan.apply("ts", "2017-12-22T06:00:00+00:00"), + LessThan.apply("ts", "2017-12-22T08:00:00+00:00"))); Batch scan = builder.build().toBatch(); InputPartition[] tasks = scan.planInputPartitions(); Assert.assertEquals("Should create 2 tasks for 2017-12-22: 6, 7", 2, tasks.length); - assertEqualsSafe(SCHEMA.asStruct(), expected(2, 1), read(table.location(), vectorized, - "ts > cast('2017-12-22 06:00:00+00:00' as timestamp) and " + - "ts < cast('2017-12-22 08:00:00+00:00' as timestamp)")); + assertEqualsSafe( + SCHEMA.asStruct(), + expected(2, 1), + read( + table.location(), + vectorized, + "ts > cast('2017-12-22 06:00:00+00:00' as timestamp) and " + + "ts < cast('2017-12-22 08:00:00+00:00' as timestamp)")); } } @@ -388,10 +429,15 @@ public void testFilterByNonProjectedColumn() { expected.add(projectFlat(actualProjection, rec)); } - assertEqualsSafe(actualProjection.asStruct(), expected, read( - unpartitioned.toString(), vectorized, - "ts < cast('2017-12-22 00:00:00+00:00' as timestamp)", - "id", "data")); + assertEqualsSafe( + actualProjection.asStruct(), + expected, + read( + unpartitioned.toString(), + vectorized, + "ts < cast('2017-12-22 00:00:00+00:00' as timestamp)", + "id", + "data")); } { @@ -403,20 +449,27 @@ public void testFilterByNonProjectedColumn() { expected.add(projectFlat(actualProjection, rec)); } - assertEqualsSafe(actualProjection.asStruct(), expected, read( - unpartitioned.toString(), vectorized, - "ts > cast('2017-12-22 06:00:00+00:00' as timestamp) and " + - "ts < cast('2017-12-22 08:00:00+00:00' as timestamp)", - "id")); + assertEqualsSafe( + actualProjection.asStruct(), + expected, + read( + unpartitioned.toString(), + vectorized, + "ts > cast('2017-12-22 06:00:00+00:00' as timestamp) and " + + "ts < cast('2017-12-22 08:00:00+00:00' as timestamp)", + "id")); } } @Test public void testPartitionedByDataStartsWithFilter() { - Table table = buildPartitionedTable("partitioned_by_data", PARTITION_BY_DATA, "data_ident", "data"); - 
CaseInsensitiveStringMap options = new CaseInsensitiveStringMap(ImmutableMap.of("path", table.location())); + Table table = + buildPartitionedTable("partitioned_by_data", PARTITION_BY_DATA, "data_ident", "data"); + CaseInsensitiveStringMap options = + new CaseInsensitiveStringMap(ImmutableMap.of("path", table.location())); - SparkScanBuilder builder = new SparkScanBuilder(spark, TABLES.load(options.get("path")), options); + SparkScanBuilder builder = + new SparkScanBuilder(spark, TABLES.load(options.get("path")), options); pushFilters(builder, new StringStartsWith("data", "junc")); Batch scan = builder.build().toBatch(); @@ -426,10 +479,13 @@ public void testPartitionedByDataStartsWithFilter() { @Test public void testPartitionedByDataNotStartsWithFilter() { - Table table = buildPartitionedTable("partitioned_by_data", PARTITION_BY_DATA, "data_ident", "data"); - CaseInsensitiveStringMap options = new CaseInsensitiveStringMap(ImmutableMap.of("path", table.location())); + Table table = + buildPartitionedTable("partitioned_by_data", PARTITION_BY_DATA, "data_ident", "data"); + CaseInsensitiveStringMap options = + new CaseInsensitiveStringMap(ImmutableMap.of("path", table.location())); - SparkScanBuilder builder = new SparkScanBuilder(spark, TABLES.load(options.get("path")), options); + SparkScanBuilder builder = + new SparkScanBuilder(spark, TABLES.load(options.get("path")), options); pushFilters(builder, new Not(new StringStartsWith("data", "junc"))); Batch scan = builder.build().toBatch(); @@ -441,11 +497,11 @@ public void testPartitionedByDataNotStartsWithFilter() { public void testPartitionedByIdStartsWith() { Table table = buildPartitionedTable("partitioned_by_id", PARTITION_BY_ID, "id_ident", "id"); - CaseInsensitiveStringMap options = new CaseInsensitiveStringMap(ImmutableMap.of( - "path", table.location()) - ); + CaseInsensitiveStringMap options = + new CaseInsensitiveStringMap(ImmutableMap.of("path", table.location())); - SparkScanBuilder builder = new SparkScanBuilder(spark, TABLES.load(options.get("path")), options); + SparkScanBuilder builder = + new SparkScanBuilder(spark, TABLES.load(options.get("path")), options); pushFilters(builder, new StringStartsWith("data", "junc")); Batch scan = builder.build().toBatch(); @@ -457,11 +513,11 @@ public void testPartitionedByIdStartsWith() { public void testPartitionedByIdNotStartsWith() { Table table = buildPartitionedTable("partitioned_by_id", PARTITION_BY_ID, "id_ident", "id"); - CaseInsensitiveStringMap options = new CaseInsensitiveStringMap(ImmutableMap.of( - "path", table.location()) - ); + CaseInsensitiveStringMap options = + new CaseInsensitiveStringMap(ImmutableMap.of("path", table.location())); - SparkScanBuilder builder = new SparkScanBuilder(spark, TABLES.load(options.get("path")), options); + SparkScanBuilder builder = + new SparkScanBuilder(spark, TABLES.load(options.get("path")), options); pushFilters(builder, new Not(new StringStartsWith("data", "junc"))); Batch scan = builder.build().toBatch(); @@ -471,15 +527,15 @@ public void testPartitionedByIdNotStartsWith() { @Test public void testUnpartitionedStartsWith() { - Dataset df = spark.read() - .format("iceberg") - .option(SparkReadOptions.VECTORIZATION_ENABLED, String.valueOf(vectorized)) - .load(unpartitioned.toString()); + Dataset df = + spark + .read() + .format("iceberg") + .option(SparkReadOptions.VECTORIZATION_ENABLED, String.valueOf(vectorized)) + .load(unpartitioned.toString()); - List matchedData = df.select("data") - .where("data LIKE 'jun%'") - .as(Encoders.STRING()) 
- .collectAsList(); + List matchedData = + df.select("data").where("data LIKE 'jun%'").as(Encoders.STRING()).collectAsList(); Assert.assertEquals(1, matchedData.size()); Assert.assertEquals("junction", matchedData.get(0)); @@ -487,20 +543,21 @@ public void testUnpartitionedStartsWith() { @Test public void testUnpartitionedNotStartsWith() { - Dataset df = spark.read() - .format("iceberg") - .option(SparkReadOptions.VECTORIZATION_ENABLED, String.valueOf(vectorized)) - .load(unpartitioned.toString()); - - List matchedData = df.select("data") - .where("data NOT LIKE 'jun%'") - .as(Encoders.STRING()) - .collectAsList(); - - List expected = testRecords(SCHEMA).stream() - .map(r -> r.getField("data").toString()) - .filter(d -> !d.startsWith("jun")) - .collect(Collectors.toList()); + Dataset df = + spark + .read() + .format("iceberg") + .option(SparkReadOptions.VECTORIZATION_ENABLED, String.valueOf(vectorized)) + .load(unpartitioned.toString()); + + List matchedData = + df.select("data").where("data NOT LIKE 'jun%'").as(Encoders.STRING()).collectAsList(); + + List expected = + testRecords(SCHEMA).stream() + .map(r -> r.getField("data").toString()) + .filter(d -> !d.startsWith("jun")) + .collect(Collectors.toList()); Assert.assertEquals(9, matchedData.size()); Assert.assertEquals(Sets.newHashSet(expected), Sets.newHashSet(matchedData)); @@ -516,8 +573,8 @@ private static Record projectFlat(Schema projection, Record record) { return result; } - public static void assertEqualsUnsafe(Types.StructType struct, - List expected, List actual) { + public static void assertEqualsUnsafe( + Types.StructType struct, List expected, List actual) { // TODO: match records by ID int numRecords = Math.min(expected.size(), actual.size()); for (int i = 0; i < numRecords; i += 1) { @@ -526,8 +583,8 @@ public static void assertEqualsUnsafe(Types.StructType struct, Assert.assertEquals("Number of results should match expected", expected.size(), actual.size()); } - public static void assertEqualsSafe(Types.StructType struct, - List expected, List actual) { + public static void assertEqualsSafe( + Types.StructType struct, List expected, List actual) { // TODO: match records by ID int numRecords = Math.min(expected.size(), actual.size()); for (int i = 0; i < numRecords; i += 1) { @@ -550,7 +607,8 @@ private void pushFilters(ScanBuilder scan, Filter... 
filters) { filterable.pushFilters(filters); } - private Table buildPartitionedTable(String desc, PartitionSpec spec, String udf, String partitionColumn) { + private Table buildPartitionedTable( + String desc, PartitionSpec spec, String udf, String partitionColumn) { File location = new File(parent, desc); Table table = TABLES.create(SCHEMA, spec, location.toString()); @@ -559,10 +617,12 @@ private Table buildPartitionedTable(String desc, PartitionSpec spec, String udf, table.updateProperties().set("read.split.target-size", "2048").commit(); // copy the unpartitioned table into the partitioned table to produce the partitioned data - Dataset allRows = spark.read() - .format("iceberg") - .option(SparkReadOptions.VECTORIZATION_ENABLED, String.valueOf(vectorized)) - .load(unpartitioned.toString()); + Dataset allRows = + spark + .read() + .format("iceberg") + .option(SparkReadOptions.VECTORIZATION_ENABLED, String.valueOf(vectorized)) + .load(unpartitioned.toString()); allRows .coalesce(1) // ensure only 1 file per partition is written @@ -590,19 +650,23 @@ private List testRecords(Schema schema) { record(schema, 6L, parse("2017-12-21T21:55:30.589712+00:00"), "element"), record(schema, 7L, parse("2017-12-21T17:31:14.532797+00:00"), "limited"), record(schema, 8L, parse("2017-12-21T15:21:51.237521+00:00"), "global"), - record(schema, 9L, parse("2017-12-21T15:02:15.230570+00:00"), "goldfish") - ); + record(schema, 9L, parse("2017-12-21T15:02:15.230570+00:00"), "goldfish")); } private static List read(String table, boolean vectorized, String expr) { return read(table, vectorized, expr, "*"); } - private static List read(String table, boolean vectorized, String expr, String select0, String... selectN) { - Dataset dataset = spark.read().format("iceberg") - .option(SparkReadOptions.VECTORIZATION_ENABLED, String.valueOf(vectorized)) - .load(table).filter(expr) - .select(select0, selectN); + private static List read( + String table, boolean vectorized, String expr, String select0, String... selectN) { + Dataset dataset = + spark + .read() + .format("iceberg") + .option(SparkReadOptions.VECTORIZATION_ENABLED, String.valueOf(vectorized)) + .load(table) + .filter(expr) + .select(select0, selectN); return dataset.collectAsList(); } diff --git a/spark/v3.2/spark/src/test/java/org/apache/iceberg/spark/source/TestForwardCompatibility.java b/spark/v3.2/spark/src/test/java/org/apache/iceberg/spark/source/TestForwardCompatibility.java index 74203fd20f9c..42d9ac6a79ba 100644 --- a/spark/v3.2/spark/src/test/java/org/apache/iceberg/spark/source/TestForwardCompatibility.java +++ b/spark/v3.2/spark/src/test/java/org/apache/iceberg/spark/source/TestForwardCompatibility.java @@ -16,9 +16,11 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.spark.source; +import static org.apache.iceberg.Files.localInput; +import static org.apache.iceberg.Files.localOutput; + import java.io.File; import java.io.IOException; import java.util.List; @@ -63,25 +65,26 @@ import scala.Option; import scala.collection.JavaConverters; -import static org.apache.iceberg.Files.localInput; -import static org.apache.iceberg.Files.localOutput; - public class TestForwardCompatibility { private static final Configuration CONF = new Configuration(); - private static final Schema SCHEMA = new Schema( - Types.NestedField.optional(1, "id", Types.LongType.get()), - Types.NestedField.optional(2, "data", Types.StringType.get())); + private static final Schema SCHEMA = + new Schema( + Types.NestedField.optional(1, "id", Types.LongType.get()), + Types.NestedField.optional(2, "data", Types.StringType.get())); // create a spec for the schema that uses a "zero" transform that produces all 0s - private static final PartitionSpec UNKNOWN_SPEC = PartitionSpecParser.fromJson(SCHEMA, - "{ \"spec-id\": 0, \"fields\": [ { \"name\": \"id_zero\", \"transform\": \"zero\", \"source-id\": 1 } ] }"); + private static final PartitionSpec UNKNOWN_SPEC = + PartitionSpecParser.fromJson( + SCHEMA, + "{ \"spec-id\": 0, \"fields\": [ { \"name\": \"id_zero\", \"transform\": \"zero\", \"source-id\": 1 } ] }"); // create a fake spec to use to write table metadata - private static final PartitionSpec FAKE_SPEC = PartitionSpecParser.fromJson(SCHEMA, - "{ \"spec-id\": 0, \"fields\": [ { \"name\": \"id_zero\", \"transform\": \"identity\", \"source-id\": 1 } ] }"); + private static final PartitionSpec FAKE_SPEC = + PartitionSpecParser.fromJson( + SCHEMA, + "{ \"spec-id\": 0, \"fields\": [ { \"name\": \"id_zero\", \"transform\": \"identity\", \"source-id\": 1 } ] }"); - @Rule - public TemporaryFolder temp = new TemporaryFolder(); + @Rule public TemporaryFolder temp = new TemporaryFolder(); private static SparkSession spark = null; @@ -107,20 +110,22 @@ public void testSparkWriteFailsUnknownTransform() throws IOException { HadoopTables tables = new HadoopTables(CONF); tables.create(SCHEMA, UNKNOWN_SPEC, location.toString()); - List expected = Lists.newArrayList( - new SimpleRecord(1, "a"), - new SimpleRecord(2, "b"), - new SimpleRecord(3, "c") - ); + List expected = + Lists.newArrayList( + new SimpleRecord(1, "a"), new SimpleRecord(2, "b"), new SimpleRecord(3, "c")); Dataset df = spark.createDataFrame(expected, SimpleRecord.class); - AssertHelpers.assertThrows("Should reject write with unsupported transform", - UnsupportedOperationException.class, "Cannot write using unsupported transforms: zero", - () -> df.select("id", "data").write() - .format("iceberg") - .mode("append") - .save(location.toString())); + AssertHelpers.assertThrows( + "Should reject write with unsupported transform", + UnsupportedOperationException.class, + "Cannot write using unsupported transforms: zero", + () -> + df.select("id", "data") + .write() + .format("iceberg") + .mode("append") + .save(location.toString())); } @Test @@ -136,20 +141,24 @@ public void testSparkStreamingWriteFailsUnknownTransform() throws IOException, T tables.create(SCHEMA, UNKNOWN_SPEC, location.toString()); MemoryStream inputStream = newMemoryStream(1, spark.sqlContext(), Encoders.INT()); - StreamingQuery query = inputStream.toDF() - .selectExpr("value AS id", "CAST (value AS STRING) AS data") - .writeStream() - .outputMode("append") - .format("iceberg") - .option("checkpointLocation", checkpoint.toString()) - 
.option("path", location.toString()) - .start(); + StreamingQuery query = + inputStream + .toDF() + .selectExpr("value AS id", "CAST (value AS STRING) AS data") + .writeStream() + .outputMode("append") + .format("iceberg") + .option("checkpointLocation", checkpoint.toString()) + .option("path", location.toString()) + .start(); List batch1 = Lists.newArrayList(1, 2); send(batch1, inputStream); - AssertHelpers.assertThrows("Should reject streaming write with unsupported transform", - StreamingQueryException.class, "Cannot write using unsupported transforms: zero", + AssertHelpers.assertThrows( + "Should reject streaming write with unsupported transform", + StreamingQueryException.class, + "Cannot write using unsupported transforms: zero", query::processAllAvailable); } @@ -168,22 +177,22 @@ public void testSparkCanReadUnknownTransform() throws IOException { List expected = RandomData.generateList(table.schema(), 100, 1L); - File parquetFile = new File(dataFolder, - FileFormat.PARQUET.addExtension(UUID.randomUUID().toString())); - FileAppender writer = Parquet.write(localOutput(parquetFile)) - .schema(table.schema()) - .build(); + File parquetFile = + new File(dataFolder, FileFormat.PARQUET.addExtension(UUID.randomUUID().toString())); + FileAppender writer = + Parquet.write(localOutput(parquetFile)).schema(table.schema()).build(); try { writer.addAll(expected); } finally { writer.close(); } - DataFile file = DataFiles.builder(FAKE_SPEC) - .withInputFile(localInput(parquetFile)) - .withMetrics(writer.metrics()) - .withPartitionPath("id_zero=0") - .build(); + DataFile file = + DataFiles.builder(FAKE_SPEC) + .withInputFile(localInput(parquetFile)) + .withMetrics(writer.metrics()) + .withPartitionPath("id_zero=0") + .build(); OutputFile manifestFile = localOutput(FileFormat.AVRO.addExtension(temp.newFile().toString())); ManifestWriter manifestWriter = ManifestFiles.write(FAKE_SPEC, manifestFile); @@ -195,9 +204,7 @@ public void testSparkCanReadUnknownTransform() throws IOException { table.newFastAppend().appendManifest(manifestWriter.toManifestFile()).commit(); - Dataset df = spark.read() - .format("iceberg") - .load(location.toString()); + Dataset df = spark.read().format("iceberg").load(location.toString()); List rows = df.collectAsList(); Assert.assertEquals("Should contain 100 rows", 100, rows.size()); diff --git a/spark/v3.2/spark/src/test/java/org/apache/iceberg/spark/source/TestIcebergSource.java b/spark/v3.2/spark/src/test/java/org/apache/iceberg/spark/source/TestIcebergSource.java index 42f53d585601..a850275118db 100644 --- a/spark/v3.2/spark/src/test/java/org/apache/iceberg/spark/source/TestIcebergSource.java +++ b/spark/v3.2/spark/src/test/java/org/apache/iceberg/spark/source/TestIcebergSource.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.spark.source; import org.apache.iceberg.catalog.TableIdentifier; @@ -40,5 +39,4 @@ public Identifier extractIdentifier(CaseInsensitiveStringMap options) { public String extractCatalog(CaseInsensitiveStringMap options) { return SparkSession.active().sessionState().catalogManager().currentCatalog().name(); } - } diff --git a/spark/v3.2/spark/src/test/java/org/apache/iceberg/spark/source/TestIcebergSourceHadoopTables.java b/spark/v3.2/spark/src/test/java/org/apache/iceberg/spark/source/TestIcebergSourceHadoopTables.java index f1cfc7a72e17..b55ba0e2199a 100644 --- a/spark/v3.2/spark/src/test/java/org/apache/iceberg/spark/source/TestIcebergSourceHadoopTables.java +++ b/spark/v3.2/spark/src/test/java/org/apache/iceberg/spark/source/TestIcebergSourceHadoopTables.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.source; import java.io.File; diff --git a/spark/v3.2/spark/src/test/java/org/apache/iceberg/spark/source/TestIcebergSourceHiveTables.java b/spark/v3.2/spark/src/test/java/org/apache/iceberg/spark/source/TestIcebergSourceHiveTables.java index a51f9ee85e2f..de26f5f82c49 100644 --- a/spark/v3.2/spark/src/test/java/org/apache/iceberg/spark/source/TestIcebergSourceHiveTables.java +++ b/spark/v3.2/spark/src/test/java/org/apache/iceberg/spark/source/TestIcebergSourceHiveTables.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.source; import java.io.IOException; @@ -59,7 +58,8 @@ public Table createTable(TableIdentifier ident, Schema schema, PartitionSpec spe @Override public Table loadTable(TableIdentifier ident, String entriesSuffix) { - TableIdentifier identifier = TableIdentifier.of(ident.namespace().level(0), ident.name(), entriesSuffix); + TableIdentifier identifier = + TableIdentifier.of(ident.namespace().level(0), ident.name(), entriesSuffix); return TestIcebergSourceHiveTables.catalog.loadTable(identifier); } diff --git a/spark/v3.2/spark/src/test/java/org/apache/iceberg/spark/source/TestIcebergSourceTablesBase.java b/spark/v3.2/spark/src/test/java/org/apache/iceberg/spark/source/TestIcebergSourceTablesBase.java index ce40f179d649..3ded9471fe9c 100644 --- a/spark/v3.2/spark/src/test/java/org/apache/iceberg/spark/source/TestIcebergSourceTablesBase.java +++ b/spark/v3.2/spark/src/test/java/org/apache/iceberg/spark/source/TestIcebergSourceTablesBase.java @@ -16,9 +16,13 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.spark.source; +import static org.apache.iceberg.ManifestContent.DATA; +import static org.apache.iceberg.ManifestContent.DELETES; +import static org.apache.iceberg.types.Types.NestedField.optional; +import static org.apache.iceberg.types.Types.NestedField.required; + import java.io.IOException; import java.io.UncheckedIOException; import java.util.Comparator; @@ -76,33 +80,26 @@ import org.junit.Test; import org.junit.rules.TemporaryFolder; -import static org.apache.iceberg.ManifestContent.DATA; -import static org.apache.iceberg.ManifestContent.DELETES; -import static org.apache.iceberg.types.Types.NestedField.optional; -import static org.apache.iceberg.types.Types.NestedField.required; - public abstract class TestIcebergSourceTablesBase extends SparkTestBase { - private static final Schema SCHEMA = new Schema( - optional(1, "id", Types.IntegerType.get()), - optional(2, "data", Types.StringType.get()) - ); + private static final Schema SCHEMA = + new Schema( + optional(1, "id", Types.IntegerType.get()), optional(2, "data", Types.StringType.get())); - private static final Schema SCHEMA2 = new Schema( - optional(1, "id", Types.IntegerType.get()), - optional(2, "data", Types.StringType.get()), - optional(3, "category", Types.StringType.get()) - ); + private static final Schema SCHEMA2 = + new Schema( + optional(1, "id", Types.IntegerType.get()), + optional(2, "data", Types.StringType.get()), + optional(3, "category", Types.StringType.get())); - private static final Schema SCHEMA3 = new Schema( - optional(1, "id", Types.IntegerType.get()), - optional(3, "category", Types.StringType.get()) - ); + private static final Schema SCHEMA3 = + new Schema( + optional(1, "id", Types.IntegerType.get()), + optional(3, "category", Types.StringType.get())); private static final PartitionSpec SPEC = PartitionSpec.builderFor(SCHEMA).identity("id").build(); - @Rule - public TemporaryFolder temp = new TemporaryFolder(); + @Rule public TemporaryFolder temp = new TemporaryFolder(); public abstract Table createTable(TableIdentifier ident, Schema schema, PartitionSpec spec); @@ -117,23 +114,21 @@ public synchronized void testTablesSupport() { TableIdentifier tableIdentifier = TableIdentifier.of("db", "table"); createTable(tableIdentifier, SCHEMA, PartitionSpec.unpartitioned()); - List expectedRecords = Lists.newArrayList( - new SimpleRecord(1, "1"), - new SimpleRecord(2, "2"), - new SimpleRecord(3, "3")); + List expectedRecords = + Lists.newArrayList( + new SimpleRecord(1, "1"), new SimpleRecord(2, "2"), new SimpleRecord(3, "3")); Dataset inputDf = spark.createDataFrame(expectedRecords, SimpleRecord.class); - inputDf.select("id", "data").write() + inputDf + .select("id", "data") + .write() .format("iceberg") .mode(SaveMode.Append) .save(loadLocation(tableIdentifier)); - Dataset resultDf = spark.read() - .format("iceberg") - .load(loadLocation(tableIdentifier)); - List actualRecords = resultDf.orderBy("id") - .as(Encoders.bean(SimpleRecord.class)) - .collectAsList(); + Dataset resultDf = spark.read().format("iceberg").load(loadLocation(tableIdentifier)); + List actualRecords = + resultDf.orderBy("id").as(Encoders.bean(SimpleRecord.class)).collectAsList(); Assert.assertEquals("Records should match", expectedRecords, actualRecords); } @@ -147,32 +142,39 @@ public void testEntriesTable() throws Exception { List records = Lists.newArrayList(new SimpleRecord(1, "1")); Dataset inputDf = spark.createDataFrame(records, SimpleRecord.class); - inputDf.select("id", "data").write() + inputDf + 
.select("id", "data") + .write() .format("iceberg") .mode("append") .save(loadLocation(tableIdentifier)); table.refresh(); - List actual = spark.read() - .format("iceberg") - .load(loadLocation(tableIdentifier, "entries")) - .collectAsList(); + List actual = + spark + .read() + .format("iceberg") + .load(loadLocation(tableIdentifier, "entries")) + .collectAsList(); Snapshot snapshot = table.currentSnapshot(); - Assert.assertEquals("Should only contain one manifest", 1, snapshot.allManifests(table.io()).size()); + Assert.assertEquals( + "Should only contain one manifest", 1, snapshot.allManifests(table.io()).size()); InputFile manifest = table.io().newInputFile(snapshot.allManifests(table.io()).get(0).path()); List expected = Lists.newArrayList(); - try (CloseableIterable rows = Avro.read(manifest).project(entriesTable.schema()).build()) { + try (CloseableIterable rows = + Avro.read(manifest).project(entriesTable.schema()).build()) { // each row must inherit snapshot_id and sequence_number - rows.forEach(row -> { - row.put(2, 0L); - GenericData.Record file = (GenericData.Record) row.get("data_file"); - asMetadataRecord(file); - expected.add(row); - }); + rows.forEach( + row -> { + row.put(2, 0L); + GenericData.Record file = (GenericData.Record) row.get("data_file"); + asMetadataRecord(file); + expected.add(row); + }); } Assert.assertEquals("Entries table should have one row", 1, expected.size()); @@ -188,18 +190,22 @@ public void testEntriesTablePartitionedPrune() throws Exception { List records = Lists.newArrayList(new SimpleRecord(1, "1")); Dataset inputDf = spark.createDataFrame(records, SimpleRecord.class); - inputDf.select("id", "data").write() + inputDf + .select("id", "data") + .write() .format("iceberg") .mode("append") .save(loadLocation(tableIdentifier)); table.refresh(); - List actual = spark.read() - .format("iceberg") - .load(loadLocation(tableIdentifier, "entries")) - .select("status") - .collectAsList(); + List actual = + spark + .read() + .format("iceberg") + .load(loadLocation(tableIdentifier, "entries")) + .select("status") + .collectAsList(); Assert.assertEquals("Results should contain only one status", 1, actual.size()); Assert.assertEquals("That status should be Added (1)", 1, actual.get(0).getInt(0)); @@ -213,7 +219,9 @@ public void testEntriesTableDataFilePrune() throws Exception { List records = Lists.newArrayList(new SimpleRecord(1, "1")); Dataset inputDf = spark.createDataFrame(records, SimpleRecord.class); - inputDf.select("id", "data").write() + inputDf + .select("id", "data") + .write() .format("iceberg") .mode("append") .save(loadLocation(tableIdentifier)); @@ -221,15 +229,19 @@ public void testEntriesTableDataFilePrune() throws Exception { table.refresh(); DataFile file = table.currentSnapshot().addedDataFiles(table.io()).iterator().next(); - List singleActual = rowsToJava(spark.read() - .format("iceberg") - .load(loadLocation(tableIdentifier, "entries")) - .select("data_file.file_path") - .collectAsList()); + List singleActual = + rowsToJava( + spark + .read() + .format("iceberg") + .load(loadLocation(tableIdentifier, "entries")) + .select("data_file.file_path") + .collectAsList()); List singleExpected = ImmutableList.of(row(file.path())); - assertEquals("Should prune a single element from a nested struct", singleExpected, singleActual); + assertEquals( + "Should prune a single element from a nested struct", singleExpected, singleActual); } @Test @@ -240,7 +252,9 @@ public void testEntriesTableDataFilePruneMulti() throws Exception { List records = 
Lists.newArrayList(new SimpleRecord(1, "1")); Dataset inputDf = spark.createDataFrame(records, SimpleRecord.class); - inputDf.select("id", "data").write() + inputDf + .select("id", "data") + .write() .format("iceberg") .mode("append") .save(loadLocation(tableIdentifier)); @@ -248,14 +262,22 @@ public void testEntriesTableDataFilePruneMulti() throws Exception { table.refresh(); DataFile file = table.currentSnapshot().addedDataFiles(table.io()).iterator().next(); - List multiActual = rowsToJava(spark.read() - .format("iceberg") - .load(loadLocation(tableIdentifier, "entries")) - .select("data_file.file_path", "data_file.value_counts", "data_file.record_count", "data_file.column_sizes") - .collectAsList()); - - List multiExpected = ImmutableList.of( - row(file.path(), file.valueCounts(), file.recordCount(), file.columnSizes())); + List multiActual = + rowsToJava( + spark + .read() + .format("iceberg") + .load(loadLocation(tableIdentifier, "entries")) + .select( + "data_file.file_path", + "data_file.value_counts", + "data_file.record_count", + "data_file.column_sizes") + .collectAsList()); + + List multiExpected = + ImmutableList.of( + row(file.path(), file.valueCounts(), file.recordCount(), file.columnSizes())); assertEquals("Should prune a single element from a nested struct", multiExpected, multiActual); } @@ -268,7 +290,9 @@ public void testFilesSelectMap() throws Exception { List records = Lists.newArrayList(new SimpleRecord(1, "1")); Dataset inputDf = spark.createDataFrame(records, SimpleRecord.class); - inputDf.select("id", "data").write() + inputDf + .select("id", "data") + .write() .format("iceberg") .mode("append") .save(loadLocation(tableIdentifier)); @@ -276,14 +300,18 @@ public void testFilesSelectMap() throws Exception { table.refresh(); DataFile file = table.currentSnapshot().addedDataFiles(table.io()).iterator().next(); - List multiActual = rowsToJava(spark.read() - .format("iceberg") - .load(loadLocation(tableIdentifier, "files")) - .select("file_path", "value_counts", "record_count", "column_sizes") - .collectAsList()); + List multiActual = + rowsToJava( + spark + .read() + .format("iceberg") + .load(loadLocation(tableIdentifier, "files")) + .select("file_path", "value_counts", "record_count", "column_sizes") + .collectAsList()); - List multiExpected = ImmutableList.of( - row(file.path(), file.valueCounts(), file.recordCount(), file.columnSizes())); + List multiExpected = + ImmutableList.of( + row(file.path(), file.valueCounts(), file.recordCount(), file.columnSizes())); assertEquals("Should prune a single element from a row", multiExpected, multiActual); } @@ -294,10 +322,13 @@ public void testAllEntriesTable() throws Exception { Table table = createTable(tableIdentifier, SCHEMA, PartitionSpec.unpartitioned()); Table entriesTable = loadTable(tableIdentifier, "all_entries"); - Dataset df1 = spark.createDataFrame(Lists.newArrayList(new SimpleRecord(1, "a")), SimpleRecord.class); - Dataset df2 = spark.createDataFrame(Lists.newArrayList(new SimpleRecord(1, "b")), SimpleRecord.class); + Dataset df1 = + spark.createDataFrame(Lists.newArrayList(new SimpleRecord(1, "a")), SimpleRecord.class); + Dataset df2 = + spark.createDataFrame(Lists.newArrayList(new SimpleRecord(1, "b")), SimpleRecord.class); - df1.select("id", "data").write() + df1.select("id", "data") + .write() .format("iceberg") .mode("append") .save(loadLocation(tableIdentifier)); @@ -306,7 +337,8 @@ public void testAllEntriesTable() throws Exception { table.newDelete().deleteFromRowFilter(Expressions.equal("id", 
1)).commit(); // add a second file - df2.select("id", "data").write() + df2.select("id", "data") + .write() .format("iceberg") .mode("append") .save(loadLocation(tableIdentifier)); @@ -314,24 +346,28 @@ public void testAllEntriesTable() throws Exception { // ensure table data isn't stale table.refresh(); - List actual = spark.read() - .format("iceberg") - .load(loadLocation(tableIdentifier, "all_entries")) - .orderBy("snapshot_id") - .collectAsList(); + List actual = + spark + .read() + .format("iceberg") + .load(loadLocation(tableIdentifier, "all_entries")) + .orderBy("snapshot_id") + .collectAsList(); List expected = Lists.newArrayList(); - for (ManifestFile manifest : Iterables.concat( - Iterables.transform(table.snapshots(), s -> s.allManifests(table.io())))) { + for (ManifestFile manifest : + Iterables.concat(Iterables.transform(table.snapshots(), s -> s.allManifests(table.io())))) { InputFile in = table.io().newInputFile(manifest.path()); - try (CloseableIterable rows = Avro.read(in).project(entriesTable.schema()).build()) { + try (CloseableIterable rows = + Avro.read(in).project(entriesTable.schema()).build()) { // each row must inherit snapshot_id and sequence_number - rows.forEach(row -> { - row.put(2, 0L); - GenericData.Record file = (GenericData.Record) row.get("data_file"); - asMetadataRecord(file); - expected.add(row); - }); + rows.forEach( + row -> { + row.put(2, 0L); + GenericData.Record file = (GenericData.Record) row.get("data_file"); + asMetadataRecord(file); + expected.add(row); + }); } } @@ -340,7 +376,8 @@ public void testAllEntriesTable() throws Exception { Assert.assertEquals("Entries table should have 3 rows", 3, expected.size()); Assert.assertEquals("Actual results should have 3 rows", 3, actual.size()); for (int i = 0; i < expected.size(); i += 1) { - TestHelpers.assertEqualsSafe(entriesTable.schema().asStruct(), expected.get(i), actual.get(i)); + TestHelpers.assertEqualsSafe( + entriesTable.schema().asStruct(), expected.get(i), actual.get(i)); } } @@ -352,7 +389,9 @@ public void testCountEntriesTable() { // init load List records = Lists.newArrayList(new SimpleRecord(1, "1")); Dataset inputDf = spark.createDataFrame(records, SimpleRecord.class); - inputDf.select("id", "data").write() + inputDf + .select("id", "data") + .write() .format("iceberg") .mode("append") .save(loadLocation(tableIdentifier)); @@ -360,12 +399,16 @@ public void testCountEntriesTable() { final int expectedEntryCount = 1; // count entries - Assert.assertEquals("Count should return " + expectedEntryCount, - expectedEntryCount, spark.read().format("iceberg").load(loadLocation(tableIdentifier, "entries")).count()); + Assert.assertEquals( + "Count should return " + expectedEntryCount, + expectedEntryCount, + spark.read().format("iceberg").load(loadLocation(tableIdentifier, "entries")).count()); // count all_entries - Assert.assertEquals("Count should return " + expectedEntryCount, - expectedEntryCount, spark.read().format("iceberg").load(loadLocation(tableIdentifier, "all_entries")).count()); + Assert.assertEquals( + "Count should return " + expectedEntryCount, + expectedEntryCount, + spark.read().format("iceberg").load(loadLocation(tableIdentifier, "all_entries")).count()); } @Test @@ -375,16 +418,20 @@ public void testFilesTable() throws Exception { Table entriesTable = loadTable(tableIdentifier, "entries"); Table filesTable = loadTable(tableIdentifier, "files"); - Dataset df1 = spark.createDataFrame(Lists.newArrayList(new SimpleRecord(1, "a")), SimpleRecord.class); - Dataset df2 = 
spark.createDataFrame(Lists.newArrayList(new SimpleRecord(2, "b")), SimpleRecord.class); + Dataset df1 = + spark.createDataFrame(Lists.newArrayList(new SimpleRecord(1, "a")), SimpleRecord.class); + Dataset df2 = + spark.createDataFrame(Lists.newArrayList(new SimpleRecord(2, "b")), SimpleRecord.class); - df1.select("id", "data").write() + df1.select("id", "data") + .write() .format("iceberg") .mode("append") .save(loadLocation(tableIdentifier)); // add a second file - df2.select("id", "data").write() + df2.select("id", "data") + .write() .format("iceberg") .mode("append") .save(loadLocation(tableIdentifier)); @@ -392,15 +439,14 @@ public void testFilesTable() throws Exception { // delete the first file to test that only live files are listed table.newDelete().deleteFromRowFilter(Expressions.equal("id", 1)).commit(); - List actual = spark.read() - .format("iceberg") - .load(loadLocation(tableIdentifier, "files")) - .collectAsList(); + List actual = + spark.read().format("iceberg").load(loadLocation(tableIdentifier, "files")).collectAsList(); List expected = Lists.newArrayList(); for (ManifestFile manifest : table.currentSnapshot().dataManifests(table.io())) { InputFile in = table.io().newInputFile(manifest.path()); - try (CloseableIterable rows = Avro.read(in).project(entriesTable.schema()).build()) { + try (CloseableIterable rows = + Avro.read(in).project(entriesTable.schema()).build()) { for (GenericData.Record record : rows) { if ((Integer) record.get("status") < 2 /* added or existing */) { GenericData.Record file = (GenericData.Record) record.get("data_file"); @@ -422,42 +468,42 @@ public void testFilesTableWithSnapshotIdInheritance() throws Exception { TableIdentifier tableIdentifier = TableIdentifier.of("db", "files_inheritance_test"); Table table = createTable(tableIdentifier, SCHEMA, SPEC); - table.updateProperties() - .set(TableProperties.SNAPSHOT_ID_INHERITANCE_ENABLED, "true") - .commit(); + table.updateProperties().set(TableProperties.SNAPSHOT_ID_INHERITANCE_ENABLED, "true").commit(); Table entriesTable = loadTable(tableIdentifier, "entries"); Table filesTable = loadTable(tableIdentifier, "files"); - spark.sql(String.format( - "CREATE TABLE parquet_table (data string, id int) " + - "USING parquet PARTITIONED BY (id) LOCATION '%s'", - temp.newFolder())); + spark.sql( + String.format( + "CREATE TABLE parquet_table (data string, id int) " + + "USING parquet PARTITIONED BY (id) LOCATION '%s'", + temp.newFolder())); - List records = Lists.newArrayList( - new SimpleRecord(1, "a"), - new SimpleRecord(2, "b") - ); + List records = + Lists.newArrayList(new SimpleRecord(1, "a"), new SimpleRecord(2, "b")); Dataset inputDF = spark.createDataFrame(records, SimpleRecord.class); - inputDF.select("data", "id").write() - .mode("overwrite") - .insertInto("parquet_table"); + inputDF.select("data", "id").write().mode("overwrite").insertInto("parquet_table"); try { String stagingLocation = table.location() + "/metadata"; - SparkTableUtil.importSparkTable(spark, + SparkTableUtil.importSparkTable( + spark, new org.apache.spark.sql.catalyst.TableIdentifier("parquet_table"), - table, stagingLocation); + table, + stagingLocation); - List actual = spark.read() - .format("iceberg") - .load(loadLocation(tableIdentifier, "files")) - .collectAsList(); + List actual = + spark + .read() + .format("iceberg") + .load(loadLocation(tableIdentifier, "files")) + .collectAsList(); List expected = Lists.newArrayList(); for (ManifestFile manifest : table.currentSnapshot().dataManifests(table.io())) { InputFile in = 
table.io().newInputFile(manifest.path()); - try (CloseableIterable rows = Avro.read(in).project(entriesTable.schema()).build()) { + try (CloseableIterable rows = + Avro.read(in).project(entriesTable.schema()).build()) { for (GenericData.Record record : rows) { GenericData.Record file = (GenericData.Record) record.get("data_file"); asMetadataRecord(file); @@ -473,7 +519,6 @@ public void testFilesTableWithSnapshotIdInheritance() throws Exception { } finally { spark.sql("DROP TABLE parquet_table"); } - } @Test @@ -484,35 +529,35 @@ public void testEntriesTableWithSnapshotIdInheritance() throws Exception { PartitionSpec spec = SPEC; Table table = createTable(tableIdentifier, SCHEMA, spec); - table.updateProperties() - .set(TableProperties.SNAPSHOT_ID_INHERITANCE_ENABLED, "true") - .commit(); + table.updateProperties().set(TableProperties.SNAPSHOT_ID_INHERITANCE_ENABLED, "true").commit(); - spark.sql(String.format( - "CREATE TABLE parquet_table (data string, id int) " + - "USING parquet PARTITIONED BY (id) LOCATION '%s'", - temp.newFolder())); + spark.sql( + String.format( + "CREATE TABLE parquet_table (data string, id int) " + + "USING parquet PARTITIONED BY (id) LOCATION '%s'", + temp.newFolder())); - List records = Lists.newArrayList( - new SimpleRecord(1, "a"), - new SimpleRecord(2, "b") - ); + List records = + Lists.newArrayList(new SimpleRecord(1, "a"), new SimpleRecord(2, "b")); Dataset inputDF = spark.createDataFrame(records, SimpleRecord.class); - inputDF.select("data", "id").write() - .mode("overwrite") - .insertInto("parquet_table"); + inputDF.select("data", "id").write().mode("overwrite").insertInto("parquet_table"); try { String stagingLocation = table.location() + "/metadata"; SparkTableUtil.importSparkTable( - spark, new org.apache.spark.sql.catalyst.TableIdentifier("parquet_table"), table, stagingLocation); + spark, + new org.apache.spark.sql.catalyst.TableIdentifier("parquet_table"), + table, + stagingLocation); - List actual = spark.read() - .format("iceberg") - .load(loadLocation(tableIdentifier, "entries")) - .select("sequence_number", "snapshot_id", "data_file") - .collectAsList(); + List actual = + spark + .read() + .format("iceberg") + .load(loadLocation(tableIdentifier, "entries")) + .select("sequence_number", "snapshot_id", "data_file") + .collectAsList(); table.refresh(); @@ -535,19 +580,24 @@ public void testFilesUnpartitionedTable() throws Exception { Table entriesTable = loadTable(tableIdentifier, "entries"); Table filesTable = loadTable(tableIdentifier, "files"); - Dataset df1 = spark.createDataFrame(Lists.newArrayList(new SimpleRecord(1, "a")), SimpleRecord.class); - Dataset df2 = spark.createDataFrame(Lists.newArrayList(new SimpleRecord(2, "b")), SimpleRecord.class); + Dataset df1 = + spark.createDataFrame(Lists.newArrayList(new SimpleRecord(1, "a")), SimpleRecord.class); + Dataset df2 = + spark.createDataFrame(Lists.newArrayList(new SimpleRecord(2, "b")), SimpleRecord.class); - df1.select("id", "data").write() + df1.select("id", "data") + .write() .format("iceberg") .mode("append") .save(loadLocation(tableIdentifier)); table.refresh(); - DataFile toDelete = Iterables.getOnlyElement(table.currentSnapshot().addedDataFiles(table.io())); + DataFile toDelete = + Iterables.getOnlyElement(table.currentSnapshot().addedDataFiles(table.io())); // add a second file - df2.select("id", "data").write() + df2.select("id", "data") + .write() .format("iceberg") .mode("append") .save(loadLocation(tableIdentifier)); @@ -555,15 +605,14 @@ public void testFilesUnpartitionedTable() 
throws Exception { // delete the first file to test that only live files are listed table.newDelete().deleteFile(toDelete).commit(); - List actual = spark.read() - .format("iceberg") - .load(loadLocation(tableIdentifier, "files")) - .collectAsList(); + List actual = + spark.read().format("iceberg").load(loadLocation(tableIdentifier, "files")).collectAsList(); List expected = Lists.newArrayList(); for (ManifestFile manifest : table.currentSnapshot().dataManifests(table.io())) { InputFile in = table.io().newInputFile(manifest.path()); - try (CloseableIterable rows = Avro.read(in).project(entriesTable.schema()).build()) { + try (CloseableIterable rows = + Avro.read(in).project(entriesTable.schema()).build()) { for (GenericData.Record record : rows) { if ((Integer) record.get("status") < 2 /* added or existing */) { GenericData.Record file = (GenericData.Record) record.get("data_file"); @@ -586,38 +635,49 @@ public void testAllMetadataTablesWithStagedCommits() throws Exception { table.updateProperties().set(TableProperties.WRITE_AUDIT_PUBLISH_ENABLED, "true").commit(); spark.conf().set("spark.wap.id", "1234567"); - Dataset df1 = spark.createDataFrame(Lists.newArrayList(new SimpleRecord(1, "a")), SimpleRecord.class); - Dataset df2 = spark.createDataFrame(Lists.newArrayList(new SimpleRecord(2, "b")), SimpleRecord.class); + Dataset df1 = + spark.createDataFrame(Lists.newArrayList(new SimpleRecord(1, "a")), SimpleRecord.class); + Dataset df2 = + spark.createDataFrame(Lists.newArrayList(new SimpleRecord(2, "b")), SimpleRecord.class); - df1.select("id", "data").write() + df1.select("id", "data") + .write() .format("iceberg") .mode("append") .save(loadLocation(tableIdentifier)); // add a second file - df2.select("id", "data").write() + df2.select("id", "data") + .write() .format("iceberg") .mode("append") .save(loadLocation(tableIdentifier)); - List actualAllData = spark.read() - .format("iceberg") - .load(loadLocation(tableIdentifier, "all_data_files")) - .collectAsList(); - - List actualAllManifests = spark.read() - .format("iceberg") - .load(loadLocation(tableIdentifier, "all_manifests")) - .collectAsList(); - - List actualAllEntries = spark.read() - .format("iceberg") - .load(loadLocation(tableIdentifier, "all_entries")) - .collectAsList(); - - Assert.assertTrue("Stage table should have some snapshots", table.snapshots().iterator().hasNext()); - Assert.assertEquals("Stage table should have null currentSnapshot", - null, table.currentSnapshot()); + List actualAllData = + spark + .read() + .format("iceberg") + .load(loadLocation(tableIdentifier, "all_data_files")) + .collectAsList(); + + List actualAllManifests = + spark + .read() + .format("iceberg") + .load(loadLocation(tableIdentifier, "all_manifests")) + .collectAsList(); + + List actualAllEntries = + spark + .read() + .format("iceberg") + .load(loadLocation(tableIdentifier, "all_entries")) + .collectAsList(); + + Assert.assertTrue( + "Stage table should have some snapshots", table.snapshots().iterator().hasNext()); + Assert.assertEquals( + "Stage table should have null currentSnapshot", null, table.currentSnapshot()); Assert.assertEquals("Actual results should have two rows", 2, actualAllData.size()); Assert.assertEquals("Actual results should have two rows", 2, actualAllManifests.size()); Assert.assertEquals("Actual results should have two rows", 2, actualAllEntries.size()); @@ -630,10 +690,13 @@ public void testAllDataFilesTable() throws Exception { Table entriesTable = loadTable(tableIdentifier, "entries"); Table filesTable = 
loadTable(tableIdentifier, "all_data_files"); - Dataset df1 = spark.createDataFrame(Lists.newArrayList(new SimpleRecord(1, "a")), SimpleRecord.class); - Dataset df2 = spark.createDataFrame(Lists.newArrayList(new SimpleRecord(2, "b")), SimpleRecord.class); + Dataset df1 = + spark.createDataFrame(Lists.newArrayList(new SimpleRecord(1, "a")), SimpleRecord.class); + Dataset df2 = + spark.createDataFrame(Lists.newArrayList(new SimpleRecord(2, "b")), SimpleRecord.class); - df1.select("id", "data").write() + df1.select("id", "data") + .write() .format("iceberg") .mode("append") .save(loadLocation(tableIdentifier)); @@ -642,7 +705,8 @@ public void testAllDataFilesTable() throws Exception { table.newDelete().deleteFromRowFilter(Expressions.equal("id", 1)).commit(); // add a second file - df2.select("id", "data").write() + df2.select("id", "data") + .write() .format("iceberg") .mode("append") .save(loadLocation(tableIdentifier)); @@ -650,19 +714,23 @@ public void testAllDataFilesTable() throws Exception { // ensure table data isn't stale table.refresh(); - List actual = spark.read() - .format("iceberg") - .load(loadLocation(tableIdentifier, "all_data_files")) - .orderBy("file_path") - .collectAsList(); + List actual = + spark + .read() + .format("iceberg") + .load(loadLocation(tableIdentifier, "all_data_files")) + .orderBy("file_path") + .collectAsList(); actual.sort(Comparator.comparing(o -> o.getString(1))); List expected = Lists.newArrayList(); - Iterable dataManifests = Iterables.concat(Iterables.transform(table.snapshots(), - snapshot -> snapshot.dataManifests(table.io()))); + Iterable dataManifests = + Iterables.concat( + Iterables.transform(table.snapshots(), snapshot -> snapshot.dataManifests(table.io()))); for (ManifestFile manifest : dataManifests) { InputFile in = table.io().newInputFile(manifest.path()); - try (CloseableIterable rows = Avro.read(in).project(entriesTable.schema()).build()) { + try (CloseableIterable rows = + Avro.read(in).project(entriesTable.schema()).build()) { for (GenericData.Record record : rows) { if ((Integer) record.get("status") < 2 /* added or existing */) { GenericData.Record file = (GenericData.Record) record.get("data_file"); @@ -691,7 +759,9 @@ public void testHistoryTable() { List records = Lists.newArrayList(new SimpleRecord(1, "1")); Dataset inputDf = spark.createDataFrame(records, SimpleRecord.class); - inputDf.select("id", "data").write() + inputDf + .select("id", "data") + .write() .format("iceberg") .mode("append") .save(loadLocation(tableIdentifier)); @@ -700,7 +770,9 @@ public void testHistoryTable() { long firstSnapshotTimestamp = table.currentSnapshot().timestampMillis(); long firstSnapshotId = table.currentSnapshot().snapshotId(); - inputDf.select("id", "data").write() + inputDf + .select("id", "data") + .write() .format("iceberg") .mode("append") .save(loadLocation(tableIdentifier)); @@ -713,7 +785,9 @@ public void testHistoryTable() { table.rollback().toSnapshotId(firstSnapshotId).commit(); long rollbackTimestamp = Iterables.getLast(table.history()).timestampMillis(); - inputDf.select("id", "data").write() + inputDf + .select("id", "data") + .write() .format("iceberg") .mode("append") .save(loadLocation(tableIdentifier)); @@ -722,34 +796,43 @@ public void testHistoryTable() { long thirdSnapshotTimestamp = table.currentSnapshot().timestampMillis(); long thirdSnapshotId = table.currentSnapshot().snapshotId(); - List actual = spark.read() - .format("iceberg") - .load(loadLocation(tableIdentifier, "history")) - .collectAsList(); - - 
GenericRecordBuilder builder = new GenericRecordBuilder(AvroSchemaUtil.convert(historyTable.schema(), "history")); - List expected = Lists.newArrayList( - builder.set("made_current_at", firstSnapshotTimestamp * 1000) - .set("snapshot_id", firstSnapshotId) - .set("parent_id", null) - .set("is_current_ancestor", true) - .build(), - builder.set("made_current_at", secondSnapshotTimestamp * 1000) - .set("snapshot_id", secondSnapshotId) - .set("parent_id", firstSnapshotId) - .set("is_current_ancestor", false) // commit rolled back, not an ancestor of the current table state - .build(), - builder.set("made_current_at", rollbackTimestamp * 1000) - .set("snapshot_id", firstSnapshotId) - .set("parent_id", null) - .set("is_current_ancestor", true) - .build(), - builder.set("made_current_at", thirdSnapshotTimestamp * 1000) - .set("snapshot_id", thirdSnapshotId) - .set("parent_id", firstSnapshotId) - .set("is_current_ancestor", true) - .build() - ); + List actual = + spark + .read() + .format("iceberg") + .load(loadLocation(tableIdentifier, "history")) + .collectAsList(); + + GenericRecordBuilder builder = + new GenericRecordBuilder(AvroSchemaUtil.convert(historyTable.schema(), "history")); + List expected = + Lists.newArrayList( + builder + .set("made_current_at", firstSnapshotTimestamp * 1000) + .set("snapshot_id", firstSnapshotId) + .set("parent_id", null) + .set("is_current_ancestor", true) + .build(), + builder + .set("made_current_at", secondSnapshotTimestamp * 1000) + .set("snapshot_id", secondSnapshotId) + .set("parent_id", firstSnapshotId) + .set( + "is_current_ancestor", + false) // commit rolled back, not an ancestor of the current table state + .build(), + builder + .set("made_current_at", rollbackTimestamp * 1000) + .set("snapshot_id", firstSnapshotId) + .set("parent_id", null) + .set("is_current_ancestor", true) + .build(), + builder + .set("made_current_at", thirdSnapshotTimestamp * 1000) + .set("snapshot_id", thirdSnapshotId) + .set("parent_id", firstSnapshotId) + .set("is_current_ancestor", true) + .build()); Assert.assertEquals("History table should have a row for each commit", 4, actual.size()); TestHelpers.assertEqualsSafe(historyTable.schema().asStruct(), expected.get(0), actual.get(0)); @@ -767,7 +850,9 @@ public void testSnapshotsTable() { List records = Lists.newArrayList(new SimpleRecord(1, "1")); Dataset inputDf = spark.createDataFrame(records, SimpleRecord.class); - inputDf.select("id", "data").write() + inputDf + .select("id", "data") + .write() .format("iceberg") .mode("append") .save(loadLocation(tableIdentifier)); @@ -786,40 +871,47 @@ public void testSnapshotsTable() { // rollback the table state to the first snapshot table.rollback().toSnapshotId(firstSnapshotId).commit(); - List actual = spark.read() - .format("iceberg") - .load(loadLocation(tableIdentifier, "snapshots")) - .collectAsList(); - - GenericRecordBuilder builder = new GenericRecordBuilder(AvroSchemaUtil.convert(snapTable.schema(), "snapshots")); - List expected = Lists.newArrayList( - builder.set("committed_at", firstSnapshotTimestamp * 1000) - .set("snapshot_id", firstSnapshotId) - .set("parent_id", null) - .set("operation", "append") - .set("manifest_list", firstManifestList) - .set("summary", ImmutableMap.of( - "added-records", "1", - "added-data-files", "1", - "changed-partition-count", "1", - "total-data-files", "1", - "total-records", "1" - )) - .build(), - builder.set("committed_at", secondSnapshotTimestamp * 1000) - .set("snapshot_id", secondSnapshotId) - .set("parent_id", firstSnapshotId) - 
.set("operation", "delete") - .set("manifest_list", secondManifestList) - .set("summary", ImmutableMap.of( - "deleted-records", "1", - "deleted-data-files", "1", - "changed-partition-count", "1", - "total-records", "0", - "total-data-files", "0" - )) - .build() - ); + List actual = + spark + .read() + .format("iceberg") + .load(loadLocation(tableIdentifier, "snapshots")) + .collectAsList(); + + GenericRecordBuilder builder = + new GenericRecordBuilder(AvroSchemaUtil.convert(snapTable.schema(), "snapshots")); + List expected = + Lists.newArrayList( + builder + .set("committed_at", firstSnapshotTimestamp * 1000) + .set("snapshot_id", firstSnapshotId) + .set("parent_id", null) + .set("operation", "append") + .set("manifest_list", firstManifestList) + .set( + "summary", + ImmutableMap.of( + "added-records", "1", + "added-data-files", "1", + "changed-partition-count", "1", + "total-data-files", "1", + "total-records", "1")) + .build(), + builder + .set("committed_at", secondSnapshotTimestamp * 1000) + .set("snapshot_id", secondSnapshotId) + .set("parent_id", firstSnapshotId) + .set("operation", "delete") + .set("manifest_list", secondManifestList) + .set( + "summary", + ImmutableMap.of( + "deleted-records", "1", + "deleted-data-files", "1", + "changed-partition-count", "1", + "total-records", "0", + "total-data-files", "0")) + .build()); Assert.assertEquals("Snapshots table should have a row for each snapshot", 2, actual.size()); TestHelpers.assertEqualsSafe(snapTable.schema().asStruct(), expected.get(0), actual.get(0)); @@ -834,7 +926,9 @@ public void testPrunedSnapshotsTable() { List records = Lists.newArrayList(new SimpleRecord(1, "1")); Dataset inputDf = spark.createDataFrame(records, SimpleRecord.class); - inputDf.select("id", "data").write() + inputDf + .select("id", "data") + .write() .format("iceberg") .mode("append") .save(loadLocation(tableIdentifier)); @@ -850,40 +944,47 @@ public void testPrunedSnapshotsTable() { // rollback the table state to the first snapshot table.rollback().toSnapshotId(firstSnapshotId).commit(); - Dataset actualDf = spark.read() - .format("iceberg") - .load(loadLocation(tableIdentifier, "snapshots")) - .select("operation", "committed_at", "summary", "parent_id"); + Dataset actualDf = + spark + .read() + .format("iceberg") + .load(loadLocation(tableIdentifier, "snapshots")) + .select("operation", "committed_at", "summary", "parent_id"); Schema projectedSchema = SparkSchemaUtil.convert(actualDf.schema()); List actual = actualDf.collectAsList(); - GenericRecordBuilder builder = new GenericRecordBuilder(AvroSchemaUtil.convert(projectedSchema, "snapshots")); - List expected = Lists.newArrayList( - builder.set("committed_at", firstSnapshotTimestamp * 1000) - .set("parent_id", null) - .set("operation", "append") - .set("summary", ImmutableMap.of( - "added-records", "1", - "added-data-files", "1", - "changed-partition-count", "1", - "total-data-files", "1", - "total-records", "1" - )) - .build(), - builder.set("committed_at", secondSnapshotTimestamp * 1000) - .set("parent_id", firstSnapshotId) - .set("operation", "delete") - .set("summary", ImmutableMap.of( - "deleted-records", "1", - "deleted-data-files", "1", - "changed-partition-count", "1", - "total-records", "0", - "total-data-files", "0" - )) - .build() - ); + GenericRecordBuilder builder = + new GenericRecordBuilder(AvroSchemaUtil.convert(projectedSchema, "snapshots")); + List expected = + Lists.newArrayList( + builder + .set("committed_at", firstSnapshotTimestamp * 1000) + .set("parent_id", null) + 
.set("operation", "append") + .set( + "summary", + ImmutableMap.of( + "added-records", "1", + "added-data-files", "1", + "changed-partition-count", "1", + "total-data-files", "1", + "total-records", "1")) + .build(), + builder + .set("committed_at", secondSnapshotTimestamp * 1000) + .set("parent_id", firstSnapshotId) + .set("operation", "delete") + .set( + "summary", + ImmutableMap.of( + "deleted-records", "1", + "deleted-data-files", "1", + "changed-partition-count", "1", + "total-records", "0", + "total-data-files", "0")) + .build()); Assert.assertEquals("Snapshots table should have a row for each snapshot", 2, actual.size()); TestHelpers.assertEqualsSafe(projectedSchema.asStruct(), expected.get(0), actual.get(0)); @@ -895,65 +996,88 @@ public void testManifestsTable() { TableIdentifier tableIdentifier = TableIdentifier.of("db", "manifests_test"); Table table = createTable(tableIdentifier, SCHEMA, SPEC); Table manifestTable = loadTable(tableIdentifier, "manifests"); - Dataset df1 = spark.createDataFrame( - Lists.newArrayList(new SimpleRecord(1, "a"), new SimpleRecord(null, "b")), SimpleRecord.class); + Dataset df1 = + spark.createDataFrame( + Lists.newArrayList(new SimpleRecord(1, "a"), new SimpleRecord(null, "b")), + SimpleRecord.class); - df1.select("id", "data").write() + df1.select("id", "data") + .write() .format("iceberg") .mode("append") .save(loadLocation(tableIdentifier)); - table.updateProperties() - .set(TableProperties.FORMAT_VERSION, "2") - .commit(); + table.updateProperties().set(TableProperties.FORMAT_VERSION, "2").commit(); - DataFile dataFile = Iterables.getFirst(table.currentSnapshot().addedDataFiles(table.io()), null); + DataFile dataFile = + Iterables.getFirst(table.currentSnapshot().addedDataFiles(table.io()), null); PartitionSpec dataFileSpec = table.specs().get(dataFile.specId()); StructLike dataFilePartition = dataFile.partition(); PositionDelete delete = PositionDelete.create(); delete.set(dataFile.path(), 0L, null); - DeleteFile deleteFile = writePositionDeletes(table, dataFileSpec, dataFilePartition, ImmutableList.of(delete)); + DeleteFile deleteFile = + writePositionDeletes(table, dataFileSpec, dataFilePartition, ImmutableList.of(delete)); - table.newRowDelta() - .addDeletes(deleteFile) - .commit(); + table.newRowDelta().addDeletes(deleteFile).commit(); - List actual = spark.read() - .format("iceberg") - .load(loadLocation(tableIdentifier, "manifests")) - .collectAsList(); + List actual = + spark + .read() + .format("iceberg") + .load(loadLocation(tableIdentifier, "manifests")) + .collectAsList(); table.refresh(); - GenericRecordBuilder builder = new GenericRecordBuilder(AvroSchemaUtil.convert( - manifestTable.schema(), "manifests")); - GenericRecordBuilder summaryBuilder = new GenericRecordBuilder(AvroSchemaUtil.convert( - manifestTable.schema().findType("partition_summaries.element").asStructType(), "partition_summary")); - List expected = Lists.transform(table.currentSnapshot().allManifests(table.io()), manifest -> - builder - .set("content", manifest.content().id()) - .set("path", manifest.path()) - .set("length", manifest.length()) - .set("partition_spec_id", manifest.partitionSpecId()) - .set("added_snapshot_id", manifest.snapshotId()) - .set("added_data_files_count", manifest.content() == DATA ? manifest.addedFilesCount() : 0) - .set("existing_data_files_count", manifest.content() == DATA ? manifest.existingFilesCount() : 0) - .set("deleted_data_files_count", manifest.content() == DATA ? 
manifest.deletedFilesCount() : 0) - .set("added_delete_files_count", manifest.content() == DELETES ? manifest.addedFilesCount() : 0) - .set("existing_delete_files_count", manifest.content() == DELETES ? manifest.existingFilesCount() : 0) - .set("deleted_delete_files_count", manifest.content() == DELETES ? manifest.deletedFilesCount() : 0) - .set("partition_summaries", Lists.transform(manifest.partitions(), partition -> - summaryBuilder - .set("contains_null", manifest.content() == DATA) - .set("contains_nan", false) - .set("lower_bound", "1") - .set("upper_bound", "1") - .build() - )) - .build() - ); + GenericRecordBuilder builder = + new GenericRecordBuilder(AvroSchemaUtil.convert(manifestTable.schema(), "manifests")); + GenericRecordBuilder summaryBuilder = + new GenericRecordBuilder( + AvroSchemaUtil.convert( + manifestTable.schema().findType("partition_summaries.element").asStructType(), + "partition_summary")); + List expected = + Lists.transform( + table.currentSnapshot().allManifests(table.io()), + manifest -> + builder + .set("content", manifest.content().id()) + .set("path", manifest.path()) + .set("length", manifest.length()) + .set("partition_spec_id", manifest.partitionSpecId()) + .set("added_snapshot_id", manifest.snapshotId()) + .set( + "added_data_files_count", + manifest.content() == DATA ? manifest.addedFilesCount() : 0) + .set( + "existing_data_files_count", + manifest.content() == DATA ? manifest.existingFilesCount() : 0) + .set( + "deleted_data_files_count", + manifest.content() == DATA ? manifest.deletedFilesCount() : 0) + .set( + "added_delete_files_count", + manifest.content() == DELETES ? manifest.addedFilesCount() : 0) + .set( + "existing_delete_files_count", + manifest.content() == DELETES ? manifest.existingFilesCount() : 0) + .set( + "deleted_delete_files_count", + manifest.content() == DELETES ? 
manifest.deletedFilesCount() : 0) + .set( + "partition_summaries", + Lists.transform( + manifest.partitions(), + partition -> + summaryBuilder + .set("contains_null", manifest.content() == DATA) + .set("contains_nan", false) + .set("lower_bound", "1") + .set("upper_bound", "1") + .build())) + .build()); Assert.assertEquals("Manifests table should have two manifest rows", 2, actual.size()); TestHelpers.assertEqualsSafe(manifestTable.schema().asStruct(), expected.get(0), actual.get(0)); @@ -965,56 +1089,77 @@ public void testPruneManifestsTable() { TableIdentifier tableIdentifier = TableIdentifier.of("db", "manifests_test"); Table table = createTable(tableIdentifier, SCHEMA, SPEC); Table manifestTable = loadTable(tableIdentifier, "manifests"); - Dataset df1 = spark.createDataFrame( - Lists.newArrayList(new SimpleRecord(1, "a"), new SimpleRecord(null, "b")), SimpleRecord.class); + Dataset df1 = + spark.createDataFrame( + Lists.newArrayList(new SimpleRecord(1, "a"), new SimpleRecord(null, "b")), + SimpleRecord.class); - df1.select("id", "data").write() + df1.select("id", "data") + .write() .format("iceberg") .mode("append") .save(loadLocation(tableIdentifier)); if (!spark.version().startsWith("2")) { // Spark 2 isn't able to actually push down nested struct projections so this will not break - AssertHelpers.assertThrows("Can't prune struct inside list", SparkException.class, + AssertHelpers.assertThrows( + "Can't prune struct inside list", + SparkException.class, "Cannot project a partial list element struct", - () -> spark.read() - .format("iceberg") - .load(loadLocation(tableIdentifier, "manifests")) - .select("partition_spec_id", "path", "partition_summaries.contains_null") - .collectAsList()); + () -> + spark + .read() + .format("iceberg") + .load(loadLocation(tableIdentifier, "manifests")) + .select("partition_spec_id", "path", "partition_summaries.contains_null") + .collectAsList()); } - Dataset actualDf = spark.read() - .format("iceberg") - .load(loadLocation(tableIdentifier, "manifests")) - .select("partition_spec_id", "path", "partition_summaries"); + Dataset actualDf = + spark + .read() + .format("iceberg") + .load(loadLocation(tableIdentifier, "manifests")) + .select("partition_spec_id", "path", "partition_summaries"); Schema projectedSchema = SparkSchemaUtil.convert(actualDf.schema()); - List actual = spark.read() - .format("iceberg") - .load(loadLocation(tableIdentifier, "manifests")) - .select("partition_spec_id", "path", "partition_summaries") - .collectAsList(); + List actual = + spark + .read() + .format("iceberg") + .load(loadLocation(tableIdentifier, "manifests")) + .select("partition_spec_id", "path", "partition_summaries") + .collectAsList(); table.refresh(); - GenericRecordBuilder builder = new GenericRecordBuilder(AvroSchemaUtil.convert(projectedSchema.asStruct())); - GenericRecordBuilder summaryBuilder = new GenericRecordBuilder(AvroSchemaUtil.convert( - projectedSchema.findType("partition_summaries.element").asStructType(), "partition_summary")); - List expected = Lists.transform(table.currentSnapshot().allManifests(table.io()), manifest -> - builder.set("partition_spec_id", manifest.partitionSpecId()) - .set("path", manifest.path()) - .set("partition_summaries", Lists.transform(manifest.partitions(), partition -> - summaryBuilder - .set("contains_null", true) - .set("contains_nan", false) - .set("lower_bound", "1") - .set("upper_bound", "1") - .build() - )) - .build() - ); + GenericRecordBuilder builder = + new 
GenericRecordBuilder(AvroSchemaUtil.convert(projectedSchema.asStruct())); + GenericRecordBuilder summaryBuilder = + new GenericRecordBuilder( + AvroSchemaUtil.convert( + projectedSchema.findType("partition_summaries.element").asStructType(), + "partition_summary")); + List expected = + Lists.transform( + table.currentSnapshot().allManifests(table.io()), + manifest -> + builder + .set("partition_spec_id", manifest.partitionSpecId()) + .set("path", manifest.path()) + .set( + "partition_summaries", + Lists.transform( + manifest.partitions(), + partition -> + summaryBuilder + .set("contains_null", true) + .set("contains_nan", false) + .set("lower_bound", "1") + .set("upper_bound", "1") + .build())) + .build()); Assert.assertEquals("Manifests table should have one manifest row", 1, actual.size()); TestHelpers.assertEqualsSafe(projectedSchema.asStruct(), expected.get(0), actual.get(0)); @@ -1025,53 +1170,62 @@ public void testAllManifestsTable() { TableIdentifier tableIdentifier = TableIdentifier.of("db", "manifests_test"); Table table = createTable(tableIdentifier, SCHEMA, SPEC); Table manifestTable = loadTable(tableIdentifier, "all_manifests"); - Dataset df1 = spark.createDataFrame(Lists.newArrayList(new SimpleRecord(1, "a")), SimpleRecord.class); + Dataset df1 = + spark.createDataFrame(Lists.newArrayList(new SimpleRecord(1, "a")), SimpleRecord.class); - df1.select("id", "data").write() + df1.select("id", "data") + .write() .format("iceberg") .mode("append") .save(loadLocation(tableIdentifier)); - table.updateProperties() - .set(TableProperties.FORMAT_VERSION, "2") - .commit(); + table.updateProperties().set(TableProperties.FORMAT_VERSION, "2").commit(); - DataFile dataFile = Iterables.getFirst(table.currentSnapshot().addedDataFiles(table.io()), null); + DataFile dataFile = + Iterables.getFirst(table.currentSnapshot().addedDataFiles(table.io()), null); PartitionSpec dataFileSpec = table.specs().get(dataFile.specId()); StructLike dataFilePartition = dataFile.partition(); PositionDelete delete = PositionDelete.create(); delete.set(dataFile.path(), 0L, null); - DeleteFile deleteFile = writePositionDeletes(table, dataFileSpec, dataFilePartition, ImmutableList.of(delete)); + DeleteFile deleteFile = + writePositionDeletes(table, dataFileSpec, dataFilePartition, ImmutableList.of(delete)); - table.newRowDelta() - .addDeletes(deleteFile) - .commit(); + table.newRowDelta().addDeletes(deleteFile).commit(); table.newDelete().deleteFromRowFilter(Expressions.alwaysTrue()).commit(); Stream> snapshotIdToManifests = StreamSupport.stream(table.snapshots().spliterator(), false) - .flatMap(snapshot -> snapshot.allManifests(table.io()).stream().map( - manifest -> Pair.of(snapshot.snapshotId(), manifest))); - - List actual = spark.read() - .format("iceberg") - .load(loadLocation(tableIdentifier, "all_manifests")) - .orderBy("path") - .collectAsList(); + .flatMap( + snapshot -> + snapshot.allManifests(table.io()).stream() + .map(manifest -> Pair.of(snapshot.snapshotId(), manifest))); + + List actual = + spark + .read() + .format("iceberg") + .load(loadLocation(tableIdentifier, "all_manifests")) + .orderBy("path") + .collectAsList(); table.refresh(); - List expected = snapshotIdToManifests - .map(snapshotManifest -> manifestRecord(manifestTable, snapshotManifest.first(), snapshotManifest.second())) - .collect(Collectors.toList()); + List expected = + snapshotIdToManifests + .map( + snapshotManifest -> + manifestRecord( + manifestTable, snapshotManifest.first(), snapshotManifest.second())) + 
.collect(Collectors.toList()); expected.sort(Comparator.comparing(o -> o.get("path").toString())); Assert.assertEquals("Manifests table should have 5 manifest rows", 5, actual.size()); for (int i = 0; i < expected.size(); i += 1) { - TestHelpers.assertEqualsSafe(manifestTable.schema().asStruct(), expected.get(i), actual.get(i)); + TestHelpers.assertEqualsSafe( + manifestTable.schema().asStruct(), expected.get(i), actual.get(i)); } } @@ -1080,33 +1234,37 @@ public void testUnpartitionedPartitionsTable() { TableIdentifier tableIdentifier = TableIdentifier.of("db", "unpartitioned_partitions_test"); createTable(tableIdentifier, SCHEMA, PartitionSpec.unpartitioned()); - Dataset df = spark.createDataFrame(Lists.newArrayList(new SimpleRecord(1, "a")), SimpleRecord.class); + Dataset df = + spark.createDataFrame(Lists.newArrayList(new SimpleRecord(1, "a")), SimpleRecord.class); - df.select("id", "data").write() + df.select("id", "data") + .write() .format("iceberg") .mode("append") .save(loadLocation(tableIdentifier)); - Types.StructType expectedSchema = Types.StructType.of( - required(2, "record_count", Types.LongType.get()), - required(3, "file_count", Types.IntegerType.get())); + Types.StructType expectedSchema = + Types.StructType.of( + required(2, "record_count", Types.LongType.get()), + required(3, "file_count", Types.IntegerType.get())); Table partitionsTable = loadTable(tableIdentifier, "partitions"); - Assert.assertEquals("Schema should not have partition field", - expectedSchema, partitionsTable.schema().asStruct()); + Assert.assertEquals( + "Schema should not have partition field", + expectedSchema, + partitionsTable.schema().asStruct()); - GenericRecordBuilder builder = new GenericRecordBuilder(AvroSchemaUtil.convert( - partitionsTable.schema(), "partitions")); - GenericData.Record expectedRow = builder - .set("record_count", 1L) - .set("file_count", 1) - .build(); + GenericRecordBuilder builder = + new GenericRecordBuilder(AvroSchemaUtil.convert(partitionsTable.schema(), "partitions")); + GenericData.Record expectedRow = builder.set("record_count", 1L).set("file_count", 1).build(); - List actual = spark.read() - .format("iceberg") - .load(loadLocation(tableIdentifier, "partitions")) - .collectAsList(); + List actual = + spark + .read() + .format("iceberg") + .load(loadLocation(tableIdentifier, "partitions")) + .collectAsList(); Assert.assertEquals("Unpartitioned partitions table should have one row", 1, actual.size()); TestHelpers.assertEqualsSafe(expectedSchema, expectedRow, actual.get(0)); @@ -1117,10 +1275,13 @@ public void testPartitionsTable() { TableIdentifier tableIdentifier = TableIdentifier.of("db", "partitions_test"); Table table = createTable(tableIdentifier, SCHEMA, SPEC); Table partitionsTable = loadTable(tableIdentifier, "partitions"); - Dataset df1 = spark.createDataFrame(Lists.newArrayList(new SimpleRecord(1, "a")), SimpleRecord.class); - Dataset df2 = spark.createDataFrame(Lists.newArrayList(new SimpleRecord(2, "b")), SimpleRecord.class); + Dataset df1 = + spark.createDataFrame(Lists.newArrayList(new SimpleRecord(1, "a")), SimpleRecord.class); + Dataset df2 = + spark.createDataFrame(Lists.newArrayList(new SimpleRecord(2, "b")), SimpleRecord.class); - df1.select("id", "data").write() + df1.select("id", "data") + .write() .format("iceberg") .mode("append") .save(loadLocation(tableIdentifier)); @@ -1129,69 +1290,86 @@ public void testPartitionsTable() { long firstCommitId = table.currentSnapshot().snapshotId(); // add a second file - df2.select("id", "data").write() + 
df2.select("id", "data") + .write() .format("iceberg") .mode("append") .save(loadLocation(tableIdentifier)); - List actual = spark.read() - .format("iceberg") - .load(loadLocation(tableIdentifier, "partitions")) - .orderBy("partition.id") - .collectAsList(); - - GenericRecordBuilder builder = new GenericRecordBuilder(AvroSchemaUtil.convert( - partitionsTable.schema(), "partitions")); - GenericRecordBuilder partitionBuilder = new GenericRecordBuilder(AvroSchemaUtil.convert( - partitionsTable.schema().findType("partition").asStructType(), "partition")); + List actual = + spark + .read() + .format("iceberg") + .load(loadLocation(tableIdentifier, "partitions")) + .orderBy("partition.id") + .collectAsList(); + + GenericRecordBuilder builder = + new GenericRecordBuilder(AvroSchemaUtil.convert(partitionsTable.schema(), "partitions")); + GenericRecordBuilder partitionBuilder = + new GenericRecordBuilder( + AvroSchemaUtil.convert( + partitionsTable.schema().findType("partition").asStructType(), "partition")); List expected = Lists.newArrayList(); - expected.add(builder - .set("partition", partitionBuilder.set("id", 1).build()) - .set("record_count", 1L) - .set("file_count", 1) - .set("spec_id", 0) - .build()); - expected.add(builder - .set("partition", partitionBuilder.set("id", 2).build()) - .set("record_count", 1L) - .set("file_count", 1) - .set("spec_id", 0) - .build()); + expected.add( + builder + .set("partition", partitionBuilder.set("id", 1).build()) + .set("record_count", 1L) + .set("file_count", 1) + .set("spec_id", 0) + .build()); + expected.add( + builder + .set("partition", partitionBuilder.set("id", 2).build()) + .set("record_count", 1L) + .set("file_count", 1) + .set("spec_id", 0) + .build()); Assert.assertEquals("Partitions table should have two rows", 2, expected.size()); Assert.assertEquals("Actual results should have two rows", 2, actual.size()); for (int i = 0; i < 2; i += 1) { - TestHelpers.assertEqualsSafe(partitionsTable.schema().asStruct(), expected.get(i), actual.get(i)); + TestHelpers.assertEqualsSafe( + partitionsTable.schema().asStruct(), expected.get(i), actual.get(i)); } // check time travel - List actualAfterFirstCommit = spark.read() - .format("iceberg") - .option(SparkReadOptions.SNAPSHOT_ID, String.valueOf(firstCommitId)) - .load(loadLocation(tableIdentifier, "partitions")) - .orderBy("partition.id") - .collectAsList(); + List actualAfterFirstCommit = + spark + .read() + .format("iceberg") + .option(SparkReadOptions.SNAPSHOT_ID, String.valueOf(firstCommitId)) + .load(loadLocation(tableIdentifier, "partitions")) + .orderBy("partition.id") + .collectAsList(); Assert.assertEquals("Actual results should have one row", 1, actualAfterFirstCommit.size()); - TestHelpers.assertEqualsSafe(partitionsTable.schema().asStruct(), expected.get(0), actualAfterFirstCommit.get(0)); + TestHelpers.assertEqualsSafe( + partitionsTable.schema().asStruct(), expected.get(0), actualAfterFirstCommit.get(0)); // check predicate push down - List filtered = spark.read() - .format("iceberg") - .load(loadLocation(tableIdentifier, "partitions")) - .filter("partition.id < 2") - .collectAsList(); + List filtered = + spark + .read() + .format("iceberg") + .load(loadLocation(tableIdentifier, "partitions")) + .filter("partition.id < 2") + .collectAsList(); Assert.assertEquals("Actual results should have one row", 1, filtered.size()); - TestHelpers.assertEqualsSafe(partitionsTable.schema().asStruct(), expected.get(0), filtered.get(0)); - - List nonFiltered = spark.read() - .format("iceberg") - 
.load(loadLocation(tableIdentifier, "partitions")) - .filter("partition.id < 2 or record_count=1") - .collectAsList(); + TestHelpers.assertEqualsSafe( + partitionsTable.schema().asStruct(), expected.get(0), filtered.get(0)); + + List nonFiltered = + spark + .read() + .format("iceberg") + .load(loadLocation(tableIdentifier, "partitions")) + .filter("partition.id < 2 or record_count=1") + .collectAsList(); Assert.assertEquals("Actual results should have one row", 2, nonFiltered.size()); for (int i = 0; i < 2; i += 1) { - TestHelpers.assertEqualsSafe(partitionsTable.schema().asStruct(), expected.get(i), actual.get(i)); + TestHelpers.assertEqualsSafe( + partitionsTable.schema().asStruct(), expected.get(i), actual.get(i)); } } @@ -1200,62 +1378,63 @@ public synchronized void testSnapshotReadAfterAddColumn() { TableIdentifier tableIdentifier = TableIdentifier.of("db", "table"); Table table = createTable(tableIdentifier, SCHEMA, PartitionSpec.unpartitioned()); - List originalRecords = Lists.newArrayList( - RowFactory.create(1, "x"), - RowFactory.create(2, "y"), - RowFactory.create(3, "z")); + List originalRecords = + Lists.newArrayList( + RowFactory.create(1, "x"), RowFactory.create(2, "y"), RowFactory.create(3, "z")); StructType originalSparkSchema = SparkSchemaUtil.convert(SCHEMA); Dataset inputDf = spark.createDataFrame(originalRecords, originalSparkSchema); - inputDf.select("id", "data").write() + inputDf + .select("id", "data") + .write() .format("iceberg") .mode(SaveMode.Append) .save(loadLocation(tableIdentifier)); table.refresh(); - Dataset resultDf = spark.read() - .format("iceberg") - .load(loadLocation(tableIdentifier)); - Assert.assertEquals("Records should match", originalRecords, - resultDf.orderBy("id").collectAsList()); + Dataset resultDf = spark.read().format("iceberg").load(loadLocation(tableIdentifier)); + Assert.assertEquals( + "Records should match", originalRecords, resultDf.orderBy("id").collectAsList()); Snapshot snapshotBeforeAddColumn = table.currentSnapshot(); table.updateSchema().addColumn("category", Types.StringType.get()).commit(); - List newRecords = Lists.newArrayList( - RowFactory.create(4, "xy", "B"), - RowFactory.create(5, "xyz", "C")); + List newRecords = + Lists.newArrayList(RowFactory.create(4, "xy", "B"), RowFactory.create(5, "xyz", "C")); StructType newSparkSchema = SparkSchemaUtil.convert(SCHEMA2); Dataset inputDf2 = spark.createDataFrame(newRecords, newSparkSchema); - inputDf2.select("id", "data", "category").write() + inputDf2 + .select("id", "data", "category") + .write() .format("iceberg") .mode(SaveMode.Append) .save(loadLocation(tableIdentifier)); table.refresh(); - List updatedRecords = Lists.newArrayList( - RowFactory.create(1, "x", null), - RowFactory.create(2, "y", null), - RowFactory.create(3, "z", null), - RowFactory.create(4, "xy", "B"), - RowFactory.create(5, "xyz", "C")); - - Dataset resultDf2 = spark.read() - .format("iceberg") - .load(loadLocation(tableIdentifier)); - Assert.assertEquals("Records should match", updatedRecords, - resultDf2.orderBy("id").collectAsList()); - - Dataset resultDf3 = spark.read() - .format("iceberg") - .option(SparkReadOptions.SNAPSHOT_ID, snapshotBeforeAddColumn.snapshotId()) - .load(loadLocation(tableIdentifier)); - Assert.assertEquals("Records should match", originalRecords, - resultDf3.orderBy("id").collectAsList()); + List updatedRecords = + Lists.newArrayList( + RowFactory.create(1, "x", null), + RowFactory.create(2, "y", null), + RowFactory.create(3, "z", null), + RowFactory.create(4, "xy", "B"), + 
RowFactory.create(5, "xyz", "C")); + + Dataset resultDf2 = spark.read().format("iceberg").load(loadLocation(tableIdentifier)); + Assert.assertEquals( + "Records should match", updatedRecords, resultDf2.orderBy("id").collectAsList()); + + Dataset resultDf3 = + spark + .read() + .format("iceberg") + .option(SparkReadOptions.SNAPSHOT_ID, snapshotBeforeAddColumn.snapshotId()) + .load(loadLocation(tableIdentifier)); + Assert.assertEquals( + "Records should match", originalRecords, resultDf3.orderBy("id").collectAsList()); Assert.assertEquals("Schemas should match", originalSparkSchema, resultDf3.schema()); } @@ -1264,72 +1443,76 @@ public synchronized void testSnapshotReadAfterDropColumn() { TableIdentifier tableIdentifier = TableIdentifier.of("db", "table"); Table table = createTable(tableIdentifier, SCHEMA2, PartitionSpec.unpartitioned()); - List originalRecords = Lists.newArrayList( - RowFactory.create(1, "x", "A"), - RowFactory.create(2, "y", "A"), - RowFactory.create(3, "z", "B")); + List originalRecords = + Lists.newArrayList( + RowFactory.create(1, "x", "A"), + RowFactory.create(2, "y", "A"), + RowFactory.create(3, "z", "B")); StructType originalSparkSchema = SparkSchemaUtil.convert(SCHEMA2); Dataset inputDf = spark.createDataFrame(originalRecords, originalSparkSchema); - inputDf.select("id", "data", "category").write() + inputDf + .select("id", "data", "category") + .write() .format("iceberg") .mode(SaveMode.Append) .save(loadLocation(tableIdentifier)); table.refresh(); - Dataset resultDf = spark.read() - .format("iceberg") - .load(loadLocation(tableIdentifier)); - Assert.assertEquals("Records should match", originalRecords, - resultDf.orderBy("id").collectAsList()); + Dataset resultDf = spark.read().format("iceberg").load(loadLocation(tableIdentifier)); + Assert.assertEquals( + "Records should match", originalRecords, resultDf.orderBy("id").collectAsList()); long tsBeforeDropColumn = waitUntilAfter(System.currentTimeMillis()); table.updateSchema().deleteColumn("data").commit(); long tsAfterDropColumn = waitUntilAfter(System.currentTimeMillis()); - List newRecords = Lists.newArrayList( - RowFactory.create(4, "B"), - RowFactory.create(5, "C")); + List newRecords = Lists.newArrayList(RowFactory.create(4, "B"), RowFactory.create(5, "C")); StructType newSparkSchema = SparkSchemaUtil.convert(SCHEMA3); Dataset inputDf2 = spark.createDataFrame(newRecords, newSparkSchema); - inputDf2.select("id", "category").write() + inputDf2 + .select("id", "category") + .write() .format("iceberg") .mode(SaveMode.Append) .save(loadLocation(tableIdentifier)); table.refresh(); - List updatedRecords = Lists.newArrayList( - RowFactory.create(1, "A"), - RowFactory.create(2, "A"), - RowFactory.create(3, "B"), - RowFactory.create(4, "B"), - RowFactory.create(5, "C")); - - Dataset resultDf2 = spark.read() - .format("iceberg") - .load(loadLocation(tableIdentifier)); - Assert.assertEquals("Records should match", updatedRecords, - resultDf2.orderBy("id").collectAsList()); - - Dataset resultDf3 = spark.read() - .format("iceberg") - .option(SparkReadOptions.AS_OF_TIMESTAMP, tsBeforeDropColumn) - .load(loadLocation(tableIdentifier)); - Assert.assertEquals("Records should match", originalRecords, - resultDf3.orderBy("id").collectAsList()); + List updatedRecords = + Lists.newArrayList( + RowFactory.create(1, "A"), + RowFactory.create(2, "A"), + RowFactory.create(3, "B"), + RowFactory.create(4, "B"), + RowFactory.create(5, "C")); + + Dataset resultDf2 = spark.read().format("iceberg").load(loadLocation(tableIdentifier)); + 
Assert.assertEquals( + "Records should match", updatedRecords, resultDf2.orderBy("id").collectAsList()); + + Dataset resultDf3 = + spark + .read() + .format("iceberg") + .option(SparkReadOptions.AS_OF_TIMESTAMP, tsBeforeDropColumn) + .load(loadLocation(tableIdentifier)); + Assert.assertEquals( + "Records should match", originalRecords, resultDf3.orderBy("id").collectAsList()); Assert.assertEquals("Schemas should match", originalSparkSchema, resultDf3.schema()); // At tsAfterDropColumn, there has been a schema change, but no new snapshot, // so the snapshot as of tsAfterDropColumn is the same as that as of tsBeforeDropColumn. - Dataset resultDf4 = spark.read() - .format("iceberg") - .option(SparkReadOptions.AS_OF_TIMESTAMP, tsAfterDropColumn) - .load(loadLocation(tableIdentifier)); - Assert.assertEquals("Records should match", originalRecords, - resultDf4.orderBy("id").collectAsList()); + Dataset resultDf4 = + spark + .read() + .format("iceberg") + .option(SparkReadOptions.AS_OF_TIMESTAMP, tsAfterDropColumn) + .load(loadLocation(tableIdentifier)); + Assert.assertEquals( + "Records should match", originalRecords, resultDf4.orderBy("id").collectAsList()); Assert.assertEquals("Schemas should match", originalSparkSchema, resultDf4.schema()); } @@ -1338,77 +1521,77 @@ public synchronized void testSnapshotReadAfterAddAndDropColumn() { TableIdentifier tableIdentifier = TableIdentifier.of("db", "table"); Table table = createTable(tableIdentifier, SCHEMA, PartitionSpec.unpartitioned()); - List originalRecords = Lists.newArrayList( - RowFactory.create(1, "x"), - RowFactory.create(2, "y"), - RowFactory.create(3, "z")); + List originalRecords = + Lists.newArrayList( + RowFactory.create(1, "x"), RowFactory.create(2, "y"), RowFactory.create(3, "z")); StructType originalSparkSchema = SparkSchemaUtil.convert(SCHEMA); Dataset inputDf = spark.createDataFrame(originalRecords, originalSparkSchema); - inputDf.select("id", "data").write() + inputDf + .select("id", "data") + .write() .format("iceberg") .mode(SaveMode.Append) .save(loadLocation(tableIdentifier)); table.refresh(); - Dataset resultDf = spark.read() - .format("iceberg") - .load(loadLocation(tableIdentifier)); - Assert.assertEquals("Records should match", originalRecords, - resultDf.orderBy("id").collectAsList()); + Dataset resultDf = spark.read().format("iceberg").load(loadLocation(tableIdentifier)); + Assert.assertEquals( + "Records should match", originalRecords, resultDf.orderBy("id").collectAsList()); Snapshot snapshotBeforeAddColumn = table.currentSnapshot(); table.updateSchema().addColumn("category", Types.StringType.get()).commit(); - List newRecords = Lists.newArrayList( - RowFactory.create(4, "xy", "B"), - RowFactory.create(5, "xyz", "C")); + List newRecords = + Lists.newArrayList(RowFactory.create(4, "xy", "B"), RowFactory.create(5, "xyz", "C")); StructType sparkSchemaAfterAddColumn = SparkSchemaUtil.convert(SCHEMA2); Dataset inputDf2 = spark.createDataFrame(newRecords, sparkSchemaAfterAddColumn); - inputDf2.select("id", "data", "category").write() + inputDf2 + .select("id", "data", "category") + .write() .format("iceberg") .mode(SaveMode.Append) .save(loadLocation(tableIdentifier)); table.refresh(); - List updatedRecords = Lists.newArrayList( - RowFactory.create(1, "x", null), - RowFactory.create(2, "y", null), - RowFactory.create(3, "z", null), - RowFactory.create(4, "xy", "B"), - RowFactory.create(5, "xyz", "C")); + List updatedRecords = + Lists.newArrayList( + RowFactory.create(1, "x", null), + RowFactory.create(2, "y", null), + 
RowFactory.create(3, "z", null), + RowFactory.create(4, "xy", "B"), + RowFactory.create(5, "xyz", "C")); - Dataset resultDf2 = spark.read() - .format("iceberg") - .load(loadLocation(tableIdentifier)); - Assert.assertEquals("Records should match", updatedRecords, - resultDf2.orderBy("id").collectAsList()); + Dataset resultDf2 = spark.read().format("iceberg").load(loadLocation(tableIdentifier)); + Assert.assertEquals( + "Records should match", updatedRecords, resultDf2.orderBy("id").collectAsList()); table.updateSchema().deleteColumn("data").commit(); - List recordsAfterDropColumn = Lists.newArrayList( - RowFactory.create(1, null), - RowFactory.create(2, null), - RowFactory.create(3, null), - RowFactory.create(4, "B"), - RowFactory.create(5, "C")); - - Dataset resultDf3 = spark.read() - .format("iceberg") - .load(loadLocation(tableIdentifier)); - Assert.assertEquals("Records should match", recordsAfterDropColumn, - resultDf3.orderBy("id").collectAsList()); - - Dataset resultDf4 = spark.read() - .format("iceberg") - .option(SparkReadOptions.SNAPSHOT_ID, snapshotBeforeAddColumn.snapshotId()) - .load(loadLocation(tableIdentifier)); - Assert.assertEquals("Records should match", originalRecords, - resultDf4.orderBy("id").collectAsList()); + List recordsAfterDropColumn = + Lists.newArrayList( + RowFactory.create(1, null), + RowFactory.create(2, null), + RowFactory.create(3, null), + RowFactory.create(4, "B"), + RowFactory.create(5, "C")); + + Dataset resultDf3 = spark.read().format("iceberg").load(loadLocation(tableIdentifier)); + Assert.assertEquals( + "Records should match", recordsAfterDropColumn, resultDf3.orderBy("id").collectAsList()); + + Dataset resultDf4 = + spark + .read() + .format("iceberg") + .option(SparkReadOptions.SNAPSHOT_ID, snapshotBeforeAddColumn.snapshotId()) + .load(loadLocation(tableIdentifier)); + Assert.assertEquals( + "Records should match", originalRecords, resultDf4.orderBy("id").collectAsList()); Assert.assertEquals("Schemas should match", originalSparkSchema, resultDf4.schema()); } @@ -1417,13 +1600,12 @@ public void testRemoveOrphanFilesActionSupport() throws InterruptedException { TableIdentifier tableIdentifier = TableIdentifier.of("db", "table"); Table table = createTable(tableIdentifier, SCHEMA, PartitionSpec.unpartitioned()); - List records = Lists.newArrayList( - new SimpleRecord(1, "1") - ); + List records = Lists.newArrayList(new SimpleRecord(1, "1")); Dataset df = spark.createDataFrame(records, SimpleRecord.class); - df.select("id", "data").write() + df.select("id", "data") + .write() .format("iceberg") .mode("append") .save(loadLocation(tableIdentifier)); @@ -1435,36 +1617,42 @@ public void testRemoveOrphanFilesActionSupport() throws InterruptedException { SparkActions actions = SparkActions.get(); - DeleteOrphanFiles.Result result1 = actions.deleteOrphanFiles(table) - .location(table.location() + "/metadata") - .olderThan(System.currentTimeMillis()) - .execute(); - Assert.assertTrue("Should not delete any metadata files", Iterables.isEmpty(result1.orphanFileLocations())); + DeleteOrphanFiles.Result result1 = + actions + .deleteOrphanFiles(table) + .location(table.location() + "/metadata") + .olderThan(System.currentTimeMillis()) + .execute(); + Assert.assertTrue( + "Should not delete any metadata files", Iterables.isEmpty(result1.orphanFileLocations())); - DeleteOrphanFiles.Result result2 = actions.deleteOrphanFiles(table) - .olderThan(System.currentTimeMillis()) - .execute(); - Assert.assertEquals("Should delete 1 data file", 1, 
Iterables.size(result2.orphanFileLocations())); + DeleteOrphanFiles.Result result2 = + actions.deleteOrphanFiles(table).olderThan(System.currentTimeMillis()).execute(); + Assert.assertEquals( + "Should delete 1 data file", 1, Iterables.size(result2.orphanFileLocations())); Dataset resultDF = spark.read().format("iceberg").load(loadLocation(tableIdentifier)); - List actualRecords = resultDF - .as(Encoders.bean(SimpleRecord.class)) - .collectAsList(); + List actualRecords = + resultDF.as(Encoders.bean(SimpleRecord.class)).collectAsList(); Assert.assertEquals("Rows must match", records, actualRecords); } - @Test public void testFilesTablePartitionId() throws Exception { TableIdentifier tableIdentifier = TableIdentifier.of("db", "files_test"); - Table table = createTable(tableIdentifier, SCHEMA, PartitionSpec.builderFor(SCHEMA).identity("id").build()); + Table table = + createTable( + tableIdentifier, SCHEMA, PartitionSpec.builderFor(SCHEMA).identity("id").build()); int spec0 = table.spec().specId(); - Dataset df1 = spark.createDataFrame(Lists.newArrayList(new SimpleRecord(1, "a")), SimpleRecord.class); - Dataset df2 = spark.createDataFrame(Lists.newArrayList(new SimpleRecord(2, "b")), SimpleRecord.class); + Dataset df1 = + spark.createDataFrame(Lists.newArrayList(new SimpleRecord(1, "a")), SimpleRecord.class); + Dataset df2 = + spark.createDataFrame(Lists.newArrayList(new SimpleRecord(2, "b")), SimpleRecord.class); - df1.select("id", "data").write() + df1.select("id", "data") + .write() .format("iceberg") .mode("append") .save(loadLocation(tableIdentifier)); @@ -1475,17 +1663,17 @@ public void testFilesTablePartitionId() throws Exception { int spec1 = table.spec().specId(); // add a second file - df2.select("id", "data").write() + df2.select("id", "data") + .write() .format("iceberg") .mode("append") .save(loadLocation(tableIdentifier)); - List actual = spark.read() - .format("iceberg") - .load(loadLocation(tableIdentifier, "files")) - .sort(DataFile.SPEC_ID.name()) - .collectAsList() - .stream().map(r -> (Integer) r.getAs(DataFile.SPEC_ID.name())).collect(Collectors.toList()); + List actual = + spark.read().format("iceberg").load(loadLocation(tableIdentifier, "files")) + .sort(DataFile.SPEC_ID.name()).collectAsList().stream() + .map(r -> (Integer) r.getAs(DataFile.SPEC_ID.name())) + .collect(Collectors.toList()); Assert.assertEquals("Should have two partition specs", ImmutableList.of(spec0, spec1), actual); } @@ -1495,22 +1683,26 @@ public void testAllManifestTableSnapshotFiltering() throws Exception { TableIdentifier tableIdentifier = TableIdentifier.of("db", "all_manifest_snapshot_filtering"); Table table = createTable(tableIdentifier, SCHEMA, SPEC); Table manifestTable = loadTable(tableIdentifier, "all_manifests"); - Dataset df = spark.createDataFrame(Lists.newArrayList(new SimpleRecord(1, "a")), SimpleRecord.class); + Dataset df = + spark.createDataFrame(Lists.newArrayList(new SimpleRecord(1, "a")), SimpleRecord.class); List> snapshotIdToManifests = Lists.newArrayList(); - df.select("id", "data").write() + df.select("id", "data") + .write() .format("iceberg") .mode("append") .save(loadLocation(tableIdentifier)); table.refresh(); Snapshot snapshot1 = table.currentSnapshot(); - snapshotIdToManifests.addAll(snapshot1.allManifests().stream() - .map(manifest -> Pair.of(snapshot1.snapshotId(), manifest)) - .collect(Collectors.toList())); + snapshotIdToManifests.addAll( + snapshot1.allManifests().stream() + .map(manifest -> Pair.of(snapshot1.snapshotId(), manifest)) + 
.collect(Collectors.toList())); - df.select("id", "data").write() + df.select("id", "data") + .write() .format("iceberg") .mode("append") .save(loadLocation(tableIdentifier)); @@ -1518,16 +1710,19 @@ public void testAllManifestTableSnapshotFiltering() throws Exception { table.refresh(); Snapshot snapshot2 = table.currentSnapshot(); Assert.assertEquals("Should have two manifests", 2, snapshot2.allManifests().size()); - snapshotIdToManifests.addAll(snapshot2.allManifests().stream() - .map(manifest -> Pair.of(snapshot2.snapshotId(), manifest)) - .collect(Collectors.toList())); + snapshotIdToManifests.addAll( + snapshot2.allManifests().stream() + .map(manifest -> Pair.of(snapshot2.snapshotId(), manifest)) + .collect(Collectors.toList())); // Add manifests that will not be selected - df.select("id", "data").write() + df.select("id", "data") + .write() .format("iceberg") .mode("append") .save(loadLocation(tableIdentifier)); - df.select("id", "data").write() + df.select("id", "data") + .write() .format("iceberg") .mode("append") .save(loadLocation(tableIdentifier)); @@ -1537,30 +1732,41 @@ public void testAllManifestTableSnapshotFiltering() throws Exception { snapshotIds.add(String.valueOf(snapshot2.snapshotId())); snapshotIds.toString(); - List actual = spark.read() - .format("iceberg") - .load(loadLocation(tableIdentifier, "all_manifests")) - .filter("reference_snapshot_id in " + snapshotIds) - .orderBy("path") - .collectAsList(); + List actual = + spark + .read() + .format("iceberg") + .load(loadLocation(tableIdentifier, "all_manifests")) + .filter("reference_snapshot_id in " + snapshotIds) + .orderBy("path") + .collectAsList(); table.refresh(); - List expected = snapshotIdToManifests.stream() - .map(snapshotManifest -> manifestRecord(manifestTable, snapshotManifest.first(), snapshotManifest.second())) - .collect(Collectors.toList()); + List expected = + snapshotIdToManifests.stream() + .map( + snapshotManifest -> + manifestRecord( + manifestTable, snapshotManifest.first(), snapshotManifest.second())) + .collect(Collectors.toList()); expected.sort(Comparator.comparing(o -> o.get("path").toString())); Assert.assertEquals("Manifests table should have 3 manifest rows", 3, actual.size()); for (int i = 0; i < expected.size(); i += 1) { - TestHelpers.assertEqualsSafe(manifestTable.schema().asStruct(), expected.get(i), actual.get(i)); + TestHelpers.assertEqualsSafe( + manifestTable.schema().asStruct(), expected.get(i), actual.get(i)); } } - private GenericData.Record manifestRecord(Table manifestTable, Long referenceSnapshotId, ManifestFile manifest) { - GenericRecordBuilder builder = new GenericRecordBuilder(AvroSchemaUtil.convert( - manifestTable.schema(), "manifests")); - GenericRecordBuilder summaryBuilder = new GenericRecordBuilder(AvroSchemaUtil.convert( - manifestTable.schema().findType("partition_summaries.element").asStructType(), "partition_summary")); + private GenericData.Record manifestRecord( + Table manifestTable, Long referenceSnapshotId, ManifestFile manifest) { + GenericRecordBuilder builder = + new GenericRecordBuilder(AvroSchemaUtil.convert(manifestTable.schema(), "manifests")); + GenericRecordBuilder summaryBuilder = + new GenericRecordBuilder( + AvroSchemaUtil.convert( + manifestTable.schema().findType("partition_summaries.element").asStructType(), + "partition_summary")); return builder .set("content", manifest.content().id()) .set("path", manifest.path()) @@ -1568,19 +1774,32 @@ private GenericData.Record manifestRecord(Table manifestTable, Long referenceSna 
.set("partition_spec_id", manifest.partitionSpecId()) .set("added_snapshot_id", manifest.snapshotId()) .set("added_data_files_count", manifest.content() == DATA ? manifest.addedFilesCount() : 0) - .set("existing_data_files_count", manifest.content() == DATA ? manifest.existingFilesCount() : 0) - .set("deleted_data_files_count", manifest.content() == DATA ? manifest.deletedFilesCount() : 0) - .set("added_delete_files_count", manifest.content() == DELETES ? manifest.addedFilesCount() : 0) - .set("existing_delete_files_count", manifest.content() == DELETES ? manifest.existingFilesCount() : 0) - .set("deleted_delete_files_count", manifest.content() == DELETES ? manifest.deletedFilesCount() : 0) - .set("partition_summaries", Lists.transform(manifest.partitions(), partition -> - summaryBuilder - .set("contains_null", false) - .set("contains_nan", false) - .set("lower_bound", "1") - .set("upper_bound", "1") - .build() - )) + .set( + "existing_data_files_count", + manifest.content() == DATA ? manifest.existingFilesCount() : 0) + .set( + "deleted_data_files_count", + manifest.content() == DATA ? manifest.deletedFilesCount() : 0) + .set( + "added_delete_files_count", + manifest.content() == DELETES ? manifest.addedFilesCount() : 0) + .set( + "existing_delete_files_count", + manifest.content() == DELETES ? manifest.existingFilesCount() : 0) + .set( + "deleted_delete_files_count", + manifest.content() == DELETES ? manifest.deletedFilesCount() : 0) + .set( + "partition_summaries", + Lists.transform( + manifest.partitions(), + partition -> + summaryBuilder + .set("contains_null", false) + .set("contains_nan", false) + .set("lower_bound", "1") + .set("upper_bound", "1") + .build())) .set("reference_snapshot_id", referenceSnapshotId) .build(); } @@ -1590,8 +1809,8 @@ private void asMetadataRecord(GenericData.Record file) { file.put(3, 0); // specId } - private PositionDeleteWriter newPositionDeleteWriter(Table table, PartitionSpec spec, - StructLike partition) { + private PositionDeleteWriter newPositionDeleteWriter( + Table table, PartitionSpec spec, StructLike partition) { OutputFileFactory fileFactory = OutputFileFactory.builderFor(table, 0, 0).build(); EncryptedOutputFile outputFile = fileFactory.newOutputFile(spec, partition); @@ -1599,9 +1818,13 @@ private PositionDeleteWriter newPositionDeleteWriter(Table table, P return fileWriterFactory.newPositionDeleteWriter(outputFile, spec, partition); } - private DeleteFile writePositionDeletes(Table table, PartitionSpec spec, StructLike partition, - Iterable> deletes) { - PositionDeleteWriter positionDeleteWriter = newPositionDeleteWriter(table, spec, partition); + private DeleteFile writePositionDeletes( + Table table, + PartitionSpec spec, + StructLike partition, + Iterable> deletes) { + PositionDeleteWriter positionDeleteWriter = + newPositionDeleteWriter(table, spec, partition); try (PositionDeleteWriter writer = positionDeleteWriter) { for (PositionDelete delete : deletes) { diff --git a/spark/v3.2/spark/src/test/java/org/apache/iceberg/spark/source/TestIcebergSpark.java b/spark/v3.2/spark/src/test/java/org/apache/iceberg/spark/source/TestIcebergSpark.java index 1a99697a09f9..0be1e0b1bd05 100644 --- a/spark/v3.2/spark/src/test/java/org/apache/iceberg/spark/source/TestIcebergSpark.java +++ b/spark/v3.2/spark/src/test/java/org/apache/iceberg/spark/source/TestIcebergSpark.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.spark.source; import java.math.BigDecimal; @@ -61,8 +60,8 @@ public void testRegisterIntegerBucketUDF() { IcebergSpark.registerBucketUDF(spark, "iceberg_bucket_int_16", DataTypes.IntegerType, 16); List results = spark.sql("SELECT iceberg_bucket_int_16(1)").collectAsList(); Assert.assertEquals(1, results.size()); - Assert.assertEquals((int) Transforms.bucket(Types.IntegerType.get(), 16).apply(1), - results.get(0).getInt(0)); + Assert.assertEquals( + (int) Transforms.bucket(Types.IntegerType.get(), 16).apply(1), results.get(0).getInt(0)); } @Test @@ -70,8 +69,8 @@ public void testRegisterShortBucketUDF() { IcebergSpark.registerBucketUDF(spark, "iceberg_bucket_short_16", DataTypes.ShortType, 16); List results = spark.sql("SELECT iceberg_bucket_short_16(1S)").collectAsList(); Assert.assertEquals(1, results.size()); - Assert.assertEquals((int) Transforms.bucket(Types.IntegerType.get(), 16).apply(1), - results.get(0).getInt(0)); + Assert.assertEquals( + (int) Transforms.bucket(Types.IntegerType.get(), 16).apply(1), results.get(0).getInt(0)); } @Test @@ -79,8 +78,8 @@ public void testRegisterByteBucketUDF() { IcebergSpark.registerBucketUDF(spark, "iceberg_bucket_byte_16", DataTypes.ByteType, 16); List results = spark.sql("SELECT iceberg_bucket_byte_16(1Y)").collectAsList(); Assert.assertEquals(1, results.size()); - Assert.assertEquals((int) Transforms.bucket(Types.IntegerType.get(), 16).apply(1), - results.get(0).getInt(0)); + Assert.assertEquals( + (int) Transforms.bucket(Types.IntegerType.get(), 16).apply(1), results.get(0).getInt(0)); } @Test @@ -88,8 +87,8 @@ public void testRegisterLongBucketUDF() { IcebergSpark.registerBucketUDF(spark, "iceberg_bucket_long_16", DataTypes.LongType, 16); List results = spark.sql("SELECT iceberg_bucket_long_16(1L)").collectAsList(); Assert.assertEquals(1, results.size()); - Assert.assertEquals((int) Transforms.bucket(Types.LongType.get(), 16).apply(1L), - results.get(0).getInt(0)); + Assert.assertEquals( + (int) Transforms.bucket(Types.LongType.get(), 16).apply(1L), results.get(0).getInt(0)); } @Test @@ -97,7 +96,8 @@ public void testRegisterStringBucketUDF() { IcebergSpark.registerBucketUDF(spark, "iceberg_bucket_string_16", DataTypes.StringType, 16); List results = spark.sql("SELECT iceberg_bucket_string_16('hello')").collectAsList(); Assert.assertEquals(1, results.size()); - Assert.assertEquals((int) Transforms.bucket(Types.StringType.get(), 16).apply("hello"), + Assert.assertEquals( + (int) Transforms.bucket(Types.StringType.get(), 16).apply("hello"), results.get(0).getInt(0)); } @@ -106,7 +106,8 @@ public void testRegisterCharBucketUDF() { IcebergSpark.registerBucketUDF(spark, "iceberg_bucket_char_16", new CharType(5), 16); List results = spark.sql("SELECT iceberg_bucket_char_16('hello')").collectAsList(); Assert.assertEquals(1, results.size()); - Assert.assertEquals((int) Transforms.bucket(Types.StringType.get(), 16).apply("hello"), + Assert.assertEquals( + (int) Transforms.bucket(Types.StringType.get(), 16).apply("hello"), results.get(0).getInt(0)); } @@ -115,73 +116,89 @@ public void testRegisterVarCharBucketUDF() { IcebergSpark.registerBucketUDF(spark, "iceberg_bucket_varchar_16", new VarcharType(5), 16); List results = spark.sql("SELECT iceberg_bucket_varchar_16('hello')").collectAsList(); Assert.assertEquals(1, results.size()); - Assert.assertEquals((int) Transforms.bucket(Types.StringType.get(), 16).apply("hello"), + Assert.assertEquals( + (int) Transforms.bucket(Types.StringType.get(), 16).apply("hello"), 
results.get(0).getInt(0)); } @Test public void testRegisterDateBucketUDF() { IcebergSpark.registerBucketUDF(spark, "iceberg_bucket_date_16", DataTypes.DateType, 16); - List results = spark.sql("SELECT iceberg_bucket_date_16(DATE '2021-06-30')").collectAsList(); + List results = + spark.sql("SELECT iceberg_bucket_date_16(DATE '2021-06-30')").collectAsList(); Assert.assertEquals(1, results.size()); - Assert.assertEquals((int) Transforms.bucket(Types.DateType.get(), 16) - .apply(DateTimeUtils.fromJavaDate(Date.valueOf("2021-06-30"))), + Assert.assertEquals( + (int) + Transforms.bucket(Types.DateType.get(), 16) + .apply(DateTimeUtils.fromJavaDate(Date.valueOf("2021-06-30"))), results.get(0).getInt(0)); } @Test public void testRegisterTimestampBucketUDF() { - IcebergSpark.registerBucketUDF(spark, "iceberg_bucket_timestamp_16", DataTypes.TimestampType, 16); + IcebergSpark.registerBucketUDF( + spark, "iceberg_bucket_timestamp_16", DataTypes.TimestampType, 16); List results = - spark.sql("SELECT iceberg_bucket_timestamp_16(TIMESTAMP '2021-06-30 00:00:00.000')").collectAsList(); + spark + .sql("SELECT iceberg_bucket_timestamp_16(TIMESTAMP '2021-06-30 00:00:00.000')") + .collectAsList(); Assert.assertEquals(1, results.size()); - Assert.assertEquals((int) Transforms.bucket(Types.TimestampType.withZone(), 16) - .apply(DateTimeUtils.fromJavaTimestamp(Timestamp.valueOf("2021-06-30 00:00:00.000"))), + Assert.assertEquals( + (int) + Transforms.bucket(Types.TimestampType.withZone(), 16) + .apply( + DateTimeUtils.fromJavaTimestamp(Timestamp.valueOf("2021-06-30 00:00:00.000"))), results.get(0).getInt(0)); } @Test public void testRegisterBinaryBucketUDF() { IcebergSpark.registerBucketUDF(spark, "iceberg_bucket_binary_16", DataTypes.BinaryType, 16); - List results = - spark.sql("SELECT iceberg_bucket_binary_16(X'0020001F')").collectAsList(); + List results = spark.sql("SELECT iceberg_bucket_binary_16(X'0020001F')").collectAsList(); Assert.assertEquals(1, results.size()); - Assert.assertEquals((int) Transforms.bucket(Types.BinaryType.get(), 16) - .apply(ByteBuffer.wrap(new byte[]{0x00, 0x20, 0x00, 0x1F})), + Assert.assertEquals( + (int) + Transforms.bucket(Types.BinaryType.get(), 16) + .apply(ByteBuffer.wrap(new byte[] {0x00, 0x20, 0x00, 0x1F})), results.get(0).getInt(0)); } @Test public void testRegisterDecimalBucketUDF() { IcebergSpark.registerBucketUDF(spark, "iceberg_bucket_decimal_16", new DecimalType(4, 2), 16); - List results = - spark.sql("SELECT iceberg_bucket_decimal_16(11.11)").collectAsList(); + List results = spark.sql("SELECT iceberg_bucket_decimal_16(11.11)").collectAsList(); Assert.assertEquals(1, results.size()); - Assert.assertEquals((int) Transforms.bucket(Types.DecimalType.of(4, 2), 16) - .apply(new BigDecimal("11.11")), + Assert.assertEquals( + (int) Transforms.bucket(Types.DecimalType.of(4, 2), 16).apply(new BigDecimal("11.11")), results.get(0).getInt(0)); } @Test public void testRegisterBooleanBucketUDF() { - Assertions.assertThatThrownBy(() -> - IcebergSpark.registerBucketUDF(spark, "iceberg_bucket_boolean_16", DataTypes.BooleanType, 16)) + Assertions.assertThatThrownBy( + () -> + IcebergSpark.registerBucketUDF( + spark, "iceberg_bucket_boolean_16", DataTypes.BooleanType, 16)) .isInstanceOf(IllegalArgumentException.class) .hasMessage("Cannot bucket by type: boolean"); } @Test public void testRegisterDoubleBucketUDF() { - Assertions.assertThatThrownBy(() -> - IcebergSpark.registerBucketUDF(spark, "iceberg_bucket_double_16", DataTypes.DoubleType, 16)) + Assertions.assertThatThrownBy( + () 
-> + IcebergSpark.registerBucketUDF( + spark, "iceberg_bucket_double_16", DataTypes.DoubleType, 16)) .isInstanceOf(IllegalArgumentException.class) .hasMessage("Cannot bucket by type: double"); } @Test public void testRegisterFloatBucketUDF() { - Assertions.assertThatThrownBy(() -> - IcebergSpark.registerBucketUDF(spark, "iceberg_bucket_float_16", DataTypes.FloatType, 16)) + Assertions.assertThatThrownBy( + () -> + IcebergSpark.registerBucketUDF( + spark, "iceberg_bucket_float_16", DataTypes.FloatType, 16)) .isInstanceOf(IllegalArgumentException.class) .hasMessage("Cannot bucket by type: float"); } @@ -191,8 +208,8 @@ public void testRegisterIntegerTruncateUDF() { IcebergSpark.registerTruncateUDF(spark, "iceberg_truncate_int_4", DataTypes.IntegerType, 4); List results = spark.sql("SELECT iceberg_truncate_int_4(1)").collectAsList(); Assert.assertEquals(1, results.size()); - Assert.assertEquals(Transforms.truncate(Types.IntegerType.get(), 4).apply(1), - results.get(0).getInt(0)); + Assert.assertEquals( + Transforms.truncate(Types.IntegerType.get(), 4).apply(1), results.get(0).getInt(0)); } @Test @@ -200,18 +217,18 @@ public void testRegisterLongTruncateUDF() { IcebergSpark.registerTruncateUDF(spark, "iceberg_truncate_long_4", DataTypes.LongType, 4); List results = spark.sql("SELECT iceberg_truncate_long_4(1L)").collectAsList(); Assert.assertEquals(1, results.size()); - Assert.assertEquals(Transforms.truncate(Types.LongType.get(), 4).apply(1L), - results.get(0).getLong(0)); + Assert.assertEquals( + Transforms.truncate(Types.LongType.get(), 4).apply(1L), results.get(0).getLong(0)); } @Test public void testRegisterDecimalTruncateUDF() { IcebergSpark.registerTruncateUDF(spark, "iceberg_truncate_decimal_4", new DecimalType(4, 2), 4); - List results = - spark.sql("SELECT iceberg_truncate_decimal_4(11.11)").collectAsList(); + List results = spark.sql("SELECT iceberg_truncate_decimal_4(11.11)").collectAsList(); Assert.assertEquals(1, results.size()); - Assert.assertEquals(Transforms.truncate(Types.DecimalType.of(4, 2), 4) - .apply(new BigDecimal("11.11")), results.get(0).getDecimal(0)); + Assert.assertEquals( + Transforms.truncate(Types.DecimalType.of(4, 2), 4).apply(new BigDecimal("11.11")), + results.get(0).getDecimal(0)); } @Test @@ -219,7 +236,7 @@ public void testRegisterStringTruncateUDF() { IcebergSpark.registerTruncateUDF(spark, "iceberg_truncate_string_4", DataTypes.StringType, 4); List results = spark.sql("SELECT iceberg_truncate_string_4('hello')").collectAsList(); Assert.assertEquals(1, results.size()); - Assert.assertEquals(Transforms.truncate(Types.StringType.get(), 4).apply("hello"), - results.get(0).getString(0)); + Assert.assertEquals( + Transforms.truncate(Types.StringType.get(), 4).apply("hello"), results.get(0).getString(0)); } } diff --git a/spark/v3.2/spark/src/test/java/org/apache/iceberg/spark/source/TestIdentityPartitionData.java b/spark/v3.2/spark/src/test/java/org/apache/iceberg/spark/source/TestIdentityPartitionData.java index e07798301db8..7313c18cc09d 100644 --- a/spark/v3.2/spark/src/test/java/org/apache/iceberg/spark/source/TestIdentityPartitionData.java +++ b/spark/v3.2/spark/src/test/java/org/apache/iceberg/spark/source/TestIdentityPartitionData.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.spark.source; import java.io.File; @@ -55,11 +54,11 @@ public class TestIdentityPartitionData extends SparkTestBase { @Parameterized.Parameters(name = "format = {0}, vectorized = {1}") public static Object[][] parameters() { return new Object[][] { - { "parquet", false }, - { "parquet", true }, - { "avro", false }, - { "orc", false }, - { "orc", true }, + {"parquet", false}, + {"parquet", true}, + {"avro", false}, + {"orc", false}, + {"orc", true}, }; } @@ -71,36 +70,37 @@ public TestIdentityPartitionData(String format, boolean vectorized) { this.vectorized = vectorized; } - private static final Schema LOG_SCHEMA = new Schema( - Types.NestedField.optional(1, "id", Types.IntegerType.get()), - Types.NestedField.optional(2, "date", Types.StringType.get()), - Types.NestedField.optional(3, "level", Types.StringType.get()), - Types.NestedField.optional(4, "message", Types.StringType.get()) - ); - - private static final List LOGS = ImmutableList.of( - LogMessage.debug("2020-02-02", "debug event 1"), - LogMessage.info("2020-02-02", "info event 1"), - LogMessage.debug("2020-02-02", "debug event 2"), - LogMessage.info("2020-02-03", "info event 2"), - LogMessage.debug("2020-02-03", "debug event 3"), - LogMessage.info("2020-02-03", "info event 3"), - LogMessage.error("2020-02-03", "error event 1"), - LogMessage.debug("2020-02-04", "debug event 4"), - LogMessage.warn("2020-02-04", "warn event 1"), - LogMessage.debug("2020-02-04", "debug event 5") - ); - - @Rule - public TemporaryFolder temp = new TemporaryFolder(); - - private PartitionSpec spec = PartitionSpec.builderFor(LOG_SCHEMA).identity("date").identity("level").build(); + private static final Schema LOG_SCHEMA = + new Schema( + Types.NestedField.optional(1, "id", Types.IntegerType.get()), + Types.NestedField.optional(2, "date", Types.StringType.get()), + Types.NestedField.optional(3, "level", Types.StringType.get()), + Types.NestedField.optional(4, "message", Types.StringType.get())); + + private static final List LOGS = + ImmutableList.of( + LogMessage.debug("2020-02-02", "debug event 1"), + LogMessage.info("2020-02-02", "info event 1"), + LogMessage.debug("2020-02-02", "debug event 2"), + LogMessage.info("2020-02-03", "info event 2"), + LogMessage.debug("2020-02-03", "debug event 3"), + LogMessage.info("2020-02-03", "info event 3"), + LogMessage.error("2020-02-03", "error event 1"), + LogMessage.debug("2020-02-04", "debug event 4"), + LogMessage.warn("2020-02-04", "warn event 1"), + LogMessage.debug("2020-02-04", "debug event 5")); + + @Rule public TemporaryFolder temp = new TemporaryFolder(); + + private PartitionSpec spec = + PartitionSpec.builderFor(LOG_SCHEMA).identity("date").identity("level").build(); private Table table = null; private Dataset logs = null; /** - * Use the Hive Based table to make Identity Partition Columns with no duplication of the data in the underlying - * parquet files. This makes sure that if the identity mapping fails, the test will also fail. + * Use the Hive Based table to make Identity Partition Columns with no duplication of the data in + * the underlying parquet files. This makes sure that if the identity mapping fails, the test will + * also fail. 
*/ private void setupParquet() throws Exception { File location = temp.newFolder("logs"); @@ -109,15 +109,25 @@ private void setupParquet() throws Exception { Assert.assertTrue("Temp folder should exist", location.exists()); Map properties = ImmutableMap.of(TableProperties.DEFAULT_FILE_FORMAT, format); - this.logs = spark.createDataFrame(LOGS, LogMessage.class).select("id", "date", "level", "message"); + this.logs = + spark.createDataFrame(LOGS, LogMessage.class).select("id", "date", "level", "message"); spark.sql(String.format("DROP TABLE IF EXISTS %s", hiveTable)); - logs.orderBy("date", "level", "id").write().partitionBy("date", "level").format("parquet") - .option("path", hiveLocation.toString()).saveAsTable(hiveTable); - - this.table = TABLES.create(SparkSchemaUtil.schemaForTable(spark, hiveTable), - SparkSchemaUtil.specForTable(spark, hiveTable), properties, location.toString()); - - SparkTableUtil.importSparkTable(spark, new TableIdentifier(hiveTable), table, location.toString()); + logs.orderBy("date", "level", "id") + .write() + .partitionBy("date", "level") + .format("parquet") + .option("path", hiveLocation.toString()) + .saveAsTable(hiveTable); + + this.table = + TABLES.create( + SparkSchemaUtil.schemaForTable(spark, hiveTable), + SparkSchemaUtil.specForTable(spark, hiveTable), + properties, + location.toString()); + + SparkTableUtil.importSparkTable( + spark, new TableIdentifier(hiveTable), table, location.toString()); } @Before @@ -130,56 +140,70 @@ public void setupTable() throws Exception { Map properties = ImmutableMap.of(TableProperties.DEFAULT_FILE_FORMAT, format); this.table = TABLES.create(LOG_SCHEMA, spec, properties, location.toString()); - this.logs = spark.createDataFrame(LOGS, LogMessage.class).select("id", "date", "level", "message"); + this.logs = + spark.createDataFrame(LOGS, LogMessage.class).select("id", "date", "level", "message"); - logs.orderBy("date", "level", "id").write().format("iceberg").mode("append").save(location.toString()); + logs.orderBy("date", "level", "id") + .write() + .format("iceberg") + .mode("append") + .save(location.toString()); } } @Test public void testFullProjection() { List expected = logs.orderBy("id").collectAsList(); - List actual = spark.read().format("iceberg") - .option(SparkReadOptions.VECTORIZATION_ENABLED, String.valueOf(vectorized)) - .load(table.location()).orderBy("id") - .select("id", "date", "level", "message") - .collectAsList(); + List actual = + spark + .read() + .format("iceberg") + .option(SparkReadOptions.VECTORIZATION_ENABLED, String.valueOf(vectorized)) + .load(table.location()) + .orderBy("id") + .select("id", "date", "level", "message") + .collectAsList(); Assert.assertEquals("Rows should match", expected, actual); } @Test public void testProjections() { - String[][] cases = new String[][] { - // individual fields - new String[] { "date" }, - new String[] { "level" }, - new String[] { "message" }, - // field pairs - new String[] { "date", "message" }, - new String[] { "level", "message" }, - new String[] { "date", "level" }, - // out-of-order pairs - new String[] { "message", "date" }, - new String[] { "message", "level" }, - new String[] { "level", "date" }, - // full projection, different orderings - new String[] { "date", "level", "message" }, - new String[] { "level", "date", "message" }, - new String[] { "date", "message", "level" }, - new String[] { "level", "message", "date" }, - new String[] { "message", "date", "level" }, - new String[] { "message", "level", "date" } - }; + String[][] cases = + new 
String[][] { + // individual fields + new String[] {"date"}, + new String[] {"level"}, + new String[] {"message"}, + // field pairs + new String[] {"date", "message"}, + new String[] {"level", "message"}, + new String[] {"date", "level"}, + // out-of-order pairs + new String[] {"message", "date"}, + new String[] {"message", "level"}, + new String[] {"level", "date"}, + // full projection, different orderings + new String[] {"date", "level", "message"}, + new String[] {"level", "date", "message"}, + new String[] {"date", "message", "level"}, + new String[] {"level", "message", "date"}, + new String[] {"message", "date", "level"}, + new String[] {"message", "level", "date"} + }; for (String[] ordering : cases) { List expected = logs.select("id", ordering).orderBy("id").collectAsList(); - List actual = spark.read() - .format("iceberg") - .option(SparkReadOptions.VECTORIZATION_ENABLED, String.valueOf(vectorized)) - .load(table.location()) - .select("id", ordering).orderBy("id") - .collectAsList(); - Assert.assertEquals("Rows should match for ordering: " + Arrays.toString(ordering), expected, actual); + List actual = + spark + .read() + .format("iceberg") + .option(SparkReadOptions.VECTORIZATION_ENABLED, String.valueOf(vectorized)) + .load(table.location()) + .select("id", ordering) + .orderBy("id") + .collectAsList(); + Assert.assertEquals( + "Rows should match for ordering: " + Arrays.toString(ordering), expected, actual); } } } diff --git a/spark/v3.2/spark/src/test/java/org/apache/iceberg/spark/source/TestInternalRowWrapper.java b/spark/v3.2/spark/src/test/java/org/apache/iceberg/spark/source/TestInternalRowWrapper.java index 4ab01044046f..9e75145faff9 100644 --- a/spark/v3.2/spark/src/test/java/org/apache/iceberg/spark/source/TestInternalRowWrapper.java +++ b/spark/v3.2/spark/src/test/java/org/apache/iceberg/spark/source/TestInternalRowWrapper.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.source; import java.util.Iterator; @@ -68,8 +67,10 @@ protected void generateAndValidate(Schema schema, AssertMethod assertMethod) { StructLike recordStructLike = recordWrapper.wrap(actual.next()); StructLike rowStructLike = rowWrapper.wrap(expected.next()); - assertMethod.assertEquals("Should have expected StructLike values", - actualWrapper.set(recordStructLike), expectedWrapper.set(rowStructLike)); + assertMethod.assertEquals( + "Should have expected StructLike values", + actualWrapper.set(recordStructLike), + expectedWrapper.set(rowStructLike)); } Assert.assertFalse("Shouldn't have more record", actual.hasNext()); diff --git a/spark/v3.2/spark/src/test/java/org/apache/iceberg/spark/source/TestMetadataTablesWithPartitionEvolution.java b/spark/v3.2/spark/src/test/java/org/apache/iceberg/spark/source/TestMetadataTablesWithPartitionEvolution.java index 691e9f6f5481..82c9a58e33ea 100644 --- a/spark/v3.2/spark/src/test/java/org/apache/iceberg/spark/source/TestMetadataTablesWithPartitionEvolution.java +++ b/spark/v3.2/spark/src/test/java/org/apache/iceberg/spark/source/TestMetadataTablesWithPartitionEvolution.java @@ -16,9 +16,19 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.spark.source; +import static org.apache.iceberg.FileFormat.AVRO; +import static org.apache.iceberg.FileFormat.ORC; +import static org.apache.iceberg.FileFormat.PARQUET; +import static org.apache.iceberg.MetadataTableType.ALL_DATA_FILES; +import static org.apache.iceberg.MetadataTableType.ALL_ENTRIES; +import static org.apache.iceberg.MetadataTableType.ENTRIES; +import static org.apache.iceberg.MetadataTableType.FILES; +import static org.apache.iceberg.MetadataTableType.PARTITIONS; +import static org.apache.iceberg.TableProperties.DEFAULT_FILE_FORMAT; +import static org.apache.iceberg.TableProperties.FORMAT_VERSION; + import java.util.Arrays; import java.util.List; import java.util.Map; @@ -51,83 +61,72 @@ import org.junit.runners.Parameterized; import org.junit.runners.Parameterized.Parameters; -import static org.apache.iceberg.FileFormat.AVRO; -import static org.apache.iceberg.FileFormat.ORC; -import static org.apache.iceberg.FileFormat.PARQUET; -import static org.apache.iceberg.MetadataTableType.ALL_DATA_FILES; -import static org.apache.iceberg.MetadataTableType.ALL_ENTRIES; -import static org.apache.iceberg.MetadataTableType.ENTRIES; -import static org.apache.iceberg.MetadataTableType.FILES; -import static org.apache.iceberg.MetadataTableType.PARTITIONS; -import static org.apache.iceberg.TableProperties.DEFAULT_FILE_FORMAT; -import static org.apache.iceberg.TableProperties.FORMAT_VERSION; - @RunWith(Parameterized.class) public class TestMetadataTablesWithPartitionEvolution extends SparkCatalogTestBase { @Parameters(name = "catalog = {0}, impl = {1}, conf = {2}, fileFormat = {3}, formatVersion = {4}") public static Object[][] parameters() { return new Object[][] { - { "testhive", SparkCatalog.class.getName(), - ImmutableMap.of( - "type", "hive", - "default-namespace", "default" - ), - ORC, - 1 - }, - { "testhive", SparkCatalog.class.getName(), - ImmutableMap.of( - "type", "hive", - "default-namespace", "default" + { + "testhive", + SparkCatalog.class.getName(), + ImmutableMap.of( + "type", "hive", + "default-namespace", "default"), + ORC, + 1 + }, + { + "testhive", + SparkCatalog.class.getName(), + ImmutableMap.of( + "type", "hive", + "default-namespace", "default"), + ORC, + 2 + }, + {"testhadoop", SparkCatalog.class.getName(), ImmutableMap.of("type", "hadoop"), PARQUET, 1}, + {"testhadoop", SparkCatalog.class.getName(), ImmutableMap.of("type", "hadoop"), PARQUET, 2}, + { + "spark_catalog", + SparkSessionCatalog.class.getName(), + ImmutableMap.of( + "type", "hive", + "default-namespace", "default", + "clients", "1", + "parquet-enabled", "false", + "cache-enabled", + "false" // Spark will delete tables using v1, leaving the cache out of sync ), - ORC, - 2 - }, - { "testhadoop", SparkCatalog.class.getName(), - ImmutableMap.of( - "type", "hadoop" + AVRO, + 1 + }, + { + "spark_catalog", + SparkSessionCatalog.class.getName(), + ImmutableMap.of( + "type", "hive", + "default-namespace", "default", + "clients", "1", + "parquet-enabled", "false", + "cache-enabled", + "false" // Spark will delete tables using v1, leaving the cache out of sync ), - PARQUET, - 1 - }, - { "testhadoop", SparkCatalog.class.getName(), - ImmutableMap.of( - "type", "hadoop" - ), - PARQUET, - 2 - }, - { "spark_catalog", SparkSessionCatalog.class.getName(), - ImmutableMap.of( - "type", "hive", - "default-namespace", "default", - "clients", "1", - "parquet-enabled", "false", - "cache-enabled", "false" // Spark will delete tables using v1, leaving the cache out of sync - ), - AVRO, - 1 - }, - 
{ "spark_catalog", SparkSessionCatalog.class.getName(), - ImmutableMap.of( - "type", "hive", - "default-namespace", "default", - "clients", "1", - "parquet-enabled", "false", - "cache-enabled", "false" // Spark will delete tables using v1, leaving the cache out of sync - ), - AVRO, - 2 - } + AVRO, + 2 + } }; } private final FileFormat fileFormat; private final int formatVersion; - public TestMetadataTablesWithPartitionEvolution(String catalogName, String implementation, Map config, - FileFormat fileFormat, int formatVersion) { + public TestMetadataTablesWithPartitionEvolution( + String catalogName, + String implementation, + Map config, + FileFormat fileFormat, + int formatVersion) { super(catalogName, implementation, config); this.fileFormat = fileFormat; this.formatVersion = formatVersion; @@ -140,7 +139,9 @@ public void removeTable() { @Test public void testFilesMetadataTable() throws ParseException { - sql("CREATE TABLE %s (id bigint NOT NULL, category string, data string) USING iceberg", tableName); + sql( + "CREATE TABLE %s (id bigint NOT NULL, category string, data string) USING iceberg", + tableName); initTable(); sql("INSERT INTO TABLE %s VALUES (1, 'a1', 'b1')", tableName); @@ -148,28 +149,23 @@ public void testFilesMetadataTable() throws ParseException { // verify the metadata tables while the current spec is still unpartitioned for (MetadataTableType tableType : Arrays.asList(FILES, ALL_DATA_FILES)) { Dataset df = loadMetadataTable(tableType); - Assert.assertTrue("Partition must be skipped", df.schema().getFieldIndex("partition").isEmpty()); + Assert.assertTrue( + "Partition must be skipped", df.schema().getFieldIndex("partition").isEmpty()); } Table table = validationCatalog.loadTable(tableIdent); - table.updateSpec() - .addField("data") - .commit(); + table.updateSpec().addField("data").commit(); sql("REFRESH TABLE %s", tableName); sql("INSERT INTO TABLE %s VALUES (1, 'a1', 'b1')", tableName); // verify the metadata tables after adding the first partition column for (MetadataTableType tableType : Arrays.asList(FILES, ALL_DATA_FILES)) { assertPartitions( - ImmutableList.of(row(new Object[]{null}), row("b1")), - "STRUCT", - tableType); + ImmutableList.of(row(new Object[] {null}), row("b1")), "STRUCT", tableType); } - table.updateSpec() - .addField(Expressions.bucket("category", 8)) - .commit(); + table.updateSpec().addField(Expressions.bucket("category", 8)).commit(); sql("REFRESH TABLE %s", tableName); sql("INSERT INTO TABLE %s VALUES (1, 'a1', 'b1')", tableName); @@ -181,9 +177,7 @@ public void testFilesMetadataTable() throws ParseException { tableType); } - table.updateSpec() - .removeField("data") - .commit(); + table.updateSpec().removeField("data").commit(); sql("REFRESH TABLE %s", tableName); sql("INSERT INTO TABLE %s VALUES (1, 'a1', 'b1')", tableName); @@ -195,9 +189,7 @@ public void testFilesMetadataTable() throws ParseException { tableType); } - table.updateSpec() - .renameField("category_bucket_8", "category_bucket_8_another_name") - .commit(); + table.updateSpec().renameField("category_bucket_8", "category_bucket_8_another_name").commit(); sql("REFRESH TABLE %s", tableName); // verify the metadata tables after renaming the second partition column @@ -211,8 +203,10 @@ public void testFilesMetadataTable() throws ParseException { @Test public void testFilesMetadataTableFilter() throws ParseException { - sql("CREATE TABLE %s (id bigint NOT NULL, category string, data string) USING iceberg " + - "TBLPROPERTIES ('commit.manifest-merge.enabled' 'false')", tableName); + 
sql( + "CREATE TABLE %s (id bigint NOT NULL, category string, data string) USING iceberg " + + "TBLPROPERTIES ('commit.manifest-merge.enabled' 'false')", + tableName); initTable(); sql("INSERT INTO TABLE %s VALUES (1, 'c1', 'd1')", tableName); @@ -221,14 +215,13 @@ public void testFilesMetadataTableFilter() throws ParseException { // verify the metadata tables while the current spec is still unpartitioned for (MetadataTableType tableType : Arrays.asList(FILES, ALL_DATA_FILES)) { Dataset df = loadMetadataTable(tableType); - Assert.assertTrue("Partition must be skipped", df.schema().getFieldIndex("partition").isEmpty()); + Assert.assertTrue( + "Partition must be skipped", df.schema().getFieldIndex("partition").isEmpty()); } Table table = validationCatalog.loadTable(tableIdent); - table.updateSpec() - .addField("data") - .commit(); + table.updateSpec().addField("data").commit(); sql("REFRESH TABLE %s", tableName); sql("INSERT INTO TABLE %s VALUES (1, 'c1', 'd1')", tableName); sql("INSERT INTO TABLE %s VALUES (2, 'c2', 'd2')", tableName); @@ -236,22 +229,18 @@ public void testFilesMetadataTableFilter() throws ParseException { // verify the metadata tables after adding the first partition column for (MetadataTableType tableType : Arrays.asList(FILES, ALL_DATA_FILES)) { assertPartitions( - ImmutableList.of(row("d2")), - "STRUCT", - tableType, - "partition.data = 'd2'"); + ImmutableList.of(row("d2")), "STRUCT", tableType, "partition.data = 'd2'"); } - table.updateSpec() - .addField("category") - .commit(); + table.updateSpec().addField("category").commit(); sql("REFRESH TABLE %s", tableName); sql("INSERT INTO TABLE %s VALUES (1, 'c1', 'd1')", tableName); sql("INSERT INTO TABLE %s VALUES (2, 'c2', 'd2')", tableName); // verify the metadata tables after adding the second partition column for (MetadataTableType tableType : Arrays.asList(FILES, ALL_DATA_FILES)) { - assertPartitions(ImmutableList.of(row("d2", null), row("d2", "c2")), + assertPartitions( + ImmutableList.of(row("d2", null), row("d2", "c2")), "STRUCT", tableType, "partition.data = 'd2'"); @@ -264,9 +253,7 @@ public void testFilesMetadataTableFilter() throws ParseException { "partition.category = 'c2'"); } - table.updateSpec() - .removeField("data") - .commit(); + table.updateSpec().removeField("data").commit(); sql("REFRESH TABLE %s", tableName); // Verify new partitions do not show up for removed 'partition.data=d2' query @@ -276,7 +263,8 @@ public void testFilesMetadataTableFilter() throws ParseException { // Verify new partitions do show up for 'partition.category=c2' query sql("INSERT INTO TABLE %s VALUES (5, 'c2', 'd5')", tableName); - // no new partition should show up for 'data' partition query as partition field has been removed + // no new partition should show up for 'data' partition query as partition field has been + // removed for (MetadataTableType tableType : Arrays.asList(FILES, ALL_DATA_FILES)) { assertPartitions( ImmutableList.of(row("d2", null), row("d2", "c2")), @@ -293,9 +281,7 @@ public void testFilesMetadataTableFilter() throws ParseException { "partition.category = 'c2'"); } - table.updateSpec() - .renameField("category", "category_another_name") - .commit(); + table.updateSpec().renameField("category", "category_another_name").commit(); sql("REFRESH TABLE %s", tableName); // Verify new partitions do show up for 'category=c2' query @@ -311,7 +297,9 @@ public void testFilesMetadataTableFilter() throws ParseException { @Test public void testEntriesMetadataTable() throws ParseException { - sql("CREATE TABLE %s (id 
bigint NOT NULL, category string, data string) USING iceberg", tableName); + sql( + "CREATE TABLE %s (id bigint NOT NULL, category string, data string) USING iceberg", + tableName); initTable(); sql("INSERT INTO TABLE %s VALUES (1, 'a1', 'b1')", tableName); @@ -325,23 +313,17 @@ public void testEntriesMetadataTable() throws ParseException { Table table = validationCatalog.loadTable(tableIdent); - table.updateSpec() - .addField("data") - .commit(); + table.updateSpec().addField("data").commit(); sql("REFRESH TABLE %s", tableName); sql("INSERT INTO TABLE %s VALUES (1, 'a1', 'b1')", tableName); // verify the metadata tables after adding the first partition column for (MetadataTableType tableType : Arrays.asList(ENTRIES, ALL_ENTRIES)) { assertPartitions( - ImmutableList.of(row(new Object[]{null}), row("b1")), - "STRUCT", - tableType); + ImmutableList.of(row(new Object[] {null}), row("b1")), "STRUCT", tableType); } - table.updateSpec() - .addField(Expressions.bucket("category", 8)) - .commit(); + table.updateSpec().addField(Expressions.bucket("category", 8)).commit(); sql("REFRESH TABLE %s", tableName); sql("INSERT INTO TABLE %s VALUES (1, 'a1', 'b1')", tableName); @@ -353,9 +335,7 @@ public void testEntriesMetadataTable() throws ParseException { tableType); } - table.updateSpec() - .removeField("data") - .commit(); + table.updateSpec().removeField("data").commit(); sql("REFRESH TABLE %s", tableName); sql("INSERT INTO TABLE %s VALUES (1, 'a1', 'b1')", tableName); @@ -367,9 +347,7 @@ public void testEntriesMetadataTable() throws ParseException { tableType); } - table.updateSpec() - .renameField("category_bucket_8", "category_bucket_8_another_name") - .commit(); + table.updateSpec().renameField("category_bucket_8", "category_bucket_8_another_name").commit(); sql("REFRESH TABLE %s", tableName); // verify the metadata tables after renaming the second partition column @@ -380,53 +358,48 @@ public void testEntriesMetadataTable() throws ParseException { tableType); } } + @Test public void testPartitionsTableAddRemoveFields() throws ParseException { - sql("CREATE TABLE %s (id bigint NOT NULL, category string, data string) USING iceberg ", tableName); + sql( + "CREATE TABLE %s (id bigint NOT NULL, category string, data string) USING iceberg ", + tableName); initTable(); sql("INSERT INTO TABLE %s VALUES (1, 'c1', 'd1')", tableName); sql("INSERT INTO TABLE %s VALUES (2, 'c2', 'd2')", tableName); // verify the metadata tables while the current spec is still unpartitioned Dataset df = loadMetadataTable(PARTITIONS); - Assert.assertTrue("Partition must be skipped", df.schema().getFieldIndex("partition").isEmpty()); + Assert.assertTrue( + "Partition must be skipped", df.schema().getFieldIndex("partition").isEmpty()); Table table = validationCatalog.loadTable(tableIdent); - table.updateSpec() - .addField("data") - .commit(); + table.updateSpec().addField("data").commit(); sql("REFRESH TABLE %s", tableName); sql("INSERT INTO TABLE %s VALUES (1, 'c1', 'd1')", tableName); sql("INSERT INTO TABLE %s VALUES (2, 'c2', 'd2')", tableName); // verify the metadata tables after adding the first partition column assertPartitions( - ImmutableList.of(row(new Object[]{null}), row("d1"), row("d2")), + ImmutableList.of(row(new Object[] {null}), row("d1"), row("d2")), "STRUCT", PARTITIONS); - table.updateSpec() - .addField("category") - .commit(); + table.updateSpec().addField("category").commit(); sql("REFRESH TABLE %s", tableName); sql("INSERT INTO TABLE %s VALUES (1, 'c1', 'd1')", tableName); sql("INSERT INTO TABLE %s VALUES 
(2, 'c2', 'd2')", tableName); // verify the metadata tables after adding the second partition column - assertPartitions(ImmutableList.of( - row(null, null), - row("d1", null), - row("d1", "c1"), - row("d2", null), - row("d2", "c2")), + assertPartitions( + ImmutableList.of( + row(null, null), row("d1", null), row("d1", "c1"), row("d2", null), row("d2", "c2")), "STRUCT", PARTITIONS); // verify the metadata tables after removing the first partition column - table.updateSpec() - .removeField("data") - .commit(); + table.updateSpec().removeField("data").commit(); sql("REFRESH TABLE %s", tableName); sql("INSERT INTO TABLE %s VALUES (1, 'c1', 'd1')", tableName); sql("INSERT INTO TABLE %s VALUES (2, 'c2', 'd2')", tableName); @@ -446,82 +419,66 @@ public void testPartitionsTableAddRemoveFields() throws ParseException { @Test public void testPartitionsTableRenameFields() throws ParseException { - sql("CREATE TABLE %s (id bigint NOT NULL, category string, data string) USING iceberg", tableName); + sql( + "CREATE TABLE %s (id bigint NOT NULL, category string, data string) USING iceberg", + tableName); initTable(); Table table = validationCatalog.loadTable(tableIdent); - table.updateSpec() - .addField("data") - .addField("category") - .commit(); + table.updateSpec().addField("data").addField("category").commit(); sql("REFRESH TABLE %s", tableName); sql("INSERT INTO TABLE %s VALUES (1, 'c1', 'd1')", tableName); sql("INSERT INTO TABLE %s VALUES (2, 'c2', 'd2')", tableName); - assertPartitions(ImmutableList.of( - row("d1", "c1"), - row("d2", "c2")), + assertPartitions( + ImmutableList.of(row("d1", "c1"), row("d2", "c2")), "STRUCT", PARTITIONS); - table.updateSpec() - .renameField("category", "category_another_name") - .commit(); + table.updateSpec().renameField("category", "category_another_name").commit(); sql("REFRESH TABLE %s", tableName); sql("INSERT INTO TABLE %s VALUES (1, 'c1', 'd1')", tableName); sql("INSERT INTO TABLE %s VALUES (2, 'c2', 'd2')", tableName); assertPartitions( - ImmutableList.of( - row("d1", "c1"), - row("d2", "c2")), + ImmutableList.of(row("d1", "c1"), row("d2", "c2")), "STRUCT", PARTITIONS); } @Test public void testPartitionsTableSwitchFields() throws Exception { - sql("CREATE TABLE %s (id bigint NOT NULL, category string, data string) USING iceberg", tableName); + sql( + "CREATE TABLE %s (id bigint NOT NULL, category string, data string) USING iceberg", + tableName); initTable(); Table table = validationCatalog.loadTable(tableIdent); // verify the metadata tables after re-adding the first dropped column in the second location - table.updateSpec() - .addField("data") - .addField("category") - .commit(); + table.updateSpec().addField("data").addField("category").commit(); sql("REFRESH TABLE %s", tableName); sql("INSERT INTO TABLE %s VALUES (1, 'c1', 'd1')", tableName); sql("INSERT INTO TABLE %s VALUES (2, 'c2', 'd2')", tableName); - assertPartitions(ImmutableList.of( - row("d1", "c1"), - row("d2", "c2")), + assertPartitions( + ImmutableList.of(row("d1", "c1"), row("d2", "c2")), "STRUCT", PARTITIONS); - table.updateSpec() - .removeField("data") - .commit(); + table.updateSpec().removeField("data").commit(); sql("REFRESH TABLE %s", tableName); sql("INSERT INTO TABLE %s VALUES (1, 'c1', 'd1')", tableName); sql("INSERT INTO TABLE %s VALUES (2, 'c2', 'd2')", tableName); assertPartitions( - ImmutableList.of( - row(null, "c1"), - row(null, "c2"), - row("d1", "c1"), - row("d2", "c2")), + ImmutableList.of(row(null, "c1"), row(null, "c2"), row("d1", "c1"), row("d2", "c2")), "STRUCT", 
PARTITIONS); - table.updateSpec() - .addField("data") - .commit(); + table.updateSpec().addField("data").commit(); sql("REFRESH TABLE %s", tableName); sql("INSERT INTO TABLE %s VALUES (1, 'c1', 'd1')", tableName); @@ -541,16 +498,14 @@ public void testPartitionsTableSwitchFields() throws Exception { "STRUCT", PARTITIONS); } else { - // In V2 re-adding a former partition field that was part of an older spec will not change its name or its - // field ID either, thus values will be collapsed into a single common column (as opposed to V1 where any new + // In V2 re-adding a former partition field that was part of an older spec will not change its + // name or its + // field ID either, thus values will be collapsed into a single common column (as opposed to + // V1 where any new // partition field addition will result in a new column in this metadata table) assertPartitions( ImmutableList.of( - row(null, "c1"), - row(null, "c2"), - row("d1", "c1"), - row("d2", "c2"), - row("d3", "c3")), + row(null, "c1"), row(null, "c2"), row("d1", "c1"), row("d2", "c2"), row("d3", "c3")), "STRUCT", PARTITIONS); } @@ -559,7 +514,9 @@ public void testPartitionsTableSwitchFields() throws Exception { @Test public void testPartitionTableFilterAddRemoveFields() throws ParseException { // Create un-partitioned table - sql("CREATE TABLE %s (id bigint NOT NULL, category string, data string) USING iceberg", tableName); + sql( + "CREATE TABLE %s (id bigint NOT NULL, category string, data string) USING iceberg", + tableName); initTable(); sql("INSERT INTO TABLE %s VALUES (1, 'c1', 'd1')", tableName); @@ -567,28 +524,22 @@ public void testPartitionTableFilterAddRemoveFields() throws ParseException { // Partition Table with one partition column Table table = validationCatalog.loadTable(tableIdent); - table.updateSpec() - .addField("data") - .commit(); + table.updateSpec().addField("data").commit(); sql("REFRESH TABLE %s", tableName); sql("INSERT INTO TABLE %s VALUES (1, 'c1', 'd1')", tableName); sql("INSERT INTO TABLE %s VALUES (2, 'c2', 'd2')", tableName); assertPartitions( - ImmutableList.of(row("d2")), - "STRUCT", - PARTITIONS, - "partition.data = 'd2'"); + ImmutableList.of(row("d2")), "STRUCT", PARTITIONS, "partition.data = 'd2'"); // Partition Table with two partition column - table.updateSpec() - .addField("category") - .commit(); + table.updateSpec().addField("category").commit(); sql("REFRESH TABLE %s", tableName); sql("INSERT INTO TABLE %s VALUES (1, 'c1', 'd1')", tableName); sql("INSERT INTO TABLE %s VALUES (2, 'c2', 'd2')", tableName); - assertPartitions(ImmutableList.of(row("d2", null), row("d2", "c2")), + assertPartitions( + ImmutableList.of(row("d2", null), row("d2", "c2")), "STRUCT", PARTITIONS, "partition.data = 'd2'"); @@ -599,9 +550,7 @@ public void testPartitionTableFilterAddRemoveFields() throws ParseException { "partition.category = 'c2'"); // Partition Table with first partition column removed - table.updateSpec() - .removeField("data") - .commit(); + table.updateSpec().removeField("data").commit(); sql("REFRESH TABLE %s", tableName); sql("INSERT INTO TABLE %s VALUES (3, 'c3', 'd2')", tableName); @@ -621,49 +570,42 @@ public void testPartitionTableFilterAddRemoveFields() throws ParseException { @Test public void testPartitionTableFilterSwitchFields() throws Exception { - // Re-added partition fields currently not re-associated: https://github.com/apache/iceberg/issues/4292 + // Re-added partition fields currently not re-associated: + // https://github.com/apache/iceberg/issues/4292 // In V1, dropped 
partition fields show separately when field is re-added // In V2, re-added field currently conflicts with its deleted form Assume.assumeTrue(formatVersion == 1); - sql("CREATE TABLE %s (id bigint NOT NULL, category string, data string) USING iceberg", tableName); + sql( + "CREATE TABLE %s (id bigint NOT NULL, category string, data string) USING iceberg", + tableName); initTable(); sql("INSERT INTO TABLE %s VALUES (1, 'c1', 'd1')", tableName); sql("INSERT INTO TABLE %s VALUES (2, 'c2', 'd2')", tableName); Table table = validationCatalog.loadTable(tableIdent); // Two partition columns - table.updateSpec() - .addField("data") - .addField("category") - .commit(); + table.updateSpec().addField("data").addField("category").commit(); sql("REFRESH TABLE %s", tableName); sql("INSERT INTO TABLE %s VALUES (1, 'c1', 'd1')", tableName); sql("INSERT INTO TABLE %s VALUES (2, 'c2', 'd2')", tableName); // Drop first partition column - table.updateSpec() - .removeField("data") - .commit(); + table.updateSpec().removeField("data").commit(); sql("REFRESH TABLE %s", tableName); sql("INSERT INTO TABLE %s VALUES (1, 'c1', 'd1')", tableName); sql("INSERT INTO TABLE %s VALUES (2, 'c2', 'd2')", tableName); // Re-add first partition column at the end - table.updateSpec() - .addField("data") - .commit(); + table.updateSpec().addField("data").commit(); sql("REFRESH TABLE %s", tableName); sql("INSERT INTO TABLE %s VALUES (1, 'c1', 'd1')", tableName); sql("INSERT INTO TABLE %s VALUES (2, 'c2', 'd2')", tableName); assertPartitions( - ImmutableList.of( - row(null, "c2", null), - row(null, "c2", "d2"), - row("d2", "c2", null)), + ImmutableList.of(row(null, "c2", null), row(null, "c2", "d2"), row("d2", "c2", null)), "STRUCT", PARTITIONS, "partition.category = 'c2'"); @@ -677,22 +619,19 @@ public void testPartitionTableFilterSwitchFields() throws Exception { @Test public void testPartitionsTableFilterRenameFields() throws ParseException { - sql("CREATE TABLE %s (id bigint NOT NULL, category string, data string) USING iceberg", tableName); + sql( + "CREATE TABLE %s (id bigint NOT NULL, category string, data string) USING iceberg", + tableName); initTable(); Table table = validationCatalog.loadTable(tableIdent); - table.updateSpec() - .addField("data") - .addField("category") - .commit(); + table.updateSpec().addField("data").addField("category").commit(); sql("REFRESH TABLE %s", tableName); sql("INSERT INTO TABLE %s VALUES (1, 'c1', 'd1')", tableName); sql("INSERT INTO TABLE %s VALUES (2, 'c2', 'd2')", tableName); - table.updateSpec() - .renameField("category", "category_another_name") - .commit(); + table.updateSpec().renameField("category", "category_another_name").commit(); sql("REFRESH TABLE %s", tableName); sql("INSERT INTO TABLE %s VALUES (1, 'c1', 'd1')", tableName); sql("INSERT INTO TABLE %s VALUES (2, 'c2', 'd2')", tableName); @@ -706,15 +645,19 @@ public void testPartitionsTableFilterRenameFields() throws ParseException { @Test public void testMetadataTablesWithUnknownTransforms() { - sql("CREATE TABLE %s (id bigint NOT NULL, category string, data string) USING iceberg", tableName); + sql( + "CREATE TABLE %s (id bigint NOT NULL, category string, data string) USING iceberg", + tableName); initTable(); sql("INSERT INTO TABLE %s VALUES (1, 'a1', 'b1')", tableName); Table table = validationCatalog.loadTable(tableIdent); - PartitionSpec unknownSpec = PartitionSpecParser.fromJson(table.schema(), - "{ \"spec-id\": 1, \"fields\": [ { \"name\": \"id_zero\", \"transform\": \"zero\", \"source-id\": 1 } ] }"); + PartitionSpec 
unknownSpec = + PartitionSpecParser.fromJson( + table.schema(), + "{ \"spec-id\": 1, \"fields\": [ { \"name\": \"id_zero\", \"transform\": \"zero\", \"source-id\": 1 } ] }"); // replace the table spec to include an unknown transform TableOperations ops = ((HasTableOperations) table).operations(); @@ -724,29 +667,37 @@ public void testMetadataTablesWithUnknownTransforms() { sql("REFRESH TABLE %s", tableName); for (MetadataTableType tableType : Arrays.asList(FILES, ALL_DATA_FILES, ENTRIES, ALL_ENTRIES)) { - AssertHelpers.assertThrows("Should complain about the partition type", - ValidationException.class, "Cannot build table partition type, unknown transforms", + AssertHelpers.assertThrows( + "Should complain about the partition type", + ValidationException.class, + "Cannot build table partition type, unknown transforms", () -> loadMetadataTable(tableType)); } } @Test public void testPartitionColumnNamedPartition() { - sql("CREATE TABLE %s (id int, partition int) USING iceberg PARTITIONED BY (partition)", tableName); + sql( + "CREATE TABLE %s (id int, partition int) USING iceberg PARTITIONED BY (partition)", + tableName); sql("INSERT INTO %s VALUES (1, 1), (2, 1), (3, 2), (2, 2)", tableName); - List expected = ImmutableList.of( - row(1, 1), row(2, 1), row(3, 2), row(2, 2)); + List expected = ImmutableList.of(row(1, 1), row(2, 1), row(3, 2), row(2, 2)); assertEquals("Should return all expected rows", expected, sql("SELECT * FROM %s", tableName)); Assert.assertEquals(2, sql("SELECT * FROM %s.files", tableName).size()); } - private void assertPartitions(List expectedPartitions, String expectedTypeAsString, - MetadataTableType tableType) throws ParseException { + private void assertPartitions( + List expectedPartitions, String expectedTypeAsString, MetadataTableType tableType) + throws ParseException { assertPartitions(expectedPartitions, expectedTypeAsString, tableType, null); } - private void assertPartitions(List expectedPartitions, String expectedTypeAsString, - MetadataTableType tableType, String filter) throws ParseException { + private void assertPartitions( + List expectedPartitions, + String expectedTypeAsString, + MetadataTableType tableType, + String filter) + throws ParseException { Dataset df = loadMetadataTable(tableType); if (filter != null) { df = df.filter(filter); @@ -776,18 +727,18 @@ private void assertPartitions(List expectedPartitions, String expected case PARTITIONS: case FILES: case ALL_DATA_FILES: - List actualFilesPartitions = df.orderBy("partition") - .select("partition.*") - .collectAsList(); - assertEquals("Partitions must match", expectedPartitions, rowsToJava(actualFilesPartitions)); + List actualFilesPartitions = + df.orderBy("partition").select("partition.*").collectAsList(); + assertEquals( + "Partitions must match", expectedPartitions, rowsToJava(actualFilesPartitions)); break; case ENTRIES: case ALL_ENTRIES: - List actualEntriesPartitions = df.orderBy("data_file.partition") - .select("data_file.partition.*") - .collectAsList(); - assertEquals("Partitions must match", expectedPartitions, rowsToJava(actualEntriesPartitions)); + List actualEntriesPartitions = + df.orderBy("data_file.partition").select("data_file.partition.*").collectAsList(); + assertEquals( + "Partitions must match", expectedPartitions, rowsToJava(actualEntriesPartitions)); break; default: @@ -800,7 +751,9 @@ private Dataset loadMetadataTable(MetadataTableType tableType) { } private void initTable() { - sql("ALTER TABLE %s SET TBLPROPERTIES('%s' '%s')", tableName, DEFAULT_FILE_FORMAT, 
fileFormat.name()); + sql( + "ALTER TABLE %s SET TBLPROPERTIES('%s' '%s')", + tableName, DEFAULT_FILE_FORMAT, fileFormat.name()); sql("ALTER TABLE %s SET TBLPROPERTIES('%s' '%d')", tableName, FORMAT_VERSION, formatVersion); } } diff --git a/spark/v3.2/spark/src/test/java/org/apache/iceberg/spark/source/TestParquetScan.java b/spark/v3.2/spark/src/test/java/org/apache/iceberg/spark/source/TestParquetScan.java index adfe8c7d3649..f585ed360f95 100644 --- a/spark/v3.2/spark/src/test/java/org/apache/iceberg/spark/source/TestParquetScan.java +++ b/spark/v3.2/spark/src/test/java/org/apache/iceberg/spark/source/TestParquetScan.java @@ -16,9 +16,10 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.source; +import static org.apache.iceberg.Files.localOutput; + import java.io.File; import java.io.IOException; import java.util.List; @@ -52,8 +53,6 @@ import org.junit.runner.RunWith; import org.junit.runners.Parameterized; -import static org.apache.iceberg.Files.localOutput; - @RunWith(Parameterized.class) public class TestParquetScan extends AvroDataTest { private static final Configuration CONF = new Configuration(); @@ -72,12 +71,11 @@ public static void stopSpark() { currentSpark.stop(); } - @Rule - public TemporaryFolder temp = new TemporaryFolder(); + @Rule public TemporaryFolder temp = new TemporaryFolder(); @Parameterized.Parameters(name = "vectorized = {0}") public static Object[] parameters() { - return new Object[] { false, true }; + return new Object[] {false, true}; } private final boolean vectorized; @@ -88,18 +86,20 @@ public TestParquetScan(boolean vectorized) { @Override protected void writeAndValidate(Schema schema) throws IOException { - Assume.assumeTrue("Cannot handle non-string map keys in parquet-avro", - null == TypeUtil.find( - schema, - type -> type.isMapType() && type.asMapType().keyType() != Types.StringType.get())); + Assume.assumeTrue( + "Cannot handle non-string map keys in parquet-avro", + null + == TypeUtil.find( + schema, + type -> type.isMapType() && type.asMapType().keyType() != Types.StringType.get())); File parent = temp.newFolder("parquet"); File location = new File(parent, "test"); File dataFolder = new File(location, "data"); dataFolder.mkdirs(); - File parquetFile = new File(dataFolder, - FileFormat.PARQUET.addExtension(UUID.randomUUID().toString())); + File parquetFile = + new File(dataFolder, FileFormat.PARQUET.addExtension(UUID.randomUUID().toString())); HadoopTables tables = new HadoopTables(CONF); Table table = tables.create(schema, PartitionSpec.unpartitioned(), location.toString()); @@ -110,24 +110,25 @@ protected void writeAndValidate(Schema schema) throws IOException { List expected = RandomData.generateList(tableSchema, 100, 1L); - try (FileAppender writer = Parquet.write(localOutput(parquetFile)) - .schema(tableSchema) - .build()) { + try (FileAppender writer = + Parquet.write(localOutput(parquetFile)).schema(tableSchema).build()) { writer.addAll(expected); } - DataFile file = DataFiles.builder(PartitionSpec.unpartitioned()) - .withFileSizeInBytes(parquetFile.length()) - .withPath(parquetFile.toString()) - .withRecordCount(100) - .build(); + DataFile file = + DataFiles.builder(PartitionSpec.unpartitioned()) + .withFileSizeInBytes(parquetFile.length()) + .withPath(parquetFile.toString()) + .withRecordCount(100) + .build(); table.newAppend().appendFile(file).commit(); - table.updateProperties().set(TableProperties.PARQUET_VECTORIZATION_ENABLED, String.valueOf(vectorized)).commit(); + 
table + .updateProperties() + .set(TableProperties.PARQUET_VECTORIZATION_ENABLED, String.valueOf(vectorized)) + .commit(); - Dataset df = spark.read() - .format("iceberg") - .load(location.toString()); + Dataset df = spark.read().format("iceberg").load(location.toString()); List rows = df.collectAsList(); Assert.assertEquals("Should contain 100 rows", 100, rows.size()); diff --git a/spark/v3.2/spark/src/test/java/org/apache/iceberg/spark/source/TestPartitionPruning.java b/spark/v3.2/spark/src/test/java/org/apache/iceberg/spark/source/TestPartitionPruning.java index 24f7b69e1dc5..ffe21432f00c 100644 --- a/spark/v3.2/spark/src/test/java/org/apache/iceberg/spark/source/TestPartitionPruning.java +++ b/spark/v3.2/spark/src/test/java/org/apache/iceberg/spark/source/TestPartitionPruning.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.source; import java.io.File; @@ -78,11 +77,11 @@ public class TestPartitionPruning { @Parameterized.Parameters(name = "format = {0}, vectorized = {1}") public static Object[][] parameters() { return new Object[][] { - { "parquet", false }, - { "parquet", true }, - { "avro", false }, - { "orc", false }, - { "orc", true } + {"parquet", false}, + {"parquet", true}, + {"avro", false}, + {"orc", false}, + {"orc", true} }; } @@ -97,9 +96,12 @@ public TestPartitionPruning(String format, boolean vectorized) { private static SparkSession spark = null; private static JavaSparkContext sparkContext = null; - private static Transform bucketTransform = Transforms.bucket(Types.IntegerType.get(), 3); - private static Transform truncateTransform = Transforms.truncate(Types.StringType.get(), 5); - private static Transform hourTransform = Transforms.hour(Types.TimestampType.withoutZone()); + private static Transform bucketTransform = + Transforms.bucket(Types.IntegerType.get(), 3); + private static Transform truncateTransform = + Transforms.truncate(Types.StringType.get(), 5); + private static Transform hourTransform = + Transforms.hour(Types.TimestampType.withoutZone()); @BeforeClass public static void startSpark() { @@ -110,12 +112,21 @@ public static void startSpark() { CONF.set(optionKey, CountOpenLocalFileSystem.class.getName()); spark.conf().set(optionKey, CountOpenLocalFileSystem.class.getName()); spark.conf().set("spark.sql.session.timeZone", "UTC"); - spark.udf().register("bucket3", (Integer num) -> bucketTransform.apply(num), DataTypes.IntegerType); - spark.udf().register("truncate5", (String str) -> truncateTransform.apply(str), DataTypes.StringType); + spark + .udf() + .register("bucket3", (Integer num) -> bucketTransform.apply(num), DataTypes.IntegerType); + spark + .udf() + .register("truncate5", (String str) -> truncateTransform.apply(str), DataTypes.StringType); // NOTE: date transforms take the type long, not Timestamp - spark.udf().register("hour", (Timestamp ts) -> hourTransform.apply( - org.apache.spark.sql.catalyst.util.DateTimeUtils.fromJavaTimestamp(ts)), - DataTypes.IntegerType); + spark + .udf() + .register( + "hour", + (Timestamp ts) -> + hourTransform.apply( + org.apache.spark.sql.catalyst.util.DateTimeUtils.fromJavaTimestamp(ts)), + DataTypes.IntegerType); } @AfterClass @@ -125,70 +136,70 @@ public static void stopSpark() { currentSpark.stop(); } - private static final Schema LOG_SCHEMA = new Schema( - Types.NestedField.optional(1, "id", Types.IntegerType.get()), - Types.NestedField.optional(2, "date", Types.StringType.get()), - Types.NestedField.optional(3, "level", 
Types.StringType.get()), - Types.NestedField.optional(4, "message", Types.StringType.get()), - Types.NestedField.optional(5, "timestamp", Types.TimestampType.withZone()) - ); - - private static final List LOGS = ImmutableList.of( - LogMessage.debug("2020-02-02", "debug event 1", getInstant("2020-02-02T00:00:00")), - LogMessage.info("2020-02-02", "info event 1", getInstant("2020-02-02T01:00:00")), - LogMessage.debug("2020-02-02", "debug event 2", getInstant("2020-02-02T02:00:00")), - LogMessage.info("2020-02-03", "info event 2", getInstant("2020-02-03T00:00:00")), - LogMessage.debug("2020-02-03", "debug event 3", getInstant("2020-02-03T01:00:00")), - LogMessage.info("2020-02-03", "info event 3", getInstant("2020-02-03T02:00:00")), - LogMessage.error("2020-02-03", "error event 1", getInstant("2020-02-03T03:00:00")), - LogMessage.debug("2020-02-04", "debug event 4", getInstant("2020-02-04T01:00:00")), - LogMessage.warn("2020-02-04", "warn event 1", getInstant("2020-02-04T02:00:00")), - LogMessage.debug("2020-02-04", "debug event 5", getInstant("2020-02-04T03:00:00")) - ); + private static final Schema LOG_SCHEMA = + new Schema( + Types.NestedField.optional(1, "id", Types.IntegerType.get()), + Types.NestedField.optional(2, "date", Types.StringType.get()), + Types.NestedField.optional(3, "level", Types.StringType.get()), + Types.NestedField.optional(4, "message", Types.StringType.get()), + Types.NestedField.optional(5, "timestamp", Types.TimestampType.withZone())); + + private static final List LOGS = + ImmutableList.of( + LogMessage.debug("2020-02-02", "debug event 1", getInstant("2020-02-02T00:00:00")), + LogMessage.info("2020-02-02", "info event 1", getInstant("2020-02-02T01:00:00")), + LogMessage.debug("2020-02-02", "debug event 2", getInstant("2020-02-02T02:00:00")), + LogMessage.info("2020-02-03", "info event 2", getInstant("2020-02-03T00:00:00")), + LogMessage.debug("2020-02-03", "debug event 3", getInstant("2020-02-03T01:00:00")), + LogMessage.info("2020-02-03", "info event 3", getInstant("2020-02-03T02:00:00")), + LogMessage.error("2020-02-03", "error event 1", getInstant("2020-02-03T03:00:00")), + LogMessage.debug("2020-02-04", "debug event 4", getInstant("2020-02-04T01:00:00")), + LogMessage.warn("2020-02-04", "warn event 1", getInstant("2020-02-04T02:00:00")), + LogMessage.debug("2020-02-04", "debug event 5", getInstant("2020-02-04T03:00:00"))); private static Instant getInstant(String timestampWithoutZone) { - Long epochMicros = (Long) Literal.of(timestampWithoutZone).to(Types.TimestampType.withoutZone()).value(); + Long epochMicros = + (Long) Literal.of(timestampWithoutZone).to(Types.TimestampType.withoutZone()).value(); return Instant.ofEpochMilli(TimeUnit.MICROSECONDS.toMillis(epochMicros)); } - @Rule - public TemporaryFolder temp = new TemporaryFolder(); + @Rule public TemporaryFolder temp = new TemporaryFolder(); - private PartitionSpec spec = PartitionSpec.builderFor(LOG_SCHEMA) - .identity("date") - .identity("level") - .bucket("id", 3) - .truncate("message", 5) - .hour("timestamp") - .build(); + private PartitionSpec spec = + PartitionSpec.builderFor(LOG_SCHEMA) + .identity("date") + .identity("level") + .bucket("id", 3) + .truncate("message", 5) + .hour("timestamp") + .build(); @Test public void testPartitionPruningIdentityString() { String filterCond = "date >= '2020-02-03' AND level = 'DEBUG'"; - Predicate partCondition = (Row r) -> { - String date = r.getString(0); - String level = r.getString(1); - return date.compareTo("2020-02-03") >= 0 && level.equals("DEBUG"); - }; 
+ Predicate partCondition = + (Row r) -> { + String date = r.getString(0); + String level = r.getString(1); + return date.compareTo("2020-02-03") >= 0 && level.equals("DEBUG"); + }; runTest(filterCond, partCondition); } @Test public void testPartitionPruningBucketingInteger() { - final int[] ids = new int[]{ - LOGS.get(3).getId(), - LOGS.get(7).getId() - }; - String condForIds = Arrays.stream(ids).mapToObj(String::valueOf) - .collect(Collectors.joining(",", "(", ")")); + final int[] ids = new int[] {LOGS.get(3).getId(), LOGS.get(7).getId()}; + String condForIds = + Arrays.stream(ids).mapToObj(String::valueOf).collect(Collectors.joining(",", "(", ")")); String filterCond = "id in " + condForIds; - Predicate partCondition = (Row r) -> { - int bucketId = r.getInt(2); - Set buckets = Arrays.stream(ids).map(bucketTransform::apply) - .boxed().collect(Collectors.toSet()); - return buckets.contains(bucketId); - }; + Predicate partCondition = + (Row r) -> { + int bucketId = r.getInt(2); + Set buckets = + Arrays.stream(ids).map(bucketTransform::apply).boxed().collect(Collectors.toSet()); + return buckets.contains(bucketId); + }; runTest(filterCond, partCondition); } @@ -196,10 +207,11 @@ public void testPartitionPruningBucketingInteger() { @Test public void testPartitionPruningTruncatedString() { String filterCond = "message like 'info event%'"; - Predicate partCondition = (Row r) -> { - String truncatedMessage = r.getString(3); - return truncatedMessage.equals("info "); - }; + Predicate partCondition = + (Row r) -> { + String truncatedMessage = r.getString(3); + return truncatedMessage.equals("info "); + }; runTest(filterCond, partCondition); } @@ -207,10 +219,11 @@ public void testPartitionPruningTruncatedString() { @Test public void testPartitionPruningTruncatedStringComparingValueShorterThanPartitionValue() { String filterCond = "message like 'inf%'"; - Predicate partCondition = (Row r) -> { - String truncatedMessage = r.getString(3); - return truncatedMessage.startsWith("inf"); - }; + Predicate partCondition = + (Row r) -> { + String truncatedMessage = r.getString(3); + return truncatedMessage.startsWith("inf"); + }; runTest(filterCond, partCondition); } @@ -219,17 +232,20 @@ public void testPartitionPruningTruncatedStringComparingValueShorterThanPartitio public void testPartitionPruningHourlyPartition() { String filterCond; if (spark.version().startsWith("2")) { - // Looks like from Spark 2 we need to compare timestamp with timestamp to push down the filter. + // Looks like from Spark 2 we need to compare timestamp with timestamp to push down the + // filter. 
filterCond = "timestamp >= to_timestamp('2020-02-03T01:00:00')"; } else { filterCond = "timestamp >= '2020-02-03T01:00:00'"; } - Predicate partCondition = (Row r) -> { - int hourValue = r.getInt(4); - Instant instant = getInstant("2020-02-03T01:00:00"); - Integer hourValueToFilter = hourTransform.apply(TimeUnit.MILLISECONDS.toMicros(instant.toEpochMilli())); - return hourValue >= hourValueToFilter; - }; + Predicate partCondition = + (Row r) -> { + int hourValue = r.getInt(4); + Instant instant = getInstant("2020-02-03T01:00:00"); + Integer hourValueToFilter = + hourTransform.apply(TimeUnit.MILLISECONDS.toMicros(instant.toEpochMilli())); + return hourValue >= hourValueToFilter; + }; runTest(filterCond, partCondition); } @@ -242,24 +258,26 @@ private void runTest(String filterCond, Predicate partCondition) { Dataset logs = createTestDataset(); saveTestDatasetToTable(logs, table); - List expected = logs - .select("id", "date", "level", "message", "timestamp") - .filter(filterCond) - .orderBy("id") - .collectAsList(); + List expected = + logs.select("id", "date", "level", "message", "timestamp") + .filter(filterCond) + .orderBy("id") + .collectAsList(); Assert.assertFalse("Expected rows should be not empty", expected.isEmpty()); // remove records which may be recorded during storing to table CountOpenLocalFileSystem.resetRecordsInPathPrefix(originTableLocation.getAbsolutePath()); - List actual = spark.read() - .format("iceberg") - .option(SparkReadOptions.VECTORIZATION_ENABLED, String.valueOf(vectorized)) - .load(table.location()) - .select("id", "date", "level", "message", "timestamp") - .filter(filterCond) - .orderBy("id") - .collectAsList(); + List actual = + spark + .read() + .format("iceberg") + .option(SparkReadOptions.VECTORIZATION_ENABLED, String.valueOf(vectorized)) + .load(table.location()) + .select("id", "date", "level", "message", "timestamp") + .filter(filterCond) + .orderBy("id") + .collectAsList(); Assert.assertFalse("Actual rows should not be empty", actual.isEmpty()); Assert.assertEquals("Rows should match", expected, actual); @@ -282,40 +300,59 @@ private Table createTable(File originTableLocation) { } private Dataset createTestDataset() { - List rows = LOGS.stream().map(logMessage -> { - Object[] underlying = new Object[] { - logMessage.getId(), - UTF8String.fromString(logMessage.getDate()), - UTF8String.fromString(logMessage.getLevel()), - UTF8String.fromString(logMessage.getMessage()), - // discard the nanoseconds part to simplify - TimeUnit.MILLISECONDS.toMicros(logMessage.getTimestamp().toEpochMilli()) - }; - return new GenericInternalRow(underlying); - }).collect(Collectors.toList()); + List rows = + LOGS.stream() + .map( + logMessage -> { + Object[] underlying = + new Object[] { + logMessage.getId(), + UTF8String.fromString(logMessage.getDate()), + UTF8String.fromString(logMessage.getLevel()), + UTF8String.fromString(logMessage.getMessage()), + // discard the nanoseconds part to simplify + TimeUnit.MILLISECONDS.toMicros(logMessage.getTimestamp().toEpochMilli()) + }; + return new GenericInternalRow(underlying); + }) + .collect(Collectors.toList()); JavaRDD rdd = sparkContext.parallelize(rows); - Dataset df = spark.internalCreateDataFrame(JavaRDD.toRDD(rdd), SparkSchemaUtil.convert(LOG_SCHEMA), false); - - return df - .selectExpr("id", "date", "level", "message", "timestamp") - .selectExpr("id", "date", "level", "message", "timestamp", "bucket3(id) AS bucket_id", - "truncate5(message) AS truncated_message", "hour(timestamp) AS ts_hour"); + Dataset df = + 
spark.internalCreateDataFrame( + JavaRDD.toRDD(rdd), SparkSchemaUtil.convert(LOG_SCHEMA), false); + + return df.selectExpr("id", "date", "level", "message", "timestamp") + .selectExpr( + "id", + "date", + "level", + "message", + "timestamp", + "bucket3(id) AS bucket_id", + "truncate5(message) AS truncated_message", + "hour(timestamp) AS ts_hour"); } private void saveTestDatasetToTable(Dataset logs, Table table) { logs.orderBy("date", "level", "bucket_id", "truncated_message", "ts_hour") .select("id", "date", "level", "message", "timestamp") - .write().format("iceberg").mode("append").save(table.location()); + .write() + .format("iceberg") + .mode("append") + .save(table.location()); } - private void assertAccessOnDataFiles(File originTableLocation, Table table, Predicate partCondition) { + private void assertAccessOnDataFiles( + File originTableLocation, Table table, Predicate partCondition) { // only use files in current table location to avoid side-effects on concurrent test runs - Set readFilesInQuery = CountOpenLocalFileSystem.pathToNumOpenCalled.keySet() - .stream().filter(path -> path.startsWith(originTableLocation.getAbsolutePath())) - .collect(Collectors.toSet()); + Set readFilesInQuery = + CountOpenLocalFileSystem.pathToNumOpenCalled.keySet().stream() + .filter(path -> path.startsWith(originTableLocation.getAbsolutePath())) + .collect(Collectors.toSet()); - List files = spark.read().format("iceberg").load(table.location() + "#files").collectAsList(); + List files = + spark.read().format("iceberg").load(table.location() + "#files").collectAsList(); Set filesToRead = extractFilePathsMatchingConditionOnPartition(files, partCondition); Set filesToNotRead = extractFilePathsNotIn(files, filesToRead); @@ -325,37 +362,51 @@ private void assertAccessOnDataFiles(File originTableLocation, Table table, Pred Assert.assertFalse("The query should prune some data files.", filesToNotRead.isEmpty()); - // We don't check "all" data files bound to the condition are being read, as data files can be pruned on + // We don't check "all" data files bound to the condition are being read, as data files can be + // pruned on // other conditions like lower/upper bound of columns. - Assert.assertFalse("Some of data files in partition range should be read. " + - "Read files in query: " + readFilesInQuery + " / data files in partition range: " + filesToRead, + Assert.assertFalse( + "Some of data files in partition range should be read. " + + "Read files in query: " + + readFilesInQuery + + " / data files in partition range: " + + filesToRead, Sets.intersection(filesToRead, readFilesInQuery).isEmpty()); // Data files which aren't bound to the condition shouldn't be read. - Assert.assertTrue("Data files outside of partition range should not be read. " + - "Read files in query: " + readFilesInQuery + " / data files outside of partition range: " + filesToNotRead, + Assert.assertTrue( + "Data files outside of partition range should not be read. 
" + + "Read files in query: " + + readFilesInQuery + + " / data files outside of partition range: " + + filesToNotRead, Sets.intersection(filesToNotRead, readFilesInQuery).isEmpty()); } - private Set extractFilePathsMatchingConditionOnPartition(List files, Predicate condition) { + private Set extractFilePathsMatchingConditionOnPartition( + List files, Predicate condition) { // idx 1: file_path, idx 3: partition return files.stream() - .filter(r -> { - Row partition = r.getStruct(4); - return condition.test(partition); - }).map(r -> CountOpenLocalFileSystem.stripScheme(r.getString(1))) + .filter( + r -> { + Row partition = r.getStruct(4); + return condition.test(partition); + }) + .map(r -> CountOpenLocalFileSystem.stripScheme(r.getString(1))) .collect(Collectors.toSet()); } private Set extractFilePathsNotIn(List files, Set filePaths) { - Set allFilePaths = files.stream().map(r -> CountOpenLocalFileSystem.stripScheme(r.getString(1))) - .collect(Collectors.toSet()); + Set allFilePaths = + files.stream() + .map(r -> CountOpenLocalFileSystem.stripScheme(r.getString(1))) + .collect(Collectors.toSet()); return Sets.newHashSet(Sets.symmetricDifference(allFilePaths, filePaths)); } public static class CountOpenLocalFileSystem extends RawLocalFileSystem { - public static String scheme = String.format("TestIdentityPartitionData%dfs", - new Random().nextInt()); + public static String scheme = + String.format("TestIdentityPartitionData%dfs", new Random().nextInt()); public static Map pathToNumOpenCalled = Maps.newConcurrentMap(); public static String convertPath(String absPath) { @@ -401,13 +452,15 @@ public String getScheme() { @Override public FSDataInputStream open(Path f, int bufferSize) throws IOException { String path = f.toUri().getPath(); - pathToNumOpenCalled.compute(path, (ignored, v) -> { - if (v == null) { - return 1L; - } else { - return v + 1; - } - }); + pathToNumOpenCalled.compute( + path, + (ignored, v) -> { + if (v == null) { + return 1L; + } else { + return v + 1; + } + }); return super.open(f, bufferSize); } } diff --git a/spark/v3.2/spark/src/test/java/org/apache/iceberg/spark/source/TestPartitionValues.java b/spark/v3.2/spark/src/test/java/org/apache/iceberg/spark/source/TestPartitionValues.java index 9d1b49f1aa8e..df2e4649d9f9 100644 --- a/spark/v3.2/spark/src/test/java/org/apache/iceberg/spark/source/TestPartitionValues.java +++ b/spark/v3.2/spark/src/test/java/org/apache/iceberg/spark/source/TestPartitionValues.java @@ -16,9 +16,11 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.spark.source; +import static org.apache.iceberg.types.Types.NestedField.optional; +import static org.apache.iceberg.types.Types.NestedField.required; + import java.io.File; import java.util.List; import org.apache.avro.generic.GenericData; @@ -56,46 +58,43 @@ import org.junit.runner.RunWith; import org.junit.runners.Parameterized; -import static org.apache.iceberg.types.Types.NestedField.optional; -import static org.apache.iceberg.types.Types.NestedField.required; - @RunWith(Parameterized.class) public class TestPartitionValues { @Parameterized.Parameters(name = "format = {0}, vectorized = {1}") public static Object[][] parameters() { return new Object[][] { - { "parquet", false }, - { "parquet", true }, - { "avro", false }, - { "orc", false }, - { "orc", true } + {"parquet", false}, + {"parquet", true}, + {"avro", false}, + {"orc", false}, + {"orc", true} }; } - private static final Schema SUPPORTED_PRIMITIVES = new Schema( - required(100, "id", Types.LongType.get()), - required(101, "data", Types.StringType.get()), - required(102, "b", Types.BooleanType.get()), - required(103, "i", Types.IntegerType.get()), - required(104, "l", Types.LongType.get()), - required(105, "f", Types.FloatType.get()), - required(106, "d", Types.DoubleType.get()), - required(107, "date", Types.DateType.get()), - required(108, "ts", Types.TimestampType.withZone()), - required(110, "s", Types.StringType.get()), - required(113, "bytes", Types.BinaryType.get()), - required(114, "dec_9_0", Types.DecimalType.of(9, 0)), - required(115, "dec_11_2", Types.DecimalType.of(11, 2)), - required(116, "dec_38_10", Types.DecimalType.of(38, 10)) // spark's maximum precision - ); - - private static final Schema SIMPLE_SCHEMA = new Schema( - optional(1, "id", Types.IntegerType.get()), - optional(2, "data", Types.StringType.get())); - - private static final PartitionSpec SPEC = PartitionSpec.builderFor(SIMPLE_SCHEMA) - .identity("data") - .build(); + private static final Schema SUPPORTED_PRIMITIVES = + new Schema( + required(100, "id", Types.LongType.get()), + required(101, "data", Types.StringType.get()), + required(102, "b", Types.BooleanType.get()), + required(103, "i", Types.IntegerType.get()), + required(104, "l", Types.LongType.get()), + required(105, "f", Types.FloatType.get()), + required(106, "d", Types.DoubleType.get()), + required(107, "date", Types.DateType.get()), + required(108, "ts", Types.TimestampType.withZone()), + required(110, "s", Types.StringType.get()), + required(113, "bytes", Types.BinaryType.get()), + required(114, "dec_9_0", Types.DecimalType.of(9, 0)), + required(115, "dec_11_2", Types.DecimalType.of(11, 2)), + required(116, "dec_38_10", Types.DecimalType.of(38, 10)) // spark's maximum precision + ); + + private static final Schema SIMPLE_SCHEMA = + new Schema( + optional(1, "id", Types.IntegerType.get()), optional(2, "data", Types.StringType.get())); + + private static final PartitionSpec SPEC = + PartitionSpec.builderFor(SIMPLE_SCHEMA).identity("data").build(); private static SparkSession spark = null; @@ -111,8 +110,7 @@ public static void stopSpark() { currentSpark.stop(); } - @Rule - public TemporaryFolder temp = new TemporaryFolder(); + @Rule public TemporaryFolder temp = new TemporaryFolder(); private final String format; private final boolean vectorized; @@ -134,29 +132,30 @@ public void testNullPartitionValue() throws Exception { Table table = tables.create(SIMPLE_SCHEMA, SPEC, location.toString()); table.updateProperties().set(TableProperties.DEFAULT_FILE_FORMAT, 
format).commit(); - List expected = Lists.newArrayList( - new SimpleRecord(1, "a"), - new SimpleRecord(2, "b"), - new SimpleRecord(3, "c"), - new SimpleRecord(4, null) - ); + List expected = + Lists.newArrayList( + new SimpleRecord(1, "a"), + new SimpleRecord(2, "b"), + new SimpleRecord(3, "c"), + new SimpleRecord(4, null)); Dataset df = spark.createDataFrame(expected, SimpleRecord.class); - df.select("id", "data").write() + df.select("id", "data") + .write() .format("iceberg") .mode(SaveMode.Append) .save(location.toString()); - Dataset result = spark.read() - .format("iceberg") - .option(SparkReadOptions.VECTORIZATION_ENABLED, String.valueOf(vectorized)) - .load(location.toString()); + Dataset result = + spark + .read() + .format("iceberg") + .option(SparkReadOptions.VECTORIZATION_ENABLED, String.valueOf(vectorized)) + .load(location.toString()); - List actual = result - .orderBy("id") - .as(Encoders.bean(SimpleRecord.class)) - .collectAsList(); + List actual = + result.orderBy("id").as(Encoders.bean(SimpleRecord.class)).collectAsList(); Assert.assertEquals("Number of rows should match", expected.size(), actual.size()); Assert.assertEquals("Result rows should match", expected, actual); @@ -174,29 +173,28 @@ public void testReorderedColumns() throws Exception { Table table = tables.create(SIMPLE_SCHEMA, SPEC, location.toString()); table.updateProperties().set(TableProperties.DEFAULT_FILE_FORMAT, format).commit(); - List expected = Lists.newArrayList( - new SimpleRecord(1, "a"), - new SimpleRecord(2, "b"), - new SimpleRecord(3, "c") - ); + List expected = + Lists.newArrayList( + new SimpleRecord(1, "a"), new SimpleRecord(2, "b"), new SimpleRecord(3, "c")); Dataset df = spark.createDataFrame(expected, SimpleRecord.class); - df.select("data", "id").write() - .format("iceberg") - .mode(SaveMode.Append) - .option(SparkWriteOptions.CHECK_ORDERING, "false") - .save(location.toString()); + df.select("data", "id") + .write() + .format("iceberg") + .mode(SaveMode.Append) + .option(SparkWriteOptions.CHECK_ORDERING, "false") + .save(location.toString()); - Dataset result = spark.read() + Dataset result = + spark + .read() .format("iceberg") .option(SparkReadOptions.VECTORIZATION_ENABLED, String.valueOf(vectorized)) .load(location.toString()); - List actual = result - .orderBy("id") - .as(Encoders.bean(SimpleRecord.class)) - .collectAsList(); + List actual = + result.orderBy("id").as(Encoders.bean(SimpleRecord.class)).collectAsList(); Assert.assertEquals("Number of rows should match", expected.size(), actual.size()); Assert.assertEquals("Result rows should match", expected, actual); @@ -214,30 +212,29 @@ public void testReorderedColumnsNoNullability() throws Exception { Table table = tables.create(SIMPLE_SCHEMA, SPEC, location.toString()); table.updateProperties().set(TableProperties.DEFAULT_FILE_FORMAT, format).commit(); - List expected = Lists.newArrayList( - new SimpleRecord(1, "a"), - new SimpleRecord(2, "b"), - new SimpleRecord(3, "c") - ); + List expected = + Lists.newArrayList( + new SimpleRecord(1, "a"), new SimpleRecord(2, "b"), new SimpleRecord(3, "c")); Dataset df = spark.createDataFrame(expected, SimpleRecord.class); - df.select("data", "id").write() - .format("iceberg") - .mode(SaveMode.Append) - .option(SparkWriteOptions.CHECK_ORDERING, "false") - .option(SparkWriteOptions.CHECK_NULLABILITY, "false") - .save(location.toString()); + df.select("data", "id") + .write() + .format("iceberg") + .mode(SaveMode.Append) + .option(SparkWriteOptions.CHECK_ORDERING, "false") + 
.option(SparkWriteOptions.CHECK_NULLABILITY, "false") + .save(location.toString()); - Dataset result = spark.read() + Dataset result = + spark + .read() .format("iceberg") .option(SparkReadOptions.VECTORIZATION_ENABLED, String.valueOf(vectorized)) .load(location.toString()); - List actual = result - .orderBy("id") - .as(Encoders.bean(SimpleRecord.class)) - .collectAsList(); + List actual = + result.orderBy("id").as(Encoders.bean(SimpleRecord.class)).collectAsList(); Assert.assertEquals("Number of rows should match", expected.size(), actual.size()); Assert.assertEquals("Result rows should match", expected, actual); @@ -245,9 +242,10 @@ public void testReorderedColumnsNoNullability() throws Exception { @Test public void testPartitionValueTypes() throws Exception { - String[] columnNames = new String[] { - "b", "i", "l", "f", "d", "date", "ts", "s", "bytes", "dec_9_0", "dec_11_2", "dec_38_10" - }; + String[] columnNames = + new String[] { + "b", "i", "l", "f", "d", "date", "ts", "s", "bytes", "dec_9_0", "dec_11_2", "dec_38_10" + }; HadoopTables tables = new HadoopTables(spark.sessionState().newHadoopConf()); @@ -259,23 +257,27 @@ public void testPartitionValueTypes() throws Exception { List expected = RandomData.generateList(source.schema(), 2, 128735L); File avroData = temp.newFile("data.avro"); Assert.assertTrue(avroData.delete()); - try (FileAppender appender = Avro.write(Files.localOutput(avroData)) - .schema(source.schema()) - .build()) { + try (FileAppender appender = + Avro.write(Files.localOutput(avroData)).schema(source.schema()).build()) { appender.addAll(expected); } // add the Avro data file to the source table - source.newAppend() - .appendFile(DataFiles.builder(PartitionSpec.unpartitioned()) - .withRecordCount(10) - .withInputFile(Files.localInput(avroData)) - .build()) + source + .newAppend() + .appendFile( + DataFiles.builder(PartitionSpec.unpartitioned()) + .withRecordCount(10) + .withInputFile(Files.localInput(avroData)) + .build()) .commit(); - Dataset sourceDF = spark.read().format("iceberg") - .option(SparkReadOptions.VECTORIZATION_ENABLED, String.valueOf(vectorized)) - .load(sourceLocation); + Dataset sourceDF = + spark + .read() + .format("iceberg") + .option(SparkReadOptions.VECTORIZATION_ENABLED, String.valueOf(vectorized)) + .load(sourceLocation); for (String column : columnNames) { String desc = "partition_by_" + SUPPORTED_PRIMITIVES.findType(column).toString(); @@ -290,17 +292,20 @@ public void testPartitionValueTypes() throws Exception { Table table = tables.create(SUPPORTED_PRIMITIVES, spec, location.toString()); table.updateProperties().set(TableProperties.DEFAULT_FILE_FORMAT, format).commit(); - sourceDF.write() + sourceDF + .write() .format("iceberg") .mode(SaveMode.Append) .option(SparkWriteOptions.USE_TABLE_DISTRIBUTION_AND_ORDERING, "false") .save(location.toString()); - List actual = spark.read() - .format("iceberg") - .option(SparkReadOptions.VECTORIZATION_ENABLED, String.valueOf(vectorized)) - .load(location.toString()) - .collectAsList(); + List actual = + spark + .read() + .format("iceberg") + .option(SparkReadOptions.VECTORIZATION_ENABLED, String.valueOf(vectorized)) + .load(location.toString()) + .collectAsList(); Assert.assertEquals("Number of rows should match", expected.size(), actual.size()); @@ -313,9 +318,10 @@ public void testPartitionValueTypes() throws Exception { @Test public void testNestedPartitionValues() throws Exception { - String[] columnNames = new String[] { - "b", "i", "l", "f", "d", "date", "ts", "s", "bytes", "dec_9_0", 
"dec_11_2", "dec_38_10" - }; + String[] columnNames = + new String[] { + "b", "i", "l", "f", "d", "date", "ts", "s", "bytes", "dec_9_0", "dec_11_2", "dec_38_10" + }; HadoopTables tables = new HadoopTables(spark.sessionState().newHadoopConf()); Schema nestedSchema = new Schema(optional(1, "nested", SUPPORTED_PRIMITIVES.asStruct())); @@ -328,23 +334,27 @@ public void testNestedPartitionValues() throws Exception { List expected = RandomData.generateList(source.schema(), 2, 128735L); File avroData = temp.newFile("data.avro"); Assert.assertTrue(avroData.delete()); - try (FileAppender appender = Avro.write(Files.localOutput(avroData)) - .schema(source.schema()) - .build()) { + try (FileAppender appender = + Avro.write(Files.localOutput(avroData)).schema(source.schema()).build()) { appender.addAll(expected); } // add the Avro data file to the source table - source.newAppend() - .appendFile(DataFiles.builder(PartitionSpec.unpartitioned()) - .withRecordCount(10) - .withInputFile(Files.localInput(avroData)) - .build()) + source + .newAppend() + .appendFile( + DataFiles.builder(PartitionSpec.unpartitioned()) + .withRecordCount(10) + .withInputFile(Files.localInput(avroData)) + .build()) .commit(); - Dataset sourceDF = spark.read().format("iceberg") - .option(SparkReadOptions.VECTORIZATION_ENABLED, String.valueOf(vectorized)) - .load(sourceLocation); + Dataset sourceDF = + spark + .read() + .format("iceberg") + .option(SparkReadOptions.VECTORIZATION_ENABLED, String.valueOf(vectorized)) + .load(sourceLocation); for (String column : columnNames) { String desc = "partition_by_" + SUPPORTED_PRIMITIVES.findType(column).toString(); @@ -354,46 +364,51 @@ public void testNestedPartitionValues() throws Exception { File dataFolder = new File(location, "data"); Assert.assertTrue("mkdirs should succeed", dataFolder.mkdirs()); - PartitionSpec spec = PartitionSpec.builderFor(nestedSchema).identity("nested." + column).build(); + PartitionSpec spec = + PartitionSpec.builderFor(nestedSchema).identity("nested." + column).build(); Table table = tables.create(nestedSchema, spec, location.toString()); table.updateProperties().set(TableProperties.DEFAULT_FILE_FORMAT, format).commit(); - sourceDF.write() + sourceDF + .write() .format("iceberg") .mode(SaveMode.Append) .option(SparkWriteOptions.USE_TABLE_DISTRIBUTION_AND_ORDERING, "false") .save(location.toString()); - List actual = spark.read() - .format("iceberg") - .option(SparkReadOptions.VECTORIZATION_ENABLED, String.valueOf(vectorized)) - .load(location.toString()) - .collectAsList(); + List actual = + spark + .read() + .format("iceberg") + .option(SparkReadOptions.VECTORIZATION_ENABLED, String.valueOf(vectorized)) + .load(location.toString()) + .collectAsList(); Assert.assertEquals("Number of rows should match", expected.size(), actual.size()); for (int i = 0; i < expected.size(); i += 1) { - TestHelpers.assertEqualsSafe( - nestedSchema.asStruct(), expected.get(i), actual.get(i)); + TestHelpers.assertEqualsSafe(nestedSchema.asStruct(), expected.get(i), actual.get(i)); } } } /** * To verify if WrappedPositionAccessor is generated against a string field within a nested field, - * rather than a Position2Accessor. - * Or when building the partition path, a ClassCastException is thrown with the message like: - * Cannot cast org.apache.spark.unsafe.types.UTF8String to java.lang.CharSequence + * rather than a Position2Accessor. 
Or when building the partition path, a ClassCastException is + * thrown with the message like: Cannot cast org.apache.spark.unsafe.types.UTF8String to + * java.lang.CharSequence */ @Test public void testPartitionedByNestedString() throws Exception { // schema and partition spec - Schema nestedSchema = new Schema( - Types.NestedField.required(1, "struct", - Types.StructType.of(Types.NestedField.required(2, "string", Types.StringType.get())) - ) - ); + Schema nestedSchema = + new Schema( + Types.NestedField.required( + 1, + "struct", + Types.StructType.of( + Types.NestedField.required(2, "string", Types.StringType.get())))); PartitionSpec spec = PartitionSpec.builderFor(nestedSchema).identity("struct.string").build(); // create table @@ -403,14 +418,14 @@ public void testPartitionedByNestedString() throws Exception { // input data frame StructField[] structFields = { - new StructField("struct", - DataTypes.createStructType( - new StructField[] { - new StructField("string", DataTypes.StringType, false, Metadata.empty()) - } - ), - false, Metadata.empty() - ) + new StructField( + "struct", + DataTypes.createStructType( + new StructField[] { + new StructField("string", DataTypes.StringType, false, Metadata.empty()) + }), + false, + Metadata.empty()) }; List rows = Lists.newArrayList(); @@ -418,17 +433,16 @@ public void testPartitionedByNestedString() throws Exception { Dataset sourceDF = spark.createDataFrame(rows, new StructType(structFields)); // write into iceberg - sourceDF.write() - .format("iceberg") - .mode(SaveMode.Append) - .save(baseLocation); + sourceDF.write().format("iceberg").mode(SaveMode.Append).save(baseLocation); // verify - List actual = spark.read() - .format("iceberg") - .option(SparkReadOptions.VECTORIZATION_ENABLED, String.valueOf(vectorized)) - .load(baseLocation) - .collectAsList(); + List actual = + spark + .read() + .format("iceberg") + .option(SparkReadOptions.VECTORIZATION_ENABLED, String.valueOf(vectorized)) + .load(baseLocation) + .collectAsList(); Assert.assertEquals("Number of rows should match", rows.size(), actual.size()); } diff --git a/spark/v3.2/spark/src/test/java/org/apache/iceberg/spark/source/TestPathIdentifier.java b/spark/v3.2/spark/src/test/java/org/apache/iceberg/spark/source/TestPathIdentifier.java index ff4fe22a7a8a..f58451296cef 100644 --- a/spark/v3.2/spark/src/test/java/org/apache/iceberg/spark/source/TestPathIdentifier.java +++ b/spark/v3.2/spark/src/test/java/org/apache/iceberg/spark/source/TestPathIdentifier.java @@ -16,9 +16,10 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.spark.source; +import static org.apache.iceberg.types.Types.NestedField.required; + import java.io.File; import java.io.IOException; import org.apache.iceberg.BaseTable; @@ -42,16 +43,13 @@ import org.junit.Test; import org.junit.rules.TemporaryFolder; -import static org.apache.iceberg.types.Types.NestedField.required; - public class TestPathIdentifier extends SparkTestBase { - private static final Schema SCHEMA = new Schema( - required(1, "id", Types.LongType.get()), - required(2, "data", Types.StringType.get())); + private static final Schema SCHEMA = + new Schema( + required(1, "id", Types.LongType.get()), required(2, "data", Types.StringType.get())); - @Rule - public TemporaryFolder temp = new TemporaryFolder(); + @Rule public TemporaryFolder temp = new TemporaryFolder(); private File tableLocation; private PathIdentifier identifier; private SparkCatalog sparkCatalog; @@ -72,17 +70,16 @@ public void after() { @Test public void testPathIdentifier() throws TableAlreadyExistsException, NoSuchTableException { - SparkTable table = sparkCatalog.createTable(identifier, - SparkSchemaUtil.convert(SCHEMA), - new Transform[0], - ImmutableMap.of()); + SparkTable table = + sparkCatalog.createTable( + identifier, SparkSchemaUtil.convert(SCHEMA), new Transform[0], ImmutableMap.of()); Assert.assertEquals(table.table().location(), tableLocation.getAbsolutePath()); Assertions.assertThat(table.table()).isInstanceOf(BaseTable.class); - Assertions.assertThat(((BaseTable) table.table()).operations()).isInstanceOf(HadoopTableOperations.class); + Assertions.assertThat(((BaseTable) table.table()).operations()) + .isInstanceOf(HadoopTableOperations.class); Assert.assertEquals(sparkCatalog.loadTable(identifier), table); Assert.assertTrue(sparkCatalog.dropTable(identifier)); } } - diff --git a/spark/v3.2/spark/src/test/java/org/apache/iceberg/spark/source/TestReadProjection.java b/spark/v3.2/spark/src/test/java/org/apache/iceberg/spark/source/TestReadProjection.java index 8d65b64cab6d..cfc746f6e932 100644 --- a/spark/v3.2/spark/src/test/java/org/apache/iceberg/spark/source/TestReadProjection.java +++ b/spark/v3.2/spark/src/test/java/org/apache/iceberg/spark/source/TestReadProjection.java @@ -16,9 +16,10 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.spark.source; +import static org.apache.avro.Schema.Type.UNION; + import java.io.IOException; import java.util.List; import java.util.Map; @@ -37,8 +38,6 @@ import org.junit.Test; import org.junit.rules.TemporaryFolder; -import static org.apache.avro.Schema.Type.UNION; - public abstract class TestReadProjection { final String format; @@ -46,20 +45,17 @@ public abstract class TestReadProjection { this.format = format; } - protected abstract Record writeAndRead(String desc, - Schema writeSchema, - Schema readSchema, - Record record) throws IOException; + protected abstract Record writeAndRead( + String desc, Schema writeSchema, Schema readSchema, Record record) throws IOException; - @Rule - public TemporaryFolder temp = new TemporaryFolder(); + @Rule public TemporaryFolder temp = new TemporaryFolder(); @Test public void testFullProjection() throws Exception { - Schema schema = new Schema( - Types.NestedField.required(0, "id", Types.LongType.get()), - Types.NestedField.optional(1, "data", Types.StringType.get()) - ); + Schema schema = + new Schema( + Types.NestedField.required(0, "id", Types.LongType.get()), + Types.NestedField.optional(1, "data", Types.StringType.get())); Record record = GenericRecord.create(schema); record.setField("id", 34L); @@ -67,32 +63,33 @@ public void testFullProjection() throws Exception { Record projected = writeAndRead("full_projection", schema, schema, record); - Assert.assertEquals("Should contain the correct id value", 34L, (long) projected.getField("id")); + Assert.assertEquals( + "Should contain the correct id value", 34L, (long) projected.getField("id")); - int cmp = Comparators.charSequences() - .compare("test", (CharSequence) projected.getField("data")); + int cmp = + Comparators.charSequences().compare("test", (CharSequence) projected.getField("data")); Assert.assertEquals("Should contain the correct data value", 0, cmp); } @Test public void testReorderedFullProjection() throws Exception { -// Assume.assumeTrue( -// "Spark's Parquet read support does not support reordered columns", -// !format.equalsIgnoreCase("parquet")); + // Assume.assumeTrue( + // "Spark's Parquet read support does not support reordered columns", + // !format.equalsIgnoreCase("parquet")); - Schema schema = new Schema( - Types.NestedField.required(0, "id", Types.LongType.get()), - Types.NestedField.optional(1, "data", Types.StringType.get()) - ); + Schema schema = + new Schema( + Types.NestedField.required(0, "id", Types.LongType.get()), + Types.NestedField.optional(1, "data", Types.StringType.get())); Record record = GenericRecord.create(schema); record.setField("id", 34L); record.setField("data", "test"); - Schema reordered = new Schema( - Types.NestedField.optional(1, "data", Types.StringType.get()), - Types.NestedField.required(0, "id", Types.LongType.get()) - ); + Schema reordered = + new Schema( + Types.NestedField.optional(1, "data", Types.StringType.get()), + Types.NestedField.required(0, "id", Types.LongType.get())); Record projected = writeAndRead("reordered_full_projection", schema, reordered, record); @@ -102,24 +99,24 @@ public void testReorderedFullProjection() throws Exception { @Test public void testReorderedProjection() throws Exception { -// Assume.assumeTrue( -// "Spark's Parquet read support does not support reordered columns", -// !format.equalsIgnoreCase("parquet")); + // Assume.assumeTrue( + // "Spark's Parquet read support does not support reordered columns", + // !format.equalsIgnoreCase("parquet")); - Schema schema = new Schema( 
- Types.NestedField.required(0, "id", Types.LongType.get()), - Types.NestedField.optional(1, "data", Types.StringType.get()) - ); + Schema schema = + new Schema( + Types.NestedField.required(0, "id", Types.LongType.get()), + Types.NestedField.optional(1, "data", Types.StringType.get())); Record record = GenericRecord.create(schema); record.setField("id", 34L); record.setField("data", "test"); - Schema reordered = new Schema( - Types.NestedField.optional(2, "missing_1", Types.StringType.get()), - Types.NestedField.optional(1, "data", Types.StringType.get()), - Types.NestedField.optional(3, "missing_2", Types.LongType.get()) - ); + Schema reordered = + new Schema( + Types.NestedField.optional(2, "missing_1", Types.StringType.get()), + Types.NestedField.optional(1, "data", Types.StringType.get()), + Types.NestedField.optional(3, "missing_2", Types.LongType.get())); Record projected = writeAndRead("reordered_projection", schema, reordered, record); @@ -130,10 +127,10 @@ public void testReorderedProjection() throws Exception { @Test public void testEmptyProjection() throws Exception { - Schema schema = new Schema( - Types.NestedField.required(0, "id", Types.LongType.get()), - Types.NestedField.optional(1, "data", Types.StringType.get()) - ); + Schema schema = + new Schema( + Types.NestedField.required(0, "id", Types.LongType.get()), + Types.NestedField.optional(1, "data", Types.StringType.get())); Record record = GenericRecord.create(schema); record.setField("id", 34L); @@ -152,68 +149,68 @@ public void testEmptyProjection() throws Exception { @Test public void testBasicProjection() throws Exception { - Schema writeSchema = new Schema( - Types.NestedField.required(0, "id", Types.LongType.get()), - Types.NestedField.optional(1, "data", Types.StringType.get()) - ); + Schema writeSchema = + new Schema( + Types.NestedField.required(0, "id", Types.LongType.get()), + Types.NestedField.optional(1, "data", Types.StringType.get())); Record record = GenericRecord.create(writeSchema); record.setField("id", 34L); record.setField("data", "test"); - Schema idOnly = new Schema( - Types.NestedField.required(0, "id", Types.LongType.get()) - ); + Schema idOnly = new Schema(Types.NestedField.required(0, "id", Types.LongType.get())); Record projected = writeAndRead("basic_projection_id", writeSchema, idOnly, record); Assert.assertNull("Should not project data", projected.getField("data")); - Assert.assertEquals("Should contain the correct id value", 34L, (long) projected.getField("id")); + Assert.assertEquals( + "Should contain the correct id value", 34L, (long) projected.getField("id")); - Schema dataOnly = new Schema( - Types.NestedField.optional(1, "data", Types.StringType.get()) - ); + Schema dataOnly = new Schema(Types.NestedField.optional(1, "data", Types.StringType.get())); projected = writeAndRead("basic_projection_data", writeSchema, dataOnly, record); Assert.assertNull("Should not project id", projected.getField("id")); - int cmp = Comparators.charSequences() - .compare("test", (CharSequence) projected.getField("data")); + int cmp = + Comparators.charSequences().compare("test", (CharSequence) projected.getField("data")); Assert.assertEquals("Should contain the correct data value", 0, cmp); } @Test public void testRename() throws Exception { - Schema writeSchema = new Schema( - Types.NestedField.required(0, "id", Types.LongType.get()), - Types.NestedField.optional(1, "data", Types.StringType.get()) - ); + Schema writeSchema = + new Schema( + Types.NestedField.required(0, "id", Types.LongType.get()), + 
Types.NestedField.optional(1, "data", Types.StringType.get())); Record record = GenericRecord.create(writeSchema); record.setField("id", 34L); record.setField("data", "test"); - Schema readSchema = new Schema( - Types.NestedField.required(0, "id", Types.LongType.get()), - Types.NestedField.optional(1, "renamed", Types.StringType.get()) - ); + Schema readSchema = + new Schema( + Types.NestedField.required(0, "id", Types.LongType.get()), + Types.NestedField.optional(1, "renamed", Types.StringType.get())); Record projected = writeAndRead("project_and_rename", writeSchema, readSchema, record); - Assert.assertEquals("Should contain the correct id value", 34L, (long) projected.getField("id")); - int cmp = Comparators.charSequences() - .compare("test", (CharSequence) projected.getField("renamed")); + Assert.assertEquals( + "Should contain the correct id value", 34L, (long) projected.getField("id")); + int cmp = + Comparators.charSequences().compare("test", (CharSequence) projected.getField("renamed")); Assert.assertEquals("Should contain the correct data/renamed value", 0, cmp); } @Test public void testNestedStructProjection() throws Exception { - Schema writeSchema = new Schema( - Types.NestedField.required(0, "id", Types.LongType.get()), - Types.NestedField.optional(3, "location", Types.StructType.of( - Types.NestedField.required(1, "lat", Types.FloatType.get()), - Types.NestedField.required(2, "long", Types.FloatType.get()) - )) - ); + Schema writeSchema = + new Schema( + Types.NestedField.required(0, "id", Types.LongType.get()), + Types.NestedField.optional( + 3, + "location", + Types.StructType.of( + Types.NestedField.required(1, "lat", Types.FloatType.get()), + Types.NestedField.required(2, "long", Types.FloatType.get())))); Record record = GenericRecord.create(writeSchema); record.setField("id", 34L); @@ -222,61 +219,76 @@ public void testNestedStructProjection() throws Exception { location.setField("long", -1.539054f); record.setField("location", location); - Schema idOnly = new Schema( - Types.NestedField.required(0, "id", Types.LongType.get()) - ); + Schema idOnly = new Schema(Types.NestedField.required(0, "id", Types.LongType.get())); Record projected = writeAndRead("id_only", writeSchema, idOnly, record); Record projectedLocation = (Record) projected.getField("location"); - Assert.assertEquals("Should contain the correct id value", 34L, (long) projected.getField("id")); + Assert.assertEquals( + "Should contain the correct id value", 34L, (long) projected.getField("id")); Assert.assertNull("Should not project location", projectedLocation); - Schema latOnly = new Schema( - Types.NestedField.optional(3, "location", Types.StructType.of( - Types.NestedField.required(1, "lat", Types.FloatType.get()) - )) - ); + Schema latOnly = + new Schema( + Types.NestedField.optional( + 3, + "location", + Types.StructType.of(Types.NestedField.required(1, "lat", Types.FloatType.get())))); projected = writeAndRead("latitude_only", writeSchema, latOnly, record); projectedLocation = (Record) projected.getField("location"); Assert.assertNull("Should not project id", projected.getField("id")); Assert.assertNotNull("Should project location", projected.getField("location")); Assert.assertNull("Should not project longitude", projectedLocation.getField("long")); - Assert.assertEquals("Should project latitude", - 52.995143f, (float) projectedLocation.getField("lat"), 0.000001f); - - Schema longOnly = new Schema( - Types.NestedField.optional(3, "location", Types.StructType.of( - Types.NestedField.required(2, "long", 
Types.FloatType.get()) - )) - ); + Assert.assertEquals( + "Should project latitude", + 52.995143f, + (float) projectedLocation.getField("lat"), + 0.000001f); + + Schema longOnly = + new Schema( + Types.NestedField.optional( + 3, + "location", + Types.StructType.of(Types.NestedField.required(2, "long", Types.FloatType.get())))); projected = writeAndRead("longitude_only", writeSchema, longOnly, record); projectedLocation = (Record) projected.getField("location"); Assert.assertNull("Should not project id", projected.getField("id")); Assert.assertNotNull("Should project location", projected.getField("location")); Assert.assertNull("Should not project latitutde", projectedLocation.getField("lat")); - Assert.assertEquals("Should project longitude", - -1.539054f, (float) projectedLocation.getField("long"), 0.000001f); + Assert.assertEquals( + "Should project longitude", + -1.539054f, + (float) projectedLocation.getField("long"), + 0.000001f); Schema locationOnly = writeSchema.select("location"); projected = writeAndRead("location_only", writeSchema, locationOnly, record); projectedLocation = (Record) projected.getField("location"); Assert.assertNull("Should not project id", projected.getField("id")); Assert.assertNotNull("Should project location", projected.getField("location")); - Assert.assertEquals("Should project latitude", - 52.995143f, (float) projectedLocation.getField("lat"), 0.000001f); - Assert.assertEquals("Should project longitude", - -1.539054f, (float) projectedLocation.getField("long"), 0.000001f); + Assert.assertEquals( + "Should project latitude", + 52.995143f, + (float) projectedLocation.getField("lat"), + 0.000001f); + Assert.assertEquals( + "Should project longitude", + -1.539054f, + (float) projectedLocation.getField("long"), + 0.000001f); } @Test public void testMapProjection() throws IOException { - Schema writeSchema = new Schema( - Types.NestedField.required(0, "id", Types.LongType.get()), - Types.NestedField.optional(5, "properties", - Types.MapType.ofOptional(6, 7, Types.StringType.get(), Types.StringType.get())) - ); + Schema writeSchema = + new Schema( + Types.NestedField.required(0, "id", Types.LongType.get()), + Types.NestedField.optional( + 5, + "properties", + Types.MapType.ofOptional(6, 7, Types.StringType.get(), Types.StringType.get()))); Map properties = ImmutableMap.of("a", "A", "b", "B"); @@ -284,31 +296,36 @@ public void testMapProjection() throws IOException { record.setField("id", 34L); record.setField("properties", properties); - Schema idOnly = new Schema( - Types.NestedField.required(0, "id", Types.LongType.get()) - ); + Schema idOnly = new Schema(Types.NestedField.required(0, "id", Types.LongType.get())); Record projected = writeAndRead("id_only", writeSchema, idOnly, record); - Assert.assertEquals("Should contain the correct id value", 34L, (long) projected.getField("id")); + Assert.assertEquals( + "Should contain the correct id value", 34L, (long) projected.getField("id")); Assert.assertNull("Should not project properties map", projected.getField("properties")); Schema keyOnly = writeSchema.select("properties.key"); projected = writeAndRead("key_only", writeSchema, keyOnly, record); Assert.assertNull("Should not project id", projected.getField("id")); - Assert.assertEquals("Should project entire map", - properties, toStringMap((Map) projected.getField("properties"))); + Assert.assertEquals( + "Should project entire map", + properties, + toStringMap((Map) projected.getField("properties"))); Schema valueOnly = writeSchema.select("properties.value"); 
projected = writeAndRead("value_only", writeSchema, valueOnly, record); Assert.assertNull("Should not project id", projected.getField("id")); - Assert.assertEquals("Should project entire map", - properties, toStringMap((Map) projected.getField("properties"))); + Assert.assertEquals( + "Should project entire map", + properties, + toStringMap((Map) projected.getField("properties"))); Schema mapOnly = writeSchema.select("properties"); projected = writeAndRead("map_only", writeSchema, mapOnly, record); Assert.assertNull("Should not project id", projected.getField("id")); - Assert.assertEquals("Should project entire map", - properties, toStringMap((Map) projected.getField("properties"))); + Assert.assertEquals( + "Should project entire map", + properties, + toStringMap((Map) projected.getField("properties"))); } private Map toStringMap(Map map) { @@ -325,16 +342,19 @@ public void testMapProjection() throws IOException { @Test public void testMapOfStructsProjection() throws IOException { - Schema writeSchema = new Schema( - Types.NestedField.required(0, "id", Types.LongType.get()), - Types.NestedField.optional(5, "locations", Types.MapType.ofOptional(6, 7, - Types.StringType.get(), - Types.StructType.of( - Types.NestedField.required(1, "lat", Types.FloatType.get()), - Types.NestedField.required(2, "long", Types.FloatType.get()) - ) - )) - ); + Schema writeSchema = + new Schema( + Types.NestedField.required(0, "id", Types.LongType.get()), + Types.NestedField.optional( + 5, + "locations", + Types.MapType.ofOptional( + 6, + 7, + Types.StringType.get(), + Types.StructType.of( + Types.NestedField.required(1, "lat", Types.FloatType.get()), + Types.NestedField.required(2, "long", Types.FloatType.get()))))); Record record = GenericRecord.create(writeSchema); record.setField("id", 34L); @@ -346,91 +366,100 @@ public void testMapOfStructsProjection() throws IOException { l2.setField("long", -1.539054f); record.setField("locations", ImmutableMap.of("L1", l1, "L2", l2)); - Schema idOnly = new Schema( - Types.NestedField.required(0, "id", Types.LongType.get()) - ); + Schema idOnly = new Schema(Types.NestedField.required(0, "id", Types.LongType.get())); Record projected = writeAndRead("id_only", writeSchema, idOnly, record); - Assert.assertEquals("Should contain the correct id value", 34L, (long) projected.getField("id")); + Assert.assertEquals( + "Should contain the correct id value", 34L, (long) projected.getField("id")); Assert.assertNull("Should not project locations map", projected.getField("locations")); projected = writeAndRead("all_locations", writeSchema, writeSchema.select("locations"), record); Assert.assertNull("Should not project id", projected.getField("id")); - Assert.assertEquals("Should project locations map", - record.getField("locations"), toStringMap((Map) projected.getField("locations"))); + Assert.assertEquals( + "Should project locations map", + record.getField("locations"), + toStringMap((Map) projected.getField("locations"))); - projected = writeAndRead("lat_only", - writeSchema, writeSchema.select("locations.lat"), record); + projected = writeAndRead("lat_only", writeSchema, writeSchema.select("locations.lat"), record); Assert.assertNull("Should not project id", projected.getField("id")); Map locations = toStringMap((Map) projected.getField("locations")); Assert.assertNotNull("Should project locations map", locations); - Assert.assertEquals("Should contain L1 and L2", - Sets.newHashSet("L1", "L2"), locations.keySet()); + Assert.assertEquals( + "Should contain L1 and L2", 
Sets.newHashSet("L1", "L2"), locations.keySet()); Record projectedL1 = (Record) locations.get("L1"); Assert.assertNotNull("L1 should not be null", projectedL1); - Assert.assertEquals("L1 should contain lat", - 53.992811f, (float) projectedL1.getField("lat"), 0.000001); + Assert.assertEquals( + "L1 should contain lat", 53.992811f, (float) projectedL1.getField("lat"), 0.000001); Assert.assertNull("L1 should not contain long", projectedL1.getField("long")); Record projectedL2 = (Record) locations.get("L2"); Assert.assertNotNull("L2 should not be null", projectedL2); - Assert.assertEquals("L2 should contain lat", - 52.995143f, (float) projectedL2.getField("lat"), 0.000001); + Assert.assertEquals( + "L2 should contain lat", 52.995143f, (float) projectedL2.getField("lat"), 0.000001); Assert.assertNull("L2 should not contain long", projectedL2.getField("long")); - projected = writeAndRead("long_only", - writeSchema, writeSchema.select("locations.long"), record); + projected = + writeAndRead("long_only", writeSchema, writeSchema.select("locations.long"), record); Assert.assertNull("Should not project id", projected.getField("id")); locations = toStringMap((Map) projected.getField("locations")); Assert.assertNotNull("Should project locations map", locations); - Assert.assertEquals("Should contain L1 and L2", - Sets.newHashSet("L1", "L2"), locations.keySet()); + Assert.assertEquals( + "Should contain L1 and L2", Sets.newHashSet("L1", "L2"), locations.keySet()); projectedL1 = (Record) locations.get("L1"); Assert.assertNotNull("L1 should not be null", projectedL1); Assert.assertNull("L1 should not contain lat", projectedL1.getField("lat")); - Assert.assertEquals("L1 should contain long", - -1.542616f, (float) projectedL1.getField("long"), 0.000001); + Assert.assertEquals( + "L1 should contain long", -1.542616f, (float) projectedL1.getField("long"), 0.000001); projectedL2 = (Record) locations.get("L2"); Assert.assertNotNull("L2 should not be null", projectedL2); Assert.assertNull("L2 should not contain lat", projectedL2.getField("lat")); - Assert.assertEquals("L2 should contain long", - -1.539054f, (float) projectedL2.getField("long"), 0.000001); - - Schema latitiudeRenamed = new Schema( - Types.NestedField.optional(5, "locations", Types.MapType.ofOptional(6, 7, - Types.StringType.get(), - Types.StructType.of( - Types.NestedField.required(1, "latitude", Types.FloatType.get()) - ) - )) - ); + Assert.assertEquals( + "L2 should contain long", -1.539054f, (float) projectedL2.getField("long"), 0.000001); + + Schema latitiudeRenamed = + new Schema( + Types.NestedField.optional( + 5, + "locations", + Types.MapType.ofOptional( + 6, + 7, + Types.StringType.get(), + Types.StructType.of( + Types.NestedField.required(1, "latitude", Types.FloatType.get()))))); projected = writeAndRead("latitude_renamed", writeSchema, latitiudeRenamed, record); Assert.assertNull("Should not project id", projected.getField("id")); locations = toStringMap((Map) projected.getField("locations")); Assert.assertNotNull("Should project locations map", locations); - Assert.assertEquals("Should contain L1 and L2", - Sets.newHashSet("L1", "L2"), locations.keySet()); + Assert.assertEquals( + "Should contain L1 and L2", Sets.newHashSet("L1", "L2"), locations.keySet()); projectedL1 = (Record) locations.get("L1"); Assert.assertNotNull("L1 should not be null", projectedL1); - Assert.assertEquals("L1 should contain latitude", - 53.992811f, (float) projectedL1.getField("latitude"), 0.000001); + Assert.assertEquals( + "L1 should contain latitude", + 
53.992811f, + (float) projectedL1.getField("latitude"), + 0.000001); Assert.assertNull("L1 should not contain lat", projectedL1.getField("lat")); Assert.assertNull("L1 should not contain long", projectedL1.getField("long")); projectedL2 = (Record) locations.get("L2"); Assert.assertNotNull("L2 should not be null", projectedL2); - Assert.assertEquals("L2 should contain latitude", - 52.995143f, (float) projectedL2.getField("latitude"), 0.000001); + Assert.assertEquals( + "L2 should contain latitude", + 52.995143f, + (float) projectedL2.getField("latitude"), + 0.000001); Assert.assertNull("L2 should not contain lat", projectedL2.getField("lat")); Assert.assertNull("L2 should not contain long", projectedL2.getField("long")); } @Test public void testListProjection() throws IOException { - Schema writeSchema = new Schema( - Types.NestedField.required(0, "id", Types.LongType.get()), - Types.NestedField.optional(10, "values", - Types.ListType.ofOptional(11, Types.LongType.get())) - ); + Schema writeSchema = + new Schema( + Types.NestedField.required(0, "id", Types.LongType.get()), + Types.NestedField.optional( + 10, "values", Types.ListType.ofOptional(11, Types.LongType.get()))); List values = ImmutableList.of(56L, 57L, 58L); @@ -438,12 +467,11 @@ public void testListProjection() throws IOException { record.setField("id", 34L); record.setField("values", values); - Schema idOnly = new Schema( - Types.NestedField.required(0, "id", Types.LongType.get()) - ); + Schema idOnly = new Schema(Types.NestedField.required(0, "id", Types.LongType.get())); Record projected = writeAndRead("id_only", writeSchema, idOnly, record); - Assert.assertEquals("Should contain the correct id value", 34L, (long) projected.getField("id")); + Assert.assertEquals( + "Should contain the correct id value", 34L, (long) projected.getField("id")); Assert.assertNull("Should not project values list", projected.getField("values")); Schema elementOnly = writeSchema.select("values.element"); @@ -460,15 +488,17 @@ public void testListProjection() throws IOException { @Test @SuppressWarnings("unchecked") public void testListOfStructsProjection() throws IOException { - Schema writeSchema = new Schema( - Types.NestedField.required(0, "id", Types.LongType.get()), - Types.NestedField.optional(22, "points", - Types.ListType.ofOptional(21, Types.StructType.of( - Types.NestedField.required(19, "x", Types.IntegerType.get()), - Types.NestedField.optional(18, "y", Types.IntegerType.get()) - )) - ) - ); + Schema writeSchema = + new Schema( + Types.NestedField.required(0, "id", Types.LongType.get()), + Types.NestedField.optional( + 22, + "points", + Types.ListType.ofOptional( + 21, + Types.StructType.of( + Types.NestedField.required(19, "x", Types.IntegerType.get()), + Types.NestedField.optional(18, "y", Types.IntegerType.get()))))); Record record = GenericRecord.create(writeSchema); record.setField("id", 34L); @@ -480,18 +510,17 @@ public void testListOfStructsProjection() throws IOException { p2.setField("y", null); record.setField("points", ImmutableList.of(p1, p2)); - Schema idOnly = new Schema( - Types.NestedField.required(0, "id", Types.LongType.get()) - ); + Schema idOnly = new Schema(Types.NestedField.required(0, "id", Types.LongType.get())); Record projected = writeAndRead("id_only", writeSchema, idOnly, record); - Assert.assertEquals("Should contain the correct id value", 34L, (long) projected.getField("id")); + Assert.assertEquals( + "Should contain the correct id value", 34L, (long) projected.getField("id")); Assert.assertNull("Should not 
project points list", projected.getField("points")); projected = writeAndRead("all_points", writeSchema, writeSchema.select("points"), record); Assert.assertNull("Should not project id", projected.getField("id")); - Assert.assertEquals("Should project points list", - record.getField("points"), projected.getField("points")); + Assert.assertEquals( + "Should project points list", record.getField("points"), projected.getField("points")); projected = writeAndRead("x_only", writeSchema, writeSchema.select("points.x"), record); Assert.assertNull("Should not project id", projected.getField("id")); @@ -517,13 +546,15 @@ public void testListOfStructsProjection() throws IOException { Assert.assertNull("Should not project x", projectedP2.getField("x")); Assert.assertNull("Should project null y", projectedP2.getField("y")); - Schema yRenamed = new Schema( - Types.NestedField.optional(22, "points", - Types.ListType.ofOptional(21, Types.StructType.of( - Types.NestedField.optional(18, "z", Types.IntegerType.get()) - )) - ) - ); + Schema yRenamed = + new Schema( + Types.NestedField.optional( + 22, + "points", + Types.ListType.ofOptional( + 21, + Types.StructType.of( + Types.NestedField.optional(18, "z", Types.IntegerType.get()))))); projected = writeAndRead("y_renamed", writeSchema, yRenamed, record); Assert.assertNull("Should not project id", projected.getField("id")); @@ -539,15 +570,17 @@ public void testListOfStructsProjection() throws IOException { Assert.assertNull("Should not project y", projectedP2.getField("y")); Assert.assertNull("Should project null z", projectedP2.getField("z")); - Schema zAdded = new Schema( - Types.NestedField.optional(22, "points", - Types.ListType.ofOptional(21, Types.StructType.of( - Types.NestedField.required(19, "x", Types.IntegerType.get()), - Types.NestedField.optional(18, "y", Types.IntegerType.get()), - Types.NestedField.optional(20, "z", Types.IntegerType.get()) - )) - ) - ); + Schema zAdded = + new Schema( + Types.NestedField.optional( + 22, + "points", + Types.ListType.ofOptional( + 21, + Types.StructType.of( + Types.NestedField.required(19, "x", Types.IntegerType.get()), + Types.NestedField.optional(18, "y", Types.IntegerType.get()), + Types.NestedField.optional(20, "z", Types.IntegerType.get()))))); projected = writeAndRead("z_added", writeSchema, zAdded, record); Assert.assertNull("Should not project id", projected.getField("id")); @@ -565,10 +598,10 @@ public void testListOfStructsProjection() throws IOException { } private static org.apache.avro.Schema fromOption(org.apache.avro.Schema schema) { - Preconditions.checkArgument(schema.getType() == UNION, - "Expected union schema but was passed: %s", schema); - Preconditions.checkArgument(schema.getTypes().size() == 2, - "Expected optional schema, but was passed: %s", schema); + Preconditions.checkArgument( + schema.getType() == UNION, "Expected union schema but was passed: %s", schema); + Preconditions.checkArgument( + schema.getTypes().size() == 2, "Expected optional schema, but was passed: %s", schema); if (schema.getTypes().get(0).getType() == org.apache.avro.Schema.Type.NULL) { return schema.getTypes().get(1); } else { diff --git a/spark/v3.2/spark/src/test/java/org/apache/iceberg/spark/source/TestRequiredDistributionAndOrdering.java b/spark/v3.2/spark/src/test/java/org/apache/iceberg/spark/source/TestRequiredDistributionAndOrdering.java index 2c64bc18c35c..8e54a23f815a 100644 --- a/spark/v3.2/spark/src/test/java/org/apache/iceberg/spark/source/TestRequiredDistributionAndOrdering.java +++ 
b/spark/v3.2/spark/src/test/java/org/apache/iceberg/spark/source/TestRequiredDistributionAndOrdering.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.source; import java.util.List; @@ -36,7 +35,8 @@ public class TestRequiredDistributionAndOrdering extends SparkCatalogTestBase { - public TestRequiredDistributionAndOrdering(String catalogName, String implementation, Map config) { + public TestRequiredDistributionAndOrdering( + String catalogName, String implementation, Map config) { super(catalogName, implementation, config); } @@ -47,122 +47,130 @@ public void dropTestTable() { @Test public void testDefaultLocalSort() throws NoSuchTableException { - sql("CREATE TABLE %s (c1 INT, c2 STRING, c3 STRING) " + - "USING iceberg " + - "PARTITIONED BY (c3)", tableName); - - List data = ImmutableList.of( - new ThreeColumnRecord(1, null, "A"), - new ThreeColumnRecord(2, "BBBBBBBBBB", "B"), - new ThreeColumnRecord(3, "BBBBBBBBBB", "A"), - new ThreeColumnRecord(4, "BBBBBBBBBB", "B"), - new ThreeColumnRecord(5, "BBBBBBBBBB", "A"), - new ThreeColumnRecord(6, "BBBBBBBBBB", "B"), - new ThreeColumnRecord(7, "BBBBBBBBBB", "A") - ); + sql( + "CREATE TABLE %s (c1 INT, c2 STRING, c3 STRING) " + + "USING iceberg " + + "PARTITIONED BY (c3)", + tableName); + + List data = + ImmutableList.of( + new ThreeColumnRecord(1, null, "A"), + new ThreeColumnRecord(2, "BBBBBBBBBB", "B"), + new ThreeColumnRecord(3, "BBBBBBBBBB", "A"), + new ThreeColumnRecord(4, "BBBBBBBBBB", "B"), + new ThreeColumnRecord(5, "BBBBBBBBBB", "A"), + new ThreeColumnRecord(6, "BBBBBBBBBB", "B"), + new ThreeColumnRecord(7, "BBBBBBBBBB", "A")); Dataset ds = spark.createDataFrame(data, ThreeColumnRecord.class); Dataset inputDF = ds.coalesce(1).sortWithinPartitions("c1"); // should insert a local sort by partition columns by default inputDF.writeTo(tableName).append(); - assertEquals("Row count must match", + assertEquals( + "Row count must match", ImmutableList.of(row(7L)), sql("SELECT count(*) FROM %s", tableName)); } @Test public void testPartitionColumnsArePrependedForRangeDistribution() throws NoSuchTableException { - sql("CREATE TABLE %s (c1 INT, c2 STRING, c3 STRING) " + - "USING iceberg " + - "PARTITIONED BY (c3)", tableName); - - List data = ImmutableList.of( - new ThreeColumnRecord(1, null, "A"), - new ThreeColumnRecord(2, "BBBBBBBBBB", "B"), - new ThreeColumnRecord(3, "BBBBBBBBBB", "A"), - new ThreeColumnRecord(4, "BBBBBBBBBB", "B"), - new ThreeColumnRecord(5, "BBBBBBBBBB", "A"), - new ThreeColumnRecord(6, "BBBBBBBBBB", "B"), - new ThreeColumnRecord(7, "BBBBBBBBBB", "A") - ); + sql( + "CREATE TABLE %s (c1 INT, c2 STRING, c3 STRING) " + + "USING iceberg " + + "PARTITIONED BY (c3)", + tableName); + + List data = + ImmutableList.of( + new ThreeColumnRecord(1, null, "A"), + new ThreeColumnRecord(2, "BBBBBBBBBB", "B"), + new ThreeColumnRecord(3, "BBBBBBBBBB", "A"), + new ThreeColumnRecord(4, "BBBBBBBBBB", "B"), + new ThreeColumnRecord(5, "BBBBBBBBBB", "A"), + new ThreeColumnRecord(6, "BBBBBBBBBB", "B"), + new ThreeColumnRecord(7, "BBBBBBBBBB", "A")); Dataset ds = spark.createDataFrame(data, ThreeColumnRecord.class); Dataset inputDF = ds.coalesce(1).sortWithinPartitions("c1"); Table table = validationCatalog.loadTable(tableIdent); // should automatically prepend partition columns to the ordering - table.updateProperties() + table + .updateProperties() .set(TableProperties.WRITE_DISTRIBUTION_MODE, TableProperties.WRITE_DISTRIBUTION_MODE_RANGE) .commit(); - 
table.replaceSortOrder() - .asc("c1") - .asc("c2") - .commit(); + table.replaceSortOrder().asc("c1").asc("c2").commit(); inputDF.writeTo(tableName).append(); - assertEquals("Row count must match", + assertEquals( + "Row count must match", ImmutableList.of(row(7L)), sql("SELECT count(*) FROM %s", tableName)); } @Test public void testSortOrderIncludesPartitionColumns() throws NoSuchTableException { - sql("CREATE TABLE %s (c1 INT, c2 STRING, c3 STRING) " + - "USING iceberg " + - "PARTITIONED BY (c3)", tableName); - - List data = ImmutableList.of( - new ThreeColumnRecord(1, null, "A"), - new ThreeColumnRecord(2, "BBBBBBBBBB", "B"), - new ThreeColumnRecord(3, "BBBBBBBBBB", "A"), - new ThreeColumnRecord(4, "BBBBBBBBBB", "B"), - new ThreeColumnRecord(5, "BBBBBBBBBB", "A"), - new ThreeColumnRecord(6, "BBBBBBBBBB", "B"), - new ThreeColumnRecord(7, "BBBBBBBBBB", "A") - ); + sql( + "CREATE TABLE %s (c1 INT, c2 STRING, c3 STRING) " + + "USING iceberg " + + "PARTITIONED BY (c3)", + tableName); + + List data = + ImmutableList.of( + new ThreeColumnRecord(1, null, "A"), + new ThreeColumnRecord(2, "BBBBBBBBBB", "B"), + new ThreeColumnRecord(3, "BBBBBBBBBB", "A"), + new ThreeColumnRecord(4, "BBBBBBBBBB", "B"), + new ThreeColumnRecord(5, "BBBBBBBBBB", "A"), + new ThreeColumnRecord(6, "BBBBBBBBBB", "B"), + new ThreeColumnRecord(7, "BBBBBBBBBB", "A")); Dataset ds = spark.createDataFrame(data, ThreeColumnRecord.class); Dataset inputDF = ds.coalesce(1).sortWithinPartitions("c1"); Table table = validationCatalog.loadTable(tableIdent); // should succeed with a correct sort order - table.replaceSortOrder() - .asc("c3") - .asc("c1") - .asc("c2") - .commit(); + table.replaceSortOrder().asc("c3").asc("c1").asc("c2").commit(); inputDF.writeTo(tableName).append(); - assertEquals("Row count must match", + assertEquals( + "Row count must match", ImmutableList.of(row(7L)), sql("SELECT count(*) FROM %s", tableName)); } @Test public void testDisabledDistributionAndOrdering() { - sql("CREATE TABLE %s (c1 INT, c2 STRING, c3 STRING) " + - "USING iceberg " + - "PARTITIONED BY (c3)", tableName); - - List data = ImmutableList.of( - new ThreeColumnRecord(1, null, "A"), - new ThreeColumnRecord(2, "BBBBBBBBBB", "B"), - new ThreeColumnRecord(3, "BBBBBBBBBB", "A"), - new ThreeColumnRecord(4, "BBBBBBBBBB", "B"), - new ThreeColumnRecord(5, "BBBBBBBBBB", "A"), - new ThreeColumnRecord(6, "BBBBBBBBBB", "B"), - new ThreeColumnRecord(7, "BBBBBBBBBB", "A") - ); + sql( + "CREATE TABLE %s (c1 INT, c2 STRING, c3 STRING) " + + "USING iceberg " + + "PARTITIONED BY (c3)", + tableName); + + List data = + ImmutableList.of( + new ThreeColumnRecord(1, null, "A"), + new ThreeColumnRecord(2, "BBBBBBBBBB", "B"), + new ThreeColumnRecord(3, "BBBBBBBBBB", "A"), + new ThreeColumnRecord(4, "BBBBBBBBBB", "B"), + new ThreeColumnRecord(5, "BBBBBBBBBB", "A"), + new ThreeColumnRecord(6, "BBBBBBBBBB", "B"), + new ThreeColumnRecord(7, "BBBBBBBBBB", "A")); Dataset ds = spark.createDataFrame(data, ThreeColumnRecord.class); Dataset inputDF = ds.coalesce(1).sortWithinPartitions("c1"); // should fail if ordering is disabled - AssertHelpers.assertThrows("Should reject writes without ordering", - SparkException.class, "Writing job aborted", + AssertHelpers.assertThrows( + "Should reject writes without ordering", + SparkException.class, + "Writing job aborted", () -> { try { - inputDF.writeTo(tableName) + inputDF + .writeTo(tableName) .option(SparkWriteOptions.USE_TABLE_DISTRIBUTION_AND_ORDERING, "false") .append(); } catch (NoSuchTableException e) { @@ -173,57 +181,62 @@ 
public void testDisabledDistributionAndOrdering() { @Test public void testHashDistribution() throws NoSuchTableException { - sql("CREATE TABLE %s (c1 INT, c2 STRING, c3 STRING) " + - "USING iceberg " + - "PARTITIONED BY (c3)", tableName); - - List data = ImmutableList.of( - new ThreeColumnRecord(1, null, "A"), - new ThreeColumnRecord(2, "BBBBBBBBBB", "B"), - new ThreeColumnRecord(3, "BBBBBBBBBB", "A"), - new ThreeColumnRecord(4, "BBBBBBBBBB", "B"), - new ThreeColumnRecord(5, "BBBBBBBBBB", "A"), - new ThreeColumnRecord(6, "BBBBBBBBBB", "B"), - new ThreeColumnRecord(7, "BBBBBBBBBB", "A") - ); + sql( + "CREATE TABLE %s (c1 INT, c2 STRING, c3 STRING) " + + "USING iceberg " + + "PARTITIONED BY (c3)", + tableName); + + List data = + ImmutableList.of( + new ThreeColumnRecord(1, null, "A"), + new ThreeColumnRecord(2, "BBBBBBBBBB", "B"), + new ThreeColumnRecord(3, "BBBBBBBBBB", "A"), + new ThreeColumnRecord(4, "BBBBBBBBBB", "B"), + new ThreeColumnRecord(5, "BBBBBBBBBB", "A"), + new ThreeColumnRecord(6, "BBBBBBBBBB", "B"), + new ThreeColumnRecord(7, "BBBBBBBBBB", "A")); Dataset ds = spark.createDataFrame(data, ThreeColumnRecord.class); Dataset inputDF = ds.coalesce(1).sortWithinPartitions("c1"); Table table = validationCatalog.loadTable(tableIdent); // should automatically prepend partition columns to the local ordering after hash distribution - table.updateProperties() + table + .updateProperties() .set(TableProperties.WRITE_DISTRIBUTION_MODE, TableProperties.WRITE_DISTRIBUTION_MODE_HASH) .commit(); - table.replaceSortOrder() - .asc("c1") - .asc("c2") - .commit(); + table.replaceSortOrder().asc("c1").asc("c2").commit(); inputDF.writeTo(tableName).append(); - assertEquals("Row count must match", + assertEquals( + "Row count must match", ImmutableList.of(row(7L)), sql("SELECT count(*) FROM %s", tableName)); } @Test public void testNoSortBucketTransformsWithoutExtensions() throws NoSuchTableException { - sql("CREATE TABLE %s (c1 INT, c2 STRING, c3 STRING) " + - "USING iceberg " + - "PARTITIONED BY (bucket(2, c1))", tableName); - - List data = ImmutableList.of( - new ThreeColumnRecord(1, null, "A"), - new ThreeColumnRecord(2, "BBBB", "B"), - new ThreeColumnRecord(3, "BBBB", "B"), - new ThreeColumnRecord(4, "BBBB", "B") - ); + sql( + "CREATE TABLE %s (c1 INT, c2 STRING, c3 STRING) " + + "USING iceberg " + + "PARTITIONED BY (bucket(2, c1))", + tableName); + + List data = + ImmutableList.of( + new ThreeColumnRecord(1, null, "A"), + new ThreeColumnRecord(2, "BBBB", "B"), + new ThreeColumnRecord(3, "BBBB", "B"), + new ThreeColumnRecord(4, "BBBB", "B")); Dataset ds = spark.createDataFrame(data, ThreeColumnRecord.class); Dataset inputDF = ds.coalesce(1).sortWithinPartitions("c1"); // should fail by default as extensions are disabled - AssertHelpers.assertThrows("Should reject writes without ordering", - SparkException.class, "Writing job aborted", + AssertHelpers.assertThrows( + "Should reject writes without ordering", + SparkException.class, + "Writing job aborted", () -> { try { inputDF.writeTo(tableName).append(); @@ -232,84 +245,83 @@ public void testNoSortBucketTransformsWithoutExtensions() throws NoSuchTableExce } }); - inputDF.writeTo(tableName) - .option(SparkWriteOptions.FANOUT_ENABLED, "true") - .append(); + inputDF.writeTo(tableName).option(SparkWriteOptions.FANOUT_ENABLED, "true").append(); - List expected = ImmutableList.of( - row(1, null, "A"), - row(2, "BBBB", "B"), - row(3, "BBBB", "B"), - row(4, "BBBB", "B") - ); + List expected = + ImmutableList.of( + row(1, null, "A"), row(2, "BBBB", "B"), 
row(3, "BBBB", "B"), row(4, "BBBB", "B")); assertEquals("Rows must match", expected, sql("SELECT * FROM %s ORDER BY c1", tableName)); } @Test public void testRangeDistributionWithQuotedColumnsNames() throws NoSuchTableException { - sql("CREATE TABLE %s (c1 INT, c2 STRING, `c.3` STRING) " + - "USING iceberg " + - "PARTITIONED BY (`c.3`)", tableName); - - List data = ImmutableList.of( - new ThreeColumnRecord(1, null, "A"), - new ThreeColumnRecord(2, "BBBBBBBBBB", "B"), - new ThreeColumnRecord(3, "BBBBBBBBBB", "A"), - new ThreeColumnRecord(4, "BBBBBBBBBB", "B"), - new ThreeColumnRecord(5, "BBBBBBBBBB", "A"), - new ThreeColumnRecord(6, "BBBBBBBBBB", "B"), - new ThreeColumnRecord(7, "BBBBBBBBBB", "A") - ); + sql( + "CREATE TABLE %s (c1 INT, c2 STRING, `c.3` STRING) " + + "USING iceberg " + + "PARTITIONED BY (`c.3`)", + tableName); + + List data = + ImmutableList.of( + new ThreeColumnRecord(1, null, "A"), + new ThreeColumnRecord(2, "BBBBBBBBBB", "B"), + new ThreeColumnRecord(3, "BBBBBBBBBB", "A"), + new ThreeColumnRecord(4, "BBBBBBBBBB", "B"), + new ThreeColumnRecord(5, "BBBBBBBBBB", "A"), + new ThreeColumnRecord(6, "BBBBBBBBBB", "B"), + new ThreeColumnRecord(7, "BBBBBBBBBB", "A")); Dataset ds = spark.createDataFrame(data, ThreeColumnRecord.class); - Dataset inputDF = ds.selectExpr("c1", "c2", "c3 as `c.3`").coalesce(1).sortWithinPartitions("c1"); + Dataset inputDF = + ds.selectExpr("c1", "c2", "c3 as `c.3`").coalesce(1).sortWithinPartitions("c1"); Table table = validationCatalog.loadTable(tableIdent); - table.updateProperties() + table + .updateProperties() .set(TableProperties.WRITE_DISTRIBUTION_MODE, TableProperties.WRITE_DISTRIBUTION_MODE_RANGE) .commit(); - table.replaceSortOrder() - .asc("c1") - .asc("c2") - .commit(); + table.replaceSortOrder().asc("c1").asc("c2").commit(); inputDF.writeTo(tableName).append(); - assertEquals("Row count must match", + assertEquals( + "Row count must match", ImmutableList.of(row(7L)), sql("SELECT count(*) FROM %s", tableName)); } @Test public void testHashDistributionWithQuotedColumnsNames() throws NoSuchTableException { - sql("CREATE TABLE %s (c1 INT, c2 STRING, `c``3` STRING) " + - "USING iceberg " + - "PARTITIONED BY (`c``3`)", tableName); - - List data = ImmutableList.of( - new ThreeColumnRecord(1, null, "A"), - new ThreeColumnRecord(2, "BBBBBBBBBB", "B"), - new ThreeColumnRecord(3, "BBBBBBBBBB", "A"), - new ThreeColumnRecord(4, "BBBBBBBBBB", "B"), - new ThreeColumnRecord(5, "BBBBBBBBBB", "A"), - new ThreeColumnRecord(6, "BBBBBBBBBB", "B"), - new ThreeColumnRecord(7, "BBBBBBBBBB", "A") - ); + sql( + "CREATE TABLE %s (c1 INT, c2 STRING, `c``3` STRING) " + + "USING iceberg " + + "PARTITIONED BY (`c``3`)", + tableName); + + List data = + ImmutableList.of( + new ThreeColumnRecord(1, null, "A"), + new ThreeColumnRecord(2, "BBBBBBBBBB", "B"), + new ThreeColumnRecord(3, "BBBBBBBBBB", "A"), + new ThreeColumnRecord(4, "BBBBBBBBBB", "B"), + new ThreeColumnRecord(5, "BBBBBBBBBB", "A"), + new ThreeColumnRecord(6, "BBBBBBBBBB", "B"), + new ThreeColumnRecord(7, "BBBBBBBBBB", "A")); Dataset ds = spark.createDataFrame(data, ThreeColumnRecord.class); - Dataset inputDF = ds.selectExpr("c1", "c2", "c3 as `c``3`").coalesce(1).sortWithinPartitions("c1"); + Dataset inputDF = + ds.selectExpr("c1", "c2", "c3 as `c``3`").coalesce(1).sortWithinPartitions("c1"); Table table = validationCatalog.loadTable(tableIdent); - table.updateProperties() + table + .updateProperties() .set(TableProperties.WRITE_DISTRIBUTION_MODE, TableProperties.WRITE_DISTRIBUTION_MODE_HASH) .commit(); - 
table.replaceSortOrder() - .asc("c1") - .asc("c2") - .commit(); + table.replaceSortOrder().asc("c1").asc("c2").commit(); inputDF.writeTo(tableName).append(); - assertEquals("Row count must match", + assertEquals( + "Row count must match", ImmutableList.of(row(7L)), sql("SELECT count(*) FROM %s", tableName)); } diff --git a/spark/v3.2/spark/src/test/java/org/apache/iceberg/spark/source/TestRuntimeFiltering.java b/spark/v3.2/spark/src/test/java/org/apache/iceberg/spark/source/TestRuntimeFiltering.java index c5127d0c636d..beaf7b75c6c0 100644 --- a/spark/v3.2/spark/src/test/java/org/apache/iceberg/spark/source/TestRuntimeFiltering.java +++ b/spark/v3.2/spark/src/test/java/org/apache/iceberg/spark/source/TestRuntimeFiltering.java @@ -16,9 +16,11 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.source; +import static org.apache.spark.sql.functions.date_add; +import static org.apache.spark.sql.functions.expr; + import java.io.IOException; import java.io.UncheckedIOException; import java.util.List; @@ -40,9 +42,6 @@ import org.junit.Assert; import org.junit.Test; -import static org.apache.spark.sql.functions.date_add; -import static org.apache.spark.sql.functions.expr; - public class TestRuntimeFiltering extends SparkTestBaseWithCatalog { @After @@ -53,245 +52,286 @@ public void removeTables() { @Test public void testIdentityPartitionedTable() throws NoSuchTableException { - sql("CREATE TABLE %s (id BIGINT, data STRING, date DATE, ts TIMESTAMP) " + - "USING iceberg " + - "PARTITIONED BY (date)", tableName); + sql( + "CREATE TABLE %s (id BIGINT, data STRING, date DATE, ts TIMESTAMP) " + + "USING iceberg " + + "PARTITIONED BY (date)", + tableName); - Dataset df = spark.range(1, 100) - .withColumn("date", date_add(expr("DATE '1970-01-01'"), expr("CAST(id % 4 AS INT)"))) - .withColumn("ts", expr("TO_TIMESTAMP(date)")) - .withColumn("data", expr("CAST(date AS STRING)")) - .select("id", "data", "date", "ts"); + Dataset df = + spark + .range(1, 100) + .withColumn("date", date_add(expr("DATE '1970-01-01'"), expr("CAST(id % 4 AS INT)"))) + .withColumn("ts", expr("TO_TIMESTAMP(date)")) + .withColumn("data", expr("CAST(date AS STRING)")) + .select("id", "data", "date", "ts"); df.coalesce(1).writeTo(tableName).option(SparkWriteOptions.FANOUT_ENABLED, "true").append(); sql("CREATE TABLE dim (id BIGINT, date DATE) USING parquet"); - Dataset dimDF = spark.range(1, 10) - .withColumn("date", expr("DATE '1970-01-02'")) - .select("id", "date"); + Dataset dimDF = + spark.range(1, 10).withColumn("date", expr("DATE '1970-01-02'")).select("id", "date"); dimDF.coalesce(1).write().mode("append").insertInto("dim"); - String query = String.format( - "SELECT f.* FROM %s f JOIN dim d ON f.date = d.date AND d.id = 1 ORDER BY id", - tableName); + String query = + String.format( + "SELECT f.* FROM %s f JOIN dim d ON f.date = d.date AND d.id = 1 ORDER BY id", + tableName); assertQueryContainsRuntimeFilter(query); deleteNotMatchingFiles(Expressions.equal("date", 1), 3); - assertEquals("Should have expected rows", + assertEquals( + "Should have expected rows", sql("SELECT * FROM %s WHERE date = DATE '1970-01-02' ORDER BY id", tableName), sql(query)); } @Test public void testBucketedTable() throws NoSuchTableException { - sql("CREATE TABLE %s (id BIGINT, data STRING, date DATE, ts TIMESTAMP) " + - "USING iceberg " + - "PARTITIONED BY (bucket(8, id))", tableName); + sql( + "CREATE TABLE %s (id BIGINT, data STRING, date DATE, ts TIMESTAMP) " + + "USING iceberg " + 
+ "PARTITIONED BY (bucket(8, id))", + tableName); - Dataset df = spark.range(1, 100) - .withColumn("date", date_add(expr("DATE '1970-01-01'"), expr("CAST(id % 4 AS INT)"))) - .withColumn("ts", expr("TO_TIMESTAMP(date)")) - .withColumn("data", expr("CAST(date AS STRING)")) - .select("id", "data", "date", "ts"); + Dataset df = + spark + .range(1, 100) + .withColumn("date", date_add(expr("DATE '1970-01-01'"), expr("CAST(id % 4 AS INT)"))) + .withColumn("ts", expr("TO_TIMESTAMP(date)")) + .withColumn("data", expr("CAST(date AS STRING)")) + .select("id", "data", "date", "ts"); df.coalesce(1).writeTo(tableName).option(SparkWriteOptions.FANOUT_ENABLED, "true").append(); sql("CREATE TABLE dim (id BIGINT, date DATE) USING parquet"); - Dataset dimDF = spark.range(1, 2) - .withColumn("date", expr("DATE '1970-01-02'")) - .select("id", "date"); + Dataset dimDF = + spark.range(1, 2).withColumn("date", expr("DATE '1970-01-02'")).select("id", "date"); dimDF.coalesce(1).write().mode("append").insertInto("dim"); - String query = String.format( - "SELECT f.* FROM %s f JOIN dim d ON f.id = d.id AND d.date = DATE '1970-01-02' ORDER BY date", - tableName); + String query = + String.format( + "SELECT f.* FROM %s f JOIN dim d ON f.id = d.id AND d.date = DATE '1970-01-02' ORDER BY date", + tableName); assertQueryContainsRuntimeFilter(query); deleteNotMatchingFiles(Expressions.equal("id", 1), 7); - assertEquals("Should have expected rows", + assertEquals( + "Should have expected rows", sql("SELECT * FROM %s WHERE id = 1 ORDER BY date", tableName), sql(query)); } @Test public void testRenamedSourceColumnTable() throws NoSuchTableException { - sql("CREATE TABLE %s (id BIGINT, data STRING, date DATE, ts TIMESTAMP) " + - "USING iceberg " + - "PARTITIONED BY (bucket(8, id))", tableName); + sql( + "CREATE TABLE %s (id BIGINT, data STRING, date DATE, ts TIMESTAMP) " + + "USING iceberg " + + "PARTITIONED BY (bucket(8, id))", + tableName); - Dataset df = spark.range(1, 100) - .withColumn("date", date_add(expr("DATE '1970-01-01'"), expr("CAST(id % 4 AS INT)"))) - .withColumn("ts", expr("TO_TIMESTAMP(date)")) - .withColumn("data", expr("CAST(date AS STRING)")) - .select("id", "data", "date", "ts"); + Dataset df = + spark + .range(1, 100) + .withColumn("date", date_add(expr("DATE '1970-01-01'"), expr("CAST(id % 4 AS INT)"))) + .withColumn("ts", expr("TO_TIMESTAMP(date)")) + .withColumn("data", expr("CAST(date AS STRING)")) + .select("id", "data", "date", "ts"); df.coalesce(1).writeTo(tableName).option(SparkWriteOptions.FANOUT_ENABLED, "true").append(); sql("CREATE TABLE dim (id BIGINT, date DATE) USING parquet"); - Dataset dimDF = spark.range(1, 2) - .withColumn("date", expr("DATE '1970-01-02'")) - .select("id", "date"); + Dataset dimDF = + spark.range(1, 2).withColumn("date", expr("DATE '1970-01-02'")).select("id", "date"); dimDF.coalesce(1).write().mode("append").insertInto("dim"); sql("ALTER TABLE %s RENAME COLUMN id TO row_id", tableName); - String query = String.format( - "SELECT f.* FROM %s f JOIN dim d ON f.row_id = d.id AND d.date = DATE '1970-01-02' ORDER BY date", - tableName); + String query = + String.format( + "SELECT f.* FROM %s f JOIN dim d ON f.row_id = d.id AND d.date = DATE '1970-01-02' ORDER BY date", + tableName); assertQueryContainsRuntimeFilter(query); deleteNotMatchingFiles(Expressions.equal("row_id", 1), 7); - assertEquals("Should have expected rows", + assertEquals( + "Should have expected rows", sql("SELECT * FROM %s WHERE row_id = 1 ORDER BY date", tableName), sql(query)); } @Test public void 
testMultipleRuntimeFilters() throws NoSuchTableException { - sql("CREATE TABLE %s (id BIGINT, data STRING, date DATE, ts TIMESTAMP) " + - "USING iceberg " + - "PARTITIONED BY (data, bucket(8, id))", tableName); + sql( + "CREATE TABLE %s (id BIGINT, data STRING, date DATE, ts TIMESTAMP) " + + "USING iceberg " + + "PARTITIONED BY (data, bucket(8, id))", + tableName); - Dataset df = spark.range(1, 100) - .withColumn("date", date_add(expr("DATE '1970-01-01'"), expr("CAST(id % 4 AS INT)"))) - .withColumn("ts", expr("TO_TIMESTAMP(date)")) - .withColumn("data", expr("CAST(date AS STRING)")) - .select("id", "data", "date", "ts"); + Dataset df = + spark + .range(1, 100) + .withColumn("date", date_add(expr("DATE '1970-01-01'"), expr("CAST(id % 4 AS INT)"))) + .withColumn("ts", expr("TO_TIMESTAMP(date)")) + .withColumn("data", expr("CAST(date AS STRING)")) + .select("id", "data", "date", "ts"); df.coalesce(1).writeTo(tableName).option(SparkWriteOptions.FANOUT_ENABLED, "true").append(); sql("CREATE TABLE dim (id BIGINT, date DATE, data STRING) USING parquet"); - Dataset dimDF = spark.range(1, 2) - .withColumn("date", expr("DATE '1970-01-02'")) - .withColumn("data", expr("'1970-01-02'")) - .select("id", "date", "data"); + Dataset dimDF = + spark + .range(1, 2) + .withColumn("date", expr("DATE '1970-01-02'")) + .withColumn("data", expr("'1970-01-02'")) + .select("id", "date", "data"); dimDF.coalesce(1).write().mode("append").insertInto("dim"); - String query = String.format( - "SELECT f.* FROM %s f JOIN dim d ON f.id = d.id AND f.data = d.data AND d.date = DATE '1970-01-02'", - tableName); + String query = + String.format( + "SELECT f.* FROM %s f JOIN dim d ON f.id = d.id AND f.data = d.data AND d.date = DATE '1970-01-02'", + tableName); assertQueryContainsRuntimeFilters(query, 2, "Query should have 2 runtime filters"); deleteNotMatchingFiles(Expressions.equal("id", 1), 31); - assertEquals("Should have expected rows", + assertEquals( + "Should have expected rows", sql("SELECT * FROM %s WHERE id = 1 AND data = '1970-01-02'", tableName), sql(query)); } @Test public void testCaseSensitivityOfRuntimeFilters() throws NoSuchTableException { - sql("CREATE TABLE %s (id BIGINT, data STRING, date DATE, ts TIMESTAMP) " + - "USING iceberg " + - "PARTITIONED BY (data, bucket(8, id))", tableName); + sql( + "CREATE TABLE %s (id BIGINT, data STRING, date DATE, ts TIMESTAMP) " + + "USING iceberg " + + "PARTITIONED BY (data, bucket(8, id))", + tableName); - Dataset df = spark.range(1, 100) - .withColumn("date", date_add(expr("DATE '1970-01-01'"), expr("CAST(id % 4 AS INT)"))) - .withColumn("ts", expr("TO_TIMESTAMP(date)")) - .withColumn("data", expr("CAST(date AS STRING)")) - .select("id", "data", "date", "ts"); + Dataset df = + spark + .range(1, 100) + .withColumn("date", date_add(expr("DATE '1970-01-01'"), expr("CAST(id % 4 AS INT)"))) + .withColumn("ts", expr("TO_TIMESTAMP(date)")) + .withColumn("data", expr("CAST(date AS STRING)")) + .select("id", "data", "date", "ts"); df.coalesce(1).writeTo(tableName).option(SparkWriteOptions.FANOUT_ENABLED, "true").append(); sql("CREATE TABLE dim (id BIGINT, date DATE, data STRING) USING parquet"); - Dataset dimDF = spark.range(1, 2) - .withColumn("date", expr("DATE '1970-01-02'")) - .withColumn("data", expr("'1970-01-02'")) - .select("id", "date", "data"); + Dataset dimDF = + spark + .range(1, 2) + .withColumn("date", expr("DATE '1970-01-02'")) + .withColumn("data", expr("'1970-01-02'")) + .select("id", "date", "data"); dimDF.coalesce(1).write().mode("append").insertInto("dim"); 
- String caseInsensitiveQuery = String.format( - "select f.* from %s F join dim d ON f.Id = d.iD and f.DaTa = d.dAtA and d.dAtE = date '1970-01-02'", - tableName); + String caseInsensitiveQuery = + String.format( + "select f.* from %s F join dim d ON f.Id = d.iD and f.DaTa = d.dAtA and d.dAtE = date '1970-01-02'", + tableName); - assertQueryContainsRuntimeFilters(caseInsensitiveQuery, 2, "Query should have 2 runtime filters"); + assertQueryContainsRuntimeFilters( + caseInsensitiveQuery, 2, "Query should have 2 runtime filters"); deleteNotMatchingFiles(Expressions.equal("id", 1), 31); - assertEquals("Should have expected rows", + assertEquals( + "Should have expected rows", sql("SELECT * FROM %s WHERE id = 1 AND data = '1970-01-02'", tableName), sql(caseInsensitiveQuery)); } @Test public void testBucketedTableWithMultipleSpecs() throws NoSuchTableException { - sql("CREATE TABLE %s (id BIGINT, data STRING, date DATE, ts TIMESTAMP) USING iceberg", tableName); + sql( + "CREATE TABLE %s (id BIGINT, data STRING, date DATE, ts TIMESTAMP) USING iceberg", + tableName); - Dataset df1 = spark.range(1, 100) - .withColumn("date", date_add(expr("DATE '1970-01-01'"), expr("CAST(id % 2 AS INT)"))) - .withColumn("ts", expr("TO_TIMESTAMP(date)")) - .withColumn("data", expr("CAST(date AS STRING)")) - .select("id", "data", "date", "ts"); + Dataset df1 = + spark + .range(1, 100) + .withColumn("date", date_add(expr("DATE '1970-01-01'"), expr("CAST(id % 2 AS INT)"))) + .withColumn("ts", expr("TO_TIMESTAMP(date)")) + .withColumn("data", expr("CAST(date AS STRING)")) + .select("id", "data", "date", "ts"); df1.coalesce(1).writeTo(tableName).append(); Table table = validationCatalog.loadTable(tableIdent); - table.updateSpec() - .addField(Expressions.bucket("id", 8)) - .commit(); + table.updateSpec().addField(Expressions.bucket("id", 8)).commit(); sql("REFRESH TABLE %s", tableName); - Dataset df2 = spark.range(1, 100) - .withColumn("date", date_add(expr("DATE '1970-01-01'"), expr("CAST(id % 4 AS INT)"))) - .withColumn("ts", expr("TO_TIMESTAMP(date)")) - .withColumn("data", expr("CAST(date AS STRING)")) - .select("id", "data", "date", "ts"); + Dataset df2 = + spark + .range(1, 100) + .withColumn("date", date_add(expr("DATE '1970-01-01'"), expr("CAST(id % 4 AS INT)"))) + .withColumn("ts", expr("TO_TIMESTAMP(date)")) + .withColumn("data", expr("CAST(date AS STRING)")) + .select("id", "data", "date", "ts"); df2.coalesce(1).writeTo(tableName).option(SparkWriteOptions.FANOUT_ENABLED, "true").append(); sql("CREATE TABLE dim (id BIGINT, date DATE) USING parquet"); - Dataset dimDF = spark.range(1, 2) - .withColumn("date", expr("DATE '1970-01-02'")) - .select("id", "date"); + Dataset dimDF = + spark.range(1, 2).withColumn("date", expr("DATE '1970-01-02'")).select("id", "date"); dimDF.coalesce(1).write().mode("append").insertInto("dim"); - String query = String.format( - "SELECT f.* FROM %s f JOIN dim d ON f.id = d.id AND d.date = DATE '1970-01-02' ORDER BY date", - tableName); + String query = + String.format( + "SELECT f.* FROM %s f JOIN dim d ON f.id = d.id AND d.date = DATE '1970-01-02' ORDER BY date", + tableName); assertQueryContainsRuntimeFilter(query); deleteNotMatchingFiles(Expressions.equal("id", 1), 7); - assertEquals("Should have expected rows", + assertEquals( + "Should have expected rows", sql("SELECT * FROM %s WHERE id = 1 ORDER BY date", tableName), sql(query)); } @Test public void testSourceColumnWithDots() throws NoSuchTableException { - sql("CREATE TABLE %s (`i.d` BIGINT, data STRING, date DATE, ts TIMESTAMP) 
" + - "USING iceberg " + - "PARTITIONED BY (bucket(8, `i.d`))", tableName); + sql( + "CREATE TABLE %s (`i.d` BIGINT, data STRING, date DATE, ts TIMESTAMP) " + + "USING iceberg " + + "PARTITIONED BY (bucket(8, `i.d`))", + tableName); - Dataset df = spark.range(1, 100) - .withColumnRenamed("id", "i.d") - .withColumn("date", date_add(expr("DATE '1970-01-01'"), expr("CAST(`i.d` % 4 AS INT)"))) - .withColumn("ts", expr("TO_TIMESTAMP(date)")) - .withColumn("data", expr("CAST(date AS STRING)")) - .select("`i.d`", "data", "date", "ts"); + Dataset df = + spark + .range(1, 100) + .withColumnRenamed("id", "i.d") + .withColumn("date", date_add(expr("DATE '1970-01-01'"), expr("CAST(`i.d` % 4 AS INT)"))) + .withColumn("ts", expr("TO_TIMESTAMP(date)")) + .withColumn("data", expr("CAST(date AS STRING)")) + .select("`i.d`", "data", "date", "ts"); df.coalesce(1).writeTo(tableName).option(SparkWriteOptions.FANOUT_ENABLED, "true").append(); sql("SELECT * FROM %s WHERE `i.d` = 1", tableName); sql("CREATE TABLE dim (id BIGINT, date DATE) USING parquet"); - Dataset dimDF = spark.range(1, 2) - .withColumn("date", expr("DATE '1970-01-02'")) - .select("id", "date"); + Dataset dimDF = + spark.range(1, 2).withColumn("date", expr("DATE '1970-01-02'")).select("id", "date"); dimDF.coalesce(1).write().mode("append").insertInto("dim"); - String query = String.format( - "SELECT f.* FROM %s f JOIN dim d ON f.`i.d` = d.id AND d.date = DATE '1970-01-02' ORDER BY date", - tableName); + String query = + String.format( + "SELECT f.* FROM %s f JOIN dim d ON f.`i.d` = d.id AND d.date = DATE '1970-01-02' ORDER BY date", + tableName); assertQueryContainsRuntimeFilter(query); @@ -299,70 +339,82 @@ public void testSourceColumnWithDots() throws NoSuchTableException { sql(query); - assertEquals("Should have expected rows", + assertEquals( + "Should have expected rows", sql("SELECT * FROM %s WHERE `i.d` = 1 ORDER BY date", tableName), sql(query)); } @Test public void testSourceColumnWithBackticks() throws NoSuchTableException { - sql("CREATE TABLE %s (`i``d` BIGINT, data STRING, date DATE, ts TIMESTAMP) " + - "USING iceberg " + - "PARTITIONED BY (bucket(8, `i``d`))", tableName); + sql( + "CREATE TABLE %s (`i``d` BIGINT, data STRING, date DATE, ts TIMESTAMP) " + + "USING iceberg " + + "PARTITIONED BY (bucket(8, `i``d`))", + tableName); - Dataset df = spark.range(1, 100) - .withColumnRenamed("id", "i`d") - .withColumn("date", date_add(expr("DATE '1970-01-01'"), expr("CAST(`i``d` % 4 AS INT)"))) - .withColumn("ts", expr("TO_TIMESTAMP(date)")) - .withColumn("data", expr("CAST(date AS STRING)")) - .select("`i``d`", "data", "date", "ts"); + Dataset df = + spark + .range(1, 100) + .withColumnRenamed("id", "i`d") + .withColumn( + "date", date_add(expr("DATE '1970-01-01'"), expr("CAST(`i``d` % 4 AS INT)"))) + .withColumn("ts", expr("TO_TIMESTAMP(date)")) + .withColumn("data", expr("CAST(date AS STRING)")) + .select("`i``d`", "data", "date", "ts"); df.coalesce(1).writeTo(tableName).option(SparkWriteOptions.FANOUT_ENABLED, "true").append(); sql("CREATE TABLE dim (id BIGINT, date DATE) USING parquet"); - Dataset dimDF = spark.range(1, 2) - .withColumn("date", expr("DATE '1970-01-02'")) - .select("id", "date"); + Dataset dimDF = + spark.range(1, 2).withColumn("date", expr("DATE '1970-01-02'")).select("id", "date"); dimDF.coalesce(1).write().mode("append").insertInto("dim"); - String query = String.format( - "SELECT f.* FROM %s f JOIN dim d ON f.`i``d` = d.id AND d.date = DATE '1970-01-02' ORDER BY date", - tableName); + String query = + 
String.format( + "SELECT f.* FROM %s f JOIN dim d ON f.`i``d` = d.id AND d.date = DATE '1970-01-02' ORDER BY date", + tableName); assertQueryContainsRuntimeFilter(query); deleteNotMatchingFiles(Expressions.equal("i`d", 1), 7); - assertEquals("Should have expected rows", + assertEquals( + "Should have expected rows", sql("SELECT * FROM %s WHERE `i``d` = 1 ORDER BY date", tableName), sql(query)); } @Test public void testUnpartitionedTable() throws NoSuchTableException { - sql("CREATE TABLE %s (id BIGINT, data STRING, date DATE, ts TIMESTAMP) USING iceberg", tableName); + sql( + "CREATE TABLE %s (id BIGINT, data STRING, date DATE, ts TIMESTAMP) USING iceberg", + tableName); - Dataset df = spark.range(1, 100) - .withColumn("date", date_add(expr("DATE '1970-01-01'"), expr("CAST(id % 4 AS INT)"))) - .withColumn("ts", expr("TO_TIMESTAMP(date)")) - .withColumn("data", expr("CAST(date AS STRING)")) - .select("id", "data", "date", "ts"); + Dataset df = + spark + .range(1, 100) + .withColumn("date", date_add(expr("DATE '1970-01-01'"), expr("CAST(id % 4 AS INT)"))) + .withColumn("ts", expr("TO_TIMESTAMP(date)")) + .withColumn("data", expr("CAST(date AS STRING)")) + .select("id", "data", "date", "ts"); df.coalesce(1).writeTo(tableName).append(); sql("CREATE TABLE dim (id BIGINT, date DATE) USING parquet"); - Dataset dimDF = spark.range(1, 2) - .withColumn("date", expr("DATE '1970-01-02'")) - .select("id", "date"); + Dataset dimDF = + spark.range(1, 2).withColumn("date", expr("DATE '1970-01-02'")).select("id", "date"); dimDF.coalesce(1).write().mode("append").insertInto("dim"); - String query = String.format( - "SELECT f.* FROM %s f JOIN dim d ON f.id = d.id AND d.date = DATE '1970-01-02' ORDER BY date", - tableName); + String query = + String.format( + "SELECT f.* FROM %s f JOIN dim d ON f.id = d.id AND d.date = DATE '1970-01-02' ORDER BY date", + tableName); assertQueryContainsNoRuntimeFilter(query); - assertEquals("Should have expected rows", + assertEquals( + "Should have expected rows", sql("SELECT * FROM %s WHERE id = 1 ORDER BY date", tableName), sql(query)); } @@ -375,14 +427,16 @@ private void assertQueryContainsNoRuntimeFilter(String query) { assertQueryContainsRuntimeFilters(query, 0, "Query should have no runtime filters"); } - private void assertQueryContainsRuntimeFilters(String query, int expectedFilterCount, String errorMessage) { + private void assertQueryContainsRuntimeFilters( + String query, int expectedFilterCount, String errorMessage) { List output = spark.sql("EXPLAIN EXTENDED " + query).collectAsList(); String plan = output.get(0).getString(0); int actualFilterCount = StringUtils.countMatches(plan, "dynamicpruningexpression"); Assert.assertEquals(errorMessage, expectedFilterCount, actualFilterCount); } - // delete files that don't match the filter to ensure dynamic filtering works and only required files are read + // delete files that don't match the filter to ensure dynamic filtering works and only required + // files are read private void deleteNotMatchingFiles(Expression filter, int expectedDeletedFileCount) { Table table = validationCatalog.loadTable(tableIdent); FileIO io = table.io(); @@ -410,6 +464,9 @@ private void deleteNotMatchingFiles(Expression filter, int expectedDeletedFileCo throw new UncheckedIOException(e); } - Assert.assertEquals("Deleted unexpected number of files", expectedDeletedFileCount, deletedFileLocations.size()); + Assert.assertEquals( + "Deleted unexpected number of files", + expectedDeletedFileCount, + deletedFileLocations.size()); } } diff --git 
a/spark/v3.2/spark/src/test/java/org/apache/iceberg/spark/source/TestSnapshotSelection.java b/spark/v3.2/spark/src/test/java/org/apache/iceberg/spark/source/TestSnapshotSelection.java index 22756dd36717..9661cfe20b1c 100644 --- a/spark/v3.2/spark/src/test/java/org/apache/iceberg/spark/source/TestSnapshotSelection.java +++ b/spark/v3.2/spark/src/test/java/org/apache/iceberg/spark/source/TestSnapshotSelection.java @@ -16,9 +16,10 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.source; +import static org.apache.iceberg.types.Types.NestedField.optional; + import java.io.IOException; import java.util.List; import org.apache.hadoop.conf.Configuration; @@ -43,18 +44,14 @@ import org.junit.Test; import org.junit.rules.TemporaryFolder; -import static org.apache.iceberg.types.Types.NestedField.optional; - public class TestSnapshotSelection { private static final Configuration CONF = new Configuration(); - private static final Schema SCHEMA = new Schema( - optional(1, "id", Types.IntegerType.get()), - optional(2, "data", Types.StringType.get()) - ); + private static final Schema SCHEMA = + new Schema( + optional(1, "id", Types.IntegerType.get()), optional(2, "data", Types.StringType.get())); - @Rule - public TemporaryFolder temp = new TemporaryFolder(); + @Rule public TemporaryFolder temp = new TemporaryFolder(); private static SparkSession spark = null; @@ -79,48 +76,40 @@ public void testSnapshotSelectionById() throws IOException { Table table = tables.create(SCHEMA, spec, tableLocation); // produce the first snapshot - List firstBatchRecords = Lists.newArrayList( - new SimpleRecord(1, "a"), - new SimpleRecord(2, "b"), - new SimpleRecord(3, "c") - ); + List firstBatchRecords = + Lists.newArrayList( + new SimpleRecord(1, "a"), new SimpleRecord(2, "b"), new SimpleRecord(3, "c")); Dataset firstDf = spark.createDataFrame(firstBatchRecords, SimpleRecord.class); firstDf.select("id", "data").write().format("iceberg").mode("append").save(tableLocation); // produce the second snapshot - List secondBatchRecords = Lists.newArrayList( - new SimpleRecord(4, "d"), - new SimpleRecord(5, "e"), - new SimpleRecord(6, "f") - ); + List secondBatchRecords = + Lists.newArrayList( + new SimpleRecord(4, "d"), new SimpleRecord(5, "e"), new SimpleRecord(6, "f")); Dataset secondDf = spark.createDataFrame(secondBatchRecords, SimpleRecord.class); secondDf.select("id", "data").write().format("iceberg").mode("append").save(tableLocation); Assert.assertEquals("Expected 2 snapshots", 2, Iterables.size(table.snapshots())); // verify records in the current snapshot - Dataset currentSnapshotResult = spark.read() - .format("iceberg") - .load(tableLocation); - List currentSnapshotRecords = currentSnapshotResult.orderBy("id") - .as(Encoders.bean(SimpleRecord.class)) - .collectAsList(); + Dataset currentSnapshotResult = spark.read().format("iceberg").load(tableLocation); + List currentSnapshotRecords = + currentSnapshotResult.orderBy("id").as(Encoders.bean(SimpleRecord.class)).collectAsList(); List expectedRecords = Lists.newArrayList(); expectedRecords.addAll(firstBatchRecords); expectedRecords.addAll(secondBatchRecords); - Assert.assertEquals("Current snapshot rows should match", expectedRecords, currentSnapshotRecords); + Assert.assertEquals( + "Current snapshot rows should match", expectedRecords, currentSnapshotRecords); // verify records in the previous snapshot Snapshot currentSnapshot = table.currentSnapshot(); Long parentSnapshotId = 
currentSnapshot.parentId(); - Dataset previousSnapshotResult = spark.read() - .format("iceberg") - .option("snapshot-id", parentSnapshotId) - .load(tableLocation); - List previousSnapshotRecords = previousSnapshotResult.orderBy("id") - .as(Encoders.bean(SimpleRecord.class)) - .collectAsList(); - Assert.assertEquals("Previous snapshot rows should match", firstBatchRecords, previousSnapshotRecords); + Dataset previousSnapshotResult = + spark.read().format("iceberg").option("snapshot-id", parentSnapshotId).load(tableLocation); + List previousSnapshotRecords = + previousSnapshotResult.orderBy("id").as(Encoders.bean(SimpleRecord.class)).collectAsList(); + Assert.assertEquals( + "Previous snapshot rows should match", firstBatchRecords, previousSnapshotRecords); } @Test @@ -132,11 +121,9 @@ public void testSnapshotSelectionByTimestamp() throws IOException { Table table = tables.create(SCHEMA, spec, tableLocation); // produce the first snapshot - List firstBatchRecords = Lists.newArrayList( - new SimpleRecord(1, "a"), - new SimpleRecord(2, "b"), - new SimpleRecord(3, "c") - ); + List firstBatchRecords = + Lists.newArrayList( + new SimpleRecord(1, "a"), new SimpleRecord(2, "b"), new SimpleRecord(3, "c")); Dataset firstDf = spark.createDataFrame(firstBatchRecords, SimpleRecord.class); firstDf.select("id", "data").write().format("iceberg").mode("append").save(tableLocation); @@ -144,37 +131,35 @@ public void testSnapshotSelectionByTimestamp() throws IOException { long firstSnapshotTimestamp = System.currentTimeMillis(); // produce the second snapshot - List secondBatchRecords = Lists.newArrayList( - new SimpleRecord(4, "d"), - new SimpleRecord(5, "e"), - new SimpleRecord(6, "f") - ); + List secondBatchRecords = + Lists.newArrayList( + new SimpleRecord(4, "d"), new SimpleRecord(5, "e"), new SimpleRecord(6, "f")); Dataset secondDf = spark.createDataFrame(secondBatchRecords, SimpleRecord.class); secondDf.select("id", "data").write().format("iceberg").mode("append").save(tableLocation); Assert.assertEquals("Expected 2 snapshots", 2, Iterables.size(table.snapshots())); // verify records in the current snapshot - Dataset currentSnapshotResult = spark.read() - .format("iceberg") - .load(tableLocation); - List currentSnapshotRecords = currentSnapshotResult.orderBy("id") - .as(Encoders.bean(SimpleRecord.class)) - .collectAsList(); + Dataset currentSnapshotResult = spark.read().format("iceberg").load(tableLocation); + List currentSnapshotRecords = + currentSnapshotResult.orderBy("id").as(Encoders.bean(SimpleRecord.class)).collectAsList(); List expectedRecords = Lists.newArrayList(); expectedRecords.addAll(firstBatchRecords); expectedRecords.addAll(secondBatchRecords); - Assert.assertEquals("Current snapshot rows should match", expectedRecords, currentSnapshotRecords); + Assert.assertEquals( + "Current snapshot rows should match", expectedRecords, currentSnapshotRecords); // verify records in the previous snapshot - Dataset previousSnapshotResult = spark.read() - .format("iceberg") - .option(SparkReadOptions.AS_OF_TIMESTAMP, firstSnapshotTimestamp) - .load(tableLocation); - List previousSnapshotRecords = previousSnapshotResult.orderBy("id") - .as(Encoders.bean(SimpleRecord.class)) - .collectAsList(); - Assert.assertEquals("Previous snapshot rows should match", firstBatchRecords, previousSnapshotRecords); + Dataset previousSnapshotResult = + spark + .read() + .format("iceberg") + .option(SparkReadOptions.AS_OF_TIMESTAMP, firstSnapshotTimestamp) + .load(tableLocation); + List previousSnapshotRecords = + 
previousSnapshotResult.orderBy("id").as(Encoders.bean(SimpleRecord.class)).collectAsList(); + Assert.assertEquals( + "Previous snapshot rows should match", firstBatchRecords, previousSnapshotRecords); } @Test @@ -185,14 +170,11 @@ public void testSnapshotSelectionByInvalidSnapshotId() throws IOException { PartitionSpec spec = PartitionSpec.unpartitioned(); tables.create(SCHEMA, spec, tableLocation); - Dataset df = spark.read() - .format("iceberg") - .option("snapshot-id", -10) - .load(tableLocation); + Dataset df = spark.read().format("iceberg").option("snapshot-id", -10).load(tableLocation); Assertions.assertThatThrownBy(df::collectAsList) - .isInstanceOf(IllegalArgumentException.class) - .hasMessage("Cannot find snapshot with ID -10"); + .isInstanceOf(IllegalArgumentException.class) + .hasMessage("Cannot find snapshot with ID -10"); } @Test @@ -204,12 +186,15 @@ public void testSnapshotSelectionByInvalidTimestamp() throws IOException { PartitionSpec spec = PartitionSpec.unpartitioned(); tables.create(SCHEMA, spec, tableLocation); - Assertions.assertThatThrownBy(() -> spark.read() + Assertions.assertThatThrownBy( + () -> + spark + .read() .format("iceberg") .option(SparkReadOptions.AS_OF_TIMESTAMP, timestamp) .load(tableLocation)) - .isInstanceOf(IllegalArgumentException.class) - .hasMessageContaining("Cannot find a snapshot older than"); + .isInstanceOf(IllegalArgumentException.class) + .hasMessageContaining("Cannot find a snapshot older than"); } @Test @@ -220,24 +205,25 @@ public void testSnapshotSelectionBySnapshotIdAndTimestamp() throws IOException { PartitionSpec spec = PartitionSpec.unpartitioned(); Table table = tables.create(SCHEMA, spec, tableLocation); - List firstBatchRecords = Lists.newArrayList( - new SimpleRecord(1, "a"), - new SimpleRecord(2, "b"), - new SimpleRecord(3, "c") - ); + List firstBatchRecords = + Lists.newArrayList( + new SimpleRecord(1, "a"), new SimpleRecord(2, "b"), new SimpleRecord(3, "c")); Dataset firstDf = spark.createDataFrame(firstBatchRecords, SimpleRecord.class); firstDf.select("id", "data").write().format("iceberg").mode("append").save(tableLocation); long timestamp = System.currentTimeMillis(); long snapshotId = table.currentSnapshot().snapshotId(); - Assertions.assertThatThrownBy(() -> spark.read() + Assertions.assertThatThrownBy( + () -> + spark + .read() .format("iceberg") .option(SparkReadOptions.SNAPSHOT_ID, snapshotId) .option(SparkReadOptions.AS_OF_TIMESTAMP, timestamp) .load(tableLocation)) - .isInstanceOf(IllegalArgumentException.class) - .hasMessageContaining("Cannot specify both snapshot-id") - .hasMessageContaining("and as-of-timestamp"); + .isInstanceOf(IllegalArgumentException.class) + .hasMessageContaining("Cannot specify both snapshot-id") + .hasMessageContaining("and as-of-timestamp"); } } diff --git a/spark/v3.2/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkAppenderFactory.java b/spark/v3.2/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkAppenderFactory.java index bda525780d8b..3fb2a630fe81 100644 --- a/spark/v3.2/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkAppenderFactory.java +++ b/spark/v3.2/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkAppenderFactory.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.spark.source; import java.util.List; @@ -41,13 +40,13 @@ public TestSparkAppenderFactory(String fileFormat, boolean partitioned) { } @Override - protected FileAppenderFactory<InternalRow> createAppenderFactory(List<Integer> equalityFieldIds, - Schema eqDeleteSchema, - Schema posDeleteRowSchema) { + protected FileAppenderFactory<InternalRow> createAppenderFactory( + List<Integer> equalityFieldIds, Schema eqDeleteSchema, Schema posDeleteRowSchema) { return SparkAppenderFactory.builderFor(table, table.schema(), sparkType) .equalityFieldIds(ArrayUtil.toIntArray(equalityFieldIds)) .eqDeleteRowSchema(eqDeleteSchema) - .posDelRowSchema(posDeleteRowSchema).build(); + .posDelRowSchema(posDeleteRowSchema) + .build(); } @Override diff --git a/spark/v3.2/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkCatalog.java b/spark/v3.2/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkCatalog.java index ef7b80a4ba6f..31d5c38e2a43 100644 --- a/spark/v3.2/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkCatalog.java +++ b/spark/v3.2/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkCatalog.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.source; import java.util.Map; @@ -32,12 +31,14 @@ import org.apache.spark.sql.connector.catalog.Table; import org.apache.spark.sql.connector.catalog.TableCatalog; -public class TestSparkCatalog<T extends TableCatalog & SupportsNamespaces> extends SparkSessionCatalog<T> { +public class TestSparkCatalog<T extends TableCatalog & SupportsNamespaces> + extends SparkSessionCatalog<T> { private static final Map<Identifier, Table> tableMap = Maps.newHashMap(); public static void setTable(Identifier ident, Table table) { - Preconditions.checkArgument(!tableMap.containsKey(ident), "Cannot set " + ident + ". It is already set"); + Preconditions.checkArgument( + !tableMap.containsKey(ident), "Cannot set " + ident + ". It is already set"); tableMap.put(ident, table); } diff --git a/spark/v3.2/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkCatalogCacheExpiration.java b/spark/v3.2/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkCatalogCacheExpiration.java index 96aeed65bfa7..3d668197fd51 100644 --- a/spark/v3.2/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkCatalogCacheExpiration.java +++ b/spark/v3.2/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkCatalogCacheExpiration.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License.
*/ - package org.apache.iceberg.spark.source; import java.util.Map; @@ -36,12 +35,16 @@ public class TestSparkCatalogCacheExpiration extends SparkTestBaseWithCatalog { private static final String sessionCatalogName = "spark_catalog"; private static final String sessionCatalogImpl = SparkSessionCatalog.class.getName(); - private static final Map<String, String> sessionCatalogConfig = ImmutableMap.of( - "type", "hadoop", - "default-namespace", "default", - CatalogProperties.CACHE_ENABLED, "true", - CatalogProperties.CACHE_EXPIRATION_INTERVAL_MS, "3000" - ); + private static final Map<String, String> sessionCatalogConfig = + ImmutableMap.of( + "type", + "hadoop", + "default-namespace", + "default", + CatalogProperties.CACHE_ENABLED, + "true", + CatalogProperties.CACHE_EXPIRATION_INTERVAL_MS, + "3000"); private static String asSqlConfCatalogKeyFor(String catalog, String configKey) { // configKey is empty when the catalog's class is being defined @@ -58,19 +61,29 @@ private static String asSqlConfCatalogKeyFor(String catalog, String configKey) { public static void beforeClass() { // Catalog - expiration_disabled: Catalog with caching on and expiration disabled. ImmutableMap.of( - "", "org.apache.iceberg.spark.SparkCatalog", - "type", "hive", - CatalogProperties.CACHE_ENABLED, "true", - CatalogProperties.CACHE_EXPIRATION_INTERVAL_MS, "-1" - ).forEach((k, v) -> spark.conf().set(asSqlConfCatalogKeyFor("expiration_disabled", k), v)); - - // Catalog - cache_disabled_implicitly: Catalog that does not cache, as the cache expiration interval is 0. + "", + "org.apache.iceberg.spark.SparkCatalog", + "type", + "hive", + CatalogProperties.CACHE_ENABLED, + "true", + CatalogProperties.CACHE_EXPIRATION_INTERVAL_MS, + "-1") + .forEach((k, v) -> spark.conf().set(asSqlConfCatalogKeyFor("expiration_disabled", k), v)); + + // Catalog - cache_disabled_implicitly: Catalog that does not cache, as the cache expiration + // interval is 0.
ImmutableMap.of( - "", "org.apache.iceberg.spark.SparkCatalog", - "type", "hive", - CatalogProperties.CACHE_ENABLED, "true", - CatalogProperties.CACHE_EXPIRATION_INTERVAL_MS, "0" - ).forEach((k, v) -> spark.conf().set(asSqlConfCatalogKeyFor("cache_disabled_implicitly", k), v)); + "", + "org.apache.iceberg.spark.SparkCatalog", + "type", + "hive", + CatalogProperties.CACHE_ENABLED, + "true", + CatalogProperties.CACHE_EXPIRATION_INTERVAL_MS, + "0") + .forEach( + (k, v) -> spark.conf().set(asSqlConfCatalogKeyFor("cache_disabled_implicitly", k), v)); } public TestSparkCatalogCacheExpiration() { @@ -85,56 +98,55 @@ public void testSparkSessionCatalogWithExpirationEnabled() { .extracting("cacheEnabled") .isEqualTo(true); - Assertions - .assertThat(sparkCatalog) + Assertions.assertThat(sparkCatalog) .extracting("icebergCatalog") .extracting("icebergCatalog") - .isInstanceOfSatisfying(Catalog.class, icebergCatalog -> { - Assertions.assertThat(icebergCatalog) - .isExactlyInstanceOf(CachingCatalog.class) - .extracting("expirationIntervalMillis") - .isEqualTo(3000L); - }); + .isInstanceOfSatisfying( + Catalog.class, + icebergCatalog -> { + Assertions.assertThat(icebergCatalog) + .isExactlyInstanceOf(CachingCatalog.class) + .extracting("expirationIntervalMillis") + .isEqualTo(3000L); + }); } @Test public void testCacheEnabledAndExpirationDisabled() { SparkCatalog sparkCatalog = getSparkCatalog("expiration_disabled"); - Assertions.assertThat(sparkCatalog) - .extracting("cacheEnabled") - .isEqualTo(true); + Assertions.assertThat(sparkCatalog).extracting("cacheEnabled").isEqualTo(true); - Assertions - .assertThat(sparkCatalog) + Assertions.assertThat(sparkCatalog) .extracting("icebergCatalog") - .isInstanceOfSatisfying(CachingCatalog.class, icebergCatalog -> { - Assertions.assertThat(icebergCatalog) - .extracting("expirationIntervalMillis") - .isEqualTo(-1L); - }); + .isInstanceOfSatisfying( + CachingCatalog.class, + icebergCatalog -> { + Assertions.assertThat(icebergCatalog) + .extracting("expirationIntervalMillis") + .isEqualTo(-1L); + }); } @Test public void testCacheDisabledImplicitly() { SparkCatalog sparkCatalog = getSparkCatalog("cache_disabled_implicitly"); - Assertions.assertThat(sparkCatalog) - .extracting("cacheEnabled") - .isEqualTo(false); + Assertions.assertThat(sparkCatalog).extracting("cacheEnabled").isEqualTo(false); - Assertions - .assertThat(sparkCatalog) + Assertions.assertThat(sparkCatalog) .extracting("icebergCatalog") .isInstanceOfSatisfying( Catalog.class, - icebergCatalog -> Assertions.assertThat(icebergCatalog).isNotInstanceOf(CachingCatalog.class)); + icebergCatalog -> + Assertions.assertThat(icebergCatalog).isNotInstanceOf(CachingCatalog.class)); } private SparkSessionCatalog sparkSessionCatalog() { - TableCatalog catalog = (TableCatalog) spark.sessionState().catalogManager().catalog("spark_catalog"); + TableCatalog catalog = + (TableCatalog) spark.sessionState().catalogManager().catalog("spark_catalog"); return (SparkSessionCatalog) catalog; } - private SparkCatalog getSparkCatalog(String catalog) { + private SparkCatalog getSparkCatalog(String catalog) { return (SparkCatalog) spark.sessionState().catalogManager().catalog(catalog); } } diff --git a/spark/v3.2/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkCatalogHadoopOverrides.java b/spark/v3.2/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkCatalogHadoopOverrides.java index 267270308de5..607f1d45ba3a 100644 --- 
a/spark/v3.2/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkCatalogHadoopOverrides.java +++ b/spark/v3.2/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkCatalogHadoopOverrides.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.source; import java.util.Map; @@ -37,7 +36,6 @@ import org.junit.Test; import org.junit.runners.Parameterized; - public class TestSparkCatalogHadoopOverrides extends SparkCatalogTestBase { private static final String configToOverride = "fs.s3a.buffer.dir"; @@ -49,29 +47,38 @@ public class TestSparkCatalogHadoopOverrides extends SparkCatalogTestBase { @Parameterized.Parameters(name = "catalogName = {0}, implementation = {1}, config = {2}") public static Object[][] parameters() { return new Object[][] { - { "testhive", SparkCatalog.class.getName(), - ImmutableMap.of( - "type", "hive", - "default-namespace", "default", - hadoopPrefixedConfigToOverride, configOverrideValue - ) }, - { "testhadoop", SparkCatalog.class.getName(), - ImmutableMap.of( - "type", "hadoop", - hadoopPrefixedConfigToOverride, configOverrideValue - ) }, - { "spark_catalog", SparkSessionCatalog.class.getName(), - ImmutableMap.of( - "type", "hive", - "default-namespace", "default", - hadoopPrefixedConfigToOverride, configOverrideValue - ) } + { + "testhive", + SparkCatalog.class.getName(), + ImmutableMap.of( + "type", + "hive", + "default-namespace", + "default", + hadoopPrefixedConfigToOverride, + configOverrideValue) + }, + { + "testhadoop", + SparkCatalog.class.getName(), + ImmutableMap.of("type", "hadoop", hadoopPrefixedConfigToOverride, configOverrideValue) + }, + { + "spark_catalog", + SparkSessionCatalog.class.getName(), + ImmutableMap.of( + "type", + "hive", + "default-namespace", + "default", + hadoopPrefixedConfigToOverride, + configOverrideValue) + } }; } - public TestSparkCatalogHadoopOverrides(String catalogName, - String implementation, - Map<String, String> config) { + public TestSparkCatalogHadoopOverrides( + String catalogName, String implementation, Map<String, String> config) { super(catalogName, implementation, config); } @@ -92,7 +99,8 @@ public void testTableFromCatalogHasOverrides() throws Exception { String actualCatalogOverride = conf.get(configToOverride, "/whammies"); Assert.assertEquals( "Iceberg tables from spark should have the overridden hadoop configurations from the spark config", - configOverrideValue, actualCatalogOverride); + configOverrideValue, + actualCatalogOverride); } @Test @@ -102,16 +110,19 @@ public void ensureRoundTripSerializedTableRetainsHadoopConfig() throws Exception String actualCatalogOverride = originalConf.get(configToOverride, "/whammies"); Assert.assertEquals( "Iceberg tables from spark should have the overridden hadoop configurations from the spark config", - configOverrideValue, actualCatalogOverride); + configOverrideValue, + actualCatalogOverride); // Now convert to SerializableTable and ensure overridden property is still present.
Table serializableTable = SerializableTableWithSize.copyOf(table); - Table kryoSerializedTable = KryoHelpers.roundTripSerialize(SerializableTableWithSize.copyOf(table)); + Table kryoSerializedTable = + KryoHelpers.roundTripSerialize(SerializableTableWithSize.copyOf(table)); Configuration configFromKryoSerde = ((Configurable) kryoSerializedTable.io()).getConf(); String kryoSerializedCatalogOverride = configFromKryoSerde.get(configToOverride, "/whammies"); Assert.assertEquals( "Tables serialized with Kryo serialization should retain overridden hadoop configuration properties", - configOverrideValue, kryoSerializedCatalogOverride); + configOverrideValue, + kryoSerializedCatalogOverride); // Do the same for Java based serde Table javaSerializedTable = TestHelpers.roundTripSerialize(serializableTable); @@ -119,14 +130,16 @@ public void ensureRoundTripSerializedTableRetainsHadoopConfig() throws Exception String javaSerializedCatalogOverride = configFromJavaSerde.get(configToOverride, "/whammies"); Assert.assertEquals( "Tables serialized with Java serialization should retain overridden hadoop configuration properties", - configOverrideValue, javaSerializedCatalogOverride); + configOverrideValue, + javaSerializedCatalogOverride); } @SuppressWarnings("ThrowSpecificity") private Table getIcebergTableFromSparkCatalog() throws Exception { Identifier identifier = Identifier.of(tableIdent.namespace().levels(), tableIdent.name()); - TableCatalog catalog = (TableCatalog) spark.sessionState().catalogManager().catalog(catalogName); - SparkTable sparkTable = (SparkTable) catalog.loadTable(identifier); + TableCatalog catalog = + (TableCatalog) spark.sessionState().catalogManager().catalog(catalogName); + SparkTable sparkTable = (SparkTable) catalog.loadTable(identifier); return sparkTable.table(); } } diff --git a/spark/v3.2/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkDataFile.java b/spark/v3.2/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkDataFile.java index cd1404766d46..b1f2082b5d9b 100644 --- a/spark/v3.2/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkDataFile.java +++ b/spark/v3.2/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkDataFile.java @@ -16,9 +16,11 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.spark.source; +import static org.apache.iceberg.types.Types.NestedField.optional; +import static org.apache.iceberg.types.Types.NestedField.required; + import java.io.File; import java.io.IOException; import java.util.Arrays; @@ -62,43 +64,42 @@ import org.junit.Test; import org.junit.rules.TemporaryFolder; -import static org.apache.iceberg.types.Types.NestedField.optional; -import static org.apache.iceberg.types.Types.NestedField.required; - public class TestSparkDataFile { private static final HadoopTables TABLES = new HadoopTables(new Configuration()); - private static final Schema SCHEMA = new Schema( - required(100, "id", Types.LongType.get()), - optional(101, "data", Types.StringType.get()), - required(102, "b", Types.BooleanType.get()), - optional(103, "i", Types.IntegerType.get()), - required(104, "l", Types.LongType.get()), - optional(105, "f", Types.FloatType.get()), - required(106, "d", Types.DoubleType.get()), - optional(107, "date", Types.DateType.get()), - required(108, "ts", Types.TimestampType.withZone()), - required(110, "s", Types.StringType.get()), - optional(113, "bytes", Types.BinaryType.get()), - required(114, "dec_9_0", Types.DecimalType.of(9, 0)), - required(115, "dec_11_2", Types.DecimalType.of(11, 2)), - required(116, "dec_38_10", Types.DecimalType.of(38, 10)) // maximum precision - ); - private static final PartitionSpec SPEC = PartitionSpec.builderFor(SCHEMA) - .identity("b") - .bucket("i", 2) - .identity("l") - .identity("f") - .identity("d") - .identity("date") - .hour("ts") - .identity("ts") - .truncate("s", 2) - .identity("bytes") - .bucket("dec_9_0", 2) - .bucket("dec_11_2", 2) - .bucket("dec_38_10", 2) - .build(); + private static final Schema SCHEMA = + new Schema( + required(100, "id", Types.LongType.get()), + optional(101, "data", Types.StringType.get()), + required(102, "b", Types.BooleanType.get()), + optional(103, "i", Types.IntegerType.get()), + required(104, "l", Types.LongType.get()), + optional(105, "f", Types.FloatType.get()), + required(106, "d", Types.DoubleType.get()), + optional(107, "date", Types.DateType.get()), + required(108, "ts", Types.TimestampType.withZone()), + required(110, "s", Types.StringType.get()), + optional(113, "bytes", Types.BinaryType.get()), + required(114, "dec_9_0", Types.DecimalType.of(9, 0)), + required(115, "dec_11_2", Types.DecimalType.of(11, 2)), + required(116, "dec_38_10", Types.DecimalType.of(38, 10)) // maximum precision + ); + private static final PartitionSpec SPEC = + PartitionSpec.builderFor(SCHEMA) + .identity("b") + .bucket("i", 2) + .identity("l") + .identity("f") + .identity("d") + .identity("date") + .hour("ts") + .identity("ts") + .truncate("s", 2) + .identity("bytes") + .bucket("dec_9_0", 2) + .bucket("dec_11_2", 2) + .bucket("dec_38_10", 2) + .build(); private static SparkSession spark; private static JavaSparkContext sparkContext = null; @@ -117,8 +118,7 @@ public static void stopSpark() { currentSpark.stop(); } - @Rule - public TemporaryFolder temp = new TemporaryFolder(); + @Rule public TemporaryFolder temp = new TemporaryFolder(); private String tableLocation = null; @Before @@ -129,7 +129,8 @@ public void setupTableLocation() throws Exception { @Test public void testValueConversion() throws IOException { - Table table = TABLES.create(SCHEMA, PartitionSpec.unpartitioned(), Maps.newHashMap(), tableLocation); + Table table = + TABLES.create(SCHEMA, PartitionSpec.unpartitioned(), Maps.newHashMap(), tableLocation); checkSparkDataFile(table); } @@ -150,7 +151,9 @@ 
public void testValueConversionWithEmptyStats() throws IOException { private void checkSparkDataFile(Table table) throws IOException { Iterable rows = RandomData.generateSpark(table.schema(), 200, 0); JavaRDD rdd = sparkContext.parallelize(Lists.newArrayList(rows)); - Dataset df = spark.internalCreateDataFrame(JavaRDD.toRDD(rdd), SparkSchemaUtil.convert(table.schema()), false); + Dataset df = + spark.internalCreateDataFrame( + JavaRDD.toRDD(rdd), SparkSchemaUtil.convert(table.schema()), false); df.write().format("iceberg").mode("append").save(tableLocation); @@ -170,16 +173,15 @@ private void checkSparkDataFile(Table table) throws IOException { Dataset dataFileDF = spark.read().format("iceberg").load(tableLocation + "#files"); // reorder columns to test arbitrary projections - List columns = Arrays.stream(dataFileDF.columns()) - .map(ColumnName::new) - .collect(Collectors.toList()); + List columns = + Arrays.stream(dataFileDF.columns()).map(ColumnName::new).collect(Collectors.toList()); Collections.shuffle(columns); - List sparkDataFiles = dataFileDF - .select(Iterables.toArray(columns, Column.class)) - .collectAsList(); + List sparkDataFiles = + dataFileDF.select(Iterables.toArray(columns, Column.class)).collectAsList(); - Assert.assertEquals("The number of files should match", dataFiles.size(), sparkDataFiles.size()); + Assert.assertEquals( + "The number of files should match", dataFiles.size(), sparkDataFiles.size()); Types.StructType dataFileType = DataFile.getType(table.spec().partitionType()); StructType sparkDataFileType = sparkDataFiles.get(0).schema(); @@ -195,9 +197,14 @@ private void checkDataFile(DataFile expected, DataFile actual) { Assert.assertEquals("Format must match", expected.format(), actual.format()); Assert.assertEquals("Record count must match", expected.recordCount(), actual.recordCount()); Assert.assertEquals("Size must match", expected.fileSizeInBytes(), actual.fileSizeInBytes()); - Assert.assertEquals("Record value counts must match", expected.valueCounts(), actual.valueCounts()); - Assert.assertEquals("Record null value counts must match", expected.nullValueCounts(), actual.nullValueCounts()); - Assert.assertEquals("Record nan value counts must match", expected.nanValueCounts(), actual.nanValueCounts()); + Assert.assertEquals( + "Record value counts must match", expected.valueCounts(), actual.valueCounts()); + Assert.assertEquals( + "Record null value counts must match", + expected.nullValueCounts(), + actual.nullValueCounts()); + Assert.assertEquals( + "Record nan value counts must match", expected.nanValueCounts(), actual.nanValueCounts()); Assert.assertEquals("Lower bounds must match", expected.lowerBounds(), actual.lowerBounds()); Assert.assertEquals("Upper bounds must match", expected.upperBounds(), actual.upperBounds()); Assert.assertEquals("Key metadata must match", expected.keyMetadata(), actual.keyMetadata()); @@ -210,7 +217,8 @@ private void checkDataFile(DataFile expected, DataFile actual) { private void checkStructLike(StructLike expected, StructLike actual) { Assert.assertEquals("Struct size should match", expected.size(), actual.size()); for (int i = 0; i < expected.size(); i++) { - Assert.assertEquals("Struct values must match", expected.get(i, Object.class), actual.get(i, Object.class)); + Assert.assertEquals( + "Struct values must match", expected.get(i, Object.class), actual.get(i, Object.class)); } } } diff --git a/spark/v3.2/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkDataWrite.java 
b/spark/v3.2/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkDataWrite.java index 5b158c518ae4..b2db853d4753 100644 --- a/spark/v3.2/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkDataWrite.java +++ b/spark/v3.2/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkDataWrite.java @@ -16,9 +16,14 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.source; +import static org.apache.iceberg.TableProperties.SPARK_WRITE_PARTITIONED_FANOUT_ENABLED; +import static org.apache.iceberg.types.Types.NestedField.optional; +import static org.mockito.Mockito.doAnswer; +import static org.mockito.Mockito.spy; +import static org.mockito.Mockito.when; + import java.io.File; import java.io.IOException; import java.util.List; @@ -56,28 +61,20 @@ import org.junit.runner.RunWith; import org.junit.runners.Parameterized; -import static org.apache.iceberg.TableProperties.SPARK_WRITE_PARTITIONED_FANOUT_ENABLED; -import static org.apache.iceberg.types.Types.NestedField.optional; -import static org.mockito.Mockito.doAnswer; -import static org.mockito.Mockito.spy; -import static org.mockito.Mockito.when; - @RunWith(Parameterized.class) public class TestSparkDataWrite { private static final Configuration CONF = new Configuration(); private final FileFormat format; private static SparkSession spark = null; - private static final Schema SCHEMA = new Schema( - optional(1, "id", Types.IntegerType.get()), - optional(2, "data", Types.StringType.get()) - ); + private static final Schema SCHEMA = + new Schema( + optional(1, "id", Types.IntegerType.get()), optional(2, "data", Types.StringType.get())); - @Rule - public TemporaryFolder temp = new TemporaryFolder(); + @Rule public TemporaryFolder temp = new TemporaryFolder(); @Parameterized.Parameters(name = "format = {0}") public static Object[] parameters() { - return new Object[] { "parquet", "avro", "orc" }; + return new Object[] {"parquet", "avro", "orc"}; } @BeforeClass @@ -110,15 +107,14 @@ public void testBasicWrite() throws IOException { PartitionSpec spec = PartitionSpec.builderFor(SCHEMA).identity("data").build(); Table table = tables.create(SCHEMA, spec, location.toString()); - List expected = Lists.newArrayList( - new SimpleRecord(1, "a"), - new SimpleRecord(2, "b"), - new SimpleRecord(3, "c") - ); + List expected = + Lists.newArrayList( + new SimpleRecord(1, "a"), new SimpleRecord(2, "b"), new SimpleRecord(3, "c")); Dataset df = spark.createDataFrame(expected, SimpleRecord.class); // TODO: incoming columns must be ordered according to the table's schema - df.select("id", "data").write() + df.select("id", "data") + .write() .format("iceberg") .option(SparkWriteOptions.WRITE_FORMAT, format.toString()) .mode(SaveMode.Append) @@ -126,11 +122,10 @@ public void testBasicWrite() throws IOException { table.refresh(); - Dataset result = spark.read() - .format("iceberg") - .load(location.toString()); + Dataset result = spark.read().format("iceberg").load(location.toString()); - List actual = result.orderBy("id").as(Encoders.bean(SimpleRecord.class)).collectAsList(); + List actual = + result.orderBy("id").as(Encoders.bean(SimpleRecord.class)).collectAsList(); Assert.assertEquals("Number of rows should match", expected.size(), actual.size()); Assert.assertEquals("Result rows should match", expected, actual); for (ManifestFile manifest : table.currentSnapshot().allManifests(table.io())) { @@ -161,30 +156,31 @@ public void testAppend() throws IOException { PartitionSpec spec = 
PartitionSpec.builderFor(SCHEMA).identity("data").build(); Table table = tables.create(SCHEMA, spec, location.toString()); - List records = Lists.newArrayList( - new SimpleRecord(1, "a"), - new SimpleRecord(2, "b"), - new SimpleRecord(3, "c") - ); - - List expected = Lists.newArrayList( - new SimpleRecord(1, "a"), - new SimpleRecord(2, "b"), - new SimpleRecord(3, "c"), - new SimpleRecord(4, "a"), - new SimpleRecord(5, "b"), - new SimpleRecord(6, "c") - ); + List records = + Lists.newArrayList( + new SimpleRecord(1, "a"), new SimpleRecord(2, "b"), new SimpleRecord(3, "c")); + + List expected = + Lists.newArrayList( + new SimpleRecord(1, "a"), + new SimpleRecord(2, "b"), + new SimpleRecord(3, "c"), + new SimpleRecord(4, "a"), + new SimpleRecord(5, "b"), + new SimpleRecord(6, "c")); Dataset df = spark.createDataFrame(records, SimpleRecord.class); - df.select("id", "data").write() + df.select("id", "data") + .write() .format("iceberg") .option(SparkWriteOptions.WRITE_FORMAT, format.toString()) .mode(SaveMode.Append) .save(location.toString()); - df.withColumn("id", df.col("id").plus(3)).select("id", "data").write() + df.withColumn("id", df.col("id").plus(3)) + .select("id", "data") + .write() .format("iceberg") .option(SparkWriteOptions.WRITE_FORMAT, format.toString()) .mode(SaveMode.Append) @@ -192,11 +188,10 @@ public void testAppend() throws IOException { table.refresh(); - Dataset result = spark.read() - .format("iceberg") - .load(location.toString()); + Dataset result = spark.read().format("iceberg").load(location.toString()); - List actual = result.orderBy("id").as(Encoders.bean(SimpleRecord.class)).collectAsList(); + List actual = + result.orderBy("id").as(Encoders.bean(SimpleRecord.class)).collectAsList(); Assert.assertEquals("Number of rows should match", expected.size(), actual.size()); Assert.assertEquals("Result rows should match", expected, actual); } @@ -210,23 +205,24 @@ public void testEmptyOverwrite() throws IOException { PartitionSpec spec = PartitionSpec.builderFor(SCHEMA).identity("id").build(); Table table = tables.create(SCHEMA, spec, location.toString()); - List records = Lists.newArrayList( - new SimpleRecord(1, "a"), - new SimpleRecord(2, "b"), - new SimpleRecord(3, "c") - ); + List records = + Lists.newArrayList( + new SimpleRecord(1, "a"), new SimpleRecord(2, "b"), new SimpleRecord(3, "c")); List expected = records; Dataset df = spark.createDataFrame(records, SimpleRecord.class); - df.select("id", "data").write() + df.select("id", "data") + .write() .format("iceberg") .option(SparkWriteOptions.WRITE_FORMAT, format.toString()) .mode(SaveMode.Append) .save(location.toString()); Dataset empty = spark.createDataFrame(ImmutableList.of(), SimpleRecord.class); - empty.select("id", "data").write() + empty + .select("id", "data") + .write() .format("iceberg") .option(SparkWriteOptions.WRITE_FORMAT, format.toString()) .mode(SaveMode.Overwrite) @@ -235,11 +231,10 @@ public void testEmptyOverwrite() throws IOException { table.refresh(); - Dataset result = spark.read() - .format("iceberg") - .load(location.toString()); + Dataset result = spark.read().format("iceberg").load(location.toString()); - List actual = result.orderBy("id").as(Encoders.bean(SimpleRecord.class)).collectAsList(); + List actual = + result.orderBy("id").as(Encoders.bean(SimpleRecord.class)).collectAsList(); Assert.assertEquals("Number of rows should match", expected.size(), actual.size()); Assert.assertEquals("Result rows should match", expected, actual); } @@ -253,30 +248,31 @@ public void testOverwrite() 
throws IOException { PartitionSpec spec = PartitionSpec.builderFor(SCHEMA).identity("id").build(); Table table = tables.create(SCHEMA, spec, location.toString()); - List records = Lists.newArrayList( - new SimpleRecord(1, "a"), - new SimpleRecord(2, "b"), - new SimpleRecord(3, "c") - ); + List records = + Lists.newArrayList( + new SimpleRecord(1, "a"), new SimpleRecord(2, "b"), new SimpleRecord(3, "c")); - List expected = Lists.newArrayList( - new SimpleRecord(1, "a"), - new SimpleRecord(2, "a"), - new SimpleRecord(3, "c"), - new SimpleRecord(4, "b"), - new SimpleRecord(6, "c") - ); + List expected = + Lists.newArrayList( + new SimpleRecord(1, "a"), + new SimpleRecord(2, "a"), + new SimpleRecord(3, "c"), + new SimpleRecord(4, "b"), + new SimpleRecord(6, "c")); Dataset df = spark.createDataFrame(records, SimpleRecord.class); - df.select("id", "data").write() + df.select("id", "data") + .write() .format("iceberg") .option(SparkWriteOptions.WRITE_FORMAT, format.toString()) .mode(SaveMode.Append) .save(location.toString()); // overwrite with 2*id to replace record 2, append 4 and 6 - df.withColumn("id", df.col("id").multiply(2)).select("id", "data").write() + df.withColumn("id", df.col("id").multiply(2)) + .select("id", "data") + .write() .format("iceberg") .option(SparkWriteOptions.WRITE_FORMAT, format.toString()) .mode(SaveMode.Overwrite) @@ -285,11 +281,10 @@ public void testOverwrite() throws IOException { table.refresh(); - Dataset result = spark.read() - .format("iceberg") - .load(location.toString()); + Dataset result = spark.read().format("iceberg").load(location.toString()); - List actual = result.orderBy("id").as(Encoders.bean(SimpleRecord.class)).collectAsList(); + List actual = + result.orderBy("id").as(Encoders.bean(SimpleRecord.class)).collectAsList(); Assert.assertEquals("Number of rows should match", expected.size(), actual.size()); Assert.assertEquals("Result rows should match", expected, actual); } @@ -303,22 +298,22 @@ public void testUnpartitionedOverwrite() throws IOException { PartitionSpec spec = PartitionSpec.unpartitioned(); Table table = tables.create(SCHEMA, spec, location.toString()); - List expected = Lists.newArrayList( - new SimpleRecord(1, "a"), - new SimpleRecord(2, "b"), - new SimpleRecord(3, "c") - ); + List expected = + Lists.newArrayList( + new SimpleRecord(1, "a"), new SimpleRecord(2, "b"), new SimpleRecord(3, "c")); Dataset df = spark.createDataFrame(expected, SimpleRecord.class); - df.select("id", "data").write() + df.select("id", "data") + .write() .format("iceberg") .option(SparkWriteOptions.WRITE_FORMAT, format.toString()) .mode(SaveMode.Append) .save(location.toString()); // overwrite with the same data; should not produce two copies - df.select("id", "data").write() + df.select("id", "data") + .write() .format("iceberg") .option(SparkWriteOptions.WRITE_FORMAT, format.toString()) .mode(SaveMode.Overwrite) @@ -326,11 +321,10 @@ public void testUnpartitionedOverwrite() throws IOException { table.refresh(); - Dataset result = spark.read() - .format("iceberg") - .load(location.toString()); + Dataset result = spark.read().format("iceberg").load(location.toString()); - List actual = result.orderBy("id").as(Encoders.bean(SimpleRecord.class)).collectAsList(); + List actual = + result.orderBy("id").as(Encoders.bean(SimpleRecord.class)).collectAsList(); Assert.assertEquals("Number of rows should match", expected.size(), actual.size()); Assert.assertEquals("Result rows should match", expected, actual); } @@ -344,7 +338,8 @@ public void 
testUnpartitionedCreateWithTargetFileSizeViaTableProperties() throws PartitionSpec spec = PartitionSpec.unpartitioned(); Table table = tables.create(SCHEMA, spec, location.toString()); - table.updateProperties() + table + .updateProperties() .set(TableProperties.WRITE_TARGET_FILE_SIZE_BYTES, "4") // ~4 bytes; low enough to trigger .commit(); @@ -355,7 +350,8 @@ public void testUnpartitionedCreateWithTargetFileSizeViaTableProperties() throws Dataset df = spark.createDataFrame(expected, SimpleRecord.class); - df.select("id", "data").write() + df.select("id", "data") + .write() .format("iceberg") .option(SparkWriteOptions.WRITE_FORMAT, format.toString()) .mode(SaveMode.Append) @@ -363,11 +359,10 @@ public void testUnpartitionedCreateWithTargetFileSizeViaTableProperties() throws table.refresh(); - Dataset result = spark.read() - .format("iceberg") - .load(location.toString()); + Dataset result = spark.read().format("iceberg").load(location.toString()); - List actual = result.orderBy("id").as(Encoders.bean(SimpleRecord.class)).collectAsList(); + List actual = + result.orderBy("id").as(Encoders.bean(SimpleRecord.class)).collectAsList(); Assert.assertEquals("Number of rows should match", expected.size(), actual.size()); Assert.assertEquals("Result rows should match", expected, actual); @@ -379,7 +374,8 @@ public void testUnpartitionedCreateWithTargetFileSizeViaTableProperties() throws } Assert.assertEquals("Should have 4 DataFiles", 4, files.size()); - Assert.assertTrue("All DataFiles contain 1000 rows", files.stream().allMatch(d -> d.recordCount() == 1000)); + Assert.assertTrue( + "All DataFiles contain 1000 rows", files.stream().allMatch(d -> d.recordCount() == 1000)); } @Test @@ -410,15 +406,14 @@ public void testWriteProjection() throws IOException { PartitionSpec spec = PartitionSpec.unpartitioned(); Table table = tables.create(SCHEMA, spec, location.toString()); - List expected = Lists.newArrayList( - new SimpleRecord(1, null), - new SimpleRecord(2, null), - new SimpleRecord(3, null) - ); + List expected = + Lists.newArrayList( + new SimpleRecord(1, null), new SimpleRecord(2, null), new SimpleRecord(3, null)); Dataset df = spark.createDataFrame(expected, SimpleRecord.class); - df.select("id").write() // select only id column + df.select("id") + .write() // select only id column .format("iceberg") .option(SparkWriteOptions.WRITE_FORMAT, format.toString()) .mode(SaveMode.Append) @@ -426,11 +421,10 @@ public void testWriteProjection() throws IOException { table.refresh(); - Dataset result = spark.read() - .format("iceberg") - .load(location.toString()); + Dataset result = spark.read().format("iceberg").load(location.toString()); - List actual = result.orderBy("id").as(Encoders.bean(SimpleRecord.class)).collectAsList(); + List actual = + result.orderBy("id").as(Encoders.bean(SimpleRecord.class)).collectAsList(); Assert.assertEquals("Number of rows should match", expected.size(), actual.size()); Assert.assertEquals("Result rows should match", expected, actual); } @@ -446,22 +440,23 @@ public void testWriteProjectionWithMiddle() throws IOException { HadoopTables tables = new HadoopTables(CONF); PartitionSpec spec = PartitionSpec.unpartitioned(); - Schema schema = new Schema( - optional(1, "c1", Types.IntegerType.get()), - optional(2, "c2", Types.StringType.get()), - optional(3, "c3", Types.StringType.get()) - ); + Schema schema = + new Schema( + optional(1, "c1", Types.IntegerType.get()), + optional(2, "c2", Types.StringType.get()), + optional(3, "c3", Types.StringType.get())); Table table = 
tables.create(schema, spec, location.toString()); - List expected = Lists.newArrayList( - new ThreeColumnRecord(1, null, "hello"), - new ThreeColumnRecord(2, null, "world"), - new ThreeColumnRecord(3, null, null) - ); + List expected = + Lists.newArrayList( + new ThreeColumnRecord(1, null, "hello"), + new ThreeColumnRecord(2, null, "world"), + new ThreeColumnRecord(3, null, null)); Dataset df = spark.createDataFrame(expected, ThreeColumnRecord.class); - df.select("c1", "c3").write() + df.select("c1", "c3") + .write() .format("iceberg") .option(SparkWriteOptions.WRITE_FORMAT, format.toString()) .mode(SaveMode.Append) @@ -469,11 +464,10 @@ public void testWriteProjectionWithMiddle() throws IOException { table.refresh(); - Dataset result = spark.read() - .format("iceberg") - .load(location.toString()); + Dataset result = spark.read().format("iceberg").load(location.toString()); - List actual = result.orderBy("c1").as(Encoders.bean(ThreeColumnRecord.class)).collectAsList(); + List actual = + result.orderBy("c1").as(Encoders.bean(ThreeColumnRecord.class)).collectAsList(); Assert.assertEquals("Number of rows should match", expected.size(), actual.size()); Assert.assertEquals("Result rows should match", expected, actual); } @@ -487,44 +481,39 @@ public void testViewsReturnRecentResults() throws IOException { PartitionSpec spec = PartitionSpec.builderFor(SCHEMA).identity("data").build(); tables.create(SCHEMA, spec, location.toString()); - List records = Lists.newArrayList( - new SimpleRecord(1, "a"), - new SimpleRecord(2, "b"), - new SimpleRecord(3, "c") - ); + List records = + Lists.newArrayList( + new SimpleRecord(1, "a"), new SimpleRecord(2, "b"), new SimpleRecord(3, "c")); Dataset df = spark.createDataFrame(records, SimpleRecord.class); - df.select("id", "data").write() + df.select("id", "data") + .write() .format("iceberg") .option(SparkWriteOptions.WRITE_FORMAT, format.toString()) .mode(SaveMode.Append) .save(location.toString()); - Dataset query = spark.read() - .format("iceberg") - .load(location.toString()) - .where("id = 1"); + Dataset query = spark.read().format("iceberg").load(location.toString()).where("id = 1"); query.createOrReplaceTempView("tmp"); - List actual1 = spark.table("tmp").as(Encoders.bean(SimpleRecord.class)).collectAsList(); - List expected1 = Lists.newArrayList( - new SimpleRecord(1, "a") - ); + List actual1 = + spark.table("tmp").as(Encoders.bean(SimpleRecord.class)).collectAsList(); + List expected1 = Lists.newArrayList(new SimpleRecord(1, "a")); Assert.assertEquals("Number of rows should match", expected1.size(), actual1.size()); Assert.assertEquals("Result rows should match", expected1, actual1); - df.select("id", "data").write() + df.select("id", "data") + .write() .format("iceberg") .option(SparkWriteOptions.WRITE_FORMAT, format.toString()) .mode(SaveMode.Append) .save(location.toString()); - List actual2 = spark.table("tmp").as(Encoders.bean(SimpleRecord.class)).collectAsList(); - List expected2 = Lists.newArrayList( - new SimpleRecord(1, "a"), - new SimpleRecord(1, "a") - ); + List actual2 = + spark.table("tmp").as(Encoders.bean(SimpleRecord.class)).collectAsList(); + List expected2 = + Lists.newArrayList(new SimpleRecord(1, "a"), new SimpleRecord(1, "a")); Assert.assertEquals("Number of rows should match", expected2.size(), actual2.size()); Assert.assertEquals("Result rows should match", expected2, actual2); } @@ -550,7 +539,9 @@ public void partitionedCreateWithTargetFileSizeViaOption(IcebergOptionsType opti switch (option) { case NONE: - df.select("id", 
"data").sort("data").write() + df.select("id", "data") + .sort("data") + .write() .format("iceberg") .option(SparkWriteOptions.WRITE_FORMAT, format.toString()) .mode(SaveMode.Append) @@ -559,7 +550,8 @@ public void partitionedCreateWithTargetFileSizeViaOption(IcebergOptionsType opti break; case TABLE: table.updateProperties().set(SPARK_WRITE_PARTITIONED_FANOUT_ENABLED, "true").commit(); - df.select("id", "data").write() + df.select("id", "data") + .write() .format("iceberg") .option(SparkWriteOptions.WRITE_FORMAT, format.toString()) .mode(SaveMode.Append) @@ -567,7 +559,8 @@ public void partitionedCreateWithTargetFileSizeViaOption(IcebergOptionsType opti .save(location.toString()); break; case JOB: - df.select("id", "data").write() + df.select("id", "data") + .write() .format("iceberg") .option(SparkWriteOptions.WRITE_FORMAT, format.toString()) .mode(SaveMode.Append) @@ -581,11 +574,10 @@ public void partitionedCreateWithTargetFileSizeViaOption(IcebergOptionsType opti table.refresh(); - Dataset result = spark.read() - .format("iceberg") - .load(location.toString()); + Dataset result = spark.read().format("iceberg").load(location.toString()); - List actual = result.orderBy("id").as(Encoders.bean(SimpleRecord.class)).collectAsList(); + List actual = + result.orderBy("id").as(Encoders.bean(SimpleRecord.class)).collectAsList(); Assert.assertEquals("Number of rows should match", expected.size(), actual.size()); Assert.assertEquals("Result rows should match", expected, actual); @@ -597,7 +589,8 @@ public void partitionedCreateWithTargetFileSizeViaOption(IcebergOptionsType opti } Assert.assertEquals("Should have 8 DataFiles", 8, files.size()); - Assert.assertTrue("All DataFiles contain 1000 rows", files.stream().allMatch(d -> d.recordCount() == 1000)); + Assert.assertTrue( + "All DataFiles contain 1000 rows", files.stream().allMatch(d -> d.recordCount() == 1000)); } @Test @@ -609,20 +602,21 @@ public void testCommitUnknownException() throws IOException { PartitionSpec spec = PartitionSpec.builderFor(SCHEMA).identity("data").build(); Table table = tables.create(SCHEMA, spec, location.toString()); - List records = Lists.newArrayList( - new SimpleRecord(1, "a"), - new SimpleRecord(2, "b"), - new SimpleRecord(3, "c") - ); + List records = + Lists.newArrayList( + new SimpleRecord(1, "a"), new SimpleRecord(2, "b"), new SimpleRecord(3, "c")); Dataset df = spark.createDataFrame(records, SimpleRecord.class); AppendFiles append = table.newFastAppend(); AppendFiles spyAppend = spy(append); - doAnswer(invocation -> { - append.commit(); - throw new CommitStateUnknownException(new RuntimeException("Datacenter on Fire")); - }).when(spyAppend).commit(); + doAnswer( + invocation -> { + append.commit(); + throw new CommitStateUnknownException(new RuntimeException("Datacenter on Fire")); + }) + .when(spyAppend) + .commit(); Table spyTable = spy(table); when(spyTable.newAppend()).thenReturn(spyAppend); @@ -632,20 +626,25 @@ public void testCommitUnknownException() throws IOException { ManualSource.setTable(manualTableName, sparkTable); // Although an exception is thrown here, write and commit have succeeded - AssertHelpers.assertThrowsWithCause("Should throw a Commit State Unknown Exception", + AssertHelpers.assertThrowsWithCause( + "Should throw a Commit State Unknown Exception", SparkException.class, "Writing job aborted", CommitStateUnknownException.class, "Datacenter on Fire", - () -> df.select("id", "data").sort("data").write() - .format("org.apache.iceberg.spark.source.ManualSource") - 
.option(ManualSource.TABLE_NAME, manualTableName) - .mode(SaveMode.Append) - .save(location.toString())); + () -> + df.select("id", "data") + .sort("data") + .write() + .format("org.apache.iceberg.spark.source.ManualSource") + .option(ManualSource.TABLE_NAME, manualTableName) + .mode(SaveMode.Append) + .save(location.toString())); // Since write and commit succeeded, the rows should be readable Dataset result = spark.read().format("iceberg").load(location.toString()); - List actual = result.orderBy("id").as(Encoders.bean(SimpleRecord.class)).collectAsList(); + List actual = + result.orderBy("id").as(Encoders.bean(SimpleRecord.class)).collectAsList(); Assert.assertEquals("Number of rows should match", records.size(), actual.size()); Assert.assertEquals("Result rows should match", records, actual); } diff --git a/spark/v3.2/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkFileWriterFactory.java b/spark/v3.2/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkFileWriterFactory.java index 702e8ab98990..4a3263e368c0 100644 --- a/spark/v3.2/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkFileWriterFactory.java +++ b/spark/v3.2/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkFileWriterFactory.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.source; import java.util.List; @@ -39,9 +38,11 @@ public TestSparkFileWriterFactory(FileFormat fileFormat, boolean partitioned) { } @Override - protected FileWriterFactory newWriterFactory(Schema dataSchema, List equalityFieldIds, - Schema equalityDeleteRowSchema, - Schema positionDeleteRowSchema) { + protected FileWriterFactory newWriterFactory( + Schema dataSchema, + List equalityFieldIds, + Schema equalityDeleteRowSchema, + Schema positionDeleteRowSchema) { return SparkFileWriterFactory.builderFor(table) .dataSchema(table.schema()) .dataFileFormat(format()) diff --git a/spark/v3.2/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkFilesScan.java b/spark/v3.2/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkFilesScan.java index 63195cfd3967..d0959d6866bc 100644 --- a/spark/v3.2/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkFilesScan.java +++ b/spark/v3.2/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkFilesScan.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.spark.source; import java.io.IOException; @@ -53,10 +52,8 @@ public void removeTables() { public void testTaskSetLoading() throws NoSuchTableException, IOException { sql("CREATE TABLE %s (id INT, data STRING) USING iceberg", tableName); - List records = ImmutableList.of( - new SimpleRecord(1, "a"), - new SimpleRecord(2, "b") - ); + List records = + ImmutableList.of(new SimpleRecord(1, "a"), new SimpleRecord(2, "b")); Dataset df = spark.createDataFrame(records, SimpleRecord.class); df.writeTo(tableName).append(); @@ -69,15 +66,19 @@ public void testTaskSetLoading() throws NoSuchTableException, IOException { taskSetManager.stageTasks(table, setID, ImmutableList.copyOf(fileScanTasks)); // load the staged file set - Dataset scanDF = spark.read().format("iceberg") - .option(SparkReadOptions.FILE_SCAN_TASK_SET_ID, setID) - .load(tableName); + Dataset scanDF = + spark + .read() + .format("iceberg") + .option(SparkReadOptions.FILE_SCAN_TASK_SET_ID, setID) + .load(tableName); // write the records back essentially duplicating data scanDF.writeTo(tableName).append(); } - assertEquals("Should have expected rows", + assertEquals( + "Should have expected rows", ImmutableList.of(row(1, "a"), row(1, "a"), row(2, "b"), row(2, "b")), sql("SELECT * FROM %s ORDER BY id", tableName)); } @@ -86,10 +87,8 @@ public void testTaskSetLoading() throws NoSuchTableException, IOException { public void testTaskSetPlanning() throws NoSuchTableException, IOException { sql("CREATE TABLE %s (id INT, data STRING) USING iceberg", tableName); - List records = ImmutableList.of( - new SimpleRecord(1, "a"), - new SimpleRecord(2, "b") - ); + List records = + ImmutableList.of(new SimpleRecord(1, "a"), new SimpleRecord(2, "b")); Dataset df = spark.createDataFrame(records, SimpleRecord.class); df.coalesce(1).writeTo(tableName).append(); df.coalesce(1).writeTo(tableName).append(); @@ -104,17 +103,23 @@ public void testTaskSetPlanning() throws NoSuchTableException, IOException { taskSetManager.stageTasks(table, setID, tasks); // load the staged file set and make sure each file is in a separate split - Dataset scanDF = spark.read().format("iceberg") - .option(SparkReadOptions.FILE_SCAN_TASK_SET_ID, setID) - .option(SparkReadOptions.SPLIT_SIZE, tasks.get(0).file().fileSizeInBytes()) - .load(tableName); + Dataset scanDF = + spark + .read() + .format("iceberg") + .option(SparkReadOptions.FILE_SCAN_TASK_SET_ID, setID) + .option(SparkReadOptions.SPLIT_SIZE, tasks.get(0).file().fileSizeInBytes()) + .load(tableName); Assert.assertEquals("Num partitions should match", 2, scanDF.javaRDD().getNumPartitions()); // load the staged file set and make sure we combine both files into a single split - scanDF = spark.read().format("iceberg") - .option(SparkReadOptions.FILE_SCAN_TASK_SET_ID, setID) - .option(SparkReadOptions.SPLIT_SIZE, Long.MAX_VALUE) - .load(tableName); + scanDF = + spark + .read() + .format("iceberg") + .option(SparkReadOptions.FILE_SCAN_TASK_SET_ID, setID) + .option(SparkReadOptions.SPLIT_SIZE, Long.MAX_VALUE) + .load(tableName); Assert.assertEquals("Num partitions should match", 1, scanDF.javaRDD().getNumPartitions()); } } diff --git a/spark/v3.2/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkMergingMetrics.java b/spark/v3.2/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkMergingMetrics.java index be74d1c5a33b..c3bb35ca7df8 100644 --- a/spark/v3.2/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkMergingMetrics.java +++ 
b/spark/v3.2/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkMergingMetrics.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.source; import java.io.IOException; @@ -42,26 +41,32 @@ public TestSparkMergingMetrics(FileFormat fileFormat) { @Override protected FileAppender writeAndGetAppender(List records) throws IOException { - Table testTable = new BaseTable(null, "dummy") { - @Override - public Map properties() { - return Collections.emptyMap(); - } - @Override - public SortOrder sortOrder() { - return SortOrder.unsorted(); - } - @Override - public PartitionSpec spec() { - return PartitionSpec.unpartitioned(); - } - }; + Table testTable = + new BaseTable(null, "dummy") { + @Override + public Map properties() { + return Collections.emptyMap(); + } + + @Override + public SortOrder sortOrder() { + return SortOrder.unsorted(); + } + + @Override + public PartitionSpec spec() { + return PartitionSpec.unpartitioned(); + } + }; FileAppender appender = - SparkAppenderFactory.builderFor(testTable, SCHEMA, SparkSchemaUtil.convert(SCHEMA)).build() + SparkAppenderFactory.builderFor(testTable, SCHEMA, SparkSchemaUtil.convert(SCHEMA)) + .build() .newAppender(org.apache.iceberg.Files.localOutput(temp.newFile()), fileFormat); try (FileAppender fileAppender = appender) { - records.stream().map(r -> new StructInternalRow(SCHEMA.asStruct()).setStruct(r)).forEach(fileAppender::add); + records.stream() + .map(r -> new StructInternalRow(SCHEMA.asStruct()).setStruct(r)) + .forEach(fileAppender::add); } return appender; } diff --git a/spark/v3.2/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkMetadataColumns.java b/spark/v3.2/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkMetadataColumns.java index 9477bc985295..e39985228570 100644 --- a/spark/v3.2/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkMetadataColumns.java +++ b/spark/v3.2/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkMetadataColumns.java @@ -16,9 +16,16 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.spark.source; +import static org.apache.iceberg.TableProperties.DEFAULT_FILE_FORMAT; +import static org.apache.iceberg.TableProperties.FORMAT_VERSION; +import static org.apache.iceberg.TableProperties.ORC_VECTORIZATION_ENABLED; +import static org.apache.iceberg.TableProperties.PARQUET_BATCH_SIZE; +import static org.apache.iceberg.TableProperties.PARQUET_ROW_GROUP_SIZE_BYTES; +import static org.apache.iceberg.TableProperties.PARQUET_VECTORIZATION_ENABLED; +import static org.apache.spark.sql.functions.lit; + import java.io.IOException; import java.util.List; import java.util.stream.Collectors; @@ -56,44 +63,37 @@ import org.junit.runner.RunWith; import org.junit.runners.Parameterized; -import static org.apache.iceberg.TableProperties.DEFAULT_FILE_FORMAT; -import static org.apache.iceberg.TableProperties.FORMAT_VERSION; -import static org.apache.iceberg.TableProperties.ORC_VECTORIZATION_ENABLED; -import static org.apache.iceberg.TableProperties.PARQUET_BATCH_SIZE; -import static org.apache.iceberg.TableProperties.PARQUET_ROW_GROUP_SIZE_BYTES; -import static org.apache.iceberg.TableProperties.PARQUET_VECTORIZATION_ENABLED; -import static org.apache.spark.sql.functions.lit; - @RunWith(Parameterized.class) public class TestSparkMetadataColumns extends SparkTestBase { private static final String TABLE_NAME = "test_table"; - private static final Schema SCHEMA = new Schema( - Types.NestedField.optional(1, "id", Types.LongType.get()), - Types.NestedField.optional(2, "category", Types.StringType.get()), - Types.NestedField.optional(3, "data", Types.StringType.get()) - ); - private static final PartitionSpec UNKNOWN_SPEC = PartitionSpecParser.fromJson(SCHEMA, - "{ \"spec-id\": 1, \"fields\": [ { \"name\": \"id_zero\", \"transform\": \"zero\", \"source-id\": 1 } ] }"); + private static final Schema SCHEMA = + new Schema( + Types.NestedField.optional(1, "id", Types.LongType.get()), + Types.NestedField.optional(2, "category", Types.StringType.get()), + Types.NestedField.optional(3, "data", Types.StringType.get())); + private static final PartitionSpec UNKNOWN_SPEC = + PartitionSpecParser.fromJson( + SCHEMA, + "{ \"spec-id\": 1, \"fields\": [ { \"name\": \"id_zero\", \"transform\": \"zero\", \"source-id\": 1 } ] }"); @Parameterized.Parameters(name = "fileFormat = {0}, vectorized = {1}, formatVersion = {2}") public static Object[][] parameters() { return new Object[][] { - { FileFormat.PARQUET, false, 1}, - { FileFormat.PARQUET, true, 1}, - { FileFormat.PARQUET, false, 2}, - { FileFormat.PARQUET, true, 2}, - { FileFormat.AVRO, false, 1}, - { FileFormat.AVRO, false, 2}, - { FileFormat.ORC, false, 1}, - { FileFormat.ORC, true, 1}, - { FileFormat.ORC, false, 2}, - { FileFormat.ORC, true, 2}, + {FileFormat.PARQUET, false, 1}, + {FileFormat.PARQUET, true, 1}, + {FileFormat.PARQUET, false, 2}, + {FileFormat.PARQUET, true, 2}, + {FileFormat.AVRO, false, 1}, + {FileFormat.AVRO, false, 2}, + {FileFormat.ORC, false, 1}, + {FileFormat.ORC, true, 1}, + {FileFormat.ORC, false, 2}, + {FileFormat.ORC, true, 2}, }; } - @Rule - public TemporaryFolder temp = new TemporaryFolder(); + @Rule public TemporaryFolder temp = new TemporaryFolder(); private final FileFormat fileFormat; private final boolean vectorized; @@ -109,13 +109,16 @@ public TestSparkMetadataColumns(FileFormat fileFormat, boolean vectorized, int f @BeforeClass public static void setupSpark() { - ImmutableMap config = ImmutableMap.of( - "type", "hive", - "default-namespace", "default", - "cache-enabled", "true" - ); - 
spark.conf().set("spark.sql.catalog.spark_catalog", "org.apache.iceberg.spark.source.TestSparkCatalog"); - config.forEach((key, value) -> spark.conf().set("spark.sql.catalog.spark_catalog." + key, value)); + ImmutableMap config = + ImmutableMap.of( + "type", "hive", + "default-namespace", "default", + "cache-enabled", "true"); + spark + .conf() + .set("spark.sql.catalog.spark_catalog", "org.apache.iceberg.spark.source.TestSparkCatalog"); + config.forEach( + (key, value) -> spark.conf().set("spark.sql.catalog.spark_catalog." + key, value)); } @Before @@ -136,35 +139,29 @@ public void testSpecAndPartitionMetadataColumns() { sql("INSERT INTO TABLE %s VALUES (1, 'a1', 'b1')", TABLE_NAME); table.refresh(); - table.updateSpec() - .addField("data") - .commit(); + table.updateSpec().addField("data").commit(); sql("INSERT INTO TABLE %s VALUES (1, 'a1', 'b1')", TABLE_NAME); table.refresh(); - table.updateSpec() - .addField(Expressions.bucket("category", 8)) - .commit(); + table.updateSpec().addField(Expressions.bucket("category", 8)).commit(); sql("INSERT INTO TABLE %s VALUES (1, 'a1', 'b1')", TABLE_NAME); table.refresh(); - table.updateSpec() - .removeField("data") - .commit(); + table.updateSpec().removeField("data").commit(); sql("INSERT INTO TABLE %s VALUES (1, 'a1', 'b1')", TABLE_NAME); table.refresh(); - table.updateSpec() - .renameField("category_bucket_8", "category_bucket_8_another_name") - .commit(); - - List expected = ImmutableList.of( - row(0, row(null, null)), - row(1, row("b1", null)), - row(2, row("b1", 2)), - row(3, row(null, 2)) - ); - assertEquals("Rows must match", expected, + table.updateSpec().renameField("category_bucket_8", "category_bucket_8_another_name").commit(); + + List expected = + ImmutableList.of( + row(0, row(null, null)), + row(1, row("b1", null)), + row(2, row("b1", 2)), + row(3, row(null, 2))); + assertEquals( + "Rows must match", + expected, sql("SELECT _spec_id, _partition FROM %s ORDER BY _spec_id", TABLE_NAME)); } @@ -172,56 +169,48 @@ public void testSpecAndPartitionMetadataColumns() { public void testPositionMetadataColumnWithMultipleRowGroups() throws NoSuchTableException { Assume.assumeTrue(fileFormat == FileFormat.PARQUET); - table.updateProperties() - .set(PARQUET_ROW_GROUP_SIZE_BYTES, "100") - .commit(); + table.updateProperties().set(PARQUET_ROW_GROUP_SIZE_BYTES, "100").commit(); List ids = Lists.newArrayList(); for (long id = 0L; id < 200L; id++) { ids.add(id); } - Dataset df = spark.createDataset(ids, Encoders.LONG()) - .withColumnRenamed("value", "id") - .withColumn("category", lit("hr")) - .withColumn("data", lit("ABCDEF")); + Dataset df = + spark + .createDataset(ids, Encoders.LONG()) + .withColumnRenamed("value", "id") + .withColumn("category", lit("hr")) + .withColumn("data", lit("ABCDEF")); df.coalesce(1).writeTo(TABLE_NAME).append(); Assert.assertEquals(200, spark.table(TABLE_NAME).count()); - List expectedRows = ids.stream() - .map(this::row) - .collect(Collectors.toList()); - assertEquals("Rows must match", - expectedRows, - sql("SELECT _pos FROM %s", TABLE_NAME)); + List expectedRows = ids.stream().map(this::row).collect(Collectors.toList()); + assertEquals("Rows must match", expectedRows, sql("SELECT _pos FROM %s", TABLE_NAME)); } @Test public void testPositionMetadataColumnWithMultipleBatches() throws NoSuchTableException { Assume.assumeTrue(fileFormat == FileFormat.PARQUET); - table.updateProperties() - .set(PARQUET_BATCH_SIZE, "1000") - .commit(); + table.updateProperties().set(PARQUET_BATCH_SIZE, "1000").commit(); List ids = 
Lists.newArrayList(); for (long id = 0L; id < 7500L; id++) { ids.add(id); } - Dataset df = spark.createDataset(ids, Encoders.LONG()) - .withColumnRenamed("value", "id") - .withColumn("category", lit("hr")) - .withColumn("data", lit("ABCDEF")); + Dataset df = + spark + .createDataset(ids, Encoders.LONG()) + .withColumnRenamed("value", "id") + .withColumn("category", lit("hr")) + .withColumn("data", lit("ABCDEF")); df.coalesce(1).writeTo(TABLE_NAME).append(); Assert.assertEquals(7500, spark.table(TABLE_NAME).count()); - List expectedRows = ids.stream() - .map(this::row) - .collect(Collectors.toList()); - assertEquals("Rows must match", - expectedRows, - sql("SELECT _pos FROM %s", TABLE_NAME)); + List expectedRows = ids.stream().map(this::row).collect(Collectors.toList()); + assertEquals("Rows must match", expectedRows, sql("SELECT _pos FROM %s", TABLE_NAME)); } @Test @@ -231,42 +220,52 @@ public void testPartitionMetadataColumnWithUnknownTransforms() { TableMetadata base = ops.current(); ops.commit(base, base.updatePartitionSpec(UNKNOWN_SPEC)); - AssertHelpers.assertThrows("Should fail to query the partition metadata column", - ValidationException.class, "Cannot build table partition type, unknown transforms", + AssertHelpers.assertThrows( + "Should fail to query the partition metadata column", + ValidationException.class, + "Cannot build table partition type, unknown transforms", () -> sql("SELECT _partition FROM %s", TABLE_NAME)); } @Test public void testConflictingColumns() { - table.updateSchema() + table + .updateSchema() .addColumn(MetadataColumns.SPEC_ID.name(), Types.IntegerType.get()) .addColumn(MetadataColumns.FILE_PATH.name(), Types.StringType.get()) .commit(); sql("INSERT INTO TABLE %s VALUES (1, 'a1', 'b1', -1, 'path/to/file')", TABLE_NAME); - assertEquals("Rows must match", + assertEquals( + "Rows must match", ImmutableList.of(row(1L, "a1")), sql("SELECT id, category FROM %s", TABLE_NAME)); - AssertHelpers.assertThrows("Should fail to query conflicting columns", - ValidationException.class, "column names conflict", + AssertHelpers.assertThrows( + "Should fail to query conflicting columns", + ValidationException.class, + "column names conflict", () -> sql("SELECT * FROM %s", TABLE_NAME)); table.refresh(); - table.updateSchema() + table + .updateSchema() .renameColumn(MetadataColumns.SPEC_ID.name(), "_renamed" + MetadataColumns.SPEC_ID.name()) - .renameColumn(MetadataColumns.FILE_PATH.name(), "_renamed" + MetadataColumns.FILE_PATH.name()) + .renameColumn( + MetadataColumns.FILE_PATH.name(), "_renamed" + MetadataColumns.FILE_PATH.name()) .commit(); - assertEquals("Rows must match", + assertEquals( + "Rows must match", ImmutableList.of(row(0, null, -1)), sql("SELECT _spec_id, _partition, _renamed_spec_id FROM %s", TABLE_NAME)); } private void createAndInitTable() throws IOException { - this.table = TestTables.create(temp.newFolder(), TABLE_NAME, SCHEMA, PartitionSpec.unpartitioned()); + this.table = + TestTables.create(temp.newFolder(), TABLE_NAME, SCHEMA, PartitionSpec.unpartitioned()); UpdateProperties updateProperties = table.updateProperties(); updateProperties.set(FORMAT_VERSION, String.valueOf(formatVersion)); @@ -280,7 +279,8 @@ private void createAndInitTable() throws IOException { updateProperties.set(ORC_VECTORIZATION_ENABLED, String.valueOf(vectorized)); break; default: - Preconditions.checkState(!vectorized, "File format %s does not support vectorized reads", fileFormat); + Preconditions.checkState( + !vectorized, "File format %s does not support vectorized reads", 
fileFormat); } updateProperties.commit(); diff --git a/spark/v3.2/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkPartitioningWriters.java b/spark/v3.2/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkPartitioningWriters.java index 4d07cfbe86ea..276d8c632fc0 100644 --- a/spark/v3.2/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkPartitioningWriters.java +++ b/spark/v3.2/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkPartitioningWriters.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.source; import java.util.List; @@ -39,9 +38,11 @@ public TestSparkPartitioningWriters(FileFormat fileFormat) { } @Override - protected FileWriterFactory newWriterFactory(Schema dataSchema, List equalityFieldIds, - Schema equalityDeleteRowSchema, - Schema positionDeleteRowSchema) { + protected FileWriterFactory newWriterFactory( + Schema dataSchema, + List equalityFieldIds, + Schema equalityDeleteRowSchema, + Schema positionDeleteRowSchema) { return SparkFileWriterFactory.builderFor(table) .dataSchema(table.schema()) .dataFileFormat(format()) diff --git a/spark/v3.2/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkPositionDeltaWriters.java b/spark/v3.2/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkPositionDeltaWriters.java index 480448e13a8f..245c392774f5 100644 --- a/spark/v3.2/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkPositionDeltaWriters.java +++ b/spark/v3.2/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkPositionDeltaWriters.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.source; import java.util.List; @@ -39,9 +38,11 @@ public TestSparkPositionDeltaWriters(FileFormat fileFormat) { } @Override - protected FileWriterFactory newWriterFactory(Schema dataSchema, List equalityFieldIds, - Schema equalityDeleteRowSchema, - Schema positionDeleteRowSchema) { + protected FileWriterFactory newWriterFactory( + Schema dataSchema, + List equalityFieldIds, + Schema equalityDeleteRowSchema, + Schema positionDeleteRowSchema) { return SparkFileWriterFactory.builderFor(table) .dataSchema(table.schema()) .dataFileFormat(format()) diff --git a/spark/v3.2/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkReadProjection.java b/spark/v3.2/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkReadProjection.java index f42b48d0e30d..7d6f0e76f78f 100644 --- a/spark/v3.2/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkReadProjection.java +++ b/spark/v3.2/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkReadProjection.java @@ -16,9 +16,12 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.spark.source; +import static org.apache.iceberg.Files.localOutput; +import static org.apache.iceberg.types.Types.NestedField.optional; +import static org.apache.iceberg.types.Types.NestedField.required; + import java.io.File; import java.io.IOException; import java.util.List; @@ -51,10 +54,6 @@ import org.junit.runner.RunWith; import org.junit.runners.Parameterized; -import static org.apache.iceberg.Files.localOutput; -import static org.apache.iceberg.types.Types.NestedField.optional; -import static org.apache.iceberg.types.Types.NestedField.required; - @RunWith(Parameterized.class) public class TestSparkReadProjection extends TestReadProjection { @@ -63,11 +62,11 @@ public class TestSparkReadProjection extends TestReadProjection { @Parameterized.Parameters(name = "format = {0}, vectorized = {1}") public static Object[][] parameters() { return new Object[][] { - { "parquet", false }, - { "parquet", true }, - { "avro", false }, - { "orc", false }, - { "orc", true } + {"parquet", false}, + {"parquet", true}, + {"avro", false}, + {"orc", false}, + {"orc", true} }; } @@ -83,14 +82,17 @@ public TestSparkReadProjection(String format, boolean vectorized) { @BeforeClass public static void startSpark() { TestSparkReadProjection.spark = SparkSession.builder().master("local[2]").getOrCreate(); - ImmutableMap config = ImmutableMap.of( - "type", "hive", - "default-namespace", "default", - "parquet-enabled", "true", - "cache-enabled", "false" - ); - spark.conf().set("spark.sql.catalog.spark_catalog", "org.apache.iceberg.spark.source.TestSparkCatalog"); - config.forEach((key, value) -> spark.conf().set("spark.sql.catalog.spark_catalog." + key, value)); + ImmutableMap config = + ImmutableMap.of( + "type", "hive", + "default-namespace", "default", + "parquet-enabled", "true", + "cache-enabled", "false"); + spark + .conf() + .set("spark.sql.catalog.spark_catalog", "org.apache.iceberg.spark.source.TestSparkCatalog"); + config.forEach( + (key, value) -> spark.conf().set("spark.sql.catalog.spark_catalog." + key, value)); } @AfterClass @@ -101,8 +103,8 @@ public static void stopSpark() { } @Override - protected Record writeAndRead(String desc, Schema writeSchema, Schema readSchema, - Record record) throws IOException { + protected Record writeAndRead(String desc, Schema writeSchema, Schema readSchema, Record record) + throws IOException { File parent = temp.newFolder(desc); File location = new File(parent, "test"); File dataFolder = new File(location, "data"); @@ -116,16 +118,17 @@ protected Record writeAndRead(String desc, Schema writeSchema, Schema readSchema // When tables are created, the column ids are reassigned. 
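An aside on the "column ids are reassigned" comment that closes the hunk above: when the projection test creates its table, Iceberg assigns fresh field IDs, so the IDs in the schema the test authored no longer match the IDs in the table's schema. The fragment below is a hedged, simplified sketch of how such a mapping can be built before handing it to the reassignIds visitor that appears later in this diff; buildIdMapping and its arguments are illustrative names, not part of the patch, and Maps/List come from the relocated Guava and java.util types already used in these tests.

// Illustrative sketch only: pair the field IDs of the authored write schema with the IDs
// the created table actually assigned, so a read schema can be rewritten to match the table.
private static Map<Integer, Integer> buildIdMapping(List<Integer> writeIds, List<Integer> tableIds) {
  Map<Integer, Integer> idMapping = Maps.newHashMap();
  for (int i = 0; i < writeIds.size(); i += 1) {
    idMapping.put(writeIds.get(i), tableIds.get(i)); // authored ID -> reassigned ID
  }
  return idMapping;
}

// Usage, assuming the allIds(...) and reassignIds(...) helpers from this test class:
// Map<Integer, Integer> idMapping = buildIdMapping(allIds(writeSchema), allIds(tableSchema));
// Schema expectedSchema = reassignIds(readSchema, idMapping);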
Schema tableSchema = table.schema(); - try (FileAppender writer = new GenericAppenderFactory(tableSchema).newAppender( - localOutput(testFile), format)) { + try (FileAppender writer = + new GenericAppenderFactory(tableSchema).newAppender(localOutput(testFile), format)) { writer.add(record); } - DataFile file = DataFiles.builder(PartitionSpec.unpartitioned()) - .withRecordCount(100) - .withFileSizeInBytes(testFile.length()) - .withPath(testFile.toString()) - .build(); + DataFile file = + DataFiles.builder(PartitionSpec.unpartitioned()) + .withRecordCount(100) + .withFileSizeInBytes(testFile.length()) + .withPath(testFile.toString()) + .build(); table.newAppend().appendFile(file).commit(); @@ -139,14 +142,16 @@ protected Record writeAndRead(String desc, Schema writeSchema, Schema readSchema Schema expectedSchema = reassignIds(readSchema, idMapping); // Set the schema to the expected schema directly to simulate the table schema evolving - TestTables.replaceMetadata(desc, - TestTables.readMetadata(desc).updateSchema(expectedSchema, 100)); + TestTables.replaceMetadata( + desc, TestTables.readMetadata(desc).updateSchema(expectedSchema, 100)); - Dataset df = spark.read() - .format("org.apache.iceberg.spark.source.TestIcebergSource") - .option("iceberg.table.name", desc) - .option(SparkReadOptions.VECTORIZATION_ENABLED, String.valueOf(vectorized)) - .load(); + Dataset df = + spark + .read() + .format("org.apache.iceberg.spark.source.TestIcebergSource") + .option("iceberg.table.name", desc) + .option(SparkReadOptions.VECTORIZATION_ENABLED, String.valueOf(vectorized)) + .load(); return SparkValueConverter.convert(readSchema, df.collectAsList().get(0)); @@ -157,87 +162,98 @@ protected Record writeAndRead(String desc, Schema writeSchema, Schema readSchema private List allIds(Schema schema) { List ids = Lists.newArrayList(); - TypeUtil.visit(schema, new TypeUtil.SchemaVisitor() { - @Override - public Void field(Types.NestedField field, Void fieldResult) { - ids.add(field.fieldId()); - return null; - } + TypeUtil.visit( + schema, + new TypeUtil.SchemaVisitor() { + @Override + public Void field(Types.NestedField field, Void fieldResult) { + ids.add(field.fieldId()); + return null; + } - @Override - public Void list(Types.ListType list, Void elementResult) { - ids.add(list.elementId()); - return null; - } + @Override + public Void list(Types.ListType list, Void elementResult) { + ids.add(list.elementId()); + return null; + } - @Override - public Void map(Types.MapType map, Void keyResult, Void valueResult) { - ids.add(map.keyId()); - ids.add(map.valueId()); - return null; - } - }); + @Override + public Void map(Types.MapType map, Void keyResult, Void valueResult) { + ids.add(map.keyId()); + ids.add(map.valueId()); + return null; + } + }); return ids; } private Schema reassignIds(Schema schema, Map idMapping) { - return new Schema(TypeUtil.visit(schema, new TypeUtil.SchemaVisitor() { - private int mapId(int id) { - if (idMapping.containsKey(id)) { - return idMapping.get(id); - } - return 1000 + id; // make sure the new IDs don't conflict with reassignment - } + return new Schema( + TypeUtil.visit( + schema, + new TypeUtil.SchemaVisitor() { + private int mapId(int id) { + if (idMapping.containsKey(id)) { + return idMapping.get(id); + } + return 1000 + id; // make sure the new IDs don't conflict with reassignment + } - @Override - public Type schema(Schema schema, Type structResult) { - return structResult; - } + @Override + public Type schema(Schema schema, Type structResult) { + return structResult; + } - 
@Override - public Type struct(Types.StructType struct, List fieldResults) { - List newFields = Lists.newArrayListWithExpectedSize(fieldResults.size()); - List fields = struct.fields(); - for (int i = 0; i < fields.size(); i += 1) { - Types.NestedField field = fields.get(i); - if (field.isOptional()) { - newFields.add(optional(mapId(field.fieldId()), field.name(), fieldResults.get(i))); - } else { - newFields.add(required(mapId(field.fieldId()), field.name(), fieldResults.get(i))); - } - } - return Types.StructType.of(newFields); - } + @Override + public Type struct(Types.StructType struct, List fieldResults) { + List newFields = + Lists.newArrayListWithExpectedSize(fieldResults.size()); + List fields = struct.fields(); + for (int i = 0; i < fields.size(); i += 1) { + Types.NestedField field = fields.get(i); + if (field.isOptional()) { + newFields.add( + optional(mapId(field.fieldId()), field.name(), fieldResults.get(i))); + } else { + newFields.add( + required(mapId(field.fieldId()), field.name(), fieldResults.get(i))); + } + } + return Types.StructType.of(newFields); + } - @Override - public Type field(Types.NestedField field, Type fieldResult) { - return fieldResult; - } + @Override + public Type field(Types.NestedField field, Type fieldResult) { + return fieldResult; + } - @Override - public Type list(Types.ListType list, Type elementResult) { - if (list.isElementOptional()) { - return Types.ListType.ofOptional(mapId(list.elementId()), elementResult); - } else { - return Types.ListType.ofRequired(mapId(list.elementId()), elementResult); - } - } + @Override + public Type list(Types.ListType list, Type elementResult) { + if (list.isElementOptional()) { + return Types.ListType.ofOptional(mapId(list.elementId()), elementResult); + } else { + return Types.ListType.ofRequired(mapId(list.elementId()), elementResult); + } + } - @Override - public Type map(Types.MapType map, Type keyResult, Type valueResult) { - if (map.isValueOptional()) { - return Types.MapType.ofOptional( - mapId(map.keyId()), mapId(map.valueId()), keyResult, valueResult); - } else { - return Types.MapType.ofRequired( - mapId(map.keyId()), mapId(map.valueId()), keyResult, valueResult); - } - } + @Override + public Type map(Types.MapType map, Type keyResult, Type valueResult) { + if (map.isValueOptional()) { + return Types.MapType.ofOptional( + mapId(map.keyId()), mapId(map.valueId()), keyResult, valueResult); + } else { + return Types.MapType.ofRequired( + mapId(map.keyId()), mapId(map.valueId()), keyResult, valueResult); + } + } - @Override - public Type primitive(Type.PrimitiveType primitive) { - return primitive; - } - }).asNestedType().asStructType().fields()); + @Override + public Type primitive(Type.PrimitiveType primitive) { + return primitive; + } + }) + .asNestedType() + .asStructType() + .fields()); } } diff --git a/spark/v3.2/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkReaderDeletes.java b/spark/v3.2/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkReaderDeletes.java index 2da25d5ee529..31ec21b3b0fe 100644 --- a/spark/v3.2/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkReaderDeletes.java +++ b/spark/v3.2/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkReaderDeletes.java @@ -16,9 +16,11 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.spark.source; +import static org.apache.hadoop.hive.conf.HiveConf.ConfVars.METASTOREURIS; +import static org.apache.iceberg.types.Types.NestedField.required; + import java.io.IOException; import java.util.List; import java.util.Set; @@ -70,9 +72,6 @@ import org.junit.runner.RunWith; import org.junit.runners.Parameterized; -import static org.apache.hadoop.hive.conf.HiveConf.ConfVars.METASTOREURIS; -import static org.apache.iceberg.types.Types.NestedField.required; - @RunWith(Parameterized.class) public class TestSparkReaderDeletes extends DeleteReadTests { @@ -96,15 +95,18 @@ public static void startMetastoreAndSpark() { metastore.start(); HiveConf hiveConf = metastore.hiveConf(); - spark = SparkSession.builder() - .master("local[2]") - .config(SQLConf.PARTITION_OVERWRITE_MODE().key(), "dynamic") - .config("spark.hadoop." + METASTOREURIS.varname, hiveConf.get(METASTOREURIS.varname)) - .enableHiveSupport() - .getOrCreate(); + spark = + SparkSession.builder() + .master("local[2]") + .config(SQLConf.PARTITION_OVERWRITE_MODE().key(), "dynamic") + .config("spark.hadoop." + METASTOREURIS.varname, hiveConf.get(METASTOREURIS.varname)) + .enableHiveSupport() + .getOrCreate(); - catalog = (HiveCatalog) - CatalogUtil.loadCatalog(HiveCatalog.class.getName(), "hive", ImmutableMap.of(), hiveConf); + catalog = + (HiveCatalog) + CatalogUtil.loadCatalog( + HiveCatalog.class.getName(), "hive", ImmutableMap.of(), hiveConf); try { catalog.createNamespace(Namespace.of("default")); @@ -129,14 +131,15 @@ protected Table createTable(String name, Schema schema, PartitionSpec spec) { TableMetadata meta = ops.current(); ops.commit(meta, meta.upgradeToFormatVersion(2)); if (vectorized) { - table.updateProperties() + table + .updateProperties() .set(TableProperties.PARQUET_VECTORIZATION_ENABLED, "true") - .set(TableProperties.PARQUET_BATCH_SIZE, "4") // split 7 records to two batches to cover more code paths + .set( + TableProperties.PARQUET_BATCH_SIZE, + "4") // split 7 records to two batches to cover more code paths .commit(); } else { - table.updateProperties() - .set(TableProperties.PARQUET_VECTORIZATION_ENABLED, "false") - .commit(); + table.updateProperties().set(TableProperties.PARQUET_VECTORIZATION_ENABLED, "false").commit(); } return table; } @@ -152,16 +155,20 @@ public StructLikeSet rowSet(String name, Table table, String... columns) { } public StructLikeSet rowSet(String name, Types.StructType projection, String... 
columns) { - Dataset df = spark.read() - .format("iceberg") - .load(TableIdentifier.of("default", name).toString()) - .selectExpr(columns); + Dataset df = + spark + .read() + .format("iceberg") + .load(TableIdentifier.of("default", name).toString()) + .selectExpr(columns); StructLikeSet set = StructLikeSet.create(projection); - df.collectAsList().forEach(row -> { - SparkStructLike rowWrapper = new SparkStructLike(projection); - set.add(rowWrapper.wrap(row)); - }); + df.collectAsList() + .forEach( + row -> { + SparkStructLike rowWrapper = new SparkStructLike(projection); + set.add(rowWrapper.wrap(row)); + }); return set; } @@ -171,31 +178,39 @@ public void testEqualityDeleteWithFilter() throws IOException { String tableName = table.name().substring(table.name().lastIndexOf(".") + 1); Schema deleteRowSchema = table.schema().select("data"); Record dataDelete = GenericRecord.create(deleteRowSchema); - List dataDeletes = Lists.newArrayList( - dataDelete.copy("data", "a"), // id = 29 - dataDelete.copy("data", "d"), // id = 89 - dataDelete.copy("data", "g") // id = 122 - ); - - DeleteFile eqDeletes = FileHelpers.writeDeleteFile( - table, Files.localOutput(temp.newFile()), TestHelpers.Row.of(0), dataDeletes, deleteRowSchema); - - table.newRowDelta() - .addDeletes(eqDeletes) - .commit(); + List dataDeletes = + Lists.newArrayList( + dataDelete.copy("data", "a"), // id = 29 + dataDelete.copy("data", "d"), // id = 89 + dataDelete.copy("data", "g") // id = 122 + ); + + DeleteFile eqDeletes = + FileHelpers.writeDeleteFile( + table, + Files.localOutput(temp.newFile()), + TestHelpers.Row.of(0), + dataDeletes, + deleteRowSchema); + + table.newRowDelta().addDeletes(eqDeletes).commit(); Types.StructType projection = table.schema().select("*").asStruct(); - Dataset df = spark.read() - .format("iceberg") - .load(TableIdentifier.of("default", tableName).toString()) - .filter("data = 'a'") // select a deleted row - .selectExpr("*"); + Dataset df = + spark + .read() + .format("iceberg") + .load(TableIdentifier.of("default", tableName).toString()) + .filter("data = 'a'") // select a deleted row + .selectExpr("*"); StructLikeSet actual = StructLikeSet.create(projection); - df.collectAsList().forEach(row -> { - SparkStructLike rowWrapper = new SparkStructLike(projection); - actual.add(rowWrapper.wrap(row)); - }); + df.collectAsList() + .forEach( + row -> { + SparkStructLike rowWrapper = new SparkStructLike(projection); + actual.add(rowWrapper.wrap(row)); + }); Assert.assertEquals("Table should contain no rows", 0, actual.size()); } @@ -204,44 +219,57 @@ public void testEqualityDeleteWithFilter() throws IOException { public void testReadEqualityDeleteRows() throws IOException { Schema deleteSchema1 = table.schema().select("data"); Record dataDelete = GenericRecord.create(deleteSchema1); - List dataDeletes = Lists.newArrayList( - dataDelete.copy("data", "a"), // id = 29 - dataDelete.copy("data", "d") // id = 89 - ); + List dataDeletes = + Lists.newArrayList( + dataDelete.copy("data", "a"), // id = 29 + dataDelete.copy("data", "d") // id = 89 + ); Schema deleteSchema2 = table.schema().select("id"); Record idDelete = GenericRecord.create(deleteSchema2); - List idDeletes = Lists.newArrayList( - idDelete.copy("id", 121), // id = 121 - idDelete.copy("id", 122) // id = 122 - ); - - DeleteFile eqDelete1 = FileHelpers.writeDeleteFile( - table, Files.localOutput(temp.newFile()), TestHelpers.Row.of(0), dataDeletes, deleteSchema1); - - DeleteFile eqDelete2 = FileHelpers.writeDeleteFile( - table, 
Files.localOutput(temp.newFile()), TestHelpers.Row.of(0), idDeletes, deleteSchema2); - - table.newRowDelta() - .addDeletes(eqDelete1) - .addDeletes(eqDelete2) - .commit(); + List idDeletes = + Lists.newArrayList( + idDelete.copy("id", 121), // id = 121 + idDelete.copy("id", 122) // id = 122 + ); + + DeleteFile eqDelete1 = + FileHelpers.writeDeleteFile( + table, + Files.localOutput(temp.newFile()), + TestHelpers.Row.of(0), + dataDeletes, + deleteSchema1); + + DeleteFile eqDelete2 = + FileHelpers.writeDeleteFile( + table, + Files.localOutput(temp.newFile()), + TestHelpers.Row.of(0), + idDeletes, + deleteSchema2); + + table.newRowDelta().addDeletes(eqDelete1).addDeletes(eqDelete2).commit(); StructLikeSet expectedRowSet = rowSetWithIds(29, 89, 121, 122); Types.StructType type = table.schema().asStruct(); StructLikeSet actualRowSet = StructLikeSet.create(type); - CloseableIterable tasks = TableScanUtil.planTasks( - table.newScan().planFiles(), - TableProperties.METADATA_SPLIT_SIZE_DEFAULT, - TableProperties.SPLIT_LOOKBACK_DEFAULT, - TableProperties.SPLIT_OPEN_FILE_COST_DEFAULT); + CloseableIterable tasks = + TableScanUtil.planTasks( + table.newScan().planFiles(), + TableProperties.METADATA_SPLIT_SIZE_DEFAULT, + TableProperties.SPLIT_LOOKBACK_DEFAULT, + TableProperties.SPLIT_OPEN_FILE_COST_DEFAULT); for (CombinedScanTask task : tasks) { - try (EqualityDeleteRowReader reader = new EqualityDeleteRowReader(task, table, table.schema(), false)) { + try (EqualityDeleteRowReader reader = + new EqualityDeleteRowReader(task, table, table.schema(), false)) { while (reader.next()) { - actualRowSet.add(new InternalRowWrapper(SparkSchemaUtil.convert(table.schema())).wrap(reader.get().copy())); + actualRowSet.add( + new InternalRowWrapper(SparkSchemaUtil.convert(table.schema())) + .wrap(reader.get().copy())); } } } @@ -252,18 +280,22 @@ public void testReadEqualityDeleteRows() throws IOException { @Test public void testPosDeletesAllRowsInBatch() throws IOException { - // read.parquet.vectorization.batch-size is set to 4, so the 4 rows in the first batch are all deleted. - List> deletes = Lists.newArrayList( - Pair.of(dataFile.path(), 0L), // id = 29 - Pair.of(dataFile.path(), 1L), // id = 43 - Pair.of(dataFile.path(), 2L), // id = 61 - Pair.of(dataFile.path(), 3L) // id = 89 - ); - - Pair posDeletes = FileHelpers.writeDeleteFile( - table, Files.localOutput(temp.newFile()), TestHelpers.Row.of(0), deletes); - - table.newRowDelta() + // read.parquet.vectorization.batch-size is set to 4, so the 4 rows in the first batch are all + // deleted. + List> deletes = + Lists.newArrayList( + Pair.of(dataFile.path(), 0L), // id = 29 + Pair.of(dataFile.path(), 1L), // id = 43 + Pair.of(dataFile.path(), 2L), // id = 61 + Pair.of(dataFile.path(), 3L) // id = 89 + ); + + Pair posDeletes = + FileHelpers.writeDeleteFile( + table, Files.localOutput(temp.newFile()), TestHelpers.Row.of(0), deletes); + + table + .newRowDelta() .addDeletes(posDeletes.first()) .validateDataFilesExist(posDeletes.second()) .commit(); @@ -276,24 +308,29 @@ public void testPosDeletesAllRowsInBatch() throws IOException { @Test public void testPosDeletesWithDeletedColumn() throws IOException { - // read.parquet.vectorization.batch-size is set to 4, so the 4 rows in the first batch are all deleted. 
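A reader's note on the position-delete hunks around here: these tests express deletes as (data file path, row ordinal) pairs, write them into a delete file with the FileHelpers test utility, and commit that file through a row delta. The condensed sketch below restates the pattern outside the diff; it assumes the surrounding fixture's table, dataFile, and JUnit temp folder, and the Pair/CharSequenceSet generic parameters are recalled from the test helpers rather than spelled out in this patch.

// Condensed sketch of the position-delete pattern exercised by these tests.
List<Pair<CharSequence, Long>> deletes =
    Lists.newArrayList(
        Pair.of(dataFile.path(), 0L),  // drop the first row of dataFile by position
        Pair.of(dataFile.path(), 1L)); // and the second

Pair<DeleteFile, CharSequenceSet> posDeletes =
    FileHelpers.writeDeleteFile(
        table, Files.localOutput(temp.newFile()), TestHelpers.Row.of(0), deletes);

table
    .newRowDelta()
    .addDeletes(posDeletes.first())               // register the positional delete file
    .validateDataFilesExist(posDeletes.second())  // referenced data files must still exist
    .commit();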
- List> deletes = Lists.newArrayList( - Pair.of(dataFile.path(), 0L), // id = 29 - Pair.of(dataFile.path(), 1L), // id = 43 - Pair.of(dataFile.path(), 2L), // id = 61 - Pair.of(dataFile.path(), 3L) // id = 89 - ); - - Pair posDeletes = FileHelpers.writeDeleteFile( - table, Files.localOutput(temp.newFile()), TestHelpers.Row.of(0), deletes); - - table.newRowDelta() + // read.parquet.vectorization.batch-size is set to 4, so the 4 rows in the first batch are all + // deleted. + List> deletes = + Lists.newArrayList( + Pair.of(dataFile.path(), 0L), // id = 29 + Pair.of(dataFile.path(), 1L), // id = 43 + Pair.of(dataFile.path(), 2L), // id = 61 + Pair.of(dataFile.path(), 3L) // id = 89 + ); + + Pair posDeletes = + FileHelpers.writeDeleteFile( + table, Files.localOutput(temp.newFile()), TestHelpers.Row.of(0), deletes); + + table + .newRowDelta() .addDeletes(posDeletes.first()) .validateDataFilesExist(posDeletes.second()) .commit(); StructLikeSet expected = expectedRowSet(29, 43, 61, 89); - StructLikeSet actual = rowSet(tableName, PROJECTION_SCHEMA.asStruct(), "id", "data", "_deleted"); + StructLikeSet actual = + rowSet(tableName, PROJECTION_SCHEMA.asStruct(), "id", "data", "_deleted"); Assert.assertEquals("Table should contain expected row", expected, actual); } @@ -303,21 +340,26 @@ public void testEqualityDeleteWithDeletedColumn() throws IOException { String tableName = table.name().substring(table.name().lastIndexOf(".") + 1); Schema deleteRowSchema = table.schema().select("data"); Record dataDelete = GenericRecord.create(deleteRowSchema); - List dataDeletes = Lists.newArrayList( - dataDelete.copy("data", "a"), // id = 29 - dataDelete.copy("data", "d"), // id = 89 - dataDelete.copy("data", "g") // id = 122 - ); - - DeleteFile eqDeletes = FileHelpers.writeDeleteFile( - table, Files.localOutput(temp.newFile()), TestHelpers.Row.of(0), dataDeletes, deleteRowSchema); - - table.newRowDelta() - .addDeletes(eqDeletes) - .commit(); + List dataDeletes = + Lists.newArrayList( + dataDelete.copy("data", "a"), // id = 29 + dataDelete.copy("data", "d"), // id = 89 + dataDelete.copy("data", "g") // id = 122 + ); + + DeleteFile eqDeletes = + FileHelpers.writeDeleteFile( + table, + Files.localOutput(temp.newFile()), + TestHelpers.Row.of(0), + dataDeletes, + deleteRowSchema); + + table.newRowDelta().addDeletes(eqDeletes).commit(); StructLikeSet expected = expectedRowSet(29, 89, 122); - StructLikeSet actual = rowSet(tableName, PROJECTION_SCHEMA.asStruct(), "id", "data", "_deleted"); + StructLikeSet actual = + rowSet(tableName, PROJECTION_SCHEMA.asStruct(), "id", "data", "_deleted"); Assert.assertEquals("Table should contain expected row", expected, actual); } @@ -326,48 +368,61 @@ public void testEqualityDeleteWithDeletedColumn() throws IOException { public void testMixedPosAndEqDeletesWithDeletedColumn() throws IOException { Schema dataSchema = table.schema().select("data"); Record dataDelete = GenericRecord.create(dataSchema); - List dataDeletes = Lists.newArrayList( - dataDelete.copy("data", "a"), // id = 29 - dataDelete.copy("data", "d"), // id = 89 - dataDelete.copy("data", "g") // id = 122 - ); - - DeleteFile eqDeletes = FileHelpers.writeDeleteFile( - table, Files.localOutput(temp.newFile()), TestHelpers.Row.of(0), dataDeletes, dataSchema); - - List> deletes = Lists.newArrayList( - Pair.of(dataFile.path(), 3L), // id = 89 - Pair.of(dataFile.path(), 5L) // id = 121 - ); - - Pair posDeletes = FileHelpers.writeDeleteFile( - table, Files.localOutput(temp.newFile()), TestHelpers.Row.of(0), deletes); - - 
table.newRowDelta() + List dataDeletes = + Lists.newArrayList( + dataDelete.copy("data", "a"), // id = 29 + dataDelete.copy("data", "d"), // id = 89 + dataDelete.copy("data", "g") // id = 122 + ); + + DeleteFile eqDeletes = + FileHelpers.writeDeleteFile( + table, + Files.localOutput(temp.newFile()), + TestHelpers.Row.of(0), + dataDeletes, + dataSchema); + + List> deletes = + Lists.newArrayList( + Pair.of(dataFile.path(), 3L), // id = 89 + Pair.of(dataFile.path(), 5L) // id = 121 + ); + + Pair posDeletes = + FileHelpers.writeDeleteFile( + table, Files.localOutput(temp.newFile()), TestHelpers.Row.of(0), deletes); + + table + .newRowDelta() .addDeletes(eqDeletes) .addDeletes(posDeletes.first()) .validateDataFilesExist(posDeletes.second()) .commit(); StructLikeSet expected = expectedRowSet(29, 89, 121, 122); - StructLikeSet actual = rowSet(tableName, PROJECTION_SCHEMA.asStruct(), "id", "data", "_deleted"); + StructLikeSet actual = + rowSet(tableName, PROJECTION_SCHEMA.asStruct(), "id", "data", "_deleted"); Assert.assertEquals("Table should contain expected row", expected, actual); } @Test public void testFilterOnDeletedMetadataColumn() throws IOException { - List> deletes = Lists.newArrayList( - Pair.of(dataFile.path(), 0L), // id = 29 - Pair.of(dataFile.path(), 1L), // id = 43 - Pair.of(dataFile.path(), 2L), // id = 61 - Pair.of(dataFile.path(), 3L) // id = 89 - ); - - Pair posDeletes = FileHelpers.writeDeleteFile( - table, Files.localOutput(temp.newFile()), TestHelpers.Row.of(0), deletes); - - table.newRowDelta() + List> deletes = + Lists.newArrayList( + Pair.of(dataFile.path(), 0L), // id = 29 + Pair.of(dataFile.path(), 1L), // id = 43 + Pair.of(dataFile.path(), 2L), // id = 61 + Pair.of(dataFile.path(), 3L) // id = 89 + ); + + Pair posDeletes = + FileHelpers.writeDeleteFile( + table, Files.localOutput(temp.newFile()), TestHelpers.Row.of(0), deletes); + + table + .newRowDelta() .addDeletes(posDeletes.first()) .validateDataFilesExist(posDeletes.second()) .commit(); @@ -375,35 +430,43 @@ public void testFilterOnDeletedMetadataColumn() throws IOException { StructLikeSet expected = expectedRowSetWithNonDeletesOnly(29, 43, 61, 89); // get non-deleted rows - Dataset df = spark.read() - .format("iceberg") - .load(TableIdentifier.of("default", tableName).toString()) - .select("id", "data", "_deleted") - .filter("_deleted = false"); + Dataset df = + spark + .read() + .format("iceberg") + .load(TableIdentifier.of("default", tableName).toString()) + .select("id", "data", "_deleted") + .filter("_deleted = false"); Types.StructType projection = PROJECTION_SCHEMA.asStruct(); StructLikeSet actual = StructLikeSet.create(projection); - df.collectAsList().forEach(row -> { - SparkStructLike rowWrapper = new SparkStructLike(projection); - actual.add(rowWrapper.wrap(row)); - }); + df.collectAsList() + .forEach( + row -> { + SparkStructLike rowWrapper = new SparkStructLike(projection); + actual.add(rowWrapper.wrap(row)); + }); Assert.assertEquals("Table should contain expected row", expected, actual); StructLikeSet expectedDeleted = expectedRowSetWithDeletesOnly(29, 43, 61, 89); // get deleted rows - df = spark.read() - .format("iceberg") - .load(TableIdentifier.of("default", tableName).toString()) - .select("id", "data", "_deleted") - .filter("_deleted = true"); + df = + spark + .read() + .format("iceberg") + .load(TableIdentifier.of("default", tableName).toString()) + .select("id", "data", "_deleted") + .filter("_deleted = true"); StructLikeSet actualDeleted = StructLikeSet.create(projection); - 
df.collectAsList().forEach(row -> { - SparkStructLike rowWrapper = new SparkStructLike(projection); - actualDeleted.add(rowWrapper.wrap(row)); - }); + df.collectAsList() + .forEach( + row -> { + SparkStructLike rowWrapper = new SparkStructLike(projection); + actualDeleted.add(rowWrapper.wrap(row)); + }); Assert.assertEquals("Table should contain expected row", expectedDeleted, actualDeleted); } @@ -411,15 +474,16 @@ public void testFilterOnDeletedMetadataColumn() throws IOException { @Test public void testIsDeletedColumnWithoutDeleteFile() { StructLikeSet expected = expectedRowSet(); - StructLikeSet actual = rowSet(tableName, PROJECTION_SCHEMA.asStruct(), "id", "data", "_deleted"); + StructLikeSet actual = + rowSet(tableName, PROJECTION_SCHEMA.asStruct(), "id", "data", "_deleted"); Assert.assertEquals("Table should contain expected row", expected, actual); } - private static final Schema PROJECTION_SCHEMA = new Schema( - required(1, "id", Types.IntegerType.get()), - required(2, "data", Types.StringType.get()), - MetadataColumns.IS_DELETED - ); + private static final Schema PROJECTION_SCHEMA = + new Schema( + required(1, "id", Types.IntegerType.get()), + required(2, "data", Types.StringType.get()), + MetadataColumns.IS_DELETED); private static StructLikeSet expectedRowSet(int... idsToRemove) { return expectedRowSet(false, false, idsToRemove); @@ -433,21 +497,24 @@ private static StructLikeSet expectedRowSetWithNonDeletesOnly(int... idsToRemove return expectedRowSet(true, false, idsToRemove); } - private static StructLikeSet expectedRowSet(boolean removeDeleted, boolean removeNonDeleted, int... idsToRemove) { + private static StructLikeSet expectedRowSet( + boolean removeDeleted, boolean removeNonDeleted, int... idsToRemove) { Set deletedIds = Sets.newHashSet(ArrayUtil.toIntList(idsToRemove)); List records = recordsWithDeletedColumn(); // mark rows deleted - records.forEach(record -> { - if (deletedIds.contains(record.getField("id"))) { - record.setField(MetadataColumns.IS_DELETED.name(), true); - } - }); + records.forEach( + record -> { + if (deletedIds.contains(record.getField("id"))) { + record.setField(MetadataColumns.IS_DELETED.name(), true); + } + }); records.removeIf(record -> deletedIds.contains(record.getField("id")) && removeDeleted); records.removeIf(record -> !deletedIds.contains(record.getField("id")) && removeNonDeleted); StructLikeSet set = StructLikeSet.create(PROJECTION_SCHEMA.asStruct()); - records.forEach(record -> set.add(new InternalRecordWrapper(PROJECTION_SCHEMA.asStruct()).wrap(record))); + records.forEach( + record -> set.add(new InternalRecordWrapper(PROJECTION_SCHEMA.asStruct()).wrap(record))); return set; } diff --git a/spark/v3.2/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkReaderWithBloomFilter.java b/spark/v3.2/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkReaderWithBloomFilter.java index 9cf8a1f15071..2723ba0a6275 100644 --- a/spark/v3.2/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkReaderWithBloomFilter.java +++ b/spark/v3.2/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkReaderWithBloomFilter.java @@ -16,9 +16,15 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.spark.source; +import static org.apache.hadoop.hive.conf.HiveConf.ConfVars.METASTOREURIS; +import static org.apache.iceberg.TableProperties.DEFAULT_FILE_FORMAT; +import static org.apache.iceberg.TableProperties.DEFAULT_FILE_FORMAT_DEFAULT; +import static org.apache.iceberg.TableProperties.PARQUET_BLOOM_FILTER_COLUMN_ENABLED_PREFIX; +import static org.apache.iceberg.TableProperties.PARQUET_ROW_GROUP_SIZE_BYTES; +import static org.apache.iceberg.TableProperties.PARQUET_ROW_GROUP_SIZE_BYTES_DEFAULT; + import java.io.Closeable; import java.io.IOException; import java.math.BigDecimal; @@ -68,13 +74,6 @@ import org.junit.runner.RunWith; import org.junit.runners.Parameterized; -import static org.apache.hadoop.hive.conf.HiveConf.ConfVars.METASTOREURIS; -import static org.apache.iceberg.TableProperties.DEFAULT_FILE_FORMAT; -import static org.apache.iceberg.TableProperties.DEFAULT_FILE_FORMAT_DEFAULT; -import static org.apache.iceberg.TableProperties.PARQUET_BLOOM_FILTER_COLUMN_ENABLED_PREFIX; -import static org.apache.iceberg.TableProperties.PARQUET_ROW_GROUP_SIZE_BYTES; -import static org.apache.iceberg.TableProperties.PARQUET_ROW_GROUP_SIZE_BYTES_DEFAULT; - @RunWith(Parameterized.class) public class TestSparkReaderWithBloomFilter { @@ -95,18 +94,18 @@ public TestSparkReaderWithBloomFilter(boolean vectorized, boolean useBloomFilter } // Schema passed to create tables - public static final Schema SCHEMA = new Schema( - Types.NestedField.required(1, "id", Types.IntegerType.get()), - Types.NestedField.required(2, "id_long", Types.LongType.get()), - Types.NestedField.required(3, "id_double", Types.DoubleType.get()), - Types.NestedField.required(4, "id_float", Types.FloatType.get()), - Types.NestedField.required(5, "id_string", Types.StringType.get()), - Types.NestedField.optional(6, "id_boolean", Types.BooleanType.get()), - Types.NestedField.optional(7, "id_date", Types.DateType.get()), - Types.NestedField.optional(8, "id_int_decimal", Types.DecimalType.of(8, 2)), - Types.NestedField.optional(9, "id_long_decimal", Types.DecimalType.of(14, 2)), - Types.NestedField.optional(10, "id_fixed_decimal", Types.DecimalType.of(31, 2)) - ); + public static final Schema SCHEMA = + new Schema( + Types.NestedField.required(1, "id", Types.IntegerType.get()), + Types.NestedField.required(2, "id_long", Types.LongType.get()), + Types.NestedField.required(3, "id_double", Types.DoubleType.get()), + Types.NestedField.required(4, "id_float", Types.FloatType.get()), + Types.NestedField.required(5, "id_string", Types.StringType.get()), + Types.NestedField.optional(6, "id_boolean", Types.BooleanType.get()), + Types.NestedField.optional(7, "id_date", Types.DateType.get()), + Types.NestedField.optional(8, "id_int_decimal", Types.DecimalType.of(8, 2)), + Types.NestedField.optional(9, "id_long_decimal", Types.DecimalType.of(14, 2)), + Types.NestedField.optional(10, "id_fixed_decimal", Types.DecimalType.of(31, 2))); private static final int INT_MIN_VALUE = 30; private static final int INT_MAX_VALUE = 329; @@ -116,8 +115,7 @@ public TestSparkReaderWithBloomFilter(boolean vectorized, boolean useBloomFilter private static final float FLOAT_BASE = 100000F; private static final String BINARY_PREFIX = "BINARY测试_"; - @Rule - public TemporaryFolder temp = new TemporaryFolder(); + @Rule public TemporaryFolder temp = new TemporaryFolder(); @Before public void writeTestDataFile() throws IOException { @@ -129,24 +127,34 @@ public void writeTestDataFile() throws IOException { GenericRecord record = 
GenericRecord.create(table.schema()); for (int i = 0; i < INT_VALUE_COUNT; i += 1) { - records.add(record.copy(ImmutableMap.of( - "id", INT_MIN_VALUE + i, - "id_long", LONG_BASE + INT_MIN_VALUE + i, - "id_double", DOUBLE_BASE + INT_MIN_VALUE + i, - "id_float", FLOAT_BASE + INT_MIN_VALUE + i, - "id_string", BINARY_PREFIX + (INT_MIN_VALUE + i), - "id_boolean", (i % 2 == 0) ? true : false, - "id_date", LocalDate.parse("2021-09-05"), - "id_int_decimal", new BigDecimal(String.valueOf(77.77)), - "id_long_decimal", new BigDecimal(String.valueOf(88.88)), - "id_fixed_decimal", new BigDecimal(String.valueOf(99.99))))); + records.add( + record.copy( + ImmutableMap.of( + "id", + INT_MIN_VALUE + i, + "id_long", + LONG_BASE + INT_MIN_VALUE + i, + "id_double", + DOUBLE_BASE + INT_MIN_VALUE + i, + "id_float", + FLOAT_BASE + INT_MIN_VALUE + i, + "id_string", + BINARY_PREFIX + (INT_MIN_VALUE + i), + "id_boolean", + (i % 2 == 0) ? true : false, + "id_date", + LocalDate.parse("2021-09-05"), + "id_int_decimal", + new BigDecimal(String.valueOf(77.77)), + "id_long_decimal", + new BigDecimal(String.valueOf(88.88)), + "id_fixed_decimal", + new BigDecimal(String.valueOf(99.99))))); } this.dataFile = writeDataFile(Files.localOutput(temp.newFile()), Row.of(0), records); - table.newAppend() - .appendFile(dataFile) - .commit(); + table.newAppend().appendFile(dataFile).commit(); } @After @@ -156,9 +164,7 @@ public void cleanup() throws IOException { @Parameterized.Parameters(name = "vectorized = {0}, useBloomFilter = {1}") public static Object[][] parameters() { - return new Object[][] { - {false, false}, {true, false}, {false, true}, {true, true} - }; + return new Object[][] {{false, false}, {true, false}, {false, true}, {true, true}}; } @BeforeClass @@ -167,14 +173,17 @@ public static void startMetastoreAndSpark() { metastore.start(); HiveConf hiveConf = metastore.hiveConf(); - spark = SparkSession.builder() - .master("local[2]") - .config("spark.hadoop." + METASTOREURIS.varname, hiveConf.get(METASTOREURIS.varname)) - .enableHiveSupport() - .getOrCreate(); + spark = + SparkSession.builder() + .master("local[2]") + .config("spark.hadoop." 
+ METASTOREURIS.varname, hiveConf.get(METASTOREURIS.varname)) + .enableHiveSupport() + .getOrCreate(); - catalog = (HiveCatalog) - CatalogUtil.loadCatalog(HiveCatalog.class.getName(), "hive", ImmutableMap.of(), hiveConf); + catalog = + (HiveCatalog) + CatalogUtil.loadCatalog( + HiveCatalog.class.getName(), "hive", ImmutableMap.of(), hiveConf); try { catalog.createNamespace(Namespace.of("default")); @@ -199,7 +208,8 @@ protected void createTable(String name, Schema schema) { ops.commit(meta, meta.upgradeToFormatVersion(2)); if (useBloomFilter) { - table.updateProperties() + table + .updateProperties() .set(PARQUET_BLOOM_FILTER_COLUMN_ENABLED_PREFIX + "id", "true") .set(PARQUET_BLOOM_FILTER_COLUMN_ENABLED_PREFIX + "id_long", "true") .set(PARQUET_BLOOM_FILTER_COLUMN_ENABLED_PREFIX + "id_double", "true") @@ -213,11 +223,13 @@ protected void createTable(String name, Schema schema) { .commit(); } - table.updateProperties() - .set(TableProperties.PARQUET_ROW_GROUP_SIZE_BYTES, "100") // to have multiple row groups + table + .updateProperties() + .set(TableProperties.PARQUET_ROW_GROUP_SIZE_BYTES, "100") // to have multiple row groups .commit(); if (vectorized) { - table.updateProperties() + table + .updateProperties() .set(TableProperties.PARQUET_VECTORIZATION_ENABLED, "true") .set(TableProperties.PARQUET_BATCH_SIZE, "4") .commit(); @@ -233,39 +245,74 @@ private DataFile writeDataFile(OutputFile out, StructLike partition, List writer = factory.newAppender(out, format); @@ -290,13 +337,16 @@ private FileFormat defaultFormat(Map properties) { @Test public void testReadWithFilter() { - Dataset df = spark.read() - .format("iceberg") - .load(TableIdentifier.of("default", tableName).toString()) - // this is from the first row group - .filter("id = 30 AND id_long = 1030 AND id_double = 10030.0 AND id_float = 100030.0" + - " AND id_string = 'BINARY测试_30' AND id_boolean = true AND id_date = '2021-09-05'" + - " AND id_int_decimal = 77.77 AND id_long_decimal = 88.88 AND id_fixed_decimal = 99.99"); + Dataset df = + spark + .read() + .format("iceberg") + .load(TableIdentifier.of("default", tableName).toString()) + // this is from the first row group + .filter( + "id = 30 AND id_long = 1030 AND id_double = 10030.0 AND id_float = 100030.0" + + " AND id_string = 'BINARY测试_30' AND id_boolean = true AND id_date = '2021-09-05'" + + " AND id_int_decimal = 77.77 AND id_long_decimal = 88.88 AND id_fixed_decimal = 99.99"); Record record = SparkValueConverter.convert(table.schema(), df.collectAsList().get(0)); @@ -304,13 +354,16 @@ public void testReadWithFilter() { Assert.assertEquals("Table should contain expected rows", record.get(0), 30); - df = spark.read() - .format("iceberg") - .load(TableIdentifier.of("default", tableName).toString()) - // this is from the third row group - .filter("id = 250 AND id_long = 1250 AND id_double = 10250.0 AND id_float = 100250.0" + - " AND id_string = 'BINARY测试_250' AND id_boolean = true AND id_date = '2021-09-05'" + - " AND id_int_decimal = 77.77 AND id_long_decimal = 88.88 AND id_fixed_decimal = 99.99"); + df = + spark + .read() + .format("iceberg") + .load(TableIdentifier.of("default", tableName).toString()) + // this is from the third row group + .filter( + "id = 250 AND id_long = 1250 AND id_double = 10250.0 AND id_float = 100250.0" + + " AND id_string = 'BINARY测试_250' AND id_boolean = true AND id_date = '2021-09-05'" + + " AND id_int_decimal = 77.77 AND id_long_decimal = 88.88 AND id_fixed_decimal = 99.99"); record = SparkValueConverter.convert(table.schema(), 
df.collectAsList().get(0)); diff --git a/spark/v3.2/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkRollingFileWriters.java b/spark/v3.2/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkRollingFileWriters.java index 9023195dcc6a..dcf9140a8885 100644 --- a/spark/v3.2/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkRollingFileWriters.java +++ b/spark/v3.2/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkRollingFileWriters.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.source; import java.util.List; @@ -36,9 +35,11 @@ public TestSparkRollingFileWriters(FileFormat fileFormat, boolean partitioned) { } @Override - protected FileWriterFactory newWriterFactory(Schema dataSchema, List equalityFieldIds, - Schema equalityDeleteRowSchema, - Schema positionDeleteRowSchema) { + protected FileWriterFactory newWriterFactory( + Schema dataSchema, + List equalityFieldIds, + Schema equalityDeleteRowSchema, + Schema positionDeleteRowSchema) { return SparkFileWriterFactory.builderFor(table) .dataSchema(table.schema()) .dataFileFormat(format()) diff --git a/spark/v3.2/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkTable.java b/spark/v3.2/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkTable.java index 1b4fb5f8ce58..616a196872de 100644 --- a/spark/v3.2/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkTable.java +++ b/spark/v3.2/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkTable.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.source; import java.util.Map; diff --git a/spark/v3.2/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkWriterMetrics.java b/spark/v3.2/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkWriterMetrics.java index 967f394faa74..06ecc20c2fc3 100644 --- a/spark/v3.2/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkWriterMetrics.java +++ b/spark/v3.2/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkWriterMetrics.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.source; import org.apache.iceberg.FileFormat; diff --git a/spark/v3.2/spark/src/test/java/org/apache/iceberg/spark/source/TestStreamingOffset.java b/spark/v3.2/spark/src/test/java/org/apache/iceberg/spark/source/TestStreamingOffset.java index 69302e9d24d7..17370aaa22f2 100644 --- a/spark/v3.2/spark/src/test/java/org/apache/iceberg/spark/source/TestStreamingOffset.java +++ b/spark/v3.2/spark/src/test/java/org/apache/iceberg/spark/source/TestStreamingOffset.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.spark.source; import com.fasterxml.jackson.databind.node.ObjectNode; @@ -29,14 +28,17 @@ public class TestStreamingOffset { @Test public void testJsonConversion() { - StreamingOffset[] expected = new StreamingOffset[]{ - new StreamingOffset(System.currentTimeMillis(), 1L, false), - new StreamingOffset(System.currentTimeMillis(), 2L, false), - new StreamingOffset(System.currentTimeMillis(), 3L, false), - new StreamingOffset(System.currentTimeMillis(), 4L, true) - }; - Assert.assertArrayEquals("StreamingOffsets should match", expected, - Arrays.stream(expected).map(elem -> StreamingOffset.fromJson(elem.json())).toArray()); + StreamingOffset[] expected = + new StreamingOffset[] { + new StreamingOffset(System.currentTimeMillis(), 1L, false), + new StreamingOffset(System.currentTimeMillis(), 2L, false), + new StreamingOffset(System.currentTimeMillis(), 3L, false), + new StreamingOffset(System.currentTimeMillis(), 4L, true) + }; + Assert.assertArrayEquals( + "StreamingOffsets should match", + expected, + Arrays.stream(expected).map(elem -> StreamingOffset.fromJson(elem.json())).toArray()); } @Test diff --git a/spark/v3.2/spark/src/test/java/org/apache/iceberg/spark/source/TestStructuredStreaming.java b/spark/v3.2/spark/src/test/java/org/apache/iceberg/spark/source/TestStructuredStreaming.java index e931f554cf5a..610d3075855b 100644 --- a/spark/v3.2/spark/src/test/java/org/apache/iceberg/spark/source/TestStructuredStreaming.java +++ b/spark/v3.2/spark/src/test/java/org/apache/iceberg/spark/source/TestStructuredStreaming.java @@ -16,9 +16,10 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.source; +import static org.apache.iceberg.types.Types.NestedField.optional; + import java.io.File; import java.util.List; import org.apache.hadoop.conf.Configuration; @@ -49,28 +50,24 @@ import scala.Option; import scala.collection.JavaConverters; -import static org.apache.iceberg.types.Types.NestedField.optional; - public class TestStructuredStreaming { private static final Configuration CONF = new Configuration(); - private static final Schema SCHEMA = new Schema( - optional(1, "id", Types.IntegerType.get()), - optional(2, "data", Types.StringType.get()) - ); + private static final Schema SCHEMA = + new Schema( + optional(1, "id", Types.IntegerType.get()), optional(2, "data", Types.StringType.get())); private static SparkSession spark = null; - @Rule - public TemporaryFolder temp = new TemporaryFolder(); - @Rule - public ExpectedException exceptionRule = ExpectedException.none(); + @Rule public TemporaryFolder temp = new TemporaryFolder(); + @Rule public ExpectedException exceptionRule = ExpectedException.none(); @BeforeClass public static void startSpark() { - TestStructuredStreaming.spark = SparkSession.builder() - .master("local[2]") - .config("spark.sql.shuffle.partitions", 4) - .getOrCreate(); + TestStructuredStreaming.spark = + SparkSession.builder() + .master("local[2]") + .config("spark.sql.shuffle.partitions", 4) + .getOrCreate(); } @AfterClass @@ -90,21 +87,23 @@ public void testStreamingWriteAppendMode() throws Exception { PartitionSpec spec = PartitionSpec.builderFor(SCHEMA).identity("data").build(); Table table = tables.create(SCHEMA, spec, location.toString()); - List expected = Lists.newArrayList( - new SimpleRecord(1, "1"), - new SimpleRecord(2, "2"), - new SimpleRecord(3, "3"), - new SimpleRecord(4, "4") - ); + List expected = + Lists.newArrayList( + new SimpleRecord(1, "1"), + new 
SimpleRecord(2, "2"), + new SimpleRecord(3, "3"), + new SimpleRecord(4, "4")); MemoryStream inputStream = newMemoryStream(1, spark.sqlContext(), Encoders.INT()); - DataStreamWriter streamWriter = inputStream.toDF() - .selectExpr("value AS id", "CAST (value AS STRING) AS data") - .writeStream() - .outputMode("append") - .format("iceberg") - .option("checkpointLocation", checkpoint.toString()) - .option("path", location.toString()); + DataStreamWriter streamWriter = + inputStream + .toDF() + .selectExpr("value AS id", "CAST (value AS STRING) AS data") + .writeStream() + .outputMode("append") + .format("iceberg") + .option("checkpointLocation", checkpoint.toString()) + .option("path", location.toString()); try { // start the original query with checkpointing @@ -126,10 +125,9 @@ public void testStreamingWriteAppendMode() throws Exception { restartedQuery.processAllAvailable(); // ensure the write was idempotent - Dataset result = spark.read() - .format("iceberg") - .load(location.toString()); - List actual = result.orderBy("id").as(Encoders.bean(SimpleRecord.class)).collectAsList(); + Dataset result = spark.read().format("iceberg").load(location.toString()); + List actual = + result.orderBy("id").as(Encoders.bean(SimpleRecord.class)).collectAsList(); Assert.assertEquals("Number of rows should match", expected.size(), actual.size()); Assert.assertEquals("Result rows should match", expected, actual); Assert.assertEquals("Number of snapshots should match", 2, Iterables.size(table.snapshots())); @@ -150,22 +148,22 @@ public void testStreamingWriteCompleteMode() throws Exception { PartitionSpec spec = PartitionSpec.builderFor(SCHEMA).identity("data").build(); Table table = tables.create(SCHEMA, spec, location.toString()); - List expected = Lists.newArrayList( - new SimpleRecord(2, "1"), - new SimpleRecord(3, "2"), - new SimpleRecord(1, "3") - ); + List expected = + Lists.newArrayList( + new SimpleRecord(2, "1"), new SimpleRecord(3, "2"), new SimpleRecord(1, "3")); MemoryStream inputStream = newMemoryStream(1, spark.sqlContext(), Encoders.INT()); - DataStreamWriter streamWriter = inputStream.toDF() - .groupBy("value") - .count() - .selectExpr("CAST(count AS INT) AS id", "CAST (value AS STRING) AS data") - .writeStream() - .outputMode("complete") - .format("iceberg") - .option("checkpointLocation", checkpoint.toString()) - .option("path", location.toString()); + DataStreamWriter streamWriter = + inputStream + .toDF() + .groupBy("value") + .count() + .selectExpr("CAST(count AS INT) AS id", "CAST (value AS STRING) AS data") + .writeStream() + .outputMode("complete") + .format("iceberg") + .option("checkpointLocation", checkpoint.toString()) + .option("path", location.toString()); try { // start the original query with checkpointing @@ -187,10 +185,9 @@ public void testStreamingWriteCompleteMode() throws Exception { restartedQuery.processAllAvailable(); // ensure the write was idempotent - Dataset result = spark.read() - .format("iceberg") - .load(location.toString()); - List actual = result.orderBy("data").as(Encoders.bean(SimpleRecord.class)).collectAsList(); + Dataset result = spark.read().format("iceberg").load(location.toString()); + List actual = + result.orderBy("data").as(Encoders.bean(SimpleRecord.class)).collectAsList(); Assert.assertEquals("Number of rows should match", expected.size(), actual.size()); Assert.assertEquals("Result rows should match", expected, actual); Assert.assertEquals("Number of snapshots should match", 2, Iterables.size(table.snapshots())); @@ -211,22 +208,22 @@ public 
void testStreamingWriteCompleteModeWithProjection() throws Exception { PartitionSpec spec = PartitionSpec.unpartitioned(); Table table = tables.create(SCHEMA, spec, location.toString()); - List expected = Lists.newArrayList( - new SimpleRecord(1, null), - new SimpleRecord(2, null), - new SimpleRecord(3, null) - ); + List expected = + Lists.newArrayList( + new SimpleRecord(1, null), new SimpleRecord(2, null), new SimpleRecord(3, null)); MemoryStream inputStream = newMemoryStream(1, spark.sqlContext(), Encoders.INT()); - DataStreamWriter streamWriter = inputStream.toDF() - .groupBy("value") - .count() - .selectExpr("CAST(count AS INT) AS id") // select only id column - .writeStream() - .outputMode("complete") - .format("iceberg") - .option("checkpointLocation", checkpoint.toString()) - .option("path", location.toString()); + DataStreamWriter streamWriter = + inputStream + .toDF() + .groupBy("value") + .count() + .selectExpr("CAST(count AS INT) AS id") // select only id column + .writeStream() + .outputMode("complete") + .format("iceberg") + .option("checkpointLocation", checkpoint.toString()) + .option("path", location.toString()); try { // start the original query with checkpointing @@ -248,10 +245,9 @@ public void testStreamingWriteCompleteModeWithProjection() throws Exception { restartedQuery.processAllAvailable(); // ensure the write was idempotent - Dataset result = spark.read() - .format("iceberg") - .load(location.toString()); - List actual = result.orderBy("id").as(Encoders.bean(SimpleRecord.class)).collectAsList(); + Dataset result = spark.read().format("iceberg").load(location.toString()); + List actual = + result.orderBy("id").as(Encoders.bean(SimpleRecord.class)).collectAsList(); Assert.assertEquals("Number of rows should match", expected.size(), actual.size()); Assert.assertEquals("Result rows should match", expected, actual); Assert.assertEquals("Number of snapshots should match", 2, Iterables.size(table.snapshots())); @@ -275,13 +271,15 @@ public void testStreamingWriteUpdateMode() throws Exception { tables.create(SCHEMA, spec, location.toString()); MemoryStream inputStream = newMemoryStream(1, spark.sqlContext(), Encoders.INT()); - DataStreamWriter streamWriter = inputStream.toDF() - .selectExpr("value AS id", "CAST (value AS STRING) AS data") - .writeStream() - .outputMode("update") - .format("iceberg") - .option("checkpointLocation", checkpoint.toString()) - .option("path", location.toString()); + DataStreamWriter streamWriter = + inputStream + .toDF() + .selectExpr("value AS id", "CAST (value AS STRING) AS data") + .writeStream() + .outputMode("update") + .format("iceberg") + .option("checkpointLocation", checkpoint.toString()) + .option("path", location.toString()); try { StreamingQuery query = streamWriter.start(); diff --git a/spark/v3.2/spark/src/test/java/org/apache/iceberg/spark/source/TestStructuredStreamingRead3.java b/spark/v3.2/spark/src/test/java/org/apache/iceberg/spark/source/TestStructuredStreamingRead3.java index e609412f8be0..23fdfb09cb83 100644 --- a/spark/v3.2/spark/src/test/java/org/apache/iceberg/spark/source/TestStructuredStreamingRead3.java +++ b/spark/v3.2/spark/src/test/java/org/apache/iceberg/spark/source/TestStructuredStreamingRead3.java @@ -16,9 +16,10 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.spark.source; +import static org.apache.iceberg.expressions.Expressions.ref; + import java.io.File; import java.util.Collections; import java.util.List; @@ -58,8 +59,6 @@ import org.junit.runner.RunWith; import org.junit.runners.Parameterized; -import static org.apache.iceberg.expressions.Expressions.ref; - @RunWith(Parameterized.class) public final class TestStructuredStreamingRead3 extends SparkCatalogTestBase { public TestStructuredStreamingRead3( @@ -70,59 +69,49 @@ public TestStructuredStreamingRead3( private Table table; /** - * test data to be used by multiple writes - * each write creates a snapshot and writes a list of records + * test data to be used by multiple writes each write creates a snapshot and writes a list of + * records */ - private static final List> TEST_DATA_MULTIPLE_SNAPSHOTS = Lists.newArrayList( - Lists.newArrayList( - new SimpleRecord(1, "one"), - new SimpleRecord(2, "two"), - new SimpleRecord(3, "three")), + private static final List> TEST_DATA_MULTIPLE_SNAPSHOTS = Lists.newArrayList( - new SimpleRecord(4, "four"), - new SimpleRecord(5, "five")), - Lists.newArrayList( - new SimpleRecord(6, "six"), - new SimpleRecord(7, "seven"))); + Lists.newArrayList( + new SimpleRecord(1, "one"), new SimpleRecord(2, "two"), new SimpleRecord(3, "three")), + Lists.newArrayList(new SimpleRecord(4, "four"), new SimpleRecord(5, "five")), + Lists.newArrayList(new SimpleRecord(6, "six"), new SimpleRecord(7, "seven"))); /** - * test data - to be used for multiple write batches - * each batch inturn will have multiple snapshots + * test data - to be used for multiple write batches each batch inturn will have multiple + * snapshots */ - private static final List>> TEST_DATA_MULTIPLE_WRITES_MULTIPLE_SNAPSHOTS = Lists.newArrayList( - Lists.newArrayList( - Lists.newArrayList( - new SimpleRecord(1, "one"), - new SimpleRecord(2, "two"), - new SimpleRecord(3, "three")), - Lists.newArrayList( - new SimpleRecord(4, "four"), - new SimpleRecord(5, "five"))), + private static final List>> TEST_DATA_MULTIPLE_WRITES_MULTIPLE_SNAPSHOTS = Lists.newArrayList( Lists.newArrayList( - new SimpleRecord(6, "six"), - new SimpleRecord(7, "seven")), + Lists.newArrayList( + new SimpleRecord(1, "one"), + new SimpleRecord(2, "two"), + new SimpleRecord(3, "three")), + Lists.newArrayList(new SimpleRecord(4, "four"), new SimpleRecord(5, "five"))), Lists.newArrayList( - new SimpleRecord(8, "eight"), - new SimpleRecord(9, "nine"))), - Lists.newArrayList( + Lists.newArrayList(new SimpleRecord(6, "six"), new SimpleRecord(7, "seven")), + Lists.newArrayList(new SimpleRecord(8, "eight"), new SimpleRecord(9, "nine"))), Lists.newArrayList( - new SimpleRecord(10, "ten"), - new SimpleRecord(11, "eleven"), - new SimpleRecord(12, "twelve")), - Lists.newArrayList( - new SimpleRecord(13, "thirteen"), - new SimpleRecord(14, "fourteen")), - Lists.newArrayList( - new SimpleRecord(15, "fifteen"), - new SimpleRecord(16, "sixteen")))); + Lists.newArrayList( + new SimpleRecord(10, "ten"), + new SimpleRecord(11, "eleven"), + new SimpleRecord(12, "twelve")), + Lists.newArrayList( + new SimpleRecord(13, "thirteen"), new SimpleRecord(14, "fourteen")), + Lists.newArrayList( + new SimpleRecord(15, "fifteen"), new SimpleRecord(16, "sixteen")))); @Before public void setupTable() { - sql("CREATE TABLE %s " + - "(id INT, data STRING) " + - "USING iceberg " + - "PARTITIONED BY (bucket(3, id))", tableName); + sql( + "CREATE TABLE %s " + + "(id INT, data STRING) " + + "USING iceberg " + + "PARTITIONED BY (bucket(3, 
id))", + tableName); this.table = validationCatalog.loadTable(tableIdent); } @@ -163,17 +152,19 @@ public void testReadStreamOnIcebergThenAddData() throws Exception { @Test public void testReadingStreamFromTimestamp() throws Exception { - List dataBeforeTimestamp = Lists.newArrayList( - new SimpleRecord(-2, "minustwo"), - new SimpleRecord(-1, "minusone"), - new SimpleRecord(0, "zero")); + List dataBeforeTimestamp = + Lists.newArrayList( + new SimpleRecord(-2, "minustwo"), + new SimpleRecord(-1, "minusone"), + new SimpleRecord(0, "zero")); appendData(dataBeforeTimestamp); table.refresh(); long streamStartTimestamp = table.currentSnapshot().timestampMillis() + 1; - StreamingQuery query = startStream(SparkReadOptions.STREAM_FROM_TIMESTAMP, Long.toString(streamStartTimestamp)); + StreamingQuery query = + startStream(SparkReadOptions.STREAM_FROM_TIMESTAMP, Long.toString(streamStartTimestamp)); List empty = rowsAvailable(query); Assertions.assertThat(empty.isEmpty()).isTrue(); @@ -190,21 +181,25 @@ public void testReadingStreamFromTimestamp() throws Exception { public void testReadingStreamFromFutureTimetsamp() throws Exception { long futureTimestamp = System.currentTimeMillis() + 10000; - StreamingQuery query = startStream(SparkReadOptions.STREAM_FROM_TIMESTAMP, Long.toString(futureTimestamp)); + StreamingQuery query = + startStream(SparkReadOptions.STREAM_FROM_TIMESTAMP, Long.toString(futureTimestamp)); List actual = rowsAvailable(query); Assertions.assertThat(actual.isEmpty()).isTrue(); - List data = Lists.newArrayList( - new SimpleRecord(-2, "minustwo"), - new SimpleRecord(-1, "minusone"), - new SimpleRecord(0, "zero")); + List data = + Lists.newArrayList( + new SimpleRecord(-2, "minustwo"), + new SimpleRecord(-1, "minusone"), + new SimpleRecord(0, "zero")); // Perform several inserts that should not show up because the fromTimestamp has not elapsed - IntStream.range(0, 3).forEach(x -> { - appendData(data); - Assertions.assertThat(rowsAvailable(query).isEmpty()).isTrue(); - }); + IntStream.range(0, 3) + .forEach( + x -> { + appendData(data); + Assertions.assertThat(rowsAvailable(query).isEmpty()).isTrue(); + }); waitUntilAfter(futureTimestamp); @@ -216,16 +211,16 @@ public void testReadingStreamFromFutureTimetsamp() throws Exception { @Test public void testReadingStreamFromTimestampFutureWithExistingSnapshots() throws Exception { - List dataBeforeTimestamp = Lists.newArrayList( - new SimpleRecord(1, "one"), - new SimpleRecord(2, "two"), - new SimpleRecord(3, "three")); + List dataBeforeTimestamp = + Lists.newArrayList( + new SimpleRecord(1, "one"), new SimpleRecord(2, "two"), new SimpleRecord(3, "three")); appendData(dataBeforeTimestamp); long streamStartTimestamp = System.currentTimeMillis() + 2000; // Start the stream with a future timestamp after the current snapshot - StreamingQuery query = startStream(SparkReadOptions.STREAM_FROM_TIMESTAMP, Long.toString(streamStartTimestamp)); + StreamingQuery query = + startStream(SparkReadOptions.STREAM_FROM_TIMESTAMP, Long.toString(streamStartTimestamp)); List actual = rowsAvailable(query); Assert.assertEquals(Collections.emptyList(), actual); @@ -233,7 +228,8 @@ public void testReadingStreamFromTimestampFutureWithExistingSnapshots() throws E waitUntilAfter(streamStartTimestamp); List> expected = TEST_DATA_MULTIPLE_SNAPSHOTS; appendDataAsMultipleSnapshots(expected); - Assertions.assertThat(rowsAvailable(query)).containsExactlyInAnyOrderElementsOf(Iterables.concat(expected)); + Assertions.assertThat(rowsAvailable(query)) + 
.containsExactlyInAnyOrderElementsOf(Iterables.concat(expected)); } @Test @@ -246,7 +242,8 @@ public void testReadingStreamFromTimestampOfExistingSnapshot() throws Exception long firstSnapshotTime = table.currentSnapshot().timestampMillis(); // Start stream giving the first Snapshot's time as the start point - StreamingQuery stream = startStream(SparkReadOptions.STREAM_FROM_TIMESTAMP, Long.toString(firstSnapshotTime)); + StreamingQuery stream = + startStream(SparkReadOptions.STREAM_FROM_TIMESTAMP, Long.toString(firstSnapshotTime)); // Append rest of expected data for (int i = 1; i < expected.size(); i++) { @@ -259,14 +256,11 @@ public void testReadingStreamFromTimestampOfExistingSnapshot() throws Exception @Test public void testReadingStreamWithExpiredSnapshotFromTimestamp() throws TimeoutException { - List firstSnapshotRecordList = Lists.newArrayList( - new SimpleRecord(1, "one")); + List firstSnapshotRecordList = Lists.newArrayList(new SimpleRecord(1, "one")); - List secondSnapshotRecordList = Lists.newArrayList( - new SimpleRecord(2, "two")); + List secondSnapshotRecordList = Lists.newArrayList(new SimpleRecord(2, "two")); - List thirdSnapshotRecordList = Lists.newArrayList( - new SimpleRecord(3, "three")); + List thirdSnapshotRecordList = Lists.newArrayList(new SimpleRecord(3, "three")); List expectedRecordList = Lists.newArrayList(); expectedRecordList.addAll(secondSnapshotRecordList); @@ -277,13 +271,14 @@ public void testReadingStreamWithExpiredSnapshotFromTimestamp() throws TimeoutEx long firstSnapshotid = table.currentSnapshot().snapshotId(); long firstSnapshotCommitTime = table.currentSnapshot().timestampMillis(); - appendData(secondSnapshotRecordList); appendData(thirdSnapshotRecordList); table.expireSnapshots().expireSnapshotId(firstSnapshotid).commit(); - StreamingQuery query = startStream(SparkReadOptions.STREAM_FROM_TIMESTAMP, String.valueOf(firstSnapshotCommitTime)); + StreamingQuery query = + startStream( + SparkReadOptions.STREAM_FROM_TIMESTAMP, String.valueOf(firstSnapshotCommitTime)); List actual = rowsAvailable(query); Assertions.assertThat(actual).containsExactlyInAnyOrderElementsOf(expectedRecordList); } @@ -294,21 +289,24 @@ public void testResumingStreamReadFromCheckpoint() throws Exception { File writerCheckpoint = new File(writerCheckpointFolder, "writer-checkpoint"); File output = temp.newFolder(); - DataStreamWriter querySource = spark.readStream() - .format("iceberg") - .load(tableName) - .writeStream() - .option("checkpointLocation", writerCheckpoint.toString()) - .format("parquet") - .queryName("checkpoint_test") - .option("path", output.getPath()); + DataStreamWriter querySource = + spark + .readStream() + .format("iceberg") + .load(tableName) + .writeStream() + .option("checkpointLocation", writerCheckpoint.toString()) + .format("parquet") + .queryName("checkpoint_test") + .option("path", output.getPath()); StreamingQuery startQuery = querySource.start(); startQuery.processAllAvailable(); startQuery.stop(); List expected = Lists.newArrayList(); - for (List> expectedCheckpoint : TEST_DATA_MULTIPLE_WRITES_MULTIPLE_SNAPSHOTS) { + for (List> expectedCheckpoint : + TEST_DATA_MULTIPLE_WRITES_MULTIPLE_SNAPSHOTS) { // New data was added while the stream was down appendDataAsMultipleSnapshots(expectedCheckpoint); expected.addAll(Lists.newArrayList(Iterables.concat(Iterables.concat(expectedCheckpoint)))); @@ -319,28 +317,23 @@ public void testResumingStreamReadFromCheckpoint() throws Exception { restartedQuery.stop(); // Read data added by the stream - List actual = 
spark.read() - .load(output.getPath()) - .as(Encoders.bean(SimpleRecord.class)) - .collectAsList(); + List actual = + spark.read().load(output.getPath()).as(Encoders.bean(SimpleRecord.class)).collectAsList(); Assertions.assertThat(actual).containsExactlyInAnyOrderElementsOf(Iterables.concat(expected)); } } @Test public void testParquetOrcAvroDataInOneTable() throws Exception { - List parquetFileRecords = Lists.newArrayList( - new SimpleRecord(1, "one"), - new SimpleRecord(2, "two"), - new SimpleRecord(3, "three")); + List parquetFileRecords = + Lists.newArrayList( + new SimpleRecord(1, "one"), new SimpleRecord(2, "two"), new SimpleRecord(3, "three")); - List orcFileRecords = Lists.newArrayList( - new SimpleRecord(4, "four"), - new SimpleRecord(5, "five")); + List orcFileRecords = + Lists.newArrayList(new SimpleRecord(4, "four"), new SimpleRecord(5, "five")); - List avroFileRecords = Lists.newArrayList( - new SimpleRecord(6, "six"), - new SimpleRecord(7, "seven")); + List avroFileRecords = + Lists.newArrayList(new SimpleRecord(6, "six"), new SimpleRecord(7, "seven")); appendData(parquetFileRecords); appendData(orcFileRecords, "orc"); @@ -348,7 +341,8 @@ public void testParquetOrcAvroDataInOneTable() throws Exception { StreamingQuery query = startStream(); Assertions.assertThat(rowsAvailable(query)) - .containsExactlyInAnyOrderElementsOf(Iterables.concat(parquetFileRecords, orcFileRecords, avroFileRecords)); + .containsExactlyInAnyOrderElementsOf( + Iterables.concat(parquetFileRecords, orcFileRecords, avroFileRecords)); } @Test @@ -371,18 +365,23 @@ public void testReadStreamWithSnapshotTypeOverwriteErrorsOut() throws Exception Schema deleteRowSchema = table.schema().select("data"); Record dataDelete = GenericRecord.create(deleteRowSchema); - List dataDeletes = Lists.newArrayList( - dataDelete.copy("data", "one") // id = 1 - ); - - DeleteFile eqDeletes = FileHelpers.writeDeleteFile( - table, Files.localOutput(temp.newFile()), TestHelpers.Row.of(0), dataDeletes, deleteRowSchema); - - table.newRowDelta() - .addDeletes(eqDeletes) - .commit(); - - // check pre-condition - that the above Delete file write - actually resulted in snapshot of type OVERWRITE + List dataDeletes = + Lists.newArrayList( + dataDelete.copy("data", "one") // id = 1 + ); + + DeleteFile eqDeletes = + FileHelpers.writeDeleteFile( + table, + Files.localOutput(temp.newFile()), + TestHelpers.Row.of(0), + dataDeletes, + deleteRowSchema); + + table.newRowDelta().addDeletes(eqDeletes).commit(); + + // check pre-condition - that the above Delete file write - actually resulted in snapshot of + // type OVERWRITE Assert.assertEquals(DataOperations.OVERWRITE, table.currentSnapshot().operation()); StreamingQuery query = startStream(); @@ -391,8 +390,7 @@ public void testReadStreamWithSnapshotTypeOverwriteErrorsOut() throws Exception "Streaming should fail with IllegalStateException, as the snapshot is not of type APPEND", IllegalStateException.class, "Cannot process overwrite snapshot", - () -> query.processAllAvailable() - ); + () -> query.processAllAvailable()); } @Test @@ -402,9 +400,7 @@ public void testReadStreamWithSnapshotTypeReplaceIgnoresReplace() throws Excepti appendDataAsMultipleSnapshots(expected); // this should create a snapshot with type Replace. 
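// rewriteManifests() only rewrites manifest metadata (clusterBy(f -> 1) groups every manifest
// entry into a single cluster), so no data rows are added or removed. The resulting snapshot's
// operation is DataOperations.REPLACE, and, as the test name says, the streaming source is
// expected to ignore such snapshots rather than surface them as micro-batch data.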
- table.rewriteManifests() - .clusterBy(f -> 1) - .commit(); + table.rewriteManifests().clusterBy(f -> 1).commit(); // check pre-condition Assert.assertEquals(DataOperations.REPLACE, table.currentSnapshot().operation()); @@ -416,21 +412,17 @@ public void testReadStreamWithSnapshotTypeReplaceIgnoresReplace() throws Excepti @Test public void testReadStreamWithSnapshotTypeDeleteErrorsOut() throws Exception { - table.updateSpec() - .removeField("id_bucket") - .addField(ref("id")) - .commit(); + table.updateSpec().removeField("id_bucket").addField(ref("id")).commit(); // fill table with some data List> dataAcrossSnapshots = TEST_DATA_MULTIPLE_SNAPSHOTS; appendDataAsMultipleSnapshots(dataAcrossSnapshots); // this should create a snapshot with type delete. - table.newDelete() - .deleteFromRowFilter(Expressions.equal("id", 4)) - .commit(); + table.newDelete().deleteFromRowFilter(Expressions.equal("id", 4)).commit(); - // check pre-condition - that the above delete operation on table resulted in Snapshot of Type DELETE. + // check pre-condition - that the above delete operation on table resulted in Snapshot of Type + // DELETE. Assert.assertEquals(DataOperations.DELETE, table.currentSnapshot().operation()); StreamingQuery query = startStream(); @@ -439,27 +431,22 @@ public void testReadStreamWithSnapshotTypeDeleteErrorsOut() throws Exception { "Streaming should fail with IllegalStateException, as the snapshot is not of type APPEND", IllegalStateException.class, "Cannot process delete snapshot", - () -> query.processAllAvailable() - ); + () -> query.processAllAvailable()); } @Test public void testReadStreamWithSnapshotTypeDeleteAndSkipDeleteOption() throws Exception { - table.updateSpec() - .removeField("id_bucket") - .addField(ref("id")) - .commit(); + table.updateSpec().removeField("id_bucket").addField(ref("id")).commit(); // fill table with some data List> dataAcrossSnapshots = TEST_DATA_MULTIPLE_SNAPSHOTS; appendDataAsMultipleSnapshots(dataAcrossSnapshots); // this should create a snapshot with type delete. - table.newDelete() - .deleteFromRowFilter(Expressions.equal("id", 4)) - .commit(); + table.newDelete().deleteFromRowFilter(Expressions.equal("id", 4)).commit(); - // check pre-condition - that the above delete operation on table resulted in Snapshot of Type DELETE. + // check pre-condition - that the above delete operation on table resulted in Snapshot of Type + // DELETE. Assert.assertEquals(DataOperations.DELETE, table.currentSnapshot().operation()); StreamingQuery query = startStream(SparkReadOptions.STREAMING_SKIP_DELETE_SNAPSHOTS, "true"); @@ -469,21 +456,17 @@ public void testReadStreamWithSnapshotTypeDeleteAndSkipDeleteOption() throws Exc @Test public void testReadStreamWithSnapshotTypeDeleteAndSkipOverwriteOption() throws Exception { - table.updateSpec() - .removeField("id_bucket") - .addField(ref("id")) - .commit(); + table.updateSpec().removeField("id_bucket").addField(ref("id")).commit(); // fill table with some data List> dataAcrossSnapshots = TEST_DATA_MULTIPLE_SNAPSHOTS; appendDataAsMultipleSnapshots(dataAcrossSnapshots); // this should create a snapshot with type overwrite. - table.newOverwrite() - .overwriteByRowFilter(Expressions.greaterThan("id", 4)) - .commit(); + table.newOverwrite().overwriteByRowFilter(Expressions.greaterThan("id", 4)).commit(); - // check pre-condition - that the above delete operation on table resulted in Snapshot of Type OVERWRITE. + // check pre-condition - that the above delete operation on table resulted in Snapshot of Type + // OVERWRITE. 
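// Unlike REPLACE snapshots, DELETE and OVERWRITE snapshots make the stream fail by default with
// IllegalStateException ("Cannot process delete/overwrite snapshot"), as asserted in the tests
// above. The surrounding tests opt out of that behaviour through read options; a sketch using
// the same startStream(key, value) helper defined later in this file:
//
//   startStream(SparkReadOptions.STREAMING_SKIP_DELETE_SNAPSHOTS, "true");
//   startStream(SparkReadOptions.STREAMING_SKIP_OVERWRITE_SNAPSHOTS, "true");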
Assert.assertEquals(DataOperations.OVERWRITE, table.currentSnapshot().operation()); StreamingQuery query = startStream(SparkReadOptions.STREAMING_SKIP_OVERWRITE_SNAPSHOTS, "true"); @@ -492,8 +475,8 @@ public void testReadStreamWithSnapshotTypeDeleteAndSkipOverwriteOption() throws } /** - * appends each list as a Snapshot on the iceberg table at the given location. - * accepts a list of lists - each list representing data per snapshot. + * appends each list as a Snapshot on the iceberg table at the given location. accepts a list of + * lists - each list representing data per snapshot. */ private void appendDataAsMultipleSnapshots(List> data) { for (List l : data) { @@ -507,7 +490,8 @@ private void appendData(List data) { private void appendData(List data, String format) { Dataset df = spark.createDataFrame(data, SimpleRecord.class); - df.select("id", "data").write() + df.select("id", "data") + .write() .format("iceberg") .option("write-format", format) .mode("append") @@ -517,7 +501,8 @@ private void appendData(List data, String format) { private static final String MEMORY_TABLE = "_stream_view_mem"; private StreamingQuery startStream(Map options) throws TimeoutException { - return spark.readStream() + return spark + .readStream() .options(options) .format("iceberg") .load(tableName) @@ -539,9 +524,9 @@ private StreamingQuery startStream(String key, String value) throws TimeoutExcep private List rowsAvailable(StreamingQuery query) { query.processAllAvailable(); - return spark.sql("select * from " + MEMORY_TABLE) + return spark + .sql("select * from " + MEMORY_TABLE) .as(Encoders.bean(SimpleRecord.class)) .collectAsList(); } - } diff --git a/spark/v3.2/spark/src/test/java/org/apache/iceberg/spark/source/TestTables.java b/spark/v3.2/spark/src/test/java/org/apache/iceberg/spark/source/TestTables.java index ae3b7aa7a785..0650cb9738a6 100644 --- a/spark/v3.2/spark/src/test/java/org/apache/iceberg/spark/source/TestTables.java +++ b/spark/v3.2/spark/src/test/java/org/apache/iceberg/spark/source/TestTables.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.source; import java.io.File; @@ -42,15 +41,15 @@ // TODO: Use the copy of this from core. 
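// TestTables (below) backs an Iceberg Table with a simple test-only TableOperations so these
// source tests can create tables without going through a real catalog. A hypothetical usage,
// matching the create(...) signature below (SCHEMA and temp stand in for a test's schema and
// TemporaryFolder rule):
//
//   TestTables.TestTable table =
//       TestTables.create(temp.newFolder(), "test", SCHEMA, PartitionSpec.unpartitioned());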
class TestTables { - private TestTables() { - } + private TestTables() {} static TestTable create(File temp, String name, Schema schema, PartitionSpec spec) { TestTableOperations ops = new TestTableOperations(name); if (ops.current() != null) { throw new AlreadyExistsException("Table %s already exists at location: %s", name, temp); } - ops.commit(null, TableMetadata.newTableMetadata(schema, spec, temp.toString(), ImmutableMap.of())); + ops.commit( + null, TableMetadata.newTableMetadata(schema, spec, temp.toString(), ImmutableMap.of())); return new TestTable(ops, name); } @@ -166,8 +165,8 @@ public FileIO io() { @Override public LocationProvider locationProvider() { - Preconditions.checkNotNull(current, - "Current metadata should not be null when locationProvider is called"); + Preconditions.checkNotNull( + current, "Current metadata should not be null when locationProvider is called"); return LocationProviders.locationsFor(current.location(), current.properties()); } diff --git a/spark/v3.2/spark/src/test/java/org/apache/iceberg/spark/source/TestTimestampWithoutZone.java b/spark/v3.2/spark/src/test/java/org/apache/iceberg/spark/source/TestTimestampWithoutZone.java index 20509eef7471..f6cac9e9dd82 100644 --- a/spark/v3.2/spark/src/test/java/org/apache/iceberg/spark/source/TestTimestampWithoutZone.java +++ b/spark/v3.2/spark/src/test/java/org/apache/iceberg/spark/source/TestTimestampWithoutZone.java @@ -16,9 +16,10 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.source; +import static org.apache.iceberg.Files.localOutput; + import java.io.File; import java.io.IOException; import java.time.LocalDateTime; @@ -64,18 +65,16 @@ import org.junit.runner.RunWith; import org.junit.runners.Parameterized; -import static org.apache.iceberg.Files.localOutput; - @RunWith(Parameterized.class) public class TestTimestampWithoutZone extends SparkTestBase { private static final Configuration CONF = new Configuration(); private static final HadoopTables TABLES = new HadoopTables(CONF); - private static final Schema SCHEMA = new Schema( - Types.NestedField.required(1, "id", Types.LongType.get()), - Types.NestedField.optional(2, "ts", Types.TimestampType.withoutZone()), - Types.NestedField.optional(3, "data", Types.StringType.get()) - ); + private static final Schema SCHEMA = + new Schema( + Types.NestedField.required(1, "id", Types.LongType.get()), + Types.NestedField.optional(2, "ts", Types.TimestampType.withoutZone()), + Types.NestedField.optional(3, "data", Types.StringType.get())); private static SparkSession spark = null; @@ -91,8 +90,7 @@ public static void stopSpark() { currentSpark.stop(); } - @Rule - public TemporaryFolder temp = new TemporaryFolder(); + @Rule public TemporaryFolder temp = new TemporaryFolder(); private final String format; private final boolean vectorized; @@ -100,9 +98,9 @@ public static void stopSpark() { @Parameterized.Parameters(name = "format = {0}, vectorized = {1}") public static Object[][] parameters() { return new Object[][] { - { "parquet", false }, - { "parquet", true }, - { "avro", false } + {"parquet", false}, + {"parquet", true}, + {"avro", false} }; } @@ -132,16 +130,17 @@ public void writeUnpartitionedTable() throws IOException { // create records using the table's schema this.records = testRecords(tableSchema); - try (FileAppender writer = new GenericAppenderFactory(tableSchema).newAppender( - localOutput(testFile), fileFormat)) { + try (FileAppender writer = + new 
GenericAppenderFactory(tableSchema).newAppender(localOutput(testFile), fileFormat)) { writer.addAll(records); } - DataFile file = DataFiles.builder(PartitionSpec.unpartitioned()) - .withRecordCount(records.size()) - .withFileSizeInBytes(testFile.length()) - .withPath(testFile.toString()) - .build(); + DataFile file = + DataFiles.builder(PartitionSpec.unpartitioned()) + .withRecordCount(records.size()) + .withFileSizeInBytes(testFile.length()) + .withPath(testFile.toString()) + .build(); table.newAppend().appendFile(file).commit(); } @@ -154,69 +153,89 @@ public void testUnpartitionedTimestampWithoutZone() { @Test public void testUnpartitionedTimestampWithoutZoneProjection() { Schema projection = SCHEMA.select("id", "ts"); - assertEqualsSafe(projection.asStruct(), + assertEqualsSafe( + projection.asStruct(), records.stream().map(r -> projectFlat(projection, r)).collect(Collectors.toList()), read(unpartitioned.toString(), vectorized, "id", "ts")); } - @Rule - public ExpectedException exception = ExpectedException.none(); + @Rule public ExpectedException exception = ExpectedException.none(); @Test public void testUnpartitionedTimestampWithoutZoneError() { - AssertHelpers.assertThrows(String.format("Read operation performed on a timestamp without timezone field while " + - "'%s' set to false should throw exception", - SparkReadOptions.HANDLE_TIMESTAMP_WITHOUT_TIMEZONE), + AssertHelpers.assertThrows( + String.format( + "Read operation performed on a timestamp without timezone field while " + + "'%s' set to false should throw exception", + SparkReadOptions.HANDLE_TIMESTAMP_WITHOUT_TIMEZONE), IllegalArgumentException.class, SparkUtil.TIMESTAMP_WITHOUT_TIMEZONE_ERROR, - () -> spark.read().format("iceberg") - .option(SparkReadOptions.VECTORIZATION_ENABLED, String.valueOf(vectorized)) - .option(SparkReadOptions.HANDLE_TIMESTAMP_WITHOUT_TIMEZONE, "false") - .load(unpartitioned.toString()) - .collectAsList()); + () -> + spark + .read() + .format("iceberg") + .option(SparkReadOptions.VECTORIZATION_ENABLED, String.valueOf(vectorized)) + .option(SparkReadOptions.HANDLE_TIMESTAMP_WITHOUT_TIMEZONE, "false") + .load(unpartitioned.toString()) + .collectAsList()); } @Test public void testUnpartitionedTimestampWithoutZoneAppend() { - spark.read().format("iceberg") - .option(SparkReadOptions.HANDLE_TIMESTAMP_WITHOUT_TIMEZONE, "true") - .option(SparkReadOptions.VECTORIZATION_ENABLED, String.valueOf(vectorized)) - .load(unpartitioned.toString()) - .write() - .format("iceberg") - .option(SparkWriteOptions.HANDLE_TIMESTAMP_WITHOUT_TIMEZONE, "true") - .mode(SaveMode.Append) - .save(unpartitioned.toString()); - - assertEqualsSafe(SCHEMA.asStruct(), - Stream.concat(records.stream(), records.stream()).collect(Collectors.toList()), - read(unpartitioned.toString(), vectorized)); + spark + .read() + .format("iceberg") + .option(SparkReadOptions.HANDLE_TIMESTAMP_WITHOUT_TIMEZONE, "true") + .option(SparkReadOptions.VECTORIZATION_ENABLED, String.valueOf(vectorized)) + .load(unpartitioned.toString()) + .write() + .format("iceberg") + .option(SparkWriteOptions.HANDLE_TIMESTAMP_WITHOUT_TIMEZONE, "true") + .mode(SaveMode.Append) + .save(unpartitioned.toString()); + + assertEqualsSafe( + SCHEMA.asStruct(), + Stream.concat(records.stream(), records.stream()).collect(Collectors.toList()), + read(unpartitioned.toString(), vectorized)); } @Test public void testUnpartitionedTimestampWithoutZoneWriteError() { - String errorMessage = String.format("Write operation performed on a timestamp without timezone field while " + - "'%s' set to 
false should throw exception", + String errorMessage = + String.format( + "Write operation performed on a timestamp without timezone field while " + + "'%s' set to false should throw exception", SparkSQLProperties.HANDLE_TIMESTAMP_WITHOUT_TIMEZONE); - Runnable writeOperation = () -> spark.read().format("iceberg") - .option(SparkReadOptions.HANDLE_TIMESTAMP_WITHOUT_TIMEZONE, "true") - .option(SparkReadOptions.VECTORIZATION_ENABLED, String.valueOf(vectorized)) - .load(unpartitioned.toString()) - .write() - .format("iceberg") - .option(SparkWriteOptions.HANDLE_TIMESTAMP_WITHOUT_TIMEZONE, "false") - .mode(SaveMode.Append) - .save(unpartitioned.toString()); - - AssertHelpers.assertThrows(errorMessage, IllegalArgumentException.class, - SparkUtil.TIMESTAMP_WITHOUT_TIMEZONE_ERROR, writeOperation); - + Runnable writeOperation = + () -> + spark + .read() + .format("iceberg") + .option(SparkReadOptions.HANDLE_TIMESTAMP_WITHOUT_TIMEZONE, "true") + .option(SparkReadOptions.VECTORIZATION_ENABLED, String.valueOf(vectorized)) + .load(unpartitioned.toString()) + .write() + .format("iceberg") + .option(SparkWriteOptions.HANDLE_TIMESTAMP_WITHOUT_TIMEZONE, "false") + .mode(SaveMode.Append) + .save(unpartitioned.toString()); + + AssertHelpers.assertThrows( + errorMessage, + IllegalArgumentException.class, + SparkUtil.TIMESTAMP_WITHOUT_TIMEZONE_ERROR, + writeOperation); } @Test public void testUnpartitionedTimestampWithoutZoneSessionProperties() { - withSQLConf(ImmutableMap.of(SparkSQLProperties.HANDLE_TIMESTAMP_WITHOUT_TIMEZONE, "true"), () -> { - spark.read().format("iceberg") + withSQLConf( + ImmutableMap.of(SparkSQLProperties.HANDLE_TIMESTAMP_WITHOUT_TIMEZONE, "true"), + () -> { + spark + .read() + .format("iceberg") .option(SparkReadOptions.VECTORIZATION_ENABLED, String.valueOf(vectorized)) .load(unpartitioned.toString()) .write() @@ -224,10 +243,11 @@ public void testUnpartitionedTimestampWithoutZoneSessionProperties() { .mode(SaveMode.Append) .save(unpartitioned.toString()); - assertEqualsSafe(SCHEMA.asStruct(), + assertEqualsSafe( + SCHEMA.asStruct(), Stream.concat(records.stream(), records.stream()).collect(Collectors.toList()), read(unpartitioned.toString(), vectorized)); - }); + }); } private static Record projectFlat(Schema projection, Record record) { @@ -240,8 +260,8 @@ private static Record projectFlat(Schema projection, Record record) { return result; } - public static void assertEqualsSafe(Types.StructType struct, - List expected, List actual) { + public static void assertEqualsSafe( + Types.StructType struct, List expected, List actual) { Assert.assertEquals("Number of results should match expected", expected.size(), actual.size()); for (int i = 0; i < expected.size(); i += 1) { GenericsHelpers.assertEqualsSafe(struct, expected.get(i), actual.get(i)); @@ -259,20 +279,23 @@ private List testRecords(Schema schema) { record(schema, 6L, parseToLocal("2017-12-21T21:55:30.589712"), "element"), record(schema, 7L, parseToLocal("2017-12-21T17:31:14.532797"), "limited"), record(schema, 8L, parseToLocal("2017-12-21T15:21:51.237521"), "global"), - record(schema, 9L, parseToLocal("2017-12-21T15:02:15.230570"), "goldfish") - ); + record(schema, 9L, parseToLocal("2017-12-21T15:02:15.230570"), "goldfish")); } private static List read(String table, boolean vectorized) { return read(table, vectorized, "*"); } - private static List read(String table, boolean vectorized, String select0, String... 
selectN) { - Dataset dataset = spark.read().format("iceberg") - .option(SparkReadOptions.VECTORIZATION_ENABLED, String.valueOf(vectorized)) - .option(SparkReadOptions.HANDLE_TIMESTAMP_WITHOUT_TIMEZONE, "true") - .load(table) - .select(select0, selectN); + private static List read( + String table, boolean vectorized, String select0, String... selectN) { + Dataset dataset = + spark + .read() + .format("iceberg") + .option(SparkReadOptions.VECTORIZATION_ENABLED, String.valueOf(vectorized)) + .option(SparkReadOptions.HANDLE_TIMESTAMP_WITHOUT_TIMEZONE, "true") + .load(table) + .select(select0, selectN); return dataset.collectAsList(); } diff --git a/spark/v3.2/spark/src/test/java/org/apache/iceberg/spark/source/TestWriteMetricsConfig.java b/spark/v3.2/spark/src/test/java/org/apache/iceberg/spark/source/TestWriteMetricsConfig.java index 7ed71031f3f2..9bf00f1b1365 100644 --- a/spark/v3.2/spark/src/test/java/org/apache/iceberg/spark/source/TestWriteMetricsConfig.java +++ b/spark/v3.2/spark/src/test/java/org/apache/iceberg/spark/source/TestWriteMetricsConfig.java @@ -16,9 +16,12 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.source; +import static org.apache.iceberg.spark.SparkSchemaUtil.convert; +import static org.apache.iceberg.types.Types.NestedField.optional; +import static org.apache.iceberg.types.Types.NestedField.required; + import java.io.IOException; import java.nio.ByteBuffer; import java.util.List; @@ -53,28 +56,24 @@ import org.junit.Test; import org.junit.rules.TemporaryFolder; -import static org.apache.iceberg.spark.SparkSchemaUtil.convert; -import static org.apache.iceberg.types.Types.NestedField.optional; -import static org.apache.iceberg.types.Types.NestedField.required; - public class TestWriteMetricsConfig { private static final Configuration CONF = new Configuration(); - private static final Schema SIMPLE_SCHEMA = new Schema( - optional(1, "id", Types.IntegerType.get()), - optional(2, "data", Types.StringType.get()) - ); - private static final Schema COMPLEX_SCHEMA = new Schema( - required(1, "longCol", Types.IntegerType.get()), - optional(2, "strCol", Types.StringType.get()), - required(3, "record", Types.StructType.of( - required(4, "id", Types.IntegerType.get()), - required(5, "data", Types.StringType.get()) - )) - ); - - @Rule - public TemporaryFolder temp = new TemporaryFolder(); + private static final Schema SIMPLE_SCHEMA = + new Schema( + optional(1, "id", Types.IntegerType.get()), optional(2, "data", Types.StringType.get())); + private static final Schema COMPLEX_SCHEMA = + new Schema( + required(1, "longCol", Types.IntegerType.get()), + optional(2, "strCol", Types.StringType.get()), + required( + 3, + "record", + Types.StructType.of( + required(4, "id", Types.IntegerType.get()), + required(5, "data", Types.StringType.get())))); + + @Rule public TemporaryFolder temp = new TemporaryFolder(); private static SparkSession spark = null; private static JavaSparkContext sc = null; @@ -103,11 +102,9 @@ public void testFullMetricsCollectionForParquet() throws IOException { properties.put(TableProperties.DEFAULT_WRITE_METRICS_MODE, "full"); Table table = tables.create(SIMPLE_SCHEMA, spec, properties, tableLocation); - List expectedRecords = Lists.newArrayList( - new SimpleRecord(1, "a"), - new SimpleRecord(2, "b"), - new SimpleRecord(3, "c") - ); + List expectedRecords = + Lists.newArrayList( + new SimpleRecord(1, "a"), new SimpleRecord(2, "b"), new SimpleRecord(3, "c")); Dataset df = 
spark.createDataFrame(expectedRecords, SimpleRecord.class); df.select("id", "data") .coalesce(1) @@ -136,11 +133,9 @@ public void testCountMetricsCollectionForParquet() throws IOException { properties.put(TableProperties.DEFAULT_WRITE_METRICS_MODE, "counts"); Table table = tables.create(SIMPLE_SCHEMA, spec, properties, tableLocation); - List expectedRecords = Lists.newArrayList( - new SimpleRecord(1, "a"), - new SimpleRecord(2, "b"), - new SimpleRecord(3, "c") - ); + List expectedRecords = + Lists.newArrayList( + new SimpleRecord(1, "a"), new SimpleRecord(2, "b"), new SimpleRecord(3, "c")); Dataset df = spark.createDataFrame(expectedRecords, SimpleRecord.class); df.select("id", "data") .coalesce(1) @@ -169,11 +164,9 @@ public void testNoMetricsCollectionForParquet() throws IOException { properties.put(TableProperties.DEFAULT_WRITE_METRICS_MODE, "none"); Table table = tables.create(SIMPLE_SCHEMA, spec, properties, tableLocation); - List expectedRecords = Lists.newArrayList( - new SimpleRecord(1, "a"), - new SimpleRecord(2, "b"), - new SimpleRecord(3, "c") - ); + List expectedRecords = + Lists.newArrayList( + new SimpleRecord(1, "a"), new SimpleRecord(2, "b"), new SimpleRecord(3, "c")); Dataset df = spark.createDataFrame(expectedRecords, SimpleRecord.class); df.select("id", "data") .coalesce(1) @@ -203,11 +196,9 @@ public void testCustomMetricCollectionForParquet() throws IOException { properties.put("write.metadata.metrics.column.id", "full"); Table table = tables.create(SIMPLE_SCHEMA, spec, properties, tableLocation); - List expectedRecords = Lists.newArrayList( - new SimpleRecord(1, "a"), - new SimpleRecord(2, "b"), - new SimpleRecord(3, "c") - ); + List expectedRecords = + Lists.newArrayList( + new SimpleRecord(1, "a"), new SimpleRecord(2, "b"), new SimpleRecord(3, "c")); Dataset df = spark.createDataFrame(expectedRecords, SimpleRecord.class); df.select("id", "data") .coalesce(1) @@ -240,7 +231,8 @@ public void testBadCustomMetricCollectionForParquet() throws IOException { properties.put(TableProperties.DEFAULT_WRITE_METRICS_MODE, "counts"); properties.put("write.metadata.metrics.column.ids", "full"); - AssertHelpers.assertThrows("Creating a table with invalid metrics should fail", + AssertHelpers.assertThrows( + "Creating a table with invalid metrics should fail", ValidationException.class, null, () -> tables.create(SIMPLE_SCHEMA, spec, properties, tableLocation)); @@ -251,9 +243,7 @@ public void testCustomMetricCollectionForNestedParquet() throws IOException { String tableLocation = temp.newFolder("iceberg-table").toString(); HadoopTables tables = new HadoopTables(CONF); - PartitionSpec spec = PartitionSpec.builderFor(COMPLEX_SCHEMA) - .identity("strCol") - .build(); + PartitionSpec spec = PartitionSpec.builderFor(COMPLEX_SCHEMA).identity("strCol").build(); Map properties = Maps.newHashMap(); properties.put(TableProperties.DEFAULT_WRITE_METRICS_MODE, "none"); properties.put("write.metadata.metrics.column.longCol", "counts"); @@ -263,9 +253,11 @@ public void testCustomMetricCollectionForNestedParquet() throws IOException { Iterable rows = RandomData.generateSpark(COMPLEX_SCHEMA, 10, 0); JavaRDD rdd = sc.parallelize(Lists.newArrayList(rows)); - Dataset df = spark.internalCreateDataFrame(JavaRDD.toRDD(rdd), convert(COMPLEX_SCHEMA), false); + Dataset df = + spark.internalCreateDataFrame(JavaRDD.toRDD(rdd), convert(COMPLEX_SCHEMA), false); - df.coalesce(1).write() + df.coalesce(1) + .write() .format("iceberg") .option(SparkWriteOptions.WRITE_FORMAT, "parquet") .mode(SaveMode.Append) diff 
--git a/spark/v3.2/spark/src/test/java/org/apache/iceberg/spark/source/ThreeColumnRecord.java b/spark/v3.2/spark/src/test/java/org/apache/iceberg/spark/source/ThreeColumnRecord.java index 684dfbb255c7..554557df416c 100644 --- a/spark/v3.2/spark/src/test/java/org/apache/iceberg/spark/source/ThreeColumnRecord.java +++ b/spark/v3.2/spark/src/test/java/org/apache/iceberg/spark/source/ThreeColumnRecord.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.source; import java.util.Objects; @@ -26,8 +25,7 @@ public class ThreeColumnRecord { private String c2; private String c3; - public ThreeColumnRecord() { - } + public ThreeColumnRecord() {} public ThreeColumnRecord(Integer c1, String c2, String c3) { this.c1 = c1; @@ -68,9 +66,9 @@ public boolean equals(Object o) { return false; } ThreeColumnRecord that = (ThreeColumnRecord) o; - return Objects.equals(c1, that.c1) && - Objects.equals(c2, that.c2) && - Objects.equals(c3, that.c3); + return Objects.equals(c1, that.c1) + && Objects.equals(c2, that.c2) + && Objects.equals(c3, that.c3); } @Override @@ -80,10 +78,6 @@ public int hashCode() { @Override public String toString() { - return "ThreeColumnRecord{" + - "c1=" + c1 + - ", c2='" + c2 + '\'' + - ", c3='" + c3 + '\'' + - '}'; + return "ThreeColumnRecord{" + "c1=" + c1 + ", c2='" + c2 + '\'' + ", c3='" + c3 + '\'' + '}'; } } diff --git a/spark/v3.2/spark/src/test/java/org/apache/iceberg/spark/sql/TestAlterTable.java b/spark/v3.2/spark/src/test/java/org/apache/iceberg/spark/sql/TestAlterTable.java index 6172bd1fd0fe..e347cde7ba32 100644 --- a/spark/v3.2/spark/src/test/java/org/apache/iceberg/spark/sql/TestAlterTable.java +++ b/spark/v3.2/spark/src/test/java/org/apache/iceberg/spark/sql/TestAlterTable.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.spark.sql; import java.util.Map; @@ -36,7 +35,8 @@ import org.junit.Test; public class TestAlterTable extends SparkCatalogTestBase { - private final TableIdentifier renamedIdent = TableIdentifier.of(Namespace.of("default"), "table2"); + private final TableIdentifier renamedIdent = + TableIdentifier.of(Namespace.of("default"), "table2"); public TestAlterTable(String catalogName, String implementation, Map<String, String> config) { super(catalogName, implementation, config); @@ -55,39 +55,53 @@ public void removeTable() { @Test public void testAddColumnNotNull() { - AssertHelpers.assertThrows("Should reject adding NOT NULL column", - SparkException.class, "Incompatible change: cannot add required column", + AssertHelpers.assertThrows( + "Should reject adding NOT NULL column", + SparkException.class, + "Incompatible change: cannot add required column", () -> sql("ALTER TABLE %s ADD COLUMN c3 INT NOT NULL", tableName)); } @Test public void testAddColumn() { - sql("ALTER TABLE %s ADD COLUMN point struct<x: double NOT NULL, y: double NOT NULL> AFTER id", tableName); - - Types.StructType expectedSchema = Types.StructType.of( - NestedField.required(1, "id", Types.LongType.get()), - NestedField.optional(3, "point", Types.StructType.of( - NestedField.required(4, "x", Types.DoubleType.get()), - NestedField.required(5, "y", Types.DoubleType.get()) - )), - NestedField.optional(2, "data", Types.StringType.get())); - - Assert.assertEquals("Schema should match expected", - expectedSchema, validationCatalog.loadTable(tableIdent).schema().asStruct()); + sql( + "ALTER TABLE %s ADD COLUMN point struct<x: double NOT NULL, y: double NOT NULL> AFTER id", + tableName); + + Types.StructType expectedSchema = + Types.StructType.of( + NestedField.required(1, "id", Types.LongType.get()), + NestedField.optional( + 3, + "point", + Types.StructType.of( + NestedField.required(4, "x", Types.DoubleType.get()), + NestedField.required(5, "y", Types.DoubleType.get()))), + NestedField.optional(2, "data", Types.StringType.get())); + + Assert.assertEquals( + "Schema should match expected", + expectedSchema, + validationCatalog.loadTable(tableIdent).schema().asStruct()); sql("ALTER TABLE %s ADD COLUMN point.z double COMMENT 'May be null' FIRST", tableName); - Types.StructType expectedSchema2 = Types.StructType.of( - NestedField.required(1, "id", Types.LongType.get()), - NestedField.optional(3, "point", Types.StructType.of( - NestedField.optional(6, "z", Types.DoubleType.get(), "May be null"), - NestedField.required(4, "x", Types.DoubleType.get()), - NestedField.required(5, "y", Types.DoubleType.get()) - )), - NestedField.optional(2, "data", Types.StringType.get())); - - Assert.assertEquals("Schema should match expected", - expectedSchema2, validationCatalog.loadTable(tableIdent).schema().asStruct()); + Types.StructType expectedSchema2 = + Types.StructType.of( + NestedField.required(1, "id", Types.LongType.get()), + NestedField.optional( + 3, + "point", + Types.StructType.of( + NestedField.optional(6, "z", Types.DoubleType.get(), "May be null"), + NestedField.required(4, "x", Types.DoubleType.get()), + NestedField.required(5, "y", Types.DoubleType.get()))), + NestedField.optional(2, "data", Types.StringType.get())); + + Assert.assertEquals( + "Schema should match expected", + expectedSchema2, + validationCatalog.loadTable(tableIdent).schema().asStruct()); } @Test @@ -95,19 +109,24 @@ public void testAddColumnWithArray() { sql("ALTER TABLE %s ADD COLUMN data2 array<struct<a:INT,b:INT,c:int>>", tableName); // use the implicit column name 'element' to access member of array and add column d to struct.
sql("ALTER TABLE %s ADD COLUMN data2.element.d int", tableName); - Types.StructType expectedSchema = Types.StructType.of( - NestedField.required(1, "id", Types.LongType.get()), - NestedField.optional(2, "data", Types.StringType.get()), - NestedField.optional(3, "data2", Types.ListType.ofOptional( - 4, - Types.StructType.of( - NestedField.optional(5, "a", Types.IntegerType.get()), - NestedField.optional(6, "b", Types.IntegerType.get()), - NestedField.optional(7, "c", Types.IntegerType.get()), - NestedField.optional(8, "d", Types.IntegerType.get())) - ))); - Assert.assertEquals("Schema should match expected", - expectedSchema, validationCatalog.loadTable(tableIdent).schema().asStruct()); + Types.StructType expectedSchema = + Types.StructType.of( + NestedField.required(1, "id", Types.LongType.get()), + NestedField.optional(2, "data", Types.StringType.get()), + NestedField.optional( + 3, + "data2", + Types.ListType.ofOptional( + 4, + Types.StructType.of( + NestedField.optional(5, "a", Types.IntegerType.get()), + NestedField.optional(6, "b", Types.IntegerType.get()), + NestedField.optional(7, "c", Types.IntegerType.get()), + NestedField.optional(8, "d", Types.IntegerType.get()))))); + Assert.assertEquals( + "Schema should match expected", + expectedSchema, + validationCatalog.loadTable(tableIdent).schema().asStruct()); } @Test @@ -116,25 +135,31 @@ public void testAddColumnWithMap() { // use the implicit column name 'key' and 'value' to access member of map. // add column to value struct column sql("ALTER TABLE %s ADD COLUMN data2.value.c int", tableName); - Types.StructType expectedSchema = Types.StructType.of( - NestedField.required(1, "id", Types.LongType.get()), - NestedField.optional(2, "data", Types.StringType.get()), - NestedField.optional(3, "data2", Types.MapType.ofOptional( - 4, - 5, - Types.StructType.of( - NestedField.optional(6, "x", Types.IntegerType.get())), - Types.StructType.of( - NestedField.optional(7, "a", Types.IntegerType.get()), - NestedField.optional(8, "b", Types.IntegerType.get()), - NestedField.optional(9, "c", Types.IntegerType.get())) - ))); - Assert.assertEquals("Schema should match expected", - expectedSchema, validationCatalog.loadTable(tableIdent).schema().asStruct()); + Types.StructType expectedSchema = + Types.StructType.of( + NestedField.required(1, "id", Types.LongType.get()), + NestedField.optional(2, "data", Types.StringType.get()), + NestedField.optional( + 3, + "data2", + Types.MapType.ofOptional( + 4, + 5, + Types.StructType.of(NestedField.optional(6, "x", Types.IntegerType.get())), + Types.StructType.of( + NestedField.optional(7, "a", Types.IntegerType.get()), + NestedField.optional(8, "b", Types.IntegerType.get()), + NestedField.optional(9, "c", Types.IntegerType.get()))))); + Assert.assertEquals( + "Schema should match expected", + expectedSchema, + validationCatalog.loadTable(tableIdent).schema().asStruct()); // should not allow changing map key column - AssertHelpers.assertThrows("Should reject changing key of the map column", - SparkException.class, "Unsupported table change: Cannot add fields to map keys:", + AssertHelpers.assertThrows( + "Should reject changing key of the map column", + SparkException.class, + "Unsupported table change: Cannot add fields to map keys:", () -> sql("ALTER TABLE %s ADD COLUMN data2.key.y int", tableName)); } @@ -142,35 +167,43 @@ public void testAddColumnWithMap() { public void testDropColumn() { sql("ALTER TABLE %s DROP COLUMN data", tableName); - Types.StructType expectedSchema = Types.StructType.of( - 
NestedField.required(1, "id", Types.LongType.get())); + Types.StructType expectedSchema = + Types.StructType.of(NestedField.required(1, "id", Types.LongType.get())); - Assert.assertEquals("Schema should match expected", - expectedSchema, validationCatalog.loadTable(tableIdent).schema().asStruct()); + Assert.assertEquals( + "Schema should match expected", + expectedSchema, + validationCatalog.loadTable(tableIdent).schema().asStruct()); } @Test public void testRenameColumn() { sql("ALTER TABLE %s RENAME COLUMN id TO row_id", tableName); - Types.StructType expectedSchema = Types.StructType.of( - NestedField.required(1, "row_id", Types.LongType.get()), - NestedField.optional(2, "data", Types.StringType.get())); + Types.StructType expectedSchema = + Types.StructType.of( + NestedField.required(1, "row_id", Types.LongType.get()), + NestedField.optional(2, "data", Types.StringType.get())); - Assert.assertEquals("Schema should match expected", - expectedSchema, validationCatalog.loadTable(tableIdent).schema().asStruct()); + Assert.assertEquals( + "Schema should match expected", + expectedSchema, + validationCatalog.loadTable(tableIdent).schema().asStruct()); } @Test public void testAlterColumnComment() { sql("ALTER TABLE %s ALTER COLUMN id COMMENT 'Record id'", tableName); - Types.StructType expectedSchema = Types.StructType.of( - NestedField.required(1, "id", Types.LongType.get(), "Record id"), - NestedField.optional(2, "data", Types.StringType.get())); + Types.StructType expectedSchema = + Types.StructType.of( + NestedField.required(1, "id", Types.LongType.get(), "Record id"), + NestedField.optional(2, "data", Types.StringType.get())); - Assert.assertEquals("Schema should match expected", - expectedSchema, validationCatalog.loadTable(tableIdent).schema().asStruct()); + Assert.assertEquals( + "Schema should match expected", + expectedSchema, + validationCatalog.loadTable(tableIdent).schema().asStruct()); } @Test @@ -178,25 +211,31 @@ public void testAlterColumnType() { sql("ALTER TABLE %s ADD COLUMN count int", tableName); sql("ALTER TABLE %s ALTER COLUMN count TYPE bigint", tableName); - Types.StructType expectedSchema = Types.StructType.of( - NestedField.required(1, "id", Types.LongType.get()), - NestedField.optional(2, "data", Types.StringType.get()), - NestedField.optional(3, "count", Types.LongType.get())); + Types.StructType expectedSchema = + Types.StructType.of( + NestedField.required(1, "id", Types.LongType.get()), + NestedField.optional(2, "data", Types.StringType.get()), + NestedField.optional(3, "count", Types.LongType.get())); - Assert.assertEquals("Schema should match expected", - expectedSchema, validationCatalog.loadTable(tableIdent).schema().asStruct()); + Assert.assertEquals( + "Schema should match expected", + expectedSchema, + validationCatalog.loadTable(tableIdent).schema().asStruct()); } @Test public void testAlterColumnDropNotNull() { sql("ALTER TABLE %s ALTER COLUMN id DROP NOT NULL", tableName); - Types.StructType expectedSchema = Types.StructType.of( - NestedField.optional(1, "id", Types.LongType.get()), - NestedField.optional(2, "data", Types.StringType.get())); + Types.StructType expectedSchema = + Types.StructType.of( + NestedField.optional(1, "id", Types.LongType.get()), + NestedField.optional(2, "data", Types.StringType.get())); - Assert.assertEquals("Schema should match expected", - expectedSchema, validationCatalog.loadTable(tableIdent).schema().asStruct()); + Assert.assertEquals( + "Schema should match expected", + expectedSchema, + 
validationCatalog.loadTable(tableIdent).schema().asStruct()); } @Test @@ -204,15 +243,20 @@ public void testAlterColumnSetNotNull() { // no-op changes are allowed sql("ALTER TABLE %s ALTER COLUMN id SET NOT NULL", tableName); - Types.StructType expectedSchema = Types.StructType.of( - NestedField.required(1, "id", Types.LongType.get()), - NestedField.optional(2, "data", Types.StringType.get())); + Types.StructType expectedSchema = + Types.StructType.of( + NestedField.required(1, "id", Types.LongType.get()), + NestedField.optional(2, "data", Types.StringType.get())); - Assert.assertEquals("Schema should match expected", - expectedSchema, validationCatalog.loadTable(tableIdent).schema().asStruct()); + Assert.assertEquals( + "Schema should match expected", + expectedSchema, + validationCatalog.loadTable(tableIdent).schema().asStruct()); - AssertHelpers.assertThrows("Should reject adding NOT NULL constraint to an optional column", - AnalysisException.class, "Cannot change nullable column to non-nullable: data", + AssertHelpers.assertThrows( + "Should reject adding NOT NULL constraint to an optional column", + AnalysisException.class, + "Cannot change nullable column to non-nullable: data", () -> sql("ALTER TABLE %s ALTER COLUMN data SET NOT NULL", tableName)); } @@ -221,13 +265,16 @@ public void testAlterColumnPositionAfter() { sql("ALTER TABLE %s ADD COLUMN count int", tableName); sql("ALTER TABLE %s ALTER COLUMN count AFTER id", tableName); - Types.StructType expectedSchema = Types.StructType.of( - NestedField.required(1, "id", Types.LongType.get()), - NestedField.optional(3, "count", Types.IntegerType.get()), - NestedField.optional(2, "data", Types.StringType.get())); + Types.StructType expectedSchema = + Types.StructType.of( + NestedField.required(1, "id", Types.LongType.get()), + NestedField.optional(3, "count", Types.IntegerType.get()), + NestedField.optional(2, "data", Types.StringType.get())); - Assert.assertEquals("Schema should match expected", - expectedSchema, validationCatalog.loadTable(tableIdent).schema().asStruct()); + Assert.assertEquals( + "Schema should match expected", + expectedSchema, + validationCatalog.loadTable(tableIdent).schema().asStruct()); } @Test @@ -235,18 +282,22 @@ public void testAlterColumnPositionFirst() { sql("ALTER TABLE %s ADD COLUMN count int", tableName); sql("ALTER TABLE %s ALTER COLUMN count FIRST", tableName); - Types.StructType expectedSchema = Types.StructType.of( - NestedField.optional(3, "count", Types.IntegerType.get()), - NestedField.required(1, "id", Types.LongType.get()), - NestedField.optional(2, "data", Types.StringType.get())); + Types.StructType expectedSchema = + Types.StructType.of( + NestedField.optional(3, "count", Types.IntegerType.get()), + NestedField.required(1, "id", Types.LongType.get()), + NestedField.optional(2, "data", Types.StringType.get())); - Assert.assertEquals("Schema should match expected", - expectedSchema, validationCatalog.loadTable(tableIdent).schema().asStruct()); + Assert.assertEquals( + "Schema should match expected", + expectedSchema, + validationCatalog.loadTable(tableIdent).schema().asStruct()); } @Test public void testTableRename() { - Assume.assumeFalse("Hadoop catalog does not support rename", validationCatalog instanceof HadoopCatalog); + Assume.assumeFalse( + "Hadoop catalog does not support rename", validationCatalog instanceof HadoopCatalog); Assert.assertTrue("Initial name should exist", validationCatalog.tableExists(tableIdent)); Assert.assertFalse("New name should not exist", 
validationCatalog.tableExists(renamedIdent)); @@ -261,15 +312,19 @@ public void testTableRename() { public void testSetTableProperties() { sql("ALTER TABLE %s SET TBLPROPERTIES ('prop'='value')", tableName); - Assert.assertEquals("Should have the new table property", - "value", validationCatalog.loadTable(tableIdent).properties().get("prop")); + Assert.assertEquals( + "Should have the new table property", + "value", + validationCatalog.loadTable(tableIdent).properties().get("prop")); sql("ALTER TABLE %s UNSET TBLPROPERTIES ('prop')", tableName); - Assert.assertNull("Should not have the removed table property", + Assert.assertNull( + "Should not have the removed table property", validationCatalog.loadTable(tableIdent).properties().get("prop")); - AssertHelpers.assertThrows("Cannot specify the 'sort-order' because it's a reserved table property", + AssertHelpers.assertThrows( + "Cannot specify the 'sort-order' because it's a reserved table property", UnsupportedOperationException.class, () -> sql("ALTER TABLE %s SET TBLPROPERTIES ('sort-order'='value')", tableName)); } diff --git a/spark/v3.2/spark/src/test/java/org/apache/iceberg/spark/sql/TestCreateTable.java b/spark/v3.2/spark/src/test/java/org/apache/iceberg/spark/sql/TestCreateTable.java index 986098543d25..1411c83ddc65 100644 --- a/spark/v3.2/spark/src/test/java/org/apache/iceberg/spark/sql/TestCreateTable.java +++ b/spark/v3.2/spark/src/test/java/org/apache/iceberg/spark/sql/TestCreateTable.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.sql; import java.io.File; @@ -52,11 +51,15 @@ public void dropTestTable() { @Test public void testTransformIgnoreCase() { Assert.assertFalse("Table should not already exist", validationCatalog.tableExists(tableIdent)); - sql("CREATE TABLE IF NOT EXISTS %s (id BIGINT NOT NULL, ts timestamp) " + - "USING iceberg partitioned by (HOURS(ts))", tableName); + sql( + "CREATE TABLE IF NOT EXISTS %s (id BIGINT NOT NULL, ts timestamp) " + + "USING iceberg partitioned by (HOURS(ts))", + tableName); Assert.assertTrue("Table should already exist", validationCatalog.tableExists(tableIdent)); - sql("CREATE TABLE IF NOT EXISTS %s (id BIGINT NOT NULL, ts timestamp) " + - "USING iceberg partitioned by (hours(ts))", tableName); + sql( + "CREATE TABLE IF NOT EXISTS %s (id BIGINT NOT NULL, ts timestamp) " + + "USING iceberg partitioned by (hours(ts))", + tableName); Assert.assertTrue("Table should already exist", validationCatalog.tableExists(tableIdent)); } @@ -69,18 +72,22 @@ public void testCreateTable() { Table table = validationCatalog.loadTable(tableIdent); Assert.assertNotNull("Should load the new table", table); - StructType expectedSchema = StructType.of( - NestedField.required(1, "id", Types.LongType.get()), - NestedField.optional(2, "data", Types.StringType.get())); - Assert.assertEquals("Should have the expected schema", expectedSchema, table.schema().asStruct()); + StructType expectedSchema = + StructType.of( + NestedField.required(1, "id", Types.LongType.get()), + NestedField.optional(2, "data", Types.StringType.get())); + Assert.assertEquals( + "Should have the expected schema", expectedSchema, table.schema().asStruct()); Assert.assertEquals("Should not be partitioned", 0, table.spec().fields().size()); - Assert.assertNull("Should not have the default format set", + Assert.assertNull( + "Should not have the default format set", table.properties().get(TableProperties.DEFAULT_FILE_FORMAT)); } @Test public void 
testCreateTableInRootNamespace() { - Assume.assumeTrue("Hadoop has no default namespace configured", "testhadoop".equals(catalogName)); + Assume.assumeTrue( + "Hadoop has no default namespace configured", "testhadoop".equals(catalogName)); try { sql("CREATE TABLE %s.table (id bigint) USING iceberg", catalogName); @@ -102,47 +109,61 @@ public void testCreateTableUsingParquet() { Table table = validationCatalog.loadTable(tableIdent); Assert.assertNotNull("Should load the new table", table); - StructType expectedSchema = StructType.of( - NestedField.required(1, "id", Types.LongType.get()), - NestedField.optional(2, "data", Types.StringType.get())); - Assert.assertEquals("Should have the expected schema", expectedSchema, table.schema().asStruct()); + StructType expectedSchema = + StructType.of( + NestedField.required(1, "id", Types.LongType.get()), + NestedField.optional(2, "data", Types.StringType.get())); + Assert.assertEquals( + "Should have the expected schema", expectedSchema, table.schema().asStruct()); Assert.assertEquals("Should not be partitioned", 0, table.spec().fields().size()); - Assert.assertEquals("Should not have default format parquet", + Assert.assertEquals( + "Should not have default format parquet", "parquet", table.properties().get(TableProperties.DEFAULT_FILE_FORMAT)); - AssertHelpers.assertThrows("Should reject unsupported format names", - IllegalArgumentException.class, "Unsupported format in USING: crocodile", - () -> sql("CREATE TABLE %s.default.fail (id BIGINT NOT NULL, data STRING) USING crocodile", catalogName)); + AssertHelpers.assertThrows( + "Should reject unsupported format names", + IllegalArgumentException.class, + "Unsupported format in USING: crocodile", + () -> + sql( + "CREATE TABLE %s.default.fail (id BIGINT NOT NULL, data STRING) USING crocodile", + catalogName)); } @Test public void testCreateTablePartitionedBy() { Assert.assertFalse("Table should not already exist", validationCatalog.tableExists(tableIdent)); - sql("CREATE TABLE %s " + - "(id BIGINT NOT NULL, created_at TIMESTAMP, category STRING, data STRING) " + - "USING iceberg " + - "PARTITIONED BY (category, bucket(8, id), days(created_at))", tableName); + sql( + "CREATE TABLE %s " + + "(id BIGINT NOT NULL, created_at TIMESTAMP, category STRING, data STRING) " + + "USING iceberg " + + "PARTITIONED BY (category, bucket(8, id), days(created_at))", + tableName); Table table = validationCatalog.loadTable(tableIdent); Assert.assertNotNull("Should load the new table", table); - StructType expectedSchema = StructType.of( - NestedField.required(1, "id", Types.LongType.get()), - NestedField.optional(2, "created_at", Types.TimestampType.withZone()), - NestedField.optional(3, "category", Types.StringType.get()), - NestedField.optional(4, "data", Types.StringType.get())); - Assert.assertEquals("Should have the expected schema", expectedSchema, table.schema().asStruct()); - - PartitionSpec expectedSpec = PartitionSpec.builderFor(new Schema(expectedSchema.fields())) - .identity("category") - .bucket("id", 8) - .day("created_at") - .build(); + StructType expectedSchema = + StructType.of( + NestedField.required(1, "id", Types.LongType.get()), + NestedField.optional(2, "created_at", Types.TimestampType.withZone()), + NestedField.optional(3, "category", Types.StringType.get()), + NestedField.optional(4, "data", Types.StringType.get())); + Assert.assertEquals( + "Should have the expected schema", expectedSchema, table.schema().asStruct()); + + PartitionSpec expectedSpec = + PartitionSpec.builderFor(new 
Schema(expectedSchema.fields())) + .identity("category") + .bucket("id", 8) + .day("created_at") + .build(); Assert.assertEquals("Should be partitioned correctly", expectedSpec, table.spec()); - Assert.assertNull("Should not have the default format set", + Assert.assertNull( + "Should not have the default format set", table.properties().get(TableProperties.DEFAULT_FILE_FORMAT)); } @@ -150,20 +171,24 @@ public void testCreateTablePartitionedBy() { public void testCreateTableColumnComments() { Assert.assertFalse("Table should not already exist", validationCatalog.tableExists(tableIdent)); - sql("CREATE TABLE %s " + - "(id BIGINT NOT NULL COMMENT 'Unique identifier', data STRING COMMENT 'Data value') " + - "USING iceberg", + sql( + "CREATE TABLE %s " + + "(id BIGINT NOT NULL COMMENT 'Unique identifier', data STRING COMMENT 'Data value') " + + "USING iceberg", tableName); Table table = validationCatalog.loadTable(tableIdent); Assert.assertNotNull("Should load the new table", table); - StructType expectedSchema = StructType.of( - NestedField.required(1, "id", Types.LongType.get(), "Unique identifier"), - NestedField.optional(2, "data", Types.StringType.get(), "Data value")); - Assert.assertEquals("Should have the expected schema", expectedSchema, table.schema().asStruct()); + StructType expectedSchema = + StructType.of( + NestedField.required(1, "id", Types.LongType.get(), "Unique identifier"), + NestedField.optional(2, "data", Types.StringType.get(), "Data value")); + Assert.assertEquals( + "Should have the expected schema", expectedSchema, table.schema().asStruct()); Assert.assertEquals("Should not be partitioned", 0, table.spec().fields().size()); - Assert.assertNull("Should not have the default format set", + Assert.assertNull( + "Should not have the default format set", table.properties().get(TableProperties.DEFAULT_FILE_FORMAT)); } @@ -171,24 +196,30 @@ public void testCreateTableColumnComments() { public void testCreateTableComment() { Assert.assertFalse("Table should not already exist", validationCatalog.tableExists(tableIdent)); - sql("CREATE TABLE %s " + - "(id BIGINT NOT NULL, data STRING) " + - "USING iceberg " + - "COMMENT 'Table doc'", + sql( + "CREATE TABLE %s " + + "(id BIGINT NOT NULL, data STRING) " + + "USING iceberg " + + "COMMENT 'Table doc'", tableName); Table table = validationCatalog.loadTable(tableIdent); Assert.assertNotNull("Should load the new table", table); - StructType expectedSchema = StructType.of( - NestedField.required(1, "id", Types.LongType.get()), - NestedField.optional(2, "data", Types.StringType.get())); - Assert.assertEquals("Should have the expected schema", expectedSchema, table.schema().asStruct()); + StructType expectedSchema = + StructType.of( + NestedField.required(1, "id", Types.LongType.get()), + NestedField.optional(2, "data", Types.StringType.get())); + Assert.assertEquals( + "Should have the expected schema", expectedSchema, table.schema().asStruct()); Assert.assertEquals("Should not be partitioned", 0, table.spec().fields().size()); - Assert.assertNull("Should not have the default format set", + Assert.assertNull( + "Should not have the default format set", table.properties().get(TableProperties.DEFAULT_FILE_FORMAT)); - Assert.assertEquals("Should have the table comment set in properties", - "Table doc", table.properties().get(TableCatalog.PROP_COMMENT)); + Assert.assertEquals( + "Should have the table comment set in properties", + "Table doc", + table.properties().get(TableCatalog.PROP_COMMENT)); } @Test @@ -204,43 +235,49 @@ public void 
testCreateTableLocation() throws Exception { String location = "file:" + tableLocation.toString(); - sql("CREATE TABLE %s " + - "(id BIGINT NOT NULL, data STRING) " + - "USING iceberg " + - "LOCATION '%s'", + sql( + "CREATE TABLE %s " + + "(id BIGINT NOT NULL, data STRING) " + + "USING iceberg " + + "LOCATION '%s'", tableName, location); Table table = validationCatalog.loadTable(tableIdent); Assert.assertNotNull("Should load the new table", table); - StructType expectedSchema = StructType.of( - NestedField.required(1, "id", Types.LongType.get()), - NestedField.optional(2, "data", Types.StringType.get())); - Assert.assertEquals("Should have the expected schema", expectedSchema, table.schema().asStruct()); + StructType expectedSchema = + StructType.of( + NestedField.required(1, "id", Types.LongType.get()), + NestedField.optional(2, "data", Types.StringType.get())); + Assert.assertEquals( + "Should have the expected schema", expectedSchema, table.schema().asStruct()); Assert.assertEquals("Should not be partitioned", 0, table.spec().fields().size()); - Assert.assertNull("Should not have the default format set", + Assert.assertNull( + "Should not have the default format set", table.properties().get(TableProperties.DEFAULT_FILE_FORMAT)); - Assert.assertEquals("Should have a custom table location", - location, table.location()); + Assert.assertEquals("Should have a custom table location", location, table.location()); } @Test public void testCreateTableProperties() { Assert.assertFalse("Table should not already exist", validationCatalog.tableExists(tableIdent)); - sql("CREATE TABLE %s " + - "(id BIGINT NOT NULL, data STRING) " + - "USING iceberg " + - "TBLPROPERTIES (p1=2, p2='x')", + sql( + "CREATE TABLE %s " + + "(id BIGINT NOT NULL, data STRING) " + + "USING iceberg " + + "TBLPROPERTIES (p1=2, p2='x')", tableName); Table table = validationCatalog.loadTable(tableIdent); Assert.assertNotNull("Should load the new table", table); - StructType expectedSchema = StructType.of( - NestedField.required(1, "id", Types.LongType.get()), - NestedField.optional(2, "data", Types.StringType.get())); - Assert.assertEquals("Should have the expected schema", expectedSchema, table.schema().asStruct()); + StructType expectedSchema = + StructType.of( + NestedField.required(1, "id", Types.LongType.get()), + NestedField.optional(2, "data", Types.StringType.get())); + Assert.assertEquals( + "Should have the expected schema", expectedSchema, table.schema().asStruct()); Assert.assertEquals("Should not be partitioned", 0, table.spec().fields().size()); Assert.assertEquals("Should have property p1", "2", table.properties().get("p1")); Assert.assertEquals("Should have property p2", "x", table.properties().get("p2")); @@ -250,53 +287,56 @@ public void testCreateTableProperties() { public void testCreateTableWithFormatV2ThroughTableProperty() { Assert.assertFalse("Table should not already exist", validationCatalog.tableExists(tableIdent)); - sql("CREATE TABLE %s " + - "(id BIGINT NOT NULL, data STRING) " + - "USING iceberg " + - "TBLPROPERTIES ('format-version'='2')", + sql( + "CREATE TABLE %s " + + "(id BIGINT NOT NULL, data STRING) " + + "USING iceberg " + + "TBLPROPERTIES ('format-version'='2')", tableName); Table table = validationCatalog.loadTable(tableIdent); - Assert.assertEquals("should create table using format v2", - 2, ((BaseTable) table).operations().current().formatVersion()); + Assert.assertEquals( + "should create table using format v2", + 2, + ((BaseTable) table).operations().current().formatVersion()); } @Test 
public void testUpgradeTableWithFormatV2ThroughTableProperty() { Assert.assertFalse("Table should not already exist", validationCatalog.tableExists(tableIdent)); - sql("CREATE TABLE %s " + - "(id BIGINT NOT NULL, data STRING) " + - "USING iceberg " + - "TBLPROPERTIES ('format-version'='1')", + sql( + "CREATE TABLE %s " + + "(id BIGINT NOT NULL, data STRING) " + + "USING iceberg " + + "TBLPROPERTIES ('format-version'='1')", tableName); Table table = validationCatalog.loadTable(tableIdent); TableOperations ops = ((BaseTable) table).operations(); - Assert.assertEquals("should create table using format v1", - 1, ops.refresh().formatVersion()); + Assert.assertEquals("should create table using format v1", 1, ops.refresh().formatVersion()); sql("ALTER TABLE %s SET TBLPROPERTIES ('format-version'='2')", tableName); - Assert.assertEquals("should update table to use format v2", - 2, ops.refresh().formatVersion()); + Assert.assertEquals("should update table to use format v2", 2, ops.refresh().formatVersion()); } @Test public void testDowngradeTableToFormatV1ThroughTablePropertyFails() { Assert.assertFalse("Table should not already exist", validationCatalog.tableExists(tableIdent)); - sql("CREATE TABLE %s " + - "(id BIGINT NOT NULL, data STRING) " + - "USING iceberg " + - "TBLPROPERTIES ('format-version'='2')", + sql( + "CREATE TABLE %s " + + "(id BIGINT NOT NULL, data STRING) " + + "USING iceberg " + + "TBLPROPERTIES ('format-version'='2')", tableName); Table table = validationCatalog.loadTable(tableIdent); TableOperations ops = ((BaseTable) table).operations(); - Assert.assertEquals("should create table using format v2", - 2, ops.refresh().formatVersion()); + Assert.assertEquals("should create table using format v2", 2, ops.refresh().formatVersion()); - AssertHelpers.assertThrowsCause("should fail to downgrade to v1", + AssertHelpers.assertThrowsCause( + "should fail to downgrade to v1", IllegalArgumentException.class, "Cannot downgrade v2 table to v1", () -> sql("ALTER TABLE %s SET TBLPROPERTIES ('format-version'='1')", tableName)); diff --git a/spark/v3.2/spark/src/test/java/org/apache/iceberg/spark/sql/TestCreateTableAsSelect.java b/spark/v3.2/spark/src/test/java/org/apache/iceberg/spark/sql/TestCreateTableAsSelect.java index 4a70327e21a1..2581c0fd3c56 100644 --- a/spark/v3.2/spark/src/test/java/org/apache/iceberg/spark/sql/TestCreateTableAsSelect.java +++ b/spark/v3.2/spark/src/test/java/org/apache/iceberg/spark/sql/TestCreateTableAsSelect.java @@ -16,9 +16,12 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.spark.sql; +import static org.apache.spark.sql.functions.col; +import static org.apache.spark.sql.functions.lit; +import static org.apache.spark.sql.functions.when; + import java.util.Map; import org.apache.iceberg.PartitionSpec; import org.apache.iceberg.Schema; @@ -30,20 +33,19 @@ import org.junit.Assert; import org.junit.Test; -import static org.apache.spark.sql.functions.col; -import static org.apache.spark.sql.functions.lit; -import static org.apache.spark.sql.functions.when; - public class TestCreateTableAsSelect extends SparkCatalogTestBase { private final String sourceName; - public TestCreateTableAsSelect(String catalogName, String implementation, Map config) { + public TestCreateTableAsSelect( + String catalogName, String implementation, Map config) { super(catalogName, implementation, config); this.sourceName = tableName("source"); - sql("CREATE TABLE IF NOT EXISTS %s (id bigint NOT NULL, data string) " + - "USING iceberg PARTITIONED BY (truncate(id, 3))", sourceName); + sql( + "CREATE TABLE IF NOT EXISTS %s (id bigint NOT NULL, data string) " + + "USING iceberg PARTITIONED BY (truncate(id, 3))", + sourceName); sql("INSERT INTO %s VALUES (1, 'a'), (2, 'b'), (3, 'c')", sourceName); } @@ -56,153 +58,178 @@ public void removeTables() { public void testUnpartitionedCTAS() { sql("CREATE TABLE %s USING iceberg AS SELECT * FROM %s", tableName, sourceName); - Schema expectedSchema = new Schema( - Types.NestedField.optional(1, "id", Types.LongType.get()), - Types.NestedField.optional(2, "data", Types.StringType.get()) - ); + Schema expectedSchema = + new Schema( + Types.NestedField.optional(1, "id", Types.LongType.get()), + Types.NestedField.optional(2, "data", Types.StringType.get())); Table ctasTable = validationCatalog.loadTable(tableIdent); - Assert.assertEquals("Should have expected nullable schema", - expectedSchema.asStruct(), ctasTable.schema().asStruct()); - Assert.assertEquals("Should be an unpartitioned table", - 0, ctasTable.spec().fields().size()); - assertEquals("Should have rows matching the source table", + Assert.assertEquals( + "Should have expected nullable schema", + expectedSchema.asStruct(), + ctasTable.schema().asStruct()); + Assert.assertEquals("Should be an unpartitioned table", 0, ctasTable.spec().fields().size()); + assertEquals( + "Should have rows matching the source table", sql("SELECT * FROM %s ORDER BY id", sourceName), sql("SELECT * FROM %s ORDER BY id", tableName)); } @Test public void testPartitionedCTAS() { - sql("CREATE TABLE %s USING iceberg PARTITIONED BY (id) AS SELECT * FROM %s ORDER BY id", tableName, sourceName); + sql( + "CREATE TABLE %s USING iceberg PARTITIONED BY (id) AS SELECT * FROM %s ORDER BY id", + tableName, sourceName); - Schema expectedSchema = new Schema( - Types.NestedField.optional(1, "id", Types.LongType.get()), - Types.NestedField.optional(2, "data", Types.StringType.get()) - ); + Schema expectedSchema = + new Schema( + Types.NestedField.optional(1, "id", Types.LongType.get()), + Types.NestedField.optional(2, "data", Types.StringType.get())); - PartitionSpec expectedSpec = PartitionSpec.builderFor(expectedSchema) - .identity("id") - .build(); + PartitionSpec expectedSpec = PartitionSpec.builderFor(expectedSchema).identity("id").build(); Table ctasTable = validationCatalog.loadTable(tableIdent); - Assert.assertEquals("Should have expected nullable schema", - expectedSchema.asStruct(), ctasTable.schema().asStruct()); - Assert.assertEquals("Should be partitioned by id", - expectedSpec, 
ctasTable.spec()); - assertEquals("Should have rows matching the source table", + Assert.assertEquals( + "Should have expected nullable schema", + expectedSchema.asStruct(), + ctasTable.schema().asStruct()); + Assert.assertEquals("Should be partitioned by id", expectedSpec, ctasTable.spec()); + assertEquals( + "Should have rows matching the source table", sql("SELECT * FROM %s ORDER BY id", sourceName), sql("SELECT * FROM %s ORDER BY id", tableName)); } @Test public void testRTAS() { - sql("CREATE TABLE %s USING iceberg TBLPROPERTIES ('prop1'='val1', 'prop2'='val2')" + - "AS SELECT * FROM %s", tableName, sourceName); + sql( + "CREATE TABLE %s USING iceberg TBLPROPERTIES ('prop1'='val1', 'prop2'='val2')" + + "AS SELECT * FROM %s", + tableName, sourceName); - assertEquals("Should have rows matching the source table", + assertEquals( + "Should have rows matching the source table", sql("SELECT * FROM %s ORDER BY id", sourceName), sql("SELECT * FROM %s ORDER BY id", tableName)); - sql("REPLACE TABLE %s USING iceberg PARTITIONED BY (part) TBLPROPERTIES ('prop1'='newval1', 'prop3'='val3') AS " + - "SELECT id, data, CASE WHEN (id %% 2) = 0 THEN 'even' ELSE 'odd' END AS part " + - "FROM %s ORDER BY 3, 1", tableName, sourceName); + sql( + "REPLACE TABLE %s USING iceberg PARTITIONED BY (part) TBLPROPERTIES ('prop1'='newval1', 'prop3'='val3') AS " + + "SELECT id, data, CASE WHEN (id %% 2) = 0 THEN 'even' ELSE 'odd' END AS part " + + "FROM %s ORDER BY 3, 1", + tableName, sourceName); - Schema expectedSchema = new Schema( - Types.NestedField.optional(1, "id", Types.LongType.get()), - Types.NestedField.optional(2, "data", Types.StringType.get()), - Types.NestedField.optional(3, "part", Types.StringType.get()) - ); + Schema expectedSchema = + new Schema( + Types.NestedField.optional(1, "id", Types.LongType.get()), + Types.NestedField.optional(2, "data", Types.StringType.get()), + Types.NestedField.optional(3, "part", Types.StringType.get())); - PartitionSpec expectedSpec = PartitionSpec.builderFor(expectedSchema) - .identity("part") - .withSpecId(1) - .build(); + PartitionSpec expectedSpec = + PartitionSpec.builderFor(expectedSchema).identity("part").withSpecId(1).build(); Table rtasTable = validationCatalog.loadTable(tableIdent); // the replacement table has a different schema and partition spec than the original - Assert.assertEquals("Should have expected nullable schema", - expectedSchema.asStruct(), rtasTable.schema().asStruct()); - Assert.assertEquals("Should be partitioned by part", - expectedSpec, rtasTable.spec()); - - assertEquals("Should have rows matching the source table", - sql("SELECT id, data, CASE WHEN (id %% 2) = 0 THEN 'even' ELSE 'odd' END AS part " + - "FROM %s ORDER BY id", sourceName), + Assert.assertEquals( + "Should have expected nullable schema", + expectedSchema.asStruct(), + rtasTable.schema().asStruct()); + Assert.assertEquals("Should be partitioned by part", expectedSpec, rtasTable.spec()); + + assertEquals( + "Should have rows matching the source table", + sql( + "SELECT id, data, CASE WHEN (id %% 2) = 0 THEN 'even' ELSE 'odd' END AS part " + + "FROM %s ORDER BY id", + sourceName), sql("SELECT * FROM %s ORDER BY id", tableName)); - Assert.assertEquals("Table should have expected snapshots", - 2, Iterables.size(rtasTable.snapshots())); + Assert.assertEquals( + "Table should have expected snapshots", 2, Iterables.size(rtasTable.snapshots())); - Assert.assertEquals("Should have updated table property", - "newval1", rtasTable.properties().get("prop1")); - 
Assert.assertEquals("Should have preserved table property", - "val2", rtasTable.properties().get("prop2")); - Assert.assertEquals("Should have new table property", - "val3", rtasTable.properties().get("prop3")); + Assert.assertEquals( + "Should have updated table property", "newval1", rtasTable.properties().get("prop1")); + Assert.assertEquals( + "Should have preserved table property", "val2", rtasTable.properties().get("prop2")); + Assert.assertEquals( + "Should have new table property", "val3", rtasTable.properties().get("prop3")); } @Test public void testCreateRTAS() { - sql("CREATE OR REPLACE TABLE %s USING iceberg PARTITIONED BY (part) AS " + - "SELECT id, data, CASE WHEN (id %% 2) = 0 THEN 'even' ELSE 'odd' END AS part " + - "FROM %s ORDER BY 3, 1", tableName, sourceName); - - assertEquals("Should have rows matching the source table", - sql("SELECT id, data, CASE WHEN (id %% 2) = 0 THEN 'even' ELSE 'odd' END AS part " + - "FROM %s ORDER BY id", sourceName), + sql( + "CREATE OR REPLACE TABLE %s USING iceberg PARTITIONED BY (part) AS " + + "SELECT id, data, CASE WHEN (id %% 2) = 0 THEN 'even' ELSE 'odd' END AS part " + + "FROM %s ORDER BY 3, 1", + tableName, sourceName); + + assertEquals( + "Should have rows matching the source table", + sql( + "SELECT id, data, CASE WHEN (id %% 2) = 0 THEN 'even' ELSE 'odd' END AS part " + + "FROM %s ORDER BY id", + sourceName), sql("SELECT * FROM %s ORDER BY id", tableName)); - sql("CREATE OR REPLACE TABLE %s USING iceberg PARTITIONED BY (part) AS " + - "SELECT 2 * id as id, data, CASE WHEN ((2 * id) %% 2) = 0 THEN 'even' ELSE 'odd' END AS part " + - "FROM %s ORDER BY 3, 1", tableName, sourceName); + sql( + "CREATE OR REPLACE TABLE %s USING iceberg PARTITIONED BY (part) AS " + + "SELECT 2 * id as id, data, CASE WHEN ((2 * id) %% 2) = 0 THEN 'even' ELSE 'odd' END AS part " + + "FROM %s ORDER BY 3, 1", + tableName, sourceName); - Schema expectedSchema = new Schema( - Types.NestedField.optional(1, "id", Types.LongType.get()), - Types.NestedField.optional(2, "data", Types.StringType.get()), - Types.NestedField.optional(3, "part", Types.StringType.get()) - ); + Schema expectedSchema = + new Schema( + Types.NestedField.optional(1, "id", Types.LongType.get()), + Types.NestedField.optional(2, "data", Types.StringType.get()), + Types.NestedField.optional(3, "part", Types.StringType.get())); - PartitionSpec expectedSpec = PartitionSpec.builderFor(expectedSchema) - .identity("part") - .withSpecId(0) // the spec is identical and should be reused - .build(); + PartitionSpec expectedSpec = + PartitionSpec.builderFor(expectedSchema) + .identity("part") + .withSpecId(0) // the spec is identical and should be reused + .build(); Table rtasTable = validationCatalog.loadTable(tableIdent); // the replacement table has a different schema and partition spec than the original - Assert.assertEquals("Should have expected nullable schema", - expectedSchema.asStruct(), rtasTable.schema().asStruct()); - Assert.assertEquals("Should be partitioned by part", - expectedSpec, rtasTable.spec()); - - assertEquals("Should have rows matching the source table", - sql("SELECT 2 * id, data, CASE WHEN ((2 * id) %% 2) = 0 THEN 'even' ELSE 'odd' END AS part " + - "FROM %s ORDER BY id", sourceName), + Assert.assertEquals( + "Should have expected nullable schema", + expectedSchema.asStruct(), + rtasTable.schema().asStruct()); + Assert.assertEquals("Should be partitioned by part", expectedSpec, rtasTable.spec()); + + assertEquals( + "Should have rows matching the source table", + sql( + "SELECT 2 
* id, data, CASE WHEN ((2 * id) %% 2) = 0 THEN 'even' ELSE 'odd' END AS part " + + "FROM %s ORDER BY id", + sourceName), sql("SELECT * FROM %s ORDER BY id", tableName)); - Assert.assertEquals("Table should have expected snapshots", - 2, Iterables.size(rtasTable.snapshots())); + Assert.assertEquals( + "Table should have expected snapshots", 2, Iterables.size(rtasTable.snapshots())); } @Test public void testDataFrameV2Create() throws Exception { spark.table(sourceName).writeTo(tableName).using("iceberg").create(); - Schema expectedSchema = new Schema( - Types.NestedField.optional(1, "id", Types.LongType.get()), - Types.NestedField.optional(2, "data", Types.StringType.get()) - ); + Schema expectedSchema = + new Schema( + Types.NestedField.optional(1, "id", Types.LongType.get()), + Types.NestedField.optional(2, "data", Types.StringType.get())); Table ctasTable = validationCatalog.loadTable(tableIdent); - Assert.assertEquals("Should have expected nullable schema", - expectedSchema.asStruct(), ctasTable.schema().asStruct()); - Assert.assertEquals("Should be an unpartitioned table", - 0, ctasTable.spec().fields().size()); - assertEquals("Should have rows matching the source table", + Assert.assertEquals( + "Should have expected nullable schema", + expectedSchema.asStruct(), + ctasTable.schema().asStruct()); + Assert.assertEquals("Should be an unpartitioned table", 0, ctasTable.spec().fields().size()); + assertEquals( + "Should have rows matching the source table", sql("SELECT * FROM %s ORDER BY id", sourceName), sql("SELECT * FROM %s ORDER BY id", tableName)); } @@ -211,11 +238,13 @@ public void testDataFrameV2Create() throws Exception { public void testDataFrameV2Replace() throws Exception { spark.table(sourceName).writeTo(tableName).using("iceberg").create(); - assertEquals("Should have rows matching the source table", + assertEquals( + "Should have rows matching the source table", sql("SELECT * FROM %s ORDER BY id", sourceName), sql("SELECT * FROM %s ORDER BY id", tableName)); - spark.table(sourceName) + spark + .table(sourceName) .select( col("id"), col("data"), @@ -226,37 +255,40 @@ public void testDataFrameV2Replace() throws Exception { .using("iceberg") .replace(); - Schema expectedSchema = new Schema( - Types.NestedField.optional(1, "id", Types.LongType.get()), - Types.NestedField.optional(2, "data", Types.StringType.get()), - Types.NestedField.optional(3, "part", Types.StringType.get()) - ); + Schema expectedSchema = + new Schema( + Types.NestedField.optional(1, "id", Types.LongType.get()), + Types.NestedField.optional(2, "data", Types.StringType.get()), + Types.NestedField.optional(3, "part", Types.StringType.get())); - PartitionSpec expectedSpec = PartitionSpec.builderFor(expectedSchema) - .identity("part") - .withSpecId(1) - .build(); + PartitionSpec expectedSpec = + PartitionSpec.builderFor(expectedSchema).identity("part").withSpecId(1).build(); Table rtasTable = validationCatalog.loadTable(tableIdent); // the replacement table has a different schema and partition spec than the original - Assert.assertEquals("Should have expected nullable schema", - expectedSchema.asStruct(), rtasTable.schema().asStruct()); - Assert.assertEquals("Should be partitioned by part", - expectedSpec, rtasTable.spec()); - - assertEquals("Should have rows matching the source table", - sql("SELECT id, data, CASE WHEN (id %% 2) = 0 THEN 'even' ELSE 'odd' END AS part " + - "FROM %s ORDER BY id", sourceName), + Assert.assertEquals( + "Should have expected nullable schema", + expectedSchema.asStruct(), + 
rtasTable.schema().asStruct()); + Assert.assertEquals("Should be partitioned by part", expectedSpec, rtasTable.spec()); + + assertEquals( + "Should have rows matching the source table", + sql( + "SELECT id, data, CASE WHEN (id %% 2) = 0 THEN 'even' ELSE 'odd' END AS part " + + "FROM %s ORDER BY id", + sourceName), sql("SELECT * FROM %s ORDER BY id", tableName)); - Assert.assertEquals("Table should have expected snapshots", - 2, Iterables.size(rtasTable.snapshots())); + Assert.assertEquals( + "Table should have expected snapshots", 2, Iterables.size(rtasTable.snapshots())); } @Test public void testDataFrameV2CreateOrReplace() { - spark.table(sourceName) + spark + .table(sourceName) .select( col("id"), col("data"), @@ -267,12 +299,16 @@ public void testDataFrameV2CreateOrReplace() { .using("iceberg") .createOrReplace(); - assertEquals("Should have rows matching the source table", - sql("SELECT id, data, CASE WHEN (id %% 2) = 0 THEN 'even' ELSE 'odd' END AS part " + - "FROM %s ORDER BY id", sourceName), + assertEquals( + "Should have rows matching the source table", + sql( + "SELECT id, data, CASE WHEN (id %% 2) = 0 THEN 'even' ELSE 'odd' END AS part " + + "FROM %s ORDER BY id", + sourceName), sql("SELECT * FROM %s ORDER BY id", tableName)); - spark.table(sourceName) + spark + .table(sourceName) .select(col("id").multiply(lit(2)).as("id"), col("data")) .select( col("id"), @@ -284,80 +320,97 @@ public void testDataFrameV2CreateOrReplace() { .using("iceberg") .createOrReplace(); - Schema expectedSchema = new Schema( - Types.NestedField.optional(1, "id", Types.LongType.get()), - Types.NestedField.optional(2, "data", Types.StringType.get()), - Types.NestedField.optional(3, "part", Types.StringType.get()) - ); + Schema expectedSchema = + new Schema( + Types.NestedField.optional(1, "id", Types.LongType.get()), + Types.NestedField.optional(2, "data", Types.StringType.get()), + Types.NestedField.optional(3, "part", Types.StringType.get())); - PartitionSpec expectedSpec = PartitionSpec.builderFor(expectedSchema) - .identity("part") - .withSpecId(0) // the spec is identical and should be reused - .build(); + PartitionSpec expectedSpec = + PartitionSpec.builderFor(expectedSchema) + .identity("part") + .withSpecId(0) // the spec is identical and should be reused + .build(); Table rtasTable = validationCatalog.loadTable(tableIdent); // the replacement table has a different schema and partition spec than the original - Assert.assertEquals("Should have expected nullable schema", - expectedSchema.asStruct(), rtasTable.schema().asStruct()); - Assert.assertEquals("Should be partitioned by part", - expectedSpec, rtasTable.spec()); - - assertEquals("Should have rows matching the source table", - sql("SELECT 2 * id, data, CASE WHEN ((2 * id) %% 2) = 0 THEN 'even' ELSE 'odd' END AS part " + - "FROM %s ORDER BY id", sourceName), + Assert.assertEquals( + "Should have expected nullable schema", + expectedSchema.asStruct(), + rtasTable.schema().asStruct()); + Assert.assertEquals("Should be partitioned by part", expectedSpec, rtasTable.spec()); + + assertEquals( + "Should have rows matching the source table", + sql( + "SELECT 2 * id, data, CASE WHEN ((2 * id) %% 2) = 0 THEN 'even' ELSE 'odd' END AS part " + + "FROM %s ORDER BY id", + sourceName), sql("SELECT * FROM %s ORDER BY id", tableName)); - Assert.assertEquals("Table should have expected snapshots", - 2, Iterables.size(rtasTable.snapshots())); + Assert.assertEquals( + "Table should have expected snapshots", 2, Iterables.size(rtasTable.snapshots())); } @Test 
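The next test exercises partition spec evolution across a replace: after updateSpec().removeField("part"), the dropped field is kept as an always-null (void) transform named part_1000, so the replacement table lands on a brand-new spec ID instead of reusing spec 0. A minimal sketch of the expected-spec pattern these assertions rely on (same Iceberg PartitionSpec builder API as in the hunks above; schema and field names mirror the test, and this is a fragment, not a full test class):

    import org.apache.iceberg.PartitionSpec;
    import org.apache.iceberg.Schema;
    import org.apache.iceberg.types.Types;

    // Fragment: the two expected specs compared by the RTAS assertions.
    Schema schema =
        new Schema(
            Types.NestedField.optional(1, "id", Types.LongType.get()),
            Types.NestedField.optional(2, "data", Types.StringType.get()),
            Types.NestedField.optional(3, "part", Types.StringType.get()));

    PartitionSpec reused =
        PartitionSpec.builderFor(schema).identity("part").withSpecId(0).build();

    PartitionSpec evolved =
        PartitionSpec.builderFor(schema)
            .alwaysNull("part", "part_1000") // soft-deleted field kept as a void transform
            .identity("part")
            .identity("id")
            .withSpecId(2) // a new spec, not a reuse of spec 0 or 1
            .build();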
public void testCreateRTASWithPartitionSpecChanging() { - sql("CREATE OR REPLACE TABLE %s USING iceberg PARTITIONED BY (part) AS " + - "SELECT id, data, CASE WHEN (id %% 2) = 0 THEN 'even' ELSE 'odd' END AS part " + - "FROM %s ORDER BY 3, 1", tableName, sourceName); + sql( + "CREATE OR REPLACE TABLE %s USING iceberg PARTITIONED BY (part) AS " + + "SELECT id, data, CASE WHEN (id %% 2) = 0 THEN 'even' ELSE 'odd' END AS part " + + "FROM %s ORDER BY 3, 1", + tableName, sourceName); Table rtasTable = validationCatalog.loadTable(tableIdent); - assertEquals("Should have rows matching the source table", - sql("SELECT id, data, CASE WHEN (id %% 2) = 0 THEN 'even' ELSE 'odd' END AS part " + - "FROM %s ORDER BY id", sourceName), + assertEquals( + "Should have rows matching the source table", + sql( + "SELECT id, data, CASE WHEN (id %% 2) = 0 THEN 'even' ELSE 'odd' END AS part " + + "FROM %s ORDER BY id", + sourceName), sql("SELECT * FROM %s ORDER BY id", tableName)); // Change the partitioning of the table rtasTable.updateSpec().removeField("part").commit(); // Spec 1 - sql("CREATE OR REPLACE TABLE %s USING iceberg PARTITIONED BY (part, id) AS " + - "SELECT 2 * id as id, data, CASE WHEN ((2 * id) %% 2) = 0 THEN 'even' ELSE 'odd' END AS part " + - "FROM %s ORDER BY 3, 1", tableName, sourceName); + sql( + "CREATE OR REPLACE TABLE %s USING iceberg PARTITIONED BY (part, id) AS " + + "SELECT 2 * id as id, data, CASE WHEN ((2 * id) %% 2) = 0 THEN 'even' ELSE 'odd' END AS part " + + "FROM %s ORDER BY 3, 1", + tableName, sourceName); - Schema expectedSchema = new Schema( - Types.NestedField.optional(1, "id", Types.LongType.get()), - Types.NestedField.optional(2, "data", Types.StringType.get()), - Types.NestedField.optional(3, "part", Types.StringType.get()) - ); + Schema expectedSchema = + new Schema( + Types.NestedField.optional(1, "id", Types.LongType.get()), + Types.NestedField.optional(2, "data", Types.StringType.get()), + Types.NestedField.optional(3, "part", Types.StringType.get())); - PartitionSpec expectedSpec = PartitionSpec.builderFor(expectedSchema) - .alwaysNull("part", "part_1000") - .identity("part") - .identity("id") - .withSpecId(2) // The Spec is new - .build(); + PartitionSpec expectedSpec = + PartitionSpec.builderFor(expectedSchema) + .alwaysNull("part", "part_1000") + .identity("part") + .identity("id") + .withSpecId(2) // The Spec is new + .build(); - Assert.assertEquals("Should be partitioned by part and id", - expectedSpec, rtasTable.spec()); + Assert.assertEquals("Should be partitioned by part and id", expectedSpec, rtasTable.spec()); // the replacement table has a different schema and partition spec than the original - Assert.assertEquals("Should have expected nullable schema", - expectedSchema.asStruct(), rtasTable.schema().asStruct()); - - assertEquals("Should have rows matching the source table", - sql("SELECT 2 * id, data, CASE WHEN ((2 * id) %% 2) = 0 THEN 'even' ELSE 'odd' END AS part " + - "FROM %s ORDER BY id", sourceName), + Assert.assertEquals( + "Should have expected nullable schema", + expectedSchema.asStruct(), + rtasTable.schema().asStruct()); + + assertEquals( + "Should have rows matching the source table", + sql( + "SELECT 2 * id, data, CASE WHEN ((2 * id) %% 2) = 0 THEN 'even' ELSE 'odd' END AS part " + + "FROM %s ORDER BY id", + sourceName), sql("SELECT * FROM %s ORDER BY id", tableName)); - Assert.assertEquals("Table should have expected snapshots", - 2, Iterables.size(rtasTable.snapshots())); + Assert.assertEquals( + "Table should have expected snapshots", 2, 
Iterables.size(rtasTable.snapshots())); } } diff --git a/spark/v3.2/spark/src/test/java/org/apache/iceberg/spark/sql/TestDeleteFrom.java b/spark/v3.2/spark/src/test/java/org/apache/iceberg/spark/sql/TestDeleteFrom.java index b9b1a7647dd9..903b1f330036 100644 --- a/spark/v3.2/spark/src/test/java/org/apache/iceberg/spark/sql/TestDeleteFrom.java +++ b/spark/v3.2/spark/src/test/java/org/apache/iceberg/spark/sql/TestDeleteFrom.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.sql; import java.util.List; @@ -50,25 +49,27 @@ public void removeTables() { public void testDeleteFromUnpartitionedTable() throws NoSuchTableException { sql("CREATE TABLE %s (id bigint, data string) USING iceberg", tableName); - List records = Lists.newArrayList( - new SimpleRecord(1, "a"), - new SimpleRecord(2, "b"), - new SimpleRecord(3, "c") - ); + List records = + Lists.newArrayList( + new SimpleRecord(1, "a"), new SimpleRecord(2, "b"), new SimpleRecord(3, "c")); Dataset df = spark.createDataFrame(records, SimpleRecord.class); df.coalesce(1).writeTo(tableName).append(); - assertEquals("Should have expected rows", + assertEquals( + "Should have expected rows", ImmutableList.of(row(1L, "a"), row(2L, "b"), row(3L, "c")), sql("SELECT * FROM %s ORDER BY id", tableName)); - AssertHelpers.assertThrows("Should not delete when not all rows of a file match the filter", - AnalysisException.class, "Cannot delete from", + AssertHelpers.assertThrows( + "Should not delete when not all rows of a file match the filter", + AnalysisException.class, + "Cannot delete from", () -> sql("DELETE FROM %s WHERE id < 2", tableName)); sql("DELETE FROM %s WHERE id < 4", tableName); - assertEquals("Should have no rows after successful delete", + assertEquals( + "Should have no rows after successful delete", ImmutableList.of(), sql("SELECT * FROM %s ORDER BY id", tableName)); } @@ -77,46 +78,50 @@ public void testDeleteFromUnpartitionedTable() throws NoSuchTableException { public void testDeleteFromTableAtSnapshot() throws NoSuchTableException { sql("CREATE TABLE %s (id bigint, data string) USING iceberg", tableName); - List records = Lists.newArrayList( - new SimpleRecord(1, "a"), - new SimpleRecord(2, "b"), - new SimpleRecord(3, "c") - ); + List records = + Lists.newArrayList( + new SimpleRecord(1, "a"), new SimpleRecord(2, "b"), new SimpleRecord(3, "c")); Dataset df = spark.createDataFrame(records, SimpleRecord.class); df.coalesce(1).writeTo(tableName).append(); long snapshotId = validationCatalog.loadTable(tableIdent).currentSnapshot().snapshotId(); String prefix = "snapshot_id_"; - AssertHelpers.assertThrows("Should not be able to delete from a table at a specific snapshot", - IllegalArgumentException.class, "Cannot delete from table at a specific snapshot", + AssertHelpers.assertThrows( + "Should not be able to delete from a table at a specific snapshot", + IllegalArgumentException.class, + "Cannot delete from table at a specific snapshot", () -> sql("DELETE FROM %s.%s WHERE id < 4", tableName, prefix + snapshotId)); } @Test public void testDeleteFromPartitionedTable() throws NoSuchTableException { - sql("CREATE TABLE %s (id bigint, data string) " + - "USING iceberg " + - "PARTITIONED BY (truncate(id, 2))", tableName); - - List records = Lists.newArrayList( - new SimpleRecord(1, "a"), - new SimpleRecord(2, "b"), - new SimpleRecord(3, "c") - ); + sql( + "CREATE TABLE %s (id bigint, data string) " + + "USING iceberg " + + "PARTITIONED BY (truncate(id, 2))", 
+ tableName); + + List records = + Lists.newArrayList( + new SimpleRecord(1, "a"), new SimpleRecord(2, "b"), new SimpleRecord(3, "c")); Dataset df = spark.createDataFrame(records, SimpleRecord.class); df.coalesce(1).writeTo(tableName).append(); - assertEquals("Should have 3 rows in 2 partitions", + assertEquals( + "Should have 3 rows in 2 partitions", ImmutableList.of(row(1L, "a"), row(2L, "b"), row(3L, "c")), sql("SELECT * FROM %s ORDER BY id", tableName)); - AssertHelpers.assertThrows("Should not delete when not all rows of a file match the filter", - AnalysisException.class, "Cannot delete from table", + AssertHelpers.assertThrows( + "Should not delete when not all rows of a file match the filter", + AnalysisException.class, + "Cannot delete from table", () -> sql("DELETE FROM %s WHERE id > 2", tableName)); sql("DELETE FROM %s WHERE id < 2", tableName); - assertEquals("Should have two rows in the second partition", + assertEquals( + "Should have two rows in the second partition", ImmutableList.of(row(2L, "b"), row(3L, "c")), sql("SELECT * FROM %s ORDER BY id", tableName)); } @@ -126,7 +131,8 @@ public void testDeleteFromWhereFalse() { sql("CREATE TABLE %s (id bigint NOT NULL, data string) USING iceberg", tableName); sql("INSERT INTO TABLE %s VALUES (1, 'a'), (2, 'b'), (3, 'c')", tableName); - assertEquals("Should have expected rows", + assertEquals( + "Should have expected rows", ImmutableList.of(row(1L, "a"), row(2L, "b"), row(3L, "c")), sql("SELECT * FROM %s ORDER BY id", tableName)); @@ -137,7 +143,8 @@ public void testDeleteFromWhereFalse() { table.refresh(); - Assert.assertEquals("Delete should not produce a new snapshot", 1, Iterables.size(table.snapshots())); + Assert.assertEquals( + "Delete should not produce a new snapshot", 1, Iterables.size(table.snapshots())); } @Test @@ -145,7 +152,8 @@ public void testTruncate() { sql("CREATE TABLE %s (id bigint NOT NULL, data string) USING iceberg", tableName); sql("INSERT INTO TABLE %s VALUES (1, 'a'), (2, 'b'), (3, 'c')", tableName); - assertEquals("Should have expected rows", + assertEquals( + "Should have expected rows", ImmutableList.of(row(1L, "a"), row(2L, "b"), row(3L, "c")), sql("SELECT * FROM %s ORDER BY id", tableName)); @@ -154,7 +162,8 @@ public void testTruncate() { sql("TRUNCATE TABLE %s", tableName); - assertEquals("Should have expected rows", + assertEquals( + "Should have expected rows", ImmutableList.of(), sql("SELECT * FROM %s ORDER BY id", tableName)); } diff --git a/spark/v3.2/spark/src/test/java/org/apache/iceberg/spark/sql/TestDropTable.java b/spark/v3.2/spark/src/test/java/org/apache/iceberg/spark/sql/TestDropTable.java index 535cd3926a1a..2189bd0dae75 100644 --- a/spark/v3.2/spark/src/test/java/org/apache/iceberg/spark/sql/TestDropTable.java +++ b/spark/v3.2/spark/src/test/java/org/apache/iceberg/spark/sql/TestDropTable.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
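The negative cases in the delete tests above all go through Iceberg's AssertHelpers.assertThrows(description, expectedExceptionType, messageFragment, action) helper. Roughly the same check can be written with plain JUnit 4.13+; the fragment below assumes the surrounding test base still provides sql(...) and tableName:

    import org.apache.spark.sql.AnalysisException;
    import org.junit.Assert;

    // Expect the delete to be rejected because not all rows of a file match the filter.
    AnalysisException e =
        Assert.assertThrows(
            AnalysisException.class, () -> sql("DELETE FROM %s WHERE id < 2", tableName));
    Assert.assertTrue(
        "Error message should explain the rejected delete",
        e.getMessage().contains("Cannot delete from"));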
*/ - package org.apache.iceberg.spark.sql; import java.io.IOException; @@ -65,11 +64,14 @@ public void testDropTableGCDisabled() throws IOException { } private void dropTableInternal() throws IOException { - assertEquals("Should have expected rows", - ImmutableList.of(row(1, "test")), sql("SELECT * FROM %s", tableName)); + assertEquals( + "Should have expected rows", + ImmutableList.of(row(1, "test")), + sql("SELECT * FROM %s", tableName)); List manifestAndFiles = manifestsAndFiles(); - Assert.assertEquals("There should be 2 files for manifests and files", 2, manifestAndFiles.size()); + Assert.assertEquals( + "There should be 2 files for manifests and files", 2, manifestAndFiles.size()); Assert.assertTrue("All files should be existed", checkFilesExist(manifestAndFiles, true)); sql("DROP TABLE %s", tableName); @@ -85,11 +87,14 @@ private void dropTableInternal() throws IOException { @Test public void testPurgeTable() throws IOException { - assertEquals("Should have expected rows", - ImmutableList.of(row(1, "test")), sql("SELECT * FROM %s", tableName)); + assertEquals( + "Should have expected rows", + ImmutableList.of(row(1, "test")), + sql("SELECT * FROM %s", tableName)); List manifestAndFiles = manifestsAndFiles(); - Assert.assertEquals("There should be 2 files for manifests and files", 2, manifestAndFiles.size()); + Assert.assertEquals( + "There should be 2 files for manifests and files", 2, manifestAndFiles.size()); Assert.assertTrue("All files should exist", checkFilesExist(manifestAndFiles, true)); sql("DROP TABLE %s PURGE", tableName); @@ -101,14 +106,19 @@ public void testPurgeTable() throws IOException { public void testPurgeTableGCDisabled() throws IOException { sql("ALTER TABLE %s SET TBLPROPERTIES (gc.enabled = false)", tableName); - assertEquals("Should have expected rows", - ImmutableList.of(row(1, "test")), sql("SELECT * FROM %s", tableName)); + assertEquals( + "Should have expected rows", + ImmutableList.of(row(1, "test")), + sql("SELECT * FROM %s", tableName)); List manifestAndFiles = manifestsAndFiles(); - Assert.assertEquals("There totally should have 2 files for manifests and files", 2, manifestAndFiles.size()); + Assert.assertEquals( + "There totally should have 2 files for manifests and files", 2, manifestAndFiles.size()); Assert.assertTrue("All files should be existed", checkFilesExist(manifestAndFiles, true)); - AssertHelpers.assertThrows("Purge table is not allowed when GC is disabled", ValidationException.class, + AssertHelpers.assertThrows( + "Purge table is not allowed when GC is disabled", + ValidationException.class, "Cannot purge table: GC is disabled (deleting files may corrupt other tables", () -> sql("DROP TABLE %s PURGE", tableName)); @@ -118,8 +128,11 @@ public void testPurgeTableGCDisabled() throws IOException { private List manifestsAndFiles() { List files = sql("SELECT file_path FROM %s.%s", tableName, MetadataTableType.FILES); - List manifests = sql("SELECT path FROM %s.%s", tableName, MetadataTableType.MANIFESTS); - return Streams.concat(files.stream(), manifests.stream()).map(row -> (String) row[0]).collect(Collectors.toList()); + List manifests = + sql("SELECT path FROM %s.%s", tableName, MetadataTableType.MANIFESTS); + return Streams.concat(files.stream(), manifests.stream()) + .map(row -> (String) row[0]) + .collect(Collectors.toList()); } private boolean checkFilesExist(List files, boolean shouldExist) throws IOException { @@ -129,12 +142,14 @@ private boolean checkFilesExist(List files, boolean shouldExist) throws } FileSystem fs = new 
Path(files.get(0)).getFileSystem(hiveConf); - return files.stream().allMatch(file -> { - try { - return fs.exists(new Path(file)) ^ mask; - } catch (IOException e) { - throw new RuntimeException(e); - } - }); + return files.stream() + .allMatch( + file -> { + try { + return fs.exists(new Path(file)) ^ mask; + } catch (IOException e) { + throw new RuntimeException(e); + } + }); } } diff --git a/spark/v3.2/spark/src/test/java/org/apache/iceberg/spark/sql/TestNamespaceSQL.java b/spark/v3.2/spark/src/test/java/org/apache/iceberg/spark/sql/TestNamespaceSQL.java index d1eac312669a..317a95cd0140 100644 --- a/spark/v3.2/spark/src/test/java/org/apache/iceberg/spark/sql/TestNamespaceSQL.java +++ b/spark/v3.2/spark/src/test/java/org/apache/iceberg/spark/sql/TestNamespaceSQL.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.sql; import java.io.File; @@ -56,7 +55,8 @@ public void cleanNamespaces() { @Test public void testCreateNamespace() { - Assert.assertFalse("Namespace should not already exist", validationNamespaceCatalog.namespaceExists(NS)); + Assert.assertFalse( + "Namespace should not already exist", validationNamespaceCatalog.namespaceExists(NS)); sql("CREATE NAMESPACE %s", fullNamespace); @@ -76,7 +76,8 @@ public void testDefaultNamespace() { @Test public void testDropEmptyNamespace() { - Assert.assertFalse("Namespace should not already exist", validationNamespaceCatalog.namespaceExists(NS)); + Assert.assertFalse( + "Namespace should not already exist", validationNamespaceCatalog.namespaceExists(NS)); sql("CREATE NAMESPACE %s", fullNamespace); @@ -84,23 +85,28 @@ public void testDropEmptyNamespace() { sql("DROP NAMESPACE %s", fullNamespace); - Assert.assertFalse("Namespace should have been dropped", validationNamespaceCatalog.namespaceExists(NS)); + Assert.assertFalse( + "Namespace should have been dropped", validationNamespaceCatalog.namespaceExists(NS)); } @Test public void testDropNonEmptyNamespace() { Assume.assumeFalse("Session catalog has flaky behavior", "spark_catalog".equals(catalogName)); - Assert.assertFalse("Namespace should not already exist", validationNamespaceCatalog.namespaceExists(NS)); + Assert.assertFalse( + "Namespace should not already exist", validationNamespaceCatalog.namespaceExists(NS)); sql("CREATE NAMESPACE %s", fullNamespace); sql("CREATE TABLE %s.table (id bigint) USING iceberg", fullNamespace); Assert.assertTrue("Namespace should exist", validationNamespaceCatalog.namespaceExists(NS)); - Assert.assertTrue("Table should exist", validationCatalog.tableExists(TableIdentifier.of(NS, "table"))); + Assert.assertTrue( + "Table should exist", validationCatalog.tableExists(TableIdentifier.of(NS, "table"))); - AssertHelpers.assertThrows("Should fail if trying to delete a non-empty namespace", - SparkException.class, "non-empty namespace", + AssertHelpers.assertThrows( + "Should fail if trying to delete a non-empty namespace", + SparkException.class, + "non-empty namespace", () -> sql("DROP NAMESPACE %s", fullNamespace)); sql("DROP TABLE %s.table", fullNamespace); @@ -108,7 +114,8 @@ public void testDropNonEmptyNamespace() { @Test public void testListTables() { - Assert.assertFalse("Namespace should not already exist", validationNamespaceCatalog.namespaceExists(NS)); + Assert.assertFalse( + "Namespace should not already exist", validationNamespaceCatalog.namespaceExists(NS)); sql("CREATE NAMESPACE %s", fullNamespace); @@ -126,7 +133,8 @@ public void testListTables() { @Test public void 
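A brief note on the checkFilesExist helper reformatted above: the mask presumably holds !shouldExist, so fs.exists(path) ^ mask is true exactly when a file's observed state matches the expectation, which lets a single stream/allMatch pass cover both "all files still exist" and "all files were purged". A self-contained sketch of the same XOR idiom (plain Java, no Hadoop dependency):

    import java.util.List;

    public class ExistenceCheck {
      // True when every observed existence flag matches the expected state.
      static boolean allMatch(List<Boolean> existsFlags, boolean shouldExist) {
        boolean mask = !shouldExist; // flips the sense of the check
        return existsFlags.stream().allMatch(exists -> exists ^ mask);
      }

      public static void main(String[] args) {
        System.out.println(allMatch(List.of(true, true), true));    // true: all present, expected present
        System.out.println(allMatch(List.of(false, false), false)); // true: all gone, expected gone
        System.out.println(allMatch(List.of(true, false), true));   // false: one file is missing
      }
    }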
testListNamespace() { - Assert.assertFalse("Namespace should not already exist", validationNamespaceCatalog.namespaceExists(NS)); + Assert.assertFalse( + "Namespace should not already exist", validationNamespaceCatalog.namespaceExists(NS)); sql("CREATE NAMESPACE %s", fullNamespace); @@ -136,17 +144,23 @@ public void testListNamespace() { if (isHadoopCatalog) { Assert.assertEquals("Should have 1 namespace", 1, namespaces.size()); - Set namespaceNames = namespaces.stream().map(arr -> arr[0].toString()).collect(Collectors.toSet()); + Set namespaceNames = + namespaces.stream().map(arr -> arr[0].toString()).collect(Collectors.toSet()); Assert.assertEquals("Should have only db namespace", ImmutableSet.of("db"), namespaceNames); } else { Assert.assertEquals("Should have 2 namespaces", 2, namespaces.size()); - Set namespaceNames = namespaces.stream().map(arr -> arr[0].toString()).collect(Collectors.toSet()); - Assert.assertEquals("Should have default and db namespaces", ImmutableSet.of("default", "db"), namespaceNames); + Set namespaceNames = + namespaces.stream().map(arr -> arr[0].toString()).collect(Collectors.toSet()); + Assert.assertEquals( + "Should have default and db namespaces", + ImmutableSet.of("default", "db"), + namespaceNames); } List nestedNamespaces = sql("SHOW NAMESPACES IN %s", fullNamespace); - Set nestedNames = nestedNamespaces.stream().map(arr -> arr[0].toString()).collect(Collectors.toSet()); + Set nestedNames = + nestedNamespaces.stream().map(arr -> arr[0].toString()).collect(Collectors.toSet()); Assert.assertEquals("Should not have nested namespaces", ImmutableSet.of(), nestedNames); } @@ -154,7 +168,8 @@ public void testListNamespace() { public void testCreateNamespaceWithMetadata() { Assume.assumeFalse("HadoopCatalog does not support namespace metadata", isHadoopCatalog); - Assert.assertFalse("Namespace should not already exist", validationNamespaceCatalog.namespaceExists(NS)); + Assert.assertFalse( + "Namespace should not already exist", validationNamespaceCatalog.namespaceExists(NS)); sql("CREATE NAMESPACE %s WITH PROPERTIES ('prop'='value')", fullNamespace); @@ -162,14 +177,16 @@ public void testCreateNamespaceWithMetadata() { Map nsMetadata = validationNamespaceCatalog.loadNamespaceMetadata(NS); - Assert.assertEquals("Namespace should have expected prop value", "value", nsMetadata.get("prop")); + Assert.assertEquals( + "Namespace should have expected prop value", "value", nsMetadata.get("prop")); } @Test public void testCreateNamespaceWithComment() { Assume.assumeFalse("HadoopCatalog does not support namespace metadata", isHadoopCatalog); - Assert.assertFalse("Namespace should not already exist", validationNamespaceCatalog.namespaceExists(NS)); + Assert.assertFalse( + "Namespace should not already exist", validationNamespaceCatalog.namespaceExists(NS)); sql("CREATE NAMESPACE %s COMMENT 'namespace doc'", fullNamespace); @@ -177,14 +194,16 @@ public void testCreateNamespaceWithComment() { Map nsMetadata = validationNamespaceCatalog.loadNamespaceMetadata(NS); - Assert.assertEquals("Namespace should have expected comment", "namespace doc", nsMetadata.get("comment")); + Assert.assertEquals( + "Namespace should have expected comment", "namespace doc", nsMetadata.get("comment")); } @Test public void testCreateNamespaceWithLocation() throws Exception { Assume.assumeFalse("HadoopCatalog does not support namespace locations", isHadoopCatalog); - Assert.assertFalse("Namespace should not already exist", validationNamespaceCatalog.namespaceExists(NS)); + Assert.assertFalse( + 
"Namespace should not already exist", validationNamespaceCatalog.namespaceExists(NS)); File location = temp.newFile(); Assert.assertTrue(location.delete()); @@ -195,27 +214,32 @@ public void testCreateNamespaceWithLocation() throws Exception { Map nsMetadata = validationNamespaceCatalog.loadNamespaceMetadata(NS); - Assert.assertEquals("Namespace should have expected location", - "file:" + location.getPath(), nsMetadata.get("location")); + Assert.assertEquals( + "Namespace should have expected location", + "file:" + location.getPath(), + nsMetadata.get("location")); } @Test public void testSetProperties() { Assume.assumeFalse("HadoopCatalog does not support namespace metadata", isHadoopCatalog); - Assert.assertFalse("Namespace should not already exist", validationNamespaceCatalog.namespaceExists(NS)); + Assert.assertFalse( + "Namespace should not already exist", validationNamespaceCatalog.namespaceExists(NS)); sql("CREATE NAMESPACE %s", fullNamespace); Assert.assertTrue("Namespace should exist", validationNamespaceCatalog.namespaceExists(NS)); Map defaultMetadata = validationNamespaceCatalog.loadNamespaceMetadata(NS); - Assert.assertFalse("Default metadata should not have custom property", defaultMetadata.containsKey("prop")); + Assert.assertFalse( + "Default metadata should not have custom property", defaultMetadata.containsKey("prop")); sql("ALTER NAMESPACE %s SET PROPERTIES ('prop'='value')", fullNamespace); Map nsMetadata = validationNamespaceCatalog.loadNamespaceMetadata(NS); - Assert.assertEquals("Namespace should have expected prop value", "value", nsMetadata.get("prop")); + Assert.assertEquals( + "Namespace should have expected prop value", "value", nsMetadata.get("prop")); } } diff --git a/spark/v3.2/spark/src/test/java/org/apache/iceberg/spark/sql/TestPartitionedWrites.java b/spark/v3.2/spark/src/test/java/org/apache/iceberg/spark/sql/TestPartitionedWrites.java index 9223797ada32..51c56ac79d4d 100644 --- a/spark/v3.2/spark/src/test/java/org/apache/iceberg/spark/sql/TestPartitionedWrites.java +++ b/spark/v3.2/spark/src/test/java/org/apache/iceberg/spark/sql/TestPartitionedWrites.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.spark.sql; import java.util.List; @@ -34,13 +33,16 @@ import org.junit.Test; public class TestPartitionedWrites extends SparkCatalogTestBase { - public TestPartitionedWrites(String catalogName, String implementation, Map config) { + public TestPartitionedWrites( + String catalogName, String implementation, Map config) { super(catalogName, implementation, config); } @Before public void createTables() { - sql("CREATE TABLE %s (id bigint, data string) USING iceberg PARTITIONED BY (truncate(id, 3))", tableName); + sql( + "CREATE TABLE %s (id bigint, data string) USING iceberg PARTITIONED BY (truncate(id, 3))", + tableName); sql("INSERT INTO %s VALUES (1, 'a'), (2, 'b'), (3, 'c')", tableName); } @@ -55,17 +57,14 @@ public void testInsertAppend() { sql("INSERT INTO %s VALUES (4, 'd'), (5, 'e')", tableName); - Assert.assertEquals("Should have 5 rows after insert", 5L, scalarSql("SELECT count(*) FROM %s", tableName)); + Assert.assertEquals( + "Should have 5 rows after insert", 5L, scalarSql("SELECT count(*) FROM %s", tableName)); - List expected = ImmutableList.of( - row(1L, "a"), - row(2L, "b"), - row(3L, "c"), - row(4L, "d"), - row(5L, "e") - ); + List expected = + ImmutableList.of(row(1L, "a"), row(2L, "b"), row(3L, "c"), row(4L, "d"), row(5L, "e")); - assertEquals("Row data should match expected", expected, sql("SELECT * FROM %s ORDER BY id", tableName)); + assertEquals( + "Row data should match expected", expected, sql("SELECT * FROM %s ORDER BY id", tableName)); } @Test @@ -75,88 +74,70 @@ public void testInsertOverwrite() { // 4 and 5 replace 3 in the partition (id - (id % 3)) = 3 sql("INSERT OVERWRITE %s VALUES (4, 'd'), (5, 'e')", tableName); - Assert.assertEquals("Should have 4 rows after overwrite", 4L, scalarSql("SELECT count(*) FROM %s", tableName)); + Assert.assertEquals( + "Should have 4 rows after overwrite", 4L, scalarSql("SELECT count(*) FROM %s", tableName)); - List expected = ImmutableList.of( - row(1L, "a"), - row(2L, "b"), - row(4L, "d"), - row(5L, "e") - ); + List expected = + ImmutableList.of(row(1L, "a"), row(2L, "b"), row(4L, "d"), row(5L, "e")); - assertEquals("Row data should match expected", expected, sql("SELECT * FROM %s ORDER BY id", tableName)); + assertEquals( + "Row data should match expected", expected, sql("SELECT * FROM %s ORDER BY id", tableName)); } @Test public void testDataFrameV2Append() throws NoSuchTableException { Assert.assertEquals("Should have 3 rows", 3L, scalarSql("SELECT count(*) FROM %s", tableName)); - List data = ImmutableList.of( - new SimpleRecord(4, "d"), - new SimpleRecord(5, "e") - ); + List data = ImmutableList.of(new SimpleRecord(4, "d"), new SimpleRecord(5, "e")); Dataset ds = spark.createDataFrame(data, SimpleRecord.class); ds.writeTo(tableName).append(); - Assert.assertEquals("Should have 5 rows after insert", 5L, scalarSql("SELECT count(*) FROM %s", tableName)); + Assert.assertEquals( + "Should have 5 rows after insert", 5L, scalarSql("SELECT count(*) FROM %s", tableName)); - List expected = ImmutableList.of( - row(1L, "a"), - row(2L, "b"), - row(3L, "c"), - row(4L, "d"), - row(5L, "e") - ); + List expected = + ImmutableList.of(row(1L, "a"), row(2L, "b"), row(3L, "c"), row(4L, "d"), row(5L, "e")); - assertEquals("Row data should match expected", expected, sql("SELECT * FROM %s ORDER BY id", tableName)); + assertEquals( + "Row data should match expected", expected, sql("SELECT * FROM %s ORDER BY id", tableName)); } @Test public void testDataFrameV2DynamicOverwrite() throws NoSuchTableException { 
Assert.assertEquals("Should have 3 rows", 3L, scalarSql("SELECT count(*) FROM %s", tableName)); - List data = ImmutableList.of( - new SimpleRecord(4, "d"), - new SimpleRecord(5, "e") - ); + List data = ImmutableList.of(new SimpleRecord(4, "d"), new SimpleRecord(5, "e")); Dataset ds = spark.createDataFrame(data, SimpleRecord.class); ds.writeTo(tableName).overwritePartitions(); - Assert.assertEquals("Should have 4 rows after overwrite", 4L, scalarSql("SELECT count(*) FROM %s", tableName)); + Assert.assertEquals( + "Should have 4 rows after overwrite", 4L, scalarSql("SELECT count(*) FROM %s", tableName)); - List expected = ImmutableList.of( - row(1L, "a"), - row(2L, "b"), - row(4L, "d"), - row(5L, "e") - ); + List expected = + ImmutableList.of(row(1L, "a"), row(2L, "b"), row(4L, "d"), row(5L, "e")); - assertEquals("Row data should match expected", expected, sql("SELECT * FROM %s ORDER BY id", tableName)); + assertEquals( + "Row data should match expected", expected, sql("SELECT * FROM %s ORDER BY id", tableName)); } @Test public void testDataFrameV2Overwrite() throws NoSuchTableException { Assert.assertEquals("Should have 3 rows", 3L, scalarSql("SELECT count(*) FROM %s", tableName)); - List data = ImmutableList.of( - new SimpleRecord(4, "d"), - new SimpleRecord(5, "e") - ); + List data = ImmutableList.of(new SimpleRecord(4, "d"), new SimpleRecord(5, "e")); Dataset ds = spark.createDataFrame(data, SimpleRecord.class); ds.writeTo(tableName).overwrite(functions.col("id").$less(3)); - Assert.assertEquals("Should have 3 rows after overwrite", 3L, scalarSql("SELECT count(*) FROM %s", tableName)); + Assert.assertEquals( + "Should have 3 rows after overwrite", 3L, scalarSql("SELECT count(*) FROM %s", tableName)); - List expected = ImmutableList.of( - row(3L, "c"), - row(4L, "d"), - row(5L, "e") - ); + List expected = ImmutableList.of(row(3L, "c"), row(4L, "d"), row(5L, "e")); - assertEquals("Row data should match expected", expected, sql("SELECT * FROM %s ORDER BY id", tableName)); + assertEquals( + "Row data should match expected", expected, sql("SELECT * FROM %s ORDER BY id", tableName)); } @Test @@ -166,13 +147,13 @@ public void testViewsReturnRecentResults() { Dataset query = spark.sql("SELECT * FROM " + tableName + " WHERE id = 1"); query.createOrReplaceTempView("tmp"); - assertEquals("View should have expected rows", - ImmutableList.of(row(1L, "a")), - sql("SELECT * FROM tmp")); + assertEquals( + "View should have expected rows", ImmutableList.of(row(1L, "a")), sql("SELECT * FROM tmp")); sql("INSERT INTO TABLE %s VALUES (1, 'a')", tableName); - assertEquals("View should have expected rows", + assertEquals( + "View should have expected rows", ImmutableList.of(row(1L, "a"), row(1L, "a")), sql("SELECT * FROM tmp")); } diff --git a/spark/v3.2/spark/src/test/java/org/apache/iceberg/spark/sql/TestPartitionedWritesAsSelect.java b/spark/v3.2/spark/src/test/java/org/apache/iceberg/spark/sql/TestPartitionedWritesAsSelect.java index e38d545d8df9..3ffd38b83c3b 100644 --- a/spark/v3.2/spark/src/test/java/org/apache/iceberg/spark/sql/TestPartitionedWritesAsSelect.java +++ b/spark/v3.2/spark/src/test/java/org/apache/iceberg/spark/sql/TestPartitionedWritesAsSelect.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.spark.sql; import java.util.List; @@ -35,7 +34,9 @@ public class TestPartitionedWritesAsSelect extends SparkTestBaseWithCatalog { @Before public void createTables() { - sql("CREATE TABLE %s (id bigint, data string, category string, ts timestamp) USING iceberg", tableName); + sql( + "CREATE TABLE %s (id bigint, data string, category string, ts timestamp) USING iceberg", + tableName); } @After @@ -49,15 +50,23 @@ public void testInsertAsSelectAppend() { insertData(3); List expected = currentData(); - sql("CREATE TABLE %s (id bigint, data string, category string, ts timestamp)" + - "USING iceberg PARTITIONED BY (days(ts), category)", targetTable); - - sql("INSERT INTO %s SELECT id, data, category, ts FROM %s ORDER BY ts,category", targetTable, tableName); - Assert.assertEquals("Should have 15 rows after insert", - 3 * 5L, scalarSql("SELECT count(*) FROM %s", targetTable)); - - assertEquals("Row data should match expected", - expected, sql("SELECT * FROM %s ORDER BY id", targetTable)); + sql( + "CREATE TABLE %s (id bigint, data string, category string, ts timestamp)" + + "USING iceberg PARTITIONED BY (days(ts), category)", + targetTable); + + sql( + "INSERT INTO %s SELECT id, data, category, ts FROM %s ORDER BY ts,category", + targetTable, tableName); + Assert.assertEquals( + "Should have 15 rows after insert", + 3 * 5L, + scalarSql("SELECT count(*) FROM %s", targetTable)); + + assertEquals( + "Row data should match expected", + expected, + sql("SELECT * FROM %s ORDER BY id", targetTable)); } @Test @@ -65,16 +74,24 @@ public void testInsertAsSelectWithBucket() { insertData(3); List expected = currentData(); - sql("CREATE TABLE %s (id bigint, data string, category string, ts timestamp)" + - "USING iceberg PARTITIONED BY (bucket(8, data))", targetTable); + sql( + "CREATE TABLE %s (id bigint, data string, category string, ts timestamp)" + + "USING iceberg PARTITIONED BY (bucket(8, data))", + targetTable); IcebergSpark.registerBucketUDF(spark, "iceberg_bucket8", DataTypes.StringType, 8); - sql("INSERT INTO %s SELECT id, data, category, ts FROM %s ORDER BY iceberg_bucket8(data)", targetTable, tableName); - Assert.assertEquals("Should have 15 rows after insert", - 3 * 5L, scalarSql("SELECT count(*) FROM %s", targetTable)); - - assertEquals("Row data should match expected", - expected, sql("SELECT * FROM %s ORDER BY id", targetTable)); + sql( + "INSERT INTO %s SELECT id, data, category, ts FROM %s ORDER BY iceberg_bucket8(data)", + targetTable, tableName); + Assert.assertEquals( + "Should have 15 rows after insert", + 3 * 5L, + scalarSql("SELECT count(*) FROM %s", targetTable)); + + assertEquals( + "Row data should match expected", + expected, + sql("SELECT * FROM %s ORDER BY id", targetTable)); } @Test @@ -82,28 +99,40 @@ public void testInsertAsSelectWithTruncate() { insertData(3); List expected = currentData(); - sql("CREATE TABLE %s (id bigint, data string, category string, ts timestamp)" + - "USING iceberg PARTITIONED BY (truncate(data, 4), truncate(id, 4))", targetTable); + sql( + "CREATE TABLE %s (id bigint, data string, category string, ts timestamp)" + + "USING iceberg PARTITIONED BY (truncate(data, 4), truncate(id, 4))", + targetTable); IcebergSpark.registerTruncateUDF(spark, "iceberg_truncate_string4", DataTypes.StringType, 4); IcebergSpark.registerTruncateUDF(spark, "iceberg_truncate_long4", DataTypes.LongType, 4); - sql("INSERT INTO %s SELECT id, data, category, ts FROM %s " + - "ORDER BY iceberg_truncate_string4(data),iceberg_truncate_long4(id)", 
targetTable, tableName); - Assert.assertEquals("Should have 15 rows after insert", - 3 * 5L, scalarSql("SELECT count(*) FROM %s", targetTable)); - - assertEquals("Row data should match expected", - expected, sql("SELECT * FROM %s ORDER BY id", targetTable)); + sql( + "INSERT INTO %s SELECT id, data, category, ts FROM %s " + + "ORDER BY iceberg_truncate_string4(data),iceberg_truncate_long4(id)", + targetTable, tableName); + Assert.assertEquals( + "Should have 15 rows after insert", + 3 * 5L, + scalarSql("SELECT count(*) FROM %s", targetTable)); + + assertEquals( + "Row data should match expected", + expected, + sql("SELECT * FROM %s ORDER BY id", targetTable)); } private void insertData(int repeatCounter) { - IntStream.range(0, repeatCounter).forEach(i -> { - sql("INSERT INTO %s VALUES (13, '1', 'bgd16', timestamp('2021-11-10 11:20:10'))," + - "(21, '2', 'bgd13', timestamp('2021-11-10 11:20:10')), " + - "(12, '3', 'bgd14', timestamp('2021-11-10 11:20:10'))," + - "(222, '3', 'bgd15', timestamp('2021-11-10 11:20:10'))," + - "(45, '4', 'bgd16', timestamp('2021-11-10 11:20:10'))", tableName); - }); + IntStream.range(0, repeatCounter) + .forEach( + i -> { + sql( + "INSERT INTO %s VALUES (13, '1', 'bgd16', timestamp('2021-11-10 11:20:10'))," + + "(21, '2', 'bgd13', timestamp('2021-11-10 11:20:10')), " + + "(12, '3', 'bgd14', timestamp('2021-11-10 11:20:10'))," + + "(222, '3', 'bgd15', timestamp('2021-11-10 11:20:10'))," + + "(45, '4', 'bgd16', timestamp('2021-11-10 11:20:10'))", + tableName); + }); } private List currentData() { diff --git a/spark/v3.2/spark/src/test/java/org/apache/iceberg/spark/sql/TestRefreshTable.java b/spark/v3.2/spark/src/test/java/org/apache/iceberg/spark/sql/TestRefreshTable.java index a8bdea77e237..3eaca6329477 100644 --- a/spark/v3.2/spark/src/test/java/org/apache/iceberg/spark/sql/TestRefreshTable.java +++ b/spark/v3.2/spark/src/test/java/org/apache/iceberg/spark/sql/TestRefreshTable.java @@ -16,8 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - - package org.apache.iceberg.spark.sql; import java.util.List; @@ -49,7 +47,8 @@ public void removeTables() { @Test public void testRefreshCommand() { - // We are not allowed to change the session catalog after it has been initialized, so build a new one + // We are not allowed to change the session catalog after it has been initialized, so build a + // new one if (catalogName.equals("spark_catalog")) { spark.conf().set("spark.sql.catalog." 
+ catalogName + ".cache-enabled", true); spark = spark.cloneSession(); @@ -59,7 +58,8 @@ public void testRefreshCommand() { List originalActual = sql("SELECT * FROM %s", tableName); assertEquals("Table should start as expected", originalExpected, originalActual); - // Modify table outside of spark, it should be cached so Spark should see the same value after mutation + // Modify table outside of spark, it should be cached so Spark should see the same value after + // mutation Table table = validationCatalog.loadTable(tableIdent); DataFile file = table.currentSnapshot().addedDataFiles(table.io()).iterator().next(); table.newDelete().deleteFile(file).commit(); diff --git a/spark/v3.2/spark/src/test/java/org/apache/iceberg/spark/sql/TestSelect.java b/spark/v3.2/spark/src/test/java/org/apache/iceberg/spark/sql/TestSelect.java index 43dd29c0d1f1..0cd92ce42ecb 100644 --- a/spark/v3.2/spark/src/test/java/org/apache/iceberg/spark/sql/TestSelect.java +++ b/spark/v3.2/spark/src/test/java/org/apache/iceberg/spark/sql/TestSelect.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.sql; import java.util.List; @@ -46,10 +45,12 @@ public TestSelect(String catalogName, String implementation, Map super(catalogName, implementation, config); // register a scan event listener to validate pushdown - Listeners.register(event -> { - scanEventCount += 1; - lastScanEvent = event; - }, ScanEvent.class); + Listeners.register( + event -> { + scanEventCount += 1; + lastScanEvent = event; + }, + ScanEvent.class); } @Before @@ -69,8 +70,8 @@ public void removeTables() { @Test public void testSelect() { - List expected = ImmutableList.of( - row(1L, "a", 1.0F), row(2L, "b", 2.0F), row(3L, "c", Float.NaN)); + List expected = + ImmutableList.of(row(1L, "a", 1.0F), row(2L, "b", 2.0F), row(3L, "c", Float.NaN)); assertEquals("Should return all expected rows", expected, sql("SELECT * FROM %s", tableName)); } @@ -79,11 +80,14 @@ public void testSelect() { public void testSelectRewrite() { List expected = ImmutableList.of(row(3L, "c", Float.NaN)); - assertEquals("Should return all expected rows", expected, + assertEquals( + "Should return all expected rows", + expected, sql("SELECT * FROM %s where float = float('NaN')", tableName)); Assert.assertEquals("Should create only one scan", 1, scanEventCount); - Assert.assertEquals("Should push down expected filter", + Assert.assertEquals( + "Should push down expected filter", "(float IS NOT NULL AND is_nan(float))", Spark3Util.describe(lastScanEvent.filter())); } @@ -95,8 +99,10 @@ public void testProjection() { assertEquals("Should return all expected rows", expected, sql("SELECT id FROM %s", tableName)); Assert.assertEquals("Should create only one scan", 1, scanEventCount); - Assert.assertEquals("Should not push down a filter", Expressions.alwaysTrue(), lastScanEvent.filter()); - Assert.assertEquals("Should project only the id column", + Assert.assertEquals( + "Should not push down a filter", Expressions.alwaysTrue(), lastScanEvent.filter()); + Assert.assertEquals( + "Should project only the id column", validationCatalog.loadTable(tableIdent).schema().select("id").asStruct(), lastScanEvent.projection().asStruct()); } @@ -105,13 +111,18 @@ public void testProjection() { public void testExpressionPushdown() { List expected = ImmutableList.of(row("b")); - assertEquals("Should return all expected rows", expected, sql("SELECT data FROM %s WHERE id = 2", tableName)); + assertEquals( + "Should return all 
expected rows", + expected, + sql("SELECT data FROM %s WHERE id = 2", tableName)); Assert.assertEquals("Should create only one scan", 1, scanEventCount); - Assert.assertEquals("Should push down expected filter", + Assert.assertEquals( + "Should push down expected filter", "(id IS NOT NULL AND id = 2)", Spark3Util.describe(lastScanEvent.filter())); - Assert.assertEquals("Should project only id and data columns", + Assert.assertEquals( + "Should project only id and data columns", validationCatalog.loadTable(tableIdent).schema().select("id", "data").asStruct(), lastScanEvent.projection().asStruct()); } @@ -122,7 +133,8 @@ public void testMetadataTables() { "Spark session catalog does not support metadata tables", "spark_catalog".equals(catalogName)); - assertEquals("Snapshot metadata table", + assertEquals( + "Snapshot metadata table", ImmutableList.of(row(ANY, ANY, null, "append", ANY, ANY)), sql("SELECT * FROM %s.snapshots", tableName)); } @@ -146,10 +158,12 @@ public void testSnapshotInTableName() { assertEquals("Snapshot at specific ID, prefix " + prefix, expected, actual); // read the table using DataFrameReader option - Dataset df = spark.read() - .format("iceberg") - .option(SparkReadOptions.SNAPSHOT_ID, snapshotId) - .load(tableName); + Dataset df = + spark + .read() + .format("iceberg") + .option(SparkReadOptions.SNAPSHOT_ID, snapshotId) + .load(tableName); List fromDF = rowsToJava(df.collectAsList()); assertEquals("Snapshot at specific ID " + snapshotId, expected, fromDF); } @@ -174,10 +188,12 @@ public void testTimestampInTableName() { assertEquals("Snapshot at timestamp, prefix " + prefix, expected, actual); // read the table using DataFrameReader option - Dataset df = spark.read() - .format("iceberg") - .option(SparkReadOptions.AS_OF_TIMESTAMP, timestamp) - .load(tableName); + Dataset df = + spark + .read() + .format("iceberg") + .option(SparkReadOptions.AS_OF_TIMESTAMP, timestamp) + .load(tableName); List fromDF = rowsToJava(df.collectAsList()); assertEquals("Snapshot at timestamp " + timestamp, expected, fromDF); } @@ -187,22 +203,25 @@ public void testSpecifySnapshotAndTimestamp() { // get the snapshot ID of the last write long snapshotId = validationCatalog.loadTable(tableIdent).currentSnapshot().snapshotId(); // get a timestamp just after the last write - long timestamp = validationCatalog.loadTable(tableIdent).currentSnapshot().timestampMillis() + 2; + long timestamp = + validationCatalog.loadTable(tableIdent).currentSnapshot().timestampMillis() + 2; // create a second snapshot sql("INSERT INTO %s VALUES (4, 'd', 4.0), (5, 'e', 5.0)", tableName); - AssertHelpers.assertThrows("Should not be able to specify both snapshot id and timestamp", + AssertHelpers.assertThrows( + "Should not be able to specify both snapshot id and timestamp", IllegalArgumentException.class, - String.format("Cannot specify both snapshot-id (%s) and as-of-timestamp (%s)", - snapshotId, timestamp), + String.format( + "Cannot specify both snapshot-id (%s) and as-of-timestamp (%s)", snapshotId, timestamp), () -> { - spark.read() - .format("iceberg") - .option(SparkReadOptions.SNAPSHOT_ID, snapshotId) - .option(SparkReadOptions.AS_OF_TIMESTAMP, timestamp) - .load(tableName) - .collectAsList(); + spark + .read() + .format("iceberg") + .option(SparkReadOptions.SNAPSHOT_ID, snapshotId) + .option(SparkReadOptions.AS_OF_TIMESTAMP, timestamp) + .load(tableName) + .collectAsList(); }); } @@ -210,21 +229,31 @@ public void testSpecifySnapshotAndTimestamp() { public void testBinaryInFilter() { sql("CREATE TABLE %s 
(id bigint, binary binary) USING iceberg", binaryTableName); sql("INSERT INTO %s VALUES (1, X''), (2, X'1111'), (3, X'11')", binaryTableName); - List expected = ImmutableList.of(row(2L, new byte[]{0x11, 0x11})); + List expected = ImmutableList.of(row(2L, new byte[] {0x11, 0x11})); - assertEquals("Should return all expected rows", expected, + assertEquals( + "Should return all expected rows", + expected, sql("SELECT id, binary FROM %s where binary > X'11'", binaryTableName)); } @Test public void testComplexTypeFilter() { String complexTypeTableName = tableName("complex_table"); - sql("CREATE TABLE %s (id INT, complex STRUCT) USING iceberg", complexTypeTableName); - sql("INSERT INTO TABLE %s VALUES (1, named_struct(\"c1\", 3, \"c2\", \"v1\"))", complexTypeTableName); - sql("INSERT INTO TABLE %s VALUES (2, named_struct(\"c1\", 2, \"c2\", \"v2\"))", complexTypeTableName); - - List result = sql("SELECT id FROM %s WHERE complex = named_struct(\"c1\", 3, \"c2\", \"v1\")", + sql( + "CREATE TABLE %s (id INT, complex STRUCT) USING iceberg", complexTypeTableName); + sql( + "INSERT INTO TABLE %s VALUES (1, named_struct(\"c1\", 3, \"c2\", \"v1\"))", + complexTypeTableName); + sql( + "INSERT INTO TABLE %s VALUES (2, named_struct(\"c1\", 2, \"c2\", \"v2\"))", + complexTypeTableName); + + List result = + sql( + "SELECT id FROM %s WHERE complex = named_struct(\"c1\", 3, \"c2\", \"v1\")", + complexTypeTableName); assertEquals("Should return all expected rows", ImmutableList.of(row(1)), result); sql("DROP TABLE IF EXISTS %s", complexTypeTableName); diff --git a/spark/v3.2/spark/src/test/java/org/apache/iceberg/spark/sql/TestTimestampWithoutZone.java b/spark/v3.2/spark/src/test/java/org/apache/iceberg/spark/sql/TestTimestampWithoutZone.java index ddaac5256e10..51b8d255a99b 100644 --- a/spark/v3.2/spark/src/test/java/org/apache/iceberg/spark/sql/TestTimestampWithoutZone.java +++ b/spark/v3.2/spark/src/test/java/org/apache/iceberg/spark/sql/TestTimestampWithoutZone.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
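In the TestSelect changes above, snapshot reads also go through DataFrameReader options. A minimal sketch of the two time-travel options with placeholder id and timestamp values; the string keys match the SparkReadOptions constants referenced in the test, and setting both on one read fails with the "Cannot specify both snapshot-id ... and as-of-timestamp ..." error asserted above:

    import org.apache.spark.sql.Dataset;
    import org.apache.spark.sql.Row;

    // Read a fixed snapshot by id (placeholder value).
    Dataset<Row> atSnapshot =
        spark.read().format("iceberg").option("snapshot-id", 10963874102873L).load("db.sample");

    // Read the snapshot that was current as of a timestamp in millis (placeholder value).
    Dataset<Row> asOf =
        spark.read().format("iceberg").option("as-of-timestamp", 1648327425000L).load("db.sample");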
*/ - package org.apache.iceberg.spark.sql; import java.sql.Timestamp; @@ -50,32 +49,35 @@ public class TestTimestampWithoutZone extends SparkCatalogTestBase { private static final String newTableName = "created_table"; private final Map config; - private static final Schema schema = new Schema( + private static final Schema schema = + new Schema( Types.NestedField.required(1, "id", Types.LongType.get()), Types.NestedField.required(2, "ts", Types.TimestampType.withoutZone()), - Types.NestedField.required(3, "tsz", Types.TimestampType.withZone()) - ); + Types.NestedField.required(3, "tsz", Types.TimestampType.withZone())); - private final List values = ImmutableList.of( + private final List values = + ImmutableList.of( row(1L, toTimestamp("2021-01-01T00:00:00.0"), toTimestamp("2021-02-01T00:00:00.0")), row(2L, toTimestamp("2021-01-01T00:00:00.0"), toTimestamp("2021-02-01T00:00:00.0")), - row(3L, toTimestamp("2021-01-01T00:00:00.0"), toTimestamp("2021-02-01T00:00:00.0")) - ); + row(3L, toTimestamp("2021-01-01T00:00:00.0"), toTimestamp("2021-02-01T00:00:00.0"))); @Parameterized.Parameters(name = "catalogName = {0}, implementation = {1}, config = {2}") public static Object[][] parameters() { - return new Object[][]{{"spark_catalog", - SparkSessionCatalog.class.getName(), - ImmutableMap.of( - "type", "hive", - "default-namespace", "default", - "parquet-enabled", "true", - "cache-enabled", "false" - )} + return new Object[][] { + { + "spark_catalog", + SparkSessionCatalog.class.getName(), + ImmutableMap.of( + "type", "hive", + "default-namespace", "default", + "parquet-enabled", "true", + "cache-enabled", "false") + } }; } - public TestTimestampWithoutZone(String catalogName, String implementation, Map config) { + public TestTimestampWithoutZone( + String catalogName, String implementation, Map config) { super(catalogName, implementation, config); this.config = config; } @@ -94,8 +96,10 @@ public void removeTables() { @Test public void testWriteTimestampWithoutZoneError() { AssertHelpers.assertThrows( - String.format("Write operation performed on a timestamp without timezone field while " + - "'%s' set to false should throw exception", SparkSQLProperties.HANDLE_TIMESTAMP_WITHOUT_TIMEZONE), + String.format( + "Write operation performed on a timestamp without timezone field while " + + "'%s' set to false should throw exception", + SparkSQLProperties.HANDLE_TIMESTAMP_WITHOUT_TIMEZONE), IllegalArgumentException.class, SparkUtil.TIMESTAMP_WITHOUT_TIMEZONE_ERROR, () -> sql("INSERT INTO %s VALUES %s", tableName, rowToSqlValues(values))); @@ -103,72 +107,98 @@ public void testWriteTimestampWithoutZoneError() { @Test public void testAppendTimestampWithoutZone() { - withSQLConf(ImmutableMap.of(SparkSQLProperties.HANDLE_TIMESTAMP_WITHOUT_TIMEZONE, "true"), () -> { - sql("INSERT INTO %s VALUES %s", tableName, rowToSqlValues(values)); - - Assert.assertEquals("Should have " + values.size() + " row", - (long) values.size(), scalarSql("SELECT count(*) FROM %s", tableName)); - - assertEquals("Row data should match expected", - values, sql("SELECT * FROM %s ORDER BY id", tableName)); - }); + withSQLConf( + ImmutableMap.of(SparkSQLProperties.HANDLE_TIMESTAMP_WITHOUT_TIMEZONE, "true"), + () -> { + sql("INSERT INTO %s VALUES %s", tableName, rowToSqlValues(values)); + + Assert.assertEquals( + "Should have " + values.size() + " row", + (long) values.size(), + scalarSql("SELECT count(*) FROM %s", tableName)); + + assertEquals( + "Row data should match expected", + values, + sql("SELECT * FROM %s ORDER BY id", 
tableName)); + }); } @Test public void testCreateAsSelectWithTimestampWithoutZone() { - withSQLConf(ImmutableMap.of(SparkSQLProperties.HANDLE_TIMESTAMP_WITHOUT_TIMEZONE, "true"), () -> { - sql("INSERT INTO %s VALUES %s", tableName, rowToSqlValues(values)); + withSQLConf( + ImmutableMap.of(SparkSQLProperties.HANDLE_TIMESTAMP_WITHOUT_TIMEZONE, "true"), + () -> { + sql("INSERT INTO %s VALUES %s", tableName, rowToSqlValues(values)); - sql("CREATE TABLE %s USING iceberg AS SELECT * FROM %s", newTableName, tableName); + sql("CREATE TABLE %s USING iceberg AS SELECT * FROM %s", newTableName, tableName); - Assert.assertEquals("Should have " + values.size() + " row", (long) values.size(), + Assert.assertEquals( + "Should have " + values.size() + " row", + (long) values.size(), scalarSql("SELECT count(*) FROM %s", newTableName)); - assertEquals("Row data should match expected", + assertEquals( + "Row data should match expected", sql("SELECT * FROM %s ORDER BY id", tableName), sql("SELECT * FROM %s ORDER BY id", newTableName)); - }); + }); } @Test public void testCreateNewTableShouldHaveTimestampWithZoneIcebergType() { - withSQLConf(ImmutableMap.of(SparkSQLProperties.HANDLE_TIMESTAMP_WITHOUT_TIMEZONE, "true"), () -> { - sql("INSERT INTO %s VALUES %s", tableName, rowToSqlValues(values)); + withSQLConf( + ImmutableMap.of(SparkSQLProperties.HANDLE_TIMESTAMP_WITHOUT_TIMEZONE, "true"), + () -> { + sql("INSERT INTO %s VALUES %s", tableName, rowToSqlValues(values)); - sql("CREATE TABLE %s USING iceberg AS SELECT * FROM %s", newTableName, tableName); + sql("CREATE TABLE %s USING iceberg AS SELECT * FROM %s", newTableName, tableName); - Assert.assertEquals("Should have " + values.size() + " row", (long) values.size(), + Assert.assertEquals( + "Should have " + values.size() + " row", + (long) values.size(), scalarSql("SELECT count(*) FROM %s", newTableName)); - assertEquals("Data from created table should match data from base table", + assertEquals( + "Data from created table should match data from base table", sql("SELECT * FROM %s ORDER BY id", tableName), sql("SELECT * FROM %s ORDER BY id", newTableName)); - Table createdTable = validationCatalog.loadTable(TableIdentifier.of("default", newTableName)); - assertFieldsType(createdTable.schema(), Types.TimestampType.withZone(), "ts", "tsz"); - }); + Table createdTable = + validationCatalog.loadTable(TableIdentifier.of("default", newTableName)); + assertFieldsType(createdTable.schema(), Types.TimestampType.withZone(), "ts", "tsz"); + }); } @Test public void testCreateNewTableShouldHaveTimestampWithoutZoneIcebergType() { - withSQLConf(ImmutableMap.of( + withSQLConf( + ImmutableMap.of( SparkSQLProperties.HANDLE_TIMESTAMP_WITHOUT_TIMEZONE, "true", - SparkSQLProperties.USE_TIMESTAMP_WITHOUT_TIME_ZONE_IN_NEW_TABLES, "true"), () -> { - spark.sessionState().catalogManager().currentCatalog() + SparkSQLProperties.USE_TIMESTAMP_WITHOUT_TIME_ZONE_IN_NEW_TABLES, "true"), + () -> { + spark + .sessionState() + .catalogManager() + .currentCatalog() .initialize(catalog.name(), new CaseInsensitiveStringMap(config)); - sql("INSERT INTO %s VALUES %s", tableName, rowToSqlValues(values)); + sql("INSERT INTO %s VALUES %s", tableName, rowToSqlValues(values)); - sql("CREATE TABLE %s USING iceberg AS SELECT * FROM %s", newTableName, tableName); + sql("CREATE TABLE %s USING iceberg AS SELECT * FROM %s", newTableName, tableName); - Assert.assertEquals("Should have " + values.size() + " row", (long) values.size(), + Assert.assertEquals( + "Should have " + values.size() + " row", + (long) 
values.size(), scalarSql("SELECT count(*) FROM %s", newTableName)); - assertEquals("Row data should match expected", + assertEquals( + "Row data should match expected", sql("SELECT * FROM %s ORDER BY id", tableName), sql("SELECT * FROM %s ORDER BY id", newTableName)); - Table createdTable = validationCatalog.loadTable(TableIdentifier.of("default", newTableName)); - assertFieldsType(createdTable.schema(), Types.TimestampType.withoutZone(), "ts", "tsz"); - }); + Table createdTable = + validationCatalog.loadTable(TableIdentifier.of("default", newTableName)); + assertFieldsType(createdTable.schema(), Types.TimestampType.withoutZone(), "ts", "tsz"); + }); } private Timestamp toTimestamp(String value) { @@ -176,21 +206,33 @@ private Timestamp toTimestamp(String value) { } private String rowToSqlValues(List rows) { - List rowValues = rows.stream().map(row -> { - List columns = Arrays.stream(row).map(value -> { - if (value instanceof Long) { - return value.toString(); - } else if (value instanceof Timestamp) { - return String.format("timestamp '%s'", value); - } - throw new RuntimeException("Type is not supported"); - }).collect(Collectors.toList()); - return "(" + Joiner.on(",").join(columns) + ")"; - }).collect(Collectors.toList()); + List rowValues = + rows.stream() + .map( + row -> { + List columns = + Arrays.stream(row) + .map( + value -> { + if (value instanceof Long) { + return value.toString(); + } else if (value instanceof Timestamp) { + return String.format("timestamp '%s'", value); + } + throw new RuntimeException("Type is not supported"); + }) + .collect(Collectors.toList()); + return "(" + Joiner.on(",").join(columns) + ")"; + }) + .collect(Collectors.toList()); return Joiner.on(",").join(rowValues); } private void assertFieldsType(Schema actual, Type.PrimitiveType expected, String... fields) { - actual.select(fields).asStruct().fields().forEach(field -> Assert.assertEquals(expected, field.type())); + actual + .select(fields) + .asStruct() + .fields() + .forEach(field -> Assert.assertEquals(expected, field.type())); } } diff --git a/spark/v3.2/spark/src/test/java/org/apache/iceberg/spark/sql/TestUnpartitionedWrites.java b/spark/v3.2/spark/src/test/java/org/apache/iceberg/spark/sql/TestUnpartitionedWrites.java index bd6fb5abf2c6..0849602c3b92 100644 --- a/spark/v3.2/spark/src/test/java/org/apache/iceberg/spark/sql/TestUnpartitionedWrites.java +++ b/spark/v3.2/spark/src/test/java/org/apache/iceberg/spark/sql/TestUnpartitionedWrites.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
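The TestTimestampWithoutZone changes above lean on the test base's withSQLConf helper. As a rough, hedged sketch of what such a helper typically does (not the actual Iceberg implementation): record the prior values, apply the overrides, run the action, and restore the previous state in a finally block:

    import java.util.HashMap;
    import java.util.Map;
    import org.apache.spark.sql.SparkSession;

    class SqlConfScope {
      // Apply the overrides only for the duration of the action, then restore prior values.
      static void withSQLConf(SparkSession spark, Map<String, String> overrides, Runnable action) {
        Map<String, String> previous = new HashMap<>();
        overrides.keySet().forEach(key -> previous.put(key, spark.conf().get(key, null)));
        overrides.forEach((key, value) -> spark.conf().set(key, value));
        try {
          action.run();
        } finally {
          previous.forEach(
              (key, value) -> {
                if (value != null) {
                  spark.conf().set(key, value);
                } else {
                  spark.conf().unset(key);
                }
              });
        }
      }
    }

Usage would mirror the tests above, e.g. wrapping an INSERT and its assertions so HANDLE_TIMESTAMP_WITHOUT_TIMEZONE is only set to "true" for that block.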
*/ - package org.apache.iceberg.spark.sql; import java.util.List; @@ -35,7 +34,8 @@ import org.junit.Test; public class TestUnpartitionedWrites extends SparkCatalogTestBase { - public TestUnpartitionedWrites(String catalogName, String implementation, Map config) { + public TestUnpartitionedWrites( + String catalogName, String implementation, Map config) { super(catalogName, implementation, config); } @@ -56,17 +56,14 @@ public void testInsertAppend() { sql("INSERT INTO %s VALUES (4, 'd'), (5, 'e')", tableName); - Assert.assertEquals("Should have 5 rows after insert", 5L, scalarSql("SELECT count(*) FROM %s", tableName)); + Assert.assertEquals( + "Should have 5 rows after insert", 5L, scalarSql("SELECT count(*) FROM %s", tableName)); - List expected = ImmutableList.of( - row(1L, "a"), - row(2L, "b"), - row(3L, "c"), - row(4L, "d"), - row(5L, "e") - ); + List expected = + ImmutableList.of(row(1L, "a"), row(2L, "b"), row(3L, "c"), row(4L, "d"), row(5L, "e")); - assertEquals("Row data should match expected", expected, sql("SELECT * FROM %s ORDER BY id", tableName)); + assertEquals( + "Row data should match expected", expected, sql("SELECT * FROM %s ORDER BY id", tableName)); } @Test @@ -75,22 +72,23 @@ public void testInsertOverwrite() { sql("INSERT OVERWRITE %s VALUES (4, 'd'), (5, 'e')", tableName); - Assert.assertEquals("Should have 2 rows after overwrite", 2L, scalarSql("SELECT count(*) FROM %s", tableName)); + Assert.assertEquals( + "Should have 2 rows after overwrite", 2L, scalarSql("SELECT count(*) FROM %s", tableName)); - List expected = ImmutableList.of( - row(4L, "d"), - row(5L, "e") - ); + List expected = ImmutableList.of(row(4L, "d"), row(5L, "e")); - assertEquals("Row data should match expected", expected, sql("SELECT * FROM %s ORDER BY id", tableName)); + assertEquals( + "Row data should match expected", expected, sql("SELECT * FROM %s ORDER BY id", tableName)); } @Test public void testInsertAppendAtSnapshot() { long snapshotId = validationCatalog.loadTable(tableIdent).currentSnapshot().snapshotId(); String prefix = "snapshot_id_"; - AssertHelpers.assertThrows("Should not be able to insert into a table at a specific snapshot", - IllegalArgumentException.class, "Cannot write to table at a specific snapshot", + AssertHelpers.assertThrows( + "Should not be able to insert into a table at a specific snapshot", + IllegalArgumentException.class, + "Cannot write to table at a specific snapshot", () -> sql("INSERT INTO %s.%s VALUES (4, 'd'), (5, 'e')", tableName, prefix + snapshotId)); } @@ -98,77 +96,68 @@ public void testInsertAppendAtSnapshot() { public void testInsertOverwriteAtSnapshot() { long snapshotId = validationCatalog.loadTable(tableIdent).currentSnapshot().snapshotId(); String prefix = "snapshot_id_"; - AssertHelpers.assertThrows("Should not be able to insert into a table at a specific snapshot", - IllegalArgumentException.class, "Cannot write to table at a specific snapshot", - () -> sql("INSERT OVERWRITE %s.%s VALUES (4, 'd'), (5, 'e')", tableName, prefix + snapshotId)); + AssertHelpers.assertThrows( + "Should not be able to insert into a table at a specific snapshot", + IllegalArgumentException.class, + "Cannot write to table at a specific snapshot", + () -> + sql( + "INSERT OVERWRITE %s.%s VALUES (4, 'd'), (5, 'e')", + tableName, prefix + snapshotId)); } @Test public void testDataFrameV2Append() throws NoSuchTableException { Assert.assertEquals("Should have 3 rows", 3L, scalarSql("SELECT count(*) FROM %s", tableName)); - List data = ImmutableList.of( - new SimpleRecord(4, 
"d"), - new SimpleRecord(5, "e") - ); + List data = ImmutableList.of(new SimpleRecord(4, "d"), new SimpleRecord(5, "e")); Dataset ds = spark.createDataFrame(data, SimpleRecord.class); ds.writeTo(tableName).append(); - Assert.assertEquals("Should have 5 rows after insert", 5L, scalarSql("SELECT count(*) FROM %s", tableName)); + Assert.assertEquals( + "Should have 5 rows after insert", 5L, scalarSql("SELECT count(*) FROM %s", tableName)); - List expected = ImmutableList.of( - row(1L, "a"), - row(2L, "b"), - row(3L, "c"), - row(4L, "d"), - row(5L, "e") - ); + List expected = + ImmutableList.of(row(1L, "a"), row(2L, "b"), row(3L, "c"), row(4L, "d"), row(5L, "e")); - assertEquals("Row data should match expected", expected, sql("SELECT * FROM %s ORDER BY id", tableName)); + assertEquals( + "Row data should match expected", expected, sql("SELECT * FROM %s ORDER BY id", tableName)); } @Test public void testDataFrameV2DynamicOverwrite() throws NoSuchTableException { Assert.assertEquals("Should have 3 rows", 3L, scalarSql("SELECT count(*) FROM %s", tableName)); - List data = ImmutableList.of( - new SimpleRecord(4, "d"), - new SimpleRecord(5, "e") - ); + List data = ImmutableList.of(new SimpleRecord(4, "d"), new SimpleRecord(5, "e")); Dataset ds = spark.createDataFrame(data, SimpleRecord.class); ds.writeTo(tableName).overwritePartitions(); - Assert.assertEquals("Should have 2 rows after overwrite", 2L, scalarSql("SELECT count(*) FROM %s", tableName)); + Assert.assertEquals( + "Should have 2 rows after overwrite", 2L, scalarSql("SELECT count(*) FROM %s", tableName)); - List expected = ImmutableList.of( - row(4L, "d"), - row(5L, "e") - ); + List expected = ImmutableList.of(row(4L, "d"), row(5L, "e")); - assertEquals("Row data should match expected", expected, sql("SELECT * FROM %s ORDER BY id", tableName)); + assertEquals( + "Row data should match expected", expected, sql("SELECT * FROM %s ORDER BY id", tableName)); } @Test public void testDataFrameV2Overwrite() throws NoSuchTableException { Assert.assertEquals("Should have 3 rows", 3L, scalarSql("SELECT count(*) FROM %s", tableName)); - List data = ImmutableList.of( - new SimpleRecord(4, "d"), - new SimpleRecord(5, "e") - ); + List data = ImmutableList.of(new SimpleRecord(4, "d"), new SimpleRecord(5, "e")); Dataset ds = spark.createDataFrame(data, SimpleRecord.class); ds.writeTo(tableName).overwrite(functions.col("id").$less$eq(3)); - Assert.assertEquals("Should have 2 rows after overwrite", 2L, scalarSql("SELECT count(*) FROM %s", tableName)); + Assert.assertEquals( + "Should have 2 rows after overwrite", 2L, scalarSql("SELECT count(*) FROM %s", tableName)); - List expected = ImmutableList.of( - row(4L, "d"), - row(5L, "e") - ); + List expected = ImmutableList.of(row(4L, "d"), row(5L, "e")); - assertEquals("Row data should match expected", expected, sql("SELECT * FROM %s ORDER BY id", tableName)); + assertEquals( + "Row data should match expected", expected, sql("SELECT * FROM %s ORDER BY id", tableName)); } } diff --git a/spark/v3.3/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/Employee.java b/spark/v3.3/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/Employee.java index 51ac57855bbe..8918dfec6584 100644 --- a/spark/v3.3/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/Employee.java +++ b/spark/v3.3/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/Employee.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.spark.extensions; import java.util.Objects; @@ -25,8 +24,7 @@ public class Employee { private Integer id; private String dep; - public Employee() { - } + public Employee() {} public Employee(Integer id, String dep) { this.id = id; diff --git a/spark/v3.3/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/SparkExtensionsTestBase.java b/spark/v3.3/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/SparkExtensionsTestBase.java index 585ea3cd81bd..4f137f5b8dac 100644 --- a/spark/v3.3/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/SparkExtensionsTestBase.java +++ b/spark/v3.3/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/SparkExtensionsTestBase.java @@ -16,9 +16,10 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.extensions; +import static org.apache.hadoop.hive.conf.HiveConf.ConfVars.METASTOREURIS; + import java.util.Map; import java.util.Random; import java.util.concurrent.ThreadLocalRandom; @@ -32,13 +33,12 @@ import org.apache.spark.sql.internal.SQLConf; import org.junit.BeforeClass; -import static org.apache.hadoop.hive.conf.HiveConf.ConfVars.METASTOREURIS; - public abstract class SparkExtensionsTestBase extends SparkCatalogTestBase { private static final Random RANDOM = ThreadLocalRandom.current(); - public SparkExtensionsTestBase(String catalogName, String implementation, Map config) { + public SparkExtensionsTestBase( + String catalogName, String implementation, Map config) { super(catalogName, implementation, config); } @@ -48,20 +48,24 @@ public static void startMetastoreAndSpark() { metastore.start(); SparkTestBase.hiveConf = metastore.hiveConf(); - SparkTestBase.spark = SparkSession.builder() - .master("local[2]") - .config("spark.testing", "true") - .config(SQLConf.PARTITION_OVERWRITE_MODE().key(), "dynamic") - .config("spark.sql.extensions", IcebergSparkSessionExtensions.class.getName()) - .config("spark.hadoop." + METASTOREURIS.varname, hiveConf.get(METASTOREURIS.varname)) - .config("spark.sql.shuffle.partitions", "4") - .config("spark.sql.hive.metastorePartitionPruningFallbackOnException", "true") - .config("spark.sql.legacy.respectNullabilityInTextDatasetConversion", "true") - .config(SQLConf.ADAPTIVE_EXECUTION_ENABLED().key(), String.valueOf(RANDOM.nextBoolean())) - .enableHiveSupport() - .getOrCreate(); + SparkTestBase.spark = + SparkSession.builder() + .master("local[2]") + .config("spark.testing", "true") + .config(SQLConf.PARTITION_OVERWRITE_MODE().key(), "dynamic") + .config("spark.sql.extensions", IcebergSparkSessionExtensions.class.getName()) + .config("spark.hadoop." 
+ METASTOREURIS.varname, hiveConf.get(METASTOREURIS.varname)) + .config("spark.sql.shuffle.partitions", "4") + .config("spark.sql.hive.metastorePartitionPruningFallbackOnException", "true") + .config("spark.sql.legacy.respectNullabilityInTextDatasetConversion", "true") + .config( + SQLConf.ADAPTIVE_EXECUTION_ENABLED().key(), String.valueOf(RANDOM.nextBoolean())) + .enableHiveSupport() + .getOrCreate(); - SparkTestBase.catalog = (HiveCatalog) - CatalogUtil.loadCatalog(HiveCatalog.class.getName(), "hive", ImmutableMap.of(), hiveConf); + SparkTestBase.catalog = + (HiveCatalog) + CatalogUtil.loadCatalog( + HiveCatalog.class.getName(), "hive", ImmutableMap.of(), hiveConf); } } diff --git a/spark/v3.3/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/SparkRowLevelOperationsTestBase.java b/spark/v3.3/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/SparkRowLevelOperationsTestBase.java index 278559a5601d..eafd968d01a7 100644 --- a/spark/v3.3/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/SparkRowLevelOperationsTestBase.java +++ b/spark/v3.3/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/SparkRowLevelOperationsTestBase.java @@ -16,9 +16,21 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.extensions; +import static org.apache.iceberg.DataOperations.DELETE; +import static org.apache.iceberg.DataOperations.OVERWRITE; +import static org.apache.iceberg.SnapshotSummary.ADDED_DELETE_FILES_PROP; +import static org.apache.iceberg.SnapshotSummary.ADDED_FILES_PROP; +import static org.apache.iceberg.SnapshotSummary.CHANGED_PARTITION_COUNT_PROP; +import static org.apache.iceberg.SnapshotSummary.DELETED_FILES_PROP; +import static org.apache.iceberg.TableProperties.DEFAULT_FILE_FORMAT; +import static org.apache.iceberg.TableProperties.PARQUET_VECTORIZATION_ENABLED; +import static org.apache.iceberg.TableProperties.WRITE_DISTRIBUTION_MODE; +import static org.apache.iceberg.TableProperties.WRITE_DISTRIBUTION_MODE_HASH; +import static org.apache.iceberg.TableProperties.WRITE_DISTRIBUTION_MODE_NONE; +import static org.apache.iceberg.TableProperties.WRITE_DISTRIBUTION_MODE_RANGE; + import java.util.Arrays; import java.util.List; import java.util.Map; @@ -40,19 +52,6 @@ import org.junit.runners.Parameterized; import org.junit.runners.Parameterized.Parameters; -import static org.apache.iceberg.DataOperations.DELETE; -import static org.apache.iceberg.DataOperations.OVERWRITE; -import static org.apache.iceberg.SnapshotSummary.ADDED_DELETE_FILES_PROP; -import static org.apache.iceberg.SnapshotSummary.ADDED_FILES_PROP; -import static org.apache.iceberg.SnapshotSummary.CHANGED_PARTITION_COUNT_PROP; -import static org.apache.iceberg.SnapshotSummary.DELETED_FILES_PROP; -import static org.apache.iceberg.TableProperties.DEFAULT_FILE_FORMAT; -import static org.apache.iceberg.TableProperties.PARQUET_VECTORIZATION_ENABLED; -import static org.apache.iceberg.TableProperties.WRITE_DISTRIBUTION_MODE; -import static org.apache.iceberg.TableProperties.WRITE_DISTRIBUTION_MODE_HASH; -import static org.apache.iceberg.TableProperties.WRITE_DISTRIBUTION_MODE_NONE; -import static org.apache.iceberg.TableProperties.WRITE_DISTRIBUTION_MODE_RANGE; - @RunWith(Parameterized.class) public abstract class SparkRowLevelOperationsTestBase extends SparkExtensionsTestBase { @@ -62,58 +61,68 @@ public abstract class SparkRowLevelOperationsTestBase extends SparkExtensionsTes protected final boolean vectorized; 
protected final String distributionMode; - public SparkRowLevelOperationsTestBase(String catalogName, String implementation, - Map config, String fileFormat, - boolean vectorized, - String distributionMode) { + public SparkRowLevelOperationsTestBase( + String catalogName, + String implementation, + Map config, + String fileFormat, + boolean vectorized, + String distributionMode) { super(catalogName, implementation, config); this.fileFormat = fileFormat; this.vectorized = vectorized; this.distributionMode = distributionMode; } - @Parameters(name = "catalogName = {0}, implementation = {1}, config = {2}," + - " format = {3}, vectorized = {4}, distributionMode = {5}") + @Parameters( + name = + "catalogName = {0}, implementation = {1}, config = {2}," + + " format = {3}, vectorized = {4}, distributionMode = {5}") public static Object[][] parameters() { return new Object[][] { - { "testhive", SparkCatalog.class.getName(), - ImmutableMap.of( - "type", "hive", - "default-namespace", "default" - ), - "orc", - true, - WRITE_DISTRIBUTION_MODE_NONE - }, - { "testhive", SparkCatalog.class.getName(), - ImmutableMap.of( - "type", "hive", - "default-namespace", "default" + { + "testhive", + SparkCatalog.class.getName(), + ImmutableMap.of( + "type", "hive", + "default-namespace", "default"), + "orc", + true, + WRITE_DISTRIBUTION_MODE_NONE + }, + { + "testhive", + SparkCatalog.class.getName(), + ImmutableMap.of( + "type", "hive", + "default-namespace", "default"), + "parquet", + true, + WRITE_DISTRIBUTION_MODE_NONE + }, + { + "testhadoop", + SparkCatalog.class.getName(), + ImmutableMap.of("type", "hadoop"), + "parquet", + RANDOM.nextBoolean(), + WRITE_DISTRIBUTION_MODE_HASH + }, + { + "spark_catalog", + SparkSessionCatalog.class.getName(), + ImmutableMap.of( + "type", "hive", + "default-namespace", "default", + "clients", "1", + "parquet-enabled", "false", + "cache-enabled", + "false" // Spark will delete tables using v1, leaving the cache out of sync ), - "parquet", - true, - WRITE_DISTRIBUTION_MODE_NONE - }, - { "testhadoop", SparkCatalog.class.getName(), - ImmutableMap.of( - "type", "hadoop" - ), - "parquet", - RANDOM.nextBoolean(), - WRITE_DISTRIBUTION_MODE_HASH - }, - { "spark_catalog", SparkSessionCatalog.class.getName(), - ImmutableMap.of( - "type", "hive", - "default-namespace", "default", - "clients", "1", - "parquet-enabled", "false", - "cache-enabled", "false" // Spark will delete tables using v1, leaving the cache out of sync - ), - "avro", - false, - WRITE_DISTRIBUTION_MODE_RANGE - } + "avro", + false, + WRITE_DISTRIBUTION_MODE_RANGE + } }; } @@ -121,11 +130,15 @@ public static Object[][] parameters() { protected void initTable() { sql("ALTER TABLE %s SET TBLPROPERTIES('%s' '%s')", tableName, DEFAULT_FILE_FORMAT, fileFormat); - sql("ALTER TABLE %s SET TBLPROPERTIES('%s' '%s')", tableName, WRITE_DISTRIBUTION_MODE, distributionMode); + sql( + "ALTER TABLE %s SET TBLPROPERTIES('%s' '%s')", + tableName, WRITE_DISTRIBUTION_MODE, distributionMode); switch (fileFormat) { case "parquet": - sql("ALTER TABLE %s SET TBLPROPERTIES('%s' '%b')", tableName, PARQUET_VECTORIZATION_ENABLED, vectorized); + sql( + "ALTER TABLE %s SET TBLPROPERTIES('%s' '%b')", + tableName, PARQUET_VECTORIZATION_ENABLED, vectorized); break; case "orc": Assert.assertTrue(vectorized); @@ -136,9 +149,10 @@ protected void initTable() { } Map props = extraTableProperties(); - props.forEach((prop, value) -> { - sql("ALTER TABLE %s SET TBLPROPERTIES('%s' '%s')", tableName, prop, value); - }); + props.forEach( + (prop, value) -> { + 
sql("ALTER TABLE %s SET TBLPROPERTIES('%s' '%s')", tableName, prop, value); + }); } protected void createAndInitTable(String schema) { @@ -186,9 +200,10 @@ protected void createOrReplaceView(String name, List data, Encoder enc } private Dataset toDS(String schema, String jsonData) { - List jsonRows = Arrays.stream(jsonData.split("\n")) - .filter(str -> str.trim().length() > 0) - .collect(Collectors.toList()); + List jsonRows = + Arrays.stream(jsonData.split("\n")) + .filter(str -> str.trim().length() > 0) + .collect(Collectors.toList()); Dataset jsonDS = spark.createDataset(jsonRows, Encoders.STRING()); if (schema != null) { @@ -198,22 +213,36 @@ private Dataset toDS(String schema, String jsonData) { } } - protected void validateDelete(Snapshot snapshot, String changedPartitionCount, String deletedDataFiles) { + protected void validateDelete( + Snapshot snapshot, String changedPartitionCount, String deletedDataFiles) { validateSnapshot(snapshot, DELETE, changedPartitionCount, deletedDataFiles, null, null); } - protected void validateCopyOnWrite(Snapshot snapshot, String changedPartitionCount, - String deletedDataFiles, String addedDataFiles) { - validateSnapshot(snapshot, OVERWRITE, changedPartitionCount, deletedDataFiles, null, addedDataFiles); + protected void validateCopyOnWrite( + Snapshot snapshot, + String changedPartitionCount, + String deletedDataFiles, + String addedDataFiles) { + validateSnapshot( + snapshot, OVERWRITE, changedPartitionCount, deletedDataFiles, null, addedDataFiles); } - protected void validateMergeOnRead(Snapshot snapshot, String changedPartitionCount, - String addedDeleteFiles, String addedDataFiles) { - validateSnapshot(snapshot, OVERWRITE, changedPartitionCount, null, addedDeleteFiles, addedDataFiles); + protected void validateMergeOnRead( + Snapshot snapshot, + String changedPartitionCount, + String addedDeleteFiles, + String addedDataFiles) { + validateSnapshot( + snapshot, OVERWRITE, changedPartitionCount, null, addedDeleteFiles, addedDataFiles); } - protected void validateSnapshot(Snapshot snapshot, String operation, String changedPartitionCount, - String deletedDataFiles, String addedDeleteFiles, String addedDataFiles) { + protected void validateSnapshot( + Snapshot snapshot, + String operation, + String changedPartitionCount, + String deletedDataFiles, + String addedDeleteFiles, + String addedDataFiles) { Assert.assertEquals("Operation must match", operation, snapshot.operation()); validateProperty(snapshot, CHANGED_PARTITION_COUNT_PROP, changedPartitionCount); validateProperty(snapshot, DELETED_FILES_PROP, deletedDataFiles); @@ -223,14 +252,20 @@ protected void validateSnapshot(Snapshot snapshot, String operation, String chan protected void validateProperty(Snapshot snapshot, String property, Set expectedValues) { String actual = snapshot.summary().get(property); - Assert.assertTrue("Snapshot property " + property + " has unexpected value, actual = " + - actual + ", expected one of : " + String.join(",", expectedValues), + Assert.assertTrue( + "Snapshot property " + + property + + " has unexpected value, actual = " + + actual + + ", expected one of : " + + String.join(",", expectedValues), expectedValues.contains(actual)); } protected void validateProperty(Snapshot snapshot, String property, String expectedValue) { String actual = snapshot.summary().get(property); - Assert.assertEquals("Snapshot property " + property + " has unexpected value.", expectedValue, actual); + Assert.assertEquals( + "Snapshot property " + property + " has unexpected value.", 
expectedValue, actual); } protected void sleep(long millis) { diff --git a/spark/v3.3/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestAddFilesProcedure.java b/spark/v3.3/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestAddFilesProcedure.java index e274ad857875..f689401653f7 100644 --- a/spark/v3.3/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestAddFilesProcedure.java +++ b/spark/v3.3/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestAddFilesProcedure.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.extensions; import java.io.File; @@ -57,12 +56,12 @@ public class TestAddFilesProcedure extends SparkExtensionsTestBase { private final String sourceTableName = "source_table"; private File fileTableDir; - public TestAddFilesProcedure(String catalogName, String implementation, Map config) { + public TestAddFilesProcedure( + String catalogName, String implementation, Map config) { super(catalogName, implementation, config); } - @Rule - public TemporaryFolder temp = new TemporaryFolder(); + @Rule public TemporaryFolder temp = new TemporaryFolder(); @Before public void setupTempDirs() { @@ -88,12 +87,15 @@ public void addDataUnpartitioned() { sql(createIceberg, tableName); - Object result = scalarSql("CALL %s.system.add_files('%s', '`parquet`.`%s`')", - catalogName, tableName, fileTableDir.getAbsolutePath()); + Object result = + scalarSql( + "CALL %s.system.add_files('%s', '`parquet`.`%s`')", + catalogName, tableName, fileTableDir.getAbsolutePath()); Assert.assertEquals(2L, result); - assertEquals("Iceberg table contains correct data", + assertEquals( + "Iceberg table contains correct data", sql("SELECT * FROM %s ORDER BY id", sourceTableName), sql("SELECT * FROM %s ORDER BY id", tableName)); } @@ -107,12 +109,15 @@ public void addDataUnpartitionedOrc() { sql(createIceberg, tableName); - Object result = scalarSql("CALL %s.system.add_files('%s', '`orc`.`%s`')", - catalogName, tableName, fileTableDir.getAbsolutePath()); + Object result = + scalarSql( + "CALL %s.system.add_files('%s', '`orc`.`%s`')", + catalogName, tableName, fileTableDir.getAbsolutePath()); Assert.assertEquals(2L, result); - assertEquals("Iceberg table contains correct data", + assertEquals( + "Iceberg table contains correct data", sql("SELECT * FROM %s ORDER BY id", sourceTableName), sql("SELECT * FROM %s ORDER BY id", tableName)); } @@ -125,10 +130,12 @@ public void addAvroFile() throws Exception { // Create an Avro file - Schema schema = SchemaBuilder.record("record").fields() - .requiredInt("id") - .requiredString("data") - .endRecord(); + Schema schema = + SchemaBuilder.record("record") + .fields() + .requiredInt("id") + .requiredString("data") + .endRecord(); GenericRecord record1 = new GenericData.Record(schema); record1.put("id", 1L); record1.put("data", "a"); @@ -144,30 +151,30 @@ public void addAvroFile() throws Exception { dataFileWriter.append(record2); dataFileWriter.close(); - String createIceberg = - "CREATE TABLE %s (id Long, data String) USING iceberg"; + String createIceberg = "CREATE TABLE %s (id Long, data String) USING iceberg"; sql(createIceberg, tableName); - Object result = scalarSql("CALL %s.system.add_files('%s', '`avro`.`%s`')", - catalogName, tableName, outputFile.getPath()); + Object result = + scalarSql( + "CALL %s.system.add_files('%s', '`avro`.`%s`')", + catalogName, tableName, outputFile.getPath()); Assert.assertEquals(1L, 
result); - List expected = Lists.newArrayList( - new Object[]{1L, "a"}, - new Object[]{2L, "b"} - ); + List expected = Lists.newArrayList(new Object[] {1L, "a"}, new Object[] {2L, "b"}); - assertEquals("Iceberg table contains correct data", + assertEquals( + "Iceberg table contains correct data", expected, sql("SELECT * FROM %s ORDER BY id", tableName)); - List actualRecordCount = sql("select %s from %s.files", - DataFile.RECORD_COUNT.name(), - tableName); + List actualRecordCount = + sql("select %s from %s.files", DataFile.RECORD_COUNT.name(), tableName); List expectedRecordCount = Lists.newArrayList(); - expectedRecordCount.add(new Object[]{2L}); - assertEquals("Iceberg file metadata should have correct metadata count", - expectedRecordCount, actualRecordCount); + expectedRecordCount.add(new Object[] {2L}); + assertEquals( + "Iceberg file metadata should have correct metadata count", + expectedRecordCount, + actualRecordCount); } // TODO Adding spark-avro doesn't work in tests @@ -180,12 +187,15 @@ public void addDataUnpartitionedAvro() { sql(createIceberg, tableName); - Object result = scalarSql("CALL %s.system.add_files('%s', '`avro`.`%s`')", - catalogName, tableName, fileTableDir.getAbsolutePath()); + Object result = + scalarSql( + "CALL %s.system.add_files('%s', '`avro`.`%s`')", + catalogName, tableName, fileTableDir.getAbsolutePath()); Assert.assertEquals(2L, result); - assertEquals("Iceberg table contains correct data", + assertEquals( + "Iceberg table contains correct data", sql("SELECT * FROM %s ORDER BY id", sourceTableName), sql("SELECT * FROM %s ORDER BY id", tableName)); } @@ -199,12 +209,13 @@ public void addDataUnpartitionedHive() { sql(createIceberg, tableName); - Object result = scalarSql("CALL %s.system.add_files('%s', '%s')", - catalogName, tableName, sourceTableName); + Object result = + scalarSql("CALL %s.system.add_files('%s', '%s')", catalogName, tableName, sourceTableName); Assert.assertEquals(2L, result); - assertEquals("Iceberg table contains correct data", + assertEquals( + "Iceberg table contains correct data", sql("SELECT * FROM %s ORDER BY id", sourceTableName), sql("SELECT * FROM %s ORDER BY id", tableName)); } @@ -218,12 +229,15 @@ public void addDataUnpartitionedExtraCol() { sql(createIceberg, tableName); - Object result = scalarSql("CALL %s.system.add_files('%s', '`parquet`.`%s`')", - catalogName, tableName, fileTableDir.getAbsolutePath()); + Object result = + scalarSql( + "CALL %s.system.add_files('%s', '`parquet`.`%s`')", + catalogName, tableName, fileTableDir.getAbsolutePath()); Assert.assertEquals(2L, result); - assertEquals("Iceberg table contains correct data", + assertEquals( + "Iceberg table contains correct data", sql("SELECT * FROM %s ORDER BY id", sourceTableName), sql("SELECT id, name, dept, subdept FROM %s ORDER BY id", tableName)); } @@ -232,17 +246,19 @@ public void addDataUnpartitionedExtraCol() { public void addDataUnpartitionedMissingCol() { createUnpartitionedFileTable("parquet"); - String createIceberg = - "CREATE TABLE %s (id Integer, name String, dept String) USING iceberg"; + String createIceberg = "CREATE TABLE %s (id Integer, name String, dept String) USING iceberg"; sql(createIceberg, tableName); - Object result = scalarSql("CALL %s.system.add_files('%s', '`parquet`.`%s`')", - catalogName, tableName, fileTableDir.getAbsolutePath()); + Object result = + scalarSql( + "CALL %s.system.add_files('%s', '`parquet`.`%s`')", + catalogName, tableName, fileTableDir.getAbsolutePath()); Assert.assertEquals(2L, result); - assertEquals("Iceberg 
table contains correct data", + assertEquals( + "Iceberg table contains correct data", sql("SELECT id, name, dept FROM %s ORDER BY id", sourceTableName), sql("SELECT * FROM %s ORDER BY id", tableName)); } @@ -256,12 +272,15 @@ public void addDataPartitionedMissingCol() { sql(createIceberg, tableName); - Object result = scalarSql("CALL %s.system.add_files('%s', '`parquet`.`%s`')", - catalogName, tableName, fileTableDir.getAbsolutePath()); + Object result = + scalarSql( + "CALL %s.system.add_files('%s', '`parquet`.`%s`')", + catalogName, tableName, fileTableDir.getAbsolutePath()); Assert.assertEquals(8L, result); - assertEquals("Iceberg table contains correct data", + assertEquals( + "Iceberg table contains correct data", sql("SELECT id, name, dept FROM %s ORDER BY id", sourceTableName), sql("SELECT * FROM %s ORDER BY id", tableName)); } @@ -275,17 +294,20 @@ public void addDataPartitioned() { sql(createIceberg, tableName); - Object result = scalarSql("CALL %s.system.add_files('%s', '`parquet`.`%s`')", - catalogName, tableName, fileTableDir.getAbsolutePath()); + Object result = + scalarSql( + "CALL %s.system.add_files('%s', '`parquet`.`%s`')", + catalogName, tableName, fileTableDir.getAbsolutePath()); Assert.assertEquals(8L, result); - assertEquals("Iceberg table contains correct data", + assertEquals( + "Iceberg table contains correct data", sql("SELECT id, name, dept, subdept FROM %s ORDER BY id", sourceTableName), sql("SELECT id, name, dept, subdept FROM %s ORDER BY id", tableName)); } - @Ignore // TODO Classpath issues prevent us from actually writing to a Spark ORC table + @Ignore // TODO Classpath issues prevent us from actually writing to a Spark ORC table public void addDataPartitionedOrc() { createPartitionedFileTable("orc"); @@ -294,12 +316,15 @@ public void addDataPartitionedOrc() { sql(createIceberg, tableName); - Object result = scalarSql("CALL %s.system.add_files('%s', '`parquet`.`%s`')", - catalogName, tableName, fileTableDir.getAbsolutePath()); + Object result = + scalarSql( + "CALL %s.system.add_files('%s', '`parquet`.`%s`')", + catalogName, tableName, fileTableDir.getAbsolutePath()); Assert.assertEquals(8L, result); - assertEquals("Iceberg table contains correct data", + assertEquals( + "Iceberg table contains correct data", sql("SELECT id, name, dept, subdept FROM %s ORDER BY id", sourceTableName), sql("SELECT id, name, dept, subdept FROM %s ORDER BY id", tableName)); } @@ -314,12 +339,15 @@ public void addDataPartitionedAvro() { sql(createIceberg, tableName); - Object result = scalarSql("CALL %s.system.add_files('%s', '`avro`.`%s`')", - catalogName, tableName, fileTableDir.getAbsolutePath()); + Object result = + scalarSql( + "CALL %s.system.add_files('%s', '`avro`.`%s`')", + catalogName, tableName, fileTableDir.getAbsolutePath()); Assert.assertEquals(8L, result); - assertEquals("Iceberg table contains correct data", + assertEquals( + "Iceberg table contains correct data", sql("SELECT id, name, dept, subdept FROM %s ORDER BY id", sourceTableName), sql("SELECT id, name, dept, subdept FROM %s ORDER BY id", tableName)); } @@ -333,12 +361,13 @@ public void addDataPartitionedHive() { sql(createIceberg, tableName); - Object result = scalarSql("CALL %s.system.add_files('%s', '%s')", - catalogName, tableName, sourceTableName); + Object result = + scalarSql("CALL %s.system.add_files('%s', '%s')", catalogName, tableName, sourceTableName); Assert.assertEquals(8L, result); - assertEquals("Iceberg table contains correct data", + assertEquals( + "Iceberg table contains correct data", 
sql("SELECT id, name, dept, subdept FROM %s ORDER BY id", sourceTableName), sql("SELECT id, name, dept, subdept FROM %s ORDER BY id", tableName)); } @@ -352,12 +381,15 @@ public void addPartitionToPartitioned() { sql(createIceberg, tableName); - Object result = scalarSql("CALL %s.system.add_files('%s', '`parquet`.`%s`', map('id', 1))", - catalogName, tableName, fileTableDir.getAbsolutePath()); + Object result = + scalarSql( + "CALL %s.system.add_files('%s', '`parquet`.`%s`', map('id', 1))", + catalogName, tableName, fileTableDir.getAbsolutePath()); Assert.assertEquals(2L, result); - assertEquals("Iceberg table contains correct data", + assertEquals( + "Iceberg table contains correct data", sql("SELECT id, name, dept, subdept FROM %s WHERE id = 1 ORDER BY id", sourceTableName), sql("SELECT id, name, dept, subdept FROM %s ORDER BY id", tableName)); } @@ -371,13 +403,18 @@ public void addDataPartitionedByDateToPartitioned() { sql(createIceberg, tableName); - Object result = scalarSql("CALL %s.system.add_files('%s', '`parquet`.`%s`', map('date', '2021-01-01'))", - catalogName, tableName, fileTableDir.getAbsolutePath()); + Object result = + scalarSql( + "CALL %s.system.add_files('%s', '`parquet`.`%s`', map('date', '2021-01-01'))", + catalogName, tableName, fileTableDir.getAbsolutePath()); Assert.assertEquals(2L, result); - assertEquals("Iceberg table contains correct data", - sql("SELECT id, name, dept, date FROM %s WHERE date = '2021-01-01' ORDER BY id", sourceTableName), + assertEquals( + "Iceberg table contains correct data", + sql( + "SELECT id, name, dept, date FROM %s WHERE date = '2021-01-01' ORDER BY id", + sourceTableName), sql("SELECT id, name, dept, date FROM %s ORDER BY id", tableName)); } @@ -386,17 +423,20 @@ public void addFilteredPartitionsToPartitioned() { createCompositePartitionedTable("parquet"); String createIceberg = - "CREATE TABLE %s (id Integer, name String, dept String, subdept String) USING iceberg " + - "PARTITIONED BY (id, dept)"; + "CREATE TABLE %s (id Integer, name String, dept String, subdept String) USING iceberg " + + "PARTITIONED BY (id, dept)"; sql(createIceberg, tableName); - Object result = scalarSql("CALL %s.system.add_files('%s', '`parquet`.`%s`', map('id', 1))", - catalogName, tableName, fileTableDir.getAbsolutePath()); + Object result = + scalarSql( + "CALL %s.system.add_files('%s', '`parquet`.`%s`', map('id', 1))", + catalogName, tableName, fileTableDir.getAbsolutePath()); Assert.assertEquals(2L, result); - assertEquals("Iceberg table contains correct data", + assertEquals( + "Iceberg table contains correct data", sql("SELECT id, name, dept, subdept FROM %s WHERE id = 1 ORDER BY id", sourceTableName), sql("SELECT id, name, dept, subdept FROM %s ORDER BY id", tableName)); } @@ -406,18 +446,23 @@ public void addFilteredPartitionsToPartitioned2() { createCompositePartitionedTable("parquet"); String createIceberg = - "CREATE TABLE %s (id Integer, name String, dept String, subdept String) USING iceberg " + - "PARTITIONED BY (id, dept)"; + "CREATE TABLE %s (id Integer, name String, dept String, subdept String) USING iceberg " + + "PARTITIONED BY (id, dept)"; sql(createIceberg, tableName); - Object result = scalarSql("CALL %s.system.add_files('%s', '`parquet`.`%s`', map('dept', 'hr'))", - catalogName, tableName, fileTableDir.getAbsolutePath()); + Object result = + scalarSql( + "CALL %s.system.add_files('%s', '`parquet`.`%s`', map('dept', 'hr'))", + catalogName, tableName, fileTableDir.getAbsolutePath()); Assert.assertEquals(6L, result); - 
assertEquals("Iceberg table contains correct data", - sql("SELECT id, name, dept, subdept FROM %s WHERE dept = 'hr' ORDER BY id", sourceTableName), + assertEquals( + "Iceberg table contains correct data", + sql( + "SELECT id, name, dept, subdept FROM %s WHERE dept = 'hr' ORDER BY id", + sourceTableName), sql("SELECT id, name, dept, subdept FROM %s ORDER BY id", tableName)); } @@ -426,17 +471,20 @@ public void addFilteredPartitionsToPartitionedWithNullValueFilteringOnId() { createCompositePartitionedTableWithNullValueInPartitionColumn("parquet"); String createIceberg = - "CREATE TABLE %s (id Integer, name String, dept String, subdept String) USING iceberg " + - "PARTITIONED BY (id, dept)"; + "CREATE TABLE %s (id Integer, name String, dept String, subdept String) USING iceberg " + + "PARTITIONED BY (id, dept)"; sql(createIceberg, tableName); - Object result = scalarSql("CALL %s.system.add_files('%s', '`parquet`.`%s`', map('id', 1))", - catalogName, tableName, fileTableDir.getAbsolutePath()); + Object result = + scalarSql( + "CALL %s.system.add_files('%s', '`parquet`.`%s`', map('id', 1))", + catalogName, tableName, fileTableDir.getAbsolutePath()); Assert.assertEquals(2L, result); - assertEquals("Iceberg table contains correct data", + assertEquals( + "Iceberg table contains correct data", sql("SELECT id, name, dept, subdept FROM %s WHERE id = 1 ORDER BY id", sourceTableName), sql("SELECT id, name, dept, subdept FROM %s ORDER BY id", tableName)); } @@ -446,18 +494,23 @@ public void addFilteredPartitionsToPartitionedWithNullValueFilteringOnDept() { createCompositePartitionedTableWithNullValueInPartitionColumn("parquet"); String createIceberg = - "CREATE TABLE %s (id Integer, name String, dept String, subdept String) USING iceberg " + - "PARTITIONED BY (id, dept)"; + "CREATE TABLE %s (id Integer, name String, dept String, subdept String) USING iceberg " + + "PARTITIONED BY (id, dept)"; sql(createIceberg, tableName); - Object result = scalarSql("CALL %s.system.add_files('%s', '`parquet`.`%s`', map('dept', 'hr'))", - catalogName, tableName, fileTableDir.getAbsolutePath()); + Object result = + scalarSql( + "CALL %s.system.add_files('%s', '`parquet`.`%s`', map('dept', 'hr'))", + catalogName, tableName, fileTableDir.getAbsolutePath()); Assert.assertEquals(6L, result); - assertEquals("Iceberg table contains correct data", - sql("SELECT id, name, dept, subdept FROM %s WHERE dept = 'hr' ORDER BY id", sourceTableName), + assertEquals( + "Iceberg table contains correct data", + sql( + "SELECT id, name, dept, subdept FROM %s WHERE dept = 'hr' ORDER BY id", + sourceTableName), sql("SELECT id, name, dept, subdept FROM %s ORDER BY id", tableName)); } @@ -466,13 +519,15 @@ public void addWeirdCaseHiveTable() { createWeirdCaseTable(); String createIceberg = - "CREATE TABLE %s (id Integer, `naMe` String, dept String, subdept String) USING iceberg " + - "PARTITIONED BY (`naMe`)"; + "CREATE TABLE %s (id Integer, `naMe` String, dept String, subdept String) USING iceberg " + + "PARTITIONED BY (`naMe`)"; sql(createIceberg, tableName); - Object result = scalarSql("CALL %s.system.add_files('%s', '%s', map('naMe', 'John Doe'))", - catalogName, tableName, sourceTableName); + Object result = + scalarSql( + "CALL %s.system.add_files('%s', '%s', map('naMe', 'John Doe'))", + catalogName, tableName, sourceTableName); Assert.assertEquals(2L, result); @@ -482,22 +537,30 @@ public void addWeirdCaseHiveTable() { Spark does not actually handle this pushdown correctly for hive based tables and it returns 0 records */ List expected 
= - sql("SELECT id, `naMe`, dept, subdept from %s ORDER BY id", sourceTableName) - .stream() + sql("SELECT id, `naMe`, dept, subdept from %s ORDER BY id", sourceTableName).stream() .filter(r -> r[1].equals("John Doe")) .collect(Collectors.toList()); // TODO when this assert breaks Spark fixed the pushdown issue - Assert.assertEquals("If this assert breaks it means that Spark has fixed the pushdown issue", 0, - sql("SELECT id, `naMe`, dept, subdept from %s WHERE `naMe` = 'John Doe' ORDER BY id", sourceTableName) + Assert.assertEquals( + "If this assert breaks it means that Spark has fixed the pushdown issue", + 0, + sql( + "SELECT id, `naMe`, dept, subdept from %s WHERE `naMe` = 'John Doe' ORDER BY id", + sourceTableName) .size()); // Pushdown works for iceberg - Assert.assertEquals("We should be able to pushdown mixed case partition keys", 2, - sql("SELECT id, `naMe`, dept, subdept FROM %s WHERE `naMe` = 'John Doe' ORDER BY id", tableName) + Assert.assertEquals( + "We should be able to pushdown mixed case partition keys", + 2, + sql( + "SELECT id, `naMe`, dept, subdept FROM %s WHERE `naMe` = 'John Doe' ORDER BY id", + tableName) .size()); - assertEquals("Iceberg table contains correct data", + assertEquals( + "Iceberg table contains correct data", expected, sql("SELECT id, `naMe`, dept, subdept FROM %s ORDER BY id", tableName)); } @@ -511,12 +574,15 @@ public void addPartitionToPartitionedHive() { sql(createIceberg, tableName); - Object result = scalarSql("CALL %s.system.add_files('%s', '%s', map('id', 1))", - catalogName, tableName, sourceTableName); + Object result = + scalarSql( + "CALL %s.system.add_files('%s', '%s', map('id', 1))", + catalogName, tableName, sourceTableName); Assert.assertEquals(2L, result); - assertEquals("Iceberg table contains correct data", + assertEquals( + "Iceberg table contains correct data", sql("SELECT id, name, dept, subdept FROM %s WHERE id = 1 ORDER BY id", sourceTableName), sql("SELECT id, name, dept, subdept FROM %s ORDER BY id", tableName)); } @@ -530,19 +596,23 @@ public void invalidDataImport() { sql(createIceberg, tableName); - AssertHelpers.assertThrows("Should forbid adding of partitioned data to unpartitioned table", + AssertHelpers.assertThrows( + "Should forbid adding of partitioned data to unpartitioned table", IllegalArgumentException.class, "Cannot use partition filter with an unpartitioned table", - () -> scalarSql("CALL %s.system.add_files('%s', '`parquet`.`%s`', map('id', 1))", - catalogName, tableName, fileTableDir.getAbsolutePath()) - ); + () -> + scalarSql( + "CALL %s.system.add_files('%s', '`parquet`.`%s`', map('id', 1))", + catalogName, tableName, fileTableDir.getAbsolutePath())); - AssertHelpers.assertThrows("Should forbid adding of partitioned data to unpartitioned table", + AssertHelpers.assertThrows( + "Should forbid adding of partitioned data to unpartitioned table", IllegalArgumentException.class, "Cannot add partitioned files to an unpartitioned table", - () -> scalarSql("CALL %s.system.add_files('%s', '`parquet`.`%s`')", - catalogName, tableName, fileTableDir.getAbsolutePath()) - ); + () -> + scalarSql( + "CALL %s.system.add_files('%s', '`parquet`.`%s`')", + catalogName, tableName, fileTableDir.getAbsolutePath())); } @Test @@ -554,20 +624,25 @@ public void invalidDataImportPartitioned() { sql(createIceberg, tableName); - AssertHelpers.assertThrows("Should forbid adding with a mismatching partition spec", + AssertHelpers.assertThrows( + "Should forbid adding with a mismatching partition spec", IllegalArgumentException.class, 
"is greater than the number of partitioned columns", - () -> scalarSql("CALL %s.system.add_files('%s', '`parquet`.`%s`', map('x', '1', 'y', '2'))", - catalogName, tableName, fileTableDir.getAbsolutePath())); + () -> + scalarSql( + "CALL %s.system.add_files('%s', '`parquet`.`%s`', map('x', '1', 'y', '2'))", + catalogName, tableName, fileTableDir.getAbsolutePath())); - AssertHelpers.assertThrows("Should forbid adding with partition spec with incorrect columns", + AssertHelpers.assertThrows( + "Should forbid adding with partition spec with incorrect columns", IllegalArgumentException.class, "specified partition filter refers to columns that are not partitioned", - () -> scalarSql("CALL %s.system.add_files('%s', '`parquet`.`%s`', map('dept', '2'))", - catalogName, tableName, fileTableDir.getAbsolutePath())); + () -> + scalarSql( + "CALL %s.system.add_files('%s', '`parquet`.`%s`', map('dept', '2'))", + catalogName, tableName, fileTableDir.getAbsolutePath())); } - @Test public void addTwice() { createPartitionedHiveTable(); @@ -577,24 +652,30 @@ public void addTwice() { sql(createIceberg, tableName); - Object result1 = scalarSql("CALL %s.system.add_files(" + - "table => '%s', " + - "source_table => '%s', " + - "partition_filter => map('id', 1))", - catalogName, tableName, sourceTableName); + Object result1 = + scalarSql( + "CALL %s.system.add_files(" + + "table => '%s', " + + "source_table => '%s', " + + "partition_filter => map('id', 1))", + catalogName, tableName, sourceTableName); Assert.assertEquals(2L, result1); - Object result2 = scalarSql("CALL %s.system.add_files(" + - "table => '%s', " + - "source_table => '%s', " + - "partition_filter => map('id', 2))", - catalogName, tableName, sourceTableName); + Object result2 = + scalarSql( + "CALL %s.system.add_files(" + + "table => '%s', " + + "source_table => '%s', " + + "partition_filter => map('id', 2))", + catalogName, tableName, sourceTableName); Assert.assertEquals(2L, result2); - assertEquals("Iceberg table contains correct data", + assertEquals( + "Iceberg table contains correct data", sql("SELECT id, name, dept, subdept FROM %s WHERE id = 1 ORDER BY id", sourceTableName), sql("SELECT id, name, dept, subdept FROM %s WHERE id = 1 ORDER BY id", tableName)); - assertEquals("Iceberg table contains correct data", + assertEquals( + "Iceberg table contains correct data", sql("SELECT id, name, dept, subdept FROM %s WHERE id = 2 ORDER BY id", sourceTableName), sql("SELECT id, name, dept, subdept FROM %s WHERE id = 2 ORDER BY id", tableName)); } @@ -608,21 +689,25 @@ public void duplicateDataPartitioned() { sql(createIceberg, tableName); - scalarSql("CALL %s.system.add_files(" + - "table => '%s', " + - "source_table => '%s', " + - "partition_filter => map('id', 1))", + scalarSql( + "CALL %s.system.add_files(" + + "table => '%s', " + + "source_table => '%s', " + + "partition_filter => map('id', 1))", catalogName, tableName, sourceTableName); - AssertHelpers.assertThrows("Should not allow adding duplicate files", + AssertHelpers.assertThrows( + "Should not allow adding duplicate files", IllegalStateException.class, - "Cannot complete import because data files to be imported already" + - " exist within the target table", - () -> scalarSql("CALL %s.system.add_files(" + - "table => '%s', " + - "source_table => '%s', " + - "partition_filter => map('id', 1))", - catalogName, tableName, sourceTableName)); + "Cannot complete import because data files to be imported already" + + " exist within the target table", + () -> + scalarSql( + "CALL 
%s.system.add_files(" + + "table => '%s', " + + "source_table => '%s', " + + "partition_filter => map('id', 1))", + catalogName, tableName, sourceTableName)); } @Test @@ -634,27 +719,33 @@ public void duplicateDataPartitionedAllowed() { sql(createIceberg, tableName); - Object result1 = scalarSql("CALL %s.system.add_files(" + - "table => '%s', " + - "source_table => '%s', " + - "partition_filter => map('id', 1))", - catalogName, tableName, sourceTableName); + Object result1 = + scalarSql( + "CALL %s.system.add_files(" + + "table => '%s', " + + "source_table => '%s', " + + "partition_filter => map('id', 1))", + catalogName, tableName, sourceTableName); Assert.assertEquals(2L, result1); - Object result2 = scalarSql("CALL %s.system.add_files(" + - "table => '%s', " + - "source_table => '%s', " + - "partition_filter => map('id', 1)," + - "check_duplicate_files => false)", - catalogName, tableName, sourceTableName); + Object result2 = + scalarSql( + "CALL %s.system.add_files(" + + "table => '%s', " + + "source_table => '%s', " + + "partition_filter => map('id', 1)," + + "check_duplicate_files => false)", + catalogName, tableName, sourceTableName); Assert.assertEquals(2L, result2); - - assertEquals("Iceberg table contains correct data", - sql("SELECT id, name, dept, subdept FROM %s WHERE id = 1 UNION ALL " + - "SELECT id, name, dept, subdept FROM %s WHERE id = 1", sourceTableName, sourceTableName), + assertEquals( + "Iceberg table contains correct data", + sql( + "SELECT id, name, dept, subdept FROM %s WHERE id = 1 UNION ALL " + + "SELECT id, name, dept, subdept FROM %s WHERE id = 1", + sourceTableName, sourceTableName), sql("SELECT id, name, dept, subdept FROM %s", tableName, tableName)); } @@ -667,15 +758,16 @@ public void duplicateDataUnpartitioned() { sql(createIceberg, tableName); - scalarSql("CALL %s.system.add_files('%s', '%s')", - catalogName, tableName, sourceTableName); + scalarSql("CALL %s.system.add_files('%s', '%s')", catalogName, tableName, sourceTableName); - AssertHelpers.assertThrows("Should not allow adding duplicate files", + AssertHelpers.assertThrows( + "Should not allow adding duplicate files", IllegalStateException.class, - "Cannot complete import because data files to be imported already" + - " exist within the target table", - () -> scalarSql("CALL %s.system.add_files('%s', '%s')", - catalogName, tableName, sourceTableName)); + "Cannot complete import because data files to be imported already" + + " exist within the target table", + () -> + scalarSql( + "CALL %s.system.add_files('%s', '%s')", catalogName, tableName, sourceTableName)); } @Test @@ -687,23 +779,25 @@ public void duplicateDataUnpartitionedAllowed() { sql(createIceberg, tableName); - Object result1 = scalarSql("CALL %s.system.add_files('%s', '%s')", - catalogName, tableName, sourceTableName); + Object result1 = + scalarSql("CALL %s.system.add_files('%s', '%s')", catalogName, tableName, sourceTableName); Assert.assertEquals(2L, result1); - Object result2 = scalarSql("CALL %s.system.add_files(" + - "table => '%s', " + - "source_table => '%s'," + - "check_duplicate_files => false)", - catalogName, tableName, sourceTableName); + Object result2 = + scalarSql( + "CALL %s.system.add_files(" + + "table => '%s', " + + "source_table => '%s'," + + "check_duplicate_files => false)", + catalogName, tableName, sourceTableName); Assert.assertEquals(2L, result2); - assertEquals("Iceberg table contains correct data", - sql("SELECT * FROM (SELECT * FROM %s UNION ALL " + - "SELECT * from %s) ORDER BY id", sourceTableName, 
sourceTableName), + assertEquals( + "Iceberg table contains correct data", + sql( + "SELECT * FROM (SELECT * FROM %s UNION ALL " + "SELECT * from %s) ORDER BY id", + sourceTableName, sourceTableName), sql("SELECT * FROM %s ORDER BY id", tableName)); - - } @Test @@ -714,21 +808,26 @@ public void testEmptyImportDoesNotThrow() { sql(createIceberg, tableName); // Empty path based import - Object pathResult = scalarSql("CALL %s.system.add_files('%s', '`parquet`.`%s`')", - catalogName, tableName, fileTableDir.getAbsolutePath()); + Object pathResult = + scalarSql( + "CALL %s.system.add_files('%s', '`parquet`.`%s`')", + catalogName, tableName, fileTableDir.getAbsolutePath()); Assert.assertEquals(0L, pathResult); - assertEquals("Iceberg table contains no added data when importing from an empty path", + assertEquals( + "Iceberg table contains no added data when importing from an empty path", emptyQueryResult, sql("SELECT * FROM %s ORDER BY id", tableName)); // Empty table based import - String createHive = "CREATE TABLE %s (id Integer, name String, dept String, subdept String) STORED AS parquet"; + String createHive = + "CREATE TABLE %s (id Integer, name String, dept String, subdept String) STORED AS parquet"; sql(createHive, sourceTableName); - Object tableResult = scalarSql("CALL %s.system.add_files('%s', '%s')", - catalogName, tableName, sourceTableName); + Object tableResult = + scalarSql("CALL %s.system.add_files('%s', '%s')", catalogName, tableName, sourceTableName); Assert.assertEquals(0L, tableResult); - assertEquals("Iceberg table contains no added data when importing from an empty table", + assertEquals( + "Iceberg table contains no added data when importing from an empty table", emptyQueryResult, sql("SELECT * FROM %s ORDER BY id", tableName)); } @@ -739,22 +838,26 @@ public void testPartitionedImportFromEmptyPartitionDoesNotThrow() { final int emptyPartitionId = 999; // Add an empty partition to the hive table - sql("ALTER TABLE %s ADD PARTITION (id = '%d') LOCATION '%d'", sourceTableName, - emptyPartitionId, emptyPartitionId); + sql( + "ALTER TABLE %s ADD PARTITION (id = '%d') LOCATION '%d'", + sourceTableName, emptyPartitionId, emptyPartitionId); String createIceberg = "CREATE TABLE %s (id Integer, name String, dept String, subdept String) USING iceberg PARTITIONED BY (id)"; sql(createIceberg, tableName); - Object tableResult = scalarSql("CALL %s.system.add_files(" + - "table => '%s', " + - "source_table => '%s', " + - "partition_filter => map('id', %d))", - catalogName, tableName, sourceTableName, emptyPartitionId); + Object tableResult = + scalarSql( + "CALL %s.system.add_files(" + + "table => '%s', " + + "source_table => '%s', " + + "partition_filter => map('id', %d))", + catalogName, tableName, sourceTableName, emptyPartitionId); Assert.assertEquals(0L, tableResult); - assertEquals("Iceberg table contains no added data when importing from an empty table", + assertEquals( + "Iceberg table contains no added data when importing from an empty table", emptyQueryResult, sql("SELECT * FROM %s ORDER BY id", tableName)); } @@ -762,26 +865,28 @@ public void testPartitionedImportFromEmptyPartitionDoesNotThrow() { private static final List emptyQueryResult = Lists.newArrayList(); private static final StructField[] struct = { - new StructField("id", DataTypes.IntegerType, true, Metadata.empty()), - new StructField("name", DataTypes.StringType, true, Metadata.empty()), - new StructField("dept", DataTypes.StringType, true, Metadata.empty()), - new StructField("subdept", DataTypes.StringType, 
true, Metadata.empty()) + new StructField("id", DataTypes.IntegerType, true, Metadata.empty()), + new StructField("name", DataTypes.StringType, true, Metadata.empty()), + new StructField("dept", DataTypes.StringType, true, Metadata.empty()), + new StructField("subdept", DataTypes.StringType, true, Metadata.empty()) }; private static final Dataset unpartitionedDF = - spark.createDataFrame( - ImmutableList.of( - RowFactory.create(1, "John Doe", "hr", "communications"), - RowFactory.create(2, "Jane Doe", "hr", "salary"), - RowFactory.create(3, "Matt Doe", "hr", "communications"), - RowFactory.create(4, "Will Doe", "facilities", "all")), - new StructType(struct)).repartition(1); + spark + .createDataFrame( + ImmutableList.of( + RowFactory.create(1, "John Doe", "hr", "communications"), + RowFactory.create(2, "Jane Doe", "hr", "salary"), + RowFactory.create(3, "Matt Doe", "hr", "communications"), + RowFactory.create(4, "Will Doe", "facilities", "all")), + new StructType(struct)) + .repartition(1); private static final Dataset singleNullRecordDF = - spark.createDataFrame( - ImmutableList.of( - RowFactory.create(null, null, null, null)), - new StructType(struct)).repartition(1); + spark + .createDataFrame( + ImmutableList.of(RowFactory.create(null, null, null, null)), new StructType(struct)) + .repartition(1); private static final Dataset partitionedDF = unpartitionedDF.select("name", "dept", "subdept", "id"); @@ -800,10 +905,10 @@ public void testPartitionedImportFromEmptyPartitionDoesNotThrow() { unpartitionedDF.col("name").as("naMe")); private static final StructField[] dateStruct = { - new StructField("id", DataTypes.IntegerType, true, Metadata.empty()), - new StructField("name", DataTypes.StringType, true, Metadata.empty()), - new StructField("dept", DataTypes.StringType, true, Metadata.empty()), - new StructField("ts", DataTypes.DateType, true, Metadata.empty()) + new StructField("id", DataTypes.IntegerType, true, Metadata.empty()), + new StructField("name", DataTypes.StringType, true, Metadata.empty()), + new StructField("dept", DataTypes.StringType, true, Metadata.empty()), + new StructField("ts", DataTypes.DateType, true, Metadata.empty()) }; private static java.sql.Date toDate(String value) { @@ -811,15 +916,17 @@ private static java.sql.Date toDate(String value) { } private static final Dataset dateDF = - spark.createDataFrame( - ImmutableList.of( - RowFactory.create(1, "John Doe", "hr", toDate("2021-01-01")), - RowFactory.create(2, "Jane Doe", "hr", toDate("2021-01-01")), - RowFactory.create(3, "Matt Doe", "hr", toDate("2021-01-02")), - RowFactory.create(4, "Will Doe", "facilities", toDate("2021-01-02"))), - new StructType(dateStruct)).repartition(2); - - private void createUnpartitionedFileTable(String format) { + spark + .createDataFrame( + ImmutableList.of( + RowFactory.create(1, "John Doe", "hr", toDate("2021-01-01")), + RowFactory.create(2, "Jane Doe", "hr", toDate("2021-01-01")), + RowFactory.create(3, "Matt Doe", "hr", toDate("2021-01-02")), + RowFactory.create(4, "Will Doe", "facilities", toDate("2021-01-02"))), + new StructType(dateStruct)) + .repartition(2); + + private void createUnpartitionedFileTable(String format) { String createParquet = "CREATE TABLE %s (id Integer, name String, dept String, subdept String) USING %s LOCATION '%s'"; @@ -828,10 +935,10 @@ private void createUnpartitionedFileTable(String format) { unpartitionedDF.write().insertInto(sourceTableName); } - private void createPartitionedFileTable(String format) { + private void 
createPartitionedFileTable(String format) { String createParquet = - "CREATE TABLE %s (id Integer, name String, dept String, subdept String) USING %s PARTITIONED BY (id) " + - "LOCATION '%s'"; + "CREATE TABLE %s (id Integer, name String, dept String, subdept String) USING %s PARTITIONED BY (id) " + + "LOCATION '%s'"; sql(createParquet, sourceTableName, format, fileTableDir.getAbsolutePath()); @@ -840,8 +947,9 @@ private void createPartitionedFileTable(String format) { } private void createCompositePartitionedTable(String format) { - String createParquet = "CREATE TABLE %s (id Integer, name String, dept String, subdept String) USING %s " + - "PARTITIONED BY (id, dept) LOCATION '%s'"; + String createParquet = + "CREATE TABLE %s (id Integer, name String, dept String, subdept String) USING %s " + + "PARTITIONED BY (id, dept) LOCATION '%s'"; sql(createParquet, sourceTableName, format, fileTableDir.getAbsolutePath()); compositePartitionedDF.write().insertInto(sourceTableName); @@ -849,11 +957,14 @@ private void createCompositePartitionedTable(String format) { } private void createCompositePartitionedTableWithNullValueInPartitionColumn(String format) { - String createParquet = "CREATE TABLE %s (id Integer, name String, dept String, subdept String) USING %s " + - "PARTITIONED BY (id, dept) LOCATION '%s'"; + String createParquet = + "CREATE TABLE %s (id Integer, name String, dept String, subdept String) USING %s " + + "PARTITIONED BY (id, dept) LOCATION '%s'"; sql(createParquet, sourceTableName, format, fileTableDir.getAbsolutePath()); - Dataset unionedDF = compositePartitionedDF.unionAll(compositePartitionedNullRecordDF) + Dataset unionedDF = + compositePartitionedDF + .unionAll(compositePartitionedNullRecordDF) .select("name", "subdept", "id", "dept") .repartition(1); @@ -863,18 +974,18 @@ private void createCompositePartitionedTableWithNullValueInPartitionColumn(Strin private void createWeirdCaseTable() { String createParquet = - "CREATE TABLE %s (id Integer, subdept String, dept String) " + - "PARTITIONED BY (`naMe` String) STORED AS parquet"; + "CREATE TABLE %s (id Integer, subdept String, dept String) " + + "PARTITIONED BY (`naMe` String) STORED AS parquet"; sql(createParquet, sourceTableName); weirdColumnNamesDF.write().insertInto(sourceTableName); weirdColumnNamesDF.write().insertInto(sourceTableName); - } private void createUnpartitionedHiveTable() { - String createHive = "CREATE TABLE %s (id Integer, name String, dept String, subdept String) STORED AS parquet"; + String createHive = + "CREATE TABLE %s (id Integer, name String, dept String, subdept String) STORED AS parquet"; sql(createHive, sourceTableName); @@ -883,8 +994,9 @@ private void createUnpartitionedHiveTable() { } private void createPartitionedHiveTable() { - String createHive = "CREATE TABLE %s (name String, dept String, subdept String) " + - "PARTITIONED BY (id Integer) STORED AS parquet"; + String createHive = + "CREATE TABLE %s (name String, dept String, subdept String) " + + "PARTITIONED BY (id Integer) STORED AS parquet"; sql(createHive, sourceTableName); @@ -892,9 +1004,10 @@ private void createPartitionedHiveTable() { partitionedDF.write().insertInto(sourceTableName); } - private void createDatePartitionedFileTable(String format) { - String createParquet = "CREATE TABLE %s (id Integer, name String, dept String, date Date) USING %s " + - "PARTITIONED BY (date) LOCATION '%s'"; + private void createDatePartitionedFileTable(String format) { + String createParquet = + "CREATE TABLE %s (id Integer, name String, dept String, 
date Date) USING %s " + + "PARTITIONED BY (date) LOCATION '%s'"; sql(createParquet, sourceTableName, format, fileTableDir.getAbsolutePath()); diff --git a/spark/v3.3/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestAlterTablePartitionFields.java b/spark/v3.3/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestAlterTablePartitionFields.java index 9d630508b6e4..8aee7c97752f 100644 --- a/spark/v3.3/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestAlterTablePartitionFields.java +++ b/spark/v3.3/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestAlterTablePartitionFields.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.extensions; import java.util.Map; @@ -31,7 +30,8 @@ import org.junit.Test; public class TestAlterTablePartitionFields extends SparkExtensionsTestBase { - public TestAlterTablePartitionFields(String catalogName, String implementation, Map config) { + public TestAlterTablePartitionFields( + String catalogName, String implementation, Map config) { super(catalogName, implementation, config); } @@ -42,7 +42,9 @@ public void removeTable() { @Test public void testAddIdentityPartition() { - sql("CREATE TABLE %s (id bigint NOT NULL, category string, ts timestamp, data string) USING iceberg", tableName); + sql( + "CREATE TABLE %s (id bigint NOT NULL, category string, ts timestamp, data string) USING iceberg", + tableName); Table table = validationCatalog.loadTable(tableIdent); Assert.assertTrue("Table should start unpartitioned", table.spec().isUnpartitioned()); @@ -51,17 +53,17 @@ public void testAddIdentityPartition() { table.refresh(); - PartitionSpec expected = PartitionSpec.builderFor(table.schema()) - .withSpecId(1) - .identity("category") - .build(); + PartitionSpec expected = + PartitionSpec.builderFor(table.schema()).withSpecId(1).identity("category").build(); Assert.assertEquals("Should have new spec field", expected, table.spec()); } @Test public void testAddBucketPartition() { - sql("CREATE TABLE %s (id bigint NOT NULL, category string, ts timestamp, data string) USING iceberg", tableName); + sql( + "CREATE TABLE %s (id bigint NOT NULL, category string, ts timestamp, data string) USING iceberg", + tableName); Table table = validationCatalog.loadTable(tableIdent); Assert.assertTrue("Table should start unpartitioned", table.spec().isUnpartitioned()); @@ -70,17 +72,20 @@ public void testAddBucketPartition() { table.refresh(); - PartitionSpec expected = PartitionSpec.builderFor(table.schema()) - .withSpecId(1) - .bucket("id", 16, "id_bucket_16") - .build(); + PartitionSpec expected = + PartitionSpec.builderFor(table.schema()) + .withSpecId(1) + .bucket("id", 16, "id_bucket_16") + .build(); Assert.assertEquals("Should have new spec field", expected, table.spec()); } @Test public void testAddTruncatePartition() { - sql("CREATE TABLE %s (id bigint NOT NULL, category string, ts timestamp, data string) USING iceberg", tableName); + sql( + "CREATE TABLE %s (id bigint NOT NULL, category string, ts timestamp, data string) USING iceberg", + tableName); Table table = validationCatalog.loadTable(tableIdent); Assert.assertTrue("Table should start unpartitioned", table.spec().isUnpartitioned()); @@ -89,17 +94,20 @@ public void testAddTruncatePartition() { table.refresh(); - PartitionSpec expected = PartitionSpec.builderFor(table.schema()) - .withSpecId(1) - .truncate("data", 4, "data_trunc_4") - .build(); + PartitionSpec 
expected = + PartitionSpec.builderFor(table.schema()) + .withSpecId(1) + .truncate("data", 4, "data_trunc_4") + .build(); Assert.assertEquals("Should have new spec field", expected, table.spec()); } @Test public void testAddYearsPartition() { - sql("CREATE TABLE %s (id bigint NOT NULL, category string, ts timestamp, data string) USING iceberg", tableName); + sql( + "CREATE TABLE %s (id bigint NOT NULL, category string, ts timestamp, data string) USING iceberg", + tableName); Table table = validationCatalog.loadTable(tableIdent); Assert.assertTrue("Table should start unpartitioned", table.spec().isUnpartitioned()); @@ -108,17 +116,17 @@ public void testAddYearsPartition() { table.refresh(); - PartitionSpec expected = PartitionSpec.builderFor(table.schema()) - .withSpecId(1) - .year("ts") - .build(); + PartitionSpec expected = + PartitionSpec.builderFor(table.schema()).withSpecId(1).year("ts").build(); Assert.assertEquals("Should have new spec field", expected, table.spec()); } @Test public void testAddMonthsPartition() { - sql("CREATE TABLE %s (id bigint NOT NULL, category string, ts timestamp, data string) USING iceberg", tableName); + sql( + "CREATE TABLE %s (id bigint NOT NULL, category string, ts timestamp, data string) USING iceberg", + tableName); Table table = validationCatalog.loadTable(tableIdent); Assert.assertTrue("Table should start unpartitioned", table.spec().isUnpartitioned()); @@ -127,17 +135,17 @@ public void testAddMonthsPartition() { table.refresh(); - PartitionSpec expected = PartitionSpec.builderFor(table.schema()) - .withSpecId(1) - .month("ts") - .build(); + PartitionSpec expected = + PartitionSpec.builderFor(table.schema()).withSpecId(1).month("ts").build(); Assert.assertEquals("Should have new spec field", expected, table.spec()); } @Test public void testAddDaysPartition() { - sql("CREATE TABLE %s (id bigint NOT NULL, category string, ts timestamp, data string) USING iceberg", tableName); + sql( + "CREATE TABLE %s (id bigint NOT NULL, category string, ts timestamp, data string) USING iceberg", + tableName); Table table = validationCatalog.loadTable(tableIdent); Assert.assertTrue("Table should start unpartitioned", table.spec().isUnpartitioned()); @@ -146,17 +154,17 @@ public void testAddDaysPartition() { table.refresh(); - PartitionSpec expected = PartitionSpec.builderFor(table.schema()) - .withSpecId(1) - .day("ts") - .build(); + PartitionSpec expected = + PartitionSpec.builderFor(table.schema()).withSpecId(1).day("ts").build(); Assert.assertEquals("Should have new spec field", expected, table.spec()); } @Test public void testAddHoursPartition() { - sql("CREATE TABLE %s (id bigint NOT NULL, category string, ts timestamp, data string) USING iceberg", tableName); + sql( + "CREATE TABLE %s (id bigint NOT NULL, category string, ts timestamp, data string) USING iceberg", + tableName); Table table = validationCatalog.loadTable(tableIdent); Assert.assertTrue("Table should start unpartitioned", table.spec().isUnpartitioned()); @@ -165,17 +173,17 @@ public void testAddHoursPartition() { table.refresh(); - PartitionSpec expected = PartitionSpec.builderFor(table.schema()) - .withSpecId(1) - .hour("ts") - .build(); + PartitionSpec expected = + PartitionSpec.builderFor(table.schema()).withSpecId(1).hour("ts").build(); Assert.assertEquals("Should have new spec field", expected, table.spec()); } @Test public void testAddNamedPartition() { - sql("CREATE TABLE %s (id bigint NOT NULL, category string, ts timestamp, data string) USING iceberg", tableName); + sql( + "CREATE TABLE %s 
(id bigint NOT NULL, category string, ts timestamp, data string) USING iceberg", + tableName); Table table = validationCatalog.loadTable(tableIdent); Assert.assertTrue("Table should start unpartitioned", table.spec().isUnpartitioned()); @@ -184,77 +192,83 @@ public void testAddNamedPartition() { table.refresh(); - PartitionSpec expected = PartitionSpec.builderFor(table.schema()) - .withSpecId(1) - .bucket("id", 16, "shard") - .build(); + PartitionSpec expected = + PartitionSpec.builderFor(table.schema()).withSpecId(1).bucket("id", 16, "shard").build(); Assert.assertEquals("Should have new spec field", expected, table.spec()); } @Test public void testDropIdentityPartition() { - sql("CREATE TABLE %s (id bigint NOT NULL, category string, data string) USING iceberg PARTITIONED BY (category)", + sql( + "CREATE TABLE %s (id bigint NOT NULL, category string, data string) USING iceberg PARTITIONED BY (category)", tableName); Table table = validationCatalog.loadTable(tableIdent); - Assert.assertEquals("Table should start with 1 partition field", 1, table.spec().fields().size()); + Assert.assertEquals( + "Table should start with 1 partition field", 1, table.spec().fields().size()); sql("ALTER TABLE %s DROP PARTITION FIELD category", tableName); table.refresh(); - PartitionSpec expected = PartitionSpec.builderFor(table.schema()) - .withSpecId(1) - .alwaysNull("category", "category") - .build(); + PartitionSpec expected = + PartitionSpec.builderFor(table.schema()) + .withSpecId(1) + .alwaysNull("category", "category") + .build(); Assert.assertEquals("Should have new spec field", expected, table.spec()); } @Test public void testDropDaysPartition() { - sql("CREATE TABLE %s (id bigint NOT NULL, ts timestamp, data string) USING iceberg PARTITIONED BY (days(ts))", + sql( + "CREATE TABLE %s (id bigint NOT NULL, ts timestamp, data string) USING iceberg PARTITIONED BY (days(ts))", tableName); Table table = validationCatalog.loadTable(tableIdent); - Assert.assertEquals("Table should start with 1 partition field", 1, table.spec().fields().size()); + Assert.assertEquals( + "Table should start with 1 partition field", 1, table.spec().fields().size()); sql("ALTER TABLE %s DROP PARTITION FIELD days(ts)", tableName); table.refresh(); - PartitionSpec expected = PartitionSpec.builderFor(table.schema()) - .withSpecId(1) - .alwaysNull("ts", "ts_day") - .build(); + PartitionSpec expected = + PartitionSpec.builderFor(table.schema()).withSpecId(1).alwaysNull("ts", "ts_day").build(); Assert.assertEquals("Should have new spec field", expected, table.spec()); } @Test public void testDropBucketPartition() { - sql("CREATE TABLE %s (id bigint NOT NULL, data string) USING iceberg PARTITIONED BY (bucket(16, id))", + sql( + "CREATE TABLE %s (id bigint NOT NULL, data string) USING iceberg PARTITIONED BY (bucket(16, id))", tableName); Table table = validationCatalog.loadTable(tableIdent); - Assert.assertEquals("Table should start with 1 partition field", 1, table.spec().fields().size()); + Assert.assertEquals( + "Table should start with 1 partition field", 1, table.spec().fields().size()); sql("ALTER TABLE %s DROP PARTITION FIELD bucket(16, id)", tableName); table.refresh(); - PartitionSpec expected = PartitionSpec.builderFor(table.schema()) - .withSpecId(1) - .alwaysNull("id", "id_bucket") - .build(); + PartitionSpec expected = + PartitionSpec.builderFor(table.schema()) + .withSpecId(1) + .alwaysNull("id", "id_bucket") + .build(); Assert.assertEquals("Should have new spec field", expected, table.spec()); } @Test public void 
testDropPartitionByName() { - sql("CREATE TABLE %s (id bigint NOT NULL, category string, ts timestamp, data string) USING iceberg", tableName); + sql( + "CREATE TABLE %s (id bigint NOT NULL, category string, ts timestamp, data string) USING iceberg", + tableName); Table table = validationCatalog.loadTable(tableIdent); Assert.assertTrue("Table should start unpartitioned", table.spec().isUnpartitioned()); @@ -270,114 +284,121 @@ public void testDropPartitionByName() { table.refresh(); - PartitionSpec expected = PartitionSpec.builderFor(table.schema()) - .withSpecId(2) - .alwaysNull("id", "shard") - .build(); + PartitionSpec expected = + PartitionSpec.builderFor(table.schema()).withSpecId(2).alwaysNull("id", "shard").build(); Assert.assertEquals("Should have new spec field", expected, table.spec()); } @Test public void testReplacePartition() { - sql("CREATE TABLE %s (id bigint NOT NULL, category string, ts timestamp, data string) USING iceberg", tableName); + sql( + "CREATE TABLE %s (id bigint NOT NULL, category string, ts timestamp, data string) USING iceberg", + tableName); Table table = validationCatalog.loadTable(tableIdent); Assert.assertTrue("Table should start unpartitioned", table.spec().isUnpartitioned()); sql("ALTER TABLE %s ADD PARTITION FIELD days(ts)", tableName); table.refresh(); - PartitionSpec expected = PartitionSpec.builderFor(table.schema()) - .withSpecId(1) - .day("ts") - .build(); + PartitionSpec expected = + PartitionSpec.builderFor(table.schema()).withSpecId(1).day("ts").build(); Assert.assertEquals("Should have new spec field", expected, table.spec()); sql("ALTER TABLE %s REPLACE PARTITION FIELD days(ts) WITH hours(ts)", tableName); table.refresh(); - expected = PartitionSpec.builderFor(table.schema()) - .withSpecId(2) - .alwaysNull("ts", "ts_day") - .hour("ts") - .build(); - Assert.assertEquals("Should changed from daily to hourly partitioned field", expected, table.spec()); + expected = + PartitionSpec.builderFor(table.schema()) + .withSpecId(2) + .alwaysNull("ts", "ts_day") + .hour("ts") + .build(); + Assert.assertEquals( + "Should changed from daily to hourly partitioned field", expected, table.spec()); } @Test public void testReplacePartitionAndRename() { - sql("CREATE TABLE %s (id bigint NOT NULL, category string, ts timestamp, data string) USING iceberg", tableName); + sql( + "CREATE TABLE %s (id bigint NOT NULL, category string, ts timestamp, data string) USING iceberg", + tableName); Table table = validationCatalog.loadTable(tableIdent); Assert.assertTrue("Table should start unpartitioned", table.spec().isUnpartitioned()); sql("ALTER TABLE %s ADD PARTITION FIELD days(ts)", tableName); table.refresh(); - PartitionSpec expected = PartitionSpec.builderFor(table.schema()) - .withSpecId(1) - .day("ts") - .build(); + PartitionSpec expected = + PartitionSpec.builderFor(table.schema()).withSpecId(1).day("ts").build(); Assert.assertEquals("Should have new spec field", expected, table.spec()); sql("ALTER TABLE %s REPLACE PARTITION FIELD days(ts) WITH hours(ts) AS hour_col", tableName); table.refresh(); - expected = PartitionSpec.builderFor(table.schema()) - .withSpecId(2) - .alwaysNull("ts", "ts_day") - .hour("ts", "hour_col") - .build(); - Assert.assertEquals("Should changed from daily to hourly partitioned field", expected, table.spec()); + expected = + PartitionSpec.builderFor(table.schema()) + .withSpecId(2) + .alwaysNull("ts", "ts_day") + .hour("ts", "hour_col") + .build(); + Assert.assertEquals( + "Should changed from daily to hourly partitioned field", expected, 
table.spec()); } @Test public void testReplaceNamedPartition() { - sql("CREATE TABLE %s (id bigint NOT NULL, category string, ts timestamp, data string) USING iceberg", tableName); + sql( + "CREATE TABLE %s (id bigint NOT NULL, category string, ts timestamp, data string) USING iceberg", + tableName); Table table = validationCatalog.loadTable(tableIdent); Assert.assertTrue("Table should start unpartitioned", table.spec().isUnpartitioned()); sql("ALTER TABLE %s ADD PARTITION FIELD days(ts) AS day_col", tableName); table.refresh(); - PartitionSpec expected = PartitionSpec.builderFor(table.schema()) - .withSpecId(1) - .day("ts", "day_col") - .build(); + PartitionSpec expected = + PartitionSpec.builderFor(table.schema()).withSpecId(1).day("ts", "day_col").build(); Assert.assertEquals("Should have new spec field", expected, table.spec()); sql("ALTER TABLE %s REPLACE PARTITION FIELD day_col WITH hours(ts)", tableName); table.refresh(); - expected = PartitionSpec.builderFor(table.schema()) - .withSpecId(2) - .alwaysNull("ts", "day_col") - .hour("ts") - .build(); - Assert.assertEquals("Should changed from daily to hourly partitioned field", expected, table.spec()); + expected = + PartitionSpec.builderFor(table.schema()) + .withSpecId(2) + .alwaysNull("ts", "day_col") + .hour("ts") + .build(); + Assert.assertEquals( + "Should changed from daily to hourly partitioned field", expected, table.spec()); } @Test public void testReplaceNamedPartitionAndRenameDifferently() { - sql("CREATE TABLE %s (id bigint NOT NULL, category string, ts timestamp, data string) USING iceberg", tableName); + sql( + "CREATE TABLE %s (id bigint NOT NULL, category string, ts timestamp, data string) USING iceberg", + tableName); Table table = validationCatalog.loadTable(tableIdent); Assert.assertTrue("Table should start unpartitioned", table.spec().isUnpartitioned()); sql("ALTER TABLE %s ADD PARTITION FIELD days(ts) AS day_col", tableName); table.refresh(); - PartitionSpec expected = PartitionSpec.builderFor(table.schema()) - .withSpecId(1) - .day("ts", "day_col") - .build(); + PartitionSpec expected = + PartitionSpec.builderFor(table.schema()).withSpecId(1).day("ts", "day_col").build(); Assert.assertEquals("Should have new spec field", expected, table.spec()); sql("ALTER TABLE %s REPLACE PARTITION FIELD day_col WITH hours(ts) AS hour_col", tableName); table.refresh(); - expected = PartitionSpec.builderFor(table.schema()) - .withSpecId(2) - .alwaysNull("ts", "day_col") - .hour("ts", "hour_col") - .build(); - Assert.assertEquals("Should changed from daily to hourly partitioned field", expected, table.spec()); + expected = + PartitionSpec.builderFor(table.schema()) + .withSpecId(2) + .alwaysNull("ts", "day_col") + .hour("ts", "hour_col") + .build(); + Assert.assertEquals( + "Should changed from daily to hourly partitioned field", expected, table.spec()); } @Test public void testSparkTableAddDropPartitions() throws Exception { sql("CREATE TABLE %s (id bigint NOT NULL, ts timestamp, data string) USING iceberg", tableName); - Assert.assertEquals("spark table partition should be empty", 0, sparkTable().partitioning().length); + Assert.assertEquals( + "spark table partition should be empty", 0, sparkTable().partitioning().length); sql("ALTER TABLE %s ADD PARTITION FIELD bucket(16, id) AS shard", tableName); assertPartitioningEquals(sparkTable(), 1, "bucket(16, id)"); @@ -396,13 +417,16 @@ public void testSparkTableAddDropPartitions() throws Exception { sql("ALTER TABLE %s DROP PARTITION FIELD shard", tableName); sql("DESCRIBE %s", 
tableName); - Assert.assertEquals("spark table partition should be empty", 0, sparkTable().partitioning().length); + Assert.assertEquals( + "spark table partition should be empty", 0, sparkTable().partitioning().length); } private void assertPartitioningEquals(SparkTable table, int len, String transform) { Assert.assertEquals("spark table partition should be " + len, len, table.partitioning().length); - Assert.assertEquals("latest spark table partition transform should match", - transform, table.partitioning()[len - 1].toString()); + Assert.assertEquals( + "latest spark table partition transform should match", + transform, + table.partitioning()[len - 1].toString()); } private SparkTable sparkTable() throws Exception { diff --git a/spark/v3.3/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestAlterTableSchema.java b/spark/v3.3/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestAlterTableSchema.java index ac12953d0a7e..c993c213dc5e 100644 --- a/spark/v3.3/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestAlterTableSchema.java +++ b/spark/v3.3/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestAlterTableSchema.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.extensions; import java.util.Map; @@ -28,7 +27,8 @@ import org.junit.Test; public class TestAlterTableSchema extends SparkExtensionsTestBase { - public TestAlterTableSchema(String catalogName, String implementation, Map config) { + public TestAlterTableSchema( + String catalogName, String implementation, Map config) { super(catalogName, implementation, config); } @@ -39,20 +39,25 @@ public void removeTable() { @Test public void testSetIdentifierFields() { - sql("CREATE TABLE %s (id bigint NOT NULL, " + - "location struct NOT NULL) USING iceberg", tableName); + sql( + "CREATE TABLE %s (id bigint NOT NULL, " + + "location struct NOT NULL) USING iceberg", + tableName); Table table = validationCatalog.loadTable(tableIdent); - Assert.assertTrue("Table should start without identifier", table.schema().identifierFieldIds().isEmpty()); + Assert.assertTrue( + "Table should start without identifier", table.schema().identifierFieldIds().isEmpty()); sql("ALTER TABLE %s SET IDENTIFIER FIELDS id", tableName); table.refresh(); - Assert.assertEquals("Should have new identifier field", + Assert.assertEquals( + "Should have new identifier field", Sets.newHashSet(table.schema().findField("id").fieldId()), table.schema().identifierFieldIds()); sql("ALTER TABLE %s SET IDENTIFIER FIELDS id, location.lon", tableName); table.refresh(); - Assert.assertEquals("Should have new identifier field", + Assert.assertEquals( + "Should have new identifier field", Sets.newHashSet( table.schema().findField("id").fieldId(), table.schema().findField("location.lon").fieldId()), @@ -60,7 +65,8 @@ public void testSetIdentifierFields() { sql("ALTER TABLE %s SET IDENTIFIER FIELDS location.lon", tableName); table.refresh(); - Assert.assertEquals("Should have new identifier field", + Assert.assertEquals( + "Should have new identifier field", Sets.newHashSet(table.schema().findField("location.lon").fieldId()), table.schema().identifierFieldIds()); } @@ -69,13 +75,16 @@ public void testSetIdentifierFields() { public void testSetInvalidIdentifierFields() { sql("CREATE TABLE %s (id bigint NOT NULL, id2 bigint) USING iceberg", tableName); Table table = validationCatalog.loadTable(tableIdent); - Assert.assertTrue("Table 
should start without identifier", table.schema().identifierFieldIds().isEmpty()); - AssertHelpers.assertThrows("should not allow setting unknown fields", + Assert.assertTrue( + "Table should start without identifier", table.schema().identifierFieldIds().isEmpty()); + AssertHelpers.assertThrows( + "should not allow setting unknown fields", IllegalArgumentException.class, "not found in current schema or added columns", () -> sql("ALTER TABLE %s SET IDENTIFIER FIELDS unknown", tableName)); - AssertHelpers.assertThrows("should not allow setting optional fields", + AssertHelpers.assertThrows( + "should not allow setting optional fields", IllegalArgumentException.class, "not a required field", () -> sql("ALTER TABLE %s SET IDENTIFIER FIELDS id2", tableName)); @@ -83,14 +92,18 @@ public void testSetInvalidIdentifierFields() { @Test public void testDropIdentifierFields() { - sql("CREATE TABLE %s (id bigint NOT NULL, " + - "location struct NOT NULL) USING iceberg", tableName); + sql( + "CREATE TABLE %s (id bigint NOT NULL, " + + "location struct NOT NULL) USING iceberg", + tableName); Table table = validationCatalog.loadTable(tableIdent); - Assert.assertTrue("Table should start without identifier", table.schema().identifierFieldIds().isEmpty()); + Assert.assertTrue( + "Table should start without identifier", table.schema().identifierFieldIds().isEmpty()); sql("ALTER TABLE %s SET IDENTIFIER FIELDS id, location.lon", tableName); table.refresh(); - Assert.assertEquals("Should have new identifier fields", + Assert.assertEquals( + "Should have new identifier fields", Sets.newHashSet( table.schema().findField("id").fieldId(), table.schema().findField("location.lon").fieldId()), @@ -98,13 +111,15 @@ public void testDropIdentifierFields() { sql("ALTER TABLE %s DROP IDENTIFIER FIELDS id", tableName); table.refresh(); - Assert.assertEquals("Should removed identifier field", + Assert.assertEquals( + "Should removed identifier field", Sets.newHashSet(table.schema().findField("location.lon").fieldId()), table.schema().identifierFieldIds()); sql("ALTER TABLE %s SET IDENTIFIER FIELDS id, location.lon", tableName); table.refresh(); - Assert.assertEquals("Should have new identifier fields", + Assert.assertEquals( + "Should have new identifier fields", Sets.newHashSet( table.schema().findField("id").fieldId(), table.schema().findField("location.lon").fieldId()), @@ -112,29 +127,34 @@ public void testDropIdentifierFields() { sql("ALTER TABLE %s DROP IDENTIFIER FIELDS id, location.lon", tableName); table.refresh(); - Assert.assertEquals("Should have no identifier field", - Sets.newHashSet(), - table.schema().identifierFieldIds()); + Assert.assertEquals( + "Should have no identifier field", Sets.newHashSet(), table.schema().identifierFieldIds()); } @Test public void testDropInvalidIdentifierFields() { - sql("CREATE TABLE %s (id bigint NOT NULL, data string NOT NULL, " + - "location struct NOT NULL) USING iceberg", tableName); + sql( + "CREATE TABLE %s (id bigint NOT NULL, data string NOT NULL, " + + "location struct NOT NULL) USING iceberg", + tableName); Table table = validationCatalog.loadTable(tableIdent); - Assert.assertTrue("Table should start without identifier", table.schema().identifierFieldIds().isEmpty()); - AssertHelpers.assertThrows("should not allow dropping unknown fields", + Assert.assertTrue( + "Table should start without identifier", table.schema().identifierFieldIds().isEmpty()); + AssertHelpers.assertThrows( + "should not allow dropping unknown fields", IllegalArgumentException.class, "field unknown 
not found", () -> sql("ALTER TABLE %s DROP IDENTIFIER FIELDS unknown", tableName)); sql("ALTER TABLE %s SET IDENTIFIER FIELDS id", tableName); - AssertHelpers.assertThrows("should not allow dropping a field that is not an identifier", + AssertHelpers.assertThrows( + "should not allow dropping a field that is not an identifier", IllegalArgumentException.class, "data is not an identifier field", () -> sql("ALTER TABLE %s DROP IDENTIFIER FIELDS data", tableName)); - AssertHelpers.assertThrows("should not allow dropping a nested field that is not an identifier", + AssertHelpers.assertThrows( + "should not allow dropping a nested field that is not an identifier", IllegalArgumentException.class, "location.lon is not an identifier field", () -> sql("ALTER TABLE %s DROP IDENTIFIER FIELDS location.lon", tableName)); diff --git a/spark/v3.3/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestAncestorsOfProcedure.java b/spark/v3.3/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestAncestorsOfProcedure.java index baf464d94ad0..d676101b1076 100644 --- a/spark/v3.3/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestAncestorsOfProcedure.java +++ b/spark/v3.3/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestAncestorsOfProcedure.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.extensions; import java.util.List; @@ -30,7 +29,8 @@ public class TestAncestorsOfProcedure extends SparkExtensionsTestBase { - public TestAncestorsOfProcedure(String catalogName, String implementation, Map config) { + public TestAncestorsOfProcedure( + String catalogName, String implementation, Map config) { super(catalogName, implementation, config); } @@ -51,14 +51,12 @@ public void testAncestorOfUsingEmptyArgs() { Long preSnapshotId = table.currentSnapshot().parentId(); Long preTimeStamp = table.snapshot(table.currentSnapshot().parentId()).timestampMillis(); - List output = sql("CALL %s.system.ancestors_of('%s')", - catalogName, tableIdent); + List output = sql("CALL %s.system.ancestors_of('%s')", catalogName, tableIdent); assertEquals( "Procedure output must match", ImmutableList.of( - row(currentSnapshotId, currentTimestamp), - row(preSnapshotId, preTimeStamp)), + row(currentSnapshotId, currentTimestamp), row(preSnapshotId, preTimeStamp)), output); } @@ -77,8 +75,7 @@ public void testAncestorOfUsingSnapshotId() { assertEquals( "Procedure output must match", ImmutableList.of( - row(currentSnapshotId, currentTimestamp), - row(preSnapshotId, preTimeStamp)), + row(currentSnapshotId, currentTimestamp), row(preSnapshotId, preTimeStamp)), sql("CALL %s.system.ancestors_of('%s', %dL)", catalogName, tableIdent, currentSnapshotId)); assertEquals( @@ -105,7 +102,8 @@ public void testAncestorOfWithRollBack() { Long thirdTimestamp = table.currentSnapshot().timestampMillis(); // roll back - sql("CALL %s.system.rollback_to_snapshot('%s', %dL)", + sql( + "CALL %s.system.rollback_to_snapshot('%s', %dL)", catalogName, tableIdent, secondSnapshotId); sql("INSERT INTO TABLE %s VALUES (4, 'd')", tableName); @@ -142,22 +140,29 @@ public void testAncestorOfUsingNamedArgs() { assertEquals( "Procedure output must match", ImmutableList.of(row(firstSnapshotId, firstTimestamp)), - sql("CALL %s.system.ancestors_of(snapshot_id => %dL, table => '%s')", + sql( + "CALL %s.system.ancestors_of(snapshot_id => %dL, table => '%s')", catalogName, firstSnapshotId, tableIdent)); } @Test public void 
testInvalidAncestorOfCases() { - AssertHelpers.assertThrows("Should reject calls without all required args", - AnalysisException.class, "Missing required parameters", + AssertHelpers.assertThrows( + "Should reject calls without all required args", + AnalysisException.class, + "Missing required parameters", () -> sql("CALL %s.system.ancestors_of()", catalogName)); - AssertHelpers.assertThrows("Should reject calls with empty table identifier", - IllegalArgumentException.class, "Cannot handle an empty identifier for argument table", + AssertHelpers.assertThrows( + "Should reject calls with empty table identifier", + IllegalArgumentException.class, + "Cannot handle an empty identifier for argument table", () -> sql("CALL %s.system.ancestors_of('')", catalogName)); - AssertHelpers.assertThrows("Should reject calls with invalid arg types", - AnalysisException.class, "Wrong arg type for snapshot_id: cannot cast", + AssertHelpers.assertThrows( + "Should reject calls with invalid arg types", + AnalysisException.class, + "Wrong arg type for snapshot_id: cannot cast", () -> sql("CALL %s.system.ancestors_of('%s', 1.1)", catalogName, tableIdent)); } } diff --git a/spark/v3.3/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestCallStatementParser.java b/spark/v3.3/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestCallStatementParser.java index a6815d21a7b3..9c2233ccb791 100644 --- a/spark/v3.3/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestCallStatementParser.java +++ b/spark/v3.3/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestCallStatementParser.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.extensions; import java.math.BigDecimal; @@ -49,19 +48,19 @@ public class TestCallStatementParser { - @Rule - public TemporaryFolder temp = new TemporaryFolder(); + @Rule public TemporaryFolder temp = new TemporaryFolder(); private static SparkSession spark = null; private static ParserInterface parser = null; @BeforeClass public static void startSpark() { - TestCallStatementParser.spark = SparkSession.builder() - .master("local[2]") - .config("spark.sql.extensions", IcebergSparkSessionExtensions.class.getName()) - .config("spark.extra.prop", "value") - .getOrCreate(); + TestCallStatementParser.spark = + SparkSession.builder() + .master("local[2]") + .config("spark.sql.extensions", IcebergSparkSessionExtensions.class.getName()) + .config("spark.extra.prop", "value") + .getOrCreate(); TestCallStatementParser.parser = spark.sessionState().sqlParser(); } @@ -75,8 +74,10 @@ public static void stopSpark() { @Test public void testCallWithPositionalArgs() throws ParseException { - CallStatement call = (CallStatement) parser.parsePlan("CALL c.n.func(1, '2', 3L, true, 1.0D, 9.0e1, 900e-1BD)"); - Assert.assertEquals(ImmutableList.of("c", "n", "func"), JavaConverters.seqAsJavaList(call.name())); + CallStatement call = + (CallStatement) parser.parsePlan("CALL c.n.func(1, '2', 3L, true, 1.0D, 9.0e1, 900e-1BD)"); + Assert.assertEquals( + ImmutableList.of("c", "n", "func"), JavaConverters.seqAsJavaList(call.name())); Assert.assertEquals(7, call.args().size()); @@ -91,8 +92,10 @@ public void testCallWithPositionalArgs() throws ParseException { @Test public void testCallWithNamedArgs() throws ParseException { - CallStatement call = (CallStatement) parser.parsePlan("CALL cat.system.func(c1 => 1, c2 => '2', c3 => true)"); - 
Assert.assertEquals(ImmutableList.of("cat", "system", "func"), JavaConverters.seqAsJavaList(call.name())); + CallStatement call = + (CallStatement) parser.parsePlan("CALL cat.system.func(c1 => 1, c2 => '2', c3 => true)"); + Assert.assertEquals( + ImmutableList.of("cat", "system", "func"), JavaConverters.seqAsJavaList(call.name())); Assert.assertEquals(3, call.args().size()); @@ -104,7 +107,8 @@ public void testCallWithNamedArgs() throws ParseException { @Test public void testCallWithMixedArgs() throws ParseException { CallStatement call = (CallStatement) parser.parsePlan("CALL cat.system.func(c1 => 1, '2')"); - Assert.assertEquals(ImmutableList.of("cat", "system", "func"), JavaConverters.seqAsJavaList(call.name())); + Assert.assertEquals( + ImmutableList.of("cat", "system", "func"), JavaConverters.seqAsJavaList(call.name())); Assert.assertEquals(2, call.args().size()); @@ -114,18 +118,24 @@ public void testCallWithMixedArgs() throws ParseException { @Test public void testCallWithTimestampArg() throws ParseException { - CallStatement call = (CallStatement) parser.parsePlan("CALL cat.system.func(TIMESTAMP '2017-02-03T10:37:30.00Z')"); - Assert.assertEquals(ImmutableList.of("cat", "system", "func"), JavaConverters.seqAsJavaList(call.name())); + CallStatement call = + (CallStatement) + parser.parsePlan("CALL cat.system.func(TIMESTAMP '2017-02-03T10:37:30.00Z')"); + Assert.assertEquals( + ImmutableList.of("cat", "system", "func"), JavaConverters.seqAsJavaList(call.name())); Assert.assertEquals(1, call.args().size()); - checkArg(call, 0, Timestamp.from(Instant.parse("2017-02-03T10:37:30.00Z")), DataTypes.TimestampType); + checkArg( + call, 0, Timestamp.from(Instant.parse("2017-02-03T10:37:30.00Z")), DataTypes.TimestampType); } @Test public void testCallWithVarSubstitution() throws ParseException { - CallStatement call = (CallStatement) parser.parsePlan("CALL cat.system.func('${spark.extra.prop}')"); - Assert.assertEquals(ImmutableList.of("cat", "system", "func"), JavaConverters.seqAsJavaList(call.name())); + CallStatement call = + (CallStatement) parser.parsePlan("CALL cat.system.func('${spark.extra.prop}')"); + Assert.assertEquals( + ImmutableList.of("cat", "system", "func"), JavaConverters.seqAsJavaList(call.name())); Assert.assertEquals(1, call.args().size()); @@ -134,29 +144,31 @@ public void testCallWithVarSubstitution() throws ParseException { @Test public void testCallParseError() { - AssertHelpers.assertThrows("Should fail with a sensible parse error", IcebergParseException.class, + AssertHelpers.assertThrows( + "Should fail with a sensible parse error", + IcebergParseException.class, "missing '(' at 'radish'", () -> parser.parsePlan("CALL cat.system radish kebab")); } @Test public void testCallStripsComments() throws ParseException { - List callStatementsWithComments = Lists.newArrayList( - "/* bracketed comment */ CALL cat.system.func('${spark.extra.prop}')", - "/**/ CALL cat.system.func('${spark.extra.prop}')", - "-- single line comment \n CALL cat.system.func('${spark.extra.prop}')", - "-- multiple \n-- single line \n-- comments \n CALL cat.system.func('${spark.extra.prop}')", - "/* select * from multiline_comment \n where x like '%sql%'; */ CALL cat.system.func('${spark.extra.prop}')", - "/* {\"app\": \"dbt\", \"dbt_version\": \"1.0.1\", \"profile_name\": \"profile1\", \"target_name\": \"dev\", " + - "\"node_id\": \"model.profile1.stg_users\"} \n*/ CALL cat.system.func('${spark.extra.prop}')", - "/* Some multi-line comment \n" + - "*/ CALL /* inline comment */ 
cat.system.func('${spark.extra.prop}') -- ending comment", - "CALL -- a line ending comment\n" + - "cat.system.func('${spark.extra.prop}')" - ); + List callStatementsWithComments = + Lists.newArrayList( + "/* bracketed comment */ CALL cat.system.func('${spark.extra.prop}')", + "/**/ CALL cat.system.func('${spark.extra.prop}')", + "-- single line comment \n CALL cat.system.func('${spark.extra.prop}')", + "-- multiple \n-- single line \n-- comments \n CALL cat.system.func('${spark.extra.prop}')", + "/* select * from multiline_comment \n where x like '%sql%'; */ CALL cat.system.func('${spark.extra.prop}')", + "/* {\"app\": \"dbt\", \"dbt_version\": \"1.0.1\", \"profile_name\": \"profile1\", \"target_name\": \"dev\", " + + "\"node_id\": \"model.profile1.stg_users\"} \n*/ CALL cat.system.func('${spark.extra.prop}')", + "/* Some multi-line comment \n" + + "*/ CALL /* inline comment */ cat.system.func('${spark.extra.prop}') -- ending comment", + "CALL -- a line ending comment\n" + "cat.system.func('${spark.extra.prop}')"); for (String sqlText : callStatementsWithComments) { CallStatement call = (CallStatement) parser.parsePlan(sqlText); - Assert.assertEquals(ImmutableList.of("cat", "system", "func"), JavaConverters.seqAsJavaList(call.name())); + Assert.assertEquals( + ImmutableList.of("cat", "system", "func"), JavaConverters.seqAsJavaList(call.name())); Assert.assertEquals(1, call.args().size()); @@ -164,12 +176,17 @@ public void testCallStripsComments() throws ParseException { } } - private void checkArg(CallStatement call, int index, Object expectedValue, DataType expectedType) { + private void checkArg( + CallStatement call, int index, Object expectedValue, DataType expectedType) { checkArg(call, index, null, expectedValue, expectedType); } - private void checkArg(CallStatement call, int index, String expectedName, - Object expectedValue, DataType expectedType) { + private void checkArg( + CallStatement call, + int index, + String expectedName, + Object expectedValue, + DataType expectedType) { if (expectedName != null) { NamedArgument arg = checkCast(call.args().apply(index), NamedArgument.class); @@ -190,7 +207,8 @@ private Literal toSparkLiteral(Object value, DataType dataType) { } private T checkCast(Object value, Class expectedClass) { - Assert.assertTrue("Expected instance of " + expectedClass.getName(), expectedClass.isInstance(value)); + Assert.assertTrue( + "Expected instance of " + expectedClass.getName(), expectedClass.isInstance(value)); return expectedClass.cast(value); } } diff --git a/spark/v3.3/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestCherrypickSnapshotProcedure.java b/spark/v3.3/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestCherrypickSnapshotProcedure.java index c69964693189..7309a176b922 100644 --- a/spark/v3.3/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestCherrypickSnapshotProcedure.java +++ b/spark/v3.3/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestCherrypickSnapshotProcedure.java @@ -16,9 +16,10 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.spark.extensions; +import static org.apache.iceberg.TableProperties.WRITE_AUDIT_PUBLISH_ENABLED; + import java.util.List; import java.util.Map; import org.apache.iceberg.AssertHelpers; @@ -34,11 +35,10 @@ import org.junit.After; import org.junit.Test; -import static org.apache.iceberg.TableProperties.WRITE_AUDIT_PUBLISH_ENABLED; - public class TestCherrypickSnapshotProcedure extends SparkExtensionsTestBase { - public TestCherrypickSnapshotProcedure(String catalogName, String implementation, Map config) { + public TestCherrypickSnapshotProcedure( + String catalogName, String implementation, Map config) { super(catalogName, implementation, config); } @@ -56,26 +56,30 @@ public void testCherrypickSnapshotUsingPositionalArgs() { sql("INSERT INTO TABLE %s VALUES (1, 'a')", tableName); - assertEquals("Should not see rows from staged snapshot", + assertEquals( + "Should not see rows from staged snapshot", ImmutableList.of(), sql("SELECT * FROM %s", tableName)); Table table = validationCatalog.loadTable(tableIdent); Snapshot wapSnapshot = Iterables.getOnlyElement(table.snapshots()); - List output = sql( - "CALL %s.system.cherrypick_snapshot('%s', %dL)", - catalogName, tableIdent, wapSnapshot.snapshotId()); + List output = + sql( + "CALL %s.system.cherrypick_snapshot('%s', %dL)", + catalogName, tableIdent, wapSnapshot.snapshotId()); table.refresh(); Snapshot currentSnapshot = table.currentSnapshot(); - assertEquals("Procedure output must match", + assertEquals( + "Procedure output must match", ImmutableList.of(row(wapSnapshot.snapshotId(), currentSnapshot.snapshotId())), output); - assertEquals("Cherrypick must be successful", + assertEquals( + "Cherrypick must be successful", ImmutableList.of(row(1L, "a")), sql("SELECT * FROM %s", tableName)); } @@ -89,26 +93,30 @@ public void testCherrypickSnapshotUsingNamedArgs() { sql("INSERT INTO TABLE %s VALUES (1, 'a')", tableName); - assertEquals("Should not see rows from staged snapshot", + assertEquals( + "Should not see rows from staged snapshot", ImmutableList.of(), sql("SELECT * FROM %s", tableName)); Table table = validationCatalog.loadTable(tableIdent); Snapshot wapSnapshot = Iterables.getOnlyElement(table.snapshots()); - List output = sql( - "CALL %s.system.cherrypick_snapshot(snapshot_id => %dL, table => '%s')", - catalogName, wapSnapshot.snapshotId(), tableIdent); + List output = + sql( + "CALL %s.system.cherrypick_snapshot(snapshot_id => %dL, table => '%s')", + catalogName, wapSnapshot.snapshotId(), tableIdent); table.refresh(); Snapshot currentSnapshot = table.currentSnapshot(); - assertEquals("Procedure output must match", + assertEquals( + "Procedure output must match", ImmutableList.of(row(wapSnapshot.snapshotId(), currentSnapshot.snapshotId())), output); - assertEquals("Cherrypick must be successful", + assertEquals( + "Cherrypick must be successful", ImmutableList.of(row(1L, "a")), sql("SELECT * FROM %s", tableName)); } @@ -129,17 +137,20 @@ public void testCherrypickSnapshotRefreshesRelationCache() { sql("INSERT INTO TABLE %s VALUES (1, 'a')", tableName); - assertEquals("Should not see rows from staged snapshot", + assertEquals( + "Should not see rows from staged snapshot", ImmutableList.of(), sql("SELECT * FROM %s", tableName)); Table table = validationCatalog.loadTable(tableIdent); Snapshot wapSnapshot = Iterables.getOnlyElement(table.snapshots()); - sql("CALL %s.system.cherrypick_snapshot('%s', %dL)", + sql( + "CALL %s.system.cherrypick_snapshot('%s', %dL)", catalogName, tableIdent, 
wapSnapshot.snapshotId()); - assertEquals("Cherrypick snapshot should be visible", + assertEquals( + "Cherrypick snapshot should be visible", ImmutableList.of(row(1L, "a")), sql("SELECT * FROM tmp")); @@ -150,31 +161,43 @@ public void testCherrypickSnapshotRefreshesRelationCache() { public void testCherrypickInvalidSnapshot() { sql("CREATE TABLE %s (id bigint NOT NULL, data string) USING iceberg", tableName); - AssertHelpers.assertThrows("Should reject invalid snapshot id", - ValidationException.class, "Cannot cherry-pick unknown snapshot ID", + AssertHelpers.assertThrows( + "Should reject invalid snapshot id", + ValidationException.class, + "Cannot cherry-pick unknown snapshot ID", () -> sql("CALL %s.system.cherrypick_snapshot('%s', -1L)", catalogName, tableIdent)); } @Test public void testInvalidCherrypickSnapshotCases() { - AssertHelpers.assertThrows("Should not allow mixed args", - AnalysisException.class, "Named and positional arguments cannot be mixed", + AssertHelpers.assertThrows( + "Should not allow mixed args", + AnalysisException.class, + "Named and positional arguments cannot be mixed", () -> sql("CALL %s.system.cherrypick_snapshot('n', table => 't', 1L)", catalogName)); - AssertHelpers.assertThrows("Should not resolve procedures in arbitrary namespaces", - NoSuchProcedureException.class, "not found", + AssertHelpers.assertThrows( + "Should not resolve procedures in arbitrary namespaces", + NoSuchProcedureException.class, + "not found", () -> sql("CALL %s.custom.cherrypick_snapshot('n', 't', 1L)", catalogName)); - AssertHelpers.assertThrows("Should reject calls without all required args", - AnalysisException.class, "Missing required parameters", + AssertHelpers.assertThrows( + "Should reject calls without all required args", + AnalysisException.class, + "Missing required parameters", () -> sql("CALL %s.system.cherrypick_snapshot('t')", catalogName)); - AssertHelpers.assertThrows("Should reject calls with empty table identifier", - IllegalArgumentException.class, "Cannot handle an empty identifier", + AssertHelpers.assertThrows( + "Should reject calls with empty table identifier", + IllegalArgumentException.class, + "Cannot handle an empty identifier", () -> sql("CALL %s.system.cherrypick_snapshot('', 1L)", catalogName)); - AssertHelpers.assertThrows("Should reject calls with invalid arg types", - AnalysisException.class, "Wrong arg type for snapshot_id: cannot cast", + AssertHelpers.assertThrows( + "Should reject calls with invalid arg types", + AnalysisException.class, + "Wrong arg type for snapshot_id: cannot cast", () -> sql("CALL %s.system.cherrypick_snapshot('t', 2.2)", catalogName)); } } diff --git a/spark/v3.3/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestConflictValidation.java b/spark/v3.3/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestConflictValidation.java index 4ce44818ab46..6fcbf1f903be 100644 --- a/spark/v3.3/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestConflictValidation.java +++ b/spark/v3.3/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestConflictValidation.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.spark.extensions; import java.util.List; @@ -38,17 +37,20 @@ public class TestConflictValidation extends SparkExtensionsTestBase { - public TestConflictValidation(String catalogName, String implementation, Map config) { + public TestConflictValidation( + String catalogName, String implementation, Map config) { super(catalogName, implementation, config); } @Before public void createTables() { - sql("CREATE TABLE %s (id int, data string) USING iceberg " + - "PARTITIONED BY (id)" + - "TBLPROPERTIES" + - "('format-version'='2'," + - "'write.delete.mode'='merge-on-read')", tableName); + sql( + "CREATE TABLE %s (id int, data string) USING iceberg " + + "PARTITIONED BY (id)" + + "TBLPROPERTIES" + + "('format-version'='2'," + + "'write.delete.mode'='merge-on-read')", + tableName); sql("INSERT INTO %s VALUES (1, 'a'), (2, 'b'), (3, 'c')", tableName); } @@ -67,12 +69,14 @@ public void testOverwriteFilterSerializableIsolation() throws Exception { // Validating from previous snapshot finds conflicts Dataset conflictingDf = spark.createDataFrame(records, SimpleRecord.class); - AssertHelpers.assertThrowsCause("Conflicting new data files should throw exception", + AssertHelpers.assertThrowsCause( + "Conflicting new data files should throw exception", ValidationException.class, "Found conflicting files that can contain records matching ref(name=\"id\") == 1:", () -> { try { - conflictingDf.writeTo(tableName) + conflictingDf + .writeTo(tableName) .option(SparkWriteOptions.VALIDATE_FROM_SNAPSHOT_ID, String.valueOf(snapshotId)) .option(SparkWriteOptions.ISOLATION_LEVEL, IsolationLevel.SERIALIZABLE.toString()) .overwrite(functions.col("id").equalTo(1)); @@ -84,7 +88,8 @@ public void testOverwriteFilterSerializableIsolation() throws Exception { // Validating from latest snapshot should succeed table.refresh(); long newSnapshotId = table.currentSnapshot().snapshotId(); - conflictingDf.writeTo(tableName) + conflictingDf + .writeTo(tableName) .option(SparkWriteOptions.VALIDATE_FROM_SNAPSHOT_ID, String.valueOf(newSnapshotId)) .option(SparkWriteOptions.ISOLATION_LEVEL, IsolationLevel.SERIALIZABLE.toString()) .overwrite(functions.col("id").equalTo(1)); @@ -92,9 +97,8 @@ public void testOverwriteFilterSerializableIsolation() throws Exception { @Test public void testOverwriteFilterSerializableIsolation2() throws Exception { - List records = Lists.newArrayList( - new SimpleRecord(1, "a"), - new SimpleRecord(1, "b")); + List records = + Lists.newArrayList(new SimpleRecord(1, "a"), new SimpleRecord(1, "b")); spark.createDataFrame(records, SimpleRecord.class).coalesce(1).writeTo(tableName).append(); Table table = validationCatalog.loadTable(tableIdent); @@ -107,12 +111,14 @@ public void testOverwriteFilterSerializableIsolation2() throws Exception { // Validating from previous snapshot finds conflicts List conflictingRecords = Lists.newArrayList(new SimpleRecord(1, "a")); Dataset conflictingDf = spark.createDataFrame(conflictingRecords, SimpleRecord.class); - AssertHelpers.assertThrowsCause("Conflicting new delete files should throw exception", + AssertHelpers.assertThrowsCause( + "Conflicting new delete files should throw exception", ValidationException.class, "Found new conflicting delete files that can apply to records matching ref(name=\"id\") == 1:", () -> { try { - conflictingDf.writeTo(tableName) + conflictingDf + .writeTo(tableName) .option(SparkWriteOptions.VALIDATE_FROM_SNAPSHOT_ID, String.valueOf(snapshotId)) .option(SparkWriteOptions.ISOLATION_LEVEL, 
IsolationLevel.SNAPSHOT.toString()) .overwrite(functions.col("id").equalTo(1)); @@ -124,7 +130,8 @@ public void testOverwriteFilterSerializableIsolation2() throws Exception { // Validating from latest snapshot should succeed table.refresh(); long newSnapshotId = table.currentSnapshot().snapshotId(); - conflictingDf.writeTo(tableName) + conflictingDf + .writeTo(tableName) .option(SparkWriteOptions.VALIDATE_FROM_SNAPSHOT_ID, String.valueOf(newSnapshotId)) .option(SparkWriteOptions.ISOLATION_LEVEL, IsolationLevel.SERIALIZABLE.toString()) .overwrite(functions.col("id").equalTo(1)); @@ -142,12 +149,14 @@ public void testOverwriteFilterSerializableIsolation3() throws Exception { // Validating from previous snapshot finds conflicts List conflictingRecords = Lists.newArrayList(new SimpleRecord(1, "a")); Dataset conflictingDf = spark.createDataFrame(conflictingRecords, SimpleRecord.class); - AssertHelpers.assertThrowsCause("Conflicting deleted data files should throw exception", + AssertHelpers.assertThrowsCause( + "Conflicting deleted data files should throw exception", ValidationException.class, "Found conflicting deleted files that can contain records matching ref(name=\"id\") == 1:", () -> { try { - conflictingDf.writeTo(tableName) + conflictingDf + .writeTo(tableName) .option(SparkWriteOptions.VALIDATE_FROM_SNAPSHOT_ID, String.valueOf(snapshotId)) .option(SparkWriteOptions.ISOLATION_LEVEL, IsolationLevel.SERIALIZABLE.toString()) .overwrite(functions.col("id").equalTo(1)); @@ -159,7 +168,8 @@ public void testOverwriteFilterSerializableIsolation3() throws Exception { // Validating from latest snapshot should succeed table.refresh(); long newSnapshotId = table.currentSnapshot().snapshotId(); - conflictingDf.writeTo(tableName) + conflictingDf + .writeTo(tableName) .option(SparkWriteOptions.VALIDATE_FROM_SNAPSHOT_ID, String.valueOf(newSnapshotId)) .option(SparkWriteOptions.ISOLATION_LEVEL, IsolationLevel.SERIALIZABLE.toString()) .overwrite(functions.col("id").equalTo(1)); @@ -174,12 +184,14 @@ public void testOverwriteFilterNoSnapshotIdValidation() throws Exception { // Validating from no snapshot id defaults to beginning snapshot id and finds conflicts Dataset conflictingDf = spark.createDataFrame(records, SimpleRecord.class); - AssertHelpers.assertThrowsCause("Conflicting new data files should throw exception", + AssertHelpers.assertThrowsCause( + "Conflicting new data files should throw exception", ValidationException.class, "Found conflicting files that can contain records matching ref(name=\"id\") == 1:", () -> { try { - conflictingDf.writeTo(tableName) + conflictingDf + .writeTo(tableName) .option(SparkWriteOptions.ISOLATION_LEVEL, IsolationLevel.SERIALIZABLE.toString()) .overwrite(functions.col("id").equalTo(1)); } catch (NoSuchTableException e) { @@ -190,7 +202,8 @@ public void testOverwriteFilterNoSnapshotIdValidation() throws Exception { // Validating from latest snapshot should succeed table.refresh(); long newSnapshotId = table.currentSnapshot().snapshotId(); - conflictingDf.writeTo(tableName) + conflictingDf + .writeTo(tableName) .option(SparkWriteOptions.VALIDATE_FROM_SNAPSHOT_ID, String.valueOf(newSnapshotId)) .option(SparkWriteOptions.ISOLATION_LEVEL, IsolationLevel.SERIALIZABLE.toString()) .overwrite(functions.col("id").equalTo(1)); @@ -198,9 +211,8 @@ public void testOverwriteFilterNoSnapshotIdValidation() throws Exception { @Test public void testOverwriteFilterSnapshotIsolation() throws Exception { - List records = Lists.newArrayList( - new SimpleRecord(1, "a"), - new 
SimpleRecord(1, "b")); + List records = + Lists.newArrayList(new SimpleRecord(1, "a"), new SimpleRecord(1, "b")); spark.createDataFrame(records, SimpleRecord.class).coalesce(1).writeTo(tableName).append(); Table table = validationCatalog.loadTable(tableIdent); @@ -213,12 +225,14 @@ public void testOverwriteFilterSnapshotIsolation() throws Exception { // Validating from previous snapshot finds conflicts List conflictingRecords = Lists.newArrayList(new SimpleRecord(1, "a")); Dataset conflictingDf = spark.createDataFrame(conflictingRecords, SimpleRecord.class); - AssertHelpers.assertThrowsCause("Conflicting new delete files should throw exception", + AssertHelpers.assertThrowsCause( + "Conflicting new delete files should throw exception", ValidationException.class, "Found new conflicting delete files that can apply to records matching ref(name=\"id\") == 1:", () -> { try { - conflictingDf.writeTo(tableName) + conflictingDf + .writeTo(tableName) .option(SparkWriteOptions.VALIDATE_FROM_SNAPSHOT_ID, String.valueOf(snapshotId)) .option(SparkWriteOptions.ISOLATION_LEVEL, IsolationLevel.SNAPSHOT.toString()) .overwrite(functions.col("id").equalTo(1)); @@ -230,7 +244,8 @@ public void testOverwriteFilterSnapshotIsolation() throws Exception { // Validating from latest snapshot should succeed table.refresh(); long newSnapshotId = table.currentSnapshot().snapshotId(); - conflictingDf.writeTo(tableName) + conflictingDf + .writeTo(tableName) .option(SparkWriteOptions.VALIDATE_FROM_SNAPSHOT_ID, String.valueOf(newSnapshotId)) .option(SparkWriteOptions.ISOLATION_LEVEL, IsolationLevel.SNAPSHOT.toString()) .overwrite(functions.col("id").equalTo(1)); @@ -241,13 +256,13 @@ public void testOverwriteFilterSnapshotIsolation2() throws Exception { Table table = validationCatalog.loadTable(tableIdent); long snapshotId = table.currentSnapshot().snapshotId(); - List records = Lists.newArrayList( - new SimpleRecord(1, "a")); + List records = Lists.newArrayList(new SimpleRecord(1, "a")); spark.createDataFrame(records, SimpleRecord.class).writeTo(tableName).append(); // Validation should not fail due to conflicting data file in snapshot isolation mode Dataset conflictingDf = spark.createDataFrame(records, SimpleRecord.class); - conflictingDf.writeTo(tableName) + conflictingDf + .writeTo(tableName) .option(SparkWriteOptions.VALIDATE_FROM_SNAPSHOT_ID, String.valueOf(snapshotId)) .option(SparkWriteOptions.ISOLATION_LEVEL, IsolationLevel.SNAPSHOT.toString()) .overwrite(functions.col("id").equalTo(1)); @@ -263,12 +278,14 @@ public void testOverwritePartitionSerializableIsolation() throws Exception { // Validating from previous snapshot finds conflicts Dataset conflictingDf = spark.createDataFrame(records, SimpleRecord.class); - AssertHelpers.assertThrowsCause("Conflicting deleted data files should throw exception", + AssertHelpers.assertThrowsCause( + "Conflicting deleted data files should throw exception", ValidationException.class, "Found conflicting files that can contain records matching partitions [id=1]", () -> { try { - conflictingDf.writeTo(tableName) + conflictingDf + .writeTo(tableName) .option(SparkWriteOptions.VALIDATE_FROM_SNAPSHOT_ID, String.valueOf(snapshotId)) .option(SparkWriteOptions.ISOLATION_LEVEL, IsolationLevel.SERIALIZABLE.toString()) .overwritePartitions(); @@ -280,7 +297,8 @@ public void testOverwritePartitionSerializableIsolation() throws Exception { // Validating from latest snapshot should succeed table.refresh(); long newSnapshotId = table.currentSnapshot().snapshotId(); - 
conflictingDf.writeTo(tableName) + conflictingDf + .writeTo(tableName) .option(SparkWriteOptions.VALIDATE_FROM_SNAPSHOT_ID, String.valueOf(newSnapshotId)) .option(SparkWriteOptions.ISOLATION_LEVEL, IsolationLevel.SERIALIZABLE.toString()) .overwritePartitions(); @@ -288,9 +306,8 @@ public void testOverwritePartitionSerializableIsolation() throws Exception { @Test public void testOverwritePartitionSnapshotIsolation() throws Exception { - List records = Lists.newArrayList( - new SimpleRecord(1, "a"), - new SimpleRecord(1, "b")); + List records = + Lists.newArrayList(new SimpleRecord(1, "a"), new SimpleRecord(1, "b")); spark.createDataFrame(records, SimpleRecord.class).coalesce(1).writeTo(tableName).append(); Table table = validationCatalog.loadTable(tableIdent); @@ -301,12 +318,14 @@ public void testOverwritePartitionSnapshotIsolation() throws Exception { // Validating from previous snapshot finds conflicts Dataset conflictingDf = spark.createDataFrame(records, SimpleRecord.class); - AssertHelpers.assertThrowsCause("Conflicting deleted data files should throw exception", + AssertHelpers.assertThrowsCause( + "Conflicting deleted data files should throw exception", ValidationException.class, "Found new conflicting delete files that can apply to records matching [id=1]", () -> { try { - conflictingDf.writeTo(tableName) + conflictingDf + .writeTo(tableName) .option(SparkWriteOptions.VALIDATE_FROM_SNAPSHOT_ID, String.valueOf(snapshotId)) .option(SparkWriteOptions.ISOLATION_LEVEL, IsolationLevel.SNAPSHOT.toString()) .overwritePartitions(); @@ -318,7 +337,8 @@ public void testOverwritePartitionSnapshotIsolation() throws Exception { // Validating from latest snapshot should succeed table.refresh(); long newSnapshotId = table.currentSnapshot().snapshotId(); - conflictingDf.writeTo(tableName) + conflictingDf + .writeTo(tableName) .option(SparkWriteOptions.VALIDATE_FROM_SNAPSHOT_ID, String.valueOf(newSnapshotId)) .option(SparkWriteOptions.ISOLATION_LEVEL, IsolationLevel.SNAPSHOT.toString()) .overwritePartitions(); @@ -333,20 +353,21 @@ public void testOverwritePartitionSnapshotIsolation2() throws Exception { sql("DELETE FROM %s WHERE id='1'", tableName); // Validating from previous snapshot finds conflicts - List records = Lists.newArrayList( - new SimpleRecord(1, "a")); + List records = Lists.newArrayList(new SimpleRecord(1, "a")); spark.createDataFrame(records, SimpleRecord.class).coalesce(1).writeTo(tableName).append(); Dataset conflictingDf = spark.createDataFrame(records, SimpleRecord.class); - AssertHelpers.assertThrowsCause("Conflicting deleted data files should throw exception", + AssertHelpers.assertThrowsCause( + "Conflicting deleted data files should throw exception", ValidationException.class, "Found conflicting deleted files that can apply to records matching [id=1]", () -> { try { - conflictingDf.writeTo(tableName) - .option(SparkWriteOptions.VALIDATE_FROM_SNAPSHOT_ID, String.valueOf(snapshotId)) - .option(SparkWriteOptions.ISOLATION_LEVEL, IsolationLevel.SNAPSHOT.toString()) - .overwritePartitions(); + conflictingDf + .writeTo(tableName) + .option(SparkWriteOptions.VALIDATE_FROM_SNAPSHOT_ID, String.valueOf(snapshotId)) + .option(SparkWriteOptions.ISOLATION_LEVEL, IsolationLevel.SNAPSHOT.toString()) + .overwritePartitions(); } catch (NoSuchTableException e) { throw new RuntimeException(e); } @@ -355,7 +376,8 @@ public void testOverwritePartitionSnapshotIsolation2() throws Exception { // Validating from latest snapshot should succeed table.refresh(); long newSnapshotId = 
table.currentSnapshot().snapshotId(); - conflictingDf.writeTo(tableName) + conflictingDf + .writeTo(tableName) .option(SparkWriteOptions.VALIDATE_FROM_SNAPSHOT_ID, String.valueOf(newSnapshotId)) .option(SparkWriteOptions.ISOLATION_LEVEL, IsolationLevel.SNAPSHOT.toString()) .overwritePartitions(); @@ -371,7 +393,8 @@ public void testOverwritePartitionSnapshotIsolation3() throws Exception { // Validation should not find conflicting data file in snapshot isolation mode Dataset conflictingDf = spark.createDataFrame(records, SimpleRecord.class); - conflictingDf.writeTo(tableName) + conflictingDf + .writeTo(tableName) .option(SparkWriteOptions.VALIDATE_FROM_SNAPSHOT_ID, String.valueOf(snapshotId)) .option(SparkWriteOptions.ISOLATION_LEVEL, IsolationLevel.SNAPSHOT.toString()) .overwritePartitions(); @@ -381,18 +404,19 @@ public void testOverwritePartitionSnapshotIsolation3() throws Exception { public void testOverwritePartitionNoSnapshotIdValidation() throws Exception { Table table = validationCatalog.loadTable(tableIdent); - List records = Lists.newArrayList( - new SimpleRecord(1, "a")); + List records = Lists.newArrayList(new SimpleRecord(1, "a")); spark.createDataFrame(records, SimpleRecord.class).writeTo(tableName).append(); // Validating from null snapshot is equivalent to validating from beginning Dataset conflictingDf = spark.createDataFrame(records, SimpleRecord.class); - AssertHelpers.assertThrowsCause("Conflicting deleted data files should throw exception", + AssertHelpers.assertThrowsCause( + "Conflicting deleted data files should throw exception", ValidationException.class, "Found conflicting files that can contain records matching partitions [id=1]", () -> { try { - conflictingDf.writeTo(tableName) + conflictingDf + .writeTo(tableName) .option(SparkWriteOptions.ISOLATION_LEVEL, IsolationLevel.SERIALIZABLE.toString()) .overwritePartitions(); } catch (NoSuchTableException e) { @@ -403,7 +427,8 @@ public void testOverwritePartitionNoSnapshotIdValidation() throws Exception { // Validating from latest snapshot should succeed table.refresh(); long snapshotId = table.currentSnapshot().snapshotId(); - conflictingDf.writeTo(tableName) + conflictingDf + .writeTo(tableName) .option(SparkWriteOptions.VALIDATE_FROM_SNAPSHOT_ID, String.valueOf(snapshotId)) .option(SparkWriteOptions.ISOLATION_LEVEL, IsolationLevel.SERIALIZABLE.toString()) .overwritePartitions(); diff --git a/spark/v3.3/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestCopyOnWriteDelete.java b/spark/v3.3/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestCopyOnWriteDelete.java index c9d15906251f..8a8a8c6ab722 100644 --- a/spark/v3.3/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestCopyOnWriteDelete.java +++ b/spark/v3.3/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestCopyOnWriteDelete.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
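The overwrite hunks above all re-wrap the same validated-write pattern; as a condensed sketch (assuming the fixtures these suites already provide: a SparkSession `spark`, the `tableName` under test, the `SimpleRecord` bean, the relocated Guava `Lists`, and a previously captured `snapshotId`), the pattern is:

Dataset<Row> conflictingDf =
    spark.createDataFrame(
        Lists.newArrayList(new SimpleRecord(1, "a")), SimpleRecord.class);

conflictingDf
    .writeTo(tableName)
    // start conflict detection from this snapshot instead of from the table's beginning
    .option(SparkWriteOptions.VALIDATE_FROM_SNAPSHOT_ID, String.valueOf(snapshotId))
    // SNAPSHOT tolerates concurrent appends; SERIALIZABLE also rejects conflicting data files
    .option(SparkWriteOptions.ISOLATION_LEVEL, IsolationLevel.SNAPSHOT.toString())
    .overwrite(functions.col("id").equalTo(1)); // declares NoSuchTableException

Validating from the latest snapshot succeeds, while validating from the pre-conflict snapshot surfaces a ValidationException as the cause of the write failure, which is what the assertThrowsCause checks above verify.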
*/ - package org.apache.iceberg.spark.extensions; import java.util.Map; @@ -25,8 +24,13 @@ public class TestCopyOnWriteDelete extends TestDelete { - public TestCopyOnWriteDelete(String catalogName, String implementation, Map config, - String fileFormat, Boolean vectorized, String distributionMode) { + public TestCopyOnWriteDelete( + String catalogName, + String implementation, + Map config, + String fileFormat, + Boolean vectorized, + String distributionMode) { super(catalogName, implementation, config, fileFormat, vectorized, distributionMode); } diff --git a/spark/v3.3/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestCopyOnWriteMerge.java b/spark/v3.3/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestCopyOnWriteMerge.java index 60aba632646f..27cbd1a9d5de 100644 --- a/spark/v3.3/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestCopyOnWriteMerge.java +++ b/spark/v3.3/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestCopyOnWriteMerge.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.extensions; import java.util.Map; @@ -25,8 +24,13 @@ public class TestCopyOnWriteMerge extends TestMerge { - public TestCopyOnWriteMerge(String catalogName, String implementation, Map config, - String fileFormat, boolean vectorized, String distributionMode) { + public TestCopyOnWriteMerge( + String catalogName, + String implementation, + Map config, + String fileFormat, + boolean vectorized, + String distributionMode) { super(catalogName, implementation, config, fileFormat, vectorized, distributionMode); } diff --git a/spark/v3.3/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestCopyOnWriteUpdate.java b/spark/v3.3/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestCopyOnWriteUpdate.java index cc73ecba9ddf..3fa3f74f6a39 100644 --- a/spark/v3.3/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestCopyOnWriteUpdate.java +++ b/spark/v3.3/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestCopyOnWriteUpdate.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.extensions; import java.util.Map; @@ -25,8 +24,13 @@ public class TestCopyOnWriteUpdate extends TestUpdate { - public TestCopyOnWriteUpdate(String catalogName, String implementation, Map config, - String fileFormat, boolean vectorized, String distributionMode) { + public TestCopyOnWriteUpdate( + String catalogName, + String implementation, + Map config, + String fileFormat, + boolean vectorized, + String distributionMode) { super(catalogName, implementation, config, fileFormat, vectorized, distributionMode); } diff --git a/spark/v3.3/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestDelete.java b/spark/v3.3/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestDelete.java index d492a30eb827..e91dfa9e90ba 100644 --- a/spark/v3.3/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestDelete.java +++ b/spark/v3.3/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestDelete.java @@ -16,9 +16,16 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.spark.extensions; +import static org.apache.iceberg.RowLevelOperationMode.COPY_ON_WRITE; +import static org.apache.iceberg.TableProperties.DELETE_ISOLATION_LEVEL; +import static org.apache.iceberg.TableProperties.DELETE_MODE; +import static org.apache.iceberg.TableProperties.DELETE_MODE_DEFAULT; +import static org.apache.iceberg.TableProperties.PARQUET_ROW_GROUP_SIZE_BYTES; +import static org.apache.iceberg.TableProperties.SPLIT_SIZE; +import static org.apache.spark.sql.functions.lit; + import java.util.Arrays; import java.util.List; import java.util.Map; @@ -57,18 +64,15 @@ import org.junit.BeforeClass; import org.junit.Test; -import static org.apache.iceberg.RowLevelOperationMode.COPY_ON_WRITE; -import static org.apache.iceberg.TableProperties.DELETE_ISOLATION_LEVEL; -import static org.apache.iceberg.TableProperties.DELETE_MODE; -import static org.apache.iceberg.TableProperties.DELETE_MODE_DEFAULT; -import static org.apache.iceberg.TableProperties.PARQUET_ROW_GROUP_SIZE_BYTES; -import static org.apache.iceberg.TableProperties.SPLIT_SIZE; -import static org.apache.spark.sql.functions.lit; - public abstract class TestDelete extends SparkRowLevelOperationsTestBase { - public TestDelete(String catalogName, String implementation, Map config, - String fileFormat, Boolean vectorized, String distributionMode) { + public TestDelete( + String catalogName, + String implementation, + Map config, + String fileFormat, + Boolean vectorized, + String distributionMode) { super(catalogName, implementation, config, fileFormat, vectorized, distributionMode); } @@ -101,9 +105,11 @@ public void testDeleteFileThenMetadataDelete() throws Exception { sql("DELETE FROM %s AS t WHERE t.id = 1", tableName); Set dataFilesAfter = TestHelpers.dataFiles(table); - Assert.assertTrue("Data file should have been removed", dataFilesBefore.size() > dataFilesAfter.size()); + Assert.assertTrue( + "Data file should have been removed", dataFilesBefore.size() > dataFilesAfter.size()); - assertEquals("Should have expected rows", + assertEquals( + "Should have expected rows", ImmutableList.of(row(2, "hardware")), sql("SELECT * FROM %s ORDER BY id", tableName)); } @@ -119,7 +125,8 @@ public void testDeleteWithFalseCondition() { Table table = validationCatalog.loadTable(tableIdent); Assert.assertEquals("Should have 2 snapshots", 2, Iterables.size(table.snapshots())); - assertEquals("Should have expected rows", + assertEquals( + "Should have expected rows", ImmutableList.of(row(1, "hr"), row(2, "hardware")), sql("SELECT * FROM %s ORDER BY id", tableName)); } @@ -134,7 +141,8 @@ public void testDeleteFromEmptyTable() { Table table = validationCatalog.loadTable(tableIdent); Assert.assertEquals("Should have 2 snapshots", 2, Iterables.size(table.snapshots())); - assertEquals("Should have expected rows", + assertEquals( + "Should have expected rows", ImmutableList.of(), sql("SELECT * FROM %s ORDER BY id", tableName)); } @@ -152,7 +160,8 @@ public void testExplain() { Table table = validationCatalog.loadTable(tableIdent); Assert.assertEquals("Should have 1 snapshot", 1, Iterables.size(table.snapshots())); - assertEquals("Should have expected rows", + assertEquals( + "Should have expected rows", ImmutableList.of(row(1, "hr"), row(2, "hardware"), row(null, "hr")), sql("SELECT * FROM %s ORDER BY id ASC NULLS LAST", tableName)); } @@ -165,7 +174,8 @@ public void testDeleteWithAlias() { sql("DELETE FROM %s AS t WHERE t.id IS NULL", tableName); - assertEquals("Should have expected rows", + assertEquals( + "Should 
have expected rows", ImmutableList.of(row(1, "hr"), row(2, "hardware")), sql("SELECT * FROM %s ORDER BY id", tableName)); } @@ -189,7 +199,8 @@ public void testDeleteWithDynamicFileFiltering() throws NoSuchTableException { validateMergeOnRead(currentSnapshot, "1", "1", null); } - assertEquals("Should have expected rows", + assertEquals( + "Should have expected rows", ImmutableList.of(row(1, "hardware"), row(1, "hr"), row(3, "hr")), sql("SELECT * FROM %s ORDER BY id, dep", tableName)); } @@ -217,7 +228,8 @@ public void testDeleteNonExistingRecords() { } } - assertEquals("Should have expected rows", + assertEquals( + "Should have expected rows", ImmutableList.of(row(1, "hr"), row(2, "hardware"), row(null, "hr")), sql("SELECT * FROM %s ORDER BY id ASC NULLS LAST", tableName)); } @@ -239,9 +251,8 @@ public void testDeleteWithoutCondition() { Snapshot currentSnapshot = table.currentSnapshot(); validateDelete(currentSnapshot, "2", "3"); - assertEquals("Should have expected rows", - ImmutableList.of(), - sql("SELECT * FROM %s", tableName)); + assertEquals( + "Should have expected rows", ImmutableList.of(), sql("SELECT * FROM %s", tableName)); } @Test @@ -261,7 +272,8 @@ public void testDeleteUsingMetadataWithComplexCondition() { Snapshot currentSnapshot = table.currentSnapshot(); validateDelete(currentSnapshot, "2", "2"); - assertEquals("Should have expected rows", + assertEquals( + "Should have expected rows", ImmutableList.of(row(1, "dep1")), sql("SELECT * FROM %s", tableName)); } @@ -288,7 +300,8 @@ public void testDeleteWithArbitraryPartitionPredicates() { validateMergeOnRead(currentSnapshot, "1", "1", null); } - assertEquals("Should have expected rows", + assertEquals( + "Should have expected rows", ImmutableList.of(row(1, "hr"), row(null, "hr")), sql("SELECT * FROM %s ORDER BY id ASC NULLS LAST", tableName)); } @@ -299,8 +312,10 @@ public void testDeleteWithNonDeterministicCondition() { sql("INSERT INTO TABLE %s VALUES (1, 'hr'), (2, 'hardware')", tableName); - AssertHelpers.assertThrows("Should complain about non-deterministic expressions", - AnalysisException.class, "nondeterministic expressions are only allowed", + AssertHelpers.assertThrows( + "Should complain about non-deterministic expressions", + AnalysisException.class, + "nondeterministic expressions are only allowed", () -> sql("DELETE FROM %s WHERE id = 1 AND rand() > 0.5", tableName)); } @@ -312,25 +327,29 @@ public void testDeleteWithFoldableConditions() { // should keep all rows and don't trigger execution sql("DELETE FROM %s WHERE false", tableName); - assertEquals("Should have expected rows", + assertEquals( + "Should have expected rows", ImmutableList.of(row(1, "hr"), row(2, "hardware")), sql("SELECT * FROM %s ORDER BY id", tableName)); // should keep all rows and don't trigger execution sql("DELETE FROM %s WHERE 50 <> 50", tableName); - assertEquals("Should have expected rows", + assertEquals( + "Should have expected rows", ImmutableList.of(row(1, "hr"), row(2, "hardware")), sql("SELECT * FROM %s ORDER BY id", tableName)); // should keep all rows and don't trigger execution sql("DELETE FROM %s WHERE 1 > null", tableName); - assertEquals("Should have expected rows", + assertEquals( + "Should have expected rows", ImmutableList.of(row(1, "hr"), row(2, "hardware")), sql("SELECT * FROM %s ORDER BY id", tableName)); // should remove all rows sql("DELETE FROM %s WHERE 21 = 21", tableName); - assertEquals("Should have expected rows", + assertEquals( + "Should have expected rows", ImmutableList.of(), sql("SELECT * FROM %s ORDER BY 
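The AssertHelpers calls being re-wrapped throughout these hunks all take the same four arguments: a failure message, the expected exception type, a fragment expected in the exception message, and the action to run. A minimal usage sketch, using only names that already appear in the hunks above:

AssertHelpers.assertThrows(
    "Should complain about non-deterministic expressions", // reported if nothing is thrown
    AnalysisException.class,                               // expected exception type
    "nondeterministic expressions are only allowed",       // substring of the expected message
    () -> sql("DELETE FROM %s WHERE id = 1 AND rand() > 0.5", tableName));

assertThrowsCause follows the same shape but matches against the thrown exception's cause rather than the exception itself.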
id", tableName)); @@ -342,24 +361,29 @@ public void testDeleteWithFoldableConditions() { public void testDeleteWithNullConditions() { createAndInitPartitionedTable(); - sql("INSERT INTO TABLE %s VALUES (0, null), (1, 'hr'), (2, 'hardware'), (null, 'hr')", tableName); + sql( + "INSERT INTO TABLE %s VALUES (0, null), (1, 'hr'), (2, 'hardware'), (null, 'hr')", + tableName); // should keep all rows as null is never equal to null sql("DELETE FROM %s WHERE dep = null", tableName); - assertEquals("Should have expected rows", + assertEquals( + "Should have expected rows", ImmutableList.of(row(0, null), row(1, "hr"), row(2, "hardware"), row(null, "hr")), sql("SELECT * FROM %s ORDER BY id ASC NULLS LAST", tableName)); // null = 'software' -> null // should delete using metadata operation only sql("DELETE FROM %s WHERE dep = 'software'", tableName); - assertEquals("Should have expected rows", + assertEquals( + "Should have expected rows", ImmutableList.of(row(0, null), row(1, "hr"), row(2, "hardware"), row(null, "hr")), sql("SELECT * FROM %s ORDER BY id ASC NULLS LAST", tableName)); // should delete using metadata operation only sql("DELETE FROM %s WHERE dep <=> NULL", tableName); - assertEquals("Should have expected rows", + assertEquals( + "Should have expected rows", ImmutableList.of(row(1, "hr"), row(2, "hardware"), row(null, "hr")), sql("SELECT * FROM %s ORDER BY id ASC NULLS LAST", tableName)); @@ -377,17 +401,20 @@ public void testDeleteWithInAndNotInConditions() { sql("INSERT INTO TABLE %s VALUES (1, 'hr'), (2, 'hardware'), (null, 'hr')", tableName); sql("DELETE FROM %s WHERE id IN (1, null)", tableName); - assertEquals("Should have expected rows", + assertEquals( + "Should have expected rows", ImmutableList.of(row(2, "hardware"), row(null, "hr")), sql("SELECT * FROM %s ORDER BY id ASC NULLS LAST", tableName)); sql("DELETE FROM %s WHERE id NOT IN (null, 1)", tableName); - assertEquals("Should have expected rows", + assertEquals( + "Should have expected rows", ImmutableList.of(row(2, "hardware"), row(null, "hr")), sql("SELECT * FROM %s ORDER BY id ASC NULLS LAST", tableName)); sql("DELETE FROM %s WHERE id NOT IN (1, 10)", tableName); - assertEquals("Should have expected rows", + assertEquals( + "Should have expected rows", ImmutableList.of(row(null, "hr")), sql("SELECT * FROM %s ORDER BY id ASC NULLS LAST", tableName)); } @@ -398,16 +425,20 @@ public void testDeleteWithMultipleRowGroupsParquet() throws NoSuchTableException createAndInitPartitionedTable(); - sql("ALTER TABLE %s SET TBLPROPERTIES('%s' '%d')", tableName, PARQUET_ROW_GROUP_SIZE_BYTES, 100); + sql( + "ALTER TABLE %s SET TBLPROPERTIES('%s' '%d')", + tableName, PARQUET_ROW_GROUP_SIZE_BYTES, 100); sql("ALTER TABLE %s SET TBLPROPERTIES('%s' '%d')", tableName, SPLIT_SIZE, 100); List ids = Lists.newArrayListWithCapacity(200); for (int id = 1; id <= 200; id++) { ids.add(id); } - Dataset df = spark.createDataset(ids, Encoders.INT()) - .withColumnRenamed("value", "id") - .withColumn("dep", lit("hr")); + Dataset df = + spark + .createDataset(ids, Encoders.INT()) + .withColumnRenamed("value", "id") + .withColumn("dep", lit("hr")); df.coalesce(1).writeTo(tableName).append(); Assert.assertEquals(200, spark.table(tableName).count()); @@ -426,14 +457,12 @@ public void testDeleteWithConditionOnNestedColumn() { sql("INSERT INTO TABLE %s VALUES (2, named_struct(\"c1\", 2, \"c2\", \"v2\"))", tableName); sql("DELETE FROM %s WHERE complex.c1 = id + 2", tableName); - assertEquals("Should have expected rows", - ImmutableList.of(row(2)), - sql("SELECT id 
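The testDeleteWithMultipleRowGroupsParquet hunk above shrinks row groups and splits so that a single data file produces several scan tasks; condensed (inside a test method that declares NoSuchTableException, with the statically imported PARQUET_ROW_GROUP_SIZE_BYTES and SPLIT_SIZE property keys), the setup is:

// 100-byte row groups and splits force many row groups and tasks for one small file
sql("ALTER TABLE %s SET TBLPROPERTIES('%s' '%d')", tableName, PARQUET_ROW_GROUP_SIZE_BYTES, 100);
sql("ALTER TABLE %s SET TBLPROPERTIES('%s' '%d')", tableName, SPLIT_SIZE, 100);

List<Integer> ids = Lists.newArrayListWithCapacity(200);
for (int id = 1; id <= 200; id++) {
  ids.add(id);
}
Dataset<Row> df =
    spark
        .createDataset(ids, Encoders.INT())  // Dataset<Integer> of 1..200
        .withColumnRenamed("value", "id")
        .withColumn("dep", lit("hr"));
df.coalesce(1).writeTo(tableName).append();  // one data file, many row groups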
FROM %s", tableName)); + assertEquals( + "Should have expected rows", ImmutableList.of(row(2)), sql("SELECT id FROM %s", tableName)); sql("DELETE FROM %s t WHERE t.complex.c1 = id", tableName); - assertEquals("Should have expected rows", - ImmutableList.of(), - sql("SELECT id FROM %s", tableName)); + assertEquals( + "Should have expected rows", ImmutableList.of(), sql("SELECT id FROM %s", tableName)); } @Test @@ -445,28 +474,35 @@ public void testDeleteWithInSubquery() throws NoSuchTableException { createOrReplaceView("deleted_id", Arrays.asList(0, 1, null), Encoders.INT()); createOrReplaceView("deleted_dep", Arrays.asList("software", "hr"), Encoders.STRING()); - sql("DELETE FROM %s WHERE id IN (SELECT * FROM deleted_id) AND dep IN (SELECT * from deleted_dep)", tableName); - assertEquals("Should have expected rows", + sql( + "DELETE FROM %s WHERE id IN (SELECT * FROM deleted_id) AND dep IN (SELECT * from deleted_dep)", + tableName); + assertEquals( + "Should have expected rows", ImmutableList.of(row(2, "hardware"), row(null, "hr")), sql("SELECT * FROM %s ORDER BY id ASC NULLS LAST", tableName)); append(new Employee(1, "hr"), new Employee(-1, "hr")); - assertEquals("Should have expected rows", + assertEquals( + "Should have expected rows", ImmutableList.of(row(-1, "hr"), row(1, "hr"), row(2, "hardware"), row(null, "hr")), sql("SELECT * FROM %s ORDER BY id ASC NULLS LAST", tableName)); sql("DELETE FROM %s WHERE id IS NULL OR id IN (SELECT value + 2 FROM deleted_id)", tableName); - assertEquals("Should have expected rows", + assertEquals( + "Should have expected rows", ImmutableList.of(row(-1, "hr"), row(1, "hr")), sql("SELECT * FROM %s ORDER BY id", tableName)); append(new Employee(null, "hr"), new Employee(2, "hr")); - assertEquals("Should have expected rows", + assertEquals( + "Should have expected rows", ImmutableList.of(row(-1, "hr"), row(1, "hr"), row(2, "hr"), row(null, "hr")), sql("SELECT * FROM %s ORDER BY id ASC NULLS LAST", tableName)); sql("DELETE FROM %s WHERE id IN (SELECT value + 2 FROM deleted_id) AND dep = 'hr'", tableName); - assertEquals("Should have expected rows", + assertEquals( + "Should have expected rows", ImmutableList.of(row(-1, "hr"), row(1, "hr"), row(null, "hr")), sql("SELECT * FROM %s ORDER BY id ASC NULLS LAST", tableName)); } @@ -477,11 +513,13 @@ public void testDeleteWithMultiColumnInSubquery() throws NoSuchTableException { append(new Employee(1, "hr"), new Employee(2, "hardware"), new Employee(null, "hr")); - List deletedEmployees = Arrays.asList(new Employee(null, "hr"), new Employee(1, "hr")); + List deletedEmployees = + Arrays.asList(new Employee(null, "hr"), new Employee(1, "hr")); createOrReplaceView("deleted_employee", deletedEmployees, Encoders.bean(Employee.class)); sql("DELETE FROM %s WHERE (id, dep) IN (SELECT id, dep FROM deleted_employee)", tableName); - assertEquals("Should have expected rows", + assertEquals( + "Should have expected rows", ImmutableList.of(row(2, "hardware"), row(null, "hr")), sql("SELECT * FROM %s ORDER BY id ASC NULLS LAST", tableName)); } @@ -497,36 +535,50 @@ public void testDeleteWithNotInSubquery() throws NoSuchTableException { // the file filter subquery (nested loop lef-anti join) returns 0 records sql("DELETE FROM %s WHERE id NOT IN (SELECT * FROM deleted_id)", tableName); - assertEquals("Should have expected rows", + assertEquals( + "Should have expected rows", ImmutableList.of(row(1, "hr"), row(2, "hardware"), row(null, "hr")), sql("SELECT * FROM %s ORDER BY id ASC NULLS LAST", tableName)); - sql("DELETE FROM %s 
WHERE id NOT IN (SELECT * FROM deleted_id WHERE value IS NOT NULL)", tableName); - assertEquals("Should have expected rows", + sql( + "DELETE FROM %s WHERE id NOT IN (SELECT * FROM deleted_id WHERE value IS NOT NULL)", + tableName); + assertEquals( + "Should have expected rows", ImmutableList.of(row(null, "hr")), sql("SELECT * FROM %s ORDER BY id ASC NULLS LAST", tableName)); sql("INSERT INTO TABLE %s VALUES (1, 'hr'), (2, 'hardware'), (null, 'hr')", tableName); - assertEquals("Should have expected rows", + assertEquals( + "Should have expected rows", ImmutableList.of(row(1, "hr"), row(2, "hardware"), row(null, "hr"), row(null, "hr")), sql("SELECT * FROM %s ORDER BY id ASC NULLS LAST", tableName)); - sql("DELETE FROM %s WHERE id NOT IN (SELECT * FROM deleted_id) OR dep IN ('software', 'hr')", tableName); - assertEquals("Should have expected rows", + sql( + "DELETE FROM %s WHERE id NOT IN (SELECT * FROM deleted_id) OR dep IN ('software', 'hr')", + tableName); + assertEquals( + "Should have expected rows", ImmutableList.of(row(2, "hardware")), sql("SELECT * FROM %s ORDER BY id ASC NULLS LAST", tableName)); - sql("DELETE FROM %s t WHERE " + - "id NOT IN (SELECT * FROM deleted_id WHERE value IS NOT NULL) AND " + - "EXISTS (SELECT 1 FROM FROM deleted_dep WHERE t.dep = deleted_dep.value)", tableName); - assertEquals("Should have expected rows", + sql( + "DELETE FROM %s t WHERE " + + "id NOT IN (SELECT * FROM deleted_id WHERE value IS NOT NULL) AND " + + "EXISTS (SELECT 1 FROM FROM deleted_dep WHERE t.dep = deleted_dep.value)", + tableName); + assertEquals( + "Should have expected rows", ImmutableList.of(row(2, "hardware")), sql("SELECT * FROM %s ORDER BY id ASC NULLS LAST", tableName)); - sql("DELETE FROM %s t WHERE " + - "id NOT IN (SELECT * FROM deleted_id WHERE value IS NOT NULL) OR " + - "EXISTS (SELECT 1 FROM FROM deleted_dep WHERE t.dep = deleted_dep.value)", tableName); - assertEquals("Should have expected rows", + sql( + "DELETE FROM %s t WHERE " + + "id NOT IN (SELECT * FROM deleted_id WHERE value IS NOT NULL) OR " + + "EXISTS (SELECT 1 FROM FROM deleted_dep WHERE t.dep = deleted_dep.value)", + tableName); + assertEquals( + "Should have expected rows", ImmutableList.of(), sql("SELECT * FROM %s ORDER BY id ASC NULLS LAST", tableName)); } @@ -535,8 +587,10 @@ public void testDeleteWithNotInSubquery() throws NoSuchTableException { public void testDeleteOnNonIcebergTableNotSupported() { createOrReplaceView("testtable", "{ \"c1\": -100, \"c2\": -200 }"); - AssertHelpers.assertThrows("Delete is supported only for Iceberg tables", - AnalysisException.class, "DELETE is only supported with v2 tables.", + AssertHelpers.assertThrows( + "Delete is supported only for Iceberg tables", + AnalysisException.class, + "DELETE is only supported with v2 tables.", () -> sql("DELETE FROM %s WHERE c1 = -100", "testtable")); } @@ -549,25 +603,37 @@ public void testDeleteWithExistSubquery() throws NoSuchTableException { createOrReplaceView("deleted_id", Arrays.asList(-1, -2, null), Encoders.INT()); createOrReplaceView("deleted_dep", Arrays.asList("software", "hr"), Encoders.STRING()); - sql("DELETE FROM %s t WHERE EXISTS (SELECT 1 FROM deleted_id d WHERE t.id = d.value)", tableName); - assertEquals("Should have expected rows", + sql( + "DELETE FROM %s t WHERE EXISTS (SELECT 1 FROM deleted_id d WHERE t.id = d.value)", + tableName); + assertEquals( + "Should have expected rows", ImmutableList.of(row(1, "hr"), row(2, "hardware"), row(null, "hr")), sql("SELECT * FROM %s ORDER BY id ASC NULLS LAST", tableName)); - 
sql("DELETE FROM %s t WHERE EXISTS (SELECT 1 FROM deleted_id d WHERE t.id = d.value + 2)", tableName); - assertEquals("Should have expected rows", + sql( + "DELETE FROM %s t WHERE EXISTS (SELECT 1 FROM deleted_id d WHERE t.id = d.value + 2)", + tableName); + assertEquals( + "Should have expected rows", ImmutableList.of(row(2, "hardware"), row(null, "hr")), sql("SELECT * FROM %s ORDER BY id ASC NULLS LAST", tableName)); - sql("DELETE FROM %s t WHERE EXISTS (SELECT 1 FROM deleted_id d WHERE t.id = d.value) OR t.id IS NULL", tableName); - assertEquals("Should have expected rows", + sql( + "DELETE FROM %s t WHERE EXISTS (SELECT 1 FROM deleted_id d WHERE t.id = d.value) OR t.id IS NULL", + tableName); + assertEquals( + "Should have expected rows", ImmutableList.of(row(2, "hardware")), sql("SELECT * FROM %s", tableName)); - sql("DELETE FROM %s t WHERE " + - "EXISTS (SELECT 1 FROM deleted_id di WHERE t.id = di.value) AND " + - "EXISTS (SELECT 1 FROM deleted_dep dd WHERE t.dep = dd.value)", tableName); - assertEquals("Should have expected rows", + sql( + "DELETE FROM %s t WHERE " + + "EXISTS (SELECT 1 FROM deleted_id di WHERE t.id = di.value) AND " + + "EXISTS (SELECT 1 FROM deleted_dep dd WHERE t.dep = dd.value)", + tableName); + assertEquals( + "Should have expected rows", ImmutableList.of(row(2, "hardware")), sql("SELECT * FROM %s", tableName)); } @@ -581,21 +647,28 @@ public void testDeleteWithNotExistsSubquery() throws NoSuchTableException { createOrReplaceView("deleted_id", Arrays.asList(-1, -2, null), Encoders.INT()); createOrReplaceView("deleted_dep", Arrays.asList("software", "hr"), Encoders.STRING()); - sql("DELETE FROM %s t WHERE " + - "NOT EXISTS (SELECT 1 FROM deleted_id di WHERE t.id = di.value + 2) AND " + - "NOT EXISTS (SELECT 1 FROM deleted_dep dd WHERE t.dep = dd.value)", tableName); - assertEquals("Should have expected rows", + sql( + "DELETE FROM %s t WHERE " + + "NOT EXISTS (SELECT 1 FROM deleted_id di WHERE t.id = di.value + 2) AND " + + "NOT EXISTS (SELECT 1 FROM deleted_dep dd WHERE t.dep = dd.value)", + tableName); + assertEquals( + "Should have expected rows", ImmutableList.of(row(1, "hr"), row(null, "hr")), sql("SELECT * FROM %s ORDER BY id ASC NULLS LAST", tableName)); - sql("DELETE FROM %s t WHERE NOT EXISTS (SELECT 1 FROM deleted_id d WHERE t.id = d.value + 2)", tableName); - assertEquals("Should have expected rows", + sql( + "DELETE FROM %s t WHERE NOT EXISTS (SELECT 1 FROM deleted_id d WHERE t.id = d.value + 2)", + tableName); + assertEquals( + "Should have expected rows", ImmutableList.of(row(1, "hr")), sql("SELECT * FROM %s ORDER BY id ASC NULLS LAST", tableName)); String subquery = "SELECT 1 FROM deleted_id d WHERE t.id = d.value + 2"; sql("DELETE FROM %s t WHERE NOT EXISTS (%s) OR t.id = 1", tableName, subquery); - assertEquals("Should have expected rows", + assertEquals( + "Should have expected rows", ImmutableList.of(), sql("SELECT * FROM %s ORDER BY id ASC NULLS LAST", tableName)); } @@ -609,12 +682,15 @@ public void testDeleteWithScalarSubquery() throws NoSuchTableException { createOrReplaceView("deleted_id", Arrays.asList(1, 100, null), Encoders.INT()); // TODO: Spark does not support AQE and DPP with aggregates at the moment - withSQLConf(ImmutableMap.of(SQLConf.ADAPTIVE_EXECUTION_ENABLED().key(), "false"), () -> { - sql("DELETE FROM %s t WHERE id <= (SELECT min(value) FROM deleted_id)", tableName); - assertEquals("Should have expected rows", - ImmutableList.of(row(2, "hardware"), row(null, "hr")), - sql("SELECT * FROM %s ORDER BY id ASC NULLS LAST", 
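A reading aid for the subquery DELETE hunks above (they rely on the `deleted_id` and `deleted_dep` temp views created earlier via createOrReplaceView): the two cases worth calling out are NOT IN over a nullable column and correlated EXISTS.

// NOT IN against a set that contains NULL matches no rows, so nothing is deleted
sql("DELETE FROM %s WHERE id NOT IN (SELECT * FROM deleted_id)", tableName);

// correlated EXISTS deletes a row as soon as the subquery finds one match for it
sql(
    "DELETE FROM %s t WHERE EXISTS (SELECT 1 FROM deleted_id d WHERE t.id = d.value)",
    tableName);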
tableName)); - }); + withSQLConf( + ImmutableMap.of(SQLConf.ADAPTIVE_EXECUTION_ENABLED().key(), "false"), + () -> { + sql("DELETE FROM %s t WHERE id <= (SELECT min(value) FROM deleted_id)", tableName); + assertEquals( + "Should have expected rows", + ImmutableList.of(row(2, "hardware"), row(null, "hr")), + sql("SELECT * FROM %s ORDER BY id ASC NULLS LAST", tableName)); + }); } @Test @@ -647,52 +723,61 @@ public synchronized void testDeleteWithSerializableIsolation() throws Interrupte createAndInitUnpartitionedTable(); - sql("ALTER TABLE %s SET TBLPROPERTIES('%s' '%s')", tableName, DELETE_ISOLATION_LEVEL, "serializable"); + sql( + "ALTER TABLE %s SET TBLPROPERTIES('%s' '%s')", + tableName, DELETE_ISOLATION_LEVEL, "serializable"); // Pre-populate the table to force it to use the Spark Writers instead of Metadata-Only Delete // for more consistent exception stack List ids = ImmutableList.of(1, 2); - Dataset inputDF = spark.createDataset(ids, Encoders.INT()) - .withColumnRenamed("value", "id") - .withColumn("dep", lit("hr")); + Dataset inputDF = + spark + .createDataset(ids, Encoders.INT()) + .withColumnRenamed("value", "id") + .withColumn("dep", lit("hr")); try { inputDF.coalesce(1).writeTo(tableName).append(); } catch (NoSuchTableException e) { throw new RuntimeException(e); } - ExecutorService executorService = MoreExecutors.getExitingExecutorService( - (ThreadPoolExecutor) Executors.newFixedThreadPool(2)); + ExecutorService executorService = + MoreExecutors.getExitingExecutorService( + (ThreadPoolExecutor) Executors.newFixedThreadPool(2)); AtomicInteger barrier = new AtomicInteger(0); // delete thread - Future deleteFuture = executorService.submit(() -> { - for (int numOperations = 0; numOperations < Integer.MAX_VALUE; numOperations++) { - while (barrier.get() < numOperations * 2) { - sleep(10); - } - sql("DELETE FROM %s WHERE id = 1", tableName); - barrier.incrementAndGet(); - } - }); + Future deleteFuture = + executorService.submit( + () -> { + for (int numOperations = 0; numOperations < Integer.MAX_VALUE; numOperations++) { + while (barrier.get() < numOperations * 2) { + sleep(10); + } + sql("DELETE FROM %s WHERE id = 1", tableName); + barrier.incrementAndGet(); + } + }); // append thread - Future appendFuture = executorService.submit(() -> { - for (int numOperations = 0; numOperations < Integer.MAX_VALUE; numOperations++) { - while (barrier.get() < numOperations * 2) { - sleep(10); - } - - try { - inputDF.coalesce(1).writeTo(tableName).append(); - } catch (NoSuchTableException e) { - throw new RuntimeException(e); - } - - barrier.incrementAndGet(); - } - }); + Future appendFuture = + executorService.submit( + () -> { + for (int numOperations = 0; numOperations < Integer.MAX_VALUE; numOperations++) { + while (barrier.get() < numOperations * 2) { + sleep(10); + } + + try { + inputDF.coalesce(1).writeTo(tableName).append(); + } catch (NoSuchTableException e) { + throw new RuntimeException(e); + } + + barrier.incrementAndGet(); + } + }); try { deleteFuture.get(); @@ -703,7 +788,8 @@ public synchronized void testDeleteWithSerializableIsolation() throws Interrupte Throwable validationException = sparkException.getCause(); Assert.assertThat(validationException, CoreMatchers.instanceOf(ValidationException.class)); String errMsg = validationException.getMessage(); - Assert.assertThat(errMsg, CoreMatchers.containsString("Found conflicting files that can contain")); + Assert.assertThat( + errMsg, CoreMatchers.containsString("Found conflicting files that can contain")); } finally { 
appendFuture.cancel(true); } @@ -713,51 +799,61 @@ public synchronized void testDeleteWithSerializableIsolation() throws Interrupte } @Test - public synchronized void testDeleteWithSnapshotIsolation() throws InterruptedException, ExecutionException { + public synchronized void testDeleteWithSnapshotIsolation() + throws InterruptedException, ExecutionException { // cannot run tests with concurrency for Hadoop tables without atomic renames Assume.assumeFalse(catalogName.equalsIgnoreCase("testhadoop")); createAndInitUnpartitionedTable(); - sql("ALTER TABLE %s SET TBLPROPERTIES('%s' '%s')", tableName, DELETE_ISOLATION_LEVEL, "snapshot"); + sql( + "ALTER TABLE %s SET TBLPROPERTIES('%s' '%s')", + tableName, DELETE_ISOLATION_LEVEL, "snapshot"); - ExecutorService executorService = MoreExecutors.getExitingExecutorService( - (ThreadPoolExecutor) Executors.newFixedThreadPool(2)); + ExecutorService executorService = + MoreExecutors.getExitingExecutorService( + (ThreadPoolExecutor) Executors.newFixedThreadPool(2)); AtomicInteger barrier = new AtomicInteger(0); // delete thread - Future deleteFuture = executorService.submit(() -> { - for (int numOperations = 0; numOperations < 20; numOperations++) { - while (barrier.get() < numOperations * 2) { - sleep(10); - } - sql("DELETE FROM %s WHERE id = 1", tableName); - barrier.incrementAndGet(); - } - }); + Future deleteFuture = + executorService.submit( + () -> { + for (int numOperations = 0; numOperations < 20; numOperations++) { + while (barrier.get() < numOperations * 2) { + sleep(10); + } + sql("DELETE FROM %s WHERE id = 1", tableName); + barrier.incrementAndGet(); + } + }); // append thread - Future appendFuture = executorService.submit(() -> { - List ids = ImmutableList.of(1, 2); - Dataset inputDF = spark.createDataset(ids, Encoders.INT()) - .withColumnRenamed("value", "id") - .withColumn("dep", lit("hr")); - - for (int numOperations = 0; numOperations < 20; numOperations++) { - while (barrier.get() < numOperations * 2) { - sleep(10); - } - - try { - inputDF.coalesce(1).writeTo(tableName).append(); - } catch (NoSuchTableException e) { - throw new RuntimeException(e); - } - - barrier.incrementAndGet(); - } - }); + Future appendFuture = + executorService.submit( + () -> { + List ids = ImmutableList.of(1, 2); + Dataset inputDF = + spark + .createDataset(ids, Encoders.INT()) + .withColumnRenamed("value", "id") + .withColumn("dep", lit("hr")); + + for (int numOperations = 0; numOperations < 20; numOperations++) { + while (barrier.get() < numOperations * 2) { + sleep(10); + } + + try { + inputDF.coalesce(1).writeTo(tableName).append(); + } catch (NoSuchTableException e) { + throw new RuntimeException(e); + } + + barrier.incrementAndGet(); + } + }); try { deleteFuture.get(); @@ -781,7 +877,8 @@ public void testDeleteRefreshesRelationCache() throws NoSuchTableException { spark.sql("CACHE TABLE tmp"); - assertEquals("View should have correct data", + assertEquals( + "View should have correct data", ImmutableList.of(row(1, "hardware"), row(1, "hr")), sql("SELECT * FROM tmp ORDER BY id, dep")); @@ -796,11 +893,13 @@ public void testDeleteRefreshesRelationCache() throws NoSuchTableException { } else { validateMergeOnRead(currentSnapshot, "2", "2", null); } - assertEquals("Should have expected rows", + assertEquals( + "Should have expected rows", ImmutableList.of(row(2, "hardware"), row(3, "hr")), sql("SELECT * FROM %s ORDER BY id, dep", tableName)); - assertEquals("Should refresh the relation cache", + assertEquals( + "Should refresh the relation cache", 
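The two concurrency tests re-wrapped above coordinate a DELETE task and an append task through a shared AtomicInteger: each task may start round n only once the counter reaches 2 * n and bumps it when its own operation commits, so the two writers advance round by round instead of racing arbitrarily. A trimmed sketch of the delete side (MoreExecutors is the relocated Guava class these tests import, and sleep(...) is the interruption-safe helper from the test base):

ExecutorService executorService =
    MoreExecutors.getExitingExecutorService(
        (ThreadPoolExecutor) Executors.newFixedThreadPool(2));
AtomicInteger barrier = new AtomicInteger(0);

Future<?> deleteFuture =
    executorService.submit(
        () -> {
          for (int numOperations = 0; numOperations < 20; numOperations++) {
            while (barrier.get() < numOperations * 2) {
              sleep(10); // wait for the append task to catch up
            }
            sql("DELETE FROM %s WHERE id = 1", tableName);
            barrier.incrementAndGet();
          }
        });

Under serializable isolation the DELETE eventually fails and its cause is a ValidationException about conflicting files; under snapshot isolation both loops run to completion.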
ImmutableList.of(), sql("SELECT * FROM tmp ORDER BY id, dep")); @@ -816,8 +915,10 @@ public void testDeleteWithMultipleSpecs() { // write a file partitioned by dep sql("ALTER TABLE %s ADD PARTITION FIELD dep", tableName); - append(tableName, "{ \"id\": 2, \"dep\": \"hr\", \"category\": \"c1\" }\n" + - "{ \"id\": 3, \"dep\": \"hr\", \"category\": \"c1\" }"); + append( + tableName, + "{ \"id\": 2, \"dep\": \"hr\", \"category\": \"c1\" }\n" + + "{ \"id\": 3, \"dep\": \"hr\", \"category\": \"c1\" }"); // write a file partitioned by dep and category sql("ALTER TABLE %s ADD PARTITION FIELD category", tableName); @@ -834,14 +935,16 @@ public void testDeleteWithMultipleSpecs() { Snapshot currentSnapshot = table.currentSnapshot(); if (mode(table) == COPY_ON_WRITE) { - // copy-on-write is tested against v1 and such tables have different partition evolution behavior + // copy-on-write is tested against v1 and such tables have different partition evolution + // behavior // that's why the number of changed partitions is 4 for copy-on-write validateCopyOnWrite(currentSnapshot, "4", "4", "1"); } else { validateMergeOnRead(currentSnapshot, "3", "3", null); } - assertEquals("Should have expected rows", + assertEquals( + "Should have expected rows", ImmutableList.of(row(2, "hr", "c1")), sql("SELECT * FROM %s ORDER BY id", tableName)); } diff --git a/spark/v3.3/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestExpireSnapshotsProcedure.java b/spark/v3.3/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestExpireSnapshotsProcedure.java index a77dafe5af05..5cb4f17edcf4 100644 --- a/spark/v3.3/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestExpireSnapshotsProcedure.java +++ b/spark/v3.3/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestExpireSnapshotsProcedure.java @@ -16,9 +16,10 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.spark.extensions; +import static org.apache.iceberg.TableProperties.GC_ENABLED; + import java.io.IOException; import java.sql.Timestamp; import java.time.Instant; @@ -45,11 +46,10 @@ import org.junit.Assert; import org.junit.Test; -import static org.apache.iceberg.TableProperties.GC_ENABLED; - public class TestExpireSnapshotsProcedure extends SparkExtensionsTestBase { - public TestExpireSnapshotsProcedure(String catalogName, String implementation, Map config) { + public TestExpireSnapshotsProcedure( + String catalogName, String implementation, Map config) { super(catalogName, implementation, config); } @@ -62,9 +62,7 @@ public void removeTables() { public void testExpireSnapshotsInEmptyTable() { sql("CREATE TABLE %s (id bigint NOT NULL, data string) USING iceberg", tableName); - List output = sql( - "CALL %s.system.expire_snapshots('%s')", - catalogName, tableIdent); + List output = sql("CALL %s.system.expire_snapshots('%s')", catalogName, tableIdent); assertEquals("Should not delete any files", ImmutableList.of(row(0L, 0L, 0L, 0L, 0L)), output); } @@ -83,17 +81,17 @@ public void testExpireSnapshotsUsingPositionalArgs() { table.refresh(); Snapshot secondSnapshot = table.currentSnapshot(); - Timestamp secondSnapshotTimestamp = Timestamp.from(Instant.ofEpochMilli(secondSnapshot.timestampMillis())); + Timestamp secondSnapshotTimestamp = + Timestamp.from(Instant.ofEpochMilli(secondSnapshot.timestampMillis())); Assert.assertEquals("Should be 2 snapshots", 2, Iterables.size(table.snapshots())); // expire without retainLast param - List output1 = sql( - "CALL %s.system.expire_snapshots('%s', TIMESTAMP '%s')", - catalogName, tableIdent, secondSnapshotTimestamp); - assertEquals("Procedure output must match", - ImmutableList.of(row(0L, 0L, 0L, 0L, 1L)), - output1); + List output1 = + sql( + "CALL %s.system.expire_snapshots('%s', TIMESTAMP '%s')", + catalogName, tableIdent, secondSnapshotTimestamp); + assertEquals("Procedure output must match", ImmutableList.of(row(0L, 0L, 0L, 0L, 1L)), output1); table.refresh(); @@ -101,7 +99,8 @@ public void testExpireSnapshotsUsingPositionalArgs() { sql("INSERT OVERWRITE %s VALUES (3, 'c')", tableName); sql("INSERT INTO TABLE %s VALUES (4, 'd')", tableName); - assertEquals("Should have expected rows", + assertEquals( + "Should have expected rows", ImmutableList.of(row(3L, "c"), row(4L, "d")), sql("SELECT * FROM %s ORDER BY id", tableName)); @@ -114,12 +113,11 @@ public void testExpireSnapshotsUsingPositionalArgs() { Assert.assertEquals("Should be 3 snapshots", 3, Iterables.size(table.snapshots())); // expire with retainLast param - List output = sql( - "CALL %s.system.expire_snapshots('%s', TIMESTAMP '%s', 2)", - catalogName, tableIdent, currentTimestamp); - assertEquals("Procedure output must match", - ImmutableList.of(row(2L, 0L, 0L, 2L, 1L)), - output); + List output = + sql( + "CALL %s.system.expire_snapshots('%s', TIMESTAMP '%s', 2)", + catalogName, tableIdent, currentTimestamp); + assertEquals("Procedure output must match", ImmutableList.of(row(2L, 0L, 0L, 2L, 1L)), output); } @Test @@ -137,15 +135,14 @@ public void testExpireSnapshotUsingNamedArgs() { Timestamp currentTimestamp = Timestamp.from(Instant.ofEpochMilli(System.currentTimeMillis())); - List output = sql( - "CALL %s.system.expire_snapshots(" + - "older_than => TIMESTAMP '%s'," + - "table => '%s'," + - "retain_last => 1)", - catalogName, currentTimestamp, tableIdent); - assertEquals("Procedure output must match", - ImmutableList.of(row(0L, 0L, 0L, 0L, 1L)), - 
output); + List output = + sql( + "CALL %s.system.expire_snapshots(" + + "older_than => TIMESTAMP '%s'," + + "table => '%s'," + + "retain_last => 1)", + catalogName, currentTimestamp, tableIdent); + assertEquals("Procedure output must match", ImmutableList.of(row(0L, 0L, 0L, 0L, 1L)), output); } @Test @@ -154,31 +151,43 @@ public void testExpireSnapshotsGCDisabled() { sql("ALTER TABLE %s SET TBLPROPERTIES ('%s' 'false')", tableName, GC_ENABLED); - AssertHelpers.assertThrows("Should reject call", - ValidationException.class, "Cannot expire snapshots: GC is disabled", + AssertHelpers.assertThrows( + "Should reject call", + ValidationException.class, + "Cannot expire snapshots: GC is disabled", () -> sql("CALL %s.system.expire_snapshots('%s')", catalogName, tableIdent)); } @Test public void testInvalidExpireSnapshotsCases() { - AssertHelpers.assertThrows("Should not allow mixed args", - AnalysisException.class, "Named and positional arguments cannot be mixed", + AssertHelpers.assertThrows( + "Should not allow mixed args", + AnalysisException.class, + "Named and positional arguments cannot be mixed", () -> sql("CALL %s.system.expire_snapshots('n', table => 't')", catalogName)); - AssertHelpers.assertThrows("Should not resolve procedures in arbitrary namespaces", - NoSuchProcedureException.class, "not found", + AssertHelpers.assertThrows( + "Should not resolve procedures in arbitrary namespaces", + NoSuchProcedureException.class, + "not found", () -> sql("CALL %s.custom.expire_snapshots('n', 't')", catalogName)); - AssertHelpers.assertThrows("Should reject calls without all required args", - AnalysisException.class, "Missing required parameters", + AssertHelpers.assertThrows( + "Should reject calls without all required args", + AnalysisException.class, + "Missing required parameters", () -> sql("CALL %s.system.expire_snapshots()", catalogName)); - AssertHelpers.assertThrows("Should reject calls with invalid arg types", - AnalysisException.class, "Wrong arg type", + AssertHelpers.assertThrows( + "Should reject calls with invalid arg types", + AnalysisException.class, + "Wrong arg type", () -> sql("CALL %s.system.expire_snapshots('n', 2.2)", catalogName)); - AssertHelpers.assertThrows("Should reject calls with empty table identifier", - IllegalArgumentException.class, "Cannot handle an empty identifier", + AssertHelpers.assertThrows( + "Should reject calls with empty table identifier", + IllegalArgumentException.class, + "Cannot handle an empty identifier", () -> sql("CALL %s.system.expire_snapshots('')", catalogName)); } @@ -187,13 +196,24 @@ public void testResolvingTableInAnotherCatalog() throws IOException { String anotherCatalog = "another_" + catalogName; spark.conf().set("spark.sql.catalog." + anotherCatalog, SparkCatalog.class.getName()); spark.conf().set("spark.sql.catalog." + anotherCatalog + ".type", "hadoop"); - spark.conf().set("spark.sql.catalog." + anotherCatalog + ".warehouse", "file:" + temp.newFolder().toString()); - - sql("CREATE TABLE %s.%s (id bigint NOT NULL, data string) USING iceberg", anotherCatalog, tableIdent); - - AssertHelpers.assertThrows("Should reject calls for a table in another catalog", - IllegalArgumentException.class, "Cannot run procedure in catalog", - () -> sql("CALL %s.system.expire_snapshots('%s')", catalogName, anotherCatalog + "." + tableName)); + spark + .conf() + .set( + "spark.sql.catalog." 
+ anotherCatalog + ".warehouse", + "file:" + temp.newFolder().toString()); + + sql( + "CREATE TABLE %s.%s (id bigint NOT NULL, data string) USING iceberg", + anotherCatalog, tableIdent); + + AssertHelpers.assertThrows( + "Should reject calls for a table in another catalog", + IllegalArgumentException.class, + "Cannot run procedure in catalog", + () -> + sql( + "CALL %s.system.expire_snapshots('%s')", + catalogName, anotherCatalog + "." + tableName)); } @Test @@ -206,68 +226,89 @@ public void testConcurrentExpireSnapshots() { sql("INSERT INTO TABLE %s VALUES (4, 'd')", tableName); Timestamp currentTimestamp = Timestamp.from(Instant.ofEpochMilli(System.currentTimeMillis())); - List output = sql( - "CALL %s.system.expire_snapshots(" + - "older_than => TIMESTAMP '%s'," + - "table => '%s'," + - "max_concurrent_deletes => %s," + - "retain_last => 1)", - catalogName, currentTimestamp, tableIdent, 4); - assertEquals("Expiring snapshots concurrently should succeed", - ImmutableList.of(row(0L, 0L, 0L, 0L, 3L)), output); + List output = + sql( + "CALL %s.system.expire_snapshots(" + + "older_than => TIMESTAMP '%s'," + + "table => '%s'," + + "max_concurrent_deletes => %s," + + "retain_last => 1)", + catalogName, currentTimestamp, tableIdent, 4); + assertEquals( + "Expiring snapshots concurrently should succeed", + ImmutableList.of(row(0L, 0L, 0L, 0L, 3L)), + output); } @Test public void testConcurrentExpireSnapshotsWithInvalidInput() { sql("CREATE TABLE %s (id bigint NOT NULL, data string) USING iceberg", tableName); - AssertHelpers.assertThrows("Should throw an error when max_concurrent_deletes = 0", - IllegalArgumentException.class, "max_concurrent_deletes should have value > 0", - () -> sql("CALL %s.system.expire_snapshots(table => '%s', max_concurrent_deletes => %s)", - catalogName, tableIdent, 0)); - - AssertHelpers.assertThrows("Should throw an error when max_concurrent_deletes < 0 ", - IllegalArgumentException.class, "max_concurrent_deletes should have value > 0", - () -> sql( - "CALL %s.system.expire_snapshots(table => '%s', max_concurrent_deletes => %s)", - catalogName, tableIdent, -1)); - + AssertHelpers.assertThrows( + "Should throw an error when max_concurrent_deletes = 0", + IllegalArgumentException.class, + "max_concurrent_deletes should have value > 0", + () -> + sql( + "CALL %s.system.expire_snapshots(table => '%s', max_concurrent_deletes => %s)", + catalogName, tableIdent, 0)); + + AssertHelpers.assertThrows( + "Should throw an error when max_concurrent_deletes < 0 ", + IllegalArgumentException.class, + "max_concurrent_deletes should have value > 0", + () -> + sql( + "CALL %s.system.expire_snapshots(table => '%s', max_concurrent_deletes => %s)", + catalogName, tableIdent, -1)); } @Test public void testExpireDeleteFiles() throws Exception { - sql("CREATE TABLE %s (id bigint, data string) USING iceberg TBLPROPERTIES" + - "('format-version'='2', 'write.delete.mode'='merge-on-read')", tableName); - - List records = Lists.newArrayList( - new SimpleRecord(1, "a"), - new SimpleRecord(2, "b"), - new SimpleRecord(3, "c"), - new SimpleRecord(4, "d") - ); - spark.createDataset(records, Encoders.bean(SimpleRecord.class)).coalesce(1).writeTo(tableName).append(); + sql( + "CREATE TABLE %s (id bigint, data string) USING iceberg TBLPROPERTIES" + + "('format-version'='2', 'write.delete.mode'='merge-on-read')", + tableName); + + List records = + Lists.newArrayList( + new SimpleRecord(1, "a"), + new SimpleRecord(2, "b"), + new SimpleRecord(3, "c"), + new SimpleRecord(4, "d")); + spark + 
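For reference while reading the expire_snapshots hunks: the procedure accepts positional or named arguments (mixing them is rejected, as the invalid-case test above asserts) and returns a single summary row. A condensed call, using only names that appear in these tests:

Timestamp currentTimestamp = Timestamp.from(Instant.ofEpochMilli(System.currentTimeMillis()));

List<Object[]> output =
    sql(
        "CALL %s.system.expire_snapshots("
            + "older_than => TIMESTAMP '%s',"
            + "table => '%s',"
            + "max_concurrent_deletes => %s,"
            + "retain_last => 1)",
        catalogName, currentTimestamp, tableIdent, 4);
// summary row: deleted data files, position deletes, equality deletes,
// manifests, manifest lists -- e.g. row(0L, 0L, 0L, 0L, 3L) above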
.createDataset(records, Encoders.bean(SimpleRecord.class)) + .coalesce(1) + .writeTo(tableName) + .append(); sql("DELETE FROM %s WHERE id=1", tableName); Table table = Spark3Util.loadIcebergTable(spark, tableName); - Assert.assertEquals("Should have 1 delete manifest", 1, TestHelpers.deleteManifests(table).size()); + Assert.assertEquals( + "Should have 1 delete manifest", 1, TestHelpers.deleteManifests(table).size()); Assert.assertEquals("Should have 1 delete file", 1, TestHelpers.deleteFiles(table).size()); Path deleteManifestPath = new Path(TestHelpers.deleteManifests(table).iterator().next().path()); - Path deleteFilePath = new Path(String.valueOf(TestHelpers.deleteFiles(table).iterator().next().path())); - - sql("CALL %s.system.rewrite_data_files(" + - "table => '%s'," + - "options => map(" + - "'delete-file-threshold','1'," + - "'use-starting-sequence-number', 'false'))", + Path deleteFilePath = + new Path(String.valueOf(TestHelpers.deleteFiles(table).iterator().next().path())); + + sql( + "CALL %s.system.rewrite_data_files(" + + "table => '%s'," + + "options => map(" + + "'delete-file-threshold','1'," + + "'use-starting-sequence-number', 'false'))", catalogName, tableIdent); table.refresh(); - sql("INSERT INTO TABLE %s VALUES (5, 'e')", tableName); // this txn moves the file to the DELETED state + sql( + "INSERT INTO TABLE %s VALUES (5, 'e')", + tableName); // this txn moves the file to the DELETED state sql("INSERT INTO TABLE %s VALUES (6, 'f')", tableName); // this txn removes the file reference table.refresh(); - Assert.assertEquals("Should have no delete manifests", 0, TestHelpers.deleteManifests(table).size()); + Assert.assertEquals( + "Should have no delete manifests", 0, TestHelpers.deleteManifests(table).size()); Assert.assertEquals("Should have no delete files", 0, TestHelpers.deleteFiles(table).size()); FileSystem localFs = FileSystem.getLocal(new Configuration()); @@ -275,14 +316,18 @@ public void testExpireDeleteFiles() throws Exception { Assert.assertTrue("Delete file should still exist", localFs.exists(deleteFilePath)); Timestamp currentTimestamp = Timestamp.from(Instant.ofEpochMilli(System.currentTimeMillis())); - List output = sql("CALL %s.system.expire_snapshots(" + - "older_than => TIMESTAMP '%s'," + - "table => '%s'," + - "retain_last => 1)", - catalogName, currentTimestamp, tableIdent); - - assertEquals("Should deleted 1 data and pos delete file and 4 manifests and lists (one for each txn)", - ImmutableList.of(row(1L, 1L, 0L, 4L, 4L)), output); + List output = + sql( + "CALL %s.system.expire_snapshots(" + + "older_than => TIMESTAMP '%s'," + + "table => '%s'," + + "retain_last => 1)", + catalogName, currentTimestamp, tableIdent); + + assertEquals( + "Should deleted 1 data and pos delete file and 4 manifests and lists (one for each txn)", + ImmutableList.of(row(1L, 1L, 0L, 4L, 4L)), + output); Assert.assertFalse("Delete manifest should be removed", localFs.exists(deleteManifestPath)); Assert.assertFalse("Delete file should be removed", localFs.exists(deleteFilePath)); } @@ -302,13 +347,14 @@ public void testExpireSnapshotWithStreamResultsEnabled() { Timestamp currentTimestamp = Timestamp.from(Instant.ofEpochMilli(System.currentTimeMillis())); - List output = sql( - "CALL %s.system.expire_snapshots(" + - "older_than => TIMESTAMP '%s'," + - "table => '%s'," + - "retain_last => 1, " + - "stream_results => true)", - catalogName, currentTimestamp, tableIdent); + List output = + sql( + "CALL %s.system.expire_snapshots(" + + "older_than => TIMESTAMP '%s'," + + "table => 
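The testExpireDeleteFiles hunk above first compacts with rewrite_data_files so the position delete file becomes unreferenced and can later be expired. The options it passes are worth spelling out; the inline explanations below are my reading of how the test uses them, not text from the diff:

sql(
    "CALL %s.system.rewrite_data_files("
        + "table => '%s',"
        + "options => map("
        // rewrite any data file that has at least one delete file attached
        + "'delete-file-threshold','1',"
        // commit with a fresh sequence number so the old position deletes stop applying
        + "'use-starting-sequence-number', 'false'))",
    catalogName, tableIdent);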
'%s'," + + "retain_last => 1, " + + "stream_results => true)", + catalogName, currentTimestamp, tableIdent); assertEquals("Procedure output must match", ImmutableList.of(row(0L, 0L, 0L, 0L, 1L)), output); } @@ -330,18 +376,15 @@ public void testExpireSnapshotsProcedureWorksWithSqlComments() { Timestamp currentTimestamp = Timestamp.from(Instant.ofEpochMilli(System.currentTimeMillis())); String callStatement = - "/* CALL statement is used to expire snapshots */\n" + - "-- And we have single line comments as well \n" + - "/* And comments that span *multiple* \n" + - " lines */ CALL /* this is the actual CALL */ %s.system.expire_snapshots(" + - " older_than => TIMESTAMP '%s'," + - " table => '%s'," + - " retain_last => 1)"; - List output = sql( - callStatement, catalogName, currentTimestamp, tableIdent); - assertEquals("Procedure output must match", - ImmutableList.of(row(0L, 0L, 0L, 0L, 1L)), - output); + "/* CALL statement is used to expire snapshots */\n" + + "-- And we have single line comments as well \n" + + "/* And comments that span *multiple* \n" + + " lines */ CALL /* this is the actual CALL */ %s.system.expire_snapshots(" + + " older_than => TIMESTAMP '%s'," + + " table => '%s'," + + " retain_last => 1)"; + List output = sql(callStatement, catalogName, currentTimestamp, tableIdent); + assertEquals("Procedure output must match", ImmutableList.of(row(0L, 0L, 0L, 0L, 1L)), output); table.refresh(); diff --git a/spark/v3.3/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestIcebergExpressions.java b/spark/v3.3/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestIcebergExpressions.java index ce88814ce937..8d2e10ea17eb 100644 --- a/spark/v3.3/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestIcebergExpressions.java +++ b/spark/v3.3/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestIcebergExpressions.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.spark.extensions; import java.math.BigDecimal; @@ -31,7 +30,8 @@ public class TestIcebergExpressions extends SparkExtensionsTestBase { - public TestIcebergExpressions(String catalogName, String implementation, Map config) { + public TestIcebergExpressions( + String catalogName, String implementation, Map config) { super(catalogName, implementation, config); } @@ -44,26 +44,30 @@ public void removeTables() { @Test public void testTruncateExpressions() { - sql("CREATE TABLE %s ( " + - " int_c INT, long_c LONG, dec_c DECIMAL(4, 2), str_c STRING, binary_c BINARY " + - ") USING iceberg", tableName); + sql( + "CREATE TABLE %s ( " + + " int_c INT, long_c LONG, dec_c DECIMAL(4, 2), str_c STRING, binary_c BINARY " + + ") USING iceberg", + tableName); - sql("CREATE TEMPORARY VIEW emp " + - "AS SELECT * FROM VALUES (101, 10001, 10.65, '101-Employee', CAST('1234' AS BINARY)) " + - "AS EMP(int_c, long_c, dec_c, str_c, binary_c)"); + sql( + "CREATE TEMPORARY VIEW emp " + + "AS SELECT * FROM VALUES (101, 10001, 10.65, '101-Employee', CAST('1234' AS BINARY)) " + + "AS EMP(int_c, long_c, dec_c, str_c, binary_c)"); sql("INSERT INTO %s SELECT * FROM emp", tableName); Dataset df = spark.sql("SELECT * FROM " + tableName); df.select( - new Column(new IcebergTruncateTransform(df.col("int_c").expr(), 2)).as("int_c"), - new Column(new IcebergTruncateTransform(df.col("long_c").expr(), 2)).as("long_c"), - new Column(new IcebergTruncateTransform(df.col("dec_c").expr(), 50)).as("dec_c"), - new Column(new IcebergTruncateTransform(df.col("str_c").expr(), 2)).as("str_c"), - new Column(new IcebergTruncateTransform(df.col("binary_c").expr(), 2)).as("binary_c") - ).createOrReplaceTempView("v"); + new Column(new IcebergTruncateTransform(df.col("int_c").expr(), 2)).as("int_c"), + new Column(new IcebergTruncateTransform(df.col("long_c").expr(), 2)).as("long_c"), + new Column(new IcebergTruncateTransform(df.col("dec_c").expr(), 50)).as("dec_c"), + new Column(new IcebergTruncateTransform(df.col("str_c").expr(), 2)).as("str_c"), + new Column(new IcebergTruncateTransform(df.col("binary_c").expr(), 2)).as("binary_c")) + .createOrReplaceTempView("v"); - assertEquals("Should have expected rows", + assertEquals( + "Should have expected rows", ImmutableList.of(row(100, 10000L, new BigDecimal("10.50"), "10", "12")), sql("SELECT int_c, long_c, dec_c, str_c, CAST(binary_c AS STRING) FROM v")); } diff --git a/spark/v3.3/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestMerge.java b/spark/v3.3/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestMerge.java index 52f7efceb74a..c485dbfe2f93 100644 --- a/spark/v3.3/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestMerge.java +++ b/spark/v3.3/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestMerge.java @@ -16,9 +16,14 @@ * specific language governing permissions and limitations * under the License. 
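The TestIcebergExpressions hunk just reflowed above drives Iceberg's truncate transform directly through a Catalyst expression; condensed, with the column names and widths that test uses, the idea is:

Dataset<Row> df = spark.sql("SELECT * FROM " + tableName);
df.select(
        // truncate(101, 2) -> 100, truncate('101-Employee', 2) -> '10', truncate(10.65, 50) -> 10.50
        new Column(new IcebergTruncateTransform(df.col("int_c").expr(), 2)).as("int_c"),
        new Column(new IcebergTruncateTransform(df.col("dec_c").expr(), 50)).as("dec_c"),
        new Column(new IcebergTruncateTransform(df.col("str_c").expr(), 2)).as("str_c"))
    .createOrReplaceTempView("v");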
*/ - package org.apache.iceberg.spark.extensions; +import static org.apache.iceberg.TableProperties.MERGE_ISOLATION_LEVEL; +import static org.apache.iceberg.TableProperties.PARQUET_ROW_GROUP_SIZE_BYTES; +import static org.apache.iceberg.TableProperties.SPLIT_SIZE; +import static org.apache.iceberg.TableProperties.WRITE_DISTRIBUTION_MODE; +import static org.apache.spark.sql.functions.lit; + import java.util.Arrays; import java.util.Collections; import java.util.List; @@ -54,16 +59,15 @@ import org.junit.BeforeClass; import org.junit.Test; -import static org.apache.iceberg.TableProperties.MERGE_ISOLATION_LEVEL; -import static org.apache.iceberg.TableProperties.PARQUET_ROW_GROUP_SIZE_BYTES; -import static org.apache.iceberg.TableProperties.SPLIT_SIZE; -import static org.apache.iceberg.TableProperties.WRITE_DISTRIBUTION_MODE; -import static org.apache.spark.sql.functions.lit; - public abstract class TestMerge extends SparkRowLevelOperationsTestBase { - public TestMerge(String catalogName, String implementation, Map config, - String fileFormat, boolean vectorized, String distributionMode) { + public TestMerge( + String catalogName, + String implementation, + Map config, + String fileFormat, + boolean vectorized, + String distributionMode) { super(catalogName, implementation, config, fileFormat, vectorized, distributionMode); } @@ -96,238 +100,293 @@ public void testMergeWithStaticPredicatePushDown() { String dataFilesCount = snapshot.summary().get(SnapshotSummary.TOTAL_DATA_FILES_PROP); Assert.assertEquals("Must have 2 files before MERGE", "2", dataFilesCount); - createOrReplaceView("source", - "{ \"id\": 1, \"dep\": \"finance\" }\n" + - "{ \"id\": 2, \"dep\": \"hardware\" }"); + createOrReplaceView( + "source", "{ \"id\": 1, \"dep\": \"finance\" }\n" + "{ \"id\": 2, \"dep\": \"hardware\" }"); // remove the data file from the 'hr' partition to ensure it is not scanned - withUnavailableFiles(snapshot.addedDataFiles(table.io()), () -> { - // disable dynamic pruning and rely only on static predicate pushdown - withSQLConf(ImmutableMap.of(SQLConf.DYNAMIC_PARTITION_PRUNING_ENABLED().key(), "false"), () -> { - sql("MERGE INTO %s t USING source " + - "ON t.id == source.id AND t.dep IN ('software') AND source.id < 10 " + - "WHEN MATCHED AND source.id = 1 THEN " + - " UPDATE SET dep = source.dep " + - "WHEN NOT MATCHED THEN " + - " INSERT (dep, id) VALUES (source.dep, source.id)", tableName); - }); - }); - - ImmutableList expectedRows = ImmutableList.of( - row(1L, "finance"), // updated - row(1L, "hr"), // kept - row(2L, "hardware") // new - ); - assertEquals("Output should match", expectedRows, sql("SELECT * FROM %s ORDER BY id, dep", tableName)); + withUnavailableFiles( + snapshot.addedDataFiles(table.io()), + () -> { + // disable dynamic pruning and rely only on static predicate pushdown + withSQLConf( + ImmutableMap.of(SQLConf.DYNAMIC_PARTITION_PRUNING_ENABLED().key(), "false"), + () -> { + sql( + "MERGE INTO %s t USING source " + + "ON t.id == source.id AND t.dep IN ('software') AND source.id < 10 " + + "WHEN MATCHED AND source.id = 1 THEN " + + " UPDATE SET dep = source.dep " + + "WHEN NOT MATCHED THEN " + + " INSERT (dep, id) VALUES (source.dep, source.id)", + tableName); + }); + }); + + ImmutableList expectedRows = + ImmutableList.of( + row(1L, "finance"), // updated + row(1L, "hr"), // kept + row(2L, "hardware") // new + ); + assertEquals( + "Output should match", expectedRows, sql("SELECT * FROM %s ORDER BY id, dep", tableName)); } @Test public void 
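testMergeWithStaticPredicatePushDown (re-wrapped above) makes the 'hr' partition's data file unavailable and disables dynamic partition pruning, so the MERGE can only succeed if the literal predicates in the ON clause are pushed down statically and the 'hr' partition is never scanned. The statement at the heart of it:

sql(
    "MERGE INTO %s t USING source "
        // t.dep IN ('software') and source.id < 10 are static and prune partitions up front
        + "ON t.id == source.id AND t.dep IN ('software') AND source.id < 10 "
        + "WHEN MATCHED AND source.id = 1 THEN "
        + " UPDATE SET dep = source.dep "
        + "WHEN NOT MATCHED THEN "
        + " INSERT (dep, id) VALUES (source.dep, source.id)",
    tableName);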
testMergeIntoEmptyTargetInsertAllNonMatchingRows() { createAndInitTable("id INT, dep STRING"); - createOrReplaceView("source", "id INT, dep STRING", - "{ \"id\": 1, \"dep\": \"emp-id-1\" }\n" + - "{ \"id\": 2, \"dep\": \"emp-id-2\" }\n" + - "{ \"id\": 3, \"dep\": \"emp-id-3\" }"); - - sql("MERGE INTO %s AS t USING source AS s " + - "ON t.id == s.id " + - "WHEN NOT MATCHED THEN " + - " INSERT *", tableName); - - ImmutableList expectedRows = ImmutableList.of( - row(1, "emp-id-1"), // new - row(2, "emp-id-2"), // new - row(3, "emp-id-3") // new - ); - assertEquals("Should have expected rows", expectedRows, sql("SELECT * FROM %s ORDER BY id", tableName)); + createOrReplaceView( + "source", + "id INT, dep STRING", + "{ \"id\": 1, \"dep\": \"emp-id-1\" }\n" + + "{ \"id\": 2, \"dep\": \"emp-id-2\" }\n" + + "{ \"id\": 3, \"dep\": \"emp-id-3\" }"); + + sql( + "MERGE INTO %s AS t USING source AS s " + + "ON t.id == s.id " + + "WHEN NOT MATCHED THEN " + + " INSERT *", + tableName); + + ImmutableList expectedRows = + ImmutableList.of( + row(1, "emp-id-1"), // new + row(2, "emp-id-2"), // new + row(3, "emp-id-3") // new + ); + assertEquals( + "Should have expected rows", expectedRows, sql("SELECT * FROM %s ORDER BY id", tableName)); } @Test public void testMergeIntoEmptyTargetInsertOnlyMatchingRows() { createAndInitTable("id INT, dep STRING"); - createOrReplaceView("source", "id INT, dep STRING", - "{ \"id\": 1, \"dep\": \"emp-id-1\" }\n" + - "{ \"id\": 2, \"dep\": \"emp-id-2\" }\n" + - "{ \"id\": 3, \"dep\": \"emp-id-3\" }"); - - sql("MERGE INTO %s AS t USING source AS s " + - "ON t.id == s.id " + - "WHEN NOT MATCHED AND (s.id >=2) THEN " + - " INSERT *", tableName); - - ImmutableList expectedRows = ImmutableList.of( - row(2, "emp-id-2"), // new - row(3, "emp-id-3") // new - ); - assertEquals("Should have expected rows", expectedRows, sql("SELECT * FROM %s ORDER BY id", tableName)); + createOrReplaceView( + "source", + "id INT, dep STRING", + "{ \"id\": 1, \"dep\": \"emp-id-1\" }\n" + + "{ \"id\": 2, \"dep\": \"emp-id-2\" }\n" + + "{ \"id\": 3, \"dep\": \"emp-id-3\" }"); + + sql( + "MERGE INTO %s AS t USING source AS s " + + "ON t.id == s.id " + + "WHEN NOT MATCHED AND (s.id >=2) THEN " + + " INSERT *", + tableName); + + ImmutableList expectedRows = + ImmutableList.of( + row(2, "emp-id-2"), // new + row(3, "emp-id-3") // new + ); + assertEquals( + "Should have expected rows", expectedRows, sql("SELECT * FROM %s ORDER BY id", tableName)); } @Test public void testMergeWithOnlyUpdateClause() { - createAndInitTable("id INT, dep STRING", - "{ \"id\": 1, \"dep\": \"emp-id-one\" }\n" + - "{ \"id\": 6, \"dep\": \"emp-id-six\" }"); - - createOrReplaceView("source", "id INT, dep STRING", - "{ \"id\": 2, \"dep\": \"emp-id-2\" }\n" + - "{ \"id\": 1, \"dep\": \"emp-id-1\" }\n" + - "{ \"id\": 6, \"dep\": \"emp-id-6\" }"); - - sql("MERGE INTO %s AS t USING source AS s " + - "ON t.id == s.id " + - "WHEN MATCHED AND t.id = 1 THEN " + - " UPDATE SET *", tableName); - - ImmutableList expectedRows = ImmutableList.of( - row(1, "emp-id-1"), // updated - row(6, "emp-id-six") // kept - ); - assertEquals("Should have expected rows", expectedRows, sql("SELECT * FROM %s ORDER BY id", tableName)); + createAndInitTable( + "id INT, dep STRING", + "{ \"id\": 1, \"dep\": \"emp-id-one\" }\n" + "{ \"id\": 6, \"dep\": \"emp-id-six\" }"); + + createOrReplaceView( + "source", + "id INT, dep STRING", + "{ \"id\": 2, \"dep\": \"emp-id-2\" }\n" + + "{ \"id\": 1, \"dep\": \"emp-id-1\" }\n" + + "{ \"id\": 6, \"dep\": \"emp-id-6\" }"); + + 
sql( + "MERGE INTO %s AS t USING source AS s " + + "ON t.id == s.id " + + "WHEN MATCHED AND t.id = 1 THEN " + + " UPDATE SET *", + tableName); + + ImmutableList expectedRows = + ImmutableList.of( + row(1, "emp-id-1"), // updated + row(6, "emp-id-six") // kept + ); + assertEquals( + "Should have expected rows", expectedRows, sql("SELECT * FROM %s ORDER BY id", tableName)); } @Test public void testMergeWithOnlyDeleteClause() { - createAndInitTable("id INT, dep STRING", - "{ \"id\": 1, \"dep\": \"emp-id-one\" }\n" + - "{ \"id\": 6, \"dep\": \"emp-id-6\" }"); - - createOrReplaceView("source", "id INT, dep STRING", - "{ \"id\": 2, \"dep\": \"emp-id-2\" }\n" + - "{ \"id\": 1, \"dep\": \"emp-id-1\" }\n" + - "{ \"id\": 6, \"dep\": \"emp-id-6\" }"); - - sql("MERGE INTO %s AS t USING source AS s " + - "ON t.id == s.id " + - "WHEN MATCHED AND t.id = 6 THEN " + - " DELETE", tableName); - - ImmutableList expectedRows = ImmutableList.of( - row(1, "emp-id-one") // kept - ); - assertEquals("Should have expected rows", expectedRows, sql("SELECT * FROM %s ORDER BY id", tableName)); + createAndInitTable( + "id INT, dep STRING", + "{ \"id\": 1, \"dep\": \"emp-id-one\" }\n" + "{ \"id\": 6, \"dep\": \"emp-id-6\" }"); + + createOrReplaceView( + "source", + "id INT, dep STRING", + "{ \"id\": 2, \"dep\": \"emp-id-2\" }\n" + + "{ \"id\": 1, \"dep\": \"emp-id-1\" }\n" + + "{ \"id\": 6, \"dep\": \"emp-id-6\" }"); + + sql( + "MERGE INTO %s AS t USING source AS s " + + "ON t.id == s.id " + + "WHEN MATCHED AND t.id = 6 THEN " + + " DELETE", + tableName); + + ImmutableList expectedRows = + ImmutableList.of( + row(1, "emp-id-one") // kept + ); + assertEquals( + "Should have expected rows", expectedRows, sql("SELECT * FROM %s ORDER BY id", tableName)); } @Test public void testMergeWithAllCauses() { - createAndInitTable("id INT, dep STRING", - "{ \"id\": 1, \"dep\": \"emp-id-one\" }\n" + - "{ \"id\": 6, \"dep\": \"emp-id-6\" }"); - - createOrReplaceView("source", "id INT, dep STRING", - "{ \"id\": 2, \"dep\": \"emp-id-2\" }\n" + - "{ \"id\": 1, \"dep\": \"emp-id-1\" }\n" + - "{ \"id\": 6, \"dep\": \"emp-id-6\" }"); - - sql("MERGE INTO %s AS t USING source AS s " + - "ON t.id == s.id " + - "WHEN MATCHED AND t.id = 1 THEN " + - " UPDATE SET * " + - "WHEN MATCHED AND t.id = 6 THEN " + - " DELETE " + - "WHEN NOT MATCHED AND s.id = 2 THEN " + - " INSERT *", tableName); - - ImmutableList expectedRows = ImmutableList.of( - row(1, "emp-id-1"), // updated - row(2, "emp-id-2") // new - ); - assertEquals("Should have expected rows", expectedRows, sql("SELECT * FROM %s ORDER BY id", tableName)); + createAndInitTable( + "id INT, dep STRING", + "{ \"id\": 1, \"dep\": \"emp-id-one\" }\n" + "{ \"id\": 6, \"dep\": \"emp-id-6\" }"); + + createOrReplaceView( + "source", + "id INT, dep STRING", + "{ \"id\": 2, \"dep\": \"emp-id-2\" }\n" + + "{ \"id\": 1, \"dep\": \"emp-id-1\" }\n" + + "{ \"id\": 6, \"dep\": \"emp-id-6\" }"); + + sql( + "MERGE INTO %s AS t USING source AS s " + + "ON t.id == s.id " + + "WHEN MATCHED AND t.id = 1 THEN " + + " UPDATE SET * " + + "WHEN MATCHED AND t.id = 6 THEN " + + " DELETE " + + "WHEN NOT MATCHED AND s.id = 2 THEN " + + " INSERT *", + tableName); + + ImmutableList expectedRows = + ImmutableList.of( + row(1, "emp-id-1"), // updated + row(2, "emp-id-2") // new + ); + assertEquals( + "Should have expected rows", expectedRows, sql("SELECT * FROM %s ORDER BY id", tableName)); } @Test public void testMergeWithAllCausesWithExplicitColumnSpecification() { - createAndInitTable("id INT, dep STRING", - "{ \"id\": 1, 
\"dep\": \"emp-id-one\" }\n" + - "{ \"id\": 6, \"dep\": \"emp-id-6\" }"); - - createOrReplaceView("source", "id INT, dep STRING", - "{ \"id\": 2, \"dep\": \"emp-id-2\" }\n" + - "{ \"id\": 1, \"dep\": \"emp-id-1\" }\n" + - "{ \"id\": 6, \"dep\": \"emp-id-6\" }"); - - sql("MERGE INTO %s AS t USING source AS s " + - "ON t.id == s.id " + - "WHEN MATCHED AND t.id = 1 THEN " + - " UPDATE SET t.id = s.id, t.dep = s.dep " + - "WHEN MATCHED AND t.id = 6 THEN " + - " DELETE " + - "WHEN NOT MATCHED AND s.id = 2 THEN " + - " INSERT (t.id, t.dep) VALUES (s.id, s.dep)", tableName); - - ImmutableList expectedRows = ImmutableList.of( - row(1, "emp-id-1"), // updated - row(2, "emp-id-2") // new - ); - assertEquals("Should have expected rows", expectedRows, sql("SELECT * FROM %s ORDER BY id", tableName)); + createAndInitTable( + "id INT, dep STRING", + "{ \"id\": 1, \"dep\": \"emp-id-one\" }\n" + "{ \"id\": 6, \"dep\": \"emp-id-6\" }"); + + createOrReplaceView( + "source", + "id INT, dep STRING", + "{ \"id\": 2, \"dep\": \"emp-id-2\" }\n" + + "{ \"id\": 1, \"dep\": \"emp-id-1\" }\n" + + "{ \"id\": 6, \"dep\": \"emp-id-6\" }"); + + sql( + "MERGE INTO %s AS t USING source AS s " + + "ON t.id == s.id " + + "WHEN MATCHED AND t.id = 1 THEN " + + " UPDATE SET t.id = s.id, t.dep = s.dep " + + "WHEN MATCHED AND t.id = 6 THEN " + + " DELETE " + + "WHEN NOT MATCHED AND s.id = 2 THEN " + + " INSERT (t.id, t.dep) VALUES (s.id, s.dep)", + tableName); + + ImmutableList expectedRows = + ImmutableList.of( + row(1, "emp-id-1"), // updated + row(2, "emp-id-2") // new + ); + assertEquals( + "Should have expected rows", expectedRows, sql("SELECT * FROM %s ORDER BY id", tableName)); } @Test public void testMergeWithSourceCTE() { - createAndInitTable("id INT, dep STRING", - "{ \"id\": 2, \"dep\": \"emp-id-two\" }\n" + - "{ \"id\": 6, \"dep\": \"emp-id-6\" }"); - - createOrReplaceView("source", "id INT, dep STRING", - "{ \"id\": 2, \"dep\": \"emp-id-3\" }\n" + - "{ \"id\": 1, \"dep\": \"emp-id-2\" }\n" + - "{ \"id\": 5, \"dep\": \"emp-id-6\" }"); - - sql("WITH cte1 AS (SELECT id + 1 AS id, dep FROM source) " + - "MERGE INTO %s AS t USING cte1 AS s " + - "ON t.id == s.id " + - "WHEN MATCHED AND t.id = 2 THEN " + - " UPDATE SET * " + - "WHEN MATCHED AND t.id = 6 THEN " + - " DELETE " + - "WHEN NOT MATCHED AND s.id = 3 THEN " + - " INSERT *", tableName); - - ImmutableList expectedRows = ImmutableList.of( - row(2, "emp-id-2"), // updated - row(3, "emp-id-3") // new - ); - assertEquals("Should have expected rows", expectedRows, sql("SELECT * FROM %s ORDER BY id", tableName)); + createAndInitTable( + "id INT, dep STRING", + "{ \"id\": 2, \"dep\": \"emp-id-two\" }\n" + "{ \"id\": 6, \"dep\": \"emp-id-6\" }"); + + createOrReplaceView( + "source", + "id INT, dep STRING", + "{ \"id\": 2, \"dep\": \"emp-id-3\" }\n" + + "{ \"id\": 1, \"dep\": \"emp-id-2\" }\n" + + "{ \"id\": 5, \"dep\": \"emp-id-6\" }"); + + sql( + "WITH cte1 AS (SELECT id + 1 AS id, dep FROM source) " + + "MERGE INTO %s AS t USING cte1 AS s " + + "ON t.id == s.id " + + "WHEN MATCHED AND t.id = 2 THEN " + + " UPDATE SET * " + + "WHEN MATCHED AND t.id = 6 THEN " + + " DELETE " + + "WHEN NOT MATCHED AND s.id = 3 THEN " + + " INSERT *", + tableName); + + ImmutableList expectedRows = + ImmutableList.of( + row(2, "emp-id-2"), // updated + row(3, "emp-id-3") // new + ); + assertEquals( + "Should have expected rows", expectedRows, sql("SELECT * FROM %s ORDER BY id", tableName)); } @Test public void testMergeWithSourceFromSetOps() { - createAndInitTable("id INT, dep STRING", - "{ 
\"id\": 1, \"dep\": \"emp-id-one\" }\n" + - "{ \"id\": 6, \"dep\": \"emp-id-6\" }"); + createAndInitTable( + "id INT, dep STRING", + "{ \"id\": 1, \"dep\": \"emp-id-one\" }\n" + "{ \"id\": 6, \"dep\": \"emp-id-6\" }"); - createOrReplaceView("source", "id INT, dep STRING", - "{ \"id\": 2, \"dep\": \"emp-id-2\" }\n" + - "{ \"id\": 1, \"dep\": \"emp-id-1\" }\n" + - "{ \"id\": 6, \"dep\": \"emp-id-6\" }"); + createOrReplaceView( + "source", + "id INT, dep STRING", + "{ \"id\": 2, \"dep\": \"emp-id-2\" }\n" + + "{ \"id\": 1, \"dep\": \"emp-id-1\" }\n" + + "{ \"id\": 6, \"dep\": \"emp-id-6\" }"); String derivedSource = - "SELECT * FROM source WHERE id = 2 " + - "UNION ALL " + - "SELECT * FROM source WHERE id = 1 OR id = 6"; - - sql("MERGE INTO %s AS t USING (%s) AS s " + - "ON t.id == s.id " + - "WHEN MATCHED AND t.id = 1 THEN " + - " UPDATE SET * " + - "WHEN MATCHED AND t.id = 6 THEN " + - " DELETE " + - "WHEN NOT MATCHED AND s.id = 2 THEN " + - " INSERT *", tableName, derivedSource); - - ImmutableList expectedRows = ImmutableList.of( - row(1, "emp-id-1"), // updated - row(2, "emp-id-2") // new - ); - assertEquals("Should have expected rows", expectedRows, sql("SELECT * FROM %s ORDER BY id", tableName)); + "SELECT * FROM source WHERE id = 2 " + + "UNION ALL " + + "SELECT * FROM source WHERE id = 1 OR id = 6"; + + sql( + "MERGE INTO %s AS t USING (%s) AS s " + + "ON t.id == s.id " + + "WHEN MATCHED AND t.id = 1 THEN " + + " UPDATE SET * " + + "WHEN MATCHED AND t.id = 6 THEN " + + " DELETE " + + "WHEN NOT MATCHED AND s.id = 2 THEN " + + " INSERT *", + tableName, derivedSource); + + ImmutableList expectedRows = + ImmutableList.of( + row(1, "emp-id-1"), // updated + row(2, "emp-id-2") // new + ); + assertEquals( + "Should have expected rows", expectedRows, sql("SELECT * FROM %s ORDER BY id", tableName)); } @Test public void testMergeWithMultipleUpdatesForTargetRowSmallTargetLargeSource() { - createAndInitTable("id INT, dep STRING", - "{ \"id\": 1, \"dep\": \"emp-id-one\" }\n" + - "{ \"id\": 6, \"dep\": \"emp-id-6\" }"); + createAndInitTable( + "id INT, dep STRING", + "{ \"id\": 1, \"dep\": \"emp-id-one\" }\n" + "{ \"id\": 6, \"dep\": \"emp-id-6\" }"); List sourceIds = Lists.newArrayList(); for (int i = 0; i < 10_000; i++) { @@ -337,29 +396,35 @@ public void testMergeWithMultipleUpdatesForTargetRowSmallTargetLargeSource() { ds.union(ds).createOrReplaceTempView("source"); String errorMsg = "a single row from the target table with multiple rows of the source table"; - AssertHelpers.assertThrowsCause("Should complain about multiple matches", - SparkException.class, errorMsg, + AssertHelpers.assertThrowsCause( + "Should complain about multiple matches", + SparkException.class, + errorMsg, () -> { - sql("MERGE INTO %s AS t USING source AS s " + - "ON t.id == s.value " + - "WHEN MATCHED AND t.id = 1 THEN " + - " UPDATE SET id = 10 " + - "WHEN MATCHED AND t.id = 6 THEN " + - " DELETE " + - "WHEN NOT MATCHED AND s.value = 2 THEN " + - " INSERT (id, dep) VALUES (s.value, null)", tableName); + sql( + "MERGE INTO %s AS t USING source AS s " + + "ON t.id == s.value " + + "WHEN MATCHED AND t.id = 1 THEN " + + " UPDATE SET id = 10 " + + "WHEN MATCHED AND t.id = 6 THEN " + + " DELETE " + + "WHEN NOT MATCHED AND s.value = 2 THEN " + + " INSERT (id, dep) VALUES (s.value, null)", + tableName); }); - assertEquals("Target should be unchanged", + assertEquals( + "Target should be unchanged", ImmutableList.of(row(1, "emp-id-one"), row(6, "emp-id-6")), sql("SELECT * FROM %s ORDER BY id ASC NULLS LAST", tableName)); } 
@Test - public void testMergeWithMultipleUpdatesForTargetRowSmallTargetLargeSourceEnabledHashShuffleJoin() { - createAndInitTable("id INT, dep STRING", - "{ \"id\": 1, \"dep\": \"emp-id-one\" }\n" + - "{ \"id\": 6, \"dep\": \"emp-id-6\" }"); + public void + testMergeWithMultipleUpdatesForTargetRowSmallTargetLargeSourceEnabledHashShuffleJoin() { + createAndInitTable( + "id INT, dep STRING", + "{ \"id\": 1, \"dep\": \"emp-id-one\" }\n" + "{ \"id\": 6, \"dep\": \"emp-id-6\" }"); List sourceIds = Lists.newArrayList(); for (int i = 0; i < 10_000; i++) { @@ -368,23 +433,31 @@ public void testMergeWithMultipleUpdatesForTargetRowSmallTargetLargeSourceEnable Dataset ds = spark.createDataset(sourceIds, Encoders.INT()); ds.union(ds).createOrReplaceTempView("source"); - withSQLConf(ImmutableMap.of(SQLConf.PREFER_SORTMERGEJOIN().key(), "false"), () -> { - String errorMsg = "a single row from the target table with multiple rows of the source table"; - AssertHelpers.assertThrowsCause("Should complain about multiple matches", - SparkException.class, errorMsg, - () -> { - sql("MERGE INTO %s AS t USING source AS s " + - "ON t.id == s.value " + - "WHEN MATCHED AND t.id = 1 THEN " + - " UPDATE SET id = 10 " + - "WHEN MATCHED AND t.id = 6 THEN " + - " DELETE " + - "WHEN NOT MATCHED AND s.value = 2 THEN " + - " INSERT (id, dep) VALUES (s.value, null)", tableName); - }); - }); + withSQLConf( + ImmutableMap.of(SQLConf.PREFER_SORTMERGEJOIN().key(), "false"), + () -> { + String errorMsg = + "a single row from the target table with multiple rows of the source table"; + AssertHelpers.assertThrowsCause( + "Should complain about multiple matches", + SparkException.class, + errorMsg, + () -> { + sql( + "MERGE INTO %s AS t USING source AS s " + + "ON t.id == s.value " + + "WHEN MATCHED AND t.id = 1 THEN " + + " UPDATE SET id = 10 " + + "WHEN MATCHED AND t.id = 6 THEN " + + " DELETE " + + "WHEN NOT MATCHED AND s.value = 2 THEN " + + " INSERT (id, dep) VALUES (s.value, null)", + tableName); + }); + }); - assertEquals("Target should be unchanged", + assertEquals( + "Target should be unchanged", ImmutableList.of(row(1, "emp-id-one"), row(6, "emp-id-6")), sql("SELECT * FROM %s ORDER BY id ASC NULLS LAST", tableName)); } @@ -400,32 +473,40 @@ public void testMergeWithMultipleUpdatesForTargetRowSmallTargetLargeSourceNoEqua Dataset ds = spark.createDataset(sourceIds, Encoders.INT()); ds.union(ds).createOrReplaceTempView("source"); - withSQLConf(ImmutableMap.of(SQLConf.PREFER_SORTMERGEJOIN().key(), "false"), () -> { - String errorMsg = "a single row from the target table with multiple rows of the source table"; - AssertHelpers.assertThrowsCause("Should complain about multiple matches", - SparkException.class, errorMsg, - () -> { - sql("MERGE INTO %s AS t USING source AS s " + - "ON t.id > s.value " + - "WHEN MATCHED AND t.id = 1 THEN " + - " UPDATE SET id = 10 " + - "WHEN MATCHED AND t.id = 6 THEN " + - " DELETE " + - "WHEN NOT MATCHED AND s.value = 2 THEN " + - " INSERT (id, dep) VALUES (s.value, null)", tableName); - }); - }); + withSQLConf( + ImmutableMap.of(SQLConf.PREFER_SORTMERGEJOIN().key(), "false"), + () -> { + String errorMsg = + "a single row from the target table with multiple rows of the source table"; + AssertHelpers.assertThrowsCause( + "Should complain about multiple matches", + SparkException.class, + errorMsg, + () -> { + sql( + "MERGE INTO %s AS t USING source AS s " + + "ON t.id > s.value " + + "WHEN MATCHED AND t.id = 1 THEN " + + " UPDATE SET id = 10 " + + "WHEN MATCHED AND t.id = 6 THEN " + + " DELETE " + + 
"WHEN NOT MATCHED AND s.value = 2 THEN " + + " INSERT (id, dep) VALUES (s.value, null)", + tableName); + }); + }); - assertEquals("Target should be unchanged", + assertEquals( + "Target should be unchanged", ImmutableList.of(row(1, "emp-id-one")), sql("SELECT * FROM %s ORDER BY id ASC NULLS LAST", tableName)); } @Test public void testMergeWithMultipleUpdatesForTargetRowSmallTargetLargeSourceNoNotMatchedActions() { - createAndInitTable("id INT, dep STRING", - "{ \"id\": 1, \"dep\": \"emp-id-one\" }\n" + - "{ \"id\": 6, \"dep\": \"emp-id-6\" }"); + createAndInitTable( + "id INT, dep STRING", + "{ \"id\": 1, \"dep\": \"emp-id-one\" }\n" + "{ \"id\": 6, \"dep\": \"emp-id-6\" }"); List sourceIds = Lists.newArrayList(); for (int i = 0; i < 10_000; i++) { @@ -435,24 +516,30 @@ public void testMergeWithMultipleUpdatesForTargetRowSmallTargetLargeSourceNoNotM ds.union(ds).createOrReplaceTempView("source"); String errorMsg = "a single row from the target table with multiple rows of the source table"; - AssertHelpers.assertThrowsCause("Should complain about multiple matches", - SparkException.class, errorMsg, + AssertHelpers.assertThrowsCause( + "Should complain about multiple matches", + SparkException.class, + errorMsg, () -> { - sql("MERGE INTO %s AS t USING source AS s " + - "ON t.id == s.value " + - "WHEN MATCHED AND t.id = 1 THEN " + - " UPDATE SET id = 10 " + - "WHEN MATCHED AND t.id = 6 THEN " + - " DELETE", tableName); + sql( + "MERGE INTO %s AS t USING source AS s " + + "ON t.id == s.value " + + "WHEN MATCHED AND t.id = 1 THEN " + + " UPDATE SET id = 10 " + + "WHEN MATCHED AND t.id = 6 THEN " + + " DELETE", + tableName); }); - assertEquals("Target should be unchanged", + assertEquals( + "Target should be unchanged", ImmutableList.of(row(1, "emp-id-one"), row(6, "emp-id-6")), sql("SELECT * FROM %s ORDER BY id ASC NULLS LAST", tableName)); } @Test - public void testMergeWithMultipleUpdatesForTargetRowSmallTargetLargeSourceNoNotMatchedActionsNoEqualityCondition() { + public void + testMergeWithMultipleUpdatesForTargetRowSmallTargetLargeSourceNoNotMatchedActionsNoEqualityCondition() { createAndInitTable("id INT, dep STRING", "{ \"id\": 1, \"dep\": \"emp-id-one\" }"); List sourceIds = Lists.newArrayList(); @@ -463,103 +550,128 @@ public void testMergeWithMultipleUpdatesForTargetRowSmallTargetLargeSourceNoNotM ds.union(ds).createOrReplaceTempView("source"); String errorMsg = "a single row from the target table with multiple rows of the source table"; - AssertHelpers.assertThrowsCause("Should complain about multiple matches", - SparkException.class, errorMsg, + AssertHelpers.assertThrowsCause( + "Should complain about multiple matches", + SparkException.class, + errorMsg, () -> { - sql("MERGE INTO %s AS t USING source AS s " + - "ON t.id > s.value " + - "WHEN MATCHED AND t.id = 1 THEN " + - " UPDATE SET id = 10 " + - "WHEN MATCHED AND t.id = 6 THEN " + - " DELETE", tableName); + sql( + "MERGE INTO %s AS t USING source AS s " + + "ON t.id > s.value " + + "WHEN MATCHED AND t.id = 1 THEN " + + " UPDATE SET id = 10 " + + "WHEN MATCHED AND t.id = 6 THEN " + + " DELETE", + tableName); }); - assertEquals("Target should be unchanged", + assertEquals( + "Target should be unchanged", ImmutableList.of(row(1, "emp-id-one")), sql("SELECT * FROM %s ORDER BY id ASC NULLS LAST", tableName)); } @Test public void testMergeWithMultipleUpdatesForTargetRow() { - createAndInitTable("id INT, dep STRING", - "{ \"id\": 1, \"dep\": \"emp-id-one\" }\n" + - "{ \"id\": 6, \"dep\": \"emp-id-6\" }"); + createAndInitTable( + "id 
INT, dep STRING", + "{ \"id\": 1, \"dep\": \"emp-id-one\" }\n" + "{ \"id\": 6, \"dep\": \"emp-id-6\" }"); - createOrReplaceView("source", "id INT, dep STRING", - "{ \"id\": 1, \"dep\": \"emp-id-1\" }\n" + - "{ \"id\": 1, \"dep\": \"emp-id-1\" }\n" + - "{ \"id\": 2, \"dep\": \"emp-id-2\" }\n" + - "{ \"id\": 6, \"dep\": \"emp-id-6\" }"); + createOrReplaceView( + "source", + "id INT, dep STRING", + "{ \"id\": 1, \"dep\": \"emp-id-1\" }\n" + + "{ \"id\": 1, \"dep\": \"emp-id-1\" }\n" + + "{ \"id\": 2, \"dep\": \"emp-id-2\" }\n" + + "{ \"id\": 6, \"dep\": \"emp-id-6\" }"); String errorMsg = "a single row from the target table with multiple rows of the source table"; - AssertHelpers.assertThrowsCause("Should complain about multiple matches", - SparkException.class, errorMsg, + AssertHelpers.assertThrowsCause( + "Should complain about multiple matches", + SparkException.class, + errorMsg, () -> { - sql("MERGE INTO %s AS t USING source AS s " + - "ON t.id == s.id " + - "WHEN MATCHED AND t.id = 1 THEN " + - " UPDATE SET * " + - "WHEN MATCHED AND t.id = 6 THEN " + - " DELETE " + - "WHEN NOT MATCHED AND s.id = 2 THEN " + - " INSERT *", tableName); + sql( + "MERGE INTO %s AS t USING source AS s " + + "ON t.id == s.id " + + "WHEN MATCHED AND t.id = 1 THEN " + + " UPDATE SET * " + + "WHEN MATCHED AND t.id = 6 THEN " + + " DELETE " + + "WHEN NOT MATCHED AND s.id = 2 THEN " + + " INSERT *", + tableName); }); - assertEquals("Target should be unchanged", + assertEquals( + "Target should be unchanged", ImmutableList.of(row(1, "emp-id-one"), row(6, "emp-id-6")), sql("SELECT * FROM %s ORDER BY id ASC NULLS LAST", tableName)); } @Test public void testMergeWithUnconditionalDelete() { - createAndInitTable("id INT, dep STRING", - "{ \"id\": 1, \"dep\": \"emp-id-one\" }\n" + - "{ \"id\": 6, \"dep\": \"emp-id-6\" }"); - - createOrReplaceView("source", "id INT, dep STRING", - "{ \"id\": 1, \"dep\": \"emp-id-1\" }\n" + - "{ \"id\": 1, \"dep\": \"emp-id-1\" }\n" + - "{ \"id\": 2, \"dep\": \"emp-id-2\" }\n" + - "{ \"id\": 6, \"dep\": \"emp-id-6\" }"); - - sql("MERGE INTO %s AS t USING source AS s " + - "ON t.id == s.id " + - "WHEN MATCHED THEN " + - " DELETE " + - "WHEN NOT MATCHED AND s.id = 2 THEN " + - " INSERT *", tableName); - - ImmutableList expectedRows = ImmutableList.of( - row(2, "emp-id-2") // new - ); - assertEquals("Should have expected rows", expectedRows, sql("SELECT * FROM %s ORDER BY id", tableName)); + createAndInitTable( + "id INT, dep STRING", + "{ \"id\": 1, \"dep\": \"emp-id-one\" }\n" + "{ \"id\": 6, \"dep\": \"emp-id-6\" }"); + + createOrReplaceView( + "source", + "id INT, dep STRING", + "{ \"id\": 1, \"dep\": \"emp-id-1\" }\n" + + "{ \"id\": 1, \"dep\": \"emp-id-1\" }\n" + + "{ \"id\": 2, \"dep\": \"emp-id-2\" }\n" + + "{ \"id\": 6, \"dep\": \"emp-id-6\" }"); + + sql( + "MERGE INTO %s AS t USING source AS s " + + "ON t.id == s.id " + + "WHEN MATCHED THEN " + + " DELETE " + + "WHEN NOT MATCHED AND s.id = 2 THEN " + + " INSERT *", + tableName); + + ImmutableList expectedRows = + ImmutableList.of( + row(2, "emp-id-2") // new + ); + assertEquals( + "Should have expected rows", expectedRows, sql("SELECT * FROM %s ORDER BY id", tableName)); } @Test public void testMergeWithSingleConditionalDelete() { - createAndInitTable("id INT, dep STRING", - "{ \"id\": 1, \"dep\": \"emp-id-one\" }\n" + - "{ \"id\": 6, \"dep\": \"emp-id-6\" }"); + createAndInitTable( + "id INT, dep STRING", + "{ \"id\": 1, \"dep\": \"emp-id-one\" }\n" + "{ \"id\": 6, \"dep\": \"emp-id-6\" }"); - createOrReplaceView("source", "id INT, 
dep STRING", - "{ \"id\": 1, \"dep\": \"emp-id-1\" }\n" + - "{ \"id\": 1, \"dep\": \"emp-id-1\" }\n" + - "{ \"id\": 2, \"dep\": \"emp-id-2\" }\n" + - "{ \"id\": 6, \"dep\": \"emp-id-6\" }"); + createOrReplaceView( + "source", + "id INT, dep STRING", + "{ \"id\": 1, \"dep\": \"emp-id-1\" }\n" + + "{ \"id\": 1, \"dep\": \"emp-id-1\" }\n" + + "{ \"id\": 2, \"dep\": \"emp-id-2\" }\n" + + "{ \"id\": 6, \"dep\": \"emp-id-6\" }"); String errorMsg = "a single row from the target table with multiple rows of the source table"; - AssertHelpers.assertThrowsCause("Should complain about multiple matches", - SparkException.class, errorMsg, + AssertHelpers.assertThrowsCause( + "Should complain about multiple matches", + SparkException.class, + errorMsg, () -> { - sql("MERGE INTO %s AS t USING source AS s " + - "ON t.id == s.id " + - "WHEN MATCHED AND t.id = 1 THEN " + - " DELETE " + - "WHEN NOT MATCHED AND s.id = 2 THEN " + - " INSERT *", tableName); + sql( + "MERGE INTO %s AS t USING source AS s " + + "ON t.id == s.id " + + "WHEN MATCHED AND t.id = 1 THEN " + + " DELETE " + + "WHEN NOT MATCHED AND s.id = 2 THEN " + + " INSERT *", + tableName); }); - assertEquals("Target should be unchanged", + assertEquals( + "Target should be unchanged", ImmutableList.of(row(1, "emp-id-one"), row(6, "emp-id-6")), sql("SELECT * FROM %s ORDER BY id ASC NULLS LAST", tableName)); } @@ -569,31 +681,41 @@ public void testMergeWithIdentityTransform() { for (DistributionMode mode : DistributionMode.values()) { createAndInitTable("id INT, dep STRING"); sql("ALTER TABLE %s ADD PARTITION FIELD identity(dep)", tableName); - sql("ALTER TABLE %s SET TBLPROPERTIES('%s' '%s')", tableName, WRITE_DISTRIBUTION_MODE, mode.modeName()); - - append(tableName, - "{ \"id\": 1, \"dep\": \"emp-id-one\" }\n" + - "{ \"id\": 6, \"dep\": \"emp-id-6\" }"); - - createOrReplaceView("source", "id INT, dep STRING", - "{ \"id\": 2, \"dep\": \"emp-id-2\" }\n" + - "{ \"id\": 1, \"dep\": \"emp-id-1\" }\n" + - "{ \"id\": 6, \"dep\": \"emp-id-6\" }"); - - sql("MERGE INTO %s AS t USING source AS s " + - "ON t.id == s.id " + - "WHEN MATCHED AND t.id = 1 THEN " + - " UPDATE SET * " + - "WHEN MATCHED AND t.id = 6 THEN " + - " DELETE " + - "WHEN NOT MATCHED AND s.id = 2 THEN " + - " INSERT *", tableName); - - ImmutableList expectedRows = ImmutableList.of( - row(1, "emp-id-1"), // updated - row(2, "emp-id-2") // new - ); - assertEquals("Should have expected rows", expectedRows, sql("SELECT * FROM %s ORDER BY id", tableName)); + sql( + "ALTER TABLE %s SET TBLPROPERTIES('%s' '%s')", + tableName, WRITE_DISTRIBUTION_MODE, mode.modeName()); + + append( + tableName, + "{ \"id\": 1, \"dep\": \"emp-id-one\" }\n" + "{ \"id\": 6, \"dep\": \"emp-id-6\" }"); + + createOrReplaceView( + "source", + "id INT, dep STRING", + "{ \"id\": 2, \"dep\": \"emp-id-2\" }\n" + + "{ \"id\": 1, \"dep\": \"emp-id-1\" }\n" + + "{ \"id\": 6, \"dep\": \"emp-id-6\" }"); + + sql( + "MERGE INTO %s AS t USING source AS s " + + "ON t.id == s.id " + + "WHEN MATCHED AND t.id = 1 THEN " + + " UPDATE SET * " + + "WHEN MATCHED AND t.id = 6 THEN " + + " DELETE " + + "WHEN NOT MATCHED AND s.id = 2 THEN " + + " INSERT *", + tableName); + + ImmutableList expectedRows = + ImmutableList.of( + row(1, "emp-id-1"), // updated + row(2, "emp-id-2") // new + ); + assertEquals( + "Should have expected rows", + expectedRows, + sql("SELECT * FROM %s ORDER BY id", tableName)); removeTables(); } @@ -604,31 +726,41 @@ public void testMergeWithDaysTransform() { for (DistributionMode mode : DistributionMode.values()) { 
createAndInitTable("id INT, ts TIMESTAMP"); sql("ALTER TABLE %s ADD PARTITION FIELD days(ts)", tableName); - sql("ALTER TABLE %s SET TBLPROPERTIES('%s' '%s')", tableName, WRITE_DISTRIBUTION_MODE, mode.modeName()); - - append(tableName, "id INT, ts TIMESTAMP", - "{ \"id\": 1, \"ts\": \"2000-01-01 00:00:00\" }\n" + - "{ \"id\": 6, \"ts\": \"2000-01-06 00:00:00\" }"); - - createOrReplaceView("source", "id INT, ts TIMESTAMP", - "{ \"id\": 2, \"ts\": \"2001-01-02 00:00:00\" }\n" + - "{ \"id\": 1, \"ts\": \"2001-01-01 00:00:00\" }\n" + - "{ \"id\": 6, \"ts\": \"2001-01-06 00:00:00\" }"); - - sql("MERGE INTO %s AS t USING source AS s " + - "ON t.id == s.id " + - "WHEN MATCHED AND t.id = 1 THEN " + - " UPDATE SET * " + - "WHEN MATCHED AND t.id = 6 THEN " + - " DELETE " + - "WHEN NOT MATCHED AND s.id = 2 THEN " + - " INSERT *", tableName); - - ImmutableList expectedRows = ImmutableList.of( - row(1, "2001-01-01 00:00:00"), // updated - row(2, "2001-01-02 00:00:00") // new - ); - assertEquals("Should have expected rows", + sql( + "ALTER TABLE %s SET TBLPROPERTIES('%s' '%s')", + tableName, WRITE_DISTRIBUTION_MODE, mode.modeName()); + + append( + tableName, + "id INT, ts TIMESTAMP", + "{ \"id\": 1, \"ts\": \"2000-01-01 00:00:00\" }\n" + + "{ \"id\": 6, \"ts\": \"2000-01-06 00:00:00\" }"); + + createOrReplaceView( + "source", + "id INT, ts TIMESTAMP", + "{ \"id\": 2, \"ts\": \"2001-01-02 00:00:00\" }\n" + + "{ \"id\": 1, \"ts\": \"2001-01-01 00:00:00\" }\n" + + "{ \"id\": 6, \"ts\": \"2001-01-06 00:00:00\" }"); + + sql( + "MERGE INTO %s AS t USING source AS s " + + "ON t.id == s.id " + + "WHEN MATCHED AND t.id = 1 THEN " + + " UPDATE SET * " + + "WHEN MATCHED AND t.id = 6 THEN " + + " DELETE " + + "WHEN NOT MATCHED AND s.id = 2 THEN " + + " INSERT *", + tableName); + + ImmutableList expectedRows = + ImmutableList.of( + row(1, "2001-01-01 00:00:00"), // updated + row(2, "2001-01-02 00:00:00") // new + ); + assertEquals( + "Should have expected rows", expectedRows, sql("SELECT id, CAST(ts AS STRING) FROM %s ORDER BY id", tableName)); @@ -641,31 +773,41 @@ public void testMergeWithBucketTransform() { for (DistributionMode mode : DistributionMode.values()) { createAndInitTable("id INT, dep STRING"); sql("ALTER TABLE %s ADD PARTITION FIELD bucket(2, dep)", tableName); - sql("ALTER TABLE %s SET TBLPROPERTIES('%s' '%s')", tableName, WRITE_DISTRIBUTION_MODE, mode.modeName()); - - append(tableName, - "{ \"id\": 1, \"dep\": \"emp-id-one\" }\n" + - "{ \"id\": 6, \"dep\": \"emp-id-6\" }"); - - createOrReplaceView("source", "id INT, dep STRING", - "{ \"id\": 2, \"dep\": \"emp-id-2\" }\n" + - "{ \"id\": 1, \"dep\": \"emp-id-1\" }\n" + - "{ \"id\": 6, \"dep\": \"emp-id-6\" }"); - - sql("MERGE INTO %s AS t USING source AS s " + - "ON t.id == s.id " + - "WHEN MATCHED AND t.id = 1 THEN " + - " UPDATE SET * " + - "WHEN MATCHED AND t.id = 6 THEN " + - " DELETE " + - "WHEN NOT MATCHED AND s.id = 2 THEN " + - " INSERT *", tableName); - - ImmutableList expectedRows = ImmutableList.of( - row(1, "emp-id-1"), // updated - row(2, "emp-id-2") // new - ); - assertEquals("Should have expected rows", expectedRows, sql("SELECT * FROM %s ORDER BY id", tableName)); + sql( + "ALTER TABLE %s SET TBLPROPERTIES('%s' '%s')", + tableName, WRITE_DISTRIBUTION_MODE, mode.modeName()); + + append( + tableName, + "{ \"id\": 1, \"dep\": \"emp-id-one\" }\n" + "{ \"id\": 6, \"dep\": \"emp-id-6\" }"); + + createOrReplaceView( + "source", + "id INT, dep STRING", + "{ \"id\": 2, \"dep\": \"emp-id-2\" }\n" + + "{ \"id\": 1, \"dep\": \"emp-id-1\" }\n" + + 
"{ \"id\": 6, \"dep\": \"emp-id-6\" }"); + + sql( + "MERGE INTO %s AS t USING source AS s " + + "ON t.id == s.id " + + "WHEN MATCHED AND t.id = 1 THEN " + + " UPDATE SET * " + + "WHEN MATCHED AND t.id = 6 THEN " + + " DELETE " + + "WHEN NOT MATCHED AND s.id = 2 THEN " + + " INSERT *", + tableName); + + ImmutableList expectedRows = + ImmutableList.of( + row(1, "emp-id-1"), // updated + row(2, "emp-id-2") // new + ); + assertEquals( + "Should have expected rows", + expectedRows, + sql("SELECT * FROM %s ORDER BY id", tableName)); removeTables(); } @@ -676,31 +818,41 @@ public void testMergeWithTruncateTransform() { for (DistributionMode mode : DistributionMode.values()) { createAndInitTable("id INT, dep STRING"); sql("ALTER TABLE %s ADD PARTITION FIELD truncate(dep, 2)", tableName); - sql("ALTER TABLE %s SET TBLPROPERTIES('%s' '%s')", tableName, WRITE_DISTRIBUTION_MODE, mode.modeName()); - - append(tableName, - "{ \"id\": 1, \"dep\": \"emp-id-one\" }\n" + - "{ \"id\": 6, \"dep\": \"emp-id-6\" }"); - - createOrReplaceView("source", "id INT, dep STRING", - "{ \"id\": 2, \"dep\": \"emp-id-2\" }\n" + - "{ \"id\": 1, \"dep\": \"emp-id-1\" }\n" + - "{ \"id\": 6, \"dep\": \"emp-id-6\" }"); - - sql("MERGE INTO %s AS t USING source AS s " + - "ON t.id == s.id " + - "WHEN MATCHED AND t.id = 1 THEN " + - " UPDATE SET * " + - "WHEN MATCHED AND t.id = 6 THEN " + - " DELETE " + - "WHEN NOT MATCHED AND s.id = 2 THEN " + - " INSERT *", tableName); - - ImmutableList expectedRows = ImmutableList.of( - row(1, "emp-id-1"), // updated - row(2, "emp-id-2") // new - ); - assertEquals("Should have expected rows", expectedRows, sql("SELECT * FROM %s ORDER BY id", tableName)); + sql( + "ALTER TABLE %s SET TBLPROPERTIES('%s' '%s')", + tableName, WRITE_DISTRIBUTION_MODE, mode.modeName()); + + append( + tableName, + "{ \"id\": 1, \"dep\": \"emp-id-one\" }\n" + "{ \"id\": 6, \"dep\": \"emp-id-6\" }"); + + createOrReplaceView( + "source", + "id INT, dep STRING", + "{ \"id\": 2, \"dep\": \"emp-id-2\" }\n" + + "{ \"id\": 1, \"dep\": \"emp-id-1\" }\n" + + "{ \"id\": 6, \"dep\": \"emp-id-6\" }"); + + sql( + "MERGE INTO %s AS t USING source AS s " + + "ON t.id == s.id " + + "WHEN MATCHED AND t.id = 1 THEN " + + " UPDATE SET * " + + "WHEN MATCHED AND t.id = 6 THEN " + + " DELETE " + + "WHEN NOT MATCHED AND s.id = 2 THEN " + + " INSERT *", + tableName); + + ImmutableList expectedRows = + ImmutableList.of( + row(1, "emp-id-1"), // updated + row(2, "emp-id-2") // new + ); + assertEquals( + "Should have expected rows", + expectedRows, + sql("SELECT * FROM %s ORDER BY id", tableName)); removeTables(); } @@ -712,31 +864,41 @@ public void testMergeIntoPartitionedAndOrderedTable() { createAndInitTable("id INT, dep STRING"); sql("ALTER TABLE %s ADD PARTITION FIELD dep", tableName); sql("ALTER TABLE %s WRITE ORDERED BY (id)", tableName); - sql("ALTER TABLE %s SET TBLPROPERTIES('%s' '%s')", tableName, WRITE_DISTRIBUTION_MODE, mode.modeName()); - - append(tableName, - "{ \"id\": 1, \"dep\": \"emp-id-one\" }\n" + - "{ \"id\": 6, \"dep\": \"emp-id-6\" }"); - - createOrReplaceView("source", "id INT, dep STRING", - "{ \"id\": 2, \"dep\": \"emp-id-2\" }\n" + - "{ \"id\": 1, \"dep\": \"emp-id-1\" }\n" + - "{ \"id\": 6, \"dep\": \"emp-id-6\" }"); - - sql("MERGE INTO %s AS t USING source AS s " + - "ON t.id == s.id " + - "WHEN MATCHED AND t.id = 1 THEN " + - " UPDATE SET * " + - "WHEN MATCHED AND t.id = 6 THEN " + - " DELETE " + - "WHEN NOT MATCHED AND s.id = 2 THEN " + - " INSERT *", tableName); - - ImmutableList expectedRows = ImmutableList.of( - 
row(1, "emp-id-1"), // updated - row(2, "emp-id-2") // new - ); - assertEquals("Should have expected rows", expectedRows, sql("SELECT * FROM %s ORDER BY id", tableName)); + sql( + "ALTER TABLE %s SET TBLPROPERTIES('%s' '%s')", + tableName, WRITE_DISTRIBUTION_MODE, mode.modeName()); + + append( + tableName, + "{ \"id\": 1, \"dep\": \"emp-id-one\" }\n" + "{ \"id\": 6, \"dep\": \"emp-id-6\" }"); + + createOrReplaceView( + "source", + "id INT, dep STRING", + "{ \"id\": 2, \"dep\": \"emp-id-2\" }\n" + + "{ \"id\": 1, \"dep\": \"emp-id-1\" }\n" + + "{ \"id\": 6, \"dep\": \"emp-id-6\" }"); + + sql( + "MERGE INTO %s AS t USING source AS s " + + "ON t.id == s.id " + + "WHEN MATCHED AND t.id = 1 THEN " + + " UPDATE SET * " + + "WHEN MATCHED AND t.id = 6 THEN " + + " DELETE " + + "WHEN NOT MATCHED AND s.id = 2 THEN " + + " INSERT *", + tableName); + + ImmutableList expectedRows = + ImmutableList.of( + row(1, "emp-id-1"), // updated + row(2, "emp-id-2") // new + ); + assertEquals( + "Should have expected rows", + expectedRows, + sql("SELECT * FROM %s ORDER BY id", tableName)); removeTables(); } @@ -744,66 +906,75 @@ public void testMergeIntoPartitionedAndOrderedTable() { @Test public void testSelfMerge() { - createAndInitTable("id INT, v STRING", - "{ \"id\": 1, \"v\": \"v1\" }\n" + - "{ \"id\": 2, \"v\": \"v2\" }"); - - sql("MERGE INTO %s t USING %s s " + - "ON t.id == s.id " + - "WHEN MATCHED AND t.id = 1 THEN " + - " UPDATE SET v = 'x' " + - "WHEN NOT MATCHED THEN " + - " INSERT *", tableName, tableName); - - ImmutableList expectedRows = ImmutableList.of( - row(1, "x"), // updated - row(2, "v2") // kept - ); - assertEquals("Output should match", expectedRows, sql("SELECT * FROM %s ORDER BY id", tableName)); + createAndInitTable( + "id INT, v STRING", "{ \"id\": 1, \"v\": \"v1\" }\n" + "{ \"id\": 2, \"v\": \"v2\" }"); + + sql( + "MERGE INTO %s t USING %s s " + + "ON t.id == s.id " + + "WHEN MATCHED AND t.id = 1 THEN " + + " UPDATE SET v = 'x' " + + "WHEN NOT MATCHED THEN " + + " INSERT *", + tableName, tableName); + + ImmutableList expectedRows = + ImmutableList.of( + row(1, "x"), // updated + row(2, "v2") // kept + ); + assertEquals( + "Output should match", expectedRows, sql("SELECT * FROM %s ORDER BY id", tableName)); } @Test public void testSelfMergeWithCaching() { - createAndInitTable("id INT, v STRING", - "{ \"id\": 1, \"v\": \"v1\" }\n" + - "{ \"id\": 2, \"v\": \"v2\" }"); + createAndInitTable( + "id INT, v STRING", "{ \"id\": 1, \"v\": \"v1\" }\n" + "{ \"id\": 2, \"v\": \"v2\" }"); sql("CACHE TABLE %s", tableName); - sql("MERGE INTO %s t USING %s s " + - "ON t.id == s.id " + - "WHEN MATCHED AND t.id = 1 THEN " + - " UPDATE SET v = 'x' " + - "WHEN NOT MATCHED THEN " + - " INSERT *", tableName, tableName); - - ImmutableList expectedRows = ImmutableList.of( - row(1, "x"), // updated - row(2, "v2") // kept - ); - assertEquals("Output should match", expectedRows, sql("SELECT * FROM %s ORDER BY id", tableName)); + sql( + "MERGE INTO %s t USING %s s " + + "ON t.id == s.id " + + "WHEN MATCHED AND t.id = 1 THEN " + + " UPDATE SET v = 'x' " + + "WHEN NOT MATCHED THEN " + + " INSERT *", + tableName, tableName); + + ImmutableList expectedRows = + ImmutableList.of( + row(1, "x"), // updated + row(2, "v2") // kept + ); + assertEquals( + "Output should match", expectedRows, sql("SELECT * FROM %s ORDER BY id", tableName)); } @Test public void testMergeWithSourceAsSelfSubquery() { - createAndInitTable("id INT, v STRING", - "{ \"id\": 1, \"v\": \"v1\" }\n" + - "{ \"id\": 2, \"v\": \"v2\" }"); + 
createAndInitTable( + "id INT, v STRING", "{ \"id\": 1, \"v\": \"v1\" }\n" + "{ \"id\": 2, \"v\": \"v2\" }"); createOrReplaceView("source", Arrays.asList(1, null), Encoders.INT()); - sql("MERGE INTO %s t USING (SELECT id AS value FROM %s r JOIN source ON r.id = source.value) s " + - "ON t.id == s.value " + - "WHEN MATCHED AND t.id = 1 THEN " + - " UPDATE SET v = 'x' " + - "WHEN NOT MATCHED THEN " + - " INSERT (v, id) VALUES ('invalid', -1) ", tableName, tableName); - - ImmutableList expectedRows = ImmutableList.of( - row(1, "x"), // updated - row(2, "v2") // kept - ); - assertEquals("Output should match", expectedRows, sql("SELECT * FROM %s ORDER BY id", tableName)); + sql( + "MERGE INTO %s t USING (SELECT id AS value FROM %s r JOIN source ON r.id = source.value) s " + + "ON t.id == s.value " + + "WHEN MATCHED AND t.id = 1 THEN " + + " UPDATE SET v = 'x' " + + "WHEN NOT MATCHED THEN " + + " INSERT (v, id) VALUES ('invalid', -1) ", + tableName, tableName); + + ImmutableList expectedRows = + ImmutableList.of( + row(1, "x"), // updated + row(2, "v2") // kept + ); + assertEquals( + "Output should match", expectedRows, sql("SELECT * FROM %s ORDER BY id", tableName)); } @Test @@ -814,37 +985,46 @@ public synchronized void testMergeWithSerializableIsolation() throws Interrupted createAndInitTable("id INT, dep STRING"); createOrReplaceView("source", Collections.singletonList(1), Encoders.INT()); - sql("ALTER TABLE %s SET TBLPROPERTIES('%s' '%s')", tableName, MERGE_ISOLATION_LEVEL, "serializable"); + sql( + "ALTER TABLE %s SET TBLPROPERTIES('%s' '%s')", + tableName, MERGE_ISOLATION_LEVEL, "serializable"); - ExecutorService executorService = MoreExecutors.getExitingExecutorService( - (ThreadPoolExecutor) Executors.newFixedThreadPool(2)); + ExecutorService executorService = + MoreExecutors.getExitingExecutorService( + (ThreadPoolExecutor) Executors.newFixedThreadPool(2)); AtomicInteger barrier = new AtomicInteger(0); // merge thread - Future mergeFuture = executorService.submit(() -> { - for (int numOperations = 0; numOperations < Integer.MAX_VALUE; numOperations++) { - while (barrier.get() < numOperations * 2) { - sleep(10); - } - sql("MERGE INTO %s t USING source s " + - "ON t.id == s.value " + - "WHEN MATCHED THEN " + - " UPDATE SET dep = 'x'", tableName); - barrier.incrementAndGet(); - } - }); + Future mergeFuture = + executorService.submit( + () -> { + for (int numOperations = 0; numOperations < Integer.MAX_VALUE; numOperations++) { + while (barrier.get() < numOperations * 2) { + sleep(10); + } + sql( + "MERGE INTO %s t USING source s " + + "ON t.id == s.value " + + "WHEN MATCHED THEN " + + " UPDATE SET dep = 'x'", + tableName); + barrier.incrementAndGet(); + } + }); // append thread - Future appendFuture = executorService.submit(() -> { - for (int numOperations = 0; numOperations < Integer.MAX_VALUE; numOperations++) { - while (barrier.get() < numOperations * 2) { - sleep(10); - } - sql("INSERT INTO TABLE %s VALUES (1, 'hr')", tableName); - barrier.incrementAndGet(); - } - }); + Future appendFuture = + executorService.submit( + () -> { + for (int numOperations = 0; numOperations < Integer.MAX_VALUE; numOperations++) { + while (barrier.get() < numOperations * 2) { + sleep(10); + } + sql("INSERT INTO TABLE %s VALUES (1, 'hr')", tableName); + barrier.incrementAndGet(); + } + }); try { mergeFuture.get(); @@ -855,7 +1035,8 @@ public synchronized void testMergeWithSerializableIsolation() throws Interrupted Throwable validationException = sparkException.getCause(); 
Assert.assertThat(validationException, CoreMatchers.instanceOf(ValidationException.class)); String errMsg = validationException.getMessage(); - Assert.assertThat(errMsg, CoreMatchers.containsString("Found conflicting files that can contain")); + Assert.assertThat( + errMsg, CoreMatchers.containsString("Found conflicting files that can contain")); } finally { appendFuture.cancel(true); } @@ -865,44 +1046,54 @@ public synchronized void testMergeWithSerializableIsolation() throws Interrupted } @Test - public synchronized void testMergeWithSnapshotIsolation() throws InterruptedException, ExecutionException { + public synchronized void testMergeWithSnapshotIsolation() + throws InterruptedException, ExecutionException { // cannot run tests with concurrency for Hadoop tables without atomic renames Assume.assumeFalse(catalogName.equalsIgnoreCase("testhadoop")); createAndInitTable("id INT, dep STRING"); createOrReplaceView("source", Collections.singletonList(1), Encoders.INT()); - sql("ALTER TABLE %s SET TBLPROPERTIES('%s' '%s')", tableName, MERGE_ISOLATION_LEVEL, "snapshot"); + sql( + "ALTER TABLE %s SET TBLPROPERTIES('%s' '%s')", + tableName, MERGE_ISOLATION_LEVEL, "snapshot"); - ExecutorService executorService = MoreExecutors.getExitingExecutorService( - (ThreadPoolExecutor) Executors.newFixedThreadPool(2)); + ExecutorService executorService = + MoreExecutors.getExitingExecutorService( + (ThreadPoolExecutor) Executors.newFixedThreadPool(2)); AtomicInteger barrier = new AtomicInteger(0); // merge thread - Future mergeFuture = executorService.submit(() -> { - for (int numOperations = 0; numOperations < 20; numOperations++) { - while (barrier.get() < numOperations * 2) { - sleep(10); - } - sql("MERGE INTO %s t USING source s " + - "ON t.id == s.value " + - "WHEN MATCHED THEN " + - " UPDATE SET dep = 'x'", tableName); - barrier.incrementAndGet(); - } - }); + Future mergeFuture = + executorService.submit( + () -> { + for (int numOperations = 0; numOperations < 20; numOperations++) { + while (barrier.get() < numOperations * 2) { + sleep(10); + } + sql( + "MERGE INTO %s t USING source s " + + "ON t.id == s.value " + + "WHEN MATCHED THEN " + + " UPDATE SET dep = 'x'", + tableName); + barrier.incrementAndGet(); + } + }); // append thread - Future appendFuture = executorService.submit(() -> { - for (int numOperations = 0; numOperations < 20; numOperations++) { - while (barrier.get() < numOperations * 2) { - sleep(10); - } - sql("INSERT INTO TABLE %s VALUES (1, 'hr')", tableName); - barrier.incrementAndGet(); - } - }); + Future appendFuture = + executorService.submit( + () -> { + for (int numOperations = 0; numOperations < 20; numOperations++) { + while (barrier.get() < numOperations * 2) { + sleep(10); + } + sql("INSERT INTO TABLE %s VALUES (1, 'hr')", tableName); + barrier.incrementAndGet(); + } + }); try { mergeFuture.get(); @@ -916,175 +1107,195 @@ public synchronized void testMergeWithSnapshotIsolation() throws InterruptedExce @Test public void testMergeWithExtraColumnsInSource() { - createAndInitTable("id INT, v STRING", - "{ \"id\": 1, \"v\": \"v1\" }\n" + - "{ \"id\": 2, \"v\": \"v2\" }"); - createOrReplaceView("source", - "{ \"id\": 1, \"extra_col\": -1, \"v\": \"v1_1\" }\n" + - "{ \"id\": 3, \"extra_col\": -1, \"v\": \"v3\" }\n" + - "{ \"id\": 4, \"extra_col\": -1, \"v\": \"v4\" }"); - - sql("MERGE INTO %s t USING source " + - "ON t.id == source.id " + - "WHEN MATCHED THEN " + - " UPDATE SET v = source.v " + - "WHEN NOT MATCHED THEN " + - " INSERT (v, id) VALUES (source.v, source.id)", 
tableName); - - ImmutableList expectedRows = ImmutableList.of( - row(1, "v1_1"), // new - row(2, "v2"), // kept - row(3, "v3"), // new - row(4, "v4") // new - ); - assertEquals("Output should match", expectedRows, sql("SELECT * FROM %s ORDER BY id", tableName)); + createAndInitTable( + "id INT, v STRING", "{ \"id\": 1, \"v\": \"v1\" }\n" + "{ \"id\": 2, \"v\": \"v2\" }"); + createOrReplaceView( + "source", + "{ \"id\": 1, \"extra_col\": -1, \"v\": \"v1_1\" }\n" + + "{ \"id\": 3, \"extra_col\": -1, \"v\": \"v3\" }\n" + + "{ \"id\": 4, \"extra_col\": -1, \"v\": \"v4\" }"); + + sql( + "MERGE INTO %s t USING source " + + "ON t.id == source.id " + + "WHEN MATCHED THEN " + + " UPDATE SET v = source.v " + + "WHEN NOT MATCHED THEN " + + " INSERT (v, id) VALUES (source.v, source.id)", + tableName); + + ImmutableList expectedRows = + ImmutableList.of( + row(1, "v1_1"), // new + row(2, "v2"), // kept + row(3, "v3"), // new + row(4, "v4") // new + ); + assertEquals( + "Output should match", expectedRows, sql("SELECT * FROM %s ORDER BY id", tableName)); } @Test public void testMergeWithNullsInTargetAndSource() { - createAndInitTable("id INT, v STRING", - "{ \"id\": null, \"v\": \"v1\" }\n" + - "{ \"id\": 2, \"v\": \"v2\" }"); - - createOrReplaceView("source", - "{ \"id\": null, \"v\": \"v1_1\" }\n" + - "{ \"id\": 4, \"v\": \"v4\" }"); - - sql("MERGE INTO %s t USING source " + - "ON t.id == source.id " + - "WHEN MATCHED THEN " + - " UPDATE SET v = source.v " + - "WHEN NOT MATCHED THEN " + - " INSERT (v, id) VALUES (source.v, source.id)", tableName); - - ImmutableList expectedRows = ImmutableList.of( - row(null, "v1"), // kept - row(null, "v1_1"), // new - row(2, "v2"), // kept - row(4, "v4") // new - ); - assertEquals("Output should match", expectedRows, sql("SELECT * FROM %s ORDER BY v", tableName)); + createAndInitTable( + "id INT, v STRING", "{ \"id\": null, \"v\": \"v1\" }\n" + "{ \"id\": 2, \"v\": \"v2\" }"); + + createOrReplaceView( + "source", "{ \"id\": null, \"v\": \"v1_1\" }\n" + "{ \"id\": 4, \"v\": \"v4\" }"); + + sql( + "MERGE INTO %s t USING source " + + "ON t.id == source.id " + + "WHEN MATCHED THEN " + + " UPDATE SET v = source.v " + + "WHEN NOT MATCHED THEN " + + " INSERT (v, id) VALUES (source.v, source.id)", + tableName); + + ImmutableList expectedRows = + ImmutableList.of( + row(null, "v1"), // kept + row(null, "v1_1"), // new + row(2, "v2"), // kept + row(4, "v4") // new + ); + assertEquals( + "Output should match", expectedRows, sql("SELECT * FROM %s ORDER BY v", tableName)); } @Test public void testMergeWithNullSafeEquals() { - createAndInitTable("id INT, v STRING", - "{ \"id\": null, \"v\": \"v1\" }\n" + - "{ \"id\": 2, \"v\": \"v2\" }"); - - createOrReplaceView("source", - "{ \"id\": null, \"v\": \"v1_1\" }\n" + - "{ \"id\": 4, \"v\": \"v4\" }"); - - sql("MERGE INTO %s t USING source " + - "ON t.id <=> source.id " + - "WHEN MATCHED THEN " + - " UPDATE SET v = source.v " + - "WHEN NOT MATCHED THEN " + - " INSERT (v, id) VALUES (source.v, source.id)", tableName); - - ImmutableList expectedRows = ImmutableList.of( - row(null, "v1_1"), // updated - row(2, "v2"), // kept - row(4, "v4") // new - ); - assertEquals("Output should match", expectedRows, sql("SELECT * FROM %s ORDER BY v", tableName)); + createAndInitTable( + "id INT, v STRING", "{ \"id\": null, \"v\": \"v1\" }\n" + "{ \"id\": 2, \"v\": \"v2\" }"); + + createOrReplaceView( + "source", "{ \"id\": null, \"v\": \"v1_1\" }\n" + "{ \"id\": 4, \"v\": \"v4\" }"); + + sql( + "MERGE INTO %s t USING source " + + "ON t.id <=> 
source.id " + + "WHEN MATCHED THEN " + + " UPDATE SET v = source.v " + + "WHEN NOT MATCHED THEN " + + " INSERT (v, id) VALUES (source.v, source.id)", + tableName); + + ImmutableList expectedRows = + ImmutableList.of( + row(null, "v1_1"), // updated + row(2, "v2"), // kept + row(4, "v4") // new + ); + assertEquals( + "Output should match", expectedRows, sql("SELECT * FROM %s ORDER BY v", tableName)); } @Test public void testMergeWithNullCondition() { - createAndInitTable("id INT, v STRING", - "{ \"id\": null, \"v\": \"v1\" }\n" + - "{ \"id\": 2, \"v\": \"v2\" }"); - - createOrReplaceView("source", - "{ \"id\": null, \"v\": \"v1_1\" }\n" + - "{ \"id\": 2, \"v\": \"v2_2\" }"); - - sql("MERGE INTO %s t USING source " + - "ON t.id == source.id AND NULL " + - "WHEN MATCHED THEN " + - " UPDATE SET v = source.v " + - "WHEN NOT MATCHED THEN " + - " INSERT (v, id) VALUES (source.v, source.id)", tableName); - - ImmutableList expectedRows = ImmutableList.of( - row(null, "v1"), // kept - row(null, "v1_1"), // new - row(2, "v2"), // kept - row(2, "v2_2") // new - ); - assertEquals("Output should match", expectedRows, sql("SELECT * FROM %s ORDER BY v", tableName)); + createAndInitTable( + "id INT, v STRING", "{ \"id\": null, \"v\": \"v1\" }\n" + "{ \"id\": 2, \"v\": \"v2\" }"); + + createOrReplaceView( + "source", "{ \"id\": null, \"v\": \"v1_1\" }\n" + "{ \"id\": 2, \"v\": \"v2_2\" }"); + + sql( + "MERGE INTO %s t USING source " + + "ON t.id == source.id AND NULL " + + "WHEN MATCHED THEN " + + " UPDATE SET v = source.v " + + "WHEN NOT MATCHED THEN " + + " INSERT (v, id) VALUES (source.v, source.id)", + tableName); + + ImmutableList expectedRows = + ImmutableList.of( + row(null, "v1"), // kept + row(null, "v1_1"), // new + row(2, "v2"), // kept + row(2, "v2_2") // new + ); + assertEquals( + "Output should match", expectedRows, sql("SELECT * FROM %s ORDER BY v", tableName)); } @Test public void testMergeWithNullActionConditions() { - createAndInitTable("id INT, v STRING", - "{ \"id\": 1, \"v\": \"v1\" }\n" + - "{ \"id\": 2, \"v\": \"v2\" }"); + createAndInitTable( + "id INT, v STRING", "{ \"id\": 1, \"v\": \"v1\" }\n" + "{ \"id\": 2, \"v\": \"v2\" }"); - createOrReplaceView("source", - "{ \"id\": 1, \"v\": \"v1_1\" }\n" + - "{ \"id\": 2, \"v\": \"v2_2\" }\n" + - "{ \"id\": 3, \"v\": \"v3_3\" }"); + createOrReplaceView( + "source", + "{ \"id\": 1, \"v\": \"v1_1\" }\n" + + "{ \"id\": 2, \"v\": \"v2_2\" }\n" + + "{ \"id\": 3, \"v\": \"v3_3\" }"); // all conditions are NULL and will never match any rows - sql("MERGE INTO %s t USING source " + - "ON t.id == source.id " + - "WHEN MATCHED AND source.id = 1 AND NULL THEN " + - " UPDATE SET v = source.v " + - "WHEN MATCHED AND source.v = 'v1_1' AND NULL THEN " + - " DELETE " + - "WHEN NOT MATCHED AND source.id = 3 AND NULL THEN " + - " INSERT (v, id) VALUES (source.v, source.id)", tableName); - - ImmutableList expectedRows1 = ImmutableList.of( - row(1, "v1"), // kept - row(2, "v2") // kept - ); - assertEquals("Output should match", expectedRows1, sql("SELECT * FROM %s ORDER BY v", tableName)); + sql( + "MERGE INTO %s t USING source " + + "ON t.id == source.id " + + "WHEN MATCHED AND source.id = 1 AND NULL THEN " + + " UPDATE SET v = source.v " + + "WHEN MATCHED AND source.v = 'v1_1' AND NULL THEN " + + " DELETE " + + "WHEN NOT MATCHED AND source.id = 3 AND NULL THEN " + + " INSERT (v, id) VALUES (source.v, source.id)", + tableName); + + ImmutableList expectedRows1 = + ImmutableList.of( + row(1, "v1"), // kept + row(2, "v2") // kept + ); + assertEquals( + "Output 
should match", expectedRows1, sql("SELECT * FROM %s ORDER BY v", tableName)); // only the update and insert conditions are NULL - sql("MERGE INTO %s t USING source " + - "ON t.id == source.id " + - "WHEN MATCHED AND source.id = 1 AND NULL THEN " + - " UPDATE SET v = source.v " + - "WHEN MATCHED AND source.v = 'v1_1' THEN " + - " DELETE " + - "WHEN NOT MATCHED AND source.id = 3 AND NULL THEN " + - " INSERT (v, id) VALUES (source.v, source.id)", tableName); - - ImmutableList expectedRows2 = ImmutableList.of( - row(2, "v2") // kept - ); - assertEquals("Output should match", expectedRows2, sql("SELECT * FROM %s ORDER BY v", tableName)); + sql( + "MERGE INTO %s t USING source " + + "ON t.id == source.id " + + "WHEN MATCHED AND source.id = 1 AND NULL THEN " + + " UPDATE SET v = source.v " + + "WHEN MATCHED AND source.v = 'v1_1' THEN " + + " DELETE " + + "WHEN NOT MATCHED AND source.id = 3 AND NULL THEN " + + " INSERT (v, id) VALUES (source.v, source.id)", + tableName); + + ImmutableList expectedRows2 = + ImmutableList.of( + row(2, "v2") // kept + ); + assertEquals( + "Output should match", expectedRows2, sql("SELECT * FROM %s ORDER BY v", tableName)); } @Test public void testMergeWithMultipleMatchingActions() { - createAndInitTable("id INT, v STRING", - "{ \"id\": 1, \"v\": \"v1\" }\n" + - "{ \"id\": 2, \"v\": \"v2\" }"); + createAndInitTable( + "id INT, v STRING", "{ \"id\": 1, \"v\": \"v1\" }\n" + "{ \"id\": 2, \"v\": \"v2\" }"); - createOrReplaceView("source", - "{ \"id\": 1, \"v\": \"v1_1\" }\n" + - "{ \"id\": 2, \"v\": \"v2_2\" }"); + createOrReplaceView( + "source", "{ \"id\": 1, \"v\": \"v1_1\" }\n" + "{ \"id\": 2, \"v\": \"v2_2\" }"); // the order of match actions is important in this case - sql("MERGE INTO %s t USING source " + - "ON t.id == source.id " + - "WHEN MATCHED AND source.id = 1 THEN " + - " UPDATE SET v = source.v " + - "WHEN MATCHED AND source.v = 'v1_1' THEN " + - " DELETE " + - "WHEN NOT MATCHED THEN " + - " INSERT (v, id) VALUES (source.v, source.id)", tableName); - - ImmutableList expectedRows = ImmutableList.of( - row(1, "v1_1"), // updated (also matches the delete cond but update is first) - row(2, "v2") // kept (matches neither the update nor the delete cond) - ); - assertEquals("Output should match", expectedRows, sql("SELECT * FROM %s ORDER BY v", tableName)); + sql( + "MERGE INTO %s t USING source " + + "ON t.id == source.id " + + "WHEN MATCHED AND source.id = 1 THEN " + + " UPDATE SET v = source.v " + + "WHEN MATCHED AND source.v = 'v1_1' THEN " + + " DELETE " + + "WHEN NOT MATCHED THEN " + + " INSERT (v, id) VALUES (source.v, source.id)", + tableName); + + ImmutableList expectedRows = + ImmutableList.of( + row(1, "v1_1"), // updated (also matches the delete cond but update is first) + row(2, "v2") // kept (matches neither the update nor the delete cond) + ); + assertEquals( + "Output should match", expectedRows, sql("SELECT * FROM %s ORDER BY v", tableName)); } @Test @@ -1094,7 +1305,9 @@ public void testMergeWithMultipleRowGroupsParquet() throws NoSuchTableException createAndInitTable("id INT, dep STRING"); sql("ALTER TABLE %s ADD PARTITION FIELD dep", tableName); - sql("ALTER TABLE %s SET TBLPROPERTIES('%s' '%d')", tableName, PARQUET_ROW_GROUP_SIZE_BYTES, 100); + sql( + "ALTER TABLE %s SET TBLPROPERTIES('%s' '%d')", + tableName, PARQUET_ROW_GROUP_SIZE_BYTES, 100); sql("ALTER TABLE %s SET TBLPROPERTIES('%s' '%d')", tableName, SPLIT_SIZE, 100); createOrReplaceView("source", Collections.singletonList(1), Encoders.INT()); @@ -1103,85 +1316,103 @@ public void 
testMergeWithMultipleRowGroupsParquet() throws NoSuchTableException for (int id = 1; id <= 200; id++) { ids.add(id); } - Dataset df = spark.createDataset(ids, Encoders.INT()) - .withColumnRenamed("value", "id") - .withColumn("dep", lit("hr")); + Dataset df = + spark + .createDataset(ids, Encoders.INT()) + .withColumnRenamed("value", "id") + .withColumn("dep", lit("hr")); df.coalesce(1).writeTo(tableName).append(); Assert.assertEquals(200, spark.table(tableName).count()); // update a record from one of two row groups and copy over the second one - sql("MERGE INTO %s t USING source " + - "ON t.id == source.value " + - "WHEN MATCHED THEN " + - " UPDATE SET dep = 'x'", tableName); + sql( + "MERGE INTO %s t USING source " + + "ON t.id == source.value " + + "WHEN MATCHED THEN " + + " UPDATE SET dep = 'x'", + tableName); Assert.assertEquals(200, spark.table(tableName).count()); } @Test public void testMergeInsertOnly() { - createAndInitTable("id STRING, v STRING", - "{ \"id\": \"a\", \"v\": \"v1\" }\n" + - "{ \"id\": \"b\", \"v\": \"v2\" }"); - createOrReplaceView("source", - "{ \"id\": \"a\", \"v\": \"v1_1\" }\n" + - "{ \"id\": \"a\", \"v\": \"v1_2\" }\n" + - "{ \"id\": \"c\", \"v\": \"v3\" }\n" + - "{ \"id\": \"d\", \"v\": \"v4_1\" }\n" + - "{ \"id\": \"d\", \"v\": \"v4_2\" }"); - - sql("MERGE INTO %s t USING source " + - "ON t.id == source.id " + - "WHEN NOT MATCHED THEN " + - " INSERT *", tableName); - - ImmutableList expectedRows = ImmutableList.of( - row("a", "v1"), // kept - row("b", "v2"), // kept - row("c", "v3"), // new - row("d", "v4_1"), // new - row("d", "v4_2") // new - ); - assertEquals("Output should match", expectedRows, sql("SELECT * FROM %s ORDER BY id", tableName)); + createAndInitTable( + "id STRING, v STRING", + "{ \"id\": \"a\", \"v\": \"v1\" }\n" + "{ \"id\": \"b\", \"v\": \"v2\" }"); + createOrReplaceView( + "source", + "{ \"id\": \"a\", \"v\": \"v1_1\" }\n" + + "{ \"id\": \"a\", \"v\": \"v1_2\" }\n" + + "{ \"id\": \"c\", \"v\": \"v3\" }\n" + + "{ \"id\": \"d\", \"v\": \"v4_1\" }\n" + + "{ \"id\": \"d\", \"v\": \"v4_2\" }"); + + sql( + "MERGE INTO %s t USING source " + + "ON t.id == source.id " + + "WHEN NOT MATCHED THEN " + + " INSERT *", + tableName); + + ImmutableList expectedRows = + ImmutableList.of( + row("a", "v1"), // kept + row("b", "v2"), // kept + row("c", "v3"), // new + row("d", "v4_1"), // new + row("d", "v4_2") // new + ); + assertEquals( + "Output should match", expectedRows, sql("SELECT * FROM %s ORDER BY id", tableName)); } @Test public void testMergeInsertOnlyWithCondition() { createAndInitTable("id INTEGER, v INTEGER", "{ \"id\": 1, \"v\": 1 }"); - createOrReplaceView("source", - "{ \"id\": 1, \"v\": 11, \"is_new\": true }\n" + - "{ \"id\": 2, \"v\": 21, \"is_new\": true }\n" + - "{ \"id\": 2, \"v\": 22, \"is_new\": false }"); + createOrReplaceView( + "source", + "{ \"id\": 1, \"v\": 11, \"is_new\": true }\n" + + "{ \"id\": 2, \"v\": 21, \"is_new\": true }\n" + + "{ \"id\": 2, \"v\": 22, \"is_new\": false }"); // validate assignments are reordered to match the table attrs - sql("MERGE INTO %s t USING source s " + - "ON t.id == s.id " + - "WHEN NOT MATCHED AND is_new = TRUE THEN " + - " INSERT (v, id) VALUES (s.v + 100, s.id)", tableName); - - ImmutableList expectedRows = ImmutableList.of( - row(1, 1), // kept - row(2, 121) // new - ); - assertEquals("Output should match", expectedRows, sql("SELECT * FROM %s ORDER BY id", tableName)); + sql( + "MERGE INTO %s t USING source s " + + "ON t.id == s.id " + + "WHEN NOT MATCHED AND is_new = TRUE THEN " + + " 
INSERT (v, id) VALUES (s.v + 100, s.id)", + tableName); + + ImmutableList expectedRows = + ImmutableList.of( + row(1, 1), // kept + row(2, 121) // new + ); + assertEquals( + "Output should match", expectedRows, sql("SELECT * FROM %s ORDER BY id", tableName)); } @Test public void testMergeAlignsUpdateAndInsertActions() { createAndInitTable("id INT, a INT, b STRING", "{ \"id\": 1, \"a\": 2, \"b\": \"str\" }"); - createOrReplaceView("source", - "{ \"id\": 1, \"c1\": -2, \"c2\": \"new_str_1\" }\n" + - "{ \"id\": 2, \"c1\": -20, \"c2\": \"new_str_2\" }"); - - sql("MERGE INTO %s t USING source " + - "ON t.id == source.id " + - "WHEN MATCHED THEN " + - " UPDATE SET b = c2, a = c1, t.id = source.id " + - "WHEN NOT MATCHED THEN " + - " INSERT (b, a, id) VALUES (c2, c1, id)", tableName); - - assertEquals("Output should match", + createOrReplaceView( + "source", + "{ \"id\": 1, \"c1\": -2, \"c2\": \"new_str_1\" }\n" + + "{ \"id\": 2, \"c1\": -20, \"c2\": \"new_str_2\" }"); + + sql( + "MERGE INTO %s t USING source " + + "ON t.id == source.id " + + "WHEN MATCHED THEN " + + " UPDATE SET b = c2, a = c1, t.id = source.id " + + "WHEN NOT MATCHED THEN " + + " INSERT (b, a, id) VALUES (c2, c1, id)", + tableName); + + assertEquals( + "Output should match", ImmutableList.of(row(1, -2, "new_str_1"), row(2, -20, "new_str_2")), sql("SELECT * FROM %s ORDER BY id", tableName)); } @@ -1191,15 +1422,17 @@ public void testMergeMixedCaseAlignsUpdateAndInsertActions() { createAndInitTable("id INT, a INT, b STRING", "{ \"id\": 1, \"a\": 2, \"b\": \"str\" }"); createOrReplaceView( "source", - "{ \"id\": 1, \"c1\": -2, \"c2\": \"new_str_1\" }\n" + - "{ \"id\": 2, \"c1\": -20, \"c2\": \"new_str_2\" }"); - - sql("MERGE INTO %s t USING source " + - "ON t.iD == source.Id " + - "WHEN MATCHED THEN " + - " UPDATE SET B = c2, A = c1, t.Id = source.ID " + - "WHEN NOT MATCHED THEN " + - " INSERT (b, A, iD) VALUES (c2, c1, id)", tableName); + "{ \"id\": 1, \"c1\": -2, \"c2\": \"new_str_1\" }\n" + + "{ \"id\": 2, \"c1\": -20, \"c2\": \"new_str_2\" }"); + + sql( + "MERGE INTO %s t USING source " + + "ON t.iD == source.Id " + + "WHEN MATCHED THEN " + + " UPDATE SET B = c2, A = c1, t.Id = source.ID " + + "WHEN NOT MATCHED THEN " + + " INSERT (b, A, iD) VALUES (c2, c1, id)", + tableName); assertEquals( "Output should match", @@ -1218,37 +1451,47 @@ public void testMergeMixedCaseAlignsUpdateAndInsertActions() { @Test public void testMergeUpdatesNestedStructFields() { - createAndInitTable("id INT, s STRUCT,m:MAP>>", + createAndInitTable( + "id INT, s STRUCT,m:MAP>>", "{ \"id\": 1, \"s\": { \"c1\": 2, \"c2\": { \"a\": [1,2], \"m\": { \"a\": \"b\"} } } } }"); createOrReplaceView("source", "{ \"id\": 1, \"c1\": -2 }"); // update primitive, array, map columns inside a struct - sql("MERGE INTO %s t USING source " + - "ON t.id == source.id " + - "WHEN MATCHED THEN " + - " UPDATE SET t.s.c1 = source.c1, t.s.c2.a = array(-1, -2), t.s.c2.m = map('k', 'v')", tableName); + sql( + "MERGE INTO %s t USING source " + + "ON t.id == source.id " + + "WHEN MATCHED THEN " + + " UPDATE SET t.s.c1 = source.c1, t.s.c2.a = array(-1, -2), t.s.c2.m = map('k', 'v')", + tableName); - assertEquals("Output should match", + assertEquals( + "Output should match", ImmutableList.of(row(1, row(-2, row(ImmutableList.of(-1, -2), ImmutableMap.of("k", "v"))))), sql("SELECT * FROM %s ORDER BY id", tableName)); // set primitive, array, map columns to NULL (proper casts should be in place) - sql("MERGE INTO %s t USING source " + - "ON t.id == source.id " + - "WHEN MATCHED THEN " + 
- " UPDATE SET t.s.c1 = NULL, t.s.c2 = NULL", tableName); + sql( + "MERGE INTO %s t USING source " + + "ON t.id == source.id " + + "WHEN MATCHED THEN " + + " UPDATE SET t.s.c1 = NULL, t.s.c2 = NULL", + tableName); - assertEquals("Output should match", + assertEquals( + "Output should match", ImmutableList.of(row(1, row(null, null))), sql("SELECT * FROM %s ORDER BY id", tableName)); // update all fields in a struct - sql("MERGE INTO %s t USING source " + - "ON t.id == source.id " + - "WHEN MATCHED THEN " + - " UPDATE SET t.s = named_struct('c1', 100, 'c2', named_struct('a', array(1), 'm', map('x', 'y')))", tableName); + sql( + "MERGE INTO %s t USING source " + + "ON t.id == source.id " + + "WHEN MATCHED THEN " + + " UPDATE SET t.s = named_struct('c1', 100, 'c2', named_struct('a', array(1), 'm', map('x', 'y')))", + tableName); - assertEquals("Output should match", + assertEquals( + "Output should match", ImmutableList.of(row(1, row(100, row(ImmutableList.of(1), ImmutableMap.of("x", "y"))))), sql("SELECT * FROM %s ORDER BY id", tableName)); } @@ -1259,12 +1502,15 @@ public void testMergeWithInferredCasts() { createOrReplaceView("source", "{ \"id\": 1, \"c1\": -2}"); // -2 in source should be casted to "-2" in target - sql("MERGE INTO %s t USING source " + - "ON t.id == source.id " + - "WHEN MATCHED THEN " + - " UPDATE SET t.s = source.c1", tableName); + sql( + "MERGE INTO %s t USING source " + + "ON t.id == source.id " + + "WHEN MATCHED THEN " + + " UPDATE SET t.s = source.c1", + tableName); - assertEquals("Output should match", + assertEquals( + "Output should match", ImmutableList.of(row(1, "-2")), sql("SELECT * FROM %s ORDER BY id", tableName)); } @@ -1274,12 +1520,15 @@ public void testMergeModifiesNullStruct() { createAndInitTable("id INT, s STRUCT", "{ \"id\": 1, \"s\": null }"); createOrReplaceView("source", "{ \"id\": 1, \"n1\": -10 }"); - sql("MERGE INTO %s t USING source s " + - "ON t.id == s.id " + - "WHEN MATCHED THEN " + - " UPDATE SET t.s.n1 = s.n1", tableName); + sql( + "MERGE INTO %s t USING source s " + + "ON t.id == s.id " + + "WHEN MATCHED THEN " + + " UPDATE SET t.s.n1 = s.n1", + tableName); - assertEquals("Output should match", + assertEquals( + "Output should match", ImmutableList.of(row(1, row(-10, null))), sql("SELECT * FROM %s", tableName)); } @@ -1294,18 +1543,18 @@ public void testMergeRefreshesRelationCache() { spark.sql("CACHE TABLE tmp"); - assertEquals("View should have correct data", - ImmutableList.of(row("n1")), - sql("SELECT * FROM tmp")); + assertEquals( + "View should have correct data", ImmutableList.of(row("n1")), sql("SELECT * FROM tmp")); - sql("MERGE INTO %s t USING source s " + - "ON t.id == s.id " + - "WHEN MATCHED THEN " + - " UPDATE SET t.name = s.name", tableName); + sql( + "MERGE INTO %s t USING source s " + + "ON t.id == s.id " + + "WHEN MATCHED THEN " + + " UPDATE SET t.name = s.name", + tableName); - assertEquals("View should have correct data", - ImmutableList.of(row("n2")), - sql("SELECT * FROM tmp")); + assertEquals( + "View should have correct data", ImmutableList.of(row("n2")), sql("SELECT * FROM tmp")); spark.sql("UNCACHE TABLE tmp"); } @@ -1314,75 +1563,95 @@ public void testMergeRefreshesRelationCache() { public void testMergeWithMultipleNotMatchedActions() { createAndInitTable("id INT, dep STRING", "{ \"id\": 0, \"dep\": \"emp-id-0\" }"); - createOrReplaceView("source", "id INT, dep STRING", - "{ \"id\": 1, \"dep\": \"emp-id-1\" }\n" + - "{ \"id\": 2, \"dep\": \"emp-id-2\" }\n" + - "{ \"id\": 3, \"dep\": \"emp-id-3\" }"); - - 
sql("MERGE INTO %s AS t USING source AS s " + - "ON t.id == s.id " + - "WHEN NOT MATCHED AND s.id = 1 THEN " + - " INSERT (dep, id) VALUES (s.dep, -1)" + - "WHEN NOT MATCHED THEN " + - " INSERT *", tableName); - - ImmutableList expectedRows = ImmutableList.of( - row(-1, "emp-id-1"), // new - row(0, "emp-id-0"), // kept - row(2, "emp-id-2"), // new - row(3, "emp-id-3") // new - ); - assertEquals("Should have expected rows", expectedRows, sql("SELECT * FROM %s ORDER BY id", tableName)); + createOrReplaceView( + "source", + "id INT, dep STRING", + "{ \"id\": 1, \"dep\": \"emp-id-1\" }\n" + + "{ \"id\": 2, \"dep\": \"emp-id-2\" }\n" + + "{ \"id\": 3, \"dep\": \"emp-id-3\" }"); + + sql( + "MERGE INTO %s AS t USING source AS s " + + "ON t.id == s.id " + + "WHEN NOT MATCHED AND s.id = 1 THEN " + + " INSERT (dep, id) VALUES (s.dep, -1)" + + "WHEN NOT MATCHED THEN " + + " INSERT *", + tableName); + + ImmutableList expectedRows = + ImmutableList.of( + row(-1, "emp-id-1"), // new + row(0, "emp-id-0"), // kept + row(2, "emp-id-2"), // new + row(3, "emp-id-3") // new + ); + assertEquals( + "Should have expected rows", expectedRows, sql("SELECT * FROM %s ORDER BY id", tableName)); } @Test public void testMergeWithMultipleConditionalNotMatchedActions() { createAndInitTable("id INT, dep STRING", "{ \"id\": 0, \"dep\": \"emp-id-0\" }"); - createOrReplaceView("source", "id INT, dep STRING", - "{ \"id\": 1, \"dep\": \"emp-id-1\" }\n" + - "{ \"id\": 2, \"dep\": \"emp-id-2\" }\n" + - "{ \"id\": 3, \"dep\": \"emp-id-3\" }"); - - sql("MERGE INTO %s AS t USING source AS s " + - "ON t.id == s.id " + - "WHEN NOT MATCHED AND s.id = 1 THEN " + - " INSERT (dep, id) VALUES (s.dep, -1)" + - "WHEN NOT MATCHED AND s.id = 2 THEN " + - " INSERT *", tableName); - - ImmutableList expectedRows = ImmutableList.of( - row(-1, "emp-id-1"), // new - row(0, "emp-id-0"), // kept - row(2, "emp-id-2") // new - ); - assertEquals("Should have expected rows", expectedRows, sql("SELECT * FROM %s ORDER BY id", tableName)); + createOrReplaceView( + "source", + "id INT, dep STRING", + "{ \"id\": 1, \"dep\": \"emp-id-1\" }\n" + + "{ \"id\": 2, \"dep\": \"emp-id-2\" }\n" + + "{ \"id\": 3, \"dep\": \"emp-id-3\" }"); + + sql( + "MERGE INTO %s AS t USING source AS s " + + "ON t.id == s.id " + + "WHEN NOT MATCHED AND s.id = 1 THEN " + + " INSERT (dep, id) VALUES (s.dep, -1)" + + "WHEN NOT MATCHED AND s.id = 2 THEN " + + " INSERT *", + tableName); + + ImmutableList expectedRows = + ImmutableList.of( + row(-1, "emp-id-1"), // new + row(0, "emp-id-0"), // kept + row(2, "emp-id-2") // new + ); + assertEquals( + "Should have expected rows", expectedRows, sql("SELECT * FROM %s ORDER BY id", tableName)); } @Test public void testMergeResolvesColumnsByName() { - createAndInitTable("id INT, badge INT, dep STRING", - "{ \"id\": 1, \"badge\": 1000, \"dep\": \"emp-id-one\" }\n" + - "{ \"id\": 6, \"badge\": 6000, \"dep\": \"emp-id-6\" }"); - - createOrReplaceView("source", "badge INT, id INT, dep STRING", - "{ \"badge\": 1001, \"id\": 1, \"dep\": \"emp-id-1\" }\n" + - "{ \"badge\": 6006, \"id\": 6, \"dep\": \"emp-id-6\" }\n" + - "{ \"badge\": 7007, \"id\": 7, \"dep\": \"emp-id-7\" }"); - - sql("MERGE INTO %s AS t USING source AS s " + - "ON t.id == s.id " + - "WHEN MATCHED THEN " + - " UPDATE SET * " + - "WHEN NOT MATCHED THEN " + - " INSERT * ", tableName); - - ImmutableList expectedRows = ImmutableList.of( - row(1, 1001, "emp-id-1"), // updated - row(6, 6006, "emp-id-6"), // updated - row(7, 7007, "emp-id-7") // new - ); - assertEquals("Should have expected 
rows", expectedRows, + createAndInitTable( + "id INT, badge INT, dep STRING", + "{ \"id\": 1, \"badge\": 1000, \"dep\": \"emp-id-one\" }\n" + + "{ \"id\": 6, \"badge\": 6000, \"dep\": \"emp-id-6\" }"); + + createOrReplaceView( + "source", + "badge INT, id INT, dep STRING", + "{ \"badge\": 1001, \"id\": 1, \"dep\": \"emp-id-1\" }\n" + + "{ \"badge\": 6006, \"id\": 6, \"dep\": \"emp-id-6\" }\n" + + "{ \"badge\": 7007, \"id\": 7, \"dep\": \"emp-id-7\" }"); + + sql( + "MERGE INTO %s AS t USING source AS s " + + "ON t.id == s.id " + + "WHEN MATCHED THEN " + + " UPDATE SET * " + + "WHEN NOT MATCHED THEN " + + " INSERT * ", + tableName); + + ImmutableList expectedRows = + ImmutableList.of( + row(1, 1001, "emp-id-1"), // updated + row(6, 6006, "emp-id-6"), // updated + row(7, 7007, "emp-id-7") // new + ); + assertEquals( + "Should have expected rows", + expectedRows, sql("SELECT id, badge, dep FROM %s ORDER BY id", tableName)); } @@ -1392,24 +1661,30 @@ public void testMergeShouldResolveWhenThereAreNoUnresolvedExpressionsOrColumns() // or otherwise unresolved expressions exist in the query (testing SPARK-34962) createAndInitTable("id INT, dep STRING"); - createOrReplaceView("source", "id INT, dep STRING", - "{ \"id\": 1, \"dep\": \"emp-id-1\" }\n" + - "{ \"id\": 2, \"dep\": \"emp-id-2\" }\n" + - "{ \"id\": 3, \"dep\": \"emp-id-3\" }"); - - sql("MERGE INTO %s AS t USING source AS s " + - "ON 1 != 1 " + - "WHEN MATCHED THEN " + - " UPDATE SET * " + - "WHEN NOT MATCHED THEN " + - " INSERT *", tableName); - - ImmutableList expectedRows = ImmutableList.of( - row(1, "emp-id-1"), // new - row(2, "emp-id-2"), // new - row(3, "emp-id-3") // new - ); - assertEquals("Should have expected rows", expectedRows, sql("SELECT * FROM %s ORDER BY id", tableName)); + createOrReplaceView( + "source", + "id INT, dep STRING", + "{ \"id\": 1, \"dep\": \"emp-id-1\" }\n" + + "{ \"id\": 2, \"dep\": \"emp-id-2\" }\n" + + "{ \"id\": 3, \"dep\": \"emp-id-3\" }"); + + sql( + "MERGE INTO %s AS t USING source AS s " + + "ON 1 != 1 " + + "WHEN MATCHED THEN " + + " UPDATE SET * " + + "WHEN NOT MATCHED THEN " + + " INSERT *", + tableName); + + ImmutableList expectedRows = + ImmutableList.of( + row(1, "emp-id-1"), // new + row(2, "emp-id-2"), // new + row(3, "emp-id-3") // new + ); + assertEquals( + "Should have expected rows", expectedRows, sql("SELECT * FROM %s ORDER BY id", tableName)); } @Test @@ -1417,33 +1692,45 @@ public void testMergeWithNonExistingColumns() { createAndInitTable("id INT, c STRUCT>"); createOrReplaceView("source", "{ \"c1\": -100, \"c2\": -200 }"); - AssertHelpers.assertThrows("Should complain about the invalid top-level column", - AnalysisException.class, "cannot resolve t.invalid_col", + AssertHelpers.assertThrows( + "Should complain about the invalid top-level column", + AnalysisException.class, + "cannot resolve t.invalid_col", () -> { - sql("MERGE INTO %s t USING source s " + - "ON t.id == s.c1 " + - "WHEN MATCHED THEN " + - " UPDATE SET t.invalid_col = s.c2", tableName); + sql( + "MERGE INTO %s t USING source s " + + "ON t.id == s.c1 " + + "WHEN MATCHED THEN " + + " UPDATE SET t.invalid_col = s.c2", + tableName); }); - AssertHelpers.assertThrows("Should complain about the invalid nested column", - AnalysisException.class, "No such struct field invalid_col", + AssertHelpers.assertThrows( + "Should complain about the invalid nested column", + AnalysisException.class, + "No such struct field invalid_col", () -> { - sql("MERGE INTO %s t USING source s " + - "ON t.id == s.c1 " + - "WHEN MATCHED THEN " + - " 
UPDATE SET t.c.n2.invalid_col = s.c2", tableName); + sql( + "MERGE INTO %s t USING source s " + + "ON t.id == s.c1 " + + "WHEN MATCHED THEN " + + " UPDATE SET t.c.n2.invalid_col = s.c2", + tableName); }); - AssertHelpers.assertThrows("Should complain about the invalid top-level column", - AnalysisException.class, "cannot resolve invalid_col", + AssertHelpers.assertThrows( + "Should complain about the invalid top-level column", + AnalysisException.class, + "cannot resolve invalid_col", () -> { - sql("MERGE INTO %s t USING source s " + - "ON t.id == s.c1 " + - "WHEN MATCHED THEN " + - " UPDATE SET t.c.n2.dn1 = s.c2 " + - "WHEN NOT MATCHED THEN " + - " INSERT (id, invalid_col) VALUES (s.c1, null)", tableName); + sql( + "MERGE INTO %s t USING source s " + + "ON t.id == s.c1 " + + "WHEN MATCHED THEN " + + " UPDATE SET t.c.n2.dn1 = s.c2 " + + "WHEN NOT MATCHED THEN " + + " INSERT (id, invalid_col) VALUES (s.c1, null)", + tableName); }); } @@ -1452,35 +1739,47 @@ public void testMergeWithInvalidColumnsInInsert() { createAndInitTable("id INT, c STRUCT>"); createOrReplaceView("source", "{ \"c1\": -100, \"c2\": -200 }"); - AssertHelpers.assertThrows("Should complain about the nested column", - AnalysisException.class, "Nested fields are not supported inside INSERT clauses", + AssertHelpers.assertThrows( + "Should complain about the nested column", + AnalysisException.class, + "Nested fields are not supported inside INSERT clauses", () -> { - sql("MERGE INTO %s t USING source s " + - "ON t.id == s.c1 " + - "WHEN MATCHED THEN " + - " UPDATE SET t.c.n2.dn1 = s.c2 " + - "WHEN NOT MATCHED THEN " + - " INSERT (id, c.n2) VALUES (s.c1, null)", tableName); + sql( + "MERGE INTO %s t USING source s " + + "ON t.id == s.c1 " + + "WHEN MATCHED THEN " + + " UPDATE SET t.c.n2.dn1 = s.c2 " + + "WHEN NOT MATCHED THEN " + + " INSERT (id, c.n2) VALUES (s.c1, null)", + tableName); }); - AssertHelpers.assertThrows("Should complain about duplicate columns", - AnalysisException.class, "Duplicate column names inside INSERT clause", + AssertHelpers.assertThrows( + "Should complain about duplicate columns", + AnalysisException.class, + "Duplicate column names inside INSERT clause", () -> { - sql("MERGE INTO %s t USING source s " + - "ON t.id == s.c1 " + - "WHEN MATCHED THEN " + - " UPDATE SET t.c.n2.dn1 = s.c2 " + - "WHEN NOT MATCHED THEN " + - " INSERT (id, id) VALUES (s.c1, null)", tableName); + sql( + "MERGE INTO %s t USING source s " + + "ON t.id == s.c1 " + + "WHEN MATCHED THEN " + + " UPDATE SET t.c.n2.dn1 = s.c2 " + + "WHEN NOT MATCHED THEN " + + " INSERT (id, id) VALUES (s.c1, null)", + tableName); }); - AssertHelpers.assertThrows("Should complain about missing columns", - AnalysisException.class, "must provide values for all columns of the target table", + AssertHelpers.assertThrows( + "Should complain about missing columns", + AnalysisException.class, + "must provide values for all columns of the target table", () -> { - sql("MERGE INTO %s t USING source s " + - "ON t.id == s.c1 " + - "WHEN NOT MATCHED THEN " + - " INSERT (id) VALUES (s.c1)", tableName); + sql( + "MERGE INTO %s t USING source s " + + "ON t.id == s.c1 " + + "WHEN NOT MATCHED THEN " + + " INSERT (id) VALUES (s.c1)", + tableName); }); } @@ -1489,22 +1788,30 @@ public void testMergeWithInvalidUpdates() { createAndInitTable("id INT, a ARRAY>, m MAP"); createOrReplaceView("source", "{ \"c1\": -100, \"c2\": -200 }"); - AssertHelpers.assertThrows("Should complain about updating an array column", - AnalysisException.class, "Updating nested fields is only 
supported for structs", + AssertHelpers.assertThrows( + "Should complain about updating an array column", + AnalysisException.class, + "Updating nested fields is only supported for structs", () -> { - sql("MERGE INTO %s t USING source s " + - "ON t.id == s.c1 " + - "WHEN MATCHED THEN " + - " UPDATE SET t.a.c1 = s.c2", tableName); + sql( + "MERGE INTO %s t USING source s " + + "ON t.id == s.c1 " + + "WHEN MATCHED THEN " + + " UPDATE SET t.a.c1 = s.c2", + tableName); }); - AssertHelpers.assertThrows("Should complain about updating a map column", - AnalysisException.class, "Updating nested fields is only supported for structs", + AssertHelpers.assertThrows( + "Should complain about updating a map column", + AnalysisException.class, + "Updating nested fields is only supported for structs", () -> { - sql("MERGE INTO %s t USING source s " + - "ON t.id == s.c1 " + - "WHEN MATCHED THEN " + - " UPDATE SET t.m.key = 'new_key'", tableName); + sql( + "MERGE INTO %s t USING source s " + + "ON t.id == s.c1 " + + "WHEN MATCHED THEN " + + " UPDATE SET t.m.key = 'new_key'", + tableName); }); } @@ -1513,90 +1820,124 @@ public void testMergeWithConflictingUpdates() { createAndInitTable("id INT, c STRUCT>"); createOrReplaceView("source", "{ \"c1\": -100, \"c2\": -200 }"); - AssertHelpers.assertThrows("Should complain about conflicting updates to a top-level column", - AnalysisException.class, "Updates are in conflict", + AssertHelpers.assertThrows( + "Should complain about conflicting updates to a top-level column", + AnalysisException.class, + "Updates are in conflict", () -> { - sql("MERGE INTO %s t USING source s " + - "ON t.id == s.c1 " + - "WHEN MATCHED THEN " + - " UPDATE SET t.id = 1, t.c.n1 = 2, t.id = 2", tableName); + sql( + "MERGE INTO %s t USING source s " + + "ON t.id == s.c1 " + + "WHEN MATCHED THEN " + + " UPDATE SET t.id = 1, t.c.n1 = 2, t.id = 2", + tableName); }); - AssertHelpers.assertThrows("Should complain about conflicting updates to a nested column", - AnalysisException.class, "Updates are in conflict for these columns", + AssertHelpers.assertThrows( + "Should complain about conflicting updates to a nested column", + AnalysisException.class, + "Updates are in conflict for these columns", () -> { - sql("MERGE INTO %s t USING source s " + - "ON t.id == s.c1 " + - "WHEN MATCHED THEN " + - " UPDATE SET t.c.n1 = 1, t.id = 2, t.c.n1 = 2", tableName); + sql( + "MERGE INTO %s t USING source s " + + "ON t.id == s.c1 " + + "WHEN MATCHED THEN " + + " UPDATE SET t.c.n1 = 1, t.id = 2, t.c.n1 = 2", + tableName); }); - AssertHelpers.assertThrows("Should complain about conflicting updates to a nested column", - AnalysisException.class, "Updates are in conflict", + AssertHelpers.assertThrows( + "Should complain about conflicting updates to a nested column", + AnalysisException.class, + "Updates are in conflict", () -> { - sql("MERGE INTO %s t USING source s " + - "ON t.id == s.c1 " + - "WHEN MATCHED THEN " + - " UPDATE SET c.n1 = 1, c = named_struct('n1', 1, 'n2', named_struct('dn1', 1, 'dn2', 2))", tableName); + sql( + "MERGE INTO %s t USING source s " + + "ON t.id == s.c1 " + + "WHEN MATCHED THEN " + + " UPDATE SET c.n1 = 1, c = named_struct('n1', 1, 'n2', named_struct('dn1', 1, 'dn2', 2))", + tableName); }); } @Test public void testMergeWithInvalidAssignments() { - createAndInitTable("id INT NOT NULL, s STRUCT> NOT NULL"); + createAndInitTable( + "id INT NOT NULL, s STRUCT> NOT NULL"); createOrReplaceView( "source", "c1 INT, c2 STRUCT NOT NULL, c3 STRING NOT NULL, c4 STRUCT", "{ \"c1\": -100, \"c2\": 
{ \"n1\" : 1 }, \"c3\" : 'str', \"c4\": { \"dn2\": 1, \"dn2\": 2 } }"); - for (String policy : new String[]{"ansi", "strict"}) { - withSQLConf(ImmutableMap.of("spark.sql.storeAssignmentPolicy", policy), () -> { - - AssertHelpers.assertThrows("Should complain about writing nulls to a top-level column", - AnalysisException.class, "Cannot write nullable values to non-null column", - () -> { - sql("MERGE INTO %s t USING source s " + - "ON t.id == s.c1 " + - "WHEN MATCHED THEN " + - " UPDATE SET t.id = NULL", tableName); - }); - - AssertHelpers.assertThrows("Should complain about writing nulls to a nested column", - AnalysisException.class, "Cannot write nullable values to non-null column", - () -> { - sql("MERGE INTO %s t USING source s " + - "ON t.id == s.c1 " + - "WHEN MATCHED THEN " + - " UPDATE SET t.s.n1 = NULL", tableName); - }); - - AssertHelpers.assertThrows("Should complain about writing missing fields in structs", - AnalysisException.class, "missing fields", - () -> { - sql("MERGE INTO %s t USING source s " + - "ON t.id == s.c1 " + - "WHEN MATCHED THEN " + - " UPDATE SET t.s = s.c2", tableName); - }); - - AssertHelpers.assertThrows("Should complain about writing invalid data types", - AnalysisException.class, "Cannot safely cast", - () -> { - sql("MERGE INTO %s t USING source s " + - "ON t.id == s.c1 " + - "WHEN MATCHED THEN " + - " UPDATE SET t.s.n1 = s.c3", tableName); - }); - - AssertHelpers.assertThrows("Should complain about writing incompatible structs", - AnalysisException.class, "field name does not match", - () -> { - sql("MERGE INTO %s t USING source s " + - "ON t.id == s.c1 " + - "WHEN MATCHED THEN " + - " UPDATE SET t.s.n2 = s.c4", tableName); - }); - }); + for (String policy : new String[] {"ansi", "strict"}) { + withSQLConf( + ImmutableMap.of("spark.sql.storeAssignmentPolicy", policy), + () -> { + AssertHelpers.assertThrows( + "Should complain about writing nulls to a top-level column", + AnalysisException.class, + "Cannot write nullable values to non-null column", + () -> { + sql( + "MERGE INTO %s t USING source s " + + "ON t.id == s.c1 " + + "WHEN MATCHED THEN " + + " UPDATE SET t.id = NULL", + tableName); + }); + + AssertHelpers.assertThrows( + "Should complain about writing nulls to a nested column", + AnalysisException.class, + "Cannot write nullable values to non-null column", + () -> { + sql( + "MERGE INTO %s t USING source s " + + "ON t.id == s.c1 " + + "WHEN MATCHED THEN " + + " UPDATE SET t.s.n1 = NULL", + tableName); + }); + + AssertHelpers.assertThrows( + "Should complain about writing missing fields in structs", + AnalysisException.class, + "missing fields", + () -> { + sql( + "MERGE INTO %s t USING source s " + + "ON t.id == s.c1 " + + "WHEN MATCHED THEN " + + " UPDATE SET t.s = s.c2", + tableName); + }); + + AssertHelpers.assertThrows( + "Should complain about writing invalid data types", + AnalysisException.class, + "Cannot safely cast", + () -> { + sql( + "MERGE INTO %s t USING source s " + + "ON t.id == s.c1 " + + "WHEN MATCHED THEN " + + " UPDATE SET t.s.n1 = s.c3", + tableName); + }); + + AssertHelpers.assertThrows( + "Should complain about writing incompatible structs", + AnalysisException.class, + "field name does not match", + () -> { + sql( + "MERGE INTO %s t USING source s " + + "ON t.id == s.c1 " + + "WHEN MATCHED THEN " + + " UPDATE SET t.s.n2 = s.c4", + tableName); + }); + }); } } @@ -1605,40 +1946,56 @@ public void testMergeWithNonDeterministicConditions() { createAndInitTable("id INT, c STRUCT>"); createOrReplaceView("source", "{ \"c1\": 
-100, \"c2\": -200 }"); - AssertHelpers.assertThrows("Should complain about non-deterministic search conditions", - AnalysisException.class, "Non-deterministic functions are not supported", + AssertHelpers.assertThrows( + "Should complain about non-deterministic search conditions", + AnalysisException.class, + "Non-deterministic functions are not supported", () -> { - sql("MERGE INTO %s t USING source s " + - "ON t.id == s.c1 AND rand() > t.id " + - "WHEN MATCHED THEN " + - " UPDATE SET t.c.n1 = -1", tableName); + sql( + "MERGE INTO %s t USING source s " + + "ON t.id == s.c1 AND rand() > t.id " + + "WHEN MATCHED THEN " + + " UPDATE SET t.c.n1 = -1", + tableName); }); - AssertHelpers.assertThrows("Should complain about non-deterministic update conditions", - AnalysisException.class, "Non-deterministic functions are not supported", + AssertHelpers.assertThrows( + "Should complain about non-deterministic update conditions", + AnalysisException.class, + "Non-deterministic functions are not supported", () -> { - sql("MERGE INTO %s t USING source s " + - "ON t.id == s.c1 " + - "WHEN MATCHED AND rand() > t.id THEN " + - " UPDATE SET t.c.n1 = -1", tableName); + sql( + "MERGE INTO %s t USING source s " + + "ON t.id == s.c1 " + + "WHEN MATCHED AND rand() > t.id THEN " + + " UPDATE SET t.c.n1 = -1", + tableName); }); - AssertHelpers.assertThrows("Should complain about non-deterministic delete conditions", - AnalysisException.class, "Non-deterministic functions are not supported", + AssertHelpers.assertThrows( + "Should complain about non-deterministic delete conditions", + AnalysisException.class, + "Non-deterministic functions are not supported", () -> { - sql("MERGE INTO %s t USING source s " + - "ON t.id == s.c1 " + - "WHEN MATCHED AND rand() > t.id THEN " + - " DELETE", tableName); + sql( + "MERGE INTO %s t USING source s " + + "ON t.id == s.c1 " + + "WHEN MATCHED AND rand() > t.id THEN " + + " DELETE", + tableName); }); - AssertHelpers.assertThrows("Should complain about non-deterministic insert conditions", - AnalysisException.class, "Non-deterministic functions are not supported", + AssertHelpers.assertThrows( + "Should complain about non-deterministic insert conditions", + AnalysisException.class, + "Non-deterministic functions are not supported", () -> { - sql("MERGE INTO %s t USING source s " + - "ON t.id == s.c1 " + - "WHEN NOT MATCHED AND rand() > c1 THEN " + - " INSERT (id, c) VALUES (1, null)", tableName); + sql( + "MERGE INTO %s t USING source s " + + "ON t.id == s.c1 " + + "WHEN NOT MATCHED AND rand() > c1 THEN " + + " INSERT (id, c) VALUES (1, null)", + tableName); }); } @@ -1647,40 +2004,56 @@ public void testMergeWithAggregateExpressions() { createAndInitTable("id INT, c STRUCT>"); createOrReplaceView("source", "{ \"c1\": -100, \"c2\": -200 }"); - AssertHelpers.assertThrows("Should complain about agg expressions in search conditions", - AnalysisException.class, "Agg functions are not supported", + AssertHelpers.assertThrows( + "Should complain about agg expressions in search conditions", + AnalysisException.class, + "Agg functions are not supported", () -> { - sql("MERGE INTO %s t USING source s " + - "ON t.id == s.c1 AND max(t.id) == 1 " + - "WHEN MATCHED THEN " + - " UPDATE SET t.c.n1 = -1", tableName); + sql( + "MERGE INTO %s t USING source s " + + "ON t.id == s.c1 AND max(t.id) == 1 " + + "WHEN MATCHED THEN " + + " UPDATE SET t.c.n1 = -1", + tableName); }); - AssertHelpers.assertThrows("Should complain about agg expressions in update conditions", - AnalysisException.class, "Agg 
functions are not supported", + AssertHelpers.assertThrows( + "Should complain about agg expressions in update conditions", + AnalysisException.class, + "Agg functions are not supported", () -> { - sql("MERGE INTO %s t USING source s " + - "ON t.id == s.c1 " + - "WHEN MATCHED AND sum(t.id) < 1 THEN " + - " UPDATE SET t.c.n1 = -1", tableName); + sql( + "MERGE INTO %s t USING source s " + + "ON t.id == s.c1 " + + "WHEN MATCHED AND sum(t.id) < 1 THEN " + + " UPDATE SET t.c.n1 = -1", + tableName); }); - AssertHelpers.assertThrows("Should complain about agg expressions in delete conditions", - AnalysisException.class, "Agg functions are not supported", + AssertHelpers.assertThrows( + "Should complain about agg expressions in delete conditions", + AnalysisException.class, + "Agg functions are not supported", () -> { - sql("MERGE INTO %s t USING source s " + - "ON t.id == s.c1 " + - "WHEN MATCHED AND sum(t.id) THEN " + - " DELETE", tableName); + sql( + "MERGE INTO %s t USING source s " + + "ON t.id == s.c1 " + + "WHEN MATCHED AND sum(t.id) THEN " + + " DELETE", + tableName); }); - AssertHelpers.assertThrows("Should complain about agg expressions in insert conditions", - AnalysisException.class, "Agg functions are not supported", + AssertHelpers.assertThrows( + "Should complain about agg expressions in insert conditions", + AnalysisException.class, + "Agg functions are not supported", () -> { - sql("MERGE INTO %s t USING source s " + - "ON t.id == s.c1 " + - "WHEN NOT MATCHED AND sum(c1) < 1 THEN " + - " INSERT (id, c) VALUES (1, null)", tableName); + sql( + "MERGE INTO %s t USING source s " + + "ON t.id == s.c1 " + + "WHEN NOT MATCHED AND sum(c1) < 1 THEN " + + " INSERT (id, c) VALUES (1, null)", + tableName); }); } @@ -1689,40 +2062,56 @@ public void testMergeWithSubqueriesInConditions() { createAndInitTable("id INT, c STRUCT>"); createOrReplaceView("source", "{ \"c1\": -100, \"c2\": -200 }"); - AssertHelpers.assertThrows("Should complain about subquery expressions", - AnalysisException.class, "Subqueries are not supported in conditions", + AssertHelpers.assertThrows( + "Should complain about subquery expressions", + AnalysisException.class, + "Subqueries are not supported in conditions", () -> { - sql("MERGE INTO %s t USING source s " + - "ON t.id == s.c1 AND t.id < (SELECT max(c2) FROM source) " + - "WHEN MATCHED THEN " + - " UPDATE SET t.c.n1 = s.c2", tableName); + sql( + "MERGE INTO %s t USING source s " + + "ON t.id == s.c1 AND t.id < (SELECT max(c2) FROM source) " + + "WHEN MATCHED THEN " + + " UPDATE SET t.c.n1 = s.c2", + tableName); }); - AssertHelpers.assertThrows("Should complain about subquery expressions", - AnalysisException.class, "Subqueries are not supported in conditions", + AssertHelpers.assertThrows( + "Should complain about subquery expressions", + AnalysisException.class, + "Subqueries are not supported in conditions", () -> { - sql("MERGE INTO %s t USING source s " + - "ON t.id == s.c1 " + - "WHEN MATCHED AND t.id < (SELECT max(c2) FROM source) THEN " + - " UPDATE SET t.c.n1 = s.c2", tableName); + sql( + "MERGE INTO %s t USING source s " + + "ON t.id == s.c1 " + + "WHEN MATCHED AND t.id < (SELECT max(c2) FROM source) THEN " + + " UPDATE SET t.c.n1 = s.c2", + tableName); }); - AssertHelpers.assertThrows("Should complain about subquery expressions", - AnalysisException.class, "Subqueries are not supported in conditions", + AssertHelpers.assertThrows( + "Should complain about subquery expressions", + AnalysisException.class, + "Subqueries are not supported in conditions", () 
-> { - sql("MERGE INTO %s t USING source s " + - "ON t.id == s.c1 " + - "WHEN MATCHED AND t.id NOT IN (SELECT c2 FROM source) THEN " + - " DELETE", tableName); + sql( + "MERGE INTO %s t USING source s " + + "ON t.id == s.c1 " + + "WHEN MATCHED AND t.id NOT IN (SELECT c2 FROM source) THEN " + + " DELETE", + tableName); }); - AssertHelpers.assertThrows("Should complain about subquery expressions", - AnalysisException.class, "Subqueries are not supported in conditions", + AssertHelpers.assertThrows( + "Should complain about subquery expressions", + AnalysisException.class, + "Subqueries are not supported in conditions", () -> { - sql("MERGE INTO %s t USING source s " + - "ON t.id == s.c1 " + - "WHEN NOT MATCHED AND s.c1 IN (SELECT c2 FROM source) THEN " + - " INSERT (id, c) VALUES (1, null)", tableName); + sql( + "MERGE INTO %s t USING source s " + + "ON t.id == s.c1 " + + "WHEN NOT MATCHED AND s.c1 IN (SELECT c2 FROM source) THEN " + + " INSERT (id, c) VALUES (1, null)", + tableName); }); } @@ -1731,13 +2120,17 @@ public void testMergeWithTargetColumnsInInsertConditions() { createAndInitTable("id INT, c2 INT"); createOrReplaceView("source", "{ \"id\": 1, \"value\": 11 }"); - AssertHelpers.assertThrows("Should complain about the target column", - AnalysisException.class, "Cannot resolve [c2]", + AssertHelpers.assertThrows( + "Should complain about the target column", + AnalysisException.class, + "Cannot resolve [c2]", () -> { - sql("MERGE INTO %s t USING source s " + - "ON t.id == s.id " + - "WHEN NOT MATCHED AND c2 = 1 THEN " + - " INSERT (id, c2) VALUES (s.id, null)", tableName); + sql( + "MERGE INTO %s t USING source s " + + "ON t.id == s.id " + + "WHEN NOT MATCHED AND c2 = 1 THEN " + + " INSERT (id, c2) VALUES (s.id, null)", + tableName); }); } @@ -1746,19 +2139,22 @@ public void testMergeWithNonIcebergTargetTableNotSupported() { createOrReplaceView("target", "{ \"c1\": -100, \"c2\": -200 }"); createOrReplaceView("source", "{ \"c1\": -100, \"c2\": -200 }"); - AssertHelpers.assertThrows("Should complain non iceberg target table", - UnsupportedOperationException.class, "MERGE INTO TABLE is not supported temporarily.", + AssertHelpers.assertThrows( + "Should complain non iceberg target table", + UnsupportedOperationException.class, + "MERGE INTO TABLE is not supported temporarily.", () -> { - sql("MERGE INTO target t USING source s " + - "ON t.c1 == s.c1 " + - "WHEN MATCHED THEN " + - " UPDATE SET *"); + sql( + "MERGE INTO target t USING source s " + + "ON t.c1 == s.c1 " + + "WHEN MATCHED THEN " + + " UPDATE SET *"); }); } /** - * Tests a merge where both the source and target are evaluated to be partitioned by SingePartition at planning time - * but DynamicFileFilterExec will return an empty target. + * Tests a merge where both the source and target are evaluated to be partitioned by + * SingePartition at planning time but DynamicFileFilterExec will return an empty target. 
*/ @Test public void testMergeSinglePartitionPartitioning() { @@ -1768,19 +2164,14 @@ public void testMergeSinglePartitionPartitioning() { // Coalesce forces our source into a SinglePartition distribution spark.range(0, 5).coalesce(1).createOrReplaceTempView("source"); - sql("MERGE INTO %s t USING source s ON t.id = s.id " + - "WHEN MATCHED THEN UPDATE SET *" + - "WHEN NOT MATCHED THEN INSERT *", + sql( + "MERGE INTO %s t USING source s ON t.id = s.id " + + "WHEN MATCHED THEN UPDATE SET *" + + "WHEN NOT MATCHED THEN INSERT *", tableName); - ImmutableList expectedRows = ImmutableList.of( - row(-1), - row(0), - row(1), - row(2), - row(3), - row(4) - ); + ImmutableList expectedRows = + ImmutableList.of(row(-1), row(0), row(1), row(2), row(3), row(4)); List result = sql("SELECT * FROM %s ORDER BY id", tableName); assertEquals("Should correctly add the non-matching rows", expectedRows, result); @@ -1794,18 +2185,13 @@ public void testMergeEmptyTable() { // Coalesce forces our source into a SinglePartition distribution spark.range(0, 5).coalesce(1).createOrReplaceTempView("source"); - sql("MERGE INTO %s t USING source s ON t.id = s.id " + - "WHEN MATCHED THEN UPDATE SET *" + - "WHEN NOT MATCHED THEN INSERT *", + sql( + "MERGE INTO %s t USING source s ON t.id = s.id " + + "WHEN MATCHED THEN UPDATE SET *" + + "WHEN NOT MATCHED THEN INSERT *", tableName); - ImmutableList expectedRows = ImmutableList.of( - row(0), - row(1), - row(2), - row(3), - row(4) - ); + ImmutableList expectedRows = ImmutableList.of(row(0), row(1), row(2), row(3), row(4)); List result = sql("SELECT * FROM %s ORDER BY id", tableName); assertEquals("Should correctly add the non-matching rows", expectedRows, result); diff --git a/spark/v3.3/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestMergeOnReadDelete.java b/spark/v3.3/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestMergeOnReadDelete.java index d5454e036017..048040408fac 100644 --- a/spark/v3.3/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestMergeOnReadDelete.java +++ b/spark/v3.3/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestMergeOnReadDelete.java @@ -16,9 +16,12 @@ * specific language governing permissions and limitations * under the License. 
  */
-
 package org.apache.iceberg.spark.extensions;
 
+import static org.mockito.Mockito.doAnswer;
+import static org.mockito.Mockito.spy;
+import static org.mockito.Mockito.when;
+
 import java.util.Map;
 import org.apache.iceberg.AssertHelpers;
 import org.apache.iceberg.RowDelta;
@@ -34,14 +37,15 @@
 import org.junit.Test;
 import org.junit.runners.Parameterized;
 
-import static org.mockito.Mockito.doAnswer;
-import static org.mockito.Mockito.spy;
-import static org.mockito.Mockito.when;
-
 public class TestMergeOnReadDelete extends TestDelete {
 
-  public TestMergeOnReadDelete(String catalogName, String implementation, Map config,
-      String fileFormat, Boolean vectorized, String distributionMode) {
+  public TestMergeOnReadDelete(
+      String catalogName,
+      String implementation,
+      Map config,
+      String fileFormat,
+      Boolean vectorized,
+      String distributionMode) {
     super(catalogName, implementation, config, fileFormat, vectorized, distributionMode);
   }
 
@@ -49,8 +53,7 @@ public TestMergeOnReadDelete(String catalogName, String implementation, Map
   protected Map extraTableProperties() {
     return ImmutableMap.of(
         TableProperties.FORMAT_VERSION, "2",
-        TableProperties.DELETE_MODE, "merge-on-read"
-    );
+        TableProperties.DELETE_MODE, "merge-on-read");
   }
 
   @Parameterized.AfterParam
@@ -64,33 +67,42 @@ public void testCommitUnknownException() {
     // write unpartitioned files
     append(tableName, "{ \"id\": 1, \"dep\": \"hr\", \"category\": \"c1\"}");
-    append(tableName, "{ \"id\": 2, \"dep\": \"hr\", \"category\": \"c1\" }\n" +
-        "{ \"id\": 3, \"dep\": \"hr\", \"category\": \"c1\" }");
+    append(
+        tableName,
+        "{ \"id\": 2, \"dep\": \"hr\", \"category\": \"c1\" }\n"
+            + "{ \"id\": 3, \"dep\": \"hr\", \"category\": \"c1\" }");
 
     Table table = validationCatalog.loadTable(tableIdent);
     RowDelta newRowDelta = table.newRowDelta();
     RowDelta spyNewRowDelta = spy(newRowDelta);
-    doAnswer(invocation -> {
-      newRowDelta.commit();
-      throw new CommitStateUnknownException(new RuntimeException("Datacenter on Fire"));
-    }).when(spyNewRowDelta).commit();
+    doAnswer(
+            invocation -> {
+              newRowDelta.commit();
+              throw new CommitStateUnknownException(new RuntimeException("Datacenter on Fire"));
+            })
+        .when(spyNewRowDelta)
+        .commit();
 
     Table spyTable = spy(table);
     when(spyTable.newRowDelta()).thenReturn(spyNewRowDelta);
     SparkTable sparkTable = new SparkTable(spyTable, false);
 
-    ImmutableMap config = ImmutableMap.of(
-        "type", "hive",
-        "default-namespace", "default"
-    );
-    spark.conf().set("spark.sql.catalog.dummy_catalog", "org.apache.iceberg.spark.source.TestSparkCatalog");
-    config.forEach((key, value) -> spark.conf().set("spark.sql.catalog.dummy_catalog." + key, value));
-    Identifier ident = Identifier.of(new String[]{"default"}, "table");
+    ImmutableMap config =
+        ImmutableMap.of(
+            "type", "hive",
+            "default-namespace", "default");
+    spark
+        .conf()
+        .set("spark.sql.catalog.dummy_catalog", "org.apache.iceberg.spark.source.TestSparkCatalog");
+    config.forEach(
+        (key, value) -> spark.conf().set("spark.sql.catalog.dummy_catalog." + key, value));
+    Identifier ident = Identifier.of(new String[] {"default"}, "table");
     TestSparkCatalog.setTable(ident, sparkTable);
 
     // Although an exception is thrown here, write and commit have succeeded
-    AssertHelpers.assertThrowsWithCause("Should throw a Commit State Unknown Exception",
+    AssertHelpers.assertThrowsWithCause(
+        "Should throw a Commit State Unknown Exception",
         SparkException.class,
         "Writing job aborted",
         CommitStateUnknownException.class,
@@ -98,7 +110,8 @@ public void testCommitUnknownException() {
         () -> sql("DELETE FROM %s WHERE id = 2", "dummy_catalog.default.table"));
 
     // Since write and commit succeeded, the rows should be readable
-    assertEquals("Should have expected rows",
+    assertEquals(
+        "Should have expected rows",
         ImmutableList.of(row(1, "hr", "c1"), row(3, "hr", "c1")),
         sql("SELECT * FROM %s ORDER BY id", "dummy_catalog.default.table"));
   }
diff --git a/spark/v3.3/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestMergeOnReadMerge.java b/spark/v3.3/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestMergeOnReadMerge.java
index d32f3464c3d0..95e77d441ffb 100644
--- a/spark/v3.3/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestMergeOnReadMerge.java
+++ b/spark/v3.3/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestMergeOnReadMerge.java
@@ -16,7 +16,6 @@
  * specific language governing permissions and limitations
  * under the License.
  */
-
 package org.apache.iceberg.spark.extensions;
 
 import java.util.Map;
@@ -25,8 +24,13 @@ public class TestMergeOnReadMerge extends TestMerge {
 
-  public TestMergeOnReadMerge(String catalogName, String implementation, Map config,
-      String fileFormat, boolean vectorized, String distributionMode) {
+  public TestMergeOnReadMerge(
+      String catalogName,
+      String implementation,
+      Map config,
+      String fileFormat,
+      boolean vectorized,
+      String distributionMode) {
     super(catalogName, implementation, config, fileFormat, vectorized, distributionMode);
   }
 
@@ -34,7 +38,6 @@ public TestMergeOnReadMerge(String catalogName, String implementation, Map
   protected Map extraTableProperties() {
     return ImmutableMap.of(
         TableProperties.FORMAT_VERSION, "2",
-        TableProperties.MERGE_MODE, "merge-on-read"
-    );
+        TableProperties.MERGE_MODE, "merge-on-read");
   }
 }
diff --git a/spark/v3.3/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestMergeOnReadUpdate.java b/spark/v3.3/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestMergeOnReadUpdate.java
index 97c34f155894..9ac6d51b72b1 100644
--- a/spark/v3.3/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestMergeOnReadUpdate.java
+++ b/spark/v3.3/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestMergeOnReadUpdate.java
@@ -16,7 +16,6 @@
  * specific language governing permissions and limitations
  * under the License.
  */
-
 package org.apache.iceberg.spark.extensions;
 
 import java.util.Map;
@@ -25,8 +24,13 @@ public class TestMergeOnReadUpdate extends TestUpdate {
 
-  public TestMergeOnReadUpdate(String catalogName, String implementation, Map config,
-      String fileFormat, boolean vectorized, String distributionMode) {
+  public TestMergeOnReadUpdate(
+      String catalogName,
+      String implementation,
+      Map config,
+      String fileFormat,
+      boolean vectorized,
+      String distributionMode) {
     super(catalogName, implementation, config, fileFormat, vectorized, distributionMode);
   }
 
@@ -34,7 +38,6 @@ public TestMergeOnReadUpdate(String catalogName, String implementation, Map
   protected Map extraTableProperties() {
     return ImmutableMap.of(
         TableProperties.FORMAT_VERSION, "2",
-        TableProperties.UPDATE_MODE, "merge-on-read"
-    );
+        TableProperties.UPDATE_MODE, "merge-on-read");
   }
 }
diff --git a/spark/v3.3/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestMetadataTables.java b/spark/v3.3/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestMetadataTables.java
index c33cbfcce0fc..973cf4844a18 100644
--- a/spark/v3.3/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestMetadataTables.java
+++ b/spark/v3.3/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestMetadataTables.java
@@ -16,7 +16,6 @@
  * specific language governing permissions and limitations
  * under the License.
  */
-
 package org.apache.iceberg.spark.extensions;
 
 import java.io.IOException;
@@ -48,7 +47,6 @@
 import org.junit.Assert;
 import org.junit.Test;
 
-
 public class TestMetadataTables extends SparkExtensionsTestBase {
 
   public TestMetadataTables(String catalogName, String implementation, Map config) {
@@ -62,16 +60,19 @@ public void removeTables() {
 
   @Test
   public void testUnpartitionedTable() throws Exception {
-    sql("CREATE TABLE %s (id bigint, data string) USING iceberg TBLPROPERTIES" +
-        "('format-version'='2', 'write.delete.mode'='merge-on-read')", tableName);
-
-    List records = Lists.newArrayList(
-        new SimpleRecord(1, "a"),
-        new SimpleRecord(2, "b"),
-        new SimpleRecord(3, "c"),
-        new SimpleRecord(4, "d")
-    );
-    spark.createDataset(records, Encoders.bean(SimpleRecord.class))
+    sql(
+        "CREATE TABLE %s (id bigint, data string) USING iceberg TBLPROPERTIES"
+            + "('format-version'='2', 'write.delete.mode'='merge-on-read')",
+        tableName);
+
+    List records =
+        Lists.newArrayList(
+            new SimpleRecord(1, "a"),
+            new SimpleRecord(2, "b"),
+            new SimpleRecord(3, "c"),
+            new SimpleRecord(4, "d"));
+    spark
+        .createDataset(records, Encoders.bean(SimpleRecord.class))
         .coalesce(1)
         .writeTo(tableName)
         .append();
@@ -88,56 +89,66 @@ public void testUnpartitionedTable() throws Exception {
     Schema filesTableSchema = Spark3Util.loadIcebergTable(spark, tableName + ".files").schema();
 
     // check delete files table
-    List actualDeleteFiles = spark.sql("SELECT * FROM " + tableName + ".delete_files").collectAsList();
-    Assert.assertEquals("Metadata table should return one delete file", 1, actualDeleteFiles.size());
-
-    List expectedDeleteFiles = expectedEntries(table, FileContent.POSITION_DELETES,
-        entriesTableSchema, expectedDeleteManifests, null);
+    List actualDeleteFiles =
+        spark.sql("SELECT * FROM " + tableName + ".delete_files").collectAsList();
+    Assert.assertEquals(
+        "Metadata table should return one delete file", 1, actualDeleteFiles.size());
+
+    List expectedDeleteFiles =
+        expectedEntries(
+            table, FileContent.POSITION_DELETES, entriesTableSchema, expectedDeleteManifests, null);
     Assert.assertEquals("Should be one delete file
manifest entry", 1, expectedDeleteFiles.size()); - TestHelpers.assertEqualsSafe(filesTableSchema.asStruct(), expectedDeleteFiles.get(0), actualDeleteFiles.get(0)); + TestHelpers.assertEqualsSafe( + filesTableSchema.asStruct(), expectedDeleteFiles.get(0), actualDeleteFiles.get(0)); // check data files table - List actualDataFiles = spark.sql("SELECT * FROM " + tableName + ".data_files").collectAsList(); + List actualDataFiles = + spark.sql("SELECT * FROM " + tableName + ".data_files").collectAsList(); Assert.assertEquals("Metadata table should return one data file", 1, actualDataFiles.size()); - List expectedDataFiles = expectedEntries(table, FileContent.DATA, - entriesTableSchema, expectedDataManifests, null); + List expectedDataFiles = + expectedEntries(table, FileContent.DATA, entriesTableSchema, expectedDataManifests, null); Assert.assertEquals("Should be one data file manifest entry", 1, expectedDataFiles.size()); - TestHelpers.assertEqualsSafe(filesTableSchema.asStruct(), expectedDataFiles.get(0), actualDataFiles.get(0)); + TestHelpers.assertEqualsSafe( + filesTableSchema.asStruct(), expectedDataFiles.get(0), actualDataFiles.get(0)); // check all files table - List actualFiles = spark.sql("SELECT * FROM " + tableName + ".files ORDER BY content").collectAsList(); + List actualFiles = + spark.sql("SELECT * FROM " + tableName + ".files ORDER BY content").collectAsList(); Assert.assertEquals("Metadata table should return two files", 2, actualFiles.size()); - List expectedFiles = Stream.concat(expectedDataFiles.stream(), expectedDeleteFiles.stream()) - .collect(Collectors.toList()); + List expectedFiles = + Stream.concat(expectedDataFiles.stream(), expectedDeleteFiles.stream()) + .collect(Collectors.toList()); Assert.assertEquals("Should have two files manifest entries", 2, expectedFiles.size()); - TestHelpers.assertEqualsSafe(filesTableSchema.asStruct(), expectedFiles.get(0), actualFiles.get(0)); - TestHelpers.assertEqualsSafe(filesTableSchema.asStruct(), expectedFiles.get(1), actualFiles.get(1)); + TestHelpers.assertEqualsSafe( + filesTableSchema.asStruct(), expectedFiles.get(0), actualFiles.get(0)); + TestHelpers.assertEqualsSafe( + filesTableSchema.asStruct(), expectedFiles.get(1), actualFiles.get(1)); } @Test public void testPartitionedTable() throws Exception { - sql("CREATE TABLE %s (id bigint, data string) " + - "USING iceberg " + - "PARTITIONED BY (data) " + - "TBLPROPERTIES" + - "('format-version'='2', 'write.delete.mode'='merge-on-read')", tableName); - - List recordsA = Lists.newArrayList( - new SimpleRecord(1, "a"), - new SimpleRecord(2, "a") - ); - spark.createDataset(recordsA, Encoders.bean(SimpleRecord.class)) + sql( + "CREATE TABLE %s (id bigint, data string) " + + "USING iceberg " + + "PARTITIONED BY (data) " + + "TBLPROPERTIES" + + "('format-version'='2', 'write.delete.mode'='merge-on-read')", + tableName); + + List recordsA = + Lists.newArrayList(new SimpleRecord(1, "a"), new SimpleRecord(2, "a")); + spark + .createDataset(recordsA, Encoders.bean(SimpleRecord.class)) .coalesce(1) .writeTo(tableName) .append(); - List recordsB = Lists.newArrayList( - new SimpleRecord(1, "b"), - new SimpleRecord(2, "b") - ); - spark.createDataset(recordsB, Encoders.bean(SimpleRecord.class)) + List recordsB = + Lists.newArrayList(new SimpleRecord(1, "b"), new SimpleRecord(2, "b")); + spark + .createDataset(recordsB, Encoders.bean(SimpleRecord.class)) .coalesce(1) .writeTo(tableName) .append(); @@ -153,59 +164,84 @@ public void testPartitionedTable() throws Exception { 
Assert.assertEquals("Should have 2 data manifests", 2, expectedDataManifests.size()); Assert.assertEquals("Should have 2 delete manifests", 2, expectedDeleteManifests.size()); - Schema filesTableSchema = Spark3Util.loadIcebergTable(spark, tableName + ".delete_files").schema(); + Schema filesTableSchema = + Spark3Util.loadIcebergTable(spark, tableName + ".delete_files").schema(); // Check delete files table - List expectedDeleteFiles = expectedEntries(table, FileContent.POSITION_DELETES, - entriesTableSchema, expectedDeleteManifests, "a"); - Assert.assertEquals("Should have one delete file manifest entry", 1, expectedDeleteFiles.size()); - - List actualDeleteFiles = spark.sql("SELECT * FROM " + tableName + ".delete_files " + - "WHERE partition.data='a'").collectAsList(); - Assert.assertEquals("Metadata table should return one delete file", 1, actualDeleteFiles.size()); - TestHelpers.assertEqualsSafe(filesTableSchema.asStruct(), expectedDeleteFiles.get(0), actualDeleteFiles.get(0)); + List expectedDeleteFiles = + expectedEntries( + table, FileContent.POSITION_DELETES, entriesTableSchema, expectedDeleteManifests, "a"); + Assert.assertEquals( + "Should have one delete file manifest entry", 1, expectedDeleteFiles.size()); + + List actualDeleteFiles = + spark + .sql("SELECT * FROM " + tableName + ".delete_files " + "WHERE partition.data='a'") + .collectAsList(); + Assert.assertEquals( + "Metadata table should return one delete file", 1, actualDeleteFiles.size()); + TestHelpers.assertEqualsSafe( + filesTableSchema.asStruct(), expectedDeleteFiles.get(0), actualDeleteFiles.get(0)); // Check data files table - List expectedDataFiles = expectedEntries(table, FileContent.DATA, - entriesTableSchema, expectedDataManifests, "a"); + List expectedDataFiles = + expectedEntries(table, FileContent.DATA, entriesTableSchema, expectedDataManifests, "a"); Assert.assertEquals("Should have one data file manifest entry", 1, expectedDataFiles.size()); - List actualDataFiles = spark.sql("SELECT * FROM " + tableName + ".data_files " + - "WHERE partition.data='a'").collectAsList(); + List actualDataFiles = + spark + .sql("SELECT * FROM " + tableName + ".data_files " + "WHERE partition.data='a'") + .collectAsList(); Assert.assertEquals("Metadata table should return one data file", 1, actualDataFiles.size()); - TestHelpers.assertEqualsSafe(filesTableSchema.asStruct(), expectedDataFiles.get(0), actualDataFiles.get(0)); + TestHelpers.assertEqualsSafe( + filesTableSchema.asStruct(), expectedDataFiles.get(0), actualDataFiles.get(0)); List actualPartitionsWithProjection = spark.sql("SELECT file_count FROM " + tableName + ".partitions ").collectAsList(); - Assert.assertEquals("Metadata table should return two partitions record", 2, actualPartitionsWithProjection.size()); + Assert.assertEquals( + "Metadata table should return two partitions record", + 2, + actualPartitionsWithProjection.size()); for (int i = 0; i < 2; ++i) { Assert.assertEquals(1, actualPartitionsWithProjection.get(i).get(0)); } // Check files table - List expectedFiles = Stream.concat(expectedDataFiles.stream(), expectedDeleteFiles.stream()) - .collect(Collectors.toList()); + List expectedFiles = + Stream.concat(expectedDataFiles.stream(), expectedDeleteFiles.stream()) + .collect(Collectors.toList()); Assert.assertEquals("Should have two file manifest entries", 2, expectedFiles.size()); - List actualFiles = spark.sql("SELECT * FROM " + tableName + ".files " + - "WHERE partition.data='a' ORDER BY content").collectAsList(); + List actualFiles = + spark + .sql( + 
"SELECT * FROM " + + tableName + + ".files " + + "WHERE partition.data='a' ORDER BY content") + .collectAsList(); Assert.assertEquals("Metadata table should return two files", 2, actualFiles.size()); - TestHelpers.assertEqualsSafe(filesTableSchema.asStruct(), expectedFiles.get(0), actualFiles.get(0)); - TestHelpers.assertEqualsSafe(filesTableSchema.asStruct(), expectedFiles.get(1), actualFiles.get(1)); + TestHelpers.assertEqualsSafe( + filesTableSchema.asStruct(), expectedFiles.get(0), actualFiles.get(0)); + TestHelpers.assertEqualsSafe( + filesTableSchema.asStruct(), expectedFiles.get(1), actualFiles.get(1)); } @Test public void testAllFilesUnpartitioned() throws Exception { - sql("CREATE TABLE %s (id bigint, data string) USING iceberg TBLPROPERTIES" + - "('format-version'='2', 'write.delete.mode'='merge-on-read')", tableName); - - List records = Lists.newArrayList( - new SimpleRecord(1, "a"), - new SimpleRecord(2, "b"), - new SimpleRecord(3, "c"), - new SimpleRecord(4, "d") - ); - spark.createDataset(records, Encoders.bean(SimpleRecord.class)) + sql( + "CREATE TABLE %s (id bigint, data string) USING iceberg TBLPROPERTIES" + + "('format-version'='2', 'write.delete.mode'='merge-on-read')", + tableName); + + List records = + Lists.newArrayList( + new SimpleRecord(1, "a"), + new SimpleRecord(2, "b"), + new SimpleRecord(3, "c"), + new SimpleRecord(4, "d")); + spark + .createDataset(records, Encoders.bean(SimpleRecord.class)) .coalesce(1) .writeTo(tableName) .append(); @@ -224,28 +260,35 @@ public void testAllFilesUnpartitioned() throws Exception { Assert.assertEquals("Table should be cleared", 0, results.size()); Schema entriesTableSchema = Spark3Util.loadIcebergTable(spark, tableName + ".entries").schema(); - Schema filesTableSchema = Spark3Util.loadIcebergTable(spark, tableName + ".all_data_files").schema(); + Schema filesTableSchema = + Spark3Util.loadIcebergTable(spark, tableName + ".all_data_files").schema(); // Check all data files table - List actualDataFiles = spark.sql("SELECT * FROM " + tableName + ".all_data_files").collectAsList(); + List actualDataFiles = + spark.sql("SELECT * FROM " + tableName + ".all_data_files").collectAsList(); - List expectedDataFiles = expectedEntries(table, FileContent.DATA, - entriesTableSchema, expectedDataManifests, null); + List expectedDataFiles = + expectedEntries(table, FileContent.DATA, entriesTableSchema, expectedDataManifests, null); Assert.assertEquals("Should be one data file manifest entry", 1, expectedDataFiles.size()); Assert.assertEquals("Metadata table should return one data file", 1, actualDataFiles.size()); - TestHelpers.assertEqualsSafe(filesTableSchema.asStruct(), expectedDataFiles.get(0), actualDataFiles.get(0)); + TestHelpers.assertEqualsSafe( + filesTableSchema.asStruct(), expectedDataFiles.get(0), actualDataFiles.get(0)); // Check all delete files table - List actualDeleteFiles = spark.sql("SELECT * FROM " + tableName + ".all_delete_files").collectAsList(); - List expectedDeleteFiles = expectedEntries(table, FileContent.POSITION_DELETES, - entriesTableSchema, expectedDeleteManifests, null); + List actualDeleteFiles = + spark.sql("SELECT * FROM " + tableName + ".all_delete_files").collectAsList(); + List expectedDeleteFiles = + expectedEntries( + table, FileContent.POSITION_DELETES, entriesTableSchema, expectedDeleteManifests, null); Assert.assertEquals("Should be one delete file manifest entry", 1, expectedDeleteFiles.size()); - Assert.assertEquals("Metadata table should return one delete file", 1, actualDeleteFiles.size()); - 
TestHelpers.assertEqualsSafe(filesTableSchema.asStruct(), expectedDeleteFiles.get(0), actualDeleteFiles.get(0)); + Assert.assertEquals( + "Metadata table should return one delete file", 1, actualDeleteFiles.size()); + TestHelpers.assertEqualsSafe( + filesTableSchema.asStruct(), expectedDeleteFiles.get(0), actualDeleteFiles.get(0)); // Check all files table - List actualFiles = spark.sql("SELECT * FROM " + tableName + ".all_files ORDER BY content") - .collectAsList(); + List actualFiles = + spark.sql("SELECT * FROM " + tableName + ".all_files ORDER BY content").collectAsList(); List expectedFiles = ListUtils.union(expectedDataFiles, expectedDeleteFiles); expectedFiles.sort(Comparator.comparing(r -> ((Integer) r.get("content")))); Assert.assertEquals("Metadata table should return two files", 2, actualFiles.size()); @@ -255,26 +298,26 @@ public void testAllFilesUnpartitioned() throws Exception { @Test public void testAllFilesPartitioned() throws Exception { // Create table and insert data - sql("CREATE TABLE %s (id bigint, data string) " + - "USING iceberg " + - "PARTITIONED BY (data) " + - "TBLPROPERTIES" + - "('format-version'='2', 'write.delete.mode'='merge-on-read')", tableName); - - List recordsA = Lists.newArrayList( - new SimpleRecord(1, "a"), - new SimpleRecord(2, "a") - ); - spark.createDataset(recordsA, Encoders.bean(SimpleRecord.class)) + sql( + "CREATE TABLE %s (id bigint, data string) " + + "USING iceberg " + + "PARTITIONED BY (data) " + + "TBLPROPERTIES" + + "('format-version'='2', 'write.delete.mode'='merge-on-read')", + tableName); + + List recordsA = + Lists.newArrayList(new SimpleRecord(1, "a"), new SimpleRecord(2, "a")); + spark + .createDataset(recordsA, Encoders.bean(SimpleRecord.class)) .coalesce(1) .writeTo(tableName) .append(); - List recordsB = Lists.newArrayList( - new SimpleRecord(1, "b"), - new SimpleRecord(2, "b") - ); - spark.createDataset(recordsB, Encoders.bean(SimpleRecord.class)) + List recordsB = + Lists.newArrayList(new SimpleRecord(1, "b"), new SimpleRecord(2, "b")); + spark + .createDataset(recordsB, Encoders.bean(SimpleRecord.class)) .coalesce(1) .writeTo(tableName) .append(); @@ -293,30 +336,44 @@ public void testAllFilesPartitioned() throws Exception { Assert.assertEquals("Table should be cleared", 0, results.size()); Schema entriesTableSchema = Spark3Util.loadIcebergTable(spark, tableName + ".entries").schema(); - Schema filesTableSchema = Spark3Util.loadIcebergTable(spark, tableName + ".all_data_files").schema(); + Schema filesTableSchema = + Spark3Util.loadIcebergTable(spark, tableName + ".all_data_files").schema(); // Check all data files table - List actualDataFiles = spark.sql("SELECT * FROM " + tableName + ".all_data_files " + - "WHERE partition.data='a'").collectAsList(); - List expectedDataFiles = expectedEntries(table, FileContent.DATA, - entriesTableSchema, expectedDataManifests, "a"); + List actualDataFiles = + spark + .sql("SELECT * FROM " + tableName + ".all_data_files " + "WHERE partition.data='a'") + .collectAsList(); + List expectedDataFiles = + expectedEntries(table, FileContent.DATA, entriesTableSchema, expectedDataManifests, "a"); Assert.assertEquals("Should be one data file manifest entry", 1, expectedDataFiles.size()); Assert.assertEquals("Metadata table should return one data file", 1, actualDataFiles.size()); - TestHelpers.assertEqualsSafe(filesTableSchema.asStruct(), expectedDataFiles.get(0), actualDataFiles.get(0)); + TestHelpers.assertEqualsSafe( + filesTableSchema.asStruct(), expectedDataFiles.get(0), 
actualDataFiles.get(0)); // Check all delete files table - List actualDeleteFiles = spark.sql("SELECT * FROM " + tableName + ".all_delete_files " + - "WHERE partition.data='a'").collectAsList(); - List expectedDeleteFiles = expectedEntries(table, FileContent.POSITION_DELETES, - entriesTableSchema, expectedDeleteManifests, "a"); + List actualDeleteFiles = + spark + .sql("SELECT * FROM " + tableName + ".all_delete_files " + "WHERE partition.data='a'") + .collectAsList(); + List expectedDeleteFiles = + expectedEntries( + table, FileContent.POSITION_DELETES, entriesTableSchema, expectedDeleteManifests, "a"); Assert.assertEquals("Should be one data file manifest entry", 1, expectedDeleteFiles.size()); Assert.assertEquals("Metadata table should return one data file", 1, actualDeleteFiles.size()); - TestHelpers.assertEqualsSafe(filesTableSchema.asStruct(), expectedDeleteFiles.get(0), actualDeleteFiles.get(0)); + TestHelpers.assertEqualsSafe( + filesTableSchema.asStruct(), expectedDeleteFiles.get(0), actualDeleteFiles.get(0)); // Check all files table - List actualFiles = spark.sql("SELECT * FROM " + tableName + ".all_files WHERE partition.data='a' " + - "ORDER BY content").collectAsList(); + List actualFiles = + spark + .sql( + "SELECT * FROM " + + tableName + + ".all_files WHERE partition.data='a' " + + "ORDER BY content") + .collectAsList(); List expectedFiles = ListUtils.union(expectedDataFiles, expectedDeleteFiles); expectedFiles.sort(Comparator.comparing(r -> ((Integer) r.get("content")))); Assert.assertEquals("Metadata table should return two files", 2, actualFiles.size()); @@ -326,96 +383,103 @@ public void testAllFilesPartitioned() throws Exception { @Test public void testMetadataLogs() throws Exception { // Create table and insert data - sql("CREATE TABLE %s (id bigint, data string) " + - "USING iceberg " + - "PARTITIONED BY (data) " + - "TBLPROPERTIES " + - "('format-version'='2')", tableName); - - List recordsA = Lists.newArrayList( - new SimpleRecord(1, "a"), - new SimpleRecord(2, "a") - ); - spark.createDataset(recordsA, Encoders.bean(SimpleRecord.class)) - .writeTo(tableName) - .append(); - - List recordsB = Lists.newArrayList( - new SimpleRecord(1, "b"), - new SimpleRecord(2, "b") - ); - spark.createDataset(recordsB, Encoders.bean(SimpleRecord.class)) - .writeTo(tableName) - .append(); + sql( + "CREATE TABLE %s (id bigint, data string) " + + "USING iceberg " + + "PARTITIONED BY (data) " + + "TBLPROPERTIES " + + "('format-version'='2')", + tableName); + + List recordsA = + Lists.newArrayList(new SimpleRecord(1, "a"), new SimpleRecord(2, "a")); + spark.createDataset(recordsA, Encoders.bean(SimpleRecord.class)).writeTo(tableName).append(); + + List recordsB = + Lists.newArrayList(new SimpleRecord(1, "b"), new SimpleRecord(2, "b")); + spark.createDataset(recordsB, Encoders.bean(SimpleRecord.class)).writeTo(tableName).append(); Table table = Spark3Util.loadIcebergTable(spark, tableName); Long currentSnapshotId = table.currentSnapshot().snapshotId(); TableMetadata tableMetadata = ((HasTableOperations) table).operations().current(); Snapshot currentSnapshot = tableMetadata.currentSnapshot(); Snapshot parentSnapshot = table.snapshot(currentSnapshot.parentId()); - List metadataLogEntries = Lists.newArrayList(tableMetadata.previousFiles()); + List metadataLogEntries = + Lists.newArrayList(tableMetadata.previousFiles()); // Check metadataLog table List metadataLogs = sql("SELECT * FROM %s.metadata_logs", tableName); - assertEquals("MetadataLogsTable result should match the metadataLog 
entries", + assertEquals( + "MetadataLogsTable result should match the metadataLog entries", ImmutableList.of( row( metadataLogEntries.get(0).timestampMillis(), metadataLogEntries.get(0).file(), null, null, - null - ), + null), row( metadataLogEntries.get(1).timestampMillis(), metadataLogEntries.get(1).file(), parentSnapshot.snapshotId(), parentSnapshot.schemaId(), - parentSnapshot.sequenceNumber() - ), + parentSnapshot.sequenceNumber()), row( currentSnapshot.timestampMillis(), tableMetadata.metadataFileLocation(), currentSnapshot.snapshotId(), currentSnapshot.schemaId(), - currentSnapshot.sequenceNumber() - )), + currentSnapshot.sequenceNumber())), metadataLogs); // test filtering List metadataLogWithFilters = - sql("SELECT * FROM %s.metadata_logs WHERE latest_snapshot_id = %s", tableName, currentSnapshotId); + sql( + "SELECT * FROM %s.metadata_logs WHERE latest_snapshot_id = %s", + tableName, currentSnapshotId); Assert.assertEquals("metadataLog table should return 1 row", 1, metadataLogWithFilters.size()); - assertEquals("Result should match the latest snapshot entry", - ImmutableList.of(row( - tableMetadata.currentSnapshot().timestampMillis(), - tableMetadata.metadataFileLocation(), - tableMetadata.currentSnapshot().snapshotId(), - tableMetadata.currentSnapshot().schemaId(), - tableMetadata.currentSnapshot().sequenceNumber())), + assertEquals( + "Result should match the latest snapshot entry", + ImmutableList.of( + row( + tableMetadata.currentSnapshot().timestampMillis(), + tableMetadata.metadataFileLocation(), + tableMetadata.currentSnapshot().snapshotId(), + tableMetadata.currentSnapshot().schemaId(), + tableMetadata.currentSnapshot().sequenceNumber())), metadataLogWithFilters); // test projection List metadataFiles = - metadataLogEntries.stream().map(TableMetadata.MetadataLogEntry::file).collect(Collectors.toList()); + metadataLogEntries.stream() + .map(TableMetadata.MetadataLogEntry::file) + .collect(Collectors.toList()); metadataFiles.add(tableMetadata.metadataFileLocation()); List metadataLogWithProjection = sql("SELECT file FROM %s.metadata_logs", tableName); - Assert.assertEquals("metadataLog table should return 3 rows", 3, metadataLogWithProjection.size()); - assertEquals("metadataLog entry should be of same file", + Assert.assertEquals( + "metadataLog table should return 3 rows", 3, metadataLogWithProjection.size()); + assertEquals( + "metadataLog entry should be of same file", metadataFiles.stream().map(this::row).collect(Collectors.toList()), metadataLogWithProjection); } /** * Find matching manifest entries of an Iceberg table + * * @param table iceberg table * @param expectedContent file content to populate on entries * @param entriesTableSchema schema of Manifest entries * @param manifestsToExplore manifests to explore of the table * @param partValue partition value that manifest entries must match, or null to skip filtering */ - private List expectedEntries(Table table, FileContent expectedContent, Schema entriesTableSchema, - List manifestsToExplore, String partValue) throws IOException { + private List expectedEntries( + Table table, + FileContent expectedContent, + Schema entriesTableSchema, + List manifestsToExplore, + String partValue) + throws IOException { List expected = Lists.newArrayList(); for (ManifestFile manifest : manifestsToExplore) { InputFile in = table.io().newInputFile(manifest.path()); diff --git a/spark/v3.3/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestMigrateTableProcedure.java 
b/spark/v3.3/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestMigrateTableProcedure.java index d66e75add16f..f9c150a3b1dc 100644 --- a/spark/v3.3/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestMigrateTableProcedure.java +++ b/spark/v3.3/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestMigrateTableProcedure.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.extensions; import java.io.IOException; @@ -35,12 +34,12 @@ public class TestMigrateTableProcedure extends SparkExtensionsTestBase { - public TestMigrateTableProcedure(String catalogName, String implementation, Map config) { + public TestMigrateTableProcedure( + String catalogName, String implementation, Map config) { super(catalogName, implementation, config); } - @Rule - public TemporaryFolder temp = new TemporaryFolder(); + @Rule public TemporaryFolder temp = new TemporaryFolder(); @After public void removeTables() { @@ -52,7 +51,9 @@ public void removeTables() { public void testMigrate() throws IOException { Assume.assumeTrue(catalogName.equals("spark_catalog")); String location = temp.newFolder().toString(); - sql("CREATE TABLE %s (id bigint NOT NULL, data string) USING parquet LOCATION '%s'", tableName, location); + sql( + "CREATE TABLE %s (id bigint NOT NULL, data string) USING parquet LOCATION '%s'", + tableName, location); sql("INSERT INTO TABLE %s VALUES (1, 'a')", tableName); Object result = scalarSql("CALL %s.system.migrate('%s')", catalogName, tableName); @@ -65,7 +66,8 @@ public void testMigrate() throws IOException { sql("INSERT INTO TABLE %s VALUES (1, 'a')", tableName); - assertEquals("Should have expected rows", + assertEquals( + "Should have expected rows", ImmutableList.of(row(1L, "a"), row(1L, "a")), sql("SELECT * FROM %s ORDER BY id", tableName)); @@ -76,10 +78,13 @@ public void testMigrate() throws IOException { public void testMigrateWithOptions() throws IOException { Assume.assumeTrue(catalogName.equals("spark_catalog")); String location = temp.newFolder().toString(); - sql("CREATE TABLE %s (id bigint NOT NULL, data string) USING parquet LOCATION '%s'", tableName, location); + sql( + "CREATE TABLE %s (id bigint NOT NULL, data string) USING parquet LOCATION '%s'", + tableName, location); sql("INSERT INTO TABLE %s VALUES (1, 'a')", tableName); - Object result = scalarSql("CALL %s.system.migrate('%s', map('foo', 'bar'))", catalogName, tableName); + Object result = + scalarSql("CALL %s.system.migrate('%s', map('foo', 'bar'))", catalogName, tableName); Assert.assertEquals("Should have added one file", 1L, result); @@ -93,7 +98,8 @@ public void testMigrateWithOptions() throws IOException { sql("INSERT INTO TABLE %s VALUES (1, 'a')", tableName); - assertEquals("Should have expected rows", + assertEquals( + "Should have expected rows", ImmutableList.of(row(1L, "a"), row(1L, "a")), sql("SELECT * FROM %s ORDER BY id", tableName)); @@ -105,10 +111,14 @@ public void testMigrateWithInvalidMetricsConfig() throws IOException { Assume.assumeTrue(catalogName.equals("spark_catalog")); String location = temp.newFolder().toString(); - sql("CREATE TABLE %s (id bigint NOT NULL, data string) USING parquet LOCATION '%s'", tableName, location); - - AssertHelpers.assertThrows("Should reject invalid metrics config", - ValidationException.class, "Invalid metrics config", + sql( + "CREATE TABLE %s (id bigint NOT NULL, data string) USING parquet LOCATION '%s'", + tableName, location); + + 
AssertHelpers.assertThrows( + "Should reject invalid metrics config", + ValidationException.class, + "Invalid metrics config", () -> { String props = "map('write.metadata.metrics.column.x', 'X')"; sql("CALL %s.system.migrate('%s', %s)", catalogName, tableName, props); @@ -120,13 +130,17 @@ public void testMigrateWithConflictingProps() throws IOException { Assume.assumeTrue(catalogName.equals("spark_catalog")); String location = temp.newFolder().toString(); - sql("CREATE TABLE %s (id bigint NOT NULL, data string) USING parquet LOCATION '%s'", tableName, location); + sql( + "CREATE TABLE %s (id bigint NOT NULL, data string) USING parquet LOCATION '%s'", + tableName, location); sql("INSERT INTO TABLE %s VALUES (1, 'a')", tableName); - Object result = scalarSql("CALL %s.system.migrate('%s', map('migrated', 'false'))", catalogName, tableName); + Object result = + scalarSql("CALL %s.system.migrate('%s', map('migrated', 'false'))", catalogName, tableName); Assert.assertEquals("Should have added one file", 1L, result); - assertEquals("Should have expected rows", + assertEquals( + "Should have expected rows", ImmutableList.of(row(1L, "a")), sql("SELECT * FROM %s", tableName)); @@ -136,16 +150,22 @@ public void testMigrateWithConflictingProps() throws IOException { @Test public void testInvalidMigrateCases() { - AssertHelpers.assertThrows("Should reject calls without all required args", - AnalysisException.class, "Missing required parameters", + AssertHelpers.assertThrows( + "Should reject calls without all required args", + AnalysisException.class, + "Missing required parameters", () -> sql("CALL %s.system.migrate()", catalogName)); - AssertHelpers.assertThrows("Should reject calls with invalid arg types", - AnalysisException.class, "Wrong arg type", + AssertHelpers.assertThrows( + "Should reject calls with invalid arg types", + AnalysisException.class, + "Wrong arg type", () -> sql("CALL %s.system.migrate(map('foo','bar'))", catalogName)); - AssertHelpers.assertThrows("Should reject calls with empty table identifier", - IllegalArgumentException.class, "Cannot handle an empty identifier", + AssertHelpers.assertThrows( + "Should reject calls with empty table identifier", + IllegalArgumentException.class, + "Cannot handle an empty identifier", () -> sql("CALL %s.system.migrate('')", catalogName)); } } diff --git a/spark/v3.3/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestPublishChangesProcedure.java b/spark/v3.3/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestPublishChangesProcedure.java index f8080818a1e3..2b74cd475fae 100644 --- a/spark/v3.3/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestPublishChangesProcedure.java +++ b/spark/v3.3/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestPublishChangesProcedure.java @@ -16,9 +16,10 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.spark.extensions; +import static org.apache.iceberg.TableProperties.WRITE_AUDIT_PUBLISH_ENABLED; + import java.util.List; import java.util.Map; import org.apache.iceberg.AssertHelpers; @@ -34,11 +35,10 @@ import org.junit.After; import org.junit.Test; -import static org.apache.iceberg.TableProperties.WRITE_AUDIT_PUBLISH_ENABLED; - public class TestPublishChangesProcedure extends SparkExtensionsTestBase { - public TestPublishChangesProcedure(String catalogName, String implementation, Map config) { + public TestPublishChangesProcedure( + String catalogName, String implementation, Map config) { super(catalogName, implementation, config); } @@ -57,26 +57,28 @@ public void testApplyWapChangesUsingPositionalArgs() { sql("INSERT INTO TABLE %s VALUES (1, 'a')", tableName); - assertEquals("Should not see rows from staged snapshot", + assertEquals( + "Should not see rows from staged snapshot", ImmutableList.of(), sql("SELECT * FROM %s", tableName)); Table table = validationCatalog.loadTable(tableIdent); Snapshot wapSnapshot = Iterables.getOnlyElement(table.snapshots()); - List output = sql( - "CALL %s.system.publish_changes('%s', '%s')", - catalogName, tableIdent, wapId); + List output = + sql("CALL %s.system.publish_changes('%s', '%s')", catalogName, tableIdent, wapId); table.refresh(); Snapshot currentSnapshot = table.currentSnapshot(); - assertEquals("Procedure output must match", + assertEquals( + "Procedure output must match", ImmutableList.of(row(wapSnapshot.snapshotId(), currentSnapshot.snapshotId())), output); - assertEquals("Apply of WAP changes must be successful", + assertEquals( + "Apply of WAP changes must be successful", ImmutableList.of(row(1L, "a")), sql("SELECT * FROM %s", tableName)); } @@ -91,26 +93,30 @@ public void testApplyWapChangesUsingNamedArgs() { sql("INSERT INTO TABLE %s VALUES (1, 'a')", tableName); - assertEquals("Should not see rows from staged snapshot", + assertEquals( + "Should not see rows from staged snapshot", ImmutableList.of(), sql("SELECT * FROM %s", tableName)); Table table = validationCatalog.loadTable(tableIdent); Snapshot wapSnapshot = Iterables.getOnlyElement(table.snapshots()); - List output = sql( - "CALL %s.system.publish_changes(wap_id => '%s', table => '%s')", - catalogName, wapId, tableIdent); + List output = + sql( + "CALL %s.system.publish_changes(wap_id => '%s', table => '%s')", + catalogName, wapId, tableIdent); table.refresh(); Snapshot currentSnapshot = table.currentSnapshot(); - assertEquals("Procedure output must match", + assertEquals( + "Procedure output must match", ImmutableList.of(row(wapSnapshot.snapshotId(), currentSnapshot.snapshotId())), output); - assertEquals("Apply of WAP changes must be successful", + assertEquals( + "Apply of WAP changes must be successful", ImmutableList.of(row(1L, "a")), sql("SELECT * FROM %s", tableName)); } @@ -132,14 +138,15 @@ public void testApplyWapChangesRefreshesRelationCache() { sql("INSERT INTO TABLE %s VALUES (1, 'a')", tableName); - assertEquals("Should not see rows from staged snapshot", + assertEquals( + "Should not see rows from staged snapshot", ImmutableList.of(), sql("SELECT * FROM %s", tableName)); - sql("CALL %s.system.publish_changes('%s', '%s')", - catalogName, tableIdent, wapId); + sql("CALL %s.system.publish_changes('%s', '%s')", catalogName, tableIdent, wapId); - assertEquals("Apply of WAP changes should be visible", + assertEquals( + "Apply of WAP changes should be visible", ImmutableList.of(row(1L, "a")), sql("SELECT * FROM tmp")); @@ -150,27 +157,37 
@@ public void testApplyWapChangesRefreshesRelationCache() { public void testApplyInvalidWapId() { sql("CREATE TABLE %s (id bigint NOT NULL, data string) USING iceberg", tableName); - AssertHelpers.assertThrows("Should reject invalid wap id", - ValidationException.class, "Cannot apply unknown WAP ID", + AssertHelpers.assertThrows( + "Should reject invalid wap id", + ValidationException.class, + "Cannot apply unknown WAP ID", () -> sql("CALL %s.system.publish_changes('%s', 'not_valid')", catalogName, tableIdent)); } @Test public void testInvalidApplyWapChangesCases() { - AssertHelpers.assertThrows("Should not allow mixed args", - AnalysisException.class, "Named and positional arguments cannot be mixed", + AssertHelpers.assertThrows( + "Should not allow mixed args", + AnalysisException.class, + "Named and positional arguments cannot be mixed", () -> sql("CALL %s.system.publish_changes('n', table => 't', 'not_valid')", catalogName)); - AssertHelpers.assertThrows("Should not resolve procedures in arbitrary namespaces", - NoSuchProcedureException.class, "not found", + AssertHelpers.assertThrows( + "Should not resolve procedures in arbitrary namespaces", + NoSuchProcedureException.class, + "not found", () -> sql("CALL %s.custom.publish_changes('n', 't', 'not_valid')", catalogName)); - AssertHelpers.assertThrows("Should reject calls without all required args", - AnalysisException.class, "Missing required parameters", + AssertHelpers.assertThrows( + "Should reject calls without all required args", + AnalysisException.class, + "Missing required parameters", () -> sql("CALL %s.system.publish_changes('t')", catalogName)); - AssertHelpers.assertThrows("Should reject calls with empty table identifier", - IllegalArgumentException.class, "Cannot handle an empty identifier", + AssertHelpers.assertThrows( + "Should reject calls with empty table identifier", + IllegalArgumentException.class, + "Cannot handle an empty identifier", () -> sql("CALL %s.system.publish_changes('', 'not_valid')", catalogName)); } } diff --git a/spark/v3.3/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestRegisterTableProcedure.java b/spark/v3.3/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestRegisterTableProcedure.java index d1f1905e098c..b1a1e30ec8d9 100644 --- a/spark/v3.3/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestRegisterTableProcedure.java +++ b/spark/v3.3/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestRegisterTableProcedure.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.extensions; import java.util.List; @@ -41,15 +40,12 @@ public class TestRegisterTableProcedure extends SparkExtensionsTestBase { private final String targetName; public TestRegisterTableProcedure( - String catalogName, - String implementation, - Map config) { + String catalogName, String implementation, Map config) { super(catalogName, implementation, config); targetName = tableName("register_table"); } - @Rule - public TemporaryFolder temp = new TemporaryFolder(); + @Rule public TemporaryFolder temp = new TemporaryFolder(); @After public void dropTables() { @@ -59,13 +55,15 @@ public void dropTables() { @Test public void testRegisterTable() throws NoSuchTableException, ParseException { - Assume.assumeTrue("Register only implemented on Hive Catalogs", + Assume.assumeTrue( + "Register only implemented on Hive Catalogs", spark.conf().get("spark.sql.catalog." 
+ catalogName + ".type").equals("hive")); long numRows = 1000; sql("CREATE TABLE %s (id int, data string) using ICEBERG", tableName); - spark.range(0, numRows) + spark + .range(0, numRows) .withColumn("data", functions.col("id").cast(DataTypes.StringType)) .writeTo(tableName) .append(); @@ -73,17 +71,22 @@ public void testRegisterTable() throws NoSuchTableException, ParseException { Table table = Spark3Util.loadIcebergTable(spark, tableName); long originalFileCount = (long) scalarSql("SELECT COUNT(*) from %s.files", tableName); long currentSnapshotId = table.currentSnapshot().snapshotId(); - String metadataJson = ((HiveTableOperations) (((HasTableOperations) table).operations())).currentMetadataLocation(); + String metadataJson = + ((HiveTableOperations) (((HasTableOperations) table).operations())) + .currentMetadataLocation(); - List result = sql("CALL %s.system.register_table('%s', '%s')", catalogName, targetName, metadataJson); + List result = + sql("CALL %s.system.register_table('%s', '%s')", catalogName, targetName, metadataJson); Assert.assertEquals("Current Snapshot is not correct", currentSnapshotId, result.get(0)[0]); List original = sql("SELECT * FROM %s", tableName); List registered = sql("SELECT * FROM %s", targetName); assertEquals("Registered table rows should match original table rows", original, registered); - Assert.assertEquals("Should have the right row count in the procedure result", - numRows, result.get(0)[1]); - Assert.assertEquals("Should have the right datafile count in the procedure result", - originalFileCount, result.get(0)[2]); + Assert.assertEquals( + "Should have the right row count in the procedure result", numRows, result.get(0)[1]); + Assert.assertEquals( + "Should have the right datafile count in the procedure result", + originalFileCount, + result.get(0)[2]); } } diff --git a/spark/v3.3/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestRemoveOrphanFilesProcedure.java b/spark/v3.3/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestRemoveOrphanFilesProcedure.java index e3a3bbf64b87..d9ccda55526b 100644 --- a/spark/v3.3/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestRemoveOrphanFilesProcedure.java +++ b/spark/v3.3/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestRemoveOrphanFilesProcedure.java @@ -16,9 +16,11 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.spark.extensions; +import static org.apache.iceberg.TableProperties.GC_ENABLED; +import static org.apache.iceberg.TableProperties.WRITE_AUDIT_PUBLISH_ENABLED; + import java.io.IOException; import java.sql.Timestamp; import java.time.Instant; @@ -46,15 +48,12 @@ import org.junit.Test; import org.junit.rules.TemporaryFolder; -import static org.apache.iceberg.TableProperties.GC_ENABLED; -import static org.apache.iceberg.TableProperties.WRITE_AUDIT_PUBLISH_ENABLED; - public class TestRemoveOrphanFilesProcedure extends SparkExtensionsTestBase { - @Rule - public TemporaryFolder temp = new TemporaryFolder(); + @Rule public TemporaryFolder temp = new TemporaryFolder(); - public TestRemoveOrphanFilesProcedure(String catalogName, String implementation, Map config) { + public TestRemoveOrphanFilesProcedure( + String catalogName, String implementation, Map config) { super(catalogName, implementation, config); } @@ -68,14 +67,11 @@ public void removeTable() { public void testRemoveOrphanFilesInEmptyTable() { sql("CREATE TABLE %s (id bigint NOT NULL, data string) USING iceberg", tableName); - List output = sql( - "CALL %s.system.remove_orphan_files('%s')", - catalogName, tableIdent); + List output = + sql("CALL %s.system.remove_orphan_files('%s')", catalogName, tableIdent); assertEquals("Should be no orphan files", ImmutableList.of(), output); - assertEquals("Should have no rows", - ImmutableList.of(), - sql("SELECT * FROM %s", tableName)); + assertEquals("Should have no rows", ImmutableList.of(), sql("SELECT * FROM %s", tableName)); } @Test @@ -85,7 +81,8 @@ public void testRemoveOrphanFilesInDataFolder() throws IOException { } else { // give a fresh location to Hive tables as Spark will not clean up the table location // correctly while dropping tables through spark_catalog - sql("CREATE TABLE %s (id bigint NOT NULL, data string) USING iceberg LOCATION '%s'", + sql( + "CREATE TABLE %s (id bigint NOT NULL, data string) USING iceberg LOCATION '%s'", tableName, temp.newFolder()); } @@ -107,31 +104,35 @@ public void testRemoveOrphanFilesInDataFolder() throws IOException { Timestamp currentTimestamp = Timestamp.from(Instant.ofEpochMilli(System.currentTimeMillis())); // check for orphans in the metadata folder - List output1 = sql( - "CALL %s.system.remove_orphan_files(" + - "table => '%s'," + - "older_than => TIMESTAMP '%s'," + - "location => '%s')", - catalogName, tableIdent, currentTimestamp, metadataLocation); + List output1 = + sql( + "CALL %s.system.remove_orphan_files(" + + "table => '%s'," + + "older_than => TIMESTAMP '%s'," + + "location => '%s')", + catalogName, tableIdent, currentTimestamp, metadataLocation); assertEquals("Should be no orphan files in the metadata folder", ImmutableList.of(), output1); // check for orphans in the table location - List output2 = sql( - "CALL %s.system.remove_orphan_files(" + - "table => '%s'," + - "older_than => TIMESTAMP '%s')", - catalogName, tableIdent, currentTimestamp); + List output2 = + sql( + "CALL %s.system.remove_orphan_files(" + + "table => '%s'," + + "older_than => TIMESTAMP '%s')", + catalogName, tableIdent, currentTimestamp); Assert.assertEquals("Should be orphan files in the data folder", 1, output2.size()); // the previous call should have deleted all orphan files - List output3 = sql( - "CALL %s.system.remove_orphan_files(" + - "table => '%s'," + - "older_than => TIMESTAMP '%s')", - catalogName, tableIdent, currentTimestamp); + List output3 = + sql( + "CALL %s.system.remove_orphan_files(" + + "table => '%s'," + + 
"older_than => TIMESTAMP '%s')", + catalogName, tableIdent, currentTimestamp); Assert.assertEquals("Should be no more orphan files in the data folder", 0, output3.size()); - assertEquals("Should have expected rows", + assertEquals( + "Should have expected rows", ImmutableList.of(row(1L, "a"), row(2L, "b")), sql("SELECT * FROM %s ORDER BY id", tableName)); } @@ -143,7 +144,8 @@ public void testRemoveOrphanFilesDryRun() throws IOException { } else { // give a fresh location to Hive tables as Spark will not clean up the table location // correctly while dropping tables through spark_catalog - sql("CREATE TABLE %s (id bigint NOT NULL, data string) USING iceberg LOCATION '%s'", + sql( + "CREATE TABLE %s (id bigint NOT NULL, data string) USING iceberg LOCATION '%s'", tableName, temp.newFolder()); } @@ -162,31 +164,35 @@ public void testRemoveOrphanFilesDryRun() throws IOException { Timestamp currentTimestamp = Timestamp.from(Instant.ofEpochMilli(System.currentTimeMillis())); // check for orphans without deleting - List output1 = sql( - "CALL %s.system.remove_orphan_files(" + - "table => '%s'," + - "older_than => TIMESTAMP '%s'," + - "dry_run => true)", - catalogName, tableIdent, currentTimestamp); + List output1 = + sql( + "CALL %s.system.remove_orphan_files(" + + "table => '%s'," + + "older_than => TIMESTAMP '%s'," + + "dry_run => true)", + catalogName, tableIdent, currentTimestamp); Assert.assertEquals("Should be one orphan files", 1, output1.size()); // actually delete orphans - List output2 = sql( - "CALL %s.system.remove_orphan_files(" + - "table => '%s'," + - "older_than => TIMESTAMP '%s')", - catalogName, tableIdent, currentTimestamp); + List output2 = + sql( + "CALL %s.system.remove_orphan_files(" + + "table => '%s'," + + "older_than => TIMESTAMP '%s')", + catalogName, tableIdent, currentTimestamp); Assert.assertEquals("Should be one orphan files", 1, output2.size()); // the previous call should have deleted all orphan files - List output3 = sql( - "CALL %s.system.remove_orphan_files(" + - "table => '%s'," + - "older_than => TIMESTAMP '%s')", - catalogName, tableIdent, currentTimestamp); + List output3 = + sql( + "CALL %s.system.remove_orphan_files(" + + "table => '%s'," + + "older_than => TIMESTAMP '%s')", + catalogName, tableIdent, currentTimestamp); Assert.assertEquals("Should be no more orphan files", 0, output3.size()); - assertEquals("Should have expected rows", + assertEquals( + "Should have expected rows", ImmutableList.of(row(1L, "a"), row(2L, "b")), sql("SELECT * FROM %s ORDER BY id", tableName)); } @@ -197,8 +203,10 @@ public void testRemoveOrphanFilesGCDisabled() { sql("ALTER TABLE %s SET TBLPROPERTIES ('%s' 'false')", tableName, GC_ENABLED); - AssertHelpers.assertThrows("Should reject call", - ValidationException.class, "Cannot delete orphan files: GC is disabled", + AssertHelpers.assertThrows( + "Should reject call", + ValidationException.class, + "Cannot delete orphan files: GC is disabled", () -> sql("CALL %s.system.remove_orphan_files('%s')", catalogName, tableIdent)); // reset the property to enable the table purging in removeTable. 
@@ -214,35 +222,46 @@ public void testRemoveOrphanFilesWap() { sql("INSERT INTO TABLE %s VALUES (1, 'a')", tableName); - assertEquals("Should not see rows from staged snapshot", + assertEquals( + "Should not see rows from staged snapshot", ImmutableList.of(), sql("SELECT * FROM %s", tableName)); - List output = sql( - "CALL %s.system.remove_orphan_files('%s')", catalogName, tableIdent); + List output = + sql("CALL %s.system.remove_orphan_files('%s')", catalogName, tableIdent); assertEquals("Should be no orphan files", ImmutableList.of(), output); } @Test public void testInvalidRemoveOrphanFilesCases() { - AssertHelpers.assertThrows("Should not allow mixed args", - AnalysisException.class, "Named and positional arguments cannot be mixed", + AssertHelpers.assertThrows( + "Should not allow mixed args", + AnalysisException.class, + "Named and positional arguments cannot be mixed", () -> sql("CALL %s.system.remove_orphan_files('n', table => 't')", catalogName)); - AssertHelpers.assertThrows("Should not resolve procedures in arbitrary namespaces", - NoSuchProcedureException.class, "not found", + AssertHelpers.assertThrows( + "Should not resolve procedures in arbitrary namespaces", + NoSuchProcedureException.class, + "not found", () -> sql("CALL %s.custom.remove_orphan_files('n', 't')", catalogName)); - AssertHelpers.assertThrows("Should reject calls without all required args", - AnalysisException.class, "Missing required parameters", + AssertHelpers.assertThrows( + "Should reject calls without all required args", + AnalysisException.class, + "Missing required parameters", () -> sql("CALL %s.system.remove_orphan_files()", catalogName)); - AssertHelpers.assertThrows("Should reject calls with invalid arg types", - AnalysisException.class, "Wrong arg type", + AssertHelpers.assertThrows( + "Should reject calls with invalid arg types", + AnalysisException.class, + "Wrong arg type", () -> sql("CALL %s.system.remove_orphan_files('n', 2.2)", catalogName)); - AssertHelpers.assertThrows("Should reject calls with empty table identifier", - IllegalArgumentException.class, "Cannot handle an empty identifier", + AssertHelpers.assertThrows( + "Should reject calls with empty table identifier", + IllegalArgumentException.class, + "Cannot handle an empty identifier", () -> sql("CALL %s.system.remove_orphan_files('')", catalogName)); } @@ -253,7 +272,8 @@ public void testConcurrentRemoveOrphanFiles() throws IOException { } else { // give a fresh location to Hive tables as Spark will not clean up the table location // correctly while dropping tables through spark_catalog - sql("CREATE TABLE %s (id bigint NOT NULL, data string) USING iceberg LOCATION '%s'", + sql( + "CREATE TABLE %s (id bigint NOT NULL, data string) USING iceberg LOCATION '%s'", tableName, temp.newFolder()); } @@ -278,21 +298,23 @@ public void testConcurrentRemoveOrphanFiles() throws IOException { Timestamp currentTimestamp = Timestamp.from(Instant.ofEpochMilli(System.currentTimeMillis())); // check for orphans in the table location - List output = sql( - "CALL %s.system.remove_orphan_files(" + - "table => '%s'," + - "max_concurrent_deletes => %s," + - "older_than => TIMESTAMP '%s')", - catalogName, tableIdent, 4, currentTimestamp); + List output = + sql( + "CALL %s.system.remove_orphan_files(" + + "table => '%s'," + + "max_concurrent_deletes => %s," + + "older_than => TIMESTAMP '%s')", + catalogName, tableIdent, 4, currentTimestamp); Assert.assertEquals("Should be orphan files in the data folder", 4, output.size()); // the previous call should have 
deleted all orphan files - List output3 = sql( - "CALL %s.system.remove_orphan_files(" + - "table => '%s'," + - "max_concurrent_deletes => %s," + - "older_than => TIMESTAMP '%s')", - catalogName, tableIdent, 4, currentTimestamp); + List output3 = + sql( + "CALL %s.system.remove_orphan_files(" + + "table => '%s'," + + "max_concurrent_deletes => %s," + + "older_than => TIMESTAMP '%s')", + catalogName, tableIdent, 4, currentTimestamp); Assert.assertEquals("Should be no more orphan files in the data folder", 0, output3.size()); assertEquals( @@ -305,16 +327,23 @@ public void testConcurrentRemoveOrphanFiles() throws IOException { public void testConcurrentRemoveOrphanFilesWithInvalidInput() { sql("CREATE TABLE %s (id bigint NOT NULL, data string) USING iceberg", tableName); - AssertHelpers.assertThrows("Should throw an error when max_concurrent_deletes = 0", - IllegalArgumentException.class, "max_concurrent_deletes should have value > 0", - () -> sql("CALL %s.system.remove_orphan_files(table => '%s', max_concurrent_deletes => %s)", - catalogName, tableIdent, 0)); + AssertHelpers.assertThrows( + "Should throw an error when max_concurrent_deletes = 0", + IllegalArgumentException.class, + "max_concurrent_deletes should have value > 0", + () -> + sql( + "CALL %s.system.remove_orphan_files(table => '%s', max_concurrent_deletes => %s)", + catalogName, tableIdent, 0)); - AssertHelpers.assertThrows("Should throw an error when max_concurrent_deletes < 0 ", - IllegalArgumentException.class, "max_concurrent_deletes should have value > 0", - () -> sql( - "CALL %s.system.remove_orphan_files(table => '%s', max_concurrent_deletes => %s)", - catalogName, tableIdent, -1)); + AssertHelpers.assertThrows( + "Should throw an error when max_concurrent_deletes < 0 ", + IllegalArgumentException.class, + "max_concurrent_deletes should have value > 0", + () -> + sql( + "CALL %s.system.remove_orphan_files(table => '%s', max_concurrent_deletes => %s)", + catalogName, tableIdent, -1)); String tempViewName = "file_list_test"; spark.emptyDataFrame().createOrReplaceTempView(tempViewName); @@ -359,34 +388,43 @@ public void testConcurrentRemoveOrphanFilesWithInvalidInput() { @Test public void testRemoveOrphanFilesWithDeleteFiles() throws Exception { - sql("CREATE TABLE %s (id int, data string) USING iceberg TBLPROPERTIES" + - "('format-version'='2', 'write.delete.mode'='merge-on-read')", tableName); - - List records = Lists.newArrayList( - new SimpleRecord(1, "a"), - new SimpleRecord(2, "b"), - new SimpleRecord(3, "c"), - new SimpleRecord(4, "d") - ); - spark.createDataset(records, Encoders.bean(SimpleRecord.class)).coalesce(1).writeTo(tableName).append(); + sql( + "CREATE TABLE %s (id int, data string) USING iceberg TBLPROPERTIES" + + "('format-version'='2', 'write.delete.mode'='merge-on-read')", + tableName); + + List records = + Lists.newArrayList( + new SimpleRecord(1, "a"), + new SimpleRecord(2, "b"), + new SimpleRecord(3, "c"), + new SimpleRecord(4, "d")); + spark + .createDataset(records, Encoders.bean(SimpleRecord.class)) + .coalesce(1) + .writeTo(tableName) + .append(); sql("DELETE FROM %s WHERE id=1", tableName); Table table = Spark3Util.loadIcebergTable(spark, tableName); - Assert.assertEquals("Should have 1 delete manifest", 1, TestHelpers.deleteManifests(table).size()); + Assert.assertEquals( + "Should have 1 delete manifest", 1, TestHelpers.deleteManifests(table).size()); Assert.assertEquals("Should have 1 delete file", 1, TestHelpers.deleteFiles(table).size()); Path deleteManifestPath = new 
Path(TestHelpers.deleteManifests(table).iterator().next().path()); - Path deleteFilePath = new Path(String.valueOf(TestHelpers.deleteFiles(table).iterator().next().path())); + Path deleteFilePath = + new Path(String.valueOf(TestHelpers.deleteFiles(table).iterator().next().path())); // wait to ensure files are old enough waitUntilAfter(System.currentTimeMillis()); Timestamp currentTimestamp = Timestamp.from(Instant.ofEpochMilli(System.currentTimeMillis())); // delete orphans - List output = sql( - "CALL %s.system.remove_orphan_files(" + - "table => '%s'," + - "older_than => TIMESTAMP '%s')", - catalogName, tableIdent, currentTimestamp); + List output = + sql( + "CALL %s.system.remove_orphan_files(" + + "table => '%s'," + + "older_than => TIMESTAMP '%s')", + catalogName, tableIdent, currentTimestamp); Assert.assertEquals("Should be no orphan files", 0, output.size()); FileSystem localFs = FileSystem.getLocal(new Configuration()); @@ -395,9 +433,8 @@ public void testRemoveOrphanFilesWithDeleteFiles() throws Exception { records.remove(new SimpleRecord(1, "a")); Dataset resultDF = spark.read().format("iceberg").load(tableName); - List actualRecords = resultDF - .as(Encoders.bean(SimpleRecord.class)) - .collectAsList(); + List actualRecords = + resultDF.as(Encoders.bean(SimpleRecord.class)).collectAsList(); Assert.assertEquals("Rows must match", records, actualRecords); } } diff --git a/spark/v3.3/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestRequiredDistributionAndOrdering.java b/spark/v3.3/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestRequiredDistributionAndOrdering.java index 1d3ccc766288..cfa7f6622aed 100644 --- a/spark/v3.3/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestRequiredDistributionAndOrdering.java +++ b/spark/v3.3/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestRequiredDistributionAndOrdering.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.spark.extensions; import java.math.BigDecimal; @@ -35,7 +34,8 @@ public class TestRequiredDistributionAndOrdering extends SparkExtensionsTestBase { - public TestRequiredDistributionAndOrdering(String catalogName, String implementation, Map config) { + public TestRequiredDistributionAndOrdering( + String catalogName, String implementation, Map config) { super(catalogName, implementation, config); } @@ -46,45 +46,50 @@ public void dropTestTable() { @Test public void testDefaultLocalSortWithBucketTransforms() throws NoSuchTableException { - sql("CREATE TABLE %s (c1 INT, c2 STRING, c3 STRING) " + - "USING iceberg " + - "PARTITIONED BY (bucket(2, c1))", tableName); - - List data = ImmutableList.of( - new ThreeColumnRecord(1, null, "A"), - new ThreeColumnRecord(2, "BBBBBBBBBB", "B"), - new ThreeColumnRecord(3, "BBBBBBBBBB", "A"), - new ThreeColumnRecord(4, "BBBBBBBBBB", "B"), - new ThreeColumnRecord(5, "BBBBBBBBBB", "A"), - new ThreeColumnRecord(6, "BBBBBBBBBB", "B"), - new ThreeColumnRecord(7, "BBBBBBBBBB", "A") - ); + sql( + "CREATE TABLE %s (c1 INT, c2 STRING, c3 STRING) " + + "USING iceberg " + + "PARTITIONED BY (bucket(2, c1))", + tableName); + + List data = + ImmutableList.of( + new ThreeColumnRecord(1, null, "A"), + new ThreeColumnRecord(2, "BBBBBBBBBB", "B"), + new ThreeColumnRecord(3, "BBBBBBBBBB", "A"), + new ThreeColumnRecord(4, "BBBBBBBBBB", "B"), + new ThreeColumnRecord(5, "BBBBBBBBBB", "A"), + new ThreeColumnRecord(6, "BBBBBBBBBB", "B"), + new ThreeColumnRecord(7, "BBBBBBBBBB", "A")); Dataset ds = spark.createDataFrame(data, ThreeColumnRecord.class); Dataset inputDF = ds.coalesce(1).sortWithinPartitions("c1"); // should insert a local sort by partition columns by default inputDF.writeTo(tableName).append(); - assertEquals("Row count must match", + assertEquals( + "Row count must match", ImmutableList.of(row(7L)), sql("SELECT count(*) FROM %s", tableName)); } @Test public void testPartitionColumnsArePrependedForRangeDistribution() throws NoSuchTableException { - sql("CREATE TABLE %s (c1 INT, c2 STRING, c3 STRING) " + - "USING iceberg " + - "PARTITIONED BY (bucket(2, c1))", tableName); - - List data = ImmutableList.of( - new ThreeColumnRecord(1, null, "A"), - new ThreeColumnRecord(2, "BBBBBBBBBB", "B"), - new ThreeColumnRecord(3, "BBBBBBBBBB", "A"), - new ThreeColumnRecord(4, "BBBBBBBBBB", "B"), - new ThreeColumnRecord(5, "BBBBBBBBBB", "A"), - new ThreeColumnRecord(6, "BBBBBBBBBB", "B"), - new ThreeColumnRecord(7, "BBBBBBBBBB", "A") - ); + sql( + "CREATE TABLE %s (c1 INT, c2 STRING, c3 STRING) " + + "USING iceberg " + + "PARTITIONED BY (bucket(2, c1))", + tableName); + + List data = + ImmutableList.of( + new ThreeColumnRecord(1, null, "A"), + new ThreeColumnRecord(2, "BBBBBBBBBB", "B"), + new ThreeColumnRecord(3, "BBBBBBBBBB", "A"), + new ThreeColumnRecord(4, "BBBBBBBBBB", "B"), + new ThreeColumnRecord(5, "BBBBBBBBBB", "A"), + new ThreeColumnRecord(6, "BBBBBBBBBB", "B"), + new ThreeColumnRecord(7, "BBBBBBBBBB", "A")); Dataset ds = spark.createDataFrame(data, ThreeColumnRecord.class); Dataset inputDF = ds.coalesce(1).sortWithinPartitions("c1"); @@ -93,26 +98,29 @@ public void testPartitionColumnsArePrependedForRangeDistribution() throws NoSuch inputDF.writeTo(tableName).append(); - assertEquals("Row count must match", + assertEquals( + "Row count must match", ImmutableList.of(row(7L)), sql("SELECT count(*) FROM %s", tableName)); } @Test public void testSortOrderIncludesPartitionColumns() throws NoSuchTableException { - sql("CREATE TABLE %s (c1 INT, c2 
STRING, c3 STRING) " + - "USING iceberg " + - "PARTITIONED BY (bucket(2, c1))", tableName); - - List data = ImmutableList.of( - new ThreeColumnRecord(1, null, "A"), - new ThreeColumnRecord(2, "BBBBBBBBBB", "B"), - new ThreeColumnRecord(3, "BBBBBBBBBB", "A"), - new ThreeColumnRecord(4, "BBBBBBBBBB", "B"), - new ThreeColumnRecord(5, "BBBBBBBBBB", "A"), - new ThreeColumnRecord(6, "BBBBBBBBBB", "B"), - new ThreeColumnRecord(7, "BBBBBBBBBB", "A") - ); + sql( + "CREATE TABLE %s (c1 INT, c2 STRING, c3 STRING) " + + "USING iceberg " + + "PARTITIONED BY (bucket(2, c1))", + tableName); + + List data = + ImmutableList.of( + new ThreeColumnRecord(1, null, "A"), + new ThreeColumnRecord(2, "BBBBBBBBBB", "B"), + new ThreeColumnRecord(3, "BBBBBBBBBB", "A"), + new ThreeColumnRecord(4, "BBBBBBBBBB", "B"), + new ThreeColumnRecord(5, "BBBBBBBBBB", "A"), + new ThreeColumnRecord(6, "BBBBBBBBBB", "B"), + new ThreeColumnRecord(7, "BBBBBBBBBB", "A")); Dataset ds = spark.createDataFrame(data, ThreeColumnRecord.class); Dataset inputDF = ds.coalesce(1).sortWithinPartitions("c1"); @@ -121,26 +129,29 @@ public void testSortOrderIncludesPartitionColumns() throws NoSuchTableException inputDF.writeTo(tableName).append(); - assertEquals("Row count must match", + assertEquals( + "Row count must match", ImmutableList.of(row(7L)), sql("SELECT count(*) FROM %s", tableName)); } @Test public void testHashDistributionOnBucketedColumn() throws NoSuchTableException { - sql("CREATE TABLE %s (c1 INT, c2 STRING, c3 STRING) " + - "USING iceberg " + - "PARTITIONED BY (bucket(2, c1))", tableName); - - List data = ImmutableList.of( - new ThreeColumnRecord(1, null, "A"), - new ThreeColumnRecord(2, "BBBBBBBBBB", "B"), - new ThreeColumnRecord(3, "BBBBBBBBBB", "A"), - new ThreeColumnRecord(4, "BBBBBBBBBB", "B"), - new ThreeColumnRecord(5, "BBBBBBBBBB", "A"), - new ThreeColumnRecord(6, "BBBBBBBBBB", "B"), - new ThreeColumnRecord(7, "BBBBBBBBBB", "A") - ); + sql( + "CREATE TABLE %s (c1 INT, c2 STRING, c3 STRING) " + + "USING iceberg " + + "PARTITIONED BY (bucket(2, c1))", + tableName); + + List data = + ImmutableList.of( + new ThreeColumnRecord(1, null, "A"), + new ThreeColumnRecord(2, "BBBBBBBBBB", "B"), + new ThreeColumnRecord(3, "BBBBBBBBBB", "A"), + new ThreeColumnRecord(4, "BBBBBBBBBB", "B"), + new ThreeColumnRecord(5, "BBBBBBBBBB", "A"), + new ThreeColumnRecord(6, "BBBBBBBBBB", "B"), + new ThreeColumnRecord(7, "BBBBBBBBBB", "A")); Dataset ds = spark.createDataFrame(data, ThreeColumnRecord.class); Dataset inputDF = ds.coalesce(1).sortWithinPartitions("c1"); @@ -149,35 +160,41 @@ public void testHashDistributionOnBucketedColumn() throws NoSuchTableException { inputDF.writeTo(tableName).append(); - assertEquals("Row count must match", + assertEquals( + "Row count must match", ImmutableList.of(row(7L)), sql("SELECT count(*) FROM %s", tableName)); } @Test public void testDisabledDistributionAndOrdering() { - sql("CREATE TABLE %s (c1 INT, c2 STRING, c3 STRING) " + - "USING iceberg " + - "PARTITIONED BY (bucket(2, c1))", tableName); - - List data = ImmutableList.of( - new ThreeColumnRecord(1, null, "A"), - new ThreeColumnRecord(2, "BBBBBBBBBB", "B"), - new ThreeColumnRecord(3, "BBBBBBBBBB", "A"), - new ThreeColumnRecord(4, "BBBBBBBBBB", "B"), - new ThreeColumnRecord(5, "BBBBBBBBBB", "A"), - new ThreeColumnRecord(6, "BBBBBBBBBB", "B"), - new ThreeColumnRecord(7, "BBBBBBBBBB", "A") - ); + sql( + "CREATE TABLE %s (c1 INT, c2 STRING, c3 STRING) " + + "USING iceberg " + + "PARTITIONED BY (bucket(2, c1))", + tableName); + + List data = + 
ImmutableList.of( + new ThreeColumnRecord(1, null, "A"), + new ThreeColumnRecord(2, "BBBBBBBBBB", "B"), + new ThreeColumnRecord(3, "BBBBBBBBBB", "A"), + new ThreeColumnRecord(4, "BBBBBBBBBB", "B"), + new ThreeColumnRecord(5, "BBBBBBBBBB", "A"), + new ThreeColumnRecord(6, "BBBBBBBBBB", "B"), + new ThreeColumnRecord(7, "BBBBBBBBBB", "A")); Dataset ds = spark.createDataFrame(data, ThreeColumnRecord.class); Dataset inputDF = ds.coalesce(1).sortWithinPartitions("c1"); // should fail if ordering is disabled - AssertHelpers.assertThrows("Should reject writes without ordering", - SparkException.class, "Writing job aborted", + AssertHelpers.assertThrows( + "Should reject writes without ordering", + SparkException.class, + "Writing job aborted", () -> { try { - inputDF.writeTo(tableName) + inputDF + .writeTo(tableName) .option(SparkWriteOptions.USE_TABLE_DISTRIBUTION_AND_ORDERING, "false") .append(); } catch (NoSuchTableException e) { @@ -188,92 +205,96 @@ public void testDisabledDistributionAndOrdering() { @Test public void testDefaultSortOnDecimalBucketedColumn() { - sql("CREATE TABLE %s (c1 INT, c2 DECIMAL(20, 2)) " + - "USING iceberg " + - "PARTITIONED BY (bucket(2, c2))", tableName); + sql( + "CREATE TABLE %s (c1 INT, c2 DECIMAL(20, 2)) " + + "USING iceberg " + + "PARTITIONED BY (bucket(2, c2))", + tableName); sql("INSERT INTO %s VALUES (1, 20.2), (2, 40.2), (3, 60.2)", tableName); - List expected = ImmutableList.of( - row(1, new BigDecimal("20.20")), - row(2, new BigDecimal("40.20")), - row(3, new BigDecimal("60.20")) - ); + List expected = + ImmutableList.of( + row(1, new BigDecimal("20.20")), + row(2, new BigDecimal("40.20")), + row(3, new BigDecimal("60.20"))); assertEquals("Rows must match", expected, sql("SELECT * FROM %s ORDER BY c1", tableName)); } @Test public void testDefaultSortOnStringBucketedColumn() { - sql("CREATE TABLE %s (c1 INT, c2 STRING) " + - "USING iceberg " + - "PARTITIONED BY (bucket(2, c2))", tableName); + sql( + "CREATE TABLE %s (c1 INT, c2 STRING) " + + "USING iceberg " + + "PARTITIONED BY (bucket(2, c2))", + tableName); sql("INSERT INTO %s VALUES (1, 'A'), (2, 'B')", tableName); - List expected = ImmutableList.of( - row(1, "A"), - row(2, "B") - ); + List expected = ImmutableList.of(row(1, "A"), row(2, "B")); assertEquals("Rows must match", expected, sql("SELECT * FROM %s ORDER BY c1", tableName)); } @Test public void testDefaultSortOnDecimalTruncatedColumn() { - sql("CREATE TABLE %s (c1 INT, c2 DECIMAL(20, 2)) " + - "USING iceberg " + - "PARTITIONED BY (truncate(2, c2))", tableName); + sql( + "CREATE TABLE %s (c1 INT, c2 DECIMAL(20, 2)) " + + "USING iceberg " + + "PARTITIONED BY (truncate(2, c2))", + tableName); sql("INSERT INTO %s VALUES (1, 20.2), (2, 40.2)", tableName); - List expected = ImmutableList.of( - row(1, new BigDecimal("20.20")), - row(2, new BigDecimal("40.20")) - ); + List expected = + ImmutableList.of(row(1, new BigDecimal("20.20")), row(2, new BigDecimal("40.20"))); assertEquals("Rows must match", expected, sql("SELECT * FROM %s ORDER BY c1", tableName)); } @Test public void testDefaultSortOnLongTruncatedColumn() { - sql("CREATE TABLE %s (c1 INT, c2 BIGINT) " + - "USING iceberg " + - "PARTITIONED BY (truncate(2, c2))", tableName); + sql( + "CREATE TABLE %s (c1 INT, c2 BIGINT) " + + "USING iceberg " + + "PARTITIONED BY (truncate(2, c2))", + tableName); sql("INSERT INTO %s VALUES (1, 22222222222222), (2, 444444444444)", tableName); - List expected = ImmutableList.of( - row(1, 22222222222222L), - row(2, 444444444444L) - ); + List expected = 
ImmutableList.of(row(1, 22222222222222L), row(2, 444444444444L)); assertEquals("Rows must match", expected, sql("SELECT * FROM %s ORDER BY c1", tableName)); } @Test public void testRangeDistributionWithQuotedColumnNames() throws NoSuchTableException { - sql("CREATE TABLE %s (`c.1` INT, c2 STRING, c3 STRING) " + - "USING iceberg " + - "PARTITIONED BY (bucket(2, `c.1`))", tableName); - - List data = ImmutableList.of( - new ThreeColumnRecord(1, null, "A"), - new ThreeColumnRecord(2, "BBBBBBBBBB", "B"), - new ThreeColumnRecord(3, "BBBBBBBBBB", "A"), - new ThreeColumnRecord(4, "BBBBBBBBBB", "B"), - new ThreeColumnRecord(5, "BBBBBBBBBB", "A"), - new ThreeColumnRecord(6, "BBBBBBBBBB", "B"), - new ThreeColumnRecord(7, "BBBBBBBBBB", "A") - ); + sql( + "CREATE TABLE %s (`c.1` INT, c2 STRING, c3 STRING) " + + "USING iceberg " + + "PARTITIONED BY (bucket(2, `c.1`))", + tableName); + + List data = + ImmutableList.of( + new ThreeColumnRecord(1, null, "A"), + new ThreeColumnRecord(2, "BBBBBBBBBB", "B"), + new ThreeColumnRecord(3, "BBBBBBBBBB", "A"), + new ThreeColumnRecord(4, "BBBBBBBBBB", "B"), + new ThreeColumnRecord(5, "BBBBBBBBBB", "A"), + new ThreeColumnRecord(6, "BBBBBBBBBB", "B"), + new ThreeColumnRecord(7, "BBBBBBBBBB", "A")); Dataset ds = spark.createDataFrame(data, ThreeColumnRecord.class); - Dataset inputDF = ds.selectExpr("c1 as `c.1`", "c2", "c3").coalesce(1).sortWithinPartitions("`c.1`"); + Dataset inputDF = + ds.selectExpr("c1 as `c.1`", "c2", "c3").coalesce(1).sortWithinPartitions("`c.1`"); sql("ALTER TABLE %s WRITE ORDERED BY `c.1`, c2", tableName); inputDF.writeTo(tableName).append(); - assertEquals("Row count must match", + assertEquals( + "Row count must match", ImmutableList.of(row(7L)), sql("SELECT count(*) FROM %s", tableName)); } diff --git a/spark/v3.3/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestRewriteDataFilesProcedure.java b/spark/v3.3/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestRewriteDataFilesProcedure.java index 9fdb0b1ed5e5..0a09b22f1a4d 100644 --- a/spark/v3.3/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestRewriteDataFilesProcedure.java +++ b/spark/v3.3/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestRewriteDataFilesProcedure.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.spark.extensions; import java.util.List; @@ -41,12 +40,12 @@ import org.junit.Assume; import org.junit.Test; - public class TestRewriteDataFilesProcedure extends SparkExtensionsTestBase { private static final String QUOTED_SPECIAL_CHARS_TABLE_NAME = "`table:with.special:chars`"; - public TestRewriteDataFilesProcedure(String catalogName, String implementation, Map config) { + public TestRewriteDataFilesProcedure( + String catalogName, String implementation, Map config) { super(catalogName, implementation, config); } @@ -58,20 +57,19 @@ public void removeTable() { @Test public void testZOrderSortExpression() { - List order = ExtendedParser.parseSortOrder(spark, "c1, zorder(c2, c3)"); + List order = + ExtendedParser.parseSortOrder(spark, "c1, zorder(c2, c3)"); Assert.assertEquals("Should parse 2 order fields", 2, order.size()); - Assert.assertEquals("First field should be a ref", "c1", ((NamedReference) order.get(0).term()).name()); + Assert.assertEquals( + "First field should be a ref", "c1", ((NamedReference) order.get(0).term()).name()); Assert.assertTrue("Second field should be zorder", order.get(1).term() instanceof Zorder); } @Test public void testRewriteDataFilesInEmptyTable() { createTable(); - List output = sql( - "CALL %s.system.rewrite_data_files('%s')", catalogName, tableIdent); - assertEquals("Procedure output must match", - ImmutableList.of(row(0, 0)), - output); + List output = sql("CALL %s.system.rewrite_data_files('%s')", catalogName, tableIdent); + assertEquals("Procedure output must match", ImmutableList.of(row(0, 0)), output); } @Test @@ -81,10 +79,11 @@ public void testRewriteDataFilesOnPartitionTable() { insertData(10); List expectedRecords = currentData(); - List output = sql( - "CALL %s.system.rewrite_data_files(table => '%s')", catalogName, tableIdent); + List output = + sql("CALL %s.system.rewrite_data_files(table => '%s')", catalogName, tableIdent); - assertEquals("Action should rewrite 10 data files and add 2 data files (one per partition) ", + assertEquals( + "Action should rewrite 10 data files and add 2 data files (one per partition) ", ImmutableList.of(row(10, 2)), output); @@ -99,10 +98,11 @@ public void testRewriteDataFilesOnNonPartitionTable() { insertData(10); List expectedRecords = currentData(); - List output = sql( - "CALL %s.system.rewrite_data_files(table => '%s')", catalogName, tableIdent); + List output = + sql("CALL %s.system.rewrite_data_files(table => '%s')", catalogName, tableIdent); - assertEquals("Action should rewrite 10 data files and add 1 data files", + assertEquals( + "Action should rewrite 10 data files and add 1 data files", ImmutableList.of(row(10, 1)), output); @@ -118,11 +118,13 @@ public void testRewriteDataFilesWithOptions() { List expectedRecords = currentData(); // set the min-input-files = 12, instead of default 5 to skip compacting the files. 
- List output = sql( - "CALL %s.system.rewrite_data_files(table => '%s', options => map('min-input-files','12'))", - catalogName, tableIdent); + List output = + sql( + "CALL %s.system.rewrite_data_files(table => '%s', options => map('min-input-files','12'))", + catalogName, tableIdent); - assertEquals("Action should rewrite 0 data files and add 0 data files", + assertEquals( + "Action should rewrite 0 data files and add 0 data files", ImmutableList.of(row(0, 0)), output); @@ -138,12 +140,14 @@ public void testRewriteDataFilesWithSortStrategy() { List expectedRecords = currentData(); // set sort_order = c1 DESC LAST - List output = sql( - "CALL %s.system.rewrite_data_files(table => '%s', " + - "strategy => 'sort', sort_order => 'c1 DESC NULLS LAST')", - catalogName, tableIdent); - - assertEquals("Action should rewrite 10 data files and add 1 data files", + List output = + sql( + "CALL %s.system.rewrite_data_files(table => '%s', " + + "strategy => 'sort', sort_order => 'c1 DESC NULLS LAST')", + catalogName, tableIdent); + + assertEquals( + "Action should rewrite 10 data files and add 1 data files", ImmutableList.of(row(10, 1)), output); @@ -158,29 +162,32 @@ public void testRewriteDataFilesWithZOrder() { insertData(10); // set z_order = c1,c2 - List output = sql( - "CALL %s.system.rewrite_data_files(table => '%s', " + - "strategy => 'sort', sort_order => 'zorder(c1,c2)')", - catalogName, tableIdent); - - assertEquals("Action should rewrite 10 data files and add 1 data files", + List output = + sql( + "CALL %s.system.rewrite_data_files(table => '%s', " + + "strategy => 'sort', sort_order => 'zorder(c1,c2)')", + catalogName, tableIdent); + + assertEquals( + "Action should rewrite 10 data files and add 1 data files", ImmutableList.of(row(10, 1)), output); // Due to Z_order, the data written will be in the below order. - // As there is only one small output file, we can validate the query ordering (as it will not change). - ImmutableList expectedRows = ImmutableList.of( - row(2, "bar", null), - row(2, "bar", null), - row(2, "bar", null), - row(2, "bar", null), - row(2, "bar", null), - row(1, "foo", null), - row(1, "foo", null), - row(1, "foo", null), - row(1, "foo", null), - row(1, "foo", null) - ); + // As there is only one small output file, we can validate the query ordering (as it will not + // change). 
+ ImmutableList expectedRows = + ImmutableList.of( + row(2, "bar", null), + row(2, "bar", null), + row(2, "bar", null), + row(2, "bar", null), + row(2, "bar", null), + row(1, "foo", null), + row(1, "foo", null), + row(1, "foo", null), + row(1, "foo", null), + row(1, "foo", null)); assertEquals("Should have expected rows", expectedRows, sql("SELECT * FROM %s", tableName)); } @@ -192,11 +199,14 @@ public void testRewriteDataFilesWithFilter() { List expectedRecords = currentData(); // select only 5 files for compaction (files that may have c1 = 1) - List output = sql( - "CALL %s.system.rewrite_data_files(table => '%s'," + - " where => 'c1 = 1 and c2 is not null')", catalogName, tableIdent); - - assertEquals("Action should rewrite 5 data files (containing c1 = 1) and add 1 data files", + List output = + sql( + "CALL %s.system.rewrite_data_files(table => '%s'," + + " where => 'c1 = 1 and c2 is not null')", + catalogName, tableIdent); + + assertEquals( + "Action should rewrite 5 data files (containing c1 = 1) and add 1 data files", ImmutableList.of(row(5, 1)), output); @@ -212,12 +222,14 @@ public void testRewriteDataFilesWithFilterOnPartitionTable() { List expectedRecords = currentData(); // select only 5 files for compaction (files in the partition c2 = 'bar') - List output = sql( - "CALL %s.system.rewrite_data_files(table => '%s'," + - " where => 'c2 = \"bar\"')", catalogName, tableIdent); - - assertEquals("Action should rewrite 5 data files from single matching partition" + - "(containing c2 = bar) and add 1 data files", + List output = + sql( + "CALL %s.system.rewrite_data_files(table => '%s'," + " where => 'c2 = \"bar\"')", + catalogName, tableIdent); + + assertEquals( + "Action should rewrite 5 data files from single matching partition" + + "(containing c2 = bar) and add 1 data files", ImmutableList.of(row(5, 1)), output); @@ -233,12 +245,14 @@ public void testRewriteDataFilesWithInFilterOnPartitionTable() { List expectedRecords = currentData(); // select only 5 files for compaction (files in the partition c2 in ('bar')) - List output = sql( - "CALL %s.system.rewrite_data_files(table => '%s'," + - " where => 'c2 in (\"bar\")')", catalogName, tableIdent); - - assertEquals("Action should rewrite 5 data files from single matching partition" + - "(containing c2 = bar) and add 1 data files", + List output = + sql( + "CALL %s.system.rewrite_data_files(table => '%s'," + " where => 'c2 in (\"bar\")')", + catalogName, tableIdent); + + assertEquals( + "Action should rewrite 5 data files from single matching partition" + + "(containing c2 = bar) and add 1 data files", ImmutableList.of(row(5, 1)), output); @@ -256,43 +270,56 @@ public void testRewriteDataFilesWithAllPossibleFilters() { // So that parsing can be tested on a same dataset without actually compacting the files. 
// EqualTo - sql("CALL %s.system.rewrite_data_files(table => '%s'," + - " where => 'c1 = 3')", catalogName, tableIdent); + sql( + "CALL %s.system.rewrite_data_files(table => '%s'," + " where => 'c1 = 3')", + catalogName, tableIdent); // GreaterThan - sql("CALL %s.system.rewrite_data_files(table => '%s'," + - " where => 'c1 > 3')", catalogName, tableIdent); + sql( + "CALL %s.system.rewrite_data_files(table => '%s'," + " where => 'c1 > 3')", + catalogName, tableIdent); // GreaterThanOrEqual - sql("CALL %s.system.rewrite_data_files(table => '%s'," + - " where => 'c1 >= 3')", catalogName, tableIdent); + sql( + "CALL %s.system.rewrite_data_files(table => '%s'," + " where => 'c1 >= 3')", + catalogName, tableIdent); // LessThan - sql("CALL %s.system.rewrite_data_files(table => '%s'," + - " where => 'c1 < 0')", catalogName, tableIdent); + sql( + "CALL %s.system.rewrite_data_files(table => '%s'," + " where => 'c1 < 0')", + catalogName, tableIdent); // LessThanOrEqual - sql("CALL %s.system.rewrite_data_files(table => '%s'," + - " where => 'c1 <= 0')", catalogName, tableIdent); + sql( + "CALL %s.system.rewrite_data_files(table => '%s'," + " where => 'c1 <= 0')", + catalogName, tableIdent); // In - sql("CALL %s.system.rewrite_data_files(table => '%s'," + - " where => 'c1 in (3,4,5)')", catalogName, tableIdent); + sql( + "CALL %s.system.rewrite_data_files(table => '%s'," + " where => 'c1 in (3,4,5)')", + catalogName, tableIdent); // IsNull - sql("CALL %s.system.rewrite_data_files(table => '%s'," + - " where => 'c1 is null')", catalogName, tableIdent); + sql( + "CALL %s.system.rewrite_data_files(table => '%s'," + " where => 'c1 is null')", + catalogName, tableIdent); // IsNotNull - sql("CALL %s.system.rewrite_data_files(table => '%s'," + - " where => 'c3 is not null')", catalogName, tableIdent); + sql( + "CALL %s.system.rewrite_data_files(table => '%s'," + " where => 'c3 is not null')", + catalogName, tableIdent); // And - sql("CALL %s.system.rewrite_data_files(table => '%s'," + - " where => 'c1 = 3 and c2 = \"bar\"')", catalogName, tableIdent); + sql( + "CALL %s.system.rewrite_data_files(table => '%s'," + " where => 'c1 = 3 and c2 = \"bar\"')", + catalogName, tableIdent); // Or - sql("CALL %s.system.rewrite_data_files(table => '%s'," + - " where => 'c1 = 3 or c1 = 5')", catalogName, tableIdent); + sql( + "CALL %s.system.rewrite_data_files(table => '%s'," + " where => 'c1 = 3 or c1 = 5')", + catalogName, tableIdent); // Not - sql("CALL %s.system.rewrite_data_files(table => '%s'," + - " where => 'c1 not in (1,2)')", catalogName, tableIdent); + sql( + "CALL %s.system.rewrite_data_files(table => '%s'," + " where => 'c1 not in (1,2)')", + catalogName, tableIdent); // StringStartsWith - sql("CALL %s.system.rewrite_data_files(table => '%s'," + - " where => 'c2 like \"%s\"')", catalogName, tableIdent, "car%"); + sql( + "CALL %s.system.rewrite_data_files(table => '%s'," + " where => 'c2 like \"%s\"')", + catalogName, tableIdent, "car%"); - // TODO: Enable when org.apache.iceberg.spark.SparkFilters have implementations for StringEndsWith & StringContains + // TODO: Enable when org.apache.iceberg.spark.SparkFilters have implementations for + // StringEndsWith & StringContains // StringEndsWith // sql("CALL %s.system.rewrite_data_files(table => '%s'," + // " where => 'c2 like \"%s\"')", catalogName, tableIdent, "%car"); @@ -308,77 +335,125 @@ public void testRewriteDataFilesWithInvalidInputs() { insertData(2); // Test for invalid strategy - AssertHelpers.assertThrows("Should reject calls with unsupported strategy 
error message", - IllegalArgumentException.class, "unsupported strategy: temp. Only binpack or sort is supported", - () -> sql("CALL %s.system.rewrite_data_files(table => '%s', options => map('min-input-files','2'), " + - "strategy => 'temp')", catalogName, tableIdent)); + AssertHelpers.assertThrows( + "Should reject calls with unsupported strategy error message", + IllegalArgumentException.class, + "unsupported strategy: temp. Only binpack or sort is supported", + () -> + sql( + "CALL %s.system.rewrite_data_files(table => '%s', options => map('min-input-files','2'), " + + "strategy => 'temp')", + catalogName, tableIdent)); // Test for sort_order with binpack strategy - AssertHelpers.assertThrows("Should reject calls with error message", - IllegalArgumentException.class, "Cannot set strategy to sort, it has already been set", - () -> sql("CALL %s.system.rewrite_data_files(table => '%s', strategy => 'binpack', " + - "sort_order => 'c1 ASC NULLS FIRST')", catalogName, tableIdent)); + AssertHelpers.assertThrows( + "Should reject calls with error message", + IllegalArgumentException.class, + "Cannot set strategy to sort, it has already been set", + () -> + sql( + "CALL %s.system.rewrite_data_files(table => '%s', strategy => 'binpack', " + + "sort_order => 'c1 ASC NULLS FIRST')", + catalogName, tableIdent)); // Test for sort_order with invalid null order - AssertHelpers.assertThrows("Should reject calls with error message", - IllegalArgumentException.class, "Unable to parse sortOrder:", - () -> sql("CALL %s.system.rewrite_data_files(table => '%s', strategy => 'sort', " + - "sort_order => 'c1 ASC none')", catalogName, tableIdent)); + AssertHelpers.assertThrows( + "Should reject calls with error message", + IllegalArgumentException.class, + "Unable to parse sortOrder:", + () -> + sql( + "CALL %s.system.rewrite_data_files(table => '%s', strategy => 'sort', " + + "sort_order => 'c1 ASC none')", + catalogName, tableIdent)); // Test for sort_order with invalid sort direction - AssertHelpers.assertThrows("Should reject calls with error message", - IllegalArgumentException.class, "Unable to parse sortOrder:", - () -> sql("CALL %s.system.rewrite_data_files(table => '%s', strategy => 'sort', " + - "sort_order => 'c1 none NULLS FIRST')", catalogName, tableIdent)); + AssertHelpers.assertThrows( + "Should reject calls with error message", + IllegalArgumentException.class, + "Unable to parse sortOrder:", + () -> + sql( + "CALL %s.system.rewrite_data_files(table => '%s', strategy => 'sort', " + + "sort_order => 'c1 none NULLS FIRST')", + catalogName, tableIdent)); // Test for sort_order with invalid column name - AssertHelpers.assertThrows("Should reject calls with error message", - ValidationException.class, "Cannot find field 'col1' in struct:" + - " struct<1: c1: optional int, 2: c2: optional string, 3: c3: optional string>", - () -> sql("CALL %s.system.rewrite_data_files(table => '%s', strategy => 'sort', " + - "sort_order => 'col1 DESC NULLS FIRST')", catalogName, tableIdent)); + AssertHelpers.assertThrows( + "Should reject calls with error message", + ValidationException.class, + "Cannot find field 'col1' in struct:" + + " struct<1: c1: optional int, 2: c2: optional string, 3: c3: optional string>", + () -> + sql( + "CALL %s.system.rewrite_data_files(table => '%s', strategy => 'sort', " + + "sort_order => 'col1 DESC NULLS FIRST')", + catalogName, tableIdent)); // Test for sort_order with invalid filter column col1 - AssertHelpers.assertThrows("Should reject calls with error message", - 
IllegalArgumentException.class, "Cannot parse predicates in where option: col1 = 3", - () -> sql("CALL %s.system.rewrite_data_files(table => '%s', " + - "where => 'col1 = 3')", catalogName, tableIdent)); + AssertHelpers.assertThrows( + "Should reject calls with error message", + IllegalArgumentException.class, + "Cannot parse predicates in where option: col1 = 3", + () -> + sql( + "CALL %s.system.rewrite_data_files(table => '%s', " + "where => 'col1 = 3')", + catalogName, tableIdent)); // Test for z_order with invalid column name - AssertHelpers.assertThrows("Should reject calls with error message", - IllegalArgumentException.class, "Cannot find column 'col1' in table schema: " + - "struct<1: c1: optional int, 2: c2: optional string, 3: c3: optional string>", - () -> sql("CALL %s.system.rewrite_data_files(table => '%s', strategy => 'sort', " + - "sort_order => 'zorder(col1)')", catalogName, tableIdent)); + AssertHelpers.assertThrows( + "Should reject calls with error message", + IllegalArgumentException.class, + "Cannot find column 'col1' in table schema: " + + "struct<1: c1: optional int, 2: c2: optional string, 3: c3: optional string>", + () -> + sql( + "CALL %s.system.rewrite_data_files(table => '%s', strategy => 'sort', " + + "sort_order => 'zorder(col1)')", + catalogName, tableIdent)); // Test for z_order with sort_order - AssertHelpers.assertThrows("Should reject calls with error message", - IllegalArgumentException.class, "Cannot mix identity sort columns and a Zorder sort expression:" + - " c1,zorder(c2,c3)", - () -> sql("CALL %s.system.rewrite_data_files(table => '%s', strategy => 'sort', " + - "sort_order => 'c1,zorder(c2,c3)')", catalogName, tableIdent)); + AssertHelpers.assertThrows( + "Should reject calls with error message", + IllegalArgumentException.class, + "Cannot mix identity sort columns and a Zorder sort expression:" + " c1,zorder(c2,c3)", + () -> + sql( + "CALL %s.system.rewrite_data_files(table => '%s', strategy => 'sort', " + + "sort_order => 'c1,zorder(c2,c3)')", + catalogName, tableIdent)); } @Test public void testInvalidCasesForRewriteDataFiles() { - AssertHelpers.assertThrows("Should not allow mixed args", - AnalysisException.class, "Named and positional arguments cannot be mixed", + AssertHelpers.assertThrows( + "Should not allow mixed args", + AnalysisException.class, + "Named and positional arguments cannot be mixed", () -> sql("CALL %s.system.rewrite_data_files('n', table => 't')", catalogName)); - AssertHelpers.assertThrows("Should not resolve procedures in arbitrary namespaces", - NoSuchProcedureException.class, "not found", + AssertHelpers.assertThrows( + "Should not resolve procedures in arbitrary namespaces", + NoSuchProcedureException.class, + "not found", () -> sql("CALL %s.custom.rewrite_data_files('n', 't')", catalogName)); - AssertHelpers.assertThrows("Should reject calls without all required args", - AnalysisException.class, "Missing required parameters", + AssertHelpers.assertThrows( + "Should reject calls without all required args", + AnalysisException.class, + "Missing required parameters", () -> sql("CALL %s.system.rewrite_data_files()", catalogName)); - AssertHelpers.assertThrows("Should reject duplicate arg names name", - AnalysisException.class, "Duplicate procedure argument: table", + AssertHelpers.assertThrows( + "Should reject duplicate arg names name", + AnalysisException.class, + "Duplicate procedure argument: table", () -> sql("CALL %s.system.rewrite_data_files(table => 't', table => 't')", catalogName)); - 
AssertHelpers.assertThrows("Should reject calls with empty table identifier", - IllegalArgumentException.class, "Cannot handle an empty identifier", + AssertHelpers.assertThrows( + "Should reject calls with empty table identifier", + IllegalArgumentException.class, + "Cannot handle an empty identifier", () -> sql("CALL %s.system.rewrite_data_files('')", catalogName)); } @@ -386,17 +461,21 @@ public void testInvalidCasesForRewriteDataFiles() { public void testBinPackTableWithSpecialChars() { Assume.assumeTrue(catalogName.equals(SparkCatalogConfig.HADOOP.catalogName())); - sql("CREATE TABLE %s (c1 int, c2 string, c3 string) USING iceberg", tableName(QUOTED_SPECIAL_CHARS_TABLE_NAME)); + sql( + "CREATE TABLE %s (c1 int, c2 string, c3 string) USING iceberg", + tableName(QUOTED_SPECIAL_CHARS_TABLE_NAME)); insertData(tableName(QUOTED_SPECIAL_CHARS_TABLE_NAME), 10); List expectedRecords = currentData(tableName(QUOTED_SPECIAL_CHARS_TABLE_NAME)); - List output = sql( - "CALL %s.system.rewrite_data_files(table => '%s', where => 'c2 is not null')", - catalogName, tableName(QUOTED_SPECIAL_CHARS_TABLE_NAME)); + List output = + sql( + "CALL %s.system.rewrite_data_files(table => '%s', where => 'c2 is not null')", + catalogName, tableName(QUOTED_SPECIAL_CHARS_TABLE_NAME)); - assertEquals("Action should rewrite 10 data files and add 1 data file", + assertEquals( + "Action should rewrite 10 data files and add 1 data file", ImmutableList.of(row(10, 1)), output); @@ -410,21 +489,25 @@ public void testBinPackTableWithSpecialChars() { public void testSortTableWithSpecialChars() { Assume.assumeTrue(catalogName.equals(SparkCatalogConfig.HADOOP.catalogName())); - sql("CREATE TABLE %s (c1 int, c2 string, c3 string) USING iceberg", tableName(QUOTED_SPECIAL_CHARS_TABLE_NAME)); + sql( + "CREATE TABLE %s (c1 int, c2 string, c3 string) USING iceberg", + tableName(QUOTED_SPECIAL_CHARS_TABLE_NAME)); insertData(tableName(QUOTED_SPECIAL_CHARS_TABLE_NAME), 10); List expectedRecords = currentData(tableName(QUOTED_SPECIAL_CHARS_TABLE_NAME)); - List output = sql( - "CALL %s.system.rewrite_data_files(" + - " table => '%s'," + - " strategy => 'sort'," + - " sort_order => 'c1'," + - " where => 'c2 is not null')", - catalogName, tableName(QUOTED_SPECIAL_CHARS_TABLE_NAME)); - - assertEquals("Action should rewrite 10 data files and add 1 data file", + List output = + sql( + "CALL %s.system.rewrite_data_files(" + + " table => '%s'," + + " strategy => 'sort'," + + " sort_order => 'c1'," + + " where => 'c2 is not null')", + catalogName, tableName(QUOTED_SPECIAL_CHARS_TABLE_NAME)); + + assertEquals( + "Action should rewrite 10 data files and add 1 data file", ImmutableList.of(row(10, 1)), output); @@ -438,21 +521,25 @@ public void testSortTableWithSpecialChars() { public void testZOrderTableWithSpecialChars() { Assume.assumeTrue(catalogName.equals(SparkCatalogConfig.HADOOP.catalogName())); - sql("CREATE TABLE %s (c1 int, c2 string, c3 string) USING iceberg", tableName(QUOTED_SPECIAL_CHARS_TABLE_NAME)); + sql( + "CREATE TABLE %s (c1 int, c2 string, c3 string) USING iceberg", + tableName(QUOTED_SPECIAL_CHARS_TABLE_NAME)); insertData(tableName(QUOTED_SPECIAL_CHARS_TABLE_NAME), 10); List expectedRecords = currentData(tableName(QUOTED_SPECIAL_CHARS_TABLE_NAME)); - List output = sql( - "CALL %s.system.rewrite_data_files(" + - " table => '%s'," + - " strategy => 'sort'," + - " sort_order => 'zorder(c1, c2)'," + - " where => 'c2 is not null')", - catalogName, tableName(QUOTED_SPECIAL_CHARS_TABLE_NAME)); - - assertEquals("Action should rewrite 10 
data files and add 1 data file", + List output = + sql( + "CALL %s.system.rewrite_data_files(" + + " table => '%s'," + + " strategy => 'sort'," + + " sort_order => 'zorder(c1, c2)'," + + " where => 'c2 is not null')", + catalogName, tableName(QUOTED_SPECIAL_CHARS_TABLE_NAME)); + + assertEquals( + "Action should rewrite 10 data files and add 1 data file", ImmutableList.of(row(10, 1)), output); @@ -467,7 +554,9 @@ private void createTable() { } private void createPartitionTable() { - sql("CREATE TABLE %s (c1 int, c2 string, c3 string) USING iceberg PARTITIONED BY (c2)", tableName); + sql( + "CREATE TABLE %s (c1 int, c2 string, c3 string) USING iceberg PARTITIONED BY (c2)", + tableName); } private void insertData(int filesCount) { @@ -479,12 +568,15 @@ private void insertData(String table, int filesCount) { ThreeColumnRecord record2 = new ThreeColumnRecord(2, "bar", null); List records = Lists.newArrayList(); - IntStream.range(0, filesCount / 2).forEach(i -> { - records.add(record1); - records.add(record2); - }); - - Dataset df = spark.createDataFrame(records, ThreeColumnRecord.class).repartition(filesCount); + IntStream.range(0, filesCount / 2) + .forEach( + i -> { + records.add(record1); + records.add(record2); + }); + + Dataset df = + spark.createDataFrame(records, ThreeColumnRecord.class).repartition(filesCount); try { df.writeTo(table).append(); } catch (org.apache.spark.sql.catalyst.analysis.NoSuchTableException e) { diff --git a/spark/v3.3/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestRewriteManifestsProcedure.java b/spark/v3.3/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestRewriteManifestsProcedure.java index dcf0a2d91e3e..7c5ec1f5cf3f 100644 --- a/spark/v3.3/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestRewriteManifestsProcedure.java +++ b/spark/v3.3/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestRewriteManifestsProcedure.java @@ -16,9 +16,10 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.spark.extensions; +import static org.apache.iceberg.TableProperties.SNAPSHOT_ID_INHERITANCE_ENABLED; + import java.util.List; import java.util.Map; import org.apache.iceberg.AssertHelpers; @@ -30,11 +31,10 @@ import org.junit.Assert; import org.junit.Test; -import static org.apache.iceberg.TableProperties.SNAPSHOT_ID_INHERITANCE_ENABLED; - public class TestRewriteManifestsProcedure extends SparkExtensionsTestBase { - public TestRewriteManifestsProcedure(String catalogName, String implementation, Map config) { + public TestRewriteManifestsProcedure( + String catalogName, String implementation, Map config) { super(catalogName, implementation, config); } @@ -46,40 +46,42 @@ public void removeTable() { @Test public void testRewriteManifestsInEmptyTable() { sql("CREATE TABLE %s (id bigint NOT NULL, data string) USING iceberg", tableName); - List output = sql( - "CALL %s.system.rewrite_manifests('%s')", catalogName, tableIdent); - assertEquals("Procedure output must match", - ImmutableList.of(row(0, 0)), - output); + List output = sql("CALL %s.system.rewrite_manifests('%s')", catalogName, tableIdent); + assertEquals("Procedure output must match", ImmutableList.of(row(0, 0)), output); } @Test public void testRewriteLargeManifests() { - sql("CREATE TABLE %s (id bigint NOT NULL, data string) USING iceberg PARTITIONED BY (data)", tableName); + sql( + "CREATE TABLE %s (id bigint NOT NULL, data string) USING iceberg PARTITIONED BY (data)", + tableName); sql("INSERT INTO TABLE %s VALUES (1, 'a'), (2, 'b'), (3, 'c'), (4, 'd')", tableName); Table table = validationCatalog.loadTable(tableIdent); - Assert.assertEquals("Must have 1 manifest", 1, table.currentSnapshot().allManifests(table.io()).size()); + Assert.assertEquals( + "Must have 1 manifest", 1, table.currentSnapshot().allManifests(table.io()).size()); sql("ALTER TABLE %s SET TBLPROPERTIES ('commit.manifest.target-size-bytes' '1')", tableName); - List output = sql( - "CALL %s.system.rewrite_manifests('%s')", catalogName, tableIdent); - assertEquals("Procedure output must match", - ImmutableList.of(row(1, 4)), - output); + List output = sql("CALL %s.system.rewrite_manifests('%s')", catalogName, tableIdent); + assertEquals("Procedure output must match", ImmutableList.of(row(1, 4)), output); table.refresh(); - Assert.assertEquals("Must have 4 manifests", 4, table.currentSnapshot().allManifests(table.io()).size()); + Assert.assertEquals( + "Must have 4 manifests", 4, table.currentSnapshot().allManifests(table.io()).size()); } @Test public void testRewriteSmallManifestsWithSnapshotIdInheritance() { - sql("CREATE TABLE %s (id bigint NOT NULL, data string) USING iceberg PARTITIONED BY (data)", tableName); + sql( + "CREATE TABLE %s (id bigint NOT NULL, data string) USING iceberg PARTITIONED BY (data)", + tableName); - sql("ALTER TABLE %s SET TBLPROPERTIES ('%s' '%s')", tableName, SNAPSHOT_ID_INHERITANCE_ENABLED, "true"); + sql( + "ALTER TABLE %s SET TBLPROPERTIES ('%s' '%s')", + tableName, SNAPSHOT_ID_INHERITANCE_ENABLED, "true"); sql("INSERT INTO TABLE %s VALUES (1, 'a')", tableName); sql("INSERT INTO TABLE %s VALUES (2, 'b')", tableName); @@ -88,87 +90,107 @@ public void testRewriteSmallManifestsWithSnapshotIdInheritance() { Table table = validationCatalog.loadTable(tableIdent); - Assert.assertEquals("Must have 4 manifest", 4, table.currentSnapshot().allManifests(table.io()).size()); + Assert.assertEquals( + "Must have 4 manifest", 4, table.currentSnapshot().allManifests(table.io()).size()); - List output = sql( - "CALL 
%s.system.rewrite_manifests(table => '%s')", catalogName, tableIdent); - assertEquals("Procedure output must match", - ImmutableList.of(row(4, 1)), - output); + List output = + sql("CALL %s.system.rewrite_manifests(table => '%s')", catalogName, tableIdent); + assertEquals("Procedure output must match", ImmutableList.of(row(4, 1)), output); table.refresh(); - Assert.assertEquals("Must have 1 manifests", 1, table.currentSnapshot().allManifests(table.io()).size()); + Assert.assertEquals( + "Must have 1 manifests", 1, table.currentSnapshot().allManifests(table.io()).size()); } @Test public void testRewriteSmallManifestsWithoutCaching() { - sql("CREATE TABLE %s (id bigint NOT NULL, data string) USING iceberg PARTITIONED BY (data)", tableName); + sql( + "CREATE TABLE %s (id bigint NOT NULL, data string) USING iceberg PARTITIONED BY (data)", + tableName); sql("INSERT INTO TABLE %s VALUES (1, 'a')", tableName); sql("INSERT INTO TABLE %s VALUES (2, 'b')", tableName); Table table = validationCatalog.loadTable(tableIdent); - Assert.assertEquals("Must have 2 manifest", 2, table.currentSnapshot().allManifests(table.io()).size()); + Assert.assertEquals( + "Must have 2 manifest", 2, table.currentSnapshot().allManifests(table.io()).size()); - List output = sql( - "CALL %s.system.rewrite_manifests(use_caching => false, table => '%s')", catalogName, tableIdent); - assertEquals("Procedure output must match", - ImmutableList.of(row(2, 1)), - output); + List output = + sql( + "CALL %s.system.rewrite_manifests(use_caching => false, table => '%s')", + catalogName, tableIdent); + assertEquals("Procedure output must match", ImmutableList.of(row(2, 1)), output); table.refresh(); - Assert.assertEquals("Must have 1 manifests", 1, table.currentSnapshot().allManifests(table.io()).size()); + Assert.assertEquals( + "Must have 1 manifests", 1, table.currentSnapshot().allManifests(table.io()).size()); } @Test public void testRewriteManifestsCaseInsensitiveArgs() { - sql("CREATE TABLE %s (id bigint NOT NULL, data string) USING iceberg PARTITIONED BY (data)", tableName); + sql( + "CREATE TABLE %s (id bigint NOT NULL, data string) USING iceberg PARTITIONED BY (data)", + tableName); sql("INSERT INTO TABLE %s VALUES (1, 'a')", tableName); sql("INSERT INTO TABLE %s VALUES (2, 'b')", tableName); Table table = validationCatalog.loadTable(tableIdent); - Assert.assertEquals("Must have 2 manifest", 2, table.currentSnapshot().allManifests(table.io()).size()); + Assert.assertEquals( + "Must have 2 manifest", 2, table.currentSnapshot().allManifests(table.io()).size()); - List output = sql( - "CALL %s.system.rewrite_manifests(usE_cAcHiNg => false, tAbLe => '%s')", catalogName, tableIdent); - assertEquals("Procedure output must match", - ImmutableList.of(row(2, 1)), - output); + List output = + sql( + "CALL %s.system.rewrite_manifests(usE_cAcHiNg => false, tAbLe => '%s')", + catalogName, tableIdent); + assertEquals("Procedure output must match", ImmutableList.of(row(2, 1)), output); table.refresh(); - Assert.assertEquals("Must have 1 manifests", 1, table.currentSnapshot().allManifests(table.io()).size()); + Assert.assertEquals( + "Must have 1 manifests", 1, table.currentSnapshot().allManifests(table.io()).size()); } @Test public void testInvalidRewriteManifestsCases() { - AssertHelpers.assertThrows("Should not allow mixed args", - AnalysisException.class, "Named and positional arguments cannot be mixed", + AssertHelpers.assertThrows( + "Should not allow mixed args", + AnalysisException.class, + "Named and positional arguments cannot be 
mixed", () -> sql("CALL %s.system.rewrite_manifests('n', table => 't')", catalogName)); - AssertHelpers.assertThrows("Should not resolve procedures in arbitrary namespaces", - NoSuchProcedureException.class, "not found", + AssertHelpers.assertThrows( + "Should not resolve procedures in arbitrary namespaces", + NoSuchProcedureException.class, + "not found", () -> sql("CALL %s.custom.rewrite_manifests('n', 't')", catalogName)); - AssertHelpers.assertThrows("Should reject calls without all required args", - AnalysisException.class, "Missing required parameters", + AssertHelpers.assertThrows( + "Should reject calls without all required args", + AnalysisException.class, + "Missing required parameters", () -> sql("CALL %s.system.rewrite_manifests()", catalogName)); - AssertHelpers.assertThrows("Should reject calls with invalid arg types", - AnalysisException.class, "Wrong arg type", + AssertHelpers.assertThrows( + "Should reject calls with invalid arg types", + AnalysisException.class, + "Wrong arg type", () -> sql("CALL %s.system.rewrite_manifests('n', 2.2)", catalogName)); - AssertHelpers.assertThrows("Should reject duplicate arg names name", - AnalysisException.class, "Duplicate procedure argument: table", + AssertHelpers.assertThrows( + "Should reject duplicate arg names name", + AnalysisException.class, + "Duplicate procedure argument: table", () -> sql("CALL %s.system.rewrite_manifests(table => 't', tAbLe => 't')", catalogName)); - AssertHelpers.assertThrows("Should reject calls with empty table identifier", - IllegalArgumentException.class, "Cannot handle an empty identifier", + AssertHelpers.assertThrows( + "Should reject calls with empty table identifier", + IllegalArgumentException.class, + "Cannot handle an empty identifier", () -> sql("CALL %s.system.rewrite_manifests('')", catalogName)); } } diff --git a/spark/v3.3/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestRollbackToSnapshotProcedure.java b/spark/v3.3/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestRollbackToSnapshotProcedure.java index d3e6bdcbc285..af94b456d02e 100644 --- a/spark/v3.3/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestRollbackToSnapshotProcedure.java +++ b/spark/v3.3/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestRollbackToSnapshotProcedure.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.spark.extensions; import java.util.List; @@ -36,7 +35,8 @@ public class TestRollbackToSnapshotProcedure extends SparkExtensionsTestBase { - public TestRollbackToSnapshotProcedure(String catalogName, String implementation, Map config) { + public TestRollbackToSnapshotProcedure( + String catalogName, String implementation, Map config) { super(catalogName, implementation, config); } @@ -55,7 +55,8 @@ public void testRollbackToSnapshotUsingPositionalArgs() { sql("INSERT INTO TABLE %s VALUES (1, 'a')", tableName); - assertEquals("Should have expected rows", + assertEquals( + "Should have expected rows", ImmutableList.of(row(1L, "a"), row(1L, "a")), sql("SELECT * FROM %s ORDER BY id", tableName)); @@ -63,15 +64,18 @@ public void testRollbackToSnapshotUsingPositionalArgs() { Snapshot secondSnapshot = table.currentSnapshot(); - List output = sql( - "CALL %s.system.rollback_to_snapshot('%s', %dL)", - catalogName, tableIdent, firstSnapshot.snapshotId()); + List output = + sql( + "CALL %s.system.rollback_to_snapshot('%s', %dL)", + catalogName, tableIdent, firstSnapshot.snapshotId()); - assertEquals("Procedure output must match", + assertEquals( + "Procedure output must match", ImmutableList.of(row(secondSnapshot.snapshotId(), firstSnapshot.snapshotId())), output); - assertEquals("Rollback must be successful", + assertEquals( + "Rollback must be successful", ImmutableList.of(row(1L, "a")), sql("SELECT * FROM %s ORDER BY id", tableName)); } @@ -86,7 +90,8 @@ public void testRollbackToSnapshotUsingNamedArgs() { sql("INSERT INTO TABLE %s VALUES (1, 'a')", tableName); - assertEquals("Should have expected rows", + assertEquals( + "Should have expected rows", ImmutableList.of(row(1L, "a"), row(1L, "a")), sql("SELECT * FROM %s ORDER BY id", tableName)); @@ -94,15 +99,18 @@ public void testRollbackToSnapshotUsingNamedArgs() { Snapshot secondSnapshot = table.currentSnapshot(); - List output = sql( - "CALL %s.system.rollback_to_snapshot(snapshot_id => %dL, table => '%s')", - catalogName, firstSnapshot.snapshotId(), tableIdent); + List output = + sql( + "CALL %s.system.rollback_to_snapshot(snapshot_id => %dL, table => '%s')", + catalogName, firstSnapshot.snapshotId(), tableIdent); - assertEquals("Procedure output must match", + assertEquals( + "Procedure output must match", ImmutableList.of(row(secondSnapshot.snapshotId(), firstSnapshot.snapshotId())), output); - assertEquals("Rollback must be successful", + assertEquals( + "Rollback must be successful", ImmutableList.of(row(1L, "a")), sql("SELECT * FROM %s ORDER BY id", tableName)); } @@ -126,21 +134,23 @@ public void testRollbackToSnapshotRefreshesRelationCache() { spark.sql("CACHE TABLE tmp"); - assertEquals("View should have expected rows", + assertEquals( + "View should have expected rows", ImmutableList.of(row(1L, "a"), row(1L, "a")), sql("SELECT * FROM tmp")); - List output = sql( - "CALL %s.system.rollback_to_snapshot(table => '%s', snapshot_id => %dL)", - catalogName, tableIdent, firstSnapshot.snapshotId()); + List output = + sql( + "CALL %s.system.rollback_to_snapshot(table => '%s', snapshot_id => %dL)", + catalogName, tableIdent, firstSnapshot.snapshotId()); - assertEquals("Procedure output must match", + assertEquals( + "Procedure output must match", ImmutableList.of(row(secondSnapshot.snapshotId(), firstSnapshot.snapshotId())), output); - assertEquals("View cache must be invalidated", - ImmutableList.of(row(1L, "a")), - sql("SELECT * FROM tmp")); + assertEquals( + "View cache must be invalidated", 
ImmutableList.of(row(1L, "a")), sql("SELECT * FROM tmp")); sql("UNCACHE TABLE tmp"); } @@ -155,7 +165,8 @@ public void testRollbackToSnapshotWithQuotedIdentifiers() { sql("INSERT INTO TABLE %s VALUES (1, 'a')", tableName); - assertEquals("Should have expected rows", + assertEquals( + "Should have expected rows", ImmutableList.of(row(1L, "a"), row(1L, "a")), sql("SELECT * FROM %s ORDER BY id", tableName)); @@ -171,15 +182,20 @@ public void testRollbackToSnapshotWithQuotedIdentifiers() { } String quotedNamespace = quotedNamespaceBuilder.toString(); - List output = sql( - "CALL %s.system.rollback_to_snapshot('%s', %d)", - catalogName, quotedNamespace + ".`" + tableIdent.name() + "`", firstSnapshot.snapshotId()); + List output = + sql( + "CALL %s.system.rollback_to_snapshot('%s', %d)", + catalogName, + quotedNamespace + ".`" + tableIdent.name() + "`", + firstSnapshot.snapshotId()); - assertEquals("Procedure output must match", + assertEquals( + "Procedure output must match", ImmutableList.of(row(secondSnapshot.snapshotId(), firstSnapshot.snapshotId())), output); - assertEquals("Rollback must be successful", + assertEquals( + "Rollback must be successful", ImmutableList.of(row(1L, "a")), sql("SELECT * FROM %s ORDER BY id", tableName)); } @@ -196,7 +212,8 @@ public void testRollbackToSnapshotWithoutExplicitCatalog() { sql("INSERT INTO TABLE %s VALUES (1, 'a')", tableName); - assertEquals("Should have expected rows", + assertEquals( + "Should have expected rows", ImmutableList.of(row(1L, "a"), row(1L, "a")), sql("SELECT * FROM %s ORDER BY id", tableName)); @@ -205,15 +222,16 @@ public void testRollbackToSnapshotWithoutExplicitCatalog() { Snapshot secondSnapshot = table.currentSnapshot(); // use camel case intentionally to test case sensitivity - List output = sql( - "CALL SyStEm.rOLlBaCk_to_SnApShOt('%s', %dL)", - tableIdent, firstSnapshot.snapshotId()); + List output = + sql("CALL SyStEm.rOLlBaCk_to_SnApShOt('%s', %dL)", tableIdent, firstSnapshot.snapshotId()); - assertEquals("Procedure output must match", + assertEquals( + "Procedure output must match", ImmutableList.of(row(secondSnapshot.snapshotId(), firstSnapshot.snapshotId())), output); - assertEquals("Rollback must be successful", + assertEquals( + "Rollback must be successful", ImmutableList.of(row(1L, "a")), sql("SELECT * FROM %s ORDER BY id", tableName)); } @@ -222,39 +240,58 @@ public void testRollbackToSnapshotWithoutExplicitCatalog() { public void testRollbackToInvalidSnapshot() { sql("CREATE TABLE %s (id bigint NOT NULL, data string) USING iceberg", tableName); - AssertHelpers.assertThrows("Should reject invalid snapshot id", - ValidationException.class, "Cannot roll back to unknown snapshot id", + AssertHelpers.assertThrows( + "Should reject invalid snapshot id", + ValidationException.class, + "Cannot roll back to unknown snapshot id", () -> sql("CALL %s.system.rollback_to_snapshot('%s', -1L)", catalogName, tableIdent)); } @Test public void testInvalidRollbackToSnapshotCases() { - AssertHelpers.assertThrows("Should not allow mixed args", - AnalysisException.class, "Named and positional arguments cannot be mixed", - () -> sql("CALL %s.system.rollback_to_snapshot(namespace => 'n1', table => 't', 1L)", catalogName)); - - AssertHelpers.assertThrows("Should not resolve procedures in arbitrary namespaces", - NoSuchProcedureException.class, "not found", + AssertHelpers.assertThrows( + "Should not allow mixed args", + AnalysisException.class, + "Named and positional arguments cannot be mixed", + () -> + sql( + "CALL 
%s.system.rollback_to_snapshot(namespace => 'n1', table => 't', 1L)", + catalogName)); + + AssertHelpers.assertThrows( + "Should not resolve procedures in arbitrary namespaces", + NoSuchProcedureException.class, + "not found", () -> sql("CALL %s.custom.rollback_to_snapshot('n', 't', 1L)", catalogName)); - AssertHelpers.assertThrows("Should reject calls without all required args", - AnalysisException.class, "Missing required parameters", + AssertHelpers.assertThrows( + "Should reject calls without all required args", + AnalysisException.class, + "Missing required parameters", () -> sql("CALL %s.system.rollback_to_snapshot('t')", catalogName)); - AssertHelpers.assertThrows("Should reject calls without all required args", - AnalysisException.class, "Missing required parameters", + AssertHelpers.assertThrows( + "Should reject calls without all required args", + AnalysisException.class, + "Missing required parameters", () -> sql("CALL %s.system.rollback_to_snapshot(1L)", catalogName)); - AssertHelpers.assertThrows("Should reject calls without all required args", - AnalysisException.class, "Missing required parameters", + AssertHelpers.assertThrows( + "Should reject calls without all required args", + AnalysisException.class, + "Missing required parameters", () -> sql("CALL %s.system.rollback_to_snapshot(table => 't')", catalogName)); - AssertHelpers.assertThrows("Should reject calls with invalid arg types", - AnalysisException.class, "Wrong arg type for snapshot_id: cannot cast", + AssertHelpers.assertThrows( + "Should reject calls with invalid arg types", + AnalysisException.class, + "Wrong arg type for snapshot_id: cannot cast", () -> sql("CALL %s.system.rollback_to_snapshot('t', 2.2)", catalogName)); - AssertHelpers.assertThrows("Should reject calls with empty table identifier", - IllegalArgumentException.class, "Cannot handle an empty identifier", + AssertHelpers.assertThrows( + "Should reject calls with empty table identifier", + IllegalArgumentException.class, + "Cannot handle an empty identifier", () -> sql("CALL %s.system.rollback_to_snapshot('', 1L)", catalogName)); } } diff --git a/spark/v3.3/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestRollbackToTimestampProcedure.java b/spark/v3.3/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestRollbackToTimestampProcedure.java index 52fc12c7d01e..6da3853bbe24 100644 --- a/spark/v3.3/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestRollbackToTimestampProcedure.java +++ b/spark/v3.3/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestRollbackToTimestampProcedure.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.spark.extensions; import java.time.LocalDateTime; @@ -36,7 +35,8 @@ public class TestRollbackToTimestampProcedure extends SparkExtensionsTestBase { - public TestRollbackToTimestampProcedure(String catalogName, String implementation, Map config) { + public TestRollbackToTimestampProcedure( + String catalogName, String implementation, Map config) { super(catalogName, implementation, config); } @@ -58,7 +58,8 @@ public void testRollbackToTimestampUsingPositionalArgs() { sql("INSERT INTO TABLE %s VALUES (1, 'a')", tableName); - assertEquals("Should have expected rows", + assertEquals( + "Should have expected rows", ImmutableList.of(row(1L, "a"), row(1L, "a")), sql("SELECT * FROM %s ORDER BY id", tableName)); @@ -66,15 +67,18 @@ public void testRollbackToTimestampUsingPositionalArgs() { Snapshot secondSnapshot = table.currentSnapshot(); - List output = sql( - "CALL %s.system.rollback_to_timestamp('%s',TIMESTAMP '%s')", - catalogName, tableIdent, firstSnapshotTimestamp); + List output = + sql( + "CALL %s.system.rollback_to_timestamp('%s',TIMESTAMP '%s')", + catalogName, tableIdent, firstSnapshotTimestamp); - assertEquals("Procedure output must match", + assertEquals( + "Procedure output must match", ImmutableList.of(row(secondSnapshot.snapshotId(), firstSnapshot.snapshotId())), output); - assertEquals("Rollback must be successful", + assertEquals( + "Rollback must be successful", ImmutableList.of(row(1L, "a")), sql("SELECT * FROM %s ORDER BY id", tableName)); } @@ -92,7 +96,8 @@ public void testRollbackToTimestampUsingNamedArgs() { sql("INSERT INTO TABLE %s VALUES (1, 'a')", tableName); - assertEquals("Should have expected rows", + assertEquals( + "Should have expected rows", ImmutableList.of(row(1L, "a"), row(1L, "a")), sql("SELECT * FROM %s ORDER BY id", tableName)); @@ -100,15 +105,18 @@ public void testRollbackToTimestampUsingNamedArgs() { Snapshot secondSnapshot = table.currentSnapshot(); - List output = sql( - "CALL %s.system.rollback_to_timestamp(timestamp => TIMESTAMP '%s', table => '%s')", - catalogName, firstSnapshotTimestamp, tableIdent); + List output = + sql( + "CALL %s.system.rollback_to_timestamp(timestamp => TIMESTAMP '%s', table => '%s')", + catalogName, firstSnapshotTimestamp, tableIdent); - assertEquals("Procedure output must match", + assertEquals( + "Procedure output must match", ImmutableList.of(row(secondSnapshot.snapshotId(), firstSnapshot.snapshotId())), output); - assertEquals("Rollback must be successful", + assertEquals( + "Rollback must be successful", ImmutableList.of(row(1L, "a")), sql("SELECT * FROM %s ORDER BY id", tableName)); } @@ -135,21 +143,23 @@ public void testRollbackToTimestampRefreshesRelationCache() { spark.sql("CACHE TABLE tmp"); - assertEquals("View should have expected rows", + assertEquals( + "View should have expected rows", ImmutableList.of(row(1L, "a"), row(1L, "a")), sql("SELECT * FROM tmp")); - List output = sql( - "CALL %s.system.rollback_to_timestamp(table => '%s', timestamp => TIMESTAMP '%s')", - catalogName, tableIdent, firstSnapshotTimestamp); + List output = + sql( + "CALL %s.system.rollback_to_timestamp(table => '%s', timestamp => TIMESTAMP '%s')", + catalogName, tableIdent, firstSnapshotTimestamp); - assertEquals("Procedure output must match", + assertEquals( + "Procedure output must match", ImmutableList.of(row(secondSnapshot.snapshotId(), firstSnapshot.snapshotId())), output); - assertEquals("View cache must be invalidated", - ImmutableList.of(row(1L, "a")), - sql("SELECT * FROM tmp")); + assertEquals( 
+ "View cache must be invalidated", ImmutableList.of(row(1L, "a")), sql("SELECT * FROM tmp")); sql("UNCACHE TABLE tmp"); } @@ -167,7 +177,8 @@ public void testRollbackToTimestampWithQuotedIdentifiers() { sql("INSERT INTO TABLE %s VALUES (1, 'a')", tableName); - assertEquals("Should have expected rows", + assertEquals( + "Should have expected rows", ImmutableList.of(row(1L, "a"), row(1L, "a")), sql("SELECT * FROM %s ORDER BY id", tableName)); @@ -183,15 +194,18 @@ public void testRollbackToTimestampWithQuotedIdentifiers() { } String quotedNamespace = quotedNamespaceBuilder.toString(); - List output = sql( - "CALL %s.system.rollback_to_timestamp('%s', TIMESTAMP '%s')", - catalogName, quotedNamespace + ".`" + tableIdent.name() + "`", firstSnapshotTimestamp); + List output = + sql( + "CALL %s.system.rollback_to_timestamp('%s', TIMESTAMP '%s')", + catalogName, quotedNamespace + ".`" + tableIdent.name() + "`", firstSnapshotTimestamp); - assertEquals("Procedure output must match", + assertEquals( + "Procedure output must match", ImmutableList.of(row(secondSnapshot.snapshotId(), firstSnapshot.snapshotId())), output); - assertEquals("Rollback must be successful", + assertEquals( + "Rollback must be successful", ImmutableList.of(row(1L, "a")), sql("SELECT * FROM %s ORDER BY id", tableName)); } @@ -211,7 +225,8 @@ public void testRollbackToTimestampWithoutExplicitCatalog() { sql("INSERT INTO TABLE %s VALUES (1, 'a')", tableName); - assertEquals("Should have expected rows", + assertEquals( + "Should have expected rows", ImmutableList.of(row(1L, "a"), row(1L, "a")), sql("SELECT * FROM %s ORDER BY id", tableName)); @@ -220,15 +235,18 @@ public void testRollbackToTimestampWithoutExplicitCatalog() { Snapshot secondSnapshot = table.currentSnapshot(); // use camel case intentionally to test case sensitivity - List output = sql( - "CALL SyStEm.rOLlBaCk_to_TiMeStaMp('%s', TIMESTAMP '%s')", - tableIdent, firstSnapshotTimestamp); + List output = + sql( + "CALL SyStEm.rOLlBaCk_to_TiMeStaMp('%s', TIMESTAMP '%s')", + tableIdent, firstSnapshotTimestamp); - assertEquals("Procedure output must match", + assertEquals( + "Procedure output must match", ImmutableList.of(row(secondSnapshot.snapshotId(), firstSnapshot.snapshotId())), output); - assertEquals("Rollback must be successful", + assertEquals( + "Rollback must be successful", ImmutableList.of(row(1L, "a")), sql("SELECT * FROM %s ORDER BY id", tableName)); } @@ -237,32 +255,50 @@ public void testRollbackToTimestampWithoutExplicitCatalog() { public void testInvalidRollbackToTimestampCases() { String timestamp = "TIMESTAMP '2007-12-03T10:15:30'"; - AssertHelpers.assertThrows("Should not allow mixed args", - AnalysisException.class, "Named and positional arguments cannot be mixed", - () -> sql("CALL %s.system.rollback_to_timestamp(namespace => 'n1', 't', %s)", catalogName, timestamp)); - - AssertHelpers.assertThrows("Should not resolve procedures in arbitrary namespaces", - NoSuchProcedureException.class, "not found", + AssertHelpers.assertThrows( + "Should not allow mixed args", + AnalysisException.class, + "Named and positional arguments cannot be mixed", + () -> + sql( + "CALL %s.system.rollback_to_timestamp(namespace => 'n1', 't', %s)", + catalogName, timestamp)); + + AssertHelpers.assertThrows( + "Should not resolve procedures in arbitrary namespaces", + NoSuchProcedureException.class, + "not found", () -> sql("CALL %s.custom.rollback_to_timestamp('n', 't', %s)", catalogName, timestamp)); - AssertHelpers.assertThrows("Should reject calls without all required 
args", - AnalysisException.class, "Missing required parameters", + AssertHelpers.assertThrows( + "Should reject calls without all required args", + AnalysisException.class, + "Missing required parameters", () -> sql("CALL %s.system.rollback_to_timestamp('t')", catalogName)); - AssertHelpers.assertThrows("Should reject calls without all required args", - AnalysisException.class, "Missing required parameters", + AssertHelpers.assertThrows( + "Should reject calls without all required args", + AnalysisException.class, + "Missing required parameters", () -> sql("CALL %s.system.rollback_to_timestamp(timestamp => %s)", catalogName, timestamp)); - AssertHelpers.assertThrows("Should reject calls without all required args", - AnalysisException.class, "Missing required parameters", + AssertHelpers.assertThrows( + "Should reject calls without all required args", + AnalysisException.class, + "Missing required parameters", () -> sql("CALL %s.system.rollback_to_timestamp(table => 't')", catalogName)); - AssertHelpers.assertThrows("Should reject calls with extra args", - AnalysisException.class, "Too many arguments", - () -> sql("CALL %s.system.rollback_to_timestamp('n', 't', %s, 1L)", catalogName, timestamp)); - - AssertHelpers.assertThrows("Should reject calls with invalid arg types", - AnalysisException.class, "Wrong arg type for timestamp: cannot cast", + AssertHelpers.assertThrows( + "Should reject calls with extra args", + AnalysisException.class, + "Too many arguments", + () -> + sql("CALL %s.system.rollback_to_timestamp('n', 't', %s, 1L)", catalogName, timestamp)); + + AssertHelpers.assertThrows( + "Should reject calls with invalid arg types", + AnalysisException.class, + "Wrong arg type for timestamp: cannot cast", () -> sql("CALL %s.system.rollback_to_timestamp('t', 2.2)", catalogName)); } } diff --git a/spark/v3.3/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestSetCurrentSnapshotProcedure.java b/spark/v3.3/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestSetCurrentSnapshotProcedure.java index 0ea8c4861e8c..8a8a974bbebe 100644 --- a/spark/v3.3/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestSetCurrentSnapshotProcedure.java +++ b/spark/v3.3/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestSetCurrentSnapshotProcedure.java @@ -16,9 +16,10 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.spark.extensions; +import static org.apache.iceberg.TableProperties.WRITE_AUDIT_PUBLISH_ENABLED; + import java.util.List; import java.util.Map; import org.apache.iceberg.AssertHelpers; @@ -34,11 +35,10 @@ import org.junit.Assume; import org.junit.Test; -import static org.apache.iceberg.TableProperties.WRITE_AUDIT_PUBLISH_ENABLED; - public class TestSetCurrentSnapshotProcedure extends SparkExtensionsTestBase { - public TestSetCurrentSnapshotProcedure(String catalogName, String implementation, Map config) { + public TestSetCurrentSnapshotProcedure( + String catalogName, String implementation, Map config) { super(catalogName, implementation, config); } @@ -57,7 +57,8 @@ public void testSetCurrentSnapshotUsingPositionalArgs() { sql("INSERT INTO TABLE %s VALUES (1, 'a')", tableName); - assertEquals("Should have expected rows", + assertEquals( + "Should have expected rows", ImmutableList.of(row(1L, "a"), row(1L, "a")), sql("SELECT * FROM %s ORDER BY id", tableName)); @@ -65,15 +66,18 @@ public void testSetCurrentSnapshotUsingPositionalArgs() { Snapshot secondSnapshot = table.currentSnapshot(); - List output = sql( - "CALL %s.system.set_current_snapshot('%s', %dL)", - catalogName, tableIdent, firstSnapshot.snapshotId()); + List output = + sql( + "CALL %s.system.set_current_snapshot('%s', %dL)", + catalogName, tableIdent, firstSnapshot.snapshotId()); - assertEquals("Procedure output must match", + assertEquals( + "Procedure output must match", ImmutableList.of(row(secondSnapshot.snapshotId(), firstSnapshot.snapshotId())), output); - assertEquals("Set must be successful", + assertEquals( + "Set must be successful", ImmutableList.of(row(1L, "a")), sql("SELECT * FROM %s ORDER BY id", tableName)); } @@ -88,7 +92,8 @@ public void testSetCurrentSnapshotUsingNamedArgs() { sql("INSERT INTO TABLE %s VALUES (1, 'a')", tableName); - assertEquals("Should have expected rows", + assertEquals( + "Should have expected rows", ImmutableList.of(row(1L, "a"), row(1L, "a")), sql("SELECT * FROM %s ORDER BY id", tableName)); @@ -96,15 +101,18 @@ public void testSetCurrentSnapshotUsingNamedArgs() { Snapshot secondSnapshot = table.currentSnapshot(); - List output = sql( - "CALL %s.system.set_current_snapshot(snapshot_id => %dL, table => '%s')", - catalogName, firstSnapshot.snapshotId(), tableIdent); + List output = + sql( + "CALL %s.system.set_current_snapshot(snapshot_id => %dL, table => '%s')", + catalogName, firstSnapshot.snapshotId(), tableIdent); - assertEquals("Procedure output must match", + assertEquals( + "Procedure output must match", ImmutableList.of(row(secondSnapshot.snapshotId(), firstSnapshot.snapshotId())), output); - assertEquals("Set must be successful", + assertEquals( + "Set must be successful", ImmutableList.of(row(1L, "a")), sql("SELECT * FROM %s ORDER BY id", tableName)); } @@ -118,22 +126,26 @@ public void testSetCurrentSnapshotWap() { sql("INSERT INTO TABLE %s VALUES (1, 'a')", tableName); - assertEquals("Should not see rows from staged snapshot", + assertEquals( + "Should not see rows from staged snapshot", ImmutableList.of(), sql("SELECT * FROM %s", tableName)); Table table = validationCatalog.loadTable(tableIdent); Snapshot wapSnapshot = Iterables.getOnlyElement(table.snapshots()); - List output = sql( - "CALL %s.system.set_current_snapshot(table => '%s', snapshot_id => %dL)", - catalogName, tableIdent, wapSnapshot.snapshotId()); + List output = + sql( + "CALL %s.system.set_current_snapshot(table => '%s', snapshot_id => %dL)", + catalogName, tableIdent, 
wapSnapshot.snapshotId()); - assertEquals("Procedure output must match", + assertEquals( + "Procedure output must match", ImmutableList.of(row(null, wapSnapshot.snapshotId())), output); - assertEquals("Current snapshot must be set correctly", + assertEquals( + "Current snapshot must be set correctly", ImmutableList.of(row(1L, "a")), sql("SELECT * FROM %s", tableName)); } @@ -150,7 +162,8 @@ public void tesSetCurrentSnapshotWithoutExplicitCatalog() { sql("INSERT INTO TABLE %s VALUES (1, 'a')", tableName); - assertEquals("Should have expected rows", + assertEquals( + "Should have expected rows", ImmutableList.of(row(1L, "a"), row(1L, "a")), sql("SELECT * FROM %s ORDER BY id", tableName)); @@ -159,15 +172,16 @@ public void tesSetCurrentSnapshotWithoutExplicitCatalog() { Snapshot secondSnapshot = table.currentSnapshot(); // use camel case intentionally to test case sensitivity - List output = sql( - "CALL SyStEm.sEt_cuRrEnT_sNaPsHot('%s', %dL)", - tableIdent, firstSnapshot.snapshotId()); + List output = + sql("CALL SyStEm.sEt_cuRrEnT_sNaPsHot('%s', %dL)", tableIdent, firstSnapshot.snapshotId()); - assertEquals("Procedure output must match", + assertEquals( + "Procedure output must match", ImmutableList.of(row(secondSnapshot.snapshotId(), firstSnapshot.snapshotId())), output); - assertEquals("Set must be successful", + assertEquals( + "Set must be successful", ImmutableList.of(row(1L, "a")), sql("SELECT * FROM %s ORDER BY id", tableName)); } @@ -179,43 +193,64 @@ public void testSetCurrentSnapshotToInvalidSnapshot() { Namespace namespace = tableIdent.namespace(); String tableName = tableIdent.name(); - AssertHelpers.assertThrows("Should reject invalid snapshot id", - ValidationException.class, "Cannot roll back to unknown snapshot id", + AssertHelpers.assertThrows( + "Should reject invalid snapshot id", + ValidationException.class, + "Cannot roll back to unknown snapshot id", () -> sql("CALL %s.system.set_current_snapshot('%s', -1L)", catalogName, tableIdent)); } @Test public void testInvalidRollbackToSnapshotCases() { - AssertHelpers.assertThrows("Should not allow mixed args", - AnalysisException.class, "Named and positional arguments cannot be mixed", - () -> sql("CALL %s.system.set_current_snapshot(namespace => 'n1', table => 't', 1L)", catalogName)); - - AssertHelpers.assertThrows("Should not resolve procedures in arbitrary namespaces", - NoSuchProcedureException.class, "not found", + AssertHelpers.assertThrows( + "Should not allow mixed args", + AnalysisException.class, + "Named and positional arguments cannot be mixed", + () -> + sql( + "CALL %s.system.set_current_snapshot(namespace => 'n1', table => 't', 1L)", + catalogName)); + + AssertHelpers.assertThrows( + "Should not resolve procedures in arbitrary namespaces", + NoSuchProcedureException.class, + "not found", () -> sql("CALL %s.custom.set_current_snapshot('n', 't', 1L)", catalogName)); - AssertHelpers.assertThrows("Should reject calls without all required args", - AnalysisException.class, "Missing required parameters", + AssertHelpers.assertThrows( + "Should reject calls without all required args", + AnalysisException.class, + "Missing required parameters", () -> sql("CALL %s.system.set_current_snapshot('t')", catalogName)); - AssertHelpers.assertThrows("Should reject calls without all required args", - AnalysisException.class, "Missing required parameters", + AssertHelpers.assertThrows( + "Should reject calls without all required args", + AnalysisException.class, + "Missing required parameters", () -> sql("CALL 
%s.system.set_current_snapshot(1L)", catalogName)); - AssertHelpers.assertThrows("Should reject calls without all required args", - AnalysisException.class, "Missing required parameters", + AssertHelpers.assertThrows( + "Should reject calls without all required args", + AnalysisException.class, + "Missing required parameters", () -> sql("CALL %s.system.set_current_snapshot(snapshot_id => 1L)", catalogName)); - AssertHelpers.assertThrows("Should reject calls without all required args", - AnalysisException.class, "Missing required parameters", + AssertHelpers.assertThrows( + "Should reject calls without all required args", + AnalysisException.class, + "Missing required parameters", () -> sql("CALL %s.system.set_current_snapshot(table => 't')", catalogName)); - AssertHelpers.assertThrows("Should reject calls with invalid arg types", - AnalysisException.class, "Wrong arg type for snapshot_id: cannot cast", + AssertHelpers.assertThrows( + "Should reject calls with invalid arg types", + AnalysisException.class, + "Wrong arg type for snapshot_id: cannot cast", () -> sql("CALL %s.system.set_current_snapshot('t', 2.2)", catalogName)); - AssertHelpers.assertThrows("Should reject calls with empty table identifier", - IllegalArgumentException.class, "Cannot handle an empty identifier", + AssertHelpers.assertThrows( + "Should reject calls with empty table identifier", + IllegalArgumentException.class, + "Cannot handle an empty identifier", () -> sql("CALL %s.system.set_current_snapshot('', 1L)", catalogName)); } } diff --git a/spark/v3.3/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestSetWriteDistributionAndOrdering.java b/spark/v3.3/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestSetWriteDistributionAndOrdering.java index 473278d25068..e7e52806792d 100644 --- a/spark/v3.3/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestSetWriteDistributionAndOrdering.java +++ b/spark/v3.3/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestSetWriteDistributionAndOrdering.java @@ -16,9 +16,10 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.spark.extensions; +import static org.apache.iceberg.expressions.Expressions.bucket; + import java.util.Map; import org.apache.iceberg.NullOrder; import org.apache.iceberg.SortOrder; @@ -28,10 +29,9 @@ import org.junit.Assert; import org.junit.Test; -import static org.apache.iceberg.expressions.Expressions.bucket; - public class TestSetWriteDistributionAndOrdering extends SparkExtensionsTestBase { - public TestSetWriteDistributionAndOrdering(String catalogName, String implementation, Map config) { + public TestSetWriteDistributionAndOrdering( + String catalogName, String implementation, Map config) { super(catalogName, implementation, config); } @@ -42,7 +42,9 @@ public void removeTable() { @Test public void testSetWriteOrderByColumn() { - sql("CREATE TABLE %s (id bigint NOT NULL, category string, ts timestamp, data string) USING iceberg", tableName); + sql( + "CREATE TABLE %s (id bigint NOT NULL, category string, ts timestamp, data string) USING iceberg", + tableName); Table table = validationCatalog.loadTable(tableIdent); Assert.assertTrue("Table should start unsorted", table.sortOrder().isUnsorted()); @@ -53,17 +55,20 @@ public void testSetWriteOrderByColumn() { String distributionMode = table.properties().get(TableProperties.WRITE_DISTRIBUTION_MODE); Assert.assertEquals("Distribution mode must match", "range", distributionMode); - SortOrder expected = SortOrder.builderFor(table.schema()) - .withOrderId(1) - .asc("category", NullOrder.NULLS_FIRST) - .asc("id", NullOrder.NULLS_FIRST) - .build(); + SortOrder expected = + SortOrder.builderFor(table.schema()) + .withOrderId(1) + .asc("category", NullOrder.NULLS_FIRST) + .asc("id", NullOrder.NULLS_FIRST) + .build(); Assert.assertEquals("Should have expected order", expected, table.sortOrder()); } @Test public void testSetWriteOrderByColumnWithDirection() { - sql("CREATE TABLE %s (id bigint NOT NULL, category string, ts timestamp, data string) USING iceberg", tableName); + sql( + "CREATE TABLE %s (id bigint NOT NULL, category string, ts timestamp, data string) USING iceberg", + tableName); Table table = validationCatalog.loadTable(tableIdent); Assert.assertTrue("Table should start unsorted", table.sortOrder().isUnsorted()); @@ -74,17 +79,20 @@ public void testSetWriteOrderByColumnWithDirection() { String distributionMode = table.properties().get(TableProperties.WRITE_DISTRIBUTION_MODE); Assert.assertEquals("Distribution mode must match", "range", distributionMode); - SortOrder expected = SortOrder.builderFor(table.schema()) - .withOrderId(1) - .asc("category", NullOrder.NULLS_FIRST) - .desc("id", NullOrder.NULLS_LAST) - .build(); + SortOrder expected = + SortOrder.builderFor(table.schema()) + .withOrderId(1) + .asc("category", NullOrder.NULLS_FIRST) + .desc("id", NullOrder.NULLS_LAST) + .build(); Assert.assertEquals("Should have expected order", expected, table.sortOrder()); } @Test public void testSetWriteOrderByColumnWithDirectionAndNullOrder() { - sql("CREATE TABLE %s (id bigint NOT NULL, category string, ts timestamp, data string) USING iceberg", tableName); + sql( + "CREATE TABLE %s (id bigint NOT NULL, category string, ts timestamp, data string) USING iceberg", + tableName); Table table = validationCatalog.loadTable(tableIdent); Assert.assertTrue("Table should start unsorted", table.sortOrder().isUnsorted()); @@ -95,17 +103,20 @@ public void testSetWriteOrderByColumnWithDirectionAndNullOrder() { String distributionMode = table.properties().get(TableProperties.WRITE_DISTRIBUTION_MODE); 
Assert.assertEquals("Distribution mode must match", "range", distributionMode); - SortOrder expected = SortOrder.builderFor(table.schema()) - .withOrderId(1) - .asc("category", NullOrder.NULLS_LAST) - .desc("id", NullOrder.NULLS_FIRST) - .build(); + SortOrder expected = + SortOrder.builderFor(table.schema()) + .withOrderId(1) + .asc("category", NullOrder.NULLS_LAST) + .desc("id", NullOrder.NULLS_FIRST) + .build(); Assert.assertEquals("Should have expected order", expected, table.sortOrder()); } @Test public void testSetWriteOrderByTransform() { - sql("CREATE TABLE %s (id bigint NOT NULL, category string, ts timestamp, data string) USING iceberg", tableName); + sql( + "CREATE TABLE %s (id bigint NOT NULL, category string, ts timestamp, data string) USING iceberg", + tableName); Table table = validationCatalog.loadTable(tableIdent); Assert.assertTrue("Table should start unsorted", table.sortOrder().isUnsorted()); @@ -116,18 +127,21 @@ public void testSetWriteOrderByTransform() { String distributionMode = table.properties().get(TableProperties.WRITE_DISTRIBUTION_MODE); Assert.assertEquals("Distribution mode must match", "range", distributionMode); - SortOrder expected = SortOrder.builderFor(table.schema()) - .withOrderId(1) - .desc("category") - .asc(bucket("id", 16)) - .asc("id") - .build(); + SortOrder expected = + SortOrder.builderFor(table.schema()) + .withOrderId(1) + .desc("category") + .asc(bucket("id", 16)) + .asc("id") + .build(); Assert.assertEquals("Should have expected order", expected, table.sortOrder()); } @Test public void testSetWriteUnordered() { - sql("CREATE TABLE %s (id bigint NOT NULL, category string, ts timestamp, data string) USING iceberg", tableName); + sql( + "CREATE TABLE %s (id bigint NOT NULL, category string, ts timestamp, data string) USING iceberg", + tableName); Table table = validationCatalog.loadTable(tableIdent); Assert.assertTrue("Table should start unsorted", table.sortOrder().isUnsorted()); @@ -152,7 +166,9 @@ public void testSetWriteUnordered() { @Test public void testSetWriteLocallyOrdered() { - sql("CREATE TABLE %s (id bigint NOT NULL, category string, ts timestamp, data string) USING iceberg", tableName); + sql( + "CREATE TABLE %s (id bigint NOT NULL, category string, ts timestamp, data string) USING iceberg", + tableName); Table table = validationCatalog.loadTable(tableIdent); Assert.assertTrue("Table should start unsorted", table.sortOrder().isUnsorted()); @@ -163,18 +179,21 @@ public void testSetWriteLocallyOrdered() { String distributionMode = table.properties().get(TableProperties.WRITE_DISTRIBUTION_MODE); Assert.assertEquals("Distribution mode must match", "none", distributionMode); - SortOrder expected = SortOrder.builderFor(table.schema()) - .withOrderId(1) - .desc("category") - .asc(bucket("id", 16)) - .asc("id") - .build(); + SortOrder expected = + SortOrder.builderFor(table.schema()) + .withOrderId(1) + .desc("category") + .asc(bucket("id", 16)) + .asc("id") + .build(); Assert.assertEquals("Sort order must match", expected, table.sortOrder()); } @Test public void testSetWriteDistributedByWithSort() { - sql("CREATE TABLE %s (id bigint NOT NULL, category string) USING iceberg PARTITIONED BY (category)", tableName); + sql( + "CREATE TABLE %s (id bigint NOT NULL, category string) USING iceberg PARTITIONED BY (category)", + tableName); Table table = validationCatalog.loadTable(tableIdent); Assert.assertTrue("Table should start unsorted", table.sortOrder().isUnsorted()); @@ -185,16 +204,15 @@ public void testSetWriteDistributedByWithSort() { 
String distributionMode = table.properties().get(TableProperties.WRITE_DISTRIBUTION_MODE); Assert.assertEquals("Distribution mode must match", "hash", distributionMode); - SortOrder expected = SortOrder.builderFor(table.schema()) - .withOrderId(1) - .asc("id") - .build(); + SortOrder expected = SortOrder.builderFor(table.schema()).withOrderId(1).asc("id").build(); Assert.assertEquals("Sort order must match", expected, table.sortOrder()); } @Test public void testSetWriteDistributedByWithLocalSort() { - sql("CREATE TABLE %s (id bigint NOT NULL, category string) USING iceberg PARTITIONED BY (category)", tableName); + sql( + "CREATE TABLE %s (id bigint NOT NULL, category string) USING iceberg PARTITIONED BY (category)", + tableName); Table table = validationCatalog.loadTable(tableIdent); Assert.assertTrue("Table should start unsorted", table.sortOrder().isUnsorted()); @@ -205,16 +223,15 @@ public void testSetWriteDistributedByWithLocalSort() { String distributionMode = table.properties().get(TableProperties.WRITE_DISTRIBUTION_MODE); Assert.assertEquals("Distribution mode must match", "hash", distributionMode); - SortOrder expected = SortOrder.builderFor(table.schema()) - .withOrderId(1) - .asc("id") - .build(); + SortOrder expected = SortOrder.builderFor(table.schema()).withOrderId(1).asc("id").build(); Assert.assertEquals("Sort order must match", expected, table.sortOrder()); } @Test public void testSetWriteDistributedByAndUnordered() { - sql("CREATE TABLE %s (id bigint NOT NULL, category string) USING iceberg PARTITIONED BY (category)", tableName); + sql( + "CREATE TABLE %s (id bigint NOT NULL, category string) USING iceberg PARTITIONED BY (category)", + tableName); Table table = validationCatalog.loadTable(tableIdent); Assert.assertTrue("Table should start unsorted", table.sortOrder().isUnsorted()); @@ -230,7 +247,9 @@ public void testSetWriteDistributedByAndUnordered() { @Test public void testSetWriteDistributedByOnly() { - sql("CREATE TABLE %s (id bigint NOT NULL, category string) USING iceberg PARTITIONED BY (category)", tableName); + sql( + "CREATE TABLE %s (id bigint NOT NULL, category string) USING iceberg PARTITIONED BY (category)", + tableName); Table table = validationCatalog.loadTable(tableIdent); Assert.assertTrue("Table should start unsorted", table.sortOrder().isUnsorted()); @@ -246,7 +265,9 @@ public void testSetWriteDistributedByOnly() { @Test public void testSetWriteDistributedAndUnorderedInverted() { - sql("CREATE TABLE %s (id bigint NOT NULL, category string) USING iceberg PARTITIONED BY (category)", tableName); + sql( + "CREATE TABLE %s (id bigint NOT NULL, category string) USING iceberg PARTITIONED BY (category)", + tableName); Table table = validationCatalog.loadTable(tableIdent); Assert.assertTrue("Table should start unsorted", table.sortOrder().isUnsorted()); @@ -262,7 +283,9 @@ public void testSetWriteDistributedAndUnorderedInverted() { @Test public void testSetWriteDistributedAndLocallyOrderedInverted() { - sql("CREATE TABLE %s (id bigint NOT NULL, category string) USING iceberg PARTITIONED BY (category)", tableName); + sql( + "CREATE TABLE %s (id bigint NOT NULL, category string) USING iceberg PARTITIONED BY (category)", + tableName); Table table = validationCatalog.loadTable(tableIdent); Assert.assertTrue("Table should start unsorted", table.sortOrder().isUnsorted()); @@ -273,10 +296,7 @@ public void testSetWriteDistributedAndLocallyOrderedInverted() { String distributionMode = table.properties().get(TableProperties.WRITE_DISTRIBUTION_MODE); 
Assert.assertEquals("Distribution mode must match", "hash", distributionMode); - SortOrder expected = SortOrder.builderFor(table.schema()) - .withOrderId(1) - .asc("id") - .build(); + SortOrder expected = SortOrder.builderFor(table.schema()).withOrderId(1).asc("id").build(); Assert.assertEquals("Sort order must match", expected, table.sortOrder()); } } diff --git a/spark/v3.3/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestSnapshotTableProcedure.java b/spark/v3.3/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestSnapshotTableProcedure.java index 66fa8e80c515..d8e918d8aadd 100644 --- a/spark/v3.3/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestSnapshotTableProcedure.java +++ b/spark/v3.3/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestSnapshotTableProcedure.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.extensions; import java.io.IOException; @@ -37,12 +36,12 @@ public class TestSnapshotTableProcedure extends SparkExtensionsTestBase { private static final String sourceName = "spark_catalog.default.source"; // Currently we can only Snapshot only out of the Spark Session Catalog - public TestSnapshotTableProcedure(String catalogName, String implementation, Map config) { + public TestSnapshotTableProcedure( + String catalogName, String implementation, Map config) { super(catalogName, implementation, config); } - @Rule - public TemporaryFolder temp = new TemporaryFolder(); + @Rule public TemporaryFolder temp = new TemporaryFolder(); @After public void removeTables() { @@ -53,9 +52,12 @@ public void removeTables() { @Test public void testSnapshot() throws IOException { String location = temp.newFolder().toString(); - sql("CREATE TABLE %s (id bigint NOT NULL, data string) USING parquet LOCATION '%s'", sourceName, location); + sql( + "CREATE TABLE %s (id bigint NOT NULL, data string) USING parquet LOCATION '%s'", + sourceName, location); sql("INSERT INTO TABLE %s VALUES (1, 'a')", sourceName); - Object result = scalarSql("CALL %s.system.snapshot('%s', '%s')", catalogName, sourceName, tableName); + Object result = + scalarSql("CALL %s.system.snapshot('%s', '%s')", catalogName, sourceName, tableName); Assert.assertEquals("Should have added one file", 1L, result); @@ -65,7 +67,8 @@ public void testSnapshot() throws IOException { sql("INSERT INTO TABLE %s VALUES (1, 'a')", tableName); - assertEquals("Should have expected rows", + assertEquals( + "Should have expected rows", ImmutableList.of(row(1L, "a"), row(1L, "a")), sql("SELECT * FROM %s ORDER BY id", tableName)); } @@ -73,11 +76,14 @@ public void testSnapshot() throws IOException { @Test public void testSnapshotWithProperties() throws IOException { String location = temp.newFolder().toString(); - sql("CREATE TABLE %s (id bigint NOT NULL, data string) USING parquet LOCATION '%s'", sourceName, location); + sql( + "CREATE TABLE %s (id bigint NOT NULL, data string) USING parquet LOCATION '%s'", + sourceName, location); sql("INSERT INTO TABLE %s VALUES (1, 'a')", sourceName); - Object result = scalarSql( - "CALL %s.system.snapshot(source_table => '%s', table => '%s', properties => map('foo','bar'))", - catalogName, sourceName, tableName); + Object result = + scalarSql( + "CALL %s.system.snapshot(source_table => '%s', table => '%s', properties => map('foo','bar'))", + catalogName, sourceName, tableName); Assert.assertEquals("Should have added one file", 1L, result); @@ 
-91,30 +97,39 @@ public void testSnapshotWithProperties() throws IOException { sql("INSERT INTO TABLE %s VALUES (1, 'a')", tableName); - assertEquals("Should have expected rows", + assertEquals( + "Should have expected rows", ImmutableList.of(row(1L, "a"), row(1L, "a")), sql("SELECT * FROM %s ORDER BY id", tableName)); } @Test public void testSnapshotWithAlternateLocation() throws IOException { - Assume.assumeTrue("No Snapshoting with Alternate locations with Hadoop Catalogs", !catalogName.contains("hadoop")); + Assume.assumeTrue( + "No Snapshoting with Alternate locations with Hadoop Catalogs", + !catalogName.contains("hadoop")); String location = temp.newFolder().toString(); String snapshotLocation = temp.newFolder().toString(); - sql("CREATE TABLE %s (id bigint NOT NULL, data string) USING parquet LOCATION '%s'", sourceName, location); + sql( + "CREATE TABLE %s (id bigint NOT NULL, data string) USING parquet LOCATION '%s'", + sourceName, location); sql("INSERT INTO TABLE %s VALUES (1, 'a')", sourceName); - Object[] result = sql( - "CALL %s.system.snapshot(source_table => '%s', table => '%s', location => '%s')", - catalogName, sourceName, tableName, snapshotLocation).get(0); + Object[] result = + sql( + "CALL %s.system.snapshot(source_table => '%s', table => '%s', location => '%s')", + catalogName, sourceName, tableName, snapshotLocation) + .get(0); Assert.assertEquals("Should have added one file", 1L, result[0]); String storageLocation = validationCatalog.loadTable(tableIdent).location(); - Assert.assertEquals("Snapshot should be made at specified location", snapshotLocation, storageLocation); + Assert.assertEquals( + "Snapshot should be made at specified location", snapshotLocation, storageLocation); sql("INSERT INTO TABLE %s VALUES (1, 'a')", tableName); - assertEquals("Should have expected rows", + assertEquals( + "Should have expected rows", ImmutableList.of(row(1L, "a"), row(1L, "a")), sql("SELECT * FROM %s ORDER BY id", tableName)); } @@ -122,19 +137,24 @@ public void testSnapshotWithAlternateLocation() throws IOException { @Test public void testDropTable() throws IOException { String location = temp.newFolder().toString(); - sql("CREATE TABLE %s (id bigint NOT NULL, data string) USING parquet LOCATION '%s'", sourceName, location); + sql( + "CREATE TABLE %s (id bigint NOT NULL, data string) USING parquet LOCATION '%s'", + sourceName, location); sql("INSERT INTO TABLE %s VALUES (1, 'a')", sourceName); - Object result = scalarSql("CALL %s.system.snapshot('%s', '%s')", catalogName, sourceName, tableName); + Object result = + scalarSql("CALL %s.system.snapshot('%s', '%s')", catalogName, sourceName, tableName); Assert.assertEquals("Should have added one file", 1L, result); - assertEquals("Should have expected rows", + assertEquals( + "Should have expected rows", ImmutableList.of(row(1L, "a")), sql("SELECT * FROM %s", tableName)); sql("DROP TABLE %s", tableName); - assertEquals("Source table should be intact", + assertEquals( + "Source table should be intact", ImmutableList.of(row(1L, "a")), sql("SELECT * FROM %s", sourceName)); } @@ -142,50 +162,70 @@ public void testDropTable() throws IOException { @Test public void testSnapshotWithConflictingProps() throws IOException { String location = temp.newFolder().toString(); - sql("CREATE TABLE %s (id bigint NOT NULL, data string) USING parquet LOCATION '%s'", sourceName, location); + sql( + "CREATE TABLE %s (id bigint NOT NULL, data string) USING parquet LOCATION '%s'", + sourceName, location); sql("INSERT INTO TABLE %s VALUES (1, 'a')", 
sourceName); - Object result = scalarSql( - "CALL %s.system.snapshot(" + - "source_table => '%s'," + - "table => '%s'," + - "properties => map('%s', 'true', 'snapshot', 'false'))", - catalogName, sourceName, tableName, TableProperties.GC_ENABLED); + Object result = + scalarSql( + "CALL %s.system.snapshot(" + + "source_table => '%s'," + + "table => '%s'," + + "properties => map('%s', 'true', 'snapshot', 'false'))", + catalogName, sourceName, tableName, TableProperties.GC_ENABLED); Assert.assertEquals("Should have added one file", 1L, result); - assertEquals("Should have expected rows", + assertEquals( + "Should have expected rows", ImmutableList.of(row(1L, "a")), sql("SELECT * FROM %s", tableName)); Table table = validationCatalog.loadTable(tableIdent); Map props = table.properties(); Assert.assertEquals("Should override user value", "true", props.get("snapshot")); - Assert.assertEquals("Should override user value", "false", props.get(TableProperties.GC_ENABLED)); + Assert.assertEquals( + "Should override user value", "false", props.get(TableProperties.GC_ENABLED)); } @Test public void testInvalidSnapshotsCases() throws IOException { String location = temp.newFolder().toString(); - sql("CREATE TABLE %s (id bigint NOT NULL, data string) USING parquet LOCATION '%s'", sourceName, location); - - AssertHelpers.assertThrows("Should reject calls without all required args", - AnalysisException.class, "Missing required parameters", + sql( + "CREATE TABLE %s (id bigint NOT NULL, data string) USING parquet LOCATION '%s'", + sourceName, location); + + AssertHelpers.assertThrows( + "Should reject calls without all required args", + AnalysisException.class, + "Missing required parameters", () -> sql("CALL %s.system.snapshot('foo')", catalogName)); - AssertHelpers.assertThrows("Should reject calls with invalid arg types", - AnalysisException.class, "Wrong arg type", + AssertHelpers.assertThrows( + "Should reject calls with invalid arg types", + AnalysisException.class, + "Wrong arg type", () -> sql("CALL %s.system.snapshot('n', 't', map('foo', 'bar'))", catalogName)); - AssertHelpers.assertThrows("Should reject calls with invalid map args", - AnalysisException.class, "cannot resolve 'map", - () -> sql("CALL %s.system.snapshot('%s', 'fable', 'loc', map(2, 1, 1))", catalogName, sourceName)); - - AssertHelpers.assertThrows("Should reject calls with empty table identifier", - IllegalArgumentException.class, "Cannot handle an empty identifier", + AssertHelpers.assertThrows( + "Should reject calls with invalid map args", + AnalysisException.class, + "cannot resolve 'map", + () -> + sql( + "CALL %s.system.snapshot('%s', 'fable', 'loc', map(2, 1, 1))", + catalogName, sourceName)); + + AssertHelpers.assertThrows( + "Should reject calls with empty table identifier", + IllegalArgumentException.class, + "Cannot handle an empty identifier", () -> sql("CALL %s.system.snapshot('', 'dest')", catalogName)); - AssertHelpers.assertThrows("Should reject calls with empty table identifier", - IllegalArgumentException.class, "Cannot handle an empty identifier", + AssertHelpers.assertThrows( + "Should reject calls with empty table identifier", + IllegalArgumentException.class, + "Cannot handle an empty identifier", () -> sql("CALL %s.system.snapshot('src', '')", catalogName)); } } diff --git a/spark/v3.3/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestUpdate.java b/spark/v3.3/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestUpdate.java index 7871e02c5b02..cdaf1c336012 100644 --- 
a/spark/v3.3/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestUpdate.java +++ b/spark/v3.3/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestUpdate.java @@ -16,9 +16,20 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.extensions; +import static org.apache.iceberg.DataOperations.OVERWRITE; +import static org.apache.iceberg.RowLevelOperationMode.COPY_ON_WRITE; +import static org.apache.iceberg.SnapshotSummary.ADDED_FILES_PROP; +import static org.apache.iceberg.SnapshotSummary.CHANGED_PARTITION_COUNT_PROP; +import static org.apache.iceberg.SnapshotSummary.DELETED_FILES_PROP; +import static org.apache.iceberg.TableProperties.PARQUET_ROW_GROUP_SIZE_BYTES; +import static org.apache.iceberg.TableProperties.SPLIT_SIZE; +import static org.apache.iceberg.TableProperties.UPDATE_ISOLATION_LEVEL; +import static org.apache.iceberg.TableProperties.UPDATE_MODE; +import static org.apache.iceberg.TableProperties.UPDATE_MODE_DEFAULT; +import static org.apache.spark.sql.functions.lit; + import java.util.Arrays; import java.util.List; import java.util.Map; @@ -57,22 +68,15 @@ import org.junit.BeforeClass; import org.junit.Test; -import static org.apache.iceberg.DataOperations.OVERWRITE; -import static org.apache.iceberg.RowLevelOperationMode.COPY_ON_WRITE; -import static org.apache.iceberg.SnapshotSummary.ADDED_FILES_PROP; -import static org.apache.iceberg.SnapshotSummary.CHANGED_PARTITION_COUNT_PROP; -import static org.apache.iceberg.SnapshotSummary.DELETED_FILES_PROP; -import static org.apache.iceberg.TableProperties.PARQUET_ROW_GROUP_SIZE_BYTES; -import static org.apache.iceberg.TableProperties.SPLIT_SIZE; -import static org.apache.iceberg.TableProperties.UPDATE_ISOLATION_LEVEL; -import static org.apache.iceberg.TableProperties.UPDATE_MODE; -import static org.apache.iceberg.TableProperties.UPDATE_MODE_DEFAULT; -import static org.apache.spark.sql.functions.lit; - public abstract class TestUpdate extends SparkRowLevelOperationsTestBase { - public TestUpdate(String catalogName, String implementation, Map config, - String fileFormat, boolean vectorized, String distributionMode) { + public TestUpdate( + String catalogName, + String implementation, + Map config, + String fileFormat, + boolean vectorized, + String distributionMode) { super(catalogName, implementation, config, fileFormat, vectorized, distributionMode); } @@ -102,7 +106,8 @@ public void testExplain() { Table table = validationCatalog.loadTable(tableIdent); Assert.assertEquals("Should have 1 snapshot", 1, Iterables.size(table.snapshots())); - assertEquals("Should have expected rows", + assertEquals( + "Should have expected rows", ImmutableList.of(row(1, "hr"), row(2, "hardware"), row(null, "hr")), sql("SELECT * FROM %s ORDER BY id ASC NULLS LAST", tableName)); } @@ -117,7 +122,8 @@ public void testUpdateEmptyTable() { Table table = validationCatalog.loadTable(tableIdent); Assert.assertEquals("Should have 2 snapshots", 2, Iterables.size(table.snapshots())); - assertEquals("Should have expected rows", + assertEquals( + "Should have expected rows", ImmutableList.of(), sql("SELECT * FROM %s ORDER BY id", tableName)); } @@ -132,7 +138,8 @@ public void testUpdateWithAlias() { Table table = validationCatalog.loadTable(tableIdent); Assert.assertEquals("Should have 2 snapshots", 2, Iterables.size(table.snapshots())); - assertEquals("Should have expected rows", + assertEquals( + "Should have expected rows", ImmutableList.of(row(1, "invalid")), 
sql("SELECT * FROM %s", tableName)); } @@ -145,7 +152,8 @@ public void testUpdateAlignsAssignments() { sql("UPDATE %s SET `c2` = c2 - 2, c1 = `c1` - 1 WHERE id <=> 1", tableName); - assertEquals("Should have expected rows", + assertEquals( + "Should have expected rows", ImmutableList.of(row(1, 10, 109), row(2, 22, 222)), sql("SELECT * FROM %s ORDER BY id", tableName)); } @@ -159,7 +167,8 @@ public void testUpdateWithUnsupportedPartitionPredicate() { sql("UPDATE %s t SET `t`.`id` = -1 WHERE t.dep LIKE '%%r' ", tableName); - assertEquals("Should have expected rows", + assertEquals( + "Should have expected rows", ImmutableList.of(row(-1, "hr"), row(1, "software")), sql("SELECT * FROM %s ORDER BY id", tableName)); } @@ -169,12 +178,10 @@ public void testUpdateWithDynamicFileFiltering() { createAndInitTable("id INT, dep STRING"); sql("ALTER TABLE %s ADD PARTITION FIELD dep", tableName); - append(tableName, - "{ \"id\": 1, \"dep\": \"hr\" }\n" + - "{ \"id\": 3, \"dep\": \"hr\" }"); - append(tableName, - "{ \"id\": 1, \"dep\": \"hardware\" }\n" + - "{ \"id\": 2, \"dep\": \"hardware\" }"); + append(tableName, "{ \"id\": 1, \"dep\": \"hr\" }\n" + "{ \"id\": 3, \"dep\": \"hr\" }"); + append( + tableName, + "{ \"id\": 1, \"dep\": \"hardware\" }\n" + "{ \"id\": 2, \"dep\": \"hardware\" }"); sql("UPDATE %s SET id = cast('-1' AS INT) WHERE id = 2", tableName); @@ -188,7 +195,8 @@ public void testUpdateWithDynamicFileFiltering() { validateMergeOnRead(currentSnapshot, "1", "1", "1"); } - assertEquals("Should have expected rows", + assertEquals( + "Should have expected rows", ImmutableList.of(row(-1, "hardware"), row(1, "hardware"), row(1, "hr"), row(3, "hr")), sql("SELECT * FROM %s ORDER BY id, dep", tableName)); } @@ -212,7 +220,8 @@ public void testUpdateNonExistingRecords() { validateMergeOnRead(currentSnapshot, "0", null, null); } - assertEquals("Should have expected rows", + assertEquals( + "Should have expected rows", ImmutableList.of(row(1, "hr"), row(2, "hardware"), row(null, "hr")), sql("SELECT * FROM %s ORDER BY id ASC NULLS LAST", tableName)); } @@ -229,10 +238,13 @@ public void testUpdateWithoutCondition() { sql("INSERT INTO TABLE %s VALUES (null, 'hr')", tableName); // set the num of shuffle partitions to 200 instead of default 4 to reduce the chance of hashing - // records for multiple source files to one writing task (needed for a predictable num of output files) - withSQLConf(ImmutableMap.of(SQLConf.SHUFFLE_PARTITIONS().key(), "200"), () -> { - sql("UPDATE %s SET id = -1", tableName); - }); + // records for multiple source files to one writing task (needed for a predictable num of output + // files) + withSQLConf( + ImmutableMap.of(SQLConf.SHUFFLE_PARTITIONS().key(), "200"), + () -> { + sql("UPDATE %s SET id = -1", tableName); + }); Table table = validationCatalog.loadTable(tableIdent); Assert.assertEquals("Should have 4 snapshots", 4, Iterables.size(table.snapshots())); @@ -249,7 +261,8 @@ public void testUpdateWithoutCondition() { validateMergeOnRead(currentSnapshot, "2", "2", "2"); } - assertEquals("Should have expected rows", + assertEquals( + "Should have expected rows", ImmutableList.of(row(-1, "hardware"), row(-1, "hr"), row(-1, "hr")), sql("SELECT * FROM %s ORDER BY dep ASC", tableName)); } @@ -258,26 +271,30 @@ public void testUpdateWithoutCondition() { public void testUpdateWithNullConditions() { createAndInitTable("id INT, dep STRING"); - append(tableName, - "{ \"id\": 0, \"dep\": null }\n" + - "{ \"id\": 1, \"dep\": \"hr\" }\n" + - "{ \"id\": 2, \"dep\": \"hardware\" }"); + 
append( + tableName, + "{ \"id\": 0, \"dep\": null }\n" + + "{ \"id\": 1, \"dep\": \"hr\" }\n" + + "{ \"id\": 2, \"dep\": \"hardware\" }"); // should not update any rows as null is never equal to null sql("UPDATE %s SET id = -1 WHERE dep = NULL", tableName); - assertEquals("Should have expected rows", + assertEquals( + "Should have expected rows", ImmutableList.of(row(0, null), row(1, "hr"), row(2, "hardware")), sql("SELECT * FROM %s ORDER BY id", tableName)); // should not update any rows the condition does not match any records sql("UPDATE %s SET id = -1 WHERE dep = 'software'", tableName); - assertEquals("Should have expected rows", + assertEquals( + "Should have expected rows", ImmutableList.of(row(0, null), row(1, "hr"), row(2, "hardware")), sql("SELECT * FROM %s ORDER BY id", tableName)); // should update one matching row with a null-safe condition sql("UPDATE %s SET dep = 'invalid', id = -1 WHERE dep <=> NULL", tableName); - assertEquals("Should have expected rows", + assertEquals( + "Should have expected rows", ImmutableList.of(row(-1, "invalid"), row(1, "hr"), row(2, "hardware")), sql("SELECT * FROM %s ORDER BY id", tableName)); } @@ -286,23 +303,27 @@ public void testUpdateWithNullConditions() { public void testUpdateWithInAndNotInConditions() { createAndInitTable("id INT, dep STRING"); - append(tableName, - "{ \"id\": 1, \"dep\": \"hr\" }\n" + - "{ \"id\": 2, \"dep\": \"hardware\" }\n" + - "{ \"id\": null, \"dep\": \"hr\" }"); + append( + tableName, + "{ \"id\": 1, \"dep\": \"hr\" }\n" + + "{ \"id\": 2, \"dep\": \"hardware\" }\n" + + "{ \"id\": null, \"dep\": \"hr\" }"); sql("UPDATE %s SET id = -1 WHERE id IN (1, null)", tableName); - assertEquals("Should have expected rows", + assertEquals( + "Should have expected rows", ImmutableList.of(row(-1, "hr"), row(2, "hardware"), row(null, "hr")), sql("SELECT * FROM %s ORDER BY id ASC NULLS LAST", tableName)); sql("UPDATE %s SET id = 100 WHERE id NOT IN (null, 1)", tableName); - assertEquals("Should have expected rows", + assertEquals( + "Should have expected rows", ImmutableList.of(row(-1, "hr"), row(2, "hardware"), row(null, "hr")), sql("SELECT * FROM %s ORDER BY id ASC NULLS LAST", tableName)); sql("UPDATE %s SET id = 100 WHERE id NOT IN (1, 10)", tableName); - assertEquals("Should have expected rows", + assertEquals( + "Should have expected rows", ImmutableList.of(row(100, "hardware"), row(100, "hr"), row(null, "hr")), sql("SELECT * FROM %s ORDER BY id ASC NULLS LAST, dep", tableName)); } @@ -314,16 +335,20 @@ public void testUpdateWithMultipleRowGroupsParquet() throws NoSuchTableException createAndInitTable("id INT, dep STRING"); sql("ALTER TABLE %s ADD PARTITION FIELD dep", tableName); - sql("ALTER TABLE %s SET TBLPROPERTIES('%s' '%d')", tableName, PARQUET_ROW_GROUP_SIZE_BYTES, 100); + sql( + "ALTER TABLE %s SET TBLPROPERTIES('%s' '%d')", + tableName, PARQUET_ROW_GROUP_SIZE_BYTES, 100); sql("ALTER TABLE %s SET TBLPROPERTIES('%s' '%d')", tableName, SPLIT_SIZE, 100); List ids = Lists.newArrayListWithCapacity(200); for (int id = 1; id <= 200; id++) { ids.add(id); } - Dataset df = spark.createDataset(ids, Encoders.INT()) - .withColumnRenamed("value", "id") - .withColumn("dep", lit("hr")); + Dataset df = + spark + .createDataset(ids, Encoders.INT()) + .withColumnRenamed("value", "id") + .withColumn("dep", lit("hr")); df.coalesce(1).writeTo(tableName).append(); Assert.assertEquals(200, spark.table(tableName).count()); @@ -336,27 +361,33 @@ public void testUpdateWithMultipleRowGroupsParquet() throws NoSuchTableException @Test public 
void testUpdateNestedStructFields() { - createAndInitTable("id INT, s STRUCT,m:MAP>>", + createAndInitTable( + "id INT, s STRUCT,m:MAP>>", "{ \"id\": 1, \"s\": { \"c1\": 2, \"c2\": { \"a\": [1,2], \"m\": { \"a\": \"b\"} } } } }"); // update primitive, array, map columns inside a struct sql("UPDATE %s SET s.c1 = -1, s.c2.m = map('k', 'v'), s.c2.a = array(-1)", tableName); - assertEquals("Output should match", + assertEquals( + "Output should match", ImmutableList.of(row(1, row(-1, row(ImmutableList.of(-1), ImmutableMap.of("k", "v"))))), sql("SELECT * FROM %s", tableName)); // set primitive, array, map columns to NULL (proper casts should be in place) sql("UPDATE %s SET s.c1 = NULL, s.c2 = NULL WHERE id IN (1)", tableName); - assertEquals("Output should match", + assertEquals( + "Output should match", ImmutableList.of(row(1, row(null, null))), sql("SELECT * FROM %s", tableName)); // update all fields in a struct - sql("UPDATE %s SET s = named_struct('c1', 1, 'c2', named_struct('a', array(1), 'm', null))", tableName); + sql( + "UPDATE %s SET s = named_struct('c1', 1, 'c2', named_struct('a', array(1), 'm', null))", + tableName); - assertEquals("Output should match", + assertEquals( + "Output should match", ImmutableList.of(row(1, row(1, row(ImmutableList.of(1), null)))), sql("SELECT * FROM %s", tableName)); } @@ -366,29 +397,33 @@ public void testUpdateWithUserDefinedDistribution() { createAndInitTable("id INT, c2 INT, c3 INT"); sql("ALTER TABLE %s ADD PARTITION FIELD bucket(8, c3)", tableName); - append(tableName, - "{ \"id\": 1, \"c2\": 11, \"c3\": 1 }\n" + - "{ \"id\": 2, \"c2\": 22, \"c3\": 1 }\n" + - "{ \"id\": 3, \"c2\": 33, \"c3\": 1 }"); + append( + tableName, + "{ \"id\": 1, \"c2\": 11, \"c3\": 1 }\n" + + "{ \"id\": 2, \"c2\": 22, \"c3\": 1 }\n" + + "{ \"id\": 3, \"c2\": 33, \"c3\": 1 }"); // request a global sort sql("ALTER TABLE %s WRITE ORDERED BY c2", tableName); sql("UPDATE %s SET c2 = -22 WHERE id NOT IN (1, 3)", tableName); - assertEquals("Should have expected rows", + assertEquals( + "Should have expected rows", ImmutableList.of(row(1, 11, 1), row(2, -22, 1), row(3, 33, 1)), sql("SELECT * FROM %s ORDER BY id ASC NULLS LAST", tableName)); // request a local sort sql("ALTER TABLE %s WRITE LOCALLY ORDERED BY id", tableName); sql("UPDATE %s SET c2 = -33 WHERE id = 3", tableName); - assertEquals("Should have expected rows", + assertEquals( + "Should have expected rows", ImmutableList.of(row(1, 11, 1), row(2, -22, 1), row(3, -33, 1)), sql("SELECT * FROM %s ORDER BY id ASC NULLS LAST", tableName)); // request a hash distribution + local sort sql("ALTER TABLE %s WRITE DISTRIBUTED BY PARTITION ORDERED BY id", tableName); sql("UPDATE %s SET c2 = -11 WHERE id = 1", tableName); - assertEquals("Should have expected rows", + assertEquals( + "Should have expected rows", ImmutableList.of(row(1, -11, 1), row(2, -22, 1), row(3, -33, 1)), sql("SELECT * FROM %s ORDER BY id ASC NULLS LAST", tableName)); } @@ -400,34 +435,41 @@ public synchronized void testUpdateWithSerializableIsolation() throws Interrupte createAndInitTable("id INT, dep STRING"); - sql("ALTER TABLE %s SET TBLPROPERTIES('%s' '%s')", tableName, UPDATE_ISOLATION_LEVEL, "serializable"); + sql( + "ALTER TABLE %s SET TBLPROPERTIES('%s' '%s')", + tableName, UPDATE_ISOLATION_LEVEL, "serializable"); - ExecutorService executorService = MoreExecutors.getExitingExecutorService( - (ThreadPoolExecutor) Executors.newFixedThreadPool(2)); + ExecutorService executorService = + MoreExecutors.getExitingExecutorService( + (ThreadPoolExecutor) 
Executors.newFixedThreadPool(2)); AtomicInteger barrier = new AtomicInteger(0); // update thread - Future updateFuture = executorService.submit(() -> { - for (int numOperations = 0; numOperations < Integer.MAX_VALUE; numOperations++) { - while (barrier.get() < numOperations * 2) { - sleep(10); - } - sql("UPDATE %s SET id = -1 WHERE id = 1", tableName); - barrier.incrementAndGet(); - } - }); + Future updateFuture = + executorService.submit( + () -> { + for (int numOperations = 0; numOperations < Integer.MAX_VALUE; numOperations++) { + while (barrier.get() < numOperations * 2) { + sleep(10); + } + sql("UPDATE %s SET id = -1 WHERE id = 1", tableName); + barrier.incrementAndGet(); + } + }); // append thread - Future appendFuture = executorService.submit(() -> { - for (int numOperations = 0; numOperations < Integer.MAX_VALUE; numOperations++) { - while (barrier.get() < numOperations * 2) { - sleep(10); - } - sql("INSERT INTO TABLE %s VALUES (1, 'hr')", tableName); - barrier.incrementAndGet(); - } - }); + Future appendFuture = + executorService.submit( + () -> { + for (int numOperations = 0; numOperations < Integer.MAX_VALUE; numOperations++) { + while (barrier.get() < numOperations * 2) { + sleep(10); + } + sql("INSERT INTO TABLE %s VALUES (1, 'hr')", tableName); + barrier.incrementAndGet(); + } + }); try { updateFuture.get(); @@ -438,7 +480,8 @@ public synchronized void testUpdateWithSerializableIsolation() throws Interrupte Throwable validationException = sparkException.getCause(); Assert.assertThat(validationException, CoreMatchers.instanceOf(ValidationException.class)); String errMsg = validationException.getMessage(); - Assert.assertThat(errMsg, CoreMatchers.containsString("Found conflicting files that can contain")); + Assert.assertThat( + errMsg, CoreMatchers.containsString("Found conflicting files that can contain")); } finally { appendFuture.cancel(true); } @@ -448,40 +491,48 @@ public synchronized void testUpdateWithSerializableIsolation() throws Interrupte } @Test - public synchronized void testUpdateWithSnapshotIsolation() throws InterruptedException, ExecutionException { + public synchronized void testUpdateWithSnapshotIsolation() + throws InterruptedException, ExecutionException { // cannot run tests with concurrency for Hadoop tables without atomic renames Assume.assumeFalse(catalogName.equalsIgnoreCase("testhadoop")); createAndInitTable("id INT, dep STRING"); - sql("ALTER TABLE %s SET TBLPROPERTIES('%s' '%s')", tableName, UPDATE_ISOLATION_LEVEL, "snapshot"); + sql( + "ALTER TABLE %s SET TBLPROPERTIES('%s' '%s')", + tableName, UPDATE_ISOLATION_LEVEL, "snapshot"); - ExecutorService executorService = MoreExecutors.getExitingExecutorService( - (ThreadPoolExecutor) Executors.newFixedThreadPool(2)); + ExecutorService executorService = + MoreExecutors.getExitingExecutorService( + (ThreadPoolExecutor) Executors.newFixedThreadPool(2)); AtomicInteger barrier = new AtomicInteger(0); // update thread - Future updateFuture = executorService.submit(() -> { - for (int numOperations = 0; numOperations < 20; numOperations++) { - while (barrier.get() < numOperations * 2) { - sleep(10); - } - sql("UPDATE %s SET id = -1 WHERE id = 1", tableName); - barrier.incrementAndGet(); - } - }); + Future updateFuture = + executorService.submit( + () -> { + for (int numOperations = 0; numOperations < 20; numOperations++) { + while (barrier.get() < numOperations * 2) { + sleep(10); + } + sql("UPDATE %s SET id = -1 WHERE id = 1", tableName); + barrier.incrementAndGet(); + } + }); // append thread - Future 
appendFuture = executorService.submit(() -> { - for (int numOperations = 0; numOperations < 20; numOperations++) { - while (barrier.get() < numOperations * 2) { - sleep(10); - } - sql("INSERT INTO TABLE %s VALUES (1, 'hr')", tableName); - barrier.incrementAndGet(); - } - }); + Future appendFuture = + executorService.submit( + () -> { + for (int numOperations = 0; numOperations < 20; numOperations++) { + while (barrier.get() < numOperations * 2) { + sleep(10); + } + sql("INSERT INTO TABLE %s VALUES (1, 'hr')", tableName); + barrier.incrementAndGet(); + } + }); try { updateFuture.get(); @@ -499,7 +550,8 @@ public void testUpdateWithInferredCasts() { sql("UPDATE %s SET s = -1 WHERE id = 1", tableName); - assertEquals("Should have expected rows", + assertEquals( + "Should have expected rows", ImmutableList.of(row(1, "-1")), sql("SELECT * FROM %s", tableName)); } @@ -510,7 +562,8 @@ public void testUpdateModifiesNullStruct() { sql("UPDATE %s SET s.n1 = -1 WHERE id = 1", tableName); - assertEquals("Should have expected rows", + assertEquals( + "Should have expected rows", ImmutableList.of(row(1, row(-1, null))), sql("SELECT * FROM %s", tableName)); } @@ -520,20 +573,19 @@ public void testUpdateRefreshesRelationCache() { createAndInitTable("id INT, dep STRING"); sql("ALTER TABLE %s ADD PARTITION FIELD dep", tableName); - append(tableName, - "{ \"id\": 1, \"dep\": \"hr\" }\n" + - "{ \"id\": 3, \"dep\": \"hr\" }"); + append(tableName, "{ \"id\": 1, \"dep\": \"hr\" }\n" + "{ \"id\": 3, \"dep\": \"hr\" }"); - append(tableName, - "{ \"id\": 1, \"dep\": \"hardware\" }\n" + - "{ \"id\": 2, \"dep\": \"hardware\" }"); + append( + tableName, + "{ \"id\": 1, \"dep\": \"hardware\" }\n" + "{ \"id\": 2, \"dep\": \"hardware\" }"); Dataset query = spark.sql("SELECT * FROM " + tableName + " WHERE id = 1"); query.createOrReplaceTempView("tmp"); spark.sql("CACHE TABLE tmp"); - assertEquals("View should have correct data", + assertEquals( + "View should have correct data", ImmutableList.of(row(1, "hardware"), row(1, "hr")), sql("SELECT * FROM tmp ORDER BY id, dep")); @@ -549,11 +601,13 @@ public void testUpdateRefreshesRelationCache() { validateMergeOnRead(currentSnapshot, "2", "2", "2"); } - assertEquals("Should have expected rows", + assertEquals( + "Should have expected rows", ImmutableList.of(row(-1, "hardware"), row(-1, "hr"), row(2, "hardware"), row(3, "hr")), sql("SELECT * FROM %s ORDER BY id, dep", tableName)); - assertEquals("Should refresh the relation cache", + assertEquals( + "Should refresh the relation cache", ImmutableList.of(), sql("SELECT * FROM tmp ORDER BY id, dep")); @@ -564,36 +618,47 @@ public void testUpdateRefreshesRelationCache() { public void testUpdateWithInSubquery() { createAndInitTable("id INT, dep STRING"); - append(tableName, - "{ \"id\": 1, \"dep\": \"hr\" }\n" + - "{ \"id\": 2, \"dep\": \"hardware\" }\n" + - "{ \"id\": null, \"dep\": \"hr\" }"); + append( + tableName, + "{ \"id\": 1, \"dep\": \"hr\" }\n" + + "{ \"id\": 2, \"dep\": \"hardware\" }\n" + + "{ \"id\": null, \"dep\": \"hr\" }"); createOrReplaceView("updated_id", Arrays.asList(0, 1, null), Encoders.INT()); createOrReplaceView("updated_dep", Arrays.asList("software", "hr"), Encoders.STRING()); - sql("UPDATE %s SET id = -1 WHERE " + - "id IN (SELECT * FROM updated_id) AND " + - "dep IN (SELECT * from updated_dep)", tableName); - assertEquals("Should have expected rows", + sql( + "UPDATE %s SET id = -1 WHERE " + + "id IN (SELECT * FROM updated_id) AND " + + "dep IN (SELECT * from updated_dep)", + tableName); + assertEquals( + 
"Should have expected rows", ImmutableList.of(row(-1, "hr"), row(2, "hardware"), row(null, "hr")), sql("SELECT * FROM %s ORDER BY id ASC NULLS LAST", tableName)); - sql("UPDATE %s SET id = 5 WHERE id IS NULL OR id IN (SELECT value + 1 FROM updated_id)", tableName); - assertEquals("Should have expected rows", + sql( + "UPDATE %s SET id = 5 WHERE id IS NULL OR id IN (SELECT value + 1 FROM updated_id)", + tableName); + assertEquals( + "Should have expected rows", ImmutableList.of(row(-1, "hr"), row(5, "hardware"), row(5, "hr")), sql("SELECT * FROM %s ORDER BY id, dep", tableName)); - append(tableName, - "{ \"id\": null, \"dep\": \"hr\" }\n" + - "{ \"id\": 2, \"dep\": \"hr\" }"); - assertEquals("Should have expected rows", - ImmutableList.of(row(-1, "hr"), row(2, "hr"), row(5, "hardware"), row(5, "hr"), row(null, "hr")), + append(tableName, "{ \"id\": null, \"dep\": \"hr\" }\n" + "{ \"id\": 2, \"dep\": \"hr\" }"); + assertEquals( + "Should have expected rows", + ImmutableList.of( + row(-1, "hr"), row(2, "hr"), row(5, "hardware"), row(5, "hr"), row(null, "hr")), sql("SELECT * FROM %s ORDER BY id ASC NULLS LAST, dep", tableName)); - sql("UPDATE %s SET id = 10 WHERE id IN (SELECT value + 2 FROM updated_id) AND dep = 'hr'", tableName); - assertEquals("Should have expected rows", - ImmutableList.of(row(-1, "hr"), row(5, "hardware"), row(5, "hr"), row(10, "hr"), row(null, "hr")), + sql( + "UPDATE %s SET id = 10 WHERE id IN (SELECT value + 2 FROM updated_id) AND dep = 'hr'", + tableName); + assertEquals( + "Should have expected rows", + ImmutableList.of( + row(-1, "hr"), row(5, "hardware"), row(5, "hr"), row(10, "hr"), row(null, "hr")), sql("SELECT * FROM %s ORDER BY id ASC NULLS LAST, dep", tableName)); } @@ -604,12 +669,10 @@ public void testUpdateWithInSubqueryAndDynamicFileFiltering() { sql("ALTER TABLE %s WRITE DISTRIBUTED BY PARTITION", tableName); - append(tableName, - "{ \"id\": 1, \"dep\": \"hr\" }\n" + - "{ \"id\": 3, \"dep\": \"hr\" }"); - append(tableName, - "{ \"id\": 1, \"dep\": \"hardware\" }\n" + - "{ \"id\": 2, \"dep\": \"hardware\" }"); + append(tableName, "{ \"id\": 1, \"dep\": \"hr\" }\n" + "{ \"id\": 3, \"dep\": \"hr\" }"); + append( + tableName, + "{ \"id\": 1, \"dep\": \"hardware\" }\n" + "{ \"id\": 2, \"dep\": \"hardware\" }"); createOrReplaceView("updated_id", Arrays.asList(-1, 2), Encoders.INT()); @@ -625,7 +688,8 @@ public void testUpdateWithInSubqueryAndDynamicFileFiltering() { validateMergeOnRead(currentSnapshot, "1", "1", "1"); } - assertEquals("Should have expected rows", + assertEquals( + "Should have expected rows", ImmutableList.of(row(-1, "hardware"), row(1, "hardware"), row(1, "hr"), row(3, "hr")), sql("SELECT * FROM %s ORDER BY id, dep", tableName)); } @@ -634,26 +698,31 @@ public void testUpdateWithInSubqueryAndDynamicFileFiltering() { public void testUpdateWithSelfSubquery() { createAndInitTable("id INT, dep STRING"); - append(tableName, - "{ \"id\": 1, \"dep\": \"hr\" }\n" + - "{ \"id\": 2, \"dep\": \"hr\" }"); + append(tableName, "{ \"id\": 1, \"dep\": \"hr\" }\n" + "{ \"id\": 2, \"dep\": \"hr\" }"); sql("UPDATE %s SET dep = 'x' WHERE id IN (SELECT id + 1 FROM %s)", tableName, tableName); - assertEquals("Should have expected rows", + assertEquals( + "Should have expected rows", ImmutableList.of(row(1, "hr"), row(2, "x")), sql("SELECT * FROM %s ORDER BY id", tableName)); // TODO: Spark does not support AQE and DPP with aggregates at the moment - withSQLConf(ImmutableMap.of(SQLConf.ADAPTIVE_EXECUTION_ENABLED().key(), "false"), () -> { - sql("UPDATE %s SET dep = 
'y' WHERE " + - "id = (SELECT count(*) FROM (SELECT DISTINCT id FROM %s) AS t)", tableName, tableName); - assertEquals("Should have expected rows", - ImmutableList.of(row(1, "hr"), row(2, "y")), - sql("SELECT * FROM %s ORDER BY id", tableName)); - }); + withSQLConf( + ImmutableMap.of(SQLConf.ADAPTIVE_EXECUTION_ENABLED().key(), "false"), + () -> { + sql( + "UPDATE %s SET dep = 'y' WHERE " + + "id = (SELECT count(*) FROM (SELECT DISTINCT id FROM %s) AS t)", + tableName, tableName); + assertEquals( + "Should have expected rows", + ImmutableList.of(row(1, "hr"), row(2, "y")), + sql("SELECT * FROM %s ORDER BY id", tableName)); + }); sql("UPDATE %s SET id = (SELECT id - 2 FROM %s WHERE id = 1)", tableName, tableName); - assertEquals("Should have expected rows", + assertEquals( + "Should have expected rows", ImmutableList.of(row(-1, "hr"), row(-1, "y")), sql("SELECT * FROM %s ORDER BY id, dep", tableName)); } @@ -662,16 +731,21 @@ public void testUpdateWithSelfSubquery() { public void testUpdateWithMultiColumnInSubquery() { createAndInitTable("id INT, dep STRING"); - append(tableName, - "{ \"id\": 1, \"dep\": \"hr\" }\n" + - "{ \"id\": 2, \"dep\": \"hardware\" }\n" + - "{ \"id\": null, \"dep\": \"hr\" }"); + append( + tableName, + "{ \"id\": 1, \"dep\": \"hr\" }\n" + + "{ \"id\": 2, \"dep\": \"hardware\" }\n" + + "{ \"id\": null, \"dep\": \"hr\" }"); - List deletedEmployees = Arrays.asList(new Employee(null, "hr"), new Employee(1, "hr")); + List deletedEmployees = + Arrays.asList(new Employee(null, "hr"), new Employee(1, "hr")); createOrReplaceView("deleted_employee", deletedEmployees, Encoders.bean(Employee.class)); - sql("UPDATE %s SET dep = 'x', id = -1 WHERE (id, dep) IN (SELECT id, dep FROM deleted_employee)", tableName); - assertEquals("Should have expected rows", + sql( + "UPDATE %s SET dep = 'x', id = -1 WHERE (id, dep) IN (SELECT id, dep FROM deleted_employee)", + tableName); + assertEquals( + "Should have expected rows", ImmutableList.of(row(-1, "x"), row(2, "hardware"), row(null, "hr")), sql("SELECT * FROM %s ORDER BY id ASC NULLS LAST", tableName)); } @@ -680,27 +754,35 @@ public void testUpdateWithMultiColumnInSubquery() { public void testUpdateWithNotInSubquery() { createAndInitTable("id INT, dep STRING"); - append(tableName, - "{ \"id\": 1, \"dep\": \"hr\" }\n" + - "{ \"id\": 2, \"dep\": \"hardware\" }\n" + - "{ \"id\": null, \"dep\": \"hr\" }"); + append( + tableName, + "{ \"id\": 1, \"dep\": \"hr\" }\n" + + "{ \"id\": 2, \"dep\": \"hardware\" }\n" + + "{ \"id\": null, \"dep\": \"hr\" }"); createOrReplaceView("updated_id", Arrays.asList(-1, -2, null), Encoders.INT()); createOrReplaceView("updated_dep", Arrays.asList("software", "hr"), Encoders.STRING()); // the file filter subquery (nested loop lef-anti join) returns 0 records sql("UPDATE %s SET id = -1 WHERE id NOT IN (SELECT * FROM updated_id)", tableName); - assertEquals("Should have expected rows", + assertEquals( + "Should have expected rows", ImmutableList.of(row(1, "hr"), row(2, "hardware"), row(null, "hr")), sql("SELECT * FROM %s ORDER BY id ASC NULLS LAST", tableName)); - sql("UPDATE %s SET id = -1 WHERE id NOT IN (SELECT * FROM updated_id WHERE value IS NOT NULL)", tableName); - assertEquals("Should have expected rows", + sql( + "UPDATE %s SET id = -1 WHERE id NOT IN (SELECT * FROM updated_id WHERE value IS NOT NULL)", + tableName); + assertEquals( + "Should have expected rows", ImmutableList.of(row(-1, "hardware"), row(-1, "hr"), row(null, "hr")), sql("SELECT * FROM %s ORDER BY id ASC NULLS LAST, dep", tableName)); - 
sql("UPDATE %s SET id = 5 WHERE id NOT IN (SELECT * FROM updated_id) OR dep IN ('software', 'hr')", tableName); - assertEquals("Should have expected rows", + sql( + "UPDATE %s SET id = 5 WHERE id NOT IN (SELECT * FROM updated_id) OR dep IN ('software', 'hr')", + tableName); + assertEquals( + "Should have expected rows", ImmutableList.of(row(-1, "hardware"), row(5, "hr"), row(5, "hr")), sql("SELECT * FROM %s ORDER BY id ASC NULLS LAST, dep", tableName)); } @@ -709,36 +791,49 @@ public void testUpdateWithNotInSubquery() { public void testUpdateWithExistSubquery() { createAndInitTable("id INT, dep STRING"); - append(tableName, - "{ \"id\": 1, \"dep\": \"hr\" }\n" + - "{ \"id\": 2, \"dep\": \"hardware\" }\n" + - "{ \"id\": null, \"dep\": \"hr\" }"); + append( + tableName, + "{ \"id\": 1, \"dep\": \"hr\" }\n" + + "{ \"id\": 2, \"dep\": \"hardware\" }\n" + + "{ \"id\": null, \"dep\": \"hr\" }"); createOrReplaceView("updated_id", Arrays.asList(-1, -2, null), Encoders.INT()); createOrReplaceView("updated_dep", Arrays.asList("hr", null), Encoders.STRING()); - sql("UPDATE %s t SET id = -1 WHERE EXISTS (SELECT 1 FROM updated_id u WHERE t.id = u.value)", tableName); - assertEquals("Should have expected rows", + sql( + "UPDATE %s t SET id = -1 WHERE EXISTS (SELECT 1 FROM updated_id u WHERE t.id = u.value)", + tableName); + assertEquals( + "Should have expected rows", ImmutableList.of(row(1, "hr"), row(2, "hardware"), row(null, "hr")), sql("SELECT * FROM %s ORDER BY id ASC NULLS LAST", tableName)); - sql("UPDATE %s t SET dep = 'x', id = -1 WHERE " + - "EXISTS (SELECT 1 FROM updated_id u WHERE t.id = u.value + 2)", tableName); - assertEquals("Should have expected rows", + sql( + "UPDATE %s t SET dep = 'x', id = -1 WHERE " + + "EXISTS (SELECT 1 FROM updated_id u WHERE t.id = u.value + 2)", + tableName); + assertEquals( + "Should have expected rows", ImmutableList.of(row(-1, "x"), row(2, "hardware"), row(null, "hr")), sql("SELECT * FROM %s ORDER BY id ASC NULLS LAST", tableName)); - sql("UPDATE %s t SET id = -2 WHERE " + - "EXISTS (SELECT 1 FROM updated_id u WHERE t.id = u.value) OR " + - "t.id IS NULL", tableName); - assertEquals("Should have expected rows", + sql( + "UPDATE %s t SET id = -2 WHERE " + + "EXISTS (SELECT 1 FROM updated_id u WHERE t.id = u.value) OR " + + "t.id IS NULL", + tableName); + assertEquals( + "Should have expected rows", ImmutableList.of(row(-2, "hr"), row(-2, "x"), row(2, "hardware")), sql("SELECT * FROM %s ORDER BY id, dep", tableName)); - sql("UPDATE %s t SET id = 1 WHERE " + - "EXISTS (SELECT 1 FROM updated_id ui WHERE t.id = ui.value) AND " + - "EXISTS (SELECT 1 FROM updated_dep ud WHERE t.dep = ud.value)", tableName); - assertEquals("Should have expected rows", + sql( + "UPDATE %s t SET id = 1 WHERE " + + "EXISTS (SELECT 1 FROM updated_id ui WHERE t.id = ui.value) AND " + + "EXISTS (SELECT 1 FROM updated_dep ud WHERE t.dep = ud.value)", + tableName); + assertEquals( + "Should have expected rows", ImmutableList.of(row(-2, "x"), row(1, "hr"), row(2, "hardware")), sql("SELECT * FROM %s ORDER BY id, dep", tableName)); } @@ -747,30 +842,40 @@ public void testUpdateWithExistSubquery() { public void testUpdateWithNotExistsSubquery() { createAndInitTable("id INT, dep STRING"); - append(tableName, - "{ \"id\": 1, \"dep\": \"hr\" }\n" + - "{ \"id\": 2, \"dep\": \"hardware\" }\n" + - "{ \"id\": null, \"dep\": \"hr\" }"); + append( + tableName, + "{ \"id\": 1, \"dep\": \"hr\" }\n" + + "{ \"id\": 2, \"dep\": \"hardware\" }\n" + + "{ \"id\": null, \"dep\": \"hr\" }"); 
createOrReplaceView("updated_id", Arrays.asList(-1, -2, null), Encoders.INT()); createOrReplaceView("updated_dep", Arrays.asList("hr", "software"), Encoders.STRING()); - sql("UPDATE %s t SET id = -1 WHERE NOT EXISTS (SELECT 1 FROM updated_id u WHERE t.id = u.value + 2)", tableName); - assertEquals("Should have expected rows", + sql( + "UPDATE %s t SET id = -1 WHERE NOT EXISTS (SELECT 1 FROM updated_id u WHERE t.id = u.value + 2)", + tableName); + assertEquals( + "Should have expected rows", ImmutableList.of(row(-1, "hardware"), row(-1, "hr"), row(1, "hr")), sql("SELECT * FROM %s ORDER BY id, dep", tableName)); - sql("UPDATE %s t SET id = 5 WHERE " + - "NOT EXISTS (SELECT 1 FROM updated_id u WHERE t.id = u.value) OR " + - "t.id = 1", tableName); - assertEquals("Should have expected rows", + sql( + "UPDATE %s t SET id = 5 WHERE " + + "NOT EXISTS (SELECT 1 FROM updated_id u WHERE t.id = u.value) OR " + + "t.id = 1", + tableName); + assertEquals( + "Should have expected rows", ImmutableList.of(row(-1, "hardware"), row(-1, "hr"), row(5, "hr")), sql("SELECT * FROM %s ORDER BY id, dep", tableName)); - sql("UPDATE %s t SET id = 10 WHERE " + - "NOT EXISTS (SELECT 1 FROM updated_id ui WHERE t.id = ui.value) AND " + - "EXISTS (SELECT 1 FROM updated_dep ud WHERE t.dep = ud.value)", tableName); - assertEquals("Should have expected rows", + sql( + "UPDATE %s t SET id = 10 WHERE " + + "NOT EXISTS (SELECT 1 FROM updated_id ui WHERE t.id = ui.value) AND " + + "EXISTS (SELECT 1 FROM updated_dep ud WHERE t.dep = ud.value)", + tableName); + assertEquals( + "Should have expected rows", ImmutableList.of(row(-1, "hardware"), row(-1, "hr"), row(10, "hr")), sql("SELECT * FROM %s ORDER BY id, dep", tableName)); } @@ -779,20 +884,24 @@ public void testUpdateWithNotExistsSubquery() { public void testUpdateWithScalarSubquery() { createAndInitTable("id INT, dep STRING"); - append(tableName, - "{ \"id\": 1, \"dep\": \"hr\" }\n" + - "{ \"id\": 2, \"dep\": \"hardware\" }\n" + - "{ \"id\": null, \"dep\": \"hr\" }"); + append( + tableName, + "{ \"id\": 1, \"dep\": \"hr\" }\n" + + "{ \"id\": 2, \"dep\": \"hardware\" }\n" + + "{ \"id\": null, \"dep\": \"hr\" }"); createOrReplaceView("updated_id", Arrays.asList(1, 100, null), Encoders.INT()); // TODO: Spark does not support AQE and DPP with aggregates at the moment - withSQLConf(ImmutableMap.of(SQLConf.ADAPTIVE_EXECUTION_ENABLED().key(), "false"), () -> { - sql("UPDATE %s SET id = -1 WHERE id <= (SELECT min(value) FROM updated_id)", tableName); - assertEquals("Should have expected rows", - ImmutableList.of(row(-1, "hr"), row(2, "hardware"), row(null, "hr")), - sql("SELECT * FROM %s ORDER BY id ASC NULLS LAST", tableName)); - }); + withSQLConf( + ImmutableMap.of(SQLConf.ADAPTIVE_EXECUTION_ENABLED().key(), "false"), + () -> { + sql("UPDATE %s SET id = -1 WHERE id <= (SELECT min(value) FROM updated_id)", tableName); + assertEquals( + "Should have expected rows", + ImmutableList.of(row(-1, "hr"), row(2, "hardware"), row(null, "hr")), + sql("SELECT * FROM %s ORDER BY id ASC NULLS LAST", tableName)); + }); } @Test @@ -800,25 +909,29 @@ public void testUpdateThatRequiresGroupingBeforeWrite() { createAndInitTable("id INT, dep STRING"); sql("ALTER TABLE %s ADD PARTITION FIELD dep", tableName); - append(tableName, - "{ \"id\": 0, \"dep\": \"hr\" }\n" + - "{ \"id\": 1, \"dep\": \"hr\" }\n" + - "{ \"id\": 2, \"dep\": \"hr\" }"); - - append(tableName, - "{ \"id\": 0, \"dep\": \"ops\" }\n" + - "{ \"id\": 1, \"dep\": \"ops\" }\n" + - "{ \"id\": 2, \"dep\": \"ops\" }"); - - append(tableName, 
- "{ \"id\": 0, \"dep\": \"hr\" }\n" + - "{ \"id\": 1, \"dep\": \"hr\" }\n" + - "{ \"id\": 2, \"dep\": \"hr\" }"); - - append(tableName, - "{ \"id\": 0, \"dep\": \"ops\" }\n" + - "{ \"id\": 1, \"dep\": \"ops\" }\n" + - "{ \"id\": 2, \"dep\": \"ops\" }"); + append( + tableName, + "{ \"id\": 0, \"dep\": \"hr\" }\n" + + "{ \"id\": 1, \"dep\": \"hr\" }\n" + + "{ \"id\": 2, \"dep\": \"hr\" }"); + + append( + tableName, + "{ \"id\": 0, \"dep\": \"ops\" }\n" + + "{ \"id\": 1, \"dep\": \"ops\" }\n" + + "{ \"id\": 2, \"dep\": \"ops\" }"); + + append( + tableName, + "{ \"id\": 0, \"dep\": \"hr\" }\n" + + "{ \"id\": 1, \"dep\": \"hr\" }\n" + + "{ \"id\": 2, \"dep\": \"hr\" }"); + + append( + tableName, + "{ \"id\": 0, \"dep\": \"ops\" }\n" + + "{ \"id\": 1, \"dep\": \"ops\" }\n" + + "{ \"id\": 2, \"dep\": \"ops\" }"); createOrReplaceView("updated_id", Arrays.asList(1, 100), Encoders.INT()); @@ -838,18 +951,22 @@ public void testUpdateThatRequiresGroupingBeforeWrite() { public void testUpdateWithVectorization() { createAndInitTable("id INT, dep STRING"); - append(tableName, - "{ \"id\": 0, \"dep\": \"hr\" }\n" + - "{ \"id\": 1, \"dep\": \"hr\" }\n" + - "{ \"id\": 2, \"dep\": \"hr\" }"); + append( + tableName, + "{ \"id\": 0, \"dep\": \"hr\" }\n" + + "{ \"id\": 1, \"dep\": \"hr\" }\n" + + "{ \"id\": 2, \"dep\": \"hr\" }"); - withSQLConf(ImmutableMap.of(SparkSQLProperties.VECTORIZATION_ENABLED, "true"), () -> { - sql("UPDATE %s t SET id = -1", tableName); + withSQLConf( + ImmutableMap.of(SparkSQLProperties.VECTORIZATION_ENABLED, "true"), + () -> { + sql("UPDATE %s t SET id = -1", tableName); - assertEquals("Should have expected rows", - ImmutableList.of(row(-1, "hr"), row(-1, "hr"), row(-1, "hr")), - sql("SELECT * FROM %s ORDER BY id, dep", tableName)); - }); + assertEquals( + "Should have expected rows", + ImmutableList.of(row(-1, "hr"), row(-1, "hr"), row(-1, "hr")), + sql("SELECT * FROM %s ORDER BY id, dep", tableName)); + }); } @Test @@ -864,22 +981,28 @@ public void testUpdateModifyPartitionSourceField() throws NoSuchTableException { ids.add(id); } - Dataset df1 = spark.createDataset(ids, Encoders.INT()) - .withColumnRenamed("value", "id") - .withColumn("dep", lit("hr")) - .withColumn("country", lit("usa")); + Dataset df1 = + spark + .createDataset(ids, Encoders.INT()) + .withColumnRenamed("value", "id") + .withColumn("dep", lit("hr")) + .withColumn("country", lit("usa")); df1.coalesce(1).writeTo(tableName).append(); - Dataset df2 = spark.createDataset(ids, Encoders.INT()) - .withColumnRenamed("value", "id") - .withColumn("dep", lit("software")) - .withColumn("country", lit("usa")); + Dataset df2 = + spark + .createDataset(ids, Encoders.INT()) + .withColumnRenamed("value", "id") + .withColumn("dep", lit("software")) + .withColumn("country", lit("usa")); df2.coalesce(1).writeTo(tableName).append(); - Dataset df3 = spark.createDataset(ids, Encoders.INT()) - .withColumnRenamed("value", "id") - .withColumn("dep", lit("hardware")) - .withColumn("country", lit("usa")); + Dataset df3 = + spark + .createDataset(ids, Encoders.INT()) + .withColumnRenamed("value", "id") + .withColumn("dep", lit("hardware")) + .withColumn("country", lit("usa")); df3.coalesce(1).writeTo(tableName).append(); sql("UPDATE %s SET id = -1 WHERE id IN (10, 11, 12, 13, 14, 15, 16, 17, 18, 19)", tableName); @@ -909,21 +1032,27 @@ public void testUpdateWithStaticPredicatePushdown() { table.io().deleteFile(dataFile.path().toString()); // disable dynamic pruning and rely only on static predicate pushdown - 
withSQLConf(ImmutableMap.of(SQLConf.DYNAMIC_PARTITION_PRUNING_ENABLED().key(), "false"), () -> { - sql("UPDATE %s SET id = -1 WHERE dep IN ('software') AND id == 1", tableName); - }); + withSQLConf( + ImmutableMap.of(SQLConf.DYNAMIC_PARTITION_PRUNING_ENABLED().key(), "false"), + () -> { + sql("UPDATE %s SET id = -1 WHERE dep IN ('software') AND id == 1", tableName); + }); } @Test public void testUpdateWithInvalidUpdates() { createAndInitTable("id INT, a ARRAY>, m MAP"); - AssertHelpers.assertThrows("Should complain about updating an array column", - AnalysisException.class, "Updating nested fields is only supported for structs", + AssertHelpers.assertThrows( + "Should complain about updating an array column", + AnalysisException.class, + "Updating nested fields is only supported for structs", () -> sql("UPDATE %s SET a.c1 = 1", tableName)); - AssertHelpers.assertThrows("Should complain about updating a map column", - AnalysisException.class, "Updating nested fields is only supported for structs", + AssertHelpers.assertThrows( + "Should complain about updating a map column", + AnalysisException.class, + "Updating nested fields is only supported for structs", () -> sql("UPDATE %s SET m.key = 'new_key'", tableName)); } @@ -931,48 +1060,68 @@ public void testUpdateWithInvalidUpdates() { public void testUpdateWithConflictingAssignments() { createAndInitTable("id INT, c STRUCT>"); - AssertHelpers.assertThrows("Should complain about conflicting updates to a top-level column", - AnalysisException.class, "Updates are in conflict", + AssertHelpers.assertThrows( + "Should complain about conflicting updates to a top-level column", + AnalysisException.class, + "Updates are in conflict", () -> sql("UPDATE %s t SET t.id = 1, t.c.n1 = 2, t.id = 2", tableName)); - AssertHelpers.assertThrows("Should complain about conflicting updates to a nested column", - AnalysisException.class, "Updates are in conflict for these columns", + AssertHelpers.assertThrows( + "Should complain about conflicting updates to a nested column", + AnalysisException.class, + "Updates are in conflict for these columns", () -> sql("UPDATE %s t SET t.c.n1 = 1, t.id = 2, t.c.n1 = 2", tableName)); - AssertHelpers.assertThrows("Should complain about conflicting updates to a nested column", - AnalysisException.class, "Updates are in conflict", + AssertHelpers.assertThrows( + "Should complain about conflicting updates to a nested column", + AnalysisException.class, + "Updates are in conflict", () -> { - sql("UPDATE %s SET c.n1 = 1, c = named_struct('n1', 1, 'n2', named_struct('dn1', 1, 'dn2', 2))", tableName); + sql( + "UPDATE %s SET c.n1 = 1, c = named_struct('n1', 1, 'n2', named_struct('dn1', 1, 'dn2', 2))", + tableName); }); } @Test public void testUpdateWithInvalidAssignments() { - createAndInitTable("id INT NOT NULL, s STRUCT> NOT NULL"); - - for (String policy : new String[]{"ansi", "strict"}) { - withSQLConf(ImmutableMap.of("spark.sql.storeAssignmentPolicy", policy), () -> { - - AssertHelpers.assertThrows("Should complain about writing nulls to a top-level column", - AnalysisException.class, "Cannot write nullable values to non-null column", - () -> sql("UPDATE %s t SET t.id = NULL", tableName)); - - AssertHelpers.assertThrows("Should complain about writing nulls to a nested column", - AnalysisException.class, "Cannot write nullable values to non-null column", - () -> sql("UPDATE %s t SET t.s.n1 = NULL", tableName)); - - AssertHelpers.assertThrows("Should complain about writing missing fields in structs", - AnalysisException.class, 
"missing fields", - () -> sql("UPDATE %s t SET t.s = named_struct('n1', 1)", tableName)); - - AssertHelpers.assertThrows("Should complain about writing invalid data types", - AnalysisException.class, "Cannot safely cast", - () -> sql("UPDATE %s t SET t.s.n1 = 'str'", tableName)); - - AssertHelpers.assertThrows("Should complain about writing incompatible structs", - AnalysisException.class, "field name does not match", - () -> sql("UPDATE %s t SET t.s.n2 = named_struct('dn2', 1, 'dn1', 2)", tableName)); - }); + createAndInitTable( + "id INT NOT NULL, s STRUCT> NOT NULL"); + + for (String policy : new String[] {"ansi", "strict"}) { + withSQLConf( + ImmutableMap.of("spark.sql.storeAssignmentPolicy", policy), + () -> { + AssertHelpers.assertThrows( + "Should complain about writing nulls to a top-level column", + AnalysisException.class, + "Cannot write nullable values to non-null column", + () -> sql("UPDATE %s t SET t.id = NULL", tableName)); + + AssertHelpers.assertThrows( + "Should complain about writing nulls to a nested column", + AnalysisException.class, + "Cannot write nullable values to non-null column", + () -> sql("UPDATE %s t SET t.s.n1 = NULL", tableName)); + + AssertHelpers.assertThrows( + "Should complain about writing missing fields in structs", + AnalysisException.class, + "missing fields", + () -> sql("UPDATE %s t SET t.s = named_struct('n1', 1)", tableName)); + + AssertHelpers.assertThrows( + "Should complain about writing invalid data types", + AnalysisException.class, + "Cannot safely cast", + () -> sql("UPDATE %s t SET t.s.n1 = 'str'", tableName)); + + AssertHelpers.assertThrows( + "Should complain about writing incompatible structs", + AnalysisException.class, + "field name does not match", + () -> sql("UPDATE %s t SET t.s.n2 = named_struct('dn2', 1, 'dn1', 2)", tableName)); + }); } } @@ -980,8 +1129,10 @@ public void testUpdateWithInvalidAssignments() { public void testUpdateWithNonDeterministicCondition() { createAndInitTable("id INT, dep STRING"); - AssertHelpers.assertThrows("Should complain about non-deterministic expressions", - AnalysisException.class, "nondeterministic expressions are only allowed", + AssertHelpers.assertThrows( + "Should complain about non-deterministic expressions", + AnalysisException.class, + "nondeterministic expressions are only allowed", () -> sql("UPDATE %s SET id = -1 WHERE id = 1 AND rand() > 0.5", tableName)); } @@ -989,8 +1140,10 @@ public void testUpdateWithNonDeterministicCondition() { public void testUpdateOnNonIcebergTableNotSupported() { createOrReplaceView("testtable", "{ \"c1\": -100, \"c2\": -200 }"); - AssertHelpers.assertThrows("UPDATE is not supported for non iceberg table", - UnsupportedOperationException.class, "not supported temporarily", + AssertHelpers.assertThrows( + "UPDATE is not supported for non iceberg table", + UnsupportedOperationException.class, + "not supported temporarily", () -> sql("UPDATE %s SET c1 = -1 WHERE c2 = 1", "testtable")); } diff --git a/spark/v3.3/spark/src/jmh/java/org/apache/iceberg/spark/SparkBenchmarkUtil.java b/spark/v3.3/spark/src/jmh/java/org/apache/iceberg/spark/SparkBenchmarkUtil.java index 17df7d2cf9d7..d6b0e9c94258 100644 --- a/spark/v3.3/spark/src/jmh/java/org/apache/iceberg/spark/SparkBenchmarkUtil.java +++ b/spark/v3.3/spark/src/jmh/java/org/apache/iceberg/spark/SparkBenchmarkUtil.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.spark; import java.util.List; @@ -32,13 +31,13 @@ public class SparkBenchmarkUtil { - private SparkBenchmarkUtil() { - } + private SparkBenchmarkUtil() {} public static UnsafeProjection projection(Schema expectedSchema, Schema actualSchema) { StructType struct = SparkSchemaUtil.convert(actualSchema); - List refs = JavaConverters.seqAsJavaListConverter(struct.toAttributes()).asJava(); + List refs = + JavaConverters.seqAsJavaListConverter(struct.toAttributes()).asJava(); List attrs = Lists.newArrayListWithExpectedSize(struct.fields().length); List exprs = Lists.newArrayListWithExpectedSize(struct.fields().length); diff --git a/spark/v3.3/spark/src/jmh/java/org/apache/iceberg/spark/action/IcebergSortCompactionBenchmark.java b/spark/v3.3/spark/src/jmh/java/org/apache/iceberg/spark/action/IcebergSortCompactionBenchmark.java index 8c205037f56e..eaef8e0bccaa 100644 --- a/spark/v3.3/spark/src/jmh/java/org/apache/iceberg/spark/action/IcebergSortCompactionBenchmark.java +++ b/spark/v3.3/spark/src/jmh/java/org/apache/iceberg/spark/action/IcebergSortCompactionBenchmark.java @@ -16,10 +16,15 @@ * specific language governing permissions and limitations * under the License. */ - - package org.apache.iceberg.spark.action; +import static org.apache.iceberg.types.Types.NestedField.optional; +import static org.apache.iceberg.types.Types.NestedField.required; +import static org.apache.spark.sql.functions.col; +import static org.apache.spark.sql.functions.current_date; +import static org.apache.spark.sql.functions.date_add; +import static org.apache.spark.sql.functions.expr; + import java.io.IOException; import java.util.Collections; import java.util.UUID; @@ -57,13 +62,6 @@ import org.openjdk.jmh.annotations.Threads; import org.openjdk.jmh.annotations.Timeout; -import static org.apache.iceberg.types.Types.NestedField.optional; -import static org.apache.iceberg.types.Types.NestedField.required; -import static org.apache.spark.sql.functions.col; -import static org.apache.spark.sql.functions.current_date; -import static org.apache.spark.sql.functions.date_add; -import static org.apache.spark.sql.functions.expr; - @Fork(1) @State(Scope.Benchmark) @Measurement(iterations = 10) @@ -108,10 +106,10 @@ public void sortInt() { SparkActions.get() .rewriteDataFiles(table()) .option(BinPackStrategy.REWRITE_ALL, "true") - .sort(SortOrder - .builderFor(table().schema()) - .sortBy("intCol", SortDirection.ASC, NullOrder.NULLS_FIRST) - .build()) + .sort( + SortOrder.builderFor(table().schema()) + .sortBy("intCol", SortDirection.ASC, NullOrder.NULLS_FIRST) + .build()) .execute(); } @@ -121,11 +119,11 @@ public void sortInt2() { SparkActions.get() .rewriteDataFiles(table()) .option(BinPackStrategy.REWRITE_ALL, "true") - .sort(SortOrder - .builderFor(table().schema()) - .sortBy("intCol", SortDirection.ASC, NullOrder.NULLS_FIRST) - .sortBy("intCol2", SortDirection.ASC, NullOrder.NULLS_FIRST) - .build()) + .sort( + SortOrder.builderFor(table().schema()) + .sortBy("intCol", SortDirection.ASC, NullOrder.NULLS_FIRST) + .sortBy("intCol2", SortDirection.ASC, NullOrder.NULLS_FIRST) + .build()) .execute(); } @@ -135,13 +133,13 @@ public void sortInt3() { SparkActions.get() .rewriteDataFiles(table()) .option(BinPackStrategy.REWRITE_ALL, "true") - .sort(SortOrder - .builderFor(table().schema()) - .sortBy("intCol", SortDirection.ASC, NullOrder.NULLS_FIRST) - .sortBy("intCol2", SortDirection.ASC, NullOrder.NULLS_FIRST) - .sortBy("intCol3", SortDirection.ASC, NullOrder.NULLS_FIRST) - .sortBy("intCol4", 
SortDirection.ASC, NullOrder.NULLS_FIRST) - .build()) + .sort( + SortOrder.builderFor(table().schema()) + .sortBy("intCol", SortDirection.ASC, NullOrder.NULLS_FIRST) + .sortBy("intCol2", SortDirection.ASC, NullOrder.NULLS_FIRST) + .sortBy("intCol3", SortDirection.ASC, NullOrder.NULLS_FIRST) + .sortBy("intCol4", SortDirection.ASC, NullOrder.NULLS_FIRST) + .build()) .execute(); } @@ -151,13 +149,13 @@ public void sortInt4() { SparkActions.get() .rewriteDataFiles(table()) .option(BinPackStrategy.REWRITE_ALL, "true") - .sort(SortOrder - .builderFor(table().schema()) - .sortBy("intCol", SortDirection.ASC, NullOrder.NULLS_FIRST) - .sortBy("intCol2", SortDirection.ASC, NullOrder.NULLS_FIRST) - .sortBy("intCol3", SortDirection.ASC, NullOrder.NULLS_FIRST) - .sortBy("intCol4", SortDirection.ASC, NullOrder.NULLS_FIRST) - .build()) + .sort( + SortOrder.builderFor(table().schema()) + .sortBy("intCol", SortDirection.ASC, NullOrder.NULLS_FIRST) + .sortBy("intCol2", SortDirection.ASC, NullOrder.NULLS_FIRST) + .sortBy("intCol3", SortDirection.ASC, NullOrder.NULLS_FIRST) + .sortBy("intCol4", SortDirection.ASC, NullOrder.NULLS_FIRST) + .build()) .execute(); } @@ -167,10 +165,10 @@ public void sortString() { SparkActions.get() .rewriteDataFiles(table()) .option(BinPackStrategy.REWRITE_ALL, "true") - .sort(SortOrder - .builderFor(table().schema()) - .sortBy("stringCol", SortDirection.ASC, NullOrder.NULLS_FIRST) - .build()) + .sort( + SortOrder.builderFor(table().schema()) + .sortBy("stringCol", SortDirection.ASC, NullOrder.NULLS_FIRST) + .build()) .execute(); } @@ -180,13 +178,13 @@ public void sortFourColumns() { SparkActions.get() .rewriteDataFiles(table()) .option(BinPackStrategy.REWRITE_ALL, "true") - .sort(SortOrder - .builderFor(table().schema()) - .sortBy("stringCol", SortDirection.ASC, NullOrder.NULLS_FIRST) - .sortBy("intCol", SortDirection.ASC, NullOrder.NULLS_FIRST) - .sortBy("dateCol", SortDirection.DESC, NullOrder.NULLS_FIRST) - .sortBy("doubleCol", SortDirection.DESC, NullOrder.NULLS_FIRST) - .build()) + .sort( + SortOrder.builderFor(table().schema()) + .sortBy("stringCol", SortDirection.ASC, NullOrder.NULLS_FIRST) + .sortBy("intCol", SortDirection.ASC, NullOrder.NULLS_FIRST) + .sortBy("dateCol", SortDirection.DESC, NullOrder.NULLS_FIRST) + .sortBy("doubleCol", SortDirection.DESC, NullOrder.NULLS_FIRST) + .build()) .execute(); } @@ -196,15 +194,15 @@ public void sortSixColumns() { SparkActions.get() .rewriteDataFiles(table()) .option(BinPackStrategy.REWRITE_ALL, "true") - .sort(SortOrder - .builderFor(table().schema()) - .sortBy("stringCol", SortDirection.ASC, NullOrder.NULLS_FIRST) - .sortBy("intCol", SortDirection.ASC, NullOrder.NULLS_FIRST) - .sortBy("dateCol", SortDirection.DESC, NullOrder.NULLS_FIRST) - .sortBy("timestampCol", SortDirection.DESC, NullOrder.NULLS_FIRST) - .sortBy("doubleCol", SortDirection.DESC, NullOrder.NULLS_FIRST) - .sortBy("longCol", SortDirection.DESC, NullOrder.NULLS_FIRST) - .build()) + .sort( + SortOrder.builderFor(table().schema()) + .sortBy("stringCol", SortDirection.ASC, NullOrder.NULLS_FIRST) + .sortBy("intCol", SortDirection.ASC, NullOrder.NULLS_FIRST) + .sortBy("dateCol", SortDirection.DESC, NullOrder.NULLS_FIRST) + .sortBy("timestampCol", SortDirection.DESC, NullOrder.NULLS_FIRST) + .sortBy("doubleCol", SortDirection.DESC, NullOrder.NULLS_FIRST) + .sortBy("longCol", SortDirection.DESC, NullOrder.NULLS_FIRST) + .build()) .execute(); } @@ -283,54 +281,76 @@ protected Configuration initHadoopConf() { } protected final void initTable() { - Schema schema = new 
Schema( - required(1, "longCol", Types.LongType.get()), - required(2, "intCol", Types.IntegerType.get()), - required(3, "intCol2", Types.IntegerType.get()), - required(4, "intCol3", Types.IntegerType.get()), - required(5, "intCol4", Types.IntegerType.get()), - required(6, "floatCol", Types.FloatType.get()), - optional(7, "doubleCol", Types.DoubleType.get()), - optional(8, "dateCol", Types.DateType.get()), - optional(9, "timestampCol", Types.TimestampType.withZone()), - optional(10, "stringCol", Types.StringType.get())); + Schema schema = + new Schema( + required(1, "longCol", Types.LongType.get()), + required(2, "intCol", Types.IntegerType.get()), + required(3, "intCol2", Types.IntegerType.get()), + required(4, "intCol3", Types.IntegerType.get()), + required(5, "intCol4", Types.IntegerType.get()), + required(6, "floatCol", Types.FloatType.get()), + optional(7, "doubleCol", Types.DoubleType.get()), + optional(8, "dateCol", Types.DateType.get()), + optional(9, "timestampCol", Types.TimestampType.withZone()), + optional(10, "stringCol", Types.StringType.get())); SparkSessionCatalog catalog; try { - catalog = (SparkSessionCatalog) - Spark3Util.catalogAndIdentifier(spark(), "spark_catalog").catalog(); + catalog = + (SparkSessionCatalog) Spark3Util.catalogAndIdentifier(spark(), "spark_catalog").catalog(); catalog.dropTable(IDENT); - catalog.createTable(IDENT, SparkSchemaUtil.convert(schema), new Transform[0], Collections.emptyMap()); + catalog.createTable( + IDENT, SparkSchemaUtil.convert(schema), new Transform[0], Collections.emptyMap()); } catch (Exception e) { throw new RuntimeException(e); } } private void appendData() { - Dataset df = spark().range(0, NUM_ROWS * NUM_FILES, 1, NUM_FILES) - .drop("id") - .withColumn("longCol", new RandomGeneratingUDF(UNIQUE_VALUES).randomLongUDF().apply()) - .withColumn( - "intCol", - new RandomGeneratingUDF(UNIQUE_VALUES).randomLongUDF().apply().cast(DataTypes.IntegerType)) - .withColumn( - "intCol2", - new RandomGeneratingUDF(UNIQUE_VALUES).randomLongUDF().apply().cast(DataTypes.IntegerType)) - .withColumn( - "intCol3", - new RandomGeneratingUDF(UNIQUE_VALUES).randomLongUDF().apply().cast(DataTypes.IntegerType)) - .withColumn( - "intCol4", - new RandomGeneratingUDF(UNIQUE_VALUES).randomLongUDF().apply().cast(DataTypes.IntegerType)) - .withColumn( - "floatCol", - new RandomGeneratingUDF(UNIQUE_VALUES).randomLongUDF().apply().cast(DataTypes.FloatType)) - .withColumn( - "doubleCol", - new RandomGeneratingUDF(UNIQUE_VALUES).randomLongUDF().apply().cast(DataTypes.DoubleType)) - .withColumn("dateCol", date_add(current_date(), col("intCol").mod(NUM_FILES))) - .withColumn("timestampCol", expr("TO_TIMESTAMP(dateCol)")) - .withColumn("stringCol", new RandomGeneratingUDF(UNIQUE_VALUES).randomString().apply()); + Dataset df = + spark() + .range(0, NUM_ROWS * NUM_FILES, 1, NUM_FILES) + .drop("id") + .withColumn("longCol", new RandomGeneratingUDF(UNIQUE_VALUES).randomLongUDF().apply()) + .withColumn( + "intCol", + new RandomGeneratingUDF(UNIQUE_VALUES) + .randomLongUDF() + .apply() + .cast(DataTypes.IntegerType)) + .withColumn( + "intCol2", + new RandomGeneratingUDF(UNIQUE_VALUES) + .randomLongUDF() + .apply() + .cast(DataTypes.IntegerType)) + .withColumn( + "intCol3", + new RandomGeneratingUDF(UNIQUE_VALUES) + .randomLongUDF() + .apply() + .cast(DataTypes.IntegerType)) + .withColumn( + "intCol4", + new RandomGeneratingUDF(UNIQUE_VALUES) + .randomLongUDF() + .apply() + .cast(DataTypes.IntegerType)) + .withColumn( + "floatCol", + new RandomGeneratingUDF(UNIQUE_VALUES) + 
.randomLongUDF() + .apply() + .cast(DataTypes.FloatType)) + .withColumn( + "doubleCol", + new RandomGeneratingUDF(UNIQUE_VALUES) + .randomLongUDF() + .apply() + .cast(DataTypes.DoubleType)) + .withColumn("dateCol", date_add(current_date(), col("intCol").mod(NUM_FILES))) + .withColumn("timestampCol", expr("TO_TIMESTAMP(dateCol)")) + .withColumn("stringCol", new RandomGeneratingUDF(UNIQUE_VALUES).randomString().apply()); writeData(df); } @@ -362,7 +382,8 @@ protected void cleanupFiles() throws IOException { protected void setupSpark() { SparkSession.Builder builder = SparkSession.builder() - .config("spark.sql.catalog.spark_catalog", "org.apache.iceberg.spark.SparkSessionCatalog") + .config( + "spark.sql.catalog.spark_catalog", "org.apache.iceberg.spark.SparkSessionCatalog") .config("spark.sql.catalog.spark_catalog.type", "hadoop") .config("spark.sql.catalog.spark_catalog.warehouse", getCatalogWarehouse()) .master("local[*]"); diff --git a/spark/v3.3/spark/src/jmh/java/org/apache/iceberg/spark/action/RandomGeneratingUDF.java b/spark/v3.3/spark/src/jmh/java/org/apache/iceberg/spark/action/RandomGeneratingUDF.java index cfbd9d4fb3f6..63d24f7da553 100644 --- a/spark/v3.3/spark/src/jmh/java/org/apache/iceberg/spark/action/RandomGeneratingUDF.java +++ b/spark/v3.3/spark/src/jmh/java/org/apache/iceberg/spark/action/RandomGeneratingUDF.java @@ -16,9 +16,10 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.action; +import static org.apache.spark.sql.functions.udf; + import java.io.Serializable; import java.util.Random; import org.apache.iceberg.types.Types; @@ -26,8 +27,6 @@ import org.apache.spark.sql.expressions.UserDefinedFunction; import org.apache.spark.sql.types.DataTypes; -import static org.apache.spark.sql.functions.udf; - class RandomGeneratingUDF implements Serializable { private final long uniqueValues; private Random rand = new Random(); @@ -37,11 +36,16 @@ class RandomGeneratingUDF implements Serializable { } UserDefinedFunction randomLongUDF() { - return udf(() -> rand.nextLong() % (uniqueValues / 2), DataTypes.LongType).asNondeterministic().asNonNullable(); + return udf(() -> rand.nextLong() % (uniqueValues / 2), DataTypes.LongType) + .asNondeterministic() + .asNonNullable(); } UserDefinedFunction randomString() { - return udf(() -> (String) RandomUtil.generatePrimitive(Types.StringType.get(), rand), DataTypes.StringType) - .asNondeterministic().asNonNullable(); + return udf( + () -> (String) RandomUtil.generatePrimitive(Types.StringType.get(), rand), + DataTypes.StringType) + .asNondeterministic() + .asNonNullable(); } } diff --git a/spark/v3.3/spark/src/jmh/java/org/apache/iceberg/spark/data/parquet/SparkParquetReadersFlatDataBenchmark.java b/spark/v3.3/spark/src/jmh/java/org/apache/iceberg/spark/data/parquet/SparkParquetReadersFlatDataBenchmark.java index 3c58f5278ca3..846192a5e652 100644 --- a/spark/v3.3/spark/src/jmh/java/org/apache/iceberg/spark/data/parquet/SparkParquetReadersFlatDataBenchmark.java +++ b/spark/v3.3/spark/src/jmh/java/org/apache/iceberg/spark/data/parquet/SparkParquetReadersFlatDataBenchmark.java @@ -16,9 +16,11 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.spark.data.parquet; +import static org.apache.iceberg.types.Types.NestedField.optional; +import static org.apache.iceberg.types.Types.NestedField.required; + import java.io.File; import java.io.IOException; import java.util.List; @@ -52,15 +54,11 @@ import org.openjdk.jmh.annotations.Warmup; import org.openjdk.jmh.infra.Blackhole; -import static org.apache.iceberg.types.Types.NestedField.optional; -import static org.apache.iceberg.types.Types.NestedField.required; - /** * A benchmark that evaluates the performance of reading Parquet data with a flat schema using * Iceberg and Spark Parquet readers. * - * To run this benchmark for spark-3.3: - * + *
    To run this benchmark for spark-3.3: * ./gradlew -DsparkVersions=3.3 :iceberg-spark:iceberg-spark-3.3_2.12:jmh * -PjmhIncludeRegex=SparkParquetReadersFlatDataBenchmark * -PjmhOutputPath=benchmark/spark-parquet-readers-flat-data-benchmark-result.txt @@ -73,22 +71,23 @@ @BenchmarkMode(Mode.SingleShotTime) public class SparkParquetReadersFlatDataBenchmark { - private static final DynMethods.UnboundMethod APPLY_PROJECTION = DynMethods.builder("apply") - .impl(UnsafeProjection.class, InternalRow.class) - .build(); - private static final Schema SCHEMA = new Schema( - required(1, "longCol", Types.LongType.get()), - required(2, "intCol", Types.IntegerType.get()), - required(3, "floatCol", Types.FloatType.get()), - optional(4, "doubleCol", Types.DoubleType.get()), - optional(5, "decimalCol", Types.DecimalType.of(20, 5)), - optional(6, "dateCol", Types.DateType.get()), - optional(7, "timestampCol", Types.TimestampType.withZone()), - optional(8, "stringCol", Types.StringType.get())); - private static final Schema PROJECTED_SCHEMA = new Schema( - required(1, "longCol", Types.LongType.get()), - optional(5, "decimalCol", Types.DecimalType.of(20, 5)), - optional(8, "stringCol", Types.StringType.get())); + private static final DynMethods.UnboundMethod APPLY_PROJECTION = + DynMethods.builder("apply").impl(UnsafeProjection.class, InternalRow.class).build(); + private static final Schema SCHEMA = + new Schema( + required(1, "longCol", Types.LongType.get()), + required(2, "intCol", Types.IntegerType.get()), + required(3, "floatCol", Types.FloatType.get()), + optional(4, "doubleCol", Types.DoubleType.get()), + optional(5, "decimalCol", Types.DecimalType.of(20, 5)), + optional(6, "dateCol", Types.DateType.get()), + optional(7, "timestampCol", Types.TimestampType.withZone()), + optional(8, "stringCol", Types.StringType.get())); + private static final Schema PROJECTED_SCHEMA = + new Schema( + required(1, "longCol", Types.LongType.get()), + optional(5, "decimalCol", Types.DecimalType.of(20, 5)), + optional(8, "stringCol", Types.StringType.get())); private static final int NUM_RECORDS = 10000000; private File dataFile; @@ -97,10 +96,8 @@ public void setupBenchmark() throws IOException { dataFile = File.createTempFile("parquet-flat-data-benchmark", ".parquet"); dataFile.delete(); List records = RandomData.generateList(SCHEMA, NUM_RECORDS, 0L); - try (FileAppender writer = Parquet.write(Files.localOutput(dataFile)) - .schema(SCHEMA) - .named("benchmark") - .build()) { + try (FileAppender writer = + Parquet.write(Files.localOutput(dataFile)).schema(SCHEMA).named("benchmark").build()) { writer.addAll(records); } } @@ -115,10 +112,11 @@ public void tearDownBenchmark() { @Benchmark @Threads(1) public void readUsingIcebergReader(Blackhole blackHole) throws IOException { - try (CloseableIterable rows = Parquet.read(Files.localInput(dataFile)) - .project(SCHEMA) - .createReaderFunc(type -> SparkParquetReaders.buildReader(SCHEMA, type)) - .build()) { + try (CloseableIterable rows = + Parquet.read(Files.localInput(dataFile)) + .project(SCHEMA) + .createReaderFunc(type -> SparkParquetReaders.buildReader(SCHEMA, type)) + .build()) { for (InternalRow row : rows) { blackHole.consume(row); @@ -129,14 +127,15 @@ public void readUsingIcebergReader(Blackhole blackHole) throws IOException { @Benchmark @Threads(1) public void readUsingIcebergReaderUnsafe(Blackhole blackhole) throws IOException { - try (CloseableIterable rows = Parquet.read(Files.localInput(dataFile)) - .project(SCHEMA) - .createReaderFunc(type -> 
SparkParquetReaders.buildReader(SCHEMA, type)) - .build()) { + try (CloseableIterable rows = + Parquet.read(Files.localInput(dataFile)) + .project(SCHEMA) + .createReaderFunc(type -> SparkParquetReaders.buildReader(SCHEMA, type)) + .build()) { - Iterable unsafeRows = Iterables.transform( - rows, - APPLY_PROJECTION.bind(SparkBenchmarkUtil.projection(SCHEMA, SCHEMA))::invoke); + Iterable unsafeRows = + Iterables.transform( + rows, APPLY_PROJECTION.bind(SparkBenchmarkUtil.projection(SCHEMA, SCHEMA))::invoke); for (InternalRow row : unsafeRows) { blackhole.consume(row); @@ -148,14 +147,15 @@ public void readUsingIcebergReaderUnsafe(Blackhole blackhole) throws IOException @Threads(1) public void readUsingSparkReader(Blackhole blackhole) throws IOException { StructType sparkSchema = SparkSchemaUtil.convert(SCHEMA); - try (CloseableIterable rows = Parquet.read(Files.localInput(dataFile)) - .project(SCHEMA) - .readSupport(new ParquetReadSupport()) - .set("org.apache.spark.sql.parquet.row.requested_schema", sparkSchema.json()) - .set("spark.sql.parquet.binaryAsString", "false") - .set("spark.sql.parquet.int96AsTimestamp", "false") - .callInit() - .build()) { + try (CloseableIterable rows = + Parquet.read(Files.localInput(dataFile)) + .project(SCHEMA) + .readSupport(new ParquetReadSupport()) + .set("org.apache.spark.sql.parquet.row.requested_schema", sparkSchema.json()) + .set("spark.sql.parquet.binaryAsString", "false") + .set("spark.sql.parquet.int96AsTimestamp", "false") + .callInit() + .build()) { for (InternalRow row : rows) { blackhole.consume(row); @@ -166,10 +166,11 @@ public void readUsingSparkReader(Blackhole blackhole) throws IOException { @Benchmark @Threads(1) public void readWithProjectionUsingIcebergReader(Blackhole blackhole) throws IOException { - try (CloseableIterable rows = Parquet.read(Files.localInput(dataFile)) - .project(PROJECTED_SCHEMA) - .createReaderFunc(type -> SparkParquetReaders.buildReader(PROJECTED_SCHEMA, type)) - .build()) { + try (CloseableIterable rows = + Parquet.read(Files.localInput(dataFile)) + .project(PROJECTED_SCHEMA) + .createReaderFunc(type -> SparkParquetReaders.buildReader(PROJECTED_SCHEMA, type)) + .build()) { for (InternalRow row : rows) { blackhole.consume(row); @@ -180,14 +181,18 @@ public void readWithProjectionUsingIcebergReader(Blackhole blackhole) throws IOE @Benchmark @Threads(1) public void readWithProjectionUsingIcebergReaderUnsafe(Blackhole blackhole) throws IOException { - try (CloseableIterable rows = Parquet.read(Files.localInput(dataFile)) - .project(PROJECTED_SCHEMA) - .createReaderFunc(type -> SparkParquetReaders.buildReader(PROJECTED_SCHEMA, type)) - .build()) { - - Iterable unsafeRows = Iterables.transform( - rows, - APPLY_PROJECTION.bind(SparkBenchmarkUtil.projection(PROJECTED_SCHEMA, PROJECTED_SCHEMA))::invoke); + try (CloseableIterable rows = + Parquet.read(Files.localInput(dataFile)) + .project(PROJECTED_SCHEMA) + .createReaderFunc(type -> SparkParquetReaders.buildReader(PROJECTED_SCHEMA, type)) + .build()) { + + Iterable unsafeRows = + Iterables.transform( + rows, + APPLY_PROJECTION.bind( + SparkBenchmarkUtil.projection(PROJECTED_SCHEMA, PROJECTED_SCHEMA)) + ::invoke); for (InternalRow row : unsafeRows) { blackhole.consume(row); @@ -199,14 +204,15 @@ public void readWithProjectionUsingIcebergReaderUnsafe(Blackhole blackhole) thro @Threads(1) public void readWithProjectionUsingSparkReader(Blackhole blackhole) throws IOException { StructType sparkSchema = SparkSchemaUtil.convert(PROJECTED_SCHEMA); - try (CloseableIterable rows = 
Parquet.read(Files.localInput(dataFile)) - .project(PROJECTED_SCHEMA) - .readSupport(new ParquetReadSupport()) - .set("org.apache.spark.sql.parquet.row.requested_schema", sparkSchema.json()) - .set("spark.sql.parquet.binaryAsString", "false") - .set("spark.sql.parquet.int96AsTimestamp", "false") - .callInit() - .build()) { + try (CloseableIterable rows = + Parquet.read(Files.localInput(dataFile)) + .project(PROJECTED_SCHEMA) + .readSupport(new ParquetReadSupport()) + .set("org.apache.spark.sql.parquet.row.requested_schema", sparkSchema.json()) + .set("spark.sql.parquet.binaryAsString", "false") + .set("spark.sql.parquet.int96AsTimestamp", "false") + .callInit() + .build()) { for (InternalRow row : rows) { blackhole.consume(row); diff --git a/spark/v3.3/spark/src/jmh/java/org/apache/iceberg/spark/data/parquet/SparkParquetReadersNestedDataBenchmark.java b/spark/v3.3/spark/src/jmh/java/org/apache/iceberg/spark/data/parquet/SparkParquetReadersNestedDataBenchmark.java index e894f50ebfbc..3df285c09eba 100644 --- a/spark/v3.3/spark/src/jmh/java/org/apache/iceberg/spark/data/parquet/SparkParquetReadersNestedDataBenchmark.java +++ b/spark/v3.3/spark/src/jmh/java/org/apache/iceberg/spark/data/parquet/SparkParquetReadersNestedDataBenchmark.java @@ -16,9 +16,11 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.data.parquet; +import static org.apache.iceberg.types.Types.NestedField.optional; +import static org.apache.iceberg.types.Types.NestedField.required; + import java.io.File; import java.io.IOException; import java.util.List; @@ -52,15 +54,11 @@ import org.openjdk.jmh.annotations.Warmup; import org.openjdk.jmh.infra.Blackhole; -import static org.apache.iceberg.types.Types.NestedField.optional; -import static org.apache.iceberg.types.Types.NestedField.required; - /** - * A benchmark that evaluates the performance of reading nested Parquet data using - * Iceberg and Spark Parquet readers. + * A benchmark that evaluates the performance of reading nested Parquet data using Iceberg and Spark + * Parquet readers. * - * To run this benchmark for spark-3.3: - * + *
    To run this benchmark for spark-3.3: * ./gradlew -DsparkVersions=3.3 :iceberg-spark:iceberg-spark-3.3_2.12:jmh * -PjmhIncludeRegex=SparkParquetReadersNestedDataBenchmark * -PjmhOutputPath=benchmark/spark-parquet-readers-nested-data-benchmark-result.txt @@ -73,22 +71,21 @@ @BenchmarkMode(Mode.SingleShotTime) public class SparkParquetReadersNestedDataBenchmark { - private static final DynMethods.UnboundMethod APPLY_PROJECTION = DynMethods.builder("apply") - .impl(UnsafeProjection.class, InternalRow.class) - .build(); - private static final Schema SCHEMA = new Schema( - required(0, "id", Types.LongType.get()), - optional(4, "nested", Types.StructType.of( - required(1, "col1", Types.StringType.get()), - required(2, "col2", Types.DoubleType.get()), - required(3, "col3", Types.LongType.get()) - )) - ); - private static final Schema PROJECTED_SCHEMA = new Schema( - optional(4, "nested", Types.StructType.of( - required(1, "col1", Types.StringType.get()) - )) - ); + private static final DynMethods.UnboundMethod APPLY_PROJECTION = + DynMethods.builder("apply").impl(UnsafeProjection.class, InternalRow.class).build(); + private static final Schema SCHEMA = + new Schema( + required(0, "id", Types.LongType.get()), + optional( + 4, + "nested", + Types.StructType.of( + required(1, "col1", Types.StringType.get()), + required(2, "col2", Types.DoubleType.get()), + required(3, "col3", Types.LongType.get())))); + private static final Schema PROJECTED_SCHEMA = + new Schema( + optional(4, "nested", Types.StructType.of(required(1, "col1", Types.StringType.get())))); private static final int NUM_RECORDS = 10000000; private File dataFile; @@ -97,10 +94,8 @@ public void setupBenchmark() throws IOException { dataFile = File.createTempFile("parquet-nested-data-benchmark", ".parquet"); dataFile.delete(); List records = RandomData.generateList(SCHEMA, NUM_RECORDS, 0L); - try (FileAppender writer = Parquet.write(Files.localOutput(dataFile)) - .schema(SCHEMA) - .named("benchmark") - .build()) { + try (FileAppender writer = + Parquet.write(Files.localOutput(dataFile)).schema(SCHEMA).named("benchmark").build()) { writer.addAll(records); } } @@ -115,10 +110,11 @@ public void tearDownBenchmark() { @Benchmark @Threads(1) public void readUsingIcebergReader(Blackhole blackhole) throws IOException { - try (CloseableIterable rows = Parquet.read(Files.localInput(dataFile)) - .project(SCHEMA) - .createReaderFunc(type -> SparkParquetReaders.buildReader(SCHEMA, type)) - .build()) { + try (CloseableIterable rows = + Parquet.read(Files.localInput(dataFile)) + .project(SCHEMA) + .createReaderFunc(type -> SparkParquetReaders.buildReader(SCHEMA, type)) + .build()) { for (InternalRow row : rows) { blackhole.consume(row); @@ -129,14 +125,15 @@ public void readUsingIcebergReader(Blackhole blackhole) throws IOException { @Benchmark @Threads(1) public void readUsingIcebergReaderUnsafe(Blackhole blackhole) throws IOException { - try (CloseableIterable rows = Parquet.read(Files.localInput(dataFile)) - .project(SCHEMA) - .createReaderFunc(type -> SparkParquetReaders.buildReader(SCHEMA, type)) - .build()) { + try (CloseableIterable rows = + Parquet.read(Files.localInput(dataFile)) + .project(SCHEMA) + .createReaderFunc(type -> SparkParquetReaders.buildReader(SCHEMA, type)) + .build()) { - Iterable unsafeRows = Iterables.transform( - rows, - APPLY_PROJECTION.bind(SparkBenchmarkUtil.projection(SCHEMA, SCHEMA))::invoke); + Iterable unsafeRows = + Iterables.transform( + rows, APPLY_PROJECTION.bind(SparkBenchmarkUtil.projection(SCHEMA, 
SCHEMA))::invoke); for (InternalRow row : unsafeRows) { blackhole.consume(row); @@ -148,14 +145,15 @@ public void readUsingIcebergReaderUnsafe(Blackhole blackhole) throws IOException @Threads(1) public void readUsingSparkReader(Blackhole blackhole) throws IOException { StructType sparkSchema = SparkSchemaUtil.convert(SCHEMA); - try (CloseableIterable rows = Parquet.read(Files.localInput(dataFile)) - .project(SCHEMA) - .readSupport(new ParquetReadSupport()) - .set("org.apache.spark.sql.parquet.row.requested_schema", sparkSchema.json()) - .set("spark.sql.parquet.binaryAsString", "false") - .set("spark.sql.parquet.int96AsTimestamp", "false") - .callInit() - .build()) { + try (CloseableIterable rows = + Parquet.read(Files.localInput(dataFile)) + .project(SCHEMA) + .readSupport(new ParquetReadSupport()) + .set("org.apache.spark.sql.parquet.row.requested_schema", sparkSchema.json()) + .set("spark.sql.parquet.binaryAsString", "false") + .set("spark.sql.parquet.int96AsTimestamp", "false") + .callInit() + .build()) { for (InternalRow row : rows) { blackhole.consume(row); @@ -166,10 +164,11 @@ public void readUsingSparkReader(Blackhole blackhole) throws IOException { @Benchmark @Threads(1) public void readWithProjectionUsingIcebergReader(Blackhole blackhole) throws IOException { - try (CloseableIterable rows = Parquet.read(Files.localInput(dataFile)) - .project(PROJECTED_SCHEMA) - .createReaderFunc(type -> SparkParquetReaders.buildReader(PROJECTED_SCHEMA, type)) - .build()) { + try (CloseableIterable rows = + Parquet.read(Files.localInput(dataFile)) + .project(PROJECTED_SCHEMA) + .createReaderFunc(type -> SparkParquetReaders.buildReader(PROJECTED_SCHEMA, type)) + .build()) { for (InternalRow row : rows) { blackhole.consume(row); @@ -180,14 +179,18 @@ public void readWithProjectionUsingIcebergReader(Blackhole blackhole) throws IOE @Benchmark @Threads(1) public void readWithProjectionUsingIcebergReaderUnsafe(Blackhole blackhole) throws IOException { - try (CloseableIterable rows = Parquet.read(Files.localInput(dataFile)) - .project(PROJECTED_SCHEMA) - .createReaderFunc(type -> SparkParquetReaders.buildReader(PROJECTED_SCHEMA, type)) - .build()) { - - Iterable unsafeRows = Iterables.transform( - rows, - APPLY_PROJECTION.bind(SparkBenchmarkUtil.projection(PROJECTED_SCHEMA, PROJECTED_SCHEMA))::invoke); + try (CloseableIterable rows = + Parquet.read(Files.localInput(dataFile)) + .project(PROJECTED_SCHEMA) + .createReaderFunc(type -> SparkParquetReaders.buildReader(PROJECTED_SCHEMA, type)) + .build()) { + + Iterable unsafeRows = + Iterables.transform( + rows, + APPLY_PROJECTION.bind( + SparkBenchmarkUtil.projection(PROJECTED_SCHEMA, PROJECTED_SCHEMA)) + ::invoke); for (InternalRow row : unsafeRows) { blackhole.consume(row); @@ -199,14 +202,15 @@ public void readWithProjectionUsingIcebergReaderUnsafe(Blackhole blackhole) thro @Threads(1) public void readWithProjectionUsingSparkReader(Blackhole blackhole) throws IOException { StructType sparkSchema = SparkSchemaUtil.convert(PROJECTED_SCHEMA); - try (CloseableIterable rows = Parquet.read(Files.localInput(dataFile)) - .project(PROJECTED_SCHEMA) - .readSupport(new ParquetReadSupport()) - .set("org.apache.spark.sql.parquet.row.requested_schema", sparkSchema.json()) - .set("spark.sql.parquet.binaryAsString", "false") - .set("spark.sql.parquet.int96AsTimestamp", "false") - .callInit() - .build()) { + try (CloseableIterable rows = + Parquet.read(Files.localInput(dataFile)) + .project(PROJECTED_SCHEMA) + .readSupport(new ParquetReadSupport()) + 
.set("org.apache.spark.sql.parquet.row.requested_schema", sparkSchema.json()) + .set("spark.sql.parquet.binaryAsString", "false") + .set("spark.sql.parquet.int96AsTimestamp", "false") + .callInit() + .build()) { for (InternalRow row : rows) { blackhole.consume(row); diff --git a/spark/v3.3/spark/src/jmh/java/org/apache/iceberg/spark/data/parquet/SparkParquetWritersFlatDataBenchmark.java b/spark/v3.3/spark/src/jmh/java/org/apache/iceberg/spark/data/parquet/SparkParquetWritersFlatDataBenchmark.java index 5fe33de3d69e..14e2a8aa2e39 100644 --- a/spark/v3.3/spark/src/jmh/java/org/apache/iceberg/spark/data/parquet/SparkParquetWritersFlatDataBenchmark.java +++ b/spark/v3.3/spark/src/jmh/java/org/apache/iceberg/spark/data/parquet/SparkParquetWritersFlatDataBenchmark.java @@ -16,9 +16,11 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.data.parquet; +import static org.apache.iceberg.types.Types.NestedField.optional; +import static org.apache.iceberg.types.Types.NestedField.required; + import java.io.File; import java.io.IOException; import org.apache.iceberg.Files; @@ -45,15 +47,11 @@ import org.openjdk.jmh.annotations.Threads; import org.openjdk.jmh.annotations.Warmup; -import static org.apache.iceberg.types.Types.NestedField.optional; -import static org.apache.iceberg.types.Types.NestedField.required; - /** * A benchmark that evaluates the performance of writing Parquet data with a flat schema using * Iceberg and Spark Parquet writers. * - * To run this benchmark for spark-3.3: - * + *
    To run this benchmark for spark-3.3: * ./gradlew -DsparkVersions=3.3 :iceberg-spark:iceberg-spark-3.3_2.12:jmh * -PjmhIncludeRegex=SparkParquetWritersFlatDataBenchmark * -PjmhOutputPath=benchmark/spark-parquet-writers-flat-data-benchmark-result.txt @@ -66,15 +64,16 @@ @BenchmarkMode(Mode.SingleShotTime) public class SparkParquetWritersFlatDataBenchmark { - private static final Schema SCHEMA = new Schema( - required(1, "longCol", Types.LongType.get()), - required(2, "intCol", Types.IntegerType.get()), - required(3, "floatCol", Types.FloatType.get()), - optional(4, "doubleCol", Types.DoubleType.get()), - optional(5, "decimalCol", Types.DecimalType.of(20, 5)), - optional(6, "dateCol", Types.DateType.get()), - optional(7, "timestampCol", Types.TimestampType.withZone()), - optional(8, "stringCol", Types.StringType.get())); + private static final Schema SCHEMA = + new Schema( + required(1, "longCol", Types.LongType.get()), + required(2, "intCol", Types.IntegerType.get()), + required(3, "floatCol", Types.FloatType.get()), + optional(4, "doubleCol", Types.DoubleType.get()), + optional(5, "decimalCol", Types.DecimalType.of(20, 5)), + optional(6, "dateCol", Types.DateType.get()), + optional(7, "timestampCol", Types.TimestampType.withZone()), + optional(8, "stringCol", Types.StringType.get())); private static final int NUM_RECORDS = 1000000; private Iterable rows; private File dataFile; @@ -96,10 +95,13 @@ public void tearDownBenchmark() { @Benchmark @Threads(1) public void writeUsingIcebergWriter() throws IOException { - try (FileAppender writer = Parquet.write(Files.localOutput(dataFile)) - .createWriterFunc(msgType -> SparkParquetWriters.buildWriter(SparkSchemaUtil.convert(SCHEMA), msgType)) - .schema(SCHEMA) - .build()) { + try (FileAppender writer = + Parquet.write(Files.localOutput(dataFile)) + .createWriterFunc( + msgType -> + SparkParquetWriters.buildWriter(SparkSchemaUtil.convert(SCHEMA), msgType)) + .schema(SCHEMA) + .build()) { writer.addAll(rows); } @@ -109,15 +111,16 @@ public void writeUsingIcebergWriter() throws IOException { @Threads(1) public void writeUsingSparkWriter() throws IOException { StructType sparkSchema = SparkSchemaUtil.convert(SCHEMA); - try (FileAppender writer = Parquet.write(Files.localOutput(dataFile)) - .writeSupport(new ParquetWriteSupport()) - .set("org.apache.spark.sql.parquet.row.attributes", sparkSchema.json()) - .set("spark.sql.parquet.writeLegacyFormat", "false") - .set("spark.sql.parquet.binaryAsString", "false") - .set("spark.sql.parquet.int96AsTimestamp", "false") - .set("spark.sql.parquet.outputTimestampType", "TIMESTAMP_MICROS") - .schema(SCHEMA) - .build()) { + try (FileAppender writer = + Parquet.write(Files.localOutput(dataFile)) + .writeSupport(new ParquetWriteSupport()) + .set("org.apache.spark.sql.parquet.row.attributes", sparkSchema.json()) + .set("spark.sql.parquet.writeLegacyFormat", "false") + .set("spark.sql.parquet.binaryAsString", "false") + .set("spark.sql.parquet.int96AsTimestamp", "false") + .set("spark.sql.parquet.outputTimestampType", "TIMESTAMP_MICROS") + .schema(SCHEMA) + .build()) { writer.addAll(rows); } diff --git a/spark/v3.3/spark/src/jmh/java/org/apache/iceberg/spark/data/parquet/SparkParquetWritersNestedDataBenchmark.java b/spark/v3.3/spark/src/jmh/java/org/apache/iceberg/spark/data/parquet/SparkParquetWritersNestedDataBenchmark.java index 0b591c2e2cf5..48fc91d6d73d 100644 --- a/spark/v3.3/spark/src/jmh/java/org/apache/iceberg/spark/data/parquet/SparkParquetWritersNestedDataBenchmark.java +++ 
b/spark/v3.3/spark/src/jmh/java/org/apache/iceberg/spark/data/parquet/SparkParquetWritersNestedDataBenchmark.java @@ -16,9 +16,11 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.data.parquet; +import static org.apache.iceberg.types.Types.NestedField.optional; +import static org.apache.iceberg.types.Types.NestedField.required; + import java.io.File; import java.io.IOException; import org.apache.iceberg.Files; @@ -45,15 +47,11 @@ import org.openjdk.jmh.annotations.Threads; import org.openjdk.jmh.annotations.Warmup; -import static org.apache.iceberg.types.Types.NestedField.optional; -import static org.apache.iceberg.types.Types.NestedField.required; - /** - * A benchmark that evaluates the performance of writing nested Parquet data using - * Iceberg and Spark Parquet writers. + * A benchmark that evaluates the performance of writing nested Parquet data using Iceberg and Spark + * Parquet writers. * - * To run this benchmark for spark-3.3: - * + *
    To run this benchmark for spark-3.3: * ./gradlew -DsparkVersions=3.3 :iceberg-spark:iceberg-spark-3.3_2.12:jmh * -PjmhIncludeRegex=SparkParquetWritersNestedDataBenchmark * -PjmhOutputPath=benchmark/spark-parquet-writers-nested-data-benchmark-result.txt @@ -66,14 +64,16 @@ @BenchmarkMode(Mode.SingleShotTime) public class SparkParquetWritersNestedDataBenchmark { - private static final Schema SCHEMA = new Schema( - required(0, "id", Types.LongType.get()), - optional(4, "nested", Types.StructType.of( - required(1, "col1", Types.StringType.get()), - required(2, "col2", Types.DoubleType.get()), - required(3, "col3", Types.LongType.get()) - )) - ); + private static final Schema SCHEMA = + new Schema( + required(0, "id", Types.LongType.get()), + optional( + 4, + "nested", + Types.StructType.of( + required(1, "col1", Types.StringType.get()), + required(2, "col2", Types.DoubleType.get()), + required(3, "col3", Types.LongType.get())))); private static final int NUM_RECORDS = 1000000; private Iterable rows; private File dataFile; @@ -95,10 +95,13 @@ public void tearDownBenchmark() { @Benchmark @Threads(1) public void writeUsingIcebergWriter() throws IOException { - try (FileAppender writer = Parquet.write(Files.localOutput(dataFile)) - .createWriterFunc(msgType -> SparkParquetWriters.buildWriter(SparkSchemaUtil.convert(SCHEMA), msgType)) - .schema(SCHEMA) - .build()) { + try (FileAppender writer = + Parquet.write(Files.localOutput(dataFile)) + .createWriterFunc( + msgType -> + SparkParquetWriters.buildWriter(SparkSchemaUtil.convert(SCHEMA), msgType)) + .schema(SCHEMA) + .build()) { writer.addAll(rows); } @@ -108,15 +111,16 @@ public void writeUsingIcebergWriter() throws IOException { @Threads(1) public void writeUsingSparkWriter() throws IOException { StructType sparkSchema = SparkSchemaUtil.convert(SCHEMA); - try (FileAppender writer = Parquet.write(Files.localOutput(dataFile)) - .writeSupport(new ParquetWriteSupport()) - .set("org.apache.spark.sql.parquet.row.attributes", sparkSchema.json()) - .set("spark.sql.parquet.writeLegacyFormat", "false") - .set("spark.sql.parquet.binaryAsString", "false") - .set("spark.sql.parquet.int96AsTimestamp", "false") - .set("spark.sql.parquet.outputTimestampType", "TIMESTAMP_MICROS") - .schema(SCHEMA) - .build()) { + try (FileAppender writer = + Parquet.write(Files.localOutput(dataFile)) + .writeSupport(new ParquetWriteSupport()) + .set("org.apache.spark.sql.parquet.row.attributes", sparkSchema.json()) + .set("spark.sql.parquet.writeLegacyFormat", "false") + .set("spark.sql.parquet.binaryAsString", "false") + .set("spark.sql.parquet.int96AsTimestamp", "false") + .set("spark.sql.parquet.outputTimestampType", "TIMESTAMP_MICROS") + .schema(SCHEMA) + .build()) { writer.addAll(rows); } diff --git a/spark/v3.3/spark/src/jmh/java/org/apache/iceberg/spark/source/Action.java b/spark/v3.3/spark/src/jmh/java/org/apache/iceberg/spark/source/Action.java index 1820a801b2fb..0dbf07285060 100644 --- a/spark/v3.3/spark/src/jmh/java/org/apache/iceberg/spark/source/Action.java +++ b/spark/v3.3/spark/src/jmh/java/org/apache/iceberg/spark/source/Action.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.spark.source; @FunctionalInterface diff --git a/spark/v3.3/spark/src/jmh/java/org/apache/iceberg/spark/source/IcebergSourceBenchmark.java b/spark/v3.3/spark/src/jmh/java/org/apache/iceberg/spark/source/IcebergSourceBenchmark.java index 60dde6e98a16..68c537e34a4a 100644 --- a/spark/v3.3/spark/src/jmh/java/org/apache/iceberg/spark/source/IcebergSourceBenchmark.java +++ b/spark/v3.3/spark/src/jmh/java/org/apache/iceberg/spark/source/IcebergSourceBenchmark.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.source; import java.io.IOException; @@ -81,7 +80,8 @@ protected String newTableLocation() { protected String dataLocation() { Map properties = table.properties(); - return properties.getOrDefault(TableProperties.WRITE_DATA_LOCATION, String.format("%s/data", table.location())); + return properties.getOrDefault( + TableProperties.WRITE_DATA_LOCATION, String.format("%s/data", table.location())); } protected void cleanupFiles() throws IOException { @@ -94,12 +94,12 @@ protected void cleanupFiles() throws IOException { } protected void setupSpark(boolean enableDictionaryEncoding) { - SparkSession.Builder builder = SparkSession.builder() - .config("spark.ui.enabled", false); + SparkSession.Builder builder = SparkSession.builder().config("spark.ui.enabled", false); if (!enableDictionaryEncoding) { - builder.config("parquet.dictionary.page.size", "1") - .config("parquet.enable.dictionary", false) - .config(TableProperties.PARQUET_DICT_SIZE_BYTES, "1"); + builder + .config("parquet.dictionary.page.size", "1") + .config("parquet.enable.dictionary", false) + .config(TableProperties.PARQUET_DICT_SIZE_BYTES, "1"); } builder.master("local"); spark = builder.getOrCreate(); @@ -116,7 +116,7 @@ protected void tearDownSpark() { } protected void materialize(Dataset ds) { - ds.queryExecution().toRdd().toJavaRDD().foreach(record -> { }); + ds.queryExecution().toRdd().toJavaRDD().foreach(record -> {}); } protected void materialize(Dataset ds, Blackhole blackhole) { @@ -126,7 +126,8 @@ protected void materialize(Dataset ds, Blackhole blackhole) { protected void appendAsFile(Dataset ds) { // ensure the schema is precise (including nullability) StructType sparkSchema = SparkSchemaUtil.convert(table.schema()); - spark.createDataFrame(ds.rdd(), sparkSchema) + spark + .createDataFrame(ds.rdd(), sparkSchema) .coalesce(1) .write() .format("iceberg") @@ -138,42 +139,49 @@ protected void withSQLConf(Map conf, Action action) { SQLConf sqlConf = SQLConf.get(); Map currentConfValues = Maps.newHashMap(); - conf.keySet().forEach(confKey -> { - if (sqlConf.contains(confKey)) { - String currentConfValue = sqlConf.getConfString(confKey); - currentConfValues.put(confKey, currentConfValue); - } - }); - - conf.forEach((confKey, confValue) -> { - if (SQLConf.isStaticConfigKey(confKey)) { - throw new RuntimeException("Cannot modify the value of a static config: " + confKey); - } - sqlConf.setConfString(confKey, confValue); - }); + conf.keySet() + .forEach( + confKey -> { + if (sqlConf.contains(confKey)) { + String currentConfValue = sqlConf.getConfString(confKey); + currentConfValues.put(confKey, currentConfValue); + } + }); + + conf.forEach( + (confKey, confValue) -> { + if (SQLConf.isStaticConfigKey(confKey)) { + throw new RuntimeException("Cannot modify the value of a static config: " + confKey); + } + sqlConf.setConfString(confKey, confValue); + }); try { action.invoke(); } finally { - conf.forEach((confKey, 
confValue) -> { - if (currentConfValues.containsKey(confKey)) { - sqlConf.setConfString(confKey, currentConfValues.get(confKey)); - } else { - sqlConf.unsetConf(confKey); - } - }); + conf.forEach( + (confKey, confValue) -> { + if (currentConfValues.containsKey(confKey)) { + sqlConf.setConfString(confKey, currentConfValues.get(confKey)); + } else { + sqlConf.unsetConf(confKey); + } + }); } } protected void withTableProperties(Map props, Action action) { Map tableProps = table.properties(); Map currentPropValues = Maps.newHashMap(); - props.keySet().forEach(propKey -> { - if (tableProps.containsKey(propKey)) { - String currentPropValue = tableProps.get(propKey); - currentPropValues.put(propKey, currentPropValue); - } - }); + props + .keySet() + .forEach( + propKey -> { + if (tableProps.containsKey(propKey)) { + String currentPropValue = tableProps.get(propKey); + currentPropValues.put(propKey, currentPropValue); + } + }); UpdateProperties updateProperties = table.updateProperties(); props.forEach(updateProperties::set); @@ -183,13 +191,14 @@ protected void withTableProperties(Map props, Action action) { action.invoke(); } finally { UpdateProperties restoreProperties = table.updateProperties(); - props.forEach((propKey, propValue) -> { - if (currentPropValues.containsKey(propKey)) { - restoreProperties.set(propKey, currentPropValues.get(propKey)); - } else { - restoreProperties.remove(propKey); - } - }); + props.forEach( + (propKey, propValue) -> { + if (currentPropValues.containsKey(propKey)) { + restoreProperties.set(propKey, currentPropValues.get(propKey)); + } else { + restoreProperties.remove(propKey); + } + }); restoreProperties.commit(); } } diff --git a/spark/v3.3/spark/src/jmh/java/org/apache/iceberg/spark/source/IcebergSourceDeleteBenchmark.java b/spark/v3.3/spark/src/jmh/java/org/apache/iceberg/spark/source/IcebergSourceDeleteBenchmark.java index 5db431eaa50c..e42707bf102b 100644 --- a/spark/v3.3/spark/src/jmh/java/org/apache/iceberg/spark/source/IcebergSourceDeleteBenchmark.java +++ b/spark/v3.3/spark/src/jmh/java/org/apache/iceberg/spark/source/IcebergSourceDeleteBenchmark.java @@ -16,9 +16,16 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.spark.source; +import static org.apache.iceberg.TableProperties.PARQUET_VECTORIZATION_ENABLED; +import static org.apache.iceberg.TableProperties.SPLIT_OPEN_FILE_COST; +import static org.apache.iceberg.types.Types.NestedField.optional; +import static org.apache.iceberg.types.Types.NestedField.required; +import static org.apache.spark.sql.functions.current_date; +import static org.apache.spark.sql.functions.date_add; +import static org.apache.spark.sql.functions.expr; + import java.io.IOException; import java.util.List; import java.util.Map; @@ -53,14 +60,6 @@ import org.slf4j.Logger; import org.slf4j.LoggerFactory; -import static org.apache.iceberg.TableProperties.PARQUET_VECTORIZATION_ENABLED; -import static org.apache.iceberg.TableProperties.SPLIT_OPEN_FILE_COST; -import static org.apache.iceberg.types.Types.NestedField.optional; -import static org.apache.iceberg.types.Types.NestedField.required; -import static org.apache.spark.sql.functions.current_date; -import static org.apache.spark.sql.functions.date_add; -import static org.apache.spark.sql.functions.expr; - public abstract class IcebergSourceDeleteBenchmark extends IcebergSourceBenchmark { private static final Logger LOG = LoggerFactory.getLogger(IcebergSourceDeleteBenchmark.class); private static final long TARGET_FILE_SIZE_IN_BYTES = 512L * 1024 * 1024; @@ -86,11 +85,13 @@ public void readIceberg(Blackhole blackhole) { Map tableProperties = Maps.newHashMap(); tableProperties.put(SPLIT_OPEN_FILE_COST, Integer.toString(128 * 1024 * 1024)); tableProperties.put(PARQUET_VECTORIZATION_ENABLED, "false"); - withTableProperties(tableProperties, () -> { - String tableLocation = table().location(); - Dataset df = spark().read().format("iceberg").load(tableLocation); - materialize(df, blackhole); - }); + withTableProperties( + tableProperties, + () -> { + String tableLocation = table().location(); + Dataset df = spark().read().format("iceberg").load(tableLocation); + materialize(df, blackhole); + }); } @Benchmark @@ -99,11 +100,14 @@ public void readIcebergWithIsDeletedColumn(Blackhole blackhole) { Map tableProperties = Maps.newHashMap(); tableProperties.put(SPLIT_OPEN_FILE_COST, Integer.toString(128 * 1024 * 1024)); tableProperties.put(PARQUET_VECTORIZATION_ENABLED, "false"); - withTableProperties(tableProperties, () -> { - String tableLocation = table().location(); - Dataset df = spark().read().format("iceberg").load(tableLocation).filter("_deleted = false"); - materialize(df, blackhole); - }); + withTableProperties( + tableProperties, + () -> { + String tableLocation = table().location(); + Dataset df = + spark().read().format("iceberg").load(tableLocation).filter("_deleted = false"); + materialize(df, blackhole); + }); } @Benchmark @@ -112,11 +116,14 @@ public void readDeletedRows(Blackhole blackhole) { Map tableProperties = Maps.newHashMap(); tableProperties.put(SPLIT_OPEN_FILE_COST, Integer.toString(128 * 1024 * 1024)); tableProperties.put(PARQUET_VECTORIZATION_ENABLED, "false"); - withTableProperties(tableProperties, () -> { - String tableLocation = table().location(); - Dataset df = spark().read().format("iceberg").load(tableLocation).filter("_deleted = true"); - materialize(df, blackhole); - }); + withTableProperties( + tableProperties, + () -> { + String tableLocation = table().location(); + Dataset df = + spark().read().format("iceberg").load(tableLocation).filter("_deleted = true"); + materialize(df, blackhole); + }); } @Benchmark @@ -125,11 +132,13 @@ public void readIcebergVectorized(Blackhole 
blackhole) { Map tableProperties = Maps.newHashMap(); tableProperties.put(SPLIT_OPEN_FILE_COST, Integer.toString(128 * 1024 * 1024)); tableProperties.put(PARQUET_VECTORIZATION_ENABLED, "true"); - withTableProperties(tableProperties, () -> { - String tableLocation = table().location(); - Dataset df = spark().read().format("iceberg").load(tableLocation); - materialize(df, blackhole); - }); + withTableProperties( + tableProperties, + () -> { + String tableLocation = table().location(); + Dataset df = spark().read().format("iceberg").load(tableLocation); + materialize(df, blackhole); + }); } @Benchmark @@ -138,11 +147,14 @@ public void readIcebergWithIsDeletedColumnVectorized(Blackhole blackhole) { Map tableProperties = Maps.newHashMap(); tableProperties.put(SPLIT_OPEN_FILE_COST, Integer.toString(128 * 1024 * 1024)); tableProperties.put(PARQUET_VECTORIZATION_ENABLED, "true"); - withTableProperties(tableProperties, () -> { - String tableLocation = table().location(); - Dataset df = spark().read().format("iceberg").load(tableLocation).filter("_deleted = false"); - materialize(df, blackhole); - }); + withTableProperties( + tableProperties, + () -> { + String tableLocation = table().location(); + Dataset df = + spark().read().format("iceberg").load(tableLocation).filter("_deleted = false"); + materialize(df, blackhole); + }); } @Benchmark @@ -151,37 +163,43 @@ public void readDeletedRowsVectorized(Blackhole blackhole) { Map tableProperties = Maps.newHashMap(); tableProperties.put(SPLIT_OPEN_FILE_COST, Integer.toString(128 * 1024 * 1024)); tableProperties.put(PARQUET_VECTORIZATION_ENABLED, "true"); - withTableProperties(tableProperties, () -> { - String tableLocation = table().location(); - Dataset df = spark().read().format("iceberg").load(tableLocation).filter("_deleted = true"); - materialize(df, blackhole); - }); + withTableProperties( + tableProperties, + () -> { + String tableLocation = table().location(); + Dataset df = + spark().read().format("iceberg").load(tableLocation).filter("_deleted = true"); + materialize(df, blackhole); + }); } protected abstract void appendData() throws IOException; protected void writeData(int fileNum) { - Dataset df = spark().range(NUM_ROWS) - .withColumnRenamed("id", "longCol") - .withColumn("intCol", expr("CAST(MOD(longCol, 2147483647) AS INT)")) - .withColumn("floatCol", expr("CAST(longCol AS FLOAT)")) - .withColumn("doubleCol", expr("CAST(longCol AS DOUBLE)")) - .withColumn("dateCol", date_add(current_date(), fileNum)) - .withColumn("timestampCol", expr("TO_TIMESTAMP(dateCol)")) - .withColumn("stringCol", expr("CAST(dateCol AS STRING)")); + Dataset df = + spark() + .range(NUM_ROWS) + .withColumnRenamed("id", "longCol") + .withColumn("intCol", expr("CAST(MOD(longCol, 2147483647) AS INT)")) + .withColumn("floatCol", expr("CAST(longCol AS FLOAT)")) + .withColumn("doubleCol", expr("CAST(longCol AS DOUBLE)")) + .withColumn("dateCol", date_add(current_date(), fileNum)) + .withColumn("timestampCol", expr("TO_TIMESTAMP(dateCol)")) + .withColumn("stringCol", expr("CAST(dateCol AS STRING)")); appendAsFile(df); } @Override protected Table initTable() { - Schema schema = new Schema( - required(1, "longCol", Types.LongType.get()), - required(2, "intCol", Types.IntegerType.get()), - required(3, "floatCol", Types.FloatType.get()), - optional(4, "doubleCol", Types.DoubleType.get()), - optional(6, "dateCol", Types.DateType.get()), - optional(7, "timestampCol", Types.TimestampType.withZone()), - optional(8, "stringCol", Types.StringType.get())); + Schema schema = + new 
Schema( + required(1, "longCol", Types.LongType.get()), + required(2, "intCol", Types.IntegerType.get()), + required(3, "floatCol", Types.FloatType.get()), + optional(4, "doubleCol", Types.DoubleType.get()), + optional(6, "dateCol", Types.DateType.get()), + optional(7, "timestampCol", Types.TimestampType.withZone()), + optional(8, "stringCol", Types.StringType.get())); PartitionSpec partitionSpec = PartitionSpec.unpartitioned(); HadoopTables tables = new HadoopTables(hadoopConf()); Map properties = Maps.newHashMap(); @@ -195,17 +213,19 @@ protected Configuration initHadoopConf() { return new Configuration(); } - protected void writePosDeletes(CharSequence path, long numRows, double percentage) throws IOException { + protected void writePosDeletes(CharSequence path, long numRows, double percentage) + throws IOException { writePosDeletes(path, numRows, percentage, 1); } - protected void writePosDeletes(CharSequence path, long numRows, double percentage, - int numDeleteFile) throws IOException { + protected void writePosDeletes( + CharSequence path, long numRows, double percentage, int numDeleteFile) throws IOException { writePosDeletesWithNoise(path, numRows, percentage, 0, numDeleteFile); } - protected void writePosDeletesWithNoise(CharSequence path, long numRows, double percentage, int numNoise, - int numDeleteFile) throws IOException { + protected void writePosDeletesWithNoise( + CharSequence path, long numRows, double percentage, int numNoise, int numDeleteFile) + throws IOException { Set deletedPos = Sets.newHashSet(); while (deletedPos.size() < numRows * percentage) { deletedPos.add(ThreadLocalRandom.current().nextLong(numRows)); @@ -219,14 +239,15 @@ protected void writePosDeletesWithNoise(CharSequence path, long numRows, double } } - protected void writePosDeletes(CharSequence path, List deletedPos, int numNoise) throws IOException { + protected void writePosDeletes(CharSequence path, List deletedPos, int numNoise) + throws IOException { OutputFileFactory fileFactory = newFileFactory(); - SparkFileWriterFactory writerFactory = SparkFileWriterFactory.builderFor(table()) - .dataFileFormat(fileFormat()) - .build(); + SparkFileWriterFactory writerFactory = + SparkFileWriterFactory.builderFor(table()).dataFileFormat(fileFormat()).build(); - ClusteredPositionDeleteWriter writer = new ClusteredPositionDeleteWriter<>( - writerFactory, fileFactory, table().io(), TARGET_FILE_SIZE_IN_BYTES); + ClusteredPositionDeleteWriter writer = + new ClusteredPositionDeleteWriter<>( + writerFactory, fileFactory, table().io(), TARGET_FILE_SIZE_IN_BYTES); PartitionSpec unpartitionedSpec = table().specs().get(0); @@ -274,15 +295,16 @@ private void writeEqDeletes(List rows) throws IOException { int equalityFieldId = table().schema().findField("longCol").fieldId(); OutputFileFactory fileFactory = newFileFactory(); - SparkFileWriterFactory writerFactory = SparkFileWriterFactory - .builderFor(table()) - .dataFileFormat(fileFormat()) - .equalityDeleteRowSchema(table().schema()) - .equalityFieldIds(new int[]{equalityFieldId}) - .build(); + SparkFileWriterFactory writerFactory = + SparkFileWriterFactory.builderFor(table()) + .dataFileFormat(fileFormat()) + .equalityDeleteRowSchema(table().schema()) + .equalityFieldIds(new int[] {equalityFieldId}) + .build(); - ClusteredEqualityDeleteWriter writer = new ClusteredEqualityDeleteWriter<>( - writerFactory, fileFactory, table().io(), TARGET_FILE_SIZE_IN_BYTES); + ClusteredEqualityDeleteWriter writer = + new ClusteredEqualityDeleteWriter<>( + writerFactory, fileFactory, 
table().io(), TARGET_FILE_SIZE_IN_BYTES); PartitionSpec unpartitionedSpec = table().specs().get(0); try (ClusteredEqualityDeleteWriter closeableWriter = writer) { @@ -298,14 +320,14 @@ private void writeEqDeletes(List rows) throws IOException { } private OutputFileFactory newFileFactory() { - return OutputFileFactory.builderFor(table(), 1, 1) - .format(fileFormat()) - .build(); + return OutputFileFactory.builderFor(table(), 1, 1).format(fileFormat()).build(); } private CharSequence noisePath(CharSequence path) { - // assume the data file name would be something like "00000-0-30da64e0-56b5-4743-a11b-3188a1695bf7-00001.parquet" - // so the dataFileSuffixLen is the UUID string length + length of "-00001.parquet", which is 36 + 14 = 60. It's OK + // assume the data file name would be something like + // "00000-0-30da64e0-56b5-4743-a11b-3188a1695bf7-00001.parquet" + // so the dataFileSuffixLen is the UUID string length + length of "-00001.parquet", which is 36 + // + 14 = 60. It's OK // to be not accurate here. int dataFileSuffixLen = 60; UUID uuid = UUID.randomUUID(); diff --git a/spark/v3.3/spark/src/jmh/java/org/apache/iceberg/spark/source/IcebergSourceFlatDataBenchmark.java b/spark/v3.3/spark/src/jmh/java/org/apache/iceberg/spark/source/IcebergSourceFlatDataBenchmark.java index 9e206321a540..59e6230350d9 100644 --- a/spark/v3.3/spark/src/jmh/java/org/apache/iceberg/spark/source/IcebergSourceFlatDataBenchmark.java +++ b/spark/v3.3/spark/src/jmh/java/org/apache/iceberg/spark/source/IcebergSourceFlatDataBenchmark.java @@ -16,9 +16,11 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.source; +import static org.apache.iceberg.types.Types.NestedField.optional; +import static org.apache.iceberg.types.Types.NestedField.required; + import java.util.Map; import org.apache.hadoop.conf.Configuration; import org.apache.iceberg.PartitionSpec; @@ -29,9 +31,6 @@ import org.apache.iceberg.relocated.com.google.common.collect.Maps; import org.apache.iceberg.types.Types; -import static org.apache.iceberg.types.Types.NestedField.optional; -import static org.apache.iceberg.types.Types.NestedField.required; - public abstract class IcebergSourceFlatDataBenchmark extends IcebergSourceBenchmark { @Override @@ -41,15 +40,16 @@ protected Configuration initHadoopConf() { @Override protected final Table initTable() { - Schema schema = new Schema( - required(1, "longCol", Types.LongType.get()), - required(2, "intCol", Types.IntegerType.get()), - required(3, "floatCol", Types.FloatType.get()), - optional(4, "doubleCol", Types.DoubleType.get()), - optional(5, "decimalCol", Types.DecimalType.of(20, 5)), - optional(6, "dateCol", Types.DateType.get()), - optional(7, "timestampCol", Types.TimestampType.withZone()), - optional(8, "stringCol", Types.StringType.get())); + Schema schema = + new Schema( + required(1, "longCol", Types.LongType.get()), + required(2, "intCol", Types.IntegerType.get()), + required(3, "floatCol", Types.FloatType.get()), + optional(4, "doubleCol", Types.DoubleType.get()), + optional(5, "decimalCol", Types.DecimalType.of(20, 5)), + optional(6, "dateCol", Types.DateType.get()), + optional(7, "timestampCol", Types.TimestampType.withZone()), + optional(8, "stringCol", Types.StringType.get())); PartitionSpec partitionSpec = PartitionSpec.unpartitioned(); HadoopTables tables = new HadoopTables(hadoopConf()); Map properties = Maps.newHashMap(); diff --git 
a/spark/v3.3/spark/src/jmh/java/org/apache/iceberg/spark/source/IcebergSourceNestedDataBenchmark.java b/spark/v3.3/spark/src/jmh/java/org/apache/iceberg/spark/source/IcebergSourceNestedDataBenchmark.java index 5a0d9359ec6b..a1c61b9b4de0 100644 --- a/spark/v3.3/spark/src/jmh/java/org/apache/iceberg/spark/source/IcebergSourceNestedDataBenchmark.java +++ b/spark/v3.3/spark/src/jmh/java/org/apache/iceberg/spark/source/IcebergSourceNestedDataBenchmark.java @@ -16,9 +16,11 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.source; +import static org.apache.iceberg.types.Types.NestedField.optional; +import static org.apache.iceberg.types.Types.NestedField.required; + import java.util.Map; import org.apache.hadoop.conf.Configuration; import org.apache.iceberg.PartitionSpec; @@ -29,9 +31,6 @@ import org.apache.iceberg.relocated.com.google.common.collect.Maps; import org.apache.iceberg.types.Types; -import static org.apache.iceberg.types.Types.NestedField.optional; -import static org.apache.iceberg.types.Types.NestedField.required; - public abstract class IcebergSourceNestedDataBenchmark extends IcebergSourceBenchmark { @Override @@ -41,14 +40,16 @@ protected Configuration initHadoopConf() { @Override protected final Table initTable() { - Schema schema = new Schema( - required(0, "id", Types.LongType.get()), - optional(4, "nested", Types.StructType.of( - required(1, "col1", Types.StringType.get()), - required(2, "col2", Types.DoubleType.get()), - required(3, "col3", Types.LongType.get()) - )) - ); + Schema schema = + new Schema( + required(0, "id", Types.LongType.get()), + optional( + 4, + "nested", + Types.StructType.of( + required(1, "col1", Types.StringType.get()), + required(2, "col2", Types.DoubleType.get()), + required(3, "col3", Types.LongType.get())))); PartitionSpec partitionSpec = PartitionSpec.unpartitioned(); HadoopTables tables = new HadoopTables(hadoopConf()); Map properties = Maps.newHashMap(); diff --git a/spark/v3.3/spark/src/jmh/java/org/apache/iceberg/spark/source/IcebergSourceNestedListDataBenchmark.java b/spark/v3.3/spark/src/jmh/java/org/apache/iceberg/spark/source/IcebergSourceNestedListDataBenchmark.java index 369a1507b648..f68b587735dd 100644 --- a/spark/v3.3/spark/src/jmh/java/org/apache/iceberg/spark/source/IcebergSourceNestedListDataBenchmark.java +++ b/spark/v3.3/spark/src/jmh/java/org/apache/iceberg/spark/source/IcebergSourceNestedListDataBenchmark.java @@ -16,9 +16,11 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.spark.source; +import static org.apache.iceberg.types.Types.NestedField.optional; +import static org.apache.iceberg.types.Types.NestedField.required; + import java.util.Map; import org.apache.hadoop.conf.Configuration; import org.apache.iceberg.PartitionSpec; @@ -29,9 +31,6 @@ import org.apache.iceberg.relocated.com.google.common.collect.Maps; import org.apache.iceberg.types.Types; -import static org.apache.iceberg.types.Types.NestedField.optional; -import static org.apache.iceberg.types.Types.NestedField.required; - public abstract class IcebergSourceNestedListDataBenchmark extends IcebergSourceBenchmark { @Override @@ -41,12 +40,19 @@ protected Configuration initHadoopConf() { @Override protected final Table initTable() { - Schema schema = new Schema( - required(0, "id", Types.LongType.get()), - optional(1, "outerlist", Types.ListType.ofOptional(2, - Types.StructType.of(required(3, "innerlist", Types.ListType.ofRequired(4, Types.StringType.get()))) - )) - ); + Schema schema = + new Schema( + required(0, "id", Types.LongType.get()), + optional( + 1, + "outerlist", + Types.ListType.ofOptional( + 2, + Types.StructType.of( + required( + 3, + "innerlist", + Types.ListType.ofRequired(4, Types.StringType.get())))))); PartitionSpec partitionSpec = PartitionSpec.unpartitioned(); HadoopTables tables = new HadoopTables(hadoopConf()); Map properties = Maps.newHashMap(); diff --git a/spark/v3.3/spark/src/jmh/java/org/apache/iceberg/spark/source/WritersBenchmark.java b/spark/v3.3/spark/src/jmh/java/org/apache/iceberg/spark/source/WritersBenchmark.java index acec471bdfd1..8d0b94262aee 100644 --- a/spark/v3.3/spark/src/jmh/java/org/apache/iceberg/spark/source/WritersBenchmark.java +++ b/spark/v3.3/spark/src/jmh/java/org/apache/iceberg/spark/source/WritersBenchmark.java @@ -16,9 +16,11 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.spark.source; +import static org.apache.iceberg.types.Types.NestedField.optional; +import static org.apache.iceberg.types.Types.NestedField.required; + import java.io.IOException; import java.util.Comparator; import java.util.List; @@ -57,23 +59,20 @@ import org.openjdk.jmh.annotations.Threads; import org.openjdk.jmh.infra.Blackhole; -import static org.apache.iceberg.types.Types.NestedField.optional; -import static org.apache.iceberg.types.Types.NestedField.required; - public abstract class WritersBenchmark extends IcebergSourceBenchmark { private static final int NUM_ROWS = 2500000; private static final long TARGET_FILE_SIZE_IN_BYTES = 50L * 1024 * 1024; - private static final Schema SCHEMA = new Schema( - required(1, "longCol", Types.LongType.get()), - required(2, "intCol", Types.IntegerType.get()), - required(3, "floatCol", Types.FloatType.get()), - optional(4, "doubleCol", Types.DoubleType.get()), - optional(5, "decimalCol", Types.DecimalType.of(20, 5)), - optional(6, "timestampCol", Types.TimestampType.withZone()), - optional(7, "stringCol", Types.StringType.get()) - ); + private static final Schema SCHEMA = + new Schema( + required(1, "longCol", Types.LongType.get()), + required(2, "intCol", Types.IntegerType.get()), + required(3, "floatCol", Types.FloatType.get()), + optional(4, "doubleCol", Types.DoubleType.get()), + optional(5, "decimalCol", Types.DecimalType.of(20, 5)), + optional(6, "timestampCol", Types.TimestampType.withZone()), + optional(7, "stringCol", Types.StringType.get())); private Iterable rows; private Iterable positionDeleteRows; @@ -92,7 +91,8 @@ public void setupBenchmark() { data.sort(Comparator.comparingInt(row -> transform.apply(row.getInt(1)))); this.rows = data; - this.positionDeleteRows = RandomData.generateSpark(DeleteSchemaUtil.pathPosSchema(), NUM_ROWS, 0L); + this.positionDeleteRows = + RandomData.generateSpark(DeleteSchemaUtil.pathPosSchema(), NUM_ROWS, 0L); this.unpartitionedSpec = table().specs().get(0); Preconditions.checkArgument(unpartitionedSpec.isUnpartitioned()); @@ -118,9 +118,7 @@ protected final Table initTable() { Table table = tables.create(SCHEMA, spec, properties, newTableLocation()); // add a partitioned spec to the table - table.updateSpec() - .addField(Expressions.bucket("intCol", 32)) - .commit(); + table.updateSpec().addField(Expressions.bucket("intCol", 32)).commit(); return table; } @@ -131,13 +129,14 @@ public void writeUnpartitionedClusteredDataWriter(Blackhole blackhole) throws IO FileIO io = table().io(); OutputFileFactory fileFactory = newFileFactory(); - SparkFileWriterFactory writerFactory = SparkFileWriterFactory.builderFor(table()) - .dataFileFormat(fileFormat()) - .dataSchema(table().schema()) - .build(); + SparkFileWriterFactory writerFactory = + SparkFileWriterFactory.builderFor(table()) + .dataFileFormat(fileFormat()) + .dataSchema(table().schema()) + .build(); - ClusteredDataWriter writer = new ClusteredDataWriter<>( - writerFactory, fileFactory, io, TARGET_FILE_SIZE_IN_BYTES); + ClusteredDataWriter writer = + new ClusteredDataWriter<>(writerFactory, fileFactory, io, TARGET_FILE_SIZE_IN_BYTES); try (ClusteredDataWriter closeableWriter = writer) { for (InternalRow row : rows) { @@ -157,13 +156,14 @@ public void writeUnpartitionedLegacyDataWriter(Blackhole blackhole) throws IOExc Schema writeSchema = table().schema(); StructType sparkWriteType = SparkSchemaUtil.convert(writeSchema); - SparkAppenderFactory appenders = SparkAppenderFactory.builderFor(table(), writeSchema, sparkWriteType) - 
.spec(unpartitionedSpec) - .build(); + SparkAppenderFactory appenders = + SparkAppenderFactory.builderFor(table(), writeSchema, sparkWriteType) + .spec(unpartitionedSpec) + .build(); - TaskWriter writer = new UnpartitionedWriter<>( - unpartitionedSpec, fileFormat(), appenders, - fileFactory, io, TARGET_FILE_SIZE_IN_BYTES); + TaskWriter writer = + new UnpartitionedWriter<>( + unpartitionedSpec, fileFormat(), appenders, fileFactory, io, TARGET_FILE_SIZE_IN_BYTES); try (TaskWriter closableWriter = writer) { for (InternalRow row : rows) { @@ -180,13 +180,14 @@ public void writePartitionedClusteredDataWriter(Blackhole blackhole) throws IOEx FileIO io = table().io(); OutputFileFactory fileFactory = newFileFactory(); - SparkFileWriterFactory writerFactory = SparkFileWriterFactory.builderFor(table()) - .dataFileFormat(fileFormat()) - .dataSchema(table().schema()) - .build(); + SparkFileWriterFactory writerFactory = + SparkFileWriterFactory.builderFor(table()) + .dataFileFormat(fileFormat()) + .dataSchema(table().schema()) + .build(); - ClusteredDataWriter writer = new ClusteredDataWriter<>( - writerFactory, fileFactory, io, TARGET_FILE_SIZE_IN_BYTES); + ClusteredDataWriter writer = + new ClusteredDataWriter<>(writerFactory, fileFactory, io, TARGET_FILE_SIZE_IN_BYTES); PartitionKey partitionKey = new PartitionKey(partitionedSpec, table().schema()); StructType dataSparkType = SparkSchemaUtil.convert(table().schema()); @@ -211,14 +212,21 @@ public void writePartitionedLegacyDataWriter(Blackhole blackhole) throws IOExcep Schema writeSchema = table().schema(); StructType sparkWriteType = SparkSchemaUtil.convert(writeSchema); - SparkAppenderFactory appenders = SparkAppenderFactory.builderFor(table(), writeSchema, sparkWriteType) - .spec(partitionedSpec) - .build(); - - TaskWriter writer = new SparkPartitionedWriter( - partitionedSpec, fileFormat(), appenders, - fileFactory, io, TARGET_FILE_SIZE_IN_BYTES, - writeSchema, sparkWriteType); + SparkAppenderFactory appenders = + SparkAppenderFactory.builderFor(table(), writeSchema, sparkWriteType) + .spec(partitionedSpec) + .build(); + + TaskWriter writer = + new SparkPartitionedWriter( + partitionedSpec, + fileFormat(), + appenders, + fileFactory, + io, + TARGET_FILE_SIZE_IN_BYTES, + writeSchema, + sparkWriteType); try (TaskWriter closableWriter = writer) { for (InternalRow row : rows) { @@ -235,13 +243,14 @@ public void writePartitionedFanoutDataWriter(Blackhole blackhole) throws IOExcep FileIO io = table().io(); OutputFileFactory fileFactory = newFileFactory(); - SparkFileWriterFactory writerFactory = SparkFileWriterFactory.builderFor(table()) - .dataFileFormat(fileFormat()) - .dataSchema(table().schema()) - .build(); + SparkFileWriterFactory writerFactory = + SparkFileWriterFactory.builderFor(table()) + .dataFileFormat(fileFormat()) + .dataSchema(table().schema()) + .build(); - FanoutDataWriter writer = new FanoutDataWriter<>( - writerFactory, fileFactory, io, TARGET_FILE_SIZE_IN_BYTES); + FanoutDataWriter writer = + new FanoutDataWriter<>(writerFactory, fileFactory, io, TARGET_FILE_SIZE_IN_BYTES); PartitionKey partitionKey = new PartitionKey(partitionedSpec, table().schema()); StructType dataSparkType = SparkSchemaUtil.convert(table().schema()); @@ -266,14 +275,21 @@ public void writePartitionedLegacyFanoutDataWriter(Blackhole blackhole) throws I Schema writeSchema = table().schema(); StructType sparkWriteType = SparkSchemaUtil.convert(writeSchema); - SparkAppenderFactory appenders = SparkAppenderFactory.builderFor(table(), writeSchema, sparkWriteType) - 
.spec(partitionedSpec) - .build(); - - TaskWriter writer = new SparkPartitionedFanoutWriter( - partitionedSpec, fileFormat(), appenders, - fileFactory, io, TARGET_FILE_SIZE_IN_BYTES, - writeSchema, sparkWriteType); + SparkAppenderFactory appenders = + SparkAppenderFactory.builderFor(table(), writeSchema, sparkWriteType) + .spec(partitionedSpec) + .build(); + + TaskWriter writer = + new SparkPartitionedFanoutWriter( + partitionedSpec, + fileFormat(), + appenders, + fileFactory, + io, + TARGET_FILE_SIZE_IN_BYTES, + writeSchema, + sparkWriteType); try (TaskWriter closableWriter = writer) { for (InternalRow row : rows) { @@ -286,20 +302,23 @@ partitionedSpec, fileFormat(), appenders, @Benchmark @Threads(1) - public void writePartitionedClusteredEqualityDeleteWriter(Blackhole blackhole) throws IOException { + public void writePartitionedClusteredEqualityDeleteWriter(Blackhole blackhole) + throws IOException { FileIO io = table().io(); int equalityFieldId = table().schema().findField("longCol").fieldId(); OutputFileFactory fileFactory = newFileFactory(); - SparkFileWriterFactory writerFactory = SparkFileWriterFactory.builderFor(table()) - .dataFileFormat(fileFormat()) - .equalityDeleteRowSchema(table().schema()) - .equalityFieldIds(new int[]{equalityFieldId}) - .build(); + SparkFileWriterFactory writerFactory = + SparkFileWriterFactory.builderFor(table()) + .dataFileFormat(fileFormat()) + .equalityDeleteRowSchema(table().schema()) + .equalityFieldIds(new int[] {equalityFieldId}) + .build(); - ClusteredEqualityDeleteWriter writer = new ClusteredEqualityDeleteWriter<>( - writerFactory, fileFactory, io, TARGET_FILE_SIZE_IN_BYTES); + ClusteredEqualityDeleteWriter writer = + new ClusteredEqualityDeleteWriter<>( + writerFactory, fileFactory, io, TARGET_FILE_SIZE_IN_BYTES); PartitionKey partitionKey = new PartitionKey(partitionedSpec, table().schema()); StructType deleteSparkType = SparkSchemaUtil.convert(table().schema()); @@ -317,16 +336,17 @@ public void writePartitionedClusteredEqualityDeleteWriter(Blackhole blackhole) t @Benchmark @Threads(1) - public void writeUnpartitionedClusteredPositionDeleteWriter(Blackhole blackhole) throws IOException { + public void writeUnpartitionedClusteredPositionDeleteWriter(Blackhole blackhole) + throws IOException { FileIO io = table().io(); OutputFileFactory fileFactory = newFileFactory(); - SparkFileWriterFactory writerFactory = SparkFileWriterFactory.builderFor(table()) - .dataFileFormat(fileFormat()) - .build(); + SparkFileWriterFactory writerFactory = + SparkFileWriterFactory.builderFor(table()).dataFileFormat(fileFormat()).build(); - ClusteredPositionDeleteWriter writer = new ClusteredPositionDeleteWriter<>( - writerFactory, fileFactory, io, TARGET_FILE_SIZE_IN_BYTES); + ClusteredPositionDeleteWriter writer = + new ClusteredPositionDeleteWriter<>( + writerFactory, fileFactory, io, TARGET_FILE_SIZE_IN_BYTES); PositionDelete positionDelete = PositionDelete.create(); try (ClusteredPositionDeleteWriter closeableWriter = writer) { @@ -342,8 +362,6 @@ public void writeUnpartitionedClusteredPositionDeleteWriter(Blackhole blackhole) } private OutputFileFactory newFileFactory() { - return OutputFileFactory.builderFor(table(), 1, 1) - .format(fileFormat()) - .build(); + return OutputFileFactory.builderFor(table(), 1, 1).format(fileFormat()).build(); } } diff --git a/spark/v3.3/spark/src/jmh/java/org/apache/iceberg/spark/source/avro/AvroWritersBenchmark.java b/spark/v3.3/spark/src/jmh/java/org/apache/iceberg/spark/source/avro/AvroWritersBenchmark.java index 
868edcc90517..5220f65dfa6c 100644 --- a/spark/v3.3/spark/src/jmh/java/org/apache/iceberg/spark/source/avro/AvroWritersBenchmark.java +++ b/spark/v3.3/spark/src/jmh/java/org/apache/iceberg/spark/source/avro/AvroWritersBenchmark.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.source.avro; import org.apache.iceberg.FileFormat; @@ -24,9 +23,8 @@ /** * A benchmark that evaluates the performance of various Iceberg writers for Avro data. - *
<p> - * To run this benchmark for spark-3.3: - * <code> + * + * <p>
    To run this benchmark for spark-3.3: * ./gradlew -DsparkVersions=3.3 :iceberg-spark:iceberg-spark-3.3_2.12:jmh * -PjmhIncludeRegex=AvroWritersBenchmark * -PjmhOutputPath=benchmark/avro-writers-benchmark-result.txt diff --git a/spark/v3.3/spark/src/jmh/java/org/apache/iceberg/spark/source/avro/IcebergSourceFlatAvroDataReadBenchmark.java b/spark/v3.3/spark/src/jmh/java/org/apache/iceberg/spark/source/avro/IcebergSourceFlatAvroDataReadBenchmark.java index 14d39fcad750..4eb1ee9d92bb 100644 --- a/spark/v3.3/spark/src/jmh/java/org/apache/iceberg/spark/source/avro/IcebergSourceFlatAvroDataReadBenchmark.java +++ b/spark/v3.3/spark/src/jmh/java/org/apache/iceberg/spark/source/avro/IcebergSourceFlatAvroDataReadBenchmark.java @@ -16,9 +16,14 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.source.avro; +import static org.apache.iceberg.TableProperties.DEFAULT_FILE_FORMAT; +import static org.apache.iceberg.TableProperties.SPLIT_OPEN_FILE_COST; +import static org.apache.spark.sql.functions.current_date; +import static org.apache.spark.sql.functions.date_add; +import static org.apache.spark.sql.functions.expr; + import java.io.IOException; import java.util.Map; import org.apache.iceberg.relocated.com.google.common.collect.Maps; @@ -31,18 +36,11 @@ import org.openjdk.jmh.annotations.TearDown; import org.openjdk.jmh.annotations.Threads; -import static org.apache.iceberg.TableProperties.DEFAULT_FILE_FORMAT; -import static org.apache.iceberg.TableProperties.SPLIT_OPEN_FILE_COST; -import static org.apache.spark.sql.functions.current_date; -import static org.apache.spark.sql.functions.date_add; -import static org.apache.spark.sql.functions.expr; - /** - * A benchmark that evaluates the performance of reading Avro data with a flat schema - * using Iceberg and the built-in file source in Spark. - *
<p> - * To run this benchmark for spark-3.3: - * <code> + * A benchmark that evaluates the performance of reading Avro data with a flat schema using Iceberg + * and the built-in file source in Spark. + * + * <p>
    To run this benchmark for spark-3.3: * ./gradlew -DsparkVersions=3.3 :iceberg-spark:iceberg-spark-3.3_2.12:jmh * -PjmhIncludeRegex=IcebergSourceFlatAvroDataReadBenchmark * -PjmhOutputPath=benchmark/iceberg-source-flat-avro-data-read-benchmark-result.txt @@ -70,11 +68,13 @@ public void tearDownBenchmark() throws IOException { public void readIceberg() { Map tableProperties = Maps.newHashMap(); tableProperties.put(SPLIT_OPEN_FILE_COST, Integer.toString(128 * 1024 * 1024)); - withTableProperties(tableProperties, () -> { - String tableLocation = table().location(); - Dataset df = spark().read().format("iceberg").load(tableLocation); - materialize(df); - }); + withTableProperties( + tableProperties, + () -> { + String tableLocation = table().location(); + Dataset df = spark().read().format("iceberg").load(tableLocation); + materialize(df); + }); } @Benchmark @@ -82,10 +82,12 @@ public void readIceberg() { public void readFileSource() { Map conf = Maps.newHashMap(); conf.put(SQLConf.FILES_OPEN_COST_IN_BYTES().key(), Integer.toString(128 * 1024 * 1024)); - withSQLConf(conf, () -> { - Dataset df = spark().read().format("avro").load(dataLocation()); - materialize(df); - }); + withSQLConf( + conf, + () -> { + Dataset df = spark().read().format("avro").load(dataLocation()); + materialize(df); + }); } @Benchmark @@ -93,11 +95,13 @@ public void readFileSource() { public void readWithProjectionIceberg() { Map tableProperties = Maps.newHashMap(); tableProperties.put(SPLIT_OPEN_FILE_COST, Integer.toString(128 * 1024 * 1024)); - withTableProperties(tableProperties, () -> { - String tableLocation = table().location(); - Dataset df = spark().read().format("iceberg").load(tableLocation).select("longCol"); - materialize(df); - }); + withTableProperties( + tableProperties, + () -> { + String tableLocation = table().location(); + Dataset df = spark().read().format("iceberg").load(tableLocation).select("longCol"); + materialize(df); + }); } @Benchmark @@ -105,28 +109,34 @@ public void readWithProjectionIceberg() { public void readWithProjectionFileSource() { Map conf = Maps.newHashMap(); conf.put(SQLConf.FILES_OPEN_COST_IN_BYTES().key(), Integer.toString(128 * 1024 * 1024)); - withSQLConf(conf, () -> { - Dataset df = spark().read().format("avro").load(dataLocation()).select("longCol"); - materialize(df); - }); + withSQLConf( + conf, + () -> { + Dataset df = spark().read().format("avro").load(dataLocation()).select("longCol"); + materialize(df); + }); } private void appendData() { Map tableProperties = Maps.newHashMap(); tableProperties.put(DEFAULT_FILE_FORMAT, "avro"); - withTableProperties(tableProperties, () -> { - for (int fileNum = 1; fileNum <= NUM_FILES; fileNum++) { - Dataset df = spark().range(NUM_ROWS) - .withColumnRenamed("id", "longCol") - .withColumn("intCol", expr("CAST(longCol AS INT)")) - .withColumn("floatCol", expr("CAST(longCol AS FLOAT)")) - .withColumn("doubleCol", expr("CAST(longCol AS DOUBLE)")) - .withColumn("decimalCol", expr("CAST(longCol AS DECIMAL(20, 5))")) - .withColumn("dateCol", date_add(current_date(), fileNum)) - .withColumn("timestampCol", expr("TO_TIMESTAMP(dateCol)")) - .withColumn("stringCol", expr("CAST(dateCol AS STRING)")); - appendAsFile(df); - } - }); + withTableProperties( + tableProperties, + () -> { + for (int fileNum = 1; fileNum <= NUM_FILES; fileNum++) { + Dataset df = + spark() + .range(NUM_ROWS) + .withColumnRenamed("id", "longCol") + .withColumn("intCol", expr("CAST(longCol AS INT)")) + .withColumn("floatCol", expr("CAST(longCol AS FLOAT)")) + 
.withColumn("doubleCol", expr("CAST(longCol AS DOUBLE)")) + .withColumn("decimalCol", expr("CAST(longCol AS DECIMAL(20, 5))")) + .withColumn("dateCol", date_add(current_date(), fileNum)) + .withColumn("timestampCol", expr("TO_TIMESTAMP(dateCol)")) + .withColumn("stringCol", expr("CAST(dateCol AS STRING)")); + appendAsFile(df); + } + }); } } diff --git a/spark/v3.3/spark/src/jmh/java/org/apache/iceberg/spark/source/avro/IcebergSourceNestedAvroDataReadBenchmark.java b/spark/v3.3/spark/src/jmh/java/org/apache/iceberg/spark/source/avro/IcebergSourceNestedAvroDataReadBenchmark.java index e9e492717cc3..2e792b6d35e3 100644 --- a/spark/v3.3/spark/src/jmh/java/org/apache/iceberg/spark/source/avro/IcebergSourceNestedAvroDataReadBenchmark.java +++ b/spark/v3.3/spark/src/jmh/java/org/apache/iceberg/spark/source/avro/IcebergSourceNestedAvroDataReadBenchmark.java @@ -16,9 +16,14 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.source.avro; +import static org.apache.iceberg.TableProperties.DEFAULT_FILE_FORMAT; +import static org.apache.iceberg.TableProperties.SPLIT_OPEN_FILE_COST; +import static org.apache.spark.sql.functions.expr; +import static org.apache.spark.sql.functions.lit; +import static org.apache.spark.sql.functions.struct; + import java.io.IOException; import java.util.Map; import org.apache.iceberg.relocated.com.google.common.collect.Maps; @@ -31,19 +36,11 @@ import org.openjdk.jmh.annotations.TearDown; import org.openjdk.jmh.annotations.Threads; -import static org.apache.iceberg.TableProperties.DEFAULT_FILE_FORMAT; -import static org.apache.iceberg.TableProperties.SPLIT_OPEN_FILE_COST; -import static org.apache.spark.sql.functions.expr; -import static org.apache.spark.sql.functions.lit; -import static org.apache.spark.sql.functions.struct; - - /** - * A benchmark that evaluates the performance of reading Avro data with a flat schema - * using Iceberg and the built-in file source in Spark. - *
<p> - * To run this benchmark for spark-3.3: - * <code> + * A benchmark that evaluates the performance of reading Avro data with a flat schema using Iceberg + * and the built-in file source in Spark. + * + * <p>
    To run this benchmark for spark-3.3: * ./gradlew -DsparkVersions=3.3 :iceberg-spark:iceberg-spark-3.3_2.12:jmh * -PjmhIncludeRegex=IcebergSourceNestedAvroDataReadBenchmark * -PjmhOutputPath=benchmark/iceberg-source-nested-avro-data-read-benchmark-result.txt @@ -71,11 +68,13 @@ public void tearDownBenchmark() throws IOException { public void readIceberg() { Map tableProperties = Maps.newHashMap(); tableProperties.put(SPLIT_OPEN_FILE_COST, Integer.toString(128 * 1024 * 1024)); - withTableProperties(tableProperties, () -> { - String tableLocation = table().location(); - Dataset df = spark().read().format("iceberg").load(tableLocation); - materialize(df); - }); + withTableProperties( + tableProperties, + () -> { + String tableLocation = table().location(); + Dataset df = spark().read().format("iceberg").load(tableLocation); + materialize(df); + }); } @Benchmark @@ -83,10 +82,12 @@ public void readIceberg() { public void readFileSource() { Map conf = Maps.newHashMap(); conf.put(SQLConf.FILES_OPEN_COST_IN_BYTES().key(), Integer.toString(128 * 1024 * 1024)); - withSQLConf(conf, () -> { - Dataset df = spark().read().format("avro").load(dataLocation()); - materialize(df); - }); + withSQLConf( + conf, + () -> { + Dataset df = spark().read().format("avro").load(dataLocation()); + materialize(df); + }); } @Benchmark @@ -94,11 +95,14 @@ public void readFileSource() { public void readWithProjectionIceberg() { Map tableProperties = Maps.newHashMap(); tableProperties.put(SPLIT_OPEN_FILE_COST, Integer.toString(128 * 1024 * 1024)); - withTableProperties(tableProperties, () -> { - String tableLocation = table().location(); - Dataset df = spark().read().format("iceberg").load(tableLocation).select("nested.col3"); - materialize(df); - }); + withTableProperties( + tableProperties, + () -> { + String tableLocation = table().location(); + Dataset df = + spark().read().format("iceberg").load(tableLocation).select("nested.col3"); + materialize(df); + }); } @Benchmark @@ -106,27 +110,33 @@ public void readWithProjectionIceberg() { public void readWithProjectionFileSource() { Map conf = Maps.newHashMap(); conf.put(SQLConf.FILES_OPEN_COST_IN_BYTES().key(), Integer.toString(128 * 1024 * 1024)); - withSQLConf(conf, () -> { - Dataset df = spark().read().format("avro").load(dataLocation()).select("nested.col3"); - materialize(df); - }); + withSQLConf( + conf, + () -> { + Dataset df = + spark().read().format("avro").load(dataLocation()).select("nested.col3"); + materialize(df); + }); } private void appendData() { Map tableProperties = Maps.newHashMap(); tableProperties.put(DEFAULT_FILE_FORMAT, "avro"); - withTableProperties(tableProperties, () -> { - for (int fileNum = 1; fileNum <= NUM_FILES; fileNum++) { - Dataset df = spark().range(NUM_ROWS) - .withColumn( - "nested", - struct( - expr("CAST(id AS string) AS col1"), - expr("CAST(id AS double) AS col2"), - lit(fileNum).cast("long").as("col3") - )); - appendAsFile(df); - } - }); + withTableProperties( + tableProperties, + () -> { + for (int fileNum = 1; fileNum <= NUM_FILES; fileNum++) { + Dataset df = + spark() + .range(NUM_ROWS) + .withColumn( + "nested", + struct( + expr("CAST(id AS string) AS col1"), + expr("CAST(id AS double) AS col2"), + lit(fileNum).cast("long").as("col3"))); + appendAsFile(df); + } + }); } } diff --git a/spark/v3.3/spark/src/jmh/java/org/apache/iceberg/spark/source/orc/IcebergSourceFlatORCDataBenchmark.java b/spark/v3.3/spark/src/jmh/java/org/apache/iceberg/spark/source/orc/IcebergSourceFlatORCDataBenchmark.java index 329c9ffe7738..d0fdd8915780 
100644 --- a/spark/v3.3/spark/src/jmh/java/org/apache/iceberg/spark/source/orc/IcebergSourceFlatORCDataBenchmark.java +++ b/spark/v3.3/spark/src/jmh/java/org/apache/iceberg/spark/source/orc/IcebergSourceFlatORCDataBenchmark.java @@ -16,9 +16,11 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.source.orc; +import static org.apache.iceberg.types.Types.NestedField.optional; +import static org.apache.iceberg.types.Types.NestedField.required; + import java.util.Map; import org.apache.hadoop.conf.Configuration; import org.apache.iceberg.PartitionSpec; @@ -30,13 +32,10 @@ import org.apache.iceberg.spark.source.IcebergSourceBenchmark; import org.apache.iceberg.types.Types; -import static org.apache.iceberg.types.Types.NestedField.optional; -import static org.apache.iceberg.types.Types.NestedField.required; - - /** - * Same as {@link org.apache.iceberg.spark.source.IcebergSourceFlatDataBenchmark} but we disable the Timestamp with - * zone type for ORC performance tests as Spark native reader does not support ORC's TIMESTAMP_INSTANT type + * Same as {@link org.apache.iceberg.spark.source.IcebergSourceFlatDataBenchmark} but we disable the + * Timestamp with zone type for ORC performance tests as Spark native reader does not support ORC's + * TIMESTAMP_INSTANT type */ public abstract class IcebergSourceFlatORCDataBenchmark extends IcebergSourceBenchmark { @@ -47,17 +46,19 @@ protected Configuration initHadoopConf() { @Override protected final Table initTable() { - Schema schema = new Schema( - required(1, "longCol", Types.LongType.get()), - required(2, "intCol", Types.IntegerType.get()), - required(3, "floatCol", Types.FloatType.get()), - optional(4, "doubleCol", Types.DoubleType.get()), - optional(5, "decimalCol", Types.DecimalType.of(20, 5)), - optional(6, "dateCol", Types.DateType.get()), - // Disable timestamp column for ORC performance tests as Spark native reader does not support ORC's - // TIMESTAMP_INSTANT type - // optional(7, "timestampCol", Types.TimestampType.withZone()), - optional(8, "stringCol", Types.StringType.get())); + Schema schema = + new Schema( + required(1, "longCol", Types.LongType.get()), + required(2, "intCol", Types.IntegerType.get()), + required(3, "floatCol", Types.FloatType.get()), + optional(4, "doubleCol", Types.DoubleType.get()), + optional(5, "decimalCol", Types.DecimalType.of(20, 5)), + optional(6, "dateCol", Types.DateType.get()), + // Disable timestamp column for ORC performance tests as Spark native reader does not + // support ORC's + // TIMESTAMP_INSTANT type + // optional(7, "timestampCol", Types.TimestampType.withZone()), + optional(8, "stringCol", Types.StringType.get())); PartitionSpec partitionSpec = PartitionSpec.unpartitioned(); HadoopTables tables = new HadoopTables(hadoopConf()); Map properties = Maps.newHashMap(); diff --git a/spark/v3.3/spark/src/jmh/java/org/apache/iceberg/spark/source/orc/IcebergSourceFlatORCDataReadBenchmark.java b/spark/v3.3/spark/src/jmh/java/org/apache/iceberg/spark/source/orc/IcebergSourceFlatORCDataReadBenchmark.java index 46aa8f83999d..8ee467b509e0 100644 --- a/spark/v3.3/spark/src/jmh/java/org/apache/iceberg/spark/source/orc/IcebergSourceFlatORCDataReadBenchmark.java +++ b/spark/v3.3/spark/src/jmh/java/org/apache/iceberg/spark/source/orc/IcebergSourceFlatORCDataReadBenchmark.java @@ -16,9 +16,14 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.spark.source.orc; +import static org.apache.iceberg.TableProperties.DEFAULT_FILE_FORMAT; +import static org.apache.iceberg.TableProperties.SPLIT_OPEN_FILE_COST; +import static org.apache.spark.sql.functions.current_date; +import static org.apache.spark.sql.functions.date_add; +import static org.apache.spark.sql.functions.expr; + import java.io.IOException; import java.util.Map; import org.apache.iceberg.relocated.com.google.common.collect.Maps; @@ -31,18 +36,11 @@ import org.openjdk.jmh.annotations.TearDown; import org.openjdk.jmh.annotations.Threads; -import static org.apache.iceberg.TableProperties.DEFAULT_FILE_FORMAT; -import static org.apache.iceberg.TableProperties.SPLIT_OPEN_FILE_COST; -import static org.apache.spark.sql.functions.current_date; -import static org.apache.spark.sql.functions.date_add; -import static org.apache.spark.sql.functions.expr; - /** - * A benchmark that evaluates the performance of reading ORC data with a flat schema - * using Iceberg and the built-in file source in Spark. - *
<p> - * To run this benchmark for spark-3.3: - * <code> + * A benchmark that evaluates the performance of reading ORC data with a flat schema using Iceberg + * and the built-in file source in Spark. + * + * <p>
    To run this benchmark for spark-3.3: * ./gradlew -DsparkVersions=3.3 :iceberg-spark:iceberg-spark-3.3_2.12:jmh * -PjmhIncludeRegex=IcebergSourceFlatORCDataReadBenchmark * -PjmhOutputPath=benchmark/iceberg-source-flat-orc-data-read-benchmark-result.txt @@ -70,11 +68,13 @@ public void tearDownBenchmark() throws IOException { public void readIcebergNonVectorized() { Map tableProperties = Maps.newHashMap(); tableProperties.put(SPLIT_OPEN_FILE_COST, Integer.toString(128 * 1024 * 1024)); - withTableProperties(tableProperties, () -> { - String tableLocation = table().location(); - Dataset df = spark().read().format("iceberg").load(tableLocation); - materialize(df); - }); + withTableProperties( + tableProperties, + () -> { + String tableLocation = table().location(); + Dataset df = spark().read().format("iceberg").load(tableLocation); + materialize(df); + }); } @Benchmark @@ -82,12 +82,18 @@ public void readIcebergNonVectorized() { public void readIcebergVectorized() { Map tableProperties = Maps.newHashMap(); tableProperties.put(SPLIT_OPEN_FILE_COST, Integer.toString(128 * 1024 * 1024)); - withTableProperties(tableProperties, () -> { - String tableLocation = table().location(); - Dataset df = spark().read().option(SparkReadOptions.VECTORIZATION_ENABLED, "true") - .format("iceberg").load(tableLocation); - materialize(df); - }); + withTableProperties( + tableProperties, + () -> { + String tableLocation = table().location(); + Dataset df = + spark() + .read() + .option(SparkReadOptions.VECTORIZATION_ENABLED, "true") + .format("iceberg") + .load(tableLocation); + materialize(df); + }); } @Benchmark @@ -96,10 +102,12 @@ public void readFileSourceVectorized() { Map conf = Maps.newHashMap(); conf.put(SQLConf.ORC_VECTORIZED_READER_ENABLED().key(), "true"); conf.put(SQLConf.FILES_OPEN_COST_IN_BYTES().key(), Integer.toString(128 * 1024 * 1024)); - withSQLConf(conf, () -> { - Dataset df = spark().read().orc(dataLocation()); - materialize(df); - }); + withSQLConf( + conf, + () -> { + Dataset df = spark().read().orc(dataLocation()); + materialize(df); + }); } @Benchmark @@ -108,10 +116,12 @@ public void readFileSourceNonVectorized() { Map conf = Maps.newHashMap(); conf.put(SQLConf.ORC_VECTORIZED_READER_ENABLED().key(), "false"); conf.put(SQLConf.FILES_OPEN_COST_IN_BYTES().key(), Integer.toString(128 * 1024 * 1024)); - withSQLConf(conf, () -> { - Dataset df = spark().read().orc(dataLocation()); - materialize(df); - }); + withSQLConf( + conf, + () -> { + Dataset df = spark().read().orc(dataLocation()); + materialize(df); + }); } @Benchmark @@ -119,11 +129,13 @@ public void readFileSourceNonVectorized() { public void readWithProjectionIcebergNonVectorized() { Map tableProperties = Maps.newHashMap(); tableProperties.put(SPLIT_OPEN_FILE_COST, Integer.toString(128 * 1024 * 1024)); - withTableProperties(tableProperties, () -> { - String tableLocation = table().location(); - Dataset df = spark().read().format("iceberg").load(tableLocation).select("longCol"); - materialize(df); - }); + withTableProperties( + tableProperties, + () -> { + String tableLocation = table().location(); + Dataset df = spark().read().format("iceberg").load(tableLocation).select("longCol"); + materialize(df); + }); } @Benchmark @@ -131,25 +143,33 @@ public void readWithProjectionIcebergNonVectorized() { public void readWithProjectionIcebergVectorized() { Map tableProperties = Maps.newHashMap(); tableProperties.put(SPLIT_OPEN_FILE_COST, Integer.toString(128 * 1024 * 1024)); - withTableProperties(tableProperties, () -> { - String tableLocation 
= table().location(); - Dataset df = spark().read().option(SparkReadOptions.VECTORIZATION_ENABLED, "true") - .format("iceberg").load(tableLocation).select("longCol"); - materialize(df); - }); + withTableProperties( + tableProperties, + () -> { + String tableLocation = table().location(); + Dataset df = + spark() + .read() + .option(SparkReadOptions.VECTORIZATION_ENABLED, "true") + .format("iceberg") + .load(tableLocation) + .select("longCol"); + materialize(df); + }); } - @Benchmark @Threads(1) public void readWithProjectionFileSourceVectorized() { Map conf = Maps.newHashMap(); conf.put(SQLConf.ORC_VECTORIZED_READER_ENABLED().key(), "true"); conf.put(SQLConf.FILES_OPEN_COST_IN_BYTES().key(), Integer.toString(128 * 1024 * 1024)); - withSQLConf(conf, () -> { - Dataset df = spark().read().orc(dataLocation()).select("longCol"); - materialize(df); - }); + withSQLConf( + conf, + () -> { + Dataset df = spark().read().orc(dataLocation()).select("longCol"); + materialize(df); + }); } @Benchmark @@ -158,27 +178,33 @@ public void readWithProjectionFileSourceNonVectorized() { Map conf = Maps.newHashMap(); conf.put(SQLConf.ORC_VECTORIZED_READER_ENABLED().key(), "false"); conf.put(SQLConf.FILES_OPEN_COST_IN_BYTES().key(), Integer.toString(128 * 1024 * 1024)); - withSQLConf(conf, () -> { - Dataset df = spark().read().orc(dataLocation()).select("longCol"); - materialize(df); - }); + withSQLConf( + conf, + () -> { + Dataset df = spark().read().orc(dataLocation()).select("longCol"); + materialize(df); + }); } private void appendData() { Map tableProperties = Maps.newHashMap(); tableProperties.put(DEFAULT_FILE_FORMAT, "orc"); - withTableProperties(tableProperties, () -> { - for (int fileNum = 1; fileNum <= NUM_FILES; fileNum++) { - Dataset df = spark().range(NUM_ROWS) - .withColumnRenamed("id", "longCol") - .withColumn("intCol", expr("CAST(longCol AS INT)")) - .withColumn("floatCol", expr("CAST(longCol AS FLOAT)")) - .withColumn("doubleCol", expr("CAST(longCol AS DOUBLE)")) - .withColumn("decimalCol", expr("CAST(longCol AS DECIMAL(20, 5))")) - .withColumn("dateCol", date_add(current_date(), fileNum)) - .withColumn("stringCol", expr("CAST(dateCol AS STRING)")); - appendAsFile(df); - } - }); + withTableProperties( + tableProperties, + () -> { + for (int fileNum = 1; fileNum <= NUM_FILES; fileNum++) { + Dataset df = + spark() + .range(NUM_ROWS) + .withColumnRenamed("id", "longCol") + .withColumn("intCol", expr("CAST(longCol AS INT)")) + .withColumn("floatCol", expr("CAST(longCol AS FLOAT)")) + .withColumn("doubleCol", expr("CAST(longCol AS DOUBLE)")) + .withColumn("decimalCol", expr("CAST(longCol AS DECIMAL(20, 5))")) + .withColumn("dateCol", date_add(current_date(), fileNum)) + .withColumn("stringCol", expr("CAST(dateCol AS STRING)")); + appendAsFile(df); + } + }); } } diff --git a/spark/v3.3/spark/src/jmh/java/org/apache/iceberg/spark/source/orc/IcebergSourceNestedListORCDataWriteBenchmark.java b/spark/v3.3/spark/src/jmh/java/org/apache/iceberg/spark/source/orc/IcebergSourceNestedListORCDataWriteBenchmark.java index f4edce20ec99..15486113493a 100644 --- a/spark/v3.3/spark/src/jmh/java/org/apache/iceberg/spark/source/orc/IcebergSourceNestedListORCDataWriteBenchmark.java +++ b/spark/v3.3/spark/src/jmh/java/org/apache/iceberg/spark/source/orc/IcebergSourceNestedListORCDataWriteBenchmark.java @@ -16,9 +16,12 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.spark.source.orc; +import static org.apache.spark.sql.functions.array_repeat; +import static org.apache.spark.sql.functions.expr; +import static org.apache.spark.sql.functions.struct; + import java.io.IOException; import java.util.Map; import org.apache.iceberg.relocated.com.google.common.collect.Maps; @@ -32,22 +35,18 @@ import org.openjdk.jmh.annotations.TearDown; import org.openjdk.jmh.annotations.Threads; -import static org.apache.spark.sql.functions.array_repeat; -import static org.apache.spark.sql.functions.expr; -import static org.apache.spark.sql.functions.struct; - /** - * A benchmark that evaluates the performance of writing nested Parquet data using Iceberg - * and the built-in file source in Spark. - *
<p> - * To run this benchmark for spark-3.3: - * <code> + * A benchmark that evaluates the performance of writing nested Parquet data using Iceberg and the + * built-in file source in Spark. + * + * <p>
    To run this benchmark for spark-3.3: * ./gradlew -DsparkVersions=3.3 :iceberg-spark:iceberg-spark-3.3_2.12:jmh * -PjmhIncludeRegex=IcebergSourceNestedListORCDataWriteBenchmark * -PjmhOutputPath=benchmark/iceberg-source-nested-list-orc-data-write-benchmark-result.txt * */ -public class IcebergSourceNestedListORCDataWriteBenchmark extends IcebergSourceNestedListDataBenchmark { +public class IcebergSourceNestedListORCDataWriteBenchmark + extends IcebergSourceNestedListDataBenchmark { @Setup public void setupBenchmark() { @@ -67,8 +66,12 @@ public void tearDownBenchmark() throws IOException { @Threads(1) public void writeIceberg() { String tableLocation = table().location(); - benchmarkData().write().format("iceberg").option("write-format", "orc") - .mode(SaveMode.Append).save(tableLocation); + benchmarkData() + .write() + .format("iceberg") + .option("write-format", "orc") + .mode(SaveMode.Append) + .save(tableLocation); } @Benchmark @@ -76,11 +79,17 @@ public void writeIceberg() { public void writeIcebergDictionaryOff() { Map tableProperties = Maps.newHashMap(); tableProperties.put("orc.dictionary.key.threshold", "0"); - withTableProperties(tableProperties, () -> { - String tableLocation = table().location(); - benchmarkData().write().format("iceberg").option("write-format", "orc") - .mode(SaveMode.Append).save(tableLocation); - }); + withTableProperties( + tableProperties, + () -> { + String tableLocation = table().location(); + benchmarkData() + .write() + .format("iceberg") + .option("write-format", "orc") + .mode(SaveMode.Append) + .save(tableLocation); + }); } @Benchmark @@ -90,10 +99,11 @@ public void writeFileSource() { } private Dataset benchmarkData() { - return spark().range(numRows) - .withColumn("outerlist", array_repeat(struct( - expr("array_repeat(CAST(id AS string), 1000) AS innerlist")), - 10)) + return spark() + .range(numRows) + .withColumn( + "outerlist", + array_repeat(struct(expr("array_repeat(CAST(id AS string), 1000) AS innerlist")), 10)) .coalesce(1); } } diff --git a/spark/v3.3/spark/src/jmh/java/org/apache/iceberg/spark/source/orc/IcebergSourceNestedORCDataReadBenchmark.java b/spark/v3.3/spark/src/jmh/java/org/apache/iceberg/spark/source/orc/IcebergSourceNestedORCDataReadBenchmark.java index dd1122c3abd6..c651f9eea8c7 100644 --- a/spark/v3.3/spark/src/jmh/java/org/apache/iceberg/spark/source/orc/IcebergSourceNestedORCDataReadBenchmark.java +++ b/spark/v3.3/spark/src/jmh/java/org/apache/iceberg/spark/source/orc/IcebergSourceNestedORCDataReadBenchmark.java @@ -16,9 +16,14 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.spark.source.orc; +import static org.apache.iceberg.TableProperties.DEFAULT_FILE_FORMAT; +import static org.apache.iceberg.TableProperties.SPLIT_OPEN_FILE_COST; +import static org.apache.spark.sql.functions.expr; +import static org.apache.spark.sql.functions.lit; +import static org.apache.spark.sql.functions.struct; + import java.io.IOException; import java.util.Map; import org.apache.iceberg.relocated.com.google.common.collect.Maps; @@ -32,19 +37,11 @@ import org.openjdk.jmh.annotations.TearDown; import org.openjdk.jmh.annotations.Threads; -import static org.apache.iceberg.TableProperties.DEFAULT_FILE_FORMAT; -import static org.apache.iceberg.TableProperties.SPLIT_OPEN_FILE_COST; -import static org.apache.spark.sql.functions.expr; -import static org.apache.spark.sql.functions.lit; -import static org.apache.spark.sql.functions.struct; - - /** - * A benchmark that evaluates the performance of reading ORC data with a flat schema - * using Iceberg and the built-in file source in Spark. - *
<p> - * To run this benchmark for spark-3.3: - * <code> + * A benchmark that evaluates the performance of reading ORC data with a flat schema using Iceberg + * and the built-in file source in Spark. + * + * <p>
    To run this benchmark for spark-3.3: * ./gradlew -DsparkVersions=3.3 :iceberg-spark:iceberg-spark-3.3_2.12:jmh * -PjmhIncludeRegex=IcebergSourceNestedORCDataReadBenchmark * -PjmhOutputPath=benchmark/iceberg-source-nested-orc-data-read-benchmark-result.txt @@ -72,11 +69,13 @@ public void tearDownBenchmark() throws IOException { public void readIcebergNonVectorized() { Map tableProperties = Maps.newHashMap(); tableProperties.put(SPLIT_OPEN_FILE_COST, Integer.toString(128 * 1024 * 1024)); - withTableProperties(tableProperties, () -> { - String tableLocation = table().location(); - Dataset df = spark().read().format("iceberg").load(tableLocation); - materialize(df); - }); + withTableProperties( + tableProperties, + () -> { + String tableLocation = table().location(); + Dataset df = spark().read().format("iceberg").load(tableLocation); + materialize(df); + }); } @Benchmark @@ -84,12 +83,18 @@ public void readIcebergNonVectorized() { public void readIcebergVectorized() { Map tableProperties = Maps.newHashMap(); tableProperties.put(SPLIT_OPEN_FILE_COST, Integer.toString(128 * 1024 * 1024)); - withTableProperties(tableProperties, () -> { - String tableLocation = table().location(); - Dataset df = spark().read().option(SparkReadOptions.VECTORIZATION_ENABLED, "true") - .format("iceberg").load(tableLocation); - materialize(df); - }); + withTableProperties( + tableProperties, + () -> { + String tableLocation = table().location(); + Dataset df = + spark() + .read() + .option(SparkReadOptions.VECTORIZATION_ENABLED, "true") + .format("iceberg") + .load(tableLocation); + materialize(df); + }); } @Benchmark @@ -98,10 +103,12 @@ public void readFileSourceNonVectorized() { Map conf = Maps.newHashMap(); conf.put(SQLConf.ORC_VECTORIZED_READER_ENABLED().key(), "false"); conf.put(SQLConf.FILES_OPEN_COST_IN_BYTES().key(), Integer.toString(128 * 1024 * 1024)); - withSQLConf(conf, () -> { - Dataset df = spark().read().orc(dataLocation()); - materialize(df); - }); + withSQLConf( + conf, + () -> { + Dataset df = spark().read().orc(dataLocation()); + materialize(df); + }); } @Benchmark @@ -109,11 +116,14 @@ public void readFileSourceNonVectorized() { public void readWithProjectionIcebergNonVectorized() { Map tableProperties = Maps.newHashMap(); tableProperties.put(SPLIT_OPEN_FILE_COST, Integer.toString(128 * 1024 * 1024)); - withTableProperties(tableProperties, () -> { - String tableLocation = table().location(); - Dataset df = spark().read().format("iceberg").load(tableLocation).selectExpr("nested.col3"); - materialize(df); - }); + withTableProperties( + tableProperties, + () -> { + String tableLocation = table().location(); + Dataset df = + spark().read().format("iceberg").load(tableLocation).selectExpr("nested.col3"); + materialize(df); + }); } @Benchmark @@ -121,12 +131,19 @@ public void readWithProjectionIcebergNonVectorized() { public void readWithProjectionIcebergVectorized() { Map tableProperties = Maps.newHashMap(); tableProperties.put(SPLIT_OPEN_FILE_COST, Integer.toString(128 * 1024 * 1024)); - withTableProperties(tableProperties, () -> { - String tableLocation = table().location(); - Dataset df = spark().read().option(SparkReadOptions.VECTORIZATION_ENABLED, "true") - .format("iceberg").load(tableLocation).selectExpr("nested.col3"); - materialize(df); - }); + withTableProperties( + tableProperties, + () -> { + String tableLocation = table().location(); + Dataset df = + spark() + .read() + .option(SparkReadOptions.VECTORIZATION_ENABLED, "true") + .format("iceberg") + .load(tableLocation) + 
.selectExpr("nested.col3"); + materialize(df); + }); } @Benchmark @@ -135,27 +152,32 @@ public void readWithProjectionFileSourceNonVectorized() { Map conf = Maps.newHashMap(); conf.put(SQLConf.ORC_VECTORIZED_READER_ENABLED().key(), "false"); conf.put(SQLConf.FILES_OPEN_COST_IN_BYTES().key(), Integer.toString(128 * 1024 * 1024)); - withSQLConf(conf, () -> { - Dataset df = spark().read().orc(dataLocation()).selectExpr("nested.col3"); - materialize(df); - }); + withSQLConf( + conf, + () -> { + Dataset df = spark().read().orc(dataLocation()).selectExpr("nested.col3"); + materialize(df); + }); } private void appendData() { Map tableProperties = Maps.newHashMap(); tableProperties.put(DEFAULT_FILE_FORMAT, "orc"); - withTableProperties(tableProperties, () -> { - for (int fileNum = 0; fileNum < NUM_FILES; fileNum++) { - Dataset df = spark().range(NUM_ROWS) - .withColumn( - "nested", - struct( - expr("CAST(id AS string) AS col1"), - expr("CAST(id AS double) AS col2"), - lit(fileNum).cast("long").as("col3") - )); - appendAsFile(df); - } - }); + withTableProperties( + tableProperties, + () -> { + for (int fileNum = 0; fileNum < NUM_FILES; fileNum++) { + Dataset df = + spark() + .range(NUM_ROWS) + .withColumn( + "nested", + struct( + expr("CAST(id AS string) AS col1"), + expr("CAST(id AS double) AS col2"), + lit(fileNum).cast("long").as("col3"))); + appendAsFile(df); + } + }); } } diff --git a/spark/v3.3/spark/src/jmh/java/org/apache/iceberg/spark/source/parquet/IcebergSourceFlatParquetDataFilterBenchmark.java b/spark/v3.3/spark/src/jmh/java/org/apache/iceberg/spark/source/parquet/IcebergSourceFlatParquetDataFilterBenchmark.java index 885f9b4f0aa6..1633709f4cd2 100644 --- a/spark/v3.3/spark/src/jmh/java/org/apache/iceberg/spark/source/parquet/IcebergSourceFlatParquetDataFilterBenchmark.java +++ b/spark/v3.3/spark/src/jmh/java/org/apache/iceberg/spark/source/parquet/IcebergSourceFlatParquetDataFilterBenchmark.java @@ -16,9 +16,13 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.source.parquet; +import static org.apache.iceberg.TableProperties.SPLIT_OPEN_FILE_COST; +import static org.apache.spark.sql.functions.current_date; +import static org.apache.spark.sql.functions.date_add; +import static org.apache.spark.sql.functions.expr; + import java.io.IOException; import java.util.Map; import org.apache.iceberg.relocated.com.google.common.collect.Maps; @@ -31,21 +35,15 @@ import org.openjdk.jmh.annotations.TearDown; import org.openjdk.jmh.annotations.Threads; -import static org.apache.iceberg.TableProperties.SPLIT_OPEN_FILE_COST; -import static org.apache.spark.sql.functions.current_date; -import static org.apache.spark.sql.functions.date_add; -import static org.apache.spark.sql.functions.expr; - /** * A benchmark that evaluates the file skipping capabilities in the Spark data source for Iceberg. * - * This class uses a dataset with a flat schema, where the records are clustered according to the + *

    This class uses a dataset with a flat schema, where the records are clustered according to the * column used in the filter predicate. * - * The performance is compared to the built-in file source in Spark. + *

    The performance is compared to the built-in file source in Spark. * - * To run this benchmark for spark-3.3: - * + *

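Each Iceberg read in these filter benchmarks is paired with a read of the same files through Spark's built-in Parquet source, toggled via withSQLConf. A trimmed sketch of the non-vectorized variant, using the same base-class helpers and the FILTER_COND constant defined on this class:

  @Benchmark
  @Threads(1)
  public void readWithFilterFileSourceNonVectorized() {
    Map<String, String> conf = Maps.newHashMap();
    conf.put(SQLConf.PARQUET_VECTORIZED_READER_ENABLED().key(), "false");
    conf.put(SQLConf.FILES_OPEN_COST_IN_BYTES().key(), Integer.toString(128 * 1024 * 1024));
    withSQLConf(
        conf,
        () -> {
          // same predicate as the Iceberg variant, but read straight from the data directory
          Dataset<Row> df = spark().read().parquet(dataLocation()).filter(FILTER_COND);
          materialize(df);
        });
  }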
    To run this benchmark for spark-3.3: * ./gradlew -DsparkVersions=3.3 :iceberg-spark:iceberg-spark-3.3_2.12:jmh * -PjmhIncludeRegex=IcebergSourceFlatParquetDataFilterBenchmark * -PjmhOutputPath=benchmark/iceberg-source-flat-parquet-data-filter-benchmark-result.txt @@ -74,11 +72,14 @@ public void tearDownBenchmark() throws IOException { public void readWithFilterIceberg() { Map tableProperties = Maps.newHashMap(); tableProperties.put(SPLIT_OPEN_FILE_COST, Integer.toString(128 * 1024 * 1024)); - withTableProperties(tableProperties, () -> { - String tableLocation = table().location(); - Dataset df = spark().read().format("iceberg").load(tableLocation).filter(FILTER_COND); - materialize(df); - }); + withTableProperties( + tableProperties, + () -> { + String tableLocation = table().location(); + Dataset df = + spark().read().format("iceberg").load(tableLocation).filter(FILTER_COND); + materialize(df); + }); } @Benchmark @@ -87,10 +88,12 @@ public void readWithFilterFileSourceVectorized() { Map conf = Maps.newHashMap(); conf.put(SQLConf.PARQUET_VECTORIZED_READER_ENABLED().key(), "true"); conf.put(SQLConf.FILES_OPEN_COST_IN_BYTES().key(), Integer.toString(128 * 1024 * 1024)); - withSQLConf(conf, () -> { - Dataset df = spark().read().parquet(dataLocation()).filter(FILTER_COND); - materialize(df); - }); + withSQLConf( + conf, + () -> { + Dataset df = spark().read().parquet(dataLocation()).filter(FILTER_COND); + materialize(df); + }); } @Benchmark @@ -99,23 +102,27 @@ public void readWithFilterFileSourceNonVectorized() { Map conf = Maps.newHashMap(); conf.put(SQLConf.PARQUET_VECTORIZED_READER_ENABLED().key(), "false"); conf.put(SQLConf.FILES_OPEN_COST_IN_BYTES().key(), Integer.toString(128 * 1024 * 1024)); - withSQLConf(conf, () -> { - Dataset df = spark().read().parquet(dataLocation()).filter(FILTER_COND); - materialize(df); - }); + withSQLConf( + conf, + () -> { + Dataset df = spark().read().parquet(dataLocation()).filter(FILTER_COND); + materialize(df); + }); } private void appendData() { for (int fileNum = 1; fileNum < NUM_FILES; fileNum++) { - Dataset df = spark().range(NUM_ROWS) - .withColumnRenamed("id", "longCol") - .withColumn("intCol", expr("CAST(longCol AS INT)")) - .withColumn("floatCol", expr("CAST(longCol AS FLOAT)")) - .withColumn("doubleCol", expr("CAST(longCol AS DOUBLE)")) - .withColumn("decimalCol", expr("CAST(longCol AS DECIMAL(20, 5))")) - .withColumn("dateCol", date_add(current_date(), fileNum)) - .withColumn("timestampCol", expr("TO_TIMESTAMP(dateCol)")) - .withColumn("stringCol", expr("CAST(dateCol AS STRING)")); + Dataset df = + spark() + .range(NUM_ROWS) + .withColumnRenamed("id", "longCol") + .withColumn("intCol", expr("CAST(longCol AS INT)")) + .withColumn("floatCol", expr("CAST(longCol AS FLOAT)")) + .withColumn("doubleCol", expr("CAST(longCol AS DOUBLE)")) + .withColumn("decimalCol", expr("CAST(longCol AS DECIMAL(20, 5))")) + .withColumn("dateCol", date_add(current_date(), fileNum)) + .withColumn("timestampCol", expr("TO_TIMESTAMP(dateCol)")) + .withColumn("stringCol", expr("CAST(dateCol AS STRING)")); appendAsFile(df); } } diff --git a/spark/v3.3/spark/src/jmh/java/org/apache/iceberg/spark/source/parquet/IcebergSourceFlatParquetDataReadBenchmark.java b/spark/v3.3/spark/src/jmh/java/org/apache/iceberg/spark/source/parquet/IcebergSourceFlatParquetDataReadBenchmark.java index bbbdd288d193..1babed8c5c79 100644 --- a/spark/v3.3/spark/src/jmh/java/org/apache/iceberg/spark/source/parquet/IcebergSourceFlatParquetDataReadBenchmark.java +++ 
b/spark/v3.3/spark/src/jmh/java/org/apache/iceberg/spark/source/parquet/IcebergSourceFlatParquetDataReadBenchmark.java @@ -16,9 +16,13 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.source.parquet; +import static org.apache.iceberg.TableProperties.SPLIT_OPEN_FILE_COST; +import static org.apache.spark.sql.functions.current_date; +import static org.apache.spark.sql.functions.date_add; +import static org.apache.spark.sql.functions.expr; + import java.io.IOException; import java.util.Map; import org.apache.iceberg.relocated.com.google.common.collect.Maps; @@ -31,17 +35,11 @@ import org.openjdk.jmh.annotations.TearDown; import org.openjdk.jmh.annotations.Threads; -import static org.apache.iceberg.TableProperties.SPLIT_OPEN_FILE_COST; -import static org.apache.spark.sql.functions.current_date; -import static org.apache.spark.sql.functions.date_add; -import static org.apache.spark.sql.functions.expr; - /** - * A benchmark that evaluates the performance of reading Parquet data with a flat schema - * using Iceberg and the built-in file source in Spark. + * A benchmark that evaluates the performance of reading Parquet data with a flat schema using + * Iceberg and the built-in file source in Spark. * - * To run this benchmark for spark-3.3: - * + *

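The flat-schema benchmarks generate their input once per trial with a chain of withColumn calls; the reformatting only changes where the chain breaks. The appendData loop from the read benchmark, reproduced as a sketch (NUM_FILES and NUM_ROWS are constants on these benchmark classes, appendAsFile and spark come from the base class, and expr, date_add and current_date are the org.apache.spark.sql.functions statics imported at the top of the file):

  private void appendData() {
    for (int fileNum = 1; fileNum <= NUM_FILES; fileNum++) {
      Dataset<Row> df =
          spark()
              .range(NUM_ROWS)
              .withColumnRenamed("id", "longCol")
              .withColumn("intCol", expr("CAST(longCol AS INT)"))
              .withColumn("floatCol", expr("CAST(longCol AS FLOAT)"))
              .withColumn("doubleCol", expr("CAST(longCol AS DOUBLE)"))
              .withColumn("decimalCol", expr("CAST(longCol AS DECIMAL(20, 5))"))
              // dateCol differs per file, so the data is clustered file by file
              .withColumn("dateCol", date_add(current_date(), fileNum))
              .withColumn("timestampCol", expr("TO_TIMESTAMP(dateCol)"))
              .withColumn("stringCol", expr("CAST(dateCol AS STRING)"));
      appendAsFile(df);
    }
  }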
    To run this benchmark for spark-3.3: * ./gradlew -DsparkVersions=3.3 :iceberg-spark:iceberg-spark-3.3_2.12:jmh * -PjmhIncludeRegex=IcebergSourceFlatParquetDataReadBenchmark * -PjmhOutputPath=benchmark/iceberg-source-flat-parquet-data-read-benchmark-result.txt @@ -69,11 +67,13 @@ public void tearDownBenchmark() throws IOException { public void readIceberg() { Map tableProperties = Maps.newHashMap(); tableProperties.put(SPLIT_OPEN_FILE_COST, Integer.toString(128 * 1024 * 1024)); - withTableProperties(tableProperties, () -> { - String tableLocation = table().location(); - Dataset df = spark().read().format("iceberg").load(tableLocation); - materialize(df); - }); + withTableProperties( + tableProperties, + () -> { + String tableLocation = table().location(); + Dataset df = spark().read().format("iceberg").load(tableLocation); + materialize(df); + }); } @Benchmark @@ -82,10 +82,12 @@ public void readFileSourceVectorized() { Map conf = Maps.newHashMap(); conf.put(SQLConf.PARQUET_VECTORIZED_READER_ENABLED().key(), "true"); conf.put(SQLConf.FILES_OPEN_COST_IN_BYTES().key(), Integer.toString(128 * 1024 * 1024)); - withSQLConf(conf, () -> { - Dataset df = spark().read().parquet(dataLocation()); - materialize(df); - }); + withSQLConf( + conf, + () -> { + Dataset df = spark().read().parquet(dataLocation()); + materialize(df); + }); } @Benchmark @@ -94,10 +96,12 @@ public void readFileSourceNonVectorized() { Map conf = Maps.newHashMap(); conf.put(SQLConf.PARQUET_VECTORIZED_READER_ENABLED().key(), "false"); conf.put(SQLConf.FILES_OPEN_COST_IN_BYTES().key(), Integer.toString(128 * 1024 * 1024)); - withSQLConf(conf, () -> { - Dataset df = spark().read().parquet(dataLocation()); - materialize(df); - }); + withSQLConf( + conf, + () -> { + Dataset df = spark().read().parquet(dataLocation()); + materialize(df); + }); } @Benchmark @@ -105,11 +109,13 @@ public void readFileSourceNonVectorized() { public void readWithProjectionIceberg() { Map tableProperties = Maps.newHashMap(); tableProperties.put(SPLIT_OPEN_FILE_COST, Integer.toString(128 * 1024 * 1024)); - withTableProperties(tableProperties, () -> { - String tableLocation = table().location(); - Dataset df = spark().read().format("iceberg").load(tableLocation).select("longCol"); - materialize(df); - }); + withTableProperties( + tableProperties, + () -> { + String tableLocation = table().location(); + Dataset df = spark().read().format("iceberg").load(tableLocation).select("longCol"); + materialize(df); + }); } @Benchmark @@ -118,10 +124,12 @@ public void readWithProjectionFileSourceVectorized() { Map conf = Maps.newHashMap(); conf.put(SQLConf.PARQUET_VECTORIZED_READER_ENABLED().key(), "true"); conf.put(SQLConf.FILES_OPEN_COST_IN_BYTES().key(), Integer.toString(128 * 1024 * 1024)); - withSQLConf(conf, () -> { - Dataset df = spark().read().parquet(dataLocation()).select("longCol"); - materialize(df); - }); + withSQLConf( + conf, + () -> { + Dataset df = spark().read().parquet(dataLocation()).select("longCol"); + materialize(df); + }); } @Benchmark @@ -130,23 +138,27 @@ public void readWithProjectionFileSourceNonVectorized() { Map conf = Maps.newHashMap(); conf.put(SQLConf.PARQUET_VECTORIZED_READER_ENABLED().key(), "false"); conf.put(SQLConf.FILES_OPEN_COST_IN_BYTES().key(), Integer.toString(128 * 1024 * 1024)); - withSQLConf(conf, () -> { - Dataset df = spark().read().parquet(dataLocation()).select("longCol"); - materialize(df); - }); + withSQLConf( + conf, + () -> { + Dataset df = spark().read().parquet(dataLocation()).select("longCol"); + 
materialize(df); + }); } private void appendData() { for (int fileNum = 1; fileNum <= NUM_FILES; fileNum++) { - Dataset df = spark().range(NUM_ROWS) - .withColumnRenamed("id", "longCol") - .withColumn("intCol", expr("CAST(longCol AS INT)")) - .withColumn("floatCol", expr("CAST(longCol AS FLOAT)")) - .withColumn("doubleCol", expr("CAST(longCol AS DOUBLE)")) - .withColumn("decimalCol", expr("CAST(longCol AS DECIMAL(20, 5))")) - .withColumn("dateCol", date_add(current_date(), fileNum)) - .withColumn("timestampCol", expr("TO_TIMESTAMP(dateCol)")) - .withColumn("stringCol", expr("CAST(dateCol AS STRING)")); + Dataset df = + spark() + .range(NUM_ROWS) + .withColumnRenamed("id", "longCol") + .withColumn("intCol", expr("CAST(longCol AS INT)")) + .withColumn("floatCol", expr("CAST(longCol AS FLOAT)")) + .withColumn("doubleCol", expr("CAST(longCol AS DOUBLE)")) + .withColumn("decimalCol", expr("CAST(longCol AS DECIMAL(20, 5))")) + .withColumn("dateCol", date_add(current_date(), fileNum)) + .withColumn("timestampCol", expr("TO_TIMESTAMP(dateCol)")) + .withColumn("stringCol", expr("CAST(dateCol AS STRING)")); appendAsFile(df); } } diff --git a/spark/v3.3/spark/src/jmh/java/org/apache/iceberg/spark/source/parquet/IcebergSourceFlatParquetDataWriteBenchmark.java b/spark/v3.3/spark/src/jmh/java/org/apache/iceberg/spark/source/parquet/IcebergSourceFlatParquetDataWriteBenchmark.java index dd3f5080f5cd..ab50fd55d451 100644 --- a/spark/v3.3/spark/src/jmh/java/org/apache/iceberg/spark/source/parquet/IcebergSourceFlatParquetDataWriteBenchmark.java +++ b/spark/v3.3/spark/src/jmh/java/org/apache/iceberg/spark/source/parquet/IcebergSourceFlatParquetDataWriteBenchmark.java @@ -16,9 +16,10 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.source.parquet; +import static org.apache.spark.sql.functions.expr; + import java.io.IOException; import java.util.Map; import org.apache.iceberg.relocated.com.google.common.collect.Maps; @@ -32,14 +33,11 @@ import org.openjdk.jmh.annotations.TearDown; import org.openjdk.jmh.annotations.Threads; -import static org.apache.spark.sql.functions.expr; - /** - * A benchmark that evaluates the performance of writing Parquet data with a flat schema - * using Iceberg and the built-in file source in Spark. + * A benchmark that evaluates the performance of writing Parquet data with a flat schema using + * Iceberg and the built-in file source in Spark. * - * To run this benchmark for spark-3.3: - * + *

    To run this benchmark for spark-3.3: * ./gradlew -DsparkVersions=3.3 :iceberg-spark:iceberg-spark-3.3_2.12:jmh * -PjmhIncludeRegex=IcebergSourceFlatParquetDataWriteBenchmark * -PjmhOutputPath=benchmark/iceberg-source-flat-parquet-data-write-benchmark-result.txt @@ -76,7 +74,8 @@ public void writeFileSource() { } private Dataset benchmarkData() { - return spark().range(NUM_ROWS) + return spark() + .range(NUM_ROWS) .withColumnRenamed("id", "longCol") .withColumn("intCol", expr("CAST(longCol AS INT)")) .withColumn("floatCol", expr("CAST(longCol AS FLOAT)")) diff --git a/spark/v3.3/spark/src/jmh/java/org/apache/iceberg/spark/source/parquet/IcebergSourceNestedListParquetDataWriteBenchmark.java b/spark/v3.3/spark/src/jmh/java/org/apache/iceberg/spark/source/parquet/IcebergSourceNestedListParquetDataWriteBenchmark.java index a06033944eda..47d866f1b803 100644 --- a/spark/v3.3/spark/src/jmh/java/org/apache/iceberg/spark/source/parquet/IcebergSourceNestedListParquetDataWriteBenchmark.java +++ b/spark/v3.3/spark/src/jmh/java/org/apache/iceberg/spark/source/parquet/IcebergSourceNestedListParquetDataWriteBenchmark.java @@ -16,9 +16,12 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.source.parquet; +import static org.apache.spark.sql.functions.array_repeat; +import static org.apache.spark.sql.functions.expr; +import static org.apache.spark.sql.functions.struct; + import java.io.IOException; import java.util.Map; import org.apache.iceberg.relocated.com.google.common.collect.Maps; @@ -33,22 +36,18 @@ import org.openjdk.jmh.annotations.TearDown; import org.openjdk.jmh.annotations.Threads; -import static org.apache.spark.sql.functions.array_repeat; -import static org.apache.spark.sql.functions.expr; -import static org.apache.spark.sql.functions.struct; - /** - * A benchmark that evaluates the performance of writing nested Parquet data using Iceberg - * and the built-in file source in Spark. + * A benchmark that evaluates the performance of writing nested Parquet data using Iceberg and the + * built-in file source in Spark. * - * To run this benchmark for spark-3.3: - * + *

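The nested-list write benchmark builds a deliberately deep value per row; the reformatted chain reads as below (numRows is a field of this benchmark class, and array_repeat, struct and expr are the statically imported Spark functions):

  private Dataset<Row> benchmarkData() {
    return spark()
        .range(numRows)
        .withColumn(
            "outerlist",
            // 10 outer elements, each a struct wrapping a 1000-element list of strings
            array_repeat(struct(expr("array_repeat(CAST(id AS string), 1000) AS innerlist")), 10))
        .coalesce(1);
  }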
    To run this benchmark for spark-3.3: * ./gradlew -DsparkVersions=3.3 :iceberg-spark:iceberg-spark-3.3_2.12:jmh * -PjmhIncludeRegex=IcebergSourceNestedListParquetDataWriteBenchmark * -PjmhOutputPath=benchmark/iceberg-source-nested-list-parquet-data-write-benchmark-result.txt * */ -public class IcebergSourceNestedListParquetDataWriteBenchmark extends IcebergSourceNestedListDataBenchmark { +public class IcebergSourceNestedListParquetDataWriteBenchmark + extends IcebergSourceNestedListDataBenchmark { @Setup public void setupBenchmark() { @@ -80,10 +79,11 @@ public void writeFileSource() { } private Dataset benchmarkData() { - return spark().range(numRows) - .withColumn("outerlist", array_repeat(struct( - expr("array_repeat(CAST(id AS string), 1000) AS innerlist")), - 10)) + return spark() + .range(numRows) + .withColumn( + "outerlist", + array_repeat(struct(expr("array_repeat(CAST(id AS string), 1000) AS innerlist")), 10)) .coalesce(1); } } diff --git a/spark/v3.3/spark/src/jmh/java/org/apache/iceberg/spark/source/parquet/IcebergSourceNestedParquetDataFilterBenchmark.java b/spark/v3.3/spark/src/jmh/java/org/apache/iceberg/spark/source/parquet/IcebergSourceNestedParquetDataFilterBenchmark.java index 1036bc5f1b99..7da6499c14a3 100644 --- a/spark/v3.3/spark/src/jmh/java/org/apache/iceberg/spark/source/parquet/IcebergSourceNestedParquetDataFilterBenchmark.java +++ b/spark/v3.3/spark/src/jmh/java/org/apache/iceberg/spark/source/parquet/IcebergSourceNestedParquetDataFilterBenchmark.java @@ -16,9 +16,13 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.source.parquet; +import static org.apache.iceberg.TableProperties.SPLIT_OPEN_FILE_COST; +import static org.apache.spark.sql.functions.expr; +import static org.apache.spark.sql.functions.lit; +import static org.apache.spark.sql.functions.struct; + import java.io.IOException; import java.util.Map; import org.apache.iceberg.relocated.com.google.common.collect.Maps; @@ -31,27 +35,22 @@ import org.openjdk.jmh.annotations.TearDown; import org.openjdk.jmh.annotations.Threads; -import static org.apache.iceberg.TableProperties.SPLIT_OPEN_FILE_COST; -import static org.apache.spark.sql.functions.expr; -import static org.apache.spark.sql.functions.lit; -import static org.apache.spark.sql.functions.struct; - /** * A benchmark that evaluates the file skipping capabilities in the Spark data source for Iceberg. * - * This class uses a dataset with nested data, where the records are clustered according to the + *

    This class uses a dataset with nested data, where the records are clustered according to the * column used in the filter predicate. * - * The performance is compared to the built-in file source in Spark. + *

    The performance is compared to the built-in file source in Spark. * - * To run this benchmark for spark-3.3: - * + *

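The nested benchmarks write a struct column whose third field is constant within each file, which is what gives a predicate on nested.col3 something to prune. The appendData loop, as it reads after the reformatting in the hunk that follows (struct, expr and lit are the statically imported Spark functions; NUM_FILES, NUM_ROWS and appendAsFile are as in the other benchmarks):

  private void appendData() {
    for (int fileNum = 1; fileNum <= NUM_FILES; fileNum++) {
      Dataset<Row> df =
          spark()
              .range(NUM_ROWS)
              .withColumn(
                  "nested",
                  struct(
                      expr("CAST(id AS string) AS col1"),
                      expr("CAST(id AS double) AS col2"),
                      // constant per file, so a filter on nested.col3 can skip whole files
                      lit(fileNum).cast("long").as("col3")));
      appendAsFile(df);
    }
  }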
    To run this benchmark for spark-3.3: * ./gradlew -DsparkVersions=3.3 :iceberg-spark:iceberg-spark-3.3_2.12:jmh * -PjmhIncludeRegex=IcebergSourceNestedParquetDataFilterBenchmark * -PjmhOutputPath=benchmark/iceberg-source-nested-parquet-data-filter-benchmark-result.txt * */ -public class IcebergSourceNestedParquetDataFilterBenchmark extends IcebergSourceNestedDataBenchmark { +public class IcebergSourceNestedParquetDataFilterBenchmark + extends IcebergSourceNestedDataBenchmark { private static final String FILTER_COND = "nested.col3 == 0"; private static final int NUM_FILES = 500; @@ -74,11 +73,14 @@ public void tearDownBenchmark() throws IOException { public void readWithFilterIceberg() { Map tableProperties = Maps.newHashMap(); tableProperties.put(SPLIT_OPEN_FILE_COST, Integer.toString(128 * 1024 * 1024)); - withTableProperties(tableProperties, () -> { - String tableLocation = table().location(); - Dataset df = spark().read().format("iceberg").load(tableLocation).filter(FILTER_COND); - materialize(df); - }); + withTableProperties( + tableProperties, + () -> { + String tableLocation = table().location(); + Dataset df = + spark().read().format("iceberg").load(tableLocation).filter(FILTER_COND); + materialize(df); + }); } @Benchmark @@ -87,10 +89,12 @@ public void readWithFilterFileSourceVectorized() { Map conf = Maps.newHashMap(); conf.put(SQLConf.PARQUET_VECTORIZED_READER_ENABLED().key(), "true"); conf.put(SQLConf.FILES_OPEN_COST_IN_BYTES().key(), Integer.toString(128 * 1024 * 1024)); - withSQLConf(conf, () -> { - Dataset df = spark().read().parquet(dataLocation()).filter(FILTER_COND); - materialize(df); - }); + withSQLConf( + conf, + () -> { + Dataset df = spark().read().parquet(dataLocation()).filter(FILTER_COND); + materialize(df); + }); } @Benchmark @@ -99,22 +103,25 @@ public void readWithFilterFileSourceNonVectorized() { Map conf = Maps.newHashMap(); conf.put(SQLConf.PARQUET_VECTORIZED_READER_ENABLED().key(), "false"); conf.put(SQLConf.FILES_OPEN_COST_IN_BYTES().key(), Integer.toString(128 * 1024 * 1024)); - withSQLConf(conf, () -> { - Dataset df = spark().read().parquet(dataLocation()).filter(FILTER_COND); - materialize(df); - }); + withSQLConf( + conf, + () -> { + Dataset df = spark().read().parquet(dataLocation()).filter(FILTER_COND); + materialize(df); + }); } private void appendData() { for (int fileNum = 1; fileNum <= NUM_FILES; fileNum++) { - Dataset df = spark().range(NUM_ROWS) - .withColumn( - "nested", - struct( - expr("CAST(id AS string) AS col1"), - expr("CAST(id AS double) AS col2"), - lit(fileNum).cast("long").as("col3") - )); + Dataset df = + spark() + .range(NUM_ROWS) + .withColumn( + "nested", + struct( + expr("CAST(id AS string) AS col1"), + expr("CAST(id AS double) AS col2"), + lit(fileNum).cast("long").as("col3"))); appendAsFile(df); } } diff --git a/spark/v3.3/spark/src/jmh/java/org/apache/iceberg/spark/source/parquet/IcebergSourceNestedParquetDataReadBenchmark.java b/spark/v3.3/spark/src/jmh/java/org/apache/iceberg/spark/source/parquet/IcebergSourceNestedParquetDataReadBenchmark.java index fdd1c217004b..e55717fdc442 100644 --- a/spark/v3.3/spark/src/jmh/java/org/apache/iceberg/spark/source/parquet/IcebergSourceNestedParquetDataReadBenchmark.java +++ b/spark/v3.3/spark/src/jmh/java/org/apache/iceberg/spark/source/parquet/IcebergSourceNestedParquetDataReadBenchmark.java @@ -16,9 +16,13 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.spark.source.parquet; +import static org.apache.iceberg.TableProperties.SPLIT_OPEN_FILE_COST; +import static org.apache.spark.sql.functions.expr; +import static org.apache.spark.sql.functions.lit; +import static org.apache.spark.sql.functions.struct; + import java.io.IOException; import java.util.Map; import org.apache.iceberg.relocated.com.google.common.collect.Maps; @@ -31,17 +35,11 @@ import org.openjdk.jmh.annotations.TearDown; import org.openjdk.jmh.annotations.Threads; -import static org.apache.iceberg.TableProperties.SPLIT_OPEN_FILE_COST; -import static org.apache.spark.sql.functions.expr; -import static org.apache.spark.sql.functions.lit; -import static org.apache.spark.sql.functions.struct; - /** - * A benchmark that evaluates the performance of reading nested Parquet data using Iceberg - * and the built-in file source in Spark. + * A benchmark that evaluates the performance of reading nested Parquet data using Iceberg and the + * built-in file source in Spark. * - * To run this benchmark for spark-3.3: - * + *

    To run this benchmark for spark-3.3: * ./gradlew -DsparkVersions=3.3 :iceberg-spark:iceberg-spark-3.3_2.12:jmh * -PjmhIncludeRegex=IcebergSourceNestedParquetDataReadBenchmark * -PjmhOutputPath=benchmark/iceberg-source-nested-parquet-data-read-benchmark-result.txt @@ -69,11 +67,13 @@ public void tearDownBenchmark() throws IOException { public void readIceberg() { Map tableProperties = Maps.newHashMap(); tableProperties.put(SPLIT_OPEN_FILE_COST, Integer.toString(128 * 1024 * 1024)); - withTableProperties(tableProperties, () -> { - String tableLocation = table().location(); - Dataset df = spark().read().format("iceberg").load(tableLocation); - materialize(df); - }); + withTableProperties( + tableProperties, + () -> { + String tableLocation = table().location(); + Dataset df = spark().read().format("iceberg").load(tableLocation); + materialize(df); + }); } @Benchmark @@ -82,10 +82,12 @@ public void readFileSourceVectorized() { Map conf = Maps.newHashMap(); conf.put(SQLConf.PARQUET_VECTORIZED_READER_ENABLED().key(), "true"); conf.put(SQLConf.FILES_OPEN_COST_IN_BYTES().key(), Integer.toString(128 * 1024 * 1024)); - withSQLConf(conf, () -> { - Dataset df = spark().read().parquet(dataLocation()); - materialize(df); - }); + withSQLConf( + conf, + () -> { + Dataset df = spark().read().parquet(dataLocation()); + materialize(df); + }); } @Benchmark @@ -94,10 +96,12 @@ public void readFileSourceNonVectorized() { Map conf = Maps.newHashMap(); conf.put(SQLConf.PARQUET_VECTORIZED_READER_ENABLED().key(), "false"); conf.put(SQLConf.FILES_OPEN_COST_IN_BYTES().key(), Integer.toString(128 * 1024 * 1024)); - withSQLConf(conf, () -> { - Dataset df = spark().read().parquet(dataLocation()); - materialize(df); - }); + withSQLConf( + conf, + () -> { + Dataset df = spark().read().parquet(dataLocation()); + materialize(df); + }); } @Benchmark @@ -105,11 +109,14 @@ public void readFileSourceNonVectorized() { public void readWithProjectionIceberg() { Map tableProperties = Maps.newHashMap(); tableProperties.put(SPLIT_OPEN_FILE_COST, Integer.toString(128 * 1024 * 1024)); - withTableProperties(tableProperties, () -> { - String tableLocation = table().location(); - Dataset df = spark().read().format("iceberg").load(tableLocation).selectExpr("nested.col3"); - materialize(df); - }); + withTableProperties( + tableProperties, + () -> { + String tableLocation = table().location(); + Dataset df = + spark().read().format("iceberg").load(tableLocation).selectExpr("nested.col3"); + materialize(df); + }); } @Benchmark @@ -119,10 +126,12 @@ public void readWithProjectionFileSourceVectorized() { conf.put(SQLConf.PARQUET_VECTORIZED_READER_ENABLED().key(), "true"); conf.put(SQLConf.FILES_OPEN_COST_IN_BYTES().key(), Integer.toString(128 * 1024 * 1024)); conf.put(SQLConf.NESTED_SCHEMA_PRUNING_ENABLED().key(), "true"); - withSQLConf(conf, () -> { - Dataset df = spark().read().parquet(dataLocation()).selectExpr("nested.col3"); - materialize(df); - }); + withSQLConf( + conf, + () -> { + Dataset df = spark().read().parquet(dataLocation()).selectExpr("nested.col3"); + materialize(df); + }); } @Benchmark @@ -132,22 +141,25 @@ public void readWithProjectionFileSourceNonVectorized() { conf.put(SQLConf.PARQUET_VECTORIZED_READER_ENABLED().key(), "false"); conf.put(SQLConf.FILES_OPEN_COST_IN_BYTES().key(), Integer.toString(128 * 1024 * 1024)); conf.put(SQLConf.NESTED_SCHEMA_PRUNING_ENABLED().key(), "true"); - withSQLConf(conf, () -> { - Dataset df = spark().read().parquet(dataLocation()).selectExpr("nested.col3"); - materialize(df); - }); + 
withSQLConf( + conf, + () -> { + Dataset df = spark().read().parquet(dataLocation()).selectExpr("nested.col3"); + materialize(df); + }); } private void appendData() { for (int fileNum = 0; fileNum < NUM_FILES; fileNum++) { - Dataset df = spark().range(NUM_ROWS) - .withColumn( - "nested", - struct( - expr("CAST(id AS string) AS col1"), - expr("CAST(id AS double) AS col2"), - lit(fileNum).cast("long").as("col3") - )); + Dataset df = + spark() + .range(NUM_ROWS) + .withColumn( + "nested", + struct( + expr("CAST(id AS string) AS col1"), + expr("CAST(id AS double) AS col2"), + lit(fileNum).cast("long").as("col3"))); appendAsFile(df); } } diff --git a/spark/v3.3/spark/src/jmh/java/org/apache/iceberg/spark/source/parquet/IcebergSourceNestedParquetDataWriteBenchmark.java b/spark/v3.3/spark/src/jmh/java/org/apache/iceberg/spark/source/parquet/IcebergSourceNestedParquetDataWriteBenchmark.java index 65265426f9f8..981107dc651b 100644 --- a/spark/v3.3/spark/src/jmh/java/org/apache/iceberg/spark/source/parquet/IcebergSourceNestedParquetDataWriteBenchmark.java +++ b/spark/v3.3/spark/src/jmh/java/org/apache/iceberg/spark/source/parquet/IcebergSourceNestedParquetDataWriteBenchmark.java @@ -16,9 +16,11 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.source.parquet; +import static org.apache.spark.sql.functions.expr; +import static org.apache.spark.sql.functions.struct; + import java.io.IOException; import java.util.Map; import org.apache.iceberg.relocated.com.google.common.collect.Maps; @@ -32,15 +34,11 @@ import org.openjdk.jmh.annotations.TearDown; import org.openjdk.jmh.annotations.Threads; -import static org.apache.spark.sql.functions.expr; -import static org.apache.spark.sql.functions.struct; - /** - * A benchmark that evaluates the performance of writing nested Parquet data using Iceberg - * and the built-in file source in Spark. + * A benchmark that evaluates the performance of writing nested Parquet data using Iceberg and the + * built-in file source in Spark. * - * To run this benchmark for spark-3.3: - * + *

    To run this benchmark for spark-3.3: * ./gradlew -DsparkVersions=3.3 :iceberg-spark:iceberg-spark-3.3_2.12:jmh * -PjmhIncludeRegex=IcebergSourceNestedParquetDataWriteBenchmark * -PjmhOutputPath=benchmark/iceberg-source-nested-parquet-data-write-benchmark-result.txt @@ -77,14 +75,14 @@ public void writeFileSource() { } private Dataset benchmarkData() { - return spark().range(NUM_ROWS) + return spark() + .range(NUM_ROWS) .withColumn( "nested", struct( expr("CAST(id AS string) AS col1"), expr("CAST(id AS double) AS col2"), - expr("id AS col3") - )) + expr("id AS col3"))) .coalesce(1); } } diff --git a/spark/v3.3/spark/src/jmh/java/org/apache/iceberg/spark/source/parquet/IcebergSourceParquetEqDeleteBenchmark.java b/spark/v3.3/spark/src/jmh/java/org/apache/iceberg/spark/source/parquet/IcebergSourceParquetEqDeleteBenchmark.java index 4884e8fe5db1..f1e5956dbdc4 100644 --- a/spark/v3.3/spark/src/jmh/java/org/apache/iceberg/spark/source/parquet/IcebergSourceParquetEqDeleteBenchmark.java +++ b/spark/v3.3/spark/src/jmh/java/org/apache/iceberg/spark/source/parquet/IcebergSourceParquetEqDeleteBenchmark.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.source.parquet; import java.io.IOException; @@ -25,12 +24,10 @@ import org.openjdk.jmh.annotations.Param; /** - * A benchmark that evaluates the non-vectorized read and vectorized read with equality delete in the Spark data source - * for Iceberg. - *

- * This class uses a dataset with a flat schema.
- * To run this benchmark for spark-3.3:
- *
+ * A benchmark that evaluates the non-vectorized read and vectorized read with equality delete in
+ * the Spark data source for Iceberg.
+ *
+ *

    This class uses a dataset with a flat schema. To run this benchmark for spark-3.3: * ./gradlew -DsparkVersions=3.3 :iceberg-spark:iceberg-spark-3.3:jmh * -PjmhIncludeRegex=IcebergSourceParquetEqDeleteBenchmark * -PjmhOutputPath=benchmark/iceberg-source-parquet-eq-delete-benchmark-result.txt diff --git a/spark/v3.3/spark/src/jmh/java/org/apache/iceberg/spark/source/parquet/IcebergSourceParquetMultiDeleteFileBenchmark.java b/spark/v3.3/spark/src/jmh/java/org/apache/iceberg/spark/source/parquet/IcebergSourceParquetMultiDeleteFileBenchmark.java index 24c2676af24d..2ac3de2ff947 100644 --- a/spark/v3.3/spark/src/jmh/java/org/apache/iceberg/spark/source/parquet/IcebergSourceParquetMultiDeleteFileBenchmark.java +++ b/spark/v3.3/spark/src/jmh/java/org/apache/iceberg/spark/source/parquet/IcebergSourceParquetMultiDeleteFileBenchmark.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.source.parquet; import java.io.IOException; @@ -26,12 +25,10 @@ import org.openjdk.jmh.annotations.Param; /** - * A benchmark that evaluates the non-vectorized read and vectorized read with pos-delete in the Spark data source for - * Iceberg. - *

- * This class uses a dataset with a flat schema.
- * To run this benchmark for spark-3.3:
- *
+ * A benchmark that evaluates the non-vectorized read and vectorized read with pos-delete in the
+ * Spark data source for Iceberg.
+ *
+ *

    This class uses a dataset with a flat schema. To run this benchmark for spark-3.3: * ./gradlew -DsparkVersions=3.3 :iceberg-spark:iceberg-spark-3.3:jmh \ * -PjmhIncludeRegex=IcebergSourceParquetMultiDeleteFileBenchmark \ * -PjmhOutputPath=benchmark/iceberg-source-parquet-multi-delete-file-benchmark-result.txt diff --git a/spark/v3.3/spark/src/jmh/java/org/apache/iceberg/spark/source/parquet/IcebergSourceParquetPosDeleteBenchmark.java b/spark/v3.3/spark/src/jmh/java/org/apache/iceberg/spark/source/parquet/IcebergSourceParquetPosDeleteBenchmark.java index 988eeb751258..8cd6fb36fcf5 100644 --- a/spark/v3.3/spark/src/jmh/java/org/apache/iceberg/spark/source/parquet/IcebergSourceParquetPosDeleteBenchmark.java +++ b/spark/v3.3/spark/src/jmh/java/org/apache/iceberg/spark/source/parquet/IcebergSourceParquetPosDeleteBenchmark.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.source.parquet; import java.io.IOException; @@ -26,12 +25,10 @@ import org.openjdk.jmh.annotations.Param; /** - * A benchmark that evaluates the non-vectorized read and vectorized read with pos-delete in the Spark data source for - * Iceberg. - *

- * This class uses a dataset with a flat schema.
- * To run this benchmark for spark-3.3:
- *
+ * A benchmark that evaluates the non-vectorized read and vectorized read with pos-delete in the
+ * Spark data source for Iceberg.
+ *
+ *

    This class uses a dataset with a flat schema. To run this benchmark for spark-3.3: * ./gradlew -DsparkVersions=3.3 :iceberg-spark:iceberg-spark-3.3:jmh * -PjmhIncludeRegex=IcebergSourceParquetPosDeleteBenchmark * -PjmhOutputPath=benchmark/iceberg-source-parquet-pos-delete-benchmark-result.txt diff --git a/spark/v3.3/spark/src/jmh/java/org/apache/iceberg/spark/source/parquet/IcebergSourceParquetWithUnrelatedDeleteBenchmark.java b/spark/v3.3/spark/src/jmh/java/org/apache/iceberg/spark/source/parquet/IcebergSourceParquetWithUnrelatedDeleteBenchmark.java index 5088843ca13e..1ae48e213cb7 100644 --- a/spark/v3.3/spark/src/jmh/java/org/apache/iceberg/spark/source/parquet/IcebergSourceParquetWithUnrelatedDeleteBenchmark.java +++ b/spark/v3.3/spark/src/jmh/java/org/apache/iceberg/spark/source/parquet/IcebergSourceParquetWithUnrelatedDeleteBenchmark.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.source.parquet; import java.io.IOException; @@ -26,12 +25,10 @@ import org.openjdk.jmh.annotations.Param; /** - * A benchmark that evaluates the non-vectorized read and vectorized read with pos-delete in the Spark data source for - * Iceberg. - *

- * This class uses a dataset with a flat schema.
- * To run this benchmark for spark-3.3:
- *
+ * A benchmark that evaluates the non-vectorized read and vectorized read with pos-delete in the
+ * Spark data source for Iceberg.
+ *
+ *

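The unrelated-delete benchmark is the one spot in these hunks where a JMH @Param feeds the delete writers. Trimmed from the hunk below (the base-data write that precedes the loop sits outside the hunk; writePosDeletesWithNoise and NUM_ROWS appear to be inherited from the IcebergSourceDeleteBenchmark parent), the reformatted shape is roughly:

  private static final double PERCENT_DELETE_ROW = 0.05;

  @Param({"0", "0.05", "0.25", "0.5"})
  private double percentUnrelatedDeletes;

  @Override
  protected void appendData() throws IOException {
    table().refresh();
    for (DataFile file : table().currentSnapshot().addedDataFiles(table().io())) {
      // pos-deletes for each data file, plus "unrelated" noise controlled by the @Param above
      writePosDeletesWithNoise(
          file.path(),
          NUM_ROWS,
          PERCENT_DELETE_ROW,
          (int) (percentUnrelatedDeletes / PERCENT_DELETE_ROW),
          1);
    }
  }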
    This class uses a dataset with a flat schema. To run this benchmark for spark-3.3: * ./gradlew -DsparkVersions=3.3 :iceberg-spark:iceberg-spark-3.3:jmh * -PjmhIncludeRegex=IcebergSourceParquetWithUnrelatedDeleteBenchmark * -PjmhOutputPath=benchmark/iceberg-source-parquet-with-unrelated-delete-benchmark-result.txt @@ -39,6 +36,7 @@ */ public class IcebergSourceParquetWithUnrelatedDeleteBenchmark extends IcebergSourceDeleteBenchmark { private static final double PERCENT_DELETE_ROW = 0.05; + @Param({"0", "0.05", "0.25", "0.5"}) private double percentUnrelatedDeletes; @@ -49,8 +47,12 @@ protected void appendData() throws IOException { table().refresh(); for (DataFile file : table().currentSnapshot().addedDataFiles(table().io())) { - writePosDeletesWithNoise(file.path(), NUM_ROWS, PERCENT_DELETE_ROW, - (int) (percentUnrelatedDeletes / PERCENT_DELETE_ROW), 1); + writePosDeletesWithNoise( + file.path(), + NUM_ROWS, + PERCENT_DELETE_ROW, + (int) (percentUnrelatedDeletes / PERCENT_DELETE_ROW), + 1); } } } diff --git a/spark/v3.3/spark/src/jmh/java/org/apache/iceberg/spark/source/parquet/ParquetWritersBenchmark.java b/spark/v3.3/spark/src/jmh/java/org/apache/iceberg/spark/source/parquet/ParquetWritersBenchmark.java index 0d347222e0e5..3857aabf5655 100644 --- a/spark/v3.3/spark/src/jmh/java/org/apache/iceberg/spark/source/parquet/ParquetWritersBenchmark.java +++ b/spark/v3.3/spark/src/jmh/java/org/apache/iceberg/spark/source/parquet/ParquetWritersBenchmark.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.source.parquet; import org.apache.iceberg.FileFormat; @@ -25,8 +24,7 @@ /** * A benchmark that evaluates the performance of various Iceberg writers for Parquet data. * - * To run this benchmark for spark-3.3: - * + *

    To run this benchmark for spark-3.3: * ./gradlew -DsparkVersions=3.3 :iceberg-spark:iceberg-spark-3.3_2.12:jmh \ * -PjmhIncludeRegex=ParquetWritersBenchmark \ * -PjmhOutputPath=benchmark/parquet-writers-benchmark-result.txt diff --git a/spark/v3.3/spark/src/jmh/java/org/apache/iceberg/spark/source/parquet/vectorized/VectorizedReadDictionaryEncodedFlatParquetDataBenchmark.java b/spark/v3.3/spark/src/jmh/java/org/apache/iceberg/spark/source/parquet/vectorized/VectorizedReadDictionaryEncodedFlatParquetDataBenchmark.java index 5a0393225236..b87e0d56f37e 100644 --- a/spark/v3.3/spark/src/jmh/java/org/apache/iceberg/spark/source/parquet/vectorized/VectorizedReadDictionaryEncodedFlatParquetDataBenchmark.java +++ b/spark/v3.3/spark/src/jmh/java/org/apache/iceberg/spark/source/parquet/vectorized/VectorizedReadDictionaryEncodedFlatParquetDataBenchmark.java @@ -16,9 +16,15 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.source.parquet.vectorized; +import static org.apache.spark.sql.functions.col; +import static org.apache.spark.sql.functions.date_add; +import static org.apache.spark.sql.functions.lit; +import static org.apache.spark.sql.functions.pmod; +import static org.apache.spark.sql.functions.to_date; +import static org.apache.spark.sql.functions.to_timestamp; + import java.util.Map; import org.apache.iceberg.TableProperties; import org.apache.iceberg.relocated.com.google.common.collect.Maps; @@ -29,32 +35,26 @@ import org.apache.spark.sql.types.DataTypes; import org.openjdk.jmh.annotations.Setup; -import static org.apache.spark.sql.functions.col; -import static org.apache.spark.sql.functions.date_add; -import static org.apache.spark.sql.functions.lit; -import static org.apache.spark.sql.functions.pmod; -import static org.apache.spark.sql.functions.to_date; -import static org.apache.spark.sql.functions.to_timestamp; - /** - * Benchmark to compare performance of reading Parquet dictionary encoded data with a flat schema using vectorized - * Iceberg read path and the built-in file source in Spark. - *

- * To run this benchmark for spark-3.3:
- *
+ * Benchmark to compare performance of reading Parquet dictionary encoded data with a flat schema
+ * using vectorized Iceberg read path and the built-in file source in Spark.
+ *
+ *

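Both vectorized read benchmarks do their Arrow tuning in setup rather than per invocation; in this diff only the wrapped comment changes. A sketch, with setupSpark and appendData being methods of these benchmark classes:

  @Setup
  @Override
  public void setupBenchmark() {
    setupSpark(true);
    appendData();
    // Allow unsafe memory access to avoid the costly check arrow does to check if index is within
    // bounds
    System.setProperty("arrow.enable_unsafe_memory_access", "true");
    // the real class also disables Arrow's per-get(index) null check on an unchanged line just
    // after this hunk, since Iceberg manages nullability itself
  }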
    To run this benchmark for spark-3.3: * ./gradlew -DsparkVersions=3.3 :iceberg-spark:iceberg-spark-3.3_2.12:jmh \ * -PjmhIncludeRegex=VectorizedReadDictionaryEncodedFlatParquetDataBenchmark \ * -PjmhOutputPath=benchmark/results.txt * */ -public class VectorizedReadDictionaryEncodedFlatParquetDataBenchmark extends VectorizedReadFlatParquetDataBenchmark { +public class VectorizedReadDictionaryEncodedFlatParquetDataBenchmark + extends VectorizedReadFlatParquetDataBenchmark { @Setup @Override public void setupBenchmark() { setupSpark(true); appendData(); - // Allow unsafe memory access to avoid the costly check arrow does to check if index is within bounds + // Allow unsafe memory access to avoid the costly check arrow does to check if index is within + // bounds System.setProperty("arrow.enable_unsafe_memory_access", "true"); // Disable expensive null check for every get(index) call. // Iceberg manages nullability checks itself instead of relying on arrow. @@ -81,9 +81,7 @@ void appendData() { df = withTimestampColumnDictEncoded(df); df = withStringColumnDictEncoded(df); df = df.drop("id"); - df.write().format("iceberg") - .mode(SaveMode.Append) - .save(table().location()); + df.write().format("iceberg").mode(SaveMode.Append).save(table().location()); } private static Column modColumn() { @@ -104,7 +102,6 @@ private static Dataset withIntColumnDictEncoded(Dataset df) { private static Dataset withFloatColumnDictEncoded(Dataset df) { return df.withColumn("floatCol", modColumn().cast(DataTypes.FloatType)); - } private static Dataset withDoubleColumnDictEncoded(Dataset df) { @@ -126,7 +123,8 @@ private static Dataset withDateColumnDictEncoded(Dataset df) { private static Dataset withTimestampColumnDictEncoded(Dataset df) { Column days = modColumn().cast(DataTypes.ShortType); - return df.withColumn("timestampCol", to_timestamp(date_add(to_date(lit("04/12/2019"), "MM/dd/yyyy"), days))); + return df.withColumn( + "timestampCol", to_timestamp(date_add(to_date(lit("04/12/2019"), "MM/dd/yyyy"), days))); } private static Dataset withStringColumnDictEncoded(Dataset df) { diff --git a/spark/v3.3/spark/src/jmh/java/org/apache/iceberg/spark/source/parquet/vectorized/VectorizedReadFlatParquetDataBenchmark.java b/spark/v3.3/spark/src/jmh/java/org/apache/iceberg/spark/source/parquet/vectorized/VectorizedReadFlatParquetDataBenchmark.java index a1a36fd568d6..04e77bceb9df 100644 --- a/spark/v3.3/spark/src/jmh/java/org/apache/iceberg/spark/source/parquet/vectorized/VectorizedReadFlatParquetDataBenchmark.java +++ b/spark/v3.3/spark/src/jmh/java/org/apache/iceberg/spark/source/parquet/vectorized/VectorizedReadFlatParquetDataBenchmark.java @@ -16,9 +16,17 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.spark.source.parquet.vectorized; +import static org.apache.iceberg.types.Types.NestedField.optional; +import static org.apache.spark.sql.functions.col; +import static org.apache.spark.sql.functions.current_date; +import static org.apache.spark.sql.functions.date_add; +import static org.apache.spark.sql.functions.expr; +import static org.apache.spark.sql.functions.lit; +import static org.apache.spark.sql.functions.pmod; +import static org.apache.spark.sql.functions.when; + import java.io.IOException; import java.util.Map; import org.apache.hadoop.conf.Configuration; @@ -38,21 +46,11 @@ import org.openjdk.jmh.annotations.TearDown; import org.openjdk.jmh.annotations.Threads; -import static org.apache.iceberg.types.Types.NestedField.optional; -import static org.apache.spark.sql.functions.col; -import static org.apache.spark.sql.functions.current_date; -import static org.apache.spark.sql.functions.date_add; -import static org.apache.spark.sql.functions.expr; -import static org.apache.spark.sql.functions.lit; -import static org.apache.spark.sql.functions.pmod; -import static org.apache.spark.sql.functions.when; - /** - * Benchmark to compare performance of reading Parquet data with a flat schema using vectorized Iceberg read path and - * the built-in file source in Spark. - *

- * To run this benchmark for spark-3.3:
- *
+ * Benchmark to compare performance of reading Parquet data with a flat schema using vectorized
+ * Iceberg read path and the built-in file source in Spark.
+ *
+ *

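The schema behind the per-type vectorized read methods is easier to follow in one piece, since the decimal widths decide the Parquet encoding being measured. As reformatted in the hunk below (optional is the statically imported Types.NestedField factory, and the table creation call itself follows in unchanged lines outside the hunk):

  // bigDecimalCol (precision 20) is wide enough to be stored as fixed-length binary,
  // while decimalCol (precision 18) still fits in a 64-bit long
  Schema schema =
      new Schema(
          optional(1, "longCol", Types.LongType.get()),
          optional(2, "intCol", Types.IntegerType.get()),
          optional(3, "floatCol", Types.FloatType.get()),
          optional(4, "doubleCol", Types.DoubleType.get()),
          optional(5, "bigDecimalCol", Types.DecimalType.of(20, 5)),
          optional(6, "decimalCol", Types.DecimalType.of(18, 5)),
          optional(7, "dateCol", Types.DateType.get()),
          optional(8, "timestampCol", Types.TimestampType.withZone()),
          optional(9, "stringCol", Types.StringType.get()));
  PartitionSpec partitionSpec = PartitionSpec.unpartitioned();
  HadoopTables tables = new HadoopTables(hadoopConf());
  Map<String, String> properties = parquetWriteProps();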
    To run this benchmark for spark-3.3: * ./gradlew -DsparkVersions=3.3 :iceberg-spark:iceberg-spark-3.3_2.12:jmh \ * -PjmhIncludeRegex=VectorizedReadFlatParquetDataBenchmark \ * -PjmhOutputPath=benchmark/results.txt @@ -67,7 +65,8 @@ public class VectorizedReadFlatParquetDataBenchmark extends IcebergSourceBenchma public void setupBenchmark() { setupSpark(); appendData(); - // Allow unsafe memory access to avoid the costly check arrow does to check if index is within bounds + // Allow unsafe memory access to avoid the costly check arrow does to check if index is within + // bounds System.setProperty("arrow.enable_unsafe_memory_access", "true"); // Disable expensive null check for every get(index) call. // Iceberg manages nullability checks itself instead of relying on arrow. @@ -89,16 +88,17 @@ protected Configuration initHadoopConf() { protected Table initTable() { // bigDecimalCol is big enough to be encoded as fix len binary (9 bytes), // decimalCol is small enough to be encoded as a 64-bit int - Schema schema = new Schema( - optional(1, "longCol", Types.LongType.get()), - optional(2, "intCol", Types.IntegerType.get()), - optional(3, "floatCol", Types.FloatType.get()), - optional(4, "doubleCol", Types.DoubleType.get()), - optional(5, "bigDecimalCol", Types.DecimalType.of(20, 5)), - optional(6, "decimalCol", Types.DecimalType.of(18, 5)), - optional(7, "dateCol", Types.DateType.get()), - optional(8, "timestampCol", Types.TimestampType.withZone()), - optional(9, "stringCol", Types.StringType.get())); + Schema schema = + new Schema( + optional(1, "longCol", Types.LongType.get()), + optional(2, "intCol", Types.IntegerType.get()), + optional(3, "floatCol", Types.FloatType.get()), + optional(4, "doubleCol", Types.DoubleType.get()), + optional(5, "bigDecimalCol", Types.DecimalType.of(20, 5)), + optional(6, "decimalCol", Types.DecimalType.of(18, 5)), + optional(7, "dateCol", Types.DateType.get()), + optional(8, "timestampCol", Types.TimestampType.withZone()), + optional(9, "stringCol", Types.StringType.get())); PartitionSpec partitionSpec = PartitionSpec.unpartitioned(); HadoopTables tables = new HadoopTables(hadoopConf()); Map properties = parquetWriteProps(); @@ -114,20 +114,21 @@ Map parquetWriteProps() { void appendData() { for (int fileNum = 1; fileNum <= NUM_FILES; fileNum++) { - Dataset df = spark().range(NUM_ROWS_PER_FILE) - .withColumn( - "longCol", - when(pmod(col("id"), lit(10)).equalTo(lit(0)), lit(null)) - .otherwise(col("id"))) - .drop("id") - .withColumn("intCol", expr("CAST(longCol AS INT)")) - .withColumn("floatCol", expr("CAST(longCol AS FLOAT)")) - .withColumn("doubleCol", expr("CAST(longCol AS DOUBLE)")) - .withColumn("bigDecimalCol", expr("CAST(longCol AS DECIMAL(20, 5))")) - .withColumn("decimalCol", expr("CAST(longCol AS DECIMAL(18, 5))")) - .withColumn("dateCol", date_add(current_date(), fileNum)) - .withColumn("timestampCol", expr("TO_TIMESTAMP(dateCol)")) - .withColumn("stringCol", expr("CAST(longCol AS STRING)")); + Dataset df = + spark() + .range(NUM_ROWS_PER_FILE) + .withColumn( + "longCol", + when(pmod(col("id"), lit(10)).equalTo(lit(0)), lit(null)).otherwise(col("id"))) + .drop("id") + .withColumn("intCol", expr("CAST(longCol AS INT)")) + .withColumn("floatCol", expr("CAST(longCol AS FLOAT)")) + .withColumn("doubleCol", expr("CAST(longCol AS DOUBLE)")) + .withColumn("bigDecimalCol", expr("CAST(longCol AS DECIMAL(20, 5))")) + .withColumn("decimalCol", expr("CAST(longCol AS DECIMAL(18, 5))")) + .withColumn("dateCol", date_add(current_date(), fileNum)) + 
.withColumn("timestampCol", expr("TO_TIMESTAMP(dateCol)")) + .withColumn("stringCol", expr("CAST(longCol AS STRING)")); appendAsFile(df); } } @@ -135,181 +136,213 @@ void appendData() { @Benchmark @Threads(1) public void readIntegersIcebergVectorized5k() { - withTableProperties(tablePropsWithVectorizationEnabled(5000), () -> { - String tableLocation = table().location(); - Dataset df = spark().read().format("iceberg") - .load(tableLocation).select("intCol"); - materialize(df); - }); + withTableProperties( + tablePropsWithVectorizationEnabled(5000), + () -> { + String tableLocation = table().location(); + Dataset df = spark().read().format("iceberg").load(tableLocation).select("intCol"); + materialize(df); + }); } @Benchmark @Threads(1) public void readIntegersSparkVectorized5k() { - withSQLConf(sparkConfWithVectorizationEnabled(5000), () -> { - Dataset df = spark().read().parquet(dataLocation()).select("intCol"); - materialize(df); - }); + withSQLConf( + sparkConfWithVectorizationEnabled(5000), + () -> { + Dataset df = spark().read().parquet(dataLocation()).select("intCol"); + materialize(df); + }); } @Benchmark @Threads(1) public void readLongsIcebergVectorized5k() { - withTableProperties(tablePropsWithVectorizationEnabled(5000), () -> { - String tableLocation = table().location(); - Dataset df = spark().read().format("iceberg") - .load(tableLocation).select("longCol"); - materialize(df); - }); + withTableProperties( + tablePropsWithVectorizationEnabled(5000), + () -> { + String tableLocation = table().location(); + Dataset df = spark().read().format("iceberg").load(tableLocation).select("longCol"); + materialize(df); + }); } @Benchmark @Threads(1) public void readLongsSparkVectorized5k() { - withSQLConf(sparkConfWithVectorizationEnabled(5000), () -> { - Dataset df = spark().read().parquet(dataLocation()).select("longCol"); - materialize(df); - }); + withSQLConf( + sparkConfWithVectorizationEnabled(5000), + () -> { + Dataset df = spark().read().parquet(dataLocation()).select("longCol"); + materialize(df); + }); } @Benchmark @Threads(1) public void readFloatsIcebergVectorized5k() { - withTableProperties(tablePropsWithVectorizationEnabled(5000), () -> { - String tableLocation = table().location(); - Dataset df = spark().read().format("iceberg") - .load(tableLocation).select("floatCol"); - materialize(df); - }); + withTableProperties( + tablePropsWithVectorizationEnabled(5000), + () -> { + String tableLocation = table().location(); + Dataset df = spark().read().format("iceberg").load(tableLocation).select("floatCol"); + materialize(df); + }); } @Benchmark @Threads(1) public void readFloatsSparkVectorized5k() { - withSQLConf(sparkConfWithVectorizationEnabled(5000), () -> { - Dataset df = spark().read().parquet(dataLocation()).select("floatCol"); - materialize(df); - }); + withSQLConf( + sparkConfWithVectorizationEnabled(5000), + () -> { + Dataset df = spark().read().parquet(dataLocation()).select("floatCol"); + materialize(df); + }); } @Benchmark @Threads(1) public void readDoublesIcebergVectorized5k() { - withTableProperties(tablePropsWithVectorizationEnabled(5000), () -> { - String tableLocation = table().location(); - Dataset df = spark().read().format("iceberg") - .load(tableLocation).select("doubleCol"); - materialize(df); - }); + withTableProperties( + tablePropsWithVectorizationEnabled(5000), + () -> { + String tableLocation = table().location(); + Dataset df = + spark().read().format("iceberg").load(tableLocation).select("doubleCol"); + materialize(df); + }); } @Benchmark @Threads(1) 
public void readDoublesSparkVectorized5k() { - withSQLConf(sparkConfWithVectorizationEnabled(5000), () -> { - Dataset df = spark().read().parquet(dataLocation()).select("doubleCol"); - materialize(df); - }); + withSQLConf( + sparkConfWithVectorizationEnabled(5000), + () -> { + Dataset df = spark().read().parquet(dataLocation()).select("doubleCol"); + materialize(df); + }); } @Benchmark @Threads(1) public void readDecimalsIcebergVectorized5k() { - withTableProperties(tablePropsWithVectorizationEnabled(5000), () -> { - String tableLocation = table().location(); - Dataset df = spark().read().format("iceberg") - .load(tableLocation).select("decimalCol"); - materialize(df); - }); + withTableProperties( + tablePropsWithVectorizationEnabled(5000), + () -> { + String tableLocation = table().location(); + Dataset df = + spark().read().format("iceberg").load(tableLocation).select("decimalCol"); + materialize(df); + }); } @Benchmark @Threads(1) public void readDecimalsSparkVectorized5k() { - withSQLConf(sparkConfWithVectorizationEnabled(5000), () -> { - Dataset df = spark().read().parquet(dataLocation()).select("decimalCol"); - materialize(df); - }); + withSQLConf( + sparkConfWithVectorizationEnabled(5000), + () -> { + Dataset df = spark().read().parquet(dataLocation()).select("decimalCol"); + materialize(df); + }); } @Benchmark @Threads(1) public void readBigDecimalsIcebergVectorized5k() { - withTableProperties(tablePropsWithVectorizationEnabled(5000), () -> { - String tableLocation = table().location(); - Dataset df = spark().read().format("iceberg") - .load(tableLocation).select("bigDecimalCol"); - materialize(df); - }); + withTableProperties( + tablePropsWithVectorizationEnabled(5000), + () -> { + String tableLocation = table().location(); + Dataset df = + spark().read().format("iceberg").load(tableLocation).select("bigDecimalCol"); + materialize(df); + }); } @Benchmark @Threads(1) public void readBigDecimalsSparkVectorized5k() { - withSQLConf(sparkConfWithVectorizationEnabled(5000), () -> { - Dataset df = spark().read().parquet(dataLocation()).select("bigDecimalCol"); - materialize(df); - }); + withSQLConf( + sparkConfWithVectorizationEnabled(5000), + () -> { + Dataset df = spark().read().parquet(dataLocation()).select("bigDecimalCol"); + materialize(df); + }); } @Benchmark @Threads(1) public void readDatesIcebergVectorized5k() { - withTableProperties(tablePropsWithVectorizationEnabled(5000), () -> { - String tableLocation = table().location(); - Dataset df = spark().read().format("iceberg") - .load(tableLocation).select("dateCol"); - materialize(df); - }); + withTableProperties( + tablePropsWithVectorizationEnabled(5000), + () -> { + String tableLocation = table().location(); + Dataset df = spark().read().format("iceberg").load(tableLocation).select("dateCol"); + materialize(df); + }); } @Benchmark @Threads(1) public void readDatesSparkVectorized5k() { - withSQLConf(sparkConfWithVectorizationEnabled(5000), () -> { - Dataset df = spark().read().parquet(dataLocation()).select("dateCol"); - materialize(df); - }); + withSQLConf( + sparkConfWithVectorizationEnabled(5000), + () -> { + Dataset df = spark().read().parquet(dataLocation()).select("dateCol"); + materialize(df); + }); } @Benchmark @Threads(1) public void readTimestampsIcebergVectorized5k() { - withTableProperties(tablePropsWithVectorizationEnabled(5000), () -> { - String tableLocation = table().location(); - Dataset df = spark().read().format("iceberg") - .load(tableLocation).select("timestampCol"); - materialize(df); - }); + 
withTableProperties( + tablePropsWithVectorizationEnabled(5000), + () -> { + String tableLocation = table().location(); + Dataset df = + spark().read().format("iceberg").load(tableLocation).select("timestampCol"); + materialize(df); + }); } @Benchmark @Threads(1) public void readTimestampsSparkVectorized5k() { - withSQLConf(sparkConfWithVectorizationEnabled(5000), () -> { - Dataset df = spark().read().parquet(dataLocation()).select("timestampCol"); - materialize(df); - }); + withSQLConf( + sparkConfWithVectorizationEnabled(5000), + () -> { + Dataset df = spark().read().parquet(dataLocation()).select("timestampCol"); + materialize(df); + }); } @Benchmark @Threads(1) public void readStringsIcebergVectorized5k() { - withTableProperties(tablePropsWithVectorizationEnabled(5000), () -> { - String tableLocation = table().location(); - Dataset df = spark().read().format("iceberg") - .load(tableLocation).select("stringCol"); - materialize(df); - }); + withTableProperties( + tablePropsWithVectorizationEnabled(5000), + () -> { + String tableLocation = table().location(); + Dataset df = + spark().read().format("iceberg").load(tableLocation).select("stringCol"); + materialize(df); + }); } @Benchmark @Threads(1) public void readStringsSparkVectorized5k() { - withSQLConf(sparkConfWithVectorizationEnabled(5000), () -> { - Dataset df = spark().read().parquet(dataLocation()).select("stringCol"); - materialize(df); - }); + withSQLConf( + sparkConfWithVectorizationEnabled(5000), + () -> { + Dataset df = spark().read().parquet(dataLocation()).select("stringCol"); + materialize(df); + }); } private static Map tablePropsWithVectorizationEnabled(int batchSize) { diff --git a/spark/v3.3/spark/src/main/java/org/apache/iceberg/spark/BaseCatalog.java b/spark/v3.3/spark/src/main/java/org/apache/iceberg/spark/BaseCatalog.java index 43447e346648..f215aa033c5a 100644 --- a/spark/v3.3/spark/src/main/java/org/apache/iceberg/spark/BaseCatalog.java +++ b/spark/v3.3/spark/src/main/java/org/apache/iceberg/spark/BaseCatalog.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.spark; import org.apache.iceberg.spark.procedures.SparkProcedures; @@ -29,14 +28,16 @@ import org.apache.spark.sql.connector.iceberg.catalog.Procedure; import org.apache.spark.sql.connector.iceberg.catalog.ProcedureCatalog; -abstract class BaseCatalog implements StagingTableCatalog, ProcedureCatalog, SupportsNamespaces, HasIcebergCatalog { +abstract class BaseCatalog + implements StagingTableCatalog, ProcedureCatalog, SupportsNamespaces, HasIcebergCatalog { @Override public Procedure loadProcedure(Identifier ident) throws NoSuchProcedureException { String[] namespace = ident.namespace(); String name = ident.name(); - // namespace resolution is case insensitive until we have a way to configure case sensitivity in catalogs + // namespace resolution is case insensitive until we have a way to configure case sensitivity in + // catalogs if (namespace.length == 1 && namespace[0].equalsIgnoreCase("system")) { ProcedureBuilder builder = SparkProcedures.newBuilder(name); if (builder != null) { diff --git a/spark/v3.3/spark/src/main/java/org/apache/iceberg/spark/CommitMetadata.java b/spark/v3.3/spark/src/main/java/org/apache/iceberg/spark/CommitMetadata.java index 58137250003a..641b957d1176 100644 --- a/spark/v3.3/spark/src/main/java/org/apache/iceberg/spark/CommitMetadata.java +++ b/spark/v3.3/spark/src/main/java/org/apache/iceberg/spark/CommitMetadata.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark; import java.util.Map; @@ -24,20 +23,18 @@ import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap; import org.apache.iceberg.util.ExceptionUtil; -/** - * utility class to accept thread local commit properties - */ +/** utility class to accept thread local commit properties */ public class CommitMetadata { - private CommitMetadata() { - - } + private CommitMetadata() {} - private static final ThreadLocal> COMMIT_PROPERTIES = ThreadLocal.withInitial(ImmutableMap::of); + private static final ThreadLocal> COMMIT_PROPERTIES = + ThreadLocal.withInitial(ImmutableMap::of); /** - * running the code wrapped as a caller, and any snapshot committed within the callable object will be attached with - * the metadata defined in properties + * running the code wrapped as a caller, and any snapshot committed within the callable object + * will be attached with the metadata defined in properties + * * @param properties extra commit metadata to attach to the snapshot committed within callable * @param callable the code to be executed * @param exClass the expected type of exception which would be thrown from callable diff --git a/spark/v3.3/spark/src/main/java/org/apache/iceberg/spark/ExtendedParser.java b/spark/v3.3/spark/src/main/java/org/apache/iceberg/spark/ExtendedParser.java index 4c7cdf229411..19b3dd8f49be 100644 --- a/spark/v3.3/spark/src/main/java/org/apache/iceberg/spark/ExtendedParser.java +++ b/spark/v3.3/spark/src/main/java/org/apache/iceberg/spark/ExtendedParser.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
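// The CommitMetadata hunk above only reflows the javadoc. As a usage illustration of the
// thread-local commit properties it describes (a sketch: the withCommitProperties entry point
// is outside this hunk, and the property keys and table name are hypothetical):
import java.util.Map;
import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap;
import org.apache.iceberg.spark.CommitMetadata;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;

class CommitMetadataUsageSketch {
  static void appendWithMetadata(Dataset<Row> df) {
    Map<String, String> extraProps = ImmutableMap.of("ingest-job", "nightly-load");
    CommitMetadata.withCommitProperties(
        extraProps,
        () -> {
          // any Iceberg snapshot committed inside this callable carries the extra properties
          df.write().format("iceberg").mode("append").save("db.events");
          return null;
        },
        RuntimeException.class);
  }
}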
*/ - package org.apache.iceberg.spark; import java.util.List; @@ -58,10 +57,12 @@ static List parseSortOrder(SparkSession spark, String orderString try { return parser.parseSortOrder(orderString); } catch (AnalysisException e) { - throw new IllegalArgumentException(String.format("Unable to parse sortOrder: %s", orderString), e); + throw new IllegalArgumentException( + String.format("Unable to parse sortOrder: %s", orderString), e); } } else { - throw new IllegalStateException("Cannot parse order: parser is not an Iceberg ExtendedParser"); + throw new IllegalStateException( + "Cannot parse order: parser is not an Iceberg ExtendedParser"); } } diff --git a/spark/v3.3/spark/src/main/java/org/apache/iceberg/spark/FileRewriteCoordinator.java b/spark/v3.3/spark/src/main/java/org/apache/iceberg/spark/FileRewriteCoordinator.java index acd5f64d7ed6..210e861a4c16 100644 --- a/spark/v3.3/spark/src/main/java/org/apache/iceberg/spark/FileRewriteCoordinator.java +++ b/spark/v3.3/spark/src/main/java/org/apache/iceberg/spark/FileRewriteCoordinator.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark; import java.util.Map; @@ -39,22 +38,26 @@ public class FileRewriteCoordinator { private final Map, Set> resultMap = Maps.newConcurrentMap(); - private FileRewriteCoordinator() { - } + private FileRewriteCoordinator() {} public static FileRewriteCoordinator get() { return INSTANCE; } /** - * Called to persist the output of a rewrite action for a specific group. Since the write is done via a - * Spark Datasource, we have to propagate the result through this side-effect call. + * Called to persist the output of a rewrite action for a specific group. Since the write is done + * via a Spark Datasource, we have to propagate the result through this side-effect call. + * * @param table table where the rewrite is occurring * @param fileSetID the id used to identify the source set of files being rewritten * @param newDataFiles the new files which have been written */ public void stageRewrite(Table table, String fileSetID, Set newDataFiles) { - LOG.debug("Staging the output for {} - fileset {} with {} files", table.name(), fileSetID, newDataFiles.size()); + LOG.debug( + "Staging the output for {} - fileset {} with {} files", + table.name(), + fileSetID, + newDataFiles.size()); Pair id = toID(table, fileSetID); resultMap.put(id, newDataFiles); } @@ -62,9 +65,8 @@ public void stageRewrite(Table table, String fileSetID, Set newDataFil public Set fetchNewDataFiles(Table table, String fileSetID) { Pair id = toID(table, fileSetID); Set result = resultMap.get(id); - ValidationException.check(result != null, - "No results for rewrite of file set %s in table %s", - fileSetID, table); + ValidationException.check( + result != null, "No results for rewrite of file set %s in table %s", fileSetID, table); return result; } diff --git a/spark/v3.3/spark/src/main/java/org/apache/iceberg/spark/FileScanTaskSetManager.java b/spark/v3.3/spark/src/main/java/org/apache/iceberg/spark/FileScanTaskSetManager.java index 827b674ca16d..4b6da39905c1 100644 --- a/spark/v3.3/spark/src/main/java/org/apache/iceberg/spark/FileScanTaskSetManager.java +++ b/spark/v3.3/spark/src/main/java/org/apache/iceberg/spark/FileScanTaskSetManager.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
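// To make the side-effect protocol in the stageRewrite javadoc above concrete (a sketch with
// hypothetical arguments; in practice the coordinator is driven by the rewrite action and the
// Spark write path rather than called back to back like this):
import java.util.Set;
import org.apache.iceberg.DataFile;
import org.apache.iceberg.Table;
import org.apache.iceberg.spark.FileRewriteCoordinator;

class RewriteCoordinationSketch {
  static Set<DataFile> roundTrip(Table table, String fileSetID, Set<DataFile> rewrittenFiles) {
    FileRewriteCoordinator coordinator = FileRewriteCoordinator.get();

    // write side: the data source records the files it produced for this file set
    coordinator.stageRewrite(table, fileSetID, rewrittenFiles);

    // action side: the rewrite action later fetches the staged result to commit it
    return coordinator.fetchNewDataFiles(table, fileSetID);
  }
}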
*/ - package org.apache.iceberg.spark; import java.util.List; @@ -37,15 +36,15 @@ public class FileScanTaskSetManager { private final Map, List> tasksMap = Maps.newConcurrentMap(); - private FileScanTaskSetManager() { - } + private FileScanTaskSetManager() {} public static FileScanTaskSetManager get() { return INSTANCE; } public void stageTasks(Table table, String setID, List tasks) { - Preconditions.checkArgument(tasks != null && tasks.size() > 0, "Cannot stage null or empty tasks"); + Preconditions.checkArgument( + tasks != null && tasks.size() > 0, "Cannot stage null or empty tasks"); Pair id = toID(table, setID); tasksMap.put(id, tasks); } diff --git a/spark/v3.3/spark/src/main/java/org/apache/iceberg/spark/IcebergSpark.java b/spark/v3.3/spark/src/main/java/org/apache/iceberg/spark/IcebergSpark.java index e8889bc1fd01..094364d229b3 100644 --- a/spark/v3.3/spark/src/main/java/org/apache/iceberg/spark/IcebergSpark.java +++ b/spark/v3.3/spark/src/main/java/org/apache/iceberg/spark/IcebergSpark.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark; import org.apache.iceberg.transforms.Transform; @@ -27,22 +26,31 @@ import org.apache.spark.sql.types.DataTypes; public class IcebergSpark { - private IcebergSpark() { - } + private IcebergSpark() {} - public static void registerBucketUDF(SparkSession session, String funcName, DataType sourceType, int numBuckets) { + public static void registerBucketUDF( + SparkSession session, String funcName, DataType sourceType, int numBuckets) { SparkTypeToType typeConverter = new SparkTypeToType(); Type sourceIcebergType = typeConverter.atomic(sourceType); Transform bucket = Transforms.bucket(sourceIcebergType, numBuckets); - session.udf().register(funcName, - value -> bucket.apply(SparkValueConverter.convert(sourceIcebergType, value)), DataTypes.IntegerType); + session + .udf() + .register( + funcName, + value -> bucket.apply(SparkValueConverter.convert(sourceIcebergType, value)), + DataTypes.IntegerType); } - public static void registerTruncateUDF(SparkSession session, String funcName, DataType sourceType, int width) { + public static void registerTruncateUDF( + SparkSession session, String funcName, DataType sourceType, int width) { SparkTypeToType typeConverter = new SparkTypeToType(); Type sourceIcebergType = typeConverter.atomic(sourceType); Transform truncate = Transforms.truncate(sourceIcebergType, width); - session.udf().register(funcName, - value -> truncate.apply(SparkValueConverter.convert(sourceIcebergType, value)), sourceType); + session + .udf() + .register( + funcName, + value -> truncate.apply(SparkValueConverter.convert(sourceIcebergType, value)), + sourceType); } } diff --git a/spark/v3.3/spark/src/main/java/org/apache/iceberg/spark/JobGroupInfo.java b/spark/v3.3/spark/src/main/java/org/apache/iceberg/spark/JobGroupInfo.java index a35808fd8ce6..c0756d924e2f 100644 --- a/spark/v3.3/spark/src/main/java/org/apache/iceberg/spark/JobGroupInfo.java +++ b/spark/v3.3/spark/src/main/java/org/apache/iceberg/spark/JobGroupInfo.java @@ -16,13 +16,9 @@ * specific language governing permissions and limitations * under the License. 
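// As a usage illustration for the UDF registration methods above (the function, column, and
// table names are hypothetical):
import org.apache.iceberg.spark.IcebergSpark;
import org.apache.spark.sql.SparkSession;
import org.apache.spark.sql.types.DataTypes;

class BucketUdfSketch {
  static void registerAndUse(SparkSession spark) {
    // registers a UDF that applies Iceberg's bucket(16) transform to long values
    IcebergSpark.registerBucketUDF(spark, "iceberg_bucket16", DataTypes.LongType, 16);

    // handy for checking which bucket partition a given key would land in
    spark.sql("SELECT id, iceberg_bucket16(id) AS bucket FROM source_table").show();
  }
}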
*/ - package org.apache.iceberg.spark; -/** - * Captures information about the current job - * which is used for displaying on the UI - */ +/** Captures information about the current job which is used for displaying on the UI */ public class JobGroupInfo { private String groupId; private String description; diff --git a/spark/v3.3/spark/src/main/java/org/apache/iceberg/spark/JobGroupUtils.java b/spark/v3.3/spark/src/main/java/org/apache/iceberg/spark/JobGroupUtils.java index 155dce707701..dc8ba69d40a8 100644 --- a/spark/v3.3/spark/src/main/java/org/apache/iceberg/spark/JobGroupUtils.java +++ b/spark/v3.3/spark/src/main/java/org/apache/iceberg/spark/JobGroupUtils.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark; import org.apache.spark.SparkContext; @@ -26,10 +25,10 @@ public class JobGroupUtils { private static final String JOB_GROUP_ID = SparkContext$.MODULE$.SPARK_JOB_GROUP_ID(); private static final String JOB_GROUP_DESC = SparkContext$.MODULE$.SPARK_JOB_DESCRIPTION(); - private static final String JOB_INTERRUPT_ON_CANCEL = SparkContext$.MODULE$.SPARK_JOB_INTERRUPT_ON_CANCEL(); + private static final String JOB_INTERRUPT_ON_CANCEL = + SparkContext$.MODULE$.SPARK_JOB_INTERRUPT_ON_CANCEL(); - private JobGroupUtils() { - } + private JobGroupUtils() {} public static JobGroupInfo getJobGroupInfo(SparkContext sparkContext) { String groupId = sparkContext.getLocalProperty(JOB_GROUP_ID); @@ -41,6 +40,7 @@ public static JobGroupInfo getJobGroupInfo(SparkContext sparkContext) { public static void setJobGroupInfo(SparkContext sparkContext, JobGroupInfo info) { sparkContext.setLocalProperty(JOB_GROUP_ID, info.groupId()); sparkContext.setLocalProperty(JOB_GROUP_DESC, info.description()); - sparkContext.setLocalProperty(JOB_INTERRUPT_ON_CANCEL, String.valueOf(info.interruptOnCancel())); + sparkContext.setLocalProperty( + JOB_INTERRUPT_ON_CANCEL, String.valueOf(info.interruptOnCancel())); } } diff --git a/spark/v3.3/spark/src/main/java/org/apache/iceberg/spark/PathIdentifier.java b/spark/v3.3/spark/src/main/java/org/apache/iceberg/spark/PathIdentifier.java index 235097ea46cc..110af6b87de5 100644 --- a/spark/v3.3/spark/src/main/java/org/apache/iceberg/spark/PathIdentifier.java +++ b/spark/v3.3/spark/src/main/java/org/apache/iceberg/spark/PathIdentifier.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark; import java.util.List; @@ -36,9 +35,10 @@ public PathIdentifier(String location) { this.location = location; List pathParts = SPLIT.splitToList(location); name = Iterables.getLast(pathParts); - namespace = pathParts.size() > 1 ? - new String[]{JOIN.join(pathParts.subList(0, pathParts.size() - 1))} : - new String[0]; + namespace = + pathParts.size() > 1 + ? new String[] {JOIN.join(pathParts.subList(0, pathParts.size() - 1))} + : new String[0]; } @Override diff --git a/spark/v3.3/spark/src/main/java/org/apache/iceberg/spark/PruneColumnsWithReordering.java b/spark/v3.3/spark/src/main/java/org/apache/iceberg/spark/PruneColumnsWithReordering.java index 3bdf984ed219..3c111d3b44cb 100644 --- a/spark/v3.3/spark/src/main/java/org/apache/iceberg/spark/PruneColumnsWithReordering.java +++ b/spark/v3.3/spark/src/main/java/org/apache/iceberg/spark/PruneColumnsWithReordering.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
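// For context on the namespace/name split in PathIdentifier above (a sketch; it assumes the
// SPLIT and JOIN constants, which are outside this hunk, are a '/'-based Splitter and Joiner,
// and the location is hypothetical):
import org.apache.iceberg.spark.PathIdentifier;

class PathIdentifierSketch {
  static void example() {
    PathIdentifier ident = new PathIdentifier("s3://bucket/warehouse/db/events");
    // under that assumption, the last path segment becomes the name and the rest the namespace:
    // ident.name()      -> "events"
    // ident.namespace() -> ["s3://bucket/warehouse/db"]
  }
}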
*/ - package org.apache.iceberg.spark; import java.util.List; @@ -70,7 +69,8 @@ public Type schema(Schema schema, Supplier structResult) { @Override public Type struct(Types.StructType struct, Iterable fieldResults) { - Preconditions.checkNotNull(struct, "Cannot prune null struct. Pruning must start with a schema."); + Preconditions.checkNotNull( + struct, "Cannot prune null struct. Pruning must start with a schema."); Preconditions.checkArgument(current instanceof StructType, "Not a struct: %s", current); StructType requestedStruct = (StructType) current; @@ -92,13 +92,13 @@ public Type struct(Types.StructType struct, Iterable fieldResults) { } else if (field.isOptional()) { changed = true; - projectedFields.put(field.name(), - Types.NestedField.optional(field.fieldId(), field.name(), type)); + projectedFields.put( + field.name(), Types.NestedField.optional(field.fieldId(), field.name(), type)); } else { changed = true; - projectedFields.put(field.name(), - Types.NestedField.required(field.fieldId(), field.name(), type)); + projectedFields.put( + field.name(), Types.NestedField.required(field.fieldId(), field.name(), type)); } } @@ -145,8 +145,10 @@ public Type field(Types.NestedField field, Supplier fieldResult) { int fieldIndex = requestedStruct.fieldIndex(field.name()); StructField requestedField = requestedStruct.fields()[fieldIndex]; - Preconditions.checkArgument(requestedField.nullable() || field.isRequired(), - "Cannot project an optional field as non-null: %s", field.name()); + Preconditions.checkArgument( + requestedField.nullable() || field.isRequired(), + "Cannot project an optional field as non-null: %s", + field.name()); this.current = requestedField.dataType(); try { @@ -164,8 +166,10 @@ public Type list(Types.ListType list, Supplier elementResult) { Preconditions.checkArgument(current instanceof ArrayType, "Not an array: %s", current); ArrayType requestedArray = (ArrayType) current; - Preconditions.checkArgument(requestedArray.containsNull() || !list.isElementOptional(), - "Cannot project an array of optional elements as required elements: %s", requestedArray); + Preconditions.checkArgument( + requestedArray.containsNull() || !list.isElementOptional(), + "Cannot project an array of optional elements as required elements: %s", + requestedArray); this.current = requestedArray.elementType(); try { @@ -190,10 +194,14 @@ public Type map(Types.MapType map, Supplier keyResult, Supplier valu Preconditions.checkArgument(current instanceof MapType, "Not a map: %s", current); MapType requestedMap = (MapType) current; - Preconditions.checkArgument(requestedMap.valueContainsNull() || !map.isValueOptional(), - "Cannot project a map of optional values as required values: %s", map); - Preconditions.checkArgument(StringType.class.isInstance(requestedMap.keyType()), - "Invalid map key type (not string): %s", requestedMap.keyType()); + Preconditions.checkArgument( + requestedMap.valueContainsNull() || !map.isValueOptional(), + "Cannot project a map of optional values as required values: %s", + map); + Preconditions.checkArgument( + StringType.class.isInstance(requestedMap.keyType()), + "Invalid map key type (not string): %s", + requestedMap.keyType()); this.current = requestedMap.valueType(); try { @@ -215,23 +223,32 @@ public Type map(Types.MapType map, Supplier keyResult, Supplier valu @Override public Type primitive(Type.PrimitiveType primitive) { Class expectedType = TYPES.get(primitive.typeId()); - Preconditions.checkArgument(expectedType != null && expectedType.isInstance(current), - 
"Cannot project %s to incompatible type: %s", primitive, current); + Preconditions.checkArgument( + expectedType != null && expectedType.isInstance(current), + "Cannot project %s to incompatible type: %s", + primitive, + current); // additional checks based on type switch (primitive.typeId()) { case DECIMAL: Types.DecimalType decimal = (Types.DecimalType) primitive; DecimalType requestedDecimal = (DecimalType) current; - Preconditions.checkArgument(requestedDecimal.scale() == decimal.scale(), - "Cannot project decimal with incompatible scale: %s != %s", requestedDecimal.scale(), decimal.scale()); - Preconditions.checkArgument(requestedDecimal.precision() >= decimal.precision(), + Preconditions.checkArgument( + requestedDecimal.scale() == decimal.scale(), + "Cannot project decimal with incompatible scale: %s != %s", + requestedDecimal.scale(), + decimal.scale()); + Preconditions.checkArgument( + requestedDecimal.precision() >= decimal.precision(), "Cannot project decimal with incompatible precision: %s < %s", - requestedDecimal.precision(), decimal.precision()); + requestedDecimal.precision(), + decimal.precision()); break; case TIMESTAMP: Types.TimestampType timestamp = (Types.TimestampType) primitive; - Preconditions.checkArgument(timestamp.shouldAdjustToUTC(), + Preconditions.checkArgument( + timestamp.shouldAdjustToUTC(), "Cannot project timestamp (without time zone) as timestamptz (with time zone)"); break; default: @@ -240,19 +257,19 @@ public Type primitive(Type.PrimitiveType primitive) { return primitive; } - private static final ImmutableMap> TYPES = ImmutableMap - .>builder() - .put(TypeID.BOOLEAN, BooleanType.class) - .put(TypeID.INTEGER, IntegerType.class) - .put(TypeID.LONG, LongType.class) - .put(TypeID.FLOAT, FloatType.class) - .put(TypeID.DOUBLE, DoubleType.class) - .put(TypeID.DATE, DateType.class) - .put(TypeID.TIMESTAMP, TimestampType.class) - .put(TypeID.DECIMAL, DecimalType.class) - .put(TypeID.UUID, StringType.class) - .put(TypeID.STRING, StringType.class) - .put(TypeID.FIXED, BinaryType.class) - .put(TypeID.BINARY, BinaryType.class) - .build(); + private static final ImmutableMap> TYPES = + ImmutableMap.>builder() + .put(TypeID.BOOLEAN, BooleanType.class) + .put(TypeID.INTEGER, IntegerType.class) + .put(TypeID.LONG, LongType.class) + .put(TypeID.FLOAT, FloatType.class) + .put(TypeID.DOUBLE, DoubleType.class) + .put(TypeID.DATE, DateType.class) + .put(TypeID.TIMESTAMP, TimestampType.class) + .put(TypeID.DECIMAL, DecimalType.class) + .put(TypeID.UUID, StringType.class) + .put(TypeID.STRING, StringType.class) + .put(TypeID.FIXED, BinaryType.class) + .put(TypeID.BINARY, BinaryType.class) + .build(); } diff --git a/spark/v3.3/spark/src/main/java/org/apache/iceberg/spark/PruneColumnsWithoutReordering.java b/spark/v3.3/spark/src/main/java/org/apache/iceberg/spark/PruneColumnsWithoutReordering.java index c6984e2fe8cd..61a215b938c5 100644 --- a/spark/v3.3/spark/src/main/java/org/apache/iceberg/spark/PruneColumnsWithoutReordering.java +++ b/spark/v3.3/spark/src/main/java/org/apache/iceberg/spark/PruneColumnsWithoutReordering.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark; import java.util.List; @@ -68,7 +67,8 @@ public Type schema(Schema schema, Supplier structResult) { @Override public Type struct(Types.StructType struct, Iterable fieldResults) { - Preconditions.checkNotNull(struct, "Cannot prune null struct. 
Pruning must start with a schema."); + Preconditions.checkNotNull( + struct, "Cannot prune null struct. Pruning must start with a schema."); Preconditions.checkArgument(current instanceof StructType, "Not a struct: %s", current); List fields = struct.fields(); @@ -120,8 +120,10 @@ public Type field(Types.NestedField field, Supplier fieldResult) { int fieldIndex = requestedStruct.fieldIndex(field.name()); StructField requestedField = requestedStruct.fields()[fieldIndex]; - Preconditions.checkArgument(requestedField.nullable() || field.isRequired(), - "Cannot project an optional field as non-null: %s", field.name()); + Preconditions.checkArgument( + requestedField.nullable() || field.isRequired(), + "Cannot project an optional field as non-null: %s", + field.name()); this.current = requestedField.dataType(); try { @@ -139,8 +141,10 @@ public Type list(Types.ListType list, Supplier elementResult) { Preconditions.checkArgument(current instanceof ArrayType, "Not an array: %s", current); ArrayType requestedArray = (ArrayType) current; - Preconditions.checkArgument(requestedArray.containsNull() || !list.isElementOptional(), - "Cannot project an array of optional elements as required elements: %s", requestedArray); + Preconditions.checkArgument( + requestedArray.containsNull() || !list.isElementOptional(), + "Cannot project an array of optional elements as required elements: %s", + requestedArray); this.current = requestedArray.elementType(); try { @@ -165,8 +169,10 @@ public Type map(Types.MapType map, Supplier keyResult, Supplier valu Preconditions.checkArgument(current instanceof MapType, "Not a map: %s", current); MapType requestedMap = (MapType) current; - Preconditions.checkArgument(requestedMap.valueContainsNull() || !map.isValueOptional(), - "Cannot project a map of optional values as required values: %s", map); + Preconditions.checkArgument( + requestedMap.valueContainsNull() || !map.isValueOptional(), + "Cannot project a map of optional values as required values: %s", + map); this.current = requestedMap.valueType(); try { @@ -188,19 +194,27 @@ public Type map(Types.MapType map, Supplier keyResult, Supplier valu @Override public Type primitive(Type.PrimitiveType primitive) { Class expectedType = TYPES.get(primitive.typeId()); - Preconditions.checkArgument(expectedType != null && expectedType.isInstance(current), - "Cannot project %s to incompatible type: %s", primitive, current); + Preconditions.checkArgument( + expectedType != null && expectedType.isInstance(current), + "Cannot project %s to incompatible type: %s", + primitive, + current); // additional checks based on type switch (primitive.typeId()) { case DECIMAL: Types.DecimalType decimal = (Types.DecimalType) primitive; DecimalType requestedDecimal = (DecimalType) current; - Preconditions.checkArgument(requestedDecimal.scale() == decimal.scale(), - "Cannot project decimal with incompatible scale: %s != %s", requestedDecimal.scale(), decimal.scale()); - Preconditions.checkArgument(requestedDecimal.precision() >= decimal.precision(), + Preconditions.checkArgument( + requestedDecimal.scale() == decimal.scale(), + "Cannot project decimal with incompatible scale: %s != %s", + requestedDecimal.scale(), + decimal.scale()); + Preconditions.checkArgument( + requestedDecimal.precision() >= decimal.precision(), "Cannot project decimal with incompatible precision: %s < %s", - requestedDecimal.precision(), decimal.precision()); + requestedDecimal.precision(), + decimal.precision()); break; default: } @@ -208,19 +222,19 @@ public Type 
primitive(Type.PrimitiveType primitive) { return primitive; } - private static final ImmutableMap> TYPES = ImmutableMap - .>builder() - .put(TypeID.BOOLEAN, BooleanType.class) - .put(TypeID.INTEGER, IntegerType.class) - .put(TypeID.LONG, LongType.class) - .put(TypeID.FLOAT, FloatType.class) - .put(TypeID.DOUBLE, DoubleType.class) - .put(TypeID.DATE, DateType.class) - .put(TypeID.TIMESTAMP, TimestampType.class) - .put(TypeID.DECIMAL, DecimalType.class) - .put(TypeID.UUID, StringType.class) - .put(TypeID.STRING, StringType.class) - .put(TypeID.FIXED, BinaryType.class) - .put(TypeID.BINARY, BinaryType.class) - .build(); + private static final ImmutableMap> TYPES = + ImmutableMap.>builder() + .put(TypeID.BOOLEAN, BooleanType.class) + .put(TypeID.INTEGER, IntegerType.class) + .put(TypeID.LONG, LongType.class) + .put(TypeID.FLOAT, FloatType.class) + .put(TypeID.DOUBLE, DoubleType.class) + .put(TypeID.DATE, DateType.class) + .put(TypeID.TIMESTAMP, TimestampType.class) + .put(TypeID.DECIMAL, DecimalType.class) + .put(TypeID.UUID, StringType.class) + .put(TypeID.STRING, StringType.class) + .put(TypeID.FIXED, BinaryType.class) + .put(TypeID.BINARY, BinaryType.class) + .build(); } diff --git a/spark/v3.3/spark/src/main/java/org/apache/iceberg/spark/RollbackStagedTable.java b/spark/v3.3/spark/src/main/java/org/apache/iceberg/spark/RollbackStagedTable.java index a27d06e7a1d7..bc8a966488ee 100644 --- a/spark/v3.3/spark/src/main/java/org/apache/iceberg/spark/RollbackStagedTable.java +++ b/spark/v3.3/spark/src/main/java/org/apache/iceberg/spark/RollbackStagedTable.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark; import java.util.Map; @@ -41,22 +40,25 @@ /** * An implementation of StagedTable that mimics the behavior of Spark's non-atomic CTAS and RTAS. - *
<p>
    - * A Spark catalog can implement StagingTableCatalog to support atomic operations by producing StagedTable. But if a - * catalog implements StagingTableCatalog, Spark expects the catalog to be able to produce a StagedTable for any table - * loaded by the catalog. This assumption doesn't always work, as in the case of {@link SparkSessionCatalog}, which - * supports atomic operations can produce a StagedTable for Iceberg tables, but wraps the session catalog and cannot - * necessarily produce a working StagedTable implementation for tables that it loads. - *
<p>
    - * The work-around is this class, which implements the StagedTable interface but does not have atomic behavior. Instead, - * the StagedTable interface is used to implement the behavior of the non-atomic SQL plans that will create a table, - * write, and will drop the table to roll back. - *
<p>
    - * This StagedTable implements SupportsRead, SupportsWrite, and SupportsDelete by passing the calls to the real table. - * Implementing those interfaces is safe because Spark will not use them unless the table supports them and returns the - * corresponding capabilities from {@link #capabilities()}. + * + *
<p>
    A Spark catalog can implement StagingTableCatalog to support atomic operations by producing + * StagedTable. But if a catalog implements StagingTableCatalog, Spark expects the catalog to be + * able to produce a StagedTable for any table loaded by the catalog. This assumption doesn't always + * work, as in the case of {@link SparkSessionCatalog}, which supports atomic operations can produce + * a StagedTable for Iceberg tables, but wraps the session catalog and cannot necessarily produce a + * working StagedTable implementation for tables that it loads. + * + *
<p>
    The work-around is this class, which implements the StagedTable interface but does not have + * atomic behavior. Instead, the StagedTable interface is used to implement the behavior of the + * non-atomic SQL plans that will create a table, write, and will drop the table to roll back. + * + *
<p>
    This StagedTable implements SupportsRead, SupportsWrite, and SupportsDelete by passing the + * calls to the real table. Implementing those interfaces is safe because Spark will not use them + * unless the table supports them and returns the corresponding capabilities from {@link + * #capabilities()}. */ -public class RollbackStagedTable implements StagedTable, SupportsRead, SupportsWrite, SupportsDelete { +public class RollbackStagedTable + implements StagedTable, SupportsRead, SupportsWrite, SupportsDelete { private final TableCatalog catalog; private final Identifier ident; private final Table table; @@ -119,19 +121,22 @@ public WriteBuilder newWriteBuilder(LogicalWriteInfo info) { } private void call(Class requiredClass, Consumer task) { - callReturning(requiredClass, inst -> { - task.accept(inst); - return null; - }); + callReturning( + requiredClass, + inst -> { + task.accept(inst); + return null; + }); } private R callReturning(Class requiredClass, Function task) { if (requiredClass.isInstance(table)) { return task.apply(requiredClass.cast(table)); } else { - throw new UnsupportedOperationException(String.format( - "Table does not implement %s: %s (%s)", - requiredClass.getSimpleName(), table.name(), table.getClass().getName())); + throw new UnsupportedOperationException( + String.format( + "Table does not implement %s: %s (%s)", + requiredClass.getSimpleName(), table.name(), table.getClass().getName())); } } } diff --git a/spark/v3.3/spark/src/main/java/org/apache/iceberg/spark/SortOrderToSpark.java b/spark/v3.3/spark/src/main/java/org/apache/iceberg/spark/SortOrderToSpark.java index 5677f83a95ee..52d68db2e4f9 100644 --- a/spark/v3.3/spark/src/main/java/org/apache/iceberg/spark/SortOrderToSpark.java +++ b/spark/v3.3/spark/src/main/java/org/apache/iceberg/spark/SortOrderToSpark.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.spark; import java.util.Map; @@ -38,46 +37,57 @@ class SortOrderToSpark implements SortOrderVisitor { @Override public SortOrder field(String sourceName, int id, SortDirection direction, NullOrder nullOrder) { - return Expressions.sort(Expressions.column(quotedName(id)), toSpark(direction), toSpark(nullOrder)); + return Expressions.sort( + Expressions.column(quotedName(id)), toSpark(direction), toSpark(nullOrder)); } @Override - public SortOrder bucket(String sourceName, int id, int width, SortDirection direction, NullOrder nullOrder) { - return Expressions.sort(Expressions.bucket(width, quotedName(id)), toSpark(direction), toSpark(nullOrder)); + public SortOrder bucket( + String sourceName, int id, int width, SortDirection direction, NullOrder nullOrder) { + return Expressions.sort( + Expressions.bucket(width, quotedName(id)), toSpark(direction), toSpark(nullOrder)); } @Override - public SortOrder truncate(String sourceName, int id, int width, SortDirection direction, NullOrder nullOrder) { - return Expressions.sort(Expressions.apply( - "truncate", Expressions.column(quotedName(id)), Expressions.literal(width)), - toSpark(direction), toSpark(nullOrder)); + public SortOrder truncate( + String sourceName, int id, int width, SortDirection direction, NullOrder nullOrder) { + return Expressions.sort( + Expressions.apply( + "truncate", Expressions.column(quotedName(id)), Expressions.literal(width)), + toSpark(direction), + toSpark(nullOrder)); } @Override public SortOrder year(String sourceName, int id, SortDirection direction, NullOrder nullOrder) { - return Expressions.sort(Expressions.years(quotedName(id)), toSpark(direction), toSpark(nullOrder)); + return Expressions.sort( + Expressions.years(quotedName(id)), toSpark(direction), toSpark(nullOrder)); } @Override public SortOrder month(String sourceName, int id, SortDirection direction, NullOrder nullOrder) { - return Expressions.sort(Expressions.months(quotedName(id)), toSpark(direction), toSpark(nullOrder)); + return Expressions.sort( + Expressions.months(quotedName(id)), toSpark(direction), toSpark(nullOrder)); } @Override public SortOrder day(String sourceName, int id, SortDirection direction, NullOrder nullOrder) { - return Expressions.sort(Expressions.days(quotedName(id)), toSpark(direction), toSpark(nullOrder)); + return Expressions.sort( + Expressions.days(quotedName(id)), toSpark(direction), toSpark(nullOrder)); } @Override public SortOrder hour(String sourceName, int id, SortDirection direction, NullOrder nullOrder) { - return Expressions.sort(Expressions.hours(quotedName(id)), toSpark(direction), toSpark(nullOrder)); + return Expressions.sort( + Expressions.hours(quotedName(id)), toSpark(direction), toSpark(nullOrder)); } private String quotedName(int id) { return quotedNameById.get(id); } - private org.apache.spark.sql.connector.expressions.SortDirection toSpark(SortDirection direction) { + private org.apache.spark.sql.connector.expressions.SortDirection toSpark( + SortDirection direction) { if (direction == SortDirection.ASC) { return org.apache.spark.sql.connector.expressions.SortDirection.ASCENDING; } else { @@ -89,4 +99,3 @@ private NullOrdering toSpark(NullOrder nullOrder) { return nullOrder == NullOrder.NULLS_FIRST ? 
NullOrdering.NULLS_FIRST : NullOrdering.NULLS_LAST; } } - diff --git a/spark/v3.3/spark/src/main/java/org/apache/iceberg/spark/Spark3Util.java b/spark/v3.3/spark/src/main/java/org/apache/iceberg/spark/Spark3Util.java index d472987957dd..94a86edd38e0 100644 --- a/spark/v3.3/spark/src/main/java/org/apache/iceberg/spark/Spark3Util.java +++ b/spark/v3.3/spark/src/main/java/org/apache/iceberg/spark/Spark3Util.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark; import java.nio.ByteBuffer; @@ -93,14 +92,14 @@ public class Spark3Util { - private static final Set RESERVED_PROPERTIES = ImmutableSet.of( - TableCatalog.PROP_LOCATION, TableCatalog.PROP_PROVIDER); + private static final Set RESERVED_PROPERTIES = + ImmutableSet.of(TableCatalog.PROP_LOCATION, TableCatalog.PROP_PROVIDER); private static final Joiner DOT = Joiner.on("."); - private Spark3Util() { - } + private Spark3Util() {} - public static CaseInsensitiveStringMap setOption(String key, String value, CaseInsensitiveStringMap options) { + public static CaseInsensitiveStringMap setOption( + String key, String value, CaseInsensitiveStringMap options) { Map newOptions = Maps.newHashMap(); newOptions.putAll(options); newOptions.put(key, value); @@ -134,7 +133,8 @@ public static Map rebuildCreateProperties(Map cr * @param changes a list of Spark table changes * @return the UpdateProperties operation configured with the changes */ - public static UpdateProperties applyPropertyChanges(UpdateProperties pendingUpdate, List changes) { + public static UpdateProperties applyPropertyChanges( + UpdateProperties pendingUpdate, List changes) { for (TableChange change : changes) { if (change instanceof TableChange.SetProperty) { TableChange.SetProperty set = (TableChange.SetProperty) change; @@ -159,7 +159,8 @@ public static UpdateProperties applyPropertyChanges(UpdateProperties pendingUpda * @param changes a list of Spark table changes * @return the UpdateSchema operation configured with the changes */ - public static UpdateSchema applySchemaChanges(UpdateSchema pendingUpdate, List changes) { + public static UpdateSchema applySchemaChanges( + UpdateSchema pendingUpdate, List changes) { for (TableChange change : changes) { if (change instanceof TableChange.AddColumn) { apply(pendingUpdate, (TableChange.AddColumn) change); @@ -167,8 +168,11 @@ public static UpdateSchema applySchemaChanges(UpdateSchema pendingUpdate, List quotedNameById = SparkSchemaUtil.indexQuotedNameById(spec.schema()); - List transforms = PartitionSpecVisitor.visit(spec, - new PartitionSpecVisitor() { - @Override - public Transform identity(String sourceName, int sourceId) { - return Expressions.identity(quotedName(sourceId)); - } - - @Override - public Transform bucket(String sourceName, int sourceId, int numBuckets) { - return Expressions.bucket(numBuckets, quotedName(sourceId)); - } - - @Override - public Transform truncate(String sourceName, int sourceId, int width) { - return Expressions.apply("truncate", Expressions.column(quotedName(sourceId)), Expressions.literal(width)); - } - - @Override - public Transform year(String sourceName, int sourceId) { - return Expressions.years(quotedName(sourceId)); - } - - @Override - public Transform month(String sourceName, int sourceId) { - return Expressions.months(quotedName(sourceId)); - } - - @Override - public Transform day(String sourceName, int sourceId) { - return Expressions.days(quotedName(sourceId)); - } - - @Override - public Transform hour(String 
sourceName, int sourceId) { - return Expressions.hours(quotedName(sourceId)); - } - - @Override - public Transform alwaysNull(int fieldId, String sourceName, int sourceId) { - // do nothing for alwaysNull, it doesn't need to be converted to a transform - return null; - } - - @Override - public Transform unknown(int fieldId, String sourceName, int sourceId, String transform) { - return Expressions.apply(transform, Expressions.column(quotedName(sourceId))); - } - - private String quotedName(int id) { - return quotedNameById.get(id); - } - }); + List transforms = + PartitionSpecVisitor.visit( + spec, + new PartitionSpecVisitor() { + @Override + public Transform identity(String sourceName, int sourceId) { + return Expressions.identity(quotedName(sourceId)); + } + + @Override + public Transform bucket(String sourceName, int sourceId, int numBuckets) { + return Expressions.bucket(numBuckets, quotedName(sourceId)); + } + + @Override + public Transform truncate(String sourceName, int sourceId, int width) { + return Expressions.apply( + "truncate", + Expressions.column(quotedName(sourceId)), + Expressions.literal(width)); + } + + @Override + public Transform year(String sourceName, int sourceId) { + return Expressions.years(quotedName(sourceId)); + } + + @Override + public Transform month(String sourceName, int sourceId) { + return Expressions.months(quotedName(sourceId)); + } + + @Override + public Transform day(String sourceName, int sourceId) { + return Expressions.days(quotedName(sourceId)); + } + + @Override + public Transform hour(String sourceName, int sourceId) { + return Expressions.hours(quotedName(sourceId)); + } + + @Override + public Transform alwaysNull(int fieldId, String sourceName, int sourceId) { + // do nothing for alwaysNull, it doesn't need to be converted to a transform + return null; + } + + @Override + public Transform unknown( + int fieldId, String sourceName, int sourceId, String transform) { + return Expressions.apply(transform, Expressions.column(quotedName(sourceId))); + } + + private String quotedName(int id) { + return quotedNameById.get(id); + } + }); return transforms.stream().filter(Objects::nonNull).toArray(Transform[]::new); } @@ -315,8 +332,10 @@ public static NamedReference toNamedReference(String name) { public static Term toIcebergTerm(Expression expr) { if (expr instanceof Transform) { Transform transform = (Transform) expr; - Preconditions.checkArgument("zorder".equals(transform.name()) || transform.references().length == 1, - "Cannot convert transform with more than one column reference: %s", transform); + Preconditions.checkArgument( + "zorder".equals(transform.name()) || transform.references().length == 1, + "Cannot convert transform with more than one column reference: %s", + transform); String colName = DOT.join(transform.references()[0].fieldNames()); switch (transform.name().toLowerCase(Locale.ROOT)) { case "identity": @@ -336,10 +355,11 @@ public static Term toIcebergTerm(Expression expr) { case "truncate": return org.apache.iceberg.expressions.Expressions.truncate(colName, findWidth(transform)); case "zorder": - return new Zorder(Stream.of(transform.references()) - .map(ref -> DOT.join(ref.fieldNames())) - .map(org.apache.iceberg.expressions.Expressions::ref) - .collect(Collectors.toList())); + return new Zorder( + Stream.of(transform.references()) + .map(ref -> DOT.join(ref.fieldNames())) + .map(org.apache.iceberg.expressions.Expressions::ref) + .collect(Collectors.toList())); default: throw new UnsupportedOperationException("Transform is not 
supported: " + transform); } @@ -367,8 +387,10 @@ public static PartitionSpec toPartitionSpec(Schema schema, Transform[] partition PartitionSpec.Builder builder = PartitionSpec.builderFor(schema); for (Transform transform : partitioning) { - Preconditions.checkArgument(transform.references().length == 1, - "Cannot convert transform with more than one column reference: %s", transform); + Preconditions.checkArgument( + transform.references().length == 1, + "Cannot convert transform with more than one column reference: %s", + transform); String colName = DOT.join(transform.references()[0].fieldNames()); switch (transform.name().toLowerCase(Locale.ROOT)) { case "identity": @@ -408,14 +430,16 @@ private static int findWidth(Transform transform) { if (expr instanceof Literal) { if (((Literal) expr).dataType() instanceof IntegerType) { Literal lit = (Literal) expr; - Preconditions.checkArgument(lit.value() > 0, - "Unsupported width for transform: %s", transform.describe()); + Preconditions.checkArgument( + lit.value() > 0, "Unsupported width for transform: %s", transform.describe()); return lit.value(); } else if (((Literal) expr).dataType() instanceof LongType) { Literal lit = (Literal) expr; - Preconditions.checkArgument(lit.value() > 0 && lit.value() < Integer.MAX_VALUE, - "Unsupported width for transform: %s", transform.describe()); + Preconditions.checkArgument( + lit.value() > 0 && lit.value() < Integer.MAX_VALUE, + "Unsupported width for transform: %s", + transform.describe()); if (lit.value() > Integer.MAX_VALUE) { throw new IllegalArgumentException(); } @@ -428,7 +452,8 @@ private static int findWidth(Transform transform) { } private static String leafName(String[] fieldNames) { - Preconditions.checkArgument(fieldNames.length > 0, "Invalid field name: at least one name is required"); + Preconditions.checkArgument( + fieldNames.length > 0, "Invalid field name: at least one name is required"); return fieldNames[fieldNames.length - 1]; } @@ -473,8 +498,7 @@ public static class DescribeSchemaVisitor extends TypeUtil.SchemaVisitor private static final Joiner COMMA = Joiner.on(','); private static final DescribeSchemaVisitor INSTANCE = new DescribeSchemaVisitor(); - private DescribeSchemaVisitor() { - } + private DescribeSchemaVisitor() {} @Override public String schema(Schema schema, String structResult) { @@ -534,11 +558,11 @@ public String primitive(Type.PrimitiveType primitive) { } } - private static class DescribeExpressionVisitor extends ExpressionVisitors.ExpressionVisitor { + private static class DescribeExpressionVisitor + extends ExpressionVisitors.ExpressionVisitor { private static final DescribeExpressionVisitor INSTANCE = new DescribeExpressionVisitor(); - private DescribeExpressionVisitor() { - } + private DescribeExpressionVisitor() {} @Override public String alwaysTrue() { @@ -607,7 +631,9 @@ public String predicate(UnboundPredicate pred) { } private static String sqlString(List> literals) { - return literals.stream().map(DescribeExpressionVisitor::sqlString).collect(Collectors.joining(", ")); + return literals.stream() + .map(DescribeExpressionVisitor::sqlString) + .collect(Collectors.joining(", ")); } private static String sqlString(org.apache.iceberg.expressions.Literal lit) { @@ -623,11 +649,12 @@ private static String sqlString(org.apache.iceberg.expressions.Literal lit) { } /** - * Returns an Iceberg Table by its name from a Spark V2 Catalog. 
If cache is enabled in {@link SparkCatalog}, - * the {@link TableOperations} of the table may be stale, please refresh the table to get the latest one. + * Returns an Iceberg Table by its name from a Spark V2 Catalog. If cache is enabled in {@link + * SparkCatalog}, the {@link TableOperations} of the table may be stale, please refresh the table + * to get the latest one. * * @param spark SparkSession used for looking up catalog references and tables - * @param name The multipart identifier of the Iceberg table + * @param name The multipart identifier of the Iceberg table * @return an Iceberg table */ public static org.apache.iceberg.Table loadIcebergTable(SparkSession spark, String name) @@ -641,38 +668,44 @@ public static org.apache.iceberg.Table loadIcebergTable(SparkSession spark, Stri /** * Returns the underlying Iceberg Catalog object represented by a Spark Catalog + * * @param spark SparkSession used for looking up catalog reference * @param catalogName The name of the Spark Catalog being referenced * @return the Iceberg catalog class being wrapped by the Spark Catalog */ public static Catalog loadIcebergCatalog(SparkSession spark, String catalogName) { CatalogPlugin catalogPlugin = spark.sessionState().catalogManager().catalog(catalogName); - Preconditions.checkArgument(catalogPlugin instanceof HasIcebergCatalog, - String.format("Cannot load Iceberg catalog from catalog %s because it does not contain an Iceberg Catalog. " + - "Actual Class: %s", + Preconditions.checkArgument( + catalogPlugin instanceof HasIcebergCatalog, + String.format( + "Cannot load Iceberg catalog from catalog %s because it does not contain an Iceberg Catalog. " + + "Actual Class: %s", catalogName, catalogPlugin.getClass().getName())); return ((HasIcebergCatalog) catalogPlugin).icebergCatalog(); } - - public static CatalogAndIdentifier catalogAndIdentifier(SparkSession spark, String name) throws ParseException { - return catalogAndIdentifier(spark, name, spark.sessionState().catalogManager().currentCatalog()); + public static CatalogAndIdentifier catalogAndIdentifier(SparkSession spark, String name) + throws ParseException { + return catalogAndIdentifier( + spark, name, spark.sessionState().catalogManager().currentCatalog()); } - public static CatalogAndIdentifier catalogAndIdentifier(SparkSession spark, String name, - CatalogPlugin defaultCatalog) throws ParseException { + public static CatalogAndIdentifier catalogAndIdentifier( + SparkSession spark, String name, CatalogPlugin defaultCatalog) throws ParseException { ParserInterface parser = spark.sessionState().sqlParser(); Seq multiPartIdentifier = parser.parseMultipartIdentifier(name).toIndexedSeq(); List javaMultiPartIdentifier = JavaConverters.seqAsJavaList(multiPartIdentifier); return catalogAndIdentifier(spark, javaMultiPartIdentifier, defaultCatalog); } - public static CatalogAndIdentifier catalogAndIdentifier(String description, SparkSession spark, String name) { - return catalogAndIdentifier(description, spark, name, spark.sessionState().catalogManager().currentCatalog()); + public static CatalogAndIdentifier catalogAndIdentifier( + String description, SparkSession spark, String name) { + return catalogAndIdentifier( + description, spark, name, spark.sessionState().catalogManager().currentCatalog()); } - public static CatalogAndIdentifier catalogAndIdentifier(String description, SparkSession spark, - String name, CatalogPlugin defaultCatalog) { + public static CatalogAndIdentifier catalogAndIdentifier( + String description, SparkSession spark, String 
name, CatalogPlugin defaultCatalog) { try { return catalogAndIdentifier(spark, name, defaultCatalog); } catch (ParseException e) { @@ -680,20 +713,23 @@ public static CatalogAndIdentifier catalogAndIdentifier(String description, Spar } } - public static CatalogAndIdentifier catalogAndIdentifier(SparkSession spark, List nameParts) { - return catalogAndIdentifier(spark, nameParts, spark.sessionState().catalogManager().currentCatalog()); + public static CatalogAndIdentifier catalogAndIdentifier( + SparkSession spark, List nameParts) { + return catalogAndIdentifier( + spark, nameParts, spark.sessionState().catalogManager().currentCatalog()); } /** - * A modified version of Spark's LookupCatalog.CatalogAndIdentifier.unapply - * Attempts to find the catalog and identifier a multipart identifier represents + * A modified version of Spark's LookupCatalog.CatalogAndIdentifier.unapply Attempts to find the + * catalog and identifier a multipart identifier represents + * * @param spark Spark session to use for resolution * @param nameParts Multipart identifier representing a table * @param defaultCatalog Catalog to use if none is specified * @return The CatalogPlugin and Identifier for the table */ - public static CatalogAndIdentifier catalogAndIdentifier(SparkSession spark, List nameParts, - CatalogPlugin defaultCatalog) { + public static CatalogAndIdentifier catalogAndIdentifier( + SparkSession spark, List nameParts, CatalogPlugin defaultCatalog) { CatalogManager catalogManager = spark.sessionState().catalogManager(); String[] currentNamespace; @@ -703,18 +739,19 @@ public static CatalogAndIdentifier catalogAndIdentifier(SparkSession spark, List currentNamespace = defaultCatalog.defaultNamespace(); } - Pair catalogIdentifier = SparkUtil.catalogAndIdentifier(nameParts, - catalogName -> { - try { - return catalogManager.catalog(catalogName); - } catch (Exception e) { - return null; - } - }, - Identifier::of, - defaultCatalog, - currentNamespace - ); + Pair catalogIdentifier = + SparkUtil.catalogAndIdentifier( + nameParts, + catalogName -> { + try { + return catalogManager.catalog(catalogName); + } catch (Exception e) { + return null; + } + }, + Identifier::of, + defaultCatalog, + currentNamespace); return new CatalogAndIdentifier(catalogIdentifier); } @@ -723,18 +760,17 @@ private static TableCatalog asTableCatalog(CatalogPlugin catalog) { return (TableCatalog) catalog; } - throw new IllegalArgumentException(String.format( - "Cannot use catalog %s(%s): not a TableCatalog", catalog.name(), catalog.getClass().getName())); + throw new IllegalArgumentException( + String.format( + "Cannot use catalog %s(%s): not a TableCatalog", + catalog.name(), catalog.getClass().getName())); } - /** - * This mimics a class inside of Spark which is private inside of LookupCatalog. - */ + /** This mimics a class inside of Spark which is private inside of LookupCatalog. 
*/ public static class CatalogAndIdentifier { private final CatalogPlugin catalog; private final Identifier identifier; - public CatalogAndIdentifier(CatalogPlugin catalog, Identifier identifier) { this.catalog = catalog; this.identifier = identifier; @@ -767,8 +803,8 @@ public static String quotedFullIdentifier(String catalogName, Identifier identif .build(); return CatalogV2Implicits.MultipartIdentifierHelper( - JavaConverters.asScalaIteratorConverter(parts.iterator()).asScala().toSeq() - ).quoted(); + JavaConverters.asScalaIteratorConverter(parts.iterator()).asScala().toSeq()) + .quoted(); } /** @@ -780,21 +816,21 @@ public static String quotedFullIdentifier(String catalogName, Identifier identif * @param partitionFilter partitionFilter of the file * @return all table's partitions */ - public static List getPartitions(SparkSession spark, Path rootPath, String format, - Map partitionFilter) { + public static List getPartitions( + SparkSession spark, Path rootPath, String format, Map partitionFilter) { FileStatusCache fileStatusCache = FileStatusCache.getOrCreate(spark); - InMemoryFileIndex fileIndex = new InMemoryFileIndex( - spark, - JavaConverters - .collectionAsScalaIterableConverter(ImmutableList.of(rootPath)) - .asScala() - .toSeq(), + InMemoryFileIndex fileIndex = + new InMemoryFileIndex( + spark, + JavaConverters.collectionAsScalaIterableConverter(ImmutableList.of(rootPath)) + .asScala() + .toSeq(), scala.collection.immutable.Map$.MODULE$.empty(), - Option.empty(), - fileStatusCache, - Option.empty(), - Option.empty()); + Option.empty(), + fileStatusCache, + Option.empty(), + Option.empty()); org.apache.spark.sql.execution.datasources.PartitionSpec spec = fileIndex.partitionSpec(); StructType schema = spec.partitionColumns(); @@ -814,31 +850,38 @@ public static List getPartitions(SparkSession spark, Path rootPa Seq filteredPartitions = fileIndex.listFiles(scalaPartitionFilters, scalaDataFilters).toIndexedSeq(); - return JavaConverters - .seqAsJavaListConverter(filteredPartitions) - .asJava() - .stream() - .map(partition -> { - Map values = Maps.newHashMap(); - JavaConverters.asJavaIterableConverter(schema).asJava().forEach(field -> { - int fieldIndex = schema.fieldIndex(field.name()); - Object catalystValue = partition.values().get(fieldIndex, field.dataType()); - Object value = CatalystTypeConverters.convertToScala(catalystValue, field.dataType()); - values.put(field.name(), String.valueOf(value)); - }); - - FileStatus fileStatus = - JavaConverters.seqAsJavaListConverter(partition.files()).asJava().get(0); - - return new SparkPartition(values, fileStatus.getPath().getParent().toString(), format); - }).collect(Collectors.toList()); - } - - public static org.apache.spark.sql.catalyst.TableIdentifier toV1TableIdentifier(Identifier identifier) { + return JavaConverters.seqAsJavaListConverter(filteredPartitions).asJava().stream() + .map( + partition -> { + Map values = Maps.newHashMap(); + JavaConverters.asJavaIterableConverter(schema) + .asJava() + .forEach( + field -> { + int fieldIndex = schema.fieldIndex(field.name()); + Object catalystValue = partition.values().get(fieldIndex, field.dataType()); + Object value = + CatalystTypeConverters.convertToScala(catalystValue, field.dataType()); + values.put(field.name(), String.valueOf(value)); + }); + + FileStatus fileStatus = + JavaConverters.seqAsJavaListConverter(partition.files()).asJava().get(0); + + return new SparkPartition( + values, fileStatus.getPath().getParent().toString(), format); + }) + .collect(Collectors.toList()); + } + 
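// As a usage illustration for catalogAndIdentifier and loadIcebergTable above (the catalog and
// table names are hypothetical; per the javadoc, refresh the table when catalog caching may
// leave its TableOperations stale):
import org.apache.iceberg.Table;
import org.apache.iceberg.spark.Spark3Util;
import org.apache.spark.sql.SparkSession;

class Spark3UtilSketch {
  static Table loadAndRefresh(SparkSession spark) throws Exception {
    // resolve the multipart name into a CatalogPlugin plus an Identifier
    Spark3Util.CatalogAndIdentifier catalogAndIdent =
        Spark3Util.catalogAndIdentifier(spark, "my_catalog.db.events");

    // load the underlying Iceberg table through the V2 catalog and refresh it
    Table table = Spark3Util.loadIcebergTable(spark, "my_catalog.db.events");
    table.refresh();
    return table;
  }
}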
+ public static org.apache.spark.sql.catalyst.TableIdentifier toV1TableIdentifier( + Identifier identifier) { String[] namespace = identifier.namespace(); - Preconditions.checkArgument(namespace.length <= 1, - "Cannot convert %s to a Spark v1 identifier, namespace contains more than 1 part", identifier); + Preconditions.checkArgument( + namespace.length <= 1, + "Cannot convert %s to a Spark v1 identifier, namespace contains more than 1 part", + identifier); String table = identifier.name(); Option database = namespace.length == 1 ? Option.apply(namespace[0]) : Option.empty(); @@ -848,54 +891,80 @@ public static org.apache.spark.sql.catalyst.TableIdentifier toV1TableIdentifier( private static class DescribeSortOrderVisitor implements SortOrderVisitor { private static final DescribeSortOrderVisitor INSTANCE = new DescribeSortOrderVisitor(); - private DescribeSortOrderVisitor() { - } + private DescribeSortOrderVisitor() {} @Override - public String field(String sourceName, int sourceId, - org.apache.iceberg.SortDirection direction, NullOrder nullOrder) { + public String field( + String sourceName, + int sourceId, + org.apache.iceberg.SortDirection direction, + NullOrder nullOrder) { return String.format("%s %s %s", sourceName, direction, nullOrder); } @Override - public String bucket(String sourceName, int sourceId, int numBuckets, - org.apache.iceberg.SortDirection direction, NullOrder nullOrder) { + public String bucket( + String sourceName, + int sourceId, + int numBuckets, + org.apache.iceberg.SortDirection direction, + NullOrder nullOrder) { return String.format("bucket(%s, %s) %s %s", numBuckets, sourceName, direction, nullOrder); } @Override - public String truncate(String sourceName, int sourceId, int width, - org.apache.iceberg.SortDirection direction, NullOrder nullOrder) { + public String truncate( + String sourceName, + int sourceId, + int width, + org.apache.iceberg.SortDirection direction, + NullOrder nullOrder) { return String.format("truncate(%s, %s) %s %s", sourceName, width, direction, nullOrder); } @Override - public String year(String sourceName, int sourceId, - org.apache.iceberg.SortDirection direction, NullOrder nullOrder) { + public String year( + String sourceName, + int sourceId, + org.apache.iceberg.SortDirection direction, + NullOrder nullOrder) { return String.format("years(%s) %s %s", sourceName, direction, nullOrder); } @Override - public String month(String sourceName, int sourceId, - org.apache.iceberg.SortDirection direction, NullOrder nullOrder) { + public String month( + String sourceName, + int sourceId, + org.apache.iceberg.SortDirection direction, + NullOrder nullOrder) { return String.format("months(%s) %s %s", sourceName, direction, nullOrder); } @Override - public String day(String sourceName, int sourceId, - org.apache.iceberg.SortDirection direction, NullOrder nullOrder) { + public String day( + String sourceName, + int sourceId, + org.apache.iceberg.SortDirection direction, + NullOrder nullOrder) { return String.format("days(%s) %s %s", sourceName, direction, nullOrder); } @Override - public String hour(String sourceName, int sourceId, - org.apache.iceberg.SortDirection direction, NullOrder nullOrder) { + public String hour( + String sourceName, + int sourceId, + org.apache.iceberg.SortDirection direction, + NullOrder nullOrder) { return String.format("hours(%s) %s %s", sourceName, direction, nullOrder); } @Override - public String unknown(String sourceName, int sourceId, String transform, - org.apache.iceberg.SortDirection direction, NullOrder 
nullOrder) { + public String unknown( + String sourceName, + int sourceId, + String transform, + org.apache.iceberg.SortDirection direction, + NullOrder nullOrder) { return String.format("%s(%s) %s %s", transform, sourceName, direction, nullOrder); } } diff --git a/spark/v3.3/spark/src/main/java/org/apache/iceberg/spark/SparkCachedTableCatalog.java b/spark/v3.3/spark/src/main/java/org/apache/iceberg/spark/SparkCachedTableCatalog.java index dda86d0b76ad..923c54981199 100644 --- a/spark/v3.3/spark/src/main/java/org/apache/iceberg/spark/SparkCachedTableCatalog.java +++ b/spark/v3.3/spark/src/main/java/org/apache/iceberg/spark/SparkCachedTableCatalog.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark; import java.util.List; @@ -40,9 +39,7 @@ import org.apache.spark.sql.types.StructType; import org.apache.spark.sql.util.CaseInsensitiveStringMap; -/** - * An internal table catalog that is capable of loading tables from a cache. - */ +/** An internal table catalog that is capable of loading tables from a cache. */ public class SparkCachedTableCatalog implements TableCatalog { private static final String CLASS_NAME = SparkCachedTableCatalog.class.getName(); @@ -68,16 +65,16 @@ public SparkTable loadTable(Identifier ident) throws NoSuchTableException { @Override public SparkTable loadTable(Identifier ident, String version) throws NoSuchTableException { Pair table = load(ident); - Preconditions.checkArgument(table.second() == null, - "Cannot time travel based on both table identifier and AS OF"); + Preconditions.checkArgument( + table.second() == null, "Cannot time travel based on both table identifier and AS OF"); return new SparkTable(table.first(), Long.parseLong(version), false /* refresh eagerly */); } @Override public SparkTable loadTable(Identifier ident, long timestampMicros) throws NoSuchTableException { Pair table = load(ident); - Preconditions.checkArgument(table.second() == null, - "Cannot time travel based on both table identifier and AS OF"); + Preconditions.checkArgument( + table.second() == null, "Cannot time travel based on both table identifier and AS OF"); // Spark passes microseconds but Iceberg uses milliseconds for snapshots long timestampMillis = TimeUnit.MICROSECONDS.toMillis(timestampMicros); long snapshotId = SnapshotUtil.snapshotIdAsOfTime(table.first(), timestampMillis); @@ -90,8 +87,9 @@ public void invalidateTable(Identifier ident) { } @Override - public SparkTable createTable(Identifier ident, StructType schema, Transform[] partitions, - Map properties) throws TableAlreadyExistsException { + public SparkTable createTable( + Identifier ident, StructType schema, Transform[] partitions, Map properties) + throws TableAlreadyExistsException { throw new UnsupportedOperationException(CLASS_NAME + " does not support creating tables"); } @@ -126,7 +124,8 @@ public String name() { } private Pair load(Identifier ident) throws NoSuchTableException { - Preconditions.checkArgument(ident.namespace().length == 0, CLASS_NAME + " does not support namespaces"); + Preconditions.checkArgument( + ident.namespace().length == 0, CLASS_NAME + " does not support namespaces"); Pair> parsedIdent = parseIdent(ident); String key = parsedIdent.first(); @@ -147,8 +146,10 @@ private Pair load(Identifier ident) throws NoSuchTableException { } } - Preconditions.checkArgument(asOfTimestamp == null || snapshotId == null, - "Cannot specify both snapshot and timestamp for time travel: %s", ident); + 
Preconditions.checkArgument( + asOfTimestamp == null || snapshotId == null, + "Cannot specify both snapshot and timestamp for time travel: %s", + ident); Table table = TABLE_CACHE.get(key); diff --git a/spark/v3.3/spark/src/main/java/org/apache/iceberg/spark/SparkCatalog.java b/spark/v3.3/spark/src/main/java/org/apache/iceberg/spark/SparkCatalog.java index 15a97e5ad4ab..bad2aca031c8 100644 --- a/spark/v3.3/spark/src/main/java/org/apache/iceberg/spark/SparkCatalog.java +++ b/spark/v3.3/spark/src/main/java/org/apache/iceberg/spark/SparkCatalog.java @@ -16,9 +16,11 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark; +import static org.apache.iceberg.TableProperties.GC_ENABLED; +import static org.apache.iceberg.TableProperties.GC_ENABLED_DEFAULT; + import java.util.Arrays; import java.util.List; import java.util.Map; @@ -75,26 +77,24 @@ import org.apache.spark.sql.types.StructType; import org.apache.spark.sql.util.CaseInsensitiveStringMap; -import static org.apache.iceberg.TableProperties.GC_ENABLED; -import static org.apache.iceberg.TableProperties.GC_ENABLED_DEFAULT; - /** * A Spark TableCatalog implementation that wraps an Iceberg {@link Catalog}. - *
<p>
- * This supports the following catalog configuration options:
- * <ul>
- *   <li><code>type</code> - catalog type, "hive" or "hadoop".
- *       To specify a non-hive or hadoop catalog, use the <code>catalog-impl</code> option.
- *   </li>
- *   <li><code>uri</code> - the Hive Metastore URI (Hive catalog only)</li>
- *   <li><code>warehouse</code> - the warehouse path (Hadoop catalog only)</li>
- *   <li><code>catalog-impl</code> - a custom {@link Catalog} implementation to use</li>
- *   <li><code>default-namespace</code> - a namespace to use as the default</li>
- *   <li><code>cache-enabled</code> - whether to enable catalog cache</li>
- *   <li><code>cache.expiration-interval-ms</code> - interval in millis before expiring tables from catalog cache.
- *       Refer to {@link CatalogProperties#CACHE_EXPIRATION_INTERVAL_MS} for further details and significant values.
- *   </li>
- * </ul>
- * <p>
+ *
+ * <p>This supports the following catalog configuration options:
+ *
+ * <ul>
+ *   <li><code>type</code> - catalog type, "hive" or "hadoop". To specify a non-hive or hadoop
+ *       catalog, use the <code>catalog-impl</code> option.
+ *   <li><code>uri</code> - the Hive Metastore URI (Hive catalog only)
+ *   <li><code>warehouse</code> - the warehouse path (Hadoop catalog only)
+ *   <li><code>catalog-impl</code> - a custom {@link Catalog} implementation to use
+ *   <li><code>default-namespace</code> - a namespace to use as the default
+ *   <li><code>cache-enabled</code> - whether to enable catalog cache
+ *   <li><code>cache.expiration-interval-ms</code> - interval in millis before expiring tables from
+ *       catalog cache. Refer to {@link CatalogProperties#CACHE_EXPIRATION_INTERVAL_MS} for further
+ *       details and significant values.
+ * </ul>
+ *
+ * <p>
    */ public class SparkCatalog extends BaseCatalog { @@ -151,7 +151,8 @@ public SparkTable loadTable(Identifier ident) throws NoSuchTableException { public SparkTable loadTable(Identifier ident, String version) throws NoSuchTableException { try { Pair icebergTable = load(ident); - Preconditions.checkArgument(icebergTable.second() == null, + Preconditions.checkArgument( + icebergTable.second() == null, "Cannot do time-travel based on both table identifier and AS OF"); return new SparkTable(icebergTable.first(), Long.parseLong(version), !cacheEnabled); } catch (org.apache.iceberg.exceptions.NoSuchTableException e) { @@ -166,9 +167,11 @@ public SparkTable loadTable(Identifier ident, long timestamp) throws NoSuchTable // spark returns timestamp in micro seconds precision, convert it to milliseconds, // as iceberg snapshot's are stored in millisecond precision. long timestampMillis = TimeUnit.MICROSECONDS.toMillis(timestamp); - Preconditions.checkArgument(icebergTable.second() == null, + Preconditions.checkArgument( + icebergTable.second() == null, "Cannot do time-travel based on both table identifier and AS OF"); - long snapshotIdAsOfTime = SnapshotUtil.snapshotIdAsOfTime(icebergTable.first(), timestampMillis); + long snapshotIdAsOfTime = + SnapshotUtil.snapshotIdAsOfTime(icebergTable.first(), timestampMillis); return new SparkTable(icebergTable.first(), snapshotIdAsOfTime, !cacheEnabled); } catch (org.apache.iceberg.exceptions.NoSuchTableException e) { throw new NoSuchTableException(ident); @@ -176,17 +179,18 @@ public SparkTable loadTable(Identifier ident, long timestamp) throws NoSuchTable } @Override - public SparkTable createTable(Identifier ident, StructType schema, - Transform[] transforms, - Map properties) throws TableAlreadyExistsException { + public SparkTable createTable( + Identifier ident, StructType schema, Transform[] transforms, Map properties) + throws TableAlreadyExistsException { Schema icebergSchema = SparkSchemaUtil.convert(schema, useTimestampsWithoutZone); try { Catalog.TableBuilder builder = newBuilder(ident, icebergSchema); - Table icebergTable = builder - .withPartitionSpec(Spark3Util.toPartitionSpec(icebergSchema, transforms)) - .withLocation(properties.get("location")) - .withProperties(Spark3Util.rebuildCreateProperties(properties)) - .create(); + Table icebergTable = + builder + .withPartitionSpec(Spark3Util.toPartitionSpec(icebergSchema, transforms)) + .withLocation(properties.get("location")) + .withProperties(Spark3Util.rebuildCreateProperties(properties)) + .create(); return new SparkTable(icebergTable, !cacheEnabled); } catch (AlreadyExistsException e) { throw new TableAlreadyExistsException(ident); @@ -194,15 +198,18 @@ public SparkTable createTable(Identifier ident, StructType schema, } @Override - public StagedTable stageCreate(Identifier ident, StructType schema, Transform[] transforms, - Map properties) throws TableAlreadyExistsException { + public StagedTable stageCreate( + Identifier ident, StructType schema, Transform[] transforms, Map properties) + throws TableAlreadyExistsException { Schema icebergSchema = SparkSchemaUtil.convert(schema, useTimestampsWithoutZone); try { Catalog.TableBuilder builder = newBuilder(ident, icebergSchema); - Transaction transaction = builder.withPartitionSpec(Spark3Util.toPartitionSpec(icebergSchema, transforms)) - .withLocation(properties.get("location")) - .withProperties(Spark3Util.rebuildCreateProperties(properties)) - .createTransaction(); + Transaction transaction = + builder + 
.withPartitionSpec(Spark3Util.toPartitionSpec(icebergSchema, transforms)) + .withLocation(properties.get("location")) + .withProperties(Spark3Util.rebuildCreateProperties(properties)) + .createTransaction(); return new StagedSparkTable(transaction); } catch (AlreadyExistsException e) { throw new TableAlreadyExistsException(ident); @@ -210,15 +217,18 @@ public StagedTable stageCreate(Identifier ident, StructType schema, Transform[] } @Override - public StagedTable stageReplace(Identifier ident, StructType schema, Transform[] transforms, - Map properties) throws NoSuchTableException { + public StagedTable stageReplace( + Identifier ident, StructType schema, Transform[] transforms, Map properties) + throws NoSuchTableException { Schema icebergSchema = SparkSchemaUtil.convert(schema, useTimestampsWithoutZone); try { Catalog.TableBuilder builder = newBuilder(ident, icebergSchema); - Transaction transaction = builder.withPartitionSpec(Spark3Util.toPartitionSpec(icebergSchema, transforms)) - .withLocation(properties.get("location")) - .withProperties(Spark3Util.rebuildCreateProperties(properties)) - .replaceTransaction(); + Transaction transaction = + builder + .withPartitionSpec(Spark3Util.toPartitionSpec(icebergSchema, transforms)) + .withLocation(properties.get("location")) + .withProperties(Spark3Util.rebuildCreateProperties(properties)) + .replaceTransaction(); return new StagedSparkTable(transaction); } catch (org.apache.iceberg.exceptions.NoSuchTableException e) { throw new NoSuchTableException(ident); @@ -226,19 +236,22 @@ public StagedTable stageReplace(Identifier ident, StructType schema, Transform[] } @Override - public StagedTable stageCreateOrReplace(Identifier ident, StructType schema, Transform[] transforms, - Map properties) { + public StagedTable stageCreateOrReplace( + Identifier ident, StructType schema, Transform[] transforms, Map properties) { Schema icebergSchema = SparkSchemaUtil.convert(schema, useTimestampsWithoutZone); Catalog.TableBuilder builder = newBuilder(ident, icebergSchema); - Transaction transaction = builder.withPartitionSpec(Spark3Util.toPartitionSpec(icebergSchema, transforms)) - .withLocation(properties.get("location")) - .withProperties(Spark3Util.rebuildCreateProperties(properties)) - .createOrReplaceTransaction(); + Transaction transaction = + builder + .withPartitionSpec(Spark3Util.toPartitionSpec(icebergSchema, transforms)) + .withLocation(properties.get("location")) + .withProperties(Spark3Util.rebuildCreateProperties(properties)) + .createOrReplaceTransaction(); return new StagedSparkTable(transaction); } @Override - public SparkTable alterTable(Identifier ident, TableChange... changes) throws NoSuchTableException { + public SparkTable alterTable(Identifier ident, TableChange... changes) + throws NoSuchTableException { SetProperty setLocation = null; SetProperty setSnapshotId = null; SetProperty pickSnapshotId = null; @@ -255,8 +268,9 @@ public SparkTable alterTable(Identifier ident, TableChange... changes) throws No } else if ("cherry-pick-snapshot-id".equalsIgnoreCase(set.property())) { pickSnapshotId = set; } else if ("sort-order".equalsIgnoreCase(set.property())) { - throw new UnsupportedOperationException("Cannot specify the 'sort-order' because it's a reserved table " + - "property. Please use the command 'ALTER TABLE ... WRITE ORDERED BY' to specify write sort-orders."); + throw new UnsupportedOperationException( + "Cannot specify the 'sort-order' because it's a reserved table " + + "property. Please use the command 'ALTER TABLE ... 
WRITE ORDERED BY' to specify write sort-orders."); } else { propertyChanges.add(set); } @@ -271,7 +285,8 @@ public SparkTable alterTable(Identifier ident, TableChange... changes) throws No try { Table table = load(ident).first(); - commitChanges(table, setLocation, setSnapshotId, pickSnapshotId, propertyChanges, schemaChanges); + commitChanges( + table, setLocation, setSnapshotId, pickSnapshotId, propertyChanges, schemaChanges); return new SparkTable(table, true /* refreshEagerly */); } catch (org.apache.iceberg.exceptions.NoSuchTableException e) { throw new NoSuchTableException(ident); @@ -290,20 +305,19 @@ public boolean purgeTable(Identifier ident) { ValidationException.check( PropertyUtil.propertyAsBoolean(table.properties(), GC_ENABLED, GC_ENABLED_DEFAULT), "Cannot purge table: GC is disabled (deleting files may corrupt other tables)"); - String metadataFileLocation = ((HasTableOperations) table).operations().current().metadataFileLocation(); + String metadataFileLocation = + ((HasTableOperations) table).operations().current().metadataFileLocation(); boolean dropped = dropTableWithoutPurging(ident); if (dropped) { - // We should check whether the metadata file exists. Because the HadoopCatalog/HadoopTables will drop the + // We should check whether the metadata file exists. Because the HadoopCatalog/HadoopTables + // will drop the // warehouse directly and ignore the `purge` argument. boolean metadataFileExists = table.io().newInputFile(metadataFileLocation).exists(); if (metadataFileExists) { - SparkActions.get() - .deleteReachableFiles(metadataFileLocation) - .io(table.io()) - .execute(); + SparkActions.get().deleteReachableFiles(metadataFileLocation).io(table.io()).execute(); } } @@ -322,7 +336,8 @@ private boolean dropTableWithoutPurging(Identifier ident) { } @Override - public void renameTable(Identifier from, Identifier to) throws NoSuchTableException, TableAlreadyExistsException { + public void renameTable(Identifier from, Identifier to) + throws NoSuchTableException, TableAlreadyExistsException { try { checkNotPathIdentifier(from, "renameTable"); checkNotPathIdentifier(to, "renameTable"); @@ -384,7 +399,8 @@ public String[][] listNamespaces(String[] namespace) throws NoSuchNamespaceExcep } @Override - public Map loadNamespaceMetadata(String[] namespace) throws NoSuchNamespaceException { + public Map loadNamespaceMetadata(String[] namespace) + throws NoSuchNamespaceException { if (asNamespaceCatalog != null) { try { return asNamespaceCatalog.loadNamespaceMetadata(Namespace.of(namespace)); @@ -397,10 +413,12 @@ public Map loadNamespaceMetadata(String[] namespace) throws NoSu } @Override - public void createNamespace(String[] namespace, Map metadata) throws NamespaceAlreadyExistsException { + public void createNamespace(String[] namespace, Map metadata) + throws NamespaceAlreadyExistsException { if (asNamespaceCatalog != null) { try { - if (asNamespaceCatalog instanceof HadoopCatalog && DEFAULT_NS_KEYS.equals(metadata.keySet())) { + if (asNamespaceCatalog instanceof HadoopCatalog + && DEFAULT_NS_KEYS.equals(metadata.keySet())) { // Hadoop catalog will reject metadata properties, but Spark automatically adds "owner". // If only the automatic properties are present, replace metadata with an empty map. 
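Editor's note on the configuration options listed in the SparkCatalog class Javadoc above: they are supplied through Spark session properties under the catalog's name. A sketch of a typical wiring, assuming a catalog named my_catalog backed by a Hive Metastore (the catalog name, URI, namespace, and cache settings are placeholders, not values from this patch):

import org.apache.spark.sql.SparkSession;

public class SparkCatalogConfigExample {
  public static void main(String[] args) {
    SparkSession spark =
        SparkSession.builder()
            .master("local[1]")
            .appName("iceberg-catalog-config")
            // Register an Iceberg SparkCatalog named "my_catalog".
            .config("spark.sql.catalog.my_catalog", "org.apache.iceberg.spark.SparkCatalog")
            .config("spark.sql.catalog.my_catalog.type", "hive")
            .config("spark.sql.catalog.my_catalog.uri", "thrift://metastore-host:9083")
            .config("spark.sql.catalog.my_catalog.default-namespace", "db")
            .config("spark.sql.catalog.my_catalog.cache-enabled", "true")
            .config("spark.sql.catalog.my_catalog.cache.expiration-interval-ms", "60000")
            .getOrCreate();

    // The catalog can then be addressed by name in SQL.
    spark.sql("SHOW NAMESPACES IN my_catalog").show();
    spark.stop();
  }
}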
asNamespaceCatalog.createNamespace(Namespace.of(namespace), ImmutableMap.of()); @@ -411,12 +429,14 @@ public void createNamespace(String[] namespace, Map metadata) th throw new NamespaceAlreadyExistsException(namespace); } } else { - throw new UnsupportedOperationException("Namespaces are not supported by catalog: " + catalogName); + throw new UnsupportedOperationException( + "Namespaces are not supported by catalog: " + catalogName); } } @Override - public void alterNamespace(String[] namespace, NamespaceChange... changes) throws NoSuchNamespaceException { + public void alterNamespace(String[] namespace, NamespaceChange... changes) + throws NoSuchNamespaceException { if (asNamespaceCatalog != null) { Map updates = Maps.newHashMap(); Set removals = Sets.newHashSet(); @@ -427,7 +447,8 @@ public void alterNamespace(String[] namespace, NamespaceChange... changes) throw } else if (change instanceof NamespaceChange.RemoveProperty) { removals.add(((NamespaceChange.RemoveProperty) change).property()); } else { - throw new UnsupportedOperationException("Cannot apply unknown namespace change: " + change); + throw new UnsupportedOperationException( + "Cannot apply unknown namespace change: " + change); } } @@ -449,7 +470,8 @@ public void alterNamespace(String[] namespace, NamespaceChange... changes) throw } @Override - public boolean dropNamespace(String[] namespace, boolean cascade) throws NoSuchNamespaceException { + public boolean dropNamespace(String[] namespace, boolean cascade) + throws NoSuchNamespaceException { if (asNamespaceCatalog != null) { try { return asNamespaceCatalog.dropNamespace(Namespace.of(namespace)); @@ -463,12 +485,15 @@ public boolean dropNamespace(String[] namespace, boolean cascade) throws NoSuchN @Override public final void initialize(String name, CaseInsensitiveStringMap options) { - this.cacheEnabled = PropertyUtil.propertyAsBoolean(options, - CatalogProperties.CACHE_ENABLED, CatalogProperties.CACHE_ENABLED_DEFAULT); + this.cacheEnabled = + PropertyUtil.propertyAsBoolean( + options, CatalogProperties.CACHE_ENABLED, CatalogProperties.CACHE_ENABLED_DEFAULT); - long cacheExpirationIntervalMs = PropertyUtil.propertyAsLong(options, - CatalogProperties.CACHE_EXPIRATION_INTERVAL_MS, - CatalogProperties.CACHE_EXPIRATION_INTERVAL_MS_DEFAULT); + long cacheExpirationIntervalMs = + PropertyUtil.propertyAsLong( + options, + CatalogProperties.CACHE_EXPIRATION_INTERVAL_MS, + CatalogProperties.CACHE_EXPIRATION_INTERVAL_MS_DEFAULT); // An expiration interval of 0ms effectively disables caching. // Do not wrap with CachingCatalog. @@ -480,15 +505,17 @@ public final void initialize(String name, CaseInsensitiveStringMap options) { this.catalogName = name; SparkSession sparkSession = SparkSession.active(); - this.useTimestampsWithoutZone = SparkUtil.useTimestampWithoutZoneInNewTables(sparkSession.conf()); - this.tables = new HadoopTables(SparkUtil.hadoopConfCatalogOverrides(SparkSession.active(), name)); - this.icebergCatalog = cacheEnabled ? CachingCatalog.wrap(catalog, cacheExpirationIntervalMs) : catalog; + this.useTimestampsWithoutZone = + SparkUtil.useTimestampWithoutZoneInNewTables(sparkSession.conf()); + this.tables = + new HadoopTables(SparkUtil.hadoopConfCatalogOverrides(SparkSession.active(), name)); + this.icebergCatalog = + cacheEnabled ? 
CachingCatalog.wrap(catalog, cacheExpirationIntervalMs) : catalog; if (catalog instanceof SupportsNamespaces) { this.asNamespaceCatalog = (SupportsNamespaces) catalog; if (options.containsKey("default-namespace")) { - this.defaultNamespace = Splitter.on('.') - .splitToList(options.get("default-namespace")) - .toArray(new String[0]); + this.defaultNamespace = + Splitter.on('.').splitToList(options.get("default-namespace")).toArray(new String[0]); } } } @@ -498,12 +525,18 @@ public String name() { return catalogName; } - private static void commitChanges(Table table, SetProperty setLocation, SetProperty setSnapshotId, - SetProperty pickSnapshotId, List propertyChanges, - List schemaChanges) { - // don't allow setting the snapshot and picking a commit at the same time because order is ambiguous and choosing + private static void commitChanges( + Table table, + SetProperty setLocation, + SetProperty setSnapshotId, + SetProperty pickSnapshotId, + List propertyChanges, + List schemaChanges) { + // don't allow setting the snapshot and picking a commit at the same time because order is + // ambiguous and choosing // one order leads to different results - Preconditions.checkArgument(setSnapshotId == null || pickSnapshotId == null, + Preconditions.checkArgument( + setSnapshotId == null || pickSnapshotId == null, "Cannot set the current the current snapshot ID and cherry-pick snapshot changes"); if (setSnapshotId != null) { @@ -520,9 +553,7 @@ private static void commitChanges(Table table, SetProperty setLocation, SetPrope Transaction transaction = table.newTransaction(); if (setLocation != null) { - transaction.updateLocation() - .setLocation(setLocation.value()) - .commit(); + transaction.updateLocation().setLocation(setLocation.value()).commit(); } if (!propertyChanges.isEmpty()) { @@ -542,8 +573,9 @@ private static boolean isPathIdentifier(Identifier ident) { private static void checkNotPathIdentifier(Identifier identifier, String method) { if (identifier instanceof PathIdentifier) { - throw new IllegalArgumentException(String.format("Cannot pass path based identifier to %s method. %s is a path.", - method, identifier)); + throw new IllegalArgumentException( + String.format( + "Cannot pass path based identifier to %s method. %s is a path.", method, identifier)); } } @@ -560,7 +592,8 @@ private Pair load(Identifier ident) { throw e; } - // if the original load didn't work, the identifier may be extended and include a snapshot selector + // if the original load didn't work, the identifier may be extended and include a snapshot + // selector TableIdentifier namespaceAsIdent = buildIdentifier(namespaceToIdentifier(ident.namespace())); Table table; try { @@ -624,10 +657,13 @@ private Pair loadFromPathIdentifier(PathIdentifier ident) { } } - Preconditions.checkArgument(asOfTimestamp == null || snapshotId == null, - "Cannot specify both snapshot-id and as-of-timestamp: %s", ident.location()); + Preconditions.checkArgument( + asOfTimestamp == null || snapshotId == null, + "Cannot specify both snapshot-id and as-of-timestamp: %s", + ident.location()); - Table table = tables.load(parsed.first() + (metadataTableName != null ? "#" + metadataTableName : "")); + Table table = + tables.load(parsed.first() + (metadataTableName != null ? 
"#" + metadataTableName : "")); if (snapshotId != null) { return Pair.of(table, snapshotId); @@ -639,17 +675,17 @@ private Pair loadFromPathIdentifier(PathIdentifier ident) { } private Identifier namespaceToIdentifier(String[] namespace) { - Preconditions.checkArgument(namespace.length > 0, - "Cannot convert empty namespace to identifier"); + Preconditions.checkArgument( + namespace.length > 0, "Cannot convert empty namespace to identifier"); String[] ns = Arrays.copyOf(namespace, namespace.length - 1); String name = namespace[ns.length]; return Identifier.of(ns, name); } private Catalog.TableBuilder newBuilder(Identifier ident, Schema schema) { - return isPathIdentifier(ident) ? - tables.buildTable(((PathIdentifier) ident).location(), schema) : - icebergCatalog.buildTable(buildIdentifier(ident), schema); + return isPathIdentifier(ident) + ? tables.buildTable(((PathIdentifier) ident).location(), schema) + : icebergCatalog.buildTable(buildIdentifier(ident), schema); } @Override diff --git a/spark/v3.3/spark/src/main/java/org/apache/iceberg/spark/SparkConfParser.java b/spark/v3.3/spark/src/main/java/org/apache/iceberg/spark/SparkConfParser.java index f3d89467fcf7..8242e67da64b 100644 --- a/spark/v3.3/spark/src/main/java/org/apache/iceberg/spark/SparkConfParser.java +++ b/spark/v3.3/spark/src/main/java/org/apache/iceberg/spark/SparkConfParser.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark; import java.util.List; @@ -175,7 +174,8 @@ public ThisT tableProperty(String name) { protected T parse(Function conversion, T defaultValue) { if (!optionNames.isEmpty()) { for (String optionName : optionNames) { - // use lower case comparison as DataSourceOptions.asMap() in Spark 2 returns a lower case map + // use lower case comparison as DataSourceOptions.asMap() in Spark 2 returns a lower case + // map String optionValue = options.get(optionName.toLowerCase(Locale.ROOT)); if (optionValue != null) { return conversion.apply(optionValue); diff --git a/spark/v3.3/spark/src/main/java/org/apache/iceberg/spark/SparkDataFile.java b/spark/v3.3/spark/src/main/java/org/apache/iceberg/spark/SparkDataFile.java index a6390d39c575..87e831872472 100644 --- a/spark/v3.3/spark/src/main/java/org/apache/iceberg/spark/SparkDataFile.java +++ b/spark/v3.3/spark/src/main/java/org/apache/iceberg/spark/SparkDataFile.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark; import java.nio.ByteBuffer; @@ -62,10 +61,12 @@ public SparkDataFile(Types.StructType type, StructType sparkType) { this.wrappedPartition = new SparkStructLike(type.fieldType("partition").asStructType()); Map positions = Maps.newHashMap(); - type.fields().forEach(field -> { - String fieldName = field.name(); - positions.put(fieldName, fieldPosition(fieldName, sparkType)); - }); + type.fields() + .forEach( + field -> { + String fieldName = field.name(); + positions.put(fieldName, fieldPosition(fieldName, sparkType)); + }); filePathPosition = positions.get("file_path"); fileFormatPosition = positions.get("file_format"); @@ -139,23 +140,29 @@ public Map valueCounts() { @Override public Map nullValueCounts() { - return wrapped.isNullAt(nullValueCountsPosition) ? null : wrapped.getJavaMap(nullValueCountsPosition); + return wrapped.isNullAt(nullValueCountsPosition) + ? 
null + : wrapped.getJavaMap(nullValueCountsPosition); } @Override public Map nanValueCounts() { - return wrapped.isNullAt(nanValueCountsPosition) ? null : wrapped.getJavaMap(nanValueCountsPosition); + return wrapped.isNullAt(nanValueCountsPosition) + ? null + : wrapped.getJavaMap(nanValueCountsPosition); } @Override public Map lowerBounds() { - Map lowerBounds = wrapped.isNullAt(lowerBoundsPosition) ? null : wrapped.getJavaMap(lowerBoundsPosition); + Map lowerBounds = + wrapped.isNullAt(lowerBoundsPosition) ? null : wrapped.getJavaMap(lowerBoundsPosition); return convert(lowerBoundsType, lowerBounds); } @Override public Map upperBounds() { - Map upperBounds = wrapped.isNullAt(upperBoundsPosition) ? null : wrapped.getJavaMap(upperBoundsPosition); + Map upperBounds = + wrapped.isNullAt(upperBoundsPosition) ? null : wrapped.getJavaMap(upperBoundsPosition); return convert(upperBoundsType, upperBounds); } diff --git a/spark/v3.3/spark/src/main/java/org/apache/iceberg/spark/SparkDistributionAndOrderingUtil.java b/spark/v3.3/spark/src/main/java/org/apache/iceberg/spark/SparkDistributionAndOrderingUtil.java index e481e1260432..f2c8f6e26ca4 100644 --- a/spark/v3.3/spark/src/main/java/org/apache/iceberg/spark/SparkDistributionAndOrderingUtil.java +++ b/spark/v3.3/spark/src/main/java/org/apache/iceberg/spark/SparkDistributionAndOrderingUtil.java @@ -16,9 +16,11 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark; +import static org.apache.spark.sql.connector.write.RowLevelOperation.Command.DELETE; +import static org.apache.spark.sql.connector.write.RowLevelOperation.Command.UPDATE; + import java.util.List; import org.apache.iceberg.DistributionMode; import org.apache.iceberg.MetadataColumns; @@ -38,30 +40,33 @@ import org.apache.spark.sql.connector.expressions.SortOrder; import org.apache.spark.sql.connector.write.RowLevelOperation.Command; -import static org.apache.spark.sql.connector.write.RowLevelOperation.Command.DELETE; -import static org.apache.spark.sql.connector.write.RowLevelOperation.Command.UPDATE; - public class SparkDistributionAndOrderingUtil { private static final NamedReference SPEC_ID = Expressions.column(MetadataColumns.SPEC_ID.name()); - private static final NamedReference PARTITION = Expressions.column(MetadataColumns.PARTITION_COLUMN_NAME); - private static final NamedReference FILE_PATH = Expressions.column(MetadataColumns.FILE_PATH.name()); - private static final NamedReference ROW_POSITION = Expressions.column(MetadataColumns.ROW_POSITION.name()); + private static final NamedReference PARTITION = + Expressions.column(MetadataColumns.PARTITION_COLUMN_NAME); + private static final NamedReference FILE_PATH = + Expressions.column(MetadataColumns.FILE_PATH.name()); + private static final NamedReference ROW_POSITION = + Expressions.column(MetadataColumns.ROW_POSITION.name()); private static final SortOrder SPEC_ID_ORDER = Expressions.sort(SPEC_ID, SortDirection.ASCENDING); - private static final SortOrder PARTITION_ORDER = Expressions.sort(PARTITION, SortDirection.ASCENDING); - private static final SortOrder FILE_PATH_ORDER = Expressions.sort(FILE_PATH, SortDirection.ASCENDING); - private static final SortOrder ROW_POSITION_ORDER = Expressions.sort(ROW_POSITION, SortDirection.ASCENDING); - - private static final SortOrder[] EXISTING_FILE_ORDERING = new SortOrder[]{FILE_PATH_ORDER, ROW_POSITION_ORDER}; - private static final SortOrder[] POSITION_DELETE_ORDERING = new SortOrder[]{ - SPEC_ID_ORDER, PARTITION_ORDER, 
FILE_PATH_ORDER, ROW_POSITION_ORDER - }; - - private SparkDistributionAndOrderingUtil() { - } - - public static Distribution buildRequiredDistribution(Table table, DistributionMode distributionMode) { + private static final SortOrder PARTITION_ORDER = + Expressions.sort(PARTITION, SortDirection.ASCENDING); + private static final SortOrder FILE_PATH_ORDER = + Expressions.sort(FILE_PATH, SortDirection.ASCENDING); + private static final SortOrder ROW_POSITION_ORDER = + Expressions.sort(ROW_POSITION, SortDirection.ASCENDING); + + private static final SortOrder[] EXISTING_FILE_ORDERING = + new SortOrder[] {FILE_PATH_ORDER, ROW_POSITION_ORDER}; + private static final SortOrder[] POSITION_DELETE_ORDERING = + new SortOrder[] {SPEC_ID_ORDER, PARTITION_ORDER, FILE_PATH_ORDER, ROW_POSITION_ORDER}; + + private SparkDistributionAndOrderingUtil() {} + + public static Distribution buildRequiredDistribution( + Table table, DistributionMode distributionMode) { switch (distributionMode) { case NONE: return Distributions.unspecified(); @@ -86,8 +91,8 @@ public static SortOrder[] buildRequiredOrdering(Table table, Distribution distri } } - public static Distribution buildCopyOnWriteDistribution(Table table, Command command, - DistributionMode distributionMode) { + public static Distribution buildCopyOnWriteDistribution( + Table table, Command command, DistributionMode distributionMode) { if (command == DELETE || command == UPDATE) { return buildCopyOnWriteDeleteUpdateDistribution(table, distributionMode); } else { @@ -95,13 +100,14 @@ public static Distribution buildCopyOnWriteDistribution(Table table, Command com } } - private static Distribution buildCopyOnWriteDeleteUpdateDistribution(Table table, DistributionMode distributionMode) { + private static Distribution buildCopyOnWriteDeleteUpdateDistribution( + Table table, DistributionMode distributionMode) { switch (distributionMode) { case NONE: return Distributions.unspecified(); case HASH: - Expression[] clustering = new Expression[]{FILE_PATH}; + Expression[] clustering = new Expression[] {FILE_PATH}; return Distributions.clustered(clustering); case RANGE: @@ -109,7 +115,8 @@ private static Distribution buildCopyOnWriteDeleteUpdateDistribution(Table table if (table.sortOrder().isSorted()) { return Distributions.ordered(tableOrdering); } else { - SortOrder[] ordering = ObjectArrays.concat(tableOrdering, EXISTING_FILE_ORDERING, SortOrder.class); + SortOrder[] ordering = + ObjectArrays.concat(tableOrdering, EXISTING_FILE_ORDERING, SortOrder.class); return Distributions.ordered(ordering); } @@ -118,7 +125,8 @@ private static Distribution buildCopyOnWriteDeleteUpdateDistribution(Table table } } - public static SortOrder[] buildCopyOnWriteOrdering(Table table, Command command, Distribution distribution) { + public static SortOrder[] buildCopyOnWriteOrdering( + Table table, Command command, Distribution distribution) { if (command == DELETE || command == UPDATE) { return buildCopyOnWriteDeleteUpdateOrdering(table, distribution); } else { @@ -126,7 +134,8 @@ public static SortOrder[] buildCopyOnWriteOrdering(Table table, Command command, } } - private static SortOrder[] buildCopyOnWriteDeleteUpdateOrdering(Table table, Distribution distribution) { + private static SortOrder[] buildCopyOnWriteDeleteUpdateOrdering( + Table table, Distribution distribution) { if (distribution instanceof UnspecifiedDistribution) { return buildTableOrdering(table); @@ -143,12 +152,13 @@ private static SortOrder[] buildCopyOnWriteDeleteUpdateOrdering(Table table, Dis return 
orderedDistribution.ordering(); } else { - throw new IllegalArgumentException("Unexpected distribution type: " + distribution.getClass().getName()); + throw new IllegalArgumentException( + "Unexpected distribution type: " + distribution.getClass().getName()); } } - public static Distribution buildPositionDeltaDistribution(Table table, Command command, - DistributionMode distributionMode) { + public static Distribution buildPositionDeltaDistribution( + Table table, Command command, DistributionMode distributionMode) { if (command == DELETE || command == UPDATE) { return buildPositionDeleteUpdateDistribution(distributionMode); } else { @@ -156,27 +166,30 @@ public static Distribution buildPositionDeltaDistribution(Table table, Command c } } - private static Distribution buildPositionMergeDistribution(Table table, DistributionMode distributionMode) { + private static Distribution buildPositionMergeDistribution( + Table table, DistributionMode distributionMode) { switch (distributionMode) { case NONE: return Distributions.unspecified(); case HASH: if (table.spec().isUnpartitioned()) { - Expression[] clustering = new Expression[]{SPEC_ID, PARTITION, FILE_PATH}; + Expression[] clustering = new Expression[] {SPEC_ID, PARTITION, FILE_PATH}; return Distributions.clustered(clustering); } else { Distribution dataDistribution = buildRequiredDistribution(table, distributionMode); Expression[] dataClustering = ((ClusteredDistribution) dataDistribution).clustering(); - Expression[] deleteClustering = new Expression[]{SPEC_ID, PARTITION}; - Expression[] clustering = ObjectArrays.concat(deleteClustering, dataClustering, Expression.class); + Expression[] deleteClustering = new Expression[] {SPEC_ID, PARTITION}; + Expression[] clustering = + ObjectArrays.concat(deleteClustering, dataClustering, Expression.class); return Distributions.clustered(clustering); } case RANGE: Distribution dataDistribution = buildRequiredDistribution(table, distributionMode); SortOrder[] dataOrdering = ((OrderedDistribution) dataDistribution).ordering(); - SortOrder[] deleteOrdering = new SortOrder[]{SPEC_ID_ORDER, PARTITION_ORDER, FILE_PATH_ORDER}; + SortOrder[] deleteOrdering = + new SortOrder[] {SPEC_ID_ORDER, PARTITION_ORDER, FILE_PATH_ORDER}; SortOrder[] ordering = ObjectArrays.concat(deleteOrdering, dataOrdering, SortOrder.class); return Distributions.ordered(ordering); @@ -185,17 +198,18 @@ private static Distribution buildPositionMergeDistribution(Table table, Distribu } } - private static Distribution buildPositionDeleteUpdateDistribution(DistributionMode distributionMode) { + private static Distribution buildPositionDeleteUpdateDistribution( + DistributionMode distributionMode) { switch (distributionMode) { case NONE: return Distributions.unspecified(); case HASH: - Expression[] clustering = new Expression[]{SPEC_ID, PARTITION}; + Expression[] clustering = new Expression[] {SPEC_ID, PARTITION}; return Distributions.clustered(clustering); case RANGE: - SortOrder[] ordering = new SortOrder[]{SPEC_ID_ORDER, PARTITION_ORDER, FILE_PATH_ORDER}; + SortOrder[] ordering = new SortOrder[] {SPEC_ID_ORDER, PARTITION_ORDER, FILE_PATH_ORDER}; return Distributions.ordered(ordering); default: @@ -214,7 +228,8 @@ public static SortOrder[] buildPositionDeltaOrdering(Table table, Command comman } public static SortOrder[] convert(org.apache.iceberg.SortOrder sortOrder) { - List converted = SortOrderVisitor.visit(sortOrder, new SortOrderToSpark(sortOrder.schema())); + List converted = + SortOrderVisitor.visit(sortOrder, new 
SortOrderToSpark(sortOrder.schema())); return converted.toArray(new SortOrder[0]); } diff --git a/spark/v3.3/spark/src/main/java/org/apache/iceberg/spark/SparkExceptionUtil.java b/spark/v3.3/spark/src/main/java/org/apache/iceberg/spark/SparkExceptionUtil.java index 2eb53baa688e..5c6fe3e0ff96 100644 --- a/spark/v3.3/spark/src/main/java/org/apache/iceberg/spark/SparkExceptionUtil.java +++ b/spark/v3.3/spark/src/main/java/org/apache/iceberg/spark/SparkExceptionUtil.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark; import com.google.errorprone.annotations.FormatMethod; @@ -29,8 +28,7 @@ public class SparkExceptionUtil { - private SparkExceptionUtil() { - } + private SparkExceptionUtil() {} /** * Converts checked exceptions to unchecked exceptions. @@ -41,8 +39,8 @@ private SparkExceptionUtil() { * @return unchecked exception. */ @FormatMethod - public static RuntimeException toUncheckedException(final Throwable cause, final String message, - final Object... args) { + public static RuntimeException toUncheckedException( + final Throwable cause, final String message, final Object... args) { // Parameters are required to be final to help @FormatMethod do static analysis if (cause instanceof RuntimeException) { return (RuntimeException) cause; diff --git a/spark/v3.3/spark/src/main/java/org/apache/iceberg/spark/SparkFilters.java b/spark/v3.3/spark/src/main/java/org/apache/iceberg/spark/SparkFilters.java index 50c108c0b01b..c8dd54954fd6 100644 --- a/spark/v3.3/spark/src/main/java/org/apache/iceberg/spark/SparkFilters.java +++ b/spark/v3.3/spark/src/main/java/org/apache/iceberg/spark/SparkFilters.java @@ -16,9 +16,23 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.spark; +import static org.apache.iceberg.expressions.Expressions.and; +import static org.apache.iceberg.expressions.Expressions.equal; +import static org.apache.iceberg.expressions.Expressions.greaterThan; +import static org.apache.iceberg.expressions.Expressions.greaterThanOrEqual; +import static org.apache.iceberg.expressions.Expressions.in; +import static org.apache.iceberg.expressions.Expressions.isNaN; +import static org.apache.iceberg.expressions.Expressions.isNull; +import static org.apache.iceberg.expressions.Expressions.lessThan; +import static org.apache.iceberg.expressions.Expressions.lessThanOrEqual; +import static org.apache.iceberg.expressions.Expressions.not; +import static org.apache.iceberg.expressions.Expressions.notIn; +import static org.apache.iceberg.expressions.Expressions.notNull; +import static org.apache.iceberg.expressions.Expressions.or; +import static org.apache.iceberg.expressions.Expressions.startsWith; + import java.sql.Date; import java.sql.Timestamp; import java.time.Instant; @@ -55,54 +69,39 @@ import org.apache.spark.sql.sources.Or; import org.apache.spark.sql.sources.StringStartsWith; -import static org.apache.iceberg.expressions.Expressions.and; -import static org.apache.iceberg.expressions.Expressions.equal; -import static org.apache.iceberg.expressions.Expressions.greaterThan; -import static org.apache.iceberg.expressions.Expressions.greaterThanOrEqual; -import static org.apache.iceberg.expressions.Expressions.in; -import static org.apache.iceberg.expressions.Expressions.isNaN; -import static org.apache.iceberg.expressions.Expressions.isNull; -import static org.apache.iceberg.expressions.Expressions.lessThan; -import static org.apache.iceberg.expressions.Expressions.lessThanOrEqual; -import static org.apache.iceberg.expressions.Expressions.not; -import static org.apache.iceberg.expressions.Expressions.notIn; -import static org.apache.iceberg.expressions.Expressions.notNull; -import static org.apache.iceberg.expressions.Expressions.or; -import static org.apache.iceberg.expressions.Expressions.startsWith; - public class SparkFilters { private static final Pattern BACKTICKS_PATTERN = Pattern.compile("([`])(.|$)"); - private SparkFilters() { - } + private SparkFilters() {} - private static final Map, Operation> FILTERS = ImmutableMap - ., Operation>builder() - .put(AlwaysTrue.class, Operation.TRUE) - .put(AlwaysTrue$.class, Operation.TRUE) - .put(AlwaysFalse$.class, Operation.FALSE) - .put(AlwaysFalse.class, Operation.FALSE) - .put(EqualTo.class, Operation.EQ) - .put(EqualNullSafe.class, Operation.EQ) - .put(GreaterThan.class, Operation.GT) - .put(GreaterThanOrEqual.class, Operation.GT_EQ) - .put(LessThan.class, Operation.LT) - .put(LessThanOrEqual.class, Operation.LT_EQ) - .put(In.class, Operation.IN) - .put(IsNull.class, Operation.IS_NULL) - .put(IsNotNull.class, Operation.NOT_NULL) - .put(And.class, Operation.AND) - .put(Or.class, Operation.OR) - .put(Not.class, Operation.NOT) - .put(StringStartsWith.class, Operation.STARTS_WITH) - .build(); + private static final Map, Operation> FILTERS = + ImmutableMap., Operation>builder() + .put(AlwaysTrue.class, Operation.TRUE) + .put(AlwaysTrue$.class, Operation.TRUE) + .put(AlwaysFalse$.class, Operation.FALSE) + .put(AlwaysFalse.class, Operation.FALSE) + .put(EqualTo.class, Operation.EQ) + .put(EqualNullSafe.class, Operation.EQ) + .put(GreaterThan.class, Operation.GT) + .put(GreaterThanOrEqual.class, Operation.GT_EQ) + .put(LessThan.class, Operation.LT) + 
.put(LessThanOrEqual.class, Operation.LT_EQ) + .put(In.class, Operation.IN) + .put(IsNull.class, Operation.IS_NULL) + .put(IsNotNull.class, Operation.NOT_NULL) + .put(And.class, Operation.AND) + .put(Or.class, Operation.OR) + .put(Not.class, Operation.NOT) + .put(StringStartsWith.class, Operation.STARTS_WITH) + .build(); public static Expression convert(Filter[] filters) { Expression expression = Expressions.alwaysTrue(); for (Filter filter : filters) { Expression converted = convert(filter); - Preconditions.checkArgument(converted != null, "Cannot convert filter to Iceberg: %s", filter); + Preconditions.checkArgument( + converted != null, "Cannot convert filter to Iceberg: %s", filter); expression = Expressions.and(expression, converted); } return expression; @@ -147,8 +146,8 @@ public static Expression convert(Filter filter) { if (filter instanceof EqualTo) { EqualTo eq = (EqualTo) filter; // comparison with null in normal equality is always null. this is probably a mistake. - Preconditions.checkNotNull(eq.value(), - "Expression is always false (eq is not null-safe): %s", filter); + Preconditions.checkNotNull( + eq.value(), "Expression is always false (eq is not null-safe): %s", filter); return handleEqual(unquote(eq.attribute()), eq.value()); } else { EqualNullSafe eq = (EqualNullSafe) filter; @@ -161,7 +160,8 @@ public static Expression convert(Filter filter) { case IN: In inFilter = (In) filter; - return in(unquote(inFilter.attribute()), + return in( + unquote(inFilter.attribute()), Stream.of(inFilter.values()) .filter(Objects::nonNull) .map(SparkFilters::convertLiteral) @@ -174,12 +174,15 @@ public static Expression convert(Filter filter) { if (childOp == Operation.IN) { // infer an extra notNull predicate for Spark NOT IN filters // as Iceberg expressions don't follow the 3-value SQL boolean logic - // col NOT IN (1, 2) in Spark is equivalent to notNull(col) && notIn(col, 1, 2) in Iceberg + // col NOT IN (1, 2) in Spark is equivalent to notNull(col) && notIn(col, 1, 2) in + // Iceberg In childInFilter = (In) childFilter; - Expression notIn = notIn(unquote(childInFilter.attribute()), - Stream.of(childInFilter.values()) - .map(SparkFilters::convertLiteral) - .collect(Collectors.toList())); + Expression notIn = + notIn( + unquote(childInFilter.attribute()), + Stream.of(childInFilter.values()) + .map(SparkFilters::convertLiteral) + .collect(Collectors.toList())); return and(notNull(childInFilter.attribute()), notIn); } else if (hasNoInFilter(childFilter)) { Expression child = convert(childFilter); @@ -189,30 +192,33 @@ public static Expression convert(Filter filter) { } return null; - case AND: { - And andFilter = (And) filter; - Expression left = convert(andFilter.left()); - Expression right = convert(andFilter.right()); - if (left != null && right != null) { - return and(left, right); + case AND: + { + And andFilter = (And) filter; + Expression left = convert(andFilter.left()); + Expression right = convert(andFilter.right()); + if (left != null && right != null) { + return and(left, right); + } + return null; } - return null; - } - case OR: { - Or orFilter = (Or) filter; - Expression left = convert(orFilter.left()); - Expression right = convert(orFilter.right()); - if (left != null && right != null) { - return or(left, right); + case OR: + { + Or orFilter = (Or) filter; + Expression left = convert(orFilter.left()); + Expression right = convert(orFilter.right()); + if (left != null && right != null) { + return or(left, right); + } + return null; } - return null; - } - case STARTS_WITH: 
{ - StringStartsWith stringStartsWith = (StringStartsWith) filter; - return startsWith(unquote(stringStartsWith.attribute()), stringStartsWith.value()); - } + case STARTS_WITH: + { + StringStartsWith stringStartsWith = (StringStartsWith) filter; + return startsWith(unquote(stringStartsWith.attribute()), stringStartsWith.value()); + } } } diff --git a/spark/v3.3/spark/src/main/java/org/apache/iceberg/spark/SparkFixupTimestampType.java b/spark/v3.3/spark/src/main/java/org/apache/iceberg/spark/SparkFixupTimestampType.java index d4dd53d34a97..b35213501aef 100644 --- a/spark/v3.3/spark/src/main/java/org/apache/iceberg/spark/SparkFixupTimestampType.java +++ b/spark/v3.3/spark/src/main/java/org/apache/iceberg/spark/SparkFixupTimestampType.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark; import org.apache.iceberg.Schema; @@ -27,9 +26,10 @@ /** * By default Spark type {@link org.apache.iceberg.types.Types.TimestampType} should be converted to - * {@link Types.TimestampType#withZone()} iceberg type. But we also can convert - * {@link org.apache.iceberg.types.Types.TimestampType} to {@link Types.TimestampType#withoutZone()} iceberg type - * by setting {@link SparkSQLProperties#USE_TIMESTAMP_WITHOUT_TIME_ZONE_IN_NEW_TABLES} to 'true' + * {@link Types.TimestampType#withZone()} iceberg type. But we also can convert {@link + * org.apache.iceberg.types.Types.TimestampType} to {@link Types.TimestampType#withoutZone()} + * iceberg type by setting {@link SparkSQLProperties#USE_TIMESTAMP_WITHOUT_TIME_ZONE_IN_NEW_TABLES} + * to 'true' */ class SparkFixupTimestampType extends FixupTypes { @@ -38,8 +38,8 @@ private SparkFixupTimestampType(Schema referenceSchema) { } static Schema fixup(Schema schema) { - return new Schema(TypeUtil.visit(schema, - new SparkFixupTimestampType(schema)).asStructType().fields()); + return new Schema( + TypeUtil.visit(schema, new SparkFixupTimestampType(schema)).asStructType().fields()); } @Override diff --git a/spark/v3.3/spark/src/main/java/org/apache/iceberg/spark/SparkFixupTypes.java b/spark/v3.3/spark/src/main/java/org/apache/iceberg/spark/SparkFixupTypes.java index 5508965af249..6c4ec39b20f1 100644 --- a/spark/v3.3/spark/src/main/java/org/apache/iceberg/spark/SparkFixupTypes.java +++ b/spark/v3.3/spark/src/main/java/org/apache/iceberg/spark/SparkFixupTypes.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark; import org.apache.iceberg.Schema; @@ -25,8 +24,8 @@ import org.apache.iceberg.types.TypeUtil; /** - * Some types, like binary and fixed, are converted to the same Spark type. Conversion back - * can produce only one, which may not be correct. + * Some types, like binary and fixed, are converted to the same Spark type. Conversion back can + * produce only one, which may not be correct. 
*/ class SparkFixupTypes extends FixupTypes { @@ -35,8 +34,8 @@ private SparkFixupTypes(Schema referenceSchema) { } static Schema fixup(Schema schema, Schema referenceSchema) { - return new Schema(TypeUtil.visit(schema, - new SparkFixupTypes(referenceSchema)).asStructType().fields()); + return new Schema( + TypeUtil.visit(schema, new SparkFixupTypes(referenceSchema)).asStructType().fields()); } @Override diff --git a/spark/v3.3/spark/src/main/java/org/apache/iceberg/spark/SparkReadConf.java b/spark/v3.3/spark/src/main/java/org/apache/iceberg/spark/SparkReadConf.java index c7c01758c3ee..ef262e11f02b 100644 --- a/spark/v3.3/spark/src/main/java/org/apache/iceberg/spark/SparkReadConf.java +++ b/spark/v3.3/spark/src/main/java/org/apache/iceberg/spark/SparkReadConf.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark; import java.util.Map; @@ -31,18 +30,21 @@ /** * A class for common Iceberg configs for Spark reads. - *
<p>
- * If a config is set at multiple levels, the following order of precedence is used (top to bottom):
- * <ol>
- *   <li>Read options</li>
- *   <li>Session configuration</li>
- *   <li>Table metadata</li>
- * </ol>
- * <p>
- * The most specific value is set in read options and takes precedence over all other configs.
- * If no read option is provided, this class checks the session configuration for any overrides.
- * If no applicable value is found in the session configuration, this class uses the table metadata.
- * <p>
- * Note this class is NOT meant to be serialized and sent to executors.
+ *
+ * <p>If a config is set at multiple levels, the following order of precedence is used (top to
+ * bottom):
+ *
+ * <ol>
+ *   <li>Read options
+ *   <li>Session configuration
+ *   <li>Table metadata
+ * </ol>
+ *
+ * The most specific value is set in read options and takes precedence over all other configs. If no
+ * read option is provided, this class checks the session configuration for any overrides. If no
+ * applicable value is found in the session configuration, this class uses the table metadata.
+ *
+ * <p>
    Note this class is NOT meant to be serialized and sent to executors. */ public class SparkReadConf { @@ -70,61 +72,51 @@ public boolean localityEnabled() { if (file instanceof HadoopInputFile) { String scheme = ((HadoopInputFile) file).getFileSystem().getScheme(); boolean defaultValue = LOCALITY_WHITELIST_FS.contains(scheme); - return PropertyUtil.propertyAsBoolean( - readOptions, - SparkReadOptions.LOCALITY, - defaultValue); + return PropertyUtil.propertyAsBoolean(readOptions, SparkReadOptions.LOCALITY, defaultValue); } return false; } public Long snapshotId() { - return confParser.longConf() - .option(SparkReadOptions.SNAPSHOT_ID) - .parseOptional(); + return confParser.longConf().option(SparkReadOptions.SNAPSHOT_ID).parseOptional(); } public Long asOfTimestamp() { - return confParser.longConf() - .option(SparkReadOptions.AS_OF_TIMESTAMP) - .parseOptional(); + return confParser.longConf().option(SparkReadOptions.AS_OF_TIMESTAMP).parseOptional(); } public Long startSnapshotId() { - return confParser.longConf() - .option(SparkReadOptions.START_SNAPSHOT_ID) - .parseOptional(); + return confParser.longConf().option(SparkReadOptions.START_SNAPSHOT_ID).parseOptional(); } public Long endSnapshotId() { - return confParser.longConf() - .option(SparkReadOptions.END_SNAPSHOT_ID) - .parseOptional(); + return confParser.longConf().option(SparkReadOptions.END_SNAPSHOT_ID).parseOptional(); } public String fileScanTaskSetId() { - return confParser.stringConf() - .option(SparkReadOptions.FILE_SCAN_TASK_SET_ID) - .parseOptional(); + return confParser.stringConf().option(SparkReadOptions.FILE_SCAN_TASK_SET_ID).parseOptional(); } public boolean streamingSkipDeleteSnapshots() { - return confParser.booleanConf() + return confParser + .booleanConf() .option(SparkReadOptions.STREAMING_SKIP_DELETE_SNAPSHOTS) .defaultValue(SparkReadOptions.STREAMING_SKIP_DELETE_SNAPSHOTS_DEFAULT) .parse(); } public boolean streamingSkipOverwriteSnapshots() { - return confParser.booleanConf() + return confParser + .booleanConf() .option(SparkReadOptions.STREAMING_SKIP_OVERWRITE_SNAPSHOTS) .defaultValue(SparkReadOptions.STREAMING_SKIP_OVERWRITE_SNAPSHOTS_DEFAULT) .parse(); } public boolean parquetVectorizationEnabled() { - return confParser.booleanConf() + return confParser + .booleanConf() .option(SparkReadOptions.VECTORIZATION_ENABLED) .sessionConf(SparkSQLProperties.VECTORIZATION_ENABLED) .tableProperty(TableProperties.PARQUET_VECTORIZATION_ENABLED) @@ -133,7 +125,8 @@ public boolean parquetVectorizationEnabled() { } public int parquetBatchSize() { - return confParser.intConf() + return confParser + .intConf() .option(SparkReadOptions.VECTORIZATION_BATCH_SIZE) .tableProperty(TableProperties.PARQUET_BATCH_SIZE) .defaultValue(TableProperties.PARQUET_BATCH_SIZE_DEFAULT) @@ -141,7 +134,8 @@ public int parquetBatchSize() { } public boolean orcVectorizationEnabled() { - return confParser.booleanConf() + return confParser + .booleanConf() .option(SparkReadOptions.VECTORIZATION_ENABLED) .sessionConf(SparkSQLProperties.VECTORIZATION_ENABLED) .tableProperty(TableProperties.ORC_VECTORIZATION_ENABLED) @@ -150,7 +144,8 @@ public boolean orcVectorizationEnabled() { } public int orcBatchSize() { - return confParser.intConf() + return confParser + .intConf() .option(SparkReadOptions.VECTORIZATION_BATCH_SIZE) .tableProperty(TableProperties.ORC_BATCH_SIZE) .defaultValue(TableProperties.ORC_BATCH_SIZE_DEFAULT) @@ -158,13 +153,12 @@ public int orcBatchSize() { } public Long splitSizeOption() { - return confParser.longConf() - 
.option(SparkReadOptions.SPLIT_SIZE) - .parseOptional(); + return confParser.longConf().option(SparkReadOptions.SPLIT_SIZE).parseOptional(); } public long splitSize() { - return confParser.longConf() + return confParser + .longConf() .option(SparkReadOptions.SPLIT_SIZE) .tableProperty(TableProperties.SPLIT_SIZE) .defaultValue(TableProperties.SPLIT_SIZE_DEFAULT) @@ -172,13 +166,12 @@ public long splitSize() { } public Integer splitLookbackOption() { - return confParser.intConf() - .option(SparkReadOptions.LOOKBACK) - .parseOptional(); + return confParser.intConf().option(SparkReadOptions.LOOKBACK).parseOptional(); } public int splitLookback() { - return confParser.intConf() + return confParser + .intConf() .option(SparkReadOptions.LOOKBACK) .tableProperty(TableProperties.SPLIT_LOOKBACK) .defaultValue(TableProperties.SPLIT_LOOKBACK_DEFAULT) @@ -186,13 +179,12 @@ public int splitLookback() { } public Long splitOpenFileCostOption() { - return confParser.longConf() - .option(SparkReadOptions.FILE_OPEN_COST) - .parseOptional(); + return confParser.longConf().option(SparkReadOptions.FILE_OPEN_COST).parseOptional(); } public long splitOpenFileCost() { - return confParser.longConf() + return confParser + .longConf() .option(SparkReadOptions.FILE_OPEN_COST) .tableProperty(TableProperties.SPLIT_OPEN_FILE_COST) .defaultValue(TableProperties.SPLIT_OPEN_FILE_COST_DEFAULT) @@ -201,18 +193,20 @@ public long splitOpenFileCost() { /** * Enables reading a timestamp without time zone as a timestamp with time zone. - *
<p>
- * Generally, this is not safe as a timestamp without time zone is supposed to represent the wall-clock time,
- * i.e. no matter the reader/writer timezone 3PM should always be read as 3PM,
- * but a timestamp with time zone represents instant semantics, i.e. the timestamp
- * is adjusted so that the corresponding time in the reader timezone is displayed.
- * <p>
- * When set to false (default), an exception must be thrown while reading a timestamp without time zone.
+ *
+ * <p>Generally, this is not safe as a timestamp without time zone is supposed to represent the
+ * wall-clock time, i.e. no matter the reader/writer timezone 3PM should always be read as 3PM,
+ * but a timestamp with time zone represents instant semantics, i.e. the timestamp is adjusted so
+ * that the corresponding time in the reader timezone is displayed.
+ *
+ * <p>
    When set to false (default), an exception must be thrown while reading a timestamp without + * time zone. * * @return boolean indicating if reading timestamps without timezone is allowed */ public boolean handleTimestampWithoutZone() { - return confParser.booleanConf() + return confParser + .booleanConf() .option(SparkReadOptions.HANDLE_TIMESTAMP_WITHOUT_TIMEZONE) .sessionConf(SparkSQLProperties.HANDLE_TIMESTAMP_WITHOUT_TIMEZONE) .defaultValue(SparkSQLProperties.HANDLE_TIMESTAMP_WITHOUT_TIMEZONE_DEFAULT) @@ -220,7 +214,8 @@ public boolean handleTimestampWithoutZone() { } public Long streamFromTimestamp() { - return confParser.longConf() + return confParser + .longConf() .option(SparkReadOptions.STREAM_FROM_TIMESTAMP) .defaultValue(Long.MIN_VALUE) .parse(); diff --git a/spark/v3.3/spark/src/main/java/org/apache/iceberg/spark/SparkReadOptions.java b/spark/v3.3/spark/src/main/java/org/apache/iceberg/spark/SparkReadOptions.java index 1514d68353d9..9515a48bc297 100644 --- a/spark/v3.3/spark/src/main/java/org/apache/iceberg/spark/SparkReadOptions.java +++ b/spark/v3.3/spark/src/main/java/org/apache/iceberg/spark/SparkReadOptions.java @@ -16,16 +16,12 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark; -/** - * Spark DF read options - */ +/** Spark DF read options */ public class SparkReadOptions { - private SparkReadOptions() { - } + private SparkReadOptions() {} // Snapshot ID of the table snapshot to read public static final String SNAPSHOT_ID = "snapshot-id"; @@ -62,11 +58,13 @@ private SparkReadOptions() { public static final boolean STREAMING_SKIP_DELETE_SNAPSHOTS_DEFAULT = false; // skip snapshots of type overwrite while reading stream out of iceberg table - public static final String STREAMING_SKIP_OVERWRITE_SNAPSHOTS = "streaming-skip-overwrite-snapshots"; + public static final String STREAMING_SKIP_OVERWRITE_SNAPSHOTS = + "streaming-skip-overwrite-snapshots"; public static final boolean STREAMING_SKIP_OVERWRITE_SNAPSHOTS_DEFAULT = false; // Controls whether to allow reading timestamps without zone info - public static final String HANDLE_TIMESTAMP_WITHOUT_TIMEZONE = "handle-timestamp-without-timezone"; + public static final String HANDLE_TIMESTAMP_WITHOUT_TIMEZONE = + "handle-timestamp-without-timezone"; // Controls whether to report locality information to Spark while allocating input partitions public static final String LOCALITY = "locality"; diff --git a/spark/v3.3/spark/src/main/java/org/apache/iceberg/spark/SparkSQLProperties.java b/spark/v3.3/spark/src/main/java/org/apache/iceberg/spark/SparkSQLProperties.java index f2dcc13bece0..fa8bd719f391 100644 --- a/spark/v3.3/spark/src/main/java/org/apache/iceberg/spark/SparkSQLProperties.java +++ b/spark/v3.3/spark/src/main/java/org/apache/iceberg/spark/SparkSQLProperties.java @@ -16,19 +16,18 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.spark; public class SparkSQLProperties { - private SparkSQLProperties() { - } + private SparkSQLProperties() {} // Controls whether vectorized reads are enabled public static final String VECTORIZATION_ENABLED = "spark.sql.iceberg.vectorization.enabled"; // Controls whether reading/writing timestamps without timezones is allowed - public static final String HANDLE_TIMESTAMP_WITHOUT_TIMEZONE = "spark.sql.iceberg.handle-timestamp-without-timezone"; + public static final String HANDLE_TIMESTAMP_WITHOUT_TIMEZONE = + "spark.sql.iceberg.handle-timestamp-without-timezone"; public static final boolean HANDLE_TIMESTAMP_WITHOUT_TIMEZONE_DEFAULT = false; // Controls whether timestamp types for new tables should be stored with timezone info diff --git a/spark/v3.3/spark/src/main/java/org/apache/iceberg/spark/SparkSchemaUtil.java b/spark/v3.3/spark/src/main/java/org/apache/iceberg/spark/SparkSchemaUtil.java index c242f8535206..822a5cc97ea2 100644 --- a/spark/v3.3/spark/src/main/java/org/apache/iceberg/spark/SparkSchemaUtil.java +++ b/spark/v3.3/spark/src/main/java/org/apache/iceberg/spark/SparkSchemaUtil.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark; import java.util.Collection; @@ -45,17 +44,14 @@ import org.apache.spark.sql.types.DataType; import org.apache.spark.sql.types.StructType; -/** - * Helper methods for working with Spark/Hive metadata. - */ +/** Helper methods for working with Spark/Hive metadata. */ public class SparkSchemaUtil { - private SparkSchemaUtil() { - } + private SparkSchemaUtil() {} /** * Returns a {@link Schema} for the given table with fresh field ids. - *

    - * This creates a Schema for an existing table by looking up the table's schema with Spark and + * + *

    This creates a Schema for an existing table by looking up the table's schema with Spark and * converting that schema. Spark/Hive partition columns are included in the schema. * * @param spark a Spark session @@ -70,8 +66,8 @@ public static Schema schemaForTable(SparkSession spark, String name) { /** * Returns a {@link PartitionSpec} for the given table. - *

    - * This creates a partition spec for an existing table by looking up the table's schema and + * + *

    This creates a partition spec for an existing table by looking up the table's schema and * creating a spec with identity partitions for each partition column. * * @param spark a Spark session @@ -79,14 +75,15 @@ public static Schema schemaForTable(SparkSession spark, String name) { * @return a PartitionSpec for the table * @throws AnalysisException if thrown by the Spark catalog */ - public static PartitionSpec specForTable(SparkSession spark, String name) throws AnalysisException { + public static PartitionSpec specForTable(SparkSession spark, String name) + throws AnalysisException { List parts = Lists.newArrayList(Splitter.on('.').limit(2).split(name)); String db = parts.size() == 1 ? "default" : parts.get(0); String table = parts.get(parts.size() == 1 ? 0 : 1); - PartitionSpec spec = identitySpec( - schemaForTable(spark, name), - spark.catalog().listColumns(db, table).collectAsList()); + PartitionSpec spec = + identitySpec( + schemaForTable(spark, name), spark.catalog().listColumns(db, table).collectAsList()); return spec == null ? PartitionSpec.unpartitioned() : spec; } @@ -114,13 +111,14 @@ public static DataType convert(Type type) { /** * Convert a Spark {@link StructType struct} to a {@link Schema} with new field ids. - *

    - * This conversion assigns fresh ids. - *

    - * Some data types are represented as the same Spark type. These are converted to a default type. - *

    - * To convert using a reference schema for field ids and ambiguous types, use - * {@link #convert(Schema, StructType)}. + * + *

    This conversion assigns fresh ids. + * + *

    Some data types are represented as the same Spark type. These are converted to a default + * type. + * + *

    To convert using a reference schema for field ids and ambiguous types, use {@link + * #convert(Schema, StructType)}. * * @param sparkType a Spark StructType * @return the equivalent Schema @@ -132,16 +130,18 @@ public static Schema convert(StructType sparkType) { /** * Convert a Spark {@link StructType struct} to a {@link Schema} with new field ids. - *

    - * This conversion assigns fresh ids. - *

    - * Some data types are represented as the same Spark type. These are converted to a default type. - *

    - * To convert using a reference schema for field ids and ambiguous types, use - * {@link #convert(Schema, StructType)}. + * + *

    This conversion assigns fresh ids. + * + *

    Some data types are represented as the same Spark type. These are converted to a default + * type. + * + *

    To convert using a reference schema for field ids and ambiguous types, use {@link + * #convert(Schema, StructType)}. * * @param sparkType a Spark StructType - * @param useTimestampWithoutZone boolean flag indicates that timestamp should be stored without timezone + * @param useTimestampWithoutZone boolean flag indicates that timestamp should be stored without + * timezone * @return the equivalent Schema * @throws IllegalArgumentException if the type cannot be converted */ @@ -156,13 +156,14 @@ public static Schema convert(StructType sparkType, boolean useTimestampWithoutZo /** * Convert a Spark {@link DataType struct} to a {@link Type} with new field ids. - *

    - * This conversion assigns fresh ids. - *

    - * Some data types are represented as the same Spark type. These are converted to a default type. - *

    - * To convert using a reference schema for field ids and ambiguous types, use - * {@link #convert(Schema, StructType)}. + * + *

    This conversion assigns fresh ids. + * + *

    Some data types are represented as the same Spark type. These are converted to a default + * type. + * + *

    To convert using a reference schema for field ids and ambiguous types, use {@link + * #convert(Schema, StructType)}. * * @param sparkType a Spark DataType * @return the equivalent Type @@ -174,11 +175,11 @@ public static Type convert(DataType sparkType) { /** * Convert a Spark {@link StructType struct} to a {@link Schema} based on the given schema. - *

    - * This conversion does not assign new ids; it uses ids from the base schema. - *

    - * Data types, field order, and nullability will match the spark type. This conversion may return - * a schema that is not compatible with base schema. + * + *

    This conversion does not assign new ids; it uses ids from the base schema. + * + *

    Data types, field order, and nullability will match the spark type. This conversion may + * return a schema that is not compatible with base schema. * * @param baseSchema a Schema on which conversion is based * @param sparkType a Spark StructType @@ -187,7 +188,8 @@ public static Type convert(DataType sparkType) { */ public static Schema convert(Schema baseSchema, StructType sparkType) { // convert to a type with fresh ids - Types.StructType struct = SparkTypeVisitor.visit(sparkType, new SparkTypeToType(sparkType)).asStructType(); + Types.StructType struct = + SparkTypeVisitor.visit(sparkType, new SparkTypeToType(sparkType)).asStructType(); // reassign ids to match the base schema Schema schema = TypeUtil.reassignIds(new Schema(struct.fields()), baseSchema); // fix types that can't be represented in Spark (UUID and Fixed) @@ -196,11 +198,11 @@ public static Schema convert(Schema baseSchema, StructType sparkType) { /** * Convert a Spark {@link StructType struct} to a {@link Schema} based on the given schema. - *

    - * This conversion will assign new ids for fields that are not found in the base schema. - *

    - * Data types, field order, and nullability will match the spark type. This conversion may return - * a schema that is not compatible with base schema. + * + *

    This conversion will assign new ids for fields that are not found in the base schema. + * + *

    Data types, field order, and nullability will match the spark type. This conversion may + * return a schema that is not compatible with base schema. * * @param baseSchema a Schema on which conversion is based * @param sparkType a Spark StructType @@ -209,7 +211,8 @@ public static Schema convert(Schema baseSchema, StructType sparkType) { */ public static Schema convertWithFreshIds(Schema baseSchema, StructType sparkType) { // convert to a type with fresh ids - Types.StructType struct = SparkTypeVisitor.visit(sparkType, new SparkTypeToType(sparkType)).asStructType(); + Types.StructType struct = + SparkTypeVisitor.visit(sparkType, new SparkTypeToType(sparkType)).asStructType(); // reassign ids to match the base schema Schema schema = TypeUtil.reassignOrRefreshIds(new Schema(struct.fields()), baseSchema); // fix types that can't be represented in Spark (UUID and Fixed) @@ -218,8 +221,8 @@ public static Schema convertWithFreshIds(Schema baseSchema, StructType sparkType /** * Prune columns from a {@link Schema} using a {@link StructType Spark type} projection. - *

    - * This requires that the Spark type is a projection of the Schema. Nullability and types must + * + *

    This requires that the Spark type is a projection of the Schema. Nullability and types must * match. * * @param schema a Schema @@ -228,19 +231,20 @@ public static Schema convertWithFreshIds(Schema baseSchema, StructType sparkType * @throws IllegalArgumentException if the Spark type does not match the Schema */ public static Schema prune(Schema schema, StructType requestedType) { - return new Schema(TypeUtil.visit(schema, new PruneColumnsWithoutReordering(requestedType, ImmutableSet.of())) - .asNestedType() - .asStructType() - .fields()); + return new Schema( + TypeUtil.visit(schema, new PruneColumnsWithoutReordering(requestedType, ImmutableSet.of())) + .asNestedType() + .asStructType() + .fields()); } /** * Prune columns from a {@link Schema} using a {@link StructType Spark type} projection. - *

    - * This requires that the Spark type is a projection of the Schema. Nullability and types must + * + *

    This requires that the Spark type is a projection of the Schema. Nullability and types must * match. - *

    - * The filters list of {@link Expression} is used to ensure that columns referenced by filters + * + *

    The filters list of {@link Expression} is used to ensure that columns referenced by filters * are projected. * * @param schema a Schema @@ -251,19 +255,20 @@ public static Schema prune(Schema schema, StructType requestedType) { */ public static Schema prune(Schema schema, StructType requestedType, List filters) { Set filterRefs = Binder.boundReferences(schema.asStruct(), filters, true); - return new Schema(TypeUtil.visit(schema, new PruneColumnsWithoutReordering(requestedType, filterRefs)) - .asNestedType() - .asStructType() - .fields()); + return new Schema( + TypeUtil.visit(schema, new PruneColumnsWithoutReordering(requestedType, filterRefs)) + .asNestedType() + .asStructType() + .fields()); } /** * Prune columns from a {@link Schema} using a {@link StructType Spark type} projection. - *

    - * This requires that the Spark type is a projection of the Schema. Nullability and types must + * + *

    This requires that the Spark type is a projection of the Schema. Nullability and types must * match. - *

    - * The filters list of {@link Expression} is used to ensure that columns referenced by filters + * + *

    The filters list of {@link Expression} is used to ensure that columns referenced by filters * are projected. * * @param schema a Schema @@ -272,14 +277,16 @@ public static Schema prune(Schema schema, StructType requestedType, List filterRefs = Binder.boundReferences(schema.asStruct(), Collections.singletonList(filter), caseSensitive); - return new Schema(TypeUtil.visit(schema, new PruneColumnsWithoutReordering(requestedType, filterRefs)) - .asNestedType() - .asStructType() - .fields()); + return new Schema( + TypeUtil.visit(schema, new PruneColumnsWithoutReordering(requestedType, filterRefs)) + .asNestedType() + .asStructType() + .fields()); } private static PartitionSpec identitySpec(Schema schema, Collection columns) { @@ -309,7 +316,7 @@ private static PartitionSpec identitySpec(Schema schema, List partitionN /** * Estimate approximate table size based on Spark schema and total records. * - * @param tableSchema Spark schema + * @param tableSchema Spark schema * @param totalRecords total records in the table * @return approximate size based on table schema */ @@ -328,15 +335,18 @@ public static long estimateSize(StructType tableSchema, long totalRecords) { } public static void validateMetadataColumnReferences(Schema tableSchema, Schema readSchema) { - List conflictingColumnNames = readSchema.columns().stream() - .map(Types.NestedField::name) - .filter(name -> MetadataColumns.isMetadataColumn(name) && tableSchema.findField(name) != null) - .collect(Collectors.toList()); + List conflictingColumnNames = + readSchema.columns().stream() + .map(Types.NestedField::name) + .filter( + name -> + MetadataColumns.isMetadataColumn(name) && tableSchema.findField(name) != null) + .collect(Collectors.toList()); ValidationException.check( conflictingColumnNames.isEmpty(), - "Table column names conflict with names reserved for Iceberg metadata columns: %s.\n" + - "Please, use ALTER TABLE statements to rename the conflicting table columns.", + "Table column names conflict with names reserved for Iceberg metadata columns: %s.\n" + + "Please, use ALTER TABLE statements to rename the conflicting table columns.", conflictingColumnNames); } diff --git a/spark/v3.3/spark/src/main/java/org/apache/iceberg/spark/SparkSessionCatalog.java b/spark/v3.3/spark/src/main/java/org/apache/iceberg/spark/SparkSessionCatalog.java index d9314194c7b9..ebf12cb2c22e 100644 --- a/spark/v3.3/spark/src/main/java/org/apache/iceberg/spark/SparkSessionCatalog.java +++ b/spark/v3.3/spark/src/main/java/org/apache/iceberg/spark/SparkSessionCatalog.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark; import java.util.Map; @@ -53,8 +52,8 @@ * * @param CatalogPlugin class to avoid casting to TableCatalog and SupportsNamespaces. */ -public class SparkSessionCatalog - extends BaseCatalog implements CatalogExtension { +public class SparkSessionCatalog extends BaseCatalog + implements CatalogExtension { private static final String[] DEFAULT_NAMESPACE = new String[] {"default"}; private String catalogName = null; @@ -67,8 +66,9 @@ public class SparkSessionCatalog /** * Build a {@link SparkCatalog} to be used for Iceberg operations. - *

    - * The default implementation creates a new SparkCatalog with the session catalog's name and options. + * + *

    The default implementation creates a new SparkCatalog with the session catalog's name and + * options. * * @param name catalog name * @param options catalog options @@ -101,17 +101,20 @@ public boolean namespaceExists(String[] namespace) { } @Override - public Map loadNamespaceMetadata(String[] namespace) throws NoSuchNamespaceException { + public Map loadNamespaceMetadata(String[] namespace) + throws NoSuchNamespaceException { return getSessionCatalog().loadNamespaceMetadata(namespace); } @Override - public void createNamespace(String[] namespace, Map metadata) throws NamespaceAlreadyExistsException { + public void createNamespace(String[] namespace, Map metadata) + throws NamespaceAlreadyExistsException { getSessionCatalog().createNamespace(namespace, metadata); } @Override - public void alterNamespace(String[] namespace, NamespaceChange... changes) throws NoSuchNamespaceException { + public void alterNamespace(String[] namespace, NamespaceChange... changes) + throws NoSuchNamespaceException { getSessionCatalog().alterNamespace(namespace, changes); } @@ -163,8 +166,8 @@ public void invalidateTable(Identifier ident) { } @Override - public Table createTable(Identifier ident, StructType schema, Transform[] partitions, - Map properties) + public Table createTable( + Identifier ident, StructType schema, Transform[] partitions, Map properties) throws TableAlreadyExistsException, NoSuchNamespaceException { String provider = properties.get("provider"); if (useIceberg(provider)) { @@ -176,8 +179,8 @@ public Table createTable(Identifier ident, StructType schema, Transform[] partit } @Override - public StagedTable stageCreate(Identifier ident, StructType schema, Transform[] partitions, - Map properties) + public StagedTable stageCreate( + Identifier ident, StructType schema, Transform[] partitions, Map properties) throws TableAlreadyExistsException, NoSuchNamespaceException { String provider = properties.get("provider"); TableCatalog catalog; @@ -190,14 +193,15 @@ public StagedTable stageCreate(Identifier ident, StructType schema, Transform[] catalog = getSessionCatalog(); } - // create the table with the session catalog, then wrap it in a staged table that will delete to roll back + // create the table with the session catalog, then wrap it in a staged table that will delete to + // roll back Table table = catalog.createTable(ident, schema, partitions, properties); return new RollbackStagedTable(catalog, ident, table); } @Override - public StagedTable stageReplace(Identifier ident, StructType schema, Transform[] partitions, - Map properties) + public StagedTable stageReplace( + Identifier ident, StructType schema, Transform[] partitions, Map properties) throws NoSuchNamespaceException, NoSuchTableException { String provider = properties.get("provider"); TableCatalog catalog; @@ -216,7 +220,8 @@ public StagedTable stageReplace(Identifier ident, StructType schema, Transform[] } try { - // create the table with the session catalog, then wrap it in a staged table that will delete to roll back + // create the table with the session catalog, then wrap it in a staged table that will delete + // to roll back Table table = catalog.createTable(ident, schema, partitions, properties); return new RollbackStagedTable(catalog, ident, table); @@ -227,8 +232,9 @@ public StagedTable stageReplace(Identifier ident, StructType schema, Transform[] } @Override - public StagedTable stageCreateOrReplace(Identifier ident, StructType schema, Transform[] partitions, - Map properties) throws NoSuchNamespaceException { + 
public StagedTable stageCreateOrReplace( + Identifier ident, StructType schema, Transform[] partitions, Map properties) + throws NoSuchNamespaceException { String provider = properties.get("provider"); TableCatalog catalog; if (useIceberg(provider)) { @@ -244,7 +250,8 @@ public StagedTable stageCreateOrReplace(Identifier ident, StructType schema, Tra catalog.dropTable(ident); try { - // create the table with the session catalog, then wrap it in a staged table that will delete to roll back + // create the table with the session catalog, then wrap it in a staged table that will delete + // to roll back Table sessionCatalogTable = catalog.createTable(ident, schema, partitions, properties); return new RollbackStagedTable(catalog, ident, sessionCatalogTable); @@ -265,21 +272,25 @@ public Table alterTable(Identifier ident, TableChange... changes) throws NoSuchT @Override public boolean dropTable(Identifier ident) { - // no need to check table existence to determine which catalog to use. if a table doesn't exist then both are + // no need to check table existence to determine which catalog to use. if a table doesn't exist + // then both are // required to return false. return icebergCatalog.dropTable(ident) || getSessionCatalog().dropTable(ident); } @Override public boolean purgeTable(Identifier ident) { - // no need to check table existence to determine which catalog to use. if a table doesn't exist then both are + // no need to check table existence to determine which catalog to use. if a table doesn't exist + // then both are // required to return false. return icebergCatalog.purgeTable(ident) || getSessionCatalog().purgeTable(ident); } @Override - public void renameTable(Identifier from, Identifier to) throws NoSuchTableException, TableAlreadyExistsException { - // rename is not supported by HadoopCatalog. to avoid UnsupportedOperationException for session catalog tables, + public void renameTable(Identifier from, Identifier to) + throws NoSuchTableException, TableAlreadyExistsException { + // rename is not supported by HadoopCatalog. to avoid UnsupportedOperationException for session + // catalog tables, // check table existence first to ensure that the table belongs to the Iceberg catalog. if (icebergCatalog.tableExists(from)) { icebergCatalog.renameTable(from, to); @@ -316,15 +327,18 @@ private void validateHmsUri(String catalogHmsUri) { return; } - Preconditions.checkArgument(catalogHmsUri.equals(envHmsUri), + Preconditions.checkArgument( + catalogHmsUri.equals(envHmsUri), "Inconsistent Hive metastore URIs: %s (Spark session) != %s (spark_catalog)", - envHmsUri, catalogHmsUri); + envHmsUri, + catalogHmsUri); } @Override @SuppressWarnings("unchecked") public void setDelegateCatalog(CatalogPlugin sparkSessionCatalog) { - if (sparkSessionCatalog instanceof TableCatalog && sparkSessionCatalog instanceof SupportsNamespaces) { + if (sparkSessionCatalog instanceof TableCatalog + && sparkSessionCatalog instanceof SupportsNamespaces) { this.sessionCatalog = (T) sparkSessionCatalog; } else { throw new IllegalArgumentException("Invalid session catalog: " + sparkSessionCatalog); @@ -351,14 +365,17 @@ private boolean useIceberg(String provider) { } private T getSessionCatalog() { - Preconditions.checkNotNull(sessionCatalog, "Delegated SessionCatalog is missing. " + - "Please make sure your are replacing Spark's default catalog, named 'spark_catalog'."); + Preconditions.checkNotNull( + sessionCatalog, + "Delegated SessionCatalog is missing. 
" + + "Please make sure your are replacing Spark's default catalog, named 'spark_catalog'."); return sessionCatalog; } @Override public Catalog icebergCatalog() { - Preconditions.checkArgument(icebergCatalog instanceof HasIcebergCatalog, + Preconditions.checkArgument( + icebergCatalog instanceof HasIcebergCatalog, "Cannot return underlying Iceberg Catalog, wrapped catalog does not contain an Iceberg Catalog"); return ((HasIcebergCatalog) icebergCatalog).icebergCatalog(); } diff --git a/spark/v3.3/spark/src/main/java/org/apache/iceberg/spark/SparkStructLike.java b/spark/v3.3/spark/src/main/java/org/apache/iceberg/spark/SparkStructLike.java index 30509e3381dc..77cfa0f34c63 100644 --- a/spark/v3.3/spark/src/main/java/org/apache/iceberg/spark/SparkStructLike.java +++ b/spark/v3.3/spark/src/main/java/org/apache/iceberg/spark/SparkStructLike.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark; import org.apache.iceberg.StructLike; diff --git a/spark/v3.3/spark/src/main/java/org/apache/iceberg/spark/SparkTableCache.java b/spark/v3.3/spark/src/main/java/org/apache/iceberg/spark/SparkTableCache.java index ec587c529f41..6218423db491 100644 --- a/spark/v3.3/spark/src/main/java/org/apache/iceberg/spark/SparkTableCache.java +++ b/spark/v3.3/spark/src/main/java/org/apache/iceberg/spark/SparkTableCache.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark; import java.util.Map; diff --git a/spark/v3.3/spark/src/main/java/org/apache/iceberg/spark/SparkTableUtil.java b/spark/v3.3/spark/src/main/java/org/apache/iceberg/spark/SparkTableUtil.java index 2928d6664480..c38a394c5a25 100644 --- a/spark/v3.3/spark/src/main/java/org/apache/iceberg/spark/SparkTableUtil.java +++ b/spark/v3.3/spark/src/main/java/org/apache/iceberg/spark/SparkTableUtil.java @@ -16,9 +16,10 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark; +import static org.apache.spark.sql.functions.col; + import java.io.IOException; import java.io.Serializable; import java.net.URI; @@ -96,28 +97,25 @@ import scala.collection.mutable.Builder; import scala.runtime.AbstractPartialFunction; -import static org.apache.spark.sql.functions.col; - /** * Java version of the original SparkTableUtil.scala * https://github.com/apache/iceberg/blob/apache-iceberg-0.8.0-incubating/spark/src/main/scala/org/apache/iceberg/spark/SparkTableUtil.scala */ public class SparkTableUtil { - private static final String DUPLICATE_FILE_MESSAGE = "Cannot complete import because data files " + - "to be imported already exist within the target table: %s. " + - "This is disabled by default as Iceberg is not designed for multiple references to the same file" + - " within the same table. If you are sure, you may set 'check_duplicate_files' to false to force the import."; + private static final String DUPLICATE_FILE_MESSAGE = + "Cannot complete import because data files " + + "to be imported already exist within the target table: %s. " + + "This is disabled by default as Iceberg is not designed for multiple references to the same file" + + " within the same table. If you are sure, you may set 'check_duplicate_files' to false to force the import."; - - private SparkTableUtil() { - } + private SparkTableUtil() {} /** * Returns a DataFrame with a row for each partition in the table. 
* - * The DataFrame has 3 columns, partition key (a=1/b=2), partition location, and format - * (avro or parquet). + * <p>
    The DataFrame has 3 columns, partition key (a=1/b=2), partition location, and format (avro + * or parquet). * * @param spark a Spark session * @param table a table name and (optional) database @@ -125,7 +123,9 @@ private SparkTableUtil() { */ public static Dataset partitionDF(SparkSession spark, String table) { List partitions = getPartitions(spark, table); - return spark.createDataFrame(partitions, SparkPartition.class).toDF("partition", "uri", "format"); + return spark + .createDataFrame(partitions, SparkPartition.class) + .toDF("partition", "uri", "format"); } /** @@ -136,9 +136,12 @@ public static Dataset partitionDF(SparkSession spark, String table) { * @param expression The expression whose matching partitions are returned. * @return a DataFrame of the table partitions. */ - public static Dataset partitionDFByFilter(SparkSession spark, String table, String expression) { + public static Dataset partitionDFByFilter( + SparkSession spark, String table, String expression) { List partitions = getPartitionsByFilter(spark, table, expression); - return spark.createDataFrame(partitions, SparkPartition.class).toDF("partition", "uri", "format"); + return spark + .createDataFrame(partitions, SparkPartition.class) + .toDF("partition", "uri", "format"); } /** @@ -153,7 +156,8 @@ public static List getPartitions(SparkSession spark, String tabl TableIdentifier tableIdent = spark.sessionState().sqlParser().parseTableIdentifier(table); return getPartitions(spark, tableIdent, null); } catch (ParseException e) { - throw SparkExceptionUtil.toUncheckedException(e, "Unable to parse table identifier: %s", table); + throw SparkExceptionUtil.toUncheckedException( + e, "Unable to parse table identifier: %s", table); } } @@ -165,8 +169,8 @@ public static List getPartitions(SparkSession spark, String tabl * @param partitionFilter partition filter, or null if no filter * @return all table's partitions */ - public static List getPartitions(SparkSession spark, TableIdentifier tableIdent, - Map partitionFilter) { + public static List getPartitions( + SparkSession spark, TableIdentifier tableIdent, Map partitionFilter) { try { SessionCatalog catalog = spark.sessionState().catalog(); CatalogTable catalogTable = catalog.getTableMetadata(tableIdent); @@ -180,17 +184,17 @@ public static List getPartitions(SparkSession spark, TableIdenti } else { scalaPartitionFilter = Option.empty(); } - Seq partitions = catalog.listPartitions(tableIdent, scalaPartitionFilter).toIndexedSeq(); - return JavaConverters - .seqAsJavaListConverter(partitions) - .asJava() - .stream() + Seq partitions = + catalog.listPartitions(tableIdent, scalaPartitionFilter).toIndexedSeq(); + return JavaConverters.seqAsJavaListConverter(partitions).asJava().stream() .map(catalogPartition -> toSparkPartition(catalogPartition, catalogTable)) .collect(Collectors.toList()); } catch (NoSuchDatabaseException e) { - throw SparkExceptionUtil.toUncheckedException(e, "Unknown table: %s. Database not found in catalog.", tableIdent); + throw SparkExceptionUtil.toUncheckedException( + e, "Unknown table: %s. Database not found in catalog.", tableIdent); } catch (NoSuchTableException e) { - throw SparkExceptionUtil.toUncheckedException(e, "Unknown table: %s. Table not found in catalog.", tableIdent); + throw SparkExceptionUtil.toUncheckedException( + e, "Unknown table: %s. 
Table not found in catalog.", tableIdent); } } @@ -202,19 +206,22 @@ public static List getPartitions(SparkSession spark, TableIdenti * @param predicate a predicate on partition columns * @return matching table's partitions */ - public static List getPartitionsByFilter(SparkSession spark, String table, String predicate) { + public static List getPartitionsByFilter( + SparkSession spark, String table, String predicate) { TableIdentifier tableIdent; try { tableIdent = spark.sessionState().sqlParser().parseTableIdentifier(table); } catch (ParseException e) { - throw SparkExceptionUtil.toUncheckedException(e, "Unable to parse the table identifier: %s", table); + throw SparkExceptionUtil.toUncheckedException( + e, "Unable to parse the table identifier: %s", table); } Expression unresolvedPredicateExpr; try { unresolvedPredicateExpr = spark.sessionState().sqlParser().parseExpression(predicate); } catch (ParseException e) { - throw SparkExceptionUtil.toUncheckedException(e, "Unable to parse the predicate expression: %s", predicate); + throw SparkExceptionUtil.toUncheckedException( + e, "Unable to parse the predicate expression: %s", predicate); } Expression resolvedPredicateExpr = resolveAttrs(spark, table, unresolvedPredicateExpr); @@ -229,8 +236,8 @@ public static List getPartitionsByFilter(SparkSession spark, Str * @param predicateExpr a predicate expression on partition columns * @return matching table's partitions */ - public static List getPartitionsByFilter(SparkSession spark, TableIdentifier tableIdent, - Expression predicateExpr) { + public static List getPartitionsByFilter( + SparkSession spark, TableIdentifier tableIdent, Expression predicateExpr) { try { SessionCatalog catalog = spark.sessionState().catalog(); CatalogTable catalogTable = catalog.getTableMetadata(tableIdent); @@ -241,111 +248,131 @@ public static List getPartitionsByFilter(SparkSession spark, Tab } else { resolvedPredicateExpr = predicateExpr; } - Seq predicates = JavaConverters - .collectionAsScalaIterableConverter(ImmutableList.of(resolvedPredicateExpr)) - .asScala().toIndexedSeq(); + Seq predicates = + JavaConverters.collectionAsScalaIterableConverter(ImmutableList.of(resolvedPredicateExpr)) + .asScala() + .toIndexedSeq(); - Seq partitions = catalog.listPartitionsByFilter(tableIdent, predicates).toIndexedSeq(); + Seq partitions = + catalog.listPartitionsByFilter(tableIdent, predicates).toIndexedSeq(); - return JavaConverters - .seqAsJavaListConverter(partitions) - .asJava() - .stream() + return JavaConverters.seqAsJavaListConverter(partitions).asJava().stream() .map(catalogPartition -> toSparkPartition(catalogPartition, catalogTable)) .collect(Collectors.toList()); } catch (NoSuchDatabaseException e) { - throw SparkExceptionUtil.toUncheckedException(e, "Unknown table: %s. Database not found in catalog.", tableIdent); + throw SparkExceptionUtil.toUncheckedException( + e, "Unknown table: %s. Database not found in catalog.", tableIdent); } catch (NoSuchTableException e) { - throw SparkExceptionUtil.toUncheckedException(e, "Unknown table: %s. Table not found in catalog.", tableIdent); + throw SparkExceptionUtil.toUncheckedException( + e, "Unknown table: %s. Table not found in catalog.", tableIdent); } } /** * Returns the data files in a partition by listing the partition location. * - * For Parquet and ORC partitions, this will read metrics from the file footer. For Avro partitions, - * metrics are set to null. + *

    For Parquet and ORC partitions, this will read metrics from the file footer. For Avro + * partitions, metrics are set to null. * * @param partition a partition * @param conf a serializable Hadoop conf * @param metricsConfig a metrics conf * @return a List of DataFile - * @deprecated use {@link TableMigrationUtil#listPartition(Map, String, String, PartitionSpec, Configuration, - * MetricsConfig, NameMapping)} + * @deprecated use {@link TableMigrationUtil#listPartition(Map, String, String, PartitionSpec, + * Configuration, MetricsConfig, NameMapping)} */ @Deprecated - public static List listPartition(SparkPartition partition, PartitionSpec spec, - SerializableConfiguration conf, MetricsConfig metricsConfig) { + public static List listPartition( + SparkPartition partition, + PartitionSpec spec, + SerializableConfiguration conf, + MetricsConfig metricsConfig) { return listPartition(partition, spec, conf, metricsConfig, null); } /** * Returns the data files in a partition by listing the partition location. * - * For Parquet and ORC partitions, this will read metrics from the file footer. For Avro partitions, - * metrics are set to null. + *

    For Parquet and ORC partitions, this will read metrics from the file footer. For Avro + * partitions, metrics are set to null. * * @param partition a partition * @param conf a serializable Hadoop conf * @param metricsConfig a metrics conf * @param mapping a name mapping * @return a List of DataFile - * @deprecated use {@link TableMigrationUtil#listPartition(Map, String, String, PartitionSpec, Configuration, - * MetricsConfig, NameMapping)} + * @deprecated use {@link TableMigrationUtil#listPartition(Map, String, String, PartitionSpec, + * Configuration, MetricsConfig, NameMapping)} */ @Deprecated - public static List listPartition(SparkPartition partition, PartitionSpec spec, - SerializableConfiguration conf, MetricsConfig metricsConfig, - NameMapping mapping) { - return TableMigrationUtil.listPartition(partition.values, partition.uri, partition.format, spec, conf.get(), - metricsConfig, mapping); + public static List listPartition( + SparkPartition partition, + PartitionSpec spec, + SerializableConfiguration conf, + MetricsConfig metricsConfig, + NameMapping mapping) { + return TableMigrationUtil.listPartition( + partition.values, + partition.uri, + partition.format, + spec, + conf.get(), + metricsConfig, + mapping); } - - private static SparkPartition toSparkPartition(CatalogTablePartition partition, CatalogTable table) { + private static SparkPartition toSparkPartition( + CatalogTablePartition partition, CatalogTable table) { Option locationUri = partition.storage().locationUri(); Option serde = partition.storage().serde(); Preconditions.checkArgument(locationUri.nonEmpty(), "Partition URI should be defined"); - Preconditions.checkArgument(serde.nonEmpty() || table.provider().nonEmpty(), - "Partition format should be defined"); + Preconditions.checkArgument( + serde.nonEmpty() || table.provider().nonEmpty(), "Partition format should be defined"); String uri = Util.uriToString(locationUri.get()); String format = serde.nonEmpty() ? 
serde.get() : table.provider().get(); - Map partitionSpec = JavaConverters.mapAsJavaMapConverter(partition.spec()).asJava(); + Map partitionSpec = + JavaConverters.mapAsJavaMapConverter(partition.spec()).asJava(); return new SparkPartition(partitionSpec, uri, format); } private static Expression resolveAttrs(SparkSession spark, String table, Expression expr) { Function2 resolver = spark.sessionState().analyzer().resolver(); LogicalPlan plan = spark.table(table).queryExecution().analyzed(); - return expr.transform(new AbstractPartialFunction() { - @Override - public Expression apply(Expression attr) { - UnresolvedAttribute unresolvedAttribute = (UnresolvedAttribute) attr; - Option namedExpressionOption = plan.resolve(unresolvedAttribute.nameParts(), resolver); - if (namedExpressionOption.isDefined()) { - return (Expression) namedExpressionOption.get(); - } else { - throw new IllegalArgumentException( - String.format("Could not resolve %s using columns: %s", attr, plan.output())); - } - } - - @Override - public boolean isDefinedAt(Expression attr) { - return attr instanceof UnresolvedAttribute; - } - }); + return expr.transform( + new AbstractPartialFunction() { + @Override + public Expression apply(Expression attr) { + UnresolvedAttribute unresolvedAttribute = (UnresolvedAttribute) attr; + Option namedExpressionOption = + plan.resolve(unresolvedAttribute.nameParts(), resolver); + if (namedExpressionOption.isDefined()) { + return (Expression) namedExpressionOption.get(); + } else { + throw new IllegalArgumentException( + String.format("Could not resolve %s using columns: %s", attr, plan.output())); + } + } + + @Override + public boolean isDefinedAt(Expression attr) { + return attr instanceof UnresolvedAttribute; + } + }); } - private static Iterator buildManifest(SerializableConfiguration conf, PartitionSpec spec, - String basePath, Iterator> fileTuples) { + private static Iterator buildManifest( + SerializableConfiguration conf, + PartitionSpec spec, + String basePath, + Iterator> fileTuples) { if (fileTuples.hasNext()) { FileIO io = new HadoopFileIO(conf.get()); TaskContext ctx = TaskContext.get(); - String suffix = String.format("stage-%d-task-%d-manifest", ctx.stageId(), ctx.taskAttemptId()); + String suffix = + String.format("stage-%d-task-%d-manifest", ctx.stageId(), ctx.taskAttemptId()); Path location = new Path(basePath, suffix); String outputPath = FileFormat.AVRO.addExtension(location.toString()); OutputFile outputFile = io.newOutputFile(outputPath); @@ -354,7 +381,8 @@ private static Iterator buildManifest(SerializableConfiguration co try (ManifestWriter writerRef = writer) { fileTuples.forEachRemaining(fileTuple -> writerRef.add(fileTuple._2)); } catch (IOException e) { - throw SparkExceptionUtil.toUncheckedException(e, "Unable to close the manifest writer: %s", outputPath); + throw SparkExceptionUtil.toUncheckedException( + e, "Unable to close the manifest writer: %s", outputPath); } ManifestFile manifestFile = writer.toManifestFile(); @@ -367,42 +395,54 @@ private static Iterator buildManifest(SerializableConfiguration co /** * Import files from an existing Spark table to an Iceberg table. * - * The import uses the Spark session to get table metadata. It assumes no - * operation is going on the original and target table and thus is not - * thread-safe. + *

    The import uses the Spark session to get table metadata. It assumes no operation is going on + * the original and target table and thus is not thread-safe. * * @param spark a Spark session * @param sourceTableIdent an identifier of the source Spark table * @param targetTable an Iceberg table where to import the data * @param stagingDir a staging directory to store temporary manifest files - * @param partitionFilter only import partitions whose values match those in the map, can be partially defined + * @param partitionFilter only import partitions whose values match those in the map, can be + * partially defined * @param checkDuplicateFiles if true, throw exception if import results in a duplicate data file */ - public static void importSparkTable(SparkSession spark, TableIdentifier sourceTableIdent, Table targetTable, - String stagingDir, Map partitionFilter, - boolean checkDuplicateFiles) { + public static void importSparkTable( + SparkSession spark, + TableIdentifier sourceTableIdent, + Table targetTable, + String stagingDir, + Map partitionFilter, + boolean checkDuplicateFiles) { SessionCatalog catalog = spark.sessionState().catalog(); - String db = sourceTableIdent.database().nonEmpty() ? - sourceTableIdent.database().get() : - catalog.getCurrentDatabase(); - TableIdentifier sourceTableIdentWithDB = new TableIdentifier(sourceTableIdent.table(), Some.apply(db)); + String db = + sourceTableIdent.database().nonEmpty() + ? sourceTableIdent.database().get() + : catalog.getCurrentDatabase(); + TableIdentifier sourceTableIdentWithDB = + new TableIdentifier(sourceTableIdent.table(), Some.apply(db)); if (!catalog.tableExists(sourceTableIdentWithDB)) { - throw new org.apache.iceberg.exceptions.NoSuchTableException("Table %s does not exist", sourceTableIdentWithDB); + throw new org.apache.iceberg.exceptions.NoSuchTableException( + "Table %s does not exist", sourceTableIdentWithDB); } try { - PartitionSpec spec = SparkSchemaUtil.specForTable(spark, sourceTableIdentWithDB.unquotedString()); + PartitionSpec spec = + SparkSchemaUtil.specForTable(spark, sourceTableIdentWithDB.unquotedString()); if (Objects.equal(spec, PartitionSpec.unpartitioned())) { - importUnpartitionedSparkTable(spark, sourceTableIdentWithDB, targetTable, checkDuplicateFiles); + importUnpartitionedSparkTable( + spark, sourceTableIdentWithDB, targetTable, checkDuplicateFiles); } else { - List sourceTablePartitions = getPartitions(spark, sourceTableIdent, - partitionFilter); - Preconditions.checkArgument(!sourceTablePartitions.isEmpty(), - "Cannot find any partitions in table %s", sourceTableIdent); - importSparkPartitions(spark, sourceTablePartitions, targetTable, spec, stagingDir, checkDuplicateFiles); + List sourceTablePartitions = + getPartitions(spark, sourceTableIdent, partitionFilter); + Preconditions.checkArgument( + !sourceTablePartitions.isEmpty(), + "Cannot find any partitions in table %s", + sourceTableIdent); + importSparkPartitions( + spark, sourceTablePartitions, targetTable, spec, stagingDir, checkDuplicateFiles); } } catch (AnalysisException e) { throw SparkExceptionUtil.toUncheckedException( @@ -413,9 +453,8 @@ public static void importSparkTable(SparkSession spark, TableIdentifier sourceTa /** * Import files from an existing Spark table to an Iceberg table. * - * The import uses the Spark session to get table metadata. It assumes no - * operation is going on the original and target table and thus is not - * thread-safe. + *

    The import uses the Spark session to get table metadata. It assumes no operation is going on + * the original and target table and thus is not thread-safe. * * @param spark a Spark session * @param sourceTableIdent an identifier of the source Spark table @@ -423,33 +462,49 @@ public static void importSparkTable(SparkSession spark, TableIdentifier sourceTa * @param stagingDir a staging directory to store temporary manifest files * @param checkDuplicateFiles if true, throw exception if import results in a duplicate data file */ - public static void importSparkTable(SparkSession spark, TableIdentifier sourceTableIdent, Table targetTable, - String stagingDir, boolean checkDuplicateFiles) { - importSparkTable(spark, sourceTableIdent, targetTable, stagingDir, Collections.emptyMap(), checkDuplicateFiles); + public static void importSparkTable( + SparkSession spark, + TableIdentifier sourceTableIdent, + Table targetTable, + String stagingDir, + boolean checkDuplicateFiles) { + importSparkTable( + spark, + sourceTableIdent, + targetTable, + stagingDir, + Collections.emptyMap(), + checkDuplicateFiles); } /** * Import files from an existing Spark table to an Iceberg table. * - * The import uses the Spark session to get table metadata. It assumes no - * operation is going on the original and target table and thus is not - * thread-safe. + *

    The import uses the Spark session to get table metadata. It assumes no operation is going on + * the original and target table and thus is not thread-safe. + * * @param spark a Spark session * @param sourceTableIdent an identifier of the source Spark table * @param targetTable an Iceberg table where to import the data * @param stagingDir a staging directory to store temporary manifest files */ - public static void importSparkTable(SparkSession spark, TableIdentifier sourceTableIdent, Table targetTable, - String stagingDir) { - importSparkTable(spark, sourceTableIdent, targetTable, stagingDir, Collections.emptyMap(), false); + public static void importSparkTable( + SparkSession spark, TableIdentifier sourceTableIdent, Table targetTable, String stagingDir) { + importSparkTable( + spark, sourceTableIdent, targetTable, stagingDir, Collections.emptyMap(), false); } - private static void importUnpartitionedSparkTable(SparkSession spark, TableIdentifier sourceTableIdent, - Table targetTable, boolean checkDuplicateFiles) { + private static void importUnpartitionedSparkTable( + SparkSession spark, + TableIdentifier sourceTableIdent, + Table targetTable, + boolean checkDuplicateFiles) { try { CatalogTable sourceTable = spark.sessionState().catalog().getTableMetadata(sourceTableIdent); Option format = - sourceTable.storage().serde().nonEmpty() ? sourceTable.storage().serde() : sourceTable.provider(); + sourceTable.storage().serde().nonEmpty() + ? sourceTable.storage().serde() + : sourceTable.provider(); Preconditions.checkArgument(format.nonEmpty(), "Could not determine table format"); Map partition = Collections.emptyMap(); @@ -457,20 +512,34 @@ private static void importUnpartitionedSparkTable(SparkSession spark, TableIdent Configuration conf = spark.sessionState().newHadoopConf(); MetricsConfig metricsConfig = MetricsConfig.forTable(targetTable); String nameMappingString = targetTable.properties().get(TableProperties.DEFAULT_NAME_MAPPING); - NameMapping nameMapping = nameMappingString != null ? NameMappingParser.fromJson(nameMappingString) : null; - - List files = TableMigrationUtil.listPartition( - partition, Util.uriToString(sourceTable.location()), format.get(), spec, conf, metricsConfig, nameMapping); + NameMapping nameMapping = + nameMappingString != null ? 
NameMappingParser.fromJson(nameMappingString) : null; + + List files = + TableMigrationUtil.listPartition( + partition, + Util.uriToString(sourceTable.location()), + format.get(), + spec, + conf, + metricsConfig, + nameMapping); if (checkDuplicateFiles) { - Dataset importedFiles = spark.createDataset( - Lists.transform(files, f -> f.path().toString()), Encoders.STRING()).toDF("file_path"); - Dataset existingFiles = loadMetadataTable(spark, targetTable, MetadataTableType.ENTRIES); - Column joinCond = existingFiles.col("data_file.file_path").equalTo(importedFiles.col("file_path")); - Dataset duplicates = importedFiles.join(existingFiles, joinCond) - .select("file_path").as(Encoders.STRING()); - Preconditions.checkState(duplicates.isEmpty(), - String.format(DUPLICATE_FILE_MESSAGE, Joiner.on(",").join((String[]) duplicates.take(10)))); + Dataset importedFiles = + spark + .createDataset(Lists.transform(files, f -> f.path().toString()), Encoders.STRING()) + .toDF("file_path"); + Dataset existingFiles = + loadMetadataTable(spark, targetTable, MetadataTableType.ENTRIES); + Column joinCond = + existingFiles.col("data_file.file_path").equalTo(importedFiles.col("file_path")); + Dataset duplicates = + importedFiles.join(existingFiles, joinCond).select("file_path").as(Encoders.STRING()); + Preconditions.checkState( + duplicates.isEmpty(), + String.format( + DUPLICATE_FILE_MESSAGE, Joiner.on(",").join((String[]) duplicates.take(10)))); } AppendFiles append = targetTable.newAppend(); @@ -495,57 +564,75 @@ private static void importUnpartitionedSparkTable(SparkSession spark, TableIdent * @param stagingDir a staging directory to store temporary manifest files * @param checkDuplicateFiles if true, throw exception if import results in a duplicate data file */ - public static void importSparkPartitions(SparkSession spark, List partitions, Table targetTable, - PartitionSpec spec, String stagingDir, boolean checkDuplicateFiles) { + public static void importSparkPartitions( + SparkSession spark, + List partitions, + Table targetTable, + PartitionSpec spec, + String stagingDir, + boolean checkDuplicateFiles) { Configuration conf = spark.sessionState().newHadoopConf(); SerializableConfiguration serializableConf = new SerializableConfiguration(conf); - int parallelism = Math.min(partitions.size(), spark.sessionState().conf().parallelPartitionDiscoveryParallelism()); + int parallelism = + Math.min( + partitions.size(), spark.sessionState().conf().parallelPartitionDiscoveryParallelism()); int numShufflePartitions = spark.sessionState().conf().numShufflePartitions(); MetricsConfig metricsConfig = MetricsConfig.fromProperties(targetTable.properties()); String nameMappingString = targetTable.properties().get(TableProperties.DEFAULT_NAME_MAPPING); - NameMapping nameMapping = nameMappingString != null ? NameMappingParser.fromJson(nameMappingString) : null; + NameMapping nameMapping = + nameMappingString != null ? 
NameMappingParser.fromJson(nameMappingString) : null; JavaSparkContext sparkContext = JavaSparkContext.fromSparkContext(spark.sparkContext()); JavaRDD partitionRDD = sparkContext.parallelize(partitions, parallelism); - Dataset partitionDS = spark.createDataset( - partitionRDD.rdd(), - Encoders.javaSerialization(SparkPartition.class)); + Dataset partitionDS = + spark.createDataset(partitionRDD.rdd(), Encoders.javaSerialization(SparkPartition.class)); - Dataset filesToImport = partitionDS - .flatMap((FlatMapFunction) sparkPartition -> - listPartition(sparkPartition, spec, serializableConf, metricsConfig, nameMapping).iterator(), + Dataset filesToImport = + partitionDS.flatMap( + (FlatMapFunction) + sparkPartition -> + listPartition( + sparkPartition, spec, serializableConf, metricsConfig, nameMapping) + .iterator(), Encoders.javaSerialization(DataFile.class)); if (checkDuplicateFiles) { - Dataset importedFiles = filesToImport - .map((MapFunction) f -> f.path().toString(), Encoders.STRING()) - .toDF("file_path"); + Dataset importedFiles = + filesToImport + .map((MapFunction) f -> f.path().toString(), Encoders.STRING()) + .toDF("file_path"); Dataset existingFiles = loadMetadataTable(spark, targetTable, MetadataTableType.ENTRIES); - Column joinCond = existingFiles.col("data_file.file_path").equalTo(importedFiles.col("file_path")); - Dataset duplicates = importedFiles.join(existingFiles, joinCond) - .select("file_path").as(Encoders.STRING()); - Preconditions.checkState(duplicates.isEmpty(), - String.format(DUPLICATE_FILE_MESSAGE, Joiner.on(",").join((String[]) duplicates.take(10)))); + Column joinCond = + existingFiles.col("data_file.file_path").equalTo(importedFiles.col("file_path")); + Dataset duplicates = + importedFiles.join(existingFiles, joinCond).select("file_path").as(Encoders.STRING()); + Preconditions.checkState( + duplicates.isEmpty(), + String.format( + DUPLICATE_FILE_MESSAGE, Joiner.on(",").join((String[]) duplicates.take(10)))); } - List manifests = filesToImport - .repartition(numShufflePartitions) - .map((MapFunction>) file -> - Tuple2.apply(file.path().toString(), file), - Encoders.tuple(Encoders.STRING(), Encoders.javaSerialization(DataFile.class))) - .orderBy(col("_1")) - .mapPartitions( - (MapPartitionsFunction, ManifestFile>) fileTuple -> - buildManifest(serializableConf, spec, stagingDir, fileTuple), - Encoders.javaSerialization(ManifestFile.class)) - .collectAsList(); + List manifests = + filesToImport + .repartition(numShufflePartitions) + .map( + (MapFunction>) + file -> Tuple2.apply(file.path().toString(), file), + Encoders.tuple(Encoders.STRING(), Encoders.javaSerialization(DataFile.class))) + .orderBy(col("_1")) + .mapPartitions( + (MapPartitionsFunction, ManifestFile>) + fileTuple -> buildManifest(serializableConf, spec, stagingDir, fileTuple), + Encoders.javaSerialization(ManifestFile.class)) + .collectAsList(); try { - boolean snapshotIdInheritanceEnabled = PropertyUtil.propertyAsBoolean( - targetTable.properties(), - TableProperties.SNAPSHOT_ID_INHERITANCE_ENABLED, - TableProperties.SNAPSHOT_ID_INHERITANCE_ENABLED_DEFAULT); + boolean snapshotIdInheritanceEnabled = + PropertyUtil.propertyAsBoolean( + targetTable.properties(), + TableProperties.SNAPSHOT_ID_INHERITANCE_ENABLED, + TableProperties.SNAPSHOT_ID_INHERITANCE_ENABLED_DEFAULT); AppendFiles append = targetTable.newAppend(); manifests.forEach(append::appendManifest); @@ -570,13 +657,17 @@ public static void importSparkPartitions(SparkSession spark, List partitions, Table targetTable, - PartitionSpec spec, 
String stagingDir) { + public static void importSparkPartitions( + SparkSession spark, + List partitions, + Table targetTable, + PartitionSpec spec, + String stagingDir) { importSparkPartitions(spark, partitions, targetTable, spec, stagingDir, false); } - public static List filterPartitions(List partitions, - Map partitionFilter) { + public static List filterPartitions( + List partitions, Map partitionFilter) { if (partitionFilter.isEmpty()) { return partitions; } else { @@ -597,28 +688,30 @@ private static void deleteManifests(FileIO io, List manifests) { /** * Loads a metadata table. * - * @deprecated since 0.14.0, will be removed in 0.15.0; - * use {@link #loadMetadataTable(SparkSession, Table, MetadataTableType)}. + * @deprecated since 0.14.0, will be removed in 0.15.0; use {@link + * #loadMetadataTable(SparkSession, Table, MetadataTableType)}. */ @Deprecated - public static Dataset loadCatalogMetadataTable(SparkSession spark, Table table, MetadataTableType type) { + public static Dataset loadCatalogMetadataTable( + SparkSession spark, Table table, MetadataTableType type) { return loadMetadataTable(spark, table, type); } - public static Dataset loadMetadataTable(SparkSession spark, Table table, MetadataTableType type) { + public static Dataset loadMetadataTable( + SparkSession spark, Table table, MetadataTableType type) { return loadMetadataTable(spark, table, type, ImmutableMap.of()); } - public static Dataset loadMetadataTable(SparkSession spark, Table table, MetadataTableType type, - Map extraOptions) { - SparkTable metadataTable = new SparkTable(MetadataTableUtils.createMetadataTableInstance(table, type), false); + public static Dataset loadMetadataTable( + SparkSession spark, Table table, MetadataTableType type, Map extraOptions) { + SparkTable metadataTable = + new SparkTable(MetadataTableUtils.createMetadataTableInstance(table, type), false); CaseInsensitiveStringMap options = new CaseInsensitiveStringMap(extraOptions); - return Dataset.ofRows(spark, DataSourceV2Relation.create(metadataTable, Some.empty(), Some.empty(), options)); + return Dataset.ofRows( + spark, DataSourceV2Relation.create(metadataTable, Some.empty(), Some.empty(), options)); } - /** - * Class representing a table partition. - */ + /** Class representing a table partition. */ public static class SparkPartition implements Serializable { private final Map values; private final String uri; @@ -660,9 +753,9 @@ public boolean equals(Object o) { return false; } SparkPartition that = (SparkPartition) o; - return Objects.equal(values, that.values) && - Objects.equal(uri, that.uri) && - Objects.equal(format, that.format); + return Objects.equal(values, that.values) + && Objects.equal(uri, that.uri) + && Objects.equal(format, that.format); } @Override diff --git a/spark/v3.3/spark/src/main/java/org/apache/iceberg/spark/SparkTypeToType.java b/spark/v3.3/spark/src/main/java/org/apache/iceberg/spark/SparkTypeToType.java index f0b8b2a9762b..17499736fbeb 100644 --- a/spark/v3.3/spark/src/main/java/org/apache/iceberg/spark/SparkTypeToType.java +++ b/spark/v3.3/spark/src/main/java/org/apache/iceberg/spark/SparkTypeToType.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.spark; import java.util.List; @@ -70,7 +69,7 @@ public Type struct(StructType struct, List types) { List newFields = Lists.newArrayListWithExpectedSize(fields.length); boolean isRoot = root == struct; for (int i = 0; i < fields.length; i += 1) { - StructField field = fields[i]; + StructField field = fields[i]; Type type = types.get(i); int id; @@ -122,10 +121,9 @@ public Type atomic(DataType atomic) { if (atomic instanceof BooleanType) { return Types.BooleanType.get(); - } else if ( - atomic instanceof IntegerType || - atomic instanceof ShortType || - atomic instanceof ByteType) { + } else if (atomic instanceof IntegerType + || atomic instanceof ShortType + || atomic instanceof ByteType) { return Types.IntegerType.get(); } else if (atomic instanceof LongType) { @@ -137,10 +135,9 @@ public Type atomic(DataType atomic) { } else if (atomic instanceof DoubleType) { return Types.DoubleType.get(); - } else if ( - atomic instanceof StringType || - atomic instanceof CharType || - atomic instanceof VarcharType) { + } else if (atomic instanceof StringType + || atomic instanceof CharType + || atomic instanceof VarcharType) { return Types.StringType.get(); } else if (atomic instanceof DateType) { @@ -151,13 +148,11 @@ public Type atomic(DataType atomic) { } else if (atomic instanceof DecimalType) { return Types.DecimalType.of( - ((DecimalType) atomic).precision(), - ((DecimalType) atomic).scale()); + ((DecimalType) atomic).precision(), ((DecimalType) atomic).scale()); } else if (atomic instanceof BinaryType) { return Types.BinaryType.get(); } - throw new UnsupportedOperationException( - "Not a supported type: " + atomic.catalogString()); + throw new UnsupportedOperationException("Not a supported type: " + atomic.catalogString()); } } diff --git a/spark/v3.3/spark/src/main/java/org/apache/iceberg/spark/SparkTypeVisitor.java b/spark/v3.3/spark/src/main/java/org/apache/iceberg/spark/SparkTypeVisitor.java index 83b31940711e..1ef694263fa4 100644 --- a/spark/v3.3/spark/src/main/java/org/apache/iceberg/spark/SparkTypeVisitor.java +++ b/spark/v3.3/spark/src/main/java/org/apache/iceberg/spark/SparkTypeVisitor.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.spark; import java.util.List; @@ -35,26 +34,22 @@ static T visit(DataType type, SparkTypeVisitor visitor) { List fieldResults = Lists.newArrayListWithExpectedSize(fields.length); for (StructField field : fields) { - fieldResults.add(visitor.field( - field, - visit(field.dataType(), visitor))); + fieldResults.add(visitor.field(field, visit(field.dataType(), visitor))); } return visitor.struct((StructType) type, fieldResults); } else if (type instanceof MapType) { - return visitor.map((MapType) type, + return visitor.map( + (MapType) type, visit(((MapType) type).keyType(), visitor), visit(((MapType) type).valueType(), visitor)); } else if (type instanceof ArrayType) { - return visitor.array( - (ArrayType) type, - visit(((ArrayType) type).elementType(), visitor)); + return visitor.array((ArrayType) type, visit(((ArrayType) type).elementType(), visitor)); } else if (type instanceof UserDefinedType) { - throw new UnsupportedOperationException( - "User-defined types are not supported"); + throw new UnsupportedOperationException("User-defined types are not supported"); } else { return visitor.atomic(type); diff --git a/spark/v3.3/spark/src/main/java/org/apache/iceberg/spark/SparkUtil.java b/spark/v3.3/spark/src/main/java/org/apache/iceberg/spark/SparkUtil.java index 7754aa406123..950ed7bc87b8 100644 --- a/spark/v3.3/spark/src/main/java/org/apache/iceberg/spark/SparkUtil.java +++ b/spark/v3.3/spark/src/main/java/org/apache/iceberg/spark/SparkUtil.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark; import java.sql.Date; @@ -54,26 +53,33 @@ public class SparkUtil { - public static final String TIMESTAMP_WITHOUT_TIMEZONE_ERROR = String.format("Cannot handle timestamp without" + - " timezone fields in Spark. Spark does not natively support this type but if you would like to handle all" + - " timestamps as timestamp with timezone set '%s' to true. This will not change the underlying values stored" + - " but will change their displayed values in Spark. For more information please see" + - " https://docs.databricks.com/spark/latest/dataframes-datasets/dates-timestamps.html#ansi-sql-and" + - "-spark-sql-timestamps", SparkSQLProperties.HANDLE_TIMESTAMP_WITHOUT_TIMEZONE); + public static final String TIMESTAMP_WITHOUT_TIMEZONE_ERROR = + String.format( + "Cannot handle timestamp without" + + " timezone fields in Spark. Spark does not natively support this type but if you would like to handle all" + + " timestamps as timestamp with timezone set '%s' to true. This will not change the underlying values stored" + + " but will change their displayed values in Spark. For more information please see" + + " https://docs.databricks.com/spark/latest/dataframes-datasets/dates-timestamps.html#ansi-sql-and" + + "-spark-sql-timestamps", + SparkSQLProperties.HANDLE_TIMESTAMP_WITHOUT_TIMEZONE); private static final String SPARK_CATALOG_CONF_PREFIX = "spark.sql.catalog"; - // Format string used as the prefix for spark configuration keys to override hadoop configuration values - // for Iceberg tables from a given catalog. These keys can be specified as `spark.sql.catalog.$catalogName.hadoop.*`, - // similar to using `spark.hadoop.*` to override hadoop configurations globally for a given spark session. 
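// A minimal sketch of the catalog-scoped override described in the comment above. The catalog
// name "hanks_catalog" is hypothetical, and the namenode address is borrowed from the example in
// this file's javadoc; only the `spark.sql.catalog.<catalogName>.hadoop.*` key pattern itself
// comes from the code being reformatted here.
import org.apache.spark.sql.SparkSession;

public class CatalogHadoopConfOverrideExample {
  public static void main(String[] args) {
    SparkSession spark =
        SparkSession.builder()
            .appName("catalog-hadoop-conf-override")
            .master("local[*]") // local master only for trying the sketch out
            // applies only to tables loaded through the Iceberg catalog named "hanks_catalog"
            .config(
                "spark.sql.catalog.hanks_catalog.hadoop.fs.default.name",
                "hdfs://hanksnamenode:8020")
            // spark.hadoop.* would instead override the Hadoop configuration for the whole session
            .config("spark.hadoop.fs.default.name", "hdfs://hanksnamenode:8020")
            .getOrCreate();

    spark.stop();
  }
}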
- private static final String SPARK_CATALOG_HADOOP_CONF_OVERRIDE_FMT_STR = SPARK_CATALOG_CONF_PREFIX + ".%s.hadoop."; + // Format string used as the prefix for spark configuration keys to override hadoop configuration + // values + // for Iceberg tables from a given catalog. These keys can be specified as + // `spark.sql.catalog.$catalogName.hadoop.*`, + // similar to using `spark.hadoop.*` to override hadoop configurations globally for a given spark + // session. + private static final String SPARK_CATALOG_HADOOP_CONF_OVERRIDE_FMT_STR = + SPARK_CATALOG_CONF_PREFIX + ".%s.hadoop."; - private SparkUtil() { - } + private SparkUtil() {} public static FileIO serializableFileIO(Table table) { if (table.io() instanceof HadoopConfigurable) { // we need to use Spark's SerializableConfiguration to avoid issues with Kryo serialization - ((HadoopConfigurable) table.io()).serializeConfWith(conf -> new SerializableConfiguration(conf)::value); + ((HadoopConfigurable) table.io()) + .serializeConfWith(conf -> new SerializableConfiguration(conf)::value); } return table.io(); @@ -87,11 +93,12 @@ public static FileIO serializableFileIO(Table table) { */ public static void validatePartitionTransforms(PartitionSpec spec) { if (spec.fields().stream().anyMatch(field -> field.transform() instanceof UnknownTransform)) { - String unsupported = spec.fields().stream() - .map(PartitionField::transform) - .filter(transform -> transform instanceof UnknownTransform) - .map(Transform::toString) - .collect(Collectors.joining(", ")); + String unsupported = + spec.fields().stream() + .map(PartitionField::transform) + .filter(transform -> transform instanceof UnknownTransform) + .map(Transform::toString) + .collect(Collectors.joining(", ")); throw new UnsupportedOperationException( String.format("Cannot write using unsupported transforms: %s", unsupported)); @@ -99,18 +106,20 @@ public static void validatePartitionTransforms(PartitionSpec spec) { } /** - * A modified version of Spark's LookupCatalog.CatalogAndIdentifier.unapply - * Attempts to find the catalog and identifier a multipart identifier represents + * A modified version of Spark's LookupCatalog.CatalogAndIdentifier.unapply Attempts to find the + * catalog and identifier a multipart identifier represents + * * @param nameParts Multipart identifier representing a table * @return The CatalogPlugin and Identifier for the table */ - public static Pair catalogAndIdentifier(List nameParts, - Function catalogProvider, - BiFunction identiferProvider, - C currentCatalog, - String[] currentNamespace) { - Preconditions.checkArgument(!nameParts.isEmpty(), - "Cannot determine catalog and identifier from empty name"); + public static Pair catalogAndIdentifier( + List nameParts, + Function catalogProvider, + BiFunction identiferProvider, + C currentCatalog, + String[] currentNamespace) { + Preconditions.checkArgument( + !nameParts.isEmpty(), "Cannot determine catalog and identifier from empty name"); int lastElementIndex = nameParts.size() - 1; String name = nameParts.get(lastElementIndex); @@ -122,7 +131,7 @@ public static Pair catalogAndIdentifier(List nameParts, C catalog = catalogProvider.apply(nameParts.get(0)); if (catalog == null) { // The first element was not a valid catalog, treat it like part of the namespace - String[] namespace = nameParts.subList(0, lastElementIndex).toArray(new String[0]); + String[] namespace = nameParts.subList(0, lastElementIndex).toArray(new String[0]); return Pair.of(currentCatalog, identiferProvider.apply(namespace, name)); } else { // Assume 
the first element is a valid catalog @@ -134,6 +143,7 @@ public static Pair catalogAndIdentifier(List nameParts, /** * Responsible for checking if the table schema has a timestamp without timezone column + * * @param schema table schema to check if it contains a timestamp without timezone column * @return boolean indicating if the schema passed in has a timestamp field without a timezone */ @@ -143,15 +153,17 @@ public static boolean hasTimestampWithoutZone(Schema schema) { /** * Checks whether timestamp types for new tables should be stored with timezone info. - *

    - * The default value is false and all timestamp fields are stored as {@link Types.TimestampType#withZone()}. - * If enabled, all timestamp fields in new tables will be stored as {@link Types.TimestampType#withoutZone()}. + * + *

    The default value is false and all timestamp fields are stored as {@link + * Types.TimestampType#withZone()}. If enabled, all timestamp fields in new tables will be stored + * as {@link Types.TimestampType#withoutZone()}. * * @param sessionConf a Spark runtime config * @return true if timestamp types for new tables should be stored with timezone info */ public static boolean useTimestampWithoutZoneInNewTables(RuntimeConfig sessionConf) { - String sessionConfValue = sessionConf.get(SparkSQLProperties.USE_TIMESTAMP_WITHOUT_TIME_ZONE_IN_NEW_TABLES, null); + String sessionConfValue = + sessionConf.get(SparkSQLProperties.USE_TIMESTAMP_WITHOUT_TIME_ZONE_IN_NEW_TABLES, null); if (sessionConfValue != null) { return Boolean.parseBoolean(sessionConfValue); } @@ -159,32 +171,40 @@ public static boolean useTimestampWithoutZoneInNewTables(RuntimeConfig sessionCo } /** - * Pulls any Catalog specific overrides for the Hadoop conf from the current SparkSession, which can be - * set via `spark.sql.catalog.$catalogName.hadoop.*` + * Pulls any Catalog specific overrides for the Hadoop conf from the current SparkSession, which + * can be set via `spark.sql.catalog.$catalogName.hadoop.*` * - * Mirrors the override of hadoop configurations for a given spark session using `spark.hadoop.*`. + *

    Mirrors the override of hadoop configurations for a given spark session using + * `spark.hadoop.*`. * - * The SparkCatalog allows for hadoop configurations to be overridden per catalog, by setting + *

    The SparkCatalog allows for hadoop configurations to be overridden per catalog, by setting * them on the SQLConf, where the following will add the property "fs.default.name" with value - * "hdfs://hanksnamenode:8020" to the catalog's hadoop configuration. - * SparkSession.builder() - * .config(s"spark.sql.catalog.$catalogName.hadoop.fs.default.name", "hdfs://hanksnamenode:8020") - * .getOrCreate() + * "hdfs://hanksnamenode:8020" to the catalog's hadoop configuration. SparkSession.builder() + * .config(s"spark.sql.catalog.$catalogName.hadoop.fs.default.name", "hdfs://hanksnamenode:8020") + * .getOrCreate() + * * @param spark The current Spark session * @param catalogName Name of the catalog to find overrides for. - * @return the Hadoop Configuration that should be used for this catalog, with catalog specific overrides applied. + * @return the Hadoop Configuration that should be used for this catalog, with catalog specific + * overrides applied. */ public static Configuration hadoopConfCatalogOverrides(SparkSession spark, String catalogName) { // Find keys for the catalog intended to be hadoop configurations final String hadoopConfCatalogPrefix = hadoopConfPrefixForCatalog(catalogName); final Configuration conf = spark.sessionState().newHadoopConf(); - spark.sqlContext().conf().settings().forEach((k, v) -> { - // These checks are copied from `spark.sessionState().newHadoopConfWithOptions()`, which we - // avoid using to not have to convert back and forth between scala / java map types. - if (v != null && k != null && k.startsWith(hadoopConfCatalogPrefix)) { - conf.set(k.substring(hadoopConfCatalogPrefix.length()), v); - } - }); + spark + .sqlContext() + .conf() + .settings() + .forEach( + (k, v) -> { + // These checks are copied from `spark.sessionState().newHadoopConfWithOptions()`, + // which we + // avoid using to not have to convert back and forth between scala / java map types. + if (v != null && k != null && k.startsWith(hadoopConfCatalogPrefix)) { + conf.set(k.substring(hadoopConfCatalogPrefix.length()), v); + } + }); return conf; } @@ -196,12 +216,12 @@ private static String hadoopConfPrefixForCatalog(String catalogName) { * Get a List of Spark filter Expression. * * @param schema table schema - * @param filters filters in the format of a Map, where key is one of the table column name, - * and value is the specific value to be filtered on the column. + * @param filters filters in the format of a Map, where key is one of the table column name, and + * value is the specific value to be filtered on the column. * @return a List of filters in the format of Spark Expression. 
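 *
 * A minimal usage sketch; the column names and values below are hypothetical. Each map entry
 * becomes an equality predicate, and the string value is parsed according to the bound column's
 * Spark data type:
 *
 *   StructType schema =
 *       new StructType().add("dt", DataTypes.StringType).add("hour", DataTypes.IntegerType);
 *   Map<String, String> filters = ImmutableMap.of("dt", "2022-06-01", "hour", "12");
 *   List<Expression> expressions = SparkUtil.partitionMapToExpression(schema, filters);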
*/ - public static List partitionMapToExpression(StructType schema, - Map filters) { + public static List partitionMapToExpression( + StructType schema, Map filters) { List filterExpressions = Lists.newArrayList(); for (Map.Entry entry : filters.entrySet()) { try { @@ -210,38 +230,55 @@ public static List partitionMapToExpression(StructType schema, BoundReference ref = new BoundReference(index, dataType, true); switch (dataType.typeName()) { case "integer": - filterExpressions.add(new EqualTo(ref, - Literal.create(Integer.parseInt(entry.getValue()), DataTypes.IntegerType))); + filterExpressions.add( + new EqualTo( + ref, + Literal.create(Integer.parseInt(entry.getValue()), DataTypes.IntegerType))); break; case "string": - filterExpressions.add(new EqualTo(ref, Literal.create(entry.getValue(), DataTypes.StringType))); + filterExpressions.add( + new EqualTo(ref, Literal.create(entry.getValue(), DataTypes.StringType))); break; case "short": - filterExpressions.add(new EqualTo(ref, - Literal.create(Short.parseShort(entry.getValue()), DataTypes.ShortType))); + filterExpressions.add( + new EqualTo( + ref, Literal.create(Short.parseShort(entry.getValue()), DataTypes.ShortType))); break; case "long": - filterExpressions.add(new EqualTo(ref, - Literal.create(Long.parseLong(entry.getValue()), DataTypes.LongType))); + filterExpressions.add( + new EqualTo( + ref, Literal.create(Long.parseLong(entry.getValue()), DataTypes.LongType))); break; case "float": - filterExpressions.add(new EqualTo(ref, - Literal.create(Float.parseFloat(entry.getValue()), DataTypes.FloatType))); + filterExpressions.add( + new EqualTo( + ref, Literal.create(Float.parseFloat(entry.getValue()), DataTypes.FloatType))); break; case "double": - filterExpressions.add(new EqualTo(ref, - Literal.create(Double.parseDouble(entry.getValue()), DataTypes.DoubleType))); + filterExpressions.add( + new EqualTo( + ref, + Literal.create(Double.parseDouble(entry.getValue()), DataTypes.DoubleType))); break; case "date": - filterExpressions.add(new EqualTo(ref, - Literal.create(new Date(DateTime.parse(entry.getValue()).getMillis()), DataTypes.DateType))); + filterExpressions.add( + new EqualTo( + ref, + Literal.create( + new Date(DateTime.parse(entry.getValue()).getMillis()), + DataTypes.DateType))); break; case "timestamp": - filterExpressions.add(new EqualTo(ref, - Literal.create(new Timestamp(DateTime.parse(entry.getValue()).getMillis()), DataTypes.TimestampType))); + filterExpressions.add( + new EqualTo( + ref, + Literal.create( + new Timestamp(DateTime.parse(entry.getValue()).getMillis()), + DataTypes.TimestampType))); break; default: - throw new IllegalStateException("Unexpected data type in partition filters: " + dataType); + throw new IllegalStateException( + "Unexpected data type in partition filters: " + dataType); } } catch (IllegalArgumentException e) { // ignore if filter is not on table columns diff --git a/spark/v3.3/spark/src/main/java/org/apache/iceberg/spark/SparkValueConverter.java b/spark/v3.3/spark/src/main/java/org/apache/iceberg/spark/SparkValueConverter.java index b3e6b2f48887..5a5381099c76 100644 --- a/spark/v3.3/spark/src/main/java/org/apache/iceberg/spark/SparkValueConverter.java +++ b/spark/v3.3/spark/src/main/java/org/apache/iceberg/spark/SparkValueConverter.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.spark; import java.nio.ByteBuffer; @@ -34,13 +33,10 @@ import org.apache.spark.sql.Row; import org.apache.spark.sql.catalyst.util.DateTimeUtils; -/** - * A utility class that converts Spark values to Iceberg's internal representation. - */ +/** A utility class that converts Spark values to Iceberg's internal representation. */ public class SparkValueConverter { - private SparkValueConverter() { - } + private SparkValueConverter() {} public static Record convert(Schema schema, Row row) { return convert(schema.asStruct(), row); diff --git a/spark/v3.3/spark/src/main/java/org/apache/iceberg/spark/SparkWriteConf.java b/spark/v3.3/spark/src/main/java/org/apache/iceberg/spark/SparkWriteConf.java index 047a0b8169ed..e75830f820ff 100644 --- a/spark/v3.3/spark/src/main/java/org/apache/iceberg/spark/SparkWriteConf.java +++ b/spark/v3.3/spark/src/main/java/org/apache/iceberg/spark/SparkWriteConf.java @@ -16,9 +16,12 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark; +import static org.apache.iceberg.DistributionMode.HASH; +import static org.apache.iceberg.DistributionMode.NONE; +import static org.apache.iceberg.DistributionMode.RANGE; + import java.util.Locale; import java.util.Map; import org.apache.iceberg.DistributionMode; @@ -31,24 +34,23 @@ import org.apache.spark.sql.RuntimeConfig; import org.apache.spark.sql.SparkSession; -import static org.apache.iceberg.DistributionMode.HASH; -import static org.apache.iceberg.DistributionMode.NONE; -import static org.apache.iceberg.DistributionMode.RANGE; - /** * A class for common Iceberg configs for Spark writes. - *

    - * If a config is set at multiple levels, the following order of precedence is used (top to bottom): + * + *

    If a config is set at multiple levels, the following order of precedence is used (top to + * bottom): + * *

- *   <li>Write options
- *   <li>Session configuration
- *   <li>Table metadata
+ *   <li>Write options
+ *   <li>Session configuration
+ *   <li>Table metadata
    - * The most specific value is set in write options and takes precedence over all other configs. - * If no write option is provided, this class checks the session configuration for any overrides. - * If no applicable value is found in the session configuration, this class uses the table metadata. - *

    - * Note this class is NOT meant to be serialized and sent to executors. + * + * The most specific value is set in write options and takes precedence over all other configs. If + * no write option is provided, this class checks the session configuration for any overrides. If no + * applicable value is found in the session configuration, this class uses the table metadata. + * + *

    Note this class is NOT meant to be serialized and sent to executors. */ public class SparkWriteConf { @@ -65,7 +67,8 @@ public SparkWriteConf(SparkSession spark, Table table, Map write } public boolean checkNullability() { - return confParser.booleanConf() + return confParser + .booleanConf() .option(SparkWriteOptions.CHECK_NULLABILITY) .sessionConf(SparkSQLProperties.CHECK_NULLABILITY) .defaultValue(SparkSQLProperties.CHECK_NULLABILITY_DEFAULT) @@ -73,7 +76,8 @@ public boolean checkNullability() { } public boolean checkOrdering() { - return confParser.booleanConf() + return confParser + .booleanConf() .option(SparkWriteOptions.CHECK_ORDERING) .sessionConf(SparkSQLProperties.CHECK_ORDERING) .defaultValue(SparkSQLProperties.CHECK_ORDERING_DEFAULT) @@ -82,18 +86,20 @@ public boolean checkOrdering() { /** * Enables writing a timestamp with time zone as a timestamp without time zone. - *

    - * Generally, this is not safe as a timestamp without time zone is supposed to represent the wall-clock time, - * i.e. no matter the reader/writer timezone 3PM should always be read as 3PM, - * but a timestamp with time zone represents instant semantics, i.e. the timestamp - * is adjusted so that the corresponding time in the reader timezone is displayed. - *

    - * When set to false (default), an exception must be thrown if the table contains a timestamp without time zone. + * + *

    Generally, this is not safe as a timestamp without time zone is supposed to represent the + * wall-clock time, i.e. no matter the reader/writer timezone 3PM should always be read as 3PM, + * but a timestamp with time zone represents instant semantics, i.e. the timestamp is adjusted so + * that the corresponding time in the reader timezone is displayed. + * + *

    When set to false (default), an exception must be thrown if the table contains a timestamp + * without time zone. * * @return boolean indicating if writing timestamps without timezone is allowed */ public boolean handleTimestampWithoutZone() { - return confParser.booleanConf() + return confParser + .booleanConf() .option(SparkWriteOptions.HANDLE_TIMESTAMP_WITHOUT_TIMEZONE) .sessionConf(SparkSQLProperties.HANDLE_TIMESTAMP_WITHOUT_TIMEZONE) .defaultValue(SparkSQLProperties.HANDLE_TIMESTAMP_WITHOUT_TIMEZONE_DEFAULT) @@ -106,7 +112,8 @@ public String overwriteMode() { } public boolean wapEnabled() { - return confParser.booleanConf() + return confParser + .booleanConf() .tableProperty(TableProperties.WRITE_AUDIT_PUBLISH_ENABLED) .defaultValue(TableProperties.WRITE_AUDIT_PUBLISH_ENABLED_DEFAULT) .parse(); @@ -117,7 +124,8 @@ public String wapId() { } public boolean mergeSchema() { - return confParser.booleanConf() + return confParser + .booleanConf() .option(SparkWriteOptions.MERGE_SCHEMA) .option(SparkWriteOptions.SPARK_MERGE_SCHEMA) .defaultValue(SparkWriteOptions.MERGE_SCHEMA_DEFAULT) @@ -125,16 +133,19 @@ public boolean mergeSchema() { } public FileFormat dataFileFormat() { - String valueAsString = confParser.stringConf() - .option(SparkWriteOptions.WRITE_FORMAT) - .tableProperty(TableProperties.DEFAULT_FILE_FORMAT) - .defaultValue(TableProperties.DEFAULT_FILE_FORMAT_DEFAULT) - .parse(); + String valueAsString = + confParser + .stringConf() + .option(SparkWriteOptions.WRITE_FORMAT) + .tableProperty(TableProperties.DEFAULT_FILE_FORMAT) + .defaultValue(TableProperties.DEFAULT_FILE_FORMAT_DEFAULT) + .parse(); return FileFormat.valueOf(valueAsString.toUpperCase(Locale.ENGLISH)); } public long targetDataFileSize() { - return confParser.longConf() + return confParser + .longConf() .option(SparkWriteOptions.TARGET_FILE_SIZE_BYTES) .tableProperty(TableProperties.WRITE_TARGET_FILE_SIZE_BYTES) .defaultValue(TableProperties.WRITE_TARGET_FILE_SIZE_BYTES_DEFAULT) @@ -142,7 +153,8 @@ public long targetDataFileSize() { } public boolean fanoutWriterEnabled() { - return confParser.booleanConf() + return confParser + .booleanConf() .option(SparkWriteOptions.FANOUT_ENABLED) .tableProperty(TableProperties.SPARK_WRITE_PARTITIONED_FANOUT_ENABLED) .defaultValue(TableProperties.SPARK_WRITE_PARTITIONED_FANOUT_ENABLED_DEFAULT) @@ -150,15 +162,20 @@ public boolean fanoutWriterEnabled() { } public FileFormat deleteFileFormat() { - String valueAsString = confParser.stringConf() - .option(SparkWriteOptions.DELETE_FORMAT) - .tableProperty(TableProperties.DELETE_DEFAULT_FILE_FORMAT) - .parseOptional(); - return valueAsString != null ? FileFormat.valueOf(valueAsString.toUpperCase(Locale.ENGLISH)) : dataFileFormat(); + String valueAsString = + confParser + .stringConf() + .option(SparkWriteOptions.DELETE_FORMAT) + .tableProperty(TableProperties.DELETE_DEFAULT_FILE_FORMAT) + .parseOptional(); + return valueAsString != null + ? 
FileFormat.valueOf(valueAsString.toUpperCase(Locale.ENGLISH)) + : dataFileFormat(); } public long targetDeleteFileSize() { - return confParser.longConf() + return confParser + .longConf() .option(SparkWriteOptions.TARGET_DELETE_FILE_SIZE_BYTES) .tableProperty(TableProperties.DELETE_TARGET_FILE_SIZE_BYTES) .defaultValue(TableProperties.DELETE_TARGET_FILE_SIZE_BYTES_DEFAULT) @@ -168,26 +185,31 @@ public long targetDeleteFileSize() { public Map extraSnapshotMetadata() { Map extraSnapshotMetadata = Maps.newHashMap(); - writeOptions.forEach((key, value) -> { - if (key.startsWith(SnapshotSummary.EXTRA_METADATA_PREFIX)) { - extraSnapshotMetadata.put(key.substring(SnapshotSummary.EXTRA_METADATA_PREFIX.length()), value); - } - }); + writeOptions.forEach( + (key, value) -> { + if (key.startsWith(SnapshotSummary.EXTRA_METADATA_PREFIX)) { + extraSnapshotMetadata.put( + key.substring(SnapshotSummary.EXTRA_METADATA_PREFIX.length()), value); + } + }); return extraSnapshotMetadata; } public String rewrittenFileSetId() { - return confParser.stringConf() + return confParser + .stringConf() .option(SparkWriteOptions.REWRITTEN_FILE_SCAN_TASK_SET_ID) .parseOptional(); } public DistributionMode distributionMode() { - String modeName = confParser.stringConf() - .option(SparkWriteOptions.DISTRIBUTION_MODE) - .tableProperty(TableProperties.WRITE_DISTRIBUTION_MODE) - .parseOptional(); + String modeName = + confParser + .stringConf() + .option(SparkWriteOptions.DISTRIBUTION_MODE) + .tableProperty(TableProperties.WRITE_DISTRIBUTION_MODE) + .parseOptional(); if (modeName != null) { DistributionMode mode = DistributionMode.fromName(modeName); @@ -208,28 +230,34 @@ private DistributionMode adjustWriteDistributionMode(DistributionMode mode) { } public DistributionMode deleteDistributionMode() { - String deleteModeName = confParser.stringConf() - .option(SparkWriteOptions.DISTRIBUTION_MODE) - .tableProperty(TableProperties.DELETE_DISTRIBUTION_MODE) - .defaultValue(TableProperties.WRITE_DISTRIBUTION_MODE_HASH) - .parse(); + String deleteModeName = + confParser + .stringConf() + .option(SparkWriteOptions.DISTRIBUTION_MODE) + .tableProperty(TableProperties.DELETE_DISTRIBUTION_MODE) + .defaultValue(TableProperties.WRITE_DISTRIBUTION_MODE_HASH) + .parse(); return DistributionMode.fromName(deleteModeName); } public DistributionMode updateDistributionMode() { - String updateModeName = confParser.stringConf() - .option(SparkWriteOptions.DISTRIBUTION_MODE) - .tableProperty(TableProperties.UPDATE_DISTRIBUTION_MODE) - .defaultValue(TableProperties.WRITE_DISTRIBUTION_MODE_HASH) - .parse(); + String updateModeName = + confParser + .stringConf() + .option(SparkWriteOptions.DISTRIBUTION_MODE) + .tableProperty(TableProperties.UPDATE_DISTRIBUTION_MODE) + .defaultValue(TableProperties.WRITE_DISTRIBUTION_MODE_HASH) + .parse(); return DistributionMode.fromName(updateModeName); } public DistributionMode copyOnWriteMergeDistributionMode() { - String mergeModeName = confParser.stringConf() - .option(SparkWriteOptions.DISTRIBUTION_MODE) - .tableProperty(TableProperties.MERGE_DISTRIBUTION_MODE) - .parseOptional(); + String mergeModeName = + confParser + .stringConf() + .option(SparkWriteOptions.DISTRIBUTION_MODE) + .tableProperty(TableProperties.MERGE_DISTRIBUTION_MODE) + .parseOptional(); if (mergeModeName != null) { DistributionMode mergeMode = DistributionMode.fromName(mergeModeName); @@ -240,30 +268,33 @@ public DistributionMode copyOnWriteMergeDistributionMode() { } public DistributionMode positionDeltaMergeDistributionMode() { - String 
mergeModeName = confParser.stringConf() - .option(SparkWriteOptions.DISTRIBUTION_MODE) - .tableProperty(TableProperties.MERGE_DISTRIBUTION_MODE) - .parseOptional(); + String mergeModeName = + confParser + .stringConf() + .option(SparkWriteOptions.DISTRIBUTION_MODE) + .tableProperty(TableProperties.MERGE_DISTRIBUTION_MODE) + .parseOptional(); return mergeModeName != null ? DistributionMode.fromName(mergeModeName) : distributionMode(); } public boolean useTableDistributionAndOrdering() { - return confParser.booleanConf() + return confParser + .booleanConf() .option(SparkWriteOptions.USE_TABLE_DISTRIBUTION_AND_ORDERING) .defaultValue(SparkWriteOptions.USE_TABLE_DISTRIBUTION_AND_ORDERING_DEFAULT) .parse(); } public Long validateFromSnapshotId() { - return confParser.longConf() + return confParser + .longConf() .option(SparkWriteOptions.VALIDATE_FROM_SNAPSHOT_ID) .parseOptional(); } public IsolationLevel isolationLevel() { - String isolationLevelName = confParser.stringConf() - .option(SparkWriteOptions.ISOLATION_LEVEL) - .parseOptional(); + String isolationLevelName = + confParser.stringConf().option(SparkWriteOptions.ISOLATION_LEVEL).parseOptional(); return isolationLevelName != null ? IsolationLevel.fromName(isolationLevelName) : null; } } diff --git a/spark/v3.3/spark/src/main/java/org/apache/iceberg/spark/SparkWriteOptions.java b/spark/v3.3/spark/src/main/java/org/apache/iceberg/spark/SparkWriteOptions.java index 72de545f1298..6f4649642c57 100644 --- a/spark/v3.3/spark/src/main/java/org/apache/iceberg/spark/SparkWriteOptions.java +++ b/spark/v3.3/spark/src/main/java/org/apache/iceberg/spark/SparkWriteOptions.java @@ -16,16 +16,12 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark; -/** - * Spark DF write options - */ +/** Spark DF write options */ public class SparkWriteOptions { - private SparkWriteOptions() { - } + private SparkWriteOptions() {} // Fileformat for write operations(default: Table write.format.default ) public static final String WRITE_FORMAT = "write-format"; @@ -58,15 +54,18 @@ private SparkWriteOptions() { public static final String REWRITTEN_FILE_SCAN_TASK_SET_ID = "rewritten-file-scan-task-set-id"; // Controls whether to allow writing timestamps without zone info - public static final String HANDLE_TIMESTAMP_WITHOUT_TIMEZONE = "handle-timestamp-without-timezone"; + public static final String HANDLE_TIMESTAMP_WITHOUT_TIMEZONE = + "handle-timestamp-without-timezone"; public static final String OVERWRITE_MODE = "overwrite-mode"; // Overrides the default distribution mode for a write operation public static final String DISTRIBUTION_MODE = "distribution-mode"; - // Controls whether to take into account the table distribution and sort order during a write operation - public static final String USE_TABLE_DISTRIBUTION_AND_ORDERING = "use-table-distribution-and-ordering"; + // Controls whether to take into account the table distribution and sort order during a write + // operation + public static final String USE_TABLE_DISTRIBUTION_AND_ORDERING = + "use-table-distribution-and-ordering"; public static final boolean USE_TABLE_DISTRIBUTION_AND_ORDERING_DEFAULT = true; public static final String MERGE_SCHEMA = "merge-schema"; diff --git a/spark/v3.3/spark/src/main/java/org/apache/iceberg/spark/TypeToSparkType.java b/spark/v3.3/spark/src/main/java/org/apache/iceberg/spark/TypeToSparkType.java index b12d3e5030b7..4d4ec6782c72 100644 --- a/spark/v3.3/spark/src/main/java/org/apache/iceberg/spark/TypeToSparkType.java 
+++ b/spark/v3.3/spark/src/main/java/org/apache/iceberg/spark/TypeToSparkType.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark; import java.util.List; @@ -45,8 +44,7 @@ import org.apache.spark.sql.types.TimestampType$; class TypeToSparkType extends TypeUtil.SchemaVisitor { - TypeToSparkType() { - } + TypeToSparkType() {} public static final String METADATA_COL_ATTR_KEY = "__metadata_col"; @@ -105,8 +103,7 @@ public DataType primitive(Type.PrimitiveType primitive) { case DATE: return DateType$.MODULE$; case TIME: - throw new UnsupportedOperationException( - "Spark does not support time fields"); + throw new UnsupportedOperationException("Spark does not support time fields"); case TIMESTAMP: return TimestampType$.MODULE$; case STRING: @@ -129,9 +126,7 @@ public DataType primitive(Type.PrimitiveType primitive) { private Metadata fieldMetadata(int fieldId) { if (MetadataColumns.metadataFieldIds().contains(fieldId)) { - return new MetadataBuilder() - .putBoolean(METADATA_COL_ATTR_KEY, true) - .build(); + return new MetadataBuilder().putBoolean(METADATA_COL_ATTR_KEY, true).build(); } return Metadata.empty(); diff --git a/spark/v3.3/spark/src/main/java/org/apache/iceberg/spark/actions/BaseSnapshotUpdateSparkAction.java b/spark/v3.3/spark/src/main/java/org/apache/iceberg/spark/actions/BaseSnapshotUpdateSparkAction.java index 7ed17d75dd8a..77debe1e589d 100644 --- a/spark/v3.3/spark/src/main/java/org/apache/iceberg/spark/actions/BaseSnapshotUpdateSparkAction.java +++ b/spark/v3.3/spark/src/main/java/org/apache/iceberg/spark/actions/BaseSnapshotUpdateSparkAction.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.actions; import java.util.Map; diff --git a/spark/v3.3/spark/src/main/java/org/apache/iceberg/spark/actions/BaseSparkAction.java b/spark/v3.3/spark/src/main/java/org/apache/iceberg/spark/actions/BaseSparkAction.java index 8b9821f40c52..acfdeb326416 100644 --- a/spark/v3.3/spark/src/main/java/org/apache/iceberg/spark/actions/BaseSparkAction.java +++ b/spark/v3.3/spark/src/main/java/org/apache/iceberg/spark/actions/BaseSparkAction.java @@ -16,9 +16,12 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.actions; +import static org.apache.iceberg.MetadataTableType.ALL_MANIFESTS; +import static org.apache.spark.sql.functions.col; +import static org.apache.spark.sql.functions.lit; + import java.util.Iterator; import java.util.List; import java.util.Map; @@ -54,10 +57,6 @@ import org.apache.spark.sql.SparkSession; import scala.Tuple2; -import static org.apache.iceberg.MetadataTableType.ALL_MANIFESTS; -import static org.apache.spark.sql.functions.col; -import static org.apache.spark.sql.functions.lit; - abstract class BaseSparkAction { protected static final String CONTENT_FILE = "Content File"; @@ -127,21 +126,28 @@ protected Table newStaticTable(TableMetadata metadata, FileIO io) { // builds a DF of delete and data file path and type by reading all manifests protected Dataset buildValidContentFileWithTypeDF(Table table) { - Broadcast

    tableBroadcast = sparkContext.broadcast(SerializableTableWithSize.copyOf(table)); - - Dataset allManifests = loadMetadataTable(table, ALL_MANIFESTS) - .selectExpr( - "content", - "path", - "length", - "partition_spec_id as partitionSpecId", - "added_snapshot_id as addedSnapshotId") - .dropDuplicates("path") - .repartition(spark.sessionState().conf().numShufflePartitions()) // avoid adaptive execution combining tasks - .as(Encoders.bean(ManifestFileBean.class)); + Broadcast
    tableBroadcast = + sparkContext.broadcast(SerializableTableWithSize.copyOf(table)); + + Dataset allManifests = + loadMetadataTable(table, ALL_MANIFESTS) + .selectExpr( + "content", + "path", + "length", + "partition_spec_id as partitionSpecId", + "added_snapshot_id as addedSnapshotId") + .dropDuplicates("path") + .repartition( + spark + .sessionState() + .conf() + .numShufflePartitions()) // avoid adaptive execution combining tasks + .as(Encoders.bean(ManifestFileBean.class)); return allManifests - .flatMap(new ReadManifest(tableBroadcast), Encoders.tuple(Encoders.STRING(), Encoders.STRING())) + .flatMap( + new ReadManifest(tableBroadcast), Encoders.tuple(Encoders.STRING(), Encoders.STRING())) .toDF(FILE_PATH, FILE_TYPE); } @@ -160,16 +166,20 @@ protected Dataset buildManifestListDF(Table table) { } protected Dataset buildOtherMetadataFileDF(Table table) { - return buildOtherMetadataFileDF(table, false /* include all reachable previous metadata locations */); + return buildOtherMetadataFileDF( + table, false /* include all reachable previous metadata locations */); } protected Dataset buildAllReachableOtherMetadataFileDF(Table table) { - return buildOtherMetadataFileDF(table, true /* include all reachable previous metadata locations */); + return buildOtherMetadataFileDF( + table, true /* include all reachable previous metadata locations */); } - private Dataset buildOtherMetadataFileDF(Table table, boolean includePreviousMetadataLocations) { + private Dataset buildOtherMetadataFileDF( + Table table, boolean includePreviousMetadataLocations) { List otherMetadataFiles = Lists.newArrayList(); - otherMetadataFiles.addAll(ReachableFileUtil.metadataFileLocations(table, includePreviousMetadataLocations)); + otherMetadataFiles.addAll( + ReachableFileUtil.metadataFileLocations(table, includePreviousMetadataLocations)); otherMetadataFiles.add(ReachableFileUtil.versionHintLocation(table)); return spark.createDataset(otherMetadataFiles, Encoders.STRING()).toDF(FILE_PATH); } @@ -190,7 +200,8 @@ protected Dataset loadMetadataTable(Table table, MetadataTableType type) { return SparkTableUtil.loadMetadataTable(spark, table, type); } - private static class ReadManifest implements FlatMapFunction> { + private static class ReadManifest + implements FlatMapFunction> { private final Broadcast
    table; ReadManifest(Broadcast
    table) { @@ -205,7 +216,8 @@ public Iterator> call(ManifestFileBean manifest) { public CloseableIterator> entries(ManifestFileBean manifest) { FileIO io = table.getValue().io(); Map specs = table.getValue().specs(); - ImmutableList projection = ImmutableList.of(DataFile.FILE_PATH.name(), DataFile.CONTENT.name()); + ImmutableList projection = + ImmutableList.of(DataFile.FILE_PATH.name(), DataFile.CONTENT.name()); switch (manifest.content()) { case DATA: @@ -217,7 +229,8 @@ public CloseableIterator> entries(ManifestFileBean manife ManifestFiles.readDeleteManifest(manifest, io, specs).select(projection).iterator(), ReadManifest::contentFileWithType); default: - throw new IllegalArgumentException("Unsupported manifest content type:" + manifest.content()); + throw new IllegalArgumentException( + "Unsupported manifest content type:" + manifest.content()); } } diff --git a/spark/v3.3/spark/src/main/java/org/apache/iceberg/spark/actions/BaseTableCreationSparkAction.java b/spark/v3.3/spark/src/main/java/org/apache/iceberg/spark/actions/BaseTableCreationSparkAction.java index e3ddd7abc910..9639b205ac59 100644 --- a/spark/v3.3/spark/src/main/java/org/apache/iceberg/spark/actions/BaseTableCreationSparkAction.java +++ b/spark/v3.3/spark/src/main/java/org/apache/iceberg/spark/actions/BaseTableCreationSparkAction.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.actions; import java.util.List; @@ -50,7 +49,8 @@ import org.apache.spark.sql.types.StructType; abstract class BaseTableCreationSparkAction extends BaseSparkAction { - private static final Set ALLOWED_SOURCES = ImmutableSet.of("parquet", "avro", "orc", "hive"); + private static final Set ALLOWED_SOURCES = + ImmutableSet.of("parquet", "avro", "orc", "hive"); protected static final String LOCATION = "location"; protected static final String ICEBERG_METADATA_FOLDER = "metadata"; protected static final List EXCLUDED_PROPERTIES = @@ -66,7 +66,8 @@ abstract class BaseTableCreationSparkAction extends BaseSparkAction additionalProperties = Maps.newHashMap(); - BaseTableCreationSparkAction(SparkSession spark, CatalogPlugin sourceCatalog, Identifier sourceTableIdent) { + BaseTableCreationSparkAction( + SparkSession spark, CatalogPlugin sourceCatalog, Identifier sourceTableIdent) { super(spark); this.sourceCatalog = checkSourceCatalog(sourceCatalog); @@ -78,12 +79,13 @@ abstract class BaseTableCreationSparkAction extends BaseSparkAction additionalProperties() { private void validateSourceTable() { String sourceTableProvider = sourceCatalogTable.provider().get().toLowerCase(Locale.ROOT); - Preconditions.checkArgument(ALLOWED_SOURCES.contains(sourceTableProvider), - "Cannot create an Iceberg table from source provider: '%s'", sourceTableProvider); - Preconditions.checkArgument(!sourceCatalogTable.storage().locationUri().isEmpty(), + Preconditions.checkArgument( + ALLOWED_SOURCES.contains(sourceTableProvider), + "Cannot create an Iceberg table from source provider: '%s'", + sourceTableProvider); + Preconditions.checkArgument( + !sourceCatalogTable.storage().locationUri().isEmpty(), "Cannot create an Iceberg table from a source without an explicit location"); } protected StagingTableCatalog checkDestinationCatalog(CatalogPlugin catalog) { - Preconditions.checkArgument(catalog instanceof SparkSessionCatalog || catalog instanceof SparkCatalog, - "Cannot create Iceberg table in non-Iceberg Catalog. 
" + - "Catalog '%s' was of class '%s' but '%s' or '%s' are required", - catalog.name(), catalog.getClass().getName(), SparkSessionCatalog.class.getName(), + Preconditions.checkArgument( + catalog instanceof SparkSessionCatalog || catalog instanceof SparkCatalog, + "Cannot create Iceberg table in non-Iceberg Catalog. " + + "Catalog '%s' was of class '%s' but '%s' or '%s' are required", + catalog.name(), + catalog.getClass().getName(), + SparkSessionCatalog.class.getName(), SparkCatalog.class.getName()); return (StagingTableCatalog) catalog; @@ -145,11 +153,14 @@ protected StagedSparkTable stageDestTable() { Map props = destTableProps(); StructType schema = sourceTable.schema(); Transform[] partitioning = sourceTable.partitioning(); - return (StagedSparkTable) destCatalog().stageCreate(destTableIdent(), schema, partitioning, props); + return (StagedSparkTable) + destCatalog().stageCreate(destTableIdent(), schema, partitioning, props); } catch (org.apache.spark.sql.catalyst.analysis.NoSuchNamespaceException e) { - throw new NoSuchNamespaceException("Cannot create table %s as the namespace does not exist", destTableIdent()); + throw new NoSuchNamespaceException( + "Cannot create table %s as the namespace does not exist", destTableIdent()); } catch (org.apache.spark.sql.catalyst.analysis.TableAlreadyExistsException e) { - throw new AlreadyExistsException("Cannot create table %s as it already exists", destTableIdent()); + throw new AlreadyExistsException( + "Cannot create table %s as it already exists", destTableIdent()); } } @@ -162,7 +173,10 @@ protected void ensureNameMappingPresent(Table table) { } protected String getMetadataLocation(Table table) { - return table.properties().getOrDefault(TableProperties.WRITE_METADATA_LOCATION, - table.location() + "/" + ICEBERG_METADATA_FOLDER); + return table + .properties() + .getOrDefault( + TableProperties.WRITE_METADATA_LOCATION, + table.location() + "/" + ICEBERG_METADATA_FOLDER); } } diff --git a/spark/v3.3/spark/src/main/java/org/apache/iceberg/spark/actions/DeleteOrphanFilesSparkAction.java b/spark/v3.3/spark/src/main/java/org/apache/iceberg/spark/actions/DeleteOrphanFilesSparkAction.java index 72b4f8f43a36..dae08cfcbcab 100644 --- a/spark/v3.3/spark/src/main/java/org/apache/iceberg/spark/actions/DeleteOrphanFilesSparkAction.java +++ b/spark/v3.3/spark/src/main/java/org/apache/iceberg/spark/actions/DeleteOrphanFilesSparkAction.java @@ -16,9 +16,11 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.actions; +import static org.apache.iceberg.TableProperties.GC_ENABLED; +import static org.apache.iceberg.TableProperties.GC_ENABLED_DEFAULT; + import java.io.File; import java.io.IOException; import java.io.Serializable; @@ -68,50 +70,53 @@ import org.slf4j.Logger; import org.slf4j.LoggerFactory; -import static org.apache.iceberg.TableProperties.GC_ENABLED; -import static org.apache.iceberg.TableProperties.GC_ENABLED_DEFAULT; - /** - * An action that removes orphan metadata, data and delete files by listing a given location and comparing - * the actual files in that location with content and metadata files referenced by all valid snapshots. - * The location must be accessible for listing via the Hadoop {@link FileSystem}. - *

    - * By default, this action cleans up the table location returned by {@link Table#location()} and - * removes unreachable files that are older than 3 days using {@link Table#io()}. The behavior can be modified - * by passing a custom location to {@link #location} and a custom timestamp to {@link #olderThan(long)}. - * For example, someone might point this action to the data folder to clean up only orphan data files. - *

    - * Configure an alternative delete method using {@link #deleteWith(Consumer)}. - *

    - * For full control of the set of files being evaluated, use the {@link #compareToFileList(Dataset)} argument. This - * skips the directory listing - any files in the dataset provided which are not found in table metadata will - * be deleted, using the same {@link Table#location()} and {@link #olderThan(long)} filtering as above. - *

    - * Note: It is dangerous to call this action with a short retention interval as it might corrupt - * the state of the table if another operation is writing at the same time. + * An action that removes orphan metadata, data and delete files by listing a given location and + * comparing the actual files in that location with content and metadata files referenced by all + * valid snapshots. The location must be accessible for listing via the Hadoop {@link FileSystem}. + * + *

    By default, this action cleans up the table location returned by {@link Table#location()} and + * removes unreachable files that are older than 3 days using {@link Table#io()}. The behavior can + * be modified by passing a custom location to {@link #location} and a custom timestamp to {@link + * #olderThan(long)}. For example, someone might point this action to the data folder to clean up + * only orphan data files. + * + *

    Configure an alternative delete method using {@link #deleteWith(Consumer)}. + * + *

    For full control of the set of files being evaluated, use the {@link + * #compareToFileList(Dataset)} argument. This skips the directory listing - any files in the + * dataset provided which are not found in table metadata will be deleted, using the same {@link + * Table#location()} and {@link #olderThan(long)} filtering as above. + * + *

    Note: It is dangerous to call this action with a short retention interval as it might + * corrupt the state of the table if another operation is writing at the same time. */ -public class DeleteOrphanFilesSparkAction - extends BaseSparkAction implements DeleteOrphanFiles { +public class DeleteOrphanFilesSparkAction extends BaseSparkAction + implements DeleteOrphanFiles { private static final Logger LOG = LoggerFactory.getLogger(DeleteOrphanFilesSparkAction.class); - private static final UserDefinedFunction filenameUDF = functions.udf((String path) -> { - int lastIndex = path.lastIndexOf(File.separator); - if (lastIndex == -1) { - return path; - } else { - return path.substring(lastIndex + 1); - } - }, DataTypes.StringType); + private static final UserDefinedFunction filenameUDF = + functions.udf( + (String path) -> { + int lastIndex = path.lastIndexOf(File.separator); + if (lastIndex == -1) { + return path; + } else { + return path.substring(lastIndex + 1); + } + }, + DataTypes.StringType); private final SerializableConfiguration hadoopConf; private final int partitionDiscoveryParallelism; private final Table table; - private final Consumer defaultDelete = new Consumer() { - @Override - public void accept(String file) { - table.io().deleteFile(file); - } - }; + private final Consumer defaultDelete = + new Consumer() { + @Override + public void accept(String file) { + table.io().deleteFile(file); + } + }; private String location = null; private long olderThanTimestamp = System.currentTimeMillis() - TimeUnit.DAYS.toMillis(3); @@ -123,7 +128,8 @@ public void accept(String file) { super(spark); this.hadoopConf = new SerializableConfiguration(spark.sessionState().newHadoopConf()); - this.partitionDiscoveryParallelism = spark.sessionState().conf().parallelPartitionDiscoveryParallelism(); + this.partitionDiscoveryParallelism = + spark.sessionState().conf().parallelPartitionDiscoveryParallelism(); this.table = table; this.location = table.location(); @@ -204,23 +210,24 @@ private String jobDesc() { if (location != null) { options.add("location=" + location); } - return String.format("Deleting orphan files (%s) from %s", Joiner.on(',').join(options), table.name()); + return String.format( + "Deleting orphan files (%s) from %s", Joiner.on(',').join(options), table.name()); } private DeleteOrphanFiles.Result doExecute() { Dataset validContentFileDF = buildValidContentFileDF(table); Dataset validMetadataFileDF = buildValidMetadataFileDF(table); Dataset validFileDF = validContentFileDF.union(validMetadataFileDF); - Dataset actualFileDF = compareToFileList == null ? buildActualFileDF() : filteredCompareToFileList(); + Dataset actualFileDF = + compareToFileList == null ? 
buildActualFileDF() : filteredCompareToFileList(); Column actualFileName = filenameUDF.apply(actualFileDF.col(FILE_PATH)); Column validFileName = filenameUDF.apply(validFileDF.col(FILE_PATH)); Column nameEqual = actualFileName.equalTo(validFileName); Column actualContains = actualFileDF.col(FILE_PATH).contains(validFileDF.col(FILE_PATH)); Column joinCond = nameEqual.and(actualContains); - List orphanFiles = actualFileDF.join(validFileDF, joinCond, "leftanti") - .as(Encoders.STRING()) - .collectAsList(); + List orphanFiles = + actualFileDF.join(validFileDF, joinCond, "leftanti").as(Encoders.STRING()).collectAsList(); Tasks.foreach(orphanFiles) .noRetry() @@ -240,7 +247,8 @@ private Dataset buildActualFileDF() { PathFilter pathFilter = PartitionAwareHiddenPathFilter.forSpecs(table.specs()); // list at most 3 levels and only dirs that have less than 10 direct sub dirs on the driver - listDirRecursively(location, predicate, hadoopConf.value(), 3, 10, subDirs, pathFilter, matchingFiles); + listDirRecursively( + location, predicate, hadoopConf.value(), 3, 10, subDirs, pathFilter, matchingFiles); JavaRDD matchingFileRDD = sparkContext().parallelize(matchingFiles, 1); @@ -252,17 +260,22 @@ private Dataset buildActualFileDF() { JavaRDD subDirRDD = sparkContext().parallelize(subDirs, parallelism); Broadcast conf = sparkContext().broadcast(hadoopConf); - JavaRDD matchingLeafFileRDD = subDirRDD.mapPartitions( - listDirsRecursively(conf, olderThanTimestamp, pathFilter) - ); + JavaRDD matchingLeafFileRDD = + subDirRDD.mapPartitions(listDirsRecursively(conf, olderThanTimestamp, pathFilter)); JavaRDD completeMatchingFileRDD = matchingFileRDD.union(matchingLeafFileRDD); return spark().createDataset(completeMatchingFileRDD.rdd(), Encoders.STRING()).toDF(FILE_PATH); } private static void listDirRecursively( - String dir, Predicate predicate, Configuration conf, int maxDepth, - int maxDirectSubDirs, List remainingSubDirs, PathFilter pathFilter, List matchingFiles) { + String dir, + Predicate predicate, + Configuration conf, + int maxDepth, + int maxDirectSubDirs, + List remainingSubDirs, + PathFilter pathFilter, + List matchingFiles) { // stop listing whenever we reach the max depth if (maxDepth <= 0) { @@ -292,7 +305,14 @@ private static void listDirRecursively( for (String subDir : subDirs) { listDirRecursively( - subDir, predicate, conf, maxDepth - 1, maxDirectSubDirs, remainingSubDirs, pathFilter, matchingFiles); + subDir, + predicate, + conf, + maxDepth - 1, + maxDirectSubDirs, + remainingSubDirs, + pathFilter, + matchingFiles); } } catch (IOException e) { throw new RuntimeIOException(e); @@ -300,9 +320,7 @@ private static void listDirRecursively( } private static FlatMapFunction, String> listDirsRecursively( - Broadcast conf, - long olderThanTimestamp, - PathFilter pathFilter) { + Broadcast conf, long olderThanTimestamp, PathFilter pathFilter) { return dirs -> { List subDirs = Lists.newArrayList(); @@ -313,13 +331,22 @@ private static FlatMapFunction, String> listDirsRecursively( int maxDepth = 2000; int maxDirectSubDirs = Integer.MAX_VALUE; - dirs.forEachRemaining(dir -> { - listDirRecursively( - dir, predicate, conf.value().value(), maxDepth, maxDirectSubDirs, subDirs, pathFilter, files); - }); + dirs.forEachRemaining( + dir -> { + listDirRecursively( + dir, + predicate, + conf.value().value(), + maxDepth, + maxDirectSubDirs, + subDirs, + pathFilter, + files); + }); if (!subDirs.isEmpty()) { - throw new RuntimeException("Could not list subdirectories, reached maximum subdirectory depth: " + maxDepth); 
+ throw new RuntimeException( + "Could not list subdirectories, reached maximum subdirectory depth: " + maxDepth); } return files.iterator(); @@ -327,9 +354,9 @@ private static FlatMapFunction, String> listDirsRecursively( } /** - * A {@link PathFilter} that filters out hidden path, but does not filter out paths that would be marked - * as hidden by {@link HiddenPathFilter} due to a partition field that starts with one of the characters that - * indicate a hidden path. + * A {@link PathFilter} that filters out hidden path, but does not filter out paths that would be + * marked as hidden by {@link HiddenPathFilter} due to a partition field that starts with one of + * the characters that indicate a hidden path. */ @VisibleForTesting static class PartitionAwareHiddenPathFilter implements PathFilter, Serializable { @@ -342,7 +369,8 @@ static class PartitionAwareHiddenPathFilter implements PathFilter, Serializable @Override public boolean accept(Path path) { - boolean isHiddenPartitionPath = hiddenPathPartitionNames.stream().anyMatch(path.getName()::startsWith); + boolean isHiddenPartitionPath = + hiddenPathPartitionNames.stream().anyMatch(path.getName()::startsWith); return isHiddenPartitionPath || HiddenPathFilter.get().accept(path); } @@ -351,14 +379,20 @@ static PathFilter forSpecs(Map specs) { return HiddenPathFilter.get(); } - Set partitionNames = specs.values().stream() - .map(PartitionSpec::fields) - .flatMap(List::stream) - .filter(partitionField -> partitionField.name().startsWith("_") || partitionField.name().startsWith(".")) - .map(partitionField -> partitionField.name() + "=") - .collect(Collectors.toSet()); - - return partitionNames.isEmpty() ? HiddenPathFilter.get() : new PartitionAwareHiddenPathFilter(partitionNames); + Set partitionNames = + specs.values().stream() + .map(PartitionSpec::fields) + .flatMap(List::stream) + .filter( + partitionField -> + partitionField.name().startsWith("_") + || partitionField.name().startsWith(".")) + .map(partitionField -> partitionField.name() + "=") + .collect(Collectors.toSet()); + + return partitionNames.isEmpty() + ? HiddenPathFilter.get() + : new PartitionAwareHiddenPathFilter(partitionNames); } } } diff --git a/spark/v3.3/spark/src/main/java/org/apache/iceberg/spark/actions/DeleteReachableFilesSparkAction.java b/spark/v3.3/spark/src/main/java/org/apache/iceberg/spark/actions/DeleteReachableFilesSparkAction.java index e840d90c2c3a..a9828d2c7894 100644 --- a/spark/v3.3/spark/src/main/java/org/apache/iceberg/spark/actions/DeleteReachableFilesSparkAction.java +++ b/spark/v3.3/spark/src/main/java/org/apache/iceberg/spark/actions/DeleteReachableFilesSparkAction.java @@ -16,9 +16,11 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.actions; +import static org.apache.iceberg.TableProperties.GC_ENABLED; +import static org.apache.iceberg.TableProperties.GC_ENABLED_DEFAULT; + import java.util.Iterator; import java.util.concurrent.ExecutorService; import java.util.concurrent.atomic.AtomicLong; @@ -42,12 +44,9 @@ import org.slf4j.Logger; import org.slf4j.LoggerFactory; -import static org.apache.iceberg.TableProperties.GC_ENABLED; -import static org.apache.iceberg.TableProperties.GC_ENABLED_DEFAULT; - /** - * An implementation of {@link DeleteReachableFiles} that uses metadata tables in Spark - * to determine which files should be deleted. + * An implementation of {@link DeleteReachableFiles} that uses metadata tables in Spark to determine + * which files should be deleted. 
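 *
 * <p>Roughly, a caller is expected to invoke it through {@link SparkActions}, along the lines of
 * the sketch below; the metadata file location and the {@code FileIO} instance are placeholders:
 *
 *   DeleteReachableFiles.Result result =
 *       SparkActions.get()
 *           .deleteReachableFiles(metadataFileLocation)
 *           .io(fileIO)
 *           .execute();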
*/ @SuppressWarnings("UnnecessaryAnonymousClass") public class DeleteReachableFilesSparkAction @@ -59,12 +58,13 @@ public class DeleteReachableFilesSparkAction private static final Logger LOG = LoggerFactory.getLogger(DeleteReachableFilesSparkAction.class); private final String metadataFileLocation; - private final Consumer defaultDelete = new Consumer() { - @Override - public void accept(String file) { - io.deleteFile(file); - } - }; + private final Consumer defaultDelete = + new Consumer() { + @Override + public void accept(String file) { + io.deleteFile(file); + } + }; private Consumer deleteFunc = defaultDelete; private ExecutorService deleteExecutorService = null; @@ -115,7 +115,8 @@ private Result doExecute() { Dataset reachableFileDF = buildReachableFileDF(metadata).distinct(); - boolean streamResults = PropertyUtil.propertyAsBoolean(options(), STREAM_RESULTS, STREAM_RESULTS_DEFAULT); + boolean streamResults = + PropertyUtil.propertyAsBoolean(options(), STREAM_RESULTS, STREAM_RESULTS_DEFAULT); if (streamResults) { return deleteFiles(reachableFileDF.toLocalIterator()); } else { @@ -144,40 +145,45 @@ private BaseDeleteReachableFilesActionResult deleteFiles(Iterator deleted) AtomicLong otherFilesCount = new AtomicLong(0L); Tasks.foreach(deleted) - .retry(3).stopRetryOn(NotFoundException.class).suppressFailureWhenFinished() + .retry(3) + .stopRetryOn(NotFoundException.class) + .suppressFailureWhenFinished() .executeWith(deleteExecutorService) - .onFailure((fileInfo, exc) -> { - String file = fileInfo.getString(0); - String type = fileInfo.getString(1); - LOG.warn("Delete failed for {}: {}", type, file, exc); - }) - .run(fileInfo -> { - String file = fileInfo.getString(0); - String type = fileInfo.getString(1); - deleteFunc.accept(file); - switch (type) { - case CONTENT_FILE: - dataFileCount.incrementAndGet(); - LOG.trace("Deleted Content File: {}", file); - break; - case MANIFEST: - manifestCount.incrementAndGet(); - LOG.debug("Deleted Manifest: {}", file); - break; - case MANIFEST_LIST: - manifestListCount.incrementAndGet(); - LOG.debug("Deleted Manifest List: {}", file); - break; - case OTHERS: - otherFilesCount.incrementAndGet(); - LOG.debug("Others: {}", file); - break; - } - }); - - long filesCount = dataFileCount.get() + manifestCount.get() + manifestListCount.get() + otherFilesCount.get(); + .onFailure( + (fileInfo, exc) -> { + String file = fileInfo.getString(0); + String type = fileInfo.getString(1); + LOG.warn("Delete failed for {}: {}", type, file, exc); + }) + .run( + fileInfo -> { + String file = fileInfo.getString(0); + String type = fileInfo.getString(1); + deleteFunc.accept(file); + switch (type) { + case CONTENT_FILE: + dataFileCount.incrementAndGet(); + LOG.trace("Deleted Content File: {}", file); + break; + case MANIFEST: + manifestCount.incrementAndGet(); + LOG.debug("Deleted Manifest: {}", file); + break; + case MANIFEST_LIST: + manifestListCount.incrementAndGet(); + LOG.debug("Deleted Manifest List: {}", file); + break; + case OTHERS: + otherFilesCount.incrementAndGet(); + LOG.debug("Others: {}", file); + break; + } + }); + + long filesCount = + dataFileCount.get() + manifestCount.get() + manifestListCount.get() + otherFilesCount.get(); LOG.info("Total files removed: {}", filesCount); - return new BaseDeleteReachableFilesActionResult(dataFileCount.get(), manifestCount.get(), manifestListCount.get(), - otherFilesCount.get()); + return new BaseDeleteReachableFilesActionResult( + dataFileCount.get(), manifestCount.get(), manifestListCount.get(), 
otherFilesCount.get()); } } diff --git a/spark/v3.3/spark/src/main/java/org/apache/iceberg/spark/actions/ExpireSnapshotsSparkAction.java b/spark/v3.3/spark/src/main/java/org/apache/iceberg/spark/actions/ExpireSnapshotsSparkAction.java index f3fc7bdc1469..c9e5f7cca785 100644 --- a/spark/v3.3/spark/src/main/java/org/apache/iceberg/spark/actions/ExpireSnapshotsSparkAction.java +++ b/spark/v3.3/spark/src/main/java/org/apache/iceberg/spark/actions/ExpireSnapshotsSparkAction.java @@ -16,9 +16,11 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.actions; +import static org.apache.iceberg.TableProperties.GC_ENABLED; +import static org.apache.iceberg.TableProperties.GC_ENABLED_DEFAULT; + import java.util.Iterator; import java.util.List; import java.util.Set; @@ -47,26 +49,24 @@ import org.slf4j.Logger; import org.slf4j.LoggerFactory; -import static org.apache.iceberg.TableProperties.GC_ENABLED; -import static org.apache.iceberg.TableProperties.GC_ENABLED_DEFAULT; - /** - * An action that performs the same operation as {@link org.apache.iceberg.ExpireSnapshots} but uses Spark - * to determine the delta in files between the pre and post-expiration table metadata. All of the same - * restrictions of {@link org.apache.iceberg.ExpireSnapshots} also apply to this action. - *

- * This action first leverages {@link org.apache.iceberg.ExpireSnapshots} to expire snapshots and then
- * uses metadata tables to find files that can be safely deleted. This is done by anti-joining two Datasets
- * that contain all manifest and content files before and after the expiration. The snapshot expiration
- * will be fully committed before any deletes are issued.
- * <p>
- * This operation performs a shuffle so the parallelism can be controlled through 'spark.sql.shuffle.partitions'.
- * <p>
- * Deletes are still performed locally after retrieving the results from the Spark executors.
+ * An action that performs the same operation as {@link org.apache.iceberg.ExpireSnapshots} but uses
+ * Spark to determine the delta in files between the pre and post-expiration table metadata. All of
+ * the same restrictions of {@link org.apache.iceberg.ExpireSnapshots} also apply to this action.
+ *
+ * <p>This action first leverages {@link org.apache.iceberg.ExpireSnapshots} to expire snapshots and
+ * then uses metadata tables to find files that can be safely deleted. This is done by anti-joining
+ * two Datasets that contain all manifest and content files before and after the expiration. The
+ * snapshot expiration will be fully committed before any deletes are issued.
+ *
+ * <p>This operation performs a shuffle so the parallelism can be controlled through
+ * 'spark.sql.shuffle.partitions'.
+ *
+ * <p>
    Deletes are still performed locally after retrieving the results from the Spark executors. */ @SuppressWarnings("UnnecessaryAnonymousClass") -public class ExpireSnapshotsSparkAction - extends BaseSparkAction implements ExpireSnapshots { +public class ExpireSnapshotsSparkAction extends BaseSparkAction + implements ExpireSnapshots { public static final String STREAM_RESULTS = "stream-results"; public static final boolean STREAM_RESULTS_DEFAULT = false; @@ -75,12 +75,13 @@ public class ExpireSnapshotsSparkAction private final Table table; private final TableOperations ops; - private final Consumer defaultDelete = new Consumer() { - @Override - public void accept(String file) { - ops.io().deleteFile(file); - } - }; + private final Consumer defaultDelete = + new Consumer() { + @Override + public void accept(String file) { + ops.io().deleteFile(file); + } + }; private final Set expiredSnapshotIds = Sets.newHashSet(); private Long expireOlderThanValue = null; @@ -124,8 +125,10 @@ public ExpireSnapshotsSparkAction expireOlderThan(long timestampMillis) { @Override public ExpireSnapshotsSparkAction retainLast(int numSnapshots) { - Preconditions.checkArgument(1 <= numSnapshots, - "Number of snapshots to retain must be at least 1, cannot be: %s", numSnapshots); + Preconditions.checkArgument( + 1 <= numSnapshots, + "Number of snapshots to retain must be at least 1, cannot be: %s", + numSnapshots); this.retainLastValue = numSnapshots; return this; } @@ -138,10 +141,11 @@ public ExpireSnapshotsSparkAction deleteWith(Consumer newDeleteFunc) { /** * Expires snapshots and commits the changes to the table, returning a Dataset of files to delete. - *

- * This does not delete data files. To delete data files, run {@link #execute()}.
- * <p>
- * This may be called before or after {@link #execute()} is called to return the expired file list.
+ *
+ * <p>This does not delete data files. To delete data files, run {@link #execute()}.
+ *
+ * <p>
    This may be called before or after {@link #execute()} is called to return the expired file + * list. * * @return a Dataset of files that are no longer referenced by the table */ @@ -151,7 +155,8 @@ public Dataset expire() { Dataset originalFiles = buildValidFileDF(ops.current()); // perform expiration - org.apache.iceberg.ExpireSnapshots expireSnapshots = table.expireSnapshots().cleanExpiredFiles(false); + org.apache.iceberg.ExpireSnapshots expireSnapshots = + table.expireSnapshots().cleanExpiredFiles(false); for (long id : expiredSnapshotIds) { expireSnapshots = expireSnapshots.expireSnapshotId(id); } @@ -196,17 +201,20 @@ private String jobDesc() { if (!expiredSnapshotIds.isEmpty()) { Long first = expiredSnapshotIds.stream().findFirst().get(); if (expiredSnapshotIds.size() > 1) { - options.add(String.format("snapshot_ids: %s (%s more...)", first, expiredSnapshotIds.size() - 1)); + options.add( + String.format("snapshot_ids: %s (%s more...)", first, expiredSnapshotIds.size() - 1)); } else { options.add(String.format("snapshot_id: %s", first)); } } - return String.format("Expiring snapshots (%s) in %s", Joiner.on(',').join(options), table.name()); + return String.format( + "Expiring snapshots (%s) in %s", Joiner.on(',').join(options), table.name()); } private ExpireSnapshots.Result doExecute() { - boolean streamResults = PropertyUtil.propertyAsBoolean(options(), STREAM_RESULTS, STREAM_RESULTS_DEFAULT); + boolean streamResults = + PropertyUtil.propertyAsBoolean(options(), STREAM_RESULTS, STREAM_RESULTS_DEFAULT); if (streamResults) { return deleteFiles(expire().toLocalIterator()); } else { @@ -235,42 +243,52 @@ private BaseExpireSnapshotsActionResult deleteFiles(Iterator expired) { AtomicLong manifestListCount = new AtomicLong(0L); Tasks.foreach(expired) - .retry(3).stopRetryOn(NotFoundException.class).suppressFailureWhenFinished() + .retry(3) + .stopRetryOn(NotFoundException.class) + .suppressFailureWhenFinished() .executeWith(deleteExecutorService) - .onFailure((fileInfo, exc) -> { - String file = fileInfo.getString(0); - String type = fileInfo.getString(1); - LOG.warn("Delete failed for {}: {}", type, file, exc); - }) - .run(fileInfo -> { - String file = fileInfo.getString(0); - String type = fileInfo.getString(1); - deleteFunc.accept(file); - - if (FileContent.DATA.name().equalsIgnoreCase(type)) { - dataFileCount.incrementAndGet(); - LOG.trace("Deleted Data File: {}", file); - } else if (FileContent.POSITION_DELETES.name().equalsIgnoreCase(type)) { - posDeleteFileCount.incrementAndGet(); - LOG.trace("Deleted Positional Delete File: {}", file); - } else if (FileContent.EQUALITY_DELETES.name().equalsIgnoreCase(type)) { - eqDeleteFileCount.incrementAndGet(); - LOG.trace("Deleted Equality Delete File: {}", file); - } else if (MANIFEST.equals(type)) { - manifestCount.incrementAndGet(); - LOG.debug("Deleted Manifest: {}", file); - } else if (MANIFEST_LIST.equalsIgnoreCase(type)) { - manifestListCount.incrementAndGet(); - LOG.debug("Deleted Manifest List: {}", file); - } else { - throw new ValidationException("Illegal file type: %s", type); - } - }); - - long contentFileCount = dataFileCount.get() + posDeleteFileCount.get() + eqDeleteFileCount.get(); - LOG.info("Deleted {} total files", contentFileCount + manifestCount.get() + manifestListCount.get()); - - return new BaseExpireSnapshotsActionResult(dataFileCount.get(), posDeleteFileCount.get(), - eqDeleteFileCount.get(), manifestCount.get(), manifestListCount.get()); + .onFailure( + (fileInfo, exc) -> { + String file = 
fileInfo.getString(0); + String type = fileInfo.getString(1); + LOG.warn("Delete failed for {}: {}", type, file, exc); + }) + .run( + fileInfo -> { + String file = fileInfo.getString(0); + String type = fileInfo.getString(1); + deleteFunc.accept(file); + + if (FileContent.DATA.name().equalsIgnoreCase(type)) { + dataFileCount.incrementAndGet(); + LOG.trace("Deleted Data File: {}", file); + } else if (FileContent.POSITION_DELETES.name().equalsIgnoreCase(type)) { + posDeleteFileCount.incrementAndGet(); + LOG.trace("Deleted Positional Delete File: {}", file); + } else if (FileContent.EQUALITY_DELETES.name().equalsIgnoreCase(type)) { + eqDeleteFileCount.incrementAndGet(); + LOG.trace("Deleted Equality Delete File: {}", file); + } else if (MANIFEST.equals(type)) { + manifestCount.incrementAndGet(); + LOG.debug("Deleted Manifest: {}", file); + } else if (MANIFEST_LIST.equalsIgnoreCase(type)) { + manifestListCount.incrementAndGet(); + LOG.debug("Deleted Manifest List: {}", file); + } else { + throw new ValidationException("Illegal file type: %s", type); + } + }); + + long contentFileCount = + dataFileCount.get() + posDeleteFileCount.get() + eqDeleteFileCount.get(); + LOG.info( + "Deleted {} total files", contentFileCount + manifestCount.get() + manifestListCount.get()); + + return new BaseExpireSnapshotsActionResult( + dataFileCount.get(), + posDeleteFileCount.get(), + eqDeleteFileCount.get(), + manifestCount.get(), + manifestListCount.get()); } } diff --git a/spark/v3.3/spark/src/main/java/org/apache/iceberg/spark/actions/ManifestFileBean.java b/spark/v3.3/spark/src/main/java/org/apache/iceberg/spark/actions/ManifestFileBean.java index 269130496dc9..1f82eabc6b6c 100644 --- a/spark/v3.3/spark/src/main/java/org/apache/iceberg/spark/actions/ManifestFileBean.java +++ b/spark/v3.3/spark/src/main/java/org/apache/iceberg/spark/actions/ManifestFileBean.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.actions; import java.nio.ByteBuffer; diff --git a/spark/v3.3/spark/src/main/java/org/apache/iceberg/spark/actions/MigrateTableSparkAction.java b/spark/v3.3/spark/src/main/java/org/apache/iceberg/spark/actions/MigrateTableSparkAction.java index 7146bffcbe73..e5716ea15320 100644 --- a/spark/v3.3/spark/src/main/java/org/apache/iceberg/spark/actions/MigrateTableSparkAction.java +++ b/spark/v3.3/spark/src/main/java/org/apache/iceberg/spark/actions/MigrateTableSparkAction.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.actions; import java.util.Map; @@ -45,13 +44,11 @@ import scala.collection.JavaConverters; /** - * Takes a Spark table in the source catalog and attempts to transform it into an Iceberg - * table in the same location with the same identifier. Once complete the identifier which - * previously referred to a non-Iceberg table will refer to the newly migrated Iceberg - * table. + * Takes a Spark table in the source catalog and attempts to transform it into an Iceberg table in + * the same location with the same identifier. Once complete the identifier which previously + * referred to a non-Iceberg table will refer to the newly migrated Iceberg table. 
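A brief, hedged usage sketch for the migrate action described above, assuming it is created through SparkActions.get().migrateTable(...) as shown later in this change; the table identifier and the result accessor are illustrative assumptions:

    MigrateTable.Result result =
        SparkActions.get()
            .migrateTable("spark_catalog.db.events") // hypothetical session-catalog table
            .execute();
    // assumes the result exposes migratedDataFilesCount(), mirroring the count built in doExecute()
    long migratedFiles = result.migratedDataFilesCount();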
*/ -public class MigrateTableSparkAction - extends BaseTableCreationSparkAction +public class MigrateTableSparkAction extends BaseTableCreationSparkAction implements MigrateTable { private static final Logger LOG = LoggerFactory.getLogger(MigrateTableSparkAction.class); @@ -61,7 +58,8 @@ public class MigrateTableSparkAction private final Identifier destTableIdent; private final Identifier backupIdent; - MigrateTableSparkAction(SparkSession spark, CatalogPlugin sourceCatalog, Identifier sourceTableIdent) { + MigrateTableSparkAction( + SparkSession spark, CatalogPlugin sourceCatalog, Identifier sourceTableIdent) { super(spark, sourceCatalog, sourceTableIdent); this.destCatalog = checkDestinationCatalog(sourceCatalog); this.destTableIdent = sourceTableIdent; @@ -132,7 +130,8 @@ private MigrateTable.Result doExecute() { threw = false; } finally { if (threw) { - LOG.error("Failed to perform the migration, aborting table creation and restoring the original table"); + LOG.error( + "Failed to perform the migration, aborting table creation and restoring the original table"); restoreSourceTable(); @@ -147,8 +146,12 @@ private MigrateTable.Result doExecute() { } Snapshot snapshot = icebergTable.currentSnapshot(); - long migratedDataFilesCount = Long.parseLong(snapshot.summary().get(SnapshotSummary.TOTAL_DATA_FILES_PROP)); - LOG.info("Successfully loaded Iceberg metadata for {} files to {}", migratedDataFilesCount, destTableIdent()); + long migratedDataFilesCount = + Long.parseLong(snapshot.summary().get(SnapshotSummary.TOTAL_DATA_FILES_PROP)); + LOG.info( + "Successfully loaded Iceberg metadata for {} files to {}", + migratedDataFilesCount, + destTableIdent()); return new BaseMigrateTableActionResult(migratedDataFilesCount); } @@ -176,9 +179,11 @@ protected Map destTableProps() { @Override protected TableCatalog checkSourceCatalog(CatalogPlugin catalog) { // currently the import code relies on being able to look up the table in the session catalog - Preconditions.checkArgument(catalog instanceof SparkSessionCatalog, + Preconditions.checkArgument( + catalog instanceof SparkSessionCatalog, "Cannot migrate a table from a non-Iceberg Spark Session Catalog. Found %s of class %s as the source catalog.", - catalog.name(), catalog.getClass().getName()); + catalog.name(), + catalog.getClass().getName()); return (TableCatalog) catalog; } @@ -204,11 +209,15 @@ private void restoreSourceTable() { destCatalog().renameTable(backupIdent, sourceTableIdent()); } catch (org.apache.spark.sql.catalyst.analysis.NoSuchTableException e) { - LOG.error("Cannot restore the original table, the backup table {} cannot be found", backupIdent, e); + LOG.error( + "Cannot restore the original table, the backup table {} cannot be found", backupIdent, e); } catch (org.apache.spark.sql.catalyst.analysis.TableAlreadyExistsException e) { - LOG.error("Cannot restore the original table, a table with the original name exists. " + - "Use the backup table {} to restore the original table manually.", backupIdent, e); + LOG.error( + "Cannot restore the original table, a table with the original name exists. 
" + + "Use the backup table {} to restore the original table manually.", + backupIdent, + e); } } } diff --git a/spark/v3.3/spark/src/main/java/org/apache/iceberg/spark/actions/RewriteDataFilesSparkAction.java b/spark/v3.3/spark/src/main/java/org/apache/iceberg/spark/actions/RewriteDataFilesSparkAction.java index bb2cbd83298f..f754fcb4c74a 100644 --- a/spark/v3.3/spark/src/main/java/org/apache/iceberg/spark/actions/RewriteDataFilesSparkAction.java +++ b/spark/v3.3/spark/src/main/java/org/apache/iceberg/spark/actions/RewriteDataFilesSparkAction.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.actions; import java.io.IOException; @@ -73,19 +72,18 @@ import org.slf4j.LoggerFactory; public class RewriteDataFilesSparkAction - extends BaseSnapshotUpdateSparkAction - implements RewriteDataFiles { + extends BaseSnapshotUpdateSparkAction implements RewriteDataFiles { private static final Logger LOG = LoggerFactory.getLogger(RewriteDataFilesSparkAction.class); - private static final Set VALID_OPTIONS = ImmutableSet.of( - MAX_CONCURRENT_FILE_GROUP_REWRITES, - MAX_FILE_GROUP_SIZE_BYTES, - PARTIAL_PROGRESS_ENABLED, - PARTIAL_PROGRESS_MAX_COMMITS, - TARGET_FILE_SIZE_BYTES, - USE_STARTING_SEQUENCE_NUMBER, - REWRITE_JOB_ORDER - ); + private static final Set VALID_OPTIONS = + ImmutableSet.of( + MAX_CONCURRENT_FILE_GROUP_REWRITES, + MAX_FILE_GROUP_SIZE_BYTES, + PARTIAL_PROGRESS_ENABLED, + PARTIAL_PROGRESS_MAX_COMMITS, + TARGET_FILE_SIZE_BYTES, + USE_STARTING_SEQUENCE_NUMBER, + REWRITE_JOB_ORDER); private final Table table; @@ -109,32 +107,40 @@ protected RewriteDataFilesSparkAction self() { @Override public RewriteDataFilesSparkAction binPack() { - Preconditions.checkArgument(this.strategy == null, - "Cannot set strategy to binpack, it has already been set", this.strategy); + Preconditions.checkArgument( + this.strategy == null, + "Cannot set strategy to binpack, it has already been set", + this.strategy); this.strategy = binPackStrategy(); return this; } @Override public RewriteDataFilesSparkAction sort(SortOrder sortOrder) { - Preconditions.checkArgument(this.strategy == null, - "Cannot set strategy to sort, it has already been set to %s", this.strategy); + Preconditions.checkArgument( + this.strategy == null, + "Cannot set strategy to sort, it has already been set to %s", + this.strategy); this.strategy = sortStrategy().sortOrder(sortOrder); return this; } @Override public RewriteDataFilesSparkAction sort() { - Preconditions.checkArgument(this.strategy == null, - "Cannot set strategy to sort, it has already been set to %s", this.strategy); + Preconditions.checkArgument( + this.strategy == null, + "Cannot set strategy to sort, it has already been set to %s", + this.strategy); this.strategy = sortStrategy(); return this; } @Override public RewriteDataFilesSparkAction zOrder(String... 
columnNames) { - Preconditions.checkArgument(this.strategy == null, - "Cannot set strategy to zorder, it has already been set to %s", this.strategy); + Preconditions.checkArgument( + this.strategy == null, + "Cannot set strategy to zorder, it has already been set to %s", + this.strategy); this.strategy = zOrderStrategy(columnNames); return this; } @@ -160,7 +166,8 @@ public RewriteDataFiles.Result execute() { validateAndInitOptions(); - Map>> fileGroupsByPartition = planFileGroups(startingSnapshotId); + Map>> fileGroupsByPartition = + planFileGroups(startingSnapshotId); RewriteExecutionContext ctx = new RewriteExecutionContext(fileGroupsByPartition); if (ctx.totalGroupCount() == 0) { @@ -179,43 +186,52 @@ public RewriteDataFiles.Result execute() { } Map>> planFileGroups(long startingSnapshotId) { - CloseableIterable fileScanTasks = table.newScan() - .useSnapshot(startingSnapshotId) - .filter(filter) - .ignoreResiduals() - .planFiles(); + CloseableIterable fileScanTasks = + table + .newScan() + .useSnapshot(startingSnapshotId) + .filter(filter) + .ignoreResiduals() + .planFiles(); try { StructType partitionType = table.spec().partitionType(); StructLikeMap> filesByPartition = StructLikeMap.create(partitionType); StructLike emptyStruct = GenericRecord.create(partitionType); - fileScanTasks.forEach(task -> { - // If a task uses an incompatible partition spec the data inside could contain values which - // belong to multiple partitions in the current spec. Treating all such files as un-partitioned and - // grouping them together helps to minimize new files made. - StructLike taskPartition = task.file().specId() == table.spec().specId() ? - task.file().partition() : emptyStruct; - - List files = filesByPartition.get(taskPartition); - if (files == null) { - files = Lists.newArrayList(); - } - - files.add(task); - filesByPartition.put(taskPartition, files); - }); - - StructLikeMap>> fileGroupsByPartition = StructLikeMap.create(partitionType); - - filesByPartition.forEach((partition, tasks) -> { - Iterable filtered = strategy.selectFilesToRewrite(tasks); - Iterable> groupedTasks = strategy.planFileGroups(filtered); - List> fileGroups = ImmutableList.copyOf(groupedTasks); - if (fileGroups.size() > 0) { - fileGroupsByPartition.put(partition, fileGroups); - } - }); + fileScanTasks.forEach( + task -> { + // If a task uses an incompatible partition spec the data inside could contain values + // which + // belong to multiple partitions in the current spec. Treating all such files as + // un-partitioned and + // grouping them together helps to minimize new files made. + StructLike taskPartition = + task.file().specId() == table.spec().specId() + ? 
task.file().partition() + : emptyStruct; + + List files = filesByPartition.get(taskPartition); + if (files == null) { + files = Lists.newArrayList(); + } + + files.add(task); + filesByPartition.put(taskPartition, files); + }); + + StructLikeMap>> fileGroupsByPartition = + StructLikeMap.create(partitionType); + + filesByPartition.forEach( + (partition, tasks) -> { + Iterable filtered = strategy.selectFilesToRewrite(tasks); + Iterable> groupedTasks = strategy.planFileGroups(filtered); + List> fileGroups = ImmutableList.copyOf(groupedTasks); + if (fileGroups.size() > 0) { + fileGroupsByPartition.put(partition, fileGroups); + } + }); return fileGroupsByPartition; } finally { @@ -230,9 +246,10 @@ Map>> planFileGroups(long startingSnapshotId @VisibleForTesting RewriteFileGroup rewriteFiles(RewriteExecutionContext ctx, RewriteFileGroup fileGroup) { String desc = jobDesc(fileGroup, ctx); - Set addedFiles = withJobGroupInfo( - newJobGroupInfo("REWRITE-DATA-FILES", desc), - () -> strategy.rewriteFiles(fileGroup.fileScans())); + Set addedFiles = + withJobGroupInfo( + newJobGroupInfo("REWRITE-DATA-FILES", desc), + () -> strategy.rewriteFiles(fileGroup.fileScans())); fileGroup.setOutputFiles(addedFiles); LOG.info("Rewrite Files Ready to be Committed - {}", desc); @@ -241,11 +258,10 @@ RewriteFileGroup rewriteFiles(RewriteExecutionContext ctx, RewriteFileGroup file private ExecutorService rewriteService() { return MoreExecutors.getExitingExecutorService( - (ThreadPoolExecutor) Executors.newFixedThreadPool( - maxConcurrentFileGroupRewrites, - new ThreadFactoryBuilder() - .setNameFormat("Rewrite-Service-%d") - .build())); + (ThreadPoolExecutor) + Executors.newFixedThreadPool( + maxConcurrentFileGroupRewrites, + new ThreadFactoryBuilder().setNameFormat("Rewrite-Service-%d").build())); } @VisibleForTesting @@ -253,31 +269,42 @@ RewriteDataFilesCommitManager commitManager(long startingSnapshotId) { return new RewriteDataFilesCommitManager(table, startingSnapshotId, useStartingSequenceNumber); } - private Result doExecute(RewriteExecutionContext ctx, Stream groupStream, - RewriteDataFilesCommitManager commitManager) { + private Result doExecute( + RewriteExecutionContext ctx, + Stream groupStream, + RewriteDataFilesCommitManager commitManager) { ExecutorService rewriteService = rewriteService(); ConcurrentLinkedQueue rewrittenGroups = Queues.newConcurrentLinkedQueue(); - Tasks.Builder rewriteTaskBuilder = Tasks.foreach(groupStream) - .executeWith(rewriteService) - .stopOnFailure() - .noRetry() - .onFailure((fileGroup, exception) -> { - LOG.warn("Failure during rewrite process for group {}", fileGroup.info(), exception); - }); + Tasks.Builder rewriteTaskBuilder = + Tasks.foreach(groupStream) + .executeWith(rewriteService) + .stopOnFailure() + .noRetry() + .onFailure( + (fileGroup, exception) -> { + LOG.warn( + "Failure during rewrite process for group {}", fileGroup.info(), exception); + }); try { - rewriteTaskBuilder.run(fileGroup -> { - rewrittenGroups.add(rewriteFiles(ctx, fileGroup)); - }); + rewriteTaskBuilder.run( + fileGroup -> { + rewrittenGroups.add(rewriteFiles(ctx, fileGroup)); + }); } catch (Exception e) { // At least one rewrite group failed, clean up all completed rewrites - LOG.error("Cannot complete rewrite, {} is not enabled and one of the file set groups failed to " + - "be rewritten. This error occurred during the writing of new files, not during the commit process. This " + - "indicates something is wrong that doesn't involve conflicts with other Iceberg operations. 
Enabling " + - "{} may help in this case but the root cause should be investigated. Cleaning up {} groups which finished " + - "being written.", PARTIAL_PROGRESS_ENABLED, PARTIAL_PROGRESS_ENABLED, rewrittenGroups.size(), e); + LOG.error( + "Cannot complete rewrite, {} is not enabled and one of the file set groups failed to " + + "be rewritten. This error occurred during the writing of new files, not during the commit process. This " + + "indicates something is wrong that doesn't involve conflicts with other Iceberg operations. Enabling " + + "{} may help in this case but the root cause should be investigated. Cleaning up {} groups which finished " + + "being written.", + PARTIAL_PROGRESS_ENABLED, + PARTIAL_PROGRESS_ENABLED, + rewrittenGroups.size(), + e); Tasks.foreach(rewrittenGroups) .suppressFailureWhenFinished() @@ -290,30 +317,33 @@ private Result doExecute(RewriteExecutionContext ctx, Stream g try { commitManager.commitOrClean(Sets.newHashSet(rewrittenGroups)); } catch (ValidationException | CommitFailedException e) { - String errorMessage = String.format( - "Cannot commit rewrite because of a ValidationException or CommitFailedException. This usually means that " + - "this rewrite has conflicted with another concurrent Iceberg operation. To reduce the likelihood of " + - "conflicts, set %s which will break up the rewrite into multiple smaller commits controlled by %s. " + - "Separate smaller rewrite commits can succeed independently while any commits that conflict with " + - "another Iceberg operation will be ignored. This mode will create additional snapshots in the table " + - "history, one for each commit.", - PARTIAL_PROGRESS_ENABLED, PARTIAL_PROGRESS_MAX_COMMITS); + String errorMessage = + String.format( + "Cannot commit rewrite because of a ValidationException or CommitFailedException. This usually means that " + + "this rewrite has conflicted with another concurrent Iceberg operation. To reduce the likelihood of " + + "conflicts, set %s which will break up the rewrite into multiple smaller commits controlled by %s. " + + "Separate smaller rewrite commits can succeed independently while any commits that conflict with " + + "another Iceberg operation will be ignored. 
This mode will create additional snapshots in the table " + + "history, one for each commit.", + PARTIAL_PROGRESS_ENABLED, PARTIAL_PROGRESS_MAX_COMMITS); throw new RuntimeException(errorMessage, e); } - List rewriteResults = rewrittenGroups.stream() - .map(RewriteFileGroup::asResult) - .collect(Collectors.toList()); + List rewriteResults = + rewrittenGroups.stream().map(RewriteFileGroup::asResult).collect(Collectors.toList()); return new BaseRewriteDataFilesResult(rewriteResults); } - private Result doExecuteWithPartialProgress(RewriteExecutionContext ctx, Stream groupStream, - RewriteDataFilesCommitManager commitManager) { + private Result doExecuteWithPartialProgress( + RewriteExecutionContext ctx, + Stream groupStream, + RewriteDataFilesCommitManager commitManager) { ExecutorService rewriteService = rewriteService(); // Start Commit Service int groupsPerCommit = IntMath.divide(ctx.totalGroupCount(), maxCommits, RoundingMode.CEILING); - RewriteDataFilesCommitManager.CommitService commitService = commitManager.service(groupsPerCommit); + RewriteDataFilesCommitManager.CommitService commitService = + commitManager.service(groupsPerCommit); commitService.start(); // Start rewrite tasks @@ -321,7 +351,9 @@ private Result doExecuteWithPartialProgress(RewriteExecutionContext ctx, Stream< .suppressFailureWhenFinished() .executeWith(rewriteService) .noRetry() - .onFailure((fileGroup, exception) -> LOG.error("Failure during rewrite group {}", fileGroup.info(), exception)) + .onFailure( + (fileGroup, exception) -> + LOG.error("Failure during rewrite group {}", fileGroup.info(), exception)) .run(fileGroup -> commitService.offer(rewriteFiles(ctx, fileGroup))); rewriteService.shutdown(); @@ -329,30 +361,39 @@ private Result doExecuteWithPartialProgress(RewriteExecutionContext ctx, Stream< commitService.close(); List commitResults = commitService.results(); if (commitResults.size() == 0) { - LOG.error("{} is true but no rewrite commits succeeded. Check the logs to determine why the individual " + - "commits failed. If this is persistent it may help to increase {} which will break the rewrite operation " + - "into smaller commits.", PARTIAL_PROGRESS_ENABLED, PARTIAL_PROGRESS_MAX_COMMITS); + LOG.error( + "{} is true but no rewrite commits succeeded. Check the logs to determine why the individual " + + "commits failed. 
If this is persistent it may help to increase {} which will break the rewrite operation " + + "into smaller commits.", + PARTIAL_PROGRESS_ENABLED, + PARTIAL_PROGRESS_MAX_COMMITS); } - List rewriteResults = commitResults.stream() - .map(RewriteFileGroup::asResult) - .collect(Collectors.toList()); + List rewriteResults = + commitResults.stream().map(RewriteFileGroup::asResult).collect(Collectors.toList()); return new BaseRewriteDataFilesResult(rewriteResults); } - Stream toGroupStream(RewriteExecutionContext ctx, + Stream toGroupStream( + RewriteExecutionContext ctx, Map>> fileGroupsByPartition) { - Stream rewriteFileGroupStream = fileGroupsByPartition.entrySet().stream() - .flatMap(e -> { - StructLike partition = e.getKey(); - List> fileGroups = e.getValue(); - return fileGroups.stream().map(tasks -> { - int globalIndex = ctx.currentGlobalIndex(); - int partitionIndex = ctx.currentPartitionIndex(partition); - FileGroupInfo info = new BaseRewriteDataFilesFileGroupInfo(globalIndex, partitionIndex, partition); - return new RewriteFileGroup(info, tasks); - }); - }); + Stream rewriteFileGroupStream = + fileGroupsByPartition.entrySet().stream() + .flatMap( + e -> { + StructLike partition = e.getKey(); + List> fileGroups = e.getValue(); + return fileGroups.stream() + .map( + tasks -> { + int globalIndex = ctx.currentGlobalIndex(); + int partitionIndex = ctx.currentPartitionIndex(partition); + FileGroupInfo info = + new BaseRewriteDataFilesFileGroupInfo( + globalIndex, partitionIndex, partition); + return new RewriteFileGroup(info, tasks); + }); + }); return rewriteFileGroupStream.sorted(rewriteGroupComparator()); } @@ -379,53 +420,70 @@ void validateAndInitOptions() { Set invalidKeys = Sets.newHashSet(options().keySet()); invalidKeys.removeAll(validOptions); - Preconditions.checkArgument(invalidKeys.isEmpty(), + Preconditions.checkArgument( + invalidKeys.isEmpty(), "Cannot use options %s, they are not supported by the action or the strategy %s", - invalidKeys, strategy.name()); + invalidKeys, + strategy.name()); strategy = strategy.options(options()); - maxConcurrentFileGroupRewrites = PropertyUtil.propertyAsInt(options(), - MAX_CONCURRENT_FILE_GROUP_REWRITES, - MAX_CONCURRENT_FILE_GROUP_REWRITES_DEFAULT); + maxConcurrentFileGroupRewrites = + PropertyUtil.propertyAsInt( + options(), + MAX_CONCURRENT_FILE_GROUP_REWRITES, + MAX_CONCURRENT_FILE_GROUP_REWRITES_DEFAULT); - maxCommits = PropertyUtil.propertyAsInt(options(), - PARTIAL_PROGRESS_MAX_COMMITS, - PARTIAL_PROGRESS_MAX_COMMITS_DEFAULT); + maxCommits = + PropertyUtil.propertyAsInt( + options(), PARTIAL_PROGRESS_MAX_COMMITS, PARTIAL_PROGRESS_MAX_COMMITS_DEFAULT); - partialProgressEnabled = PropertyUtil.propertyAsBoolean(options(), - PARTIAL_PROGRESS_ENABLED, - PARTIAL_PROGRESS_ENABLED_DEFAULT); + partialProgressEnabled = + PropertyUtil.propertyAsBoolean( + options(), PARTIAL_PROGRESS_ENABLED, PARTIAL_PROGRESS_ENABLED_DEFAULT); - useStartingSequenceNumber = PropertyUtil.propertyAsBoolean(options(), - USE_STARTING_SEQUENCE_NUMBER, - USE_STARTING_SEQUENCE_NUMBER_DEFAULT); + useStartingSequenceNumber = + PropertyUtil.propertyAsBoolean( + options(), USE_STARTING_SEQUENCE_NUMBER, USE_STARTING_SEQUENCE_NUMBER_DEFAULT); - rewriteJobOrder = RewriteJobOrder.fromName(PropertyUtil.propertyAsString(options(), - REWRITE_JOB_ORDER, - REWRITE_JOB_ORDER_DEFAULT)); + rewriteJobOrder = + RewriteJobOrder.fromName( + PropertyUtil.propertyAsString(options(), REWRITE_JOB_ORDER, REWRITE_JOB_ORDER_DEFAULT)); - Preconditions.checkArgument(maxConcurrentFileGroupRewrites >= 
1, + Preconditions.checkArgument( + maxConcurrentFileGroupRewrites >= 1, "Cannot set %s to %s, the value must be positive.", - MAX_CONCURRENT_FILE_GROUP_REWRITES, maxConcurrentFileGroupRewrites); + MAX_CONCURRENT_FILE_GROUP_REWRITES, + maxConcurrentFileGroupRewrites); - Preconditions.checkArgument(!partialProgressEnabled || maxCommits > 0, + Preconditions.checkArgument( + !partialProgressEnabled || maxCommits > 0, "Cannot set %s to %s, the value must be positive when %s is true", - PARTIAL_PROGRESS_MAX_COMMITS, maxCommits, PARTIAL_PROGRESS_ENABLED); + PARTIAL_PROGRESS_MAX_COMMITS, + maxCommits, + PARTIAL_PROGRESS_ENABLED); } private String jobDesc(RewriteFileGroup group, RewriteExecutionContext ctx) { StructLike partition = group.info().partition(); if (partition.size() > 0) { - return String.format("Rewriting %d files (%s, file group %d/%d, %s (%d/%d)) in %s", + return String.format( + "Rewriting %d files (%s, file group %d/%d, %s (%d/%d)) in %s", group.rewrittenFiles().size(), - strategy.name(), group.info().globalIndex(), - ctx.totalGroupCount(), partition, group.info().partitionIndex(), ctx.groupsInPartition(partition), + strategy.name(), + group.info().globalIndex(), + ctx.totalGroupCount(), + partition, + group.info().partitionIndex(), + ctx.groupsInPartition(partition), table.name()); } else { - return String.format("Rewriting %d files (%s, file group %d/%d) in %s", + return String.format( + "Rewriting %d files (%s, file group %d/%d) in %s", group.rewrittenFiles().size(), - strategy.name(), group.info().globalIndex(), ctx.totalGroupCount(), + strategy.name(), + group.info().globalIndex(), + ctx.totalGroupCount(), table.name()); } } @@ -450,11 +508,10 @@ static class RewriteExecutionContext { private final AtomicInteger groupIndex; RewriteExecutionContext(Map>> fileGroupsByPartition) { - this.numGroupsByPartition = fileGroupsByPartition.entrySet().stream() - .collect(Collectors.toMap(Map.Entry::getKey, e -> e.getValue().size())); - this.totalGroupCount = numGroupsByPartition.values().stream() - .reduce(Integer::sum) - .orElse(0); + this.numGroupsByPartition = + fileGroupsByPartition.entrySet().stream() + .collect(Collectors.toMap(Map.Entry::getKey, e -> e.getValue().size())); + this.totalGroupCount = numGroupsByPartition.values().stream().reduce(Integer::sum).orElse(0); this.partitionIndexMap = Maps.newConcurrentMap(); this.groupIndex = new AtomicInteger(1); } diff --git a/spark/v3.3/spark/src/main/java/org/apache/iceberg/spark/actions/RewriteManifestsSparkAction.java b/spark/v3.3/spark/src/main/java/org/apache/iceberg/spark/actions/RewriteManifestsSparkAction.java index 99e51a37aa30..1e0034eb3005 100644 --- a/spark/v3.3/spark/src/main/java/org/apache/iceberg/spark/actions/RewriteManifestsSparkAction.java +++ b/spark/v3.3/spark/src/main/java/org/apache/iceberg/spark/actions/RewriteManifestsSparkAction.java @@ -16,9 +16,10 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.actions; +import static org.apache.iceberg.MetadataTableType.ENTRIES; + import java.io.IOException; import java.util.Collections; import java.util.List; @@ -69,19 +70,16 @@ import org.slf4j.Logger; import org.slf4j.LoggerFactory; -import static org.apache.iceberg.MetadataTableType.ENTRIES; - /** * An action that rewrites manifests in a distributed manner and co-locates metadata for partitions. - *

- * By default, this action rewrites all manifests for the current partition spec and writes the result
- * to the metadata folder. The behavior can be modified by passing a custom predicate to {@link #rewriteIf(Predicate)}
- * and a custom spec id to {@link #specId(int)}. In addition, there is a way to configure a custom location
- * for new manifests via {@link #stagingLocation}.
+ *
+ * <p>
    By default, this action rewrites all manifests for the current partition spec and writes the + * result to the metadata folder. The behavior can be modified by passing a custom predicate to + * {@link #rewriteIf(Predicate)} and a custom spec id to {@link #specId(int)}. In addition, there is + * a way to configure a custom location for new manifests via {@link #stagingLocation}. */ public class RewriteManifestsSparkAction - extends BaseSnapshotUpdateSparkAction - implements RewriteManifests { + extends BaseSnapshotUpdateSparkAction implements RewriteManifests { public static final String USE_CACHING = "use-caching"; public static final boolean USE_CACHING_DEFAULT = true; @@ -103,10 +101,11 @@ public class RewriteManifestsSparkAction this.manifestEncoder = Encoders.javaSerialization(ManifestFile.class); this.table = table; this.spec = table.spec(); - this.targetManifestSizeBytes = PropertyUtil.propertyAsLong( - table.properties(), - TableProperties.MANIFEST_TARGET_SIZE_BYTES, - TableProperties.MANIFEST_TARGET_SIZE_BYTES_DEFAULT); + this.targetManifestSizeBytes = + PropertyUtil.propertyAsLong( + table.properties(), + TableProperties.MANIFEST_TARGET_SIZE_BYTES, + TableProperties.MANIFEST_TARGET_SIZE_BYTES_DEFAULT); this.fileIO = SparkUtil.serializableFileIO(table); // default the staging location to the metadata location @@ -144,7 +143,9 @@ public RewriteManifestsSparkAction stagingLocation(String newStagingLocation) { @Override public RewriteManifests.Result execute() { - String desc = String.format("Rewriting manifests (staging location=%s) of %s", stagingLocation, table.name()); + String desc = + String.format( + "Rewriting manifests (staging location=%s) of %s", stagingLocation, table.name()); JobGroupInfo info = newJobGroupInfo("REWRITE-MANIFESTS", desc); return withJobGroupInfo(info, this::doExecute); } @@ -159,10 +160,12 @@ private RewriteManifests.Result doExecute() { int numEntries = 0; for (ManifestFile manifest : matchingManifests) { - ValidationException.check(hasFileCounts(manifest), "No file counts in manifest: %s", manifest.path()); + ValidationException.check( + hasFileCounts(manifest), "No file counts in manifest: %s", manifest.path()); totalSizeBytes += manifest.length(); - numEntries += manifest.addedFilesCount() + manifest.existingFilesCount() + manifest.deletedFilesCount(); + numEntries += + manifest.addedFilesCount() + manifest.existingFilesCount() + manifest.deletedFilesCount(); } int targetNumManifests = targetNumManifests(totalSizeBytes); @@ -174,7 +177,9 @@ private RewriteManifests.Result doExecute() { if (spec.fields().size() < 1) { newManifests = writeManifestsForUnpartitionedTable(manifestEntryDF, targetNumManifests); } else { - newManifests = writeManifestsForPartitionedTable(manifestEntryDF, targetNumManifests, targetNumManifestEntries); + newManifests = + writeManifestsForPartitionedTable( + manifestEntryDF, targetNumManifests, targetNumManifestEntries); } replaceManifests(matchingManifests, newManifests); @@ -183,13 +188,16 @@ private RewriteManifests.Result doExecute() { } private Dataset buildManifestEntryDF(List manifests) { - Dataset manifestDF = spark() - .createDataset(Lists.transform(manifests, ManifestFile::path), Encoders.STRING()) - .toDF("manifest"); + Dataset manifestDF = + spark() + .createDataset(Lists.transform(manifests, ManifestFile::path), Encoders.STRING()) + .toDF("manifest"); - Dataset manifestEntryDF = loadMetadataTable(table, ENTRIES) - .filter("status < 2") // select only live entries - .selectExpr("input_file_name() as manifest", 
"snapshot_id", "sequence_number", "data_file"); + Dataset manifestEntryDF = + loadMetadataTable(table, ENTRIES) + .filter("status < 2") // select only live entries + .selectExpr( + "input_file_name() as manifest", "snapshot_id", "sequence_number", "data_file"); Column joinCond = manifestDF.col("manifest").equalTo(manifestEntryDF.col("manifest")); return manifestEntryDF @@ -197,7 +205,8 @@ private Dataset buildManifestEntryDF(List manifests) { .select("snapshot_id", "sequence_number", "data_file"); } - private List writeManifestsForUnpartitionedTable(Dataset manifestEntryDF, int numManifests) { + private List writeManifestsForUnpartitionedTable( + Dataset manifestEntryDF, int numManifests) { Broadcast io = sparkContext().broadcast(fileIO); StructType sparkType = (StructType) manifestEntryDF.schema().apply("data_file").dataType(); @@ -209,41 +218,44 @@ private List writeManifestsForUnpartitionedTable(Dataset mani .repartition(numManifests) .mapPartitions( toManifests(io, maxNumManifestEntries, stagingLocation, formatVersion, spec, sparkType), - manifestEncoder - ) + manifestEncoder) .collectAsList(); } private List writeManifestsForPartitionedTable( - Dataset manifestEntryDF, int numManifests, - int targetNumManifestEntries) { + Dataset manifestEntryDF, int numManifests, int targetNumManifestEntries) { Broadcast io = sparkContext().broadcast(fileIO); StructType sparkType = (StructType) manifestEntryDF.schema().apply("data_file").dataType(); - // we allow the actual size of manifests to be 10% higher if the estimation is not precise enough + // we allow the actual size of manifests to be 10% higher if the estimation is not precise + // enough long maxNumManifestEntries = (long) (1.1 * targetNumManifestEntries); - return withReusableDS(manifestEntryDF, df -> { - Column partitionColumn = df.col("data_file.partition"); - return df.repartitionByRange(numManifests, partitionColumn) - .sortWithinPartitions(partitionColumn) - .mapPartitions( - toManifests(io, maxNumManifestEntries, stagingLocation, formatVersion, spec, sparkType), - manifestEncoder - ) - .collectAsList(); - }); + return withReusableDS( + manifestEntryDF, + df -> { + Column partitionColumn = df.col("data_file.partition"); + return df.repartitionByRange(numManifests, partitionColumn) + .sortWithinPartitions(partitionColumn) + .mapPartitions( + toManifests( + io, maxNumManifestEntries, stagingLocation, formatVersion, spec, sparkType), + manifestEncoder) + .collectAsList(); + }); } private U withReusableDS(Dataset ds, Function, U> func) { Dataset reusableDS; - boolean useCaching = PropertyUtil.propertyAsBoolean(options(), USE_CACHING, USE_CACHING_DEFAULT); + boolean useCaching = + PropertyUtil.propertyAsBoolean(options(), USE_CACHING, USE_CACHING_DEFAULT); if (useCaching) { reusableDS = ds.cache(); } else { int parallelism = SQLConf.get().numShufflePartitions(); - reusableDS = ds.repartition(parallelism).map((MapFunction) value -> value, ds.exprEnc()); + reusableDS = + ds.repartition(parallelism).map((MapFunction) value -> value, ds.exprEnc()); } try { @@ -276,17 +288,19 @@ private int targetNumManifestEntries(int numEntries, int numManifests) { } private boolean hasFileCounts(ManifestFile manifest) { - return manifest.addedFilesCount() != null && - manifest.existingFilesCount() != null && - manifest.deletedFilesCount() != null; + return manifest.addedFilesCount() != null + && manifest.existingFilesCount() != null + && manifest.deletedFilesCount() != null; } - private void replaceManifests(Iterable deletedManifests, Iterable 
addedManifests) { + private void replaceManifests( + Iterable deletedManifests, Iterable addedManifests) { try { - boolean snapshotIdInheritanceEnabled = PropertyUtil.propertyAsBoolean( - table.properties(), - TableProperties.SNAPSHOT_ID_INHERITANCE_ENABLED, - TableProperties.SNAPSHOT_ID_INHERITANCE_ENABLED_DEFAULT); + boolean snapshotIdInheritanceEnabled = + PropertyUtil.propertyAsBoolean( + table.properties(), + TableProperties.SNAPSHOT_ID_INHERITANCE_ENABLED, + TableProperties.SNAPSHOT_ID_INHERITANCE_ENABLED_DEFAULT); org.apache.iceberg.RewriteManifests rewriteManifests = table.rewriteManifests(); deletedManifests.forEach(rewriteManifests::deleteManifest); @@ -317,12 +331,20 @@ private void deleteFiles(Iterable locations) { } private static ManifestFile writeManifest( - List rows, int startIndex, int endIndex, Broadcast io, - String location, int format, PartitionSpec spec, StructType sparkType) throws IOException { + List rows, + int startIndex, + int endIndex, + Broadcast io, + String location, + int format, + PartitionSpec spec, + StructType sparkType) + throws IOException { String manifestName = "optimized-m-" + UUID.randomUUID(); Path manifestPath = new Path(location, manifestName); - OutputFile outputFile = io.value().newOutputFile(FileFormat.AVRO.addExtension(manifestPath.toString())); + OutputFile outputFile = + io.value().newOutputFile(FileFormat.AVRO.addExtension(manifestPath.toString())); Types.StructType dataFileType = DataFile.getType(spec.partitionType()); SparkDataFile wrapper = new SparkDataFile(dataFileType, sparkType); @@ -345,8 +367,12 @@ private static ManifestFile writeManifest( } private static MapPartitionsFunction toManifests( - Broadcast io, long maxNumManifestEntries, String location, - int format, PartitionSpec spec, StructType sparkType) { + Broadcast io, + long maxNumManifestEntries, + String location, + int format, + PartitionSpec spec, + StructType sparkType) { return rows -> { List rowsAsList = Lists.newArrayList(rows); @@ -357,11 +383,15 @@ private static MapPartitionsFunction toManifests( List manifests = Lists.newArrayList(); if (rowsAsList.size() <= maxNumManifestEntries) { - manifests.add(writeManifest(rowsAsList, 0, rowsAsList.size(), io, location, format, spec, sparkType)); + manifests.add( + writeManifest(rowsAsList, 0, rowsAsList.size(), io, location, format, spec, sparkType)); } else { int midIndex = rowsAsList.size() / 2; - manifests.add(writeManifest(rowsAsList, 0, midIndex, io, location, format, spec, sparkType)); - manifests.add(writeManifest(rowsAsList, midIndex, rowsAsList.size(), io, location, format, spec, sparkType)); + manifests.add( + writeManifest(rowsAsList, 0, midIndex, io, location, format, spec, sparkType)); + manifests.add( + writeManifest( + rowsAsList, midIndex, rowsAsList.size(), io, location, format, spec, sparkType)); } return manifests.iterator(); diff --git a/spark/v3.3/spark/src/main/java/org/apache/iceberg/spark/actions/SnapshotTableSparkAction.java b/spark/v3.3/spark/src/main/java/org/apache/iceberg/spark/actions/SnapshotTableSparkAction.java index 526b46af1a88..289e408b8960 100644 --- a/spark/v3.3/spark/src/main/java/org/apache/iceberg/spark/actions/SnapshotTableSparkAction.java +++ b/spark/v3.3/spark/src/main/java/org/apache/iceberg/spark/actions/SnapshotTableSparkAction.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.spark.actions; import java.util.Map; @@ -44,12 +43,10 @@ import scala.collection.JavaConverters; /** - * Creates a new Iceberg table based on a source Spark table. The new Iceberg table will - * have a different data and metadata directory allowing it to exist independently of the - * source table. + * Creates a new Iceberg table based on a source Spark table. The new Iceberg table will have a + * different data and metadata directory allowing it to exist independently of the source table. */ -public class SnapshotTableSparkAction - extends BaseTableCreationSparkAction +public class SnapshotTableSparkAction extends BaseTableCreationSparkAction implements SnapshotTable { private static final Logger LOG = LoggerFactory.getLogger(SnapshotTableSparkAction.class); @@ -58,7 +55,8 @@ public class SnapshotTableSparkAction private Identifier destTableIdent; private String destTableLocation = null; - SnapshotTableSparkAction(SparkSession spark, CatalogPlugin sourceCatalog, Identifier sourceTableIdent) { + SnapshotTableSparkAction( + SparkSession spark, CatalogPlugin sourceCatalog, Identifier sourceTableIdent) { super(spark, sourceCatalog, sourceTableIdent); } @@ -81,7 +79,8 @@ protected Identifier destTableIdent() { public SnapshotTableSparkAction as(String ident) { String ctx = "snapshot destination"; CatalogPlugin defaultCatalog = spark().sessionState().catalogManager().currentCatalog(); - CatalogAndIdentifier catalogAndIdent = Spark3Util.catalogAndIdentifier(ctx, spark(), ident, defaultCatalog); + CatalogAndIdentifier catalogAndIdent = + Spark3Util.catalogAndIdentifier(ctx, spark(), ident, defaultCatalog); this.destCatalog = checkDestinationCatalog(catalogAndIdent.catalog()); this.destTableIdent = catalogAndIdent.identifier(); return this; @@ -107,11 +106,13 @@ public SnapshotTable.Result execute() { } private SnapshotTable.Result doExecute() { - Preconditions.checkArgument(destCatalog() != null && destTableIdent() != null, - "The destination catalog and identifier cannot be null. " + - "Make sure to configure the action with a valid destination table identifier via the `as` method."); + Preconditions.checkArgument( + destCatalog() != null && destTableIdent() != null, + "The destination catalog and identifier cannot be null. 
" + + "Make sure to configure the action with a valid destination table identifier via the `as` method."); - LOG.info("Staging a new Iceberg table {} as a snapshot of {}", destTableIdent(), sourceTableIdent()); + LOG.info( + "Staging a new Iceberg table {} as a snapshot of {}", destTableIdent(), sourceTableIdent()); StagedSparkTable stagedTable = stageDestTable(); Table icebergTable = stagedTable.table(); @@ -143,8 +144,12 @@ private SnapshotTable.Result doExecute() { } Snapshot snapshot = icebergTable.currentSnapshot(); - long importedDataFilesCount = Long.parseLong(snapshot.summary().get(SnapshotSummary.TOTAL_DATA_FILES_PROP)); - LOG.info("Successfully loaded Iceberg metadata for {} files to {}", importedDataFilesCount, destTableIdent()); + long importedDataFilesCount = + Long.parseLong(snapshot.summary().get(SnapshotSummary.TOTAL_DATA_FILES_PROP)); + LOG.info( + "Successfully loaded Iceberg metadata for {} files to {}", + importedDataFilesCount, + destTableIdent()); return new BaseSnapshotTableActionResult(importedDataFilesCount); } @@ -182,22 +187,27 @@ protected Map destTableProps() { @Override protected TableCatalog checkSourceCatalog(CatalogPlugin catalog) { // currently the import code relies on being able to look up the table in the session catalog - Preconditions.checkArgument(catalog.name().equalsIgnoreCase("spark_catalog"), - "Cannot snapshot a table that isn't in the session catalog (i.e. spark_catalog). " + - "Found source catalog: %s.", catalog.name()); - - Preconditions.checkArgument(catalog instanceof TableCatalog, + Preconditions.checkArgument( + catalog.name().equalsIgnoreCase("spark_catalog"), + "Cannot snapshot a table that isn't in the session catalog (i.e. spark_catalog). " + + "Found source catalog: %s.", + catalog.name()); + + Preconditions.checkArgument( + catalog instanceof TableCatalog, "Cannot snapshot as catalog %s of class %s in not a table catalog", - catalog.name(), catalog.getClass().getName()); + catalog.name(), + catalog.getClass().getName()); return (TableCatalog) catalog; } @Override public SnapshotTableSparkAction tableLocation(String location) { - Preconditions.checkArgument(!sourceTableLocation().equals(location), - "The snapshot table location cannot be same as the source table location. " + - "This would mix snapshot table files with original table files."); + Preconditions.checkArgument( + !sourceTableLocation().equals(location), + "The snapshot table location cannot be same as the source table location. " + + "This would mix snapshot table files with original table files."); this.destTableLocation = location; return this; } diff --git a/spark/v3.3/spark/src/main/java/org/apache/iceberg/spark/actions/SparkActions.java b/spark/v3.3/spark/src/main/java/org/apache/iceberg/spark/actions/SparkActions.java index 330841b9f182..8c886adf510e 100644 --- a/spark/v3.3/spark/src/main/java/org/apache/iceberg/spark/actions/SparkActions.java +++ b/spark/v3.3/spark/src/main/java/org/apache/iceberg/spark/actions/SparkActions.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.actions; import org.apache.iceberg.Table; @@ -28,9 +27,9 @@ /** * An implementation of {@link ActionsProvider} for Spark. - *

- * This class is the primary API for interacting with actions in Spark that users should use
- * to instantiate particular actions.
+ *
+ * <p>
    This class is the primary API for interacting with actions in Spark that users should use to + * instantiate particular actions. */ public class SparkActions implements ActionsProvider { @@ -52,16 +51,20 @@ public static SparkActions get() { public SnapshotTableSparkAction snapshotTable(String tableIdent) { String ctx = "snapshot source"; CatalogPlugin defaultCatalog = spark.sessionState().catalogManager().currentCatalog(); - CatalogAndIdentifier catalogAndIdent = Spark3Util.catalogAndIdentifier(ctx, spark, tableIdent, defaultCatalog); - return new SnapshotTableSparkAction(spark, catalogAndIdent.catalog(), catalogAndIdent.identifier()); + CatalogAndIdentifier catalogAndIdent = + Spark3Util.catalogAndIdentifier(ctx, spark, tableIdent, defaultCatalog); + return new SnapshotTableSparkAction( + spark, catalogAndIdent.catalog(), catalogAndIdent.identifier()); } @Override public MigrateTableSparkAction migrateTable(String tableIdent) { String ctx = "migrate target"; CatalogPlugin defaultCatalog = spark.sessionState().catalogManager().currentCatalog(); - CatalogAndIdentifier catalogAndIdent = Spark3Util.catalogAndIdentifier(ctx, spark, tableIdent, defaultCatalog); - return new MigrateTableSparkAction(spark, catalogAndIdent.catalog(), catalogAndIdent.identifier()); + CatalogAndIdentifier catalogAndIdent = + Spark3Util.catalogAndIdentifier(ctx, spark, tableIdent, defaultCatalog); + return new MigrateTableSparkAction( + spark, catalogAndIdent.catalog(), catalogAndIdent.identifier()); } @Override diff --git a/spark/v3.3/spark/src/main/java/org/apache/iceberg/spark/actions/SparkBinPackStrategy.java b/spark/v3.3/spark/src/main/java/org/apache/iceberg/spark/actions/SparkBinPackStrategy.java index d8c1cc3610bd..aaa63c014165 100644 --- a/spark/v3.3/spark/src/main/java/org/apache/iceberg/spark/actions/SparkBinPackStrategy.java +++ b/spark/v3.3/spark/src/main/java/org/apache/iceberg/spark/actions/SparkBinPackStrategy.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.actions; import java.util.List; @@ -65,21 +64,27 @@ public Set rewriteFiles(List filesToRewrite) { SparkSession cloneSession = spark.cloneSession(); cloneSession.conf().set(SQLConf.ADAPTIVE_EXECUTION_ENABLED().key(), false); - Dataset scanDF = cloneSession.read().format("iceberg") - .option(SparkReadOptions.FILE_SCAN_TASK_SET_ID, groupID) - .option(SparkReadOptions.SPLIT_SIZE, splitSize(inputFileSize(filesToRewrite))) - .option(SparkReadOptions.FILE_OPEN_COST, "0") - .load(groupID); + Dataset scanDF = + cloneSession + .read() + .format("iceberg") + .option(SparkReadOptions.FILE_SCAN_TASK_SET_ID, groupID) + .option(SparkReadOptions.SPLIT_SIZE, splitSize(inputFileSize(filesToRewrite))) + .option(SparkReadOptions.FILE_OPEN_COST, "0") + .load(groupID); // All files within a file group are written with the same spec, so check the first boolean requiresRepartition = !filesToRewrite.get(0).spec().equals(table.spec()); // Invoke a shuffle if the partition spec of the incoming partition does not match the table - String distributionMode = requiresRepartition ? DistributionMode.RANGE.modeName() : - DistributionMode.NONE.modeName(); + String distributionMode = + requiresRepartition + ? 
DistributionMode.RANGE.modeName() + : DistributionMode.NONE.modeName(); // write the packed data into new files where each split becomes a new file - scanDF.write() + scanDF + .write() .format("iceberg") .option(SparkWriteOptions.REWRITTEN_FILE_SCAN_TASK_SET_ID, groupID) .option(SparkWriteOptions.TARGET_FILE_SIZE_BYTES, writeMaxFileSize()) diff --git a/spark/v3.3/spark/src/main/java/org/apache/iceberg/spark/actions/SparkSortStrategy.java b/spark/v3.3/spark/src/main/java/org/apache/iceberg/spark/actions/SparkSortStrategy.java index 97f46d79382f..285a46fc5431 100644 --- a/spark/v3.3/spark/src/main/java/org/apache/iceberg/spark/actions/SparkSortStrategy.java +++ b/spark/v3.3/spark/src/main/java/org/apache/iceberg/spark/actions/SparkSortStrategy.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.actions; import java.util.List; @@ -51,14 +50,13 @@ public class SparkSortStrategy extends SortStrategy { /** - * The number of shuffle partitions and consequently the number of output files - * created by the Spark Sort is based on the size of the input data files used - * in this rewrite operation. Due to compression, the disk file sizes may not - * accurately represent the size of files in the output. This parameter lets - * the user adjust the file size used for estimating actual output data size. A - * factor greater than 1.0 would generate more files than we would expect based - * on the on-disk file size. A value less than 1.0 would create fewer files than - * we would expect due to the on-disk size. + * The number of shuffle partitions and consequently the number of output files created by the + * Spark Sort is based on the size of the input data files used in this rewrite operation. Due to + * compression, the disk file sizes may not accurately represent the size of files in the output. + * This parameter lets the user adjust the file size used for estimating actual output data size. + * A factor greater than 1.0 would generate more files than we would expect based on the on-disk + * file size. A value less than 1.0 would create fewer files than we would expect due to the + * on-disk size. 
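[Editor's note, not part of this change: a minimal sketch of how the compression-factor option documented above could be passed to a sort-based rewrite. It assumes the RewriteDataFiles action exposed by SparkActions (defined outside this hunk), an already-loaded Iceberg Table named table, and an arbitrary illustrative value of 1.5.]

import org.apache.iceberg.Table;
import org.apache.iceberg.actions.RewriteDataFiles;
import org.apache.iceberg.spark.actions.SparkActions;

class CompressionFactorExample {
  // Sketch: sort-based rewrite that inflates the estimated output size by 1.5x,
  // producing more (smaller) output files than the on-disk input sizes alone would suggest.
  static RewriteDataFiles.Result rewriteSorted(Table table) {
    return SparkActions.get()
        .rewriteDataFiles(table)
        .sort()                               // selects the sort strategy using the table's sort order
        .option("compression-factor", "1.5")  // key defined below as COMPRESSION_FACTOR
        .execute();
  }
}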
*/ public static final String COMPRESSION_FACTOR = "compression-factor"; @@ -90,12 +88,12 @@ public Set validOptions() { @Override public RewriteStrategy options(Map options) { - sizeEstimateMultiple = PropertyUtil.propertyAsDouble(options, - COMPRESSION_FACTOR, - 1.0); + sizeEstimateMultiple = PropertyUtil.propertyAsDouble(options, COMPRESSION_FACTOR, 1.0); - Preconditions.checkArgument(sizeEstimateMultiple > 0, - "Invalid compression factor: %s (not positive)", sizeEstimateMultiple); + Preconditions.checkArgument( + sizeEstimateMultiple > 0, + "Invalid compression factor: %s (not positive)", + sizeEstimateMultiple); return super.options(options); } @@ -108,7 +106,9 @@ public Set rewriteFiles(List filesToRewrite) { SortOrder[] ordering; if (requiresRepartition) { // Build in the requirement for Partition Sorting into our sort order - ordering = SparkDistributionAndOrderingUtil.convert(SortOrderUtil.buildSortOrder(table, sortOrder())); + ordering = + SparkDistributionAndOrderingUtil.convert( + SortOrderUtil.buildSortOrder(table, sortOrder())); } else { ordering = SparkDistributionAndOrderingUtil.convert(sortOrder()); } @@ -124,24 +124,30 @@ public Set rewriteFiles(List filesToRewrite) { cloneSession.conf().set(SQLConf.ADAPTIVE_EXECUTION_ENABLED().key(), false); // Reset Shuffle Partitions for our sort - long numOutputFiles = numOutputFiles((long) (inputFileSize(filesToRewrite) * sizeEstimateMultiple)); + long numOutputFiles = + numOutputFiles((long) (inputFileSize(filesToRewrite) * sizeEstimateMultiple)); cloneSession.conf().set(SQLConf.SHUFFLE_PARTITIONS().key(), Math.max(1, numOutputFiles)); - Dataset scanDF = cloneSession.read().format("iceberg") - .option(SparkReadOptions.FILE_SCAN_TASK_SET_ID, groupID) - .load(groupID); + Dataset scanDF = + cloneSession + .read() + .format("iceberg") + .option(SparkReadOptions.FILE_SCAN_TASK_SET_ID, groupID) + .load(groupID); // write the packed data into new files where each split becomes a new file SQLConf sqlConf = cloneSession.sessionState().conf(); LogicalPlan sortPlan = sortPlan(distribution, ordering, scanDF.logicalPlan(), sqlConf); Dataset sortedDf = new Dataset<>(cloneSession, sortPlan, scanDF.encoder()); - sortedDf.write() + sortedDf + .write() .format("iceberg") .option(SparkWriteOptions.REWRITTEN_FILE_SCAN_TASK_SET_ID, groupID) .option(SparkWriteOptions.TARGET_FILE_SIZE_BYTES, writeMaxFileSize()) .option(SparkWriteOptions.USE_TABLE_DISTRIBUTION_AND_ORDERING, "false") - .mode("append") // This will only write files without modifying the table, see SparkWrite.RewriteFiles + .mode("append") // This will only write files without modifying the table, see + // SparkWrite.RewriteFiles .save(groupID); return rewriteCoordinator.fetchNewDataFiles(table, groupID); @@ -156,7 +162,8 @@ protected SparkSession spark() { return this.spark; } - protected LogicalPlan sortPlan(Distribution distribution, SortOrder[] ordering, LogicalPlan plan, SQLConf conf) { + protected LogicalPlan sortPlan( + Distribution distribution, SortOrder[] ordering, LogicalPlan plan, SQLConf conf) { return DistributionAndOrderingUtils$.MODULE$.prepareQuery(distribution, ordering, plan, conf); } diff --git a/spark/v3.3/spark/src/main/java/org/apache/iceberg/spark/actions/SparkZOrderStrategy.java b/spark/v3.3/spark/src/main/java/org/apache/iceberg/spark/actions/SparkZOrderStrategy.java index 8044039a592c..2b3397c9dbca 100644 --- a/spark/v3.3/spark/src/main/java/org/apache/iceberg/spark/actions/SparkZOrderStrategy.java +++ 
b/spark/v3.3/spark/src/main/java/org/apache/iceberg/spark/actions/SparkZOrderStrategy.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.actions; import java.util.Arrays; @@ -62,22 +61,28 @@ public class SparkZOrderStrategy extends SparkSortStrategy { private static final Logger LOG = LoggerFactory.getLogger(SparkZOrderStrategy.class); private static final String Z_COLUMN = "ICEZVALUE"; - private static final Schema Z_SCHEMA = new Schema(NestedField.required(0, Z_COLUMN, Types.BinaryType.get())); - private static final org.apache.iceberg.SortOrder Z_SORT_ORDER = org.apache.iceberg.SortOrder.builderFor(Z_SCHEMA) - .sortBy(Z_COLUMN, SortDirection.ASC, NullOrder.NULLS_LAST) - .build(); + private static final Schema Z_SCHEMA = + new Schema(NestedField.required(0, Z_COLUMN, Types.BinaryType.get())); + private static final org.apache.iceberg.SortOrder Z_SORT_ORDER = + org.apache.iceberg.SortOrder.builderFor(Z_SCHEMA) + .sortBy(Z_COLUMN, SortDirection.ASC, NullOrder.NULLS_LAST) + .build(); /** - * Controls the amount of bytes interleaved in the ZOrder Algorithm. Default is all bytes being interleaved. + * Controls the amount of bytes interleaved in the ZOrder Algorithm. Default is all bytes being + * interleaved. */ private static final String MAX_OUTPUT_SIZE_KEY = "max-output-size"; + private static final int DEFAULT_MAX_OUTPUT_SIZE = Integer.MAX_VALUE; /** - * Controls the number of bytes considered from an input column of a type with variable length (String, Binary). - * Default is to use the same size as primitives {@link ZOrderByteUtils#PRIMITIVE_BUFFER_SIZE} + * Controls the number of bytes considered from an input column of a type with variable length + * (String, Binary). 
Default is to use the same size as primitives {@link + * ZOrderByteUtils#PRIMITIVE_BUFFER_SIZE} */ private static final String VAR_LENGTH_CONTRIBUTION_KEY = "var-length-contribution"; + private static final int DEFAULT_VAR_LENGTH_CONTRIBUTION = ZOrderByteUtils.PRIMITIVE_BUFFER_SIZE; private final List zOrderColNames; @@ -98,16 +103,22 @@ public Set validOptions() { public RewriteStrategy options(Map options) { super.options(options); - varLengthContribution = PropertyUtil.propertyAsInt(options, VAR_LENGTH_CONTRIBUTION_KEY, - DEFAULT_VAR_LENGTH_CONTRIBUTION); - Preconditions.checkArgument(varLengthContribution > 0, + varLengthContribution = + PropertyUtil.propertyAsInt( + options, VAR_LENGTH_CONTRIBUTION_KEY, DEFAULT_VAR_LENGTH_CONTRIBUTION); + Preconditions.checkArgument( + varLengthContribution > 0, "Cannot use less than 1 byte for variable length types with zOrder, %s was set to %s", - VAR_LENGTH_CONTRIBUTION_KEY, varLengthContribution); + VAR_LENGTH_CONTRIBUTION_KEY, + varLengthContribution); - maxOutputSize = PropertyUtil.propertyAsInt(options, MAX_OUTPUT_SIZE_KEY, DEFAULT_MAX_OUTPUT_SIZE); - Preconditions.checkArgument(maxOutputSize > 0, + maxOutputSize = + PropertyUtil.propertyAsInt(options, MAX_OUTPUT_SIZE_KEY, DEFAULT_MAX_OUTPUT_SIZE); + Preconditions.checkArgument( + maxOutputSize > 0, "Cannot have the interleaved ZOrder value use less than 1 byte, %s was set to %s", - MAX_OUTPUT_SIZE_KEY, maxOutputSize); + MAX_OUTPUT_SIZE_KEY, + maxOutputSize); return this; } @@ -115,21 +126,25 @@ public RewriteStrategy options(Map options) { public SparkZOrderStrategy(Table table, SparkSession spark, List zOrderColNames) { super(table, spark); - Preconditions.checkArgument(zOrderColNames != null && !zOrderColNames.isEmpty(), + Preconditions.checkArgument( + zOrderColNames != null && !zOrderColNames.isEmpty(), "Cannot ZOrder when no columns are specified"); - Stream identityPartitionColumns = table.spec().fields().stream() - .filter(f -> f.transform().isIdentity()) - .map(PartitionField::name); - List partZOrderCols = identityPartitionColumns - .filter(zOrderColNames::contains) - .collect(Collectors.toList()); + Stream identityPartitionColumns = + table.spec().fields().stream() + .filter(f -> f.transform().isIdentity()) + .map(PartitionField::name); + List partZOrderCols = + identityPartitionColumns.filter(zOrderColNames::contains).collect(Collectors.toList()); if (!partZOrderCols.isEmpty()) { - LOG.warn("Cannot ZOrder on an Identity partition column as these values are constant within a partition " + - "and will be removed from the ZOrder expression: {}", partZOrderCols); + LOG.warn( + "Cannot ZOrder on an Identity partition column as these values are constant within a partition " + + "and will be removed from the ZOrder expression: {}", + partZOrderCols); zOrderColNames.removeAll(partZOrderCols); - Preconditions.checkArgument(!zOrderColNames.isEmpty(), + Preconditions.checkArgument( + !zOrderColNames.isEmpty(), "Cannot perform ZOrdering, all columns provided were identity partition columns and cannot be used."); } @@ -141,13 +156,16 @@ public SparkZOrderStrategy(Table table, SparkSession spark, List zOrderC private void validateColumnsExistence(Table table, SparkSession spark, List colNames) { boolean caseSensitive = Boolean.parseBoolean(spark.conf().get("spark.sql.caseSensitive")); Schema schema = table.schema(); - colNames.forEach(col -> { - NestedField nestedField = caseSensitive ? 
schema.findField(col) : schema.caseInsensitiveFindField(col); - if (nestedField == null) { - throw new IllegalArgumentException( - String.format("Cannot find column '%s' in table schema: %s", col, schema.asStruct())); - } - }); + colNames.forEach( + col -> { + NestedField nestedField = + caseSensitive ? schema.findField(col) : schema.caseInsensitiveFindField(col); + if (nestedField == null) { + throw new IllegalArgumentException( + String.format( + "Cannot find column '%s' in table schema: %s", col, schema.asStruct())); + } + }); } @Override @@ -163,14 +181,17 @@ protected void validateOptions() { @Override public Set rewriteFiles(List filesToRewrite) { - SparkZOrderUDF zOrderUDF = new SparkZOrderUDF(zOrderColNames.size(), varLengthContribution, maxOutputSize); + SparkZOrderUDF zOrderUDF = + new SparkZOrderUDF(zOrderColNames.size(), varLengthContribution, maxOutputSize); String groupID = UUID.randomUUID().toString(); boolean requiresRepartition = !filesToRewrite.get(0).spec().equals(table().spec()); SortOrder[] ordering; if (requiresRepartition) { - ordering = SparkDistributionAndOrderingUtil.convert(SortOrderUtil.buildSortOrder(table(), sortOrder())); + ordering = + SparkDistributionAndOrderingUtil.convert( + SortOrderUtil.buildSortOrder(table(), sortOrder())); } else { ordering = SparkDistributionAndOrderingUtil.convert(sortOrder()); } @@ -186,24 +207,31 @@ public Set rewriteFiles(List filesToRewrite) { cloneSession.conf().set(SQLConf.ADAPTIVE_EXECUTION_ENABLED().key(), false); // Reset Shuffle Partitions for our sort - long numOutputFiles = numOutputFiles((long) (inputFileSize(filesToRewrite) * sizeEstimateMultiple())); + long numOutputFiles = + numOutputFiles((long) (inputFileSize(filesToRewrite) * sizeEstimateMultiple())); cloneSession.conf().set(SQLConf.SHUFFLE_PARTITIONS().key(), Math.max(1, numOutputFiles)); - Dataset scanDF = cloneSession.read().format("iceberg") - .option(SparkReadOptions.FILE_SCAN_TASK_SET_ID, groupID) - .load(groupID); - - Column[] originalColumns = Arrays.stream(scanDF.schema().names()) - .map(n -> functions.col(n)) - .toArray(Column[]::new); - - List zOrderColumns = zOrderColNames.stream() - .map(scanDF.schema()::apply) - .collect(Collectors.toList()); - - Column zvalueArray = functions.array(zOrderColumns.stream().map(colStruct -> - zOrderUDF.sortedLexicographically(functions.col(colStruct.name()), colStruct.dataType()) - ).toArray(Column[]::new)); + Dataset scanDF = + cloneSession + .read() + .format("iceberg") + .option(SparkReadOptions.FILE_SCAN_TASK_SET_ID, groupID) + .load(groupID); + + Column[] originalColumns = + Arrays.stream(scanDF.schema().names()).map(n -> functions.col(n)).toArray(Column[]::new); + + List zOrderColumns = + zOrderColNames.stream().map(scanDF.schema()::apply).collect(Collectors.toList()); + + Column zvalueArray = + functions.array( + zOrderColumns.stream() + .map( + colStruct -> + zOrderUDF.sortedLexicographically( + functions.col(colStruct.name()), colStruct.dataType())) + .toArray(Column[]::new)); Dataset zvalueDF = scanDF.withColumn(Z_COLUMN, zOrderUDF.interleaveBytes(zvalueArray)); diff --git a/spark/v3.3/spark/src/main/java/org/apache/iceberg/spark/actions/SparkZOrderUDF.java b/spark/v3.3/spark/src/main/java/org/apache/iceberg/spark/actions/SparkZOrderUDF.java index eea3689211e2..db359fdd62fc 100644 --- a/spark/v3.3/spark/src/main/java/org/apache/iceberg/spark/actions/SparkZOrderUDF.java +++ b/spark/v3.3/spark/src/main/java/org/apache/iceberg/spark/actions/SparkZOrderUDF.java @@ -16,7 +16,6 @@ * specific language governing 
permissions and limitations * under the License. */ - package org.apache.iceberg.spark.actions; import java.io.IOException; @@ -49,10 +48,11 @@ class SparkZOrderUDF implements Serializable { private static final byte[] PRIMITIVE_EMPTY = new byte[ZOrderByteUtils.PRIMITIVE_BUFFER_SIZE]; /** - * Every Spark task runs iteratively on a rows in a single thread so ThreadLocal should protect from - * concurrent access to any of these structures. + * Every Spark task runs iteratively on a rows in a single thread so ThreadLocal should protect + * from concurrent access to any of these structures. */ private transient ThreadLocal outputBuffer; + private transient ThreadLocal inputHolder; private transient ThreadLocal inputBuffers; private transient ThreadLocal encoder; @@ -94,13 +94,19 @@ byte[] interleaveBits(Seq scalaBinary) { private UserDefinedFunction tinyToOrderedBytesUDF() { int position = inputCol; - UserDefinedFunction udf = functions.udf((Byte value) -> { - if (value == null) { - return PRIMITIVE_EMPTY; - } - return ZOrderByteUtils.tinyintToOrderedBytes(value, inputBuffer(position, ZOrderByteUtils.PRIMITIVE_BUFFER_SIZE)) - .array(); - }, DataTypes.BinaryType).withName("TINY_ORDERED_BYTES"); + UserDefinedFunction udf = + functions + .udf( + (Byte value) -> { + if (value == null) { + return PRIMITIVE_EMPTY; + } + return ZOrderByteUtils.tinyintToOrderedBytes( + value, inputBuffer(position, ZOrderByteUtils.PRIMITIVE_BUFFER_SIZE)) + .array(); + }, + DataTypes.BinaryType) + .withName("TINY_ORDERED_BYTES"); this.inputCol++; increaseOutputSize(ZOrderByteUtils.PRIMITIVE_BUFFER_SIZE); @@ -110,13 +116,19 @@ private UserDefinedFunction tinyToOrderedBytesUDF() { private UserDefinedFunction shortToOrderedBytesUDF() { int position = inputCol; - UserDefinedFunction udf = functions.udf((Short value) -> { - if (value == null) { - return PRIMITIVE_EMPTY; - } - return ZOrderByteUtils.shortToOrderedBytes(value, inputBuffer(position, ZOrderByteUtils.PRIMITIVE_BUFFER_SIZE)) - .array(); - }, DataTypes.BinaryType).withName("SHORT_ORDERED_BYTES"); + UserDefinedFunction udf = + functions + .udf( + (Short value) -> { + if (value == null) { + return PRIMITIVE_EMPTY; + } + return ZOrderByteUtils.shortToOrderedBytes( + value, inputBuffer(position, ZOrderByteUtils.PRIMITIVE_BUFFER_SIZE)) + .array(); + }, + DataTypes.BinaryType) + .withName("SHORT_ORDERED_BYTES"); this.inputCol++; increaseOutputSize(ZOrderByteUtils.PRIMITIVE_BUFFER_SIZE); @@ -126,13 +138,19 @@ private UserDefinedFunction shortToOrderedBytesUDF() { private UserDefinedFunction intToOrderedBytesUDF() { int position = inputCol; - UserDefinedFunction udf = functions.udf((Integer value) -> { - if (value == null) { - return PRIMITIVE_EMPTY; - } - return ZOrderByteUtils.intToOrderedBytes(value, inputBuffer(position, ZOrderByteUtils.PRIMITIVE_BUFFER_SIZE)) - .array(); - }, DataTypes.BinaryType).withName("INT_ORDERED_BYTES"); + UserDefinedFunction udf = + functions + .udf( + (Integer value) -> { + if (value == null) { + return PRIMITIVE_EMPTY; + } + return ZOrderByteUtils.intToOrderedBytes( + value, inputBuffer(position, ZOrderByteUtils.PRIMITIVE_BUFFER_SIZE)) + .array(); + }, + DataTypes.BinaryType) + .withName("INT_ORDERED_BYTES"); this.inputCol++; increaseOutputSize(ZOrderByteUtils.PRIMITIVE_BUFFER_SIZE); @@ -142,13 +160,19 @@ private UserDefinedFunction intToOrderedBytesUDF() { private UserDefinedFunction longToOrderedBytesUDF() { int position = inputCol; - UserDefinedFunction udf = functions.udf((Long value) -> { - if (value == null) { - return PRIMITIVE_EMPTY; 
- } - return ZOrderByteUtils.longToOrderedBytes(value, inputBuffer(position, ZOrderByteUtils.PRIMITIVE_BUFFER_SIZE)) - .array(); - }, DataTypes.BinaryType).withName("LONG_ORDERED_BYTES"); + UserDefinedFunction udf = + functions + .udf( + (Long value) -> { + if (value == null) { + return PRIMITIVE_EMPTY; + } + return ZOrderByteUtils.longToOrderedBytes( + value, inputBuffer(position, ZOrderByteUtils.PRIMITIVE_BUFFER_SIZE)) + .array(); + }, + DataTypes.BinaryType) + .withName("LONG_ORDERED_BYTES"); this.inputCol++; increaseOutputSize(ZOrderByteUtils.PRIMITIVE_BUFFER_SIZE); @@ -158,13 +182,19 @@ private UserDefinedFunction longToOrderedBytesUDF() { private UserDefinedFunction floatToOrderedBytesUDF() { int position = inputCol; - UserDefinedFunction udf = functions.udf((Float value) -> { - if (value == null) { - return PRIMITIVE_EMPTY; - } - return ZOrderByteUtils.floatToOrderedBytes(value, inputBuffer(position, ZOrderByteUtils.PRIMITIVE_BUFFER_SIZE)) - .array(); - }, DataTypes.BinaryType).withName("FLOAT_ORDERED_BYTES"); + UserDefinedFunction udf = + functions + .udf( + (Float value) -> { + if (value == null) { + return PRIMITIVE_EMPTY; + } + return ZOrderByteUtils.floatToOrderedBytes( + value, inputBuffer(position, ZOrderByteUtils.PRIMITIVE_BUFFER_SIZE)) + .array(); + }, + DataTypes.BinaryType) + .withName("FLOAT_ORDERED_BYTES"); this.inputCol++; increaseOutputSize(ZOrderByteUtils.PRIMITIVE_BUFFER_SIZE); @@ -174,13 +204,19 @@ private UserDefinedFunction floatToOrderedBytesUDF() { private UserDefinedFunction doubleToOrderedBytesUDF() { int position = inputCol; - UserDefinedFunction udf = functions.udf((Double value) -> { - if (value == null) { - return PRIMITIVE_EMPTY; - } - return ZOrderByteUtils.doubleToOrderedBytes(value, inputBuffer(position, ZOrderByteUtils.PRIMITIVE_BUFFER_SIZE)) - .array(); - }, DataTypes.BinaryType).withName("DOUBLE_ORDERED_BYTES"); + UserDefinedFunction udf = + functions + .udf( + (Double value) -> { + if (value == null) { + return PRIMITIVE_EMPTY; + } + return ZOrderByteUtils.doubleToOrderedBytes( + value, inputBuffer(position, ZOrderByteUtils.PRIMITIVE_BUFFER_SIZE)) + .array(); + }, + DataTypes.BinaryType) + .withName("DOUBLE_ORDERED_BYTES"); this.inputCol++; increaseOutputSize(ZOrderByteUtils.PRIMITIVE_BUFFER_SIZE); @@ -190,11 +226,16 @@ private UserDefinedFunction doubleToOrderedBytesUDF() { private UserDefinedFunction booleanToOrderedBytesUDF() { int position = inputCol; - UserDefinedFunction udf = functions.udf((Boolean value) -> { - ByteBuffer buffer = inputBuffer(position, ZOrderByteUtils.PRIMITIVE_BUFFER_SIZE); - buffer.put(0, (byte) (value ? -127 : 0)); - return buffer.array(); - }, DataTypes.BinaryType).withName("BOOLEAN-LEXICAL-BYTES"); + UserDefinedFunction udf = + functions + .udf( + (Boolean value) -> { + ByteBuffer buffer = inputBuffer(position, ZOrderByteUtils.PRIMITIVE_BUFFER_SIZE); + buffer.put(0, (byte) (value ? 
-127 : 0)); + return buffer.array(); + }, + DataTypes.BinaryType) + .withName("BOOLEAN-LEXICAL-BYTES"); this.inputCol++; increaseOutputSize(ZOrderByteUtils.PRIMITIVE_BUFFER_SIZE); @@ -203,13 +244,15 @@ private UserDefinedFunction booleanToOrderedBytesUDF() { private UserDefinedFunction stringToOrderedBytesUDF() { int position = inputCol; - UserDefinedFunction udf = functions.udf((String value) -> - ZOrderByteUtils.stringToOrderedBytes( - value, - varTypeSize, - inputBuffer(position, varTypeSize), - encoder.get()).array(), DataTypes.BinaryType) - .withName("STRING-LEXICAL-BYTES"); + UserDefinedFunction udf = + functions + .udf( + (String value) -> + ZOrderByteUtils.stringToOrderedBytes( + value, varTypeSize, inputBuffer(position, varTypeSize), encoder.get()) + .array(), + DataTypes.BinaryType) + .withName("STRING-LEXICAL-BYTES"); this.inputCol++; increaseOutputSize(varTypeSize); @@ -219,10 +262,15 @@ private UserDefinedFunction stringToOrderedBytesUDF() { private UserDefinedFunction bytesTruncateUDF() { int position = inputCol; - UserDefinedFunction udf = functions.udf((byte[] value) -> - ZOrderByteUtils.byteTruncateOrFill(value, varTypeSize, inputBuffer(position, varTypeSize)).array(), - DataTypes.BinaryType) - .withName("BYTE-TRUNCATE"); + UserDefinedFunction udf = + functions + .udf( + (byte[] value) -> + ZOrderByteUtils.byteTruncateOrFill( + value, varTypeSize, inputBuffer(position, varTypeSize)) + .array(), + DataTypes.BinaryType) + .withName("BYTE-TRUNCATE"); this.inputCol++; increaseOutputSize(varTypeSize); @@ -231,7 +279,8 @@ private UserDefinedFunction bytesTruncateUDF() { } private final UserDefinedFunction interleaveUDF = - functions.udf((Seq arrayBinary) -> interleaveBits(arrayBinary), DataTypes.BinaryType) + functions + .udf((Seq arrayBinary) -> interleaveBits(arrayBinary), DataTypes.BinaryType) .withName("INTERLEAVE_BYTES"); Column interleaveBytes(Column arrayBinary) { @@ -264,7 +313,9 @@ Column sortedLexicographically(Column column, DataType type) { return longToOrderedBytesUDF().apply(column.cast(DataTypes.LongType)); } else { throw new IllegalArgumentException( - String.format("Cannot use column %s of type %s in ZOrdering, the type is unsupported", column, type)); + String.format( + "Cannot use column %s of type %s in ZOrdering, the type is unsupported", + column, type)); } } diff --git a/spark/v3.3/spark/src/main/java/org/apache/iceberg/spark/data/AvroWithSparkSchemaVisitor.java b/spark/v3.3/spark/src/main/java/org/apache/iceberg/spark/data/AvroWithSparkSchemaVisitor.java index 40ed05b4ce65..74454fc1e466 100644 --- a/spark/v3.3/spark/src/main/java/org/apache/iceberg/spark/data/AvroWithSparkSchemaVisitor.java +++ b/spark/v3.3/spark/src/main/java/org/apache/iceberg/spark/data/AvroWithSparkSchemaVisitor.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
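[Editor's note, not part of this change: the max-output-size and var-length-contribution keys documented earlier in this file are read by SparkZOrderStrategy.options(). A minimal sketch of a caller setting them follows; it assumes the RewriteDataFiles/zOrder API on SparkActions (outside this hunk), and the column names and byte counts are purely illustrative.]

import org.apache.iceberg.Table;
import org.apache.iceberg.actions.RewriteDataFiles;
import org.apache.iceberg.spark.actions.SparkActions;

class ZOrderRewriteExample {
  // Sketch: z-order rewrite on two columns, taking 16 bytes from variable-length
  // (String/Binary) columns and capping the interleaved z-value at 64 bytes.
  static RewriteDataFiles.Result rewriteZOrdered(Table table) {
    return SparkActions.get()
        .rewriteDataFiles(table)
        .zOrder("event_ts", "device_id")          // hypothetical column names
        .option("var-length-contribution", "16")  // bytes considered per variable-length column
        .option("max-output-size", "64")          // cap on the interleaved z-value size
        .execute();
  }
}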
*/ - package org.apache.iceberg.spark.data; import org.apache.iceberg.avro.AvroWithPartnerByStructureVisitor; @@ -30,7 +29,8 @@ import org.apache.spark.sql.types.StructField; import org.apache.spark.sql.types.StructType; -public abstract class AvroWithSparkSchemaVisitor extends AvroWithPartnerByStructureVisitor { +public abstract class AvroWithSparkSchemaVisitor + extends AvroWithPartnerByStructureVisitor { @Override protected boolean isStringType(DataType dataType) { @@ -44,7 +44,8 @@ protected boolean isMapType(DataType dataType) { @Override protected DataType arrayElementType(DataType arrayType) { - Preconditions.checkArgument(arrayType instanceof ArrayType, "Invalid array: %s is not an array", arrayType); + Preconditions.checkArgument( + arrayType instanceof ArrayType, "Invalid array: %s is not an array", arrayType); return ((ArrayType) arrayType).elementType(); } @@ -62,7 +63,8 @@ protected DataType mapValueType(DataType mapType) { @Override protected Pair fieldNameAndType(DataType structType, int pos) { - Preconditions.checkArgument(structType instanceof StructType, "Invalid struct: %s is not a struct", structType); + Preconditions.checkArgument( + structType instanceof StructType, "Invalid struct: %s is not a struct", structType); StructField field = ((StructType) structType).apply(pos); return Pair.of(field.name(), field.dataType()); } diff --git a/spark/v3.3/spark/src/main/java/org/apache/iceberg/spark/data/ParquetWithSparkSchemaVisitor.java b/spark/v3.3/spark/src/main/java/org/apache/iceberg/spark/data/ParquetWithSparkSchemaVisitor.java index 924cc3e2325a..d74a76f94e87 100644 --- a/spark/v3.3/spark/src/main/java/org/apache/iceberg/spark/data/ParquetWithSparkSchemaVisitor.java +++ b/spark/v3.3/spark/src/main/java/org/apache/iceberg/spark/data/ParquetWithSparkSchemaVisitor.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.spark.data; import java.util.Deque; @@ -48,9 +47,11 @@ public class ParquetWithSparkSchemaVisitor { public static T visit(DataType sType, Type type, ParquetWithSparkSchemaVisitor visitor) { Preconditions.checkArgument(sType != null, "Invalid DataType: null"); if (type instanceof MessageType) { - Preconditions.checkArgument(sType instanceof StructType, "Invalid struct: %s is not a struct", sType); + Preconditions.checkArgument( + sType instanceof StructType, "Invalid struct: %s is not a struct", sType); StructType struct = (StructType) sType; - return visitor.message(struct, (MessageType) type, visitFields(struct, type.asGroupType(), visitor)); + return visitor.message( + struct, (MessageType) type, visitFields(struct, type.asGroupType(), visitor)); } else if (type.isPrimitive()) { return visitor.primitive(sType, type.asPrimitiveType()); @@ -62,21 +63,30 @@ public static T visit(DataType sType, Type type, ParquetWithSparkSchemaVisit if (annotation != null) { switch (annotation) { case LIST: - Preconditions.checkArgument(!group.isRepetition(Repetition.REPEATED), - "Invalid list: top-level group is repeated: %s", group); - Preconditions.checkArgument(group.getFieldCount() == 1, - "Invalid list: does not contain single repeated field: %s", group); + Preconditions.checkArgument( + !group.isRepetition(Repetition.REPEATED), + "Invalid list: top-level group is repeated: %s", + group); + Preconditions.checkArgument( + group.getFieldCount() == 1, + "Invalid list: does not contain single repeated field: %s", + group); GroupType repeatedElement = group.getFields().get(0).asGroupType(); - Preconditions.checkArgument(repeatedElement.isRepetition(Repetition.REPEATED), + Preconditions.checkArgument( + repeatedElement.isRepetition(Repetition.REPEATED), "Invalid list: inner group is not repeated"); - Preconditions.checkArgument(repeatedElement.getFieldCount() <= 1, - "Invalid list: repeated group is not a single field: %s", group); + Preconditions.checkArgument( + repeatedElement.getFieldCount() <= 1, + "Invalid list: repeated group is not a single field: %s", + group); - Preconditions.checkArgument(sType instanceof ArrayType, "Invalid list: %s is not an array", sType); + Preconditions.checkArgument( + sType instanceof ArrayType, "Invalid list: %s is not an array", sType); ArrayType array = (ArrayType) sType; - StructField element = new StructField( - "element", array.elementType(), array.containsNull(), Metadata.empty()); + StructField element = + new StructField( + "element", array.elementType(), array.containsNull(), Metadata.empty()); visitor.fieldNames.push(repeatedElement.getName()); try { @@ -92,22 +102,30 @@ public static T visit(DataType sType, Type type, ParquetWithSparkSchemaVisit } case MAP: - Preconditions.checkArgument(!group.isRepetition(Repetition.REPEATED), - "Invalid map: top-level group is repeated: %s", group); - Preconditions.checkArgument(group.getFieldCount() == 1, - "Invalid map: does not contain single repeated field: %s", group); + Preconditions.checkArgument( + !group.isRepetition(Repetition.REPEATED), + "Invalid map: top-level group is repeated: %s", + group); + Preconditions.checkArgument( + group.getFieldCount() == 1, + "Invalid map: does not contain single repeated field: %s", + group); GroupType repeatedKeyValue = group.getType(0).asGroupType(); - Preconditions.checkArgument(repeatedKeyValue.isRepetition(Repetition.REPEATED), + Preconditions.checkArgument( + repeatedKeyValue.isRepetition(Repetition.REPEATED), "Invalid map: inner group is not 
repeated"); - Preconditions.checkArgument(repeatedKeyValue.getFieldCount() <= 2, + Preconditions.checkArgument( + repeatedKeyValue.getFieldCount() <= 2, "Invalid map: repeated group does not have 2 fields"); - Preconditions.checkArgument(sType instanceof MapType, "Invalid map: %s is not a map", sType); + Preconditions.checkArgument( + sType instanceof MapType, "Invalid map: %s is not a map", sType); MapType map = (MapType) sType; StructField keyField = new StructField("key", map.keyType(), false, Metadata.empty()); - StructField valueField = new StructField( - "value", map.valueType(), map.valueContainsNull(), Metadata.empty()); + StructField valueField = + new StructField( + "value", map.valueType(), map.valueContainsNull(), Metadata.empty()); visitor.fieldNames.push(repeatedKeyValue.getName()); try { @@ -144,13 +162,15 @@ public static T visit(DataType sType, Type type, ParquetWithSparkSchemaVisit } } - Preconditions.checkArgument(sType instanceof StructType, "Invalid struct: %s is not a struct", sType); + Preconditions.checkArgument( + sType instanceof StructType, "Invalid struct: %s is not a struct", sType); StructType struct = (StructType) sType; return visitor.struct(struct, group, visitFields(struct, group, visitor)); } } - private static T visitField(StructField sField, Type field, ParquetWithSparkSchemaVisitor visitor) { + private static T visitField( + StructField sField, Type field, ParquetWithSparkSchemaVisitor visitor) { visitor.fieldNames.push(field.getName()); try { return visit(sField.dataType(), field, visitor); @@ -159,17 +179,20 @@ private static T visitField(StructField sField, Type field, ParquetWithSpark } } - private static List visitFields(StructType struct, GroupType group, - ParquetWithSparkSchemaVisitor visitor) { + private static List visitFields( + StructType struct, GroupType group, ParquetWithSparkSchemaVisitor visitor) { StructField[] sFields = struct.fields(); - Preconditions.checkArgument(sFields.length == group.getFieldCount(), - "Structs do not match: %s and %s", struct, group); + Preconditions.checkArgument( + sFields.length == group.getFieldCount(), "Structs do not match: %s and %s", struct, group); List results = Lists.newArrayListWithExpectedSize(group.getFieldCount()); for (int i = 0; i < sFields.length; i += 1) { Type field = group.getFields().get(i); StructField sField = sFields[i]; - Preconditions.checkArgument(field.getName().equals(AvroSchemaUtil.makeCompatibleName(sField.name())), - "Structs do not match: field %s != %s", field.getName(), sField.name()); + Preconditions.checkArgument( + field.getName().equals(AvroSchemaUtil.makeCompatibleName(sField.name())), + "Structs do not match: field %s != %s", + field.getName(), + sField.name()); results.add(visitField(sField, field, visitor)); } diff --git a/spark/v3.3/spark/src/main/java/org/apache/iceberg/spark/data/SparkAvroReader.java b/spark/v3.3/spark/src/main/java/org/apache/iceberg/spark/data/SparkAvroReader.java index c693e2e2c057..4622d2928ac4 100644 --- a/spark/v3.3/spark/src/main/java/org/apache/iceberg/spark/data/SparkAvroReader.java +++ b/spark/v3.3/spark/src/main/java/org/apache/iceberg/spark/data/SparkAvroReader.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.spark.data; import java.io.IOException; @@ -38,7 +37,6 @@ import org.apache.iceberg.types.Types; import org.apache.spark.sql.catalyst.InternalRow; - public class SparkAvroReader implements DatumReader, SupportsRowPosition { private final Schema readSchema; @@ -50,10 +48,12 @@ public SparkAvroReader(org.apache.iceberg.Schema expectedSchema, Schema readSche } @SuppressWarnings("unchecked") - public SparkAvroReader(org.apache.iceberg.Schema expectedSchema, Schema readSchema, Map constants) { + public SparkAvroReader( + org.apache.iceberg.Schema expectedSchema, Schema readSchema, Map constants) { this.readSchema = readSchema; - this.reader = (ValueReader) AvroSchemaWithTypeVisitor - .visit(expectedSchema, readSchema, new ReadBuilder(constants)); + this.reader = + (ValueReader) + AvroSchemaWithTypeVisitor.visit(expectedSchema, readSchema, new ReadBuilder(constants)); } @Override @@ -81,8 +81,8 @@ private ReadBuilder(Map idToConstant) { } @Override - public ValueReader record(Types.StructType expected, Schema record, List names, - List> fields) { + public ValueReader record( + Types.StructType expected, Schema record, List names, List> fields) { return SparkValueReaders.struct(fields, expected, idToConstant); } @@ -92,13 +92,14 @@ public ValueReader union(Type expected, Schema union, List> op } @Override - public ValueReader array(Types.ListType expected, Schema array, ValueReader elementReader) { + public ValueReader array( + Types.ListType expected, Schema array, ValueReader elementReader) { return SparkValueReaders.array(elementReader); } @Override - public ValueReader map(Types.MapType expected, Schema map, - ValueReader keyReader, ValueReader valueReader) { + public ValueReader map( + Types.MapType expected, Schema map, ValueReader keyReader, ValueReader valueReader) { return SparkValueReaders.arrayMap(keyReader, valueReader); } diff --git a/spark/v3.3/spark/src/main/java/org/apache/iceberg/spark/data/SparkAvroWriter.java b/spark/v3.3/spark/src/main/java/org/apache/iceberg/spark/data/SparkAvroWriter.java index 7582125128a7..15465568c231 100644 --- a/spark/v3.3/spark/src/main/java/org/apache/iceberg/spark/data/SparkAvroWriter.java +++ b/spark/v3.3/spark/src/main/java/org/apache/iceberg/spark/data/SparkAvroWriter.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.spark.data; import java.io.IOException; @@ -50,8 +49,9 @@ public SparkAvroWriter(StructType dsSchema) { @Override @SuppressWarnings("unchecked") public void setSchema(Schema schema) { - this.writer = (ValueWriter) AvroWithSparkSchemaVisitor - .visit(dsSchema, schema, new WriteBuilder()); + this.writer = + (ValueWriter) + AvroWithSparkSchemaVisitor.visit(dsSchema, schema, new WriteBuilder()); } @Override @@ -66,17 +66,23 @@ public Stream metrics() { private static class WriteBuilder extends AvroWithSparkSchemaVisitor> { @Override - public ValueWriter record(DataType struct, Schema record, List names, List> fields) { - return SparkValueWriters.struct(fields, IntStream.range(0, names.size()) - .mapToObj(i -> fieldNameAndType(struct, i).second()).collect(Collectors.toList())); + public ValueWriter record( + DataType struct, Schema record, List names, List> fields) { + return SparkValueWriters.struct( + fields, + IntStream.range(0, names.size()) + .mapToObj(i -> fieldNameAndType(struct, i).second()) + .collect(Collectors.toList())); } @Override public ValueWriter union(DataType type, Schema union, List> options) { - Preconditions.checkArgument(options.contains(ValueWriters.nulls()), - "Cannot create writer for non-option union: %s", union); - Preconditions.checkArgument(options.size() == 2, - "Cannot create writer for non-option union: %s", union); + Preconditions.checkArgument( + options.contains(ValueWriters.nulls()), + "Cannot create writer for non-option union: %s", + union); + Preconditions.checkArgument( + options.size() == 2, "Cannot create writer for non-option union: %s", union); if (union.getTypes().get(0).getType() == Schema.Type.NULL) { return ValueWriters.option(0, options.get(1)); } else { @@ -91,12 +97,15 @@ public ValueWriter array(DataType sArray, Schema array, ValueWriter elemen @Override public ValueWriter map(DataType sMap, Schema map, ValueWriter valueReader) { - return SparkValueWriters.map(SparkValueWriters.strings(), mapKeyType(sMap), valueReader, mapValueType(sMap)); + return SparkValueWriters.map( + SparkValueWriters.strings(), mapKeyType(sMap), valueReader, mapValueType(sMap)); } @Override - public ValueWriter map(DataType sMap, Schema map, ValueWriter keyWriter, ValueWriter valueWriter) { - return SparkValueWriters.arrayMap(keyWriter, mapKeyType(sMap), valueWriter, mapValueType(sMap)); + public ValueWriter map( + DataType sMap, Schema map, ValueWriter keyWriter, ValueWriter valueWriter) { + return SparkValueWriters.arrayMap( + keyWriter, mapKeyType(sMap), valueWriter, mapValueType(sMap)); } @Override diff --git a/spark/v3.3/spark/src/main/java/org/apache/iceberg/spark/data/SparkOrcReader.java b/spark/v3.3/spark/src/main/java/org/apache/iceberg/spark/data/SparkOrcReader.java index 4ed6420a9aa4..78db137054bc 100644 --- a/spark/v3.3/spark/src/main/java/org/apache/iceberg/spark/data/SparkOrcReader.java +++ b/spark/v3.3/spark/src/main/java/org/apache/iceberg/spark/data/SparkOrcReader.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.data; import java.util.List; @@ -34,10 +33,9 @@ import org.apache.spark.sql.catalyst.InternalRow; /** - * Converts the OrcIterator, which returns ORC's VectorizedRowBatch to a - * set of Spark's UnsafeRows. + * Converts the OrcIterator, which returns ORC's VectorizedRowBatch to a set of Spark's UnsafeRows. * - * It minimizes allocations by reusing most of the objects in the implementation. + *
<p>
    It minimizes allocations by reusing most of the objects in the implementation. */ public class SparkOrcReader implements OrcRowReader { private final OrcValueReader reader; @@ -48,8 +46,12 @@ public SparkOrcReader(org.apache.iceberg.Schema expectedSchema, TypeDescription @SuppressWarnings("unchecked") public SparkOrcReader( - org.apache.iceberg.Schema expectedSchema, TypeDescription readOrcSchema, Map idToConstant) { - this.reader = OrcSchemaWithTypeVisitor.visit(expectedSchema, readOrcSchema, new ReadBuilder(idToConstant)); + org.apache.iceberg.Schema expectedSchema, + TypeDescription readOrcSchema, + Map idToConstant) { + this.reader = + OrcSchemaWithTypeVisitor.visit( + expectedSchema, readOrcSchema, new ReadBuilder(idToConstant)); } @Override @@ -71,18 +73,25 @@ private ReadBuilder(Map idToConstant) { @Override public OrcValueReader record( - Types.StructType expected, TypeDescription record, List names, List> fields) { + Types.StructType expected, + TypeDescription record, + List names, + List> fields) { return SparkOrcValueReaders.struct(fields, expected, idToConstant); } @Override - public OrcValueReader list(Types.ListType iList, TypeDescription array, OrcValueReader elementReader) { + public OrcValueReader list( + Types.ListType iList, TypeDescription array, OrcValueReader elementReader) { return SparkOrcValueReaders.array(elementReader); } @Override public OrcValueReader map( - Types.MapType iMap, TypeDescription map, OrcValueReader keyReader, OrcValueReader valueReader) { + Types.MapType iMap, + TypeDescription map, + OrcValueReader keyReader, + OrcValueReader valueReader) { return SparkOrcValueReaders.map(keyReader, valueReader); } diff --git a/spark/v3.3/spark/src/main/java/org/apache/iceberg/spark/data/SparkOrcValueReaders.java b/spark/v3.3/spark/src/main/java/org/apache/iceberg/spark/data/SparkOrcValueReaders.java index f35ab7a17c63..9e9b3e53bbcc 100644 --- a/spark/v3.3/spark/src/main/java/org/apache/iceberg/spark/data/SparkOrcValueReaders.java +++ b/spark/v3.3/spark/src/main/java/org/apache/iceberg/spark/data/SparkOrcValueReaders.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.spark.data; import java.math.BigDecimal; @@ -44,8 +43,7 @@ import org.apache.spark.unsafe.types.UTF8String; public class SparkOrcValueReaders { - private SparkOrcValueReaders() { - } + private SparkOrcValueReaders() {} public static OrcValueReader utf8String() { return StringReader.INSTANCE; @@ -125,8 +123,7 @@ public MapData nonNullRead(ColumnVector vector, int row) { } return new ArrayBasedMapData( - new GenericArrayData(keys.toArray()), - new GenericArrayData(values.toArray())); + new GenericArrayData(keys.toArray()), new GenericArrayData(values.toArray())); } @Override @@ -139,7 +136,8 @@ public void setBatchContext(long batchOffsetInFile) { static class StructReader extends OrcValueReaders.StructReader { private final int numFields; - protected StructReader(List> readers, Types.StructType struct, Map idToConstant) { + protected StructReader( + List> readers, Types.StructType struct, Map idToConstant) { super(readers, struct, idToConstant); this.numFields = struct.fields().size(); } @@ -162,21 +160,20 @@ protected void set(InternalRow struct, int pos, Object value) { private static class StringReader implements OrcValueReader { private static final StringReader INSTANCE = new StringReader(); - private StringReader() { - } + private StringReader() {} @Override public UTF8String nonNullRead(ColumnVector vector, int row) { BytesColumnVector bytesVector = (BytesColumnVector) vector; - return UTF8String.fromBytes(bytesVector.vector[row], bytesVector.start[row], bytesVector.length[row]); + return UTF8String.fromBytes( + bytesVector.vector[row], bytesVector.start[row], bytesVector.length[row]); } } private static class TimestampTzReader implements OrcValueReader { private static final TimestampTzReader INSTANCE = new TimestampTzReader(); - private TimestampTzReader() { - } + private TimestampTzReader() {} @Override public Long nonNullRead(ColumnVector vector, int row) { @@ -198,12 +195,20 @@ private static class Decimal18Reader implements OrcValueReader { public Decimal nonNullRead(ColumnVector vector, int row) { HiveDecimalWritable value = ((DecimalColumnVector) vector).vector[row]; - // The scale of decimal read from hive ORC file may be not equals to the expected scale. For data type - // decimal(10,3) and the value 10.100, the hive ORC writer will remove its trailing zero and store it - // as 101*10^(-1), its scale will adjust from 3 to 1. So here we could not assert that value.scale() == scale. - // we also need to convert the hive orc decimal to a decimal with expected precision and scale. - Preconditions.checkArgument(value.precision() <= precision, - "Cannot read value as decimal(%s,%s), too large: %s", precision, scale, value); + // The scale of decimal read from hive ORC file may be not equals to the expected scale. For + // data type + // decimal(10,3) and the value 10.100, the hive ORC writer will remove its trailing zero and + // store it + // as 101*10^(-1), its scale will adjust from 3 to 1. So here we could not assert that + // value.scale() == scale. + // we also need to convert the hive orc decimal to a decimal with expected precision and + // scale. 
+ Preconditions.checkArgument( + value.precision() <= precision, + "Cannot read value as decimal(%s,%s), too large: %s", + precision, + scale, + value); return new Decimal().set(value.serialize64(scale), precision, scale); } @@ -220,11 +225,15 @@ private static class Decimal38Reader implements OrcValueReader { @Override public Decimal nonNullRead(ColumnVector vector, int row) { - BigDecimal value = ((DecimalColumnVector) vector).vector[row] - .getHiveDecimal().bigDecimalValue(); - - Preconditions.checkArgument(value.precision() <= precision, - "Cannot read value as decimal(%s,%s), too large: %s", precision, scale, value); + BigDecimal value = + ((DecimalColumnVector) vector).vector[row].getHiveDecimal().bigDecimalValue(); + + Preconditions.checkArgument( + value.precision() <= precision, + "Cannot read value as decimal(%s,%s), too large: %s", + precision, + scale, + value); return new Decimal().set(new scala.math.BigDecimal(value), precision, scale); } diff --git a/spark/v3.3/spark/src/main/java/org/apache/iceberg/spark/data/SparkOrcValueWriters.java b/spark/v3.3/spark/src/main/java/org/apache/iceberg/spark/data/SparkOrcValueWriters.java index abb12dffc050..780090f99109 100644 --- a/spark/v3.3/spark/src/main/java/org/apache/iceberg/spark/data/SparkOrcValueWriters.java +++ b/spark/v3.3/spark/src/main/java/org/apache/iceberg/spark/data/SparkOrcValueWriters.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.data; import java.util.List; @@ -37,8 +36,7 @@ import org.apache.spark.unsafe.types.UTF8String; class SparkOrcValueWriters { - private SparkOrcValueWriters() { - } + private SparkOrcValueWriters() {} static OrcValueWriter strings() { return StringWriter.INSTANCE; @@ -60,8 +58,8 @@ static OrcValueWriter list(OrcValueWriter element, List o return new ListWriter<>(element, orcType); } - static OrcValueWriter map(OrcValueWriter keyWriter, OrcValueWriter valueWriter, - List orcTypes) { + static OrcValueWriter map( + OrcValueWriter keyWriter, OrcValueWriter valueWriter, List orcTypes) { return new MapWriter<>(keyWriter, valueWriter, orcTypes); } @@ -73,7 +71,6 @@ public void nonNullWrite(int rowId, UTF8String data, ColumnVector output) { byte[] value = data.getBytes(); ((BytesColumnVector) output).setRef(rowId, value, 0, value.length); } - } private static class TimestampTzWriter implements OrcValueWriter { @@ -85,7 +82,6 @@ public void nonNullWrite(int rowId, Long micros, ColumnVector output) { cv.time[rowId] = Math.floorDiv(micros, 1_000); // millis cv.nanos[rowId] = (int) Math.floorMod(micros, 1_000_000) * 1_000; // nanos } - } private static class Decimal18Writer implements OrcValueWriter { @@ -97,20 +93,18 @@ private static class Decimal18Writer implements OrcValueWriter { @Override public void nonNullWrite(int rowId, Decimal decimal, ColumnVector output) { - ((DecimalColumnVector) output).vector[rowId].setFromLongAndScale( - decimal.toUnscaledLong(), scale); + ((DecimalColumnVector) output) + .vector[rowId].setFromLongAndScale(decimal.toUnscaledLong(), scale); } - } private static class Decimal38Writer implements OrcValueWriter { @Override public void nonNullWrite(int rowId, Decimal decimal, ColumnVector output) { - ((DecimalColumnVector) output).vector[rowId].set( - HiveDecimal.create(decimal.toJavaBigDecimal())); + ((DecimalColumnVector) output) + .vector[rowId].set(HiveDecimal.create(decimal.toJavaBigDecimal())); } - } private static class ListWriter implements OrcValueWriter { @@ -120,10 +114,12 @@ 
private static class ListWriter implements OrcValueWriter { @SuppressWarnings("unchecked") ListWriter(OrcValueWriter writer, List orcTypes) { if (orcTypes.size() != 1) { - throw new IllegalArgumentException("Expected one (and same) ORC type for list elements, got: " + orcTypes); + throw new IllegalArgumentException( + "Expected one (and same) ORC type for list elements, got: " + orcTypes); } this.writer = writer; - this.fieldGetter = (SparkOrcWriter.FieldGetter) SparkOrcWriter.createFieldGetter(orcTypes.get(0)); + this.fieldGetter = + (SparkOrcWriter.FieldGetter) SparkOrcWriter.createFieldGetter(orcTypes.get(0)); } @Override @@ -145,7 +141,6 @@ public void nonNullWrite(int rowId, ArrayData value, ColumnVector output) { public Stream> metrics() { return writer.metrics(); } - } private static class MapWriter implements OrcValueWriter { @@ -155,14 +150,20 @@ private static class MapWriter implements OrcValueWriter { private final SparkOrcWriter.FieldGetter valueFieldGetter; @SuppressWarnings("unchecked") - MapWriter(OrcValueWriter keyWriter, OrcValueWriter valueWriter, List orcTypes) { + MapWriter( + OrcValueWriter keyWriter, + OrcValueWriter valueWriter, + List orcTypes) { if (orcTypes.size() != 2) { - throw new IllegalArgumentException("Expected two ORC type descriptions for a map, got: " + orcTypes); + throw new IllegalArgumentException( + "Expected two ORC type descriptions for a map, got: " + orcTypes); } this.keyWriter = keyWriter; this.valueWriter = valueWriter; - this.keyFieldGetter = (SparkOrcWriter.FieldGetter) SparkOrcWriter.createFieldGetter(orcTypes.get(0)); - this.valueFieldGetter = (SparkOrcWriter.FieldGetter) SparkOrcWriter.createFieldGetter(orcTypes.get(1)); + this.keyFieldGetter = + (SparkOrcWriter.FieldGetter) SparkOrcWriter.createFieldGetter(orcTypes.get(0)); + this.valueFieldGetter = + (SparkOrcWriter.FieldGetter) SparkOrcWriter.createFieldGetter(orcTypes.get(1)); } @Override @@ -189,7 +190,6 @@ public void nonNullWrite(int rowId, MapData map, ColumnVector output) { public Stream> metrics() { return Stream.concat(keyWriter.metrics(), valueWriter.metrics()); } - } private static void growColumnVector(ColumnVector cv, int requestedSize) { diff --git a/spark/v3.3/spark/src/main/java/org/apache/iceberg/spark/data/SparkOrcWriter.java b/spark/v3.3/spark/src/main/java/org/apache/iceberg/spark/data/SparkOrcWriter.java index 9c7f3a6eb01d..6a8c7f1d3c88 100644 --- a/spark/v3.3/spark/src/main/java/org/apache/iceberg/spark/data/SparkOrcWriter.java +++ b/spark/v3.3/spark/src/main/java/org/apache/iceberg/spark/data/SparkOrcWriter.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.data; import java.io.Serializable; @@ -39,19 +38,18 @@ import org.apache.spark.sql.catalyst.InternalRow; import org.apache.spark.sql.catalyst.expressions.SpecializedGetters; -/** - * This class acts as an adaptor from an OrcFileAppender to a - * FileAppender<InternalRow>. - */ +/** This class acts as an adaptor from an OrcFileAppender to a FileAppender<InternalRow>. 
*/ public class SparkOrcWriter implements OrcRowWriter { private final InternalRowWriter writer; public SparkOrcWriter(Schema iSchema, TypeDescription orcSchema) { - Preconditions.checkArgument(orcSchema.getCategory() == TypeDescription.Category.STRUCT, + Preconditions.checkArgument( + orcSchema.getCategory() == TypeDescription.Category.STRUCT, "Top level must be a struct " + orcSchema); - writer = (InternalRowWriter) OrcSchemaWithTypeVisitor.visit(iSchema, orcSchema, new WriteBuilder()); + writer = + (InternalRowWriter) OrcSchemaWithTypeVisitor.visit(iSchema, orcSchema, new WriteBuilder()); } @Override @@ -71,24 +69,26 @@ public Stream> metrics() { } private static class WriteBuilder extends OrcSchemaWithTypeVisitor> { - private WriteBuilder() { - } + private WriteBuilder() {} @Override - public OrcValueWriter record(Types.StructType iStruct, TypeDescription record, - List names, List> fields) { + public OrcValueWriter record( + Types.StructType iStruct, + TypeDescription record, + List names, + List> fields) { return new InternalRowWriter(fields, record.getChildren()); } @Override - public OrcValueWriter list(Types.ListType iList, TypeDescription array, - OrcValueWriter element) { + public OrcValueWriter list( + Types.ListType iList, TypeDescription array, OrcValueWriter element) { return SparkOrcValueWriters.list(element, array.getChildren()); } @Override - public OrcValueWriter map(Types.MapType iMap, TypeDescription map, - OrcValueWriter key, OrcValueWriter value) { + public OrcValueWriter map( + Types.MapType iMap, TypeDescription map, OrcValueWriter key, OrcValueWriter value) { return SparkOrcValueWriters.map(key, value, map.getChildren()); } @@ -178,8 +178,9 @@ static FieldGetter createFieldGetter(TypeDescription fieldType) { // being changed behind our back. break; case DECIMAL: - fieldGetter = (row, ordinal) -> - row.getDecimal(ordinal, fieldType.getPrecision(), fieldType.getScale()); + fieldGetter = + (row, ordinal) -> + row.getDecimal(ordinal, fieldType.getPrecision(), fieldType.getScale()); break; case STRING: case CHAR: @@ -196,7 +197,8 @@ static FieldGetter createFieldGetter(TypeDescription fieldType) { fieldGetter = SpecializedGetters::getMap; break; default: - throw new IllegalArgumentException("Encountered an unsupported ORC type during a write from Spark."); + throw new IllegalArgumentException( + "Encountered an unsupported ORC type during a write from Spark."); } return (row, ordinal) -> { @@ -210,10 +212,12 @@ static FieldGetter createFieldGetter(TypeDescription fieldType) { interface FieldGetter extends Serializable { /** - * Returns a value from a complex Spark data holder such ArrayData, InternalRow, etc... - * Calls the appropriate getter for the expected data type. + * Returns a value from a complex Spark data holder such ArrayData, InternalRow, etc... Calls + * the appropriate getter for the expected data type. + * * @param row Spark's data representation - * @param ordinal index in the data structure (e.g. column index for InterRow, list index in ArrayData, etc..) + * @param ordinal index in the data structure (e.g. column index for InterRow, list index in + * ArrayData, etc..) 
* @return field value at ordinal */ @Nullable diff --git a/spark/v3.3/spark/src/main/java/org/apache/iceberg/spark/data/SparkParquetReaders.java b/spark/v3.3/spark/src/main/java/org/apache/iceberg/spark/data/SparkParquetReaders.java index 35627ace23b0..4b4964f05fa0 100644 --- a/spark/v3.3/spark/src/main/java/org/apache/iceberg/spark/data/SparkParquetReaders.java +++ b/spark/v3.3/spark/src/main/java/org/apache/iceberg/spark/data/SparkParquetReaders.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.data; import java.math.BigDecimal; @@ -66,25 +65,25 @@ import org.apache.spark.unsafe.types.UTF8String; public class SparkParquetReaders { - private SparkParquetReaders() { - } + private SparkParquetReaders() {} - public static ParquetValueReader buildReader(Schema expectedSchema, - MessageType fileSchema) { + public static ParquetValueReader buildReader( + Schema expectedSchema, MessageType fileSchema) { return buildReader(expectedSchema, fileSchema, ImmutableMap.of()); } @SuppressWarnings("unchecked") - public static ParquetValueReader buildReader(Schema expectedSchema, - MessageType fileSchema, - Map idToConstant) { + public static ParquetValueReader buildReader( + Schema expectedSchema, MessageType fileSchema, Map idToConstant) { if (ParquetSchemaUtil.hasIds(fileSchema)) { return (ParquetValueReader) - TypeWithSchemaVisitor.visit(expectedSchema.asStruct(), fileSchema, - new ReadBuilder(fileSchema, idToConstant)); + TypeWithSchemaVisitor.visit( + expectedSchema.asStruct(), fileSchema, new ReadBuilder(fileSchema, idToConstant)); } else { return (ParquetValueReader) - TypeWithSchemaVisitor.visit(expectedSchema.asStruct(), fileSchema, + TypeWithSchemaVisitor.visit( + expectedSchema.asStruct(), + fileSchema, new FallbackReadBuilder(fileSchema, idToConstant)); } } @@ -95,18 +94,18 @@ private static class FallbackReadBuilder extends ReadBuilder { } @Override - public ParquetValueReader message(Types.StructType expected, MessageType message, - List> fieldReaders) { + public ParquetValueReader message( + Types.StructType expected, MessageType message, List> fieldReaders) { // the top level matches by ID, but the remaining IDs are missing return super.struct(expected, message, fieldReaders); } @Override - public ParquetValueReader struct(Types.StructType ignored, GroupType struct, - List> fieldReaders) { + public ParquetValueReader struct( + Types.StructType ignored, GroupType struct, List> fieldReaders) { // the expected struct is ignored because nested fields are never found when the - List> newFields = Lists.newArrayListWithExpectedSize( - fieldReaders.size()); + List> newFields = + Lists.newArrayListWithExpectedSize(fieldReaders.size()); List types = Lists.newArrayListWithExpectedSize(fieldReaders.size()); List fields = struct.getFields(); for (int i = 0; i < fields.size(); i += 1) { @@ -130,14 +129,14 @@ private static class ReadBuilder extends TypeWithSchemaVisitor message(Types.StructType expected, MessageType message, - List> fieldReaders) { + public ParquetValueReader message( + Types.StructType expected, MessageType message, List> fieldReaders) { return struct(expected, message.asGroupType(), fieldReaders); } @Override - public ParquetValueReader struct(Types.StructType expected, GroupType struct, - List> fieldReaders) { + public ParquetValueReader struct( + Types.StructType expected, GroupType struct, List> fieldReaders) { // match the expected struct's order Map> readersById = Maps.newHashMap(); Map 
typesById = Maps.newHashMap(); @@ -152,10 +151,10 @@ public ParquetValueReader struct(Types.StructType expected, GroupType struct, } } - List expectedFields = expected != null ? - expected.fields() : ImmutableList.of(); - List> reorderedFields = Lists.newArrayListWithExpectedSize( - expectedFields.size()); + List expectedFields = + expected != null ? expected.fields() : ImmutableList.of(); + List> reorderedFields = + Lists.newArrayListWithExpectedSize(expectedFields.size()); List types = Lists.newArrayListWithExpectedSize(expectedFields.size()); for (Types.NestedField field : expectedFields) { int id = field.fieldId(); @@ -185,8 +184,8 @@ public ParquetValueReader struct(Types.StructType expected, GroupType struct, } @Override - public ParquetValueReader list(Types.ListType expectedList, GroupType array, - ParquetValueReader elementReader) { + public ParquetValueReader list( + Types.ListType expectedList, GroupType array, ParquetValueReader elementReader) { String[] repeatedPath = currentPath(); int repeatedD = type.getMaxDefinitionLevel(repeatedPath) - 1; @@ -195,13 +194,16 @@ public ParquetValueReader list(Types.ListType expectedList, GroupType array, Type elementType = ParquetSchemaUtil.determineListElementType(array); int elementD = type.getMaxDefinitionLevel(path(elementType.getName())) - 1; - return new ArrayReader<>(repeatedD, repeatedR, ParquetValueReaders.option(elementType, elementD, elementReader)); + return new ArrayReader<>( + repeatedD, repeatedR, ParquetValueReaders.option(elementType, elementD, elementReader)); } @Override - public ParquetValueReader map(Types.MapType expectedMap, GroupType map, - ParquetValueReader keyReader, - ParquetValueReader valueReader) { + public ParquetValueReader map( + Types.MapType expectedMap, + GroupType map, + ParquetValueReader keyReader, + ParquetValueReader valueReader) { GroupType repeatedKeyValue = map.getFields().get(0).asGroupType(); String[] repeatedPath = currentPath(); @@ -213,14 +215,16 @@ public ParquetValueReader map(Types.MapType expectedMap, GroupType map, Type valueType = repeatedKeyValue.getType(1); int valueD = type.getMaxDefinitionLevel(path(valueType.getName())) - 1; - return new MapReader<>(repeatedD, repeatedR, + return new MapReader<>( + repeatedD, + repeatedR, ParquetValueReaders.option(keyType, keyD, keyReader), ParquetValueReaders.option(valueType, valueD, valueReader)); } @Override - public ParquetValueReader primitive(org.apache.iceberg.types.Type.PrimitiveType expected, - PrimitiveType primitive) { + public ParquetValueReader primitive( + org.apache.iceberg.types.Type.PrimitiveType expected, PrimitiveType primitive) { ColumnDescriptor desc = type.getColumnDescription(currentPath()); if (primitive.getOriginalType() != null) { @@ -376,12 +380,13 @@ public Long read(Long ignored) { @Override public long readLong() { - final ByteBuffer byteBuffer = column.nextBinary().toByteBuffer().order(ByteOrder.LITTLE_ENDIAN); + final ByteBuffer byteBuffer = + column.nextBinary().toByteBuffer().order(ByteOrder.LITTLE_ENDIAN); final long timeOfDayNanos = byteBuffer.getLong(); final int julianDay = byteBuffer.getInt(); - return TimeUnit.DAYS.toMicros(julianDay - UNIX_EPOCH_JULIAN) + - TimeUnit.NANOSECONDS.toMicros(timeOfDayNanos); + return TimeUnit.DAYS.toMicros(julianDay - UNIX_EPOCH_JULIAN) + + TimeUnit.NANOSECONDS.toMicros(timeOfDayNanos); } } @@ -455,15 +460,19 @@ protected ArrayData buildList(ReusableArrayData list) { } } - private static class MapReader extends RepeatedKeyValueReader { + private static class MapReader + 
extends RepeatedKeyValueReader { private int readPos = 0; private int writePos = 0; private final ReusableEntry entry = new ReusableEntry<>(); private final ReusableEntry nullEntry = new ReusableEntry<>(); - MapReader(int definitionLevel, int repetitionLevel, - ParquetValueReader keyReader, ParquetValueReader valueReader) { + MapReader( + int definitionLevel, + int repetitionLevel, + ParquetValueReader keyReader, + ParquetValueReader valueReader) { super(definitionLevel, repetitionLevel, keyReader, valueReader); } diff --git a/spark/v3.3/spark/src/main/java/org/apache/iceberg/spark/data/SparkParquetWriters.java b/spark/v3.3/spark/src/main/java/org/apache/iceberg/spark/data/SparkParquetWriters.java index 5e268d26ed9c..c7622678c74d 100644 --- a/spark/v3.3/spark/src/main/java/org/apache/iceberg/spark/data/SparkParquetWriters.java +++ b/spark/v3.3/spark/src/main/java/org/apache/iceberg/spark/data/SparkParquetWriters.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.data; import java.util.Iterator; @@ -54,12 +53,12 @@ import org.apache.spark.unsafe.types.UTF8String; public class SparkParquetWriters { - private SparkParquetWriters() { - } + private SparkParquetWriters() {} @SuppressWarnings("unchecked") public static ParquetValueWriter buildWriter(StructType dfSchema, MessageType type) { - return (ParquetValueWriter) ParquetWithSparkSchemaVisitor.visit(dfSchema, type, new WriteBuilder(type)); + return (ParquetValueWriter) + ParquetWithSparkSchemaVisitor.visit(dfSchema, type, new WriteBuilder(type)); } private static class WriteBuilder extends ParquetWithSparkSchemaVisitor> { @@ -70,14 +69,14 @@ private static class WriteBuilder extends ParquetWithSparkSchemaVisitor message(StructType sStruct, MessageType message, - List> fieldWriters) { + public ParquetValueWriter message( + StructType sStruct, MessageType message, List> fieldWriters) { return struct(sStruct, message.asGroupType(), fieldWriters); } @Override - public ParquetValueWriter struct(StructType sStruct, GroupType struct, - List> fieldWriters) { + public ParquetValueWriter struct( + StructType sStruct, GroupType struct, List> fieldWriters) { List fields = struct.getFields(); StructField[] sparkFields = sStruct.fields(); List> writers = Lists.newArrayListWithExpectedSize(fieldWriters.size()); @@ -91,31 +90,40 @@ public ParquetValueWriter struct(StructType sStruct, GroupType struct, } @Override - public ParquetValueWriter list(ArrayType sArray, GroupType array, ParquetValueWriter elementWriter) { + public ParquetValueWriter list( + ArrayType sArray, GroupType array, ParquetValueWriter elementWriter) { GroupType repeated = array.getFields().get(0).asGroupType(); String[] repeatedPath = currentPath(); int repeatedD = type.getMaxDefinitionLevel(repeatedPath); int repeatedR = type.getMaxRepetitionLevel(repeatedPath); - return new ArrayDataWriter<>(repeatedD, repeatedR, + return new ArrayDataWriter<>( + repeatedD, + repeatedR, newOption(repeated.getType(0), elementWriter), sArray.elementType()); } @Override - public ParquetValueWriter map(MapType sMap, GroupType map, - ParquetValueWriter keyWriter, ParquetValueWriter valueWriter) { + public ParquetValueWriter map( + MapType sMap, + GroupType map, + ParquetValueWriter keyWriter, + ParquetValueWriter valueWriter) { GroupType repeatedKeyValue = map.getFields().get(0).asGroupType(); String[] repeatedPath = currentPath(); int repeatedD = type.getMaxDefinitionLevel(repeatedPath); int repeatedR = 
type.getMaxRepetitionLevel(repeatedPath); - return new MapDataWriter<>(repeatedD, repeatedR, + return new MapDataWriter<>( + repeatedD, + repeatedR, newOption(repeatedKeyValue.getType(0), keyWriter), newOption(repeatedKeyValue.getType(1), valueWriter), - sMap.keyType(), sMap.valueType()); + sMap.keyType(), + sMap.valueType()); } private ParquetValueWriter newOption(Type fieldType, ParquetValueWriter writer) { @@ -197,18 +205,18 @@ private static PrimitiveWriter utf8Strings(ColumnDescriptor desc) { return new UTF8StringWriter(desc); } - private static PrimitiveWriter decimalAsInteger(ColumnDescriptor desc, - int precision, int scale) { + private static PrimitiveWriter decimalAsInteger( + ColumnDescriptor desc, int precision, int scale) { return new IntegerDecimalWriter(desc, precision, scale); } - private static PrimitiveWriter decimalAsLong(ColumnDescriptor desc, - int precision, int scale) { + private static PrimitiveWriter decimalAsLong( + ColumnDescriptor desc, int precision, int scale) { return new LongDecimalWriter(desc, precision, scale); } - private static PrimitiveWriter decimalAsFixed(ColumnDescriptor desc, - int precision, int scale) { + private static PrimitiveWriter decimalAsFixed( + ColumnDescriptor desc, int precision, int scale) { return new FixedDecimalWriter(desc, precision, scale); } @@ -239,10 +247,18 @@ private IntegerDecimalWriter(ColumnDescriptor desc, int precision, int scale) { @Override public void write(int repetitionLevel, Decimal decimal) { - Preconditions.checkArgument(decimal.scale() == scale, - "Cannot write value as decimal(%s,%s), wrong scale: %s", precision, scale, decimal); - Preconditions.checkArgument(decimal.precision() <= precision, - "Cannot write value as decimal(%s,%s), too large: %s", precision, scale, decimal); + Preconditions.checkArgument( + decimal.scale() == scale, + "Cannot write value as decimal(%s,%s), wrong scale: %s", + precision, + scale, + decimal); + Preconditions.checkArgument( + decimal.precision() <= precision, + "Cannot write value as decimal(%s,%s), too large: %s", + precision, + scale, + decimal); column.writeInteger(repetitionLevel, (int) decimal.toUnscaledLong()); } @@ -260,10 +276,18 @@ private LongDecimalWriter(ColumnDescriptor desc, int precision, int scale) { @Override public void write(int repetitionLevel, Decimal decimal) { - Preconditions.checkArgument(decimal.scale() == scale, - "Cannot write value as decimal(%s,%s), wrong scale: %s", precision, scale, decimal); - Preconditions.checkArgument(decimal.precision() <= precision, - "Cannot write value as decimal(%s,%s), too large: %s", precision, scale, decimal); + Preconditions.checkArgument( + decimal.scale() == scale, + "Cannot write value as decimal(%s,%s), wrong scale: %s", + precision, + scale, + decimal); + Preconditions.checkArgument( + decimal.precision() <= precision, + "Cannot write value as decimal(%s,%s), too large: %s", + precision, + scale, + decimal); column.writeLong(repetitionLevel, decimal.toUnscaledLong()); } @@ -278,12 +302,15 @@ private FixedDecimalWriter(ColumnDescriptor desc, int precision, int scale) { super(desc); this.precision = precision; this.scale = scale; - this.bytes = ThreadLocal.withInitial(() -> new byte[TypeUtil.decimalRequiredBytes(precision)]); + this.bytes = + ThreadLocal.withInitial(() -> new byte[TypeUtil.decimalRequiredBytes(precision)]); } @Override public void write(int repetitionLevel, Decimal decimal) { - byte[] binary = DecimalUtil.toReusedFixLengthBytes(precision, scale, decimal.toJavaBigDecimal(), bytes.get()); + byte[] 
binary = + DecimalUtil.toReusedFixLengthBytes( + precision, scale, decimal.toJavaBigDecimal(), bytes.get()); column.writeBinary(repetitionLevel, Binary.fromReusedByteArray(binary)); } } @@ -302,8 +329,11 @@ public void write(int repetitionLevel, byte[] bytes) { private static class ArrayDataWriter extends RepeatedWriter { private final DataType elementType; - private ArrayDataWriter(int definitionLevel, int repetitionLevel, - ParquetValueWriter writer, DataType elementType) { + private ArrayDataWriter( + int definitionLevel, + int repetitionLevel, + ParquetValueWriter writer, + DataType elementType) { super(definitionLevel, repetitionLevel, writer); this.elementType = elementType; } @@ -354,9 +384,13 @@ private static class MapDataWriter extends RepeatedKeyValueWriter keyWriter, ParquetValueWriter valueWriter, - DataType keyType, DataType valueType) { + private MapDataWriter( + int definitionLevel, + int repetitionLevel, + ParquetValueWriter keyWriter, + ParquetValueWriter valueWriter, + DataType keyType, + DataType valueType) { super(definitionLevel, repetitionLevel, keyWriter, valueWriter); this.keyType = keyType; this.valueType = valueType; diff --git a/spark/v3.3/spark/src/main/java/org/apache/iceberg/spark/data/SparkValueReaders.java b/spark/v3.3/spark/src/main/java/org/apache/iceberg/spark/data/SparkValueReaders.java index 0d3ce2b28d0b..11655c72d857 100644 --- a/spark/v3.3/spark/src/main/java/org/apache/iceberg/spark/data/SparkValueReaders.java +++ b/spark/v3.3/spark/src/main/java/org/apache/iceberg/spark/data/SparkValueReaders.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.data; import java.io.IOException; @@ -44,8 +43,7 @@ public class SparkValueReaders { - private SparkValueReaders() { - } + private SparkValueReaders() {} static ValueReader strings() { return StringReader.INSTANCE; @@ -67,8 +65,8 @@ static ValueReader array(ValueReader elementReader) { return new ArrayReader(elementReader); } - static ValueReader arrayMap(ValueReader keyReader, - ValueReader valueReader) { + static ValueReader arrayMap( + ValueReader keyReader, ValueReader valueReader) { return new ArrayMapReader(keyReader, valueReader); } @@ -76,16 +74,15 @@ static ValueReader map(ValueReader keyReader, ValueReader< return new MapReader(keyReader, valueReader); } - static ValueReader struct(List> readers, Types.StructType struct, - Map idToConstant) { + static ValueReader struct( + List> readers, Types.StructType struct, Map idToConstant) { return new StructReader(readers, struct, idToConstant); } private static class StringReader implements ValueReader { private static final StringReader INSTANCE = new StringReader(); - private StringReader() { - } + private StringReader() {} @Override public UTF8String read(Decoder decoder, Object reuse) throws IOException { @@ -97,10 +94,10 @@ public UTF8String read(Decoder decoder, Object reuse) throws IOException { Utf8 string = decoder.readString(utf8); return UTF8String.fromBytes(string.getBytes(), 0, string.getByteLength()); -// int length = decoder.readInt(); -// byte[] bytes = new byte[length]; -// decoder.readFixed(bytes, 0, length); -// return UTF8String.fromBytes(bytes); + // int length = decoder.readInt(); + // byte[] bytes = new byte[length]; + // decoder.readFixed(bytes, 0, length); + // return UTF8String.fromBytes(bytes); } } @@ -122,16 +119,17 @@ public UTF8String read(Decoder decoder, Object ignore) throws IOException { } private static class UUIDReader implements 
ValueReader { - private static final ThreadLocal BUFFER = ThreadLocal.withInitial(() -> { - ByteBuffer buffer = ByteBuffer.allocate(16); - buffer.order(ByteOrder.BIG_ENDIAN); - return buffer; - }); + private static final ThreadLocal BUFFER = + ThreadLocal.withInitial( + () -> { + ByteBuffer buffer = ByteBuffer.allocate(16); + buffer.order(ByteOrder.BIG_ENDIAN); + return buffer; + }); private static final UUIDReader INSTANCE = new UUIDReader(); - private UUIDReader() { - } + private UUIDReader() {} @Override @SuppressWarnings("ByteBufferBackingArray") @@ -258,14 +256,16 @@ public ArrayBasedMapData read(Decoder decoder, Object reuse) throws IOException static class StructReader extends ValueReaders.StructReader { private final int numFields; - protected StructReader(List> readers, Types.StructType struct, Map idToConstant) { + protected StructReader( + List> readers, Types.StructType struct, Map idToConstant) { super(readers, struct, idToConstant); this.numFields = readers.size(); } @Override protected InternalRow reuseOrCreate(Object reuse) { - if (reuse instanceof GenericInternalRow && ((GenericInternalRow) reuse).numFields() == numFields) { + if (reuse instanceof GenericInternalRow + && ((GenericInternalRow) reuse).numFields() == numFields) { return (InternalRow) reuse; } return new GenericInternalRow(numFields); diff --git a/spark/v3.3/spark/src/main/java/org/apache/iceberg/spark/data/SparkValueWriters.java b/spark/v3.3/spark/src/main/java/org/apache/iceberg/spark/data/SparkValueWriters.java index 24a69c1d7f11..5f2e2c054888 100644 --- a/spark/v3.3/spark/src/main/java/org/apache/iceberg/spark/data/SparkValueWriters.java +++ b/spark/v3.3/spark/src/main/java/org/apache/iceberg/spark/data/SparkValueWriters.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
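The UUID reader above stages each value in a reusable 16-byte, big-endian ByteBuffer. As a minimal, self-contained sketch of that byte layout (not the Iceberg code itself; the class and method names here are illustrative), a java.util.UUID round-trips through such a buffer as follows:

    import java.nio.ByteBuffer;
    import java.nio.ByteOrder;
    import java.util.UUID;

    class UuidBytesSketch {
      // Pack a UUID into 16 big-endian bytes: most-significant long first, then least-significant.
      static byte[] toBytes(UUID uuid) {
        ByteBuffer buffer = ByteBuffer.allocate(16).order(ByteOrder.BIG_ENDIAN);
        buffer.putLong(uuid.getMostSignificantBits());
        buffer.putLong(uuid.getLeastSignificantBits());
        return buffer.array();
      }

      // Read the same layout back; ByteBuffer is big-endian by default, set explicitly for clarity.
      static UUID fromBytes(byte[] bytes) {
        ByteBuffer buffer = ByteBuffer.wrap(bytes).order(ByteOrder.BIG_ENDIAN);
        return new UUID(buffer.getLong(), buffer.getLong());
      }
    }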
*/ - package org.apache.iceberg.spark.data; import java.io.IOException; @@ -39,8 +38,7 @@ public class SparkValueWriters { - private SparkValueWriters() { - } + private SparkValueWriters() {} static ValueWriter strings() { return StringWriter.INSTANCE; @@ -75,8 +73,7 @@ static ValueWriter struct(List> writers, List { private static final StringWriter INSTANCE = new StringWriter(); - private StringWriter() { - } + private StringWriter() {} @Override public void write(UTF8String s, Encoder encoder) throws IOException { @@ -88,16 +85,17 @@ public void write(UTF8String s, Encoder encoder) throws IOException { } private static class UUIDWriter implements ValueWriter { - private static final ThreadLocal BUFFER = ThreadLocal.withInitial(() -> { - ByteBuffer buffer = ByteBuffer.allocate(16); - buffer.order(ByteOrder.BIG_ENDIAN); - return buffer; - }); + private static final ThreadLocal BUFFER = + ThreadLocal.withInitial( + () -> { + ByteBuffer buffer = ByteBuffer.allocate(16); + buffer.order(ByteOrder.BIG_ENDIAN); + return buffer; + }); private static final UUIDWriter INSTANCE = new UUIDWriter(); - private UUIDWriter() { - } + private UUIDWriter() {} @Override @SuppressWarnings("ByteBufferBackingArray") @@ -120,12 +118,14 @@ private static class DecimalWriter implements ValueWriter { private DecimalWriter(int precision, int scale) { this.precision = precision; this.scale = scale; - this.bytes = ThreadLocal.withInitial(() -> new byte[TypeUtil.decimalRequiredBytes(precision)]); + this.bytes = + ThreadLocal.withInitial(() -> new byte[TypeUtil.decimalRequiredBytes(precision)]); } @Override public void write(Decimal d, Encoder encoder) throws IOException { - encoder.writeFixed(DecimalUtil.toReusedFixLengthBytes(precision, scale, d.toJavaBigDecimal(), bytes.get())); + encoder.writeFixed( + DecimalUtil.toReusedFixLengthBytes(precision, scale, d.toJavaBigDecimal(), bytes.get())); } } @@ -158,8 +158,11 @@ private static class ArrayMapWriter implements ValueWriter { private final DataType keyType; private final DataType valueType; - private ArrayMapWriter(ValueWriter keyWriter, DataType keyType, - ValueWriter valueWriter, DataType valueType) { + private ArrayMapWriter( + ValueWriter keyWriter, + DataType keyType, + ValueWriter valueWriter, + DataType valueType) { this.keyWriter = keyWriter; this.keyType = keyType; this.valueWriter = valueWriter; @@ -189,8 +192,11 @@ private static class MapWriter implements ValueWriter { private final DataType keyType; private final DataType valueType; - private MapWriter(ValueWriter keyWriter, DataType keyType, - ValueWriter valueWriter, DataType valueType) { + private MapWriter( + ValueWriter keyWriter, + DataType keyType, + ValueWriter valueWriter, + DataType valueType) { this.keyWriter = keyWriter; this.keyType = keyType; this.valueWriter = valueWriter; diff --git a/spark/v3.3/spark/src/main/java/org/apache/iceberg/spark/data/vectorized/ArrowVectorAccessorFactory.java b/spark/v3.3/spark/src/main/java/org/apache/iceberg/spark/data/vectorized/ArrowVectorAccessorFactory.java index 505ace508352..e32ebcb02bbc 100644 --- a/spark/v3.3/spark/src/main/java/org/apache/iceberg/spark/data/vectorized/ArrowVectorAccessorFactory.java +++ b/spark/v3.3/spark/src/main/java/org/apache/iceberg/spark/data/vectorized/ArrowVectorAccessorFactory.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
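The decimal writers above (in SparkParquetWriters and SparkValueWriters) reuse a thread-local byte array sized by TypeUtil.decimalRequiredBytes(precision) and fill it through DecimalUtil.toReusedFixLengthBytes. A hedged sketch of what a fixed-length decimal encoding of that shape involves, namely sign-extended, big-endian two's complement of the unscaled value, without assuming anything about DecimalUtil's actual internals:

    import java.math.BigDecimal;
    import java.util.Arrays;

    class FixedDecimalSketch {
      // Encode the unscaled value as big-endian two's complement, sign-extended to fill
      // a caller-provided, fixed-length (and reusable) buffer.
      static byte[] toFixedBytes(BigDecimal value, byte[] reuse) {
        byte[] unscaled = value.unscaledValue().toByteArray(); // minimal two's-complement form
        if (unscaled.length > reuse.length) {
          throw new IllegalArgumentException("Decimal does not fit in " + reuse.length + " bytes");
        }
        byte pad = (byte) (value.signum() < 0 ? 0xFF : 0x00); // sign-extension byte
        Arrays.fill(reuse, 0, reuse.length - unscaled.length, pad);
        System.arraycopy(unscaled, 0, reuse, reuse.length - unscaled.length, unscaled.length);
        return reuse;
      }
    }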
*/ - package org.apache.iceberg.spark.data.vectorized; import java.math.BigDecimal; @@ -32,10 +31,12 @@ import org.apache.spark.unsafe.types.UTF8String; final class ArrowVectorAccessorFactory - extends GenericArrowVectorAccessorFactory { + extends GenericArrowVectorAccessorFactory< + Decimal, UTF8String, ColumnarArray, ArrowColumnVector> { ArrowVectorAccessorFactory() { - super(DecimalFactoryImpl::new, + super( + DecimalFactoryImpl::new, StringFactoryImpl::new, StructChildFactoryImpl::new, ArrayFactoryImpl::new); @@ -70,9 +71,7 @@ public UTF8String ofRow(VarCharVector vector, int rowId) { int end = vector.getEndOffset(rowId); return UTF8String.fromAddress( - null, - vector.getDataBuffer().memoryAddress() + start, - end - start); + null, vector.getDataBuffer().memoryAddress() + start, end - start); } @Override @@ -84,7 +83,9 @@ public UTF8String ofBytes(byte[] bytes) { public UTF8String ofByteBuffer(ByteBuffer byteBuffer) { if (byteBuffer.hasArray()) { return UTF8String.fromBytes( - byteBuffer.array(), byteBuffer.arrayOffset() + byteBuffer.position(), byteBuffer.remaining()); + byteBuffer.array(), + byteBuffer.arrayOffset() + byteBuffer.position(), + byteBuffer.remaining()); } byte[] bytes = new byte[byteBuffer.remaining()]; byteBuffer.get(bytes); @@ -92,7 +93,8 @@ public UTF8String ofByteBuffer(ByteBuffer byteBuffer) { } } - private static final class ArrayFactoryImpl implements ArrayFactory { + private static final class ArrayFactoryImpl + implements ArrayFactory { @Override public ArrowColumnVector ofChild(ValueVector childVector) { return new ArrowColumnVector(childVector); @@ -108,7 +110,8 @@ public ColumnarArray ofRow(ValueVector vector, ArrowColumnVector childData, int } } - private static final class StructChildFactoryImpl implements StructChildFactory { + private static final class StructChildFactoryImpl + implements StructChildFactory { @Override public Class getGenericClass() { return ArrowColumnVector.class; diff --git a/spark/v3.3/spark/src/main/java/org/apache/iceberg/spark/data/vectorized/ArrowVectorAccessors.java b/spark/v3.3/spark/src/main/java/org/apache/iceberg/spark/data/vectorized/ArrowVectorAccessors.java index f3b3377af2b4..810fef81b5bb 100644 --- a/spark/v3.3/spark/src/main/java/org/apache/iceberg/spark/data/vectorized/ArrowVectorAccessors.java +++ b/spark/v3.3/spark/src/main/java/org/apache/iceberg/spark/data/vectorized/ArrowVectorAccessors.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.data.vectorized; import org.apache.iceberg.arrow.vectorized.ArrowVectorAccessor; @@ -35,6 +34,5 @@ public class ArrowVectorAccessors { return factory.getVectorAccessor(holder); } - private ArrowVectorAccessors() { - } + private ArrowVectorAccessors() {} } diff --git a/spark/v3.3/spark/src/main/java/org/apache/iceberg/spark/data/vectorized/ColumnVectorBuilder.java b/spark/v3.3/spark/src/main/java/org/apache/iceberg/spark/data/vectorized/ColumnVectorBuilder.java index c0459aae382b..8080a946c6f7 100644 --- a/spark/v3.3/spark/src/main/java/org/apache/iceberg/spark/data/vectorized/ColumnVectorBuilder.java +++ b/spark/v3.3/spark/src/main/java/org/apache/iceberg/spark/data/vectorized/ColumnVectorBuilder.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.spark.data.vectorized; import org.apache.iceberg.arrow.vectorized.VectorHolder; @@ -39,8 +38,8 @@ public ColumnVector build(VectorHolder holder, int numRows) { if (holder instanceof VectorHolder.DeletedVectorHolder) { return new DeletedColumnVector(Types.BooleanType.get(), isDeleted); } else if (holder instanceof ConstantVectorHolder) { - return new ConstantColumnVector(Types.IntegerType.get(), numRows, - ((ConstantVectorHolder) holder).getConstant()); + return new ConstantColumnVector( + Types.IntegerType.get(), numRows, ((ConstantVectorHolder) holder).getConstant()); } else { throw new IllegalStateException("Unknown dummy vector holder: " + holder); } diff --git a/spark/v3.3/spark/src/main/java/org/apache/iceberg/spark/data/vectorized/ColumnVectorWithFilter.java b/spark/v3.3/spark/src/main/java/org/apache/iceberg/spark/data/vectorized/ColumnVectorWithFilter.java index db4e41b04176..ab0d652321d3 100644 --- a/spark/v3.3/spark/src/main/java/org/apache/iceberg/spark/data/vectorized/ColumnVectorWithFilter.java +++ b/spark/v3.3/spark/src/main/java/org/apache/iceberg/spark/data/vectorized/ColumnVectorWithFilter.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.data.vectorized; import org.apache.iceberg.arrow.vectorized.VectorHolder; diff --git a/spark/v3.3/spark/src/main/java/org/apache/iceberg/spark/data/vectorized/ColumnarBatchReader.java b/spark/v3.3/spark/src/main/java/org/apache/iceberg/spark/data/vectorized/ColumnarBatchReader.java index 6dada0f84332..9686b63d1858 100644 --- a/spark/v3.3/spark/src/main/java/org/apache/iceberg/spark/data/vectorized/ColumnarBatchReader.java +++ b/spark/v3.3/spark/src/main/java/org/apache/iceberg/spark/data/vectorized/ColumnarBatchReader.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.data.vectorized; import java.util.Iterator; @@ -38,9 +37,9 @@ import org.apache.spark.sql.vectorized.ColumnarBatch; /** - * {@link VectorizedReader} that returns Spark's {@link ColumnarBatch} to support Spark's vectorized read path. The - * {@link ColumnarBatch} returned is created by passing in the Arrow vectors populated via delegated read calls to - * {@linkplain VectorizedArrowReader VectorReader(s)}. + * {@link VectorizedReader} that returns Spark's {@link ColumnarBatch} to support Spark's vectorized + * read path. The {@link ColumnarBatch} returned is created by passing in the Arrow vectors + * populated via delegated read calls to {@linkplain VectorizedArrowReader VectorReader(s)}. 
*/ public class ColumnarBatchReader extends BaseBatchReader { private final boolean hasIsDeletedColumn; @@ -49,12 +48,13 @@ public class ColumnarBatchReader extends BaseBatchReader { public ColumnarBatchReader(List> readers) { super(readers); - this.hasIsDeletedColumn = readers.stream().anyMatch(reader -> reader instanceof DeletedVectorReader); + this.hasIsDeletedColumn = + readers.stream().anyMatch(reader -> reader instanceof DeletedVectorReader); } @Override - public void setRowGroupInfo(PageReadStore pageStore, Map metaData, - long rowPosition) { + public void setRowGroupInfo( + PageReadStore pageStore, Map metaData, long rowPosition) { super.setRowGroupInfo(pageStore, metaData, rowPosition); this.rowStartPosInBatch = rowPosition; } @@ -76,13 +76,16 @@ public final ColumnarBatch read(ColumnarBatch reuse, int numRowsToRead) { private class ColumnBatchLoader { private final int numRowsToRead; - // the rowId mapping to skip deleted rows for all column vectors inside a batch, it is null when there is no deletes + // the rowId mapping to skip deleted rows for all column vectors inside a batch, it is null when + // there is no deletes private int[] rowIdMapping; - // the array to indicate if a row is deleted or not, it is null when there is no "_deleted" metadata column + // the array to indicate if a row is deleted or not, it is null when there is no "_deleted" + // metadata column private boolean[] isDeleted; ColumnBatchLoader(int numRowsToRead) { - Preconditions.checkArgument(numRowsToRead > 0, "Invalid number of rows to read: %s", numRowsToRead); + Preconditions.checkArgument( + numRowsToRead > 0, "Invalid number of rows to read: %s", numRowsToRead); this.numRowsToRead = numRowsToRead; if (hasIsDeletedColumn) { isDeleted = new boolean[numRowsToRead]; @@ -121,11 +124,14 @@ ColumnVector[] readDataToColumnVectors() { int numRowsInVector = vectorHolders[i].numValues(); Preconditions.checkState( numRowsInVector == numRowsToRead, - "Number of rows in the vector %s didn't match expected %s ", numRowsInVector, + "Number of rows in the vector %s didn't match expected %s ", + numRowsInVector, numRowsToRead); - arrowColumnVectors[i] = columnVectorBuilder.withDeletedRows(rowIdMapping, isDeleted) - .build(vectorHolders[i], numRowsInVector); + arrowColumnVectors[i] = + columnVectorBuilder + .withDeletedRows(rowIdMapping, isDeleted) + .build(vectorHolders[i], numRowsInVector); } return arrowColumnVectors; } @@ -154,12 +160,10 @@ Pair posDelRowIdMapping() { } /** - * Build a row id mapping inside a batch, which skips deleted rows. Here is an example of how we delete 2 rows in a - * batch with 8 rows in total. - * [0,1,2,3,4,5,6,7] -- Original status of the row id mapping array - * [F,F,F,F,F,F,F,F] -- Original status of the isDeleted array - * Position delete 2, 6 - * [0,1,3,4,5,7,-,-] -- After applying position deletes [Set Num records to 6] + * Build a row id mapping inside a batch, which skips deleted rows. Here is an example of how we + * delete 2 rows in a batch with 8 rows in total. [0,1,2,3,4,5,6,7] -- Original status of the + * row id mapping array [F,F,F,F,F,F,F,F] -- Original status of the isDeleted array Position + * delete 2, 6 [0,1,3,4,5,7,-,-] -- After applying position deletes [Set Num records to 6] * [F,F,T,F,F,F,T,F] -- After applying position deletes * * @param deletedRowPositions a set of deleted row positions @@ -203,14 +207,11 @@ int[] initEqDeleteRowIdMapping() { } /** - * Filter out the equality deleted rows. 
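Before the equality-delete walkthrough that follows, here is a minimal standalone sketch of the row-id mapping idea described in the javadoc above, using a hypothetical helper rather than the actual ColumnBatchLoader methods. For 8 rows with positions 2 and 6 deleted it returns [0,1,3,4,5,7] and marks isDeleted as [F,F,T,F,F,F,T,F], matching the example.

    import java.util.Arrays;
    import java.util.Set;

    class RowIdMappingSketch {
      // Keep the original index of every live row and flag deleted rows; the returned
      // array's length is the number of records that remain visible in the batch.
      static int[] buildRowIdMapping(int numRows, Set<Integer> deletedPositions, boolean[] isDeleted) {
        int[] mapping = new int[numRows];
        int liveRows = 0;
        for (int pos = 0; pos < numRows; pos++) {
          if (deletedPositions.contains(pos)) {
            isDeleted[pos] = true;       // row stays in the Arrow vectors but is marked deleted
          } else {
            mapping[liveRows++] = pos;   // live row keeps its original position
          }
        }
        return Arrays.copyOf(mapping, liveRows);
      }
    }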
Here is an example, - * [0,1,2,3,4,5,6,7] -- Original status of the row id mapping array - * [F,F,F,F,F,F,F,F] -- Original status of the isDeleted array - * Position delete 2, 6 - * [0,1,3,4,5,7,-,-] -- After applying position deletes [Set Num records to 6] - * [F,F,T,F,F,F,T,F] -- After applying position deletes - * Equality delete 1 <= x <= 3 - * [0,4,5,7,-,-,-,-] -- After applying equality deletes [Set Num records to 4] + * Filter out the equality deleted rows. Here is an example, [0,1,2,3,4,5,6,7] -- Original + * status of the row id mapping array [F,F,F,F,F,F,F,F] -- Original status of the isDeleted + * array Position delete 2, 6 [0,1,3,4,5,7,-,-] -- After applying position deletes [Set Num + * records to 6] [F,F,T,F,F,F,T,F] -- After applying position deletes Equality delete 1 <= x <= + * 3 [0,4,5,7,-,-,-,-] -- After applying equality deletes [Set Num records to 4] * [F,T,T,T,F,F,T,F] -- After applying equality deletes * * @param columnarBatch the {@link ColumnarBatch} to apply the equality delete diff --git a/spark/v3.3/spark/src/main/java/org/apache/iceberg/spark/data/vectorized/ConstantColumnVector.java b/spark/v3.3/spark/src/main/java/org/apache/iceberg/spark/data/vectorized/ConstantColumnVector.java index 3cdea65b2877..42683ffa901e 100644 --- a/spark/v3.3/spark/src/main/java/org/apache/iceberg/spark/data/vectorized/ConstantColumnVector.java +++ b/spark/v3.3/spark/src/main/java/org/apache/iceberg/spark/data/vectorized/ConstantColumnVector.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.data.vectorized; import org.apache.iceberg.spark.SparkSchemaUtil; @@ -39,8 +38,7 @@ class ConstantColumnVector extends ColumnVector { } @Override - public void close() { - } + public void close() {} @Override public boolean hasNull() { diff --git a/spark/v3.3/spark/src/main/java/org/apache/iceberg/spark/data/vectorized/DeletedColumnVector.java b/spark/v3.3/spark/src/main/java/org/apache/iceberg/spark/data/vectorized/DeletedColumnVector.java index 8fc3d4527321..eec6ecb9ace4 100644 --- a/spark/v3.3/spark/src/main/java/org/apache/iceberg/spark/data/vectorized/DeletedColumnVector.java +++ b/spark/v3.3/spark/src/main/java/org/apache/iceberg/spark/data/vectorized/DeletedColumnVector.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.data.vectorized; import org.apache.iceberg.relocated.com.google.common.base.Preconditions; @@ -38,8 +37,7 @@ public DeletedColumnVector(Type type, boolean[] isDeleted) { } @Override - public void close() { - } + public void close() {} @Override public boolean hasNull() { diff --git a/spark/v3.3/spark/src/main/java/org/apache/iceberg/spark/data/vectorized/IcebergArrowColumnVector.java b/spark/v3.3/spark/src/main/java/org/apache/iceberg/spark/data/vectorized/IcebergArrowColumnVector.java index 1812282a34f6..38ec3a0e838c 100644 --- a/spark/v3.3/spark/src/main/java/org/apache/iceberg/spark/data/vectorized/IcebergArrowColumnVector.java +++ b/spark/v3.3/spark/src/main/java/org/apache/iceberg/spark/data/vectorized/IcebergArrowColumnVector.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.data.vectorized; import org.apache.iceberg.arrow.vectorized.ArrowVectorAccessor; @@ -31,9 +30,10 @@ import org.apache.spark.unsafe.types.UTF8String; /** - * Implementation of Spark's {@link ColumnVector} interface. 
The code for this class is heavily inspired from Spark's - * {@link ArrowColumnVector} The main difference is in how nullability checks are made in this class by relying on - * {@link NullabilityHolder} instead of the validity vector in the Arrow vector. + * Implementation of Spark's {@link ColumnVector} interface. The code for this class is heavily + * inspired from Spark's {@link ArrowColumnVector} The main difference is in how nullability checks + * are made in this class by relying on {@link NullabilityHolder} instead of the validity vector in + * the Arrow vector. */ public class IcebergArrowColumnVector extends ColumnVector { @@ -151,7 +151,8 @@ public ArrowColumnVector getChild(int ordinal) { return accessor.childColumn(ordinal); } - public ArrowVectorAccessor vectorAccessor() { + public ArrowVectorAccessor + vectorAccessor() { return accessor; } } diff --git a/spark/v3.3/spark/src/main/java/org/apache/iceberg/spark/data/vectorized/RowPositionColumnVector.java b/spark/v3.3/spark/src/main/java/org/apache/iceberg/spark/data/vectorized/RowPositionColumnVector.java index 58db4eb55d04..a389cd8286e5 100644 --- a/spark/v3.3/spark/src/main/java/org/apache/iceberg/spark/data/vectorized/RowPositionColumnVector.java +++ b/spark/v3.3/spark/src/main/java/org/apache/iceberg/spark/data/vectorized/RowPositionColumnVector.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.data.vectorized; import org.apache.iceberg.spark.SparkSchemaUtil; @@ -37,8 +36,7 @@ public class RowPositionColumnVector extends ColumnVector { } @Override - public void close() { - } + public void close() {} @Override public boolean hasNull() { diff --git a/spark/v3.3/spark/src/main/java/org/apache/iceberg/spark/data/vectorized/VectorizedSparkOrcReaders.java b/spark/v3.3/spark/src/main/java/org/apache/iceberg/spark/data/vectorized/VectorizedSparkOrcReaders.java index 418c25993a7e..7c3b825a62e7 100644 --- a/spark/v3.3/spark/src/main/java/org/apache/iceberg/spark/data/vectorized/VectorizedSparkOrcReaders.java +++ b/spark/v3.3/spark/src/main/java/org/apache/iceberg/spark/data/vectorized/VectorizedSparkOrcReaders.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
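As the javadoc above notes, IcebergArrowColumnVector answers null checks from a NullabilityHolder rather than from the Arrow validity buffer. A minimal sketch of that idea (illustrative only, not the actual NullabilityHolder API):

    class NullabilitySketch {
      private final byte[] isNull; // one entry per row in the batch: 1 = null, 0 = set

      NullabilitySketch(int numRows) {
        this.isNull = new byte[numRows];
      }

      void setNull(int rowId) {
        isNull[rowId] = 1;
      }

      // The column vector consults this holder instead of decoding Arrow validity bits.
      boolean isNullAt(int rowId) {
        return isNull[rowId] == 1;
      }
    }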
*/ - package org.apache.iceberg.spark.data.vectorized; import java.util.List; @@ -47,23 +46,27 @@ public class VectorizedSparkOrcReaders { - private VectorizedSparkOrcReaders() { - } + private VectorizedSparkOrcReaders() {} - public static OrcBatchReader buildReader(Schema expectedSchema, TypeDescription fileSchema, - Map idToConstant) { - Converter converter = OrcSchemaWithTypeVisitor.visit(expectedSchema, fileSchema, new ReadBuilder(idToConstant)); + public static OrcBatchReader buildReader( + Schema expectedSchema, TypeDescription fileSchema, Map idToConstant) { + Converter converter = + OrcSchemaWithTypeVisitor.visit(expectedSchema, fileSchema, new ReadBuilder(idToConstant)); return new OrcBatchReader() { private long batchOffsetInFile; @Override public ColumnarBatch read(VectorizedRowBatch batch) { - BaseOrcColumnVector cv = (BaseOrcColumnVector) converter.convert(new StructColumnVector(batch.size, batch.cols), - batch.size, batchOffsetInFile); - ColumnarBatch columnarBatch = new ColumnarBatch(IntStream.range(0, expectedSchema.columns().size()) - .mapToObj(cv::getChild) - .toArray(ColumnVector[]::new)); + BaseOrcColumnVector cv = + (BaseOrcColumnVector) + converter.convert( + new StructColumnVector(batch.size, batch.cols), batch.size, batchOffsetInFile); + ColumnarBatch columnarBatch = + new ColumnarBatch( + IntStream.range(0, expectedSchema.columns().size()) + .mapToObj(cv::getChild) + .toArray(ColumnVector[]::new)); columnarBatch.setNumRows(batch.size); return columnarBatch; } @@ -76,8 +79,10 @@ public void setBatchContext(long batchOffsetInFile) { } private interface Converter { - ColumnVector convert(org.apache.orc.storage.ql.exec.vector.ColumnVector columnVector, int batchSize, - long batchOffsetInFile); + ColumnVector convert( + org.apache.orc.storage.ql.exec.vector.ColumnVector columnVector, + int batchSize, + long batchOffsetInFile); } private static class ReadBuilder extends OrcSchemaWithTypeVisitor { @@ -88,8 +93,11 @@ private ReadBuilder(Map idToConstant) { } @Override - public Converter record(Types.StructType iStruct, TypeDescription record, List names, - List fields) { + public Converter record( + Types.StructType iStruct, + TypeDescription record, + List names, + List fields) { return new StructConverter(iStruct, fields, idToConstant); } @@ -132,7 +140,8 @@ public Converter primitive(Type.PrimitiveType iPrimitive, TypeDescription primit primitiveValueReader = SparkOrcValueReaders.timestampTzs(); break; case DECIMAL: - primitiveValueReader = SparkOrcValueReaders.decimals(primitive.getPrecision(), primitive.getScale()); + primitiveValueReader = + SparkOrcValueReaders.decimals(primitive.getPrecision(), primitive.getScale()); break; case CHAR: case VARCHAR: @@ -146,7 +155,8 @@ public Converter primitive(Type.PrimitiveType iPrimitive, TypeDescription primit throw new IllegalArgumentException("Unhandled type " + primitive); } return (columnVector, batchSize, batchOffsetInFile) -> - new PrimitiveOrcColumnVector(iPrimitive, batchSize, columnVector, primitiveValueReader, batchOffsetInFile); + new PrimitiveOrcColumnVector( + iPrimitive, batchSize, columnVector, primitiveValueReader, batchOffsetInFile); } } @@ -155,15 +165,15 @@ private abstract static class BaseOrcColumnVector extends ColumnVector { private final int batchSize; private Integer numNulls; - BaseOrcColumnVector(Type type, int batchSize, org.apache.orc.storage.ql.exec.vector.ColumnVector vector) { + BaseOrcColumnVector( + Type type, int batchSize, org.apache.orc.storage.ql.exec.vector.ColumnVector vector) { 
super(SparkSchemaUtil.convert(type)); this.vector = vector; this.batchSize = batchSize; } @Override - public void close() { - } + public void close() {} @Override public boolean hasNull() { @@ -278,8 +288,12 @@ private static class PrimitiveOrcColumnVector extends BaseOrcColumnVector { private final OrcValueReader primitiveValueReader; private final long batchOffsetInFile; - PrimitiveOrcColumnVector(Type type, int batchSize, org.apache.orc.storage.ql.exec.vector.ColumnVector vector, - OrcValueReader primitiveValueReader, long batchOffsetInFile) { + PrimitiveOrcColumnVector( + Type type, + int batchSize, + org.apache.orc.storage.ql.exec.vector.ColumnVector vector, + OrcValueReader primitiveValueReader, + long batchOffsetInFile) { super(type, batchSize, vector); this.vector = vector; this.primitiveValueReader = primitiveValueReader; @@ -313,7 +327,8 @@ public double getDouble(int rowId) { @Override public Decimal getDecimal(int rowId, int precision, int scale) { - // TODO: Is it okay to assume that (precision,scale) parameters == (precision,scale) of the decimal type + // TODO: Is it okay to assume that (precision,scale) parameters == (precision,scale) of the + // decimal type // and return a Decimal with (precision,scale) of the decimal type? return (Decimal) primitiveValueReader.read(vector, rowId); } @@ -339,16 +354,20 @@ private ArrayConverter(Types.ListType listType, Converter elementConverter) { } @Override - public ColumnVector convert(org.apache.orc.storage.ql.exec.vector.ColumnVector vector, int batchSize, - long batchOffsetInFile) { + public ColumnVector convert( + org.apache.orc.storage.ql.exec.vector.ColumnVector vector, + int batchSize, + long batchOffsetInFile) { ListColumnVector listVector = (ListColumnVector) vector; - ColumnVector elementVector = elementConverter.convert(listVector.child, batchSize, batchOffsetInFile); + ColumnVector elementVector = + elementConverter.convert(listVector.child, batchSize, batchOffsetInFile); return new BaseOrcColumnVector(listType, batchSize, vector) { @Override public ColumnarArray getArray(int rowId) { int index = getRowIndex(rowId); - return new ColumnarArray(elementVector, (int) listVector.offsets[index], (int) listVector.lengths[index]); + return new ColumnarArray( + elementVector, (int) listVector.offsets[index], (int) listVector.lengths[index]); } }; } @@ -366,17 +385,23 @@ private MapConverter(Types.MapType mapType, Converter keyConverter, Converter va } @Override - public ColumnVector convert(org.apache.orc.storage.ql.exec.vector.ColumnVector vector, int batchSize, - long batchOffsetInFile) { + public ColumnVector convert( + org.apache.orc.storage.ql.exec.vector.ColumnVector vector, + int batchSize, + long batchOffsetInFile) { MapColumnVector mapVector = (MapColumnVector) vector; ColumnVector keyVector = keyConverter.convert(mapVector.keys, batchSize, batchOffsetInFile); - ColumnVector valueVector = valueConverter.convert(mapVector.values, batchSize, batchOffsetInFile); + ColumnVector valueVector = + valueConverter.convert(mapVector.values, batchSize, batchOffsetInFile); return new BaseOrcColumnVector(mapType, batchSize, vector) { @Override public ColumnarMap getMap(int rowId) { int index = getRowIndex(rowId); - return new ColumnarMap(keyVector, valueVector, (int) mapVector.offsets[index], + return new ColumnarMap( + keyVector, + valueVector, + (int) mapVector.offsets[index], (int) mapVector.lengths[index]); } }; @@ -388,30 +413,37 @@ private static class StructConverter implements Converter { private final List fieldConverters; 
private final Map idToConstant; - private StructConverter(Types.StructType structType, List fieldConverters, - Map idToConstant) { + private StructConverter( + Types.StructType structType, + List fieldConverters, + Map idToConstant) { this.structType = structType; this.fieldConverters = fieldConverters; this.idToConstant = idToConstant; } @Override - public ColumnVector convert(org.apache.orc.storage.ql.exec.vector.ColumnVector vector, int batchSize, - long batchOffsetInFile) { + public ColumnVector convert( + org.apache.orc.storage.ql.exec.vector.ColumnVector vector, + int batchSize, + long batchOffsetInFile) { StructColumnVector structVector = (StructColumnVector) vector; List fields = structType.fields(); List fieldVectors = Lists.newArrayListWithExpectedSize(fields.size()); for (int pos = 0, vectorIndex = 0; pos < fields.size(); pos += 1) { Types.NestedField field = fields.get(pos); if (idToConstant.containsKey(field.fieldId())) { - fieldVectors.add(new ConstantColumnVector(field.type(), batchSize, idToConstant.get(field.fieldId()))); + fieldVectors.add( + new ConstantColumnVector(field.type(), batchSize, idToConstant.get(field.fieldId()))); } else if (field.equals(MetadataColumns.ROW_POSITION)) { fieldVectors.add(new RowPositionColumnVector(batchOffsetInFile)); } else if (field.equals(MetadataColumns.IS_DELETED)) { fieldVectors.add(new ConstantColumnVector(field.type(), batchSize, false)); } else { - fieldVectors.add(fieldConverters.get(vectorIndex) - .convert(structVector.fields[vectorIndex], batchSize, batchOffsetInFile)); + fieldVectors.add( + fieldConverters + .get(vectorIndex) + .convert(structVector.fields[vectorIndex], batchSize, batchOffsetInFile)); vectorIndex++; } } diff --git a/spark/v3.3/spark/src/main/java/org/apache/iceberg/spark/data/vectorized/VectorizedSparkParquetReaders.java b/spark/v3.3/spark/src/main/java/org/apache/iceberg/spark/data/vectorized/VectorizedSparkParquetReaders.java index 020b35f52844..bf85bdb7ed05 100644 --- a/spark/v3.3/spark/src/main/java/org/apache/iceberg/spark/data/vectorized/VectorizedSparkParquetReaders.java +++ b/spark/v3.3/spark/src/main/java/org/apache/iceberg/spark/data/vectorized/VectorizedSparkParquetReaders.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.spark.data.vectorized; import java.util.List; @@ -33,13 +32,10 @@ public class VectorizedSparkParquetReaders { - private VectorizedSparkParquetReaders() { - } + private VectorizedSparkParquetReaders() {} public static ColumnarBatchReader buildReader( - Schema expectedSchema, - MessageType fileSchema, - boolean setArrowValidityVector) { + Schema expectedSchema, MessageType fileSchema, boolean setArrowValidityVector) { return buildReader(expectedSchema, fileSchema, setArrowValidityVector, Maps.newHashMap()); } @@ -49,33 +45,46 @@ public static ColumnarBatchReader buildReader( boolean setArrowValidityVector, Map idToConstant) { return (ColumnarBatchReader) - TypeWithSchemaVisitor.visit(expectedSchema.asStruct(), fileSchema, + TypeWithSchemaVisitor.visit( + expectedSchema.asStruct(), + fileSchema, new VectorizedReaderBuilder( - expectedSchema, fileSchema, setArrowValidityVector, - idToConstant, ColumnarBatchReader::new)); + expectedSchema, + fileSchema, + setArrowValidityVector, + idToConstant, + ColumnarBatchReader::new)); } - public static ColumnarBatchReader buildReader(Schema expectedSchema, - MessageType fileSchema, - boolean setArrowValidityVector, - Map idToConstant, - DeleteFilter deleteFilter) { + public static ColumnarBatchReader buildReader( + Schema expectedSchema, + MessageType fileSchema, + boolean setArrowValidityVector, + Map idToConstant, + DeleteFilter deleteFilter) { return (ColumnarBatchReader) - TypeWithSchemaVisitor.visit(expectedSchema.asStruct(), fileSchema, + TypeWithSchemaVisitor.visit( + expectedSchema.asStruct(), + fileSchema, new ReaderBuilder( - expectedSchema, fileSchema, setArrowValidityVector, - idToConstant, ColumnarBatchReader::new, deleteFilter)); + expectedSchema, + fileSchema, + setArrowValidityVector, + idToConstant, + ColumnarBatchReader::new, + deleteFilter)); } private static class ReaderBuilder extends VectorizedReaderBuilder { private final DeleteFilter deleteFilter; - ReaderBuilder(Schema expectedSchema, - MessageType parquetSchema, - boolean setArrowValidityVector, - Map idToConstant, - Function>, VectorizedReader> readerFactory, - DeleteFilter deleteFilter) { + ReaderBuilder( + Schema expectedSchema, + MessageType parquetSchema, + boolean setArrowValidityVector, + Map idToConstant, + Function>, VectorizedReader> readerFactory, + DeleteFilter deleteFilter) { super(expectedSchema, parquetSchema, setArrowValidityVector, idToConstant, readerFactory); this.deleteFilter = deleteFilter; } diff --git a/spark/v3.3/spark/src/main/java/org/apache/iceberg/spark/procedures/AddFilesProcedure.java b/spark/v3.3/spark/src/main/java/org/apache/iceberg/spark/procedures/AddFilesProcedure.java index a902887f2965..ea0ca4c5a9b9 100644 --- a/spark/v3.3/spark/src/main/java/org/apache/iceberg/spark/procedures/AddFilesProcedure.java +++ b/spark/v3.3/spark/src/main/java/org/apache/iceberg/spark/procedures/AddFilesProcedure.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
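Based only on the buildReader signatures introduced above, a caller in the Parquet scan path would wire the vectorized reader up roughly as follows; expectedSchema, fileSchema, idToConstant, and deleteFilter are assumed to come from the surrounding scan code and are not defined in this diff:

    ColumnarBatchReader batchReader =
        VectorizedSparkParquetReaders.buildReader(
            expectedSchema,   // Iceberg Schema expected by the query
            fileSchema,       // Parquet MessageType of the data file
            true,             // setArrowValidityVector
            idToConstant,     // constants for identity-partition and metadata columns
            deleteFilter);    // row-level delete filter; an overload without it also exists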
*/ - package org.apache.iceberg.spark.procedures; import java.util.Collections; @@ -56,16 +55,19 @@ class AddFilesProcedure extends BaseProcedure { private static final Joiner.MapJoiner MAP_JOINER = Joiner.on(",").withKeyValueSeparator("="); - private static final ProcedureParameter[] PARAMETERS = new ProcedureParameter[]{ - ProcedureParameter.required("table", DataTypes.StringType), - ProcedureParameter.required("source_table", DataTypes.StringType), - ProcedureParameter.optional("partition_filter", STRING_MAP), - ProcedureParameter.optional("check_duplicate_files", DataTypes.BooleanType) - }; - - private static final StructType OUTPUT_TYPE = new StructType(new StructField[]{ - new StructField("added_files_count", DataTypes.LongType, false, Metadata.empty()) - }); + private static final ProcedureParameter[] PARAMETERS = + new ProcedureParameter[] { + ProcedureParameter.required("table", DataTypes.StringType), + ProcedureParameter.required("source_table", DataTypes.StringType), + ProcedureParameter.optional("partition_filter", STRING_MAP), + ProcedureParameter.optional("check_duplicate_files", DataTypes.BooleanType) + }; + + private static final StructType OUTPUT_TYPE = + new StructType( + new StructField[] { + new StructField("added_files_count", DataTypes.LongType, false, Metadata.empty()) + }); private AddFilesProcedure(TableCatalog tableCatalog) { super(tableCatalog); @@ -95,15 +97,19 @@ public InternalRow[] call(InternalRow args) { Identifier tableIdent = toIdentifier(args.getString(0), PARAMETERS[0].name()); CatalogPlugin sessionCat = spark().sessionState().catalogManager().v2SessionCatalog(); - Identifier sourceIdent = toCatalogAndIdentifier(args.getString(1), PARAMETERS[1].name(), sessionCat).identifier(); + Identifier sourceIdent = + toCatalogAndIdentifier(args.getString(1), PARAMETERS[1].name(), sessionCat).identifier(); Map partitionFilter = Maps.newHashMap(); if (!args.isNullAt(2)) { - args.getMap(2).foreach(DataTypes.StringType, DataTypes.StringType, - (k, v) -> { - partitionFilter.put(k.toString(), v.toString()); - return BoxedUnit.UNIT; - }); + args.getMap(2) + .foreach( + DataTypes.StringType, + DataTypes.StringType, + (k, v) -> { + partitionFilter.put(k.toString(), v.toString()); + return BoxedUnit.UNIT; + }); } boolean checkDuplicateFiles; @@ -113,36 +119,42 @@ public InternalRow[] call(InternalRow args) { checkDuplicateFiles = args.getBoolean(3); } - long addedFilesCount = importToIceberg(tableIdent, sourceIdent, partitionFilter, checkDuplicateFiles); - return new InternalRow[]{newInternalRow(addedFilesCount)}; + long addedFilesCount = + importToIceberg(tableIdent, sourceIdent, partitionFilter, checkDuplicateFiles); + return new InternalRow[] {newInternalRow(addedFilesCount)}; } private boolean isFileIdentifier(Identifier ident) { String[] namespace = ident.namespace(); - return namespace.length == 1 && - (namespace[0].equalsIgnoreCase("orc") || - namespace[0].equalsIgnoreCase("parquet") || - namespace[0].equalsIgnoreCase("avro")); + return namespace.length == 1 + && (namespace[0].equalsIgnoreCase("orc") + || namespace[0].equalsIgnoreCase("parquet") + || namespace[0].equalsIgnoreCase("avro")); } - private long importToIceberg(Identifier destIdent, Identifier sourceIdent, Map partitionFilter, - boolean checkDuplicateFiles) { - return modifyIcebergTable(destIdent, table -> { - - validatePartitionSpec(table, partitionFilter); - ensureNameMappingPresent(table); - - if (isFileIdentifier(sourceIdent)) { - Path sourcePath = new Path(sourceIdent.name()); - String format = 
sourceIdent.namespace()[0]; - importFileTable(table, sourcePath, format, partitionFilter, checkDuplicateFiles); - } else { - importCatalogTable(table, sourceIdent, partitionFilter, checkDuplicateFiles); - } - - Snapshot snapshot = table.currentSnapshot(); - return Long.parseLong(snapshot.summary().getOrDefault(SnapshotSummary.ADDED_FILES_PROP, "0")); - }); + private long importToIceberg( + Identifier destIdent, + Identifier sourceIdent, + Map partitionFilter, + boolean checkDuplicateFiles) { + return modifyIcebergTable( + destIdent, + table -> { + validatePartitionSpec(table, partitionFilter); + ensureNameMappingPresent(table); + + if (isFileIdentifier(sourceIdent)) { + Path sourcePath = new Path(sourceIdent.name()); + String format = sourceIdent.namespace()[0]; + importFileTable(table, sourcePath, format, partitionFilter, checkDuplicateFiles); + } else { + importCatalogTable(table, sourceIdent, partitionFilter, checkDuplicateFiles); + } + + Snapshot snapshot = table.currentSnapshot(); + return Long.parseLong( + snapshot.summary().getOrDefault(SnapshotSummary.ADDED_FILES_PROP, "0")); + }); } private static void ensureNameMappingPresent(Table table) { @@ -150,46 +162,59 @@ private static void ensureNameMappingPresent(Table table) { // Forces Name based resolution instead of position based resolution NameMapping mapping = MappingUtil.create(table.schema()); String mappingJson = NameMappingParser.toJson(mapping); - table.updateProperties() - .set(TableProperties.DEFAULT_NAME_MAPPING, mappingJson) - .commit(); + table.updateProperties().set(TableProperties.DEFAULT_NAME_MAPPING, mappingJson).commit(); } } - private void importFileTable(Table table, Path tableLocation, String format, Map partitionFilter, - boolean checkDuplicateFiles) { + private void importFileTable( + Table table, + Path tableLocation, + String format, + Map partitionFilter, + boolean checkDuplicateFiles) { // List Partitions via Spark InMemory file search interface List partitions = Spark3Util.getPartitions(spark(), tableLocation, format, partitionFilter); if (table.spec().isUnpartitioned()) { - Preconditions.checkArgument(partitions.isEmpty(), "Cannot add partitioned files to an unpartitioned table"); - Preconditions.checkArgument(partitionFilter.isEmpty(), "Cannot use a partition filter when importing" + - "to an unpartitioned table"); + Preconditions.checkArgument( + partitions.isEmpty(), "Cannot add partitioned files to an unpartitioned table"); + Preconditions.checkArgument( + partitionFilter.isEmpty(), + "Cannot use a partition filter when importing" + "to an unpartitioned table"); // Build a Global Partition for the source - SparkPartition partition = new SparkPartition(Collections.emptyMap(), tableLocation.toString(), format); + SparkPartition partition = + new SparkPartition(Collections.emptyMap(), tableLocation.toString(), format); importPartitions(table, ImmutableList.of(partition), checkDuplicateFiles); } else { - Preconditions.checkArgument(!partitions.isEmpty(), - "Cannot find any matching partitions in table %s", partitions); + Preconditions.checkArgument( + !partitions.isEmpty(), "Cannot find any matching partitions in table %s", partitions); importPartitions(table, partitions, checkDuplicateFiles); } } - private void importCatalogTable(Table table, Identifier sourceIdent, Map partitionFilter, - boolean checkDuplicateFiles) { + private void importCatalogTable( + Table table, + Identifier sourceIdent, + Map partitionFilter, + boolean checkDuplicateFiles) { String stagingLocation = getMetadataLocation(table); 
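For context, AddFilesProcedure is exposed to users as a stored procedure invoked through Spark SQL. A hedged usage example follows, assuming the conventional add_files procedure name and placeholder catalog, table, and path names that do not appear in this diff; only the parameter names (table, source_table, partition_filter, check_duplicate_files) come from the definition above:

    import org.apache.spark.sql.SparkSession;

    class AddFilesCallExample {
      static void addParquetFiles(SparkSession spark) {
        // `parquet`.`<path>` matches the file-based source form accepted by isFileIdentifier above.
        spark.sql(
            "CALL my_catalog.system.add_files("
                + "table => 'db.target_tbl', "
                + "source_table => '`parquet`.`/path/to/files`', "
                + "check_duplicate_files => true)");
      }
    }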
TableIdentifier sourceTableIdentifier = Spark3Util.toV1TableIdentifier(sourceIdent); - SparkTableUtil.importSparkTable(spark(), sourceTableIdentifier, table, stagingLocation, partitionFilter, + SparkTableUtil.importSparkTable( + spark(), + sourceTableIdentifier, + table, + stagingLocation, + partitionFilter, checkDuplicateFiles); } - private void importPartitions(Table table, List partitions, - boolean checkDuplicateFiles) { + private void importPartitions( + Table table, List partitions, boolean checkDuplicateFiles) { String stagingLocation = getMetadataLocation(table); - SparkTableUtil.importSparkPartitions(spark(), partitions, table, table.spec(), stagingLocation, - checkDuplicateFiles); + SparkTableUtil.importSparkPartitions( + spark(), partitions, table, table.spec(), stagingLocation, checkDuplicateFiles); } private String getMetadataLocation(Table table) { @@ -204,38 +229,51 @@ public String description() { private void validatePartitionSpec(Table table, Map partitionFilter) { List partitionFields = table.spec().fields(); - Set partitionNames = table.spec().fields().stream().map(PartitionField::name).collect(Collectors.toSet()); + Set partitionNames = + table.spec().fields().stream().map(PartitionField::name).collect(Collectors.toSet()); boolean tablePartitioned = !partitionFields.isEmpty(); boolean partitionSpecPassed = !partitionFilter.isEmpty(); // Check for any non-identity partition columns - List nonIdentityFields = partitionFields.stream() - .filter(x -> !x.transform().isIdentity()) - .collect(Collectors.toList()); - Preconditions.checkArgument(nonIdentityFields.isEmpty(), - "Cannot add data files to target table %s because that table is partitioned and contains non-identity" + - "partition transforms which will not be compatible. Found non-identity fields %s", - table.name(), nonIdentityFields); + List nonIdentityFields = + partitionFields.stream() + .filter(x -> !x.transform().isIdentity()) + .collect(Collectors.toList()); + Preconditions.checkArgument( + nonIdentityFields.isEmpty(), + "Cannot add data files to target table %s because that table is partitioned and contains non-identity" + + "partition transforms which will not be compatible. Found non-identity fields %s", + table.name(), + nonIdentityFields); if (tablePartitioned && partitionSpecPassed) { // Check to see there are sufficient partition columns to satisfy the filter - Preconditions.checkArgument(partitionFields.size() >= partitionFilter.size(), - "Cannot add data files to target table %s because that table is partitioned, " + - "but the number of columns in the provided partition filter (%s) " + - "is greater than the number of partitioned columns in table (%s)", - table.name(), partitionFilter.size(), partitionFields.size()); + Preconditions.checkArgument( + partitionFields.size() >= partitionFilter.size(), + "Cannot add data files to target table %s because that table is partitioned, " + + "but the number of columns in the provided partition filter (%s) " + + "is greater than the number of partitioned columns in table (%s)", + table.name(), + partitionFilter.size(), + partitionFields.size()); // Check for any filters of non existent columns - List unMatchedFilters = partitionFilter.keySet().stream() - .filter(filterName -> !partitionNames.contains(filterName)) - .collect(Collectors.toList()); - Preconditions.checkArgument(unMatchedFilters.isEmpty(), - "Cannot add files to target table %s. %s is partitioned but the specified partition filter " + - "refers to columns that are not partitioned: '%s' . 
Valid partition columns %s", - table.name(), table.name(), unMatchedFilters, String.join(",", partitionNames)); + List unMatchedFilters = + partitionFilter.keySet().stream() + .filter(filterName -> !partitionNames.contains(filterName)) + .collect(Collectors.toList()); + Preconditions.checkArgument( + unMatchedFilters.isEmpty(), + "Cannot add files to target table %s. %s is partitioned but the specified partition filter " + + "refers to columns that are not partitioned: '%s' . Valid partition columns %s", + table.name(), + table.name(), + unMatchedFilters, + String.join(",", partitionNames)); } else { - Preconditions.checkArgument(!partitionSpecPassed, + Preconditions.checkArgument( + !partitionSpecPassed, "Cannot use partition filter with an unpartitioned table %s", table.name()); } diff --git a/spark/v3.3/spark/src/main/java/org/apache/iceberg/spark/procedures/AncestorsOfProcedure.java b/spark/v3.3/spark/src/main/java/org/apache/iceberg/spark/procedures/AncestorsOfProcedure.java index bfbca05a5744..60d6247411b6 100644 --- a/spark/v3.3/spark/src/main/java/org/apache/iceberg/spark/procedures/AncestorsOfProcedure.java +++ b/spark/v3.3/spark/src/main/java/org/apache/iceberg/spark/procedures/AncestorsOfProcedure.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.procedures; import java.util.List; @@ -35,15 +34,18 @@ public class AncestorsOfProcedure extends BaseProcedure { - private static final ProcedureParameter[] PARAMETERS = new ProcedureParameter[] { - ProcedureParameter.required("table", DataTypes.StringType), - ProcedureParameter.optional("snapshot_id", DataTypes.LongType), + private static final ProcedureParameter[] PARAMETERS = + new ProcedureParameter[] { + ProcedureParameter.required("table", DataTypes.StringType), + ProcedureParameter.optional("snapshot_id", DataTypes.LongType), }; - private static final StructType OUTPUT_TYPE = new StructType(new StructField[] { - new StructField("snapshot_id", DataTypes.LongType, true, Metadata.empty()), - new StructField("timestamp", DataTypes.LongType, true, Metadata.empty()) - }); + private static final StructType OUTPUT_TYPE = + new StructType( + new StructField[] { + new StructField("snapshot_id", DataTypes.LongType, true, Metadata.empty()), + new StructField("timestamp", DataTypes.LongType, true, Metadata.empty()) + }); private AncestorsOfProcedure(TableCatalog tableCatalog) { super(tableCatalog); @@ -77,11 +79,13 @@ public InternalRow[] call(InternalRow args) { Table icebergTable = sparkTable.table(); if (toSnapshotId == null) { - toSnapshotId = icebergTable.currentSnapshot() != null ? icebergTable.currentSnapshot().snapshotId() : -1; + toSnapshotId = + icebergTable.currentSnapshot() != null ? 
icebergTable.currentSnapshot().snapshotId() : -1; } - List snapshotIds = Lists.newArrayList( - SnapshotUtil.ancestorIdsBetween(toSnapshotId, null, icebergTable::snapshot)); + List snapshotIds = + Lists.newArrayList( + SnapshotUtil.ancestorIdsBetween(toSnapshotId, null, icebergTable::snapshot)); return toOutputRow(icebergTable, snapshotIds); } diff --git a/spark/v3.3/spark/src/main/java/org/apache/iceberg/spark/procedures/BaseProcedure.java b/spark/v3.3/spark/src/main/java/org/apache/iceberg/spark/procedures/BaseProcedure.java index 3bb936e32dcb..86364dc262b2 100644 --- a/spark/v3.3/spark/src/main/java/org/apache/iceberg/spark/procedures/BaseProcedure.java +++ b/spark/v3.3/spark/src/main/java/org/apache/iceberg/spark/procedures/BaseProcedure.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.procedures; import java.util.concurrent.ExecutorService; @@ -48,7 +47,8 @@ import scala.Option; abstract class BaseProcedure implements Procedure { - protected static final DataType STRING_MAP = DataTypes.createMapType(DataTypes.StringType, DataTypes.StringType); + protected static final DataType STRING_MAP = + DataTypes.createMapType(DataTypes.StringType, DataTypes.StringType); private final SparkSession spark; private final TableCatalog tableCatalog; @@ -92,7 +92,8 @@ protected T withIcebergTable(Identifier ident, Function T execute(Identifier ident, boolean refreshSparkCache, Function func) { + private T execute( + Identifier ident, boolean refreshSparkCache, Function func) { SparkTable sparkTable = loadSparkTable(ident); org.apache.iceberg.Table icebergTable = sparkTable.table(); @@ -106,38 +107,47 @@ private T execute(Identifier ident, boolean refreshSparkCache, Function - * Note: this procedure invalidates all cached Spark plans that reference the affected table. + * A procedure that applies changes in a given snapshot and creates a new snapshot which will be set + * as the current snapshot in a table. + * + *
<p>
    Note: this procedure invalidates all cached Spark plans that reference the affected + * table. * * @see org.apache.iceberg.ManageSnapshots#cherrypick(long) */ class CherrypickSnapshotProcedure extends BaseProcedure { - private static final ProcedureParameter[] PARAMETERS = new ProcedureParameter[]{ - ProcedureParameter.required("table", DataTypes.StringType), - ProcedureParameter.required("snapshot_id", DataTypes.LongType) - }; + private static final ProcedureParameter[] PARAMETERS = + new ProcedureParameter[] { + ProcedureParameter.required("table", DataTypes.StringType), + ProcedureParameter.required("snapshot_id", DataTypes.LongType) + }; - private static final StructType OUTPUT_TYPE = new StructType(new StructField[]{ - new StructField("source_snapshot_id", DataTypes.LongType, false, Metadata.empty()), - new StructField("current_snapshot_id", DataTypes.LongType, false, Metadata.empty()) - }); + private static final StructType OUTPUT_TYPE = + new StructType( + new StructField[] { + new StructField("source_snapshot_id", DataTypes.LongType, false, Metadata.empty()), + new StructField("current_snapshot_id", DataTypes.LongType, false, Metadata.empty()) + }); public static ProcedureBuilder builder() { return new BaseProcedure.Builder() { @@ -78,16 +81,16 @@ public InternalRow[] call(InternalRow args) { Identifier tableIdent = toIdentifier(args.getString(0), PARAMETERS[0].name()); long snapshotId = args.getLong(1); - return modifyIcebergTable(tableIdent, table -> { - table.manageSnapshots() - .cherrypick(snapshotId) - .commit(); + return modifyIcebergTable( + tableIdent, + table -> { + table.manageSnapshots().cherrypick(snapshotId).commit(); - Snapshot currentSnapshot = table.currentSnapshot(); + Snapshot currentSnapshot = table.currentSnapshot(); - InternalRow outputRow = newInternalRow(snapshotId, currentSnapshot.snapshotId()); - return new InternalRow[]{outputRow}; - }); + InternalRow outputRow = newInternalRow(snapshotId, currentSnapshot.snapshotId()); + return new InternalRow[] {outputRow}; + }); } @Override diff --git a/spark/v3.3/spark/src/main/java/org/apache/iceberg/spark/procedures/ExpireSnapshotsProcedure.java b/spark/v3.3/spark/src/main/java/org/apache/iceberg/spark/procedures/ExpireSnapshotsProcedure.java index 272cacc4d438..69cab80a51bc 100644 --- a/spark/v3.3/spark/src/main/java/org/apache/iceberg/spark/procedures/ExpireSnapshotsProcedure.java +++ b/spark/v3.3/spark/src/main/java/org/apache/iceberg/spark/procedures/ExpireSnapshotsProcedure.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
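The CherrypickSnapshotProcedure reformatted above takes the `table` and `snapshot_id` parameters declared in its PARAMETERS array and returns the source and current snapshot ids per OUTPUT_TYPE. A hedged usage sketch, assuming a SparkSession `spark` as in the earlier add_files sketch and using placeholder identifiers:

// Illustrative only: `demo`, `db.tbl` and the snapshot id are hypothetical values.
spark.sql("CALL demo.system.cherrypick_snapshot(table => 'db.tbl', snapshot_id => 123)");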
*/ - package org.apache.iceberg.spark.procedures; import org.apache.iceberg.Table; @@ -42,21 +41,28 @@ */ public class ExpireSnapshotsProcedure extends BaseProcedure { - private static final ProcedureParameter[] PARAMETERS = new ProcedureParameter[] { - ProcedureParameter.required("table", DataTypes.StringType), - ProcedureParameter.optional("older_than", DataTypes.TimestampType), - ProcedureParameter.optional("retain_last", DataTypes.IntegerType), - ProcedureParameter.optional("max_concurrent_deletes", DataTypes.IntegerType), - ProcedureParameter.optional("stream_results", DataTypes.BooleanType) - }; - - private static final StructType OUTPUT_TYPE = new StructType(new StructField[]{ - new StructField("deleted_data_files_count", DataTypes.LongType, true, Metadata.empty()), - new StructField("deleted_position_delete_files_count", DataTypes.LongType, true, Metadata.empty()), - new StructField("deleted_equality_delete_files_count", DataTypes.LongType, true, Metadata.empty()), - new StructField("deleted_manifest_files_count", DataTypes.LongType, true, Metadata.empty()), - new StructField("deleted_manifest_lists_count", DataTypes.LongType, true, Metadata.empty()) - }); + private static final ProcedureParameter[] PARAMETERS = + new ProcedureParameter[] { + ProcedureParameter.required("table", DataTypes.StringType), + ProcedureParameter.optional("older_than", DataTypes.TimestampType), + ProcedureParameter.optional("retain_last", DataTypes.IntegerType), + ProcedureParameter.optional("max_concurrent_deletes", DataTypes.IntegerType), + ProcedureParameter.optional("stream_results", DataTypes.BooleanType) + }; + + private static final StructType OUTPUT_TYPE = + new StructType( + new StructField[] { + new StructField("deleted_data_files_count", DataTypes.LongType, true, Metadata.empty()), + new StructField( + "deleted_position_delete_files_count", DataTypes.LongType, true, Metadata.empty()), + new StructField( + "deleted_equality_delete_files_count", DataTypes.LongType, true, Metadata.empty()), + new StructField( + "deleted_manifest_files_count", DataTypes.LongType, true, Metadata.empty()), + new StructField( + "deleted_manifest_lists_count", DataTypes.LongType, true, Metadata.empty()) + }); public static ProcedureBuilder builder() { return new BaseProcedure.Builder() { @@ -89,43 +95,47 @@ public InternalRow[] call(InternalRow args) { Integer maxConcurrentDeletes = args.isNullAt(3) ? null : args.getInt(3); Boolean streamResult = args.isNullAt(4) ? 
null : args.getBoolean(4); - Preconditions.checkArgument(maxConcurrentDeletes == null || maxConcurrentDeletes > 0, + Preconditions.checkArgument( + maxConcurrentDeletes == null || maxConcurrentDeletes > 0, "max_concurrent_deletes should have value > 0, value: " + maxConcurrentDeletes); - return modifyIcebergTable(tableIdent, table -> { - ExpireSnapshots action = actions().expireSnapshots(table); + return modifyIcebergTable( + tableIdent, + table -> { + ExpireSnapshots action = actions().expireSnapshots(table); - if (olderThanMillis != null) { - action.expireOlderThan(olderThanMillis); - } + if (olderThanMillis != null) { + action.expireOlderThan(olderThanMillis); + } - if (retainLastNum != null) { - action.retainLast(retainLastNum); - } + if (retainLastNum != null) { + action.retainLast(retainLastNum); + } - if (maxConcurrentDeletes != null && maxConcurrentDeletes > 0) { - action.executeDeleteWith(executorService(maxConcurrentDeletes, "expire-snapshots")); - } + if (maxConcurrentDeletes != null && maxConcurrentDeletes > 0) { + action.executeDeleteWith(executorService(maxConcurrentDeletes, "expire-snapshots")); + } - if (streamResult != null) { - action.option(ExpireSnapshotsSparkAction.STREAM_RESULTS, Boolean.toString(streamResult)); - } + if (streamResult != null) { + action.option( + ExpireSnapshotsSparkAction.STREAM_RESULTS, Boolean.toString(streamResult)); + } - ExpireSnapshots.Result result = action.execute(); + ExpireSnapshots.Result result = action.execute(); - return toOutputRows(result); - }); + return toOutputRows(result); + }); } private InternalRow[] toOutputRows(ExpireSnapshots.Result result) { - InternalRow row = newInternalRow( - result.deletedDataFilesCount(), - result.deletedPositionDeleteFilesCount(), - result.deletedEqualityDeleteFilesCount(), - result.deletedManifestsCount(), - result.deletedManifestListsCount() - ); - return new InternalRow[]{row}; + InternalRow row = + newInternalRow( + result.deletedDataFilesCount(), + result.deletedPositionDeleteFilesCount(), + result.deletedEqualityDeleteFilesCount(), + result.deletedManifestsCount(), + result.deletedManifestListsCount()); + return new InternalRow[] {row}; } @Override diff --git a/spark/v3.3/spark/src/main/java/org/apache/iceberg/spark/procedures/MigrateTableProcedure.java b/spark/v3.3/spark/src/main/java/org/apache/iceberg/spark/procedures/MigrateTableProcedure.java index 2f6841924f8c..a49dd7d526b0 100644 --- a/spark/v3.3/spark/src/main/java/org/apache/iceberg/spark/procedures/MigrateTableProcedure.java +++ b/spark/v3.3/spark/src/main/java/org/apache/iceberg/spark/procedures/MigrateTableProcedure.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
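The ExpireSnapshotsProcedure above wires its optional arguments (older_than, retain_last, max_concurrent_deletes, stream_results) into the ExpireSnapshots action. A hedged sketch of a call exercising them, with placeholder identifiers and an assumed SparkSession `spark`:

// Illustrative only: retains the last 5 snapshots, expires older ones, and deletes with 4 threads,
// mirroring the optional parameters declared above.
spark.sql(
    "CALL demo.system.expire_snapshots("
        + "table => 'db.tbl', "
        + "older_than => TIMESTAMP '2022-06-01 00:00:00', "
        + "retain_last => 5, "
        + "max_concurrent_deletes => 4, "
        + "stream_results => true)");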
*/ - package org.apache.iceberg.spark.procedures; import java.util.Map; @@ -35,14 +34,17 @@ import scala.runtime.BoxedUnit; class MigrateTableProcedure extends BaseProcedure { - private static final ProcedureParameter[] PARAMETERS = new ProcedureParameter[]{ - ProcedureParameter.required("table", DataTypes.StringType), - ProcedureParameter.optional("properties", STRING_MAP) - }; + private static final ProcedureParameter[] PARAMETERS = + new ProcedureParameter[] { + ProcedureParameter.required("table", DataTypes.StringType), + ProcedureParameter.optional("properties", STRING_MAP) + }; - private static final StructType OUTPUT_TYPE = new StructType(new StructField[]{ - new StructField("migrated_files_count", DataTypes.LongType, false, Metadata.empty()) - }); + private static final StructType OUTPUT_TYPE = + new StructType( + new StructField[] { + new StructField("migrated_files_count", DataTypes.LongType, false, Metadata.empty()) + }); private MigrateTableProcedure(TableCatalog tableCatalog) { super(tableCatalog); @@ -70,19 +72,24 @@ public StructType outputType() { @Override public InternalRow[] call(InternalRow args) { String tableName = args.getString(0); - Preconditions.checkArgument(tableName != null && !tableName.isEmpty(), + Preconditions.checkArgument( + tableName != null && !tableName.isEmpty(), "Cannot handle an empty identifier for argument table"); Map properties = Maps.newHashMap(); if (!args.isNullAt(1)) { - args.getMap(1).foreach(DataTypes.StringType, DataTypes.StringType, - (k, v) -> { - properties.put(k.toString(), v.toString()); - return BoxedUnit.UNIT; - }); + args.getMap(1) + .foreach( + DataTypes.StringType, + DataTypes.StringType, + (k, v) -> { + properties.put(k.toString(), v.toString()); + return BoxedUnit.UNIT; + }); } - MigrateTable.Result result = SparkActions.get().migrateTable(tableName).tableProperties(properties).execute(); + MigrateTable.Result result = + SparkActions.get().migrateTable(tableName).tableProperties(properties).execute(); return new InternalRow[] {newInternalRow(result.migratedDataFilesCount())}; } diff --git a/spark/v3.3/spark/src/main/java/org/apache/iceberg/spark/procedures/PublishChangesProcedure.java b/spark/v3.3/spark/src/main/java/org/apache/iceberg/spark/procedures/PublishChangesProcedure.java index 66f4f471fe49..eb6c762ed51e 100644 --- a/spark/v3.3/spark/src/main/java/org/apache/iceberg/spark/procedures/PublishChangesProcedure.java +++ b/spark/v3.3/spark/src/main/java/org/apache/iceberg/spark/procedures/PublishChangesProcedure.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.procedures; import java.util.Optional; @@ -35,24 +34,28 @@ import org.apache.spark.sql.types.StructType; /** - * A procedure that applies changes in a snapshot created within a Write-Audit-Publish workflow with a wap_id and - * creates a new snapshot which will be set as the current snapshot in a table. - *
<p>
    - * Note: this procedure invalidates all cached Spark plans that reference the affected table. + * A procedure that applies changes in a snapshot created within a Write-Audit-Publish workflow with + * a wap_id and creates a new snapshot which will be set as the current snapshot in a table. + * + *
<p>
    Note: this procedure invalidates all cached Spark plans that reference the affected + * table. * * @see org.apache.iceberg.ManageSnapshots#cherrypick(long) */ class PublishChangesProcedure extends BaseProcedure { - private static final ProcedureParameter[] PARAMETERS = new ProcedureParameter[]{ - ProcedureParameter.required("table", DataTypes.StringType), - ProcedureParameter.required("wap_id", DataTypes.StringType) - }; + private static final ProcedureParameter[] PARAMETERS = + new ProcedureParameter[] { + ProcedureParameter.required("table", DataTypes.StringType), + ProcedureParameter.required("wap_id", DataTypes.StringType) + }; - private static final StructType OUTPUT_TYPE = new StructType(new StructField[]{ - new StructField("source_snapshot_id", DataTypes.LongType, false, Metadata.empty()), - new StructField("current_snapshot_id", DataTypes.LongType, false, Metadata.empty()) - }); + private static final StructType OUTPUT_TYPE = + new StructType( + new StructField[] { + new StructField("source_snapshot_id", DataTypes.LongType, false, Metadata.empty()), + new StructField("current_snapshot_id", DataTypes.LongType, false, Metadata.empty()) + }); public static ProcedureBuilder builder() { return new Builder() { @@ -82,23 +85,27 @@ public InternalRow[] call(InternalRow args) { Identifier tableIdent = toIdentifier(args.getString(0), PARAMETERS[0].name()); String wapId = args.getString(1); - return modifyIcebergTable(tableIdent, table -> { - Optional wapSnapshot = Optional.ofNullable( - Iterables.find(table.snapshots(), snapshot -> wapId.equals(WapUtil.stagedWapId(snapshot)), null)); - if (!wapSnapshot.isPresent()) { - throw new ValidationException(String.format("Cannot apply unknown WAP ID '%s'", wapId)); - } + return modifyIcebergTable( + tableIdent, + table -> { + Optional wapSnapshot = + Optional.ofNullable( + Iterables.find( + table.snapshots(), + snapshot -> wapId.equals(WapUtil.stagedWapId(snapshot)), + null)); + if (!wapSnapshot.isPresent()) { + throw new ValidationException(String.format("Cannot apply unknown WAP ID '%s'", wapId)); + } - long wapSnapshotId = wapSnapshot.get().snapshotId(); - table.manageSnapshots() - .cherrypick(wapSnapshotId) - .commit(); + long wapSnapshotId = wapSnapshot.get().snapshotId(); + table.manageSnapshots().cherrypick(wapSnapshotId).commit(); - Snapshot currentSnapshot = table.currentSnapshot(); + Snapshot currentSnapshot = table.currentSnapshot(); - InternalRow outputRow = newInternalRow(wapSnapshotId, currentSnapshot.snapshotId()); - return new InternalRow[] {outputRow}; - }); + InternalRow outputRow = newInternalRow(wapSnapshotId, currentSnapshot.snapshotId()); + return new InternalRow[] {outputRow}; + }); } @Override diff --git a/spark/v3.3/spark/src/main/java/org/apache/iceberg/spark/procedures/RegisterTableProcedure.java b/spark/v3.3/spark/src/main/java/org/apache/iceberg/spark/procedures/RegisterTableProcedure.java index 85e374d4800f..857949e052c8 100644 --- a/spark/v3.3/spark/src/main/java/org/apache/iceberg/spark/procedures/RegisterTableProcedure.java +++ b/spark/v3.3/spark/src/main/java/org/apache/iceberg/spark/procedures/RegisterTableProcedure.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
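The MigrateTableProcedure and PublishChangesProcedure reformatted above accept, respectively, a table name plus an optional properties map, and a table name plus a wap_id. A hedged sketch of both calls, assuming a SparkSession `spark` and placeholder identifiers:

// Illustrative only: migrates an existing Spark table in place, forwarding table properties
// through the STRING_MAP argument handled above.
spark.sql(
    "CALL demo.system.migrate(table => 'db.parquet_tbl', properties => map('key', 'value'))");

// Illustrative only: publishes a staged Write-Audit-Publish snapshot by its wap_id.
spark.sql("CALL demo.system.publish_changes(table => 'db.tbl', wap_id => 'audit-2022-06-01')");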
*/ - package org.apache.iceberg.spark.procedures; import org.apache.iceberg.Snapshot; @@ -37,16 +36,19 @@ import org.apache.spark.sql.types.StructType; class RegisterTableProcedure extends BaseProcedure { - private static final ProcedureParameter[] PARAMETERS = new ProcedureParameter[]{ - ProcedureParameter.required("table", DataTypes.StringType), - ProcedureParameter.required("metadata_file", DataTypes.StringType) - }; + private static final ProcedureParameter[] PARAMETERS = + new ProcedureParameter[] { + ProcedureParameter.required("table", DataTypes.StringType), + ProcedureParameter.required("metadata_file", DataTypes.StringType) + }; - private static final StructType OUTPUT_TYPE = new StructType(new StructField[]{ - new StructField("current_snapshot_id", DataTypes.LongType, true, Metadata.empty()), - new StructField("total_records_count", DataTypes.LongType, true, Metadata.empty()), - new StructField("total_data_files_count", DataTypes.LongType, true, Metadata.empty()) - }); + private static final StructType OUTPUT_TYPE = + new StructType( + new StructField[] { + new StructField("current_snapshot_id", DataTypes.LongType, true, Metadata.empty()), + new StructField("total_records_count", DataTypes.LongType, true, Metadata.empty()), + new StructField("total_data_files_count", DataTypes.LongType, true, Metadata.empty()) + }); private RegisterTableProcedure(TableCatalog tableCatalog) { super(tableCatalog); @@ -73,11 +75,14 @@ public StructType outputType() { @Override public InternalRow[] call(InternalRow args) { - TableIdentifier tableName = Spark3Util.identifierToTableIdentifier(toIdentifier(args.getString(0), "table")); + TableIdentifier tableName = + Spark3Util.identifierToTableIdentifier(toIdentifier(args.getString(0), "table")); String metadataFile = args.getString(1); - Preconditions.checkArgument(tableCatalog() instanceof HasIcebergCatalog, + Preconditions.checkArgument( + tableCatalog() instanceof HasIcebergCatalog, "Cannot use Register Table in a non-Iceberg catalog"); - Preconditions.checkArgument(metadataFile != null && !metadataFile.isEmpty(), + Preconditions.checkArgument( + metadataFile != null && !metadataFile.isEmpty(), "Cannot handle an empty argument metadata_file"); Catalog icebergCatalog = ((HasIcebergCatalog) tableCatalog()).icebergCatalog(); @@ -89,8 +94,10 @@ public InternalRow[] call(InternalRow args) { Snapshot currentSnapshot = table.currentSnapshot(); if (currentSnapshot != null) { currentSnapshotId = currentSnapshot.snapshotId(); - totalDataFiles = Long.parseLong(currentSnapshot.summary().get(SnapshotSummary.TOTAL_DATA_FILES_PROP)); - totalRecords = Long.parseLong(currentSnapshot.summary().get(SnapshotSummary.TOTAL_RECORDS_PROP)); + totalDataFiles = + Long.parseLong(currentSnapshot.summary().get(SnapshotSummary.TOTAL_DATA_FILES_PROP)); + totalRecords = + Long.parseLong(currentSnapshot.summary().get(SnapshotSummary.TOTAL_RECORDS_PROP)); } return new InternalRow[] {newInternalRow(currentSnapshotId, totalRecords, totalDataFiles)}; @@ -101,4 +108,3 @@ public String description() { return "RegisterTableProcedure"; } } - diff --git a/spark/v3.3/spark/src/main/java/org/apache/iceberg/spark/procedures/RemoveOrphanFilesProcedure.java b/spark/v3.3/spark/src/main/java/org/apache/iceberg/spark/procedures/RemoveOrphanFilesProcedure.java index a64a28b6310f..00b68c289507 100644 --- a/spark/v3.3/spark/src/main/java/org/apache/iceberg/spark/procedures/RemoveOrphanFilesProcedure.java +++ 
b/spark/v3.3/spark/src/main/java/org/apache/iceberg/spark/procedures/RemoveOrphanFilesProcedure.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.procedures; import java.util.concurrent.TimeUnit; @@ -45,18 +44,21 @@ */ public class RemoveOrphanFilesProcedure extends BaseProcedure { - private static final ProcedureParameter[] PARAMETERS = new ProcedureParameter[]{ - ProcedureParameter.required("table", DataTypes.StringType), - ProcedureParameter.optional("older_than", DataTypes.TimestampType), - ProcedureParameter.optional("location", DataTypes.StringType), - ProcedureParameter.optional("dry_run", DataTypes.BooleanType), - ProcedureParameter.optional("max_concurrent_deletes", DataTypes.IntegerType), - ProcedureParameter.optional("file_list_view", DataTypes.StringType) - }; - - private static final StructType OUTPUT_TYPE = new StructType(new StructField[]{ - new StructField("orphan_file_location", DataTypes.StringType, false, Metadata.empty()) - }); + private static final ProcedureParameter[] PARAMETERS = + new ProcedureParameter[] { + ProcedureParameter.required("table", DataTypes.StringType), + ProcedureParameter.optional("older_than", DataTypes.TimestampType), + ProcedureParameter.optional("location", DataTypes.StringType), + ProcedureParameter.optional("dry_run", DataTypes.BooleanType), + ProcedureParameter.optional("max_concurrent_deletes", DataTypes.IntegerType), + ProcedureParameter.optional("file_list_view", DataTypes.StringType) + }; + + private static final StructType OUTPUT_TYPE = + new StructType( + new StructField[] { + new StructField("orphan_file_location", DataTypes.StringType, false, Metadata.empty()) + }); public static ProcedureBuilder builder() { return new BaseProcedure.Builder() { @@ -91,40 +93,43 @@ public InternalRow[] call(InternalRow args) { Integer maxConcurrentDeletes = args.isNullAt(4) ? null : args.getInt(4); String fileListView = args.isNullAt(5) ? 
null : args.getString(5); - Preconditions.checkArgument(maxConcurrentDeletes == null || maxConcurrentDeletes > 0, - "max_concurrent_deletes should have value > 0, value: " + maxConcurrentDeletes); + Preconditions.checkArgument( + maxConcurrentDeletes == null || maxConcurrentDeletes > 0, + "max_concurrent_deletes should have value > 0, value: " + maxConcurrentDeletes); - return withIcebergTable(tableIdent, table -> { - DeleteOrphanFilesSparkAction action = actions().deleteOrphanFiles(table); + return withIcebergTable( + tableIdent, + table -> { + DeleteOrphanFilesSparkAction action = actions().deleteOrphanFiles(table); - if (olderThanMillis != null) { - boolean isTesting = Boolean.parseBoolean(spark().conf().get("spark.testing", "false")); - if (!isTesting) { - validateInterval(olderThanMillis); - } - action.olderThan(olderThanMillis); - } + if (olderThanMillis != null) { + boolean isTesting = Boolean.parseBoolean(spark().conf().get("spark.testing", "false")); + if (!isTesting) { + validateInterval(olderThanMillis); + } + action.olderThan(olderThanMillis); + } - if (location != null) { - action.location(location); - } + if (location != null) { + action.location(location); + } - if (dryRun) { - action.deleteWith(file -> { }); - } + if (dryRun) { + action.deleteWith(file -> {}); + } - if (maxConcurrentDeletes != null && maxConcurrentDeletes > 0) { - action.executeDeleteWith(executorService(maxConcurrentDeletes, "remove-orphans")); - } + if (maxConcurrentDeletes != null && maxConcurrentDeletes > 0) { + action.executeDeleteWith(executorService(maxConcurrentDeletes, "remove-orphans")); + } - if (fileListView != null) { - action.compareToFileList(spark().table(fileListView)); - } + if (fileListView != null) { + action.compareToFileList(spark().table(fileListView)); + } - DeleteOrphanFiles.Result result = action.execute(); + DeleteOrphanFiles.Result result = action.execute(); - return toOutputRows(result); - }); + return toOutputRows(result); + }); } private InternalRow[] toOutputRows(DeleteOrphanFiles.Result result) { @@ -146,11 +151,11 @@ private void validateInterval(long olderThanMillis) { long intervalMillis = System.currentTimeMillis() - olderThanMillis; if (intervalMillis < TimeUnit.DAYS.toMillis(1)) { throw new IllegalArgumentException( - "Cannot remove orphan files with an interval less than 24 hours. Executing this " + - "procedure with a short interval may corrupt the table if other operations are happening " + - "at the same time. If you are absolutely confident that no concurrent operations will be " + - "affected by removing orphan files with such a short interval, you can use the Action API " + - "to remove orphan files with an arbitrary interval."); + "Cannot remove orphan files with an interval less than 24 hours. Executing this " + + "procedure with a short interval may corrupt the table if other operations are happening " + + "at the same time. 
If you are absolutely confident that no concurrent operations will be " + + "affected by removing orphan files with such a short interval, you can use the Action API " + + "to remove orphan files with an arbitrary interval."); } } diff --git a/spark/v3.3/spark/src/main/java/org/apache/iceberg/spark/procedures/RewriteDataFilesProcedure.java b/spark/v3.3/spark/src/main/java/org/apache/iceberg/spark/procedures/RewriteDataFilesProcedure.java index 5c71dbf55529..a0636a75b950 100644 --- a/spark/v3.3/spark/src/main/java/org/apache/iceberg/spark/procedures/RewriteDataFilesProcedure.java +++ b/spark/v3.3/spark/src/main/java/org/apache/iceberg/spark/procedures/RewriteDataFilesProcedure.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.procedures; import java.util.List; @@ -52,19 +51,24 @@ */ class RewriteDataFilesProcedure extends BaseProcedure { - private static final ProcedureParameter[] PARAMETERS = new ProcedureParameter[]{ - ProcedureParameter.required("table", DataTypes.StringType), - ProcedureParameter.optional("strategy", DataTypes.StringType), - ProcedureParameter.optional("sort_order", DataTypes.StringType), - ProcedureParameter.optional("options", STRING_MAP), - ProcedureParameter.optional("where", DataTypes.StringType) - }; + private static final ProcedureParameter[] PARAMETERS = + new ProcedureParameter[] { + ProcedureParameter.required("table", DataTypes.StringType), + ProcedureParameter.optional("strategy", DataTypes.StringType), + ProcedureParameter.optional("sort_order", DataTypes.StringType), + ProcedureParameter.optional("options", STRING_MAP), + ProcedureParameter.optional("where", DataTypes.StringType) + }; // counts are not nullable since the action result is never null - private static final StructType OUTPUT_TYPE = new StructType(new StructField[]{ - new StructField("rewritten_data_files_count", DataTypes.IntegerType, false, Metadata.empty()), - new StructField("added_data_files_count", DataTypes.IntegerType, false, Metadata.empty()) - }); + private static final StructType OUTPUT_TYPE = + new StructType( + new StructField[] { + new StructField( + "rewritten_data_files_count", DataTypes.IntegerType, false, Metadata.empty()), + new StructField( + "added_data_files_count", DataTypes.IntegerType, false, Metadata.empty()) + }); public static ProcedureBuilder builder() { return new Builder() { @@ -93,35 +97,40 @@ public StructType outputType() { public InternalRow[] call(InternalRow args) { Identifier tableIdent = toIdentifier(args.getString(0), PARAMETERS[0].name()); - return modifyIcebergTable(tableIdent, table -> { - String quotedFullIdentifier = Spark3Util.quotedFullIdentifier(tableCatalog().name(), tableIdent); - RewriteDataFiles action = actions().rewriteDataFiles(table); + return modifyIcebergTable( + tableIdent, + table -> { + String quotedFullIdentifier = + Spark3Util.quotedFullIdentifier(tableCatalog().name(), tableIdent); + RewriteDataFiles action = actions().rewriteDataFiles(table); - String strategy = args.isNullAt(1) ? null : args.getString(1); - String sortOrderString = args.isNullAt(2) ? null : args.getString(2); + String strategy = args.isNullAt(1) ? null : args.getString(1); + String sortOrderString = args.isNullAt(2) ? 
null : args.getString(2); - if (strategy != null || sortOrderString != null) { - action = checkAndApplyStrategy(action, strategy, sortOrderString, table.schema()); - } + if (strategy != null || sortOrderString != null) { + action = checkAndApplyStrategy(action, strategy, sortOrderString, table.schema()); + } - if (!args.isNullAt(3)) { - action = checkAndApplyOptions(args, action); - } + if (!args.isNullAt(3)) { + action = checkAndApplyOptions(args, action); + } - String where = args.isNullAt(4) ? null : args.getString(4); + String where = args.isNullAt(4) ? null : args.getString(4); - action = checkAndApplyFilter(action, where, quotedFullIdentifier); + action = checkAndApplyFilter(action, where, quotedFullIdentifier); - RewriteDataFiles.Result result = action.execute(); + RewriteDataFiles.Result result = action.execute(); - return toOutputRows(result); - }); + return toOutputRows(result); + }); } - private RewriteDataFiles checkAndApplyFilter(RewriteDataFiles action, String where, String tableName) { + private RewriteDataFiles checkAndApplyFilter( + RewriteDataFiles action, String where, String tableName) { if (where != null) { try { - Expression expression = SparkExpressionConverter.collectResolvedSparkExpression(spark(), tableName, where); + Expression expression = + SparkExpressionConverter.collectResolvedSparkExpression(spark(), tableName, where); return action.filter(SparkExpressionConverter.convertToIcebergExpression(expression)); } catch (AnalysisException e) { throw new IllegalArgumentException("Cannot parse predicates in where option: " + where); @@ -132,7 +141,10 @@ private RewriteDataFiles checkAndApplyFilter(RewriteDataFiles action, String whe private RewriteDataFiles checkAndApplyOptions(InternalRow args, RewriteDataFiles action) { Map options = Maps.newHashMap(); - args.getMap(3).foreach(DataTypes.StringType, DataTypes.StringType, + args.getMap(3) + .foreach( + DataTypes.StringType, + DataTypes.StringType, (k, v) -> { options.put(k.toString(), v.toString()); return BoxedUnit.UNIT; @@ -140,18 +152,20 @@ private RewriteDataFiles checkAndApplyOptions(InternalRow args, RewriteDataFiles return action.options(options); } - private RewriteDataFiles checkAndApplyStrategy(RewriteDataFiles action, String strategy, String sortOrderString, - Schema schema) { + private RewriteDataFiles checkAndApplyStrategy( + RewriteDataFiles action, String strategy, String sortOrderString, Schema schema) { List zOrderTerms = Lists.newArrayList(); List sortOrderFields = Lists.newArrayList(); if (sortOrderString != null) { - ExtendedParser.parseSortOrder(spark(), sortOrderString).forEach(field -> { - if (field.term() instanceof Zorder) { - zOrderTerms.add((Zorder) field.term()); - } else { - sortOrderFields.add(field); - } - }); + ExtendedParser.parseSortOrder(spark(), sortOrderString) + .forEach( + field -> { + if (field.term() instanceof Zorder) { + zOrderTerms.add((Zorder) field.term()); + } else { + sortOrderFields.add(field); + } + }); if (!zOrderTerms.isEmpty() && !sortOrderFields.isEmpty()) { // TODO: we need to allow this in future when SparkAction has handling for this. @@ -160,11 +174,14 @@ private RewriteDataFiles checkAndApplyStrategy(RewriteDataFiles action, String s } } - // caller of this function ensures that between strategy and sortOrder, at least one of them is not null. + // caller of this function ensures that between strategy and sortOrder, at least one of them is + // not null. 
if (strategy == null || strategy.equalsIgnoreCase("sort")) { if (!zOrderTerms.isEmpty()) { - String[] columnNames = zOrderTerms.stream().flatMap( - zOrder -> zOrder.refs().stream().map(NamedReference::name)).toArray(String[]::new); + String[] columnNames = + zOrderTerms.stream() + .flatMap(zOrder -> zOrder.refs().stream().map(NamedReference::name)) + .toArray(String[]::new); return action.zOrder(columnNames); } else { return action.sort(buildSortOrder(sortOrderFields, schema)); @@ -173,7 +190,8 @@ private RewriteDataFiles checkAndApplyStrategy(RewriteDataFiles action, String s if (strategy.equalsIgnoreCase("binpack")) { RewriteDataFiles rewriteDataFiles = action.binPack(); if (sortOrderString != null) { - // calling below method to throw the error as user has set both binpack strategy and sort order + // calling below method to throw the error as user has set both binpack strategy and sort + // order return rewriteDataFiles.sort(buildSortOrder(sortOrderFields, schema)); } return rewriteDataFiles; @@ -183,9 +201,11 @@ private RewriteDataFiles checkAndApplyStrategy(RewriteDataFiles action, String s } } - private SortOrder buildSortOrder(List rawOrderFields, Schema schema) { + private SortOrder buildSortOrder( + List rawOrderFields, Schema schema) { SortOrder.Builder builder = SortOrder.builderFor(schema); - rawOrderFields.forEach(rawField -> builder.sortBy(rawField.term(), rawField.direction(), rawField.nullOrder())); + rawOrderFields.forEach( + rawField -> builder.sortBy(rawField.term(), rawField.direction(), rawField.nullOrder())); return builder.build(); } @@ -193,7 +213,7 @@ private InternalRow[] toOutputRows(RewriteDataFiles.Result result) { int rewrittenDataFilesCount = result.rewrittenDataFilesCount(); int addedDataFilesCount = result.addedDataFilesCount(); InternalRow row = newInternalRow(rewrittenDataFilesCount, addedDataFilesCount); - return new InternalRow[]{row}; + return new InternalRow[] {row}; } @Override diff --git a/spark/v3.3/spark/src/main/java/org/apache/iceberg/spark/procedures/RewriteManifestsProcedure.java b/spark/v3.3/spark/src/main/java/org/apache/iceberg/spark/procedures/RewriteManifestsProcedure.java index 1cc76501c066..c8becc7e5a0f 100644 --- a/spark/v3.3/spark/src/main/java/org/apache/iceberg/spark/procedures/RewriteManifestsProcedure.java +++ b/spark/v3.3/spark/src/main/java/org/apache/iceberg/spark/procedures/RewriteManifestsProcedure.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.procedures; import org.apache.iceberg.Table; @@ -36,23 +35,28 @@ /** * A procedure that rewrites manifests in a table. - *
<p>
    - * Note: this procedure invalidates all cached Spark plans that reference the affected table. + * + *
<p>
    Note: this procedure invalidates all cached Spark plans that reference the affected + * table. * * @see SparkActions#rewriteManifests(Table) () */ class RewriteManifestsProcedure extends BaseProcedure { - private static final ProcedureParameter[] PARAMETERS = new ProcedureParameter[]{ - ProcedureParameter.required("table", DataTypes.StringType), - ProcedureParameter.optional("use_caching", DataTypes.BooleanType) - }; + private static final ProcedureParameter[] PARAMETERS = + new ProcedureParameter[] { + ProcedureParameter.required("table", DataTypes.StringType), + ProcedureParameter.optional("use_caching", DataTypes.BooleanType) + }; // counts are not nullable since the action result is never null - private static final StructType OUTPUT_TYPE = new StructType(new StructField[]{ - new StructField("rewritten_manifests_count", DataTypes.IntegerType, false, Metadata.empty()), - new StructField("added_manifests_count", DataTypes.IntegerType, false, Metadata.empty()) - }); + private static final StructType OUTPUT_TYPE = + new StructType( + new StructField[] { + new StructField( + "rewritten_manifests_count", DataTypes.IntegerType, false, Metadata.empty()), + new StructField("added_manifests_count", DataTypes.IntegerType, false, Metadata.empty()) + }); public static ProcedureBuilder builder() { return new BaseProcedure.Builder() { @@ -82,24 +86,26 @@ public InternalRow[] call(InternalRow args) { Identifier tableIdent = toIdentifier(args.getString(0), PARAMETERS[0].name()); Boolean useCaching = args.isNullAt(1) ? null : args.getBoolean(1); - return modifyIcebergTable(tableIdent, table -> { - RewriteManifestsSparkAction action = actions().rewriteManifests(table); + return modifyIcebergTable( + tableIdent, + table -> { + RewriteManifestsSparkAction action = actions().rewriteManifests(table); - if (useCaching != null) { - action.option(RewriteManifestsSparkAction.USE_CACHING, useCaching.toString()); - } + if (useCaching != null) { + action.option(RewriteManifestsSparkAction.USE_CACHING, useCaching.toString()); + } - RewriteManifests.Result result = action.execute(); + RewriteManifests.Result result = action.execute(); - return toOutputRows(result); - }); + return toOutputRows(result); + }); } private InternalRow[] toOutputRows(RewriteManifests.Result result) { int rewrittenManifestsCount = Iterables.size(result.rewrittenManifests()); int addedManifestsCount = Iterables.size(result.addedManifests()); InternalRow row = newInternalRow(rewrittenManifestsCount, addedManifestsCount); - return new InternalRow[]{row}; + return new InternalRow[] {row}; } @Override diff --git a/spark/v3.3/spark/src/main/java/org/apache/iceberg/spark/procedures/RollbackToSnapshotProcedure.java b/spark/v3.3/spark/src/main/java/org/apache/iceberg/spark/procedures/RollbackToSnapshotProcedure.java index 7cf5b0c77bb2..49cc1a5ceae3 100644 --- a/spark/v3.3/spark/src/main/java/org/apache/iceberg/spark/procedures/RollbackToSnapshotProcedure.java +++ b/spark/v3.3/spark/src/main/java/org/apache/iceberg/spark/procedures/RollbackToSnapshotProcedure.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.procedures; import org.apache.iceberg.Snapshot; @@ -32,22 +31,26 @@ /** * A procedure that rollbacks a table to a specific snapshot id. - *
<p>
    - * Note: this procedure invalidates all cached Spark plans that reference the affected table. + * + *
<p>
    Note: this procedure invalidates all cached Spark plans that reference the affected + * table. * * @see org.apache.iceberg.ManageSnapshots#rollbackTo(long) */ class RollbackToSnapshotProcedure extends BaseProcedure { - private static final ProcedureParameter[] PARAMETERS = new ProcedureParameter[]{ - ProcedureParameter.required("table", DataTypes.StringType), - ProcedureParameter.required("snapshot_id", DataTypes.LongType) - }; + private static final ProcedureParameter[] PARAMETERS = + new ProcedureParameter[] { + ProcedureParameter.required("table", DataTypes.StringType), + ProcedureParameter.required("snapshot_id", DataTypes.LongType) + }; - private static final StructType OUTPUT_TYPE = new StructType(new StructField[]{ - new StructField("previous_snapshot_id", DataTypes.LongType, false, Metadata.empty()), - new StructField("current_snapshot_id", DataTypes.LongType, false, Metadata.empty()) - }); + private static final StructType OUTPUT_TYPE = + new StructType( + new StructField[] { + new StructField("previous_snapshot_id", DataTypes.LongType, false, Metadata.empty()), + new StructField("current_snapshot_id", DataTypes.LongType, false, Metadata.empty()) + }); public static ProcedureBuilder builder() { return new BaseProcedure.Builder() { @@ -77,16 +80,16 @@ public InternalRow[] call(InternalRow args) { Identifier tableIdent = toIdentifier(args.getString(0), PARAMETERS[0].name()); long snapshotId = args.getLong(1); - return modifyIcebergTable(tableIdent, table -> { - Snapshot previousSnapshot = table.currentSnapshot(); + return modifyIcebergTable( + tableIdent, + table -> { + Snapshot previousSnapshot = table.currentSnapshot(); - table.manageSnapshots() - .rollbackTo(snapshotId) - .commit(); + table.manageSnapshots().rollbackTo(snapshotId).commit(); - InternalRow outputRow = newInternalRow(previousSnapshot.snapshotId(), snapshotId); - return new InternalRow[]{outputRow}; - }); + InternalRow outputRow = newInternalRow(previousSnapshot.snapshotId(), snapshotId); + return new InternalRow[] {outputRow}; + }); } @Override diff --git a/spark/v3.3/spark/src/main/java/org/apache/iceberg/spark/procedures/RollbackToTimestampProcedure.java b/spark/v3.3/spark/src/main/java/org/apache/iceberg/spark/procedures/RollbackToTimestampProcedure.java index 519a46c6dbb8..059725f0c152 100644 --- a/spark/v3.3/spark/src/main/java/org/apache/iceberg/spark/procedures/RollbackToTimestampProcedure.java +++ b/spark/v3.3/spark/src/main/java/org/apache/iceberg/spark/procedures/RollbackToTimestampProcedure.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.procedures; import org.apache.iceberg.Snapshot; @@ -33,22 +32,26 @@ /** * A procedure that rollbacks a table to a given point in time. - *
<p>
    - * Note: this procedure invalidates all cached Spark plans that reference the affected table. + * + *
<p>
    Note: this procedure invalidates all cached Spark plans that reference the affected + * table. * * @see org.apache.iceberg.ManageSnapshots#rollbackToTime(long) */ class RollbackToTimestampProcedure extends BaseProcedure { - private static final ProcedureParameter[] PARAMETERS = new ProcedureParameter[]{ - ProcedureParameter.required("table", DataTypes.StringType), - ProcedureParameter.required("timestamp", DataTypes.TimestampType) - }; + private static final ProcedureParameter[] PARAMETERS = + new ProcedureParameter[] { + ProcedureParameter.required("table", DataTypes.StringType), + ProcedureParameter.required("timestamp", DataTypes.TimestampType) + }; - private static final StructType OUTPUT_TYPE = new StructType(new StructField[]{ - new StructField("previous_snapshot_id", DataTypes.LongType, false, Metadata.empty()), - new StructField("current_snapshot_id", DataTypes.LongType, false, Metadata.empty()) - }); + private static final StructType OUTPUT_TYPE = + new StructType( + new StructField[] { + new StructField("previous_snapshot_id", DataTypes.LongType, false, Metadata.empty()), + new StructField("current_snapshot_id", DataTypes.LongType, false, Metadata.empty()) + }); public static ProcedureBuilder builder() { return new BaseProcedure.Builder() { @@ -79,18 +82,19 @@ public InternalRow[] call(InternalRow args) { // timestamps in Spark have microsecond precision so this conversion is lossy long timestampMillis = DateTimeUtil.microsToMillis(args.getLong(1)); - return modifyIcebergTable(tableIdent, table -> { - Snapshot previousSnapshot = table.currentSnapshot(); + return modifyIcebergTable( + tableIdent, + table -> { + Snapshot previousSnapshot = table.currentSnapshot(); - table.manageSnapshots() - .rollbackToTime(timestampMillis) - .commit(); + table.manageSnapshots().rollbackToTime(timestampMillis).commit(); - Snapshot currentSnapshot = table.currentSnapshot(); + Snapshot currentSnapshot = table.currentSnapshot(); - InternalRow outputRow = newInternalRow(previousSnapshot.snapshotId(), currentSnapshot.snapshotId()); - return new InternalRow[]{outputRow}; - }); + InternalRow outputRow = + newInternalRow(previousSnapshot.snapshotId(), currentSnapshot.snapshotId()); + return new InternalRow[] {outputRow}; + }); } @Override diff --git a/spark/v3.3/spark/src/main/java/org/apache/iceberg/spark/procedures/SetCurrentSnapshotProcedure.java b/spark/v3.3/spark/src/main/java/org/apache/iceberg/spark/procedures/SetCurrentSnapshotProcedure.java index 274ca19fc107..f8f8049c22b6 100644 --- a/spark/v3.3/spark/src/main/java/org/apache/iceberg/spark/procedures/SetCurrentSnapshotProcedure.java +++ b/spark/v3.3/spark/src/main/java/org/apache/iceberg/spark/procedures/SetCurrentSnapshotProcedure.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.procedures; import org.apache.iceberg.Snapshot; @@ -32,22 +31,26 @@ /** * A procedure that sets the current snapshot in a table. - *
<p>
    - * Note: this procedure invalidates all cached Spark plans that reference the affected table. + * + *
<p>
    Note: this procedure invalidates all cached Spark plans that reference the affected + * table. * * @see org.apache.iceberg.ManageSnapshots#setCurrentSnapshot(long) */ class SetCurrentSnapshotProcedure extends BaseProcedure { - private static final ProcedureParameter[] PARAMETERS = new ProcedureParameter[] { - ProcedureParameter.required("table", DataTypes.StringType), - ProcedureParameter.required("snapshot_id", DataTypes.LongType) - }; + private static final ProcedureParameter[] PARAMETERS = + new ProcedureParameter[] { + ProcedureParameter.required("table", DataTypes.StringType), + ProcedureParameter.required("snapshot_id", DataTypes.LongType) + }; - private static final StructType OUTPUT_TYPE = new StructType(new StructField[]{ - new StructField("previous_snapshot_id", DataTypes.LongType, true, Metadata.empty()), - new StructField("current_snapshot_id", DataTypes.LongType, false, Metadata.empty()) - }); + private static final StructType OUTPUT_TYPE = + new StructType( + new StructField[] { + new StructField("previous_snapshot_id", DataTypes.LongType, true, Metadata.empty()), + new StructField("current_snapshot_id", DataTypes.LongType, false, Metadata.empty()) + }); public static ProcedureBuilder builder() { return new BaseProcedure.Builder() { @@ -77,17 +80,17 @@ public InternalRow[] call(InternalRow args) { Identifier tableIdent = toIdentifier(args.getString(0), PARAMETERS[0].name()); long snapshotId = args.getLong(1); - return modifyIcebergTable(tableIdent, table -> { - Snapshot previousSnapshot = table.currentSnapshot(); - Long previousSnapshotId = previousSnapshot != null ? previousSnapshot.snapshotId() : null; + return modifyIcebergTable( + tableIdent, + table -> { + Snapshot previousSnapshot = table.currentSnapshot(); + Long previousSnapshotId = previousSnapshot != null ? previousSnapshot.snapshotId() : null; - table.manageSnapshots() - .setCurrentSnapshot(snapshotId) - .commit(); + table.manageSnapshots().setCurrentSnapshot(snapshotId).commit(); - InternalRow outputRow = newInternalRow(previousSnapshotId, snapshotId); - return new InternalRow[]{outputRow}; - }); + InternalRow outputRow = newInternalRow(previousSnapshotId, snapshotId); + return new InternalRow[] {outputRow}; + }); } @Override diff --git a/spark/v3.3/spark/src/main/java/org/apache/iceberg/spark/procedures/SnapshotTableProcedure.java b/spark/v3.3/spark/src/main/java/org/apache/iceberg/spark/procedures/SnapshotTableProcedure.java index 96e293d6b1da..7a015a51e8ed 100644 --- a/spark/v3.3/spark/src/main/java/org/apache/iceberg/spark/procedures/SnapshotTableProcedure.java +++ b/spark/v3.3/spark/src/main/java/org/apache/iceberg/spark/procedures/SnapshotTableProcedure.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
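The three snapshot-management procedures reformatted above (RollbackToSnapshotProcedure, RollbackToTimestampProcedure, SetCurrentSnapshotProcedure) each return the previous and current snapshot ids per their OUTPUT_TYPE definitions. A hedged sketch with placeholder values and an assumed SparkSession `spark`:

// Illustrative only: the snapshot id and timestamp are hypothetical.
spark.sql("CALL demo.system.rollback_to_snapshot(table => 'db.tbl', snapshot_id => 123)");
spark.sql(
    "CALL demo.system.rollback_to_timestamp("
        + "table => 'db.tbl', timestamp => TIMESTAMP '2022-06-01 00:00:00')");
spark.sql("CALL demo.system.set_current_snapshot(table => 'db.tbl', snapshot_id => 123)");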
*/ - package org.apache.iceberg.spark.procedures; import java.util.Map; @@ -34,16 +33,19 @@ import scala.runtime.BoxedUnit; class SnapshotTableProcedure extends BaseProcedure { - private static final ProcedureParameter[] PARAMETERS = new ProcedureParameter[]{ - ProcedureParameter.required("source_table", DataTypes.StringType), - ProcedureParameter.required("table", DataTypes.StringType), - ProcedureParameter.optional("location", DataTypes.StringType), - ProcedureParameter.optional("properties", STRING_MAP) - }; + private static final ProcedureParameter[] PARAMETERS = + new ProcedureParameter[] { + ProcedureParameter.required("source_table", DataTypes.StringType), + ProcedureParameter.required("table", DataTypes.StringType), + ProcedureParameter.optional("location", DataTypes.StringType), + ProcedureParameter.optional("properties", STRING_MAP) + }; - private static final StructType OUTPUT_TYPE = new StructType(new StructField[]{ - new StructField("imported_files_count", DataTypes.LongType, false, Metadata.empty()) - }); + private static final StructType OUTPUT_TYPE = + new StructType( + new StructField[] { + new StructField("imported_files_count", DataTypes.LongType, false, Metadata.empty()) + }); private SnapshotTableProcedure(TableCatalog tableCatalog) { super(tableCatalog); @@ -71,23 +73,28 @@ public StructType outputType() { @Override public InternalRow[] call(InternalRow args) { String source = args.getString(0); - Preconditions.checkArgument(source != null && !source.isEmpty(), + Preconditions.checkArgument( + source != null && !source.isEmpty(), "Cannot handle an empty identifier for argument source_table"); String dest = args.getString(1); - Preconditions.checkArgument(dest != null && !dest.isEmpty(), - "Cannot handle an empty identifier for argument table"); + Preconditions.checkArgument( + dest != null && !dest.isEmpty(), "Cannot handle an empty identifier for argument table"); String snapshotLocation = args.isNullAt(2) ? null : args.getString(2); Map properties = Maps.newHashMap(); if (!args.isNullAt(3)) { - args.getMap(3).foreach(DataTypes.StringType, DataTypes.StringType, - (k, v) -> { - properties.put(k.toString(), v.toString()); - return BoxedUnit.UNIT; - }); + args.getMap(3) + .foreach( + DataTypes.StringType, + DataTypes.StringType, + (k, v) -> { + properties.put(k.toString(), v.toString()); + return BoxedUnit.UNIT; + }); } - Preconditions.checkArgument(!source.equals(dest), + Preconditions.checkArgument( + !source.equals(dest), "Cannot create a snapshot with the same name as the source of the snapshot."); SnapshotTable action = SparkActions.get().snapshotTable(source).as(dest); @@ -103,5 +110,4 @@ public InternalRow[] call(InternalRow args) { public String description() { return "SnapshotTableProcedure"; } - } diff --git a/spark/v3.3/spark/src/main/java/org/apache/iceberg/spark/procedures/SparkProcedures.java b/spark/v3.3/spark/src/main/java/org/apache/iceberg/spark/procedures/SparkProcedures.java index a7c036e1c6ec..6d59cb876b1e 100644 --- a/spark/v3.3/spark/src/main/java/org/apache/iceberg/spark/procedures/SparkProcedures.java +++ b/spark/v3.3/spark/src/main/java/org/apache/iceberg/spark/procedures/SparkProcedures.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
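The SnapshotTableProcedure above creates an Iceberg snapshot of an existing Spark table without rewriting its data files, taking source_table, table, and optional location and properties arguments. A hedged sketch with placeholder names and an assumed SparkSession `spark`:

// Illustrative only: the location and property values are hypothetical; location and properties
// map to the optional arguments declared above.
spark.sql(
    "CALL demo.system.snapshot("
        + "source_table => 'db.src_tbl', "
        + "table => 'db.src_tbl_snapshot', "
        + "location => 's3://bucket/snapshots/db/src_tbl_snapshot', "
        + "properties => map('write.format.default', 'parquet'))");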
*/ - package org.apache.iceberg.spark.procedures; import java.util.Locale; @@ -30,8 +29,7 @@ public class SparkProcedures { private static final Map> BUILDERS = initProcedureBuilders(); - private SparkProcedures() { - } + private SparkProcedures() {} public static ProcedureBuilder newBuilder(String name) { // procedure resolution is case insensitive to match the existing Spark behavior for functions @@ -60,6 +58,7 @@ private static Map> initProcedureBuilders() { public interface ProcedureBuilder { ProcedureBuilder withTableCatalog(TableCatalog tableCatalog); + Procedure build(); } } diff --git a/spark/v3.3/spark/src/main/java/org/apache/iceberg/spark/source/BaseBatchReader.java b/spark/v3.3/spark/src/main/java/org/apache/iceberg/spark/source/BaseBatchReader.java index 1b0b69299b8e..45bf3cfcc86a 100644 --- a/spark/v3.3/spark/src/main/java/org/apache/iceberg/spark/source/BaseBatchReader.java +++ b/spark/v3.3/spark/src/main/java/org/apache/iceberg/spark/source/BaseBatchReader.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.source; import java.util.Map; @@ -42,16 +41,24 @@ abstract class BaseBatchReader extends BaseReader { private final int batchSize; - BaseBatchReader(Table table, ScanTaskGroup taskGroup, Schema expectedSchema, boolean caseSensitive, - int batchSize) { + BaseBatchReader( + Table table, + ScanTaskGroup taskGroup, + Schema expectedSchema, + boolean caseSensitive, + int batchSize) { super(table, taskGroup, expectedSchema, caseSensitive); this.batchSize = batchSize; } - protected CloseableIterable newBatchIterable(InputFile inputFile, FileFormat format, - long start, long length, Expression residual, - Map idToConstant, - SparkDeleteFilter deleteFilter) { + protected CloseableIterable newBatchIterable( + InputFile inputFile, + FileFormat format, + long start, + long length, + Expression residual, + Map idToConstant, + SparkDeleteFilter deleteFilter) { switch (format) { case PARQUET: return newParquetIterable(inputFile, start, length, residual, idToConstant, deleteFilter); @@ -60,24 +67,37 @@ protected CloseableIterable newBatchIterable(InputFile inputFile, return newOrcIterable(inputFile, start, length, residual, idToConstant); default: - throw new UnsupportedOperationException("Format: " + format + " not supported for batched reads"); + throw new UnsupportedOperationException( + "Format: " + format + " not supported for batched reads"); } } - private CloseableIterable newParquetIterable(InputFile inputFile, long start, long length, - Expression residual, Map idToConstant, - SparkDeleteFilter deleteFilter) { - // get required schema for filtering out equality-delete rows in case equality-delete uses columns are + private CloseableIterable newParquetIterable( + InputFile inputFile, + long start, + long length, + Expression residual, + Map idToConstant, + SparkDeleteFilter deleteFilter) { + // get required schema for filtering out equality-delete rows in case equality-delete uses + // columns are // not selected. - Schema requiredSchema = deleteFilter != null && deleteFilter.hasEqDeletes() ? - deleteFilter.requiredSchema() : expectedSchema(); + Schema requiredSchema = + deleteFilter != null && deleteFilter.hasEqDeletes() + ? 
deleteFilter.requiredSchema() + : expectedSchema(); return Parquet.read(inputFile) .project(requiredSchema) .split(start, length) - .createBatchedReaderFunc(fileSchema -> VectorizedSparkParquetReaders.buildReader(requiredSchema, - fileSchema, /* setArrowValidityVector */ NullCheckingForGet.NULL_CHECKING_ENABLED, idToConstant, - deleteFilter)) + .createBatchedReaderFunc( + fileSchema -> + VectorizedSparkParquetReaders.buildReader( + requiredSchema, + fileSchema, /* setArrowValidityVector */ + NullCheckingForGet.NULL_CHECKING_ENABLED, + idToConstant, + deleteFilter)) .recordsPerBatch(batchSize) .filter(residual) .caseSensitive(caseSensitive()) @@ -89,18 +109,25 @@ private CloseableIterable newParquetIterable(InputFile inputFile, .build(); } - private CloseableIterable newOrcIterable(InputFile inputFile, long start, long length, - Expression residual, Map idToConstant) { + private CloseableIterable newOrcIterable( + InputFile inputFile, + long start, + long length, + Expression residual, + Map idToConstant) { Set constantFieldIds = idToConstant.keySet(); Set metadataFieldIds = MetadataColumns.metadataFieldIds(); - Sets.SetView constantAndMetadataFieldIds = Sets.union(constantFieldIds, metadataFieldIds); - Schema schemaWithoutConstantAndMetadataFields = TypeUtil.selectNot(expectedSchema(), constantAndMetadataFieldIds); + Sets.SetView constantAndMetadataFieldIds = + Sets.union(constantFieldIds, metadataFieldIds); + Schema schemaWithoutConstantAndMetadataFields = + TypeUtil.selectNot(expectedSchema(), constantAndMetadataFieldIds); return ORC.read(inputFile) .project(schemaWithoutConstantAndMetadataFields) .split(start, length) - .createBatchedReaderFunc(fileSchema -> VectorizedSparkOrcReaders.buildReader(expectedSchema(), fileSchema, - idToConstant)) + .createBatchedReaderFunc( + fileSchema -> + VectorizedSparkOrcReaders.buildReader(expectedSchema(), fileSchema, idToConstant)) .recordsPerBatch(batchSize) .filter(residual) .caseSensitive(caseSensitive()) diff --git a/spark/v3.3/spark/src/main/java/org/apache/iceberg/spark/source/BaseReader.java b/spark/v3.3/spark/src/main/java/org/apache/iceberg/spark/source/BaseReader.java index 0210ceb1779f..95bbaaca7cbd 100644 --- a/spark/v3.3/spark/src/main/java/org/apache/iceberg/spark/source/BaseReader.java +++ b/spark/v3.3/spark/src/main/java/org/apache/iceberg/spark/source/BaseReader.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.source; import java.io.Closeable; @@ -84,7 +83,8 @@ abstract class BaseReader implements Closeable { private T current = null; private TaskT currentTask = null; - BaseReader(Table table, ScanTaskGroup taskGroup, Schema expectedSchema, boolean caseSensitive) { + BaseReader( + Table table, ScanTaskGroup taskGroup, Schema expectedSchema, boolean caseSensitive) { this.table = table; this.taskGroup = taskGroup; this.tasks = taskGroup.tasks().iterator(); @@ -92,7 +92,8 @@ abstract class BaseReader implements Closeable { this.expectedSchema = expectedSchema; this.caseSensitive = caseSensitive; String nameMappingString = table.properties().get(TableProperties.DEFAULT_NAME_MAPPING); - this.nameMapping = nameMappingString != null ? NameMappingParser.fromJson(nameMappingString) : null; + this.nameMapping = + nameMappingString != null ? 
NameMappingParser.fromJson(nameMappingString) : null; } protected abstract CloseableIterator open(TaskT task); @@ -132,9 +133,10 @@ public boolean next() throws IOException { } } catch (IOException | RuntimeException e) { if (currentTask != null && !currentTask.isDataTask()) { - String filePaths = referencedFiles(currentTask) - .map(file -> file.path().toString()) - .collect(Collectors.joining(", ")); + String filePaths = + referencedFiles(currentTask) + .map(file -> file.path().toString()) + .collect(Collectors.joining(", ")); LOG.error("Error reading file(s): {}", filePaths, e); } throw e; @@ -164,9 +166,8 @@ protected InputFile getInputFile(String location) { private Map inputFiles() { if (lazyInputFiles == null) { - Stream encryptedFiles = taskGroup.tasks().stream() - .flatMap(this::referencedFiles) - .map(this::toEncryptedInputFile); + Stream encryptedFiles = + taskGroup.tasks().stream().flatMap(this::referencedFiles).map(this::toEncryptedInputFile); // decrypt with the batch call to avoid multiple RPCs to a key server, if possible Iterable decryptedFiles = table.encryption().decrypt(encryptedFiles::iterator); @@ -230,7 +231,8 @@ protected static Object convertConstant(Type type, Object value) { for (int index = 0; index < fields.size(); index++) { NestedField field = fields.get(index); Type fieldType = field.type(); - values[index] = convertConstant(fieldType, struct.get(index, fieldType.typeId().javaClass())); + values[index] = + convertConstant(fieldType, struct.get(index, fieldType.typeId().javaClass())); } return new GenericInternalRow(values); diff --git a/spark/v3.3/spark/src/main/java/org/apache/iceberg/spark/source/BaseRowReader.java b/spark/v3.3/spark/src/main/java/org/apache/iceberg/spark/source/BaseRowReader.java index 3aeb65d7ce44..608f0df0075d 100644 --- a/spark/v3.3/spark/src/main/java/org/apache/iceberg/spark/source/BaseRowReader.java +++ b/spark/v3.3/spark/src/main/java/org/apache/iceberg/spark/source/BaseRowReader.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
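Note on the BaseReader change above: the reader resolves an optional name mapping from the table property TableProperties.DEFAULT_NAME_MAPPING and parses it with NameMappingParser. A minimal stand-alone sketch of that lookup, not part of this diff; the helper name defaultNameMapping is illustrative:

import org.apache.iceberg.Table;
import org.apache.iceberg.TableProperties;
import org.apache.iceberg.mapping.NameMapping;
import org.apache.iceberg.mapping.NameMappingParser;

class NameMappingLookup {
  // Parse the default name mapping from table properties, or return null when none is set,
  // mirroring the nameMapping initialization in BaseReader above.
  static NameMapping defaultNameMapping(Table table) {
    String nameMappingJson = table.properties().get(TableProperties.DEFAULT_NAME_MAPPING);
    return nameMappingJson != null ? NameMappingParser.fromJson(nameMappingJson) : null;
  }
}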
*/ - package org.apache.iceberg.spark.source; import java.util.Map; @@ -40,13 +39,19 @@ import org.apache.spark.sql.catalyst.InternalRow; abstract class BaseRowReader extends BaseReader { - BaseRowReader(Table table, ScanTaskGroup taskGroup, Schema expectedSchema, boolean caseSensitive) { + BaseRowReader( + Table table, ScanTaskGroup taskGroup, Schema expectedSchema, boolean caseSensitive) { super(table, taskGroup, expectedSchema, caseSensitive); } - protected CloseableIterable newIterable(InputFile file, FileFormat format, long start, long length, - Expression residual, Schema projection, - Map idToConstant) { + protected CloseableIterable newIterable( + InputFile file, + FileFormat format, + long start, + long length, + Expression residual, + Schema projection, + Map idToConstant) { switch (format) { case PARQUET: return newParquetIterable(file, start, length, residual, projection, idToConstant); @@ -62,8 +67,8 @@ protected CloseableIterable newIterable(InputFile file, FileFormat } } - private CloseableIterable newAvroIterable(InputFile file, long start, long length, Schema projection, - Map idToConstant) { + private CloseableIterable newAvroIterable( + InputFile file, long start, long length, Schema projection, Map idToConstant) { return Avro.read(file) .reuseContainers() .project(projection) @@ -73,29 +78,41 @@ private CloseableIterable newAvroIterable(InputFile file, long star .build(); } - private CloseableIterable newParquetIterable(InputFile file, long start, long length, - Expression residual, Schema readSchema, - Map idToConstant) { + private CloseableIterable newParquetIterable( + InputFile file, + long start, + long length, + Expression residual, + Schema readSchema, + Map idToConstant) { return Parquet.read(file) .reuseContainers() .split(start, length) .project(readSchema) - .createReaderFunc(fileSchema -> SparkParquetReaders.buildReader(readSchema, fileSchema, idToConstant)) + .createReaderFunc( + fileSchema -> SparkParquetReaders.buildReader(readSchema, fileSchema, idToConstant)) .filter(residual) .caseSensitive(caseSensitive()) .withNameMapping(nameMapping()) .build(); } - private CloseableIterable newOrcIterable(InputFile file, long start, long length, Expression residual, - Schema readSchema, Map idToConstant) { - Schema readSchemaWithoutConstantAndMetadataFields = TypeUtil.selectNot(readSchema, - Sets.union(idToConstant.keySet(), MetadataColumns.metadataFieldIds())); + private CloseableIterable newOrcIterable( + InputFile file, + long start, + long length, + Expression residual, + Schema readSchema, + Map idToConstant) { + Schema readSchemaWithoutConstantAndMetadataFields = + TypeUtil.selectNot( + readSchema, Sets.union(idToConstant.keySet(), MetadataColumns.metadataFieldIds())); return ORC.read(file) .project(readSchemaWithoutConstantAndMetadataFields) .split(start, length) - .createReaderFunc(readOrcSchema -> new SparkOrcReader(readSchema, readOrcSchema, idToConstant)) + .createReaderFunc( + readOrcSchema -> new SparkOrcReader(readSchema, readOrcSchema, idToConstant)) .filter(residual) .caseSensitive(caseSensitive()) .withNameMapping(nameMapping()) diff --git a/spark/v3.3/spark/src/main/java/org/apache/iceberg/spark/source/BatchDataReader.java b/spark/v3.3/spark/src/main/java/org/apache/iceberg/spark/source/BatchDataReader.java index f0b8dda2345f..13755f0abc79 100644 --- a/spark/v3.3/spark/src/main/java/org/apache/iceberg/spark/source/BatchDataReader.java +++ b/spark/v3.3/spark/src/main/java/org/apache/iceberg/spark/source/BatchDataReader.java @@ -16,7 +16,6 @@ * 
specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.source; import java.util.Map; @@ -33,8 +32,12 @@ import org.apache.spark.sql.vectorized.ColumnarBatch; class BatchDataReader extends BaseBatchReader { - BatchDataReader(ScanTaskGroup task, Table table, Schema expectedSchema, boolean caseSensitive, - int size) { + BatchDataReader( + ScanTaskGroup task, + Table table, + Schema expectedSchema, + boolean caseSensitive, + int size) { super(table, task, expectedSchema, caseSensitive, size); } @@ -55,9 +58,17 @@ protected CloseableIterator open(FileScanTask task) { InputFile inputFile = getInputFile(filePath); Preconditions.checkNotNull(inputFile, "Could not find InputFile associated with FileScanTask"); - SparkDeleteFilter deleteFilter = task.deletes().isEmpty() ? null : new SparkDeleteFilter(filePath, task.deletes()); + SparkDeleteFilter deleteFilter = + task.deletes().isEmpty() ? null : new SparkDeleteFilter(filePath, task.deletes()); - return newBatchIterable(inputFile, task.file().format(), task.start(), task.length(), task.residual(), - idToConstant, deleteFilter).iterator(); + return newBatchIterable( + inputFile, + task.file().format(), + task.start(), + task.length(), + task.residual(), + idToConstant, + deleteFilter) + .iterator(); } } diff --git a/spark/v3.3/spark/src/main/java/org/apache/iceberg/spark/source/EqualityDeleteRowReader.java b/spark/v3.3/spark/src/main/java/org/apache/iceberg/spark/source/EqualityDeleteRowReader.java index 9441e8c4a205..5d61747e3dec 100644 --- a/spark/v3.3/spark/src/main/java/org/apache/iceberg/spark/source/EqualityDeleteRowReader.java +++ b/spark/v3.3/spark/src/main/java/org/apache/iceberg/spark/source/EqualityDeleteRowReader.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.source; import java.util.Map; @@ -30,13 +29,15 @@ import org.apache.spark.sql.catalyst.InternalRow; public class EqualityDeleteRowReader extends RowDataReader { - public EqualityDeleteRowReader(CombinedScanTask task, Table table, Schema expectedSchema, boolean caseSensitive) { + public EqualityDeleteRowReader( + CombinedScanTask task, Table table, Schema expectedSchema, boolean caseSensitive) { super(task, table, expectedSchema, caseSensitive); } @Override protected CloseableIterator open(FileScanTask task) { - SparkDeleteFilter matches = new SparkDeleteFilter(task.file().path().toString(), task.deletes()); + SparkDeleteFilter matches = + new SparkDeleteFilter(task.file().path().toString(), task.deletes()); // schema or rows returned by readers Schema requiredSchema = matches.requiredSchema(); diff --git a/spark/v3.3/spark/src/main/java/org/apache/iceberg/spark/source/HasIcebergCatalog.java b/spark/v3.3/spark/src/main/java/org/apache/iceberg/spark/source/HasIcebergCatalog.java index e2579a0059b0..37e0c4dfcdb6 100644 --- a/spark/v3.3/spark/src/main/java/org/apache/iceberg/spark/source/HasIcebergCatalog.java +++ b/spark/v3.3/spark/src/main/java/org/apache/iceberg/spark/source/HasIcebergCatalog.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.spark.source; import org.apache.iceberg.catalog.Catalog; diff --git a/spark/v3.3/spark/src/main/java/org/apache/iceberg/spark/source/IcebergSource.java b/spark/v3.3/spark/src/main/java/org/apache/iceberg/spark/source/IcebergSource.java index 7c51e1f3895d..00072a134a26 100644 --- a/spark/v3.3/spark/src/main/java/org/apache/iceberg/spark/source/IcebergSource.java +++ b/spark/v3.3/spark/src/main/java/org/apache/iceberg/spark/source/IcebergSource.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.source; import java.util.Arrays; @@ -48,23 +47,24 @@ /** * The IcebergSource loads/writes tables with format "iceberg". It can load paths and tables. * - * How paths/tables are loaded when using spark.read().format("iceberg").path(table) + *
<p>
    How paths/tables are loaded when using spark.read().format("iceberg").path(table) * - * table = "file:/path/to/table" -> loads a HadoopTable at given path - * table = "tablename" -> loads currentCatalog.currentNamespace.tablename - * table = "catalog.tablename" -> load "tablename" from the specified catalog. - * table = "namespace.tablename" -> load "namespace.tablename" from current catalog - * table = "catalog.namespace.tablename" -> "namespace.tablename" from the specified catalog. - * table = "namespace1.namespace2.tablename" -> load "namespace1.namespace2.tablename" from current catalog + *
<p>
    table = "file:/path/to/table" -> loads a HadoopTable at given path table = "tablename" + * -> loads currentCatalog.currentNamespace.tablename table = "catalog.tablename" -> load + * "tablename" from the specified catalog. table = "namespace.tablename" -> load + * "namespace.tablename" from current catalog table = "catalog.namespace.tablename" -> + * "namespace.tablename" from the specified catalog. table = "namespace1.namespace2.tablename" -> + * load "namespace1.namespace2.tablename" from current catalog * - * The above list is in order of priority. For example: a matching catalog will take priority over any namespace - * resolution. + *
<p>
    The above list is in order of priority. For example: a matching catalog will take priority + * over any namespace resolution. */ public class IcebergSource implements DataSourceRegister, SupportsCatalogOptions { private static final String DEFAULT_CATALOG_NAME = "default_iceberg"; private static final String DEFAULT_CACHE_CATALOG_NAME = "default_cache_iceberg"; private static final String DEFAULT_CATALOG = "spark.sql.catalog." + DEFAULT_CATALOG_NAME; - private static final String DEFAULT_CACHE_CATALOG = "spark.sql.catalog." + DEFAULT_CACHE_CATALOG_NAME; + private static final String DEFAULT_CACHE_CATALOG = + "spark.sql.catalog." + DEFAULT_CACHE_CATALOG_NAME; private static final String AT_TIMESTAMP = "at_timestamp_"; private static final String SNAPSHOT_ID = "snapshot_id_"; private static final String[] EMPTY_NAMESPACE = new String[0]; @@ -93,7 +93,8 @@ public boolean supportsExternalMetadata() { @Override public Table getTable(StructType schema, Transform[] partitioning, Map options) { - Spark3Util.CatalogAndIdentifier catalogIdentifier = catalogAndIdentifier(new CaseInsensitiveStringMap(options)); + Spark3Util.CatalogAndIdentifier catalogIdentifier = + catalogAndIdentifier(new CaseInsensitiveStringMap(options)); CatalogPlugin catalog = catalogIdentifier.catalog(); Identifier ident = catalogIdentifier.identifier(); @@ -102,24 +103,32 @@ public Table getTable(StructType schema, Transform[] partitioning, Map extractTimeTravelVersion(CaseInsensitiveStringMap options) { - return Optional.ofNullable(PropertyUtil.propertyAsString(options, SparkReadOptions.VERSION_AS_OF, null)); + return Optional.ofNullable( + PropertyUtil.propertyAsString(options, SparkReadOptions.VERSION_AS_OF, null)); } @Override public Optional extractTimeTravelTimestamp(CaseInsensitiveStringMap options) { - return Optional.ofNullable(PropertyUtil.propertyAsString(options, SparkReadOptions.TIMESTAMP_AS_OF, null)); + return Optional.ofNullable( + PropertyUtil.propertyAsString(options, SparkReadOptions.TIMESTAMP_AS_OF, null)); } private static Long propertyAsLong(CaseInsensitiveStringMap options, String property) { @@ -202,11 +215,12 @@ private static Long propertyAsLong(CaseInsensitiveStringMap options, String prop private static void setupDefaultSparkCatalogs(SparkSession spark) { if (!spark.conf().contains(DEFAULT_CATALOG)) { - ImmutableMap config = ImmutableMap.of( - "type", "hive", - "default-namespace", "default", - "cache-enabled", "false" // the source should not use a cache - ); + ImmutableMap config = + ImmutableMap.of( + "type", "hive", + "default-namespace", "default", + "cache-enabled", "false" // the source should not use a cache + ); spark.conf().set(DEFAULT_CATALOG, SparkCatalog.class.getName()); config.forEach((key, value) -> spark.conf().set(DEFAULT_CATALOG + "." + key, value)); } diff --git a/spark/v3.3/spark/src/main/java/org/apache/iceberg/spark/source/InternalRowWrapper.java b/spark/v3.3/spark/src/main/java/org/apache/iceberg/spark/source/InternalRowWrapper.java index ef1eb08d873c..524266f6f83a 100644 --- a/spark/v3.3/spark/src/main/java/org/apache/iceberg/spark/source/InternalRowWrapper.java +++ b/spark/v3.3/spark/src/main/java/org/apache/iceberg/spark/source/InternalRowWrapper.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
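For readers of the IcebergSource javadoc above, here is a minimal usage sketch of the path/table resolution it describes. This is not part of the diff; the catalog and table names (my_catalog, db.events, events) are placeholders and an existing Spark deployment is assumed:

import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SparkSession;

class IcebergSourceLoadExamples {
  public static void main(String[] args) {
    SparkSession spark = SparkSession.builder().appName("iceberg-load-examples").getOrCreate();

    // "file:/path/to/table" -> loads a HadoopTable at the given path
    Dataset<Row> byPath = spark.read().format("iceberg").load("file:/path/to/table");

    // "tablename" -> loads currentCatalog.currentNamespace.tablename
    Dataset<Row> byName = spark.read().format("iceberg").load("events");

    // "catalog.namespace.tablename" -> a matching catalog takes priority over namespace resolution
    Dataset<Row> fromCatalog = spark.read().format("iceberg").load("my_catalog.db.events");

    byPath.show();
    byName.show();
    fromCatalog.show();
  }
}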
*/ - package org.apache.iceberg.spark.source; import java.nio.ByteBuffer; @@ -32,8 +31,8 @@ import org.apache.spark.sql.types.StructType; /** - * Class to adapt a Spark {@code InternalRow} to Iceberg {@link StructLike} for uses like - * {@link org.apache.iceberg.PartitionKey#partition(StructLike)} + * Class to adapt a Spark {@code InternalRow} to Iceberg {@link StructLike} for uses like {@link + * org.apache.iceberg.PartitionKey#partition(StructLike)} */ class InternalRowWrapper implements StructLike { private final DataType[] types; @@ -42,12 +41,8 @@ class InternalRowWrapper implements StructLike { @SuppressWarnings("unchecked") InternalRowWrapper(StructType rowType) { - this.types = Stream.of(rowType.fields()) - .map(StructField::dataType) - .toArray(DataType[]::new); - this.getters = Stream.of(types) - .map(InternalRowWrapper::getter) - .toArray(BiFunction[]::new); + this.types = Stream.of(rowType.fields()).map(StructField::dataType).toArray(DataType[]::new); + this.getters = Stream.of(types).map(InternalRowWrapper::getter).toArray(BiFunction[]::new); } InternalRowWrapper wrap(InternalRow internalRow) { diff --git a/spark/v3.3/spark/src/main/java/org/apache/iceberg/spark/source/RowDataReader.java b/spark/v3.3/spark/src/main/java/org/apache/iceberg/spark/source/RowDataReader.java index 85903461970e..3778049cc71a 100644 --- a/spark/v3.3/spark/src/main/java/org/apache/iceberg/spark/source/RowDataReader.java +++ b/spark/v3.3/spark/src/main/java/org/apache/iceberg/spark/source/RowDataReader.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.source; import java.util.Map; @@ -35,7 +34,8 @@ import org.apache.spark.sql.catalyst.InternalRow; class RowDataReader extends BaseRowReader { - RowDataReader(ScanTaskGroup task, Table table, Schema expectedSchema, boolean caseSensitive) { + RowDataReader( + ScanTaskGroup task, Table table, Schema expectedSchema, boolean caseSensitive) { super(table, task, expectedSchema, caseSensitive); } @@ -59,13 +59,21 @@ protected CloseableIterator open(FileScanTask task) { return deleteFilter.filter(open(task, requiredSchema, idToConstant)).iterator(); } - protected CloseableIterable open(FileScanTask task, Schema readSchema, Map idToConstant) { + protected CloseableIterable open( + FileScanTask task, Schema readSchema, Map idToConstant) { if (task.isDataTask()) { return newDataIterable(task.asDataTask(), readSchema); } else { InputFile inputFile = getInputFile(task.file().path().toString()); - Preconditions.checkNotNull(inputFile, "Could not find InputFile associated with FileScanTask"); - return newIterable(inputFile, task.file().format(), task.start(), task.length(), task.residual(), readSchema, + Preconditions.checkNotNull( + inputFile, "Could not find InputFile associated with FileScanTask"); + return newIterable( + inputFile, + task.file().format(), + task.start(), + task.length(), + task.residual(), + readSchema, idToConstant); } } diff --git a/spark/v3.3/spark/src/main/java/org/apache/iceberg/spark/source/RowDataRewriter.java b/spark/v3.3/spark/src/main/java/org/apache/iceberg/spark/source/RowDataRewriter.java index 63cc3a466c1a..aee0d4f0586b 100644 --- a/spark/v3.3/spark/src/main/java/org/apache/iceberg/spark/source/RowDataRewriter.java +++ b/spark/v3.3/spark/src/main/java/org/apache/iceberg/spark/source/RowDataRewriter.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.spark.source; import java.io.Serializable; @@ -55,23 +54,25 @@ public class RowDataRewriter implements Serializable { private final FileFormat format; private final boolean caseSensitive; - public RowDataRewriter(Broadcast
<Table>
    tableBroadcast, PartitionSpec spec, boolean caseSensitive) { + public RowDataRewriter( + Broadcast<Table>
    tableBroadcast, PartitionSpec spec, boolean caseSensitive) { this.tableBroadcast = tableBroadcast; this.spec = spec; this.caseSensitive = caseSensitive; Table table = tableBroadcast.value(); - String formatString = table.properties().getOrDefault( - TableProperties.DEFAULT_FILE_FORMAT, TableProperties.DEFAULT_FILE_FORMAT_DEFAULT); + String formatString = + table + .properties() + .getOrDefault( + TableProperties.DEFAULT_FILE_FORMAT, TableProperties.DEFAULT_FILE_FORMAT_DEFAULT); this.format = FileFormat.valueOf(formatString.toUpperCase(Locale.ENGLISH)); } public List rewriteDataForTasks(JavaRDD taskRDD) { JavaRDD> dataFilesRDD = taskRDD.map(this::rewriteDataForTask); - return dataFilesRDD.collect().stream() - .flatMap(Collection::stream) - .collect(Collectors.toList()); + return dataFilesRDD.collect().stream().flatMap(Collection::stream).collect(Collectors.toList()); } private List rewriteDataForTask(CombinedScanTask task) throws Exception { @@ -86,28 +87,44 @@ private List rewriteDataForTask(CombinedScanTask task) throws Exceptio RowDataReader dataReader = new RowDataReader(task, table, schema, caseSensitive); StructType structType = SparkSchemaUtil.convert(schema); - SparkAppenderFactory appenderFactory = SparkAppenderFactory.builderFor(table, schema, structType) - .spec(spec) - .build(); - OutputFileFactory fileFactory = OutputFileFactory.builderFor(table, partitionId, taskId) - .defaultSpec(spec) - .format(format) - .build(); + SparkAppenderFactory appenderFactory = + SparkAppenderFactory.builderFor(table, schema, structType).spec(spec).build(); + OutputFileFactory fileFactory = + OutputFileFactory.builderFor(table, partitionId, taskId) + .defaultSpec(spec) + .format(format) + .build(); TaskWriter writer; if (spec.isUnpartitioned()) { - writer = new UnpartitionedWriter<>(spec, format, appenderFactory, fileFactory, table.io(), - Long.MAX_VALUE); - } else if (PropertyUtil.propertyAsBoolean(properties, + writer = + new UnpartitionedWriter<>( + spec, format, appenderFactory, fileFactory, table.io(), Long.MAX_VALUE); + } else if (PropertyUtil.propertyAsBoolean( + properties, TableProperties.SPARK_WRITE_PARTITIONED_FANOUT_ENABLED, TableProperties.SPARK_WRITE_PARTITIONED_FANOUT_ENABLED_DEFAULT)) { - writer = new SparkPartitionedFanoutWriter( - spec, format, appenderFactory, fileFactory, table.io(), Long.MAX_VALUE, schema, - structType); + writer = + new SparkPartitionedFanoutWriter( + spec, + format, + appenderFactory, + fileFactory, + table.io(), + Long.MAX_VALUE, + schema, + structType); } else { - writer = new SparkPartitionedWriter( - spec, format, appenderFactory, fileFactory, table.io(), Long.MAX_VALUE, schema, - structType); + writer = + new SparkPartitionedWriter( + spec, + format, + appenderFactory, + fileFactory, + table.io(), + Long.MAX_VALUE, + schema, + structType); } try { @@ -127,14 +144,24 @@ private List rewriteDataForTask(CombinedScanTask task) throws Exceptio LOG.error("Aborting task", originalThrowable); context.markTaskFailed(originalThrowable); - LOG.error("Aborting commit for partition {} (task {}, attempt {}, stage {}.{})", - partitionId, taskId, context.attemptNumber(), context.stageId(), context.stageAttemptNumber()); + LOG.error( + "Aborting commit for partition {} (task {}, attempt {}, stage {}.{})", + partitionId, + taskId, + context.attemptNumber(), + context.stageId(), + context.stageAttemptNumber()); if (dataReader != null) { dataReader.close(); } writer.abort(); - LOG.error("Aborted commit for partition {} (task {}, attempt {}, stage {}.{})", - partitionId, 
taskId, context.taskAttemptId(), context.stageId(), context.stageAttemptNumber()); + LOG.error( + "Aborted commit for partition {} (task {}, attempt {}, stage {}.{})", + partitionId, + taskId, + context.taskAttemptId(), + context.stageId(), + context.stageAttemptNumber()); } catch (Throwable inner) { if (originalThrowable != inner) { diff --git a/spark/v3.3/spark/src/main/java/org/apache/iceberg/spark/source/SerializableTableWithSize.java b/spark/v3.3/spark/src/main/java/org/apache/iceberg/spark/source/SerializableTableWithSize.java index 6275c664410f..e3b81cea7cd1 100644 --- a/spark/v3.3/spark/src/main/java/org/apache/iceberg/spark/source/SerializableTableWithSize.java +++ b/spark/v3.3/spark/src/main/java/org/apache/iceberg/spark/source/SerializableTableWithSize.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.source; import org.apache.iceberg.BaseMetadataTable; @@ -25,9 +24,9 @@ import org.apache.spark.util.KnownSizeEstimation; /** - * This class provides a serializable table with a known size estimate. Spark calls - * its SizeEstimator class when broadcasting variables and this can be an expensive - * operation, so providing a known size estimate allows that operation to be skipped. + * This class provides a serializable table with a known size estimate. Spark calls its + * SizeEstimator class when broadcasting variables and this can be an expensive operation, so + * providing a known size estimate allows that operation to be skipped. */ public class SerializableTableWithSize extends SerializableTable implements KnownSizeEstimation { diff --git a/spark/v3.3/spark/src/main/java/org/apache/iceberg/spark/source/SparkAppenderFactory.java b/spark/v3.3/spark/src/main/java/org/apache/iceberg/spark/source/SparkAppenderFactory.java index 4becf666ed3e..6372edde0782 100644 --- a/spark/v3.3/spark/src/main/java/org/apache/iceberg/spark/source/SparkAppenderFactory.java +++ b/spark/v3.3/spark/src/main/java/org/apache/iceberg/spark/source/SparkAppenderFactory.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
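Context for the SerializableTableWithSize javadoc above: Spark skips its reflective SizeEstimator when a broadcast value implements KnownSizeEstimation. A minimal illustrative sketch of that contract follows; the class name and the size constant are hypothetical and this is not the implementation in the diff:

import java.io.Serializable;
import org.apache.spark.util.KnownSizeEstimation;

// A broadcastable value that reports a fixed size estimate so Spark's SizeEstimator
// does not have to walk the object graph when broadcasting it.
class FixedSizeBroadcastValue implements Serializable, KnownSizeEstimation {
  private static final long ESTIMATED_SIZE_BYTES = 32L * 1024 * 1024; // assumed placeholder

  @Override
  public long estimatedSize() {
    return ESTIMATED_SIZE_BYTES;
  }
}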
*/ - package org.apache.iceberg.spark.source; import java.io.IOException; @@ -61,8 +60,14 @@ class SparkAppenderFactory implements FileAppenderFactory { private StructType eqDeleteSparkType = null; private StructType posDeleteSparkType = null; - SparkAppenderFactory(Map properties, Schema writeSchema, StructType dsSchema, PartitionSpec spec, - int[] equalityFieldIds, Schema eqDeleteRowSchema, Schema posDeleteRowSchema) { + SparkAppenderFactory( + Map properties, + Schema writeSchema, + StructType dsSchema, + PartitionSpec spec, + int[] equalityFieldIds, + Schema eqDeleteRowSchema, + Schema posDeleteRowSchema) { this.properties = properties; this.writeSchema = writeSchema; this.dsSchema = dsSchema; @@ -85,7 +90,6 @@ static class Builder { private Schema eqDeleteRowSchema; private Schema posDeleteRowSchema; - Builder(Table table, Schema writeSchema, StructType dsSchema) { this.table = table; this.spec = table.spec(); @@ -118,16 +122,24 @@ SparkAppenderFactory build() { Preconditions.checkNotNull(writeSchema, "Write Schema must not be null"); Preconditions.checkNotNull(dsSchema, "DS Schema must not be null"); if (equalityFieldIds != null) { - Preconditions.checkNotNull(eqDeleteRowSchema, "Equality Field Ids and Equality Delete Row Schema" + - " must be set together"); + Preconditions.checkNotNull( + eqDeleteRowSchema, + "Equality Field Ids and Equality Delete Row Schema" + " must be set together"); } if (eqDeleteRowSchema != null) { - Preconditions.checkNotNull(equalityFieldIds, "Equality Field Ids and Equality Delete Row Schema" + - " must be set together"); + Preconditions.checkNotNull( + equalityFieldIds, + "Equality Field Ids and Equality Delete Row Schema" + " must be set together"); } - return new SparkAppenderFactory(table.properties(), writeSchema, dsSchema, spec, equalityFieldIds, - eqDeleteRowSchema, posDeleteRowSchema); + return new SparkAppenderFactory( + table.properties(), + writeSchema, + dsSchema, + spec, + equalityFieldIds, + eqDeleteRowSchema, + posDeleteRowSchema); } } @@ -141,7 +153,8 @@ private StructType lazyEqDeleteSparkType() { private StructType lazyPosDeleteSparkType() { if (posDeleteSparkType == null) { - Preconditions.checkNotNull(posDeleteRowSchema, "Position delete row schema shouldn't be null"); + Preconditions.checkNotNull( + posDeleteRowSchema, "Position delete row schema shouldn't be null"); this.posDeleteSparkType = SparkSchemaUtil.convert(posDeleteRowSchema); } return posDeleteSparkType; @@ -187,24 +200,33 @@ public FileAppender newAppender(OutputFile file, FileFormat fileFor } @Override - public DataWriter newDataWriter(EncryptedOutputFile file, FileFormat format, StructLike partition) { - return new DataWriter<>(newAppender(file.encryptingOutputFile(), format), format, - file.encryptingOutputFile().location(), spec, partition, file.keyMetadata()); + public DataWriter newDataWriter( + EncryptedOutputFile file, FileFormat format, StructLike partition) { + return new DataWriter<>( + newAppender(file.encryptingOutputFile(), format), + format, + file.encryptingOutputFile().location(), + spec, + partition, + file.keyMetadata()); } @Override - public EqualityDeleteWriter newEqDeleteWriter(EncryptedOutputFile file, FileFormat format, - StructLike partition) { - Preconditions.checkState(equalityFieldIds != null && equalityFieldIds.length > 0, + public EqualityDeleteWriter newEqDeleteWriter( + EncryptedOutputFile file, FileFormat format, StructLike partition) { + Preconditions.checkState( + equalityFieldIds != null && equalityFieldIds.length > 0, "Equality field 
ids shouldn't be null or empty when creating equality-delete writer"); - Preconditions.checkNotNull(eqDeleteRowSchema, + Preconditions.checkNotNull( + eqDeleteRowSchema, "Equality delete row schema shouldn't be null when creating equality-delete writer"); try { switch (format) { case PARQUET: return Parquet.writeDeletes(file.encryptingOutputFile()) - .createWriterFunc(msgType -> SparkParquetWriters.buildWriter(lazyEqDeleteSparkType(), msgType)) + .createWriterFunc( + msgType -> SparkParquetWriters.buildWriter(lazyEqDeleteSparkType(), msgType)) .overwrite() .rowSchema(eqDeleteRowSchema) .withSpec(spec) @@ -245,15 +267,16 @@ public EqualityDeleteWriter newEqDeleteWriter(EncryptedOutputFile f } @Override - public PositionDeleteWriter newPosDeleteWriter(EncryptedOutputFile file, FileFormat format, - StructLike partition) { + public PositionDeleteWriter newPosDeleteWriter( + EncryptedOutputFile file, FileFormat format, StructLike partition) { try { switch (format) { case PARQUET: StructType sparkPosDeleteSchema = SparkSchemaUtil.convert(DeleteSchemaUtil.posDeleteSchema(posDeleteRowSchema)); return Parquet.writeDeletes(file.encryptingOutputFile()) - .createWriterFunc(msgType -> SparkParquetWriters.buildWriter(sparkPosDeleteSchema, msgType)) + .createWriterFunc( + msgType -> SparkParquetWriters.buildWriter(sparkPosDeleteSchema, msgType)) .overwrite() .rowSchema(posDeleteRowSchema) .withSpec(spec) diff --git a/spark/v3.3/spark/src/main/java/org/apache/iceberg/spark/source/SparkBatch.java b/spark/v3.3/spark/src/main/java/org/apache/iceberg/spark/source/SparkBatch.java index 79d91205a236..6d8504794310 100644 --- a/spark/v3.3/spark/src/main/java/org/apache/iceberg/spark/source/SparkBatch.java +++ b/spark/v3.3/spark/src/main/java/org/apache/iceberg/spark/source/SparkBatch.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.source; import java.util.List; @@ -59,7 +58,8 @@ abstract class SparkBatch implements Batch { @Override public InputPartition[] planInputPartitions() { // broadcast the table metadata as input partitions will be sent to executors - Broadcast
    <Table> tableBroadcast = sparkContext.broadcast(SerializableTableWithSize.copyOf(table)); + Broadcast<Table>
    tableBroadcast = + sparkContext.broadcast(SerializableTableWithSize.copyOf(table)); String expectedSchemaString = SchemaParser.toJson(expectedSchema); InputPartition[] readTasks = new InputPartition[tasks().size()]; @@ -67,9 +67,15 @@ public InputPartition[] planInputPartitions() { Tasks.range(readTasks.length) .stopOnFailure() .executeWith(localityEnabled ? ThreadPools.getWorkerPool() : null) - .run(index -> readTasks[index] = new ReadTask( - tasks().get(index), tableBroadcast, expectedSchemaString, - caseSensitive, localityEnabled)); + .run( + index -> + readTasks[index] = + new ReadTask( + tasks().get(index), + tableBroadcast, + expectedSchemaString, + caseSensitive, + localityEnabled)); return readTasks; } @@ -96,25 +102,32 @@ private int batchSize() { } private boolean parquetOnly() { - return tasks().stream().allMatch(task -> !task.isDataTask() && onlyFileFormat(task, FileFormat.PARQUET)); + return tasks().stream() + .allMatch(task -> !task.isDataTask() && onlyFileFormat(task, FileFormat.PARQUET)); } private boolean parquetBatchReadsEnabled() { - return readConf.parquetVectorizationEnabled() && // vectorization enabled - expectedSchema.columns().size() > 0 && // at least one column is projected - expectedSchema.columns().stream().allMatch(c -> c.type().isPrimitiveType()); // only primitives + return readConf.parquetVectorizationEnabled() + && // vectorization enabled + expectedSchema.columns().size() > 0 + && // at least one column is projected + expectedSchema.columns().stream() + .allMatch(c -> c.type().isPrimitiveType()); // only primitives } private boolean orcOnly() { - return tasks().stream().allMatch(task -> !task.isDataTask() && onlyFileFormat(task, FileFormat.ORC)); + return tasks().stream() + .allMatch(task -> !task.isDataTask() && onlyFileFormat(task, FileFormat.ORC)); } private boolean orcBatchReadsEnabled() { - return readConf.orcVectorizationEnabled() && // vectorization enabled + return readConf.orcVectorizationEnabled() + && // vectorization enabled tasks().stream().noneMatch(TableScanUtil::hasDeletes); // no delete files } private boolean onlyFileFormat(CombinedScanTask task, FileFormat fileFormat) { - return task.files().stream().allMatch(fileScanTask -> fileScanTask.file().format().equals(fileFormat)); + return task.files().stream() + .allMatch(fileScanTask -> fileScanTask.file().format().equals(fileFormat)); } } diff --git a/spark/v3.3/spark/src/main/java/org/apache/iceberg/spark/source/SparkBatchQueryScan.java b/spark/v3.3/spark/src/main/java/org/apache/iceberg/spark/source/SparkBatchQueryScan.java index 651a411ebd7b..59dd8759968f 100644 --- a/spark/v3.3/spark/src/main/java/org/apache/iceberg/spark/source/SparkBatchQueryScan.java +++ b/spark/v3.3/spark/src/main/java/org/apache/iceberg/spark/source/SparkBatchQueryScan.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.spark.source; import java.io.IOException; @@ -74,8 +73,13 @@ class SparkBatchQueryScan extends SparkScan implements SupportsRuntimeFiltering private List files = null; // lazy cache of files private List tasks = null; // lazy cache of tasks - SparkBatchQueryScan(SparkSession spark, Table table, TableScan scan, SparkReadConf readConf, - Schema expectedSchema, List filters) { + SparkBatchQueryScan( + SparkSession spark, + Table table, + TableScan scan, + SparkReadConf readConf, + Schema expectedSchema, + List filters) { super(spark, table, readConf, expectedSchema, filters); @@ -124,12 +128,12 @@ private List files() { @Override protected List tasks() { if (tasks == null) { - CloseableIterable splitFiles = TableScanUtil.splitFiles( - CloseableIterable.withNoopClose(files()), - scan.targetSplitSize()); - CloseableIterable scanTasks = TableScanUtil.planTasks( - splitFiles, scan.targetSplitSize(), - scan.splitLookback(), scan.splitOpenFileCost()); + CloseableIterable splitFiles = + TableScanUtil.splitFiles( + CloseableIterable.withNoopClose(files()), scan.targetSplitSize()); + CloseableIterable scanTasks = + TableScanUtil.planTasks( + splitFiles, scan.targetSplitSize(), scan.splitLookback(), scan.splitOpenFileCost()); tasks = Lists.newArrayList(scanTasks); } @@ -168,21 +172,29 @@ public void filter(Filter[] filters) { for (Integer specId : specIds()) { PartitionSpec spec = table().specs().get(specId); - Expression inclusiveExpr = Projections.inclusive(spec, caseSensitive()).project(runtimeFilterExpr); + Expression inclusiveExpr = + Projections.inclusive(spec, caseSensitive()).project(runtimeFilterExpr); Evaluator inclusive = new Evaluator(spec.partitionType(), inclusiveExpr); evaluatorsBySpecId.put(specId, inclusive); } - LOG.info("Trying to filter {} files using runtime filter {}", files().size(), runtimeFilterExpr); + LOG.info( + "Trying to filter {} files using runtime filter {}", files().size(), runtimeFilterExpr); - List filteredFiles = files().stream() - .filter(file -> { - Evaluator evaluator = evaluatorsBySpecId.get(file.spec().specId()); - return evaluator.eval(file.file().partition()); - }) - .collect(Collectors.toList()); + List filteredFiles = + files().stream() + .filter( + file -> { + Evaluator evaluator = evaluatorsBySpecId.get(file.spec().specId()); + return evaluator.eval(file.file().partition()); + }) + .collect(Collectors.toList()); - LOG.info("{}/{} files matched runtime filter {}", filteredFiles.size(), files().size(), runtimeFilterExpr); + LOG.info( + "{}/{} files matched runtime filter {}", + filteredFiles.size(), + files().size(), + runtimeFilterExpr); // don't invalidate tasks if the runtime filter had no effect to avoid planning splits again if (filteredFiles.size() < files().size()) { @@ -249,27 +261,38 @@ public boolean equals(Object o) { } SparkBatchQueryScan that = (SparkBatchQueryScan) o; - return table().name().equals(that.table().name()) && - readSchema().equals(that.readSchema()) && // compare Spark schemas to ignore field ids - filterExpressions().toString().equals(that.filterExpressions().toString()) && - runtimeFilterExpressions.toString().equals(that.runtimeFilterExpressions.toString()) && - Objects.equals(snapshotId, that.snapshotId) && - Objects.equals(startSnapshotId, that.startSnapshotId) && - Objects.equals(endSnapshotId, that.endSnapshotId) && - Objects.equals(asOfTimestamp, that.asOfTimestamp); + return table().name().equals(that.table().name()) + && readSchema().equals(that.readSchema()) + && // compare Spark schemas 
to ignore field ids + filterExpressions().toString().equals(that.filterExpressions().toString()) + && runtimeFilterExpressions.toString().equals(that.runtimeFilterExpressions.toString()) + && Objects.equals(snapshotId, that.snapshotId) + && Objects.equals(startSnapshotId, that.startSnapshotId) + && Objects.equals(endSnapshotId, that.endSnapshotId) + && Objects.equals(asOfTimestamp, that.asOfTimestamp); } @Override public int hashCode() { return Objects.hash( - table().name(), readSchema(), filterExpressions().toString(), runtimeFilterExpressions.toString(), - snapshotId, startSnapshotId, endSnapshotId, asOfTimestamp); + table().name(), + readSchema(), + filterExpressions().toString(), + runtimeFilterExpressions.toString(), + snapshotId, + startSnapshotId, + endSnapshotId, + asOfTimestamp); } @Override public String toString() { return String.format( "IcebergScan(table=%s, type=%s, filters=%s, runtimeFilters=%s, caseSensitive=%s)", - table(), expectedSchema().asStruct(), filterExpressions(), runtimeFilterExpressions, caseSensitive()); + table(), + expectedSchema().asStruct(), + filterExpressions(), + runtimeFilterExpressions, + caseSensitive()); } } diff --git a/spark/v3.3/spark/src/main/java/org/apache/iceberg/spark/source/SparkCopyOnWriteOperation.java b/spark/v3.3/spark/src/main/java/org/apache/iceberg/spark/source/SparkCopyOnWriteOperation.java index 19b4ab7a4087..72c243fcbcf7 100644 --- a/spark/v3.3/spark/src/main/java/org/apache/iceberg/spark/source/SparkCopyOnWriteOperation.java +++ b/spark/v3.3/spark/src/main/java/org/apache/iceberg/spark/source/SparkCopyOnWriteOperation.java @@ -16,9 +16,11 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.source; +import static org.apache.spark.sql.connector.write.RowLevelOperation.Command.DELETE; +import static org.apache.spark.sql.connector.write.RowLevelOperation.Command.UPDATE; + import org.apache.iceberg.IsolationLevel; import org.apache.iceberg.MetadataColumns; import org.apache.iceberg.Table; @@ -34,9 +36,6 @@ import org.apache.spark.sql.connector.write.WriteBuilder; import org.apache.spark.sql.util.CaseInsensitiveStringMap; -import static org.apache.spark.sql.connector.write.RowLevelOperation.Command.DELETE; -import static org.apache.spark.sql.connector.write.RowLevelOperation.Command.UPDATE; - class SparkCopyOnWriteOperation implements RowLevelOperation { private final SparkSession spark; @@ -49,8 +48,8 @@ class SparkCopyOnWriteOperation implements RowLevelOperation { private Scan configuredScan; private WriteBuilder lazyWriteBuilder; - SparkCopyOnWriteOperation(SparkSession spark, Table table, RowLevelOperationInfo info, - IsolationLevel isolationLevel) { + SparkCopyOnWriteOperation( + SparkSession spark, Table table, RowLevelOperationInfo info, IsolationLevel isolationLevel) { this.spark = spark; this.table = table; this.command = info.command(); @@ -65,14 +64,15 @@ public Command command() { @Override public ScanBuilder newScanBuilder(CaseInsensitiveStringMap options) { if (lazyScanBuilder == null) { - lazyScanBuilder = new SparkScanBuilder(spark, table, options) { - @Override - public Scan build() { - Scan scan = super.buildCopyOnWriteScan(); - SparkCopyOnWriteOperation.this.configuredScan = scan; - return scan; - } - }; + lazyScanBuilder = + new SparkScanBuilder(spark, table, options) { + @Override + public Scan build() { + Scan scan = super.buildCopyOnWriteScan(); + SparkCopyOnWriteOperation.this.configuredScan = scan; + return scan; + } + }; } return 
lazyScanBuilder; @@ -95,9 +95,9 @@ public NamedReference[] requiredMetadataAttributes() { NamedReference pos = Expressions.column(MetadataColumns.ROW_POSITION.name()); if (command == DELETE || command == UPDATE) { - return new NamedReference[]{file, pos}; + return new NamedReference[] {file, pos}; } else { - return new NamedReference[]{file}; + return new NamedReference[] {file}; } } } diff --git a/spark/v3.3/spark/src/main/java/org/apache/iceberg/spark/source/SparkCopyOnWriteScan.java b/spark/v3.3/spark/src/main/java/org/apache/iceberg/spark/source/SparkCopyOnWriteScan.java index 49d90f5f2b13..4efd5180e27b 100644 --- a/spark/v3.3/spark/src/main/java/org/apache/iceberg/spark/source/SparkCopyOnWriteScan.java +++ b/spark/v3.3/spark/src/main/java/org/apache/iceberg/spark/source/SparkCopyOnWriteScan.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.source; import java.io.IOException; @@ -57,13 +56,23 @@ class SparkCopyOnWriteScan extends SparkScan implements SupportsRuntimeFiltering private List tasks = null; // lazy cache of tasks private Set filteredLocations = null; - SparkCopyOnWriteScan(SparkSession spark, Table table, SparkReadConf readConf, - Schema expectedSchema, List filters) { + SparkCopyOnWriteScan( + SparkSession spark, + Table table, + SparkReadConf readConf, + Schema expectedSchema, + List filters) { this(spark, table, null, null, readConf, expectedSchema, filters); } - SparkCopyOnWriteScan(SparkSession spark, Table table, TableScan scan, Snapshot snapshot, - SparkReadConf readConf, Schema expectedSchema, List filters) { + SparkCopyOnWriteScan( + SparkSession spark, + Table table, + TableScan scan, + Snapshot snapshot, + SparkReadConf readConf, + Schema expectedSchema, + List filters) { super(spark, table, readConf, expectedSchema, filters); @@ -88,14 +97,15 @@ public Statistics estimateStatistics() { public NamedReference[] filterAttributes() { NamedReference file = Expressions.column(MetadataColumns.FILE_PATH.name()); - return new NamedReference[]{file}; + return new NamedReference[] {file}; } @Override public void filter(Filter[] filters) { for (Filter filter : filters) { // Spark can only pass In filters at the moment - if (filter instanceof In && ((In) filter).attribute().equalsIgnoreCase(MetadataColumns.FILE_PATH.name())) { + if (filter instanceof In + && ((In) filter).attribute().equalsIgnoreCase(MetadataColumns.FILE_PATH.name())) { In in = (In) filter; Set fileLocations = Sets.newHashSet(); @@ -109,9 +119,10 @@ public void filter(Filter[] filters) { if (filteredLocations == null || fileLocations.size() < filteredLocations.size()) { this.tasks = null; this.filteredLocations = fileLocations; - this.files = files().stream() - .filter(file -> fileLocations.contains(file.file().path().toString())) - .collect(Collectors.toList()); + this.files = + files().stream() + .filter(file -> fileLocations.contains(file.file().path().toString())) + .collect(Collectors.toList()); } } } @@ -133,12 +144,12 @@ synchronized List files() { @Override protected synchronized List tasks() { if (tasks == null) { - CloseableIterable splitFiles = TableScanUtil.splitFiles( - CloseableIterable.withNoopClose(files()), - scan.targetSplitSize()); - CloseableIterable scanTasks = TableScanUtil.planTasks( - splitFiles, scan.targetSplitSize(), - scan.splitLookback(), scan.splitOpenFileCost()); + CloseableIterable splitFiles = + TableScanUtil.splitFiles( + CloseableIterable.withNoopClose(files()), scan.targetSplitSize()); 
+ CloseableIterable scanTasks = + TableScanUtil.planTasks( + splitFiles, scan.targetSplitSize(), scan.splitLookback(), scan.splitOpenFileCost()); tasks = Lists.newArrayList(scanTasks); } @@ -156,18 +167,22 @@ public boolean equals(Object o) { } SparkCopyOnWriteScan that = (SparkCopyOnWriteScan) o; - return table().name().equals(that.table().name()) && - readSchema().equals(that.readSchema()) && // compare Spark schemas to ignore field ids - filterExpressions().toString().equals(that.filterExpressions().toString()) && - Objects.equals(snapshotId(), that.snapshotId()) && - Objects.equals(filteredLocations, that.filteredLocations); + return table().name().equals(that.table().name()) + && readSchema().equals(that.readSchema()) + && // compare Spark schemas to ignore field ids + filterExpressions().toString().equals(that.filterExpressions().toString()) + && Objects.equals(snapshotId(), that.snapshotId()) + && Objects.equals(filteredLocations, that.filteredLocations); } @Override public int hashCode() { return Objects.hash( - table().name(), readSchema(), filterExpressions().toString(), - snapshotId(), filteredLocations); + table().name(), + readSchema(), + filterExpressions().toString(), + snapshotId(), + filteredLocations); } @Override diff --git a/spark/v3.3/spark/src/main/java/org/apache/iceberg/spark/source/SparkFileWriterFactory.java b/spark/v3.3/spark/src/main/java/org/apache/iceberg/spark/source/SparkFileWriterFactory.java index beaa7c295024..a8c894bfc50c 100644 --- a/spark/v3.3/spark/src/main/java/org/apache/iceberg/spark/source/SparkFileWriterFactory.java +++ b/spark/v3.3/spark/src/main/java/org/apache/iceberg/spark/source/SparkFileWriterFactory.java @@ -16,9 +16,13 @@ * specific language governing permissions and limitations * under the License. 
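Background for the copy-on-write changes above: DELETE and UPDATE request the _file and _pos metadata attributes (MetadataColumns.FILE_PATH and ROW_POSITION) so matching rows can be traced back to the data files that must be rewritten. A hedged sketch of projecting those metadata columns in a plain read; the table name and SparkSession setup are placeholders, not part of this diff:

import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SparkSession;

class MetadataColumnExample {
  public static void main(String[] args) {
    SparkSession spark = SparkSession.builder().appName("metadata-columns").getOrCreate();

    // Project the file path and row position metadata columns alongside the regular data,
    // the same attributes SparkCopyOnWriteOperation requires for DELETE/UPDATE.
    Dataset<Row> withMetadata =
        spark.read().format("iceberg").load("my_catalog.db.events").selectExpr("_file", "_pos", "*");

    withMetadata.show(10, false);
  }
}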
*/ - package org.apache.iceberg.spark.source; +import static org.apache.iceberg.MetadataColumns.DELETE_FILE_ROW_FIELD_NAME; +import static org.apache.iceberg.TableProperties.DEFAULT_FILE_FORMAT; +import static org.apache.iceberg.TableProperties.DEFAULT_FILE_FORMAT_DEFAULT; +import static org.apache.iceberg.TableProperties.DELETE_DEFAULT_FILE_FORMAT; + import java.util.Locale; import java.util.Map; import org.apache.iceberg.FileFormat; @@ -40,24 +44,35 @@ import org.apache.spark.sql.types.StructType; import org.apache.spark.unsafe.types.UTF8String; -import static org.apache.iceberg.MetadataColumns.DELETE_FILE_ROW_FIELD_NAME; -import static org.apache.iceberg.TableProperties.DEFAULT_FILE_FORMAT; -import static org.apache.iceberg.TableProperties.DEFAULT_FILE_FORMAT_DEFAULT; -import static org.apache.iceberg.TableProperties.DELETE_DEFAULT_FILE_FORMAT; - class SparkFileWriterFactory extends BaseFileWriterFactory { private StructType dataSparkType; private StructType equalityDeleteSparkType; private StructType positionDeleteSparkType; - SparkFileWriterFactory(Table table, FileFormat dataFileFormat, Schema dataSchema, StructType dataSparkType, - SortOrder dataSortOrder, FileFormat deleteFileFormat, - int[] equalityFieldIds, Schema equalityDeleteRowSchema, StructType equalityDeleteSparkType, - SortOrder equalityDeleteSortOrder, Schema positionDeleteRowSchema, - StructType positionDeleteSparkType) { - - super(table, dataFileFormat, dataSchema, dataSortOrder, deleteFileFormat, equalityFieldIds, - equalityDeleteRowSchema, equalityDeleteSortOrder, positionDeleteRowSchema); + SparkFileWriterFactory( + Table table, + FileFormat dataFileFormat, + Schema dataSchema, + StructType dataSparkType, + SortOrder dataSortOrder, + FileFormat deleteFileFormat, + int[] equalityFieldIds, + Schema equalityDeleteRowSchema, + StructType equalityDeleteSparkType, + SortOrder equalityDeleteSortOrder, + Schema positionDeleteRowSchema, + StructType positionDeleteSparkType) { + + super( + table, + dataFileFormat, + dataSchema, + dataSortOrder, + deleteFileFormat, + equalityFieldIds, + equalityDeleteRowSchema, + equalityDeleteSortOrder, + positionDeleteRowSchema); this.dataSparkType = dataSparkType; this.equalityDeleteSparkType = equalityDeleteSparkType; @@ -80,7 +95,8 @@ protected void configureEqualityDelete(Avro.DeleteWriteBuilder builder) { @Override protected void configurePositionDelete(Avro.DeleteWriteBuilder builder) { - boolean withRow = positionDeleteSparkType().getFieldIndex(DELETE_FILE_ROW_FIELD_NAME).isDefined(); + boolean withRow = + positionDeleteSparkType().getFieldIndex(DELETE_FILE_ROW_FIELD_NAME).isDefined(); if (withRow) { // SparkAvroWriter accepts just the Spark type of the row ignoring the path and pos StructField rowField = positionDeleteSparkType().apply(DELETE_FILE_ROW_FIELD_NAME); @@ -96,12 +112,14 @@ protected void configureDataWrite(Parquet.DataWriteBuilder builder) { @Override protected void configureEqualityDelete(Parquet.DeleteWriteBuilder builder) { - builder.createWriterFunc(msgType -> SparkParquetWriters.buildWriter(equalityDeleteSparkType(), msgType)); + builder.createWriterFunc( + msgType -> SparkParquetWriters.buildWriter(equalityDeleteSparkType(), msgType)); } @Override protected void configurePositionDelete(Parquet.DeleteWriteBuilder builder) { - builder.createWriterFunc(msgType -> SparkParquetWriters.buildWriter(positionDeleteSparkType(), msgType)); + builder.createWriterFunc( + msgType -> SparkParquetWriters.buildWriter(positionDeleteSparkType(), msgType)); builder.transformPaths(path 
-> UTF8String.fromString(path.toString())); } @@ -132,7 +150,8 @@ private StructType dataSparkType() { private StructType equalityDeleteSparkType() { if (equalityDeleteSparkType == null) { - Preconditions.checkNotNull(equalityDeleteRowSchema(), "Equality delete schema must not be null"); + Preconditions.checkNotNull( + equalityDeleteRowSchema(), "Equality delete schema must not be null"); this.equalityDeleteSparkType = SparkSchemaUtil.convert(equalityDeleteRowSchema()); } @@ -141,7 +160,8 @@ private StructType equalityDeleteSparkType() { private StructType positionDeleteSparkType() { if (positionDeleteSparkType == null) { - // wrap the optional row schema into the position delete schema that contains path and position + // wrap the optional row schema into the position delete schema that contains path and + // position Schema positionDeleteSchema = DeleteSchemaUtil.posDeleteSchema(positionDeleteRowSchema()); this.positionDeleteSparkType = SparkSchemaUtil.convert(positionDeleteSchema); } @@ -168,10 +188,12 @@ static class Builder { Map properties = table.properties(); - String dataFileFormatName = properties.getOrDefault(DEFAULT_FILE_FORMAT, DEFAULT_FILE_FORMAT_DEFAULT); + String dataFileFormatName = + properties.getOrDefault(DEFAULT_FILE_FORMAT, DEFAULT_FILE_FORMAT_DEFAULT); this.dataFileFormat = FileFormat.valueOf(dataFileFormatName.toUpperCase(Locale.ENGLISH)); - String deleteFileFormatName = properties.getOrDefault(DELETE_DEFAULT_FILE_FORMAT, dataFileFormatName); + String deleteFileFormatName = + properties.getOrDefault(DELETE_DEFAULT_FILE_FORMAT, dataFileFormatName); this.deleteFileFormat = FileFormat.valueOf(deleteFileFormatName.toUpperCase(Locale.ENGLISH)); } @@ -233,13 +255,23 @@ Builder positionDeleteSparkType(StructType newPositionDeleteSparkType) { SparkFileWriterFactory build() { boolean noEqualityDeleteConf = equalityFieldIds == null && equalityDeleteRowSchema == null; boolean fullEqualityDeleteConf = equalityFieldIds != null && equalityDeleteRowSchema != null; - Preconditions.checkArgument(noEqualityDeleteConf || fullEqualityDeleteConf, + Preconditions.checkArgument( + noEqualityDeleteConf || fullEqualityDeleteConf, "Equality field IDs and equality delete row schema must be set together"); return new SparkFileWriterFactory( - table, dataFileFormat, dataSchema, dataSparkType, dataSortOrder, deleteFileFormat, - equalityFieldIds, equalityDeleteRowSchema, equalityDeleteSparkType, equalityDeleteSortOrder, - positionDeleteRowSchema, positionDeleteSparkType); + table, + dataFileFormat, + dataSchema, + dataSparkType, + dataSortOrder, + deleteFileFormat, + equalityFieldIds, + equalityDeleteRowSchema, + equalityDeleteSparkType, + equalityDeleteSortOrder, + positionDeleteRowSchema, + positionDeleteSparkType); } } } diff --git a/spark/v3.3/spark/src/main/java/org/apache/iceberg/spark/source/SparkFilesScan.java b/spark/v3.3/spark/src/main/java/org/apache/iceberg/spark/source/SparkFilesScan.java index 285bdaec851f..d40009c9f899 100644 --- a/spark/v3.3/spark/src/main/java/org/apache/iceberg/spark/source/SparkFilesScan.java +++ b/spark/v3.3/spark/src/main/java/org/apache/iceberg/spark/source/SparkFilesScan.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
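The SparkFileWriterFactory.Builder above derives the data file format from TableProperties.DEFAULT_FILE_FORMAT and the delete file format from DELETE_DEFAULT_FILE_FORMAT, falling back to the data format when the latter is unset. A small sketch of configuring those defaults through the table API; the table handle and the chosen formats are illustrative, not part of this diff:

import org.apache.iceberg.Table;
import org.apache.iceberg.TableProperties;

class WriteFormatDefaults {
  // Set ORC as the default data file format and keep Parquet for delete files.
  static void configure(Table table) {
    table
        .updateProperties()
        .set(TableProperties.DEFAULT_FILE_FORMAT, "orc")
        .set(TableProperties.DELETE_DEFAULT_FILE_FORMAT, "parquet")
        .commit();
  }
}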
*/ - package org.apache.iceberg.spark.source; import java.util.List; @@ -55,16 +54,18 @@ protected List tasks() { if (tasks == null) { FileScanTaskSetManager taskSetManager = FileScanTaskSetManager.get(); List files = taskSetManager.fetchTasks(table(), taskSetID); - ValidationException.check(files != null, + ValidationException.check( + files != null, "Task set manager has no tasks for table %s with id %s", - table(), taskSetID); + table(), + taskSetID); - CloseableIterable splitFiles = TableScanUtil.splitFiles( - CloseableIterable.withNoopClose(files), - splitSize); - CloseableIterable scanTasks = TableScanUtil.planTasks( - splitFiles, splitSize, - splitLookback, splitOpenFileCost); + CloseableIterable splitFiles = + TableScanUtil.splitFiles(CloseableIterable.withNoopClose(files), splitSize); + CloseableIterable scanTasks = + TableScanUtil.planTasks( + splitFiles, splitSize, + splitLookback, splitOpenFileCost); this.tasks = Lists.newArrayList(scanTasks); } @@ -82,11 +83,11 @@ public boolean equals(Object other) { } SparkFilesScan that = (SparkFilesScan) other; - return table().name().equals(that.table().name()) && - Objects.equals(taskSetID, that.taskSetID) && - Objects.equals(splitSize, that.splitSize) && - Objects.equals(splitLookback, that.splitLookback) && - Objects.equals(splitOpenFileCost, that.splitOpenFileCost); + return table().name().equals(that.table().name()) + && Objects.equals(taskSetID, that.taskSetID) + && Objects.equals(splitSize, that.splitSize) + && Objects.equals(splitLookback, that.splitLookback) + && Objects.equals(splitOpenFileCost, that.splitOpenFileCost); } @Override diff --git a/spark/v3.3/spark/src/main/java/org/apache/iceberg/spark/source/SparkFilesScanBuilder.java b/spark/v3.3/spark/src/main/java/org/apache/iceberg/spark/source/SparkFilesScanBuilder.java index 6ae63b37d3b4..03ab3aa062d3 100644 --- a/spark/v3.3/spark/src/main/java/org/apache/iceberg/spark/source/SparkFilesScanBuilder.java +++ b/spark/v3.3/spark/src/main/java/org/apache/iceberg/spark/source/SparkFilesScanBuilder.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.source; import org.apache.iceberg.Table; diff --git a/spark/v3.3/spark/src/main/java/org/apache/iceberg/spark/source/SparkMetadataColumn.java b/spark/v3.3/spark/src/main/java/org/apache/iceberg/spark/source/SparkMetadataColumn.java index 638cb7275638..94f87c28741d 100644 --- a/spark/v3.3/spark/src/main/java/org/apache/iceberg/spark/source/SparkMetadataColumn.java +++ b/spark/v3.3/spark/src/main/java/org/apache/iceberg/spark/source/SparkMetadataColumn.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.source; import org.apache.spark.sql.connector.catalog.MetadataColumn; diff --git a/spark/v3.3/spark/src/main/java/org/apache/iceberg/spark/source/SparkMicroBatchStream.java b/spark/v3.3/spark/src/main/java/org/apache/iceberg/spark/source/SparkMicroBatchStream.java index ed96e10fe6ff..498f8198041f 100644 --- a/spark/v3.3/spark/src/main/java/org/apache/iceberg/spark/source/SparkMicroBatchStream.java +++ b/spark/v3.3/spark/src/main/java/org/apache/iceberg/spark/source/SparkMicroBatchStream.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.spark.source; import java.io.BufferedWriter; @@ -81,8 +80,12 @@ public class SparkMicroBatchStream implements MicroBatchStream { private final boolean skipOverwrite; private final Long fromTimestamp; - SparkMicroBatchStream(JavaSparkContext sparkContext, Table table, SparkReadConf readConf, - Schema expectedSchema, String checkpointLocation) { + SparkMicroBatchStream( + JavaSparkContext sparkContext, + Table table, + SparkReadConf readConf, + Schema expectedSchema, + String checkpointLocation) { this.table = table; this.caseSensitive = readConf.caseSensitive(); this.expectedSchema = SchemaParser.toJson(expectedSchema); @@ -93,7 +96,8 @@ public class SparkMicroBatchStream implements MicroBatchStream { this.splitOpenFileCost = readConf.splitOpenFileCost(); this.fromTimestamp = readConf.streamFromTimestamp(); - InitialOffsetStore initialOffsetStore = new InitialOffsetStore(table, checkpointLocation, fromTimestamp); + InitialOffsetStore initialOffsetStore = + new InitialOffsetStore(table, checkpointLocation, fromTimestamp); this.initialOffset = initialOffsetStore.initialOffset(); this.skipDelete = readConf.streamingSkipDeleteSnapshots(); @@ -112,19 +116,25 @@ public Offset latestOffset() { } Snapshot latestSnapshot = table.currentSnapshot(); - long addedFilesCount = PropertyUtil.propertyAsLong(latestSnapshot.summary(), SnapshotSummary.ADDED_FILES_PROP, -1); - // If snapshotSummary doesn't have SnapshotSummary.ADDED_FILES_PROP, iterate through addedFiles iterator to find + long addedFilesCount = + PropertyUtil.propertyAsLong(latestSnapshot.summary(), SnapshotSummary.ADDED_FILES_PROP, -1); + // If snapshotSummary doesn't have SnapshotSummary.ADDED_FILES_PROP, iterate through addedFiles + // iterator to find // addedFilesCount. - addedFilesCount = addedFilesCount == -1 ? Iterables.size(latestSnapshot.addedFiles()) : addedFilesCount; + addedFilesCount = + addedFilesCount == -1 ? Iterables.size(latestSnapshot.addedFiles()) : addedFilesCount; return new StreamingOffset(latestSnapshot.snapshotId(), addedFilesCount, false); } @Override public InputPartition[] planInputPartitions(Offset start, Offset end) { - Preconditions.checkArgument(end instanceof StreamingOffset, "Invalid end offset: %s is not a StreamingOffset", end); Preconditions.checkArgument( - start instanceof StreamingOffset, "Invalid start offset: %s is not a StreamingOffset", start); + end instanceof StreamingOffset, "Invalid end offset: %s is not a StreamingOffset", end); + Preconditions.checkArgument( + start instanceof StreamingOffset, + "Invalid start offset: %s is not a StreamingOffset", + start); if (end.equals(StreamingOffset.START_OFFSET)) { return new InputPartition[0]; @@ -135,19 +145,25 @@ public InputPartition[] planInputPartitions(Offset start, Offset end) { List fileScanTasks = planFiles(startOffset, endOffset); - CloseableIterable splitTasks = TableScanUtil.splitFiles( - CloseableIterable.withNoopClose(fileScanTasks), - splitSize); - List combinedScanTasks = Lists.newArrayList( - TableScanUtil.planTasks(splitTasks, splitSize, splitLookback, splitOpenFileCost)); + CloseableIterable splitTasks = + TableScanUtil.splitFiles(CloseableIterable.withNoopClose(fileScanTasks), splitSize); + List combinedScanTasks = + Lists.newArrayList( + TableScanUtil.planTasks(splitTasks, splitSize, splitLookback, splitOpenFileCost)); InputPartition[] readTasks = new InputPartition[combinedScanTasks.size()]; Tasks.range(readTasks.length) .stopOnFailure() .executeWith(localityPreferred ? 
ThreadPools.getWorkerPool() : null) - .run(index -> readTasks[index] = new ReadTask( - combinedScanTasks.get(index), tableBroadcast, expectedSchema, - caseSensitive, localityPreferred)); + .run( + index -> + readTasks[index] = + new ReadTask( + combinedScanTasks.get(index), + tableBroadcast, + expectedSchema, + caseSensitive, + localityPreferred)); return readTasks; } @@ -168,17 +184,17 @@ public Offset deserializeOffset(String json) { } @Override - public void commit(Offset end) { - } + public void commit(Offset end) {} @Override - public void stop() { - } + public void stop() {} private List planFiles(StreamingOffset startOffset, StreamingOffset endOffset) { List fileScanTasks = Lists.newArrayList(); - StreamingOffset batchStartOffset = StreamingOffset.START_OFFSET.equals(startOffset) ? - determineStartingOffset(table, fromTimestamp) : startOffset; + StreamingOffset batchStartOffset = + StreamingOffset.START_OFFSET.equals(startOffset) + ? determineStartingOffset(table, fromTimestamp) + : startOffset; StreamingOffset currentOffset = null; @@ -195,10 +211,12 @@ private List planFiles(StreamingOffset startOffset, StreamingOffse continue; } - MicroBatch latestMicroBatch = MicroBatches.from(table.snapshot(currentOffset.snapshotId()), table.io()) - .caseSensitive(caseSensitive) - .specsById(table.specs()) - .generate(currentOffset.position(), Long.MAX_VALUE, currentOffset.shouldScanAllFiles()); + MicroBatch latestMicroBatch = + MicroBatches.from(table.snapshot(currentOffset.snapshotId()), table.io()) + .caseSensitive(caseSensitive) + .specsById(table.specs()) + .generate( + currentOffset.position(), Long.MAX_VALUE, currentOffset.shouldScanAllFiles()); fileScanTasks.addAll(latestMicroBatch.tasks()); } while (currentOffset.snapshotId() != endOffset.snapshotId()); @@ -214,19 +232,24 @@ private boolean shouldProcess(Snapshot snapshot) { case DataOperations.REPLACE: return false; case DataOperations.DELETE: - Preconditions.checkState(skipDelete, + Preconditions.checkState( + skipDelete, "Cannot process delete snapshot: %s, to ignore deletes, set %s=true", - snapshot.snapshotId(), SparkReadOptions.STREAMING_SKIP_DELETE_SNAPSHOTS); + snapshot.snapshotId(), + SparkReadOptions.STREAMING_SKIP_DELETE_SNAPSHOTS); return false; case DataOperations.OVERWRITE: - Preconditions.checkState(skipOverwrite, + Preconditions.checkState( + skipOverwrite, "Cannot process overwrite snapshot: %s, to ignore overwrites, set %s=true", - snapshot.snapshotId(), SparkReadOptions.STREAMING_SKIP_OVERWRITE_SNAPSHOTS); + snapshot.snapshotId(), + SparkReadOptions.STREAMING_SKIP_OVERWRITE_SNAPSHOTS); return false; default: - throw new IllegalStateException(String.format( - "Cannot process unknown snapshot operation: %s (snapshot id %s)", - op.toLowerCase(Locale.ROOT), snapshot.snapshotId())); + throw new IllegalStateException( + String.format( + "Cannot process unknown snapshot operation: %s (snapshot id %s)", + op.toLowerCase(Locale.ROOT), snapshot.snapshotId())); } } @@ -287,7 +310,8 @@ public StreamingOffset initialOffset() { private void writeOffset(StreamingOffset offset, OutputFile file) { try (OutputStream outputStream = file.create()) { - BufferedWriter writer = new BufferedWriter(new OutputStreamWriter(outputStream, StandardCharsets.UTF_8)); + BufferedWriter writer = + new BufferedWriter(new OutputStreamWriter(outputStream, StandardCharsets.UTF_8)); writer.write(offset.json()); writer.flush(); } catch (IOException ioException) { diff --git 
a/spark/v3.3/spark/src/main/java/org/apache/iceberg/spark/source/SparkPartitionedFanoutWriter.java b/spark/v3.3/spark/src/main/java/org/apache/iceberg/spark/source/SparkPartitionedFanoutWriter.java index d38ae2f40316..f17cd260f928 100644 --- a/spark/v3.3/spark/src/main/java/org/apache/iceberg/spark/source/SparkPartitionedFanoutWriter.java +++ b/spark/v3.3/spark/src/main/java/org/apache/iceberg/spark/source/SparkPartitionedFanoutWriter.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.source; import org.apache.iceberg.FileFormat; @@ -34,10 +33,15 @@ public class SparkPartitionedFanoutWriter extends PartitionedFanoutWriter appenderFactory, - OutputFileFactory fileFactory, FileIO io, long targetFileSize, - Schema schema, StructType sparkSchema) { + OutputFileFactory fileFactory, + FileIO io, + long targetFileSize, + Schema schema, + StructType sparkSchema) { super(spec, format, appenderFactory, fileFactory, io, targetFileSize); this.partitionKey = new PartitionKey(spec, schema); this.internalRowWrapper = new InternalRowWrapper(sparkSchema); diff --git a/spark/v3.3/spark/src/main/java/org/apache/iceberg/spark/source/SparkPartitionedWriter.java b/spark/v3.3/spark/src/main/java/org/apache/iceberg/spark/source/SparkPartitionedWriter.java index f81a09926d85..a86091644360 100644 --- a/spark/v3.3/spark/src/main/java/org/apache/iceberg/spark/source/SparkPartitionedWriter.java +++ b/spark/v3.3/spark/src/main/java/org/apache/iceberg/spark/source/SparkPartitionedWriter.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.source; import org.apache.iceberg.FileFormat; @@ -34,10 +33,15 @@ public class SparkPartitionedWriter extends PartitionedWriter { private final PartitionKey partitionKey; private final InternalRowWrapper internalRowWrapper; - public SparkPartitionedWriter(PartitionSpec spec, FileFormat format, - FileAppenderFactory appenderFactory, - OutputFileFactory fileFactory, FileIO io, long targetFileSize, - Schema schema, StructType sparkSchema) { + public SparkPartitionedWriter( + PartitionSpec spec, + FileFormat format, + FileAppenderFactory appenderFactory, + OutputFileFactory fileFactory, + FileIO io, + long targetFileSize, + Schema schema, + StructType sparkSchema) { super(spec, format, appenderFactory, fileFactory, io, targetFileSize); this.partitionKey = new PartitionKey(spec, schema); this.internalRowWrapper = new InternalRowWrapper(sparkSchema); diff --git a/spark/v3.3/spark/src/main/java/org/apache/iceberg/spark/source/SparkPositionDeltaOperation.java b/spark/v3.3/spark/src/main/java/org/apache/iceberg/spark/source/SparkPositionDeltaOperation.java index 7bdb23cd49ae..72948dedb2bf 100644 --- a/spark/v3.3/spark/src/main/java/org/apache/iceberg/spark/source/SparkPositionDeltaOperation.java +++ b/spark/v3.3/spark/src/main/java/org/apache/iceberg/spark/source/SparkPositionDeltaOperation.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.spark.source; import org.apache.iceberg.IsolationLevel; @@ -48,8 +47,8 @@ class SparkPositionDeltaOperation implements RowLevelOperation, SupportsDelta { private Scan configuredScan; private DeltaWriteBuilder lazyWriteBuilder; - SparkPositionDeltaOperation(SparkSession spark, Table table, RowLevelOperationInfo info, - IsolationLevel isolationLevel) { + SparkPositionDeltaOperation( + SparkSession spark, Table table, RowLevelOperationInfo info, IsolationLevel isolationLevel) { this.spark = spark; this.table = table; this.command = info.command(); @@ -64,14 +63,15 @@ public Command command() { @Override public ScanBuilder newScanBuilder(CaseInsensitiveStringMap options) { if (lazyScanBuilder == null) { - this.lazyScanBuilder = new SparkScanBuilder(spark, table, options) { - @Override - public Scan build() { - Scan scan = super.buildMergeOnReadScan(); - SparkPositionDeltaOperation.this.configuredScan = scan; - return scan; - } - }; + this.lazyScanBuilder = + new SparkScanBuilder(spark, table, options) { + @Override + public Scan build() { + Scan scan = super.buildMergeOnReadScan(); + SparkPositionDeltaOperation.this.configuredScan = scan; + return scan; + } + }; } return lazyScanBuilder; @@ -80,12 +80,18 @@ public Scan build() { @Override public DeltaWriteBuilder newWriteBuilder(LogicalWriteInfo info) { if (lazyWriteBuilder == null) { - Preconditions.checkArgument(info instanceof ExtendedLogicalWriteInfo, "info must be ExtendedLogicalWriteInfo"); + Preconditions.checkArgument( + info instanceof ExtendedLogicalWriteInfo, "info must be ExtendedLogicalWriteInfo"); // don't validate the scan is not null as if the condition evaluates to false, // the optimizer replaces the original scan relation with a local relation - lazyWriteBuilder = new SparkPositionDeltaWriteBuilder( - spark, table, command, configuredScan, - isolationLevel, (ExtendedLogicalWriteInfo) info); + lazyWriteBuilder = + new SparkPositionDeltaWriteBuilder( + spark, + table, + command, + configuredScan, + isolationLevel, + (ExtendedLogicalWriteInfo) info); } return lazyWriteBuilder; @@ -95,13 +101,13 @@ public DeltaWriteBuilder newWriteBuilder(LogicalWriteInfo info) { public NamedReference[] requiredMetadataAttributes() { NamedReference specId = Expressions.column(MetadataColumns.SPEC_ID.name()); NamedReference partition = Expressions.column(MetadataColumns.PARTITION_COLUMN_NAME); - return new NamedReference[]{specId, partition}; + return new NamedReference[] {specId, partition}; } @Override public NamedReference[] rowId() { NamedReference file = Expressions.column(MetadataColumns.FILE_PATH.name()); NamedReference pos = Expressions.column(MetadataColumns.ROW_POSITION.name()); - return new NamedReference[]{file, pos}; + return new NamedReference[] {file, pos}; } } diff --git a/spark/v3.3/spark/src/main/java/org/apache/iceberg/spark/source/SparkPositionDeltaWrite.java b/spark/v3.3/spark/src/main/java/org/apache/iceberg/spark/source/SparkPositionDeltaWrite.java index d018ce230316..32d603d5a794 100644 --- a/spark/v3.3/spark/src/main/java/org/apache/iceberg/spark/source/SparkPositionDeltaWrite.java +++ b/spark/v3.3/spark/src/main/java/org/apache/iceberg/spark/source/SparkPositionDeltaWrite.java @@ -16,9 +16,13 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.spark.source; +import static org.apache.iceberg.IsolationLevel.SERIALIZABLE; +import static org.apache.spark.sql.connector.write.RowLevelOperation.Command.DELETE; +import static org.apache.spark.sql.connector.write.RowLevelOperation.Command.MERGE; +import static org.apache.spark.sql.connector.write.RowLevelOperation.Command.UPDATE; + import java.io.IOException; import java.io.Serializable; import java.util.Arrays; @@ -80,11 +84,6 @@ import org.slf4j.Logger; import org.slf4j.LoggerFactory; -import static org.apache.iceberg.IsolationLevel.SERIALIZABLE; -import static org.apache.spark.sql.connector.write.RowLevelOperation.Command.DELETE; -import static org.apache.spark.sql.connector.write.RowLevelOperation.Command.MERGE; -import static org.apache.spark.sql.connector.write.RowLevelOperation.Command.UPDATE; - class SparkPositionDeltaWrite implements DeltaWrite, RequiresDistributionAndOrdering { private static final Logger LOG = LoggerFactory.getLogger(SparkPositionDeltaWrite.class); @@ -104,10 +103,17 @@ class SparkPositionDeltaWrite implements DeltaWrite, RequiresDistributionAndOrde private boolean cleanupOnAbort = true; - SparkPositionDeltaWrite(SparkSession spark, Table table, Command command, SparkBatchQueryScan scan, - IsolationLevel isolationLevel, SparkWriteConf writeConf, - ExtendedLogicalWriteInfo info, Schema dataSchema, - Distribution requiredDistribution, SortOrder[] requiredOrdering) { + SparkPositionDeltaWrite( + SparkSession spark, + Table table, + Command command, + SparkBatchQueryScan scan, + IsolationLevel isolationLevel, + SparkWriteConf writeConf, + ExtendedLogicalWriteInfo info, + Schema dataSchema, + Distribution requiredDistribution, + SortOrder[] requiredOrdering) { this.sparkContext = JavaSparkContext.fromSparkContext(spark.sparkContext()); this.table = table; this.command = command; @@ -150,7 +156,8 @@ private class PositionDeltaBatchWrite implements DeltaBatchWrite { @Override public DeltaWriterFactory createBatchWriterFactory(PhysicalWriteInfo info) { // broadcast the table metadata as the writer factory will be sent to executors - Broadcast
<Table> tableBroadcast = sparkContext.broadcast(SerializableTableWithSize.copyOf(table)); + Broadcast<Table>
    tableBroadcast = + sparkContext.broadcast(SerializableTableWithSize.copyOf(table)); return new PositionDeltaWriteFactory(tableBroadcast, command, context); } @@ -179,8 +186,10 @@ public void commit(WriterCommitMessage[] messages) { referencedDataFiles.addAll(Arrays.asList(taskCommit.referencedDataFiles())); } - // the scan may be null if the optimizer replaces it with an empty relation (e.g. the cond is false) - // no validation is needed in this case as the command does not depend on the scanned table state + // the scan may be null if the optimizer replaces it with an empty relation (e.g. the cond is + // false) + // no validation is needed in this case as the command does not depend on the scanned table + // state if (scan != null) { Expression conflictDetectionFilter = conflictDetectionFilter(scan); rowDelta.conflictDetectionFilter(conflictDetectionFilter); @@ -202,16 +211,22 @@ public void commit(WriterCommitMessage[] messages) { rowDelta.validateNoConflictingDataFiles(); } - String commitMsg = String.format( - "position delta with %d data files and %d delete files " + - "(scanSnapshotId: %d, conflictDetectionFilter: %s, isolationLevel: %s)", - addedDataFilesCount, addedDeleteFilesCount, scan.snapshotId(), conflictDetectionFilter, isolationLevel); + String commitMsg = + String.format( + "position delta with %d data files and %d delete files " + + "(scanSnapshotId: %d, conflictDetectionFilter: %s, isolationLevel: %s)", + addedDataFilesCount, + addedDeleteFilesCount, + scan.snapshotId(), + conflictDetectionFilter, + isolationLevel); commitOperation(rowDelta, commitMsg); } else { - String commitMsg = String.format( - "position delta with %d data files and %d delete files (no validation required)", - addedDataFilesCount, addedDeleteFilesCount); + String commitMsg = + String.format( + "position delta with %d data files and %d delete files (no validation required)", + addedDataFilesCount, addedDeleteFilesCount); commitOperation(rowDelta, commitMsg); } } @@ -317,29 +332,34 @@ private static class PositionDeltaWriteFactory implements DeltaWriterFactory { public DeltaWriter createWriter(int partitionId, long taskId) { Table table = tableBroadcast.value(); - OutputFileFactory dataFileFactory = OutputFileFactory.builderFor(table, partitionId, taskId) - .format(context.dataFileFormat()) - .build(); - OutputFileFactory deleteFileFactory = OutputFileFactory.builderFor(table, partitionId, taskId) - .format(context.deleteFileFormat()) - .build(); - - SparkFileWriterFactory writerFactory = SparkFileWriterFactory.builderFor(table) - .dataFileFormat(context.dataFileFormat()) - .dataSchema(context.dataSchema()) - .dataSparkType(context.dataSparkType()) - .deleteFileFormat(context.deleteFileFormat()) - .positionDeleteSparkType(context.deleteSparkType()) - .build(); + OutputFileFactory dataFileFactory = + OutputFileFactory.builderFor(table, partitionId, taskId) + .format(context.dataFileFormat()) + .build(); + OutputFileFactory deleteFileFactory = + OutputFileFactory.builderFor(table, partitionId, taskId) + .format(context.deleteFileFormat()) + .build(); + + SparkFileWriterFactory writerFactory = + SparkFileWriterFactory.builderFor(table) + .dataFileFormat(context.dataFileFormat()) + .dataSchema(context.dataSchema()) + .dataSparkType(context.dataSparkType()) + .deleteFileFormat(context.deleteFileFormat()) + .positionDeleteSparkType(context.deleteSparkType()) + .build(); if (command == DELETE) { return new DeleteOnlyDeltaWriter(table, writerFactory, deleteFileFactory, context); } else if 
(table.spec().isUnpartitioned()) { - return new UnpartitionedDeltaWriter(table, writerFactory, dataFileFactory, deleteFileFactory, context); + return new UnpartitionedDeltaWriter( + table, writerFactory, dataFileFactory, deleteFileFactory, context); } else { - return new PartitionedDeltaWriter(table, writerFactory, dataFileFactory, deleteFileFactory, context); + return new PartitionedDeltaWriter( + table, writerFactory, dataFileFactory, deleteFileFactory, context); } } } @@ -351,12 +371,13 @@ protected InternalRowWrapper initPartitionRowWrapper(Types.StructType partitionT return new InternalRowWrapper(sparkPartitionType); } - protected Map buildPartitionProjections(Types.StructType partitionType, - Map specs) { + protected Map buildPartitionProjections( + Types.StructType partitionType, Map specs) { Map partitionProjections = Maps.newHashMap(); - specs.forEach((specID, spec) -> - partitionProjections.put(specID, StructProjection.create(partitionType, spec.partitionType())) - ); + specs.forEach( + (specID, spec) -> + partitionProjections.put( + specID, StructProjection.create(partitionType, spec.partitionType()))); return partitionProjections; } } @@ -375,11 +396,15 @@ private static class DeleteOnlyDeltaWriter extends BaseDeltaWriter { private boolean closed = false; - DeleteOnlyDeltaWriter(Table table, SparkFileWriterFactory writerFactory, - OutputFileFactory deleteFileFactory, Context context) { + DeleteOnlyDeltaWriter( + Table table, + SparkFileWriterFactory writerFactory, + OutputFileFactory deleteFileFactory, + Context context) { - this.delegate = new ClusteredPositionDeleteWriter<>( - writerFactory, deleteFileFactory, table.io(), context.targetDeleteFileSize()); + this.delegate = + new ClusteredPositionDeleteWriter<>( + writerFactory, deleteFileFactory, table.io(), context.targetDeleteFileSize()); this.positionDelete = PositionDelete.create(); this.io = table.io(); this.specs = table.specs(); @@ -389,9 +414,11 @@ private static class DeleteOnlyDeltaWriter extends BaseDeltaWriter { this.partitionProjections = buildPartitionProjections(partitionType, specs); this.specIdOrdinal = context.metadataSparkType().fieldIndex(MetadataColumns.SPEC_ID.name()); - this.partitionOrdinal = context.metadataSparkType().fieldIndex(MetadataColumns.PARTITION_COLUMN_NAME); + this.partitionOrdinal = + context.metadataSparkType().fieldIndex(MetadataColumns.PARTITION_COLUMN_NAME); this.fileOrdinal = context.deleteSparkType().fieldIndex(MetadataColumns.FILE_PATH.name()); - this.positionOrdinal = context.deleteSparkType().fieldIndex(MetadataColumns.ROW_POSITION.name()); + this.positionOrdinal = + context.deleteSparkType().fieldIndex(MetadataColumns.ROW_POSITION.name()); } @Override @@ -411,12 +438,14 @@ public void delete(InternalRow metadata, InternalRow id) throws IOException { @Override public void update(InternalRow metadata, InternalRow id, InternalRow row) { - throw new UnsupportedOperationException(this.getClass().getName() + " does not implement update"); + throw new UnsupportedOperationException( + this.getClass().getName() + " does not implement update"); } @Override public void insert(InternalRow row) throws IOException { - throw new UnsupportedOperationException(this.getClass().getName() + " does not implement insert"); + throw new UnsupportedOperationException( + this.getClass().getName() + " does not implement insert"); } @Override @@ -458,13 +487,17 @@ private abstract static class DeleteAndDataDeltaWriter extends BaseDeltaWriter { private boolean closed = false; - DeleteAndDataDeltaWriter(Table 
table, SparkFileWriterFactory writerFactory, - OutputFileFactory dataFileFactory, OutputFileFactory deleteFileFactory, - Context context) { - this.delegate = new BasePositionDeltaWriter<>( - newInsertWriter(table, writerFactory, dataFileFactory, context), - newUpdateWriter(table, writerFactory, dataFileFactory, context), - newDeleteWriter(table, writerFactory, deleteFileFactory, context)); + DeleteAndDataDeltaWriter( + Table table, + SparkFileWriterFactory writerFactory, + OutputFileFactory dataFileFactory, + OutputFileFactory deleteFileFactory, + Context context) { + this.delegate = + new BasePositionDeltaWriter<>( + newInsertWriter(table, writerFactory, dataFileFactory, context), + newUpdateWriter(table, writerFactory, dataFileFactory, context), + newDeleteWriter(table, writerFactory, deleteFileFactory, context)); this.io = table.io(); this.specs = table.specs(); @@ -473,9 +506,11 @@ private abstract static class DeleteAndDataDeltaWriter extends BaseDeltaWriter { this.deletePartitionProjections = buildPartitionProjections(partitionType, specs); this.specIdOrdinal = context.metadataSparkType().fieldIndex(MetadataColumns.SPEC_ID.name()); - this.partitionOrdinal = context.metadataSparkType().fieldIndex(MetadataColumns.PARTITION_COLUMN_NAME); + this.partitionOrdinal = + context.metadataSparkType().fieldIndex(MetadataColumns.PARTITION_COLUMN_NAME); this.fileOrdinal = context.deleteSparkType().fieldIndex(MetadataColumns.FILE_PATH.name()); - this.positionOrdinal = context.deleteSparkType().fieldIndex(MetadataColumns.ROW_POSITION.name()); + this.positionOrdinal = + context.deleteSparkType().fieldIndex(MetadataColumns.ROW_POSITION.name()); } @Override @@ -517,10 +552,11 @@ public void close() throws IOException { } } - private PartitioningWriter newInsertWriter(Table table, - SparkFileWriterFactory writerFactory, - OutputFileFactory fileFactory, - Context context) { + private PartitioningWriter newInsertWriter( + Table table, + SparkFileWriterFactory writerFactory, + OutputFileFactory fileFactory, + Context context) { long targetFileSize = context.targetDataFileSize(); if (table.spec().isPartitioned() && context.fanoutWriterEnabled()) { @@ -530,10 +566,11 @@ private PartitioningWriter newInsertWriter(Table t } } - private PartitioningWriter newUpdateWriter(Table table, - SparkFileWriterFactory writerFactory, - OutputFileFactory fileFactory, - Context context) { + private PartitioningWriter newUpdateWriter( + Table table, + SparkFileWriterFactory writerFactory, + OutputFileFactory fileFactory, + Context context) { long targetFileSize = context.targetDataFileSize(); if (table.spec().isPartitioned()) { @@ -544,21 +581,26 @@ private PartitioningWriter newUpdateWriter(Table t } } - private ClusteredPositionDeleteWriter newDeleteWriter(Table table, - SparkFileWriterFactory writerFactory, - OutputFileFactory fileFactory, - Context context) { + private ClusteredPositionDeleteWriter newDeleteWriter( + Table table, + SparkFileWriterFactory writerFactory, + OutputFileFactory fileFactory, + Context context) { long targetFileSize = context.targetDeleteFileSize(); - return new ClusteredPositionDeleteWriter<>(writerFactory, fileFactory, table.io(), targetFileSize); + return new ClusteredPositionDeleteWriter<>( + writerFactory, fileFactory, table.io(), targetFileSize); } } private static class UnpartitionedDeltaWriter extends DeleteAndDataDeltaWriter { private final PartitionSpec dataSpec; - UnpartitionedDeltaWriter(Table table, SparkFileWriterFactory writerFactory, - OutputFileFactory dataFileFactory, 
OutputFileFactory deleteFileFactory, - Context context) { + UnpartitionedDeltaWriter( + Table table, + SparkFileWriterFactory writerFactory, + OutputFileFactory dataFileFactory, + OutputFileFactory deleteFileFactory, + Context context) { super(table, writerFactory, dataFileFactory, deleteFileFactory, context); this.dataSpec = table.spec(); } @@ -580,9 +622,12 @@ private static class PartitionedDeltaWriter extends DeleteAndDataDeltaWriter { private final PartitionKey dataPartitionKey; private final InternalRowWrapper internalRowDataWrapper; - PartitionedDeltaWriter(Table table, SparkFileWriterFactory writerFactory, - OutputFileFactory dataFileFactory, OutputFileFactory deleteFileFactory, - Context context) { + PartitionedDeltaWriter( + Table table, + SparkFileWriterFactory writerFactory, + OutputFileFactory dataFileFactory, + OutputFileFactory deleteFileFactory, + Context context) { super(table, writerFactory, dataFileFactory, deleteFileFactory, context); this.dataSpec = table.spec(); diff --git a/spark/v3.3/spark/src/main/java/org/apache/iceberg/spark/source/SparkPositionDeltaWriteBuilder.java b/spark/v3.3/spark/src/main/java/org/apache/iceberg/spark/source/SparkPositionDeltaWriteBuilder.java index 167e0853f055..ebac7e2515cc 100644 --- a/spark/v3.3/spark/src/main/java/org/apache/iceberg/spark/source/SparkPositionDeltaWriteBuilder.java +++ b/spark/v3.3/spark/src/main/java/org/apache/iceberg/spark/source/SparkPositionDeltaWriteBuilder.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.source; import org.apache.iceberg.DistributionMode; @@ -43,10 +42,8 @@ class SparkPositionDeltaWriteBuilder implements DeltaWriteBuilder { - private static final Schema EXPECTED_ROW_ID_SCHEMA = new Schema( - MetadataColumns.FILE_PATH, - MetadataColumns.ROW_POSITION - ); + private static final Schema EXPECTED_ROW_ID_SCHEMA = + new Schema(MetadataColumns.FILE_PATH, MetadataColumns.ROW_POSITION); private final SparkSession spark; private final Table table; @@ -59,8 +56,13 @@ class SparkPositionDeltaWriteBuilder implements DeltaWriteBuilder { private final boolean checkNullability; private final boolean checkOrdering; - SparkPositionDeltaWriteBuilder(SparkSession spark, Table table, Command command, Scan scan, - IsolationLevel isolationLevel, ExtendedLogicalWriteInfo info) { + SparkPositionDeltaWriteBuilder( + SparkSession spark, + Table table, + Command command, + Scan scan, + IsolationLevel isolationLevel, + ExtendedLogicalWriteInfo info) { this.spark = spark; this.table = table; this.command = command; @@ -75,7 +77,8 @@ class SparkPositionDeltaWriteBuilder implements DeltaWriteBuilder { @Override public DeltaWrite build() { - Preconditions.checkArgument(handleTimestampWithoutZone || !SparkUtil.hasTimestampWithoutZone(table.schema()), + Preconditions.checkArgument( + handleTimestampWithoutZone || !SparkUtil.hasTimestampWithoutZone(table.schema()), SparkUtil.TIMESTAMP_WITHOUT_TIMEZONE_ERROR); Schema dataSchema = dataSchema(); @@ -84,23 +87,35 @@ public DeltaWrite build() { } Schema rowIdSchema = SparkSchemaUtil.convert(EXPECTED_ROW_ID_SCHEMA, info.rowIdSchema()); - TypeUtil.validateSchema("row ID", EXPECTED_ROW_ID_SCHEMA, rowIdSchema, checkNullability, checkOrdering); + TypeUtil.validateSchema( + "row ID", EXPECTED_ROW_ID_SCHEMA, rowIdSchema, checkNullability, checkOrdering); - NestedField partition = MetadataColumns.metadataColumn(table, MetadataColumns.PARTITION_COLUMN_NAME); + NestedField partition = + 
MetadataColumns.metadataColumn(table, MetadataColumns.PARTITION_COLUMN_NAME); Schema expectedMetadataSchema = new Schema(MetadataColumns.SPEC_ID, partition); Schema metadataSchema = SparkSchemaUtil.convert(expectedMetadataSchema, info.metadataSchema()); - TypeUtil.validateSchema("metadata", expectedMetadataSchema, metadataSchema, checkNullability, checkOrdering); + TypeUtil.validateSchema( + "metadata", expectedMetadataSchema, metadataSchema, checkNullability, checkOrdering); SparkUtil.validatePartitionTransforms(table.spec()); - Distribution distribution = SparkDistributionAndOrderingUtil.buildPositionDeltaDistribution( - table, command, distributionMode()); - SortOrder[] ordering = SparkDistributionAndOrderingUtil.buildPositionDeltaOrdering( - table, command); + Distribution distribution = + SparkDistributionAndOrderingUtil.buildPositionDeltaDistribution( + table, command, distributionMode()); + SortOrder[] ordering = + SparkDistributionAndOrderingUtil.buildPositionDeltaOrdering(table, command); return new SparkPositionDeltaWrite( - spark, table, command, scan, isolationLevel, writeConf, - info, dataSchema, distribution, ordering); + spark, + table, + command, + scan, + isolationLevel, + writeConf, + info, + dataSchema, + distribution, + ordering); } private Schema dataSchema() { diff --git a/spark/v3.3/spark/src/main/java/org/apache/iceberg/spark/source/SparkRowLevelOperationBuilder.java b/spark/v3.3/spark/src/main/java/org/apache/iceberg/spark/source/SparkRowLevelOperationBuilder.java index 3cdb585f9745..0673d647703c 100644 --- a/spark/v3.3/spark/src/main/java/org/apache/iceberg/spark/source/SparkRowLevelOperationBuilder.java +++ b/spark/v3.3/spark/src/main/java/org/apache/iceberg/spark/source/SparkRowLevelOperationBuilder.java @@ -16,19 +16,8 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.spark.source; -import java.util.Map; -import org.apache.iceberg.IsolationLevel; -import org.apache.iceberg.RowLevelOperationMode; -import org.apache.iceberg.Table; -import org.apache.spark.sql.SparkSession; -import org.apache.spark.sql.connector.write.RowLevelOperation; -import org.apache.spark.sql.connector.write.RowLevelOperation.Command; -import org.apache.spark.sql.connector.write.RowLevelOperationBuilder; -import org.apache.spark.sql.connector.write.RowLevelOperationInfo; - import static org.apache.iceberg.TableProperties.DELETE_ISOLATION_LEVEL; import static org.apache.iceberg.TableProperties.DELETE_ISOLATION_LEVEL_DEFAULT; import static org.apache.iceberg.TableProperties.DELETE_MODE; @@ -42,6 +31,16 @@ import static org.apache.iceberg.TableProperties.UPDATE_MODE; import static org.apache.iceberg.TableProperties.UPDATE_MODE_DEFAULT; +import java.util.Map; +import org.apache.iceberg.IsolationLevel; +import org.apache.iceberg.RowLevelOperationMode; +import org.apache.iceberg.Table; +import org.apache.spark.sql.SparkSession; +import org.apache.spark.sql.connector.write.RowLevelOperation; +import org.apache.spark.sql.connector.write.RowLevelOperation.Command; +import org.apache.spark.sql.connector.write.RowLevelOperationBuilder; +import org.apache.spark.sql.connector.write.RowLevelOperationInfo; + class SparkRowLevelOperationBuilder implements RowLevelOperationBuilder { private final SparkSession spark; diff --git a/spark/v3.3/spark/src/main/java/org/apache/iceberg/spark/source/SparkScan.java b/spark/v3.3/spark/src/main/java/org/apache/iceberg/spark/source/SparkScan.java index b9292541eaeb..f26daa55b2b3 100644 --- a/spark/v3.3/spark/src/main/java/org/apache/iceberg/spark/source/SparkScan.java +++ b/spark/v3.3/spark/src/main/java/org/apache/iceberg/spark/source/SparkScan.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.spark.source; import java.io.Serializable; @@ -69,8 +68,12 @@ abstract class SparkScan extends SparkBatch implements Scan, SupportsReportStati // lazy variables private StructType readSchema; - SparkScan(SparkSession spark, Table table, SparkReadConf readConf, - Schema expectedSchema, List filters) { + SparkScan( + SparkSession spark, + Table table, + SparkReadConf readConf, + Schema expectedSchema, + List filters) { super(spark, table, readConf, expectedSchema); SparkSchemaUtil.validateMetadataColumnReferences(table.schema(), expectedSchema); @@ -106,14 +109,16 @@ public Batch toBatch() { @Override public MicroBatchStream toMicroBatchStream(String checkpointLocation) { - return new SparkMicroBatchStream(sparkContext(), table, readConf, expectedSchema, checkpointLocation); + return new SparkMicroBatchStream( + sparkContext(), table, readConf, expectedSchema, checkpointLocation); } @Override public StructType readSchema() { if (readSchema == null) { - Preconditions.checkArgument(readTimestampWithoutZone || !SparkUtil.hasTimestampWithoutZone(expectedSchema), - SparkUtil.TIMESTAMP_WITHOUT_TIMEZONE_ERROR); + Preconditions.checkArgument( + readTimestampWithoutZone || !SparkUtil.hasTimestampWithoutZone(expectedSchema), + SparkUtil.TIMESTAMP_WITHOUT_TIMEZONE_ERROR); this.readSchema = SparkSchemaUtil.convert(expectedSchema); } return readSchema; @@ -130,14 +135,14 @@ protected Statistics estimateStatistics(Snapshot snapshot) { return new Stats(0L, 0L); } - // estimate stats using snapshot summary only for partitioned tables (metadata tables are unpartitioned) + // estimate stats using snapshot summary only for partitioned tables (metadata tables are + // unpartitioned) if (!table.spec().isUnpartitioned() && filterExpressions.isEmpty()) { LOG.debug("using table metadata to estimate table statistics"); - long totalRecords = PropertyUtil.propertyAsLong(snapshot.summary(), - SnapshotSummary.TOTAL_RECORDS_PROP, Long.MAX_VALUE); - return new Stats( - SparkSchemaUtil.estimateSize(readSchema(), totalRecords), - totalRecords); + long totalRecords = + PropertyUtil.propertyAsLong( + snapshot.summary(), SnapshotSummary.TOTAL_RECORDS_PROP, Long.MAX_VALUE); + return new Stats(SparkSchemaUtil.estimateSize(readSchema(), totalRecords), totalRecords); } long numRows = 0L; @@ -156,7 +161,8 @@ protected Statistics estimateStatistics(Snapshot snapshot) { @Override public String description() { - String filters = filterExpressions.stream().map(Spark3Util::describe).collect(Collectors.joining(", ")); + String filters = + filterExpressions.stream().map(Spark3Util::describe).collect(Collectors.joining(", ")); return String.format("%s [filters=%s]", table, filters); } @@ -197,7 +203,8 @@ private static class RowReader extends RowDataReader implements PartitionReader< } } - private static class BatchReader extends BatchDataReader implements PartitionReader { + private static class BatchReader extends BatchDataReader + implements PartitionReader { BatchReader(ReadTask task, int batchSize) { super(task.task, task.table(), task.expectedSchema(), task.isCaseSensitive(), batchSize); } @@ -212,8 +219,12 @@ static class ReadTask implements InputPartition, Serializable { private transient Schema expectedSchema = null; private transient String[] preferredLocations = null; - ReadTask(CombinedScanTask task, Broadcast
<Table> tableBroadcast, String expectedSchemaString, - boolean caseSensitive, boolean localityPreferred) { + ReadTask( + CombinedScanTask task, + Broadcast<Table>
    tableBroadcast, + String expectedSchemaString, + boolean caseSensitive, + boolean localityPreferred) { this.task = task; this.tableBroadcast = tableBroadcast; this.expectedSchemaString = expectedSchemaString; diff --git a/spark/v3.3/spark/src/main/java/org/apache/iceberg/spark/source/SparkScanBuilder.java b/spark/v3.3/spark/src/main/java/org/apache/iceberg/spark/source/SparkScanBuilder.java index 60d2c2150bb1..21c34ed6f628 100644 --- a/spark/v3.3/spark/src/main/java/org/apache/iceberg/spark/source/SparkScanBuilder.java +++ b/spark/v3.3/spark/src/main/java/org/apache/iceberg/spark/source/SparkScanBuilder.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.source; import java.util.List; @@ -55,8 +54,11 @@ import org.slf4j.Logger; import org.slf4j.LoggerFactory; -public class SparkScanBuilder implements ScanBuilder, SupportsPushDownFilters, SupportsPushDownRequiredColumns, - SupportsReportStatistics { +public class SparkScanBuilder + implements ScanBuilder, + SupportsPushDownFilters, + SupportsPushDownRequiredColumns, + SupportsReportStatistics { private static final Logger LOG = LoggerFactory.getLogger(SparkScanBuilder.class); private static final Filter[] NO_FILTERS = new Filter[0]; @@ -72,7 +74,8 @@ public class SparkScanBuilder implements ScanBuilder, SupportsPushDownFilters, S private List filterExpressions = null; private Filter[] pushedFilters = NO_FILTERS; - SparkScanBuilder(SparkSession spark, Table table, Schema schema, CaseInsensitiveStringMap options) { + SparkScanBuilder( + SparkSession spark, Table table, Schema schema, CaseInsensitiveStringMap options) { this.spark = spark; this.table = table; this.schema = schema; @@ -108,8 +111,10 @@ public Filter[] pushFilters(Filter[] filters) { expr = SparkFilters.convert(filter); } catch (IllegalArgumentException e) { // converting to Iceberg Expression failed, so this expression cannot be pushed down - LOG.info("Failed to convert filter to Iceberg expression, skipping push down for this expression: {}. {}", - filter, e.getMessage()); + LOG.info( + "Failed to convert filter to Iceberg expression, skipping push down for this expression: {}. {}", + filter, + e.getMessage()); } if (expr != null) { @@ -119,8 +124,10 @@ public Filter[] pushFilters(Filter[] filters) { pushed.add(filter); } catch (ValidationException e) { // binding to the table schema failed, so this expression cannot be pushed down - LOG.info("Failed to bind expression to table schema, skipping push down for this expression: {}. {}", - filter, e.getMessage()); + LOG.info( + "Failed to bind expression to table schema, skipping push down for this expression: {}. 
{}", + filter, + e.getMessage()); } } } @@ -140,12 +147,16 @@ public Filter[] pushedFilters() { @Override public void pruneColumns(StructType requestedSchema) { - StructType requestedProjection = new StructType(Stream.of(requestedSchema.fields()) - .filter(field -> MetadataColumns.nonMetadataColumn(field.name())) - .toArray(StructField[]::new)); + StructType requestedProjection = + new StructType( + Stream.of(requestedSchema.fields()) + .filter(field -> MetadataColumns.nonMetadataColumn(field.name())) + .toArray(StructField[]::new)); - // the projection should include all columns that will be returned, including those only used in filters - this.schema = SparkSchemaUtil.prune(schema, requestedProjection, filterExpression(), caseSensitive); + // the projection should include all columns that will be returned, including those only used in + // filters + this.schema = + SparkSchemaUtil.prune(schema, requestedProjection, filterExpression(), caseSensitive); Stream.of(requestedSchema.fields()) .map(StructField::name) @@ -156,10 +167,11 @@ public void pruneColumns(StructType requestedSchema) { private Schema schemaWithMetadataColumns() { // metadata columns - List fields = metaColumns.stream() - .distinct() - .map(name -> MetadataColumns.metadataColumn(table, name)) - .collect(Collectors.toList()); + List fields = + metaColumns.stream() + .distinct() + .map(name -> MetadataColumns.metadataColumn(table, name)) + .collect(Collectors.toList()); Schema meta = new Schema(fields); // schema or rows returned by readers @@ -171,30 +183,39 @@ public Scan build() { Long snapshotId = readConf.snapshotId(); Long asOfTimestamp = readConf.asOfTimestamp(); - Preconditions.checkArgument(snapshotId == null || asOfTimestamp == null, + Preconditions.checkArgument( + snapshotId == null || asOfTimestamp == null, "Cannot set both %s and %s to select which table snapshot to scan", - SparkReadOptions.SNAPSHOT_ID, SparkReadOptions.AS_OF_TIMESTAMP); + SparkReadOptions.SNAPSHOT_ID, + SparkReadOptions.AS_OF_TIMESTAMP); Long startSnapshotId = readConf.startSnapshotId(); Long endSnapshotId = readConf.endSnapshotId(); if (snapshotId != null || asOfTimestamp != null) { - Preconditions.checkArgument(startSnapshotId == null && endSnapshotId == null, + Preconditions.checkArgument( + startSnapshotId == null && endSnapshotId == null, "Cannot set %s and %s for incremental scans when either %s or %s is set", - SparkReadOptions.START_SNAPSHOT_ID, SparkReadOptions.END_SNAPSHOT_ID, - SparkReadOptions.SNAPSHOT_ID, SparkReadOptions.AS_OF_TIMESTAMP); + SparkReadOptions.START_SNAPSHOT_ID, + SparkReadOptions.END_SNAPSHOT_ID, + SparkReadOptions.SNAPSHOT_ID, + SparkReadOptions.AS_OF_TIMESTAMP); } - Preconditions.checkArgument(startSnapshotId != null || endSnapshotId == null, + Preconditions.checkArgument( + startSnapshotId != null || endSnapshotId == null, "Cannot set only %s for incremental scans. 
Please, set %s too.", - SparkReadOptions.END_SNAPSHOT_ID, SparkReadOptions.START_SNAPSHOT_ID); + SparkReadOptions.END_SNAPSHOT_ID, + SparkReadOptions.START_SNAPSHOT_ID); Schema expectedSchema = schemaWithMetadataColumns(); - TableScan scan = table.newScan() - .caseSensitive(caseSensitive) - .filter(filterExpression()) - .project(expectedSchema); + TableScan scan = + table + .newScan() + .caseSensitive(caseSensitive) + .filter(filterExpression()) + .project(expectedSchema); if (snapshotId != null) { scan = scan.useSnapshot(snapshotId); @@ -218,61 +239,71 @@ public Scan build() { } public Scan buildMergeOnReadScan() { - Preconditions.checkArgument(readConf.snapshotId() == null && readConf.asOfTimestamp() == null, + Preconditions.checkArgument( + readConf.snapshotId() == null && readConf.asOfTimestamp() == null, "Cannot set time travel options %s and %s for row-level command scans", - SparkReadOptions.SNAPSHOT_ID, SparkReadOptions.AS_OF_TIMESTAMP); + SparkReadOptions.SNAPSHOT_ID, + SparkReadOptions.AS_OF_TIMESTAMP); - Preconditions.checkArgument(readConf.startSnapshotId() == null && readConf.endSnapshotId() == null, + Preconditions.checkArgument( + readConf.startSnapshotId() == null && readConf.endSnapshotId() == null, "Cannot set incremental scan options %s and %s for row-level command scans", - SparkReadOptions.START_SNAPSHOT_ID, SparkReadOptions.END_SNAPSHOT_ID); + SparkReadOptions.START_SNAPSHOT_ID, + SparkReadOptions.END_SNAPSHOT_ID); Snapshot snapshot = table.currentSnapshot(); if (snapshot == null) { - return new SparkBatchQueryScan(spark, table, null, readConf, schemaWithMetadataColumns(), filterExpressions); + return new SparkBatchQueryScan( + spark, table, null, readConf, schemaWithMetadataColumns(), filterExpressions); } // remember the current snapshot ID for commit validation long snapshotId = snapshot.snapshotId(); - CaseInsensitiveStringMap adjustedOptions = Spark3Util.setOption( - SparkReadOptions.SNAPSHOT_ID, - Long.toString(snapshotId), - options); + CaseInsensitiveStringMap adjustedOptions = + Spark3Util.setOption(SparkReadOptions.SNAPSHOT_ID, Long.toString(snapshotId), options); SparkReadConf adjustedReadConf = new SparkReadConf(spark, table, adjustedOptions); Schema expectedSchema = schemaWithMetadataColumns(); - TableScan scan = table.newScan() - .useSnapshot(snapshotId) - .caseSensitive(caseSensitive) - .filter(filterExpression()) - .project(expectedSchema); + TableScan scan = + table + .newScan() + .useSnapshot(snapshotId) + .caseSensitive(caseSensitive) + .filter(filterExpression()) + .project(expectedSchema); scan = configureSplitPlanning(scan); - return new SparkBatchQueryScan(spark, table, scan, adjustedReadConf, expectedSchema, filterExpressions); + return new SparkBatchQueryScan( + spark, table, scan, adjustedReadConf, expectedSchema, filterExpressions); } public Scan buildCopyOnWriteScan() { Snapshot snapshot = table.currentSnapshot(); if (snapshot == null) { - return new SparkCopyOnWriteScan(spark, table, readConf, schemaWithMetadataColumns(), filterExpressions); + return new SparkCopyOnWriteScan( + spark, table, readConf, schemaWithMetadataColumns(), filterExpressions); } Schema expectedSchema = schemaWithMetadataColumns(); - TableScan scan = table.newScan() - .useSnapshot(snapshot.snapshotId()) - .ignoreResiduals() - .caseSensitive(caseSensitive) - .filter(filterExpression()) - .project(expectedSchema); + TableScan scan = + table + .newScan() + .useSnapshot(snapshot.snapshotId()) + .ignoreResiduals() + .caseSensitive(caseSensitive) + 
.filter(filterExpression()) + .project(expectedSchema); scan = configureSplitPlanning(scan); - return new SparkCopyOnWriteScan(spark, table, scan, snapshot, readConf, expectedSchema, filterExpressions); + return new SparkCopyOnWriteScan( + spark, table, scan, snapshot, readConf, expectedSchema, filterExpressions); } private TableScan configureSplitPlanning(TableScan scan) { @@ -285,12 +316,15 @@ private TableScan configureSplitPlanning(TableScan scan) { Integer splitLookback = readConf.splitLookbackOption(); if (splitLookback != null) { - configuredScan = configuredScan.option(TableProperties.SPLIT_LOOKBACK, String.valueOf(splitLookback)); + configuredScan = + configuredScan.option(TableProperties.SPLIT_LOOKBACK, String.valueOf(splitLookback)); } Long splitOpenFileCost = readConf.splitOpenFileCostOption(); if (splitOpenFileCost != null) { - configuredScan = configuredScan.option(TableProperties.SPLIT_OPEN_FILE_COST, String.valueOf(splitOpenFileCost)); + configuredScan = + configuredScan.option( + TableProperties.SPLIT_OPEN_FILE_COST, String.valueOf(splitOpenFileCost)); } return configuredScan; diff --git a/spark/v3.3/spark/src/main/java/org/apache/iceberg/spark/source/SparkTable.java b/spark/v3.3/spark/src/main/java/org/apache/iceberg/spark/source/SparkTable.java index 590f9336966b..662d85eaa7dc 100644 --- a/spark/v3.3/spark/src/main/java/org/apache/iceberg/spark/source/SparkTable.java +++ b/spark/v3.3/spark/src/main/java/org/apache/iceberg/spark/source/SparkTable.java @@ -16,9 +16,11 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.source; +import static org.apache.iceberg.TableProperties.CURRENT_SNAPSHOT_ID; +import static org.apache.iceberg.TableProperties.FORMAT_VERSION; + import java.io.IOException; import java.util.Map; import java.util.Set; @@ -72,24 +74,33 @@ import org.slf4j.Logger; import org.slf4j.LoggerFactory; -import static org.apache.iceberg.TableProperties.CURRENT_SNAPSHOT_ID; -import static org.apache.iceberg.TableProperties.FORMAT_VERSION; - -public class SparkTable implements org.apache.spark.sql.connector.catalog.Table, - SupportsRead, SupportsWrite, SupportsDelete, SupportsRowLevelOperations, SupportsMetadataColumns { +public class SparkTable + implements org.apache.spark.sql.connector.catalog.Table, + SupportsRead, + SupportsWrite, + SupportsDelete, + SupportsRowLevelOperations, + SupportsMetadataColumns { private static final Logger LOG = LoggerFactory.getLogger(SparkTable.class); private static final Set RESERVED_PROPERTIES = - ImmutableSet.of("provider", "format", CURRENT_SNAPSHOT_ID, "location", FORMAT_VERSION, "sort-order", + ImmutableSet.of( + "provider", + "format", + CURRENT_SNAPSHOT_ID, + "location", + FORMAT_VERSION, + "sort-order", "identifier-fields"); - private static final Set CAPABILITIES = ImmutableSet.of( - TableCapability.BATCH_READ, - TableCapability.BATCH_WRITE, - TableCapability.MICRO_BATCH_READ, - TableCapability.STREAMING_WRITE, - TableCapability.OVERWRITE_BY_FILTER, - TableCapability.OVERWRITE_DYNAMIC); + private static final Set CAPABILITIES = + ImmutableSet.of( + TableCapability.BATCH_READ, + TableCapability.BATCH_WRITE, + TableCapability.MICRO_BATCH_READ, + TableCapability.STREAMING_WRITE, + TableCapability.OVERWRITE_BY_FILTER, + TableCapability.OVERWRITE_DYNAMIC); private static final Set CAPABILITIES_WITH_ACCEPT_ANY_SCHEMA = ImmutableSet.builder() .addAll(CAPABILITIES) @@ -112,8 +123,11 @@ public SparkTable(Table icebergTable, Long snapshotId, boolean refreshEagerly) { 
this.snapshotId = snapshotId; this.refreshEagerly = refreshEagerly; - boolean acceptAnySchema = PropertyUtil.propertyAsBoolean(icebergTable.properties(), - TableProperties.SPARK_WRITE_ACCEPT_ANY_SCHEMA, TableProperties.SPARK_WRITE_ACCEPT_ANY_SCHEMA_DEFAULT); + boolean acceptAnySchema = + PropertyUtil.propertyAsBoolean( + icebergTable.properties(), + TableProperties.SPARK_WRITE_ACCEPT_ANY_SCHEMA, + TableProperties.SPARK_WRITE_ACCEPT_ANY_SCHEMA_DEFAULT); this.capabilities = acceptAnySchema ? CAPABILITIES_WITH_ACCEPT_ANY_SCHEMA : CAPABILITIES; } @@ -156,12 +170,17 @@ public Transform[] partitioning() { public Map properties() { ImmutableMap.Builder propsBuilder = ImmutableMap.builder(); - String fileFormat = icebergTable.properties() - .getOrDefault(TableProperties.DEFAULT_FILE_FORMAT, TableProperties.DEFAULT_FILE_FORMAT_DEFAULT); + String fileFormat = + icebergTable + .properties() + .getOrDefault( + TableProperties.DEFAULT_FILE_FORMAT, TableProperties.DEFAULT_FILE_FORMAT_DEFAULT); propsBuilder.put("format", "iceberg/" + fileFormat); propsBuilder.put("provider", "iceberg"); - String currentSnapshotId = icebergTable.currentSnapshot() != null ? - String.valueOf(icebergTable.currentSnapshot().snapshotId()) : "none"; + String currentSnapshotId = + icebergTable.currentSnapshot() != null + ? String.valueOf(icebergTable.currentSnapshot().snapshotId()) + : "none"; propsBuilder.put(CURRENT_SNAPSHOT_ID, currentSnapshotId); propsBuilder.put("location", icebergTable.location()); @@ -195,11 +214,11 @@ public Set capabilities() { public MetadataColumn[] metadataColumns() { DataType sparkPartitionType = SparkSchemaUtil.convert(Partitioning.partitionType(table())); return new MetadataColumn[] { - new SparkMetadataColumn(MetadataColumns.SPEC_ID.name(), DataTypes.IntegerType, false), - new SparkMetadataColumn(MetadataColumns.PARTITION_COLUMN_NAME, sparkPartitionType, true), - new SparkMetadataColumn(MetadataColumns.FILE_PATH.name(), DataTypes.StringType, false), - new SparkMetadataColumn(MetadataColumns.ROW_POSITION.name(), DataTypes.LongType, false), - new SparkMetadataColumn(MetadataColumns.IS_DELETED.name(), DataTypes.BooleanType, false) + new SparkMetadataColumn(MetadataColumns.SPEC_ID.name(), DataTypes.IntegerType, false), + new SparkMetadataColumn(MetadataColumns.PARTITION_COLUMN_NAME, sparkPartitionType, true), + new SparkMetadataColumn(MetadataColumns.FILE_PATH.name(), DataTypes.StringType, false), + new SparkMetadataColumn(MetadataColumns.ROW_POSITION.name(), DataTypes.LongType, false), + new SparkMetadataColumn(MetadataColumns.IS_DELETED.name(), DataTypes.BooleanType, false) }; } @@ -221,8 +240,7 @@ public ScanBuilder newScanBuilder(CaseInsensitiveStringMap options) { @Override public WriteBuilder newWriteBuilder(LogicalWriteInfo info) { Preconditions.checkArgument( - snapshotId == null, - "Cannot write to table at a specific snapshot: %s", snapshotId); + snapshotId == null, "Cannot write to table at a specific snapshot: %s", snapshotId); return new SparkWriteBuilder(sparkSession(), icebergTable, info); } @@ -235,8 +253,7 @@ public RowLevelOperationBuilder newRowLevelOperationBuilder(RowLevelOperationInf @Override public boolean canDeleteWhere(Filter[] filters) { Preconditions.checkArgument( - snapshotId == null, - "Cannot delete from table at a specific snapshot: %s", snapshotId); + snapshotId == null, "Cannot delete from table at a specific snapshot: %s", snapshotId); Expression deleteExpr = Expressions.alwaysTrue(); @@ -254,25 +271,34 @@ public boolean canDeleteWhere(Filter[] filters) { // a 
metadata delete is possible iff matching files can be deleted entirely private boolean canDeleteUsingMetadata(Expression deleteExpr) { - boolean caseSensitive = Boolean.parseBoolean(sparkSession().conf().get("spark.sql.caseSensitive")); - TableScan scan = table().newScan() - .filter(deleteExpr) - .caseSensitive(caseSensitive) - .includeColumnStats() - .ignoreResiduals(); + boolean caseSensitive = + Boolean.parseBoolean(sparkSession().conf().get("spark.sql.caseSensitive")); + TableScan scan = + table() + .newScan() + .filter(deleteExpr) + .caseSensitive(caseSensitive) + .includeColumnStats() + .ignoreResiduals(); try (CloseableIterable tasks = scan.planFiles()) { Map evaluators = Maps.newHashMap(); - StrictMetricsEvaluator metricsEvaluator = new StrictMetricsEvaluator(table().schema(), deleteExpr); - - return Iterables.all(tasks, task -> { - DataFile file = task.file(); - PartitionSpec spec = task.spec(); - Evaluator evaluator = evaluators.computeIfAbsent( - spec.specId(), - specId -> new Evaluator(spec.partitionType(), Projections.strict(spec).project(deleteExpr))); - return evaluator.eval(file.partition()) || metricsEvaluator.eval(file); - }); + StrictMetricsEvaluator metricsEvaluator = + new StrictMetricsEvaluator(table().schema(), deleteExpr); + + return Iterables.all( + tasks, + task -> { + DataFile file = task.file(); + PartitionSpec spec = task.spec(); + Evaluator evaluator = + evaluators.computeIfAbsent( + spec.specId(), + specId -> + new Evaluator( + spec.partitionType(), Projections.strict(spec).project(deleteExpr))); + return evaluator.eval(file.partition()) || metricsEvaluator.eval(file); + }); } catch (IOException ioe) { LOG.warn("Failed to close task iterable", ioe); @@ -289,7 +315,8 @@ public void deleteWhere(Filter[] filters) { return; } - icebergTable.newDelete() + icebergTable + .newDelete() .set("spark.app.id", sparkSession().sparkContext().applicationId()) .deleteFromRowFilter(deleteExpr) .commit(); @@ -319,12 +346,15 @@ public int hashCode() { return icebergTable.name().hashCode(); } - private static CaseInsensitiveStringMap addSnapshotId(CaseInsensitiveStringMap options, Long snapshotId) { + private static CaseInsensitiveStringMap addSnapshotId( + CaseInsensitiveStringMap options, Long snapshotId) { if (snapshotId != null) { String snapshotIdFromOptions = options.get(SparkReadOptions.SNAPSHOT_ID); String value = snapshotId.toString(); - Preconditions.checkArgument(snapshotIdFromOptions == null || snapshotIdFromOptions.equals(value), - "Cannot override snapshot ID more than once: %s", snapshotIdFromOptions); + Preconditions.checkArgument( + snapshotIdFromOptions == null || snapshotIdFromOptions.equals(value), + "Cannot override snapshot ID more than once: %s", + snapshotIdFromOptions); Map scanOptions = Maps.newHashMap(); scanOptions.putAll(options.asCaseSensitiveMap()); diff --git a/spark/v3.3/spark/src/main/java/org/apache/iceberg/spark/source/SparkWrite.java b/spark/v3.3/spark/src/main/java/org/apache/iceberg/spark/source/SparkWrite.java index b6527ffa700f..52e43d3484a6 100644 --- a/spark/v3.3/spark/src/main/java/org/apache/iceberg/spark/source/SparkWrite.java +++ b/spark/v3.3/spark/src/main/java/org/apache/iceberg/spark/source/SparkWrite.java @@ -16,9 +16,19 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.spark.source; +import static org.apache.iceberg.IsolationLevel.SERIALIZABLE; +import static org.apache.iceberg.IsolationLevel.SNAPSHOT; +import static org.apache.iceberg.TableProperties.COMMIT_MAX_RETRY_WAIT_MS; +import static org.apache.iceberg.TableProperties.COMMIT_MAX_RETRY_WAIT_MS_DEFAULT; +import static org.apache.iceberg.TableProperties.COMMIT_MIN_RETRY_WAIT_MS; +import static org.apache.iceberg.TableProperties.COMMIT_MIN_RETRY_WAIT_MS_DEFAULT; +import static org.apache.iceberg.TableProperties.COMMIT_NUM_RETRIES; +import static org.apache.iceberg.TableProperties.COMMIT_NUM_RETRIES_DEFAULT; +import static org.apache.iceberg.TableProperties.COMMIT_TOTAL_RETRY_TIME_MS; +import static org.apache.iceberg.TableProperties.COMMIT_TOTAL_RETRY_TIME_MS_DEFAULT; + import java.io.IOException; import java.util.Arrays; import java.util.Collections; @@ -84,17 +94,6 @@ import org.slf4j.Logger; import org.slf4j.LoggerFactory; -import static org.apache.iceberg.IsolationLevel.SERIALIZABLE; -import static org.apache.iceberg.IsolationLevel.SNAPSHOT; -import static org.apache.iceberg.TableProperties.COMMIT_MAX_RETRY_WAIT_MS; -import static org.apache.iceberg.TableProperties.COMMIT_MAX_RETRY_WAIT_MS_DEFAULT; -import static org.apache.iceberg.TableProperties.COMMIT_MIN_RETRY_WAIT_MS; -import static org.apache.iceberg.TableProperties.COMMIT_MIN_RETRY_WAIT_MS_DEFAULT; -import static org.apache.iceberg.TableProperties.COMMIT_NUM_RETRIES; -import static org.apache.iceberg.TableProperties.COMMIT_NUM_RETRIES_DEFAULT; -import static org.apache.iceberg.TableProperties.COMMIT_TOTAL_RETRY_TIME_MS; -import static org.apache.iceberg.TableProperties.COMMIT_TOTAL_RETRY_TIME_MS_DEFAULT; - abstract class SparkWrite implements Write, RequiresDistributionAndOrdering { private static final Logger LOG = LoggerFactory.getLogger(SparkWrite.class); @@ -116,10 +115,16 @@ abstract class SparkWrite implements Write, RequiresDistributionAndOrdering { private boolean cleanupOnAbort = true; - SparkWrite(SparkSession spark, Table table, SparkWriteConf writeConf, - LogicalWriteInfo writeInfo, String applicationId, - Schema writeSchema, StructType dsSchema, - Distribution requiredDistribution, SortOrder[] requiredOrdering) { + SparkWrite( + SparkSession spark, + Table table, + SparkWriteConf writeConf, + LogicalWriteInfo writeInfo, + String applicationId, + Schema writeSchema, + StructType dsSchema, + Distribution requiredDistribution, + SortOrder[] requiredOrdering) { this.sparkContext = JavaSparkContext.fromSparkContext(spark.sparkContext()); this.table = table; this.writeConf = writeConf; @@ -178,8 +183,10 @@ StreamingWrite asStreamingOverwrite() { // the writer factory works for both batch and streaming private WriterFactory createWriterFactory() { // broadcast the table metadata as the writer factory will be sent to executors - Broadcast
<Table> tableBroadcast = sparkContext.broadcast(SerializableTableWithSize.copyOf(table)); - return new WriterFactory(tableBroadcast, format, targetFileSize, writeSchema, dsSchema, partitionedFanoutEnabled); + Broadcast<Table>
    tableBroadcast = + sparkContext.broadcast(SerializableTableWithSize.copyOf(table)); + return new WriterFactory( + tableBroadcast, format, targetFileSize, writeSchema, dsSchema, partitionedFanoutEnabled); } private void commitOperation(SnapshotUpdate operation, String description) { @@ -221,24 +228,33 @@ private void abort(WriterCommitMessage[] messages) { .executeWith(ThreadPools.getWorkerPool()) .retry(PropertyUtil.propertyAsInt(props, COMMIT_NUM_RETRIES, COMMIT_NUM_RETRIES_DEFAULT)) .exponentialBackoff( - PropertyUtil.propertyAsInt(props, COMMIT_MIN_RETRY_WAIT_MS, COMMIT_MIN_RETRY_WAIT_MS_DEFAULT), - PropertyUtil.propertyAsInt(props, COMMIT_MAX_RETRY_WAIT_MS, COMMIT_MAX_RETRY_WAIT_MS_DEFAULT), - PropertyUtil.propertyAsInt(props, COMMIT_TOTAL_RETRY_TIME_MS, COMMIT_TOTAL_RETRY_TIME_MS_DEFAULT), + PropertyUtil.propertyAsInt( + props, COMMIT_MIN_RETRY_WAIT_MS, COMMIT_MIN_RETRY_WAIT_MS_DEFAULT), + PropertyUtil.propertyAsInt( + props, COMMIT_MAX_RETRY_WAIT_MS, COMMIT_MAX_RETRY_WAIT_MS_DEFAULT), + PropertyUtil.propertyAsInt( + props, COMMIT_TOTAL_RETRY_TIME_MS, COMMIT_TOTAL_RETRY_TIME_MS_DEFAULT), 2.0 /* exponential */) .throwFailureWhenFinished() - .run(file -> { - table.io().deleteFile(file.path().toString()); - }); + .run( + file -> { + table.io().deleteFile(file.path().toString()); + }); } else { - LOG.warn("Skipping cleaning up of data files because Iceberg was unable to determine the final commit state"); + LOG.warn( + "Skipping cleaning up of data files because Iceberg was unable to determine the final commit state"); } } private Iterable files(WriterCommitMessage[] messages) { if (messages.length > 0) { - return Iterables.concat(Iterables.transform(Arrays.asList(messages), message -> message != null ? - ImmutableList.copyOf(((TaskCommit) message).files()) : - ImmutableList.of())); + return Iterables.concat( + Iterables.transform( + Arrays.asList(messages), + message -> + message != null + ? 
ImmutableList.copyOf(((TaskCommit) message).files()) + : ImmutableList.of())); } return ImmutableList.of(); } @@ -312,7 +328,9 @@ public void commit(WriterCommitMessage[] messages) { dynamicOverwrite.addFile(file); } - commitOperation(dynamicOverwrite, String.format("dynamic partition overwrite with %d new data files", numFiles)); + commitOperation( + dynamicOverwrite, + String.format("dynamic partition overwrite with %d new data files", numFiles)); } } @@ -349,7 +367,8 @@ public void commit(WriterCommitMessage[] messages) { overwriteFiles.validateNoConflictingDeletes(); } - String commitMsg = String.format("overwrite by filter %s with %d new data files", overwriteExpr, numFiles); + String commitMsg = + String.format("overwrite by filter %s with %d new data files", overwriteExpr, numFiles); commitOperation(overwriteFiles, commitMsg); } } @@ -405,9 +424,8 @@ public void commit(WriterCommitMessage[] messages) { } } - private void commitWithSerializableIsolation(OverwriteFiles overwriteFiles, - int numOverwrittenFiles, - int numAddedFiles) { + private void commitWithSerializableIsolation( + OverwriteFiles overwriteFiles, int numOverwrittenFiles, int numAddedFiles) { Long scanSnapshotId = scan.snapshotId(); if (scanSnapshotId != null) { overwriteFiles.validateFromSnapshot(scanSnapshotId); @@ -418,15 +436,15 @@ private void commitWithSerializableIsolation(OverwriteFiles overwriteFiles, overwriteFiles.validateNoConflictingData(); overwriteFiles.validateNoConflictingDeletes(); - String commitMsg = String.format( - "overwrite of %d data files with %d new data files, scanSnapshotId: %d, conflictDetectionFilter: %s", - numOverwrittenFiles, numAddedFiles, scanSnapshotId, conflictDetectionFilter); + String commitMsg = + String.format( + "overwrite of %d data files with %d new data files, scanSnapshotId: %d, conflictDetectionFilter: %s", + numOverwrittenFiles, numAddedFiles, scanSnapshotId, conflictDetectionFilter); commitOperation(overwriteFiles, commitMsg); } - private void commitWithSnapshotIsolation(OverwriteFiles overwriteFiles, - int numOverwrittenFiles, - int numAddedFiles) { + private void commitWithSnapshotIsolation( + OverwriteFiles overwriteFiles, int numOverwrittenFiles, int numAddedFiles) { Long scanSnapshotId = scan.snapshotId(); if (scanSnapshotId != null) { overwriteFiles.validateFromSnapshot(scanSnapshotId); @@ -436,9 +454,10 @@ private void commitWithSnapshotIsolation(OverwriteFiles overwriteFiles, overwriteFiles.conflictDetectionFilter(conflictDetectionFilter); overwriteFiles.validateNoConflictingDeletes(); - String commitMsg = String.format( - "overwrite of %d data files with %d new data files", - numOverwrittenFiles, numAddedFiles); + String commitMsg = + String.format( + "overwrite of %d data files with %d new data files", + numOverwrittenFiles, numAddedFiles); commitOperation(overwriteFiles, commitMsg); } } @@ -557,7 +576,10 @@ public void doCommit(long epochId, WriterCommitMessage[] messages) { overwriteFiles.addFile(file); numFiles++; } - commit(overwriteFiles, epochId, String.format("streaming complete overwrite with %d new data files", numFiles)); + commit( + overwriteFiles, + epochId, + String.format("streaming complete overwrite with %d new data files", numFiles)); } } @@ -599,8 +621,13 @@ private static class WriterFactory implements DataWriterFactory, StreamingDataWr private final StructType dsSchema; private final boolean partitionedFanoutEnabled; - protected WriterFactory(Broadcast
    tableBroadcast, FileFormat format, long targetFileSize, - Schema writeSchema, StructType dsSchema, boolean partitionedFanoutEnabled) { + protected WriterFactory( + Broadcast
    tableBroadcast, + FileFormat format, + long targetFileSize, + Schema writeSchema, + StructType dsSchema, + boolean partitionedFanoutEnabled) { this.tableBroadcast = tableBroadcast; this.format = format; this.targetFileSize = targetFileSize; @@ -620,21 +647,28 @@ public DataWriter createWriter(int partitionId, long taskId, long e PartitionSpec spec = table.spec(); FileIO io = table.io(); - OutputFileFactory fileFactory = OutputFileFactory.builderFor(table, partitionId, taskId) - .format(format) - .build(); - SparkFileWriterFactory writerFactory = SparkFileWriterFactory.builderFor(table) - .dataFileFormat(format) - .dataSchema(writeSchema) - .dataSparkType(dsSchema) - .build(); + OutputFileFactory fileFactory = + OutputFileFactory.builderFor(table, partitionId, taskId).format(format).build(); + SparkFileWriterFactory writerFactory = + SparkFileWriterFactory.builderFor(table) + .dataFileFormat(format) + .dataSchema(writeSchema) + .dataSparkType(dsSchema) + .build(); if (spec.isUnpartitioned()) { return new UnpartitionedDataWriter(writerFactory, fileFactory, io, spec, targetFileSize); } else { return new PartitionedDataWriter( - writerFactory, fileFactory, io, spec, writeSchema, dsSchema, targetFileSize, partitionedFanoutEnabled); + writerFactory, + fileFactory, + io, + spec, + writeSchema, + dsSchema, + targetFileSize, + partitionedFanoutEnabled); } } } @@ -651,9 +685,14 @@ private static class UnpartitionedDataWriter implements DataWriter private final FileWriter delegate; private final FileIO io; - private UnpartitionedDataWriter(SparkFileWriterFactory writerFactory, OutputFileFactory fileFactory, - FileIO io, PartitionSpec spec, long targetFileSize) { - this.delegate = new RollingDataWriter<>(writerFactory, fileFactory, io, targetFileSize, spec, null); + private UnpartitionedDataWriter( + SparkFileWriterFactory writerFactory, + OutputFileFactory fileFactory, + FileIO io, + PartitionSpec spec, + long targetFileSize) { + this.delegate = + new RollingDataWriter<>(writerFactory, fileFactory, io, targetFileSize, spec, null); this.io = io; } @@ -667,7 +706,7 @@ public WriterCommitMessage commit() throws IOException { close(); DataWriteResult result = delegate.result(); - TaskCommit taskCommit = new TaskCommit(result.dataFiles().toArray(new DataFile[0])); + TaskCommit taskCommit = new TaskCommit(result.dataFiles().toArray(new DataFile[0])); taskCommit.reportOutputMetrics(); return taskCommit; } @@ -693,9 +732,15 @@ private static class PartitionedDataWriter implements DataWriter { private final PartitionKey partitionKey; private final InternalRowWrapper internalRowWrapper; - private PartitionedDataWriter(SparkFileWriterFactory writerFactory, OutputFileFactory fileFactory, - FileIO io, PartitionSpec spec, Schema dataSchema, - StructType dataSparkType, long targetFileSize, boolean fanoutEnabled) { + private PartitionedDataWriter( + SparkFileWriterFactory writerFactory, + OutputFileFactory fileFactory, + FileIO io, + PartitionSpec spec, + Schema dataSchema, + StructType dataSparkType, + long targetFileSize, + boolean fanoutEnabled) { if (fanoutEnabled) { this.delegate = new FanoutDataWriter<>(writerFactory, fileFactory, io, targetFileSize); } else { @@ -718,7 +763,7 @@ public WriterCommitMessage commit() throws IOException { close(); DataWriteResult result = delegate.result(); - TaskCommit taskCommit = new TaskCommit(result.dataFiles().toArray(new DataFile[0])); + TaskCommit taskCommit = new TaskCommit(result.dataFiles().toArray(new DataFile[0])); taskCommit.reportOutputMetrics(); return 
taskCommit; } diff --git a/spark/v3.3/spark/src/main/java/org/apache/iceberg/spark/source/SparkWriteBuilder.java b/spark/v3.3/spark/src/main/java/org/apache/iceberg/spark/source/SparkWriteBuilder.java index dd0fc8e3ce37..a5c7de4369db 100644 --- a/spark/v3.3/spark/src/main/java/org/apache/iceberg/spark/source/SparkWriteBuilder.java +++ b/spark/v3.3/spark/src/main/java/org/apache/iceberg/spark/source/SparkWriteBuilder.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.source; import org.apache.iceberg.DistributionMode; @@ -87,10 +86,13 @@ class SparkWriteBuilder implements WriteBuilder, SupportsDynamicOverwrite, Suppo } public WriteBuilder overwriteFiles(Scan scan, Command command, IsolationLevel isolationLevel) { - Preconditions.checkArgument(scan instanceof SparkCopyOnWriteScan, "%s is not SparkCopyOnWriteScan", scan); + Preconditions.checkArgument( + scan instanceof SparkCopyOnWriteScan, "%s is not SparkCopyOnWriteScan", scan); Preconditions.checkState(!overwriteByFilter, "Cannot overwrite individual files and by filter"); - Preconditions.checkState(!overwriteDynamic, "Cannot overwrite individual files and dynamically"); - Preconditions.checkState(rewrittenFileSetId == null, "Cannot overwrite individual files and rewrite"); + Preconditions.checkState( + !overwriteDynamic, "Cannot overwrite individual files and dynamically"); + Preconditions.checkState( + rewrittenFileSetId == null, "Cannot overwrite individual files and rewrite"); this.overwriteFiles = true; this.copyOnWriteScan = (SparkCopyOnWriteScan) scan; @@ -101,9 +103,11 @@ public WriteBuilder overwriteFiles(Scan scan, Command command, IsolationLevel is @Override public WriteBuilder overwriteDynamicPartitions() { - Preconditions.checkState(!overwriteByFilter, "Cannot overwrite dynamically and by filter: %s", overwriteExpr); + Preconditions.checkState( + !overwriteByFilter, "Cannot overwrite dynamically and by filter: %s", overwriteExpr); Preconditions.checkState(!overwriteFiles, "Cannot overwrite individual files and dynamically"); - Preconditions.checkState(rewrittenFileSetId == null, "Cannot overwrite dynamically and rewrite"); + Preconditions.checkState( + rewrittenFileSetId == null, "Cannot overwrite dynamically and rewrite"); this.overwriteDynamic = true; return this; @@ -111,7 +115,8 @@ public WriteBuilder overwriteDynamicPartitions() { @Override public WriteBuilder overwrite(Filter[] filters) { - Preconditions.checkState(!overwriteFiles, "Cannot overwrite individual files and using filters"); + Preconditions.checkState( + !overwriteFiles, "Cannot overwrite individual files and using filters"); Preconditions.checkState(rewrittenFileSetId == null, "Cannot overwrite and rewrite"); this.overwriteExpr = SparkFilters.convert(filters); @@ -119,7 +124,8 @@ public WriteBuilder overwrite(Filter[] filters) { // use the write option to override truncating the table. use dynamic overwrite instead. 
this.overwriteDynamic = true; } else { - Preconditions.checkState(!overwriteDynamic, "Cannot overwrite dynamically and by filter: %s", overwriteExpr); + Preconditions.checkState( + !overwriteDynamic, "Cannot overwrite dynamically and by filter: %s", overwriteExpr); this.overwriteByFilter = true; } return this; @@ -128,7 +134,8 @@ public WriteBuilder overwrite(Filter[] filters) { @Override public Write build() { // Validate - Preconditions.checkArgument(handleTimestampWithoutZone || !SparkUtil.hasTimestampWithoutZone(table.schema()), + Preconditions.checkArgument( + handleTimestampWithoutZone || !SparkUtil.hasTimestampWithoutZone(table.schema()), SparkUtil.TIMESTAMP_WITHOUT_TIMEZONE_ERROR); Schema writeSchema = validateOrMergeWriteSchema(table, dsSchema, writeConf); @@ -145,7 +152,8 @@ public Write build() { distribution = buildRequiredDistribution(); ordering = buildRequiredOrdering(distribution); } else { - LOG.warn("Skipping distribution/ordering: extensions are disabled and spec contains unsupported transforms"); + LOG.warn( + "Skipping distribution/ordering: extensions are disabled and spec contains unsupported transforms"); distribution = Distributions.unspecified(); ordering = NO_ORDERING; } @@ -155,7 +163,8 @@ public Write build() { ordering = NO_ORDERING; } - return new SparkWrite(spark, table, writeConf, writeInfo, appId, writeSchema, dsSchema, distribution, ordering) { + return new SparkWrite( + spark, table, writeConf, writeInfo, appId, writeSchema, dsSchema, distribution, ordering) { @Override public BatchWrite toBatch() { @@ -174,12 +183,14 @@ public BatchWrite toBatch() { @Override public StreamingWrite toStreaming() { - Preconditions.checkState(!overwriteDynamic, - "Unsupported streaming operation: dynamic partition overwrite"); - Preconditions.checkState(!overwriteByFilter || overwriteExpr == Expressions.alwaysTrue(), - "Unsupported streaming operation: overwrite by filter: %s", overwriteExpr); - Preconditions.checkState(rewrittenFileSetId == null, - "Unsupported streaming operation: rewrite"); + Preconditions.checkState( + !overwriteDynamic, "Unsupported streaming operation: dynamic partition overwrite"); + Preconditions.checkState( + !overwriteByFilter || overwriteExpr == Expressions.alwaysTrue(), + "Unsupported streaming operation: overwrite by filter: %s", + overwriteExpr); + Preconditions.checkState( + rewrittenFileSetId == null, "Unsupported streaming operation: rewrite"); if (overwriteByFilter) { return asStreamingOverwrite(); @@ -193,7 +204,8 @@ public StreamingWrite toStreaming() { private Distribution buildRequiredDistribution() { if (overwriteFiles) { DistributionMode distributionMode = copyOnWriteDistributionMode(); - return SparkDistributionAndOrderingUtil.buildCopyOnWriteDistribution(table, copyOnWriteCommand, distributionMode); + return SparkDistributionAndOrderingUtil.buildCopyOnWriteDistribution( + table, copyOnWriteCommand, distributionMode); } else { DistributionMode distributionMode = writeConf.distributionMode(); return SparkDistributionAndOrderingUtil.buildRequiredDistribution(table, distributionMode); @@ -215,7 +227,8 @@ private DistributionMode copyOnWriteDistributionMode() { private SortOrder[] buildRequiredOrdering(Distribution requiredDistribution) { if (overwriteFiles) { - return SparkDistributionAndOrderingUtil.buildCopyOnWriteOrdering(table, copyOnWriteCommand, requiredDistribution); + return SparkDistributionAndOrderingUtil.buildCopyOnWriteOrdering( + table, copyOnWriteCommand, requiredDistribution); } else { return 
SparkDistributionAndOrderingUtil.buildRequiredOrdering(table, requiredDistribution); } @@ -225,7 +238,8 @@ private boolean allIdentityTransforms(PartitionSpec spec) { return spec.fields().stream().allMatch(field -> field.transform().isIdentity()); } - private static Schema validateOrMergeWriteSchema(Table table, StructType dsSchema, SparkWriteConf writeConf) { + private static Schema validateOrMergeWriteSchema( + Table table, StructType dsSchema, SparkWriteConf writeConf) { Schema writeSchema; if (writeConf.mergeSchema()) { // convert the dataset schema and assign fresh ids for new fields diff --git a/spark/v3.3/spark/src/main/java/org/apache/iceberg/spark/source/StagedSparkTable.java b/spark/v3.3/spark/src/main/java/org/apache/iceberg/spark/source/StagedSparkTable.java index 2e018cb09496..b92c02d2b536 100644 --- a/spark/v3.3/spark/src/main/java/org/apache/iceberg/spark/source/StagedSparkTable.java +++ b/spark/v3.3/spark/src/main/java/org/apache/iceberg/spark/source/StagedSparkTable.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.source; import org.apache.iceberg.Transaction; diff --git a/spark/v3.3/spark/src/main/java/org/apache/iceberg/spark/source/Stats.java b/spark/v3.3/spark/src/main/java/org/apache/iceberg/spark/source/Stats.java index 939b07a0af61..ddf6ca834d9b 100644 --- a/spark/v3.3/spark/src/main/java/org/apache/iceberg/spark/source/Stats.java +++ b/spark/v3.3/spark/src/main/java/org/apache/iceberg/spark/source/Stats.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.source; import java.util.OptionalLong; diff --git a/spark/v3.3/spark/src/main/java/org/apache/iceberg/spark/source/StreamingOffset.java b/spark/v3.3/spark/src/main/java/org/apache/iceberg/spark/source/StreamingOffset.java index 64277ecf3be5..f2088deb1ee3 100644 --- a/spark/v3.3/spark/src/main/java/org/apache/iceberg/spark/source/StreamingOffset.java +++ b/spark/v3.3/spark/src/main/java/org/apache/iceberg/spark/source/StreamingOffset.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.source; import com.fasterxml.jackson.core.JsonGenerator; @@ -47,10 +46,10 @@ class StreamingOffset extends Offset { * An implementation of Spark Structured Streaming Offset, to track the current processed files of * Iceberg table. * - * @param snapshotId The current processed snapshot id. - * @param position The position of last scanned file in snapshot. - * @param scanAllFiles whether to scan all files in a snapshot; for example, to read - * all data when starting a stream. + * @param snapshotId The current processed snapshot id. + * @param position The position of last scanned file in snapshot. + * @param scanAllFiles whether to scan all files in a snapshot; for example, to read all data when + * starting a stream. 
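A hedged usage sketch of the offset described by the constructor Javadoc above: the constructor is package-private, and json() is inherited from Spark's streaming Offset base class, so the call site below is an illustration rather than code from this diff.

package org.apache.iceberg.spark.source;

class StreamingOffsetRoundTripSketch {
  public static void main(String[] args) {
    // Track position 0 of snapshot 42 and scan all files already present in that snapshot.
    StreamingOffset offset = new StreamingOffset(42L, 0L, true);

    // json() writes the version, snapshot id, position, and scan-all-files flag;
    // fromJson() (defined later in this file) validates the version and rebuilds an equal offset.
    StreamingOffset parsed = StreamingOffset.fromJson(offset.json());

    System.out.println(parsed.equals(offset)); // true
  }
}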
*/ StreamingOffset(long snapshotId, long position, boolean scanAllFiles) { this.snapshotId = snapshotId; @@ -65,7 +64,8 @@ static StreamingOffset fromJson(String json) { JsonNode node = JsonUtil.mapper().readValue(json, JsonNode.class); return fromJsonNode(node); } catch (IOException e) { - throw new UncheckedIOException(String.format("Failed to parse StreamingOffset from JSON string %s", json), e); + throw new UncheckedIOException( + String.format("Failed to parse StreamingOffset from JSON string %s", json), e); } } @@ -118,9 +118,9 @@ boolean shouldScanAllFiles() { public boolean equals(Object obj) { if (obj instanceof StreamingOffset) { StreamingOffset offset = (StreamingOffset) obj; - return offset.snapshotId == snapshotId && - offset.position == position && - offset.scanAllFiles == scanAllFiles; + return offset.snapshotId == snapshotId + && offset.position == position + && offset.scanAllFiles == scanAllFiles; } else { return false; } @@ -133,17 +133,20 @@ public int hashCode() { @Override public String toString() { - return String.format("Streaming Offset[%d: position (%d) scan_all_files (%b)]", - snapshotId, position, scanAllFiles); + return String.format( + "Streaming Offset[%d: position (%d) scan_all_files (%b)]", + snapshotId, position, scanAllFiles); } private static StreamingOffset fromJsonNode(JsonNode node) { // The version of StreamingOffset. The offset was created with a version number // used to validate when deserializing from json string. int version = JsonUtil.getInt(VERSION, node); - Preconditions.checkArgument(version == CURR_VERSION, + Preconditions.checkArgument( + version == CURR_VERSION, "This version of Iceberg source only supports version %s. Version %s is not supported.", - CURR_VERSION, version); + CURR_VERSION, + version); long snapshotId = JsonUtil.getLong(SNAPSHOT_ID, node); int position = JsonUtil.getInt(POSITION, node); diff --git a/spark/v3.3/spark/src/main/java/org/apache/iceberg/spark/source/StructInternalRow.java b/spark/v3.3/spark/src/main/java/org/apache/iceberg/spark/source/StructInternalRow.java index a2288ef3edd7..3c7ebabeab3d 100644 --- a/spark/v3.3/spark/src/main/java/org/apache/iceberg/spark/source/StructInternalRow.java +++ b/spark/v3.3/spark/src/main/java/org/apache/iceberg/spark/source/StructInternalRow.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.source; import java.math.BigDecimal; @@ -128,7 +127,8 @@ public int getInt(int ordinal) { } else if (integer instanceof LocalDate) { return (int) ((LocalDate) integer).toEpochDay(); } else { - throw new IllegalStateException("Unknown type for int field. Type name: " + integer.getClass().getName()); + throw new IllegalStateException( + "Unknown type for int field. Type name: " + integer.getClass().getName()); } } @@ -143,7 +143,8 @@ public long getLong(int ordinal) { } else if (longVal instanceof LocalDate) { return ((LocalDate) longVal).toEpochDay(); } else { - throw new IllegalStateException("Unknown type for long field. Type name: " + longVal.getClass().getName()); + throw new IllegalStateException( + "Unknown type for long field. Type name: " + longVal.getClass().getName()); } } @@ -190,7 +191,8 @@ private byte[] getBinaryInternal(int ordinal) { } else if (bytes instanceof byte[]) { return (byte[]) bytes; } else { - throw new IllegalStateException("Unknown type for binary field. Type name: " + bytes.getClass().getName()); + throw new IllegalStateException( + "Unknown type for binary field. 
Type name: " + bytes.getClass().getName()); } } @@ -206,8 +208,7 @@ public InternalRow getStruct(int ordinal, int numFields) { private InternalRow getStructInternal(int ordinal, int numFields) { return new StructInternalRow( - type.fields().get(ordinal).type().asStructType(), - struct.get(ordinal, StructLike.class)); + type.fields().get(ordinal).type().asStructType(), struct.get(ordinal, StructLike.class)); } @Override @@ -227,7 +228,8 @@ public MapData getMap(int ordinal) { } private MapData getMapInternal(int ordinal) { - return mapToMapData(type.fields().get(ordinal).type().asMapType(), struct.get(ordinal, Map.class)); + return mapToMapData( + type.fields().get(ordinal).type().asMapType(), struct.get(ordinal, Map.class)); } @Override @@ -292,31 +294,52 @@ private ArrayData collectionToArrayData(Type elementType, Collection values) case DOUBLE: return fillArray(values, array -> (pos, value) -> array[pos] = value); case STRING: - return fillArray(values, array -> - (BiConsumer) (pos, seq) -> array[pos] = UTF8String.fromString(seq.toString())); + return fillArray( + values, + array -> + (BiConsumer) + (pos, seq) -> array[pos] = UTF8String.fromString(seq.toString())); case FIXED: case BINARY: - return fillArray(values, array -> - (BiConsumer) (pos, buf) -> array[pos] = ByteBuffers.toByteArray(buf)); + return fillArray( + values, + array -> + (BiConsumer) + (pos, buf) -> array[pos] = ByteBuffers.toByteArray(buf)); case DECIMAL: - return fillArray(values, array -> - (BiConsumer) (pos, dec) -> array[pos] = Decimal.apply(dec)); + return fillArray( + values, + array -> + (BiConsumer) (pos, dec) -> array[pos] = Decimal.apply(dec)); case STRUCT: - return fillArray(values, array -> (BiConsumer) (pos, tuple) -> - array[pos] = new StructInternalRow(elementType.asStructType(), tuple)); + return fillArray( + values, + array -> + (BiConsumer) + (pos, tuple) -> + array[pos] = new StructInternalRow(elementType.asStructType(), tuple)); case LIST: - return fillArray(values, array -> (BiConsumer>) (pos, list) -> - array[pos] = collectionToArrayData(elementType.asListType().elementType(), list)); + return fillArray( + values, + array -> + (BiConsumer>) + (pos, list) -> + array[pos] = + collectionToArrayData(elementType.asListType().elementType(), list)); case MAP: - return fillArray(values, array -> (BiConsumer>) (pos, map) -> - array[pos] = mapToMapData(elementType.asMapType(), map)); + return fillArray( + values, + array -> + (BiConsumer>) + (pos, map) -> array[pos] = mapToMapData(elementType.asMapType(), map)); default: throw new UnsupportedOperationException("Unsupported array element type: " + elementType); } } @SuppressWarnings("unchecked") - private GenericArrayData fillArray(Collection values, Function> makeSetter) { + private GenericArrayData fillArray( + Collection values, Function> makeSetter) { Object[] array = new Object[values.size()]; BiConsumer setter = makeSetter.apply(array); diff --git a/spark/v3.3/spark/src/main/java/org/apache/spark/sql/catalyst/analysis/NoSuchProcedureException.java b/spark/v3.3/spark/src/main/java/org/apache/spark/sql/catalyst/analysis/NoSuchProcedureException.java index 25f6368cc8c3..18cc64a20601 100644 --- a/spark/v3.3/spark/src/main/java/org/apache/spark/sql/catalyst/analysis/NoSuchProcedureException.java +++ b/spark/v3.3/spark/src/main/java/org/apache/spark/sql/catalyst/analysis/NoSuchProcedureException.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.spark.sql.catalyst.analysis; import org.apache.spark.sql.AnalysisException; @@ -25,7 +24,13 @@ public class NoSuchProcedureException extends AnalysisException { public NoSuchProcedureException(Identifier ident) { - super("Procedure " + ident + " not found", Option.empty(), Option.empty(), Option.empty(), - Option.empty(), Option.empty(), new String[0]); + super( + "Procedure " + ident + " not found", + Option.empty(), + Option.empty(), + Option.empty(), + Option.empty(), + Option.empty(), + new String[0]); } } diff --git a/spark/v3.3/spark/src/main/java/org/apache/spark/sql/connector/iceberg/catalog/Procedure.java b/spark/v3.3/spark/src/main/java/org/apache/spark/sql/connector/iceberg/catalog/Procedure.java index 8f7a70b9f9fc..11f215ba040a 100644 --- a/spark/v3.3/spark/src/main/java/org/apache/spark/sql/connector/iceberg/catalog/Procedure.java +++ b/spark/v3.3/spark/src/main/java/org/apache/spark/sql/connector/iceberg/catalog/Procedure.java @@ -16,44 +16,34 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.spark.sql.connector.iceberg.catalog; import org.apache.spark.sql.catalyst.InternalRow; import org.apache.spark.sql.types.StructType; -/** - * An interface representing a stored procedure available for execution. - */ +/** An interface representing a stored procedure available for execution. */ public interface Procedure { - /** - * Returns the input parameters of this procedure. - */ + /** Returns the input parameters of this procedure. */ ProcedureParameter[] parameters(); - /** - * Returns the type of rows produced by this procedure. - */ + /** Returns the type of rows produced by this procedure. */ StructType outputType(); /** * Executes this procedure. - *
    - * Spark will align the provided arguments according to the input parameters - * defined in {@link #parameters()} either by position or by name before execution. - *
    - * Implementations may provide a summary of execution by returning one or many rows - * as a result. The schema of output rows must match the defined output type - * in {@link #outputType()}. + * + *
    Spark will align the provided arguments according to the input parameters defined in {@link + * #parameters()} either by position or by name before execution. + * + *
    Implementations may provide a summary of execution by returning one or many rows as a + * result. The schema of output rows must match the defined output type in {@link #outputType()}. * * @param args input arguments * @return the result of executing this procedure with the given arguments */ InternalRow[] call(InternalRow args); - /** - * Returns the description of this procedure. - */ + /** Returns the description of this procedure. */ default String description() { return this.getClass().toString(); } diff --git a/spark/v3.3/spark/src/main/java/org/apache/spark/sql/connector/iceberg/catalog/ProcedureCatalog.java b/spark/v3.3/spark/src/main/java/org/apache/spark/sql/connector/iceberg/catalog/ProcedureCatalog.java index 314bd659460e..2cee97ee5938 100644 --- a/spark/v3.3/spark/src/main/java/org/apache/spark/sql/connector/iceberg/catalog/ProcedureCatalog.java +++ b/spark/v3.3/spark/src/main/java/org/apache/spark/sql/connector/iceberg/catalog/ProcedureCatalog.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.spark.sql.connector.iceberg.catalog; import org.apache.spark.sql.catalyst.analysis.NoSuchProcedureException; @@ -25,9 +24,9 @@ /** * A catalog API for working with stored procedures. - *
    - * Implementations should implement this interface if they expose stored procedures that - * can be called via CALL statements. + * + *
    Implementations should implement this interface if they expose stored procedures that can be + * called via CALL statements. */ public interface ProcedureCatalog extends CatalogPlugin { /** diff --git a/spark/v3.3/spark/src/main/java/org/apache/spark/sql/connector/iceberg/catalog/ProcedureParameter.java b/spark/v3.3/spark/src/main/java/org/apache/spark/sql/connector/iceberg/catalog/ProcedureParameter.java index b341dc1e3282..e1e84b2597f3 100644 --- a/spark/v3.3/spark/src/main/java/org/apache/spark/sql/connector/iceberg/catalog/ProcedureParameter.java +++ b/spark/v3.3/spark/src/main/java/org/apache/spark/sql/connector/iceberg/catalog/ProcedureParameter.java @@ -16,14 +16,11 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.spark.sql.connector.iceberg.catalog; import org.apache.spark.sql.types.DataType; -/** - * An input parameter of a {@link Procedure stored procedure}. - */ +/** An input parameter of a {@link Procedure stored procedure}. */ public interface ProcedureParameter { /** @@ -48,18 +45,12 @@ static ProcedureParameter optional(String name, DataType dataType) { return new ProcedureParameterImpl(name, dataType, false); } - /** - * Returns the name of this parameter. - */ + /** Returns the name of this parameter. */ String name(); - /** - * Returns the type of this parameter. - */ + /** Returns the type of this parameter. */ DataType dataType(); - /** - * Returns true if this parameter is required. - */ + /** Returns true if this parameter is required. */ boolean required(); } diff --git a/spark/v3.3/spark/src/main/java/org/apache/spark/sql/connector/iceberg/catalog/ProcedureParameterImpl.java b/spark/v3.3/spark/src/main/java/org/apache/spark/sql/connector/iceberg/catalog/ProcedureParameterImpl.java index cea1e80f4051..c59951e24330 100644 --- a/spark/v3.3/spark/src/main/java/org/apache/spark/sql/connector/iceberg/catalog/ProcedureParameterImpl.java +++ b/spark/v3.3/spark/src/main/java/org/apache/spark/sql/connector/iceberg/catalog/ProcedureParameterImpl.java @@ -16,15 +16,12 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.spark.sql.connector.iceberg.catalog; import java.util.Objects; import org.apache.spark.sql.types.DataType; -/** - * A {@link ProcedureParameter} implementation. - */ +/** A {@link ProcedureParameter} implementation. 
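To make the Procedure and ProcedureParameter contracts above concrete, here is a minimal, hypothetical implementation; the class and parameter names are illustrative, and the required(...) factory is assumed to mirror the optional(...) factory shown in this hunk.

import org.apache.spark.sql.catalyst.InternalRow;
import org.apache.spark.sql.connector.iceberg.catalog.Procedure;
import org.apache.spark.sql.connector.iceberg.catalog.ProcedureParameter;
import org.apache.spark.sql.types.DataTypes;
import org.apache.spark.sql.types.Metadata;
import org.apache.spark.sql.types.StructField;
import org.apache.spark.sql.types.StructType;

class EchoProcedure implements Procedure {
  @Override
  public ProcedureParameter[] parameters() {
    // One mandatory string argument; Spark aligns CALL arguments to this by position or by name.
    return new ProcedureParameter[] {ProcedureParameter.required("value", DataTypes.StringType)};
  }

  @Override
  public StructType outputType() {
    // Rows returned from call() must match this schema.
    return new StructType(
        new StructField[] {new StructField("value", DataTypes.StringType, true, Metadata.empty())});
  }

  @Override
  public InternalRow[] call(InternalRow args) {
    // Echo the aligned argument row back as the procedure's single output row.
    return new InternalRow[] {args.copy()};
  }
}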
*/ class ProcedureParameterImpl implements ProcedureParameter { private final String name; private final DataType dataType; @@ -60,9 +57,9 @@ public boolean equals(Object other) { } ProcedureParameterImpl that = (ProcedureParameterImpl) other; - return required == that.required && - Objects.equals(name, that.name) && - Objects.equals(dataType, that.dataType); + return required == that.required + && Objects.equals(name, that.name) + && Objects.equals(dataType, that.dataType); } @Override @@ -72,6 +69,7 @@ public int hashCode() { @Override public String toString() { - return String.format("ProcedureParameter(name='%s', type=%s, required=%b)", name, dataType, required); + return String.format( + "ProcedureParameter(name='%s', type=%s, required=%b)", name, dataType, required); } } diff --git a/spark/v3.3/spark/src/main/java/org/apache/spark/sql/connector/iceberg/write/DeltaBatchWrite.java b/spark/v3.3/spark/src/main/java/org/apache/spark/sql/connector/iceberg/write/DeltaBatchWrite.java index a1fbf0fb8a24..1bad054e3215 100644 --- a/spark/v3.3/spark/src/main/java/org/apache/spark/sql/connector/iceberg/write/DeltaBatchWrite.java +++ b/spark/v3.3/spark/src/main/java/org/apache/spark/sql/connector/iceberg/write/DeltaBatchWrite.java @@ -16,15 +16,12 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.spark.sql.connector.iceberg.write; import org.apache.spark.sql.connector.write.BatchWrite; import org.apache.spark.sql.connector.write.PhysicalWriteInfo; -/** - * An interface that defines how to write a delta of rows during batch processing. - */ +/** An interface that defines how to write a delta of rows during batch processing. */ public interface DeltaBatchWrite extends BatchWrite { @Override DeltaWriterFactory createBatchWriterFactory(PhysicalWriteInfo info); diff --git a/spark/v3.3/spark/src/main/java/org/apache/spark/sql/connector/iceberg/write/DeltaWrite.java b/spark/v3.3/spark/src/main/java/org/apache/spark/sql/connector/iceberg/write/DeltaWrite.java index 7643e2d103ec..fb452c652e7d 100644 --- a/spark/v3.3/spark/src/main/java/org/apache/spark/sql/connector/iceberg/write/DeltaWrite.java +++ b/spark/v3.3/spark/src/main/java/org/apache/spark/sql/connector/iceberg/write/DeltaWrite.java @@ -16,16 +16,15 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.spark.sql.connector.iceberg.write; import org.apache.spark.sql.connector.write.Write; /** - * A logical representation of a data source write that handles a delta of rows. - * A delta of rows is a set of instructions that indicate which records need to be deleted, - * updated, or inserted. Data sources that support deltas allow Spark to discard unchanged rows - * and pass only the information about what rows have changed during a row-level operation. + * A logical representation of a data source write that handles a delta of rows. A delta of rows is + * a set of instructions that indicate which records need to be deleted, updated, or inserted. Data + * sources that support deltas allow Spark to discard unchanged rows and pass only the information + * about what rows have changed during a row-level operation. 
*/ public interface DeltaWrite extends Write { @Override diff --git a/spark/v3.3/spark/src/main/java/org/apache/spark/sql/connector/iceberg/write/DeltaWriteBuilder.java b/spark/v3.3/spark/src/main/java/org/apache/spark/sql/connector/iceberg/write/DeltaWriteBuilder.java index 1af1c6680c89..56214a4adcbf 100644 --- a/spark/v3.3/spark/src/main/java/org/apache/spark/sql/connector/iceberg/write/DeltaWriteBuilder.java +++ b/spark/v3.3/spark/src/main/java/org/apache/spark/sql/connector/iceberg/write/DeltaWriteBuilder.java @@ -16,18 +16,13 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.spark.sql.connector.iceberg.write; import org.apache.spark.sql.connector.write.WriteBuilder; -/** - * An interface for building delta writes. - */ +/** An interface for building delta writes. */ public interface DeltaWriteBuilder extends WriteBuilder { - /** - * Returns a logical delta write. - */ + /** Returns a logical delta write. */ @Override default DeltaWrite build() { throw new UnsupportedOperationException("Not implemented: build"); diff --git a/spark/v3.3/spark/src/main/java/org/apache/spark/sql/connector/iceberg/write/DeltaWriter.java b/spark/v3.3/spark/src/main/java/org/apache/spark/sql/connector/iceberg/write/DeltaWriter.java index a17ee33b13d7..efeed371f940 100644 --- a/spark/v3.3/spark/src/main/java/org/apache/spark/sql/connector/iceberg/write/DeltaWriter.java +++ b/spark/v3.3/spark/src/main/java/org/apache/spark/sql/connector/iceberg/write/DeltaWriter.java @@ -16,15 +16,12 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.spark.sql.connector.iceberg.write; import java.io.IOException; import org.apache.spark.sql.connector.write.DataWriter; -/** - * A data writer responsible for writing a delta of rows. - */ +/** A data writer responsible for writing a delta of rows. */ public interface DeltaWriter extends DataWriter { /** * Passes information for a row that must be deleted. diff --git a/spark/v3.3/spark/src/main/java/org/apache/spark/sql/connector/iceberg/write/DeltaWriterFactory.java b/spark/v3.3/spark/src/main/java/org/apache/spark/sql/connector/iceberg/write/DeltaWriterFactory.java index 77af70958e41..f779474c35bd 100644 --- a/spark/v3.3/spark/src/main/java/org/apache/spark/sql/connector/iceberg/write/DeltaWriterFactory.java +++ b/spark/v3.3/spark/src/main/java/org/apache/spark/sql/connector/iceberg/write/DeltaWriterFactory.java @@ -16,15 +16,12 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.spark.sql.connector.iceberg.write; import org.apache.spark.sql.catalyst.InternalRow; import org.apache.spark.sql.connector.write.DataWriterFactory; -/** - * A factory for creating and initializing delta writers at the executor side. - */ +/** A factory for creating and initializing delta writers at the executor side. 
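A hedged sketch of how a caller might drive the DeltaWriter above during a row-level operation such as MERGE. Only the delete(...) Javadoc is visible in this hunk, so the exact delete/insert signatures used below are assumptions modeled on the delta-write pattern, not something this diff guarantees.

import java.io.IOException;
import org.apache.spark.sql.catalyst.InternalRow;
import org.apache.spark.sql.connector.iceberg.write.DeltaWriter;

class DeltaWriterUsageSketch {
  // Replace a matched row: drop the old copy by its row ID, then write the new version.
  static void replace(
      DeltaWriter<InternalRow> writer, InternalRow metadata, InternalRow rowId, InternalRow newRow)
      throws IOException {
    writer.delete(metadata, rowId); // assumed signature: projected metadata columns plus the row ID
    writer.insert(newRow);          // assumed signature: just the new row
  }
}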
*/ public interface DeltaWriterFactory extends DataWriterFactory { @Override DeltaWriter createWriter(int partitionId, long taskId); diff --git a/spark/v3.3/spark/src/main/java/org/apache/spark/sql/connector/iceberg/write/ExtendedLogicalWriteInfo.java b/spark/v3.3/spark/src/main/java/org/apache/spark/sql/connector/iceberg/write/ExtendedLogicalWriteInfo.java index 3c39a4f0f1b5..a13f56ecea0d 100644 --- a/spark/v3.3/spark/src/main/java/org/apache/spark/sql/connector/iceberg/write/ExtendedLogicalWriteInfo.java +++ b/spark/v3.3/spark/src/main/java/org/apache/spark/sql/connector/iceberg/write/ExtendedLogicalWriteInfo.java @@ -16,23 +16,16 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.spark.sql.connector.iceberg.write; import org.apache.spark.sql.connector.write.LogicalWriteInfo; import org.apache.spark.sql.types.StructType; -/** - * A class that holds logical write information not covered by LogicalWriteInfo in Spark. - */ +/** A class that holds logical write information not covered by LogicalWriteInfo in Spark. */ public interface ExtendedLogicalWriteInfo extends LogicalWriteInfo { - /** - * The schema of the input metadata from Spark to data source. - */ + /** The schema of the input metadata from Spark to data source. */ StructType metadataSchema(); - /** - * The schema of the ID columns from Spark to data source. - */ + /** The schema of the ID columns from Spark to data source. */ StructType rowIdSchema(); } diff --git a/spark/v3.3/spark/src/main/java/org/apache/spark/sql/connector/iceberg/write/SupportsDelta.java b/spark/v3.3/spark/src/main/java/org/apache/spark/sql/connector/iceberg/write/SupportsDelta.java index 2b0926326470..a26da45f213d 100644 --- a/spark/v3.3/spark/src/main/java/org/apache/spark/sql/connector/iceberg/write/SupportsDelta.java +++ b/spark/v3.3/spark/src/main/java/org/apache/spark/sql/connector/iceberg/write/SupportsDelta.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.spark.sql.connector.iceberg.write; import org.apache.spark.sql.connector.expressions.NamedReference; @@ -24,15 +23,13 @@ import org.apache.spark.sql.connector.write.RowLevelOperation; /** - * A mix-in interface for RowLevelOperation. Data sources can implement this interface - * to indicate they support handling deltas of rows. + * A mix-in interface for RowLevelOperation. Data sources can implement this interface to indicate + * they support handling deltas of rows. */ public interface SupportsDelta extends RowLevelOperation { @Override DeltaWriteBuilder newWriteBuilder(LogicalWriteInfo info); - /** - * Returns the row ID column references that should be used for row equality. - */ + /** Returns the row ID column references that should be used for row equality. */ NamedReference[] rowId(); } diff --git a/spark/v3.3/spark/src/test/java/org/apache/iceberg/KryoHelpers.java b/spark/v3.3/spark/src/test/java/org/apache/iceberg/KryoHelpers.java index ee0f0a73959a..6d88aaa11813 100644 --- a/spark/v3.3/spark/src/test/java/org/apache/iceberg/KryoHelpers.java +++ b/spark/v3.3/spark/src/test/java/org/apache/iceberg/KryoHelpers.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg; import com.esotericsoftware.kryo.Kryo; @@ -32,8 +31,7 @@ public class KryoHelpers { - private KryoHelpers() { - } + private KryoHelpers() {} @SuppressWarnings("unchecked") public static T roundTripSerialize(T obj) throws IOException { @@ -45,7 +43,8 @@ public static T roundTripSerialize(T obj) throws IOException { kryo.writeClassAndObject(out, obj); } - try (Input in = new Input(new ObjectInputStream(new ByteArrayInputStream(bytes.toByteArray())))) { + try (Input in = + new Input(new ObjectInputStream(new ByteArrayInputStream(bytes.toByteArray())))) { return (T) kryo.readClassAndObject(in); } } diff --git a/spark/v3.3/spark/src/test/java/org/apache/iceberg/TaskCheckHelper.java b/spark/v3.3/spark/src/test/java/org/apache/iceberg/TaskCheckHelper.java index dd187c0b6bf0..c44bacf149b5 100644 --- a/spark/v3.3/spark/src/test/java/org/apache/iceberg/TaskCheckHelper.java +++ b/spark/v3.3/spark/src/test/java/org/apache/iceberg/TaskCheckHelper.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg; import java.util.Comparator; @@ -25,15 +24,15 @@ import org.junit.Assert; public final class TaskCheckHelper { - private TaskCheckHelper() { - } + private TaskCheckHelper() {} - public static void assertEquals(ScanTaskGroup expected, ScanTaskGroup actual) { + public static void assertEquals( + ScanTaskGroup expected, ScanTaskGroup actual) { List expectedTasks = getFileScanTasksInFilePathOrder(expected); List actualTasks = getFileScanTasksInFilePathOrder(actual); - Assert.assertEquals("The number of file scan tasks should match", - expectedTasks.size(), actualTasks.size()); + Assert.assertEquals( + "The number of file scan tasks should match", expectedTasks.size(), actualTasks.size()); for (int i = 0; i < expectedTasks.size(); i++) { FileScanTask expectedTask = expectedTasks.get(i); @@ -50,41 +49,60 @@ public static void assertEquals(FileScanTask expected, FileScanTask actual) { Assert.assertEquals("starting position doesn't match", expected.start(), actual.start()); - Assert.assertEquals("the number of bytes to scan doesn't match", expected.start(), actual.start()); + Assert.assertEquals( + "the number of bytes to scan doesn't match", expected.start(), actual.start()); // simplify comparison on residual expression via comparing toString - Assert.assertEquals("Residual expression doesn't match", - expected.residual().toString(), actual.residual().toString()); + Assert.assertEquals( + "Residual expression doesn't match", + expected.residual().toString(), + actual.residual().toString()); } public static void assertEquals(DataFile expected, DataFile actual) { - Assert.assertEquals("Should match the serialized record path", - expected.path(), actual.path()); - Assert.assertEquals("Should match the serialized record format", - expected.format(), actual.format()); - Assert.assertEquals("Should match the serialized record partition", - expected.partition().get(0, Object.class), actual.partition().get(0, Object.class)); - Assert.assertEquals("Should match the serialized record count", - expected.recordCount(), actual.recordCount()); - Assert.assertEquals("Should match the serialized record size", - expected.fileSizeInBytes(), actual.fileSizeInBytes()); - Assert.assertEquals("Should match the serialized record value counts", - expected.valueCounts(), actual.valueCounts()); - Assert.assertEquals("Should match the serialized record null value counts", - expected.nullValueCounts(), actual.nullValueCounts()); 
- Assert.assertEquals("Should match the serialized record lower bounds", - expected.lowerBounds(), actual.lowerBounds()); - Assert.assertEquals("Should match the serialized record upper bounds", - expected.upperBounds(), actual.upperBounds()); - Assert.assertEquals("Should match the serialized record key metadata", - expected.keyMetadata(), actual.keyMetadata()); - Assert.assertEquals("Should match the serialized record offsets", - expected.splitOffsets(), actual.splitOffsets()); - Assert.assertEquals("Should match the serialized record offsets", - expected.keyMetadata(), actual.keyMetadata()); + Assert.assertEquals("Should match the serialized record path", expected.path(), actual.path()); + Assert.assertEquals( + "Should match the serialized record format", expected.format(), actual.format()); + Assert.assertEquals( + "Should match the serialized record partition", + expected.partition().get(0, Object.class), + actual.partition().get(0, Object.class)); + Assert.assertEquals( + "Should match the serialized record count", expected.recordCount(), actual.recordCount()); + Assert.assertEquals( + "Should match the serialized record size", + expected.fileSizeInBytes(), + actual.fileSizeInBytes()); + Assert.assertEquals( + "Should match the serialized record value counts", + expected.valueCounts(), + actual.valueCounts()); + Assert.assertEquals( + "Should match the serialized record null value counts", + expected.nullValueCounts(), + actual.nullValueCounts()); + Assert.assertEquals( + "Should match the serialized record lower bounds", + expected.lowerBounds(), + actual.lowerBounds()); + Assert.assertEquals( + "Should match the serialized record upper bounds", + expected.upperBounds(), + actual.upperBounds()); + Assert.assertEquals( + "Should match the serialized record key metadata", + expected.keyMetadata(), + actual.keyMetadata()); + Assert.assertEquals( + "Should match the serialized record offsets", + expected.splitOffsets(), + actual.splitOffsets()); + Assert.assertEquals( + "Should match the serialized record offsets", expected.keyMetadata(), actual.keyMetadata()); } - private static List getFileScanTasksInFilePathOrder(ScanTaskGroup taskGroup) { + private static List getFileScanTasksInFilePathOrder( + ScanTaskGroup taskGroup) { return taskGroup.tasks().stream() // use file path + start position to differentiate the tasks .sorted(Comparator.comparing(o -> o.file().path().toString() + "##" + o.start())) diff --git a/spark/v3.3/spark/src/test/java/org/apache/iceberg/TestDataFileSerialization.java b/spark/v3.3/spark/src/test/java/org/apache/iceberg/TestDataFileSerialization.java index 12fa8b2fc539..33b5316b72b7 100644 --- a/spark/v3.3/spark/src/test/java/org/apache/iceberg/TestDataFileSerialization.java +++ b/spark/v3.3/spark/src/test/java/org/apache/iceberg/TestDataFileSerialization.java @@ -16,9 +16,12 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg; +import static org.apache.iceberg.TaskCheckHelper.assertEquals; +import static org.apache.iceberg.types.Types.NestedField.optional; +import static org.apache.iceberg.types.Types.NestedField.required; + import com.esotericsoftware.kryo.Kryo; import com.esotericsoftware.kryo.io.Input; import com.esotericsoftware.kryo.io.Output; @@ -51,22 +54,17 @@ import org.junit.Test; import org.junit.rules.TemporaryFolder; -import static org.apache.iceberg.TaskCheckHelper.assertEquals; -import static org.apache.iceberg.types.Types.NestedField.optional; -import static org.apache.iceberg.types.Types.NestedField.required; - public class TestDataFileSerialization { - private static final Schema DATE_SCHEMA = new Schema( - required(1, "id", Types.LongType.get()), - optional(2, "data", Types.StringType.get()), - required(3, "date", Types.StringType.get()), - optional(4, "double", Types.DoubleType.get())); + private static final Schema DATE_SCHEMA = + new Schema( + required(1, "id", Types.LongType.get()), + optional(2, "data", Types.StringType.get()), + required(3, "date", Types.StringType.get()), + optional(4, "double", Types.DoubleType.get())); - private static final PartitionSpec PARTITION_SPEC = PartitionSpec - .builderFor(DATE_SCHEMA) - .identity("date") - .build(); + private static final PartitionSpec PARTITION_SPEC = + PartitionSpec.builderFor(DATE_SCHEMA).identity("date").build(); private static final Map VALUE_COUNTS = Maps.newHashMap(); private static final Map NULL_VALUE_COUNTS = Maps.newHashMap(); @@ -85,20 +83,26 @@ public class TestDataFileSerialization { UPPER_BOUNDS.put(1, longToBuffer(4L)); } - private static final DataFile DATA_FILE = DataFiles - .builder(PARTITION_SPEC) - .withPath("/path/to/data-1.parquet") - .withFileSizeInBytes(1234) - .withPartitionPath("date=2018-06-08") - .withMetrics(new Metrics( - 5L, null, VALUE_COUNTS, NULL_VALUE_COUNTS, NAN_VALUE_COUNTS, LOWER_BOUNDS, UPPER_BOUNDS)) - .withSplitOffsets(ImmutableList.of(4L)) - .withEncryptionKeyMetadata(ByteBuffer.allocate(4).putInt(34)) - .withSortOrder(SortOrder.unsorted()) - .build(); - - @Rule - public TemporaryFolder temp = new TemporaryFolder(); + private static final DataFile DATA_FILE = + DataFiles.builder(PARTITION_SPEC) + .withPath("/path/to/data-1.parquet") + .withFileSizeInBytes(1234) + .withPartitionPath("date=2018-06-08") + .withMetrics( + new Metrics( + 5L, + null, + VALUE_COUNTS, + NULL_VALUE_COUNTS, + NAN_VALUE_COUNTS, + LOWER_BOUNDS, + UPPER_BOUNDS)) + .withSplitOffsets(ImmutableList.of(4L)) + .withEncryptionKeyMetadata(ByteBuffer.allocate(4).putInt(34)) + .withSortOrder(SortOrder.unsorted()) + .build(); + + @Rule public TemporaryFolder temp = new TemporaryFolder(); @Test public void testDataFileKryoSerialization() throws Exception { @@ -128,7 +132,8 @@ public void testDataFileJavaSerialization() throws Exception { out.writeObject(DATA_FILE.copy()); } - try (ObjectInputStream in = new ObjectInputStream(new ByteArrayInputStream(bytes.toByteArray()))) { + try (ObjectInputStream in = + new ObjectInputStream(new ByteArrayInputStream(bytes.toByteArray()))) { for (int i = 0; i < 2; i += 1) { Object obj = in.readObject(); Assertions.assertThat(obj).as("Should be a DataFile").isInstanceOf(DataFile.class); @@ -140,13 +145,14 @@ public void testDataFileJavaSerialization() throws Exception { @Test public void testParquetWriterSplitOffsets() throws IOException { Iterable records = RandomData.generateSpark(DATE_SCHEMA, 1, 33L); - File parquetFile = new File( - temp.getRoot(), - 
FileFormat.PARQUET.addExtension(UUID.randomUUID().toString())); + File parquetFile = + new File(temp.getRoot(), FileFormat.PARQUET.addExtension(UUID.randomUUID().toString())); FileAppender writer = Parquet.write(Files.localOutput(parquetFile)) .schema(DATE_SCHEMA) - .createWriterFunc(msgType -> SparkParquetWriters.buildWriter(SparkSchemaUtil.convert(DATE_SCHEMA), msgType)) + .createWriterFunc( + msgType -> + SparkParquetWriters.buildWriter(SparkSchemaUtil.convert(DATE_SCHEMA), msgType)) .build(); try { writer.addAll(records); diff --git a/spark/v3.3/spark/src/test/java/org/apache/iceberg/TestFileIOSerialization.java b/spark/v3.3/spark/src/test/java/org/apache/iceberg/TestFileIOSerialization.java index 49a85cb68f17..c6f491ece5ad 100644 --- a/spark/v3.3/spark/src/test/java/org/apache/iceberg/TestFileIOSerialization.java +++ b/spark/v3.3/spark/src/test/java/org/apache/iceberg/TestFileIOSerialization.java @@ -16,9 +16,11 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg; +import static org.apache.iceberg.types.Types.NestedField.optional; +import static org.apache.iceberg.types.Types.NestedField.required; + import java.io.File; import java.io.IOException; import java.util.Map; @@ -36,36 +38,29 @@ import org.junit.Test; import org.junit.rules.TemporaryFolder; -import static org.apache.iceberg.types.Types.NestedField.optional; -import static org.apache.iceberg.types.Types.NestedField.required; - public class TestFileIOSerialization { private static final Configuration CONF = new Configuration(); private static final HadoopTables TABLES = new HadoopTables(CONF); - private static final Schema SCHEMA = new Schema( - required(1, "id", Types.LongType.get()), - optional(2, "data", Types.StringType.get()), - required(3, "date", Types.StringType.get()), - optional(4, "double", Types.DoubleType.get())); + private static final Schema SCHEMA = + new Schema( + required(1, "id", Types.LongType.get()), + optional(2, "data", Types.StringType.get()), + required(3, "date", Types.StringType.get()), + optional(4, "double", Types.DoubleType.get())); - private static final PartitionSpec SPEC = PartitionSpec - .builderFor(SCHEMA) - .identity("date") - .build(); + private static final PartitionSpec SPEC = + PartitionSpec.builderFor(SCHEMA).identity("date").build(); - private static final SortOrder SORT_ORDER = SortOrder.builderFor(SCHEMA) - .asc("id") - .build(); + private static final SortOrder SORT_ORDER = SortOrder.builderFor(SCHEMA).asc("id").build(); static { CONF.set("k1", "v1"); CONF.set("k2", "v2"); } - @Rule - public TemporaryFolder temp = new TemporaryFolder(); + @Rule public TemporaryFolder temp = new TemporaryFolder(); private Table table; @Before diff --git a/spark/v3.3/spark/src/test/java/org/apache/iceberg/TestHadoopMetricsContextSerialization.java b/spark/v3.3/spark/src/test/java/org/apache/iceberg/TestHadoopMetricsContextSerialization.java index cba5ac686d46..409104448dfb 100644 --- a/spark/v3.3/spark/src/test/java/org/apache/iceberg/TestHadoopMetricsContextSerialization.java +++ b/spark/v3.3/spark/src/test/java/org/apache/iceberg/TestHadoopMetricsContextSerialization.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg; import java.io.IOException; @@ -42,7 +41,8 @@ public void testHadoopMetricsContextKryoSerialization() throws IOException { } @Test - public void testHadoopMetricsContextJavaSerialization() throws IOException, ClassNotFoundException { + public void testHadoopMetricsContextJavaSerialization() + throws IOException, ClassNotFoundException { MetricsContext metricsContext = new HadoopMetricsContext("s3"); metricsContext.initialize(Maps.newHashMap()); diff --git a/spark/v3.3/spark/src/test/java/org/apache/iceberg/TestManifestFileSerialization.java b/spark/v3.3/spark/src/test/java/org/apache/iceberg/TestManifestFileSerialization.java index 25004aa110e4..a20b2d9f05de 100644 --- a/spark/v3.3/spark/src/test/java/org/apache/iceberg/TestManifestFileSerialization.java +++ b/spark/v3.3/spark/src/test/java/org/apache/iceberg/TestManifestFileSerialization.java @@ -16,9 +16,11 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg; +import static org.apache.iceberg.types.Types.NestedField.optional; +import static org.apache.iceberg.types.Types.NestedField.required; + import com.esotericsoftware.kryo.Kryo; import com.esotericsoftware.kryo.io.Input; import com.esotericsoftware.kryo.io.Output; @@ -47,56 +49,57 @@ import org.junit.Test; import org.junit.rules.TemporaryFolder; -import static org.apache.iceberg.types.Types.NestedField.optional; -import static org.apache.iceberg.types.Types.NestedField.required; - public class TestManifestFileSerialization { - private static final Schema SCHEMA = new Schema( - required(1, "id", Types.LongType.get()), - optional(2, "data", Types.StringType.get()), - required(3, "date", Types.StringType.get()), - required(4, "double", Types.DoubleType.get())); - - private static final PartitionSpec SPEC = PartitionSpec - .builderFor(SCHEMA) - .identity("double") - .build(); - - private static final DataFile FILE_A = DataFiles.builder(SPEC) - .withPath("/path/to/data-1.parquet") - .withFileSizeInBytes(0) - .withPartition(TestHelpers.Row.of(1D)) - .withPartitionPath("double=1") - .withMetrics(new Metrics(5L, - null, // no column sizes - ImmutableMap.of(1, 5L, 2, 3L), // value count - ImmutableMap.of(1, 0L, 2, 2L), // null count - ImmutableMap.of(), // nan count - ImmutableMap.of(1, longToBuffer(0L)), // lower bounds - ImmutableMap.of(1, longToBuffer(4L)) // upper bounds - )) - .build(); - - private static final DataFile FILE_B = DataFiles.builder(SPEC) - .withPath("/path/to/data-2.parquet") - .withFileSizeInBytes(0) - .withPartition(TestHelpers.Row.of(Double.NaN)) - .withPartitionPath("double=NaN") - .withMetrics(new Metrics(1L, - null, // no column sizes - ImmutableMap.of(1, 1L, 4, 1L), // value count - ImmutableMap.of(1, 0L, 2, 0L), // null count - ImmutableMap.of(4, 1L), // nan count - ImmutableMap.of(1, longToBuffer(0L)), // lower bounds - ImmutableMap.of(1, longToBuffer(1L)) // upper bounds - )) - .build(); + private static final Schema SCHEMA = + new Schema( + required(1, "id", Types.LongType.get()), + optional(2, "data", Types.StringType.get()), + required(3, "date", Types.StringType.get()), + required(4, "double", Types.DoubleType.get())); + + private static final PartitionSpec SPEC = + PartitionSpec.builderFor(SCHEMA).identity("double").build(); + + private static final DataFile FILE_A = + DataFiles.builder(SPEC) + .withPath("/path/to/data-1.parquet") + .withFileSizeInBytes(0) + .withPartition(TestHelpers.Row.of(1D)) + .withPartitionPath("double=1") + .withMetrics( + new Metrics( + 5L, 
+ null, // no column sizes + ImmutableMap.of(1, 5L, 2, 3L), // value count + ImmutableMap.of(1, 0L, 2, 2L), // null count + ImmutableMap.of(), // nan count + ImmutableMap.of(1, longToBuffer(0L)), // lower bounds + ImmutableMap.of(1, longToBuffer(4L)) // upper bounds + )) + .build(); + + private static final DataFile FILE_B = + DataFiles.builder(SPEC) + .withPath("/path/to/data-2.parquet") + .withFileSizeInBytes(0) + .withPartition(TestHelpers.Row.of(Double.NaN)) + .withPartitionPath("double=NaN") + .withMetrics( + new Metrics( + 1L, + null, // no column sizes + ImmutableMap.of(1, 1L, 4, 1L), // value count + ImmutableMap.of(1, 0L, 2, 0L), // null count + ImmutableMap.of(4, 1L), // nan count + ImmutableMap.of(1, longToBuffer(0L)), // lower bounds + ImmutableMap.of(1, longToBuffer(1L)) // upper bounds + )) + .build(); private static final FileIO FILE_IO = new HadoopFileIO(new Configuration()); - @Rule - public TemporaryFolder temp = new TemporaryFolder(); + @Rule public TemporaryFolder temp = new TemporaryFolder(); @Test public void testManifestFileKryoSerialization() throws IOException { @@ -134,7 +137,8 @@ public void testManifestFileJavaSerialization() throws Exception { out.writeObject(GenericManifestFile.copyOf(manifest).build()); } - try (ObjectInputStream in = new ObjectInputStream(new ByteArrayInputStream(bytes.toByteArray()))) { + try (ObjectInputStream in = + new ObjectInputStream(new ByteArrayInputStream(bytes.toByteArray()))) { for (int i = 0; i < 3; i += 1) { Object obj = in.readObject(); Assertions.assertThat(obj).as("Should be a ManifestFile").isInstanceOf(ManifestFile.class); @@ -148,27 +152,46 @@ private void checkManifestFile(ManifestFile expected, ManifestFile actual) { Assert.assertEquals("Length must match", expected.length(), actual.length()); Assert.assertEquals("Spec id must match", expected.partitionSpecId(), actual.partitionSpecId()); Assert.assertEquals("Snapshot id must match", expected.snapshotId(), actual.snapshotId()); - Assert.assertEquals("Added files flag must match", expected.hasAddedFiles(), actual.hasAddedFiles()); - Assert.assertEquals("Added files count must match", expected.addedFilesCount(), actual.addedFilesCount()); - Assert.assertEquals("Added rows count must match", expected.addedRowsCount(), actual.addedRowsCount()); - Assert.assertEquals("Existing files flag must match", expected.hasExistingFiles(), actual.hasExistingFiles()); - Assert.assertEquals("Existing files count must match", expected.existingFilesCount(), actual.existingFilesCount()); - Assert.assertEquals("Existing rows count must match", expected.existingRowsCount(), actual.existingRowsCount()); - Assert.assertEquals("Deleted files flag must match", expected.hasDeletedFiles(), actual.hasDeletedFiles()); - Assert.assertEquals("Deleted files count must match", expected.deletedFilesCount(), actual.deletedFilesCount()); - Assert.assertEquals("Deleted rows count must match", expected.deletedRowsCount(), actual.deletedRowsCount()); + Assert.assertEquals( + "Added files flag must match", expected.hasAddedFiles(), actual.hasAddedFiles()); + Assert.assertEquals( + "Added files count must match", expected.addedFilesCount(), actual.addedFilesCount()); + Assert.assertEquals( + "Added rows count must match", expected.addedRowsCount(), actual.addedRowsCount()); + Assert.assertEquals( + "Existing files flag must match", expected.hasExistingFiles(), actual.hasExistingFiles()); + Assert.assertEquals( + "Existing files count must match", + expected.existingFilesCount(), + actual.existingFilesCount()); + 
Assert.assertEquals( + "Existing rows count must match", expected.existingRowsCount(), actual.existingRowsCount()); + Assert.assertEquals( + "Deleted files flag must match", expected.hasDeletedFiles(), actual.hasDeletedFiles()); + Assert.assertEquals( + "Deleted files count must match", expected.deletedFilesCount(), actual.deletedFilesCount()); + Assert.assertEquals( + "Deleted rows count must match", expected.deletedRowsCount(), actual.deletedRowsCount()); PartitionFieldSummary expectedPartition = expected.partitions().get(0); PartitionFieldSummary actualPartition = actual.partitions().get(0); - Assert.assertEquals("Null flag in partition must match", - expectedPartition.containsNull(), actualPartition.containsNull()); - Assert.assertEquals("NaN flag in partition must match", - expectedPartition.containsNaN(), actualPartition.containsNaN()); - Assert.assertEquals("Lower bounds in partition must match", - expectedPartition.lowerBound(), actualPartition.lowerBound()); - Assert.assertEquals("Upper bounds in partition must match", - expectedPartition.upperBound(), actualPartition.upperBound()); + Assert.assertEquals( + "Null flag in partition must match", + expectedPartition.containsNull(), + actualPartition.containsNull()); + Assert.assertEquals( + "NaN flag in partition must match", + expectedPartition.containsNaN(), + actualPartition.containsNaN()); + Assert.assertEquals( + "Lower bounds in partition must match", + expectedPartition.lowerBound(), + actualPartition.lowerBound()); + Assert.assertEquals( + "Upper bounds in partition must match", + expectedPartition.upperBound(), + actualPartition.upperBound()); } private ManifestFile writeManifest(DataFile... files) throws IOException { diff --git a/spark/v3.3/spark/src/test/java/org/apache/iceberg/TestScanTaskSerialization.java b/spark/v3.3/spark/src/test/java/org/apache/iceberg/TestScanTaskSerialization.java index 1fe736b51aad..5e5d657eab56 100644 --- a/spark/v3.3/spark/src/test/java/org/apache/iceberg/TestScanTaskSerialization.java +++ b/spark/v3.3/spark/src/test/java/org/apache/iceberg/TestScanTaskSerialization.java @@ -16,9 +16,10 @@ * specific language governing permissions and limitations * under the License. 
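
The Java-serialization checks reformatted in the hunks above all follow the same write-then-read round trip before comparing the copy against the original field by field. A minimal, self-contained sketch of that pattern, using only JDK streams; the generic roundTrip helper and class name here are illustrative and are not part of the Iceberg test utilities:

import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.ObjectInputStream;
import java.io.ObjectOutputStream;
import java.io.Serializable;

public class SerializationRoundTrip {

  // Serialize a value to a byte array and read it back, as the tests above do
  // before asserting on the type and contents of the deserialized object.
  @SuppressWarnings("unchecked")
  static <T extends Serializable> T roundTrip(T value) throws Exception {
    ByteArrayOutputStream bytes = new ByteArrayOutputStream();
    try (ObjectOutputStream out = new ObjectOutputStream(bytes)) {
      out.writeObject(value);
    }

    try (ObjectInputStream in =
        new ObjectInputStream(new ByteArrayInputStream(bytes.toByteArray()))) {
      return (T) in.readObject();
    }
  }

  public static void main(String[] args) throws Exception {
    // Placeholder payload for illustration; the tests round-trip ManifestFile copies.
    String copy = roundTrip("manifest-placeholder");
    System.out.println(copy);
  }
}
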
*/ - package org.apache.iceberg; +import static org.apache.iceberg.types.Types.NestedField.optional; + import com.esotericsoftware.kryo.Kryo; import com.esotericsoftware.kryo.io.Input; import com.esotericsoftware.kryo.io.Output; @@ -52,19 +53,16 @@ import org.junit.Test; import org.junit.rules.TemporaryFolder; -import static org.apache.iceberg.types.Types.NestedField.optional; - public class TestScanTaskSerialization extends SparkTestBase { private static final HadoopTables TABLES = new HadoopTables(new Configuration()); - private static final Schema SCHEMA = new Schema( - optional(1, "c1", Types.IntegerType.get()), - optional(2, "c2", Types.StringType.get()), - optional(3, "c3", Types.StringType.get()) - ); + private static final Schema SCHEMA = + new Schema( + optional(1, "c1", Types.IntegerType.get()), + optional(2, "c2", Types.StringType.get()), + optional(3, "c3", Types.StringType.get())); - @Rule - public TemporaryFolder temp = new TemporaryFolder(); + @Rule public TemporaryFolder temp = new TemporaryFolder(); private String tableLocation = null; @@ -88,7 +86,9 @@ public void testBaseCombinedScanTaskKryoSerialization() throws Exception { try (Input in = new Input(new FileInputStream(data))) { Object obj = kryo.readClassAndObject(in); - Assertions.assertThat(obj).as("Should be a BaseCombinedScanTask").isInstanceOf(BaseCombinedScanTask.class); + Assertions.assertThat(obj) + .as("Should be a BaseCombinedScanTask") + .isInstanceOf(BaseCombinedScanTask.class); TaskCheckHelper.assertEquals(scanTask, (BaseCombinedScanTask) obj); } } @@ -102,9 +102,12 @@ public void testBaseCombinedScanTaskJavaSerialization() throws Exception { out.writeObject(scanTask); } - try (ObjectInputStream in = new ObjectInputStream(new ByteArrayInputStream(bytes.toByteArray()))) { + try (ObjectInputStream in = + new ObjectInputStream(new ByteArrayInputStream(bytes.toByteArray()))) { Object obj = in.readObject(); - Assertions.assertThat(obj).as("Should be a BaseCombinedScanTask").isInstanceOf(BaseCombinedScanTask.class); + Assertions.assertThat(obj) + .as("Should be a BaseCombinedScanTask") + .isInstanceOf(BaseCombinedScanTask.class); TaskCheckHelper.assertEquals(scanTask, (BaseCombinedScanTask) obj); } } @@ -126,7 +129,9 @@ public void testBaseScanTaskGroupKryoSerialization() throws Exception { try (Input in = new Input(Files.newInputStream(data.toPath()))) { Object obj = kryo.readClassAndObject(in); - Assertions.assertThat(obj).as("should be a BaseScanTaskGroup").isInstanceOf(BaseScanTaskGroup.class); + Assertions.assertThat(obj) + .as("should be a BaseScanTaskGroup") + .isInstanceOf(BaseScanTaskGroup.class); TaskCheckHelper.assertEquals(taskGroup, (BaseScanTaskGroup) obj); } } @@ -143,9 +148,12 @@ public void testBaseScanTaskGroupJavaSerialization() throws Exception { out.writeObject(taskGroup); } - try (ObjectInputStream in = new ObjectInputStream(new ByteArrayInputStream(bytes.toByteArray()))) { + try (ObjectInputStream in = + new ObjectInputStream(new ByteArrayInputStream(bytes.toByteArray()))) { Object obj = in.readObject(); - Assertions.assertThat(obj).as("should be a BaseScanTaskGroup").isInstanceOf(BaseScanTaskGroup.class); + Assertions.assertThat(obj) + .as("should be a BaseScanTaskGroup") + .isInstanceOf(BaseScanTaskGroup.class); TaskCheckHelper.assertEquals(taskGroup, (BaseScanTaskGroup) obj); } } @@ -167,16 +175,15 @@ private Table initTable() { Map options = Maps.newHashMap(); Table table = TABLES.create(SCHEMA, spec, options, tableLocation); - List records1 = Lists.newArrayList( - new 
ThreeColumnRecord(1, null, "AAAA"), - new ThreeColumnRecord(1, "BBBBBBBBBB", "BBBB") - ); + List records1 = + Lists.newArrayList( + new ThreeColumnRecord(1, null, "AAAA"), new ThreeColumnRecord(1, "BBBBBBBBBB", "BBBB")); writeRecords(records1); - List records2 = Lists.newArrayList( - new ThreeColumnRecord(2, "CCCCCCCCCC", "CCCC"), - new ThreeColumnRecord(2, "DDDDDDDDDD", "DDDD") - ); + List records2 = + Lists.newArrayList( + new ThreeColumnRecord(2, "CCCCCCCCCC", "CCCC"), + new ThreeColumnRecord(2, "DDDDDDDDDD", "DDDD")); writeRecords(records2); table.refresh(); @@ -190,10 +197,6 @@ private void writeRecords(List records) { } private void writeDF(Dataset df) { - df.select("c1", "c2", "c3") - .write() - .format("iceberg") - .mode("append") - .save(tableLocation); + df.select("c1", "c2", "c3").write().format("iceberg").mode("append").save(tableLocation); } } diff --git a/spark/v3.3/spark/src/test/java/org/apache/iceberg/TestTableSerialization.java b/spark/v3.3/spark/src/test/java/org/apache/iceberg/TestTableSerialization.java index 8aa89b9f3199..30a167d575b1 100644 --- a/spark/v3.3/spark/src/test/java/org/apache/iceberg/TestTableSerialization.java +++ b/spark/v3.3/spark/src/test/java/org/apache/iceberg/TestTableSerialization.java @@ -16,9 +16,11 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg; +import static org.apache.iceberg.types.Types.NestedField.optional; +import static org.apache.iceberg.types.Types.NestedField.required; + import java.io.File; import java.io.IOException; import java.util.Map; @@ -32,30 +34,23 @@ import org.junit.Test; import org.junit.rules.TemporaryFolder; -import static org.apache.iceberg.types.Types.NestedField.optional; -import static org.apache.iceberg.types.Types.NestedField.required; - public class TestTableSerialization { private static final HadoopTables TABLES = new HadoopTables(); - private static final Schema SCHEMA = new Schema( - required(1, "id", Types.LongType.get()), - optional(2, "data", Types.StringType.get()), - required(3, "date", Types.StringType.get()), - optional(4, "double", Types.DoubleType.get())); + private static final Schema SCHEMA = + new Schema( + required(1, "id", Types.LongType.get()), + optional(2, "data", Types.StringType.get()), + required(3, "date", Types.StringType.get()), + optional(4, "double", Types.DoubleType.get())); - private static final PartitionSpec SPEC = PartitionSpec - .builderFor(SCHEMA) - .identity("date") - .build(); + private static final PartitionSpec SPEC = + PartitionSpec.builderFor(SCHEMA).identity("date").build(); - private static final SortOrder SORT_ORDER = SortOrder.builderFor(SCHEMA) - .asc("id") - .build(); + private static final SortOrder SORT_ORDER = SortOrder.builderFor(SCHEMA).asc("id").build(); - @Rule - public TemporaryFolder temp = new TemporaryFolder(); + @Rule public TemporaryFolder temp = new TemporaryFolder(); private Table table; @Before @@ -71,19 +66,20 @@ public void initTable() throws IOException { @Test public void testSerializableTableKryoSerialization() throws IOException { Table serializableTable = SerializableTableWithSize.copyOf(table); - TestHelpers.assertSerializedAndLoadedMetadata(table, KryoHelpers.roundTripSerialize(serializableTable)); + TestHelpers.assertSerializedAndLoadedMetadata( + table, KryoHelpers.roundTripSerialize(serializableTable)); } @Test public void testSerializableMetadataTableKryoSerialization() throws IOException { for (MetadataTableType type : MetadataTableType.values()) { TableOperations ops = 
((HasTableOperations) table).operations(); - Table metadataTable = MetadataTableUtils.createMetadataTableInstance(ops, table.name(), "meta", type); + Table metadataTable = + MetadataTableUtils.createMetadataTableInstance(ops, table.name(), "meta", type); Table serializableMetadataTable = SerializableTableWithSize.copyOf(metadataTable); TestHelpers.assertSerializedAndLoadedMetadata( - metadataTable, - KryoHelpers.roundTripSerialize(serializableMetadataTable)); + metadataTable, KryoHelpers.roundTripSerialize(serializableMetadataTable)); } } @@ -91,13 +87,12 @@ public void testSerializableMetadataTableKryoSerialization() throws IOException public void testSerializableTransactionTableKryoSerialization() throws IOException { Transaction txn = table.newTransaction(); - txn.updateProperties() - .set("k1", "v1") - .commit(); + txn.updateProperties().set("k1", "v1").commit(); Table txnTable = txn.table(); Table serializableTxnTable = SerializableTableWithSize.copyOf(txnTable); - TestHelpers.assertSerializedMetadata(txnTable, KryoHelpers.roundTripSerialize(serializableTxnTable)); + TestHelpers.assertSerializedMetadata( + txnTable, KryoHelpers.roundTripSerialize(serializableTxnTable)); } } diff --git a/spark/v3.3/spark/src/test/java/org/apache/iceberg/spark/SparkCatalogConfig.java b/spark/v3.3/spark/src/test/java/org/apache/iceberg/spark/SparkCatalogConfig.java index 5d5dfebf9532..1006ed380ff9 100644 --- a/spark/v3.3/spark/src/test/java/org/apache/iceberg/spark/SparkCatalogConfig.java +++ b/spark/v3.3/spark/src/test/java/org/apache/iceberg/spark/SparkCatalogConfig.java @@ -16,26 +16,29 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark; import java.util.Map; import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap; public enum SparkCatalogConfig { - HIVE("testhive", SparkCatalog.class.getName(), ImmutableMap.of( - "type", "hive", - "default-namespace", "default" - )), - HADOOP("testhadoop", SparkCatalog.class.getName(), ImmutableMap.of( - "type", "hadoop" - )), - SPARK("spark_catalog", SparkSessionCatalog.class.getName(), ImmutableMap.of( - "type", "hive", - "default-namespace", "default", - "parquet-enabled", "true", - "cache-enabled", "false" // Spark will delete tables using v1, leaving the cache out of sync - )); + HIVE( + "testhive", + SparkCatalog.class.getName(), + ImmutableMap.of( + "type", "hive", + "default-namespace", "default")), + HADOOP("testhadoop", SparkCatalog.class.getName(), ImmutableMap.of("type", "hadoop")), + SPARK( + "spark_catalog", + SparkSessionCatalog.class.getName(), + ImmutableMap.of( + "type", "hive", + "default-namespace", "default", + "parquet-enabled", "true", + "cache-enabled", + "false" // Spark will delete tables using v1, leaving the cache out of sync + )); private final String catalogName; private final String implementation; diff --git a/spark/v3.3/spark/src/test/java/org/apache/iceberg/spark/SparkCatalogTestBase.java b/spark/v3.3/spark/src/test/java/org/apache/iceberg/spark/SparkCatalogTestBase.java index 774e81328b2b..89323c26100c 100644 --- a/spark/v3.3/spark/src/test/java/org/apache/iceberg/spark/SparkCatalogTestBase.java +++ b/spark/v3.3/spark/src/test/java/org/apache/iceberg/spark/SparkCatalogTestBase.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.spark; import java.util.Map; @@ -31,29 +30,33 @@ public abstract class SparkCatalogTestBase extends SparkTestBaseWithCatalog { // these parameters are broken out to avoid changes that need to modify lots of test suites @Parameterized.Parameters(name = "catalogName = {0}, implementation = {1}, config = {2}") public static Object[][] parameters() { - return new Object[][] {{ - SparkCatalogConfig.HIVE.catalogName(), - SparkCatalogConfig.HIVE.implementation(), - SparkCatalogConfig.HIVE.properties() - }, { - SparkCatalogConfig.HADOOP.catalogName(), - SparkCatalogConfig.HADOOP.implementation(), - SparkCatalogConfig.HADOOP.properties() - }, { - SparkCatalogConfig.SPARK.catalogName(), - SparkCatalogConfig.SPARK.implementation(), - SparkCatalogConfig.SPARK.properties() - }}; + return new Object[][] { + { + SparkCatalogConfig.HIVE.catalogName(), + SparkCatalogConfig.HIVE.implementation(), + SparkCatalogConfig.HIVE.properties() + }, + { + SparkCatalogConfig.HADOOP.catalogName(), + SparkCatalogConfig.HADOOP.implementation(), + SparkCatalogConfig.HADOOP.properties() + }, + { + SparkCatalogConfig.SPARK.catalogName(), + SparkCatalogConfig.SPARK.implementation(), + SparkCatalogConfig.SPARK.properties() + } + }; } - @Rule - public TemporaryFolder temp = new TemporaryFolder(); + @Rule public TemporaryFolder temp = new TemporaryFolder(); public SparkCatalogTestBase(SparkCatalogConfig config) { super(config); } - public SparkCatalogTestBase(String catalogName, String implementation, Map config) { + public SparkCatalogTestBase( + String catalogName, String implementation, Map config) { super(catalogName, implementation, config); } } diff --git a/spark/v3.3/spark/src/test/java/org/apache/iceberg/spark/SparkTestBase.java b/spark/v3.3/spark/src/test/java/org/apache/iceberg/spark/SparkTestBase.java index 611b0b0b9b2b..27fcb50817b1 100644 --- a/spark/v3.3/spark/src/test/java/org/apache/iceberg/spark/SparkTestBase.java +++ b/spark/v3.3/spark/src/test/java/org/apache/iceberg/spark/SparkTestBase.java @@ -16,9 +16,10 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark; +import static org.apache.hadoop.hive.conf.HiveConf.ConfVars.METASTOREURIS; + import java.io.IOException; import java.io.UncheckedIOException; import java.net.URI; @@ -49,8 +50,6 @@ import org.junit.Assert; import org.junit.BeforeClass; -import static org.apache.hadoop.hive.conf.HiveConf.ConfVars.METASTOREURIS; - public abstract class SparkTestBase { protected static final Object ANY = new Object(); @@ -66,16 +65,19 @@ public static void startMetastoreAndSpark() { metastore.start(); SparkTestBase.hiveConf = metastore.hiveConf(); - SparkTestBase.spark = SparkSession.builder() - .master("local[2]") - .config(SQLConf.PARTITION_OVERWRITE_MODE().key(), "dynamic") - .config("spark.hadoop." + METASTOREURIS.varname, hiveConf.get(METASTOREURIS.varname)) - .config("spark.sql.legacy.respectNullabilityInTextDatasetConversion", "true") - .enableHiveSupport() - .getOrCreate(); + SparkTestBase.spark = + SparkSession.builder() + .master("local[2]") + .config(SQLConf.PARTITION_OVERWRITE_MODE().key(), "dynamic") + .config("spark.hadoop." 
+ METASTOREURIS.varname, hiveConf.get(METASTOREURIS.varname)) + .config("spark.sql.legacy.respectNullabilityInTextDatasetConversion", "true") + .enableHiveSupport() + .getOrCreate(); - SparkTestBase.catalog = (HiveCatalog) - CatalogUtil.loadCatalog(HiveCatalog.class.getName(), "hive", ImmutableMap.of(), hiveConf); + SparkTestBase.catalog = + (HiveCatalog) + CatalogUtil.loadCatalog( + HiveCatalog.class.getName(), "hive", ImmutableMap.of(), hiveConf); try { catalog.createNamespace(Namespace.of("default")); @@ -116,22 +118,23 @@ protected List rowsToJava(List rows) { private Object[] toJava(Row row) { return IntStream.range(0, row.size()) - .mapToObj(pos -> { - if (row.isNullAt(pos)) { - return null; - } - - Object value = row.get(pos); - if (value instanceof Row) { - return toJava((Row) value); - } else if (value instanceof scala.collection.Seq) { - return row.getList(pos); - } else if (value instanceof scala.collection.Map) { - return row.getJavaMap(pos); - } else { - return value; - } - }) + .mapToObj( + pos -> { + if (row.isNullAt(pos)) { + return null; + } + + Object value = row.get(pos); + if (value instanceof Row) { + return toJava((Row) value); + } else if (value instanceof scala.collection.Seq) { + return row.getList(pos); + } else if (value instanceof scala.collection.Map) { + return row.getJavaMap(pos); + } else { + return value; + } + }) .toArray(Object[]::new); } @@ -147,8 +150,10 @@ protected Object[] row(Object... values) { return values; } - protected void assertEquals(String context, List expectedRows, List actualRows) { - Assert.assertEquals(context + ": number of results should match", expectedRows.size(), actualRows.size()); + protected void assertEquals( + String context, List expectedRows, List actualRows) { + Assert.assertEquals( + context + ": number of results should match", expectedRows.size(), actualRows.size()); for (int row = 0; row < expectedRows.size(); row += 1) { Object[] expected = expectedRows.get(row); Object[] actual = actualRows.get(row); @@ -216,30 +221,34 @@ protected void withSQLConf(Map conf, Action action) { SQLConf sqlConf = SQLConf.get(); Map currentConfValues = Maps.newHashMap(); - conf.keySet().forEach(confKey -> { - if (sqlConf.contains(confKey)) { - String currentConfValue = sqlConf.getConfString(confKey); - currentConfValues.put(confKey, currentConfValue); - } - }); - - conf.forEach((confKey, confValue) -> { - if (SQLConf.isStaticConfigKey(confKey)) { - throw new RuntimeException("Cannot modify the value of a static config: " + confKey); - } - sqlConf.setConfString(confKey, confValue); - }); + conf.keySet() + .forEach( + confKey -> { + if (sqlConf.contains(confKey)) { + String currentConfValue = sqlConf.getConfString(confKey); + currentConfValues.put(confKey, currentConfValue); + } + }); + + conf.forEach( + (confKey, confValue) -> { + if (SQLConf.isStaticConfigKey(confKey)) { + throw new RuntimeException("Cannot modify the value of a static config: " + confKey); + } + sqlConf.setConfString(confKey, confValue); + }); try { action.invoke(); } finally { - conf.forEach((confKey, confValue) -> { - if (currentConfValues.containsKey(confKey)) { - sqlConf.setConfString(confKey, currentConfValues.get(confKey)); - } else { - sqlConf.unsetConf(confKey); - } - }); + conf.forEach( + (confKey, confValue) -> { + if (currentConfValues.containsKey(confKey)) { + sqlConf.setConfString(confKey, currentConfValues.get(confKey)); + } else { + sqlConf.unsetConf(confKey); + } + }); } } diff --git 
a/spark/v3.3/spark/src/test/java/org/apache/iceberg/spark/SparkTestBaseWithCatalog.java b/spark/v3.3/spark/src/test/java/org/apache/iceberg/spark/SparkTestBaseWithCatalog.java index fc358f76ae9c..e32aeea64d4d 100644 --- a/spark/v3.3/spark/src/test/java/org/apache/iceberg/spark/SparkTestBaseWithCatalog.java +++ b/spark/v3.3/spark/src/test/java/org/apache/iceberg/spark/SparkTestBaseWithCatalog.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark; import java.io.File; @@ -53,8 +52,7 @@ public static void dropWarehouse() throws IOException { } } - @Rule - public TemporaryFolder temp = new TemporaryFolder(); + @Rule public TemporaryFolder temp = new TemporaryFolder(); protected final String catalogName; protected final Catalog validationCatalog; @@ -70,21 +68,25 @@ public SparkTestBaseWithCatalog(SparkCatalogConfig config) { this(config.catalogName(), config.implementation(), config.properties()); } - public SparkTestBaseWithCatalog(String catalogName, String implementation, Map config) { + public SparkTestBaseWithCatalog( + String catalogName, String implementation, Map config) { this.catalogName = catalogName; - this.validationCatalog = catalogName.equals("testhadoop") ? - new HadoopCatalog(spark.sessionState().newHadoopConf(), "file:" + warehouse) : - catalog; + this.validationCatalog = + catalogName.equals("testhadoop") + ? new HadoopCatalog(spark.sessionState().newHadoopConf(), "file:" + warehouse) + : catalog; this.validationNamespaceCatalog = (SupportsNamespaces) validationCatalog; spark.conf().set("spark.sql.catalog." + catalogName, implementation); - config.forEach((key, value) -> spark.conf().set("spark.sql.catalog." + catalogName + "." + key, value)); + config.forEach( + (key, value) -> spark.conf().set("spark.sql.catalog." + catalogName + "." + key, value)); if (config.get("type").equalsIgnoreCase("hadoop")) { spark.conf().set("spark.sql.catalog." + catalogName + ".warehouse", "file:" + warehouse); } - this.tableName = (catalogName.equals("spark_catalog") ? "" : catalogName + ".") + "default.table"; + this.tableName = + (catalogName.equals("spark_catalog") ? "" : catalogName + ".") + "default.table"; sql("CREATE NAMESPACE IF NOT EXISTS default"); } diff --git a/spark/v3.3/spark/src/test/java/org/apache/iceberg/spark/TestFileRewriteCoordinator.java b/spark/v3.3/spark/src/test/java/org/apache/iceberg/spark/TestFileRewriteCoordinator.java index 8aa5cd6faec1..2e6886d32df5 100644 --- a/spark/v3.3/spark/src/test/java/org/apache/iceberg/spark/TestFileRewriteCoordinator.java +++ b/spark/v3.3/spark/src/test/java/org/apache/iceberg/spark/TestFileRewriteCoordinator.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
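
The SparkTestBaseWithCatalog constructor reformatted above wires a catalog into the session by writing spark.sql.catalog.* entries into the runtime conf. A minimal sketch of the same wiring outside the test harness, assuming an already-built SparkSession and a local warehouse path (the method and class names below are illustrative, not part of the codebase):

import java.util.Map;
import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap;
import org.apache.iceberg.spark.SparkCatalog;
import org.apache.spark.sql.SparkSession;

public class CatalogConfExample {

  // Register an Iceberg SparkCatalog named "testhadoop", mirroring the
  // SparkCatalogConfig.HADOOP entry and the conf loop in SparkTestBaseWithCatalog.
  static void configureHadoopCatalog(SparkSession spark, String warehouse) {
    String catalogName = "testhadoop";
    Map<String, String> config = ImmutableMap.of("type", "hadoop");

    spark.conf().set("spark.sql.catalog." + catalogName, SparkCatalog.class.getName());
    config.forEach(
        (key, value) -> spark.conf().set("spark.sql.catalog." + catalogName + "." + key, value));

    // Hadoop-type catalogs also take a warehouse location, as the base class sets above.
    spark.conf().set("spark.sql.catalog." + catalogName + ".warehouse", "file:" + warehouse);
  }
}
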
*/ - package org.apache.iceberg.spark; import java.io.IOException; @@ -44,7 +43,8 @@ public class TestFileRewriteCoordinator extends SparkCatalogTestBase { - public TestFileRewriteCoordinator(String catalogName, String implementation, Map config) { + public TestFileRewriteCoordinator( + String catalogName, String implementation, Map config) { super(catalogName, implementation, config); } @@ -66,7 +66,8 @@ public void testBinPackRewrite() throws NoSuchTableException, IOException { Table table = validationCatalog.loadTable(tableIdent); Assert.assertEquals("Should produce 4 snapshots", 4, Iterables.size(table.snapshots())); - Dataset fileDF = spark.read().format("iceberg").load(tableName(tableIdent.name() + ".files")); + Dataset fileDF = + spark.read().format("iceberg").load(tableName(tableIdent.name() + ".files")); List fileSizes = fileDF.select("file_size_in_bytes").as(Encoders.LONG()).collectAsList(); long avgFileSize = fileSizes.stream().mapToLong(i -> i).sum() / fileSizes.size(); @@ -77,22 +78,27 @@ public void testBinPackRewrite() throws NoSuchTableException, IOException { taskSetManager.stageTasks(table, fileSetID, Lists.newArrayList(fileScanTasks)); // read and pack original 4 files into 2 splits - Dataset scanDF = spark.read().format("iceberg") - .option(SparkReadOptions.FILE_SCAN_TASK_SET_ID, fileSetID) - .option(SparkReadOptions.SPLIT_SIZE, Long.toString(avgFileSize * 2)) - .option(SparkReadOptions.FILE_OPEN_COST, "0") - .load(tableName); + Dataset scanDF = + spark + .read() + .format("iceberg") + .option(SparkReadOptions.FILE_SCAN_TASK_SET_ID, fileSetID) + .option(SparkReadOptions.SPLIT_SIZE, Long.toString(avgFileSize * 2)) + .option(SparkReadOptions.FILE_OPEN_COST, "0") + .load(tableName); // write the packed data into new files where each split becomes a new file - scanDF.writeTo(tableName) + scanDF + .writeTo(tableName) .option(SparkWriteOptions.REWRITTEN_FILE_SCAN_TASK_SET_ID, fileSetID) .append(); // commit the rewrite FileRewriteCoordinator rewriteCoordinator = FileRewriteCoordinator.get(); - Set rewrittenFiles = taskSetManager.fetchTasks(table, fileSetID).stream() - .map(FileScanTask::file) - .collect(Collectors.toSet()); + Set rewrittenFiles = + taskSetManager.fetchTasks(table, fileSetID).stream() + .map(FileScanTask::file) + .collect(Collectors.toSet()); Set addedFiles = rewriteCoordinator.fetchNewDataFiles(table, fileSetID); table.newRewrite().rewriteFiles(rewrittenFiles, addedFiles).commit(); } @@ -127,34 +133,42 @@ public void testSortRewrite() throws NoSuchTableException, IOException { taskSetManager.stageTasks(table, fileSetID, Lists.newArrayList(fileScanTasks)); // read original 4 files as 4 splits - Dataset scanDF = spark.read().format("iceberg") - .option(SparkReadOptions.FILE_SCAN_TASK_SET_ID, fileSetID) - .option(SparkReadOptions.SPLIT_SIZE, "134217728") - .option(SparkReadOptions.FILE_OPEN_COST, "134217728") - .load(tableName); + Dataset scanDF = + spark + .read() + .format("iceberg") + .option(SparkReadOptions.FILE_SCAN_TASK_SET_ID, fileSetID) + .option(SparkReadOptions.SPLIT_SIZE, "134217728") + .option(SparkReadOptions.FILE_OPEN_COST, "134217728") + .load(tableName); // make sure we disable AQE and set the number of shuffle partitions as the target num files - ImmutableMap sqlConf = ImmutableMap.of( - "spark.sql.shuffle.partitions", "2", - "spark.sql.adaptive.enabled", "false" - ); - - withSQLConf(sqlConf, () -> { - try { - // write new files with sorted records - scanDF.sort("id").writeTo(tableName) - 
.option(SparkWriteOptions.REWRITTEN_FILE_SCAN_TASK_SET_ID, fileSetID) - .append(); - } catch (NoSuchTableException e) { - throw new RuntimeException("Could not replace files", e); - } - }); + ImmutableMap sqlConf = + ImmutableMap.of( + "spark.sql.shuffle.partitions", "2", + "spark.sql.adaptive.enabled", "false"); + + withSQLConf( + sqlConf, + () -> { + try { + // write new files with sorted records + scanDF + .sort("id") + .writeTo(tableName) + .option(SparkWriteOptions.REWRITTEN_FILE_SCAN_TASK_SET_ID, fileSetID) + .append(); + } catch (NoSuchTableException e) { + throw new RuntimeException("Could not replace files", e); + } + }); // commit the rewrite FileRewriteCoordinator rewriteCoordinator = FileRewriteCoordinator.get(); - Set rewrittenFiles = taskSetManager.fetchTasks(table, fileSetID).stream() - .map(FileScanTask::file) - .collect(Collectors.toSet()); + Set rewrittenFiles = + taskSetManager.fetchTasks(table, fileSetID).stream() + .map(FileScanTask::file) + .collect(Collectors.toSet()); Set addedFiles = rewriteCoordinator.fetchNewDataFiles(table, fileSetID); table.newRewrite().rewriteFiles(rewrittenFiles, addedFiles).commit(); } @@ -199,7 +213,8 @@ public void testCommitMultipleRewrites() throws NoSuchTableException, IOExceptio String secondFileSetID = UUID.randomUUID().toString(); - try (CloseableIterable tasks = table.newScan().appendsAfter(firstFileSetSnapshotId).planFiles()) { + try (CloseableIterable tasks = + table.newScan().appendsAfter(firstFileSetSnapshotId).planFiles()) { // stage 2 more files for compaction taskSetManager.stageTasks(table, secondFileSetID, Lists.newArrayList(tasks)); } @@ -208,26 +223,32 @@ public void testCommitMultipleRewrites() throws NoSuchTableException, IOExceptio for (String fileSetID : fileSetIDs) { // read and pack 2 files into 1 split - Dataset scanDF = spark.read().format("iceberg") - .option(SparkReadOptions.FILE_SCAN_TASK_SET_ID, fileSetID) - .option(SparkReadOptions.SPLIT_SIZE, Long.MAX_VALUE) - .load(tableName); + Dataset scanDF = + spark + .read() + .format("iceberg") + .option(SparkReadOptions.FILE_SCAN_TASK_SET_ID, fileSetID) + .option(SparkReadOptions.SPLIT_SIZE, Long.MAX_VALUE) + .load(tableName); // write the combined data as one file - scanDF.writeTo(tableName) + scanDF + .writeTo(tableName) .option(SparkWriteOptions.REWRITTEN_FILE_SCAN_TASK_SET_ID, fileSetID) .append(); } // commit both rewrites at the same time FileRewriteCoordinator rewriteCoordinator = FileRewriteCoordinator.get(); - Set rewrittenFiles = fileSetIDs.stream().flatMap(fileSetID -> - taskSetManager.fetchTasks(table, fileSetID).stream()) - .map(FileScanTask::file) - .collect(Collectors.toSet()); - Set addedFiles = fileSetIDs.stream() - .flatMap(fileSetID -> rewriteCoordinator.fetchNewDataFiles(table, fileSetID).stream()) - .collect(Collectors.toSet()); + Set rewrittenFiles = + fileSetIDs.stream() + .flatMap(fileSetID -> taskSetManager.fetchTasks(table, fileSetID).stream()) + .map(FileScanTask::file) + .collect(Collectors.toSet()); + Set addedFiles = + fileSetIDs.stream() + .flatMap(fileSetID -> rewriteCoordinator.fetchNewDataFiles(table, fileSetID).stream()) + .collect(Collectors.toSet()); table.newRewrite().rewriteFiles(rewrittenFiles, addedFiles).commit(); table.refresh(); diff --git a/spark/v3.3/spark/src/test/java/org/apache/iceberg/spark/TestSpark3Util.java b/spark/v3.3/spark/src/test/java/org/apache/iceberg/spark/TestSpark3Util.java index be6130f63741..96dc2c29eb7f 100644 --- a/spark/v3.3/spark/src/test/java/org/apache/iceberg/spark/TestSpark3Util.java +++ 
b/spark/v3.3/spark/src/test/java/org/apache/iceberg/spark/TestSpark3Util.java @@ -16,9 +16,13 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark; +import static org.apache.iceberg.NullOrder.NULLS_FIRST; +import static org.apache.iceberg.NullOrder.NULLS_LAST; +import static org.apache.iceberg.types.Types.NestedField.optional; +import static org.apache.iceberg.types.Types.NestedField.required; + import org.apache.iceberg.CachingCatalog; import org.apache.iceberg.Schema; import org.apache.iceberg.SortOrder; @@ -29,54 +33,69 @@ import org.junit.Assert; import org.junit.Test; -import static org.apache.iceberg.NullOrder.NULLS_FIRST; -import static org.apache.iceberg.NullOrder.NULLS_LAST; -import static org.apache.iceberg.types.Types.NestedField.optional; -import static org.apache.iceberg.types.Types.NestedField.required; - public class TestSpark3Util extends SparkTestBase { @Test public void testDescribeSortOrder() { - Schema schema = new Schema( + Schema schema = + new Schema( required(1, "data", Types.StringType.get()), - required(2, "time", Types.TimestampType.withoutZone()) - ); + required(2, "time", Types.TimestampType.withoutZone())); - Assert.assertEquals("Sort order isn't correct.", "data DESC NULLS FIRST", + Assert.assertEquals( + "Sort order isn't correct.", + "data DESC NULLS FIRST", Spark3Util.describe(buildSortOrder("Identity", schema, 1))); - Assert.assertEquals("Sort order isn't correct.", "bucket(1, data) DESC NULLS FIRST", + Assert.assertEquals( + "Sort order isn't correct.", + "bucket(1, data) DESC NULLS FIRST", Spark3Util.describe(buildSortOrder("bucket[1]", schema, 1))); - Assert.assertEquals("Sort order isn't correct.", "truncate(data, 3) DESC NULLS FIRST", + Assert.assertEquals( + "Sort order isn't correct.", + "truncate(data, 3) DESC NULLS FIRST", Spark3Util.describe(buildSortOrder("truncate[3]", schema, 1))); - Assert.assertEquals("Sort order isn't correct.", "years(time) DESC NULLS FIRST", + Assert.assertEquals( + "Sort order isn't correct.", + "years(time) DESC NULLS FIRST", Spark3Util.describe(buildSortOrder("year", schema, 2))); - Assert.assertEquals("Sort order isn't correct.", "months(time) DESC NULLS FIRST", + Assert.assertEquals( + "Sort order isn't correct.", + "months(time) DESC NULLS FIRST", Spark3Util.describe(buildSortOrder("month", schema, 2))); - Assert.assertEquals("Sort order isn't correct.", "days(time) DESC NULLS FIRST", + Assert.assertEquals( + "Sort order isn't correct.", + "days(time) DESC NULLS FIRST", Spark3Util.describe(buildSortOrder("day", schema, 2))); - Assert.assertEquals("Sort order isn't correct.", "hours(time) DESC NULLS FIRST", + Assert.assertEquals( + "Sort order isn't correct.", + "hours(time) DESC NULLS FIRST", Spark3Util.describe(buildSortOrder("hour", schema, 2))); - Assert.assertEquals("Sort order isn't correct.", "unknown(data) DESC NULLS FIRST", + Assert.assertEquals( + "Sort order isn't correct.", + "unknown(data) DESC NULLS FIRST", Spark3Util.describe(buildSortOrder("unknown", schema, 1))); // multiple sort orders - SortOrder multiOrder = SortOrder.builderFor(schema) - .asc("time", NULLS_FIRST) - .asc("data", NULLS_LAST) - .build(); - Assert.assertEquals("Sort order isn't correct.", "time ASC NULLS FIRST, data ASC NULLS LAST", - Spark3Util.describe(multiOrder)); + SortOrder multiOrder = + SortOrder.builderFor(schema).asc("time", NULLS_FIRST).asc("data", NULLS_LAST).build(); + Assert.assertEquals( + "Sort order isn't correct.", + "time ASC NULLS FIRST, data ASC 
NULLS LAST", + Spark3Util.describe(multiOrder)); } @Test public void testDescribeSchema() { - Schema schema = new Schema( - required(1, "data", Types.ListType.ofRequired(2, Types.StringType.get())), - optional(3, "pairs", Types.MapType.ofOptional(4, 5, Types.StringType.get(), Types.LongType.get())), - required(6, "time", Types.TimestampType.withoutZone()) - ); + Schema schema = + new Schema( + required(1, "data", Types.ListType.ofRequired(2, Types.StringType.get())), + optional( + 3, + "pairs", + Types.MapType.ofOptional(4, 5, Types.StringType.get(), Types.LongType.get())), + required(6, "time", Types.TimestampType.withoutZone())); - Assert.assertEquals("Schema description isn't correct.", + Assert.assertEquals( + "Schema description isn't correct.", "struct not null,pairs: map,time: timestamp not null>", Spark3Util.describe(schema)); } @@ -99,19 +118,25 @@ public void testLoadIcebergCatalog() throws Exception { spark.conf().set("spark.sql.catalog.test_cat", SparkCatalog.class.getName()); spark.conf().set("spark.sql.catalog.test_cat.type", "hive"); Catalog catalog = Spark3Util.loadIcebergCatalog(spark, "test_cat"); - Assert.assertTrue("Should retrieve underlying catalog class", catalog instanceof CachingCatalog); + Assert.assertTrue( + "Should retrieve underlying catalog class", catalog instanceof CachingCatalog); } private SortOrder buildSortOrder(String transform, Schema schema, int sourceId) { - String jsonString = "{\n" + - " \"order-id\" : 10,\n" + - " \"fields\" : [ {\n" + - " \"transform\" : \"" + transform + "\",\n" + - " \"source-id\" : " + sourceId + ",\n" + - " \"direction\" : \"desc\",\n" + - " \"null-order\" : \"nulls-first\"\n" + - " } ]\n" + - "}"; + String jsonString = + "{\n" + + " \"order-id\" : 10,\n" + + " \"fields\" : [ {\n" + + " \"transform\" : \"" + + transform + + "\",\n" + + " \"source-id\" : " + + sourceId + + ",\n" + + " \"direction\" : \"desc\",\n" + + " \"null-order\" : \"nulls-first\"\n" + + " } ]\n" + + "}"; return SortOrderParser.fromJson(schema, jsonString); } diff --git a/spark/v3.3/spark/src/test/java/org/apache/iceberg/spark/TestSparkCachedTableCatalog.java b/spark/v3.3/spark/src/test/java/org/apache/iceberg/spark/TestSparkCachedTableCatalog.java index 817af0302966..23e8717fb8c3 100644 --- a/spark/v3.3/spark/src/test/java/org/apache/iceberg/spark/TestSparkCachedTableCatalog.java +++ b/spark/v3.3/spark/src/test/java/org/apache/iceberg/spark/TestSparkCachedTableCatalog.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.spark; import org.apache.iceberg.Snapshot; @@ -69,17 +68,24 @@ public void testTimeTravel() { try { TABLE_CACHE.add("key", table); - assertEquals("Should have expected rows in 3rd snapshot", + assertEquals( + "Should have expected rows in 3rd snapshot", ImmutableList.of(row(1, "hr"), row(2, "hr"), row(3, "hr")), sql("SELECT * FROM testcache.key ORDER BY id")); - assertEquals("Should have expected rows in 2nd snapshot", + assertEquals( + "Should have expected rows in 2nd snapshot", ImmutableList.of(row(1, "hr"), row(2, "hr")), - sql("SELECT * FROM testcache.`key#at_timestamp_%s` ORDER BY id", secondSnapshot.timestampMillis())); + sql( + "SELECT * FROM testcache.`key#at_timestamp_%s` ORDER BY id", + secondSnapshot.timestampMillis())); - assertEquals("Should have expected rows in 1st snapshot", + assertEquals( + "Should have expected rows in 1st snapshot", ImmutableList.of(row(1, "hr")), - sql("SELECT * FROM testcache.`key#snapshot_id_%d` ORDER BY id", firstSnapshot.snapshotId())); + sql( + "SELECT * FROM testcache.`key#snapshot_id_%d` ORDER BY id", + firstSnapshot.snapshotId())); } finally { TABLE_CACHE.remove("key"); diff --git a/spark/v3.3/spark/src/test/java/org/apache/iceberg/spark/TestSparkCatalogOperations.java b/spark/v3.3/spark/src/test/java/org/apache/iceberg/spark/TestSparkCatalogOperations.java index 5c9f3c4cb189..0836271a7c22 100644 --- a/spark/v3.3/spark/src/test/java/org/apache/iceberg/spark/TestSparkCatalogOperations.java +++ b/spark/v3.3/spark/src/test/java/org/apache/iceberg/spark/TestSparkCatalogOperations.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark; import java.util.Map; diff --git a/spark/v3.3/spark/src/test/java/org/apache/iceberg/spark/TestSparkDistributionAndOrderingUtil.java b/spark/v3.3/spark/src/test/java/org/apache/iceberg/spark/TestSparkDistributionAndOrderingUtil.java index e1cf58ab2a1f..7a5fdad5a54f 100644 --- a/spark/v3.3/spark/src/test/java/org/apache/iceberg/spark/TestSparkDistributionAndOrderingUtil.java +++ b/spark/v3.3/spark/src/test/java/org/apache/iceberg/spark/TestSparkDistributionAndOrderingUtil.java @@ -16,9 +16,19 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.spark; +import static org.apache.iceberg.TableProperties.DELETE_DISTRIBUTION_MODE; +import static org.apache.iceberg.TableProperties.MERGE_DISTRIBUTION_MODE; +import static org.apache.iceberg.TableProperties.UPDATE_DISTRIBUTION_MODE; +import static org.apache.iceberg.TableProperties.WRITE_DISTRIBUTION_MODE; +import static org.apache.iceberg.TableProperties.WRITE_DISTRIBUTION_MODE_HASH; +import static org.apache.iceberg.TableProperties.WRITE_DISTRIBUTION_MODE_NONE; +import static org.apache.iceberg.TableProperties.WRITE_DISTRIBUTION_MODE_RANGE; +import static org.apache.spark.sql.connector.write.RowLevelOperation.Command.DELETE; +import static org.apache.spark.sql.connector.write.RowLevelOperation.Command.MERGE; +import static org.apache.spark.sql.connector.write.RowLevelOperation.Command.UPDATE; + import org.apache.iceberg.DistributionMode; import org.apache.iceberg.MetadataColumns; import org.apache.iceberg.Table; @@ -34,44 +44,47 @@ import org.junit.Assert; import org.junit.Test; -import static org.apache.iceberg.TableProperties.DELETE_DISTRIBUTION_MODE; -import static org.apache.iceberg.TableProperties.MERGE_DISTRIBUTION_MODE; -import static org.apache.iceberg.TableProperties.UPDATE_DISTRIBUTION_MODE; -import static org.apache.iceberg.TableProperties.WRITE_DISTRIBUTION_MODE; -import static org.apache.iceberg.TableProperties.WRITE_DISTRIBUTION_MODE_HASH; -import static org.apache.iceberg.TableProperties.WRITE_DISTRIBUTION_MODE_NONE; -import static org.apache.iceberg.TableProperties.WRITE_DISTRIBUTION_MODE_RANGE; -import static org.apache.spark.sql.connector.write.RowLevelOperation.Command.DELETE; -import static org.apache.spark.sql.connector.write.RowLevelOperation.Command.MERGE; -import static org.apache.spark.sql.connector.write.RowLevelOperation.Command.UPDATE; - public class TestSparkDistributionAndOrderingUtil extends SparkTestBaseWithCatalog { private static final Distribution UNSPECIFIED_DISTRIBUTION = Distributions.unspecified(); - private static final Distribution FILE_CLUSTERED_DISTRIBUTION = Distributions.clustered(new Expression[]{ - Expressions.column(MetadataColumns.FILE_PATH.name()) - }); - private static final Distribution SPEC_ID_PARTITION_CLUSTERED_DISTRIBUTION = Distributions.clustered(new Expression[]{ - Expressions.column(MetadataColumns.SPEC_ID.name()), - Expressions.column(MetadataColumns.PARTITION_COLUMN_NAME) - }); - - private static final SortOrder[] EMPTY_ORDERING = new SortOrder[]{}; - private static final SortOrder[] FILE_POSITION_ORDERING = new SortOrder[]{ - Expressions.sort(Expressions.column(MetadataColumns.FILE_PATH.name()), SortDirection.ASCENDING), - Expressions.sort(Expressions.column(MetadataColumns.ROW_POSITION.name()), SortDirection.ASCENDING) - }; - private static final SortOrder[] SPEC_ID_PARTITION_FILE_ORDERING = new SortOrder[]{ - Expressions.sort(Expressions.column(MetadataColumns.SPEC_ID.name()), SortDirection.ASCENDING), - Expressions.sort(Expressions.column(MetadataColumns.PARTITION_COLUMN_NAME), SortDirection.ASCENDING), - Expressions.sort(Expressions.column(MetadataColumns.FILE_PATH.name()), SortDirection.ASCENDING) - }; - private static final SortOrder[] SPEC_ID_PARTITION_FILE_POSITION_ORDERING = new SortOrder[]{ - Expressions.sort(Expressions.column(MetadataColumns.SPEC_ID.name()), SortDirection.ASCENDING), - Expressions.sort(Expressions.column(MetadataColumns.PARTITION_COLUMN_NAME), SortDirection.ASCENDING), - Expressions.sort(Expressions.column(MetadataColumns.FILE_PATH.name()), 
SortDirection.ASCENDING), - Expressions.sort(Expressions.column(MetadataColumns.ROW_POSITION.name()), SortDirection.ASCENDING) - }; + private static final Distribution FILE_CLUSTERED_DISTRIBUTION = + Distributions.clustered( + new Expression[] {Expressions.column(MetadataColumns.FILE_PATH.name())}); + private static final Distribution SPEC_ID_PARTITION_CLUSTERED_DISTRIBUTION = + Distributions.clustered( + new Expression[] { + Expressions.column(MetadataColumns.SPEC_ID.name()), + Expressions.column(MetadataColumns.PARTITION_COLUMN_NAME) + }); + + private static final SortOrder[] EMPTY_ORDERING = new SortOrder[] {}; + private static final SortOrder[] FILE_POSITION_ORDERING = + new SortOrder[] { + Expressions.sort( + Expressions.column(MetadataColumns.FILE_PATH.name()), SortDirection.ASCENDING), + Expressions.sort( + Expressions.column(MetadataColumns.ROW_POSITION.name()), SortDirection.ASCENDING) + }; + private static final SortOrder[] SPEC_ID_PARTITION_FILE_ORDERING = + new SortOrder[] { + Expressions.sort( + Expressions.column(MetadataColumns.SPEC_ID.name()), SortDirection.ASCENDING), + Expressions.sort( + Expressions.column(MetadataColumns.PARTITION_COLUMN_NAME), SortDirection.ASCENDING), + Expressions.sort( + Expressions.column(MetadataColumns.FILE_PATH.name()), SortDirection.ASCENDING) + }; + private static final SortOrder[] SPEC_ID_PARTITION_FILE_POSITION_ORDERING = + new SortOrder[] { + Expressions.sort( + Expressions.column(MetadataColumns.SPEC_ID.name()), SortDirection.ASCENDING), + Expressions.sort( + Expressions.column(MetadataColumns.PARTITION_COLUMN_NAME), SortDirection.ASCENDING), + Expressions.sort( + Expressions.column(MetadataColumns.FILE_PATH.name()), SortDirection.ASCENDING), + Expressions.sort( + Expressions.column(MetadataColumns.ROW_POSITION.name()), SortDirection.ASCENDING) + }; @After public void dropTable() { @@ -93,9 +106,7 @@ public void testHashWriteUnpartitionedUnsortedTable() { Table table = validationCatalog.loadTable(tableIdent); - table.updateProperties() - .set(WRITE_DISTRIBUTION_MODE, WRITE_DISTRIBUTION_MODE_HASH) - .commit(); + table.updateProperties().set(WRITE_DISTRIBUTION_MODE, WRITE_DISTRIBUTION_MODE_HASH).commit(); checkWriteDistributionAndOrdering(table, UNSPECIFIED_DISTRIBUTION, EMPTY_ORDERING); } @@ -106,9 +117,7 @@ public void testRangeWriteUnpartitionedUnsortedTable() { Table table = validationCatalog.loadTable(tableIdent); - table.updateProperties() - .set(WRITE_DISTRIBUTION_MODE, WRITE_DISTRIBUTION_MODE_RANGE) - .commit(); + table.updateProperties().set(WRITE_DISTRIBUTION_MODE, WRITE_DISTRIBUTION_MODE_RANGE).commit(); checkWriteDistributionAndOrdering(table, UNSPECIFIED_DISTRIBUTION, EMPTY_ORDERING); } @@ -119,15 +128,13 @@ public void testDefaultWriteUnpartitionedSortedTable() { Table table = validationCatalog.loadTable(tableIdent); - table.replaceSortOrder() - .asc("id") - .asc("data") - .commit(); + table.replaceSortOrder().asc("id").asc("data").commit(); - SortOrder[] expectedOrdering = new SortOrder[]{ - Expressions.sort(Expressions.column("id"), SortDirection.ASCENDING), - Expressions.sort(Expressions.column("data"), SortDirection.ASCENDING) - }; + SortOrder[] expectedOrdering = + new SortOrder[] { + Expressions.sort(Expressions.column("id"), SortDirection.ASCENDING), + Expressions.sort(Expressions.column("data"), SortDirection.ASCENDING) + }; Distribution expectedDistribution = Distributions.ordered(expectedOrdering); @@ -140,19 +147,15 @@ public void testHashWriteUnpartitionedSortedTable() { Table table = 
validationCatalog.loadTable(tableIdent); - table.updateProperties() - .set(WRITE_DISTRIBUTION_MODE, WRITE_DISTRIBUTION_MODE_HASH) - .commit(); + table.updateProperties().set(WRITE_DISTRIBUTION_MODE, WRITE_DISTRIBUTION_MODE_HASH).commit(); - table.replaceSortOrder() - .asc("id") - .asc("data") - .commit(); + table.replaceSortOrder().asc("id").asc("data").commit(); - SortOrder[] expectedOrdering = new SortOrder[]{ - Expressions.sort(Expressions.column("id"), SortDirection.ASCENDING), - Expressions.sort(Expressions.column("data"), SortDirection.ASCENDING) - }; + SortOrder[] expectedOrdering = + new SortOrder[] { + Expressions.sort(Expressions.column("id"), SortDirection.ASCENDING), + Expressions.sort(Expressions.column("data"), SortDirection.ASCENDING) + }; checkWriteDistributionAndOrdering(table, UNSPECIFIED_DISTRIBUTION, expectedOrdering); } @@ -163,19 +166,15 @@ public void testRangeWriteUnpartitionedSortedTable() { Table table = validationCatalog.loadTable(tableIdent); - table.updateProperties() - .set(WRITE_DISTRIBUTION_MODE, WRITE_DISTRIBUTION_MODE_RANGE) - .commit(); + table.updateProperties().set(WRITE_DISTRIBUTION_MODE, WRITE_DISTRIBUTION_MODE_RANGE).commit(); - table.replaceSortOrder() - .asc("id") - .asc("data") - .commit(); + table.replaceSortOrder().asc("id").asc("data").commit(); - SortOrder[] expectedOrdering = new SortOrder[]{ - Expressions.sort(Expressions.column("id"), SortDirection.ASCENDING), - Expressions.sort(Expressions.column("data"), SortDirection.ASCENDING) - }; + SortOrder[] expectedOrdering = + new SortOrder[] { + Expressions.sort(Expressions.column("id"), SortDirection.ASCENDING), + Expressions.sort(Expressions.column("data"), SortDirection.ASCENDING) + }; Distribution expectedDistribution = Distributions.ordered(expectedOrdering); @@ -184,62 +183,65 @@ public void testRangeWriteUnpartitionedSortedTable() { @Test public void testDefaultWritePartitionedUnsortedTable() { - sql("CREATE TABLE %s (id BIGINT, data STRING, date DATE, ts TIMESTAMP) " + - "USING iceberg " + - "PARTITIONED BY (date, days(ts))", tableName); + sql( + "CREATE TABLE %s (id BIGINT, data STRING, date DATE, ts TIMESTAMP) " + + "USING iceberg " + + "PARTITIONED BY (date, days(ts))", + tableName); Table table = validationCatalog.loadTable(tableIdent); - SortOrder[] expectedOrdering = new SortOrder[]{ - Expressions.sort(Expressions.column("date"), SortDirection.ASCENDING), - Expressions.sort(Expressions.days("ts"), SortDirection.ASCENDING) - }; + SortOrder[] expectedOrdering = + new SortOrder[] { + Expressions.sort(Expressions.column("date"), SortDirection.ASCENDING), + Expressions.sort(Expressions.days("ts"), SortDirection.ASCENDING) + }; checkWriteDistributionAndOrdering(table, UNSPECIFIED_DISTRIBUTION, expectedOrdering); } @Test public void testHashWritePartitionedUnsortedTable() { - sql("CREATE TABLE %s (id BIGINT, data STRING, date DATE, ts TIMESTAMP) " + - "USING iceberg " + - "PARTITIONED BY (date, days(ts))", tableName); + sql( + "CREATE TABLE %s (id BIGINT, data STRING, date DATE, ts TIMESTAMP) " + + "USING iceberg " + + "PARTITIONED BY (date, days(ts))", + tableName); Table table = validationCatalog.loadTable(tableIdent); - table.updateProperties() - .set(WRITE_DISTRIBUTION_MODE, WRITE_DISTRIBUTION_MODE_HASH) - .commit(); + table.updateProperties().set(WRITE_DISTRIBUTION_MODE, WRITE_DISTRIBUTION_MODE_HASH).commit(); - Expression[] expectedClustering = new Expression[]{ - Expressions.identity("date"), - Expressions.days("ts") - }; + Expression[] expectedClustering = + new Expression[] 
{Expressions.identity("date"), Expressions.days("ts")}; Distribution expectedDistribution = Distributions.clustered(expectedClustering); - SortOrder[] expectedOrdering = new SortOrder[]{ - Expressions.sort(Expressions.column("date"), SortDirection.ASCENDING), - Expressions.sort(Expressions.days("ts"), SortDirection.ASCENDING) - }; + SortOrder[] expectedOrdering = + new SortOrder[] { + Expressions.sort(Expressions.column("date"), SortDirection.ASCENDING), + Expressions.sort(Expressions.days("ts"), SortDirection.ASCENDING) + }; checkWriteDistributionAndOrdering(table, expectedDistribution, expectedOrdering); } @Test public void testRangeWritePartitionedUnsortedTable() { - sql("CREATE TABLE %s (id BIGINT, data STRING, date DATE, ts TIMESTAMP) " + - "USING iceberg " + - "PARTITIONED BY (date, days(ts))", tableName); + sql( + "CREATE TABLE %s (id BIGINT, data STRING, date DATE, ts TIMESTAMP) " + + "USING iceberg " + + "PARTITIONED BY (date, days(ts))", + tableName); Table table = validationCatalog.loadTable(tableIdent); - table.updateProperties() - .set(WRITE_DISTRIBUTION_MODE, WRITE_DISTRIBUTION_MODE_RANGE) - .commit(); + table.updateProperties().set(WRITE_DISTRIBUTION_MODE, WRITE_DISTRIBUTION_MODE_RANGE).commit(); - SortOrder[] expectedOrdering = new SortOrder[]{ - Expressions.sort(Expressions.column("date"), SortDirection.ASCENDING), - Expressions.sort(Expressions.days("ts"), SortDirection.ASCENDING) - }; + SortOrder[] expectedOrdering = + new SortOrder[] { + Expressions.sort(Expressions.column("date"), SortDirection.ASCENDING), + Expressions.sort(Expressions.days("ts"), SortDirection.ASCENDING) + }; Distribution expectedDistribution = Distributions.ordered(expectedOrdering); @@ -248,20 +250,21 @@ public void testRangeWritePartitionedUnsortedTable() { @Test public void testDefaultWritePartitionedSortedTable() { - sql("CREATE TABLE %s (id BIGINT, data STRING, date DATE, ts TIMESTAMP) " + - "USING iceberg " + - "PARTITIONED BY (date)", tableName); + sql( + "CREATE TABLE %s (id BIGINT, data STRING, date DATE, ts TIMESTAMP) " + + "USING iceberg " + + "PARTITIONED BY (date)", + tableName); Table table = validationCatalog.loadTable(tableIdent); - table.replaceSortOrder() - .desc("id") - .commit(); + table.replaceSortOrder().desc("id").commit(); - SortOrder[] expectedOrdering = new SortOrder[]{ - Expressions.sort(Expressions.column("date"), SortDirection.ASCENDING), - Expressions.sort(Expressions.column("id"), SortDirection.DESCENDING) - }; + SortOrder[] expectedOrdering = + new SortOrder[] { + Expressions.sort(Expressions.column("date"), SortDirection.ASCENDING), + Expressions.sort(Expressions.column("id"), SortDirection.DESCENDING) + }; Distribution expectedDistribution = Distributions.ordered(expectedOrdering); @@ -270,51 +273,49 @@ public void testDefaultWritePartitionedSortedTable() { @Test public void testHashWritePartitionedSortedTable() { - sql("CREATE TABLE %s (id BIGINT, data STRING, date DATE, ts TIMESTAMP) " + - "USING iceberg " + - "PARTITIONED BY (date, bucket(8, data))", tableName); + sql( + "CREATE TABLE %s (id BIGINT, data STRING, date DATE, ts TIMESTAMP) " + + "USING iceberg " + + "PARTITIONED BY (date, bucket(8, data))", + tableName); Table table = validationCatalog.loadTable(tableIdent); - table.updateProperties() - .set(WRITE_DISTRIBUTION_MODE, WRITE_DISTRIBUTION_MODE_HASH) - .commit(); + table.updateProperties().set(WRITE_DISTRIBUTION_MODE, WRITE_DISTRIBUTION_MODE_HASH).commit(); - table.replaceSortOrder() - .asc("id") - .commit(); + 
table.replaceSortOrder().asc("id").commit(); - Expression[] expectedClustering = new Expression[]{ - Expressions.identity("date"), - Expressions.bucket(8, "data") - }; + Expression[] expectedClustering = + new Expression[] {Expressions.identity("date"), Expressions.bucket(8, "data")}; Distribution expectedDistribution = Distributions.clustered(expectedClustering); - SortOrder[] expectedOrdering = new SortOrder[]{ - Expressions.sort(Expressions.column("date"), SortDirection.ASCENDING), - Expressions.sort(Expressions.bucket(8, "data"), SortDirection.ASCENDING), - Expressions.sort(Expressions.column("id"), SortDirection.ASCENDING) - }; + SortOrder[] expectedOrdering = + new SortOrder[] { + Expressions.sort(Expressions.column("date"), SortDirection.ASCENDING), + Expressions.sort(Expressions.bucket(8, "data"), SortDirection.ASCENDING), + Expressions.sort(Expressions.column("id"), SortDirection.ASCENDING) + }; checkWriteDistributionAndOrdering(table, expectedDistribution, expectedOrdering); } @Test public void testRangeWritePartitionedSortedTable() { - sql("CREATE TABLE %s (id BIGINT, data STRING, date DATE, ts TIMESTAMP) " + - "USING iceberg " + - "PARTITIONED BY (date)", tableName); + sql( + "CREATE TABLE %s (id BIGINT, data STRING, date DATE, ts TIMESTAMP) " + + "USING iceberg " + + "PARTITIONED BY (date)", + tableName); Table table = validationCatalog.loadTable(tableIdent); - table.replaceSortOrder() - .asc("id") - .commit(); + table.replaceSortOrder().asc("id").commit(); - SortOrder[] expectedOrdering = new SortOrder[]{ - Expressions.sort(Expressions.column("date"), SortDirection.ASCENDING), - Expressions.sort(Expressions.column("id"), SortDirection.ASCENDING) - }; + SortOrder[] expectedOrdering = + new SortOrder[] { + Expressions.sort(Expressions.column("date"), SortDirection.ASCENDING), + Expressions.sort(Expressions.column("id"), SortDirection.ASCENDING) + }; Distribution expectedDistribution = Distributions.ordered(expectedOrdering); @@ -359,7 +360,8 @@ public void testDefaultCopyOnWriteDeleteUnpartitionedUnsortedTable() { Table table = validationCatalog.loadTable(tableIdent); - checkCopyOnWriteDistributionAndOrdering(table, DELETE, FILE_CLUSTERED_DISTRIBUTION, FILE_POSITION_ORDERING); + checkCopyOnWriteDistributionAndOrdering( + table, DELETE, FILE_CLUSTERED_DISTRIBUTION, FILE_POSITION_ORDERING); } @Test @@ -368,11 +370,10 @@ public void testNoneCopyOnWriteDeleteUnpartitionedUnsortedTable() { Table table = validationCatalog.loadTable(tableIdent); - table.updateProperties() - .set(DELETE_DISTRIBUTION_MODE, WRITE_DISTRIBUTION_MODE_NONE) - .commit(); + table.updateProperties().set(DELETE_DISTRIBUTION_MODE, WRITE_DISTRIBUTION_MODE_NONE).commit(); - checkCopyOnWriteDistributionAndOrdering(table, DELETE, UNSPECIFIED_DISTRIBUTION, EMPTY_ORDERING); + checkCopyOnWriteDistributionAndOrdering( + table, DELETE, UNSPECIFIED_DISTRIBUTION, EMPTY_ORDERING); } @Test @@ -381,11 +382,10 @@ public void testHashCopyOnWriteDeleteUnpartitionedUnsortedTable() { Table table = validationCatalog.loadTable(tableIdent); - table.updateProperties() - .set(DELETE_DISTRIBUTION_MODE, WRITE_DISTRIBUTION_MODE_HASH) - .commit(); + table.updateProperties().set(DELETE_DISTRIBUTION_MODE, WRITE_DISTRIBUTION_MODE_HASH).commit(); - checkCopyOnWriteDistributionAndOrdering(table, DELETE, FILE_CLUSTERED_DISTRIBUTION, FILE_POSITION_ORDERING); + checkCopyOnWriteDistributionAndOrdering( + table, DELETE, FILE_CLUSTERED_DISTRIBUTION, FILE_POSITION_ORDERING); } @Test @@ -394,13 +394,12 @@ public void 
testRangeCopyOnWriteDeleteUnpartitionedUnsortedTable() { Table table = validationCatalog.loadTable(tableIdent); - table.updateProperties() - .set(DELETE_DISTRIBUTION_MODE, WRITE_DISTRIBUTION_MODE_RANGE) - .commit(); + table.updateProperties().set(DELETE_DISTRIBUTION_MODE, WRITE_DISTRIBUTION_MODE_RANGE).commit(); Distribution expectedDistribution = Distributions.ordered(FILE_POSITION_ORDERING); - checkCopyOnWriteDistributionAndOrdering(table, DELETE, expectedDistribution, FILE_POSITION_ORDERING); + checkCopyOnWriteDistributionAndOrdering( + table, DELETE, expectedDistribution, FILE_POSITION_ORDERING); } @Test @@ -409,17 +408,16 @@ public void testDefaultCopyOnWriteDeleteUnpartitionedSortedTable() { Table table = validationCatalog.loadTable(tableIdent); - table.replaceSortOrder() - .asc("id") - .asc("data") - .commit(); + table.replaceSortOrder().asc("id").asc("data").commit(); - SortOrder[] expectedOrdering = new SortOrder[]{ - Expressions.sort(Expressions.column("id"), SortDirection.ASCENDING), - Expressions.sort(Expressions.column("data"), SortDirection.ASCENDING) - }; + SortOrder[] expectedOrdering = + new SortOrder[] { + Expressions.sort(Expressions.column("id"), SortDirection.ASCENDING), + Expressions.sort(Expressions.column("data"), SortDirection.ASCENDING) + }; - checkCopyOnWriteDistributionAndOrdering(table, DELETE, FILE_CLUSTERED_DISTRIBUTION, expectedOrdering); + checkCopyOnWriteDistributionAndOrdering( + table, DELETE, FILE_CLUSTERED_DISTRIBUTION, expectedOrdering); } @Test @@ -428,21 +426,18 @@ public void testNoneCopyOnWriteDeleteUnpartitionedSortedTable() { Table table = validationCatalog.loadTable(tableIdent); - table.updateProperties() - .set(DELETE_DISTRIBUTION_MODE, WRITE_DISTRIBUTION_MODE_NONE) - .commit(); + table.updateProperties().set(DELETE_DISTRIBUTION_MODE, WRITE_DISTRIBUTION_MODE_NONE).commit(); - table.replaceSortOrder() - .asc("id") - .asc("data") - .commit(); + table.replaceSortOrder().asc("id").asc("data").commit(); - SortOrder[] expectedOrdering = new SortOrder[]{ - Expressions.sort(Expressions.column("id"), SortDirection.ASCENDING), - Expressions.sort(Expressions.column("data"), SortDirection.ASCENDING) - }; + SortOrder[] expectedOrdering = + new SortOrder[] { + Expressions.sort(Expressions.column("id"), SortDirection.ASCENDING), + Expressions.sort(Expressions.column("data"), SortDirection.ASCENDING) + }; - checkCopyOnWriteDistributionAndOrdering(table, DELETE, UNSPECIFIED_DISTRIBUTION, expectedOrdering); + checkCopyOnWriteDistributionAndOrdering( + table, DELETE, UNSPECIFIED_DISTRIBUTION, expectedOrdering); } @Test @@ -451,21 +446,18 @@ public void testHashCopyOnWriteDeleteUnpartitionedSortedTable() { Table table = validationCatalog.loadTable(tableIdent); - table.updateProperties() - .set(DELETE_DISTRIBUTION_MODE, WRITE_DISTRIBUTION_MODE_HASH) - .commit(); + table.updateProperties().set(DELETE_DISTRIBUTION_MODE, WRITE_DISTRIBUTION_MODE_HASH).commit(); - table.replaceSortOrder() - .asc("id") - .asc("data") - .commit(); + table.replaceSortOrder().asc("id").asc("data").commit(); - SortOrder[] expectedOrdering = new SortOrder[]{ - Expressions.sort(Expressions.column("id"), SortDirection.ASCENDING), - Expressions.sort(Expressions.column("data"), SortDirection.ASCENDING) - }; + SortOrder[] expectedOrdering = + new SortOrder[] { + Expressions.sort(Expressions.column("id"), SortDirection.ASCENDING), + Expressions.sort(Expressions.column("data"), SortDirection.ASCENDING) + }; - checkCopyOnWriteDistributionAndOrdering(table, DELETE, FILE_CLUSTERED_DISTRIBUTION, 
expectedOrdering); + checkCopyOnWriteDistributionAndOrdering( + table, DELETE, FILE_CLUSTERED_DISTRIBUTION, expectedOrdering); } @Test @@ -474,19 +466,15 @@ public void testRangeCopyOnWriteDeleteUnpartitionedSortedTable() { Table table = validationCatalog.loadTable(tableIdent); - table.updateProperties() - .set(DELETE_DISTRIBUTION_MODE, WRITE_DISTRIBUTION_MODE_RANGE) - .commit(); + table.updateProperties().set(DELETE_DISTRIBUTION_MODE, WRITE_DISTRIBUTION_MODE_RANGE).commit(); - table.replaceSortOrder() - .asc("id") - .asc("data") - .commit(); + table.replaceSortOrder().asc("id").asc("data").commit(); - SortOrder[] expectedOrdering = new SortOrder[]{ - Expressions.sort(Expressions.column("id"), SortDirection.ASCENDING), - Expressions.sort(Expressions.column("data"), SortDirection.ASCENDING) - }; + SortOrder[] expectedOrdering = + new SortOrder[] { + Expressions.sort(Expressions.column("id"), SortDirection.ASCENDING), + Expressions.sort(Expressions.column("data"), SortDirection.ASCENDING) + }; Distribution expectedDistribution = Distributions.ordered(expectedOrdering); @@ -495,82 +483,97 @@ public void testRangeCopyOnWriteDeleteUnpartitionedSortedTable() { @Test public void testDefaultCopyOnWriteDeletePartitionedUnsortedTable() { - sql("CREATE TABLE %s (id BIGINT, data STRING, date DATE, ts TIMESTAMP) " + - "USING iceberg " + - "PARTITIONED BY (date, days(ts))", tableName); + sql( + "CREATE TABLE %s (id BIGINT, data STRING, date DATE, ts TIMESTAMP) " + + "USING iceberg " + + "PARTITIONED BY (date, days(ts))", + tableName); Table table = validationCatalog.loadTable(tableIdent); - SortOrder[] expectedOrdering = new SortOrder[]{ - Expressions.sort(Expressions.column("date"), SortDirection.ASCENDING), - Expressions.sort(Expressions.days("ts"), SortDirection.ASCENDING), - Expressions.sort(Expressions.column(MetadataColumns.FILE_PATH.name()), SortDirection.ASCENDING), - Expressions.sort(Expressions.column(MetadataColumns.ROW_POSITION.name()), SortDirection.ASCENDING) - }; + SortOrder[] expectedOrdering = + new SortOrder[] { + Expressions.sort(Expressions.column("date"), SortDirection.ASCENDING), + Expressions.sort(Expressions.days("ts"), SortDirection.ASCENDING), + Expressions.sort( + Expressions.column(MetadataColumns.FILE_PATH.name()), SortDirection.ASCENDING), + Expressions.sort( + Expressions.column(MetadataColumns.ROW_POSITION.name()), SortDirection.ASCENDING) + }; - checkCopyOnWriteDistributionAndOrdering(table, DELETE, FILE_CLUSTERED_DISTRIBUTION, expectedOrdering); + checkCopyOnWriteDistributionAndOrdering( + table, DELETE, FILE_CLUSTERED_DISTRIBUTION, expectedOrdering); } @Test public void testNoneCopyOnWriteDeletePartitionedUnsortedTable() { - sql("CREATE TABLE %s (id BIGINT, data STRING, date DATE, ts TIMESTAMP) " + - "USING iceberg " + - "PARTITIONED BY (date, days(ts))", tableName); + sql( + "CREATE TABLE %s (id BIGINT, data STRING, date DATE, ts TIMESTAMP) " + + "USING iceberg " + + "PARTITIONED BY (date, days(ts))", + tableName); Table table = validationCatalog.loadTable(tableIdent); - table.updateProperties() - .set(DELETE_DISTRIBUTION_MODE, WRITE_DISTRIBUTION_MODE_NONE) - .commit(); + table.updateProperties().set(DELETE_DISTRIBUTION_MODE, WRITE_DISTRIBUTION_MODE_NONE).commit(); - SortOrder[] expectedOrdering = new SortOrder[]{ - Expressions.sort(Expressions.column("date"), SortDirection.ASCENDING), - Expressions.sort(Expressions.days("ts"), SortDirection.ASCENDING) - }; + SortOrder[] expectedOrdering = + new SortOrder[] { + Expressions.sort(Expressions.column("date"), 
SortDirection.ASCENDING), + Expressions.sort(Expressions.days("ts"), SortDirection.ASCENDING) + }; - checkCopyOnWriteDistributionAndOrdering(table, DELETE, UNSPECIFIED_DISTRIBUTION, expectedOrdering); + checkCopyOnWriteDistributionAndOrdering( + table, DELETE, UNSPECIFIED_DISTRIBUTION, expectedOrdering); } @Test public void testHashCopyOnWriteDeletePartitionedUnsortedTable() { - sql("CREATE TABLE %s (id BIGINT, data STRING, date DATE, ts TIMESTAMP) " + - "USING iceberg " + - "PARTITIONED BY (date, days(ts))", tableName); + sql( + "CREATE TABLE %s (id BIGINT, data STRING, date DATE, ts TIMESTAMP) " + + "USING iceberg " + + "PARTITIONED BY (date, days(ts))", + tableName); Table table = validationCatalog.loadTable(tableIdent); - table.updateProperties() - .set(DELETE_DISTRIBUTION_MODE, WRITE_DISTRIBUTION_MODE_HASH) - .commit(); + table.updateProperties().set(DELETE_DISTRIBUTION_MODE, WRITE_DISTRIBUTION_MODE_HASH).commit(); - SortOrder[] expectedOrdering = new SortOrder[]{ - Expressions.sort(Expressions.column("date"), SortDirection.ASCENDING), - Expressions.sort(Expressions.days("ts"), SortDirection.ASCENDING), - Expressions.sort(Expressions.column(MetadataColumns.FILE_PATH.name()), SortDirection.ASCENDING), - Expressions.sort(Expressions.column(MetadataColumns.ROW_POSITION.name()), SortDirection.ASCENDING) - }; + SortOrder[] expectedOrdering = + new SortOrder[] { + Expressions.sort(Expressions.column("date"), SortDirection.ASCENDING), + Expressions.sort(Expressions.days("ts"), SortDirection.ASCENDING), + Expressions.sort( + Expressions.column(MetadataColumns.FILE_PATH.name()), SortDirection.ASCENDING), + Expressions.sort( + Expressions.column(MetadataColumns.ROW_POSITION.name()), SortDirection.ASCENDING) + }; - checkCopyOnWriteDistributionAndOrdering(table, DELETE, FILE_CLUSTERED_DISTRIBUTION, expectedOrdering); + checkCopyOnWriteDistributionAndOrdering( + table, DELETE, FILE_CLUSTERED_DISTRIBUTION, expectedOrdering); } @Test public void testRangeCopyOnWriteDeletePartitionedUnsortedTable() { - sql("CREATE TABLE %s (id BIGINT, data STRING, date DATE, ts TIMESTAMP) " + - "USING iceberg " + - "PARTITIONED BY (date, days(ts))", tableName); + sql( + "CREATE TABLE %s (id BIGINT, data STRING, date DATE, ts TIMESTAMP) " + + "USING iceberg " + + "PARTITIONED BY (date, days(ts))", + tableName); Table table = validationCatalog.loadTable(tableIdent); - table.updateProperties() - .set(DELETE_DISTRIBUTION_MODE, WRITE_DISTRIBUTION_MODE_RANGE) - .commit(); + table.updateProperties().set(DELETE_DISTRIBUTION_MODE, WRITE_DISTRIBUTION_MODE_RANGE).commit(); - SortOrder[] expectedOrdering = new SortOrder[]{ - Expressions.sort(Expressions.column("date"), SortDirection.ASCENDING), - Expressions.sort(Expressions.days("ts"), SortDirection.ASCENDING), - Expressions.sort(Expressions.column(MetadataColumns.FILE_PATH.name()), SortDirection.ASCENDING), - Expressions.sort(Expressions.column(MetadataColumns.ROW_POSITION.name()), SortDirection.ASCENDING) - }; + SortOrder[] expectedOrdering = + new SortOrder[] { + Expressions.sort(Expressions.column("date"), SortDirection.ASCENDING), + Expressions.sort(Expressions.days("ts"), SortDirection.ASCENDING), + Expressions.sort( + Expressions.column(MetadataColumns.FILE_PATH.name()), SortDirection.ASCENDING), + Expressions.sort( + Expressions.column(MetadataColumns.ROW_POSITION.name()), SortDirection.ASCENDING) + }; Distribution expectedDistribution = Distributions.ordered(expectedOrdering); @@ -579,93 +582,94 @@ public void testRangeCopyOnWriteDeletePartitionedUnsortedTable() { 
@Test public void testDefaultCopyOnWriteDeletePartitionedSortedTable() { - sql("CREATE TABLE %s (id BIGINT, data STRING, date DATE, ts TIMESTAMP) " + - "USING iceberg " + - "PARTITIONED BY (date)", tableName); + sql( + "CREATE TABLE %s (id BIGINT, data STRING, date DATE, ts TIMESTAMP) " + + "USING iceberg " + + "PARTITIONED BY (date)", + tableName); Table table = validationCatalog.loadTable(tableIdent); - table.replaceSortOrder() - .desc("id") - .commit(); + table.replaceSortOrder().desc("id").commit(); - SortOrder[] expectedOrdering = new SortOrder[]{ - Expressions.sort(Expressions.column("date"), SortDirection.ASCENDING), - Expressions.sort(Expressions.column("id"), SortDirection.DESCENDING) - }; + SortOrder[] expectedOrdering = + new SortOrder[] { + Expressions.sort(Expressions.column("date"), SortDirection.ASCENDING), + Expressions.sort(Expressions.column("id"), SortDirection.DESCENDING) + }; - checkCopyOnWriteDistributionAndOrdering(table, DELETE, FILE_CLUSTERED_DISTRIBUTION, expectedOrdering); + checkCopyOnWriteDistributionAndOrdering( + table, DELETE, FILE_CLUSTERED_DISTRIBUTION, expectedOrdering); } @Test public void testNoneCopyOnWriteDeletePartitionedSortedTable() { - sql("CREATE TABLE %s (id BIGINT, data STRING, date DATE, ts TIMESTAMP) " + - "USING iceberg " + - "PARTITIONED BY (date)", tableName); + sql( + "CREATE TABLE %s (id BIGINT, data STRING, date DATE, ts TIMESTAMP) " + + "USING iceberg " + + "PARTITIONED BY (date)", + tableName); Table table = validationCatalog.loadTable(tableIdent); - table.updateProperties() - .set(DELETE_DISTRIBUTION_MODE, WRITE_DISTRIBUTION_MODE_NONE) - .commit(); + table.updateProperties().set(DELETE_DISTRIBUTION_MODE, WRITE_DISTRIBUTION_MODE_NONE).commit(); - table.replaceSortOrder() - .desc("id") - .commit(); + table.replaceSortOrder().desc("id").commit(); - SortOrder[] expectedOrdering = new SortOrder[]{ - Expressions.sort(Expressions.column("date"), SortDirection.ASCENDING), - Expressions.sort(Expressions.column("id"), SortDirection.DESCENDING) - }; + SortOrder[] expectedOrdering = + new SortOrder[] { + Expressions.sort(Expressions.column("date"), SortDirection.ASCENDING), + Expressions.sort(Expressions.column("id"), SortDirection.DESCENDING) + }; - checkCopyOnWriteDistributionAndOrdering(table, DELETE, UNSPECIFIED_DISTRIBUTION, expectedOrdering); + checkCopyOnWriteDistributionAndOrdering( + table, DELETE, UNSPECIFIED_DISTRIBUTION, expectedOrdering); } @Test public void testHashCopyOnWriteDeletePartitionedSortedTable() { - sql("CREATE TABLE %s (id BIGINT, data STRING, date DATE, ts TIMESTAMP) " + - "USING iceberg " + - "PARTITIONED BY (date, bucket(8, data))", tableName); + sql( + "CREATE TABLE %s (id BIGINT, data STRING, date DATE, ts TIMESTAMP) " + + "USING iceberg " + + "PARTITIONED BY (date, bucket(8, data))", + tableName); Table table = validationCatalog.loadTable(tableIdent); - table.updateProperties() - .set(DELETE_DISTRIBUTION_MODE, WRITE_DISTRIBUTION_MODE_HASH) - .commit(); + table.updateProperties().set(DELETE_DISTRIBUTION_MODE, WRITE_DISTRIBUTION_MODE_HASH).commit(); - table.replaceSortOrder() - .asc("id") - .commit(); + table.replaceSortOrder().asc("id").commit(); - SortOrder[] expectedOrdering = new SortOrder[]{ - Expressions.sort(Expressions.column("date"), SortDirection.ASCENDING), - Expressions.sort(Expressions.bucket(8, "data"), SortDirection.ASCENDING), - Expressions.sort(Expressions.column("id"), SortDirection.ASCENDING) - }; + SortOrder[] expectedOrdering = + new SortOrder[] { + Expressions.sort(Expressions.column("date"), 
SortDirection.ASCENDING), + Expressions.sort(Expressions.bucket(8, "data"), SortDirection.ASCENDING), + Expressions.sort(Expressions.column("id"), SortDirection.ASCENDING) + }; - checkCopyOnWriteDistributionAndOrdering(table, DELETE, FILE_CLUSTERED_DISTRIBUTION, expectedOrdering); + checkCopyOnWriteDistributionAndOrdering( + table, DELETE, FILE_CLUSTERED_DISTRIBUTION, expectedOrdering); } @Test public void testRangeCopyOnWriteDeletePartitionedSortedTable() { - sql("CREATE TABLE %s (id BIGINT, data STRING, date DATE, ts TIMESTAMP) " + - "USING iceberg " + - "PARTITIONED BY (date)", tableName); + sql( + "CREATE TABLE %s (id BIGINT, data STRING, date DATE, ts TIMESTAMP) " + + "USING iceberg " + + "PARTITIONED BY (date)", + tableName); Table table = validationCatalog.loadTable(tableIdent); - table.updateProperties() - .set(DELETE_DISTRIBUTION_MODE, WRITE_DISTRIBUTION_MODE_RANGE) - .commit(); + table.updateProperties().set(DELETE_DISTRIBUTION_MODE, WRITE_DISTRIBUTION_MODE_RANGE).commit(); - table.replaceSortOrder() - .asc("id") - .commit(); + table.replaceSortOrder().asc("id").commit(); - SortOrder[] expectedOrdering = new SortOrder[]{ - Expressions.sort(Expressions.column("date"), SortDirection.ASCENDING), - Expressions.sort(Expressions.column("id"), SortDirection.ASCENDING) - }; + SortOrder[] expectedOrdering = + new SortOrder[] { + Expressions.sort(Expressions.column("date"), SortDirection.ASCENDING), + Expressions.sort(Expressions.column("id"), SortDirection.ASCENDING) + }; Distribution expectedDistribution = Distributions.ordered(expectedOrdering); @@ -710,7 +714,8 @@ public void testDefaultCopyOnWriteUpdateUnpartitionedUnsortedTable() { Table table = validationCatalog.loadTable(tableIdent); - checkCopyOnWriteDistributionAndOrdering(table, UPDATE, FILE_CLUSTERED_DISTRIBUTION, FILE_POSITION_ORDERING); + checkCopyOnWriteDistributionAndOrdering( + table, UPDATE, FILE_CLUSTERED_DISTRIBUTION, FILE_POSITION_ORDERING); } @Test @@ -719,11 +724,10 @@ public void testNoneCopyOnWriteUpdateUnpartitionedUnsortedTable() { Table table = validationCatalog.loadTable(tableIdent); - table.updateProperties() - .set(UPDATE_DISTRIBUTION_MODE, WRITE_DISTRIBUTION_MODE_NONE) - .commit(); + table.updateProperties().set(UPDATE_DISTRIBUTION_MODE, WRITE_DISTRIBUTION_MODE_NONE).commit(); - checkCopyOnWriteDistributionAndOrdering(table, UPDATE, UNSPECIFIED_DISTRIBUTION, EMPTY_ORDERING); + checkCopyOnWriteDistributionAndOrdering( + table, UPDATE, UNSPECIFIED_DISTRIBUTION, EMPTY_ORDERING); } @Test @@ -732,11 +736,10 @@ public void testHashCopyOnWriteUpdateUnpartitionedUnsortedTable() { Table table = validationCatalog.loadTable(tableIdent); - table.updateProperties() - .set(UPDATE_DISTRIBUTION_MODE, WRITE_DISTRIBUTION_MODE_HASH) - .commit(); + table.updateProperties().set(UPDATE_DISTRIBUTION_MODE, WRITE_DISTRIBUTION_MODE_HASH).commit(); - checkCopyOnWriteDistributionAndOrdering(table, UPDATE, FILE_CLUSTERED_DISTRIBUTION, FILE_POSITION_ORDERING); + checkCopyOnWriteDistributionAndOrdering( + table, UPDATE, FILE_CLUSTERED_DISTRIBUTION, FILE_POSITION_ORDERING); } @Test @@ -745,13 +748,12 @@ public void testRangeCopyOnWriteUpdateUnpartitionedUnsortedTable() { Table table = validationCatalog.loadTable(tableIdent); - table.updateProperties() - .set(UPDATE_DISTRIBUTION_MODE, WRITE_DISTRIBUTION_MODE_RANGE) - .commit(); + table.updateProperties().set(UPDATE_DISTRIBUTION_MODE, WRITE_DISTRIBUTION_MODE_RANGE).commit(); Distribution expectedDistribution = Distributions.ordered(FILE_POSITION_ORDERING); - 
checkCopyOnWriteDistributionAndOrdering(table, UPDATE, expectedDistribution, FILE_POSITION_ORDERING); + checkCopyOnWriteDistributionAndOrdering( + table, UPDATE, expectedDistribution, FILE_POSITION_ORDERING); } @Test @@ -760,17 +762,16 @@ public void testDefaultCopyOnWriteUpdateUnpartitionedSortedTable() { Table table = validationCatalog.loadTable(tableIdent); - table.replaceSortOrder() - .asc("id") - .asc("data") - .commit(); + table.replaceSortOrder().asc("id").asc("data").commit(); - SortOrder[] expectedOrdering = new SortOrder[]{ - Expressions.sort(Expressions.column("id"), SortDirection.ASCENDING), - Expressions.sort(Expressions.column("data"), SortDirection.ASCENDING) - }; + SortOrder[] expectedOrdering = + new SortOrder[] { + Expressions.sort(Expressions.column("id"), SortDirection.ASCENDING), + Expressions.sort(Expressions.column("data"), SortDirection.ASCENDING) + }; - checkCopyOnWriteDistributionAndOrdering(table, UPDATE, FILE_CLUSTERED_DISTRIBUTION, expectedOrdering); + checkCopyOnWriteDistributionAndOrdering( + table, UPDATE, FILE_CLUSTERED_DISTRIBUTION, expectedOrdering); } @Test @@ -779,21 +780,18 @@ public void testNoneCopyOnWriteUpdateUnpartitionedSortedTable() { Table table = validationCatalog.loadTable(tableIdent); - table.updateProperties() - .set(UPDATE_DISTRIBUTION_MODE, WRITE_DISTRIBUTION_MODE_NONE) - .commit(); + table.updateProperties().set(UPDATE_DISTRIBUTION_MODE, WRITE_DISTRIBUTION_MODE_NONE).commit(); - table.replaceSortOrder() - .asc("id") - .asc("data") - .commit(); + table.replaceSortOrder().asc("id").asc("data").commit(); - SortOrder[] expectedOrdering = new SortOrder[]{ - Expressions.sort(Expressions.column("id"), SortDirection.ASCENDING), - Expressions.sort(Expressions.column("data"), SortDirection.ASCENDING) - }; + SortOrder[] expectedOrdering = + new SortOrder[] { + Expressions.sort(Expressions.column("id"), SortDirection.ASCENDING), + Expressions.sort(Expressions.column("data"), SortDirection.ASCENDING) + }; - checkCopyOnWriteDistributionAndOrdering(table, UPDATE, UNSPECIFIED_DISTRIBUTION, expectedOrdering); + checkCopyOnWriteDistributionAndOrdering( + table, UPDATE, UNSPECIFIED_DISTRIBUTION, expectedOrdering); } @Test @@ -802,21 +800,18 @@ public void testHashCopyOnWriteUpdateUnpartitionedSortedTable() { Table table = validationCatalog.loadTable(tableIdent); - table.updateProperties() - .set(UPDATE_DISTRIBUTION_MODE, WRITE_DISTRIBUTION_MODE_HASH) - .commit(); + table.updateProperties().set(UPDATE_DISTRIBUTION_MODE, WRITE_DISTRIBUTION_MODE_HASH).commit(); - table.replaceSortOrder() - .asc("id") - .asc("data") - .commit(); + table.replaceSortOrder().asc("id").asc("data").commit(); - SortOrder[] expectedOrdering = new SortOrder[]{ - Expressions.sort(Expressions.column("id"), SortDirection.ASCENDING), - Expressions.sort(Expressions.column("data"), SortDirection.ASCENDING) - }; + SortOrder[] expectedOrdering = + new SortOrder[] { + Expressions.sort(Expressions.column("id"), SortDirection.ASCENDING), + Expressions.sort(Expressions.column("data"), SortDirection.ASCENDING) + }; - checkCopyOnWriteDistributionAndOrdering(table, UPDATE, FILE_CLUSTERED_DISTRIBUTION, expectedOrdering); + checkCopyOnWriteDistributionAndOrdering( + table, UPDATE, FILE_CLUSTERED_DISTRIBUTION, expectedOrdering); } @Test @@ -825,19 +820,15 @@ public void testRangeCopyOnWriteUpdateUnpartitionedSortedTable() { Table table = validationCatalog.loadTable(tableIdent); - table.updateProperties() - .set(UPDATE_DISTRIBUTION_MODE, WRITE_DISTRIBUTION_MODE_RANGE) - .commit(); + 
table.updateProperties().set(UPDATE_DISTRIBUTION_MODE, WRITE_DISTRIBUTION_MODE_RANGE).commit(); - table.replaceSortOrder() - .asc("id") - .asc("data") - .commit(); + table.replaceSortOrder().asc("id").asc("data").commit(); - SortOrder[] expectedOrdering = new SortOrder[]{ - Expressions.sort(Expressions.column("id"), SortDirection.ASCENDING), - Expressions.sort(Expressions.column("data"), SortDirection.ASCENDING) - }; + SortOrder[] expectedOrdering = + new SortOrder[] { + Expressions.sort(Expressions.column("id"), SortDirection.ASCENDING), + Expressions.sort(Expressions.column("data"), SortDirection.ASCENDING) + }; Distribution expectedDistribution = Distributions.ordered(expectedOrdering); @@ -846,82 +837,97 @@ public void testRangeCopyOnWriteUpdateUnpartitionedSortedTable() { @Test public void testDefaultCopyOnWriteUpdatePartitionedUnsortedTable() { - sql("CREATE TABLE %s (id BIGINT, data STRING, date DATE, ts TIMESTAMP) " + - "USING iceberg " + - "PARTITIONED BY (date, days(ts))", tableName); + sql( + "CREATE TABLE %s (id BIGINT, data STRING, date DATE, ts TIMESTAMP) " + + "USING iceberg " + + "PARTITIONED BY (date, days(ts))", + tableName); Table table = validationCatalog.loadTable(tableIdent); - SortOrder[] expectedOrdering = new SortOrder[]{ - Expressions.sort(Expressions.column("date"), SortDirection.ASCENDING), - Expressions.sort(Expressions.days("ts"), SortDirection.ASCENDING), - Expressions.sort(Expressions.column(MetadataColumns.FILE_PATH.name()), SortDirection.ASCENDING), - Expressions.sort(Expressions.column(MetadataColumns.ROW_POSITION.name()), SortDirection.ASCENDING) - }; + SortOrder[] expectedOrdering = + new SortOrder[] { + Expressions.sort(Expressions.column("date"), SortDirection.ASCENDING), + Expressions.sort(Expressions.days("ts"), SortDirection.ASCENDING), + Expressions.sort( + Expressions.column(MetadataColumns.FILE_PATH.name()), SortDirection.ASCENDING), + Expressions.sort( + Expressions.column(MetadataColumns.ROW_POSITION.name()), SortDirection.ASCENDING) + }; - checkCopyOnWriteDistributionAndOrdering(table, UPDATE, FILE_CLUSTERED_DISTRIBUTION, expectedOrdering); + checkCopyOnWriteDistributionAndOrdering( + table, UPDATE, FILE_CLUSTERED_DISTRIBUTION, expectedOrdering); } @Test public void testNoneCopyOnWriteUpdatePartitionedUnsortedTable() { - sql("CREATE TABLE %s (id BIGINT, data STRING, date DATE, ts TIMESTAMP) " + - "USING iceberg " + - "PARTITIONED BY (date, days(ts))", tableName); + sql( + "CREATE TABLE %s (id BIGINT, data STRING, date DATE, ts TIMESTAMP) " + + "USING iceberg " + + "PARTITIONED BY (date, days(ts))", + tableName); Table table = validationCatalog.loadTable(tableIdent); - table.updateProperties() - .set(UPDATE_DISTRIBUTION_MODE, WRITE_DISTRIBUTION_MODE_NONE) - .commit(); + table.updateProperties().set(UPDATE_DISTRIBUTION_MODE, WRITE_DISTRIBUTION_MODE_NONE).commit(); - SortOrder[] expectedOrdering = new SortOrder[]{ - Expressions.sort(Expressions.column("date"), SortDirection.ASCENDING), - Expressions.sort(Expressions.days("ts"), SortDirection.ASCENDING) - }; + SortOrder[] expectedOrdering = + new SortOrder[] { + Expressions.sort(Expressions.column("date"), SortDirection.ASCENDING), + Expressions.sort(Expressions.days("ts"), SortDirection.ASCENDING) + }; - checkCopyOnWriteDistributionAndOrdering(table, UPDATE, UNSPECIFIED_DISTRIBUTION, expectedOrdering); + checkCopyOnWriteDistributionAndOrdering( + table, UPDATE, UNSPECIFIED_DISTRIBUTION, expectedOrdering); } @Test public void testHashCopyOnWriteUpdatePartitionedUnsortedTable() { - sql("CREATE 
TABLE %s (id BIGINT, data STRING, date DATE, ts TIMESTAMP) " + - "USING iceberg " + - "PARTITIONED BY (date, days(ts))", tableName); + sql( + "CREATE TABLE %s (id BIGINT, data STRING, date DATE, ts TIMESTAMP) " + + "USING iceberg " + + "PARTITIONED BY (date, days(ts))", + tableName); Table table = validationCatalog.loadTable(tableIdent); - table.updateProperties() - .set(UPDATE_DISTRIBUTION_MODE, WRITE_DISTRIBUTION_MODE_HASH) - .commit(); + table.updateProperties().set(UPDATE_DISTRIBUTION_MODE, WRITE_DISTRIBUTION_MODE_HASH).commit(); - SortOrder[] expectedOrdering = new SortOrder[]{ - Expressions.sort(Expressions.column("date"), SortDirection.ASCENDING), - Expressions.sort(Expressions.days("ts"), SortDirection.ASCENDING), - Expressions.sort(Expressions.column(MetadataColumns.FILE_PATH.name()), SortDirection.ASCENDING), - Expressions.sort(Expressions.column(MetadataColumns.ROW_POSITION.name()), SortDirection.ASCENDING) - }; + SortOrder[] expectedOrdering = + new SortOrder[] { + Expressions.sort(Expressions.column("date"), SortDirection.ASCENDING), + Expressions.sort(Expressions.days("ts"), SortDirection.ASCENDING), + Expressions.sort( + Expressions.column(MetadataColumns.FILE_PATH.name()), SortDirection.ASCENDING), + Expressions.sort( + Expressions.column(MetadataColumns.ROW_POSITION.name()), SortDirection.ASCENDING) + }; - checkCopyOnWriteDistributionAndOrdering(table, UPDATE, FILE_CLUSTERED_DISTRIBUTION, expectedOrdering); + checkCopyOnWriteDistributionAndOrdering( + table, UPDATE, FILE_CLUSTERED_DISTRIBUTION, expectedOrdering); } @Test public void testRangeCopyOnWriteUpdatePartitionedUnsortedTable() { - sql("CREATE TABLE %s (id BIGINT, data STRING, date DATE, ts TIMESTAMP) " + - "USING iceberg " + - "PARTITIONED BY (date, days(ts))", tableName); + sql( + "CREATE TABLE %s (id BIGINT, data STRING, date DATE, ts TIMESTAMP) " + + "USING iceberg " + + "PARTITIONED BY (date, days(ts))", + tableName); Table table = validationCatalog.loadTable(tableIdent); - table.updateProperties() - .set(UPDATE_DISTRIBUTION_MODE, WRITE_DISTRIBUTION_MODE_RANGE) - .commit(); + table.updateProperties().set(UPDATE_DISTRIBUTION_MODE, WRITE_DISTRIBUTION_MODE_RANGE).commit(); - SortOrder[] expectedOrdering = new SortOrder[]{ - Expressions.sort(Expressions.column("date"), SortDirection.ASCENDING), - Expressions.sort(Expressions.days("ts"), SortDirection.ASCENDING), - Expressions.sort(Expressions.column(MetadataColumns.FILE_PATH.name()), SortDirection.ASCENDING), - Expressions.sort(Expressions.column(MetadataColumns.ROW_POSITION.name()), SortDirection.ASCENDING) - }; + SortOrder[] expectedOrdering = + new SortOrder[] { + Expressions.sort(Expressions.column("date"), SortDirection.ASCENDING), + Expressions.sort(Expressions.days("ts"), SortDirection.ASCENDING), + Expressions.sort( + Expressions.column(MetadataColumns.FILE_PATH.name()), SortDirection.ASCENDING), + Expressions.sort( + Expressions.column(MetadataColumns.ROW_POSITION.name()), SortDirection.ASCENDING) + }; Distribution expectedDistribution = Distributions.ordered(expectedOrdering); @@ -930,93 +936,94 @@ public void testRangeCopyOnWriteUpdatePartitionedUnsortedTable() { @Test public void testDefaultCopyOnWriteUpdatePartitionedSortedTable() { - sql("CREATE TABLE %s (id BIGINT, data STRING, date DATE, ts TIMESTAMP) " + - "USING iceberg " + - "PARTITIONED BY (date)", tableName); + sql( + "CREATE TABLE %s (id BIGINT, data STRING, date DATE, ts TIMESTAMP) " + + "USING iceberg " + + "PARTITIONED BY (date)", + tableName); Table table = 
validationCatalog.loadTable(tableIdent); - table.replaceSortOrder() - .desc("id") - .commit(); + table.replaceSortOrder().desc("id").commit(); - SortOrder[] expectedOrdering = new SortOrder[]{ - Expressions.sort(Expressions.column("date"), SortDirection.ASCENDING), - Expressions.sort(Expressions.column("id"), SortDirection.DESCENDING) - }; + SortOrder[] expectedOrdering = + new SortOrder[] { + Expressions.sort(Expressions.column("date"), SortDirection.ASCENDING), + Expressions.sort(Expressions.column("id"), SortDirection.DESCENDING) + }; - checkCopyOnWriteDistributionAndOrdering(table, UPDATE, FILE_CLUSTERED_DISTRIBUTION, expectedOrdering); + checkCopyOnWriteDistributionAndOrdering( + table, UPDATE, FILE_CLUSTERED_DISTRIBUTION, expectedOrdering); } @Test public void testNoneCopyOnWriteUpdatePartitionedSortedTable() { - sql("CREATE TABLE %s (id BIGINT, data STRING, date DATE, ts TIMESTAMP) " + - "USING iceberg " + - "PARTITIONED BY (date)", tableName); + sql( + "CREATE TABLE %s (id BIGINT, data STRING, date DATE, ts TIMESTAMP) " + + "USING iceberg " + + "PARTITIONED BY (date)", + tableName); Table table = validationCatalog.loadTable(tableIdent); - table.updateProperties() - .set(UPDATE_DISTRIBUTION_MODE, WRITE_DISTRIBUTION_MODE_NONE) - .commit(); + table.updateProperties().set(UPDATE_DISTRIBUTION_MODE, WRITE_DISTRIBUTION_MODE_NONE).commit(); - table.replaceSortOrder() - .desc("id") - .commit(); + table.replaceSortOrder().desc("id").commit(); - SortOrder[] expectedOrdering = new SortOrder[]{ - Expressions.sort(Expressions.column("date"), SortDirection.ASCENDING), - Expressions.sort(Expressions.column("id"), SortDirection.DESCENDING) - }; + SortOrder[] expectedOrdering = + new SortOrder[] { + Expressions.sort(Expressions.column("date"), SortDirection.ASCENDING), + Expressions.sort(Expressions.column("id"), SortDirection.DESCENDING) + }; - checkCopyOnWriteDistributionAndOrdering(table, UPDATE, UNSPECIFIED_DISTRIBUTION, expectedOrdering); + checkCopyOnWriteDistributionAndOrdering( + table, UPDATE, UNSPECIFIED_DISTRIBUTION, expectedOrdering); } @Test public void testHashCopyOnWriteUpdatePartitionedSortedTable() { - sql("CREATE TABLE %s (id BIGINT, data STRING, date DATE, ts TIMESTAMP) " + - "USING iceberg " + - "PARTITIONED BY (date, bucket(8, data))", tableName); + sql( + "CREATE TABLE %s (id BIGINT, data STRING, date DATE, ts TIMESTAMP) " + + "USING iceberg " + + "PARTITIONED BY (date, bucket(8, data))", + tableName); Table table = validationCatalog.loadTable(tableIdent); - table.updateProperties() - .set(UPDATE_DISTRIBUTION_MODE, WRITE_DISTRIBUTION_MODE_HASH) - .commit(); + table.updateProperties().set(UPDATE_DISTRIBUTION_MODE, WRITE_DISTRIBUTION_MODE_HASH).commit(); - table.replaceSortOrder() - .asc("id") - .commit(); + table.replaceSortOrder().asc("id").commit(); - SortOrder[] expectedOrdering = new SortOrder[]{ - Expressions.sort(Expressions.column("date"), SortDirection.ASCENDING), - Expressions.sort(Expressions.bucket(8, "data"), SortDirection.ASCENDING), - Expressions.sort(Expressions.column("id"), SortDirection.ASCENDING) - }; + SortOrder[] expectedOrdering = + new SortOrder[] { + Expressions.sort(Expressions.column("date"), SortDirection.ASCENDING), + Expressions.sort(Expressions.bucket(8, "data"), SortDirection.ASCENDING), + Expressions.sort(Expressions.column("id"), SortDirection.ASCENDING) + }; - checkCopyOnWriteDistributionAndOrdering(table, UPDATE, FILE_CLUSTERED_DISTRIBUTION, expectedOrdering); + checkCopyOnWriteDistributionAndOrdering( + table, UPDATE, 
FILE_CLUSTERED_DISTRIBUTION, expectedOrdering); } @Test public void testRangeCopyOnWriteUpdatePartitionedSortedTable() { - sql("CREATE TABLE %s (id BIGINT, data STRING, date DATE, ts TIMESTAMP) " + - "USING iceberg " + - "PARTITIONED BY (date)", tableName); + sql( + "CREATE TABLE %s (id BIGINT, data STRING, date DATE, ts TIMESTAMP) " + + "USING iceberg " + + "PARTITIONED BY (date)", + tableName); Table table = validationCatalog.loadTable(tableIdent); - table.updateProperties() - .set(UPDATE_DISTRIBUTION_MODE, WRITE_DISTRIBUTION_MODE_RANGE) - .commit(); + table.updateProperties().set(UPDATE_DISTRIBUTION_MODE, WRITE_DISTRIBUTION_MODE_RANGE).commit(); - table.replaceSortOrder() - .asc("id") - .commit(); + table.replaceSortOrder().asc("id").commit(); - SortOrder[] expectedOrdering = new SortOrder[]{ - Expressions.sort(Expressions.column("date"), SortDirection.ASCENDING), - Expressions.sort(Expressions.column("id"), SortDirection.ASCENDING) - }; + SortOrder[] expectedOrdering = + new SortOrder[] { + Expressions.sort(Expressions.column("date"), SortDirection.ASCENDING), + Expressions.sort(Expressions.column("id"), SortDirection.ASCENDING) + }; Distribution expectedDistribution = Distributions.ordered(expectedOrdering); @@ -1061,9 +1068,7 @@ public void testNoneCopyOnWriteMergeUnpartitionedUnsortedTable() { Table table = validationCatalog.loadTable(tableIdent); - table.updateProperties() - .set(MERGE_DISTRIBUTION_MODE, WRITE_DISTRIBUTION_MODE_NONE) - .commit(); + table.updateProperties().set(MERGE_DISTRIBUTION_MODE, WRITE_DISTRIBUTION_MODE_NONE).commit(); checkCopyOnWriteDistributionAndOrdering(table, MERGE, UNSPECIFIED_DISTRIBUTION, EMPTY_ORDERING); } @@ -1074,9 +1079,7 @@ public void testHashCopyOnWriteMergeUnpartitionedUnsortedTable() { Table table = validationCatalog.loadTable(tableIdent); - table.updateProperties() - .set(MERGE_DISTRIBUTION_MODE, WRITE_DISTRIBUTION_MODE_HASH) - .commit(); + table.updateProperties().set(MERGE_DISTRIBUTION_MODE, WRITE_DISTRIBUTION_MODE_HASH).commit(); checkCopyOnWriteDistributionAndOrdering(table, MERGE, UNSPECIFIED_DISTRIBUTION, EMPTY_ORDERING); } @@ -1087,9 +1090,7 @@ public void testRangeCopyOnWriteMergeUnpartitionedUnsortedTable() { Table table = validationCatalog.loadTable(tableIdent); - table.updateProperties() - .set(MERGE_DISTRIBUTION_MODE, WRITE_DISTRIBUTION_MODE_RANGE) - .commit(); + table.updateProperties().set(MERGE_DISTRIBUTION_MODE, WRITE_DISTRIBUTION_MODE_RANGE).commit(); checkCopyOnWriteDistributionAndOrdering(table, MERGE, UNSPECIFIED_DISTRIBUTION, EMPTY_ORDERING); } @@ -1100,21 +1101,18 @@ public void testNoneCopyOnWriteMergeUnpartitionedSortedTable() { Table table = validationCatalog.loadTable(tableIdent); - table.updateProperties() - .set(MERGE_DISTRIBUTION_MODE, WRITE_DISTRIBUTION_MODE_NONE) - .commit(); + table.updateProperties().set(MERGE_DISTRIBUTION_MODE, WRITE_DISTRIBUTION_MODE_NONE).commit(); - table.replaceSortOrder() - .asc("id") - .asc("data") - .commit(); + table.replaceSortOrder().asc("id").asc("data").commit(); - SortOrder[] expectedOrdering = new SortOrder[]{ - Expressions.sort(Expressions.column("id"), SortDirection.ASCENDING), - Expressions.sort(Expressions.column("data"), SortDirection.ASCENDING) - }; + SortOrder[] expectedOrdering = + new SortOrder[] { + Expressions.sort(Expressions.column("id"), SortDirection.ASCENDING), + Expressions.sort(Expressions.column("data"), SortDirection.ASCENDING) + }; - checkCopyOnWriteDistributionAndOrdering(table, MERGE, UNSPECIFIED_DISTRIBUTION, expectedOrdering); + 
checkCopyOnWriteDistributionAndOrdering( + table, MERGE, UNSPECIFIED_DISTRIBUTION, expectedOrdering); } @Test @@ -1123,21 +1121,18 @@ public void testHashCopyOnWriteMergeUnpartitionedSortedTable() { Table table = validationCatalog.loadTable(tableIdent); - table.updateProperties() - .set(MERGE_DISTRIBUTION_MODE, WRITE_DISTRIBUTION_MODE_HASH) - .commit(); + table.updateProperties().set(MERGE_DISTRIBUTION_MODE, WRITE_DISTRIBUTION_MODE_HASH).commit(); - table.replaceSortOrder() - .asc("id") - .asc("data") - .commit(); + table.replaceSortOrder().asc("id").asc("data").commit(); - SortOrder[] expectedOrdering = new SortOrder[]{ - Expressions.sort(Expressions.column("id"), SortDirection.ASCENDING), - Expressions.sort(Expressions.column("data"), SortDirection.ASCENDING) - }; + SortOrder[] expectedOrdering = + new SortOrder[] { + Expressions.sort(Expressions.column("id"), SortDirection.ASCENDING), + Expressions.sort(Expressions.column("data"), SortDirection.ASCENDING) + }; - checkCopyOnWriteDistributionAndOrdering(table, MERGE, UNSPECIFIED_DISTRIBUTION, expectedOrdering); + checkCopyOnWriteDistributionAndOrdering( + table, MERGE, UNSPECIFIED_DISTRIBUTION, expectedOrdering); } @Test @@ -1146,19 +1141,15 @@ public void testRangeCopyOnWriteMergeUnpartitionedSortedTable() { Table table = validationCatalog.loadTable(tableIdent); - table.updateProperties() - .set(MERGE_DISTRIBUTION_MODE, WRITE_DISTRIBUTION_MODE_RANGE) - .commit(); + table.updateProperties().set(MERGE_DISTRIBUTION_MODE, WRITE_DISTRIBUTION_MODE_RANGE).commit(); - table.replaceSortOrder() - .asc("id") - .asc("data") - .commit(); + table.replaceSortOrder().asc("id").asc("data").commit(); - SortOrder[] expectedOrdering = new SortOrder[]{ - Expressions.sort(Expressions.column("id"), SortDirection.ASCENDING), - Expressions.sort(Expressions.column("data"), SortDirection.ASCENDING) - }; + SortOrder[] expectedOrdering = + new SortOrder[] { + Expressions.sort(Expressions.column("id"), SortDirection.ASCENDING), + Expressions.sort(Expressions.column("data"), SortDirection.ASCENDING) + }; Distribution expectedDistribution = Distributions.ordered(expectedOrdering); @@ -1167,66 +1158,68 @@ public void testRangeCopyOnWriteMergeUnpartitionedSortedTable() { @Test public void testNoneCopyOnWriteMergePartitionedUnsortedTable() { - sql("CREATE TABLE %s (id BIGINT, data STRING, date DATE, ts TIMESTAMP) " + - "USING iceberg " + - "PARTITIONED BY (date, days(ts))", tableName); + sql( + "CREATE TABLE %s (id BIGINT, data STRING, date DATE, ts TIMESTAMP) " + + "USING iceberg " + + "PARTITIONED BY (date, days(ts))", + tableName); Table table = validationCatalog.loadTable(tableIdent); - table.updateProperties() - .set(MERGE_DISTRIBUTION_MODE, WRITE_DISTRIBUTION_MODE_NONE) - .commit(); + table.updateProperties().set(MERGE_DISTRIBUTION_MODE, WRITE_DISTRIBUTION_MODE_NONE).commit(); - SortOrder[] expectedOrdering = new SortOrder[]{ - Expressions.sort(Expressions.column("date"), SortDirection.ASCENDING), - Expressions.sort(Expressions.days("ts"), SortDirection.ASCENDING) - }; + SortOrder[] expectedOrdering = + new SortOrder[] { + Expressions.sort(Expressions.column("date"), SortDirection.ASCENDING), + Expressions.sort(Expressions.days("ts"), SortDirection.ASCENDING) + }; - checkCopyOnWriteDistributionAndOrdering(table, MERGE, UNSPECIFIED_DISTRIBUTION, expectedOrdering); + checkCopyOnWriteDistributionAndOrdering( + table, MERGE, UNSPECIFIED_DISTRIBUTION, expectedOrdering); } @Test public void testHashCopyOnWriteMergePartitionedUnsortedTable() { - sql("CREATE TABLE %s (id 
BIGINT, data STRING, date DATE, ts TIMESTAMP) " + - "USING iceberg " + - "PARTITIONED BY (date, days(ts))", tableName); + sql( + "CREATE TABLE %s (id BIGINT, data STRING, date DATE, ts TIMESTAMP) " + + "USING iceberg " + + "PARTITIONED BY (date, days(ts))", + tableName); Table table = validationCatalog.loadTable(tableIdent); - table.updateProperties() - .set(MERGE_DISTRIBUTION_MODE, WRITE_DISTRIBUTION_MODE_HASH) - .commit(); + table.updateProperties().set(MERGE_DISTRIBUTION_MODE, WRITE_DISTRIBUTION_MODE_HASH).commit(); - Expression[] expectedClustering = new Expression[]{ - Expressions.identity("date"), - Expressions.days("ts") - }; + Expression[] expectedClustering = + new Expression[] {Expressions.identity("date"), Expressions.days("ts")}; Distribution expectedDistribution = Distributions.clustered(expectedClustering); - SortOrder[] expectedOrdering = new SortOrder[]{ - Expressions.sort(Expressions.column("date"), SortDirection.ASCENDING), - Expressions.sort(Expressions.days("ts"), SortDirection.ASCENDING) - }; + SortOrder[] expectedOrdering = + new SortOrder[] { + Expressions.sort(Expressions.column("date"), SortDirection.ASCENDING), + Expressions.sort(Expressions.days("ts"), SortDirection.ASCENDING) + }; checkCopyOnWriteDistributionAndOrdering(table, MERGE, expectedDistribution, expectedOrdering); } @Test public void testRangeCopyOnWriteMergePartitionedUnsortedTable() { - sql("CREATE TABLE %s (id BIGINT, data STRING, date DATE, ts TIMESTAMP) " + - "USING iceberg " + - "PARTITIONED BY (date, days(ts))", tableName); + sql( + "CREATE TABLE %s (id BIGINT, data STRING, date DATE, ts TIMESTAMP) " + + "USING iceberg " + + "PARTITIONED BY (date, days(ts))", + tableName); Table table = validationCatalog.loadTable(tableIdent); - table.updateProperties() - .set(MERGE_DISTRIBUTION_MODE, WRITE_DISTRIBUTION_MODE_RANGE) - .commit(); + table.updateProperties().set(MERGE_DISTRIBUTION_MODE, WRITE_DISTRIBUTION_MODE_RANGE).commit(); - SortOrder[] expectedOrdering = new SortOrder[]{ - Expressions.sort(Expressions.column("date"), SortDirection.ASCENDING), - Expressions.sort(Expressions.days("ts"), SortDirection.ASCENDING) - }; + SortOrder[] expectedOrdering = + new SortOrder[] { + Expressions.sort(Expressions.column("date"), SortDirection.ASCENDING), + Expressions.sort(Expressions.days("ts"), SortDirection.ASCENDING) + }; Distribution expectedDistribution = Distributions.ordered(expectedOrdering); @@ -1235,79 +1228,75 @@ public void testRangeCopyOnWriteMergePartitionedUnsortedTable() { @Test public void testNoneCopyOnWriteMergePartitionedSortedTable() { - sql("CREATE TABLE %s (id BIGINT, data STRING, date DATE, ts TIMESTAMP) " + - "USING iceberg " + - "PARTITIONED BY (date)", tableName); + sql( + "CREATE TABLE %s (id BIGINT, data STRING, date DATE, ts TIMESTAMP) " + + "USING iceberg " + + "PARTITIONED BY (date)", + tableName); Table table = validationCatalog.loadTable(tableIdent); - table.updateProperties() - .set(MERGE_DISTRIBUTION_MODE, WRITE_DISTRIBUTION_MODE_NONE) - .commit(); + table.updateProperties().set(MERGE_DISTRIBUTION_MODE, WRITE_DISTRIBUTION_MODE_NONE).commit(); - table.replaceSortOrder() - .desc("id") - .commit(); + table.replaceSortOrder().desc("id").commit(); - SortOrder[] expectedOrdering = new SortOrder[]{ - Expressions.sort(Expressions.column("date"), SortDirection.ASCENDING), - Expressions.sort(Expressions.column("id"), SortDirection.DESCENDING) - }; + SortOrder[] expectedOrdering = + new SortOrder[] { + Expressions.sort(Expressions.column("date"), SortDirection.ASCENDING), + 
Expressions.sort(Expressions.column("id"), SortDirection.DESCENDING) + }; - checkCopyOnWriteDistributionAndOrdering(table, MERGE, UNSPECIFIED_DISTRIBUTION, expectedOrdering); + checkCopyOnWriteDistributionAndOrdering( + table, MERGE, UNSPECIFIED_DISTRIBUTION, expectedOrdering); } @Test public void testHashCopyOnWriteMergePartitionedSortedTable() { - sql("CREATE TABLE %s (id BIGINT, data STRING, date DATE, ts TIMESTAMP) " + - "USING iceberg " + - "PARTITIONED BY (date, bucket(8, data))", tableName); + sql( + "CREATE TABLE %s (id BIGINT, data STRING, date DATE, ts TIMESTAMP) " + + "USING iceberg " + + "PARTITIONED BY (date, bucket(8, data))", + tableName); Table table = validationCatalog.loadTable(tableIdent); - table.updateProperties() - .set(MERGE_DISTRIBUTION_MODE, WRITE_DISTRIBUTION_MODE_HASH) - .commit(); + table.updateProperties().set(MERGE_DISTRIBUTION_MODE, WRITE_DISTRIBUTION_MODE_HASH).commit(); - table.replaceSortOrder() - .asc("id") - .commit(); + table.replaceSortOrder().asc("id").commit(); - Expression[] expectedClustering = new Expression[]{ - Expressions.identity("date"), - Expressions.bucket(8, "data") - }; + Expression[] expectedClustering = + new Expression[] {Expressions.identity("date"), Expressions.bucket(8, "data")}; Distribution expectedDistribution = Distributions.clustered(expectedClustering); - SortOrder[] expectedOrdering = new SortOrder[]{ - Expressions.sort(Expressions.column("date"), SortDirection.ASCENDING), - Expressions.sort(Expressions.bucket(8, "data"), SortDirection.ASCENDING), - Expressions.sort(Expressions.column("id"), SortDirection.ASCENDING) - }; + SortOrder[] expectedOrdering = + new SortOrder[] { + Expressions.sort(Expressions.column("date"), SortDirection.ASCENDING), + Expressions.sort(Expressions.bucket(8, "data"), SortDirection.ASCENDING), + Expressions.sort(Expressions.column("id"), SortDirection.ASCENDING) + }; checkCopyOnWriteDistributionAndOrdering(table, MERGE, expectedDistribution, expectedOrdering); } @Test public void testRangeCopyOnWriteMergePartitionedSortedTable() { - sql("CREATE TABLE %s (id BIGINT, data STRING, date DATE, ts TIMESTAMP) " + - "USING iceberg " + - "PARTITIONED BY (date)", tableName); + sql( + "CREATE TABLE %s (id BIGINT, data STRING, date DATE, ts TIMESTAMP) " + + "USING iceberg " + + "PARTITIONED BY (date)", + tableName); Table table = validationCatalog.loadTable(tableIdent); - table.updateProperties() - .set(MERGE_DISTRIBUTION_MODE, WRITE_DISTRIBUTION_MODE_RANGE) - .commit(); + table.updateProperties().set(MERGE_DISTRIBUTION_MODE, WRITE_DISTRIBUTION_MODE_RANGE).commit(); - table.replaceSortOrder() - .asc("id") - .commit(); + table.replaceSortOrder().asc("id").commit(); - SortOrder[] expectedOrdering = new SortOrder[]{ - Expressions.sort(Expressions.column("date"), SortDirection.ASCENDING), - Expressions.sort(Expressions.column("id"), SortDirection.ASCENDING) - }; + SortOrder[] expectedOrdering = + new SortOrder[] { + Expressions.sort(Expressions.column("date"), SortDirection.ASCENDING), + Expressions.sort(Expressions.column("id"), SortDirection.ASCENDING) + }; Distribution expectedDistribution = Distributions.ordered(expectedOrdering); @@ -1318,9 +1307,12 @@ public void testRangeCopyOnWriteMergePartitionedSortedTable() { // Distribution and ordering for merge-on-read DELETE operations with position deletes // =================================================================================== // - // delete mode is NOT SET -> CLUSTER BY _spec_id, _partition + LOCALLY ORDER BY _spec_id, _partition, _file, _pos - // 
delete mode is NONE -> unspecified distribution + LOCALLY ORDER BY _spec_id, _partition, _file, _pos - // delete mode is HASH -> CLUSTER BY _spec_id, _partition + LOCALLY ORDER BY _spec_id, _partition, _file, _pos + // delete mode is NOT SET -> CLUSTER BY _spec_id, _partition + LOCALLY ORDER BY _spec_id, + // _partition, _file, _pos + // delete mode is NONE -> unspecified distribution + LOCALLY ORDER BY _spec_id, _partition, _file, + // _pos + // delete mode is HASH -> CLUSTER BY _spec_id, _partition + LOCALLY ORDER BY _spec_id, _partition, + // _file, _pos // delete mode is RANGE -> RANGE DISTRIBUTE BY _spec_id, _partition, _file + // LOCALLY ORDER BY _spec_id, _partition, _file, _pos @@ -1331,7 +1323,10 @@ public void testDefaultPositionDeltaDeleteUnpartitionedTable() { Table table = validationCatalog.loadTable(tableIdent); checkPositionDeltaDistributionAndOrdering( - table, DELETE, SPEC_ID_PARTITION_CLUSTERED_DISTRIBUTION, SPEC_ID_PARTITION_FILE_POSITION_ORDERING); + table, + DELETE, + SPEC_ID_PARTITION_CLUSTERED_DISTRIBUTION, + SPEC_ID_PARTITION_FILE_POSITION_ORDERING); } @Test @@ -1340,9 +1335,7 @@ public void testNonePositionDeltaDeleteUnpartitionedTable() { Table table = validationCatalog.loadTable(tableIdent); - table.updateProperties() - .set(DELETE_DISTRIBUTION_MODE, WRITE_DISTRIBUTION_MODE_NONE) - .commit(); + table.updateProperties().set(DELETE_DISTRIBUTION_MODE, WRITE_DISTRIBUTION_MODE_NONE).commit(); checkPositionDeltaDistributionAndOrdering( table, DELETE, UNSPECIFIED_DISTRIBUTION, SPEC_ID_PARTITION_FILE_POSITION_ORDERING); @@ -1354,12 +1347,13 @@ public void testHashPositionDeltaDeleteUnpartitionedTable() { Table table = validationCatalog.loadTable(tableIdent); - table.updateProperties() - .set(DELETE_DISTRIBUTION_MODE, WRITE_DISTRIBUTION_MODE_HASH) - .commit(); + table.updateProperties().set(DELETE_DISTRIBUTION_MODE, WRITE_DISTRIBUTION_MODE_HASH).commit(); checkPositionDeltaDistributionAndOrdering( - table, DELETE, SPEC_ID_PARTITION_CLUSTERED_DISTRIBUTION, SPEC_ID_PARTITION_FILE_POSITION_ORDERING); + table, + DELETE, + SPEC_ID_PARTITION_CLUSTERED_DISTRIBUTION, + SPEC_ID_PARTITION_FILE_POSITION_ORDERING); } @Test @@ -1368,9 +1362,7 @@ public void testRangePositionDeltaDeleteUnpartitionedTable() { Table table = validationCatalog.loadTable(tableIdent); - table.updateProperties() - .set(DELETE_DISTRIBUTION_MODE, WRITE_DISTRIBUTION_MODE_RANGE) - .commit(); + table.updateProperties().set(DELETE_DISTRIBUTION_MODE, WRITE_DISTRIBUTION_MODE_RANGE).commit(); Distribution expectedDistribution = Distributions.ordered(SPEC_ID_PARTITION_FILE_ORDERING); @@ -1380,27 +1372,32 @@ public void testRangePositionDeltaDeleteUnpartitionedTable() { @Test public void testDefaultPositionDeltaDeletePartitionedTable() { - sql("CREATE TABLE %s (id BIGINT, data STRING, date DATE, ts TIMESTAMP) " + - "USING iceberg " + - "PARTITIONED BY (date, bucket(8, data))", tableName); + sql( + "CREATE TABLE %s (id BIGINT, data STRING, date DATE, ts TIMESTAMP) " + + "USING iceberg " + + "PARTITIONED BY (date, bucket(8, data))", + tableName); Table table = validationCatalog.loadTable(tableIdent); checkPositionDeltaDistributionAndOrdering( - table, DELETE, SPEC_ID_PARTITION_CLUSTERED_DISTRIBUTION, SPEC_ID_PARTITION_FILE_POSITION_ORDERING); + table, + DELETE, + SPEC_ID_PARTITION_CLUSTERED_DISTRIBUTION, + SPEC_ID_PARTITION_FILE_POSITION_ORDERING); } @Test public void testNonePositionDeltaDeletePartitionedTable() { - sql("CREATE TABLE %s (id BIGINT, data STRING, date DATE, ts TIMESTAMP) " + - "USING iceberg " + - 
"PARTITIONED BY (date, bucket(8, data))", tableName); + sql( + "CREATE TABLE %s (id BIGINT, data STRING, date DATE, ts TIMESTAMP) " + + "USING iceberg " + + "PARTITIONED BY (date, bucket(8, data))", + tableName); Table table = validationCatalog.loadTable(tableIdent); - table.updateProperties() - .set(DELETE_DISTRIBUTION_MODE, WRITE_DISTRIBUTION_MODE_NONE) - .commit(); + table.updateProperties().set(DELETE_DISTRIBUTION_MODE, WRITE_DISTRIBUTION_MODE_NONE).commit(); checkPositionDeltaDistributionAndOrdering( table, DELETE, UNSPECIFIED_DISTRIBUTION, SPEC_ID_PARTITION_FILE_POSITION_ORDERING); @@ -1408,31 +1405,34 @@ public void testNonePositionDeltaDeletePartitionedTable() { @Test public void testHashPositionDeltaDeletePartitionedTable() { - sql("CREATE TABLE %s (id BIGINT, data STRING, date DATE, ts TIMESTAMP) " + - "USING iceberg " + - "PARTITIONED BY (date, bucket(8, data))", tableName); + sql( + "CREATE TABLE %s (id BIGINT, data STRING, date DATE, ts TIMESTAMP) " + + "USING iceberg " + + "PARTITIONED BY (date, bucket(8, data))", + tableName); Table table = validationCatalog.loadTable(tableIdent); - table.updateProperties() - .set(DELETE_DISTRIBUTION_MODE, WRITE_DISTRIBUTION_MODE_HASH) - .commit(); + table.updateProperties().set(DELETE_DISTRIBUTION_MODE, WRITE_DISTRIBUTION_MODE_HASH).commit(); checkPositionDeltaDistributionAndOrdering( - table, DELETE, SPEC_ID_PARTITION_CLUSTERED_DISTRIBUTION, SPEC_ID_PARTITION_FILE_POSITION_ORDERING); + table, + DELETE, + SPEC_ID_PARTITION_CLUSTERED_DISTRIBUTION, + SPEC_ID_PARTITION_FILE_POSITION_ORDERING); } @Test public void testRangePositionDeltaDeletePartitionedTable() { - sql("CREATE TABLE %s (id BIGINT, data STRING, date DATE, ts TIMESTAMP) " + - "USING iceberg " + - "PARTITIONED BY (date, bucket(8, data))", tableName); + sql( + "CREATE TABLE %s (id BIGINT, data STRING, date DATE, ts TIMESTAMP) " + + "USING iceberg " + + "PARTITIONED BY (date, bucket(8, data))", + tableName); Table table = validationCatalog.loadTable(tableIdent); - table.updateProperties() - .set(DELETE_DISTRIBUTION_MODE, WRITE_DISTRIBUTION_MODE_RANGE) - .commit(); + table.updateProperties().set(DELETE_DISTRIBUTION_MODE, WRITE_DISTRIBUTION_MODE_RANGE).commit(); Distribution expectedDistribution = Distributions.ordered(SPEC_ID_PARTITION_FILE_ORDERING); @@ -1444,9 +1444,12 @@ public void testRangePositionDeltaDeletePartitionedTable() { // Distribution and ordering for merge-on-read UPDATE operations with position deletes // =================================================================================== // - // update mode is NOT SET -> CLUSTER BY _spec_id, _partition + LOCALLY ORDER BY _spec_id, _partition, _file, _pos - // update mode is NONE -> unspecified distribution + LOCALLY ORDER BY _spec_id, _partition, _file, _pos - // update mode is HASH -> CLUSTER BY _spec_id, _partition + LOCALLY ORDER BY _spec_id, _partition, _file, _pos + // update mode is NOT SET -> CLUSTER BY _spec_id, _partition + LOCALLY ORDER BY _spec_id, + // _partition, _file, _pos + // update mode is NONE -> unspecified distribution + LOCALLY ORDER BY _spec_id, _partition, _file, + // _pos + // update mode is HASH -> CLUSTER BY _spec_id, _partition + LOCALLY ORDER BY _spec_id, _partition, + // _file, _pos // update mode is RANGE -> RANGE DISTRIBUTE BY _spec_id, _partition, _file + // LOCALLY ORDER BY _spec_id, _partition, _file, _pos @@ -1457,7 +1460,10 @@ public void testDefaultPositionDeltaUpdateUnpartitionedTable() { Table table = validationCatalog.loadTable(tableIdent); 
checkPositionDeltaDistributionAndOrdering( - table, UPDATE, SPEC_ID_PARTITION_CLUSTERED_DISTRIBUTION, SPEC_ID_PARTITION_FILE_POSITION_ORDERING); + table, + UPDATE, + SPEC_ID_PARTITION_CLUSTERED_DISTRIBUTION, + SPEC_ID_PARTITION_FILE_POSITION_ORDERING); } @Test @@ -1466,9 +1472,7 @@ public void testNonePositionDeltaUpdateUnpartitionedTable() { Table table = validationCatalog.loadTable(tableIdent); - table.updateProperties() - .set(UPDATE_DISTRIBUTION_MODE, WRITE_DISTRIBUTION_MODE_NONE) - .commit(); + table.updateProperties().set(UPDATE_DISTRIBUTION_MODE, WRITE_DISTRIBUTION_MODE_NONE).commit(); checkPositionDeltaDistributionAndOrdering( table, UPDATE, UNSPECIFIED_DISTRIBUTION, SPEC_ID_PARTITION_FILE_POSITION_ORDERING); @@ -1480,12 +1484,13 @@ public void testHashPositionDeltaUpdateUnpartitionedTable() { Table table = validationCatalog.loadTable(tableIdent); - table.updateProperties() - .set(UPDATE_DISTRIBUTION_MODE, WRITE_DISTRIBUTION_MODE_HASH) - .commit(); + table.updateProperties().set(UPDATE_DISTRIBUTION_MODE, WRITE_DISTRIBUTION_MODE_HASH).commit(); checkPositionDeltaDistributionAndOrdering( - table, UPDATE, SPEC_ID_PARTITION_CLUSTERED_DISTRIBUTION, SPEC_ID_PARTITION_FILE_POSITION_ORDERING); + table, + UPDATE, + SPEC_ID_PARTITION_CLUSTERED_DISTRIBUTION, + SPEC_ID_PARTITION_FILE_POSITION_ORDERING); } @Test @@ -1494,9 +1499,7 @@ public void testRangePositionDeltaUpdateUnpartitionedTable() { Table table = validationCatalog.loadTable(tableIdent); - table.updateProperties() - .set(UPDATE_DISTRIBUTION_MODE, WRITE_DISTRIBUTION_MODE_RANGE) - .commit(); + table.updateProperties().set(UPDATE_DISTRIBUTION_MODE, WRITE_DISTRIBUTION_MODE_RANGE).commit(); Distribution expectedDistribution = Distributions.ordered(SPEC_ID_PARTITION_FILE_ORDERING); @@ -1506,27 +1509,32 @@ public void testRangePositionDeltaUpdateUnpartitionedTable() { @Test public void testDefaultPositionDeltaUpdatePartitionedTable() { - sql("CREATE TABLE %s (id BIGINT, data STRING, date DATE, ts TIMESTAMP) " + - "USING iceberg " + - "PARTITIONED BY (date, bucket(8, data))", tableName); + sql( + "CREATE TABLE %s (id BIGINT, data STRING, date DATE, ts TIMESTAMP) " + + "USING iceberg " + + "PARTITIONED BY (date, bucket(8, data))", + tableName); Table table = validationCatalog.loadTable(tableIdent); checkPositionDeltaDistributionAndOrdering( - table, UPDATE, SPEC_ID_PARTITION_CLUSTERED_DISTRIBUTION, SPEC_ID_PARTITION_FILE_POSITION_ORDERING); + table, + UPDATE, + SPEC_ID_PARTITION_CLUSTERED_DISTRIBUTION, + SPEC_ID_PARTITION_FILE_POSITION_ORDERING); } @Test public void testNonePositionDeltaUpdatePartitionedTable() { - sql("CREATE TABLE %s (id BIGINT, data STRING, date DATE, ts TIMESTAMP) " + - "USING iceberg " + - "PARTITIONED BY (date, bucket(8, data))", tableName); + sql( + "CREATE TABLE %s (id BIGINT, data STRING, date DATE, ts TIMESTAMP) " + + "USING iceberg " + + "PARTITIONED BY (date, bucket(8, data))", + tableName); Table table = validationCatalog.loadTable(tableIdent); - table.updateProperties() - .set(UPDATE_DISTRIBUTION_MODE, WRITE_DISTRIBUTION_MODE_NONE) - .commit(); + table.updateProperties().set(UPDATE_DISTRIBUTION_MODE, WRITE_DISTRIBUTION_MODE_NONE).commit(); checkPositionDeltaDistributionAndOrdering( table, UPDATE, UNSPECIFIED_DISTRIBUTION, SPEC_ID_PARTITION_FILE_POSITION_ORDERING); @@ -1534,31 +1542,34 @@ public void testNonePositionDeltaUpdatePartitionedTable() { @Test public void testHashPositionDeltaUpdatePartitionedTable() { - sql("CREATE TABLE %s (id BIGINT, data STRING, date DATE, ts TIMESTAMP) " + - "USING iceberg " + - 
"PARTITIONED BY (date, bucket(8, data))", tableName); + sql( + "CREATE TABLE %s (id BIGINT, data STRING, date DATE, ts TIMESTAMP) " + + "USING iceberg " + + "PARTITIONED BY (date, bucket(8, data))", + tableName); Table table = validationCatalog.loadTable(tableIdent); - table.updateProperties() - .set(UPDATE_DISTRIBUTION_MODE, WRITE_DISTRIBUTION_MODE_HASH) - .commit(); + table.updateProperties().set(UPDATE_DISTRIBUTION_MODE, WRITE_DISTRIBUTION_MODE_HASH).commit(); checkPositionDeltaDistributionAndOrdering( - table, UPDATE, SPEC_ID_PARTITION_CLUSTERED_DISTRIBUTION, SPEC_ID_PARTITION_FILE_POSITION_ORDERING); + table, + UPDATE, + SPEC_ID_PARTITION_CLUSTERED_DISTRIBUTION, + SPEC_ID_PARTITION_FILE_POSITION_ORDERING); } @Test public void testRangePositionDeltaUpdatePartitionedTable() { - sql("CREATE TABLE %s (id BIGINT, data STRING, date DATE, ts TIMESTAMP) " + - "USING iceberg " + - "PARTITIONED BY (date, bucket(8, data))", tableName); + sql( + "CREATE TABLE %s (id BIGINT, data STRING, date DATE, ts TIMESTAMP) " + + "USING iceberg " + + "PARTITIONED BY (date, bucket(8, data))", + tableName); Table table = validationCatalog.loadTable(tableIdent); - table.updateProperties() - .set(UPDATE_DISTRIBUTION_MODE, WRITE_DISTRIBUTION_MODE_RANGE) - .commit(); + table.updateProperties().set(UPDATE_DISTRIBUTION_MODE, WRITE_DISTRIBUTION_MODE_RANGE).commit(); Distribution expectedDistribution = Distributions.ordered(SPEC_ID_PARTITION_FILE_ORDERING); @@ -1575,7 +1586,8 @@ public void testRangePositionDeltaUpdatePartitionedTable() { // UNPARTITIONED UNORDERED // ------------------------------------------------------------------------- // merge mode is NOT SET -> use write mode - // merge mode is NONE -> unspecified distribution + LOCALLY ORDER BY _spec_id, _partition, _file, _pos + // merge mode is NONE -> unspecified distribution + LOCALLY ORDER BY _spec_id, _partition, _file, + // _pos // merge mode is HASH -> CLUSTER BY _spec_id, _partition, _file + // LOCALLY ORDER BY _spec_id, _partition, _file, _pos // merge mode is RANGE -> RANGE DISTRIBUTE BY _spec_id, _partition, _file + @@ -1617,9 +1629,7 @@ public void testNonePositionDeltaMergeUnpartitionedUnsortedTable() { Table table = validationCatalog.loadTable(tableIdent); - table.updateProperties() - .set(MERGE_DISTRIBUTION_MODE, WRITE_DISTRIBUTION_MODE_NONE) - .commit(); + table.updateProperties().set(MERGE_DISTRIBUTION_MODE, WRITE_DISTRIBUTION_MODE_NONE).commit(); checkPositionDeltaDistributionAndOrdering( table, MERGE, UNSPECIFIED_DISTRIBUTION, SPEC_ID_PARTITION_FILE_POSITION_ORDERING); @@ -1631,15 +1641,14 @@ public void testHashPositionDeltaMergeUnpartitionedUnsortedTable() { Table table = validationCatalog.loadTable(tableIdent); - table.updateProperties() - .set(MERGE_DISTRIBUTION_MODE, WRITE_DISTRIBUTION_MODE_HASH) - .commit(); + table.updateProperties().set(MERGE_DISTRIBUTION_MODE, WRITE_DISTRIBUTION_MODE_HASH).commit(); - Expression[] expectedClustering = new Expression[]{ - Expressions.column(MetadataColumns.SPEC_ID.name()), - Expressions.column(MetadataColumns.PARTITION_COLUMN_NAME), - Expressions.column(MetadataColumns.FILE_PATH.name()) - }; + Expression[] expectedClustering = + new Expression[] { + Expressions.column(MetadataColumns.SPEC_ID.name()), + Expressions.column(MetadataColumns.PARTITION_COLUMN_NAME), + Expressions.column(MetadataColumns.FILE_PATH.name()) + }; Distribution expectedDistribution = Distributions.clustered(expectedClustering); checkPositionDeltaDistributionAndOrdering( @@ -1652,15 +1661,17 @@ public void 
testRangePositionDeltaMergeUnpartitionedUnsortedTable() { Table table = validationCatalog.loadTable(tableIdent); - table.updateProperties() - .set(MERGE_DISTRIBUTION_MODE, WRITE_DISTRIBUTION_MODE_RANGE) - .commit(); + table.updateProperties().set(MERGE_DISTRIBUTION_MODE, WRITE_DISTRIBUTION_MODE_RANGE).commit(); - SortOrder[] expectedDistributionOrdering = new SortOrder[]{ - Expressions.sort(Expressions.column(MetadataColumns.SPEC_ID.name()), SortDirection.ASCENDING), - Expressions.sort(Expressions.column(MetadataColumns.PARTITION_COLUMN_NAME), SortDirection.ASCENDING), - Expressions.sort(Expressions.column(MetadataColumns.FILE_PATH.name()), SortDirection.ASCENDING) - }; + SortOrder[] expectedDistributionOrdering = + new SortOrder[] { + Expressions.sort( + Expressions.column(MetadataColumns.SPEC_ID.name()), SortDirection.ASCENDING), + Expressions.sort( + Expressions.column(MetadataColumns.PARTITION_COLUMN_NAME), SortDirection.ASCENDING), + Expressions.sort( + Expressions.column(MetadataColumns.FILE_PATH.name()), SortDirection.ASCENDING) + }; Distribution expectedDistribution = Distributions.ordered(expectedDistributionOrdering); checkPositionDeltaDistributionAndOrdering( @@ -1673,25 +1684,26 @@ public void testNonePositionDeltaMergeUnpartitionedSortedTable() { Table table = validationCatalog.loadTable(tableIdent); - table.updateProperties() - .set(MERGE_DISTRIBUTION_MODE, WRITE_DISTRIBUTION_MODE_NONE) - .commit(); + table.updateProperties().set(MERGE_DISTRIBUTION_MODE, WRITE_DISTRIBUTION_MODE_NONE).commit(); - table.replaceSortOrder() - .asc("id") - .asc("data") - .commit(); + table.replaceSortOrder().asc("id").asc("data").commit(); - SortOrder[] expectedOrdering = new SortOrder[]{ - Expressions.sort(Expressions.column(MetadataColumns.SPEC_ID.name()), SortDirection.ASCENDING), - Expressions.sort(Expressions.column(MetadataColumns.PARTITION_COLUMN_NAME), SortDirection.ASCENDING), - Expressions.sort(Expressions.column(MetadataColumns.FILE_PATH.name()), SortDirection.ASCENDING), - Expressions.sort(Expressions.column(MetadataColumns.ROW_POSITION.name()), SortDirection.ASCENDING), - Expressions.sort(Expressions.column("id"), SortDirection.ASCENDING), - Expressions.sort(Expressions.column("data"), SortDirection.ASCENDING) - }; + SortOrder[] expectedOrdering = + new SortOrder[] { + Expressions.sort( + Expressions.column(MetadataColumns.SPEC_ID.name()), SortDirection.ASCENDING), + Expressions.sort( + Expressions.column(MetadataColumns.PARTITION_COLUMN_NAME), SortDirection.ASCENDING), + Expressions.sort( + Expressions.column(MetadataColumns.FILE_PATH.name()), SortDirection.ASCENDING), + Expressions.sort( + Expressions.column(MetadataColumns.ROW_POSITION.name()), SortDirection.ASCENDING), + Expressions.sort(Expressions.column("id"), SortDirection.ASCENDING), + Expressions.sort(Expressions.column("data"), SortDirection.ASCENDING) + }; - checkPositionDeltaDistributionAndOrdering(table, MERGE, UNSPECIFIED_DISTRIBUTION, expectedOrdering); + checkPositionDeltaDistributionAndOrdering( + table, MERGE, UNSPECIFIED_DISTRIBUTION, expectedOrdering); } @Test @@ -1700,30 +1712,31 @@ public void testHashPositionDeltaMergeUnpartitionedSortedTable() { Table table = validationCatalog.loadTable(tableIdent); - table.updateProperties() - .set(MERGE_DISTRIBUTION_MODE, WRITE_DISTRIBUTION_MODE_HASH) - .commit(); + table.updateProperties().set(MERGE_DISTRIBUTION_MODE, WRITE_DISTRIBUTION_MODE_HASH).commit(); - table.replaceSortOrder() - .asc("id") - .asc("data") - .commit(); + 
table.replaceSortOrder().asc("id").asc("data").commit(); - Expression[] expectedClustering = new Expression[]{ - Expressions.column(MetadataColumns.SPEC_ID.name()), - Expressions.column(MetadataColumns.PARTITION_COLUMN_NAME), - Expressions.column(MetadataColumns.FILE_PATH.name()) - }; + Expression[] expectedClustering = + new Expression[] { + Expressions.column(MetadataColumns.SPEC_ID.name()), + Expressions.column(MetadataColumns.PARTITION_COLUMN_NAME), + Expressions.column(MetadataColumns.FILE_PATH.name()) + }; Distribution expectedDistribution = Distributions.clustered(expectedClustering); - SortOrder[] expectedOrdering = new SortOrder[]{ - Expressions.sort(Expressions.column(MetadataColumns.SPEC_ID.name()), SortDirection.ASCENDING), - Expressions.sort(Expressions.column(MetadataColumns.PARTITION_COLUMN_NAME), SortDirection.ASCENDING), - Expressions.sort(Expressions.column(MetadataColumns.FILE_PATH.name()), SortDirection.ASCENDING), - Expressions.sort(Expressions.column(MetadataColumns.ROW_POSITION.name()), SortDirection.ASCENDING), - Expressions.sort(Expressions.column("id"), SortDirection.ASCENDING), - Expressions.sort(Expressions.column("data"), SortDirection.ASCENDING) - }; + SortOrder[] expectedOrdering = + new SortOrder[] { + Expressions.sort( + Expressions.column(MetadataColumns.SPEC_ID.name()), SortDirection.ASCENDING), + Expressions.sort( + Expressions.column(MetadataColumns.PARTITION_COLUMN_NAME), SortDirection.ASCENDING), + Expressions.sort( + Expressions.column(MetadataColumns.FILE_PATH.name()), SortDirection.ASCENDING), + Expressions.sort( + Expressions.column(MetadataColumns.ROW_POSITION.name()), SortDirection.ASCENDING), + Expressions.sort(Expressions.column("id"), SortDirection.ASCENDING), + Expressions.sort(Expressions.column("data"), SortDirection.ASCENDING) + }; checkPositionDeltaDistributionAndOrdering(table, MERGE, expectedDistribution, expectedOrdering); } @@ -1734,249 +1747,295 @@ public void testRangePositionDeltaMergeUnpartitionedSortedTable() { Table table = validationCatalog.loadTable(tableIdent); - table.updateProperties() - .set(MERGE_DISTRIBUTION_MODE, WRITE_DISTRIBUTION_MODE_RANGE) - .commit(); + table.updateProperties().set(MERGE_DISTRIBUTION_MODE, WRITE_DISTRIBUTION_MODE_RANGE).commit(); - table.replaceSortOrder() - .asc("id") - .asc("data") - .commit(); + table.replaceSortOrder().asc("id").asc("data").commit(); - SortOrder[] expectedDistributionOrdering = new SortOrder[]{ - Expressions.sort(Expressions.column(MetadataColumns.SPEC_ID.name()), SortDirection.ASCENDING), - Expressions.sort(Expressions.column(MetadataColumns.PARTITION_COLUMN_NAME), SortDirection.ASCENDING), - Expressions.sort(Expressions.column(MetadataColumns.FILE_PATH.name()), SortDirection.ASCENDING), - Expressions.sort(Expressions.column("id"), SortDirection.ASCENDING), - Expressions.sort(Expressions.column("data"), SortDirection.ASCENDING) - }; + SortOrder[] expectedDistributionOrdering = + new SortOrder[] { + Expressions.sort( + Expressions.column(MetadataColumns.SPEC_ID.name()), SortDirection.ASCENDING), + Expressions.sort( + Expressions.column(MetadataColumns.PARTITION_COLUMN_NAME), SortDirection.ASCENDING), + Expressions.sort( + Expressions.column(MetadataColumns.FILE_PATH.name()), SortDirection.ASCENDING), + Expressions.sort(Expressions.column("id"), SortDirection.ASCENDING), + Expressions.sort(Expressions.column("data"), SortDirection.ASCENDING) + }; Distribution expectedDistribution = Distributions.ordered(expectedDistributionOrdering); - SortOrder[] expectedOrdering = new 
SortOrder[]{ - Expressions.sort(Expressions.column(MetadataColumns.SPEC_ID.name()), SortDirection.ASCENDING), - Expressions.sort(Expressions.column(MetadataColumns.PARTITION_COLUMN_NAME), SortDirection.ASCENDING), - Expressions.sort(Expressions.column(MetadataColumns.FILE_PATH.name()), SortDirection.ASCENDING), - Expressions.sort(Expressions.column(MetadataColumns.ROW_POSITION.name()), SortDirection.ASCENDING), - Expressions.sort(Expressions.column("id"), SortDirection.ASCENDING), - Expressions.sort(Expressions.column("data"), SortDirection.ASCENDING) - }; + SortOrder[] expectedOrdering = + new SortOrder[] { + Expressions.sort( + Expressions.column(MetadataColumns.SPEC_ID.name()), SortDirection.ASCENDING), + Expressions.sort( + Expressions.column(MetadataColumns.PARTITION_COLUMN_NAME), SortDirection.ASCENDING), + Expressions.sort( + Expressions.column(MetadataColumns.FILE_PATH.name()), SortDirection.ASCENDING), + Expressions.sort( + Expressions.column(MetadataColumns.ROW_POSITION.name()), SortDirection.ASCENDING), + Expressions.sort(Expressions.column("id"), SortDirection.ASCENDING), + Expressions.sort(Expressions.column("data"), SortDirection.ASCENDING) + }; checkPositionDeltaDistributionAndOrdering(table, MERGE, expectedDistribution, expectedOrdering); } @Test public void testNonePositionDeltaMergePartitionedUnsortedTable() { - sql("CREATE TABLE %s (id BIGINT, data STRING, date DATE, ts TIMESTAMP) " + - "USING iceberg " + - "PARTITIONED BY (date, days(ts))", tableName); - - Table table = validationCatalog.loadTable(tableIdent); + sql( + "CREATE TABLE %s (id BIGINT, data STRING, date DATE, ts TIMESTAMP) " + + "USING iceberg " + + "PARTITIONED BY (date, days(ts))", + tableName); + + Table table = validationCatalog.loadTable(tableIdent); + + table.updateProperties().set(MERGE_DISTRIBUTION_MODE, WRITE_DISTRIBUTION_MODE_NONE).commit(); + + SortOrder[] expectedOrdering = + new SortOrder[] { + Expressions.sort( + Expressions.column(MetadataColumns.SPEC_ID.name()), SortDirection.ASCENDING), + Expressions.sort( + Expressions.column(MetadataColumns.PARTITION_COLUMN_NAME), SortDirection.ASCENDING), + Expressions.sort( + Expressions.column(MetadataColumns.FILE_PATH.name()), SortDirection.ASCENDING), + Expressions.sort( + Expressions.column(MetadataColumns.ROW_POSITION.name()), SortDirection.ASCENDING), + Expressions.sort(Expressions.column("date"), SortDirection.ASCENDING), + Expressions.sort(Expressions.days("ts"), SortDirection.ASCENDING) + }; - table.updateProperties() - .set(MERGE_DISTRIBUTION_MODE, WRITE_DISTRIBUTION_MODE_NONE) - .commit(); - - SortOrder[] expectedOrdering = new SortOrder[]{ - Expressions.sort(Expressions.column(MetadataColumns.SPEC_ID.name()), SortDirection.ASCENDING), - Expressions.sort(Expressions.column(MetadataColumns.PARTITION_COLUMN_NAME), SortDirection.ASCENDING), - Expressions.sort(Expressions.column(MetadataColumns.FILE_PATH.name()), SortDirection.ASCENDING), - Expressions.sort(Expressions.column(MetadataColumns.ROW_POSITION.name()), SortDirection.ASCENDING), - Expressions.sort(Expressions.column("date"), SortDirection.ASCENDING), - Expressions.sort(Expressions.days("ts"), SortDirection.ASCENDING) - }; - - checkPositionDeltaDistributionAndOrdering(table, MERGE, UNSPECIFIED_DISTRIBUTION, expectedOrdering); + checkPositionDeltaDistributionAndOrdering( + table, MERGE, UNSPECIFIED_DISTRIBUTION, expectedOrdering); } @Test public void testHashPositionDeltaMergePartitionedUnsortedTable() { - sql("CREATE TABLE %s (id BIGINT, data STRING, date DATE, ts TIMESTAMP) " + - "USING 
iceberg " + - "PARTITIONED BY (date, days(ts))", tableName); + sql( + "CREATE TABLE %s (id BIGINT, data STRING, date DATE, ts TIMESTAMP) " + + "USING iceberg " + + "PARTITIONED BY (date, days(ts))", + tableName); Table table = validationCatalog.loadTable(tableIdent); - table.updateProperties() - .set(MERGE_DISTRIBUTION_MODE, WRITE_DISTRIBUTION_MODE_HASH) - .commit(); + table.updateProperties().set(MERGE_DISTRIBUTION_MODE, WRITE_DISTRIBUTION_MODE_HASH).commit(); - Expression[] expectedClustering = new Expression[]{ - Expressions.column(MetadataColumns.SPEC_ID.name()), - Expressions.column(MetadataColumns.PARTITION_COLUMN_NAME), - Expressions.identity("date"), - Expressions.days("ts") - }; + Expression[] expectedClustering = + new Expression[] { + Expressions.column(MetadataColumns.SPEC_ID.name()), + Expressions.column(MetadataColumns.PARTITION_COLUMN_NAME), + Expressions.identity("date"), + Expressions.days("ts") + }; Distribution expectedDistribution = Distributions.clustered(expectedClustering); - SortOrder[] expectedOrdering = new SortOrder[]{ - Expressions.sort(Expressions.column(MetadataColumns.SPEC_ID.name()), SortDirection.ASCENDING), - Expressions.sort(Expressions.column(MetadataColumns.PARTITION_COLUMN_NAME), SortDirection.ASCENDING), - Expressions.sort(Expressions.column(MetadataColumns.FILE_PATH.name()), SortDirection.ASCENDING), - Expressions.sort(Expressions.column(MetadataColumns.ROW_POSITION.name()), SortDirection.ASCENDING), - Expressions.sort(Expressions.column("date"), SortDirection.ASCENDING), - Expressions.sort(Expressions.days("ts"), SortDirection.ASCENDING) - }; + SortOrder[] expectedOrdering = + new SortOrder[] { + Expressions.sort( + Expressions.column(MetadataColumns.SPEC_ID.name()), SortDirection.ASCENDING), + Expressions.sort( + Expressions.column(MetadataColumns.PARTITION_COLUMN_NAME), SortDirection.ASCENDING), + Expressions.sort( + Expressions.column(MetadataColumns.FILE_PATH.name()), SortDirection.ASCENDING), + Expressions.sort( + Expressions.column(MetadataColumns.ROW_POSITION.name()), SortDirection.ASCENDING), + Expressions.sort(Expressions.column("date"), SortDirection.ASCENDING), + Expressions.sort(Expressions.days("ts"), SortDirection.ASCENDING) + }; checkPositionDeltaDistributionAndOrdering(table, MERGE, expectedDistribution, expectedOrdering); } @Test public void testRangePositionDeltaMergePartitionedUnsortedTable() { - sql("CREATE TABLE %s (id BIGINT, data STRING, date DATE, ts TIMESTAMP) " + - "USING iceberg " + - "PARTITIONED BY (date, days(ts))", tableName); - - Table table = validationCatalog.loadTable(tableIdent); - - table.updateProperties() - .set(MERGE_DISTRIBUTION_MODE, WRITE_DISTRIBUTION_MODE_RANGE) - .commit(); - - SortOrder[] expectedDistributionOrdering = new SortOrder[]{ - Expressions.sort(Expressions.column(MetadataColumns.SPEC_ID.name()), SortDirection.ASCENDING), - Expressions.sort(Expressions.column(MetadataColumns.PARTITION_COLUMN_NAME), SortDirection.ASCENDING), - Expressions.sort(Expressions.column(MetadataColumns.FILE_PATH.name()), SortDirection.ASCENDING), - Expressions.sort(Expressions.column("date"), SortDirection.ASCENDING), - Expressions.sort(Expressions.days("ts"), SortDirection.ASCENDING) - }; + sql( + "CREATE TABLE %s (id BIGINT, data STRING, date DATE, ts TIMESTAMP) " + + "USING iceberg " + + "PARTITIONED BY (date, days(ts))", + tableName); + + Table table = validationCatalog.loadTable(tableIdent); + + table.updateProperties().set(MERGE_DISTRIBUTION_MODE, WRITE_DISTRIBUTION_MODE_RANGE).commit(); + + SortOrder[] 
expectedDistributionOrdering = + new SortOrder[] { + Expressions.sort( + Expressions.column(MetadataColumns.SPEC_ID.name()), SortDirection.ASCENDING), + Expressions.sort( + Expressions.column(MetadataColumns.PARTITION_COLUMN_NAME), SortDirection.ASCENDING), + Expressions.sort( + Expressions.column(MetadataColumns.FILE_PATH.name()), SortDirection.ASCENDING), + Expressions.sort(Expressions.column("date"), SortDirection.ASCENDING), + Expressions.sort(Expressions.days("ts"), SortDirection.ASCENDING) + }; Distribution expectedDistribution = Distributions.ordered(expectedDistributionOrdering); - SortOrder[] expectedOrdering = new SortOrder[]{ - Expressions.sort(Expressions.column(MetadataColumns.SPEC_ID.name()), SortDirection.ASCENDING), - Expressions.sort(Expressions.column(MetadataColumns.PARTITION_COLUMN_NAME), SortDirection.ASCENDING), - Expressions.sort(Expressions.column(MetadataColumns.FILE_PATH.name()), SortDirection.ASCENDING), - Expressions.sort(Expressions.column(MetadataColumns.ROW_POSITION.name()), SortDirection.ASCENDING), - Expressions.sort(Expressions.column("date"), SortDirection.ASCENDING), - Expressions.sort(Expressions.days("ts"), SortDirection.ASCENDING) - }; + SortOrder[] expectedOrdering = + new SortOrder[] { + Expressions.sort( + Expressions.column(MetadataColumns.SPEC_ID.name()), SortDirection.ASCENDING), + Expressions.sort( + Expressions.column(MetadataColumns.PARTITION_COLUMN_NAME), SortDirection.ASCENDING), + Expressions.sort( + Expressions.column(MetadataColumns.FILE_PATH.name()), SortDirection.ASCENDING), + Expressions.sort( + Expressions.column(MetadataColumns.ROW_POSITION.name()), SortDirection.ASCENDING), + Expressions.sort(Expressions.column("date"), SortDirection.ASCENDING), + Expressions.sort(Expressions.days("ts"), SortDirection.ASCENDING) + }; checkPositionDeltaDistributionAndOrdering(table, MERGE, expectedDistribution, expectedOrdering); } @Test public void testNonePositionDeltaMergePartitionedSortedTable() { - sql("CREATE TABLE %s (id BIGINT, data STRING, date DATE, ts TIMESTAMP) " + - "USING iceberg " + - "PARTITIONED BY (date)", tableName); + sql( + "CREATE TABLE %s (id BIGINT, data STRING, date DATE, ts TIMESTAMP) " + + "USING iceberg " + + "PARTITIONED BY (date)", + tableName); Table table = validationCatalog.loadTable(tableIdent); - table.updateProperties() - .set(MERGE_DISTRIBUTION_MODE, WRITE_DISTRIBUTION_MODE_NONE) - .commit(); + table.updateProperties().set(MERGE_DISTRIBUTION_MODE, WRITE_DISTRIBUTION_MODE_NONE).commit(); - table.replaceSortOrder() - .desc("id") - .commit(); + table.replaceSortOrder().desc("id").commit(); - SortOrder[] expectedOrdering = new SortOrder[]{ - Expressions.sort(Expressions.column(MetadataColumns.SPEC_ID.name()), SortDirection.ASCENDING), - Expressions.sort(Expressions.column(MetadataColumns.PARTITION_COLUMN_NAME), SortDirection.ASCENDING), - Expressions.sort(Expressions.column(MetadataColumns.FILE_PATH.name()), SortDirection.ASCENDING), - Expressions.sort(Expressions.column(MetadataColumns.ROW_POSITION.name()), SortDirection.ASCENDING), - Expressions.sort(Expressions.column("date"), SortDirection.ASCENDING), - Expressions.sort(Expressions.column("id"), SortDirection.DESCENDING) - }; + SortOrder[] expectedOrdering = + new SortOrder[] { + Expressions.sort( + Expressions.column(MetadataColumns.SPEC_ID.name()), SortDirection.ASCENDING), + Expressions.sort( + Expressions.column(MetadataColumns.PARTITION_COLUMN_NAME), SortDirection.ASCENDING), + Expressions.sort( + Expressions.column(MetadataColumns.FILE_PATH.name()), 
SortDirection.ASCENDING), + Expressions.sort( + Expressions.column(MetadataColumns.ROW_POSITION.name()), SortDirection.ASCENDING), + Expressions.sort(Expressions.column("date"), SortDirection.ASCENDING), + Expressions.sort(Expressions.column("id"), SortDirection.DESCENDING) + }; - checkPositionDeltaDistributionAndOrdering(table, MERGE, UNSPECIFIED_DISTRIBUTION, expectedOrdering); + checkPositionDeltaDistributionAndOrdering( + table, MERGE, UNSPECIFIED_DISTRIBUTION, expectedOrdering); } @Test public void testHashPositionDeltaMergePartitionedSortedTable() { - sql("CREATE TABLE %s (id BIGINT, data STRING, date DATE, ts TIMESTAMP) " + - "USING iceberg " + - "PARTITIONED BY (date, bucket(8, data))", tableName); + sql( + "CREATE TABLE %s (id BIGINT, data STRING, date DATE, ts TIMESTAMP) " + + "USING iceberg " + + "PARTITIONED BY (date, bucket(8, data))", + tableName); Table table = validationCatalog.loadTable(tableIdent); - table.updateProperties() - .set(MERGE_DISTRIBUTION_MODE, WRITE_DISTRIBUTION_MODE_HASH) - .commit(); + table.updateProperties().set(MERGE_DISTRIBUTION_MODE, WRITE_DISTRIBUTION_MODE_HASH).commit(); - table.replaceSortOrder() - .asc("id") - .commit(); + table.replaceSortOrder().asc("id").commit(); - Expression[] expectedClustering = new Expression[]{ - Expressions.column(MetadataColumns.SPEC_ID.name()), - Expressions.column(MetadataColumns.PARTITION_COLUMN_NAME), - Expressions.identity("date"), - Expressions.bucket(8, "data") - }; + Expression[] expectedClustering = + new Expression[] { + Expressions.column(MetadataColumns.SPEC_ID.name()), + Expressions.column(MetadataColumns.PARTITION_COLUMN_NAME), + Expressions.identity("date"), + Expressions.bucket(8, "data") + }; Distribution expectedDistribution = Distributions.clustered(expectedClustering); - SortOrder[] expectedOrdering = new SortOrder[]{ - Expressions.sort(Expressions.column(MetadataColumns.SPEC_ID.name()), SortDirection.ASCENDING), - Expressions.sort(Expressions.column(MetadataColumns.PARTITION_COLUMN_NAME), SortDirection.ASCENDING), - Expressions.sort(Expressions.column(MetadataColumns.FILE_PATH.name()), SortDirection.ASCENDING), - Expressions.sort(Expressions.column(MetadataColumns.ROW_POSITION.name()), SortDirection.ASCENDING), - Expressions.sort(Expressions.column("date"), SortDirection.ASCENDING), - Expressions.sort(Expressions.bucket(8, "data"), SortDirection.ASCENDING), - Expressions.sort(Expressions.column("id"), SortDirection.ASCENDING) - }; + SortOrder[] expectedOrdering = + new SortOrder[] { + Expressions.sort( + Expressions.column(MetadataColumns.SPEC_ID.name()), SortDirection.ASCENDING), + Expressions.sort( + Expressions.column(MetadataColumns.PARTITION_COLUMN_NAME), SortDirection.ASCENDING), + Expressions.sort( + Expressions.column(MetadataColumns.FILE_PATH.name()), SortDirection.ASCENDING), + Expressions.sort( + Expressions.column(MetadataColumns.ROW_POSITION.name()), SortDirection.ASCENDING), + Expressions.sort(Expressions.column("date"), SortDirection.ASCENDING), + Expressions.sort(Expressions.bucket(8, "data"), SortDirection.ASCENDING), + Expressions.sort(Expressions.column("id"), SortDirection.ASCENDING) + }; checkPositionDeltaDistributionAndOrdering(table, MERGE, expectedDistribution, expectedOrdering); } @Test public void testRangePositionDeltaMergePartitionedSortedTable() { - sql("CREATE TABLE %s (id BIGINT, data STRING, date DATE, ts TIMESTAMP) " + - "USING iceberg " + - "PARTITIONED BY (date)", tableName); + sql( + "CREATE TABLE %s (id BIGINT, data STRING, date DATE, ts TIMESTAMP) " + + "USING 
iceberg " + + "PARTITIONED BY (date)", + tableName); Table table = validationCatalog.loadTable(tableIdent); - table.updateProperties() - .set(MERGE_DISTRIBUTION_MODE, WRITE_DISTRIBUTION_MODE_RANGE) - .commit(); + table.updateProperties().set(MERGE_DISTRIBUTION_MODE, WRITE_DISTRIBUTION_MODE_RANGE).commit(); - table.replaceSortOrder() - .asc("id") - .commit(); + table.replaceSortOrder().asc("id").commit(); - SortOrder[] expectedDistributionOrdering = new SortOrder[]{ - Expressions.sort(Expressions.column(MetadataColumns.SPEC_ID.name()), SortDirection.ASCENDING), - Expressions.sort(Expressions.column(MetadataColumns.PARTITION_COLUMN_NAME), SortDirection.ASCENDING), - Expressions.sort(Expressions.column(MetadataColumns.FILE_PATH.name()), SortDirection.ASCENDING), - Expressions.sort(Expressions.column("date"), SortDirection.ASCENDING), - Expressions.sort(Expressions.column("id"), SortDirection.ASCENDING) - }; + SortOrder[] expectedDistributionOrdering = + new SortOrder[] { + Expressions.sort( + Expressions.column(MetadataColumns.SPEC_ID.name()), SortDirection.ASCENDING), + Expressions.sort( + Expressions.column(MetadataColumns.PARTITION_COLUMN_NAME), SortDirection.ASCENDING), + Expressions.sort( + Expressions.column(MetadataColumns.FILE_PATH.name()), SortDirection.ASCENDING), + Expressions.sort(Expressions.column("date"), SortDirection.ASCENDING), + Expressions.sort(Expressions.column("id"), SortDirection.ASCENDING) + }; Distribution expectedDistribution = Distributions.ordered(expectedDistributionOrdering); - SortOrder[] expectedOrdering = new SortOrder[]{ - Expressions.sort(Expressions.column(MetadataColumns.SPEC_ID.name()), SortDirection.ASCENDING), - Expressions.sort(Expressions.column(MetadataColumns.PARTITION_COLUMN_NAME), SortDirection.ASCENDING), - Expressions.sort(Expressions.column(MetadataColumns.FILE_PATH.name()), SortDirection.ASCENDING), - Expressions.sort(Expressions.column(MetadataColumns.ROW_POSITION.name()), SortDirection.ASCENDING), - Expressions.sort(Expressions.column("date"), SortDirection.ASCENDING), - Expressions.sort(Expressions.column("id"), SortDirection.ASCENDING) - }; + SortOrder[] expectedOrdering = + new SortOrder[] { + Expressions.sort( + Expressions.column(MetadataColumns.SPEC_ID.name()), SortDirection.ASCENDING), + Expressions.sort( + Expressions.column(MetadataColumns.PARTITION_COLUMN_NAME), SortDirection.ASCENDING), + Expressions.sort( + Expressions.column(MetadataColumns.FILE_PATH.name()), SortDirection.ASCENDING), + Expressions.sort( + Expressions.column(MetadataColumns.ROW_POSITION.name()), SortDirection.ASCENDING), + Expressions.sort(Expressions.column("date"), SortDirection.ASCENDING), + Expressions.sort(Expressions.column("id"), SortDirection.ASCENDING) + }; checkPositionDeltaDistributionAndOrdering(table, MERGE, expectedDistribution, expectedOrdering); } - private void checkWriteDistributionAndOrdering(Table table, Distribution expectedDistribution, - SortOrder[] expectedOrdering) { + private void checkWriteDistributionAndOrdering( + Table table, Distribution expectedDistribution, SortOrder[] expectedOrdering) { SparkWriteConf writeConf = new SparkWriteConf(spark, table, ImmutableMap.of()); DistributionMode distributionMode = writeConf.distributionMode(); - Distribution distribution = SparkDistributionAndOrderingUtil.buildRequiredDistribution(table, distributionMode); + Distribution distribution = + SparkDistributionAndOrderingUtil.buildRequiredDistribution(table, distributionMode); Assert.assertEquals("Distribution must match", expectedDistribution, 
distribution); - SortOrder[] ordering = SparkDistributionAndOrderingUtil.buildRequiredOrdering(table, distribution); + SortOrder[] ordering = + SparkDistributionAndOrderingUtil.buildRequiredOrdering(table, distribution); Assert.assertArrayEquals("Ordering must match", expectedOrdering, ordering); } - private void checkCopyOnWriteDistributionAndOrdering(Table table, Command command, - Distribution expectedDistribution, - SortOrder[] expectedOrdering) { + private void checkCopyOnWriteDistributionAndOrdering( + Table table, + Command command, + Distribution expectedDistribution, + SortOrder[] expectedOrdering) { SparkWriteConf writeConf = new SparkWriteConf(spark, table, ImmutableMap.of()); DistributionMode mode = copyOnWriteDistributionMode(command, writeConf); - Distribution distribution = SparkDistributionAndOrderingUtil.buildCopyOnWriteDistribution(table, command, mode); + Distribution distribution = + SparkDistributionAndOrderingUtil.buildCopyOnWriteDistribution(table, command, mode); Assert.assertEquals("Distribution must match", expectedDistribution, distribution); - SortOrder[] ordering = SparkDistributionAndOrderingUtil.buildCopyOnWriteOrdering(table, command, distribution); + SortOrder[] ordering = + SparkDistributionAndOrderingUtil.buildCopyOnWriteOrdering(table, command, distribution); Assert.assertArrayEquals("Ordering must match", expectedOrdering, ordering); } @@ -1993,21 +2052,26 @@ private DistributionMode copyOnWriteDistributionMode(Command command, SparkWrite } } - private void checkPositionDeltaDistributionAndOrdering(Table table, Command command, - Distribution expectedDistribution, - SortOrder[] expectedOrdering) { + private void checkPositionDeltaDistributionAndOrdering( + Table table, + Command command, + Distribution expectedDistribution, + SortOrder[] expectedOrdering) { SparkWriteConf writeConf = new SparkWriteConf(spark, table, ImmutableMap.of()); DistributionMode mode = positionDeltaDistributionMode(command, writeConf); - Distribution distribution = SparkDistributionAndOrderingUtil.buildPositionDeltaDistribution(table, command, mode); + Distribution distribution = + SparkDistributionAndOrderingUtil.buildPositionDeltaDistribution(table, command, mode); Assert.assertEquals("Distribution must match", expectedDistribution, distribution); - SortOrder[] ordering = SparkDistributionAndOrderingUtil.buildPositionDeltaOrdering(table, command); + SortOrder[] ordering = + SparkDistributionAndOrderingUtil.buildPositionDeltaOrdering(table, command); Assert.assertArrayEquals("Ordering must match", expectedOrdering, ordering); } - private DistributionMode positionDeltaDistributionMode(Command command, SparkWriteConf writeConf) { + private DistributionMode positionDeltaDistributionMode( + Command command, SparkWriteConf writeConf) { switch (command) { case DELETE: return writeConf.deleteDistributionMode(); diff --git a/spark/v3.3/spark/src/test/java/org/apache/iceberg/spark/TestSparkFilters.java b/spark/v3.3/spark/src/test/java/org/apache/iceberg/spark/TestSparkFilters.java index ccd526a54618..2e56b6aa91b0 100644 --- a/spark/v3.3/spark/src/test/java/org/apache/iceberg/spark/TestSparkFilters.java +++ b/spark/v3.3/spark/src/test/java/org/apache/iceberg/spark/TestSparkFilters.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.spark; import java.sql.Date; @@ -53,52 +52,60 @@ public void testQuotedAttributes() { attrMap.put("`d`.b.`dd```", "d.b.dd`"); attrMap.put("a.`aa```.c", "a.aa`.c"); - attrMap.forEach((quoted, unquoted) -> { - IsNull isNull = IsNull.apply(quoted); - Expression expectedIsNull = Expressions.isNull(unquoted); - Expression actualIsNull = SparkFilters.convert(isNull); - Assert.assertEquals("IsNull must match", expectedIsNull.toString(), actualIsNull.toString()); - - IsNotNull isNotNull = IsNotNull.apply(quoted); - Expression expectedIsNotNull = Expressions.notNull(unquoted); - Expression actualIsNotNull = SparkFilters.convert(isNotNull); - Assert.assertEquals("IsNotNull must match", expectedIsNotNull.toString(), actualIsNotNull.toString()); - - LessThan lt = LessThan.apply(quoted, 1); - Expression expectedLt = Expressions.lessThan(unquoted, 1); - Expression actualLt = SparkFilters.convert(lt); - Assert.assertEquals("LessThan must match", expectedLt.toString(), actualLt.toString()); - - LessThanOrEqual ltEq = LessThanOrEqual.apply(quoted, 1); - Expression expectedLtEq = Expressions.lessThanOrEqual(unquoted, 1); - Expression actualLtEq = SparkFilters.convert(ltEq); - Assert.assertEquals("LessThanOrEqual must match", expectedLtEq.toString(), actualLtEq.toString()); - - GreaterThan gt = GreaterThan.apply(quoted, 1); - Expression expectedGt = Expressions.greaterThan(unquoted, 1); - Expression actualGt = SparkFilters.convert(gt); - Assert.assertEquals("GreaterThan must match", expectedGt.toString(), actualGt.toString()); - - GreaterThanOrEqual gtEq = GreaterThanOrEqual.apply(quoted, 1); - Expression expectedGtEq = Expressions.greaterThanOrEqual(unquoted, 1); - Expression actualGtEq = SparkFilters.convert(gtEq); - Assert.assertEquals("GreaterThanOrEqual must match", expectedGtEq.toString(), actualGtEq.toString()); - - EqualTo eq = EqualTo.apply(quoted, 1); - Expression expectedEq = Expressions.equal(unquoted, 1); - Expression actualEq = SparkFilters.convert(eq); - Assert.assertEquals("EqualTo must match", expectedEq.toString(), actualEq.toString()); - - EqualNullSafe eqNullSafe = EqualNullSafe.apply(quoted, 1); - Expression expectedEqNullSafe = Expressions.equal(unquoted, 1); - Expression actualEqNullSafe = SparkFilters.convert(eqNullSafe); - Assert.assertEquals("EqualNullSafe must match", expectedEqNullSafe.toString(), actualEqNullSafe.toString()); - - In in = In.apply(quoted, new Integer[]{1}); - Expression expectedIn = Expressions.in(unquoted, 1); - Expression actualIn = SparkFilters.convert(in); - Assert.assertEquals("In must match", expectedIn.toString(), actualIn.toString()); - }); + attrMap.forEach( + (quoted, unquoted) -> { + IsNull isNull = IsNull.apply(quoted); + Expression expectedIsNull = Expressions.isNull(unquoted); + Expression actualIsNull = SparkFilters.convert(isNull); + Assert.assertEquals( + "IsNull must match", expectedIsNull.toString(), actualIsNull.toString()); + + IsNotNull isNotNull = IsNotNull.apply(quoted); + Expression expectedIsNotNull = Expressions.notNull(unquoted); + Expression actualIsNotNull = SparkFilters.convert(isNotNull); + Assert.assertEquals( + "IsNotNull must match", expectedIsNotNull.toString(), actualIsNotNull.toString()); + + LessThan lt = LessThan.apply(quoted, 1); + Expression expectedLt = Expressions.lessThan(unquoted, 1); + Expression actualLt = SparkFilters.convert(lt); + Assert.assertEquals("LessThan must match", expectedLt.toString(), actualLt.toString()); + + LessThanOrEqual ltEq = LessThanOrEqual.apply(quoted, 1); + 
Expression expectedLtEq = Expressions.lessThanOrEqual(unquoted, 1); + Expression actualLtEq = SparkFilters.convert(ltEq); + Assert.assertEquals( + "LessThanOrEqual must match", expectedLtEq.toString(), actualLtEq.toString()); + + GreaterThan gt = GreaterThan.apply(quoted, 1); + Expression expectedGt = Expressions.greaterThan(unquoted, 1); + Expression actualGt = SparkFilters.convert(gt); + Assert.assertEquals("GreaterThan must match", expectedGt.toString(), actualGt.toString()); + + GreaterThanOrEqual gtEq = GreaterThanOrEqual.apply(quoted, 1); + Expression expectedGtEq = Expressions.greaterThanOrEqual(unquoted, 1); + Expression actualGtEq = SparkFilters.convert(gtEq); + Assert.assertEquals( + "GreaterThanOrEqual must match", expectedGtEq.toString(), actualGtEq.toString()); + + EqualTo eq = EqualTo.apply(quoted, 1); + Expression expectedEq = Expressions.equal(unquoted, 1); + Expression actualEq = SparkFilters.convert(eq); + Assert.assertEquals("EqualTo must match", expectedEq.toString(), actualEq.toString()); + + EqualNullSafe eqNullSafe = EqualNullSafe.apply(quoted, 1); + Expression expectedEqNullSafe = Expressions.equal(unquoted, 1); + Expression actualEqNullSafe = SparkFilters.convert(eqNullSafe); + Assert.assertEquals( + "EqualNullSafe must match", + expectedEqNullSafe.toString(), + actualEqNullSafe.toString()); + + In in = In.apply(quoted, new Integer[] {1}); + Expression expectedIn = Expressions.in(unquoted, 1); + Expression actualIn = SparkFilters.convert(in); + Assert.assertEquals("In must match", expectedIn.toString(), actualIn.toString()); + }); } @Test @@ -111,10 +118,14 @@ public void testTimestampFilterConversion() { Expression timestampExpression = SparkFilters.convert(GreaterThan.apply("x", timestamp)); Expression rawExpression = Expressions.greaterThan("x", epochMicros); - Assert.assertEquals("Generated Timestamp expression should be correct", - rawExpression.toString(), timestampExpression.toString()); - Assert.assertEquals("Generated Instant expression should be correct", - rawExpression.toString(), instantExpression.toString()); + Assert.assertEquals( + "Generated Timestamp expression should be correct", + rawExpression.toString(), + timestampExpression.toString()); + Assert.assertEquals( + "Generated Instant expression should be correct", + rawExpression.toString(), + instantExpression.toString()); } @Test @@ -127,25 +138,31 @@ public void testDateFilterConversion() { Expression dateExpression = SparkFilters.convert(GreaterThan.apply("x", date)); Expression rawExpression = Expressions.greaterThan("x", epochDay); - Assert.assertEquals("Generated localdate expression should be correct", - rawExpression.toString(), localDateExpression.toString()); + Assert.assertEquals( + "Generated localdate expression should be correct", + rawExpression.toString(), + localDateExpression.toString()); - Assert.assertEquals("Generated date expression should be correct", - rawExpression.toString(), dateExpression.toString()); + Assert.assertEquals( + "Generated date expression should be correct", + rawExpression.toString(), + dateExpression.toString()); } @Test public void testNestedInInsideNot() { - Not filter = Not.apply(And.apply(EqualTo.apply("col1", 1), In.apply("col2", new Integer[]{1, 2}))); + Not filter = + Not.apply(And.apply(EqualTo.apply("col1", 1), In.apply("col2", new Integer[] {1, 2}))); Expression converted = SparkFilters.convert(filter); Assert.assertNull("Expression should not be converted", converted); } @Test public void testNotIn() { - Not filter = 
Not.apply(In.apply("col", new Integer[]{1, 2})); + Not filter = Not.apply(In.apply("col", new Integer[] {1, 2})); Expression actual = SparkFilters.convert(filter); - Expression expected = Expressions.and(Expressions.notNull("col"), Expressions.notIn("col", 1, 2)); + Expression expected = + Expressions.and(Expressions.notNull("col"), Expressions.notIn("col", 1, 2)); Assert.assertEquals("Expressions should match", expected.toString(), actual.toString()); } } diff --git a/spark/v3.3/spark/src/test/java/org/apache/iceberg/spark/TestSparkSchemaUtil.java b/spark/v3.3/spark/src/test/java/org/apache/iceberg/spark/TestSparkSchemaUtil.java index 38fe5734fe5f..259f7c3dd789 100644 --- a/spark/v3.3/spark/src/test/java/org/apache/iceberg/spark/TestSparkSchemaUtil.java +++ b/spark/v3.3/spark/src/test/java/org/apache/iceberg/spark/TestSparkSchemaUtil.java @@ -16,9 +16,10 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark; +import static org.apache.iceberg.types.Types.NestedField.optional; + import java.io.IOException; import java.util.List; import org.apache.iceberg.MetadataColumns; @@ -30,32 +31,30 @@ import org.junit.Assert; import org.junit.Test; -import static org.apache.iceberg.types.Types.NestedField.optional; - public class TestSparkSchemaUtil { - private static final Schema TEST_SCHEMA = new Schema( - optional(1, "id", Types.IntegerType.get()), - optional(2, "data", Types.StringType.get()) - ); + private static final Schema TEST_SCHEMA = + new Schema( + optional(1, "id", Types.IntegerType.get()), optional(2, "data", Types.StringType.get())); - private static final Schema TEST_SCHEMA_WITH_METADATA_COLS = new Schema( - optional(1, "id", Types.IntegerType.get()), - optional(2, "data", Types.StringType.get()), - MetadataColumns.FILE_PATH, - MetadataColumns.ROW_POSITION - ); + private static final Schema TEST_SCHEMA_WITH_METADATA_COLS = + new Schema( + optional(1, "id", Types.IntegerType.get()), + optional(2, "data", Types.StringType.get()), + MetadataColumns.FILE_PATH, + MetadataColumns.ROW_POSITION); @Test public void testEstimateSizeMaxValue() throws IOException { - Assert.assertEquals("estimateSize returns Long max value", Long.MAX_VALUE, - SparkSchemaUtil.estimateSize( - null, - Long.MAX_VALUE)); + Assert.assertEquals( + "estimateSize returns Long max value", + Long.MAX_VALUE, + SparkSchemaUtil.estimateSize(null, Long.MAX_VALUE)); } @Test public void testEstimateSizeWithOverflow() throws IOException { - long tableSize = SparkSchemaUtil.estimateSize(SparkSchemaUtil.convert(TEST_SCHEMA), Long.MAX_VALUE - 1); + long tableSize = + SparkSchemaUtil.estimateSize(SparkSchemaUtil.convert(TEST_SCHEMA), Long.MAX_VALUE - 1); Assert.assertEquals("estimateSize handles overflow", Long.MAX_VALUE, tableSize); } @@ -68,13 +67,16 @@ public void testEstimateSize() throws IOException { @Test public void testSchemaConversionWithMetaDataColumnSchema() { StructType structType = SparkSchemaUtil.convert(TEST_SCHEMA_WITH_METADATA_COLS); - List attrRefs = scala.collection.JavaConverters.seqAsJavaList(structType.toAttributes()); + List attrRefs = + scala.collection.JavaConverters.seqAsJavaList(structType.toAttributes()); for (AttributeReference attrRef : attrRefs) { if (MetadataColumns.isMetadataColumn(attrRef.name())) { - Assert.assertTrue("metadata columns should have __metadata_col in attribute metadata", + Assert.assertTrue( + "metadata columns should have __metadata_col in attribute metadata", MetadataAttribute.unapply(attrRef).isDefined()); } else { - 
Assert.assertFalse("non metadata columns should not have __metadata_col in attribute metadata", + Assert.assertFalse( + "non metadata columns should not have __metadata_col in attribute metadata", MetadataAttribute.unapply(attrRef).isDefined()); } } diff --git a/spark/v3.3/spark/src/test/java/org/apache/iceberg/spark/TestSparkSessionCatalog.java b/spark/v3.3/spark/src/test/java/org/apache/iceberg/spark/TestSparkSessionCatalog.java index 3ab2d6b23a6f..6737dc64ff7e 100644 --- a/spark/v3.3/spark/src/test/java/org/apache/iceberg/spark/TestSparkSessionCatalog.java +++ b/spark/v3.3/spark/src/test/java/org/apache/iceberg/spark/TestSparkSessionCatalog.java @@ -16,14 +16,13 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark; +import static org.apache.hadoop.hive.conf.HiveConf.ConfVars.METASTOREURIS; + import org.junit.Assert; import org.junit.Test; -import static org.apache.hadoop.hive.conf.HiveConf.ConfVars.METASTOREURIS; - public class TestSparkSessionCatalog extends SparkTestBase { @Test public void testValidateHmsUri() { @@ -31,36 +30,60 @@ public void testValidateHmsUri() { String catalogHmsUriKey = "spark.sql.catalog.spark_catalog.uri"; String hmsUri = hiveConf.get(METASTOREURIS.varname); - spark.conf().set("spark.sql.catalog.spark_catalog", "org.apache.iceberg.spark.SparkSessionCatalog"); + spark + .conf() + .set("spark.sql.catalog.spark_catalog", "org.apache.iceberg.spark.SparkSessionCatalog"); spark.conf().set("spark.sql.catalog.spark_catalog.type", "hive"); // HMS uris match spark.sessionState().catalogManager().reset(); spark.conf().set(envHmsUriKey, hmsUri); spark.conf().set(catalogHmsUriKey, hmsUri); - Assert.assertTrue(spark.sessionState().catalogManager().v2SessionCatalog().defaultNamespace()[0].equals("default")); + Assert.assertTrue( + spark + .sessionState() + .catalogManager() + .v2SessionCatalog() + .defaultNamespace()[0] + .equals("default")); // HMS uris doesn't match spark.sessionState().catalogManager().reset(); String catalogHmsUri = "RandomString"; spark.conf().set(envHmsUriKey, hmsUri); spark.conf().set(catalogHmsUriKey, catalogHmsUri); - IllegalArgumentException exception = Assert.assertThrows(IllegalArgumentException.class, - () -> spark.sessionState().catalogManager().v2SessionCatalog()); - String errorMessage = String.format("Inconsistent Hive metastore URIs: %s (Spark session) != %s (spark_catalog)", - hmsUri, catalogHmsUri); + IllegalArgumentException exception = + Assert.assertThrows( + IllegalArgumentException.class, + () -> spark.sessionState().catalogManager().v2SessionCatalog()); + String errorMessage = + String.format( + "Inconsistent Hive metastore URIs: %s (Spark session) != %s (spark_catalog)", + hmsUri, catalogHmsUri); Assert.assertEquals(errorMessage, exception.getMessage()); // no env HMS uri, only catalog HMS uri spark.sessionState().catalogManager().reset(); spark.conf().set(catalogHmsUriKey, hmsUri); spark.conf().unset(envHmsUriKey); - Assert.assertTrue(spark.sessionState().catalogManager().v2SessionCatalog().defaultNamespace()[0].equals("default")); + Assert.assertTrue( + spark + .sessionState() + .catalogManager() + .v2SessionCatalog() + .defaultNamespace()[0] + .equals("default")); // no catalog HMS uri, only env HMS uri spark.sessionState().catalogManager().reset(); spark.conf().set(envHmsUriKey, hmsUri); spark.conf().unset(catalogHmsUriKey); - Assert.assertTrue(spark.sessionState().catalogManager().v2SessionCatalog().defaultNamespace()[0].equals("default")); + Assert.assertTrue( 
+ spark + .sessionState() + .catalogManager() + .v2SessionCatalog() + .defaultNamespace()[0] + .equals("default")); } } diff --git a/spark/v3.3/spark/src/test/java/org/apache/iceberg/spark/TestSparkTableUtil.java b/spark/v3.3/spark/src/test/java/org/apache/iceberg/spark/TestSparkTableUtil.java index 61af0185d8bd..1e51caadd0e9 100644 --- a/spark/v3.3/spark/src/test/java/org/apache/iceberg/spark/TestSparkTableUtil.java +++ b/spark/v3.3/spark/src/test/java/org/apache/iceberg/spark/TestSparkTableUtil.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark; import java.io.IOException; @@ -57,31 +56,47 @@ public void testSparkPartitionJavaSerialization() throws IOException, ClassNotFo @Test public void testMetricsConfigKryoSerialization() throws Exception { - Map metricsConfig = ImmutableMap.of( - TableProperties.DEFAULT_WRITE_METRICS_MODE, "counts", - TableProperties.METRICS_MODE_COLUMN_CONF_PREFIX + "col1", "full", - TableProperties.METRICS_MODE_COLUMN_CONF_PREFIX + "col2", "truncate(16)"); + Map metricsConfig = + ImmutableMap.of( + TableProperties.DEFAULT_WRITE_METRICS_MODE, + "counts", + TableProperties.METRICS_MODE_COLUMN_CONF_PREFIX + "col1", + "full", + TableProperties.METRICS_MODE_COLUMN_CONF_PREFIX + "col2", + "truncate(16)"); MetricsConfig config = MetricsConfig.fromProperties(metricsConfig); MetricsConfig deserialized = KryoHelpers.roundTripSerialize(config); - Assert.assertEquals(MetricsModes.Full.get().toString(), deserialized.columnMode("col1").toString()); - Assert.assertEquals(MetricsModes.Truncate.withLength(16).toString(), deserialized.columnMode("col2").toString()); - Assert.assertEquals(MetricsModes.Counts.get().toString(), deserialized.columnMode("col3").toString()); + Assert.assertEquals( + MetricsModes.Full.get().toString(), deserialized.columnMode("col1").toString()); + Assert.assertEquals( + MetricsModes.Truncate.withLength(16).toString(), + deserialized.columnMode("col2").toString()); + Assert.assertEquals( + MetricsModes.Counts.get().toString(), deserialized.columnMode("col3").toString()); } @Test public void testMetricsConfigJavaSerialization() throws Exception { - Map metricsConfig = ImmutableMap.of( - TableProperties.DEFAULT_WRITE_METRICS_MODE, "counts", - TableProperties.METRICS_MODE_COLUMN_CONF_PREFIX + "col1", "full", - TableProperties.METRICS_MODE_COLUMN_CONF_PREFIX + "col2", "truncate(16)"); + Map metricsConfig = + ImmutableMap.of( + TableProperties.DEFAULT_WRITE_METRICS_MODE, + "counts", + TableProperties.METRICS_MODE_COLUMN_CONF_PREFIX + "col1", + "full", + TableProperties.METRICS_MODE_COLUMN_CONF_PREFIX + "col2", + "truncate(16)"); MetricsConfig config = MetricsConfig.fromProperties(metricsConfig); MetricsConfig deserialized = TestHelpers.roundTripSerialize(config); - Assert.assertEquals(MetricsModes.Full.get().toString(), deserialized.columnMode("col1").toString()); - Assert.assertEquals(MetricsModes.Truncate.withLength(16).toString(), deserialized.columnMode("col2").toString()); - Assert.assertEquals(MetricsModes.Counts.get().toString(), deserialized.columnMode("col3").toString()); + Assert.assertEquals( + MetricsModes.Full.get().toString(), deserialized.columnMode("col1").toString()); + Assert.assertEquals( + MetricsModes.Truncate.withLength(16).toString(), + deserialized.columnMode("col2").toString()); + Assert.assertEquals( + MetricsModes.Counts.get().toString(), deserialized.columnMode("col3").toString()); } } diff --git 
a/spark/v3.3/spark/src/test/java/org/apache/iceberg/spark/TestSparkValueConverter.java b/spark/v3.3/spark/src/test/java/org/apache/iceberg/spark/TestSparkValueConverter.java index 57941b8c7940..7f00c7edd8a9 100644 --- a/spark/v3.3/spark/src/test/java/org/apache/iceberg/spark/TestSparkValueConverter.java +++ b/spark/v3.3/spark/src/test/java/org/apache/iceberg/spark/TestSparkValueConverter.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark; import org.apache.iceberg.Schema; @@ -31,51 +30,55 @@ public class TestSparkValueConverter { @Test public void testSparkNullMapConvert() { - Schema schema = new Schema( - Types.NestedField.required(0, "id", Types.LongType.get()), - Types.NestedField.optional(5, "locations", Types.MapType.ofOptional(6, 7, - Types.StringType.get(), - Types.StructType.of( - Types.NestedField.required(1, "lat", Types.FloatType.get()), - Types.NestedField.required(2, "long", Types.FloatType.get()) - ) - )) - ); + Schema schema = + new Schema( + Types.NestedField.required(0, "id", Types.LongType.get()), + Types.NestedField.optional( + 5, + "locations", + Types.MapType.ofOptional( + 6, + 7, + Types.StringType.get(), + Types.StructType.of( + Types.NestedField.required(1, "lat", Types.FloatType.get()), + Types.NestedField.required(2, "long", Types.FloatType.get()))))); assertCorrectNullConversion(schema); } @Test public void testSparkNullListConvert() { - Schema schema = new Schema( - Types.NestedField.required(0, "id", Types.LongType.get()), - Types.NestedField.optional(5, "locations", - Types.ListType.ofOptional(6, Types.StringType.get()) - ) - ); + Schema schema = + new Schema( + Types.NestedField.required(0, "id", Types.LongType.get()), + Types.NestedField.optional( + 5, "locations", Types.ListType.ofOptional(6, Types.StringType.get()))); assertCorrectNullConversion(schema); } @Test public void testSparkNullStructConvert() { - Schema schema = new Schema( - Types.NestedField.required(0, "id", Types.LongType.get()), - Types.NestedField.optional(5, "location", Types.StructType.of( - Types.NestedField.required(1, "lat", Types.FloatType.get()), - Types.NestedField.required(2, "long", Types.FloatType.get()) - )) - ); + Schema schema = + new Schema( + Types.NestedField.required(0, "id", Types.LongType.get()), + Types.NestedField.optional( + 5, + "location", + Types.StructType.of( + Types.NestedField.required(1, "lat", Types.FloatType.get()), + Types.NestedField.required(2, "long", Types.FloatType.get())))); assertCorrectNullConversion(schema); } @Test public void testSparkNullPrimitiveConvert() { - Schema schema = new Schema( - Types.NestedField.required(0, "id", Types.LongType.get()), - Types.NestedField.optional(5, "location", Types.StringType.get()) - ); + Schema schema = + new Schema( + Types.NestedField.required(0, "id", Types.LongType.get()), + Types.NestedField.optional(5, "location", Types.StringType.get())); assertCorrectNullConversion(schema); } @@ -83,7 +86,8 @@ private void assertCorrectNullConversion(Schema schema) { Row sparkRow = RowFactory.create(1, null); Record record = GenericRecord.create(schema); record.set(0, 1); - Assert.assertEquals("Round-trip conversion should produce original value", + Assert.assertEquals( + "Round-trip conversion should produce original value", record, SparkValueConverter.convert(schema, sparkRow)); } diff --git a/spark/v3.3/spark/src/test/java/org/apache/iceberg/spark/actions/TestCreateActions.java 
b/spark/v3.3/spark/src/test/java/org/apache/iceberg/spark/actions/TestCreateActions.java index c5b1bf31b42e..96950e8227f3 100644 --- a/spark/v3.3/spark/src/test/java/org/apache/iceberg/spark/actions/TestCreateActions.java +++ b/spark/v3.3/spark/src/test/java/org/apache/iceberg/spark/actions/TestCreateActions.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.actions; import java.io.File; @@ -82,45 +81,61 @@ import scala.collection.Seq; public class TestCreateActions extends SparkCatalogTestBase { - private static final String CREATE_PARTITIONED_PARQUET = "CREATE TABLE %s (id INT, data STRING) " + - "using parquet PARTITIONED BY (id) LOCATION '%s'"; - private static final String CREATE_PARQUET = "CREATE TABLE %s (id INT, data STRING) " + - "using parquet LOCATION '%s'"; - private static final String CREATE_HIVE_EXTERNAL_PARQUET = "CREATE EXTERNAL TABLE %s (data STRING) " + - "PARTITIONED BY (id INT) STORED AS parquet LOCATION '%s'"; - private static final String CREATE_HIVE_PARQUET = "CREATE TABLE %s (data STRING) " + - "PARTITIONED BY (id INT) STORED AS parquet"; + private static final String CREATE_PARTITIONED_PARQUET = + "CREATE TABLE %s (id INT, data STRING) " + "using parquet PARTITIONED BY (id) LOCATION '%s'"; + private static final String CREATE_PARQUET = + "CREATE TABLE %s (id INT, data STRING) " + "using parquet LOCATION '%s'"; + private static final String CREATE_HIVE_EXTERNAL_PARQUET = + "CREATE EXTERNAL TABLE %s (data STRING) " + + "PARTITIONED BY (id INT) STORED AS parquet LOCATION '%s'"; + private static final String CREATE_HIVE_PARQUET = + "CREATE TABLE %s (data STRING) " + "PARTITIONED BY (id INT) STORED AS parquet"; private static final String NAMESPACE = "default"; @Parameterized.Parameters(name = "Catalog Name {0} - Options {2}") public static Object[][] parameters() { return new Object[][] { - new Object[] {"spark_catalog", SparkSessionCatalog.class.getName(), ImmutableMap.of( + new Object[] { + "spark_catalog", + SparkSessionCatalog.class.getName(), + ImmutableMap.of( "type", "hive", "default-namespace", "default", "parquet-enabled", "true", - "cache-enabled", "false" // Spark will delete tables using v1, leaving the cache out of sync - )}, - new Object[] {"spark_catalog", SparkSessionCatalog.class.getName(), ImmutableMap.of( + "cache-enabled", + "false" // Spark will delete tables using v1, leaving the cache out of sync + ) + }, + new Object[] { + "spark_catalog", + SparkSessionCatalog.class.getName(), + ImmutableMap.of( "type", "hadoop", "default-namespace", "default", "parquet-enabled", "true", - "cache-enabled", "false" // Spark will delete tables using v1, leaving the cache out of sync - )}, - new Object[] { "testhive", SparkCatalog.class.getName(), ImmutableMap.of( + "cache-enabled", + "false" // Spark will delete tables using v1, leaving the cache out of sync + ) + }, + new Object[] { + "testhive", + SparkCatalog.class.getName(), + ImmutableMap.of( "type", "hive", - "default-namespace", "default" - )}, - new Object[] { "testhadoop", SparkCatalog.class.getName(), ImmutableMap.of( + "default-namespace", "default") + }, + new Object[] { + "testhadoop", + SparkCatalog.class.getName(), + ImmutableMap.of( "type", "hadoop", - "default-namespace", "default" - )} + "default-namespace", "default") + } }; } - @Rule - public TemporaryFolder temp = new TemporaryFolder(); + @Rule public TemporaryFolder temp = new TemporaryFolder(); private String baseTableName = "baseTable"; private File 
tableDir; @@ -128,10 +143,7 @@ public static Object[][] parameters() { private final String type; private final TableCatalog catalog; - public TestCreateActions( - String catalogName, - String implementation, - Map config) { + public TestCreateActions(String catalogName, String implementation, Map config) { super(catalogName, implementation, config); this.catalog = (TableCatalog) spark.sessionState().catalogManager().catalog(catalogName); this.type = config.get("type"); @@ -152,15 +164,15 @@ public void before() { spark.conf().set("spark.sql.parquet.writeLegacyFormat", false); spark.sql(String.format("DROP TABLE IF EXISTS %s", baseTableName)); - List expected = Lists.newArrayList( - new SimpleRecord(1, "a"), - new SimpleRecord(2, "b"), - new SimpleRecord(3, "c") - ); + List expected = + Lists.newArrayList( + new SimpleRecord(1, "a"), new SimpleRecord(2, "b"), new SimpleRecord(3, "c")); Dataset df = spark.createDataFrame(expected, SimpleRecord.class); - df.select("id", "data").orderBy("data").write() + df.select("id", "data") + .orderBy("data") + .write() .mode("append") .option("path", tableLocation) .saveAsTable(baseTableName); @@ -175,7 +187,8 @@ public void after() throws IOException { @Test public void testMigratePartitioned() throws Exception { Assume.assumeTrue("Cannot migrate to a hadoop based catalog", !type.equals("hadoop")); - Assume.assumeTrue("Can only migrate from Spark Session Catalog", catalog.name().equals("spark_catalog")); + Assume.assumeTrue( + "Can only migrate from Spark Session Catalog", catalog.name().equals("spark_catalog")); String source = sourceName("test_migrate_partitioned_table"); String dest = source; createSourceTable(CREATE_PARTITIONED_PARQUET, source); @@ -185,17 +198,20 @@ public void testMigratePartitioned() throws Exception { @Test public void testPartitionedTableWithUnRecoveredPartitions() throws Exception { Assume.assumeTrue("Cannot migrate to a hadoop based catalog", !type.equals("hadoop")); - Assume.assumeTrue("Can only migrate from Spark Session Catalog", catalog.name().equals("spark_catalog")); + Assume.assumeTrue( + "Can only migrate from Spark Session Catalog", catalog.name().equals("spark_catalog")); String source = sourceName("test_unrecovered_partitions"); String dest = source; File location = temp.newFolder(); sql(CREATE_PARTITIONED_PARQUET, source, location); // Data generation and partition addition - spark.range(5) + spark + .range(5) .selectExpr("id", "cast(id as STRING) as data") .write() - .partitionBy("id").mode(SaveMode.Overwrite) + .partitionBy("id") + .mode(SaveMode.Overwrite) .parquet(location.toURI().toString()); sql("ALTER TABLE %s ADD PARTITION(id=0)", source); @@ -205,7 +221,8 @@ public void testPartitionedTableWithUnRecoveredPartitions() throws Exception { @Test public void testPartitionedTableWithCustomPartitions() throws Exception { Assume.assumeTrue("Cannot migrate to a hadoop based catalog", !type.equals("hadoop")); - Assume.assumeTrue("Can only migrate from Spark Session Catalog", catalog.name().equals("spark_catalog")); + Assume.assumeTrue( + "Can only migrate from Spark Session Catalog", catalog.name().equals("spark_catalog")); String source = sourceName("test_custom_parts"); String dest = source; File tblLocation = temp.newFolder(); @@ -213,18 +230,23 @@ public void testPartitionedTableWithCustomPartitions() throws Exception { // Data generation and partition addition spark.sql(String.format(CREATE_PARTITIONED_PARQUET, source, tblLocation)); - spark.range(10) + spark + .range(10) .selectExpr("cast(id as STRING) as 
data") .write() - .mode(SaveMode.Overwrite).parquet(partitionDataLoc.toURI().toString()); - sql("ALTER TABLE %s ADD PARTITION(id=0) LOCATION '%s'", source, partitionDataLoc.toURI().toString()); + .mode(SaveMode.Overwrite) + .parquet(partitionDataLoc.toURI().toString()); + sql( + "ALTER TABLE %s ADD PARTITION(id=0) LOCATION '%s'", + source, partitionDataLoc.toURI().toString()); assertMigratedFileCount(SparkActions.get().migrateTable(source), source, dest); } @Test public void testAddColumnOnMigratedTableAtEnd() throws Exception { Assume.assumeTrue("Cannot migrate to a hadoop based catalog", !type.equals("hadoop")); - Assume.assumeTrue("Can only migrate from Spark Session Catalog", catalog.name().equals("spark_catalog")); + Assume.assumeTrue( + "Can only migrate from Spark Session Catalog", catalog.name().equals("spark_catalog")); String source = sourceName("test_add_column_migrated_table"); String dest = source; createSourceTable(CREATE_PARQUET, source); @@ -263,7 +285,8 @@ public void testAddColumnOnMigratedTableAtEnd() throws Exception { @Test public void testAddColumnOnMigratedTableAtMiddle() throws Exception { Assume.assumeTrue("Cannot migrate to a hadoop based catalog", !type.equals("hadoop")); - Assume.assumeTrue("Can only migrate from Spark Session Catalog", catalog.name().equals("spark_catalog")); + Assume.assumeTrue( + "Can only migrate from Spark Session Catalog", catalog.name().equals("spark_catalog")); String source = sourceName("test_add_column_migrated_table_middle"); String dest = source; createSourceTable(CREATE_PARQUET, source); @@ -277,7 +300,10 @@ public void testAddColumnOnMigratedTableAtMiddle() throws Exception { // test column addition on migrated table Schema beforeSchema = table.schema(); String newCol1 = "newCol"; - sparkTable.table().updateSchema().addColumn("newCol", Types.IntegerType.get()) + sparkTable + .table() + .updateSchema() + .addColumn("newCol", Types.IntegerType.get()) .moveAfter(newCol1, "id") .commit(); Schema afterSchema = table.schema(); @@ -293,16 +319,20 @@ public void testAddColumnOnMigratedTableAtMiddle() throws Exception { @Test public void removeColumnsAtEnd() throws Exception { Assume.assumeTrue("Cannot migrate to a hadoop based catalog", !type.equals("hadoop")); - Assume.assumeTrue("Can only migrate from Spark Session Catalog", catalog.name().equals("spark_catalog")); + Assume.assumeTrue( + "Can only migrate from Spark Session Catalog", catalog.name().equals("spark_catalog")); String source = sourceName("test_remove_column_migrated_table"); String dest = source; String colName1 = "newCol1"; String colName2 = "newCol2"; File location = temp.newFolder(); - spark.range(10).selectExpr("cast(id as INT)", "CAST(id as INT) " + colName1, "CAST(id as INT) " + colName2) + spark + .range(10) + .selectExpr("cast(id as INT)", "CAST(id as INT) " + colName1, "CAST(id as INT) " + colName2) .write() - .mode(SaveMode.Overwrite).saveAsTable(dest); + .mode(SaveMode.Overwrite) + .saveAsTable(dest); List expected1 = sql("select id, %s from %s order by id", colName1, source); List expected2 = sql("select id from %s order by id", source); @@ -336,13 +366,19 @@ public void removeColumnsAtEnd() throws Exception { @Test public void removeColumnFromMiddle() throws Exception { Assume.assumeTrue("Cannot migrate to a hadoop based catalog", !type.equals("hadoop")); - Assume.assumeTrue("Can only migrate from Spark Session Catalog", catalog.name().equals("spark_catalog")); + Assume.assumeTrue( + "Can only migrate from Spark Session Catalog", 
catalog.name().equals("spark_catalog")); String source = sourceName("test_remove_column_migrated_table_from_middle"); String dest = source; String dropColumnName = "col1"; - spark.range(10).selectExpr("cast(id as INT)", "CAST(id as INT) as " + - dropColumnName, "CAST(id as INT) as col2").write().mode(SaveMode.Overwrite).saveAsTable(dest); + spark + .range(10) + .selectExpr( + "cast(id as INT)", "CAST(id as INT) as " + dropColumnName, "CAST(id as INT) as col2") + .write() + .mode(SaveMode.Overwrite) + .saveAsTable(dest); List expected = sql("select id, col2 from %s order by id", source); // migrate table @@ -362,7 +398,8 @@ public void removeColumnFromMiddle() throws Exception { @Test public void testMigrateUnpartitioned() throws Exception { Assume.assumeTrue("Cannot migrate to a hadoop based catalog", !type.equals("hadoop")); - Assume.assumeTrue("Can only migrate from Spark Session Catalog", catalog.name().equals("spark_catalog")); + Assume.assumeTrue( + "Can only migrate from Spark Session Catalog", catalog.name().equals("spark_catalog")); String source = sourceName("test_migrate_unpartitioned_table"); String dest = source; createSourceTable(CREATE_PARQUET, source); @@ -371,40 +408,49 @@ public void testMigrateUnpartitioned() throws Exception { @Test public void testSnapshotPartitioned() throws Exception { - Assume.assumeTrue("Cannot snapshot with arbitrary location in a hadoop based catalog", + Assume.assumeTrue( + "Cannot snapshot with arbitrary location in a hadoop based catalog", !type.equals("hadoop")); File location = temp.newFolder(); String source = sourceName("test_snapshot_partitioned_table"); String dest = destName("iceberg_snapshot_partitioned"); createSourceTable(CREATE_PARTITIONED_PARQUET, source); assertSnapshotFileCount( - SparkActions.get().snapshotTable(source).as(dest).tableLocation(location.toString()), source, dest); + SparkActions.get().snapshotTable(source).as(dest).tableLocation(location.toString()), + source, + dest); assertIsolatedSnapshot(source, dest); } @Test public void testSnapshotUnpartitioned() throws Exception { - Assume.assumeTrue("Cannot snapshot with arbitrary location in a hadoop based catalog", + Assume.assumeTrue( + "Cannot snapshot with arbitrary location in a hadoop based catalog", !type.equals("hadoop")); File location = temp.newFolder(); String source = sourceName("test_snapshot_unpartitioned_table"); String dest = destName("iceberg_snapshot_unpartitioned"); createSourceTable(CREATE_PARQUET, source); assertSnapshotFileCount( - SparkActions.get().snapshotTable(source).as(dest).tableLocation(location.toString()), source, dest); + SparkActions.get().snapshotTable(source).as(dest).tableLocation(location.toString()), + source, + dest); assertIsolatedSnapshot(source, dest); } @Test public void testSnapshotHiveTable() throws Exception { - Assume.assumeTrue("Cannot snapshot with arbitrary location in a hadoop based catalog", + Assume.assumeTrue( + "Cannot snapshot with arbitrary location in a hadoop based catalog", !type.equals("hadoop")); File location = temp.newFolder(); String source = sourceName("snapshot_hive_table"); String dest = destName("iceberg_snapshot_hive_table"); createSourceTable(CREATE_HIVE_EXTERNAL_PARQUET, source); assertSnapshotFileCount( - SparkActions.get().snapshotTable(source).as(dest).tableLocation(location.toString()), source, dest); + SparkActions.get().snapshotTable(source).as(dest).tableLocation(location.toString()), + source, + dest); assertIsolatedSnapshot(source, dest); } @@ -425,7 +471,9 @@ public void 
testSnapshotManagedHiveTable() throws Exception { String dest = destName("iceberg_snapshot_managed_hive_table"); createSourceTable(CREATE_HIVE_PARQUET, source); assertSnapshotFileCount( - SparkActions.get().snapshotTable(source).as(dest).tableLocation(location.toString()), source, dest); + SparkActions.get().snapshotTable(source).as(dest).tableLocation(location.toString()), + source, + dest); assertIsolatedSnapshot(source, dest); } @@ -437,7 +485,9 @@ public void testMigrateManagedHiveTable() throws Exception { String dest = destName("iceberg_migrate_managed_hive_table"); createSourceTable(CREATE_HIVE_PARQUET, source); assertSnapshotFileCount( - SparkActions.get().snapshotTable(source).as(dest).tableLocation(location.toString()), source, dest); + SparkActions.get().snapshotTable(source).as(dest).tableLocation(location.toString()), + source, + dest); } @Test @@ -449,11 +499,15 @@ public void testProperties() throws Exception { props.put("note", "Jazz"); createSourceTable(CREATE_PARQUET, source); for (Map.Entry keyValue : props.entrySet()) { - spark.sql(String.format("ALTER TABLE %s SET TBLPROPERTIES (\"%s\" = \"%s\")", - source, keyValue.getKey(), keyValue.getValue())); + spark.sql( + String.format( + "ALTER TABLE %s SET TBLPROPERTIES (\"%s\" = \"%s\")", + source, keyValue.getKey(), keyValue.getValue())); } assertSnapshotFileCount( - SparkActions.get().snapshotTable(source).as(dest).tableProperty("dogs", "sundance"), source, dest); + SparkActions.get().snapshotTable(source).as(dest).tableProperty("dogs", "sundance"), + source, + dest); SparkTable table = loadTable(dest); Map expectedProps = Maps.newHashMap(); @@ -464,8 +518,10 @@ public void testProperties() throws Exception { Assert.assertTrue( "Created table missing property " + entry.getKey(), table.properties().containsKey(entry.getKey())); - Assert.assertEquals("Property value is not the expected value", - entry.getValue(), table.properties().get(entry.getKey())); + Assert.assertEquals( + "Property value is not the expected value", + entry.getValue(), + table.properties().get(entry.getKey())); } } @@ -483,24 +539,38 @@ public void testSparkTableReservedProperties() throws Exception { String[] keys = {"provider", "format", "current-snapshot-id", "location", "sort-order"}; for (String entry : keys) { - Assert.assertTrue("Created table missing reserved property " + entry, table.properties().containsKey(entry)); + Assert.assertTrue( + "Created table missing reserved property " + entry, + table.properties().containsKey(entry)); } Assert.assertEquals("Unexpected provider", "iceberg", table.properties().get("provider")); Assert.assertEquals("Unexpected format", "iceberg/parquet", table.properties().get("format")); - Assert.assertNotEquals("No current-snapshot-id found", "none", table.properties().get("current-snapshot-id")); - Assert.assertTrue("Location isn't correct", table.properties().get("location").endsWith(destTableName)); + Assert.assertNotEquals( + "No current-snapshot-id found", "none", table.properties().get("current-snapshot-id")); + Assert.assertTrue( + "Location isn't correct", table.properties().get("location").endsWith(destTableName)); Assert.assertEquals("Unexpected format-version", "1", table.properties().get("format-version")); table.table().updateProperties().set("format-version", "2").commit(); Assert.assertEquals("Unexpected format-version", "2", table.properties().get("format-version")); - Assert.assertEquals("Sort-order isn't correct", "id ASC NULLS FIRST, data DESC NULLS LAST", + Assert.assertEquals( + "Sort-order 
isn't correct", + "id ASC NULLS FIRST, data DESC NULLS LAST", table.properties().get("sort-order")); - Assert.assertNull("Identifier fields should be null", table.properties().get("identifier-fields")); - - table.table().updateSchema().allowIncompatibleChanges().requireColumn("id").setIdentifierFields("id").commit(); - Assert.assertEquals("Identifier fields aren't correct", "[id]", table.properties().get("identifier-fields")); + Assert.assertNull( + "Identifier fields should be null", table.properties().get("identifier-fields")); + + table + .table() + .updateSchema() + .allowIncompatibleChanges() + .requireColumn("id") + .setIdentifierFields("id") + .commit(); + Assert.assertEquals( + "Identifier fields aren't correct", "[id]", table.properties().get("identifier-fields")); } @Test @@ -515,30 +585,37 @@ public void testSnapshotDefaultLocation() throws Exception { @Test public void schemaEvolutionTestWithSparkAPI() throws Exception { Assume.assumeTrue("Cannot migrate to a hadoop based catalog", !type.equals("hadoop")); - Assume.assumeTrue("Can only migrate from Spark Session Catalog", catalog.name().equals("spark_catalog")); + Assume.assumeTrue( + "Can only migrate from Spark Session Catalog", catalog.name().equals("spark_catalog")); File location = temp.newFolder(); String tblName = sourceName("schema_evolution_test"); // Data generation and partition addition - spark.range(0, 5) + spark + .range(0, 5) .selectExpr("CAST(id as INT) as col0", "CAST(id AS FLOAT) col2", "CAST(id AS LONG) col3") .write() .mode(SaveMode.Append) .parquet(location.toURI().toString()); - Dataset rowDataset = spark.range(6, 10) - .selectExpr("CAST(id as INT) as col0", "CAST(id AS STRING) col1", - "CAST(id AS FLOAT) col2", "CAST(id AS LONG) col3"); - rowDataset - .write() - .mode(SaveMode.Append) - .parquet(location.toURI().toString()); - spark.read() + Dataset rowDataset = + spark + .range(6, 10) + .selectExpr( + "CAST(id as INT) as col0", + "CAST(id AS STRING) col1", + "CAST(id AS FLOAT) col2", + "CAST(id AS LONG) col3"); + rowDataset.write().mode(SaveMode.Append).parquet(location.toURI().toString()); + spark + .read() .schema(rowDataset.schema()) - .parquet(location.toURI().toString()).write().saveAsTable(tblName); + .parquet(location.toURI().toString()) + .write() + .saveAsTable(tblName); List expectedBeforeAddColumn = sql("SELECT * FROM %s ORDER BY col0", tblName); - List expectedAfterAddColumn = sql("SELECT col0, null, col1, col2, col3 FROM %s ORDER BY col0", - tblName); + List expectedAfterAddColumn = + sql("SELECT col0, null, col1, col2, col3 FROM %s ORDER BY col0", tblName); // Migrate table SparkActions.get().migrateTable(tblName).execute(); @@ -549,7 +626,10 @@ public void schemaEvolutionTestWithSparkAPI() throws Exception { // Update schema and check output correctness SparkTable sparkTable = loadTable(tblName); - sparkTable.table().updateSchema().addColumn("newCol", Types.IntegerType.get()) + sparkTable + .table() + .updateSchema() + .addColumn("newCol", Types.IntegerType.get()) .moveAfter("newCol", "col0") .commit(); List afterMigarteAfterAddResults = sql("SELECT * FROM %s ORDER BY col0", tblName); @@ -559,23 +639,30 @@ public void schemaEvolutionTestWithSparkAPI() throws Exception { @Test public void schemaEvolutionTestWithSparkSQL() throws Exception { Assume.assumeTrue("Cannot migrate to a hadoop based catalog", !type.equals("hadoop")); - Assume.assumeTrue("Can only migrate from Spark Session Catalog", catalog.name().equals("spark_catalog")); + Assume.assumeTrue( + "Can only migrate from Spark Session 
Catalog", catalog.name().equals("spark_catalog")); String tblName = sourceName("schema_evolution_test_sql"); // Data generation and partition addition - spark.range(0, 5) + spark + .range(0, 5) .selectExpr("CAST(id as INT) col0", "CAST(id AS FLOAT) col1", "CAST(id AS STRING) col2") .write() .mode(SaveMode.Append) .saveAsTable(tblName); sql("ALTER TABLE %s ADD COLUMN col3 INT", tblName); - spark.range(6, 10) - .selectExpr("CAST(id AS INT) col0", "CAST(id AS FLOAT) col1", "CAST(id AS STRING) col2", "CAST(id AS INT) col3") + spark + .range(6, 10) + .selectExpr( + "CAST(id AS INT) col0", + "CAST(id AS FLOAT) col1", + "CAST(id AS STRING) col2", + "CAST(id AS INT) col3") .registerTempTable("tempdata"); sql("INSERT INTO TABLE %s SELECT * FROM tempdata", tblName); List expectedBeforeAddColumn = sql("SELECT * FROM %s ORDER BY col0", tblName); - List expectedAfterAddColumn = sql("SELECT col0, null, col1, col2, col3 FROM %s ORDER BY col0", - tblName); + List expectedAfterAddColumn = + sql("SELECT col0, null, col1, col2, col3 FROM %s ORDER BY col0", tblName); // Migrate table SparkActions.get().migrateTable(tblName).execute(); @@ -586,7 +673,10 @@ public void schemaEvolutionTestWithSparkSQL() throws Exception { // Update schema and check output correctness SparkTable sparkTable = loadTable(tblName); - sparkTable.table().updateSchema().addColumn("newCol", Types.IntegerType.get()) + sparkTable + .table() + .updateSchema() + .addColumn("newCol", Types.IntegerType.get()) .moveAfter("newCol", "col0") .commit(); List afterMigarteAfterAddResults = sql("SELECT * FROM %s ORDER BY col0", tblName); @@ -642,52 +732,70 @@ public void testTwoLevelList() throws IOException { StructType sparkSchema = new StructType( - new StructField[]{ - new StructField( - "col1", new ArrayType( - new StructType( - new StructField[]{ - new StructField( - "col2", - DataTypes.IntegerType, - false, - Metadata.empty()) - }), false), true, Metadata.empty())}); - - // even though this list looks like three level list, it is actually a 2-level list where the items are + new StructField[] { + new StructField( + "col1", + new ArrayType( + new StructType( + new StructField[] { + new StructField("col2", DataTypes.IntegerType, false, Metadata.empty()) + }), + false), + true, + Metadata.empty()) + }); + + // even though this list looks like three level list, it is actually a 2-level list where the + // items are // structs with 1 field. 
String expectedParquetSchema = - "message spark_schema {\n" + - " optional group col1 (LIST) {\n" + - " repeated group array {\n" + - " required int32 col2;\n" + - " }\n" + - " }\n" + - "}\n"; + "message spark_schema {\n" + + " optional group col1 (LIST) {\n" + + " repeated group array {\n" + + " required int32 col2;\n" + + " }\n" + + " }\n" + + "}\n"; // generate parquet file with required schema List testData = Collections.singletonList("{\"col1\": [{\"col2\": 1}]}"); - spark.read().schema(sparkSchema).json( - JavaSparkContext.fromSparkContext(spark.sparkContext()).parallelize(testData)) - .coalesce(1).write().format("parquet").mode(SaveMode.Append).save(location.getPath()); - - File parquetFile = Arrays.stream(Preconditions.checkNotNull(location.listFiles(new FilenameFilter() { - @Override - public boolean accept(File dir, String name) { - return name.endsWith("parquet"); - } - }))).findAny().get(); + spark + .read() + .schema(sparkSchema) + .json(JavaSparkContext.fromSparkContext(spark.sparkContext()).parallelize(testData)) + .coalesce(1) + .write() + .format("parquet") + .mode(SaveMode.Append) + .save(location.getPath()); + + File parquetFile = + Arrays.stream( + Preconditions.checkNotNull( + location.listFiles( + new FilenameFilter() { + @Override + public boolean accept(File dir, String name) { + return name.endsWith("parquet"); + } + }))) + .findAny() + .get(); // verify generated parquet file has expected schema - ParquetFileReader pqReader = ParquetFileReader.open(HadoopInputFile.fromPath(new Path(parquetFile.getPath()), - spark.sessionState().newHadoopConf())); + ParquetFileReader pqReader = + ParquetFileReader.open( + HadoopInputFile.fromPath( + new Path(parquetFile.getPath()), spark.sessionState().newHadoopConf())); MessageType schema = pqReader.getFooter().getFileMetaData().getSchema(); Assert.assertEquals(MessageTypeParser.parseMessageType(expectedParquetSchema), schema); // create sql table on top of it - sql("CREATE EXTERNAL TABLE %s (col1 ARRAY>)" + - " STORED AS parquet" + - " LOCATION '%s'", tableName, location); + sql( + "CREATE EXTERNAL TABLE %s (col1 ARRAY>)" + + " STORED AS parquet" + + " LOCATION '%s'", + tableName, location); List expected = sql("select array(struct(1))"); // migrate table @@ -704,9 +812,9 @@ private void threeLevelList(boolean useLegacyMode) throws Exception { String tableName = sourceName(String.format("threeLevelList_%s", useLegacyMode)); File location = temp.newFolder(); - sql("CREATE TABLE %s (col1 ARRAY>)" + - " STORED AS parquet" + - " LOCATION '%s'", tableName, location); + sql( + "CREATE TABLE %s (col1 ARRAY>)" + " STORED AS parquet" + " LOCATION '%s'", + tableName, location); int testValue = 12345; sql("INSERT INTO %s VALUES (ARRAY(STRUCT(%s)))", tableName, testValue); @@ -724,11 +832,14 @@ private void threeLevelList(boolean useLegacyMode) throws Exception { private void threeLevelListWithNestedStruct(boolean useLegacyMode) throws Exception { spark.conf().set("spark.sql.parquet.writeLegacyFormat", useLegacyMode); - String tableName = sourceName(String.format("threeLevelListWithNestedStruct_%s", useLegacyMode)); + String tableName = + sourceName(String.format("threeLevelListWithNestedStruct_%s", useLegacyMode)); File location = temp.newFolder(); - sql("CREATE TABLE %s (col1 ARRAY>>)" + - " STORED AS parquet" + - " LOCATION '%s'", tableName, location); + sql( + "CREATE TABLE %s (col1 ARRAY>>)" + + " STORED AS parquet" + + " LOCATION '%s'", + tableName, location); int testValue = 12345; sql("INSERT INTO %s VALUES 
(ARRAY(STRUCT(STRUCT(%s))))", tableName, testValue); @@ -748,13 +859,16 @@ private void threeLevelLists(boolean useLegacyMode) throws Exception { String tableName = sourceName(String.format("threeLevelLists_%s", useLegacyMode)); File location = temp.newFolder(); - sql("CREATE TABLE %s (col1 ARRAY>, col3 ARRAY>)" + - " STORED AS parquet" + - " LOCATION '%s'", tableName, location); + sql( + "CREATE TABLE %s (col1 ARRAY>, col3 ARRAY>)" + + " STORED AS parquet" + + " LOCATION '%s'", + tableName, location); int testValue1 = 12345; int testValue2 = 987654; - sql("INSERT INTO %s VALUES (ARRAY(STRUCT(%s)), ARRAY(STRUCT(%s)))", + sql( + "INSERT INTO %s VALUES (ARRAY(STRUCT(%s)), ARRAY(STRUCT(%s)))", tableName, testValue1, testValue2); List expected = sql(String.format("SELECT * FROM %s", tableName)); @@ -772,13 +886,14 @@ private void structOfThreeLevelLists(boolean useLegacyMode) throws Exception { String tableName = sourceName(String.format("structOfThreeLevelLists_%s", useLegacyMode)); File location = temp.newFolder(); - sql("CREATE TABLE %s (col1 STRUCT>>)" + - " STORED AS parquet" + - " LOCATION '%s'", tableName, location); + sql( + "CREATE TABLE %s (col1 STRUCT>>)" + + " STORED AS parquet" + + " LOCATION '%s'", + tableName, location); int testValue1 = 12345; - sql("INSERT INTO %s VALUES (STRUCT(STRUCT(ARRAY(STRUCT(%s)))))", - tableName, testValue1); + sql("INSERT INTO %s VALUES (STRUCT(STRUCT(ARRAY(STRUCT(%s)))))", tableName, testValue1); List expected = sql(String.format("SELECT * FROM %s", tableName)); // migrate table @@ -790,16 +905,19 @@ private void structOfThreeLevelLists(boolean useLegacyMode) throws Exception { assertEquals("Output must match", expected, results); } - private SparkTable loadTable(String name) throws NoSuchTableException, ParseException { - return (SparkTable) catalog.loadTable(Spark3Util.catalogAndIdentifier(spark, name).identifier()); + return (SparkTable) + catalog.loadTable(Spark3Util.catalogAndIdentifier(spark, name).identifier()); } private CatalogTable loadSessionTable(String name) throws NoSuchTableException, NoSuchDatabaseException, ParseException { Identifier identifier = Spark3Util.catalogAndIdentifier(spark, name).identifier(); Some namespace = Some.apply(identifier.namespace()[0]); - return spark.sessionState().catalog().getTableMetadata(new TableIdentifier(identifier.name(), namespace)); + return spark + .sessionState() + .catalog() + .getTableMetadata(new TableIdentifier(identifier.name(), namespace)); } private void createSourceTable(String createStatement, String tableName) @@ -809,41 +927,57 @@ private void createSourceTable(String createStatement, String tableName) CatalogTable table = loadSessionTable(tableName); Seq partitionColumns = table.partitionColumnNames(); String format = table.provider().get(); - spark.table(baseTableName).write().mode(SaveMode.Append).format(format).partitionBy(partitionColumns.toSeq()) + spark + .table(baseTableName) + .write() + .mode(SaveMode.Append) + .format(format) + .partitionBy(partitionColumns.toSeq()) .saveAsTable(tableName); } - // Counts the number of files in the source table, makes sure the same files exist in the destination table + // Counts the number of files in the source table, makes sure the same files exist in the + // destination table private void assertMigratedFileCount(MigrateTable migrateAction, String source, String dest) throws NoSuchTableException, NoSuchDatabaseException, ParseException { long expectedFiles = expectedFilesCount(source); MigrateTable.Result migratedFiles = 
migrateAction.execute(); validateTables(source, dest); - Assert.assertEquals("Expected number of migrated files", - expectedFiles, migratedFiles.migratedDataFilesCount()); + Assert.assertEquals( + "Expected number of migrated files", expectedFiles, migratedFiles.migratedDataFilesCount()); } - // Counts the number of files in the source table, makes sure the same files exist in the destination table + // Counts the number of files in the source table, makes sure the same files exist in the + // destination table private void assertSnapshotFileCount(SnapshotTable snapshotTable, String source, String dest) throws NoSuchTableException, NoSuchDatabaseException, ParseException { long expectedFiles = expectedFilesCount(source); SnapshotTable.Result snapshotTableResult = snapshotTable.execute(); validateTables(source, dest); - Assert.assertEquals("Expected number of imported snapshot files", expectedFiles, + Assert.assertEquals( + "Expected number of imported snapshot files", + expectedFiles, snapshotTableResult.importedDataFilesCount()); } - private void validateTables(String source, String dest) throws NoSuchTableException, ParseException { + private void validateTables(String source, String dest) + throws NoSuchTableException, ParseException { List expected = spark.table(source).collectAsList(); SparkTable destTable = loadTable(dest); - Assert.assertEquals("Provider should be iceberg", "iceberg", + Assert.assertEquals( + "Provider should be iceberg", + "iceberg", destTable.properties().get(TableCatalog.PROP_PROVIDER)); List actual = spark.table(dest).collectAsList(); - Assert.assertTrue(String.format("Rows in migrated table did not match\nExpected :%s rows \nFound :%s", - expected, actual), expected.containsAll(actual) && actual.containsAll(expected)); + Assert.assertTrue( + String.format( + "Rows in migrated table did not match\nExpected :%s rows \nFound :%s", + expected, actual), + expected.containsAll(actual) && actual.containsAll(expected)); } - private long expectedFilesCount(String source) throws NoSuchDatabaseException, NoSuchTableException, ParseException { + private long expectedFilesCount(String source) + throws NoSuchDatabaseException, NoSuchTableException, ParseException { CatalogTable sourceTable = loadSessionTable(source); List uris; if (sourceTable.partitionColumnNames().size() == 0) { @@ -851,34 +985,42 @@ private long expectedFilesCount(String source) throws NoSuchDatabaseException, N uris.add(sourceTable.location()); } else { Seq catalogTablePartitionSeq = - spark.sessionState().catalog().listPartitions(sourceTable.identifier(), Option.apply(null)); - uris = JavaConverters.seqAsJavaList(catalogTablePartitionSeq) - .stream() - .map(CatalogTablePartition::location) - .collect(Collectors.toList()); + spark + .sessionState() + .catalog() + .listPartitions(sourceTable.identifier(), Option.apply(null)); + uris = + JavaConverters.seqAsJavaList(catalogTablePartitionSeq).stream() + .map(CatalogTablePartition::location) + .collect(Collectors.toList()); } return uris.stream() - .flatMap(uri -> - FileUtils.listFiles(Paths.get(uri).toFile(), - TrueFileFilter.INSTANCE, TrueFileFilter.INSTANCE).stream()) - .filter(file -> !file.toString().endsWith("crc") && !file.toString().contains("_SUCCESS")).count(); + .flatMap( + uri -> + FileUtils.listFiles( + Paths.get(uri).toFile(), TrueFileFilter.INSTANCE, TrueFileFilter.INSTANCE) + .stream()) + .filter(file -> !file.toString().endsWith("crc") && !file.toString().contains("_SUCCESS")) + .count(); } - // Insert records into the destination, makes 
sure those records exist and source table is unchanged + // Insert records into the destination, makes sure those records exist and source table is + // unchanged private void assertIsolatedSnapshot(String source, String dest) { List expected = spark.sql(String.format("SELECT * FROM %s", source)).collectAsList(); - List extraData = Lists.newArrayList( - new SimpleRecord(4, "d") - ); + List extraData = Lists.newArrayList(new SimpleRecord(4, "d")); Dataset df = spark.createDataFrame(extraData, SimpleRecord.class); df.write().format("iceberg").mode("append").saveAsTable(dest); List result = spark.sql(String.format("SELECT * FROM %s", source)).collectAsList(); - Assert.assertEquals("No additional rows should be added to the original table", expected.size(), - result.size()); + Assert.assertEquals( + "No additional rows should be added to the original table", expected.size(), result.size()); - List snapshot = spark.sql(String.format("SELECT * FROM %s WHERE id = 4 AND data = 'd'", dest)).collectAsList(); + List snapshot = + spark + .sql(String.format("SELECT * FROM %s WHERE id = 4 AND data = 'd'", dest)) + .collectAsList(); Assert.assertEquals("Added row not found in snapshot", 1, snapshot.size()); } diff --git a/spark/v3.3/spark/src/test/java/org/apache/iceberg/spark/actions/TestDeleteReachableFilesAction.java b/spark/v3.3/spark/src/test/java/org/apache/iceberg/spark/actions/TestDeleteReachableFilesAction.java index 9792d772b12d..9090da2fe69b 100644 --- a/spark/v3.3/spark/src/test/java/org/apache/iceberg/spark/actions/TestDeleteReachableFilesAction.java +++ b/spark/v3.3/spark/src/test/java/org/apache/iceberg/spark/actions/TestDeleteReachableFilesAction.java @@ -16,9 +16,10 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.spark.actions; +import static org.apache.iceberg.types.Types.NestedField.optional; + import java.io.File; import java.util.Set; import java.util.concurrent.ConcurrentHashMap; @@ -54,46 +55,47 @@ import org.junit.Test; import org.junit.rules.TemporaryFolder; -import static org.apache.iceberg.types.Types.NestedField.optional; - public class TestDeleteReachableFilesAction extends SparkTestBase { private static final HadoopTables TABLES = new HadoopTables(new Configuration()); - private static final Schema SCHEMA = new Schema( - optional(1, "c1", Types.IntegerType.get()), - optional(2, "c2", Types.StringType.get()), - optional(3, "c3", Types.StringType.get()) - ); + private static final Schema SCHEMA = + new Schema( + optional(1, "c1", Types.IntegerType.get()), + optional(2, "c2", Types.StringType.get()), + optional(3, "c3", Types.StringType.get())); private static final int SHUFFLE_PARTITIONS = 2; private static final PartitionSpec SPEC = PartitionSpec.builderFor(SCHEMA).identity("c1").build(); - static final DataFile FILE_A = DataFiles.builder(SPEC) - .withPath("/path/to/data-a.parquet") - .withFileSizeInBytes(10) - .withPartition(TestHelpers.Row.of(0)) - .withRecordCount(1) - .build(); - static final DataFile FILE_B = DataFiles.builder(SPEC) - .withPath("/path/to/data-b.parquet") - .withFileSizeInBytes(10) - .withPartition(TestHelpers.Row.of(1)) - .withRecordCount(1) - .build(); - static final DataFile FILE_C = DataFiles.builder(SPEC) - .withPath("/path/to/data-c.parquet") - .withFileSizeInBytes(10) - .withPartition(TestHelpers.Row.of(2)) - .withRecordCount(1) - .build(); - static final DataFile FILE_D = DataFiles.builder(SPEC) - .withPath("/path/to/data-d.parquet") - .withFileSizeInBytes(10) - .withPartition(TestHelpers.Row.of(3)) - .withRecordCount(1) - .build(); - - @Rule - public TemporaryFolder temp = new TemporaryFolder(); + static final DataFile FILE_A = + DataFiles.builder(SPEC) + .withPath("/path/to/data-a.parquet") + .withFileSizeInBytes(10) + .withPartition(TestHelpers.Row.of(0)) + .withRecordCount(1) + .build(); + static final DataFile FILE_B = + DataFiles.builder(SPEC) + .withPath("/path/to/data-b.parquet") + .withFileSizeInBytes(10) + .withPartition(TestHelpers.Row.of(1)) + .withRecordCount(1) + .build(); + static final DataFile FILE_C = + DataFiles.builder(SPEC) + .withPath("/path/to/data-c.parquet") + .withFileSizeInBytes(10) + .withPartition(TestHelpers.Row.of(2)) + .withRecordCount(1) + .build(); + static final DataFile FILE_D = + DataFiles.builder(SPEC) + .withPath("/path/to/data-d.parquet") + .withFileSizeInBytes(10) + .withPartition(TestHelpers.Row.of(3)) + .withRecordCount(1) + .build(); + + @Rule public TemporaryFolder temp = new TemporaryFolder(); private Table table; @@ -105,62 +107,76 @@ public void setupTableLocation() throws Exception { spark.conf().set("spark.sql.shuffle.partitions", SHUFFLE_PARTITIONS); } - private void checkRemoveFilesResults(long expectedDatafiles, long expectedManifestsDeleted, - long expectedManifestListsDeleted, long expectedOtherFilesDeleted, - DeleteReachableFiles.Result results) { - Assert.assertEquals("Incorrect number of manifest files deleted", - expectedManifestsDeleted, results.deletedManifestsCount()); - Assert.assertEquals("Incorrect number of datafiles deleted", - expectedDatafiles, results.deletedDataFilesCount()); - Assert.assertEquals("Incorrect number of manifest lists deleted", - expectedManifestListsDeleted, results.deletedManifestListsCount()); - Assert.assertEquals("Incorrect number of 
other lists deleted", - expectedOtherFilesDeleted, results.deletedOtherFilesCount()); + private void checkRemoveFilesResults( + long expectedDatafiles, + long expectedManifestsDeleted, + long expectedManifestListsDeleted, + long expectedOtherFilesDeleted, + DeleteReachableFiles.Result results) { + Assert.assertEquals( + "Incorrect number of manifest files deleted", + expectedManifestsDeleted, + results.deletedManifestsCount()); + Assert.assertEquals( + "Incorrect number of datafiles deleted", + expectedDatafiles, + results.deletedDataFilesCount()); + Assert.assertEquals( + "Incorrect number of manifest lists deleted", + expectedManifestListsDeleted, + results.deletedManifestListsCount()); + Assert.assertEquals( + "Incorrect number of other lists deleted", + expectedOtherFilesDeleted, + results.deletedOtherFilesCount()); } @Test public void dataFilesCleanupWithParallelTasks() { - table.newFastAppend() - .appendFile(FILE_A) - .commit(); + table.newFastAppend().appendFile(FILE_A).commit(); - table.newFastAppend() - .appendFile(FILE_B) - .commit(); + table.newFastAppend().appendFile(FILE_B).commit(); - table.newRewrite() - .rewriteFiles(ImmutableSet.of(FILE_B), ImmutableSet.of(FILE_D)) - .commit(); + table.newRewrite().rewriteFiles(ImmutableSet.of(FILE_B), ImmutableSet.of(FILE_D)).commit(); - table.newRewrite() - .rewriteFiles(ImmutableSet.of(FILE_A), ImmutableSet.of(FILE_C)) - .commit(); + table.newRewrite().rewriteFiles(ImmutableSet.of(FILE_A), ImmutableSet.of(FILE_C)).commit(); Set deletedFiles = ConcurrentHashMap.newKeySet(); Set deleteThreads = ConcurrentHashMap.newKeySet(); AtomicInteger deleteThreadsIndex = new AtomicInteger(0); - DeleteReachableFiles.Result result = sparkActions().deleteReachableFiles(metadataLocation(table)) - .io(table.io()) - .executeDeleteWith(Executors.newFixedThreadPool(4, runnable -> { - Thread thread = new Thread(runnable); - thread.setName("remove-files-" + deleteThreadsIndex.getAndIncrement()); - thread.setDaemon(true); // daemon threads will be terminated abruptly when the JVM exits - return thread; - })) - .deleteWith(s -> { - deleteThreads.add(Thread.currentThread().getName()); - deletedFiles.add(s); - }) - .execute(); - - // Verifies that the delete methods ran in the threads created by the provided ExecutorService ThreadFactory - Assert.assertEquals(deleteThreads, + DeleteReachableFiles.Result result = + sparkActions() + .deleteReachableFiles(metadataLocation(table)) + .io(table.io()) + .executeDeleteWith( + Executors.newFixedThreadPool( + 4, + runnable -> { + Thread thread = new Thread(runnable); + thread.setName("remove-files-" + deleteThreadsIndex.getAndIncrement()); + thread.setDaemon( + true); // daemon threads will be terminated abruptly when the JVM exits + return thread; + })) + .deleteWith( + s -> { + deleteThreads.add(Thread.currentThread().getName()); + deletedFiles.add(s); + }) + .execute(); + + // Verifies that the delete methods ran in the threads created by the provided ExecutorService + // ThreadFactory + Assert.assertEquals( + deleteThreads, Sets.newHashSet("remove-files-0", "remove-files-1", "remove-files-2", "remove-files-3")); - Lists.newArrayList(FILE_A, FILE_B, FILE_C, FILE_D).forEach(file -> - Assert.assertTrue("FILE_A should be deleted", deletedFiles.contains(FILE_A.path().toString())) - ); + Lists.newArrayList(FILE_A, FILE_B, FILE_C, FILE_D) + .forEach( + file -> + Assert.assertTrue( + "FILE_A should be deleted", deletedFiles.contains(FILE_A.path().toString()))); checkRemoveFilesResults(4L, 6L, 4L, 6, result); } @@ -168,64 
+184,43 @@ public void dataFilesCleanupWithParallelTasks() { public void testWithExpiringDanglingStageCommit() { table.location(); // `A` commit - table.newAppend() - .appendFile(FILE_A) - .commit(); + table.newAppend().appendFile(FILE_A).commit(); // `B` staged commit - table.newAppend() - .appendFile(FILE_B) - .stageOnly() - .commit(); + table.newAppend().appendFile(FILE_B).stageOnly().commit(); // `C` commit - table.newAppend() - .appendFile(FILE_C) - .commit(); + table.newAppend().appendFile(FILE_C).commit(); - DeleteReachableFiles.Result result = sparkActions().deleteReachableFiles(metadataLocation(table)) - .io(table.io()) - .execute(); + DeleteReachableFiles.Result result = + sparkActions().deleteReachableFiles(metadataLocation(table)).io(table.io()).execute(); checkRemoveFilesResults(3L, 3L, 3L, 5, result); } @Test public void testRemoveFileActionOnEmptyTable() { - DeleteReachableFiles.Result result = sparkActions().deleteReachableFiles(metadataLocation(table)) - .io(table.io()) - .execute(); + DeleteReachableFiles.Result result = + sparkActions().deleteReachableFiles(metadataLocation(table)).io(table.io()).execute(); checkRemoveFilesResults(0, 0, 0, 2, result); } @Test public void testRemoveFilesActionWithReducedVersionsTable() { - table.updateProperties() - .set(TableProperties.METADATA_PREVIOUS_VERSIONS_MAX, "2").commit(); - table.newAppend() - .appendFile(FILE_A) - .commit(); - - table.newAppend() - .appendFile(FILE_B) - .commit(); - - table.newAppend() - .appendFile(FILE_B) - .commit(); - - table.newAppend() - .appendFile(FILE_C) - .commit(); - - table.newAppend() - .appendFile(FILE_D) - .commit(); - - DeleteReachableFiles baseRemoveFilesSparkAction = sparkActions() - .deleteReachableFiles(metadataLocation(table)) - .io(table.io()); + table.updateProperties().set(TableProperties.METADATA_PREVIOUS_VERSIONS_MAX, "2").commit(); + table.newAppend().appendFile(FILE_A).commit(); + + table.newAppend().appendFile(FILE_B).commit(); + + table.newAppend().appendFile(FILE_B).commit(); + + table.newAppend().appendFile(FILE_C).commit(); + + table.newAppend().appendFile(FILE_D).commit(); + + DeleteReachableFiles baseRemoveFilesSparkAction = + sparkActions().deleteReachableFiles(metadataLocation(table)).io(table.io()); DeleteReachableFiles.Result result = baseRemoveFilesSparkAction.execute(); checkRemoveFilesResults(4, 5, 5, 8, result); @@ -233,113 +228,101 @@ public void testRemoveFilesActionWithReducedVersionsTable() { @Test public void testRemoveFilesAction() { - table.newAppend() - .appendFile(FILE_A) - .commit(); - - table.newAppend() - .appendFile(FILE_B) - .commit(); - - DeleteReachableFiles baseRemoveFilesSparkAction = sparkActions() - .deleteReachableFiles(metadataLocation(table)) - .io(table.io()); - checkRemoveFilesResults(2, 2, 2, 4, baseRemoveFilesSparkAction.execute()); + table.newAppend().appendFile(FILE_A).commit(); + + table.newAppend().appendFile(FILE_B).commit(); + + DeleteReachableFiles baseRemoveFilesSparkAction = + sparkActions().deleteReachableFiles(metadataLocation(table)).io(table.io()); + checkRemoveFilesResults(2, 2, 2, 4, baseRemoveFilesSparkAction.execute()); } @Test public void testRemoveFilesActionWithDefaultIO() { - table.newAppend() - .appendFile(FILE_A) - .commit(); + table.newAppend().appendFile(FILE_A).commit(); - table.newAppend() - .appendFile(FILE_B) - .commit(); + table.newAppend().appendFile(FILE_B).commit(); // IO not set explicitly on removeReachableFiles action // IO defaults to HadoopFileIO - DeleteReachableFiles baseRemoveFilesSparkAction = 
sparkActions() - .deleteReachableFiles(metadataLocation(table)); - checkRemoveFilesResults(2, 2, 2, 4, baseRemoveFilesSparkAction.execute()); + DeleteReachableFiles baseRemoveFilesSparkAction = + sparkActions().deleteReachableFiles(metadataLocation(table)); + checkRemoveFilesResults(2, 2, 2, 4, baseRemoveFilesSparkAction.execute()); } @Test public void testUseLocalIterator() { - table.newFastAppend() - .appendFile(FILE_A) - .commit(); + table.newFastAppend().appendFile(FILE_A).commit(); - table.newOverwrite() - .deleteFile(FILE_A) - .addFile(FILE_B) - .commit(); + table.newOverwrite().deleteFile(FILE_A).addFile(FILE_B).commit(); - table.newFastAppend() - .appendFile(FILE_C) - .commit(); + table.newFastAppend().appendFile(FILE_C).commit(); int jobsBefore = spark.sparkContext().dagScheduler().nextJobId().get(); - withSQLConf(ImmutableMap.of("spark.sql.adaptive.enabled", "false"), () -> { - DeleteReachableFiles.Result results = sparkActions().deleteReachableFiles(metadataLocation(table)) - .io(table.io()) - .option("stream-results", "true").execute(); - - int jobsAfter = spark.sparkContext().dagScheduler().nextJobId().get(); - int totalJobsRun = jobsAfter - jobsBefore; - - checkRemoveFilesResults(3L, 4L, 3L, 5, results); - - Assert.assertEquals( - "Expected total jobs to be equal to total number of shuffle partitions", - totalJobsRun, SHUFFLE_PARTITIONS); - }); + withSQLConf( + ImmutableMap.of("spark.sql.adaptive.enabled", "false"), + () -> { + DeleteReachableFiles.Result results = + sparkActions() + .deleteReachableFiles(metadataLocation(table)) + .io(table.io()) + .option("stream-results", "true") + .execute(); + + int jobsAfter = spark.sparkContext().dagScheduler().nextJobId().get(); + int totalJobsRun = jobsAfter - jobsBefore; + + checkRemoveFilesResults(3L, 4L, 3L, 5, results); + + Assert.assertEquals( + "Expected total jobs to be equal to total number of shuffle partitions", + totalJobsRun, + SHUFFLE_PARTITIONS); + }); } @Test public void testIgnoreMetadataFilesNotFound() { - table.updateProperties() - .set(TableProperties.METADATA_PREVIOUS_VERSIONS_MAX, "1").commit(); + table.updateProperties().set(TableProperties.METADATA_PREVIOUS_VERSIONS_MAX, "1").commit(); - table.newAppend() - .appendFile(FILE_A) - .commit(); + table.newAppend().appendFile(FILE_A).commit(); // There are three metadata json files at this point DeleteOrphanFiles.Result result = sparkActions().deleteOrphanFiles(table).olderThan(System.currentTimeMillis()).execute(); Assert.assertEquals("Should delete 1 file", 1, Iterables.size(result.orphanFileLocations())); - Assert.assertTrue("Should remove v1 file", + Assert.assertTrue( + "Should remove v1 file", StreamSupport.stream(result.orphanFileLocations().spliterator(), false) .anyMatch(file -> file.contains("v1.metadata.json"))); - DeleteReachableFiles baseRemoveFilesSparkAction = sparkActions() - .deleteReachableFiles(metadataLocation(table)) - .io(table.io()); + DeleteReachableFiles baseRemoveFilesSparkAction = + sparkActions().deleteReachableFiles(metadataLocation(table)).io(table.io()); DeleteReachableFiles.Result res = baseRemoveFilesSparkAction.execute(); - checkRemoveFilesResults(1, 1, 1, 4, res); + checkRemoveFilesResults(1, 1, 1, 4, res); } @Test public void testEmptyIOThrowsException() { - DeleteReachableFiles baseRemoveFilesSparkAction = sparkActions() - .deleteReachableFiles(metadataLocation(table)) - .io(null); - AssertHelpers.assertThrows("FileIO can't be null in DeleteReachableFiles action", - IllegalArgumentException.class, "File IO cannot be null", + 
DeleteReachableFiles baseRemoveFilesSparkAction = + sparkActions().deleteReachableFiles(metadataLocation(table)).io(null); + AssertHelpers.assertThrows( + "FileIO can't be null in DeleteReachableFiles action", + IllegalArgumentException.class, + "File IO cannot be null", baseRemoveFilesSparkAction::execute); } @Test public void testRemoveFilesActionWhenGarbageCollectionDisabled() { - table.updateProperties() - .set(TableProperties.GC_ENABLED, "false") - .commit(); + table.updateProperties().set(TableProperties.GC_ENABLED, "false").commit(); - AssertHelpers.assertThrows("Should complain about removing files when GC is disabled", - ValidationException.class, "Cannot delete files: GC is disabled (deleting files may corrupt other tables)", + AssertHelpers.assertThrows( + "Should complain about removing files when GC is disabled", + ValidationException.class, + "Cannot delete files: GC is disabled (deleting files may corrupt other tables)", () -> sparkActions().deleteReachableFiles(metadataLocation(table)).execute()); } diff --git a/spark/v3.3/spark/src/test/java/org/apache/iceberg/spark/actions/TestExpireSnapshotsAction.java b/spark/v3.3/spark/src/test/java/org/apache/iceberg/spark/actions/TestExpireSnapshotsAction.java index 232a85f9840f..6c6240a3b589 100644 --- a/spark/v3.3/spark/src/test/java/org/apache/iceberg/spark/actions/TestExpireSnapshotsAction.java +++ b/spark/v3.3/spark/src/test/java/org/apache/iceberg/spark/actions/TestExpireSnapshotsAction.java @@ -16,9 +16,10 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.actions; +import static org.apache.iceberg.types.Types.NestedField.optional; + import java.io.File; import java.io.IOException; import java.util.List; @@ -61,60 +62,63 @@ import org.junit.Test; import org.junit.rules.TemporaryFolder; -import static org.apache.iceberg.types.Types.NestedField.optional; - public class TestExpireSnapshotsAction extends SparkTestBase { private static final HadoopTables TABLES = new HadoopTables(new Configuration()); - private static final Schema SCHEMA = new Schema( - optional(1, "c1", Types.IntegerType.get()), - optional(2, "c2", Types.StringType.get()), - optional(3, "c3", Types.StringType.get()) - ); + private static final Schema SCHEMA = + new Schema( + optional(1, "c1", Types.IntegerType.get()), + optional(2, "c2", Types.StringType.get()), + optional(3, "c3", Types.StringType.get())); private static final int SHUFFLE_PARTITIONS = 2; private static final PartitionSpec SPEC = PartitionSpec.builderFor(SCHEMA).identity("c1").build(); - static final DataFile FILE_A = DataFiles.builder(SPEC) - .withPath("/path/to/data-a.parquet") - .withFileSizeInBytes(10) - .withPartitionPath("c1=0") // easy way to set partition data for now - .withRecordCount(1) - .build(); - static final DataFile FILE_B = DataFiles.builder(SPEC) - .withPath("/path/to/data-b.parquet") - .withFileSizeInBytes(10) - .withPartitionPath("c1=1") // easy way to set partition data for now - .withRecordCount(1) - .build(); - static final DataFile FILE_C = DataFiles.builder(SPEC) - .withPath("/path/to/data-c.parquet") - .withFileSizeInBytes(10) - .withPartitionPath("c1=2") // easy way to set partition data for now - .withRecordCount(1) - .build(); - static final DataFile FILE_D = DataFiles.builder(SPEC) - .withPath("/path/to/data-d.parquet") - .withFileSizeInBytes(10) - .withPartitionPath("c1=3") // easy way to set partition data for now - .withRecordCount(1) - .build(); - static final DeleteFile FILE_A_POS_DELETES = 
FileMetadata.deleteFileBuilder(SPEC) - .ofPositionDeletes() - .withPath("/path/to/data-a-pos-deletes.parquet") - .withFileSizeInBytes(10) - .withPartitionPath("c1=0") // easy way to set partition data for now - .withRecordCount(1) - .build(); - static final DeleteFile FILE_A_EQ_DELETES = FileMetadata.deleteFileBuilder(SPEC) - .ofEqualityDeletes() - .withPath("/path/to/data-a-eq-deletes.parquet") - .withFileSizeInBytes(10) - .withPartitionPath("c1=0") // easy way to set partition data for now - .withRecordCount(1) - .build(); - - @Rule - public TemporaryFolder temp = new TemporaryFolder(); + static final DataFile FILE_A = + DataFiles.builder(SPEC) + .withPath("/path/to/data-a.parquet") + .withFileSizeInBytes(10) + .withPartitionPath("c1=0") // easy way to set partition data for now + .withRecordCount(1) + .build(); + static final DataFile FILE_B = + DataFiles.builder(SPEC) + .withPath("/path/to/data-b.parquet") + .withFileSizeInBytes(10) + .withPartitionPath("c1=1") // easy way to set partition data for now + .withRecordCount(1) + .build(); + static final DataFile FILE_C = + DataFiles.builder(SPEC) + .withPath("/path/to/data-c.parquet") + .withFileSizeInBytes(10) + .withPartitionPath("c1=2") // easy way to set partition data for now + .withRecordCount(1) + .build(); + static final DataFile FILE_D = + DataFiles.builder(SPEC) + .withPath("/path/to/data-d.parquet") + .withFileSizeInBytes(10) + .withPartitionPath("c1=3") // easy way to set partition data for now + .withRecordCount(1) + .build(); + static final DeleteFile FILE_A_POS_DELETES = + FileMetadata.deleteFileBuilder(SPEC) + .ofPositionDeletes() + .withPath("/path/to/data-a-pos-deletes.parquet") + .withFileSizeInBytes(10) + .withPartitionPath("c1=0") // easy way to set partition data for now + .withRecordCount(1) + .build(); + static final DeleteFile FILE_A_EQ_DELETES = + FileMetadata.deleteFileBuilder(SPEC) + .ofEqualityDeletes() + .withPath("/path/to/data-a-eq-deletes.parquet") + .withFileSizeInBytes(10) + .withPartitionPath("c1=0") // easy way to set partition data for now + .withRecordCount(1) + .build(); + + @Rule public TemporaryFolder temp = new TemporaryFolder(); private File tableDir; private String tableLocation; @@ -140,42 +144,51 @@ private Long rightAfterSnapshot(long snapshotId) { return end; } - private void checkExpirationResults(long expectedDatafiles, long expectedPosDeleteFiles, long expectedEqDeleteFiles, - long expectedManifestsDeleted, long expectedManifestListsDeleted, - ExpireSnapshots.Result results) { - - Assert.assertEquals("Incorrect number of manifest files deleted", - expectedManifestsDeleted, results.deletedManifestsCount()); - Assert.assertEquals("Incorrect number of datafiles deleted", - expectedDatafiles, results.deletedDataFilesCount()); - Assert.assertEquals("Incorrect number of pos deletefiles deleted", - expectedPosDeleteFiles, results.deletedPositionDeleteFilesCount()); - Assert.assertEquals("Incorrect number of eq deletefiles deleted", - expectedEqDeleteFiles, results.deletedEqualityDeleteFilesCount()); - Assert.assertEquals("Incorrect number of manifest lists deleted", - expectedManifestListsDeleted, results.deletedManifestListsCount()); + private void checkExpirationResults( + long expectedDatafiles, + long expectedPosDeleteFiles, + long expectedEqDeleteFiles, + long expectedManifestsDeleted, + long expectedManifestListsDeleted, + ExpireSnapshots.Result results) { + + Assert.assertEquals( + "Incorrect number of manifest files deleted", + expectedManifestsDeleted, + 
results.deletedManifestsCount()); + Assert.assertEquals( + "Incorrect number of datafiles deleted", + expectedDatafiles, + results.deletedDataFilesCount()); + Assert.assertEquals( + "Incorrect number of pos deletefiles deleted", + expectedPosDeleteFiles, + results.deletedPositionDeleteFilesCount()); + Assert.assertEquals( + "Incorrect number of eq deletefiles deleted", + expectedEqDeleteFiles, + results.deletedEqualityDeleteFilesCount()); + Assert.assertEquals( + "Incorrect number of manifest lists deleted", + expectedManifestListsDeleted, + results.deletedManifestListsCount()); } @Test public void testFilesCleaned() throws Exception { - table.newFastAppend() - .appendFile(FILE_A) - .commit(); + table.newFastAppend().appendFile(FILE_A).commit(); - table.newOverwrite() - .deleteFile(FILE_A) - .addFile(FILE_B) - .commit(); + table.newOverwrite().deleteFile(FILE_A).addFile(FILE_B).commit(); - table.newFastAppend() - .appendFile(FILE_C) - .commit(); + table.newFastAppend().appendFile(FILE_C).commit(); long end = rightAfterSnapshot(); - ExpireSnapshots.Result results = SparkActions.get().expireSnapshots(table).expireOlderThan(end).execute(); + ExpireSnapshots.Result results = + SparkActions.get().expireSnapshots(table).expireOlderThan(end).execute(); - Assert.assertEquals("Table does not have 1 snapshot after expiration", 1, Iterables.size(table.snapshots())); + Assert.assertEquals( + "Table does not have 1 snapshot after expiration", 1, Iterables.size(table.snapshots())); checkExpirationResults(1L, 0L, 0L, 1L, 2L, results); } @@ -183,21 +196,13 @@ public void testFilesCleaned() throws Exception { @Test public void dataFilesCleanupWithParallelTasks() throws IOException { - table.newFastAppend() - .appendFile(FILE_A) - .commit(); + table.newFastAppend().appendFile(FILE_A).commit(); - table.newFastAppend() - .appendFile(FILE_B) - .commit(); + table.newFastAppend().appendFile(FILE_B).commit(); - table.newRewrite() - .rewriteFiles(ImmutableSet.of(FILE_B), ImmutableSet.of(FILE_D)) - .commit(); + table.newRewrite().rewriteFiles(ImmutableSet.of(FILE_B), ImmutableSet.of(FILE_D)).commit(); - table.newRewrite() - .rewriteFiles(ImmutableSet.of(FILE_A), ImmutableSet.of(FILE_C)) - .commit(); + table.newRewrite().rewriteFiles(ImmutableSet.of(FILE_A), ImmutableSet.of(FILE_C)).commit(); long t4 = rightAfterSnapshot(); @@ -205,23 +210,33 @@ public void dataFilesCleanupWithParallelTasks() throws IOException { Set deleteThreads = ConcurrentHashMap.newKeySet(); AtomicInteger deleteThreadsIndex = new AtomicInteger(0); - ExpireSnapshots.Result result = SparkActions.get().expireSnapshots(table) - .executeDeleteWith(Executors.newFixedThreadPool(4, runnable -> { - Thread thread = new Thread(runnable); - thread.setName("remove-snapshot-" + deleteThreadsIndex.getAndIncrement()); - thread.setDaemon(true); // daemon threads will be terminated abruptly when the JVM exits - return thread; - })) - .expireOlderThan(t4) - .deleteWith(s -> { - deleteThreads.add(Thread.currentThread().getName()); - deletedFiles.add(s); - }) - .execute(); - - // Verifies that the delete methods ran in the threads created by the provided ExecutorService ThreadFactory - Assert.assertEquals(deleteThreads, - Sets.newHashSet("remove-snapshot-0", "remove-snapshot-1", "remove-snapshot-2", "remove-snapshot-3")); + ExpireSnapshots.Result result = + SparkActions.get() + .expireSnapshots(table) + .executeDeleteWith( + Executors.newFixedThreadPool( + 4, + runnable -> { + Thread thread = new Thread(runnable); + thread.setName("remove-snapshot-" + 
deleteThreadsIndex.getAndIncrement()); + thread.setDaemon( + true); // daemon threads will be terminated abruptly when the JVM exits + return thread; + })) + .expireOlderThan(t4) + .deleteWith( + s -> { + deleteThreads.add(Thread.currentThread().getName()); + deletedFiles.add(s); + }) + .execute(); + + // Verifies that the delete methods ran in the threads created by the provided ExecutorService + // ThreadFactory + Assert.assertEquals( + deleteThreads, + Sets.newHashSet( + "remove-snapshot-0", "remove-snapshot-1", "remove-snapshot-2", "remove-snapshot-3")); Assert.assertTrue("FILE_A should be deleted", deletedFiles.contains(FILE_A.path().toString())); Assert.assertTrue("FILE_B should be deleted", deletedFiles.contains(FILE_B.path().toString())); @@ -231,9 +246,7 @@ public void dataFilesCleanupWithParallelTasks() throws IOException { @Test public void testNoFilesDeletedWhenNoSnapshotsExpired() throws Exception { - table.newFastAppend() - .appendFile(FILE_A) - .commit(); + table.newFastAppend().appendFile(FILE_A).commit(); ExpireSnapshots.Result results = SparkActions.get().expireSnapshots(table).execute(); checkExpirationResults(0L, 0L, 0L, 0L, 0L, results); @@ -241,30 +254,24 @@ public void testNoFilesDeletedWhenNoSnapshotsExpired() throws Exception { @Test public void testCleanupRepeatedOverwrites() throws Exception { - table.newFastAppend() - .appendFile(FILE_A) - .commit(); + table.newFastAppend().appendFile(FILE_A).commit(); for (int i = 0; i < 10; i++) { - table.newOverwrite() - .deleteFile(FILE_A) - .addFile(FILE_B) - .commit(); - - table.newOverwrite() - .deleteFile(FILE_B) - .addFile(FILE_A) - .commit(); + table.newOverwrite().deleteFile(FILE_A).addFile(FILE_B).commit(); + + table.newOverwrite().deleteFile(FILE_B).addFile(FILE_A).commit(); } long end = rightAfterSnapshot(); - ExpireSnapshots.Result results = SparkActions.get().expireSnapshots(table).expireOlderThan(end).execute(); + ExpireSnapshots.Result results = + SparkActions.get().expireSnapshots(table).expireOlderThan(end).execute(); checkExpirationResults(1L, 0L, 0L, 39L, 20L, results); } @Test public void testRetainLastWithExpireOlderThan() { - table.newAppend() + table + .newAppend() .appendFile(FILE_A) // data_bucket=0 .commit(); long firstSnapshotId = table.currentSnapshot().snapshotId(); @@ -273,217 +280,256 @@ public void testRetainLastWithExpireOlderThan() { t1 = System.currentTimeMillis(); } - table.newAppend() + table + .newAppend() .appendFile(FILE_B) // data_bucket=1 .commit(); - table.newAppend() + table + .newAppend() .appendFile(FILE_C) // data_bucket=2 .commit(); long t3 = rightAfterSnapshot(); // Retain last 2 snapshots - SparkActions.get().expireSnapshots(table) - .expireOlderThan(t3) - .retainLast(2) - .execute(); + SparkActions.get().expireSnapshots(table).expireOlderThan(t3).retainLast(2).execute(); - Assert.assertEquals("Should have two snapshots.", 2, Lists.newArrayList(table.snapshots()).size()); - Assert.assertEquals("First snapshot should not present.", null, table.snapshot(firstSnapshotId)); + Assert.assertEquals( + "Should have two snapshots.", 2, Lists.newArrayList(table.snapshots()).size()); + Assert.assertEquals( + "First snapshot should not present.", null, table.snapshot(firstSnapshotId)); } @Test public void testExpireTwoSnapshotsById() throws Exception { - table.newAppend() + table + .newAppend() .appendFile(FILE_A) // data_bucket=0 .commit(); long firstSnapshotId = table.currentSnapshot().snapshotId(); - table.newAppend() + table + .newAppend() .appendFile(FILE_B) // data_bucket=1 
.commit(); long secondSnapshotID = table.currentSnapshot().snapshotId(); - table.newAppend() + table + .newAppend() .appendFile(FILE_C) // data_bucket=2 .commit(); // Retain last 2 snapshots - ExpireSnapshots.Result result = SparkActions.get().expireSnapshots(table) - .expireSnapshotId(firstSnapshotId) - .expireSnapshotId(secondSnapshotID) - .execute(); - - Assert.assertEquals("Should have one snapshots.", 1, Lists.newArrayList(table.snapshots()).size()); - Assert.assertEquals("First snapshot should not present.", null, table.snapshot(firstSnapshotId)); - Assert.assertEquals("Second snapshot should not be present.", null, table.snapshot(secondSnapshotID)); + ExpireSnapshots.Result result = + SparkActions.get() + .expireSnapshots(table) + .expireSnapshotId(firstSnapshotId) + .expireSnapshotId(secondSnapshotID) + .execute(); + + Assert.assertEquals( + "Should have one snapshots.", 1, Lists.newArrayList(table.snapshots()).size()); + Assert.assertEquals( + "First snapshot should not present.", null, table.snapshot(firstSnapshotId)); + Assert.assertEquals( + "Second snapshot should not be present.", null, table.snapshot(secondSnapshotID)); checkExpirationResults(0L, 0L, 0L, 0L, 2L, result); } @Test public void testRetainLastWithExpireById() { - table.newAppend() + table + .newAppend() .appendFile(FILE_A) // data_bucket=0 .commit(); long firstSnapshotId = table.currentSnapshot().snapshotId(); - table.newAppend() + table + .newAppend() .appendFile(FILE_B) // data_bucket=1 .commit(); - table.newAppend() + table + .newAppend() .appendFile(FILE_C) // data_bucket=2 .commit(); // Retain last 3 snapshots, but explicitly remove the first snapshot - ExpireSnapshots.Result result = SparkActions.get().expireSnapshots(table) - .expireSnapshotId(firstSnapshotId) - .retainLast(3) - .execute(); - - Assert.assertEquals("Should have two snapshots.", 2, Lists.newArrayList(table.snapshots()).size()); - Assert.assertEquals("First snapshot should not present.", null, table.snapshot(firstSnapshotId)); + ExpireSnapshots.Result result = + SparkActions.get() + .expireSnapshots(table) + .expireSnapshotId(firstSnapshotId) + .retainLast(3) + .execute(); + + Assert.assertEquals( + "Should have two snapshots.", 2, Lists.newArrayList(table.snapshots()).size()); + Assert.assertEquals( + "First snapshot should not present.", null, table.snapshot(firstSnapshotId)); checkExpirationResults(0L, 0L, 0L, 0L, 1L, result); } @Test public void testRetainLastWithTooFewSnapshots() { - table.newAppend() + table + .newAppend() .appendFile(FILE_A) // data_bucket=0 .appendFile(FILE_B) // data_bucket=1 .commit(); long firstSnapshotId = table.currentSnapshot().snapshotId(); - table.newAppend() + table + .newAppend() .appendFile(FILE_C) // data_bucket=2 .commit(); long t2 = rightAfterSnapshot(); // Retain last 3 snapshots - ExpireSnapshots.Result result = SparkActions.get().expireSnapshots(table) - .expireOlderThan(t2) - .retainLast(3) - .execute(); - - Assert.assertEquals("Should have two snapshots", 2, Lists.newArrayList(table.snapshots()).size()); - Assert.assertEquals("First snapshot should still present", - firstSnapshotId, table.snapshot(firstSnapshotId).snapshotId()); + ExpireSnapshots.Result result = + SparkActions.get().expireSnapshots(table).expireOlderThan(t2).retainLast(3).execute(); + + Assert.assertEquals( + "Should have two snapshots", 2, Lists.newArrayList(table.snapshots()).size()); + Assert.assertEquals( + "First snapshot should still present", + firstSnapshotId, + table.snapshot(firstSnapshotId).snapshotId()); 
checkExpirationResults(0L, 0L, 0L, 0L, 0L, result); } @Test public void testRetainLastKeepsExpiringSnapshot() { - table.newAppend() + table + .newAppend() .appendFile(FILE_A) // data_bucket=0 .commit(); - table.newAppend() + table + .newAppend() .appendFile(FILE_B) // data_bucket=1 .commit(); Snapshot secondSnapshot = table.currentSnapshot(); - table.newAppend() + table + .newAppend() .appendFile(FILE_C) // data_bucket=2 .commit(); - table.newAppend() + table + .newAppend() .appendFile(FILE_D) // data_bucket=3 .commit(); // Retain last 2 snapshots and expire older than t3 - ExpireSnapshots.Result result = SparkActions.get().expireSnapshots(table) - .expireOlderThan(secondSnapshot.timestampMillis()) - .retainLast(2) - .execute(); - - Assert.assertEquals("Should have three snapshots.", 3, Lists.newArrayList(table.snapshots()).size()); - Assert.assertNotNull("Second snapshot should present.", table.snapshot(secondSnapshot.snapshotId())); + ExpireSnapshots.Result result = + SparkActions.get() + .expireSnapshots(table) + .expireOlderThan(secondSnapshot.timestampMillis()) + .retainLast(2) + .execute(); + + Assert.assertEquals( + "Should have three snapshots.", 3, Lists.newArrayList(table.snapshots()).size()); + Assert.assertNotNull( + "Second snapshot should present.", table.snapshot(secondSnapshot.snapshotId())); checkExpirationResults(0L, 0L, 0L, 0L, 1L, result); } @Test public void testExpireSnapshotsWithDisabledGarbageCollection() { - table.updateProperties() - .set(TableProperties.GC_ENABLED, "false") - .commit(); + table.updateProperties().set(TableProperties.GC_ENABLED, "false").commit(); - table.newAppend() - .appendFile(FILE_A) - .commit(); + table.newAppend().appendFile(FILE_A).commit(); - AssertHelpers.assertThrows("Should complain about expiring snapshots", - ValidationException.class, "Cannot expire snapshots: GC is disabled", + AssertHelpers.assertThrows( + "Should complain about expiring snapshots", + ValidationException.class, + "Cannot expire snapshots: GC is disabled", () -> SparkActions.get().expireSnapshots(table)); } @Test public void testExpireOlderThanMultipleCalls() { - table.newAppend() + table + .newAppend() .appendFile(FILE_A) // data_bucket=0 .commit(); - table.newAppend() + table + .newAppend() .appendFile(FILE_B) // data_bucket=1 .commit(); Snapshot secondSnapshot = table.currentSnapshot(); - table.newAppend() + table + .newAppend() .appendFile(FILE_C) // data_bucket=2 .commit(); Snapshot thirdSnapshot = table.currentSnapshot(); // Retain last 2 snapshots and expire older than t3 - ExpireSnapshots.Result result = SparkActions.get().expireSnapshots(table) - .expireOlderThan(secondSnapshot.timestampMillis()) - .expireOlderThan(thirdSnapshot.timestampMillis()) - .execute(); - - Assert.assertEquals("Should have one snapshots.", 1, Lists.newArrayList(table.snapshots()).size()); - Assert.assertNull("Second snapshot should not present.", table.snapshot(secondSnapshot.snapshotId())); + ExpireSnapshots.Result result = + SparkActions.get() + .expireSnapshots(table) + .expireOlderThan(secondSnapshot.timestampMillis()) + .expireOlderThan(thirdSnapshot.timestampMillis()) + .execute(); + + Assert.assertEquals( + "Should have one snapshots.", 1, Lists.newArrayList(table.snapshots()).size()); + Assert.assertNull( + "Second snapshot should not present.", table.snapshot(secondSnapshot.snapshotId())); checkExpirationResults(0L, 0L, 0L, 0L, 2L, result); } @Test public void testRetainLastMultipleCalls() { - table.newAppend() + table + .newAppend() .appendFile(FILE_A) // data_bucket=0 
.commit(); - table.newAppend() + table + .newAppend() .appendFile(FILE_B) // data_bucket=1 .commit(); Snapshot secondSnapshot = table.currentSnapshot(); - table.newAppend() + table + .newAppend() .appendFile(FILE_C) // data_bucket=2 .commit(); long t3 = rightAfterSnapshot(); // Retain last 2 snapshots and expire older than t3 - ExpireSnapshots.Result result = SparkActions.get().expireSnapshots(table) - .expireOlderThan(t3) - .retainLast(2) - .retainLast(1) - .execute(); - - Assert.assertEquals("Should have one snapshots.", 1, Lists.newArrayList(table.snapshots()).size()); - Assert.assertNull("Second snapshot should not present.", table.snapshot(secondSnapshot.snapshotId())); + ExpireSnapshots.Result result = + SparkActions.get() + .expireSnapshots(table) + .expireOlderThan(t3) + .retainLast(2) + .retainLast(1) + .execute(); + + Assert.assertEquals( + "Should have one snapshots.", 1, Lists.newArrayList(table.snapshots()).size()); + Assert.assertNull( + "Second snapshot should not present.", table.snapshot(secondSnapshot.snapshotId())); checkExpirationResults(0L, 0L, 0L, 0L, 2L, result); } @Test public void testRetainZeroSnapshots() { - AssertHelpers.assertThrows("Should fail retain 0 snapshots " + - "because number of snapshots to retain cannot be zero", + AssertHelpers.assertThrows( + "Should fail retain 0 snapshots " + "because number of snapshots to retain cannot be zero", IllegalArgumentException.class, "Number of snapshots to retain must be at least 1, cannot be: 0", () -> SparkActions.get().expireSnapshots(table).retainLast(0).execute()); @@ -491,28 +537,22 @@ public void testRetainZeroSnapshots() { @Test public void testScanExpiredManifestInValidSnapshotAppend() { - table.newAppend() - .appendFile(FILE_A) - .appendFile(FILE_B) - .commit(); + table.newAppend().appendFile(FILE_A).appendFile(FILE_B).commit(); - table.newOverwrite() - .addFile(FILE_C) - .deleteFile(FILE_A) - .commit(); + table.newOverwrite().addFile(FILE_C).deleteFile(FILE_A).commit(); - table.newAppend() - .appendFile(FILE_D) - .commit(); + table.newAppend().appendFile(FILE_D).commit(); long t3 = rightAfterSnapshot(); Set deletedFiles = Sets.newHashSet(); - ExpireSnapshots.Result result = SparkActions.get().expireSnapshots(table) - .expireOlderThan(t3) - .deleteWith(deletedFiles::add) - .execute(); + ExpireSnapshots.Result result = + SparkActions.get() + .expireSnapshots(table) + .expireOlderThan(t3) + .deleteWith(deletedFiles::add) + .execute(); Assert.assertTrue("FILE_A should be deleted", deletedFiles.contains(FILE_A.path().toString())); checkExpirationResults(1L, 0L, 0L, 1L, 2L, result); @@ -520,72 +560,61 @@ public void testScanExpiredManifestInValidSnapshotAppend() { @Test public void testScanExpiredManifestInValidSnapshotFastAppend() { - table.updateProperties() + table + .updateProperties() .set(TableProperties.MANIFEST_MERGE_ENABLED, "true") .set(TableProperties.MANIFEST_MIN_MERGE_COUNT, "1") .commit(); - table.newAppend() - .appendFile(FILE_A) - .appendFile(FILE_B) - .commit(); + table.newAppend().appendFile(FILE_A).appendFile(FILE_B).commit(); - table.newOverwrite() - .addFile(FILE_C) - .deleteFile(FILE_A) - .commit(); + table.newOverwrite().addFile(FILE_C).deleteFile(FILE_A).commit(); - table.newFastAppend() - .appendFile(FILE_D) - .commit(); + table.newFastAppend().appendFile(FILE_D).commit(); long t3 = rightAfterSnapshot(); Set deletedFiles = Sets.newHashSet(); - ExpireSnapshots.Result result = SparkActions.get().expireSnapshots(table) - .expireOlderThan(t3) - .deleteWith(deletedFiles::add) - .execute(); + 
ExpireSnapshots.Result result = + SparkActions.get() + .expireSnapshots(table) + .expireOlderThan(t3) + .deleteWith(deletedFiles::add) + .execute(); Assert.assertTrue("FILE_A should be deleted", deletedFiles.contains(FILE_A.path().toString())); checkExpirationResults(1L, 0L, 0L, 1L, 2L, result); } /** - * Test on table below, and expiring the staged commit `B` using `expireOlderThan` API. - * Table: A - C - * ` B (staged) + * Test on table below, and expiring the staged commit `B` using `expireOlderThan` API. Table: A - + * C ` B (staged) */ @Test public void testWithExpiringDanglingStageCommit() { // `A` commit - table.newAppend() - .appendFile(FILE_A) - .commit(); + table.newAppend().appendFile(FILE_A).commit(); // `B` staged commit - table.newAppend() - .appendFile(FILE_B) - .stageOnly() - .commit(); + table.newAppend().appendFile(FILE_B).stageOnly().commit(); TableMetadata base = ((BaseTable) table).operations().current(); Snapshot snapshotA = base.snapshots().get(0); Snapshot snapshotB = base.snapshots().get(1); // `C` commit - table.newAppend() - .appendFile(FILE_C) - .commit(); + table.newAppend().appendFile(FILE_C).commit(); Set deletedFiles = Sets.newHashSet(); // Expire all commits including dangling staged snapshot. - ExpireSnapshots.Result result = SparkActions.get().expireSnapshots(table) - .deleteWith(deletedFiles::add) - .expireOlderThan(snapshotB.timestampMillis() + 1) - .execute(); + ExpireSnapshots.Result result = + SparkActions.get() + .expireSnapshots(table) + .deleteWith(deletedFiles::add) + .expireOlderThan(snapshotB.timestampMillis() + 1) + .execute(); checkExpirationResults(1L, 0L, 0L, 1L, 2L, result); @@ -593,122 +622,107 @@ public void testWithExpiringDanglingStageCommit() { expectedDeletes.add(snapshotA.manifestListLocation()); // Files should be deleted of dangling staged snapshot - snapshotB.addedDataFiles(table.io()).forEach(i -> { - expectedDeletes.add(i.path().toString()); - }); + snapshotB + .addedDataFiles(table.io()) + .forEach( + i -> { + expectedDeletes.add(i.path().toString()); + }); // ManifestList should be deleted too expectedDeletes.add(snapshotB.manifestListLocation()); - snapshotB.dataManifests(table.io()).forEach(file -> { - // Only the manifest of B should be deleted. - if (file.snapshotId() == snapshotB.snapshotId()) { - expectedDeletes.add(file.path()); - } - }); - Assert.assertSame("Files deleted count should be expected", expectedDeletes.size(), deletedFiles.size()); + snapshotB + .dataManifests(table.io()) + .forEach( + file -> { + // Only the manifest of B should be deleted. 
+ if (file.snapshotId() == snapshotB.snapshotId()) { + expectedDeletes.add(file.path()); + } + }); + Assert.assertSame( + "Files deleted count should be expected", expectedDeletes.size(), deletedFiles.size()); // Take the diff expectedDeletes.removeAll(deletedFiles); Assert.assertTrue("Exactly same files should be deleted", expectedDeletes.isEmpty()); } /** - * Expire cherry-pick the commit as shown below, when `B` is in table's current state - * Table: - * A - B - C <--current snapshot - * `- D (source=B) + * Expire cherry-pick the commit as shown below, when `B` is in table's current state Table: A - B + * - C <--current snapshot `- D (source=B) */ @Test public void testWithCherryPickTableSnapshot() { // `A` commit - table.newAppend() - .appendFile(FILE_A) - .commit(); + table.newAppend().appendFile(FILE_A).commit(); Snapshot snapshotA = table.currentSnapshot(); // `B` commit Set deletedAFiles = Sets.newHashSet(); - table.newOverwrite() - .addFile(FILE_B) - .deleteFile(FILE_A) - .deleteWith(deletedAFiles::add) - .commit(); + table.newOverwrite().addFile(FILE_B).deleteFile(FILE_A).deleteWith(deletedAFiles::add).commit(); Assert.assertTrue("No files should be physically deleted", deletedAFiles.isEmpty()); // pick the snapshot 'B` Snapshot snapshotB = table.currentSnapshot(); // `C` commit to let cherry-pick take effect, and avoid fast-forward of `B` with cherry-pick - table.newAppend() - .appendFile(FILE_C) - .commit(); + table.newAppend().appendFile(FILE_C).commit(); Snapshot snapshotC = table.currentSnapshot(); // Move the table back to `A` - table.manageSnapshots() - .setCurrentSnapshot(snapshotA.snapshotId()) - .commit(); + table.manageSnapshots().setCurrentSnapshot(snapshotA.snapshotId()).commit(); // Generate A -> `D (B)` - table.manageSnapshots() - .cherrypick(snapshotB.snapshotId()) - .commit(); + table.manageSnapshots().cherrypick(snapshotB.snapshotId()).commit(); Snapshot snapshotD = table.currentSnapshot(); // Move the table back to `C` - table.manageSnapshots() - .setCurrentSnapshot(snapshotC.snapshotId()) - .commit(); + table.manageSnapshots().setCurrentSnapshot(snapshotC.snapshotId()).commit(); List deletedFiles = Lists.newArrayList(); // Expire `C` - ExpireSnapshots.Result result = SparkActions.get().expireSnapshots(table) - .deleteWith(deletedFiles::add) - .expireOlderThan(snapshotC.timestampMillis() + 1) - .execute(); + ExpireSnapshots.Result result = + SparkActions.get() + .expireSnapshots(table) + .deleteWith(deletedFiles::add) + .expireOlderThan(snapshotC.timestampMillis() + 1) + .execute(); // Make sure no dataFiles are deleted for the B, C, D snapshot - Lists.newArrayList(snapshotB, snapshotC, snapshotD).forEach(i -> { - i.addedDataFiles(table.io()).forEach(item -> { - Assert.assertFalse(deletedFiles.contains(item.path().toString())); - }); - }); + Lists.newArrayList(snapshotB, snapshotC, snapshotD) + .forEach( + i -> { + i.addedDataFiles(table.io()) + .forEach( + item -> { + Assert.assertFalse(deletedFiles.contains(item.path().toString())); + }); + }); checkExpirationResults(1L, 0L, 0L, 2L, 2L, result); } /** - * Test on table below, and expiring `B` which is not in current table state. - * 1) Expire `B` - * 2) All commit - * Table: A - C - D (B) - * ` B (staged) + * Test on table below, and expiring `B` which is not in current table state. 
1) Expire `B` 2) All + * commit Table: A - C - D (B) ` B (staged) */ @Test public void testWithExpiringStagedThenCherrypick() { // `A` commit - table.newAppend() - .appendFile(FILE_A) - .commit(); + table.newAppend().appendFile(FILE_A).commit(); // `B` commit - table.newAppend() - .appendFile(FILE_B) - .stageOnly() - .commit(); + table.newAppend().appendFile(FILE_B).stageOnly().commit(); // pick the snapshot that's staged but not committed TableMetadata base = ((BaseTable) table).operations().current(); Snapshot snapshotB = base.snapshots().get(1); // `C` commit to let cherry-pick take effect, and avoid fast-forward of `B` with cherry-pick - table.newAppend() - .appendFile(FILE_C) - .commit(); + table.newAppend().appendFile(FILE_C).commit(); // `D (B)` cherry-pick commit - table.manageSnapshots() - .cherrypick(snapshotB.snapshotId()) - .commit(); + table.manageSnapshots().cherrypick(snapshotB.snapshotId()).commit(); base = ((BaseTable) table).operations().current(); Snapshot snapshotD = base.snapshots().get(3); @@ -716,47 +730,55 @@ public void testWithExpiringStagedThenCherrypick() { List deletedFiles = Lists.newArrayList(); // Expire `B` commit. - ExpireSnapshots.Result firstResult = SparkActions.get().expireSnapshots(table) - .deleteWith(deletedFiles::add) - .expireSnapshotId(snapshotB.snapshotId()) - .execute(); + ExpireSnapshots.Result firstResult = + SparkActions.get() + .expireSnapshots(table) + .deleteWith(deletedFiles::add) + .expireSnapshotId(snapshotB.snapshotId()) + .execute(); // Make sure no dataFiles are deleted for the staged snapshot - Lists.newArrayList(snapshotB).forEach(i -> { - i.addedDataFiles(table.io()).forEach(item -> { - Assert.assertFalse(deletedFiles.contains(item.path().toString())); - }); - }); + Lists.newArrayList(snapshotB) + .forEach( + i -> { + i.addedDataFiles(table.io()) + .forEach( + item -> { + Assert.assertFalse(deletedFiles.contains(item.path().toString())); + }); + }); checkExpirationResults(0L, 0L, 0L, 1L, 1L, firstResult); // Expire all snapshots including cherry-pick - ExpireSnapshots.Result secondResult = SparkActions.get().expireSnapshots(table) - .deleteWith(deletedFiles::add) - .expireOlderThan(table.currentSnapshot().timestampMillis() + 1) - .execute(); + ExpireSnapshots.Result secondResult = + SparkActions.get() + .expireSnapshots(table) + .deleteWith(deletedFiles::add) + .expireOlderThan(table.currentSnapshot().timestampMillis() + 1) + .execute(); // Make sure no dataFiles are deleted for the staged and cherry-pick - Lists.newArrayList(snapshotB, snapshotD).forEach(i -> { - i.addedDataFiles(table.io()).forEach(item -> { - Assert.assertFalse(deletedFiles.contains(item.path().toString())); - }); - }); - checkExpirationResults(0L, 0L, 0L, 0L, 2L, secondResult); + Lists.newArrayList(snapshotB, snapshotD) + .forEach( + i -> { + i.addedDataFiles(table.io()) + .forEach( + item -> { + Assert.assertFalse(deletedFiles.contains(item.path().toString())); + }); + }); + checkExpirationResults(0L, 0L, 0L, 0L, 2L, secondResult); } @Test public void testExpireOlderThan() { - table.newAppend() - .appendFile(FILE_A) - .commit(); + table.newAppend().appendFile(FILE_A).commit(); Snapshot firstSnapshot = table.currentSnapshot(); rightAfterSnapshot(); - table.newAppend() - .appendFile(FILE_B) - .commit(); + table.newAppend().appendFile(FILE_B).commit(); long snapshotId = table.currentSnapshot().snapshotId(); @@ -764,42 +786,46 @@ public void testExpireOlderThan() { Set deletedFiles = Sets.newHashSet(); - ExpireSnapshots.Result result = 
SparkActions.get().expireSnapshots(table) - .expireOlderThan(tAfterCommits) - .deleteWith(deletedFiles::add) - .execute(); - - Assert.assertEquals("Expire should not change current snapshot", snapshotId, table.currentSnapshot().snapshotId()); - Assert.assertNull("Expire should remove the oldest snapshot", table.snapshot(firstSnapshot.snapshotId())); - Assert.assertEquals("Should remove only the expired manifest list location", - Sets.newHashSet(firstSnapshot.manifestListLocation()), deletedFiles); + ExpireSnapshots.Result result = + SparkActions.get() + .expireSnapshots(table) + .expireOlderThan(tAfterCommits) + .deleteWith(deletedFiles::add) + .execute(); + + Assert.assertEquals( + "Expire should not change current snapshot", + snapshotId, + table.currentSnapshot().snapshotId()); + Assert.assertNull( + "Expire should remove the oldest snapshot", table.snapshot(firstSnapshot.snapshotId())); + Assert.assertEquals( + "Should remove only the expired manifest list location", + Sets.newHashSet(firstSnapshot.manifestListLocation()), + deletedFiles); - checkExpirationResults(0, 0, 0, 0, 1, result); + checkExpirationResults(0, 0, 0, 0, 1, result); } @Test public void testExpireOlderThanWithDelete() { - table.newAppend() - .appendFile(FILE_A) - .commit(); + table.newAppend().appendFile(FILE_A).commit(); Snapshot firstSnapshot = table.currentSnapshot(); - Assert.assertEquals("Should create one manifest", - 1, firstSnapshot.allManifests(table.io()).size()); + Assert.assertEquals( + "Should create one manifest", 1, firstSnapshot.allManifests(table.io()).size()); rightAfterSnapshot(); - table.newDelete() - .deleteFile(FILE_A) - .commit(); + table.newDelete().deleteFile(FILE_A).commit(); Snapshot secondSnapshot = table.currentSnapshot(); - Assert.assertEquals("Should create replace manifest with a rewritten manifest", - 1, secondSnapshot.allManifests(table.io()).size()); + Assert.assertEquals( + "Should create replace manifest with a rewritten manifest", + 1, + secondSnapshot.allManifests(table.io()).size()); - table.newAppend() - .appendFile(FILE_B) - .commit(); + table.newAppend().appendFile(FILE_B).commit(); rightAfterSnapshot(); @@ -809,21 +835,36 @@ public void testExpireOlderThanWithDelete() { Set deletedFiles = Sets.newHashSet(); - ExpireSnapshots.Result result = SparkActions.get().expireSnapshots(table) - .expireOlderThan(tAfterCommits) - .deleteWith(deletedFiles::add) - .execute(); - - Assert.assertEquals("Expire should not change current snapshot", snapshotId, table.currentSnapshot().snapshotId()); - Assert.assertNull("Expire should remove the oldest snapshot", table.snapshot(firstSnapshot.snapshotId())); - Assert.assertNull("Expire should remove the second oldest snapshot", table.snapshot(secondSnapshot.snapshotId())); - - Assert.assertEquals("Should remove expired manifest lists and deleted data file", + ExpireSnapshots.Result result = + SparkActions.get() + .expireSnapshots(table) + .expireOlderThan(tAfterCommits) + .deleteWith(deletedFiles::add) + .execute(); + + Assert.assertEquals( + "Expire should not change current snapshot", + snapshotId, + table.currentSnapshot().snapshotId()); + Assert.assertNull( + "Expire should remove the oldest snapshot", table.snapshot(firstSnapshot.snapshotId())); + Assert.assertNull( + "Expire should remove the second oldest snapshot", + table.snapshot(secondSnapshot.snapshotId())); + + Assert.assertEquals( + "Should remove expired manifest lists and deleted data file", Sets.newHashSet( firstSnapshot.manifestListLocation(), // snapshot expired - 
firstSnapshot.allManifests(table.io()).get(0).path(), // manifest was rewritten for delete + firstSnapshot + .allManifests(table.io()) + .get(0) + .path(), // manifest was rewritten for delete secondSnapshot.manifestListLocation(), // snapshot expired - secondSnapshot.allManifests(table.io()).get(0).path(), // manifest contained only deletes, was dropped + secondSnapshot + .allManifests(table.io()) + .get(0) + .path(), // manifest contained only deletes, was dropped FILE_A.path()), // deleted deletedFiles); @@ -833,30 +874,29 @@ public void testExpireOlderThanWithDelete() { @Test public void testExpireOlderThanWithDeleteInMergedManifests() { // merge every commit - table.updateProperties() - .set(TableProperties.MANIFEST_MIN_MERGE_COUNT, "0") - .commit(); + table.updateProperties().set(TableProperties.MANIFEST_MIN_MERGE_COUNT, "0").commit(); - table.newAppend() - .appendFile(FILE_A) - .appendFile(FILE_B) - .commit(); + table.newAppend().appendFile(FILE_A).appendFile(FILE_B).commit(); Snapshot firstSnapshot = table.currentSnapshot(); - Assert.assertEquals("Should create one manifest", - 1, firstSnapshot.allManifests(table.io()).size()); + Assert.assertEquals( + "Should create one manifest", 1, firstSnapshot.allManifests(table.io()).size()); rightAfterSnapshot(); - table.newDelete() + table + .newDelete() .deleteFile(FILE_A) // FILE_B is still in the dataset .commit(); Snapshot secondSnapshot = table.currentSnapshot(); - Assert.assertEquals("Should replace manifest with a rewritten manifest", - 1, secondSnapshot.allManifests(table.io()).size()); + Assert.assertEquals( + "Should replace manifest with a rewritten manifest", + 1, + secondSnapshot.allManifests(table.io()).size()); - table.newFastAppend() // do not merge to keep the last snapshot's manifest valid + table + .newFastAppend() // do not merge to keep the last snapshot's manifest valid .appendFile(FILE_C) .commit(); @@ -868,19 +908,31 @@ public void testExpireOlderThanWithDeleteInMergedManifests() { Set deletedFiles = Sets.newHashSet(); - ExpireSnapshots.Result result = SparkActions.get().expireSnapshots(table) - .expireOlderThan(tAfterCommits) - .deleteWith(deletedFiles::add) - .execute(); - - Assert.assertEquals("Expire should not change current snapshot", snapshotId, table.currentSnapshot().snapshotId()); - Assert.assertNull("Expire should remove the oldest snapshot", table.snapshot(firstSnapshot.snapshotId())); - Assert.assertNull("Expire should remove the second oldest snapshot", table.snapshot(secondSnapshot.snapshotId())); - - Assert.assertEquals("Should remove expired manifest lists and deleted data file", + ExpireSnapshots.Result result = + SparkActions.get() + .expireSnapshots(table) + .expireOlderThan(tAfterCommits) + .deleteWith(deletedFiles::add) + .execute(); + + Assert.assertEquals( + "Expire should not change current snapshot", + snapshotId, + table.currentSnapshot().snapshotId()); + Assert.assertNull( + "Expire should remove the oldest snapshot", table.snapshot(firstSnapshot.snapshotId())); + Assert.assertNull( + "Expire should remove the second oldest snapshot", + table.snapshot(secondSnapshot.snapshotId())); + + Assert.assertEquals( + "Should remove expired manifest lists and deleted data file", Sets.newHashSet( firstSnapshot.manifestListLocation(), // snapshot expired - firstSnapshot.allManifests(table.io()).get(0).path(), // manifest was rewritten for delete + firstSnapshot + .allManifests(table.io()) + .get(0) + .path(), // manifest was rewritten for delete secondSnapshot.manifestListLocation(), // snapshot 
expired FILE_A.path()), // deleted deletedFiles); @@ -891,33 +943,26 @@ public void testExpireOlderThanWithDeleteInMergedManifests() { @Test public void testExpireOlderThanWithRollback() { // merge every commit - table.updateProperties() - .set(TableProperties.MANIFEST_MIN_MERGE_COUNT, "0") - .commit(); + table.updateProperties().set(TableProperties.MANIFEST_MIN_MERGE_COUNT, "0").commit(); - table.newAppend() - .appendFile(FILE_A) - .appendFile(FILE_B) - .commit(); + table.newAppend().appendFile(FILE_A).appendFile(FILE_B).commit(); Snapshot firstSnapshot = table.currentSnapshot(); - Assert.assertEquals("Should create one manifest", - 1, firstSnapshot.allManifests(table.io()).size()); + Assert.assertEquals( + "Should create one manifest", 1, firstSnapshot.allManifests(table.io()).size()); rightAfterSnapshot(); - table.newDelete() - .deleteFile(FILE_B) - .commit(); + table.newDelete().deleteFile(FILE_B).commit(); Snapshot secondSnapshot = table.currentSnapshot(); - Set secondSnapshotManifests = Sets.newHashSet(secondSnapshot.allManifests(table.io())); + Set secondSnapshotManifests = + Sets.newHashSet(secondSnapshot.allManifests(table.io())); secondSnapshotManifests.removeAll(firstSnapshot.allManifests(table.io())); - Assert.assertEquals("Should add one new manifest for append", 1, secondSnapshotManifests.size()); + Assert.assertEquals( + "Should add one new manifest for append", 1, secondSnapshotManifests.size()); - table.manageSnapshots() - .rollbackTo(firstSnapshot.snapshotId()) - .commit(); + table.manageSnapshots().rollbackTo(firstSnapshot.snapshotId()).commit(); long tAfterCommits = rightAfterSnapshot(secondSnapshot.snapshotId()); @@ -925,19 +970,29 @@ public void testExpireOlderThanWithRollback() { Set deletedFiles = Sets.newHashSet(); - ExpireSnapshots.Result result = SparkActions.get().expireSnapshots(table) - .expireOlderThan(tAfterCommits) - .deleteWith(deletedFiles::add) - .execute(); - - Assert.assertEquals("Expire should not change current snapshot", snapshotId, table.currentSnapshot().snapshotId()); - Assert.assertNotNull("Expire should keep the oldest snapshot, current", table.snapshot(firstSnapshot.snapshotId())); - Assert.assertNull("Expire should remove the orphaned snapshot", table.snapshot(secondSnapshot.snapshotId())); - - Assert.assertEquals("Should remove expired manifest lists and reverted appended data file", + ExpireSnapshots.Result result = + SparkActions.get() + .expireSnapshots(table) + .expireOlderThan(tAfterCommits) + .deleteWith(deletedFiles::add) + .execute(); + + Assert.assertEquals( + "Expire should not change current snapshot", + snapshotId, + table.currentSnapshot().snapshotId()); + Assert.assertNotNull( + "Expire should keep the oldest snapshot, current", + table.snapshot(firstSnapshot.snapshotId())); + Assert.assertNull( + "Expire should remove the orphaned snapshot", table.snapshot(secondSnapshot.snapshotId())); + + Assert.assertEquals( + "Should remove expired manifest lists and reverted appended data file", Sets.newHashSet( secondSnapshot.manifestListLocation(), // snapshot expired - Iterables.getOnlyElement(secondSnapshotManifests).path()), // manifest is no longer referenced + Iterables.getOnlyElement(secondSnapshotManifests) + .path()), // manifest is no longer referenced deletedFiles); checkExpirationResults(0, 0, 0, 1, 1, result); @@ -945,28 +1000,24 @@ public void testExpireOlderThanWithRollback() { @Test public void testExpireOlderThanWithRollbackAndMergedManifests() { - table.newAppend() - .appendFile(FILE_A) - .commit(); + 
table.newAppend().appendFile(FILE_A).commit(); Snapshot firstSnapshot = table.currentSnapshot(); - Assert.assertEquals("Should create one manifest", - 1, firstSnapshot.allManifests(table.io()).size()); + Assert.assertEquals( + "Should create one manifest", 1, firstSnapshot.allManifests(table.io()).size()); rightAfterSnapshot(); - table.newAppend() - .appendFile(FILE_B) - .commit(); + table.newAppend().appendFile(FILE_B).commit(); Snapshot secondSnapshot = table.currentSnapshot(); - Set secondSnapshotManifests = Sets.newHashSet(secondSnapshot.allManifests(table.io())); + Set secondSnapshotManifests = + Sets.newHashSet(secondSnapshot.allManifests(table.io())); secondSnapshotManifests.removeAll(firstSnapshot.allManifests(table.io())); - Assert.assertEquals("Should add one new manifest for append", 1, secondSnapshotManifests.size()); + Assert.assertEquals( + "Should add one new manifest for append", 1, secondSnapshotManifests.size()); - table.manageSnapshots() - .rollbackTo(firstSnapshot.snapshotId()) - .commit(); + table.manageSnapshots().rollbackTo(firstSnapshot.snapshotId()).commit(); long tAfterCommits = rightAfterSnapshot(secondSnapshot.snapshotId()); @@ -974,19 +1025,29 @@ public void testExpireOlderThanWithRollbackAndMergedManifests() { Set deletedFiles = Sets.newHashSet(); - ExpireSnapshots.Result result = SparkActions.get().expireSnapshots(table) - .expireOlderThan(tAfterCommits) - .deleteWith(deletedFiles::add) - .execute(); - - Assert.assertEquals("Expire should not change current snapshot", snapshotId, table.currentSnapshot().snapshotId()); - Assert.assertNotNull("Expire should keep the oldest snapshot, current", table.snapshot(firstSnapshot.snapshotId())); - Assert.assertNull("Expire should remove the orphaned snapshot", table.snapshot(secondSnapshot.snapshotId())); - - Assert.assertEquals("Should remove expired manifest lists and reverted appended data file", + ExpireSnapshots.Result result = + SparkActions.get() + .expireSnapshots(table) + .expireOlderThan(tAfterCommits) + .deleteWith(deletedFiles::add) + .execute(); + + Assert.assertEquals( + "Expire should not change current snapshot", + snapshotId, + table.currentSnapshot().snapshotId()); + Assert.assertNotNull( + "Expire should keep the oldest snapshot, current", + table.snapshot(firstSnapshot.snapshotId())); + Assert.assertNull( + "Expire should remove the orphaned snapshot", table.snapshot(secondSnapshot.snapshotId())); + + Assert.assertEquals( + "Should remove expired manifest lists and reverted appended data file", Sets.newHashSet( secondSnapshot.manifestListLocation(), // snapshot expired - Iterables.getOnlyElement(secondSnapshotManifests).path(), // manifest is no longer referenced + Iterables.getOnlyElement(secondSnapshotManifests) + .path(), // manifest is no longer referenced FILE_B.path()), // added, but rolled back deletedFiles); @@ -995,68 +1056,65 @@ public void testExpireOlderThanWithRollbackAndMergedManifests() { @Test public void testExpireOlderThanWithDeleteFile() { - table.updateProperties() + table + .updateProperties() .set(TableProperties.FORMAT_VERSION, "2") .set(TableProperties.MANIFEST_MERGE_ENABLED, "false") .commit(); // Add Data File - table.newAppend() - .appendFile(FILE_A) - .commit(); + table.newAppend().appendFile(FILE_A).commit(); Snapshot firstSnapshot = table.currentSnapshot(); // Add POS Delete - table.newRowDelta() - .addDeletes(FILE_A_POS_DELETES) - .commit(); + table.newRowDelta().addDeletes(FILE_A_POS_DELETES).commit(); Snapshot secondSnapshot = table.currentSnapshot(); // Add EQ Delete 
- table.newRowDelta() - .addDeletes(FILE_A_EQ_DELETES) - .commit(); + table.newRowDelta().addDeletes(FILE_A_EQ_DELETES).commit(); Snapshot thirdSnapshot = table.currentSnapshot(); // Move files to DELETED - table.newDelete() - .deleteFromRowFilter(Expressions.alwaysTrue()) - .commit(); + table.newDelete().deleteFromRowFilter(Expressions.alwaysTrue()).commit(); Snapshot fourthSnapshot = table.currentSnapshot(); long afterAllDeleted = rightAfterSnapshot(); - table.newAppend() - .appendFile(FILE_B) - .commit(); + table.newAppend().appendFile(FILE_B).commit(); Set deletedFiles = Sets.newHashSet(); - ExpireSnapshots.Result result = SparkActions.get().expireSnapshots(table) - .expireOlderThan(afterAllDeleted) - .deleteWith(deletedFiles::add) - .execute(); + ExpireSnapshots.Result result = + SparkActions.get() + .expireSnapshots(table) + .expireOlderThan(afterAllDeleted) + .deleteWith(deletedFiles::add) + .execute(); - Set expectedDeletes = Sets.newHashSet( - firstSnapshot.manifestListLocation(), - secondSnapshot.manifestListLocation(), - thirdSnapshot.manifestListLocation(), - fourthSnapshot.manifestListLocation(), - FILE_A.path().toString(), - FILE_A_POS_DELETES.path().toString(), - FILE_A_EQ_DELETES.path().toString()); + Set expectedDeletes = + Sets.newHashSet( + firstSnapshot.manifestListLocation(), + secondSnapshot.manifestListLocation(), + thirdSnapshot.manifestListLocation(), + fourthSnapshot.manifestListLocation(), + FILE_A.path().toString(), + FILE_A_POS_DELETES.path().toString(), + FILE_A_EQ_DELETES.path().toString()); expectedDeletes.addAll( thirdSnapshot.allManifests(table.io()).stream() .map(ManifestFile::path) - .map(CharSequence::toString).collect(Collectors.toSet())); + .map(CharSequence::toString) + .collect(Collectors.toSet())); // Delete operation (fourth snapshot) generates new manifest files expectedDeletes.addAll( fourthSnapshot.allManifests(table.io()).stream() .map(ManifestFile::path) - .map(CharSequence::toString).collect(Collectors.toSet())); + .map(CharSequence::toString) + .collect(Collectors.toSet())); - Assert.assertEquals("Should remove expired manifest lists and deleted data file", + Assert.assertEquals( + "Should remove expired manifest lists and deleted data file", expectedDeletes, deletedFiles); @@ -1068,27 +1126,25 @@ public void testExpireOnEmptyTable() { Set deletedFiles = Sets.newHashSet(); // table has no data, testing ExpireSnapshots should not fail with no snapshot - ExpireSnapshots.Result result = SparkActions.get().expireSnapshots(table) - .expireOlderThan(System.currentTimeMillis()) - .deleteWith(deletedFiles::add) - .execute(); + ExpireSnapshots.Result result = + SparkActions.get() + .expireSnapshots(table) + .expireOlderThan(System.currentTimeMillis()) + .deleteWith(deletedFiles::add) + .execute(); checkExpirationResults(0, 0, 0, 0, 0, result); } @Test public void testExpireAction() { - table.newAppend() - .appendFile(FILE_A) - .commit(); + table.newAppend().appendFile(FILE_A).commit(); Snapshot firstSnapshot = table.currentSnapshot(); rightAfterSnapshot(); - table.newAppend() - .appendFile(FILE_B) - .commit(); + table.newAppend().appendFile(FILE_B).commit(); long snapshotId = table.currentSnapshot().snapshotId(); @@ -1096,58 +1152,67 @@ public void testExpireAction() { Set deletedFiles = Sets.newHashSet(); - ExpireSnapshotsSparkAction action = SparkActions.get().expireSnapshots(table) - .expireOlderThan(tAfterCommits) - .deleteWith(deletedFiles::add); + ExpireSnapshotsSparkAction action = + SparkActions.get() + .expireSnapshots(table) + 
.expireOlderThan(tAfterCommits) + .deleteWith(deletedFiles::add); Dataset pendingDeletes = action.expire(); List pending = pendingDeletes.collectAsList(); - Assert.assertEquals("Should not change current snapshot", snapshotId, table.currentSnapshot().snapshotId()); - Assert.assertNull("Should remove the oldest snapshot", table.snapshot(firstSnapshot.snapshotId())); + Assert.assertEquals( + "Should not change current snapshot", snapshotId, table.currentSnapshot().snapshotId()); + Assert.assertNull( + "Should remove the oldest snapshot", table.snapshot(firstSnapshot.snapshotId())); Assert.assertEquals("Pending deletes should contain one row", 1, pending.size()); - Assert.assertEquals("Pending delete should be the expired manifest list location", - firstSnapshot.manifestListLocation(), pending.get(0).getString(0)); - Assert.assertEquals("Pending delete should be a manifest list", - "Manifest List", pending.get(0).getString(1)); + Assert.assertEquals( + "Pending delete should be the expired manifest list location", + firstSnapshot.manifestListLocation(), + pending.get(0).getString(0)); + Assert.assertEquals( + "Pending delete should be a manifest list", "Manifest List", pending.get(0).getString(1)); Assert.assertEquals("Should not delete any files", 0, deletedFiles.size()); - Assert.assertSame("Multiple calls to expire should return the same deleted files", - pendingDeletes, action.expire()); + Assert.assertSame( + "Multiple calls to expire should return the same deleted files", + pendingDeletes, + action.expire()); } @Test public void testUseLocalIterator() { - table.newFastAppend() - .appendFile(FILE_A) - .commit(); + table.newFastAppend().appendFile(FILE_A).commit(); - table.newOverwrite() - .deleteFile(FILE_A) - .addFile(FILE_B) - .commit(); + table.newOverwrite().deleteFile(FILE_A).addFile(FILE_B).commit(); - table.newFastAppend() - .appendFile(FILE_C) - .commit(); + table.newFastAppend().appendFile(FILE_C).commit(); long end = rightAfterSnapshot(); int jobsBeforeStreamResults = spark.sparkContext().dagScheduler().nextJobId().get(); - withSQLConf(ImmutableMap.of("spark.sql.adaptive.enabled", "false"), () -> { - ExpireSnapshots.Result results = SparkActions.get().expireSnapshots(table).expireOlderThan(end) - .option("stream-results", "true").execute(); - - int jobsAfterStreamResults = spark.sparkContext().dagScheduler().nextJobId().get(); - int jobsRunDuringStreamResults = jobsAfterStreamResults - jobsBeforeStreamResults; - - checkExpirationResults(1L, 0L, 0L, 1L, 2L, results); - - Assert.assertEquals("Expected total number of jobs with stream-results should match the expected number", - 4L, jobsRunDuringStreamResults); - }); + withSQLConf( + ImmutableMap.of("spark.sql.adaptive.enabled", "false"), + () -> { + ExpireSnapshots.Result results = + SparkActions.get() + .expireSnapshots(table) + .expireOlderThan(end) + .option("stream-results", "true") + .execute(); + + int jobsAfterStreamResults = spark.sparkContext().dagScheduler().nextJobId().get(); + int jobsRunDuringStreamResults = jobsAfterStreamResults - jobsBeforeStreamResults; + + checkExpirationResults(1L, 0L, 0L, 1L, 2L, results); + + Assert.assertEquals( + "Expected total number of jobs with stream-results should match the expected number", + 4L, + jobsRunDuringStreamResults); + }); } } diff --git a/spark/v3.3/spark/src/test/java/org/apache/iceberg/spark/actions/TestRemoveOrphanFilesAction.java b/spark/v3.3/spark/src/test/java/org/apache/iceberg/spark/actions/TestRemoveOrphanFilesAction.java index 1d9e4793974c..cda48980ba52 100644 
--- a/spark/v3.3/spark/src/test/java/org/apache/iceberg/spark/actions/TestRemoveOrphanFilesAction.java +++ b/spark/v3.3/spark/src/test/java/org/apache/iceberg/spark/actions/TestRemoveOrphanFilesAction.java @@ -16,9 +16,10 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.actions; +import static org.apache.iceberg.types.Types.NestedField.optional; + import java.io.File; import java.io.IOException; import java.sql.Timestamp; @@ -69,23 +70,18 @@ import org.junit.Test; import org.junit.rules.TemporaryFolder; -import static org.apache.iceberg.types.Types.NestedField.optional; - public abstract class TestRemoveOrphanFilesAction extends SparkTestBase { private static final HadoopTables TABLES = new HadoopTables(new Configuration()); - protected static final Schema SCHEMA = new Schema( - optional(1, "c1", Types.IntegerType.get()), - optional(2, "c2", Types.StringType.get()), - optional(3, "c3", Types.StringType.get()) - ); - protected static final PartitionSpec SPEC = PartitionSpec.builderFor(SCHEMA) - .truncate("c2", 2) - .identity("c3") - .build(); - - @Rule - public TemporaryFolder temp = new TemporaryFolder(); + protected static final Schema SCHEMA = + new Schema( + optional(1, "c1", Types.IntegerType.get()), + optional(2, "c2", Types.StringType.get()), + optional(3, "c3", Types.StringType.get())); + protected static final PartitionSpec SPEC = + PartitionSpec.builderFor(SCHEMA).truncate("c2", 2).identity("c3").build(); + + @Rule public TemporaryFolder temp = new TemporaryFolder(); private File tableDir = null; protected String tableLocation = null; @@ -97,41 +93,37 @@ public void setupTableLocation() throws Exception { @Test public void testDryRun() throws IOException, InterruptedException { - Table table = TABLES.create(SCHEMA, PartitionSpec.unpartitioned(), Maps.newHashMap(), tableLocation); + Table table = + TABLES.create(SCHEMA, PartitionSpec.unpartitioned(), Maps.newHashMap(), tableLocation); - List records = Lists.newArrayList( - new ThreeColumnRecord(1, "AAAAAAAAAA", "AAAA") - ); + List records = + Lists.newArrayList(new ThreeColumnRecord(1, "AAAAAAAAAA", "AAAA")); Dataset df = spark.createDataFrame(records, ThreeColumnRecord.class).coalesce(1); - df.select("c1", "c2", "c3") - .write() - .format("iceberg") - .mode("append") - .save(tableLocation); + df.select("c1", "c2", "c3").write().format("iceberg").mode("append").save(tableLocation); - df.select("c1", "c2", "c3") - .write() - .format("iceberg") - .mode("append") - .save(tableLocation); + df.select("c1", "c2", "c3").write().format("iceberg").mode("append").save(tableLocation); - List validFiles = spark.read().format("iceberg") - .load(tableLocation + "#files") - .select("file_path") - .as(Encoders.STRING()) - .collectAsList(); + List validFiles = + spark + .read() + .format("iceberg") + .load(tableLocation + "#files") + .select("file_path") + .as(Encoders.STRING()) + .collectAsList(); Assert.assertEquals("Should be 2 valid files", 2, validFiles.size()); df.write().mode("append").parquet(tableLocation + "/data"); Path dataPath = new Path(tableLocation + "/data"); FileSystem fs = dataPath.getFileSystem(spark.sessionState().newHadoopConf()); - List allFiles = Arrays.stream(fs.listStatus(dataPath, HiddenPathFilter.get())) - .filter(FileStatus::isFile) - .map(file -> file.getPath().toString()) - .collect(Collectors.toList()); + List allFiles = + Arrays.stream(fs.listStatus(dataPath, HiddenPathFilter.get())) + .filter(FileStatus::isFile) + .map(file -> 
file.getPath().toString()) + .collect(Collectors.toList()); Assert.assertEquals("Should be 3 files", 3, allFiles.size()); List invalidFiles = Lists.newArrayList(allFiles); @@ -142,32 +134,34 @@ public void testDryRun() throws IOException, InterruptedException { SparkActions actions = SparkActions.get(); - DeleteOrphanFiles.Result result1 = actions.deleteOrphanFiles(table) - .deleteWith(s -> { }) - .execute(); - Assert.assertTrue("Default olderThan interval should be safe", Iterables.isEmpty(result1.orphanFileLocations())); + DeleteOrphanFiles.Result result1 = + actions.deleteOrphanFiles(table).deleteWith(s -> {}).execute(); + Assert.assertTrue( + "Default olderThan interval should be safe", + Iterables.isEmpty(result1.orphanFileLocations())); - DeleteOrphanFiles.Result result2 = actions.deleteOrphanFiles(table) - .olderThan(System.currentTimeMillis()) - .deleteWith(s -> { }) - .execute(); + DeleteOrphanFiles.Result result2 = + actions + .deleteOrphanFiles(table) + .olderThan(System.currentTimeMillis()) + .deleteWith(s -> {}) + .execute(); Assert.assertEquals("Action should find 1 file", invalidFiles, result2.orphanFileLocations()); Assert.assertTrue("Invalid file should be present", fs.exists(new Path(invalidFiles.get(0)))); - DeleteOrphanFiles.Result result3 = actions.deleteOrphanFiles(table) - .olderThan(System.currentTimeMillis()) - .execute(); + DeleteOrphanFiles.Result result3 = + actions.deleteOrphanFiles(table).olderThan(System.currentTimeMillis()).execute(); Assert.assertEquals("Action should delete 1 file", invalidFiles, result3.orphanFileLocations()); - Assert.assertFalse("Invalid file should not be present", fs.exists(new Path(invalidFiles.get(0)))); + Assert.assertFalse( + "Invalid file should not be present", fs.exists(new Path(invalidFiles.get(0)))); List expectedRecords = Lists.newArrayList(); expectedRecords.addAll(records); expectedRecords.addAll(records); Dataset resultDF = spark.read().format("iceberg").load(tableLocation); - List actualRecords = resultDF - .as(Encoders.bean(ThreeColumnRecord.class)) - .collectAsList(); + List actualRecords = + resultDF.as(Encoders.bean(ThreeColumnRecord.class)).collectAsList(); Assert.assertEquals("Rows must match", expectedRecords, actualRecords); } @@ -175,36 +169,22 @@ public void testDryRun() throws IOException, InterruptedException { public void testAllValidFilesAreKept() throws IOException, InterruptedException { Table table = TABLES.create(SCHEMA, SPEC, Maps.newHashMap(), tableLocation); - List records1 = Lists.newArrayList( - new ThreeColumnRecord(1, "AAAAAAAAAA", "AAAA") - ); + List records1 = + Lists.newArrayList(new ThreeColumnRecord(1, "AAAAAAAAAA", "AAAA")); Dataset df1 = spark.createDataFrame(records1, ThreeColumnRecord.class).coalesce(1); // original append - df1.select("c1", "c2", "c3") - .write() - .format("iceberg") - .mode("append") - .save(tableLocation); + df1.select("c1", "c2", "c3").write().format("iceberg").mode("append").save(tableLocation); - List records2 = Lists.newArrayList( - new ThreeColumnRecord(2, "AAAAAAAAAA", "AAAA") - ); + List records2 = + Lists.newArrayList(new ThreeColumnRecord(2, "AAAAAAAAAA", "AAAA")); Dataset df2 = spark.createDataFrame(records2, ThreeColumnRecord.class).coalesce(1); // dynamic partition overwrite - df2.select("c1", "c2", "c3") - .write() - .format("iceberg") - .mode("overwrite") - .save(tableLocation); + df2.select("c1", "c2", "c3").write().format("iceberg").mode("overwrite").save(tableLocation); // second append - df2.select("c1", "c2", "c3") - .write() - .format("iceberg") - 
.mode("append") - .save(tableLocation); + df2.select("c1", "c2", "c3").write().format("iceberg").mode("append").save(tableLocation); List snapshots = Lists.newArrayList(table.snapshots()); @@ -226,9 +206,8 @@ public void testAllValidFilesAreKept() throws IOException, InterruptedException SparkActions actions = SparkActions.get(); - DeleteOrphanFiles.Result result = actions.deleteOrphanFiles(table) - .olderThan(System.currentTimeMillis()) - .execute(); + DeleteOrphanFiles.Result result = + actions.deleteOrphanFiles(table).olderThan(System.currentTimeMillis()).execute(); Assert.assertEquals("Should delete 4 files", 4, Iterables.size(result.orphanFileLocations())); @@ -252,36 +231,22 @@ public void testAllValidFilesAreKept() throws IOException, InterruptedException public void orphanedFileRemovedWithParallelTasks() throws InterruptedException, IOException { Table table = TABLES.create(SCHEMA, SPEC, Maps.newHashMap(), tableLocation); - List records1 = Lists.newArrayList( - new ThreeColumnRecord(1, "AAAAAAAAAA", "AAAA") - ); + List records1 = + Lists.newArrayList(new ThreeColumnRecord(1, "AAAAAAAAAA", "AAAA")); Dataset df1 = spark.createDataFrame(records1, ThreeColumnRecord.class).coalesce(1); // original append - df1.select("c1", "c2", "c3") - .write() - .format("iceberg") - .mode("append") - .save(tableLocation); + df1.select("c1", "c2", "c3").write().format("iceberg").mode("append").save(tableLocation); - List records2 = Lists.newArrayList( - new ThreeColumnRecord(2, "AAAAAAAAAA", "AAAA") - ); + List records2 = + Lists.newArrayList(new ThreeColumnRecord(2, "AAAAAAAAAA", "AAAA")); Dataset df2 = spark.createDataFrame(records2, ThreeColumnRecord.class).coalesce(1); // dynamic partition overwrite - df2.select("c1", "c2", "c3") - .write() - .format("iceberg") - .mode("overwrite") - .save(tableLocation); + df2.select("c1", "c2", "c3").write().format("iceberg").mode("overwrite").save(tableLocation); // second append - df2.select("c1", "c2", "c3") - .write() - .format("iceberg") - .mode("append") - .save(tableLocation); + df2.select("c1", "c2", "c3").write().format("iceberg").mode("append").save(tableLocation); df2.coalesce(1).write().mode("append").parquet(tableLocation + "/data"); df2.coalesce(1).write().mode("append").parquet(tableLocation + "/data/c2_trunc=AA"); @@ -294,25 +259,34 @@ public void orphanedFileRemovedWithParallelTasks() throws InterruptedException, Set deleteThreads = ConcurrentHashMap.newKeySet(); AtomicInteger deleteThreadsIndex = new AtomicInteger(0); - ExecutorService executorService = Executors.newFixedThreadPool(4, runnable -> { - Thread thread = new Thread(runnable); - thread.setName("remove-orphan-" + deleteThreadsIndex.getAndIncrement()); - thread.setDaemon(true); - return thread; - }); - - DeleteOrphanFiles.Result result = SparkActions.get().deleteOrphanFiles(table) + ExecutorService executorService = + Executors.newFixedThreadPool( + 4, + runnable -> { + Thread thread = new Thread(runnable); + thread.setName("remove-orphan-" + deleteThreadsIndex.getAndIncrement()); + thread.setDaemon(true); + return thread; + }); + + DeleteOrphanFiles.Result result = + SparkActions.get() + .deleteOrphanFiles(table) .executeDeleteWith(executorService) - .olderThan(System.currentTimeMillis() + 5000) // Ensure all orphan files are selected - .deleteWith(file -> { - deleteThreads.add(Thread.currentThread().getName()); - deletedFiles.add(file); - }) + .olderThan(System.currentTimeMillis() + 5000) // Ensure all orphan files are selected + .deleteWith( + file -> { + 
deleteThreads.add(Thread.currentThread().getName()); + deletedFiles.add(file); + }) .execute(); - // Verifies that the delete methods ran in the threads created by the provided ExecutorService ThreadFactory - Assert.assertEquals(deleteThreads, - Sets.newHashSet("remove-orphan-0", "remove-orphan-1", "remove-orphan-2", "remove-orphan-3")); + // Verifies that the delete methods ran in the threads created by the provided ExecutorService + // ThreadFactory + Assert.assertEquals( + deleteThreads, + Sets.newHashSet( + "remove-orphan-0", "remove-orphan-1", "remove-orphan-2", "remove-orphan-3")); Assert.assertEquals("Should delete 4 files", 4, deletedFiles.size()); } @@ -323,42 +297,32 @@ public void testWapFilesAreKept() throws InterruptedException { props.put(TableProperties.WRITE_AUDIT_PUBLISH_ENABLED, "true"); Table table = TABLES.create(SCHEMA, SPEC, props, tableLocation); - List records = Lists.newArrayList( - new ThreeColumnRecord(1, "AAAAAAAAAA", "AAAA") - ); + List records = + Lists.newArrayList(new ThreeColumnRecord(1, "AAAAAAAAAA", "AAAA")); Dataset df = spark.createDataFrame(records, ThreeColumnRecord.class); // normal write - df.select("c1", "c2", "c3") - .write() - .format("iceberg") - .mode("append") - .save(tableLocation); + df.select("c1", "c2", "c3").write().format("iceberg").mode("append").save(tableLocation); spark.conf().set("spark.wap.id", "1"); // wap write - df.select("c1", "c2", "c3") - .write() - .format("iceberg") - .mode("append") - .save(tableLocation); + df.select("c1", "c2", "c3").write().format("iceberg").mode("append").save(tableLocation); Dataset resultDF = spark.read().format("iceberg").load(tableLocation); - List actualRecords = resultDF - .as(Encoders.bean(ThreeColumnRecord.class)) - .collectAsList(); + List actualRecords = + resultDF.as(Encoders.bean(ThreeColumnRecord.class)).collectAsList(); Assert.assertEquals("Should not return data from the staged snapshot", records, actualRecords); waitUntilAfter(System.currentTimeMillis()); SparkActions actions = SparkActions.get(); - DeleteOrphanFiles.Result result = actions.deleteOrphanFiles(table) - .olderThan(System.currentTimeMillis()) - .execute(); + DeleteOrphanFiles.Result result = + actions.deleteOrphanFiles(table).olderThan(System.currentTimeMillis()).execute(); - Assert.assertTrue("Should not delete any files", Iterables.isEmpty(result.orphanFileLocations())); + Assert.assertTrue( + "Should not delete any files", Iterables.isEmpty(result.orphanFileLocations())); } @Test @@ -368,16 +332,11 @@ public void testMetadataFolderIsIntact() throws InterruptedException { props.put(TableProperties.WRITE_DATA_LOCATION, tableLocation); Table table = TABLES.create(SCHEMA, SPEC, props, tableLocation); - List records = Lists.newArrayList( - new ThreeColumnRecord(1, "AAAAAAAAAA", "AAAA") - ); + List records = + Lists.newArrayList(new ThreeColumnRecord(1, "AAAAAAAAAA", "AAAA")); Dataset df = spark.createDataFrame(records, ThreeColumnRecord.class).coalesce(1); - df.select("c1", "c2", "c3") - .write() - .format("iceberg") - .mode("append") - .save(tableLocation); + df.select("c1", "c2", "c3").write().format("iceberg").mode("append").save(tableLocation); df.write().mode("append").parquet(tableLocation + "/c2_trunc=AA/c3=AAAA"); @@ -385,16 +344,14 @@ public void testMetadataFolderIsIntact() throws InterruptedException { SparkActions actions = SparkActions.get(); - DeleteOrphanFiles.Result result = actions.deleteOrphanFiles(table) - .olderThan(System.currentTimeMillis()) - .execute(); + DeleteOrphanFiles.Result result = + 
actions.deleteOrphanFiles(table).olderThan(System.currentTimeMillis()).execute(); Assert.assertEquals("Should delete 1 file", 1, Iterables.size(result.orphanFileLocations())); Dataset resultDF = spark.read().format("iceberg").load(tableLocation); - List actualRecords = resultDF - .as(Encoders.bean(ThreeColumnRecord.class)) - .collectAsList(); + List actualRecords = + resultDF.as(Encoders.bean(ThreeColumnRecord.class)).collectAsList(); Assert.assertEquals("Rows must match", records, actualRecords); } @@ -402,16 +359,11 @@ public void testMetadataFolderIsIntact() throws InterruptedException { public void testOlderThanTimestamp() throws InterruptedException { Table table = TABLES.create(SCHEMA, SPEC, Maps.newHashMap(), tableLocation); - List records = Lists.newArrayList( - new ThreeColumnRecord(1, "AAAAAAAAAA", "AAAA") - ); + List records = + Lists.newArrayList(new ThreeColumnRecord(1, "AAAAAAAAAA", "AAAA")); Dataset df = spark.createDataFrame(records, ThreeColumnRecord.class).coalesce(1); - df.select("c1", "c2", "c3") - .write() - .format("iceberg") - .mode("append") - .save(tableLocation); + df.select("c1", "c2", "c3").write().format("iceberg").mode("append").save(tableLocation); df.write().mode("append").parquet(tableLocation + "/data/c2_trunc=AA/c3=AAAA"); df.write().mode("append").parquet(tableLocation + "/data/c2_trunc=AA/c3=AAAA"); @@ -426,11 +378,11 @@ public void testOlderThanTimestamp() throws InterruptedException { SparkActions actions = SparkActions.get(); - DeleteOrphanFiles.Result result = actions.deleteOrphanFiles(table) - .olderThan(timestamp) - .execute(); + DeleteOrphanFiles.Result result = + actions.deleteOrphanFiles(table).olderThan(timestamp).execute(); - Assert.assertEquals("Should delete only 2 files", 2, Iterables.size(result.orphanFileLocations())); + Assert.assertEquals( + "Should delete only 2 files", 2, Iterables.size(result.orphanFileLocations())); } @Test @@ -440,33 +392,25 @@ public void testRemoveUnreachableMetadataVersionFiles() throws InterruptedExcept props.put(TableProperties.METADATA_PREVIOUS_VERSIONS_MAX, "1"); Table table = TABLES.create(SCHEMA, SPEC, props, tableLocation); - List records = Lists.newArrayList( - new ThreeColumnRecord(1, "AAAAAAAAAA", "AAAA") - ); + List records = + Lists.newArrayList(new ThreeColumnRecord(1, "AAAAAAAAAA", "AAAA")); Dataset df = spark.createDataFrame(records, ThreeColumnRecord.class); - df.select("c1", "c2", "c3") - .write() - .format("iceberg") - .mode("append") - .save(tableLocation); + df.select("c1", "c2", "c3").write().format("iceberg").mode("append").save(tableLocation); - df.select("c1", "c2", "c3") - .write() - .format("iceberg") - .mode("append") - .save(tableLocation); + df.select("c1", "c2", "c3").write().format("iceberg").mode("append").save(tableLocation); waitUntilAfter(System.currentTimeMillis()); SparkActions actions = SparkActions.get(); - DeleteOrphanFiles.Result result = actions.deleteOrphanFiles(table) - .olderThan(System.currentTimeMillis()) - .execute(); + DeleteOrphanFiles.Result result = + actions.deleteOrphanFiles(table).olderThan(System.currentTimeMillis()).execute(); Assert.assertEquals("Should delete 1 file", 1, Iterables.size(result.orphanFileLocations())); - Assert.assertTrue("Should remove v1 file", StreamSupport.stream(result.orphanFileLocations().spliterator(), false) + Assert.assertTrue( + "Should remove v1 file", + StreamSupport.stream(result.orphanFileLocations().spliterator(), false) .anyMatch(file -> file.contains("v1.metadata.json"))); List expectedRecords = Lists.newArrayList(); @@ 
-474,9 +418,8 @@ public void testRemoveUnreachableMetadataVersionFiles() throws InterruptedExcept expectedRecords.addAll(records); Dataset resultDF = spark.read().format("iceberg").load(tableLocation); - List actualRecords = resultDF - .as(Encoders.bean(ThreeColumnRecord.class)) - .collectAsList(); + List actualRecords = + resultDF.as(Encoders.bean(ThreeColumnRecord.class)).collectAsList(); Assert.assertEquals("Rows must match", expectedRecords, actualRecords); } @@ -491,26 +434,21 @@ public void testManyTopLevelPartitions() throws InterruptedException { Dataset df = spark.createDataFrame(records, ThreeColumnRecord.class); - df.select("c1", "c2", "c3") - .write() - .format("iceberg") - .mode("append") - .save(tableLocation); + df.select("c1", "c2", "c3").write().format("iceberg").mode("append").save(tableLocation); waitUntilAfter(System.currentTimeMillis()); SparkActions actions = SparkActions.get(); - DeleteOrphanFiles.Result result = actions.deleteOrphanFiles(table) - .olderThan(System.currentTimeMillis()) - .execute(); + DeleteOrphanFiles.Result result = + actions.deleteOrphanFiles(table).olderThan(System.currentTimeMillis()).execute(); - Assert.assertTrue("Should not delete any files", Iterables.isEmpty(result.orphanFileLocations())); + Assert.assertTrue( + "Should not delete any files", Iterables.isEmpty(result.orphanFileLocations())); Dataset resultDF = spark.read().format("iceberg").load(tableLocation); - List actualRecords = resultDF - .as(Encoders.bean(ThreeColumnRecord.class)) - .collectAsList(); + List actualRecords = + resultDF.as(Encoders.bean(ThreeColumnRecord.class)).collectAsList(); Assert.assertEquals("Rows must match", records, actualRecords); } @@ -525,56 +463,43 @@ public void testManyLeafPartitions() throws InterruptedException { Dataset df = spark.createDataFrame(records, ThreeColumnRecord.class); - df.select("c1", "c2", "c3") - .write() - .format("iceberg") - .mode("append") - .save(tableLocation); + df.select("c1", "c2", "c3").write().format("iceberg").mode("append").save(tableLocation); waitUntilAfter(System.currentTimeMillis()); SparkActions actions = SparkActions.get(); - DeleteOrphanFiles.Result result = actions.deleteOrphanFiles(table) - .olderThan(System.currentTimeMillis()) - .execute(); + DeleteOrphanFiles.Result result = + actions.deleteOrphanFiles(table).olderThan(System.currentTimeMillis()).execute(); - Assert.assertTrue("Should not delete any files", Iterables.isEmpty(result.orphanFileLocations())); + Assert.assertTrue( + "Should not delete any files", Iterables.isEmpty(result.orphanFileLocations())); Dataset resultDF = spark.read().format("iceberg").load(tableLocation); - List actualRecords = resultDF - .as(Encoders.bean(ThreeColumnRecord.class)) - .collectAsList(); + List actualRecords = + resultDF.as(Encoders.bean(ThreeColumnRecord.class)).collectAsList(); Assert.assertEquals("Rows must match", records, actualRecords); } @Test public void testHiddenPartitionPaths() throws InterruptedException { - Schema schema = new Schema( - optional(1, "c1", Types.IntegerType.get()), - optional(2, "_c2", Types.StringType.get()), - optional(3, "c3", Types.StringType.get()) - ); - PartitionSpec spec = PartitionSpec.builderFor(schema) - .truncate("_c2", 2) - .identity("c3") - .build(); + Schema schema = + new Schema( + optional(1, "c1", Types.IntegerType.get()), + optional(2, "_c2", Types.StringType.get()), + optional(3, "c3", Types.StringType.get())); + PartitionSpec spec = PartitionSpec.builderFor(schema).truncate("_c2", 2).identity("c3").build(); Table table = 
TABLES.create(schema, spec, Maps.newHashMap(), tableLocation); - StructType structType = new StructType() - .add("c1", DataTypes.IntegerType) - .add("_c2", DataTypes.StringType) - .add("c3", DataTypes.StringType); - List records = Lists.newArrayList( - RowFactory.create(1, "AAAAAAAAAA", "AAAA") - ); + StructType structType = + new StructType() + .add("c1", DataTypes.IntegerType) + .add("_c2", DataTypes.StringType) + .add("c3", DataTypes.StringType); + List records = Lists.newArrayList(RowFactory.create(1, "AAAAAAAAAA", "AAAA")); Dataset df = spark.createDataFrame(records, structType).coalesce(1); - df.select("c1", "_c2", "c3") - .write() - .format("iceberg") - .mode("append") - .save(tableLocation); + df.select("c1", "_c2", "c3").write().format("iceberg").mode("append").save(tableLocation); df.write().mode("append").parquet(tableLocation + "/data/_c2_trunc=AA/c3=AAAA"); df.write().mode("append").parquet(tableLocation + "/data/_c2_trunc=AA/c3=AAAA"); @@ -583,45 +508,35 @@ public void testHiddenPartitionPaths() throws InterruptedException { SparkActions actions = SparkActions.get(); - DeleteOrphanFiles.Result result = actions.deleteOrphanFiles(table) - .olderThan(System.currentTimeMillis()) - .execute(); + DeleteOrphanFiles.Result result = + actions.deleteOrphanFiles(table).olderThan(System.currentTimeMillis()).execute(); Assert.assertEquals("Should delete 2 files", 2, Iterables.size(result.orphanFileLocations())); } @Test public void testHiddenPartitionPathsWithPartitionEvolution() throws InterruptedException { - Schema schema = new Schema( - optional(1, "_c1", Types.IntegerType.get()), - optional(2, "_c2", Types.StringType.get()), - optional(3, "c3", Types.StringType.get()) - ); - PartitionSpec spec = PartitionSpec.builderFor(schema) - .truncate("_c2", 2) - .build(); + Schema schema = + new Schema( + optional(1, "_c1", Types.IntegerType.get()), + optional(2, "_c2", Types.StringType.get()), + optional(3, "c3", Types.StringType.get())); + PartitionSpec spec = PartitionSpec.builderFor(schema).truncate("_c2", 2).build(); Table table = TABLES.create(schema, spec, Maps.newHashMap(), tableLocation); - StructType structType = new StructType() - .add("_c1", DataTypes.IntegerType) - .add("_c2", DataTypes.StringType) - .add("c3", DataTypes.StringType); - List records = Lists.newArrayList( - RowFactory.create(1, "AAAAAAAAAA", "AAAA") - ); + StructType structType = + new StructType() + .add("_c1", DataTypes.IntegerType) + .add("_c2", DataTypes.StringType) + .add("c3", DataTypes.StringType); + List records = Lists.newArrayList(RowFactory.create(1, "AAAAAAAAAA", "AAAA")); Dataset df = spark.createDataFrame(records, structType).coalesce(1); - df.select("_c1", "_c2", "c3") - .write() - .format("iceberg") - .mode("append") - .save(tableLocation); + df.select("_c1", "_c2", "c3").write().format("iceberg").mode("append").save(tableLocation); df.write().mode("append").parquet(tableLocation + "/data/_c2_trunc=AA"); - table.updateSpec() - .addField("_c1") - .commit(); + table.updateSpec().addField("_c1").commit(); df.write().mode("append").parquet(tableLocation + "/data/_c2_trunc=AA/_c1=1"); @@ -629,40 +544,32 @@ public void testHiddenPartitionPathsWithPartitionEvolution() throws InterruptedE SparkActions actions = SparkActions.get(); - DeleteOrphanFiles.Result result = actions.deleteOrphanFiles(table) - .olderThan(System.currentTimeMillis()) - .execute(); + DeleteOrphanFiles.Result result = + actions.deleteOrphanFiles(table).olderThan(System.currentTimeMillis()).execute(); Assert.assertEquals("Should delete 2 
files", 2, Iterables.size(result.orphanFileLocations())); } @Test - public void testHiddenPathsStartingWithPartitionNamesAreIgnored() throws InterruptedException, IOException { - Schema schema = new Schema( - optional(1, "c1", Types.IntegerType.get()), - optional(2, "_c2", Types.StringType.get()), - optional(3, "c3", Types.StringType.get()) - ); - PartitionSpec spec = PartitionSpec.builderFor(schema) - .truncate("_c2", 2) - .identity("c3") - .build(); + public void testHiddenPathsStartingWithPartitionNamesAreIgnored() + throws InterruptedException, IOException { + Schema schema = + new Schema( + optional(1, "c1", Types.IntegerType.get()), + optional(2, "_c2", Types.StringType.get()), + optional(3, "c3", Types.StringType.get())); + PartitionSpec spec = PartitionSpec.builderFor(schema).truncate("_c2", 2).identity("c3").build(); Table table = TABLES.create(schema, spec, Maps.newHashMap(), tableLocation); - StructType structType = new StructType() - .add("c1", DataTypes.IntegerType) - .add("_c2", DataTypes.StringType) - .add("c3", DataTypes.StringType); - List records = Lists.newArrayList( - RowFactory.create(1, "AAAAAAAAAA", "AAAA") - ); + StructType structType = + new StructType() + .add("c1", DataTypes.IntegerType) + .add("_c2", DataTypes.StringType) + .add("c3", DataTypes.StringType); + List records = Lists.newArrayList(RowFactory.create(1, "AAAAAAAAAA", "AAAA")); Dataset df = spark.createDataFrame(records, structType).coalesce(1); - df.select("c1", "_c2", "c3") - .write() - .format("iceberg") - .mode("append") - .save(tableLocation); + df.select("c1", "_c2", "c3").write().format("iceberg").mode("append").save(tableLocation); Path dataPath = new Path(tableLocation + "/data"); FileSystem fs = dataPath.getFileSystem(spark.sessionState().newHadoopConf()); @@ -673,16 +580,17 @@ public void testHiddenPathsStartingWithPartitionNamesAreIgnored() throws Interru SparkActions actions = SparkActions.get(); - DeleteOrphanFiles.Result result = actions.deleteOrphanFiles(table) - .olderThan(System.currentTimeMillis()) - .execute(); + DeleteOrphanFiles.Result result = + actions.deleteOrphanFiles(table).olderThan(System.currentTimeMillis()).execute(); Assert.assertEquals("Should delete 0 files", 0, Iterables.size(result.orphanFileLocations())); Assert.assertTrue(fs.exists(pathToFileInHiddenFolder)); } private List snapshotFiles(long snapshotId) { - return spark.read().format("iceberg") + return spark + .read() + .format("iceberg") .option("snapshot-id", snapshotId) .load(tableLocation + "#files") .select("file_path") @@ -692,11 +600,12 @@ private List snapshotFiles(long snapshotId) { @Test public void testRemoveOrphanFilesWithRelativeFilePath() throws IOException, InterruptedException { - Table table = TABLES.create(SCHEMA, PartitionSpec.unpartitioned(), Maps.newHashMap(), tableDir.getAbsolutePath()); + Table table = + TABLES.create( + SCHEMA, PartitionSpec.unpartitioned(), Maps.newHashMap(), tableDir.getAbsolutePath()); - List records = Lists.newArrayList( - new ThreeColumnRecord(1, "AAAAAAAAAA", "AAAA") - ); + List records = + Lists.newArrayList(new ThreeColumnRecord(1, "AAAAAAAAAA", "AAAA")); Dataset df = spark.createDataFrame(records, ThreeColumnRecord.class).coalesce(1); @@ -706,11 +615,14 @@ public void testRemoveOrphanFilesWithRelativeFilePath() throws IOException, Inte .mode("append") .save(tableDir.getAbsolutePath()); - List validFiles = spark.read().format("iceberg") - .load(tableLocation + "#files") - .select("file_path") - .as(Encoders.STRING()) - .collectAsList(); + List validFiles = + spark + 
.read() + .format("iceberg") + .load(tableLocation + "#files") + .select("file_path") + .as(Encoders.STRING()) + .collectAsList(); Assert.assertEquals("Should be 1 valid files", 1, validFiles.size()); String validFile = validFiles.get(0); @@ -718,10 +630,11 @@ public void testRemoveOrphanFilesWithRelativeFilePath() throws IOException, Inte Path dataPath = new Path(tableLocation + "/data"); FileSystem fs = dataPath.getFileSystem(spark.sessionState().newHadoopConf()); - List allFiles = Arrays.stream(fs.listStatus(dataPath, HiddenPathFilter.get())) - .filter(FileStatus::isFile) - .map(file -> file.getPath().toString()) - .collect(Collectors.toList()); + List allFiles = + Arrays.stream(fs.listStatus(dataPath, HiddenPathFilter.get())) + .filter(FileStatus::isFile) + .map(file -> file.getPath().toString()) + .collect(Collectors.toList()); Assert.assertEquals("Should be 2 files", 2, allFiles.size()); List invalidFiles = Lists.newArrayList(allFiles); @@ -731,10 +644,12 @@ public void testRemoveOrphanFilesWithRelativeFilePath() throws IOException, Inte waitUntilAfter(System.currentTimeMillis()); SparkActions actions = SparkActions.get(); - DeleteOrphanFiles.Result result = actions.deleteOrphanFiles(table) - .olderThan(System.currentTimeMillis()) - .deleteWith(s -> { }) - .execute(); + DeleteOrphanFiles.Result result = + actions + .deleteOrphanFiles(table) + .olderThan(System.currentTimeMillis()) + .deleteWith(s -> {}) + .execute(); Assert.assertEquals("Action should find 1 file", invalidFiles, result.orphanFileLocations()); Assert.assertTrue("Invalid file should be present", fs.exists(new Path(invalidFiles.get(0)))); } @@ -747,18 +662,15 @@ public void testRemoveOrphanFilesWithHadoopCatalog() throws InterruptedException Namespace namespace = Namespace.of(namespaceName); TableIdentifier tableIdentifier = TableIdentifier.of(namespace, tableName); - Table table = catalog.createTable(tableIdentifier, SCHEMA, PartitionSpec.unpartitioned(), Maps.newHashMap()); + Table table = + catalog.createTable( + tableIdentifier, SCHEMA, PartitionSpec.unpartitioned(), Maps.newHashMap()); - List records = Lists.newArrayList( - new ThreeColumnRecord(1, "AAAAAAAAAA", "AAAA") - ); + List records = + Lists.newArrayList(new ThreeColumnRecord(1, "AAAAAAAAAA", "AAAA")); Dataset df = spark.createDataFrame(records, ThreeColumnRecord.class).coalesce(1); - df.select("c1", "c2", "c3") - .write() - .format("iceberg") - .mode("append") - .save(table.location()); + df.select("c1", "c2", "c3").write().format("iceberg").mode("append").save(table.location()); df.write().mode("append").parquet(table.location() + "/data"); @@ -766,28 +678,30 @@ public void testRemoveOrphanFilesWithHadoopCatalog() throws InterruptedException table.refresh(); - DeleteOrphanFiles.Result result = SparkActions.get() - .deleteOrphanFiles(table) - .olderThan(System.currentTimeMillis()) - .execute(); + DeleteOrphanFiles.Result result = + SparkActions.get().deleteOrphanFiles(table).olderThan(System.currentTimeMillis()).execute(); - Assert.assertEquals("Should delete only 1 files", 1, Iterables.size(result.orphanFileLocations())); + Assert.assertEquals( + "Should delete only 1 files", 1, Iterables.size(result.orphanFileLocations())); Dataset resultDF = spark.read().format("iceberg").load(table.location()); - List actualRecords = resultDF - .as(Encoders.bean(ThreeColumnRecord.class)) - .collectAsList(); + List actualRecords = + resultDF.as(Encoders.bean(ThreeColumnRecord.class)).collectAsList(); Assert.assertEquals("Rows must match", records, actualRecords); } 
@Test public void testHiveCatalogTable() throws IOException { - Table table = catalog.createTable(TableIdentifier.of("default", "hivetestorphan"), SCHEMA, SPEC, tableLocation, - Maps.newHashMap()); + Table table = + catalog.createTable( + TableIdentifier.of("default", "hivetestorphan"), + SCHEMA, + SPEC, + tableLocation, + Maps.newHashMap()); - List records = Lists.newArrayList( - new ThreeColumnRecord(1, "AAAAAAAAAA", "AAAA") - ); + List records = + Lists.newArrayList(new ThreeColumnRecord(1, "AAAAAAAAAA", "AAAA")); Dataset df = spark.createDataFrame(records, ThreeColumnRecord.class).coalesce(1); @@ -800,35 +714,35 @@ public void testHiveCatalogTable() throws IOException { String location = table.location().replaceFirst("file:", ""); new File(location + "/data/trashfile").createNewFile(); - DeleteOrphanFiles.Result result = SparkActions.get().deleteOrphanFiles(table) - .olderThan(System.currentTimeMillis() + 1000).execute(); - Assert.assertTrue("trash file should be removed", + DeleteOrphanFiles.Result result = + SparkActions.get() + .deleteOrphanFiles(table) + .olderThan(System.currentTimeMillis() + 1000) + .execute(); + Assert.assertTrue( + "trash file should be removed", StreamSupport.stream(result.orphanFileLocations().spliterator(), false) .anyMatch(file -> file.contains("file:" + location + "data/trashfile"))); } @Test public void testGarbageCollectionDisabled() { - Table table = TABLES.create(SCHEMA, PartitionSpec.unpartitioned(), Maps.newHashMap(), tableLocation); + Table table = + TABLES.create(SCHEMA, PartitionSpec.unpartitioned(), Maps.newHashMap(), tableLocation); - List records = Lists.newArrayList( - new ThreeColumnRecord(1, "AAAAAAAAAA", "AAAA") - ); + List records = + Lists.newArrayList(new ThreeColumnRecord(1, "AAAAAAAAAA", "AAAA")); Dataset df = spark.createDataFrame(records, ThreeColumnRecord.class).coalesce(1); - df.select("c1", "c2", "c3") - .write() - .format("iceberg") - .mode("append") - .save(tableLocation); + df.select("c1", "c2", "c3").write().format("iceberg").mode("append").save(tableLocation); - table.updateProperties() - .set(TableProperties.GC_ENABLED, "false") - .commit(); + table.updateProperties().set(TableProperties.GC_ENABLED, "false").commit(); - AssertHelpers.assertThrows("Should complain about removing orphan files", - ValidationException.class, "Cannot delete orphan files: GC is disabled", + AssertHelpers.assertThrows( + "Should complain about removing orphan files", + ValidationException.class, + "Cannot delete orphan files: GC is disabled", () -> SparkActions.get().deleteOrphanFiles(table).execute()); } @@ -892,19 +806,21 @@ public void testCompareToFileList() throws IOException, InterruptedException { .withColumnRenamed("lastModified", "last_modified"); DeleteOrphanFiles.Result result1 = - actions.deleteOrphanFiles(table) + actions + .deleteOrphanFiles(table) .compareToFileList(compareToFileList) - .deleteWith(s -> { }) + .deleteWith(s -> {}) .execute(); Assert.assertTrue( "Default olderThan interval should be safe", Iterables.isEmpty(result1.orphanFileLocations())); DeleteOrphanFiles.Result result2 = - actions.deleteOrphanFiles(table) + actions + .deleteOrphanFiles(table) .compareToFileList(compareToFileList) .olderThan(System.currentTimeMillis()) - .deleteWith(s -> { }) + .deleteWith(s -> {}) .execute(); Assert.assertEquals( "Action should find 1 file", invalidFilePaths, result2.orphanFileLocations()); @@ -912,7 +828,8 @@ public void testCompareToFileList() throws IOException, InterruptedException { "Invalid file should be present", 
fs.exists(new Path(invalidFilePaths.get(0)))); DeleteOrphanFiles.Result result3 = - actions.deleteOrphanFiles(table) + actions + .deleteOrphanFiles(table) .compareToFileList(compareToFileList) .olderThan(System.currentTimeMillis()) .execute(); @@ -940,9 +857,10 @@ public void testCompareToFileList() throws IOException, InterruptedException { .withColumnRenamed("lastModified", "last_modified"); DeleteOrphanFiles.Result result4 = - actions.deleteOrphanFiles(table) + actions + .deleteOrphanFiles(table) .compareToFileList(compareToFileListWithOutsideLocation) - .deleteWith(s -> { }) + .deleteWith(s -> {}) .execute(); Assert.assertEquals( "Action should find nothing", Lists.newArrayList(), result4.orphanFileLocations()); diff --git a/spark/v3.3/spark/src/test/java/org/apache/iceberg/spark/actions/TestRemoveOrphanFilesAction3.java b/spark/v3.3/spark/src/test/java/org/apache/iceberg/spark/actions/TestRemoveOrphanFilesAction3.java index 77eb23a6dffc..e3699eaeded1 100644 --- a/spark/v3.3/spark/src/test/java/org/apache/iceberg/spark/actions/TestRemoveOrphanFilesAction3.java +++ b/spark/v3.3/spark/src/test/java/org/apache/iceberg/spark/actions/TestRemoveOrphanFilesAction3.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.actions; import java.io.File; @@ -54,9 +53,13 @@ public void testSparkCatalogTable() throws Exception { String location = table.table().location().replaceFirst("file:", ""); new File(location + "/data/trashfile").createNewFile(); - DeleteOrphanFiles.Result results = SparkActions.get().deleteOrphanFiles(table.table()) - .olderThan(System.currentTimeMillis() + 1000).execute(); - Assert.assertTrue("trash file should be removed", + DeleteOrphanFiles.Result results = + SparkActions.get() + .deleteOrphanFiles(table.table()) + .olderThan(System.currentTimeMillis() + 1000) + .execute(); + Assert.assertTrue( + "trash file should be removed", StreamSupport.stream(results.orphanFileLocations().spliterator(), false) .anyMatch(file -> file.contains("file:" + location + "/data/trashfile"))); } @@ -80,9 +83,13 @@ public void testSparkCatalogNamedHadoopTable() throws Exception { String location = table.table().location().replaceFirst("file:", ""); new File(location + "/data/trashfile").createNewFile(); - DeleteOrphanFiles.Result results = SparkActions.get().deleteOrphanFiles(table.table()) - .olderThan(System.currentTimeMillis() + 1000).execute(); - Assert.assertTrue("trash file should be removed", + DeleteOrphanFiles.Result results = + SparkActions.get() + .deleteOrphanFiles(table.table()) + .olderThan(System.currentTimeMillis() + 1000) + .execute(); + Assert.assertTrue( + "trash file should be removed", StreamSupport.stream(results.orphanFileLocations().spliterator(), false) .anyMatch(file -> file.contains("file:" + location + "/data/trashfile"))); } @@ -106,19 +113,26 @@ public void testSparkCatalogNamedHiveTable() throws Exception { String location = table.table().location().replaceFirst("file:", ""); new File(location + "/data/trashfile").createNewFile(); - DeleteOrphanFiles.Result results = SparkActions.get().deleteOrphanFiles(table.table()) - .olderThan(System.currentTimeMillis() + 1000).execute(); - Assert.assertTrue("trash file should be removed", + DeleteOrphanFiles.Result results = + SparkActions.get() + .deleteOrphanFiles(table.table()) + .olderThan(System.currentTimeMillis() + 1000) + .execute(); + Assert.assertTrue( + "trash file should be removed", 
StreamSupport.stream(results.orphanFileLocations().spliterator(), false) .anyMatch(file -> file.contains("file:" + location + "/data/trashfile"))); } @Test public void testSparkSessionCatalogHadoopTable() throws Exception { - spark.conf().set("spark.sql.catalog.spark_catalog", "org.apache.iceberg.spark.SparkSessionCatalog"); + spark + .conf() + .set("spark.sql.catalog.spark_catalog", "org.apache.iceberg.spark.SparkSessionCatalog"); spark.conf().set("spark.sql.catalog.spark_catalog.type", "hadoop"); spark.conf().set("spark.sql.catalog.spark_catalog.warehouse", tableLocation); - SparkSessionCatalog cat = (SparkSessionCatalog) spark.sessionState().catalogManager().v2SessionCatalog(); + SparkSessionCatalog cat = + (SparkSessionCatalog) spark.sessionState().catalogManager().v2SessionCatalog(); String[] database = {"default"}; Identifier id = Identifier.of(database, "table"); @@ -132,18 +146,25 @@ public void testSparkSessionCatalogHadoopTable() throws Exception { String location = table.table().location().replaceFirst("file:", ""); new File(location + "/data/trashfile").createNewFile(); - DeleteOrphanFiles.Result results = SparkActions.get().deleteOrphanFiles(table.table()) - .olderThan(System.currentTimeMillis() + 1000).execute(); - Assert.assertTrue("trash file should be removed", + DeleteOrphanFiles.Result results = + SparkActions.get() + .deleteOrphanFiles(table.table()) + .olderThan(System.currentTimeMillis() + 1000) + .execute(); + Assert.assertTrue( + "trash file should be removed", StreamSupport.stream(results.orphanFileLocations().spliterator(), false) .anyMatch(file -> file.contains("file:" + location + "/data/trashfile"))); } @Test public void testSparkSessionCatalogHiveTable() throws Exception { - spark.conf().set("spark.sql.catalog.spark_catalog", "org.apache.iceberg.spark.SparkSessionCatalog"); + spark + .conf() + .set("spark.sql.catalog.spark_catalog", "org.apache.iceberg.spark.SparkSessionCatalog"); spark.conf().set("spark.sql.catalog.spark_catalog.type", "hive"); - SparkSessionCatalog cat = (SparkSessionCatalog) spark.sessionState().catalogManager().v2SessionCatalog(); + SparkSessionCatalog cat = + (SparkSessionCatalog) spark.sessionState().catalogManager().v2SessionCatalog(); String[] database = {"default"}; Identifier id = Identifier.of(database, "sessioncattest"); @@ -158,9 +179,13 @@ public void testSparkSessionCatalogHiveTable() throws Exception { String location = table.table().location().replaceFirst("file:", ""); new File(location + "/data/trashfile").createNewFile(); - DeleteOrphanFiles.Result results = SparkActions.get().deleteOrphanFiles(table.table()) - .olderThan(System.currentTimeMillis() + 1000).execute(); - Assert.assertTrue("trash file should be removed", + DeleteOrphanFiles.Result results = + SparkActions.get() + .deleteOrphanFiles(table.table()) + .olderThan(System.currentTimeMillis() + 1000) + .execute(); + Assert.assertTrue( + "trash file should be removed", StreamSupport.stream(results.orphanFileLocations().spliterator(), false) .anyMatch(file -> file.contains("file:" + location + "/data/trashfile"))); } @@ -171,5 +196,4 @@ public void resetSparkSessionCatalog() throws Exception { spark.conf().unset("spark.sql.catalog.spark_catalog.type"); spark.conf().unset("spark.sql.catalog.spark_catalog.warehouse"); } - } diff --git a/spark/v3.3/spark/src/test/java/org/apache/iceberg/spark/actions/TestRewriteDataFilesAction.java b/spark/v3.3/spark/src/test/java/org/apache/iceberg/spark/actions/TestRewriteDataFilesAction.java index 133b4e632c99..ec8721709dd8 100644 --- 
a/spark/v3.3/spark/src/test/java/org/apache/iceberg/spark/actions/TestRewriteDataFilesAction.java +++ b/spark/v3.3/spark/src/test/java/org/apache/iceberg/spark/actions/TestRewriteDataFilesAction.java @@ -16,9 +16,21 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.actions; +import static org.apache.iceberg.types.Types.NestedField.optional; +import static org.apache.iceberg.types.Types.NestedField.required; +import static org.apache.spark.sql.functions.current_date; +import static org.apache.spark.sql.functions.date_add; +import static org.apache.spark.sql.functions.expr; +import static org.mockito.ArgumentMatchers.any; +import static org.mockito.ArgumentMatchers.argThat; +import static org.mockito.Mockito.doAnswer; +import static org.mockito.Mockito.doCallRealMethod; +import static org.mockito.Mockito.doReturn; +import static org.mockito.Mockito.doThrow; +import static org.mockito.Mockito.spy; + import java.io.File; import java.io.IOException; import java.io.UncheckedIOException; @@ -95,32 +107,18 @@ import org.mockito.ArgumentMatcher; import org.mockito.Mockito; -import static org.apache.iceberg.types.Types.NestedField.optional; -import static org.apache.iceberg.types.Types.NestedField.required; -import static org.apache.spark.sql.functions.current_date; -import static org.apache.spark.sql.functions.date_add; -import static org.apache.spark.sql.functions.expr; -import static org.mockito.ArgumentMatchers.any; -import static org.mockito.ArgumentMatchers.argThat; -import static org.mockito.Mockito.doAnswer; -import static org.mockito.Mockito.doCallRealMethod; -import static org.mockito.Mockito.doReturn; -import static org.mockito.Mockito.doThrow; -import static org.mockito.Mockito.spy; - public class TestRewriteDataFilesAction extends SparkTestBase { private static final int SCALE = 400000; private static final HadoopTables TABLES = new HadoopTables(new Configuration()); - private static final Schema SCHEMA = new Schema( - optional(1, "c1", Types.IntegerType.get()), - optional(2, "c2", Types.StringType.get()), - optional(3, "c3", Types.StringType.get()) - ); + private static final Schema SCHEMA = + new Schema( + optional(1, "c1", Types.IntegerType.get()), + optional(2, "c2", Types.StringType.get()), + optional(3, "c3", Types.StringType.get())); - @Rule - public TemporaryFolder temp = new TemporaryFolder(); + @Rule public TemporaryFolder temp = new TemporaryFolder(); private final FileRewriteCoordinator coordinator = FileRewriteCoordinator.get(); private final FileScanTaskSetManager manager = FileScanTaskSetManager.get(); @@ -189,10 +187,11 @@ public void testBinPackWithFilter() { shouldHaveFiles(table, 8); List expectedRecords = currentData(); - Result result = basicRewrite(table) - .filter(Expressions.equal("c1", 1)) - .filter(Expressions.startsWith("c2", "foo")) - .execute(); + Result result = + basicRewrite(table) + .filter(Expressions.equal("c1", 1)) + .filter(Expressions.startsWith("c2", "foo")) + .execute(); Assert.assertEquals("Action should rewrite 2 data files", 2, result.rewrittenDataFilesCount()); Assert.assertEquals("Action should add 1 data file", 1, result.addedDataFilesCount()); @@ -216,12 +215,17 @@ public void testBinPackAfterPartitionChange() { RewriteDataFiles.Result result = basicRewrite(table) .option(SortStrategy.MIN_INPUT_FILES, "1") - .option(SortStrategy.MIN_FILE_SIZE_BYTES, Integer.toString(averageFileSize(table) + 1000)) - .option(RewriteDataFiles.TARGET_FILE_SIZE_BYTES, 
Integer.toString(averageFileSize(table) + 1001)) + .option( + SortStrategy.MIN_FILE_SIZE_BYTES, Integer.toString(averageFileSize(table) + 1000)) + .option( + RewriteDataFiles.TARGET_FILE_SIZE_BYTES, + Integer.toString(averageFileSize(table) + 1001)) .execute(); - Assert.assertEquals("Should have 1 fileGroup because all files were not correctly partitioned", - 1, result.rewriteResults().size()); + Assert.assertEquals( + "Should have 1 fileGroup because all files were not correctly partitioned", + 1, + result.rewriteResults().size()); List<Object[]> postRewriteData = currentData(); assertEquals("We shouldn't have changed the data", originalData, postRewriteData); @@ -239,32 +243,33 @@ public void testBinPackWithDeletes() throws Exception { table.refresh(); CloseableIterable<FileScanTask> tasks = table.newScan().planFiles(); - List<DataFile> dataFiles = Lists.newArrayList(CloseableIterable.transform(tasks, FileScanTask::file)); + List<DataFile> dataFiles = + Lists.newArrayList(CloseableIterable.transform(tasks, FileScanTask::file)); int total = (int) dataFiles.stream().mapToLong(ContentFile::recordCount).sum(); RowDelta rowDelta = table.newRowDelta(); // add 1 delete file for data files 0, 1, 2 for (int i = 0; i < 3; i++) { - writePosDeletesToFile(table, dataFiles.get(i), 1) - .forEach(rowDelta::addDeletes); + writePosDeletesToFile(table, dataFiles.get(i), 1).forEach(rowDelta::addDeletes); } // add 2 delete files for data files 3, 4 for (int i = 3; i < 5; i++) { - writePosDeletesToFile(table, dataFiles.get(i), 2) - .forEach(rowDelta::addDeletes); + writePosDeletesToFile(table, dataFiles.get(i), 2).forEach(rowDelta::addDeletes); } rowDelta.commit(); table.refresh(); List<Object[]> expectedRecords = currentData(); - Result result = actions().rewriteDataFiles(table) - // do not include any file based on bin pack file size configs - .option(BinPackStrategy.MIN_FILE_SIZE_BYTES, "0") - .option(RewriteDataFiles.TARGET_FILE_SIZE_BYTES, Long.toString(Long.MAX_VALUE - 1)) - .option(BinPackStrategy.MAX_FILE_SIZE_BYTES, Long.toString(Long.MAX_VALUE)) - .option(BinPackStrategy.DELETE_FILE_THRESHOLD, "2") - .execute(); + Result result = + actions() + .rewriteDataFiles(table) + // do not include any file based on bin pack file size configs + .option(BinPackStrategy.MIN_FILE_SIZE_BYTES, "0") + .option(RewriteDataFiles.TARGET_FILE_SIZE_BYTES, Long.toString(Long.MAX_VALUE - 1)) + .option(BinPackStrategy.MAX_FILE_SIZE_BYTES, Long.toString(Long.MAX_VALUE)) + .option(BinPackStrategy.DELETE_FILE_THRESHOLD, "2") + .execute(); Assert.assertEquals("Action should rewrite 2 data files", 2, result.rewrittenDataFilesCount()); List<Object[]> actualRecords = currentData(); @@ -281,20 +286,22 @@ public void testBinPackWithDeleteAllData() { table.refresh(); CloseableIterable<FileScanTask> tasks = table.newScan().planFiles(); - List<DataFile> dataFiles = Lists.newArrayList(CloseableIterable.transform(tasks, FileScanTask::file)); + List<DataFile> dataFiles = + Lists.newArrayList(CloseableIterable.transform(tasks, FileScanTask::file)); int total = (int) dataFiles.stream().mapToLong(ContentFile::recordCount).sum(); RowDelta rowDelta = table.newRowDelta(); // remove all data - writePosDeletesToFile(table, dataFiles.get(0), total) - .forEach(rowDelta::addDeletes); + writePosDeletesToFile(table, dataFiles.get(0), total).forEach(rowDelta::addDeletes); rowDelta.commit(); table.refresh(); List<Object[]> expectedRecords = currentData(); - Result result = actions().rewriteDataFiles(table) - .option(BinPackStrategy.DELETE_FILE_THRESHOLD, "1") - .execute(); + Result result = + actions() + .rewriteDataFiles(table) +
.option(BinPackStrategy.DELETE_FILE_THRESHOLD, "1") + .execute(); Assert.assertEquals("Action should rewrite 1 data files", 1, result.rewrittenDataFilesCount()); List actualRecords = currentData(); @@ -303,7 +310,8 @@ public void testBinPackWithDeleteAllData() { "Data manifest should not have existing data file", 0, (long) table.currentSnapshot().dataManifests(table.io()).get(0).existingFilesCount()); - Assert.assertEquals("Data manifest should have 1 delete data file", + Assert.assertEquals( + "Data manifest should have 1 delete data file", 1L, (long) table.currentSnapshot().dataManifests(table.io()).get(0).deletedFilesCount()); Assert.assertEquals( @@ -321,9 +329,8 @@ public void testBinPackWithStartingSequenceNumber() { table.refresh(); long oldSequenceNumber = table.currentSnapshot().sequenceNumber(); - Result result = basicRewrite(table) - .option(RewriteDataFiles.USE_STARTING_SEQUENCE_NUMBER, "true") - .execute(); + Result result = + basicRewrite(table).option(RewriteDataFiles.USE_STARTING_SEQUENCE_NUMBER, "true").execute(); Assert.assertEquals("Action should rewrite 8 data files", 8, result.rewrittenDataFilesCount()); Assert.assertEquals("Action should add 4 data file", 4, result.addedDataFilesCount()); @@ -332,13 +339,15 @@ public void testBinPackWithStartingSequenceNumber() { assertEquals("Rows must match", expectedRecords, actualRecords); table.refresh(); - Assert.assertTrue("Table sequence number should be incremented", + Assert.assertTrue( + "Table sequence number should be incremented", oldSequenceNumber < table.currentSnapshot().sequenceNumber()); Dataset rows = SparkTableUtil.loadMetadataTable(spark, table, MetadataTableType.ENTRIES); for (Row row : rows.collectAsList()) { if (row.getInt(0) == 1) { - Assert.assertEquals("Expect old sequence number for added entries", oldSequenceNumber, row.getLong(2)); + Assert.assertEquals( + "Expect old sequence number for added entries", oldSequenceNumber, row.getLong(2)); } } } @@ -352,9 +361,8 @@ public void testBinPackWithStartingSequenceNumberV1Compatibility() { long oldSequenceNumber = table.currentSnapshot().sequenceNumber(); Assert.assertEquals("Table sequence number should be 0", 0, oldSequenceNumber); - Result result = basicRewrite(table) - .option(RewriteDataFiles.USE_STARTING_SEQUENCE_NUMBER, "true") - .execute(); + Result result = + basicRewrite(table).option(RewriteDataFiles.USE_STARTING_SEQUENCE_NUMBER, "true").execute(); Assert.assertEquals("Action should rewrite 8 data files", 8, result.rewrittenDataFilesCount()); Assert.assertEquals("Action should add 4 data file", 4, result.addedDataFilesCount()); @@ -363,13 +371,15 @@ public void testBinPackWithStartingSequenceNumberV1Compatibility() { assertEquals("Rows must match", expectedRecords, actualRecords); table.refresh(); - Assert.assertEquals("Table sequence number should still be 0", - oldSequenceNumber, table.currentSnapshot().sequenceNumber()); + Assert.assertEquals( + "Table sequence number should still be 0", + oldSequenceNumber, + table.currentSnapshot().sequenceNumber()); Dataset rows = SparkTableUtil.loadMetadataTable(spark, table, MetadataTableType.ENTRIES); for (Row row : rows.collectAsList()) { - Assert.assertEquals("Expect sequence number 0 for all entries", - oldSequenceNumber, row.getLong(2)); + Assert.assertEquals( + "Expect sequence number 0 for all entries", oldSequenceNumber, row.getLong(2)); } } @@ -392,19 +402,15 @@ public void testRewriteLargeTableHasResiduals() { table.refresh(); - CloseableIterable tasks = table.newScan() - .ignoreResiduals() - 
.filter(Expressions.equal("c3", "0")) - .planFiles(); + CloseableIterable tasks = + table.newScan().ignoreResiduals().filter(Expressions.equal("c3", "0")).planFiles(); for (FileScanTask task : tasks) { Assert.assertEquals("Residuals must be ignored", Expressions.alwaysTrue(), task.residual()); } shouldHaveFiles(table, 2); - Result result = basicRewrite(table) - .filter(Expressions.equal("c3", "0")) - .execute(); + Result result = basicRewrite(table).filter(Expressions.equal("c3", "0")).execute(); Assert.assertEquals("Action should rewrite 2 data files", 2, result.rewrittenDataFilesCount()); Assert.assertEquals("Action should add 1 data file", 1, result.addedDataFilesCount()); @@ -421,10 +427,11 @@ public void testBinPackSplitLargeFile() { List expectedRecords = currentData(); long targetSize = testDataSize(table) / 2; - Result result = basicRewrite(table) - .option(RewriteDataFiles.TARGET_FILE_SIZE_BYTES, Long.toString(targetSize)) - .option(BinPackStrategy.MAX_FILE_SIZE_BYTES, Long.toString(targetSize * 2 - 2000)) - .execute(); + Result result = + basicRewrite(table) + .option(RewriteDataFiles.TARGET_FILE_SIZE_BYTES, Long.toString(targetSize)) + .option(BinPackStrategy.MAX_FILE_SIZE_BYTES, Long.toString(targetSize * 2 - 2000)) + .execute(); Assert.assertEquals("Action should delete 1 data files", 1, result.rewrittenDataFilesCount()); Assert.assertEquals("Action should add 2 data files", 2, result.addedDataFilesCount()); @@ -449,14 +456,16 @@ public void testBinPackCombineMixedFiles() { int targetSize = averageFileSize(table); - Result result = basicRewrite(table) - .option(RewriteDataFiles.TARGET_FILE_SIZE_BYTES, Integer.toString(targetSize + 1000)) - .option(BinPackStrategy.MAX_FILE_SIZE_BYTES, Integer.toString(targetSize + 80000)) - .option(BinPackStrategy.MIN_FILE_SIZE_BYTES, Integer.toString(targetSize - 1000)) - .execute(); + Result result = + basicRewrite(table) + .option(RewriteDataFiles.TARGET_FILE_SIZE_BYTES, Integer.toString(targetSize + 1000)) + .option(BinPackStrategy.MAX_FILE_SIZE_BYTES, Integer.toString(targetSize + 80000)) + .option(BinPackStrategy.MIN_FILE_SIZE_BYTES, Integer.toString(targetSize - 1000)) + .execute(); Assert.assertEquals("Action should delete 3 data files", 3, result.rewrittenDataFilesCount()); - // Should Split the big files into 3 pieces, one of which should be combined with the two smaller files + // Should Split the big files into 3 pieces, one of which should be combined with the two + // smaller files Assert.assertEquals("Action should add 3 data files", 3, result.addedDataFilesCount()); shouldHaveFiles(table, 3); @@ -474,11 +483,14 @@ public void testBinPackCombineMediumFiles() { int targetSize = ((int) testDataSize(table) / 3); // The test is to see if we can combine parts of files to make files of the correct size - Result result = basicRewrite(table) - .option(RewriteDataFiles.TARGET_FILE_SIZE_BYTES, Integer.toString(targetSize)) - .option(BinPackStrategy.MAX_FILE_SIZE_BYTES, Integer.toString((int) (targetSize * 1.8))) - .option(BinPackStrategy.MIN_FILE_SIZE_BYTES, Integer.toString(targetSize - 100)) // All files too small - .execute(); + Result result = + basicRewrite(table) + .option(RewriteDataFiles.TARGET_FILE_SIZE_BYTES, Integer.toString(targetSize)) + .option(BinPackStrategy.MAX_FILE_SIZE_BYTES, Integer.toString((int) (targetSize * 1.8))) + .option( + BinPackStrategy.MIN_FILE_SIZE_BYTES, + Integer.toString(targetSize - 100)) // All files too small + .execute(); Assert.assertEquals("Action should delete 4 data files", 4, 
result.rewrittenDataFilesCount()); Assert.assertEquals("Action should add 3 data files", 3, result.addedDataFilesCount()); @@ -500,7 +512,8 @@ public void testPartialProgressEnabled() { RewriteDataFiles.Result result = basicRewrite(table) .option(RewriteDataFiles.PARTIAL_PROGRESS_ENABLED, "true") - .option(RewriteDataFiles.MAX_FILE_GROUP_SIZE_BYTES, Integer.toString(fileSize * 2 + 1000)) + .option( + RewriteDataFiles.MAX_FILE_GROUP_SIZE_BYTES, Integer.toString(fileSize * 2 + 1000)) .option(RewriteDataFiles.PARTIAL_PROGRESS_MAX_COMMITS, "10") .execute(); @@ -525,7 +538,8 @@ public void testMultipleGroups() { // Perform a rewrite but only allow 2 files to be compacted at a time RewriteDataFiles.Result result = basicRewrite(table) - .option(RewriteDataFiles.MAX_FILE_GROUP_SIZE_BYTES, Integer.toString(fileSize * 2 + 1000)) + .option( + RewriteDataFiles.MAX_FILE_GROUP_SIZE_BYTES, Integer.toString(fileSize * 2 + 1000)) .option(BinPackStrategy.MIN_INPUT_FILES, "1") .execute(); @@ -550,7 +564,8 @@ public void testPartialProgressMaxCommits() { // Perform a rewrite but only allow 2 files to be compacted at a time RewriteDataFiles.Result result = basicRewrite(table) - .option(RewriteDataFiles.MAX_FILE_GROUP_SIZE_BYTES, Integer.toString(fileSize * 2 + 1000)) + .option( + RewriteDataFiles.MAX_FILE_GROUP_SIZE_BYTES, Integer.toString(fileSize * 2 + 1000)) .option(RewriteDataFiles.PARTIAL_PROGRESS_ENABLED, "true") .option(RewriteDataFiles.PARTIAL_PROGRESS_MAX_COMMITS, "3") .execute(); @@ -573,8 +588,10 @@ public void testSingleCommitWithRewriteFailure() { List originalData = currentData(); - RewriteDataFilesSparkAction realRewrite = basicRewrite(table) - .option(RewriteDataFiles.MAX_FILE_GROUP_SIZE_BYTES, Integer.toString(fileSize * 2 + 1000)); + RewriteDataFilesSparkAction realRewrite = + basicRewrite(table) + .option( + RewriteDataFiles.MAX_FILE_GROUP_SIZE_BYTES, Integer.toString(fileSize * 2 + 1000)); RewriteDataFilesSparkAction spyRewrite = Mockito.spy(realRewrite); @@ -584,7 +601,9 @@ public void testSingleCommitWithRewriteFailure() { .when(spyRewrite) .rewriteFiles(any(), argThat(failGroup)); - AssertHelpers.assertThrows("Should fail entire rewrite if part fails", RuntimeException.class, + AssertHelpers.assertThrows( + "Should fail entire rewrite if part fails", + RuntimeException.class, () -> spyRewrite.execute()); table.refresh(); @@ -604,22 +623,22 @@ public void testSingleCommitWithCommitFailure() { List originalData = currentData(); - RewriteDataFilesSparkAction realRewrite = basicRewrite(table) - .option(RewriteDataFiles.MAX_FILE_GROUP_SIZE_BYTES, Integer.toString(fileSize * 2 + 1000)); + RewriteDataFilesSparkAction realRewrite = + basicRewrite(table) + .option( + RewriteDataFiles.MAX_FILE_GROUP_SIZE_BYTES, Integer.toString(fileSize * 2 + 1000)); RewriteDataFilesSparkAction spyRewrite = spy(realRewrite); RewriteDataFilesCommitManager util = spy(new RewriteDataFilesCommitManager(table)); // Fail to commit - doThrow(new RuntimeException("Commit Failure")) - .when(util) - .commitFileGroups(any()); + doThrow(new RuntimeException("Commit Failure")).when(util).commitFileGroups(any()); - doReturn(util) - .when(spyRewrite) - .commitManager(table.currentSnapshot().snapshotId()); + doReturn(util).when(spyRewrite).commitManager(table.currentSnapshot().snapshotId()); - AssertHelpers.assertThrows("Should fail entire rewrite if commit fails", RuntimeException.class, + AssertHelpers.assertThrows( + "Should fail entire rewrite if commit fails", + RuntimeException.class, () -> spyRewrite.execute()); 
table.refresh(); @@ -639,9 +658,11 @@ public void testParallelSingleCommitWithRewriteFailure() { List originalData = currentData(); - RewriteDataFilesSparkAction realRewrite = basicRewrite(table) - .option(RewriteDataFiles.MAX_FILE_GROUP_SIZE_BYTES, Integer.toString(fileSize * 2 + 1000)) - .option(RewriteDataFiles.MAX_CONCURRENT_FILE_GROUP_REWRITES, "3"); + RewriteDataFilesSparkAction realRewrite = + basicRewrite(table) + .option( + RewriteDataFiles.MAX_FILE_GROUP_SIZE_BYTES, Integer.toString(fileSize * 2 + 1000)) + .option(RewriteDataFiles.MAX_CONCURRENT_FILE_GROUP_REWRITES, "3"); RewriteDataFilesSparkAction spyRewrite = Mockito.spy(realRewrite); @@ -651,7 +672,9 @@ public void testParallelSingleCommitWithRewriteFailure() { .when(spyRewrite) .rewriteFiles(any(), argThat(failGroup)); - AssertHelpers.assertThrows("Should fail entire rewrite if part fails", RuntimeException.class, + AssertHelpers.assertThrows( + "Should fail entire rewrite if part fails", + RuntimeException.class, () -> spyRewrite.execute()); table.refresh(); @@ -671,10 +694,12 @@ public void testPartialProgressWithRewriteFailure() { List originalData = currentData(); - RewriteDataFilesSparkAction realRewrite = basicRewrite(table) - .option(RewriteDataFiles.MAX_FILE_GROUP_SIZE_BYTES, Integer.toString(fileSize * 2 + 1000)) - .option(RewriteDataFiles.PARTIAL_PROGRESS_ENABLED, "true") - .option(RewriteDataFiles.PARTIAL_PROGRESS_MAX_COMMITS, "3"); + RewriteDataFilesSparkAction realRewrite = + basicRewrite(table) + .option( + RewriteDataFiles.MAX_FILE_GROUP_SIZE_BYTES, Integer.toString(fileSize * 2 + 1000)) + .option(RewriteDataFiles.PARTIAL_PROGRESS_ENABLED, "true") + .option(RewriteDataFiles.PARTIAL_PROGRESS_MAX_COMMITS, "3"); RewriteDataFilesSparkAction spyRewrite = Mockito.spy(realRewrite); @@ -707,11 +732,13 @@ public void testParallelPartialProgressWithRewriteFailure() { List originalData = currentData(); - RewriteDataFilesSparkAction realRewrite = basicRewrite(table) - .option(RewriteDataFiles.MAX_FILE_GROUP_SIZE_BYTES, Integer.toString(fileSize * 2 + 1000)) - .option(RewriteDataFiles.MAX_CONCURRENT_FILE_GROUP_REWRITES, "3") - .option(RewriteDataFiles.PARTIAL_PROGRESS_ENABLED, "true") - .option(RewriteDataFiles.PARTIAL_PROGRESS_MAX_COMMITS, "3"); + RewriteDataFilesSparkAction realRewrite = + basicRewrite(table) + .option( + RewriteDataFiles.MAX_FILE_GROUP_SIZE_BYTES, Integer.toString(fileSize * 2 + 1000)) + .option(RewriteDataFiles.MAX_CONCURRENT_FILE_GROUP_REWRITES, "3") + .option(RewriteDataFiles.PARTIAL_PROGRESS_ENABLED, "true") + .option(RewriteDataFiles.PARTIAL_PROGRESS_MAX_COMMITS, "3"); RewriteDataFilesSparkAction spyRewrite = Mockito.spy(realRewrite); @@ -744,11 +771,13 @@ public void testParallelPartialProgressWithCommitFailure() { List originalData = currentData(); - RewriteDataFilesSparkAction realRewrite = basicRewrite(table) - .option(RewriteDataFiles.MAX_FILE_GROUP_SIZE_BYTES, Integer.toString(fileSize * 2 + 1000)) - .option(RewriteDataFiles.MAX_CONCURRENT_FILE_GROUP_REWRITES, "3") - .option(RewriteDataFiles.PARTIAL_PROGRESS_ENABLED, "true") - .option(RewriteDataFiles.PARTIAL_PROGRESS_MAX_COMMITS, "3"); + RewriteDataFilesSparkAction realRewrite = + basicRewrite(table) + .option( + RewriteDataFiles.MAX_FILE_GROUP_SIZE_BYTES, Integer.toString(fileSize * 2 + 1000)) + .option(RewriteDataFiles.MAX_CONCURRENT_FILE_GROUP_REWRITES, "3") + .option(RewriteDataFiles.PARTIAL_PROGRESS_ENABLED, "true") + .option(RewriteDataFiles.PARTIAL_PROGRESS_MAX_COMMITS, "3"); RewriteDataFilesSparkAction spyRewrite = spy(realRewrite); 
RewriteDataFilesCommitManager util = spy(new RewriteDataFilesCommitManager(table)); @@ -760,9 +789,7 @@ public void testParallelPartialProgressWithCommitFailure() { .when(util) .commitFileGroups(any()); - doReturn(util) - .when(spyRewrite) - .commitManager(table.currentSnapshot().snapshotId()); + doReturn(util).when(spyRewrite).commitManager(table.currentSnapshot().snapshotId()); RewriteDataFiles.Result result = spyRewrite.execute(); @@ -784,30 +811,32 @@ public void testParallelPartialProgressWithCommitFailure() { public void testInvalidOptions() { Table table = createTable(20); - AssertHelpers.assertThrows("No negative values for partial progress max commits", + AssertHelpers.assertThrows( + "No negative values for partial progress max commits", IllegalArgumentException.class, - () -> basicRewrite(table) - .option(RewriteDataFiles.PARTIAL_PROGRESS_ENABLED, "true") - .option(RewriteDataFiles.PARTIAL_PROGRESS_MAX_COMMITS, "-5") - .execute()); + () -> + basicRewrite(table) + .option(RewriteDataFiles.PARTIAL_PROGRESS_ENABLED, "true") + .option(RewriteDataFiles.PARTIAL_PROGRESS_MAX_COMMITS, "-5") + .execute()); - AssertHelpers.assertThrows("No negative values for max concurrent groups", + AssertHelpers.assertThrows( + "No negative values for max concurrent groups", IllegalArgumentException.class, - () -> basicRewrite(table) - .option(RewriteDataFiles.MAX_CONCURRENT_FILE_GROUP_REWRITES, "-5") - .execute()); + () -> + basicRewrite(table) + .option(RewriteDataFiles.MAX_CONCURRENT_FILE_GROUP_REWRITES, "-5") + .execute()); - AssertHelpers.assertThrows("No unknown options allowed", + AssertHelpers.assertThrows( + "No unknown options allowed", IllegalArgumentException.class, - () -> basicRewrite(table) - .option("foobarity", "-5") - .execute()); + () -> basicRewrite(table).option("foobarity", "-5").execute()); - AssertHelpers.assertThrows("Cannot set rewrite-job-order to foo", + AssertHelpers.assertThrows( + "Cannot set rewrite-job-order to foo", IllegalArgumentException.class, - () -> basicRewrite(table) - .option(RewriteDataFiles.REWRITE_JOB_ORDER, "foo") - .execute()); + () -> basicRewrite(table).option(RewriteDataFiles.REWRITE_JOB_ORDER, "foo").execute()); } @Test @@ -825,7 +854,8 @@ public void testSortMultipleGroups() { basicRewrite(table) .sort() .option(SortStrategy.REWRITE_ALL, "true") - .option(RewriteDataFiles.MAX_FILE_GROUP_SIZE_BYTES, Integer.toString(fileSize * 2 + 1000)) + .option( + RewriteDataFiles.MAX_FILE_GROUP_SIZE_BYTES, Integer.toString(fileSize * 2 + 1000)) .execute(); Assert.assertEquals("Should have 10 fileGroups", result.rewriteResults().size(), 10); @@ -853,7 +883,8 @@ public void testSimpleSort() { .sort() .option(SortStrategy.MIN_INPUT_FILES, "1") .option(SortStrategy.REWRITE_ALL, "true") - .option(RewriteDataFiles.TARGET_FILE_SIZE_BYTES, Integer.toString(averageFileSize(table))) + .option( + RewriteDataFiles.TARGET_FILE_SIZE_BYTES, Integer.toString(averageFileSize(table))) .execute(); Assert.assertEquals("Should have 1 fileGroups", result.rewriteResults().size(), 1); @@ -884,11 +915,14 @@ public void testSortAfterPartitionChange() { .sort() .option(SortStrategy.MIN_INPUT_FILES, "1") .option(SortStrategy.REWRITE_ALL, "true") - .option(RewriteDataFiles.TARGET_FILE_SIZE_BYTES, Integer.toString(averageFileSize(table))) + .option( + RewriteDataFiles.TARGET_FILE_SIZE_BYTES, Integer.toString(averageFileSize(table))) .execute(); - Assert.assertEquals("Should have 1 fileGroup because all files were not correctly partitioned", - result.rewriteResults().size(), 1); + 
Assert.assertEquals( + "Should have 1 fileGroup because all files were not correctly partitioned", + result.rewriteResults().size(), + 1); table.refresh(); @@ -913,7 +947,8 @@ public void testSortCustomSortOrder() { basicRewrite(table) .sort(SortOrder.builderFor(table.schema()).asc("c2").build()) .option(SortStrategy.REWRITE_ALL, "true") - .option(RewriteDataFiles.TARGET_FILE_SIZE_BYTES, Integer.toString(averageFileSize(table))) + .option( + RewriteDataFiles.TARGET_FILE_SIZE_BYTES, Integer.toString(averageFileSize(table))) .execute(); Assert.assertEquals("Should have 1 fileGroups", result.rewriteResults().size(), 1); @@ -948,7 +983,9 @@ public void testSortCustomSortOrderRequiresRepartition() { basicRewrite(table) .sort(SortOrder.builderFor(table.schema()).asc("c3").build()) .option(SortStrategy.REWRITE_ALL, "true") - .option(RewriteDataFiles.TARGET_FILE_SIZE_BYTES, Integer.toString(averageFileSize(table) / partitions)) + .option( + RewriteDataFiles.TARGET_FILE_SIZE_BYTES, + Integer.toString(averageFileSize(table) / partitions)) .execute(); Assert.assertEquals("Should have 1 fileGroups", result.rewriteResults().size(), 1); @@ -976,14 +1013,19 @@ public void testAutoSortShuffleOutput() { RewriteDataFiles.Result result = basicRewrite(table) .sort(SortOrder.builderFor(table.schema()).asc("c2").build()) - .option(SortStrategy.MAX_FILE_SIZE_BYTES, Integer.toString((averageFileSize(table) / 2) + 2)) + .option( + SortStrategy.MAX_FILE_SIZE_BYTES, + Integer.toString((averageFileSize(table) / 2) + 2)) // Divide files in 2 - .option(RewriteDataFiles.TARGET_FILE_SIZE_BYTES, Integer.toString(averageFileSize(table) / 2)) + .option( + RewriteDataFiles.TARGET_FILE_SIZE_BYTES, + Integer.toString(averageFileSize(table) / 2)) .option(SortStrategy.MIN_INPUT_FILES, "1") .execute(); Assert.assertEquals("Should have 1 fileGroups", result.rewriteResults().size(), 1); - Assert.assertTrue("Should have written 40+ files", + Assert.assertTrue( + "Should have written 40+ files", Iterables.size(table.currentSnapshot().addedDataFiles(table.io())) >= 40); table.refresh(); @@ -1008,17 +1050,20 @@ public void testCommitStateUnknownException() { RewriteDataFilesSparkAction spyAction = spy(action); RewriteDataFilesCommitManager util = spy(new RewriteDataFilesCommitManager(table)); - doAnswer(invocationOnMock -> { - invocationOnMock.callRealMethod(); - throw new CommitStateUnknownException(new RuntimeException("Unknown State")); - }).when(util).commitFileGroups(any()); + doAnswer( + invocationOnMock -> { + invocationOnMock.callRealMethod(); + throw new CommitStateUnknownException(new RuntimeException("Unknown State")); + }) + .when(util) + .commitFileGroups(any()); - doReturn(util) - .when(spyAction) - .commitManager(table.currentSnapshot().snapshotId()); + doReturn(util).when(spyAction).commitManager(table.currentSnapshot().snapshotId()); - AssertHelpers.assertThrows("Should propagate CommitStateUnknown Exception", - CommitStateUnknownException.class, () -> spyAction.execute()); + AssertHelpers.assertThrows( + "Should propagate CommitStateUnknown Exception", + CommitStateUnknownException.class, + () -> spyAction.execute()); List postRewriteData = currentData(); assertEquals("We shouldn't have changed the data", originalData, postRewriteData); @@ -1036,7 +1081,8 @@ public void testZOrderSort() { List originalData = currentData(); double originalFilesC2 = percentFilesRequired(table, "c2", "foo23"); double originalFilesC3 = percentFilesRequired(table, "c3", "bar21"); - double originalFilesC2C3 = percentFilesRequired(table, new 
String[]{"c2", "c3"}, new String[]{"foo23", "bar23"}); + double originalFilesC2C3 = + percentFilesRequired(table, new String[] {"c2", "c3"}, new String[] {"foo23", "bar23"}); Assert.assertTrue("Should require all files to scan c2", originalFilesC2 > 0.99); Assert.assertTrue("Should require all files to scan c3", originalFilesC3 > 0.99); @@ -1044,9 +1090,13 @@ public void testZOrderSort() { RewriteDataFiles.Result result = basicRewrite(table) .zOrder("c2", "c3") - .option(SortStrategy.MAX_FILE_SIZE_BYTES, Integer.toString((averageFileSize(table) / 2) + 2)) + .option( + SortStrategy.MAX_FILE_SIZE_BYTES, + Integer.toString((averageFileSize(table) / 2) + 2)) // Divide files in 2 - .option(RewriteDataFiles.TARGET_FILE_SIZE_BYTES, Integer.toString(averageFileSize(table) / 2)) + .option( + RewriteDataFiles.TARGET_FILE_SIZE_BYTES, + Integer.toString(averageFileSize(table) / 2)) .option(SortStrategy.MIN_INPUT_FILES, "1") .execute(); @@ -1064,13 +1114,17 @@ public void testZOrderSort() { double filesScannedC2 = percentFilesRequired(table, "c2", "foo23"); double filesScannedC3 = percentFilesRequired(table, "c3", "bar21"); - double filesScannedC2C3 = percentFilesRequired(table, new String[]{"c2", "c3"}, new String[]{"foo23", "bar23"}); + double filesScannedC2C3 = + percentFilesRequired(table, new String[] {"c2", "c3"}, new String[] {"foo23", "bar23"}); - Assert.assertTrue("Should have reduced the number of files required for c2", + Assert.assertTrue( + "Should have reduced the number of files required for c2", filesScannedC2 < originalFilesC2); - Assert.assertTrue("Should have reduced the number of files required for c3", + Assert.assertTrue( + "Should have reduced the number of files required for c3", filesScannedC3 < originalFilesC3); - Assert.assertTrue("Should have reduced the number of files required for a c2,c3 predicate", + Assert.assertTrue( + "Should have reduced the number of files required for a c2,c3 predicate", filesScannedC2C3 < originalFilesC2C3); } @@ -1079,13 +1133,22 @@ public void testZOrderAllTypesSort() { Table table = createTypeTestTable(); shouldHaveFiles(table, 10); - List originalRaw = spark.read().format("iceberg").load(tableLocation).sort("longCol").collectAsList(); + List originalRaw = + spark.read().format("iceberg").load(tableLocation).sort("longCol").collectAsList(); List originalData = rowsToJava(originalRaw); // TODO add in UUID when it is supported in Spark RewriteDataFiles.Result result = basicRewrite(table) - .zOrder("longCol", "intCol", "floatCol", "doubleCol", "dateCol", "timestampCol", "stringCol", "binaryCol", + .zOrder( + "longCol", + "intCol", + "floatCol", + "doubleCol", + "dateCol", + "timestampCol", + "stringCol", + "binaryCol", "booleanCol") .option(SortStrategy.MIN_INPUT_FILES, "1") .option(SortStrategy.REWRITE_ALL, "true") @@ -1097,7 +1160,8 @@ public void testZOrderAllTypesSort() { table.refresh(); - List postRaw = spark.read().format("iceberg").load(tableLocation).sort("longCol").collectAsList(); + List postRaw = + spark.read().format("iceberg").load(tableLocation).sort("longCol").collectAsList(); List postRewriteData = rowsToJava(postRaw); assertEquals("We shouldn't have changed the data", originalData, postRewriteData); @@ -1109,18 +1173,23 @@ public void testZOrderAllTypesSort() { public void testInvalidAPIUsage() { Table table = createTable(1); - AssertHelpers.assertThrows("Should be unable to set Strategy more than once", IllegalArgumentException.class, - "Cannot set strategy", () -> actions().rewriteDataFiles(table).binPack().sort()); + 
AssertHelpers.assertThrows( + "Should be unable to set Strategy more than once", + IllegalArgumentException.class, + "Cannot set strategy", + () -> actions().rewriteDataFiles(table).binPack().sort()); - AssertHelpers.assertThrows("Should be unable to set Strategy more than once", IllegalArgumentException.class, - "Cannot set strategy", () -> actions().rewriteDataFiles(table).sort().binPack()); + AssertHelpers.assertThrows( + "Should be unable to set Strategy more than once", + IllegalArgumentException.class, + "Cannot set strategy", + () -> actions().rewriteDataFiles(table).sort().binPack()); AssertHelpers.assertThrows( "Should be unable to set Strategy more than once", IllegalArgumentException.class, "Cannot set strategy", - () -> - actions().rewriteDataFiles(table).sort(SortOrder.unsorted()).binPack()); + () -> actions().rewriteDataFiles(table).sort(SortOrder.unsorted()).binPack()); } @Test @@ -1133,18 +1202,21 @@ public void testRewriteJobOrderBytesAsc() { table.updateProperties().set(TableProperties.FORMAT_VERSION, "2").commit(); RewriteDataFilesSparkAction basicRewrite = basicRewrite(table).binPack(); - List expected = toGroupStream(table, basicRewrite) - .mapToLong(RewriteFileGroup::sizeInBytes) - .boxed() - .collect(Collectors.toList()); - - RewriteDataFilesSparkAction jobOrderRewrite = basicRewrite(table) - .option(RewriteDataFiles.REWRITE_JOB_ORDER, RewriteJobOrder.BYTES_ASC.orderName()) - .binPack(); - List actual = toGroupStream(table, jobOrderRewrite) - .mapToLong(RewriteFileGroup::sizeInBytes) - .boxed() - .collect(Collectors.toList()); + List expected = + toGroupStream(table, basicRewrite) + .mapToLong(RewriteFileGroup::sizeInBytes) + .boxed() + .collect(Collectors.toList()); + + RewriteDataFilesSparkAction jobOrderRewrite = + basicRewrite(table) + .option(RewriteDataFiles.REWRITE_JOB_ORDER, RewriteJobOrder.BYTES_ASC.orderName()) + .binPack(); + List actual = + toGroupStream(table, jobOrderRewrite) + .mapToLong(RewriteFileGroup::sizeInBytes) + .boxed() + .collect(Collectors.toList()); expected.sort(Comparator.naturalOrder()); Assert.assertEquals("Size in bytes order should be ascending", actual, expected); @@ -1162,18 +1234,21 @@ public void testRewriteJobOrderBytesDesc() { table.updateProperties().set(TableProperties.FORMAT_VERSION, "2").commit(); RewriteDataFilesSparkAction basicRewrite = basicRewrite(table).binPack(); - List expected = toGroupStream(table, basicRewrite) - .mapToLong(RewriteFileGroup::sizeInBytes) - .boxed() - .collect(Collectors.toList()); - - RewriteDataFilesSparkAction jobOrderRewrite = basicRewrite(table) - .option(RewriteDataFiles.REWRITE_JOB_ORDER, RewriteJobOrder.BYTES_DESC.orderName()) - .binPack(); - List actual = toGroupStream(table, jobOrderRewrite) - .mapToLong(RewriteFileGroup::sizeInBytes) - .boxed() - .collect(Collectors.toList()); + List expected = + toGroupStream(table, basicRewrite) + .mapToLong(RewriteFileGroup::sizeInBytes) + .boxed() + .collect(Collectors.toList()); + + RewriteDataFilesSparkAction jobOrderRewrite = + basicRewrite(table) + .option(RewriteDataFiles.REWRITE_JOB_ORDER, RewriteJobOrder.BYTES_DESC.orderName()) + .binPack(); + List actual = + toGroupStream(table, jobOrderRewrite) + .mapToLong(RewriteFileGroup::sizeInBytes) + .boxed() + .collect(Collectors.toList()); expected.sort(Comparator.reverseOrder()); Assert.assertEquals("Size in bytes order should be descending", actual, expected); @@ -1191,18 +1266,21 @@ public void testRewriteJobOrderFilesAsc() { table.updateProperties().set(TableProperties.FORMAT_VERSION, 
"2").commit(); RewriteDataFilesSparkAction basicRewrite = basicRewrite(table).binPack(); - List expected = toGroupStream(table, basicRewrite) - .mapToLong(RewriteFileGroup::numFiles) - .boxed() - .collect(Collectors.toList()); - - RewriteDataFilesSparkAction jobOrderRewrite = basicRewrite(table) - .option(RewriteDataFiles.REWRITE_JOB_ORDER, RewriteJobOrder.FILES_ASC.orderName()) - .binPack(); - List actual = toGroupStream(table, jobOrderRewrite) - .mapToLong(RewriteFileGroup::numFiles) - .boxed() - .collect(Collectors.toList()); + List expected = + toGroupStream(table, basicRewrite) + .mapToLong(RewriteFileGroup::numFiles) + .boxed() + .collect(Collectors.toList()); + + RewriteDataFilesSparkAction jobOrderRewrite = + basicRewrite(table) + .option(RewriteDataFiles.REWRITE_JOB_ORDER, RewriteJobOrder.FILES_ASC.orderName()) + .binPack(); + List actual = + toGroupStream(table, jobOrderRewrite) + .mapToLong(RewriteFileGroup::numFiles) + .boxed() + .collect(Collectors.toList()); expected.sort(Comparator.naturalOrder()); Assert.assertEquals("Number of files order should be ascending", actual, expected); @@ -1220,18 +1298,21 @@ public void testRewriteJobOrderFilesDesc() { table.updateProperties().set(TableProperties.FORMAT_VERSION, "2").commit(); RewriteDataFilesSparkAction basicRewrite = basicRewrite(table).binPack(); - List expected = toGroupStream(table, basicRewrite) - .mapToLong(RewriteFileGroup::numFiles) - .boxed() - .collect(Collectors.toList()); - - RewriteDataFilesSparkAction jobOrderRewrite = basicRewrite(table) - .option(RewriteDataFiles.REWRITE_JOB_ORDER, RewriteJobOrder.FILES_DESC.orderName()) - .binPack(); - List actual = toGroupStream(table, jobOrderRewrite) - .mapToLong(RewriteFileGroup::numFiles) - .boxed() - .collect(Collectors.toList()); + List expected = + toGroupStream(table, basicRewrite) + .mapToLong(RewriteFileGroup::numFiles) + .boxed() + .collect(Collectors.toList()); + + RewriteDataFilesSparkAction jobOrderRewrite = + basicRewrite(table) + .option(RewriteDataFiles.REWRITE_JOB_ORDER, RewriteJobOrder.FILES_DESC.orderName()) + .binPack(); + List actual = + toGroupStream(table, jobOrderRewrite) + .mapToLong(RewriteFileGroup::numFiles) + .boxed() + .collect(Collectors.toList()); expected.sort(Comparator.reverseOrder()); Assert.assertEquals("Number of files order should be descending", actual, expected); @@ -1249,9 +1330,8 @@ private Stream toGroupStream(Table table, RewriteDataFilesSpar } protected List currentData() { - return rowsToJava(spark.read().format("iceberg").load(tableLocation) - .sort("c1", "c2", "c3") - .collectAsList()); + return rowsToJava( + spark.read().format("iceberg").load(tableLocation).sort("c1", "c2", "c3").collectAsList()); } protected long testDataSize(Table table) { @@ -1273,84 +1353,102 @@ protected void shouldHaveFiles(Table table, int numExpected) { protected void shouldHaveSnapshots(Table table, int expectedSnapshots) { table.refresh(); int actualSnapshots = Iterables.size(table.snapshots()); - Assert.assertEquals("Table did not have the expected number of snapshots", - expectedSnapshots, actualSnapshots); + Assert.assertEquals( + "Table did not have the expected number of snapshots", expectedSnapshots, actualSnapshots); } protected void shouldHaveNoOrphans(Table table) { - Assert.assertEquals("Should not have found any orphan files", ImmutableList.of(), - actions().deleteOrphanFiles(table) + Assert.assertEquals( + "Should not have found any orphan files", + ImmutableList.of(), + actions() + .deleteOrphanFiles(table) 
.olderThan(System.currentTimeMillis()) .execute() .orphanFileLocations()); } protected void shouldHaveACleanCache(Table table) { - Assert.assertEquals("Should not have any entries in cache", ImmutableSet.of(), - cacheContents(table)); + Assert.assertEquals( + "Should not have any entries in cache", ImmutableSet.of(), cacheContents(table)); } protected void shouldHaveLastCommitSorted(Table table, String column) { - List, Pair>> - overlappingFiles = checkForOverlappingFiles(table, column); + List, Pair>> overlappingFiles = checkForOverlappingFiles(table, column); Assert.assertEquals("Found overlapping files", Collections.emptyList(), overlappingFiles); } protected void shouldHaveLastCommitUnsorted(Table table, String column) { - List, Pair>> - overlappingFiles = checkForOverlappingFiles(table, column); + List, Pair>> overlappingFiles = checkForOverlappingFiles(table, column); Assert.assertNotEquals("Found no overlapping files", Collections.emptyList(), overlappingFiles); } private Pair boundsOf(DataFile file, NestedField field, Class javaClass) { int columnId = field.fieldId(); - return Pair.of(javaClass.cast(Conversions.fromByteBuffer(field.type(), file.lowerBounds().get(columnId))), + return Pair.of( + javaClass.cast(Conversions.fromByteBuffer(field.type(), file.lowerBounds().get(columnId))), javaClass.cast(Conversions.fromByteBuffer(field.type(), file.upperBounds().get(columnId)))); } - - private List, Pair>> checkForOverlappingFiles(Table table, String column) { + private List, Pair>> checkForOverlappingFiles( + Table table, String column) { table.refresh(); NestedField field = table.schema().caseInsensitiveFindField(column); Class javaClass = (Class) field.type().typeId().javaClass(); Snapshot snapshot = table.currentSnapshot(); - Map> filesByPartition = Streams.stream(snapshot.addedDataFiles(table.io())) - .collect(Collectors.groupingBy(DataFile::partition)); + Map> filesByPartition = + Streams.stream(snapshot.addedDataFiles(table.io())) + .collect(Collectors.groupingBy(DataFile::partition)); Stream, Pair>> overlaps = - filesByPartition.entrySet().stream().flatMap(entry -> { - List datafiles = entry.getValue(); - Preconditions.checkArgument(datafiles.size() > 1, - "This test is checking for overlaps in a situation where no overlaps can actually occur because the " + - "partition %s does not contain multiple datafiles", entry.getKey()); - - List, Pair>> boundComparisons = Lists.cartesianProduct(datafiles, datafiles).stream() - .filter(tuple -> tuple.get(0) != tuple.get(1)) - .map(tuple -> Pair.of(boundsOf(tuple.get(0), field, javaClass), boundsOf(tuple.get(1), field, javaClass))) - .collect(Collectors.toList()); - - Comparator comparator = Comparators.forType(field.type().asPrimitiveType()); - - List, Pair>> overlappingFiles = boundComparisons.stream() - .filter(filePair -> { - Pair left = filePair.first(); - T lMin = left.first(); - T lMax = left.second(); - Pair right = filePair.second(); - T rMin = right.first(); - T rMax = right.second(); - boolean boundsDoNotOverlap = - // Min and Max of a range are greater than or equal to the max value of the other range - (comparator.compare(rMax, lMax) >= 0 && comparator.compare(rMin, lMax) >= 0) || - (comparator.compare(lMax, rMax) >= 0 && comparator.compare(lMin, rMax) >= 0); - - return !boundsDoNotOverlap; - }).collect(Collectors.toList()); - return overlappingFiles.stream(); - }); + filesByPartition.entrySet().stream() + .flatMap( + entry -> { + List datafiles = entry.getValue(); + Preconditions.checkArgument( + datafiles.size() > 1, + 
"This test is checking for overlaps in a situation where no overlaps can actually occur because the " + + "partition %s does not contain multiple datafiles", + entry.getKey()); + + List, Pair>> boundComparisons = + Lists.cartesianProduct(datafiles, datafiles).stream() + .filter(tuple -> tuple.get(0) != tuple.get(1)) + .map( + tuple -> + Pair.of( + boundsOf(tuple.get(0), field, javaClass), + boundsOf(tuple.get(1), field, javaClass))) + .collect(Collectors.toList()); + + Comparator comparator = Comparators.forType(field.type().asPrimitiveType()); + + List, Pair>> overlappingFiles = + boundComparisons.stream() + .filter( + filePair -> { + Pair left = filePair.first(); + T lMin = left.first(); + T lMax = left.second(); + Pair right = filePair.second(); + T rMin = right.first(); + T rMax = right.second(); + boolean boundsDoNotOverlap = + // Min and Max of a range are greater than or equal to the max + // value of the other range + (comparator.compare(rMax, lMax) >= 0 + && comparator.compare(rMin, lMax) >= 0) + || (comparator.compare(lMax, rMax) >= 0 + && comparator.compare(lMin, rMax) >= 0); + + return !boundsDoNotOverlap; + }) + .collect(Collectors.toList()); + return overlappingFiles.stream(); + }); return overlaps.collect(Collectors.toList()); } @@ -1359,13 +1457,17 @@ protected Table createTable() { PartitionSpec spec = PartitionSpec.unpartitioned(); Map options = Maps.newHashMap(); Table table = TABLES.create(SCHEMA, spec, options, tableLocation); - table.updateProperties().set(TableProperties.PARQUET_ROW_GROUP_SIZE_BYTES, Integer.toString(20 * 1024)).commit(); + table + .updateProperties() + .set(TableProperties.PARQUET_ROW_GROUP_SIZE_BYTES, Integer.toString(20 * 1024)) + .commit(); Assert.assertNull("Table must be empty", table.currentSnapshot()); return table; } /** * Create a table with a certain number of files, returns the size of a file + * * @param files number of files to create * @return the created table */ @@ -1375,12 +1477,9 @@ protected Table createTable(int files) { return table; } - protected Table createTablePartitioned(int partitions, int files, - int numRecords, Map options) { - PartitionSpec spec = PartitionSpec.builderFor(SCHEMA) - .identity("c1") - .truncate("c2", 2) - .build(); + protected Table createTablePartitioned( + int partitions, int files, int numRecords, Map options) { + PartitionSpec spec = PartitionSpec.builderFor(SCHEMA).identity("c1").truncate("c2", 2).build(); Table table = TABLES.create(SCHEMA, spec, options, tableLocation); Assert.assertNull("Table must be empty", table.currentSnapshot()); @@ -1393,21 +1492,23 @@ protected Table createTablePartitioned(int partitions, int files) { } private Table createTypeTestTable() { - Schema schema = new Schema( - required(1, "longCol", Types.LongType.get()), - required(2, "intCol", Types.IntegerType.get()), - required(3, "floatCol", Types.FloatType.get()), - optional(4, "doubleCol", Types.DoubleType.get()), - optional(5, "dateCol", Types.DateType.get()), - optional(6, "timestampCol", Types.TimestampType.withZone()), - optional(7, "stringCol", Types.StringType.get()), - optional(8, "booleanCol", Types.BooleanType.get()), - optional(9, "binaryCol", Types.BinaryType.get())); + Schema schema = + new Schema( + required(1, "longCol", Types.LongType.get()), + required(2, "intCol", Types.IntegerType.get()), + required(3, "floatCol", Types.FloatType.get()), + optional(4, "doubleCol", Types.DoubleType.get()), + optional(5, "dateCol", Types.DateType.get()), + optional(6, "timestampCol", Types.TimestampType.withZone()), + 
optional(7, "stringCol", Types.StringType.get()), + optional(8, "booleanCol", Types.BooleanType.get()), + optional(9, "binaryCol", Types.BinaryType.get())); Map options = Maps.newHashMap(); Table table = TABLES.create(schema, PartitionSpec.unpartitioned(), options, tableLocation); - spark.range(0, 10, 1, 10) + spark + .range(0, 10, 1, 10) .withColumnRenamed("id", "longCol") .withColumn("intCol", expr("CAST(longCol AS INT)")) .withColumn("floatCol", expr("CAST(longCol AS FLOAT)")) @@ -1427,7 +1528,11 @@ private Table createTypeTestTable() { protected int averageFileSize(Table table) { table.refresh(); - return (int) Streams.stream(table.newScan().planFiles()).mapToLong(FileScanTask::length).average().getAsDouble(); + return (int) + Streams.stream(table.newScan().planFiles()) + .mapToLong(FileScanTask::length) + .average() + .getAsDouble(); } private void writeRecords(int files, int numRecords) { @@ -1438,20 +1543,21 @@ private void writeRecords(int files, int numRecords, int partitions) { List records = Lists.newArrayList(); int rowDimension = (int) Math.ceil(Math.sqrt(numRecords)); List> data = - IntStream.range(0, rowDimension).boxed().flatMap(x -> - IntStream.range(0, rowDimension).boxed().map(y -> Pair.of(x, y))) + IntStream.range(0, rowDimension) + .boxed() + .flatMap(x -> IntStream.range(0, rowDimension).boxed().map(y -> Pair.of(x, y))) .collect(Collectors.toList()); Collections.shuffle(data, new Random(42)); if (partitions > 0) { - data.forEach(i -> records.add(new ThreeColumnRecord( - i.first() % partitions, - "foo" + i.first(), - "bar" + i.second()))); + data.forEach( + i -> + records.add( + new ThreeColumnRecord( + i.first() % partitions, "foo" + i.first(), "bar" + i.second()))); } else { - data.forEach(i -> records.add(new ThreeColumnRecord( - i.first(), - "foo" + i.first(), - "bar" + i.second()))); + data.forEach( + i -> + records.add(new ThreeColumnRecord(i.first(), "foo" + i.first(), "bar" + i.second()))); } Dataset df = spark.createDataFrame(records, ThreeColumnRecord.class).repartition(files); writeDF(df); @@ -1466,24 +1572,31 @@ private void writeDF(Dataset df) { .save(tableLocation); } - private List writePosDeletesToFile(Table table, DataFile dataFile, int outputDeleteFiles) { - return writePosDeletes(table, dataFile.partition(), dataFile.path().toString(), outputDeleteFiles); + private List writePosDeletesToFile( + Table table, DataFile dataFile, int outputDeleteFiles) { + return writePosDeletes( + table, dataFile.partition(), dataFile.path().toString(), outputDeleteFiles); } - private List writePosDeletes(Table table, StructLike partition, String path, int outputDeleteFiles) { + private List writePosDeletes( + Table table, StructLike partition, String path, int outputDeleteFiles) { List results = Lists.newArrayList(); int rowPosition = 0; for (int file = 0; file < outputDeleteFiles; file++) { - OutputFile outputFile = table.io().newOutputFile( - table.locationProvider().newDataLocation(UUID.randomUUID().toString())); - EncryptedOutputFile encryptedOutputFile = EncryptedFiles.encryptedOutput( - outputFile, EncryptionKeyMetadata.EMPTY); - - GenericAppenderFactory appenderFactory = new GenericAppenderFactory( - table.schema(), table.spec(), null, null, null); - PositionDeleteWriter posDeleteWriter = appenderFactory - .set(TableProperties.DEFAULT_WRITE_METRICS_MODE, "full") - .newPosDeleteWriter(encryptedOutputFile, FileFormat.PARQUET, partition); + OutputFile outputFile = + table + .io() + .newOutputFile( + 
table.locationProvider().newDataLocation(UUID.randomUUID().toString())); + EncryptedOutputFile encryptedOutputFile = + EncryptedFiles.encryptedOutput(outputFile, EncryptionKeyMetadata.EMPTY); + + GenericAppenderFactory appenderFactory = + new GenericAppenderFactory(table.schema(), table.spec(), null, null, null); + PositionDeleteWriter posDeleteWriter = + appenderFactory + .set(TableProperties.DEFAULT_WRITE_METRICS_MODE, "full") + .newPosDeleteWriter(encryptedOutputFile, FileFormat.PARQUET, partition); posDeleteWriter.delete(path, rowPosition); try { @@ -1511,7 +1624,7 @@ private Set cacheContents(Table table) { } private double percentFilesRequired(Table table, String col, String value) { - return percentFilesRequired(table, new String[]{col}, new String[]{value}); + return percentFilesRequired(table, new String[] {col}, new String[] {value}); } private double percentFilesRequired(Table table, String[] cols, String[] values) { diff --git a/spark/v3.3/spark/src/test/java/org/apache/iceberg/spark/actions/TestRewriteManifestsAction.java b/spark/v3.3/spark/src/test/java/org/apache/iceberg/spark/actions/TestRewriteManifestsAction.java index f30251e74001..4b50ea0c29f3 100644 --- a/spark/v3.3/spark/src/test/java/org/apache/iceberg/spark/actions/TestRewriteManifestsAction.java +++ b/spark/v3.3/spark/src/test/java/org/apache/iceberg/spark/actions/TestRewriteManifestsAction.java @@ -16,9 +16,13 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.actions; +import static org.apache.iceberg.types.Types.NestedField.optional; +import static org.mockito.Mockito.doAnswer; +import static org.mockito.Mockito.spy; +import static org.mockito.Mockito.when; + import java.io.File; import java.io.IOException; import java.util.List; @@ -53,28 +57,22 @@ import org.junit.runner.RunWith; import org.junit.runners.Parameterized; -import static org.apache.iceberg.types.Types.NestedField.optional; -import static org.mockito.Mockito.doAnswer; -import static org.mockito.Mockito.spy; -import static org.mockito.Mockito.when; - @RunWith(Parameterized.class) public class TestRewriteManifestsAction extends SparkTestBase { private static final HadoopTables TABLES = new HadoopTables(new Configuration()); - private static final Schema SCHEMA = new Schema( - optional(1, "c1", Types.IntegerType.get()), - optional(2, "c2", Types.StringType.get()), - optional(3, "c3", Types.StringType.get()) - ); + private static final Schema SCHEMA = + new Schema( + optional(1, "c1", Types.IntegerType.get()), + optional(2, "c2", Types.StringType.get()), + optional(3, "c3", Types.StringType.get())); @Parameterized.Parameters(name = "snapshotIdInheritanceEnabled = {0}") public static Object[] parameters() { - return new Object[] { "true", "false" }; + return new Object[] {"true", "false"}; } - @Rule - public TemporaryFolder temp = new TemporaryFolder(); + @Rule public TemporaryFolder temp = new TemporaryFolder(); private final String snapshotIdInheritanceEnabled; private String tableLocation = null; @@ -100,7 +98,8 @@ public void testRewriteManifestsEmptyTable() throws IOException { SparkActions actions = SparkActions.get(); - actions.rewriteManifests(table) + actions + .rewriteManifests(table) .rewriteIf(manifest -> true) .stagingLocation(temp.newFolder().toString()) .execute(); @@ -115,16 +114,15 @@ public void testRewriteSmallManifestsNonPartitionedTable() { options.put(TableProperties.SNAPSHOT_ID_INHERITANCE_ENABLED, snapshotIdInheritanceEnabled); Table table = 
TABLES.create(SCHEMA, spec, options, tableLocation); - List records1 = Lists.newArrayList( - new ThreeColumnRecord(1, null, "AAAA"), - new ThreeColumnRecord(1, "BBBBBBBBBB", "BBBB") - ); + List records1 = + Lists.newArrayList( + new ThreeColumnRecord(1, null, "AAAA"), new ThreeColumnRecord(1, "BBBBBBBBBB", "BBBB")); writeRecords(records1); - List records2 = Lists.newArrayList( - new ThreeColumnRecord(2, "CCCCCCCCCC", "CCCC"), - new ThreeColumnRecord(2, "DDDDDDDDDD", "DDDD") - ); + List records2 = + Lists.newArrayList( + new ThreeColumnRecord(2, "CCCCCCCCCC", "CCCC"), + new ThreeColumnRecord(2, "DDDDDDDDDD", "DDDD")); writeRecords(records2); table.refresh(); @@ -134,12 +132,13 @@ public void testRewriteSmallManifestsNonPartitionedTable() { SparkActions actions = SparkActions.get(); - RewriteManifests.Result result = actions.rewriteManifests(table) - .rewriteIf(manifest -> true) - .execute(); + RewriteManifests.Result result = + actions.rewriteManifests(table).rewriteIf(manifest -> true).execute(); - Assert.assertEquals("Action should rewrite 2 manifests", 2, Iterables.size(result.rewrittenManifests())); - Assert.assertEquals("Action should add 1 manifests", 1, Iterables.size(result.addedManifests())); + Assert.assertEquals( + "Action should rewrite 2 manifests", 2, Iterables.size(result.rewrittenManifests())); + Assert.assertEquals( + "Action should add 1 manifests", 1, Iterables.size(result.addedManifests())); table.refresh(); @@ -155,9 +154,8 @@ public void testRewriteSmallManifestsNonPartitionedTable() { expectedRecords.addAll(records2); Dataset resultDF = spark.read().format("iceberg").load(tableLocation); - List actualRecords = resultDF.sort("c1", "c2") - .as(Encoders.bean(ThreeColumnRecord.class)) - .collectAsList(); + List actualRecords = + resultDF.sort("c1", "c2").as(Encoders.bean(ThreeColumnRecord.class)).collectAsList(); Assert.assertEquals("Rows must match", expectedRecords, actualRecords); } @@ -169,16 +167,15 @@ public void testRewriteManifestsWithCommitStateUnknownException() { options.put(TableProperties.SNAPSHOT_ID_INHERITANCE_ENABLED, snapshotIdInheritanceEnabled); Table table = TABLES.create(SCHEMA, spec, options, tableLocation); - List records1 = Lists.newArrayList( - new ThreeColumnRecord(1, null, "AAAA"), - new ThreeColumnRecord(1, "BBBBBBBBBB", "BBBB") - ); + List records1 = + Lists.newArrayList( + new ThreeColumnRecord(1, null, "AAAA"), new ThreeColumnRecord(1, "BBBBBBBBBB", "BBBB")); writeRecords(records1); - List records2 = Lists.newArrayList( - new ThreeColumnRecord(2, "CCCCCCCCCC", "CCCC"), - new ThreeColumnRecord(2, "DDDDDDDDDD", "DDDD") - ); + List records2 = + Lists.newArrayList( + new ThreeColumnRecord(2, "CCCCCCCCCC", "CCCC"), + new ThreeColumnRecord(2, "DDDDDDDDDD", "DDDD")); writeRecords(records2); table.refresh(); @@ -191,15 +188,19 @@ public void testRewriteManifestsWithCommitStateUnknownException() { // create a spy which would throw a CommitStateUnknownException after successful commit. 
org.apache.iceberg.RewriteManifests newRewriteManifests = table.rewriteManifests(); org.apache.iceberg.RewriteManifests spyNewRewriteManifests = spy(newRewriteManifests); - doAnswer(invocation -> { - newRewriteManifests.commit(); - throw new CommitStateUnknownException(new RuntimeException("Datacenter on Fire")); - }).when(spyNewRewriteManifests).commit(); + doAnswer( + invocation -> { + newRewriteManifests.commit(); + throw new CommitStateUnknownException(new RuntimeException("Datacenter on Fire")); + }) + .when(spyNewRewriteManifests) + .commit(); Table spyTable = spy(table); when(spyTable.rewriteManifests()).thenReturn(spyNewRewriteManifests); - AssertHelpers.assertThrowsCause("Should throw a Commit State Unknown Exception", + AssertHelpers.assertThrowsCause( + "Should throw a Commit State Unknown Exception", RuntimeException.class, "Datacenter on Fire", () -> actions.rewriteManifests(spyTable).rewriteIf(manifest -> true).execute()); @@ -219,45 +220,40 @@ public void testRewriteManifestsWithCommitStateUnknownException() { expectedRecords.addAll(records2); Dataset resultDF = spark.read().format("iceberg").load(tableLocation); - List actualRecords = resultDF.sort("c1", "c2") - .as(Encoders.bean(ThreeColumnRecord.class)) - .collectAsList(); + List actualRecords = + resultDF.sort("c1", "c2").as(Encoders.bean(ThreeColumnRecord.class)).collectAsList(); Assert.assertEquals("Rows must match", expectedRecords, actualRecords); } @Test public void testRewriteSmallManifestsPartitionedTable() { - PartitionSpec spec = PartitionSpec.builderFor(SCHEMA) - .identity("c1") - .truncate("c2", 2) - .build(); + PartitionSpec spec = PartitionSpec.builderFor(SCHEMA).identity("c1").truncate("c2", 2).build(); Map options = Maps.newHashMap(); options.put(TableProperties.SNAPSHOT_ID_INHERITANCE_ENABLED, snapshotIdInheritanceEnabled); Table table = TABLES.create(SCHEMA, spec, options, tableLocation); - List records1 = Lists.newArrayList( - new ThreeColumnRecord(1, null, "AAAA"), - new ThreeColumnRecord(1, "BBBBBBBBBB", "BBBB") - ); + List records1 = + Lists.newArrayList( + new ThreeColumnRecord(1, null, "AAAA"), new ThreeColumnRecord(1, "BBBBBBBBBB", "BBBB")); writeRecords(records1); - List records2 = Lists.newArrayList( - new ThreeColumnRecord(2, "CCCCCCCCCC", "CCCC"), - new ThreeColumnRecord(2, "DDDDDDDDDD", "DDDD") - ); + List records2 = + Lists.newArrayList( + new ThreeColumnRecord(2, "CCCCCCCCCC", "CCCC"), + new ThreeColumnRecord(2, "DDDDDDDDDD", "DDDD")); writeRecords(records2); - List records3 = Lists.newArrayList( - new ThreeColumnRecord(3, "EEEEEEEEEE", "EEEE"), - new ThreeColumnRecord(3, "FFFFFFFFFF", "FFFF") - ); + List records3 = + Lists.newArrayList( + new ThreeColumnRecord(3, "EEEEEEEEEE", "EEEE"), + new ThreeColumnRecord(3, "FFFFFFFFFF", "FFFF")); writeRecords(records3); - List records4 = Lists.newArrayList( - new ThreeColumnRecord(4, "GGGGGGGGGG", "GGGG"), - new ThreeColumnRecord(4, "HHHHHHHHHG", "HHHH") - ); + List records4 = + Lists.newArrayList( + new ThreeColumnRecord(4, "GGGGGGGGGG", "GGGG"), + new ThreeColumnRecord(4, "HHHHHHHHHG", "HHHH")); writeRecords(records4); table.refresh(); @@ -271,16 +267,18 @@ public void testRewriteSmallManifestsPartitionedTable() { long manifestEntrySizeBytes = computeManifestEntrySizeBytes(manifests); long targetManifestSizeBytes = (long) (1.05 * 4 * manifestEntrySizeBytes); - table.updateProperties() + table + .updateProperties() .set(TableProperties.MANIFEST_TARGET_SIZE_BYTES, String.valueOf(targetManifestSizeBytes)) .commit(); - RewriteManifests.Result result = 
actions.rewriteManifests(table) - .rewriteIf(manifest -> true) - .execute(); + RewriteManifests.Result result = + actions.rewriteManifests(table).rewriteIf(manifest -> true).execute(); - Assert.assertEquals("Action should rewrite 4 manifests", 4, Iterables.size(result.rewrittenManifests())); - Assert.assertEquals("Action should add 2 manifests", 2, Iterables.size(result.addedManifests())); + Assert.assertEquals( + "Action should rewrite 4 manifests", 4, Iterables.size(result.rewrittenManifests())); + Assert.assertEquals( + "Action should add 2 manifests", 2, Iterables.size(result.addedManifests())); table.refresh(); @@ -302,32 +300,29 @@ public void testRewriteSmallManifestsPartitionedTable() { expectedRecords.addAll(records4); Dataset resultDF = spark.read().format("iceberg").load(tableLocation); - List actualRecords = resultDF.sort("c1", "c2") - .as(Encoders.bean(ThreeColumnRecord.class)) - .collectAsList(); + List actualRecords = + resultDF.sort("c1", "c2").as(Encoders.bean(ThreeColumnRecord.class)).collectAsList(); Assert.assertEquals("Rows must match", expectedRecords, actualRecords); } @Test public void testRewriteImportedManifests() throws IOException { - PartitionSpec spec = PartitionSpec.builderFor(SCHEMA) - .identity("c3") - .build(); + PartitionSpec spec = PartitionSpec.builderFor(SCHEMA).identity("c3").build(); Map options = Maps.newHashMap(); options.put(TableProperties.SNAPSHOT_ID_INHERITANCE_ENABLED, snapshotIdInheritanceEnabled); Table table = TABLES.create(SCHEMA, spec, options, tableLocation); - List records = Lists.newArrayList( - new ThreeColumnRecord(1, null, "AAAA"), - new ThreeColumnRecord(1, "BBBBBBBBBB", "BBBB") - ); + List records = + Lists.newArrayList( + new ThreeColumnRecord(1, null, "AAAA"), new ThreeColumnRecord(1, "BBBBBBBBBB", "BBBB")); File parquetTableDir = temp.newFolder("parquet_table"); String parquetTableLocation = parquetTableDir.toURI().toString(); try { Dataset inputDF = spark.createDataFrame(records, ThreeColumnRecord.class); - inputDF.select("c1", "c2", "c3") + inputDF + .select("c1", "c2", "c3") .write() .format("parquet") .mode("overwrite") @@ -336,20 +331,26 @@ public void testRewriteImportedManifests() throws IOException { .saveAsTable("parquet_table"); File stagingDir = temp.newFolder("staging-dir"); - SparkTableUtil.importSparkTable(spark, new TableIdentifier("parquet_table"), table, stagingDir.toString()); + SparkTableUtil.importSparkTable( + spark, new TableIdentifier("parquet_table"), table, stagingDir.toString()); Snapshot snapshot = table.currentSnapshot(); SparkActions actions = SparkActions.get(); - RewriteManifests.Result result = actions.rewriteManifests(table) - .rewriteIf(manifest -> true) - .stagingLocation(temp.newFolder().toString()) - .execute(); + RewriteManifests.Result result = + actions + .rewriteManifests(table) + .rewriteIf(manifest -> true) + .stagingLocation(temp.newFolder().toString()) + .execute(); - Assert.assertEquals("Action should rewrite all manifests", - snapshot.allManifests(table.io()), result.rewrittenManifests()); - Assert.assertEquals("Action should add 1 manifest", 1, Iterables.size(result.addedManifests())); + Assert.assertEquals( + "Action should rewrite all manifests", + snapshot.allManifests(table.io()), + result.rewrittenManifests()); + Assert.assertEquals( + "Action should add 1 manifest", 1, Iterables.size(result.addedManifests())); } finally { spark.sql("DROP TABLE parquet_table"); @@ -358,9 +359,7 @@ public void testRewriteImportedManifests() throws IOException { @Test public void 
testRewriteLargeManifestsPartitionedTable() throws IOException { - PartitionSpec spec = PartitionSpec.builderFor(SCHEMA) - .identity("c3") - .build(); + PartitionSpec spec = PartitionSpec.builderFor(SCHEMA).identity("c3").build(); Map options = Maps.newHashMap(); options.put(TableProperties.SNAPSHOT_ID_INHERITANCE_ENABLED, snapshotIdInheritanceEnabled); Table table = TABLES.create(SCHEMA, spec, options, tableLocation); @@ -380,19 +379,26 @@ public void testRewriteLargeManifestsPartitionedTable() throws IOException { Assert.assertEquals("Should have 1 manifests before rewrite", 1, manifests.size()); // set the target manifest size to a small value to force splitting records into multiple files - table.updateProperties() - .set(TableProperties.MANIFEST_TARGET_SIZE_BYTES, String.valueOf(manifests.get(0).length() / 2)) + table + .updateProperties() + .set( + TableProperties.MANIFEST_TARGET_SIZE_BYTES, + String.valueOf(manifests.get(0).length() / 2)) .commit(); SparkActions actions = SparkActions.get(); - RewriteManifests.Result result = actions.rewriteManifests(table) - .rewriteIf(manifest -> true) - .stagingLocation(temp.newFolder().toString()) - .execute(); + RewriteManifests.Result result = + actions + .rewriteManifests(table) + .rewriteIf(manifest -> true) + .stagingLocation(temp.newFolder().toString()) + .execute(); - Assert.assertEquals("Action should rewrite 1 manifest", 1, Iterables.size(result.rewrittenManifests())); - Assert.assertEquals("Action should add 2 manifests", 2, Iterables.size(result.addedManifests())); + Assert.assertEquals( + "Action should rewrite 1 manifest", 1, Iterables.size(result.rewrittenManifests())); + Assert.assertEquals( + "Action should add 2 manifests", 2, Iterables.size(result.addedManifests())); table.refresh(); @@ -400,33 +406,28 @@ public void testRewriteLargeManifestsPartitionedTable() throws IOException { Assert.assertEquals("Should have 2 manifests after rewrite", 2, newManifests.size()); Dataset resultDF = spark.read().format("iceberg").load(tableLocation); - List actualRecords = resultDF.sort("c1", "c2") - .as(Encoders.bean(ThreeColumnRecord.class)) - .collectAsList(); + List actualRecords = + resultDF.sort("c1", "c2").as(Encoders.bean(ThreeColumnRecord.class)).collectAsList(); Assert.assertEquals("Rows must match", records, actualRecords); } @Test public void testRewriteManifestsWithPredicate() throws IOException { - PartitionSpec spec = PartitionSpec.builderFor(SCHEMA) - .identity("c1") - .truncate("c2", 2) - .build(); + PartitionSpec spec = PartitionSpec.builderFor(SCHEMA).identity("c1").truncate("c2", 2).build(); Map options = Maps.newHashMap(); options.put(TableProperties.SNAPSHOT_ID_INHERITANCE_ENABLED, snapshotIdInheritanceEnabled); Table table = TABLES.create(SCHEMA, spec, options, tableLocation); - List records1 = Lists.newArrayList( - new ThreeColumnRecord(1, null, "AAAA"), - new ThreeColumnRecord(1, "BBBBBBBBBB", "BBBB") - ); + List records1 = + Lists.newArrayList( + new ThreeColumnRecord(1, null, "AAAA"), new ThreeColumnRecord(1, "BBBBBBBBBB", "BBBB")); writeRecords(records1); - List records2 = Lists.newArrayList( - new ThreeColumnRecord(2, "CCCCCCCCCC", "CCCC"), - new ThreeColumnRecord(2, "DDDDDDDDDD", "DDDD") - ); + List records2 = + Lists.newArrayList( + new ThreeColumnRecord(2, "CCCCCCCCCC", "CCCC"), + new ThreeColumnRecord(2, "DDDDDDDDDD", "DDDD")); writeRecords(records2); table.refresh(); @@ -437,14 +438,18 @@ public void testRewriteManifestsWithPredicate() throws IOException { SparkActions actions = SparkActions.get(); // rewrite 
only the first manifest without caching - RewriteManifests.Result result = actions.rewriteManifests(table) - .rewriteIf(manifest -> manifest.path().equals(manifests.get(0).path())) - .stagingLocation(temp.newFolder().toString()) - .option("use-caching", "false") - .execute(); - - Assert.assertEquals("Action should rewrite 1 manifest", 1, Iterables.size(result.rewrittenManifests())); - Assert.assertEquals("Action should add 1 manifests", 1, Iterables.size(result.addedManifests())); + RewriteManifests.Result result = + actions + .rewriteManifests(table) + .rewriteIf(manifest -> manifest.path().equals(manifests.get(0).path())) + .stagingLocation(temp.newFolder().toString()) + .option("use-caching", "false") + .execute(); + + Assert.assertEquals( + "Action should rewrite 1 manifest", 1, Iterables.size(result.rewrittenManifests())); + Assert.assertEquals( + "Action should add 1 manifests", 1, Iterables.size(result.addedManifests())); table.refresh(); @@ -452,16 +457,16 @@ public void testRewriteManifestsWithPredicate() throws IOException { Assert.assertEquals("Should have 2 manifests after rewrite", 2, newManifests.size()); Assert.assertFalse("First manifest must be rewritten", newManifests.contains(manifests.get(0))); - Assert.assertTrue("Second manifest must not be rewritten", newManifests.contains(manifests.get(1))); + Assert.assertTrue( + "Second manifest must not be rewritten", newManifests.contains(manifests.get(1))); List expectedRecords = Lists.newArrayList(); expectedRecords.addAll(records1); expectedRecords.addAll(records2); Dataset resultDF = spark.read().format("iceberg").load(tableLocation); - List actualRecords = resultDF.sort("c1", "c2") - .as(Encoders.bean(ThreeColumnRecord.class)) - .collectAsList(); + List actualRecords = + resultDF.sort("c1", "c2").as(Encoders.bean(ThreeColumnRecord.class)).collectAsList(); Assert.assertEquals("Rows must match", expectedRecords, actualRecords); } @@ -472,11 +477,7 @@ private void writeRecords(List records) { } private void writeDF(Dataset df) { - df.select("c1", "c2", "c3") - .write() - .format("iceberg") - .mode("append") - .save(tableLocation); + df.select("c1", "c2", "c3").write().format("iceberg").mode("append").save(tableLocation); } private long computeManifestEntrySizeBytes(List manifests) { @@ -485,7 +486,8 @@ private long computeManifestEntrySizeBytes(List manifests) { for (ManifestFile manifest : manifests) { totalSize += manifest.length(); - numEntries += manifest.addedFilesCount() + manifest.existingFilesCount() + manifest.deletedFilesCount(); + numEntries += + manifest.addedFilesCount() + manifest.existingFilesCount() + manifest.deletedFilesCount(); } return totalSize / numEntries; diff --git a/spark/v3.3/spark/src/test/java/org/apache/iceberg/spark/data/AvroDataTest.java b/spark/v3.3/spark/src/test/java/org/apache/iceberg/spark/data/AvroDataTest.java index a8556de6840d..5fd137c5361d 100644 --- a/spark/v3.3/spark/src/test/java/org/apache/iceberg/spark/data/AvroDataTest.java +++ b/spark/v3.3/spark/src/test/java/org/apache/iceberg/spark/data/AvroDataTest.java @@ -16,9 +16,11 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.spark.data; +import static org.apache.iceberg.types.Types.NestedField.optional; +import static org.apache.iceberg.types.Types.NestedField.required; + import java.io.IOException; import java.util.Map; import java.util.concurrent.atomic.AtomicInteger; @@ -38,35 +40,32 @@ import org.junit.Test; import org.junit.rules.TemporaryFolder; -import static org.apache.iceberg.types.Types.NestedField.optional; -import static org.apache.iceberg.types.Types.NestedField.required; - public abstract class AvroDataTest { protected abstract void writeAndValidate(Schema schema) throws IOException; - protected static final StructType SUPPORTED_PRIMITIVES = StructType.of( - required(100, "id", LongType.get()), - optional(101, "data", Types.StringType.get()), - required(102, "b", Types.BooleanType.get()), - optional(103, "i", Types.IntegerType.get()), - required(104, "l", LongType.get()), - optional(105, "f", Types.FloatType.get()), - required(106, "d", Types.DoubleType.get()), - optional(107, "date", Types.DateType.get()), - required(108, "ts", Types.TimestampType.withZone()), - required(110, "s", Types.StringType.get()), - // required(111, "uuid", Types.UUIDType.get()), - required(112, "fixed", Types.FixedType.ofLength(7)), - optional(113, "bytes", Types.BinaryType.get()), - required(114, "dec_9_0", Types.DecimalType.of(9, 0)), // int encoded - required(115, "dec_11_2", Types.DecimalType.of(11, 2)), // long encoded - required(116, "dec_20_5", Types.DecimalType.of(20, 5)), // requires padding - required(117, "dec_38_10", Types.DecimalType.of(38, 10)) // Spark's maximum precision - ); - - @Rule - public TemporaryFolder temp = new TemporaryFolder(); + protected static final StructType SUPPORTED_PRIMITIVES = + StructType.of( + required(100, "id", LongType.get()), + optional(101, "data", Types.StringType.get()), + required(102, "b", Types.BooleanType.get()), + optional(103, "i", Types.IntegerType.get()), + required(104, "l", LongType.get()), + optional(105, "f", Types.FloatType.get()), + required(106, "d", Types.DoubleType.get()), + optional(107, "date", Types.DateType.get()), + required(108, "ts", Types.TimestampType.withZone()), + required(110, "s", Types.StringType.get()), + // required(111, "uuid", Types.UUIDType.get()), + required(112, "fixed", Types.FixedType.ofLength(7)), + optional(113, "bytes", Types.BinaryType.get()), + required(114, "dec_9_0", Types.DecimalType.of(9, 0)), // int encoded + required(115, "dec_11_2", Types.DecimalType.of(11, 2)), // long encoded + required(116, "dec_20_5", Types.DecimalType.of(20, 5)), // requires padding + required(117, "dec_38_10", Types.DecimalType.of(38, 10)) // Spark's maximum precision + ); + + @Rule public TemporaryFolder temp = new TemporaryFolder(); @Test public void testSimpleStruct() throws IOException { @@ -75,162 +74,208 @@ public void testSimpleStruct() throws IOException { @Test public void testStructWithRequiredFields() throws IOException { - writeAndValidate(TypeUtil.assignIncreasingFreshIds(new Schema( - Lists.transform(SUPPORTED_PRIMITIVES.fields(), Types.NestedField::asRequired)))); + writeAndValidate( + TypeUtil.assignIncreasingFreshIds( + new Schema( + Lists.transform(SUPPORTED_PRIMITIVES.fields(), Types.NestedField::asRequired)))); } @Test public void testStructWithOptionalFields() throws IOException { - writeAndValidate(TypeUtil.assignIncreasingFreshIds(new Schema( - Lists.transform(SUPPORTED_PRIMITIVES.fields(), Types.NestedField::asOptional)))); + writeAndValidate( + TypeUtil.assignIncreasingFreshIds( + new Schema( + 
Lists.transform(SUPPORTED_PRIMITIVES.fields(), Types.NestedField::asOptional)))); } @Test public void testNestedStruct() throws IOException { - writeAndValidate(TypeUtil.assignIncreasingFreshIds(new Schema(required(1, "struct", SUPPORTED_PRIMITIVES)))); + writeAndValidate( + TypeUtil.assignIncreasingFreshIds(new Schema(required(1, "struct", SUPPORTED_PRIMITIVES)))); } @Test public void testArray() throws IOException { - Schema schema = new Schema( - required(0, "id", LongType.get()), - optional(1, "data", ListType.ofOptional(2, Types.StringType.get()))); + Schema schema = + new Schema( + required(0, "id", LongType.get()), + optional(1, "data", ListType.ofOptional(2, Types.StringType.get()))); writeAndValidate(schema); } @Test public void testArrayOfStructs() throws IOException { - Schema schema = TypeUtil.assignIncreasingFreshIds(new Schema( - required(0, "id", LongType.get()), - optional(1, "data", ListType.ofOptional(2, SUPPORTED_PRIMITIVES)))); + Schema schema = + TypeUtil.assignIncreasingFreshIds( + new Schema( + required(0, "id", LongType.get()), + optional(1, "data", ListType.ofOptional(2, SUPPORTED_PRIMITIVES)))); writeAndValidate(schema); } @Test public void testMap() throws IOException { - Schema schema = new Schema( - required(0, "id", LongType.get()), - optional(1, "data", MapType.ofOptional(2, 3, - Types.StringType.get(), - Types.StringType.get()))); + Schema schema = + new Schema( + required(0, "id", LongType.get()), + optional( + 1, + "data", + MapType.ofOptional(2, 3, Types.StringType.get(), Types.StringType.get()))); writeAndValidate(schema); } @Test public void testNumericMapKey() throws IOException { - Schema schema = new Schema( - required(0, "id", LongType.get()), - optional(1, "data", MapType.ofOptional(2, 3, - Types.LongType.get(), - Types.StringType.get()))); + Schema schema = + new Schema( + required(0, "id", LongType.get()), + optional( + 1, "data", MapType.ofOptional(2, 3, Types.LongType.get(), Types.StringType.get()))); writeAndValidate(schema); } @Test public void testComplexMapKey() throws IOException { - Schema schema = new Schema( - required(0, "id", LongType.get()), - optional(1, "data", MapType.ofOptional(2, 3, - Types.StructType.of( - required(4, "i", Types.IntegerType.get()), - optional(5, "s", Types.StringType.get())), - Types.StringType.get()))); + Schema schema = + new Schema( + required(0, "id", LongType.get()), + optional( + 1, + "data", + MapType.ofOptional( + 2, + 3, + Types.StructType.of( + required(4, "i", Types.IntegerType.get()), + optional(5, "s", Types.StringType.get())), + Types.StringType.get()))); writeAndValidate(schema); } @Test public void testMapOfStructs() throws IOException { - Schema schema = TypeUtil.assignIncreasingFreshIds(new Schema( - required(0, "id", LongType.get()), - optional(1, "data", MapType.ofOptional(2, 3, - Types.StringType.get(), - SUPPORTED_PRIMITIVES)))); + Schema schema = + TypeUtil.assignIncreasingFreshIds( + new Schema( + required(0, "id", LongType.get()), + optional( + 1, + "data", + MapType.ofOptional(2, 3, Types.StringType.get(), SUPPORTED_PRIMITIVES)))); writeAndValidate(schema); } @Test public void testMixedTypes() throws IOException { - StructType structType = StructType.of( - required(0, "id", LongType.get()), - optional(1, "list_of_maps", - ListType.ofOptional(2, MapType.ofOptional(3, 4, - Types.StringType.get(), - SUPPORTED_PRIMITIVES))), - optional(5, "map_of_lists", - MapType.ofOptional(6, 7, - Types.StringType.get(), - ListType.ofOptional(8, SUPPORTED_PRIMITIVES))), - required(9, "list_of_lists", - 
ListType.ofOptional(10, ListType.ofOptional(11, SUPPORTED_PRIMITIVES))), - required(12, "map_of_maps", - MapType.ofOptional(13, 14, - Types.StringType.get(), - MapType.ofOptional(15, 16, + StructType structType = + StructType.of( + required(0, "id", LongType.get()), + optional( + 1, + "list_of_maps", + ListType.ofOptional( + 2, MapType.ofOptional(3, 4, Types.StringType.get(), SUPPORTED_PRIMITIVES))), + optional( + 5, + "map_of_lists", + MapType.ofOptional( + 6, 7, Types.StringType.get(), ListType.ofOptional(8, SUPPORTED_PRIMITIVES))), + required( + 9, + "list_of_lists", + ListType.ofOptional(10, ListType.ofOptional(11, SUPPORTED_PRIMITIVES))), + required( + 12, + "map_of_maps", + MapType.ofOptional( + 13, + 14, Types.StringType.get(), - SUPPORTED_PRIMITIVES))), - required(17, "list_of_struct_of_nested_types", ListType.ofOptional(19, StructType.of( - Types.NestedField.required(20, "m1", MapType.ofOptional(21, 22, - Types.StringType.get(), - SUPPORTED_PRIMITIVES)), - Types.NestedField.optional(23, "l1", ListType.ofRequired(24, SUPPORTED_PRIMITIVES)), - Types.NestedField.required(25, "l2", ListType.ofRequired(26, SUPPORTED_PRIMITIVES)), - Types.NestedField.optional(27, "m2", MapType.ofOptional(28, 29, - Types.StringType.get(), - SUPPORTED_PRIMITIVES)) - ))) - ); - - Schema schema = new Schema(TypeUtil.assignFreshIds(structType, new AtomicInteger(0)::incrementAndGet) - .asStructType().fields()); + MapType.ofOptional(15, 16, Types.StringType.get(), SUPPORTED_PRIMITIVES))), + required( + 17, + "list_of_struct_of_nested_types", + ListType.ofOptional( + 19, + StructType.of( + Types.NestedField.required( + 20, + "m1", + MapType.ofOptional( + 21, 22, Types.StringType.get(), SUPPORTED_PRIMITIVES)), + Types.NestedField.optional( + 23, "l1", ListType.ofRequired(24, SUPPORTED_PRIMITIVES)), + Types.NestedField.required( + 25, "l2", ListType.ofRequired(26, SUPPORTED_PRIMITIVES)), + Types.NestedField.optional( + 27, + "m2", + MapType.ofOptional( + 28, 29, Types.StringType.get(), SUPPORTED_PRIMITIVES)))))); + + Schema schema = + new Schema( + TypeUtil.assignFreshIds(structType, new AtomicInteger(0)::incrementAndGet) + .asStructType() + .fields()); writeAndValidate(schema); } @Test public void testTimestampWithoutZone() throws IOException { - withSQLConf(ImmutableMap.of(SparkSQLProperties.HANDLE_TIMESTAMP_WITHOUT_TIMEZONE, "true"), () -> { - Schema schema = TypeUtil.assignIncreasingFreshIds(new Schema( - required(0, "id", LongType.get()), - optional(1, "ts_without_zone", Types.TimestampType.withoutZone()))); - - writeAndValidate(schema); - }); + withSQLConf( + ImmutableMap.of(SparkSQLProperties.HANDLE_TIMESTAMP_WITHOUT_TIMEZONE, "true"), + () -> { + Schema schema = + TypeUtil.assignIncreasingFreshIds( + new Schema( + required(0, "id", LongType.get()), + optional(1, "ts_without_zone", Types.TimestampType.withoutZone()))); + + writeAndValidate(schema); + }); } protected void withSQLConf(Map conf, Action action) throws IOException { SQLConf sqlConf = SQLConf.get(); Map currentConfValues = Maps.newHashMap(); - conf.keySet().forEach(confKey -> { - if (sqlConf.contains(confKey)) { - String currentConfValue = sqlConf.getConfString(confKey); - currentConfValues.put(confKey, currentConfValue); - } - }); - - conf.forEach((confKey, confValue) -> { - if (SQLConf.isStaticConfigKey(confKey)) { - throw new RuntimeException("Cannot modify the value of a static config: " + confKey); - } - sqlConf.setConfString(confKey, confValue); - }); + conf.keySet() + .forEach( + confKey -> { + if (sqlConf.contains(confKey)) { + String 
currentConfValue = sqlConf.getConfString(confKey); + currentConfValues.put(confKey, currentConfValue); + } + }); + + conf.forEach( + (confKey, confValue) -> { + if (SQLConf.isStaticConfigKey(confKey)) { + throw new RuntimeException("Cannot modify the value of a static config: " + confKey); + } + sqlConf.setConfString(confKey, confValue); + }); try { action.invoke(); } finally { - conf.forEach((confKey, confValue) -> { - if (currentConfValues.containsKey(confKey)) { - sqlConf.setConfString(confKey, currentConfValues.get(confKey)); - } else { - sqlConf.unsetConf(confKey); - } - }); + conf.forEach( + (confKey, confValue) -> { + if (currentConfValues.containsKey(confKey)) { + sqlConf.setConfString(confKey, currentConfValues.get(confKey)); + } else { + sqlConf.unsetConf(confKey); + } + }); } } diff --git a/spark/v3.3/spark/src/test/java/org/apache/iceberg/spark/data/GenericsHelpers.java b/spark/v3.3/spark/src/test/java/org/apache/iceberg/spark/data/GenericsHelpers.java index 46c95cef112d..a96e3b1f57f5 100644 --- a/spark/v3.3/spark/src/test/java/org/apache/iceberg/spark/data/GenericsHelpers.java +++ b/spark/v3.3/spark/src/test/java/org/apache/iceberg/spark/data/GenericsHelpers.java @@ -16,9 +16,12 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.data; +import static org.apache.iceberg.spark.SparkSchemaUtil.convert; +import static scala.collection.JavaConverters.mapAsJavaMapConverter; +import static scala.collection.JavaConverters.seqAsJavaListConverter; + import java.math.BigDecimal; import java.nio.ByteBuffer; import java.sql.Timestamp; @@ -48,13 +51,8 @@ import org.junit.Assert; import scala.collection.Seq; -import static org.apache.iceberg.spark.SparkSchemaUtil.convert; -import static scala.collection.JavaConverters.mapAsJavaMapConverter; -import static scala.collection.JavaConverters.seqAsJavaListConverter; - public class GenericsHelpers { - private GenericsHelpers() { - } + private GenericsHelpers() {} private static final OffsetDateTime EPOCH = Instant.ofEpochMilli(0L).atOffset(ZoneOffset.UTC); private static final LocalDate EPOCH_DAY = EPOCH.toLocalDate(); @@ -71,7 +69,8 @@ public static void assertEqualsSafe(Types.StructType struct, Record expected, Ro } } - private static void assertEqualsSafe(Types.ListType list, Collection expected, List actual) { + private static void assertEqualsSafe( + Types.ListType list, Collection expected, List actual) { Type elementType = list.elementType(); List expectedElements = Lists.newArrayList(expected); for (int i = 0; i < expectedElements.size(); i += 1) { @@ -82,11 +81,11 @@ private static void assertEqualsSafe(Types.ListType list, Collection expected } } - private static void assertEqualsSafe(Types.MapType map, - Map expected, Map actual) { + private static void assertEqualsSafe(Types.MapType map, Map expected, Map actual) { Type keyType = map.keyType(); Type valueType = map.valueType(); - Assert.assertEquals("Should have the same number of keys", expected.keySet().size(), actual.keySet().size()); + Assert.assertEquals( + "Should have the same number of keys", expected.keySet().size(), actual.keySet().size()); for (Object expectedKey : expected.keySet()) { Object matchingKey = null; @@ -120,22 +119,29 @@ private static void assertEqualsSafe(Type type, Object expected, Object actual) Assert.assertEquals("Primitive value should be equal to expected", expected, actual); break; case DATE: - Assertions.assertThat(expected).as("Should expect a LocalDate").isInstanceOf(LocalDate.class); 
+ Assertions.assertThat(expected) + .as("Should expect a LocalDate") + .isInstanceOf(LocalDate.class); Assertions.assertThat(actual).as("Should be a Date").isInstanceOf(Date.class); - Assert.assertEquals("ISO-8601 date should be equal", expected.toString(), actual.toString()); + Assert.assertEquals( + "ISO-8601 date should be equal", expected.toString(), actual.toString()); break; case TIMESTAMP: Assertions.assertThat(actual).as("Should be a Timestamp").isInstanceOf(Timestamp.class); Timestamp ts = (Timestamp) actual; // milliseconds from nanos has already been added by getTime - OffsetDateTime actualTs = EPOCH.plusNanos( - (ts.getTime() * 1_000_000) + (ts.getNanos() % 1_000_000)); + OffsetDateTime actualTs = + EPOCH.plusNanos((ts.getTime() * 1_000_000) + (ts.getNanos() % 1_000_000)); Types.TimestampType timestampType = (Types.TimestampType) type; if (timestampType.shouldAdjustToUTC()) { - Assertions.assertThat(expected).as("Should expect an OffsetDateTime").isInstanceOf(OffsetDateTime.class); + Assertions.assertThat(expected) + .as("Should expect an OffsetDateTime") + .isInstanceOf(OffsetDateTime.class); Assert.assertEquals("Timestamp should be equal", expected, actualTs); } else { - Assertions.assertThat(expected).as("Should expect an LocalDateTime").isInstanceOf(LocalDateTime.class); + Assertions.assertThat(expected) + .as("Should expect an LocalDateTime") + .isInstanceOf(LocalDateTime.class); Assert.assertEquals("Timestamp should be equal", expected, actualTs.toLocalDateTime()); } break; @@ -146,23 +152,25 @@ private static void assertEqualsSafe(Type type, Object expected, Object actual) case UUID: Assertions.assertThat(expected).as("Should expect a UUID").isInstanceOf(UUID.class); Assertions.assertThat(actual).as("Should be a String").isInstanceOf(String.class); - Assert.assertEquals("UUID string representation should match", - expected.toString(), actual); + Assert.assertEquals("UUID string representation should match", expected.toString(), actual); break; case FIXED: Assertions.assertThat(expected).as("Should expect a byte[]").isInstanceOf(byte[].class); Assertions.assertThat(actual).as("Should be a byte[]").isInstanceOf(byte[].class); - Assert.assertArrayEquals("Bytes should match", - (byte[]) expected, (byte[]) actual); + Assert.assertArrayEquals("Bytes should match", (byte[]) expected, (byte[]) actual); break; case BINARY: - Assertions.assertThat(expected).as("Should expect a ByteBuffer").isInstanceOf(ByteBuffer.class); + Assertions.assertThat(expected) + .as("Should expect a ByteBuffer") + .isInstanceOf(ByteBuffer.class); Assertions.assertThat(actual).as("Should be a byte[]").isInstanceOf(byte[].class); - Assert.assertArrayEquals("Bytes should match", - ((ByteBuffer) expected).array(), (byte[]) actual); + Assert.assertArrayEquals( + "Bytes should match", ((ByteBuffer) expected).array(), (byte[]) actual); break; case DECIMAL: - Assertions.assertThat(expected).as("Should expect a BigDecimal").isInstanceOf(BigDecimal.class); + Assertions.assertThat(expected) + .as("Should expect a BigDecimal") + .isInstanceOf(BigDecimal.class); Assertions.assertThat(actual).as("Should be a BigDecimal").isInstanceOf(BigDecimal.class); Assert.assertEquals("BigDecimals should be equal", expected, actual); break; @@ -172,16 +180,20 @@ private static void assertEqualsSafe(Type type, Object expected, Object actual) assertEqualsSafe(type.asNestedType().asStructType(), (Record) expected, (Row) actual); break; case LIST: - Assertions.assertThat(expected).as("Should expect a 
Collection").isInstanceOf(Collection.class); + Assertions.assertThat(expected) + .as("Should expect a Collection") + .isInstanceOf(Collection.class); Assertions.assertThat(actual).as("Should be a Seq").isInstanceOf(Seq.class); List asList = seqAsJavaListConverter((Seq) actual).asJava(); assertEqualsSafe(type.asNestedType().asListType(), (Collection) expected, asList); break; case MAP: Assertions.assertThat(expected).as("Should expect a Collection").isInstanceOf(Map.class); - Assertions.assertThat(actual).as("Should be a Map").isInstanceOf(scala.collection.Map.class); - Map asMap = mapAsJavaMapConverter( - (scala.collection.Map) actual).asJava(); + Assertions.assertThat(actual) + .as("Should be a Map") + .isInstanceOf(scala.collection.Map.class); + Map asMap = + mapAsJavaMapConverter((scala.collection.Map) actual).asJava(); assertEqualsSafe(type.asNestedType().asMapType(), (Map) expected, asMap); break; case TIME: @@ -190,7 +202,8 @@ private static void assertEqualsSafe(Type type, Object expected, Object actual) } } - public static void assertEqualsUnsafe(Types.StructType struct, Record expected, InternalRow actual) { + public static void assertEqualsUnsafe( + Types.StructType struct, Record expected, InternalRow actual) { List fields = struct.fields(); for (int i = 0; i < fields.size(); i += 1) { Type fieldType = fields.get(i).type(); @@ -202,7 +215,8 @@ public static void assertEqualsUnsafe(Types.StructType struct, Record expected, } } - private static void assertEqualsUnsafe(Types.ListType list, Collection expected, ArrayData actual) { + private static void assertEqualsUnsafe( + Types.ListType list, Collection expected, ArrayData actual) { Type elementType = list.elementType(); List expectedElements = Lists.newArrayList(expected); for (int i = 0; i < expectedElements.size(); i += 1) { @@ -245,20 +259,29 @@ private static void assertEqualsUnsafe(Type type, Object expected, Object actual Assert.assertEquals("Primitive value should be equal to expected", expected, actual); break; case DATE: - Assertions.assertThat(expected).as("Should expect a LocalDate").isInstanceOf(LocalDate.class); + Assertions.assertThat(expected) + .as("Should expect a LocalDate") + .isInstanceOf(LocalDate.class); int expectedDays = (int) ChronoUnit.DAYS.between(EPOCH_DAY, (LocalDate) expected); Assert.assertEquals("Primitive value should be equal to expected", expectedDays, actual); break; case TIMESTAMP: Types.TimestampType timestampType = (Types.TimestampType) type; if (timestampType.shouldAdjustToUTC()) { - Assertions.assertThat(expected).as("Should expect an OffsetDateTime").isInstanceOf(OffsetDateTime.class); + Assertions.assertThat(expected) + .as("Should expect an OffsetDateTime") + .isInstanceOf(OffsetDateTime.class); long expectedMicros = ChronoUnit.MICROS.between(EPOCH, (OffsetDateTime) expected); - Assert.assertEquals("Primitive value should be equal to expected", expectedMicros, actual); + Assert.assertEquals( + "Primitive value should be equal to expected", expectedMicros, actual); } else { - Assertions.assertThat(expected).as("Should expect an LocalDateTime").isInstanceOf(LocalDateTime.class); - long expectedMicros = ChronoUnit.MICROS.between(EPOCH, ((LocalDateTime) expected).atZone(ZoneId.of("UTC"))); - Assert.assertEquals("Primitive value should be equal to expected", expectedMicros, actual); + Assertions.assertThat(expected) + .as("Should expect an LocalDateTime") + .isInstanceOf(LocalDateTime.class); + long expectedMicros = + ChronoUnit.MICROS.between(EPOCH, ((LocalDateTime) 
expected).atZone(ZoneId.of("UTC"))); + Assert.assertEquals( + "Primitive value should be equal to expected", expectedMicros, actual); } break; case STRING: @@ -268,8 +291,8 @@ private static void assertEqualsUnsafe(Type type, Object expected, Object actual case UUID: Assertions.assertThat(expected).as("Should expect a UUID").isInstanceOf(UUID.class); Assertions.assertThat(actual).as("Should be a UTF8String").isInstanceOf(UTF8String.class); - Assert.assertEquals("UUID string representation should match", - expected.toString(), actual.toString()); + Assert.assertEquals( + "UUID string representation should match", expected.toString(), actual.toString()); break; case FIXED: Assertions.assertThat(expected).as("Should expect a byte[]").isInstanceOf(byte[].class); @@ -277,30 +300,42 @@ private static void assertEqualsUnsafe(Type type, Object expected, Object actual Assert.assertArrayEquals("Bytes should match", (byte[]) expected, (byte[]) actual); break; case BINARY: - Assertions.assertThat(expected).as("Should expect a ByteBuffer").isInstanceOf(ByteBuffer.class); + Assertions.assertThat(expected) + .as("Should expect a ByteBuffer") + .isInstanceOf(ByteBuffer.class); Assertions.assertThat(actual).as("Should be a byte[]").isInstanceOf(byte[].class); - Assert.assertArrayEquals("Bytes should match", - ((ByteBuffer) expected).array(), (byte[]) actual); + Assert.assertArrayEquals( + "Bytes should match", ((ByteBuffer) expected).array(), (byte[]) actual); break; case DECIMAL: - Assertions.assertThat(expected).as("Should expect a BigDecimal").isInstanceOf(BigDecimal.class); + Assertions.assertThat(expected) + .as("Should expect a BigDecimal") + .isInstanceOf(BigDecimal.class); Assertions.assertThat(actual).as("Should be a Decimal").isInstanceOf(Decimal.class); - Assert.assertEquals("BigDecimals should be equal", - expected, ((Decimal) actual).toJavaBigDecimal()); + Assert.assertEquals( + "BigDecimals should be equal", expected, ((Decimal) actual).toJavaBigDecimal()); break; case STRUCT: Assertions.assertThat(expected).as("Should expect a Record").isInstanceOf(Record.class); - Assertions.assertThat(actual).as("Should be an InternalRow").isInstanceOf(InternalRow.class); - assertEqualsUnsafe(type.asNestedType().asStructType(), (Record) expected, (InternalRow) actual); + Assertions.assertThat(actual) + .as("Should be an InternalRow") + .isInstanceOf(InternalRow.class); + assertEqualsUnsafe( + type.asNestedType().asStructType(), (Record) expected, (InternalRow) actual); break; case LIST: - Assertions.assertThat(expected).as("Should expect a Collection").isInstanceOf(Collection.class); + Assertions.assertThat(expected) + .as("Should expect a Collection") + .isInstanceOf(Collection.class); Assertions.assertThat(actual).as("Should be an ArrayData").isInstanceOf(ArrayData.class); - assertEqualsUnsafe(type.asNestedType().asListType(), (Collection) expected, (ArrayData) actual); + assertEqualsUnsafe( + type.asNestedType().asListType(), (Collection) expected, (ArrayData) actual); break; case MAP: Assertions.assertThat(expected).as("Should expect a Map").isInstanceOf(Map.class); - Assertions.assertThat(actual).as("Should be an ArrayBasedMapData").isInstanceOf(MapData.class); + Assertions.assertThat(actual) + .as("Should be an ArrayBasedMapData") + .isInstanceOf(MapData.class); assertEqualsUnsafe(type.asNestedType().asMapType(), (Map) expected, (MapData) actual); break; case TIME: diff --git a/spark/v3.3/spark/src/test/java/org/apache/iceberg/spark/data/RandomData.java 
b/spark/v3.3/spark/src/test/java/org/apache/iceberg/spark/data/RandomData.java index d3bffb75eb5c..1c95df8ced12 100644 --- a/spark/v3.3/spark/src/test/java/org/apache/iceberg/spark/data/RandomData.java +++ b/spark/v3.3/spark/src/test/java/org/apache/iceberg/spark/data/RandomData.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.data; import java.math.BigDecimal; @@ -53,8 +52,7 @@ public class RandomData { // Default percentage of number of values that are null for optional fields public static final float DEFAULT_NULL_PERCENTAGE = 0.05f; - private RandomData() { - } + private RandomData() {} public static List generateList(Schema schema, int numRecords, long seed) { RandomDataGenerator generator = new RandomDataGenerator(schema, seed, DEFAULT_NULL_PERCENTAGE); @@ -67,63 +65,71 @@ public static List generateList(Schema schema, int numRecords, long seed } public static Iterable generateSpark(Schema schema, int numRecords, long seed) { - return () -> new Iterator() { - private SparkRandomDataGenerator generator = new SparkRandomDataGenerator(seed); - private int count = 0; - - @Override - public boolean hasNext() { - return count < numRecords; - } - - @Override - public InternalRow next() { - if (count >= numRecords) { - throw new NoSuchElementException(); - } - count += 1; - return (InternalRow) TypeUtil.visit(schema, generator); - } - }; + return () -> + new Iterator() { + private SparkRandomDataGenerator generator = new SparkRandomDataGenerator(seed); + private int count = 0; + + @Override + public boolean hasNext() { + return count < numRecords; + } + + @Override + public InternalRow next() { + if (count >= numRecords) { + throw new NoSuchElementException(); + } + count += 1; + return (InternalRow) TypeUtil.visit(schema, generator); + } + }; } public static Iterable generate(Schema schema, int numRecords, long seed) { - return newIterable(() -> new RandomDataGenerator(schema, seed, DEFAULT_NULL_PERCENTAGE), schema, numRecords); + return newIterable( + () -> new RandomDataGenerator(schema, seed, DEFAULT_NULL_PERCENTAGE), schema, numRecords); } - public static Iterable generate(Schema schema, int numRecords, long seed, float nullPercentage) { - return newIterable(() -> new RandomDataGenerator(schema, seed, nullPercentage), schema, numRecords); + public static Iterable generate( + Schema schema, int numRecords, long seed, float nullPercentage) { + return newIterable( + () -> new RandomDataGenerator(schema, seed, nullPercentage), schema, numRecords); } - public static Iterable generateFallbackData(Schema schema, int numRecords, long seed, long numDictRecords) { - return newIterable(() -> new FallbackDataGenerator(schema, seed, numDictRecords), schema, numRecords); + public static Iterable generateFallbackData( + Schema schema, int numRecords, long seed, long numDictRecords) { + return newIterable( + () -> new FallbackDataGenerator(schema, seed, numDictRecords), schema, numRecords); } public static Iterable generateDictionaryEncodableData( Schema schema, int numRecords, long seed, float nullPercentage) { - return newIterable(() -> new DictionaryEncodedDataGenerator(schema, seed, nullPercentage), schema, numRecords); + return newIterable( + () -> new DictionaryEncodedDataGenerator(schema, seed, nullPercentage), schema, numRecords); } - private static Iterable newIterable(Supplier newGenerator, - Schema schema, int numRecords) { - return () -> new Iterator() { - private int count = 0; - private 
RandomDataGenerator generator = newGenerator.get(); - - @Override - public boolean hasNext() { - return count < numRecords; - } - - @Override - public Record next() { - if (count >= numRecords) { - throw new NoSuchElementException(); - } - count += 1; - return (Record) TypeUtil.visit(schema, generator); - } - }; + private static Iterable newIterable( + Supplier newGenerator, Schema schema, int numRecords) { + return () -> + new Iterator() { + private int count = 0; + private RandomDataGenerator generator = newGenerator.get(); + + @Override + public boolean hasNext() { + return count < numRecords; + } + + @Override + public Record next() { + if (count >= numRecords) { + throw new NoSuchElementException(); + } + count += 1; + return (Record) TypeUtil.visit(schema, generator); + } + }; } private static class RandomDataGenerator extends TypeUtil.CustomOrderSchemaVisitor { @@ -218,8 +224,7 @@ public Object primitive(Type.PrimitiveType primitive) { // them here. switch (primitive.typeId()) { case FIXED: - return new GenericData.Fixed(typeToSchema.get(primitive), - (byte[]) result); + return new GenericData.Fixed(typeToSchema.get(primitive), (byte[]) result); case BINARY: return ByteBuffer.wrap((byte[]) result); case UUID: diff --git a/spark/v3.3/spark/src/test/java/org/apache/iceberg/spark/data/TestHelpers.java b/spark/v3.3/spark/src/test/java/org/apache/iceberg/spark/data/TestHelpers.java index 4aed78d1e155..69b14eead4d5 100644 --- a/spark/v3.3/spark/src/test/java/org/apache/iceberg/spark/data/TestHelpers.java +++ b/spark/v3.3/spark/src/test/java/org/apache/iceberg/spark/data/TestHelpers.java @@ -16,9 +16,12 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.data; +import static org.apache.iceberg.spark.SparkSchemaUtil.convert; +import static scala.collection.JavaConverters.mapAsJavaMapConverter; +import static scala.collection.JavaConverters.seqAsJavaListConverter; + import java.math.BigDecimal; import java.nio.ByteBuffer; import java.sql.Timestamp; @@ -71,17 +74,13 @@ import org.junit.Assert; import scala.collection.Seq; -import static org.apache.iceberg.spark.SparkSchemaUtil.convert; -import static scala.collection.JavaConverters.mapAsJavaMapConverter; -import static scala.collection.JavaConverters.seqAsJavaListConverter; - public class TestHelpers { - private TestHelpers() { - } + private TestHelpers() {} public static void assertEqualsSafe(Types.StructType struct, List recs, List rows) { - Streams.forEachPair(recs.stream(), rows.stream(), (rec, row) -> assertEqualsSafe(struct, rec, row)); + Streams.forEachPair( + recs.stream(), rows.stream(), (rec, row) -> assertEqualsSafe(struct, rec, row)); } public static void assertEqualsSafe(Types.StructType struct, Record rec, Row row) { @@ -96,8 +95,11 @@ public static void assertEqualsSafe(Types.StructType struct, Record rec, Row row } } - public static void assertEqualsBatch(Types.StructType struct, Iterator expected, ColumnarBatch batch, - boolean checkArrowValidityVector) { + public static void assertEqualsBatch( + Types.StructType struct, + Iterator expected, + ColumnarBatch batch, + boolean checkArrowValidityVector) { for (int rowId = 0; rowId < batch.numRows(); rowId++) { List fields = struct.fields(); InternalRow row = batch.getRow(rowId); @@ -110,15 +112,16 @@ public static void assertEqualsBatch(Types.StructType struct, Iterator e if (checkArrowValidityVector) { ColumnVector columnVector = batch.column(i); - ValueVector arrowVector = ((IcebergArrowColumnVector) 
columnVector).vectorAccessor().getVector(); - Assert.assertFalse("Nullability doesn't match of " + columnVector.dataType(), + ValueVector arrowVector = + ((IcebergArrowColumnVector) columnVector).vectorAccessor().getVector(); + Assert.assertFalse( + "Nullability doesn't match of " + columnVector.dataType(), expectedValue == null ^ arrowVector.isNull(rowId)); } } } } - private static void assertEqualsSafe(Types.ListType list, Collection expected, List actual) { Type elementType = list.elementType(); List expectedElements = Lists.newArrayList(expected); @@ -130,8 +133,7 @@ private static void assertEqualsSafe(Types.ListType list, Collection expected } } - private static void assertEqualsSafe(Types.MapType map, - Map expected, Map actual) { + private static void assertEqualsSafe(Types.MapType map, Map expected, Map actual) { Type keyType = map.keyType(); Type valueType = map.valueType(); @@ -190,23 +192,28 @@ private static void assertEqualsSafe(Type type, Object expected, Object actual) case UUID: Assertions.assertThat(expected).as("Should expect a UUID").isInstanceOf(UUID.class); Assertions.assertThat(actual).as("Should be a String").isInstanceOf(String.class); - Assert.assertEquals("UUID string representation should match", - expected.toString(), actual); + Assert.assertEquals("UUID string representation should match", expected.toString(), actual); break; case FIXED: - Assertions.assertThat(expected).as("Should expect a Fixed").isInstanceOf(GenericData.Fixed.class); + Assertions.assertThat(expected) + .as("Should expect a Fixed") + .isInstanceOf(GenericData.Fixed.class); Assertions.assertThat(actual).as("Should be a byte[]").isInstanceOf(byte[].class); - Assert.assertArrayEquals("Bytes should match", - ((GenericData.Fixed) expected).bytes(), (byte[]) actual); + Assert.assertArrayEquals( + "Bytes should match", ((GenericData.Fixed) expected).bytes(), (byte[]) actual); break; case BINARY: - Assertions.assertThat(expected).as("Should expect a ByteBuffer").isInstanceOf(ByteBuffer.class); + Assertions.assertThat(expected) + .as("Should expect a ByteBuffer") + .isInstanceOf(ByteBuffer.class); Assertions.assertThat(actual).as("Should be a byte[]").isInstanceOf(byte[].class); - Assert.assertArrayEquals("Bytes should match", - ((ByteBuffer) expected).array(), (byte[]) actual); + Assert.assertArrayEquals( + "Bytes should match", ((ByteBuffer) expected).array(), (byte[]) actual); break; case DECIMAL: - Assertions.assertThat(expected).as("Should expect a BigDecimal").isInstanceOf(BigDecimal.class); + Assertions.assertThat(expected) + .as("Should expect a BigDecimal") + .isInstanceOf(BigDecimal.class); Assertions.assertThat(actual).as("Should be a BigDecimal").isInstanceOf(BigDecimal.class); Assert.assertEquals("BigDecimals should be equal", expected, actual); break; @@ -216,16 +223,20 @@ private static void assertEqualsSafe(Type type, Object expected, Object actual) assertEqualsSafe(type.asNestedType().asStructType(), (Record) expected, (Row) actual); break; case LIST: - Assertions.assertThat(expected).as("Should expect a Collection").isInstanceOf(Collection.class); + Assertions.assertThat(expected) + .as("Should expect a Collection") + .isInstanceOf(Collection.class); Assertions.assertThat(actual).as("Should be a Seq").isInstanceOf(Seq.class); List asList = seqAsJavaListConverter((Seq) actual).asJava(); assertEqualsSafe(type.asNestedType().asListType(), (Collection) expected, asList); break; case MAP: Assertions.assertThat(expected).as("Should expect a Collection").isInstanceOf(Map.class); - 
Assertions.assertThat(actual).as("Should be a Map").isInstanceOf(scala.collection.Map.class); - Map asMap = mapAsJavaMapConverter( - (scala.collection.Map) actual).asJava(); + Assertions.assertThat(actual) + .as("Should be a Map") + .isInstanceOf(scala.collection.Map.class); + Map asMap = + mapAsJavaMapConverter((scala.collection.Map) actual).asJava(); assertEqualsSafe(type.asNestedType().asMapType(), (Map) expected, asMap); break; case TIME: @@ -246,7 +257,8 @@ public static void assertEqualsUnsafe(Types.StructType struct, Record rec, Inter } } - private static void assertEqualsUnsafe(Types.ListType list, Collection expected, ArrayData actual) { + private static void assertEqualsUnsafe( + Types.ListType list, Collection expected, ArrayData actual) { Type elementType = list.elementType(); List expectedElements = Lists.newArrayList(expected); for (int i = 0; i < expectedElements.size(); i += 1) { @@ -292,8 +304,10 @@ private static void assertEqualsUnsafe(Type type, Object expected, Object actual case DOUBLE: Assertions.assertThat(actual).as("Should be a double").isInstanceOf(Double.class); if (expected instanceof Float) { - Assert.assertEquals("Values didn't match", Double.doubleToLongBits(((Number) expected).doubleValue()), - Double.doubleToLongBits((double) actual)); + Assert.assertEquals( + "Values didn't match", + Double.doubleToLongBits(((Number) expected).doubleValue()), + Double.doubleToLongBits((double) actual)); } else { Assert.assertEquals("Primitive value should be equal to expected", expected, actual); } @@ -312,40 +326,54 @@ private static void assertEqualsUnsafe(Type type, Object expected, Object actual case UUID: Assertions.assertThat(expected).as("Should expect a UUID").isInstanceOf(UUID.class); Assertions.assertThat(actual).as("Should be a UTF8String").isInstanceOf(UTF8String.class); - Assert.assertEquals("UUID string representation should match", - expected.toString(), actual.toString()); + Assert.assertEquals( + "UUID string representation should match", expected.toString(), actual.toString()); break; case FIXED: - Assertions.assertThat(expected).as("Should expect a Fixed").isInstanceOf(GenericData.Fixed.class); + Assertions.assertThat(expected) + .as("Should expect a Fixed") + .isInstanceOf(GenericData.Fixed.class); Assertions.assertThat(actual).as("Should be a byte[]").isInstanceOf(byte[].class); - Assert.assertArrayEquals("Bytes should match", - ((GenericData.Fixed) expected).bytes(), (byte[]) actual); + Assert.assertArrayEquals( + "Bytes should match", ((GenericData.Fixed) expected).bytes(), (byte[]) actual); break; case BINARY: - Assertions.assertThat(expected).as("Should expect a ByteBuffer").isInstanceOf(ByteBuffer.class); + Assertions.assertThat(expected) + .as("Should expect a ByteBuffer") + .isInstanceOf(ByteBuffer.class); Assertions.assertThat(actual).as("Should be a byte[]").isInstanceOf(byte[].class); - Assert.assertArrayEquals("Bytes should match", - ((ByteBuffer) expected).array(), (byte[]) actual); + Assert.assertArrayEquals( + "Bytes should match", ((ByteBuffer) expected).array(), (byte[]) actual); break; case DECIMAL: - Assertions.assertThat(expected).as("Should expect a BigDecimal").isInstanceOf(BigDecimal.class); + Assertions.assertThat(expected) + .as("Should expect a BigDecimal") + .isInstanceOf(BigDecimal.class); Assertions.assertThat(actual).as("Should be a Decimal").isInstanceOf(Decimal.class); - Assert.assertEquals("BigDecimals should be equal", - expected, ((Decimal) actual).toJavaBigDecimal()); + Assert.assertEquals( + "BigDecimals should be 
equal", expected, ((Decimal) actual).toJavaBigDecimal()); break; case STRUCT: Assertions.assertThat(expected).as("Should expect a Record").isInstanceOf(Record.class); - Assertions.assertThat(actual).as("Should be an InternalRow").isInstanceOf(InternalRow.class); - assertEqualsUnsafe(type.asNestedType().asStructType(), (Record) expected, (InternalRow) actual); + Assertions.assertThat(actual) + .as("Should be an InternalRow") + .isInstanceOf(InternalRow.class); + assertEqualsUnsafe( + type.asNestedType().asStructType(), (Record) expected, (InternalRow) actual); break; case LIST: - Assertions.assertThat(expected).as("Should expect a Collection").isInstanceOf(Collection.class); + Assertions.assertThat(expected) + .as("Should expect a Collection") + .isInstanceOf(Collection.class); Assertions.assertThat(actual).as("Should be an ArrayData").isInstanceOf(ArrayData.class); - assertEqualsUnsafe(type.asNestedType().asListType(), (Collection) expected, (ArrayData) actual); + assertEqualsUnsafe( + type.asNestedType().asListType(), (Collection) expected, (ArrayData) actual); break; case MAP: Assertions.assertThat(expected).as("Should expect a Map").isInstanceOf(Map.class); - Assertions.assertThat(actual).as("Should be an ArrayBasedMapData").isInstanceOf(MapData.class); + Assertions.assertThat(actual) + .as("Should be an ArrayBasedMapData") + .isInstanceOf(MapData.class); assertEqualsUnsafe(type.asNestedType().asMapType(), (Map) expected, (MapData) actual); break; case TIME: @@ -356,13 +384,14 @@ private static void assertEqualsUnsafe(Type type, Object expected, Object actual /** * Check that the given InternalRow is equivalent to the Row. + * * @param prefix context for error messages * @param type the type of the row * @param expected the expected value of the row * @param actual the actual value of the row */ - public static void assertEquals(String prefix, Types.StructType type, - InternalRow expected, Row actual) { + public static void assertEquals( + String prefix, Types.StructType type, InternalRow expected, Row actual) { if (expected == null || actual == null) { Assert.assertEquals(prefix, expected, actual); } else { @@ -380,30 +409,41 @@ public static void assertEquals(String prefix, Types.StructType type, case DECIMAL: case DATE: case TIMESTAMP: - Assert.assertEquals(prefix + "." + fieldName + " - " + childType, + Assert.assertEquals( + prefix + "." + fieldName + " - " + childType, getValue(expected, c, childType), getPrimitiveValue(actual, c, childType)); break; case UUID: case FIXED: case BINARY: - assertEqualBytes(prefix + "." + fieldName, + assertEqualBytes( + prefix + "." + fieldName, (byte[]) getValue(expected, c, childType), (byte[]) actual.get(c)); break; - case STRUCT: { - Types.StructType st = (Types.StructType) childType; - assertEquals(prefix + "." + fieldName, st, - expected.getStruct(c, st.fields().size()), actual.getStruct(c)); - break; - } + case STRUCT: + { + Types.StructType st = (Types.StructType) childType; + assertEquals( + prefix + "." + fieldName, + st, + expected.getStruct(c, st.fields().size()), + actual.getStruct(c)); + break; + } case LIST: - assertEqualsLists(prefix + "." + fieldName, childType.asListType(), + assertEqualsLists( + prefix + "." + fieldName, + childType.asListType(), expected.getArray(c), toList((Seq) actual.get(c))); break; case MAP: - assertEqualsMaps(prefix + "." + fieldName, childType.asMapType(), expected.getMap(c), + assertEqualsMaps( + prefix + "." 
+ fieldName, + childType.asMapType(), + expected.getMap(c), toJavaMap((scala.collection.Map) actual.getMap(c))); break; default: @@ -413,8 +453,8 @@ public static void assertEquals(String prefix, Types.StructType type, } } - private static void assertEqualsLists(String prefix, Types.ListType type, - ArrayData expected, List actual) { + private static void assertEqualsLists( + String prefix, Types.ListType type, ArrayData expected, List actual) { if (expected == null || actual == null) { Assert.assertEquals(prefix, expected, actual); } else { @@ -431,31 +471,42 @@ private static void assertEqualsLists(String prefix, Types.ListType type, case DECIMAL: case DATE: case TIMESTAMP: - Assert.assertEquals(prefix + ".elem " + e + " - " + childType, + Assert.assertEquals( + prefix + ".elem " + e + " - " + childType, getValue(expected, e, childType), actual.get(e)); break; case UUID: case FIXED: case BINARY: - assertEqualBytes(prefix + ".elem " + e, + assertEqualBytes( + prefix + ".elem " + e, (byte[]) getValue(expected, e, childType), (byte[]) actual.get(e)); break; - case STRUCT: { - Types.StructType st = (Types.StructType) childType; - assertEquals(prefix + ".elem " + e, st, - expected.getStruct(e, st.fields().size()), (Row) actual.get(e)); - break; - } + case STRUCT: + { + Types.StructType st = (Types.StructType) childType; + assertEquals( + prefix + ".elem " + e, + st, + expected.getStruct(e, st.fields().size()), + (Row) actual.get(e)); + break; + } case LIST: - assertEqualsLists(prefix + ".elem " + e, childType.asListType(), + assertEqualsLists( + prefix + ".elem " + e, + childType.asListType(), expected.getArray(e), toList((Seq) actual.get(e))); break; case MAP: - assertEqualsMaps(prefix + ".elem " + e, childType.asMapType(), - expected.getMap(e), toJavaMap((scala.collection.Map) actual.get(e))); + assertEqualsMaps( + prefix + ".elem " + e, + childType.asMapType(), + expected.getMap(e), + toJavaMap((scala.collection.Map) actual.get(e))); break; default: throw new IllegalArgumentException("Unhandled type " + childType); @@ -464,8 +515,8 @@ private static void assertEqualsLists(String prefix, Types.ListType type, } } - private static void assertEqualsMaps(String prefix, Types.MapType type, - MapData expected, Map actual) { + private static void assertEqualsMaps( + String prefix, Types.MapType type, MapData expected, Map actual) { if (expected == null || actual == null) { Assert.assertEquals(prefix, expected, actual); } else { @@ -478,7 +529,9 @@ private static void assertEqualsMaps(String prefix, Types.MapType type, Object expectedKey = getValue(expectedKeyArray, e, keyType); Object actualValue = actual.get(expectedKey); if (actualValue == null) { - Assert.assertEquals(prefix + ".key=" + expectedKey + " has null", true, + Assert.assertEquals( + prefix + ".key=" + expectedKey + " has null", + true, expected.valueArray().isNullAt(e)); } else { switch (valueType.typeId()) { @@ -491,32 +544,40 @@ private static void assertEqualsMaps(String prefix, Types.MapType type, case DECIMAL: case DATE: case TIMESTAMP: - Assert.assertEquals(prefix + ".key=" + expectedKey + " - " + valueType, + Assert.assertEquals( + prefix + ".key=" + expectedKey + " - " + valueType, getValue(expectedValueArray, e, valueType), actual.get(expectedKey)); break; case UUID: case FIXED: case BINARY: - assertEqualBytes(prefix + ".key=" + expectedKey, + assertEqualBytes( + prefix + ".key=" + expectedKey, (byte[]) getValue(expectedValueArray, e, valueType), (byte[]) actual.get(expectedKey)); break; - case STRUCT: { - Types.StructType 
st = (Types.StructType) valueType; - assertEquals(prefix + ".key=" + expectedKey, st, - expectedValueArray.getStruct(e, st.fields().size()), - (Row) actual.get(expectedKey)); - break; - } + case STRUCT: + { + Types.StructType st = (Types.StructType) valueType; + assertEquals( + prefix + ".key=" + expectedKey, + st, + expectedValueArray.getStruct(e, st.fields().size()), + (Row) actual.get(expectedKey)); + break; + } case LIST: - assertEqualsLists(prefix + ".key=" + expectedKey, + assertEqualsLists( + prefix + ".key=" + expectedKey, valueType.asListType(), expectedValueArray.getArray(e), toList((Seq) actual.get(expectedKey))); break; case MAP: - assertEqualsMaps(prefix + ".key=" + expectedKey, valueType.asMapType(), + assertEqualsMaps( + prefix + ".key=" + expectedKey, + valueType.asMapType(), expectedValueArray.getMap(e), toJavaMap((scala.collection.Map) actual.get(expectedKey))); break; @@ -528,8 +589,7 @@ private static void assertEqualsMaps(String prefix, Types.MapType type, } } - private static Object getValue(SpecializedGetters container, int ord, - Type type) { + private static Object getValue(SpecializedGetters container, int ord, Type type) { if (container.isNullAt(ord)) { return null; } @@ -554,10 +614,11 @@ private static Object getValue(SpecializedGetters container, int ord, return new DateWritable(container.getInt(ord)).get(); case TIMESTAMP: return DateTimeUtils.toJavaTimestamp(container.getLong(ord)); - case DECIMAL: { - Types.DecimalType dt = (Types.DecimalType) type; - return container.getDecimal(ord, dt.precision(), dt.scale()).toJavaBigDecimal(); - } + case DECIMAL: + { + Types.DecimalType dt = (Types.DecimalType) type; + return container.getDecimal(ord, dt.precision(), dt.scale()).toJavaBigDecimal(); + } case STRUCT: Types.StructType struct = type.asStructType(); InternalRow internalRow = container.getStruct(ord, struct.fields().size()); @@ -615,8 +676,7 @@ private static List toList(Seq val) { return val == null ? 
null : seqAsJavaListConverter(val).asJava(); } - private static void assertEqualBytes(String context, byte[] expected, - byte[] actual) { + private static void assertEqualBytes(String context, byte[] expected, byte[] actual) { if (expected == null || actual == null) { Assert.assertEquals(context, expected, actual); } else { @@ -634,23 +694,29 @@ private static void assertEquals(String context, DataType type, Object expected, } if (type instanceof StructType) { - Assertions.assertThat(expected).as("Expected should be an InternalRow: " + context) + Assertions.assertThat(expected) + .as("Expected should be an InternalRow: " + context) .isInstanceOf(InternalRow.class); - Assertions.assertThat(actual).as("Actual should be an InternalRow: " + context) + Assertions.assertThat(actual) + .as("Actual should be an InternalRow: " + context) .isInstanceOf(InternalRow.class); assertEquals(context, (StructType) type, (InternalRow) expected, (InternalRow) actual); } else if (type instanceof ArrayType) { - Assertions.assertThat(expected).as("Expected should be an ArrayData: " + context) + Assertions.assertThat(expected) + .as("Expected should be an ArrayData: " + context) .isInstanceOf(ArrayData.class); - Assertions.assertThat(actual).as("Actual should be an ArrayData: " + context) + Assertions.assertThat(actual) + .as("Actual should be an ArrayData: " + context) .isInstanceOf(ArrayData.class); assertEquals(context, (ArrayType) type, (ArrayData) expected, (ArrayData) actual); } else if (type instanceof MapType) { - Assertions.assertThat(expected).as("Expected should be a MapData: " + context) + Assertions.assertThat(expected) + .as("Expected should be a MapData: " + context) .isInstanceOf(MapData.class); - Assertions.assertThat(actual).as("Actual should be a MapData: " + context) + Assertions.assertThat(actual) + .as("Actual should be a MapData: " + context) .isInstanceOf(MapData.class); assertEquals(context, (MapType) type, (MapData) expected, (MapData) actual); @@ -661,32 +727,37 @@ private static void assertEquals(String context, DataType type, Object expected, } } - private static void assertEquals(String context, StructType struct, - InternalRow expected, InternalRow actual) { + private static void assertEquals( + String context, StructType struct, InternalRow expected, InternalRow actual) { Assert.assertEquals("Should have correct number of fields", struct.size(), actual.numFields()); for (int i = 0; i < actual.numFields(); i += 1) { StructField field = struct.fields()[i]; DataType type = field.dataType(); - assertEquals(context + "." + field.name(), type, + assertEquals( + context + "." + field.name(), + type, expected.isNullAt(i) ? null : expected.get(i, type), actual.isNullAt(i) ? null : actual.get(i, type)); } } - private static void assertEquals(String context, ArrayType array, ArrayData expected, ArrayData actual) { - Assert.assertEquals("Should have the same number of elements", - expected.numElements(), actual.numElements()); + private static void assertEquals( + String context, ArrayType array, ArrayData expected, ArrayData actual) { + Assert.assertEquals( + "Should have the same number of elements", expected.numElements(), actual.numElements()); DataType type = array.elementType(); for (int i = 0; i < actual.numElements(); i += 1) { - assertEquals(context + ".element", type, + assertEquals( + context + ".element", + type, expected.isNullAt(i) ? null : expected.get(i, type), actual.isNullAt(i) ? 
null : actual.get(i, type)); } } private static void assertEquals(String context, MapType map, MapData expected, MapData actual) { - Assert.assertEquals("Should have the same number of elements", - expected.numElements(), actual.numElements()); + Assert.assertEquals( + "Should have the same number of elements", expected.numElements(), actual.numElements()); DataType keyType = map.keyType(); ArrayData expectedKeys = expected.keyArray(); @@ -697,10 +768,14 @@ private static void assertEquals(String context, MapType map, MapData expected, ArrayData actualValues = actual.valueArray(); for (int i = 0; i < actual.numElements(); i += 1) { - assertEquals(context + ".key", keyType, + assertEquals( + context + ".key", + keyType, expectedKeys.isNullAt(i) ? null : expectedKeys.get(i, keyType), actualKeys.isNullAt(i) ? null : actualKeys.get(i, keyType)); - assertEquals(context + ".value", valueType, + assertEquals( + context + ".value", + valueType, expectedValues.isNullAt(i) ? null : expectedValues.get(i, valueType), actualValues.isNullAt(i) ? null : actualValues.get(i, valueType)); } diff --git a/spark/v3.3/spark/src/test/java/org/apache/iceberg/spark/data/TestOrcWrite.java b/spark/v3.3/spark/src/test/java/org/apache/iceberg/spark/data/TestOrcWrite.java index 7cf9b9c736c6..1e51a088390e 100644 --- a/spark/v3.3/spark/src/test/java/org/apache/iceberg/spark/data/TestOrcWrite.java +++ b/spark/v3.3/spark/src/test/java/org/apache/iceberg/spark/data/TestOrcWrite.java @@ -16,9 +16,10 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.data; +import static org.apache.iceberg.types.Types.NestedField.optional; + import java.io.File; import java.io.IOException; import org.apache.iceberg.Files; @@ -32,16 +33,12 @@ import org.junit.Test; import org.junit.rules.TemporaryFolder; -import static org.apache.iceberg.types.Types.NestedField.optional; - public class TestOrcWrite { - @Rule - public TemporaryFolder temp = new TemporaryFolder(); + @Rule public TemporaryFolder temp = new TemporaryFolder(); - private static final Schema SCHEMA = new Schema( - optional(1, "id", Types.IntegerType.get()), - optional(2, "data", Types.StringType.get()) - ); + private static final Schema SCHEMA = + new Schema( + optional(1, "id", Types.IntegerType.get()), optional(2, "data", Types.StringType.get())); @Test public void splitOffsets() throws IOException { @@ -49,10 +46,11 @@ public void splitOffsets() throws IOException { Assert.assertTrue("Delete should succeed", testFile.delete()); Iterable rows = RandomData.generateSpark(SCHEMA, 1, 0L); - FileAppender writer = ORC.write(Files.localOutput(testFile)) - .createWriterFunc(SparkOrcWriter::new) - .schema(SCHEMA) - .build(); + FileAppender writer = + ORC.write(Files.localOutput(testFile)) + .createWriterFunc(SparkOrcWriter::new) + .schema(SCHEMA) + .build(); writer.addAll(rows); writer.close(); diff --git a/spark/v3.3/spark/src/test/java/org/apache/iceberg/spark/data/TestParquetAvroReader.java b/spark/v3.3/spark/src/test/java/org/apache/iceberg/spark/data/TestParquetAvroReader.java index 464e3165583c..a4ffc2fea437 100644 --- a/spark/v3.3/spark/src/test/java/org/apache/iceberg/spark/data/TestParquetAvroReader.java +++ b/spark/v3.3/spark/src/test/java/org/apache/iceberg/spark/data/TestParquetAvroReader.java @@ -16,9 +16,11 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.spark.data; +import static org.apache.iceberg.types.Types.NestedField.optional; +import static org.apache.iceberg.types.Types.NestedField.required; + import java.io.File; import java.io.IOException; import java.util.Iterator; @@ -38,54 +40,68 @@ import org.junit.Test; import org.junit.rules.TemporaryFolder; -import static org.apache.iceberg.types.Types.NestedField.optional; -import static org.apache.iceberg.types.Types.NestedField.required; - public class TestParquetAvroReader { - @Rule - public TemporaryFolder temp = new TemporaryFolder(); - - private static final Schema COMPLEX_SCHEMA = new Schema( - required(1, "roots", Types.LongType.get()), - optional(3, "lime", Types.ListType.ofRequired(4, Types.DoubleType.get())), - required(5, "strict", Types.StructType.of( - required(9, "tangerine", Types.StringType.get()), - optional(6, "hopeful", Types.StructType.of( - required(7, "steel", Types.FloatType.get()), - required(8, "lantern", Types.DateType.get()) - )), - optional(10, "vehement", Types.LongType.get()) - )), - optional(11, "metamorphosis", Types.MapType.ofRequired(12, 13, - Types.StringType.get(), Types.TimestampType.withoutZone())), - required(14, "winter", Types.ListType.ofOptional(15, Types.StructType.of( - optional(16, "beet", Types.DoubleType.get()), - required(17, "stamp", Types.TimeType.get()), - optional(18, "wheeze", Types.StringType.get()) - ))), - optional(19, "renovate", Types.MapType.ofRequired(20, 21, - Types.StringType.get(), Types.StructType.of( - optional(22, "jumpy", Types.DoubleType.get()), - required(23, "koala", Types.TimeType.get()), - required(24, "couch rope", Types.IntegerType.get()) - ))), - optional(2, "slide", Types.StringType.get()) - ); + @Rule public TemporaryFolder temp = new TemporaryFolder(); + + private static final Schema COMPLEX_SCHEMA = + new Schema( + required(1, "roots", Types.LongType.get()), + optional(3, "lime", Types.ListType.ofRequired(4, Types.DoubleType.get())), + required( + 5, + "strict", + Types.StructType.of( + required(9, "tangerine", Types.StringType.get()), + optional( + 6, + "hopeful", + Types.StructType.of( + required(7, "steel", Types.FloatType.get()), + required(8, "lantern", Types.DateType.get()))), + optional(10, "vehement", Types.LongType.get()))), + optional( + 11, + "metamorphosis", + Types.MapType.ofRequired( + 12, 13, Types.StringType.get(), Types.TimestampType.withoutZone())), + required( + 14, + "winter", + Types.ListType.ofOptional( + 15, + Types.StructType.of( + optional(16, "beet", Types.DoubleType.get()), + required(17, "stamp", Types.TimeType.get()), + optional(18, "wheeze", Types.StringType.get())))), + optional( + 19, + "renovate", + Types.MapType.ofRequired( + 20, + 21, + Types.StringType.get(), + Types.StructType.of( + optional(22, "jumpy", Types.DoubleType.get()), + required(23, "koala", Types.TimeType.get()), + required(24, "couch rope", Types.IntegerType.get())))), + optional(2, "slide", Types.StringType.get())); @Ignore public void testStructSchema() throws IOException { - Schema structSchema = new Schema( - required(1, "circumvent", Types.LongType.get()), - optional(2, "antarctica", Types.StringType.get()), - optional(3, "fluent", Types.DoubleType.get()), - required(4, "quell", Types.StructType.of( - required(5, "operator", Types.BooleanType.get()), - optional(6, "fanta", Types.IntegerType.get()), - optional(7, "cable", Types.FloatType.get()) - )), - required(8, "chimney", Types.TimestampType.withZone()), - required(9, "wool", Types.DateType.get()) - ); + Schema 
structSchema = + new Schema( + required(1, "circumvent", Types.LongType.get()), + optional(2, "antarctica", Types.StringType.get()), + optional(3, "fluent", Types.DoubleType.get()), + required( + 4, + "quell", + Types.StructType.of( + required(5, "operator", Types.BooleanType.get()), + optional(6, "fanta", Types.IntegerType.get()), + optional(7, "cable", Types.FloatType.get()))), + required(8, "chimney", Types.TimestampType.withZone()), + required(9, "wool", Types.DateType.get())); File testFile = writeTestData(structSchema, 5_000_000, 1059); // RandomData uses the root record name "test", which must match for records to be equal @@ -100,11 +116,12 @@ public void testStructSchema() throws IOException { // clean up as much memory as possible to avoid a large GC during the timed run System.gc(); - try (CloseableIterable reader = Parquet.read(Files.localInput(testFile)) - .project(structSchema) - .createReaderFunc( - fileSchema -> ParquetAvroValueReaders.buildReader(structSchema, readSchema)) - .build()) { + try (CloseableIterable reader = + Parquet.read(Files.localInput(testFile)) + .project(structSchema) + .createReaderFunc( + fileSchema -> ParquetAvroValueReaders.buildReader(structSchema, readSchema)) + .build()) { long start = System.currentTimeMillis(); long val = 0; long count = 0; @@ -137,9 +154,8 @@ public void testWithOldReadPath() throws IOException { // clean up as much memory as possible to avoid a large GC during the timed run System.gc(); - try (CloseableIterable reader = Parquet.read(Files.localInput(testFile)) - .project(COMPLEX_SCHEMA) - .build()) { + try (CloseableIterable reader = + Parquet.read(Files.localInput(testFile)).project(COMPLEX_SCHEMA).build()) { long start = System.currentTimeMillis(); long val = 0; long count = 0; @@ -154,11 +170,12 @@ public void testWithOldReadPath() throws IOException { // clean up as much memory as possible to avoid a large GC during the timed run System.gc(); - try (CloseableIterable reader = Parquet.read(Files.localInput(testFile)) - .project(COMPLEX_SCHEMA) - .createReaderFunc( - fileSchema -> ParquetAvroValueReaders.buildReader(COMPLEX_SCHEMA, readSchema)) - .build()) { + try (CloseableIterable reader = + Parquet.read(Files.localInput(testFile)) + .project(COMPLEX_SCHEMA) + .createReaderFunc( + fileSchema -> ParquetAvroValueReaders.buildReader(COMPLEX_SCHEMA, readSchema)) + .build()) { long start = System.currentTimeMillis(); long val = 0; long count = 0; @@ -179,9 +196,8 @@ public void testCorrectness() throws IOException { File testFile = temp.newFile(); Assert.assertTrue("Delete should succeed", testFile.delete()); - try (FileAppender writer = Parquet.write(Files.localOutput(testFile)) - .schema(COMPLEX_SCHEMA) - .build()) { + try (FileAppender writer = + Parquet.write(Files.localOutput(testFile)).schema(COMPLEX_SCHEMA).build()) { writer.addAll(records); } @@ -189,12 +205,13 @@ public void testCorrectness() throws IOException { MessageType readSchema = ParquetSchemaUtil.convert(COMPLEX_SCHEMA, "test"); // verify that the new read path is correct - try (CloseableIterable reader = Parquet.read(Files.localInput(testFile)) - .project(COMPLEX_SCHEMA) - .createReaderFunc( - fileSchema -> ParquetAvroValueReaders.buildReader(COMPLEX_SCHEMA, readSchema)) - .reuseContainers() - .build()) { + try (CloseableIterable reader = + Parquet.read(Files.localInput(testFile)) + .project(COMPLEX_SCHEMA) + .createReaderFunc( + fileSchema -> ParquetAvroValueReaders.buildReader(COMPLEX_SCHEMA, readSchema)) + .reuseContainers() + .build()) { int recordNum = 
0; Iterator iter = records.iterator(); for (Record actual : reader) { @@ -209,9 +226,8 @@ private File writeTestData(Schema schema, int numRecords, int seed) throws IOExc File testFile = temp.newFile(); Assert.assertTrue("Delete should succeed", testFile.delete()); - try (FileAppender writer = Parquet.write(Files.localOutput(testFile)) - .schema(schema) - .build()) { + try (FileAppender writer = + Parquet.write(Files.localOutput(testFile)).schema(schema).build()) { writer.addAll(RandomData.generate(schema, numRecords, seed)); } diff --git a/spark/v3.3/spark/src/test/java/org/apache/iceberg/spark/data/TestParquetAvroWriter.java b/spark/v3.3/spark/src/test/java/org/apache/iceberg/spark/data/TestParquetAvroWriter.java index dcfc873a5a67..15c6268da478 100644 --- a/spark/v3.3/spark/src/test/java/org/apache/iceberg/spark/data/TestParquetAvroWriter.java +++ b/spark/v3.3/spark/src/test/java/org/apache/iceberg/spark/data/TestParquetAvroWriter.java @@ -16,9 +16,11 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.data; +import static org.apache.iceberg.types.Types.NestedField.optional; +import static org.apache.iceberg.types.Types.NestedField.required; + import java.io.File; import java.io.IOException; import java.util.Iterator; @@ -38,39 +40,51 @@ import org.junit.Test; import org.junit.rules.TemporaryFolder; -import static org.apache.iceberg.types.Types.NestedField.optional; -import static org.apache.iceberg.types.Types.NestedField.required; - public class TestParquetAvroWriter { - @Rule - public TemporaryFolder temp = new TemporaryFolder(); + @Rule public TemporaryFolder temp = new TemporaryFolder(); - private static final Schema COMPLEX_SCHEMA = new Schema( - required(1, "roots", Types.LongType.get()), - optional(3, "lime", Types.ListType.ofRequired(4, Types.DoubleType.get())), - required(5, "strict", Types.StructType.of( - required(9, "tangerine", Types.StringType.get()), - optional(6, "hopeful", Types.StructType.of( - required(7, "steel", Types.FloatType.get()), - required(8, "lantern", Types.DateType.get()) - )), - optional(10, "vehement", Types.LongType.get()) - )), - optional(11, "metamorphosis", Types.MapType.ofRequired(12, 13, - Types.StringType.get(), Types.TimestampType.withoutZone())), - required(14, "winter", Types.ListType.ofOptional(15, Types.StructType.of( - optional(16, "beet", Types.DoubleType.get()), - required(17, "stamp", Types.TimeType.get()), - optional(18, "wheeze", Types.StringType.get()) - ))), - optional(19, "renovate", Types.MapType.ofRequired(20, 21, - Types.StringType.get(), Types.StructType.of( - optional(22, "jumpy", Types.DoubleType.get()), - required(23, "koala", Types.TimeType.get()), - required(24, "couch rope", Types.IntegerType.get()) - ))), - optional(2, "slide", Types.StringType.get()) - ); + private static final Schema COMPLEX_SCHEMA = + new Schema( + required(1, "roots", Types.LongType.get()), + optional(3, "lime", Types.ListType.ofRequired(4, Types.DoubleType.get())), + required( + 5, + "strict", + Types.StructType.of( + required(9, "tangerine", Types.StringType.get()), + optional( + 6, + "hopeful", + Types.StructType.of( + required(7, "steel", Types.FloatType.get()), + required(8, "lantern", Types.DateType.get()))), + optional(10, "vehement", Types.LongType.get()))), + optional( + 11, + "metamorphosis", + Types.MapType.ofRequired( + 12, 13, Types.StringType.get(), Types.TimestampType.withoutZone())), + required( + 14, + "winter", + Types.ListType.ofOptional( + 15, + 
Types.StructType.of( + optional(16, "beet", Types.DoubleType.get()), + required(17, "stamp", Types.TimeType.get()), + optional(18, "wheeze", Types.StringType.get())))), + optional( + 19, + "renovate", + Types.MapType.ofRequired( + 20, + 21, + Types.StringType.get(), + Types.StructType.of( + optional(22, "jumpy", Types.DoubleType.get()), + required(23, "koala", Types.TimeType.get()), + required(24, "couch rope", Types.IntegerType.get())))), + optional(2, "slide", Types.StringType.get())); @Test public void testCorrectness() throws IOException { @@ -79,10 +93,11 @@ public void testCorrectness() throws IOException { File testFile = temp.newFile(); Assert.assertTrue("Delete should succeed", testFile.delete()); - try (FileAppender writer = Parquet.write(Files.localOutput(testFile)) - .schema(COMPLEX_SCHEMA) - .createWriterFunc(ParquetAvroWriter::buildWriter) - .build()) { + try (FileAppender writer = + Parquet.write(Files.localOutput(testFile)) + .schema(COMPLEX_SCHEMA) + .createWriterFunc(ParquetAvroWriter::buildWriter) + .build()) { writer.addAll(records); } @@ -90,11 +105,12 @@ public void testCorrectness() throws IOException { MessageType readSchema = ParquetSchemaUtil.convert(COMPLEX_SCHEMA, "test"); // verify that the new read path is correct - try (CloseableIterable reader = Parquet.read(Files.localInput(testFile)) - .project(COMPLEX_SCHEMA) - .createReaderFunc( - fileSchema -> ParquetAvroValueReaders.buildReader(COMPLEX_SCHEMA, readSchema)) - .build()) { + try (CloseableIterable reader = + Parquet.read(Files.localInput(testFile)) + .project(COMPLEX_SCHEMA) + .createReaderFunc( + fileSchema -> ParquetAvroValueReaders.buildReader(COMPLEX_SCHEMA, readSchema)) + .build()) { int recordNum = 0; Iterator iter = records.iterator(); for (Record actual : reader) { diff --git a/spark/v3.3/spark/src/test/java/org/apache/iceberg/spark/data/TestSparkAvroEnums.java b/spark/v3.3/spark/src/test/java/org/apache/iceberg/spark/data/TestSparkAvroEnums.java index 3517c32ffebb..6f05a9ed7c1f 100644 --- a/spark/v3.3/spark/src/test/java/org/apache/iceberg/spark/data/TestSparkAvroEnums.java +++ b/spark/v3.3/spark/src/test/java/org/apache/iceberg/spark/data/TestSparkAvroEnums.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.spark.data; import java.io.File; @@ -42,20 +41,20 @@ public class TestSparkAvroEnums { - @Rule - public TemporaryFolder temp = new TemporaryFolder(); + @Rule public TemporaryFolder temp = new TemporaryFolder(); @Test public void writeAndValidateEnums() throws IOException { - org.apache.avro.Schema avroSchema = SchemaBuilder.record("root") - .fields() - .name("enumCol") - .type() - .nullable() - .enumeration("testEnum") - .symbols("SYMB1", "SYMB2") - .enumDefault("SYMB2") - .endRecord(); + org.apache.avro.Schema avroSchema = + SchemaBuilder.record("root") + .fields() + .name("enumCol") + .type() + .nullable() + .enumeration("testEnum") + .symbols("SYMB1", "SYMB2") + .enumDefault("SYMB2") + .endRecord(); org.apache.avro.Schema enumSchema = avroSchema.getField("enumCol").schema().getTypes().get(0); Record enumRecord1 = new GenericData.Record(avroSchema); @@ -77,10 +76,11 @@ public void writeAndValidateEnums() throws IOException { Schema schema = new Schema(AvroSchemaUtil.convert(avroSchema).asStructType().fields()); List rows; - try (AvroIterable reader = Avro.read(Files.localInput(testFile)) - .createReaderFunc(SparkAvroReader::new) - .project(schema) - .build()) { + try (AvroIterable reader = + Avro.read(Files.localInput(testFile)) + .createReaderFunc(SparkAvroReader::new) + .project(schema) + .build()) { rows = Lists.newArrayList(reader); } @@ -88,7 +88,8 @@ public void writeAndValidateEnums() throws IOException { for (int i = 0; i < expected.size(); i += 1) { String expectedEnumString = expected.get(i).get("enumCol") == null ? null : expected.get(i).get("enumCol").toString(); - String sparkString = rows.get(i).getUTF8String(0) == null ? null : rows.get(i).getUTF8String(0).toString(); + String sparkString = + rows.get(i).getUTF8String(0) == null ? null : rows.get(i).getUTF8String(0).toString(); Assert.assertEquals(expectedEnumString, sparkString); } } diff --git a/spark/v3.3/spark/src/test/java/org/apache/iceberg/spark/data/TestSparkAvroReader.java b/spark/v3.3/spark/src/test/java/org/apache/iceberg/spark/data/TestSparkAvroReader.java index e4398df39cc8..6d1ef3db3657 100644 --- a/spark/v3.3/spark/src/test/java/org/apache/iceberg/spark/data/TestSparkAvroReader.java +++ b/spark/v3.3/spark/src/test/java/org/apache/iceberg/spark/data/TestSparkAvroReader.java @@ -16,9 +16,10 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.spark.data; +import static org.apache.iceberg.spark.data.TestHelpers.assertEqualsUnsafe; + import java.io.File; import java.io.IOException; import java.util.List; @@ -32,8 +33,6 @@ import org.apache.spark.sql.catalyst.InternalRow; import org.junit.Assert; -import static org.apache.iceberg.spark.data.TestHelpers.assertEqualsUnsafe; - public class TestSparkAvroReader extends AvroDataTest { @Override protected void writeAndValidate(Schema schema) throws IOException { @@ -42,20 +41,19 @@ protected void writeAndValidate(Schema schema) throws IOException { File testFile = temp.newFile(); Assert.assertTrue("Delete should succeed", testFile.delete()); - try (FileAppender writer = Avro.write(Files.localOutput(testFile)) - .schema(schema) - .named("test") - .build()) { + try (FileAppender writer = + Avro.write(Files.localOutput(testFile)).schema(schema).named("test").build()) { for (Record rec : expected) { writer.add(rec); } } List rows; - try (AvroIterable reader = Avro.read(Files.localInput(testFile)) - .createReaderFunc(SparkAvroReader::new) - .project(schema) - .build()) { + try (AvroIterable reader = + Avro.read(Files.localInput(testFile)) + .createReaderFunc(SparkAvroReader::new) + .project(schema) + .build()) { rows = Lists.newArrayList(reader); } diff --git a/spark/v3.3/spark/src/test/java/org/apache/iceberg/spark/data/TestSparkDateTimes.java b/spark/v3.3/spark/src/test/java/org/apache/iceberg/spark/data/TestSparkDateTimes.java index b67e57310b4c..b31ea8fd277d 100644 --- a/spark/v3.3/spark/src/test/java/org/apache/iceberg/spark/data/TestSparkDateTimes.java +++ b/spark/v3.3/spark/src/test/java/org/apache/iceberg/spark/data/TestSparkDateTimes.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.data; import java.time.ZoneId; @@ -69,7 +68,7 @@ public void checkSparkTimestamp(String timestampString, String sparkRepr) { ZoneId zoneId = DateTimeUtils.getZoneId("UTC"); TimestampFormatter formatter = TimestampFormatter.getFractionFormatter(zoneId); String sparkTimestamp = formatter.format(ts.value()); - Assert.assertEquals("Should be the same timestamp (" + ts.value() + ")", - sparkRepr, sparkTimestamp); + Assert.assertEquals( + "Should be the same timestamp (" + ts.value() + ")", sparkRepr, sparkTimestamp); } } diff --git a/spark/v3.3/spark/src/test/java/org/apache/iceberg/spark/data/TestSparkOrcReadMetadataColumns.java b/spark/v3.3/spark/src/test/java/org/apache/iceberg/spark/data/TestSparkOrcReadMetadataColumns.java index b8ee56370edf..3c9037adc393 100644 --- a/spark/v3.3/spark/src/test/java/org/apache/iceberg/spark/data/TestSparkOrcReadMetadataColumns.java +++ b/spark/v3.3/spark/src/test/java/org/apache/iceberg/spark/data/TestSparkOrcReadMetadataColumns.java @@ -16,9 +16,10 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.spark.data; +import static org.apache.iceberg.types.Types.NestedField.required; + import java.io.File; import java.io.IOException; import java.util.Iterator; @@ -57,21 +58,18 @@ import org.junit.runner.RunWith; import org.junit.runners.Parameterized; -import static org.apache.iceberg.types.Types.NestedField.required; - @RunWith(Parameterized.class) public class TestSparkOrcReadMetadataColumns { - private static final Schema DATA_SCHEMA = new Schema( - required(100, "id", Types.LongType.get()), - required(101, "data", Types.StringType.get()) - ); - - private static final Schema PROJECTION_SCHEMA = new Schema( - required(100, "id", Types.LongType.get()), - required(101, "data", Types.StringType.get()), - MetadataColumns.ROW_POSITION, - MetadataColumns.IS_DELETED - ); + private static final Schema DATA_SCHEMA = + new Schema( + required(100, "id", Types.LongType.get()), required(101, "data", Types.StringType.get())); + + private static final Schema PROJECTION_SCHEMA = + new Schema( + required(100, "id", Types.LongType.get()), + required(101, "data", Types.StringType.get()), + MetadataColumns.ROW_POSITION, + MetadataColumns.IS_DELETED); private static final int NUM_ROWS = 1000; private static final List DATA_ROWS; @@ -99,11 +97,10 @@ public class TestSparkOrcReadMetadataColumns { @Parameterized.Parameters(name = "vectorized = {0}") public static Object[] parameters() { - return new Object[] { false, true }; + return new Object[] {false, true}; } - @Rule - public TemporaryFolder temp = new TemporaryFolder(); + @Rule public TemporaryFolder temp = new TemporaryFolder(); private boolean vectorized; private File testFile; @@ -117,14 +114,15 @@ public void writeFile() throws IOException { testFile = temp.newFile(); Assert.assertTrue("Delete should succeed", testFile.delete()); - try (FileAppender writer = ORC.write(Files.localOutput(testFile)) - .createWriterFunc(SparkOrcWriter::new) - .schema(DATA_SCHEMA) - // write in such a way that the file contains 10 stripes each with 100 rows - .set("iceberg.orc.vectorbatch.size", "100") - .set(OrcConf.ROWS_BETWEEN_CHECKS.getAttribute(), "100") - .set(OrcConf.STRIPE_SIZE.getAttribute(), "1") - .build()) { + try (FileAppender writer = + ORC.write(Files.localOutput(testFile)) + .createWriterFunc(SparkOrcWriter::new) + .schema(DATA_SCHEMA) + // write in such a way that the file contains 10 stripes each with 100 rows + .set("iceberg.orc.vectorbatch.size", "100") + .set(OrcConf.ROWS_BETWEEN_CHECKS.getAttribute(), "100") + .set(OrcConf.STRIPE_SIZE.getAttribute(), "1") + .build()) { writer.addAll(DATA_ROWS); } } @@ -136,41 +134,54 @@ public void testReadRowNumbers() throws IOException { @Test public void testReadRowNumbersWithFilter() throws IOException { - readAndValidate(Expressions.greaterThanOrEqual("id", 500), null, null, EXPECTED_ROWS.subList(500, 1000)); + readAndValidate( + Expressions.greaterThanOrEqual("id", 500), null, null, EXPECTED_ROWS.subList(500, 1000)); } @Test public void testReadRowNumbersWithSplits() throws IOException { Reader reader; try { - OrcFile.ReaderOptions readerOptions = OrcFile.readerOptions(new Configuration()).useUTCTimestamp(true); - reader = OrcFile.createReader(new Path(testFile.toString()), readerOptions); + OrcFile.ReaderOptions readerOptions = + OrcFile.readerOptions(new Configuration()).useUTCTimestamp(true); + reader = OrcFile.createReader(new Path(testFile.toString()), readerOptions); } catch (IOException ioe) { throw new RuntimeIOException(ioe, "Failed to open file: %s", testFile); } - List 
splitOffsets = reader.getStripes().stream().map(StripeInformation::getOffset) - .collect(Collectors.toList()); - List splitLengths = reader.getStripes().stream().map(StripeInformation::getLength) - .collect(Collectors.toList()); + List splitOffsets = + reader.getStripes().stream().map(StripeInformation::getOffset).collect(Collectors.toList()); + List splitLengths = + reader.getStripes().stream().map(StripeInformation::getLength).collect(Collectors.toList()); for (int i = 0; i < 10; i++) { - readAndValidate(null, splitOffsets.get(i), splitLengths.get(i), EXPECTED_ROWS.subList(i * 100, (i + 1) * 100)); + readAndValidate( + null, + splitOffsets.get(i), + splitLengths.get(i), + EXPECTED_ROWS.subList(i * 100, (i + 1) * 100)); } } - private void readAndValidate(Expression filter, Long splitStart, Long splitLength, List expected) + private void readAndValidate( + Expression filter, Long splitStart, Long splitLength, List expected) throws IOException { - Schema projectionWithoutMetadataFields = TypeUtil.selectNot(PROJECTION_SCHEMA, MetadataColumns.metadataFieldIds()); + Schema projectionWithoutMetadataFields = + TypeUtil.selectNot(PROJECTION_SCHEMA, MetadataColumns.metadataFieldIds()); CloseableIterable reader = null; try { - ORC.ReadBuilder builder = ORC.read(Files.localInput(testFile)) - .project(projectionWithoutMetadataFields); + ORC.ReadBuilder builder = + ORC.read(Files.localInput(testFile)).project(projectionWithoutMetadataFields); if (vectorized) { - builder = builder.createBatchedReaderFunc(readOrcSchema -> - VectorizedSparkOrcReaders.buildReader(PROJECTION_SCHEMA, readOrcSchema, ImmutableMap.of())); + builder = + builder.createBatchedReaderFunc( + readOrcSchema -> + VectorizedSparkOrcReaders.buildReader( + PROJECTION_SCHEMA, readOrcSchema, ImmutableMap.of())); } else { - builder = builder.createReaderFunc(readOrcSchema -> new SparkOrcReader(PROJECTION_SCHEMA, readOrcSchema)); + builder = + builder.createReaderFunc( + readOrcSchema -> new SparkOrcReader(PROJECTION_SCHEMA, readOrcSchema)); } if (filter != null) { diff --git a/spark/v3.3/spark/src/test/java/org/apache/iceberg/spark/data/TestSparkOrcReader.java b/spark/v3.3/spark/src/test/java/org/apache/iceberg/spark/data/TestSparkOrcReader.java index 5042d1cc1338..b23fe729a187 100644 --- a/spark/v3.3/spark/src/test/java/org/apache/iceberg/spark/data/TestSparkOrcReader.java +++ b/spark/v3.3/spark/src/test/java/org/apache/iceberg/spark/data/TestSparkOrcReader.java @@ -16,9 +16,11 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.spark.data; +import static org.apache.iceberg.spark.data.TestHelpers.assertEquals; +import static org.apache.iceberg.types.Types.NestedField.required; + import java.io.File; import java.io.IOException; import java.util.Collections; @@ -38,45 +40,44 @@ import org.junit.Assert; import org.junit.Test; -import static org.apache.iceberg.spark.data.TestHelpers.assertEquals; -import static org.apache.iceberg.types.Types.NestedField.required; - public class TestSparkOrcReader extends AvroDataTest { @Override protected void writeAndValidate(Schema schema) throws IOException { - final Iterable expected = RandomData - .generateSpark(schema, 100, 0L); + final Iterable expected = RandomData.generateSpark(schema, 100, 0L); writeAndValidateRecords(schema, expected); } @Test public void writeAndValidateRepeatingRecords() throws IOException { - Schema structSchema = new Schema( - required(100, "id", Types.LongType.get()), - required(101, "data", Types.StringType.get()) - ); - List expectedRepeating = Collections.nCopies(100, - RandomData.generateSpark(structSchema, 1, 0L).iterator().next()); + Schema structSchema = + new Schema( + required(100, "id", Types.LongType.get()), + required(101, "data", Types.StringType.get())); + List expectedRepeating = + Collections.nCopies(100, RandomData.generateSpark(structSchema, 1, 0L).iterator().next()); writeAndValidateRecords(structSchema, expectedRepeating); } - private void writeAndValidateRecords(Schema schema, Iterable expected) throws IOException { + private void writeAndValidateRecords(Schema schema, Iterable expected) + throws IOException { final File testFile = temp.newFile(); Assert.assertTrue("Delete should succeed", testFile.delete()); - try (FileAppender writer = ORC.write(Files.localOutput(testFile)) - .createWriterFunc(SparkOrcWriter::new) - .schema(schema) - .build()) { + try (FileAppender writer = + ORC.write(Files.localOutput(testFile)) + .createWriterFunc(SparkOrcWriter::new) + .schema(schema) + .build()) { writer.addAll(expected); } - try (CloseableIterable reader = ORC.read(Files.localInput(testFile)) - .project(schema) - .createReaderFunc(readOrcSchema -> new SparkOrcReader(schema, readOrcSchema)) - .build()) { + try (CloseableIterable reader = + ORC.read(Files.localInput(testFile)) + .project(schema) + .createReaderFunc(readOrcSchema -> new SparkOrcReader(schema, readOrcSchema)) + .build()) { final Iterator actualRows = reader.iterator(); final Iterator expectedRows = expected.iterator(); while (expectedRows.hasNext()) { @@ -86,11 +87,13 @@ private void writeAndValidateRecords(Schema schema, Iterable expect Assert.assertFalse("Should not have extra rows", actualRows.hasNext()); } - try (CloseableIterable reader = ORC.read(Files.localInput(testFile)) - .project(schema) - .createBatchedReaderFunc(readOrcSchema -> - VectorizedSparkOrcReaders.buildReader(schema, readOrcSchema, ImmutableMap.of())) - .build()) { + try (CloseableIterable reader = + ORC.read(Files.localInput(testFile)) + .project(schema) + .createBatchedReaderFunc( + readOrcSchema -> + VectorizedSparkOrcReaders.buildReader(schema, readOrcSchema, ImmutableMap.of())) + .build()) { final Iterator actualRows = batchesToRows(reader.iterator()); final Iterator expectedRows = expected.iterator(); while (expectedRows.hasNext()) { diff --git a/spark/v3.3/spark/src/test/java/org/apache/iceberg/spark/data/TestSparkParquetReadMetadataColumns.java b/spark/v3.3/spark/src/test/java/org/apache/iceberg/spark/data/TestSparkParquetReadMetadataColumns.java index 
f075e71742ea..23d69c467218 100644 --- a/spark/v3.3/spark/src/test/java/org/apache/iceberg/spark/data/TestSparkParquetReadMetadataColumns.java +++ b/spark/v3.3/spark/src/test/java/org/apache/iceberg/spark/data/TestSparkParquetReadMetadataColumns.java @@ -16,9 +16,12 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.data; +import static org.apache.iceberg.types.Types.NestedField.required; +import static org.mockito.Mockito.mock; +import static org.mockito.Mockito.when; + import java.io.File; import java.io.IOException; import java.util.Iterator; @@ -64,23 +67,18 @@ import org.junit.runner.RunWith; import org.junit.runners.Parameterized; -import static org.apache.iceberg.types.Types.NestedField.required; -import static org.mockito.Mockito.mock; -import static org.mockito.Mockito.when; - @RunWith(Parameterized.class) public class TestSparkParquetReadMetadataColumns { - private static final Schema DATA_SCHEMA = new Schema( - required(100, "id", Types.LongType.get()), - required(101, "data", Types.StringType.get()) - ); - - private static final Schema PROJECTION_SCHEMA = new Schema( - required(100, "id", Types.LongType.get()), - required(101, "data", Types.StringType.get()), - MetadataColumns.ROW_POSITION, - MetadataColumns.IS_DELETED - ); + private static final Schema DATA_SCHEMA = + new Schema( + required(100, "id", Types.LongType.get()), required(101, "data", Types.StringType.get())); + + private static final Schema PROJECTION_SCHEMA = + new Schema( + required(100, "id", Types.LongType.get()), + required(101, "data", Types.StringType.get()), + MetadataColumns.ROW_POSITION, + MetadataColumns.IS_DELETED); private static final int NUM_ROWS = 1000; private static final List DATA_ROWS; @@ -117,16 +115,12 @@ public class TestSparkParquetReadMetadataColumns { } } - @Parameterized.Parameters(name = "vectorized = {0}") + @Parameterized.Parameters(name = "vectorized = {0}") public static Object[][] parameters() { - return new Object[][] { - new Object[] { false }, - new Object[] { true } - }; + return new Object[][] {new Object[] {false}, new Object[] {true}}; } - @Rule - public TemporaryFolder temp = new TemporaryFolder(); + @Rule public TemporaryFolder temp = new TemporaryFolder(); private final boolean vectorized; private File testFile; @@ -143,28 +137,32 @@ public void writeFile() throws IOException { testFile = temp.newFile(); Assert.assertTrue("Delete should succeed", testFile.delete()); - ParquetFileWriter parquetFileWriter = new ParquetFileWriter( - conf, - ParquetSchemaUtil.convert(DATA_SCHEMA, "testSchema"), - new Path(testFile.getAbsolutePath()) - ); + ParquetFileWriter parquetFileWriter = + new ParquetFileWriter( + conf, + ParquetSchemaUtil.convert(DATA_SCHEMA, "testSchema"), + new Path(testFile.getAbsolutePath())); parquetFileWriter.start(); for (int i = 0; i < NUM_ROW_GROUPS; i += 1) { File split = temp.newFile(); Assert.assertTrue("Delete should succeed", split.delete()); fileSplits.add(new Path(split.getAbsolutePath())); - try (FileAppender writer = Parquet.write(Files.localOutput(split)) - .createWriterFunc(msgType -> SparkParquetWriters.buildWriter(struct, msgType)) - .schema(DATA_SCHEMA) - .overwrite() - .build()) { + try (FileAppender writer = + Parquet.write(Files.localOutput(split)) + .createWriterFunc(msgType -> SparkParquetWriters.buildWriter(struct, msgType)) + .schema(DATA_SCHEMA) + .overwrite() + .build()) { writer.addAll(DATA_ROWS.subList(i * ROWS_PER_SPLIT, (i + 1) * ROWS_PER_SPLIT)); } - 
parquetFileWriter.appendFile(HadoopInputFile.fromPath(new Path(split.getAbsolutePath()), conf)); + parquetFileWriter.appendFile( + HadoopInputFile.fromPath(new Path(split.getAbsolutePath()), conf)); } - parquetFileWriter - .end(ParquetFileWriter.mergeMetadataFiles(fileSplits, conf).getFileMetaData().getKeyValueMetaData()); + parquetFileWriter.end( + ParquetFileWriter.mergeMetadataFiles(fileSplits, conf) + .getFileMetaData() + .getKeyValueMetaData()); } @Test @@ -178,12 +176,14 @@ public void testReadRowNumbersWithDelete() throws IOException { List expectedRowsAfterDelete = Lists.newArrayList(); EXPECTED_ROWS.forEach(row -> expectedRowsAfterDelete.add(row.copy())); - // remove row at position 98, 99, 100, 101, 102, this crosses two row groups [0, 100) and [100, 200) + // remove row at position 98, 99, 100, 101, 102, this crosses two row groups [0, 100) and [100, + // 200) for (int i = 98; i <= 102; i++) { expectedRowsAfterDelete.get(i).update(3, true); } - Parquet.ReadBuilder builder = Parquet.read(Files.localInput(testFile)).project(PROJECTION_SCHEMA); + Parquet.ReadBuilder builder = + Parquet.read(Files.localInput(testFile)).project(PROJECTION_SCHEMA); DeleteFilter deleteFilter = mock(DeleteFilter.class); when(deleteFilter.hasPosDeletes()).thenReturn(true); @@ -191,8 +191,14 @@ public void testReadRowNumbersWithDelete() throws IOException { deletedRowPos.delete(98, 103); when(deleteFilter.deletedRowPositions()).thenReturn(deletedRowPos); - builder.createBatchedReaderFunc(fileSchema -> VectorizedSparkParquetReaders.buildReader(PROJECTION_SCHEMA, - fileSchema, NullCheckingForGet.NULL_CHECKING_ENABLED, Maps.newHashMap(), deleteFilter)); + builder.createBatchedReaderFunc( + fileSchema -> + VectorizedSparkParquetReaders.buildReader( + PROJECTION_SCHEMA, + fileSchema, + NullCheckingForGet.NULL_CHECKING_ENABLED, + Maps.newHashMap(), + deleteFilter)); builder.recordsPerBatch(RECORDS_PER_BATCH); validate(expectedRowsAfterDelete, builder); @@ -233,7 +239,8 @@ public void testReadRowNumbersWithFilter() throws IOException { // current iceberg supports row group filter. 
for (int i = 1; i < 5; i += 1) { readAndValidate( - Expressions.and(Expressions.lessThan("id", NUM_ROWS / 2), + Expressions.and( + Expressions.lessThan("id", NUM_ROWS / 2), Expressions.greaterThanOrEqual("id", i * ROWS_PER_SPLIT)), null, null, @@ -243,28 +250,36 @@ public void testReadRowNumbersWithFilter() throws IOException { @Test public void testReadRowNumbersWithSplits() throws IOException { - ParquetFileReader fileReader = new ParquetFileReader( - HadoopInputFile.fromPath(new Path(testFile.getAbsolutePath()), new Configuration()), - ParquetReadOptions.builder().build()); + ParquetFileReader fileReader = + new ParquetFileReader( + HadoopInputFile.fromPath(new Path(testFile.getAbsolutePath()), new Configuration()), + ParquetReadOptions.builder().build()); List rowGroups = fileReader.getRowGroups(); for (int i = 0; i < NUM_ROW_GROUPS; i += 1) { - readAndValidate(null, + readAndValidate( + null, rowGroups.get(i).getColumns().get(0).getStartingPos(), rowGroups.get(i).getCompressedSize(), EXPECTED_ROWS.subList(i * ROWS_PER_SPLIT, (i + 1) * ROWS_PER_SPLIT)); } } - private void readAndValidate(Expression filter, Long splitStart, Long splitLength, List expected) + private void readAndValidate( + Expression filter, Long splitStart, Long splitLength, List expected) throws IOException { - Parquet.ReadBuilder builder = Parquet.read(Files.localInput(testFile)).project(PROJECTION_SCHEMA); + Parquet.ReadBuilder builder = + Parquet.read(Files.localInput(testFile)).project(PROJECTION_SCHEMA); if (vectorized) { - builder.createBatchedReaderFunc(fileSchema -> VectorizedSparkParquetReaders.buildReader(PROJECTION_SCHEMA, - fileSchema, NullCheckingForGet.NULL_CHECKING_ENABLED)); + builder.createBatchedReaderFunc( + fileSchema -> + VectorizedSparkParquetReaders.buildReader( + PROJECTION_SCHEMA, fileSchema, NullCheckingForGet.NULL_CHECKING_ENABLED)); builder.recordsPerBatch(RECORDS_PER_BATCH); } else { - builder = builder.createReaderFunc(msgType -> SparkParquetReaders.buildReader(PROJECTION_SCHEMA, msgType)); + builder = + builder.createReaderFunc( + msgType -> SparkParquetReaders.buildReader(PROJECTION_SCHEMA, msgType)); } if (filter != null) { @@ -278,8 +293,10 @@ private void readAndValidate(Expression filter, Long splitStart, Long splitLengt validate(expected, builder); } - private void validate(List expected, Parquet.ReadBuilder builder) throws IOException { - try (CloseableIterable reader = vectorized ? batchesToRows(builder.build()) : builder.build()) { + private void validate(List expected, Parquet.ReadBuilder builder) + throws IOException { + try (CloseableIterable reader = + vectorized ? batchesToRows(builder.build()) : builder.build()) { final Iterator actualRows = reader.iterator(); for (InternalRow internalRow : expected) { diff --git a/spark/v3.3/spark/src/test/java/org/apache/iceberg/spark/data/TestSparkParquetReader.java b/spark/v3.3/spark/src/test/java/org/apache/iceberg/spark/data/TestSparkParquetReader.java index 6895523711b9..ba24d848add0 100644 --- a/spark/v3.3/spark/src/test/java/org/apache/iceberg/spark/data/TestSparkParquetReader.java +++ b/spark/v3.3/spark/src/test/java/org/apache/iceberg/spark/data/TestSparkParquetReader.java @@ -16,9 +16,11 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.spark.data; +import static org.apache.iceberg.spark.data.TestHelpers.assertEqualsUnsafe; +import static org.apache.iceberg.types.Types.NestedField.required; + import java.io.File; import java.io.IOException; import java.util.Iterator; @@ -60,31 +62,31 @@ import org.junit.Assume; import org.junit.Test; -import static org.apache.iceberg.spark.data.TestHelpers.assertEqualsUnsafe; -import static org.apache.iceberg.types.Types.NestedField.required; - public class TestSparkParquetReader extends AvroDataTest { @Override protected void writeAndValidate(Schema schema) throws IOException { - Assume.assumeTrue("Parquet Avro cannot write non-string map keys", null == TypeUtil.find(schema, - type -> type.isMapType() && type.asMapType().keyType() != Types.StringType.get())); + Assume.assumeTrue( + "Parquet Avro cannot write non-string map keys", + null + == TypeUtil.find( + schema, + type -> type.isMapType() && type.asMapType().keyType() != Types.StringType.get())); List expected = RandomData.generateList(schema, 100, 0L); File testFile = temp.newFile(); Assert.assertTrue("Delete should succeed", testFile.delete()); - try (FileAppender writer = Parquet.write(Files.localOutput(testFile)) - .schema(schema) - .named("test") - .build()) { + try (FileAppender writer = + Parquet.write(Files.localOutput(testFile)).schema(schema).named("test").build()) { writer.addAll(expected); } - try (CloseableIterable reader = Parquet.read(Files.localInput(testFile)) - .project(schema) - .createReaderFunc(type -> SparkParquetReaders.buildReader(schema, type)) - .build()) { + try (CloseableIterable reader = + Parquet.read(Files.localInput(testFile)) + .project(schema) + .createReaderFunc(type -> SparkParquetReaders.buildReader(schema, type)) + .build()) { Iterator rows = reader.iterator(); for (int i = 0; i < expected.size(); i += 1) { Assert.assertTrue("Should have expected number of rows", rows.hasNext()); @@ -129,7 +131,8 @@ protected Table tableFromInputFile(InputFile inputFile, Schema schema) throws IO @Test public void testInt96TimestampProducedBySparkIsReadCorrectly() throws IOException { - String outputFilePath = String.format("%s/%s", temp.getRoot().getAbsolutePath(), "parquet_int96.parquet"); + String outputFilePath = + String.format("%s/%s", temp.getRoot().getAbsolutePath(), "parquet_int96.parquet"); HadoopOutputFile outputFile = HadoopOutputFile.fromPath( new org.apache.hadoop.fs.Path(outputFilePath), new Configuration()); @@ -137,7 +140,7 @@ public void testInt96TimestampProducedBySparkIsReadCorrectly() throws IOExceptio StructType sparkSchema = new StructType( new StructField[] { - new StructField("ts", DataTypes.TimestampType, true, Metadata.empty()) + new StructField("ts", DataTypes.TimestampType, true, Metadata.empty()) }); List rows = Lists.newArrayList(RandomData.generateSpark(schema, 10, 0L)); @@ -165,14 +168,14 @@ public void testInt96TimestampProducedBySparkIsReadCorrectly() throws IOExceptio Assert.assertEquals(rows.size(), tableRecords.size()); - for (int i = 0; i < tableRecords.size(); i++) { + for (int i = 0; i < tableRecords.size(); i++) { GenericsHelpers.assertEqualsUnsafe(schema.asStruct(), tableRecords.get(i), rows.get(i)); } } /** - * Native Spark ParquetWriter.Builder implementation so that we can write timestamps using Spark's native - * ParquetWriteSupport. + * Native Spark ParquetWriter.Builder implementation so that we can write timestamps using Spark's + * native ParquetWriteSupport. 
*/ private static class NativeSparkWriterBuilder extends ParquetWriter.Builder { diff --git a/spark/v3.3/spark/src/test/java/org/apache/iceberg/spark/data/TestSparkParquetWriter.java b/spark/v3.3/spark/src/test/java/org/apache/iceberg/spark/data/TestSparkParquetWriter.java index c75a87abc45c..261fb8838aa4 100644 --- a/spark/v3.3/spark/src/test/java/org/apache/iceberg/spark/data/TestSparkParquetWriter.java +++ b/spark/v3.3/spark/src/test/java/org/apache/iceberg/spark/data/TestSparkParquetWriter.java @@ -16,9 +16,11 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.data; +import static org.apache.iceberg.types.Types.NestedField.optional; +import static org.apache.iceberg.types.Types.NestedField.required; + import java.io.File; import java.io.IOException; import java.util.Iterator; @@ -35,39 +37,51 @@ import org.junit.Test; import org.junit.rules.TemporaryFolder; -import static org.apache.iceberg.types.Types.NestedField.optional; -import static org.apache.iceberg.types.Types.NestedField.required; - public class TestSparkParquetWriter { - @Rule - public TemporaryFolder temp = new TemporaryFolder(); + @Rule public TemporaryFolder temp = new TemporaryFolder(); - private static final Schema COMPLEX_SCHEMA = new Schema( - required(1, "roots", Types.LongType.get()), - optional(3, "lime", Types.ListType.ofRequired(4, Types.DoubleType.get())), - required(5, "strict", Types.StructType.of( - required(9, "tangerine", Types.StringType.get()), - optional(6, "hopeful", Types.StructType.of( - required(7, "steel", Types.FloatType.get()), - required(8, "lantern", Types.DateType.get()) - )), - optional(10, "vehement", Types.LongType.get()) - )), - optional(11, "metamorphosis", Types.MapType.ofRequired(12, 13, - Types.StringType.get(), Types.TimestampType.withZone())), - required(14, "winter", Types.ListType.ofOptional(15, Types.StructType.of( - optional(16, "beet", Types.DoubleType.get()), - required(17, "stamp", Types.FloatType.get()), - optional(18, "wheeze", Types.StringType.get()) - ))), - optional(19, "renovate", Types.MapType.ofRequired(20, 21, - Types.StringType.get(), Types.StructType.of( - optional(22, "jumpy", Types.DoubleType.get()), - required(23, "koala", Types.IntegerType.get()), - required(24, "couch rope", Types.IntegerType.get()) - ))), - optional(2, "slide", Types.StringType.get()) - ); + private static final Schema COMPLEX_SCHEMA = + new Schema( + required(1, "roots", Types.LongType.get()), + optional(3, "lime", Types.ListType.ofRequired(4, Types.DoubleType.get())), + required( + 5, + "strict", + Types.StructType.of( + required(9, "tangerine", Types.StringType.get()), + optional( + 6, + "hopeful", + Types.StructType.of( + required(7, "steel", Types.FloatType.get()), + required(8, "lantern", Types.DateType.get()))), + optional(10, "vehement", Types.LongType.get()))), + optional( + 11, + "metamorphosis", + Types.MapType.ofRequired( + 12, 13, Types.StringType.get(), Types.TimestampType.withZone())), + required( + 14, + "winter", + Types.ListType.ofOptional( + 15, + Types.StructType.of( + optional(16, "beet", Types.DoubleType.get()), + required(17, "stamp", Types.FloatType.get()), + optional(18, "wheeze", Types.StringType.get())))), + optional( + 19, + "renovate", + Types.MapType.ofRequired( + 20, + 21, + Types.StringType.get(), + Types.StructType.of( + optional(22, "jumpy", Types.DoubleType.get()), + required(23, "koala", Types.IntegerType.get()), + required(24, "couch rope", Types.IntegerType.get())))), + optional(2, 
"slide", Types.StringType.get())); @Test public void testCorrectness() throws IOException { @@ -77,17 +91,22 @@ public void testCorrectness() throws IOException { File testFile = temp.newFile(); Assert.assertTrue("Delete should succeed", testFile.delete()); - try (FileAppender writer = Parquet.write(Files.localOutput(testFile)) - .schema(COMPLEX_SCHEMA) - .createWriterFunc(msgType -> SparkParquetWriters.buildWriter(SparkSchemaUtil.convert(COMPLEX_SCHEMA), msgType)) - .build()) { + try (FileAppender writer = + Parquet.write(Files.localOutput(testFile)) + .schema(COMPLEX_SCHEMA) + .createWriterFunc( + msgType -> + SparkParquetWriters.buildWriter( + SparkSchemaUtil.convert(COMPLEX_SCHEMA), msgType)) + .build()) { writer.addAll(records); } - try (CloseableIterable reader = Parquet.read(Files.localInput(testFile)) - .project(COMPLEX_SCHEMA) - .createReaderFunc(type -> SparkParquetReaders.buildReader(COMPLEX_SCHEMA, type)) - .build()) { + try (CloseableIterable reader = + Parquet.read(Files.localInput(testFile)) + .project(COMPLEX_SCHEMA) + .createReaderFunc(type -> SparkParquetReaders.buildReader(COMPLEX_SCHEMA, type)) + .build()) { Iterator expected = records.iterator(); Iterator rows = reader.iterator(); for (int i = 0; i < numRows; i += 1) { diff --git a/spark/v3.3/spark/src/test/java/org/apache/iceberg/spark/data/TestSparkRecordOrcReaderWriter.java b/spark/v3.3/spark/src/test/java/org/apache/iceberg/spark/data/TestSparkRecordOrcReaderWriter.java index 1e7430d16df7..d10e7f5a19e3 100644 --- a/spark/v3.3/spark/src/test/java/org/apache/iceberg/spark/data/TestSparkRecordOrcReaderWriter.java +++ b/spark/v3.3/spark/src/test/java/org/apache/iceberg/spark/data/TestSparkRecordOrcReaderWriter.java @@ -16,9 +16,10 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.data; +import static org.apache.iceberg.types.Types.NestedField.required; + import java.io.File; import java.io.IOException; import java.math.BigDecimal; @@ -40,8 +41,6 @@ import org.junit.Assert; import org.junit.Test; -import static org.apache.iceberg.types.Types.NestedField.required; - public class TestSparkRecordOrcReaderWriter extends AvroDataTest { private static final int NUM_RECORDS = 200; @@ -50,19 +49,21 @@ private void writeAndValidate(Schema schema, List expectedRecords) throw Assert.assertTrue("Delete should succeed", originalFile.delete()); // Write few generic records into the original test file. - try (FileAppender writer = ORC.write(Files.localOutput(originalFile)) - .createWriterFunc(GenericOrcWriter::buildWriter) - .schema(schema) - .build()) { + try (FileAppender writer = + ORC.write(Files.localOutput(originalFile)) + .createWriterFunc(GenericOrcWriter::buildWriter) + .schema(schema) + .build()) { writer.addAll(expectedRecords); } // Read into spark InternalRow from the original test file. 
List internalRows = Lists.newArrayList(); - try (CloseableIterable reader = ORC.read(Files.localInput(originalFile)) - .project(schema) - .createReaderFunc(readOrcSchema -> new SparkOrcReader(schema, readOrcSchema)) - .build()) { + try (CloseableIterable reader = + ORC.read(Files.localInput(originalFile)) + .project(schema) + .createReaderFunc(readOrcSchema -> new SparkOrcReader(schema, readOrcSchema)) + .build()) { reader.forEach(internalRows::add); assertEqualsUnsafe(schema.asStruct(), expectedRecords, reader, expectedRecords.size()); } @@ -71,26 +72,29 @@ private void writeAndValidate(Schema schema, List expectedRecords) throw Assert.assertTrue("Delete should succeed", anotherFile.delete()); // Write those spark InternalRows into a new file again. - try (FileAppender writer = ORC.write(Files.localOutput(anotherFile)) - .createWriterFunc(SparkOrcWriter::new) - .schema(schema) - .build()) { + try (FileAppender writer = + ORC.write(Files.localOutput(anotherFile)) + .createWriterFunc(SparkOrcWriter::new) + .schema(schema) + .build()) { writer.addAll(internalRows); } // Check whether the InternalRows are expected records. - try (CloseableIterable reader = ORC.read(Files.localInput(anotherFile)) - .project(schema) - .createReaderFunc(readOrcSchema -> new SparkOrcReader(schema, readOrcSchema)) - .build()) { + try (CloseableIterable reader = + ORC.read(Files.localInput(anotherFile)) + .project(schema) + .createReaderFunc(readOrcSchema -> new SparkOrcReader(schema, readOrcSchema)) + .build()) { assertEqualsUnsafe(schema.asStruct(), expectedRecords, reader, expectedRecords.size()); } // Read into iceberg GenericRecord and check again. - try (CloseableIterable reader = ORC.read(Files.localInput(anotherFile)) - .createReaderFunc(typeDesc -> GenericOrcReader.buildReader(schema, typeDesc)) - .project(schema) - .build()) { + try (CloseableIterable reader = + ORC.read(Files.localInput(anotherFile)) + .createReaderFunc(typeDesc -> GenericOrcReader.buildReader(schema, typeDesc)) + .project(schema) + .build()) { assertRecordEquals(expectedRecords, reader, expectedRecords.size()); } } @@ -103,11 +107,11 @@ protected void writeAndValidate(Schema schema) throws IOException { @Test public void testDecimalWithTrailingZero() throws IOException { - Schema schema = new Schema( - required(1, "d1", Types.DecimalType.of(10, 2)), - required(2, "d2", Types.DecimalType.of(20, 5)), - required(3, "d3", Types.DecimalType.of(38, 20)) - ); + Schema schema = + new Schema( + required(1, "d1", Types.DecimalType.of(10, 2)), + required(2, "d2", Types.DecimalType.of(20, 5)), + required(3, "d3", Types.DecimalType.of(38, 20))); List expected = Lists.newArrayList(); @@ -121,7 +125,8 @@ public void testDecimalWithTrailingZero() throws IOException { writeAndValidate(schema, expected); } - private static void assertRecordEquals(Iterable expected, Iterable actual, int size) { + private static void assertRecordEquals( + Iterable expected, Iterable actual, int size) { Iterator expectedIter = expected.iterator(); Iterator actualIter = actual.iterator(); for (int i = 0; i < size; i += 1) { @@ -133,8 +138,8 @@ private static void assertRecordEquals(Iterable expected, Iterable expected, - Iterable actual, int size) { + private static void assertEqualsUnsafe( + Types.StructType struct, Iterable expected, Iterable actual, int size) { Iterator expectedIter = expected.iterator(); Iterator actualIter = actual.iterator(); for (int i = 0; i < size; i += 1) { diff --git 
a/spark/v3.3/spark/src/test/java/org/apache/iceberg/spark/data/parquet/vectorized/TestParquetDictionaryEncodedVectorizedReads.java b/spark/v3.3/spark/src/test/java/org/apache/iceberg/spark/data/parquet/vectorized/TestParquetDictionaryEncodedVectorizedReads.java index f292df0c3bf8..756f49a2aad6 100644 --- a/spark/v3.3/spark/src/test/java/org/apache/iceberg/spark/data/parquet/vectorized/TestParquetDictionaryEncodedVectorizedReads.java +++ b/spark/v3.3/spark/src/test/java/org/apache/iceberg/spark/data/parquet/vectorized/TestParquetDictionaryEncodedVectorizedReads.java @@ -16,9 +16,10 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.data.parquet.vectorized; +import static org.apache.iceberg.TableProperties.PARQUET_ROW_GROUP_SIZE_BYTES_DEFAULT; + import java.io.File; import java.io.IOException; import org.apache.avro.generic.GenericData; @@ -35,42 +36,42 @@ import org.junit.Ignore; import org.junit.Test; -import static org.apache.iceberg.TableProperties.PARQUET_ROW_GROUP_SIZE_BYTES_DEFAULT; - public class TestParquetDictionaryEncodedVectorizedReads extends TestParquetVectorizedReads { @Override - Iterable generateData(Schema schema, int numRecords, long seed, float nullPercentage, - Function transform) { - Iterable data = RandomData.generateDictionaryEncodableData(schema, numRecords, seed, nullPercentage); + Iterable generateData( + Schema schema, + int numRecords, + long seed, + float nullPercentage, + Function transform) { + Iterable data = + RandomData.generateDictionaryEncodableData(schema, numRecords, seed, nullPercentage); return transform == IDENTITY ? data : Iterables.transform(data, transform); } @Test @Override @Ignore // Ignored since this code path is already tested in TestParquetVectorizedReads - public void testVectorizedReadsWithNewContainers() throws IOException { - - } + public void testVectorizedReadsWithNewContainers() throws IOException {} @Test public void testMixedDictionaryNonDictionaryReads() throws IOException { Schema schema = new Schema(SUPPORTED_PRIMITIVES.fields()); File dictionaryEncodedFile = temp.newFile(); Assert.assertTrue("Delete should succeed", dictionaryEncodedFile.delete()); - Iterable dictionaryEncodableData = RandomData.generateDictionaryEncodableData( - schema, - 10000, - 0L, - RandomData.DEFAULT_NULL_PERCENTAGE); - try (FileAppender writer = getParquetWriter(schema, dictionaryEncodedFile)) { + Iterable dictionaryEncodableData = + RandomData.generateDictionaryEncodableData( + schema, 10000, 0L, RandomData.DEFAULT_NULL_PERCENTAGE); + try (FileAppender writer = + getParquetWriter(schema, dictionaryEncodedFile)) { writer.addAll(dictionaryEncodableData); } File plainEncodingFile = temp.newFile(); Assert.assertTrue("Delete should succeed", plainEncodingFile.delete()); - Iterable nonDictionaryData = RandomData.generate(schema, 10000, 0L, - RandomData.DEFAULT_NULL_PERCENTAGE); + Iterable nonDictionaryData = + RandomData.generate(schema, 10000, 0L, RandomData.DEFAULT_NULL_PERCENTAGE); try (FileAppender writer = getParquetWriter(schema, plainEncodingFile)) { writer.addAll(nonDictionaryData); } @@ -78,15 +79,19 @@ public void testMixedDictionaryNonDictionaryReads() throws IOException { int rowGroupSize = PARQUET_ROW_GROUP_SIZE_BYTES_DEFAULT; File mixedFile = temp.newFile(); Assert.assertTrue("Delete should succeed", mixedFile.delete()); - Parquet.concat(ImmutableList.of(dictionaryEncodedFile, plainEncodingFile, dictionaryEncodedFile), - mixedFile, rowGroupSize, schema, ImmutableMap.of()); + 
Parquet.concat( + ImmutableList.of(dictionaryEncodedFile, plainEncodingFile, dictionaryEncodedFile), + mixedFile, + rowGroupSize, + schema, + ImmutableMap.of()); assertRecordsMatch( - schema, - 30000, - FluentIterable.concat(dictionaryEncodableData, nonDictionaryData, dictionaryEncodableData), - mixedFile, - false, - true, - BATCH_SIZE); + schema, + 30000, + FluentIterable.concat(dictionaryEncodableData, nonDictionaryData, dictionaryEncodableData), + mixedFile, + false, + true, + BATCH_SIZE); } } diff --git a/spark/v3.3/spark/src/test/java/org/apache/iceberg/spark/data/parquet/vectorized/TestParquetDictionaryFallbackToPlainEncodingVectorizedReads.java b/spark/v3.3/spark/src/test/java/org/apache/iceberg/spark/data/parquet/vectorized/TestParquetDictionaryFallbackToPlainEncodingVectorizedReads.java index 5ceac3fdb76e..42ea34936b5f 100644 --- a/spark/v3.3/spark/src/test/java/org/apache/iceberg/spark/data/parquet/vectorized/TestParquetDictionaryFallbackToPlainEncodingVectorizedReads.java +++ b/spark/v3.3/spark/src/test/java/org/apache/iceberg/spark/data/parquet/vectorized/TestParquetDictionaryFallbackToPlainEncodingVectorizedReads.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.data.parquet.vectorized; import java.io.File; @@ -33,7 +32,8 @@ import org.junit.Ignore; import org.junit.Test; -public class TestParquetDictionaryFallbackToPlainEncodingVectorizedReads extends TestParquetVectorizedReads { +public class TestParquetDictionaryFallbackToPlainEncodingVectorizedReads + extends TestParquetVectorizedReads { private static final int NUM_ROWS = 1_000_000; @Override @@ -42,15 +42,20 @@ protected int getNumRows() { } @Override - Iterable generateData(Schema schema, int numRecords, long seed, float nullPercentage, - Function transform) { + Iterable generateData( + Schema schema, + int numRecords, + long seed, + float nullPercentage, + Function transform) { // TODO: take into account nullPercentage when generating fallback encoding data Iterable data = RandomData.generateFallbackData(schema, numRecords, seed, numRecords / 20); return transform == IDENTITY ? 
data : Iterables.transform(data, transform); } @Override - FileAppender getParquetWriter(Schema schema, File testFile) throws IOException { + FileAppender getParquetWriter(Schema schema, File testFile) + throws IOException { return Parquet.write(Files.localOutput(testFile)) .schema(schema) .named("test") @@ -61,14 +66,10 @@ FileAppender getParquetWriter(Schema schema, File testFile) @Test @Override @Ignore // Fallback encoding not triggered when data is mostly null - public void testMostlyNullsForOptionalFields() { - - } + public void testMostlyNullsForOptionalFields() {} @Test @Override @Ignore // Ignored since this code path is already tested in TestParquetVectorizedReads - public void testVectorizedReadsWithNewContainers() throws IOException { - - } + public void testVectorizedReadsWithNewContainers() throws IOException {} } diff --git a/spark/v3.3/spark/src/test/java/org/apache/iceberg/spark/data/parquet/vectorized/TestParquetVectorizedReads.java b/spark/v3.3/spark/src/test/java/org/apache/iceberg/spark/data/parquet/vectorized/TestParquetVectorizedReads.java index cff2e5cc6ac2..56e9490b997b 100644 --- a/spark/v3.3/spark/src/test/java/org/apache/iceberg/spark/data/parquet/vectorized/TestParquetVectorizedReads.java +++ b/spark/v3.3/spark/src/test/java/org/apache/iceberg/spark/data/parquet/vectorized/TestParquetVectorizedReads.java @@ -16,9 +16,11 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.data.parquet.vectorized; +import static org.apache.iceberg.types.Types.NestedField.optional; +import static org.apache.iceberg.types.Types.NestedField.required; + import java.io.File; import java.io.IOException; import java.util.Iterator; @@ -49,9 +51,6 @@ import org.junit.Ignore; import org.junit.Test; -import static org.apache.iceberg.types.Types.NestedField.optional; -import static org.apache.iceberg.types.Types.NestedField.required; - public class TestParquetVectorizedReads extends AvroDataTest { private static final int NUM_ROWS = 200_000; static final int BATCH_SIZE = 10_000; @@ -64,24 +63,44 @@ protected void writeAndValidate(Schema schema) throws IOException { } private void writeAndValidate( - Schema schema, int numRecords, long seed, float nullPercentage, - boolean setAndCheckArrowValidityVector, boolean reuseContainers) - throws IOException { - writeAndValidate(schema, numRecords, seed, nullPercentage, - setAndCheckArrowValidityVector, reuseContainers, BATCH_SIZE, IDENTITY); + Schema schema, + int numRecords, + long seed, + float nullPercentage, + boolean setAndCheckArrowValidityVector, + boolean reuseContainers) + throws IOException { + writeAndValidate( + schema, + numRecords, + seed, + nullPercentage, + setAndCheckArrowValidityVector, + reuseContainers, + BATCH_SIZE, + IDENTITY); } private void writeAndValidate( - Schema schema, int numRecords, long seed, float nullPercentage, - boolean setAndCheckArrowValidityVector, boolean reuseContainers, int batchSize, - Function transform) + Schema schema, + int numRecords, + long seed, + float nullPercentage, + boolean setAndCheckArrowValidityVector, + boolean reuseContainers, + int batchSize, + Function transform) throws IOException { // Write test data - Assume.assumeTrue("Parquet Avro cannot write non-string map keys", null == TypeUtil.find( - schema, - type -> type.isMapType() && type.asMapType().keyType() != Types.StringType.get())); + Assume.assumeTrue( + "Parquet Avro cannot write non-string map keys", + null + == TypeUtil.find( + schema, + type -> type.isMapType() 
&& type.asMapType().keyType() != Types.StringType.get())); - Iterable expected = generateData(schema, numRecords, seed, nullPercentage, transform); + Iterable expected = + generateData(schema, numRecords, seed, nullPercentage, transform); // write a test parquet file using iceberg writer File testFile = temp.newFile(); @@ -90,58 +109,74 @@ private void writeAndValidate( try (FileAppender writer = getParquetWriter(schema, testFile)) { writer.addAll(expected); } - assertRecordsMatch(schema, numRecords, expected, testFile, setAndCheckArrowValidityVector, - reuseContainers, batchSize); + assertRecordsMatch( + schema, + numRecords, + expected, + testFile, + setAndCheckArrowValidityVector, + reuseContainers, + batchSize); } protected int getNumRows() { return NUM_ROWS; } - Iterable generateData(Schema schema, int numRecords, long seed, float nullPercentage, - Function transform) { - Iterable data = RandomData.generate(schema, numRecords, seed, nullPercentage); + Iterable generateData( + Schema schema, + int numRecords, + long seed, + float nullPercentage, + Function transform) { + Iterable data = + RandomData.generate(schema, numRecords, seed, nullPercentage); return transform == IDENTITY ? data : Iterables.transform(data, transform); } - FileAppender getParquetWriter(Schema schema, File testFile) throws IOException { + FileAppender getParquetWriter(Schema schema, File testFile) + throws IOException { + return Parquet.write(Files.localOutput(testFile)).schema(schema).named("test").build(); + } + + FileAppender getParquetV2Writer(Schema schema, File testFile) + throws IOException { return Parquet.write(Files.localOutput(testFile)) .schema(schema) .named("test") + .writerVersion(ParquetProperties.WriterVersion.PARQUET_2_0) .build(); } - FileAppender getParquetV2Writer(Schema schema, File testFile) throws IOException { - return Parquet.write(Files.localOutput(testFile)) - .schema(schema) - .named("test") - .writerVersion(ParquetProperties.WriterVersion.PARQUET_2_0) - .build(); - } - void assertRecordsMatch( - Schema schema, int expectedSize, Iterable expected, File testFile, - boolean setAndCheckArrowValidityBuffer, boolean reuseContainers, int batchSize) + Schema schema, + int expectedSize, + Iterable expected, + File testFile, + boolean setAndCheckArrowValidityBuffer, + boolean reuseContainers, + int batchSize) throws IOException { - Parquet.ReadBuilder readBuilder = Parquet.read(Files.localInput(testFile)) - .project(schema) - .recordsPerBatch(batchSize) - .createBatchedReaderFunc(type -> VectorizedSparkParquetReaders.buildReader( - schema, - type, - setAndCheckArrowValidityBuffer)); + Parquet.ReadBuilder readBuilder = + Parquet.read(Files.localInput(testFile)) + .project(schema) + .recordsPerBatch(batchSize) + .createBatchedReaderFunc( + type -> + VectorizedSparkParquetReaders.buildReader( + schema, type, setAndCheckArrowValidityBuffer)); if (reuseContainers) { readBuilder.reuseContainers(); } - try (CloseableIterable batchReader = - readBuilder.build()) { + try (CloseableIterable batchReader = readBuilder.build()) { Iterator expectedIter = expected.iterator(); Iterator batches = batchReader.iterator(); int numRowsRead = 0; while (batches.hasNext()) { ColumnarBatch batch = batches.next(); numRowsRead += batch.numRows(); - TestHelpers.assertEqualsBatch(schema.asStruct(), expectedIter, batch, setAndCheckArrowValidityBuffer); + TestHelpers.assertEqualsBatch( + schema.asStruct(), expectedIter, batch, setAndCheckArrowValidityBuffer); } Assert.assertEquals(expectedSize, numRowsRead); } @@ -150,44 
+185,37 @@ void assertRecordsMatch( @Override @Test @Ignore - public void testArray() { - } + public void testArray() {} @Override @Test @Ignore - public void testArrayOfStructs() { - } + public void testArrayOfStructs() {} @Override @Test @Ignore - public void testMap() { - } + public void testMap() {} @Override @Test @Ignore - public void testNumericMapKey() { - } + public void testNumericMapKey() {} @Override @Test @Ignore - public void testComplexMapKey() { - } + public void testComplexMapKey() {} @Override @Test @Ignore - public void testMapOfStructs() { - } + public void testMapOfStructs() {} @Override @Test @Ignore - public void testMixedTypes() { - } + public void testMixedTypes() {} @Test @Override @@ -196,13 +224,13 @@ public void testNestedStruct() { "Vectorized reads are not supported yet for struct fields", UnsupportedOperationException.class, "Vectorized reads are not supported yet for struct fields", - () -> VectorizedSparkParquetReaders.buildReader( - TypeUtil.assignIncreasingFreshIds(new Schema(required( - 1, - "struct", - SUPPORTED_PRIMITIVES))), - new MessageType("struct", new GroupType(Type.Repetition.OPTIONAL, "struct").withId(1)), - false)); + () -> + VectorizedSparkParquetReaders.buildReader( + TypeUtil.assignIncreasingFreshIds( + new Schema(required(1, "struct", SUPPORTED_PRIMITIVES))), + new MessageType( + "struct", new GroupType(Type.Repetition.OPTIONAL, "struct").withId(1)), + false)); } @Test @@ -218,27 +246,40 @@ public void testMostlyNullsForOptionalFields() throws IOException { @Test public void testSettingArrowValidityVector() throws IOException { - writeAndValidate(new Schema( - Lists.transform(SUPPORTED_PRIMITIVES.fields(), Types.NestedField::asOptional)), - getNumRows(), 0L, RandomData.DEFAULT_NULL_PERCENTAGE, true, true); + writeAndValidate( + new Schema(Lists.transform(SUPPORTED_PRIMITIVES.fields(), Types.NestedField::asOptional)), + getNumRows(), + 0L, + RandomData.DEFAULT_NULL_PERCENTAGE, + true, + true); } @Test public void testVectorizedReadsWithNewContainers() throws IOException { - writeAndValidate(TypeUtil.assignIncreasingFreshIds(new Schema(SUPPORTED_PRIMITIVES.fields())), - getNumRows(), 0L, RandomData.DEFAULT_NULL_PERCENTAGE, true, false); + writeAndValidate( + TypeUtil.assignIncreasingFreshIds(new Schema(SUPPORTED_PRIMITIVES.fields())), + getNumRows(), + 0L, + RandomData.DEFAULT_NULL_PERCENTAGE, + true, + false); } @Test public void testVectorizedReadsWithReallocatedArrowBuffers() throws IOException { // With a batch size of 2, 256 bytes are allocated in the VarCharVector. By adding strings of // length 512, the vector will need to be reallocated for storing the batch. 
- writeAndValidate(new Schema( + writeAndValidate( + new Schema( Lists.newArrayList( - SUPPORTED_PRIMITIVES.field("id"), - SUPPORTED_PRIMITIVES.field("data"))), - 10, 0L, RandomData.DEFAULT_NULL_PERCENTAGE, - true, true, 2, + SUPPORTED_PRIMITIVES.field("id"), SUPPORTED_PRIMITIVES.field("data"))), + 10, + 0L, + RandomData.DEFAULT_NULL_PERCENTAGE, + true, + true, + 2, record -> { if (record.get("data") != null) { record.put("data", Strings.padEnd((String) record.get("data"), 512, 'a')); @@ -251,30 +292,29 @@ record -> { @Test public void testReadsForTypePromotedColumns() throws Exception { - Schema writeSchema = new Schema( - required(100, "id", Types.LongType.get()), - optional(101, "int_data", Types.IntegerType.get()), - optional(102, "float_data", Types.FloatType.get()), - optional(103, "decimal_data", Types.DecimalType.of(10, 5)) - ); + Schema writeSchema = + new Schema( + required(100, "id", Types.LongType.get()), + optional(101, "int_data", Types.IntegerType.get()), + optional(102, "float_data", Types.FloatType.get()), + optional(103, "decimal_data", Types.DecimalType.of(10, 5))); File dataFile = temp.newFile(); Assert.assertTrue("Delete should succeed", dataFile.delete()); - Iterable data = generateData(writeSchema, 30000, 0L, - RandomData.DEFAULT_NULL_PERCENTAGE, IDENTITY); + Iterable data = + generateData(writeSchema, 30000, 0L, RandomData.DEFAULT_NULL_PERCENTAGE, IDENTITY); try (FileAppender writer = getParquetWriter(writeSchema, dataFile)) { writer.addAll(data); } - Schema readSchema = new Schema( - required(100, "id", Types.LongType.get()), - optional(101, "int_data", Types.LongType.get()), - optional(102, "float_data", Types.DoubleType.get()), - optional(103, "decimal_data", Types.DecimalType.of(25, 5)) - ); + Schema readSchema = + new Schema( + required(100, "id", Types.LongType.get()), + optional(101, "int_data", Types.LongType.get()), + optional(102, "float_data", Types.DoubleType.get()), + optional(103, "decimal_data", Types.DecimalType.of(25, 5))); - assertRecordsMatch(readSchema, 30000, data, dataFile, false, - true, BATCH_SIZE); + assertRecordsMatch(readSchema, 30000, data, dataFile, false, true, BATCH_SIZE); } @Test @@ -282,38 +322,40 @@ public void testSupportedReadsForParquetV2() throws Exception { // Float and double column types are written using plain encoding with Parquet V2, // also Parquet V2 will dictionary encode decimals that use fixed length binary // (i.e. 
decimals > 8 bytes) - Schema schema = new Schema( + Schema schema = + new Schema( optional(102, "float_data", Types.FloatType.get()), optional(103, "double_data", Types.DoubleType.get()), - optional(104, "decimal_data", Types.DecimalType.of(25, 5)) - ); + optional(104, "decimal_data", Types.DecimalType.of(25, 5))); File dataFile = temp.newFile(); Assert.assertTrue("Delete should succeed", dataFile.delete()); - Iterable data = generateData(schema, 30000, 0L, - RandomData.DEFAULT_NULL_PERCENTAGE, IDENTITY); + Iterable data = + generateData(schema, 30000, 0L, RandomData.DEFAULT_NULL_PERCENTAGE, IDENTITY); try (FileAppender writer = getParquetV2Writer(schema, dataFile)) { writer.addAll(data); } - assertRecordsMatch(schema, 30000, data, dataFile, false, - true, BATCH_SIZE); + assertRecordsMatch(schema, 30000, data, dataFile, false, true, BATCH_SIZE); } @Test public void testUnsupportedReadsForParquetV2() throws Exception { - // Longs, ints, string types etc use delta encoding and which are not supported for vectorized reads + // Longs, ints, string types etc use delta encoding and which are not supported for vectorized + // reads Schema schema = new Schema(SUPPORTED_PRIMITIVES.fields()); File dataFile = temp.newFile(); Assert.assertTrue("Delete should succeed", dataFile.delete()); - Iterable data = generateData(schema, 30000, 0L, - RandomData.DEFAULT_NULL_PERCENTAGE, IDENTITY); + Iterable data = + generateData(schema, 30000, 0L, RandomData.DEFAULT_NULL_PERCENTAGE, IDENTITY); try (FileAppender writer = getParquetV2Writer(schema, dataFile)) { writer.addAll(data); } - AssertHelpers.assertThrows("Vectorized reads not supported", - UnsupportedOperationException.class, "Cannot support vectorized reads for column", () -> { - assertRecordsMatch(schema, 30000, data, dataFile, false, - true, BATCH_SIZE); + AssertHelpers.assertThrows( + "Vectorized reads not supported", + UnsupportedOperationException.class, + "Cannot support vectorized reads for column", + () -> { + assertRecordsMatch(schema, 30000, data, dataFile, false, true, BATCH_SIZE); return null; }); } diff --git a/spark/v3.3/spark/src/test/java/org/apache/iceberg/spark/source/FilePathLastModifiedRecord.java b/spark/v3.3/spark/src/test/java/org/apache/iceberg/spark/source/FilePathLastModifiedRecord.java index 275e3a520db5..c62c1de6ba33 100644 --- a/spark/v3.3/spark/src/test/java/org/apache/iceberg/spark/source/FilePathLastModifiedRecord.java +++ b/spark/v3.3/spark/src/test/java/org/apache/iceberg/spark/source/FilePathLastModifiedRecord.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.spark.source; import java.sql.Timestamp; @@ -26,8 +25,7 @@ public class FilePathLastModifiedRecord { private String filePath; private Timestamp lastModified; - public FilePathLastModifiedRecord() { - } + public FilePathLastModifiedRecord() {} public FilePathLastModifiedRecord(String filePath, Timestamp lastModified) { this.filePath = filePath; @@ -59,8 +57,8 @@ public boolean equals(Object o) { return false; } FilePathLastModifiedRecord that = (FilePathLastModifiedRecord) o; - return Objects.equals(filePath, that.filePath) && - Objects.equals(lastModified, that.lastModified); + return Objects.equals(filePath, that.filePath) + && Objects.equals(lastModified, that.lastModified); } @Override @@ -70,9 +68,13 @@ public int hashCode() { @Override public String toString() { - return "FilePathLastModifiedRecord{" + - "filePath='" + filePath + '\'' + - ", lastModified='" + lastModified + '\'' + - '}'; + return "FilePathLastModifiedRecord{" + + "filePath='" + + filePath + + '\'' + + ", lastModified='" + + lastModified + + '\'' + + '}'; } } diff --git a/spark/v3.3/spark/src/test/java/org/apache/iceberg/spark/source/LogMessage.java b/spark/v3.3/spark/src/test/java/org/apache/iceberg/spark/source/LogMessage.java index 5e22daeb0841..53a35eec61ce 100644 --- a/spark/v3.3/spark/src/test/java/org/apache/iceberg/spark/source/LogMessage.java +++ b/spark/v3.3/spark/src/test/java/org/apache/iceberg/spark/source/LogMessage.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.source; import java.time.Instant; diff --git a/spark/v3.3/spark/src/test/java/org/apache/iceberg/spark/source/ManualSource.java b/spark/v3.3/spark/src/test/java/org/apache/iceberg/spark/source/ManualSource.java index 0f8c8b3b65c6..c9c1c29ea8fc 100644 --- a/spark/v3.3/spark/src/test/java/org/apache/iceberg/spark/source/ManualSource.java +++ b/spark/v3.3/spark/src/test/java/org/apache/iceberg/spark/source/ManualSource.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.source; import java.util.Map; @@ -35,7 +34,8 @@ public class ManualSource implements TableProvider, DataSourceRegister { private static final Map tableMap = Maps.newHashMap(); public static void setTable(String name, Table table) { - Preconditions.checkArgument(!tableMap.containsKey(name), "Cannot set " + name + ". It is already set"); + Preconditions.checkArgument( + !tableMap.containsKey(name), "Cannot set " + name + ". 
It is already set"); tableMap.put(name, table); } @@ -61,7 +61,8 @@ public Transform[] inferPartitioning(CaseInsensitiveStringMap options) { @Override public org.apache.spark.sql.connector.catalog.Table getTable( StructType schema, Transform[] partitioning, Map properties) { - Preconditions.checkArgument(properties.containsKey(TABLE_NAME), "Missing property " + TABLE_NAME); + Preconditions.checkArgument( + properties.containsKey(TABLE_NAME), "Missing property " + TABLE_NAME); String tableName = properties.get(TABLE_NAME); Preconditions.checkArgument(tableMap.containsKey(tableName), "Table missing " + tableName); return tableMap.get(tableName); diff --git a/spark/v3.3/spark/src/test/java/org/apache/iceberg/spark/source/SimpleRecord.java b/spark/v3.3/spark/src/test/java/org/apache/iceberg/spark/source/SimpleRecord.java index c8b7a31b3ba0..550e20b9338e 100644 --- a/spark/v3.3/spark/src/test/java/org/apache/iceberg/spark/source/SimpleRecord.java +++ b/spark/v3.3/spark/src/test/java/org/apache/iceberg/spark/source/SimpleRecord.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.source; import org.apache.iceberg.relocated.com.google.common.base.Objects; @@ -25,8 +24,7 @@ public class SimpleRecord { private Integer id; private String data; - public SimpleRecord() { - } + public SimpleRecord() {} public SimpleRecord(Integer id, String data) { this.id = id; diff --git a/spark/v3.3/spark/src/test/java/org/apache/iceberg/spark/source/TestAvroScan.java b/spark/v3.3/spark/src/test/java/org/apache/iceberg/spark/source/TestAvroScan.java index 4d2e12229813..9491adde4605 100644 --- a/spark/v3.3/spark/src/test/java/org/apache/iceberg/spark/source/TestAvroScan.java +++ b/spark/v3.3/spark/src/test/java/org/apache/iceberg/spark/source/TestAvroScan.java @@ -16,9 +16,10 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.spark.source; +import static org.apache.iceberg.Files.localOutput; + import java.io.File; import java.io.IOException; import java.util.List; @@ -46,13 +47,10 @@ import org.junit.Rule; import org.junit.rules.TemporaryFolder; -import static org.apache.iceberg.Files.localOutput; - public class TestAvroScan extends AvroDataTest { private static final Configuration CONF = new Configuration(); - @Rule - public TemporaryFolder temp = new TemporaryFolder(); + @Rule public TemporaryFolder temp = new TemporaryFolder(); private static SparkSession spark = null; @@ -75,8 +73,8 @@ protected void writeAndValidate(Schema schema) throws IOException { File dataFolder = new File(location, "data"); dataFolder.mkdirs(); - File avroFile = new File(dataFolder, - FileFormat.AVRO.addExtension(UUID.randomUUID().toString())); + File avroFile = + new File(dataFolder, FileFormat.AVRO.addExtension(UUID.randomUUID().toString())); HadoopTables tables = new HadoopTables(CONF); Table table = tables.create(schema, PartitionSpec.unpartitioned(), location.toString()); @@ -87,23 +85,21 @@ protected void writeAndValidate(Schema schema) throws IOException { List expected = RandomData.generateList(tableSchema, 100, 1L); - try (FileAppender writer = Avro.write(localOutput(avroFile)) - .schema(tableSchema) - .build()) { + try (FileAppender writer = + Avro.write(localOutput(avroFile)).schema(tableSchema).build()) { writer.addAll(expected); } - DataFile file = DataFiles.builder(PartitionSpec.unpartitioned()) - .withRecordCount(100) - .withFileSizeInBytes(avroFile.length()) - .withPath(avroFile.toString()) - .build(); + DataFile file = + DataFiles.builder(PartitionSpec.unpartitioned()) + .withRecordCount(100) + .withFileSizeInBytes(avroFile.length()) + .withPath(avroFile.toString()) + .build(); table.newAppend().appendFile(file).commit(); - Dataset df = spark.read() - .format("iceberg") - .load(location.toString()); + Dataset df = spark.read().format("iceberg").load(location.toString()); List rows = df.collectAsList(); Assert.assertEquals("Should contain 100 rows", 100, rows.size()); diff --git a/spark/v3.3/spark/src/test/java/org/apache/iceberg/spark/source/TestBaseReader.java b/spark/v3.3/spark/src/test/java/org/apache/iceberg/spark/source/TestBaseReader.java index 7f15cb28fa6b..cbcee867803f 100644 --- a/spark/v3.3/spark/src/test/java/org/apache/iceberg/spark/source/TestBaseReader.java +++ b/spark/v3.3/spark/src/test/java/org/apache/iceberg/spark/source/TestBaseReader.java @@ -16,9 +16,11 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.spark.source; +import static org.apache.iceberg.FileFormat.PARQUET; +import static org.apache.iceberg.Files.localOutput; + import java.io.File; import java.io.IOException; import java.util.Iterator; @@ -50,13 +52,9 @@ import org.junit.Test; import org.junit.rules.TemporaryFolder; -import static org.apache.iceberg.FileFormat.PARQUET; -import static org.apache.iceberg.Files.localOutput; - public class TestBaseReader { - @Rule - public TemporaryFolder temp = new TemporaryFolder(); + @Rule public TemporaryFolder temp = new TemporaryFolder(); private Table table; @@ -134,14 +132,12 @@ public void testClosureOnDataExhaustion() throws IOException { Assert.assertNotNull("Reader should return non-null value", reader.get()); } - Assert.assertEquals("Reader returned incorrect number of records", - totalTasks * recordPerTask, - countRecords - ); - tasks.forEach(t -> - Assert.assertTrue("All iterators should be closed after read exhausion", - reader.isIteratorClosed(t)) - ); + Assert.assertEquals( + "Reader returned incorrect number of records", totalTasks * recordPerTask, countRecords); + tasks.forEach( + t -> + Assert.assertTrue( + "All iterators should be closed after read exhausion", reader.isIteratorClosed(t))); } @Test @@ -157,13 +153,15 @@ public void testClosureDuringIteration() throws IOException { // Total of 2 elements Assert.assertTrue(reader.next()); - Assert.assertFalse("First iter should not be closed on its last element", - reader.isIteratorClosed(firstTask)); + Assert.assertFalse( + "First iter should not be closed on its last element", reader.isIteratorClosed(firstTask)); Assert.assertTrue(reader.next()); - Assert.assertTrue("First iter should be closed after moving to second iter", + Assert.assertTrue( + "First iter should be closed after moving to second iter", reader.isIteratorClosed(firstTask)); - Assert.assertFalse("Second iter should not be closed on its last element", + Assert.assertFalse( + "Second iter should not be closed on its last element", reader.isIteratorClosed(secondTask)); Assert.assertFalse(reader.next()); @@ -181,10 +179,10 @@ public void testClosureWithoutAnyRead() throws IOException { reader.close(); - tasks.forEach(t -> - Assert.assertFalse("Iterator should not be created eagerly for tasks", - reader.hasIterator(t)) - ); + tasks.forEach( + t -> + Assert.assertFalse( + "Iterator should not be created eagerly for tasks", reader.hasIterator(t))); } @Test @@ -205,12 +203,13 @@ public void testExplicitClosure() throws IOException { // Some tasks might have not been opened yet, so we don't have corresponding tracker for it. // But all that have been created must be closed. 
- tasks.forEach(t -> { - if (reader.hasIterator(t)) { - Assert.assertTrue("Iterator should be closed after read exhausion", - reader.isIteratorClosed(t)); - } - }); + tasks.forEach( + t -> { + if (reader.hasIterator(t)) { + Assert.assertTrue( + "Iterator should be closed after read exhausion", reader.isIteratorClosed(t)); + } + }); } @Test @@ -230,26 +229,26 @@ public void testIdempotentExplicitClosure() throws IOException { for (int closeAttempt = 0; closeAttempt < 5; closeAttempt++) { reader.close(); for (int i = 0; i < 5; i++) { - Assert.assertTrue("Iterator should be closed after read exhausion", + Assert.assertTrue( + "Iterator should be closed after read exhausion", reader.isIteratorClosed(tasks.get(i))); } for (int i = 5; i < 10; i++) { - Assert.assertFalse("Iterator should not be created eagerly for tasks", - reader.hasIterator(tasks.get(i))); + Assert.assertFalse( + "Iterator should not be created eagerly for tasks", reader.hasIterator(tasks.get(i))); } } } - private List createFileScanTasks(Integer totalTasks, Integer recordPerTask) throws IOException { + private List createFileScanTasks(Integer totalTasks, Integer recordPerTask) + throws IOException { String desc = "make_scan_tasks"; File parent = temp.newFolder(desc); File location = new File(parent, "test"); File dataFolder = new File(location, "data"); Assert.assertTrue("mkdirs should succeed", dataFolder.mkdirs()); - Schema schema = new Schema( - Types.NestedField.required(0, "id", Types.LongType.get()) - ); + Schema schema = new Schema(Types.NestedField.required(0, "id", Types.LongType.get())); try { this.table = TestTables.create(location, desc, schema, PartitionSpec.unpartitioned()); @@ -261,22 +260,21 @@ private List createFileScanTasks(Integer totalTasks, Integer recor AppendFiles appendFiles = table.newAppend(); for (int i = 0; i < totalTasks; i++) { File parquetFile = new File(dataFolder, PARQUET.addExtension(UUID.randomUUID().toString())); - try (FileAppender writer = Parquet.write(localOutput(parquetFile)) - .schema(tableSchema) - .build()) { + try (FileAppender writer = + Parquet.write(localOutput(parquetFile)).schema(tableSchema).build()) { writer.addAll(expected); } - DataFile file = DataFiles.builder(PartitionSpec.unpartitioned()) - .withFileSizeInBytes(parquetFile.length()) - .withPath(parquetFile.toString()) - .withRecordCount(recordPerTask) - .build(); + DataFile file = + DataFiles.builder(PartitionSpec.unpartitioned()) + .withFileSizeInBytes(parquetFile.length()) + .withPath(parquetFile.toString()) + .withRecordCount(recordPerTask) + .build(); appendFiles.appendFile(file); } appendFiles.commit(); - return StreamSupport - .stream(table.newScan().planFiles().spliterator(), false) + return StreamSupport.stream(table.newScan().planFiles().spliterator(), false) .collect(Collectors.toList()); } finally { TestTables.clearTables(); diff --git a/spark/v3.3/spark/src/test/java/org/apache/iceberg/spark/source/TestDataFrameWriterV2.java b/spark/v3.3/spark/src/test/java/org/apache/iceberg/spark/source/TestDataFrameWriterV2.java index fdee5911994e..6b4e77d1de3f 100644 --- a/spark/v3.3/spark/src/test/java/org/apache/iceberg/spark/source/TestDataFrameWriterV2.java +++ b/spark/v3.3/spark/src/test/java/org/apache/iceberg/spark/source/TestDataFrameWriterV2.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.spark.source; import org.apache.iceberg.AssertHelpers; @@ -44,27 +43,35 @@ public void removeTables() { @Test public void testMergeSchemaFailsWithoutWriterOption() throws Exception { - sql("ALTER TABLE %s SET TBLPROPERTIES ('%s'='true')", tableName, TableProperties.SPARK_WRITE_ACCEPT_ANY_SCHEMA); + sql( + "ALTER TABLE %s SET TBLPROPERTIES ('%s'='true')", + tableName, TableProperties.SPARK_WRITE_ACCEPT_ANY_SCHEMA); - Dataset twoColDF = jsonToDF( - "id bigint, data string", - "{ \"id\": 1, \"data\": \"a\" }", - "{ \"id\": 2, \"data\": \"b\" }"); + Dataset twoColDF = + jsonToDF( + "id bigint, data string", + "{ \"id\": 1, \"data\": \"a\" }", + "{ \"id\": 2, \"data\": \"b\" }"); twoColDF.writeTo(tableName).append(); - assertEquals("Should have initial 2-column rows", + assertEquals( + "Should have initial 2-column rows", ImmutableList.of(row(1L, "a"), row(2L, "b")), sql("select * from %s order by id", tableName)); - Dataset threeColDF = jsonToDF( - "id bigint, data string, new_col float", - "{ \"id\": 3, \"data\": \"c\", \"new_col\": 12.06 }", - "{ \"id\": 4, \"data\": \"d\", \"new_col\": 14.41 }"); - - // this has a different error message than the case without accept-any-schema because it uses Iceberg checks - AssertHelpers.assertThrows("Should fail when merge-schema is not enabled on the writer", - IllegalArgumentException.class, "Field new_col not found in source schema", + Dataset threeColDF = + jsonToDF( + "id bigint, data string, new_col float", + "{ \"id\": 3, \"data\": \"c\", \"new_col\": 12.06 }", + "{ \"id\": 4, \"data\": \"d\", \"new_col\": 14.41 }"); + + // this has a different error message than the case without accept-any-schema because it uses + // Iceberg checks + AssertHelpers.assertThrows( + "Should fail when merge-schema is not enabled on the writer", + IllegalArgumentException.class, + "Field new_col not found in source schema", () -> { try { threeColDF.writeTo(tableName).append(); @@ -77,24 +84,29 @@ public void testMergeSchemaFailsWithoutWriterOption() throws Exception { @Test public void testMergeSchemaWithoutAcceptAnySchema() throws Exception { - Dataset twoColDF = jsonToDF( - "id bigint, data string", - "{ \"id\": 1, \"data\": \"a\" }", - "{ \"id\": 2, \"data\": \"b\" }"); + Dataset twoColDF = + jsonToDF( + "id bigint, data string", + "{ \"id\": 1, \"data\": \"a\" }", + "{ \"id\": 2, \"data\": \"b\" }"); twoColDF.writeTo(tableName).append(); - assertEquals("Should have initial 2-column rows", + assertEquals( + "Should have initial 2-column rows", ImmutableList.of(row(1L, "a"), row(2L, "b")), sql("select * from %s order by id", tableName)); - Dataset threeColDF = jsonToDF( - "id bigint, data string, new_col float", - "{ \"id\": 3, \"data\": \"c\", \"new_col\": 12.06 }", - "{ \"id\": 4, \"data\": \"d\", \"new_col\": 14.41 }"); + Dataset threeColDF = + jsonToDF( + "id bigint, data string, new_col float", + "{ \"id\": 3, \"data\": \"c\", \"new_col\": 12.06 }", + "{ \"id\": 4, \"data\": \"d\", \"new_col\": 14.41 }"); - AssertHelpers.assertThrows("Should fail when accept-any-schema is not enabled on the table", - AnalysisException.class, "too many data columns", + AssertHelpers.assertThrows( + "Should fail when accept-any-schema is not enabled on the table", + AnalysisException.class, + "too many data columns", () -> { try { threeColDF.writeTo(tableName).option("merge-schema", "true").append(); @@ -107,55 +119,69 @@ public void testMergeSchemaWithoutAcceptAnySchema() throws Exception { @Test public void testMergeSchemaSparkProperty() throws 
Exception { - sql("ALTER TABLE %s SET TBLPROPERTIES ('%s'='true')", tableName, TableProperties.SPARK_WRITE_ACCEPT_ANY_SCHEMA); + sql( + "ALTER TABLE %s SET TBLPROPERTIES ('%s'='true')", + tableName, TableProperties.SPARK_WRITE_ACCEPT_ANY_SCHEMA); - Dataset twoColDF = jsonToDF( - "id bigint, data string", - "{ \"id\": 1, \"data\": \"a\" }", - "{ \"id\": 2, \"data\": \"b\" }"); + Dataset twoColDF = + jsonToDF( + "id bigint, data string", + "{ \"id\": 1, \"data\": \"a\" }", + "{ \"id\": 2, \"data\": \"b\" }"); twoColDF.writeTo(tableName).append(); - assertEquals("Should have initial 2-column rows", + assertEquals( + "Should have initial 2-column rows", ImmutableList.of(row(1L, "a"), row(2L, "b")), sql("select * from %s order by id", tableName)); - Dataset threeColDF = jsonToDF( - "id bigint, data string, new_col float", - "{ \"id\": 3, \"data\": \"c\", \"new_col\": 12.06 }", - "{ \"id\": 4, \"data\": \"d\", \"new_col\": 14.41 }"); + Dataset threeColDF = + jsonToDF( + "id bigint, data string, new_col float", + "{ \"id\": 3, \"data\": \"c\", \"new_col\": 12.06 }", + "{ \"id\": 4, \"data\": \"d\", \"new_col\": 14.41 }"); threeColDF.writeTo(tableName).option("mergeSchema", "true").append(); - assertEquals("Should have 3-column rows", - ImmutableList.of(row(1L, "a", null), row(2L, "b", null), row(3L, "c", 12.06F), row(4L, "d", 14.41F)), + assertEquals( + "Should have 3-column rows", + ImmutableList.of( + row(1L, "a", null), row(2L, "b", null), row(3L, "c", 12.06F), row(4L, "d", 14.41F)), sql("select * from %s order by id", tableName)); } @Test public void testMergeSchemaIcebergProperty() throws Exception { - sql("ALTER TABLE %s SET TBLPROPERTIES ('%s'='true')", tableName, TableProperties.SPARK_WRITE_ACCEPT_ANY_SCHEMA); + sql( + "ALTER TABLE %s SET TBLPROPERTIES ('%s'='true')", + tableName, TableProperties.SPARK_WRITE_ACCEPT_ANY_SCHEMA); - Dataset twoColDF = jsonToDF( - "id bigint, data string", - "{ \"id\": 1, \"data\": \"a\" }", - "{ \"id\": 2, \"data\": \"b\" }"); + Dataset twoColDF = + jsonToDF( + "id bigint, data string", + "{ \"id\": 1, \"data\": \"a\" }", + "{ \"id\": 2, \"data\": \"b\" }"); twoColDF.writeTo(tableName).append(); - assertEquals("Should have initial 2-column rows", + assertEquals( + "Should have initial 2-column rows", ImmutableList.of(row(1L, "a"), row(2L, "b")), sql("select * from %s order by id", tableName)); - Dataset threeColDF = jsonToDF( - "id bigint, data string, new_col float", - "{ \"id\": 3, \"data\": \"c\", \"new_col\": 12.06 }", - "{ \"id\": 4, \"data\": \"d\", \"new_col\": 14.41 }"); + Dataset threeColDF = + jsonToDF( + "id bigint, data string, new_col float", + "{ \"id\": 3, \"data\": \"c\", \"new_col\": 12.06 }", + "{ \"id\": 4, \"data\": \"d\", \"new_col\": 14.41 }"); threeColDF.writeTo(tableName).option("merge-schema", "true").append(); - assertEquals("Should have 3-column rows", - ImmutableList.of(row(1L, "a", null), row(2L, "b", null), row(3L, "c", 12.06F), row(4L, "d", 14.41F)), + assertEquals( + "Should have 3-column rows", + ImmutableList.of( + row(1L, "a", null), row(2L, "b", null), row(3L, "c", 12.06F), row(4L, "d", 14.41F)), sql("select * from %s order by id", tableName)); } } diff --git a/spark/v3.3/spark/src/test/java/org/apache/iceberg/spark/source/TestDataFrameWrites.java b/spark/v3.3/spark/src/test/java/org/apache/iceberg/spark/source/TestDataFrameWrites.java index b0a77b72b431..9f32769379c8 100644 --- a/spark/v3.3/spark/src/test/java/org/apache/iceberg/spark/source/TestDataFrameWrites.java +++ 
b/spark/v3.3/spark/src/test/java/org/apache/iceberg/spark/source/TestDataFrameWrites.java @@ -16,9 +16,12 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.source; +import static org.apache.iceberg.spark.SparkSchemaUtil.convert; +import static org.apache.iceberg.spark.data.TestHelpers.assertEqualsSafe; +import static org.apache.iceberg.spark.data.TestHelpers.assertEqualsUnsafe; + import java.io.File; import java.io.IOException; import java.net.URI; @@ -68,10 +71,6 @@ import org.junit.runner.RunWith; import org.junit.runners.Parameterized; -import static org.apache.iceberg.spark.SparkSchemaUtil.convert; -import static org.apache.iceberg.spark.data.TestHelpers.assertEqualsSafe; -import static org.apache.iceberg.spark.data.TestHelpers.assertEqualsUnsafe; - @RunWith(Parameterized.class) public class TestDataFrameWrites extends AvroDataTest { private static final Configuration CONF = new Configuration(); @@ -80,7 +79,7 @@ public class TestDataFrameWrites extends AvroDataTest { @Parameterized.Parameters(name = "format = {0}") public static Object[] parameters() { - return new Object[] { "parquet", "avro", "orc" }; + return new Object[] {"parquet", "avro", "orc"}; } public TestDataFrameWrites(String format) { @@ -92,32 +91,36 @@ public TestDataFrameWrites(String format) { private Map tableProperties; - private org.apache.spark.sql.types.StructType sparkSchema = new org.apache.spark.sql.types.StructType( - new org.apache.spark.sql.types.StructField[] { - new org.apache.spark.sql.types.StructField( - "optionalField", - org.apache.spark.sql.types.DataTypes.StringType, - true, - org.apache.spark.sql.types.Metadata.empty()), - new org.apache.spark.sql.types.StructField( - "requiredField", - org.apache.spark.sql.types.DataTypes.StringType, - false, - org.apache.spark.sql.types.Metadata.empty()) - }); - - private Schema icebergSchema = new Schema( - Types.NestedField.optional(1, "optionalField", Types.StringType.get()), - Types.NestedField.required(2, "requiredField", Types.StringType.get())); - - private List data0 = Arrays.asList( - "{\"optionalField\": \"a1\", \"requiredField\": \"bid_001\"}", - "{\"optionalField\": \"a2\", \"requiredField\": \"bid_002\"}"); - private List data1 = Arrays.asList( - "{\"optionalField\": \"d1\", \"requiredField\": \"bid_101\"}", - "{\"optionalField\": \"d2\", \"requiredField\": \"bid_102\"}", - "{\"optionalField\": \"d3\", \"requiredField\": \"bid_103\"}", - "{\"optionalField\": \"d4\", \"requiredField\": \"bid_104\"}"); + private org.apache.spark.sql.types.StructType sparkSchema = + new org.apache.spark.sql.types.StructType( + new org.apache.spark.sql.types.StructField[] { + new org.apache.spark.sql.types.StructField( + "optionalField", + org.apache.spark.sql.types.DataTypes.StringType, + true, + org.apache.spark.sql.types.Metadata.empty()), + new org.apache.spark.sql.types.StructField( + "requiredField", + org.apache.spark.sql.types.DataTypes.StringType, + false, + org.apache.spark.sql.types.Metadata.empty()) + }); + + private Schema icebergSchema = + new Schema( + Types.NestedField.optional(1, "optionalField", Types.StringType.get()), + Types.NestedField.required(2, "requiredField", Types.StringType.get())); + + private List data0 = + Arrays.asList( + "{\"optionalField\": \"a1\", \"requiredField\": \"bid_001\"}", + "{\"optionalField\": \"a2\", \"requiredField\": \"bid_002\"}"); + private List data1 = + Arrays.asList( + "{\"optionalField\": \"d1\", \"requiredField\": \"bid_101\"}", + 
"{\"optionalField\": \"d2\", \"requiredField\": \"bid_102\"}", + "{\"optionalField\": \"d3\", \"requiredField\": \"bid_103\"}", + "{\"optionalField\": \"d4\", \"requiredField\": \"bid_104\"}"); @BeforeClass public static void startSpark() { @@ -145,8 +148,10 @@ public void testWriteWithCustomDataLocation() throws IOException { File location = createTableFolder(); File tablePropertyDataLocation = temp.newFolder("test-table-property-data-dir"); Table table = createTable(new Schema(SUPPORTED_PRIMITIVES.fields()), location); - table.updateProperties().set( - TableProperties.WRITE_DATA_LOCATION, tablePropertyDataLocation.getAbsolutePath()).commit(); + table + .updateProperties() + .set(TableProperties.WRITE_DATA_LOCATION, tablePropertyDataLocation.getAbsolutePath()) + .commit(); writeAndValidateWithLocations(table, location, tablePropertyDataLocation); } @@ -162,7 +167,8 @@ private Table createTable(Schema schema, File location) { return tables.create(schema, PartitionSpec.unpartitioned(), location.toString()); } - private void writeAndValidateWithLocations(Table table, File location, File expectedDataDir) throws IOException { + private void writeAndValidateWithLocations(Table table, File location, File expectedDataDir) + throws IOException { Schema tableSchema = table.schema(); // use the table schema because ids are reassigned table.updateProperties().set(TableProperties.DEFAULT_FILE_FORMAT, format).commit(); @@ -179,47 +185,56 @@ private void writeAndValidateWithLocations(Table table, File location, File expe while (expectedIter.hasNext() && actualIter.hasNext()) { assertEqualsSafe(tableSchema.asStruct(), expectedIter.next(), actualIter.next()); } - Assert.assertEquals("Both iterators should be exhausted", expectedIter.hasNext(), actualIter.hasNext()); - - table.currentSnapshot().addedDataFiles(table.io()).forEach(dataFile -> - Assert.assertTrue( - String.format( - "File should have the parent directory %s, but has: %s.", - expectedDataDir.getAbsolutePath(), - dataFile.path()), - URI.create(dataFile.path().toString()).getPath().startsWith(expectedDataDir.getAbsolutePath()))); + Assert.assertEquals( + "Both iterators should be exhausted", expectedIter.hasNext(), actualIter.hasNext()); + + table + .currentSnapshot() + .addedDataFiles(table.io()) + .forEach( + dataFile -> + Assert.assertTrue( + String.format( + "File should have the parent directory %s, but has: %s.", + expectedDataDir.getAbsolutePath(), dataFile.path()), + URI.create(dataFile.path().toString()) + .getPath() + .startsWith(expectedDataDir.getAbsolutePath()))); } private List readTable(String location) { - Dataset result = spark.read() - .format("iceberg") - .load(location); + Dataset result = spark.read().format("iceberg").load(location); return result.collectAsList(); } - private void writeData(Iterable records, Schema schema, String location) throws IOException { + private void writeData(Iterable records, Schema schema, String location) + throws IOException { Dataset df = createDataset(records, schema); DataFrameWriter writer = df.write().format("iceberg").mode("append"); writer.save(location); } - private void writeDataWithFailOnPartition(Iterable records, Schema schema, String location) - throws IOException, SparkException { + private void writeDataWithFailOnPartition( + Iterable records, Schema schema, String location) throws IOException, SparkException { final int numPartitions = 10; final int partitionToFail = new Random().nextInt(numPartitions); - MapPartitionsFunction failOnFirstPartitionFunc = (MapPartitionsFunction) 
input -> { - int partitionId = TaskContext.getPartitionId(); - - if (partitionId == partitionToFail) { - throw new SparkException(String.format("Intended exception in partition %d !", partitionId)); - } - return input; - }; - - Dataset df = createDataset(records, schema) - .repartition(numPartitions) - .mapPartitions(failOnFirstPartitionFunc, RowEncoder.apply(convert(schema))); + MapPartitionsFunction failOnFirstPartitionFunc = + (MapPartitionsFunction) + input -> { + int partitionId = TaskContext.getPartitionId(); + + if (partitionId == partitionToFail) { + throw new SparkException( + String.format("Intended exception in partition %d !", partitionId)); + } + return input; + }; + + Dataset df = + createDataset(records, schema) + .repartition(numPartitions) + .mapPartitions(failOnFirstPartitionFunc, RowEncoder.apply(convert(schema))); // This trick is needed because Spark 3 handles decimal overflow in RowEncoder which "changes" // nullability of the column to "true" regardless of original nullability. // Setting "check-nullability" option to "false" doesn't help as it fails at Spark analyzer. @@ -234,10 +249,8 @@ private Dataset createDataset(Iterable records, Schema schema) thro File testFile = temp.newFile(); Assert.assertTrue("Delete should succeed", testFile.delete()); - try (FileAppender writer = Avro.write(Files.localOutput(testFile)) - .schema(schema) - .named("test") - .build()) { + try (FileAppender writer = + Avro.write(Files.localOutput(testFile)).schema(schema).named("test").build()) { for (Record rec : records) { writer.add(rec); } @@ -245,10 +258,11 @@ private Dataset createDataset(Iterable records, Schema schema) thro // make sure the dataframe matches the records before moving on List rows = Lists.newArrayList(); - try (AvroIterable reader = Avro.read(Files.localInput(testFile)) - .createReaderFunc(SparkAvroReader::new) - .project(schema) - .build()) { + try (AvroIterable reader = + Avro.read(Files.localInput(testFile)) + .createReaderFunc(SparkAvroReader::new) + .project(schema) + .build()) { Iterator recordIter = records.iterator(); Iterator readIter = reader.iterator(); @@ -257,7 +271,8 @@ private Dataset createDataset(Iterable records, Schema schema) thro assertEqualsUnsafe(schema.asStruct(), recordIter.next(), row); rows.add(row); } - Assert.assertEquals("Both iterators should be exhausted", recordIter.hasNext(), readIter.hasNext()); + Assert.assertEquals( + "Both iterators should be exhausted", recordIter.hasNext(), readIter.hasNext()); } JavaRDD rdd = sc.parallelize(rows); @@ -266,7 +281,8 @@ private Dataset createDataset(Iterable records, Schema schema) thro @Test public void testNullableWithWriteOption() throws IOException { - Assume.assumeTrue("Spark 3.0 rejects writing nulls to a required column", spark.version().startsWith("2")); + Assume.assumeTrue( + "Spark 3.0 rejects writing nulls to a required column", spark.version().startsWith("2")); File location = new File(temp.newFolder("parquet"), "test"); String sourcePath = String.format("%s/nullable_poc/sourceFolder/", location.toString()); @@ -276,9 +292,11 @@ public void testNullableWithWriteOption() throws IOException { // read this and append to iceberg dataset spark - .read().schema(sparkSchema).json( - JavaSparkContext.fromSparkContext(spark.sparkContext()).parallelize(data1)) - .write().parquet(sourcePath); + .read() + .schema(sparkSchema) + .json(JavaSparkContext.fromSparkContext(spark.sparkContext()).parallelize(data1)) + .write() + .parquet(sourcePath); // this is our iceberg dataset to which we will 
append data new HadoopTables(spark.sessionState().newHadoopConf()) @@ -290,15 +308,24 @@ public void testNullableWithWriteOption() throws IOException { // this is the initial data inside the iceberg dataset spark - .read().schema(sparkSchema).json( - JavaSparkContext.fromSparkContext(spark.sparkContext()).parallelize(data0)) - .write().format("iceberg").mode(SaveMode.Append).save(targetPath); + .read() + .schema(sparkSchema) + .json(JavaSparkContext.fromSparkContext(spark.sparkContext()).parallelize(data0)) + .write() + .format("iceberg") + .mode(SaveMode.Append) + .save(targetPath); // read from parquet and append to iceberg w/ nullability check disabled spark - .read().schema(SparkSchemaUtil.convert(icebergSchema)).parquet(sourcePath) - .write().format("iceberg").option(SparkWriteOptions.CHECK_NULLABILITY, false) - .mode(SaveMode.Append).save(targetPath); + .read() + .schema(SparkSchemaUtil.convert(icebergSchema)) + .parquet(sourcePath) + .write() + .format("iceberg") + .option(SparkWriteOptions.CHECK_NULLABILITY, false) + .mode(SaveMode.Append) + .save(targetPath); // read all data List rows = spark.read().format("iceberg").load(targetPath).collectAsList(); @@ -307,7 +334,8 @@ public void testNullableWithWriteOption() throws IOException { @Test public void testNullableWithSparkSqlOption() throws IOException { - Assume.assumeTrue("Spark 3.0 rejects writing nulls to a required column", spark.version().startsWith("2")); + Assume.assumeTrue( + "Spark 3.0 rejects writing nulls to a required column", spark.version().startsWith("2")); File location = new File(temp.newFolder("parquet"), "test"); String sourcePath = String.format("%s/nullable_poc/sourceFolder/", location.toString()); @@ -317,15 +345,18 @@ public void testNullableWithSparkSqlOption() throws IOException { // read this and append to iceberg dataset spark - .read().schema(sparkSchema).json( - JavaSparkContext.fromSparkContext(spark.sparkContext()).parallelize(data1)) - .write().parquet(sourcePath); - - SparkSession newSparkSession = SparkSession.builder() - .master("local[2]") - .appName("NullableTest") - .config(SparkSQLProperties.CHECK_NULLABILITY, false) - .getOrCreate(); + .read() + .schema(sparkSchema) + .json(JavaSparkContext.fromSparkContext(spark.sparkContext()).parallelize(data1)) + .write() + .parquet(sourcePath); + + SparkSession newSparkSession = + SparkSession.builder() + .master("local[2]") + .appName("NullableTest") + .config(SparkSQLProperties.CHECK_NULLABILITY, false) + .getOrCreate(); // this is our iceberg dataset to which we will append data new HadoopTables(newSparkSession.sessionState().newHadoopConf()) @@ -337,19 +368,27 @@ public void testNullableWithSparkSqlOption() throws IOException { // this is the initial data inside the iceberg dataset newSparkSession - .read().schema(sparkSchema).json( - JavaSparkContext.fromSparkContext(spark.sparkContext()).parallelize(data0)) - .write().format("iceberg").mode(SaveMode.Append).save(targetPath); + .read() + .schema(sparkSchema) + .json(JavaSparkContext.fromSparkContext(spark.sparkContext()).parallelize(data0)) + .write() + .format("iceberg") + .mode(SaveMode.Append) + .save(targetPath); // read from parquet and append to iceberg newSparkSession - .read().schema(SparkSchemaUtil.convert(icebergSchema)).parquet(sourcePath) - .write().format("iceberg").mode(SaveMode.Append).save(targetPath); + .read() + .schema(SparkSchemaUtil.convert(icebergSchema)) + .parquet(sourcePath) + .write() + .format("iceberg") + .mode(SaveMode.Append) + .save(targetPath); // read all data List 
rows = newSparkSession.read().format("iceberg").load(targetPath).collectAsList(); Assert.assertEquals("Should contain 6 rows", 6, rows.size()); - } @Test diff --git a/spark/v3.3/spark/src/test/java/org/apache/iceberg/spark/source/TestDataSourceOptions.java b/spark/v3.3/spark/src/test/java/org/apache/iceberg/spark/source/TestDataSourceOptions.java index 0c56cb328648..60dd716c631e 100644 --- a/spark/v3.3/spark/src/test/java/org/apache/iceberg/spark/source/TestDataSourceOptions.java +++ b/spark/v3.3/spark/src/test/java/org/apache/iceberg/spark/source/TestDataSourceOptions.java @@ -16,9 +16,10 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.source; +import static org.apache.iceberg.types.Types.NestedField.optional; + import java.io.IOException; import java.math.RoundingMode; import java.util.List; @@ -58,19 +59,15 @@ import org.junit.Test; import org.junit.rules.TemporaryFolder; -import static org.apache.iceberg.types.Types.NestedField.optional; - public class TestDataSourceOptions { private static final Configuration CONF = new Configuration(); - private static final Schema SCHEMA = new Schema( - optional(1, "id", Types.IntegerType.get()), - optional(2, "data", Types.StringType.get()) - ); + private static final Schema SCHEMA = + new Schema( + optional(1, "id", Types.IntegerType.get()), optional(2, "data", Types.StringType.get())); private static SparkSession spark = null; - @Rule - public TemporaryFolder temp = new TemporaryFolder(); + @Rule public TemporaryFolder temp = new TemporaryFolder(); @BeforeClass public static void startSpark() { @@ -94,23 +91,23 @@ public void testWriteFormatOptionOverridesTableProperties() throws IOException { options.put(TableProperties.DEFAULT_FILE_FORMAT, "avro"); Table table = tables.create(SCHEMA, spec, options, tableLocation); - List expectedRecords = Lists.newArrayList( - new SimpleRecord(1, "a"), - new SimpleRecord(2, "b"), - new SimpleRecord(3, "c") - ); + List expectedRecords = + Lists.newArrayList( + new SimpleRecord(1, "a"), new SimpleRecord(2, "b"), new SimpleRecord(3, "c")); Dataset df = spark.createDataFrame(expectedRecords, SimpleRecord.class); - df.select("id", "data").write() + df.select("id", "data") + .write() .format("iceberg") .option(SparkWriteOptions.WRITE_FORMAT, "parquet") .mode(SaveMode.Append) .save(tableLocation); try (CloseableIterable tasks = table.newScan().planFiles()) { - tasks.forEach(task -> { - FileFormat fileFormat = FileFormat.fromFileName(task.file().path()); - Assert.assertEquals(FileFormat.PARQUET, fileFormat); - }); + tasks.forEach( + task -> { + FileFormat fileFormat = FileFormat.fromFileName(task.file().path()); + Assert.assertEquals(FileFormat.PARQUET, fileFormat); + }); } } @@ -124,22 +121,18 @@ public void testNoWriteFormatOption() throws IOException { options.put(TableProperties.DEFAULT_FILE_FORMAT, "avro"); Table table = tables.create(SCHEMA, spec, options, tableLocation); - List expectedRecords = Lists.newArrayList( - new SimpleRecord(1, "a"), - new SimpleRecord(2, "b"), - new SimpleRecord(3, "c") - ); + List expectedRecords = + Lists.newArrayList( + new SimpleRecord(1, "a"), new SimpleRecord(2, "b"), new SimpleRecord(3, "c")); Dataset df = spark.createDataFrame(expectedRecords, SimpleRecord.class); - df.select("id", "data").write() - .format("iceberg") - .mode("append") - .save(tableLocation); + df.select("id", "data").write().format("iceberg").mode("append").save(tableLocation); try (CloseableIterable tasks = table.newScan().planFiles()) { 
- tasks.forEach(task -> { - FileFormat fileFormat = FileFormat.fromFileName(task.file().path()); - Assert.assertEquals(FileFormat.AVRO, fileFormat); - }); + tasks.forEach( + task -> { + FileFormat fileFormat = FileFormat.fromFileName(task.file().path()); + Assert.assertEquals(FileFormat.AVRO, fileFormat); + }); } } @@ -159,24 +152,25 @@ public void testHadoopOptions() throws IOException { // to verify that 'hadoop.' data source options are propagated correctly sparkHadoopConf.set("fs.default.name", "hdfs://localhost:9000"); - List expectedRecords = Lists.newArrayList( - new SimpleRecord(1, "a"), - new SimpleRecord(2, "b") - ); + List expectedRecords = + Lists.newArrayList(new SimpleRecord(1, "a"), new SimpleRecord(2, "b")); Dataset originalDf = spark.createDataFrame(expectedRecords, SimpleRecord.class); - originalDf.select("id", "data").write() + originalDf + .select("id", "data") + .write() .format("iceberg") .mode("append") .option("hadoop.fs.default.name", "file:///") .save(tableLocation); - Dataset resultDf = spark.read() - .format("iceberg") - .option("hadoop.fs.default.name", "file:///") - .load(tableLocation); - List resultRecords = resultDf.orderBy("id") - .as(Encoders.bean(SimpleRecord.class)) - .collectAsList(); + Dataset resultDf = + spark + .read() + .format("iceberg") + .option("hadoop.fs.default.name", "file:///") + .load(tableLocation); + List resultRecords = + resultDf.orderBy("id").as(Encoders.bean(SimpleRecord.class)).collectAsList(); Assert.assertEquals("Records should match", expectedRecords, resultRecords); } finally { @@ -192,31 +186,35 @@ public void testSplitOptionsOverridesTableProperties() throws IOException { PartitionSpec spec = PartitionSpec.unpartitioned(); Map options = Maps.newHashMap(); options.put(TableProperties.SPLIT_SIZE, String.valueOf(128L * 1024 * 1024)); // 128Mb - options.put(TableProperties.DEFAULT_FILE_FORMAT, String.valueOf(FileFormat.AVRO)); // Arbitrarily splittable + options.put( + TableProperties.DEFAULT_FILE_FORMAT, + String.valueOf(FileFormat.AVRO)); // Arbitrarily splittable Table icebergTable = tables.create(SCHEMA, spec, options, tableLocation); - List expectedRecords = Lists.newArrayList( - new SimpleRecord(1, "a"), - new SimpleRecord(2, "b") - ); + List expectedRecords = + Lists.newArrayList(new SimpleRecord(1, "a"), new SimpleRecord(2, "b")); Dataset originalDf = spark.createDataFrame(expectedRecords, SimpleRecord.class); - originalDf.select("id", "data") + originalDf + .select("id", "data") .repartition(1) .write() .format("iceberg") .mode("append") .save(tableLocation); - List files = Lists.newArrayList(icebergTable.currentSnapshot().addedDataFiles(icebergTable.io())); + List files = + Lists.newArrayList(icebergTable.currentSnapshot().addedDataFiles(icebergTable.io())); Assert.assertEquals("Should have written 1 file", 1, files.size()); long fileSize = files.get(0).fileSizeInBytes(); long splitSize = LongMath.divide(fileSize, 2, RoundingMode.CEILING); - Dataset resultDf = spark.read() - .format("iceberg") - .option(SparkReadOptions.SPLIT_SIZE, String.valueOf(splitSize)) - .load(tableLocation); + Dataset resultDf = + spark + .read() + .format("iceberg") + .option(SparkReadOptions.SPLIT_SIZE, String.valueOf(splitSize)) + .load(tableLocation); Assert.assertEquals("Spark partitions should match", 2, resultDf.javaRDD().getNumPartitions()); } @@ -230,18 +228,16 @@ public void testIncrementalScanOptions() throws IOException { Map options = Maps.newHashMap(); Table table = tables.create(SCHEMA, spec, options, tableLocation); - List 
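
The tests above pass per-write and per-read options rather than table properties: a write format override, "hadoop."-prefixed keys that feed the Hadoop configuration, and a split-size override for read planning. A compact sketch of the same option keys from user code, with a hypothetical table location; the string keys are assumed to match SparkWriteOptions.WRITE_FORMAT and SparkReadOptions.SPLIT_SIZE used above.

    import org.apache.spark.sql.Dataset;
    import org.apache.spark.sql.Row;
    import org.apache.spark.sql.SparkSession;

    public class DataSourceOptionsSketch {
      static Dataset<Row> writeThenRead(SparkSession spark, Dataset<Row> df) {
        String location = "/tmp/warehouse/db/events"; // hypothetical path-based table

        df.write()
            .format("iceberg")
            .option("write-format", "parquet")            // overrides the table's default file format for this write
            .option("hadoop.fs.default.name", "file:///") // "hadoop."-prefixed keys go to the Hadoop conf
            .mode("append")
            .save(location);

        return spark.read()
            .format("iceberg")
            .option("split-size", String.valueOf(64L * 1024 * 1024)) // overrides read.split.target-size
            .load(location);
      }
    }
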
expectedRecords = Lists.newArrayList( - new SimpleRecord(1, "a"), - new SimpleRecord(2, "b"), - new SimpleRecord(3, "c"), - new SimpleRecord(4, "d") - ); + List expectedRecords = + Lists.newArrayList( + new SimpleRecord(1, "a"), + new SimpleRecord(2, "b"), + new SimpleRecord(3, "c"), + new SimpleRecord(4, "d")); for (SimpleRecord record : expectedRecords) { - Dataset originalDf = spark.createDataFrame(Lists.newArrayList(record), SimpleRecord.class); - originalDf.select("id", "data").write() - .format("iceberg") - .mode("append") - .save(tableLocation); + Dataset originalDf = + spark.createDataFrame(Lists.newArrayList(record), SimpleRecord.class); + originalDf.select("id", "data").write().format("iceberg").mode("append").save(tableLocation); } List snapshotIds = SnapshotUtil.currentAncestorIds(table); @@ -251,11 +247,13 @@ public void testIncrementalScanOptions() throws IOException { IllegalArgumentException.class, "Cannot set start-snapshot-id and end-snapshot-id for incremental scans", () -> { - spark.read() + spark + .read() .format("iceberg") .option("snapshot-id", snapshotIds.get(3).toString()) .option("start-snapshot-id", snapshotIds.get(3).toString()) - .load(tableLocation).explain(); + .load(tableLocation) + .explain(); }); // end-snapshot-id and as-of-timestamp are both configured. @@ -264,12 +262,15 @@ public void testIncrementalScanOptions() throws IOException { IllegalArgumentException.class, "Cannot set start-snapshot-id and end-snapshot-id for incremental scans", () -> { - spark.read() + spark + .read() .format("iceberg") - .option(SparkReadOptions.AS_OF_TIMESTAMP, + .option( + SparkReadOptions.AS_OF_TIMESTAMP, Long.toString(table.snapshot(snapshotIds.get(3)).timestampMillis())) .option("end-snapshot-id", snapshotIds.get(2).toString()) - .load(tableLocation).explain(); + .load(tableLocation) + .explain(); }); // only end-snapshot-id is configured. @@ -278,31 +279,37 @@ public void testIncrementalScanOptions() throws IOException { IllegalArgumentException.class, "Cannot set only end-snapshot-id for incremental scans", () -> { - spark.read() + spark + .read() .format("iceberg") .option("end-snapshot-id", snapshotIds.get(2).toString()) - .load(tableLocation).explain(); + .load(tableLocation) + .explain(); }); // test (1st snapshot, current snapshot] incremental scan. - List result = spark.read() - .format("iceberg") - .option("start-snapshot-id", snapshotIds.get(3).toString()) - .load(tableLocation) - .orderBy("id") - .as(Encoders.bean(SimpleRecord.class)) - .collectAsList(); + List result = + spark + .read() + .format("iceberg") + .option("start-snapshot-id", snapshotIds.get(3).toString()) + .load(tableLocation) + .orderBy("id") + .as(Encoders.bean(SimpleRecord.class)) + .collectAsList(); Assert.assertEquals("Records should match", expectedRecords.subList(1, 4), result); // test (2nd snapshot, 3rd snapshot] incremental scan. 
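
testIncrementalScanOptions above documents both the rejected option combinations and a (1st snapshot, current snapshot] read. For context, a sketch of an incremental read between two known snapshot ids (exclusive start, inclusive end, per the test comments); the ids and location are placeholders.

    import java.util.List;
    import org.apache.spark.sql.Dataset;
    import org.apache.spark.sql.Row;
    import org.apache.spark.sql.SparkSession;

    public class IncrementalReadSketch {
      static List<Row> readAppendedBetween(SparkSession spark, long startSnapshotId, long endSnapshotId) {
        Dataset<Row> appended = spark.read()
            .format("iceberg")
            .option("start-snapshot-id", Long.toString(startSnapshotId)) // exclusive
            .option("end-snapshot-id", Long.toString(endSnapshotId))     // inclusive
            .load("/tmp/warehouse/db/events");                           // hypothetical location
        return appended.collectAsList();
      }
    }
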
- List result1 = spark.read() - .format("iceberg") - .option("start-snapshot-id", snapshotIds.get(2).toString()) - .option("end-snapshot-id", snapshotIds.get(1).toString()) - .load(tableLocation) - .orderBy("id") - .as(Encoders.bean(SimpleRecord.class)) - .collectAsList(); + List result1 = + spark + .read() + .format("iceberg") + .option("start-snapshot-id", snapshotIds.get(2).toString()) + .option("end-snapshot-id", snapshotIds.get(1).toString()) + .load(tableLocation) + .orderBy("id") + .as(Encoders.bean(SimpleRecord.class)) + .collectAsList(); Assert.assertEquals("Records should match", expectedRecords.subList(2, 3), result1); } @@ -315,41 +322,34 @@ public void testMetadataSplitSizeOptionOverrideTableProperties() throws IOExcept Map options = Maps.newHashMap(); Table table = tables.create(SCHEMA, spec, options, tableLocation); - List expectedRecords = Lists.newArrayList( - new SimpleRecord(1, "a"), - new SimpleRecord(2, "b") - ); + List expectedRecords = + Lists.newArrayList(new SimpleRecord(1, "a"), new SimpleRecord(2, "b")); Dataset originalDf = spark.createDataFrame(expectedRecords, SimpleRecord.class); // produce 1st manifest - originalDf.select("id", "data").write() - .format("iceberg") - .mode("append") - .save(tableLocation); + originalDf.select("id", "data").write().format("iceberg").mode("append").save(tableLocation); // produce 2nd manifest - originalDf.select("id", "data").write() - .format("iceberg") - .mode("append") - .save(tableLocation); + originalDf.select("id", "data").write().format("iceberg").mode("append").save(tableLocation); List manifests = table.currentSnapshot().allManifests(table.io()); Assert.assertEquals("Must be 2 manifests", 2, manifests.size()); // set the target metadata split size so each manifest ends up in a separate split - table.updateProperties() + table + .updateProperties() .set(TableProperties.METADATA_SPLIT_SIZE, String.valueOf(manifests.get(0).length())) .commit(); - Dataset entriesDf = spark.read() - .format("iceberg") - .load(tableLocation + "#entries"); + Dataset entriesDf = spark.read().format("iceberg").load(tableLocation + "#entries"); Assert.assertEquals("Num partitions must match", 2, entriesDf.javaRDD().getNumPartitions()); // override the table property using options - entriesDf = spark.read() - .format("iceberg") - .option(SparkReadOptions.SPLIT_SIZE, String.valueOf(128 * 1024 * 1024)) - .load(tableLocation + "#entries"); + entriesDf = + spark + .read() + .format("iceberg") + .option(SparkReadOptions.SPLIT_SIZE, String.valueOf(128 * 1024 * 1024)) + .load(tableLocation + "#entries"); Assert.assertEquals("Num partitions must match", 1, entriesDf.javaRDD().getNumPartitions()); } @@ -362,24 +362,26 @@ public void testDefaultMetadataSplitSize() throws IOException { Map options = Maps.newHashMap(); Table icebergTable = tables.create(SCHEMA, spec, options, tableLocation); - List expectedRecords = Lists.newArrayList( - new SimpleRecord(1, "a"), - new SimpleRecord(2, "b") - ); + List expectedRecords = + Lists.newArrayList(new SimpleRecord(1, "a"), new SimpleRecord(2, "b")); Dataset originalDf = spark.createDataFrame(expectedRecords, SimpleRecord.class); - originalDf.select("id", "data").write() - .format("iceberg") - .mode("append") - .save(tableLocation); + originalDf.select("id", "data").write().format("iceberg").mode("append").save(tableLocation); int splitSize = (int) TableProperties.METADATA_SPLIT_SIZE_DEFAULT; // 32MB split size - int expectedSplits = ((int) tables.load(tableLocation + "#entries") - 
.currentSnapshot().allManifests(icebergTable.io()).get(0).length() + splitSize - 1) / splitSize; + int expectedSplits = + ((int) + tables + .load(tableLocation + "#entries") + .currentSnapshot() + .allManifests(icebergTable.io()) + .get(0) + .length() + + splitSize + - 1) + / splitSize; - Dataset metadataDf = spark.read() - .format("iceberg") - .load(tableLocation + "#entries"); + Dataset metadataDf = spark.read().format("iceberg").load(tableLocation + "#entries"); int partitionNum = metadataDf.javaRDD().getNumPartitions(); Assert.assertEquals("Spark partitions should match", expectedSplits, partitionNum); @@ -391,17 +393,17 @@ public void testExtraSnapshotMetadata() throws IOException { HadoopTables tables = new HadoopTables(CONF); tables.create(SCHEMA, PartitionSpec.unpartitioned(), Maps.newHashMap(), tableLocation); - List expectedRecords = Lists.newArrayList( - new SimpleRecord(1, "a"), - new SimpleRecord(2, "b") - ); + List expectedRecords = + Lists.newArrayList(new SimpleRecord(1, "a"), new SimpleRecord(2, "b")); Dataset originalDf = spark.createDataFrame(expectedRecords, SimpleRecord.class); - originalDf.select("id", "data").write() - .format("iceberg") - .mode("append") - .option(SparkWriteOptions.SNAPSHOT_PROPERTY_PREFIX + ".extra-key", "someValue") - .option(SparkWriteOptions.SNAPSHOT_PROPERTY_PREFIX + ".another-key", "anotherValue") - .save(tableLocation); + originalDf + .select("id", "data") + .write() + .format("iceberg") + .mode("append") + .option(SparkWriteOptions.SNAPSHOT_PROPERTY_PREFIX + ".extra-key", "someValue") + .option(SparkWriteOptions.SNAPSHOT_PROPERTY_PREFIX + ".another-key", "anotherValue") + .save(tableLocation); Table table = tables.load(tableLocation); @@ -414,26 +416,27 @@ public void testExtraSnapshotMetadataWithSQL() throws InterruptedException, IOEx String tableLocation = temp.newFolder("iceberg-table").toString(); HadoopTables tables = new HadoopTables(CONF); - Table table = tables.create(SCHEMA, PartitionSpec.unpartitioned(), Maps.newHashMap(), tableLocation); + Table table = + tables.create(SCHEMA, PartitionSpec.unpartitioned(), Maps.newHashMap(), tableLocation); - List expectedRecords = Lists.newArrayList( - new SimpleRecord(1, "a"), - new SimpleRecord(2, "b") - ); + List expectedRecords = + Lists.newArrayList(new SimpleRecord(1, "a"), new SimpleRecord(2, "b")); Dataset originalDf = spark.createDataFrame(expectedRecords, SimpleRecord.class); - originalDf.select("id", "data").write() - .format("iceberg") - .mode("append") - .save(tableLocation); + originalDf.select("id", "data").write().format("iceberg").mode("append").save(tableLocation); spark.read().format("iceberg").load(tableLocation).createOrReplaceTempView("target"); - Thread writerThread = new Thread(() -> { - Map properties = Maps.newHashMap(); - properties.put("writer-thread", String.valueOf(Thread.currentThread().getName())); - CommitMetadata.withCommitProperties(properties, () -> { - spark.sql("INSERT INTO target VALUES (3, 'c'), (4, 'd')"); - return 0; - }, RuntimeException.class); - }); + Thread writerThread = + new Thread( + () -> { + Map properties = Maps.newHashMap(); + properties.put("writer-thread", String.valueOf(Thread.currentThread().getName())); + CommitMetadata.withCommitProperties( + properties, + () -> { + spark.sql("INSERT INTO target VALUES (3, 'c'), (4, 'd')"); + return 0; + }, + RuntimeException.class); + }); writerThread.setName("test-extra-commit-message-writer-thread"); writerThread.start(); writerThread.join(); diff --git 
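
testExtraSnapshotMetadata and testExtraSnapshotMetadataWithSQL above attach custom key/value pairs to a commit's snapshot summary, either through "snapshot-property."-prefixed write options or through CommitMetadata.withCommitProperties around a SQL statement. A sketch of the option-based route for a path-based table; the key, value, and refresh-then-inspect step are illustrative, not part of the patch.

    import org.apache.iceberg.Table;
    import org.apache.iceberg.spark.SparkWriteOptions;
    import org.apache.spark.sql.Dataset;
    import org.apache.spark.sql.Row;

    public class SnapshotSummarySketch {
      static String appendWithSummary(Dataset<Row> df, Table table) {
        df.write()
            .format("iceberg")
            .mode("append")
            // Lands as "ingest-job" -> "nightly-load" in the new snapshot's summary.
            .option(SparkWriteOptions.SNAPSHOT_PROPERTY_PREFIX + ".ingest-job", "nightly-load")
            .save(table.location());

        table.refresh(); // pick up the snapshot created by the write above
        // For SQL statements, CommitMetadata.withCommitProperties (shown in the test above) achieves the same.
        return table.currentSnapshot().summary().get("ingest-job");
      }
    }
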
a/spark/v3.3/spark/src/test/java/org/apache/iceberg/spark/source/TestFilteredScan.java b/spark/v3.3/spark/src/test/java/org/apache/iceberg/spark/source/TestFilteredScan.java index d51fd3c4e8eb..b30bbf145f23 100644 --- a/spark/v3.3/spark/src/test/java/org/apache/iceberg/spark/source/TestFilteredScan.java +++ b/spark/v3.3/spark/src/test/java/org/apache/iceberg/spark/source/TestFilteredScan.java @@ -16,9 +16,13 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.source; +import static org.apache.iceberg.Files.localOutput; +import static org.apache.spark.sql.catalyst.util.DateTimeUtils.fromJavaTimestamp; +import static org.apache.spark.sql.functions.callUDF; +import static org.apache.spark.sql.functions.column; + import java.io.File; import java.io.IOException; import java.sql.Timestamp; @@ -79,41 +83,31 @@ import org.junit.runner.RunWith; import org.junit.runners.Parameterized; -import static org.apache.iceberg.Files.localOutput; -import static org.apache.spark.sql.catalyst.util.DateTimeUtils.fromJavaTimestamp; -import static org.apache.spark.sql.functions.callUDF; -import static org.apache.spark.sql.functions.column; - @RunWith(Parameterized.class) public class TestFilteredScan { private static final Configuration CONF = new Configuration(); private static final HadoopTables TABLES = new HadoopTables(CONF); - private static final Schema SCHEMA = new Schema( - Types.NestedField.required(1, "id", Types.LongType.get()), - Types.NestedField.optional(2, "ts", Types.TimestampType.withZone()), - Types.NestedField.optional(3, "data", Types.StringType.get()) - ); + private static final Schema SCHEMA = + new Schema( + Types.NestedField.required(1, "id", Types.LongType.get()), + Types.NestedField.optional(2, "ts", Types.TimestampType.withZone()), + Types.NestedField.optional(3, "data", Types.StringType.get())); - private static final PartitionSpec BUCKET_BY_ID = PartitionSpec.builderFor(SCHEMA) - .bucket("id", 4) - .build(); + private static final PartitionSpec BUCKET_BY_ID = + PartitionSpec.builderFor(SCHEMA).bucket("id", 4).build(); - private static final PartitionSpec PARTITION_BY_DAY = PartitionSpec.builderFor(SCHEMA) - .day("ts") - .build(); + private static final PartitionSpec PARTITION_BY_DAY = + PartitionSpec.builderFor(SCHEMA).day("ts").build(); - private static final PartitionSpec PARTITION_BY_HOUR = PartitionSpec.builderFor(SCHEMA) - .hour("ts") - .build(); + private static final PartitionSpec PARTITION_BY_HOUR = + PartitionSpec.builderFor(SCHEMA).hour("ts").build(); - private static final PartitionSpec PARTITION_BY_DATA = PartitionSpec.builderFor(SCHEMA) - .identity("data") - .build(); + private static final PartitionSpec PARTITION_BY_DATA = + PartitionSpec.builderFor(SCHEMA).identity("data").build(); - private static final PartitionSpec PARTITION_BY_ID = PartitionSpec.builderFor(SCHEMA) - .identity("id") - .build(); + private static final PartitionSpec PARTITION_BY_ID = + PartitionSpec.builderFor(SCHEMA).identity("id").build(); private static SparkSession spark = null; @@ -126,14 +120,20 @@ public static void startSpark() { spark.udf().register("bucket4", (UDF1) bucket4::apply, IntegerType$.MODULE$); Transform day = Transforms.day(Types.TimestampType.withZone()); - spark.udf().register("ts_day", - (UDF1) timestamp -> day.apply((Long) fromJavaTimestamp(timestamp)), - IntegerType$.MODULE$); + spark + .udf() + .register( + "ts_day", + (UDF1) timestamp -> day.apply((Long) fromJavaTimestamp(timestamp)), + IntegerType$.MODULE$); 
Transform hour = Transforms.hour(Types.TimestampType.withZone()); - spark.udf().register("ts_hour", - (UDF1) timestamp -> hour.apply((Long) fromJavaTimestamp(timestamp)), - IntegerType$.MODULE$); + spark + .udf() + .register( + "ts_hour", + (UDF1) timestamp -> hour.apply((Long) fromJavaTimestamp(timestamp)), + IntegerType$.MODULE$); spark.udf().register("data_ident", (UDF1) data -> data, StringType$.MODULE$); spark.udf().register("id_ident", (UDF1) id -> id, LongType$.MODULE$); @@ -146,8 +146,7 @@ public static void stopSpark() { currentSpark.stop(); } - @Rule - public TemporaryFolder temp = new TemporaryFolder(); + @Rule public TemporaryFolder temp = new TemporaryFolder(); private final String format; private final boolean vectorized; @@ -155,11 +154,11 @@ public static void stopSpark() { @Parameterized.Parameters(name = "format = {0}, vectorized = {1}") public static Object[][] parameters() { return new Object[][] { - { "parquet", false }, - { "parquet", true }, - { "avro", false }, - { "orc", false }, - { "orc", true } + {"parquet", false}, + {"parquet", true}, + {"avro", false}, + {"orc", false}, + {"orc", true} }; } @@ -188,26 +187,27 @@ public void writeUnpartitionedTable() throws IOException { this.records = testRecords(tableSchema); - try (FileAppender writer = new GenericAppenderFactory(tableSchema).newAppender( - localOutput(testFile), fileFormat)) { + try (FileAppender writer = + new GenericAppenderFactory(tableSchema).newAppender(localOutput(testFile), fileFormat)) { writer.addAll(records); } - DataFile file = DataFiles.builder(PartitionSpec.unpartitioned()) - .withRecordCount(records.size()) - .withFileSizeInBytes(testFile.length()) - .withPath(testFile.toString()) - .build(); + DataFile file = + DataFiles.builder(PartitionSpec.unpartitioned()) + .withRecordCount(records.size()) + .withFileSizeInBytes(testFile.length()) + .withPath(testFile.toString()) + .build(); table.newAppend().appendFile(file).commit(); } @Test public void testUnpartitionedIDFilters() { - CaseInsensitiveStringMap options = new CaseInsensitiveStringMap(ImmutableMap.of( - "path", unpartitioned.toString()) - ); - SparkScanBuilder builder = new SparkScanBuilder(spark, TABLES.load(options.get("path")), options); + CaseInsensitiveStringMap options = + new CaseInsensitiveStringMap(ImmutableMap.of("path", unpartitioned.toString())); + SparkScanBuilder builder = + new SparkScanBuilder(spark, TABLES.load(options.get("path")), options); for (int i = 0; i < 10; i += 1) { pushFilters(builder, EqualTo.apply("id", i)); @@ -217,16 +217,15 @@ public void testUnpartitionedIDFilters() { Assert.assertEquals("Should only create one task for a small file", 1, partitions.length); // validate row filtering - assertEqualsSafe(SCHEMA.asStruct(), expected(i), - read(unpartitioned.toString(), vectorized, "id = " + i)); + assertEqualsSafe( + SCHEMA.asStruct(), expected(i), read(unpartitioned.toString(), vectorized, "id = " + i)); } } @Test public void testUnpartitionedCaseInsensitiveIDFilters() { - CaseInsensitiveStringMap options = new CaseInsensitiveStringMap(ImmutableMap.of( - "path", unpartitioned.toString()) - ); + CaseInsensitiveStringMap options = + new CaseInsensitiveStringMap(ImmutableMap.of("path", unpartitioned.toString())); // set spark.sql.caseSensitive to false String caseSensitivityBeforeTest = TestFilteredScan.spark.conf().get("spark.sql.caseSensitive"); @@ -235,17 +234,22 @@ public void testUnpartitionedCaseInsensitiveIDFilters() { try { for (int i = 0; i < 10; i += 1) { - SparkScanBuilder builder = new 
SparkScanBuilder(spark, TABLES.load(options.get("path")), options) - .caseSensitive(false); + SparkScanBuilder builder = + new SparkScanBuilder(spark, TABLES.load(options.get("path")), options) + .caseSensitive(false); - pushFilters(builder, EqualTo.apply("ID", i)); // note lower(ID) == lower(id), so there must be a match + pushFilters( + builder, + EqualTo.apply("ID", i)); // note lower(ID) == lower(id), so there must be a match Batch scan = builder.build().toBatch(); InputPartition[] tasks = scan.planInputPartitions(); Assert.assertEquals("Should only create one task for a small file", 1, tasks.length); // validate row filtering - assertEqualsSafe(SCHEMA.asStruct(), expected(i), + assertEqualsSafe( + SCHEMA.asStruct(), + expected(i), read(unpartitioned.toString(), vectorized, "id = " + i)); } } finally { @@ -256,11 +260,11 @@ public void testUnpartitionedCaseInsensitiveIDFilters() { @Test public void testUnpartitionedTimestampFilter() { - CaseInsensitiveStringMap options = new CaseInsensitiveStringMap(ImmutableMap.of( - "path", unpartitioned.toString()) - ); + CaseInsensitiveStringMap options = + new CaseInsensitiveStringMap(ImmutableMap.of("path", unpartitioned.toString())); - SparkScanBuilder builder = new SparkScanBuilder(spark, TABLES.load(options.get("path")), options); + SparkScanBuilder builder = + new SparkScanBuilder(spark, TABLES.load(options.get("path")), options); pushFilters(builder, LessThan.apply("ts", "2017-12-22T00:00:00+00:00")); Batch scan = builder.build().toBatch(); @@ -268,21 +272,29 @@ public void testUnpartitionedTimestampFilter() { InputPartition[] tasks = scan.planInputPartitions(); Assert.assertEquals("Should only create one task for a small file", 1, tasks.length); - assertEqualsSafe(SCHEMA.asStruct(), expected(5, 6, 7, 8, 9), - read(unpartitioned.toString(), vectorized, "ts < cast('2017-12-22 00:00:00+00:00' as timestamp)")); + assertEqualsSafe( + SCHEMA.asStruct(), + expected(5, 6, 7, 8, 9), + read( + unpartitioned.toString(), + vectorized, + "ts < cast('2017-12-22 00:00:00+00:00' as timestamp)")); } @Test public void testBucketPartitionedIDFilters() { Table table = buildPartitionedTable("bucketed_by_id", BUCKET_BY_ID, "bucket4", "id"); - CaseInsensitiveStringMap options = new CaseInsensitiveStringMap(ImmutableMap.of("path", table.location())); + CaseInsensitiveStringMap options = + new CaseInsensitiveStringMap(ImmutableMap.of("path", table.location())); - Batch unfiltered = new SparkScanBuilder(spark, TABLES.load(options.get("path")), options).build().toBatch(); - Assert.assertEquals("Unfiltered table should created 4 read tasks", - 4, unfiltered.planInputPartitions().length); + Batch unfiltered = + new SparkScanBuilder(spark, TABLES.load(options.get("path")), options).build().toBatch(); + Assert.assertEquals( + "Unfiltered table should created 4 read tasks", 4, unfiltered.planInputPartitions().length); for (int i = 0; i < 10; i += 1) { - SparkScanBuilder builder = new SparkScanBuilder(spark, TABLES.load(options.get("path")), options); + SparkScanBuilder builder = + new SparkScanBuilder(spark, TABLES.load(options.get("path")), options); pushFilters(builder, EqualTo.apply("id", i)); Batch scan = builder.build().toBatch(); @@ -293,7 +305,8 @@ public void testBucketPartitionedIDFilters() { Assert.assertEquals("Should create one task for a single bucket", 1, tasks.length); // validate row filtering - assertEqualsSafe(SCHEMA.asStruct(), expected(i), read(table.location(), vectorized, "id = " + i)); + assertEqualsSafe( + SCHEMA.asStruct(), expected(i), 
read(table.location(), vectorized, "id = " + i)); } } @@ -301,14 +314,17 @@ public void testBucketPartitionedIDFilters() { @Test public void testDayPartitionedTimestampFilters() { Table table = buildPartitionedTable("partitioned_by_day", PARTITION_BY_DAY, "ts_day", "ts"); - CaseInsensitiveStringMap options = new CaseInsensitiveStringMap(ImmutableMap.of("path", table.location())); - Batch unfiltered = new SparkScanBuilder(spark, TABLES.load(options.get("path")), options).build().toBatch(); + CaseInsensitiveStringMap options = + new CaseInsensitiveStringMap(ImmutableMap.of("path", table.location())); + Batch unfiltered = + new SparkScanBuilder(spark, TABLES.load(options.get("path")), options).build().toBatch(); - Assert.assertEquals("Unfiltered table should created 2 read tasks", - 2, unfiltered.planInputPartitions().length); + Assert.assertEquals( + "Unfiltered table should created 2 read tasks", 2, unfiltered.planInputPartitions().length); { - SparkScanBuilder builder = new SparkScanBuilder(spark, TABLES.load(options.get("path")), options); + SparkScanBuilder builder = + new SparkScanBuilder(spark, TABLES.load(options.get("path")), options); pushFilters(builder, LessThan.apply("ts", "2017-12-22T00:00:00+00:00")); Batch scan = builder.build().toBatch(); @@ -316,24 +332,35 @@ public void testDayPartitionedTimestampFilters() { InputPartition[] tasks = scan.planInputPartitions(); Assert.assertEquals("Should create one task for 2017-12-21", 1, tasks.length); - assertEqualsSafe(SCHEMA.asStruct(), expected(5, 6, 7, 8, 9), - read(table.location(), vectorized, "ts < cast('2017-12-22 00:00:00+00:00' as timestamp)")); + assertEqualsSafe( + SCHEMA.asStruct(), + expected(5, 6, 7, 8, 9), + read( + table.location(), vectorized, "ts < cast('2017-12-22 00:00:00+00:00' as timestamp)")); } { - SparkScanBuilder builder = new SparkScanBuilder(spark, TABLES.load(options.get("path")), options); - - pushFilters(builder, And.apply( - GreaterThan.apply("ts", "2017-12-22T06:00:00+00:00"), - LessThan.apply("ts", "2017-12-22T08:00:00+00:00"))); + SparkScanBuilder builder = + new SparkScanBuilder(spark, TABLES.load(options.get("path")), options); + + pushFilters( + builder, + And.apply( + GreaterThan.apply("ts", "2017-12-22T06:00:00+00:00"), + LessThan.apply("ts", "2017-12-22T08:00:00+00:00"))); Batch scan = builder.build().toBatch(); InputPartition[] tasks = scan.planInputPartitions(); Assert.assertEquals("Should create one task for 2017-12-22", 1, tasks.length); - assertEqualsSafe(SCHEMA.asStruct(), expected(1, 2), read(table.location(), vectorized, - "ts > cast('2017-12-22 06:00:00+00:00' as timestamp) and " + - "ts < cast('2017-12-22 08:00:00+00:00' as timestamp)")); + assertEqualsSafe( + SCHEMA.asStruct(), + expected(1, 2), + read( + table.location(), + vectorized, + "ts > cast('2017-12-22 06:00:00+00:00' as timestamp) and " + + "ts < cast('2017-12-22 08:00:00+00:00' as timestamp)")); } } @@ -342,14 +369,17 @@ public void testDayPartitionedTimestampFilters() { public void testHourPartitionedTimestampFilters() { Table table = buildPartitionedTable("partitioned_by_hour", PARTITION_BY_HOUR, "ts_hour", "ts"); - CaseInsensitiveStringMap options = new CaseInsensitiveStringMap(ImmutableMap.of("path", table.location())); - Batch unfiltered = new SparkScanBuilder(spark, TABLES.load(options.get("path")), options).build().toBatch(); + CaseInsensitiveStringMap options = + new CaseInsensitiveStringMap(ImmutableMap.of("path", table.location())); + Batch unfiltered = + new SparkScanBuilder(spark, 
TABLES.load(options.get("path")), options).build().toBatch(); - Assert.assertEquals("Unfiltered table should created 9 read tasks", - 9, unfiltered.planInputPartitions().length); + Assert.assertEquals( + "Unfiltered table should created 9 read tasks", 9, unfiltered.planInputPartitions().length); { - SparkScanBuilder builder = new SparkScanBuilder(spark, TABLES.load(options.get("path")), options); + SparkScanBuilder builder = + new SparkScanBuilder(spark, TABLES.load(options.get("path")), options); pushFilters(builder, LessThan.apply("ts", "2017-12-22T00:00:00+00:00")); Batch scan = builder.build().toBatch(); @@ -357,24 +387,35 @@ public void testHourPartitionedTimestampFilters() { InputPartition[] tasks = scan.planInputPartitions(); Assert.assertEquals("Should create 4 tasks for 2017-12-21: 15, 17, 21, 22", 4, tasks.length); - assertEqualsSafe(SCHEMA.asStruct(), expected(8, 9, 7, 6, 5), - read(table.location(), vectorized, "ts < cast('2017-12-22 00:00:00+00:00' as timestamp)")); + assertEqualsSafe( + SCHEMA.asStruct(), + expected(8, 9, 7, 6, 5), + read( + table.location(), vectorized, "ts < cast('2017-12-22 00:00:00+00:00' as timestamp)")); } { - SparkScanBuilder builder = new SparkScanBuilder(spark, TABLES.load(options.get("path")), options); - - pushFilters(builder, And.apply( - GreaterThan.apply("ts", "2017-12-22T06:00:00+00:00"), - LessThan.apply("ts", "2017-12-22T08:00:00+00:00"))); + SparkScanBuilder builder = + new SparkScanBuilder(spark, TABLES.load(options.get("path")), options); + + pushFilters( + builder, + And.apply( + GreaterThan.apply("ts", "2017-12-22T06:00:00+00:00"), + LessThan.apply("ts", "2017-12-22T08:00:00+00:00"))); Batch scan = builder.build().toBatch(); InputPartition[] tasks = scan.planInputPartitions(); Assert.assertEquals("Should create 2 tasks for 2017-12-22: 6, 7", 2, tasks.length); - assertEqualsSafe(SCHEMA.asStruct(), expected(2, 1), read(table.location(), vectorized, - "ts > cast('2017-12-22 06:00:00+00:00' as timestamp) and " + - "ts < cast('2017-12-22 08:00:00+00:00' as timestamp)")); + assertEqualsSafe( + SCHEMA.asStruct(), + expected(2, 1), + read( + table.location(), + vectorized, + "ts > cast('2017-12-22 06:00:00+00:00' as timestamp) and " + + "ts < cast('2017-12-22 08:00:00+00:00' as timestamp)")); } } @@ -388,10 +429,15 @@ public void testFilterByNonProjectedColumn() { expected.add(projectFlat(actualProjection, rec)); } - assertEqualsSafe(actualProjection.asStruct(), expected, read( - unpartitioned.toString(), vectorized, - "ts < cast('2017-12-22 00:00:00+00:00' as timestamp)", - "id", "data")); + assertEqualsSafe( + actualProjection.asStruct(), + expected, + read( + unpartitioned.toString(), + vectorized, + "ts < cast('2017-12-22 00:00:00+00:00' as timestamp)", + "id", + "data")); } { @@ -403,20 +449,27 @@ public void testFilterByNonProjectedColumn() { expected.add(projectFlat(actualProjection, rec)); } - assertEqualsSafe(actualProjection.asStruct(), expected, read( - unpartitioned.toString(), vectorized, - "ts > cast('2017-12-22 06:00:00+00:00' as timestamp) and " + - "ts < cast('2017-12-22 08:00:00+00:00' as timestamp)", - "id")); + assertEqualsSafe( + actualProjection.asStruct(), + expected, + read( + unpartitioned.toString(), + vectorized, + "ts > cast('2017-12-22 06:00:00+00:00' as timestamp) and " + + "ts < cast('2017-12-22 08:00:00+00:00' as timestamp)", + "id")); } } @Test public void testPartitionedByDataStartsWithFilter() { - Table table = buildPartitionedTable("partitioned_by_data", PARTITION_BY_DATA, "data_ident", "data"); - 
CaseInsensitiveStringMap options = new CaseInsensitiveStringMap(ImmutableMap.of("path", table.location())); + Table table = + buildPartitionedTable("partitioned_by_data", PARTITION_BY_DATA, "data_ident", "data"); + CaseInsensitiveStringMap options = + new CaseInsensitiveStringMap(ImmutableMap.of("path", table.location())); - SparkScanBuilder builder = new SparkScanBuilder(spark, TABLES.load(options.get("path")), options); + SparkScanBuilder builder = + new SparkScanBuilder(spark, TABLES.load(options.get("path")), options); pushFilters(builder, new StringStartsWith("data", "junc")); Batch scan = builder.build().toBatch(); @@ -426,10 +479,13 @@ public void testPartitionedByDataStartsWithFilter() { @Test public void testPartitionedByDataNotStartsWithFilter() { - Table table = buildPartitionedTable("partitioned_by_data", PARTITION_BY_DATA, "data_ident", "data"); - CaseInsensitiveStringMap options = new CaseInsensitiveStringMap(ImmutableMap.of("path", table.location())); + Table table = + buildPartitionedTable("partitioned_by_data", PARTITION_BY_DATA, "data_ident", "data"); + CaseInsensitiveStringMap options = + new CaseInsensitiveStringMap(ImmutableMap.of("path", table.location())); - SparkScanBuilder builder = new SparkScanBuilder(spark, TABLES.load(options.get("path")), options); + SparkScanBuilder builder = + new SparkScanBuilder(spark, TABLES.load(options.get("path")), options); pushFilters(builder, new Not(new StringStartsWith("data", "junc"))); Batch scan = builder.build().toBatch(); @@ -441,11 +497,11 @@ public void testPartitionedByDataNotStartsWithFilter() { public void testPartitionedByIdStartsWith() { Table table = buildPartitionedTable("partitioned_by_id", PARTITION_BY_ID, "id_ident", "id"); - CaseInsensitiveStringMap options = new CaseInsensitiveStringMap(ImmutableMap.of( - "path", table.location()) - ); + CaseInsensitiveStringMap options = + new CaseInsensitiveStringMap(ImmutableMap.of("path", table.location())); - SparkScanBuilder builder = new SparkScanBuilder(spark, TABLES.load(options.get("path")), options); + SparkScanBuilder builder = + new SparkScanBuilder(spark, TABLES.load(options.get("path")), options); pushFilters(builder, new StringStartsWith("data", "junc")); Batch scan = builder.build().toBatch(); @@ -457,11 +513,11 @@ public void testPartitionedByIdStartsWith() { public void testPartitionedByIdNotStartsWith() { Table table = buildPartitionedTable("partitioned_by_id", PARTITION_BY_ID, "id_ident", "id"); - CaseInsensitiveStringMap options = new CaseInsensitiveStringMap(ImmutableMap.of( - "path", table.location()) - ); + CaseInsensitiveStringMap options = + new CaseInsensitiveStringMap(ImmutableMap.of("path", table.location())); - SparkScanBuilder builder = new SparkScanBuilder(spark, TABLES.load(options.get("path")), options); + SparkScanBuilder builder = + new SparkScanBuilder(spark, TABLES.load(options.get("path")), options); pushFilters(builder, new Not(new StringStartsWith("data", "junc"))); Batch scan = builder.build().toBatch(); @@ -471,15 +527,15 @@ public void testPartitionedByIdNotStartsWith() { @Test public void testUnpartitionedStartsWith() { - Dataset df = spark.read() - .format("iceberg") - .option(SparkReadOptions.VECTORIZATION_ENABLED, String.valueOf(vectorized)) - .load(unpartitioned.toString()); + Dataset df = + spark + .read() + .format("iceberg") + .option(SparkReadOptions.VECTORIZATION_ENABLED, String.valueOf(vectorized)) + .load(unpartitioned.toString()); - List matchedData = df.select("data") - .where("data LIKE 'jun%'") - .as(Encoders.STRING()) 
- .collectAsList(); + List matchedData = + df.select("data").where("data LIKE 'jun%'").as(Encoders.STRING()).collectAsList(); Assert.assertEquals(1, matchedData.size()); Assert.assertEquals("junction", matchedData.get(0)); @@ -487,20 +543,21 @@ public void testUnpartitionedStartsWith() { @Test public void testUnpartitionedNotStartsWith() { - Dataset df = spark.read() - .format("iceberg") - .option(SparkReadOptions.VECTORIZATION_ENABLED, String.valueOf(vectorized)) - .load(unpartitioned.toString()); - - List matchedData = df.select("data") - .where("data NOT LIKE 'jun%'") - .as(Encoders.STRING()) - .collectAsList(); - - List expected = testRecords(SCHEMA).stream() - .map(r -> r.getField("data").toString()) - .filter(d -> !d.startsWith("jun")) - .collect(Collectors.toList()); + Dataset df = + spark + .read() + .format("iceberg") + .option(SparkReadOptions.VECTORIZATION_ENABLED, String.valueOf(vectorized)) + .load(unpartitioned.toString()); + + List matchedData = + df.select("data").where("data NOT LIKE 'jun%'").as(Encoders.STRING()).collectAsList(); + + List expected = + testRecords(SCHEMA).stream() + .map(r -> r.getField("data").toString()) + .filter(d -> !d.startsWith("jun")) + .collect(Collectors.toList()); Assert.assertEquals(9, matchedData.size()); Assert.assertEquals(Sets.newHashSet(expected), Sets.newHashSet(matchedData)); @@ -516,8 +573,8 @@ private static Record projectFlat(Schema projection, Record record) { return result; } - public static void assertEqualsUnsafe(Types.StructType struct, - List expected, List actual) { + public static void assertEqualsUnsafe( + Types.StructType struct, List expected, List actual) { // TODO: match records by ID int numRecords = Math.min(expected.size(), actual.size()); for (int i = 0; i < numRecords; i += 1) { @@ -526,8 +583,8 @@ public static void assertEqualsUnsafe(Types.StructType struct, Assert.assertEquals("Number of results should match expected", expected.size(), actual.size()); } - public static void assertEqualsSafe(Types.StructType struct, - List expected, List actual) { + public static void assertEqualsSafe( + Types.StructType struct, List expected, List actual) { // TODO: match records by ID int numRecords = Math.min(expected.size(), actual.size()); for (int i = 0; i < numRecords; i += 1) { @@ -550,7 +607,8 @@ private void pushFilters(ScanBuilder scan, Filter... 
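
testUnpartitionedStartsWith and testUnpartitionedNotStartsWith above rely on LIKE 'jun%' predicates being pushed down as StringStartsWith filters, under both vectorized and non-vectorized reads. A brief sketch of the same read from user code, with the vectorization toggle the tests parameterize over; the table location is a placeholder.

    import java.util.List;
    import org.apache.iceberg.spark.SparkReadOptions;
    import org.apache.spark.sql.Dataset;
    import org.apache.spark.sql.Encoders;
    import org.apache.spark.sql.Row;
    import org.apache.spark.sql.SparkSession;

    public class StartsWithSketch {
      static List<String> dataStartingWithJun(SparkSession spark) {
        Dataset<Row> df = spark.read()
            .format("iceberg")
            .option(SparkReadOptions.VECTORIZATION_ENABLED, "false") // or "true" for vectorized Parquet/ORC
            .load("/tmp/warehouse/db/events");                       // hypothetical location
        // The LIKE prefix match is translated to a StringStartsWith filter and pushed to Iceberg.
        return df.select("data").where("data LIKE 'jun%'").as(Encoders.STRING()).collectAsList();
      }
    }
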
filters) { filterable.pushFilters(filters); } - private Table buildPartitionedTable(String desc, PartitionSpec spec, String udf, String partitionColumn) { + private Table buildPartitionedTable( + String desc, PartitionSpec spec, String udf, String partitionColumn) { File location = new File(parent, desc); Table table = TABLES.create(SCHEMA, spec, location.toString()); @@ -559,10 +617,12 @@ private Table buildPartitionedTable(String desc, PartitionSpec spec, String udf, table.updateProperties().set("read.split.target-size", "2048").commit(); // copy the unpartitioned table into the partitioned table to produce the partitioned data - Dataset allRows = spark.read() - .format("iceberg") - .option(SparkReadOptions.VECTORIZATION_ENABLED, String.valueOf(vectorized)) - .load(unpartitioned.toString()); + Dataset allRows = + spark + .read() + .format("iceberg") + .option(SparkReadOptions.VECTORIZATION_ENABLED, String.valueOf(vectorized)) + .load(unpartitioned.toString()); allRows .coalesce(1) // ensure only 1 file per partition is written @@ -590,19 +650,23 @@ private List testRecords(Schema schema) { record(schema, 6L, parse("2017-12-21T21:55:30.589712+00:00"), "element"), record(schema, 7L, parse("2017-12-21T17:31:14.532797+00:00"), "limited"), record(schema, 8L, parse("2017-12-21T15:21:51.237521+00:00"), "global"), - record(schema, 9L, parse("2017-12-21T15:02:15.230570+00:00"), "goldfish") - ); + record(schema, 9L, parse("2017-12-21T15:02:15.230570+00:00"), "goldfish")); } private static List read(String table, boolean vectorized, String expr) { return read(table, vectorized, expr, "*"); } - private static List read(String table, boolean vectorized, String expr, String select0, String... selectN) { - Dataset dataset = spark.read().format("iceberg") - .option(SparkReadOptions.VECTORIZATION_ENABLED, String.valueOf(vectorized)) - .load(table).filter(expr) - .select(select0, selectN); + private static List read( + String table, boolean vectorized, String expr, String select0, String... selectN) { + Dataset dataset = + spark + .read() + .format("iceberg") + .option(SparkReadOptions.VECTORIZATION_ENABLED, String.valueOf(vectorized)) + .load(table) + .filter(expr) + .select(select0, selectN); return dataset.collectAsList(); } diff --git a/spark/v3.3/spark/src/test/java/org/apache/iceberg/spark/source/TestForwardCompatibility.java b/spark/v3.3/spark/src/test/java/org/apache/iceberg/spark/source/TestForwardCompatibility.java index 74203fd20f9c..42d9ac6a79ba 100644 --- a/spark/v3.3/spark/src/test/java/org/apache/iceberg/spark/source/TestForwardCompatibility.java +++ b/spark/v3.3/spark/src/test/java/org/apache/iceberg/spark/source/TestForwardCompatibility.java @@ -16,9 +16,11 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.spark.source; +import static org.apache.iceberg.Files.localInput; +import static org.apache.iceberg.Files.localOutput; + import java.io.File; import java.io.IOException; import java.util.List; @@ -63,25 +65,26 @@ import scala.Option; import scala.collection.JavaConverters; -import static org.apache.iceberg.Files.localInput; -import static org.apache.iceberg.Files.localOutput; - public class TestForwardCompatibility { private static final Configuration CONF = new Configuration(); - private static final Schema SCHEMA = new Schema( - Types.NestedField.optional(1, "id", Types.LongType.get()), - Types.NestedField.optional(2, "data", Types.StringType.get())); + private static final Schema SCHEMA = + new Schema( + Types.NestedField.optional(1, "id", Types.LongType.get()), + Types.NestedField.optional(2, "data", Types.StringType.get())); // create a spec for the schema that uses a "zero" transform that produces all 0s - private static final PartitionSpec UNKNOWN_SPEC = PartitionSpecParser.fromJson(SCHEMA, - "{ \"spec-id\": 0, \"fields\": [ { \"name\": \"id_zero\", \"transform\": \"zero\", \"source-id\": 1 } ] }"); + private static final PartitionSpec UNKNOWN_SPEC = + PartitionSpecParser.fromJson( + SCHEMA, + "{ \"spec-id\": 0, \"fields\": [ { \"name\": \"id_zero\", \"transform\": \"zero\", \"source-id\": 1 } ] }"); // create a fake spec to use to write table metadata - private static final PartitionSpec FAKE_SPEC = PartitionSpecParser.fromJson(SCHEMA, - "{ \"spec-id\": 0, \"fields\": [ { \"name\": \"id_zero\", \"transform\": \"identity\", \"source-id\": 1 } ] }"); + private static final PartitionSpec FAKE_SPEC = + PartitionSpecParser.fromJson( + SCHEMA, + "{ \"spec-id\": 0, \"fields\": [ { \"name\": \"id_zero\", \"transform\": \"identity\", \"source-id\": 1 } ] }"); - @Rule - public TemporaryFolder temp = new TemporaryFolder(); + @Rule public TemporaryFolder temp = new TemporaryFolder(); private static SparkSession spark = null; @@ -107,20 +110,22 @@ public void testSparkWriteFailsUnknownTransform() throws IOException { HadoopTables tables = new HadoopTables(CONF); tables.create(SCHEMA, UNKNOWN_SPEC, location.toString()); - List expected = Lists.newArrayList( - new SimpleRecord(1, "a"), - new SimpleRecord(2, "b"), - new SimpleRecord(3, "c") - ); + List expected = + Lists.newArrayList( + new SimpleRecord(1, "a"), new SimpleRecord(2, "b"), new SimpleRecord(3, "c")); Dataset df = spark.createDataFrame(expected, SimpleRecord.class); - AssertHelpers.assertThrows("Should reject write with unsupported transform", - UnsupportedOperationException.class, "Cannot write using unsupported transforms: zero", - () -> df.select("id", "data").write() - .format("iceberg") - .mode("append") - .save(location.toString())); + AssertHelpers.assertThrows( + "Should reject write with unsupported transform", + UnsupportedOperationException.class, + "Cannot write using unsupported transforms: zero", + () -> + df.select("id", "data") + .write() + .format("iceberg") + .mode("append") + .save(location.toString())); } @Test @@ -136,20 +141,24 @@ public void testSparkStreamingWriteFailsUnknownTransform() throws IOException, T tables.create(SCHEMA, UNKNOWN_SPEC, location.toString()); MemoryStream inputStream = newMemoryStream(1, spark.sqlContext(), Encoders.INT()); - StreamingQuery query = inputStream.toDF() - .selectExpr("value AS id", "CAST (value AS STRING) AS data") - .writeStream() - .outputMode("append") - .format("iceberg") - .option("checkpointLocation", checkpoint.toString()) - 
.option("path", location.toString()) - .start(); + StreamingQuery query = + inputStream + .toDF() + .selectExpr("value AS id", "CAST (value AS STRING) AS data") + .writeStream() + .outputMode("append") + .format("iceberg") + .option("checkpointLocation", checkpoint.toString()) + .option("path", location.toString()) + .start(); List batch1 = Lists.newArrayList(1, 2); send(batch1, inputStream); - AssertHelpers.assertThrows("Should reject streaming write with unsupported transform", - StreamingQueryException.class, "Cannot write using unsupported transforms: zero", + AssertHelpers.assertThrows( + "Should reject streaming write with unsupported transform", + StreamingQueryException.class, + "Cannot write using unsupported transforms: zero", query::processAllAvailable); } @@ -168,22 +177,22 @@ public void testSparkCanReadUnknownTransform() throws IOException { List expected = RandomData.generateList(table.schema(), 100, 1L); - File parquetFile = new File(dataFolder, - FileFormat.PARQUET.addExtension(UUID.randomUUID().toString())); - FileAppender writer = Parquet.write(localOutput(parquetFile)) - .schema(table.schema()) - .build(); + File parquetFile = + new File(dataFolder, FileFormat.PARQUET.addExtension(UUID.randomUUID().toString())); + FileAppender writer = + Parquet.write(localOutput(parquetFile)).schema(table.schema()).build(); try { writer.addAll(expected); } finally { writer.close(); } - DataFile file = DataFiles.builder(FAKE_SPEC) - .withInputFile(localInput(parquetFile)) - .withMetrics(writer.metrics()) - .withPartitionPath("id_zero=0") - .build(); + DataFile file = + DataFiles.builder(FAKE_SPEC) + .withInputFile(localInput(parquetFile)) + .withMetrics(writer.metrics()) + .withPartitionPath("id_zero=0") + .build(); OutputFile manifestFile = localOutput(FileFormat.AVRO.addExtension(temp.newFile().toString())); ManifestWriter manifestWriter = ManifestFiles.write(FAKE_SPEC, manifestFile); @@ -195,9 +204,7 @@ public void testSparkCanReadUnknownTransform() throws IOException { table.newFastAppend().appendManifest(manifestWriter.toManifestFile()).commit(); - Dataset df = spark.read() - .format("iceberg") - .load(location.toString()); + Dataset df = spark.read().format("iceberg").load(location.toString()); List rows = df.collectAsList(); Assert.assertEquals("Should contain 100 rows", 100, rows.size()); diff --git a/spark/v3.3/spark/src/test/java/org/apache/iceberg/spark/source/TestIcebergSource.java b/spark/v3.3/spark/src/test/java/org/apache/iceberg/spark/source/TestIcebergSource.java index 42f53d585601..a850275118db 100644 --- a/spark/v3.3/spark/src/test/java/org/apache/iceberg/spark/source/TestIcebergSource.java +++ b/spark/v3.3/spark/src/test/java/org/apache/iceberg/spark/source/TestIcebergSource.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.spark.source; import org.apache.iceberg.catalog.TableIdentifier; @@ -40,5 +39,4 @@ public Identifier extractIdentifier(CaseInsensitiveStringMap options) { public String extractCatalog(CaseInsensitiveStringMap options) { return SparkSession.active().sessionState().catalogManager().currentCatalog().name(); } - } diff --git a/spark/v3.3/spark/src/test/java/org/apache/iceberg/spark/source/TestIcebergSourceHadoopTables.java b/spark/v3.3/spark/src/test/java/org/apache/iceberg/spark/source/TestIcebergSourceHadoopTables.java index f1cfc7a72e17..b55ba0e2199a 100644 --- a/spark/v3.3/spark/src/test/java/org/apache/iceberg/spark/source/TestIcebergSourceHadoopTables.java +++ b/spark/v3.3/spark/src/test/java/org/apache/iceberg/spark/source/TestIcebergSourceHadoopTables.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.source; import java.io.File; diff --git a/spark/v3.3/spark/src/test/java/org/apache/iceberg/spark/source/TestIcebergSourceHiveTables.java b/spark/v3.3/spark/src/test/java/org/apache/iceberg/spark/source/TestIcebergSourceHiveTables.java index a51f9ee85e2f..de26f5f82c49 100644 --- a/spark/v3.3/spark/src/test/java/org/apache/iceberg/spark/source/TestIcebergSourceHiveTables.java +++ b/spark/v3.3/spark/src/test/java/org/apache/iceberg/spark/source/TestIcebergSourceHiveTables.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.source; import java.io.IOException; @@ -59,7 +58,8 @@ public Table createTable(TableIdentifier ident, Schema schema, PartitionSpec spe @Override public Table loadTable(TableIdentifier ident, String entriesSuffix) { - TableIdentifier identifier = TableIdentifier.of(ident.namespace().level(0), ident.name(), entriesSuffix); + TableIdentifier identifier = + TableIdentifier.of(ident.namespace().level(0), ident.name(), entriesSuffix); return TestIcebergSourceHiveTables.catalog.loadTable(identifier); } diff --git a/spark/v3.3/spark/src/test/java/org/apache/iceberg/spark/source/TestIcebergSourceTablesBase.java b/spark/v3.3/spark/src/test/java/org/apache/iceberg/spark/source/TestIcebergSourceTablesBase.java index ce40f179d649..3ded9471fe9c 100644 --- a/spark/v3.3/spark/src/test/java/org/apache/iceberg/spark/source/TestIcebergSourceTablesBase.java +++ b/spark/v3.3/spark/src/test/java/org/apache/iceberg/spark/source/TestIcebergSourceTablesBase.java @@ -16,9 +16,13 @@ * specific language governing permissions and limitations * under the License.
*/ - package org.apache.iceberg.spark.source; +import static org.apache.iceberg.ManifestContent.DATA; +import static org.apache.iceberg.ManifestContent.DELETES; +import static org.apache.iceberg.types.Types.NestedField.optional; +import static org.apache.iceberg.types.Types.NestedField.required; + import java.io.IOException; import java.io.UncheckedIOException; import java.util.Comparator; @@ -76,33 +80,26 @@ import org.junit.Test; import org.junit.rules.TemporaryFolder; -import static org.apache.iceberg.ManifestContent.DATA; -import static org.apache.iceberg.ManifestContent.DELETES; -import static org.apache.iceberg.types.Types.NestedField.optional; -import static org.apache.iceberg.types.Types.NestedField.required; - public abstract class TestIcebergSourceTablesBase extends SparkTestBase { - private static final Schema SCHEMA = new Schema( - optional(1, "id", Types.IntegerType.get()), - optional(2, "data", Types.StringType.get()) - ); + private static final Schema SCHEMA = + new Schema( + optional(1, "id", Types.IntegerType.get()), optional(2, "data", Types.StringType.get())); - private static final Schema SCHEMA2 = new Schema( - optional(1, "id", Types.IntegerType.get()), - optional(2, "data", Types.StringType.get()), - optional(3, "category", Types.StringType.get()) - ); + private static final Schema SCHEMA2 = + new Schema( + optional(1, "id", Types.IntegerType.get()), + optional(2, "data", Types.StringType.get()), + optional(3, "category", Types.StringType.get())); - private static final Schema SCHEMA3 = new Schema( - optional(1, "id", Types.IntegerType.get()), - optional(3, "category", Types.StringType.get()) - ); + private static final Schema SCHEMA3 = + new Schema( + optional(1, "id", Types.IntegerType.get()), + optional(3, "category", Types.StringType.get())); private static final PartitionSpec SPEC = PartitionSpec.builderFor(SCHEMA).identity("id").build(); - @Rule - public TemporaryFolder temp = new TemporaryFolder(); + @Rule public TemporaryFolder temp = new TemporaryFolder(); public abstract Table createTable(TableIdentifier ident, Schema schema, PartitionSpec spec); @@ -117,23 +114,21 @@ public synchronized void testTablesSupport() { TableIdentifier tableIdentifier = TableIdentifier.of("db", "table"); createTable(tableIdentifier, SCHEMA, PartitionSpec.unpartitioned()); - List expectedRecords = Lists.newArrayList( - new SimpleRecord(1, "1"), - new SimpleRecord(2, "2"), - new SimpleRecord(3, "3")); + List expectedRecords = + Lists.newArrayList( + new SimpleRecord(1, "1"), new SimpleRecord(2, "2"), new SimpleRecord(3, "3")); Dataset inputDf = spark.createDataFrame(expectedRecords, SimpleRecord.class); - inputDf.select("id", "data").write() + inputDf + .select("id", "data") + .write() .format("iceberg") .mode(SaveMode.Append) .save(loadLocation(tableIdentifier)); - Dataset resultDf = spark.read() - .format("iceberg") - .load(loadLocation(tableIdentifier)); - List actualRecords = resultDf.orderBy("id") - .as(Encoders.bean(SimpleRecord.class)) - .collectAsList(); + Dataset resultDf = spark.read().format("iceberg").load(loadLocation(tableIdentifier)); + List actualRecords = + resultDf.orderBy("id").as(Encoders.bean(SimpleRecord.class)).collectAsList(); Assert.assertEquals("Records should match", expectedRecords, actualRecords); } @@ -147,32 +142,39 @@ public void testEntriesTable() throws Exception { List records = Lists.newArrayList(new SimpleRecord(1, "1")); Dataset inputDf = spark.createDataFrame(records, SimpleRecord.class); - inputDf.select("id", "data").write() + inputDf + 
.select("id", "data") + .write() .format("iceberg") .mode("append") .save(loadLocation(tableIdentifier)); table.refresh(); - List actual = spark.read() - .format("iceberg") - .load(loadLocation(tableIdentifier, "entries")) - .collectAsList(); + List actual = + spark + .read() + .format("iceberg") + .load(loadLocation(tableIdentifier, "entries")) + .collectAsList(); Snapshot snapshot = table.currentSnapshot(); - Assert.assertEquals("Should only contain one manifest", 1, snapshot.allManifests(table.io()).size()); + Assert.assertEquals( + "Should only contain one manifest", 1, snapshot.allManifests(table.io()).size()); InputFile manifest = table.io().newInputFile(snapshot.allManifests(table.io()).get(0).path()); List expected = Lists.newArrayList(); - try (CloseableIterable rows = Avro.read(manifest).project(entriesTable.schema()).build()) { + try (CloseableIterable rows = + Avro.read(manifest).project(entriesTable.schema()).build()) { // each row must inherit snapshot_id and sequence_number - rows.forEach(row -> { - row.put(2, 0L); - GenericData.Record file = (GenericData.Record) row.get("data_file"); - asMetadataRecord(file); - expected.add(row); - }); + rows.forEach( + row -> { + row.put(2, 0L); + GenericData.Record file = (GenericData.Record) row.get("data_file"); + asMetadataRecord(file); + expected.add(row); + }); } Assert.assertEquals("Entries table should have one row", 1, expected.size()); @@ -188,18 +190,22 @@ public void testEntriesTablePartitionedPrune() throws Exception { List records = Lists.newArrayList(new SimpleRecord(1, "1")); Dataset inputDf = spark.createDataFrame(records, SimpleRecord.class); - inputDf.select("id", "data").write() + inputDf + .select("id", "data") + .write() .format("iceberg") .mode("append") .save(loadLocation(tableIdentifier)); table.refresh(); - List actual = spark.read() - .format("iceberg") - .load(loadLocation(tableIdentifier, "entries")) - .select("status") - .collectAsList(); + List actual = + spark + .read() + .format("iceberg") + .load(loadLocation(tableIdentifier, "entries")) + .select("status") + .collectAsList(); Assert.assertEquals("Results should contain only one status", 1, actual.size()); Assert.assertEquals("That status should be Added (1)", 1, actual.get(0).getInt(0)); @@ -213,7 +219,9 @@ public void testEntriesTableDataFilePrune() throws Exception { List records = Lists.newArrayList(new SimpleRecord(1, "1")); Dataset inputDf = spark.createDataFrame(records, SimpleRecord.class); - inputDf.select("id", "data").write() + inputDf + .select("id", "data") + .write() .format("iceberg") .mode("append") .save(loadLocation(tableIdentifier)); @@ -221,15 +229,19 @@ public void testEntriesTableDataFilePrune() throws Exception { table.refresh(); DataFile file = table.currentSnapshot().addedDataFiles(table.io()).iterator().next(); - List singleActual = rowsToJava(spark.read() - .format("iceberg") - .load(loadLocation(tableIdentifier, "entries")) - .select("data_file.file_path") - .collectAsList()); + List singleActual = + rowsToJava( + spark + .read() + .format("iceberg") + .load(loadLocation(tableIdentifier, "entries")) + .select("data_file.file_path") + .collectAsList()); List singleExpected = ImmutableList.of(row(file.path())); - assertEquals("Should prune a single element from a nested struct", singleExpected, singleActual); + assertEquals( + "Should prune a single element from a nested struct", singleExpected, singleActual); } @Test @@ -240,7 +252,9 @@ public void testEntriesTableDataFilePruneMulti() throws Exception { List records = 
Lists.newArrayList(new SimpleRecord(1, "1")); Dataset inputDf = spark.createDataFrame(records, SimpleRecord.class); - inputDf.select("id", "data").write() + inputDf + .select("id", "data") + .write() .format("iceberg") .mode("append") .save(loadLocation(tableIdentifier)); @@ -248,14 +262,22 @@ public void testEntriesTableDataFilePruneMulti() throws Exception { table.refresh(); DataFile file = table.currentSnapshot().addedDataFiles(table.io()).iterator().next(); - List multiActual = rowsToJava(spark.read() - .format("iceberg") - .load(loadLocation(tableIdentifier, "entries")) - .select("data_file.file_path", "data_file.value_counts", "data_file.record_count", "data_file.column_sizes") - .collectAsList()); - - List multiExpected = ImmutableList.of( - row(file.path(), file.valueCounts(), file.recordCount(), file.columnSizes())); + List multiActual = + rowsToJava( + spark + .read() + .format("iceberg") + .load(loadLocation(tableIdentifier, "entries")) + .select( + "data_file.file_path", + "data_file.value_counts", + "data_file.record_count", + "data_file.column_sizes") + .collectAsList()); + + List multiExpected = + ImmutableList.of( + row(file.path(), file.valueCounts(), file.recordCount(), file.columnSizes())); assertEquals("Should prune a single element from a nested struct", multiExpected, multiActual); } @@ -268,7 +290,9 @@ public void testFilesSelectMap() throws Exception { List records = Lists.newArrayList(new SimpleRecord(1, "1")); Dataset inputDf = spark.createDataFrame(records, SimpleRecord.class); - inputDf.select("id", "data").write() + inputDf + .select("id", "data") + .write() .format("iceberg") .mode("append") .save(loadLocation(tableIdentifier)); @@ -276,14 +300,18 @@ public void testFilesSelectMap() throws Exception { table.refresh(); DataFile file = table.currentSnapshot().addedDataFiles(table.io()).iterator().next(); - List multiActual = rowsToJava(spark.read() - .format("iceberg") - .load(loadLocation(tableIdentifier, "files")) - .select("file_path", "value_counts", "record_count", "column_sizes") - .collectAsList()); + List multiActual = + rowsToJava( + spark + .read() + .format("iceberg") + .load(loadLocation(tableIdentifier, "files")) + .select("file_path", "value_counts", "record_count", "column_sizes") + .collectAsList()); - List multiExpected = ImmutableList.of( - row(file.path(), file.valueCounts(), file.recordCount(), file.columnSizes())); + List multiExpected = + ImmutableList.of( + row(file.path(), file.valueCounts(), file.recordCount(), file.columnSizes())); assertEquals("Should prune a single element from a row", multiExpected, multiActual); } @@ -294,10 +322,13 @@ public void testAllEntriesTable() throws Exception { Table table = createTable(tableIdentifier, SCHEMA, PartitionSpec.unpartitioned()); Table entriesTable = loadTable(tableIdentifier, "all_entries"); - Dataset df1 = spark.createDataFrame(Lists.newArrayList(new SimpleRecord(1, "a")), SimpleRecord.class); - Dataset df2 = spark.createDataFrame(Lists.newArrayList(new SimpleRecord(1, "b")), SimpleRecord.class); + Dataset df1 = + spark.createDataFrame(Lists.newArrayList(new SimpleRecord(1, "a")), SimpleRecord.class); + Dataset df2 = + spark.createDataFrame(Lists.newArrayList(new SimpleRecord(1, "b")), SimpleRecord.class); - df1.select("id", "data").write() + df1.select("id", "data") + .write() .format("iceberg") .mode("append") .save(loadLocation(tableIdentifier)); @@ -306,7 +337,8 @@ public void testAllEntriesTable() throws Exception { table.newDelete().deleteFromRowFilter(Expressions.equal("id", 
1)).commit(); // add a second file - df2.select("id", "data").write() + df2.select("id", "data") + .write() .format("iceberg") .mode("append") .save(loadLocation(tableIdentifier)); @@ -314,24 +346,28 @@ public void testAllEntriesTable() throws Exception { // ensure table data isn't stale table.refresh(); - List actual = spark.read() - .format("iceberg") - .load(loadLocation(tableIdentifier, "all_entries")) - .orderBy("snapshot_id") - .collectAsList(); + List actual = + spark + .read() + .format("iceberg") + .load(loadLocation(tableIdentifier, "all_entries")) + .orderBy("snapshot_id") + .collectAsList(); List expected = Lists.newArrayList(); - for (ManifestFile manifest : Iterables.concat( - Iterables.transform(table.snapshots(), s -> s.allManifests(table.io())))) { + for (ManifestFile manifest : + Iterables.concat(Iterables.transform(table.snapshots(), s -> s.allManifests(table.io())))) { InputFile in = table.io().newInputFile(manifest.path()); - try (CloseableIterable rows = Avro.read(in).project(entriesTable.schema()).build()) { + try (CloseableIterable rows = + Avro.read(in).project(entriesTable.schema()).build()) { // each row must inherit snapshot_id and sequence_number - rows.forEach(row -> { - row.put(2, 0L); - GenericData.Record file = (GenericData.Record) row.get("data_file"); - asMetadataRecord(file); - expected.add(row); - }); + rows.forEach( + row -> { + row.put(2, 0L); + GenericData.Record file = (GenericData.Record) row.get("data_file"); + asMetadataRecord(file); + expected.add(row); + }); } } @@ -340,7 +376,8 @@ public void testAllEntriesTable() throws Exception { Assert.assertEquals("Entries table should have 3 rows", 3, expected.size()); Assert.assertEquals("Actual results should have 3 rows", 3, actual.size()); for (int i = 0; i < expected.size(); i += 1) { - TestHelpers.assertEqualsSafe(entriesTable.schema().asStruct(), expected.get(i), actual.get(i)); + TestHelpers.assertEqualsSafe( + entriesTable.schema().asStruct(), expected.get(i), actual.get(i)); } } @@ -352,7 +389,9 @@ public void testCountEntriesTable() { // init load List records = Lists.newArrayList(new SimpleRecord(1, "1")); Dataset inputDf = spark.createDataFrame(records, SimpleRecord.class); - inputDf.select("id", "data").write() + inputDf + .select("id", "data") + .write() .format("iceberg") .mode("append") .save(loadLocation(tableIdentifier)); @@ -360,12 +399,16 @@ public void testCountEntriesTable() { final int expectedEntryCount = 1; // count entries - Assert.assertEquals("Count should return " + expectedEntryCount, - expectedEntryCount, spark.read().format("iceberg").load(loadLocation(tableIdentifier, "entries")).count()); + Assert.assertEquals( + "Count should return " + expectedEntryCount, + expectedEntryCount, + spark.read().format("iceberg").load(loadLocation(tableIdentifier, "entries")).count()); // count all_entries - Assert.assertEquals("Count should return " + expectedEntryCount, - expectedEntryCount, spark.read().format("iceberg").load(loadLocation(tableIdentifier, "all_entries")).count()); + Assert.assertEquals( + "Count should return " + expectedEntryCount, + expectedEntryCount, + spark.read().format("iceberg").load(loadLocation(tableIdentifier, "all_entries")).count()); } @Test @@ -375,16 +418,20 @@ public void testFilesTable() throws Exception { Table entriesTable = loadTable(tableIdentifier, "entries"); Table filesTable = loadTable(tableIdentifier, "files"); - Dataset df1 = spark.createDataFrame(Lists.newArrayList(new SimpleRecord(1, "a")), SimpleRecord.class); - Dataset df2 = 
spark.createDataFrame(Lists.newArrayList(new SimpleRecord(2, "b")), SimpleRecord.class); + Dataset df1 = + spark.createDataFrame(Lists.newArrayList(new SimpleRecord(1, "a")), SimpleRecord.class); + Dataset df2 = + spark.createDataFrame(Lists.newArrayList(new SimpleRecord(2, "b")), SimpleRecord.class); - df1.select("id", "data").write() + df1.select("id", "data") + .write() .format("iceberg") .mode("append") .save(loadLocation(tableIdentifier)); // add a second file - df2.select("id", "data").write() + df2.select("id", "data") + .write() .format("iceberg") .mode("append") .save(loadLocation(tableIdentifier)); @@ -392,15 +439,14 @@ public void testFilesTable() throws Exception { // delete the first file to test that only live files are listed table.newDelete().deleteFromRowFilter(Expressions.equal("id", 1)).commit(); - List actual = spark.read() - .format("iceberg") - .load(loadLocation(tableIdentifier, "files")) - .collectAsList(); + List actual = + spark.read().format("iceberg").load(loadLocation(tableIdentifier, "files")).collectAsList(); List expected = Lists.newArrayList(); for (ManifestFile manifest : table.currentSnapshot().dataManifests(table.io())) { InputFile in = table.io().newInputFile(manifest.path()); - try (CloseableIterable rows = Avro.read(in).project(entriesTable.schema()).build()) { + try (CloseableIterable rows = + Avro.read(in).project(entriesTable.schema()).build()) { for (GenericData.Record record : rows) { if ((Integer) record.get("status") < 2 /* added or existing */) { GenericData.Record file = (GenericData.Record) record.get("data_file"); @@ -422,42 +468,42 @@ public void testFilesTableWithSnapshotIdInheritance() throws Exception { TableIdentifier tableIdentifier = TableIdentifier.of("db", "files_inheritance_test"); Table table = createTable(tableIdentifier, SCHEMA, SPEC); - table.updateProperties() - .set(TableProperties.SNAPSHOT_ID_INHERITANCE_ENABLED, "true") - .commit(); + table.updateProperties().set(TableProperties.SNAPSHOT_ID_INHERITANCE_ENABLED, "true").commit(); Table entriesTable = loadTable(tableIdentifier, "entries"); Table filesTable = loadTable(tableIdentifier, "files"); - spark.sql(String.format( - "CREATE TABLE parquet_table (data string, id int) " + - "USING parquet PARTITIONED BY (id) LOCATION '%s'", - temp.newFolder())); + spark.sql( + String.format( + "CREATE TABLE parquet_table (data string, id int) " + + "USING parquet PARTITIONED BY (id) LOCATION '%s'", + temp.newFolder())); - List records = Lists.newArrayList( - new SimpleRecord(1, "a"), - new SimpleRecord(2, "b") - ); + List records = + Lists.newArrayList(new SimpleRecord(1, "a"), new SimpleRecord(2, "b")); Dataset inputDF = spark.createDataFrame(records, SimpleRecord.class); - inputDF.select("data", "id").write() - .mode("overwrite") - .insertInto("parquet_table"); + inputDF.select("data", "id").write().mode("overwrite").insertInto("parquet_table"); try { String stagingLocation = table.location() + "/metadata"; - SparkTableUtil.importSparkTable(spark, + SparkTableUtil.importSparkTable( + spark, new org.apache.spark.sql.catalyst.TableIdentifier("parquet_table"), - table, stagingLocation); + table, + stagingLocation); - List actual = spark.read() - .format("iceberg") - .load(loadLocation(tableIdentifier, "files")) - .collectAsList(); + List actual = + spark + .read() + .format("iceberg") + .load(loadLocation(tableIdentifier, "files")) + .collectAsList(); List expected = Lists.newArrayList(); for (ManifestFile manifest : table.currentSnapshot().dataManifests(table.io())) { InputFile in = 
table.io().newInputFile(manifest.path()); - try (CloseableIterable rows = Avro.read(in).project(entriesTable.schema()).build()) { + try (CloseableIterable rows = + Avro.read(in).project(entriesTable.schema()).build()) { for (GenericData.Record record : rows) { GenericData.Record file = (GenericData.Record) record.get("data_file"); asMetadataRecord(file); @@ -473,7 +519,6 @@ public void testFilesTableWithSnapshotIdInheritance() throws Exception { } finally { spark.sql("DROP TABLE parquet_table"); } - } @Test @@ -484,35 +529,35 @@ public void testEntriesTableWithSnapshotIdInheritance() throws Exception { PartitionSpec spec = SPEC; Table table = createTable(tableIdentifier, SCHEMA, spec); - table.updateProperties() - .set(TableProperties.SNAPSHOT_ID_INHERITANCE_ENABLED, "true") - .commit(); + table.updateProperties().set(TableProperties.SNAPSHOT_ID_INHERITANCE_ENABLED, "true").commit(); - spark.sql(String.format( - "CREATE TABLE parquet_table (data string, id int) " + - "USING parquet PARTITIONED BY (id) LOCATION '%s'", - temp.newFolder())); + spark.sql( + String.format( + "CREATE TABLE parquet_table (data string, id int) " + + "USING parquet PARTITIONED BY (id) LOCATION '%s'", + temp.newFolder())); - List records = Lists.newArrayList( - new SimpleRecord(1, "a"), - new SimpleRecord(2, "b") - ); + List records = + Lists.newArrayList(new SimpleRecord(1, "a"), new SimpleRecord(2, "b")); Dataset inputDF = spark.createDataFrame(records, SimpleRecord.class); - inputDF.select("data", "id").write() - .mode("overwrite") - .insertInto("parquet_table"); + inputDF.select("data", "id").write().mode("overwrite").insertInto("parquet_table"); try { String stagingLocation = table.location() + "/metadata"; SparkTableUtil.importSparkTable( - spark, new org.apache.spark.sql.catalyst.TableIdentifier("parquet_table"), table, stagingLocation); + spark, + new org.apache.spark.sql.catalyst.TableIdentifier("parquet_table"), + table, + stagingLocation); - List actual = spark.read() - .format("iceberg") - .load(loadLocation(tableIdentifier, "entries")) - .select("sequence_number", "snapshot_id", "data_file") - .collectAsList(); + List actual = + spark + .read() + .format("iceberg") + .load(loadLocation(tableIdentifier, "entries")) + .select("sequence_number", "snapshot_id", "data_file") + .collectAsList(); table.refresh(); @@ -535,19 +580,24 @@ public void testFilesUnpartitionedTable() throws Exception { Table entriesTable = loadTable(tableIdentifier, "entries"); Table filesTable = loadTable(tableIdentifier, "files"); - Dataset df1 = spark.createDataFrame(Lists.newArrayList(new SimpleRecord(1, "a")), SimpleRecord.class); - Dataset df2 = spark.createDataFrame(Lists.newArrayList(new SimpleRecord(2, "b")), SimpleRecord.class); + Dataset df1 = + spark.createDataFrame(Lists.newArrayList(new SimpleRecord(1, "a")), SimpleRecord.class); + Dataset df2 = + spark.createDataFrame(Lists.newArrayList(new SimpleRecord(2, "b")), SimpleRecord.class); - df1.select("id", "data").write() + df1.select("id", "data") + .write() .format("iceberg") .mode("append") .save(loadLocation(tableIdentifier)); table.refresh(); - DataFile toDelete = Iterables.getOnlyElement(table.currentSnapshot().addedDataFiles(table.io())); + DataFile toDelete = + Iterables.getOnlyElement(table.currentSnapshot().addedDataFiles(table.io())); // add a second file - df2.select("id", "data").write() + df2.select("id", "data") + .write() .format("iceberg") .mode("append") .save(loadLocation(tableIdentifier)); @@ -555,15 +605,14 @@ public void testFilesUnpartitionedTable() 
throws Exception { // delete the first file to test that only live files are listed table.newDelete().deleteFile(toDelete).commit(); - List actual = spark.read() - .format("iceberg") - .load(loadLocation(tableIdentifier, "files")) - .collectAsList(); + List actual = + spark.read().format("iceberg").load(loadLocation(tableIdentifier, "files")).collectAsList(); List expected = Lists.newArrayList(); for (ManifestFile manifest : table.currentSnapshot().dataManifests(table.io())) { InputFile in = table.io().newInputFile(manifest.path()); - try (CloseableIterable rows = Avro.read(in).project(entriesTable.schema()).build()) { + try (CloseableIterable rows = + Avro.read(in).project(entriesTable.schema()).build()) { for (GenericData.Record record : rows) { if ((Integer) record.get("status") < 2 /* added or existing */) { GenericData.Record file = (GenericData.Record) record.get("data_file"); @@ -586,38 +635,49 @@ public void testAllMetadataTablesWithStagedCommits() throws Exception { table.updateProperties().set(TableProperties.WRITE_AUDIT_PUBLISH_ENABLED, "true").commit(); spark.conf().set("spark.wap.id", "1234567"); - Dataset df1 = spark.createDataFrame(Lists.newArrayList(new SimpleRecord(1, "a")), SimpleRecord.class); - Dataset df2 = spark.createDataFrame(Lists.newArrayList(new SimpleRecord(2, "b")), SimpleRecord.class); + Dataset df1 = + spark.createDataFrame(Lists.newArrayList(new SimpleRecord(1, "a")), SimpleRecord.class); + Dataset df2 = + spark.createDataFrame(Lists.newArrayList(new SimpleRecord(2, "b")), SimpleRecord.class); - df1.select("id", "data").write() + df1.select("id", "data") + .write() .format("iceberg") .mode("append") .save(loadLocation(tableIdentifier)); // add a second file - df2.select("id", "data").write() + df2.select("id", "data") + .write() .format("iceberg") .mode("append") .save(loadLocation(tableIdentifier)); - List actualAllData = spark.read() - .format("iceberg") - .load(loadLocation(tableIdentifier, "all_data_files")) - .collectAsList(); - - List actualAllManifests = spark.read() - .format("iceberg") - .load(loadLocation(tableIdentifier, "all_manifests")) - .collectAsList(); - - List actualAllEntries = spark.read() - .format("iceberg") - .load(loadLocation(tableIdentifier, "all_entries")) - .collectAsList(); - - Assert.assertTrue("Stage table should have some snapshots", table.snapshots().iterator().hasNext()); - Assert.assertEquals("Stage table should have null currentSnapshot", - null, table.currentSnapshot()); + List actualAllData = + spark + .read() + .format("iceberg") + .load(loadLocation(tableIdentifier, "all_data_files")) + .collectAsList(); + + List actualAllManifests = + spark + .read() + .format("iceberg") + .load(loadLocation(tableIdentifier, "all_manifests")) + .collectAsList(); + + List actualAllEntries = + spark + .read() + .format("iceberg") + .load(loadLocation(tableIdentifier, "all_entries")) + .collectAsList(); + + Assert.assertTrue( + "Stage table should have some snapshots", table.snapshots().iterator().hasNext()); + Assert.assertEquals( + "Stage table should have null currentSnapshot", null, table.currentSnapshot()); Assert.assertEquals("Actual results should have two rows", 2, actualAllData.size()); Assert.assertEquals("Actual results should have two rows", 2, actualAllManifests.size()); Assert.assertEquals("Actual results should have two rows", 2, actualAllEntries.size()); @@ -630,10 +690,13 @@ public void testAllDataFilesTable() throws Exception { Table entriesTable = loadTable(tableIdentifier, "entries"); Table filesTable = 
loadTable(tableIdentifier, "all_data_files"); - Dataset df1 = spark.createDataFrame(Lists.newArrayList(new SimpleRecord(1, "a")), SimpleRecord.class); - Dataset df2 = spark.createDataFrame(Lists.newArrayList(new SimpleRecord(2, "b")), SimpleRecord.class); + Dataset df1 = + spark.createDataFrame(Lists.newArrayList(new SimpleRecord(1, "a")), SimpleRecord.class); + Dataset df2 = + spark.createDataFrame(Lists.newArrayList(new SimpleRecord(2, "b")), SimpleRecord.class); - df1.select("id", "data").write() + df1.select("id", "data") + .write() .format("iceberg") .mode("append") .save(loadLocation(tableIdentifier)); @@ -642,7 +705,8 @@ public void testAllDataFilesTable() throws Exception { table.newDelete().deleteFromRowFilter(Expressions.equal("id", 1)).commit(); // add a second file - df2.select("id", "data").write() + df2.select("id", "data") + .write() .format("iceberg") .mode("append") .save(loadLocation(tableIdentifier)); @@ -650,19 +714,23 @@ public void testAllDataFilesTable() throws Exception { // ensure table data isn't stale table.refresh(); - List actual = spark.read() - .format("iceberg") - .load(loadLocation(tableIdentifier, "all_data_files")) - .orderBy("file_path") - .collectAsList(); + List actual = + spark + .read() + .format("iceberg") + .load(loadLocation(tableIdentifier, "all_data_files")) + .orderBy("file_path") + .collectAsList(); actual.sort(Comparator.comparing(o -> o.getString(1))); List expected = Lists.newArrayList(); - Iterable dataManifests = Iterables.concat(Iterables.transform(table.snapshots(), - snapshot -> snapshot.dataManifests(table.io()))); + Iterable dataManifests = + Iterables.concat( + Iterables.transform(table.snapshots(), snapshot -> snapshot.dataManifests(table.io()))); for (ManifestFile manifest : dataManifests) { InputFile in = table.io().newInputFile(manifest.path()); - try (CloseableIterable rows = Avro.read(in).project(entriesTable.schema()).build()) { + try (CloseableIterable rows = + Avro.read(in).project(entriesTable.schema()).build()) { for (GenericData.Record record : rows) { if ((Integer) record.get("status") < 2 /* added or existing */) { GenericData.Record file = (GenericData.Record) record.get("data_file"); @@ -691,7 +759,9 @@ public void testHistoryTable() { List records = Lists.newArrayList(new SimpleRecord(1, "1")); Dataset inputDf = spark.createDataFrame(records, SimpleRecord.class); - inputDf.select("id", "data").write() + inputDf + .select("id", "data") + .write() .format("iceberg") .mode("append") .save(loadLocation(tableIdentifier)); @@ -700,7 +770,9 @@ public void testHistoryTable() { long firstSnapshotTimestamp = table.currentSnapshot().timestampMillis(); long firstSnapshotId = table.currentSnapshot().snapshotId(); - inputDf.select("id", "data").write() + inputDf + .select("id", "data") + .write() .format("iceberg") .mode("append") .save(loadLocation(tableIdentifier)); @@ -713,7 +785,9 @@ public void testHistoryTable() { table.rollback().toSnapshotId(firstSnapshotId).commit(); long rollbackTimestamp = Iterables.getLast(table.history()).timestampMillis(); - inputDf.select("id", "data").write() + inputDf + .select("id", "data") + .write() .format("iceberg") .mode("append") .save(loadLocation(tableIdentifier)); @@ -722,34 +796,43 @@ public void testHistoryTable() { long thirdSnapshotTimestamp = table.currentSnapshot().timestampMillis(); long thirdSnapshotId = table.currentSnapshot().snapshotId(); - List actual = spark.read() - .format("iceberg") - .load(loadLocation(tableIdentifier, "history")) - .collectAsList(); - - 
GenericRecordBuilder builder = new GenericRecordBuilder(AvroSchemaUtil.convert(historyTable.schema(), "history")); - List expected = Lists.newArrayList( - builder.set("made_current_at", firstSnapshotTimestamp * 1000) - .set("snapshot_id", firstSnapshotId) - .set("parent_id", null) - .set("is_current_ancestor", true) - .build(), - builder.set("made_current_at", secondSnapshotTimestamp * 1000) - .set("snapshot_id", secondSnapshotId) - .set("parent_id", firstSnapshotId) - .set("is_current_ancestor", false) // commit rolled back, not an ancestor of the current table state - .build(), - builder.set("made_current_at", rollbackTimestamp * 1000) - .set("snapshot_id", firstSnapshotId) - .set("parent_id", null) - .set("is_current_ancestor", true) - .build(), - builder.set("made_current_at", thirdSnapshotTimestamp * 1000) - .set("snapshot_id", thirdSnapshotId) - .set("parent_id", firstSnapshotId) - .set("is_current_ancestor", true) - .build() - ); + List actual = + spark + .read() + .format("iceberg") + .load(loadLocation(tableIdentifier, "history")) + .collectAsList(); + + GenericRecordBuilder builder = + new GenericRecordBuilder(AvroSchemaUtil.convert(historyTable.schema(), "history")); + List expected = + Lists.newArrayList( + builder + .set("made_current_at", firstSnapshotTimestamp * 1000) + .set("snapshot_id", firstSnapshotId) + .set("parent_id", null) + .set("is_current_ancestor", true) + .build(), + builder + .set("made_current_at", secondSnapshotTimestamp * 1000) + .set("snapshot_id", secondSnapshotId) + .set("parent_id", firstSnapshotId) + .set( + "is_current_ancestor", + false) // commit rolled back, not an ancestor of the current table state + .build(), + builder + .set("made_current_at", rollbackTimestamp * 1000) + .set("snapshot_id", firstSnapshotId) + .set("parent_id", null) + .set("is_current_ancestor", true) + .build(), + builder + .set("made_current_at", thirdSnapshotTimestamp * 1000) + .set("snapshot_id", thirdSnapshotId) + .set("parent_id", firstSnapshotId) + .set("is_current_ancestor", true) + .build()); Assert.assertEquals("History table should have a row for each commit", 4, actual.size()); TestHelpers.assertEqualsSafe(historyTable.schema().asStruct(), expected.get(0), actual.get(0)); @@ -767,7 +850,9 @@ public void testSnapshotsTable() { List records = Lists.newArrayList(new SimpleRecord(1, "1")); Dataset inputDf = spark.createDataFrame(records, SimpleRecord.class); - inputDf.select("id", "data").write() + inputDf + .select("id", "data") + .write() .format("iceberg") .mode("append") .save(loadLocation(tableIdentifier)); @@ -786,40 +871,47 @@ public void testSnapshotsTable() { // rollback the table state to the first snapshot table.rollback().toSnapshotId(firstSnapshotId).commit(); - List actual = spark.read() - .format("iceberg") - .load(loadLocation(tableIdentifier, "snapshots")) - .collectAsList(); - - GenericRecordBuilder builder = new GenericRecordBuilder(AvroSchemaUtil.convert(snapTable.schema(), "snapshots")); - List expected = Lists.newArrayList( - builder.set("committed_at", firstSnapshotTimestamp * 1000) - .set("snapshot_id", firstSnapshotId) - .set("parent_id", null) - .set("operation", "append") - .set("manifest_list", firstManifestList) - .set("summary", ImmutableMap.of( - "added-records", "1", - "added-data-files", "1", - "changed-partition-count", "1", - "total-data-files", "1", - "total-records", "1" - )) - .build(), - builder.set("committed_at", secondSnapshotTimestamp * 1000) - .set("snapshot_id", secondSnapshotId) - .set("parent_id", firstSnapshotId) - 
.set("operation", "delete") - .set("manifest_list", secondManifestList) - .set("summary", ImmutableMap.of( - "deleted-records", "1", - "deleted-data-files", "1", - "changed-partition-count", "1", - "total-records", "0", - "total-data-files", "0" - )) - .build() - ); + List actual = + spark + .read() + .format("iceberg") + .load(loadLocation(tableIdentifier, "snapshots")) + .collectAsList(); + + GenericRecordBuilder builder = + new GenericRecordBuilder(AvroSchemaUtil.convert(snapTable.schema(), "snapshots")); + List expected = + Lists.newArrayList( + builder + .set("committed_at", firstSnapshotTimestamp * 1000) + .set("snapshot_id", firstSnapshotId) + .set("parent_id", null) + .set("operation", "append") + .set("manifest_list", firstManifestList) + .set( + "summary", + ImmutableMap.of( + "added-records", "1", + "added-data-files", "1", + "changed-partition-count", "1", + "total-data-files", "1", + "total-records", "1")) + .build(), + builder + .set("committed_at", secondSnapshotTimestamp * 1000) + .set("snapshot_id", secondSnapshotId) + .set("parent_id", firstSnapshotId) + .set("operation", "delete") + .set("manifest_list", secondManifestList) + .set( + "summary", + ImmutableMap.of( + "deleted-records", "1", + "deleted-data-files", "1", + "changed-partition-count", "1", + "total-records", "0", + "total-data-files", "0")) + .build()); Assert.assertEquals("Snapshots table should have a row for each snapshot", 2, actual.size()); TestHelpers.assertEqualsSafe(snapTable.schema().asStruct(), expected.get(0), actual.get(0)); @@ -834,7 +926,9 @@ public void testPrunedSnapshotsTable() { List records = Lists.newArrayList(new SimpleRecord(1, "1")); Dataset inputDf = spark.createDataFrame(records, SimpleRecord.class); - inputDf.select("id", "data").write() + inputDf + .select("id", "data") + .write() .format("iceberg") .mode("append") .save(loadLocation(tableIdentifier)); @@ -850,40 +944,47 @@ public void testPrunedSnapshotsTable() { // rollback the table state to the first snapshot table.rollback().toSnapshotId(firstSnapshotId).commit(); - Dataset actualDf = spark.read() - .format("iceberg") - .load(loadLocation(tableIdentifier, "snapshots")) - .select("operation", "committed_at", "summary", "parent_id"); + Dataset actualDf = + spark + .read() + .format("iceberg") + .load(loadLocation(tableIdentifier, "snapshots")) + .select("operation", "committed_at", "summary", "parent_id"); Schema projectedSchema = SparkSchemaUtil.convert(actualDf.schema()); List actual = actualDf.collectAsList(); - GenericRecordBuilder builder = new GenericRecordBuilder(AvroSchemaUtil.convert(projectedSchema, "snapshots")); - List expected = Lists.newArrayList( - builder.set("committed_at", firstSnapshotTimestamp * 1000) - .set("parent_id", null) - .set("operation", "append") - .set("summary", ImmutableMap.of( - "added-records", "1", - "added-data-files", "1", - "changed-partition-count", "1", - "total-data-files", "1", - "total-records", "1" - )) - .build(), - builder.set("committed_at", secondSnapshotTimestamp * 1000) - .set("parent_id", firstSnapshotId) - .set("operation", "delete") - .set("summary", ImmutableMap.of( - "deleted-records", "1", - "deleted-data-files", "1", - "changed-partition-count", "1", - "total-records", "0", - "total-data-files", "0" - )) - .build() - ); + GenericRecordBuilder builder = + new GenericRecordBuilder(AvroSchemaUtil.convert(projectedSchema, "snapshots")); + List expected = + Lists.newArrayList( + builder + .set("committed_at", firstSnapshotTimestamp * 1000) + .set("parent_id", null) + 
.set("operation", "append") + .set( + "summary", + ImmutableMap.of( + "added-records", "1", + "added-data-files", "1", + "changed-partition-count", "1", + "total-data-files", "1", + "total-records", "1")) + .build(), + builder + .set("committed_at", secondSnapshotTimestamp * 1000) + .set("parent_id", firstSnapshotId) + .set("operation", "delete") + .set( + "summary", + ImmutableMap.of( + "deleted-records", "1", + "deleted-data-files", "1", + "changed-partition-count", "1", + "total-records", "0", + "total-data-files", "0")) + .build()); Assert.assertEquals("Snapshots table should have a row for each snapshot", 2, actual.size()); TestHelpers.assertEqualsSafe(projectedSchema.asStruct(), expected.get(0), actual.get(0)); @@ -895,65 +996,88 @@ public void testManifestsTable() { TableIdentifier tableIdentifier = TableIdentifier.of("db", "manifests_test"); Table table = createTable(tableIdentifier, SCHEMA, SPEC); Table manifestTable = loadTable(tableIdentifier, "manifests"); - Dataset df1 = spark.createDataFrame( - Lists.newArrayList(new SimpleRecord(1, "a"), new SimpleRecord(null, "b")), SimpleRecord.class); + Dataset df1 = + spark.createDataFrame( + Lists.newArrayList(new SimpleRecord(1, "a"), new SimpleRecord(null, "b")), + SimpleRecord.class); - df1.select("id", "data").write() + df1.select("id", "data") + .write() .format("iceberg") .mode("append") .save(loadLocation(tableIdentifier)); - table.updateProperties() - .set(TableProperties.FORMAT_VERSION, "2") - .commit(); + table.updateProperties().set(TableProperties.FORMAT_VERSION, "2").commit(); - DataFile dataFile = Iterables.getFirst(table.currentSnapshot().addedDataFiles(table.io()), null); + DataFile dataFile = + Iterables.getFirst(table.currentSnapshot().addedDataFiles(table.io()), null); PartitionSpec dataFileSpec = table.specs().get(dataFile.specId()); StructLike dataFilePartition = dataFile.partition(); PositionDelete delete = PositionDelete.create(); delete.set(dataFile.path(), 0L, null); - DeleteFile deleteFile = writePositionDeletes(table, dataFileSpec, dataFilePartition, ImmutableList.of(delete)); + DeleteFile deleteFile = + writePositionDeletes(table, dataFileSpec, dataFilePartition, ImmutableList.of(delete)); - table.newRowDelta() - .addDeletes(deleteFile) - .commit(); + table.newRowDelta().addDeletes(deleteFile).commit(); - List actual = spark.read() - .format("iceberg") - .load(loadLocation(tableIdentifier, "manifests")) - .collectAsList(); + List actual = + spark + .read() + .format("iceberg") + .load(loadLocation(tableIdentifier, "manifests")) + .collectAsList(); table.refresh(); - GenericRecordBuilder builder = new GenericRecordBuilder(AvroSchemaUtil.convert( - manifestTable.schema(), "manifests")); - GenericRecordBuilder summaryBuilder = new GenericRecordBuilder(AvroSchemaUtil.convert( - manifestTable.schema().findType("partition_summaries.element").asStructType(), "partition_summary")); - List expected = Lists.transform(table.currentSnapshot().allManifests(table.io()), manifest -> - builder - .set("content", manifest.content().id()) - .set("path", manifest.path()) - .set("length", manifest.length()) - .set("partition_spec_id", manifest.partitionSpecId()) - .set("added_snapshot_id", manifest.snapshotId()) - .set("added_data_files_count", manifest.content() == DATA ? manifest.addedFilesCount() : 0) - .set("existing_data_files_count", manifest.content() == DATA ? manifest.existingFilesCount() : 0) - .set("deleted_data_files_count", manifest.content() == DATA ? 
manifest.deletedFilesCount() : 0) - .set("added_delete_files_count", manifest.content() == DELETES ? manifest.addedFilesCount() : 0) - .set("existing_delete_files_count", manifest.content() == DELETES ? manifest.existingFilesCount() : 0) - .set("deleted_delete_files_count", manifest.content() == DELETES ? manifest.deletedFilesCount() : 0) - .set("partition_summaries", Lists.transform(manifest.partitions(), partition -> - summaryBuilder - .set("contains_null", manifest.content() == DATA) - .set("contains_nan", false) - .set("lower_bound", "1") - .set("upper_bound", "1") - .build() - )) - .build() - ); + GenericRecordBuilder builder = + new GenericRecordBuilder(AvroSchemaUtil.convert(manifestTable.schema(), "manifests")); + GenericRecordBuilder summaryBuilder = + new GenericRecordBuilder( + AvroSchemaUtil.convert( + manifestTable.schema().findType("partition_summaries.element").asStructType(), + "partition_summary")); + List expected = + Lists.transform( + table.currentSnapshot().allManifests(table.io()), + manifest -> + builder + .set("content", manifest.content().id()) + .set("path", manifest.path()) + .set("length", manifest.length()) + .set("partition_spec_id", manifest.partitionSpecId()) + .set("added_snapshot_id", manifest.snapshotId()) + .set( + "added_data_files_count", + manifest.content() == DATA ? manifest.addedFilesCount() : 0) + .set( + "existing_data_files_count", + manifest.content() == DATA ? manifest.existingFilesCount() : 0) + .set( + "deleted_data_files_count", + manifest.content() == DATA ? manifest.deletedFilesCount() : 0) + .set( + "added_delete_files_count", + manifest.content() == DELETES ? manifest.addedFilesCount() : 0) + .set( + "existing_delete_files_count", + manifest.content() == DELETES ? manifest.existingFilesCount() : 0) + .set( + "deleted_delete_files_count", + manifest.content() == DELETES ? 
manifest.deletedFilesCount() : 0) + .set( + "partition_summaries", + Lists.transform( + manifest.partitions(), + partition -> + summaryBuilder + .set("contains_null", manifest.content() == DATA) + .set("contains_nan", false) + .set("lower_bound", "1") + .set("upper_bound", "1") + .build())) + .build()); Assert.assertEquals("Manifests table should have two manifest rows", 2, actual.size()); TestHelpers.assertEqualsSafe(manifestTable.schema().asStruct(), expected.get(0), actual.get(0)); @@ -965,56 +1089,77 @@ public void testPruneManifestsTable() { TableIdentifier tableIdentifier = TableIdentifier.of("db", "manifests_test"); Table table = createTable(tableIdentifier, SCHEMA, SPEC); Table manifestTable = loadTable(tableIdentifier, "manifests"); - Dataset df1 = spark.createDataFrame( - Lists.newArrayList(new SimpleRecord(1, "a"), new SimpleRecord(null, "b")), SimpleRecord.class); + Dataset df1 = + spark.createDataFrame( + Lists.newArrayList(new SimpleRecord(1, "a"), new SimpleRecord(null, "b")), + SimpleRecord.class); - df1.select("id", "data").write() + df1.select("id", "data") + .write() .format("iceberg") .mode("append") .save(loadLocation(tableIdentifier)); if (!spark.version().startsWith("2")) { // Spark 2 isn't able to actually push down nested struct projections so this will not break - AssertHelpers.assertThrows("Can't prune struct inside list", SparkException.class, + AssertHelpers.assertThrows( + "Can't prune struct inside list", + SparkException.class, "Cannot project a partial list element struct", - () -> spark.read() - .format("iceberg") - .load(loadLocation(tableIdentifier, "manifests")) - .select("partition_spec_id", "path", "partition_summaries.contains_null") - .collectAsList()); + () -> + spark + .read() + .format("iceberg") + .load(loadLocation(tableIdentifier, "manifests")) + .select("partition_spec_id", "path", "partition_summaries.contains_null") + .collectAsList()); } - Dataset actualDf = spark.read() - .format("iceberg") - .load(loadLocation(tableIdentifier, "manifests")) - .select("partition_spec_id", "path", "partition_summaries"); + Dataset actualDf = + spark + .read() + .format("iceberg") + .load(loadLocation(tableIdentifier, "manifests")) + .select("partition_spec_id", "path", "partition_summaries"); Schema projectedSchema = SparkSchemaUtil.convert(actualDf.schema()); - List actual = spark.read() - .format("iceberg") - .load(loadLocation(tableIdentifier, "manifests")) - .select("partition_spec_id", "path", "partition_summaries") - .collectAsList(); + List actual = + spark + .read() + .format("iceberg") + .load(loadLocation(tableIdentifier, "manifests")) + .select("partition_spec_id", "path", "partition_summaries") + .collectAsList(); table.refresh(); - GenericRecordBuilder builder = new GenericRecordBuilder(AvroSchemaUtil.convert(projectedSchema.asStruct())); - GenericRecordBuilder summaryBuilder = new GenericRecordBuilder(AvroSchemaUtil.convert( - projectedSchema.findType("partition_summaries.element").asStructType(), "partition_summary")); - List expected = Lists.transform(table.currentSnapshot().allManifests(table.io()), manifest -> - builder.set("partition_spec_id", manifest.partitionSpecId()) - .set("path", manifest.path()) - .set("partition_summaries", Lists.transform(manifest.partitions(), partition -> - summaryBuilder - .set("contains_null", true) - .set("contains_nan", false) - .set("lower_bound", "1") - .set("upper_bound", "1") - .build() - )) - .build() - ); + GenericRecordBuilder builder = + new 
GenericRecordBuilder(AvroSchemaUtil.convert(projectedSchema.asStruct())); + GenericRecordBuilder summaryBuilder = + new GenericRecordBuilder( + AvroSchemaUtil.convert( + projectedSchema.findType("partition_summaries.element").asStructType(), + "partition_summary")); + List expected = + Lists.transform( + table.currentSnapshot().allManifests(table.io()), + manifest -> + builder + .set("partition_spec_id", manifest.partitionSpecId()) + .set("path", manifest.path()) + .set( + "partition_summaries", + Lists.transform( + manifest.partitions(), + partition -> + summaryBuilder + .set("contains_null", true) + .set("contains_nan", false) + .set("lower_bound", "1") + .set("upper_bound", "1") + .build())) + .build()); Assert.assertEquals("Manifests table should have one manifest row", 1, actual.size()); TestHelpers.assertEqualsSafe(projectedSchema.asStruct(), expected.get(0), actual.get(0)); @@ -1025,53 +1170,62 @@ public void testAllManifestsTable() { TableIdentifier tableIdentifier = TableIdentifier.of("db", "manifests_test"); Table table = createTable(tableIdentifier, SCHEMA, SPEC); Table manifestTable = loadTable(tableIdentifier, "all_manifests"); - Dataset df1 = spark.createDataFrame(Lists.newArrayList(new SimpleRecord(1, "a")), SimpleRecord.class); + Dataset df1 = + spark.createDataFrame(Lists.newArrayList(new SimpleRecord(1, "a")), SimpleRecord.class); - df1.select("id", "data").write() + df1.select("id", "data") + .write() .format("iceberg") .mode("append") .save(loadLocation(tableIdentifier)); - table.updateProperties() - .set(TableProperties.FORMAT_VERSION, "2") - .commit(); + table.updateProperties().set(TableProperties.FORMAT_VERSION, "2").commit(); - DataFile dataFile = Iterables.getFirst(table.currentSnapshot().addedDataFiles(table.io()), null); + DataFile dataFile = + Iterables.getFirst(table.currentSnapshot().addedDataFiles(table.io()), null); PartitionSpec dataFileSpec = table.specs().get(dataFile.specId()); StructLike dataFilePartition = dataFile.partition(); PositionDelete delete = PositionDelete.create(); delete.set(dataFile.path(), 0L, null); - DeleteFile deleteFile = writePositionDeletes(table, dataFileSpec, dataFilePartition, ImmutableList.of(delete)); + DeleteFile deleteFile = + writePositionDeletes(table, dataFileSpec, dataFilePartition, ImmutableList.of(delete)); - table.newRowDelta() - .addDeletes(deleteFile) - .commit(); + table.newRowDelta().addDeletes(deleteFile).commit(); table.newDelete().deleteFromRowFilter(Expressions.alwaysTrue()).commit(); Stream> snapshotIdToManifests = StreamSupport.stream(table.snapshots().spliterator(), false) - .flatMap(snapshot -> snapshot.allManifests(table.io()).stream().map( - manifest -> Pair.of(snapshot.snapshotId(), manifest))); - - List actual = spark.read() - .format("iceberg") - .load(loadLocation(tableIdentifier, "all_manifests")) - .orderBy("path") - .collectAsList(); + .flatMap( + snapshot -> + snapshot.allManifests(table.io()).stream() + .map(manifest -> Pair.of(snapshot.snapshotId(), manifest))); + + List actual = + spark + .read() + .format("iceberg") + .load(loadLocation(tableIdentifier, "all_manifests")) + .orderBy("path") + .collectAsList(); table.refresh(); - List expected = snapshotIdToManifests - .map(snapshotManifest -> manifestRecord(manifestTable, snapshotManifest.first(), snapshotManifest.second())) - .collect(Collectors.toList()); + List expected = + snapshotIdToManifests + .map( + snapshotManifest -> + manifestRecord( + manifestTable, snapshotManifest.first(), snapshotManifest.second())) + 
.collect(Collectors.toList()); expected.sort(Comparator.comparing(o -> o.get("path").toString())); Assert.assertEquals("Manifests table should have 5 manifest rows", 5, actual.size()); for (int i = 0; i < expected.size(); i += 1) { - TestHelpers.assertEqualsSafe(manifestTable.schema().asStruct(), expected.get(i), actual.get(i)); + TestHelpers.assertEqualsSafe( + manifestTable.schema().asStruct(), expected.get(i), actual.get(i)); } } @@ -1080,33 +1234,37 @@ public void testUnpartitionedPartitionsTable() { TableIdentifier tableIdentifier = TableIdentifier.of("db", "unpartitioned_partitions_test"); createTable(tableIdentifier, SCHEMA, PartitionSpec.unpartitioned()); - Dataset df = spark.createDataFrame(Lists.newArrayList(new SimpleRecord(1, "a")), SimpleRecord.class); + Dataset df = + spark.createDataFrame(Lists.newArrayList(new SimpleRecord(1, "a")), SimpleRecord.class); - df.select("id", "data").write() + df.select("id", "data") + .write() .format("iceberg") .mode("append") .save(loadLocation(tableIdentifier)); - Types.StructType expectedSchema = Types.StructType.of( - required(2, "record_count", Types.LongType.get()), - required(3, "file_count", Types.IntegerType.get())); + Types.StructType expectedSchema = + Types.StructType.of( + required(2, "record_count", Types.LongType.get()), + required(3, "file_count", Types.IntegerType.get())); Table partitionsTable = loadTable(tableIdentifier, "partitions"); - Assert.assertEquals("Schema should not have partition field", - expectedSchema, partitionsTable.schema().asStruct()); + Assert.assertEquals( + "Schema should not have partition field", + expectedSchema, + partitionsTable.schema().asStruct()); - GenericRecordBuilder builder = new GenericRecordBuilder(AvroSchemaUtil.convert( - partitionsTable.schema(), "partitions")); - GenericData.Record expectedRow = builder - .set("record_count", 1L) - .set("file_count", 1) - .build(); + GenericRecordBuilder builder = + new GenericRecordBuilder(AvroSchemaUtil.convert(partitionsTable.schema(), "partitions")); + GenericData.Record expectedRow = builder.set("record_count", 1L).set("file_count", 1).build(); - List actual = spark.read() - .format("iceberg") - .load(loadLocation(tableIdentifier, "partitions")) - .collectAsList(); + List actual = + spark + .read() + .format("iceberg") + .load(loadLocation(tableIdentifier, "partitions")) + .collectAsList(); Assert.assertEquals("Unpartitioned partitions table should have one row", 1, actual.size()); TestHelpers.assertEqualsSafe(expectedSchema, expectedRow, actual.get(0)); @@ -1117,10 +1275,13 @@ public void testPartitionsTable() { TableIdentifier tableIdentifier = TableIdentifier.of("db", "partitions_test"); Table table = createTable(tableIdentifier, SCHEMA, SPEC); Table partitionsTable = loadTable(tableIdentifier, "partitions"); - Dataset df1 = spark.createDataFrame(Lists.newArrayList(new SimpleRecord(1, "a")), SimpleRecord.class); - Dataset df2 = spark.createDataFrame(Lists.newArrayList(new SimpleRecord(2, "b")), SimpleRecord.class); + Dataset df1 = + spark.createDataFrame(Lists.newArrayList(new SimpleRecord(1, "a")), SimpleRecord.class); + Dataset df2 = + spark.createDataFrame(Lists.newArrayList(new SimpleRecord(2, "b")), SimpleRecord.class); - df1.select("id", "data").write() + df1.select("id", "data") + .write() .format("iceberg") .mode("append") .save(loadLocation(tableIdentifier)); @@ -1129,69 +1290,86 @@ public void testPartitionsTable() { long firstCommitId = table.currentSnapshot().snapshotId(); // add a second file - df2.select("id", "data").write() + 
df2.select("id", "data") + .write() .format("iceberg") .mode("append") .save(loadLocation(tableIdentifier)); - List actual = spark.read() - .format("iceberg") - .load(loadLocation(tableIdentifier, "partitions")) - .orderBy("partition.id") - .collectAsList(); - - GenericRecordBuilder builder = new GenericRecordBuilder(AvroSchemaUtil.convert( - partitionsTable.schema(), "partitions")); - GenericRecordBuilder partitionBuilder = new GenericRecordBuilder(AvroSchemaUtil.convert( - partitionsTable.schema().findType("partition").asStructType(), "partition")); + List actual = + spark + .read() + .format("iceberg") + .load(loadLocation(tableIdentifier, "partitions")) + .orderBy("partition.id") + .collectAsList(); + + GenericRecordBuilder builder = + new GenericRecordBuilder(AvroSchemaUtil.convert(partitionsTable.schema(), "partitions")); + GenericRecordBuilder partitionBuilder = + new GenericRecordBuilder( + AvroSchemaUtil.convert( + partitionsTable.schema().findType("partition").asStructType(), "partition")); List expected = Lists.newArrayList(); - expected.add(builder - .set("partition", partitionBuilder.set("id", 1).build()) - .set("record_count", 1L) - .set("file_count", 1) - .set("spec_id", 0) - .build()); - expected.add(builder - .set("partition", partitionBuilder.set("id", 2).build()) - .set("record_count", 1L) - .set("file_count", 1) - .set("spec_id", 0) - .build()); + expected.add( + builder + .set("partition", partitionBuilder.set("id", 1).build()) + .set("record_count", 1L) + .set("file_count", 1) + .set("spec_id", 0) + .build()); + expected.add( + builder + .set("partition", partitionBuilder.set("id", 2).build()) + .set("record_count", 1L) + .set("file_count", 1) + .set("spec_id", 0) + .build()); Assert.assertEquals("Partitions table should have two rows", 2, expected.size()); Assert.assertEquals("Actual results should have two rows", 2, actual.size()); for (int i = 0; i < 2; i += 1) { - TestHelpers.assertEqualsSafe(partitionsTable.schema().asStruct(), expected.get(i), actual.get(i)); + TestHelpers.assertEqualsSafe( + partitionsTable.schema().asStruct(), expected.get(i), actual.get(i)); } // check time travel - List actualAfterFirstCommit = spark.read() - .format("iceberg") - .option(SparkReadOptions.SNAPSHOT_ID, String.valueOf(firstCommitId)) - .load(loadLocation(tableIdentifier, "partitions")) - .orderBy("partition.id") - .collectAsList(); + List actualAfterFirstCommit = + spark + .read() + .format("iceberg") + .option(SparkReadOptions.SNAPSHOT_ID, String.valueOf(firstCommitId)) + .load(loadLocation(tableIdentifier, "partitions")) + .orderBy("partition.id") + .collectAsList(); Assert.assertEquals("Actual results should have one row", 1, actualAfterFirstCommit.size()); - TestHelpers.assertEqualsSafe(partitionsTable.schema().asStruct(), expected.get(0), actualAfterFirstCommit.get(0)); + TestHelpers.assertEqualsSafe( + partitionsTable.schema().asStruct(), expected.get(0), actualAfterFirstCommit.get(0)); // check predicate push down - List filtered = spark.read() - .format("iceberg") - .load(loadLocation(tableIdentifier, "partitions")) - .filter("partition.id < 2") - .collectAsList(); + List filtered = + spark + .read() + .format("iceberg") + .load(loadLocation(tableIdentifier, "partitions")) + .filter("partition.id < 2") + .collectAsList(); Assert.assertEquals("Actual results should have one row", 1, filtered.size()); - TestHelpers.assertEqualsSafe(partitionsTable.schema().asStruct(), expected.get(0), filtered.get(0)); - - List nonFiltered = spark.read() - .format("iceberg") - 
.load(loadLocation(tableIdentifier, "partitions")) - .filter("partition.id < 2 or record_count=1") - .collectAsList(); + TestHelpers.assertEqualsSafe( + partitionsTable.schema().asStruct(), expected.get(0), filtered.get(0)); + + List nonFiltered = + spark + .read() + .format("iceberg") + .load(loadLocation(tableIdentifier, "partitions")) + .filter("partition.id < 2 or record_count=1") + .collectAsList(); Assert.assertEquals("Actual results should have one row", 2, nonFiltered.size()); for (int i = 0; i < 2; i += 1) { - TestHelpers.assertEqualsSafe(partitionsTable.schema().asStruct(), expected.get(i), actual.get(i)); + TestHelpers.assertEqualsSafe( + partitionsTable.schema().asStruct(), expected.get(i), actual.get(i)); } } @@ -1200,62 +1378,63 @@ public synchronized void testSnapshotReadAfterAddColumn() { TableIdentifier tableIdentifier = TableIdentifier.of("db", "table"); Table table = createTable(tableIdentifier, SCHEMA, PartitionSpec.unpartitioned()); - List originalRecords = Lists.newArrayList( - RowFactory.create(1, "x"), - RowFactory.create(2, "y"), - RowFactory.create(3, "z")); + List originalRecords = + Lists.newArrayList( + RowFactory.create(1, "x"), RowFactory.create(2, "y"), RowFactory.create(3, "z")); StructType originalSparkSchema = SparkSchemaUtil.convert(SCHEMA); Dataset inputDf = spark.createDataFrame(originalRecords, originalSparkSchema); - inputDf.select("id", "data").write() + inputDf + .select("id", "data") + .write() .format("iceberg") .mode(SaveMode.Append) .save(loadLocation(tableIdentifier)); table.refresh(); - Dataset resultDf = spark.read() - .format("iceberg") - .load(loadLocation(tableIdentifier)); - Assert.assertEquals("Records should match", originalRecords, - resultDf.orderBy("id").collectAsList()); + Dataset resultDf = spark.read().format("iceberg").load(loadLocation(tableIdentifier)); + Assert.assertEquals( + "Records should match", originalRecords, resultDf.orderBy("id").collectAsList()); Snapshot snapshotBeforeAddColumn = table.currentSnapshot(); table.updateSchema().addColumn("category", Types.StringType.get()).commit(); - List newRecords = Lists.newArrayList( - RowFactory.create(4, "xy", "B"), - RowFactory.create(5, "xyz", "C")); + List newRecords = + Lists.newArrayList(RowFactory.create(4, "xy", "B"), RowFactory.create(5, "xyz", "C")); StructType newSparkSchema = SparkSchemaUtil.convert(SCHEMA2); Dataset inputDf2 = spark.createDataFrame(newRecords, newSparkSchema); - inputDf2.select("id", "data", "category").write() + inputDf2 + .select("id", "data", "category") + .write() .format("iceberg") .mode(SaveMode.Append) .save(loadLocation(tableIdentifier)); table.refresh(); - List updatedRecords = Lists.newArrayList( - RowFactory.create(1, "x", null), - RowFactory.create(2, "y", null), - RowFactory.create(3, "z", null), - RowFactory.create(4, "xy", "B"), - RowFactory.create(5, "xyz", "C")); - - Dataset resultDf2 = spark.read() - .format("iceberg") - .load(loadLocation(tableIdentifier)); - Assert.assertEquals("Records should match", updatedRecords, - resultDf2.orderBy("id").collectAsList()); - - Dataset resultDf3 = spark.read() - .format("iceberg") - .option(SparkReadOptions.SNAPSHOT_ID, snapshotBeforeAddColumn.snapshotId()) - .load(loadLocation(tableIdentifier)); - Assert.assertEquals("Records should match", originalRecords, - resultDf3.orderBy("id").collectAsList()); + List updatedRecords = + Lists.newArrayList( + RowFactory.create(1, "x", null), + RowFactory.create(2, "y", null), + RowFactory.create(3, "z", null), + RowFactory.create(4, "xy", "B"), + 
RowFactory.create(5, "xyz", "C")); + + Dataset resultDf2 = spark.read().format("iceberg").load(loadLocation(tableIdentifier)); + Assert.assertEquals( + "Records should match", updatedRecords, resultDf2.orderBy("id").collectAsList()); + + Dataset resultDf3 = + spark + .read() + .format("iceberg") + .option(SparkReadOptions.SNAPSHOT_ID, snapshotBeforeAddColumn.snapshotId()) + .load(loadLocation(tableIdentifier)); + Assert.assertEquals( + "Records should match", originalRecords, resultDf3.orderBy("id").collectAsList()); Assert.assertEquals("Schemas should match", originalSparkSchema, resultDf3.schema()); } @@ -1264,72 +1443,76 @@ public synchronized void testSnapshotReadAfterDropColumn() { TableIdentifier tableIdentifier = TableIdentifier.of("db", "table"); Table table = createTable(tableIdentifier, SCHEMA2, PartitionSpec.unpartitioned()); - List originalRecords = Lists.newArrayList( - RowFactory.create(1, "x", "A"), - RowFactory.create(2, "y", "A"), - RowFactory.create(3, "z", "B")); + List originalRecords = + Lists.newArrayList( + RowFactory.create(1, "x", "A"), + RowFactory.create(2, "y", "A"), + RowFactory.create(3, "z", "B")); StructType originalSparkSchema = SparkSchemaUtil.convert(SCHEMA2); Dataset inputDf = spark.createDataFrame(originalRecords, originalSparkSchema); - inputDf.select("id", "data", "category").write() + inputDf + .select("id", "data", "category") + .write() .format("iceberg") .mode(SaveMode.Append) .save(loadLocation(tableIdentifier)); table.refresh(); - Dataset resultDf = spark.read() - .format("iceberg") - .load(loadLocation(tableIdentifier)); - Assert.assertEquals("Records should match", originalRecords, - resultDf.orderBy("id").collectAsList()); + Dataset resultDf = spark.read().format("iceberg").load(loadLocation(tableIdentifier)); + Assert.assertEquals( + "Records should match", originalRecords, resultDf.orderBy("id").collectAsList()); long tsBeforeDropColumn = waitUntilAfter(System.currentTimeMillis()); table.updateSchema().deleteColumn("data").commit(); long tsAfterDropColumn = waitUntilAfter(System.currentTimeMillis()); - List newRecords = Lists.newArrayList( - RowFactory.create(4, "B"), - RowFactory.create(5, "C")); + List newRecords = Lists.newArrayList(RowFactory.create(4, "B"), RowFactory.create(5, "C")); StructType newSparkSchema = SparkSchemaUtil.convert(SCHEMA3); Dataset inputDf2 = spark.createDataFrame(newRecords, newSparkSchema); - inputDf2.select("id", "category").write() + inputDf2 + .select("id", "category") + .write() .format("iceberg") .mode(SaveMode.Append) .save(loadLocation(tableIdentifier)); table.refresh(); - List updatedRecords = Lists.newArrayList( - RowFactory.create(1, "A"), - RowFactory.create(2, "A"), - RowFactory.create(3, "B"), - RowFactory.create(4, "B"), - RowFactory.create(5, "C")); - - Dataset resultDf2 = spark.read() - .format("iceberg") - .load(loadLocation(tableIdentifier)); - Assert.assertEquals("Records should match", updatedRecords, - resultDf2.orderBy("id").collectAsList()); - - Dataset resultDf3 = spark.read() - .format("iceberg") - .option(SparkReadOptions.AS_OF_TIMESTAMP, tsBeforeDropColumn) - .load(loadLocation(tableIdentifier)); - Assert.assertEquals("Records should match", originalRecords, - resultDf3.orderBy("id").collectAsList()); + List updatedRecords = + Lists.newArrayList( + RowFactory.create(1, "A"), + RowFactory.create(2, "A"), + RowFactory.create(3, "B"), + RowFactory.create(4, "B"), + RowFactory.create(5, "C")); + + Dataset resultDf2 = spark.read().format("iceberg").load(loadLocation(tableIdentifier)); + 
Assert.assertEquals( + "Records should match", updatedRecords, resultDf2.orderBy("id").collectAsList()); + + Dataset resultDf3 = + spark + .read() + .format("iceberg") + .option(SparkReadOptions.AS_OF_TIMESTAMP, tsBeforeDropColumn) + .load(loadLocation(tableIdentifier)); + Assert.assertEquals( + "Records should match", originalRecords, resultDf3.orderBy("id").collectAsList()); Assert.assertEquals("Schemas should match", originalSparkSchema, resultDf3.schema()); // At tsAfterDropColumn, there has been a schema change, but no new snapshot, // so the snapshot as of tsAfterDropColumn is the same as that as of tsBeforeDropColumn. - Dataset resultDf4 = spark.read() - .format("iceberg") - .option(SparkReadOptions.AS_OF_TIMESTAMP, tsAfterDropColumn) - .load(loadLocation(tableIdentifier)); - Assert.assertEquals("Records should match", originalRecords, - resultDf4.orderBy("id").collectAsList()); + Dataset resultDf4 = + spark + .read() + .format("iceberg") + .option(SparkReadOptions.AS_OF_TIMESTAMP, tsAfterDropColumn) + .load(loadLocation(tableIdentifier)); + Assert.assertEquals( + "Records should match", originalRecords, resultDf4.orderBy("id").collectAsList()); Assert.assertEquals("Schemas should match", originalSparkSchema, resultDf4.schema()); } @@ -1338,77 +1521,77 @@ public synchronized void testSnapshotReadAfterAddAndDropColumn() { TableIdentifier tableIdentifier = TableIdentifier.of("db", "table"); Table table = createTable(tableIdentifier, SCHEMA, PartitionSpec.unpartitioned()); - List originalRecords = Lists.newArrayList( - RowFactory.create(1, "x"), - RowFactory.create(2, "y"), - RowFactory.create(3, "z")); + List originalRecords = + Lists.newArrayList( + RowFactory.create(1, "x"), RowFactory.create(2, "y"), RowFactory.create(3, "z")); StructType originalSparkSchema = SparkSchemaUtil.convert(SCHEMA); Dataset inputDf = spark.createDataFrame(originalRecords, originalSparkSchema); - inputDf.select("id", "data").write() + inputDf + .select("id", "data") + .write() .format("iceberg") .mode(SaveMode.Append) .save(loadLocation(tableIdentifier)); table.refresh(); - Dataset resultDf = spark.read() - .format("iceberg") - .load(loadLocation(tableIdentifier)); - Assert.assertEquals("Records should match", originalRecords, - resultDf.orderBy("id").collectAsList()); + Dataset resultDf = spark.read().format("iceberg").load(loadLocation(tableIdentifier)); + Assert.assertEquals( + "Records should match", originalRecords, resultDf.orderBy("id").collectAsList()); Snapshot snapshotBeforeAddColumn = table.currentSnapshot(); table.updateSchema().addColumn("category", Types.StringType.get()).commit(); - List newRecords = Lists.newArrayList( - RowFactory.create(4, "xy", "B"), - RowFactory.create(5, "xyz", "C")); + List newRecords = + Lists.newArrayList(RowFactory.create(4, "xy", "B"), RowFactory.create(5, "xyz", "C")); StructType sparkSchemaAfterAddColumn = SparkSchemaUtil.convert(SCHEMA2); Dataset inputDf2 = spark.createDataFrame(newRecords, sparkSchemaAfterAddColumn); - inputDf2.select("id", "data", "category").write() + inputDf2 + .select("id", "data", "category") + .write() .format("iceberg") .mode(SaveMode.Append) .save(loadLocation(tableIdentifier)); table.refresh(); - List updatedRecords = Lists.newArrayList( - RowFactory.create(1, "x", null), - RowFactory.create(2, "y", null), - RowFactory.create(3, "z", null), - RowFactory.create(4, "xy", "B"), - RowFactory.create(5, "xyz", "C")); + List updatedRecords = + Lists.newArrayList( + RowFactory.create(1, "x", null), + RowFactory.create(2, "y", null), + 
RowFactory.create(3, "z", null), + RowFactory.create(4, "xy", "B"), + RowFactory.create(5, "xyz", "C")); - Dataset resultDf2 = spark.read() - .format("iceberg") - .load(loadLocation(tableIdentifier)); - Assert.assertEquals("Records should match", updatedRecords, - resultDf2.orderBy("id").collectAsList()); + Dataset resultDf2 = spark.read().format("iceberg").load(loadLocation(tableIdentifier)); + Assert.assertEquals( + "Records should match", updatedRecords, resultDf2.orderBy("id").collectAsList()); table.updateSchema().deleteColumn("data").commit(); - List recordsAfterDropColumn = Lists.newArrayList( - RowFactory.create(1, null), - RowFactory.create(2, null), - RowFactory.create(3, null), - RowFactory.create(4, "B"), - RowFactory.create(5, "C")); - - Dataset resultDf3 = spark.read() - .format("iceberg") - .load(loadLocation(tableIdentifier)); - Assert.assertEquals("Records should match", recordsAfterDropColumn, - resultDf3.orderBy("id").collectAsList()); - - Dataset resultDf4 = spark.read() - .format("iceberg") - .option(SparkReadOptions.SNAPSHOT_ID, snapshotBeforeAddColumn.snapshotId()) - .load(loadLocation(tableIdentifier)); - Assert.assertEquals("Records should match", originalRecords, - resultDf4.orderBy("id").collectAsList()); + List recordsAfterDropColumn = + Lists.newArrayList( + RowFactory.create(1, null), + RowFactory.create(2, null), + RowFactory.create(3, null), + RowFactory.create(4, "B"), + RowFactory.create(5, "C")); + + Dataset resultDf3 = spark.read().format("iceberg").load(loadLocation(tableIdentifier)); + Assert.assertEquals( + "Records should match", recordsAfterDropColumn, resultDf3.orderBy("id").collectAsList()); + + Dataset resultDf4 = + spark + .read() + .format("iceberg") + .option(SparkReadOptions.SNAPSHOT_ID, snapshotBeforeAddColumn.snapshotId()) + .load(loadLocation(tableIdentifier)); + Assert.assertEquals( + "Records should match", originalRecords, resultDf4.orderBy("id").collectAsList()); Assert.assertEquals("Schemas should match", originalSparkSchema, resultDf4.schema()); } @@ -1417,13 +1600,12 @@ public void testRemoveOrphanFilesActionSupport() throws InterruptedException { TableIdentifier tableIdentifier = TableIdentifier.of("db", "table"); Table table = createTable(tableIdentifier, SCHEMA, PartitionSpec.unpartitioned()); - List records = Lists.newArrayList( - new SimpleRecord(1, "1") - ); + List records = Lists.newArrayList(new SimpleRecord(1, "1")); Dataset df = spark.createDataFrame(records, SimpleRecord.class); - df.select("id", "data").write() + df.select("id", "data") + .write() .format("iceberg") .mode("append") .save(loadLocation(tableIdentifier)); @@ -1435,36 +1617,42 @@ public void testRemoveOrphanFilesActionSupport() throws InterruptedException { SparkActions actions = SparkActions.get(); - DeleteOrphanFiles.Result result1 = actions.deleteOrphanFiles(table) - .location(table.location() + "/metadata") - .olderThan(System.currentTimeMillis()) - .execute(); - Assert.assertTrue("Should not delete any metadata files", Iterables.isEmpty(result1.orphanFileLocations())); + DeleteOrphanFiles.Result result1 = + actions + .deleteOrphanFiles(table) + .location(table.location() + "/metadata") + .olderThan(System.currentTimeMillis()) + .execute(); + Assert.assertTrue( + "Should not delete any metadata files", Iterables.isEmpty(result1.orphanFileLocations())); - DeleteOrphanFiles.Result result2 = actions.deleteOrphanFiles(table) - .olderThan(System.currentTimeMillis()) - .execute(); - Assert.assertEquals("Should delete 1 data file", 1, 
Iterables.size(result2.orphanFileLocations())); + DeleteOrphanFiles.Result result2 = + actions.deleteOrphanFiles(table).olderThan(System.currentTimeMillis()).execute(); + Assert.assertEquals( + "Should delete 1 data file", 1, Iterables.size(result2.orphanFileLocations())); Dataset resultDF = spark.read().format("iceberg").load(loadLocation(tableIdentifier)); - List actualRecords = resultDF - .as(Encoders.bean(SimpleRecord.class)) - .collectAsList(); + List actualRecords = + resultDF.as(Encoders.bean(SimpleRecord.class)).collectAsList(); Assert.assertEquals("Rows must match", records, actualRecords); } - @Test public void testFilesTablePartitionId() throws Exception { TableIdentifier tableIdentifier = TableIdentifier.of("db", "files_test"); - Table table = createTable(tableIdentifier, SCHEMA, PartitionSpec.builderFor(SCHEMA).identity("id").build()); + Table table = + createTable( + tableIdentifier, SCHEMA, PartitionSpec.builderFor(SCHEMA).identity("id").build()); int spec0 = table.spec().specId(); - Dataset df1 = spark.createDataFrame(Lists.newArrayList(new SimpleRecord(1, "a")), SimpleRecord.class); - Dataset df2 = spark.createDataFrame(Lists.newArrayList(new SimpleRecord(2, "b")), SimpleRecord.class); + Dataset df1 = + spark.createDataFrame(Lists.newArrayList(new SimpleRecord(1, "a")), SimpleRecord.class); + Dataset df2 = + spark.createDataFrame(Lists.newArrayList(new SimpleRecord(2, "b")), SimpleRecord.class); - df1.select("id", "data").write() + df1.select("id", "data") + .write() .format("iceberg") .mode("append") .save(loadLocation(tableIdentifier)); @@ -1475,17 +1663,17 @@ public void testFilesTablePartitionId() throws Exception { int spec1 = table.spec().specId(); // add a second file - df2.select("id", "data").write() + df2.select("id", "data") + .write() .format("iceberg") .mode("append") .save(loadLocation(tableIdentifier)); - List actual = spark.read() - .format("iceberg") - .load(loadLocation(tableIdentifier, "files")) - .sort(DataFile.SPEC_ID.name()) - .collectAsList() - .stream().map(r -> (Integer) r.getAs(DataFile.SPEC_ID.name())).collect(Collectors.toList()); + List actual = + spark.read().format("iceberg").load(loadLocation(tableIdentifier, "files")) + .sort(DataFile.SPEC_ID.name()).collectAsList().stream() + .map(r -> (Integer) r.getAs(DataFile.SPEC_ID.name())) + .collect(Collectors.toList()); Assert.assertEquals("Should have two partition specs", ImmutableList.of(spec0, spec1), actual); } @@ -1495,22 +1683,26 @@ public void testAllManifestTableSnapshotFiltering() throws Exception { TableIdentifier tableIdentifier = TableIdentifier.of("db", "all_manifest_snapshot_filtering"); Table table = createTable(tableIdentifier, SCHEMA, SPEC); Table manifestTable = loadTable(tableIdentifier, "all_manifests"); - Dataset df = spark.createDataFrame(Lists.newArrayList(new SimpleRecord(1, "a")), SimpleRecord.class); + Dataset df = + spark.createDataFrame(Lists.newArrayList(new SimpleRecord(1, "a")), SimpleRecord.class); List> snapshotIdToManifests = Lists.newArrayList(); - df.select("id", "data").write() + df.select("id", "data") + .write() .format("iceberg") .mode("append") .save(loadLocation(tableIdentifier)); table.refresh(); Snapshot snapshot1 = table.currentSnapshot(); - snapshotIdToManifests.addAll(snapshot1.allManifests().stream() - .map(manifest -> Pair.of(snapshot1.snapshotId(), manifest)) - .collect(Collectors.toList())); + snapshotIdToManifests.addAll( + snapshot1.allManifests().stream() + .map(manifest -> Pair.of(snapshot1.snapshotId(), manifest)) + 
.collect(Collectors.toList())); - df.select("id", "data").write() + df.select("id", "data") + .write() .format("iceberg") .mode("append") .save(loadLocation(tableIdentifier)); @@ -1518,16 +1710,19 @@ public void testAllManifestTableSnapshotFiltering() throws Exception { table.refresh(); Snapshot snapshot2 = table.currentSnapshot(); Assert.assertEquals("Should have two manifests", 2, snapshot2.allManifests().size()); - snapshotIdToManifests.addAll(snapshot2.allManifests().stream() - .map(manifest -> Pair.of(snapshot2.snapshotId(), manifest)) - .collect(Collectors.toList())); + snapshotIdToManifests.addAll( + snapshot2.allManifests().stream() + .map(manifest -> Pair.of(snapshot2.snapshotId(), manifest)) + .collect(Collectors.toList())); // Add manifests that will not be selected - df.select("id", "data").write() + df.select("id", "data") + .write() .format("iceberg") .mode("append") .save(loadLocation(tableIdentifier)); - df.select("id", "data").write() + df.select("id", "data") + .write() .format("iceberg") .mode("append") .save(loadLocation(tableIdentifier)); @@ -1537,30 +1732,41 @@ public void testAllManifestTableSnapshotFiltering() throws Exception { snapshotIds.add(String.valueOf(snapshot2.snapshotId())); snapshotIds.toString(); - List actual = spark.read() - .format("iceberg") - .load(loadLocation(tableIdentifier, "all_manifests")) - .filter("reference_snapshot_id in " + snapshotIds) - .orderBy("path") - .collectAsList(); + List actual = + spark + .read() + .format("iceberg") + .load(loadLocation(tableIdentifier, "all_manifests")) + .filter("reference_snapshot_id in " + snapshotIds) + .orderBy("path") + .collectAsList(); table.refresh(); - List expected = snapshotIdToManifests.stream() - .map(snapshotManifest -> manifestRecord(manifestTable, snapshotManifest.first(), snapshotManifest.second())) - .collect(Collectors.toList()); + List expected = + snapshotIdToManifests.stream() + .map( + snapshotManifest -> + manifestRecord( + manifestTable, snapshotManifest.first(), snapshotManifest.second())) + .collect(Collectors.toList()); expected.sort(Comparator.comparing(o -> o.get("path").toString())); Assert.assertEquals("Manifests table should have 3 manifest rows", 3, actual.size()); for (int i = 0; i < expected.size(); i += 1) { - TestHelpers.assertEqualsSafe(manifestTable.schema().asStruct(), expected.get(i), actual.get(i)); + TestHelpers.assertEqualsSafe( + manifestTable.schema().asStruct(), expected.get(i), actual.get(i)); } } - private GenericData.Record manifestRecord(Table manifestTable, Long referenceSnapshotId, ManifestFile manifest) { - GenericRecordBuilder builder = new GenericRecordBuilder(AvroSchemaUtil.convert( - manifestTable.schema(), "manifests")); - GenericRecordBuilder summaryBuilder = new GenericRecordBuilder(AvroSchemaUtil.convert( - manifestTable.schema().findType("partition_summaries.element").asStructType(), "partition_summary")); + private GenericData.Record manifestRecord( + Table manifestTable, Long referenceSnapshotId, ManifestFile manifest) { + GenericRecordBuilder builder = + new GenericRecordBuilder(AvroSchemaUtil.convert(manifestTable.schema(), "manifests")); + GenericRecordBuilder summaryBuilder = + new GenericRecordBuilder( + AvroSchemaUtil.convert( + manifestTable.schema().findType("partition_summaries.element").asStructType(), + "partition_summary")); return builder .set("content", manifest.content().id()) .set("path", manifest.path()) @@ -1568,19 +1774,32 @@ private GenericData.Record manifestRecord(Table manifestTable, Long referenceSna 
.set("partition_spec_id", manifest.partitionSpecId()) .set("added_snapshot_id", manifest.snapshotId()) .set("added_data_files_count", manifest.content() == DATA ? manifest.addedFilesCount() : 0) - .set("existing_data_files_count", manifest.content() == DATA ? manifest.existingFilesCount() : 0) - .set("deleted_data_files_count", manifest.content() == DATA ? manifest.deletedFilesCount() : 0) - .set("added_delete_files_count", manifest.content() == DELETES ? manifest.addedFilesCount() : 0) - .set("existing_delete_files_count", manifest.content() == DELETES ? manifest.existingFilesCount() : 0) - .set("deleted_delete_files_count", manifest.content() == DELETES ? manifest.deletedFilesCount() : 0) - .set("partition_summaries", Lists.transform(manifest.partitions(), partition -> - summaryBuilder - .set("contains_null", false) - .set("contains_nan", false) - .set("lower_bound", "1") - .set("upper_bound", "1") - .build() - )) + .set( + "existing_data_files_count", + manifest.content() == DATA ? manifest.existingFilesCount() : 0) + .set( + "deleted_data_files_count", + manifest.content() == DATA ? manifest.deletedFilesCount() : 0) + .set( + "added_delete_files_count", + manifest.content() == DELETES ? manifest.addedFilesCount() : 0) + .set( + "existing_delete_files_count", + manifest.content() == DELETES ? manifest.existingFilesCount() : 0) + .set( + "deleted_delete_files_count", + manifest.content() == DELETES ? manifest.deletedFilesCount() : 0) + .set( + "partition_summaries", + Lists.transform( + manifest.partitions(), + partition -> + summaryBuilder + .set("contains_null", false) + .set("contains_nan", false) + .set("lower_bound", "1") + .set("upper_bound", "1") + .build())) .set("reference_snapshot_id", referenceSnapshotId) .build(); } @@ -1590,8 +1809,8 @@ private void asMetadataRecord(GenericData.Record file) { file.put(3, 0); // specId } - private PositionDeleteWriter newPositionDeleteWriter(Table table, PartitionSpec spec, - StructLike partition) { + private PositionDeleteWriter newPositionDeleteWriter( + Table table, PartitionSpec spec, StructLike partition) { OutputFileFactory fileFactory = OutputFileFactory.builderFor(table, 0, 0).build(); EncryptedOutputFile outputFile = fileFactory.newOutputFile(spec, partition); @@ -1599,9 +1818,13 @@ private PositionDeleteWriter newPositionDeleteWriter(Table table, P return fileWriterFactory.newPositionDeleteWriter(outputFile, spec, partition); } - private DeleteFile writePositionDeletes(Table table, PartitionSpec spec, StructLike partition, - Iterable> deletes) { - PositionDeleteWriter positionDeleteWriter = newPositionDeleteWriter(table, spec, partition); + private DeleteFile writePositionDeletes( + Table table, + PartitionSpec spec, + StructLike partition, + Iterable> deletes) { + PositionDeleteWriter positionDeleteWriter = + newPositionDeleteWriter(table, spec, partition); try (PositionDeleteWriter writer = positionDeleteWriter) { for (PositionDelete delete : deletes) { diff --git a/spark/v3.3/spark/src/test/java/org/apache/iceberg/spark/source/TestIcebergSpark.java b/spark/v3.3/spark/src/test/java/org/apache/iceberg/spark/source/TestIcebergSpark.java index 1a99697a09f9..0be1e0b1bd05 100644 --- a/spark/v3.3/spark/src/test/java/org/apache/iceberg/spark/source/TestIcebergSpark.java +++ b/spark/v3.3/spark/src/test/java/org/apache/iceberg/spark/source/TestIcebergSpark.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.spark.source; import java.math.BigDecimal; @@ -61,8 +60,8 @@ public void testRegisterIntegerBucketUDF() { IcebergSpark.registerBucketUDF(spark, "iceberg_bucket_int_16", DataTypes.IntegerType, 16); List results = spark.sql("SELECT iceberg_bucket_int_16(1)").collectAsList(); Assert.assertEquals(1, results.size()); - Assert.assertEquals((int) Transforms.bucket(Types.IntegerType.get(), 16).apply(1), - results.get(0).getInt(0)); + Assert.assertEquals( + (int) Transforms.bucket(Types.IntegerType.get(), 16).apply(1), results.get(0).getInt(0)); } @Test @@ -70,8 +69,8 @@ public void testRegisterShortBucketUDF() { IcebergSpark.registerBucketUDF(spark, "iceberg_bucket_short_16", DataTypes.ShortType, 16); List results = spark.sql("SELECT iceberg_bucket_short_16(1S)").collectAsList(); Assert.assertEquals(1, results.size()); - Assert.assertEquals((int) Transforms.bucket(Types.IntegerType.get(), 16).apply(1), - results.get(0).getInt(0)); + Assert.assertEquals( + (int) Transforms.bucket(Types.IntegerType.get(), 16).apply(1), results.get(0).getInt(0)); } @Test @@ -79,8 +78,8 @@ public void testRegisterByteBucketUDF() { IcebergSpark.registerBucketUDF(spark, "iceberg_bucket_byte_16", DataTypes.ByteType, 16); List results = spark.sql("SELECT iceberg_bucket_byte_16(1Y)").collectAsList(); Assert.assertEquals(1, results.size()); - Assert.assertEquals((int) Transforms.bucket(Types.IntegerType.get(), 16).apply(1), - results.get(0).getInt(0)); + Assert.assertEquals( + (int) Transforms.bucket(Types.IntegerType.get(), 16).apply(1), results.get(0).getInt(0)); } @Test @@ -88,8 +87,8 @@ public void testRegisterLongBucketUDF() { IcebergSpark.registerBucketUDF(spark, "iceberg_bucket_long_16", DataTypes.LongType, 16); List results = spark.sql("SELECT iceberg_bucket_long_16(1L)").collectAsList(); Assert.assertEquals(1, results.size()); - Assert.assertEquals((int) Transforms.bucket(Types.LongType.get(), 16).apply(1L), - results.get(0).getInt(0)); + Assert.assertEquals( + (int) Transforms.bucket(Types.LongType.get(), 16).apply(1L), results.get(0).getInt(0)); } @Test @@ -97,7 +96,8 @@ public void testRegisterStringBucketUDF() { IcebergSpark.registerBucketUDF(spark, "iceberg_bucket_string_16", DataTypes.StringType, 16); List results = spark.sql("SELECT iceberg_bucket_string_16('hello')").collectAsList(); Assert.assertEquals(1, results.size()); - Assert.assertEquals((int) Transforms.bucket(Types.StringType.get(), 16).apply("hello"), + Assert.assertEquals( + (int) Transforms.bucket(Types.StringType.get(), 16).apply("hello"), results.get(0).getInt(0)); } @@ -106,7 +106,8 @@ public void testRegisterCharBucketUDF() { IcebergSpark.registerBucketUDF(spark, "iceberg_bucket_char_16", new CharType(5), 16); List results = spark.sql("SELECT iceberg_bucket_char_16('hello')").collectAsList(); Assert.assertEquals(1, results.size()); - Assert.assertEquals((int) Transforms.bucket(Types.StringType.get(), 16).apply("hello"), + Assert.assertEquals( + (int) Transforms.bucket(Types.StringType.get(), 16).apply("hello"), results.get(0).getInt(0)); } @@ -115,73 +116,89 @@ public void testRegisterVarCharBucketUDF() { IcebergSpark.registerBucketUDF(spark, "iceberg_bucket_varchar_16", new VarcharType(5), 16); List results = spark.sql("SELECT iceberg_bucket_varchar_16('hello')").collectAsList(); Assert.assertEquals(1, results.size()); - Assert.assertEquals((int) Transforms.bucket(Types.StringType.get(), 16).apply("hello"), + Assert.assertEquals( + (int) Transforms.bucket(Types.StringType.get(), 16).apply("hello"), 
results.get(0).getInt(0)); } @Test public void testRegisterDateBucketUDF() { IcebergSpark.registerBucketUDF(spark, "iceberg_bucket_date_16", DataTypes.DateType, 16); - List results = spark.sql("SELECT iceberg_bucket_date_16(DATE '2021-06-30')").collectAsList(); + List results = + spark.sql("SELECT iceberg_bucket_date_16(DATE '2021-06-30')").collectAsList(); Assert.assertEquals(1, results.size()); - Assert.assertEquals((int) Transforms.bucket(Types.DateType.get(), 16) - .apply(DateTimeUtils.fromJavaDate(Date.valueOf("2021-06-30"))), + Assert.assertEquals( + (int) + Transforms.bucket(Types.DateType.get(), 16) + .apply(DateTimeUtils.fromJavaDate(Date.valueOf("2021-06-30"))), results.get(0).getInt(0)); } @Test public void testRegisterTimestampBucketUDF() { - IcebergSpark.registerBucketUDF(spark, "iceberg_bucket_timestamp_16", DataTypes.TimestampType, 16); + IcebergSpark.registerBucketUDF( + spark, "iceberg_bucket_timestamp_16", DataTypes.TimestampType, 16); List results = - spark.sql("SELECT iceberg_bucket_timestamp_16(TIMESTAMP '2021-06-30 00:00:00.000')").collectAsList(); + spark + .sql("SELECT iceberg_bucket_timestamp_16(TIMESTAMP '2021-06-30 00:00:00.000')") + .collectAsList(); Assert.assertEquals(1, results.size()); - Assert.assertEquals((int) Transforms.bucket(Types.TimestampType.withZone(), 16) - .apply(DateTimeUtils.fromJavaTimestamp(Timestamp.valueOf("2021-06-30 00:00:00.000"))), + Assert.assertEquals( + (int) + Transforms.bucket(Types.TimestampType.withZone(), 16) + .apply( + DateTimeUtils.fromJavaTimestamp(Timestamp.valueOf("2021-06-30 00:00:00.000"))), results.get(0).getInt(0)); } @Test public void testRegisterBinaryBucketUDF() { IcebergSpark.registerBucketUDF(spark, "iceberg_bucket_binary_16", DataTypes.BinaryType, 16); - List results = - spark.sql("SELECT iceberg_bucket_binary_16(X'0020001F')").collectAsList(); + List results = spark.sql("SELECT iceberg_bucket_binary_16(X'0020001F')").collectAsList(); Assert.assertEquals(1, results.size()); - Assert.assertEquals((int) Transforms.bucket(Types.BinaryType.get(), 16) - .apply(ByteBuffer.wrap(new byte[]{0x00, 0x20, 0x00, 0x1F})), + Assert.assertEquals( + (int) + Transforms.bucket(Types.BinaryType.get(), 16) + .apply(ByteBuffer.wrap(new byte[] {0x00, 0x20, 0x00, 0x1F})), results.get(0).getInt(0)); } @Test public void testRegisterDecimalBucketUDF() { IcebergSpark.registerBucketUDF(spark, "iceberg_bucket_decimal_16", new DecimalType(4, 2), 16); - List results = - spark.sql("SELECT iceberg_bucket_decimal_16(11.11)").collectAsList(); + List results = spark.sql("SELECT iceberg_bucket_decimal_16(11.11)").collectAsList(); Assert.assertEquals(1, results.size()); - Assert.assertEquals((int) Transforms.bucket(Types.DecimalType.of(4, 2), 16) - .apply(new BigDecimal("11.11")), + Assert.assertEquals( + (int) Transforms.bucket(Types.DecimalType.of(4, 2), 16).apply(new BigDecimal("11.11")), results.get(0).getInt(0)); } @Test public void testRegisterBooleanBucketUDF() { - Assertions.assertThatThrownBy(() -> - IcebergSpark.registerBucketUDF(spark, "iceberg_bucket_boolean_16", DataTypes.BooleanType, 16)) + Assertions.assertThatThrownBy( + () -> + IcebergSpark.registerBucketUDF( + spark, "iceberg_bucket_boolean_16", DataTypes.BooleanType, 16)) .isInstanceOf(IllegalArgumentException.class) .hasMessage("Cannot bucket by type: boolean"); } @Test public void testRegisterDoubleBucketUDF() { - Assertions.assertThatThrownBy(() -> - IcebergSpark.registerBucketUDF(spark, "iceberg_bucket_double_16", DataTypes.DoubleType, 16)) + Assertions.assertThatThrownBy( + () 
-> + IcebergSpark.registerBucketUDF( + spark, "iceberg_bucket_double_16", DataTypes.DoubleType, 16)) .isInstanceOf(IllegalArgumentException.class) .hasMessage("Cannot bucket by type: double"); } @Test public void testRegisterFloatBucketUDF() { - Assertions.assertThatThrownBy(() -> - IcebergSpark.registerBucketUDF(spark, "iceberg_bucket_float_16", DataTypes.FloatType, 16)) + Assertions.assertThatThrownBy( + () -> + IcebergSpark.registerBucketUDF( + spark, "iceberg_bucket_float_16", DataTypes.FloatType, 16)) .isInstanceOf(IllegalArgumentException.class) .hasMessage("Cannot bucket by type: float"); } @@ -191,8 +208,8 @@ public void testRegisterIntegerTruncateUDF() { IcebergSpark.registerTruncateUDF(spark, "iceberg_truncate_int_4", DataTypes.IntegerType, 4); List results = spark.sql("SELECT iceberg_truncate_int_4(1)").collectAsList(); Assert.assertEquals(1, results.size()); - Assert.assertEquals(Transforms.truncate(Types.IntegerType.get(), 4).apply(1), - results.get(0).getInt(0)); + Assert.assertEquals( + Transforms.truncate(Types.IntegerType.get(), 4).apply(1), results.get(0).getInt(0)); } @Test @@ -200,18 +217,18 @@ public void testRegisterLongTruncateUDF() { IcebergSpark.registerTruncateUDF(spark, "iceberg_truncate_long_4", DataTypes.LongType, 4); List results = spark.sql("SELECT iceberg_truncate_long_4(1L)").collectAsList(); Assert.assertEquals(1, results.size()); - Assert.assertEquals(Transforms.truncate(Types.LongType.get(), 4).apply(1L), - results.get(0).getLong(0)); + Assert.assertEquals( + Transforms.truncate(Types.LongType.get(), 4).apply(1L), results.get(0).getLong(0)); } @Test public void testRegisterDecimalTruncateUDF() { IcebergSpark.registerTruncateUDF(spark, "iceberg_truncate_decimal_4", new DecimalType(4, 2), 4); - List results = - spark.sql("SELECT iceberg_truncate_decimal_4(11.11)").collectAsList(); + List results = spark.sql("SELECT iceberg_truncate_decimal_4(11.11)").collectAsList(); Assert.assertEquals(1, results.size()); - Assert.assertEquals(Transforms.truncate(Types.DecimalType.of(4, 2), 4) - .apply(new BigDecimal("11.11")), results.get(0).getDecimal(0)); + Assert.assertEquals( + Transforms.truncate(Types.DecimalType.of(4, 2), 4).apply(new BigDecimal("11.11")), + results.get(0).getDecimal(0)); } @Test @@ -219,7 +236,7 @@ public void testRegisterStringTruncateUDF() { IcebergSpark.registerTruncateUDF(spark, "iceberg_truncate_string_4", DataTypes.StringType, 4); List results = spark.sql("SELECT iceberg_truncate_string_4('hello')").collectAsList(); Assert.assertEquals(1, results.size()); - Assert.assertEquals(Transforms.truncate(Types.StringType.get(), 4).apply("hello"), - results.get(0).getString(0)); + Assert.assertEquals( + Transforms.truncate(Types.StringType.get(), 4).apply("hello"), results.get(0).getString(0)); } } diff --git a/spark/v3.3/spark/src/test/java/org/apache/iceberg/spark/source/TestIdentityPartitionData.java b/spark/v3.3/spark/src/test/java/org/apache/iceberg/spark/source/TestIdentityPartitionData.java index e07798301db8..7313c18cc09d 100644 --- a/spark/v3.3/spark/src/test/java/org/apache/iceberg/spark/source/TestIdentityPartitionData.java +++ b/spark/v3.3/spark/src/test/java/org/apache/iceberg/spark/source/TestIdentityPartitionData.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
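The TestIcebergSpark hunks above register Iceberg's bucket and truncate transforms as Spark SQL functions. A minimal sketch of that registration, assuming only a local SparkSession; the UDF name is arbitrary and not taken from this patch:

    import org.apache.iceberg.spark.IcebergSpark;
    import org.apache.spark.sql.SparkSession;
    import org.apache.spark.sql.types.DataTypes;

    public class BucketUdfSketch {
      public static void main(String[] args) {
        SparkSession spark = SparkSession.builder().master("local[2]").getOrCreate();

        // Expose a 16-bucket transform over integers under an arbitrary SQL name.
        IcebergSpark.registerBucketUDF(spark, "iceberg_bucket_int_16", DataTypes.IntegerType, 16);

        // The UDF can then be called from SQL to inspect bucket assignments, which is
        // what the assertions above compare against Transforms.bucket(...).apply(...).
        spark.sql("SELECT iceberg_bucket_int_16(1) AS bucket").show();
      }
    }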
*/ - package org.apache.iceberg.spark.source; import java.io.File; @@ -55,11 +54,11 @@ public class TestIdentityPartitionData extends SparkTestBase { @Parameterized.Parameters(name = "format = {0}, vectorized = {1}") public static Object[][] parameters() { return new Object[][] { - { "parquet", false }, - { "parquet", true }, - { "avro", false }, - { "orc", false }, - { "orc", true }, + {"parquet", false}, + {"parquet", true}, + {"avro", false}, + {"orc", false}, + {"orc", true}, }; } @@ -71,36 +70,37 @@ public TestIdentityPartitionData(String format, boolean vectorized) { this.vectorized = vectorized; } - private static final Schema LOG_SCHEMA = new Schema( - Types.NestedField.optional(1, "id", Types.IntegerType.get()), - Types.NestedField.optional(2, "date", Types.StringType.get()), - Types.NestedField.optional(3, "level", Types.StringType.get()), - Types.NestedField.optional(4, "message", Types.StringType.get()) - ); - - private static final List LOGS = ImmutableList.of( - LogMessage.debug("2020-02-02", "debug event 1"), - LogMessage.info("2020-02-02", "info event 1"), - LogMessage.debug("2020-02-02", "debug event 2"), - LogMessage.info("2020-02-03", "info event 2"), - LogMessage.debug("2020-02-03", "debug event 3"), - LogMessage.info("2020-02-03", "info event 3"), - LogMessage.error("2020-02-03", "error event 1"), - LogMessage.debug("2020-02-04", "debug event 4"), - LogMessage.warn("2020-02-04", "warn event 1"), - LogMessage.debug("2020-02-04", "debug event 5") - ); - - @Rule - public TemporaryFolder temp = new TemporaryFolder(); - - private PartitionSpec spec = PartitionSpec.builderFor(LOG_SCHEMA).identity("date").identity("level").build(); + private static final Schema LOG_SCHEMA = + new Schema( + Types.NestedField.optional(1, "id", Types.IntegerType.get()), + Types.NestedField.optional(2, "date", Types.StringType.get()), + Types.NestedField.optional(3, "level", Types.StringType.get()), + Types.NestedField.optional(4, "message", Types.StringType.get())); + + private static final List LOGS = + ImmutableList.of( + LogMessage.debug("2020-02-02", "debug event 1"), + LogMessage.info("2020-02-02", "info event 1"), + LogMessage.debug("2020-02-02", "debug event 2"), + LogMessage.info("2020-02-03", "info event 2"), + LogMessage.debug("2020-02-03", "debug event 3"), + LogMessage.info("2020-02-03", "info event 3"), + LogMessage.error("2020-02-03", "error event 1"), + LogMessage.debug("2020-02-04", "debug event 4"), + LogMessage.warn("2020-02-04", "warn event 1"), + LogMessage.debug("2020-02-04", "debug event 5")); + + @Rule public TemporaryFolder temp = new TemporaryFolder(); + + private PartitionSpec spec = + PartitionSpec.builderFor(LOG_SCHEMA).identity("date").identity("level").build(); private Table table = null; private Dataset logs = null; /** - * Use the Hive Based table to make Identity Partition Columns with no duplication of the data in the underlying - * parquet files. This makes sure that if the identity mapping fails, the test will also fail. + * Use the Hive Based table to make Identity Partition Columns with no duplication of the data in + * the underlying parquet files. This makes sure that if the identity mapping fails, the test will + * also fail. 
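A compressed sketch of the import pattern that setupParquet() applies just below: create the Iceberg table from the Hive table's inferred schema and partition spec, then register its existing files with SparkTableUtil.importSparkTable. The table name, properties, and locations here are placeholders, not values from this patch:

    import java.util.Collections;
    import org.apache.iceberg.Table;
    import org.apache.iceberg.hadoop.HadoopTables;
    import org.apache.iceberg.spark.SparkSchemaUtil;
    import org.apache.iceberg.spark.SparkTableUtil;
    import org.apache.spark.sql.SparkSession;
    import org.apache.spark.sql.catalyst.TableIdentifier;

    public class ImportHiveTableSketch {
      static Table importLogs(SparkSession spark) throws Exception {
        String hiveTable = "logs_hive";         // placeholder Hive table name
        String location = "/tmp/iceberg/logs";  // placeholder Iceberg table location

        Table table =
            new HadoopTables(spark.sparkContext().hadoopConfiguration())
                .create(
                    SparkSchemaUtil.schemaForTable(spark, hiveTable),
                    SparkSchemaUtil.specForTable(spark, hiveTable),
                    Collections.emptyMap(),
                    location);

        // Snapshot the Hive table's existing data files into the new Iceberg table.
        SparkTableUtil.importSparkTable(spark, new TableIdentifier(hiveTable), table, location);
        return table;
      }
    }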
*/ private void setupParquet() throws Exception { File location = temp.newFolder("logs"); @@ -109,15 +109,25 @@ private void setupParquet() throws Exception { Assert.assertTrue("Temp folder should exist", location.exists()); Map properties = ImmutableMap.of(TableProperties.DEFAULT_FILE_FORMAT, format); - this.logs = spark.createDataFrame(LOGS, LogMessage.class).select("id", "date", "level", "message"); + this.logs = + spark.createDataFrame(LOGS, LogMessage.class).select("id", "date", "level", "message"); spark.sql(String.format("DROP TABLE IF EXISTS %s", hiveTable)); - logs.orderBy("date", "level", "id").write().partitionBy("date", "level").format("parquet") - .option("path", hiveLocation.toString()).saveAsTable(hiveTable); - - this.table = TABLES.create(SparkSchemaUtil.schemaForTable(spark, hiveTable), - SparkSchemaUtil.specForTable(spark, hiveTable), properties, location.toString()); - - SparkTableUtil.importSparkTable(spark, new TableIdentifier(hiveTable), table, location.toString()); + logs.orderBy("date", "level", "id") + .write() + .partitionBy("date", "level") + .format("parquet") + .option("path", hiveLocation.toString()) + .saveAsTable(hiveTable); + + this.table = + TABLES.create( + SparkSchemaUtil.schemaForTable(spark, hiveTable), + SparkSchemaUtil.specForTable(spark, hiveTable), + properties, + location.toString()); + + SparkTableUtil.importSparkTable( + spark, new TableIdentifier(hiveTable), table, location.toString()); } @Before @@ -130,56 +140,70 @@ public void setupTable() throws Exception { Map properties = ImmutableMap.of(TableProperties.DEFAULT_FILE_FORMAT, format); this.table = TABLES.create(LOG_SCHEMA, spec, properties, location.toString()); - this.logs = spark.createDataFrame(LOGS, LogMessage.class).select("id", "date", "level", "message"); + this.logs = + spark.createDataFrame(LOGS, LogMessage.class).select("id", "date", "level", "message"); - logs.orderBy("date", "level", "id").write().format("iceberg").mode("append").save(location.toString()); + logs.orderBy("date", "level", "id") + .write() + .format("iceberg") + .mode("append") + .save(location.toString()); } } @Test public void testFullProjection() { List expected = logs.orderBy("id").collectAsList(); - List actual = spark.read().format("iceberg") - .option(SparkReadOptions.VECTORIZATION_ENABLED, String.valueOf(vectorized)) - .load(table.location()).orderBy("id") - .select("id", "date", "level", "message") - .collectAsList(); + List actual = + spark + .read() + .format("iceberg") + .option(SparkReadOptions.VECTORIZATION_ENABLED, String.valueOf(vectorized)) + .load(table.location()) + .orderBy("id") + .select("id", "date", "level", "message") + .collectAsList(); Assert.assertEquals("Rows should match", expected, actual); } @Test public void testProjections() { - String[][] cases = new String[][] { - // individual fields - new String[] { "date" }, - new String[] { "level" }, - new String[] { "message" }, - // field pairs - new String[] { "date", "message" }, - new String[] { "level", "message" }, - new String[] { "date", "level" }, - // out-of-order pairs - new String[] { "message", "date" }, - new String[] { "message", "level" }, - new String[] { "level", "date" }, - // full projection, different orderings - new String[] { "date", "level", "message" }, - new String[] { "level", "date", "message" }, - new String[] { "date", "message", "level" }, - new String[] { "level", "message", "date" }, - new String[] { "message", "date", "level" }, - new String[] { "message", "level", "date" } - }; + String[][] cases = + new 
String[][] { + // individual fields + new String[] {"date"}, + new String[] {"level"}, + new String[] {"message"}, + // field pairs + new String[] {"date", "message"}, + new String[] {"level", "message"}, + new String[] {"date", "level"}, + // out-of-order pairs + new String[] {"message", "date"}, + new String[] {"message", "level"}, + new String[] {"level", "date"}, + // full projection, different orderings + new String[] {"date", "level", "message"}, + new String[] {"level", "date", "message"}, + new String[] {"date", "message", "level"}, + new String[] {"level", "message", "date"}, + new String[] {"message", "date", "level"}, + new String[] {"message", "level", "date"} + }; for (String[] ordering : cases) { List expected = logs.select("id", ordering).orderBy("id").collectAsList(); - List actual = spark.read() - .format("iceberg") - .option(SparkReadOptions.VECTORIZATION_ENABLED, String.valueOf(vectorized)) - .load(table.location()) - .select("id", ordering).orderBy("id") - .collectAsList(); - Assert.assertEquals("Rows should match for ordering: " + Arrays.toString(ordering), expected, actual); + List actual = + spark + .read() + .format("iceberg") + .option(SparkReadOptions.VECTORIZATION_ENABLED, String.valueOf(vectorized)) + .load(table.location()) + .select("id", ordering) + .orderBy("id") + .collectAsList(); + Assert.assertEquals( + "Rows should match for ordering: " + Arrays.toString(ordering), expected, actual); } } } diff --git a/spark/v3.3/spark/src/test/java/org/apache/iceberg/spark/source/TestInternalRowWrapper.java b/spark/v3.3/spark/src/test/java/org/apache/iceberg/spark/source/TestInternalRowWrapper.java index 4ab01044046f..9e75145faff9 100644 --- a/spark/v3.3/spark/src/test/java/org/apache/iceberg/spark/source/TestInternalRowWrapper.java +++ b/spark/v3.3/spark/src/test/java/org/apache/iceberg/spark/source/TestInternalRowWrapper.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.source; import java.util.Iterator; @@ -68,8 +67,10 @@ protected void generateAndValidate(Schema schema, AssertMethod assertMethod) { StructLike recordStructLike = recordWrapper.wrap(actual.next()); StructLike rowStructLike = rowWrapper.wrap(expected.next()); - assertMethod.assertEquals("Should have expected StructLike values", - actualWrapper.set(recordStructLike), expectedWrapper.set(rowStructLike)); + assertMethod.assertEquals( + "Should have expected StructLike values", + actualWrapper.set(recordStructLike), + expectedWrapper.set(rowStructLike)); } Assert.assertFalse("Shouldn't have more record", actual.hasNext()); diff --git a/spark/v3.3/spark/src/test/java/org/apache/iceberg/spark/source/TestMetadataTablesWithPartitionEvolution.java b/spark/v3.3/spark/src/test/java/org/apache/iceberg/spark/source/TestMetadataTablesWithPartitionEvolution.java index 691e9f6f5481..82c9a58e33ea 100644 --- a/spark/v3.3/spark/src/test/java/org/apache/iceberg/spark/source/TestMetadataTablesWithPartitionEvolution.java +++ b/spark/v3.3/spark/src/test/java/org/apache/iceberg/spark/source/TestMetadataTablesWithPartitionEvolution.java @@ -16,9 +16,19 @@ * specific language governing permissions and limitations * under the License. 
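The projection tests above toggle Iceberg's vectorized reader per query and project column subsets in different orders. A minimal sketch of such a read, assuming a local SparkSession; the table location and column names are placeholders:

    import org.apache.iceberg.spark.SparkReadOptions;
    import org.apache.spark.sql.Dataset;
    import org.apache.spark.sql.Row;
    import org.apache.spark.sql.SparkSession;

    public class VectorizedProjectionSketch {
      public static void main(String[] args) {
        SparkSession spark = SparkSession.builder().master("local[2]").getOrCreate();

        Dataset<Row> rows =
            spark
                .read()
                .format("iceberg")
                .option(SparkReadOptions.VECTORIZATION_ENABLED, "true")
                .load("/tmp/warehouse/db/logs") // placeholder table location
                .select("id", "date", "level")  // read only a subset of columns
                .orderBy("id");

        rows.show();
      }
    }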
*/ - package org.apache.iceberg.spark.source; +import static org.apache.iceberg.FileFormat.AVRO; +import static org.apache.iceberg.FileFormat.ORC; +import static org.apache.iceberg.FileFormat.PARQUET; +import static org.apache.iceberg.MetadataTableType.ALL_DATA_FILES; +import static org.apache.iceberg.MetadataTableType.ALL_ENTRIES; +import static org.apache.iceberg.MetadataTableType.ENTRIES; +import static org.apache.iceberg.MetadataTableType.FILES; +import static org.apache.iceberg.MetadataTableType.PARTITIONS; +import static org.apache.iceberg.TableProperties.DEFAULT_FILE_FORMAT; +import static org.apache.iceberg.TableProperties.FORMAT_VERSION; + import java.util.Arrays; import java.util.List; import java.util.Map; @@ -51,83 +61,72 @@ import org.junit.runners.Parameterized; import org.junit.runners.Parameterized.Parameters; -import static org.apache.iceberg.FileFormat.AVRO; -import static org.apache.iceberg.FileFormat.ORC; -import static org.apache.iceberg.FileFormat.PARQUET; -import static org.apache.iceberg.MetadataTableType.ALL_DATA_FILES; -import static org.apache.iceberg.MetadataTableType.ALL_ENTRIES; -import static org.apache.iceberg.MetadataTableType.ENTRIES; -import static org.apache.iceberg.MetadataTableType.FILES; -import static org.apache.iceberg.MetadataTableType.PARTITIONS; -import static org.apache.iceberg.TableProperties.DEFAULT_FILE_FORMAT; -import static org.apache.iceberg.TableProperties.FORMAT_VERSION; - @RunWith(Parameterized.class) public class TestMetadataTablesWithPartitionEvolution extends SparkCatalogTestBase { @Parameters(name = "catalog = {0}, impl = {1}, conf = {2}, fileFormat = {3}, formatVersion = {4}") public static Object[][] parameters() { return new Object[][] { - { "testhive", SparkCatalog.class.getName(), - ImmutableMap.of( - "type", "hive", - "default-namespace", "default" - ), - ORC, - 1 - }, - { "testhive", SparkCatalog.class.getName(), - ImmutableMap.of( - "type", "hive", - "default-namespace", "default" + { + "testhive", + SparkCatalog.class.getName(), + ImmutableMap.of( + "type", "hive", + "default-namespace", "default"), + ORC, + 1 + }, + { + "testhive", + SparkCatalog.class.getName(), + ImmutableMap.of( + "type", "hive", + "default-namespace", "default"), + ORC, + 2 + }, + {"testhadoop", SparkCatalog.class.getName(), ImmutableMap.of("type", "hadoop"), PARQUET, 1}, + {"testhadoop", SparkCatalog.class.getName(), ImmutableMap.of("type", "hadoop"), PARQUET, 2}, + { + "spark_catalog", + SparkSessionCatalog.class.getName(), + ImmutableMap.of( + "type", "hive", + "default-namespace", "default", + "clients", "1", + "parquet-enabled", "false", + "cache-enabled", + "false" // Spark will delete tables using v1, leaving the cache out of sync ), - ORC, - 2 - }, - { "testhadoop", SparkCatalog.class.getName(), - ImmutableMap.of( - "type", "hadoop" + AVRO, + 1 + }, + { + "spark_catalog", + SparkSessionCatalog.class.getName(), + ImmutableMap.of( + "type", "hive", + "default-namespace", "default", + "clients", "1", + "parquet-enabled", "false", + "cache-enabled", + "false" // Spark will delete tables using v1, leaving the cache out of sync ), - PARQUET, - 1 - }, - { "testhadoop", SparkCatalog.class.getName(), - ImmutableMap.of( - "type", "hadoop" - ), - PARQUET, - 2 - }, - { "spark_catalog", SparkSessionCatalog.class.getName(), - ImmutableMap.of( - "type", "hive", - "default-namespace", "default", - "clients", "1", - "parquet-enabled", "false", - "cache-enabled", "false" // Spark will delete tables using v1, leaving the cache out of sync - ), - AVRO, - 1 - }, - 
{ "spark_catalog", SparkSessionCatalog.class.getName(), - ImmutableMap.of( - "type", "hive", - "default-namespace", "default", - "clients", "1", - "parquet-enabled", "false", - "cache-enabled", "false" // Spark will delete tables using v1, leaving the cache out of sync - ), - AVRO, - 2 - } + AVRO, + 2 + } }; } private final FileFormat fileFormat; private final int formatVersion; - public TestMetadataTablesWithPartitionEvolution(String catalogName, String implementation, Map config, - FileFormat fileFormat, int formatVersion) { + public TestMetadataTablesWithPartitionEvolution( + String catalogName, + String implementation, + Map config, + FileFormat fileFormat, + int formatVersion) { super(catalogName, implementation, config); this.fileFormat = fileFormat; this.formatVersion = formatVersion; @@ -140,7 +139,9 @@ public void removeTable() { @Test public void testFilesMetadataTable() throws ParseException { - sql("CREATE TABLE %s (id bigint NOT NULL, category string, data string) USING iceberg", tableName); + sql( + "CREATE TABLE %s (id bigint NOT NULL, category string, data string) USING iceberg", + tableName); initTable(); sql("INSERT INTO TABLE %s VALUES (1, 'a1', 'b1')", tableName); @@ -148,28 +149,23 @@ public void testFilesMetadataTable() throws ParseException { // verify the metadata tables while the current spec is still unpartitioned for (MetadataTableType tableType : Arrays.asList(FILES, ALL_DATA_FILES)) { Dataset df = loadMetadataTable(tableType); - Assert.assertTrue("Partition must be skipped", df.schema().getFieldIndex("partition").isEmpty()); + Assert.assertTrue( + "Partition must be skipped", df.schema().getFieldIndex("partition").isEmpty()); } Table table = validationCatalog.loadTable(tableIdent); - table.updateSpec() - .addField("data") - .commit(); + table.updateSpec().addField("data").commit(); sql("REFRESH TABLE %s", tableName); sql("INSERT INTO TABLE %s VALUES (1, 'a1', 'b1')", tableName); // verify the metadata tables after adding the first partition column for (MetadataTableType tableType : Arrays.asList(FILES, ALL_DATA_FILES)) { assertPartitions( - ImmutableList.of(row(new Object[]{null}), row("b1")), - "STRUCT", - tableType); + ImmutableList.of(row(new Object[] {null}), row("b1")), "STRUCT", tableType); } - table.updateSpec() - .addField(Expressions.bucket("category", 8)) - .commit(); + table.updateSpec().addField(Expressions.bucket("category", 8)).commit(); sql("REFRESH TABLE %s", tableName); sql("INSERT INTO TABLE %s VALUES (1, 'a1', 'b1')", tableName); @@ -181,9 +177,7 @@ public void testFilesMetadataTable() throws ParseException { tableType); } - table.updateSpec() - .removeField("data") - .commit(); + table.updateSpec().removeField("data").commit(); sql("REFRESH TABLE %s", tableName); sql("INSERT INTO TABLE %s VALUES (1, 'a1', 'b1')", tableName); @@ -195,9 +189,7 @@ public void testFilesMetadataTable() throws ParseException { tableType); } - table.updateSpec() - .renameField("category_bucket_8", "category_bucket_8_another_name") - .commit(); + table.updateSpec().renameField("category_bucket_8", "category_bucket_8_another_name").commit(); sql("REFRESH TABLE %s", tableName); // verify the metadata tables after renaming the second partition column @@ -211,8 +203,10 @@ public void testFilesMetadataTable() throws ParseException { @Test public void testFilesMetadataTableFilter() throws ParseException { - sql("CREATE TABLE %s (id bigint NOT NULL, category string, data string) USING iceberg " + - "TBLPROPERTIES ('commit.manifest-merge.enabled' 'false')", tableName); + 
sql( + "CREATE TABLE %s (id bigint NOT NULL, category string, data string) USING iceberg " + + "TBLPROPERTIES ('commit.manifest-merge.enabled' 'false')", + tableName); initTable(); sql("INSERT INTO TABLE %s VALUES (1, 'c1', 'd1')", tableName); @@ -221,14 +215,13 @@ public void testFilesMetadataTableFilter() throws ParseException { // verify the metadata tables while the current spec is still unpartitioned for (MetadataTableType tableType : Arrays.asList(FILES, ALL_DATA_FILES)) { Dataset df = loadMetadataTable(tableType); - Assert.assertTrue("Partition must be skipped", df.schema().getFieldIndex("partition").isEmpty()); + Assert.assertTrue( + "Partition must be skipped", df.schema().getFieldIndex("partition").isEmpty()); } Table table = validationCatalog.loadTable(tableIdent); - table.updateSpec() - .addField("data") - .commit(); + table.updateSpec().addField("data").commit(); sql("REFRESH TABLE %s", tableName); sql("INSERT INTO TABLE %s VALUES (1, 'c1', 'd1')", tableName); sql("INSERT INTO TABLE %s VALUES (2, 'c2', 'd2')", tableName); @@ -236,22 +229,18 @@ public void testFilesMetadataTableFilter() throws ParseException { // verify the metadata tables after adding the first partition column for (MetadataTableType tableType : Arrays.asList(FILES, ALL_DATA_FILES)) { assertPartitions( - ImmutableList.of(row("d2")), - "STRUCT", - tableType, - "partition.data = 'd2'"); + ImmutableList.of(row("d2")), "STRUCT", tableType, "partition.data = 'd2'"); } - table.updateSpec() - .addField("category") - .commit(); + table.updateSpec().addField("category").commit(); sql("REFRESH TABLE %s", tableName); sql("INSERT INTO TABLE %s VALUES (1, 'c1', 'd1')", tableName); sql("INSERT INTO TABLE %s VALUES (2, 'c2', 'd2')", tableName); // verify the metadata tables after adding the second partition column for (MetadataTableType tableType : Arrays.asList(FILES, ALL_DATA_FILES)) { - assertPartitions(ImmutableList.of(row("d2", null), row("d2", "c2")), + assertPartitions( + ImmutableList.of(row("d2", null), row("d2", "c2")), "STRUCT", tableType, "partition.data = 'd2'"); @@ -264,9 +253,7 @@ public void testFilesMetadataTableFilter() throws ParseException { "partition.category = 'c2'"); } - table.updateSpec() - .removeField("data") - .commit(); + table.updateSpec().removeField("data").commit(); sql("REFRESH TABLE %s", tableName); // Verify new partitions do not show up for removed 'partition.data=d2' query @@ -276,7 +263,8 @@ public void testFilesMetadataTableFilter() throws ParseException { // Verify new partitions do show up for 'partition.category=c2' query sql("INSERT INTO TABLE %s VALUES (5, 'c2', 'd5')", tableName); - // no new partition should show up for 'data' partition query as partition field has been removed + // no new partition should show up for 'data' partition query as partition field has been + // removed for (MetadataTableType tableType : Arrays.asList(FILES, ALL_DATA_FILES)) { assertPartitions( ImmutableList.of(row("d2", null), row("d2", "c2")), @@ -293,9 +281,7 @@ public void testFilesMetadataTableFilter() throws ParseException { "partition.category = 'c2'"); } - table.updateSpec() - .renameField("category", "category_another_name") - .commit(); + table.updateSpec().renameField("category", "category_another_name").commit(); sql("REFRESH TABLE %s", tableName); // Verify new partitions do show up for 'category=c2' query @@ -311,7 +297,9 @@ public void testFilesMetadataTableFilter() throws ParseException { @Test public void testEntriesMetadataTable() throws ParseException { - sql("CREATE TABLE %s (id 
bigint NOT NULL, category string, data string) USING iceberg", tableName); + sql( + "CREATE TABLE %s (id bigint NOT NULL, category string, data string) USING iceberg", + tableName); initTable(); sql("INSERT INTO TABLE %s VALUES (1, 'a1', 'b1')", tableName); @@ -325,23 +313,17 @@ public void testEntriesMetadataTable() throws ParseException { Table table = validationCatalog.loadTable(tableIdent); - table.updateSpec() - .addField("data") - .commit(); + table.updateSpec().addField("data").commit(); sql("REFRESH TABLE %s", tableName); sql("INSERT INTO TABLE %s VALUES (1, 'a1', 'b1')", tableName); // verify the metadata tables after adding the first partition column for (MetadataTableType tableType : Arrays.asList(ENTRIES, ALL_ENTRIES)) { assertPartitions( - ImmutableList.of(row(new Object[]{null}), row("b1")), - "STRUCT", - tableType); + ImmutableList.of(row(new Object[] {null}), row("b1")), "STRUCT", tableType); } - table.updateSpec() - .addField(Expressions.bucket("category", 8)) - .commit(); + table.updateSpec().addField(Expressions.bucket("category", 8)).commit(); sql("REFRESH TABLE %s", tableName); sql("INSERT INTO TABLE %s VALUES (1, 'a1', 'b1')", tableName); @@ -353,9 +335,7 @@ public void testEntriesMetadataTable() throws ParseException { tableType); } - table.updateSpec() - .removeField("data") - .commit(); + table.updateSpec().removeField("data").commit(); sql("REFRESH TABLE %s", tableName); sql("INSERT INTO TABLE %s VALUES (1, 'a1', 'b1')", tableName); @@ -367,9 +347,7 @@ public void testEntriesMetadataTable() throws ParseException { tableType); } - table.updateSpec() - .renameField("category_bucket_8", "category_bucket_8_another_name") - .commit(); + table.updateSpec().renameField("category_bucket_8", "category_bucket_8_another_name").commit(); sql("REFRESH TABLE %s", tableName); // verify the metadata tables after renaming the second partition column @@ -380,53 +358,48 @@ public void testEntriesMetadataTable() throws ParseException { tableType); } } + @Test public void testPartitionsTableAddRemoveFields() throws ParseException { - sql("CREATE TABLE %s (id bigint NOT NULL, category string, data string) USING iceberg ", tableName); + sql( + "CREATE TABLE %s (id bigint NOT NULL, category string, data string) USING iceberg ", + tableName); initTable(); sql("INSERT INTO TABLE %s VALUES (1, 'c1', 'd1')", tableName); sql("INSERT INTO TABLE %s VALUES (2, 'c2', 'd2')", tableName); // verify the metadata tables while the current spec is still unpartitioned Dataset df = loadMetadataTable(PARTITIONS); - Assert.assertTrue("Partition must be skipped", df.schema().getFieldIndex("partition").isEmpty()); + Assert.assertTrue( + "Partition must be skipped", df.schema().getFieldIndex("partition").isEmpty()); Table table = validationCatalog.loadTable(tableIdent); - table.updateSpec() - .addField("data") - .commit(); + table.updateSpec().addField("data").commit(); sql("REFRESH TABLE %s", tableName); sql("INSERT INTO TABLE %s VALUES (1, 'c1', 'd1')", tableName); sql("INSERT INTO TABLE %s VALUES (2, 'c2', 'd2')", tableName); // verify the metadata tables after adding the first partition column assertPartitions( - ImmutableList.of(row(new Object[]{null}), row("d1"), row("d2")), + ImmutableList.of(row(new Object[] {null}), row("d1"), row("d2")), "STRUCT", PARTITIONS); - table.updateSpec() - .addField("category") - .commit(); + table.updateSpec().addField("category").commit(); sql("REFRESH TABLE %s", tableName); sql("INSERT INTO TABLE %s VALUES (1, 'c1', 'd1')", tableName); sql("INSERT INTO TABLE %s VALUES 
(2, 'c2', 'd2')", tableName); // verify the metadata tables after adding the second partition column - assertPartitions(ImmutableList.of( - row(null, null), - row("d1", null), - row("d1", "c1"), - row("d2", null), - row("d2", "c2")), + assertPartitions( + ImmutableList.of( + row(null, null), row("d1", null), row("d1", "c1"), row("d2", null), row("d2", "c2")), "STRUCT", PARTITIONS); // verify the metadata tables after removing the first partition column - table.updateSpec() - .removeField("data") - .commit(); + table.updateSpec().removeField("data").commit(); sql("REFRESH TABLE %s", tableName); sql("INSERT INTO TABLE %s VALUES (1, 'c1', 'd1')", tableName); sql("INSERT INTO TABLE %s VALUES (2, 'c2', 'd2')", tableName); @@ -446,82 +419,66 @@ public void testPartitionsTableAddRemoveFields() throws ParseException { @Test public void testPartitionsTableRenameFields() throws ParseException { - sql("CREATE TABLE %s (id bigint NOT NULL, category string, data string) USING iceberg", tableName); + sql( + "CREATE TABLE %s (id bigint NOT NULL, category string, data string) USING iceberg", + tableName); initTable(); Table table = validationCatalog.loadTable(tableIdent); - table.updateSpec() - .addField("data") - .addField("category") - .commit(); + table.updateSpec().addField("data").addField("category").commit(); sql("REFRESH TABLE %s", tableName); sql("INSERT INTO TABLE %s VALUES (1, 'c1', 'd1')", tableName); sql("INSERT INTO TABLE %s VALUES (2, 'c2', 'd2')", tableName); - assertPartitions(ImmutableList.of( - row("d1", "c1"), - row("d2", "c2")), + assertPartitions( + ImmutableList.of(row("d1", "c1"), row("d2", "c2")), "STRUCT", PARTITIONS); - table.updateSpec() - .renameField("category", "category_another_name") - .commit(); + table.updateSpec().renameField("category", "category_another_name").commit(); sql("REFRESH TABLE %s", tableName); sql("INSERT INTO TABLE %s VALUES (1, 'c1', 'd1')", tableName); sql("INSERT INTO TABLE %s VALUES (2, 'c2', 'd2')", tableName); assertPartitions( - ImmutableList.of( - row("d1", "c1"), - row("d2", "c2")), + ImmutableList.of(row("d1", "c1"), row("d2", "c2")), "STRUCT", PARTITIONS); } @Test public void testPartitionsTableSwitchFields() throws Exception { - sql("CREATE TABLE %s (id bigint NOT NULL, category string, data string) USING iceberg", tableName); + sql( + "CREATE TABLE %s (id bigint NOT NULL, category string, data string) USING iceberg", + tableName); initTable(); Table table = validationCatalog.loadTable(tableIdent); // verify the metadata tables after re-adding the first dropped column in the second location - table.updateSpec() - .addField("data") - .addField("category") - .commit(); + table.updateSpec().addField("data").addField("category").commit(); sql("REFRESH TABLE %s", tableName); sql("INSERT INTO TABLE %s VALUES (1, 'c1', 'd1')", tableName); sql("INSERT INTO TABLE %s VALUES (2, 'c2', 'd2')", tableName); - assertPartitions(ImmutableList.of( - row("d1", "c1"), - row("d2", "c2")), + assertPartitions( + ImmutableList.of(row("d1", "c1"), row("d2", "c2")), "STRUCT", PARTITIONS); - table.updateSpec() - .removeField("data") - .commit(); + table.updateSpec().removeField("data").commit(); sql("REFRESH TABLE %s", tableName); sql("INSERT INTO TABLE %s VALUES (1, 'c1', 'd1')", tableName); sql("INSERT INTO TABLE %s VALUES (2, 'c2', 'd2')", tableName); assertPartitions( - ImmutableList.of( - row(null, "c1"), - row(null, "c2"), - row("d1", "c1"), - row("d2", "c2")), + ImmutableList.of(row(null, "c1"), row(null, "c2"), row("d1", "c1"), row("d2", "c2")), "STRUCT", 
PARTITIONS); - table.updateSpec() - .addField("data") - .commit(); + table.updateSpec().addField("data").commit(); sql("REFRESH TABLE %s", tableName); sql("INSERT INTO TABLE %s VALUES (1, 'c1', 'd1')", tableName); @@ -541,16 +498,14 @@ public void testPartitionsTableSwitchFields() throws Exception { "STRUCT", PARTITIONS); } else { - // In V2 re-adding a former partition field that was part of an older spec will not change its name or its - // field ID either, thus values will be collapsed into a single common column (as opposed to V1 where any new + // In V2 re-adding a former partition field that was part of an older spec will not change its + // name or its + // field ID either, thus values will be collapsed into a single common column (as opposed to + // V1 where any new // partition field addition will result in a new column in this metadata table) assertPartitions( ImmutableList.of( - row(null, "c1"), - row(null, "c2"), - row("d1", "c1"), - row("d2", "c2"), - row("d3", "c3")), + row(null, "c1"), row(null, "c2"), row("d1", "c1"), row("d2", "c2"), row("d3", "c3")), "STRUCT", PARTITIONS); } @@ -559,7 +514,9 @@ public void testPartitionsTableSwitchFields() throws Exception { @Test public void testPartitionTableFilterAddRemoveFields() throws ParseException { // Create un-partitioned table - sql("CREATE TABLE %s (id bigint NOT NULL, category string, data string) USING iceberg", tableName); + sql( + "CREATE TABLE %s (id bigint NOT NULL, category string, data string) USING iceberg", + tableName); initTable(); sql("INSERT INTO TABLE %s VALUES (1, 'c1', 'd1')", tableName); @@ -567,28 +524,22 @@ public void testPartitionTableFilterAddRemoveFields() throws ParseException { // Partition Table with one partition column Table table = validationCatalog.loadTable(tableIdent); - table.updateSpec() - .addField("data") - .commit(); + table.updateSpec().addField("data").commit(); sql("REFRESH TABLE %s", tableName); sql("INSERT INTO TABLE %s VALUES (1, 'c1', 'd1')", tableName); sql("INSERT INTO TABLE %s VALUES (2, 'c2', 'd2')", tableName); assertPartitions( - ImmutableList.of(row("d2")), - "STRUCT", - PARTITIONS, - "partition.data = 'd2'"); + ImmutableList.of(row("d2")), "STRUCT", PARTITIONS, "partition.data = 'd2'"); // Partition Table with two partition column - table.updateSpec() - .addField("category") - .commit(); + table.updateSpec().addField("category").commit(); sql("REFRESH TABLE %s", tableName); sql("INSERT INTO TABLE %s VALUES (1, 'c1', 'd1')", tableName); sql("INSERT INTO TABLE %s VALUES (2, 'c2', 'd2')", tableName); - assertPartitions(ImmutableList.of(row("d2", null), row("d2", "c2")), + assertPartitions( + ImmutableList.of(row("d2", null), row("d2", "c2")), "STRUCT", PARTITIONS, "partition.data = 'd2'"); @@ -599,9 +550,7 @@ public void testPartitionTableFilterAddRemoveFields() throws ParseException { "partition.category = 'c2'"); // Partition Table with first partition column removed - table.updateSpec() - .removeField("data") - .commit(); + table.updateSpec().removeField("data").commit(); sql("REFRESH TABLE %s", tableName); sql("INSERT INTO TABLE %s VALUES (3, 'c3', 'd2')", tableName); @@ -621,49 +570,42 @@ public void testPartitionTableFilterAddRemoveFields() throws ParseException { @Test public void testPartitionTableFilterSwitchFields() throws Exception { - // Re-added partition fields currently not re-associated: https://github.com/apache/iceberg/issues/4292 + // Re-added partition fields currently not re-associated: + // https://github.com/apache/iceberg/issues/4292 // In V1, dropped 
partition fields show separately when field is re-added // In V2, re-added field currently conflicts with its deleted form Assume.assumeTrue(formatVersion == 1); - sql("CREATE TABLE %s (id bigint NOT NULL, category string, data string) USING iceberg", tableName); + sql( + "CREATE TABLE %s (id bigint NOT NULL, category string, data string) USING iceberg", + tableName); initTable(); sql("INSERT INTO TABLE %s VALUES (1, 'c1', 'd1')", tableName); sql("INSERT INTO TABLE %s VALUES (2, 'c2', 'd2')", tableName); Table table = validationCatalog.loadTable(tableIdent); // Two partition columns - table.updateSpec() - .addField("data") - .addField("category") - .commit(); + table.updateSpec().addField("data").addField("category").commit(); sql("REFRESH TABLE %s", tableName); sql("INSERT INTO TABLE %s VALUES (1, 'c1', 'd1')", tableName); sql("INSERT INTO TABLE %s VALUES (2, 'c2', 'd2')", tableName); // Drop first partition column - table.updateSpec() - .removeField("data") - .commit(); + table.updateSpec().removeField("data").commit(); sql("REFRESH TABLE %s", tableName); sql("INSERT INTO TABLE %s VALUES (1, 'c1', 'd1')", tableName); sql("INSERT INTO TABLE %s VALUES (2, 'c2', 'd2')", tableName); // Re-add first partition column at the end - table.updateSpec() - .addField("data") - .commit(); + table.updateSpec().addField("data").commit(); sql("REFRESH TABLE %s", tableName); sql("INSERT INTO TABLE %s VALUES (1, 'c1', 'd1')", tableName); sql("INSERT INTO TABLE %s VALUES (2, 'c2', 'd2')", tableName); assertPartitions( - ImmutableList.of( - row(null, "c2", null), - row(null, "c2", "d2"), - row("d2", "c2", null)), + ImmutableList.of(row(null, "c2", null), row(null, "c2", "d2"), row("d2", "c2", null)), "STRUCT", PARTITIONS, "partition.category = 'c2'"); @@ -677,22 +619,19 @@ public void testPartitionTableFilterSwitchFields() throws Exception { @Test public void testPartitionsTableFilterRenameFields() throws ParseException { - sql("CREATE TABLE %s (id bigint NOT NULL, category string, data string) USING iceberg", tableName); + sql( + "CREATE TABLE %s (id bigint NOT NULL, category string, data string) USING iceberg", + tableName); initTable(); Table table = validationCatalog.loadTable(tableIdent); - table.updateSpec() - .addField("data") - .addField("category") - .commit(); + table.updateSpec().addField("data").addField("category").commit(); sql("REFRESH TABLE %s", tableName); sql("INSERT INTO TABLE %s VALUES (1, 'c1', 'd1')", tableName); sql("INSERT INTO TABLE %s VALUES (2, 'c2', 'd2')", tableName); - table.updateSpec() - .renameField("category", "category_another_name") - .commit(); + table.updateSpec().renameField("category", "category_another_name").commit(); sql("REFRESH TABLE %s", tableName); sql("INSERT INTO TABLE %s VALUES (1, 'c1', 'd1')", tableName); sql("INSERT INTO TABLE %s VALUES (2, 'c2', 'd2')", tableName); @@ -706,15 +645,19 @@ public void testPartitionsTableFilterRenameFields() throws ParseException { @Test public void testMetadataTablesWithUnknownTransforms() { - sql("CREATE TABLE %s (id bigint NOT NULL, category string, data string) USING iceberg", tableName); + sql( + "CREATE TABLE %s (id bigint NOT NULL, category string, data string) USING iceberg", + tableName); initTable(); sql("INSERT INTO TABLE %s VALUES (1, 'a1', 'b1')", tableName); Table table = validationCatalog.loadTable(tableIdent); - PartitionSpec unknownSpec = PartitionSpecParser.fromJson(table.schema(), - "{ \"spec-id\": 1, \"fields\": [ { \"name\": \"id_zero\", \"transform\": \"zero\", \"source-id\": 1 } ] }"); + PartitionSpec 
unknownSpec = + PartitionSpecParser.fromJson( + table.schema(), + "{ \"spec-id\": 1, \"fields\": [ { \"name\": \"id_zero\", \"transform\": \"zero\", \"source-id\": 1 } ] }"); // replace the table spec to include an unknown transform TableOperations ops = ((HasTableOperations) table).operations(); @@ -724,29 +667,37 @@ public void testMetadataTablesWithUnknownTransforms() { sql("REFRESH TABLE %s", tableName); for (MetadataTableType tableType : Arrays.asList(FILES, ALL_DATA_FILES, ENTRIES, ALL_ENTRIES)) { - AssertHelpers.assertThrows("Should complain about the partition type", - ValidationException.class, "Cannot build table partition type, unknown transforms", + AssertHelpers.assertThrows( + "Should complain about the partition type", + ValidationException.class, + "Cannot build table partition type, unknown transforms", () -> loadMetadataTable(tableType)); } } @Test public void testPartitionColumnNamedPartition() { - sql("CREATE TABLE %s (id int, partition int) USING iceberg PARTITIONED BY (partition)", tableName); + sql( + "CREATE TABLE %s (id int, partition int) USING iceberg PARTITIONED BY (partition)", + tableName); sql("INSERT INTO %s VALUES (1, 1), (2, 1), (3, 2), (2, 2)", tableName); - List expected = ImmutableList.of( - row(1, 1), row(2, 1), row(3, 2), row(2, 2)); + List expected = ImmutableList.of(row(1, 1), row(2, 1), row(3, 2), row(2, 2)); assertEquals("Should return all expected rows", expected, sql("SELECT * FROM %s", tableName)); Assert.assertEquals(2, sql("SELECT * FROM %s.files", tableName).size()); } - private void assertPartitions(List expectedPartitions, String expectedTypeAsString, - MetadataTableType tableType) throws ParseException { + private void assertPartitions( + List expectedPartitions, String expectedTypeAsString, MetadataTableType tableType) + throws ParseException { assertPartitions(expectedPartitions, expectedTypeAsString, tableType, null); } - private void assertPartitions(List expectedPartitions, String expectedTypeAsString, - MetadataTableType tableType, String filter) throws ParseException { + private void assertPartitions( + List expectedPartitions, + String expectedTypeAsString, + MetadataTableType tableType, + String filter) + throws ParseException { Dataset df = loadMetadataTable(tableType); if (filter != null) { df = df.filter(filter); @@ -776,18 +727,18 @@ private void assertPartitions(List expectedPartitions, String expected case PARTITIONS: case FILES: case ALL_DATA_FILES: - List actualFilesPartitions = df.orderBy("partition") - .select("partition.*") - .collectAsList(); - assertEquals("Partitions must match", expectedPartitions, rowsToJava(actualFilesPartitions)); + List actualFilesPartitions = + df.orderBy("partition").select("partition.*").collectAsList(); + assertEquals( + "Partitions must match", expectedPartitions, rowsToJava(actualFilesPartitions)); break; case ENTRIES: case ALL_ENTRIES: - List actualEntriesPartitions = df.orderBy("data_file.partition") - .select("data_file.partition.*") - .collectAsList(); - assertEquals("Partitions must match", expectedPartitions, rowsToJava(actualEntriesPartitions)); + List actualEntriesPartitions = + df.orderBy("data_file.partition").select("data_file.partition.*").collectAsList(); + assertEquals( + "Partitions must match", expectedPartitions, rowsToJava(actualEntriesPartitions)); break; default: @@ -800,7 +751,9 @@ private Dataset loadMetadataTable(MetadataTableType tableType) { } private void initTable() { - sql("ALTER TABLE %s SET TBLPROPERTIES('%s' '%s')", tableName, DEFAULT_FILE_FORMAT, 
fileFormat.name()); + sql( + "ALTER TABLE %s SET TBLPROPERTIES('%s' '%s')", + tableName, DEFAULT_FILE_FORMAT, fileFormat.name()); sql("ALTER TABLE %s SET TBLPROPERTIES('%s' '%d')", tableName, FORMAT_VERSION, formatVersion); } } diff --git a/spark/v3.3/spark/src/test/java/org/apache/iceberg/spark/source/TestParquetScan.java b/spark/v3.3/spark/src/test/java/org/apache/iceberg/spark/source/TestParquetScan.java index adfe8c7d3649..f585ed360f95 100644 --- a/spark/v3.3/spark/src/test/java/org/apache/iceberg/spark/source/TestParquetScan.java +++ b/spark/v3.3/spark/src/test/java/org/apache/iceberg/spark/source/TestParquetScan.java @@ -16,9 +16,10 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.source; +import static org.apache.iceberg.Files.localOutput; + import java.io.File; import java.io.IOException; import java.util.List; @@ -52,8 +53,6 @@ import org.junit.runner.RunWith; import org.junit.runners.Parameterized; -import static org.apache.iceberg.Files.localOutput; - @RunWith(Parameterized.class) public class TestParquetScan extends AvroDataTest { private static final Configuration CONF = new Configuration(); @@ -72,12 +71,11 @@ public static void stopSpark() { currentSpark.stop(); } - @Rule - public TemporaryFolder temp = new TemporaryFolder(); + @Rule public TemporaryFolder temp = new TemporaryFolder(); @Parameterized.Parameters(name = "vectorized = {0}") public static Object[] parameters() { - return new Object[] { false, true }; + return new Object[] {false, true}; } private final boolean vectorized; @@ -88,18 +86,20 @@ public TestParquetScan(boolean vectorized) { @Override protected void writeAndValidate(Schema schema) throws IOException { - Assume.assumeTrue("Cannot handle non-string map keys in parquet-avro", - null == TypeUtil.find( - schema, - type -> type.isMapType() && type.asMapType().keyType() != Types.StringType.get())); + Assume.assumeTrue( + "Cannot handle non-string map keys in parquet-avro", + null + == TypeUtil.find( + schema, + type -> type.isMapType() && type.asMapType().keyType() != Types.StringType.get())); File parent = temp.newFolder("parquet"); File location = new File(parent, "test"); File dataFolder = new File(location, "data"); dataFolder.mkdirs(); - File parquetFile = new File(dataFolder, - FileFormat.PARQUET.addExtension(UUID.randomUUID().toString())); + File parquetFile = + new File(dataFolder, FileFormat.PARQUET.addExtension(UUID.randomUUID().toString())); HadoopTables tables = new HadoopTables(CONF); Table table = tables.create(schema, PartitionSpec.unpartitioned(), location.toString()); @@ -110,24 +110,25 @@ protected void writeAndValidate(Schema schema) throws IOException { List expected = RandomData.generateList(tableSchema, 100, 1L); - try (FileAppender writer = Parquet.write(localOutput(parquetFile)) - .schema(tableSchema) - .build()) { + try (FileAppender writer = + Parquet.write(localOutput(parquetFile)).schema(tableSchema).build()) { writer.addAll(expected); } - DataFile file = DataFiles.builder(PartitionSpec.unpartitioned()) - .withFileSizeInBytes(parquetFile.length()) - .withPath(parquetFile.toString()) - .withRecordCount(100) - .build(); + DataFile file = + DataFiles.builder(PartitionSpec.unpartitioned()) + .withFileSizeInBytes(parquetFile.length()) + .withPath(parquetFile.toString()) + .withRecordCount(100) + .build(); table.newAppend().appendFile(file).commit(); - table.updateProperties().set(TableProperties.PARQUET_VECTORIZATION_ENABLED, String.valueOf(vectorized)).commit(); + 
table + .updateProperties() + .set(TableProperties.PARQUET_VECTORIZATION_ENABLED, String.valueOf(vectorized)) + .commit(); - Dataset df = spark.read() - .format("iceberg") - .load(location.toString()); + Dataset df = spark.read().format("iceberg").load(location.toString()); List rows = df.collectAsList(); Assert.assertEquals("Should contain 100 rows", 100, rows.size()); diff --git a/spark/v3.3/spark/src/test/java/org/apache/iceberg/spark/source/TestPartitionPruning.java b/spark/v3.3/spark/src/test/java/org/apache/iceberg/spark/source/TestPartitionPruning.java index 24f7b69e1dc5..ffe21432f00c 100644 --- a/spark/v3.3/spark/src/test/java/org/apache/iceberg/spark/source/TestPartitionPruning.java +++ b/spark/v3.3/spark/src/test/java/org/apache/iceberg/spark/source/TestPartitionPruning.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.source; import java.io.File; @@ -78,11 +77,11 @@ public class TestPartitionPruning { @Parameterized.Parameters(name = "format = {0}, vectorized = {1}") public static Object[][] parameters() { return new Object[][] { - { "parquet", false }, - { "parquet", true }, - { "avro", false }, - { "orc", false }, - { "orc", true } + {"parquet", false}, + {"parquet", true}, + {"avro", false}, + {"orc", false}, + {"orc", true} }; } @@ -97,9 +96,12 @@ public TestPartitionPruning(String format, boolean vectorized) { private static SparkSession spark = null; private static JavaSparkContext sparkContext = null; - private static Transform bucketTransform = Transforms.bucket(Types.IntegerType.get(), 3); - private static Transform truncateTransform = Transforms.truncate(Types.StringType.get(), 5); - private static Transform hourTransform = Transforms.hour(Types.TimestampType.withoutZone()); + private static Transform bucketTransform = + Transforms.bucket(Types.IntegerType.get(), 3); + private static Transform truncateTransform = + Transforms.truncate(Types.StringType.get(), 5); + private static Transform hourTransform = + Transforms.hour(Types.TimestampType.withoutZone()); @BeforeClass public static void startSpark() { @@ -110,12 +112,21 @@ public static void startSpark() { CONF.set(optionKey, CountOpenLocalFileSystem.class.getName()); spark.conf().set(optionKey, CountOpenLocalFileSystem.class.getName()); spark.conf().set("spark.sql.session.timeZone", "UTC"); - spark.udf().register("bucket3", (Integer num) -> bucketTransform.apply(num), DataTypes.IntegerType); - spark.udf().register("truncate5", (String str) -> truncateTransform.apply(str), DataTypes.StringType); + spark + .udf() + .register("bucket3", (Integer num) -> bucketTransform.apply(num), DataTypes.IntegerType); + spark + .udf() + .register("truncate5", (String str) -> truncateTransform.apply(str), DataTypes.StringType); // NOTE: date transforms take the type long, not Timestamp - spark.udf().register("hour", (Timestamp ts) -> hourTransform.apply( - org.apache.spark.sql.catalyst.util.DateTimeUtils.fromJavaTimestamp(ts)), - DataTypes.IntegerType); + spark + .udf() + .register( + "hour", + (Timestamp ts) -> + hourTransform.apply( + org.apache.spark.sql.catalyst.util.DateTimeUtils.fromJavaTimestamp(ts)), + DataTypes.IntegerType); } @AfterClass @@ -125,70 +136,70 @@ public static void stopSpark() { currentSpark.stop(); } - private static final Schema LOG_SCHEMA = new Schema( - Types.NestedField.optional(1, "id", Types.IntegerType.get()), - Types.NestedField.optional(2, "date", Types.StringType.get()), - Types.NestedField.optional(3, "level", 
Types.StringType.get()), - Types.NestedField.optional(4, "message", Types.StringType.get()), - Types.NestedField.optional(5, "timestamp", Types.TimestampType.withZone()) - ); - - private static final List LOGS = ImmutableList.of( - LogMessage.debug("2020-02-02", "debug event 1", getInstant("2020-02-02T00:00:00")), - LogMessage.info("2020-02-02", "info event 1", getInstant("2020-02-02T01:00:00")), - LogMessage.debug("2020-02-02", "debug event 2", getInstant("2020-02-02T02:00:00")), - LogMessage.info("2020-02-03", "info event 2", getInstant("2020-02-03T00:00:00")), - LogMessage.debug("2020-02-03", "debug event 3", getInstant("2020-02-03T01:00:00")), - LogMessage.info("2020-02-03", "info event 3", getInstant("2020-02-03T02:00:00")), - LogMessage.error("2020-02-03", "error event 1", getInstant("2020-02-03T03:00:00")), - LogMessage.debug("2020-02-04", "debug event 4", getInstant("2020-02-04T01:00:00")), - LogMessage.warn("2020-02-04", "warn event 1", getInstant("2020-02-04T02:00:00")), - LogMessage.debug("2020-02-04", "debug event 5", getInstant("2020-02-04T03:00:00")) - ); + private static final Schema LOG_SCHEMA = + new Schema( + Types.NestedField.optional(1, "id", Types.IntegerType.get()), + Types.NestedField.optional(2, "date", Types.StringType.get()), + Types.NestedField.optional(3, "level", Types.StringType.get()), + Types.NestedField.optional(4, "message", Types.StringType.get()), + Types.NestedField.optional(5, "timestamp", Types.TimestampType.withZone())); + + private static final List LOGS = + ImmutableList.of( + LogMessage.debug("2020-02-02", "debug event 1", getInstant("2020-02-02T00:00:00")), + LogMessage.info("2020-02-02", "info event 1", getInstant("2020-02-02T01:00:00")), + LogMessage.debug("2020-02-02", "debug event 2", getInstant("2020-02-02T02:00:00")), + LogMessage.info("2020-02-03", "info event 2", getInstant("2020-02-03T00:00:00")), + LogMessage.debug("2020-02-03", "debug event 3", getInstant("2020-02-03T01:00:00")), + LogMessage.info("2020-02-03", "info event 3", getInstant("2020-02-03T02:00:00")), + LogMessage.error("2020-02-03", "error event 1", getInstant("2020-02-03T03:00:00")), + LogMessage.debug("2020-02-04", "debug event 4", getInstant("2020-02-04T01:00:00")), + LogMessage.warn("2020-02-04", "warn event 1", getInstant("2020-02-04T02:00:00")), + LogMessage.debug("2020-02-04", "debug event 5", getInstant("2020-02-04T03:00:00"))); private static Instant getInstant(String timestampWithoutZone) { - Long epochMicros = (Long) Literal.of(timestampWithoutZone).to(Types.TimestampType.withoutZone()).value(); + Long epochMicros = + (Long) Literal.of(timestampWithoutZone).to(Types.TimestampType.withoutZone()).value(); return Instant.ofEpochMilli(TimeUnit.MICROSECONDS.toMillis(epochMicros)); } - @Rule - public TemporaryFolder temp = new TemporaryFolder(); + @Rule public TemporaryFolder temp = new TemporaryFolder(); - private PartitionSpec spec = PartitionSpec.builderFor(LOG_SCHEMA) - .identity("date") - .identity("level") - .bucket("id", 3) - .truncate("message", 5) - .hour("timestamp") - .build(); + private PartitionSpec spec = + PartitionSpec.builderFor(LOG_SCHEMA) + .identity("date") + .identity("level") + .bucket("id", 3) + .truncate("message", 5) + .hour("timestamp") + .build(); @Test public void testPartitionPruningIdentityString() { String filterCond = "date >= '2020-02-03' AND level = 'DEBUG'"; - Predicate partCondition = (Row r) -> { - String date = r.getString(0); - String level = r.getString(1); - return date.compareTo("2020-02-03") >= 0 && level.equals("DEBUG"); - }; 
+ Predicate partCondition = + (Row r) -> { + String date = r.getString(0); + String level = r.getString(1); + return date.compareTo("2020-02-03") >= 0 && level.equals("DEBUG"); + }; runTest(filterCond, partCondition); } @Test public void testPartitionPruningBucketingInteger() { - final int[] ids = new int[]{ - LOGS.get(3).getId(), - LOGS.get(7).getId() - }; - String condForIds = Arrays.stream(ids).mapToObj(String::valueOf) - .collect(Collectors.joining(",", "(", ")")); + final int[] ids = new int[] {LOGS.get(3).getId(), LOGS.get(7).getId()}; + String condForIds = + Arrays.stream(ids).mapToObj(String::valueOf).collect(Collectors.joining(",", "(", ")")); String filterCond = "id in " + condForIds; - Predicate partCondition = (Row r) -> { - int bucketId = r.getInt(2); - Set buckets = Arrays.stream(ids).map(bucketTransform::apply) - .boxed().collect(Collectors.toSet()); - return buckets.contains(bucketId); - }; + Predicate partCondition = + (Row r) -> { + int bucketId = r.getInt(2); + Set buckets = + Arrays.stream(ids).map(bucketTransform::apply).boxed().collect(Collectors.toSet()); + return buckets.contains(bucketId); + }; runTest(filterCond, partCondition); } @@ -196,10 +207,11 @@ public void testPartitionPruningBucketingInteger() { @Test public void testPartitionPruningTruncatedString() { String filterCond = "message like 'info event%'"; - Predicate partCondition = (Row r) -> { - String truncatedMessage = r.getString(3); - return truncatedMessage.equals("info "); - }; + Predicate partCondition = + (Row r) -> { + String truncatedMessage = r.getString(3); + return truncatedMessage.equals("info "); + }; runTest(filterCond, partCondition); } @@ -207,10 +219,11 @@ public void testPartitionPruningTruncatedString() { @Test public void testPartitionPruningTruncatedStringComparingValueShorterThanPartitionValue() { String filterCond = "message like 'inf%'"; - Predicate partCondition = (Row r) -> { - String truncatedMessage = r.getString(3); - return truncatedMessage.startsWith("inf"); - }; + Predicate partCondition = + (Row r) -> { + String truncatedMessage = r.getString(3); + return truncatedMessage.startsWith("inf"); + }; runTest(filterCond, partCondition); } @@ -219,17 +232,20 @@ public void testPartitionPruningTruncatedStringComparingValueShorterThanPartitio public void testPartitionPruningHourlyPartition() { String filterCond; if (spark.version().startsWith("2")) { - // Looks like from Spark 2 we need to compare timestamp with timestamp to push down the filter. + // Looks like from Spark 2 we need to compare timestamp with timestamp to push down the + // filter. 
filterCond = "timestamp >= to_timestamp('2020-02-03T01:00:00')"; } else { filterCond = "timestamp >= '2020-02-03T01:00:00'"; } - Predicate partCondition = (Row r) -> { - int hourValue = r.getInt(4); - Instant instant = getInstant("2020-02-03T01:00:00"); - Integer hourValueToFilter = hourTransform.apply(TimeUnit.MILLISECONDS.toMicros(instant.toEpochMilli())); - return hourValue >= hourValueToFilter; - }; + Predicate partCondition = + (Row r) -> { + int hourValue = r.getInt(4); + Instant instant = getInstant("2020-02-03T01:00:00"); + Integer hourValueToFilter = + hourTransform.apply(TimeUnit.MILLISECONDS.toMicros(instant.toEpochMilli())); + return hourValue >= hourValueToFilter; + }; runTest(filterCond, partCondition); } @@ -242,24 +258,26 @@ private void runTest(String filterCond, Predicate partCondition) { Dataset logs = createTestDataset(); saveTestDatasetToTable(logs, table); - List expected = logs - .select("id", "date", "level", "message", "timestamp") - .filter(filterCond) - .orderBy("id") - .collectAsList(); + List expected = + logs.select("id", "date", "level", "message", "timestamp") + .filter(filterCond) + .orderBy("id") + .collectAsList(); Assert.assertFalse("Expected rows should be not empty", expected.isEmpty()); // remove records which may be recorded during storing to table CountOpenLocalFileSystem.resetRecordsInPathPrefix(originTableLocation.getAbsolutePath()); - List actual = spark.read() - .format("iceberg") - .option(SparkReadOptions.VECTORIZATION_ENABLED, String.valueOf(vectorized)) - .load(table.location()) - .select("id", "date", "level", "message", "timestamp") - .filter(filterCond) - .orderBy("id") - .collectAsList(); + List actual = + spark + .read() + .format("iceberg") + .option(SparkReadOptions.VECTORIZATION_ENABLED, String.valueOf(vectorized)) + .load(table.location()) + .select("id", "date", "level", "message", "timestamp") + .filter(filterCond) + .orderBy("id") + .collectAsList(); Assert.assertFalse("Actual rows should not be empty", actual.isEmpty()); Assert.assertEquals("Rows should match", expected, actual); @@ -282,40 +300,59 @@ private Table createTable(File originTableLocation) { } private Dataset createTestDataset() { - List rows = LOGS.stream().map(logMessage -> { - Object[] underlying = new Object[] { - logMessage.getId(), - UTF8String.fromString(logMessage.getDate()), - UTF8String.fromString(logMessage.getLevel()), - UTF8String.fromString(logMessage.getMessage()), - // discard the nanoseconds part to simplify - TimeUnit.MILLISECONDS.toMicros(logMessage.getTimestamp().toEpochMilli()) - }; - return new GenericInternalRow(underlying); - }).collect(Collectors.toList()); + List rows = + LOGS.stream() + .map( + logMessage -> { + Object[] underlying = + new Object[] { + logMessage.getId(), + UTF8String.fromString(logMessage.getDate()), + UTF8String.fromString(logMessage.getLevel()), + UTF8String.fromString(logMessage.getMessage()), + // discard the nanoseconds part to simplify + TimeUnit.MILLISECONDS.toMicros(logMessage.getTimestamp().toEpochMilli()) + }; + return new GenericInternalRow(underlying); + }) + .collect(Collectors.toList()); JavaRDD rdd = sparkContext.parallelize(rows); - Dataset df = spark.internalCreateDataFrame(JavaRDD.toRDD(rdd), SparkSchemaUtil.convert(LOG_SCHEMA), false); - - return df - .selectExpr("id", "date", "level", "message", "timestamp") - .selectExpr("id", "date", "level", "message", "timestamp", "bucket3(id) AS bucket_id", - "truncate5(message) AS truncated_message", "hour(timestamp) AS ts_hour"); + Dataset df = + 
spark.internalCreateDataFrame( + JavaRDD.toRDD(rdd), SparkSchemaUtil.convert(LOG_SCHEMA), false); + + return df.selectExpr("id", "date", "level", "message", "timestamp") + .selectExpr( + "id", + "date", + "level", + "message", + "timestamp", + "bucket3(id) AS bucket_id", + "truncate5(message) AS truncated_message", + "hour(timestamp) AS ts_hour"); } private void saveTestDatasetToTable(Dataset logs, Table table) { logs.orderBy("date", "level", "bucket_id", "truncated_message", "ts_hour") .select("id", "date", "level", "message", "timestamp") - .write().format("iceberg").mode("append").save(table.location()); + .write() + .format("iceberg") + .mode("append") + .save(table.location()); } - private void assertAccessOnDataFiles(File originTableLocation, Table table, Predicate partCondition) { + private void assertAccessOnDataFiles( + File originTableLocation, Table table, Predicate partCondition) { // only use files in current table location to avoid side-effects on concurrent test runs - Set readFilesInQuery = CountOpenLocalFileSystem.pathToNumOpenCalled.keySet() - .stream().filter(path -> path.startsWith(originTableLocation.getAbsolutePath())) - .collect(Collectors.toSet()); + Set readFilesInQuery = + CountOpenLocalFileSystem.pathToNumOpenCalled.keySet().stream() + .filter(path -> path.startsWith(originTableLocation.getAbsolutePath())) + .collect(Collectors.toSet()); - List files = spark.read().format("iceberg").load(table.location() + "#files").collectAsList(); + List files = + spark.read().format("iceberg").load(table.location() + "#files").collectAsList(); Set filesToRead = extractFilePathsMatchingConditionOnPartition(files, partCondition); Set filesToNotRead = extractFilePathsNotIn(files, filesToRead); @@ -325,37 +362,51 @@ private void assertAccessOnDataFiles(File originTableLocation, Table table, Pred Assert.assertFalse("The query should prune some data files.", filesToNotRead.isEmpty()); - // We don't check "all" data files bound to the condition are being read, as data files can be pruned on + // We don't check "all" data files bound to the condition are being read, as data files can be + // pruned on // other conditions like lower/upper bound of columns. - Assert.assertFalse("Some of data files in partition range should be read. " + - "Read files in query: " + readFilesInQuery + " / data files in partition range: " + filesToRead, + Assert.assertFalse( + "Some of data files in partition range should be read. " + + "Read files in query: " + + readFilesInQuery + + " / data files in partition range: " + + filesToRead, Sets.intersection(filesToRead, readFilesInQuery).isEmpty()); // Data files which aren't bound to the condition shouldn't be read. - Assert.assertTrue("Data files outside of partition range should not be read. " + - "Read files in query: " + readFilesInQuery + " / data files outside of partition range: " + filesToNotRead, + Assert.assertTrue( + "Data files outside of partition range should not be read. 
" + + "Read files in query: " + + readFilesInQuery + + " / data files outside of partition range: " + + filesToNotRead, Sets.intersection(filesToNotRead, readFilesInQuery).isEmpty()); } - private Set extractFilePathsMatchingConditionOnPartition(List files, Predicate condition) { + private Set extractFilePathsMatchingConditionOnPartition( + List files, Predicate condition) { // idx 1: file_path, idx 3: partition return files.stream() - .filter(r -> { - Row partition = r.getStruct(4); - return condition.test(partition); - }).map(r -> CountOpenLocalFileSystem.stripScheme(r.getString(1))) + .filter( + r -> { + Row partition = r.getStruct(4); + return condition.test(partition); + }) + .map(r -> CountOpenLocalFileSystem.stripScheme(r.getString(1))) .collect(Collectors.toSet()); } private Set extractFilePathsNotIn(List files, Set filePaths) { - Set allFilePaths = files.stream().map(r -> CountOpenLocalFileSystem.stripScheme(r.getString(1))) - .collect(Collectors.toSet()); + Set allFilePaths = + files.stream() + .map(r -> CountOpenLocalFileSystem.stripScheme(r.getString(1))) + .collect(Collectors.toSet()); return Sets.newHashSet(Sets.symmetricDifference(allFilePaths, filePaths)); } public static class CountOpenLocalFileSystem extends RawLocalFileSystem { - public static String scheme = String.format("TestIdentityPartitionData%dfs", - new Random().nextInt()); + public static String scheme = + String.format("TestIdentityPartitionData%dfs", new Random().nextInt()); public static Map pathToNumOpenCalled = Maps.newConcurrentMap(); public static String convertPath(String absPath) { @@ -401,13 +452,15 @@ public String getScheme() { @Override public FSDataInputStream open(Path f, int bufferSize) throws IOException { String path = f.toUri().getPath(); - pathToNumOpenCalled.compute(path, (ignored, v) -> { - if (v == null) { - return 1L; - } else { - return v + 1; - } - }); + pathToNumOpenCalled.compute( + path, + (ignored, v) -> { + if (v == null) { + return 1L; + } else { + return v + 1; + } + }); return super.open(f, bufferSize); } } diff --git a/spark/v3.3/spark/src/test/java/org/apache/iceberg/spark/source/TestPartitionValues.java b/spark/v3.3/spark/src/test/java/org/apache/iceberg/spark/source/TestPartitionValues.java index 9d1b49f1aa8e..df2e4649d9f9 100644 --- a/spark/v3.3/spark/src/test/java/org/apache/iceberg/spark/source/TestPartitionValues.java +++ b/spark/v3.3/spark/src/test/java/org/apache/iceberg/spark/source/TestPartitionValues.java @@ -16,9 +16,11 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.spark.source; +import static org.apache.iceberg.types.Types.NestedField.optional; +import static org.apache.iceberg.types.Types.NestedField.required; + import java.io.File; import java.util.List; import org.apache.avro.generic.GenericData; @@ -56,46 +58,43 @@ import org.junit.runner.RunWith; import org.junit.runners.Parameterized; -import static org.apache.iceberg.types.Types.NestedField.optional; -import static org.apache.iceberg.types.Types.NestedField.required; - @RunWith(Parameterized.class) public class TestPartitionValues { @Parameterized.Parameters(name = "format = {0}, vectorized = {1}") public static Object[][] parameters() { return new Object[][] { - { "parquet", false }, - { "parquet", true }, - { "avro", false }, - { "orc", false }, - { "orc", true } + {"parquet", false}, + {"parquet", true}, + {"avro", false}, + {"orc", false}, + {"orc", true} }; } - private static final Schema SUPPORTED_PRIMITIVES = new Schema( - required(100, "id", Types.LongType.get()), - required(101, "data", Types.StringType.get()), - required(102, "b", Types.BooleanType.get()), - required(103, "i", Types.IntegerType.get()), - required(104, "l", Types.LongType.get()), - required(105, "f", Types.FloatType.get()), - required(106, "d", Types.DoubleType.get()), - required(107, "date", Types.DateType.get()), - required(108, "ts", Types.TimestampType.withZone()), - required(110, "s", Types.StringType.get()), - required(113, "bytes", Types.BinaryType.get()), - required(114, "dec_9_0", Types.DecimalType.of(9, 0)), - required(115, "dec_11_2", Types.DecimalType.of(11, 2)), - required(116, "dec_38_10", Types.DecimalType.of(38, 10)) // spark's maximum precision - ); - - private static final Schema SIMPLE_SCHEMA = new Schema( - optional(1, "id", Types.IntegerType.get()), - optional(2, "data", Types.StringType.get())); - - private static final PartitionSpec SPEC = PartitionSpec.builderFor(SIMPLE_SCHEMA) - .identity("data") - .build(); + private static final Schema SUPPORTED_PRIMITIVES = + new Schema( + required(100, "id", Types.LongType.get()), + required(101, "data", Types.StringType.get()), + required(102, "b", Types.BooleanType.get()), + required(103, "i", Types.IntegerType.get()), + required(104, "l", Types.LongType.get()), + required(105, "f", Types.FloatType.get()), + required(106, "d", Types.DoubleType.get()), + required(107, "date", Types.DateType.get()), + required(108, "ts", Types.TimestampType.withZone()), + required(110, "s", Types.StringType.get()), + required(113, "bytes", Types.BinaryType.get()), + required(114, "dec_9_0", Types.DecimalType.of(9, 0)), + required(115, "dec_11_2", Types.DecimalType.of(11, 2)), + required(116, "dec_38_10", Types.DecimalType.of(38, 10)) // spark's maximum precision + ); + + private static final Schema SIMPLE_SCHEMA = + new Schema( + optional(1, "id", Types.IntegerType.get()), optional(2, "data", Types.StringType.get())); + + private static final PartitionSpec SPEC = + PartitionSpec.builderFor(SIMPLE_SCHEMA).identity("data").build(); private static SparkSession spark = null; @@ -111,8 +110,7 @@ public static void stopSpark() { currentSpark.stop(); } - @Rule - public TemporaryFolder temp = new TemporaryFolder(); + @Rule public TemporaryFolder temp = new TemporaryFolder(); private final String format; private final boolean vectorized; @@ -134,29 +132,30 @@ public void testNullPartitionValue() throws Exception { Table table = tables.create(SIMPLE_SCHEMA, SPEC, location.toString()); table.updateProperties().set(TableProperties.DEFAULT_FILE_FORMAT, 
format).commit(); - List expected = Lists.newArrayList( - new SimpleRecord(1, "a"), - new SimpleRecord(2, "b"), - new SimpleRecord(3, "c"), - new SimpleRecord(4, null) - ); + List expected = + Lists.newArrayList( + new SimpleRecord(1, "a"), + new SimpleRecord(2, "b"), + new SimpleRecord(3, "c"), + new SimpleRecord(4, null)); Dataset df = spark.createDataFrame(expected, SimpleRecord.class); - df.select("id", "data").write() + df.select("id", "data") + .write() .format("iceberg") .mode(SaveMode.Append) .save(location.toString()); - Dataset result = spark.read() - .format("iceberg") - .option(SparkReadOptions.VECTORIZATION_ENABLED, String.valueOf(vectorized)) - .load(location.toString()); + Dataset result = + spark + .read() + .format("iceberg") + .option(SparkReadOptions.VECTORIZATION_ENABLED, String.valueOf(vectorized)) + .load(location.toString()); - List actual = result - .orderBy("id") - .as(Encoders.bean(SimpleRecord.class)) - .collectAsList(); + List actual = + result.orderBy("id").as(Encoders.bean(SimpleRecord.class)).collectAsList(); Assert.assertEquals("Number of rows should match", expected.size(), actual.size()); Assert.assertEquals("Result rows should match", expected, actual); @@ -174,29 +173,28 @@ public void testReorderedColumns() throws Exception { Table table = tables.create(SIMPLE_SCHEMA, SPEC, location.toString()); table.updateProperties().set(TableProperties.DEFAULT_FILE_FORMAT, format).commit(); - List expected = Lists.newArrayList( - new SimpleRecord(1, "a"), - new SimpleRecord(2, "b"), - new SimpleRecord(3, "c") - ); + List expected = + Lists.newArrayList( + new SimpleRecord(1, "a"), new SimpleRecord(2, "b"), new SimpleRecord(3, "c")); Dataset df = spark.createDataFrame(expected, SimpleRecord.class); - df.select("data", "id").write() - .format("iceberg") - .mode(SaveMode.Append) - .option(SparkWriteOptions.CHECK_ORDERING, "false") - .save(location.toString()); + df.select("data", "id") + .write() + .format("iceberg") + .mode(SaveMode.Append) + .option(SparkWriteOptions.CHECK_ORDERING, "false") + .save(location.toString()); - Dataset result = spark.read() + Dataset result = + spark + .read() .format("iceberg") .option(SparkReadOptions.VECTORIZATION_ENABLED, String.valueOf(vectorized)) .load(location.toString()); - List actual = result - .orderBy("id") - .as(Encoders.bean(SimpleRecord.class)) - .collectAsList(); + List actual = + result.orderBy("id").as(Encoders.bean(SimpleRecord.class)).collectAsList(); Assert.assertEquals("Number of rows should match", expected.size(), actual.size()); Assert.assertEquals("Result rows should match", expected, actual); @@ -214,30 +212,29 @@ public void testReorderedColumnsNoNullability() throws Exception { Table table = tables.create(SIMPLE_SCHEMA, SPEC, location.toString()); table.updateProperties().set(TableProperties.DEFAULT_FILE_FORMAT, format).commit(); - List expected = Lists.newArrayList( - new SimpleRecord(1, "a"), - new SimpleRecord(2, "b"), - new SimpleRecord(3, "c") - ); + List expected = + Lists.newArrayList( + new SimpleRecord(1, "a"), new SimpleRecord(2, "b"), new SimpleRecord(3, "c")); Dataset df = spark.createDataFrame(expected, SimpleRecord.class); - df.select("data", "id").write() - .format("iceberg") - .mode(SaveMode.Append) - .option(SparkWriteOptions.CHECK_ORDERING, "false") - .option(SparkWriteOptions.CHECK_NULLABILITY, "false") - .save(location.toString()); + df.select("data", "id") + .write() + .format("iceberg") + .mode(SaveMode.Append) + .option(SparkWriteOptions.CHECK_ORDERING, "false") + 
.option(SparkWriteOptions.CHECK_NULLABILITY, "false") + .save(location.toString()); - Dataset result = spark.read() + Dataset result = + spark + .read() .format("iceberg") .option(SparkReadOptions.VECTORIZATION_ENABLED, String.valueOf(vectorized)) .load(location.toString()); - List actual = result - .orderBy("id") - .as(Encoders.bean(SimpleRecord.class)) - .collectAsList(); + List actual = + result.orderBy("id").as(Encoders.bean(SimpleRecord.class)).collectAsList(); Assert.assertEquals("Number of rows should match", expected.size(), actual.size()); Assert.assertEquals("Result rows should match", expected, actual); @@ -245,9 +242,10 @@ public void testReorderedColumnsNoNullability() throws Exception { @Test public void testPartitionValueTypes() throws Exception { - String[] columnNames = new String[] { - "b", "i", "l", "f", "d", "date", "ts", "s", "bytes", "dec_9_0", "dec_11_2", "dec_38_10" - }; + String[] columnNames = + new String[] { + "b", "i", "l", "f", "d", "date", "ts", "s", "bytes", "dec_9_0", "dec_11_2", "dec_38_10" + }; HadoopTables tables = new HadoopTables(spark.sessionState().newHadoopConf()); @@ -259,23 +257,27 @@ public void testPartitionValueTypes() throws Exception { List expected = RandomData.generateList(source.schema(), 2, 128735L); File avroData = temp.newFile("data.avro"); Assert.assertTrue(avroData.delete()); - try (FileAppender appender = Avro.write(Files.localOutput(avroData)) - .schema(source.schema()) - .build()) { + try (FileAppender appender = + Avro.write(Files.localOutput(avroData)).schema(source.schema()).build()) { appender.addAll(expected); } // add the Avro data file to the source table - source.newAppend() - .appendFile(DataFiles.builder(PartitionSpec.unpartitioned()) - .withRecordCount(10) - .withInputFile(Files.localInput(avroData)) - .build()) + source + .newAppend() + .appendFile( + DataFiles.builder(PartitionSpec.unpartitioned()) + .withRecordCount(10) + .withInputFile(Files.localInput(avroData)) + .build()) .commit(); - Dataset sourceDF = spark.read().format("iceberg") - .option(SparkReadOptions.VECTORIZATION_ENABLED, String.valueOf(vectorized)) - .load(sourceLocation); + Dataset sourceDF = + spark + .read() + .format("iceberg") + .option(SparkReadOptions.VECTORIZATION_ENABLED, String.valueOf(vectorized)) + .load(sourceLocation); for (String column : columnNames) { String desc = "partition_by_" + SUPPORTED_PRIMITIVES.findType(column).toString(); @@ -290,17 +292,20 @@ public void testPartitionValueTypes() throws Exception { Table table = tables.create(SUPPORTED_PRIMITIVES, spec, location.toString()); table.updateProperties().set(TableProperties.DEFAULT_FILE_FORMAT, format).commit(); - sourceDF.write() + sourceDF + .write() .format("iceberg") .mode(SaveMode.Append) .option(SparkWriteOptions.USE_TABLE_DISTRIBUTION_AND_ORDERING, "false") .save(location.toString()); - List actual = spark.read() - .format("iceberg") - .option(SparkReadOptions.VECTORIZATION_ENABLED, String.valueOf(vectorized)) - .load(location.toString()) - .collectAsList(); + List actual = + spark + .read() + .format("iceberg") + .option(SparkReadOptions.VECTORIZATION_ENABLED, String.valueOf(vectorized)) + .load(location.toString()) + .collectAsList(); Assert.assertEquals("Number of rows should match", expected.size(), actual.size()); @@ -313,9 +318,10 @@ public void testPartitionValueTypes() throws Exception { @Test public void testNestedPartitionValues() throws Exception { - String[] columnNames = new String[] { - "b", "i", "l", "f", "d", "date", "ts", "s", "bytes", "dec_9_0", 
"dec_11_2", "dec_38_10" - }; + String[] columnNames = + new String[] { + "b", "i", "l", "f", "d", "date", "ts", "s", "bytes", "dec_9_0", "dec_11_2", "dec_38_10" + }; HadoopTables tables = new HadoopTables(spark.sessionState().newHadoopConf()); Schema nestedSchema = new Schema(optional(1, "nested", SUPPORTED_PRIMITIVES.asStruct())); @@ -328,23 +334,27 @@ public void testNestedPartitionValues() throws Exception { List expected = RandomData.generateList(source.schema(), 2, 128735L); File avroData = temp.newFile("data.avro"); Assert.assertTrue(avroData.delete()); - try (FileAppender appender = Avro.write(Files.localOutput(avroData)) - .schema(source.schema()) - .build()) { + try (FileAppender appender = + Avro.write(Files.localOutput(avroData)).schema(source.schema()).build()) { appender.addAll(expected); } // add the Avro data file to the source table - source.newAppend() - .appendFile(DataFiles.builder(PartitionSpec.unpartitioned()) - .withRecordCount(10) - .withInputFile(Files.localInput(avroData)) - .build()) + source + .newAppend() + .appendFile( + DataFiles.builder(PartitionSpec.unpartitioned()) + .withRecordCount(10) + .withInputFile(Files.localInput(avroData)) + .build()) .commit(); - Dataset sourceDF = spark.read().format("iceberg") - .option(SparkReadOptions.VECTORIZATION_ENABLED, String.valueOf(vectorized)) - .load(sourceLocation); + Dataset sourceDF = + spark + .read() + .format("iceberg") + .option(SparkReadOptions.VECTORIZATION_ENABLED, String.valueOf(vectorized)) + .load(sourceLocation); for (String column : columnNames) { String desc = "partition_by_" + SUPPORTED_PRIMITIVES.findType(column).toString(); @@ -354,46 +364,51 @@ public void testNestedPartitionValues() throws Exception { File dataFolder = new File(location, "data"); Assert.assertTrue("mkdirs should succeed", dataFolder.mkdirs()); - PartitionSpec spec = PartitionSpec.builderFor(nestedSchema).identity("nested." + column).build(); + PartitionSpec spec = + PartitionSpec.builderFor(nestedSchema).identity("nested." + column).build(); Table table = tables.create(nestedSchema, spec, location.toString()); table.updateProperties().set(TableProperties.DEFAULT_FILE_FORMAT, format).commit(); - sourceDF.write() + sourceDF + .write() .format("iceberg") .mode(SaveMode.Append) .option(SparkWriteOptions.USE_TABLE_DISTRIBUTION_AND_ORDERING, "false") .save(location.toString()); - List actual = spark.read() - .format("iceberg") - .option(SparkReadOptions.VECTORIZATION_ENABLED, String.valueOf(vectorized)) - .load(location.toString()) - .collectAsList(); + List actual = + spark + .read() + .format("iceberg") + .option(SparkReadOptions.VECTORIZATION_ENABLED, String.valueOf(vectorized)) + .load(location.toString()) + .collectAsList(); Assert.assertEquals("Number of rows should match", expected.size(), actual.size()); for (int i = 0; i < expected.size(); i += 1) { - TestHelpers.assertEqualsSafe( - nestedSchema.asStruct(), expected.get(i), actual.get(i)); + TestHelpers.assertEqualsSafe(nestedSchema.asStruct(), expected.get(i), actual.get(i)); } } } /** * To verify if WrappedPositionAccessor is generated against a string field within a nested field, - * rather than a Position2Accessor. - * Or when building the partition path, a ClassCastException is thrown with the message like: - * Cannot cast org.apache.spark.unsafe.types.UTF8String to java.lang.CharSequence + * rather than a Position2Accessor. 
Or when building the partition path, a ClassCastException is + * thrown with the message like: Cannot cast org.apache.spark.unsafe.types.UTF8String to + * java.lang.CharSequence */ @Test public void testPartitionedByNestedString() throws Exception { // schema and partition spec - Schema nestedSchema = new Schema( - Types.NestedField.required(1, "struct", - Types.StructType.of(Types.NestedField.required(2, "string", Types.StringType.get())) - ) - ); + Schema nestedSchema = + new Schema( + Types.NestedField.required( + 1, + "struct", + Types.StructType.of( + Types.NestedField.required(2, "string", Types.StringType.get())))); PartitionSpec spec = PartitionSpec.builderFor(nestedSchema).identity("struct.string").build(); // create table @@ -403,14 +418,14 @@ public void testPartitionedByNestedString() throws Exception { // input data frame StructField[] structFields = { - new StructField("struct", - DataTypes.createStructType( - new StructField[] { - new StructField("string", DataTypes.StringType, false, Metadata.empty()) - } - ), - false, Metadata.empty() - ) + new StructField( + "struct", + DataTypes.createStructType( + new StructField[] { + new StructField("string", DataTypes.StringType, false, Metadata.empty()) + }), + false, + Metadata.empty()) }; List rows = Lists.newArrayList(); @@ -418,17 +433,16 @@ public void testPartitionedByNestedString() throws Exception { Dataset sourceDF = spark.createDataFrame(rows, new StructType(structFields)); // write into iceberg - sourceDF.write() - .format("iceberg") - .mode(SaveMode.Append) - .save(baseLocation); + sourceDF.write().format("iceberg").mode(SaveMode.Append).save(baseLocation); // verify - List actual = spark.read() - .format("iceberg") - .option(SparkReadOptions.VECTORIZATION_ENABLED, String.valueOf(vectorized)) - .load(baseLocation) - .collectAsList(); + List actual = + spark + .read() + .format("iceberg") + .option(SparkReadOptions.VECTORIZATION_ENABLED, String.valueOf(vectorized)) + .load(baseLocation) + .collectAsList(); Assert.assertEquals("Number of rows should match", rows.size(), actual.size()); } diff --git a/spark/v3.3/spark/src/test/java/org/apache/iceberg/spark/source/TestPathIdentifier.java b/spark/v3.3/spark/src/test/java/org/apache/iceberg/spark/source/TestPathIdentifier.java index ff4fe22a7a8a..f58451296cef 100644 --- a/spark/v3.3/spark/src/test/java/org/apache/iceberg/spark/source/TestPathIdentifier.java +++ b/spark/v3.3/spark/src/test/java/org/apache/iceberg/spark/source/TestPathIdentifier.java @@ -16,9 +16,10 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.spark.source; +import static org.apache.iceberg.types.Types.NestedField.required; + import java.io.File; import java.io.IOException; import org.apache.iceberg.BaseTable; @@ -42,16 +43,13 @@ import org.junit.Test; import org.junit.rules.TemporaryFolder; -import static org.apache.iceberg.types.Types.NestedField.required; - public class TestPathIdentifier extends SparkTestBase { - private static final Schema SCHEMA = new Schema( - required(1, "id", Types.LongType.get()), - required(2, "data", Types.StringType.get())); + private static final Schema SCHEMA = + new Schema( + required(1, "id", Types.LongType.get()), required(2, "data", Types.StringType.get())); - @Rule - public TemporaryFolder temp = new TemporaryFolder(); + @Rule public TemporaryFolder temp = new TemporaryFolder(); private File tableLocation; private PathIdentifier identifier; private SparkCatalog sparkCatalog; @@ -72,17 +70,16 @@ public void after() { @Test public void testPathIdentifier() throws TableAlreadyExistsException, NoSuchTableException { - SparkTable table = sparkCatalog.createTable(identifier, - SparkSchemaUtil.convert(SCHEMA), - new Transform[0], - ImmutableMap.of()); + SparkTable table = + sparkCatalog.createTable( + identifier, SparkSchemaUtil.convert(SCHEMA), new Transform[0], ImmutableMap.of()); Assert.assertEquals(table.table().location(), tableLocation.getAbsolutePath()); Assertions.assertThat(table.table()).isInstanceOf(BaseTable.class); - Assertions.assertThat(((BaseTable) table.table()).operations()).isInstanceOf(HadoopTableOperations.class); + Assertions.assertThat(((BaseTable) table.table()).operations()) + .isInstanceOf(HadoopTableOperations.class); Assert.assertEquals(sparkCatalog.loadTable(identifier), table); Assert.assertTrue(sparkCatalog.dropTable(identifier)); } } - diff --git a/spark/v3.3/spark/src/test/java/org/apache/iceberg/spark/source/TestReadProjection.java b/spark/v3.3/spark/src/test/java/org/apache/iceberg/spark/source/TestReadProjection.java index 8d65b64cab6d..cfc746f6e932 100644 --- a/spark/v3.3/spark/src/test/java/org/apache/iceberg/spark/source/TestReadProjection.java +++ b/spark/v3.3/spark/src/test/java/org/apache/iceberg/spark/source/TestReadProjection.java @@ -16,9 +16,10 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.spark.source; +import static org.apache.avro.Schema.Type.UNION; + import java.io.IOException; import java.util.List; import java.util.Map; @@ -37,8 +38,6 @@ import org.junit.Test; import org.junit.rules.TemporaryFolder; -import static org.apache.avro.Schema.Type.UNION; - public abstract class TestReadProjection { final String format; @@ -46,20 +45,17 @@ public abstract class TestReadProjection { this.format = format; } - protected abstract Record writeAndRead(String desc, - Schema writeSchema, - Schema readSchema, - Record record) throws IOException; + protected abstract Record writeAndRead( + String desc, Schema writeSchema, Schema readSchema, Record record) throws IOException; - @Rule - public TemporaryFolder temp = new TemporaryFolder(); + @Rule public TemporaryFolder temp = new TemporaryFolder(); @Test public void testFullProjection() throws Exception { - Schema schema = new Schema( - Types.NestedField.required(0, "id", Types.LongType.get()), - Types.NestedField.optional(1, "data", Types.StringType.get()) - ); + Schema schema = + new Schema( + Types.NestedField.required(0, "id", Types.LongType.get()), + Types.NestedField.optional(1, "data", Types.StringType.get())); Record record = GenericRecord.create(schema); record.setField("id", 34L); @@ -67,32 +63,33 @@ public void testFullProjection() throws Exception { Record projected = writeAndRead("full_projection", schema, schema, record); - Assert.assertEquals("Should contain the correct id value", 34L, (long) projected.getField("id")); + Assert.assertEquals( + "Should contain the correct id value", 34L, (long) projected.getField("id")); - int cmp = Comparators.charSequences() - .compare("test", (CharSequence) projected.getField("data")); + int cmp = + Comparators.charSequences().compare("test", (CharSequence) projected.getField("data")); Assert.assertEquals("Should contain the correct data value", 0, cmp); } @Test public void testReorderedFullProjection() throws Exception { -// Assume.assumeTrue( -// "Spark's Parquet read support does not support reordered columns", -// !format.equalsIgnoreCase("parquet")); + // Assume.assumeTrue( + // "Spark's Parquet read support does not support reordered columns", + // !format.equalsIgnoreCase("parquet")); - Schema schema = new Schema( - Types.NestedField.required(0, "id", Types.LongType.get()), - Types.NestedField.optional(1, "data", Types.StringType.get()) - ); + Schema schema = + new Schema( + Types.NestedField.required(0, "id", Types.LongType.get()), + Types.NestedField.optional(1, "data", Types.StringType.get())); Record record = GenericRecord.create(schema); record.setField("id", 34L); record.setField("data", "test"); - Schema reordered = new Schema( - Types.NestedField.optional(1, "data", Types.StringType.get()), - Types.NestedField.required(0, "id", Types.LongType.get()) - ); + Schema reordered = + new Schema( + Types.NestedField.optional(1, "data", Types.StringType.get()), + Types.NestedField.required(0, "id", Types.LongType.get())); Record projected = writeAndRead("reordered_full_projection", schema, reordered, record); @@ -102,24 +99,24 @@ public void testReorderedFullProjection() throws Exception { @Test public void testReorderedProjection() throws Exception { -// Assume.assumeTrue( -// "Spark's Parquet read support does not support reordered columns", -// !format.equalsIgnoreCase("parquet")); + // Assume.assumeTrue( + // "Spark's Parquet read support does not support reordered columns", + // !format.equalsIgnoreCase("parquet")); - Schema schema = new Schema( 
- Types.NestedField.required(0, "id", Types.LongType.get()), - Types.NestedField.optional(1, "data", Types.StringType.get()) - ); + Schema schema = + new Schema( + Types.NestedField.required(0, "id", Types.LongType.get()), + Types.NestedField.optional(1, "data", Types.StringType.get())); Record record = GenericRecord.create(schema); record.setField("id", 34L); record.setField("data", "test"); - Schema reordered = new Schema( - Types.NestedField.optional(2, "missing_1", Types.StringType.get()), - Types.NestedField.optional(1, "data", Types.StringType.get()), - Types.NestedField.optional(3, "missing_2", Types.LongType.get()) - ); + Schema reordered = + new Schema( + Types.NestedField.optional(2, "missing_1", Types.StringType.get()), + Types.NestedField.optional(1, "data", Types.StringType.get()), + Types.NestedField.optional(3, "missing_2", Types.LongType.get())); Record projected = writeAndRead("reordered_projection", schema, reordered, record); @@ -130,10 +127,10 @@ public void testReorderedProjection() throws Exception { @Test public void testEmptyProjection() throws Exception { - Schema schema = new Schema( - Types.NestedField.required(0, "id", Types.LongType.get()), - Types.NestedField.optional(1, "data", Types.StringType.get()) - ); + Schema schema = + new Schema( + Types.NestedField.required(0, "id", Types.LongType.get()), + Types.NestedField.optional(1, "data", Types.StringType.get())); Record record = GenericRecord.create(schema); record.setField("id", 34L); @@ -152,68 +149,68 @@ public void testEmptyProjection() throws Exception { @Test public void testBasicProjection() throws Exception { - Schema writeSchema = new Schema( - Types.NestedField.required(0, "id", Types.LongType.get()), - Types.NestedField.optional(1, "data", Types.StringType.get()) - ); + Schema writeSchema = + new Schema( + Types.NestedField.required(0, "id", Types.LongType.get()), + Types.NestedField.optional(1, "data", Types.StringType.get())); Record record = GenericRecord.create(writeSchema); record.setField("id", 34L); record.setField("data", "test"); - Schema idOnly = new Schema( - Types.NestedField.required(0, "id", Types.LongType.get()) - ); + Schema idOnly = new Schema(Types.NestedField.required(0, "id", Types.LongType.get())); Record projected = writeAndRead("basic_projection_id", writeSchema, idOnly, record); Assert.assertNull("Should not project data", projected.getField("data")); - Assert.assertEquals("Should contain the correct id value", 34L, (long) projected.getField("id")); + Assert.assertEquals( + "Should contain the correct id value", 34L, (long) projected.getField("id")); - Schema dataOnly = new Schema( - Types.NestedField.optional(1, "data", Types.StringType.get()) - ); + Schema dataOnly = new Schema(Types.NestedField.optional(1, "data", Types.StringType.get())); projected = writeAndRead("basic_projection_data", writeSchema, dataOnly, record); Assert.assertNull("Should not project id", projected.getField("id")); - int cmp = Comparators.charSequences() - .compare("test", (CharSequence) projected.getField("data")); + int cmp = + Comparators.charSequences().compare("test", (CharSequence) projected.getField("data")); Assert.assertEquals("Should contain the correct data value", 0, cmp); } @Test public void testRename() throws Exception { - Schema writeSchema = new Schema( - Types.NestedField.required(0, "id", Types.LongType.get()), - Types.NestedField.optional(1, "data", Types.StringType.get()) - ); + Schema writeSchema = + new Schema( + Types.NestedField.required(0, "id", Types.LongType.get()), + 
Types.NestedField.optional(1, "data", Types.StringType.get())); Record record = GenericRecord.create(writeSchema); record.setField("id", 34L); record.setField("data", "test"); - Schema readSchema = new Schema( - Types.NestedField.required(0, "id", Types.LongType.get()), - Types.NestedField.optional(1, "renamed", Types.StringType.get()) - ); + Schema readSchema = + new Schema( + Types.NestedField.required(0, "id", Types.LongType.get()), + Types.NestedField.optional(1, "renamed", Types.StringType.get())); Record projected = writeAndRead("project_and_rename", writeSchema, readSchema, record); - Assert.assertEquals("Should contain the correct id value", 34L, (long) projected.getField("id")); - int cmp = Comparators.charSequences() - .compare("test", (CharSequence) projected.getField("renamed")); + Assert.assertEquals( + "Should contain the correct id value", 34L, (long) projected.getField("id")); + int cmp = + Comparators.charSequences().compare("test", (CharSequence) projected.getField("renamed")); Assert.assertEquals("Should contain the correct data/renamed value", 0, cmp); } @Test public void testNestedStructProjection() throws Exception { - Schema writeSchema = new Schema( - Types.NestedField.required(0, "id", Types.LongType.get()), - Types.NestedField.optional(3, "location", Types.StructType.of( - Types.NestedField.required(1, "lat", Types.FloatType.get()), - Types.NestedField.required(2, "long", Types.FloatType.get()) - )) - ); + Schema writeSchema = + new Schema( + Types.NestedField.required(0, "id", Types.LongType.get()), + Types.NestedField.optional( + 3, + "location", + Types.StructType.of( + Types.NestedField.required(1, "lat", Types.FloatType.get()), + Types.NestedField.required(2, "long", Types.FloatType.get())))); Record record = GenericRecord.create(writeSchema); record.setField("id", 34L); @@ -222,61 +219,76 @@ public void testNestedStructProjection() throws Exception { location.setField("long", -1.539054f); record.setField("location", location); - Schema idOnly = new Schema( - Types.NestedField.required(0, "id", Types.LongType.get()) - ); + Schema idOnly = new Schema(Types.NestedField.required(0, "id", Types.LongType.get())); Record projected = writeAndRead("id_only", writeSchema, idOnly, record); Record projectedLocation = (Record) projected.getField("location"); - Assert.assertEquals("Should contain the correct id value", 34L, (long) projected.getField("id")); + Assert.assertEquals( + "Should contain the correct id value", 34L, (long) projected.getField("id")); Assert.assertNull("Should not project location", projectedLocation); - Schema latOnly = new Schema( - Types.NestedField.optional(3, "location", Types.StructType.of( - Types.NestedField.required(1, "lat", Types.FloatType.get()) - )) - ); + Schema latOnly = + new Schema( + Types.NestedField.optional( + 3, + "location", + Types.StructType.of(Types.NestedField.required(1, "lat", Types.FloatType.get())))); projected = writeAndRead("latitude_only", writeSchema, latOnly, record); projectedLocation = (Record) projected.getField("location"); Assert.assertNull("Should not project id", projected.getField("id")); Assert.assertNotNull("Should project location", projected.getField("location")); Assert.assertNull("Should not project longitude", projectedLocation.getField("long")); - Assert.assertEquals("Should project latitude", - 52.995143f, (float) projectedLocation.getField("lat"), 0.000001f); - - Schema longOnly = new Schema( - Types.NestedField.optional(3, "location", Types.StructType.of( - Types.NestedField.required(2, "long", 
Types.FloatType.get()) - )) - ); + Assert.assertEquals( + "Should project latitude", + 52.995143f, + (float) projectedLocation.getField("lat"), + 0.000001f); + + Schema longOnly = + new Schema( + Types.NestedField.optional( + 3, + "location", + Types.StructType.of(Types.NestedField.required(2, "long", Types.FloatType.get())))); projected = writeAndRead("longitude_only", writeSchema, longOnly, record); projectedLocation = (Record) projected.getField("location"); Assert.assertNull("Should not project id", projected.getField("id")); Assert.assertNotNull("Should project location", projected.getField("location")); Assert.assertNull("Should not project latitutde", projectedLocation.getField("lat")); - Assert.assertEquals("Should project longitude", - -1.539054f, (float) projectedLocation.getField("long"), 0.000001f); + Assert.assertEquals( + "Should project longitude", + -1.539054f, + (float) projectedLocation.getField("long"), + 0.000001f); Schema locationOnly = writeSchema.select("location"); projected = writeAndRead("location_only", writeSchema, locationOnly, record); projectedLocation = (Record) projected.getField("location"); Assert.assertNull("Should not project id", projected.getField("id")); Assert.assertNotNull("Should project location", projected.getField("location")); - Assert.assertEquals("Should project latitude", - 52.995143f, (float) projectedLocation.getField("lat"), 0.000001f); - Assert.assertEquals("Should project longitude", - -1.539054f, (float) projectedLocation.getField("long"), 0.000001f); + Assert.assertEquals( + "Should project latitude", + 52.995143f, + (float) projectedLocation.getField("lat"), + 0.000001f); + Assert.assertEquals( + "Should project longitude", + -1.539054f, + (float) projectedLocation.getField("long"), + 0.000001f); } @Test public void testMapProjection() throws IOException { - Schema writeSchema = new Schema( - Types.NestedField.required(0, "id", Types.LongType.get()), - Types.NestedField.optional(5, "properties", - Types.MapType.ofOptional(6, 7, Types.StringType.get(), Types.StringType.get())) - ); + Schema writeSchema = + new Schema( + Types.NestedField.required(0, "id", Types.LongType.get()), + Types.NestedField.optional( + 5, + "properties", + Types.MapType.ofOptional(6, 7, Types.StringType.get(), Types.StringType.get()))); Map properties = ImmutableMap.of("a", "A", "b", "B"); @@ -284,31 +296,36 @@ public void testMapProjection() throws IOException { record.setField("id", 34L); record.setField("properties", properties); - Schema idOnly = new Schema( - Types.NestedField.required(0, "id", Types.LongType.get()) - ); + Schema idOnly = new Schema(Types.NestedField.required(0, "id", Types.LongType.get())); Record projected = writeAndRead("id_only", writeSchema, idOnly, record); - Assert.assertEquals("Should contain the correct id value", 34L, (long) projected.getField("id")); + Assert.assertEquals( + "Should contain the correct id value", 34L, (long) projected.getField("id")); Assert.assertNull("Should not project properties map", projected.getField("properties")); Schema keyOnly = writeSchema.select("properties.key"); projected = writeAndRead("key_only", writeSchema, keyOnly, record); Assert.assertNull("Should not project id", projected.getField("id")); - Assert.assertEquals("Should project entire map", - properties, toStringMap((Map) projected.getField("properties"))); + Assert.assertEquals( + "Should project entire map", + properties, + toStringMap((Map) projected.getField("properties"))); Schema valueOnly = writeSchema.select("properties.value"); 
projected = writeAndRead("value_only", writeSchema, valueOnly, record); Assert.assertNull("Should not project id", projected.getField("id")); - Assert.assertEquals("Should project entire map", - properties, toStringMap((Map) projected.getField("properties"))); + Assert.assertEquals( + "Should project entire map", + properties, + toStringMap((Map) projected.getField("properties"))); Schema mapOnly = writeSchema.select("properties"); projected = writeAndRead("map_only", writeSchema, mapOnly, record); Assert.assertNull("Should not project id", projected.getField("id")); - Assert.assertEquals("Should project entire map", - properties, toStringMap((Map) projected.getField("properties"))); + Assert.assertEquals( + "Should project entire map", + properties, + toStringMap((Map) projected.getField("properties"))); } private Map toStringMap(Map map) { @@ -325,16 +342,19 @@ public void testMapProjection() throws IOException { @Test public void testMapOfStructsProjection() throws IOException { - Schema writeSchema = new Schema( - Types.NestedField.required(0, "id", Types.LongType.get()), - Types.NestedField.optional(5, "locations", Types.MapType.ofOptional(6, 7, - Types.StringType.get(), - Types.StructType.of( - Types.NestedField.required(1, "lat", Types.FloatType.get()), - Types.NestedField.required(2, "long", Types.FloatType.get()) - ) - )) - ); + Schema writeSchema = + new Schema( + Types.NestedField.required(0, "id", Types.LongType.get()), + Types.NestedField.optional( + 5, + "locations", + Types.MapType.ofOptional( + 6, + 7, + Types.StringType.get(), + Types.StructType.of( + Types.NestedField.required(1, "lat", Types.FloatType.get()), + Types.NestedField.required(2, "long", Types.FloatType.get()))))); Record record = GenericRecord.create(writeSchema); record.setField("id", 34L); @@ -346,91 +366,100 @@ public void testMapOfStructsProjection() throws IOException { l2.setField("long", -1.539054f); record.setField("locations", ImmutableMap.of("L1", l1, "L2", l2)); - Schema idOnly = new Schema( - Types.NestedField.required(0, "id", Types.LongType.get()) - ); + Schema idOnly = new Schema(Types.NestedField.required(0, "id", Types.LongType.get())); Record projected = writeAndRead("id_only", writeSchema, idOnly, record); - Assert.assertEquals("Should contain the correct id value", 34L, (long) projected.getField("id")); + Assert.assertEquals( + "Should contain the correct id value", 34L, (long) projected.getField("id")); Assert.assertNull("Should not project locations map", projected.getField("locations")); projected = writeAndRead("all_locations", writeSchema, writeSchema.select("locations"), record); Assert.assertNull("Should not project id", projected.getField("id")); - Assert.assertEquals("Should project locations map", - record.getField("locations"), toStringMap((Map) projected.getField("locations"))); + Assert.assertEquals( + "Should project locations map", + record.getField("locations"), + toStringMap((Map) projected.getField("locations"))); - projected = writeAndRead("lat_only", - writeSchema, writeSchema.select("locations.lat"), record); + projected = writeAndRead("lat_only", writeSchema, writeSchema.select("locations.lat"), record); Assert.assertNull("Should not project id", projected.getField("id")); Map locations = toStringMap((Map) projected.getField("locations")); Assert.assertNotNull("Should project locations map", locations); - Assert.assertEquals("Should contain L1 and L2", - Sets.newHashSet("L1", "L2"), locations.keySet()); + Assert.assertEquals( + "Should contain L1 and L2", 
Sets.newHashSet("L1", "L2"), locations.keySet()); Record projectedL1 = (Record) locations.get("L1"); Assert.assertNotNull("L1 should not be null", projectedL1); - Assert.assertEquals("L1 should contain lat", - 53.992811f, (float) projectedL1.getField("lat"), 0.000001); + Assert.assertEquals( + "L1 should contain lat", 53.992811f, (float) projectedL1.getField("lat"), 0.000001); Assert.assertNull("L1 should not contain long", projectedL1.getField("long")); Record projectedL2 = (Record) locations.get("L2"); Assert.assertNotNull("L2 should not be null", projectedL2); - Assert.assertEquals("L2 should contain lat", - 52.995143f, (float) projectedL2.getField("lat"), 0.000001); + Assert.assertEquals( + "L2 should contain lat", 52.995143f, (float) projectedL2.getField("lat"), 0.000001); Assert.assertNull("L2 should not contain long", projectedL2.getField("long")); - projected = writeAndRead("long_only", - writeSchema, writeSchema.select("locations.long"), record); + projected = + writeAndRead("long_only", writeSchema, writeSchema.select("locations.long"), record); Assert.assertNull("Should not project id", projected.getField("id")); locations = toStringMap((Map) projected.getField("locations")); Assert.assertNotNull("Should project locations map", locations); - Assert.assertEquals("Should contain L1 and L2", - Sets.newHashSet("L1", "L2"), locations.keySet()); + Assert.assertEquals( + "Should contain L1 and L2", Sets.newHashSet("L1", "L2"), locations.keySet()); projectedL1 = (Record) locations.get("L1"); Assert.assertNotNull("L1 should not be null", projectedL1); Assert.assertNull("L1 should not contain lat", projectedL1.getField("lat")); - Assert.assertEquals("L1 should contain long", - -1.542616f, (float) projectedL1.getField("long"), 0.000001); + Assert.assertEquals( + "L1 should contain long", -1.542616f, (float) projectedL1.getField("long"), 0.000001); projectedL2 = (Record) locations.get("L2"); Assert.assertNotNull("L2 should not be null", projectedL2); Assert.assertNull("L2 should not contain lat", projectedL2.getField("lat")); - Assert.assertEquals("L2 should contain long", - -1.539054f, (float) projectedL2.getField("long"), 0.000001); - - Schema latitiudeRenamed = new Schema( - Types.NestedField.optional(5, "locations", Types.MapType.ofOptional(6, 7, - Types.StringType.get(), - Types.StructType.of( - Types.NestedField.required(1, "latitude", Types.FloatType.get()) - ) - )) - ); + Assert.assertEquals( + "L2 should contain long", -1.539054f, (float) projectedL2.getField("long"), 0.000001); + + Schema latitiudeRenamed = + new Schema( + Types.NestedField.optional( + 5, + "locations", + Types.MapType.ofOptional( + 6, + 7, + Types.StringType.get(), + Types.StructType.of( + Types.NestedField.required(1, "latitude", Types.FloatType.get()))))); projected = writeAndRead("latitude_renamed", writeSchema, latitiudeRenamed, record); Assert.assertNull("Should not project id", projected.getField("id")); locations = toStringMap((Map) projected.getField("locations")); Assert.assertNotNull("Should project locations map", locations); - Assert.assertEquals("Should contain L1 and L2", - Sets.newHashSet("L1", "L2"), locations.keySet()); + Assert.assertEquals( + "Should contain L1 and L2", Sets.newHashSet("L1", "L2"), locations.keySet()); projectedL1 = (Record) locations.get("L1"); Assert.assertNotNull("L1 should not be null", projectedL1); - Assert.assertEquals("L1 should contain latitude", - 53.992811f, (float) projectedL1.getField("latitude"), 0.000001); + Assert.assertEquals( + "L1 should contain latitude", + 
53.992811f, + (float) projectedL1.getField("latitude"), + 0.000001); Assert.assertNull("L1 should not contain lat", projectedL1.getField("lat")); Assert.assertNull("L1 should not contain long", projectedL1.getField("long")); projectedL2 = (Record) locations.get("L2"); Assert.assertNotNull("L2 should not be null", projectedL2); - Assert.assertEquals("L2 should contain latitude", - 52.995143f, (float) projectedL2.getField("latitude"), 0.000001); + Assert.assertEquals( + "L2 should contain latitude", + 52.995143f, + (float) projectedL2.getField("latitude"), + 0.000001); Assert.assertNull("L2 should not contain lat", projectedL2.getField("lat")); Assert.assertNull("L2 should not contain long", projectedL2.getField("long")); } @Test public void testListProjection() throws IOException { - Schema writeSchema = new Schema( - Types.NestedField.required(0, "id", Types.LongType.get()), - Types.NestedField.optional(10, "values", - Types.ListType.ofOptional(11, Types.LongType.get())) - ); + Schema writeSchema = + new Schema( + Types.NestedField.required(0, "id", Types.LongType.get()), + Types.NestedField.optional( + 10, "values", Types.ListType.ofOptional(11, Types.LongType.get()))); List values = ImmutableList.of(56L, 57L, 58L); @@ -438,12 +467,11 @@ public void testListProjection() throws IOException { record.setField("id", 34L); record.setField("values", values); - Schema idOnly = new Schema( - Types.NestedField.required(0, "id", Types.LongType.get()) - ); + Schema idOnly = new Schema(Types.NestedField.required(0, "id", Types.LongType.get())); Record projected = writeAndRead("id_only", writeSchema, idOnly, record); - Assert.assertEquals("Should contain the correct id value", 34L, (long) projected.getField("id")); + Assert.assertEquals( + "Should contain the correct id value", 34L, (long) projected.getField("id")); Assert.assertNull("Should not project values list", projected.getField("values")); Schema elementOnly = writeSchema.select("values.element"); @@ -460,15 +488,17 @@ public void testListProjection() throws IOException { @Test @SuppressWarnings("unchecked") public void testListOfStructsProjection() throws IOException { - Schema writeSchema = new Schema( - Types.NestedField.required(0, "id", Types.LongType.get()), - Types.NestedField.optional(22, "points", - Types.ListType.ofOptional(21, Types.StructType.of( - Types.NestedField.required(19, "x", Types.IntegerType.get()), - Types.NestedField.optional(18, "y", Types.IntegerType.get()) - )) - ) - ); + Schema writeSchema = + new Schema( + Types.NestedField.required(0, "id", Types.LongType.get()), + Types.NestedField.optional( + 22, + "points", + Types.ListType.ofOptional( + 21, + Types.StructType.of( + Types.NestedField.required(19, "x", Types.IntegerType.get()), + Types.NestedField.optional(18, "y", Types.IntegerType.get()))))); Record record = GenericRecord.create(writeSchema); record.setField("id", 34L); @@ -480,18 +510,17 @@ public void testListOfStructsProjection() throws IOException { p2.setField("y", null); record.setField("points", ImmutableList.of(p1, p2)); - Schema idOnly = new Schema( - Types.NestedField.required(0, "id", Types.LongType.get()) - ); + Schema idOnly = new Schema(Types.NestedField.required(0, "id", Types.LongType.get())); Record projected = writeAndRead("id_only", writeSchema, idOnly, record); - Assert.assertEquals("Should contain the correct id value", 34L, (long) projected.getField("id")); + Assert.assertEquals( + "Should contain the correct id value", 34L, (long) projected.getField("id")); Assert.assertNull("Should not 
project points list", projected.getField("points")); projected = writeAndRead("all_points", writeSchema, writeSchema.select("points"), record); Assert.assertNull("Should not project id", projected.getField("id")); - Assert.assertEquals("Should project points list", - record.getField("points"), projected.getField("points")); + Assert.assertEquals( + "Should project points list", record.getField("points"), projected.getField("points")); projected = writeAndRead("x_only", writeSchema, writeSchema.select("points.x"), record); Assert.assertNull("Should not project id", projected.getField("id")); @@ -517,13 +546,15 @@ public void testListOfStructsProjection() throws IOException { Assert.assertNull("Should not project x", projectedP2.getField("x")); Assert.assertNull("Should project null y", projectedP2.getField("y")); - Schema yRenamed = new Schema( - Types.NestedField.optional(22, "points", - Types.ListType.ofOptional(21, Types.StructType.of( - Types.NestedField.optional(18, "z", Types.IntegerType.get()) - )) - ) - ); + Schema yRenamed = + new Schema( + Types.NestedField.optional( + 22, + "points", + Types.ListType.ofOptional( + 21, + Types.StructType.of( + Types.NestedField.optional(18, "z", Types.IntegerType.get()))))); projected = writeAndRead("y_renamed", writeSchema, yRenamed, record); Assert.assertNull("Should not project id", projected.getField("id")); @@ -539,15 +570,17 @@ public void testListOfStructsProjection() throws IOException { Assert.assertNull("Should not project y", projectedP2.getField("y")); Assert.assertNull("Should project null z", projectedP2.getField("z")); - Schema zAdded = new Schema( - Types.NestedField.optional(22, "points", - Types.ListType.ofOptional(21, Types.StructType.of( - Types.NestedField.required(19, "x", Types.IntegerType.get()), - Types.NestedField.optional(18, "y", Types.IntegerType.get()), - Types.NestedField.optional(20, "z", Types.IntegerType.get()) - )) - ) - ); + Schema zAdded = + new Schema( + Types.NestedField.optional( + 22, + "points", + Types.ListType.ofOptional( + 21, + Types.StructType.of( + Types.NestedField.required(19, "x", Types.IntegerType.get()), + Types.NestedField.optional(18, "y", Types.IntegerType.get()), + Types.NestedField.optional(20, "z", Types.IntegerType.get()))))); projected = writeAndRead("z_added", writeSchema, zAdded, record); Assert.assertNull("Should not project id", projected.getField("id")); @@ -565,10 +598,10 @@ public void testListOfStructsProjection() throws IOException { } private static org.apache.avro.Schema fromOption(org.apache.avro.Schema schema) { - Preconditions.checkArgument(schema.getType() == UNION, - "Expected union schema but was passed: %s", schema); - Preconditions.checkArgument(schema.getTypes().size() == 2, - "Expected optional schema, but was passed: %s", schema); + Preconditions.checkArgument( + schema.getType() == UNION, "Expected union schema but was passed: %s", schema); + Preconditions.checkArgument( + schema.getTypes().size() == 2, "Expected optional schema, but was passed: %s", schema); if (schema.getTypes().get(0).getType() == org.apache.avro.Schema.Type.NULL) { return schema.getTypes().get(1); } else { diff --git a/spark/v3.3/spark/src/test/java/org/apache/iceberg/spark/source/TestRequiredDistributionAndOrdering.java b/spark/v3.3/spark/src/test/java/org/apache/iceberg/spark/source/TestRequiredDistributionAndOrdering.java index 2c64bc18c35c..8e54a23f815a 100644 --- a/spark/v3.3/spark/src/test/java/org/apache/iceberg/spark/source/TestRequiredDistributionAndOrdering.java +++ 
b/spark/v3.3/spark/src/test/java/org/apache/iceberg/spark/source/TestRequiredDistributionAndOrdering.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.source; import java.util.List; @@ -36,7 +35,8 @@ public class TestRequiredDistributionAndOrdering extends SparkCatalogTestBase { - public TestRequiredDistributionAndOrdering(String catalogName, String implementation, Map config) { + public TestRequiredDistributionAndOrdering( + String catalogName, String implementation, Map config) { super(catalogName, implementation, config); } @@ -47,122 +47,130 @@ public void dropTestTable() { @Test public void testDefaultLocalSort() throws NoSuchTableException { - sql("CREATE TABLE %s (c1 INT, c2 STRING, c3 STRING) " + - "USING iceberg " + - "PARTITIONED BY (c3)", tableName); - - List data = ImmutableList.of( - new ThreeColumnRecord(1, null, "A"), - new ThreeColumnRecord(2, "BBBBBBBBBB", "B"), - new ThreeColumnRecord(3, "BBBBBBBBBB", "A"), - new ThreeColumnRecord(4, "BBBBBBBBBB", "B"), - new ThreeColumnRecord(5, "BBBBBBBBBB", "A"), - new ThreeColumnRecord(6, "BBBBBBBBBB", "B"), - new ThreeColumnRecord(7, "BBBBBBBBBB", "A") - ); + sql( + "CREATE TABLE %s (c1 INT, c2 STRING, c3 STRING) " + + "USING iceberg " + + "PARTITIONED BY (c3)", + tableName); + + List data = + ImmutableList.of( + new ThreeColumnRecord(1, null, "A"), + new ThreeColumnRecord(2, "BBBBBBBBBB", "B"), + new ThreeColumnRecord(3, "BBBBBBBBBB", "A"), + new ThreeColumnRecord(4, "BBBBBBBBBB", "B"), + new ThreeColumnRecord(5, "BBBBBBBBBB", "A"), + new ThreeColumnRecord(6, "BBBBBBBBBB", "B"), + new ThreeColumnRecord(7, "BBBBBBBBBB", "A")); Dataset ds = spark.createDataFrame(data, ThreeColumnRecord.class); Dataset inputDF = ds.coalesce(1).sortWithinPartitions("c1"); // should insert a local sort by partition columns by default inputDF.writeTo(tableName).append(); - assertEquals("Row count must match", + assertEquals( + "Row count must match", ImmutableList.of(row(7L)), sql("SELECT count(*) FROM %s", tableName)); } @Test public void testPartitionColumnsArePrependedForRangeDistribution() throws NoSuchTableException { - sql("CREATE TABLE %s (c1 INT, c2 STRING, c3 STRING) " + - "USING iceberg " + - "PARTITIONED BY (c3)", tableName); - - List data = ImmutableList.of( - new ThreeColumnRecord(1, null, "A"), - new ThreeColumnRecord(2, "BBBBBBBBBB", "B"), - new ThreeColumnRecord(3, "BBBBBBBBBB", "A"), - new ThreeColumnRecord(4, "BBBBBBBBBB", "B"), - new ThreeColumnRecord(5, "BBBBBBBBBB", "A"), - new ThreeColumnRecord(6, "BBBBBBBBBB", "B"), - new ThreeColumnRecord(7, "BBBBBBBBBB", "A") - ); + sql( + "CREATE TABLE %s (c1 INT, c2 STRING, c3 STRING) " + + "USING iceberg " + + "PARTITIONED BY (c3)", + tableName); + + List data = + ImmutableList.of( + new ThreeColumnRecord(1, null, "A"), + new ThreeColumnRecord(2, "BBBBBBBBBB", "B"), + new ThreeColumnRecord(3, "BBBBBBBBBB", "A"), + new ThreeColumnRecord(4, "BBBBBBBBBB", "B"), + new ThreeColumnRecord(5, "BBBBBBBBBB", "A"), + new ThreeColumnRecord(6, "BBBBBBBBBB", "B"), + new ThreeColumnRecord(7, "BBBBBBBBBB", "A")); Dataset ds = spark.createDataFrame(data, ThreeColumnRecord.class); Dataset inputDF = ds.coalesce(1).sortWithinPartitions("c1"); Table table = validationCatalog.loadTable(tableIdent); // should automatically prepend partition columns to the ordering - table.updateProperties() + table + .updateProperties() .set(TableProperties.WRITE_DISTRIBUTION_MODE, TableProperties.WRITE_DISTRIBUTION_MODE_RANGE) .commit(); - 
table.replaceSortOrder() - .asc("c1") - .asc("c2") - .commit(); + table.replaceSortOrder().asc("c1").asc("c2").commit(); inputDF.writeTo(tableName).append(); - assertEquals("Row count must match", + assertEquals( + "Row count must match", ImmutableList.of(row(7L)), sql("SELECT count(*) FROM %s", tableName)); } @Test public void testSortOrderIncludesPartitionColumns() throws NoSuchTableException { - sql("CREATE TABLE %s (c1 INT, c2 STRING, c3 STRING) " + - "USING iceberg " + - "PARTITIONED BY (c3)", tableName); - - List data = ImmutableList.of( - new ThreeColumnRecord(1, null, "A"), - new ThreeColumnRecord(2, "BBBBBBBBBB", "B"), - new ThreeColumnRecord(3, "BBBBBBBBBB", "A"), - new ThreeColumnRecord(4, "BBBBBBBBBB", "B"), - new ThreeColumnRecord(5, "BBBBBBBBBB", "A"), - new ThreeColumnRecord(6, "BBBBBBBBBB", "B"), - new ThreeColumnRecord(7, "BBBBBBBBBB", "A") - ); + sql( + "CREATE TABLE %s (c1 INT, c2 STRING, c3 STRING) " + + "USING iceberg " + + "PARTITIONED BY (c3)", + tableName); + + List data = + ImmutableList.of( + new ThreeColumnRecord(1, null, "A"), + new ThreeColumnRecord(2, "BBBBBBBBBB", "B"), + new ThreeColumnRecord(3, "BBBBBBBBBB", "A"), + new ThreeColumnRecord(4, "BBBBBBBBBB", "B"), + new ThreeColumnRecord(5, "BBBBBBBBBB", "A"), + new ThreeColumnRecord(6, "BBBBBBBBBB", "B"), + new ThreeColumnRecord(7, "BBBBBBBBBB", "A")); Dataset ds = spark.createDataFrame(data, ThreeColumnRecord.class); Dataset inputDF = ds.coalesce(1).sortWithinPartitions("c1"); Table table = validationCatalog.loadTable(tableIdent); // should succeed with a correct sort order - table.replaceSortOrder() - .asc("c3") - .asc("c1") - .asc("c2") - .commit(); + table.replaceSortOrder().asc("c3").asc("c1").asc("c2").commit(); inputDF.writeTo(tableName).append(); - assertEquals("Row count must match", + assertEquals( + "Row count must match", ImmutableList.of(row(7L)), sql("SELECT count(*) FROM %s", tableName)); } @Test public void testDisabledDistributionAndOrdering() { - sql("CREATE TABLE %s (c1 INT, c2 STRING, c3 STRING) " + - "USING iceberg " + - "PARTITIONED BY (c3)", tableName); - - List data = ImmutableList.of( - new ThreeColumnRecord(1, null, "A"), - new ThreeColumnRecord(2, "BBBBBBBBBB", "B"), - new ThreeColumnRecord(3, "BBBBBBBBBB", "A"), - new ThreeColumnRecord(4, "BBBBBBBBBB", "B"), - new ThreeColumnRecord(5, "BBBBBBBBBB", "A"), - new ThreeColumnRecord(6, "BBBBBBBBBB", "B"), - new ThreeColumnRecord(7, "BBBBBBBBBB", "A") - ); + sql( + "CREATE TABLE %s (c1 INT, c2 STRING, c3 STRING) " + + "USING iceberg " + + "PARTITIONED BY (c3)", + tableName); + + List data = + ImmutableList.of( + new ThreeColumnRecord(1, null, "A"), + new ThreeColumnRecord(2, "BBBBBBBBBB", "B"), + new ThreeColumnRecord(3, "BBBBBBBBBB", "A"), + new ThreeColumnRecord(4, "BBBBBBBBBB", "B"), + new ThreeColumnRecord(5, "BBBBBBBBBB", "A"), + new ThreeColumnRecord(6, "BBBBBBBBBB", "B"), + new ThreeColumnRecord(7, "BBBBBBBBBB", "A")); Dataset ds = spark.createDataFrame(data, ThreeColumnRecord.class); Dataset inputDF = ds.coalesce(1).sortWithinPartitions("c1"); // should fail if ordering is disabled - AssertHelpers.assertThrows("Should reject writes without ordering", - SparkException.class, "Writing job aborted", + AssertHelpers.assertThrows( + "Should reject writes without ordering", + SparkException.class, + "Writing job aborted", () -> { try { - inputDF.writeTo(tableName) + inputDF + .writeTo(tableName) .option(SparkWriteOptions.USE_TABLE_DISTRIBUTION_AND_ORDERING, "false") .append(); } catch (NoSuchTableException e) { @@ -173,57 +181,62 @@ 
public void testDisabledDistributionAndOrdering() { @Test public void testHashDistribution() throws NoSuchTableException { - sql("CREATE TABLE %s (c1 INT, c2 STRING, c3 STRING) " + - "USING iceberg " + - "PARTITIONED BY (c3)", tableName); - - List data = ImmutableList.of( - new ThreeColumnRecord(1, null, "A"), - new ThreeColumnRecord(2, "BBBBBBBBBB", "B"), - new ThreeColumnRecord(3, "BBBBBBBBBB", "A"), - new ThreeColumnRecord(4, "BBBBBBBBBB", "B"), - new ThreeColumnRecord(5, "BBBBBBBBBB", "A"), - new ThreeColumnRecord(6, "BBBBBBBBBB", "B"), - new ThreeColumnRecord(7, "BBBBBBBBBB", "A") - ); + sql( + "CREATE TABLE %s (c1 INT, c2 STRING, c3 STRING) " + + "USING iceberg " + + "PARTITIONED BY (c3)", + tableName); + + List data = + ImmutableList.of( + new ThreeColumnRecord(1, null, "A"), + new ThreeColumnRecord(2, "BBBBBBBBBB", "B"), + new ThreeColumnRecord(3, "BBBBBBBBBB", "A"), + new ThreeColumnRecord(4, "BBBBBBBBBB", "B"), + new ThreeColumnRecord(5, "BBBBBBBBBB", "A"), + new ThreeColumnRecord(6, "BBBBBBBBBB", "B"), + new ThreeColumnRecord(7, "BBBBBBBBBB", "A")); Dataset ds = spark.createDataFrame(data, ThreeColumnRecord.class); Dataset inputDF = ds.coalesce(1).sortWithinPartitions("c1"); Table table = validationCatalog.loadTable(tableIdent); // should automatically prepend partition columns to the local ordering after hash distribution - table.updateProperties() + table + .updateProperties() .set(TableProperties.WRITE_DISTRIBUTION_MODE, TableProperties.WRITE_DISTRIBUTION_MODE_HASH) .commit(); - table.replaceSortOrder() - .asc("c1") - .asc("c2") - .commit(); + table.replaceSortOrder().asc("c1").asc("c2").commit(); inputDF.writeTo(tableName).append(); - assertEquals("Row count must match", + assertEquals( + "Row count must match", ImmutableList.of(row(7L)), sql("SELECT count(*) FROM %s", tableName)); } @Test public void testNoSortBucketTransformsWithoutExtensions() throws NoSuchTableException { - sql("CREATE TABLE %s (c1 INT, c2 STRING, c3 STRING) " + - "USING iceberg " + - "PARTITIONED BY (bucket(2, c1))", tableName); - - List data = ImmutableList.of( - new ThreeColumnRecord(1, null, "A"), - new ThreeColumnRecord(2, "BBBB", "B"), - new ThreeColumnRecord(3, "BBBB", "B"), - new ThreeColumnRecord(4, "BBBB", "B") - ); + sql( + "CREATE TABLE %s (c1 INT, c2 STRING, c3 STRING) " + + "USING iceberg " + + "PARTITIONED BY (bucket(2, c1))", + tableName); + + List data = + ImmutableList.of( + new ThreeColumnRecord(1, null, "A"), + new ThreeColumnRecord(2, "BBBB", "B"), + new ThreeColumnRecord(3, "BBBB", "B"), + new ThreeColumnRecord(4, "BBBB", "B")); Dataset ds = spark.createDataFrame(data, ThreeColumnRecord.class); Dataset inputDF = ds.coalesce(1).sortWithinPartitions("c1"); // should fail by default as extensions are disabled - AssertHelpers.assertThrows("Should reject writes without ordering", - SparkException.class, "Writing job aborted", + AssertHelpers.assertThrows( + "Should reject writes without ordering", + SparkException.class, + "Writing job aborted", () -> { try { inputDF.writeTo(tableName).append(); @@ -232,84 +245,83 @@ public void testNoSortBucketTransformsWithoutExtensions() throws NoSuchTableExce } }); - inputDF.writeTo(tableName) - .option(SparkWriteOptions.FANOUT_ENABLED, "true") - .append(); + inputDF.writeTo(tableName).option(SparkWriteOptions.FANOUT_ENABLED, "true").append(); - List expected = ImmutableList.of( - row(1, null, "A"), - row(2, "BBBB", "B"), - row(3, "BBBB", "B"), - row(4, "BBBB", "B") - ); + List expected = + ImmutableList.of( + row(1, null, "A"), row(2, "BBBB", "B"), 
row(3, "BBBB", "B"), row(4, "BBBB", "B")); assertEquals("Rows must match", expected, sql("SELECT * FROM %s ORDER BY c1", tableName)); } @Test public void testRangeDistributionWithQuotedColumnsNames() throws NoSuchTableException { - sql("CREATE TABLE %s (c1 INT, c2 STRING, `c.3` STRING) " + - "USING iceberg " + - "PARTITIONED BY (`c.3`)", tableName); - - List data = ImmutableList.of( - new ThreeColumnRecord(1, null, "A"), - new ThreeColumnRecord(2, "BBBBBBBBBB", "B"), - new ThreeColumnRecord(3, "BBBBBBBBBB", "A"), - new ThreeColumnRecord(4, "BBBBBBBBBB", "B"), - new ThreeColumnRecord(5, "BBBBBBBBBB", "A"), - new ThreeColumnRecord(6, "BBBBBBBBBB", "B"), - new ThreeColumnRecord(7, "BBBBBBBBBB", "A") - ); + sql( + "CREATE TABLE %s (c1 INT, c2 STRING, `c.3` STRING) " + + "USING iceberg " + + "PARTITIONED BY (`c.3`)", + tableName); + + List data = + ImmutableList.of( + new ThreeColumnRecord(1, null, "A"), + new ThreeColumnRecord(2, "BBBBBBBBBB", "B"), + new ThreeColumnRecord(3, "BBBBBBBBBB", "A"), + new ThreeColumnRecord(4, "BBBBBBBBBB", "B"), + new ThreeColumnRecord(5, "BBBBBBBBBB", "A"), + new ThreeColumnRecord(6, "BBBBBBBBBB", "B"), + new ThreeColumnRecord(7, "BBBBBBBBBB", "A")); Dataset ds = spark.createDataFrame(data, ThreeColumnRecord.class); - Dataset inputDF = ds.selectExpr("c1", "c2", "c3 as `c.3`").coalesce(1).sortWithinPartitions("c1"); + Dataset inputDF = + ds.selectExpr("c1", "c2", "c3 as `c.3`").coalesce(1).sortWithinPartitions("c1"); Table table = validationCatalog.loadTable(tableIdent); - table.updateProperties() + table + .updateProperties() .set(TableProperties.WRITE_DISTRIBUTION_MODE, TableProperties.WRITE_DISTRIBUTION_MODE_RANGE) .commit(); - table.replaceSortOrder() - .asc("c1") - .asc("c2") - .commit(); + table.replaceSortOrder().asc("c1").asc("c2").commit(); inputDF.writeTo(tableName).append(); - assertEquals("Row count must match", + assertEquals( + "Row count must match", ImmutableList.of(row(7L)), sql("SELECT count(*) FROM %s", tableName)); } @Test public void testHashDistributionWithQuotedColumnsNames() throws NoSuchTableException { - sql("CREATE TABLE %s (c1 INT, c2 STRING, `c``3` STRING) " + - "USING iceberg " + - "PARTITIONED BY (`c``3`)", tableName); - - List data = ImmutableList.of( - new ThreeColumnRecord(1, null, "A"), - new ThreeColumnRecord(2, "BBBBBBBBBB", "B"), - new ThreeColumnRecord(3, "BBBBBBBBBB", "A"), - new ThreeColumnRecord(4, "BBBBBBBBBB", "B"), - new ThreeColumnRecord(5, "BBBBBBBBBB", "A"), - new ThreeColumnRecord(6, "BBBBBBBBBB", "B"), - new ThreeColumnRecord(7, "BBBBBBBBBB", "A") - ); + sql( + "CREATE TABLE %s (c1 INT, c2 STRING, `c``3` STRING) " + + "USING iceberg " + + "PARTITIONED BY (`c``3`)", + tableName); + + List data = + ImmutableList.of( + new ThreeColumnRecord(1, null, "A"), + new ThreeColumnRecord(2, "BBBBBBBBBB", "B"), + new ThreeColumnRecord(3, "BBBBBBBBBB", "A"), + new ThreeColumnRecord(4, "BBBBBBBBBB", "B"), + new ThreeColumnRecord(5, "BBBBBBBBBB", "A"), + new ThreeColumnRecord(6, "BBBBBBBBBB", "B"), + new ThreeColumnRecord(7, "BBBBBBBBBB", "A")); Dataset ds = spark.createDataFrame(data, ThreeColumnRecord.class); - Dataset inputDF = ds.selectExpr("c1", "c2", "c3 as `c``3`").coalesce(1).sortWithinPartitions("c1"); + Dataset inputDF = + ds.selectExpr("c1", "c2", "c3 as `c``3`").coalesce(1).sortWithinPartitions("c1"); Table table = validationCatalog.loadTable(tableIdent); - table.updateProperties() + table + .updateProperties() .set(TableProperties.WRITE_DISTRIBUTION_MODE, TableProperties.WRITE_DISTRIBUTION_MODE_HASH) .commit(); - 
table.replaceSortOrder() - .asc("c1") - .asc("c2") - .commit(); + table.replaceSortOrder().asc("c1").asc("c2").commit(); inputDF.writeTo(tableName).append(); - assertEquals("Row count must match", + assertEquals( + "Row count must match", ImmutableList.of(row(7L)), sql("SELECT count(*) FROM %s", tableName)); } diff --git a/spark/v3.3/spark/src/test/java/org/apache/iceberg/spark/source/TestRuntimeFiltering.java b/spark/v3.3/spark/src/test/java/org/apache/iceberg/spark/source/TestRuntimeFiltering.java index c5127d0c636d..beaf7b75c6c0 100644 --- a/spark/v3.3/spark/src/test/java/org/apache/iceberg/spark/source/TestRuntimeFiltering.java +++ b/spark/v3.3/spark/src/test/java/org/apache/iceberg/spark/source/TestRuntimeFiltering.java @@ -16,9 +16,11 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.source; +import static org.apache.spark.sql.functions.date_add; +import static org.apache.spark.sql.functions.expr; + import java.io.IOException; import java.io.UncheckedIOException; import java.util.List; @@ -40,9 +42,6 @@ import org.junit.Assert; import org.junit.Test; -import static org.apache.spark.sql.functions.date_add; -import static org.apache.spark.sql.functions.expr; - public class TestRuntimeFiltering extends SparkTestBaseWithCatalog { @After @@ -53,245 +52,286 @@ public void removeTables() { @Test public void testIdentityPartitionedTable() throws NoSuchTableException { - sql("CREATE TABLE %s (id BIGINT, data STRING, date DATE, ts TIMESTAMP) " + - "USING iceberg " + - "PARTITIONED BY (date)", tableName); + sql( + "CREATE TABLE %s (id BIGINT, data STRING, date DATE, ts TIMESTAMP) " + + "USING iceberg " + + "PARTITIONED BY (date)", + tableName); - Dataset df = spark.range(1, 100) - .withColumn("date", date_add(expr("DATE '1970-01-01'"), expr("CAST(id % 4 AS INT)"))) - .withColumn("ts", expr("TO_TIMESTAMP(date)")) - .withColumn("data", expr("CAST(date AS STRING)")) - .select("id", "data", "date", "ts"); + Dataset df = + spark + .range(1, 100) + .withColumn("date", date_add(expr("DATE '1970-01-01'"), expr("CAST(id % 4 AS INT)"))) + .withColumn("ts", expr("TO_TIMESTAMP(date)")) + .withColumn("data", expr("CAST(date AS STRING)")) + .select("id", "data", "date", "ts"); df.coalesce(1).writeTo(tableName).option(SparkWriteOptions.FANOUT_ENABLED, "true").append(); sql("CREATE TABLE dim (id BIGINT, date DATE) USING parquet"); - Dataset dimDF = spark.range(1, 10) - .withColumn("date", expr("DATE '1970-01-02'")) - .select("id", "date"); + Dataset dimDF = + spark.range(1, 10).withColumn("date", expr("DATE '1970-01-02'")).select("id", "date"); dimDF.coalesce(1).write().mode("append").insertInto("dim"); - String query = String.format( - "SELECT f.* FROM %s f JOIN dim d ON f.date = d.date AND d.id = 1 ORDER BY id", - tableName); + String query = + String.format( + "SELECT f.* FROM %s f JOIN dim d ON f.date = d.date AND d.id = 1 ORDER BY id", + tableName); assertQueryContainsRuntimeFilter(query); deleteNotMatchingFiles(Expressions.equal("date", 1), 3); - assertEquals("Should have expected rows", + assertEquals( + "Should have expected rows", sql("SELECT * FROM %s WHERE date = DATE '1970-01-02' ORDER BY id", tableName), sql(query)); } @Test public void testBucketedTable() throws NoSuchTableException { - sql("CREATE TABLE %s (id BIGINT, data STRING, date DATE, ts TIMESTAMP) " + - "USING iceberg " + - "PARTITIONED BY (bucket(8, id))", tableName); + sql( + "CREATE TABLE %s (id BIGINT, data STRING, date DATE, ts TIMESTAMP) " + + "USING iceberg " + 
+ "PARTITIONED BY (bucket(8, id))", + tableName); - Dataset df = spark.range(1, 100) - .withColumn("date", date_add(expr("DATE '1970-01-01'"), expr("CAST(id % 4 AS INT)"))) - .withColumn("ts", expr("TO_TIMESTAMP(date)")) - .withColumn("data", expr("CAST(date AS STRING)")) - .select("id", "data", "date", "ts"); + Dataset df = + spark + .range(1, 100) + .withColumn("date", date_add(expr("DATE '1970-01-01'"), expr("CAST(id % 4 AS INT)"))) + .withColumn("ts", expr("TO_TIMESTAMP(date)")) + .withColumn("data", expr("CAST(date AS STRING)")) + .select("id", "data", "date", "ts"); df.coalesce(1).writeTo(tableName).option(SparkWriteOptions.FANOUT_ENABLED, "true").append(); sql("CREATE TABLE dim (id BIGINT, date DATE) USING parquet"); - Dataset dimDF = spark.range(1, 2) - .withColumn("date", expr("DATE '1970-01-02'")) - .select("id", "date"); + Dataset dimDF = + spark.range(1, 2).withColumn("date", expr("DATE '1970-01-02'")).select("id", "date"); dimDF.coalesce(1).write().mode("append").insertInto("dim"); - String query = String.format( - "SELECT f.* FROM %s f JOIN dim d ON f.id = d.id AND d.date = DATE '1970-01-02' ORDER BY date", - tableName); + String query = + String.format( + "SELECT f.* FROM %s f JOIN dim d ON f.id = d.id AND d.date = DATE '1970-01-02' ORDER BY date", + tableName); assertQueryContainsRuntimeFilter(query); deleteNotMatchingFiles(Expressions.equal("id", 1), 7); - assertEquals("Should have expected rows", + assertEquals( + "Should have expected rows", sql("SELECT * FROM %s WHERE id = 1 ORDER BY date", tableName), sql(query)); } @Test public void testRenamedSourceColumnTable() throws NoSuchTableException { - sql("CREATE TABLE %s (id BIGINT, data STRING, date DATE, ts TIMESTAMP) " + - "USING iceberg " + - "PARTITIONED BY (bucket(8, id))", tableName); + sql( + "CREATE TABLE %s (id BIGINT, data STRING, date DATE, ts TIMESTAMP) " + + "USING iceberg " + + "PARTITIONED BY (bucket(8, id))", + tableName); - Dataset df = spark.range(1, 100) - .withColumn("date", date_add(expr("DATE '1970-01-01'"), expr("CAST(id % 4 AS INT)"))) - .withColumn("ts", expr("TO_TIMESTAMP(date)")) - .withColumn("data", expr("CAST(date AS STRING)")) - .select("id", "data", "date", "ts"); + Dataset df = + spark + .range(1, 100) + .withColumn("date", date_add(expr("DATE '1970-01-01'"), expr("CAST(id % 4 AS INT)"))) + .withColumn("ts", expr("TO_TIMESTAMP(date)")) + .withColumn("data", expr("CAST(date AS STRING)")) + .select("id", "data", "date", "ts"); df.coalesce(1).writeTo(tableName).option(SparkWriteOptions.FANOUT_ENABLED, "true").append(); sql("CREATE TABLE dim (id BIGINT, date DATE) USING parquet"); - Dataset dimDF = spark.range(1, 2) - .withColumn("date", expr("DATE '1970-01-02'")) - .select("id", "date"); + Dataset dimDF = + spark.range(1, 2).withColumn("date", expr("DATE '1970-01-02'")).select("id", "date"); dimDF.coalesce(1).write().mode("append").insertInto("dim"); sql("ALTER TABLE %s RENAME COLUMN id TO row_id", tableName); - String query = String.format( - "SELECT f.* FROM %s f JOIN dim d ON f.row_id = d.id AND d.date = DATE '1970-01-02' ORDER BY date", - tableName); + String query = + String.format( + "SELECT f.* FROM %s f JOIN dim d ON f.row_id = d.id AND d.date = DATE '1970-01-02' ORDER BY date", + tableName); assertQueryContainsRuntimeFilter(query); deleteNotMatchingFiles(Expressions.equal("row_id", 1), 7); - assertEquals("Should have expected rows", + assertEquals( + "Should have expected rows", sql("SELECT * FROM %s WHERE row_id = 1 ORDER BY date", tableName), sql(query)); } @Test public void 
testMultipleRuntimeFilters() throws NoSuchTableException { - sql("CREATE TABLE %s (id BIGINT, data STRING, date DATE, ts TIMESTAMP) " + - "USING iceberg " + - "PARTITIONED BY (data, bucket(8, id))", tableName); + sql( + "CREATE TABLE %s (id BIGINT, data STRING, date DATE, ts TIMESTAMP) " + + "USING iceberg " + + "PARTITIONED BY (data, bucket(8, id))", + tableName); - Dataset df = spark.range(1, 100) - .withColumn("date", date_add(expr("DATE '1970-01-01'"), expr("CAST(id % 4 AS INT)"))) - .withColumn("ts", expr("TO_TIMESTAMP(date)")) - .withColumn("data", expr("CAST(date AS STRING)")) - .select("id", "data", "date", "ts"); + Dataset df = + spark + .range(1, 100) + .withColumn("date", date_add(expr("DATE '1970-01-01'"), expr("CAST(id % 4 AS INT)"))) + .withColumn("ts", expr("TO_TIMESTAMP(date)")) + .withColumn("data", expr("CAST(date AS STRING)")) + .select("id", "data", "date", "ts"); df.coalesce(1).writeTo(tableName).option(SparkWriteOptions.FANOUT_ENABLED, "true").append(); sql("CREATE TABLE dim (id BIGINT, date DATE, data STRING) USING parquet"); - Dataset dimDF = spark.range(1, 2) - .withColumn("date", expr("DATE '1970-01-02'")) - .withColumn("data", expr("'1970-01-02'")) - .select("id", "date", "data"); + Dataset dimDF = + spark + .range(1, 2) + .withColumn("date", expr("DATE '1970-01-02'")) + .withColumn("data", expr("'1970-01-02'")) + .select("id", "date", "data"); dimDF.coalesce(1).write().mode("append").insertInto("dim"); - String query = String.format( - "SELECT f.* FROM %s f JOIN dim d ON f.id = d.id AND f.data = d.data AND d.date = DATE '1970-01-02'", - tableName); + String query = + String.format( + "SELECT f.* FROM %s f JOIN dim d ON f.id = d.id AND f.data = d.data AND d.date = DATE '1970-01-02'", + tableName); assertQueryContainsRuntimeFilters(query, 2, "Query should have 2 runtime filters"); deleteNotMatchingFiles(Expressions.equal("id", 1), 31); - assertEquals("Should have expected rows", + assertEquals( + "Should have expected rows", sql("SELECT * FROM %s WHERE id = 1 AND data = '1970-01-02'", tableName), sql(query)); } @Test public void testCaseSensitivityOfRuntimeFilters() throws NoSuchTableException { - sql("CREATE TABLE %s (id BIGINT, data STRING, date DATE, ts TIMESTAMP) " + - "USING iceberg " + - "PARTITIONED BY (data, bucket(8, id))", tableName); + sql( + "CREATE TABLE %s (id BIGINT, data STRING, date DATE, ts TIMESTAMP) " + + "USING iceberg " + + "PARTITIONED BY (data, bucket(8, id))", + tableName); - Dataset df = spark.range(1, 100) - .withColumn("date", date_add(expr("DATE '1970-01-01'"), expr("CAST(id % 4 AS INT)"))) - .withColumn("ts", expr("TO_TIMESTAMP(date)")) - .withColumn("data", expr("CAST(date AS STRING)")) - .select("id", "data", "date", "ts"); + Dataset df = + spark + .range(1, 100) + .withColumn("date", date_add(expr("DATE '1970-01-01'"), expr("CAST(id % 4 AS INT)"))) + .withColumn("ts", expr("TO_TIMESTAMP(date)")) + .withColumn("data", expr("CAST(date AS STRING)")) + .select("id", "data", "date", "ts"); df.coalesce(1).writeTo(tableName).option(SparkWriteOptions.FANOUT_ENABLED, "true").append(); sql("CREATE TABLE dim (id BIGINT, date DATE, data STRING) USING parquet"); - Dataset dimDF = spark.range(1, 2) - .withColumn("date", expr("DATE '1970-01-02'")) - .withColumn("data", expr("'1970-01-02'")) - .select("id", "date", "data"); + Dataset dimDF = + spark + .range(1, 2) + .withColumn("date", expr("DATE '1970-01-02'")) + .withColumn("data", expr("'1970-01-02'")) + .select("id", "date", "data"); dimDF.coalesce(1).write().mode("append").insertInto("dim"); 
- String caseInsensitiveQuery = String.format( - "select f.* from %s F join dim d ON f.Id = d.iD and f.DaTa = d.dAtA and d.dAtE = date '1970-01-02'", - tableName); + String caseInsensitiveQuery = + String.format( + "select f.* from %s F join dim d ON f.Id = d.iD and f.DaTa = d.dAtA and d.dAtE = date '1970-01-02'", + tableName); - assertQueryContainsRuntimeFilters(caseInsensitiveQuery, 2, "Query should have 2 runtime filters"); + assertQueryContainsRuntimeFilters( + caseInsensitiveQuery, 2, "Query should have 2 runtime filters"); deleteNotMatchingFiles(Expressions.equal("id", 1), 31); - assertEquals("Should have expected rows", + assertEquals( + "Should have expected rows", sql("SELECT * FROM %s WHERE id = 1 AND data = '1970-01-02'", tableName), sql(caseInsensitiveQuery)); } @Test public void testBucketedTableWithMultipleSpecs() throws NoSuchTableException { - sql("CREATE TABLE %s (id BIGINT, data STRING, date DATE, ts TIMESTAMP) USING iceberg", tableName); + sql( + "CREATE TABLE %s (id BIGINT, data STRING, date DATE, ts TIMESTAMP) USING iceberg", + tableName); - Dataset df1 = spark.range(1, 100) - .withColumn("date", date_add(expr("DATE '1970-01-01'"), expr("CAST(id % 2 AS INT)"))) - .withColumn("ts", expr("TO_TIMESTAMP(date)")) - .withColumn("data", expr("CAST(date AS STRING)")) - .select("id", "data", "date", "ts"); + Dataset df1 = + spark + .range(1, 100) + .withColumn("date", date_add(expr("DATE '1970-01-01'"), expr("CAST(id % 2 AS INT)"))) + .withColumn("ts", expr("TO_TIMESTAMP(date)")) + .withColumn("data", expr("CAST(date AS STRING)")) + .select("id", "data", "date", "ts"); df1.coalesce(1).writeTo(tableName).append(); Table table = validationCatalog.loadTable(tableIdent); - table.updateSpec() - .addField(Expressions.bucket("id", 8)) - .commit(); + table.updateSpec().addField(Expressions.bucket("id", 8)).commit(); sql("REFRESH TABLE %s", tableName); - Dataset df2 = spark.range(1, 100) - .withColumn("date", date_add(expr("DATE '1970-01-01'"), expr("CAST(id % 4 AS INT)"))) - .withColumn("ts", expr("TO_TIMESTAMP(date)")) - .withColumn("data", expr("CAST(date AS STRING)")) - .select("id", "data", "date", "ts"); + Dataset df2 = + spark + .range(1, 100) + .withColumn("date", date_add(expr("DATE '1970-01-01'"), expr("CAST(id % 4 AS INT)"))) + .withColumn("ts", expr("TO_TIMESTAMP(date)")) + .withColumn("data", expr("CAST(date AS STRING)")) + .select("id", "data", "date", "ts"); df2.coalesce(1).writeTo(tableName).option(SparkWriteOptions.FANOUT_ENABLED, "true").append(); sql("CREATE TABLE dim (id BIGINT, date DATE) USING parquet"); - Dataset dimDF = spark.range(1, 2) - .withColumn("date", expr("DATE '1970-01-02'")) - .select("id", "date"); + Dataset dimDF = + spark.range(1, 2).withColumn("date", expr("DATE '1970-01-02'")).select("id", "date"); dimDF.coalesce(1).write().mode("append").insertInto("dim"); - String query = String.format( - "SELECT f.* FROM %s f JOIN dim d ON f.id = d.id AND d.date = DATE '1970-01-02' ORDER BY date", - tableName); + String query = + String.format( + "SELECT f.* FROM %s f JOIN dim d ON f.id = d.id AND d.date = DATE '1970-01-02' ORDER BY date", + tableName); assertQueryContainsRuntimeFilter(query); deleteNotMatchingFiles(Expressions.equal("id", 1), 7); - assertEquals("Should have expected rows", + assertEquals( + "Should have expected rows", sql("SELECT * FROM %s WHERE id = 1 ORDER BY date", tableName), sql(query)); } @Test public void testSourceColumnWithDots() throws NoSuchTableException { - sql("CREATE TABLE %s (`i.d` BIGINT, data STRING, date DATE, ts TIMESTAMP) 
" + - "USING iceberg " + - "PARTITIONED BY (bucket(8, `i.d`))", tableName); + sql( + "CREATE TABLE %s (`i.d` BIGINT, data STRING, date DATE, ts TIMESTAMP) " + + "USING iceberg " + + "PARTITIONED BY (bucket(8, `i.d`))", + tableName); - Dataset df = spark.range(1, 100) - .withColumnRenamed("id", "i.d") - .withColumn("date", date_add(expr("DATE '1970-01-01'"), expr("CAST(`i.d` % 4 AS INT)"))) - .withColumn("ts", expr("TO_TIMESTAMP(date)")) - .withColumn("data", expr("CAST(date AS STRING)")) - .select("`i.d`", "data", "date", "ts"); + Dataset df = + spark + .range(1, 100) + .withColumnRenamed("id", "i.d") + .withColumn("date", date_add(expr("DATE '1970-01-01'"), expr("CAST(`i.d` % 4 AS INT)"))) + .withColumn("ts", expr("TO_TIMESTAMP(date)")) + .withColumn("data", expr("CAST(date AS STRING)")) + .select("`i.d`", "data", "date", "ts"); df.coalesce(1).writeTo(tableName).option(SparkWriteOptions.FANOUT_ENABLED, "true").append(); sql("SELECT * FROM %s WHERE `i.d` = 1", tableName); sql("CREATE TABLE dim (id BIGINT, date DATE) USING parquet"); - Dataset dimDF = spark.range(1, 2) - .withColumn("date", expr("DATE '1970-01-02'")) - .select("id", "date"); + Dataset dimDF = + spark.range(1, 2).withColumn("date", expr("DATE '1970-01-02'")).select("id", "date"); dimDF.coalesce(1).write().mode("append").insertInto("dim"); - String query = String.format( - "SELECT f.* FROM %s f JOIN dim d ON f.`i.d` = d.id AND d.date = DATE '1970-01-02' ORDER BY date", - tableName); + String query = + String.format( + "SELECT f.* FROM %s f JOIN dim d ON f.`i.d` = d.id AND d.date = DATE '1970-01-02' ORDER BY date", + tableName); assertQueryContainsRuntimeFilter(query); @@ -299,70 +339,82 @@ public void testSourceColumnWithDots() throws NoSuchTableException { sql(query); - assertEquals("Should have expected rows", + assertEquals( + "Should have expected rows", sql("SELECT * FROM %s WHERE `i.d` = 1 ORDER BY date", tableName), sql(query)); } @Test public void testSourceColumnWithBackticks() throws NoSuchTableException { - sql("CREATE TABLE %s (`i``d` BIGINT, data STRING, date DATE, ts TIMESTAMP) " + - "USING iceberg " + - "PARTITIONED BY (bucket(8, `i``d`))", tableName); + sql( + "CREATE TABLE %s (`i``d` BIGINT, data STRING, date DATE, ts TIMESTAMP) " + + "USING iceberg " + + "PARTITIONED BY (bucket(8, `i``d`))", + tableName); - Dataset df = spark.range(1, 100) - .withColumnRenamed("id", "i`d") - .withColumn("date", date_add(expr("DATE '1970-01-01'"), expr("CAST(`i``d` % 4 AS INT)"))) - .withColumn("ts", expr("TO_TIMESTAMP(date)")) - .withColumn("data", expr("CAST(date AS STRING)")) - .select("`i``d`", "data", "date", "ts"); + Dataset df = + spark + .range(1, 100) + .withColumnRenamed("id", "i`d") + .withColumn( + "date", date_add(expr("DATE '1970-01-01'"), expr("CAST(`i``d` % 4 AS INT)"))) + .withColumn("ts", expr("TO_TIMESTAMP(date)")) + .withColumn("data", expr("CAST(date AS STRING)")) + .select("`i``d`", "data", "date", "ts"); df.coalesce(1).writeTo(tableName).option(SparkWriteOptions.FANOUT_ENABLED, "true").append(); sql("CREATE TABLE dim (id BIGINT, date DATE) USING parquet"); - Dataset dimDF = spark.range(1, 2) - .withColumn("date", expr("DATE '1970-01-02'")) - .select("id", "date"); + Dataset dimDF = + spark.range(1, 2).withColumn("date", expr("DATE '1970-01-02'")).select("id", "date"); dimDF.coalesce(1).write().mode("append").insertInto("dim"); - String query = String.format( - "SELECT f.* FROM %s f JOIN dim d ON f.`i``d` = d.id AND d.date = DATE '1970-01-02' ORDER BY date", - tableName); + String query = + 
String.format( + "SELECT f.* FROM %s f JOIN dim d ON f.`i``d` = d.id AND d.date = DATE '1970-01-02' ORDER BY date", + tableName); assertQueryContainsRuntimeFilter(query); deleteNotMatchingFiles(Expressions.equal("i`d", 1), 7); - assertEquals("Should have expected rows", + assertEquals( + "Should have expected rows", sql("SELECT * FROM %s WHERE `i``d` = 1 ORDER BY date", tableName), sql(query)); } @Test public void testUnpartitionedTable() throws NoSuchTableException { - sql("CREATE TABLE %s (id BIGINT, data STRING, date DATE, ts TIMESTAMP) USING iceberg", tableName); + sql( + "CREATE TABLE %s (id BIGINT, data STRING, date DATE, ts TIMESTAMP) USING iceberg", + tableName); - Dataset df = spark.range(1, 100) - .withColumn("date", date_add(expr("DATE '1970-01-01'"), expr("CAST(id % 4 AS INT)"))) - .withColumn("ts", expr("TO_TIMESTAMP(date)")) - .withColumn("data", expr("CAST(date AS STRING)")) - .select("id", "data", "date", "ts"); + Dataset df = + spark + .range(1, 100) + .withColumn("date", date_add(expr("DATE '1970-01-01'"), expr("CAST(id % 4 AS INT)"))) + .withColumn("ts", expr("TO_TIMESTAMP(date)")) + .withColumn("data", expr("CAST(date AS STRING)")) + .select("id", "data", "date", "ts"); df.coalesce(1).writeTo(tableName).append(); sql("CREATE TABLE dim (id BIGINT, date DATE) USING parquet"); - Dataset dimDF = spark.range(1, 2) - .withColumn("date", expr("DATE '1970-01-02'")) - .select("id", "date"); + Dataset dimDF = + spark.range(1, 2).withColumn("date", expr("DATE '1970-01-02'")).select("id", "date"); dimDF.coalesce(1).write().mode("append").insertInto("dim"); - String query = String.format( - "SELECT f.* FROM %s f JOIN dim d ON f.id = d.id AND d.date = DATE '1970-01-02' ORDER BY date", - tableName); + String query = + String.format( + "SELECT f.* FROM %s f JOIN dim d ON f.id = d.id AND d.date = DATE '1970-01-02' ORDER BY date", + tableName); assertQueryContainsNoRuntimeFilter(query); - assertEquals("Should have expected rows", + assertEquals( + "Should have expected rows", sql("SELECT * FROM %s WHERE id = 1 ORDER BY date", tableName), sql(query)); } @@ -375,14 +427,16 @@ private void assertQueryContainsNoRuntimeFilter(String query) { assertQueryContainsRuntimeFilters(query, 0, "Query should have no runtime filters"); } - private void assertQueryContainsRuntimeFilters(String query, int expectedFilterCount, String errorMessage) { + private void assertQueryContainsRuntimeFilters( + String query, int expectedFilterCount, String errorMessage) { List output = spark.sql("EXPLAIN EXTENDED " + query).collectAsList(); String plan = output.get(0).getString(0); int actualFilterCount = StringUtils.countMatches(plan, "dynamicpruningexpression"); Assert.assertEquals(errorMessage, expectedFilterCount, actualFilterCount); } - // delete files that don't match the filter to ensure dynamic filtering works and only required files are read + // delete files that don't match the filter to ensure dynamic filtering works and only required + // files are read private void deleteNotMatchingFiles(Expression filter, int expectedDeletedFileCount) { Table table = validationCatalog.loadTable(tableIdent); FileIO io = table.io(); @@ -410,6 +464,9 @@ private void deleteNotMatchingFiles(Expression filter, int expectedDeletedFileCo throw new UncheckedIOException(e); } - Assert.assertEquals("Deleted unexpected number of files", expectedDeletedFileCount, deletedFileLocations.size()); + Assert.assertEquals( + "Deleted unexpected number of files", + expectedDeletedFileCount, + deletedFileLocations.size()); } } diff --git 
a/spark/v3.3/spark/src/test/java/org/apache/iceberg/spark/source/TestSnapshotSelection.java b/spark/v3.3/spark/src/test/java/org/apache/iceberg/spark/source/TestSnapshotSelection.java index 22756dd36717..9661cfe20b1c 100644 --- a/spark/v3.3/spark/src/test/java/org/apache/iceberg/spark/source/TestSnapshotSelection.java +++ b/spark/v3.3/spark/src/test/java/org/apache/iceberg/spark/source/TestSnapshotSelection.java @@ -16,9 +16,10 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.source; +import static org.apache.iceberg.types.Types.NestedField.optional; + import java.io.IOException; import java.util.List; import org.apache.hadoop.conf.Configuration; @@ -43,18 +44,14 @@ import org.junit.Test; import org.junit.rules.TemporaryFolder; -import static org.apache.iceberg.types.Types.NestedField.optional; - public class TestSnapshotSelection { private static final Configuration CONF = new Configuration(); - private static final Schema SCHEMA = new Schema( - optional(1, "id", Types.IntegerType.get()), - optional(2, "data", Types.StringType.get()) - ); + private static final Schema SCHEMA = + new Schema( + optional(1, "id", Types.IntegerType.get()), optional(2, "data", Types.StringType.get())); - @Rule - public TemporaryFolder temp = new TemporaryFolder(); + @Rule public TemporaryFolder temp = new TemporaryFolder(); private static SparkSession spark = null; @@ -79,48 +76,40 @@ public void testSnapshotSelectionById() throws IOException { Table table = tables.create(SCHEMA, spec, tableLocation); // produce the first snapshot - List firstBatchRecords = Lists.newArrayList( - new SimpleRecord(1, "a"), - new SimpleRecord(2, "b"), - new SimpleRecord(3, "c") - ); + List firstBatchRecords = + Lists.newArrayList( + new SimpleRecord(1, "a"), new SimpleRecord(2, "b"), new SimpleRecord(3, "c")); Dataset firstDf = spark.createDataFrame(firstBatchRecords, SimpleRecord.class); firstDf.select("id", "data").write().format("iceberg").mode("append").save(tableLocation); // produce the second snapshot - List secondBatchRecords = Lists.newArrayList( - new SimpleRecord(4, "d"), - new SimpleRecord(5, "e"), - new SimpleRecord(6, "f") - ); + List secondBatchRecords = + Lists.newArrayList( + new SimpleRecord(4, "d"), new SimpleRecord(5, "e"), new SimpleRecord(6, "f")); Dataset secondDf = spark.createDataFrame(secondBatchRecords, SimpleRecord.class); secondDf.select("id", "data").write().format("iceberg").mode("append").save(tableLocation); Assert.assertEquals("Expected 2 snapshots", 2, Iterables.size(table.snapshots())); // verify records in the current snapshot - Dataset currentSnapshotResult = spark.read() - .format("iceberg") - .load(tableLocation); - List currentSnapshotRecords = currentSnapshotResult.orderBy("id") - .as(Encoders.bean(SimpleRecord.class)) - .collectAsList(); + Dataset currentSnapshotResult = spark.read().format("iceberg").load(tableLocation); + List currentSnapshotRecords = + currentSnapshotResult.orderBy("id").as(Encoders.bean(SimpleRecord.class)).collectAsList(); List expectedRecords = Lists.newArrayList(); expectedRecords.addAll(firstBatchRecords); expectedRecords.addAll(secondBatchRecords); - Assert.assertEquals("Current snapshot rows should match", expectedRecords, currentSnapshotRecords); + Assert.assertEquals( + "Current snapshot rows should match", expectedRecords, currentSnapshotRecords); // verify records in the previous snapshot Snapshot currentSnapshot = table.currentSnapshot(); Long parentSnapshotId = 
currentSnapshot.parentId(); - Dataset previousSnapshotResult = spark.read() - .format("iceberg") - .option("snapshot-id", parentSnapshotId) - .load(tableLocation); - List previousSnapshotRecords = previousSnapshotResult.orderBy("id") - .as(Encoders.bean(SimpleRecord.class)) - .collectAsList(); - Assert.assertEquals("Previous snapshot rows should match", firstBatchRecords, previousSnapshotRecords); + Dataset previousSnapshotResult = + spark.read().format("iceberg").option("snapshot-id", parentSnapshotId).load(tableLocation); + List previousSnapshotRecords = + previousSnapshotResult.orderBy("id").as(Encoders.bean(SimpleRecord.class)).collectAsList(); + Assert.assertEquals( + "Previous snapshot rows should match", firstBatchRecords, previousSnapshotRecords); } @Test @@ -132,11 +121,9 @@ public void testSnapshotSelectionByTimestamp() throws IOException { Table table = tables.create(SCHEMA, spec, tableLocation); // produce the first snapshot - List firstBatchRecords = Lists.newArrayList( - new SimpleRecord(1, "a"), - new SimpleRecord(2, "b"), - new SimpleRecord(3, "c") - ); + List firstBatchRecords = + Lists.newArrayList( + new SimpleRecord(1, "a"), new SimpleRecord(2, "b"), new SimpleRecord(3, "c")); Dataset firstDf = spark.createDataFrame(firstBatchRecords, SimpleRecord.class); firstDf.select("id", "data").write().format("iceberg").mode("append").save(tableLocation); @@ -144,37 +131,35 @@ public void testSnapshotSelectionByTimestamp() throws IOException { long firstSnapshotTimestamp = System.currentTimeMillis(); // produce the second snapshot - List secondBatchRecords = Lists.newArrayList( - new SimpleRecord(4, "d"), - new SimpleRecord(5, "e"), - new SimpleRecord(6, "f") - ); + List secondBatchRecords = + Lists.newArrayList( + new SimpleRecord(4, "d"), new SimpleRecord(5, "e"), new SimpleRecord(6, "f")); Dataset secondDf = spark.createDataFrame(secondBatchRecords, SimpleRecord.class); secondDf.select("id", "data").write().format("iceberg").mode("append").save(tableLocation); Assert.assertEquals("Expected 2 snapshots", 2, Iterables.size(table.snapshots())); // verify records in the current snapshot - Dataset currentSnapshotResult = spark.read() - .format("iceberg") - .load(tableLocation); - List currentSnapshotRecords = currentSnapshotResult.orderBy("id") - .as(Encoders.bean(SimpleRecord.class)) - .collectAsList(); + Dataset currentSnapshotResult = spark.read().format("iceberg").load(tableLocation); + List currentSnapshotRecords = + currentSnapshotResult.orderBy("id").as(Encoders.bean(SimpleRecord.class)).collectAsList(); List expectedRecords = Lists.newArrayList(); expectedRecords.addAll(firstBatchRecords); expectedRecords.addAll(secondBatchRecords); - Assert.assertEquals("Current snapshot rows should match", expectedRecords, currentSnapshotRecords); + Assert.assertEquals( + "Current snapshot rows should match", expectedRecords, currentSnapshotRecords); // verify records in the previous snapshot - Dataset previousSnapshotResult = spark.read() - .format("iceberg") - .option(SparkReadOptions.AS_OF_TIMESTAMP, firstSnapshotTimestamp) - .load(tableLocation); - List previousSnapshotRecords = previousSnapshotResult.orderBy("id") - .as(Encoders.bean(SimpleRecord.class)) - .collectAsList(); - Assert.assertEquals("Previous snapshot rows should match", firstBatchRecords, previousSnapshotRecords); + Dataset previousSnapshotResult = + spark + .read() + .format("iceberg") + .option(SparkReadOptions.AS_OF_TIMESTAMP, firstSnapshotTimestamp) + .load(tableLocation); + List previousSnapshotRecords = + 
previousSnapshotResult.orderBy("id").as(Encoders.bean(SimpleRecord.class)).collectAsList(); + Assert.assertEquals( + "Previous snapshot rows should match", firstBatchRecords, previousSnapshotRecords); } @Test @@ -185,14 +170,11 @@ public void testSnapshotSelectionByInvalidSnapshotId() throws IOException { PartitionSpec spec = PartitionSpec.unpartitioned(); tables.create(SCHEMA, spec, tableLocation); - Dataset df = spark.read() - .format("iceberg") - .option("snapshot-id", -10) - .load(tableLocation); + Dataset df = spark.read().format("iceberg").option("snapshot-id", -10).load(tableLocation); Assertions.assertThatThrownBy(df::collectAsList) - .isInstanceOf(IllegalArgumentException.class) - .hasMessage("Cannot find snapshot with ID -10"); + .isInstanceOf(IllegalArgumentException.class) + .hasMessage("Cannot find snapshot with ID -10"); } @Test @@ -204,12 +186,15 @@ public void testSnapshotSelectionByInvalidTimestamp() throws IOException { PartitionSpec spec = PartitionSpec.unpartitioned(); tables.create(SCHEMA, spec, tableLocation); - Assertions.assertThatThrownBy(() -> spark.read() + Assertions.assertThatThrownBy( + () -> + spark + .read() .format("iceberg") .option(SparkReadOptions.AS_OF_TIMESTAMP, timestamp) .load(tableLocation)) - .isInstanceOf(IllegalArgumentException.class) - .hasMessageContaining("Cannot find a snapshot older than"); + .isInstanceOf(IllegalArgumentException.class) + .hasMessageContaining("Cannot find a snapshot older than"); } @Test @@ -220,24 +205,25 @@ public void testSnapshotSelectionBySnapshotIdAndTimestamp() throws IOException { PartitionSpec spec = PartitionSpec.unpartitioned(); Table table = tables.create(SCHEMA, spec, tableLocation); - List firstBatchRecords = Lists.newArrayList( - new SimpleRecord(1, "a"), - new SimpleRecord(2, "b"), - new SimpleRecord(3, "c") - ); + List firstBatchRecords = + Lists.newArrayList( + new SimpleRecord(1, "a"), new SimpleRecord(2, "b"), new SimpleRecord(3, "c")); Dataset firstDf = spark.createDataFrame(firstBatchRecords, SimpleRecord.class); firstDf.select("id", "data").write().format("iceberg").mode("append").save(tableLocation); long timestamp = System.currentTimeMillis(); long snapshotId = table.currentSnapshot().snapshotId(); - Assertions.assertThatThrownBy(() -> spark.read() + Assertions.assertThatThrownBy( + () -> + spark + .read() .format("iceberg") .option(SparkReadOptions.SNAPSHOT_ID, snapshotId) .option(SparkReadOptions.AS_OF_TIMESTAMP, timestamp) .load(tableLocation)) - .isInstanceOf(IllegalArgumentException.class) - .hasMessageContaining("Cannot specify both snapshot-id") - .hasMessageContaining("and as-of-timestamp"); + .isInstanceOf(IllegalArgumentException.class) + .hasMessageContaining("Cannot specify both snapshot-id") + .hasMessageContaining("and as-of-timestamp"); } } diff --git a/spark/v3.3/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkAppenderFactory.java b/spark/v3.3/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkAppenderFactory.java index bda525780d8b..3fb2a630fe81 100644 --- a/spark/v3.3/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkAppenderFactory.java +++ b/spark/v3.3/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkAppenderFactory.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
  */
-
 package org.apache.iceberg.spark.source;
 
 import java.util.List;
@@ -41,13 +40,13 @@ public TestSparkAppenderFactory(String fileFormat, boolean partitioned) {
   }
 
   @Override
-  protected FileAppenderFactory createAppenderFactory(List equalityFieldIds,
-                                                      Schema eqDeleteSchema,
-                                                      Schema posDeleteRowSchema) {
+  protected FileAppenderFactory createAppenderFactory(
+      List equalityFieldIds, Schema eqDeleteSchema, Schema posDeleteRowSchema) {
     return SparkAppenderFactory.builderFor(table, table.schema(), sparkType)
         .equalityFieldIds(ArrayUtil.toIntArray(equalityFieldIds))
         .eqDeleteRowSchema(eqDeleteSchema)
-        .posDelRowSchema(posDeleteRowSchema).build();
+        .posDelRowSchema(posDeleteRowSchema)
+        .build();
   }
 
   @Override
diff --git a/spark/v3.3/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkCatalog.java b/spark/v3.3/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkCatalog.java
index ef7b80a4ba6f..31d5c38e2a43 100644
--- a/spark/v3.3/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkCatalog.java
+++ b/spark/v3.3/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkCatalog.java
@@ -16,7 +16,6 @@
  * specific language governing permissions and limitations
  * under the License.
  */
-
 package org.apache.iceberg.spark.source;
 
 import java.util.Map;
@@ -32,12 +31,14 @@
 import org.apache.spark.sql.connector.catalog.Table;
 import org.apache.spark.sql.connector.catalog.TableCatalog;
 
-public class TestSparkCatalog extends SparkSessionCatalog {
+public class TestSparkCatalog
+    extends SparkSessionCatalog {
 
   private static final Map tableMap = Maps.newHashMap();
 
   public static void setTable(Identifier ident, Table table) {
-    Preconditions.checkArgument(!tableMap.containsKey(ident), "Cannot set " + ident + ". It is already set");
+    Preconditions.checkArgument(
+        !tableMap.containsKey(ident), "Cannot set " + ident + ". It is already set");
     tableMap.put(ident, table);
   }
 
diff --git a/spark/v3.3/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkCatalogCacheExpiration.java b/spark/v3.3/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkCatalogCacheExpiration.java
index 96aeed65bfa7..3d668197fd51 100644
--- a/spark/v3.3/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkCatalogCacheExpiration.java
+++ b/spark/v3.3/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkCatalogCacheExpiration.java
@@ -16,7 +16,6 @@
  * specific language governing permissions and limitations
  * under the License.
  */
-
 package org.apache.iceberg.spark.source;
 
 import java.util.Map;
@@ -36,12 +35,16 @@ public class TestSparkCatalogCacheExpiration extends SparkTestBaseWithCatalog {
 
   private static final String sessionCatalogName = "spark_catalog";
   private static final String sessionCatalogImpl = SparkSessionCatalog.class.getName();
-  private static final Map sessionCatalogConfig = ImmutableMap.of(
-      "type", "hadoop",
-      "default-namespace", "default",
-      CatalogProperties.CACHE_ENABLED, "true",
-      CatalogProperties.CACHE_EXPIRATION_INTERVAL_MS, "3000"
-  );
+  private static final Map sessionCatalogConfig =
+      ImmutableMap.of(
+          "type",
+          "hadoop",
+          "default-namespace",
+          "default",
+          CatalogProperties.CACHE_ENABLED,
+          "true",
+          CatalogProperties.CACHE_EXPIRATION_INTERVAL_MS,
+          "3000");
 
   private static String asSqlConfCatalogKeyFor(String catalog, String configKey) {
     // configKey is empty when the catalog's class is being defined
@@ -58,19 +61,29 @@ private static String asSqlConfCatalogKeyFor(String catalog, String configKey) {
   public static void beforeClass() {
     // Catalog - expiration_disabled: Catalog with caching on and expiration disabled.
     ImmutableMap.of(
-        "", "org.apache.iceberg.spark.SparkCatalog",
-        "type", "hive",
-        CatalogProperties.CACHE_ENABLED, "true",
-        CatalogProperties.CACHE_EXPIRATION_INTERVAL_MS, "-1"
-    ).forEach((k, v) -> spark.conf().set(asSqlConfCatalogKeyFor("expiration_disabled", k), v));
-
-    // Catalog - cache_disabled_implicitly: Catalog that does not cache, as the cache expiration interval is 0.
+            "",
+            "org.apache.iceberg.spark.SparkCatalog",
+            "type",
+            "hive",
+            CatalogProperties.CACHE_ENABLED,
+            "true",
+            CatalogProperties.CACHE_EXPIRATION_INTERVAL_MS,
+            "-1")
+        .forEach((k, v) -> spark.conf().set(asSqlConfCatalogKeyFor("expiration_disabled", k), v));
+
+    // Catalog - cache_disabled_implicitly: Catalog that does not cache, as the cache expiration
+    // interval is 0.
ImmutableMap.of( - "", "org.apache.iceberg.spark.SparkCatalog", - "type", "hive", - CatalogProperties.CACHE_ENABLED, "true", - CatalogProperties.CACHE_EXPIRATION_INTERVAL_MS, "0" - ).forEach((k, v) -> spark.conf().set(asSqlConfCatalogKeyFor("cache_disabled_implicitly", k), v)); + "", + "org.apache.iceberg.spark.SparkCatalog", + "type", + "hive", + CatalogProperties.CACHE_ENABLED, + "true", + CatalogProperties.CACHE_EXPIRATION_INTERVAL_MS, + "0") + .forEach( + (k, v) -> spark.conf().set(asSqlConfCatalogKeyFor("cache_disabled_implicitly", k), v)); } public TestSparkCatalogCacheExpiration() { @@ -85,56 +98,55 @@ public void testSparkSessionCatalogWithExpirationEnabled() { .extracting("cacheEnabled") .isEqualTo(true); - Assertions - .assertThat(sparkCatalog) + Assertions.assertThat(sparkCatalog) .extracting("icebergCatalog") .extracting("icebergCatalog") - .isInstanceOfSatisfying(Catalog.class, icebergCatalog -> { - Assertions.assertThat(icebergCatalog) - .isExactlyInstanceOf(CachingCatalog.class) - .extracting("expirationIntervalMillis") - .isEqualTo(3000L); - }); + .isInstanceOfSatisfying( + Catalog.class, + icebergCatalog -> { + Assertions.assertThat(icebergCatalog) + .isExactlyInstanceOf(CachingCatalog.class) + .extracting("expirationIntervalMillis") + .isEqualTo(3000L); + }); } @Test public void testCacheEnabledAndExpirationDisabled() { SparkCatalog sparkCatalog = getSparkCatalog("expiration_disabled"); - Assertions.assertThat(sparkCatalog) - .extracting("cacheEnabled") - .isEqualTo(true); + Assertions.assertThat(sparkCatalog).extracting("cacheEnabled").isEqualTo(true); - Assertions - .assertThat(sparkCatalog) + Assertions.assertThat(sparkCatalog) .extracting("icebergCatalog") - .isInstanceOfSatisfying(CachingCatalog.class, icebergCatalog -> { - Assertions.assertThat(icebergCatalog) - .extracting("expirationIntervalMillis") - .isEqualTo(-1L); - }); + .isInstanceOfSatisfying( + CachingCatalog.class, + icebergCatalog -> { + Assertions.assertThat(icebergCatalog) + .extracting("expirationIntervalMillis") + .isEqualTo(-1L); + }); } @Test public void testCacheDisabledImplicitly() { SparkCatalog sparkCatalog = getSparkCatalog("cache_disabled_implicitly"); - Assertions.assertThat(sparkCatalog) - .extracting("cacheEnabled") - .isEqualTo(false); + Assertions.assertThat(sparkCatalog).extracting("cacheEnabled").isEqualTo(false); - Assertions - .assertThat(sparkCatalog) + Assertions.assertThat(sparkCatalog) .extracting("icebergCatalog") .isInstanceOfSatisfying( Catalog.class, - icebergCatalog -> Assertions.assertThat(icebergCatalog).isNotInstanceOf(CachingCatalog.class)); + icebergCatalog -> + Assertions.assertThat(icebergCatalog).isNotInstanceOf(CachingCatalog.class)); } private SparkSessionCatalog sparkSessionCatalog() { - TableCatalog catalog = (TableCatalog) spark.sessionState().catalogManager().catalog("spark_catalog"); + TableCatalog catalog = + (TableCatalog) spark.sessionState().catalogManager().catalog("spark_catalog"); return (SparkSessionCatalog) catalog; } - private SparkCatalog getSparkCatalog(String catalog) { + private SparkCatalog getSparkCatalog(String catalog) { return (SparkCatalog) spark.sessionState().catalogManager().catalog(catalog); } } diff --git a/spark/v3.3/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkCatalogHadoopOverrides.java b/spark/v3.3/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkCatalogHadoopOverrides.java index 267270308de5..607f1d45ba3a 100644 --- 
a/spark/v3.3/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkCatalogHadoopOverrides.java +++ b/spark/v3.3/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkCatalogHadoopOverrides.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.source; import java.util.Map; @@ -37,7 +36,6 @@ import org.junit.Test; import org.junit.runners.Parameterized; - public class TestSparkCatalogHadoopOverrides extends SparkCatalogTestBase { private static final String configToOverride = "fs.s3a.buffer.dir"; @@ -49,29 +47,38 @@ public class TestSparkCatalogHadoopOverrides extends SparkCatalogTestBase { @Parameterized.Parameters(name = "catalogName = {0}, implementation = {1}, config = {2}") public static Object[][] parameters() { return new Object[][] { - { "testhive", SparkCatalog.class.getName(), - ImmutableMap.of( - "type", "hive", - "default-namespace", "default", - hadoopPrefixedConfigToOverride, configOverrideValue - ) }, - { "testhadoop", SparkCatalog.class.getName(), - ImmutableMap.of( - "type", "hadoop", - hadoopPrefixedConfigToOverride, configOverrideValue - ) }, - { "spark_catalog", SparkSessionCatalog.class.getName(), - ImmutableMap.of( - "type", "hive", - "default-namespace", "default", - hadoopPrefixedConfigToOverride, configOverrideValue - ) } + { + "testhive", + SparkCatalog.class.getName(), + ImmutableMap.of( + "type", + "hive", + "default-namespace", + "default", + hadoopPrefixedConfigToOverride, + configOverrideValue) + }, + { + "testhadoop", + SparkCatalog.class.getName(), + ImmutableMap.of("type", "hadoop", hadoopPrefixedConfigToOverride, configOverrideValue) + }, + { + "spark_catalog", + SparkSessionCatalog.class.getName(), + ImmutableMap.of( + "type", + "hive", + "default-namespace", + "default", + hadoopPrefixedConfigToOverride, + configOverrideValue) + } }; } - public TestSparkCatalogHadoopOverrides(String catalogName, - String implementation, - Map config) { + public TestSparkCatalogHadoopOverrides( + String catalogName, String implementation, Map config) { super(catalogName, implementation, config); } @@ -92,7 +99,8 @@ public void testTableFromCatalogHasOverrides() throws Exception { String actualCatalogOverride = conf.get(configToOverride, "/whammies"); Assert.assertEquals( "Iceberg tables from spark should have the overridden hadoop configurations from the spark config", - configOverrideValue, actualCatalogOverride); + configOverrideValue, + actualCatalogOverride); } @Test @@ -102,16 +110,19 @@ public void ensureRoundTripSerializedTableRetainsHadoopConfig() throws Exception String actualCatalogOverride = originalConf.get(configToOverride, "/whammies"); Assert.assertEquals( "Iceberg tables from spark should have the overridden hadoop configurations from the spark config", - configOverrideValue, actualCatalogOverride); + configOverrideValue, + actualCatalogOverride); // Now convert to SerializableTable and ensure overridden property is still present. 
Table serializableTable = SerializableTableWithSize.copyOf(table); - Table kryoSerializedTable = KryoHelpers.roundTripSerialize(SerializableTableWithSize.copyOf(table)); + Table kryoSerializedTable = + KryoHelpers.roundTripSerialize(SerializableTableWithSize.copyOf(table)); Configuration configFromKryoSerde = ((Configurable) kryoSerializedTable.io()).getConf(); String kryoSerializedCatalogOverride = configFromKryoSerde.get(configToOverride, "/whammies"); Assert.assertEquals( "Tables serialized with Kryo serialization should retain overridden hadoop configuration properties", - configOverrideValue, kryoSerializedCatalogOverride); + configOverrideValue, + kryoSerializedCatalogOverride); // Do the same for Java based serde Table javaSerializedTable = TestHelpers.roundTripSerialize(serializableTable); @@ -119,14 +130,16 @@ public void ensureRoundTripSerializedTableRetainsHadoopConfig() throws Exception String javaSerializedCatalogOverride = configFromJavaSerde.get(configToOverride, "/whammies"); Assert.assertEquals( "Tables serialized with Java serialization should retain overridden hadoop configuration properties", - configOverrideValue, javaSerializedCatalogOverride); + configOverrideValue, + javaSerializedCatalogOverride); } @SuppressWarnings("ThrowSpecificity") private Table getIcebergTableFromSparkCatalog() throws Exception { Identifier identifier = Identifier.of(tableIdent.namespace().levels(), tableIdent.name()); - TableCatalog catalog = (TableCatalog) spark.sessionState().catalogManager().catalog(catalogName); - SparkTable sparkTable = (SparkTable) catalog.loadTable(identifier); + TableCatalog catalog = + (TableCatalog) spark.sessionState().catalogManager().catalog(catalogName); + SparkTable sparkTable = (SparkTable) catalog.loadTable(identifier); return sparkTable.table(); } } diff --git a/spark/v3.3/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkDataFile.java b/spark/v3.3/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkDataFile.java index cd1404766d46..b1f2082b5d9b 100644 --- a/spark/v3.3/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkDataFile.java +++ b/spark/v3.3/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkDataFile.java @@ -16,9 +16,11 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.spark.source; +import static org.apache.iceberg.types.Types.NestedField.optional; +import static org.apache.iceberg.types.Types.NestedField.required; + import java.io.File; import java.io.IOException; import java.util.Arrays; @@ -62,43 +64,42 @@ import org.junit.Test; import org.junit.rules.TemporaryFolder; -import static org.apache.iceberg.types.Types.NestedField.optional; -import static org.apache.iceberg.types.Types.NestedField.required; - public class TestSparkDataFile { private static final HadoopTables TABLES = new HadoopTables(new Configuration()); - private static final Schema SCHEMA = new Schema( - required(100, "id", Types.LongType.get()), - optional(101, "data", Types.StringType.get()), - required(102, "b", Types.BooleanType.get()), - optional(103, "i", Types.IntegerType.get()), - required(104, "l", Types.LongType.get()), - optional(105, "f", Types.FloatType.get()), - required(106, "d", Types.DoubleType.get()), - optional(107, "date", Types.DateType.get()), - required(108, "ts", Types.TimestampType.withZone()), - required(110, "s", Types.StringType.get()), - optional(113, "bytes", Types.BinaryType.get()), - required(114, "dec_9_0", Types.DecimalType.of(9, 0)), - required(115, "dec_11_2", Types.DecimalType.of(11, 2)), - required(116, "dec_38_10", Types.DecimalType.of(38, 10)) // maximum precision - ); - private static final PartitionSpec SPEC = PartitionSpec.builderFor(SCHEMA) - .identity("b") - .bucket("i", 2) - .identity("l") - .identity("f") - .identity("d") - .identity("date") - .hour("ts") - .identity("ts") - .truncate("s", 2) - .identity("bytes") - .bucket("dec_9_0", 2) - .bucket("dec_11_2", 2) - .bucket("dec_38_10", 2) - .build(); + private static final Schema SCHEMA = + new Schema( + required(100, "id", Types.LongType.get()), + optional(101, "data", Types.StringType.get()), + required(102, "b", Types.BooleanType.get()), + optional(103, "i", Types.IntegerType.get()), + required(104, "l", Types.LongType.get()), + optional(105, "f", Types.FloatType.get()), + required(106, "d", Types.DoubleType.get()), + optional(107, "date", Types.DateType.get()), + required(108, "ts", Types.TimestampType.withZone()), + required(110, "s", Types.StringType.get()), + optional(113, "bytes", Types.BinaryType.get()), + required(114, "dec_9_0", Types.DecimalType.of(9, 0)), + required(115, "dec_11_2", Types.DecimalType.of(11, 2)), + required(116, "dec_38_10", Types.DecimalType.of(38, 10)) // maximum precision + ); + private static final PartitionSpec SPEC = + PartitionSpec.builderFor(SCHEMA) + .identity("b") + .bucket("i", 2) + .identity("l") + .identity("f") + .identity("d") + .identity("date") + .hour("ts") + .identity("ts") + .truncate("s", 2) + .identity("bytes") + .bucket("dec_9_0", 2) + .bucket("dec_11_2", 2) + .bucket("dec_38_10", 2) + .build(); private static SparkSession spark; private static JavaSparkContext sparkContext = null; @@ -117,8 +118,7 @@ public static void stopSpark() { currentSpark.stop(); } - @Rule - public TemporaryFolder temp = new TemporaryFolder(); + @Rule public TemporaryFolder temp = new TemporaryFolder(); private String tableLocation = null; @Before @@ -129,7 +129,8 @@ public void setupTableLocation() throws Exception { @Test public void testValueConversion() throws IOException { - Table table = TABLES.create(SCHEMA, PartitionSpec.unpartitioned(), Maps.newHashMap(), tableLocation); + Table table = + TABLES.create(SCHEMA, PartitionSpec.unpartitioned(), Maps.newHashMap(), tableLocation); checkSparkDataFile(table); } @@ -150,7 +151,9 @@ 
public void testValueConversionWithEmptyStats() throws IOException { private void checkSparkDataFile(Table table) throws IOException { Iterable rows = RandomData.generateSpark(table.schema(), 200, 0); JavaRDD rdd = sparkContext.parallelize(Lists.newArrayList(rows)); - Dataset df = spark.internalCreateDataFrame(JavaRDD.toRDD(rdd), SparkSchemaUtil.convert(table.schema()), false); + Dataset df = + spark.internalCreateDataFrame( + JavaRDD.toRDD(rdd), SparkSchemaUtil.convert(table.schema()), false); df.write().format("iceberg").mode("append").save(tableLocation); @@ -170,16 +173,15 @@ private void checkSparkDataFile(Table table) throws IOException { Dataset dataFileDF = spark.read().format("iceberg").load(tableLocation + "#files"); // reorder columns to test arbitrary projections - List columns = Arrays.stream(dataFileDF.columns()) - .map(ColumnName::new) - .collect(Collectors.toList()); + List columns = + Arrays.stream(dataFileDF.columns()).map(ColumnName::new).collect(Collectors.toList()); Collections.shuffle(columns); - List sparkDataFiles = dataFileDF - .select(Iterables.toArray(columns, Column.class)) - .collectAsList(); + List sparkDataFiles = + dataFileDF.select(Iterables.toArray(columns, Column.class)).collectAsList(); - Assert.assertEquals("The number of files should match", dataFiles.size(), sparkDataFiles.size()); + Assert.assertEquals( + "The number of files should match", dataFiles.size(), sparkDataFiles.size()); Types.StructType dataFileType = DataFile.getType(table.spec().partitionType()); StructType sparkDataFileType = sparkDataFiles.get(0).schema(); @@ -195,9 +197,14 @@ private void checkDataFile(DataFile expected, DataFile actual) { Assert.assertEquals("Format must match", expected.format(), actual.format()); Assert.assertEquals("Record count must match", expected.recordCount(), actual.recordCount()); Assert.assertEquals("Size must match", expected.fileSizeInBytes(), actual.fileSizeInBytes()); - Assert.assertEquals("Record value counts must match", expected.valueCounts(), actual.valueCounts()); - Assert.assertEquals("Record null value counts must match", expected.nullValueCounts(), actual.nullValueCounts()); - Assert.assertEquals("Record nan value counts must match", expected.nanValueCounts(), actual.nanValueCounts()); + Assert.assertEquals( + "Record value counts must match", expected.valueCounts(), actual.valueCounts()); + Assert.assertEquals( + "Record null value counts must match", + expected.nullValueCounts(), + actual.nullValueCounts()); + Assert.assertEquals( + "Record nan value counts must match", expected.nanValueCounts(), actual.nanValueCounts()); Assert.assertEquals("Lower bounds must match", expected.lowerBounds(), actual.lowerBounds()); Assert.assertEquals("Upper bounds must match", expected.upperBounds(), actual.upperBounds()); Assert.assertEquals("Key metadata must match", expected.keyMetadata(), actual.keyMetadata()); @@ -210,7 +217,8 @@ private void checkDataFile(DataFile expected, DataFile actual) { private void checkStructLike(StructLike expected, StructLike actual) { Assert.assertEquals("Struct size should match", expected.size(), actual.size()); for (int i = 0; i < expected.size(); i++) { - Assert.assertEquals("Struct values must match", expected.get(i, Object.class), actual.get(i, Object.class)); + Assert.assertEquals( + "Struct values must match", expected.get(i, Object.class), actual.get(i, Object.class)); } } } diff --git a/spark/v3.3/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkDataWrite.java 
b/spark/v3.3/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkDataWrite.java index 5b158c518ae4..b2db853d4753 100644 --- a/spark/v3.3/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkDataWrite.java +++ b/spark/v3.3/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkDataWrite.java @@ -16,9 +16,14 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.source; +import static org.apache.iceberg.TableProperties.SPARK_WRITE_PARTITIONED_FANOUT_ENABLED; +import static org.apache.iceberg.types.Types.NestedField.optional; +import static org.mockito.Mockito.doAnswer; +import static org.mockito.Mockito.spy; +import static org.mockito.Mockito.when; + import java.io.File; import java.io.IOException; import java.util.List; @@ -56,28 +61,20 @@ import org.junit.runner.RunWith; import org.junit.runners.Parameterized; -import static org.apache.iceberg.TableProperties.SPARK_WRITE_PARTITIONED_FANOUT_ENABLED; -import static org.apache.iceberg.types.Types.NestedField.optional; -import static org.mockito.Mockito.doAnswer; -import static org.mockito.Mockito.spy; -import static org.mockito.Mockito.when; - @RunWith(Parameterized.class) public class TestSparkDataWrite { private static final Configuration CONF = new Configuration(); private final FileFormat format; private static SparkSession spark = null; - private static final Schema SCHEMA = new Schema( - optional(1, "id", Types.IntegerType.get()), - optional(2, "data", Types.StringType.get()) - ); + private static final Schema SCHEMA = + new Schema( + optional(1, "id", Types.IntegerType.get()), optional(2, "data", Types.StringType.get())); - @Rule - public TemporaryFolder temp = new TemporaryFolder(); + @Rule public TemporaryFolder temp = new TemporaryFolder(); @Parameterized.Parameters(name = "format = {0}") public static Object[] parameters() { - return new Object[] { "parquet", "avro", "orc" }; + return new Object[] {"parquet", "avro", "orc"}; } @BeforeClass @@ -110,15 +107,14 @@ public void testBasicWrite() throws IOException { PartitionSpec spec = PartitionSpec.builderFor(SCHEMA).identity("data").build(); Table table = tables.create(SCHEMA, spec, location.toString()); - List expected = Lists.newArrayList( - new SimpleRecord(1, "a"), - new SimpleRecord(2, "b"), - new SimpleRecord(3, "c") - ); + List expected = + Lists.newArrayList( + new SimpleRecord(1, "a"), new SimpleRecord(2, "b"), new SimpleRecord(3, "c")); Dataset df = spark.createDataFrame(expected, SimpleRecord.class); // TODO: incoming columns must be ordered according to the table's schema - df.select("id", "data").write() + df.select("id", "data") + .write() .format("iceberg") .option(SparkWriteOptions.WRITE_FORMAT, format.toString()) .mode(SaveMode.Append) @@ -126,11 +122,10 @@ public void testBasicWrite() throws IOException { table.refresh(); - Dataset result = spark.read() - .format("iceberg") - .load(location.toString()); + Dataset result = spark.read().format("iceberg").load(location.toString()); - List actual = result.orderBy("id").as(Encoders.bean(SimpleRecord.class)).collectAsList(); + List actual = + result.orderBy("id").as(Encoders.bean(SimpleRecord.class)).collectAsList(); Assert.assertEquals("Number of rows should match", expected.size(), actual.size()); Assert.assertEquals("Result rows should match", expected, actual); for (ManifestFile manifest : table.currentSnapshot().allManifests(table.io())) { @@ -161,30 +156,31 @@ public void testAppend() throws IOException { PartitionSpec spec = 
PartitionSpec.builderFor(SCHEMA).identity("data").build(); Table table = tables.create(SCHEMA, spec, location.toString()); - List records = Lists.newArrayList( - new SimpleRecord(1, "a"), - new SimpleRecord(2, "b"), - new SimpleRecord(3, "c") - ); - - List expected = Lists.newArrayList( - new SimpleRecord(1, "a"), - new SimpleRecord(2, "b"), - new SimpleRecord(3, "c"), - new SimpleRecord(4, "a"), - new SimpleRecord(5, "b"), - new SimpleRecord(6, "c") - ); + List records = + Lists.newArrayList( + new SimpleRecord(1, "a"), new SimpleRecord(2, "b"), new SimpleRecord(3, "c")); + + List expected = + Lists.newArrayList( + new SimpleRecord(1, "a"), + new SimpleRecord(2, "b"), + new SimpleRecord(3, "c"), + new SimpleRecord(4, "a"), + new SimpleRecord(5, "b"), + new SimpleRecord(6, "c")); Dataset df = spark.createDataFrame(records, SimpleRecord.class); - df.select("id", "data").write() + df.select("id", "data") + .write() .format("iceberg") .option(SparkWriteOptions.WRITE_FORMAT, format.toString()) .mode(SaveMode.Append) .save(location.toString()); - df.withColumn("id", df.col("id").plus(3)).select("id", "data").write() + df.withColumn("id", df.col("id").plus(3)) + .select("id", "data") + .write() .format("iceberg") .option(SparkWriteOptions.WRITE_FORMAT, format.toString()) .mode(SaveMode.Append) @@ -192,11 +188,10 @@ public void testAppend() throws IOException { table.refresh(); - Dataset result = spark.read() - .format("iceberg") - .load(location.toString()); + Dataset result = spark.read().format("iceberg").load(location.toString()); - List actual = result.orderBy("id").as(Encoders.bean(SimpleRecord.class)).collectAsList(); + List actual = + result.orderBy("id").as(Encoders.bean(SimpleRecord.class)).collectAsList(); Assert.assertEquals("Number of rows should match", expected.size(), actual.size()); Assert.assertEquals("Result rows should match", expected, actual); } @@ -210,23 +205,24 @@ public void testEmptyOverwrite() throws IOException { PartitionSpec spec = PartitionSpec.builderFor(SCHEMA).identity("id").build(); Table table = tables.create(SCHEMA, spec, location.toString()); - List records = Lists.newArrayList( - new SimpleRecord(1, "a"), - new SimpleRecord(2, "b"), - new SimpleRecord(3, "c") - ); + List records = + Lists.newArrayList( + new SimpleRecord(1, "a"), new SimpleRecord(2, "b"), new SimpleRecord(3, "c")); List expected = records; Dataset df = spark.createDataFrame(records, SimpleRecord.class); - df.select("id", "data").write() + df.select("id", "data") + .write() .format("iceberg") .option(SparkWriteOptions.WRITE_FORMAT, format.toString()) .mode(SaveMode.Append) .save(location.toString()); Dataset empty = spark.createDataFrame(ImmutableList.of(), SimpleRecord.class); - empty.select("id", "data").write() + empty + .select("id", "data") + .write() .format("iceberg") .option(SparkWriteOptions.WRITE_FORMAT, format.toString()) .mode(SaveMode.Overwrite) @@ -235,11 +231,10 @@ public void testEmptyOverwrite() throws IOException { table.refresh(); - Dataset result = spark.read() - .format("iceberg") - .load(location.toString()); + Dataset result = spark.read().format("iceberg").load(location.toString()); - List actual = result.orderBy("id").as(Encoders.bean(SimpleRecord.class)).collectAsList(); + List actual = + result.orderBy("id").as(Encoders.bean(SimpleRecord.class)).collectAsList(); Assert.assertEquals("Number of rows should match", expected.size(), actual.size()); Assert.assertEquals("Result rows should match", expected, actual); } @@ -253,30 +248,31 @@ public void testOverwrite() 
throws IOException { PartitionSpec spec = PartitionSpec.builderFor(SCHEMA).identity("id").build(); Table table = tables.create(SCHEMA, spec, location.toString()); - List records = Lists.newArrayList( - new SimpleRecord(1, "a"), - new SimpleRecord(2, "b"), - new SimpleRecord(3, "c") - ); + List records = + Lists.newArrayList( + new SimpleRecord(1, "a"), new SimpleRecord(2, "b"), new SimpleRecord(3, "c")); - List expected = Lists.newArrayList( - new SimpleRecord(1, "a"), - new SimpleRecord(2, "a"), - new SimpleRecord(3, "c"), - new SimpleRecord(4, "b"), - new SimpleRecord(6, "c") - ); + List expected = + Lists.newArrayList( + new SimpleRecord(1, "a"), + new SimpleRecord(2, "a"), + new SimpleRecord(3, "c"), + new SimpleRecord(4, "b"), + new SimpleRecord(6, "c")); Dataset df = spark.createDataFrame(records, SimpleRecord.class); - df.select("id", "data").write() + df.select("id", "data") + .write() .format("iceberg") .option(SparkWriteOptions.WRITE_FORMAT, format.toString()) .mode(SaveMode.Append) .save(location.toString()); // overwrite with 2*id to replace record 2, append 4 and 6 - df.withColumn("id", df.col("id").multiply(2)).select("id", "data").write() + df.withColumn("id", df.col("id").multiply(2)) + .select("id", "data") + .write() .format("iceberg") .option(SparkWriteOptions.WRITE_FORMAT, format.toString()) .mode(SaveMode.Overwrite) @@ -285,11 +281,10 @@ public void testOverwrite() throws IOException { table.refresh(); - Dataset result = spark.read() - .format("iceberg") - .load(location.toString()); + Dataset result = spark.read().format("iceberg").load(location.toString()); - List actual = result.orderBy("id").as(Encoders.bean(SimpleRecord.class)).collectAsList(); + List actual = + result.orderBy("id").as(Encoders.bean(SimpleRecord.class)).collectAsList(); Assert.assertEquals("Number of rows should match", expected.size(), actual.size()); Assert.assertEquals("Result rows should match", expected, actual); } @@ -303,22 +298,22 @@ public void testUnpartitionedOverwrite() throws IOException { PartitionSpec spec = PartitionSpec.unpartitioned(); Table table = tables.create(SCHEMA, spec, location.toString()); - List expected = Lists.newArrayList( - new SimpleRecord(1, "a"), - new SimpleRecord(2, "b"), - new SimpleRecord(3, "c") - ); + List expected = + Lists.newArrayList( + new SimpleRecord(1, "a"), new SimpleRecord(2, "b"), new SimpleRecord(3, "c")); Dataset df = spark.createDataFrame(expected, SimpleRecord.class); - df.select("id", "data").write() + df.select("id", "data") + .write() .format("iceberg") .option(SparkWriteOptions.WRITE_FORMAT, format.toString()) .mode(SaveMode.Append) .save(location.toString()); // overwrite with the same data; should not produce two copies - df.select("id", "data").write() + df.select("id", "data") + .write() .format("iceberg") .option(SparkWriteOptions.WRITE_FORMAT, format.toString()) .mode(SaveMode.Overwrite) @@ -326,11 +321,10 @@ public void testUnpartitionedOverwrite() throws IOException { table.refresh(); - Dataset result = spark.read() - .format("iceberg") - .load(location.toString()); + Dataset result = spark.read().format("iceberg").load(location.toString()); - List actual = result.orderBy("id").as(Encoders.bean(SimpleRecord.class)).collectAsList(); + List actual = + result.orderBy("id").as(Encoders.bean(SimpleRecord.class)).collectAsList(); Assert.assertEquals("Number of rows should match", expected.size(), actual.size()); Assert.assertEquals("Result rows should match", expected, actual); } @@ -344,7 +338,8 @@ public void 
testUnpartitionedCreateWithTargetFileSizeViaTableProperties() throws PartitionSpec spec = PartitionSpec.unpartitioned(); Table table = tables.create(SCHEMA, spec, location.toString()); - table.updateProperties() + table + .updateProperties() .set(TableProperties.WRITE_TARGET_FILE_SIZE_BYTES, "4") // ~4 bytes; low enough to trigger .commit(); @@ -355,7 +350,8 @@ public void testUnpartitionedCreateWithTargetFileSizeViaTableProperties() throws Dataset df = spark.createDataFrame(expected, SimpleRecord.class); - df.select("id", "data").write() + df.select("id", "data") + .write() .format("iceberg") .option(SparkWriteOptions.WRITE_FORMAT, format.toString()) .mode(SaveMode.Append) @@ -363,11 +359,10 @@ public void testUnpartitionedCreateWithTargetFileSizeViaTableProperties() throws table.refresh(); - Dataset result = spark.read() - .format("iceberg") - .load(location.toString()); + Dataset result = spark.read().format("iceberg").load(location.toString()); - List actual = result.orderBy("id").as(Encoders.bean(SimpleRecord.class)).collectAsList(); + List actual = + result.orderBy("id").as(Encoders.bean(SimpleRecord.class)).collectAsList(); Assert.assertEquals("Number of rows should match", expected.size(), actual.size()); Assert.assertEquals("Result rows should match", expected, actual); @@ -379,7 +374,8 @@ public void testUnpartitionedCreateWithTargetFileSizeViaTableProperties() throws } Assert.assertEquals("Should have 4 DataFiles", 4, files.size()); - Assert.assertTrue("All DataFiles contain 1000 rows", files.stream().allMatch(d -> d.recordCount() == 1000)); + Assert.assertTrue( + "All DataFiles contain 1000 rows", files.stream().allMatch(d -> d.recordCount() == 1000)); } @Test @@ -410,15 +406,14 @@ public void testWriteProjection() throws IOException { PartitionSpec spec = PartitionSpec.unpartitioned(); Table table = tables.create(SCHEMA, spec, location.toString()); - List expected = Lists.newArrayList( - new SimpleRecord(1, null), - new SimpleRecord(2, null), - new SimpleRecord(3, null) - ); + List expected = + Lists.newArrayList( + new SimpleRecord(1, null), new SimpleRecord(2, null), new SimpleRecord(3, null)); Dataset df = spark.createDataFrame(expected, SimpleRecord.class); - df.select("id").write() // select only id column + df.select("id") + .write() // select only id column .format("iceberg") .option(SparkWriteOptions.WRITE_FORMAT, format.toString()) .mode(SaveMode.Append) @@ -426,11 +421,10 @@ public void testWriteProjection() throws IOException { table.refresh(); - Dataset result = spark.read() - .format("iceberg") - .load(location.toString()); + Dataset result = spark.read().format("iceberg").load(location.toString()); - List actual = result.orderBy("id").as(Encoders.bean(SimpleRecord.class)).collectAsList(); + List actual = + result.orderBy("id").as(Encoders.bean(SimpleRecord.class)).collectAsList(); Assert.assertEquals("Number of rows should match", expected.size(), actual.size()); Assert.assertEquals("Result rows should match", expected, actual); } @@ -446,22 +440,23 @@ public void testWriteProjectionWithMiddle() throws IOException { HadoopTables tables = new HadoopTables(CONF); PartitionSpec spec = PartitionSpec.unpartitioned(); - Schema schema = new Schema( - optional(1, "c1", Types.IntegerType.get()), - optional(2, "c2", Types.StringType.get()), - optional(3, "c3", Types.StringType.get()) - ); + Schema schema = + new Schema( + optional(1, "c1", Types.IntegerType.get()), + optional(2, "c2", Types.StringType.get()), + optional(3, "c3", Types.StringType.get())); Table table = 
tables.create(schema, spec, location.toString()); - List expected = Lists.newArrayList( - new ThreeColumnRecord(1, null, "hello"), - new ThreeColumnRecord(2, null, "world"), - new ThreeColumnRecord(3, null, null) - ); + List expected = + Lists.newArrayList( + new ThreeColumnRecord(1, null, "hello"), + new ThreeColumnRecord(2, null, "world"), + new ThreeColumnRecord(3, null, null)); Dataset df = spark.createDataFrame(expected, ThreeColumnRecord.class); - df.select("c1", "c3").write() + df.select("c1", "c3") + .write() .format("iceberg") .option(SparkWriteOptions.WRITE_FORMAT, format.toString()) .mode(SaveMode.Append) @@ -469,11 +464,10 @@ public void testWriteProjectionWithMiddle() throws IOException { table.refresh(); - Dataset result = spark.read() - .format("iceberg") - .load(location.toString()); + Dataset result = spark.read().format("iceberg").load(location.toString()); - List actual = result.orderBy("c1").as(Encoders.bean(ThreeColumnRecord.class)).collectAsList(); + List actual = + result.orderBy("c1").as(Encoders.bean(ThreeColumnRecord.class)).collectAsList(); Assert.assertEquals("Number of rows should match", expected.size(), actual.size()); Assert.assertEquals("Result rows should match", expected, actual); } @@ -487,44 +481,39 @@ public void testViewsReturnRecentResults() throws IOException { PartitionSpec spec = PartitionSpec.builderFor(SCHEMA).identity("data").build(); tables.create(SCHEMA, spec, location.toString()); - List records = Lists.newArrayList( - new SimpleRecord(1, "a"), - new SimpleRecord(2, "b"), - new SimpleRecord(3, "c") - ); + List records = + Lists.newArrayList( + new SimpleRecord(1, "a"), new SimpleRecord(2, "b"), new SimpleRecord(3, "c")); Dataset df = spark.createDataFrame(records, SimpleRecord.class); - df.select("id", "data").write() + df.select("id", "data") + .write() .format("iceberg") .option(SparkWriteOptions.WRITE_FORMAT, format.toString()) .mode(SaveMode.Append) .save(location.toString()); - Dataset query = spark.read() - .format("iceberg") - .load(location.toString()) - .where("id = 1"); + Dataset query = spark.read().format("iceberg").load(location.toString()).where("id = 1"); query.createOrReplaceTempView("tmp"); - List actual1 = spark.table("tmp").as(Encoders.bean(SimpleRecord.class)).collectAsList(); - List expected1 = Lists.newArrayList( - new SimpleRecord(1, "a") - ); + List actual1 = + spark.table("tmp").as(Encoders.bean(SimpleRecord.class)).collectAsList(); + List expected1 = Lists.newArrayList(new SimpleRecord(1, "a")); Assert.assertEquals("Number of rows should match", expected1.size(), actual1.size()); Assert.assertEquals("Result rows should match", expected1, actual1); - df.select("id", "data").write() + df.select("id", "data") + .write() .format("iceberg") .option(SparkWriteOptions.WRITE_FORMAT, format.toString()) .mode(SaveMode.Append) .save(location.toString()); - List actual2 = spark.table("tmp").as(Encoders.bean(SimpleRecord.class)).collectAsList(); - List expected2 = Lists.newArrayList( - new SimpleRecord(1, "a"), - new SimpleRecord(1, "a") - ); + List actual2 = + spark.table("tmp").as(Encoders.bean(SimpleRecord.class)).collectAsList(); + List expected2 = + Lists.newArrayList(new SimpleRecord(1, "a"), new SimpleRecord(1, "a")); Assert.assertEquals("Number of rows should match", expected2.size(), actual2.size()); Assert.assertEquals("Result rows should match", expected2, actual2); } @@ -550,7 +539,9 @@ public void partitionedCreateWithTargetFileSizeViaOption(IcebergOptionsType opti switch (option) { case NONE: - df.select("id", 
"data").sort("data").write() + df.select("id", "data") + .sort("data") + .write() .format("iceberg") .option(SparkWriteOptions.WRITE_FORMAT, format.toString()) .mode(SaveMode.Append) @@ -559,7 +550,8 @@ public void partitionedCreateWithTargetFileSizeViaOption(IcebergOptionsType opti break; case TABLE: table.updateProperties().set(SPARK_WRITE_PARTITIONED_FANOUT_ENABLED, "true").commit(); - df.select("id", "data").write() + df.select("id", "data") + .write() .format("iceberg") .option(SparkWriteOptions.WRITE_FORMAT, format.toString()) .mode(SaveMode.Append) @@ -567,7 +559,8 @@ public void partitionedCreateWithTargetFileSizeViaOption(IcebergOptionsType opti .save(location.toString()); break; case JOB: - df.select("id", "data").write() + df.select("id", "data") + .write() .format("iceberg") .option(SparkWriteOptions.WRITE_FORMAT, format.toString()) .mode(SaveMode.Append) @@ -581,11 +574,10 @@ public void partitionedCreateWithTargetFileSizeViaOption(IcebergOptionsType opti table.refresh(); - Dataset result = spark.read() - .format("iceberg") - .load(location.toString()); + Dataset result = spark.read().format("iceberg").load(location.toString()); - List actual = result.orderBy("id").as(Encoders.bean(SimpleRecord.class)).collectAsList(); + List actual = + result.orderBy("id").as(Encoders.bean(SimpleRecord.class)).collectAsList(); Assert.assertEquals("Number of rows should match", expected.size(), actual.size()); Assert.assertEquals("Result rows should match", expected, actual); @@ -597,7 +589,8 @@ public void partitionedCreateWithTargetFileSizeViaOption(IcebergOptionsType opti } Assert.assertEquals("Should have 8 DataFiles", 8, files.size()); - Assert.assertTrue("All DataFiles contain 1000 rows", files.stream().allMatch(d -> d.recordCount() == 1000)); + Assert.assertTrue( + "All DataFiles contain 1000 rows", files.stream().allMatch(d -> d.recordCount() == 1000)); } @Test @@ -609,20 +602,21 @@ public void testCommitUnknownException() throws IOException { PartitionSpec spec = PartitionSpec.builderFor(SCHEMA).identity("data").build(); Table table = tables.create(SCHEMA, spec, location.toString()); - List records = Lists.newArrayList( - new SimpleRecord(1, "a"), - new SimpleRecord(2, "b"), - new SimpleRecord(3, "c") - ); + List records = + Lists.newArrayList( + new SimpleRecord(1, "a"), new SimpleRecord(2, "b"), new SimpleRecord(3, "c")); Dataset df = spark.createDataFrame(records, SimpleRecord.class); AppendFiles append = table.newFastAppend(); AppendFiles spyAppend = spy(append); - doAnswer(invocation -> { - append.commit(); - throw new CommitStateUnknownException(new RuntimeException("Datacenter on Fire")); - }).when(spyAppend).commit(); + doAnswer( + invocation -> { + append.commit(); + throw new CommitStateUnknownException(new RuntimeException("Datacenter on Fire")); + }) + .when(spyAppend) + .commit(); Table spyTable = spy(table); when(spyTable.newAppend()).thenReturn(spyAppend); @@ -632,20 +626,25 @@ public void testCommitUnknownException() throws IOException { ManualSource.setTable(manualTableName, sparkTable); // Although an exception is thrown here, write and commit have succeeded - AssertHelpers.assertThrowsWithCause("Should throw a Commit State Unknown Exception", + AssertHelpers.assertThrowsWithCause( + "Should throw a Commit State Unknown Exception", SparkException.class, "Writing job aborted", CommitStateUnknownException.class, "Datacenter on Fire", - () -> df.select("id", "data").sort("data").write() - .format("org.apache.iceberg.spark.source.ManualSource") - 
.option(ManualSource.TABLE_NAME, manualTableName) - .mode(SaveMode.Append) - .save(location.toString())); + () -> + df.select("id", "data") + .sort("data") + .write() + .format("org.apache.iceberg.spark.source.ManualSource") + .option(ManualSource.TABLE_NAME, manualTableName) + .mode(SaveMode.Append) + .save(location.toString())); // Since write and commit succeeded, the rows should be readable Dataset result = spark.read().format("iceberg").load(location.toString()); - List actual = result.orderBy("id").as(Encoders.bean(SimpleRecord.class)).collectAsList(); + List actual = + result.orderBy("id").as(Encoders.bean(SimpleRecord.class)).collectAsList(); Assert.assertEquals("Number of rows should match", records.size(), actual.size()); Assert.assertEquals("Result rows should match", records, actual); } diff --git a/spark/v3.3/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkFileWriterFactory.java b/spark/v3.3/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkFileWriterFactory.java index 702e8ab98990..4a3263e368c0 100644 --- a/spark/v3.3/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkFileWriterFactory.java +++ b/spark/v3.3/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkFileWriterFactory.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.source; import java.util.List; @@ -39,9 +38,11 @@ public TestSparkFileWriterFactory(FileFormat fileFormat, boolean partitioned) { } @Override - protected FileWriterFactory newWriterFactory(Schema dataSchema, List equalityFieldIds, - Schema equalityDeleteRowSchema, - Schema positionDeleteRowSchema) { + protected FileWriterFactory newWriterFactory( + Schema dataSchema, + List equalityFieldIds, + Schema equalityDeleteRowSchema, + Schema positionDeleteRowSchema) { return SparkFileWriterFactory.builderFor(table) .dataSchema(table.schema()) .dataFileFormat(format()) diff --git a/spark/v3.3/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkFilesScan.java b/spark/v3.3/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkFilesScan.java index 63195cfd3967..d0959d6866bc 100644 --- a/spark/v3.3/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkFilesScan.java +++ b/spark/v3.3/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkFilesScan.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
  */
-
 package org.apache.iceberg.spark.source;
 
 import java.io.IOException;
@@ -53,10 +52,8 @@ public void removeTables() {
   public void testTaskSetLoading() throws NoSuchTableException, IOException {
     sql("CREATE TABLE %s (id INT, data STRING) USING iceberg", tableName);
 
-    List records = ImmutableList.of(
-        new SimpleRecord(1, "a"),
-        new SimpleRecord(2, "b")
-    );
+    List records =
+        ImmutableList.of(new SimpleRecord(1, "a"), new SimpleRecord(2, "b"));
     Dataset df = spark.createDataFrame(records, SimpleRecord.class);
     df.writeTo(tableName).append();
 
@@ -69,15 +66,19 @@ public void testTaskSetLoading() throws NoSuchTableException, IOException {
       taskSetManager.stageTasks(table, setID, ImmutableList.copyOf(fileScanTasks));
 
       // load the staged file set
-      Dataset scanDF = spark.read().format("iceberg")
-          .option(SparkReadOptions.FILE_SCAN_TASK_SET_ID, setID)
-          .load(tableName);
+      Dataset scanDF =
+          spark
+              .read()
+              .format("iceberg")
+              .option(SparkReadOptions.FILE_SCAN_TASK_SET_ID, setID)
+              .load(tableName);
 
       // write the records back essentially duplicating data
       scanDF.writeTo(tableName).append();
     }
 
-    assertEquals("Should have expected rows",
+    assertEquals(
+        "Should have expected rows",
         ImmutableList.of(row(1, "a"), row(1, "a"), row(2, "b"), row(2, "b")),
         sql("SELECT * FROM %s ORDER BY id", tableName));
   }
@@ -86,10 +87,8 @@ public void testTaskSetLoading() throws NoSuchTableException, IOException {
   public void testTaskSetPlanning() throws NoSuchTableException, IOException {
     sql("CREATE TABLE %s (id INT, data STRING) USING iceberg", tableName);
 
-    List records = ImmutableList.of(
-        new SimpleRecord(1, "a"),
-        new SimpleRecord(2, "b")
-    );
+    List records =
+        ImmutableList.of(new SimpleRecord(1, "a"), new SimpleRecord(2, "b"));
     Dataset df = spark.createDataFrame(records, SimpleRecord.class);
     df.coalesce(1).writeTo(tableName).append();
     df.coalesce(1).writeTo(tableName).append();
@@ -104,17 +103,23 @@ public void testTaskSetPlanning() throws NoSuchTableException, IOException {
       taskSetManager.stageTasks(table, setID, tasks);
 
       // load the staged file set and make sure each file is in a separate split
-      Dataset scanDF = spark.read().format("iceberg")
-          .option(SparkReadOptions.FILE_SCAN_TASK_SET_ID, setID)
-          .option(SparkReadOptions.SPLIT_SIZE, tasks.get(0).file().fileSizeInBytes())
-          .load(tableName);
+      Dataset scanDF =
+          spark
+              .read()
+              .format("iceberg")
+              .option(SparkReadOptions.FILE_SCAN_TASK_SET_ID, setID)
+              .option(SparkReadOptions.SPLIT_SIZE, tasks.get(0).file().fileSizeInBytes())
+              .load(tableName);
       Assert.assertEquals("Num partitions should match", 2, scanDF.javaRDD().getNumPartitions());
 
       // load the staged file set and make sure we combine both files into a single split
-      scanDF = spark.read().format("iceberg")
-          .option(SparkReadOptions.FILE_SCAN_TASK_SET_ID, setID)
-          .option(SparkReadOptions.SPLIT_SIZE, Long.MAX_VALUE)
-          .load(tableName);
+      scanDF =
+          spark
+              .read()
+              .format("iceberg")
+              .option(SparkReadOptions.FILE_SCAN_TASK_SET_ID, setID)
+              .option(SparkReadOptions.SPLIT_SIZE, Long.MAX_VALUE)
+              .load(tableName);
       Assert.assertEquals("Num partitions should match", 1, scanDF.javaRDD().getNumPartitions());
     }
   }
diff --git a/spark/v3.3/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkMergingMetrics.java b/spark/v3.3/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkMergingMetrics.java
index be74d1c5a33b..c3bb35ca7df8 100644
--- a/spark/v3.3/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkMergingMetrics.java
+++ b/spark/v3.3/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkMergingMetrics.java
@@ -16,7 +16,6 @@
  * specific language governing permissions and limitations
  * under the License.
  */
-
 package org.apache.iceberg.spark.source;
 
 import java.io.IOException;
@@ -42,26 +41,32 @@ public TestSparkMergingMetrics(FileFormat fileFormat) {
 
   @Override
   protected FileAppender writeAndGetAppender(List records) throws IOException {
-    Table testTable = new BaseTable(null, "dummy") {
-      @Override
-      public Map properties() {
-        return Collections.emptyMap();
-      }
-      @Override
-      public SortOrder sortOrder() {
-        return SortOrder.unsorted();
-      }
-      @Override
-      public PartitionSpec spec() {
-        return PartitionSpec.unpartitioned();
-      }
-    };
+    Table testTable =
+        new BaseTable(null, "dummy") {
+          @Override
+          public Map properties() {
+            return Collections.emptyMap();
+          }
+
+          @Override
+          public SortOrder sortOrder() {
+            return SortOrder.unsorted();
+          }
+
+          @Override
+          public PartitionSpec spec() {
+            return PartitionSpec.unpartitioned();
+          }
+        };
 
     FileAppender appender =
-        SparkAppenderFactory.builderFor(testTable, SCHEMA, SparkSchemaUtil.convert(SCHEMA)).build()
+        SparkAppenderFactory.builderFor(testTable, SCHEMA, SparkSchemaUtil.convert(SCHEMA))
+            .build()
             .newAppender(org.apache.iceberg.Files.localOutput(temp.newFile()), fileFormat);
     try (FileAppender fileAppender = appender) {
-      records.stream().map(r -> new StructInternalRow(SCHEMA.asStruct()).setStruct(r)).forEach(fileAppender::add);
+      records.stream()
+          .map(r -> new StructInternalRow(SCHEMA.asStruct()).setStruct(r))
+          .forEach(fileAppender::add);
     }
     return appender;
   }
diff --git a/spark/v3.3/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkMetadataColumns.java b/spark/v3.3/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkMetadataColumns.java
index 9477bc985295..e39985228570 100644
--- a/spark/v3.3/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkMetadataColumns.java
+++ b/spark/v3.3/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkMetadataColumns.java
@@ -16,9 +16,16 @@
  * specific language governing permissions and limitations
  * under the License.
*/ - package org.apache.iceberg.spark.source; +import static org.apache.iceberg.TableProperties.DEFAULT_FILE_FORMAT; +import static org.apache.iceberg.TableProperties.FORMAT_VERSION; +import static org.apache.iceberg.TableProperties.ORC_VECTORIZATION_ENABLED; +import static org.apache.iceberg.TableProperties.PARQUET_BATCH_SIZE; +import static org.apache.iceberg.TableProperties.PARQUET_ROW_GROUP_SIZE_BYTES; +import static org.apache.iceberg.TableProperties.PARQUET_VECTORIZATION_ENABLED; +import static org.apache.spark.sql.functions.lit; + import java.io.IOException; import java.util.List; import java.util.stream.Collectors; @@ -56,44 +63,37 @@ import org.junit.runner.RunWith; import org.junit.runners.Parameterized; -import static org.apache.iceberg.TableProperties.DEFAULT_FILE_FORMAT; -import static org.apache.iceberg.TableProperties.FORMAT_VERSION; -import static org.apache.iceberg.TableProperties.ORC_VECTORIZATION_ENABLED; -import static org.apache.iceberg.TableProperties.PARQUET_BATCH_SIZE; -import static org.apache.iceberg.TableProperties.PARQUET_ROW_GROUP_SIZE_BYTES; -import static org.apache.iceberg.TableProperties.PARQUET_VECTORIZATION_ENABLED; -import static org.apache.spark.sql.functions.lit; - @RunWith(Parameterized.class) public class TestSparkMetadataColumns extends SparkTestBase { private static final String TABLE_NAME = "test_table"; - private static final Schema SCHEMA = new Schema( - Types.NestedField.optional(1, "id", Types.LongType.get()), - Types.NestedField.optional(2, "category", Types.StringType.get()), - Types.NestedField.optional(3, "data", Types.StringType.get()) - ); - private static final PartitionSpec UNKNOWN_SPEC = PartitionSpecParser.fromJson(SCHEMA, - "{ \"spec-id\": 1, \"fields\": [ { \"name\": \"id_zero\", \"transform\": \"zero\", \"source-id\": 1 } ] }"); + private static final Schema SCHEMA = + new Schema( + Types.NestedField.optional(1, "id", Types.LongType.get()), + Types.NestedField.optional(2, "category", Types.StringType.get()), + Types.NestedField.optional(3, "data", Types.StringType.get())); + private static final PartitionSpec UNKNOWN_SPEC = + PartitionSpecParser.fromJson( + SCHEMA, + "{ \"spec-id\": 1, \"fields\": [ { \"name\": \"id_zero\", \"transform\": \"zero\", \"source-id\": 1 } ] }"); @Parameterized.Parameters(name = "fileFormat = {0}, vectorized = {1}, formatVersion = {2}") public static Object[][] parameters() { return new Object[][] { - { FileFormat.PARQUET, false, 1}, - { FileFormat.PARQUET, true, 1}, - { FileFormat.PARQUET, false, 2}, - { FileFormat.PARQUET, true, 2}, - { FileFormat.AVRO, false, 1}, - { FileFormat.AVRO, false, 2}, - { FileFormat.ORC, false, 1}, - { FileFormat.ORC, true, 1}, - { FileFormat.ORC, false, 2}, - { FileFormat.ORC, true, 2}, + {FileFormat.PARQUET, false, 1}, + {FileFormat.PARQUET, true, 1}, + {FileFormat.PARQUET, false, 2}, + {FileFormat.PARQUET, true, 2}, + {FileFormat.AVRO, false, 1}, + {FileFormat.AVRO, false, 2}, + {FileFormat.ORC, false, 1}, + {FileFormat.ORC, true, 1}, + {FileFormat.ORC, false, 2}, + {FileFormat.ORC, true, 2}, }; } - @Rule - public TemporaryFolder temp = new TemporaryFolder(); + @Rule public TemporaryFolder temp = new TemporaryFolder(); private final FileFormat fileFormat; private final boolean vectorized; @@ -109,13 +109,16 @@ public TestSparkMetadataColumns(FileFormat fileFormat, boolean vectorized, int f @BeforeClass public static void setupSpark() { - ImmutableMap config = ImmutableMap.of( - "type", "hive", - "default-namespace", "default", - "cache-enabled", "true" - ); - 
spark.conf().set("spark.sql.catalog.spark_catalog", "org.apache.iceberg.spark.source.TestSparkCatalog"); - config.forEach((key, value) -> spark.conf().set("spark.sql.catalog.spark_catalog." + key, value)); + ImmutableMap config = + ImmutableMap.of( + "type", "hive", + "default-namespace", "default", + "cache-enabled", "true"); + spark + .conf() + .set("spark.sql.catalog.spark_catalog", "org.apache.iceberg.spark.source.TestSparkCatalog"); + config.forEach( + (key, value) -> spark.conf().set("spark.sql.catalog.spark_catalog." + key, value)); } @Before @@ -136,35 +139,29 @@ public void testSpecAndPartitionMetadataColumns() { sql("INSERT INTO TABLE %s VALUES (1, 'a1', 'b1')", TABLE_NAME); table.refresh(); - table.updateSpec() - .addField("data") - .commit(); + table.updateSpec().addField("data").commit(); sql("INSERT INTO TABLE %s VALUES (1, 'a1', 'b1')", TABLE_NAME); table.refresh(); - table.updateSpec() - .addField(Expressions.bucket("category", 8)) - .commit(); + table.updateSpec().addField(Expressions.bucket("category", 8)).commit(); sql("INSERT INTO TABLE %s VALUES (1, 'a1', 'b1')", TABLE_NAME); table.refresh(); - table.updateSpec() - .removeField("data") - .commit(); + table.updateSpec().removeField("data").commit(); sql("INSERT INTO TABLE %s VALUES (1, 'a1', 'b1')", TABLE_NAME); table.refresh(); - table.updateSpec() - .renameField("category_bucket_8", "category_bucket_8_another_name") - .commit(); - - List expected = ImmutableList.of( - row(0, row(null, null)), - row(1, row("b1", null)), - row(2, row("b1", 2)), - row(3, row(null, 2)) - ); - assertEquals("Rows must match", expected, + table.updateSpec().renameField("category_bucket_8", "category_bucket_8_another_name").commit(); + + List expected = + ImmutableList.of( + row(0, row(null, null)), + row(1, row("b1", null)), + row(2, row("b1", 2)), + row(3, row(null, 2))); + assertEquals( + "Rows must match", + expected, sql("SELECT _spec_id, _partition FROM %s ORDER BY _spec_id", TABLE_NAME)); } @@ -172,56 +169,48 @@ public void testSpecAndPartitionMetadataColumns() { public void testPositionMetadataColumnWithMultipleRowGroups() throws NoSuchTableException { Assume.assumeTrue(fileFormat == FileFormat.PARQUET); - table.updateProperties() - .set(PARQUET_ROW_GROUP_SIZE_BYTES, "100") - .commit(); + table.updateProperties().set(PARQUET_ROW_GROUP_SIZE_BYTES, "100").commit(); List ids = Lists.newArrayList(); for (long id = 0L; id < 200L; id++) { ids.add(id); } - Dataset df = spark.createDataset(ids, Encoders.LONG()) - .withColumnRenamed("value", "id") - .withColumn("category", lit("hr")) - .withColumn("data", lit("ABCDEF")); + Dataset df = + spark + .createDataset(ids, Encoders.LONG()) + .withColumnRenamed("value", "id") + .withColumn("category", lit("hr")) + .withColumn("data", lit("ABCDEF")); df.coalesce(1).writeTo(TABLE_NAME).append(); Assert.assertEquals(200, spark.table(TABLE_NAME).count()); - List expectedRows = ids.stream() - .map(this::row) - .collect(Collectors.toList()); - assertEquals("Rows must match", - expectedRows, - sql("SELECT _pos FROM %s", TABLE_NAME)); + List expectedRows = ids.stream().map(this::row).collect(Collectors.toList()); + assertEquals("Rows must match", expectedRows, sql("SELECT _pos FROM %s", TABLE_NAME)); } @Test public void testPositionMetadataColumnWithMultipleBatches() throws NoSuchTableException { Assume.assumeTrue(fileFormat == FileFormat.PARQUET); - table.updateProperties() - .set(PARQUET_BATCH_SIZE, "1000") - .commit(); + table.updateProperties().set(PARQUET_BATCH_SIZE, "1000").commit(); List ids = 
Lists.newArrayList(); for (long id = 0L; id < 7500L; id++) { ids.add(id); } - Dataset df = spark.createDataset(ids, Encoders.LONG()) - .withColumnRenamed("value", "id") - .withColumn("category", lit("hr")) - .withColumn("data", lit("ABCDEF")); + Dataset df = + spark + .createDataset(ids, Encoders.LONG()) + .withColumnRenamed("value", "id") + .withColumn("category", lit("hr")) + .withColumn("data", lit("ABCDEF")); df.coalesce(1).writeTo(TABLE_NAME).append(); Assert.assertEquals(7500, spark.table(TABLE_NAME).count()); - List expectedRows = ids.stream() - .map(this::row) - .collect(Collectors.toList()); - assertEquals("Rows must match", - expectedRows, - sql("SELECT _pos FROM %s", TABLE_NAME)); + List expectedRows = ids.stream().map(this::row).collect(Collectors.toList()); + assertEquals("Rows must match", expectedRows, sql("SELECT _pos FROM %s", TABLE_NAME)); } @Test @@ -231,42 +220,52 @@ public void testPartitionMetadataColumnWithUnknownTransforms() { TableMetadata base = ops.current(); ops.commit(base, base.updatePartitionSpec(UNKNOWN_SPEC)); - AssertHelpers.assertThrows("Should fail to query the partition metadata column", - ValidationException.class, "Cannot build table partition type, unknown transforms", + AssertHelpers.assertThrows( + "Should fail to query the partition metadata column", + ValidationException.class, + "Cannot build table partition type, unknown transforms", () -> sql("SELECT _partition FROM %s", TABLE_NAME)); } @Test public void testConflictingColumns() { - table.updateSchema() + table + .updateSchema() .addColumn(MetadataColumns.SPEC_ID.name(), Types.IntegerType.get()) .addColumn(MetadataColumns.FILE_PATH.name(), Types.StringType.get()) .commit(); sql("INSERT INTO TABLE %s VALUES (1, 'a1', 'b1', -1, 'path/to/file')", TABLE_NAME); - assertEquals("Rows must match", + assertEquals( + "Rows must match", ImmutableList.of(row(1L, "a1")), sql("SELECT id, category FROM %s", TABLE_NAME)); - AssertHelpers.assertThrows("Should fail to query conflicting columns", - ValidationException.class, "column names conflict", + AssertHelpers.assertThrows( + "Should fail to query conflicting columns", + ValidationException.class, + "column names conflict", () -> sql("SELECT * FROM %s", TABLE_NAME)); table.refresh(); - table.updateSchema() + table + .updateSchema() .renameColumn(MetadataColumns.SPEC_ID.name(), "_renamed" + MetadataColumns.SPEC_ID.name()) - .renameColumn(MetadataColumns.FILE_PATH.name(), "_renamed" + MetadataColumns.FILE_PATH.name()) + .renameColumn( + MetadataColumns.FILE_PATH.name(), "_renamed" + MetadataColumns.FILE_PATH.name()) .commit(); - assertEquals("Rows must match", + assertEquals( + "Rows must match", ImmutableList.of(row(0, null, -1)), sql("SELECT _spec_id, _partition, _renamed_spec_id FROM %s", TABLE_NAME)); } private void createAndInitTable() throws IOException { - this.table = TestTables.create(temp.newFolder(), TABLE_NAME, SCHEMA, PartitionSpec.unpartitioned()); + this.table = + TestTables.create(temp.newFolder(), TABLE_NAME, SCHEMA, PartitionSpec.unpartitioned()); UpdateProperties updateProperties = table.updateProperties(); updateProperties.set(FORMAT_VERSION, String.valueOf(formatVersion)); @@ -280,7 +279,8 @@ private void createAndInitTable() throws IOException { updateProperties.set(ORC_VECTORIZATION_ENABLED, String.valueOf(vectorized)); break; default: - Preconditions.checkState(!vectorized, "File format %s does not support vectorized reads", fileFormat); + Preconditions.checkState( + !vectorized, "File format %s does not support vectorized reads", 
fileFormat); } updateProperties.commit(); diff --git a/spark/v3.3/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkPartitioningWriters.java b/spark/v3.3/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkPartitioningWriters.java index 4d07cfbe86ea..276d8c632fc0 100644 --- a/spark/v3.3/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkPartitioningWriters.java +++ b/spark/v3.3/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkPartitioningWriters.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.source; import java.util.List; @@ -39,9 +38,11 @@ public TestSparkPartitioningWriters(FileFormat fileFormat) { } @Override - protected FileWriterFactory newWriterFactory(Schema dataSchema, List equalityFieldIds, - Schema equalityDeleteRowSchema, - Schema positionDeleteRowSchema) { + protected FileWriterFactory newWriterFactory( + Schema dataSchema, + List equalityFieldIds, + Schema equalityDeleteRowSchema, + Schema positionDeleteRowSchema) { return SparkFileWriterFactory.builderFor(table) .dataSchema(table.schema()) .dataFileFormat(format()) diff --git a/spark/v3.3/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkPositionDeltaWriters.java b/spark/v3.3/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkPositionDeltaWriters.java index 480448e13a8f..245c392774f5 100644 --- a/spark/v3.3/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkPositionDeltaWriters.java +++ b/spark/v3.3/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkPositionDeltaWriters.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.source; import java.util.List; @@ -39,9 +38,11 @@ public TestSparkPositionDeltaWriters(FileFormat fileFormat) { } @Override - protected FileWriterFactory newWriterFactory(Schema dataSchema, List equalityFieldIds, - Schema equalityDeleteRowSchema, - Schema positionDeleteRowSchema) { + protected FileWriterFactory newWriterFactory( + Schema dataSchema, + List equalityFieldIds, + Schema equalityDeleteRowSchema, + Schema positionDeleteRowSchema) { return SparkFileWriterFactory.builderFor(table) .dataSchema(table.schema()) .dataFileFormat(format()) diff --git a/spark/v3.3/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkReadProjection.java b/spark/v3.3/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkReadProjection.java index f42b48d0e30d..7d6f0e76f78f 100644 --- a/spark/v3.3/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkReadProjection.java +++ b/spark/v3.3/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkReadProjection.java @@ -16,9 +16,12 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.spark.source; +import static org.apache.iceberg.Files.localOutput; +import static org.apache.iceberg.types.Types.NestedField.optional; +import static org.apache.iceberg.types.Types.NestedField.required; + import java.io.File; import java.io.IOException; import java.util.List; @@ -51,10 +54,6 @@ import org.junit.runner.RunWith; import org.junit.runners.Parameterized; -import static org.apache.iceberg.Files.localOutput; -import static org.apache.iceberg.types.Types.NestedField.optional; -import static org.apache.iceberg.types.Types.NestedField.required; - @RunWith(Parameterized.class) public class TestSparkReadProjection extends TestReadProjection { @@ -63,11 +62,11 @@ public class TestSparkReadProjection extends TestReadProjection { @Parameterized.Parameters(name = "format = {0}, vectorized = {1}") public static Object[][] parameters() { return new Object[][] { - { "parquet", false }, - { "parquet", true }, - { "avro", false }, - { "orc", false }, - { "orc", true } + {"parquet", false}, + {"parquet", true}, + {"avro", false}, + {"orc", false}, + {"orc", true} }; } @@ -83,14 +82,17 @@ public TestSparkReadProjection(String format, boolean vectorized) { @BeforeClass public static void startSpark() { TestSparkReadProjection.spark = SparkSession.builder().master("local[2]").getOrCreate(); - ImmutableMap config = ImmutableMap.of( - "type", "hive", - "default-namespace", "default", - "parquet-enabled", "true", - "cache-enabled", "false" - ); - spark.conf().set("spark.sql.catalog.spark_catalog", "org.apache.iceberg.spark.source.TestSparkCatalog"); - config.forEach((key, value) -> spark.conf().set("spark.sql.catalog.spark_catalog." + key, value)); + ImmutableMap config = + ImmutableMap.of( + "type", "hive", + "default-namespace", "default", + "parquet-enabled", "true", + "cache-enabled", "false"); + spark + .conf() + .set("spark.sql.catalog.spark_catalog", "org.apache.iceberg.spark.source.TestSparkCatalog"); + config.forEach( + (key, value) -> spark.conf().set("spark.sql.catalog.spark_catalog." + key, value)); } @AfterClass @@ -101,8 +103,8 @@ public static void stopSpark() { } @Override - protected Record writeAndRead(String desc, Schema writeSchema, Schema readSchema, - Record record) throws IOException { + protected Record writeAndRead(String desc, Schema writeSchema, Schema readSchema, Record record) + throws IOException { File parent = temp.newFolder(desc); File location = new File(parent, "test"); File dataFolder = new File(location, "data"); @@ -116,16 +118,17 @@ protected Record writeAndRead(String desc, Schema writeSchema, Schema readSchema // When tables are created, the column ids are reassigned. 
Schema tableSchema = table.schema(); - try (FileAppender writer = new GenericAppenderFactory(tableSchema).newAppender( - localOutput(testFile), format)) { + try (FileAppender writer = + new GenericAppenderFactory(tableSchema).newAppender(localOutput(testFile), format)) { writer.add(record); } - DataFile file = DataFiles.builder(PartitionSpec.unpartitioned()) - .withRecordCount(100) - .withFileSizeInBytes(testFile.length()) - .withPath(testFile.toString()) - .build(); + DataFile file = + DataFiles.builder(PartitionSpec.unpartitioned()) + .withRecordCount(100) + .withFileSizeInBytes(testFile.length()) + .withPath(testFile.toString()) + .build(); table.newAppend().appendFile(file).commit(); @@ -139,14 +142,16 @@ protected Record writeAndRead(String desc, Schema writeSchema, Schema readSchema Schema expectedSchema = reassignIds(readSchema, idMapping); // Set the schema to the expected schema directly to simulate the table schema evolving - TestTables.replaceMetadata(desc, - TestTables.readMetadata(desc).updateSchema(expectedSchema, 100)); + TestTables.replaceMetadata( + desc, TestTables.readMetadata(desc).updateSchema(expectedSchema, 100)); - Dataset df = spark.read() - .format("org.apache.iceberg.spark.source.TestIcebergSource") - .option("iceberg.table.name", desc) - .option(SparkReadOptions.VECTORIZATION_ENABLED, String.valueOf(vectorized)) - .load(); + Dataset df = + spark + .read() + .format("org.apache.iceberg.spark.source.TestIcebergSource") + .option("iceberg.table.name", desc) + .option(SparkReadOptions.VECTORIZATION_ENABLED, String.valueOf(vectorized)) + .load(); return SparkValueConverter.convert(readSchema, df.collectAsList().get(0)); @@ -157,87 +162,98 @@ protected Record writeAndRead(String desc, Schema writeSchema, Schema readSchema private List allIds(Schema schema) { List ids = Lists.newArrayList(); - TypeUtil.visit(schema, new TypeUtil.SchemaVisitor() { - @Override - public Void field(Types.NestedField field, Void fieldResult) { - ids.add(field.fieldId()); - return null; - } + TypeUtil.visit( + schema, + new TypeUtil.SchemaVisitor() { + @Override + public Void field(Types.NestedField field, Void fieldResult) { + ids.add(field.fieldId()); + return null; + } - @Override - public Void list(Types.ListType list, Void elementResult) { - ids.add(list.elementId()); - return null; - } + @Override + public Void list(Types.ListType list, Void elementResult) { + ids.add(list.elementId()); + return null; + } - @Override - public Void map(Types.MapType map, Void keyResult, Void valueResult) { - ids.add(map.keyId()); - ids.add(map.valueId()); - return null; - } - }); + @Override + public Void map(Types.MapType map, Void keyResult, Void valueResult) { + ids.add(map.keyId()); + ids.add(map.valueId()); + return null; + } + }); return ids; } private Schema reassignIds(Schema schema, Map idMapping) { - return new Schema(TypeUtil.visit(schema, new TypeUtil.SchemaVisitor() { - private int mapId(int id) { - if (idMapping.containsKey(id)) { - return idMapping.get(id); - } - return 1000 + id; // make sure the new IDs don't conflict with reassignment - } + return new Schema( + TypeUtil.visit( + schema, + new TypeUtil.SchemaVisitor() { + private int mapId(int id) { + if (idMapping.containsKey(id)) { + return idMapping.get(id); + } + return 1000 + id; // make sure the new IDs don't conflict with reassignment + } - @Override - public Type schema(Schema schema, Type structResult) { - return structResult; - } + @Override + public Type schema(Schema schema, Type structResult) { + return structResult; + } - 
@Override - public Type struct(Types.StructType struct, List fieldResults) { - List newFields = Lists.newArrayListWithExpectedSize(fieldResults.size()); - List fields = struct.fields(); - for (int i = 0; i < fields.size(); i += 1) { - Types.NestedField field = fields.get(i); - if (field.isOptional()) { - newFields.add(optional(mapId(field.fieldId()), field.name(), fieldResults.get(i))); - } else { - newFields.add(required(mapId(field.fieldId()), field.name(), fieldResults.get(i))); - } - } - return Types.StructType.of(newFields); - } + @Override + public Type struct(Types.StructType struct, List fieldResults) { + List newFields = + Lists.newArrayListWithExpectedSize(fieldResults.size()); + List fields = struct.fields(); + for (int i = 0; i < fields.size(); i += 1) { + Types.NestedField field = fields.get(i); + if (field.isOptional()) { + newFields.add( + optional(mapId(field.fieldId()), field.name(), fieldResults.get(i))); + } else { + newFields.add( + required(mapId(field.fieldId()), field.name(), fieldResults.get(i))); + } + } + return Types.StructType.of(newFields); + } - @Override - public Type field(Types.NestedField field, Type fieldResult) { - return fieldResult; - } + @Override + public Type field(Types.NestedField field, Type fieldResult) { + return fieldResult; + } - @Override - public Type list(Types.ListType list, Type elementResult) { - if (list.isElementOptional()) { - return Types.ListType.ofOptional(mapId(list.elementId()), elementResult); - } else { - return Types.ListType.ofRequired(mapId(list.elementId()), elementResult); - } - } + @Override + public Type list(Types.ListType list, Type elementResult) { + if (list.isElementOptional()) { + return Types.ListType.ofOptional(mapId(list.elementId()), elementResult); + } else { + return Types.ListType.ofRequired(mapId(list.elementId()), elementResult); + } + } - @Override - public Type map(Types.MapType map, Type keyResult, Type valueResult) { - if (map.isValueOptional()) { - return Types.MapType.ofOptional( - mapId(map.keyId()), mapId(map.valueId()), keyResult, valueResult); - } else { - return Types.MapType.ofRequired( - mapId(map.keyId()), mapId(map.valueId()), keyResult, valueResult); - } - } + @Override + public Type map(Types.MapType map, Type keyResult, Type valueResult) { + if (map.isValueOptional()) { + return Types.MapType.ofOptional( + mapId(map.keyId()), mapId(map.valueId()), keyResult, valueResult); + } else { + return Types.MapType.ofRequired( + mapId(map.keyId()), mapId(map.valueId()), keyResult, valueResult); + } + } - @Override - public Type primitive(Type.PrimitiveType primitive) { - return primitive; - } - }).asNestedType().asStructType().fields()); + @Override + public Type primitive(Type.PrimitiveType primitive) { + return primitive; + } + }) + .asNestedType() + .asStructType() + .fields()); } } diff --git a/spark/v3.3/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkReaderDeletes.java b/spark/v3.3/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkReaderDeletes.java index 2da25d5ee529..31ec21b3b0fe 100644 --- a/spark/v3.3/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkReaderDeletes.java +++ b/spark/v3.3/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkReaderDeletes.java @@ -16,9 +16,11 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.spark.source; +import static org.apache.hadoop.hive.conf.HiveConf.ConfVars.METASTOREURIS; +import static org.apache.iceberg.types.Types.NestedField.required; + import java.io.IOException; import java.util.List; import java.util.Set; @@ -70,9 +72,6 @@ import org.junit.runner.RunWith; import org.junit.runners.Parameterized; -import static org.apache.hadoop.hive.conf.HiveConf.ConfVars.METASTOREURIS; -import static org.apache.iceberg.types.Types.NestedField.required; - @RunWith(Parameterized.class) public class TestSparkReaderDeletes extends DeleteReadTests { @@ -96,15 +95,18 @@ public static void startMetastoreAndSpark() { metastore.start(); HiveConf hiveConf = metastore.hiveConf(); - spark = SparkSession.builder() - .master("local[2]") - .config(SQLConf.PARTITION_OVERWRITE_MODE().key(), "dynamic") - .config("spark.hadoop." + METASTOREURIS.varname, hiveConf.get(METASTOREURIS.varname)) - .enableHiveSupport() - .getOrCreate(); + spark = + SparkSession.builder() + .master("local[2]") + .config(SQLConf.PARTITION_OVERWRITE_MODE().key(), "dynamic") + .config("spark.hadoop." + METASTOREURIS.varname, hiveConf.get(METASTOREURIS.varname)) + .enableHiveSupport() + .getOrCreate(); - catalog = (HiveCatalog) - CatalogUtil.loadCatalog(HiveCatalog.class.getName(), "hive", ImmutableMap.of(), hiveConf); + catalog = + (HiveCatalog) + CatalogUtil.loadCatalog( + HiveCatalog.class.getName(), "hive", ImmutableMap.of(), hiveConf); try { catalog.createNamespace(Namespace.of("default")); @@ -129,14 +131,15 @@ protected Table createTable(String name, Schema schema, PartitionSpec spec) { TableMetadata meta = ops.current(); ops.commit(meta, meta.upgradeToFormatVersion(2)); if (vectorized) { - table.updateProperties() + table + .updateProperties() .set(TableProperties.PARQUET_VECTORIZATION_ENABLED, "true") - .set(TableProperties.PARQUET_BATCH_SIZE, "4") // split 7 records to two batches to cover more code paths + .set( + TableProperties.PARQUET_BATCH_SIZE, + "4") // split 7 records to two batches to cover more code paths .commit(); } else { - table.updateProperties() - .set(TableProperties.PARQUET_VECTORIZATION_ENABLED, "false") - .commit(); + table.updateProperties().set(TableProperties.PARQUET_VECTORIZATION_ENABLED, "false").commit(); } return table; } @@ -152,16 +155,20 @@ public StructLikeSet rowSet(String name, Table table, String... columns) { } public StructLikeSet rowSet(String name, Types.StructType projection, String... 
columns) { - Dataset df = spark.read() - .format("iceberg") - .load(TableIdentifier.of("default", name).toString()) - .selectExpr(columns); + Dataset df = + spark + .read() + .format("iceberg") + .load(TableIdentifier.of("default", name).toString()) + .selectExpr(columns); StructLikeSet set = StructLikeSet.create(projection); - df.collectAsList().forEach(row -> { - SparkStructLike rowWrapper = new SparkStructLike(projection); - set.add(rowWrapper.wrap(row)); - }); + df.collectAsList() + .forEach( + row -> { + SparkStructLike rowWrapper = new SparkStructLike(projection); + set.add(rowWrapper.wrap(row)); + }); return set; } @@ -171,31 +178,39 @@ public void testEqualityDeleteWithFilter() throws IOException { String tableName = table.name().substring(table.name().lastIndexOf(".") + 1); Schema deleteRowSchema = table.schema().select("data"); Record dataDelete = GenericRecord.create(deleteRowSchema); - List dataDeletes = Lists.newArrayList( - dataDelete.copy("data", "a"), // id = 29 - dataDelete.copy("data", "d"), // id = 89 - dataDelete.copy("data", "g") // id = 122 - ); - - DeleteFile eqDeletes = FileHelpers.writeDeleteFile( - table, Files.localOutput(temp.newFile()), TestHelpers.Row.of(0), dataDeletes, deleteRowSchema); - - table.newRowDelta() - .addDeletes(eqDeletes) - .commit(); + List dataDeletes = + Lists.newArrayList( + dataDelete.copy("data", "a"), // id = 29 + dataDelete.copy("data", "d"), // id = 89 + dataDelete.copy("data", "g") // id = 122 + ); + + DeleteFile eqDeletes = + FileHelpers.writeDeleteFile( + table, + Files.localOutput(temp.newFile()), + TestHelpers.Row.of(0), + dataDeletes, + deleteRowSchema); + + table.newRowDelta().addDeletes(eqDeletes).commit(); Types.StructType projection = table.schema().select("*").asStruct(); - Dataset df = spark.read() - .format("iceberg") - .load(TableIdentifier.of("default", tableName).toString()) - .filter("data = 'a'") // select a deleted row - .selectExpr("*"); + Dataset df = + spark + .read() + .format("iceberg") + .load(TableIdentifier.of("default", tableName).toString()) + .filter("data = 'a'") // select a deleted row + .selectExpr("*"); StructLikeSet actual = StructLikeSet.create(projection); - df.collectAsList().forEach(row -> { - SparkStructLike rowWrapper = new SparkStructLike(projection); - actual.add(rowWrapper.wrap(row)); - }); + df.collectAsList() + .forEach( + row -> { + SparkStructLike rowWrapper = new SparkStructLike(projection); + actual.add(rowWrapper.wrap(row)); + }); Assert.assertEquals("Table should contain no rows", 0, actual.size()); } @@ -204,44 +219,57 @@ public void testEqualityDeleteWithFilter() throws IOException { public void testReadEqualityDeleteRows() throws IOException { Schema deleteSchema1 = table.schema().select("data"); Record dataDelete = GenericRecord.create(deleteSchema1); - List dataDeletes = Lists.newArrayList( - dataDelete.copy("data", "a"), // id = 29 - dataDelete.copy("data", "d") // id = 89 - ); + List dataDeletes = + Lists.newArrayList( + dataDelete.copy("data", "a"), // id = 29 + dataDelete.copy("data", "d") // id = 89 + ); Schema deleteSchema2 = table.schema().select("id"); Record idDelete = GenericRecord.create(deleteSchema2); - List idDeletes = Lists.newArrayList( - idDelete.copy("id", 121), // id = 121 - idDelete.copy("id", 122) // id = 122 - ); - - DeleteFile eqDelete1 = FileHelpers.writeDeleteFile( - table, Files.localOutput(temp.newFile()), TestHelpers.Row.of(0), dataDeletes, deleteSchema1); - - DeleteFile eqDelete2 = FileHelpers.writeDeleteFile( - table, 
Files.localOutput(temp.newFile()), TestHelpers.Row.of(0), idDeletes, deleteSchema2); - - table.newRowDelta() - .addDeletes(eqDelete1) - .addDeletes(eqDelete2) - .commit(); + List idDeletes = + Lists.newArrayList( + idDelete.copy("id", 121), // id = 121 + idDelete.copy("id", 122) // id = 122 + ); + + DeleteFile eqDelete1 = + FileHelpers.writeDeleteFile( + table, + Files.localOutput(temp.newFile()), + TestHelpers.Row.of(0), + dataDeletes, + deleteSchema1); + + DeleteFile eqDelete2 = + FileHelpers.writeDeleteFile( + table, + Files.localOutput(temp.newFile()), + TestHelpers.Row.of(0), + idDeletes, + deleteSchema2); + + table.newRowDelta().addDeletes(eqDelete1).addDeletes(eqDelete2).commit(); StructLikeSet expectedRowSet = rowSetWithIds(29, 89, 121, 122); Types.StructType type = table.schema().asStruct(); StructLikeSet actualRowSet = StructLikeSet.create(type); - CloseableIterable tasks = TableScanUtil.planTasks( - table.newScan().planFiles(), - TableProperties.METADATA_SPLIT_SIZE_DEFAULT, - TableProperties.SPLIT_LOOKBACK_DEFAULT, - TableProperties.SPLIT_OPEN_FILE_COST_DEFAULT); + CloseableIterable tasks = + TableScanUtil.planTasks( + table.newScan().planFiles(), + TableProperties.METADATA_SPLIT_SIZE_DEFAULT, + TableProperties.SPLIT_LOOKBACK_DEFAULT, + TableProperties.SPLIT_OPEN_FILE_COST_DEFAULT); for (CombinedScanTask task : tasks) { - try (EqualityDeleteRowReader reader = new EqualityDeleteRowReader(task, table, table.schema(), false)) { + try (EqualityDeleteRowReader reader = + new EqualityDeleteRowReader(task, table, table.schema(), false)) { while (reader.next()) { - actualRowSet.add(new InternalRowWrapper(SparkSchemaUtil.convert(table.schema())).wrap(reader.get().copy())); + actualRowSet.add( + new InternalRowWrapper(SparkSchemaUtil.convert(table.schema())) + .wrap(reader.get().copy())); } } } @@ -252,18 +280,22 @@ public void testReadEqualityDeleteRows() throws IOException { @Test public void testPosDeletesAllRowsInBatch() throws IOException { - // read.parquet.vectorization.batch-size is set to 4, so the 4 rows in the first batch are all deleted. - List> deletes = Lists.newArrayList( - Pair.of(dataFile.path(), 0L), // id = 29 - Pair.of(dataFile.path(), 1L), // id = 43 - Pair.of(dataFile.path(), 2L), // id = 61 - Pair.of(dataFile.path(), 3L) // id = 89 - ); - - Pair posDeletes = FileHelpers.writeDeleteFile( - table, Files.localOutput(temp.newFile()), TestHelpers.Row.of(0), deletes); - - table.newRowDelta() + // read.parquet.vectorization.batch-size is set to 4, so the 4 rows in the first batch are all + // deleted. + List> deletes = + Lists.newArrayList( + Pair.of(dataFile.path(), 0L), // id = 29 + Pair.of(dataFile.path(), 1L), // id = 43 + Pair.of(dataFile.path(), 2L), // id = 61 + Pair.of(dataFile.path(), 3L) // id = 89 + ); + + Pair posDeletes = + FileHelpers.writeDeleteFile( + table, Files.localOutput(temp.newFile()), TestHelpers.Row.of(0), deletes); + + table + .newRowDelta() .addDeletes(posDeletes.first()) .validateDataFilesExist(posDeletes.second()) .commit(); @@ -276,24 +308,29 @@ public void testPosDeletesAllRowsInBatch() throws IOException { @Test public void testPosDeletesWithDeletedColumn() throws IOException { - // read.parquet.vectorization.batch-size is set to 4, so the 4 rows in the first batch are all deleted. 
- List> deletes = Lists.newArrayList( - Pair.of(dataFile.path(), 0L), // id = 29 - Pair.of(dataFile.path(), 1L), // id = 43 - Pair.of(dataFile.path(), 2L), // id = 61 - Pair.of(dataFile.path(), 3L) // id = 89 - ); - - Pair posDeletes = FileHelpers.writeDeleteFile( - table, Files.localOutput(temp.newFile()), TestHelpers.Row.of(0), deletes); - - table.newRowDelta() + // read.parquet.vectorization.batch-size is set to 4, so the 4 rows in the first batch are all + // deleted. + List> deletes = + Lists.newArrayList( + Pair.of(dataFile.path(), 0L), // id = 29 + Pair.of(dataFile.path(), 1L), // id = 43 + Pair.of(dataFile.path(), 2L), // id = 61 + Pair.of(dataFile.path(), 3L) // id = 89 + ); + + Pair posDeletes = + FileHelpers.writeDeleteFile( + table, Files.localOutput(temp.newFile()), TestHelpers.Row.of(0), deletes); + + table + .newRowDelta() .addDeletes(posDeletes.first()) .validateDataFilesExist(posDeletes.second()) .commit(); StructLikeSet expected = expectedRowSet(29, 43, 61, 89); - StructLikeSet actual = rowSet(tableName, PROJECTION_SCHEMA.asStruct(), "id", "data", "_deleted"); + StructLikeSet actual = + rowSet(tableName, PROJECTION_SCHEMA.asStruct(), "id", "data", "_deleted"); Assert.assertEquals("Table should contain expected row", expected, actual); } @@ -303,21 +340,26 @@ public void testEqualityDeleteWithDeletedColumn() throws IOException { String tableName = table.name().substring(table.name().lastIndexOf(".") + 1); Schema deleteRowSchema = table.schema().select("data"); Record dataDelete = GenericRecord.create(deleteRowSchema); - List dataDeletes = Lists.newArrayList( - dataDelete.copy("data", "a"), // id = 29 - dataDelete.copy("data", "d"), // id = 89 - dataDelete.copy("data", "g") // id = 122 - ); - - DeleteFile eqDeletes = FileHelpers.writeDeleteFile( - table, Files.localOutput(temp.newFile()), TestHelpers.Row.of(0), dataDeletes, deleteRowSchema); - - table.newRowDelta() - .addDeletes(eqDeletes) - .commit(); + List dataDeletes = + Lists.newArrayList( + dataDelete.copy("data", "a"), // id = 29 + dataDelete.copy("data", "d"), // id = 89 + dataDelete.copy("data", "g") // id = 122 + ); + + DeleteFile eqDeletes = + FileHelpers.writeDeleteFile( + table, + Files.localOutput(temp.newFile()), + TestHelpers.Row.of(0), + dataDeletes, + deleteRowSchema); + + table.newRowDelta().addDeletes(eqDeletes).commit(); StructLikeSet expected = expectedRowSet(29, 89, 122); - StructLikeSet actual = rowSet(tableName, PROJECTION_SCHEMA.asStruct(), "id", "data", "_deleted"); + StructLikeSet actual = + rowSet(tableName, PROJECTION_SCHEMA.asStruct(), "id", "data", "_deleted"); Assert.assertEquals("Table should contain expected row", expected, actual); } @@ -326,48 +368,61 @@ public void testEqualityDeleteWithDeletedColumn() throws IOException { public void testMixedPosAndEqDeletesWithDeletedColumn() throws IOException { Schema dataSchema = table.schema().select("data"); Record dataDelete = GenericRecord.create(dataSchema); - List dataDeletes = Lists.newArrayList( - dataDelete.copy("data", "a"), // id = 29 - dataDelete.copy("data", "d"), // id = 89 - dataDelete.copy("data", "g") // id = 122 - ); - - DeleteFile eqDeletes = FileHelpers.writeDeleteFile( - table, Files.localOutput(temp.newFile()), TestHelpers.Row.of(0), dataDeletes, dataSchema); - - List> deletes = Lists.newArrayList( - Pair.of(dataFile.path(), 3L), // id = 89 - Pair.of(dataFile.path(), 5L) // id = 121 - ); - - Pair posDeletes = FileHelpers.writeDeleteFile( - table, Files.localOutput(temp.newFile()), TestHelpers.Row.of(0), deletes); - - 
table.newRowDelta() + List dataDeletes = + Lists.newArrayList( + dataDelete.copy("data", "a"), // id = 29 + dataDelete.copy("data", "d"), // id = 89 + dataDelete.copy("data", "g") // id = 122 + ); + + DeleteFile eqDeletes = + FileHelpers.writeDeleteFile( + table, + Files.localOutput(temp.newFile()), + TestHelpers.Row.of(0), + dataDeletes, + dataSchema); + + List> deletes = + Lists.newArrayList( + Pair.of(dataFile.path(), 3L), // id = 89 + Pair.of(dataFile.path(), 5L) // id = 121 + ); + + Pair posDeletes = + FileHelpers.writeDeleteFile( + table, Files.localOutput(temp.newFile()), TestHelpers.Row.of(0), deletes); + + table + .newRowDelta() .addDeletes(eqDeletes) .addDeletes(posDeletes.first()) .validateDataFilesExist(posDeletes.second()) .commit(); StructLikeSet expected = expectedRowSet(29, 89, 121, 122); - StructLikeSet actual = rowSet(tableName, PROJECTION_SCHEMA.asStruct(), "id", "data", "_deleted"); + StructLikeSet actual = + rowSet(tableName, PROJECTION_SCHEMA.asStruct(), "id", "data", "_deleted"); Assert.assertEquals("Table should contain expected row", expected, actual); } @Test public void testFilterOnDeletedMetadataColumn() throws IOException { - List> deletes = Lists.newArrayList( - Pair.of(dataFile.path(), 0L), // id = 29 - Pair.of(dataFile.path(), 1L), // id = 43 - Pair.of(dataFile.path(), 2L), // id = 61 - Pair.of(dataFile.path(), 3L) // id = 89 - ); - - Pair posDeletes = FileHelpers.writeDeleteFile( - table, Files.localOutput(temp.newFile()), TestHelpers.Row.of(0), deletes); - - table.newRowDelta() + List> deletes = + Lists.newArrayList( + Pair.of(dataFile.path(), 0L), // id = 29 + Pair.of(dataFile.path(), 1L), // id = 43 + Pair.of(dataFile.path(), 2L), // id = 61 + Pair.of(dataFile.path(), 3L) // id = 89 + ); + + Pair posDeletes = + FileHelpers.writeDeleteFile( + table, Files.localOutput(temp.newFile()), TestHelpers.Row.of(0), deletes); + + table + .newRowDelta() .addDeletes(posDeletes.first()) .validateDataFilesExist(posDeletes.second()) .commit(); @@ -375,35 +430,43 @@ public void testFilterOnDeletedMetadataColumn() throws IOException { StructLikeSet expected = expectedRowSetWithNonDeletesOnly(29, 43, 61, 89); // get non-deleted rows - Dataset df = spark.read() - .format("iceberg") - .load(TableIdentifier.of("default", tableName).toString()) - .select("id", "data", "_deleted") - .filter("_deleted = false"); + Dataset df = + spark + .read() + .format("iceberg") + .load(TableIdentifier.of("default", tableName).toString()) + .select("id", "data", "_deleted") + .filter("_deleted = false"); Types.StructType projection = PROJECTION_SCHEMA.asStruct(); StructLikeSet actual = StructLikeSet.create(projection); - df.collectAsList().forEach(row -> { - SparkStructLike rowWrapper = new SparkStructLike(projection); - actual.add(rowWrapper.wrap(row)); - }); + df.collectAsList() + .forEach( + row -> { + SparkStructLike rowWrapper = new SparkStructLike(projection); + actual.add(rowWrapper.wrap(row)); + }); Assert.assertEquals("Table should contain expected row", expected, actual); StructLikeSet expectedDeleted = expectedRowSetWithDeletesOnly(29, 43, 61, 89); // get deleted rows - df = spark.read() - .format("iceberg") - .load(TableIdentifier.of("default", tableName).toString()) - .select("id", "data", "_deleted") - .filter("_deleted = true"); + df = + spark + .read() + .format("iceberg") + .load(TableIdentifier.of("default", tableName).toString()) + .select("id", "data", "_deleted") + .filter("_deleted = true"); StructLikeSet actualDeleted = StructLikeSet.create(projection); - 
df.collectAsList().forEach(row -> { - SparkStructLike rowWrapper = new SparkStructLike(projection); - actualDeleted.add(rowWrapper.wrap(row)); - }); + df.collectAsList() + .forEach( + row -> { + SparkStructLike rowWrapper = new SparkStructLike(projection); + actualDeleted.add(rowWrapper.wrap(row)); + }); Assert.assertEquals("Table should contain expected row", expectedDeleted, actualDeleted); } @@ -411,15 +474,16 @@ public void testFilterOnDeletedMetadataColumn() throws IOException { @Test public void testIsDeletedColumnWithoutDeleteFile() { StructLikeSet expected = expectedRowSet(); - StructLikeSet actual = rowSet(tableName, PROJECTION_SCHEMA.asStruct(), "id", "data", "_deleted"); + StructLikeSet actual = + rowSet(tableName, PROJECTION_SCHEMA.asStruct(), "id", "data", "_deleted"); Assert.assertEquals("Table should contain expected row", expected, actual); } - private static final Schema PROJECTION_SCHEMA = new Schema( - required(1, "id", Types.IntegerType.get()), - required(2, "data", Types.StringType.get()), - MetadataColumns.IS_DELETED - ); + private static final Schema PROJECTION_SCHEMA = + new Schema( + required(1, "id", Types.IntegerType.get()), + required(2, "data", Types.StringType.get()), + MetadataColumns.IS_DELETED); private static StructLikeSet expectedRowSet(int... idsToRemove) { return expectedRowSet(false, false, idsToRemove); @@ -433,21 +497,24 @@ private static StructLikeSet expectedRowSetWithNonDeletesOnly(int... idsToRemove return expectedRowSet(true, false, idsToRemove); } - private static StructLikeSet expectedRowSet(boolean removeDeleted, boolean removeNonDeleted, int... idsToRemove) { + private static StructLikeSet expectedRowSet( + boolean removeDeleted, boolean removeNonDeleted, int... idsToRemove) { Set deletedIds = Sets.newHashSet(ArrayUtil.toIntList(idsToRemove)); List records = recordsWithDeletedColumn(); // mark rows deleted - records.forEach(record -> { - if (deletedIds.contains(record.getField("id"))) { - record.setField(MetadataColumns.IS_DELETED.name(), true); - } - }); + records.forEach( + record -> { + if (deletedIds.contains(record.getField("id"))) { + record.setField(MetadataColumns.IS_DELETED.name(), true); + } + }); records.removeIf(record -> deletedIds.contains(record.getField("id")) && removeDeleted); records.removeIf(record -> !deletedIds.contains(record.getField("id")) && removeNonDeleted); StructLikeSet set = StructLikeSet.create(PROJECTION_SCHEMA.asStruct()); - records.forEach(record -> set.add(new InternalRecordWrapper(PROJECTION_SCHEMA.asStruct()).wrap(record))); + records.forEach( + record -> set.add(new InternalRecordWrapper(PROJECTION_SCHEMA.asStruct()).wrap(record))); return set; } diff --git a/spark/v3.3/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkReaderWithBloomFilter.java b/spark/v3.3/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkReaderWithBloomFilter.java index 9cf8a1f15071..2723ba0a6275 100644 --- a/spark/v3.3/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkReaderWithBloomFilter.java +++ b/spark/v3.3/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkReaderWithBloomFilter.java @@ -16,9 +16,15 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.spark.source; +import static org.apache.hadoop.hive.conf.HiveConf.ConfVars.METASTOREURIS; +import static org.apache.iceberg.TableProperties.DEFAULT_FILE_FORMAT; +import static org.apache.iceberg.TableProperties.DEFAULT_FILE_FORMAT_DEFAULT; +import static org.apache.iceberg.TableProperties.PARQUET_BLOOM_FILTER_COLUMN_ENABLED_PREFIX; +import static org.apache.iceberg.TableProperties.PARQUET_ROW_GROUP_SIZE_BYTES; +import static org.apache.iceberg.TableProperties.PARQUET_ROW_GROUP_SIZE_BYTES_DEFAULT; + import java.io.Closeable; import java.io.IOException; import java.math.BigDecimal; @@ -68,13 +74,6 @@ import org.junit.runner.RunWith; import org.junit.runners.Parameterized; -import static org.apache.hadoop.hive.conf.HiveConf.ConfVars.METASTOREURIS; -import static org.apache.iceberg.TableProperties.DEFAULT_FILE_FORMAT; -import static org.apache.iceberg.TableProperties.DEFAULT_FILE_FORMAT_DEFAULT; -import static org.apache.iceberg.TableProperties.PARQUET_BLOOM_FILTER_COLUMN_ENABLED_PREFIX; -import static org.apache.iceberg.TableProperties.PARQUET_ROW_GROUP_SIZE_BYTES; -import static org.apache.iceberg.TableProperties.PARQUET_ROW_GROUP_SIZE_BYTES_DEFAULT; - @RunWith(Parameterized.class) public class TestSparkReaderWithBloomFilter { @@ -95,18 +94,18 @@ public TestSparkReaderWithBloomFilter(boolean vectorized, boolean useBloomFilter } // Schema passed to create tables - public static final Schema SCHEMA = new Schema( - Types.NestedField.required(1, "id", Types.IntegerType.get()), - Types.NestedField.required(2, "id_long", Types.LongType.get()), - Types.NestedField.required(3, "id_double", Types.DoubleType.get()), - Types.NestedField.required(4, "id_float", Types.FloatType.get()), - Types.NestedField.required(5, "id_string", Types.StringType.get()), - Types.NestedField.optional(6, "id_boolean", Types.BooleanType.get()), - Types.NestedField.optional(7, "id_date", Types.DateType.get()), - Types.NestedField.optional(8, "id_int_decimal", Types.DecimalType.of(8, 2)), - Types.NestedField.optional(9, "id_long_decimal", Types.DecimalType.of(14, 2)), - Types.NestedField.optional(10, "id_fixed_decimal", Types.DecimalType.of(31, 2)) - ); + public static final Schema SCHEMA = + new Schema( + Types.NestedField.required(1, "id", Types.IntegerType.get()), + Types.NestedField.required(2, "id_long", Types.LongType.get()), + Types.NestedField.required(3, "id_double", Types.DoubleType.get()), + Types.NestedField.required(4, "id_float", Types.FloatType.get()), + Types.NestedField.required(5, "id_string", Types.StringType.get()), + Types.NestedField.optional(6, "id_boolean", Types.BooleanType.get()), + Types.NestedField.optional(7, "id_date", Types.DateType.get()), + Types.NestedField.optional(8, "id_int_decimal", Types.DecimalType.of(8, 2)), + Types.NestedField.optional(9, "id_long_decimal", Types.DecimalType.of(14, 2)), + Types.NestedField.optional(10, "id_fixed_decimal", Types.DecimalType.of(31, 2))); private static final int INT_MIN_VALUE = 30; private static final int INT_MAX_VALUE = 329; @@ -116,8 +115,7 @@ public TestSparkReaderWithBloomFilter(boolean vectorized, boolean useBloomFilter private static final float FLOAT_BASE = 100000F; private static final String BINARY_PREFIX = "BINARY测试_"; - @Rule - public TemporaryFolder temp = new TemporaryFolder(); + @Rule public TemporaryFolder temp = new TemporaryFolder(); @Before public void writeTestDataFile() throws IOException { @@ -129,24 +127,34 @@ public void writeTestDataFile() throws IOException { GenericRecord record = 
GenericRecord.create(table.schema()); for (int i = 0; i < INT_VALUE_COUNT; i += 1) { - records.add(record.copy(ImmutableMap.of( - "id", INT_MIN_VALUE + i, - "id_long", LONG_BASE + INT_MIN_VALUE + i, - "id_double", DOUBLE_BASE + INT_MIN_VALUE + i, - "id_float", FLOAT_BASE + INT_MIN_VALUE + i, - "id_string", BINARY_PREFIX + (INT_MIN_VALUE + i), - "id_boolean", (i % 2 == 0) ? true : false, - "id_date", LocalDate.parse("2021-09-05"), - "id_int_decimal", new BigDecimal(String.valueOf(77.77)), - "id_long_decimal", new BigDecimal(String.valueOf(88.88)), - "id_fixed_decimal", new BigDecimal(String.valueOf(99.99))))); + records.add( + record.copy( + ImmutableMap.of( + "id", + INT_MIN_VALUE + i, + "id_long", + LONG_BASE + INT_MIN_VALUE + i, + "id_double", + DOUBLE_BASE + INT_MIN_VALUE + i, + "id_float", + FLOAT_BASE + INT_MIN_VALUE + i, + "id_string", + BINARY_PREFIX + (INT_MIN_VALUE + i), + "id_boolean", + (i % 2 == 0) ? true : false, + "id_date", + LocalDate.parse("2021-09-05"), + "id_int_decimal", + new BigDecimal(String.valueOf(77.77)), + "id_long_decimal", + new BigDecimal(String.valueOf(88.88)), + "id_fixed_decimal", + new BigDecimal(String.valueOf(99.99))))); } this.dataFile = writeDataFile(Files.localOutput(temp.newFile()), Row.of(0), records); - table.newAppend() - .appendFile(dataFile) - .commit(); + table.newAppend().appendFile(dataFile).commit(); } @After @@ -156,9 +164,7 @@ public void cleanup() throws IOException { @Parameterized.Parameters(name = "vectorized = {0}, useBloomFilter = {1}") public static Object[][] parameters() { - return new Object[][] { - {false, false}, {true, false}, {false, true}, {true, true} - }; + return new Object[][] {{false, false}, {true, false}, {false, true}, {true, true}}; } @BeforeClass @@ -167,14 +173,17 @@ public static void startMetastoreAndSpark() { metastore.start(); HiveConf hiveConf = metastore.hiveConf(); - spark = SparkSession.builder() - .master("local[2]") - .config("spark.hadoop." + METASTOREURIS.varname, hiveConf.get(METASTOREURIS.varname)) - .enableHiveSupport() - .getOrCreate(); + spark = + SparkSession.builder() + .master("local[2]") + .config("spark.hadoop." 
+ METASTOREURIS.varname, hiveConf.get(METASTOREURIS.varname)) + .enableHiveSupport() + .getOrCreate(); - catalog = (HiveCatalog) - CatalogUtil.loadCatalog(HiveCatalog.class.getName(), "hive", ImmutableMap.of(), hiveConf); + catalog = + (HiveCatalog) + CatalogUtil.loadCatalog( + HiveCatalog.class.getName(), "hive", ImmutableMap.of(), hiveConf); try { catalog.createNamespace(Namespace.of("default")); @@ -199,7 +208,8 @@ protected void createTable(String name, Schema schema) { ops.commit(meta, meta.upgradeToFormatVersion(2)); if (useBloomFilter) { - table.updateProperties() + table + .updateProperties() .set(PARQUET_BLOOM_FILTER_COLUMN_ENABLED_PREFIX + "id", "true") .set(PARQUET_BLOOM_FILTER_COLUMN_ENABLED_PREFIX + "id_long", "true") .set(PARQUET_BLOOM_FILTER_COLUMN_ENABLED_PREFIX + "id_double", "true") @@ -213,11 +223,13 @@ protected void createTable(String name, Schema schema) { .commit(); } - table.updateProperties() - .set(TableProperties.PARQUET_ROW_GROUP_SIZE_BYTES, "100") // to have multiple row groups + table + .updateProperties() + .set(TableProperties.PARQUET_ROW_GROUP_SIZE_BYTES, "100") // to have multiple row groups .commit(); if (vectorized) { - table.updateProperties() + table + .updateProperties() .set(TableProperties.PARQUET_VECTORIZATION_ENABLED, "true") .set(TableProperties.PARQUET_BATCH_SIZE, "4") .commit(); @@ -233,39 +245,74 @@ private DataFile writeDataFile(OutputFile out, StructLike partition, List writer = factory.newAppender(out, format); @@ -290,13 +337,16 @@ private FileFormat defaultFormat(Map properties) { @Test public void testReadWithFilter() { - Dataset df = spark.read() - .format("iceberg") - .load(TableIdentifier.of("default", tableName).toString()) - // this is from the first row group - .filter("id = 30 AND id_long = 1030 AND id_double = 10030.0 AND id_float = 100030.0" + - " AND id_string = 'BINARY测试_30' AND id_boolean = true AND id_date = '2021-09-05'" + - " AND id_int_decimal = 77.77 AND id_long_decimal = 88.88 AND id_fixed_decimal = 99.99"); + Dataset df = + spark + .read() + .format("iceberg") + .load(TableIdentifier.of("default", tableName).toString()) + // this is from the first row group + .filter( + "id = 30 AND id_long = 1030 AND id_double = 10030.0 AND id_float = 100030.0" + + " AND id_string = 'BINARY测试_30' AND id_boolean = true AND id_date = '2021-09-05'" + + " AND id_int_decimal = 77.77 AND id_long_decimal = 88.88 AND id_fixed_decimal = 99.99"); Record record = SparkValueConverter.convert(table.schema(), df.collectAsList().get(0)); @@ -304,13 +354,16 @@ public void testReadWithFilter() { Assert.assertEquals("Table should contain expected rows", record.get(0), 30); - df = spark.read() - .format("iceberg") - .load(TableIdentifier.of("default", tableName).toString()) - // this is from the third row group - .filter("id = 250 AND id_long = 1250 AND id_double = 10250.0 AND id_float = 100250.0" + - " AND id_string = 'BINARY测试_250' AND id_boolean = true AND id_date = '2021-09-05'" + - " AND id_int_decimal = 77.77 AND id_long_decimal = 88.88 AND id_fixed_decimal = 99.99"); + df = + spark + .read() + .format("iceberg") + .load(TableIdentifier.of("default", tableName).toString()) + // this is from the third row group + .filter( + "id = 250 AND id_long = 1250 AND id_double = 10250.0 AND id_float = 100250.0" + + " AND id_string = 'BINARY测试_250' AND id_boolean = true AND id_date = '2021-09-05'" + + " AND id_int_decimal = 77.77 AND id_long_decimal = 88.88 AND id_fixed_decimal = 99.99"); record = SparkValueConverter.convert(table.schema(), 
df.collectAsList().get(0)); diff --git a/spark/v3.3/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkRollingFileWriters.java b/spark/v3.3/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkRollingFileWriters.java index 9023195dcc6a..dcf9140a8885 100644 --- a/spark/v3.3/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkRollingFileWriters.java +++ b/spark/v3.3/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkRollingFileWriters.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.source; import java.util.List; @@ -36,9 +35,11 @@ public TestSparkRollingFileWriters(FileFormat fileFormat, boolean partitioned) { } @Override - protected FileWriterFactory newWriterFactory(Schema dataSchema, List equalityFieldIds, - Schema equalityDeleteRowSchema, - Schema positionDeleteRowSchema) { + protected FileWriterFactory newWriterFactory( + Schema dataSchema, + List equalityFieldIds, + Schema equalityDeleteRowSchema, + Schema positionDeleteRowSchema) { return SparkFileWriterFactory.builderFor(table) .dataSchema(table.schema()) .dataFileFormat(format()) diff --git a/spark/v3.3/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkTable.java b/spark/v3.3/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkTable.java index 1b4fb5f8ce58..616a196872de 100644 --- a/spark/v3.3/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkTable.java +++ b/spark/v3.3/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkTable.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.source; import java.util.Map; diff --git a/spark/v3.3/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkWriterMetrics.java b/spark/v3.3/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkWriterMetrics.java index 967f394faa74..06ecc20c2fc3 100644 --- a/spark/v3.3/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkWriterMetrics.java +++ b/spark/v3.3/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkWriterMetrics.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.source; import org.apache.iceberg.FileFormat; diff --git a/spark/v3.3/spark/src/test/java/org/apache/iceberg/spark/source/TestStreamingOffset.java b/spark/v3.3/spark/src/test/java/org/apache/iceberg/spark/source/TestStreamingOffset.java index 69302e9d24d7..17370aaa22f2 100644 --- a/spark/v3.3/spark/src/test/java/org/apache/iceberg/spark/source/TestStreamingOffset.java +++ b/spark/v3.3/spark/src/test/java/org/apache/iceberg/spark/source/TestStreamingOffset.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.spark.source; import com.fasterxml.jackson.databind.node.ObjectNode; @@ -29,14 +28,17 @@ public class TestStreamingOffset { @Test public void testJsonConversion() { - StreamingOffset[] expected = new StreamingOffset[]{ - new StreamingOffset(System.currentTimeMillis(), 1L, false), - new StreamingOffset(System.currentTimeMillis(), 2L, false), - new StreamingOffset(System.currentTimeMillis(), 3L, false), - new StreamingOffset(System.currentTimeMillis(), 4L, true) - }; - Assert.assertArrayEquals("StreamingOffsets should match", expected, - Arrays.stream(expected).map(elem -> StreamingOffset.fromJson(elem.json())).toArray()); + StreamingOffset[] expected = + new StreamingOffset[] { + new StreamingOffset(System.currentTimeMillis(), 1L, false), + new StreamingOffset(System.currentTimeMillis(), 2L, false), + new StreamingOffset(System.currentTimeMillis(), 3L, false), + new StreamingOffset(System.currentTimeMillis(), 4L, true) + }; + Assert.assertArrayEquals( + "StreamingOffsets should match", + expected, + Arrays.stream(expected).map(elem -> StreamingOffset.fromJson(elem.json())).toArray()); } @Test diff --git a/spark/v3.3/spark/src/test/java/org/apache/iceberg/spark/source/TestStructuredStreaming.java b/spark/v3.3/spark/src/test/java/org/apache/iceberg/spark/source/TestStructuredStreaming.java index e931f554cf5a..610d3075855b 100644 --- a/spark/v3.3/spark/src/test/java/org/apache/iceberg/spark/source/TestStructuredStreaming.java +++ b/spark/v3.3/spark/src/test/java/org/apache/iceberg/spark/source/TestStructuredStreaming.java @@ -16,9 +16,10 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.source; +import static org.apache.iceberg.types.Types.NestedField.optional; + import java.io.File; import java.util.List; import org.apache.hadoop.conf.Configuration; @@ -49,28 +50,24 @@ import scala.Option; import scala.collection.JavaConverters; -import static org.apache.iceberg.types.Types.NestedField.optional; - public class TestStructuredStreaming { private static final Configuration CONF = new Configuration(); - private static final Schema SCHEMA = new Schema( - optional(1, "id", Types.IntegerType.get()), - optional(2, "data", Types.StringType.get()) - ); + private static final Schema SCHEMA = + new Schema( + optional(1, "id", Types.IntegerType.get()), optional(2, "data", Types.StringType.get())); private static SparkSession spark = null; - @Rule - public TemporaryFolder temp = new TemporaryFolder(); - @Rule - public ExpectedException exceptionRule = ExpectedException.none(); + @Rule public TemporaryFolder temp = new TemporaryFolder(); + @Rule public ExpectedException exceptionRule = ExpectedException.none(); @BeforeClass public static void startSpark() { - TestStructuredStreaming.spark = SparkSession.builder() - .master("local[2]") - .config("spark.sql.shuffle.partitions", 4) - .getOrCreate(); + TestStructuredStreaming.spark = + SparkSession.builder() + .master("local[2]") + .config("spark.sql.shuffle.partitions", 4) + .getOrCreate(); } @AfterClass @@ -90,21 +87,23 @@ public void testStreamingWriteAppendMode() throws Exception { PartitionSpec spec = PartitionSpec.builderFor(SCHEMA).identity("data").build(); Table table = tables.create(SCHEMA, spec, location.toString()); - List expected = Lists.newArrayList( - new SimpleRecord(1, "1"), - new SimpleRecord(2, "2"), - new SimpleRecord(3, "3"), - new SimpleRecord(4, "4") - ); + List expected = + Lists.newArrayList( + new SimpleRecord(1, "1"), + new 
SimpleRecord(2, "2"), + new SimpleRecord(3, "3"), + new SimpleRecord(4, "4")); MemoryStream inputStream = newMemoryStream(1, spark.sqlContext(), Encoders.INT()); - DataStreamWriter streamWriter = inputStream.toDF() - .selectExpr("value AS id", "CAST (value AS STRING) AS data") - .writeStream() - .outputMode("append") - .format("iceberg") - .option("checkpointLocation", checkpoint.toString()) - .option("path", location.toString()); + DataStreamWriter streamWriter = + inputStream + .toDF() + .selectExpr("value AS id", "CAST (value AS STRING) AS data") + .writeStream() + .outputMode("append") + .format("iceberg") + .option("checkpointLocation", checkpoint.toString()) + .option("path", location.toString()); try { // start the original query with checkpointing @@ -126,10 +125,9 @@ public void testStreamingWriteAppendMode() throws Exception { restartedQuery.processAllAvailable(); // ensure the write was idempotent - Dataset result = spark.read() - .format("iceberg") - .load(location.toString()); - List actual = result.orderBy("id").as(Encoders.bean(SimpleRecord.class)).collectAsList(); + Dataset result = spark.read().format("iceberg").load(location.toString()); + List actual = + result.orderBy("id").as(Encoders.bean(SimpleRecord.class)).collectAsList(); Assert.assertEquals("Number of rows should match", expected.size(), actual.size()); Assert.assertEquals("Result rows should match", expected, actual); Assert.assertEquals("Number of snapshots should match", 2, Iterables.size(table.snapshots())); @@ -150,22 +148,22 @@ public void testStreamingWriteCompleteMode() throws Exception { PartitionSpec spec = PartitionSpec.builderFor(SCHEMA).identity("data").build(); Table table = tables.create(SCHEMA, spec, location.toString()); - List expected = Lists.newArrayList( - new SimpleRecord(2, "1"), - new SimpleRecord(3, "2"), - new SimpleRecord(1, "3") - ); + List expected = + Lists.newArrayList( + new SimpleRecord(2, "1"), new SimpleRecord(3, "2"), new SimpleRecord(1, "3")); MemoryStream inputStream = newMemoryStream(1, spark.sqlContext(), Encoders.INT()); - DataStreamWriter streamWriter = inputStream.toDF() - .groupBy("value") - .count() - .selectExpr("CAST(count AS INT) AS id", "CAST (value AS STRING) AS data") - .writeStream() - .outputMode("complete") - .format("iceberg") - .option("checkpointLocation", checkpoint.toString()) - .option("path", location.toString()); + DataStreamWriter streamWriter = + inputStream + .toDF() + .groupBy("value") + .count() + .selectExpr("CAST(count AS INT) AS id", "CAST (value AS STRING) AS data") + .writeStream() + .outputMode("complete") + .format("iceberg") + .option("checkpointLocation", checkpoint.toString()) + .option("path", location.toString()); try { // start the original query with checkpointing @@ -187,10 +185,9 @@ public void testStreamingWriteCompleteMode() throws Exception { restartedQuery.processAllAvailable(); // ensure the write was idempotent - Dataset result = spark.read() - .format("iceberg") - .load(location.toString()); - List actual = result.orderBy("data").as(Encoders.bean(SimpleRecord.class)).collectAsList(); + Dataset result = spark.read().format("iceberg").load(location.toString()); + List actual = + result.orderBy("data").as(Encoders.bean(SimpleRecord.class)).collectAsList(); Assert.assertEquals("Number of rows should match", expected.size(), actual.size()); Assert.assertEquals("Result rows should match", expected, actual); Assert.assertEquals("Number of snapshots should match", 2, Iterables.size(table.snapshots())); @@ -211,22 +208,22 @@ public 
void testStreamingWriteCompleteModeWithProjection() throws Exception { PartitionSpec spec = PartitionSpec.unpartitioned(); Table table = tables.create(SCHEMA, spec, location.toString()); - List expected = Lists.newArrayList( - new SimpleRecord(1, null), - new SimpleRecord(2, null), - new SimpleRecord(3, null) - ); + List expected = + Lists.newArrayList( + new SimpleRecord(1, null), new SimpleRecord(2, null), new SimpleRecord(3, null)); MemoryStream inputStream = newMemoryStream(1, spark.sqlContext(), Encoders.INT()); - DataStreamWriter streamWriter = inputStream.toDF() - .groupBy("value") - .count() - .selectExpr("CAST(count AS INT) AS id") // select only id column - .writeStream() - .outputMode("complete") - .format("iceberg") - .option("checkpointLocation", checkpoint.toString()) - .option("path", location.toString()); + DataStreamWriter streamWriter = + inputStream + .toDF() + .groupBy("value") + .count() + .selectExpr("CAST(count AS INT) AS id") // select only id column + .writeStream() + .outputMode("complete") + .format("iceberg") + .option("checkpointLocation", checkpoint.toString()) + .option("path", location.toString()); try { // start the original query with checkpointing @@ -248,10 +245,9 @@ public void testStreamingWriteCompleteModeWithProjection() throws Exception { restartedQuery.processAllAvailable(); // ensure the write was idempotent - Dataset result = spark.read() - .format("iceberg") - .load(location.toString()); - List actual = result.orderBy("id").as(Encoders.bean(SimpleRecord.class)).collectAsList(); + Dataset result = spark.read().format("iceberg").load(location.toString()); + List actual = + result.orderBy("id").as(Encoders.bean(SimpleRecord.class)).collectAsList(); Assert.assertEquals("Number of rows should match", expected.size(), actual.size()); Assert.assertEquals("Result rows should match", expected, actual); Assert.assertEquals("Number of snapshots should match", 2, Iterables.size(table.snapshots())); @@ -275,13 +271,15 @@ public void testStreamingWriteUpdateMode() throws Exception { tables.create(SCHEMA, spec, location.toString()); MemoryStream inputStream = newMemoryStream(1, spark.sqlContext(), Encoders.INT()); - DataStreamWriter streamWriter = inputStream.toDF() - .selectExpr("value AS id", "CAST (value AS STRING) AS data") - .writeStream() - .outputMode("update") - .format("iceberg") - .option("checkpointLocation", checkpoint.toString()) - .option("path", location.toString()); + DataStreamWriter streamWriter = + inputStream + .toDF() + .selectExpr("value AS id", "CAST (value AS STRING) AS data") + .writeStream() + .outputMode("update") + .format("iceberg") + .option("checkpointLocation", checkpoint.toString()) + .option("path", location.toString()); try { StreamingQuery query = streamWriter.start(); diff --git a/spark/v3.3/spark/src/test/java/org/apache/iceberg/spark/source/TestStructuredStreamingRead3.java b/spark/v3.3/spark/src/test/java/org/apache/iceberg/spark/source/TestStructuredStreamingRead3.java index e609412f8be0..23fdfb09cb83 100644 --- a/spark/v3.3/spark/src/test/java/org/apache/iceberg/spark/source/TestStructuredStreamingRead3.java +++ b/spark/v3.3/spark/src/test/java/org/apache/iceberg/spark/source/TestStructuredStreamingRead3.java @@ -16,9 +16,10 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.spark.source; +import static org.apache.iceberg.expressions.Expressions.ref; + import java.io.File; import java.util.Collections; import java.util.List; @@ -58,8 +59,6 @@ import org.junit.runner.RunWith; import org.junit.runners.Parameterized; -import static org.apache.iceberg.expressions.Expressions.ref; - @RunWith(Parameterized.class) public final class TestStructuredStreamingRead3 extends SparkCatalogTestBase { public TestStructuredStreamingRead3( @@ -70,59 +69,49 @@ public TestStructuredStreamingRead3( private Table table; /** - * test data to be used by multiple writes - * each write creates a snapshot and writes a list of records + * test data to be used by multiple writes each write creates a snapshot and writes a list of + * records */ - private static final List> TEST_DATA_MULTIPLE_SNAPSHOTS = Lists.newArrayList( - Lists.newArrayList( - new SimpleRecord(1, "one"), - new SimpleRecord(2, "two"), - new SimpleRecord(3, "three")), + private static final List> TEST_DATA_MULTIPLE_SNAPSHOTS = Lists.newArrayList( - new SimpleRecord(4, "four"), - new SimpleRecord(5, "five")), - Lists.newArrayList( - new SimpleRecord(6, "six"), - new SimpleRecord(7, "seven"))); + Lists.newArrayList( + new SimpleRecord(1, "one"), new SimpleRecord(2, "two"), new SimpleRecord(3, "three")), + Lists.newArrayList(new SimpleRecord(4, "four"), new SimpleRecord(5, "five")), + Lists.newArrayList(new SimpleRecord(6, "six"), new SimpleRecord(7, "seven"))); /** - * test data - to be used for multiple write batches - * each batch inturn will have multiple snapshots + * test data - to be used for multiple write batches each batch inturn will have multiple + * snapshots */ - private static final List>> TEST_DATA_MULTIPLE_WRITES_MULTIPLE_SNAPSHOTS = Lists.newArrayList( - Lists.newArrayList( - Lists.newArrayList( - new SimpleRecord(1, "one"), - new SimpleRecord(2, "two"), - new SimpleRecord(3, "three")), - Lists.newArrayList( - new SimpleRecord(4, "four"), - new SimpleRecord(5, "five"))), + private static final List>> TEST_DATA_MULTIPLE_WRITES_MULTIPLE_SNAPSHOTS = Lists.newArrayList( Lists.newArrayList( - new SimpleRecord(6, "six"), - new SimpleRecord(7, "seven")), + Lists.newArrayList( + new SimpleRecord(1, "one"), + new SimpleRecord(2, "two"), + new SimpleRecord(3, "three")), + Lists.newArrayList(new SimpleRecord(4, "four"), new SimpleRecord(5, "five"))), Lists.newArrayList( - new SimpleRecord(8, "eight"), - new SimpleRecord(9, "nine"))), - Lists.newArrayList( + Lists.newArrayList(new SimpleRecord(6, "six"), new SimpleRecord(7, "seven")), + Lists.newArrayList(new SimpleRecord(8, "eight"), new SimpleRecord(9, "nine"))), Lists.newArrayList( - new SimpleRecord(10, "ten"), - new SimpleRecord(11, "eleven"), - new SimpleRecord(12, "twelve")), - Lists.newArrayList( - new SimpleRecord(13, "thirteen"), - new SimpleRecord(14, "fourteen")), - Lists.newArrayList( - new SimpleRecord(15, "fifteen"), - new SimpleRecord(16, "sixteen")))); + Lists.newArrayList( + new SimpleRecord(10, "ten"), + new SimpleRecord(11, "eleven"), + new SimpleRecord(12, "twelve")), + Lists.newArrayList( + new SimpleRecord(13, "thirteen"), new SimpleRecord(14, "fourteen")), + Lists.newArrayList( + new SimpleRecord(15, "fifteen"), new SimpleRecord(16, "sixteen")))); @Before public void setupTable() { - sql("CREATE TABLE %s " + - "(id INT, data STRING) " + - "USING iceberg " + - "PARTITIONED BY (bucket(3, id))", tableName); + sql( + "CREATE TABLE %s " + + "(id INT, data STRING) " + + "USING iceberg " + + "PARTITIONED BY (bucket(3, 
id))", + tableName); this.table = validationCatalog.loadTable(tableIdent); } @@ -163,17 +152,19 @@ public void testReadStreamOnIcebergThenAddData() throws Exception { @Test public void testReadingStreamFromTimestamp() throws Exception { - List dataBeforeTimestamp = Lists.newArrayList( - new SimpleRecord(-2, "minustwo"), - new SimpleRecord(-1, "minusone"), - new SimpleRecord(0, "zero")); + List dataBeforeTimestamp = + Lists.newArrayList( + new SimpleRecord(-2, "minustwo"), + new SimpleRecord(-1, "minusone"), + new SimpleRecord(0, "zero")); appendData(dataBeforeTimestamp); table.refresh(); long streamStartTimestamp = table.currentSnapshot().timestampMillis() + 1; - StreamingQuery query = startStream(SparkReadOptions.STREAM_FROM_TIMESTAMP, Long.toString(streamStartTimestamp)); + StreamingQuery query = + startStream(SparkReadOptions.STREAM_FROM_TIMESTAMP, Long.toString(streamStartTimestamp)); List empty = rowsAvailable(query); Assertions.assertThat(empty.isEmpty()).isTrue(); @@ -190,21 +181,25 @@ public void testReadingStreamFromTimestamp() throws Exception { public void testReadingStreamFromFutureTimetsamp() throws Exception { long futureTimestamp = System.currentTimeMillis() + 10000; - StreamingQuery query = startStream(SparkReadOptions.STREAM_FROM_TIMESTAMP, Long.toString(futureTimestamp)); + StreamingQuery query = + startStream(SparkReadOptions.STREAM_FROM_TIMESTAMP, Long.toString(futureTimestamp)); List actual = rowsAvailable(query); Assertions.assertThat(actual.isEmpty()).isTrue(); - List data = Lists.newArrayList( - new SimpleRecord(-2, "minustwo"), - new SimpleRecord(-1, "minusone"), - new SimpleRecord(0, "zero")); + List data = + Lists.newArrayList( + new SimpleRecord(-2, "minustwo"), + new SimpleRecord(-1, "minusone"), + new SimpleRecord(0, "zero")); // Perform several inserts that should not show up because the fromTimestamp has not elapsed - IntStream.range(0, 3).forEach(x -> { - appendData(data); - Assertions.assertThat(rowsAvailable(query).isEmpty()).isTrue(); - }); + IntStream.range(0, 3) + .forEach( + x -> { + appendData(data); + Assertions.assertThat(rowsAvailable(query).isEmpty()).isTrue(); + }); waitUntilAfter(futureTimestamp); @@ -216,16 +211,16 @@ public void testReadingStreamFromFutureTimetsamp() throws Exception { @Test public void testReadingStreamFromTimestampFutureWithExistingSnapshots() throws Exception { - List dataBeforeTimestamp = Lists.newArrayList( - new SimpleRecord(1, "one"), - new SimpleRecord(2, "two"), - new SimpleRecord(3, "three")); + List dataBeforeTimestamp = + Lists.newArrayList( + new SimpleRecord(1, "one"), new SimpleRecord(2, "two"), new SimpleRecord(3, "three")); appendData(dataBeforeTimestamp); long streamStartTimestamp = System.currentTimeMillis() + 2000; // Start the stream with a future timestamp after the current snapshot - StreamingQuery query = startStream(SparkReadOptions.STREAM_FROM_TIMESTAMP, Long.toString(streamStartTimestamp)); + StreamingQuery query = + startStream(SparkReadOptions.STREAM_FROM_TIMESTAMP, Long.toString(streamStartTimestamp)); List actual = rowsAvailable(query); Assert.assertEquals(Collections.emptyList(), actual); @@ -233,7 +228,8 @@ public void testReadingStreamFromTimestampFutureWithExistingSnapshots() throws E waitUntilAfter(streamStartTimestamp); List> expected = TEST_DATA_MULTIPLE_SNAPSHOTS; appendDataAsMultipleSnapshots(expected); - Assertions.assertThat(rowsAvailable(query)).containsExactlyInAnyOrderElementsOf(Iterables.concat(expected)); + Assertions.assertThat(rowsAvailable(query)) + 
.containsExactlyInAnyOrderElementsOf(Iterables.concat(expected)); } @Test @@ -246,7 +242,8 @@ public void testReadingStreamFromTimestampOfExistingSnapshot() throws Exception long firstSnapshotTime = table.currentSnapshot().timestampMillis(); // Start stream giving the first Snapshot's time as the start point - StreamingQuery stream = startStream(SparkReadOptions.STREAM_FROM_TIMESTAMP, Long.toString(firstSnapshotTime)); + StreamingQuery stream = + startStream(SparkReadOptions.STREAM_FROM_TIMESTAMP, Long.toString(firstSnapshotTime)); // Append rest of expected data for (int i = 1; i < expected.size(); i++) { @@ -259,14 +256,11 @@ public void testReadingStreamFromTimestampOfExistingSnapshot() throws Exception @Test public void testReadingStreamWithExpiredSnapshotFromTimestamp() throws TimeoutException { - List firstSnapshotRecordList = Lists.newArrayList( - new SimpleRecord(1, "one")); + List firstSnapshotRecordList = Lists.newArrayList(new SimpleRecord(1, "one")); - List secondSnapshotRecordList = Lists.newArrayList( - new SimpleRecord(2, "two")); + List secondSnapshotRecordList = Lists.newArrayList(new SimpleRecord(2, "two")); - List thirdSnapshotRecordList = Lists.newArrayList( - new SimpleRecord(3, "three")); + List thirdSnapshotRecordList = Lists.newArrayList(new SimpleRecord(3, "three")); List expectedRecordList = Lists.newArrayList(); expectedRecordList.addAll(secondSnapshotRecordList); @@ -277,13 +271,14 @@ public void testReadingStreamWithExpiredSnapshotFromTimestamp() throws TimeoutEx long firstSnapshotid = table.currentSnapshot().snapshotId(); long firstSnapshotCommitTime = table.currentSnapshot().timestampMillis(); - appendData(secondSnapshotRecordList); appendData(thirdSnapshotRecordList); table.expireSnapshots().expireSnapshotId(firstSnapshotid).commit(); - StreamingQuery query = startStream(SparkReadOptions.STREAM_FROM_TIMESTAMP, String.valueOf(firstSnapshotCommitTime)); + StreamingQuery query = + startStream( + SparkReadOptions.STREAM_FROM_TIMESTAMP, String.valueOf(firstSnapshotCommitTime)); List actual = rowsAvailable(query); Assertions.assertThat(actual).containsExactlyInAnyOrderElementsOf(expectedRecordList); } @@ -294,21 +289,24 @@ public void testResumingStreamReadFromCheckpoint() throws Exception { File writerCheckpoint = new File(writerCheckpointFolder, "writer-checkpoint"); File output = temp.newFolder(); - DataStreamWriter querySource = spark.readStream() - .format("iceberg") - .load(tableName) - .writeStream() - .option("checkpointLocation", writerCheckpoint.toString()) - .format("parquet") - .queryName("checkpoint_test") - .option("path", output.getPath()); + DataStreamWriter querySource = + spark + .readStream() + .format("iceberg") + .load(tableName) + .writeStream() + .option("checkpointLocation", writerCheckpoint.toString()) + .format("parquet") + .queryName("checkpoint_test") + .option("path", output.getPath()); StreamingQuery startQuery = querySource.start(); startQuery.processAllAvailable(); startQuery.stop(); List expected = Lists.newArrayList(); - for (List> expectedCheckpoint : TEST_DATA_MULTIPLE_WRITES_MULTIPLE_SNAPSHOTS) { + for (List> expectedCheckpoint : + TEST_DATA_MULTIPLE_WRITES_MULTIPLE_SNAPSHOTS) { // New data was added while the stream was down appendDataAsMultipleSnapshots(expectedCheckpoint); expected.addAll(Lists.newArrayList(Iterables.concat(Iterables.concat(expectedCheckpoint)))); @@ -319,28 +317,23 @@ public void testResumingStreamReadFromCheckpoint() throws Exception { restartedQuery.stop(); // Read data added by the stream - List actual = 
spark.read() - .load(output.getPath()) - .as(Encoders.bean(SimpleRecord.class)) - .collectAsList(); + List actual = + spark.read().load(output.getPath()).as(Encoders.bean(SimpleRecord.class)).collectAsList(); Assertions.assertThat(actual).containsExactlyInAnyOrderElementsOf(Iterables.concat(expected)); } } @Test public void testParquetOrcAvroDataInOneTable() throws Exception { - List parquetFileRecords = Lists.newArrayList( - new SimpleRecord(1, "one"), - new SimpleRecord(2, "two"), - new SimpleRecord(3, "three")); + List parquetFileRecords = + Lists.newArrayList( + new SimpleRecord(1, "one"), new SimpleRecord(2, "two"), new SimpleRecord(3, "three")); - List orcFileRecords = Lists.newArrayList( - new SimpleRecord(4, "four"), - new SimpleRecord(5, "five")); + List orcFileRecords = + Lists.newArrayList(new SimpleRecord(4, "four"), new SimpleRecord(5, "five")); - List avroFileRecords = Lists.newArrayList( - new SimpleRecord(6, "six"), - new SimpleRecord(7, "seven")); + List avroFileRecords = + Lists.newArrayList(new SimpleRecord(6, "six"), new SimpleRecord(7, "seven")); appendData(parquetFileRecords); appendData(orcFileRecords, "orc"); @@ -348,7 +341,8 @@ public void testParquetOrcAvroDataInOneTable() throws Exception { StreamingQuery query = startStream(); Assertions.assertThat(rowsAvailable(query)) - .containsExactlyInAnyOrderElementsOf(Iterables.concat(parquetFileRecords, orcFileRecords, avroFileRecords)); + .containsExactlyInAnyOrderElementsOf( + Iterables.concat(parquetFileRecords, orcFileRecords, avroFileRecords)); } @Test @@ -371,18 +365,23 @@ public void testReadStreamWithSnapshotTypeOverwriteErrorsOut() throws Exception Schema deleteRowSchema = table.schema().select("data"); Record dataDelete = GenericRecord.create(deleteRowSchema); - List dataDeletes = Lists.newArrayList( - dataDelete.copy("data", "one") // id = 1 - ); - - DeleteFile eqDeletes = FileHelpers.writeDeleteFile( - table, Files.localOutput(temp.newFile()), TestHelpers.Row.of(0), dataDeletes, deleteRowSchema); - - table.newRowDelta() - .addDeletes(eqDeletes) - .commit(); - - // check pre-condition - that the above Delete file write - actually resulted in snapshot of type OVERWRITE + List dataDeletes = + Lists.newArrayList( + dataDelete.copy("data", "one") // id = 1 + ); + + DeleteFile eqDeletes = + FileHelpers.writeDeleteFile( + table, + Files.localOutput(temp.newFile()), + TestHelpers.Row.of(0), + dataDeletes, + deleteRowSchema); + + table.newRowDelta().addDeletes(eqDeletes).commit(); + + // check pre-condition - that the above Delete file write - actually resulted in snapshot of + // type OVERWRITE Assert.assertEquals(DataOperations.OVERWRITE, table.currentSnapshot().operation()); StreamingQuery query = startStream(); @@ -391,8 +390,7 @@ public void testReadStreamWithSnapshotTypeOverwriteErrorsOut() throws Exception "Streaming should fail with IllegalStateException, as the snapshot is not of type APPEND", IllegalStateException.class, "Cannot process overwrite snapshot", - () -> query.processAllAvailable() - ); + () -> query.processAllAvailable()); } @Test @@ -402,9 +400,7 @@ public void testReadStreamWithSnapshotTypeReplaceIgnoresReplace() throws Excepti appendDataAsMultipleSnapshots(expected); // this should create a snapshot with type Replace. 
- table.rewriteManifests() - .clusterBy(f -> 1) - .commit(); + table.rewriteManifests().clusterBy(f -> 1).commit(); // check pre-condition Assert.assertEquals(DataOperations.REPLACE, table.currentSnapshot().operation()); @@ -416,21 +412,17 @@ public void testReadStreamWithSnapshotTypeReplaceIgnoresReplace() throws Excepti @Test public void testReadStreamWithSnapshotTypeDeleteErrorsOut() throws Exception { - table.updateSpec() - .removeField("id_bucket") - .addField(ref("id")) - .commit(); + table.updateSpec().removeField("id_bucket").addField(ref("id")).commit(); // fill table with some data List> dataAcrossSnapshots = TEST_DATA_MULTIPLE_SNAPSHOTS; appendDataAsMultipleSnapshots(dataAcrossSnapshots); // this should create a snapshot with type delete. - table.newDelete() - .deleteFromRowFilter(Expressions.equal("id", 4)) - .commit(); + table.newDelete().deleteFromRowFilter(Expressions.equal("id", 4)).commit(); - // check pre-condition - that the above delete operation on table resulted in Snapshot of Type DELETE. + // check pre-condition - that the above delete operation on table resulted in Snapshot of Type + // DELETE. Assert.assertEquals(DataOperations.DELETE, table.currentSnapshot().operation()); StreamingQuery query = startStream(); @@ -439,27 +431,22 @@ public void testReadStreamWithSnapshotTypeDeleteErrorsOut() throws Exception { "Streaming should fail with IllegalStateException, as the snapshot is not of type APPEND", IllegalStateException.class, "Cannot process delete snapshot", - () -> query.processAllAvailable() - ); + () -> query.processAllAvailable()); } @Test public void testReadStreamWithSnapshotTypeDeleteAndSkipDeleteOption() throws Exception { - table.updateSpec() - .removeField("id_bucket") - .addField(ref("id")) - .commit(); + table.updateSpec().removeField("id_bucket").addField(ref("id")).commit(); // fill table with some data List> dataAcrossSnapshots = TEST_DATA_MULTIPLE_SNAPSHOTS; appendDataAsMultipleSnapshots(dataAcrossSnapshots); // this should create a snapshot with type delete. - table.newDelete() - .deleteFromRowFilter(Expressions.equal("id", 4)) - .commit(); + table.newDelete().deleteFromRowFilter(Expressions.equal("id", 4)).commit(); - // check pre-condition - that the above delete operation on table resulted in Snapshot of Type DELETE. + // check pre-condition - that the above delete operation on table resulted in Snapshot of Type + // DELETE. Assert.assertEquals(DataOperations.DELETE, table.currentSnapshot().operation()); StreamingQuery query = startStream(SparkReadOptions.STREAMING_SKIP_DELETE_SNAPSHOTS, "true"); @@ -469,21 +456,17 @@ public void testReadStreamWithSnapshotTypeDeleteAndSkipDeleteOption() throws Exc @Test public void testReadStreamWithSnapshotTypeDeleteAndSkipOverwriteOption() throws Exception { - table.updateSpec() - .removeField("id_bucket") - .addField(ref("id")) - .commit(); + table.updateSpec().removeField("id_bucket").addField(ref("id")).commit(); // fill table with some data List> dataAcrossSnapshots = TEST_DATA_MULTIPLE_SNAPSHOTS; appendDataAsMultipleSnapshots(dataAcrossSnapshots); // this should create a snapshot with type overwrite. - table.newOverwrite() - .overwriteByRowFilter(Expressions.greaterThan("id", 4)) - .commit(); + table.newOverwrite().overwriteByRowFilter(Expressions.greaterThan("id", 4)).commit(); - // check pre-condition - that the above delete operation on table resulted in Snapshot of Type OVERWRITE. + // check pre-condition - that the above delete operation on table resulted in Snapshot of Type + // OVERWRITE. 
Assert.assertEquals(DataOperations.OVERWRITE, table.currentSnapshot().operation()); StreamingQuery query = startStream(SparkReadOptions.STREAMING_SKIP_OVERWRITE_SNAPSHOTS, "true"); @@ -492,8 +475,8 @@ public void testReadStreamWithSnapshotTypeDeleteAndSkipOverwriteOption() throws } /** - * appends each list as a Snapshot on the iceberg table at the given location. - * accepts a list of lists - each list representing data per snapshot. + * appends each list as a Snapshot on the iceberg table at the given location. accepts a list of + * lists - each list representing data per snapshot. */ private void appendDataAsMultipleSnapshots(List> data) { for (List l : data) { @@ -507,7 +490,8 @@ private void appendData(List data) { private void appendData(List data, String format) { Dataset df = spark.createDataFrame(data, SimpleRecord.class); - df.select("id", "data").write() + df.select("id", "data") + .write() .format("iceberg") .option("write-format", format) .mode("append") @@ -517,7 +501,8 @@ private void appendData(List data, String format) { private static final String MEMORY_TABLE = "_stream_view_mem"; private StreamingQuery startStream(Map options) throws TimeoutException { - return spark.readStream() + return spark + .readStream() .options(options) .format("iceberg") .load(tableName) @@ -539,9 +524,9 @@ private StreamingQuery startStream(String key, String value) throws TimeoutExcep private List rowsAvailable(StreamingQuery query) { query.processAllAvailable(); - return spark.sql("select * from " + MEMORY_TABLE) + return spark + .sql("select * from " + MEMORY_TABLE) .as(Encoders.bean(SimpleRecord.class)) .collectAsList(); } - } diff --git a/spark/v3.3/spark/src/test/java/org/apache/iceberg/spark/source/TestTables.java b/spark/v3.3/spark/src/test/java/org/apache/iceberg/spark/source/TestTables.java index ae3b7aa7a785..0650cb9738a6 100644 --- a/spark/v3.3/spark/src/test/java/org/apache/iceberg/spark/source/TestTables.java +++ b/spark/v3.3/spark/src/test/java/org/apache/iceberg/spark/source/TestTables.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.source; import java.io.File; @@ -42,15 +41,15 @@ // TODO: Use the copy of this from core. 
class TestTables { - private TestTables() { - } + private TestTables() {} static TestTable create(File temp, String name, Schema schema, PartitionSpec spec) { TestTableOperations ops = new TestTableOperations(name); if (ops.current() != null) { throw new AlreadyExistsException("Table %s already exists at location: %s", name, temp); } - ops.commit(null, TableMetadata.newTableMetadata(schema, spec, temp.toString(), ImmutableMap.of())); + ops.commit( + null, TableMetadata.newTableMetadata(schema, spec, temp.toString(), ImmutableMap.of())); return new TestTable(ops, name); } @@ -166,8 +165,8 @@ public FileIO io() { @Override public LocationProvider locationProvider() { - Preconditions.checkNotNull(current, - "Current metadata should not be null when locationProvider is called"); + Preconditions.checkNotNull( + current, "Current metadata should not be null when locationProvider is called"); return LocationProviders.locationsFor(current.location(), current.properties()); } diff --git a/spark/v3.3/spark/src/test/java/org/apache/iceberg/spark/source/TestTimestampWithoutZone.java b/spark/v3.3/spark/src/test/java/org/apache/iceberg/spark/source/TestTimestampWithoutZone.java index 20509eef7471..f6cac9e9dd82 100644 --- a/spark/v3.3/spark/src/test/java/org/apache/iceberg/spark/source/TestTimestampWithoutZone.java +++ b/spark/v3.3/spark/src/test/java/org/apache/iceberg/spark/source/TestTimestampWithoutZone.java @@ -16,9 +16,10 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.source; +import static org.apache.iceberg.Files.localOutput; + import java.io.File; import java.io.IOException; import java.time.LocalDateTime; @@ -64,18 +65,16 @@ import org.junit.runner.RunWith; import org.junit.runners.Parameterized; -import static org.apache.iceberg.Files.localOutput; - @RunWith(Parameterized.class) public class TestTimestampWithoutZone extends SparkTestBase { private static final Configuration CONF = new Configuration(); private static final HadoopTables TABLES = new HadoopTables(CONF); - private static final Schema SCHEMA = new Schema( - Types.NestedField.required(1, "id", Types.LongType.get()), - Types.NestedField.optional(2, "ts", Types.TimestampType.withoutZone()), - Types.NestedField.optional(3, "data", Types.StringType.get()) - ); + private static final Schema SCHEMA = + new Schema( + Types.NestedField.required(1, "id", Types.LongType.get()), + Types.NestedField.optional(2, "ts", Types.TimestampType.withoutZone()), + Types.NestedField.optional(3, "data", Types.StringType.get())); private static SparkSession spark = null; @@ -91,8 +90,7 @@ public static void stopSpark() { currentSpark.stop(); } - @Rule - public TemporaryFolder temp = new TemporaryFolder(); + @Rule public TemporaryFolder temp = new TemporaryFolder(); private final String format; private final boolean vectorized; @@ -100,9 +98,9 @@ public static void stopSpark() { @Parameterized.Parameters(name = "format = {0}, vectorized = {1}") public static Object[][] parameters() { return new Object[][] { - { "parquet", false }, - { "parquet", true }, - { "avro", false } + {"parquet", false}, + {"parquet", true}, + {"avro", false} }; } @@ -132,16 +130,17 @@ public void writeUnpartitionedTable() throws IOException { // create records using the table's schema this.records = testRecords(tableSchema); - try (FileAppender writer = new GenericAppenderFactory(tableSchema).newAppender( - localOutput(testFile), fileFormat)) { + try (FileAppender writer = + new 
GenericAppenderFactory(tableSchema).newAppender(localOutput(testFile), fileFormat)) { writer.addAll(records); } - DataFile file = DataFiles.builder(PartitionSpec.unpartitioned()) - .withRecordCount(records.size()) - .withFileSizeInBytes(testFile.length()) - .withPath(testFile.toString()) - .build(); + DataFile file = + DataFiles.builder(PartitionSpec.unpartitioned()) + .withRecordCount(records.size()) + .withFileSizeInBytes(testFile.length()) + .withPath(testFile.toString()) + .build(); table.newAppend().appendFile(file).commit(); } @@ -154,69 +153,89 @@ public void testUnpartitionedTimestampWithoutZone() { @Test public void testUnpartitionedTimestampWithoutZoneProjection() { Schema projection = SCHEMA.select("id", "ts"); - assertEqualsSafe(projection.asStruct(), + assertEqualsSafe( + projection.asStruct(), records.stream().map(r -> projectFlat(projection, r)).collect(Collectors.toList()), read(unpartitioned.toString(), vectorized, "id", "ts")); } - @Rule - public ExpectedException exception = ExpectedException.none(); + @Rule public ExpectedException exception = ExpectedException.none(); @Test public void testUnpartitionedTimestampWithoutZoneError() { - AssertHelpers.assertThrows(String.format("Read operation performed on a timestamp without timezone field while " + - "'%s' set to false should throw exception", - SparkReadOptions.HANDLE_TIMESTAMP_WITHOUT_TIMEZONE), + AssertHelpers.assertThrows( + String.format( + "Read operation performed on a timestamp without timezone field while " + + "'%s' set to false should throw exception", + SparkReadOptions.HANDLE_TIMESTAMP_WITHOUT_TIMEZONE), IllegalArgumentException.class, SparkUtil.TIMESTAMP_WITHOUT_TIMEZONE_ERROR, - () -> spark.read().format("iceberg") - .option(SparkReadOptions.VECTORIZATION_ENABLED, String.valueOf(vectorized)) - .option(SparkReadOptions.HANDLE_TIMESTAMP_WITHOUT_TIMEZONE, "false") - .load(unpartitioned.toString()) - .collectAsList()); + () -> + spark + .read() + .format("iceberg") + .option(SparkReadOptions.VECTORIZATION_ENABLED, String.valueOf(vectorized)) + .option(SparkReadOptions.HANDLE_TIMESTAMP_WITHOUT_TIMEZONE, "false") + .load(unpartitioned.toString()) + .collectAsList()); } @Test public void testUnpartitionedTimestampWithoutZoneAppend() { - spark.read().format("iceberg") - .option(SparkReadOptions.HANDLE_TIMESTAMP_WITHOUT_TIMEZONE, "true") - .option(SparkReadOptions.VECTORIZATION_ENABLED, String.valueOf(vectorized)) - .load(unpartitioned.toString()) - .write() - .format("iceberg") - .option(SparkWriteOptions.HANDLE_TIMESTAMP_WITHOUT_TIMEZONE, "true") - .mode(SaveMode.Append) - .save(unpartitioned.toString()); - - assertEqualsSafe(SCHEMA.asStruct(), - Stream.concat(records.stream(), records.stream()).collect(Collectors.toList()), - read(unpartitioned.toString(), vectorized)); + spark + .read() + .format("iceberg") + .option(SparkReadOptions.HANDLE_TIMESTAMP_WITHOUT_TIMEZONE, "true") + .option(SparkReadOptions.VECTORIZATION_ENABLED, String.valueOf(vectorized)) + .load(unpartitioned.toString()) + .write() + .format("iceberg") + .option(SparkWriteOptions.HANDLE_TIMESTAMP_WITHOUT_TIMEZONE, "true") + .mode(SaveMode.Append) + .save(unpartitioned.toString()); + + assertEqualsSafe( + SCHEMA.asStruct(), + Stream.concat(records.stream(), records.stream()).collect(Collectors.toList()), + read(unpartitioned.toString(), vectorized)); } @Test public void testUnpartitionedTimestampWithoutZoneWriteError() { - String errorMessage = String.format("Write operation performed on a timestamp without timezone field while " + - "'%s' set to 
false should throw exception", + String errorMessage = + String.format( + "Write operation performed on a timestamp without timezone field while " + + "'%s' set to false should throw exception", SparkSQLProperties.HANDLE_TIMESTAMP_WITHOUT_TIMEZONE); - Runnable writeOperation = () -> spark.read().format("iceberg") - .option(SparkReadOptions.HANDLE_TIMESTAMP_WITHOUT_TIMEZONE, "true") - .option(SparkReadOptions.VECTORIZATION_ENABLED, String.valueOf(vectorized)) - .load(unpartitioned.toString()) - .write() - .format("iceberg") - .option(SparkWriteOptions.HANDLE_TIMESTAMP_WITHOUT_TIMEZONE, "false") - .mode(SaveMode.Append) - .save(unpartitioned.toString()); - - AssertHelpers.assertThrows(errorMessage, IllegalArgumentException.class, - SparkUtil.TIMESTAMP_WITHOUT_TIMEZONE_ERROR, writeOperation); - + Runnable writeOperation = + () -> + spark + .read() + .format("iceberg") + .option(SparkReadOptions.HANDLE_TIMESTAMP_WITHOUT_TIMEZONE, "true") + .option(SparkReadOptions.VECTORIZATION_ENABLED, String.valueOf(vectorized)) + .load(unpartitioned.toString()) + .write() + .format("iceberg") + .option(SparkWriteOptions.HANDLE_TIMESTAMP_WITHOUT_TIMEZONE, "false") + .mode(SaveMode.Append) + .save(unpartitioned.toString()); + + AssertHelpers.assertThrows( + errorMessage, + IllegalArgumentException.class, + SparkUtil.TIMESTAMP_WITHOUT_TIMEZONE_ERROR, + writeOperation); } @Test public void testUnpartitionedTimestampWithoutZoneSessionProperties() { - withSQLConf(ImmutableMap.of(SparkSQLProperties.HANDLE_TIMESTAMP_WITHOUT_TIMEZONE, "true"), () -> { - spark.read().format("iceberg") + withSQLConf( + ImmutableMap.of(SparkSQLProperties.HANDLE_TIMESTAMP_WITHOUT_TIMEZONE, "true"), + () -> { + spark + .read() + .format("iceberg") .option(SparkReadOptions.VECTORIZATION_ENABLED, String.valueOf(vectorized)) .load(unpartitioned.toString()) .write() @@ -224,10 +243,11 @@ public void testUnpartitionedTimestampWithoutZoneSessionProperties() { .mode(SaveMode.Append) .save(unpartitioned.toString()); - assertEqualsSafe(SCHEMA.asStruct(), + assertEqualsSafe( + SCHEMA.asStruct(), Stream.concat(records.stream(), records.stream()).collect(Collectors.toList()), read(unpartitioned.toString(), vectorized)); - }); + }); } private static Record projectFlat(Schema projection, Record record) { @@ -240,8 +260,8 @@ private static Record projectFlat(Schema projection, Record record) { return result; } - public static void assertEqualsSafe(Types.StructType struct, - List expected, List actual) { + public static void assertEqualsSafe( + Types.StructType struct, List expected, List actual) { Assert.assertEquals("Number of results should match expected", expected.size(), actual.size()); for (int i = 0; i < expected.size(); i += 1) { GenericsHelpers.assertEqualsSafe(struct, expected.get(i), actual.get(i)); @@ -259,20 +279,23 @@ private List testRecords(Schema schema) { record(schema, 6L, parseToLocal("2017-12-21T21:55:30.589712"), "element"), record(schema, 7L, parseToLocal("2017-12-21T17:31:14.532797"), "limited"), record(schema, 8L, parseToLocal("2017-12-21T15:21:51.237521"), "global"), - record(schema, 9L, parseToLocal("2017-12-21T15:02:15.230570"), "goldfish") - ); + record(schema, 9L, parseToLocal("2017-12-21T15:02:15.230570"), "goldfish")); } private static List read(String table, boolean vectorized) { return read(table, vectorized, "*"); } - private static List read(String table, boolean vectorized, String select0, String... 
selectN) { - Dataset dataset = spark.read().format("iceberg") - .option(SparkReadOptions.VECTORIZATION_ENABLED, String.valueOf(vectorized)) - .option(SparkReadOptions.HANDLE_TIMESTAMP_WITHOUT_TIMEZONE, "true") - .load(table) - .select(select0, selectN); + private static List read( + String table, boolean vectorized, String select0, String... selectN) { + Dataset dataset = + spark + .read() + .format("iceberg") + .option(SparkReadOptions.VECTORIZATION_ENABLED, String.valueOf(vectorized)) + .option(SparkReadOptions.HANDLE_TIMESTAMP_WITHOUT_TIMEZONE, "true") + .load(table) + .select(select0, selectN); return dataset.collectAsList(); } diff --git a/spark/v3.3/spark/src/test/java/org/apache/iceberg/spark/source/TestWriteMetricsConfig.java b/spark/v3.3/spark/src/test/java/org/apache/iceberg/spark/source/TestWriteMetricsConfig.java index 7ed71031f3f2..9bf00f1b1365 100644 --- a/spark/v3.3/spark/src/test/java/org/apache/iceberg/spark/source/TestWriteMetricsConfig.java +++ b/spark/v3.3/spark/src/test/java/org/apache/iceberg/spark/source/TestWriteMetricsConfig.java @@ -16,9 +16,12 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.source; +import static org.apache.iceberg.spark.SparkSchemaUtil.convert; +import static org.apache.iceberg.types.Types.NestedField.optional; +import static org.apache.iceberg.types.Types.NestedField.required; + import java.io.IOException; import java.nio.ByteBuffer; import java.util.List; @@ -53,28 +56,24 @@ import org.junit.Test; import org.junit.rules.TemporaryFolder; -import static org.apache.iceberg.spark.SparkSchemaUtil.convert; -import static org.apache.iceberg.types.Types.NestedField.optional; -import static org.apache.iceberg.types.Types.NestedField.required; - public class TestWriteMetricsConfig { private static final Configuration CONF = new Configuration(); - private static final Schema SIMPLE_SCHEMA = new Schema( - optional(1, "id", Types.IntegerType.get()), - optional(2, "data", Types.StringType.get()) - ); - private static final Schema COMPLEX_SCHEMA = new Schema( - required(1, "longCol", Types.IntegerType.get()), - optional(2, "strCol", Types.StringType.get()), - required(3, "record", Types.StructType.of( - required(4, "id", Types.IntegerType.get()), - required(5, "data", Types.StringType.get()) - )) - ); - - @Rule - public TemporaryFolder temp = new TemporaryFolder(); + private static final Schema SIMPLE_SCHEMA = + new Schema( + optional(1, "id", Types.IntegerType.get()), optional(2, "data", Types.StringType.get())); + private static final Schema COMPLEX_SCHEMA = + new Schema( + required(1, "longCol", Types.IntegerType.get()), + optional(2, "strCol", Types.StringType.get()), + required( + 3, + "record", + Types.StructType.of( + required(4, "id", Types.IntegerType.get()), + required(5, "data", Types.StringType.get())))); + + @Rule public TemporaryFolder temp = new TemporaryFolder(); private static SparkSession spark = null; private static JavaSparkContext sc = null; @@ -103,11 +102,9 @@ public void testFullMetricsCollectionForParquet() throws IOException { properties.put(TableProperties.DEFAULT_WRITE_METRICS_MODE, "full"); Table table = tables.create(SIMPLE_SCHEMA, spec, properties, tableLocation); - List expectedRecords = Lists.newArrayList( - new SimpleRecord(1, "a"), - new SimpleRecord(2, "b"), - new SimpleRecord(3, "c") - ); + List expectedRecords = + Lists.newArrayList( + new SimpleRecord(1, "a"), new SimpleRecord(2, "b"), new SimpleRecord(3, "c")); Dataset df = 
spark.createDataFrame(expectedRecords, SimpleRecord.class); df.select("id", "data") .coalesce(1) @@ -136,11 +133,9 @@ public void testCountMetricsCollectionForParquet() throws IOException { properties.put(TableProperties.DEFAULT_WRITE_METRICS_MODE, "counts"); Table table = tables.create(SIMPLE_SCHEMA, spec, properties, tableLocation); - List expectedRecords = Lists.newArrayList( - new SimpleRecord(1, "a"), - new SimpleRecord(2, "b"), - new SimpleRecord(3, "c") - ); + List expectedRecords = + Lists.newArrayList( + new SimpleRecord(1, "a"), new SimpleRecord(2, "b"), new SimpleRecord(3, "c")); Dataset df = spark.createDataFrame(expectedRecords, SimpleRecord.class); df.select("id", "data") .coalesce(1) @@ -169,11 +164,9 @@ public void testNoMetricsCollectionForParquet() throws IOException { properties.put(TableProperties.DEFAULT_WRITE_METRICS_MODE, "none"); Table table = tables.create(SIMPLE_SCHEMA, spec, properties, tableLocation); - List expectedRecords = Lists.newArrayList( - new SimpleRecord(1, "a"), - new SimpleRecord(2, "b"), - new SimpleRecord(3, "c") - ); + List expectedRecords = + Lists.newArrayList( + new SimpleRecord(1, "a"), new SimpleRecord(2, "b"), new SimpleRecord(3, "c")); Dataset df = spark.createDataFrame(expectedRecords, SimpleRecord.class); df.select("id", "data") .coalesce(1) @@ -203,11 +196,9 @@ public void testCustomMetricCollectionForParquet() throws IOException { properties.put("write.metadata.metrics.column.id", "full"); Table table = tables.create(SIMPLE_SCHEMA, spec, properties, tableLocation); - List expectedRecords = Lists.newArrayList( - new SimpleRecord(1, "a"), - new SimpleRecord(2, "b"), - new SimpleRecord(3, "c") - ); + List expectedRecords = + Lists.newArrayList( + new SimpleRecord(1, "a"), new SimpleRecord(2, "b"), new SimpleRecord(3, "c")); Dataset df = spark.createDataFrame(expectedRecords, SimpleRecord.class); df.select("id", "data") .coalesce(1) @@ -240,7 +231,8 @@ public void testBadCustomMetricCollectionForParquet() throws IOException { properties.put(TableProperties.DEFAULT_WRITE_METRICS_MODE, "counts"); properties.put("write.metadata.metrics.column.ids", "full"); - AssertHelpers.assertThrows("Creating a table with invalid metrics should fail", + AssertHelpers.assertThrows( + "Creating a table with invalid metrics should fail", ValidationException.class, null, () -> tables.create(SIMPLE_SCHEMA, spec, properties, tableLocation)); @@ -251,9 +243,7 @@ public void testCustomMetricCollectionForNestedParquet() throws IOException { String tableLocation = temp.newFolder("iceberg-table").toString(); HadoopTables tables = new HadoopTables(CONF); - PartitionSpec spec = PartitionSpec.builderFor(COMPLEX_SCHEMA) - .identity("strCol") - .build(); + PartitionSpec spec = PartitionSpec.builderFor(COMPLEX_SCHEMA).identity("strCol").build(); Map properties = Maps.newHashMap(); properties.put(TableProperties.DEFAULT_WRITE_METRICS_MODE, "none"); properties.put("write.metadata.metrics.column.longCol", "counts"); @@ -263,9 +253,11 @@ public void testCustomMetricCollectionForNestedParquet() throws IOException { Iterable rows = RandomData.generateSpark(COMPLEX_SCHEMA, 10, 0); JavaRDD rdd = sc.parallelize(Lists.newArrayList(rows)); - Dataset df = spark.internalCreateDataFrame(JavaRDD.toRDD(rdd), convert(COMPLEX_SCHEMA), false); + Dataset df = + spark.internalCreateDataFrame(JavaRDD.toRDD(rdd), convert(COMPLEX_SCHEMA), false); - df.coalesce(1).write() + df.coalesce(1) + .write() .format("iceberg") .option(SparkWriteOptions.WRITE_FORMAT, "parquet") .mode(SaveMode.Append) diff 
--git a/spark/v3.3/spark/src/test/java/org/apache/iceberg/spark/source/ThreeColumnRecord.java b/spark/v3.3/spark/src/test/java/org/apache/iceberg/spark/source/ThreeColumnRecord.java index 684dfbb255c7..554557df416c 100644 --- a/spark/v3.3/spark/src/test/java/org/apache/iceberg/spark/source/ThreeColumnRecord.java +++ b/spark/v3.3/spark/src/test/java/org/apache/iceberg/spark/source/ThreeColumnRecord.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.source; import java.util.Objects; @@ -26,8 +25,7 @@ public class ThreeColumnRecord { private String c2; private String c3; - public ThreeColumnRecord() { - } + public ThreeColumnRecord() {} public ThreeColumnRecord(Integer c1, String c2, String c3) { this.c1 = c1; @@ -68,9 +66,9 @@ public boolean equals(Object o) { return false; } ThreeColumnRecord that = (ThreeColumnRecord) o; - return Objects.equals(c1, that.c1) && - Objects.equals(c2, that.c2) && - Objects.equals(c3, that.c3); + return Objects.equals(c1, that.c1) + && Objects.equals(c2, that.c2) + && Objects.equals(c3, that.c3); } @Override @@ -80,10 +78,6 @@ public int hashCode() { @Override public String toString() { - return "ThreeColumnRecord{" + - "c1=" + c1 + - ", c2='" + c2 + '\'' + - ", c3='" + c3 + '\'' + - '}'; + return "ThreeColumnRecord{" + "c1=" + c1 + ", c2='" + c2 + '\'' + ", c3='" + c3 + '\'' + '}'; } } diff --git a/spark/v3.3/spark/src/test/java/org/apache/iceberg/spark/sql/TestAlterTable.java b/spark/v3.3/spark/src/test/java/org/apache/iceberg/spark/sql/TestAlterTable.java index 6172bd1fd0fe..e347cde7ba32 100644 --- a/spark/v3.3/spark/src/test/java/org/apache/iceberg/spark/sql/TestAlterTable.java +++ b/spark/v3.3/spark/src/test/java/org/apache/iceberg/spark/sql/TestAlterTable.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.spark.sql; import java.util.Map; @@ -36,7 +35,8 @@ import org.junit.Test; public class TestAlterTable extends SparkCatalogTestBase { - private final TableIdentifier renamedIdent = TableIdentifier.of(Namespace.of("default"), "table2"); + private final TableIdentifier renamedIdent = + TableIdentifier.of(Namespace.of("default"), "table2"); public TestAlterTable(String catalogName, String implementation, Map config) { super(catalogName, implementation, config); @@ -55,39 +55,53 @@ public void removeTable() { @Test public void testAddColumnNotNull() { - AssertHelpers.assertThrows("Should reject adding NOT NULL column", - SparkException.class, "Incompatible change: cannot add required column", + AssertHelpers.assertThrows( + "Should reject adding NOT NULL column", + SparkException.class, + "Incompatible change: cannot add required column", () -> sql("ALTER TABLE %s ADD COLUMN c3 INT NOT NULL", tableName)); } @Test public void testAddColumn() { - sql("ALTER TABLE %s ADD COLUMN point struct AFTER id", tableName); - - Types.StructType expectedSchema = Types.StructType.of( - NestedField.required(1, "id", Types.LongType.get()), - NestedField.optional(3, "point", Types.StructType.of( - NestedField.required(4, "x", Types.DoubleType.get()), - NestedField.required(5, "y", Types.DoubleType.get()) - )), - NestedField.optional(2, "data", Types.StringType.get())); - - Assert.assertEquals("Schema should match expected", - expectedSchema, validationCatalog.loadTable(tableIdent).schema().asStruct()); + sql( + "ALTER TABLE %s ADD COLUMN point struct AFTER id", + tableName); + + Types.StructType expectedSchema = + Types.StructType.of( + NestedField.required(1, "id", Types.LongType.get()), + NestedField.optional( + 3, + "point", + Types.StructType.of( + NestedField.required(4, "x", Types.DoubleType.get()), + NestedField.required(5, "y", Types.DoubleType.get()))), + NestedField.optional(2, "data", Types.StringType.get())); + + Assert.assertEquals( + "Schema should match expected", + expectedSchema, + validationCatalog.loadTable(tableIdent).schema().asStruct()); sql("ALTER TABLE %s ADD COLUMN point.z double COMMENT 'May be null' FIRST", tableName); - Types.StructType expectedSchema2 = Types.StructType.of( - NestedField.required(1, "id", Types.LongType.get()), - NestedField.optional(3, "point", Types.StructType.of( - NestedField.optional(6, "z", Types.DoubleType.get(), "May be null"), - NestedField.required(4, "x", Types.DoubleType.get()), - NestedField.required(5, "y", Types.DoubleType.get()) - )), - NestedField.optional(2, "data", Types.StringType.get())); - - Assert.assertEquals("Schema should match expected", - expectedSchema2, validationCatalog.loadTable(tableIdent).schema().asStruct()); + Types.StructType expectedSchema2 = + Types.StructType.of( + NestedField.required(1, "id", Types.LongType.get()), + NestedField.optional( + 3, + "point", + Types.StructType.of( + NestedField.optional(6, "z", Types.DoubleType.get(), "May be null"), + NestedField.required(4, "x", Types.DoubleType.get()), + NestedField.required(5, "y", Types.DoubleType.get()))), + NestedField.optional(2, "data", Types.StringType.get())); + + Assert.assertEquals( + "Schema should match expected", + expectedSchema2, + validationCatalog.loadTable(tableIdent).schema().asStruct()); } @Test @@ -95,19 +109,24 @@ public void testAddColumnWithArray() { sql("ALTER TABLE %s ADD COLUMN data2 array>", tableName); // use the implicit column name 'element' to access member of array and add column d to struct. 
sql("ALTER TABLE %s ADD COLUMN data2.element.d int", tableName); - Types.StructType expectedSchema = Types.StructType.of( - NestedField.required(1, "id", Types.LongType.get()), - NestedField.optional(2, "data", Types.StringType.get()), - NestedField.optional(3, "data2", Types.ListType.ofOptional( - 4, - Types.StructType.of( - NestedField.optional(5, "a", Types.IntegerType.get()), - NestedField.optional(6, "b", Types.IntegerType.get()), - NestedField.optional(7, "c", Types.IntegerType.get()), - NestedField.optional(8, "d", Types.IntegerType.get())) - ))); - Assert.assertEquals("Schema should match expected", - expectedSchema, validationCatalog.loadTable(tableIdent).schema().asStruct()); + Types.StructType expectedSchema = + Types.StructType.of( + NestedField.required(1, "id", Types.LongType.get()), + NestedField.optional(2, "data", Types.StringType.get()), + NestedField.optional( + 3, + "data2", + Types.ListType.ofOptional( + 4, + Types.StructType.of( + NestedField.optional(5, "a", Types.IntegerType.get()), + NestedField.optional(6, "b", Types.IntegerType.get()), + NestedField.optional(7, "c", Types.IntegerType.get()), + NestedField.optional(8, "d", Types.IntegerType.get()))))); + Assert.assertEquals( + "Schema should match expected", + expectedSchema, + validationCatalog.loadTable(tableIdent).schema().asStruct()); } @Test @@ -116,25 +135,31 @@ public void testAddColumnWithMap() { // use the implicit column name 'key' and 'value' to access member of map. // add column to value struct column sql("ALTER TABLE %s ADD COLUMN data2.value.c int", tableName); - Types.StructType expectedSchema = Types.StructType.of( - NestedField.required(1, "id", Types.LongType.get()), - NestedField.optional(2, "data", Types.StringType.get()), - NestedField.optional(3, "data2", Types.MapType.ofOptional( - 4, - 5, - Types.StructType.of( - NestedField.optional(6, "x", Types.IntegerType.get())), - Types.StructType.of( - NestedField.optional(7, "a", Types.IntegerType.get()), - NestedField.optional(8, "b", Types.IntegerType.get()), - NestedField.optional(9, "c", Types.IntegerType.get())) - ))); - Assert.assertEquals("Schema should match expected", - expectedSchema, validationCatalog.loadTable(tableIdent).schema().asStruct()); + Types.StructType expectedSchema = + Types.StructType.of( + NestedField.required(1, "id", Types.LongType.get()), + NestedField.optional(2, "data", Types.StringType.get()), + NestedField.optional( + 3, + "data2", + Types.MapType.ofOptional( + 4, + 5, + Types.StructType.of(NestedField.optional(6, "x", Types.IntegerType.get())), + Types.StructType.of( + NestedField.optional(7, "a", Types.IntegerType.get()), + NestedField.optional(8, "b", Types.IntegerType.get()), + NestedField.optional(9, "c", Types.IntegerType.get()))))); + Assert.assertEquals( + "Schema should match expected", + expectedSchema, + validationCatalog.loadTable(tableIdent).schema().asStruct()); // should not allow changing map key column - AssertHelpers.assertThrows("Should reject changing key of the map column", - SparkException.class, "Unsupported table change: Cannot add fields to map keys:", + AssertHelpers.assertThrows( + "Should reject changing key of the map column", + SparkException.class, + "Unsupported table change: Cannot add fields to map keys:", () -> sql("ALTER TABLE %s ADD COLUMN data2.key.y int", tableName)); } @@ -142,35 +167,43 @@ public void testAddColumnWithMap() { public void testDropColumn() { sql("ALTER TABLE %s DROP COLUMN data", tableName); - Types.StructType expectedSchema = Types.StructType.of( - 
NestedField.required(1, "id", Types.LongType.get())); + Types.StructType expectedSchema = + Types.StructType.of(NestedField.required(1, "id", Types.LongType.get())); - Assert.assertEquals("Schema should match expected", - expectedSchema, validationCatalog.loadTable(tableIdent).schema().asStruct()); + Assert.assertEquals( + "Schema should match expected", + expectedSchema, + validationCatalog.loadTable(tableIdent).schema().asStruct()); } @Test public void testRenameColumn() { sql("ALTER TABLE %s RENAME COLUMN id TO row_id", tableName); - Types.StructType expectedSchema = Types.StructType.of( - NestedField.required(1, "row_id", Types.LongType.get()), - NestedField.optional(2, "data", Types.StringType.get())); + Types.StructType expectedSchema = + Types.StructType.of( + NestedField.required(1, "row_id", Types.LongType.get()), + NestedField.optional(2, "data", Types.StringType.get())); - Assert.assertEquals("Schema should match expected", - expectedSchema, validationCatalog.loadTable(tableIdent).schema().asStruct()); + Assert.assertEquals( + "Schema should match expected", + expectedSchema, + validationCatalog.loadTable(tableIdent).schema().asStruct()); } @Test public void testAlterColumnComment() { sql("ALTER TABLE %s ALTER COLUMN id COMMENT 'Record id'", tableName); - Types.StructType expectedSchema = Types.StructType.of( - NestedField.required(1, "id", Types.LongType.get(), "Record id"), - NestedField.optional(2, "data", Types.StringType.get())); + Types.StructType expectedSchema = + Types.StructType.of( + NestedField.required(1, "id", Types.LongType.get(), "Record id"), + NestedField.optional(2, "data", Types.StringType.get())); - Assert.assertEquals("Schema should match expected", - expectedSchema, validationCatalog.loadTable(tableIdent).schema().asStruct()); + Assert.assertEquals( + "Schema should match expected", + expectedSchema, + validationCatalog.loadTable(tableIdent).schema().asStruct()); } @Test @@ -178,25 +211,31 @@ public void testAlterColumnType() { sql("ALTER TABLE %s ADD COLUMN count int", tableName); sql("ALTER TABLE %s ALTER COLUMN count TYPE bigint", tableName); - Types.StructType expectedSchema = Types.StructType.of( - NestedField.required(1, "id", Types.LongType.get()), - NestedField.optional(2, "data", Types.StringType.get()), - NestedField.optional(3, "count", Types.LongType.get())); + Types.StructType expectedSchema = + Types.StructType.of( + NestedField.required(1, "id", Types.LongType.get()), + NestedField.optional(2, "data", Types.StringType.get()), + NestedField.optional(3, "count", Types.LongType.get())); - Assert.assertEquals("Schema should match expected", - expectedSchema, validationCatalog.loadTable(tableIdent).schema().asStruct()); + Assert.assertEquals( + "Schema should match expected", + expectedSchema, + validationCatalog.loadTable(tableIdent).schema().asStruct()); } @Test public void testAlterColumnDropNotNull() { sql("ALTER TABLE %s ALTER COLUMN id DROP NOT NULL", tableName); - Types.StructType expectedSchema = Types.StructType.of( - NestedField.optional(1, "id", Types.LongType.get()), - NestedField.optional(2, "data", Types.StringType.get())); + Types.StructType expectedSchema = + Types.StructType.of( + NestedField.optional(1, "id", Types.LongType.get()), + NestedField.optional(2, "data", Types.StringType.get())); - Assert.assertEquals("Schema should match expected", - expectedSchema, validationCatalog.loadTable(tableIdent).schema().asStruct()); + Assert.assertEquals( + "Schema should match expected", + expectedSchema, + 
validationCatalog.loadTable(tableIdent).schema().asStruct()); } @Test @@ -204,15 +243,20 @@ public void testAlterColumnSetNotNull() { // no-op changes are allowed sql("ALTER TABLE %s ALTER COLUMN id SET NOT NULL", tableName); - Types.StructType expectedSchema = Types.StructType.of( - NestedField.required(1, "id", Types.LongType.get()), - NestedField.optional(2, "data", Types.StringType.get())); + Types.StructType expectedSchema = + Types.StructType.of( + NestedField.required(1, "id", Types.LongType.get()), + NestedField.optional(2, "data", Types.StringType.get())); - Assert.assertEquals("Schema should match expected", - expectedSchema, validationCatalog.loadTable(tableIdent).schema().asStruct()); + Assert.assertEquals( + "Schema should match expected", + expectedSchema, + validationCatalog.loadTable(tableIdent).schema().asStruct()); - AssertHelpers.assertThrows("Should reject adding NOT NULL constraint to an optional column", - AnalysisException.class, "Cannot change nullable column to non-nullable: data", + AssertHelpers.assertThrows( + "Should reject adding NOT NULL constraint to an optional column", + AnalysisException.class, + "Cannot change nullable column to non-nullable: data", () -> sql("ALTER TABLE %s ALTER COLUMN data SET NOT NULL", tableName)); } @@ -221,13 +265,16 @@ public void testAlterColumnPositionAfter() { sql("ALTER TABLE %s ADD COLUMN count int", tableName); sql("ALTER TABLE %s ALTER COLUMN count AFTER id", tableName); - Types.StructType expectedSchema = Types.StructType.of( - NestedField.required(1, "id", Types.LongType.get()), - NestedField.optional(3, "count", Types.IntegerType.get()), - NestedField.optional(2, "data", Types.StringType.get())); + Types.StructType expectedSchema = + Types.StructType.of( + NestedField.required(1, "id", Types.LongType.get()), + NestedField.optional(3, "count", Types.IntegerType.get()), + NestedField.optional(2, "data", Types.StringType.get())); - Assert.assertEquals("Schema should match expected", - expectedSchema, validationCatalog.loadTable(tableIdent).schema().asStruct()); + Assert.assertEquals( + "Schema should match expected", + expectedSchema, + validationCatalog.loadTable(tableIdent).schema().asStruct()); } @Test @@ -235,18 +282,22 @@ public void testAlterColumnPositionFirst() { sql("ALTER TABLE %s ADD COLUMN count int", tableName); sql("ALTER TABLE %s ALTER COLUMN count FIRST", tableName); - Types.StructType expectedSchema = Types.StructType.of( - NestedField.optional(3, "count", Types.IntegerType.get()), - NestedField.required(1, "id", Types.LongType.get()), - NestedField.optional(2, "data", Types.StringType.get())); + Types.StructType expectedSchema = + Types.StructType.of( + NestedField.optional(3, "count", Types.IntegerType.get()), + NestedField.required(1, "id", Types.LongType.get()), + NestedField.optional(2, "data", Types.StringType.get())); - Assert.assertEquals("Schema should match expected", - expectedSchema, validationCatalog.loadTable(tableIdent).schema().asStruct()); + Assert.assertEquals( + "Schema should match expected", + expectedSchema, + validationCatalog.loadTable(tableIdent).schema().asStruct()); } @Test public void testTableRename() { - Assume.assumeFalse("Hadoop catalog does not support rename", validationCatalog instanceof HadoopCatalog); + Assume.assumeFalse( + "Hadoop catalog does not support rename", validationCatalog instanceof HadoopCatalog); Assert.assertTrue("Initial name should exist", validationCatalog.tableExists(tableIdent)); Assert.assertFalse("New name should not exist", 
validationCatalog.tableExists(renamedIdent)); @@ -261,15 +312,19 @@ public void testTableRename() { public void testSetTableProperties() { sql("ALTER TABLE %s SET TBLPROPERTIES ('prop'='value')", tableName); - Assert.assertEquals("Should have the new table property", - "value", validationCatalog.loadTable(tableIdent).properties().get("prop")); + Assert.assertEquals( + "Should have the new table property", + "value", + validationCatalog.loadTable(tableIdent).properties().get("prop")); sql("ALTER TABLE %s UNSET TBLPROPERTIES ('prop')", tableName); - Assert.assertNull("Should not have the removed table property", + Assert.assertNull( + "Should not have the removed table property", validationCatalog.loadTable(tableIdent).properties().get("prop")); - AssertHelpers.assertThrows("Cannot specify the 'sort-order' because it's a reserved table property", + AssertHelpers.assertThrows( + "Cannot specify the 'sort-order' because it's a reserved table property", UnsupportedOperationException.class, () -> sql("ALTER TABLE %s SET TBLPROPERTIES ('sort-order'='value')", tableName)); } diff --git a/spark/v3.3/spark/src/test/java/org/apache/iceberg/spark/sql/TestCreateTable.java b/spark/v3.3/spark/src/test/java/org/apache/iceberg/spark/sql/TestCreateTable.java index 986098543d25..1411c83ddc65 100644 --- a/spark/v3.3/spark/src/test/java/org/apache/iceberg/spark/sql/TestCreateTable.java +++ b/spark/v3.3/spark/src/test/java/org/apache/iceberg/spark/sql/TestCreateTable.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.sql; import java.io.File; @@ -52,11 +51,15 @@ public void dropTestTable() { @Test public void testTransformIgnoreCase() { Assert.assertFalse("Table should not already exist", validationCatalog.tableExists(tableIdent)); - sql("CREATE TABLE IF NOT EXISTS %s (id BIGINT NOT NULL, ts timestamp) " + - "USING iceberg partitioned by (HOURS(ts))", tableName); + sql( + "CREATE TABLE IF NOT EXISTS %s (id BIGINT NOT NULL, ts timestamp) " + + "USING iceberg partitioned by (HOURS(ts))", + tableName); Assert.assertTrue("Table should already exist", validationCatalog.tableExists(tableIdent)); - sql("CREATE TABLE IF NOT EXISTS %s (id BIGINT NOT NULL, ts timestamp) " + - "USING iceberg partitioned by (hours(ts))", tableName); + sql( + "CREATE TABLE IF NOT EXISTS %s (id BIGINT NOT NULL, ts timestamp) " + + "USING iceberg partitioned by (hours(ts))", + tableName); Assert.assertTrue("Table should already exist", validationCatalog.tableExists(tableIdent)); } @@ -69,18 +72,22 @@ public void testCreateTable() { Table table = validationCatalog.loadTable(tableIdent); Assert.assertNotNull("Should load the new table", table); - StructType expectedSchema = StructType.of( - NestedField.required(1, "id", Types.LongType.get()), - NestedField.optional(2, "data", Types.StringType.get())); - Assert.assertEquals("Should have the expected schema", expectedSchema, table.schema().asStruct()); + StructType expectedSchema = + StructType.of( + NestedField.required(1, "id", Types.LongType.get()), + NestedField.optional(2, "data", Types.StringType.get())); + Assert.assertEquals( + "Should have the expected schema", expectedSchema, table.schema().asStruct()); Assert.assertEquals("Should not be partitioned", 0, table.spec().fields().size()); - Assert.assertNull("Should not have the default format set", + Assert.assertNull( + "Should not have the default format set", table.properties().get(TableProperties.DEFAULT_FILE_FORMAT)); } @Test public void 
testCreateTableInRootNamespace() { - Assume.assumeTrue("Hadoop has no default namespace configured", "testhadoop".equals(catalogName)); + Assume.assumeTrue( + "Hadoop has no default namespace configured", "testhadoop".equals(catalogName)); try { sql("CREATE TABLE %s.table (id bigint) USING iceberg", catalogName); @@ -102,47 +109,61 @@ public void testCreateTableUsingParquet() { Table table = validationCatalog.loadTable(tableIdent); Assert.assertNotNull("Should load the new table", table); - StructType expectedSchema = StructType.of( - NestedField.required(1, "id", Types.LongType.get()), - NestedField.optional(2, "data", Types.StringType.get())); - Assert.assertEquals("Should have the expected schema", expectedSchema, table.schema().asStruct()); + StructType expectedSchema = + StructType.of( + NestedField.required(1, "id", Types.LongType.get()), + NestedField.optional(2, "data", Types.StringType.get())); + Assert.assertEquals( + "Should have the expected schema", expectedSchema, table.schema().asStruct()); Assert.assertEquals("Should not be partitioned", 0, table.spec().fields().size()); - Assert.assertEquals("Should not have default format parquet", + Assert.assertEquals( + "Should not have default format parquet", "parquet", table.properties().get(TableProperties.DEFAULT_FILE_FORMAT)); - AssertHelpers.assertThrows("Should reject unsupported format names", - IllegalArgumentException.class, "Unsupported format in USING: crocodile", - () -> sql("CREATE TABLE %s.default.fail (id BIGINT NOT NULL, data STRING) USING crocodile", catalogName)); + AssertHelpers.assertThrows( + "Should reject unsupported format names", + IllegalArgumentException.class, + "Unsupported format in USING: crocodile", + () -> + sql( + "CREATE TABLE %s.default.fail (id BIGINT NOT NULL, data STRING) USING crocodile", + catalogName)); } @Test public void testCreateTablePartitionedBy() { Assert.assertFalse("Table should not already exist", validationCatalog.tableExists(tableIdent)); - sql("CREATE TABLE %s " + - "(id BIGINT NOT NULL, created_at TIMESTAMP, category STRING, data STRING) " + - "USING iceberg " + - "PARTITIONED BY (category, bucket(8, id), days(created_at))", tableName); + sql( + "CREATE TABLE %s " + + "(id BIGINT NOT NULL, created_at TIMESTAMP, category STRING, data STRING) " + + "USING iceberg " + + "PARTITIONED BY (category, bucket(8, id), days(created_at))", + tableName); Table table = validationCatalog.loadTable(tableIdent); Assert.assertNotNull("Should load the new table", table); - StructType expectedSchema = StructType.of( - NestedField.required(1, "id", Types.LongType.get()), - NestedField.optional(2, "created_at", Types.TimestampType.withZone()), - NestedField.optional(3, "category", Types.StringType.get()), - NestedField.optional(4, "data", Types.StringType.get())); - Assert.assertEquals("Should have the expected schema", expectedSchema, table.schema().asStruct()); - - PartitionSpec expectedSpec = PartitionSpec.builderFor(new Schema(expectedSchema.fields())) - .identity("category") - .bucket("id", 8) - .day("created_at") - .build(); + StructType expectedSchema = + StructType.of( + NestedField.required(1, "id", Types.LongType.get()), + NestedField.optional(2, "created_at", Types.TimestampType.withZone()), + NestedField.optional(3, "category", Types.StringType.get()), + NestedField.optional(4, "data", Types.StringType.get())); + Assert.assertEquals( + "Should have the expected schema", expectedSchema, table.schema().asStruct()); + + PartitionSpec expectedSpec = + PartitionSpec.builderFor(new 
Schema(expectedSchema.fields())) + .identity("category") + .bucket("id", 8) + .day("created_at") + .build(); Assert.assertEquals("Should be partitioned correctly", expectedSpec, table.spec()); - Assert.assertNull("Should not have the default format set", + Assert.assertNull( + "Should not have the default format set", table.properties().get(TableProperties.DEFAULT_FILE_FORMAT)); } @@ -150,20 +171,24 @@ public void testCreateTablePartitionedBy() { public void testCreateTableColumnComments() { Assert.assertFalse("Table should not already exist", validationCatalog.tableExists(tableIdent)); - sql("CREATE TABLE %s " + - "(id BIGINT NOT NULL COMMENT 'Unique identifier', data STRING COMMENT 'Data value') " + - "USING iceberg", + sql( + "CREATE TABLE %s " + + "(id BIGINT NOT NULL COMMENT 'Unique identifier', data STRING COMMENT 'Data value') " + + "USING iceberg", tableName); Table table = validationCatalog.loadTable(tableIdent); Assert.assertNotNull("Should load the new table", table); - StructType expectedSchema = StructType.of( - NestedField.required(1, "id", Types.LongType.get(), "Unique identifier"), - NestedField.optional(2, "data", Types.StringType.get(), "Data value")); - Assert.assertEquals("Should have the expected schema", expectedSchema, table.schema().asStruct()); + StructType expectedSchema = + StructType.of( + NestedField.required(1, "id", Types.LongType.get(), "Unique identifier"), + NestedField.optional(2, "data", Types.StringType.get(), "Data value")); + Assert.assertEquals( + "Should have the expected schema", expectedSchema, table.schema().asStruct()); Assert.assertEquals("Should not be partitioned", 0, table.spec().fields().size()); - Assert.assertNull("Should not have the default format set", + Assert.assertNull( + "Should not have the default format set", table.properties().get(TableProperties.DEFAULT_FILE_FORMAT)); } @@ -171,24 +196,30 @@ public void testCreateTableColumnComments() { public void testCreateTableComment() { Assert.assertFalse("Table should not already exist", validationCatalog.tableExists(tableIdent)); - sql("CREATE TABLE %s " + - "(id BIGINT NOT NULL, data STRING) " + - "USING iceberg " + - "COMMENT 'Table doc'", + sql( + "CREATE TABLE %s " + + "(id BIGINT NOT NULL, data STRING) " + + "USING iceberg " + + "COMMENT 'Table doc'", tableName); Table table = validationCatalog.loadTable(tableIdent); Assert.assertNotNull("Should load the new table", table); - StructType expectedSchema = StructType.of( - NestedField.required(1, "id", Types.LongType.get()), - NestedField.optional(2, "data", Types.StringType.get())); - Assert.assertEquals("Should have the expected schema", expectedSchema, table.schema().asStruct()); + StructType expectedSchema = + StructType.of( + NestedField.required(1, "id", Types.LongType.get()), + NestedField.optional(2, "data", Types.StringType.get())); + Assert.assertEquals( + "Should have the expected schema", expectedSchema, table.schema().asStruct()); Assert.assertEquals("Should not be partitioned", 0, table.spec().fields().size()); - Assert.assertNull("Should not have the default format set", + Assert.assertNull( + "Should not have the default format set", table.properties().get(TableProperties.DEFAULT_FILE_FORMAT)); - Assert.assertEquals("Should have the table comment set in properties", - "Table doc", table.properties().get(TableCatalog.PROP_COMMENT)); + Assert.assertEquals( + "Should have the table comment set in properties", + "Table doc", + table.properties().get(TableCatalog.PROP_COMMENT)); } @Test @@ -204,43 +235,49 @@ public void 
testCreateTableLocation() throws Exception { String location = "file:" + tableLocation.toString(); - sql("CREATE TABLE %s " + - "(id BIGINT NOT NULL, data STRING) " + - "USING iceberg " + - "LOCATION '%s'", + sql( + "CREATE TABLE %s " + + "(id BIGINT NOT NULL, data STRING) " + + "USING iceberg " + + "LOCATION '%s'", tableName, location); Table table = validationCatalog.loadTable(tableIdent); Assert.assertNotNull("Should load the new table", table); - StructType expectedSchema = StructType.of( - NestedField.required(1, "id", Types.LongType.get()), - NestedField.optional(2, "data", Types.StringType.get())); - Assert.assertEquals("Should have the expected schema", expectedSchema, table.schema().asStruct()); + StructType expectedSchema = + StructType.of( + NestedField.required(1, "id", Types.LongType.get()), + NestedField.optional(2, "data", Types.StringType.get())); + Assert.assertEquals( + "Should have the expected schema", expectedSchema, table.schema().asStruct()); Assert.assertEquals("Should not be partitioned", 0, table.spec().fields().size()); - Assert.assertNull("Should not have the default format set", + Assert.assertNull( + "Should not have the default format set", table.properties().get(TableProperties.DEFAULT_FILE_FORMAT)); - Assert.assertEquals("Should have a custom table location", - location, table.location()); + Assert.assertEquals("Should have a custom table location", location, table.location()); } @Test public void testCreateTableProperties() { Assert.assertFalse("Table should not already exist", validationCatalog.tableExists(tableIdent)); - sql("CREATE TABLE %s " + - "(id BIGINT NOT NULL, data STRING) " + - "USING iceberg " + - "TBLPROPERTIES (p1=2, p2='x')", + sql( + "CREATE TABLE %s " + + "(id BIGINT NOT NULL, data STRING) " + + "USING iceberg " + + "TBLPROPERTIES (p1=2, p2='x')", tableName); Table table = validationCatalog.loadTable(tableIdent); Assert.assertNotNull("Should load the new table", table); - StructType expectedSchema = StructType.of( - NestedField.required(1, "id", Types.LongType.get()), - NestedField.optional(2, "data", Types.StringType.get())); - Assert.assertEquals("Should have the expected schema", expectedSchema, table.schema().asStruct()); + StructType expectedSchema = + StructType.of( + NestedField.required(1, "id", Types.LongType.get()), + NestedField.optional(2, "data", Types.StringType.get())); + Assert.assertEquals( + "Should have the expected schema", expectedSchema, table.schema().asStruct()); Assert.assertEquals("Should not be partitioned", 0, table.spec().fields().size()); Assert.assertEquals("Should have property p1", "2", table.properties().get("p1")); Assert.assertEquals("Should have property p2", "x", table.properties().get("p2")); @@ -250,53 +287,56 @@ public void testCreateTableProperties() { public void testCreateTableWithFormatV2ThroughTableProperty() { Assert.assertFalse("Table should not already exist", validationCatalog.tableExists(tableIdent)); - sql("CREATE TABLE %s " + - "(id BIGINT NOT NULL, data STRING) " + - "USING iceberg " + - "TBLPROPERTIES ('format-version'='2')", + sql( + "CREATE TABLE %s " + + "(id BIGINT NOT NULL, data STRING) " + + "USING iceberg " + + "TBLPROPERTIES ('format-version'='2')", tableName); Table table = validationCatalog.loadTable(tableIdent); - Assert.assertEquals("should create table using format v2", - 2, ((BaseTable) table).operations().current().formatVersion()); + Assert.assertEquals( + "should create table using format v2", + 2, + ((BaseTable) table).operations().current().formatVersion()); } @Test 
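// Illustrative sketch of the programmatic path behind the format-version tests around this
// hunk: the reserved 'format-version' table property is committed through UpdateProperties
// and the resulting version is read back from refreshed table metadata, mirroring
// ALTER TABLE ... SET TBLPROPERTIES ('format-version'='2'). `catalog` and `ident` are
// placeholders, not fixtures from these tests.
import org.apache.iceberg.BaseTable;
import org.apache.iceberg.Table;
import org.apache.iceberg.TableOperations;
import org.apache.iceberg.catalog.Catalog;
import org.apache.iceberg.catalog.TableIdentifier;

class FormatVersionUpgradeSketch {
  static int upgradeToV2(Catalog catalog, TableIdentifier ident) {
    Table table = catalog.loadTable(ident);
    // same upgrade the SQL test performs through the reserved table property
    table.updateProperties().set("format-version", "2").commit();
    TableOperations ops = ((BaseTable) table).operations();
    return ops.refresh().formatVersion(); // expected to be 2 after the commit
  }
}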
public void testUpgradeTableWithFormatV2ThroughTableProperty() { Assert.assertFalse("Table should not already exist", validationCatalog.tableExists(tableIdent)); - sql("CREATE TABLE %s " + - "(id BIGINT NOT NULL, data STRING) " + - "USING iceberg " + - "TBLPROPERTIES ('format-version'='1')", + sql( + "CREATE TABLE %s " + + "(id BIGINT NOT NULL, data STRING) " + + "USING iceberg " + + "TBLPROPERTIES ('format-version'='1')", tableName); Table table = validationCatalog.loadTable(tableIdent); TableOperations ops = ((BaseTable) table).operations(); - Assert.assertEquals("should create table using format v1", - 1, ops.refresh().formatVersion()); + Assert.assertEquals("should create table using format v1", 1, ops.refresh().formatVersion()); sql("ALTER TABLE %s SET TBLPROPERTIES ('format-version'='2')", tableName); - Assert.assertEquals("should update table to use format v2", - 2, ops.refresh().formatVersion()); + Assert.assertEquals("should update table to use format v2", 2, ops.refresh().formatVersion()); } @Test public void testDowngradeTableToFormatV1ThroughTablePropertyFails() { Assert.assertFalse("Table should not already exist", validationCatalog.tableExists(tableIdent)); - sql("CREATE TABLE %s " + - "(id BIGINT NOT NULL, data STRING) " + - "USING iceberg " + - "TBLPROPERTIES ('format-version'='2')", + sql( + "CREATE TABLE %s " + + "(id BIGINT NOT NULL, data STRING) " + + "USING iceberg " + + "TBLPROPERTIES ('format-version'='2')", tableName); Table table = validationCatalog.loadTable(tableIdent); TableOperations ops = ((BaseTable) table).operations(); - Assert.assertEquals("should create table using format v2", - 2, ops.refresh().formatVersion()); + Assert.assertEquals("should create table using format v2", 2, ops.refresh().formatVersion()); - AssertHelpers.assertThrowsCause("should fail to downgrade to v1", + AssertHelpers.assertThrowsCause( + "should fail to downgrade to v1", IllegalArgumentException.class, "Cannot downgrade v2 table to v1", () -> sql("ALTER TABLE %s SET TBLPROPERTIES ('format-version'='1')", tableName)); diff --git a/spark/v3.3/spark/src/test/java/org/apache/iceberg/spark/sql/TestCreateTableAsSelect.java b/spark/v3.3/spark/src/test/java/org/apache/iceberg/spark/sql/TestCreateTableAsSelect.java index 4a70327e21a1..2581c0fd3c56 100644 --- a/spark/v3.3/spark/src/test/java/org/apache/iceberg/spark/sql/TestCreateTableAsSelect.java +++ b/spark/v3.3/spark/src/test/java/org/apache/iceberg/spark/sql/TestCreateTableAsSelect.java @@ -16,9 +16,12 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.spark.sql; +import static org.apache.spark.sql.functions.col; +import static org.apache.spark.sql.functions.lit; +import static org.apache.spark.sql.functions.when; + import java.util.Map; import org.apache.iceberg.PartitionSpec; import org.apache.iceberg.Schema; @@ -30,20 +33,19 @@ import org.junit.Assert; import org.junit.Test; -import static org.apache.spark.sql.functions.col; -import static org.apache.spark.sql.functions.lit; -import static org.apache.spark.sql.functions.when; - public class TestCreateTableAsSelect extends SparkCatalogTestBase { private final String sourceName; - public TestCreateTableAsSelect(String catalogName, String implementation, Map config) { + public TestCreateTableAsSelect( + String catalogName, String implementation, Map config) { super(catalogName, implementation, config); this.sourceName = tableName("source"); - sql("CREATE TABLE IF NOT EXISTS %s (id bigint NOT NULL, data string) " + - "USING iceberg PARTITIONED BY (truncate(id, 3))", sourceName); + sql( + "CREATE TABLE IF NOT EXISTS %s (id bigint NOT NULL, data string) " + + "USING iceberg PARTITIONED BY (truncate(id, 3))", + sourceName); sql("INSERT INTO %s VALUES (1, 'a'), (2, 'b'), (3, 'c')", sourceName); } @@ -56,153 +58,178 @@ public void removeTables() { public void testUnpartitionedCTAS() { sql("CREATE TABLE %s USING iceberg AS SELECT * FROM %s", tableName, sourceName); - Schema expectedSchema = new Schema( - Types.NestedField.optional(1, "id", Types.LongType.get()), - Types.NestedField.optional(2, "data", Types.StringType.get()) - ); + Schema expectedSchema = + new Schema( + Types.NestedField.optional(1, "id", Types.LongType.get()), + Types.NestedField.optional(2, "data", Types.StringType.get())); Table ctasTable = validationCatalog.loadTable(tableIdent); - Assert.assertEquals("Should have expected nullable schema", - expectedSchema.asStruct(), ctasTable.schema().asStruct()); - Assert.assertEquals("Should be an unpartitioned table", - 0, ctasTable.spec().fields().size()); - assertEquals("Should have rows matching the source table", + Assert.assertEquals( + "Should have expected nullable schema", + expectedSchema.asStruct(), + ctasTable.schema().asStruct()); + Assert.assertEquals("Should be an unpartitioned table", 0, ctasTable.spec().fields().size()); + assertEquals( + "Should have rows matching the source table", sql("SELECT * FROM %s ORDER BY id", sourceName), sql("SELECT * FROM %s ORDER BY id", tableName)); } @Test public void testPartitionedCTAS() { - sql("CREATE TABLE %s USING iceberg PARTITIONED BY (id) AS SELECT * FROM %s ORDER BY id", tableName, sourceName); + sql( + "CREATE TABLE %s USING iceberg PARTITIONED BY (id) AS SELECT * FROM %s ORDER BY id", + tableName, sourceName); - Schema expectedSchema = new Schema( - Types.NestedField.optional(1, "id", Types.LongType.get()), - Types.NestedField.optional(2, "data", Types.StringType.get()) - ); + Schema expectedSchema = + new Schema( + Types.NestedField.optional(1, "id", Types.LongType.get()), + Types.NestedField.optional(2, "data", Types.StringType.get())); - PartitionSpec expectedSpec = PartitionSpec.builderFor(expectedSchema) - .identity("id") - .build(); + PartitionSpec expectedSpec = PartitionSpec.builderFor(expectedSchema).identity("id").build(); Table ctasTable = validationCatalog.loadTable(tableIdent); - Assert.assertEquals("Should have expected nullable schema", - expectedSchema.asStruct(), ctasTable.schema().asStruct()); - Assert.assertEquals("Should be partitioned by id", - expectedSpec, 
ctasTable.spec()); - assertEquals("Should have rows matching the source table", + Assert.assertEquals( + "Should have expected nullable schema", + expectedSchema.asStruct(), + ctasTable.schema().asStruct()); + Assert.assertEquals("Should be partitioned by id", expectedSpec, ctasTable.spec()); + assertEquals( + "Should have rows matching the source table", sql("SELECT * FROM %s ORDER BY id", sourceName), sql("SELECT * FROM %s ORDER BY id", tableName)); } @Test public void testRTAS() { - sql("CREATE TABLE %s USING iceberg TBLPROPERTIES ('prop1'='val1', 'prop2'='val2')" + - "AS SELECT * FROM %s", tableName, sourceName); + sql( + "CREATE TABLE %s USING iceberg TBLPROPERTIES ('prop1'='val1', 'prop2'='val2')" + + "AS SELECT * FROM %s", + tableName, sourceName); - assertEquals("Should have rows matching the source table", + assertEquals( + "Should have rows matching the source table", sql("SELECT * FROM %s ORDER BY id", sourceName), sql("SELECT * FROM %s ORDER BY id", tableName)); - sql("REPLACE TABLE %s USING iceberg PARTITIONED BY (part) TBLPROPERTIES ('prop1'='newval1', 'prop3'='val3') AS " + - "SELECT id, data, CASE WHEN (id %% 2) = 0 THEN 'even' ELSE 'odd' END AS part " + - "FROM %s ORDER BY 3, 1", tableName, sourceName); + sql( + "REPLACE TABLE %s USING iceberg PARTITIONED BY (part) TBLPROPERTIES ('prop1'='newval1', 'prop3'='val3') AS " + + "SELECT id, data, CASE WHEN (id %% 2) = 0 THEN 'even' ELSE 'odd' END AS part " + + "FROM %s ORDER BY 3, 1", + tableName, sourceName); - Schema expectedSchema = new Schema( - Types.NestedField.optional(1, "id", Types.LongType.get()), - Types.NestedField.optional(2, "data", Types.StringType.get()), - Types.NestedField.optional(3, "part", Types.StringType.get()) - ); + Schema expectedSchema = + new Schema( + Types.NestedField.optional(1, "id", Types.LongType.get()), + Types.NestedField.optional(2, "data", Types.StringType.get()), + Types.NestedField.optional(3, "part", Types.StringType.get())); - PartitionSpec expectedSpec = PartitionSpec.builderFor(expectedSchema) - .identity("part") - .withSpecId(1) - .build(); + PartitionSpec expectedSpec = + PartitionSpec.builderFor(expectedSchema).identity("part").withSpecId(1).build(); Table rtasTable = validationCatalog.loadTable(tableIdent); // the replacement table has a different schema and partition spec than the original - Assert.assertEquals("Should have expected nullable schema", - expectedSchema.asStruct(), rtasTable.schema().asStruct()); - Assert.assertEquals("Should be partitioned by part", - expectedSpec, rtasTable.spec()); - - assertEquals("Should have rows matching the source table", - sql("SELECT id, data, CASE WHEN (id %% 2) = 0 THEN 'even' ELSE 'odd' END AS part " + - "FROM %s ORDER BY id", sourceName), + Assert.assertEquals( + "Should have expected nullable schema", + expectedSchema.asStruct(), + rtasTable.schema().asStruct()); + Assert.assertEquals("Should be partitioned by part", expectedSpec, rtasTable.spec()); + + assertEquals( + "Should have rows matching the source table", + sql( + "SELECT id, data, CASE WHEN (id %% 2) = 0 THEN 'even' ELSE 'odd' END AS part " + + "FROM %s ORDER BY id", + sourceName), sql("SELECT * FROM %s ORDER BY id", tableName)); - Assert.assertEquals("Table should have expected snapshots", - 2, Iterables.size(rtasTable.snapshots())); + Assert.assertEquals( + "Table should have expected snapshots", 2, Iterables.size(rtasTable.snapshots())); - Assert.assertEquals("Should have updated table property", - "newval1", rtasTable.properties().get("prop1")); - 
Assert.assertEquals("Should have preserved table property", - "val2", rtasTable.properties().get("prop2")); - Assert.assertEquals("Should have new table property", - "val3", rtasTable.properties().get("prop3")); + Assert.assertEquals( + "Should have updated table property", "newval1", rtasTable.properties().get("prop1")); + Assert.assertEquals( + "Should have preserved table property", "val2", rtasTable.properties().get("prop2")); + Assert.assertEquals( + "Should have new table property", "val3", rtasTable.properties().get("prop3")); } @Test public void testCreateRTAS() { - sql("CREATE OR REPLACE TABLE %s USING iceberg PARTITIONED BY (part) AS " + - "SELECT id, data, CASE WHEN (id %% 2) = 0 THEN 'even' ELSE 'odd' END AS part " + - "FROM %s ORDER BY 3, 1", tableName, sourceName); - - assertEquals("Should have rows matching the source table", - sql("SELECT id, data, CASE WHEN (id %% 2) = 0 THEN 'even' ELSE 'odd' END AS part " + - "FROM %s ORDER BY id", sourceName), + sql( + "CREATE OR REPLACE TABLE %s USING iceberg PARTITIONED BY (part) AS " + + "SELECT id, data, CASE WHEN (id %% 2) = 0 THEN 'even' ELSE 'odd' END AS part " + + "FROM %s ORDER BY 3, 1", + tableName, sourceName); + + assertEquals( + "Should have rows matching the source table", + sql( + "SELECT id, data, CASE WHEN (id %% 2) = 0 THEN 'even' ELSE 'odd' END AS part " + + "FROM %s ORDER BY id", + sourceName), sql("SELECT * FROM %s ORDER BY id", tableName)); - sql("CREATE OR REPLACE TABLE %s USING iceberg PARTITIONED BY (part) AS " + - "SELECT 2 * id as id, data, CASE WHEN ((2 * id) %% 2) = 0 THEN 'even' ELSE 'odd' END AS part " + - "FROM %s ORDER BY 3, 1", tableName, sourceName); + sql( + "CREATE OR REPLACE TABLE %s USING iceberg PARTITIONED BY (part) AS " + + "SELECT 2 * id as id, data, CASE WHEN ((2 * id) %% 2) = 0 THEN 'even' ELSE 'odd' END AS part " + + "FROM %s ORDER BY 3, 1", + tableName, sourceName); - Schema expectedSchema = new Schema( - Types.NestedField.optional(1, "id", Types.LongType.get()), - Types.NestedField.optional(2, "data", Types.StringType.get()), - Types.NestedField.optional(3, "part", Types.StringType.get()) - ); + Schema expectedSchema = + new Schema( + Types.NestedField.optional(1, "id", Types.LongType.get()), + Types.NestedField.optional(2, "data", Types.StringType.get()), + Types.NestedField.optional(3, "part", Types.StringType.get())); - PartitionSpec expectedSpec = PartitionSpec.builderFor(expectedSchema) - .identity("part") - .withSpecId(0) // the spec is identical and should be reused - .build(); + PartitionSpec expectedSpec = + PartitionSpec.builderFor(expectedSchema) + .identity("part") + .withSpecId(0) // the spec is identical and should be reused + .build(); Table rtasTable = validationCatalog.loadTable(tableIdent); // the replacement table has a different schema and partition spec than the original - Assert.assertEquals("Should have expected nullable schema", - expectedSchema.asStruct(), rtasTable.schema().asStruct()); - Assert.assertEquals("Should be partitioned by part", - expectedSpec, rtasTable.spec()); - - assertEquals("Should have rows matching the source table", - sql("SELECT 2 * id, data, CASE WHEN ((2 * id) %% 2) = 0 THEN 'even' ELSE 'odd' END AS part " + - "FROM %s ORDER BY id", sourceName), + Assert.assertEquals( + "Should have expected nullable schema", + expectedSchema.asStruct(), + rtasTable.schema().asStruct()); + Assert.assertEquals("Should be partitioned by part", expectedSpec, rtasTable.spec()); + + assertEquals( + "Should have rows matching the source table", + sql( + "SELECT 2 
* id, data, CASE WHEN ((2 * id) %% 2) = 0 THEN 'even' ELSE 'odd' END AS part " + + "FROM %s ORDER BY id", + sourceName), sql("SELECT * FROM %s ORDER BY id", tableName)); - Assert.assertEquals("Table should have expected snapshots", - 2, Iterables.size(rtasTable.snapshots())); + Assert.assertEquals( + "Table should have expected snapshots", 2, Iterables.size(rtasTable.snapshots())); } @Test public void testDataFrameV2Create() throws Exception { spark.table(sourceName).writeTo(tableName).using("iceberg").create(); - Schema expectedSchema = new Schema( - Types.NestedField.optional(1, "id", Types.LongType.get()), - Types.NestedField.optional(2, "data", Types.StringType.get()) - ); + Schema expectedSchema = + new Schema( + Types.NestedField.optional(1, "id", Types.LongType.get()), + Types.NestedField.optional(2, "data", Types.StringType.get())); Table ctasTable = validationCatalog.loadTable(tableIdent); - Assert.assertEquals("Should have expected nullable schema", - expectedSchema.asStruct(), ctasTable.schema().asStruct()); - Assert.assertEquals("Should be an unpartitioned table", - 0, ctasTable.spec().fields().size()); - assertEquals("Should have rows matching the source table", + Assert.assertEquals( + "Should have expected nullable schema", + expectedSchema.asStruct(), + ctasTable.schema().asStruct()); + Assert.assertEquals("Should be an unpartitioned table", 0, ctasTable.spec().fields().size()); + assertEquals( + "Should have rows matching the source table", sql("SELECT * FROM %s ORDER BY id", sourceName), sql("SELECT * FROM %s ORDER BY id", tableName)); } @@ -211,11 +238,13 @@ public void testDataFrameV2Create() throws Exception { public void testDataFrameV2Replace() throws Exception { spark.table(sourceName).writeTo(tableName).using("iceberg").create(); - assertEquals("Should have rows matching the source table", + assertEquals( + "Should have rows matching the source table", sql("SELECT * FROM %s ORDER BY id", sourceName), sql("SELECT * FROM %s ORDER BY id", tableName)); - spark.table(sourceName) + spark + .table(sourceName) .select( col("id"), col("data"), @@ -226,37 +255,40 @@ public void testDataFrameV2Replace() throws Exception { .using("iceberg") .replace(); - Schema expectedSchema = new Schema( - Types.NestedField.optional(1, "id", Types.LongType.get()), - Types.NestedField.optional(2, "data", Types.StringType.get()), - Types.NestedField.optional(3, "part", Types.StringType.get()) - ); + Schema expectedSchema = + new Schema( + Types.NestedField.optional(1, "id", Types.LongType.get()), + Types.NestedField.optional(2, "data", Types.StringType.get()), + Types.NestedField.optional(3, "part", Types.StringType.get())); - PartitionSpec expectedSpec = PartitionSpec.builderFor(expectedSchema) - .identity("part") - .withSpecId(1) - .build(); + PartitionSpec expectedSpec = + PartitionSpec.builderFor(expectedSchema).identity("part").withSpecId(1).build(); Table rtasTable = validationCatalog.loadTable(tableIdent); // the replacement table has a different schema and partition spec than the original - Assert.assertEquals("Should have expected nullable schema", - expectedSchema.asStruct(), rtasTable.schema().asStruct()); - Assert.assertEquals("Should be partitioned by part", - expectedSpec, rtasTable.spec()); - - assertEquals("Should have rows matching the source table", - sql("SELECT id, data, CASE WHEN (id %% 2) = 0 THEN 'even' ELSE 'odd' END AS part " + - "FROM %s ORDER BY id", sourceName), + Assert.assertEquals( + "Should have expected nullable schema", + expectedSchema.asStruct(), + 
rtasTable.schema().asStruct()); + Assert.assertEquals("Should be partitioned by part", expectedSpec, rtasTable.spec()); + + assertEquals( + "Should have rows matching the source table", + sql( + "SELECT id, data, CASE WHEN (id %% 2) = 0 THEN 'even' ELSE 'odd' END AS part " + + "FROM %s ORDER BY id", + sourceName), sql("SELECT * FROM %s ORDER BY id", tableName)); - Assert.assertEquals("Table should have expected snapshots", - 2, Iterables.size(rtasTable.snapshots())); + Assert.assertEquals( + "Table should have expected snapshots", 2, Iterables.size(rtasTable.snapshots())); } @Test public void testDataFrameV2CreateOrReplace() { - spark.table(sourceName) + spark + .table(sourceName) .select( col("id"), col("data"), @@ -267,12 +299,16 @@ public void testDataFrameV2CreateOrReplace() { .using("iceberg") .createOrReplace(); - assertEquals("Should have rows matching the source table", - sql("SELECT id, data, CASE WHEN (id %% 2) = 0 THEN 'even' ELSE 'odd' END AS part " + - "FROM %s ORDER BY id", sourceName), + assertEquals( + "Should have rows matching the source table", + sql( + "SELECT id, data, CASE WHEN (id %% 2) = 0 THEN 'even' ELSE 'odd' END AS part " + + "FROM %s ORDER BY id", + sourceName), sql("SELECT * FROM %s ORDER BY id", tableName)); - spark.table(sourceName) + spark + .table(sourceName) .select(col("id").multiply(lit(2)).as("id"), col("data")) .select( col("id"), @@ -284,80 +320,97 @@ public void testDataFrameV2CreateOrReplace() { .using("iceberg") .createOrReplace(); - Schema expectedSchema = new Schema( - Types.NestedField.optional(1, "id", Types.LongType.get()), - Types.NestedField.optional(2, "data", Types.StringType.get()), - Types.NestedField.optional(3, "part", Types.StringType.get()) - ); + Schema expectedSchema = + new Schema( + Types.NestedField.optional(1, "id", Types.LongType.get()), + Types.NestedField.optional(2, "data", Types.StringType.get()), + Types.NestedField.optional(3, "part", Types.StringType.get())); - PartitionSpec expectedSpec = PartitionSpec.builderFor(expectedSchema) - .identity("part") - .withSpecId(0) // the spec is identical and should be reused - .build(); + PartitionSpec expectedSpec = + PartitionSpec.builderFor(expectedSchema) + .identity("part") + .withSpecId(0) // the spec is identical and should be reused + .build(); Table rtasTable = validationCatalog.loadTable(tableIdent); // the replacement table has a different schema and partition spec than the original - Assert.assertEquals("Should have expected nullable schema", - expectedSchema.asStruct(), rtasTable.schema().asStruct()); - Assert.assertEquals("Should be partitioned by part", - expectedSpec, rtasTable.spec()); - - assertEquals("Should have rows matching the source table", - sql("SELECT 2 * id, data, CASE WHEN ((2 * id) %% 2) = 0 THEN 'even' ELSE 'odd' END AS part " + - "FROM %s ORDER BY id", sourceName), + Assert.assertEquals( + "Should have expected nullable schema", + expectedSchema.asStruct(), + rtasTable.schema().asStruct()); + Assert.assertEquals("Should be partitioned by part", expectedSpec, rtasTable.spec()); + + assertEquals( + "Should have rows matching the source table", + sql( + "SELECT 2 * id, data, CASE WHEN ((2 * id) %% 2) = 0 THEN 'even' ELSE 'odd' END AS part " + + "FROM %s ORDER BY id", + sourceName), sql("SELECT * FROM %s ORDER BY id", tableName)); - Assert.assertEquals("Table should have expected snapshots", - 2, Iterables.size(rtasTable.snapshots())); + Assert.assertEquals( + "Table should have expected snapshots", 2, Iterables.size(rtasTable.snapshots())); } @Test 
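// Illustrative sketch of the DataFrameWriterV2 create-or-replace path exercised by the
// surrounding tests: a derived 'part' column is selected and the Iceberg table is
// (re)created partitioned by it. `spark`, `sourceName`, and `targetName` are placeholders.
import static org.apache.spark.sql.functions.col;
import static org.apache.spark.sql.functions.lit;
import static org.apache.spark.sql.functions.when;

import org.apache.spark.sql.SparkSession;

class CreateOrReplaceSketch {
  static void createOrReplacePartitioned(
      SparkSession spark, String sourceName, String targetName) {
    spark
        .table(sourceName)
        .select(
            col("id"),
            col("data"),
            when(col("id").mod(lit(2)).equalTo(lit(0)), lit("even"))
                .otherwise(lit("odd"))
                .as("part"))
        .writeTo(targetName)
        .partitionedBy(col("part"))
        .using("iceberg")
        .createOrReplace();
  }
}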
public void testCreateRTASWithPartitionSpecChanging() { - sql("CREATE OR REPLACE TABLE %s USING iceberg PARTITIONED BY (part) AS " + - "SELECT id, data, CASE WHEN (id %% 2) = 0 THEN 'even' ELSE 'odd' END AS part " + - "FROM %s ORDER BY 3, 1", tableName, sourceName); + sql( + "CREATE OR REPLACE TABLE %s USING iceberg PARTITIONED BY (part) AS " + + "SELECT id, data, CASE WHEN (id %% 2) = 0 THEN 'even' ELSE 'odd' END AS part " + + "FROM %s ORDER BY 3, 1", + tableName, sourceName); Table rtasTable = validationCatalog.loadTable(tableIdent); - assertEquals("Should have rows matching the source table", - sql("SELECT id, data, CASE WHEN (id %% 2) = 0 THEN 'even' ELSE 'odd' END AS part " + - "FROM %s ORDER BY id", sourceName), + assertEquals( + "Should have rows matching the source table", + sql( + "SELECT id, data, CASE WHEN (id %% 2) = 0 THEN 'even' ELSE 'odd' END AS part " + + "FROM %s ORDER BY id", + sourceName), sql("SELECT * FROM %s ORDER BY id", tableName)); // Change the partitioning of the table rtasTable.updateSpec().removeField("part").commit(); // Spec 1 - sql("CREATE OR REPLACE TABLE %s USING iceberg PARTITIONED BY (part, id) AS " + - "SELECT 2 * id as id, data, CASE WHEN ((2 * id) %% 2) = 0 THEN 'even' ELSE 'odd' END AS part " + - "FROM %s ORDER BY 3, 1", tableName, sourceName); + sql( + "CREATE OR REPLACE TABLE %s USING iceberg PARTITIONED BY (part, id) AS " + + "SELECT 2 * id as id, data, CASE WHEN ((2 * id) %% 2) = 0 THEN 'even' ELSE 'odd' END AS part " + + "FROM %s ORDER BY 3, 1", + tableName, sourceName); - Schema expectedSchema = new Schema( - Types.NestedField.optional(1, "id", Types.LongType.get()), - Types.NestedField.optional(2, "data", Types.StringType.get()), - Types.NestedField.optional(3, "part", Types.StringType.get()) - ); + Schema expectedSchema = + new Schema( + Types.NestedField.optional(1, "id", Types.LongType.get()), + Types.NestedField.optional(2, "data", Types.StringType.get()), + Types.NestedField.optional(3, "part", Types.StringType.get())); - PartitionSpec expectedSpec = PartitionSpec.builderFor(expectedSchema) - .alwaysNull("part", "part_1000") - .identity("part") - .identity("id") - .withSpecId(2) // The Spec is new - .build(); + PartitionSpec expectedSpec = + PartitionSpec.builderFor(expectedSchema) + .alwaysNull("part", "part_1000") + .identity("part") + .identity("id") + .withSpecId(2) // The Spec is new + .build(); - Assert.assertEquals("Should be partitioned by part and id", - expectedSpec, rtasTable.spec()); + Assert.assertEquals("Should be partitioned by part and id", expectedSpec, rtasTable.spec()); // the replacement table has a different schema and partition spec than the original - Assert.assertEquals("Should have expected nullable schema", - expectedSchema.asStruct(), rtasTable.schema().asStruct()); - - assertEquals("Should have rows matching the source table", - sql("SELECT 2 * id, data, CASE WHEN ((2 * id) %% 2) = 0 THEN 'even' ELSE 'odd' END AS part " + - "FROM %s ORDER BY id", sourceName), + Assert.assertEquals( + "Should have expected nullable schema", + expectedSchema.asStruct(), + rtasTable.schema().asStruct()); + + assertEquals( + "Should have rows matching the source table", + sql( + "SELECT 2 * id, data, CASE WHEN ((2 * id) %% 2) = 0 THEN 'even' ELSE 'odd' END AS part " + + "FROM %s ORDER BY id", + sourceName), sql("SELECT * FROM %s ORDER BY id", tableName)); - Assert.assertEquals("Table should have expected snapshots", - 2, Iterables.size(rtasTable.snapshots())); + Assert.assertEquals( + "Table should have expected snapshots", 2, 
Iterables.size(rtasTable.snapshots())); } } diff --git a/spark/v3.3/spark/src/test/java/org/apache/iceberg/spark/sql/TestDeleteFrom.java b/spark/v3.3/spark/src/test/java/org/apache/iceberg/spark/sql/TestDeleteFrom.java index 1ba001c9a030..cae1901aa713 100644 --- a/spark/v3.3/spark/src/test/java/org/apache/iceberg/spark/sql/TestDeleteFrom.java +++ b/spark/v3.3/spark/src/test/java/org/apache/iceberg/spark/sql/TestDeleteFrom.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.sql; import java.util.List; @@ -49,27 +48,28 @@ public void removeTables() { public void testDeleteFromUnpartitionedTable() throws NoSuchTableException { sql("CREATE TABLE %s (id bigint, data string) USING iceberg", tableName); - List records = Lists.newArrayList( - new SimpleRecord(1, "a"), - new SimpleRecord(2, "b"), - new SimpleRecord(3, "c") - ); + List records = + Lists.newArrayList( + new SimpleRecord(1, "a"), new SimpleRecord(2, "b"), new SimpleRecord(3, "c")); Dataset df = spark.createDataFrame(records, SimpleRecord.class); df.coalesce(1).writeTo(tableName).append(); - assertEquals("Should have expected rows", + assertEquals( + "Should have expected rows", ImmutableList.of(row(1L, "a"), row(2L, "b"), row(3L, "c")), sql("SELECT * FROM %s ORDER BY id", tableName)); sql("DELETE FROM %s WHERE id < 2", tableName); - assertEquals("Should have no rows after successful delete", + assertEquals( + "Should have no rows after successful delete", ImmutableList.of(row(2L, "b"), row(3L, "c")), sql("SELECT * FROM %s ORDER BY id", tableName)); sql("DELETE FROM %s WHERE id < 4", tableName); - assertEquals("Should have no rows after successful delete", + assertEquals( + "Should have no rows after successful delete", ImmutableList.of(), sql("SELECT * FROM %s ORDER BY id", tableName)); } @@ -78,47 +78,50 @@ public void testDeleteFromUnpartitionedTable() throws NoSuchTableException { public void testDeleteFromTableAtSnapshot() throws NoSuchTableException { sql("CREATE TABLE %s (id bigint, data string) USING iceberg", tableName); - List records = Lists.newArrayList( - new SimpleRecord(1, "a"), - new SimpleRecord(2, "b"), - new SimpleRecord(3, "c") - ); + List records = + Lists.newArrayList( + new SimpleRecord(1, "a"), new SimpleRecord(2, "b"), new SimpleRecord(3, "c")); Dataset df = spark.createDataFrame(records, SimpleRecord.class); df.coalesce(1).writeTo(tableName).append(); long snapshotId = validationCatalog.loadTable(tableIdent).currentSnapshot().snapshotId(); String prefix = "snapshot_id_"; - AssertHelpers.assertThrows("Should not be able to delete from a table at a specific snapshot", - IllegalArgumentException.class, "Cannot delete from table at a specific snapshot", + AssertHelpers.assertThrows( + "Should not be able to delete from a table at a specific snapshot", + IllegalArgumentException.class, + "Cannot delete from table at a specific snapshot", () -> sql("DELETE FROM %s.%s WHERE id < 4", tableName, prefix + snapshotId)); } @Test public void testDeleteFromPartitionedTable() throws NoSuchTableException { - sql("CREATE TABLE %s (id bigint, data string) " + - "USING iceberg " + - "PARTITIONED BY (truncate(id, 2))", tableName); - - List records = Lists.newArrayList( - new SimpleRecord(1, "a"), - new SimpleRecord(2, "b"), - new SimpleRecord(3, "c") - ); + sql( + "CREATE TABLE %s (id bigint, data string) " + + "USING iceberg " + + "PARTITIONED BY (truncate(id, 2))", + tableName); + + List records = + Lists.newArrayList( + new SimpleRecord(1, 
"a"), new SimpleRecord(2, "b"), new SimpleRecord(3, "c")); Dataset df = spark.createDataFrame(records, SimpleRecord.class); df.coalesce(1).writeTo(tableName).append(); - assertEquals("Should have 3 rows in 2 partitions", + assertEquals( + "Should have 3 rows in 2 partitions", ImmutableList.of(row(1L, "a"), row(2L, "b"), row(3L, "c")), sql("SELECT * FROM %s ORDER BY id", tableName)); sql("DELETE FROM %s WHERE id > 2", tableName); - assertEquals("Should have two rows in the second partition", + assertEquals( + "Should have two rows in the second partition", ImmutableList.of(row(1L, "a"), row(2L, "b")), sql("SELECT * FROM %s ORDER BY id", tableName)); sql("DELETE FROM %s WHERE id < 2", tableName); - assertEquals("Should have two rows in the second partition", + assertEquals( + "Should have two rows in the second partition", ImmutableList.of(row(2L, "b")), sql("SELECT * FROM %s ORDER BY id", tableName)); } @@ -128,7 +131,8 @@ public void testDeleteFromWhereFalse() { sql("CREATE TABLE %s (id bigint NOT NULL, data string) USING iceberg", tableName); sql("INSERT INTO TABLE %s VALUES (1, 'a'), (2, 'b'), (3, 'c')", tableName); - assertEquals("Should have expected rows", + assertEquals( + "Should have expected rows", ImmutableList.of(row(1L, "a"), row(2L, "b"), row(3L, "c")), sql("SELECT * FROM %s ORDER BY id", tableName)); @@ -139,7 +143,8 @@ public void testDeleteFromWhereFalse() { table.refresh(); - Assert.assertEquals("Delete should not produce a new snapshot", 1, Iterables.size(table.snapshots())); + Assert.assertEquals( + "Delete should not produce a new snapshot", 1, Iterables.size(table.snapshots())); } @Test @@ -147,7 +152,8 @@ public void testTruncate() { sql("CREATE TABLE %s (id bigint NOT NULL, data string) USING iceberg", tableName); sql("INSERT INTO TABLE %s VALUES (1, 'a'), (2, 'b'), (3, 'c')", tableName); - assertEquals("Should have expected rows", + assertEquals( + "Should have expected rows", ImmutableList.of(row(1L, "a"), row(2L, "b"), row(3L, "c")), sql("SELECT * FROM %s ORDER BY id", tableName)); @@ -156,7 +162,8 @@ public void testTruncate() { sql("TRUNCATE TABLE %s", tableName); - assertEquals("Should have expected rows", + assertEquals( + "Should have expected rows", ImmutableList.of(), sql("SELECT * FROM %s ORDER BY id", tableName)); } diff --git a/spark/v3.3/spark/src/test/java/org/apache/iceberg/spark/sql/TestDropTable.java b/spark/v3.3/spark/src/test/java/org/apache/iceberg/spark/sql/TestDropTable.java index 535cd3926a1a..2189bd0dae75 100644 --- a/spark/v3.3/spark/src/test/java/org/apache/iceberg/spark/sql/TestDropTable.java +++ b/spark/v3.3/spark/src/test/java/org/apache/iceberg/spark/sql/TestDropTable.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.spark.sql; import java.io.IOException; @@ -65,11 +64,14 @@ public void testDropTableGCDisabled() throws IOException { } private void dropTableInternal() throws IOException { - assertEquals("Should have expected rows", - ImmutableList.of(row(1, "test")), sql("SELECT * FROM %s", tableName)); + assertEquals( + "Should have expected rows", + ImmutableList.of(row(1, "test")), + sql("SELECT * FROM %s", tableName)); List manifestAndFiles = manifestsAndFiles(); - Assert.assertEquals("There should be 2 files for manifests and files", 2, manifestAndFiles.size()); + Assert.assertEquals( + "There should be 2 files for manifests and files", 2, manifestAndFiles.size()); Assert.assertTrue("All files should be existed", checkFilesExist(manifestAndFiles, true)); sql("DROP TABLE %s", tableName); @@ -85,11 +87,14 @@ private void dropTableInternal() throws IOException { @Test public void testPurgeTable() throws IOException { - assertEquals("Should have expected rows", - ImmutableList.of(row(1, "test")), sql("SELECT * FROM %s", tableName)); + assertEquals( + "Should have expected rows", + ImmutableList.of(row(1, "test")), + sql("SELECT * FROM %s", tableName)); List manifestAndFiles = manifestsAndFiles(); - Assert.assertEquals("There should be 2 files for manifests and files", 2, manifestAndFiles.size()); + Assert.assertEquals( + "There should be 2 files for manifests and files", 2, manifestAndFiles.size()); Assert.assertTrue("All files should exist", checkFilesExist(manifestAndFiles, true)); sql("DROP TABLE %s PURGE", tableName); @@ -101,14 +106,19 @@ public void testPurgeTable() throws IOException { public void testPurgeTableGCDisabled() throws IOException { sql("ALTER TABLE %s SET TBLPROPERTIES (gc.enabled = false)", tableName); - assertEquals("Should have expected rows", - ImmutableList.of(row(1, "test")), sql("SELECT * FROM %s", tableName)); + assertEquals( + "Should have expected rows", + ImmutableList.of(row(1, "test")), + sql("SELECT * FROM %s", tableName)); List manifestAndFiles = manifestsAndFiles(); - Assert.assertEquals("There totally should have 2 files for manifests and files", 2, manifestAndFiles.size()); + Assert.assertEquals( + "There totally should have 2 files for manifests and files", 2, manifestAndFiles.size()); Assert.assertTrue("All files should be existed", checkFilesExist(manifestAndFiles, true)); - AssertHelpers.assertThrows("Purge table is not allowed when GC is disabled", ValidationException.class, + AssertHelpers.assertThrows( + "Purge table is not allowed when GC is disabled", + ValidationException.class, "Cannot purge table: GC is disabled (deleting files may corrupt other tables", () -> sql("DROP TABLE %s PURGE", tableName)); @@ -118,8 +128,11 @@ public void testPurgeTableGCDisabled() throws IOException { private List manifestsAndFiles() { List files = sql("SELECT file_path FROM %s.%s", tableName, MetadataTableType.FILES); - List manifests = sql("SELECT path FROM %s.%s", tableName, MetadataTableType.MANIFESTS); - return Streams.concat(files.stream(), manifests.stream()).map(row -> (String) row[0]).collect(Collectors.toList()); + List manifests = + sql("SELECT path FROM %s.%s", tableName, MetadataTableType.MANIFESTS); + return Streams.concat(files.stream(), manifests.stream()) + .map(row -> (String) row[0]) + .collect(Collectors.toList()); } private boolean checkFilesExist(List files, boolean shouldExist) throws IOException { @@ -129,12 +142,14 @@ private boolean checkFilesExist(List files, boolean shouldExist) throws } FileSystem fs = new 
Path(files.get(0)).getFileSystem(hiveConf); - return files.stream().allMatch(file -> { - try { - return fs.exists(new Path(file)) ^ mask; - } catch (IOException e) { - throw new RuntimeException(e); - } - }); + return files.stream() + .allMatch( + file -> { + try { + return fs.exists(new Path(file)) ^ mask; + } catch (IOException e) { + throw new RuntimeException(e); + } + }); } } diff --git a/spark/v3.3/spark/src/test/java/org/apache/iceberg/spark/sql/TestNamespaceSQL.java b/spark/v3.3/spark/src/test/java/org/apache/iceberg/spark/sql/TestNamespaceSQL.java index a76e4d624ba1..6c29ea4442ef 100644 --- a/spark/v3.3/spark/src/test/java/org/apache/iceberg/spark/sql/TestNamespaceSQL.java +++ b/spark/v3.3/spark/src/test/java/org/apache/iceberg/spark/sql/TestNamespaceSQL.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.sql; import java.io.File; @@ -56,7 +55,8 @@ public void cleanNamespaces() { @Test public void testCreateNamespace() { - Assert.assertFalse("Namespace should not already exist", validationNamespaceCatalog.namespaceExists(NS)); + Assert.assertFalse( + "Namespace should not already exist", validationNamespaceCatalog.namespaceExists(NS)); sql("CREATE NAMESPACE %s", fullNamespace); @@ -76,7 +76,8 @@ public void testDefaultNamespace() { @Test public void testDropEmptyNamespace() { - Assert.assertFalse("Namespace should not already exist", validationNamespaceCatalog.namespaceExists(NS)); + Assert.assertFalse( + "Namespace should not already exist", validationNamespaceCatalog.namespaceExists(NS)); sql("CREATE NAMESPACE %s", fullNamespace); @@ -84,23 +85,28 @@ public void testDropEmptyNamespace() { sql("DROP NAMESPACE %s", fullNamespace); - Assert.assertFalse("Namespace should have been dropped", validationNamespaceCatalog.namespaceExists(NS)); + Assert.assertFalse( + "Namespace should have been dropped", validationNamespaceCatalog.namespaceExists(NS)); } @Test public void testDropNonEmptyNamespace() { Assume.assumeFalse("Session catalog has flaky behavior", "spark_catalog".equals(catalogName)); - Assert.assertFalse("Namespace should not already exist", validationNamespaceCatalog.namespaceExists(NS)); + Assert.assertFalse( + "Namespace should not already exist", validationNamespaceCatalog.namespaceExists(NS)); sql("CREATE NAMESPACE %s", fullNamespace); sql("CREATE TABLE %s.table (id bigint) USING iceberg", fullNamespace); Assert.assertTrue("Namespace should exist", validationNamespaceCatalog.namespaceExists(NS)); - Assert.assertTrue("Table should exist", validationCatalog.tableExists(TableIdentifier.of(NS, "table"))); + Assert.assertTrue( + "Table should exist", validationCatalog.tableExists(TableIdentifier.of(NS, "table"))); - AssertHelpers.assertThrows("Should fail if trying to delete a non-empty namespace", - NamespaceNotEmptyException.class, "Namespace db is not empty.", + AssertHelpers.assertThrows( + "Should fail if trying to delete a non-empty namespace", + NamespaceNotEmptyException.class, + "Namespace db is not empty.", () -> sql("DROP NAMESPACE %s", fullNamespace)); sql("DROP TABLE %s.table", fullNamespace); @@ -108,7 +114,8 @@ public void testDropNonEmptyNamespace() { @Test public void testListTables() { - Assert.assertFalse("Namespace should not already exist", validationNamespaceCatalog.namespaceExists(NS)); + Assert.assertFalse( + "Namespace should not already exist", validationNamespaceCatalog.namespaceExists(NS)); sql("CREATE NAMESPACE %s", fullNamespace); @@ -126,7 +133,8 @@ public 
void testListTables() { @Test public void testListNamespace() { - Assert.assertFalse("Namespace should not already exist", validationNamespaceCatalog.namespaceExists(NS)); + Assert.assertFalse( + "Namespace should not already exist", validationNamespaceCatalog.namespaceExists(NS)); sql("CREATE NAMESPACE %s", fullNamespace); @@ -136,17 +144,23 @@ public void testListNamespace() { if (isHadoopCatalog) { Assert.assertEquals("Should have 1 namespace", 1, namespaces.size()); - Set namespaceNames = namespaces.stream().map(arr -> arr[0].toString()).collect(Collectors.toSet()); + Set namespaceNames = + namespaces.stream().map(arr -> arr[0].toString()).collect(Collectors.toSet()); Assert.assertEquals("Should have only db namespace", ImmutableSet.of("db"), namespaceNames); } else { Assert.assertEquals("Should have 2 namespaces", 2, namespaces.size()); - Set namespaceNames = namespaces.stream().map(arr -> arr[0].toString()).collect(Collectors.toSet()); - Assert.assertEquals("Should have default and db namespaces", ImmutableSet.of("default", "db"), namespaceNames); + Set namespaceNames = + namespaces.stream().map(arr -> arr[0].toString()).collect(Collectors.toSet()); + Assert.assertEquals( + "Should have default and db namespaces", + ImmutableSet.of("default", "db"), + namespaceNames); } List nestedNamespaces = sql("SHOW NAMESPACES IN %s", fullNamespace); - Set nestedNames = nestedNamespaces.stream().map(arr -> arr[0].toString()).collect(Collectors.toSet()); + Set nestedNames = + nestedNamespaces.stream().map(arr -> arr[0].toString()).collect(Collectors.toSet()); Assert.assertEquals("Should not have nested namespaces", ImmutableSet.of(), nestedNames); } @@ -154,7 +168,8 @@ public void testListNamespace() { public void testCreateNamespaceWithMetadata() { Assume.assumeFalse("HadoopCatalog does not support namespace metadata", isHadoopCatalog); - Assert.assertFalse("Namespace should not already exist", validationNamespaceCatalog.namespaceExists(NS)); + Assert.assertFalse( + "Namespace should not already exist", validationNamespaceCatalog.namespaceExists(NS)); sql("CREATE NAMESPACE %s WITH PROPERTIES ('prop'='value')", fullNamespace); @@ -162,14 +177,16 @@ public void testCreateNamespaceWithMetadata() { Map nsMetadata = validationNamespaceCatalog.loadNamespaceMetadata(NS); - Assert.assertEquals("Namespace should have expected prop value", "value", nsMetadata.get("prop")); + Assert.assertEquals( + "Namespace should have expected prop value", "value", nsMetadata.get("prop")); } @Test public void testCreateNamespaceWithComment() { Assume.assumeFalse("HadoopCatalog does not support namespace metadata", isHadoopCatalog); - Assert.assertFalse("Namespace should not already exist", validationNamespaceCatalog.namespaceExists(NS)); + Assert.assertFalse( + "Namespace should not already exist", validationNamespaceCatalog.namespaceExists(NS)); sql("CREATE NAMESPACE %s COMMENT 'namespace doc'", fullNamespace); @@ -177,14 +194,16 @@ public void testCreateNamespaceWithComment() { Map nsMetadata = validationNamespaceCatalog.loadNamespaceMetadata(NS); - Assert.assertEquals("Namespace should have expected comment", "namespace doc", nsMetadata.get("comment")); + Assert.assertEquals( + "Namespace should have expected comment", "namespace doc", nsMetadata.get("comment")); } @Test public void testCreateNamespaceWithLocation() throws Exception { Assume.assumeFalse("HadoopCatalog does not support namespace locations", isHadoopCatalog); - Assert.assertFalse("Namespace should not already exist", 
validationNamespaceCatalog.namespaceExists(NS)); + Assert.assertFalse( + "Namespace should not already exist", validationNamespaceCatalog.namespaceExists(NS)); File location = temp.newFile(); Assert.assertTrue(location.delete()); @@ -195,27 +214,32 @@ public void testCreateNamespaceWithLocation() throws Exception { Map nsMetadata = validationNamespaceCatalog.loadNamespaceMetadata(NS); - Assert.assertEquals("Namespace should have expected location", - "file:" + location.getPath(), nsMetadata.get("location")); + Assert.assertEquals( + "Namespace should have expected location", + "file:" + location.getPath(), + nsMetadata.get("location")); } @Test public void testSetProperties() { Assume.assumeFalse("HadoopCatalog does not support namespace metadata", isHadoopCatalog); - Assert.assertFalse("Namespace should not already exist", validationNamespaceCatalog.namespaceExists(NS)); + Assert.assertFalse( + "Namespace should not already exist", validationNamespaceCatalog.namespaceExists(NS)); sql("CREATE NAMESPACE %s", fullNamespace); Assert.assertTrue("Namespace should exist", validationNamespaceCatalog.namespaceExists(NS)); Map defaultMetadata = validationNamespaceCatalog.loadNamespaceMetadata(NS); - Assert.assertFalse("Default metadata should not have custom property", defaultMetadata.containsKey("prop")); + Assert.assertFalse( + "Default metadata should not have custom property", defaultMetadata.containsKey("prop")); sql("ALTER NAMESPACE %s SET PROPERTIES ('prop'='value')", fullNamespace); Map nsMetadata = validationNamespaceCatalog.loadNamespaceMetadata(NS); - Assert.assertEquals("Namespace should have expected prop value", "value", nsMetadata.get("prop")); + Assert.assertEquals( + "Namespace should have expected prop value", "value", nsMetadata.get("prop")); } } diff --git a/spark/v3.3/spark/src/test/java/org/apache/iceberg/spark/sql/TestPartitionedWrites.java b/spark/v3.3/spark/src/test/java/org/apache/iceberg/spark/sql/TestPartitionedWrites.java index 9223797ada32..51c56ac79d4d 100644 --- a/spark/v3.3/spark/src/test/java/org/apache/iceberg/spark/sql/TestPartitionedWrites.java +++ b/spark/v3.3/spark/src/test/java/org/apache/iceberg/spark/sql/TestPartitionedWrites.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.spark.sql; import java.util.List; @@ -34,13 +33,16 @@ import org.junit.Test; public class TestPartitionedWrites extends SparkCatalogTestBase { - public TestPartitionedWrites(String catalogName, String implementation, Map config) { + public TestPartitionedWrites( + String catalogName, String implementation, Map config) { super(catalogName, implementation, config); } @Before public void createTables() { - sql("CREATE TABLE %s (id bigint, data string) USING iceberg PARTITIONED BY (truncate(id, 3))", tableName); + sql( + "CREATE TABLE %s (id bigint, data string) USING iceberg PARTITIONED BY (truncate(id, 3))", + tableName); sql("INSERT INTO %s VALUES (1, 'a'), (2, 'b'), (3, 'c')", tableName); } @@ -55,17 +57,14 @@ public void testInsertAppend() { sql("INSERT INTO %s VALUES (4, 'd'), (5, 'e')", tableName); - Assert.assertEquals("Should have 5 rows after insert", 5L, scalarSql("SELECT count(*) FROM %s", tableName)); + Assert.assertEquals( + "Should have 5 rows after insert", 5L, scalarSql("SELECT count(*) FROM %s", tableName)); - List expected = ImmutableList.of( - row(1L, "a"), - row(2L, "b"), - row(3L, "c"), - row(4L, "d"), - row(5L, "e") - ); + List expected = + ImmutableList.of(row(1L, "a"), row(2L, "b"), row(3L, "c"), row(4L, "d"), row(5L, "e")); - assertEquals("Row data should match expected", expected, sql("SELECT * FROM %s ORDER BY id", tableName)); + assertEquals( + "Row data should match expected", expected, sql("SELECT * FROM %s ORDER BY id", tableName)); } @Test @@ -75,88 +74,70 @@ public void testInsertOverwrite() { // 4 and 5 replace 3 in the partition (id - (id % 3)) = 3 sql("INSERT OVERWRITE %s VALUES (4, 'd'), (5, 'e')", tableName); - Assert.assertEquals("Should have 4 rows after overwrite", 4L, scalarSql("SELECT count(*) FROM %s", tableName)); + Assert.assertEquals( + "Should have 4 rows after overwrite", 4L, scalarSql("SELECT count(*) FROM %s", tableName)); - List expected = ImmutableList.of( - row(1L, "a"), - row(2L, "b"), - row(4L, "d"), - row(5L, "e") - ); + List expected = + ImmutableList.of(row(1L, "a"), row(2L, "b"), row(4L, "d"), row(5L, "e")); - assertEquals("Row data should match expected", expected, sql("SELECT * FROM %s ORDER BY id", tableName)); + assertEquals( + "Row data should match expected", expected, sql("SELECT * FROM %s ORDER BY id", tableName)); } @Test public void testDataFrameV2Append() throws NoSuchTableException { Assert.assertEquals("Should have 3 rows", 3L, scalarSql("SELECT count(*) FROM %s", tableName)); - List data = ImmutableList.of( - new SimpleRecord(4, "d"), - new SimpleRecord(5, "e") - ); + List data = ImmutableList.of(new SimpleRecord(4, "d"), new SimpleRecord(5, "e")); Dataset ds = spark.createDataFrame(data, SimpleRecord.class); ds.writeTo(tableName).append(); - Assert.assertEquals("Should have 5 rows after insert", 5L, scalarSql("SELECT count(*) FROM %s", tableName)); + Assert.assertEquals( + "Should have 5 rows after insert", 5L, scalarSql("SELECT count(*) FROM %s", tableName)); - List expected = ImmutableList.of( - row(1L, "a"), - row(2L, "b"), - row(3L, "c"), - row(4L, "d"), - row(5L, "e") - ); + List expected = + ImmutableList.of(row(1L, "a"), row(2L, "b"), row(3L, "c"), row(4L, "d"), row(5L, "e")); - assertEquals("Row data should match expected", expected, sql("SELECT * FROM %s ORDER BY id", tableName)); + assertEquals( + "Row data should match expected", expected, sql("SELECT * FROM %s ORDER BY id", tableName)); } @Test public void testDataFrameV2DynamicOverwrite() throws NoSuchTableException { 
Assert.assertEquals("Should have 3 rows", 3L, scalarSql("SELECT count(*) FROM %s", tableName)); - List data = ImmutableList.of( - new SimpleRecord(4, "d"), - new SimpleRecord(5, "e") - ); + List data = ImmutableList.of(new SimpleRecord(4, "d"), new SimpleRecord(5, "e")); Dataset ds = spark.createDataFrame(data, SimpleRecord.class); ds.writeTo(tableName).overwritePartitions(); - Assert.assertEquals("Should have 4 rows after overwrite", 4L, scalarSql("SELECT count(*) FROM %s", tableName)); + Assert.assertEquals( + "Should have 4 rows after overwrite", 4L, scalarSql("SELECT count(*) FROM %s", tableName)); - List expected = ImmutableList.of( - row(1L, "a"), - row(2L, "b"), - row(4L, "d"), - row(5L, "e") - ); + List expected = + ImmutableList.of(row(1L, "a"), row(2L, "b"), row(4L, "d"), row(5L, "e")); - assertEquals("Row data should match expected", expected, sql("SELECT * FROM %s ORDER BY id", tableName)); + assertEquals( + "Row data should match expected", expected, sql("SELECT * FROM %s ORDER BY id", tableName)); } @Test public void testDataFrameV2Overwrite() throws NoSuchTableException { Assert.assertEquals("Should have 3 rows", 3L, scalarSql("SELECT count(*) FROM %s", tableName)); - List data = ImmutableList.of( - new SimpleRecord(4, "d"), - new SimpleRecord(5, "e") - ); + List data = ImmutableList.of(new SimpleRecord(4, "d"), new SimpleRecord(5, "e")); Dataset ds = spark.createDataFrame(data, SimpleRecord.class); ds.writeTo(tableName).overwrite(functions.col("id").$less(3)); - Assert.assertEquals("Should have 3 rows after overwrite", 3L, scalarSql("SELECT count(*) FROM %s", tableName)); + Assert.assertEquals( + "Should have 3 rows after overwrite", 3L, scalarSql("SELECT count(*) FROM %s", tableName)); - List expected = ImmutableList.of( - row(3L, "c"), - row(4L, "d"), - row(5L, "e") - ); + List expected = ImmutableList.of(row(3L, "c"), row(4L, "d"), row(5L, "e")); - assertEquals("Row data should match expected", expected, sql("SELECT * FROM %s ORDER BY id", tableName)); + assertEquals( + "Row data should match expected", expected, sql("SELECT * FROM %s ORDER BY id", tableName)); } @Test @@ -166,13 +147,13 @@ public void testViewsReturnRecentResults() { Dataset query = spark.sql("SELECT * FROM " + tableName + " WHERE id = 1"); query.createOrReplaceTempView("tmp"); - assertEquals("View should have expected rows", - ImmutableList.of(row(1L, "a")), - sql("SELECT * FROM tmp")); + assertEquals( + "View should have expected rows", ImmutableList.of(row(1L, "a")), sql("SELECT * FROM tmp")); sql("INSERT INTO TABLE %s VALUES (1, 'a')", tableName); - assertEquals("View should have expected rows", + assertEquals( + "View should have expected rows", ImmutableList.of(row(1L, "a"), row(1L, "a")), sql("SELECT * FROM tmp")); } diff --git a/spark/v3.3/spark/src/test/java/org/apache/iceberg/spark/sql/TestPartitionedWritesAsSelect.java b/spark/v3.3/spark/src/test/java/org/apache/iceberg/spark/sql/TestPartitionedWritesAsSelect.java index e38d545d8df9..3ffd38b83c3b 100644 --- a/spark/v3.3/spark/src/test/java/org/apache/iceberg/spark/sql/TestPartitionedWritesAsSelect.java +++ b/spark/v3.3/spark/src/test/java/org/apache/iceberg/spark/sql/TestPartitionedWritesAsSelect.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.spark.sql; import java.util.List; @@ -35,7 +34,9 @@ public class TestPartitionedWritesAsSelect extends SparkTestBaseWithCatalog { @Before public void createTables() { - sql("CREATE TABLE %s (id bigint, data string, category string, ts timestamp) USING iceberg", tableName); + sql( + "CREATE TABLE %s (id bigint, data string, category string, ts timestamp) USING iceberg", + tableName); } @After @@ -49,15 +50,23 @@ public void testInsertAsSelectAppend() { insertData(3); List expected = currentData(); - sql("CREATE TABLE %s (id bigint, data string, category string, ts timestamp)" + - "USING iceberg PARTITIONED BY (days(ts), category)", targetTable); - - sql("INSERT INTO %s SELECT id, data, category, ts FROM %s ORDER BY ts,category", targetTable, tableName); - Assert.assertEquals("Should have 15 rows after insert", - 3 * 5L, scalarSql("SELECT count(*) FROM %s", targetTable)); - - assertEquals("Row data should match expected", - expected, sql("SELECT * FROM %s ORDER BY id", targetTable)); + sql( + "CREATE TABLE %s (id bigint, data string, category string, ts timestamp)" + + "USING iceberg PARTITIONED BY (days(ts), category)", + targetTable); + + sql( + "INSERT INTO %s SELECT id, data, category, ts FROM %s ORDER BY ts,category", + targetTable, tableName); + Assert.assertEquals( + "Should have 15 rows after insert", + 3 * 5L, + scalarSql("SELECT count(*) FROM %s", targetTable)); + + assertEquals( + "Row data should match expected", + expected, + sql("SELECT * FROM %s ORDER BY id", targetTable)); } @Test @@ -65,16 +74,24 @@ public void testInsertAsSelectWithBucket() { insertData(3); List expected = currentData(); - sql("CREATE TABLE %s (id bigint, data string, category string, ts timestamp)" + - "USING iceberg PARTITIONED BY (bucket(8, data))", targetTable); + sql( + "CREATE TABLE %s (id bigint, data string, category string, ts timestamp)" + + "USING iceberg PARTITIONED BY (bucket(8, data))", + targetTable); IcebergSpark.registerBucketUDF(spark, "iceberg_bucket8", DataTypes.StringType, 8); - sql("INSERT INTO %s SELECT id, data, category, ts FROM %s ORDER BY iceberg_bucket8(data)", targetTable, tableName); - Assert.assertEquals("Should have 15 rows after insert", - 3 * 5L, scalarSql("SELECT count(*) FROM %s", targetTable)); - - assertEquals("Row data should match expected", - expected, sql("SELECT * FROM %s ORDER BY id", targetTable)); + sql( + "INSERT INTO %s SELECT id, data, category, ts FROM %s ORDER BY iceberg_bucket8(data)", + targetTable, tableName); + Assert.assertEquals( + "Should have 15 rows after insert", + 3 * 5L, + scalarSql("SELECT count(*) FROM %s", targetTable)); + + assertEquals( + "Row data should match expected", + expected, + sql("SELECT * FROM %s ORDER BY id", targetTable)); } @Test @@ -82,28 +99,40 @@ public void testInsertAsSelectWithTruncate() { insertData(3); List expected = currentData(); - sql("CREATE TABLE %s (id bigint, data string, category string, ts timestamp)" + - "USING iceberg PARTITIONED BY (truncate(data, 4), truncate(id, 4))", targetTable); + sql( + "CREATE TABLE %s (id bigint, data string, category string, ts timestamp)" + + "USING iceberg PARTITIONED BY (truncate(data, 4), truncate(id, 4))", + targetTable); IcebergSpark.registerTruncateUDF(spark, "iceberg_truncate_string4", DataTypes.StringType, 4); IcebergSpark.registerTruncateUDF(spark, "iceberg_truncate_long4", DataTypes.LongType, 4); - sql("INSERT INTO %s SELECT id, data, category, ts FROM %s " + - "ORDER BY iceberg_truncate_string4(data),iceberg_truncate_long4(id)", 
targetTable, tableName); - Assert.assertEquals("Should have 15 rows after insert", - 3 * 5L, scalarSql("SELECT count(*) FROM %s", targetTable)); - - assertEquals("Row data should match expected", - expected, sql("SELECT * FROM %s ORDER BY id", targetTable)); + sql( + "INSERT INTO %s SELECT id, data, category, ts FROM %s " + + "ORDER BY iceberg_truncate_string4(data),iceberg_truncate_long4(id)", + targetTable, tableName); + Assert.assertEquals( + "Should have 15 rows after insert", + 3 * 5L, + scalarSql("SELECT count(*) FROM %s", targetTable)); + + assertEquals( + "Row data should match expected", + expected, + sql("SELECT * FROM %s ORDER BY id", targetTable)); } private void insertData(int repeatCounter) { - IntStream.range(0, repeatCounter).forEach(i -> { - sql("INSERT INTO %s VALUES (13, '1', 'bgd16', timestamp('2021-11-10 11:20:10'))," + - "(21, '2', 'bgd13', timestamp('2021-11-10 11:20:10')), " + - "(12, '3', 'bgd14', timestamp('2021-11-10 11:20:10'))," + - "(222, '3', 'bgd15', timestamp('2021-11-10 11:20:10'))," + - "(45, '4', 'bgd16', timestamp('2021-11-10 11:20:10'))", tableName); - }); + IntStream.range(0, repeatCounter) + .forEach( + i -> { + sql( + "INSERT INTO %s VALUES (13, '1', 'bgd16', timestamp('2021-11-10 11:20:10'))," + + "(21, '2', 'bgd13', timestamp('2021-11-10 11:20:10')), " + + "(12, '3', 'bgd14', timestamp('2021-11-10 11:20:10'))," + + "(222, '3', 'bgd15', timestamp('2021-11-10 11:20:10'))," + + "(45, '4', 'bgd16', timestamp('2021-11-10 11:20:10'))", + tableName); + }); } private List currentData() { diff --git a/spark/v3.3/spark/src/test/java/org/apache/iceberg/spark/sql/TestRefreshTable.java b/spark/v3.3/spark/src/test/java/org/apache/iceberg/spark/sql/TestRefreshTable.java index a8bdea77e237..3eaca6329477 100644 --- a/spark/v3.3/spark/src/test/java/org/apache/iceberg/spark/sql/TestRefreshTable.java +++ b/spark/v3.3/spark/src/test/java/org/apache/iceberg/spark/sql/TestRefreshTable.java @@ -16,8 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - - package org.apache.iceberg.spark.sql; import java.util.List; @@ -49,7 +47,8 @@ public void removeTables() { @Test public void testRefreshCommand() { - // We are not allowed to change the session catalog after it has been initialized, so build a new one + // We are not allowed to change the session catalog after it has been initialized, so build a + // new one if (catalogName.equals("spark_catalog")) { spark.conf().set("spark.sql.catalog." 
+ catalogName + ".cache-enabled", true); spark = spark.cloneSession(); @@ -59,7 +58,8 @@ public void testRefreshCommand() { List originalActual = sql("SELECT * FROM %s", tableName); assertEquals("Table should start as expected", originalExpected, originalActual); - // Modify table outside of spark, it should be cached so Spark should see the same value after mutation + // Modify table outside of spark, it should be cached so Spark should see the same value after + // mutation Table table = validationCatalog.loadTable(tableIdent); DataFile file = table.currentSnapshot().addedDataFiles(table.io()).iterator().next(); table.newDelete().deleteFile(file).commit(); diff --git a/spark/v3.3/spark/src/test/java/org/apache/iceberg/spark/sql/TestSelect.java b/spark/v3.3/spark/src/test/java/org/apache/iceberg/spark/sql/TestSelect.java index df503ae31d58..360deecbfd9b 100644 --- a/spark/v3.3/spark/src/test/java/org/apache/iceberg/spark/sql/TestSelect.java +++ b/spark/v3.3/spark/src/test/java/org/apache/iceberg/spark/sql/TestSelect.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.sql; import java.text.SimpleDateFormat; @@ -49,10 +48,12 @@ public TestSelect(String catalogName, String implementation, Map super(catalogName, implementation, config); // register a scan event listener to validate pushdown - Listeners.register(event -> { - scanEventCount += 1; - lastScanEvent = event; - }, ScanEvent.class); + Listeners.register( + event -> { + scanEventCount += 1; + lastScanEvent = event; + }, + ScanEvent.class); } @Before @@ -72,8 +73,8 @@ public void removeTables() { @Test public void testSelect() { - List expected = ImmutableList.of( - row(1L, "a", 1.0F), row(2L, "b", 2.0F), row(3L, "c", Float.NaN)); + List expected = + ImmutableList.of(row(1L, "a", 1.0F), row(2L, "b", 2.0F), row(3L, "c", Float.NaN)); assertEquals("Should return all expected rows", expected, sql("SELECT * FROM %s", tableName)); } @@ -82,11 +83,14 @@ public void testSelect() { public void testSelectRewrite() { List expected = ImmutableList.of(row(3L, "c", Float.NaN)); - assertEquals("Should return all expected rows", expected, + assertEquals( + "Should return all expected rows", + expected, sql("SELECT * FROM %s where float = float('NaN')", tableName)); Assert.assertEquals("Should create only one scan", 1, scanEventCount); - Assert.assertEquals("Should push down expected filter", + Assert.assertEquals( + "Should push down expected filter", "(float IS NOT NULL AND is_nan(float))", Spark3Util.describe(lastScanEvent.filter())); } @@ -98,8 +102,10 @@ public void testProjection() { assertEquals("Should return all expected rows", expected, sql("SELECT id FROM %s", tableName)); Assert.assertEquals("Should create only one scan", 1, scanEventCount); - Assert.assertEquals("Should not push down a filter", Expressions.alwaysTrue(), lastScanEvent.filter()); - Assert.assertEquals("Should project only the id column", + Assert.assertEquals( + "Should not push down a filter", Expressions.alwaysTrue(), lastScanEvent.filter()); + Assert.assertEquals( + "Should project only the id column", validationCatalog.loadTable(tableIdent).schema().select("id").asStruct(), lastScanEvent.projection().asStruct()); } @@ -108,13 +114,18 @@ public void testProjection() { public void testExpressionPushdown() { List expected = ImmutableList.of(row("b")); - assertEquals("Should return all expected rows", expected, sql("SELECT data FROM %s WHERE id = 2", tableName)); + assertEquals( + "Should 
return all expected rows", + expected, + sql("SELECT data FROM %s WHERE id = 2", tableName)); Assert.assertEquals("Should create only one scan", 1, scanEventCount); - Assert.assertEquals("Should push down expected filter", + Assert.assertEquals( + "Should push down expected filter", "(id IS NOT NULL AND id = 2)", Spark3Util.describe(lastScanEvent.filter())); - Assert.assertEquals("Should project only id and data columns", + Assert.assertEquals( + "Should project only id and data columns", validationCatalog.loadTable(tableIdent).schema().select("id", "data").asStruct(), lastScanEvent.projection().asStruct()); } @@ -125,7 +136,8 @@ public void testMetadataTables() { "Spark session catalog does not support metadata tables", "spark_catalog".equals(catalogName)); - assertEquals("Snapshot metadata table", + assertEquals( + "Snapshot metadata table", ImmutableList.of(row(ANY, ANY, null, "append", ANY, ANY)), sql("SELECT * FROM %s.snapshots", tableName)); } @@ -149,10 +161,12 @@ public void testSnapshotInTableName() { assertEquals("Snapshot at specific ID, prefix " + prefix, expected, actual); // read the table using DataFrameReader option - Dataset df = spark.read() - .format("iceberg") - .option(SparkReadOptions.SNAPSHOT_ID, snapshotId) - .load(tableName); + Dataset df = + spark + .read() + .format("iceberg") + .option(SparkReadOptions.SNAPSHOT_ID, snapshotId) + .load(tableName); List fromDF = rowsToJava(df.collectAsList()); assertEquals("Snapshot at specific ID " + snapshotId, expected, fromDF); } @@ -177,10 +191,12 @@ public void testTimestampInTableName() { assertEquals("Snapshot at timestamp, prefix " + prefix, expected, actual); // read the table using DataFrameReader option - Dataset df = spark.read() - .format("iceberg") - .option(SparkReadOptions.AS_OF_TIMESTAMP, timestamp) - .load(tableName); + Dataset df = + spark + .read() + .format("iceberg") + .option(SparkReadOptions.AS_OF_TIMESTAMP, timestamp) + .load(tableName); List fromDF = rowsToJava(df.collectAsList()); assertEquals("Snapshot at timestamp " + timestamp, expected, fromDF); } @@ -200,14 +216,17 @@ public void testVersionAsOf() { // read the table at the snapshot // HIVE time travel syntax - List actual2 = sql("SELECT * FROM %s FOR SYSTEM_VERSION AS OF %s", tableName, snapshotId); + List actual2 = + sql("SELECT * FROM %s FOR SYSTEM_VERSION AS OF %s", tableName, snapshotId); assertEquals("Snapshot at specific ID", expected, actual2); // read the table using DataFrameReader option: versionAsOf - Dataset df = spark.read() - .format("iceberg") - .option(SparkReadOptions.VERSION_AS_OF, snapshotId) - .load(tableName); + Dataset df = + spark + .read() + .format("iceberg") + .option(SparkReadOptions.VERSION_AS_OF, snapshotId) + .load(tableName); List fromDF = rowsToJava(df.collectAsList()); assertEquals("Snapshot at specific ID " + snapshotId, expected, fromDF); } @@ -228,29 +247,33 @@ public void testTimestampAsOf() { sql("INSERT INTO %s VALUES (4, 'd', 4.0), (5, 'e', 5.0)", tableName); // read the table at the timestamp in long format i.e 1656507980463. 
- List actualWithLongFormat = sql("SELECT * FROM %s TIMESTAMP AS OF %s", tableName, timestampInSeconds); + List actualWithLongFormat = + sql("SELECT * FROM %s TIMESTAMP AS OF %s", tableName, timestampInSeconds); assertEquals("Snapshot at timestamp", expected, actualWithLongFormat); // read the table at the timestamp in date format i.e 2022-06-29 18:40:37 - List actualWithDateFormat = sql("SELECT * FROM %s TIMESTAMP AS OF '%s'", tableName, formattedDate); + List actualWithDateFormat = + sql("SELECT * FROM %s TIMESTAMP AS OF '%s'", tableName, formattedDate); assertEquals("Snapshot at timestamp", expected, actualWithDateFormat); // HIVE time travel syntax // read the table at the timestamp in long format i.e 1656507980463. - List actualWithLongFormatInHiveSyntax = sql("SELECT * FROM %s FOR SYSTEM_TIME AS OF %s", tableName, - timestampInSeconds); + List actualWithLongFormatInHiveSyntax = + sql("SELECT * FROM %s FOR SYSTEM_TIME AS OF %s", tableName, timestampInSeconds); assertEquals("Snapshot at specific ID", expected, actualWithLongFormatInHiveSyntax); // read the table at the timestamp in date format i.e 2022-06-29 18:40:37 - List actualWithDateFormatInHiveSyntax = sql("SELECT * FROM %s FOR SYSTEM_TIME AS OF '%s'", tableName, - formattedDate); + List actualWithDateFormatInHiveSyntax = + sql("SELECT * FROM %s FOR SYSTEM_TIME AS OF '%s'", tableName, formattedDate); assertEquals("Snapshot at specific ID", expected, actualWithDateFormatInHiveSyntax); // read the table using DataFrameReader option - Dataset df = spark.read() - .format("iceberg") - .option(SparkReadOptions.TIMESTAMP_AS_OF, formattedDate) - .load(tableName); + Dataset df = + spark + .read() + .format("iceberg") + .option(SparkReadOptions.TIMESTAMP_AS_OF, formattedDate) + .load(tableName); List fromDF = rowsToJava(df.collectAsList()); assertEquals("Snapshot at timestamp " + timestamp, expected, fromDF); } @@ -260,7 +283,8 @@ public void testInvalidTimeTravelBasedOnBothAsOfAndTableIdentifier() { // get the snapshot ID of the last write long snapshotId = validationCatalog.loadTable(tableIdent).currentSnapshot().snapshotId(); // get a timestamp just after the last write - long timestamp = validationCatalog.loadTable(tableIdent).currentSnapshot().timestampMillis() + 2; + long timestamp = + validationCatalog.loadTable(tableIdent).currentSnapshot().timestampMillis() + 2; String timestampPrefix = "at_timestamp_"; String snapshotPrefix = "snapshot_id_"; @@ -269,35 +293,47 @@ public void testInvalidTimeTravelBasedOnBothAsOfAndTableIdentifier() { sql("INSERT INTO %s VALUES (4, 'd', 4.0), (5, 'e', 5.0)", tableName); // using snapshot in table identifier and VERSION AS OF - AssertHelpers.assertThrows("Cannot do time-travel based on both table identifier and AS OF", + AssertHelpers.assertThrows( + "Cannot do time-travel based on both table identifier and AS OF", IllegalArgumentException.class, "Cannot do time-travel based on both table identifier and AS OF", () -> { - sql("SELECT * FROM %s.%s VERSION AS OF %s", tableName, snapshotPrefix + snapshotId, snapshotId); + sql( + "SELECT * FROM %s.%s VERSION AS OF %s", + tableName, snapshotPrefix + snapshotId, snapshotId); }); // using snapshot in table identifier and TIMESTAMP AS OF - AssertHelpers.assertThrows("Cannot do time-travel based on both table identifier and AS OF", + AssertHelpers.assertThrows( + "Cannot do time-travel based on both table identifier and AS OF", IllegalArgumentException.class, "Cannot do time-travel based on both table identifier and AS OF", () -> { - sql("SELECT * FROM %s.%s 
VERSION AS OF %s", tableName, timestampPrefix + timestamp, snapshotId); + sql( + "SELECT * FROM %s.%s VERSION AS OF %s", + tableName, timestampPrefix + timestamp, snapshotId); }); // using timestamp in table identifier and VERSION AS OF - AssertHelpers.assertThrows("Cannot do time-travel based on both table identifier and AS OF", + AssertHelpers.assertThrows( + "Cannot do time-travel based on both table identifier and AS OF", IllegalArgumentException.class, "Cannot do time-travel based on both table identifier and AS OF", () -> { - sql("SELECT * FROM %s.%s TIMESTAMP AS OF %s", tableName, snapshotPrefix + snapshotId, timestamp); + sql( + "SELECT * FROM %s.%s TIMESTAMP AS OF %s", + tableName, snapshotPrefix + snapshotId, timestamp); }); // using timestamp in table identifier and TIMESTAMP AS OF - AssertHelpers.assertThrows("Cannot do time-travel based on both table identifier and AS OF", + AssertHelpers.assertThrows( + "Cannot do time-travel based on both table identifier and AS OF", IllegalArgumentException.class, "Cannot do time-travel based on both table identifier and AS OF", () -> { - sql("SELECT * FROM %s.%s TIMESTAMP AS OF %s", tableName, timestampPrefix + timestamp, timestamp); + sql( + "SELECT * FROM %s.%s TIMESTAMP AS OF %s", + tableName, timestampPrefix + timestamp, timestamp); }); } @@ -306,22 +342,25 @@ public void testSpecifySnapshotAndTimestamp() { // get the snapshot ID of the last write long snapshotId = validationCatalog.loadTable(tableIdent).currentSnapshot().snapshotId(); // get a timestamp just after the last write - long timestamp = validationCatalog.loadTable(tableIdent).currentSnapshot().timestampMillis() + 2; + long timestamp = + validationCatalog.loadTable(tableIdent).currentSnapshot().timestampMillis() + 2; // create a second snapshot sql("INSERT INTO %s VALUES (4, 'd', 4.0), (5, 'e', 5.0)", tableName); - AssertHelpers.assertThrows("Should not be able to specify both snapshot id and timestamp", + AssertHelpers.assertThrows( + "Should not be able to specify both snapshot id and timestamp", IllegalArgumentException.class, - String.format("Cannot specify both snapshot-id (%s) and as-of-timestamp (%s)", - snapshotId, timestamp), + String.format( + "Cannot specify both snapshot-id (%s) and as-of-timestamp (%s)", snapshotId, timestamp), () -> { - spark.read() - .format("iceberg") - .option(SparkReadOptions.SNAPSHOT_ID, snapshotId) - .option(SparkReadOptions.AS_OF_TIMESTAMP, timestamp) - .load(tableName) - .collectAsList(); + spark + .read() + .format("iceberg") + .option(SparkReadOptions.SNAPSHOT_ID, snapshotId) + .option(SparkReadOptions.AS_OF_TIMESTAMP, timestamp) + .load(tableName) + .collectAsList(); }); } @@ -329,21 +368,31 @@ public void testSpecifySnapshotAndTimestamp() { public void testBinaryInFilter() { sql("CREATE TABLE %s (id bigint, binary binary) USING iceberg", binaryTableName); sql("INSERT INTO %s VALUES (1, X''), (2, X'1111'), (3, X'11')", binaryTableName); - List expected = ImmutableList.of(row(2L, new byte[]{0x11, 0x11})); + List expected = ImmutableList.of(row(2L, new byte[] {0x11, 0x11})); - assertEquals("Should return all expected rows", expected, + assertEquals( + "Should return all expected rows", + expected, sql("SELECT id, binary FROM %s where binary > X'11'", binaryTableName)); } @Test public void testComplexTypeFilter() { String complexTypeTableName = tableName("complex_table"); - sql("CREATE TABLE %s (id INT, complex STRUCT) USING iceberg", complexTypeTableName); - sql("INSERT INTO TABLE %s VALUES (1, named_struct(\"c1\", 3, \"c2\", 
\"v1\"))", complexTypeTableName); - sql("INSERT INTO TABLE %s VALUES (2, named_struct(\"c1\", 2, \"c2\", \"v2\"))", complexTypeTableName); - - List result = sql("SELECT id FROM %s WHERE complex = named_struct(\"c1\", 3, \"c2\", \"v1\")", + sql( + "CREATE TABLE %s (id INT, complex STRUCT) USING iceberg", + complexTypeTableName); + sql( + "INSERT INTO TABLE %s VALUES (1, named_struct(\"c1\", 3, \"c2\", \"v1\"))", complexTypeTableName); + sql( + "INSERT INTO TABLE %s VALUES (2, named_struct(\"c1\", 2, \"c2\", \"v2\"))", + complexTypeTableName); + + List result = + sql( + "SELECT id FROM %s WHERE complex = named_struct(\"c1\", 3, \"c2\", \"v1\")", + complexTypeTableName); assertEquals("Should return all expected rows", ImmutableList.of(row(1)), result); sql("DROP TABLE IF EXISTS %s", complexTypeTableName); diff --git a/spark/v3.3/spark/src/test/java/org/apache/iceberg/spark/sql/TestTimestampWithoutZone.java b/spark/v3.3/spark/src/test/java/org/apache/iceberg/spark/sql/TestTimestampWithoutZone.java index ddaac5256e10..51b8d255a99b 100644 --- a/spark/v3.3/spark/src/test/java/org/apache/iceberg/spark/sql/TestTimestampWithoutZone.java +++ b/spark/v3.3/spark/src/test/java/org/apache/iceberg/spark/sql/TestTimestampWithoutZone.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.sql; import java.sql.Timestamp; @@ -50,32 +49,35 @@ public class TestTimestampWithoutZone extends SparkCatalogTestBase { private static final String newTableName = "created_table"; private final Map config; - private static final Schema schema = new Schema( + private static final Schema schema = + new Schema( Types.NestedField.required(1, "id", Types.LongType.get()), Types.NestedField.required(2, "ts", Types.TimestampType.withoutZone()), - Types.NestedField.required(3, "tsz", Types.TimestampType.withZone()) - ); + Types.NestedField.required(3, "tsz", Types.TimestampType.withZone())); - private final List values = ImmutableList.of( + private final List values = + ImmutableList.of( row(1L, toTimestamp("2021-01-01T00:00:00.0"), toTimestamp("2021-02-01T00:00:00.0")), row(2L, toTimestamp("2021-01-01T00:00:00.0"), toTimestamp("2021-02-01T00:00:00.0")), - row(3L, toTimestamp("2021-01-01T00:00:00.0"), toTimestamp("2021-02-01T00:00:00.0")) - ); + row(3L, toTimestamp("2021-01-01T00:00:00.0"), toTimestamp("2021-02-01T00:00:00.0"))); @Parameterized.Parameters(name = "catalogName = {0}, implementation = {1}, config = {2}") public static Object[][] parameters() { - return new Object[][]{{"spark_catalog", - SparkSessionCatalog.class.getName(), - ImmutableMap.of( - "type", "hive", - "default-namespace", "default", - "parquet-enabled", "true", - "cache-enabled", "false" - )} + return new Object[][] { + { + "spark_catalog", + SparkSessionCatalog.class.getName(), + ImmutableMap.of( + "type", "hive", + "default-namespace", "default", + "parquet-enabled", "true", + "cache-enabled", "false") + } }; } - public TestTimestampWithoutZone(String catalogName, String implementation, Map config) { + public TestTimestampWithoutZone( + String catalogName, String implementation, Map config) { super(catalogName, implementation, config); this.config = config; } @@ -94,8 +96,10 @@ public void removeTables() { @Test public void testWriteTimestampWithoutZoneError() { AssertHelpers.assertThrows( - String.format("Write operation performed on a timestamp without timezone field while " + - "'%s' set to false should throw exception", 
SparkSQLProperties.HANDLE_TIMESTAMP_WITHOUT_TIMEZONE), + String.format( + "Write operation performed on a timestamp without timezone field while " + + "'%s' set to false should throw exception", + SparkSQLProperties.HANDLE_TIMESTAMP_WITHOUT_TIMEZONE), IllegalArgumentException.class, SparkUtil.TIMESTAMP_WITHOUT_TIMEZONE_ERROR, () -> sql("INSERT INTO %s VALUES %s", tableName, rowToSqlValues(values))); @@ -103,72 +107,98 @@ public void testWriteTimestampWithoutZoneError() { @Test public void testAppendTimestampWithoutZone() { - withSQLConf(ImmutableMap.of(SparkSQLProperties.HANDLE_TIMESTAMP_WITHOUT_TIMEZONE, "true"), () -> { - sql("INSERT INTO %s VALUES %s", tableName, rowToSqlValues(values)); - - Assert.assertEquals("Should have " + values.size() + " row", - (long) values.size(), scalarSql("SELECT count(*) FROM %s", tableName)); - - assertEquals("Row data should match expected", - values, sql("SELECT * FROM %s ORDER BY id", tableName)); - }); + withSQLConf( + ImmutableMap.of(SparkSQLProperties.HANDLE_TIMESTAMP_WITHOUT_TIMEZONE, "true"), + () -> { + sql("INSERT INTO %s VALUES %s", tableName, rowToSqlValues(values)); + + Assert.assertEquals( + "Should have " + values.size() + " row", + (long) values.size(), + scalarSql("SELECT count(*) FROM %s", tableName)); + + assertEquals( + "Row data should match expected", + values, + sql("SELECT * FROM %s ORDER BY id", tableName)); + }); } @Test public void testCreateAsSelectWithTimestampWithoutZone() { - withSQLConf(ImmutableMap.of(SparkSQLProperties.HANDLE_TIMESTAMP_WITHOUT_TIMEZONE, "true"), () -> { - sql("INSERT INTO %s VALUES %s", tableName, rowToSqlValues(values)); + withSQLConf( + ImmutableMap.of(SparkSQLProperties.HANDLE_TIMESTAMP_WITHOUT_TIMEZONE, "true"), + () -> { + sql("INSERT INTO %s VALUES %s", tableName, rowToSqlValues(values)); - sql("CREATE TABLE %s USING iceberg AS SELECT * FROM %s", newTableName, tableName); + sql("CREATE TABLE %s USING iceberg AS SELECT * FROM %s", newTableName, tableName); - Assert.assertEquals("Should have " + values.size() + " row", (long) values.size(), + Assert.assertEquals( + "Should have " + values.size() + " row", + (long) values.size(), scalarSql("SELECT count(*) FROM %s", newTableName)); - assertEquals("Row data should match expected", + assertEquals( + "Row data should match expected", sql("SELECT * FROM %s ORDER BY id", tableName), sql("SELECT * FROM %s ORDER BY id", newTableName)); - }); + }); } @Test public void testCreateNewTableShouldHaveTimestampWithZoneIcebergType() { - withSQLConf(ImmutableMap.of(SparkSQLProperties.HANDLE_TIMESTAMP_WITHOUT_TIMEZONE, "true"), () -> { - sql("INSERT INTO %s VALUES %s", tableName, rowToSqlValues(values)); + withSQLConf( + ImmutableMap.of(SparkSQLProperties.HANDLE_TIMESTAMP_WITHOUT_TIMEZONE, "true"), + () -> { + sql("INSERT INTO %s VALUES %s", tableName, rowToSqlValues(values)); - sql("CREATE TABLE %s USING iceberg AS SELECT * FROM %s", newTableName, tableName); + sql("CREATE TABLE %s USING iceberg AS SELECT * FROM %s", newTableName, tableName); - Assert.assertEquals("Should have " + values.size() + " row", (long) values.size(), + Assert.assertEquals( + "Should have " + values.size() + " row", + (long) values.size(), scalarSql("SELECT count(*) FROM %s", newTableName)); - assertEquals("Data from created table should match data from base table", + assertEquals( + "Data from created table should match data from base table", sql("SELECT * FROM %s ORDER BY id", tableName), sql("SELECT * FROM %s ORDER BY id", newTableName)); - Table createdTable = 
validationCatalog.loadTable(TableIdentifier.of("default", newTableName)); - assertFieldsType(createdTable.schema(), Types.TimestampType.withZone(), "ts", "tsz"); - }); + Table createdTable = + validationCatalog.loadTable(TableIdentifier.of("default", newTableName)); + assertFieldsType(createdTable.schema(), Types.TimestampType.withZone(), "ts", "tsz"); + }); } @Test public void testCreateNewTableShouldHaveTimestampWithoutZoneIcebergType() { - withSQLConf(ImmutableMap.of( + withSQLConf( + ImmutableMap.of( SparkSQLProperties.HANDLE_TIMESTAMP_WITHOUT_TIMEZONE, "true", - SparkSQLProperties.USE_TIMESTAMP_WITHOUT_TIME_ZONE_IN_NEW_TABLES, "true"), () -> { - spark.sessionState().catalogManager().currentCatalog() + SparkSQLProperties.USE_TIMESTAMP_WITHOUT_TIME_ZONE_IN_NEW_TABLES, "true"), + () -> { + spark + .sessionState() + .catalogManager() + .currentCatalog() .initialize(catalog.name(), new CaseInsensitiveStringMap(config)); - sql("INSERT INTO %s VALUES %s", tableName, rowToSqlValues(values)); + sql("INSERT INTO %s VALUES %s", tableName, rowToSqlValues(values)); - sql("CREATE TABLE %s USING iceberg AS SELECT * FROM %s", newTableName, tableName); + sql("CREATE TABLE %s USING iceberg AS SELECT * FROM %s", newTableName, tableName); - Assert.assertEquals("Should have " + values.size() + " row", (long) values.size(), + Assert.assertEquals( + "Should have " + values.size() + " row", + (long) values.size(), scalarSql("SELECT count(*) FROM %s", newTableName)); - assertEquals("Row data should match expected", + assertEquals( + "Row data should match expected", sql("SELECT * FROM %s ORDER BY id", tableName), sql("SELECT * FROM %s ORDER BY id", newTableName)); - Table createdTable = validationCatalog.loadTable(TableIdentifier.of("default", newTableName)); - assertFieldsType(createdTable.schema(), Types.TimestampType.withoutZone(), "ts", "tsz"); - }); + Table createdTable = + validationCatalog.loadTable(TableIdentifier.of("default", newTableName)); + assertFieldsType(createdTable.schema(), Types.TimestampType.withoutZone(), "ts", "tsz"); + }); } private Timestamp toTimestamp(String value) { @@ -176,21 +206,33 @@ private Timestamp toTimestamp(String value) { } private String rowToSqlValues(List rows) { - List rowValues = rows.stream().map(row -> { - List columns = Arrays.stream(row).map(value -> { - if (value instanceof Long) { - return value.toString(); - } else if (value instanceof Timestamp) { - return String.format("timestamp '%s'", value); - } - throw new RuntimeException("Type is not supported"); - }).collect(Collectors.toList()); - return "(" + Joiner.on(",").join(columns) + ")"; - }).collect(Collectors.toList()); + List rowValues = + rows.stream() + .map( + row -> { + List columns = + Arrays.stream(row) + .map( + value -> { + if (value instanceof Long) { + return value.toString(); + } else if (value instanceof Timestamp) { + return String.format("timestamp '%s'", value); + } + throw new RuntimeException("Type is not supported"); + }) + .collect(Collectors.toList()); + return "(" + Joiner.on(",").join(columns) + ")"; + }) + .collect(Collectors.toList()); return Joiner.on(",").join(rowValues); } private void assertFieldsType(Schema actual, Type.PrimitiveType expected, String... 
fields) { - actual.select(fields).asStruct().fields().forEach(field -> Assert.assertEquals(expected, field.type())); + actual + .select(fields) + .asStruct() + .fields() + .forEach(field -> Assert.assertEquals(expected, field.type())); } } diff --git a/spark/v3.3/spark/src/test/java/org/apache/iceberg/spark/sql/TestUnpartitionedWrites.java b/spark/v3.3/spark/src/test/java/org/apache/iceberg/spark/sql/TestUnpartitionedWrites.java index bd6fb5abf2c6..0849602c3b92 100644 --- a/spark/v3.3/spark/src/test/java/org/apache/iceberg/spark/sql/TestUnpartitionedWrites.java +++ b/spark/v3.3/spark/src/test/java/org/apache/iceberg/spark/sql/TestUnpartitionedWrites.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.spark.sql; import java.util.List; @@ -35,7 +34,8 @@ import org.junit.Test; public class TestUnpartitionedWrites extends SparkCatalogTestBase { - public TestUnpartitionedWrites(String catalogName, String implementation, Map config) { + public TestUnpartitionedWrites( + String catalogName, String implementation, Map config) { super(catalogName, implementation, config); } @@ -56,17 +56,14 @@ public void testInsertAppend() { sql("INSERT INTO %s VALUES (4, 'd'), (5, 'e')", tableName); - Assert.assertEquals("Should have 5 rows after insert", 5L, scalarSql("SELECT count(*) FROM %s", tableName)); + Assert.assertEquals( + "Should have 5 rows after insert", 5L, scalarSql("SELECT count(*) FROM %s", tableName)); - List expected = ImmutableList.of( - row(1L, "a"), - row(2L, "b"), - row(3L, "c"), - row(4L, "d"), - row(5L, "e") - ); + List expected = + ImmutableList.of(row(1L, "a"), row(2L, "b"), row(3L, "c"), row(4L, "d"), row(5L, "e")); - assertEquals("Row data should match expected", expected, sql("SELECT * FROM %s ORDER BY id", tableName)); + assertEquals( + "Row data should match expected", expected, sql("SELECT * FROM %s ORDER BY id", tableName)); } @Test @@ -75,22 +72,23 @@ public void testInsertOverwrite() { sql("INSERT OVERWRITE %s VALUES (4, 'd'), (5, 'e')", tableName); - Assert.assertEquals("Should have 2 rows after overwrite", 2L, scalarSql("SELECT count(*) FROM %s", tableName)); + Assert.assertEquals( + "Should have 2 rows after overwrite", 2L, scalarSql("SELECT count(*) FROM %s", tableName)); - List expected = ImmutableList.of( - row(4L, "d"), - row(5L, "e") - ); + List expected = ImmutableList.of(row(4L, "d"), row(5L, "e")); - assertEquals("Row data should match expected", expected, sql("SELECT * FROM %s ORDER BY id", tableName)); + assertEquals( + "Row data should match expected", expected, sql("SELECT * FROM %s ORDER BY id", tableName)); } @Test public void testInsertAppendAtSnapshot() { long snapshotId = validationCatalog.loadTable(tableIdent).currentSnapshot().snapshotId(); String prefix = "snapshot_id_"; - AssertHelpers.assertThrows("Should not be able to insert into a table at a specific snapshot", - IllegalArgumentException.class, "Cannot write to table at a specific snapshot", + AssertHelpers.assertThrows( + "Should not be able to insert into a table at a specific snapshot", + IllegalArgumentException.class, + "Cannot write to table at a specific snapshot", () -> sql("INSERT INTO %s.%s VALUES (4, 'd'), (5, 'e')", tableName, prefix + snapshotId)); } @@ -98,77 +96,68 @@ public void testInsertAppendAtSnapshot() { public void testInsertOverwriteAtSnapshot() { long snapshotId = validationCatalog.loadTable(tableIdent).currentSnapshot().snapshotId(); String prefix = "snapshot_id_"; - 
AssertHelpers.assertThrows("Should not be able to insert into a table at a specific snapshot", - IllegalArgumentException.class, "Cannot write to table at a specific snapshot", - () -> sql("INSERT OVERWRITE %s.%s VALUES (4, 'd'), (5, 'e')", tableName, prefix + snapshotId)); + AssertHelpers.assertThrows( + "Should not be able to insert into a table at a specific snapshot", + IllegalArgumentException.class, + "Cannot write to table at a specific snapshot", + () -> + sql( + "INSERT OVERWRITE %s.%s VALUES (4, 'd'), (5, 'e')", + tableName, prefix + snapshotId)); } @Test public void testDataFrameV2Append() throws NoSuchTableException { Assert.assertEquals("Should have 3 rows", 3L, scalarSql("SELECT count(*) FROM %s", tableName)); - List data = ImmutableList.of( - new SimpleRecord(4, "d"), - new SimpleRecord(5, "e") - ); + List data = ImmutableList.of(new SimpleRecord(4, "d"), new SimpleRecord(5, "e")); Dataset ds = spark.createDataFrame(data, SimpleRecord.class); ds.writeTo(tableName).append(); - Assert.assertEquals("Should have 5 rows after insert", 5L, scalarSql("SELECT count(*) FROM %s", tableName)); + Assert.assertEquals( + "Should have 5 rows after insert", 5L, scalarSql("SELECT count(*) FROM %s", tableName)); - List expected = ImmutableList.of( - row(1L, "a"), - row(2L, "b"), - row(3L, "c"), - row(4L, "d"), - row(5L, "e") - ); + List expected = + ImmutableList.of(row(1L, "a"), row(2L, "b"), row(3L, "c"), row(4L, "d"), row(5L, "e")); - assertEquals("Row data should match expected", expected, sql("SELECT * FROM %s ORDER BY id", tableName)); + assertEquals( + "Row data should match expected", expected, sql("SELECT * FROM %s ORDER BY id", tableName)); } @Test public void testDataFrameV2DynamicOverwrite() throws NoSuchTableException { Assert.assertEquals("Should have 3 rows", 3L, scalarSql("SELECT count(*) FROM %s", tableName)); - List data = ImmutableList.of( - new SimpleRecord(4, "d"), - new SimpleRecord(5, "e") - ); + List data = ImmutableList.of(new SimpleRecord(4, "d"), new SimpleRecord(5, "e")); Dataset ds = spark.createDataFrame(data, SimpleRecord.class); ds.writeTo(tableName).overwritePartitions(); - Assert.assertEquals("Should have 2 rows after overwrite", 2L, scalarSql("SELECT count(*) FROM %s", tableName)); + Assert.assertEquals( + "Should have 2 rows after overwrite", 2L, scalarSql("SELECT count(*) FROM %s", tableName)); - List expected = ImmutableList.of( - row(4L, "d"), - row(5L, "e") - ); + List expected = ImmutableList.of(row(4L, "d"), row(5L, "e")); - assertEquals("Row data should match expected", expected, sql("SELECT * FROM %s ORDER BY id", tableName)); + assertEquals( + "Row data should match expected", expected, sql("SELECT * FROM %s ORDER BY id", tableName)); } @Test public void testDataFrameV2Overwrite() throws NoSuchTableException { Assert.assertEquals("Should have 3 rows", 3L, scalarSql("SELECT count(*) FROM %s", tableName)); - List data = ImmutableList.of( - new SimpleRecord(4, "d"), - new SimpleRecord(5, "e") - ); + List data = ImmutableList.of(new SimpleRecord(4, "d"), new SimpleRecord(5, "e")); Dataset ds = spark.createDataFrame(data, SimpleRecord.class); ds.writeTo(tableName).overwrite(functions.col("id").$less$eq(3)); - Assert.assertEquals("Should have 2 rows after overwrite", 2L, scalarSql("SELECT count(*) FROM %s", tableName)); + Assert.assertEquals( + "Should have 2 rows after overwrite", 2L, scalarSql("SELECT count(*) FROM %s", tableName)); - List expected = ImmutableList.of( - row(4L, "d"), - row(5L, "e") - ); + List expected = ImmutableList.of(row(4L, "d"), 
row(5L, "e")); - assertEquals("Row data should match expected", expected, sql("SELECT * FROM %s ORDER BY id", tableName)); + assertEquals( + "Row data should match expected", expected, sql("SELECT * FROM %s ORDER BY id", tableName)); } }
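
Side note (not part of the patch): the TestPartitionedWrites and TestUnpartitionedWrites hunks above are formatting-only, but for readers unfamiliar with the DataFrameWriterV2 API they exercise, here is a minimal standalone sketch of the three write modes the tests cover. The catalog/table name "local.db.table" and the VALUES-based source DataFrame are assumptions for illustration; the writeTo(), append(), overwritePartitions(), and overwrite(Column) calls are the same Spark 3 APIs used in the tests.

    import org.apache.spark.sql.Dataset;
    import org.apache.spark.sql.Row;
    import org.apache.spark.sql.SparkSession;
    import org.apache.spark.sql.catalyst.analysis.NoSuchTableException;
    import org.apache.spark.sql.functions;

    public class WriterV2Sketch {
      public static void main(String[] args) throws NoSuchTableException {
        // Assumes an Iceberg catalog is already configured on the session and that
        // "local.db.table" exists with columns (id bigint, data string).
        SparkSession spark = SparkSession.builder().getOrCreate();

        // Hypothetical source rows, analogous to the SimpleRecord(4, "d"), (5, "e") data in the tests
        Dataset<Row> ds = spark.sql("SELECT * FROM VALUES (4L, 'd'), (5L, 'e') AS t(id, data)");

        // INSERT INTO-style append of the new rows
        ds.writeTo("local.db.table").append();

        // Dynamic overwrite: only the partitions touched by ds are replaced
        ds.writeTo("local.db.table").overwritePartitions();

        // Conditional overwrite: rows matching the filter are removed, then ds is appended
        ds.writeTo("local.db.table").overwrite(functions.col("id").leq(3));
      }
    }

As the assertions in the two test classes show, the semantics depend on the table layout: against the truncate(id, 3)-partitioned table, overwritePartitions() replaces only the affected partition (4 rows remain), while against the unpartitioned table it replaces everything (2 rows remain).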